From 145449b1e420787bb99721a429341fa6be3adfb6 Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sun, 3 Jul 2022 16:10:23 +0200
Subject: Vendor import of llvm-project main llvmorg-15-init-15358-g53dc0f107877.

---
 llvm/include/llvm-c/Core.h | 46 +-
 llvm/include/llvm-c/DisassemblerTypes.h | 16 +-
 llvm/include/llvm-c/Object.h | 32 +-
 llvm/include/llvm-c/Orc.h | 110 +-
 llvm/include/llvm-c/TargetMachine.h | 4 +-
 llvm/include/llvm-c/Transforms/Coroutines.h | 56 -
 llvm/include/llvm-c/Transforms/IPO.h | 3 -
 .../include/llvm-c/Transforms/PassManagerBuilder.h | 6 -
 llvm/include/llvm-c/Transforms/Scalar.h | 3 -
 llvm/include/llvm-c/blake3.h | 79 +
 llvm/include/llvm/ADT/APFloat.h | 3 +-
 llvm/include/llvm/ADT/APInt.h | 58 +-
 llvm/include/llvm/ADT/AddressRanges.h | 79 +
 llvm/include/llvm/ADT/ArrayRef.h | 41 +-
 llvm/include/llvm/ADT/BitmaskEnum.h | 12 +-
 llvm/include/llvm/ADT/BreadthFirstIterator.h | 2 +-
 llvm/include/llvm/ADT/DenseMap.h | 1 +
 llvm/include/llvm/ADT/EpochTracker.h | 4 +-
 llvm/include/llvm/ADT/EquivalenceClasses.h | 3 +-
 llvm/include/llvm/ADT/FloatingPointMode.h | 28 +-
 llvm/include/llvm/ADT/FoldingSet.h | 55 +-
 llvm/include/llvm/ADT/GenericCycleImpl.h | 54 +-
 llvm/include/llvm/ADT/GenericCycleInfo.h | 18 +
 llvm/include/llvm/ADT/IntervalMap.h | 34 +-
 llvm/include/llvm/ADT/IntrusiveRefCntPtr.h | 4 +-
 llvm/include/llvm/ADT/Optional.h | 195 +-
 llvm/include/llvm/ADT/PointerIntPair.h | 10 +-
 llvm/include/llvm/ADT/PointerSumType.h | 5 +-
 llvm/include/llvm/ADT/PointerUnion.h | 74 +-
 llvm/include/llvm/ADT/SCCIterator.h | 11 +-
 llvm/include/llvm/ADT/STLExtras.h | 56 +-
 llvm/include/llvm/ADT/SmallVector.h | 5 +-
 llvm/include/llvm/ADT/Statistic.h | 38 +-
 llvm/include/llvm/ADT/StringRef.h | 4 +
 llvm/include/llvm/ADT/Triple.h | 91 +-
 llvm/include/llvm/ADT/edit_distance.h | 38 +-
 llvm/include/llvm/Analysis/AliasAnalysis.h | 12 +-
 .../include/llvm/Analysis/AliasAnalysisEvaluator.h | 4 +-
 llvm/include/llvm/Analysis/AliasSetTracker.h | 7 -
 llvm/include/llvm/Analysis/AssumeBundleQueries.h | 6 +-
 llvm/include/llvm/Analysis/BasicAliasAnalysis.h | 2 -
 .../include/llvm/Analysis/BlockFrequencyInfoImpl.h | 24 +-
 llvm/include/llvm/Analysis/BranchProbabilityInfo.h | 2 -
 llvm/include/llvm/Analysis/CFGPrinter.h | 4 +-
 llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h | 2 +
 .../include/llvm/Analysis/CFLAndersAliasAnalysis.h | 2 +-
 .../include/llvm/Analysis/CFLSteensAliasAnalysis.h | 2 -
 llvm/include/llvm/Analysis/CGSCCPassManager.h | 24 +-
 llvm/include/llvm/Analysis/CallGraph.h | 5 +-
 llvm/include/llvm/Analysis/CallPrinter.h | 14 +
 llvm/include/llvm/Analysis/CaptureTracking.h | 35 +-
 llvm/include/llvm/Analysis/CmpInstAnalysis.h | 37 +-
 llvm/include/llvm/Analysis/CodeMetrics.h | 7 +-
 llvm/include/llvm/Analysis/ConstantFolding.h | 36 +-
 llvm/include/llvm/Analysis/ConstraintSystem.h | 19 +-
 llvm/include/llvm/Analysis/DDG.h | 4 +-
 llvm/include/llvm/Analysis/DDGPrinter.h | 3 +-
 llvm/include/llvm/Analysis/DOTGraphTraitsPass.h | 195 +-
 llvm/include/llvm/Analysis/Delinearization.h | 15 +-
 llvm/include/llvm/Analysis/DependenceAnalysis.h | 6 +-
 llvm/include/llvm/Analysis/DivergenceAnalysis.h | 8 +-
 llvm/include/llvm/Analysis/DomPrinter.h | 118 +-
 llvm/include/llvm/Analysis/DomTreeUpdater.h | 43 -
 llvm/include/llvm/Analysis/DominanceFrontierImpl.h | 1 -
 llvm/include/llvm/Analysis/EHPersonalities.h | 1 -
 .../llvm/Analysis/FunctionPropertiesAnalysis.h | 49 +-
 llvm/include/llvm/Analysis/GlobalsModRef.h | 11 +-
 .../include/llvm/Analysis/IRSimilarityIdentifier.h | 41 +-
 llvm/include/llvm/Analysis/IVDescriptors.h | 52 +-
 llvm/include/llvm/Analysis/IVUsers.h | 2 -
 llvm/include/llvm/Analysis/InlineAdvisor.h | 51 +-
 llvm/include/llvm/Analysis/InlineCost.h | 14 +-
 .../include/llvm/Analysis/InlineModelFeatureMaps.h | 4 +-
 llvm/include/llvm/Analysis/InlineOrder.h | 99 +-
 llvm/include/llvm/Analysis/InstSimplifyFolder.h | 141 +-
 llvm/include/llvm/Analysis/InstructionSimplify.h | 94 +-
 llvm/include/llvm/Analysis/IntervalIterator.h | 3 +-
 llvm/include/llvm/Analysis/LazyCallGraph.h | 11 +-
 llvm/include/llvm/Analysis/LazyValueInfo.h | 3 +
 llvm/include/llvm/Analysis/Loads.h | 4 +-
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 91 +-
 llvm/include/llvm/Analysis/LoopAnalysisManager.h | 1 -
 llvm/include/llvm/Analysis/LoopCacheAnalysis.h | 24 +-
 llvm/include/llvm/Analysis/LoopInfo.h | 20 +-
 llvm/include/llvm/Analysis/LoopInfoImpl.h | 15 +-
 llvm/include/llvm/Analysis/LoopPass.h | 3 +-
 llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h | 7 +-
 llvm/include/llvm/Analysis/MLInlineAdvisor.h | 36 +-
 llvm/include/llvm/Analysis/MLModelRunner.h | 21 +-
 llvm/include/llvm/Analysis/MemoryBuiltins.h | 33 +-
 llvm/include/llvm/Analysis/MemoryLocation.h | 1 +
 llvm/include/llvm/Analysis/MemorySSA.h | 48 +-
 llvm/include/llvm/Analysis/MemorySSAUpdater.h | 3 +-
 .../llvm/Analysis/ModelUnderTrainingRunner.h | 7 +-
 .../include/llvm/Analysis/ModuleDebugInfoPrinter.h | 2 +-
 llvm/include/llvm/Analysis/MustExecute.h | 2 +-
 .../include/llvm/Analysis/NoInferenceModelRunner.h | 12 +-
 llvm/include/llvm/Analysis/ObjCARCUtil.h | 4 +-
 llvm/include/llvm/Analysis/OverflowInstAnalysis.h | 4 +-
 llvm/include/llvm/Analysis/PhiValues.h | 1 -
 llvm/include/llvm/Analysis/PostDominators.h | 5 +-
 llvm/include/llvm/Analysis/ProfileSummaryInfo.h | 4 +-
 llvm/include/llvm/Analysis/PtrUseVisitor.h | 11 +-
 llvm/include/llvm/Analysis/RegionInfo.h | 6 +-
 llvm/include/llvm/Analysis/RegionInfoImpl.h | 4 +-
 llvm/include/llvm/Analysis/RegionIterator.h | 2 +-
 llvm/include/llvm/Analysis/RegionPass.h | 3 +-
 llvm/include/llvm/Analysis/RegionPrinter.h | 10 +
 .../include/llvm/Analysis/ReleaseModeModelRunner.h | 44 +-
 llvm/include/llvm/Analysis/ReplayInlineAdvisor.h | 14 +-
 llvm/include/llvm/Analysis/ScalarEvolution.h | 137 +-
 .../llvm/Analysis/ScalarEvolutionAliasAnalysis.h | 7 +-
 .../llvm/Analysis/ScalarEvolutionExpressions.h | 9 +-
 .../llvm/Analysis/ScalarEvolutionNormalization.h | 2 +-
 llvm/include/llvm/Analysis/ScalarFuncs.def | 117 +
 llvm/include/llvm/Analysis/SparsePropagation.h | 1 +
 llvm/include/llvm/Analysis/StackLifetime.h | 3 +-
 .../include/llvm/Analysis/SyncDependenceAnalysis.h | 6 +-
 llvm/include/llvm/Analysis/SyntheticCountsUtils.h | 2 +-
 llvm/include/llvm/Analysis/TargetFolder.h | 162 +-
 llvm/include/llvm/Analysis/TargetLibraryInfo.h | 14 +-
 llvm/include/llvm/Analysis/TargetTransformInfo.h | 139 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h | 78 +-
 llvm/include/llvm/Analysis/TensorSpec.h | 132 +
 llvm/include/llvm/Analysis/TypeMetadataUtils.h | 2 +-
 llvm/include/llvm/Analysis/Utils/TFUtils.h | 102 +-
 llvm/include/llvm/Analysis/ValueLattice.h | 6 +-
 llvm/include/llvm/Analysis/ValueTracking.h | 40 +-
 llvm/include/llvm/Analysis/VectorUtils.h | 30 +-
 llvm/include/llvm/AsmParser/LLLexer.h | 2 +-
 llvm/include/llvm/AsmParser/LLParser.h | 23 +-
 llvm/include/llvm/AsmParser/LLToken.h | 98 +-
 llvm/include/llvm/AsmParser/Parser.h | 4 +-
 llvm/include/llvm/BinaryFormat/COFF.h | 5 +-
 llvm/include/llvm/BinaryFormat/DXContainer.h | 131 +
 llvm/include/llvm/BinaryFormat/Dwarf.h | 4 +
 llvm/include/llvm/BinaryFormat/DynamicTags.def | 1 +
 llvm/include/llvm/BinaryFormat/ELF.h | 90 +-
 .../llvm/BinaryFormat/ELFRelocs/LoongArch.def | 62 +
 llvm/include/llvm/BinaryFormat/GOFF.h | 33 +
 llvm/include/llvm/BinaryFormat/MachO.h | 45 +-
 llvm/include/llvm/BinaryFormat/Magic.h | 3 +
 llvm/include/llvm/BinaryFormat/Swift.def | 7 +
 llvm/include/llvm/BinaryFormat/Wasm.h | 22 +-
 llvm/include/llvm/BinaryFormat/XCOFF.h | 30 +
 llvm/include/llvm/Bitcode/BitcodeAnalyzer.h | 3 +-
 llvm/include/llvm/Bitcode/BitcodeReader.h | 7 +-
 llvm/include/llvm/Bitcode/BitcodeWriter.h | 2 +-
 llvm/include/llvm/Bitcode/BitcodeWriterPass.h | 1 -
 llvm/include/llvm/Bitcode/LLVMBitCodes.h | 27 +-
 llvm/include/llvm/Bitstream/BitCodeEnums.h | 90 +
 llvm/include/llvm/Bitstream/BitCodes.h | 71 +-
 llvm/include/llvm/Bitstream/BitstreamReader.h | 50 +-
 llvm/include/llvm/Bitstream/BitstreamWriter.h | 21 +-
 llvm/include/llvm/CodeGen/AccelTable.h | 8 +-
 llvm/include/llvm/CodeGen/Analysis.h | 5 +-
 llvm/include/llvm/CodeGen/AsmPrinter.h | 36 +-
 .../llvm/CodeGen/BasicBlockSectionsProfileReader.h | 109 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h | 248 +-
 llvm/include/llvm/CodeGen/CFIFixup.h | 38 +
 llvm/include/llvm/CodeGen/CalcSpillWeights.h | 12 -
 llvm/include/llvm/CodeGen/CallingConvLower.h | 2 -
 llvm/include/llvm/CodeGen/CodeGenCommonISel.h | 8 +-
 llvm/include/llvm/CodeGen/CodeGenPassBuilder.h | 11 +-
 llvm/include/llvm/CodeGen/CommandFlags.h | 14 +-
 llvm/include/llvm/CodeGen/DFAPacketizer.h | 3 +-
 .../llvm/CodeGen/DbgEntityHistoryCalculator.h | 2 +-
 llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h | 94 +-
 llvm/include/llvm/CodeGen/FastISel.h | 11 +-
 llvm/include/llvm/CodeGen/FaultMaps.h | 1 -
 llvm/include/llvm/CodeGen/FunctionLoweringInfo.h | 4 +
 .../llvm/CodeGen/GlobalISel/CSEMIRBuilder.h | 2 +-
 .../include/llvm/CodeGen/GlobalISel/CallLowering.h | 5 +-
 llvm/include/llvm/CodeGen/GlobalISel/Combiner.h | 1 -
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 46 +-
 .../llvm/CodeGen/GlobalISel/GISelWorkList.h | 12 +-
 .../llvm/CodeGen/GlobalISel/GenericMachineInstrs.h | 32 +
 .../include/llvm/CodeGen/GlobalISel/IRTranslator.h | 9 +-
 .../llvm/CodeGen/GlobalISel/InstructionSelect.h | 4 +-
 .../llvm/CodeGen/GlobalISel/InstructionSelector.h | 9 +-
 .../CodeGen/GlobalISel/InstructionSelectorImpl.h | 5 +-
 .../GlobalISel/LegalizationArtifactCombiner.h | 8 +-
 llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h | 8 +-
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 20 +-
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 38 +-
 .../include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h | 13 +-
 llvm/include/llvm/CodeGen/GlobalISel/Localizer.h | 5 +-
 .../llvm/CodeGen/GlobalISel/MIPatternMatch.h | 42 +
 .../llvm/CodeGen/GlobalISel/MachineIRBuilder.h | 40 +-
 .../llvm/CodeGen/GlobalISel/RegBankSelect.h | 2 +-
 .../include/llvm/CodeGen/GlobalISel/RegisterBank.h | 98 -
 .../llvm/CodeGen/GlobalISel/RegisterBankInfo.h | 775 -
 llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 61 +-
 llvm/include/llvm/CodeGen/ISDOpcodes.h | 77 +-
 llvm/include/llvm/CodeGen/IntrinsicLowering.h | 2 -
 .../llvm/CodeGen/LazyMachineBlockFrequencyInfo.h | 2 +-
 llvm/include/llvm/CodeGen/LiveInterval.h | 14 +-
 llvm/include/llvm/CodeGen/LiveIntervalUnion.h | 14 +-
 llvm/include/llvm/CodeGen/LiveIntervals.h | 2 +-
 llvm/include/llvm/CodeGen/LivePhysRegs.h | 2 +
 llvm/include/llvm/CodeGen/LiveRangeCalc.h | 1 -
 llvm/include/llvm/CodeGen/LiveRangeEdit.h | 10 +-
 llvm/include/llvm/CodeGen/LiveRegMatrix.h | 12 +-
 llvm/include/llvm/CodeGen/LiveStacks.h | 6 +-
 llvm/include/llvm/CodeGen/LiveVariables.h | 1 +
 llvm/include/llvm/CodeGen/MIRFSDiscriminator.h | 21 +-
 llvm/include/llvm/CodeGen/MIRParser/MIRParser.h | 11 +-
 llvm/include/llvm/CodeGen/MIRSampleProfile.h | 28 +-
 llvm/include/llvm/CodeGen/MIRYamlMapping.h | 19 +-
 llvm/include/llvm/CodeGen/MachineBasicBlock.h | 35 +-
 .../llvm/CodeGen/MachineBranchProbabilityInfo.h | 2 -
 llvm/include/llvm/CodeGen/MachineCombinerPattern.h | 4 +
 llvm/include/llvm/CodeGen/MachineCycleAnalysis.h | 26 +-
 llvm/include/llvm/CodeGen/MachineDominators.h | 5 +
 llvm/include/llvm/CodeGen/MachineFrameInfo.h | 34 +-
 llvm/include/llvm/CodeGen/MachineFunction.h | 67 +-
 llvm/include/llvm/CodeGen/MachineInstr.h | 45 +-
 llvm/include/llvm/CodeGen/MachineLoopInfo.h | 1 -
 llvm/include/llvm/CodeGen/MachineMemOperand.h | 3 +-
 llvm/include/llvm/CodeGen/MachineModuleInfo.h | 71 +-
 llvm/include/llvm/CodeGen/MachineOperand.h | 15 +-
 .../CodeGen/MachineOptimizationRemarkEmitter.h | 3 +-
 llvm/include/llvm/CodeGen/MachineOutliner.h | 138 +-
 llvm/include/llvm/CodeGen/MachinePassManager.h | 6 +-
 llvm/include/llvm/CodeGen/MachinePassRegistry.def | 3 +-
 llvm/include/llvm/CodeGen/MachinePipeliner.h | 20 +-
 llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 33 +-
 llvm/include/llvm/CodeGen/MachineSSAContext.h | 10 +-
 llvm/include/llvm/CodeGen/MachineScheduler.h | 4 +-
 llvm/include/llvm/CodeGen/MachineStableHash.h | 4 +
 llvm/include/llvm/CodeGen/ModuloSchedule.h | 7 +-
 llvm/include/llvm/CodeGen/PBQP/ReductionRules.h | 2 +-
 llvm/include/llvm/CodeGen/Passes.h | 17 +-
 llvm/include/llvm/CodeGen/PseudoSourceValue.h | 19 +-
 llvm/include/llvm/CodeGen/RDFGraph.h | 1 -
 llvm/include/llvm/CodeGen/RegAllocPBQP.h | 17 +-
 llvm/include/llvm/CodeGen/Register.h | 2 +-
 llvm/include/llvm/CodeGen/RegisterBank.h | 98 +
 llvm/include/llvm/CodeGen/RegisterBankInfo.h | 775 +
 llvm/include/llvm/CodeGen/RegisterClassInfo.h | 7 +-
 llvm/include/llvm/CodeGen/RegisterPressure.h | 1 -
 llvm/include/llvm/CodeGen/RegisterScavenging.h | 20 +
 llvm/include/llvm/CodeGen/RegisterUsageInfo.h | 2 +-
 llvm/include/llvm/CodeGen/ReplaceWithVeclib.h | 4 +-
 llvm/include/llvm/CodeGen/ScheduleDAG.h | 2 +-
 llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 2 +-
 llvm/include/llvm/CodeGen/SelectionDAG.h | 222 +-
 .../llvm/CodeGen/SelectionDAGAddressAnalysis.h | 2 +-
 llvm/include/llvm/CodeGen/SelectionDAGISel.h | 4 +-
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 178 +-
 llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h | 4 +-
 llvm/include/llvm/CodeGen/SlotIndexes.h | 1 -
 llvm/include/llvm/CodeGen/StackMaps.h | 2 +-
 llvm/include/llvm/CodeGen/StackProtector.h | 1 -
 .../include/llvm/CodeGen/SwiftErrorValueTracking.h | 2 -
 llvm/include/llvm/CodeGen/TailDuplicator.h | 5 +-
 llvm/include/llvm/CodeGen/TargetCallingConv.h | 3 +-
 llvm/include/llvm/CodeGen/TargetFrameLowering.h | 12 +
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 26 +-
 llvm/include/llvm/CodeGen/TargetLowering.h | 284 +-
 .../llvm/CodeGen/TargetLoweringObjectFileImpl.h | 11 +
 llvm/include/llvm/CodeGen/TargetPassConfig.h | 3 +
 llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 47 +-
 llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 10 +-
 llvm/include/llvm/CodeGen/TileShapeInfo.h | 4 +-
 llvm/include/llvm/CodeGen/ValueTypes.h | 7 +-
 llvm/include/llvm/CodeGen/ValueTypes.td | 403 +-
 llvm/include/llvm/DWARFLinker/DWARFLinker.h | 61 +-
 .../llvm/DWARFLinker/DWARFLinkerCompileUnit.h | 8 +-
 .../llvm/DWARFLinker/DWARFLinkerDeclContext.h | 10 +-
 llvm/include/llvm/DWARFLinker/DWARFStreamer.h | 3 +-
 llvm/include/llvm/DWP/DWPStringPool.h | 2 +-
 .../DebugInfo/CodeView/AppendingTypeTableBuilder.h | 2 +-
 .../llvm/DebugInfo/CodeView/CVSymbolVisitor.h | 10 +-
 .../llvm/DebugInfo/CodeView/CVTypeVisitor.h | 5 +-
 .../llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 10 +-
 .../DebugInfo/CodeView/ContinuationRecordBuilder.h | 10 +-
 .../DebugInfo/CodeView/DebugChecksumsSubsection.h | 4 +-
 .../DebugInfo/CodeView/DebugCrossExSubsection.h | 3 +-
 .../DebugInfo/CodeView/DebugCrossImpSubsection.h | 3 +-
 .../DebugInfo/CodeView/DebugFrameDataSubsection.h | 6 +-
 .../CodeView/DebugInlineeLinesSubsection.h | 1 -
 .../llvm/DebugInfo/CodeView/DebugLinesSubsection.h | 3 +-
 .../llvm/DebugInfo/CodeView/DebugSubsection.h | 6 +-
 .../DebugInfo/CodeView/DebugSubsectionVisitor.h | 1 -
 llvm/include/llvm/DebugInfo/CodeView/EnumTables.h | 2 +-
 llvm/include/llvm/DebugInfo/CodeView/Formatters.h | 2 +
 .../DebugInfo/CodeView/GlobalTypeTableBuilder.h | 4 +-
 llvm/include/llvm/DebugInfo/CodeView/Line.h | 1 -
 .../DebugInfo/CodeView/MergingTypeTableBuilder.h | 6 +-
 llvm/include/llvm/DebugInfo/CodeView/RecordName.h | 7 +-
 .../llvm/DebugInfo/CodeView/RecordSerialization.h | 3 +-
 .../llvm/DebugInfo/CodeView/StringsAndChecksums.h | 6 +-
 .../include/llvm/DebugInfo/CodeView/SymbolDumper.h | 8 +-
 .../include/llvm/DebugInfo/CodeView/SymbolRecord.h | 4 +-
 .../llvm/DebugInfo/CodeView/SymbolSerializer.h | 4 +-
 .../llvm/DebugInfo/CodeView/TypeCollection.h | 2 +-
 .../llvm/DebugInfo/CodeView/TypeDumpVisitor.h | 10 +-
 llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h | 6 +-
 llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h | 2 +-
 .../llvm/DebugInfo/CodeView/TypeIndexDiscovery.h | 6 +-
 .../llvm/DebugInfo/CodeView/TypeRecordMapping.h | 5 +-
 .../llvm/DebugInfo/CodeView/TypeStreamMerger.h | 3 +-
 llvm/include/llvm/DebugInfo/DIContext.h | 10 +
 .../DebugInfo/DWARF/DWARFAbbreviationDeclaration.h | 4 +-
 .../llvm/DebugInfo/DWARF/DWARFAddressRange.h | 3 +
 .../llvm/DebugInfo/DWARF/DWARFCompileUnit.h | 7 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h | 36 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h | 5 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h | 3 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugAranges.h | 7 +-
 .../include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h | 8 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h | 2 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 8 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h | 9 +-
 .../include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h | 1 -
 .../llvm/DebugInfo/DWARF/DWARFDebugPubTable.h | 5 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugRangeList.h | 6 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h | 11 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 9 +-
 .../include/llvm/DebugInfo/DWARF/DWARFExpression.h | 5 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 14 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h | 2 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h | 1 -
 llvm/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h | 1 +
 .../llvm/DebugInfo/DWARF/DWARFTypePrinter.h | 67 +
 llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h | 3 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h | 30 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h | 21 +-
 llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 28 +-
 .../include/llvm/DebugInfo/GSYM/DwarfTransformer.h | 2 +-
 llvm/include/llvm/DebugInfo/GSYM/ExtractRanges.h | 81 +
 llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h | 17 +-
 llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h | 2 +-
 llvm/include/llvm/DebugInfo/GSYM/InlineInfo.h | 3 +-
 llvm/include/llvm/DebugInfo/GSYM/LineEntry.h | 2 +-
 llvm/include/llvm/DebugInfo/GSYM/LookupResult.h | 2 +-
 llvm/include/llvm/DebugInfo/GSYM/Range.h | 130 -
 llvm/include/llvm/DebugInfo/GSYM/StringTable.h | 2 +-
 llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h | 4 +-
 llvm/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h | 1 +
 .../DebugInfo/PDB/Native/DbiModuleDescriptor.h | 6 +-
 .../PDB/Native/DbiModuleDescriptorBuilder.h | 10 +-
 llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h | 14 +-
 .../llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h | 22 +-
 .../include/llvm/DebugInfo/PDB/Native/EnumTables.h | 2 +-
 .../include/llvm/DebugInfo/PDB/Native/FormatUtil.h | 133 +
 .../llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h | 14 +-
 .../llvm/DebugInfo/PDB/Native/GlobalsStream.h | 14 +-
 llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h | 3 -
 .../include/llvm/DebugInfo/PDB/Native/InfoStream.h | 10 +-
 .../llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h | 8 +-
 .../DebugInfo/PDB/Native/InjectedSourceStream.h | 9 +-
 llvm/include/llvm/DebugInfo/PDB/Native/InputFile.h | 231 +
 .../llvm/DebugInfo/PDB/Native/LinePrinter.h | 185 +
 .../llvm/DebugInfo/PDB/Native/ModuleDebugStream.h | 13 +-
 .../llvm/DebugInfo/PDB/Native/NamedStreamMap.h | 1 -
 .../llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h | 2 +-
 .../DebugInfo/PDB/Native/NativeEnumLineNumbers.h | 5 +-
 .../llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h | 2 +-
 .../llvm/DebugInfo/PDB/Native/NativeEnumTypes.h | 7 +-
 .../llvm/DebugInfo/PDB/Native/NativeExeSymbol.h | 5 +-
 .../DebugInfo/PDB/Native/NativeFunctionSymbol.h | 7 +-
 .../DebugInfo/PDB/Native/NativeInlineSiteSymbol.h | 6 +-
 .../llvm/DebugInfo/PDB/Native/NativeLineNumber.h | 4 +-
 .../llvm/DebugInfo/PDB/Native/NativePublicSymbol.h | 5 +-
 .../llvm/DebugInfo/PDB/Native/NativeSession.h | 12 +-
 .../llvm/DebugInfo/PDB/Native/NativeSourceFile.h | 5 +-
 .../DebugInfo/PDB/Native/NativeSymbolEnumerator.h | 8 +-
 .../llvm/DebugInfo/PDB/Native/NativeTypeEnum.h | 8 +-
 .../DebugInfo/PDB/Native/NativeTypeFunctionSig.h | 8 +-
 .../llvm/DebugInfo/PDB/Native/NativeTypePointer.h | 5 +-
 .../llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h | 9 +-
 .../llvm/DebugInfo/PDB/Native/NativeTypeUDT.h | 8 +-
 .../llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h | 6 +-
 llvm/include/llvm/DebugInfo/PDB/Native/PDBFile.h | 2 -
 .../llvm/DebugInfo/PDB/Native/PDBFileBuilder.h | 14 +-
 .../llvm/DebugInfo/PDB/Native/PDBStringTable.h | 6 -
 .../llvm/DebugInfo/PDB/Native/PublicsStream.h | 13 +-
 .../llvm/DebugInfo/PDB/Native/SymbolCache.h | 16 +-
 .../llvm/DebugInfo/PDB/Native/SymbolStream.h | 5 +-
 llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h | 7 +-
 .../llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h | 15 +-
 llvm/include/llvm/DebugInfo/PDB/PDBContext.h | 2 +
 llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h | 5 +-
 .../llvm/DebugInfo/PDB/PDBSymbolAnnotation.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h | 2 -
 .../llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h | 2 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h | 6 +-
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h | 11 +-
 .../llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h | 2 -
 .../llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeArray.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h | 6 +-
 .../llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h | 1 -
 .../include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h | 8 +-
 .../llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypePointer.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h | 9 +-
 .../llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h | 1 -
 .../llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h | 1 -
 llvm/include/llvm/DebugInfo/PDB/PDBTypes.h | 3 +-
 llvm/include/llvm/DebugInfo/PDB/UDTLayout.h | 1 -
 llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h | 51 +
 llvm/include/llvm/DebugInfo/Symbolize/Markup.h | 120 +
 .../llvm/DebugInfo/Symbolize/MarkupFilter.h | 76 +
 .../DebugInfo/Symbolize/SymbolizableObjectFile.h | 103 +
 llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h | 103 +-
 llvm/include/llvm/Debuginfod/DIFetcher.h | 34 +
 llvm/include/llvm/Debuginfod/HTTPClient.h | 44 +-
 llvm/include/llvm/Demangle/Demangle.h | 4 +-
 llvm/include/llvm/Demangle/ItaniumDemangle.h | 2289 +-
 llvm/include/llvm/Demangle/ItaniumNodes.def | 95 +
 llvm/include/llvm/Demangle/Utility.h | 114 +-
 .../JITLink/DWARFRecordSectionSplitter.h | 35 +
 .../include/llvm/ExecutionEngine/JITLink/JITLink.h | 21 +-
 .../llvm/ExecutionEngine/JITLink/MachO_arm64.h | 27 -
 .../llvm/ExecutionEngine/JITLink/MemoryFlags.h | 10 +-
 .../include/llvm/ExecutionEngine/JITLink/aarch64.h | 339 +-
 llvm/include/llvm/ExecutionEngine/JITLink/riscv.h | 17 +-
 llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h | 9 +-
 llvm/include/llvm/ExecutionEngine/Orc/Core.h | 21 +-
 llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h | 3 +
 .../llvm/ExecutionEngine/Orc/ELFNixPlatform.h | 3 +-
 .../ExecutionEngine/Orc/EPCDebugObjectRegistrar.h | 2 -
 .../llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h | 2 +-
 .../ExecutionEngine/Orc/JITTargetMachineBuilder.h | 2 +-
 llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h | 20 +-
 .../llvm/ExecutionEngine/Orc/MachOPlatform.h | 146 +-
 .../llvm/ExecutionEngine/Orc/MemoryMapper.h | 115 +
 .../llvm/ExecutionEngine/Orc/OrcABISupport.h | 39 +
 .../ExecutionEngine/Orc/Shared/ExecutorAddress.h | 13 +-
 .../Orc/Shared/SimplePackedSerialization.h | 2 +-
 .../include/llvm/ExecutionEngine/Orc/Speculation.h | 9 +-
 .../llvm/ExecutionEngine/Orc/SymbolStringPool.h | 7 +
 llvm/include/llvm/FileCheck/FileCheck.h | 8 +-
 llvm/include/llvm/Frontend/OpenMP/OMP.td | 279 +-
 llvm/include/llvm/Frontend/OpenMP/OMPConstants.h | 129 +-
 llvm/include/llvm/Frontend/OpenMP/OMPContext.h | 8 +-
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 261 +-
 llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 12 +-
 llvm/include/llvm/FuzzMutate/FuzzerCLI.h | 27 +-
 llvm/include/llvm/FuzzMutate/IRMutator.h | 26 +
 llvm/include/llvm/FuzzMutate/OpDescriptor.h | 6 +-
 llvm/include/llvm/FuzzMutate/RandomIRBuilder.h | 13 +-
 llvm/include/llvm/IR/AbstractCallSite.h | 6 +-
 llvm/include/llvm/IR/Argument.h | 1 -
 llvm/include/llvm/IR/Assumptions.h | 4 +
 llvm/include/llvm/IR/Attributes.h | 51 +-
 llvm/include/llvm/IR/Attributes.td | 23 +-
 llvm/include/llvm/IR/AttributesAMDGPU.td | 14 -
 llvm/include/llvm/IR/AutoUpgrade.h | 16 +-
 llvm/include/llvm/IR/BasicBlock.h | 9 +-
 llvm/include/llvm/IR/CFG.h | 1 -
 llvm/include/llvm/IR/ConstantFold.h | 60 +
 llvm/include/llvm/IR/ConstantFolder.h | 178 +-
 llvm/include/llvm/IR/ConstantRange.h | 3 +
 llvm/include/llvm/IR/Constants.h | 15 +-
 llvm/include/llvm/IR/DIBuilder.h | 23 +-
 llvm/include/llvm/IR/DataLayout.h | 2 +-
 llvm/include/llvm/IR/DebugInfoMetadata.h | 183 +-
 llvm/include/llvm/IR/DerivedTypes.h | 9 +-
 llvm/include/llvm/IR/DiagnosticInfo.h | 20 +
 llvm/include/llvm/IR/Dominators.h | 4 +
 llvm/include/llvm/IR/FMF.h | 121 +
 llvm/include/llvm/IR/FPEnv.h | 19 +
 llvm/include/llvm/IR/FixedMetadataKinds.def | 2 +
 llvm/include/llvm/IR/Function.h | 23 +-
 llvm/include/llvm/IR/GCStrategy.h | 17 +-
 llvm/include/llvm/IR/GlobalIFunc.h | 5 +
 llvm/include/llvm/IR/GlobalObject.h | 5 +-
 llvm/include/llvm/IR/GlobalValue.h | 52 +-
 llvm/include/llvm/IR/IRBuilder.h | 303 +-
 llvm/include/llvm/IR/IRBuilderFolder.h | 71 +-
 llvm/include/llvm/IR/InlineAsm.h | 25 +-
 llvm/include/llvm/IR/InstVisitor.h | 3 +-
 llvm/include/llvm/IR/InstrTypes.h | 50 +-
 llvm/include/llvm/IR/Instruction.h | 1 -
 llvm/include/llvm/IR/Instructions.h | 57 +-
 llvm/include/llvm/IR/IntrinsicInst.h | 71 +-
 llvm/include/llvm/IR/Intrinsics.h | 14 +-
 llvm/include/llvm/IR/Intrinsics.td | 420 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 166 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 528 +-
 llvm/include/llvm/IR/IntrinsicsARM.td | 194 +-
 llvm/include/llvm/IR/IntrinsicsBPF.td | 20 +-
 llvm/include/llvm/IR/IntrinsicsDirectX.td | 20 +
 llvm/include/llvm/IR/IntrinsicsHexagon.td | 13 +-
 llvm/include/llvm/IR/IntrinsicsMips.td | 1342 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 1449 +-
 llvm/include/llvm/IR/IntrinsicsPowerPC.td | 746 +-
 llvm/include/llvm/IR/IntrinsicsRISCV.td | 589 +-
 llvm/include/llvm/IR/IntrinsicsSPIRV.td | 31 +
 llvm/include/llvm/IR/IntrinsicsSystemZ.td | 56 +-
 llvm/include/llvm/IR/IntrinsicsVE.td | 15 +-
 llvm/include/llvm/IR/IntrinsicsVEVL.gen.td | 2470 +-
 llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 22 +-
 llvm/include/llvm/IR/IntrinsicsX86.td | 2332 ++-
 llvm/include/llvm/IR/IntrinsicsXCore.td | 8 +-
 llvm/include/llvm/IR/LLVMContext.h | 22 +-
 llvm/include/llvm/IR/LegacyPassManagers.h | 8 +-
 llvm/include/llvm/IR/MDBuilder.h | 4 +
 llvm/include/llvm/IR/MatrixBuilder.h | 20 +-
 llvm/include/llvm/IR/Metadata.h | 158 +-
 llvm/include/llvm/IR/Module.h | 22 +-
 llvm/include/llvm/IR/NoFolder.h | 164 +-
 llvm/include/llvm/IR/Operator.h | 100 +-
 llvm/include/llvm/IR/PatternMatch.h | 116 +-
 llvm/include/llvm/IR/RuntimeLibcalls.def | 16 +
 llvm/include/llvm/IR/Statepoint.h | 5 +-
 llvm/include/llvm/IR/Type.h | 18 +-
 llvm/include/llvm/IR/User.h | 4 +-
 llvm/include/llvm/IR/VPIntrinsics.def | 158 +-
 llvm/include/llvm/IR/ValueMap.h | 6 +-
 llvm/include/llvm/IR/VectorBuilder.h | 99 +
 llvm/include/llvm/IRReader/IRReader.h | 4 +-
 llvm/include/llvm/InitializePasses.h | 38 +-
 llvm/include/llvm/InterfaceStub/ELFObjHandler.h | 11 +-
 llvm/include/llvm/InterfaceStub/IFSHandler.h | 6 +-
 llvm/include/llvm/InterfaceStub/IFSStub.h | 5 +-
 llvm/include/llvm/LTO/Config.h | 10 +-
 llvm/include/llvm/LTO/LTO.h | 13 +-
 llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h | 2 +-
 .../include/llvm/LTO/legacy/ThinLTOCodeGenerator.h | 7 -
 llvm/include/llvm/LinkAllPasses.h | 28 +-
 llvm/include/llvm/Linker/IRMover.h | 7 +-
 llvm/include/llvm/MC/ConstantPools.h | 3 +-
 llvm/include/llvm/MC/MCAsmBackend.h | 8 +-
 llvm/include/llvm/MC/MCAsmInfo.h | 22 +
 llvm/include/llvm/MC/MCAssembler.h | 19 +-
 llvm/include/llvm/MC/MCCodeView.h | 13 +-
 llvm/include/llvm/MC/MCContext.h | 1432 +-
 llvm/include/llvm/MC/MCDXContainerStreamer.h | 49 +
 llvm/include/llvm/MC/MCDXContainerWriter.h | 45 +
 llvm/include/llvm/MC/MCDecoderOps.h | 33 +
 llvm/include/llvm/MC/MCDirectives.h | 1 +
 .../llvm/MC/MCDisassembler/MCDisassembler.h | 28 +-
 .../llvm/MC/MCDisassembler/MCExternalSymbolizer.h | 5 +-
 llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h | 5 +-
 llvm/include/llvm/MC/MCDwarf.h | 11 +-
 llvm/include/llvm/MC/MCELFStreamer.h | 9 +-
 llvm/include/llvm/MC/MCFixedLenDisassembler.h | 33 -
 llvm/include/llvm/MC/MCFragment.h | 7 +-
 llvm/include/llvm/MC/MCInstrAnalysis.h | 3 +
 llvm/include/llvm/MC/MCInstrDesc.h | 10 +-
 llvm/include/llvm/MC/MCInstrInfo.h | 1 +
 llvm/include/llvm/MC/MCLinkerOptimizationHint.h | 2 +-
 llvm/include/llvm/MC/MCMachObjectWriter.h | 2 +
 llvm/include/llvm/MC/MCObjectFileInfo.h | 16 +-
 llvm/include/llvm/MC/MCObjectStreamer.h | 8 +-
 llvm/include/llvm/MC/MCObjectWriter.h | 12 +-
 llvm/include/llvm/MC/MCParser/MCAsmLexer.h | 2 -
 llvm/include/llvm/MC/MCParser/MCAsmParser.h | 6 +-
 .../llvm/MC/MCParser/MCAsmParserExtension.h | 3 +-
 llvm/include/llvm/MC/MCParser/MCParsedAsmOperand.h | 12 +-
 llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h | 15 +-
 llvm/include/llvm/MC/MCPseudoProbe.h | 16 +-
 llvm/include/llvm/MC/MCRegisterInfo.h | 8 +
 llvm/include/llvm/MC/MCSPIRVObjectWriter.h | 40 +
 llvm/include/llvm/MC/MCSPIRVStreamer.h | 50 +
 llvm/include/llvm/MC/MCSection.h | 8 +-
 llvm/include/llvm/MC/MCSectionCOFF.h | 6 +-
 llvm/include/llvm/MC/MCSectionDXContainer.h | 38 +
 llvm/include/llvm/MC/MCSectionELF.h | 8 +-
 llvm/include/llvm/MC/MCSectionGOFF.h | 15 +-
 llvm/include/llvm/MC/MCSectionMachO.h | 4 +-
 llvm/include/llvm/MC/MCSectionSPIRV.h | 41 +
 llvm/include/llvm/MC/MCSectionWasm.h | 4 +-
 llvm/include/llvm/MC/MCSectionXCOFF.h | 29 +-
 llvm/include/llvm/MC/MCStreamer.h | 124 +-
 llvm/include/llvm/MC/MCSubtargetInfo.h | 3 +-
 llvm/include/llvm/MC/MCSymbol.h | 2 +-
 llvm/include/llvm/MC/MCSymbolWasm.h | 14 +-
 llvm/include/llvm/MC/MCSymbolXCOFF.h | 3 +-
 llvm/include/llvm/MC/MCTargetOptions.h | 21 +-
 llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h | 3 +
 llvm/include/llvm/MC/MCValue.h | 1 -
 llvm/include/llvm/MC/MCWin64EH.h | 8 +-
 llvm/include/llvm/MC/MCWinCOFFStreamer.h | 20 +-
 llvm/include/llvm/MC/MCWinEH.h | 10 +-
 llvm/include/llvm/MC/MCXCOFFStreamer.h | 4 +
 llvm/include/llvm/MC/SectionKind.h | 7 +
 llvm/include/llvm/MC/StringTableBuilder.h | 1 -
 llvm/include/llvm/MC/SubtargetFeature.h | 3 +-
 llvm/include/llvm/MC/TargetRegistry.h | 56 +-
 llvm/include/llvm/MCA/CustomBehaviour.h | 5 +
 llvm/include/llvm/MCA/IncrementalSourceMgr.h | 92 +
 llvm/include/llvm/MCA/InstrBuilder.h | 30 +
 llvm/include/llvm/MCA/Instruction.h | 45 +-
 llvm/include/llvm/MCA/Pipeline.h | 12 +-
 llvm/include/llvm/MCA/SourceMgr.h | 57 +-
 llvm/include/llvm/MCA/Stages/EntryStage.h | 3 +-
 llvm/include/llvm/MCA/Stages/Stage.h | 13 +
 llvm/include/llvm/ObjCopy/COFF/COFFConfig.h | 27 +
 llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h | 36 +
 llvm/include/llvm/ObjCopy/CommonConfig.h | 271 +
 llvm/include/llvm/ObjCopy/ConfigManager.h | 50 +
 llvm/include/llvm/ObjCopy/ELF/ELFConfig.h | 38 +
 llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h | 53 +
 llvm/include/llvm/ObjCopy/MachO/MachOConfig.h | 46 +
 llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h | 45 +
 llvm/include/llvm/ObjCopy/MultiFormatConfig.h | 39 +
 llvm/include/llvm/ObjCopy/ObjCopy.h | 42 +
 llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h | 21 +
 llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h | 35 +
 llvm/include/llvm/ObjCopy/wasm/WasmConfig.h | 21 +
 llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h | 35 +
 llvm/include/llvm/Object/Archive.h | 12 +-
 llvm/include/llvm/Object/ArchiveWriter.h | 5 +
 llvm/include/llvm/Object/Binary.h | 6 +-
 llvm/include/llvm/Object/COFF.h | 12 +-
 llvm/include/llvm/Object/COFFImportFile.h | 3 +-
 llvm/include/llvm/Object/COFFModuleDefinition.h | 2 +-
 llvm/include/llvm/Object/DXContainer.h | 124 +
 llvm/include/llvm/Object/Decompressor.h | 6 +-
 llvm/include/llvm/Object/ELF.h | 2 +-
 llvm/include/llvm/Object/ELFObjectFile.h | 32 +-
 llvm/include/llvm/Object/ELFTypes.h | 12 +
 llvm/include/llvm/Object/Error.h | 1 +
 llvm/include/llvm/Object/IRObjectFile.h | 1 -
 llvm/include/llvm/Object/MachO.h | 130 +
 llvm/include/llvm/Object/MachOUniversal.h | 2 +-
 llvm/include/llvm/Object/MachOUniversalWriter.h | 13 +-
 llvm/include/llvm/Object/ObjectFile.h | 13 +-
 llvm/include/llvm/Object/OffloadBinary.h | 169 +
 llvm/include/llvm/Object/RelocationResolver.h | 15 +-
 llvm/include/llvm/Object/SymbolicFile.h | 8 +-
 llvm/include/llvm/Object/TapiFile.h | 15 +-
 llvm/include/llvm/Object/TapiUniversal.h | 6 +-
 llvm/include/llvm/Object/Wasm.h | 1 -
 llvm/include/llvm/Object/WindowsResource.h | 2 +-
 llvm/include/llvm/Object/XCOFFObjectFile.h | 15 +-
 llvm/include/llvm/ObjectYAML/DXContainerYAML.h | 101 +
 llvm/include/llvm/ObjectYAML/ELFYAML.h | 45 +-
 llvm/include/llvm/ObjectYAML/MachOYAML.h | 1 +
 llvm/include/llvm/ObjectYAML/ObjectYAML.h | 4 +
 llvm/include/llvm/ObjectYAML/OffloadYAML.h | 79 +
 llvm/include/llvm/ObjectYAML/WasmYAML.h | 22 +-
 llvm/include/llvm/ObjectYAML/yaml2obj.h | 11 +
 llvm/include/llvm/Option/ArgList.h | 14 +-
 llvm/include/llvm/Pass.h | 10 +
 llvm/include/llvm/Passes/PassBuilder.h | 43 +-
 .../include/llvm/Passes/StandardInstrumentations.h | 31 +-
 .../llvm/ProfileData/Coverage/CoverageMapping.h | 4 +-
 llvm/include/llvm/ProfileData/GCOV.h | 4 -
 llvm/include/llvm/ProfileData/InstrProf.h | 41 +-
 .../include/llvm/ProfileData/InstrProfCorrelator.h | 9 +-
 llvm/include/llvm/ProfileData/InstrProfData.inc | 4 +-
 llvm/include/llvm/ProfileData/InstrProfReader.h | 49 +-
 llvm/include/llvm/ProfileData/InstrProfWriter.h | 34 +-
 llvm/include/llvm/ProfileData/MIBEntryDef.inc | 47 +
 llvm/include/llvm/ProfileData/MemProf.h | 613 +
 llvm/include/llvm/ProfileData/MemProfData.inc | 143 +-
 llvm/include/llvm/ProfileData/RawMemProfReader.h | 127 +-
 llvm/include/llvm/ProfileData/SampleProf.h | 97 +-
 llvm/include/llvm/ProfileData/SampleProfReader.h | 17 +-
 llvm/include/llvm/ProfileData/SampleProfWriter.h | 4 -
 llvm/include/llvm/Remarks/RemarkSerializer.h | 1 -
 llvm/include/llvm/Support/AArch64TargetParser.def | 55 +-
 llvm/include/llvm/Support/AMDHSAKernelDescriptor.h | 14 +-
 llvm/include/llvm/Support/ARMBuildAttributes.h | 39 +-
 llvm/include/llvm/Support/ARMTargetParser.def | 8 +-
 llvm/include/llvm/Support/ARMWinEH.h | 5 +-
 llvm/include/llvm/Support/Alignment.h | 73 +-
 llvm/include/llvm/Support/Allocator.h | 7 +-
 llvm/include/llvm/Support/BLAKE3.h | 124 +
 llvm/include/llvm/Support/Base64.h | 1 +
 llvm/include/llvm/Support/BinaryStreamArray.h | 2 +
 llvm/include/llvm/Support/BinaryStreamRef.h | 6 +-
 llvm/include/llvm/Support/BranchProbability.h | 1 +
 llvm/include/llvm/Support/CSKYAttributeParser.h | 43 +
 llvm/include/llvm/Support/CSKYAttributes.h | 95 +
 llvm/include/llvm/Support/CSKYTargetParser.def | 524 +
 llvm/include/llvm/Support/CSKYTargetParser.h | 203 +
 llvm/include/llvm/Support/Casting.h | 769 +-
 llvm/include/llvm/Support/CodeGen.h | 36 +-
 llvm/include/llvm/Support/CommandLine.h | 290 +-
 llvm/include/llvm/Support/Compiler.h | 64 +-
 llvm/include/llvm/Support/Compression.h | 4 +-
 llvm/include/llvm/Support/ConvertUTF.h | 21 +
 llvm/include/llvm/Support/CrashRecoveryContext.h | 3 +
 llvm/include/llvm/Support/Debug.h | 4 +-
 llvm/include/llvm/Support/Errno.h | 1 -
 llvm/include/llvm/Support/Error.h | 4 +-
 llvm/include/llvm/Support/ErrorHandling.h | 25 +-
 llvm/include/llvm/Support/FileUtilities.h | 21 +
 llvm/include/llvm/Support/FormatProviders.h | 2 +-
 llvm/include/llvm/Support/FormatVariadic.h | 2 +-
 llvm/include/llvm/Support/HashBuilder.h | 7 +-
 llvm/include/llvm/Support/Host.h | 1 +
 llvm/include/llvm/Support/KnownBits.h | 8 +-
 llvm/include/llvm/Support/LowLevelTypeImpl.h | 12 +
 llvm/include/llvm/Support/MD5.h | 29 +-
 llvm/include/llvm/Support/MachineValueType.h | 439 +-
 llvm/include/llvm/Support/MathExtras.h | 52 +-
 llvm/include/llvm/Support/Parallel.h | 4 +-
 llvm/include/llvm/Support/Path.h | 1 -
 llvm/include/llvm/Support/PluginLoader.h | 6 +-
 llvm/include/llvm/Support/Printable.h | 8 +-
 llvm/include/llvm/Support/Process.h | 1 -
 llvm/include/llvm/Support/Program.h | 2 +-
 llvm/include/llvm/Support/RISCVISAInfo.h | 2 +
 llvm/include/llvm/Support/RWMutex.h | 4 +-
 llvm/include/llvm/Support/SHA1.h | 13 +-
 llvm/include/llvm/Support/SHA256.h | 13 +-
 llvm/include/llvm/Support/ScopedPrinter.h | 7 +-
 llvm/include/llvm/Support/Signals.h | 1 +
 llvm/include/llvm/Support/Signposts.h | 2 +-
 llvm/include/llvm/Support/SourceMgr.h | 30 +
 llvm/include/llvm/Support/TargetOpcodes.def | 6 +
 llvm/include/llvm/Support/TargetParser.h | 14 +-
 llvm/include/llvm/Support/ThreadPool.h | 97 +-
 llvm/include/llvm/Support/Threading.h | 22 +-
 llvm/include/llvm/Support/TrigramIndex.h | 2 +-
 llvm/include/llvm/Support/TypeSize.h | 19 +
 llvm/include/llvm/Support/Unicode.h | 42 +-
 llvm/include/llvm/Support/VersionTuple.h | 14 +-
 llvm/include/llvm/Support/VirtualFileSystem.h | 140 +-
 llvm/include/llvm/Support/Win64EH.h | 36 +-
 llvm/include/llvm/Support/WithColor.h | 18 +-
 .../llvm/Support/X86DisassemblerDecoderCommon.h | 2 -
 llvm/include/llvm/Support/X86TargetParser.def | 72 +-
 llvm/include/llvm/Support/YAMLParser.h | 1 -
 llvm/include/llvm/Support/YAMLTraits.h | 67 +-
 llvm/include/llvm/Support/circular_raw_ostream.h | 11 +-
 llvm/include/llvm/Support/raw_sha1_ostream.h | 2 +-
 llvm/include/llvm/TableGen/Parser.h | 34 +
 llvm/include/llvm/TableGen/Record.h | 305 +-
 llvm/include/llvm/Target/CGPassBuilderOption.h | 1 +
 llvm/include/llvm/Target/GenericOpcodes.td | 13 +
 llvm/include/llvm/Target/GlobalISel/Combine.td | 73 +-
 llvm/include/llvm/Target/Target.td | 54 +
 .../include/llvm/Target/TargetLoweringObjectFile.h | 3 +-
 llvm/include/llvm/Target/TargetMachine.h | 22 +-
 llvm/include/llvm/Target/TargetOptions.h | 33 +-
 llvm/include/llvm/Target/TargetSelectionDAG.td | 138 +
 llvm/include/llvm/Testing/Support/SupportHelpers.h | 8 +-
 llvm/include/llvm/TextAPI/Symbol.h | 1 -
 .../AggressiveInstCombine/AggressiveInstCombine.h | 4 +-
 llvm/include/llvm/Transforms/Coroutines.h | 37 -
 .../llvm/Transforms/Coroutines/CoroCleanup.h | 4 +-
 .../Transforms/Coroutines/CoroConditionalWrapper.h | 30 +
 .../include/llvm/Transforms/Coroutines/CoroEarly.h | 4 +-
 llvm/include/llvm/Transforms/IPO.h | 7 -
 llvm/include/llvm/Transforms/IPO/AlwaysInliner.h | 4 +-
 .../llvm/Transforms/IPO/ArgumentPromotion.h | 6 +-
 llvm/include/llvm/Transforms/IPO/Attributor.h | 514 +-
 .../llvm/Transforms/IPO/DeadArgumentElimination.h | 41 +-
 .../llvm/Transforms/IPO/ForceFunctionAttrs.h | 3 +-
 llvm/include/llvm/Transforms/IPO/FunctionAttrs.h | 17 +-
 llvm/include/llvm/Transforms/IPO/GlobalDCE.h | 9 +-
 llvm/include/llvm/Transforms/IPO/IROutliner.h | 41 +-
 .../llvm/Transforms/IPO/InferFunctionAttrs.h | 4 +-
 llvm/include/llvm/Transforms/IPO/Inliner.h | 8 +-
 llvm/include/llvm/Transforms/IPO/Internalize.h | 1 -
 llvm/include/llvm/Transforms/IPO/ModuleInliner.h | 9 +-
 .../llvm/Transforms/IPO/PassManagerBuilder.h | 4 -
 .../llvm/Transforms/IPO/ProfiledCallGraph.h | 13 +-
 .../llvm/Transforms/IPO/SampleContextTracker.h | 114 +-
 llvm/include/llvm/Transforms/IPO/SampleProfile.h | 2 +-
 .../llvm/Transforms/IPO/SampleProfileProbe.h | 12 +-
 .../llvm/Transforms/IPO/StripDeadPrototypes.h | 3 +-
 .../llvm/Transforms/IPO/ThinLTOBitcodeWriter.h | 3 +-
 .../llvm/Transforms/IPO/WholeProgramDevirt.h | 5 +-
 .../llvm/Transforms/InstCombine/InstCombine.h | 1 +
 llvm/include/llvm/Transforms/Instrumentation.h | 39 +-
 .../Transforms/Instrumentation/AddressSanitizer.h | 110 +-
 .../Instrumentation/AddressSanitizerCommon.h | 45 -
 .../Instrumentation/AddressSanitizerOptions.h | 5 +-
 .../Transforms/Instrumentation/BoundsChecking.h | 3 +-
 .../llvm/Transforms/Instrumentation/CGProfile.h | 2 +-
 .../Instrumentation/ControlHeightReduction.h | 1 -
 .../Transforms/Instrumentation/DataFlowSanitizer.h | 2 +-
 .../Instrumentation/HWAddressSanitizer.h | 12 +-
 .../Transforms/Instrumentation/InstrProfiling.h | 4 +-
 .../llvm/Transforms/Instrumentation/MemProfiler.h | 7 +-
 .../Transforms/Instrumentation/MemorySanitizer.h | 11 +-
 .../Transforms/Instrumentation/SanitizerCoverage.h | 3 +-
 .../Transforms/Instrumentation/ThreadSanitizer.h | 6 +-
 llvm/include/llvm/Transforms/Scalar.h | 22 +-
 llvm/include/llvm/Transforms/Scalar/BDCE.h | 3 +-
 .../llvm/Transforms/Scalar/CallSiteSplitting.h | 3 +-
 .../llvm/Transforms/Scalar/ConstantHoisting.h | 1 -
 llvm/include/llvm/Transforms/Scalar/DCE.h | 3 +-
 .../llvm/Transforms/Scalar/DFAJumpThreading.h | 3 +-
 llvm/include/llvm/Transforms/Scalar/Float2Int.h | 11 +-
 llvm/include/llvm/Transforms/Scalar/GVN.h | 5 +-
 .../include/llvm/Transforms/Scalar/GuardWidening.h | 5 +-
 .../llvm/Transforms/Scalar/IVUsersPrinter.h | 8 +-
 .../include/llvm/Transforms/Scalar/JumpThreading.h | 6 +-
 llvm/include/llvm/Transforms/Scalar/LICM.h | 60 +-
 .../Transforms/Scalar/LoopAccessAnalysisPrinter.h | 8 +-
 .../llvm/Transforms/Scalar/LoopBoundSplit.h | 4 +-
 .../llvm/Transforms/Scalar/LoopDataPrefetch.h | 3 +-
 llvm/include/llvm/Transforms/Scalar/LoopDeletion.h | 6 +-
 llvm/include/llvm/Transforms/Scalar/LoopFlatten.h | 4 +-
 .../llvm/Transforms/Scalar/LoopInterchange.h | 5 +-
 .../llvm/Transforms/Scalar/LoopPassManager.h | 3 +-
 .../llvm/Transforms/Scalar/LoopPredication.h | 5 +-
 llvm/include/llvm/Transforms/Scalar/LoopRotation.h | 5 +-
 .../llvm/Transforms/Scalar/LoopSimplifyCFG.h | 6 +-
 llvm/include/llvm/Transforms/Scalar/LoopSink.h | 4 +-
 .../llvm/Transforms/Scalar/LoopUnrollAndJamPass.h | 4 +-
 .../llvm/Transforms/Scalar/LoopVersioningLICM.h | 4 +-
 llvm/include/llvm/Transforms/Scalar/LowerAtomic.h | 35 -
 .../llvm/Transforms/Scalar/LowerAtomicPass.h | 30 +
 .../Transforms/Scalar/LowerConstantIntrinsics.h | 3 +-
 .../llvm/Transforms/Scalar/LowerExpectIntrinsic.h | 3 +-
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h | 4 +-
 .../llvm/Transforms/Scalar/MergedLoadStoreMotion.h | 3 +-
 .../Transforms/Scalar/PartiallyInlineLibCalls.h | 2 +-
 llvm/include/llvm/Transforms/Scalar/SCCP.h | 18 +-
 .../Transforms/Scalar/ScalarizeMaskedMemIntrin.h | 2 +-
 llvm/include/llvm/Transforms/Scalar/Scalarizer.h | 21 +-
 .../llvm/Transforms/Scalar/SimpleLoopUnswitch.h | 9 +-
 llvm/include/llvm/Transforms/Scalar/Sink.h | 3 +-
 .../llvm/Transforms/Scalar/SpeculativeExecution.h | 2 +-
 .../llvm/Transforms/Scalar/TLSVariableHoist.h | 131 +
 .../Transforms/Scalar/TailRecursionElimination.h | 3 +-
 .../llvm/Transforms/Scalar/WarnMissedTransforms.h | 3 +-
 llvm/include/llvm/Transforms/Utils.h | 6 +
 .../llvm/Transforms/Utils/AssumeBundleBuilder.h | 7 +-
 .../llvm/Transforms/Utils/BasicBlockUtils.h | 17 +-
 .../llvm/Transforms/Utils/BreakCriticalEdges.h | 3 +-
 llvm/include/llvm/Transforms/Utils/BuildLibCalls.h | 69 +-
 .../llvm/Transforms/Utils/CallGraphUpdater.h | 5 +-
 .../llvm/Transforms/Utils/CallPromotionUtils.h | 10 +
 .../llvm/Transforms/Utils/CanonicalizeAliases.h | 3 +-
 .../Transforms/Utils/CanonicalizeFreezeInLoops.h | 2 +-
 llvm/include/llvm/Transforms/Utils/CodeExtractor.h | 15 +-
 llvm/include/llvm/Transforms/Utils/CtorUtils.h | 8 +-
 llvm/include/llvm/Transforms/Utils/Debugify.h | 38 +-
 .../llvm/Transforms/Utils/EscapeEnumerator.h | 5 +-
 llvm/include/llvm/Transforms/Utils/Evaluator.h | 5 +-
 .../llvm/Transforms/Utils/FunctionComparator.h | 2 +-
 llvm/include/llvm/Transforms/Utils/GlobalStatus.h | 3 +
 .../llvm/Transforms/Utils/InjectTLIMappings.h | 1 +
 llvm/include/llvm/Transforms/Utils/Local.h | 20 +-
 llvm/include/llvm/Transforms/Utils/LoopUtils.h | 45 +-
 .../include/llvm/Transforms/Utils/LoopVersioning.h | 7 +-
 llvm/include/llvm/Transforms/Utils/LowerAtomic.h | 37 +
 .../llvm/Transforms/Utils/LowerGlobalDtors.h | 28 +
 .../llvm/Transforms/Utils/LowerMemIntrinsics.h | 24 +-
 .../llvm/Transforms/Utils/MemoryTaggingSupport.h | 82 +
 llvm/include/llvm/Transforms/Utils/MisExpect.h | 77 +
 llvm/include/llvm/Transforms/Utils/ModuleUtils.h | 11 +-
 .../llvm/Transforms/Utils/NameAnonGlobals.h | 1 -
 llvm/include/llvm/Transforms/Utils/PredicateInfo.h | 2 +-
 .../Transforms/Utils/RelLookupTableConverter.h | 3 +-
 llvm/include/llvm/Transforms/Utils/SCCPSolver.h | 42 +-
 .../include/llvm/Transforms/Utils/SSAUpdaterImpl.h | 26 +
 .../llvm/Transforms/Utils/SampleProfileInference.h | 1 -
 .../Transforms/Utils/SampleProfileLoaderBaseImpl.h | 5 +-
 .../Transforms/Utils/SampleProfileLoaderBaseUtil.h | 8 +-
 .../Transforms/Utils/ScalarEvolutionExpander.h | 24 +-
 .../llvm/Transforms/Utils/SimplifyCFGOptions.h | 5 +
 .../include/llvm/Transforms/Utils/SimplifyIndVar.h | 7 +-
 .../llvm/Transforms/Utils/SimplifyLibCalls.h | 12 +-
 llvm/include/llvm/Transforms/Utils/SizeOpts.h | 1 -
 llvm/include/llvm/Transforms/Utils/SplitModule.h | 2 +-
 llvm/include/llvm/Transforms/Utils/UnrollLoop.h | 9 +-
 .../Transforms/Vectorize/LoadStoreVectorizer.h | 3 +-
 .../Vectorize/LoopVectorizationLegality.h | 31 +-
 .../llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +-
 llvm/include/llvm/WindowsDriver/MSVCPaths.h | 107 +
 llvm/include/llvm/WindowsDriver/MSVCSetupApi.h | 523 +
 llvm/include/llvm/module.modulemap | 7 +-
 llvm/lib/Analysis/AliasAnalysis.cpp | 25 +-
 llvm/lib/Analysis/AliasAnalysisEvaluator.cpp | 110 +-
 llvm/lib/Analysis/AliasSetTracker.cpp | 33 +-
 llvm/lib/Analysis/Analysis.cpp | 16 +-
 llvm/lib/Analysis/AssumeBundleQueries.cpp | 4 +-
 llvm/lib/Analysis/AssumptionCache.cpp | 4 +-
 llvm/lib/Analysis/BasicAliasAnalysis.cpp | 75 +-
 llvm/lib/Analysis/BlockFrequencyInfo.cpp | 1 -
 llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp | 5 +-
 llvm/lib/Analysis/BranchProbabilityInfo.cpp | 17 +-
 llvm/lib/Analysis/CFG.cpp | 6 +-
 llvm/lib/Analysis/CFGPrinter.cpp | 2 +-
 llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp | 4 +-
 llvm/lib/Analysis/CFLGraph.h | 4 +-
 llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp | 14 +-
 llvm/lib/Analysis/CGSCCPassManager.cpp | 39 +-
 llvm/lib/Analysis/CallGraph.cpp | 4 +-
 llvm/lib/Analysis/CallGraphSCCPass.cpp | 7 +-
 llvm/lib/Analysis/CallPrinter.cpp | 106 +-
 llvm/lib/Analysis/CaptureTracking.cpp | 331 +-
 llvm/lib/Analysis/CmpInstAnalysis.cpp | 16 +-
 llvm/lib/Analysis/CodeMetrics.cpp | 15 +-
 llvm/lib/Analysis/ConstantFolding.cpp | 266 +-
 llvm/lib/Analysis/ConstraintSystem.cpp | 1 -
 llvm/lib/Analysis/CostModel.cpp | 4 +-
 llvm/lib/Analysis/CycleAnalysis.cpp | 6 +-
 llvm/lib/Analysis/DDG.cpp | 9 +-
 llvm/lib/Analysis/DDGPrinter.cpp | 4 +-
 llvm/lib/Analysis/Delinearization.cpp | 40 +-
 llvm/lib/Analysis/DemandedBits.cpp | 6 -
 llvm/lib/Analysis/DependenceAnalysis.cpp | 102 +-
 llvm/lib/Analysis/DependenceGraphBuilder.cpp | 1 +
 llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp | 8 +-
 llvm/lib/Analysis/DivergenceAnalysis.cpp | 3 +-
 llvm/lib/Analysis/DomPrinter.cpp | 305 +-
 llvm/lib/Analysis/DomTreeUpdater.cpp | 93 +-
 llvm/lib/Analysis/DominanceFrontier.cpp | 1 -
 llvm/lib/Analysis/EHPersonalities.cpp | 6 +-
 llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp | 233 +-
 llvm/lib/Analysis/GlobalsModRef.cpp | 51 +-
 llvm/lib/Analysis/IRSimilarityIdentifier.cpp | 96 +-
 llvm/lib/Analysis/IVDescriptors.cpp | 266 +-
 llvm/lib/Analysis/IVUsers.cpp | 6 +-
 .../lib/Analysis/IndirectCallPromotionAnalysis.cpp | 13 +-
 llvm/lib/Analysis/InlineAdvisor.cpp | 102 +-
 llvm/lib/Analysis/InlineCost.cpp | 178 +-
 llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp | 25 +-
 llvm/lib/Analysis/InstructionSimplify.cpp | 1310 +-
 llvm/lib/Analysis/Interval.cpp | 1 -
 llvm/lib/Analysis/LazyCallGraph.cpp | 8 +-
 llvm/lib/Analysis/LazyValueInfo.cpp | 32 +-
 llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp | 1 +
 llvm/lib/Analysis/Lint.cpp | 220 +-
 llvm/lib/Analysis/Loads.cpp | 9 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 486 +-
 llvm/lib/Analysis/LoopAnalysisManager.cpp | 3 -
 llvm/lib/Analysis/LoopCacheAnalysis.cpp | 129 +-
 llvm/lib/Analysis/LoopInfo.cpp | 11 +-
 llvm/lib/Analysis/LoopNestAnalysis.cpp | 3 +-
 llvm/lib/Analysis/LoopPass.cpp | 8 +-
 llvm/lib/Analysis/LoopUnrollAnalyzer.cpp | 11 +-
 llvm/lib/Analysis/MLInlineAdvisor.cpp | 141 +-
 llvm/lib/Analysis/MemDepPrinter.cpp | 1 -
 llvm/lib/Analysis/MemDerefPrinter.cpp | 3 -
 llvm/lib/Analysis/MemoryBuiltins.cpp | 524 +-
 llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 64 +-
 llvm/lib/Analysis/MemoryLocation.cpp | 2 -
 llvm/lib/Analysis/MemorySSA.cpp | 36 +-
 llvm/lib/Analysis/MemorySSAUpdater.cpp | 23 +-
 llvm/lib/Analysis/ModelUnderTrainingRunner.cpp | 29 +-
 llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp | 2 +-
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 15 +-
 llvm/lib/Analysis/MustExecute.cpp | 7 +-
 llvm/lib/Analysis/NoInferenceModelRunner.cpp | 16 +-
 llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp | 2 -
 llvm/lib/Analysis/OptimizationRemarkEmitter.cpp | 4 +-
 llvm/lib/Analysis/OverflowInstAnalysis.cpp | 1 -
 llvm/lib/Analysis/PHITransAddr.cpp | 9 +-
 llvm/lib/Analysis/ProfileSummaryInfo.cpp | 13 +-
 llvm/lib/Analysis/PtrUseVisitor.cpp | 1 -
 llvm/lib/Analysis/RegionInfo.cpp | 1 +
 llvm/lib/Analysis/RegionPass.cpp | 8 +-
 llvm/lib/Analysis/RegionPrinter.cpp | 69 +-
 llvm/lib/Analysis/ReplayInlineAdvisor.cpp | 22 +-
 llvm/lib/Analysis/ScalarEvolution.cpp | 1323 +-
 llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp | 1 +
 llvm/lib/Analysis/ScalarEvolutionDivision.cpp | 2 -
 llvm/lib/Analysis/ScalarEvolutionNormalization.cpp | 1 +
 llvm/lib/Analysis/ScopedNoAliasAA.cpp | 1 -
 llvm/lib/Analysis/StackLifetime.cpp | 7 +-
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 5 +-
 llvm/lib/Analysis/StratifiedSets.h | 6 +-
 llvm/lib/Analysis/SyncDependenceAnalysis.cpp | 8 +-
 llvm/lib/Analysis/SyntheticCountsUtils.cpp | 7 +-
 llvm/lib/Analysis/TFUtils.cpp | 163 +-
 llvm/lib/Analysis/TargetLibraryInfo.cpp | 12 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp | 70 +-
 llvm/lib/Analysis/TensorSpec.cpp | 144 +
 llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 1 -
 llvm/lib/Analysis/TypeMetadataUtils.cpp | 1 -
 llvm/lib/Analysis/VFABIDemangling.cpp | 2 -
 llvm/lib/Analysis/ValueLatticeUtils.cpp | 9 +-
 llvm/lib/Analysis/ValueTracking.cpp | 509 +-
 llvm/lib/Analysis/VectorUtils.cpp | 132 +-
 llvm/lib/AsmParser/LLLexer.cpp | 94 +-
 llvm/lib/AsmParser/LLParser.cpp | 225 +-
 llvm/lib/AsmParser/Parser.cpp | 2 -
 llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp | 4 +-
 llvm/lib/BinaryFormat/COFF.cpp | 57 +
 llvm/lib/BinaryFormat/Magic.cpp | 14 +
 llvm/lib/BinaryFormat/Wasm.cpp | 29 +-
 llvm/lib/Bitcode/Reader/BitReader.cpp | 1 -
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 16 +-
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 1774 +-
 llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 90 +-
 llvm/lib/Bitcode/Reader/MetadataLoader.h | 6 +-
 llvm/lib/Bitcode/Reader/ValueList.cpp | 195 +-
 llvm/lib/Bitcode/Reader/ValueList.h | 61 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 90 +-
 llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 1 -
 llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 136 +-
 llvm/lib/Bitstream/Reader/BitstreamReader.cpp | 57 +-
 llvm/lib/CodeGen/Analysis.cpp | 3 -
 llvm/lib/CodeGen/AsmPrinter/AIXException.cpp | 19 +-
 llvm/lib/CodeGen/AsmPrinter/ARMException.cpp | 15 +-
 llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 3 +-
 llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp | 4 +-
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 460 +-
 llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 12 +-
 .../lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 11 +-
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 123 +-
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h | 70 +-
 llvm/lib/CodeGen/AsmPrinter/DIE.cpp | 11 +-
 llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp | 1 -
 .../AsmPrinter/DbgEntityHistoryCalculator.cpp | 3 +-
 llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 1 -
 llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 12 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 23 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 1 -
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 60 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 19 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 11 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp | 4 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp | 10 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 28 +-
 llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 7 +-
 llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 6 +-
 llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 12 +-
 llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp | 2 +-
 llvm/lib/CodeGen/AsmPrinter/WasmException.cpp | 2 +
 llvm/lib/CodeGen/AsmPrinter/WasmException.h | 5 +-
 llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 19 +-
 llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 58 +-
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 332 +-
 llvm/lib/CodeGen/BasicBlockSections.cpp | 181 +-
 .../CodeGen/BasicBlockSectionsProfileReader.cpp | 144 +
 llvm/lib/CodeGen/BranchFolding.cpp | 8 +-
 llvm/lib/CodeGen/BranchFolding.h | 1 -
 llvm/lib/CodeGen/BranchRelaxation.cpp | 1 -
 llvm/lib/CodeGen/BreakFalseDeps.cpp | 4 +-
 llvm/lib/CodeGen/CFIFixup.cpp | 225 +
 llvm/lib/CodeGen/CFIInstrInserter.cpp | 4 +-
 llvm/lib/CodeGen/CalcSpillWeights.cpp | 5 -
 llvm/lib/CodeGen/CallingConvLower.cpp | 16 +-
 llvm/lib/CodeGen/CodeGen.cpp | 4 +
 llvm/lib/CodeGen/CodeGenCommonISel.cpp | 34 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 192 +-
 llvm/lib/CodeGen/CommandFlags.cpp | 33 +-
 llvm/lib/CodeGen/DFAPacketizer.cpp | 2 +-
 llvm/lib/CodeGen/DeadMachineInstructionElim.cpp | 1 -
 llvm/lib/CodeGen/DetectDeadLanes.cpp | 20 +-
 llvm/lib/CodeGen/EHContGuardCatchret.cpp | 2 -
 llvm/lib/CodeGen/EarlyIfConversion.cpp | 7 +-
 llvm/lib/CodeGen/ExpandMemCmp.cpp | 14 +-
 llvm/lib/CodeGen/ExpandPostRAPseudos.cpp | 10 +-
 llvm/lib/CodeGen/ExpandReductions.cpp | 2 -
 llvm/lib/CodeGen/ExpandVectorPredication.cpp | 27 +-
 llvm/lib/CodeGen/FEntryInserter.cpp | 3 -
 llvm/lib/CodeGen/FaultMaps.cpp | 2 +-
 llvm/lib/CodeGen/FinalizeISel.cpp | 2 -
 llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp | 12 +-
 llvm/lib/CodeGen/GCMetadata.cpp | 3 -
 llvm/lib/CodeGen/GCRootLowering.cpp | 5 +-
 llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 5 +-
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 45 +-
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 10 +-
 llvm/lib/CodeGen/GlobalISel/Combiner.cpp | 8 +-
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 313 +-
 llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 41 +
 llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp | 1 -
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 58 +-
 llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp | 10 +-
 llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp | 5 +-
 .../lib/CodeGen/GlobalISel/InstructionSelector.cpp | 11 +-
 llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp | 7 +
 llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp | 21 +
 llvm/lib/CodeGen/GlobalISel/Legalizer.cpp | 6 +-
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 150 +-
 llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 10 +-
 llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp | 7 +
 llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 1 +
 llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 56 +-
 llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp | 8 +-
 llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp | 110 -
 llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 805 -
 llvm/lib/CodeGen/GlobalISel/Utils.cpp | 154 +-
 llvm/lib/CodeGen/GlobalMerge.cpp | 14 +
 llvm/lib/CodeGen/HardwareLoops.cpp | 3 -
 llvm/lib/CodeGen/IfConversion.cpp | 4 +-
 llvm/lib/CodeGen/IndirectBrExpandPass.cpp | 6 +-
 llvm/lib/CodeGen/InlineSpiller.cpp | 14 +-
 llvm/lib/CodeGen/InterferenceCache.h | 2 +-
 llvm/lib/CodeGen/InterleavedAccessPass.cpp | 2 +-
 llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp | 22 +-
 llvm/lib/CodeGen/JMCInstrumenter.cpp | 233 +
 llvm/lib/CodeGen/LLVMTargetMachine.cpp | 38 +-
 llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 3 +-
 .../CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 625 +-
 .../CodeGen/LiveDebugValues/InstrRefBasedImpl.h | 142 +-
 .../CodeGen/LiveDebugValues/LiveDebugValues.cpp | 15 +-
 llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h | 11 +-
 .../CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 22 +-
 llvm/lib/CodeGen/LiveDebugVariables.cpp | 29 +-
 llvm/lib/CodeGen/LiveInterval.cpp | 19 +-
 llvm/lib/CodeGen/LiveIntervalCalc.cpp | 11 +-
 llvm/lib/CodeGen/LiveIntervalUnion.cpp | 15 +-
 llvm/lib/CodeGen/LiveIntervals.cpp | 14 +-
 llvm/lib/CodeGen/LiveRangeCalc.cpp | 2 -
 llvm/lib/CodeGen/LiveRangeEdit.cpp | 2 +-
 llvm/lib/CodeGen/LiveRangeShrink.cpp | 1 -
 llvm/lib/CodeGen/LiveRegMatrix.cpp | 17 +-
 llvm/lib/CodeGen/LiveStacks.cpp | 5 +-
 llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 19 +-
 llvm/lib/CodeGen/LowLevelType.cpp | 1 -
 llvm/lib/CodeGen/LowerEmuTLS.cpp | 1 -
 llvm/lib/CodeGen/MIRCanonicalizerPass.cpp | 10 +-
 llvm/lib/CodeGen/MIRFSDiscriminator.cpp | 7 +-
 llvm/lib/CodeGen/MIRNamerPass.cpp | 4 -
 llvm/lib/CodeGen/MIRParser/MILexer.cpp | 3 +-
 llvm/lib/CodeGen/MIRParser/MIParser.cpp | 74 +-
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 30 +-
 llvm/lib/CodeGen/MIRPrinter.cpp | 32 +-
 llvm/lib/CodeGen/MIRSampleProfile.cpp | 8 +
 llvm/lib/CodeGen/MIRVRegNamerUtils.cpp | 1 -
 llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp | 119 +-
 llvm/lib/CodeGen/MachineBasicBlock.cpp | 27 +-
 llvm/lib/CodeGen/MachineBlockPlacement.cpp | 15 +-
 llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp | 2 -
 llvm/lib/CodeGen/MachineCSE.cpp | 6 +-
 llvm/lib/CodeGen/MachineCheckDebugify.cpp | 18 +-
 llvm/lib/CodeGen/MachineCombiner.cpp | 3 +-
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 426 +-
 llvm/lib/CodeGen/MachineCycleAnalysis.cpp | 110 +-
 llvm/lib/CodeGen/MachineDebugify.cpp | 3 -
 llvm/lib/CodeGen/MachineDominanceFrontier.cpp | 3 +-
 llvm/lib/CodeGen/MachineDominators.cpp | 2 +
 llvm/lib/CodeGen/MachineFunction.cpp | 114 +-
 llvm/lib/CodeGen/MachineFunctionPass.cpp | 1 +
 llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 9 +-
 llvm/lib/CodeGen/MachineInstr.cpp | 49 +-
 llvm/lib/CodeGen/MachineInstrBundle.cpp | 5 +-
 llvm/lib/CodeGen/MachineLICM.cpp | 20 +-
 llvm/lib/CodeGen/MachineLoopInfo.cpp | 5 +-
 llvm/lib/CodeGen/MachineLoopUtils.cpp | 20 +-
 llvm/lib/CodeGen/MachineModuleInfo.cpp | 218 +-
 llvm/lib/CodeGen/MachineOperand.cpp | 2 -
 .../CodeGen/MachineOptimizationRemarkEmitter.cpp | 4 +-
 llvm/lib/CodeGen/MachineOutliner.cpp | 53 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp | 133 +-
 llvm/lib/CodeGen/MachineRegisterInfo.cpp | 16 +-
 llvm/lib/CodeGen/MachineSSAContext.cpp | 2 +
 llvm/lib/CodeGen/MachineScheduler.cpp | 15 +-
 llvm/lib/CodeGen/MachineSink.cpp | 290 +-
 llvm/lib/CodeGen/MachineStableHash.cpp | 56 +-
 llvm/lib/CodeGen/MachineStripDebug.cpp | 4 +-
 llvm/lib/CodeGen/MachineVerifier.cpp | 86 +-
 llvm/lib/CodeGen/MacroFusion.cpp | 3 +-
 llvm/lib/CodeGen/ModuloSchedule.cpp | 88 +-
 llvm/lib/CodeGen/NonRelocatableStringpool.cpp | 4 +-
 llvm/lib/CodeGen/OptimizePHIs.cpp | 1 -
 llvm/lib/CodeGen/PHIElimination.cpp | 2 -
 llvm/lib/CodeGen/ParallelCG.cpp | 3 +-
 llvm/lib/CodeGen/PatchableFunction.cpp | 4 +-
 llvm/lib/CodeGen/PeepholeOptimizer.cpp | 16 +-
 llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 12 +-
 llvm/lib/CodeGen/PostRASchedulerList.cpp | 8 +-
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 2 -
 llvm/lib/CodeGen/ProcessImplicitDefs.cpp | 11 +-
 llvm/lib/CodeGen/PrologEpilogInserter.cpp | 184 +-
 llvm/lib/CodeGen/PseudoProbeInserter.cpp | 4 +-
 llvm/lib/CodeGen/PseudoSourceValue.cpp | 46 +-
 llvm/lib/CodeGen/RDFGraph.cpp | 16 +-
 llvm/lib/CodeGen/RDFLiveness.cpp | 6 +-
 llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 14 +-
 llvm/lib/CodeGen/RegAllocBase.cpp | 9 +-
 llvm/lib/CodeGen/RegAllocBase.h | 10 +-
 llvm/lib/CodeGen/RegAllocBasic.cpp | 33 +-
 llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp | 38 +-
 llvm/lib/CodeGen/RegAllocEvictionAdvisor.h | 48 +-
 llvm/lib/CodeGen/RegAllocFast.cpp | 36 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 564 +-
 llvm/lib/CodeGen/RegAllocGreedy.h | 187 +-
 llvm/lib/CodeGen/RegAllocPBQP.cpp | 1 +
 llvm/lib/CodeGen/RegAllocScore.cpp | 22 +-
 llvm/lib/CodeGen/RegAllocScore.h | 19 +-
 llvm/lib/CodeGen/RegUsageInfoCollector.cpp | 5 +-
 llvm/lib/CodeGen/RegUsageInfoPropagate.cpp | 3 +-
 llvm/lib/CodeGen/RegisterBank.cpp | 110 +
 llvm/lib/CodeGen/RegisterBankInfo.cpp | 802 +
 llvm/lib/CodeGen/RegisterClassInfo.cpp | 19 +-
 llvm/lib/CodeGen/RegisterCoalescer.cpp | 2 +-
 llvm/lib/CodeGen/RegisterScavenging.cpp | 2 -
 llvm/lib/CodeGen/RegisterUsageInfo.cpp | 2 -
 llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp | 3 +-
 llvm/lib/CodeGen/RenameIndependentSubregs.cpp | 2 +-
 llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 5 +-
 llvm/lib/CodeGen/SafeStack.cpp | 57 +-
 llvm/lib/CodeGen/SafeStackLayout.cpp | 1 -
 llvm/lib/CodeGen/SafeStackLayout.h | 2 +-
 llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 10 +-
 llvm/lib/CodeGen/ScheduleDAGPrinter.cpp | 5 -
 llvm/lib/CodeGen/SelectOptimize.cpp | 989 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1973 +-
llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 40 +-
.../CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 28 +-
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 22 +-
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h | 3 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 262 +-
.../CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 24 +-
.../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 388 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 92 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 13 +-
.../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 46 +-
.../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 936 +-
.../CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 14 +-
llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h | 1 +
llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 42 +-
.../lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 19 +-
.../CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 5 +-
llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 4 -
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 860 +-
.../SelectionDAG/SelectionDAGAddressAnalysis.cpp | 2 +-
.../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 491 +-
.../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 26 +-
.../CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 12 +-
llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 36 +-
.../CodeGen/SelectionDAG/SelectionDAGPrinter.cpp | 28 +-
.../CodeGen/SelectionDAG/StatepointLowering.cpp | 59 +-
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 888 +-
llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 3 +-
llvm/lib/CodeGen/SjLjEHPrepare.cpp | 2 +-
llvm/lib/CodeGen/SplitKit.cpp | 89 +-
llvm/lib/CodeGen/SplitKit.h | 23 +-
llvm/lib/CodeGen/StackColoring.cpp | 10 +-
llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp | 2 +-
llvm/lib/CodeGen/StackMaps.cpp | 4 +-
llvm/lib/CodeGen/StackProtector.cpp | 4 +-
llvm/lib/CodeGen/StackSlotColoring.cpp | 1 -
llvm/lib/CodeGen/TailDuplication.cpp | 4 +-
llvm/lib/CodeGen/TailDuplicator.cpp | 23 +-
llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 9 +-
llvm/lib/CodeGen/TargetInstrInfo.cpp | 14 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 209 +-
llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 91 +-
llvm/lib/CodeGen/TargetOptionsImpl.cpp | 1 -
llvm/lib/CodeGen/TargetPassConfig.cpp | 80 +-
llvm/lib/CodeGen/TargetRegisterInfo.cpp | 3 +-
llvm/lib/CodeGen/TargetSchedule.cpp | 1 -
llvm/lib/CodeGen/TargetSubtargetInfo.cpp | 4 -
llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 72 +-
llvm/lib/CodeGen/TypePromotion.cpp | 137 +-
llvm/lib/CodeGen/UnreachableBlockElim.cpp | 14 +-
llvm/lib/CodeGen/VLIWMachineScheduler.cpp | 10 +-
llvm/lib/CodeGen/ValueTypes.cpp | 15 +-
llvm/lib/CodeGen/WasmEHPrepare.cpp | 12 +-
llvm/lib/CodeGen/WinEHPrepare.cpp | 6 +-
llvm/lib/DWARFLinker/DWARFLinker.cpp | 150 +-
llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp | 6 +-
llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp | 1 +
llvm/lib/DWARFLinker/DWARFStreamer.cpp | 49 +-
llvm/lib/DWP/DWP.cpp | 17 +-
.../CodeView/AppendingTypeTableBuilder.cpp | 9 +-
llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp | 75 +-
llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 5 +-
llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 33 +-
.../CodeView/ContinuationRecordBuilder.cpp | 8 +-
.../DebugInfo/CodeView/DebugCrossExSubsection.cpp | 1 +
.../CodeView/DebugFrameDataSubsection.cpp | 2 +
.../CodeView/DebugInlineeLinesSubsection.cpp | 1 +
llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp | 4 +-
.../DebugInfo/CodeView/DebugSubsectionRecord.cpp | 1 -
.../DebugInfo/CodeView/DebugSubsectionVisitor.cpp | 3 +-
.../DebugInfo/CodeView/DebugSymbolsSubsection.cpp | 1 +
llvm/lib/DebugInfo/CodeView/Formatters.cpp | 4 +-
.../DebugInfo/CodeView/GlobalTypeTableBuilder.cpp | 10 +-
.../CodeView/LazyRandomTypeCollection.cpp | 5 +-
.../DebugInfo/CodeView/MergingTypeTableBuilder.cpp | 11 +-
llvm/lib/DebugInfo/CodeView/RecordName.cpp | 6 +-
.../lib/DebugInfo/CodeView/RecordSerialization.cpp | 2 +-
.../DebugInfo/CodeView/SimpleTypeSerializer.cpp | 5 +-
.../lib/DebugInfo/CodeView/StringsAndChecksums.cpp | 1 -
llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp | 4 +-
.../lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp | 2 +-
llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp | 6 +-
llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 5 +-
llvm/lib/DebugInfo/CodeView/TypeHashing.cpp | 3 +-
llvm/lib/DebugInfo/CodeView/TypeIndex.cpp | 1 +
llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 38 +-
llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 6 +-
.../lib/DebugInfo/CodeView/TypeTableCollection.cpp | 5 +-
.../DWARF/DWARFAbbreviationDeclaration.cpp | 2 +-
llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 1 -
llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp | 2 +-
llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp | 3 +-
llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 127 +-
llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp | 4 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp | 1 +
llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 15 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 13 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp | 3 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 14 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 12 +-
llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp | 7 +-
llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 648 +-
llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp | 5 +-
llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 2 +-
llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp | 608 +
llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 2 -
llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 119 +-
llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 1 +
llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 70 +-
llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp | 29 +-
llvm/lib/DebugInfo/GSYM/ExtractRanges.cpp | 79 +
llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp | 14 +-
llvm/lib/DebugInfo/GSYM/GsymCreator.cpp | 6 +-
llvm/lib/DebugInfo/GSYM/GsymReader.cpp | 2 +-
llvm/lib/DebugInfo/GSYM/InlineInfo.cpp | 16 +-
llvm/lib/DebugInfo/GSYM/LookupResult.cpp | 3 +-
llvm/lib/DebugInfo/GSYM/Range.cpp | 123 -
llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp | 1 -
.../PDB/Native/DbiModuleDescriptorBuilder.cpp | 12 +-
llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp | 1 +
llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp | 2 -
llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 13 +-
llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp | 1 +
llvm/lib/DebugInfo/PDB/Native/FormatUtil.cpp | 207 +
llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp | 9 +-
llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp | 3 +-
llvm/lib/DebugInfo/PDB/Native/HashTable.cpp | 3 -
llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp | 4 +-
.../lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp | 4 +-
.../DebugInfo/PDB/Native/InjectedSourceStream.cpp | 2 +-
llvm/lib/DebugInfo/PDB/Native/InputFile.cpp | 587 +
llvm/lib/DebugInfo/PDB/Native/LinePrinter.cpp | 340 +
.../lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp | 7 +-
llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp | 4 +-
.../DebugInfo/PDB/Native/NativeCompilandSymbol.cpp | 2 -
.../lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp | 4 +-
.../PDB/Native/NativeEnumInjectedSources.cpp | 4 +-
.../DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp | 8 +-
.../lib/DebugInfo/PDB/Native/NativeEnumModules.cpp | 5 +-
.../lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp | 4 +-
llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp | 9 +-
llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 6 +-
.../DebugInfo/PDB/Native/NativeFunctionSymbol.cpp | 10 +-
.../PDB/Native/NativeInlineSiteSymbol.cpp | 68 +-
llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp | 1 +
.../DebugInfo/PDB/Native/NativePublicSymbol.cpp | 5 +-
llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 1 -
llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp | 26 +-
llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp | 2 +
.../PDB/Native/NativeSymbolEnumerator.cpp | 4 +-
llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp | 9 +-
.../lib/DebugInfo/PDB/Native/NativeTypeBuiltin.cpp | 3 +-
llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp | 20 +-
.../DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp | 5 +-
.../lib/DebugInfo/PDB/Native/NativeTypePointer.cpp | 5 +-
.../lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp | 4 +-
llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp | 13 +-
.../lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp | 5 +-
llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp | 1 -
llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 21 +-
llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp | 1 -
.../DebugInfo/PDB/Native/PDBStringTableBuilder.cpp | 7 +-
llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp | 4 +-
llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp | 15 +-
llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp | 5 +-
llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 6 +-
llvm/lib/DebugInfo/PDB/PDB.cpp | 1 -
llvm/lib/DebugInfo/PDB/PDBContext.cpp | 9 +
llvm/lib/DebugInfo/PDB/PDBExtras.cpp | 2 +-
llvm/lib/DebugInfo/PDB/PDBSymbol.cpp | 2 +-
llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp | 3 +-
.../DebugInfo/PDB/PDBSymbolCompilandDetails.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp | 4 +-
llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp | 3 +-
llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp | 4 +-
llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp | 2 +
llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp | 4 +-
llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp | 4 +-
llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp | 4 +-
llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp | 3 +-
llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp | 3 -
.../lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp | 2 -
llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp | 10 +-
llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp | 2 -
.../lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp | 3 -
llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp | 3 -
llvm/lib/DebugInfo/PDB/UDTLayout.cpp | 3 +
llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp | 57 +
llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp | 6 +-
llvm/lib/DebugInfo/Symbolize/Markup.cpp | 202 +
llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp | 143 +
.../DebugInfo/Symbolize/SymbolizableObjectFile.cpp | 10 +-
.../DebugInfo/Symbolize/SymbolizableObjectFile.h | 103 -
llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 316 +-
llvm/lib/Debuginfod/DIFetcher.cpp | 28 +
llvm/lib/Debuginfod/Debuginfod.cpp | 63 +-
llvm/lib/Debuginfod/HTTPClient.cpp | 88 +-
llvm/lib/Demangle/Demangle.cpp | 2 +-
llvm/lib/Demangle/ItaniumDemangle.cpp | 58 +-
llvm/lib/Demangle/MicrosoftDemangle.cpp | 37 +-
llvm/lib/Demangle/MicrosoftDemangleNodes.cpp | 4 +-
llvm/lib/Demangle/RustDemangle.cpp | 58 +-
.../ExecutionEngine/GDBRegistrationListener.cpp | 5 +-
llvm/lib/ExecutionEngine/Interpreter/Interpreter.h | 2 +-
.../JITLink/DWARFRecordSectionSplitter.cpp | 117 +
.../lib/ExecutionEngine/JITLink/EHFrameSupport.cpp | 564 +-
.../ExecutionEngine/JITLink/EHFrameSupportImpl.h | 53 +-
.../JITLink/ELFLinkGraphBuilder.cpp | 2 +-
llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp | 317 +-
llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp | 72 +-
llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp | 26 +-
llvm/lib/ExecutionEngine/JITLink/JITLink.cpp | 11 +-
.../lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp | 2 +-
.../JITLink/JITLinkMemoryManager.cpp | 2 +-
.../JITLink/MachOLinkGraphBuilder.cpp | 45 +-
llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 493 +-
llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp | 7 +-
llvm/lib/ExecutionEngine/JITLink/aarch64.cpp | 52 +-
llvm/lib/ExecutionEngine/JITLink/riscv.cpp | 4 +
llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp | 1 +
llvm/lib/ExecutionEngine/MCJIT/MCJIT.h | 3 +-
llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp | 1 +
llvm/lib/ExecutionEngine/Orc/Core.cpp | 76 +-
.../Orc/DebugObjectManagerPlugin.cpp | 2 +-
llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp | 7 +
.../ExecutionEngine/Orc/DebuggerSupportPlugin.cpp | 11 +-
llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp | 88 +-
.../Orc/EPCDebugObjectRegistrar.cpp | 3 +-
.../ExecutionEngine/Orc/EPCIndirectionUtils.cpp | 11 +-
llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 8 +-
.../ExecutionEngine/Orc/ExecutorProcessControl.cpp | 4 +-
llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp | 2 +-
llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 16 +-
.../Orc/JITTargetMachineBuilder.cpp | 1 +
llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 56 +-
llvm/lib/ExecutionEngine/Orc/Layer.cpp | 4 +-
llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp | 4 +
.../ExecutionEngine/Orc/LookupAndRecordAddrs.cpp | 4 +-
llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 433 +-
llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp | 152 +
.../ExecutionEngine/Orc/ObjectFileInterface.cpp | 11 +-
.../lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp | 14 +-
llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp | 171 +
llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp | 252 +-
.../Orc/Shared/SimpleRemoteEPCUtils.cpp | 4 +-
llvm/lib/ExecutionEngine/Orc/Speculation.cpp | 4 +-
.../Orc/TargetProcess/SimpleRemoteEPCServer.cpp | 4 +-
llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp | 2 +-
.../RuntimeDyld/RTDyldMemoryManager.cpp | 8 +-
.../ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4 +-
.../RuntimeDyld/RuntimeDyldChecker.cpp | 3 +-
.../ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 9 +-
llvm/lib/ExecutionEngine/SectionMemoryManager.cpp | 2 +-
llvm/lib/FileCheck/FileCheck.cpp | 28 +-
llvm/lib/Frontend/OpenMP/OMPContext.cpp | 5 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1143 +-
llvm/lib/FuzzMutate/FuzzerCLI.cpp | 48 -
llvm/lib/FuzzMutate/IRMutator.cpp | 56 +-
llvm/lib/FuzzMutate/Operations.cpp | 17 +-
llvm/lib/FuzzMutate/RandomIRBuilder.cpp | 16 +-
llvm/lib/IR/AbstractCallSite.cpp | 1 -
llvm/lib/IR/AsmWriter.cpp | 32 +-
llvm/lib/IR/Assumptions.cpp | 1 +
llvm/lib/IR/AttributeImpl.h | 2 +
llvm/lib/IR/Attributes.cpp | 181 +-
llvm/lib/IR/AutoUpgrade.cpp | 188 +-
llvm/lib/IR/BasicBlock.cpp | 6 -
llvm/lib/IR/BuiltinGCs.cpp | 2 +-
llvm/lib/IR/ConstantFold.cpp | 36 +-
llvm/lib/IR/ConstantFold.h | 57 -
llvm/lib/IR/ConstantRange.cpp | 77 +-
llvm/lib/IR/Constants.cpp | 132 +-
llvm/lib/IR/ConstantsContext.h | 37 -
llvm/lib/IR/Core.cpp | 40 +-
llvm/lib/IR/DIBuilder.cpp | 22 +-
llvm/lib/IR/DebugInfoMetadata.cpp | 188 +-
llvm/lib/IR/DiagnosticHandler.cpp | 9 +-
llvm/lib/IR/DiagnosticInfo.cpp | 11 +
llvm/lib/IR/Dominators.cpp | 1 -
llvm/lib/IR/FPEnv.cpp | 45 +
llvm/lib/IR/Function.cpp | 123 +-
llvm/lib/IR/GVMaterializer.cpp | 2 +-
llvm/lib/IR/Globals.cpp | 25 +-
llvm/lib/IR/IRBuilder.cpp | 178 +-
llvm/lib/IR/Instruction.cpp | 5 +-
llvm/lib/IR/Instructions.cpp | 60 +-
llvm/lib/IR/IntrinsicInst.cpp | 107 +-
llvm/lib/IR/LLVMContext.cpp | 37 +-
llvm/lib/IR/LLVMContextImpl.cpp | 20 +-
llvm/lib/IR/LLVMContextImpl.h | 34 +-
llvm/lib/IR/LegacyPassManager.cpp | 14 +-
llvm/lib/IR/MDBuilder.cpp | 8 +
llvm/lib/IR/Mangler.cpp | 2 +-
llvm/lib/IR/Metadata.cpp | 174 +-
llvm/lib/IR/Module.cpp | 33 +-
llvm/lib/IR/Pass.cpp | 10 +
llvm/lib/IR/ReplaceConstant.cpp | 1 +
llvm/lib/IR/SafepointIRVerifier.cpp | 11 +
llvm/lib/IR/Use.cpp | 4 -
llvm/lib/IR/User.cpp | 12 +-
llvm/lib/IR/Value.cpp | 17 +-
llvm/lib/IR/VectorBuilder.cpp | 103 +
llvm/lib/IR/Verifier.cpp | 3888 ++--
llvm/lib/InterfaceStub/ELFObjHandler.cpp | 139 +-
llvm/lib/InterfaceStub/IFSHandler.cpp | 48 +-
llvm/lib/InterfaceStub/IFSStub.cpp | 2 +-
llvm/lib/LTO/LTO.cpp | 106 +-
llvm/lib/LTO/LTOBackend.cpp | 46 +-
llvm/lib/LTO/LTOCodeGenerator.cpp | 6 +-
llvm/lib/LTO/LTOModule.cpp | 2 +-
llvm/lib/LTO/SummaryBasedOptimizations.cpp | 2 +-
llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 51 +-
llvm/lib/LineEditor/LineEditor.cpp | 4 +-
llvm/lib/Linker/IRMover.cpp | 69 +-
llvm/lib/Linker/LinkModules.cpp | 13 +-
llvm/lib/MC/ConstantPools.cpp | 25 +-
llvm/lib/MC/ELFObjectWriter.cpp | 50 +-
llvm/lib/MC/MCAsmBackend.cpp | 10 +-
llvm/lib/MC/MCAsmInfo.cpp | 5 +-
llvm/lib/MC/MCAsmStreamer.cpp | 204 +-
llvm/lib/MC/MCAssembler.cpp | 15 +-
llvm/lib/MC/MCCodeView.cpp | 7 +-
llvm/lib/MC/MCContext.cpp | 90 +-
llvm/lib/MC/MCDXContainerStreamer.cpp | 31 +
llvm/lib/MC/MCDXContainerWriter.cpp | 143 +
llvm/lib/MC/MCDisassembler/Disassembler.cpp | 1 -
llvm/lib/MC/MCDisassembler/Disassembler.h | 2 +-
llvm/lib/MC/MCDisassembler/MCDisassembler.cpp | 17 +-
.../lib/MC/MCDisassembler/MCExternalSymbolizer.cpp | 16 +-
llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp | 2 +-
llvm/lib/MC/MCDwarf.cpp | 55 +-
llvm/lib/MC/MCELFStreamer.cpp | 25 +-
llvm/lib/MC/MCExpr.cpp | 6 +-
llvm/lib/MC/MCFragment.cpp | 2 +-
llvm/lib/MC/MCInstPrinter.cpp | 1 +
llvm/lib/MC/MCInstrAnalysis.cpp | 7 +-
llvm/lib/MC/MCInstrDesc.cpp | 1 -
llvm/lib/MC/MCMachOStreamer.cpp | 33 +-
llvm/lib/MC/MCNullStreamer.cpp | 18 +-
llvm/lib/MC/MCObjectFileInfo.cpp | 52 +-
llvm/lib/MC/MCObjectStreamer.cpp | 10 +-
llvm/lib/MC/MCObjectWriter.cpp | 4 +-
llvm/lib/MC/MCParser/AsmLexer.cpp | 8 +-
llvm/lib/MC/MCParser/AsmParser.cpp | 83 +-
llvm/lib/MC/MCParser/COFFAsmParser.cpp | 45 +-
llvm/lib/MC/MCParser/COFFMasmParser.cpp | 27 +-
llvm/lib/MC/MCParser/DarwinAsmParser.cpp | 19 +-
llvm/lib/MC/MCParser/ELFAsmParser.cpp | 33 +-
llvm/lib/MC/MCParser/GOFFAsmParser.cpp | 11 +-
llvm/lib/MC/MCParser/MCAsmLexer.cpp | 1 -
llvm/lib/MC/MCParser/MCAsmParser.cpp | 2 +-
llvm/lib/MC/MCParser/MCAsmParserExtension.cpp | 2 +
llvm/lib/MC/MCParser/MasmParser.cpp | 139 +-
llvm/lib/MC/MCParser/WasmAsmParser.cpp | 15 +-
llvm/lib/MC/MCParser/XCOFFAsmParser.cpp | 9 +-
llvm/lib/MC/MCPseudoProbe.cpp | 176 +-
llvm/lib/MC/MCRegisterInfo.cpp | 11 +
llvm/lib/MC/MCSPIRVStreamer.cpp | 45 +
llvm/lib/MC/MCSchedule.cpp | 4 +-
llvm/lib/MC/MCSection.cpp | 2 +-
llvm/lib/MC/MCSectionCOFF.cpp | 12 +-
llvm/lib/MC/MCSectionDXContainer.cpp | 15 +
llvm/lib/MC/MCSectionELF.cpp | 15 +-
llvm/lib/MC/MCSectionMachO.cpp | 17 +-
llvm/lib/MC/MCSectionWasm.cpp | 5 +-
llvm/lib/MC/MCSectionXCOFF.cpp | 10 +-
llvm/lib/MC/MCStreamer.cpp | 117 +-
llvm/lib/MC/MCSymbol.cpp | 1 -
llvm/lib/MC/MCSymbolELF.cpp | 1 -
llvm/lib/MC/MCTargetOptions.cpp | 9 +-
llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 19 +-
llvm/lib/MC/MCWasmStreamer.cpp | 18 +-
llvm/lib/MC/MCWin64EH.cpp | 1320 +-
llvm/lib/MC/MCWinCOFFStreamer.cpp | 43 +-
llvm/lib/MC/MCWinEH.cpp | 9 +-
llvm/lib/MC/MCXCOFFStreamer.cpp | 5 +
llvm/lib/MC/MachObjectWriter.cpp | 25 +-
llvm/lib/MC/SPIRVObjectWriter.cpp | 76 +
llvm/lib/MC/SubtargetFeature.cpp | 4 -
llvm/lib/MC/TargetRegistry.cpp | 4 +-
llvm/lib/MC/WasmObjectWriter.cpp | 135 +-
llvm/lib/MC/WinCOFFObjectWriter.cpp | 51 +-
llvm/lib/MC/XCOFFObjectWriter.cpp | 480 +-
llvm/lib/MCA/CustomBehaviour.cpp | 2 +-
llvm/lib/MCA/HardwareUnits/LSUnit.cpp | 32 +-
llvm/lib/MCA/IncrementalSourceMgr.cpp | 51 +
llvm/lib/MCA/InstrBuilder.cpp | 96 +-
llvm/lib/MCA/Instruction.cpp | 12 +
llvm/lib/MCA/Pipeline.cpp | 15 +-
llvm/lib/MCA/Stages/DispatchStage.cpp | 6 +-
llvm/lib/MCA/Stages/EntryStage.cpp | 23 +-
llvm/lib/MCA/Stages/ExecuteStage.cpp | 4 +-
llvm/lib/MCA/Stages/InOrderIssueStage.cpp | 11 +-
llvm/lib/MCA/Stages/Stage.cpp | 1 +
llvm/lib/ObjCopy/Archive.cpp | 110 +
llvm/lib/ObjCopy/Archive.h | 31 +
llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp | 311 +
llvm/lib/ObjCopy/COFF/COFFObject.cpp | 132 +
llvm/lib/ObjCopy/COFF/COFFObject.h | 212 +
llvm/lib/ObjCopy/COFF/COFFReader.cpp | 226 +
llvm/lib/ObjCopy/COFF/COFFReader.h | 41 +
llvm/lib/ObjCopy/COFF/COFFWriter.cpp | 466 +
llvm/lib/ObjCopy/COFF/COFFWriter.h | 63 +
llvm/lib/ObjCopy/CommonConfig.cpp | 50 +
llvm/lib/ObjCopy/ConfigManager.cpp | 97 +
llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 821 +
llvm/lib/ObjCopy/ELF/ELFObject.cpp | 2795 +++
llvm/lib/ObjCopy/ELF/ELFObject.h | 1108 +
llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp | 441 +
llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h | 97 +
llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp | 550 +
llvm/lib/ObjCopy/MachO/MachOObject.cpp | 214 +
llvm/lib/ObjCopy/MachO/MachOObject.h | 374 +
llvm/lib/ObjCopy/MachO/MachOReader.cpp | 374 +
llvm/lib/ObjCopy/MachO/MachOReader.h | 62 +
llvm/lib/ObjCopy/MachO/MachOWriter.cpp | 662 +
llvm/lib/ObjCopy/MachO/MachOWriter.h | 76 +
llvm/lib/ObjCopy/ObjCopy.cpp | 90 +
llvm/lib/ObjCopy/XCOFF/XCOFFObjcopy.cpp | 45 +
llvm/lib/ObjCopy/XCOFF/XCOFFObject.h | 48 +
llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp | 101 +
llvm/lib/ObjCopy/XCOFF/XCOFFReader.h | 35 +
llvm/lib/ObjCopy/XCOFF/XCOFFWriter.cpp | 125 +
llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h | 48 +
llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp | 160 +
llvm/lib/ObjCopy/wasm/WasmObject.cpp | 34 +
llvm/lib/ObjCopy/wasm/WasmObject.h | 47 +
llvm/lib/ObjCopy/wasm/WasmReader.cpp | 39 +
llvm/lib/ObjCopy/wasm/WasmReader.h | 31 +
llvm/lib/ObjCopy/wasm/WasmWriter.cpp | 79 +
llvm/lib/ObjCopy/wasm/WasmWriter.h | 49 +
llvm/lib/Object/Archive.cpp | 18 +-
llvm/lib/Object/ArchiveWriter.cpp | 234 +-
llvm/lib/Object/Binary.cpp | 7 +-
llvm/lib/Object/COFFImportFile.cpp | 4 +
llvm/lib/Object/COFFModuleDefinition.cpp | 2 -
llvm/lib/Object/COFFObjectFile.cpp | 161 +-
llvm/lib/Object/DXContainer.cpp | 111 +
llvm/lib/Object/Decompressor.cpp | 2 +-
llvm/lib/Object/ELF.cpp | 29 +-
llvm/lib/Object/ELFObjectFile.cpp | 87 +-
llvm/lib/Object/Error.cpp | 2 +
llvm/lib/Object/IRObjectFile.cpp | 16 +-
llvm/lib/Object/IRSymtab.cpp | 1 -
llvm/lib/Object/MachOObjectFile.cpp | 229 +-
llvm/lib/Object/MachOUniversal.cpp | 6 +-
llvm/lib/Object/MachOUniversalWriter.cpp | 12 +-
llvm/lib/Object/ModuleSymbolTable.cpp | 3 -
llvm/lib/Object/Object.cpp | 2 +
llvm/lib/Object/ObjectFile.cpp | 15 +-
llvm/lib/Object/OffloadBinary.cpp | 164 +
llvm/lib/Object/RecordStreamer.h | 8 +-
llvm/lib/Object/RelocationResolver.cpp | 45 +
llvm/lib/Object/SymbolicFile.cpp | 9 +-
llvm/lib/Object/TapiFile.cpp | 6 +-
llvm/lib/Object/TapiUniversal.cpp | 5 +-
llvm/lib/Object/WasmObjectFile.cpp | 123 +-
llvm/lib/Object/WindowsResource.cpp | 2 -
llvm/lib/Object/XCOFFObjectFile.cpp | 27 +-
llvm/lib/ObjectYAML/COFFEmitter.cpp | 7 +-
llvm/lib/ObjectYAML/COFFYAML.cpp | 3 +
llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 1 +
llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp | 5 +-
llvm/lib/ObjectYAML/DWARFEmitter.cpp | 6 +-
llvm/lib/ObjectYAML/DWARFYAML.cpp | 2 +-
llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 190 +
llvm/lib/ObjectYAML/DXContainerYAML.cpp | 61 +
llvm/lib/ObjectYAML/ELFEmitter.cpp | 43 +-
llvm/lib/ObjectYAML/ELFYAML.cpp | 48 +-
llvm/lib/ObjectYAML/MachOEmitter.cpp | 22 +-
llvm/lib/ObjectYAML/MachOYAML.cpp | 9 +-
llvm/lib/ObjectYAML/MinidumpEmitter.cpp | 2 +-
llvm/lib/ObjectYAML/ObjectYAML.cpp | 7 +
llvm/lib/ObjectYAML/OffloadEmitter.cpp | 68 +
llvm/lib/ObjectYAML/OffloadYAML.cpp | 78 +
llvm/lib/ObjectYAML/WasmEmitter.cpp | 62 +-
llvm/lib/ObjectYAML/WasmYAML.cpp | 69 +-
llvm/lib/ObjectYAML/XCOFFEmitter.cpp | 162 +-
llvm/lib/ObjectYAML/yaml2obj.cpp | 4 +
llvm/lib/Option/ArgList.cpp | 7 +
llvm/lib/Passes/PassBuilder.cpp | 41 +-
llvm/lib/Passes/PassBuilderPipelines.cpp | 229 +-
llvm/lib/Passes/PassRegistry.def | 54 +-
llvm/lib/Passes/StandardInstrumentations.cpp | 84 +-
llvm/lib/ProfileData/Coverage/CoverageMapping.cpp | 12 +-
.../ProfileData/Coverage/CoverageMappingReader.cpp | 4 +-
.../ProfileData/Coverage/CoverageMappingWriter.cpp | 8 +-
llvm/lib/ProfileData/GCOV.cpp | 8 +-
llvm/lib/ProfileData/InstrProf.cpp | 81 +-
llvm/lib/ProfileData/InstrProfCorrelator.cpp | 11 +-
llvm/lib/ProfileData/InstrProfReader.cpp | 146 +-
llvm/lib/ProfileData/InstrProfWriter.cpp | 143 +-
llvm/lib/ProfileData/MemProf.cpp | 110 +
llvm/lib/ProfileData/ProfileSummaryBuilder.cpp | 32 +-
llvm/lib/ProfileData/RawMemProfReader.cpp | 543 +-
llvm/lib/ProfileData/SampleProf.cpp | 32 +-
llvm/lib/ProfileData/SampleProfReader.cpp | 31 +-
llvm/lib/ProfileData/SampleProfWriter.cpp | 30 +-
llvm/lib/Remarks/BitstreamRemarkSerializer.cpp | 1 +
llvm/lib/Remarks/RemarkLinker.cpp | 7 +-
llvm/lib/Remarks/RemarkParser.cpp | 2 +-
llvm/lib/Remarks/YAMLRemarkSerializer.cpp | 10 +-
llvm/lib/Support/AArch64TargetParser.cpp | 64 +-
llvm/lib/Support/APFixedPoint.cpp | 20 +-
llvm/lib/Support/APFloat.cpp | 9 +-
llvm/lib/Support/APInt.cpp | 126 +-
llvm/lib/Support/ARMAttributeParser.cpp | 2 +-
llvm/lib/Support/ARMWinEH.cpp | 21 +-
llvm/lib/Support/AddressRanges.cpp | 59 +
llvm/lib/Support/BLAKE3/LICENSE | 330 +
llvm/lib/Support/BLAKE3/README.md | 296 +
llvm/lib/Support/BLAKE3/blake3.c | 627 +
llvm/lib/Support/BLAKE3/blake3_avx2.c | 326 +
llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S | 1826 ++
.../BLAKE3/blake3_avx2_x86-64_windows_gnu.S | 1817 ++
.../BLAKE3/blake3_avx2_x86-64_windows_msvc.asm | 1828 ++
llvm/lib/Support/BLAKE3/blake3_avx512.c | 1207 ++
.../lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S | 2601 +++
.../BLAKE3/blake3_avx512_x86-64_windows_gnu.S | 2615 +++
.../BLAKE3/blake3_avx512_x86-64_windows_msvc.asm | 2634 +++
llvm/lib/Support/BLAKE3/blake3_dispatch.c | 277 +
llvm/lib/Support/BLAKE3/blake3_impl.h | 312 +
llvm/lib/Support/BLAKE3/blake3_neon.c | 356 +
llvm/lib/Support/BLAKE3/blake3_portable.c | 160 +
llvm/lib/Support/BLAKE3/blake3_sse2.c | 566 +
llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S | 2307 ++
.../BLAKE3/blake3_sse2_x86-64_windows_gnu.S | 2332 +++
.../BLAKE3/blake3_sse2_x86-64_windows_msvc.asm | 2350 +++
llvm/lib/Support/BLAKE3/blake3_sse41.c | 560 +
llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S | 2044 ++
.../BLAKE3/blake3_sse41_x86-64_windows_gnu.S | 2069 ++
.../BLAKE3/blake3_sse41_x86-64_windows_msvc.asm | 2089 ++
llvm/lib/Support/BinaryStreamWriter.cpp | 10 +-
llvm/lib/Support/CSKYAttributeParser.cpp | 155 +
llvm/lib/Support/CSKYAttributes.cpp | 33 +
llvm/lib/Support/CSKYTargetParser.cpp | 181 +
llvm/lib/Support/CodeGenCoverage.cpp | 2 +-
llvm/lib/Support/CommandLine.cpp | 94 +-
llvm/lib/Support/Compression.cpp | 12 +-
llvm/lib/Support/ConvertUTFWrapper.cpp | 102 +-
llvm/lib/Support/CrashRecoveryContext.cpp | 20 +-
llvm/lib/Support/Debug.cpp | 2 +-
llvm/lib/Support/DebugCounter.cpp | 3 +-
llvm/lib/Support/DeltaAlgorithm.cpp | 3 +-
llvm/lib/Support/DynamicLibrary.cpp | 7 +-
llvm/lib/Support/Errno.cpp | 3 +-
llvm/lib/Support/ErrorHandling.cpp | 5 +-
llvm/lib/Support/FileUtilities.cpp | 66 +
llvm/lib/Support/FoldingSet.cpp | 48 -
llvm/lib/Support/FormatVariadic.cpp | 2 +-
llvm/lib/Support/Host.cpp | 115 +-
llvm/lib/Support/ItaniumManglingCanonicalizer.cpp | 14 -
llvm/lib/Support/JSON.cpp | 20 +-
llvm/lib/Support/KnownBits.cpp | 12 +-
llvm/lib/Support/LineIterator.cpp | 2 +-
llvm/lib/Support/MD5.cpp | 14 +-
llvm/lib/Support/MathExtras.cpp | 2 +-
llvm/lib/Support/Memory.cpp | 1 -
llvm/lib/Support/MemoryBuffer.cpp | 13 +-
llvm/lib/Support/NativeFormatting.cpp | 10 +-
llvm/lib/Support/Parallel.cpp | 10 +-
llvm/lib/Support/Path.cpp | 16 +-
llvm/lib/Support/Process.cpp | 2 +-
llvm/lib/Support/Program.cpp | 1 -
llvm/lib/Support/RISCVISAInfo.cpp | 106 +-
llvm/lib/Support/SHA1.cpp | 21 +-
llvm/lib/Support/SHA256.cpp | 21 +-
llvm/lib/Support/ScopedPrinter.cpp | 9 +-
llvm/lib/Support/Signals.cpp | 18 +-
llvm/lib/Support/Signposts.cpp | 5 +-
llvm/lib/Support/SourceMgr.cpp | 16 +-
llvm/lib/Support/SpecialCaseList.cpp | 2 +-
llvm/lib/Support/Statistic.cpp | 6 +-
llvm/lib/Support/StringMap.cpp | 76 +-
llvm/lib/Support/StringRef.cpp | 7 +
llvm/lib/Support/TargetParser.cpp | 27 +-
llvm/lib/Support/ThreadPool.cpp | 171 +-
llvm/lib/Support/TrigramIndex.cpp | 1 +
llvm/lib/Support/Triple.cpp | 194 +-
llvm/lib/Support/TypeSize.cpp | 5 +-
llvm/lib/Support/Unicode.cpp | 452 +-
llvm/lib/Support/UnicodeNameToCodepoint.cpp | 551 +
.../Support/UnicodeNameToCodepointGenerated.cpp | 20911 +++++++++++++++++++
llvm/lib/Support/Unix/COM.inc | 2 +-
llvm/lib/Support/Unix/Memory.inc | 1 +
llvm/lib/Support/Unix/Path.inc | 24 +-
llvm/lib/Support/Unix/Process.inc | 39 +-
llvm/lib/Support/Unix/Signals.inc | 12 +-
llvm/lib/Support/Unix/ThreadLocal.inc | 12 -
llvm/lib/Support/Unix/Threading.inc | 37 +-
llvm/lib/Support/VirtualFileSystem.cpp | 592 +-
llvm/lib/Support/Windows/Path.inc | 2 +-
llvm/lib/Support/Windows/Process.inc | 10 +-
llvm/lib/Support/Windows/Program.inc | 3 +-
llvm/lib/Support/Windows/Signals.inc | 54 +-
llvm/lib/Support/Windows/Threading.inc | 8 +-
llvm/lib/Support/WithColor.cpp | 20 +-
llvm/lib/Support/YAMLParser.cpp | 71 +-
llvm/lib/Support/Z3Solver.cpp | 8 +-
llvm/lib/Support/raw_ostream.cpp | 4 +-
llvm/lib/Support/regcomp.c | 26 +-
llvm/lib/Support/regengine.inc | 39 +-
llvm/lib/Support/xxhash.cpp | 1 -
llvm/lib/TableGen/Error.cpp | 4 +-
llvm/lib/TableGen/Parser.cpp | 39 +
llvm/lib/TableGen/Record.cpp | 493 +-
llvm/lib/TableGen/TGLexer.cpp | 7 +-
llvm/lib/TableGen/TGLexer.h | 5 +-
llvm/lib/TableGen/TGParser.cpp | 247 +-
llvm/lib/TableGen/TGParser.h | 2 +-
llvm/lib/Target/AArch64/AArch64.h | 3 +
llvm/lib/Target/AArch64/AArch64.td | 113 +-
llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp | 1 +
llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 49 +-
.../lib/Target/AArch64/AArch64CallingConvention.td | 6 +-
llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 6 +-
llvm/lib/Target/AArch64/AArch64Combine.td | 4 +-
.../Target/AArch64/AArch64ConditionalCompares.cpp | 4 +-
.../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 34 +
llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64FastISel.cpp | 11 +-
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 1098 +-
llvm/lib/Target/AArch64/AArch64FrameLowering.h | 19 +-
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 294 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4083 ++--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 83 +-
llvm/lib/Target/AArch64/AArch64InstrAtomics.td | 37 +-
llvm/lib/Target/AArch64/AArch64InstrFormats.td | 257 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 608 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 56 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 876 +-
.../Target/AArch64/AArch64LoadStoreOptimizer.cpp | 201 +-
llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 236 +-
.../Target/AArch64/AArch64MachineFunctionInfo.cpp | 49 +-
.../Target/AArch64/AArch64MachineFunctionInfo.h | 30 +-
.../lib/Target/AArch64/AArch64MachineScheduler.cpp | 82 +
llvm/lib/Target/AArch64/AArch64MachineScheduler.h | 33 +
llvm/lib/Target/AArch64/AArch64MacroFusion.cpp | 15 +-
llvm/lib/Target/AArch64/AArch64PerfectShuffle.h | 13169 ++++++------
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 114 +-
llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 5 +-
llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 51 +-
llvm/lib/Target/AArch64/AArch64SLSHardening.cpp | 4 +-
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 73 +-
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 583 +-
llvm/lib/Target/AArch64/AArch64SchedA55.td | 127 +-
llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 12 +-
llvm/lib/Target/AArch64/AArch64SchedAmpere1.td | 1136 +
llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td | 25 +
llvm/lib/Target/AArch64/AArch64SchedPredExynos.td | 5 +-
llvm/lib/Target/AArch64/AArch64SchedPredicates.td | 149 +-
llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 3 +-
.../lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 34 +-
llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h | 2 +-
llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 203 +-
.../Target/AArch64/AArch64StackTaggingPreRA.cpp | 1 -
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 37 +-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 365 +-
llvm/lib/Target/AArch64/AArch64SystemOperands.td | 12 +-
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 36 +-
llvm/lib/Target/AArch64/AArch64TargetMachine.h | 2 +-
.../Target/AArch64/AArch64TargetTransformInfo.cpp | 383 +-
.../Target/AArch64/AArch64TargetTransformInfo.h | 24 +-
.../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 134 +-
.../AArch64/Disassembler/AArch64Disassembler.cpp | 580 +-
.../AArch64/Disassembler/AArch64Disassembler.h | 8 +-
.../Disassembler/AArch64ExternalSymbolizer.cpp | 6 +-
.../Disassembler/AArch64ExternalSymbolizer.h | 3 +-
.../Target/AArch64/GISel/AArch64CallLowering.cpp | 38 +-
.../AArch64/GISel/AArch64InstructionSelector.cpp | 590 +-
.../Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 12 +-
.../AArch64/GISel/AArch64PostLegalizerCombiner.cpp | 6 +-
.../AArch64/GISel/AArch64PostLegalizerLowering.cpp | 2 +-
.../AArch64/GISel/AArch64PostSelectOptimize.cpp | 2 +-
.../AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 8 +-
.../AArch64/GISel/AArch64RegisterBankInfo.cpp | 9 +-
.../Target/AArch64/GISel/AArch64RegisterBankInfo.h | 2 +-
.../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 26 +-
.../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 1 +
.../AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 7 +-
.../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 2 +-
.../Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 1 +
.../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 38 +-
.../AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 11 +-
.../AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 4 +-
.../MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 1 +
.../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 31 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 538 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 378 +-
llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 10 +-
llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 3 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 13 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 280 +-
.../Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 144 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 95 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3 +
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 6 +
llvm/lib/Target/AMDGPU/AMDGPUAttributes.def | 31 +
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 266 +-
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 50 +-
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 66 +-
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 3 +-
llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp | 2 +-
.../Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp | 64 -
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 29 +-
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 11 +
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 4 +-
.../Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 91 +-
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 439 +
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 22 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 253 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 17 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 401 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 11 +-
llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 457 +
.../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 78 +-
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 6 +-
.../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 770 +-
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 31 +-
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 158 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 824 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 17 +
llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 11 +-
llvm/lib/Target/AMDGPU/AMDGPULibFunc.h | 4 +-
llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 4 -
.../Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 7 +-
.../Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 38 +-
.../lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 27 +-
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 15 +-
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp | 9 +-
llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h | 2 +-
.../Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 50 +-
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 20 +-
llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 1 +
.../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 3 +-
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 18 +-
.../Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 2 +-
.../Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 12 +-
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 215 +-
.../Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp | 64 +-
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 663 +-
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 7 +-
llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp | 140 +
.../AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 4 +-
.../Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 26 +-
.../Target/AMDGPU/AMDGPUResourceUsageAnalysis.h | 12 +-
.../Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 152 +-
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 168 +-
llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp | 166 +
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 158 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 42 +-
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 88 +-
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 6 +-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 54 +-
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 23 +-
llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 1638 --
llvm/lib/Target/AMDGPU/AMDKernelCodeT.h | 2 +-
.../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1146 +-
llvm/lib/Target/AMDGPU/BUFInstructions.td | 891 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 546 +-
.../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 470 +-
.../AMDGPU/Disassembler/AMDGPUDisassembler.h | 87 +-
llvm/lib/Target/AMDGPU/EXPInstructions.td | 79 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 1038 +-
llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 18 +-
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 901 +-
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 25 +
llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 29 +-
llvm/lib/Target/AMDGPU/GCNProcessors.td | 28 +
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 2 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 356 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 36 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 349 +-
llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 116 +
.../Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 6 +-
llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h | 4 +-
.../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 29 +-
.../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 5 +-
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 257 +-
.../Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 17 +-
.../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 56 +-
.../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2 +
.../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 2 -
.../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 38 +-
.../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 4 +-
.../Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h | 1 -
.../Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 165 +-
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 618 +-
llvm/lib/Target/AMDGPU/R600.h | 2 +-
llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp | 4 +-
llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp | 3 +-
.../lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 1 +
llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 5 +-
llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 2 +
llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 183 +-
llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 19 +-
llvm/lib/Target/AMDGPU/R600InstrInfo.h | 3 -
.../Target/AMDGPU/R600MachineCFGStructurizer.cpp | 1640 ++
llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 2 +-
llvm/lib/Target/AMDGPU/R600Subtarget.cpp | 2 -
llvm/lib/Target/AMDGPU/R600Subtarget.h | 16 +-
llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 4 +-
llvm/lib/Target/AMDGPU/R600TargetMachine.h | 4 +-
llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 58 +-
llvm/lib/Target/AMDGPU/SIDefines.h | 196 +-
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 189 +-
llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 230 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 3 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1927 +-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 24 +-
llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 77 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 354 +-
llvm/lib/Target/AMDGPU/SIInstrFormats.td | 83 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 667 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 68 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 625 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 244 +-
llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 12 +-
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 842 +-
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 42 +-
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 33 +-
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 16 +-
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 126 +-
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 179 +-
llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 58 +-
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 484 +-
llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 17 +-
llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 251 +-
.../Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 110 +-
llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 125 +-
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12 +-
llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 21 +-
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 28 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 603 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 28 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 127 +-
llvm/lib/Target/AMDGPU/SISchedule.td | 65 +
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 435 +-
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 42 +-
llvm/lib/Target/AMDGPU/SMInstructions.td | 410 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 425 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 314 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 56 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 686 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 133 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp | 144 -
llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h | 38 -
llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 220 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 51 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 5 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 4 +
llvm/lib/Target/AMDGPU/VIInstrFormats.td | 2 +-
llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 180 +
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 380 +-
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 626 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 453 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 671 +-
llvm/lib/Target/AMDGPU/VOPCInstructions.td | 873 +-
llvm/lib/Target/AMDGPU/VOPDInstructions.td | 159 +
llvm/lib/Target/AMDGPU/VOPInstructions.td | 658 +-
llvm/lib/Target/ARC/ARCMachineFunctionInfo.cpp | 7 +
llvm/lib/Target/ARC/ARCMachineFunctionInfo.h | 6 +-
llvm/lib/Target/ARC/ARCOptAddrMode.cpp | 8 +-
llvm/lib/Target/ARC/ARCTargetMachine.cpp | 4 +-
llvm/lib/Target/ARC/ARCTargetMachine.h | 2 +-
.../Target/ARC/Disassembler/ARCDisassembler.cpp | 78 +-
llvm/lib/Target/ARM/A15SDOptimizer.cpp | 3 +-
llvm/lib/Target/ARM/ARM.h | 2 +
llvm/lib/Target/ARM/ARM.td | 163 +-
llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 143 +-
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 227 +-
llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 29 +-
llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 26 +-
llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 39 +-
llvm/lib/Target/ARM/ARMBlockPlacement.cpp | 3 +-
llvm/lib/Target/ARM/ARMCallingConv.td | 21 +-
llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 2 +-
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 17 +-
llvm/lib/Target/ARM/ARMFastISel.cpp | 45 +-
.../Target/ARM/ARMFixCortexA57AES1742098Pass.cpp | 432 +
llvm/lib/Target/ARM/ARMFrameLowering.cpp | 846 +-
llvm/lib/Target/ARM/ARMFrameLowering.h | 1 +
llvm/lib/Target/ARM/ARMHazardRecognizer.cpp | 2 +
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 35 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 413 +-
llvm/lib/Target/ARM/ARMISelLowering.h | 12 +-
llvm/lib/Target/ARM/ARMInstrFormats.td | 26 +-
llvm/lib/Target/ARM/ARMInstrInfo.td | 27 +-
llvm/lib/Target/ARM/ARMInstrMVE.td | 89 +-
llvm/lib/Target/ARM/ARMInstrNEON.td | 3 +
llvm/lib/Target/ARM/ARMInstrThumb2.td | 7 +-
llvm/lib/Target/ARM/ARMInstrVFP.td | 96 +-
llvm/lib/Target/ARM/ARMInstructionSelector.cpp | 16 +-
llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 1 +
llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 12 +-
llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 4 +-
llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp | 7 +
llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 8 +
llvm/lib/Target/ARM/ARMParallelDSP.cpp | 5 +
llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp | 7 +-
llvm/lib/Target/ARM/ARMRegisterBankInfo.h | 2 +-
llvm/lib/Target/ARM/ARMRegisterInfo.cpp | 2 +-
llvm/lib/Target/ARM/ARMSLSHardening.cpp | 4 +-
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 9 +-
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 1 +
llvm/lib/Target/ARM/ARMSubtarget.cpp | 43 +-
llvm/lib/Target/ARM/ARMSubtarget.h | 476 +-
llvm/lib/Target/ARM/ARMTargetMachine.cpp | 28 +-
llvm/lib/Target/ARM/ARMTargetMachine.h | 2 +-
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 50 +-
llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 +-
llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 317 +-
.../Target/ARM/Disassembler/ARMDisassembler.cpp | 1287 +-
.../lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 132 +-
llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 4 +-
.../Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 2 -
.../Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 12 -
llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 4 +-
.../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 6 +-
.../Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 29 +-
.../Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 227 +
llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 57 +-
llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp | 7 +-
.../Target/ARM/MVETPAndVPTOptimisationsPass.cpp | 14 +-
llvm/lib/Target/ARM/MVEVPTBlockPass.cpp | 3 +-
llvm/lib/Target/ARM/Thumb1FrameLowering.cpp | 758 +-
llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp | 6 +-
llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 6 +-
llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 32 +-
llvm/lib/Target/ARM/ThumbRegisterInfo.cpp | 50 +-
llvm/lib/Target/AVR/AVR.h | 4 +-
llvm/lib/Target/AVR/AVRAsmPrinter.cpp | 43 +
llvm/lib/Target/AVR/AVRCallingConv.td | 4 +
llvm/lib/Target/AVR/AVRDevices.td | 165 +-
llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp | 446 +-
llvm/lib/Target/AVR/AVRFrameLowering.cpp | 123 +-
llvm/lib/Target/AVR/AVRISelLowering.cpp | 160 +-
llvm/lib/Target/AVR/AVRISelLowering.h | 3 +
llvm/lib/Target/AVR/AVRInstrFormats.td | 4 +-
llvm/lib/Target/AVR/AVRInstrInfo.cpp | 23 +-
llvm/lib/Target/AVR/AVRInstrInfo.td | 97 +-
llvm/lib/Target/AVR/AVRMachineFunctionInfo.h | 7 +
llvm/lib/Target/AVR/AVRRegisterInfo.cpp | 34 +-
llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp | 144 -
llvm/lib/Target/AVR/AVRSubtarget.h | 12 +-
llvm/lib/Target/AVR/AVRTargetMachine.cpp | 4 +-
llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 17 +
.../Target/AVR/Disassembler/AVRDisassembler.cpp | 200 +-
.../Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp | 2 +-
.../lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp | 1 +
llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h | 3 +
.../Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp | 1 -
llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h | 2 +-
llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h | 1 -
llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 1 +
llvm/lib/Target/BPF/BPF.h | 2 +
llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp | 50 +-
llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 10 +-
llvm/lib/Target/BPF/BPFCORE.h | 2 +
llvm/lib/Target/BPF/BPFISelLowering.cpp | 3 +-
llvm/lib/Target/BPF/BPFInstrFormats.td | 1 +
llvm/lib/Target/BPF/BPFInstrInfo.cpp | 3 +-
llvm/lib/Target/BPF/BPFInstrInfo.td | 2 +
llvm/lib/Target/BPF/BPFMIChecking.cpp | 1 +
llvm/lib/Target/BPF/BPFMIPeephole.cpp | 7 +-
llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp | 24 +-
llvm/lib/Target/BPF/BPFPreserveDIType.cpp | 1 +
llvm/lib/Target/BPF/BPFTargetMachine.cpp | 4 +-
llvm/lib/Target/BPF/BPFTargetMachine.h | 2 +-
llvm/lib/Target/BPF/BPFTargetTransformInfo.h | 9 +
llvm/lib/Target/BPF/BTF.def | 1 +
llvm/lib/Target/BPF/BTF.h | 10 +
llvm/lib/Target/BPF/BTFDebug.cpp | 197 +-
llvm/lib/Target/BPF/BTFDebug.h | 26 +-
.../Target/BPF/Disassembler/BPFDisassembler.cpp | 16 +-
llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 5 +
.../lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp | 1 +
llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h | 2 -
.../Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 6 +-
llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h | 3 +-
llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp | 271 +-
llvm/lib/Target/CSKY/CSKY.h | 2 +
llvm/lib/Target/CSKY/CSKY.td | 523 +
llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp | 105 +-
llvm/lib/Target/CSKY/CSKYAsmPrinter.h | 14 +-
llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp | 6 +-
llvm/lib/Target/CSKY/CSKYFrameLowering.cpp | 23 +-
llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp | 219 +
llvm/lib/Target/CSKY/CSKYISelLowering.cpp | 180 +-
llvm/lib/Target/CSKY/CSKYISelLowering.h | 6 +
llvm/lib/Target/CSKY/CSKYInstrAlias.td | 38 +
llvm/lib/Target/CSKY/CSKYInstrFormats.td | 2 +-
llvm/lib/Target/CSKY/CSKYInstrInfo.cpp | 9 +-
llvm/lib/Target/CSKY/CSKYInstrInfo.h | 2 +-
llvm/lib/Target/CSKY/CSKYInstrInfo.td | 32 +-
llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td | 131 +
llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h | 11 +-
llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp | 20 +-
llvm/lib/Target/CSKY/CSKYRegisterInfo.td | 14 +-
llvm/lib/Target/CSKY/CSKYSubtarget.cpp | 33 +-
llvm/lib/Target/CSKY/CSKYSubtarget.h | 102 +-
llvm/lib/Target/CSKY/CSKYTargetMachine.cpp | 12 +-
llvm/lib/Target/CSKY/CSKYTargetObjectFile.cpp | 25 +
llvm/lib/Target/CSKY/CSKYTargetObjectFile.h | 24 +
.../Target/CSKY/Disassembler/CSKYDisassembler.cpp | 553 +
.../Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp | 184 +-
llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h | 13 +
.../CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 110 +-
.../Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp | 335 +
.../lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h | 148 +
.../Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp | 68 +-
.../Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp | 161 +-
.../Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h | 12 +
llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp | 2 +
.../Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp | 88 +
.../Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h | 4 +-
.../CSKY/MCTargetDesc/CSKYTargetStreamer.cpp | 143 +
.../Target/CSKY/MCTargetDesc/CSKYTargetStreamer.h | 110 +
llvm/lib/Target/DirectX/DXIL.td | 144 +
llvm/lib/Target/DirectX/DXILConstants.h | 25 +
llvm/lib/Target/DirectX/DXILOpLowering.cpp | 265 +
llvm/lib/Target/DirectX/DXILPointerType.cpp | 66 +
llvm/lib/Target/DirectX/DXILPointerType.h | 52 +
llvm/lib/Target/DirectX/DXILPrepare.cpp | 184 +
llvm/lib/Target/DirectX/DXILStubs.td | 18 +
llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp | 121 +
.../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 2963 +++
.../Target/DirectX/DXILWriter/DXILBitcodeWriter.h | 82 +
.../DirectX/DXILWriter/DXILValueEnumerator.cpp | 1147 +
.../DirectX/DXILWriter/DXILValueEnumerator.h | 308 +
.../Target/DirectX/DXILWriter/DXILWriterPass.cpp | 100 +
.../lib/Target/DirectX/DXILWriter/DXILWriterPass.h | 37 +
llvm/lib/Target/DirectX/DirectX.h | 43 +
llvm/lib/Target/DirectX/DirectX.td | 54 +
llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp | 57 +
llvm/lib/Target/DirectX/DirectXFrameLowering.h | 35 +
llvm/lib/Target/DirectX/DirectXInstrInfo.cpp | 20 +
llvm/lib/Target/DirectX/DirectXInstrInfo.h | 30 +
llvm/lib/Target/DirectX/DirectXRegisterInfo.cpp | 24 +
llvm/lib/Target/DirectX/DirectXRegisterInfo.h | 28 +
llvm/lib/Target/DirectX/DirectXSubtarget.cpp | 29 +
llvm/lib/Target/DirectX/DirectXSubtarget.h | 56 +
llvm/lib/Target/DirectX/DirectXTargetLowering.h | 31 +
llvm/lib/Target/DirectX/DirectXTargetMachine.cpp | 144 +
llvm/lib/Target/DirectX/DirectXTargetMachine.h | 51 +
.../Target/DirectX/DirectXTargetTransformInfo.h | 39 +
.../MCTargetDesc/DirectXContainerObjectWriter.cpp | 28 +
.../MCTargetDesc/DirectXContainerObjectWriter.h | 24 +
.../DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp | 152 +
.../DirectX/MCTargetDesc/DirectXMCTargetDesc.h | 29 +
llvm/lib/Target/DirectX/PointerTypeAnalysis.cpp | 119 +
llvm/lib/Target/DirectX/PointerTypeAnalysis.h | 43 +
.../DirectX/TargetInfo/DirectXTargetInfo.cpp | 30 +
.../Target/DirectX/TargetInfo/DirectXTargetInfo.h | 18 +
.../Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 8 +-
llvm/lib/Target/Hexagon/BitTracker.cpp | 3 +-
.../Hexagon/Disassembler/HexagonDisassembler.cpp | 160 +-
llvm/lib/Target/Hexagon/HexagonArch.h | 31 -
llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp | 8 +-
llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 84 +-
.../lib/Target/Hexagon/HexagonBranchRelaxation.cpp | 5 +-
llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp | 7 +-
llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp | 12 +-
.../lib/Target/Hexagon/HexagonConstPropagation.cpp | 22 +-
llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp | 18 +-
llvm/lib/Target/Hexagon/HexagonDepArch.h | 88 +-
llvm/lib/Target/Hexagon/HexagonDepDecoders.inc | 44 +-
llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 4 +-
llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp | 61 +-
llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 51 +-
llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp | 10 +-
.../lib/Target/Hexagon/HexagonHazardRecognizer.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 11 +-
llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 101 +-
llvm/lib/Target/Hexagon/HexagonISelLowering.h | 2 +-
llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 145 +-
llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 65 +-
.../Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 13 +-
.../Target/Hexagon/HexagonMachineFunctionInfo.cpp | 6 +
.../Target/Hexagon/HexagonMachineFunctionInfo.h | 4 +
llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp | 3 +-
llvm/lib/Target/Hexagon/HexagonPatterns.td | 6 +
llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 19 +-
llvm/lib/Target/Hexagon/HexagonPeephole.cpp | 28 +-
llvm/lib/Target/Hexagon/HexagonPseudo.td | 22 +
llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonSubtarget.cpp | 58 +-
llvm/lib/Target/Hexagon/HexagonSubtarget.h | 2 +-
llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 75 +-
llvm/lib/Target/Hexagon/HexagonTargetMachine.h | 2 +-
.../lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 7 +-
.../Target/Hexagon/HexagonTargetTransformInfo.cpp | 3 +-
.../Target/Hexagon/HexagonTargetTransformInfo.h | 10 +-
llvm/lib/Target/Hexagon/HexagonVExtract.cpp | 12 +-
llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 25 +-
llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 4 +-
.../Hexagon/HexagonVectorLoopCarriedReuse.cpp | 6 +-
.../Target/Hexagon/HexagonVectorLoopCarriedReuse.h | 2 +-
llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp | 6 +-
.../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 1 +
.../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 1 -
.../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 5 +-
.../Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp | 1 -
.../Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 4 +-
.../Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp | 1 +
.../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 25 +-
.../Hexagon/MCTargetDesc/HexagonMCTargetDesc.h | 1 -
.../Hexagon/MCTargetDesc/HexagonShuffler.cpp | 4 +-
llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 4 +-
.../Lanai/Disassembler/LanaiDisassembler.cpp | 45 +-
llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 6 +-
llvm/lib/Target/Lanai/LanaiInstrInfo.cpp | 4 +-
llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp | 7 +
llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h | 4 +
llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 4 +-
llvm/lib/Target/Lanai/LanaiTargetMachine.h | 2 +-
.../Target/Lanai/MCTargetDesc/LanaiInstPrinter.h | 3 +-
.../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 1 -
.../Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h | 1 -
.../LoongArch/AsmParser/LoongArchAsmParser.cpp | 556 +
.../Disassembler/LoongArchDisassembler.cpp | 145 +
llvm/lib/Target/LoongArch/LoongArch.h | 38 +
llvm/lib/Target/LoongArch/LoongArch.td | 139 +
llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp | 48 +
llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h | 46 +
llvm/lib/Target/LoongArch/LoongArchCallingConv.td | 23 +
.../Target/LoongArch/LoongArchFloat32InstrInfo.td | 177 +
.../Target/LoongArch/LoongArchFloat64InstrInfo.td | 188 +
.../Target/LoongArch/LoongArchFloatInstrFormats.td | 241 +
.../Target/LoongArch/LoongArchFrameLowering.cpp | 55 +
llvm/lib/Target/LoongArch/LoongArchFrameLowering.h | 38 +
.../lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp | 132 +
llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h | 55 +
.../lib/Target/LoongArch/LoongArchISelLowering.cpp | 531 +
llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 95 +
llvm/lib/Target/LoongArch/LoongArchInstrFormats.td | 404 +
llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 49 +
llvm/lib/Target/LoongArch/LoongArchInstrInfo.h | 36 +
llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 730 +
llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp | 66 +
.../LoongArch/LoongArchMachineFunctionInfo.h | 57 +
.../lib/Target/LoongArch/LoongArchRegisterInfo.cpp | 115 +
llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h | 50 +
llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td | 161 +
llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp | 54 +
llvm/lib/Target/LoongArch/LoongArchSubtarget.h | 89 +
.../Target/LoongArch/LoongArchTargetMachine.cpp | 118 +
llvm/lib/Target/LoongArch/LoongArchTargetMachine.h | 46 +
.../LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 68 +
.../LoongArch/MCTargetDesc/LoongArchAsmBackend.h | 63 +
.../LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp | 40 +
.../LoongArch/MCTargetDesc/LoongArchBaseInfo.h | 44 +
.../MCTargetDesc/LoongArchELFObjectWriter.cpp | 64 +
.../MCTargetDesc/LoongArchInstPrinter.cpp | 63 +
.../LoongArch/MCTargetDesc/LoongArchInstPrinter.h | 49 +
.../LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp | 34 +
.../LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h | 30 +
.../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 127 +
.../MCTargetDesc/LoongArchMCTargetDesc.cpp | 114 +
.../LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h | 54 +
.../LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 51 +
.../LoongArch/MCTargetDesc/LoongArchMatInt.h | 30 +
.../LoongArch/TargetInfo/LoongArchTargetInfo.cpp | 30 +
.../LoongArch/TargetInfo/LoongArchTargetInfo.h | 21 +
llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp | 1 +
.../Target/M68k/Disassembler/M68kDisassembler.cpp | 618 +-
llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp | 98 +-
llvm/lib/Target/M68k/GISel/M68kCallLowering.h | 12 +
.../lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp | 4 +-
llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h | 2 +-
llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp | 2 +-
llvm/lib/Target/M68k/M68kExpandPseudo.cpp | 2 +-
llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp | 30 +-
llvm/lib/Target/M68k/M68kISelLowering.cpp | 107 +-
llvm/lib/Target/M68k/M68kISelLowering.h | 2 +
llvm/lib/Target/M68k/M68kInstrArithmetic.td | 717 +-
llvm/lib/Target/M68k/M68kInstrBits.td | 75 +-
llvm/lib/Target/M68k/M68kInstrControl.td | 166 +-
llvm/lib/Target/M68k/M68kInstrData.td | 653 +-
llvm/lib/Target/M68k/M68kInstrFormats.td | 136 +
llvm/lib/Target/M68k/M68kInstrInfo.cpp | 53 +-
llvm/lib/Target/M68k/M68kInstrInfo.td | 106 +-
llvm/lib/Target/M68k/M68kInstrShiftRotate.td | 54 +-
llvm/lib/Target/M68k/M68kMachineFunction.cpp | 7 +
llvm/lib/Target/M68k/M68kMachineFunction.h | 9 +-
llvm/lib/Target/M68k/M68kRegisterInfo.cpp | 1 +
llvm/lib/Target/M68k/M68kRegisterInfo.h | 8 +
llvm/lib/Target/M68k/M68kSubtarget.h | 2 +-
.../Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp | 452 +-
.../Target/M68k/MCTargetDesc/M68kMCTargetDesc.h | 1 -
.../Target/MSP430/AsmParser/MSP430AsmParser.cpp | 1 +
.../MSP430/Disassembler/MSP430Disassembler.cpp | 14 +- .../MSP430/MCTargetDesc/MSP430AsmBackend.cpp | 2 +- .../MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp | 2 +- .../MSP430/MCTargetDesc/MSP430ELFStreamer.cpp | 3 +- .../MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp | 3 +- .../MSP430/MCTargetDesc/MSP430MCTargetDesc.h | 1 - llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp | 4 +- llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 3 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 34 +- llvm/lib/Target/MSP430/MSP430InstrInfo.cpp | 3 +- .../Target/MSP430/MSP430MachineFunctionInfo.cpp | 7 + llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h | 5 + llvm/lib/Target/MSP430/MSP430TargetMachine.cpp | 6 +- llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 17 +- .../Target/Mips/Disassembler/MipsDisassembler.cpp | 866 +- .../Target/Mips/MCTargetDesc/MipsABIFlagsSection.h | 2 +- llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp | 2 + .../Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 13 + .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 2 + .../Target/Mips/MCTargetDesc/MipsELFStreamer.cpp | 4 +- .../lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h | 2 +- .../Target/Mips/MCTargetDesc/MipsInstPrinter.cpp | 162 +- .../lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h | 50 +- .../Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 2 - .../Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 2 - .../Target/Mips/MCTargetDesc/MipsOptionRecord.cpp | 8 +- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 36 +- llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td | 4 + llvm/lib/Target/Mips/MicroMipsInstrFPU.td | 28 +- llvm/lib/Target/Mips/MicroMipsInstrInfo.td | 5 + llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp | 2 +- llvm/lib/Target/Mips/Mips.h | 2 + llvm/lib/Target/Mips/Mips.td | 6 + llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 2 +- llvm/lib/Target/Mips/Mips16RegisterInfo.cpp | 2 +- llvm/lib/Target/Mips/Mips32r6InstrInfo.td | 2 + llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 28 +- llvm/lib/Target/Mips/MipsBranchExpansion.cpp | 44 +- llvm/lib/Target/Mips/MipsCallLowering.cpp | 4 +- llvm/lib/Target/Mips/MipsCombine.td | 15 + llvm/lib/Target/Mips/MipsConstantIslandPass.cpp | 6 +- llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp | 2 +- llvm/lib/Target/Mips/MipsExpandPseudo.cpp | 2 +- llvm/lib/Target/Mips/MipsFastISel.cpp | 18 +- llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp | 2 +- llvm/lib/Target/Mips/MipsISelLowering.cpp | 98 +- llvm/lib/Target/Mips/MipsISelLowering.h | 10 +- llvm/lib/Target/Mips/MipsInstrInfo.cpp | 44 +- llvm/lib/Target/Mips/MipsInstrInfo.h | 13 + llvm/lib/Target/Mips/MipsInstrInfo.td | 4 + llvm/lib/Target/Mips/MipsLegalizerInfo.cpp | 4 +- llvm/lib/Target/Mips/MipsMachineFunction.cpp | 9 +- llvm/lib/Target/Mips/MipsMachineFunction.h | 5 + llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp | 3 +- llvm/lib/Target/Mips/MipsOptimizePICCall.cpp | 4 +- llvm/lib/Target/Mips/MipsOs16.cpp | 1 + llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp | 148 + llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp | 4 +- llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp | 8 +- llvm/lib/Target/Mips/MipsRegisterBankInfo.h | 2 +- llvm/lib/Target/Mips/MipsSEFrameLowering.cpp | 2 +- llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 36 +- llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 24 +- llvm/lib/Target/Mips/MipsSERegisterInfo.cpp | 2 +- llvm/lib/Target/Mips/MipsScheduleGeneric.td | 8 +- llvm/lib/Target/Mips/MipsSubtarget.cpp | 15 +- llvm/lib/Target/Mips/MipsSubtarget.h | 11 +- llvm/lib/Target/Mips/MipsTargetMachine.cpp | 15 +- llvm/lib/Target/Mips/MipsTargetMachine.h | 2 +- 
llvm/lib/Target/Mips/MipsTargetStreamer.h | 2 +- llvm/lib/Target/Mips/MipsTargetTransformInfo.cpp | 17 + llvm/lib/Target/Mips/MipsTargetTransformInfo.h | 40 + .../Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 11 + .../NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 133 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 5 +- llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 17 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 45 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 520 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 21 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 53 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 352 +- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 3 +- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 119 +- llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h | 7 + llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp | 8 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 65 +- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 12 +- llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 63 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 98 +- llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp | 1 + .../Target/PowerPC/GISel/PPCRegisterBankInfo.cpp | 3 +- .../lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h | 4 +- .../Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 2 + .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 2 + .../Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp | 6 +- .../Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 4 + .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 11 +- llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 13 +- .../Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 1 - .../PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 23 +- .../PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp | 1 + llvm/lib/Target/PowerPC/P10InstrResources.td | 8 +- llvm/lib/Target/PowerPC/P9InstrResources.td | 10 +- llvm/lib/Target/PowerPC/PPC.h | 8 +- llvm/lib/Target/PowerPC/PPC.td | 12 +- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 88 +- llvm/lib/Target/PowerPC/PPCBack2BackFusion.def | 2 + llvm/lib/Target/PowerPC/PPCCTRLoops.cpp | 421 +- llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp | 185 + llvm/lib/Target/PowerPC/PPCCallingConv.td | 22 + llvm/lib/Target/PowerPC/PPCFastISel.cpp | 2 +- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 42 +- .../lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp | 149 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 14 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 485 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 43 +- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 19 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 91 +- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 99 + llvm/lib/Target/PowerPC/PPCInstrInfo.td | 543 +- llvm/lib/Target/PowerPC/PPCInstrMMA.td | 628 + llvm/lib/Target/PowerPC/PPCInstrP10.td | 2315 ++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 2889 --- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 76 +- llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 1 - llvm/lib/Target/PowerPC/PPCMCInstLower.cpp | 2 +- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 3 +- llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 7 + llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h | 5 + llvm/lib/Target/PowerPC/PPCMacroFusion.cpp | 5 +- 
llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 37 + llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 171 +- llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 2 + llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 655 +- llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td | 106 + llvm/lib/Target/PowerPC/PPCScheduleP10.td | 2 +- llvm/lib/Target/PowerPC/PPCScheduleP9.td | 3 +- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 2 + llvm/lib/Target/PowerPC/PPCSubtarget.h | 4 +- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 34 +- llvm/lib/Target/PowerPC/PPCTargetMachine.h | 2 +- llvm/lib/Target/PowerPC/PPCTargetStreamer.h | 1 + llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 20 +- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 7 +- llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 2 + llvm/lib/Target/PowerPC/README_P9.txt | 9 +- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 172 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 118 +- .../Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 10 +- .../Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 6 +- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 25 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 57 +- .../RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 2 +- .../Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp | 101 +- .../Target/RISCV/MCTargetDesc/RISCVELFStreamer.h | 5 + .../Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 10 +- .../Target/RISCV/MCTargetDesc/RISCVInstPrinter.h | 4 +- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 8 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 1 + .../RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp | 1 + .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 6 +- .../Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h | 1 - llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 179 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h | 10 + .../RISCV/MCTargetDesc/RISCVTargetStreamer.cpp | 16 +- .../RISCV/MCTargetDesc/RISCVTargetStreamer.h | 6 + llvm/lib/Target/RISCV/RISCV.h | 11 +- llvm/lib/Target/RISCV/RISCV.td | 108 +- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 21 +- llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 11 +- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 468 +- llvm/lib/Target/RISCV/RISCVFrameLowering.h | 5 +- .../Target/RISCV/RISCVGatherScatterLowering.cpp | 26 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 970 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 20 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4026 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.h | 115 +- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 1772 +- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 5 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 206 +- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 26 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 325 +- llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 30 +- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 239 +- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 327 +- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 8 +- llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 57 +- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 987 +- llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 575 +- llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 1227 +- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 264 +- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 245 +- llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td | 71 + llvm/lib/Target/RISCV/RISCVMCInstLower.cpp | 13 +- llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.cpp | 37 + llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h | 35 + llvm/lib/Target/RISCV/RISCVMacroFusion.cpp | 67 + 
llvm/lib/Target/RISCV/RISCVMacroFusion.h | 28 + llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp | 382 + llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 280 +- .../Target/RISCV/RISCVRedundantCopyElimination.cpp | 179 + llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp | 7 +- llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h | 2 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 15 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 36 + llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp | 275 +- llvm/lib/Target/RISCV/RISCVSchedRocket.td | 5 + llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 5 + llvm/lib/Target/RISCV/RISCVScheduleB.td | 206 + llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 58 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 100 +- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 84 +- llvm/lib/Target/RISCV/RISCVTargetMachine.h | 10 +- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 231 +- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 106 +- .../Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp | 63 + .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 1072 + llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 739 + .../Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 556 + .../Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h | 94 + .../Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.cpp | 34 + .../lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.h | 29 + .../SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp | 132 + .../SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp | 102 + .../Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h | 52 + .../SPIRV/MCTargetDesc/SPIRVObjectTargetWriter.cpp | 25 + .../SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp | 18 + .../SPIRV/MCTargetDesc/SPIRVTargetStreamer.h | 28 + llvm/lib/Target/SPIRV/SPIRV.h | 34 + llvm/lib/Target/SPIRV/SPIRV.td | 43 + llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 348 + llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 223 + llvm/lib/Target/SPIRV/SPIRVCallLowering.h | 50 + llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 433 + llvm/lib/Target/SPIRV/SPIRVEnums.td | 51 + llvm/lib/Target/SPIRV/SPIRVFrameLowering.h | 39 + llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 459 + llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 174 + llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 45 + llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 47 + llvm/lib/Target/SPIRV/SPIRVInstrFormats.td | 31 + llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 195 + llvm/lib/Target/SPIRV/SPIRVInstrInfo.h | 54 + llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 732 + llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 1268 ++ llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 301 + llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h | 36 + llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp | 58 + llvm/lib/Target/SPIRV/SPIRVMCInstLower.h | 29 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 250 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 137 + llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 440 + llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp | 47 + llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.h | 38 + llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td | 15 + llvm/lib/Target/SPIRV/SPIRVRegisterInfo.cpp | 32 + llvm/lib/Target/SPIRV/SPIRVRegisterInfo.h | 36 + llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td | 39 + llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 68 + llvm/lib/Target/SPIRV/SPIRVSubtarget.h | 93 + llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 186 + llvm/lib/Target/SPIRV/SPIRVTargetMachine.h | 47 + llvm/lib/Target/SPIRV/SPIRVTargetObjectFile.h | 45 + llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h | 44 + llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 207 + 
llvm/lib/Target/SPIRV/SPIRVUtils.h | 83 + .../Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp | 28 + llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.h | 21 + llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 124 +- llvm/lib/Target/Sparc/DelaySlotFiller.cpp | 11 +- .../Sparc/Disassembler/SparcDisassembler.cpp | 185 +- .../Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 23 +- .../Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 7 +- .../Target/Sparc/MCTargetDesc/SparcFixupKinds.h | 12 + .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 15 +- llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 16 + llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h | 7 +- .../Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h | 1 - llvm/lib/Target/Sparc/SparcCallingConv.td | 2 +- llvm/lib/Target/Sparc/SparcFrameLowering.cpp | 31 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 172 +- llvm/lib/Target/Sparc/SparcISelLowering.h | 10 +- llvm/lib/Target/Sparc/SparcInstr64Bit.td | 23 +- llvm/lib/Target/Sparc/SparcInstrInfo.td | 123 +- llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp | 7 + llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h | 5 + llvm/lib/Target/Sparc/SparcTargetMachine.cpp | 4 +- llvm/lib/Target/Sparc/SparcTargetObjectFile.h | 2 +- .../Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 12 +- .../SystemZ/Disassembler/SystemZDisassembler.cpp | 139 +- .../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 10 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 3 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.h | 1 - llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 325 +- llvm/lib/Target/SystemZ/SystemZAsmPrinter.h | 21 +- llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp | 2 +- llvm/lib/Target/SystemZ/SystemZElimCompare.cpp | 16 +- llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 90 +- llvm/lib/Target/SystemZ/SystemZFrameLowering.h | 3 + llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 9 +- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 443 +- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 42 +- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 49 +- llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 9 +- llvm/lib/Target/SystemZ/SystemZInstrInfo.td | 32 +- llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp | 2 +- .../Target/SystemZ/SystemZMachineFunctionInfo.cpp | 6 + .../Target/SystemZ/SystemZMachineFunctionInfo.h | 5 + llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp | 3 +- llvm/lib/Target/SystemZ/SystemZProcessors.td | 3 +- llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp | 5 +- llvm/lib/Target/SystemZ/SystemZRegisterInfo.h | 6 +- llvm/lib/Target/SystemZ/SystemZSchedule.td | 4 +- llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 6 +- llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 6 +- llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 6 +- llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 1728 ++ llvm/lib/Target/SystemZ/SystemZScheduleZ196.td | 6 +- llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td | 6 +- .../lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 2 +- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 2 +- llvm/lib/Target/SystemZ/SystemZShortenInst.cpp | 14 +- llvm/lib/Target/SystemZ/SystemZSubtarget.cpp | 20 +- llvm/lib/Target/SystemZ/SystemZSubtarget.h | 4 +- llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp | 12 +- llvm/lib/Target/SystemZ/SystemZTargetMachine.h | 2 +- llvm/lib/Target/SystemZ/SystemZTargetStreamer.h | 1 + .../Target/SystemZ/SystemZTargetTransformInfo.cpp | 49 +- .../Target/SystemZ/SystemZTargetTransformInfo.h | 8 +- llvm/lib/Target/TargetIntrinsicInfo.cpp | 8 +- llvm/lib/Target/TargetLoweringObjectFile.cpp 
| 2 - llvm/lib/Target/TargetMachine.cpp | 17 +- llvm/lib/Target/TargetMachineC.cpp | 8 +- llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp | 1 + llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp | 126 +- .../Target/VE/MCTargetDesc/VEELFObjectWriter.cpp | 2 +- llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h | 16 +- .../lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp | 1 - llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp | 1 + llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h | 3 +- llvm/lib/Target/VE/VE.h | 4 +- llvm/lib/Target/VE/VECustomDAG.cpp | 514 +- llvm/lib/Target/VE/VECustomDAG.h | 144 + llvm/lib/Target/VE/VEISelDAGToDAG.cpp | 37 + llvm/lib/Target/VE/VEISelLowering.cpp | 281 +- llvm/lib/Target/VE/VEISelLowering.h | 29 +- llvm/lib/Target/VE/VEInstrInfo.cpp | 7 +- llvm/lib/Target/VE/VEInstrInfo.td | 50 +- llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td | 54 + llvm/lib/Target/VE/VEInstrIntrinsicVL.td | 3 - llvm/lib/Target/VE/VEInstrPatternsVec.td | 43 + llvm/lib/Target/VE/VEMachineFunctionInfo.cpp | 7 + llvm/lib/Target/VE/VEMachineFunctionInfo.h | 5 + llvm/lib/Target/VE/VERegisterInfo.td | 4 +- llvm/lib/Target/VE/VETargetMachine.cpp | 7 +- llvm/lib/Target/VE/VETargetMachine.h | 2 +- llvm/lib/Target/VE/VETargetTransformInfo.h | 66 + llvm/lib/Target/VE/VVPISelLowering.cpp | 443 + llvm/lib/Target/VE/VVPInstrInfo.td | 111 +- llvm/lib/Target/VE/VVPInstrPatternsVec.td | 358 + llvm/lib/Target/VE/VVPNodes.def | 89 +- .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp | 18 +- .../AsmParser/WebAssemblyAsmTypeCheck.cpp | 95 +- .../AsmParser/WebAssemblyAsmTypeCheck.h | 9 +- .../Disassembler/WebAssemblyDisassembler.cpp | 2 +- .../MCTargetDesc/WebAssemblyMCAsmInfo.cpp | 2 - .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 1 - .../MCTargetDesc/WebAssemblyTargetStreamer.cpp | 6 - .../MCTargetDesc/WebAssemblyTargetStreamer.h | 5 - .../WebAssembly/Utils/WebAssemblyTypeUtilities.h | 4 + llvm/lib/Target/WebAssembly/WebAssembly.h | 4 - llvm/lib/Target/WebAssembly/WebAssembly.td | 4 + .../Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 207 +- .../lib/Target/WebAssembly/WebAssemblyAsmPrinter.h | 4 +- .../Target/WebAssembly/WebAssemblyCFGStackify.cpp | 2 +- .../WebAssembly/WebAssemblyExceptionInfo.cpp | 1 + .../WebAssembly/WebAssemblyFixBrTableDefaults.cpp | 2 +- .../WebAssemblyFixIrreducibleControlFlow.cpp | 54 +- .../Target/WebAssembly/WebAssemblyISelLowering.cpp | 46 +- .../Target/WebAssembly/WebAssemblyISelLowering.h | 4 + .../Target/WebAssembly/WebAssemblyInstrAtomics.td | 22 +- .../Target/WebAssembly/WebAssemblyInstrFormats.td | 16 +- .../lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 16 +- .../Target/WebAssembly/WebAssemblyInstrMemory.td | 8 +- llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td | 6 + .../lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 131 +- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 4 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 74 +- .../WebAssembly/WebAssemblyLowerGlobalDtors.cpp | 210 - .../WebAssembly/WebAssemblyMCLowerPrePass.cpp | 3 + .../WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 14 +- .../WebAssembly/WebAssemblyMachineFunctionInfo.h | 13 +- .../WebAssemblyNullifyDebugValueLists.cpp | 1 + .../WebAssemblyOptimizeLiveIntervals.cpp | 7 +- .../WebAssemblyPrepareForLiveIntervals.cpp | 126 - .../WebAssembly/WebAssemblyReplacePhysRegs.cpp | 3 - .../WebAssembly/WebAssemblySelectionDAGInfo.cpp | 2 +- .../WebAssembly/WebAssemblySelectionDAGInfo.h | 1 + llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h | 1 + .../WebAssembly/WebAssemblyTargetMachine.cpp | 42 +- 
.../Target/WebAssembly/WebAssemblyTargetMachine.h | 2 +- .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 4 + .../WebAssembly/WebAssemblyTargetTransformInfo.h | 2 + llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 513 +- llvm/lib/Target/X86/AsmParser/X86Operand.h | 36 +- .../Target/X86/Disassembler/X86Disassembler.cpp | 77 +- llvm/lib/Target/X86/MCA/X86CustomBehaviour.h | 2 +- .../Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 8 +- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 105 +- .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 22 +- .../Target/X86/MCTargetDesc/X86InstPrinterCommon.h | 3 +- .../X86/MCTargetDesc/X86InstrRelaxTables.cpp | 165 + .../Target/X86/MCTargetDesc/X86InstrRelaxTables.h | 54 + .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 4 +- .../Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 134 +- llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h | 1 + .../Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 91 + llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 26 +- .../Target/X86/MCTargetDesc/X86MnemonicTables.cpp | 16 + .../Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 21 +- .../X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 1 + llvm/lib/Target/X86/X86.h | 4 + llvm/lib/Target/X86/X86.td | 279 +- llvm/lib/Target/X86/X86AsmPrinter.cpp | 96 +- llvm/lib/Target/X86/X86AsmPrinter.h | 5 +- llvm/lib/Target/X86/X86AvoidTrailingCall.cpp | 7 +- llvm/lib/Target/X86/X86CallingConv.cpp | 2 +- llvm/lib/Target/X86/X86CmovConversion.cpp | 27 +- llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 3 +- llvm/lib/Target/X86/X86DomainReassignment.cpp | 14 +- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 11 +- llvm/lib/Target/X86/X86FastISel.cpp | 133 +- llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 709 + llvm/lib/Target/X86/X86FastTileConfig.cpp | 293 +- llvm/lib/Target/X86/X86FixupLEAs.cpp | 3 +- llvm/lib/Target/X86/X86FloatingPoint.cpp | 26 +- llvm/lib/Target/X86/X86FrameLowering.cpp | 136 +- llvm/lib/Target/X86/X86FrameLowering.h | 7 +- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 282 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 3225 ++- llvm/lib/Target/X86/X86ISelLowering.h | 58 +- llvm/lib/Target/X86/X86IndirectThunks.cpp | 1 + llvm/lib/Target/X86/X86InsertPrefetch.cpp | 1 + llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp | 49 +- llvm/lib/Target/X86/X86InstrAMX.td | 18 +- llvm/lib/Target/X86/X86InstrAVX512.td | 131 +- llvm/lib/Target/X86/X86InstrArithmetic.td | 8 +- llvm/lib/Target/X86/X86InstrCMovSetCC.td | 8 +- llvm/lib/Target/X86/X86InstrCompiler.td | 85 +- llvm/lib/Target/X86/X86InstrControl.td | 4 +- llvm/lib/Target/X86/X86InstrFPStack.td | 22 +- llvm/lib/Target/X86/X86InstrFoldTables.cpp | 4 +- llvm/lib/Target/X86/X86InstrFormats.td | 6 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 1 - llvm/lib/Target/X86/X86InstrInfo.cpp | 851 +- llvm/lib/Target/X86/X86InstrInfo.h | 18 +- llvm/lib/Target/X86/X86InstrInfo.td | 111 +- llvm/lib/Target/X86/X86InstrMMX.td | 4 +- llvm/lib/Target/X86/X86InstrSSE.td | 68 +- llvm/lib/Target/X86/X86InstrSystem.td | 16 +- llvm/lib/Target/X86/X86InstrTSX.td | 2 + llvm/lib/Target/X86/X86InstrVecCompiler.td | 6 +- llvm/lib/Target/X86/X86InstrXOP.td | 4 +- llvm/lib/Target/X86/X86InstructionSelector.cpp | 16 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 12 +- .../X86/X86LoadValueInjectionLoadHardening.cpp | 3 +- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 181 +- llvm/lib/Target/X86/X86MCInstLower.cpp | 41 +- llvm/lib/Target/X86/X86MachineFunctionInfo.cpp | 7 + llvm/lib/Target/X86/X86MachineFunctionInfo.h | 10 +- 
llvm/lib/Target/X86/X86MacroFusion.cpp | 1 +
llvm/lib/Target/X86/X86PadShortFunction.cpp | 11 +-
llvm/lib/Target/X86/X86PartialReduction.cpp | 35 +-
llvm/lib/Target/X86/X86PreAMXConfig.cpp | 56 +-
llvm/lib/Target/X86/X86PreTileConfig.cpp | 53 +-
llvm/lib/Target/X86/X86RegisterBankInfo.cpp | 7 +-
llvm/lib/Target/X86/X86RegisterBankInfo.h | 2 +-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 62 +
llvm/lib/Target/X86/X86RegisterInfo.h | 12 +
llvm/lib/Target/X86/X86RegisterInfo.td | 15 +-
llvm/lib/Target/X86/X86SchedBroadwell.td | 20 +-
llvm/lib/Target/X86/X86SchedHaswell.td | 20 +-
llvm/lib/Target/X86/X86SchedIceLake.td | 20 +-
llvm/lib/Target/X86/X86SchedSandyBridge.td | 40 +-
llvm/lib/Target/X86/X86SchedSkylakeClient.td | 26 +-
llvm/lib/Target/X86/X86SchedSkylakeServer.td | 32 +-
llvm/lib/Target/X86/X86ScheduleBtVer2.td | 4 +-
llvm/lib/Target/X86/X86ScheduleSLM.td | 6 +-
llvm/lib/Target/X86/X86ScheduleZnver1.td | 106 +-
llvm/lib/Target/X86/X86ScheduleZnver2.td | 86 +-
llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 39 +-
llvm/lib/Target/X86/X86SelectionDAGInfo.h | 2 +-
.../lib/Target/X86/X86SpeculativeLoadHardening.cpp | 31 +-
llvm/lib/Target/X86/X86Subtarget.cpp | 12 +-
llvm/lib/Target/X86/X86Subtarget.h | 629 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 51 +-
llvm/lib/Target/X86/X86TargetMachine.h | 2 +-
llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 290 +-
llvm/lib/Target/X86/X86TargetTransformInfo.h | 21 +-
llvm/lib/Target/X86/X86TileConfig.cpp | 15 +-
.../XCore/Disassembler/XCoreDisassembler.cpp | 286 +-
.../Target/XCore/MCTargetDesc/XCoreInstPrinter.h | 3 +-
llvm/lib/Target/XCore/XCore.h | 1 +
llvm/lib/Target/XCore/XCoreAsmPrinter.cpp | 2 +-
llvm/lib/Target/XCore/XCoreISelLowering.cpp | 36 +-
llvm/lib/Target/XCore/XCoreInstrInfo.td | 2 +-
llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp | 7 +
llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h | 5 +
llvm/lib/Target/XCore/XCoreTargetMachine.cpp | 4 +-
llvm/lib/Target/XCore/XCoreTargetMachine.h | 4 +-
llvm/lib/Testing/Support/Annotations.cpp | 4 +-
llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 28 +-
llvm/lib/ToolDrivers/llvm-lib/Options.td | 16 +-
.../AggressiveInstCombine.cpp | 92 +-
.../AggressiveInstCombineInternal.h | 46 +-
.../AggressiveInstCombine/TruncInstCombine.cpp | 86 +-
llvm/lib/Transforms/Coroutines/CoroCleanup.cpp | 81 +-
.../Coroutines/CoroConditionalWrapper.cpp | 24 +
llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 79 +-
llvm/lib/Transforms/Coroutines/CoroElide.cpp | 125 +-
llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 177 +-
llvm/lib/Transforms/Coroutines/CoroInternal.h | 47 +-
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 377 +-
llvm/lib/Transforms/Coroutines/Coroutines.cpp | 193 +-
llvm/lib/Transforms/IPO/AlwaysInliner.cpp | 47 +-
llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 1139 +-
llvm/lib/Transforms/IPO/Attributor.cpp | 462 +-
llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2060 +-
llvm/lib/Transforms/IPO/BlockExtractor.cpp | 11 +-
llvm/lib/Transforms/IPO/CalledValuePropagation.cpp | 6 +-
llvm/lib/Transforms/IPO/ConstantMerge.cpp | 2 +-
llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 5 -
.../lib/Transforms/IPO/DeadArgumentElimination.cpp | 578 +-
llvm/lib/Transforms/IPO/ExtractGV.cpp | 1 -
llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp | 2 +-
llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 175 +-
llvm/lib/Transforms/IPO/FunctionImport.cpp | 19 +-
llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 501 +-
llvm/lib/Transforms/IPO/GlobalDCE.cpp | 34 +-
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 177 +-
llvm/lib/Transforms/IPO/GlobalSplit.cpp | 4 +-
llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 26 +-
llvm/lib/Transforms/IPO/IPO.cpp | 5 -
llvm/lib/Transforms/IPO/IROutliner.cpp | 326 +-
llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp | 5 +-
llvm/lib/Transforms/IPO/InlineSimple.cpp | 8 +-
llvm/lib/Transforms/IPO/Inliner.cpp | 111 +-
llvm/lib/Transforms/IPO/Internalize.cpp | 3 -
llvm/lib/Transforms/IPO/LoopExtractor.cpp | 5 -
llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 21 +-
llvm/lib/Transforms/IPO/MergeFunctions.cpp | 48 +-
llvm/lib/Transforms/IPO/ModuleInliner.cpp | 25 +-
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 255 +-
llvm/lib/Transforms/IPO/PartialInlining.cpp | 16 +-
llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 295 +-
llvm/lib/Transforms/IPO/PruneEH.cpp | 5 +-
llvm/lib/Transforms/IPO/SCCP.cpp | 1 +
llvm/lib/Transforms/IPO/SampleContextTracker.cpp | 123 +-
llvm/lib/Transforms/IPO/SampleProfile.cpp | 293 +-
llvm/lib/Transforms/IPO/SampleProfileProbe.cpp | 10 +-
.../Transforms/IPO/SyntheticCountsPropagation.cpp | 10 +-
llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 8 +-
llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 134 +-
.../Transforms/InstCombine/InstCombineAddSub.cpp | 115 +-
.../Transforms/InstCombine/InstCombineAndOrXor.cpp | 1037 +-
.../InstCombine/InstCombineAtomicRMW.cpp | 1 -
.../Transforms/InstCombine/InstCombineCalls.cpp | 383 +-
.../Transforms/InstCombine/InstCombineCasts.cpp | 185 +-
.../Transforms/InstCombine/InstCombineCompares.cpp | 874 +-
.../Transforms/InstCombine/InstCombineInternal.h | 41 +-
.../InstCombine/InstCombineLoadStoreAlloca.cpp | 13 +-
.../InstCombine/InstCombineMulDivRem.cpp | 344 +-
.../Transforms/InstCombine/InstCombineNegator.cpp | 14 +
llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp | 127 +-
.../Transforms/InstCombine/InstCombineSelect.cpp | 913 +-
.../Transforms/InstCombine/InstCombineShifts.cpp | 164 +-
.../InstCombine/InstCombineSimplifyDemanded.cpp | 202 +-
.../InstCombine/InstCombineVectorOps.cpp | 157 +-
.../InstCombine/InstructionCombining.cpp | 533 +-
.../Instrumentation/AddressSanitizer.cpp | 400 +-
.../Transforms/Instrumentation/BoundsChecking.cpp | 5 +-
llvm/lib/Transforms/Instrumentation/CGProfile.cpp | 3 -
.../Instrumentation/ControlHeightReduction.cpp | 78 +-
.../Instrumentation/DataFlowSanitizer.cpp | 237 +-
.../Transforms/Instrumentation/GCOVProfiling.cpp | 59 +-
.../Instrumentation/HWAddressSanitizer.cpp | 521 +-
.../Instrumentation/IndirectCallPromotion.cpp | 73 +-
.../Transforms/Instrumentation/InstrOrderFile.cpp | 9 +-
.../Transforms/Instrumentation/InstrProfiling.cpp | 59 +-
.../Transforms/Instrumentation/Instrumentation.cpp | 10 -
.../Instrumentation/MaximumSpanningTree.h | 109 -
.../lib/Transforms/Instrumentation/MemProfiler.cpp | 54 +-
.../Transforms/Instrumentation/MemorySanitizer.cpp | 155 +-
.../Instrumentation/PGOInstrumentation.cpp | 176 +-
.../Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 65 +-
.../Transforms/Instrumentation/PoisonChecking.cpp | 6 -
.../Instrumentation/SanitizerCoverage.cpp | 23 +-
.../Transforms/Instrumentation/ThreadSanitizer.cpp | 84 +-
.../Instrumentation/ValueProfileCollector.cpp | 7 +-
.../Instrumentation/ValueProfileCollector.h | 2 +-
.../Instrumentation/ValueProfilePlugins.inc | 1 +
llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 1 -
llvm/lib/Transforms/ObjCARC/ObjCARC.h | 1 -
llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 5 +-
llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 9 +-
llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp | 2 +-
llvm/lib/Transforms/Scalar/ADCE.cpp | 1 -
.../Transforms/Scalar/AlignmentFromAssumptions.cpp | 7 +-
llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp | 3 -
llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp | 13 +-
llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 1 +
.../Transforms/Scalar/ConstraintElimination.cpp | 754 +-
.../Scalar/CorrelatedValuePropagation.cpp | 110 +-
llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 157 +-
.../lib/Transforms/Scalar/DeadStoreElimination.cpp | 138 +-
llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 25 +-
llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp | 2 -
llvm/lib/Transforms/Scalar/Float2Int.cpp | 207 +-
llvm/lib/Transforms/Scalar/GVN.cpp | 231 +-
llvm/lib/Transforms/Scalar/GVNHoist.cpp | 16 +-
llvm/lib/Transforms/Scalar/GVNSink.cpp | 30 +-
llvm/lib/Transforms/Scalar/GuardWidening.cpp | 3 +-
llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp | 1 -
llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 15 +-
.../Scalar/InductiveRangeCheckElimination.cpp | 34 +-
llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 72 +-
llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp | 6 +-
llvm/lib/Transforms/Scalar/JumpThreading.cpp | 142 +-
llvm/lib/Transforms/Scalar/LICM.cpp | 482 +-
.../Scalar/LoopAccessAnalysisPrinter.cpp | 1 +
llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp | 26 +-
llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 18 +-
llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 12 +-
llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 15 +-
llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 14 +-
llvm/lib/Transforms/Scalar/LoopFuse.cpp | 15 +-
llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 64 +-
llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp | 13 +-
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 200 +-
llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 14 +-
llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 10 +-
llvm/lib/Transforms/Scalar/LoopPredication.cpp | 5 +-
llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 11 +-
llvm/lib/Transforms/Scalar/LoopRotation.cpp | 13 +-
llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 26 +-
llvm/lib/Transforms/Scalar/LoopSink.cpp | 91 +-
llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 664 +-
.../lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 30 +-
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 57 +-
llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 1774 --
llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp | 2 -
llvm/lib/Transforms/Scalar/LowerAtomic.cpp | 177 -
llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp | 99 +
.../Transforms/Scalar/LowerConstantIntrinsics.cpp | 18 +-
.../lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 12 +-
llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp | 11 +-
.../Transforms/Scalar/LowerMatrixIntrinsics.cpp | 57 +-
.../Transforms/Scalar/LowerWidenableCondition.cpp | 13 +-
llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp | 4 +-
llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 187 +-
llvm/lib/Transforms/Scalar/MergeICmps.cpp | 59 +-
.../Transforms/Scalar/MergedLoadStoreMotion.cpp | 6 +-
llvm/lib/Transforms/Scalar/NewGVN.cpp | 46 +-
.../Transforms/Scalar/PartiallyInlineLibCalls.cpp | 5 +-
llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 3 +-
llvm/lib/Transforms/Scalar/Reassociate.cpp | 7 +-
llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 2 -
.../Transforms/Scalar/RewriteStatepointsForGC.cpp | 489 +-
llvm/lib/Transforms/Scalar/SCCP.cpp | 105 +-
llvm/lib/Transforms/Scalar/SROA.cpp | 75 +-
llvm/lib/Transforms/Scalar/Scalar.cpp | 9 +-
.../Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 7 +-
llvm/lib/Transforms/Scalar/Scalarizer.cpp | 103 +-
.../Scalar/SeparateConstOffsetFromGEP.cpp | 1 -
llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 121 +-
llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 14 +-
llvm/lib/Transforms/Scalar/Sink.cpp | 7 +-
.../lib/Transforms/Scalar/SpeculativeExecution.cpp | 6 +-
.../Scalar/StraightLineStrengthReduce.cpp | 19 +-
llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 67 +-
llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp | 306 +
.../Transforms/Scalar/TailRecursionElimination.cpp | 15 +-
.../lib/Transforms/Scalar/WarnMissedTransforms.cpp | 2 +-
llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 3 -
llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp | 1 -
llvm/lib/Transforms/Utils/AddDiscriminators.cpp | 4 +-
llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp | 1 +
llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 8 +-
llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp | 15 +-
llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 406 +-
llvm/lib/Transforms/Utils/CallGraphUpdater.cpp | 3 +
llvm/lib/Transforms/Utils/CallPromotionUtils.cpp | 4 +-
llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp | 3 +-
.../Transforms/Utils/CanonicalizeFreezeInLoops.cpp | 1 -
llvm/lib/Transforms/Utils/CloneFunction.cpp | 106 +-
llvm/lib/Transforms/Utils/CloneModule.cpp | 5 +-
llvm/lib/Transforms/Utils/CodeExtractor.cpp | 24 +-
llvm/lib/Transforms/Utils/CodeLayout.cpp | 28 +-
llvm/lib/Transforms/Utils/CtorUtils.cpp | 65 +-
llvm/lib/Transforms/Utils/Debugify.cpp | 184 +-
llvm/lib/Transforms/Utils/DemoteRegToStack.cpp | 3 +-
llvm/lib/Transforms/Utils/Evaluator.cpp | 109 +-
llvm/lib/Transforms/Utils/FixIrreducible.cpp | 9 +
llvm/lib/Transforms/Utils/FunctionImportUtils.cpp | 2 -
llvm/lib/Transforms/Utils/GlobalStatus.cpp | 32 +-
llvm/lib/Transforms/Utils/InjectTLIMappings.cpp | 1 -
llvm/lib/Transforms/Utils/InlineFunction.cpp | 77 +-
llvm/lib/Transforms/Utils/IntegerDivision.cpp | 1 -
llvm/lib/Transforms/Utils/LCSSA.cpp | 3 +-
llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 2 -
llvm/lib/Transforms/Utils/Local.cpp | 77 +-
llvm/lib/Transforms/Utils/LoopPeel.cpp | 122 +-
llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 19 +-
llvm/lib/Transforms/Utils/LoopSimplify.cpp | 9 +-
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 4 +-
llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp | 4 +-
llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 47 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 85 +-
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 7 +-
llvm/lib/Transforms/Utils/LowerAtomic.cpp | 93 +
llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp | 221 +
llvm/lib/Transforms/Utils/LowerInvoke.cpp | 2 -
llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp | 217 +-
llvm/lib/Transforms/Utils/LowerSwitch.cpp | 43 +-
llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 195 +
llvm/lib/Transforms/Utils/MisExpect.cpp | 249 +
llvm/lib/Transforms/Utils/ModuleUtils.cpp | 12 +-
llvm/lib/Transforms/Utils/PredicateInfo.cpp | 8 -
.../Transforms/Utils/PromoteMemoryToRegister.cpp | 8 +-
.../Transforms/Utils/RelLookupTableConverter.cpp | 27 +-
llvm/lib/Transforms/Utils/SCCPSolver.cpp | 204 +-
llvm/lib/Transforms/Utils/SSAUpdater.cpp | 3 +-
.../Transforms/Utils/SampleProfileInference.cpp | 394 +-
.../Utils/SampleProfileLoaderBaseUtil.cpp | 10 +-
llvm/lib/Transforms/Utils/SanitizerStats.cpp | 1 -
.../Transforms/Utils/ScalarEvolutionExpander.cpp | 258 +-
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 818 +-
llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 18 +-
llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 665 +-
llvm/lib/Transforms/Utils/SizeOpts.cpp | 4 +-
llvm/lib/Transforms/Utils/StripGCRelocates.cpp | 4 +-
llvm/lib/Transforms/Utils/SymbolRewriter.cpp | 1 -
llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 48 +-
llvm/lib/Transforms/Utils/Utils.cpp | 1 +
llvm/lib/Transforms/Utils/VNCoercion.cpp | 124 +-
.../Transforms/Vectorize/LoadStoreVectorizer.cpp | 19 +-
.../Vectorize/LoopVectorizationLegality.cpp | 133 +-
.../Vectorize/LoopVectorizationPlanner.h | 22 +-
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2149 +-
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4378 ++--
llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 12 +-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 1161 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 592 +-
llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 135 +-
llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h | 10 +-
llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h | 44 -
llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 248 -
llvm/lib/Transforms/Vectorize/VPlanPredicator.h | 74 -
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 840 +
llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 15 +-
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 114 +-
llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 16 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 24 +-
llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 55 +-
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 379 +-
llvm/lib/Transforms/Vectorize/Vectorize.cpp | 1 -
llvm/lib/WindowsDriver/MSVCPaths.cpp | 719 +
llvm/lib/WindowsManifest/WindowsManifestMerger.cpp | 4 +-
llvm/lib/XRay/FDRTraceWriter.cpp | 2 +-
llvm/tools/bugpoint/CrashDebugger.cpp | 4 +-
llvm/tools/bugpoint/ExecutionDriver.cpp | 8 +-
llvm/tools/bugpoint/OptimizerDriver.cpp | 2 +-
llvm/tools/bugpoint/bugpoint.cpp | 12 +-
llvm/tools/llc/llc.cpp | 42 +-
llvm/tools/lli/lli.cpp | 158 +-
llvm/tools/llvm-ar/llvm-ar.cpp | 169 +-
llvm/tools/llvm-cov/CodeCoverage.cpp | 68 +-
llvm/tools/llvm-cov/CoverageViewOptions.h | 2 +
llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp | 42 +-
llvm/tools/llvm-cov/TestingSupport.cpp | 1 +
llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp | 2 +-
llvm/tools/llvm-cxxfilt/Opts.td | 2 +-
llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp | 2 +-
llvm/tools/llvm-dis/llvm-dis.cpp | 12 +-
llvm/tools/llvm-dwarfdump/Statistics.cpp | 18 +-
llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 93 +-
llvm/tools/llvm-dwp/llvm-dwp.cpp | 17 +-
llvm/tools/llvm-extract/llvm-extract.cpp | 21 +-
llvm/tools/llvm-link/llvm-link.cpp | 24 +-
llvm/tools/llvm-lto/llvm-lto.cpp | 16 +-
llvm/tools/llvm-lto2/llvm-lto2.cpp | 57 +-
llvm/tools/llvm-mc/llvm-mc.cpp | 4 +-
llvm/tools/llvm-mca/CodeRegionGenerator.cpp | 9 +-
llvm/tools/llvm-mca/Views/InstructionInfoView.cpp | 4 +-
llvm/tools/llvm-mca/Views/InstructionView.h | 3 +-
llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp | 8 +-
llvm/tools/llvm-mca/llvm-mca.cpp | 31 +-
llvm/tools/llvm-modextract/llvm-modextract.cpp | 1 +
llvm/tools/llvm-nm/Opts.td | 11 +-
llvm/tools/llvm-nm/llvm-nm.cpp | 956 +-
llvm/tools/llvm-objcopy/BitcodeStripOpts.td | 8 +-
llvm/tools/llvm-objcopy/COFF/COFFConfig.h | 27 -
llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp | 297 -
llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h | 33 -
llvm/tools/llvm-objcopy/COFF/Object.cpp | 132 -
llvm/tools/llvm-objcopy/COFF/Object.h | 211 -
llvm/tools/llvm-objcopy/COFF/Reader.cpp | 226 -
llvm/tools/llvm-objcopy/COFF/Reader.h | 41 -
llvm/tools/llvm-objcopy/COFF/Writer.cpp | 457 -
llvm/tools/llvm-objcopy/COFF/Writer.h | 63 -
llvm/tools/llvm-objcopy/CommonConfig.h | 260 -
llvm/tools/llvm-objcopy/ConfigManager.cpp | 1432 --
llvm/tools/llvm-objcopy/ConfigManager.h | 80 -
llvm/tools/llvm-objcopy/ELF/ELFConfig.h | 38 -
llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp | 833 -
llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h | 40 -
llvm/tools/llvm-objcopy/ELF/Object.cpp | 2826 ---
llvm/tools/llvm-objcopy/ELF/Object.h | 1113 -
llvm/tools/llvm-objcopy/MachO/MachOConfig.h | 43 -
.../llvm-objcopy/MachO/MachOLayoutBuilder.cpp | 441 -
llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h | 97 -
llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp | 549 -
llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h | 39 -
llvm/tools/llvm-objcopy/MachO/MachOReader.cpp | 374 -
llvm/tools/llvm-objcopy/MachO/MachOReader.h | 57 -
llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp | 748 -
llvm/tools/llvm-objcopy/MachO/MachOWriter.h | 71 -
llvm/tools/llvm-objcopy/MachO/Object.cpp | 214 -
llvm/tools/llvm-objcopy/MachO/Object.h | 374 -
llvm/tools/llvm-objcopy/MultiFormatConfig.h | 37 -
llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 1364 ++
llvm/tools/llvm-objcopy/ObjcopyOptions.h | 58 +
llvm/tools/llvm-objcopy/ObjcopyOpts.td | 6 +-
llvm/tools/llvm-objcopy/llvm-objcopy.cpp | 227 +-
llvm/tools/llvm-objcopy/llvm-objcopy.h | 34 -
llvm/tools/llvm-objcopy/wasm/Object.cpp | 34 -
llvm/tools/llvm-objcopy/wasm/Object.h | 47 -
llvm/tools/llvm-objcopy/wasm/Reader.cpp | 33 -
llvm/tools/llvm-objcopy/wasm/Reader.h | 31 -
llvm/tools/llvm-objcopy/wasm/WasmConfig.h | 21 -
llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp | 162 -
llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h | 32 -
llvm/tools/llvm-objcopy/wasm/Writer.cpp | 79 -
llvm/tools/llvm-objcopy/wasm/Writer.h | 49 -
llvm/tools/llvm-objdump/COFFDump.cpp | 32 +-
llvm/tools/llvm-objdump/ELFDump.cpp | 8 +-
llvm/tools/llvm-objdump/MachODump.cpp | 69 +-
llvm/tools/llvm-objdump/MachODump.h | 1 +
llvm/tools/llvm-objdump/ObjdumpOpts.td | 9 +
llvm/tools/llvm-objdump/OffloadDump.cpp | 102 +
llvm/tools/llvm-objdump/OffloadDump.h | 22 +
llvm/tools/llvm-objdump/OtoolOpts.td | 1 -
llvm/tools/llvm-objdump/SourcePrinter.cpp | 2 +
llvm/tools/llvm-objdump/SourcePrinter.h | 1 +
llvm/tools/llvm-objdump/XCOFFDump.cpp | 2 +-
llvm/tools/llvm-objdump/llvm-objdump.cpp | 125 +-
llvm/tools/llvm-pdbutil/BytesOutputStyle.cpp | 12 +-
llvm/tools/llvm-pdbutil/BytesOutputStyle.h | 2 +-
llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp | 398 +-
llvm/tools/llvm-pdbutil/DumpOutputStyle.h | 2 +-
llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp | 9 +-
llvm/tools/llvm-pdbutil/ExplainOutputStyle.h | 3 +-
llvm/tools/llvm-pdbutil/FormatUtil.cpp | 258 -
llvm/tools/llvm-pdbutil/FormatUtil.h | 141 -
llvm/tools/llvm-pdbutil/InputFile.cpp | 510 -
llvm/tools/llvm-pdbutil/InputFile.h | 154 -
llvm/tools/llvm-pdbutil/LinePrinter.cpp | 335 -
llvm/tools/llvm-pdbutil/LinePrinter.h | 167 -
llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 10 +-
llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp | 7 +-
llvm/tools/llvm-pdbutil/OutputStyle.h | 5 +-
llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp | 4 +-
.../llvm-pdbutil/PrettyClassDefinitionDumper.cpp | 3 +-
.../PrettyClassLayoutGraphicalDumper.cpp | 3 +-
llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp | 1 -
llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp | 3 +-
.../llvm-pdbutil/PrettyExternalSymbolDumper.cpp | 3 +-
llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp | 5 +-
llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp | 4 +-
llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp | 4 +-
llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp | 6 +-
llvm/tools/llvm-pdbutil/StreamUtil.cpp | 4 +-
llvm/tools/llvm-pdbutil/TypeReferenceTracker.cpp | 6 +-
llvm/tools/llvm-pdbutil/TypeReferenceTracker.h | 3 +-
llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp | 145 +-
llvm/tools/llvm-pdbutil/llvm-pdbutil.h | 3 +
llvm/tools/llvm-profdata/llvm-profdata.cpp | 107 +-
llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp | 176 +-
llvm/tools/llvm-readobj/ARMWinEHPrinter.h | 3 +-
llvm/tools/llvm-readobj/ELFDumper.cpp | 196 +-
llvm/tools/llvm-readobj/MachODumper.cpp | 53 +-
llvm/tools/llvm-readobj/ObjDumper.h | 58 +-
llvm/tools/llvm-readobj/Opts.td | 3 +-
llvm/tools/llvm-readobj/WasmDumper.cpp | 14 +-
llvm/tools/llvm-readobj/XCOFFDumper.cpp | 253 +-
llvm/tools/llvm-readobj/llvm-readobj.cpp | 53 +-
llvm/tools/llvm-readobj/llvm-readobj.h | 5 +-
llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp | 19 +-
llvm/tools/llvm-sim/llvm-sim.cpp | 5 +-
llvm/tools/llvm-stress/llvm-stress.cpp | 101 +-
llvm/tools/llvm-strings/llvm-strings.cpp | 3 +-
llvm/tools/llvm-symbolizer/Opts.td | 6 +
llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp | 210 +-
llvm/tools/llvm-tapi-diff/llvm-tapi-diff.cpp | 1 +
llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp | 1 +
llvm/tools/llvm-xray/func-id-helper.cpp | 1 +
llvm/tools/llvm-xray/func-id-helper.h | 1 +
llvm/tools/llvm-xray/xray-graph-diff.cpp | 1 +
llvm/tools/opt/NewPMDriver.cpp | 70 +-
llvm/tools/opt/NewPMDriver.h | 5 +-
llvm/tools/opt/PassPrinters.cpp | 212 -
llvm/tools/opt/PassPrinters.h | 40 -
llvm/tools/opt/opt.cpp | 122 +-
llvm/utils/TableGen/AsmMatcherEmitter.cpp | 3 +-
llvm/utils/TableGen/AsmWriterEmitter.cpp | 23 +-
llvm/utils/TableGen/AsmWriterInst.cpp | 1 +
llvm/utils/TableGen/Attributes.cpp | 3 -
llvm/utils/TableGen/CallingConvEmitter.cpp | 139 +-
llvm/utils/TableGen/CodeBeadsGen.cpp | 137 -
llvm/utils/TableGen/CodeEmitterGen.cpp | 250 +-
llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 14 +-
llvm/utils/TableGen/CodeGenDAGPatterns.h | 1 -
llvm/utils/TableGen/CodeGenInstruction.cpp | 6 +-
llvm/utils/TableGen/CodeGenInstruction.h | 3 +-
llvm/utils/TableGen/CodeGenIntrinsics.h | 5 +-
llvm/utils/TableGen/CodeGenMapTable.cpp | 2 +-
llvm/utils/TableGen/CodeGenRegisters.cpp | 38 +-
llvm/utils/TableGen/CodeGenRegisters.h | 36 +-
llvm/utils/TableGen/CodeGenSchedule.cpp | 1 -
llvm/utils/TableGen/CodeGenSchedule.h | 3 -
llvm/utils/TableGen/CodeGenTarget.cpp | 24 +-
llvm/utils/TableGen/CodeGenTarget.h | 7 +-
llvm/utils/TableGen/DAGISelEmitter.cpp | 1 +
llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 2 -
llvm/utils/TableGen/DAGISelMatcherGen.cpp | 3 +-
llvm/utils/TableGen/DFAEmitter.cpp | 4 +-
llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 2 -
llvm/utils/TableGen/DXILEmitter.cpp | 374 +
llvm/utils/TableGen/DecoderEmitter.cpp | 2705 +++
llvm/utils/TableGen/DirectiveEmitter.cpp | 4 +-
llvm/utils/TableGen/DisassemblerEmitter.cpp | 26 +-
llvm/utils/TableGen/ExegesisEmitter.cpp | 4 -
llvm/utils/TableGen/FastISelEmitter.cpp | 2 +-
llvm/utils/TableGen/FixedLenDecoderEmitter.cpp | 2560 ---
llvm/utils/TableGen/GICombinerEmitter.cpp | 17 +-
llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp | 4 +-
llvm/utils/TableGen/GlobalISel/GIMatchTree.h | 4 +-
llvm/utils/TableGen/GlobalISelEmitter.cpp | 23 +-
llvm/utils/TableGen/InstrInfoEmitter.cpp | 51 +-
llvm/utils/TableGen/IntrinsicEmitter.cpp | 81 +-
llvm/utils/TableGen/OptParserEmitter.cpp | 2 +-
llvm/utils/TableGen/OptRSTEmitter.cpp | 29 +-
llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 3 +-
llvm/utils/TableGen/RegisterBankEmitter.cpp | 8 +-
llvm/utils/TableGen/RegisterInfoEmitter.cpp | 120 +-
llvm/utils/TableGen/SearchableTableEmitter.cpp | 4 +-
llvm/utils/TableGen/SequenceToOffsetTable.h | 16 +-
llvm/utils/TableGen/SubtargetEmitter.cpp | 53 +-
llvm/utils/TableGen/SubtargetFeatureInfo.cpp | 66 +-
llvm/utils/TableGen/TableGen.cpp | 20 +-
llvm/utils/TableGen/TableGenBackends.h | 3 +-
llvm/utils/TableGen/VarLenCodeEmitterGen.cpp | 487 +
llvm/utils/TableGen/VarLenCodeEmitterGen.h | 66 +
.../TableGen/WebAssemblyDisassemblerEmitter.cpp | 18 +-
llvm/utils/TableGen/X86DisassemblerTables.cpp | 34 +-
llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp | 75 +-
llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 266 +-
llvm/utils/TableGen/X86MnemonicTables.cpp | 94 +
llvm/utils/TableGen/X86RecognizableInstr.cpp | 192 +-
llvm/utils/TableGen/X86RecognizableInstr.h | 77 +-
3394 files changed, 262975 insertions(+), 110125 deletions(-)
delete mode 100644 llvm/include/llvm-c/Transforms/Coroutines.h
create mode 100644 llvm/include/llvm-c/blake3.h
create mode 100644 llvm/include/llvm/ADT/AddressRanges.h
create mode 100644 llvm/include/llvm/Analysis/ScalarFuncs.def
create mode 100644 llvm/include/llvm/Analysis/TensorSpec.h
create mode 100644 llvm/include/llvm/BinaryFormat/DXContainer.h
create mode 100644 llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
create mode 100644 llvm/include/llvm/BinaryFormat/GOFF.h
create mode 100644 llvm/include/llvm/Bitstream/BitCodeEnums.h
create mode 100644 llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
create mode 100644 llvm/include/llvm/CodeGen/CFIFixup.h
delete mode 100644 llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h
delete mode 100644 llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
create mode 100644 llvm/include/llvm/CodeGen/RegisterBank.h
create mode 100644 llvm/include/llvm/CodeGen/RegisterBankInfo.h
create mode 100644 llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h
create mode 100644 llvm/include/llvm/DebugInfo/GSYM/ExtractRanges.h
delete mode 100644 llvm/include/llvm/DebugInfo/GSYM/Range.h
create mode 100644 llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
create mode 100644 llvm/include/llvm/DebugInfo/PDB/Native/InputFile.h
create mode 100644 llvm/include/llvm/DebugInfo/PDB/Native/LinePrinter.h
create mode 100644 llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h
create mode 100644 llvm/include/llvm/DebugInfo/Symbolize/Markup.h
create mode 100644 llvm/include/llvm/DebugInfo/Symbolize/MarkupFilter.h
create mode 100644 llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
create mode 100644 llvm/include/llvm/Debuginfod/DIFetcher.h
create mode 100644 llvm/include/llvm/Demangle/ItaniumNodes.def
create mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h
create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
delete mode 100644 llvm/include/llvm/IR/AttributesAMDGPU.td
create mode 100644 llvm/include/llvm/IR/ConstantFold.h
create mode 100644 llvm/include/llvm/IR/FMF.h
create mode 100644 llvm/include/llvm/IR/IntrinsicsDirectX.td
create mode 100644 llvm/include/llvm/IR/IntrinsicsSPIRV.td
create mode 100644 llvm/include/llvm/IR/VectorBuilder.h
create mode 100644 llvm/include/llvm/MC/MCDXContainerStreamer.h
create mode 100644 llvm/include/llvm/MC/MCDXContainerWriter.h
create mode 100644 llvm/include/llvm/MC/MCDecoderOps.h
delete mode 100644 llvm/include/llvm/MC/MCFixedLenDisassembler.h
create mode 100644 llvm/include/llvm/MC/MCSPIRVObjectWriter.h
create mode 100644 llvm/include/llvm/MC/MCSPIRVStreamer.h
create mode 100644 llvm/include/llvm/MC/MCSectionDXContainer.h
create mode 100644 llvm/include/llvm/MC/MCSectionSPIRV.h
create mode 100644 llvm/include/llvm/MCA/IncrementalSourceMgr.h
create mode 100644 llvm/include/llvm/ObjCopy/COFF/COFFConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h
create mode 100644 llvm/include/llvm/ObjCopy/CommonConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/ConfigManager.h
create mode 100644 llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h
create mode 100644 llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h
create mode 100644 llvm/include/llvm/ObjCopy/MultiFormatConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/ObjCopy.h
create mode 100644 llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h
create mode 100644 llvm/include/llvm/ObjCopy/wasm/WasmConfig.h
create mode 100644 llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h
create mode 100644 llvm/include/llvm/Object/DXContainer.h
create mode 100644 llvm/include/llvm/Object/OffloadBinary.h
create mode 100644 llvm/include/llvm/ObjectYAML/DXContainerYAML.h
create mode 100644 llvm/include/llvm/ObjectYAML/OffloadYAML.h
create mode 100644 llvm/include/llvm/ProfileData/MIBEntryDef.inc
create mode 100644 llvm/include/llvm/ProfileData/MemProf.h
create mode 100644 llvm/include/llvm/Support/BLAKE3.h
create mode 100644 llvm/include/llvm/Support/CSKYAttributeParser.h
create mode 100644 llvm/include/llvm/Support/CSKYAttributes.h
create mode 100644 llvm/include/llvm/Support/CSKYTargetParser.def
create mode 100644 llvm/include/llvm/Support/CSKYTargetParser.h
create mode 100644 llvm/include/llvm/TableGen/Parser.h
delete mode 100644 llvm/include/llvm/Transforms/Coroutines.h
create mode 100644 llvm/include/llvm/Transforms/Coroutines/CoroConditionalWrapper.h
delete mode 100644 llvm/include/llvm/Transforms/Scalar/LowerAtomic.h
create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerAtomicPass.h
create mode 100644 llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h
create mode 100644 llvm/include/llvm/Transforms/Utils/LowerAtomic.h
create mode 100644 llvm/include/llvm/Transforms/Utils/LowerGlobalDtors.h
create mode 100644 llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
create mode 100644 llvm/include/llvm/Transforms/Utils/MisExpect.h
create mode 100644 llvm/include/llvm/WindowsDriver/MSVCPaths.h
create mode 100644 llvm/include/llvm/WindowsDriver/MSVCSetupApi.h
create mode 100644 llvm/lib/Analysis/TensorSpec.cpp
create mode 100644 llvm/lib/BinaryFormat/COFF.cpp
create mode 100644 llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
create mode 100644 llvm/lib/CodeGen/CFIFixup.cpp
delete mode 100644 llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
delete mode 100644 llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
create mode 100644 llvm/lib/CodeGen/JMCInstrumenter.cpp
create mode 100644 llvm/lib/CodeGen/RegisterBank.cpp
create mode 100644 llvm/lib/CodeGen/RegisterBankInfo.cpp
create mode 100644 llvm/lib/CodeGen/SelectOptimize.cpp
create mode 100644 llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp
create mode 100644 llvm/lib/DebugInfo/GSYM/ExtractRanges.cpp
delete mode 100644 llvm/lib/DebugInfo/GSYM/Range.cpp
create mode 100644 llvm/lib/DebugInfo/PDB/Native/FormatUtil.cpp
create mode 100644 llvm/lib/DebugInfo/PDB/Native/InputFile.cpp
create mode 100644 llvm/lib/DebugInfo/PDB/Native/LinePrinter.cpp
create mode 100644 llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp
create mode 100644 llvm/lib/DebugInfo/Symbolize/Markup.cpp
create mode 100644 llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp
delete mode 100644 llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
create mode 100644 llvm/lib/Debuginfod/DIFetcher.cpp
create mode 100644 llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
create mode 100644 llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
delete mode 100644 llvm/lib/IR/ConstantFold.h
create mode 100644 llvm/lib/IR/VectorBuilder.cpp
create mode 100644 llvm/lib/MC/MCDXContainerStreamer.cpp
create mode 100644 llvm/lib/MC/MCDXContainerWriter.cpp
create mode 100644 llvm/lib/MC/MCSPIRVStreamer.cpp
create mode 100644 llvm/lib/MC/MCSectionDXContainer.cpp
create mode 100644 llvm/lib/MC/SPIRVObjectWriter.cpp
create mode 100644 llvm/lib/MCA/IncrementalSourceMgr.cpp
create mode 100644 llvm/lib/ObjCopy/Archive.cpp
create mode 100644 llvm/lib/ObjCopy/Archive.h
create mode 100644 llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
create mode 100644 llvm/lib/ObjCopy/COFF/COFFObject.cpp
create mode 100644 llvm/lib/ObjCopy/COFF/COFFObject.h
create mode 100644 llvm/lib/ObjCopy/COFF/COFFReader.cpp
create mode 100644 llvm/lib/ObjCopy/COFF/COFFReader.h
create mode 100644 llvm/lib/ObjCopy/COFF/COFFWriter.cpp
create mode 100644 llvm/lib/ObjCopy/COFF/COFFWriter.h
create mode 100644 llvm/lib/ObjCopy/CommonConfig.cpp
create mode 100644 llvm/lib/ObjCopy/ConfigManager.cpp
create mode 100644 llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
create mode 100644 llvm/lib/ObjCopy/ELF/ELFObject.cpp
create mode 100644 llvm/lib/ObjCopy/ELF/ELFObject.h
create mode 100644 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
create mode 100644 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
create mode 100644 llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
create mode 100644 llvm/lib/ObjCopy/MachO/MachOObject.cpp
create mode 100644 llvm/lib/ObjCopy/MachO/MachOObject.h
create mode 100644 llvm/lib/ObjCopy/MachO/MachOReader.cpp
create mode 100644 llvm/lib/ObjCopy/MachO/MachOReader.h
create mode 100644 llvm/lib/ObjCopy/MachO/MachOWriter.cpp
create mode 100644 llvm/lib/ObjCopy/MachO/MachOWriter.h
create mode 100644 llvm/lib/ObjCopy/ObjCopy.cpp
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFObjcopy.cpp
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFObject.h
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFReader.h
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFWriter.cpp
create mode 100644 llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h
create mode 100644 llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
create mode 100644 llvm/lib/ObjCopy/wasm/WasmObject.cpp
create mode 100644 llvm/lib/ObjCopy/wasm/WasmObject.h
create mode 100644 llvm/lib/ObjCopy/wasm/WasmReader.cpp
create mode 100644 llvm/lib/ObjCopy/wasm/WasmReader.h
create mode 100644 llvm/lib/ObjCopy/wasm/WasmWriter.cpp
create mode 100644 llvm/lib/ObjCopy/wasm/WasmWriter.h
create mode 100644 llvm/lib/Object/DXContainer.cpp
create mode 100644 llvm/lib/Object/OffloadBinary.cpp
create mode 100644 llvm/lib/ObjectYAML/DXContainerEmitter.cpp
create mode 100644 llvm/lib/ObjectYAML/DXContainerYAML.cpp
create mode 100644 llvm/lib/ObjectYAML/OffloadEmitter.cpp
create mode 100644 llvm/lib/ObjectYAML/OffloadYAML.cpp
create mode 100644 llvm/lib/ProfileData/MemProf.cpp
create mode 100644 llvm/lib/Support/AddressRanges.cpp
create mode 100644 llvm/lib/Support/BLAKE3/LICENSE
create mode 100644 llvm/lib/Support/BLAKE3/README.md
create mode 100644 llvm/lib/Support/BLAKE3/blake3.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx2.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx512.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm
create mode 100644 llvm/lib/Support/BLAKE3/blake3_dispatch.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_impl.h
create mode 100644 llvm/lib/Support/BLAKE3/blake3_neon.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_portable.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse2.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_msvc.asm
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse41.c
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
create mode 100644 llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_msvc.asm
create mode 100644 llvm/lib/Support/CSKYAttributeParser.cpp
create mode 100644 llvm/lib/Support/CSKYAttributes.cpp
create mode 100644 llvm/lib/Support/CSKYTargetParser.cpp
create mode 100644 llvm/lib/Support/UnicodeNameToCodepoint.cpp
create mode 100644 llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
create mode 100644 llvm/lib/TableGen/Parser.cpp
create mode 100644 llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
create mode 100644 llvm/lib/Target/AArch64/AArch64MachineScheduler.h
create mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
create mode 100644 llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
delete mode 100644 llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
create mode 100644 llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
create mode 100644 llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
delete mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
delete mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
create mode 100644 llvm/lib/Target/AMDGPU/VINTERPInstructions.td
create mode 100644 llvm/lib/Target/AMDGPU/VOPDInstructions.td
create mode 100644 llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp
delete mode 100644 llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
create mode 100644 llvm/lib/Target/CSKY/CSKYInstrAlias.td
create mode 100644 llvm/lib/Target/CSKY/CSKYTargetObjectFile.cpp
create mode 100644 llvm/lib/Target/CSKY/CSKYTargetObjectFile.h
create mode 100644 llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
create mode 100644 llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
create mode 100644 llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
create mode 100644 llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.cpp
create mode 100644 llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.h
create mode 100644
llvm/lib/Target/DirectX/DXIL.td create mode 100644 llvm/lib/Target/DirectX/DXILConstants.h create mode 100644 llvm/lib/Target/DirectX/DXILOpLowering.cpp create mode 100644 llvm/lib/Target/DirectX/DXILPointerType.cpp create mode 100644 llvm/lib/Target/DirectX/DXILPointerType.h create mode 100644 llvm/lib/Target/DirectX/DXILPrepare.cpp create mode 100644 llvm/lib/Target/DirectX/DXILStubs.td create mode 100644 llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp create mode 100644 llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.h create mode 100644 llvm/lib/Target/DirectX/DirectX.h create mode 100644 llvm/lib/Target/DirectX/DirectX.td create mode 100644 llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp create mode 100644 llvm/lib/Target/DirectX/DirectXFrameLowering.h create mode 100644 llvm/lib/Target/DirectX/DirectXInstrInfo.cpp create mode 100644 llvm/lib/Target/DirectX/DirectXInstrInfo.h create mode 100644 llvm/lib/Target/DirectX/DirectXRegisterInfo.cpp create mode 100644 llvm/lib/Target/DirectX/DirectXRegisterInfo.h create mode 100644 llvm/lib/Target/DirectX/DirectXSubtarget.cpp create mode 100644 llvm/lib/Target/DirectX/DirectXSubtarget.h create mode 100644 llvm/lib/Target/DirectX/DirectXTargetLowering.h create mode 100644 llvm/lib/Target/DirectX/DirectXTargetMachine.cpp create mode 100644 llvm/lib/Target/DirectX/DirectXTargetMachine.h create mode 100644 llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h create mode 100644 llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.cpp create mode 100644 llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.h create mode 100644 llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp create mode 100644 llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.h create mode 100644 llvm/lib/Target/DirectX/PointerTypeAnalysis.cpp create mode 100644 llvm/lib/Target/DirectX/PointerTypeAnalysis.h create mode 100644 llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp create mode 100644 llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.h delete mode 100644 llvm/lib/Target/Hexagon/HexagonArch.h create mode 100644 llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp create mode 100644 llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArch.h create mode 100644 llvm/lib/Target/LoongArch/LoongArch.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchCallingConv.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchFloatInstrFormats.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchFrameLowering.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp create mode 100644 
llvm/lib/Target/LoongArch/LoongArchISelLowering.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchInstrFormats.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchInstrInfo.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchInstrInfo.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td create mode 100644 llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchSubtarget.h create mode 100644 llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp create mode 100644 llvm/lib/Target/LoongArch/LoongArchTargetMachine.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp create mode 100644 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h create mode 100644 llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp create mode 100644 llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h create mode 100644 llvm/lib/Target/Mips/MipsCombine.td create mode 100644 llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp create mode 100644 llvm/lib/Target/Mips/MipsTargetTransformInfo.cpp create mode 100644 llvm/lib/Target/Mips/MipsTargetTransformInfo.h create mode 100644 llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp create mode 100644 llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp create mode 100644 llvm/lib/Target/PowerPC/PPCInstrMMA.td create mode 100644 llvm/lib/Target/PowerPC/PPCInstrP10.td delete mode 100644 llvm/lib/Target/PowerPC/PPCInstrPrefix.td create mode 100644 llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td create mode 100644 llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td create mode 100644 llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVMacroFusion.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVMacroFusion.h create mode 100644 llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h create mode 100644 
llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.h create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVObjectTargetWriter.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp create mode 100644 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h create mode 100644 llvm/lib/Target/SPIRV/SPIRV.h create mode 100644 llvm/lib/Target/SPIRV/SPIRV.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVCallLowering.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVEnums.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVFrameLowering.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVISelLowering.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVInstrFormats.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVInstrInfo.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVMCInstLower.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterInfo.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td create mode 100644 llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVSubtarget.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVTargetMachine.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVTargetObjectFile.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h create mode 100644 llvm/lib/Target/SPIRV/SPIRVUtils.cpp create mode 100644 llvm/lib/Target/SPIRV/SPIRVUtils.h create mode 100644 llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp create mode 100644 llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.h create mode 100644 llvm/lib/Target/SystemZ/SystemZScheduleZ16.td create mode 100644 llvm/lib/Target/VE/VVPISelLowering.cpp delete mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp delete mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp create mode 100644 
llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp create mode 100644 llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h create mode 100644 llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp create mode 100644 llvm/lib/Target/X86/X86FastPreTileConfig.cpp create mode 100644 llvm/lib/Transforms/Coroutines/CoroConditionalWrapper.cpp delete mode 100644 llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h delete mode 100644 llvm/lib/Transforms/Scalar/LoopUnswitch.cpp delete mode 100644 llvm/lib/Transforms/Scalar/LowerAtomic.cpp create mode 100644 llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp create mode 100644 llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp create mode 100644 llvm/lib/Transforms/Utils/LowerAtomic.cpp create mode 100644 llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp create mode 100644 llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp create mode 100644 llvm/lib/Transforms/Utils/MisExpect.cpp delete mode 100644 llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h delete mode 100644 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp delete mode 100644 llvm/lib/Transforms/Vectorize/VPlanPredicator.h create mode 100644 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp create mode 100644 llvm/lib/WindowsDriver/MSVCPaths.cpp delete mode 100644 llvm/tools/llvm-objcopy/COFF/COFFConfig.h delete mode 100644 llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp delete mode 100644 llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h delete mode 100644 llvm/tools/llvm-objcopy/COFF/Object.cpp delete mode 100644 llvm/tools/llvm-objcopy/COFF/Object.h delete mode 100644 llvm/tools/llvm-objcopy/COFF/Reader.cpp delete mode 100644 llvm/tools/llvm-objcopy/COFF/Reader.h delete mode 100644 llvm/tools/llvm-objcopy/COFF/Writer.cpp delete mode 100644 llvm/tools/llvm-objcopy/COFF/Writer.h delete mode 100644 llvm/tools/llvm-objcopy/CommonConfig.h delete mode 100644 llvm/tools/llvm-objcopy/ConfigManager.cpp delete mode 100644 llvm/tools/llvm-objcopy/ConfigManager.h delete mode 100644 llvm/tools/llvm-objcopy/ELF/ELFConfig.h delete mode 100644 llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp delete mode 100644 llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h delete mode 100644 llvm/tools/llvm-objcopy/ELF/Object.cpp delete mode 100644 llvm/tools/llvm-objcopy/ELF/Object.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOConfig.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOReader.cpp delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOReader.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp delete mode 100644 llvm/tools/llvm-objcopy/MachO/MachOWriter.h delete mode 100644 llvm/tools/llvm-objcopy/MachO/Object.cpp delete mode 100644 llvm/tools/llvm-objcopy/MachO/Object.h delete mode 100644 llvm/tools/llvm-objcopy/MultiFormatConfig.h create mode 100644 llvm/tools/llvm-objcopy/ObjcopyOptions.cpp create mode 100644 llvm/tools/llvm-objcopy/ObjcopyOptions.h delete mode 100644 llvm/tools/llvm-objcopy/llvm-objcopy.h delete mode 100644 llvm/tools/llvm-objcopy/wasm/Object.cpp delete mode 100644 llvm/tools/llvm-objcopy/wasm/Object.h delete mode 100644 llvm/tools/llvm-objcopy/wasm/Reader.cpp delete mode 100644 llvm/tools/llvm-objcopy/wasm/Reader.h delete mode 100644 llvm/tools/llvm-objcopy/wasm/WasmConfig.h delete mode 100644 
llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp delete mode 100644 llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h delete mode 100644 llvm/tools/llvm-objcopy/wasm/Writer.cpp delete mode 100644 llvm/tools/llvm-objcopy/wasm/Writer.h create mode 100644 llvm/tools/llvm-objdump/OffloadDump.cpp create mode 100644 llvm/tools/llvm-objdump/OffloadDump.h delete mode 100644 llvm/tools/llvm-pdbutil/FormatUtil.cpp delete mode 100644 llvm/tools/llvm-pdbutil/FormatUtil.h delete mode 100644 llvm/tools/llvm-pdbutil/InputFile.cpp delete mode 100644 llvm/tools/llvm-pdbutil/InputFile.h delete mode 100644 llvm/tools/llvm-pdbutil/LinePrinter.cpp delete mode 100644 llvm/tools/llvm-pdbutil/LinePrinter.h delete mode 100644 llvm/tools/opt/PassPrinters.cpp delete mode 100644 llvm/tools/opt/PassPrinters.h delete mode 100644 llvm/utils/TableGen/CodeBeadsGen.cpp create mode 100644 llvm/utils/TableGen/DXILEmitter.cpp create mode 100644 llvm/utils/TableGen/DecoderEmitter.cpp delete mode 100644 llvm/utils/TableGen/FixedLenDecoderEmitter.cpp create mode 100644 llvm/utils/TableGen/VarLenCodeEmitterGen.cpp create mode 100644 llvm/utils/TableGen/VarLenCodeEmitterGen.h create mode 100644 llvm/utils/TableGen/X86MnemonicTables.cpp (limited to 'llvm') diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 09d80841fa5d..2abc29851cd9 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -548,6 +548,13 @@ LLVMBool LLVMContextShouldDiscardValueNames(LLVMContextRef C); */ void LLVMContextSetDiscardValueNames(LLVMContextRef C, LLVMBool Discard); +/** + * Set whether the given context is in opaque pointer mode. + * + * @see LLVMContext::setOpaquePointers() + */ +void LLVMContextSetOpaquePointers(LLVMContextRef C, LLVMBool OpaquePointers); + /** * Destroy a context instance. * @@ -1391,9 +1398,9 @@ LLVMBool LLVMIsLiteralStruct(LLVMTypeRef StructTy); */ /** - * Obtain the type of elements within a sequential type. + * Obtain the element type of an array or vector type. * - * This works on array, vector, and pointer types. + * This currently also works for pointer types, but this usage is deprecated. * * @see llvm::SequentialType::getElementType() */ @@ -1442,6 +1449,22 @@ unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy); */ LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace); +/** + * Determine whether a pointer is opaque. + * + * True if this is an instance of an opaque PointerType. + * + * @see llvm::Type::isOpaquePointerTy() + */ +LLVMBool LLVMPointerTypeIsOpaque(LLVMTypeRef Ty); + +/** + * Create an opaque pointer type in a context. + * + * @see llvm::PointerType::get() + */ +LLVMTypeRef LLVMPointerTypeInContext(LLVMContextRef C, unsigned AddressSpace); + /** * Obtain the address space of a pointer type. * @@ -2088,12 +2111,24 @@ LLVMValueRef LLVMConstNamedStruct(LLVMTypeRef StructTy, LLVMValueRef *ConstantVals, unsigned Count); +/** + * Get element of a constant aggregate (struct, array or vector) at the + * specified index. Returns null if the index is out of range, or it's not + * possible to determine the element (e.g., because the constant is a + * constant expression.) + * + * @see llvm::Constant::getAggregateElement() + */ +LLVMValueRef LLVMGetAggregateElement(LLVMValueRef C, unsigned Idx); + /** * Get an element at specified index as a constant. 
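For orientation while reviewing this import: the new Core.h entry points in the hunk above can be exercised as in the following sketch. It is illustrative only, not part of the imported patch, and assumes nothing beyond the vendored llvm-c headers.

#include "llvm-c/Core.h"

int main(void) {
  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMContextSetOpaquePointers(Ctx, 1); /* opt in to opaque pointer mode */

  /* An opaque pointer type carries no element type, only an address space. */
  LLVMTypeRef Ptr = LLVMPointerTypeInContext(Ctx, /*AddressSpace=*/0);
  /* LLVMPointerTypeIsOpaque(Ptr) now returns a nonzero value. */

  /* LLVMGetAggregateElement subsumes the deprecated LLVMGetElementAsConstant
     and also handles struct and array constants; an out-of-range index
     yields NULL instead of asserting. */
  LLVMTypeRef I32 = LLVMInt32TypeInContext(Ctx);
  LLVMValueRef Elts[2] = {LLVMConstInt(I32, 1, 0), LLVMConstInt(I32, 2, 0)};
  LLVMValueRef Arr = LLVMConstArray(I32, Elts, 2);
  LLVMValueRef First = LLVMGetAggregateElement(Arr, 0); /* i32 1 */
  LLVMValueRef None = LLVMGetAggregateElement(Arr, 5);  /* NULL */
  (void)Ptr; (void)First; (void)None;

  LLVMContextDispose(Ctx);
  return 0;
}

Callers of the deprecated LLVMGetElementAsConstant can switch to LLVMGetAggregateElement without a behavior change for ConstantDataSequential values.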
* * @see ConstantDataSequential::getElementAsConstant() */ -LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx); +LLVM_ATTRIBUTE_C_DEPRECATED( + LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx), + "Use LLVMGetAggregateElement instead"); /** * Create a ConstantVector from values. @@ -2203,8 +2238,6 @@ LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant, LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant, LLVMValueRef VectorBConstant, LLVMValueRef MaskConstant); -LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList, - unsigned NumIdx); LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant, LLVMValueRef ElementValueConstant, unsigned *IdxList, unsigned NumIdx); @@ -3978,6 +4011,9 @@ LLVMValueRef LLVMBuildFPCast(LLVMBuilderRef, LLVMValueRef Val, LLVMValueRef LLVMBuildIntCast(LLVMBuilderRef, LLVMValueRef Val, /*Signed cast!*/ LLVMTypeRef DestTy, const char *Name); +LLVMOpcode LLVMGetCastOpcode(LLVMValueRef Src, LLVMBool SrcIsSigned, + LLVMTypeRef DestTy, LLVMBool DestIsSigned); + /* Comparisons */ LLVMValueRef LLVMBuildICmp(LLVMBuilderRef, LLVMIntPredicate Op, LLVMValueRef LHS, LLVMValueRef RHS, diff --git a/llvm/include/llvm-c/DisassemblerTypes.h b/llvm/include/llvm-c/DisassemblerTypes.h index 53baaef11033..6999a350ec91 100644 --- a/llvm/include/llvm-c/DisassemblerTypes.h +++ b/llvm/include/llvm-c/DisassemblerTypes.h @@ -38,15 +38,15 @@ typedef void *LLVMDisasmContextRef; * one operand with symbolic information. To determine the symbolic operand * information for each operand, the bytes for the specific operand in the * instruction are specified by the Offset parameter and its byte widith is the - * size parameter. For instructions sets with fixed widths and one symbolic - * operand per instruction, the Offset parameter will be zero and Size parameter - * will be the instruction width. The information is returned in TagBuf and is - * Triple specific with its specific information defined by the value of - * TagType for that Triple. If symbolic information is returned the function - * returns 1, otherwise it returns 0. + * OpSize parameter. For instructions sets with fixed widths and one symbolic + * operand per instruction, the Offset parameter will be zero and InstSize + * parameter will be the instruction width. The information is returned in + * TagBuf and is Triple specific with its specific information defined by the + * value of TagType for that Triple. If symbolic information is returned the + * function * returns 1, otherwise it returns 0. */ -typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC, - uint64_t Offset, uint64_t Size, +typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC, uint64_t Offset, + uint64_t OpSize, uint64_t InstSize, int TagType, void *TagBuf); /** diff --git a/llvm/include/llvm-c/Object.h b/llvm/include/llvm-c/Object.h index 9a9596aaa08c..f422c1ad224d 100644 --- a/llvm/include/llvm-c/Object.h +++ b/llvm/include/llvm-c/Object.h @@ -38,21 +38,23 @@ typedef struct LLVMOpaqueSymbolIterator *LLVMSymbolIteratorRef; typedef struct LLVMOpaqueRelocationIterator *LLVMRelocationIteratorRef; typedef enum { - LLVMBinaryTypeArchive, /**< Archive file. */ - LLVMBinaryTypeMachOUniversalBinary, /**< Mach-O Universal Binary file. */ - LLVMBinaryTypeCOFFImportFile, /**< COFF Import file. */ - LLVMBinaryTypeIR, /**< LLVM IR. */ - LLVMBinaryTypeWinRes, /**< Windows resource (.res) file. */ - LLVMBinaryTypeCOFF, /**< COFF Object file. 
*/ - LLVMBinaryTypeELF32L, /**< ELF 32-bit, little endian. */ - LLVMBinaryTypeELF32B, /**< ELF 32-bit, big endian. */ - LLVMBinaryTypeELF64L, /**< ELF 64-bit, little endian. */ - LLVMBinaryTypeELF64B, /**< ELF 64-bit, big endian. */ - LLVMBinaryTypeMachO32L, /**< MachO 32-bit, little endian. */ - LLVMBinaryTypeMachO32B, /**< MachO 32-bit, big endian. */ - LLVMBinaryTypeMachO64L, /**< MachO 64-bit, little endian. */ - LLVMBinaryTypeMachO64B, /**< MachO 64-bit, big endian. */ - LLVMBinaryTypeWasm, /**< Web Assembly. */ + LLVMBinaryTypeArchive, /**< Archive file. */ + LLVMBinaryTypeMachOUniversalBinary, /**< Mach-O Universal Binary file. */ + LLVMBinaryTypeCOFFImportFile, /**< COFF Import file. */ + LLVMBinaryTypeIR, /**< LLVM IR. */ + LLVMBinaryTypeWinRes, /**< Windows resource (.res) file. */ + LLVMBinaryTypeCOFF, /**< COFF Object file. */ + LLVMBinaryTypeELF32L, /**< ELF 32-bit, little endian. */ + LLVMBinaryTypeELF32B, /**< ELF 32-bit, big endian. */ + LLVMBinaryTypeELF64L, /**< ELF 64-bit, little endian. */ + LLVMBinaryTypeELF64B, /**< ELF 64-bit, big endian. */ + LLVMBinaryTypeMachO32L, /**< MachO 32-bit, little endian. */ + LLVMBinaryTypeMachO32B, /**< MachO 32-bit, big endian. */ + LLVMBinaryTypeMachO64L, /**< MachO 64-bit, little endian. */ + LLVMBinaryTypeMachO64B, /**< MachO 64-bit, big endian. */ + LLVMBinaryTypeWasm, /**< Web Assembly. */ + LLVMBinaryTypeOffload, /**< Offloading fatbinary. */ + } LLVMBinaryType; /** diff --git a/llvm/include/llvm-c/Orc.h b/llvm/include/llvm-c/Orc.h index e2f30b7cdf45..0dcfb06865aa 100644 --- a/llvm/include/llvm-c/Orc.h +++ b/llvm/include/llvm-c/Orc.h @@ -54,6 +54,7 @@ typedef uint64_t LLVMOrcExecutorAddress; * Represents generic linkage flags for a symbol definition. */ typedef enum { + LLVMJITSymbolGenericFlagsNone = 0, LLVMJITSymbolGenericFlagsExported = 1U << 0, LLVMJITSymbolGenericFlagsWeak = 1U << 1, LLVMJITSymbolGenericFlagsCallable = 1U << 2, @@ -122,13 +123,13 @@ typedef LLVMOrcCSymbolFlagsMapPair *LLVMOrcCSymbolFlagsMapPairs; typedef struct { LLVMOrcSymbolStringPoolEntryRef Name; LLVMJITEvaluatedSymbol Sym; -} LLVMJITCSymbolMapPair; +} LLVMOrcCSymbolMapPair; /** * Represents a list of (SymbolStringPtr, JITEvaluatedSymbol) pairs that can be * used to construct a SymbolMap. */ -typedef LLVMJITCSymbolMapPair *LLVMOrcCSymbolMapPairs; +typedef LLVMOrcCSymbolMapPair *LLVMOrcCSymbolMapPairs; /** * Represents a SymbolAliasMapEntry @@ -202,6 +203,22 @@ typedef enum { LLVMOrcJITDylibLookupFlagsMatchAllSymbols } LLVMOrcJITDylibLookupFlags; +/** + * An element type for a JITDylib search order. + */ +typedef struct { + LLVMOrcJITDylibRef JD; + LLVMOrcJITDylibLookupFlags JDLookupFlags; +} LLVMOrcCJITDylibSearchOrderElement; + +/** + * A JITDylib search order. + * + * The list is terminated with an element containing a null pointer for the JD + * field. + */ +typedef LLVMOrcCJITDylibSearchOrderElement *LLVMOrcCJITDylibSearchOrder; + /** * Symbol lookup flags for lookup sets. This should be kept in sync with * llvm::orc::SymbolLookupFlags. @@ -340,6 +357,14 @@ typedef LLVMErrorRef (*LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction)( LLVMOrcJITDylibRef JD, LLVMOrcJITDylibLookupFlags JDLookupFlags, LLVMOrcCLookupSet LookupSet, size_t LookupSetSize); +/** + * Disposer for a custom generator. + * + * Will be called by ORC when the JITDylib that the generator is attached to + * is destroyed. + */ +typedef void (*LLVMOrcDisposeCAPIDefinitionGeneratorFunction)(void *Ctx); + /** * Predicate function for SymbolStringPoolEntries. 
*/ @@ -494,6 +519,58 @@ void LLVMOrcSymbolStringPoolClearDeadEntries(LLVMOrcSymbolStringPoolRef SSP); LLVMOrcSymbolStringPoolEntryRef LLVMOrcExecutionSessionIntern(LLVMOrcExecutionSessionRef ES, const char *Name); +/** + * Callback type for ExecutionSession lookups. + * + * If Err is LLVMErrorSuccess then Result will contain a pointer to a + * list of ( SymbolStringPtr, JITEvaluatedSymbol ) pairs of length NumPairs. + * + * If Err is a failure value then Result and Ctx are undefined and should + * not be accessed. The Callback is responsible for handling the error + * value (e.g. by calling LLVMGetErrorMessage + LLVMDisposeErrorMessage). + * + * The caller retains ownership of the Result array and will release all + * contained symbol names. Clients are responsible for retaining any symbol + * names that they wish to hold after the function returns. + */ +typedef void (*LLVMOrcExecutionSessionLookupHandleResultFunction)( + LLVMErrorRef Err, LLVMOrcCSymbolMapPairs Result, size_t NumPairs, + void *Ctx); + +/** + * Look up symbols in an execution session. + * + * This is a wrapper around the general ExecutionSession::lookup function. + * + * The SearchOrder argument contains a list of (JITDylibs, JITDylibSearchFlags) + * pairs that describe the search order. The JITDylibs will be searched in the + * given order to try to find the symbols in the Symbols argument. + * + * The Symbols argument should contain a null-terminated array of + * (SymbolStringPtr, SymbolLookupFlags) pairs describing the symbols to be + * searched for. This function takes ownership of the elements of the Symbols + * array. The Name fields of the Symbols elements are taken to have been + * retained by the client for this function. The client should *not* release the + * Name fields, but are still responsible for destroying the array itself. + * + * The HandleResult function will be called once all searched for symbols have + * been found, or an error occurs. The HandleResult function will be passed an + * LLVMErrorRef indicating success or failure, and (on success) a + * null-terminated LLVMOrcCSymbolMapPairs array containing the function result, + * and the Ctx value passed to the lookup function. + * + * The client is fully responsible for managing the lifetime of the Ctx object. + * A common idiom is to allocate the context prior to the lookup and deallocate + * it in the handler. + * + * THIS API IS EXPERIMENTAL AND LIKELY TO CHANGE IN THE NEAR FUTURE! + */ +void LLVMOrcExecutionSessionLookup( + LLVMOrcExecutionSessionRef ES, LLVMOrcLookupKind K, + LLVMOrcCJITDylibSearchOrder SearchOrder, size_t SearchOrderSize, + LLVMOrcCLookupSet Symbols, size_t SymbolsSize, + LLVMOrcExecutionSessionLookupHandleResultFunction HandleResult, void *Ctx); + /** * Increments the ref-count for a SymbolStringPool entry. */ @@ -504,6 +581,11 @@ void LLVMOrcRetainSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S); */ void LLVMOrcReleaseSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S); +/** + * Return the c-string for the given symbol. This string will remain valid until + * the entry is freed (once all LLVMOrcSymbolStringPoolEntryRefs have been + * released). + */ const char *LLVMOrcSymbolStringPoolEntryStr(LLVMOrcSymbolStringPoolEntryRef S); /** @@ -547,7 +629,7 @@ void LLVMOrcDisposeMaterializationUnit(LLVMOrcMaterializationUnitRef MU); * unit. This function takes ownership of the elements of the Syms array. The * Name fields of the array elements are taken to have been retained for this * function. 
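A sketch of how a client might drive this asynchronous lookup. The handler and the ES/JD values are illustrative, and LLVMOrcCLookupSetElement is the pre-existing Orc.h type (assumed unchanged by this import):

#include "llvm-c/Error.h"
#include "llvm-c/Orc.h"
#include <stdio.h>

/* Called once the lookup completes or fails. The lookup machinery keeps
 * ownership of the Result array and its names; we only read from it. */
static void handleResult(LLVMErrorRef Err, LLVMOrcCSymbolMapPairs Result,
                         size_t NumPairs, void *Ctx) {
  if (Err) {
    char *Msg = LLVMGetErrorMessage(Err); /* consumes Err */
    fprintf(stderr, "lookup failed: %s\n", Msg);
    LLVMDisposeErrorMessage(Msg);
    return;
  }
  for (size_t I = 0; I != NumPairs; ++I)
    printf("%s -> 0x%llx\n", LLVMOrcSymbolStringPoolEntryStr(Result[I].Name),
           (unsigned long long)Result[I].Sym.Address);
}

/* Assumes ES and JD were created elsewhere, e.g. via an LLJIT instance. */
static void lookupMain(LLVMOrcExecutionSessionRef ES, LLVMOrcJITDylibRef JD) {
  LLVMOrcCJITDylibSearchOrderElement Order[] = {
      {JD, LLVMOrcJITDylibLookupFlagsMatchExportedSymbolsOnly}};
  LLVMOrcCLookupSetElement Syms[] = {
      {LLVMOrcExecutionSessionIntern(ES, "main"),
       LLVMOrcSymbolLookupFlagsRequiredSymbol}};
  /* Takes ownership of the Name refs in Syms; we keep the arrays themselves. */
  LLVMOrcExecutionSessionLookup(ES, LLVMOrcLookupKindStatic, Order, 1, Syms, 1,
                                handleResult, /*Ctx=*/NULL);
}

A common idiom, per the doc comment above, is to heap-allocate the Ctx object before the call and free it at the end of the handler.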
The client should *not* release the elements of the array, but is - * still responsible for destroyingthe array itself. + * still responsible for destroying the array itself. * * The InitSym argument indicates whether or not this MaterializationUnit * contains static initializers. If three are no static initializers (the common @@ -701,7 +783,7 @@ LLVMOrcMaterializationResponsibilityGetRequestedSymbols( */ void LLVMOrcDisposeSymbols(LLVMOrcSymbolStringPoolEntryRef *Symbols); -/* +/** * Notifies the target JITDylib that the given symbols have been resolved. * This will update the given symbols' addresses in the JITDylib, and notify * any pending queries on the given symbols of their resolution. The given @@ -901,9 +983,27 @@ void LLVMOrcJITDylibAddGenerator(LLVMOrcJITDylibRef JD, /** * Create a custom generator. + * + * The F argument will be used to implement the DefinitionGenerator's + * tryToGenerate method (see + * LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction). + * + * Ctx is a context object that will be passed to F. This argument is + * permitted to be null. + * + * Dispose is the disposal function for Ctx. This argument is permitted to be + * null (in which case the client is responsible for the lifetime of Ctx). */ LLVMOrcDefinitionGeneratorRef LLVMOrcCreateCustomCAPIDefinitionGenerator( - LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction F, void *Ctx); + LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction F, void *Ctx, + LLVMOrcDisposeCAPIDefinitionGeneratorFunction Dispose); + +/** + * Continue a lookup that was suspended in a generator (see + * LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction). + */ +void LLVMOrcLookupStateContinueLookup(LLVMOrcLookupStateRef S, + LLVMErrorRef Err); /** * Get a DynamicLibrarySearchGenerator that will reflect process symbols into diff --git a/llvm/include/llvm-c/TargetMachine.h b/llvm/include/llvm-c/TargetMachine.h index 23c8c63ff0b4..bfbe1421a356 100644 --- a/llvm/include/llvm-c/TargetMachine.h +++ b/llvm/include/llvm-c/TargetMachine.h @@ -136,7 +136,9 @@ void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T, wraps several c++ only classes (among them a file stream). Returns any error in ErrorMessage. Use LLVMDisposeMessage to dispose the message. */ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M, - char *Filename, LLVMCodeGenFileType codegen, char **ErrorMessage); + const char *Filename, + LLVMCodeGenFileType codegen, + char **ErrorMessage); /** Compile the LLVM IR stored in \p M and store the result in \p OutMemBuf. */ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, LLVMModuleRef M, diff --git a/llvm/include/llvm-c/Transforms/Coroutines.h b/llvm/include/llvm-c/Transforms/Coroutines.h deleted file mode 100644 index 03b6822033c9..000000000000 --- a/llvm/include/llvm-c/Transforms/Coroutines.h +++ /dev/null @@ -1,56 +0,0 @@ -/*===-- Coroutines.h - Coroutines Library C Interface -----------*- C++ -*-===*\ -|* *| -|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| -|* Exceptions. *| -|* See https://llvm.org/LICENSE.txt for license information. *| -|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| -|* *| -|*===----------------------------------------------------------------------===*| -|* *| -|* This header declares the C interface to libLLVMCoroutines.a, which *| -|* implements various scalar transformations of the LLVM IR. *| -|* *| -|* Many exotic languages can interoperate with C code but have a harder time *| -|* with C++ due to name mangling. 
So in addition to C, this interface enables *| -|* tools written in such languages. *| -|* *| -\*===----------------------------------------------------------------------===*/ - -#ifndef LLVM_C_TRANSFORMS_COROUTINES_H -#define LLVM_C_TRANSFORMS_COROUTINES_H - -#include "llvm-c/ExternC.h" -#include "llvm-c/Types.h" -#include "llvm-c/Transforms/PassManagerBuilder.h" - -LLVM_C_EXTERN_C_BEGIN - -/** - * @defgroup LLVMCTransformsCoroutines Coroutine transformations - * @ingroup LLVMCTransforms - * - * @{ - */ - -/** See llvm::createCoroEarlyLegacyPass function. */ -void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM); - -/** See llvm::createCoroSplitLegacyPass function. */ -void LLVMAddCoroSplitPass(LLVMPassManagerRef PM); - -/** See llvm::createCoroElideLegacyPass function. */ -void LLVMAddCoroElidePass(LLVMPassManagerRef PM); - -/** See llvm::createCoroCleanupLegacyPass function. */ -void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM); - -/** See llvm::addCoroutinePassesToExtensionPoints. */ -void LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(LLVMPassManagerBuilderRef PMB); - -/** - * @} - */ - -LLVM_C_EXTERN_C_END - -#endif diff --git a/llvm/include/llvm-c/Transforms/IPO.h b/llvm/include/llvm-c/Transforms/IPO.h index 3f2cadf32366..c806156281bd 100644 --- a/llvm/include/llvm-c/Transforms/IPO.h +++ b/llvm/include/llvm-c/Transforms/IPO.h @@ -27,9 +27,6 @@ LLVM_C_EXTERN_C_BEGIN * @{ */ -/** See llvm::createArgumentPromotionPass function. */ -void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM); - /** See llvm::createConstantMergePass function. */ void LLVMAddConstantMergePass(LLVMPassManagerRef PM); diff --git a/llvm/include/llvm-c/Transforms/PassManagerBuilder.h b/llvm/include/llvm-c/Transforms/PassManagerBuilder.h index 6e13e18e063b..3ba75440129a 100644 --- a/llvm/include/llvm-c/Transforms/PassManagerBuilder.h +++ b/llvm/include/llvm-c/Transforms/PassManagerBuilder.h @@ -72,12 +72,6 @@ void LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM); -/** See llvm::PassManagerBuilder::populateLTOPassManager. */ -void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, - LLVMPassManagerRef PM, - LLVMBool Internalize, - LLVMBool RunInliner); - /** * @} */ diff --git a/llvm/include/llvm-c/Transforms/Scalar.h b/llvm/include/llvm-c/Transforms/Scalar.h index ba142508bbe4..1d0944799710 100644 --- a/llvm/include/llvm-c/Transforms/Scalar.h +++ b/llvm/include/llvm-c/Transforms/Scalar.h @@ -94,9 +94,6 @@ void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM); /** See llvm::createLoopUnrollAndJamPass function. */ void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM); -/** See llvm::createLoopUnswitchPass function. */ -void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM); - /** See llvm::createLowerAtomicPass function. */ void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM); diff --git a/llvm/include/llvm-c/blake3.h b/llvm/include/llvm-c/blake3.h new file mode 100644 index 000000000000..679477c3aa7f --- /dev/null +++ b/llvm/include/llvm-c/blake3.h @@ -0,0 +1,79 @@ +/*===-- llvm-c/blake3.h - BLAKE3 C Interface ----------------------*- C -*-===*\ +|* *| +|* Released into the public domain with CC0 1.0 *| +|* See 'llvm/lib/Support/BLAKE3/LICENSE' for info. *| +|* SPDX-License-Identifier: CC0-1.0 *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to LLVM's BLAKE3 implementation. 
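Typical use of the C interface declared by this new header (its full body follows below) would look like this sketch; it is illustrative only and assumes a build that links the vendored BLAKE3 support library:

#include "llvm-c/blake3.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  llvm_blake3_hasher H;
  llvm_blake3_hasher_init(&H);

  /* Hashing is incremental: update() may be called any number of times. */
  const char *Msg = "hello world";
  llvm_blake3_hasher_update(&H, Msg, strlen(Msg));

  uint8_t Digest[LLVM_BLAKE3_OUT_LEN]; /* 32 bytes by default */
  llvm_blake3_hasher_finalize(&H, Digest, sizeof(Digest));

  for (size_t I = 0; I < sizeof(Digest); ++I)
    printf("%02x", Digest[I]);
  printf("\n");
  return 0;
}

The keyed and derive-key initializers declared below follow the same update/finalize pattern; finalize_seek additionally allows reading the extended output stream at an arbitrary offset.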
*| +|* Original BLAKE3 C API: https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c *| +|* *| +|* Symbols are prefixed with 'llvm' to avoid a potential conflict with *| +|* another BLAKE3 version within the same program. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_BLAKE3_H +#define LLVM_C_BLAKE3_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LLVM_BLAKE3_VERSION_STRING "1.3.1" +#define LLVM_BLAKE3_KEY_LEN 32 +#define LLVM_BLAKE3_OUT_LEN 32 +#define LLVM_BLAKE3_BLOCK_LEN 64 +#define LLVM_BLAKE3_CHUNK_LEN 1024 +#define LLVM_BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of llvm_blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[LLVM_BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} llvm_blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + llvm_blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(LLVM_BLAKE3_MAX_DEPTH + 1) * LLVM_BLAKE3_OUT_LEN]; +} llvm_blake3_hasher; + +const char *llvm_blake3_version(void); +void llvm_blake3_hasher_init(llvm_blake3_hasher *self); +void llvm_blake3_hasher_init_keyed(llvm_blake3_hasher *self, + const uint8_t key[LLVM_BLAKE3_KEY_LEN]); +void llvm_blake3_hasher_init_derive_key(llvm_blake3_hasher *self, + const char *context); +void llvm_blake3_hasher_init_derive_key_raw(llvm_blake3_hasher *self, + const void *context, + size_t context_len); +void llvm_blake3_hasher_update(llvm_blake3_hasher *self, const void *input, + size_t input_len); +void llvm_blake3_hasher_finalize(const llvm_blake3_hasher *self, uint8_t *out, + size_t out_len); +void llvm_blake3_hasher_finalize_seek(const llvm_blake3_hasher *self, + uint64_t seek, uint8_t *out, + size_t out_len); +void llvm_blake3_hasher_reset(llvm_blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* LLVM_C_BLAKE3_H */ diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 17b57de7b0aa..cdedb6ece992 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -155,7 +155,8 @@ struct APFloatBase { S_IEEEdouble, S_x87DoubleExtended, S_IEEEquad, - S_PPCDoubleDouble + S_PPCDoubleDouble, + S_MaxSemantics = S_PPCDoubleDouble }; static const llvm::fltSemantics &EnumToSemantics(Semantics S); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index b1fc85d3c09d..4155cb260a2a 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -486,7 +486,7 @@ public: return (Ones > 0) && ((Ones + countLeadingZerosSlowCase()) == BitWidth); } - /// Return true if this APInt value contains a sequence of ones with + /// Return true if this APInt value contains a non-empty sequence of ones with /// the remainder zero. bool isShiftedMask() const { if (isSingleWord()) @@ -496,6 +496,23 @@ public: return (Ones + LeadZ + countTrailingZeros()) == BitWidth; } + /// Return true if this APInt value contains a non-empty sequence of ones with + /// the remainder zero. 
If true, \p MaskIdx will specify the index of the + /// lowest set bit and \p MaskLen is updated to specify the length of the + /// mask, else neither are updated. + bool isShiftedMask(unsigned &MaskIdx, unsigned &MaskLen) const { + if (isSingleWord()) + return isShiftedMask_64(U.VAL, MaskIdx, MaskLen); + unsigned Ones = countPopulationSlowCase(); + unsigned LeadZ = countLeadingZerosSlowCase(); + unsigned TrailZ = countTrailingZerosSlowCase(); + if ((Ones + LeadZ + TrailZ) != BitWidth) + return false; + MaskLen = Ones; + MaskIdx = TrailZ; + return true; + } + /// Compute an APInt containing numBits highbits from this APInt. /// /// Get an APInt with the same BitWidth as this APInt, just zero mask the low @@ -1201,7 +1218,7 @@ public: /// Truncate to new width. /// /// Truncate the APInt to a specified width. It is an error to specify a width - /// that is greater than or equal to the current width. + /// that is greater than the current width. APInt trunc(unsigned width) const; /// Truncate to new width with unsigned saturation. @@ -1221,7 +1238,7 @@ public: /// /// This operation sign extends the APInt to a new width. If the high order /// bit is set, the fill on the left will be done with 1 bits, otherwise zero. - /// It is an error to specify a width that is less than or equal to the + /// It is an error to specify a width that is less than the /// current width. APInt sext(unsigned width) const; @@ -1229,7 +1246,7 @@ public: /// /// This operation zero extends the APInt to a new width. The high order bits /// are filled with 0 bits. It is an error to specify a width that is less - /// than or equal to the current width. + /// than the current width. APInt zext(unsigned width) const; /// Sign extend or truncate to width @@ -1244,24 +1261,6 @@ public: /// extended, truncated, or left alone to make it that width. APInt zextOrTrunc(unsigned width) const; - /// Truncate to width - /// - /// Make this APInt have the bit width given by \p width. The value is - /// truncated or left alone to make it that width. - APInt truncOrSelf(unsigned width) const; - - /// Sign extend or truncate to width - /// - /// Make this APInt have the bit width given by \p width. The value is sign - /// extended, or left alone to make it that width. - APInt sextOrSelf(unsigned width) const; - - /// Zero extend or truncate to width - /// - /// Make this APInt have the bit width given by \p width. The value is zero - /// extended, or left alone to make it that width. - APInt zextOrSelf(unsigned width) const; - /// @} /// \name Bit Manipulation Operators /// @{ @@ -1489,6 +1488,11 @@ public: /// equivalent of the string given by \p str. static unsigned getBitsNeeded(StringRef str, uint8_t radix); + /// Get the bits that are sufficient to represent the string value. This may + /// over estimate the amount of bits required, but it does not require + /// parsing the value in the string. + static unsigned getSufficientBitsNeeded(StringRef Str, uint8_t Radix); + /// The APInt version of the countLeadingZeros functions in /// MathExtras.h. /// @@ -2235,12 +2239,16 @@ Optional GetMostSignificantDifferentBit(const APInt &A, /// Splat/Merge neighboring bits to widen/narrow the bitmask represented /// by \param A to \param NewBitWidth bits. /// +/// MatchAnyBits: (Default) /// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011 /// e.g. ScaleBitMask(0b00011011, 4) -> 0b0111 -/// A.getBitwidth() or NewBitWidth must be a whole multiples of the other. /// -/// TODO: Do we need a mode where all bits must be set when merging down? 
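The strengthened isShiftedMask and the new ScaleBitMask mode above behave as in this sketch (values chosen to match the doc comments; not part of the patch):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void shiftedMaskSketch() {
  APInt V(32, 0x0FF0); // a run of 8 ones starting at bit 4
  unsigned MaskIdx = 0, MaskLen = 0;
  if (V.isShiftedMask(MaskIdx, MaskLen)) {
    // MaskIdx == 4, MaskLen == 8; both are left untouched on failure.
  }

  // MatchAllBits merging: a wide bit is set only if every narrow bit was.
  APInt A(8, 0b00011011);
  APInt AnyBits = APIntOps::ScaleBitMask(A, 4);                        // 0b0111
  APInt AllBits = APIntOps::ScaleBitMask(A, 4, /*MatchAllBits=*/true); // 0b0001
  (void)AnyBits; (void)AllBits;
}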
-APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth);
+/// MatchAllBits:
+/// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011
+/// e.g. ScaleBitMask(0b00011011, 4) -> 0b0001
+/// A.getBitWidth() or NewBitWidth must be a whole multiple of the other.
+APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth,
+                   bool MatchAllBits = false);
 } // namespace APIntOps
 
 // See friend declaration above. This additional declaration is required in
diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h
new file mode 100644
index 000000000000..1953680d5222
--- /dev/null
+++ b/llvm/include/llvm/ADT/AddressRanges.h
@@ -0,0 +1,79 @@
+//===- AddressRanges.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ADDRESSRANGES_H
+#define LLVM_ADT_ADDRESSRANGES_H
+
+#include "llvm/ADT/Optional.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+namespace llvm {
+
+/// A class that represents an address range. The range is specified using
+/// a start and an end address: [Start, End).
+class AddressRange {
+public:
+  AddressRange() {}
+  AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) {
+    assert(Start <= End);
+  }
+  uint64_t start() const { return Start; }
+  uint64_t end() const { return End; }
+  uint64_t size() const { return End - Start; }
+  bool contains(uint64_t Addr) const { return Start <= Addr && Addr < End; }
+  bool intersects(const AddressRange &R) const {
+    return Start < R.End && R.Start < End;
+  }
+  bool operator==(const AddressRange &R) const {
+    return Start == R.Start && End == R.End;
+  }
+  bool operator!=(const AddressRange &R) const { return !(*this == R); }
+  bool operator<(const AddressRange &R) const {
+    return std::make_pair(Start, End) < std::make_pair(R.Start, R.End);
+  }
+
+private:
+  uint64_t Start = 0;
+  uint64_t End = 0;
+};
+
+/// The AddressRanges class helps normalize address range collections.
+/// This class keeps a sorted vector of AddressRange objects and can perform
+/// insertions and searches efficiently. The address ranges are always sorted
+/// and never contain any invalid or empty address ranges. Intersecting
+/// address ranges are combined during insertion.
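Taken together with the AddressRanges collection defined just below, the class above composes as in this sketch (illustrative only, not part of the patch):

#include "llvm/ADT/AddressRanges.h"
using namespace llvm;

void addressRangesSketch() {
  AddressRanges Ranges;
  Ranges.insert(AddressRange(0x1000, 0x2000));
  Ranges.insert(AddressRange(0x1800, 0x3000)); // intersects, so it is merged
  // Ranges.size() == 1 and the single entry is [0x1000, 0x3000).

  bool In = Ranges.contains(0x2abc); // true
  if (Optional<AddressRange> R = Ranges.getRangeThatContains(0x2abc)) {
    // R->start() == 0x1000, R->end() == 0x3000, R->size() == 0x2000
  }
  (void)In;
}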
+class AddressRanges {
+protected:
+  using Collection = std::vector<AddressRange>;
+  Collection Ranges;
+
+public:
+  void clear() { Ranges.clear(); }
+  bool empty() const { return Ranges.empty(); }
+  bool contains(uint64_t Addr) const;
+  bool contains(AddressRange Range) const;
+  Optional<AddressRange> getRangeThatContains(uint64_t Addr) const;
+  void insert(AddressRange Range);
+  void reserve(size_t Capacity) { Ranges.reserve(Capacity); }
+  size_t size() const { return Ranges.size(); }
+  bool operator==(const AddressRanges &RHS) const {
+    return Ranges == RHS.Ranges;
+  }
+  const AddressRange &operator[](size_t i) const {
+    assert(i < Ranges.size());
+    return Ranges[i];
+  }
+  Collection::const_iterator begin() const { return Ranges.begin(); }
+  Collection::const_iterator end() const { return Ranges.end(); }
+};
+
+} // namespace llvm
+
+#endif // LLVM_ADT_ADDRESSRANGES_H
diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h
index b6896395dae8..ee35a5686fc4 100644
--- a/llvm/include/llvm/ADT/ArrayRef.h
+++ b/llvm/include/llvm/ADT/ArrayRef.h
@@ -25,6 +25,7 @@
 #include <vector>
 
 namespace llvm {
+  template <typename T> class LLVM_NODISCARD MutableArrayRef;
 
   /// ArrayRef - Represent a constant reference to an array (0 or more elements
   /// consecutively in memory), i.e. a start pointer and a length. It allows
@@ -175,10 +176,10 @@
     }
 
     // copy - Allocate copy in Allocator and return ArrayRef<T> to it.
-    template <typename Allocator> ArrayRef<T> copy(Allocator &A) {
+    template <typename Allocator> MutableArrayRef<T> copy(Allocator &A) {
       T *Buff = A.template Allocate<T>(Length);
       std::uninitialized_copy(begin(), end(), Buff);
-      return ArrayRef<T>(Buff, Length);
+      return MutableArrayRef<T>(Buff, Length);
     }
 
     /// equals - Check for element-wise equality.
@@ -539,6 +540,42 @@
     return MutableArrayRef<T>(data, length);
   }
 
+  /// Construct a MutableArrayRef from a SmallVector.
+  template <typename T>
+  MutableArrayRef<T> makeMutableArrayRef(SmallVectorImpl<T> &Vec) {
+    return Vec;
+  }
+
+  /// Construct a MutableArrayRef from a SmallVector.
+  template <typename T, unsigned N>
+  MutableArrayRef<T> makeMutableArrayRef(SmallVector<T, N> &Vec) {
+    return Vec;
+  }
+
+  /// Construct a MutableArrayRef from a std::vector.
+  template <typename T>
+  MutableArrayRef<T> makeMutableArrayRef(std::vector<T> &Vec) {
+    return Vec;
+  }
+
+  /// Construct a MutableArrayRef from a std::array.
+  template <typename T, std::size_t N>
+  MutableArrayRef<T> makeMutableArrayRef(std::array<T, N> &Arr) {
+    return Arr;
+  }
+
+  /// Construct a MutableArrayRef from a MutableArrayRef (no-op) (const)
+  template <typename T>
+  MutableArrayRef<T> makeMutableArrayRef(const MutableArrayRef<T> &Vec) {
+    return Vec;
+  }
+
+  /// Construct a MutableArrayRef from a C array.
+  template <typename T, size_t N>
+  MutableArrayRef<T> makeMutableArrayRef(T (&Arr)[N]) {
+    return MutableArrayRef<T>(Arr);
+  }
+
   /// @}
   /// @name ArrayRef Comparison Operators
   /// @{
diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h
index 89e5508e08e1..205da1240d44 100644
--- a/llvm/include/llvm/ADT/BitmaskEnum.h
+++ b/llvm/include/llvm/ADT/BitmaskEnum.h
@@ -77,7 +77,7 @@
 /// Get a bitmask with 1s in all places up to the high-order bit of E's largest
 /// value.
-template <typename E> std::underlying_type_t<E> Mask() {
+template <typename E> constexpr std::underlying_type_t<E> Mask() {
   // On overflow, NextPowerOf2 returns zero with the type uint64_t, so
   // subtracting 1 gives us the mask with all bits set, like we want.
   return NextPowerOf2(static_cast<std::underlying_type_t<E>>(
@@ -87,7 +87,7 @@
 /// Check that Val is in range for E, and return Val cast to E's underlying
 /// type.
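With Mask/Underlying above and the operators in the following hunk now constexpr, bitmask enums become usable in constant expressions. A sketch using the documented BitmaskEnum.h macros (names here are illustrative):

#include "llvm/ADT/BitmaskEnum.h"

namespace sketch {
enum Flags : unsigned {
  None = 0,
  Read = 1 << 0,
  Write = 1 << 1,
  Exec = 1 << 2,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Exec)
};
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

// Before this change the operators were runtime-only; now the whole
// expression folds at compile time and can feed a static_assert.
constexpr Flags ReadWrite = Read | Write;
static_assert((ReadWrite & Exec) == None, "no exec bit expected");
} // namespace sketch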
-template std::underlying_type_t Underlying(E Val) { +template constexpr std::underlying_type_t Underlying(E Val) { auto U = static_cast>(Val); assert(U >= 0 && "Negative enum values are not allowed."); assert(U <= Mask() && "Enum value too large (or largest val too small?)"); @@ -99,22 +99,22 @@ constexpr unsigned bitWidth(uint64_t Value) { } template ::value>> -E operator~(E Val) { +constexpr E operator~(E Val) { return static_cast(~Underlying(Val) & Mask()); } template ::value>> -E operator|(E LHS, E RHS) { +constexpr E operator|(E LHS, E RHS) { return static_cast(Underlying(LHS) | Underlying(RHS)); } template ::value>> -E operator&(E LHS, E RHS) { +constexpr E operator&(E LHS, E RHS) { return static_cast(Underlying(LHS) & Underlying(RHS)); } template ::value>> -E operator^(E LHS, E RHS) { +constexpr E operator^(E LHS, E RHS) { return static_cast(Underlying(LHS) ^ Underlying(RHS)); } diff --git a/llvm/include/llvm/ADT/BreadthFirstIterator.h b/llvm/include/llvm/ADT/BreadthFirstIterator.h index 1312b5f91e83..807b0a92c48c 100644 --- a/llvm/include/llvm/ADT/BreadthFirstIterator.h +++ b/llvm/include/llvm/ADT/BreadthFirstIterator.h @@ -80,7 +80,7 @@ private: inline void toNext() { Optional Head = VisitQueue.front(); - QueueElement H = Head.getValue(); + QueueElement H = *Head; NodeRef Node = H.first; Optional &ChildIt = H.second; diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 7673b66ca42a..c14414c46419 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -137,6 +137,7 @@ public: } } assert(NumEntries == 0 && "Node count imbalance!"); + (void)NumEntries; } setNumEntries(0); setNumTombstones(0); diff --git a/llvm/include/llvm/ADT/EpochTracker.h b/llvm/include/llvm/ADT/EpochTracker.h index b06888494466..b46989bc5111 100644 --- a/llvm/include/llvm/ADT/EpochTracker.h +++ b/llvm/include/llvm/ADT/EpochTracker.h @@ -34,10 +34,10 @@ namespace llvm { /// is still valid. /// class DebugEpochBase { - uint64_t Epoch; + uint64_t Epoch = 0; public: - DebugEpochBase() : Epoch(0) {} + DebugEpochBase() = default; /// Calling incrementEpoch invalidates all handles pointing into the /// calling instance. diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index f12b683ead2d..4f98b84cf97d 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -161,7 +161,8 @@ public: // /// iterator* - Provides a way to iterate over all values in the set. - using iterator = typename std::set::const_iterator; + using iterator = + typename std::set::const_iterator; iterator begin() const { return TheMapping.begin(); } iterator end() const { return TheMapping.end(); } diff --git a/llvm/include/llvm/ADT/FloatingPointMode.h b/llvm/include/llvm/ADT/FloatingPointMode.h index 9cc69b8a8344..59ccea1f9d44 100644 --- a/llvm/include/llvm/ADT/FloatingPointMode.h +++ b/llvm/include/llvm/ADT/FloatingPointMode.h @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// /// /// \file -/// Utilities for dealing with flags related to floating point mode controls. +/// Utilities for dealing with flags related to floating point properties and +/// mode controls. /// //===----------------------------------------------------------------------===/ @@ -193,4 +194,29 @@ void DenormalMode::print(raw_ostream &OS) const { } +/// Floating-point class tests, supported by 'is_fpclass' intrinsic. 
Actual +/// test may be an OR combination of basic tests. +enum FPClassTest { + fcSNan = 0x0001, + fcQNan = 0x0002, + fcNegInf = 0x0004, + fcNegNormal = 0x0008, + fcNegSubnormal = 0x0010, + fcNegZero = 0x0020, + fcPosZero = 0x0040, + fcPosSubnormal = 0x0080, + fcPosNormal = 0x0100, + fcPosInf = 0x0200, + + fcNan = fcSNan | fcQNan, + fcInf = fcPosInf | fcNegInf, + fcNormal = fcPosNormal | fcNegNormal, + fcSubnormal = fcPosSubnormal | fcNegSubnormal, + fcZero = fcPosZero | fcNegZero, + fcPosFinite = fcPosNormal | fcPosSubnormal | fcPosZero, + fcNegFinite = fcNegNormal | fcNegSubnormal | fcNegZero, + fcFinite = fcPosFinite | fcNegFinite, + fcAllFlags = fcNan | fcInf | fcFinite +}; + #endif // LLVM_ADT_FLOATINGPOINTMODE_H diff --git a/llvm/include/llvm/ADT/FoldingSet.h b/llvm/include/llvm/ADT/FoldingSet.h index a8707f0ee81e..ec276d41da80 100644 --- a/llvm/include/llvm/ADT/FoldingSet.h +++ b/llvm/include/llvm/ADT/FoldingSet.h @@ -16,12 +16,14 @@ #ifndef LLVM_ADT_FOLDINGSET_H #define LLVM_ADT_FOLDINGSET_H +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/Allocator.h" #include #include #include +#include #include namespace llvm { @@ -255,8 +257,8 @@ template struct DefaultFoldingSetTrait { /// through template specialization the behavior can be tailored for specific /// types. Combined with the FoldingSetNodeWrapper class, one can add objects /// to FoldingSets that were not originally designed to have that behavior. -template struct FoldingSetTrait - : public DefaultFoldingSetTrait {}; +template +struct FoldingSetTrait : public DefaultFoldingSetTrait {}; /// DefaultContextualFoldingSetTrait - Like DefaultFoldingSetTrait, but /// for ContextualFoldingSets. @@ -293,7 +295,9 @@ public: /// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef, /// used to lookup the node in the FoldingSetBase. - unsigned ComputeHash() const; + unsigned ComputeHash() const { + return static_cast(hash_combine_range(Data, Data + Size)); + } bool operator==(FoldingSetNodeIDRef) const; @@ -323,13 +327,33 @@ public: : Bits(Ref.getData(), Ref.getData() + Ref.getSize()) {} /// Add* - Add various data types to Bit data. - void AddPointer(const void *Ptr); - void AddInteger(signed I); - void AddInteger(unsigned I); - void AddInteger(long I); - void AddInteger(unsigned long I); - void AddInteger(long long I); - void AddInteger(unsigned long long I); + void AddPointer(const void *Ptr) { + // Note: this adds pointers to the hash using sizes and endianness that + // depend on the host. It doesn't matter, however, because hashing on + // pointer values is inherently unstable. Nothing should depend on the + // ordering of nodes in the folding set. + static_assert(sizeof(uintptr_t) <= sizeof(unsigned long long), + "unexpected pointer size"); + AddInteger(reinterpret_cast(Ptr)); + } + void AddInteger(signed I) { Bits.push_back(I); } + void AddInteger(unsigned I) { Bits.push_back(I); } + void AddInteger(long I) { AddInteger((unsigned long)I); } + void AddInteger(unsigned long I) { + if (sizeof(long) == sizeof(int)) + AddInteger(unsigned(I)); + else if (sizeof(long) == sizeof(long long)) { + AddInteger((unsigned long long)I); + } else { + llvm_unreachable("unexpected sizeof(long)"); + } + } + void AddInteger(long long I) { AddInteger((unsigned long long)I); } + void AddInteger(unsigned long long I) { + AddInteger(unsigned(I)); + AddInteger(unsigned(I >> 32)); + } + void AddBoolean(bool B) { AddInteger(B ? 
1U : 0U); } void AddString(StringRef String); void AddNodeID(const FoldingSetNodeID &ID); @@ -343,7 +367,9 @@ public: /// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used /// to lookup the node in the FoldingSetBase. - unsigned ComputeHash() const; + unsigned ComputeHash() const { + return FoldingSetNodeIDRef(Bits.data(), Bits.size()).ComputeHash(); + } /// operator== - Used to compare two nodes to each other. bool operator==(const FoldingSetNodeID &RHS) const; @@ -803,6 +829,13 @@ struct FoldingSetTrait> { } }; +template +struct FoldingSetTrait::value>> { + static void Profile(const T &X, FoldingSetNodeID &ID) { + ID.AddInteger(static_cast>(X)); + } +}; + } // end namespace llvm #endif // LLVM_ADT_FOLDINGSET_H diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h b/llvm/include/llvm/ADT/GenericCycleImpl.h index d443f9e21a47..ea2847f8c8ee 100644 --- a/llvm/include/llvm/ADT/GenericCycleImpl.h +++ b/llvm/include/llvm/ADT/GenericCycleImpl.h @@ -66,6 +66,44 @@ void GenericCycle::getExitBlocks( } } +template +auto GenericCycle::getCyclePreheader() const -> BlockT * { + BlockT *Predecessor = getCyclePredecessor(); + if (!Predecessor) + return nullptr; + + assert(isReducible() && "Cycle Predecessor must be in a reducible cycle!"); + + if (succ_size(Predecessor) != 1) + return nullptr; + + // Make sure we are allowed to hoist instructions into the predecessor. + if (!Predecessor->isLegalToHoistInto()) + return nullptr; + + return Predecessor; +} + +template +auto GenericCycle::getCyclePredecessor() const -> BlockT * { + if (!isReducible()) + return nullptr; + + BlockT *Out = nullptr; + + // Loop over the predecessors of the header node... + BlockT *Header = getHeader(); + for (const auto Pred : predecessors(Header)) { + if (!contains(Pred)) { + if (Out && Out != Pred) + return nullptr; + Out = Pred; + } + } + + return Out; +} + /// \brief Helper class for computing cycle information. template class GenericCycleInfoCompute { using BlockT = typename ContextT::BlockT; @@ -267,8 +305,8 @@ void GenericCycleInfoCompute::dfs(BlockT *EntryBlock) { DFSTreeStack.emplace_back(TraverseStack.size()); llvm::append_range(TraverseStack, successors(Block)); - LLVM_ATTRIBUTE_UNUSED bool Added = BlockDFSInfo.try_emplace(Block, ++Counter).second; + (void)Added; assert(Added); BlockPreorder.push_back(Block); LLVM_DEBUG(errs() << " preorder number: " << Counter << "\n"); @@ -326,6 +364,19 @@ auto GenericCycleInfo::getCycle(const BlockT *Block) const return nullptr; } +/// \brief get the depth for the cycle which containing a given block. +/// +/// \returns the depth for the innermost cycle containing \p Block or 0 if it is +/// not contained in any cycle. +template +unsigned GenericCycleInfo::getCycleDepth(const BlockT *Block) const { + CycleT *Cycle = getCycle(Block); + if (!Cycle) + return 0; + return Cycle->getDepth(); +} + +#ifndef NDEBUG /// \brief Validate the internal consistency of the cycle tree. /// /// Note that this does \em not check that cycles are really cycles in the CFG, @@ -391,6 +442,7 @@ bool GenericCycleInfo::validateTree() const { return true; } +#endif /// \brief Print the cycle info. 
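Before moving on, a minimal sketch of how the getCycleDepth query defined above might be consumed; the helper name and its parameters are invented for illustration only:

    #include "llvm/ADT/GenericCycleInfo.h"

    // Hypothetical helper: getCycleDepth returns 0 for blocks outside every
    // cycle and grows with nesting, so a depth above 1 means "nested cycle".
    template <typename ContextT>
    bool isInNestedCycle(const llvm::GenericCycleInfo<ContextT> &CI,
                         const typename ContextT::BlockT *BB) {
      return CI.getCycleDepth(BB) > 1;
    }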
template diff --git a/llvm/include/llvm/ADT/GenericCycleInfo.h b/llvm/include/llvm/ADT/GenericCycleInfo.h index d5f9cd9142ac..970664b85715 100644 --- a/llvm/include/llvm/ADT/GenericCycleInfo.h +++ b/llvm/include/llvm/ADT/GenericCycleInfo.h @@ -100,6 +100,10 @@ public: BlockT *getHeader() const { return Entries[0]; } + const SmallVectorImpl &getEntries() const { + return Entries; + } + /// \brief Return whether \p Block is an entry block of the cycle. bool isEntry(BlockT *Block) const { return is_contained(Entries, Block); } @@ -124,6 +128,16 @@ public: /// branched to. void getExitBlocks(SmallVectorImpl &TmpStorage) const; + /// Return the preheader block for this cycle. The pre-header is well-defined + /// for a reducible cycle, as described in docs/LoopTerminology.rst: it is the + /// unique entering block, and its only edge is to the entry block. Return + /// null for irreducible cycles. + BlockT *getCyclePreheader() const; + + /// If the cycle has exactly one entry with exactly one predecessor, return + /// it, otherwise return nullptr. + BlockT *getCyclePredecessor() const; + /// Iteration over child cycles. //@{ using const_child_iterator_base = @@ -178,6 +192,7 @@ public: iterator_range entries() const { return llvm::make_range(Entries.begin(), Entries.end()); } + //@} Printable printEntries(const ContextT &Ctx) const { return Printable([this, &Ctx](raw_ostream &Out) { @@ -238,6 +253,7 @@ public: const ContextT &getSSAContext() const { return Context; } CycleT *getCycle(const BlockT *Block) const; + unsigned getCycleDepth(const BlockT *Block) const; CycleT *getTopLevelParentCycle(const BlockT *Block) const; /// Move \p Child to \p NewParent by manipulating Children vectors. @@ -248,7 +264,9 @@ public: /// Methods for debug and self-test. //@{ +#ifndef NDEBUG bool validateTree() const; +#endif void print(raw_ostream &Out) const; void dump() const { print(dbgs()); } //@} diff --git a/llvm/include/llvm/ADT/IntervalMap.h b/llvm/include/llvm/ADT/IntervalMap.h index 368ed46f98d2..57f02df252c0 100644 --- a/llvm/include/llvm/ADT/IntervalMap.h +++ b/llvm/include/llvm/ADT/IntervalMap.h @@ -106,13 +106,10 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/bit.h" -#include "llvm/Support/AlignOf.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/RecyclingAllocator.h" #include #include -#include #include #include #include @@ -969,7 +966,10 @@ public: private: // The root data is either a RootLeaf or a RootBranchData instance. - AlignedCharArrayUnion data; + union { + RootLeaf leaf; + RootBranchData branchData; + }; // Tree height. // 0: Leaves in root. @@ -983,25 +983,22 @@ private: // Allocator used for creating external nodes. Allocator &allocator; - /// Represent data as a node type without breaking aliasing rules.
- template T &dataAs() const { return *bit_cast(&data); } - const RootLeaf &rootLeaf() const { assert(!branched() && "Cannot acces leaf data in branched root"); - return dataAs(); + return leaf; } RootLeaf &rootLeaf() { assert(!branched() && "Cannot acces leaf data in branched root"); - return dataAs(); + return leaf; } - RootBranchData &rootBranchData() const { + const RootBranchData &rootBranchData() const { assert(branched() && "Cannot access branch data in non-branched root"); - return dataAs(); + return branchData; } RootBranchData &rootBranchData() { assert(branched() && "Cannot access branch data in non-branched root"); - return dataAs(); + return branchData; } const RootBranch &rootBranch() const { return rootBranchData().node; } @@ -1042,11 +1039,20 @@ private: public: explicit IntervalMap(Allocator &a) : height(0), rootSize(0), allocator(a) { - assert((uintptr_t(&data) & (alignof(RootLeaf) - 1)) == 0 && - "Insufficient alignment"); new(&rootLeaf()) RootLeaf(); } + // The default copy/move constructors and assignment operators would perform + // a shallow copy, leading to an incorrect internal state. To prevent + // accidental use, explicitly delete these operators. + // If necessary, implement them to perform a deep copy. + IntervalMap(const IntervalMap &Other) = delete; + IntervalMap(IntervalMap &&Other) = delete; + // Note: these are already implicitly deleted, because RootLeaf (union + // member) has a non-trivial assignment operator (because of std::pair). + IntervalMap &operator=(const IntervalMap &Other) = delete; + IntervalMap &operator=(IntervalMap &&Other) = delete; + ~IntervalMap() { clear(); rootLeaf().~RootLeaf(); diff --git a/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h b/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h index 975535bb5676..e41eb0639ce3 100644 --- a/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -84,7 +84,7 @@ protected: #ifndef NDEBUG ~RefCountedBase() { assert(RefCount == 0 && - "Destruction occured when there are still references to this."); + "Destruction occurred when there are still references to this."); } #else // Default the destructor in release builds, A trivial destructor may enable @@ -115,7 +115,7 @@ protected: #ifndef NDEBUG ~ThreadSafeRefCountedBase() { assert(RefCount == 0 && - "Destruction occured when there are still references to this."); + "Destruction occurred when there are still references to this."); } #else // Default the destructor in release builds, A trivial destructor may enable diff --git a/llvm/include/llvm/ADT/Optional.h b/llvm/include/llvm/ADT/Optional.h index e047b0fc6514..d1615d903e98 100644 --- a/llvm/include/llvm/ADT/Optional.h +++ b/llvm/include/llvm/ADT/Optional.h @@ -60,85 +60,96 @@ template - constexpr explicit OptionalStorage(in_place_t, Args &&... 
args) - : value(std::forward(args)...), hasVal(true) {} + constexpr explicit OptionalStorage(in_place_t, Args &&...args) + : val(std::forward(args)...), hasVal(true) {} void reset() noexcept { if (hasVal) { - value.~T(); + val.~T(); hasVal = false; } } + constexpr bool has_value() const noexcept { return hasVal; } constexpr bool hasValue() const noexcept { return hasVal; } - T &getValue() LLVM_LVALUE_FUNCTION noexcept { + T &value() &noexcept { + assert(hasVal); + return val; + } + T &getValue() &noexcept { + assert(hasVal); + return val; + } + constexpr T const &value() const &noexcept { assert(hasVal); - return value; + return val; } - constexpr T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { + constexpr T const &getValue() const &noexcept { assert(hasVal); - return value; + return val; } -#if LLVM_HAS_RVALUE_REFERENCE_THIS - T &&getValue() && noexcept { + T &&value() &&noexcept { assert(hasVal); - return std::move(value); + return std::move(val); + } + T &&getValue() &&noexcept { + assert(hasVal); + return std::move(val); } -#endif - template void emplace(Args &&... args) { + template void emplace(Args &&...args) { reset(); - ::new ((void *)std::addressof(value)) T(std::forward(args)...); + ::new ((void *)std::addressof(val)) T(std::forward(args)...); hasVal = true; } OptionalStorage &operator=(T const &y) { - if (hasValue()) { - value = y; + if (has_value()) { + val = y; } else { - ::new ((void *)std::addressof(value)) T(y); + ::new ((void *)std::addressof(val)) T(y); hasVal = true; } return *this; } OptionalStorage &operator=(T &&y) { - if (hasValue()) { - value = std::move(y); + if (has_value()) { + val = std::move(y); } else { - ::new ((void *)std::addressof(value)) T(std::move(y)); + ::new ((void *)std::addressof(val)) T(std::move(y)); hasVal = true; } return *this; } OptionalStorage &operator=(OptionalStorage const &other) { - if (other.hasValue()) { - if (hasValue()) { - value = other.value; + if (other.has_value()) { + if (has_value()) { + val = other.val; } else { - ::new ((void *)std::addressof(value)) T(other.value); + ::new ((void *)std::addressof(val)) T(other.val); hasVal = true; } } else { @@ -148,11 +159,11 @@ public: } OptionalStorage &operator=(OptionalStorage &&other) { - if (other.hasValue()) { - if (hasValue()) { - value = std::move(other.value); + if (other.has_value()) { + if (has_value()) { + val = std::move(other.val); } else { - ::new ((void *)std::addressof(value)) T(std::move(other.value)); + ::new ((void *)std::addressof(val)) T(std::move(other.val)); hasVal = true; } } else { @@ -165,7 +176,7 @@ public: template class OptionalStorage { union { char empty; - T value; + T val; }; bool hasVal = false; @@ -181,53 +192,64 @@ public: OptionalStorage &operator=(OptionalStorage &&other) = default; template - constexpr explicit OptionalStorage(in_place_t, Args &&... 
args) - : value(std::forward(args)...), hasVal(true) {} + constexpr explicit OptionalStorage(in_place_t, Args &&...args) + : val(std::forward(args)...), hasVal(true) {} void reset() noexcept { if (hasVal) { - value.~T(); + val.~T(); hasVal = false; } } + constexpr bool has_value() const noexcept { return hasVal; } constexpr bool hasValue() const noexcept { return hasVal; } - T &getValue() LLVM_LVALUE_FUNCTION noexcept { + T &value() &noexcept { + assert(hasVal); + return val; + } + T &getValue() &noexcept { assert(hasVal); - return value; + return val; } - constexpr T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { + constexpr T const &value() const &noexcept { assert(hasVal); - return value; + return val; } -#if LLVM_HAS_RVALUE_REFERENCE_THIS - T &&getValue() && noexcept { + constexpr T const &getValue() const &noexcept { assert(hasVal); - return std::move(value); + return val; + } + T &&value() &&noexcept { + assert(hasVal); + return std::move(val); + } + T &&getValue() &&noexcept { + assert(hasVal); + return std::move(val); } -#endif - template void emplace(Args &&... args) { + template void emplace(Args &&...args) { reset(); - ::new ((void *)std::addressof(value)) T(std::forward(args)...); + ::new ((void *)std::addressof(val)) T(std::forward(args)...); hasVal = true; } OptionalStorage &operator=(T const &y) { - if (hasValue()) { - value = y; + if (has_value()) { + val = y; } else { - ::new ((void *)std::addressof(value)) T(y); + ::new ((void *)std::addressof(val)) T(y); hasVal = true; } return *this; } OptionalStorage &operator=(T &&y) { - if (hasValue()) { - value = std::move(y); + if (has_value()) { + val = std::move(y); } else { - ::new ((void *)std::addressof(value)) T(std::move(y)); + ::new ((void *)std::addressof(val)) T(std::move(y)); hasVal = true; } return *this; @@ -278,52 +300,55 @@ public: void reset() { Storage.reset(); } - constexpr const T *getPointer() const { return &Storage.getValue(); } - T *getPointer() { return &Storage.getValue(); } - constexpr const T &getValue() const LLVM_LVALUE_FUNCTION { - return Storage.getValue(); - } - T &getValue() LLVM_LVALUE_FUNCTION { return Storage.getValue(); } + constexpr const T *getPointer() const { return &Storage.value(); } + T *getPointer() { return &Storage.value(); } + constexpr const T &value() const & { return Storage.value(); } + constexpr const T &getValue() const & { return Storage.value(); } + T &value() & { return Storage.value(); } + T &getValue() & { return Storage.value(); } - constexpr explicit operator bool() const { return hasValue(); } - constexpr bool hasValue() const { return Storage.hasValue(); } + constexpr explicit operator bool() const { return has_value(); } + constexpr bool has_value() const { return Storage.has_value(); } + constexpr bool hasValue() const { return Storage.has_value(); } constexpr const T *operator->() const { return getPointer(); } T *operator->() { return getPointer(); } - constexpr const T &operator*() const LLVM_LVALUE_FUNCTION { - return getValue(); - } - T &operator*() LLVM_LVALUE_FUNCTION { return getValue(); } + constexpr const T &operator*() const & { return value(); } + T &operator*() & { return value(); } - template - constexpr T getValueOr(U &&value) const LLVM_LVALUE_FUNCTION { - return hasValue() ? getValue() : std::forward(value); + template constexpr T value_or(U &&alt) const & { + return has_value() ? value() : std::forward(alt); + } + template constexpr T getValueOr(U &&alt) const & { + return has_value() ? 
value() : std::forward(alt); } /// Apply a function to the value if present; otherwise return None. template - auto map(const Function &F) const LLVM_LVALUE_FUNCTION - -> Optional { - if (*this) return F(getValue()); + auto map(const Function &F) const & -> Optional { + if (*this) + return F(value()); return None; } -#if LLVM_HAS_RVALUE_REFERENCE_THIS - T &&getValue() && { return std::move(Storage.getValue()); } - T &&operator*() && { return std::move(Storage.getValue()); } + T &&value() && { return std::move(Storage.value()); } + T &&getValue() && { return std::move(Storage.value()); } + T &&operator*() && { return std::move(Storage.value()); } - template - T getValueOr(U &&value) && { - return hasValue() ? std::move(getValue()) : std::forward(value); + template T value_or(U &&alt) && { + return has_value() ? std::move(value()) : std::forward(alt); + } + template T getValueOr(U &&alt) && { + return has_value() ? std::move(value()) : std::forward(alt); } /// Apply a function to the value if present; otherwise return None. template - auto map(const Function &F) && - -> Optional { - if (*this) return F(std::move(*this).getValue()); + auto map(const Function &F) + && -> Optional { + if (*this) + return F(std::move(*this).value()); return None; } -#endif }; template llvm::hash_code hash_value(const Optional &O) { @@ -334,7 +359,7 @@ template constexpr bool operator==(const Optional &X, const Optional &Y) { if (X && Y) return *X == *Y; - return X.hasValue() == Y.hasValue(); + return X.has_value() == Y.has_value(); } template @@ -346,7 +371,7 @@ template constexpr bool operator<(const Optional &X, const Optional &Y) { if (X && Y) return *X < *Y; - return X.hasValue() < Y.hasValue(); + return X.has_value() < Y.has_value(); } template @@ -389,7 +414,7 @@ template constexpr bool operator<(const Optional &, NoneType) { } template constexpr bool operator<(NoneType, const Optional &X) { - return X.hasValue(); + return X.has_value(); } template diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h index b7ddf8855605..7d10b2a6dd14 100644 --- a/llvm/include/llvm/ADT/PointerIntPair.h +++ b/llvm/include/llvm/ADT/PointerIntPair.h @@ -61,19 +61,19 @@ public: IntType getInt() const { return (IntType)Info::getInt(Value); } - void setPointer(PointerTy PtrVal) LLVM_LVALUE_FUNCTION { + void setPointer(PointerTy PtrVal) & { Value = Info::updatePointer(Value, PtrVal); } - void setInt(IntType IntVal) LLVM_LVALUE_FUNCTION { + void setInt(IntType IntVal) & { Value = Info::updateInt(Value, static_cast(IntVal)); } - void initWithPointer(PointerTy PtrVal) LLVM_LVALUE_FUNCTION { + void initWithPointer(PointerTy PtrVal) & { Value = Info::updatePointer(0, PtrVal); } - void setPointerAndInt(PointerTy PtrVal, IntType IntVal) LLVM_LVALUE_FUNCTION { + void setPointerAndInt(PointerTy PtrVal, IntType IntVal) & { Value = Info::updateInt(Info::updatePointer(0, PtrVal), static_cast(IntVal)); } @@ -91,7 +91,7 @@ public: void *getOpaqueValue() const { return reinterpret_cast(Value); } - void setFromOpaqueValue(void *Val) LLVM_LVALUE_FUNCTION { + void setFromOpaqueValue(void *Val) & { Value = reinterpret_cast(Val); } diff --git a/llvm/include/llvm/ADT/PointerSumType.h b/llvm/include/llvm/ADT/PointerSumType.h index a7ef774e205e..57f045035a78 100644 --- a/llvm/include/llvm/ADT/PointerSumType.h +++ b/llvm/include/llvm/ADT/PointerSumType.h @@ -272,11 +272,12 @@ struct DenseMapInfo> { using SomePointerInfo = DenseMapInfo; static inline SumType getEmptyKey() { - return 
SumType::create(SomePointerInfo::getEmptyKey()); + return SumType::template create(SomePointerInfo::getEmptyKey()); } static inline SumType getTombstoneKey() { - return SumType::create(SomePointerInfo::getTombstoneKey()); + return SumType::template create( + SomePointerInfo::getTombstoneKey()); } static unsigned getHashValue(const SumType &Arg) { diff --git a/llvm/include/llvm/ADT/PointerUnion.h b/llvm/include/llvm/ADT/PointerUnion.h index 04d566bbc75e..f01db09dd765 100644 --- a/llvm/include/llvm/ADT/PointerUnion.h +++ b/llvm/include/llvm/ADT/PointerUnion.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/PointerLikeTypeTraits.h" #include #include @@ -87,6 +88,9 @@ namespace pointer_union_detail { }; } +// This is a forward declaration of CastInfoPointerUnionImpl +// Refer to its definition below for further details +template struct CastInfoPointerUnionImpl; /// A discriminated union of two or more pointer types, with the discriminator /// in the low bit of the pointer. /// @@ -122,6 +126,11 @@ class PointerUnion using First = TypeAtIndex<0, PTs...>; using Base = typename PointerUnion::PointerUnionMembers; + /// This is needed to give the CastInfo implementation below access + /// to protected members. + /// Refer to its definition for further details. + friend struct CastInfoPointerUnionImpl; + public: PointerUnion() = default; @@ -134,25 +143,24 @@ public: explicit operator bool() const { return !isNull(); } + // FIXME: Replace the uses of is(), get() and dyn_cast() with + // isa, cast and the llvm::dyn_cast + /// Test if the Union currently holds the type matching T. - template bool is() const { - return this->Val.getInt() == FirstIndexOfType::value; - } + template inline bool is() const { return isa(*this); } /// Returns the value of the specified pointer type. /// /// If the specified pointer type is incorrect, assert. - template T get() const { - assert(is() && "Invalid accessor called"); - return PointerLikeTypeTraits::getFromVoidPointer(this->Val.getPointer()); + template inline T get() const { + assert(isa(*this) && "Invalid accessor called"); + return cast(*this); } /// Returns the current pointer if it is of the specified pointer type, /// otherwise returns null. - template T dyn_cast() const { - if (is()) - return get(); - return T(); + template inline T dyn_cast() const { + return llvm::dyn_cast(*this); } /// If the union is set to the first pointer type get an address pointing to @@ -205,6 +213,52 @@ bool operator<(PointerUnion lhs, PointerUnion rhs) { return lhs.getOpaqueValue() < rhs.getOpaqueValue(); } +/// We can't (at least, at this moment with C++14) declare CastInfo +/// as a friend of PointerUnion like this: +/// ``` +/// template +/// friend struct CastInfo>; +/// ``` +/// The compiler complains 'Partial specialization cannot be declared as a +/// friend'. +/// So we define this struct to be a bridge between CastInfo and +/// PointerUnion. 
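A minimal usage sketch of the cast machinery this bridge enables; the member types int* and float* are assumed purely for illustration:

    #include "llvm/ADT/PointerUnion.h"
    using namespace llvm;

    void inspect(PointerUnion<int *, float *> PU) {
      if (isa<int *>(PU)) {          // what PU.is<int *>() now forwards to
        int *IP = cast<int *>(PU);   // what PU.get<int *>() now forwards to
        (void)IP;
      }
      // dyn_cast returns a default-constructed (null) pointer on mismatch.
      if (float *FP = dyn_cast<float *>(PU))
        (void)FP;
    }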
+template struct CastInfoPointerUnionImpl { + using From = PointerUnion; + + template static inline bool isPossible(From &F) { + return F.Val.getInt() == FirstIndexOfType::value; + } + + template static To doCast(From &F) { + assert(isPossible(F) && "cast to an incompatible type!"); + return PointerLikeTypeTraits::getFromVoidPointer(F.Val.getPointer()); + } +}; + +// Specialization of CastInfo for PointerUnion +template +struct CastInfo> + : public DefaultDoCastIfPossible, + CastInfo>> { + using From = PointerUnion; + using Impl = CastInfoPointerUnionImpl; + + static inline bool isPossible(From &f) { + return Impl::template isPossible(f); + } + + static To doCast(From &f) { return Impl::template doCast(f); } + + static inline To castFailed() { return To(); } +}; + +template +struct CastInfo> + : public ConstStrippingForwardingCast, + CastInfo>> { +}; + // Teach SmallPtrSet that PointerUnion is "basically a pointer", that has // # low bits available = min(PT1bits,PT2bits)-1. template diff --git a/llvm/include/llvm/ADT/SCCIterator.h b/llvm/include/llvm/ADT/SCCIterator.h index ad35e09f0f74..e4035a02b5f5 100644 --- a/llvm/include/llvm/ADT/SCCIterator.h +++ b/llvm/include/llvm/ADT/SCCIterator.h @@ -348,9 +348,14 @@ scc_member_iterator::scc_member_iterator( NodeInfoMap[Edge->Target].Visited = false; std::queue Queue; - for (auto &Node : NodeInfoMap) - if (Node.second.Visited) - Queue.push(Node.first); + // Initialize the queue with MST roots. Note that walking through SortedEdges + // instead of NodeInfoMap ensures an ordered deterministic push. + for (auto *Edge : SortedEdges) { + if (NodeInfoMap[Edge->Source].Visited) { + Queue.push(Edge->Source); + NodeInfoMap[Edge->Source].Visited = false; + } + } while (!Queue.empty()) { auto *Node = Queue.front(); diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index e2972f4f902a..0efa96e69a8c 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -129,7 +129,7 @@ struct function_traits { /// Overload for class function types. template struct function_traits - : function_traits {}; + : public function_traits {}; /// Overload for non-class function types. template struct function_traits { @@ -143,6 +143,9 @@ struct function_traits { template using arg_t = typename std::tuple_element>::type; }; +template +struct function_traits + : public function_traits {}; /// Overload for non-class function type references. template struct function_traits @@ -203,6 +206,17 @@ struct FirstIndexOfType : std::integral_constant {}; template using TypeAtIndex = std::tuple_element_t>; +/// Helper which adds two underlying types of enumeration type. +/// Implicit conversion to a common type is accepted. +template ::value, + std::underlying_type_t>, + typename UT2 = std::enable_if_t::value, + std::underlying_type_t>> +constexpr auto addEnumValues(EnumTy1 LHS, EnumTy2 RHS) { + return static_cast(LHS) + static_cast(RHS); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// @@ -268,6 +282,13 @@ template auto drop_begin(T &&RangeOrContainer, size_t N = 1) { adl_end(RangeOrContainer)); } +/// Return a range covering \p RangeOrContainer with the last N elements +/// excluded.
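For illustration, a small sketch of the drop_end helper documented above and defined immediately below; the container contents are assumed:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"

    void keepAllButLast(const llvm::SmallVectorImpl<int> &V) {
      // Precondition: V has at least N (here 1) elements, as with drop_begin.
      for (int X : llvm::drop_end(V)) {
        // Visits every element except the last one.
        (void)X;
      }
    }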
+template auto drop_end(T &&RangeOrContainer, size_t N = 1) { + return make_range(adl_begin(RangeOrContainer), + std::prev(adl_end(RangeOrContainer), N)); +} + // mapped_iterator - This is a simple iterator adapter that causes a function to // be applied whenever operator* is invoked on the iterator. @@ -423,6 +444,16 @@ public: findNextValid(); return *this; } + + decltype(auto) operator*() const { + assert(BaseT::wrapped() != End && "Cannot dereference end iterator!"); + return BaseT::operator*(); + } + + decltype(auto) operator->() const { + assert(BaseT::wrapped() != End && "Cannot dereference end iterator!"); + return BaseT::operator->(); + } }; /// Specialization of filter_iterator_base for forward iteration only. @@ -1160,13 +1191,15 @@ public: } /// Compare this range with another. - template bool operator==(const OtherT &other) const { - return size() == - static_cast(std::distance(other.begin(), other.end())) && - std::equal(begin(), end(), other.begin()); + template + friend bool operator==(const indexed_accessor_range_base &lhs, + const OtherT &rhs) { + return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); } - template bool operator!=(const OtherT &other) const { - return !(*this == other); + template + friend bool operator!=(const indexed_accessor_range_base &lhs, + const OtherT &rhs) { + return !(lhs == rhs); } /// Return the size of this range. @@ -1650,6 +1683,15 @@ bool is_contained(R &&Range, const E &Element) { return std::find(adl_begin(Range), adl_end(Range), Element) != adl_end(Range); } +template +constexpr bool is_contained(std::initializer_list Set, T Value) { + // TODO: Use std::find when we switch to C++20. + for (T V : Set) + if (V == Value) + return true; + return false; +} + /// Wrapper function around std::is_sorted to check if elements in a range \p R /// are sorted with respect to a comparator \p C. template bool is_sorted(R &&Range, Compare C) { diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index a4a790323a6b..e34702bdbb3c 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// /// -/// /file +/// \file /// This file defines the SmallVector class. /// //===----------------------------------------------------------------------===// @@ -949,6 +949,9 @@ public: return std::lexicographical_compare(this->begin(), this->end(), RHS.begin(), RHS.end()); } + bool operator>(const SmallVectorImpl &RHS) const { return RHS < *this; } + bool operator<=(const SmallVectorImpl &RHS) const { return !(*this > RHS); } + bool operator>=(const SmallVectorImpl &RHS) const { return !(*this < RHS); } }; template diff --git a/llvm/include/llvm/ADT/Statistic.h b/llvm/include/llvm/ADT/Statistic.h index c39e161bcbcd..6c195cc44990 100644 --- a/llvm/include/llvm/ADT/Statistic.h +++ b/llvm/include/llvm/ADT/Statistic.h @@ -53,7 +53,7 @@ public: const char *const Name; const char *const Desc; - std::atomic Value; + std::atomic Value; std::atomic Initialized; constexpr TrackingStatistic(const char *DebugType, const char *Name, @@ -65,12 +65,12 @@ public: const char *getName() const { return Name; } const char *getDesc() const { return Desc; } - unsigned getValue() const { return Value.load(std::memory_order_relaxed); } + uint64_t getValue() const { return Value.load(std::memory_order_relaxed); } // Allow use of this class as the value itself. 
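Now that the counters are 64-bit, "use of this class as the value itself" lets a statistic stand in for a uint64_t without truncation; a small sketch (the statistic name and debug type are assumed):

    #include "llvm/ADT/Statistic.h"

    #define DEBUG_TYPE "example"
    STATISTIC(NumBytes, "Number of bytes processed");

    void tally(uint64_t N) {
      NumBytes += N;                 // relaxed atomic fetch_add underneath
      uint64_t Current = NumBytes;   // implicit conversion via operator uint64_t
      (void)Current;
    }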
- operator unsigned() const { return getValue(); } + operator uint64_t() const { return getValue(); } - const TrackingStatistic &operator=(unsigned Val) { + const TrackingStatistic &operator=(uint64_t Val) { Value.store(Val, std::memory_order_relaxed); return init(); } @@ -80,7 +80,7 @@ public: return init(); } - unsigned operator++(int) { + uint64_t operator++(int) { init(); return Value.fetch_add(1, std::memory_order_relaxed); } @@ -90,27 +90,27 @@ public: return init(); } - unsigned operator--(int) { + uint64_t operator--(int) { init(); return Value.fetch_sub(1, std::memory_order_relaxed); } - const TrackingStatistic &operator+=(unsigned V) { + const TrackingStatistic &operator+=(uint64_t V) { if (V == 0) return *this; Value.fetch_add(V, std::memory_order_relaxed); return init(); } - const TrackingStatistic &operator-=(unsigned V) { + const TrackingStatistic &operator-=(uint64_t V) { if (V == 0) return *this; Value.fetch_sub(V, std::memory_order_relaxed); return init(); } - void updateMax(unsigned V) { - unsigned PrevMax = Value.load(std::memory_order_relaxed); + void updateMax(uint64_t V) { + uint64_t PrevMax = Value.load(std::memory_order_relaxed); // Keep trying to update max until we succeed or another thread produces // a bigger max than us. while (V > PrevMax && !Value.compare_exchange_weak( @@ -134,26 +134,26 @@ public: NoopStatistic(const char * /*DebugType*/, const char * /*Name*/, const char * /*Desc*/) {} - unsigned getValue() const { return 0; } + uint64_t getValue() const { return 0; } // Allow use of this class as the value itself. - operator unsigned() const { return 0; } + operator uint64_t() const { return 0; } - const NoopStatistic &operator=(unsigned Val) { return *this; } + const NoopStatistic &operator=(uint64_t Val) { return *this; } const NoopStatistic &operator++() { return *this; } - unsigned operator++(int) { return 0; } + uint64_t operator++(int) { return 0; } const NoopStatistic &operator--() { return *this; } - unsigned operator--(int) { return 0; } + uint64_t operator--(int) { return 0; } - const NoopStatistic &operator+=(const unsigned &V) { return *this; } + const NoopStatistic &operator+=(const uint64_t &V) { return *this; } - const NoopStatistic &operator-=(const unsigned &V) { return *this; } + const NoopStatistic &operator-=(const uint64_t &V) { return *this; } - void updateMax(unsigned V) {} + void updateMax(uint64_t V) {} }; #if LLVM_ENABLE_STATS @@ -200,7 +200,7 @@ void PrintStatisticsJSON(raw_ostream &OS); /// during it's execution. It will return the value at the point that it is /// read. However, it will prevent new statistics from registering until it /// completes. -const std::vector> GetStatistics(); +const std::vector> GetStatistics(); /// Reset the statistics. This can be used to zero and de-register the /// statistics in order to measure a compilation. diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 118def2f43e1..80ba47dd619c 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -240,6 +240,10 @@ namespace llvm { unsigned edit_distance(StringRef Other, bool AllowReplacements = true, unsigned MaxEditDistance = 0) const; + LLVM_NODISCARD unsigned + edit_distance_insensitive(StringRef Other, bool AllowReplacements = true, + unsigned MaxEditDistance = 0) const; + /// str - Get the contents as an std::string. 
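A brief sketch of the case-insensitive variant declared above; the strings and the distance threshold are assumed:

    #include "llvm/ADT/StringRef.h"

    bool isCloseSpelling(llvm::StringRef Candidate, llvm::StringRef Input) {
      // "BitCast" vs. "bitcast" yields 0 here, since only the case differs.
      return Candidate.edit_distance_insensitive(
                 Input, /*AllowReplacements=*/true,
                 /*MaxEditDistance=*/2) <= 2;
    }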
LLVM_NODISCARD std::string str() const { diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 42277c013035..9d85a28fbf04 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -56,7 +56,10 @@ public: bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) csky, // CSKY: csky + dxil, // DXIL 32-bit DirectX bytecode hexagon, // Hexagon: hexagon + loongarch32, // LoongArch (32-bit): loongarch32 + loongarch64, // LoongArch (64-bit): loongarch64 m68k, // M68k: Motorola 680x0 family mips, // MIPS: mips, mipsallegrex, mipsr6 mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el @@ -146,7 +149,15 @@ public: MipsSubArch_r6, - PPCSubArch_spe + PPCSubArch_spe, + + // SPIR-V sub-arch corresponds to its version. + SPIRVSubArch_v10, + SPIRVSubArch_v11, + SPIRVSubArch_v12, + SPIRVSubArch_v13, + SPIRVSubArch_v14, + SPIRVSubArch_v15, }; enum VendorType { UnknownVendor, @@ -195,9 +206,11 @@ public: NVCL, // NVIDIA OpenCL AMDHSA, // AMD HSA Runtime PS4, + PS5, ELFIAMCU, TvOS, // Apple tvOS WatchOS, // Apple watchOS + DriverKit, // Apple DriverKit Mesa3D, Contiki, AMDPAL, // AMD PAL Runtime @@ -205,7 +218,8 @@ public: Hurd, // GNU/Hurd WASI, // Experimental WebAssembly OS Emscripten, - LastOSType = Emscripten + ShaderModel, // DirectX ShaderModel + LastOSType = ShaderModel }; enum EnvironmentType { UnknownEnvironment, @@ -232,15 +246,35 @@ public: CoreCLR, Simulator, // Simulator variants of other systems, e.g., Apple's iOS MacABI, // Mac Catalyst variant of Apple's iOS deployment target. - LastEnvironmentType = MacABI + + // Shader Stages + Pixel, + Vertex, + Geometry, + Hull, + Domain, + Compute, + Library, + RayGeneration, + Intersection, + AnyHit, + ClosestHit, + Miss, + Callable, + Mesh, + Amplification, + + LastEnvironmentType = Amplification }; enum ObjectFormatType { UnknownObjectFormat, COFF, + DXContainer, ELF, GOFF, MachO, + SPIRV, Wasm, XCOFF, }; @@ -360,6 +394,9 @@ public: /// with WatchOS or generic triples. VersionTuple getWatchOSVersion() const; + /// Parse the version number as with getOSVersion. + VersionTuple getDriverKitVersion() const; + /// @} /// @name Direct Component Access /// @{ @@ -462,11 +499,14 @@ public: return getSubArch() == Triple::ARMSubArch_v7k; } + /// Is this an Apple DriverKit triple. + bool isDriverKit() const { return getOS() == Triple::DriverKit; } + bool isOSzOS() const { return getOS() == Triple::ZOS; } - /// Is this a "Darwin" OS (macOS, iOS, tvOS or watchOS). + /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, or DriverKit). bool isOSDarwin() const { - return isMacOSX() || isiOS() || isWatchOS(); + return isMacOSX() || isiOS() || isWatchOS() || isDriverKit(); } bool isSimulatorEnvironment() const { @@ -640,19 +680,23 @@ public: return getObjectFormat() == Triple::XCOFF; } - /// Tests whether the target is the PS4 CPU - bool isPS4CPU() const { + /// Tests whether the target is the PS4 platform. + bool isPS4() const { return getArch() == Triple::x86_64 && getVendor() == Triple::SCEI && getOS() == Triple::PS4; } - /// Tests whether the target is the PS4 platform - bool isPS4() const { - return getVendor() == Triple::SCEI && - getOS() == Triple::PS4; + /// Tests whether the target is the PS5 platform. + bool isPS5() const { + return getArch() == Triple::x86_64 && + getVendor() == Triple::SCEI && + getOS() == Triple::PS5; } + /// Tests whether the target is the PS4 or PS5 platform. 
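A quick sketch of the reworked PlayStation predicates; the triple spelling is assumed, and isPS() is declared just below:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    void checkPS() {
      llvm::Triple T("x86_64-scei-ps5");
      assert(T.isPS5() && T.isPS() && !T.isPS4());
      // Per the hunk below, PS targets keep dllimport/export support.
      assert(T.hasDLLImportExport());
    }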
+ bool isPS() const { return isPS4() || isPS5(); } + /// Tests whether the target is Android bool isAndroid() const { return getEnvironment() == Triple::Android; } @@ -676,6 +720,11 @@ public: getEnvironment() == Triple::MuslX32; } + /// Tests whether the target is DXIL. + bool isDXIL() const { + return getArch() == Triple::dxil; + } + /// Tests whether the target is SPIR (32- or 64-bit). bool isSPIR() const { return getArch() == Triple::spir || getArch() == Triple::spir64; @@ -774,6 +823,11 @@ public: : PointerWidth == 64; } + /// Tests whether the target is LoongArch (32- and 64-bit). + bool isLoongArch() const { + return getArch() == Triple::loongarch32 || getArch() == Triple::loongarch64; + } + /// Tests whether the target is MIPS 32-bit (little and big endian). bool isMIPS32() const { return getArch() == Triple::mips || getArch() == Triple::mipsel; @@ -810,6 +864,17 @@ public: return getArch() == Triple::riscv32 || getArch() == Triple::riscv64; } + /// Tests whether the target is 32-bit SPARC (little and big endian). + bool isSPARC32() const { + return getArch() == Triple::sparc || getArch() == Triple::sparcel; + } + + /// Tests whether the target is 64-bit SPARC (big endian). + bool isSPARC64() const { return getArch() == Triple::sparcv9; } + + /// Tests whether the target is SPARC. + bool isSPARC() const { return isSPARC32() || isSPARC64(); } + /// Tests whether the target is SystemZ. bool isSystemZ() const { return getArch() == Triple::systemz; @@ -863,7 +928,7 @@ public: } /// Tests if the environment supports dllimport/export annotations. - bool hasDLLImportExport() const { return isOSWindows() || isPS4CPU(); } + bool hasDLLImportExport() const { return isOSWindows() || isPS(); } /// @} /// @name Mutators @@ -971,7 +1036,7 @@ public: /// Get the "prefix" canonical name for the \p Kind architecture. This is the /// prefix used by the architecture specific builtins, and is suitable for - /// passing to \see Intrinsic::getIntrinsicForGCCBuiltin(). + /// passing to \see Intrinsic::getIntrinsicForClangBuiltin(). /// /// \return - The architecture prefix, or 0 if none is defined. static StringRef getArchTypePrefix(ArchType Kind); diff --git a/llvm/include/llvm/ADT/edit_distance.h b/llvm/include/llvm/ADT/edit_distance.h index c480c1e7cd78..6df3db6125d4 100644 --- a/llvm/include/llvm/ADT/edit_distance.h +++ b/llvm/include/llvm/ADT/edit_distance.h @@ -28,6 +28,9 @@ namespace llvm { /// /// \param ToArray the second sequence to compare. /// +/// \param Map A Functor to apply to each item of the sequences before +/// comparison. +/// /// \param AllowReplacements whether to allow element replacements (change one /// element into another) as a single operation, rather than as two operations /// (an insertion and a removal). @@ -39,10 +42,10 @@ namespace llvm { /// \returns the minimum number of element insertions, removals, or (if /// \p AllowReplacements is \c true) replacements needed to transform one of /// the given sequences into the other. If zero, the sequences are identical. 
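The hunk below generalizes this routine with a mapping functor; a sketch of a case-folding use (the helper name is assumed), which is essentially what StringRef::edit_distance_insensitive builds on:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringExtras.h" // llvm::toLower
    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/edit_distance.h"

    unsigned caseInsensitiveDistance(llvm::StringRef A, llvm::StringRef B) {
      // The mapper is applied to each element before comparison.
      return llvm::ComputeMappedEditDistance(
          llvm::makeArrayRef(A.data(), A.size()),
          llvm::makeArrayRef(B.data(), B.size()),
          [](char C) { return llvm::toLower(C); });
    }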
-template -unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, - bool AllowReplacements = true, - unsigned MaxEditDistance = 0) { +template +unsigned ComputeMappedEditDistance(ArrayRef FromArray, ArrayRef ToArray, + Functor Map, bool AllowReplacements = true, + unsigned MaxEditDistance = 0) { // The algorithm implemented below is the "classic" // dynamic-programming algorithm for computing the Levenshtein // distance, which is described here: @@ -58,6 +61,15 @@ unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, typename ArrayRef::size_type m = FromArray.size(); typename ArrayRef::size_type n = ToArray.size(); + if (MaxEditDistance) { + // If the difference in size between the 2 arrays is larger than the max + // distance allowed, we can bail out as we will always need at least + // MaxEditDistance insertions or removals. + typename ArrayRef::size_type AbsDiff = m > n ? m - n : n - m; + if (AbsDiff > MaxEditDistance) + return MaxEditDistance + 1; + } + const unsigned SmallBufferSize = 64; unsigned SmallBuffer[SmallBufferSize]; std::unique_ptr Allocated; @@ -75,15 +87,16 @@ unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, unsigned BestThisRow = Row[0]; unsigned Previous = y - 1; + const auto &CurItem = Map(FromArray[y - 1]); for (typename ArrayRef::size_type x = 1; x <= n; ++x) { int OldRow = Row[x]; if (AllowReplacements) { - Row[x] = std::min( - Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), - std::min(Row[x-1], Row[x])+1); + Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), + std::min(Row[x - 1], Row[x]) + 1); } else { - if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous; + if (CurItem == Map(ToArray[x - 1])) + Row[x] = Previous; else Row[x] = std::min(Row[x-1], Row[x]) + 1; } Previous = OldRow; @@ -98,6 +111,15 @@ unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, return Result; } +template +unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, + bool AllowReplacements = true, + unsigned MaxEditDistance = 0) { + return ComputeMappedEditDistance( + FromArray, ToArray, [](const T &X) -> const T & { return X; }, + AllowReplacements, MaxEditDistance); +} + } // End llvm namespace #endif diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index d4febe6c1db9..c065553db8e9 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -38,7 +38,6 @@ #define LLVM_ANALYSIS_ALIASANALYSIS_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/MemoryLocation.h" @@ -64,6 +63,7 @@ class LoopInfo; class PreservedAnalyses; class TargetLibraryInfo; class Value; +template class SmallPtrSetImpl; /// The possible results of an alias query. /// @@ -413,8 +413,12 @@ class EarliestEscapeInfo final : public CaptureInfo { /// This is used for cache invalidation purposes. DenseMap> Inst2Obj; + const SmallPtrSetImpl &EphValues; + public: - EarliestEscapeInfo(DominatorTree &DT, const LoopInfo &LI) : DT(DT), LI(LI) {} + EarliestEscapeInfo(DominatorTree &DT, const LoopInfo &LI, + const SmallPtrSetImpl &EphValues) + : DT(DT), LI(LI), EphValues(EphValues) {} bool isNotCapturedBeforeOrAt(const Value *Object, const Instruction *I) override; @@ -1267,6 +1271,10 @@ bool isIdentifiedObject(const Value *V); /// IdentifiedObjects. 
bool isIdentifiedFunctionLocal(const Value *V); +/// Returns true if the pointer is one which would have been considered an +/// escape by isNonEscapingLocalObject. +bool isEscapeSource(const Value *V); + /// Return true if Object memory is not visible after an unwind, in the sense /// that program semantics cannot depend on Object containing any particular /// value on unwind. If the RequiresNoCaptureBeforeUnwind out parameter is set diff --git a/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h b/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h index 2dd2e7ca916d..48181cc52626 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h +++ b/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h @@ -24,12 +24,12 @@ #ifndef LLVM_ANALYSIS_ALIASANALYSISEVALUATOR_H #define LLVM_ANALYSIS_ALIASANALYSISEVALUATOR_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { class AAResults; +class Function; +class FunctionPass; class AAEvaluator : public PassInfoMixin { int64_t FunctionCount = 0; diff --git a/llvm/include/llvm/Analysis/AliasSetTracker.h b/llvm/include/llvm/Analysis/AliasSetTracker.h index b66ff395454d..78f5545ab215 100644 --- a/llvm/include/llvm/Analysis/AliasSetTracker.h +++ b/llvm/include/llvm/Analysis/AliasSetTracker.h @@ -22,13 +22,10 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Support/Casting.h" #include #include -#include #include #include @@ -224,10 +221,6 @@ public: // track of the list's exact size. unsigned size() { return SetSize; } - /// If this alias set is known to contain a single instruction and *only* a - /// single unique instruction, return it. Otherwise, return nullptr. - Instruction* getUniqueInstruction(); - void print(raw_ostream &OS) const; void dump() const; diff --git a/llvm/include/llvm/Analysis/AssumeBundleQueries.h b/llvm/include/llvm/Analysis/AssumeBundleQueries.h index 77da19110246..785980130386 100644 --- a/llvm/include/llvm/Analysis/AssumeBundleQueries.h +++ b/llvm/include/llvm/Analysis/AssumeBundleQueries.h @@ -14,14 +14,14 @@ #ifndef LLVM_ANALYSIS_ASSUMEBUNDLEQUERIES_H #define LLVM_ANALYSIS_ASSUMEBUNDLEQUERIES_H -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/IR/IntrinsicInst.h" namespace llvm { class AssumptionCache; class DominatorTree; +class Instruction; +class Value; /// Index of elements in the operand bundle. 
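Stepping back to the AliasAnalysis.h change above: a construction sketch for the extended EarliestEscapeInfo; the surrounding analyses and the use of CodeMetrics::collectEphemeralValues are assumed:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/CodeMetrics.h"

    llvm::EarliestEscapeInfo
    makeEscapeInfo(llvm::Function &F, llvm::AssumptionCache &AC,
                   llvm::DominatorTree &DT, const llvm::LoopInfo &LI,
                   llvm::SmallPtrSetImpl<const llvm::Value *> &EphValues) {
      // Values that only feed llvm.assume cannot cause a pointer to escape.
      llvm::CodeMetrics::collectEphemeralValues(&F, &AC, EphValues);
      // EphValues is held by reference and must outlive the returned object.
      return llvm::EarliestEscapeInfo(DT, LI, EphValues);
    }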
/// If the element exist it is guaranteed to be what is specified in this enum diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index 97dda58109e9..46f14a21a9ff 100644 --- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -18,8 +18,6 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" -#include -#include #include #include diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 858dd369dd0b..d8e524d7cb80 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -20,6 +20,7 @@ #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Twine.h" @@ -31,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/ScaledNumber.h" #include "llvm/Support/raw_ostream.h" @@ -45,7 +45,6 @@ #include #include #include -#include #include #include @@ -1300,7 +1299,7 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { auto &HeaderNode = Loop.Nodes[H]; assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() && "Shouldn't have a weight metadata"); - uint64_t MinWeight = MinHeaderWeight.getValue(); + uint64_t MinWeight = *MinHeaderWeight; LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to " << getBlockName(HeaderNode) << "\n"); if (MinWeight) @@ -1516,7 +1515,7 @@ void BlockFrequencyInfoImpl::findReachableBlocks( // Find all blocks to apply inference on, that is, reachable from the entry // along edges with non-zero probablities std::queue Queue; - std::unordered_set Reachable; + SmallPtrSet Reachable; const BlockT *Entry = &F->front(); Queue.push(Entry); Reachable.insert(Entry); @@ -1527,16 +1526,14 @@ void BlockFrequencyInfoImpl::findReachableBlocks( auto EP = BPI->getEdgeProbability(SrcBB, DstBB); if (EP.isZero()) continue; - if (Reachable.find(DstBB) == Reachable.end()) { + if (Reachable.insert(DstBB).second) Queue.push(DstBB); - Reachable.insert(DstBB); - } } } // Find all blocks to apply inference on, that is, backward reachable from // the entry along (backward) edges with non-zero probablities - std::unordered_set InverseReachable; + SmallPtrSet InverseReachable; for (const BlockT &BB : *F) { // An exit block is a block without any successors bool HasSucc = GraphTraits::child_begin(&BB) != @@ -1553,10 +1550,8 @@ void BlockFrequencyInfoImpl::findReachableBlocks( auto EP = BPI->getEdgeProbability(DstBB, SrcBB); if (EP.isZero()) continue; - if (InverseReachable.find(DstBB) == InverseReachable.end()) { + if (InverseReachable.insert(DstBB).second) Queue.push(DstBB); - InverseReachable.insert(DstBB); - } } } @@ -1581,15 +1576,14 @@ void BlockFrequencyInfoImpl::initTransitionProbabilities( // Find unique successors and corresponding probabilities for every block for (size_t Src = 0; Src < NumBlocks; Src++) { const BlockT *BB = Blocks[Src]; - std::unordered_set UniqueSuccs; + SmallPtrSet UniqueSuccs; for (const auto SI : children(BB)) { // Ignore cold blocks if (BlockIndex.find(SI) == BlockIndex.end()) continue; // Ignore parallel edges between BB and SI blocks - 
if (UniqueSuccs.find(SI) != UniqueSuccs.end()) + if (!UniqueSuccs.insert(SI).second) continue; - UniqueSuccs.insert(SI); // Ignore jumps with zero probability auto EP = BPI->getEdgeProbability(BB, SI); if (EP.isZero()) @@ -1875,7 +1869,7 @@ struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits { case GVDT_Count: { auto Count = Graph->getBlockProfileCount(Node); if (Count) - OS << Count.getValue(); + OS << *Count; else OS << "Unknown"; break; diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index e2099eba0f65..28418198acea 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -16,14 +16,12 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" -#include "llvm/Support/Casting.h" #include #include #include diff --git a/llvm/include/llvm/Analysis/CFGPrinter.h b/llvm/include/llvm/Analysis/CFGPrinter.h index c0cabceb4a54..768cda59c57d 100644 --- a/llvm/include/llvm/Analysis/CFGPrinter.h +++ b/llvm/include/llvm/Analysis/CFGPrinter.h @@ -18,7 +18,6 @@ #ifndef LLVM_ANALYSIS_CFGPRINTER_H #define LLVM_ANALYSIS_CFGPRINTER_H -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/HeatUtils.h" @@ -27,10 +26,11 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/GraphWriter.h" namespace llvm { +template struct GraphTraits; class CFGViewerPass : public PassInfoMixin { public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); diff --git a/llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h b/llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h index 2eae2824bec3..6543c53c9b28 100644 --- a/llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h +++ b/llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h @@ -14,10 +14,12 @@ #ifndef LLVM_ANALYSIS_CFLALIASANALYSISUTILS_H #define LLVM_ANALYSIS_CFLALIASANALYSISUTILS_H +#include "llvm/IR/Argument.h" #include "llvm/IR/Function.h" #include "llvm/IR/ValueHandle.h" namespace llvm { + namespace cflaa { template struct FunctionHandle final : public CallbackVH { diff --git a/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h b/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h index 5f5e52af3d88..dfb363173187 100644 --- a/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h @@ -15,7 +15,6 @@ #define LLVM_ANALYSIS_CFLANDERSALIASANALYSIS_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFLAliasAnalysisUtils.h" #include "llvm/IR/PassManager.h" @@ -25,6 +24,7 @@ namespace llvm { +template class Optional; class Function; class MemoryLocation; class TargetLibraryInfo; diff --git a/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h b/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h index ec05b3706ca3..865f4a54c094 100644 --- a/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h @@ -15,13 +15,11 @@ #define 
LLVM_ANALYSIS_CFLSTEENSALIASANALYSIS_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFLAliasAnalysisUtils.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" -#include "llvm/Support/Casting.h" #include #include diff --git a/llvm/include/llvm/Analysis/CGSCCPassManager.h b/llvm/include/llvm/Analysis/CGSCCPassManager.h index 7cf172dc1dd1..9d1b331346b6 100644 --- a/llvm/include/llvm/Analysis/CGSCCPassManager.h +++ b/llvm/include/llvm/Analysis/CGSCCPassManager.h @@ -88,27 +88,21 @@ #ifndef LLVM_ANALYSIS_CGSCCPASSMANAGER_H #define LLVM_ANALYSIS_CGSCCPASSMANAGER_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PriorityWorklist.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LazyCallGraph.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #include #include namespace llvm { +class Function; +class Value; +template class SmallPriorityWorklist; struct CGSCCUpdateResult; + class Module; // Allow debug logging in this inline function. @@ -278,16 +272,6 @@ struct CGSCCUpdateResult { /// the list and removing entries from it. SmallPtrSetImpl &InvalidatedSCCs; - /// If non-null, the updated current \c RefSCC being processed. - /// - /// This is set when a graph refinement takes place and the "current" point - /// in the graph moves "down" or earlier in the post-order walk. This will - /// often cause the "current" RefSCC to be a newly created RefSCC object and - /// the old one to be added to the above worklist. When that happens, this - /// pointer is non-null and can be used to continue processing the "top" of - /// the post-order walk. - LazyCallGraph::RefSCC *UpdatedRC; - /// If non-null, the updated current \c SCC being processed. 
/// /// This is set when a graph refinement takes place and the "current" point diff --git a/llvm/include/llvm/Analysis/CallGraph.h b/llvm/include/llvm/Analysis/CallGraph.h index 4da448c9900b..88d56785de67 100644 --- a/llvm/include/llvm/Analysis/CallGraph.h +++ b/llvm/include/llvm/Analysis/CallGraph.h @@ -45,9 +45,6 @@ #ifndef LLVM_ANALYSIS_CALLGRAPH_H #define LLVM_ANALYSIS_CALLGRAPH_H -#include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PassManager.h" @@ -61,7 +58,9 @@ namespace llvm { +template struct GraphTraits; class CallGraphNode; +class Function; class Module; class raw_ostream; diff --git a/llvm/include/llvm/Analysis/CallPrinter.h b/llvm/include/llvm/Analysis/CallPrinter.h index 8d4159f3ddc0..d325d0010371 100644 --- a/llvm/include/llvm/Analysis/CallPrinter.h +++ b/llvm/include/llvm/Analysis/CallPrinter.h @@ -14,10 +14,24 @@ #ifndef LLVM_ANALYSIS_CALLPRINTER_H #define LLVM_ANALYSIS_CALLPRINTER_H +#include "llvm/IR/PassManager.h" + namespace llvm { class ModulePass; +/// Pass for printing the call graph to a dot file +class CallGraphDOTPrinterPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +/// Pass for viewing the call graph +class CallGraphViewerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + ModulePass *createCallGraphViewerPass(); ModulePass *createCallGraphDOTPrinterPass(); diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h index 50d12db7a1c3..a2d9277745e4 100644 --- a/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/llvm/include/llvm/Analysis/CaptureTracking.h @@ -14,6 +14,7 @@ #define LLVM_ANALYSIS_CAPTURETRACKING_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLFunctionalExtras.h" namespace llvm { @@ -24,6 +25,7 @@ namespace llvm { class DominatorTree; class LoopInfo; class Function; + template class SmallPtrSetImpl; /// getDefaultMaxUsesToExploreForCaptureTracking - Return default value of /// the maximal number of uses to explore before giving up. It is used by @@ -40,8 +42,14 @@ namespace llvm { /// MaxUsesToExplore specifies how many uses the analysis should explore for /// one value before giving up due too "too many uses". If MaxUsesToExplore /// is zero, a default value is assumed. + bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, + bool StoreCaptures, unsigned MaxUsesToExplore = 0); + + /// Variant of the above function which accepts a set of Values that are + /// ephemeral and cannot cause pointers to escape. bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, bool StoreCaptures, + const SmallPtrSetImpl &EphValues, unsigned MaxUsesToExplore = 0); /// PointerMayBeCapturedBefore - Return true if this pointer value may be @@ -72,10 +80,11 @@ namespace llvm { // nullptr is returned. Note that the caller of the function has to ensure // that the instruction the result value is compared against is not in a // cycle. 
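A short sketch of the ephemeral-value-aware PointerMayBeCaptured overload declared earlier in this hunk (the call site setup is assumed):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/CaptureTracking.h"

    bool mayBeCapturedIgnoringAssumes(
        const llvm::Value *Ptr,
        const llvm::SmallPtrSetImpl<const llvm::Value *> &EphValues) {
      // Uses of Ptr that appear in EphValues (e.g. llvm.assume operand
      // chains) are skipped while searching for captures.
      return llvm::PointerMayBeCaptured(Ptr, /*ReturnCaptures=*/true,
                                        /*StoreCaptures=*/true, EphValues);
    }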
- Instruction *FindEarliestCapture(const Value *V, Function &F, - bool ReturnCaptures, bool StoreCaptures, - const DominatorTree &DT, - unsigned MaxUsesToExplore = 0); + Instruction * + FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures, + bool StoreCaptures, const DominatorTree &DT, + const SmallPtrSetImpl &EphValues, + unsigned MaxUsesToExplore = 0); /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters @@ -105,6 +114,24 @@ namespace llvm { virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; + /// Types of use capture kinds, see \p DetermineUseCaptureKind. + enum class UseCaptureKind { + NO_CAPTURE, + MAY_CAPTURE, + PASSTHROUGH, + }; + + /// Determine what kind of capture behaviour \p U may exhibit. + /// + /// A use can be no-capture, a use can potentially capture, or a use can be + /// passthrough such that the uses of the user or \p U should be inspected. + /// The \p IsDereferenceableOrNull callback is used to rule out capturing for + /// certain comparisons. + UseCaptureKind + DetermineUseCaptureKind(const Use &U, + llvm::function_ref + IsDereferenceableOrNull); + /// PointerMayBeCaptured - Visit the value and the values derived from it and /// find values which appear to be capturing the pointer value. This feeds /// results into and is controlled by the CaptureTracker object. diff --git a/llvm/include/llvm/Analysis/CmpInstAnalysis.h b/llvm/include/llvm/Analysis/CmpInstAnalysis.h index 3d34cd12aea4..332eb9b66e9c 100644 --- a/llvm/include/llvm/Analysis/CmpInstAnalysis.h +++ b/llvm/include/llvm/Analysis/CmpInstAnalysis.h @@ -17,7 +17,7 @@ #include "llvm/IR/InstrTypes.h" namespace llvm { - class ICmpInst; + class Type; class Value; /// Encode a icmp predicate into a three bit mask. These bits are carefully @@ -43,7 +43,7 @@ namespace llvm { /// 110 6 A <= B /// 111 7 Always true /// - unsigned getICmpCode(const ICmpInst *ICI, bool InvertPred = false); + unsigned getICmpCode(CmpInst::Predicate Pred); /// This is the complement of getICmpCode. It turns a predicate code into /// either a constant true or false or the predicate for a new ICmp. @@ -58,6 +58,39 @@ namespace llvm { /// equality comparison (which is signless). bool predicatesFoldable(CmpInst::Predicate P1, CmpInst::Predicate P2); + /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate + /// into a four bit mask. + inline unsigned getFCmpCode(CmpInst::Predicate CC) { + assert(CmpInst::FCMP_FALSE <= CC && CC <= CmpInst::FCMP_TRUE && + "Unexpected FCmp predicate!"); + // Take advantage of the bit pattern of CmpInst::Predicate here. 
+ // U L G E + static_assert(CmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0 + static_assert(CmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1 + static_assert(CmpInst::FCMP_OGT == 2, ""); // 0 0 1 0 + static_assert(CmpInst::FCMP_OGE == 3, ""); // 0 0 1 1 + static_assert(CmpInst::FCMP_OLT == 4, ""); // 0 1 0 0 + static_assert(CmpInst::FCMP_OLE == 5, ""); // 0 1 0 1 + static_assert(CmpInst::FCMP_ONE == 6, ""); // 0 1 1 0 + static_assert(CmpInst::FCMP_ORD == 7, ""); // 0 1 1 1 + static_assert(CmpInst::FCMP_UNO == 8, ""); // 1 0 0 0 + static_assert(CmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1 + static_assert(CmpInst::FCMP_UGT == 10, ""); // 1 0 1 0 + static_assert(CmpInst::FCMP_UGE == 11, ""); // 1 0 1 1 + static_assert(CmpInst::FCMP_ULT == 12, ""); // 1 1 0 0 + static_assert(CmpInst::FCMP_ULE == 13, ""); // 1 1 0 1 + static_assert(CmpInst::FCMP_UNE == 14, ""); // 1 1 1 0 + static_assert(CmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1 + return CC; + } + + /// This is the complement of getFCmpCode. It turns a predicate code into + /// either a constant true or false or the predicate for a new FCmp. + /// Non-NULL return value will be a true or false constant. + /// NULL return means a new FCmp is needed. The predicate is output in Pred. + Constant *getPredForFCmpCode(unsigned Code, Type *OpTy, + CmpInst::Predicate &Pred); + /// Decompose an icmp into the form ((X & Mask) pred 0) if possible. The /// returned predicate is either == or !=. Returns false if decomposition /// fails. diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h index 615591aa83ad..a9431bca1125 100644 --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_CODEMETRICS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/InstructionCost.h" namespace llvm { class AssumptionCache; @@ -47,14 +48,14 @@ struct CodeMetrics { /// True if this function calls alloca (in the C sense). bool usesDynamicAlloca = false; - /// Number of instructions in the analyzed blocks. - unsigned NumInsts = false; + /// Code size cost of the analyzed blocks. + InstructionCost NumInsts = 0; /// Number of analyzed blocks. unsigned NumBlocks = false; /// Keeps track of basic block code size estimates. - DenseMap<const BasicBlock *, unsigned> NumBBInsts; + DenseMap<const BasicBlock *, InstructionCost> NumBBInsts; /// Keep track of the number of calls to 'big' functions. unsigned NumCalls = false; diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h index 37258c80e3a3..23ec7d6b70ec 100644 --- a/llvm/include/llvm/Analysis/ConstantFolding.h +++ b/llvm/include/llvm/Analysis/ConstantFolding.h @@ -19,16 +19,18 @@ #ifndef LLVM_ANALYSIS_CONSTANTFOLDING_H #define LLVM_ANALYSIS_CONSTANTFOLDING_H +#include <stdint.h> + namespace llvm { class APInt; template <typename T> class ArrayRef; class CallBase; class Constant; -class ConstantExpr; class DSOLocalEquivalent; class DataLayout; class Function; class GlobalValue; +class GlobalVariable; class Instruction; class TargetLibraryInfo; class Type; @@ -65,14 +67,13 @@ Constant *ConstantFoldInstOperands(Instruction *I, ArrayRef<Constant *> Ops, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr); -/// ConstantFoldCompareInstOperands - Attempt to constant fold a compare -/// instruction (icmp/fcmp) with the specified operands. If it fails, it -/// returns a constant expression of the specified operands.
-/// -Constant * -ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, - Constant *RHS, const DataLayout &DL, - const TargetLibraryInfo *TLI = nullptr); +/// Attempt to constant fold a compare instruction (icmp/fcmp) with the +/// specified operands. If it fails, it returns a constant expression of the +/// specified operands. +/// Denormal inputs may be flushed based on the denormal handling mode. +Constant *ConstantFoldCompareInstOperands( + unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, + const TargetLibraryInfo *TLI = nullptr, const Instruction *I = nullptr); /// Attempt to constant fold a unary operation with the specified /// operand. If it fails, it returns a constant expression of the specified @@ -86,6 +87,21 @@ Constant *ConstantFoldUnaryOpOperand(unsigned Opcode, Constant *Op, Constant *ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL); +/// Attempt to constant fold a floating point binary operation with the +/// specified operands, applying the denormal handling mode to the operands. If +/// it fails, it returns a constant expression of the specified operands. +Constant *ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, + Constant *RHS, const DataLayout &DL, + const Instruction *I); + +/// Attempt to flush a floating point constant according to the denormal mode +/// set in the instruction's parent function attributes. If it is flushed, +/// return a zero with the correct sign; otherwise return the original +/// constant. Inputs and outputs to floating point instructions can have their +/// mode set separately, so the direction is also needed. +Constant *FlushFPConstant(Constant *Operand, const Instruction *I, + bool IsOutput); + /// Attempt to constant fold a select instruction with the specified /// operands. The constant result is returned if successful; if not, null is /// returned. @@ -173,6 +189,8 @@ Constant *ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy, /// Check whether the given call has no side-effects. /// Specifically checks for math routines which sometimes set errno. bool isMathLibCallNoop(const CallBase *Call, const TargetLibraryInfo *TLI); + +Constant *ReadByteArrayFromGlobal(const GlobalVariable *GV, uint64_t Offset); } #endif diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h index d7800f578325..2c83658b81dc 100644 --- a/llvm/include/llvm/Analysis/ConstraintSystem.h +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -11,7 +11,6 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include <string> @@ -37,7 +36,7 @@ class ConstraintSystem { bool mayHaveSolutionImpl(); public: - bool addVariableRow(const SmallVector<int64_t, 8> &R) { + bool addVariableRow(ArrayRef<int64_t> R) { assert(Constraints.empty() || R.size() == Constraints.back().size()); // If all variable coefficients are 0, the constraint does not provide any // usable information. @@ -49,11 +48,16 @@ public: GCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)A}, {32, GCD}) .getZExtValue(); } - Constraints.push_back(R); + Constraints.emplace_back(R.begin(), R.end()); return true; } - bool addVariableRowFill(const SmallVector<int64_t, 8> &R) { + bool addVariableRowFill(ArrayRef<int64_t> R) { + // If all variable coefficients are 0, the constraint does not provide any + // usable information.
+ if (all_of(makeArrayRef(R).drop_front(1), [](int64_t C) { return C == 0; })) + return false; + for (auto &CR : Constraints) { while (CR.size() != R.size()) CR.push_back(0); @@ -75,7 +79,14 @@ public: bool isConditionImplied(SmallVector<int64_t, 8> R) const; + ArrayRef<int64_t> getLastConstraint() { return Constraints[0]; } void popLastConstraint() { Constraints.pop_back(); } + void popLastNVariables(unsigned N) { + for (auto &C : Constraints) { + for (unsigned i = 0; i < N; i++) + C.pop_back(); + } + } /// Returns the number of rows in the constraint system. unsigned size() const { return Constraints.size(); } diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index c5107da2a017..7649e630b23d 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -18,9 +18,11 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DependenceGraphBuilder.h" #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/IR/Instructions.h" namespace llvm { +class Function; +class Loop; +class LoopInfo; class DDGNode; class DDGEdge; using DDGNodeBase = DGNode<DDGNode, DDGEdge>; diff --git a/llvm/include/llvm/Analysis/DDGPrinter.h b/llvm/include/llvm/Analysis/DDGPrinter.h index 4477b387fe50..d93c28280bac 100644 --- a/llvm/include/llvm/Analysis/DDGPrinter.h +++ b/llvm/include/llvm/Analysis/DDGPrinter.h @@ -16,10 +16,11 @@ #define LLVM_ANALYSIS_DDGPRINTER_H #include "llvm/Analysis/DDG.h" -#include "llvm/Pass.h" #include "llvm/Support/DOTGraphTraits.h" namespace llvm { +class LPMUpdater; +class Loop; //===--------------------------------------------------------------------===// // Implementation of DDG DOT Printer for a loop. diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h index d8021907b5b2..c35e189de6fc 100644 --- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -14,23 +14,156 @@ #define LLVM_ANALYSIS_DOTGRAPHTRAITSPASS_H #include "llvm/Analysis/CFGPrinter.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" namespace llvm { +/// Default traits class for extracting a graph from an analysis pass. +/// +/// This assumes that 'GraphT' is 'AnalysisT::Result *', and passes it through. +template <typename Result, typename GraphT> +struct DefaultAnalysisGraphTraits { + static GraphT getGraph(Result R) { return &R; } +}; + +template <typename GraphT> +void viewGraphForFunction(Function &F, GraphT Graph, StringRef Name, + bool IsSimple) { + std::string GraphName = DOTGraphTraits<GraphT>::getGraphName(&Graph); + + ViewGraph(Graph, Name, IsSimple, + GraphName + " for '" + F.getName() + "' function"); +} + +template <typename AnalysisT, bool IsSimple, + typename GraphT = typename AnalysisT::Result *, + typename AnalysisGraphTraitsT = + DefaultAnalysisGraphTraits<typename AnalysisT::Result &, GraphT>> +struct DOTGraphTraitsViewer + : PassInfoMixin<DOTGraphTraitsViewer<AnalysisT, IsSimple>> { + DOTGraphTraitsViewer(StringRef GraphName) : Name(GraphName) {} + + /// Return true if this function should be processed. + /// + /// An implementation of this class may override this function to indicate that + /// only certain functions should be viewed. + /// + /// @param Result The current analysis result for this function.
+ virtual bool processFunction(Function &F, + const typename AnalysisT::Result &Result) { + return true; + } + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM) { + auto &Result = FAM.getResult<AnalysisT>(F); + if (!processFunction(F, Result)) + return PreservedAnalyses::all(); + + GraphT Graph = AnalysisGraphTraitsT::getGraph(Result); + viewGraphForFunction(F, Graph, Name, IsSimple); + + return PreservedAnalyses::all(); + }; + +protected: + /// Avoid compiler warning "has virtual functions but non-virtual destructor + /// [-Wnon-virtual-dtor]" in derived classes. + /// + /// DOTGraphTraitsViewer is also used as a mixin for avoiding repeated + /// implementation of viewer passes, i.e., there should be no + /// runtime-polymorphism/downcasting involving this class and hence no + /// virtual destructor needed. Making this dtor protected stops accidental + /// invocation when the derived class destructor should have been called. + /// Those derived classes should be marked final to avoid the warning. + ~DOTGraphTraitsViewer() {} + +private: + StringRef Name; +}; + +template <typename GraphT> +void printGraphForFunction(Function &F, GraphT Graph, StringRef Name, + bool IsSimple) { + std::string Filename = Name.str() + "." + F.getName().str() + ".dot"; + std::error_code EC; + + errs() << "Writing '" << Filename << "'..."; + + raw_fd_ostream File(Filename, EC, sys::fs::OF_TextWithCRLF); + std::string GraphName = DOTGraphTraits<GraphT>::getGraphName(Graph); + + if (!EC) + WriteGraph(File, Graph, IsSimple, + GraphName + " for '" + F.getName() + "' function"); + else + errs() << " error opening file for writing!"; + errs() << "\n"; +} + +template <typename AnalysisT, bool IsSimple, + typename GraphT = typename AnalysisT::Result *, + typename AnalysisGraphTraitsT = + DefaultAnalysisGraphTraits<typename AnalysisT::Result &, GraphT>> +struct DOTGraphTraitsPrinter + : PassInfoMixin<DOTGraphTraitsPrinter<AnalysisT, IsSimple>> { + DOTGraphTraitsPrinter(StringRef GraphName) : Name(GraphName) {} + + /// Return true if this function should be processed. + /// + /// An implementation of this class may override this function to indicate that + /// only certain functions should be viewed. + /// + /// @param Analysis The current analysis result for this function. + virtual bool processFunction(Function &F, + const typename AnalysisT::Result &Result) { + return true; + } + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM) { + auto &Result = FAM.getResult<AnalysisT>(F); + if (!processFunction(F, Result)) + return PreservedAnalyses::all(); + + GraphT Graph = AnalysisGraphTraitsT::getGraph(Result); + + printGraphForFunction(F, Graph, Name, IsSimple); + + return PreservedAnalyses::all(); + }; + +protected: + /// Avoid compiler warning "has virtual functions but non-virtual destructor + /// [-Wnon-virtual-dtor]" in derived classes. + /// + /// DOTGraphTraitsPrinter is also used as a mixin for avoiding repeated + /// implementation of printer passes, i.e., there should be no + /// runtime-polymorphism/downcasting involving this class and hence no + /// virtual destructor needed. Making this dtor protected stops accidental + /// invocation when the derived class destructor should have been called. + /// Those derived classes should be marked final to avoid the warning. + ~DOTGraphTraitsPrinter() {} + +private: + StringRef Name; +}; + /// Default traits class for extracting a graph from an analysis pass. /// /// This assumes that 'GraphT' is 'AnalysisT *' and so just passes it through.
template -struct DefaultAnalysisGraphTraits { +struct LegacyDefaultAnalysisGraphTraits { static GraphT getGraph(AnalysisT *A) { return A; } }; -template < - typename AnalysisT, bool IsSimple, typename GraphT = AnalysisT *, - typename AnalysisGraphTraitsT = DefaultAnalysisGraphTraits > -class DOTGraphTraitsViewer : public FunctionPass { +template > +class DOTGraphTraitsViewerWrapperPass : public FunctionPass { public: - DOTGraphTraitsViewer(StringRef GraphName, char &ID) + DOTGraphTraitsViewerWrapperPass(StringRef GraphName, char &ID) : FunctionPass(ID), Name(GraphName) {} /// Return true if this function should be processed. @@ -50,10 +183,7 @@ public: return false; GraphT Graph = AnalysisGraphTraitsT::getGraph(&Analysis); - std::string GraphName = DOTGraphTraits::getGraphName(Graph); - std::string Title = GraphName + " for '" + F.getName().str() + "' function"; - - ViewGraph(Graph, Name, IsSimple, Title); + viewGraphForFunction(F, Graph, Name, IsSimple); return false; } @@ -67,12 +197,12 @@ private: std::string Name; }; -template < - typename AnalysisT, bool IsSimple, typename GraphT = AnalysisT *, - typename AnalysisGraphTraitsT = DefaultAnalysisGraphTraits > -class DOTGraphTraitsPrinter : public FunctionPass { +template > +class DOTGraphTraitsPrinterWrapperPass : public FunctionPass { public: - DOTGraphTraitsPrinter(StringRef GraphName, char &ID) + DOTGraphTraitsPrinterWrapperPass(StringRef GraphName, char &ID) : FunctionPass(ID), Name(GraphName) {} /// Return true if this function should be processed. @@ -92,20 +222,7 @@ public: return false; GraphT Graph = AnalysisGraphTraitsT::getGraph(&Analysis); - std::string Filename = Name + "." + F.getName().str() + ".dot"; - std::error_code EC; - - errs() << "Writing '" << Filename << "'..."; - - raw_fd_ostream File(Filename, EC, sys::fs::OF_TextWithCRLF); - std::string GraphName = DOTGraphTraits::getGraphName(Graph); - std::string Title = GraphName + " for '" + F.getName().str() + "' function"; - - if (!EC) - WriteGraph(File, Graph, IsSimple, Title); - else - errs() << " error opening file for writing!"; - errs() << "\n"; + printGraphForFunction(F, Graph, Name, IsSimple); return false; } @@ -119,12 +236,12 @@ private: std::string Name; }; -template < - typename AnalysisT, bool IsSimple, typename GraphT = AnalysisT *, - typename AnalysisGraphTraitsT = DefaultAnalysisGraphTraits > -class DOTGraphTraitsModuleViewer : public ModulePass { +template > +class DOTGraphTraitsModuleViewerWrapperPass : public ModulePass { public: - DOTGraphTraitsModuleViewer(StringRef GraphName, char &ID) + DOTGraphTraitsModuleViewerWrapperPass(StringRef GraphName, char &ID) : ModulePass(ID), Name(GraphName) {} bool runOnModule(Module &M) override { @@ -145,12 +262,12 @@ private: std::string Name; }; -template < - typename AnalysisT, bool IsSimple, typename GraphT = AnalysisT *, - typename AnalysisGraphTraitsT = DefaultAnalysisGraphTraits > -class DOTGraphTraitsModulePrinter : public ModulePass { +template > +class DOTGraphTraitsModulePrinterWrapperPass : public ModulePass { public: - DOTGraphTraitsModulePrinter(StringRef GraphName, char &ID) + DOTGraphTraitsModulePrinterWrapperPass(StringRef GraphName, char &ID) : ModulePass(ID), Name(GraphName) {} bool runOnModule(Module &M) override { diff --git a/llvm/include/llvm/Analysis/Delinearization.h b/llvm/include/llvm/Analysis/Delinearization.h index 6e942530f253..95a36b8b79a4 100644 --- a/llvm/include/llvm/Analysis/Delinearization.h +++ b/llvm/include/llvm/Analysis/Delinearization.h @@ -16,11 +16,11 @@ #ifndef 
LLVM_ANALYSIS_DELINEARIZATION_H #define LLVM_ANALYSIS_DELINEARIZATION_H -#include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { +class raw_ostream; +template <typename T> class SmallVectorImpl; class GetElementPtrInst; class ScalarEvolution; class SCEV; @@ -125,6 +125,17 @@ bool getIndexExpressionsFromGEP(ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &Subscripts, SmallVectorImpl<int> &Sizes); +/// Implementation of fixed-size array delinearization. Try to delinearize +/// the access function for a fixed-size multi-dimensional array, by deriving +/// subscripts from GEP instructions. Returns true upon success and false +/// otherwise. \p Inst is the load/store instruction whose pointer operand is +/// the one we want to delinearize. \p AccessFn is its corresponding SCEV +/// expression w.r.t. the surrounding loop. +bool tryDelinearizeFixedSizeImpl(ScalarEvolution *SE, Instruction *Inst, + const SCEV *AccessFn, + SmallVectorImpl<const SCEV *> &Subscripts, + SmallVectorImpl<int> &Sizes); + struct DelinearizationPrinterPass : public PassInfoMixin<DelinearizationPrinterPass> { explicit DelinearizationPrinterPass(raw_ostream &OS); diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 638f4869d677..a34afe9fb38d 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -927,9 +927,9 @@ namespace llvm { bool tryDelinearize(Instruction *Src, Instruction *Dst, SmallVectorImpl &Pair); - /// Tries to delinearize access function for a fixed size multi-dimensional - /// array, by deriving subscripts from GEP instructions. Returns true upon - /// success and false otherwise. + /// Tries to delinearize \p Src and \p Dst access functions for a fixed size + /// multi-dimensional array. Calls tryDelinearizeFixedSizeImpl() to + /// delinearize \p Src and \p Dst separately. bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, const SCEV *DstAccessFn, diff --git a/llvm/include/llvm/Analysis/DivergenceAnalysis.h b/llvm/include/llvm/Analysis/DivergenceAnalysis.h index c52b42ae8dc2..4c2a5399ea54 100644 --- a/llvm/include/llvm/Analysis/DivergenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DivergenceAnalysis.h @@ -17,16 +17,16 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/SyncDependenceAnalysis.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" +#include "llvm/IR/PassManager.h" #include <vector> namespace llvm { -class Value; +class Function; class Instruction; class Loop; class raw_ostream; class TargetTransformInfo; +class Value; /// \brief Generic divergence analysis for reducible CFGs. /// @@ -41,7 +41,7 @@ public: /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop. /// Otherwise the whole function is analyzed. /// \param IsLCSSAForm whether the analysis may assume that the IR in the - /// region in in LCSSA form. + /// region is in LCSSA form.
DivergenceAnalysisImpl(const Function &F, const Loop *RegionLoop, const DominatorTree &DT, const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm); diff --git a/llvm/include/llvm/Analysis/DomPrinter.h b/llvm/include/llvm/Analysis/DomPrinter.h index e6df12d88072..83fe721346ab 100644 --- a/llvm/include/llvm/Analysis/DomPrinter.h +++ b/llvm/include/llvm/Analysis/DomPrinter.h @@ -14,30 +14,120 @@ #ifndef LLVM_ANALYSIS_DOMPRINTER_H #define LLVM_ANALYSIS_DOMPRINTER_H +#include "llvm/Analysis/DOTGraphTraitsPass.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" namespace llvm { -class DomTreePrinterPass : public PassInfoMixin { -public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + std::string getNodeLabel(DomTreeNode *Node, DomTreeNode *Graph) { + + BasicBlock *BB = Node->getBlock(); + + if (!BB) + return "Post dominance root node"; + + if (isSimple()) + return DOTGraphTraits::getSimpleNodeLabel(BB, nullptr); + + return DOTGraphTraits::getCompleteNodeLabel(BB, nullptr); + } +}; + +template <> +struct DOTGraphTraits + : public DOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) + : DOTGraphTraits(isSimple) {} + + static std::string getGraphName(DominatorTree *DT) { + return "Dominator tree"; + } + + std::string getNodeLabel(DomTreeNode *Node, DominatorTree *G) { + return DOTGraphTraits::getNodeLabel(Node, + G->getRootNode()); + } +}; + +template<> +struct DOTGraphTraits + : public DOTGraphTraits { + + DOTGraphTraits (bool isSimple=false) + : DOTGraphTraits(isSimple) {} + + static std::string getGraphName(PostDominatorTree *DT) { + return "Post dominator tree"; + } + + std::string getNodeLabel(DomTreeNode *Node, + PostDominatorTree *G) { + return DOTGraphTraits::getNodeLabel(Node, G->getRootNode()); + } +}; + +struct DomViewer final : DOTGraphTraitsViewer { + DomViewer() : DOTGraphTraitsViewer("dom") {} +}; + +struct DomOnlyViewer final : DOTGraphTraitsViewer { + DomOnlyViewer() + : DOTGraphTraitsViewer("domonly") {} +}; + +struct PostDomViewer final + : DOTGraphTraitsViewer { + PostDomViewer() + : DOTGraphTraitsViewer("postdom") {} +}; + +struct PostDomOnlyViewer final + : DOTGraphTraitsViewer { + PostDomOnlyViewer() + : DOTGraphTraitsViewer("postdomonly") {} +}; + +struct DomPrinter final : DOTGraphTraitsPrinter { + DomPrinter() : DOTGraphTraitsPrinter("dom") {} +}; + +struct DomOnlyPrinter final + : DOTGraphTraitsPrinter { + DomOnlyPrinter() + : DOTGraphTraitsPrinter("domonly") {} +}; + +struct PostDomPrinter final + : DOTGraphTraitsPrinter { + PostDomPrinter() + : DOTGraphTraitsPrinter("postdom") {} }; -class DomTreeOnlyPrinterPass : public PassInfoMixin { -public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +struct PostDomOnlyPrinter final + : DOTGraphTraitsPrinter { + PostDomOnlyPrinter() + : DOTGraphTraitsPrinter("postdomonly") {} }; } // namespace llvm namespace llvm { class FunctionPass; - FunctionPass *createDomPrinterPass(); - FunctionPass *createDomOnlyPrinterPass(); - FunctionPass *createDomViewerPass(); - FunctionPass *createDomOnlyViewerPass(); - FunctionPass *createPostDomPrinterPass(); - FunctionPass *createPostDomOnlyPrinterPass(); - FunctionPass *createPostDomViewerPass(); - FunctionPass *createPostDomOnlyViewerPass(); + FunctionPass *createDomPrinterWrapperPassPass(); + FunctionPass 
*createDomOnlyPrinterWrapperPassPass(); + FunctionPass *createDomViewerWrapperPassPass(); + FunctionPass *createDomOnlyViewerWrapperPassPass(); + FunctionPass *createPostDomPrinterWrapperPassPass(); + FunctionPass *createPostDomOnlyPrinterWrapperPassPass(); + FunctionPass *createPostDomViewerWrapperPassPass(); + FunctionPass *createPostDomOnlyViewerWrapperPassPass(); } // End llvm namespace #endif diff --git a/llvm/include/llvm/Analysis/DomTreeUpdater.h b/llvm/include/llvm/Analysis/DomTreeUpdater.h index d09154d506ed..ddb958455ccd 100644 --- a/llvm/include/llvm/Analysis/DomTreeUpdater.h +++ b/llvm/include/llvm/Analysis/DomTreeUpdater.h @@ -150,49 +150,6 @@ public: /// awaiting deletion immediately. void recalculate(Function &F); - /// \deprecated { Submit an edge insertion to all available trees. The Eager - /// Strategy flushes this update immediately while the Lazy Strategy queues - /// the update. An internal function checks if the edge exists in the CFG in - /// DEBUG mode. CAUTION! This function has to be called *after* making the - /// update on the actual CFG. It is illegal to submit any update that has - /// already been applied. } - LLVM_ATTRIBUTE_DEPRECATED(void insertEdge(BasicBlock *From, BasicBlock *To), - "Use applyUpdates() instead."); - - /// \deprecated {Submit an edge insertion to all available trees. - /// Under either Strategy, an invalid update will be discard silently. - /// Invalid update means inserting an edge that does not exist in the CFG. - /// The Eager Strategy flushes this update immediately while the Lazy Strategy - /// queues the update. It is only recommended to use this method when you - /// want to discard an invalid update. - /// CAUTION! It is illegal to submit any update that has already been - /// submitted. } - LLVM_ATTRIBUTE_DEPRECATED(void insertEdgeRelaxed(BasicBlock *From, - BasicBlock *To), - "Use applyUpdatesPermissive() instead."); - - /// \deprecated { Submit an edge deletion to all available trees. The Eager - /// Strategy flushes this update immediately while the Lazy Strategy queues - /// the update. An internal function checks if the edge doesn't exist in the - /// CFG in DEBUG mode. - /// CAUTION! This function has to be called *after* making the update on the - /// actual CFG. It is illegal to submit any update that has already been - /// submitted. } - LLVM_ATTRIBUTE_DEPRECATED(void deleteEdge(BasicBlock *From, BasicBlock *To), - "Use applyUpdates() instead."); - - /// \deprecated { Submit an edge deletion to all available trees. - /// Under either Strategy, an invalid update will be discard silently. - /// Invalid update means deleting an edge that exists in the CFG. - /// The Eager Strategy flushes this update immediately while the Lazy Strategy - /// queues the update. It is only recommended to use this method when you - /// want to discard an invalid update. - /// CAUTION! It is illegal to submit any update that has already been - /// submitted. } - LLVM_ATTRIBUTE_DEPRECATED(void deleteEdgeRelaxed(BasicBlock *From, - BasicBlock *To), - "Use applyUpdatesPermissive() instead."); - /// Delete DelBB. DelBB will be removed from its Parent and /// erased from available trees if it exists and finally get deleted. /// Under Eager UpdateStrategy, DelBB will be processed immediately. 
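For out-of-tree code that still calls the removed DomTreeUpdater methods, the replacement is the batch applyUpdates() API that the deprecation notices above point to. A minimal sketch, assuming the caller has already rewired the IR (the function and block names here are illustrative, not part of this patch):

    #include "llvm/Analysis/DomTreeUpdater.h"
    using namespace llvm;

    // Submit both CFG changes in one batch, *after* mutating the IR. Under
    // the Lazy strategy the updates are queued and flushed on the next query
    // of the tree; under Eager they are applied immediately.
    void updateDomTreeAfterRewire(DominatorTree &DT, BasicBlock *From,
                                  BasicBlock *OldSucc, BasicBlock *NewSucc) {
      DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
      DTU.applyUpdates({{DominatorTree::Delete, From, OldSucc},
                        {DominatorTree::Insert, From, NewSucc}});
    }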
diff --git a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h index aa764be93b91..7a5f8f31bae3 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h @@ -17,7 +17,6 @@ #ifndef LLVM_ANALYSIS_DOMINANCEFRONTIERIMPL_H #define LLVM_ANALYSIS_DOMINANCEFRONTIERIMPL_H -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Config/llvm-config.h" diff --git a/llvm/include/llvm/Analysis/EHPersonalities.h b/llvm/include/llvm/Analysis/EHPersonalities.h index eaada6627494..660d431bb063 100644 --- a/llvm/include/llvm/Analysis/EHPersonalities.h +++ b/llvm/include/llvm/Analysis/EHPersonalities.h @@ -11,7 +11,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Support/ErrorHandling.h" namespace llvm { class BasicBlock; diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index cf07c873b17c..a0f5331fdba5 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -14,16 +14,33 @@ #ifndef LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" namespace llvm { +class DominatorTree; class Function; +class LoopInfo; class FunctionPropertiesInfo { + friend class FunctionPropertiesUpdater; + void updateForBB(const BasicBlock &BB, int64_t Direction); + void updateAggregateStats(const Function &F, const LoopInfo &LI); + void reIncludeBB(const BasicBlock &BB); + public: - static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, - const LoopInfo &LI); + static FunctionPropertiesInfo + getFunctionPropertiesInfo(const Function &F, FunctionAnalysisManager &FAM); + + bool operator==(const FunctionPropertiesInfo &FPI) const { + return std::memcmp(this, &FPI, sizeof(FunctionPropertiesInfo)) == 0; + } + + bool operator!=(const FunctionPropertiesInfo &FPI) const { + return !(*this == FPI); + } void print(raw_ostream &OS) const; @@ -57,6 +74,9 @@ public: // Number of Top Level Loops in the Function int64_t TopLevelLoopCount = 0; + + // All non-debug instructions + int64_t TotalInstructionCount = 0; }; // Analysis pass @@ -66,9 +86,9 @@ class FunctionPropertiesAnalysis public: static AnalysisKey Key; - using Result = FunctionPropertiesInfo; + using Result = const FunctionPropertiesInfo; - Result run(Function &F, FunctionAnalysisManager &FAM); + FunctionPropertiesInfo run(Function &F, FunctionAnalysisManager &FAM); }; /// Printer pass for the FunctionPropertiesAnalysis results. @@ -82,5 +102,24 @@ public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; +/// Correctly update FunctionPropertiesInfo post-inlining. A +/// FunctionPropertiesUpdater keeps the state necessary for tracking the changes +/// llvm::InlineFunction makes. The idea is that inlining will at most modify +/// a few BBs of the Caller (maybe the entry BB and definitely the callsite BB) +/// and potentially affect exception handling BBs in the case of invoke +/// inlining. 
+class FunctionPropertiesUpdater { +public: + FunctionPropertiesUpdater(FunctionPropertiesInfo &FPI, const CallBase &CB); + + void finish(FunctionAnalysisManager &FAM) const; + +private: + FunctionPropertiesInfo &FPI; + const BasicBlock &CallSiteBB; + const Function &Caller; + + DenseSet<const BasicBlock *> Successors; +}; } // namespace llvm #endif // LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H diff --git a/llvm/include/llvm/Analysis/GlobalsModRef.h b/llvm/include/llvm/Analysis/GlobalsModRef.h index 7daaa7f484de..4d8ed10bb18e 100644 --- a/llvm/include/llvm/Analysis/GlobalsModRef.h +++ b/llvm/include/llvm/Analysis/GlobalsModRef.h @@ -14,15 +14,14 @@ #define LLVM_ANALYSIS_GLOBALSMODREF_H #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include <list> namespace llvm { class CallGraph; +class Function; /// An alias analysis result set for globals. /// @@ -79,6 +78,8 @@ class GlobalsAAResult : public AAResultBase<GlobalsAAResult> { const DataLayout &DL, std::function<const TargetLibraryInfo &(Function &F)> GetTLI); + friend struct RecomputeGlobalsAAPass; + public: GlobalsAAResult(GlobalsAAResult &&Arg); ~GlobalsAAResult(); @@ -139,6 +140,10 @@ public: GlobalsAAResult run(Module &M, ModuleAnalysisManager &AM); }; +struct RecomputeGlobalsAAPass : PassInfoMixin<RecomputeGlobalsAAPass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + /// Legacy wrapper pass to provide the GlobalsAAResult object. class GlobalsAAWrapperPass : public ModulePass { std::unique_ptr<GlobalsAAResult> Result; diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index 90ab2833e428..a3f1c1335cac 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -51,12 +51,13 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" namespace llvm { +class Module; + namespace IRSimilarity { struct IRInstructionDataList; @@ -546,7 +547,7 @@ struct IRInstructionMapper { // an outlined function. Also, assume-like intrinsics could be removed // from the region, removing arguments, causing discrepancies in the // number of inputs between different regions. - if (II.isLifetimeStartOrEnd() || II.isAssumeLikeIntrinsic()) + if (II.isAssumeLikeIntrinsic()) return Illegal; return EnableIntrinsics ? Legal : Illegal; } @@ -559,6 +560,18 @@ struct IRInstructionMapper { return Illegal; if (!F && !IsIndirectCall) return Illegal; + // Functions marked with the swifttailcc and tailcc calling conventions + // require special handling when outlining musttail functions. The + // calling convention must be passed down to the outlined function as + // well. Further, there is special handling for musttail calls as well, + // requiring a return call directly after. For now, the outliner does not + // support this, so we do not handle matching this case either. + if ((CI.getCallingConv() == CallingConv::SwiftTail || + CI.getCallingConv() == CallingConv::Tail) && + !EnableMustTailCalls) + return Illegal; + if (CI.isMustTailCall() && !EnableMustTailCalls) + return Illegal; return Legal; } // TODO: We do not currently handle similarity that changes the control flow. @@ -580,6 +593,10 @@ struct IRInstructionMapper { // Flag that lets the classifier know whether we should allow intrinsics to // be checked for similarity.
bool EnableIntrinsics = false; + + // Flag that lets the classifier know whether we should allow tail calls to + // be checked for similarity. + bool EnableMustTailCalls = false; }; /// Maps an Instruction to a member of InstrType. @@ -814,8 +831,6 @@ public: void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const { for (IRInstructionData &ID : *this) { BasicBlock *BB = ID.Inst->getParent(); - if (BBSet.contains(BB)) - continue; BBSet.insert(BB); } } @@ -826,10 +841,8 @@ public: SmallVector<BasicBlock *> &BBList) const { for (IRInstructionData &ID : *this) { BasicBlock *BB = ID.Inst->getParent(); - if (BBSet.contains(BB)) - continue; - BBSet.insert(BB); - BBList.push_back(BB); + if (BBSet.insert(BB).second) + BBList.push_back(BB); } } @@ -967,11 +980,13 @@ public: IRSimilarityIdentifier(bool MatchBranches = true, bool MatchIndirectCalls = true, bool MatchCallsWithName = false, - bool MatchIntrinsics = true) + bool MatchIntrinsics = true, + bool MatchMustTailCalls = true) : Mapper(&InstDataAllocator, &InstDataListAllocator), EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls), EnableMatchingCallsByName(MatchCallsWithName), - EnableIntrinsics(MatchIntrinsics) {} + EnableIntrinsics(MatchIntrinsics), + EnableMustTailCalls(MatchMustTailCalls) {} private: /// Map the instructions in the module to unsigned integers, using mapping @@ -1024,7 +1039,7 @@ public: // If we've already analyzed a Module or set of Modules, we must clear // the SimilarityCandidates to make sure we do not have old values // hanging around. - if (SimilarityCandidates.hasValue()) + if (SimilarityCandidates) SimilarityCandidates->clear(); else SimilarityCandidates = SimilarityGroupList(); @@ -1064,6 +1079,10 @@ private: /// similarity. bool EnableIntrinsics = true; + + // The flag variable that marks whether we should allow tail calls + // to be checked for similarity. + bool EnableMustTailCalls = false; + /// The SimilarityGroups found with the most recent run of \ref /// findSimilarity. None if there is no recent run. Optional<SimilarityGroupList> SimilarityCandidates; diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index dec488a6f26d..231d3bbf534b 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -13,27 +13,23 @@ #ifndef LLVM_ANALYSIS_IVDESCRIPTORS_H #define LLVM_ANALYSIS_IVDESCRIPTORS_H -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Support/Casting.h" namespace llvm { -class DemandedBits; class AssumptionCache; +class DemandedBits; +class DominatorTree; +class Instruction; class Loop; class PredicatedScalarEvolution; class ScalarEvolution; class SCEV; -class DominatorTree; +class StoreInst; /// These are the kinds of recurrences that we support.
enum class RecurKind { @@ -74,14 +70,14 @@ class RecurrenceDescriptor { public: RecurrenceDescriptor() = default; - RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K, - FastMathFlags FMF, Instruction *ExactFP, Type *RT, - bool Signed, bool Ordered, + RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store, + RecurKind K, FastMathFlags FMF, Instruction *ExactFP, + Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl<Instruction *> &CI, unsigned MinWidthCastToRecurTy) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), - ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), - IsOrdered(Ordered), + : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), + Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), + IsSigned(Signed), IsOrdered(Ordered), MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) { CastInsts.insert(CI.begin(), CI.end()); } @@ -168,22 +164,21 @@ public: /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be /// computed. - static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, - FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + static bool + AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, + FastMathFlags FuncFMF, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be - /// computed. - static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + /// computed. If \p SE is non-null, store instructions to loop-invariant + /// addresses are processed. + static bool + isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a first-order recurrence. A first-order recurrence /// is a non-reduction recurrence relation in which the value of the @@ -275,6 +270,11 @@ public: cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd; } + /// Reductions may store a temporary or final result to an invariant address. + /// If there is such a store in the loop then, after a successful run of + /// the AddReductionVar method, this field will be assigned the last store + /// encountered. + StoreInst *IntermediateStore = nullptr; + private: // The starting value of the recurrence. // It does not have to be zero!
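The new ScalarEvolution parameter and the public IntermediateStore field above work together; a short sketch of how a caller might use them (Phi, TheLoop, AC, DT, and SE are assumed to already be in scope, e.g. inside a loop transform):

    // Passing SE allows reductions whose intermediate values are stored to a
    // loop-invariant address to be recognized; when such a store exists, the
    // descriptor records the last one encountered.
    RecurrenceDescriptor RedDes;
    if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes,
                                             /*DB=*/nullptr, AC, DT, SE)) {
      if (StoreInst *SI = RedDes.IntermediateStore) {
        // The reduction also writes its result to memory through SI.
      }
    }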
diff --git a/llvm/include/llvm/Analysis/IVUsers.h b/llvm/include/llvm/Analysis/IVUsers.h index 390d09848dde..e5a496037691 100644 --- a/llvm/include/llvm/Analysis/IVUsers.h +++ b/llvm/include/llvm/Analysis/IVUsers.h @@ -23,8 +23,6 @@ namespace llvm { class AssumptionCache; class DominatorTree; -class Instruction; -class Value; class ScalarEvolution; class SCEV; class IVUsers; diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 0103ee7f8386..31524126027b 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -9,19 +9,20 @@ #ifndef LLVM_ANALYSIS_INLINEADVISOR_H #define LLVM_ANALYSIS_INLINEADVISOR_H +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyCallGraph.h" -#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/PassManager.h" #include <memory> -#include <unordered_set> namespace llvm { class BasicBlock; class CallBase; class Function; class Module; +class OptimizationRemark; +class ImportedFunctionsInliningStatistics; class OptimizationRemarkEmitter; struct ReplayInlinerSettings; @@ -40,6 +41,28 @@ struct ReplayInlinerSettings; /// training. enum class InliningAdvisorMode : int { Default, Release, Development }; +// Each entry represents an inline driver. +enum class InlinePass : int { + AlwaysInliner, + CGSCCInliner, + EarlyInliner, + ModuleInliner, + MLInliner, + ReplayCGSCCInliner, + ReplaySampleProfileInliner, + SampleProfileInliner, +}; + +/// Provides context on when an inline advisor is constructed in the pipeline +/// (e.g., link phase, inline driver). +struct InlineContext { + ThinOrFullLTOPhase LTOPhase; + + InlinePass Pass; +}; + +std::string AnnotateInlinePassName(InlineContext IC); + class InlineAdvisor; /// Capture state between an inlining decision having been made, and /// its impact being observable. When collecting model training data, this @@ -122,7 +145,7 @@ public: DefaultInlineAdvice(InlineAdvisor *Advisor, CallBase &CB, Optional<InlineCost> OIC, OptimizationRemarkEmitter &ORE, bool EmitRemarks = true) - : InlineAdvice(Advisor, CB, ORE, OIC.hasValue()), OriginalCB(&CB), + : InlineAdvice(Advisor, CB, ORE, OIC.has_value()), OriginalCB(&CB), OIC(OIC), EmitRemarks(EmitRemarks) {} private: @@ -158,7 +181,7 @@ public: /// This must be called when the Inliner pass is entered, to allow the /// InlineAdvisor to update internal state, as a result of function passes run /// between Inliner pass runs (for the same module). - virtual void onPassEntry() {} + virtual void onPassEntry(LazyCallGraph::SCC *SCC = nullptr) {} /// This must be called when the Inliner pass is exited, as function passes /// may be run subsequently. This allows an implementation of InlineAdvisor @@ -170,14 +193,22 @@ public: OS << "Unimplemented InlineAdvisor print\n"; } + /// NOTE: The pass name is annotated only when the inline advisor constructor provides an InlineContext.
+ const char *getAnnotatedInlinePassName() const { + return AnnotatedInlinePassName.c_str(); + } + protected: - InlineAdvisor(Module &M, FunctionAnalysisManager &FAM); + InlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + Optional IC = NoneType::None); virtual std::unique_ptr getAdviceImpl(CallBase &CB) = 0; virtual std::unique_ptr getMandatoryAdvice(CallBase &CB, bool Advice); Module &M; FunctionAnalysisManager &FAM; + const Optional IC; + const std::string AnnotatedInlinePassName; std::unique_ptr ImportedFunctionsStats; enum class MandatoryInliningKind { NotMandatory, Always, Never }; @@ -198,8 +229,8 @@ private: class DefaultInlineAdvisor : public InlineAdvisor { public: DefaultInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, - InlineParams Params) - : InlineAdvisor(M, FAM), Params(Params) {} + InlineParams Params, InlineContext IC) + : InlineAdvisor(M, FAM, IC), Params(Params) {} private: std::unique_ptr getAdviceImpl(CallBase &CB) override; @@ -223,7 +254,8 @@ public: return !PAC.preservedWhenStateless(); } bool tryCreate(InlineParams Params, InliningAdvisorMode Mode, - const ReplayInlinerSettings &ReplaySettings); + const ReplayInlinerSettings &ReplaySettings, + InlineContext IC); InlineAdvisor *getAdvisor() const { return Advisor.get(); } private: @@ -244,6 +276,9 @@ public: explicit InlineAdvisorAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + + PreservedAnalyses run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR); }; std::unique_ptr diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h index f86ee5a14874..756f1fb61f95 100644 --- a/llvm/include/llvm/Analysis/InlineCost.h +++ b/llvm/include/llvm/Analysis/InlineCost.h @@ -13,14 +13,17 @@ #ifndef LLVM_ANALYSIS_INLINECOST_H #define LLVM_ANALYSIS_INLINECOST_H -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Analysis/InlineModelFeatureMaps.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/PassManager.h" #include #include namespace llvm { +class AssumptionCache; +class OptimizationRemarkEmitter; class BlockFrequencyInfo; class CallBase; class DataLayout; @@ -52,6 +55,9 @@ const unsigned TotalAllocaSizeRecursiveCaller = 1024; /// Do not inline dynamic allocas that have been constant propagated to be /// static allocas above this amount in bytes. const uint64_t MaxSimplifiedDynamicAllocaToInline = 65536; + +const char FunctionInlineCostMultiplierAttributeName[] = + "function-inline-cost-multiplier"; } // namespace InlineConstants // The cost-benefit pair computed by cost-benefit analysis. @@ -217,6 +223,8 @@ struct InlineParams { Optional AllowRecursiveCall = false; }; +Optional getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind); + /// Generate the parameters to tune the inline cost analysis based only on the /// commandline options. 
InlineParams getInlineParams(); diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h index 1afa8a825f15..fb8236c28b25 100644 --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -10,6 +10,8 @@ #ifndef LLVM_ANALYSIS_INLINEMODELFEATUREMAPS_H #define LLVM_ANALYSIS_INLINEMODELFEATUREMAPS_H +#include "llvm/Analysis/TensorSpec.h" + #include #include #include @@ -127,7 +129,7 @@ inlineCostFeatureToMlFeature(InlineCostFeatureIndex Feature) { constexpr size_t NumberOfFeatures = static_cast(FeatureIndex::NumberOfFeatures); -extern const std::array FeatureNameMap; +extern const std::array FeatureMap; extern const char *const DecisionName; extern const char *const DefaultDecisionName; diff --git a/llvm/include/llvm/Analysis/InlineOrder.h b/llvm/include/llvm/Analysis/InlineOrder.h index 84252bcf1b06..aabd86c98780 100644 --- a/llvm/include/llvm/Analysis/InlineOrder.h +++ b/llvm/include/llvm/Analysis/InlineOrder.h @@ -10,10 +10,9 @@ #define LLVM_ANALYSIS_INLINEORDER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" +#include "llvm/IR/InstrTypes.h" #include #include @@ -71,34 +70,52 @@ private: size_t FirstIndex = 0; }; -class InlineSizePriority { +class InlinePriority { public: - InlineSizePriority(int Size) : Size(Size) {} + virtual ~InlinePriority() = default; + virtual bool hasLowerPriority(const CallBase *L, const CallBase *R) const = 0; + virtual void update(const CallBase *CB) = 0; + virtual bool updateAndCheckDecreased(const CallBase *CB) = 0; +}; - static bool isMoreDesirable(const InlineSizePriority &S1, - const InlineSizePriority &S2) { - return S1.Size < S2.Size; - } +class SizePriority : public InlinePriority { + using PriorityT = unsigned; + DenseMap Priorities; - static InlineSizePriority evaluate(CallBase *CB) { + static PriorityT evaluate(const CallBase *CB) { Function *Callee = CB->getCalledFunction(); - return InlineSizePriority(Callee->getInstructionCount()); + return Callee->getInstructionCount(); + } + + static bool isMoreDesirable(const PriorityT &P1, const PriorityT &P2) { + return P1 < P2; } - int Size; + bool hasLowerPriority(const CallBase *L, const CallBase *R) const override { + const auto I1 = Priorities.find(L); + const auto I2 = Priorities.find(R); + assert(I1 != Priorities.end() && I2 != Priorities.end()); + return isMoreDesirable(I2->second, I1->second); + } + +public: + // Update the priority associated with CB. + void update(const CallBase *CB) override { Priorities[CB] = evaluate(CB); }; + + bool updateAndCheckDecreased(const CallBase *CB) override { + auto It = Priorities.find(CB); + const auto OldPriority = It->second; + It->second = evaluate(CB); + const auto NewPriority = It->second; + return isMoreDesirable(OldPriority, NewPriority); + } }; -template class PriorityInlineOrder : public InlineOrder> { using T = std::pair; - using HeapT = std::pair; using reference = T &; using const_reference = const T &; - static bool cmp(const HeapT &P1, const HeapT &P2) { - return PriorityT::isMoreDesirable(P2.second, P1.second); - } - // A call site could become less desirable for inlining because of the size // growth from prior inlining into the callee. This method is used to lazily // update the desirability of a call site if it's decreasing. 
It is only @@ -107,31 +124,29 @@ class PriorityInlineOrder : public InlineOrder> { // pushed right back into the heap. For simplicity, those cases where // the desirability of a call site increases are ignored here. void adjust() { - bool Changed = false; - do { - CallBase *CB = Heap.front().first; - const PriorityT PreviousGoodness = Heap.front().second; - const PriorityT CurrentGoodness = PriorityT::evaluate(CB); - Changed = PriorityT::isMoreDesirable(PreviousGoodness, CurrentGoodness); - if (Changed) { - std::pop_heap(Heap.begin(), Heap.end(), cmp); - Heap.pop_back(); - Heap.push_back({CB, CurrentGoodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); - } - } while (Changed); + while (PriorityPtr->updateAndCheckDecreased(Heap.front())) { + std::pop_heap(Heap.begin(), Heap.end(), isLess); + std::push_heap(Heap.begin(), Heap.end(), isLess); + } } public: + PriorityInlineOrder(std::unique_ptr PriorityPtr) + : PriorityPtr(std::move(PriorityPtr)) { + isLess = [this](const CallBase *L, const CallBase *R) { + return this->PriorityPtr->hasLowerPriority(L, R); + }; + } + size_t size() override { return Heap.size(); } void push(const T &Elt) override { CallBase *CB = Elt.first; const int InlineHistoryID = Elt.second; - const PriorityT Goodness = PriorityT::evaluate(CB); - Heap.push_back({CB, Goodness}); - std::push_heap(Heap.begin(), Heap.end(), cmp); + Heap.push_back(CB); + PriorityPtr->update(CB); + std::push_heap(Heap.begin(), Heap.end(), isLess); InlineHistoryMap[CB] = InlineHistoryID; } @@ -139,10 +154,10 @@ public: assert(size() > 0); adjust(); - CallBase *CB = Heap.front().first; + CallBase *CB = Heap.front(); T Result = std::make_pair(CB, InlineHistoryMap[CB]); InlineHistoryMap.erase(CB); - std::pop_heap(Heap.begin(), Heap.end(), cmp); + std::pop_heap(Heap.begin(), Heap.end(), isLess); Heap.pop_back(); return Result; } @@ -151,21 +166,23 @@ public: assert(size() > 0); adjust(); - CallBase *CB = Heap.front().first; + CallBase *CB = Heap.front(); return *InlineHistoryMap.find(CB); } void erase_if(function_ref Pred) override { - auto PredWrapper = [=](HeapT P) -> bool { - return Pred(std::make_pair(P.first, 0)); + auto PredWrapper = [=](CallBase *CB) -> bool { + return Pred(std::make_pair(CB, 0)); }; llvm::erase_if(Heap, PredWrapper); - std::make_heap(Heap.begin(), Heap.end(), cmp); + std::make_heap(Heap.begin(), Heap.end(), isLess); } private: - SmallVector Heap; + SmallVector Heap; + std::function isLess; DenseMap InlineHistoryMap; + std::unique_ptr PriorityPtr; }; } // namespace llvm #endif // LLVM_ANALYSIS_INLINEORDER_H diff --git a/llvm/include/llvm/Analysis/InstSimplifyFolder.h b/llvm/include/llvm/Analysis/InstSimplifyFolder.h index 54ef1ddf6085..d4ea7d73ec92 100644 --- a/llvm/include/llvm/Analysis/InstSimplifyFolder.h +++ b/llvm/include/llvm/Analysis/InstSimplifyFolder.h @@ -22,12 +22,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetFolder.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilderFolder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" namespace llvm { +class Constant; /// InstSimplifyFolder - Use InstructionSimplify to fold operations to existing /// values. Also applies target-specific constant folding when not using @@ -47,108 +46,74 @@ public: // Return an existing value or a constant if the operation can be simplified. // Otherwise return nullptr. 
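The reworked folder above is meant to be installed into IRBuilder, which consults it before materializing new instructions. A minimal usage sketch (Ctx, DL, BB, and X are assumed to be in scope; this snippet is illustrative and not part of the patch):

    // With InstSimplifyFolder installed, "creating" a redundant instruction
    // returns an existing value instead of emitting new IR.
    IRBuilder<InstSimplifyFolder> Builder(Ctx, InstSimplifyFolder(DL));
    Builder.SetInsertPoint(BB);
    Value *S = Builder.CreateAdd(X, Constant::getNullValue(X->getType()));
    // S is simply X when simplification succeeds; otherwise an add is emitted.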
//===--------------------------------------------------------------------===// - Value *FoldAdd(Value *LHS, Value *RHS, bool HasNUW = false, - bool HasNSW = false) const override { - return SimplifyAddInst(LHS, RHS, HasNUW, HasNSW, SQ); + + Value *FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS) const override { + return simplifyBinOp(Opc, LHS, RHS, SQ); + } + + Value *FoldExactBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool IsExact) const override { + return simplifyBinOp(Opc, LHS, RHS, SQ); } - Value *FoldAnd(Value *LHS, Value *RHS) const override { - return SimplifyAndInst(LHS, RHS, SQ); + Value *FoldNoWrapBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool HasNUW, bool HasNSW) const override { + return simplifyBinOp(Opc, LHS, RHS, SQ); } - Value *FoldOr(Value *LHS, Value *RHS) const override { - return SimplifyOrInst(LHS, RHS, SQ); + Value *FoldBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + FastMathFlags FMF) const override { + return simplifyBinOp(Opc, LHS, RHS, FMF, SQ); } Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override { - return SimplifyICmpInst(P, LHS, RHS, SQ); + return simplifyICmpInst(P, LHS, RHS, SQ); } Value *FoldGEP(Type *Ty, Value *Ptr, ArrayRef IdxList, bool IsInBounds = false) const override { - return SimplifyGEPInst(Ty, Ptr, IdxList, IsInBounds, SQ); + return simplifyGEPInst(Ty, Ptr, IdxList, IsInBounds, SQ); } Value *FoldSelect(Value *C, Value *True, Value *False) const override { - return SimplifySelectInst(C, True, False, SQ); + return simplifySelectInst(C, True, False, SQ); } - //===--------------------------------------------------------------------===// - // Binary Operators - //===--------------------------------------------------------------------===// + Value *FoldExtractValue(Value *Agg, + ArrayRef IdxList) const override { + return simplifyExtractValueInst(Agg, IdxList, SQ); + }; - Value *CreateFAdd(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateFAdd(LHS, RHS); - } - Value *CreateSub(Constant *LHS, Constant *RHS, bool HasNUW = false, - bool HasNSW = false) const override { - return ConstFolder.CreateSub(LHS, RHS, HasNUW, HasNSW); - } - Value *CreateFSub(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateFSub(LHS, RHS); - } - Value *CreateMul(Constant *LHS, Constant *RHS, bool HasNUW = false, - bool HasNSW = false) const override { - return ConstFolder.CreateMul(LHS, RHS, HasNUW, HasNSW); - } - Value *CreateFMul(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateFMul(LHS, RHS); - } - Value *CreateUDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstFolder.CreateUDiv(LHS, RHS, isExact); - } - Value *CreateSDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstFolder.CreateSDiv(LHS, RHS, isExact); - } - Value *CreateFDiv(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateFDiv(LHS, RHS); - } - Value *CreateURem(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateURem(LHS, RHS); - } - Value *CreateSRem(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateSRem(LHS, RHS); - } - Value *CreateFRem(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateFRem(LHS, RHS); - } - Value *CreateShl(Constant *LHS, Constant *RHS, bool HasNUW = false, - bool HasNSW = false) const override { - return ConstFolder.CreateShl(LHS, RHS, HasNUW, HasNSW); - } - Value 
*CreateLShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstFolder.CreateLShr(LHS, RHS, isExact); + Value *FoldInsertValue(Value *Agg, Value *Val, + ArrayRef IdxList) const override { + return simplifyInsertValueInst(Agg, Val, IdxList, SQ); } - Value *CreateAShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstFolder.CreateAShr(LHS, RHS, isExact); + + Value *FoldExtractElement(Value *Vec, Value *Idx) const override { + return simplifyExtractElementInst(Vec, Idx, SQ); } - Value *CreateXor(Constant *LHS, Constant *RHS) const override { - return ConstFolder.CreateXor(LHS, RHS); + + Value *FoldInsertElement(Value *Vec, Value *NewElt, + Value *Idx) const override { + return simplifyInsertElementInst(Vec, NewElt, Idx, SQ); } - Value *CreateBinOp(Instruction::BinaryOps Opc, Constant *LHS, - Constant *RHS) const override { - return ConstFolder.CreateBinOp(Opc, LHS, RHS); + Value *FoldShuffleVector(Value *V1, Value *V2, + ArrayRef Mask) const override { + Type *RetTy = VectorType::get( + cast(V1->getType())->getElementType(), Mask.size(), + isa(V1->getType())); + return simplifyShuffleVectorInst(V1, V2, Mask, RetTy, SQ); } //===--------------------------------------------------------------------===// // Unary Operators //===--------------------------------------------------------------------===// - Value *CreateNeg(Constant *C, bool HasNUW = false, - bool HasNSW = false) const override { - return ConstFolder.CreateNeg(C, HasNUW, HasNSW); - } Value *CreateFNeg(Constant *C) const override { return ConstFolder.CreateFNeg(C); } - Value *CreateNot(Constant *C) const override { - return ConstFolder.CreateNot(C); - } Value *CreateUnOp(Instruction::UnaryOps Opc, Constant *C) const override { return ConstFolder.CreateUnOp(Opc, C); @@ -220,34 +185,6 @@ public: Constant *RHS) const override { return ConstFolder.CreateFCmp(P, LHS, RHS); } - - //===--------------------------------------------------------------------===// - // Other Instructions - //===--------------------------------------------------------------------===// - - Value *CreateExtractElement(Constant *Vec, Constant *Idx) const override { - return ConstFolder.CreateExtractElement(Vec, Idx); - } - - Value *CreateInsertElement(Constant *Vec, Constant *NewElt, - Constant *Idx) const override { - return ConstFolder.CreateInsertElement(Vec, NewElt, Idx); - } - - Value *CreateShuffleVector(Constant *V1, Constant *V2, - ArrayRef Mask) const override { - return ConstFolder.CreateShuffleVector(V1, V2, Mask); - } - - Value *CreateExtractValue(Constant *Agg, - ArrayRef IdxList) const override { - return ConstFolder.CreateExtractValue(Agg, IdxList); - } - - Value *CreateInsertValue(Constant *Agg, Constant *Val, - ArrayRef IdxList) const override { - return ConstFolder.CreateInsertValue(Agg, Val, IdxList); - } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index 8b49c115f101..52d43bf5c2a6 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -35,8 +35,6 @@ #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H #define LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" namespace llvm { @@ -49,6 +47,7 @@ class CallBase; class DataLayout; class DominatorTree; class Function; +class Instruction; struct LoopStandardAnalysisResults; class MDNode; class 
OptimizationRemarkEmitter; @@ -145,176 +144,185 @@ struct SimplifyQuery { // Please use the SimplifyQuery versions in new code. /// Given operand for an FNeg, fold the result or return null. -Value *SimplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q); +Value *simplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for an Add, fold the result or return null. -Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, +Value *simplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for a Sub, fold the result or return null. -Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, +Value *simplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for an FAdd, fold the result or return null. Value * -SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, +simplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for an FSub, fold the result or return null. Value * -SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, +simplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for an FMul, fold the result or return null. Value * -SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, +simplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for the multiplication of a FMA, fold the result or return -/// null. In contrast to SimplifyFMulInst, this function will not perform +/// null. In contrast to simplifyFMulInst, this function will not perform /// simplifications whose unrounded results differ when rounded to the argument /// type. -Value *SimplifyFMAFMul(Value *LHS, Value *RHS, FastMathFlags FMF, +Value *simplifyFMAFMul(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for a Mul, fold the result or return null. -Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an SDiv, fold the result or return null. -Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for a UDiv, fold the result or return null. -Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FDiv, fold the result or return null. Value * -SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, +simplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for an SRem, fold the result or return null. 
-Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for a URem, fold the result or return null. -Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FRem, fold the result or return null. Value * -SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, +simplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven); /// Given operands for a Shl, fold the result or return null. -Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, +Value *simplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for a LShr, fold the result or return null. -Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, +Value *simplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q); /// Given operands for an AShr, fold the result or return null. -Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, +Value *simplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q); /// Given operands for an And, fold the result or return null. -Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an Or, fold the result or return null. -Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an Xor, fold the result or return null. -Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +Value *simplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an ICmpInst, fold the result or return null. -Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FCmpInst, fold the result or return null. -Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a SelectInst, fold the result or return null. -Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, +Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q); /// Given operands for a GetElementPtrInst, fold the result or return null. -Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef<Value *> Indices, +Value *simplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef<Value *> Indices, bool InBounds, const SimplifyQuery &Q); /// Given operands for an InsertValueInst, fold the result or return null. -Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, +Value *simplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const SimplifyQuery &Q); /// Given operands for an InsertElement, fold the result or return null. -Value *SimplifyInsertElementInst(Value *Vec, Value *Elt, Value *Idx, +Value *simplifyInsertElementInst(Value *Vec, Value *Elt, Value *Idx, const SimplifyQuery &Q); /// Given operands for an ExtractValueInst, fold the result or return null.
-Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, +Value *simplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, const SimplifyQuery &Q); /// Given operands for an ExtractElementInst, fold the result or return null. -Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, +Value *simplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q); /// Given operands for a CastInst, fold the result or return null. -Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, +Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q); /// Given operands for a ShuffleVectorInst, fold the result or return null. /// See class ShuffleVectorInst for a description of the mask representation. -Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef<int> Mask, +Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef<int> Mask, Type *RetTy, const SimplifyQuery &Q); //=== Helper functions for higher up the class hierarchy. /// Given operands for a CmpInst, fold the result or return null. -Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operand for a UnaryOperator, fold the result or return null. -Value *SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q); +Value *simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q); /// Given operand for a UnaryOperator, fold the result or return null. /// Try to use FastMathFlags when folding the result. -Value *SimplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, +Value *simplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a BinaryOperator, fold the result or return null. -Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for a BinaryOperator, fold the result or return null. /// Try to use FastMathFlags when folding the result. -Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, +Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given a callsite, fold the result or return null. -Value *SimplifyCall(CallBase *Call, const SimplifyQuery &Q); +Value *simplifyCall(CallBase *Call, const SimplifyQuery &Q); + +/// Given a constrained FP intrinsic call, tries to compute its simplified +/// version. Returns a simplified result or null. +/// +/// This function provides an additional contract: it guarantees that if +/// simplification succeeds, the intrinsic is side-effect free. As a result, +/// successful simplification can be used to delete the intrinsic, not just +/// replace its result. +Value *simplifyConstrainedFPCall(CallBase *Call, const SimplifyQuery &Q); /// Given an operand for a Freeze, see if we can fold the result. /// If not, this returns null. -Value *SimplifyFreezeInst(Value *Op, const SimplifyQuery &Q); +Value *simplifyFreezeInst(Value *Op, const SimplifyQuery &Q); /// See if we can compute a simplified version of this instruction. If not, /// return null.
-Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, +Value *simplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); -/// Like \p SimplifyInstruction but the operands of \p I are replaced with +/// Like \p simplifyInstruction but the operands of \p I are replaced with /// \p NewOps. Returns a simplified value, or null if none was found. Value * -SimplifyInstructionWithOperands(Instruction *I, ArrayRef<Value *> NewOps, +simplifyInstructionWithOperands(Instruction *I, ArrayRef<Value *> NewOps, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); diff --git a/llvm/include/llvm/Analysis/IntervalIterator.h b/llvm/include/llvm/Analysis/IntervalIterator.h index 8e2273618a66..cbb7cac1c508 100644 --- a/llvm/include/llvm/Analysis/IntervalIterator.h +++ b/llvm/include/llvm/Analysis/IntervalIterator.h @@ -36,8 +36,6 @@ #include "llvm/Analysis/Interval.h" #include "llvm/Analysis/IntervalPartition.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/ErrorHandling.h" #include #include #include @@ -48,6 +46,7 @@ namespace llvm { class BasicBlock; +class Function; // getNodeHeader - Given a source graph node and the source graph, return the // BasicBlock that is the header node. This is the opposite of diff --git a/llvm/include/llvm/Analysis/LazyCallGraph.h b/llvm/include/llvm/Analysis/LazyCallGraph.h index c0404d37d04d..4cacf8951d6a 100644 --- a/llvm/include/llvm/Analysis/LazyCallGraph.h +++ b/llvm/include/llvm/Analysis/LazyCallGraph.h @@ -38,20 +38,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -60,8 +54,11 @@ namespace llvm { +class Constant; +class Function; template <class GraphType> struct GraphTraits; class Module; +class TargetLibraryInfo; class Value; /// A lazily constructed view of the call graph of a module. @@ -331,7 +328,7 @@ public: bool operator!=(const Node &N) const { return !operator==(N); } /// Tests whether the node has been populated with edges. - bool isPopulated() const { return Edges.hasValue(); } + bool isPopulated() const { return Edges.has_value(); } /// Tests whether this is actually a dead node and no longer valid. /// diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index 754391e10630..24c2bfcc74b9 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -114,6 +114,9 @@ public: /// Inform the analysis cache that we have erased a block. void eraseBlock(BasicBlock *BB); + /// Completely flush all previously computed values. + void clear(const Module *M); + /// Print the \p LazyValueInfo Analysis. /// We pass in the DTree that is required for identifying which basic blocks /// we can solve/print for, in the LVIPrinter.
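Since the bulk of the InstructionSimplify.h churn above is the Simplify*-to-simplify* rename, here is a minimal client-side sketch of the renamed entry points (illustrative only, not part of the imported sources; assumes a SimplifyQuery built from the usual analyses, and elides error handling):

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Fold away trivially simplifiable instructions in F. simplifyInstruction
// (lowercase after this rename) returns an existing Value that can stand in
// for I, or null if no simplification was found.
static void foldTrivialInstructions(Function &F, const SimplifyQuery &SQ) {
  for (BasicBlock &BB : F)
    for (Instruction &I : make_early_inc_range(BB))
      if (Value *V = simplifyInstruction(&I, SQ.getWithInstruction(&I))) {
        I.replaceAllUsesWith(V);
        I.eraseFromParent();
      }
}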
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h index 09bf98d324ed..29e3efb38e19 100644 --- a/llvm/include/llvm/Analysis/Loads.h +++ b/llvm/include/llvm/Analysis/Loads.h @@ -75,9 +75,9 @@ bool isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, /// within the specified loop) would access only dereferenceable memory, and /// be properly aligned on every iteration of the specified loop regardless of /// its placement within the loop. (i.e. does not require predication beyond -/// that required by the the header itself and could be hoisted into the header +/// that required by the header itself and could be hoisted into the header /// if desired.) This is more powerful than the variants above when the -/// address loaded from is analyzeable by SCEV. +/// address loaded from is analyzable by SCEV. bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT); diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index c83a04991b04..8f71ce9e96c0 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -244,6 +244,15 @@ public: SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr, bool isWrite) const; + /// Return the program order indices for the access location (Ptr, IsWrite). + /// Returns an empty ArrayRef if there are no accesses for the location. + ArrayRef<unsigned> getOrderForAccess(Value *Ptr, bool IsWrite) const { + auto I = Accesses.find({Ptr, IsWrite}); + if (I != Accesses.end()) + return I->second; + return {}; + } + private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and /// applies dynamic knowledge to simplify SCEV expressions and convert them @@ -327,12 +336,6 @@ struct RuntimeCheckingPtrGroup { /// pointer, with index \p Index in RtCheck. RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck); - RuntimeCheckingPtrGroup(unsigned Index, const SCEV *Start, const SCEV *End, - unsigned AS) - : High(End), Low(Start), AddressSpace(AS) { - Members.push_back(Index); - } - /// Tries to add the pointer recorded in RtCheck at index /// \p Index to this pointer checking group. We can only add a pointer /// to a checking group if we will still be able to get /// of success, false otherwise. bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck); bool addPointer(unsigned Index, const SCEV *Start, const SCEV *End, - unsigned AS, ScalarEvolution &SE); + unsigned AS, bool NeedsFreeze, ScalarEvolution &SE); /// The SCEV expression which represents the upper bound of all the /// pointers in this group. SmallVector<unsigned, 2> Members; /// Address space of the involved pointers. unsigned AddressSpace; + /// Whether the pointer needs to be frozen after expansion, e.g. because it + /// may be poison outside the loop. + bool NeedsFreeze = false; }; /// A memcheck which is made up of a pair of grouped pointers.
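The NeedsFreeze/PointerDiffInfo machinery introduced here ultimately lowers to a very small runtime test in the loop preheader. As a rough illustration of its shape (this is not LLVM API, just the conceptual check; LowA/HighA and LowB/HighB stand for the expanded Low/High SCEV bounds of two checking groups):

// Conceptual form of the range-overlap test a RuntimePointerCheck pair
// expands to; the vectorizer branches to the scalar loop when it is true.
static bool groupsMayConflict(const char *LowA, const char *HighA,
                              const char *LowB, const char *HighB) {
  // The two groups are disjoint iff one range ends at or before the other
  // begins; any other arrangement means a possible overlap.
  return !(HighA <= LowB || HighB <= LowA);
}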
@@ -359,6 +365,18 @@ typedef std::pair<const RuntimeCheckingPtrGroup *, const RuntimeCheckingPtrGroup *> RuntimePointerCheck; +struct PointerDiffInfo { + const SCEV *SrcStart; + const SCEV *SinkStart; + unsigned AccessSize; + bool NeedsFreeze; + + PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart, + unsigned AccessSize, bool NeedsFreeze) + : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize), + NeedsFreeze(NeedsFreeze) {} +}; + /// Holds information about the memory runtime legality checks to verify /// that a group of pointers do not overlap. class RuntimePointerChecking { @@ -383,16 +401,19 @@ public: unsigned AliasSetId; /// SCEV for the access. const SCEV *Expr; + /// True if the pointer expression needs to be frozen after expansion. + bool NeedsFreeze; PointerInfo(Value *PointerValue, const SCEV *Start, const SCEV *End, bool IsWritePtr, unsigned DependencySetId, unsigned AliasSetId, - const SCEV *Expr) + const SCEV *Expr, bool NeedsFreeze) : PointerValue(PointerValue), Start(Start), End(End), IsWritePtr(IsWritePtr), DependencySetId(DependencySetId), - AliasSetId(AliasSetId), Expr(Expr) {} + AliasSetId(AliasSetId), Expr(Expr), NeedsFreeze(NeedsFreeze) {} }; - RuntimePointerChecking(ScalarEvolution *SE) : SE(SE) {} + RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE) + : DC(DC), SE(SE) {} /// Reset the state of the pointer runtime information. void reset() { @@ -406,9 +427,9 @@ public: /// according to the assumptions that we've made during the analysis. /// The method might also version the pointer stride according to \p Strides, /// and add new predicates to \p PSE. - void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - unsigned ASId, const ValueToValueMap &Strides, - PredicatedScalarEvolution &PSE); + void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, + bool WritePtr, unsigned DepSetId, unsigned ASId, + PredicatedScalarEvolution &PSE, bool NeedsFreeze); /// No run-time memory checking is necessary. bool empty() const { return Pointers.empty(); } @@ -418,11 +439,23 @@ public: void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies); - /// Returns the checks that generateChecks created. + /// Returns the checks that generateChecks created. They can be used to ensure + /// no read/write accesses overlap across all loop iterations. const SmallVectorImpl<RuntimePointerCheck> &getChecks() const { return Checks; } + // Returns an optional list of (pointer-difference expressions, access size) + // pairs that can be used to prove that there are no vectorization-preventing + // dependencies at runtime. There is a vectorization-preventing dependency + // if any pointer-difference is smaller than the access size. + Optional<ArrayRef<PointerDiffInfo>> getDiffChecks() const { + if (!CanUseDiffCheck) + return None; + return {DiffChecks}; + } + /// Decide if we need to add a check between two groups of pointers, /// according to needsChecking. bool needsChecking(const RuntimeCheckingPtrGroup &M, @@ -477,7 +510,15 @@ private: bool UseDependencies); /// Generate the checks and return them. - SmallVector generateChecks() const; + SmallVector generateChecks(); + + /// Try to add a new (pointer-difference, access size) pair to + /// DiffCheck for checking groups \p CGI and \p CGJ. If pointer-difference + /// checks cannot be used for the groups, set CanUseDiffCheck to false. + void tryToCreateDiffCheck(const RuntimeCheckingPtrGroup &CGI, + const RuntimeCheckingPtrGroup &CGJ); + + MemoryDepChecker &DC; /// Holds a pointer to the ScalarEvolution analysis.
ScalarEvolution *SE; @@ -485,6 +526,13 @@ private: /// Set of run-time checks required to establish independence of /// otherwise may-aliasing pointers in the loop. SmallVector Checks; + + /// Flag indicating if pointer-difference checks can be used. + bool CanUseDiffCheck = true; + + /// A list of (pointer-difference, access size) pairs that can be used to + /// prove that there are no vectorization-preventing dependencies. + SmallVector<PointerDiffInfo> DiffChecks; }; /// Drive the analysis of memory accesses in the loop @@ -575,6 +623,11 @@ public: return HasDependenceInvolvingLoopInvariantAddress; } + /// Return the list of stores to invariant addresses. + const ArrayRef<StoreInst *> getStoresToInvariantAddresses() const { + return StoresToInvariantAddresses; + } + /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts /// them to a more usable form. All SCEV expressions during the analysis /// should be re-written (and therefore simplified) according to PSE. @@ -605,6 +658,11 @@ private: /// invariant. void collectStridedAccess(Value *LoadOrStoreInst); + // Emits the first unsafe memory dependence in a loop. + // Emits nothing if there are no unsafe dependences + // or if the dependences were not recorded. + void emitUnsafeDependenceRemark(); + std::unique_ptr<PredicatedScalarEvolution> PSE; /// We need to check that all of the pointers in this list are disjoint @@ -629,6 +687,9 @@ private: /// Indicator that there are non-vectorizable stores to a uniform address. bool HasDependenceInvolvingLoopInvariantAddress = false; + /// List of stores to invariant addresses. + SmallVector<StoreInst *> StoresToInvariantAddresses; + /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. std::unique_ptr<OptimizationRemarkAnalysis> Report; diff --git a/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/llvm/include/llvm/Analysis/LoopAnalysisManager.h index d07e6977fed1..d22675a308aa 100644 --- a/llvm/include/llvm/Analysis/LoopAnalysisManager.h +++ b/llvm/include/llvm/Analysis/LoopAnalysisManager.h @@ -29,7 +29,6 @@ #ifndef LLVM_ANALYSIS_LOOPANALYSISMANAGER_H #define LLVM_ANALYSIS_LOOPANALYSISMANAGER_H -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h index 21882ebd0087..4c5083f3c980 100644 --- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -15,15 +15,17 @@ #define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { class AAResults; class DependenceInfo; +class Instruction; class LPMUpdater; +class raw_ostream; +class LoopInfo; +class Loop; class ScalarEvolution; class SCEV; class TargetTransformInfo; @@ -96,6 +98,10 @@ private: /// Attempt to delinearize the indexed reference. bool delinearize(const LoopInfo &LI); + /// Attempt to delinearize \p AccessFn for fixed-size arrays. + bool tryDelinearizeFixedSize(const SCEV *AccessFn, + SmallVectorImpl<const SCEV *> &Subscripts); + /// Return true if the index reference is invariant with respect to loop \p L. bool isLoopInvariant(const Loop &L) const; /// smaller than the cache line size \p CLS. bool isConsecutive(const Loop &L, unsigned CLS) const; + /// Retrieve the index of the subscript corresponding to the given loop \p + /// L.
Return a zero-based positive index if the subscript index is + /// successfully located and a negative value otherwise. For example, given the + /// indexed reference 'A[i][2j+1][3k+2]', the call + /// 'getSubscriptIndex(loop-k)' would return value 2. + int getSubscriptIndex(const Loop &L) const; + /// Return the coefficient used in the rightmost dimension. const SCEV *getLastCoefficient() const; @@ -237,9 +250,10 @@ private: /// Sort the LoopCosts vector by decreasing cache cost. void sortLoopCosts() { - sort(LoopCosts, [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) { - return A.second > B.second; - }); + stable_sort(LoopCosts, + [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) { + return A.second > B.second; + }); } private: diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index a0ffdb07a7ec..9351b83ad747 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -44,7 +44,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -55,9 +54,10 @@ namespace llvm { class DominatorTree; +class InductionDescriptor; +class Instruction; class LoopInfo; class Loop; -class InductionDescriptor; class MDNode; class MemorySSAUpdater; class ScalarEvolution; @@ -112,6 +112,22 @@ public: /// parent is the innermost loop in which it is enclosed. LoopT *getParentLoop() const { return ParentLoop; } + /// Get the outermost loop in which this loop is contained. + /// This may be the loop itself, if it already is the outermost loop. + const LoopT *getOutermostLoop() const { + const LoopT *L = static_cast<const LoopT *>(this); + while (L->ParentLoop) + L = L->ParentLoop; + return L; + } + + LoopT *getOutermostLoop() { + LoopT *L = static_cast<LoopT *>(this); + while (L->ParentLoop) + L = L->ParentLoop; + return L; + } + /// This is a raw interface for bypassing addChildLoop. void setParentLoop(LoopT *L) { assert(!isInvalid() && "Loop not in a valid state!"); diff --git a/llvm/include/llvm/Analysis/LoopInfoImpl.h b/llvm/include/llvm/Analysis/LoopInfoImpl.h index b8b8330d0fe1..a96a698f3afb 100644 --- a/llvm/include/llvm/Analysis/LoopInfoImpl.h +++ b/llvm/include/llvm/Analysis/LoopInfoImpl.h @@ -14,7 +14,6 @@ #ifndef LLVM_ANALYSIS_LOOPINFOIMPL_H #define LLVM_ANALYSIS_LOOPINFOIMPL_H -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -315,12 +314,11 @@ void LoopBase::verifyLoop() const { "Loop block has no in-loop predecessors!"); SmallVector OutsideLoopPreds; - std::for_each(GraphTraits<Inverse<BlockT *>>::child_begin(BB), - GraphTraits<Inverse<BlockT *>>::child_end(BB), - [&](BlockT *B) { - if (!contains(B)) - OutsideLoopPreds.push_back(B); - }); + for (BlockT *B : + llvm::make_range(GraphTraits<Inverse<BlockT *>>::child_begin(BB), + GraphTraits<Inverse<BlockT *>>::child_end(BB))) + if (!contains(B)) + OutsideLoopPreds.push_back(B); if (BB == getHeader()) { assert(!OutsideLoopPreds.empty() && "Loop is unreachable!"); @@ -455,8 +453,7 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, InvBlockTraits::child_end(PredBB)); } else { // This is a discovered block. Find its outermost discovered loop. - while (LoopT *Parent = Subloop->getParentLoop()) - Subloop = Parent; + Subloop = Subloop->getOutermostLoop(); // If it is already discovered to be a subloop of this loop, continue.
if (Subloop == L) diff --git a/llvm/include/llvm/Analysis/LoopPass.h b/llvm/include/llvm/Analysis/LoopPass.h index 0fd2a39eefc0..c5f08d0ae8af 100644 --- a/llvm/include/llvm/Analysis/LoopPass.h +++ b/llvm/include/llvm/Analysis/LoopPass.h @@ -14,13 +14,14 @@ #ifndef LLVM_ANALYSIS_LOOPPASS_H #define LLVM_ANALYSIS_LOOPPASS_H -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/LegacyPassManagers.h" #include "llvm/Pass.h" #include namespace llvm { +class Loop; +class LoopInfo; class LPPassManager; class Function; diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h index 7cf8a081f9a2..eada6a647763 100644 --- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -15,8 +15,9 @@ #ifndef LLVM_ANALYSIS_LOOPUNROLLANALYZER_H #define LLVM_ANALYSIS_LOOPUNROLLANALYZER_H -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/InstVisitor.h" // This class is used to get an estimate of the optimization effects that we @@ -36,6 +37,8 @@ // And finally: // v = b[1] namespace llvm { +class Instruction; + class UnrolledInstAnalyzer : private InstVisitor { typedef InstVisitor Base; friend class InstVisitor; diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/llvm/include/llvm/Analysis/MLInlineAdvisor.h index b1a81d5e7030..00e8d7d7dd4d 100644 --- a/llvm/include/llvm/Analysis/MLInlineAdvisor.h +++ b/llvm/include/llvm/Analysis/MLInlineAdvisor.h @@ -9,6 +9,7 @@ #ifndef LLVM_ANALYSIS_MLINLINEADVISOR_H #define LLVM_ANALYSIS_MLINLINEADVISOR_H +#include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/MLModelRunner.h" @@ -19,6 +20,7 @@ #include namespace llvm { +class DiagnosticInfoOptimizationBase; class Module; class MLInlineAdvice; @@ -29,16 +31,19 @@ public: virtual ~MLInlineAdvisor() = default; - void onPassEntry() override; + void onPassEntry(LazyCallGraph::SCC *SCC) override; void onPassExit(LazyCallGraph::SCC *SCC) override; - int64_t getIRSize(const Function &F) const { return F.getInstructionCount(); } + int64_t getIRSize(Function &F) const { + return getCachedFPI(F).TotalInstructionCount; + } void onSuccessfulInlining(const MLInlineAdvice &Advice, bool CalleeWasDeleted); bool isForcedToStop() const { return ForceStop; } int64_t getLocalCalls(Function &F); const MLModelRunner &getModelRunner() const { return *ModelRunner.get(); } + FunctionPropertiesInfo &getCachedFPI(Function &) const; protected: std::unique_ptr getAdviceImpl(CallBase &CB) override; @@ -60,11 +65,11 @@ protected: private: int64_t getModuleIRSize() const; + std::unique_ptr + getSkipAdviceIfUnreachableCallsite(CallBase &CB); + void print(raw_ostream &OS) const override; - void print(raw_ostream &OS) const override { - OS << "[MLInlineAdvisor] Nodes: " << NodeCount << " Edges: " << EdgeCount - << "\n"; - } + mutable DenseMap FPICache; LazyCallGraph &CG; @@ -75,7 +80,7 @@ private: std::map FunctionLevels; const int32_t InitialIRSize = 0; int32_t CurrentIRSize = 0; - std::deque NodesInLastSCC; + llvm::SmallPtrSet NodesInLastSCC; DenseSet AllNodes; bool ForceStop = false; }; @@ -85,16 +90,7 @@ private: class MLInlineAdvice : public InlineAdvice { public: MLInlineAdvice(MLInlineAdvisor *Advisor, CallBase &CB, - OptimizationRemarkEmitter &ORE, bool Recommendation) 
- : InlineAdvice(Advisor, CB, ORE, Recommendation), - CallerIRSize(Advisor->isForcedToStop() ? 0 - : Advisor->getIRSize(*Caller)), - CalleeIRSize(Advisor->isForcedToStop() ? 0 - : Advisor->getIRSize(*Callee)), - CallerAndCalleeEdges(Advisor->isForcedToStop() - ? 0 - : (Advisor->getLocalCalls(*Caller) + - Advisor->getLocalCalls(*Callee))) {} + OptimizationRemarkEmitter &ORE, bool Recommendation); virtual ~MLInlineAdvice() = default; void recordInliningImpl() override; @@ -108,13 +104,17 @@ public: const int64_t CallerIRSize; const int64_t CalleeIRSize; const int64_t CallerAndCalleeEdges; + void updateCachedCallerFPI(FunctionAnalysisManager &FAM) const; private: void reportContextForRemark(DiagnosticInfoOptimizationBase &OR); - MLInlineAdvisor *getAdvisor() const { return static_cast<MLInlineAdvisor *>(Advisor); }; + // Make a copy of the FPI of the caller right before inlining. If inlining + // fails, we can just update the cache with that value. + const FunctionPropertiesInfo PreInlineCallerFPI; + Optional<FunctionPropertiesUpdater> FPU; }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/MLModelRunner.h b/llvm/include/llvm/Analysis/MLModelRunner.h index 669c02af0b3b..872c0e37f00e 100644 --- a/llvm/include/llvm/Analysis/MLModelRunner.h +++ b/llvm/include/llvm/Analysis/MLModelRunner.h @@ -10,10 +10,11 @@ #ifndef LLVM_ANALYSIS_MLMODELRUNNER_H #define LLVM_ANALYSIS_MLMODELRUNNER_H -#include "llvm/IR/LLVMContext.h" +#include "llvm/Analysis/TensorSpec.h" #include "llvm/IR/PassManager.h" namespace llvm { +class LLVMContext; /// MLModelRunner interface: abstraction of a mechanism for evaluating a /// tensorflow "saved model". @@ -41,7 +42,7 @@ public: getTensorUntyped(static_cast<size_t>(FeatureID))); } - virtual void *getTensorUntyped(size_t Index) = 0; + void *getTensorUntyped(size_t Index) { return InputBuffers[Index]; } const void *getTensorUntyped(size_t Index) const { return (const_cast<MLModelRunner *>(this))->getTensorUntyped(Index); } @@ -50,13 +51,27 @@ public: Kind getKind() const { return Type; } protected: - MLModelRunner(LLVMContext &Ctx, Kind Type) : Ctx(Ctx), Type(Type) { + MLModelRunner(LLVMContext &Ctx, Kind Type, size_t NrInputs) + : Ctx(Ctx), Type(Type), InputBuffers(NrInputs) { assert(Type != Kind::Unknown); } virtual void *evaluateUntyped() = 0; + void setUpBufferForTensor(size_t Index, const TensorSpec &Spec, + void *Buffer) { + if (!Buffer) { + OwnedBuffers.emplace_back(Spec.getTotalTensorBufferSize()); + Buffer = OwnedBuffers.back().data(); + } + InputBuffers[Index] = Buffer; + } + LLVMContext &Ctx; const Kind Type; + +private: + std::vector<void *> InputBuffers; + std::vector<std::vector<char>> OwnedBuffers; }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h index d5b60ee540e0..7ad83612880f 100644 --- a/llvm/include/llvm/Analysis/MemoryBuiltins.h +++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h @@ -28,6 +28,7 @@ namespace llvm { class AllocaInst; +class AAResults; class Argument; class CallInst; class ConstantPointerNull; @@ -100,7 +101,10 @@ inline CallInst *isFreeCall(Value *I, const TargetLibraryInfo *TLI) { /// insertion or speculative execution of allocation routines. bool isAllocRemovable(const CallBase *V, const TargetLibraryInfo *TLI); -/// Gets the alignment argument for an aligned_alloc-like function +/// Gets the alignment argument for an aligned_alloc-like function, using either +/// built-in knowledge based on function names/signatures or allocalign +/// attributes.
Note: the Value returned may not indicate a valid alignment, per +/// the definition of the allocalign attribute. Value *getAllocAlignment(const CallBase *V, const TargetLibraryInfo *TLI); /// Return the size of the requested allocation. With a trivial mapper, this is @@ -111,12 +115,19 @@ Optional getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI, std::function Mapper); -/// If this allocation function initializes memory to a fixed value, return -/// said value in the requested type. Otherwise, return nullptr. -Constant *getInitialValueOfAllocation(const CallBase *Alloc, +/// If this is a call to an allocation function that initializes memory to a +/// fixed value, return said value in the requested type. Otherwise, return +/// nullptr. +Constant *getInitialValueOfAllocation(const Value *V, const TargetLibraryInfo *TLI, Type *Ty); +/// If a function is part of an allocation family (e.g. +/// malloc/realloc/calloc/free), return the identifier for its family +/// of functions. +Optional getAllocationFamily(const Value *I, + const TargetLibraryInfo *TLI); + //===----------------------------------------------------------------------===// // Utility functions to compute size of objects. // @@ -143,6 +154,8 @@ struct ObjectSizeOpts { /// though they can't be evaluated. Otherwise, null is always considered to /// point to a 0 byte region of memory. bool NullIsUnknownSize = false; + /// If set, used for more accurate evaluation + AAResults *AA = nullptr; }; /// Compute the size of the object pointed by Ptr. Returns true and the @@ -162,8 +175,9 @@ bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL, /// argument of the call to objectsize. Value *lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, const TargetLibraryInfo *TLI, bool MustSucceed); - - +Value *lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, + const TargetLibraryInfo *TLI, AAResults *AA, + bool MustSucceed); using SizeOffsetType = std::pair; @@ -210,7 +224,6 @@ public: SizeOffsetType visitConstantPointerNull(ConstantPointerNull&); SizeOffsetType visitExtractElementInst(ExtractElementInst &I); SizeOffsetType visitExtractValueInst(ExtractValueInst &I); - SizeOffsetType visitGEPOperator(GEPOperator &GEP); SizeOffsetType visitGlobalAlias(GlobalAlias &GA); SizeOffsetType visitGlobalVariable(GlobalVariable &GV); SizeOffsetType visitIntToPtrInst(IntToPtrInst&); @@ -221,6 +234,12 @@ public: SizeOffsetType visitInstruction(Instruction &I); private: + SizeOffsetType findLoadSizeOffset( + LoadInst &LoadFrom, BasicBlock &BB, BasicBlock::iterator From, + SmallDenseMap &VisitedBlocks, + unsigned &ScannedInstCount); + SizeOffsetType combineSizeOffset(SizeOffsetType LHS, SizeOffsetType RHS); + SizeOffsetType computeImpl(Value *V); bool CheckedZextOrTrunc(APInt &I); }; diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h index 23e50f601e04..dfac49445d75 100644 --- a/llvm/include/llvm/Analysis/MemoryLocation.h +++ b/llvm/include/llvm/Analysis/MemoryLocation.h @@ -36,6 +36,7 @@ class AnyMemTransferInst; class AnyMemIntrinsic; class TargetLibraryInfo; class VAArgInst; +class Value; // Represents the size of a MemoryLocation. 
Logically, it's an // Optional that also carries a bit to represent whether the integer diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index b41f5771bacd..8cadb6a4c912 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -66,6 +66,19 @@ /// MemoryDefs are not disambiguated because it would require multiple reaching /// definitions, which would require multiple phis, and multiple memoryaccesses /// per instruction. +/// +/// In addition to the def/use graph described above, MemoryDefs also contain +/// an "optimized" definition use. The "optimized" use points to some def +/// reachable through the memory def chain. The optimized def *may* (but is +/// not required to) alias the original MemoryDef, but no def *closer* to the +/// source def may alias it. As the name implies, the purpose of the optimized +/// use is to allow caching of clobber searches for memory defs. The optimized +/// def may be nullptr, in which case clients must walk the defining access +/// chain. +/// +/// When iterating the uses of a MemoryDef, both defining uses and optimized +/// uses will be encountered. If only one type is needed, the client must +/// filter the use walk. // //===----------------------------------------------------------------------===// @@ -73,30 +86,18 @@ #define LLVM_ANALYSIS_MEMORYSSA_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" -#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/ADT/simple_ilist.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/PHITransAddr.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/DerivedUser.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" #include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include #include #include @@ -106,11 +107,16 @@ namespace llvm { +template struct GraphTraits; +class BasicBlock; class Function; class Instruction; +class LLVMContext; class MemoryAccess; class MemorySSAWalker; -class LLVMContext; +class Module; +class Use; +class Value; class raw_ostream; namespace MSSAHelpers { @@ -259,10 +265,11 @@ public: return MA->getValueID() == MemoryUseVal || MA->getValueID() == MemoryDefVal; } - // Sadly, these have to be public because they are needed in some of the - // iterators. + /// Do we have an optimized use? inline bool isOptimized() const; + /// Return the MemoryAccess associated with the optimized use, or nullptr. inline MemoryAccess *getOptimized() const; + /// Sets the optimized use for a MemoryDef. inline void setOptimized(MemoryAccess *); // Retrieve AliasResult type of the optimized access. Ideally this would be @@ -339,6 +346,9 @@ public: setOperand(0, DMA); } + /// Whether the MemoryUse is optimized. If ensureOptimizedUses() was called, + /// uses will usually be optimized, but this is not guaranteed (e.g. due to + /// invalidation and optimization limits.) bool isOptimized() const { return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID(); } @@ -791,6 +801,13 @@ public: /// about the beginning or end of a block. 
enum InsertionPlace { Beginning, End, BeforeTerminator }; + /// By default, uses are *not* optimized during MemorySSA construction. + /// Calling this method will attempt to optimize all MemoryUses, if this has + /// not happened yet for this MemorySSA instance. This should be done if you + /// plan to query the clobbering access for most uses, or if you walk the + /// def-use chain of uses. + void ensureOptimizedUses(); + protected: // Used by Memory SSA dumpers and wrapper pass friend class MemorySSAPrinterLegacyPass; @@ -893,6 +910,7 @@ private: std::unique_ptr> Walker; std::unique_ptr> SkipWalker; unsigned NextID = 0; + bool IsOptimized = false; }; /// Enables verification of MemorySSA. diff --git a/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/llvm/include/llvm/Analysis/MemorySSAUpdater.h index 3e5ebe9cb427..2bcd1a462871 100644 --- a/llvm/include/llvm/Analysis/MemorySSAUpdater.h +++ b/llvm/include/llvm/Analysis/MemorySSAUpdater.h @@ -31,7 +31,6 @@ #ifndef LLVM_ANALYSIS_MEMORYSSAUPDATER_H #define LLVM_ANALYSIS_MEMORYSSAUPDATER_H -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -39,7 +38,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Support/CFGDiff.h" -#include namespace llvm { @@ -47,6 +45,7 @@ class BasicBlock; class DominatorTree; class Instruction; class LoopBlocksRPO; +template class SmallSetVector; using ValueToValueMapTy = ValueMap; using PhiToDefMap = SmallDenseMap; diff --git a/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h b/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h index 071ccf96fe5b..72bd185b6c32 100644 --- a/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h +++ b/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h @@ -10,6 +10,7 @@ #ifndef LLVM_ANALYSIS_MODELUNDERTRAININGRUNNER_H #define LLVM_ANALYSIS_MODELUNDERTRAININGRUNNER_H +#include "llvm/Analysis/TensorSpec.h" #include "llvm/Config/llvm-config.h" #ifdef LLVM_HAVE_TF_API @@ -48,6 +49,11 @@ public: StringRef DecisionName, const std::vector &InputSpecs, StringRef OutputSpecsPathOverride = ""); + static std::unique_ptr + createAndEnsureValid(LLVMContext &Ctx, const std::string &ModelPath, + StringRef DecisionName, + const std::vector &InputSpecs, + const std::vector &OutputSpecs); private: ModelUnderTrainingRunner(LLVMContext &Ctx, const std::string &ModelPath, @@ -58,7 +64,6 @@ private: const std::vector OutputSpecs; Optional LastEvaluationResult; void *evaluateUntyped() override; - void *getTensorUntyped(size_t Index) override; bool isValid() const { return !!Evaluator; } }; diff --git a/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h b/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h index 99aa315319b8..fa91e4f653d0 100644 --- a/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h +++ b/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h @@ -11,9 +11,9 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { +class raw_ostream; class ModuleDebugInfoPrinterPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Analysis/MustExecute.h b/llvm/include/llvm/Analysis/MustExecute.h index 18a0bfee5730..1e4994207555 100644 --- a/llvm/include/llvm/Analysis/MustExecute.h +++ b/llvm/include/llvm/Analysis/MustExecute.h @@ -28,7 +28,6 @@ #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" 
namespace llvm { @@ -42,6 +41,7 @@ class Instruction; class Loop; class LoopInfo; class PostDominatorTree; +class raw_ostream; /// Captures loop safety information. /// It keep information for loop blocks may throw exception or otherwise diff --git a/llvm/include/llvm/Analysis/NoInferenceModelRunner.h b/llvm/include/llvm/Analysis/NoInferenceModelRunner.h index 5bcedf98865c..980b40500d7c 100644 --- a/llvm/include/llvm/Analysis/NoInferenceModelRunner.h +++ b/llvm/include/llvm/Analysis/NoInferenceModelRunner.h @@ -10,13 +10,9 @@ #ifndef LLVM_ANALYSIS_NOINFERENCEMODELRUNNER_H #define LLVM_ANALYSIS_NOINFERENCEMODELRUNNER_H -#include "llvm/Config/llvm-config.h" - -/// While not strictly necessary to conditionally compile this, it really -/// has no usecase outside the 'development' mode. -#ifdef LLVM_HAVE_TF_API #include "llvm/Analysis/MLModelRunner.h" -#include "llvm/Analysis/Utils/TFUtils.h" +#include "llvm/Analysis/TensorSpec.h" +#include "llvm/Config/llvm-config.h" namespace llvm { /// A pseudo model runner. We use it to store feature values when collecting /// logs for the default policy, in 'development' mode, but never ask it to @@ -34,10 +30,6 @@ private: void *evaluateUntyped() override { llvm_unreachable("We shouldn't call run on this model runner."); } - void *getTensorUntyped(size_t Index) override; - - std::vector> ValuesBuffer; }; } // namespace llvm -#endif // defined(LLVM_HAVE_TF_API) #endif // LLVM_ANALYSIS_NOINFERENCEMODELRUNNER_H diff --git a/llvm/include/llvm/Analysis/ObjCARCUtil.h b/llvm/include/llvm/Analysis/ObjCARCUtil.h index 385fa5422926..56faa20c4c6e 100644 --- a/llvm/include/llvm/Analysis/ObjCARCUtil.h +++ b/llvm/include/llvm/Analysis/ObjCARCUtil.h @@ -35,7 +35,7 @@ inline bool hasAttachedCallOpBundle(const CallBase *CB) { // functions. return !CB->getFunctionType()->getReturnType()->isVoidTy() && CB->getOperandBundle(LLVMContext::OB_clang_arc_attachedcall) - .hasValue(); + .has_value(); } /// This function returns operand bundle clang_arc_attachedcall's argument, @@ -59,7 +59,7 @@ inline bool isRetainOrClaimRV(ARCInstKind Kind) { /// or UnsafeClaimRV. 
inline ARCInstKind getAttachedARCFunctionKind(const CallBase *CB) { Optional Fn = getAttachedARCFunction(CB); - if (!Fn.hasValue()) + if (!Fn) return ARCInstKind::None; auto FnClass = GetFunctionClass(*Fn); assert(isRetainOrClaimRV(FnClass) && "unexpected ARC runtime function"); diff --git a/llvm/include/llvm/Analysis/OverflowInstAnalysis.h b/llvm/include/llvm/Analysis/OverflowInstAnalysis.h index 7523fb9392cd..761d20f17a8b 100644 --- a/llvm/include/llvm/Analysis/OverflowInstAnalysis.h +++ b/llvm/include/llvm/Analysis/OverflowInstAnalysis.h @@ -14,11 +14,9 @@ #ifndef LLVM_ANALYSIS_OVERFLOWINSTANALYSIS_H #define LLVM_ANALYSIS_OVERFLOWINSTANALYSIS_H -#include "llvm/IR/InstrTypes.h" - namespace llvm { -class Value; class Use; +class Value; /// Match one of the patterns up to the select/logic op: /// %Op0 = icmp ne i4 %X, 0 diff --git a/llvm/include/llvm/Analysis/PhiValues.h b/llvm/include/llvm/Analysis/PhiValues.h index c0e91c8b0bdf..ecbb8874b378 100644 --- a/llvm/include/llvm/Analysis/PhiValues.h +++ b/llvm/include/llvm/Analysis/PhiValues.h @@ -22,7 +22,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" diff --git a/llvm/include/llvm/Analysis/PostDominators.h b/llvm/include/llvm/Analysis/PostDominators.h index 296110d8d03b..4383113c8db1 100644 --- a/llvm/include/llvm/Analysis/PostDominators.h +++ b/llvm/include/llvm/Analysis/PostDominators.h @@ -102,10 +102,7 @@ template <> struct GraphTraits } static nodes_iterator nodes_begin(PostDominatorTree *N) { - if (getEntryNode(N)) - return df_begin(getEntryNode(N)); - else - return df_end(getEntryNode(N)); + return df_begin(getEntryNode(N)); } static nodes_iterator nodes_end(PostDominatorTree *N) { diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h index 886800d8a0f5..773784ac418c 100644 --- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h @@ -170,11 +170,11 @@ public: uint64_t getOrCompColdCountThreshold() const; /// Returns HotCountThreshold if set. uint64_t getHotCountThreshold() const { - return HotCountThreshold.getValueOr(0); + return HotCountThreshold.value_or(0); } /// Returns ColdCountThreshold if set. 
uint64_t getColdCountThreshold() const { - return ColdCountThreshold.getValueOr(0); + return ColdCountThreshold.value_or(0); } private: diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index 78e9251da627..86206b2d5e9f 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -26,22 +26,15 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/User.h" -#include "llvm/Support/Casting.h" -#include #include #include namespace llvm { +class DataLayout; +class Use; namespace detail { diff --git a/llvm/include/llvm/Analysis/RegionInfo.h b/llvm/include/llvm/Analysis/RegionInfo.h index f93081d6f51d..612b977f1ffa 100644 --- a/llvm/include/llvm/Analysis/RegionInfo.h +++ b/llvm/include/llvm/Analysis/RegionInfo.h @@ -42,11 +42,9 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -58,6 +56,7 @@ namespace llvm { +class BasicBlock; class DominanceFrontier; class Loop; class LoopInfo; @@ -67,6 +66,7 @@ template class RegionBase; class RegionInfo; template class RegionInfoBase; class RegionNode; +class raw_ostream; // Class to be specialized for different users of RegionInfo // (i.e. BasicBlocks or MachineBasicBlocks). 
This is only to avoid needing to @@ -242,7 +242,7 @@ public: /// /// You can obtain more examples by either calling /// -/// "opt -regions -analyze anyprogram.ll" +/// "opt -passes='print<regions>' anyprogram.ll" /// or /// "opt -view-regions-only anyprogram.ll" /// diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h index b694effb2229..561702db3790 100644 --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -15,8 +15,6 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/RegionInfo.h" @@ -24,7 +22,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -37,6 +34,7 @@ #define DEBUG_TYPE "region" namespace llvm { +class raw_ostream; //===----------------------------------------------------------------------===// /// RegionBase Implementation diff --git a/llvm/include/llvm/Analysis/RegionIterator.h b/llvm/include/llvm/Analysis/RegionIterator.h index fecb28725dcc..ba28b1b902ea 100644 --- a/llvm/include/llvm/Analysis/RegionIterator.h +++ b/llvm/include/llvm/Analysis/RegionIterator.h @@ -15,7 +15,6 @@ #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/Analysis/RegionInfo.h" -#include "llvm/IR/CFG.h" #include #include #include @@ -23,6 +22,7 @@ namespace llvm { class BasicBlock; +class RegionInfo; //===----------------------------------------------------------------------===// /// Hierarchical RegionNode successor iterator. diff --git a/llvm/include/llvm/Analysis/RegionPass.h b/llvm/include/llvm/Analysis/RegionPass.h index 5c7fa5f56693..dd5e6a1a3b24 100644 --- a/llvm/include/llvm/Analysis/RegionPass.h +++ b/llvm/include/llvm/Analysis/RegionPass.h @@ -15,7 +15,6 @@ #ifndef LLVM_ANALYSIS_REGIONPASS_H #define LLVM_ANALYSIS_REGIONPASS_H -#include "llvm/Analysis/RegionInfo.h" #include "llvm/IR/LegacyPassManagers.h" #include "llvm/Pass.h" #include @@ -23,6 +22,8 @@ namespace llvm { class Function; class RGPassManager; +class Region; +class RegionInfo; //===----------------------------------------------------------------------===// /// A pass that runs on each Region in a function. diff --git a/llvm/include/llvm/Analysis/RegionPrinter.h b/llvm/include/llvm/Analysis/RegionPrinter.h index 154ac35c486a..501a5406236e 100644 --- a/llvm/include/llvm/Analysis/RegionPrinter.h +++ b/llvm/include/llvm/Analysis/RegionPrinter.h @@ -14,6 +14,9 @@ #ifndef LLVM_ANALYSIS_REGIONPRINTER_H #define LLVM_ANALYSIS_REGIONPRINTER_H +#include "llvm/Analysis/DOTGraphTraitsPass.h" +#include "llvm/Analysis/RegionInfo.h" + namespace llvm { class FunctionPass; class Function; @@ -24,6 +27,13 @@ namespace llvm { FunctionPass *createRegionPrinterPass(); FunctionPass *createRegionOnlyPrinterPass(); + template <> + struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits { + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + std::string getNodeLabel(RegionNode *Node, RegionNode *Graph); + }; + #ifndef NDEBUG /// Open a viewer to display the GraphViz visualization of the analysis /// result.
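The RegionPrinter change above is an instance of the generic DOTGraphTraits mechanism. For orientation, here is a sketch of the pattern with a hypothetical graph type (MyGraph and MyNode are placeholders, not LLVM classes):

#include "llvm/Support/DOTGraphTraits.h"
#include <string>

struct MyNode { std::string Name; };
struct MyGraph { /* GraphTraits<MyGraph *> would be specialized elsewhere */ };

namespace llvm {
// Specializing DOTGraphTraits teaches the DOT writer (and, by extension,
// the DOTGraphTraits printer/viewer passes) how to label nodes of the graph.
template <> struct DOTGraphTraits<MyGraph *> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getNodeLabel(const MyNode *Node, MyGraph *) {
    return Node->Name; // text shown inside each DOT node
  }
};
} // namespace llvm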
diff --git a/llvm/include/llvm/Analysis/ReleaseModeModelRunner.h b/llvm/include/llvm/Analysis/ReleaseModeModelRunner.h index 1bf2e853980c..bf1aaca2adbb 100644 --- a/llvm/include/llvm/Analysis/ReleaseModeModelRunner.h +++ b/llvm/include/llvm/Analysis/ReleaseModeModelRunner.h @@ -15,11 +15,12 @@ #define LLVM_ANALYSIS_RELEASEMODEMODELRUNNER_H #include "llvm/Analysis/MLModelRunner.h" +#include "llvm/Analysis/TensorSpec.h" +#include "llvm/Support/ErrorHandling.h" #include #include -using namespace llvm; namespace llvm { /// ReleaseModeModelRunner - production mode implementation of the @@ -30,21 +31,20 @@ public: /// FeatureNames' type should be an indexed collection of std::string, like /// std::array or std::vector, that has a size() method. template <typename FType> - ReleaseModeModelRunner(LLVMContext &Ctx, const FType &FeatureNames, + ReleaseModeModelRunner(LLVMContext &Ctx, const FType &InputSpec, StringRef DecisionName, StringRef FeedPrefix = "feed_", StringRef FetchPrefix = "fetch_") - : MLModelRunner(Ctx, MLModelRunner::Kind::Release), + : MLModelRunner(Ctx, MLModelRunner::Kind::Release, InputSpec.size()), CompiledModel(std::make_unique<TGen>()) { assert(CompiledModel && "The CompiledModel should be valid"); - const size_t FeatureCount = FeatureNames.size(); - FeatureIndices.resize(FeatureCount); - - for (size_t I = 0; I < FeatureCount; ++I) { + for (size_t I = 0; I < InputSpec.size(); ++I) { const int Index = - CompiledModel->LookupArgIndex(FeedPrefix.str() + FeatureNames[I]); - assert(Index >= 0 && "Cannot find Feature in inlining model"); - FeatureIndices[I] = Index; + CompiledModel->LookupArgIndex(FeedPrefix.str() + InputSpec[I].name()); + void *Buffer = nullptr; + if (Index >= 0) + Buffer = CompiledModel->arg_data(Index); + setUpBufferForTensor(I, InputSpec[I], Buffer); } ResultIndex = CompiledModel->LookupResultIndex(FetchPrefix.str() + @@ -64,15 +64,27 @@ private: return CompiledModel->result_data(ResultIndex); } - void *getTensorUntyped(size_t Index) override { - return reinterpret_cast( - CompiledModel->arg_data(FeatureIndices[Index])); - } - - std::vector<int32_t> FeatureIndices; int32_t ResultIndex = -1; std::unique_ptr<TGen> CompiledModel; }; + +/// A mock class satisfying the interface expected by ReleaseModeModelRunner for +/// its `TGen` parameter. Useful to avoid conditional compilation complexity, as +/// a compile-time replacement for a real AOT-ed model. +class NoopSavedModelImpl final { +#define NOOP_MODEL_ERRMSG \ + "The mock AOT-ed saved model is a compile-time stub and should not be " \ + "called."
+ +public: + NoopSavedModelImpl() = default; + int LookupArgIndex(const std::string &) { llvm_unreachable(NOOP_MODEL_ERRMSG); } + int LookupResultIndex(const std::string &) { llvm_unreachable(NOOP_MODEL_ERRMSG); } + void Run() { llvm_unreachable(NOOP_MODEL_ERRMSG); } + void *result_data(int) { llvm_unreachable(NOOP_MODEL_ERRMSG); } + void *arg_data(int) { llvm_unreachable(NOOP_MODEL_ERRMSG); } +#undef NOOP_MODEL_ERRMSG +}; } // namespace llvm #endif // LLVM_ANALYSIS_RELEASEMODEMODELRUNNER_H diff --git a/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h b/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h index dc2efeafb568..0c5b566f60a4 100644 --- a/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h +++ b/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h @@ -11,11 +11,11 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Analysis/InlineAdvisor.h" -#include "llvm/IR/LLVMContext.h" namespace llvm { class CallBase; class Function; +class LLVMContext; class Module; struct CallSiteFormat { @@ -53,10 +53,12 @@ struct ReplayInlinerSettings { /// Get call site location as a string with the given format std::string formatCallSiteLocation(DebugLoc DLoc, const CallSiteFormat &Format); -std::unique_ptr getReplayInlineAdvisor( - Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, - std::unique_ptr OriginalAdvisor, - const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks); +std::unique_ptr +getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + LLVMContext &Context, + std::unique_ptr OriginalAdvisor, + const ReplayInlinerSettings &ReplaySettings, + bool EmitRemarks, InlineContext IC); /// Replay inline advisor that uses optimization remarks from inlining of /// previous build to guide current inlining. This is useful for inliner tuning. @@ -66,7 +68,7 @@ public: LLVMContext &Context, std::unique_ptr OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, - bool EmitRemarks); + bool EmitRemarks, InlineContext IC); std::unique_ptr getAdviceImpl(CallBase &CB) override; bool areReplayRemarksLoaded() const { return HasReplayRemarks; } diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index b16aa7017719..de1cc299f062 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -31,18 +31,12 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/ConstantRange.h" -#include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" -#include #include #include #include @@ -50,12 +44,14 @@ namespace llvm { +class OverflowingBinaryOperator; class AssumptionCache; class BasicBlock; class Constant; class ConstantInt; class DataLayout; class DominatorTree; +class Function; class GEPOperator; class Instruction; class LLVMContext; @@ -71,6 +67,8 @@ class Type; class Value; enum SCEVTypes : unsigned short; +extern bool VerifySCEV; + /// This class represents an analyzed expression in the program. These are /// opaque objects that the client is not allowed to do much with directly. 
/// @@ -222,7 +220,7 @@ class SCEVPredicate : public FoldingSetNode { FoldingSetNodeIDRef FastID; public: - enum SCEVPredicateKind { P_Union, P_Equal, P_Wrap }; + enum SCEVPredicateKind { P_Union, P_Compare, P_Wrap }; protected: SCEVPredicateKind Kind; @@ -249,10 +247,6 @@ public: /// Prints a textual representation of this predicate with an indentation of /// \p Depth. virtual void print(raw_ostream &OS, unsigned Depth = 0) const = 0; - - /// Returns the SCEV to which this predicate applies, or nullptr if this is - /// a SCEVUnionPredicate. - virtual const SCEV *getExpr() const = 0; }; inline raw_ostream &operator<<(raw_ostream &OS, const SCEVPredicate &P) { @@ -279,32 +273,35 @@ struct FoldingSetTrait : DefaultFoldingSetTrait { } }; -/// This class represents an assumption that two SCEV expressions are equal, -/// and this can be checked at run-time. -class SCEVEqualPredicate final : public SCEVPredicate { - /// We assume that LHS == RHS. +/// This class represents an assumption that the expression LHS Pred RHS +/// evaluates to true, and this can be checked at run-time. +class SCEVComparePredicate final : public SCEVPredicate { + /// We assume that LHS Pred RHS is true. + const ICmpInst::Predicate Pred; const SCEV *LHS; const SCEV *RHS; public: - SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEV *LHS, - const SCEV *RHS); + SCEVComparePredicate(const FoldingSetNodeIDRef ID, + const ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS); /// Implementation of the SCEVPredicate interface bool implies(const SCEVPredicate *N) const override; void print(raw_ostream &OS, unsigned Depth = 0) const override; bool isAlwaysTrue() const override; - const SCEV *getExpr() const override; - /// Returns the left hand side of the equality. + ICmpInst::Predicate getPredicate() const { return Pred; } + + /// Returns the left hand side of the predicate. const SCEV *getLHS() const { return LHS; } - /// Returns the right hand side of the equality. + /// Returns the right hand side of the predicate. const SCEV *getRHS() const { return RHS; } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const SCEVPredicate *P) { - return P->getKind() == P_Equal; + return P->getKind() == P_Compare; } }; @@ -396,7 +393,7 @@ public: IncrementWrapFlags getFlags() const { return Flags; } /// Implementation of the SCEVPredicate interface - const SCEV *getExpr() const override; + const SCEVAddRecExpr *getExpr() const; bool implies(const SCEVPredicate *N) const override; void print(raw_ostream &OS, unsigned Depth = 0) const override; bool isAlwaysTrue() const override; @@ -421,28 +418,20 @@ private: /// Vector with references to all predicates in this union. SmallVector Preds; - /// Maps SCEVs to predicates for quick look-ups. - PredicateMap SCEVToPreds; + /// Adds a predicate to this union. + void add(const SCEVPredicate *N); public: - SCEVUnionPredicate(); + SCEVUnionPredicate(ArrayRef Preds); const SmallVectorImpl &getPredicates() const { return Preds; } - /// Adds a predicate to this union. - void add(const SCEVPredicate *N); - - /// Returns a reference to a vector containing all predicates which apply to - /// \p Expr. 
-  ArrayRef getPredicatesForExpr(const SCEV *Expr);
-
   /// Implementation of the SCEVPredicate interface
   bool isAlwaysTrue() const override;
   bool implies(const SCEVPredicate *N) const override;
   void print(raw_ostream &OS, unsigned Depth) const override;
-  const SCEV *getExpr() const override;

   /// We estimate the complexity of a union predicate as the number of
   /// predicates in the union.
@@ -556,6 +545,10 @@ public:
   /// Return true if the SCEV expression contains an undef value.
   bool containsUndefs(const SCEV *S) const;

+  /// Return true if the SCEV expression contains a Value that has been
+  /// optimised out and is now a nullptr.
+  bool containsErasedValue(const SCEV *S) const;
+
   /// Return a SCEV expression for the full generality of the specified
   /// expression.
   const SCEV *getSCEV(Value *V);
@@ -885,7 +878,7 @@ public:
   /// the answer to be correct. Predicates can be checked with run-time
   /// checks and can be used to perform loop versioning.
   const SCEV *getPredicatedBackedgeTakenCount(const Loop *L,
-                                              SCEVUnionPredicate &Predicates);
+                                              SmallVector &Predicates);

   /// When successful, this returns a SCEVConstant that is greater than or equal
   /// to (i.e. a "conservative over-approximation") of the value returned by
@@ -1166,6 +1159,8 @@ public:
   }

   const SCEVPredicate *getEqualPredicate(const SCEV *LHS, const SCEV *RHS);
+  const SCEVPredicate *getComparePredicate(ICmpInst::Predicate Pred,
+                                           const SCEV *LHS, const SCEV *RHS);

   const SCEVPredicate *
   getWrapPredicate(const SCEVAddRecExpr *AR,
@@ -1173,7 +1168,7 @@ public:

   /// Re-writes the SCEV according to the Predicates in \p A.
   const SCEV *rewriteUsingPredicate(const SCEV *S, const Loop *L,
-                                    SCEVUnionPredicate &A);
+                                    const SCEVPredicate &A);

   /// Tries to convert the \p S expression to an AddRec expression,
   /// adding additional predicates to \p Preds as required.
   const SCEVAddRecExpr *convertSCEVToAddRecWithPredicates(
@@ -1256,30 +1251,11 @@ private:
   HasRecMapType HasRecMap;

   /// The type for ExprValueMap.
-  using ValueOffsetPair = std::pair;
-  using ValueOffsetPairSetVector = SmallSetVector;
-  using ExprValueMapType = DenseMap;
+  using ValueSetVector = SmallSetVector;
+  using ExprValueMapType = DenseMap;

   /// ExprValueMap -- This map records the original values from which
   /// the SCEV expr is generated.
-  ///
-  /// We want to represent the mapping as SCEV -> ValueOffsetPair instead
-  /// of SCEV -> Value:
-  /// Suppose we know S1 expands to V1, and
-  ///  S1 = S2 + C_a
-  ///  S3 = S2 + C_b
-  /// where C_a and C_b are different SCEVConstants. Then we'd like to
-  /// expand S3 as V1 - C_a + C_b instead of expanding S2 literally.
-  /// It is helpful when S2 is a complex SCEV expr.
-  ///
-  /// In order to do that, we represent ExprValueMap as a mapping from
-  /// SCEV to ValueOffsetPair. We will save both S1->{V1, 0} and
-  /// S2->{V1, C_a} into the map when we create SCEV for V1. When S3
-  /// is expanded, it will first expand S2 to V1 - C_a because of
-  /// S2->{V1, C_a} in the map, then expand S3 to V1 - C_a + C_b.
-  ///
-  /// Note: S->{V, Offset} in the ExprValueMap means S can be expanded
-  /// to V - Offset.
   ExprValueMapType ExprValueMap;

   /// The type for ValueExprMap.
@@ -1310,7 +1286,7 @@ private:
   DenseMap MinTrailingZerosCache;

   /// Return the Value set from which the SCEV expr is generated.
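Taken together, these hunks replace the mutable union predicate with a list-of-predicates flow. A rough sketch of the intended call pattern; the element type of the stripped SmallVector parameter is assumed here to be const SCEVPredicate *, and the loop itself is a placeholder:

    #include "llvm/Analysis/ScalarEvolution.h"

    using namespace llvm;

    // Collect the predicates guarding the predicated backedge-taken count,
    // then bundle them into one SCEVUnionPredicate (now built up-front from
    // a list instead of grown with add()) and rewrite under it.
    const SCEV *predicatedBTC(ScalarEvolution &SE, const Loop *L) {
      SmallVector<const SCEVPredicate *, 4> Preds;
      const SCEV *BTC = SE.getPredicatedBackedgeTakenCount(L, Preds);
      SCEVUnionPredicate Union(Preds);
      return SE.rewriteUsingPredicate(BTC, L, Union);
    }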
- ValueOffsetPairSetVector *getSCEVValues(const SCEV *S); + ArrayRef getSCEVValues(const SCEV *S); /// Private helper method for the GetMinTrailingZeros method uint32_t GetMinTrailingZerosImpl(const SCEV *S); @@ -1369,17 +1345,17 @@ private: PoisoningVH ExitingBlock; const SCEV *ExactNotTaken; const SCEV *MaxNotTaken; - std::unique_ptr Predicate; + SmallPtrSet Predicates; explicit ExitNotTakenInfo(PoisoningVH ExitingBlock, const SCEV *ExactNotTaken, const SCEV *MaxNotTaken, - std::unique_ptr Predicate) + const SmallPtrSet &Predicates) : ExitingBlock(ExitingBlock), ExactNotTaken(ExactNotTaken), - MaxNotTaken(ExactNotTaken), Predicate(std::move(Predicate)) {} + MaxNotTaken(ExactNotTaken), Predicates(Predicates) {} bool hasAlwaysTruePredicate() const { - return !Predicate || Predicate->isAlwaysTrue(); + return Predicates.empty(); } }; @@ -1452,7 +1428,7 @@ private: /// vector, this information can contain them and therefore a /// SCEVPredicate argument should be added to getExact. const SCEV *getExact(const Loop *L, ScalarEvolution *SE, - SCEVUnionPredicate *Predicates = nullptr) const; + SmallVector *Predicates = nullptr) const; /// Return the number of times this loop exit may fall through to the back /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via @@ -1599,9 +1575,17 @@ private: ConstantRange getRangeForUnknownRecurrence(const SCEVUnknown *U); /// We know that there is no SCEV for the specified value. Analyze the - /// expression. + /// expression recursively. const SCEV *createSCEV(Value *V); + /// We know that there is no SCEV for the specified value. Create a new SCEV + /// for \p V iteratively. + const SCEV *createSCEVIter(Value *V); + /// Collect operands of \p V for which SCEV expressions should be constructed + /// first. Returns a SCEV directly if it can be constructed trivially for \p + /// V. + const SCEV *getOperandsToCreate(Value *V, SmallVectorImpl &Ops); + /// Provide the special handling we need to analyze PHI SCEVs. const SCEV *createNodeForPHI(PHINode *PN); @@ -1619,8 +1603,22 @@ private: /// is either a select instruction or a phi node). \p I is the instruction /// being processed, and it is assumed equivalent to "Cond ? TrueVal : /// FalseVal". - const SCEV *createNodeForSelectOrPHI(Instruction *I, Value *Cond, - Value *TrueVal, Value *FalseVal); + const SCEV *createNodeForSelectOrPHIInstWithICmpInstCond(Instruction *I, + ICmpInst *Cond, + Value *TrueVal, + Value *FalseVal); + + /// See if we can model this select-like instruction via umin_seq expression. + const SCEV *createNodeForSelectOrPHIViaUMinSeq(Value *I, Value *Cond, + Value *TrueVal, + Value *FalseVal); + + /// Given a value \p V, which is a select-like instruction (currently this is + /// either a select instruction or a phi node), which is assumed equivalent to + /// Cond ? TrueVal : FalseVal + /// see if we can model it as a SCEV expression. + const SCEV *createNodeForSelectOrPHI(Value *V, Value *Cond, Value *TrueVal, + Value *FalseVal); /// Provide the special handling we need to analyze GEP SCEVs. const SCEV *createNodeForGEP(GEPOperator *GEP); @@ -2097,6 +2095,11 @@ private: /// `UniqueSCEVs`. Return if found, else nullptr. SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef Ops); + /// Get reachable blocks in this function, making limited use of SCEV + /// reasoning about conditions. 
+ void getReachableBlocks(SmallPtrSetImpl &Reachable, + Function &F); + FoldingSet UniqueSCEVs; FoldingSet UniquePreds; BumpPtrAllocator SCEVAllocator; @@ -2182,7 +2185,7 @@ class PredicatedScalarEvolution { public: PredicatedScalarEvolution(ScalarEvolution &SE, Loop &L); - const SCEVUnionPredicate &getUnionPredicate() const; + const SCEVPredicate &getPredicate() const; /// Returns the SCEV expression of V, in the context of the current SCEV /// predicate. The order of transformations applied on the expression of V @@ -2251,7 +2254,7 @@ private: /// The SCEVPredicate that forms our context. We will rewrite all /// expressions assuming that this predicate true. - SCEVUnionPredicate Preds; + std::unique_ptr Preds; /// Marks the version of the SCEV predicate used. When rewriting a SCEV /// expression we mark it with the version of the predicate. We use this to diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionAliasAnalysis.h b/llvm/include/llvm/Analysis/ScalarEvolutionAliasAnalysis.h index ebd427354cee..15e27283021c 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionAliasAnalysis.h @@ -14,13 +14,14 @@ #define LLVM_ANALYSIS_SCALAREVOLUTIONALIASANALYSIS_H #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/Pass.h" namespace llvm { +class Function; +class ScalarEvolution; +class SCEV; + /// A simple alias analysis implementation that uses ScalarEvolution to answer /// queries. class SCEVAAResult : public AAResultBase { diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h index cd8e5fab6766..b29854cddc66 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -14,13 +14,11 @@ #define LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -31,9 +29,11 @@ namespace llvm { class APInt; class Constant; +class ConstantInt; class ConstantRange; class Loop; class Type; +class Value; enum SCEVTypes : unsigned short { // These should be ordered in terms of increasing complexity to make the @@ -699,8 +699,11 @@ public: case scUMinExpr: case scSequentialUMinExpr: case scAddRecExpr: - for (const auto *Op : cast(S)->operands()) + for (const auto *Op : cast(S)->operands()) { push(Op); + if (Visitor.isDone()) + break; + } continue; case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(S); diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionNormalization.h b/llvm/include/llvm/Analysis/ScalarEvolutionNormalization.h index 6ab92a3a977f..da420ff1e6d2 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionNormalization.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionNormalization.h @@ -35,7 +35,7 @@ #ifndef LLVM_ANALYSIS_SCALAREVOLUTIONNORMALIZATION_H #define LLVM_ANALYSIS_SCALAREVOLUTIONNORMALIZATION_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallPtrSet.h" namespace llvm { diff --git a/llvm/include/llvm/Analysis/ScalarFuncs.def 
b/llvm/include/llvm/Analysis/ScalarFuncs.def
new file mode 100644
index 000000000000..2ed9be538091
--- /dev/null
+++ b/llvm/include/llvm/Analysis/ScalarFuncs.def
@@ -0,0 +1,117 @@
+//===-- ScalarFuncs.def - Library information ----------*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This .def file creates a mapping from standard IEEE math functions to
+// their corresponding entries in the IBM MASS (scalar) library.
+// LLVM intrinsic math functions will be handled in PPCISelLowering to
+// allow existing optimizations like pow(x,0.5) --> sqrt(x).
+
+#if defined(TLI_DEFINE_SCALAR_MASS_FUNCS)
+#define TLI_DEFINE_SCALAR_MASS_FUNC(SCAL, MASSENTRY) {SCAL, MASSENTRY},
+#endif
+
+TLI_DEFINE_SCALAR_MASS_FUNC("acosf", "__xl_acosf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__acosf_finite", "__xl_acosf")
+TLI_DEFINE_SCALAR_MASS_FUNC("acos", "__xl_acos")
+TLI_DEFINE_SCALAR_MASS_FUNC("__acos_finite", "__xl_acos")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("acoshf", "__xl_acoshf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__acoshf_finite", "__xl_acoshf")
+TLI_DEFINE_SCALAR_MASS_FUNC("acosh", "__xl_acosh")
+TLI_DEFINE_SCALAR_MASS_FUNC("__acosh_finite", "__xl_acosh")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("asinf", "__xl_asinf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__asinf_finite", "__xl_asinf")
+TLI_DEFINE_SCALAR_MASS_FUNC("asin", "__xl_asin")
+TLI_DEFINE_SCALAR_MASS_FUNC("__asin_finite", "__xl_asin")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("asinhf", "__xl_asinhf")
+TLI_DEFINE_SCALAR_MASS_FUNC("asinh", "__xl_asinh")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("atanf", "__xl_atanf")
+TLI_DEFINE_SCALAR_MASS_FUNC("atan", "__xl_atan")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("atan2f", "__xl_atan2f")
+TLI_DEFINE_SCALAR_MASS_FUNC("__atan2f_finite", "__xl_atan2f")
+TLI_DEFINE_SCALAR_MASS_FUNC("atan2", "__xl_atan2")
+TLI_DEFINE_SCALAR_MASS_FUNC("__atan2_finite", "__xl_atan2")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("atanhf", "__xl_atanhf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__atanhf_finite", "__xl_atanhf")
+TLI_DEFINE_SCALAR_MASS_FUNC("atanh", "__xl_atanh")
+TLI_DEFINE_SCALAR_MASS_FUNC("__atanh_finite", "__xl_atanh")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("cbrtf", "__xl_cbrtf")
+TLI_DEFINE_SCALAR_MASS_FUNC("cbrt", "__xl_cbrt")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("cosf", "__xl_cosf")
+TLI_DEFINE_SCALAR_MASS_FUNC("cos", "__xl_cos")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("coshf", "__xl_coshf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__coshf_finite", "__xl_coshf")
+TLI_DEFINE_SCALAR_MASS_FUNC("cosh", "__xl_cosh")
+TLI_DEFINE_SCALAR_MASS_FUNC("__cosh_finite", "__xl_cosh")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("erff", "__xl_erff")
+TLI_DEFINE_SCALAR_MASS_FUNC("erf", "__xl_erf")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("erfcf", "__xl_erfcf")
+TLI_DEFINE_SCALAR_MASS_FUNC("erfc", "__xl_erfc")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("expf", "__xl_expf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__expf_finite", "__xl_expf")
+TLI_DEFINE_SCALAR_MASS_FUNC("exp", "__xl_exp")
+TLI_DEFINE_SCALAR_MASS_FUNC("__exp_finite", "__xl_exp")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("expm1f", "__xl_expm1f")
+TLI_DEFINE_SCALAR_MASS_FUNC("expm1", "__xl_expm1")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("hypotf", "__xl_hypotf")
+TLI_DEFINE_SCALAR_MASS_FUNC("hypot", "__xl_hypot")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("lgammaf", "__xl_lgammaf")
+TLI_DEFINE_SCALAR_MASS_FUNC("lgamma", "__xl_lgamma")
+
+TLI_DEFINE_SCALAR_MASS_FUNC("logf", "__xl_logf")
+TLI_DEFINE_SCALAR_MASS_FUNC("__logf_finite", "__xl_logf") +TLI_DEFINE_SCALAR_MASS_FUNC("log", "__xl_log") +TLI_DEFINE_SCALAR_MASS_FUNC("__log_finite", "__xl_log") + +TLI_DEFINE_SCALAR_MASS_FUNC("log10f", "__xl_log10f") +TLI_DEFINE_SCALAR_MASS_FUNC("__log10f_finite", "__xl_log10f") +TLI_DEFINE_SCALAR_MASS_FUNC("log10", "__xl_log10") +TLI_DEFINE_SCALAR_MASS_FUNC("__log10_finite", "__xl_log10") + +TLI_DEFINE_SCALAR_MASS_FUNC("log1pf", "__xl_log1pf") +TLI_DEFINE_SCALAR_MASS_FUNC("log1p", "__xl_log1p") + +TLI_DEFINE_SCALAR_MASS_FUNC("powf", "__xl_powf") +TLI_DEFINE_SCALAR_MASS_FUNC("__powf_finite", "__xl_powf") +TLI_DEFINE_SCALAR_MASS_FUNC("pow", "__xl_pow") +TLI_DEFINE_SCALAR_MASS_FUNC("__pow_finite", "__xl_pow") + +TLI_DEFINE_SCALAR_MASS_FUNC("rsqrt", "__xl_rsqrt") + +TLI_DEFINE_SCALAR_MASS_FUNC("sinf", "__xl_sinf") +TLI_DEFINE_SCALAR_MASS_FUNC("sin", "__xl_sin") + +TLI_DEFINE_SCALAR_MASS_FUNC("sinhf", "__xl_sinhf") +TLI_DEFINE_SCALAR_MASS_FUNC("__sinhf_finite", "__xl_sinhf") +TLI_DEFINE_SCALAR_MASS_FUNC("sinh", "__xl_sinh") +TLI_DEFINE_SCALAR_MASS_FUNC("__sinh_finite", "__xl_sinh") + +TLI_DEFINE_SCALAR_MASS_FUNC("sqrt", "__xl_sqrt") + +TLI_DEFINE_SCALAR_MASS_FUNC("tanf", "__xl_tanf") +TLI_DEFINE_SCALAR_MASS_FUNC("tan", "__xl_tan") + +TLI_DEFINE_SCALAR_MASS_FUNC("tanhf", "__xl_tanhf") +TLI_DEFINE_SCALAR_MASS_FUNC("tanh", "__xl_tanh") + +#undef TLI_DEFINE_SCALAR_MASS_FUNCS +#undef TLI_DEFINE_SCALAR_MASS_FUNC diff --git a/llvm/include/llvm/Analysis/SparsePropagation.h b/llvm/include/llvm/Analysis/SparsePropagation.h index 6eb6d5518a41..428238c5fa0b 100644 --- a/llvm/include/llvm/Analysis/SparsePropagation.h +++ b/llvm/include/llvm/Analysis/SparsePropagation.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_SPARSEPROPAGATION_H #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" #include diff --git a/llvm/include/llvm/Analysis/StackLifetime.h b/llvm/include/llvm/Analysis/StackLifetime.h index 239aec4e258b..7fd88362276a 100644 --- a/llvm/include/llvm/Analysis/StackLifetime.h +++ b/llvm/include/llvm/Analysis/StackLifetime.h @@ -14,10 +14,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/raw_ostream.h" -#include #include namespace llvm { @@ -26,6 +24,7 @@ class AllocaInst; class BasicBlock; class Function; class Instruction; +class IntrinsicInst; /// Compute live ranges of allocas. 
/// Live ranges are represented as sets of "interesting" instructions, which are diff --git a/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h b/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h index cfc1e20255d1..e6e3efbe0fcb 100644 --- a/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h @@ -16,18 +16,18 @@ #ifndef LLVM_ANALYSIS_SYNCDEPENDENCEANALYSIS_H #define LLVM_ANALYSIS_SYNCDEPENDENCEANALYSIS_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/LoopInfo.h" #include #include #include +#include namespace llvm { class BasicBlock; class DominatorTree; +class Instruction; +class LoopInfo; class PostDominatorTree; using ConstBlockSet = SmallPtrSet; diff --git a/llvm/include/llvm/Analysis/SyntheticCountsUtils.h b/llvm/include/llvm/Analysis/SyntheticCountsUtils.h index f9bac739cee6..458b599f2937 100644 --- a/llvm/include/llvm/Analysis/SyntheticCountsUtils.h +++ b/llvm/include/llvm/Analysis/SyntheticCountsUtils.h @@ -13,7 +13,7 @@ #ifndef LLVM_ANALYSIS_SYNTHETICCOUNTSUTILS_H #define LLVM_ANALYSIS_SYNTHETICCOUNTSUTILS_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Support/ScaledNumber.h" diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h index 1df0530e40e6..3a7218b10b97 100644 --- a/llvm/include/llvm/Analysis/TargetFolder.h +++ b/llvm/include/llvm/Analysis/TargetFolder.h @@ -21,12 +21,14 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/IRBuilderFolder.h" +#include "llvm/IR/Operator.h" namespace llvm { +class Constant; class DataLayout; +class Type; /// TargetFolder - Create constants with target dependent folding. class TargetFolder final : public IRBuilderFolder { @@ -48,31 +50,45 @@ public: // Return an existing value or a constant if the operation can be simplified. // Otherwise return nullptr. //===--------------------------------------------------------------------===// - Value *FoldAdd(Value *LHS, Value *RHS, bool HasNUW = false, - bool HasNSW = false) const override { + + Value *FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); if (LC && RC) - return Fold(ConstantExpr::getAdd(LC, RC, HasNUW, HasNSW)); + return Fold(ConstantExpr::get(Opc, LC, RC)); return nullptr; } - Value *FoldAnd(Value *LHS, Value *RHS) const override { + Value *FoldExactBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool IsExact) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); if (LC && RC) - return Fold(ConstantExpr::getAnd(LC, RC)); + return Fold(ConstantExpr::get( + Opc, LC, RC, IsExact ? 
PossiblyExactOperator::IsExact : 0)); return nullptr; } - Value *FoldOr(Value *LHS, Value *RHS) const override { + Value *FoldNoWrapBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool HasNUW, bool HasNSW) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); - if (LC && RC) - return Fold(ConstantExpr::getOr(LC, RC)); + if (LC && RC) { + unsigned Flags = 0; + if (HasNUW) + Flags |= OverflowingBinaryOperator::NoUnsignedWrap; + if (HasNSW) + Flags |= OverflowingBinaryOperator::NoSignedWrap; + return Fold(ConstantExpr::get(Opc, LC, RC, Flags)); + } return nullptr; } + Value *FoldBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + FastMathFlags FMF) const override { + return FoldBinOp(Opc, LHS, RHS); + } Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); @@ -105,82 +121,56 @@ public: return nullptr; } - //===--------------------------------------------------------------------===// - // Binary Operators - //===--------------------------------------------------------------------===// - - Constant *CreateFAdd(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getFAdd(LHS, RHS)); - } - Constant *CreateSub(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return Fold(ConstantExpr::getSub(LHS, RHS, HasNUW, HasNSW)); - } - Constant *CreateFSub(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getFSub(LHS, RHS)); - } - Constant *CreateMul(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return Fold(ConstantExpr::getMul(LHS, RHS, HasNUW, HasNSW)); - } - Constant *CreateFMul(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getFMul(LHS, RHS)); - } - Constant *CreateUDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return Fold(ConstantExpr::getUDiv(LHS, RHS, isExact)); - } - Constant *CreateSDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return Fold(ConstantExpr::getSDiv(LHS, RHS, isExact)); - } - Constant *CreateFDiv(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getFDiv(LHS, RHS)); - } - Constant *CreateURem(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getURem(LHS, RHS)); - } - Constant *CreateSRem(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getSRem(LHS, RHS)); - } - Constant *CreateFRem(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getFRem(LHS, RHS)); - } - Constant *CreateShl(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return Fold(ConstantExpr::getShl(LHS, RHS, HasNUW, HasNSW)); - } - Constant *CreateLShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return Fold(ConstantExpr::getLShr(LHS, RHS, isExact)); + Value *FoldExtractValue(Value *Agg, + ArrayRef IdxList) const override { + if (auto *CAgg = dyn_cast(Agg)) + return ConstantFoldExtractValueInstruction(CAgg, IdxList); + return nullptr; + }; + + Value *FoldInsertValue(Value *Agg, Value *Val, + ArrayRef IdxList) const override { + auto *CAgg = dyn_cast(Agg); + auto *CVal = dyn_cast(Val); + if (CAgg && CVal) + return ConstantFoldInsertValueInstruction(CAgg, CVal, IdxList); + return nullptr; } - Constant *CreateAShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return Fold(ConstantExpr::getAShr(LHS, RHS, 
isExact)); + + Value *FoldExtractElement(Value *Vec, Value *Idx) const override { + auto *CVec = dyn_cast(Vec); + auto *CIdx = dyn_cast(Idx); + if (CVec && CIdx) + return Fold(ConstantExpr::getExtractElement(CVec, CIdx)); + return nullptr; } - Constant *CreateXor(Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::getXor(LHS, RHS)); + + Value *FoldInsertElement(Value *Vec, Value *NewElt, + Value *Idx) const override { + auto *CVec = dyn_cast(Vec); + auto *CNewElt = dyn_cast(NewElt); + auto *CIdx = dyn_cast(Idx); + if (CVec && CNewElt && CIdx) + return Fold(ConstantExpr::getInsertElement(CVec, CNewElt, CIdx)); + return nullptr; } - Constant *CreateBinOp(Instruction::BinaryOps Opc, - Constant *LHS, Constant *RHS) const override { - return Fold(ConstantExpr::get(Opc, LHS, RHS)); + Value *FoldShuffleVector(Value *V1, Value *V2, + ArrayRef Mask) const override { + auto *C1 = dyn_cast(V1); + auto *C2 = dyn_cast(V2); + if (C1 && C2) + return Fold(ConstantExpr::getShuffleVector(C1, C2, Mask)); + return nullptr; } //===--------------------------------------------------------------------===// // Unary Operators //===--------------------------------------------------------------------===// - Constant *CreateNeg(Constant *C, - bool HasNUW = false, bool HasNSW = false) const override { - return Fold(ConstantExpr::getNeg(C, HasNUW, HasNSW)); - } Constant *CreateFNeg(Constant *C) const override { return Fold(ConstantExpr::getFNeg(C)); } - Constant *CreateNot(Constant *C) const override { - return Fold(ConstantExpr::getNot(C)); - } Constant *CreateUnOp(Instruction::UnaryOps Opc, Constant *C) const override { return Fold(ConstantExpr::get(Opc, C)); @@ -252,34 +242,6 @@ public: Constant *RHS) const override { return Fold(ConstantExpr::getCompare(P, LHS, RHS)); } - - //===--------------------------------------------------------------------===// - // Other Instructions - //===--------------------------------------------------------------------===// - - Constant *CreateExtractElement(Constant *Vec, Constant *Idx) const override { - return Fold(ConstantExpr::getExtractElement(Vec, Idx)); - } - - Constant *CreateInsertElement(Constant *Vec, Constant *NewElt, - Constant *Idx) const override { - return Fold(ConstantExpr::getInsertElement(Vec, NewElt, Idx)); - } - - Constant *CreateShuffleVector(Constant *V1, Constant *V2, - ArrayRef Mask) const override { - return Fold(ConstantExpr::getShuffleVector(V1, V2, Mask)); - } - - Constant *CreateExtractValue(Constant *Agg, - ArrayRef IdxList) const override { - return Fold(ConstantExpr::getExtractValue(Agg, IdxList)); - } - - Constant *CreateInsertValue(Constant *Agg, Constant *Val, - ArrayRef IdxList) const override { - return Fold(ConstantExpr::getInsertValue(Agg, Val, IdxList)); - } }; } diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 17d1e3f770c1..7bfda0124de7 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -12,14 +12,15 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" -#include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { + template class ArrayRef; +class Function; +class Module; class Triple; /// Describes a possible vectorization of a function. 
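The TargetFolder rework above drops the long list of per-opcode Create* overloads in favor of generic Fold* hooks with a uniform contract: return a folded Value when the operands are all constants, or nullptr to tell IRBuilder to emit a real instruction. A minimal sketch of that contract, assuming a DataLayout and two operand Values are in scope:

    #include "llvm/Analysis/TargetFolder.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Returns a folded constant for "add nsw L, R", or nullptr when either
    // operand is non-constant and an actual instruction is required.
    Value *tryFoldAddNSW(const DataLayout &DL, Value *L, Value *R) {
      TargetFolder Folder(DL);
      return Folder.FoldNoWrapBinOp(Instruction::Add, L, R,
                                    /*HasNUW=*/false, /*HasNSW=*/true);
    }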
@@ -49,7 +50,7 @@ class TargetLibraryInfoImpl {
   friend class TargetLibraryInfo;

   unsigned char AvailableArray[(NumLibFuncs+3)/4];
-  llvm::DenseMap CustomNames;
+  DenseMap CustomNames;
   static StringLiteral const StandardNames[NumLibFuncs];
   bool ShouldExtI32Param, ShouldExtI32Return, ShouldSignExtI32Param;
   unsigned SizeOfInt;
@@ -279,6 +280,13 @@ public:
     return B == OverrideAsUnavailable;
   }

+  /// Return true if the function type FTy is valid for the library function
+  /// F, regardless of whether the function is available.
+  bool isValidProtoForLibFunc(const FunctionType &FTy, LibFunc F,
+                              const Module &M) const {
+    return Impl->isValidProtoForLibFunc(FTy, F, M);
+  }
+
   /// Searches for a particular function name.
   ///
   /// If it is one of the known library functions, return true and set F to the
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7412e050322e..372f17cfc7ff 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -21,13 +21,13 @@
 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/IR/FMF.h"
 #include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/InstructionCost.h"
 #include
 #include
@@ -617,8 +617,8 @@ public:
                              Instruction *I = nullptr) const;

   /// Return true if LSR cost of C1 is lower than C2.
-  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                     TargetTransformInfo::LSRCost &C2) const;
+  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                     const TargetTransformInfo::LSRCost &C2) const;

   /// Return true if LSR major cost is number of registers. Targets which
   /// implement their own isLSRCostLess and unset number of registers as major
@@ -659,6 +659,10 @@ public:
   /// Return true if the target supports nontemporal load.
   bool isLegalNTLoad(Type *DataType, Align Alignment) const;

+  /// \Returns true if the target supports broadcasting a load to a vector of
+  /// type <NumElements x ElementTy>.
+  bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const;
+
   /// Return true if the target supports masked scatter.
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
@@ -675,6 +679,16 @@ public:
   /// Return true if the target supports masked expand load.
   bool isLegalMaskedExpandLoad(Type *DataType) const;

+  /// Return true if this is an alternating opcode pattern that can be lowered
+  /// to a single instruction on the target. In X86 this is for the addsub
+  /// instruction which corresponds to a Shuffle + FAdd + FSub pattern in IR.
+  /// This function expects two opcodes: \p Opcode0 and \p Opcode1 being
+  /// selected by \p OpcodeMask. The mask contains one bit per lane and is a `0`
+  /// when \p Opcode0 is selected and `1` when Opcode1 is selected.
+  /// \p VecTy is the vector type of the instruction to be generated.
+  bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+                       const SmallBitVector &OpcodeMask) const;
+
   /// Return true if we should be enabling ordered reductions for the target.
   bool enableOrderedReductions() const;

@@ -727,7 +741,7 @@ public:
   bool isTypeLegal(Type *Ty) const;

   /// Returns the estimated number of registers required to represent \p Ty.
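The OpcodeMask convention documented above (a clear bit selects Opcode0, a set bit selects Opcode1, one bit per lane) is easiest to see with the addsub pattern it was added for. A sketch, assuming a 4-element vector type; on x86, addsub subtracts in the even lanes and adds in the odd lanes:

    #include "llvm/ADT/SmallBitVector.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Lane pattern <fsub, fadd, fsub, fadd>: clear bits pick Opcode0 (FSub),
    // set bits pick Opcode1 (FAdd).
    bool isAddSubLegal(const TargetTransformInfo &TTI, VectorType *VecTy) {
      SmallBitVector OpcodeMask(4, false);
      OpcodeMask.set(1);
      OpcodeMask.set(3);
      return TTI.isLegalAltInstr(VecTy, Instruction::FSub, Instruction::FAdd,
                                 OpcodeMask);
    }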
- InstructionCost getRegUsageForType(Type *Ty) const; + unsigned getRegUsageForType(Type *Ty) const; /// Return true if switches should be turned into lookup tables for the /// target. @@ -762,6 +776,9 @@ public: /// the scalarization cost of a load/store. bool supportsEfficientVectorElementLoadStore() const; + /// If the target supports tail calls. + bool supportsTailCalls() const; + /// Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -934,7 +951,8 @@ public: /// creating vectors that span multiple vector registers. /// If false, the vectorization factor will be chosen based on the /// size of the widest element type. - bool shouldMaximizeVectorBandwidth() const; + /// \p K Register Kind for vectorization. + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; /// \return The minimum vectorization factor for types of given element /// bit width, or 0 if there is no minimum VF. The returned value only @@ -947,6 +965,17 @@ public: /// Currently only used by the SLP vectorizer. unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + /// \return The minimum vectorization factor for the store instruction. Given + /// the initial estimation of the minimum vector factor and store value type, + /// it tries to find possible lowest VF, which still might be profitable for + /// the vectorization. + /// \param VF Initial estimation of the minimum vector factor. + /// \param ScalarMemTy Scalar memory type of the store operation. + /// \param ScalarValTy Scalar type of the stored value. + /// Currently only used by the SLP vectorizer. + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -1045,11 +1074,14 @@ public: /// The exact mask may be passed as Mask, or else the array will be empty. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds to show the insert/extract point and the type of - /// the subvector being inserted/extracted. + /// the subvector being inserted/extracted. The operands of the shuffle can be + /// passed through \p Args, which helps improve the cost estimation in some + /// cases, like in broadcast loads. /// NOTE: For subvector extractions Tp represents the source type. InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask = None, int Index = 0, - VectorType *SubTp = nullptr) const; + VectorType *SubTp = nullptr, + ArrayRef Args = None) const; /// Represents a hint about the context in which a cast is used. /// @@ -1283,9 +1315,11 @@ public: Type *ExpectedType) const; /// \returns The type to use in a loop expansion of a memcpy call. - Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, - unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const; + Type * + getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize = None) const; /// \param[out] OpsOut The operand types to copy RemainingBytes of memory. /// \param RemainingBytes The number of bytes to copy. 
@@ -1296,7 +1330,8 @@ public: void getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const; + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize = None) const; /// \returns True if the two functions have compatible attributes for inlining /// purposes. @@ -1536,8 +1571,8 @@ public: int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I) = 0; - virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) = 0; + virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) = 0; virtual bool isNumRegsMajorCostOfLSR() = 0; virtual bool isProfitableLSRChainElement(Instruction *I) = 0; virtual bool canMacroFuseCmp() = 0; @@ -1550,6 +1585,8 @@ public: virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; + virtual bool isLegalBroadcastLoad(Type *ElementTy, + ElementCount NumElements) const = 0; virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0; virtual bool forceScalarizeMaskedGather(VectorType *DataType, @@ -1558,6 +1595,9 @@ public: Align Alignment) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; + virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask) const = 0; virtual bool enableOrderedReductions() = 0; virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0; virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0; @@ -1571,7 +1611,7 @@ public: virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; - virtual InstructionCost getRegUsageForType(Type *Ty) = 0; + virtual unsigned getRegUsageForType(Type *Ty) = 0; virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; virtual bool shouldBuildRelLookupTables() = 0; @@ -1584,6 +1624,7 @@ public: getOperandsScalarizationOverhead(ArrayRef Args, ArrayRef Tys) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; + virtual bool supportsTailCalls() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; @@ -1618,10 +1659,13 @@ public: virtual unsigned getMinVectorRegisterBitWidth() const = 0; virtual Optional getMaxVScale() const = 0; virtual Optional getVScaleForTuning() const = 0; - virtual bool shouldMaximizeVectorBandwidth() const = 0; + virtual bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const = 0; virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; + virtual unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() const = 0; @@ -1660,7 +1704,8 @@ public: ArrayRef Args, const Instruction *CxtI = nullptr) = 0; virtual 
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp) = 0; + VectorType *SubTp, + ArrayRef Args) = 0; virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -1734,15 +1779,17 @@ public: virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0; virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType) = 0; - virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, - unsigned SrcAddrSpace, - unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const = 0; + virtual Type * + getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const = 0; + virtual void getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const = 0; + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const = 0; virtual bool areInlineCompatible(const Function *Caller, const Function *Callee) const = 0; virtual bool areTypesABICompatible(const Function *Caller, @@ -1920,8 +1967,8 @@ public: return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace, I); } - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) override { + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); } bool isNumRegsMajorCostOfLSR() override { @@ -1953,6 +2000,10 @@ public: bool isLegalNTLoad(Type *DataType, Align Alignment) override { return Impl.isLegalNTLoad(DataType, Alignment); } + bool isLegalBroadcastLoad(Type *ElementTy, + ElementCount NumElements) const override { + return Impl.isLegalBroadcastLoad(ElementTy, NumElements); + } bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { return Impl.isLegalMaskedScatter(DataType, Alignment); } @@ -1973,6 +2024,10 @@ public: bool isLegalMaskedExpandLoad(Type *DataType) override { return Impl.isLegalMaskedExpandLoad(DataType); } + bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask) const override { + return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask); + } bool enableOrderedReductions() override { return Impl.enableOrderedReductions(); } @@ -2001,7 +2056,7 @@ public: } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } - InstructionCost getRegUsageForType(Type *Ty) override { + unsigned getRegUsageForType(Type *Ty) override { return Impl.getRegUsageForType(Ty); } bool shouldBuildLookupTables() override { @@ -2032,6 +2087,8 @@ public: return Impl.supportsEfficientVectorElementLoadStore(); } + bool supportsTailCalls() override { return Impl.supportsTailCalls(); } + bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } @@ -2108,8 +2165,9 @@ public: Optional getVScaleForTuning() const override { return Impl.getVScaleForTuning(); } - bool shouldMaximizeVectorBandwidth() const override { - return Impl.shouldMaximizeVectorBandwidth(); + bool shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const override { + return 
Impl.shouldMaximizeVectorBandwidth(K); } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const override { @@ -2118,6 +2176,10 @@ public: unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override { return Impl.getMaximumVF(ElemWidth, Opcode); } + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const override { + return Impl.getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( @@ -2180,8 +2242,9 @@ public: } InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp) override { - return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp); + VectorType *SubTp, + ArrayRef Args) override { + return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args); } InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, @@ -2298,20 +2361,22 @@ public: Type *ExpectedType) override { return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); } - Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, - unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const override { + Type *getMemcpyLoopLoweringType( + LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, + unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const override { return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace, - DestAddrSpace, SrcAlign, DestAlign); + DestAddrSpace, SrcAlign, DestAlign, + AtomicElementSize); } void getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const override { + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const override { Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, - SrcAlign, DestAlign); + SrcAlign, DestAlign, AtomicCpySize); } bool areInlineCompatible(const Function *Caller, const Function *Callee) const override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index a32744f8d58b..a70c418974f5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -18,18 +18,16 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/Type.h" #include -using namespace llvm::PatternMatch; - namespace llvm { +class Function; + /// Base class for use as a mix-in that aids implementing /// a TargetTransformInfo-compatible class. 
class TargetTransformInfoImplBase { @@ -212,7 +210,7 @@ public: return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1); } - bool isLSRCostLess(TTI::LSRCost &C1, TTI::LSRCost &C2) const { + bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const { return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds, @@ -258,6 +256,10 @@ public: return Alignment >= DataSize && isPowerOf2_32(DataSize); } + bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const { + return false; + } + bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { return false; } @@ -277,6 +279,11 @@ public: bool isLegalMaskedCompressStore(Type *DataType) const { return false; } + bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask) const { + return false; + } + bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } bool enableOrderedReductions() const { return false; } @@ -310,7 +317,7 @@ public: bool isTypeLegal(Type *Ty) const { return false; } - InstructionCost getRegUsageForType(Type *Ty) const { return 1; } + unsigned getRegUsageForType(Type *Ty) const { return 1; } bool shouldBuildLookupTables() const { return true; } @@ -333,6 +340,8 @@ public: bool supportsEfficientVectorElementLoadStore() const { return false; } + bool supportsTailCalls() const { return true; } + bool enableAggressiveInterleaving(bool LoopHasReductions) const { return false; } @@ -415,13 +424,17 @@ public: Optional getMaxVScale() const { return None; } Optional getVScaleForTuning() const { return None; } - bool shouldMaximizeVectorBandwidth() const { return false; } + bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { + return false; + } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { return ElementCount::get(0, IsScalable); } unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; } + unsigned getStoreMinimumVF(unsigned VF, Type *, Type *) const { return VF; } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { @@ -490,7 +503,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, int Index, - VectorType *SubTp) const { + VectorType *SubTp, + ArrayRef Args = None) const { return 1; } @@ -697,16 +711,21 @@ public: Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const { - return Type::getInt8Ty(Context); + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const { + return AtomicElementSize ? Type::getIntNTy(Context, *AtomicElementSize * 8) + : Type::getInt8Ty(Context); } void getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const { - for (unsigned i = 0; i != RemainingBytes; ++i) - OpsOut.push_back(Type::getInt8Ty(Context)); + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const { + unsigned OpSizeInBytes = AtomicCpySize ? 
*AtomicCpySize : 1; + Type *OpType = Type::getIntNTy(Context, OpSizeInBytes * 8); + for (unsigned i = 0; i != RemainingBytes; i += OpSizeInBytes) + OpsOut.push_back(OpType); } bool areInlineCompatible(const Function *Caller, @@ -960,6 +979,8 @@ public: InstructionCost getUserCost(const User *U, ArrayRef Operands, TTI::TargetCostKind CostKind) { + using namespace llvm::PatternMatch; + auto *TargetTTI = static_cast(this); // Handle non-intrinsic calls, invokes, and callbr. // FIXME: Unlikely to be true for anything but CodeSize. @@ -976,8 +997,6 @@ public: } Type *Ty = U->getType(); - Type *OpTy = - U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr; unsigned Opcode = Operator::getOpcode(U); auto *I = dyn_cast(U); switch (Opcode) { @@ -1049,9 +1068,11 @@ public: case Instruction::FPExt: case Instruction::SExt: case Instruction::ZExt: - case Instruction::AddrSpaceCast: + case Instruction::AddrSpaceCast: { + Type *OpTy = U->getOperand(0)->getType(); return TargetTTI->getCastInstrCost( Opcode, Ty, OpTy, TTI::getCastContextHint(I), CostKind, I); + } case Instruction::Store: { auto *SI = cast(U); Type *ValTy = U->getOperand(0)->getType(); @@ -1137,13 +1158,14 @@ public: if (Shuffle->isExtractSubvectorMask(SubIndex)) return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy, Shuffle->getShuffleMask(), SubIndex, - VecTy); + VecTy, Operands); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), SubIndex, - FixedVectorType::get(VecTy->getScalarType(), NumSubElts)); + FixedVectorType::get(VecTy->getScalarType(), NumSubElts), + Operands); int ReplicationFactor, VF; if (Shuffle->isReplicationMask(ReplicationFactor, VF)) { @@ -1166,31 +1188,37 @@ public: if (Shuffle->isReverse()) return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); if (Shuffle->isSelect()) return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); if (Shuffle->isTranspose()) return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); if (Shuffle->isZeroEltSplat()) return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); if (Shuffle->isSingleSource()) return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), SubIndex, - FixedVectorType::get(VecTy->getScalarType(), NumSubElts)); + FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands); return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, - Shuffle->getShuffleMask(), 0, nullptr); + Shuffle->getShuffleMask(), 0, nullptr, + Operands); } case Instruction::ExtractElement: { auto *EEI = dyn_cast(U); diff --git a/llvm/include/llvm/Analysis/TensorSpec.h b/llvm/include/llvm/Analysis/TensorSpec.h new file mode 100644 index 000000000000..382ab3f10445 --- /dev/null +++ b/llvm/include/llvm/Analysis/TensorSpec.h @@ -0,0 +1,132 @@ +//===- TensorSpec.h - type descriptor for a tensor --------------*- C++ -*-===// +// +// Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_ANALYSIS_TENSORSPEC_H +#define LLVM_ANALYSIS_TENSORSPEC_H + +#include "llvm/Config/llvm-config.h" + +#include "llvm/ADT/StringMap.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/JSON.h" + +#include +#include + +namespace llvm { +/// TensorSpec encapsulates the specification of a tensor: its dimensions, or +/// "shape" (row-major), its type (see TensorSpec::getDataType specializations +/// for supported types), its name and port (see "TensorFlow: Large-Scale +/// Machine Learning on Heterogeneous Distributed Systems", section 4.2, para 2: +/// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) +/// +/// Known tensor types. The left part is the C type, the right is a name we +/// can use to identify the type (to implement TensorSpec equality checks), and +/// to use, if needed, when mapping to an underlying evaluator's type system. +/// The main requirement is that the C type we use has the same size and +/// encoding (e.g. endian-ness) as the one used by the evaluator. +#define SUPPORTED_TENSOR_TYPES(M) \ + M(float, Float) \ + M(double, Double) \ + M(int8_t, Int8) \ + M(uint8_t, UInt8) \ + M(int16_t, Int16) \ + M(uint16_t, UInt16) \ + M(int32_t, Int32) \ + M(uint32_t, UInt32) \ + M(int64_t, Int64) \ + M(uint64_t, UInt64) + +enum class TensorType { + Invalid, +#define _TENSOR_TYPE_ENUM_MEMBERS(_, Name) Name, + SUPPORTED_TENSOR_TYPES(_TENSOR_TYPE_ENUM_MEMBERS) +#undef _TENSOR_TYPE_ENUM_MEMBERS +}; + +class TensorSpec final { +public: + template + static TensorSpec createSpec(const std::string &Name, + const std::vector &Shape, + int Port = 0) { + return TensorSpec(Name, Port, getDataType(), sizeof(T), Shape); + } + + const std::string &name() const { return Name; } + int port() const { return Port; } + TensorType type() const { return Type; } + const std::vector &shape() const { return Shape; } + + bool operator==(const TensorSpec &Other) const { + return Name == Other.Name && Port == Other.Port && Type == Other.Type && + Shape == Other.Shape; + } + + bool operator!=(const TensorSpec &Other) const { return !(*this == Other); } + + /// Get the number of elements in a tensor with this shape. + size_t getElementCount() const { return ElementCount; } + /// Get the size, in bytes, of one element. + size_t getElementByteSize() const { return ElementSize; } + /// Get the total size of a memory buffer needed to store the whole tensor. + size_t getTotalTensorBufferSize() const { return ElementCount * ElementSize; } + + template bool isElementType() const { + return getDataType() == Type; + } + +private: + TensorSpec(const std::string &Name, int Port, TensorType Type, + size_t ElementSize, const std::vector &Shape); + + template static TensorType getDataType(); + + std::string Name; + int Port = 0; + TensorType Type = TensorType::Invalid; + std::vector Shape; + size_t ElementCount = 0; + size_t ElementSize = 0; +}; + +/// Construct a TensorSpec from a JSON dictionary of the form: +/// { "name": , +/// "port": , +/// "type": , +/// "shape": } +/// For the "type" field, see the C++ primitive types used in +/// TFUTILS_SUPPORTED_TYPES. 
+Optional getTensorSpecFromJSON(LLVMContext &Ctx, + const json::Value &Value); + +struct LoggedFeatureSpec { + TensorSpec Spec; + Optional LoggingName; + const std::string &getLoggingName() const { + return LoggingName ? *LoggingName : Spec.name(); + } +}; + +/// Load the output specs. If SpecFileOverride is not empty, that path is used. +/// Otherwise, the file is assumed to be called 'output_spec.json' and be found +/// under ModelPath (the model directory). +/// The first output tensor name must match ExpectedDecisionName. +/// In case of error, the return is None and the error is logged. +Optional> +loadOutputSpecs(LLVMContext &Ctx, StringRef ExpectedDecisionName, + StringRef ModelPath, StringRef SpecFileOverride = StringRef()); + +#define TFUTILS_GETDATATYPE_DEF(T, Name) \ + template <> TensorType TensorSpec::getDataType(); +SUPPORTED_TENSOR_TYPES(TFUTILS_GETDATATYPE_DEF) + +#undef TFUTILS_GETDATATYPE_DEF +} // namespace llvm + +#endif // LLVM_ANALYSIS_TENSORSPEC_H diff --git a/llvm/include/llvm/Analysis/TypeMetadataUtils.h b/llvm/include/llvm/Analysis/TypeMetadataUtils.h index 074c40942b06..dab67aad1ab0 100644 --- a/llvm/include/llvm/Analysis/TypeMetadataUtils.h +++ b/llvm/include/llvm/Analysis/TypeMetadataUtils.h @@ -14,11 +14,11 @@ #ifndef LLVM_ANALYSIS_TYPEMETADATAUTILS_H #define LLVM_ANALYSIS_TYPEMETADATAUTILS_H -#include "llvm/ADT/SmallVector.h" #include namespace llvm { +template class SmallVectorImpl; class CallBase; class CallInst; class Constant; diff --git a/llvm/include/llvm/Analysis/Utils/TFUtils.h b/llvm/include/llvm/Analysis/Utils/TFUtils.h index 785b9fe949a5..372c35863f3f 100644 --- a/llvm/include/llvm/Analysis/Utils/TFUtils.h +++ b/llvm/include/llvm/Analysis/Utils/TFUtils.h @@ -13,6 +13,7 @@ #ifdef LLVM_HAVE_TF_API #include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/TensorSpec.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/JSON.h" @@ -38,86 +39,6 @@ namespace llvm { class TFModelEvaluatorImpl; class EvaluationResultImpl; -/// TensorSpec encapsulates the specification of a tensor: its dimensions, or -/// "shape" (row-major), its type (see TensorSpec::getDataType specializations -/// for supported types), its name and port (see "TensorFlow: Large-Scale -/// Machine Learning on Heterogeneous Distributed Systems", section 4.2, para 2: -/// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) -/// -/// TensorSpec is used to set up a TFModelEvaluator by describing the expected -/// inputs and outputs. -class TensorSpec final { -public: - template - static TensorSpec createSpec(const std::string &Name, - const std::vector &Shape, - int Port = 0) { - return TensorSpec(Name, Port, getDataType(), Shape); - } - - const std::string &name() const { return Name; } - int port() const { return Port; } - int typeIndex() const { return TypeIndex; } - const std::vector &shape() const { return Shape; } - - bool operator==(const TensorSpec &Other) const { - return Name == Other.Name && Port == Other.Port && - TypeIndex == Other.TypeIndex && Shape == Other.Shape; - } - - bool operator!=(const TensorSpec &Other) const { return !(*this == Other); } - - /// Get the number of elements in a tensor with this shape. - size_t getElementCount() const { return ElementCount; } - /// Get the size, in bytes, of one element. 
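With TensorSpec now hoisted into its own header above, the API reads naturally in isolation. A small worked example with a made-up tensor name and shape:

    #include "llvm/Analysis/TensorSpec.h"
    #include <cassert>

    using namespace llvm;

    void tensorSpecExample() {
      // A 2x3 float tensor on the default port 0.
      TensorSpec Spec = TensorSpec::createSpec<float>("input_features", {2, 3});
      assert(Spec.getElementCount() == 6);
      assert(Spec.getElementByteSize() == sizeof(float));
      // Total buffer: 6 elements * 4 bytes = 24 bytes.
      assert(Spec.getTotalTensorBufferSize() == 24);
      assert(Spec.isElementType<float>());
    }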
- size_t getElementByteSize() const; - - template bool isElementType() const { - return getDataType() == TypeIndex; - } - -private: - TensorSpec(const std::string &Name, int Port, int TypeIndex, - const std::vector &Shape); - - template static int getDataType() { - llvm_unreachable("Undefined tensor type"); - } - - std::string Name; - int Port = 0; - int TypeIndex = 0; - std::vector Shape; - size_t ElementCount = 0; -}; - -/// Construct a TensorSpec from a JSON dictionary of the form: -/// { "name": , -/// "port": , -/// "type": , -/// "shape": } -/// For the "type" field, see the C++ primitive types used in -/// TFUTILS_SUPPORTED_TYPES. -Optional getTensorSpecFromJSON(LLVMContext &Ctx, - const json::Value &Value); - -struct LoggedFeatureSpec { - TensorSpec Spec; - Optional LoggingName; - const std::string &getLoggingName() const { - return LoggingName ? *LoggingName : Spec.name(); - } -}; - -/// Load the output specs. If SpecFileOverride is not empty, that path is used. -/// Otherwise, the file is assumed to be called 'output_spec.json' and be found -/// under ModelPath (the model directory). -/// The first output tensor name must match ExpectedDecisionName. -/// In case of error, the return is None and the error is logged. -Optional> -loadOutputSpecs(LLVMContext &Ctx, StringRef ExpectedDecisionName, - StringRef ModelPath, StringRef SpecFileOverride = StringRef()); - /// Logging utility - given an ordered specification of features, and assuming /// a scalar reward, allow logging feature values and rewards, and then print /// as tf.train.SequenceExample text protobuf. @@ -262,27 +183,6 @@ private: std::unique_ptr Impl; }; -/// List of supported types, as a pair: -/// - C++ type -/// - enum name (implementation-specific) -#define TFUTILS_SUPPORTED_TYPES(M) \ - M(float, TF_FLOAT) \ - M(double, TF_DOUBLE) \ - M(int8_t, TF_INT8) \ - M(uint8_t, TF_UINT8) \ - M(int16_t, TF_INT16) \ - M(uint16_t, TF_UINT16) \ - M(int32_t, TF_INT32) \ - M(uint32_t, TF_UINT32) \ - M(int64_t, TF_INT64) \ - M(uint64_t, TF_UINT64) - -#define TFUTILS_GETDATATYPE_DEF(T, E) \ - template <> int TensorSpec::getDataType(); - -TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_DEF) - -#undef TFUTILS_GETDATATYPE_DEF } // namespace llvm #endif // LLVM_HAVE_TF_API diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h index 1b32fca50697..bc6b279e9ed5 100644 --- a/llvm/include/llvm/Analysis/ValueLattice.h +++ b/llvm/include/llvm/Analysis/ValueLattice.h @@ -9,16 +9,18 @@ #ifndef LLVM_ANALYSIS_VALUELATTICE_H #define LLVM_ANALYSIS_VALUELATTICE_H -#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Instructions.h" -// + //===----------------------------------------------------------------------===// // ValueLatticeElement //===----------------------------------------------------------------------===// namespace llvm { +class Constant; + /// This class represents lattice values for constants. 
 ///
 /// FIXME: This is basically just for bringup, this can be made a lot more rich
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 5b39b0244339..3b29bf1d53b4 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -21,12 +21,12 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
 #include <cassert>
 #include <cstdint>
 
 namespace llvm {
 
+class Operator;
 class AddOperator;
 class AllocaInst;
 class APInt;
@@ -463,15 +463,37 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
                                      const DominatorTree *DT = nullptr,
                                      const TargetLibraryInfo *TLI = nullptr);
 
+  /// This returns the same result as isSafeToSpeculativelyExecute if Opcode is
+  /// the actual opcode of Inst. If the provided and actual opcode differ, the
+  /// function (virtually) overrides the opcode of Inst with the provided
+  /// Opcode. There are some constraints in this case:
+  /// * If Opcode has a fixed number of operands (e.g., as binary operators
+  ///   do), then Inst has to have at least as many leading operands. The
+  ///   function will ignore all trailing operands beyond that number.
+  /// * If Opcode allows for an arbitrary number of operands (e.g., as
+  ///   CallInsts do), then all operands are considered.
+  /// * The virtual instruction has to satisfy all typing rules of the provided
+  ///   Opcode.
+  /// * This function is pessimistic in the following sense: If one actually
+  ///   materialized the virtual instruction, then isSafeToSpeculativelyExecute
+  ///   may say that the materialized instruction is speculatable whereas this
+  ///   function may have said that the instruction wouldn't be speculatable.
+  ///   This behavior is a shortcoming in the current implementation and not
+  ///   intentional.
+  bool isSafeToSpeculativelyExecuteWithOpcode(
+      unsigned Opcode, const Operator *Inst, const Instruction *CtxI = nullptr,
+      const DominatorTree *DT = nullptr,
+      const TargetLibraryInfo *TLI = nullptr);
+
   /// Returns true if the result or effects of the given instructions \p I
-  /// depend on or influence global memory.
-  /// Memory dependence arises for example if the instruction reads from
-  /// memory or may produce effects or undefined behaviour. Memory dependent
-  /// instructions generally cannot be reorderd with respect to other memory
-  /// dependent instructions or moved into non-dominated basic blocks.
-  /// Instructions which just compute a value based on the values of their
-  /// operands are not memory dependent.
-  bool mayBeMemoryDependent(const Instruction &I);
+  /// depend on values not reachable through the def-use graph.
+  /// * Memory dependence arises for example if the instruction reads from
+  ///   memory or may produce effects or undefined behaviour. Memory-dependent
+  ///   instructions generally cannot be reordered with respect to other
+  ///   memory-dependent instructions.
+  /// * Control dependence arises for example if the instruction may fault
+  ///   if lifted above a throwing call or infinite loop.
+  bool mayHaveNonDefUseDependency(const Instruction &I);
 
   /// Return true if it is an intrinsic that cannot be speculated but also
   /// cannot trap.
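As an editorial illustration of the two queries above (a sketch under the documented semantics, not code from the patch): a transform could use mayHaveNonDefUseDependency as a cheap gate before hoisting, and the WithOpcode variant to ask about an instruction it has not materialized yet. The helper names are invented.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Hypothetical hoisting guard: an instruction that only depends on its
// operands (no memory or control dependence) can be moved freely; whether
// it may then be *speculated* is a separate question.
static bool canHoist(Instruction &I) {
  if (mayHaveNonDefUseDependency(I))
    return false;
  return isSafeToSpeculativelyExecute(&I);
}

// Asking "would this add still be speculatable as a udiv?" without
// materializing the udiv. Division is unsafe when the divisor may be zero,
// so this typically returns false; both operands of the add are reused as
// the virtual instruction's leading operands, per the constraints above.
static bool wouldUDivBeSafe(BinaryOperator &Add) {
  return isSafeToSpeculativelyExecuteWithOpcode(Instruction::UDiv,
                                                cast<Operator>(&Add));
}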
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 751c88a4ecbb..0005874ba040 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -236,7 +236,7 @@ class VFDatabase { // ensuring that the variant described in the attribute has a // corresponding definition or declaration of the vector // function in the Module M. - if (Shape.hasValue() && (Shape.getValue().ScalarName == ScalarName)) { + if (Shape && (Shape.getValue().ScalarName == ScalarName)) { assert(CI.getModule()->getFunction(Shape.getValue().VectorName) && "Vector function is missing."); Mappings.push_back(Shape.getValue()); @@ -309,16 +309,16 @@ inline Type *ToVectorTy(Type *Scalar, unsigned VF) { /// Identify if the intrinsic is trivially vectorizable. /// This method returns true if the intrinsic's argument types are all scalars /// for the scalar form of the intrinsic and all vectors (or scalars handled by -/// hasVectorInstrinsicScalarOpd) for the vector form of the intrinsic. +/// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic. bool isTriviallyVectorizable(Intrinsic::ID ID); /// Identifies if the vector form of the intrinsic has a scalar operand. -bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx); +bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx); -/// Identifies if the vector form of the intrinsic has a scalar operand that has +/// Identifies if the vector form of the intrinsic has a operand that has /// an overloaded type. -bool hasVectorInstrinsicOverloadedScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx); +bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx); /// Returns intrinsic ID for call. /// For the input call instruction it finds mapping intrinsic and returns @@ -398,6 +398,24 @@ void narrowShuffleMaskElts(int Scale, ArrayRef Mask, bool widenShuffleMaskElts(int Scale, ArrayRef Mask, SmallVectorImpl &ScaledMask); +/// Splits and processes shuffle mask depending on the number of input and +/// output registers. The function does 2 main things: 1) splits the +/// source/destination vectors into real registers; 2) do the mask analysis to +/// identify which real registers are permuted. Then the function processes +/// resulting registers mask using provided action items. If no input register +/// is defined, \p NoInputAction action is used. If only 1 input register is +/// used, \p SingleInputAction is used, otherwise \p ManyInputsAction is used to +/// process > 2 input registers and masks. +/// \param Mask Original shuffle mask. +/// \param NumOfSrcRegs Number of source registers. +/// \param NumOfDestRegs Number of destination registers. +/// \param NumOfUsedRegs Number of actually used destination registers. +void processShuffleMasks( + ArrayRef Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, + unsigned NumOfUsedRegs, function_ref NoInputAction, + function_ref, unsigned, unsigned)> SingleInputAction, + function_ref, unsigned, unsigned)> ManyInputsAction); + /// Compute a map of integer instructions to their minimum legal type /// size. 
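An editorial sketch of driving processShuffleMasks, assuming the callback types used upstream, function_ref<void()> for NoInputAction and function_ref<void(ArrayRef<int>, unsigned, unsigned)> for the other two (an assumption, since the declaration above is abbreviated). The register counts and counters are illustrative only.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

// Hypothetical: classify how a 16-lane shuffle mask maps 2 source registers
// (8 lanes each) onto 2 destination registers.
static void classifyMask(ArrayRef<int> Mask) {
  unsigned Untouched = 0, SingleSrc = 0, Blended = 0;
  processShuffleMasks(
      Mask, /*NumOfSrcRegs=*/2, /*NumOfDestRegs=*/2, /*NumOfUsedRegs=*/2,
      /*NoInputAction=*/[&]() { ++Untouched; },
      /*SingleInputAction=*/
      [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
        ++SingleSrc; // This destination is a permute of one source register.
      },
      /*ManyInputsAction=*/
      [&](ArrayRef<int> RegMask, unsigned Reg1, unsigned Reg2) {
        ++Blended; // This destination mixes lanes from several sources.
      });
}

The point of the split is that a backend can cost or emit one in-register permute per SingleInputAction call and a multi-register blend per ManyInputsAction call, instead of reasoning about the flat mask.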
/// diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h index c30165e4a97b..7bcb33f18768 100644 --- a/llvm/include/llvm/AsmParser/LLLexer.h +++ b/llvm/include/llvm/AsmParser/LLLexer.h @@ -37,7 +37,7 @@ namespace llvm { lltok::Kind CurKind; std::string StrVal; unsigned UIntVal; - Type *TyVal; + Type *TyVal = nullptr; APFloat APFloatVal; APSInt APSIntVal; diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index 62af3afbc142..3389475b2c9a 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -14,18 +14,25 @@ #define LLVM_ASMPARSER_LLPARSER_H #include "LLLexer.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/FMF.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/IR/Operator.h" -#include "llvm/IR/Type.h" #include namespace llvm { class Module; + class ConstantRange; + class FunctionType; + class GlobalObject; + class SMDiagnostic; + class SMLoc; + class SourceMgr; + class Type; + struct MaybeAlign; + template class Optional; class Function; class Value; class BasicBlock; @@ -88,6 +95,8 @@ namespace llvm { typedef LLLexer::LocTy LocTy; private: LLVMContext &Context; + // Lexer to determine whether to use opaque pointers or not. + LLLexer OPLex; LLLexer Lex; // Module being parsed, null if we are only parsing summary index. Module *M; @@ -150,8 +159,9 @@ namespace llvm { LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M, ModuleSummaryIndex *Index, LLVMContext &Context, SlotMapping *Slots = nullptr) - : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index), - Slots(Slots), BlockAddressPFS(nullptr) {} + : Context(Context), OPLex(F, SM, Err, Context), + Lex(F, SM, Err, Context), M(M), Index(Index), Slots(Slots), + BlockAddressPFS(nullptr) {} bool Run( bool UpgradeDebugInfo, DataLayoutCallbackTy DataLayoutCallback = [](StringRef) { return None; }); @@ -263,6 +273,8 @@ namespace llvm { bool parseOptionalAlignment(MaybeAlign &Alignment, bool AllowParens = false); bool parseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes); + bool parseOptionalUWTableKind(UWTableKind &Kind); + bool parseAllocKind(AllocFnKind &Kind); bool parseScopeAndOrdering(bool IsAtomic, SyncScope::ID &SSID, AtomicOrdering &Ordering); bool parseScope(SyncScope::ID &SSID); @@ -503,6 +515,7 @@ namespace llvm { bool parseGlobalValueVector(SmallVectorImpl &Elts, Optional *InRangeOp = nullptr); bool parseOptionalComdat(StringRef GlobalName, Comdat *&C); + bool parseSanitizer(GlobalVariable *GV); bool parseMetadataAsValue(Value *&V, PerFunctionState &PFS); bool parseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg, PerFunctionState *PFS); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 78ebb35e0ea4..230a1662cc04 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -88,7 +88,6 @@ enum Kind { kw_triple, kw_source_filename, kw_unwind, - kw_deplibs, // FIXME: Remove in 4.0 kw_datalayout, kw_volatile, kw_atomic, @@ -112,7 +111,6 @@ enum Kind { kw_exact, kw_inbounds, kw_inrange, - kw_align, kw_addrspace, kw_section, kw_partition, @@ -121,7 +119,6 @@ enum Kind { kw_module, kw_asm, kw_sideeffect, - kw_alignstack, kw_inteldialect, kw_gc, kw_prefix, @@ -177,81 +174,12 @@ enum Kind { // Attributes: kw_attributes, - kw_allocsize, - kw_alwaysinline, - 
kw_argmemonly, - kw_sanitize_address, - kw_sanitize_hwaddress, - kw_sanitize_memtag, - kw_builtin, - kw_byval, - kw_inalloca, - kw_cold, - kw_convergent, - kw_dereferenceable, - kw_dereferenceable_or_null, - kw_disable_sanitizer_instrumentation, - kw_elementtype, - kw_inaccessiblememonly, - kw_inaccessiblemem_or_argmemonly, - kw_inlinehint, - kw_inreg, - kw_jumptable, - kw_minsize, - kw_naked, - kw_nest, - kw_noalias, - kw_noundef, - kw_nobuiltin, - kw_nocallback, - kw_nocapture, - kw_noduplicate, - kw_nofree, - kw_noimplicitfloat, - kw_noinline, - kw_norecurse, - kw_nonlazybind, - kw_nomerge, - kw_nonnull, - kw_noprofile, - kw_noredzone, - kw_noreturn, - kw_nosync, - kw_nocf_check, - kw_nounwind, - kw_nosanitize_coverage, - kw_null_pointer_is_valid, - kw_optforfuzzing, - kw_optnone, - kw_optsize, - kw_preallocated, - kw_readnone, - kw_readonly, - kw_returned, - kw_returns_twice, - kw_signext, - kw_speculatable, - kw_ssp, - kw_sspreq, - kw_sspstrong, - kw_safestack, - kw_shadowcallstack, - kw_sret, - kw_sanitize_thread, - kw_sanitize_memory, - kw_speculative_load_hardening, - kw_strictfp, - kw_swifterror, - kw_swiftself, - kw_swiftasync, - kw_uwtable, - kw_vscale_range, - kw_willreturn, - kw_writeonly, - kw_zeroext, - kw_immarg, - kw_byref, - kw_mustprogress, + kw_sync, + kw_async, +#define GET_ATTR_NAMES +#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) \ + kw_##DISPLAY_NAME, +#include "llvm/IR/Attributes.inc" kw_type, kw_opaque, @@ -415,7 +343,6 @@ enum Kind { kw_param, kw_hotness, kw_unknown, - kw_hot, kw_critical, kw_relbf, kw_variable, @@ -464,6 +391,19 @@ enum Kind { kw_bit, kw_varFlags, + // GV's with __attribute__((no_sanitize("address"))), or things in + // -fsanitize-ignorelist when built with ASan. + kw_no_sanitize_address, + // GV's with __attribute__((no_sanitize("hwaddress"))), or things in + // -fsanitize-ignorelist when built with HWASan. + kw_no_sanitize_hwaddress, + // GV's with __attribute__((no_sanitize("memtag"))), or things in + // -fsanitize-ignorelist when built with memory tagging. + kw_no_sanitize_memtag, + // GV's where the clang++ frontend (when ASan is used) notes that this is + // dynamically initialized, and thus needs ODR detection. + kw_sanitize_address_dyninit, + // Unsigned Valued tokens (UIntVal). LabelID, // 42: GlobalID, // @42 diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h index e1c7f746a335..6710ae6e358d 100644 --- a/llvm/include/llvm/AsmParser/Parser.h +++ b/llvm/include/llvm/AsmParser/Parser.h @@ -13,7 +13,9 @@ #ifndef LLVM_ASMPARSER_PARSER_H #define LLVM_ASMPARSER_PARSER_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLForwardCompat.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringRef.h" #include diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index e7dde986784f..fb563ff198ef 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -24,7 +24,6 @@ #include "llvm/Support/DataTypes.h" #include -#include namespace llvm { namespace COFF { @@ -731,6 +730,10 @@ inline bool isReservedSectionNumber(int32_t SectionNumber) { return SectionNumber <= 0; } +/// Encode section name based on string table offset. +/// The size of Out must be at least COFF::NameSize. +bool encodeSectionName(char *Out, uint64_t Offset); + } // End namespace COFF. } // End namespace llvm. 
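For encodeSectionName just above: COFF section names longer than eight bytes live in the string table, and the 8-byte header field holds a reference to their offset. A hedged sketch of a call site follows; the "/decimal" versus "//base64" encodings mentioned in the comment are the usual COFF convention for small and large offsets, inferred rather than restated by this header.

#include "llvm/BinaryFormat/COFF.h"
#include <cstring>

using namespace llvm;

// Hypothetical call site: Offset is where the long name was placed in the
// COFF string table. On success, Name holds e.g. "/1234567" (or a "//"
// base64 form for offsets too large for seven decimal digits).
static void setHeaderName(char (&Name)[COFF::NameSize], uint64_t Offset) {
  if (!COFF::encodeSectionName(Name, Offset))
    std::memset(Name, 0, COFF::NameSize); // Offset not representable.
}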
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h new file mode 100644 index 000000000000..9e912c7bd4ba --- /dev/null +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -0,0 +1,131 @@ +//===-- llvm/BinaryFormat/DXContainer.h - The DXBC file format --*- C++/-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines manifest constants for the DXContainer object file format. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BINARYFORMAT_DXCONTAINER_H +#define LLVM_BINARYFORMAT_DXCONTAINER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/SwapByteOrder.h" + +#include + +namespace llvm { + +// The DXContainer file format is arranged as a header and "parts". Semantically +// parts are similar to sections in other object file formats. The File format +// structure is roughly: + +// ┌────────────────────────────────┐ +// │ Header │ +// ├────────────────────────────────┤ +// │ Part │ +// ├────────────────────────────────┤ +// │ Part │ +// ├────────────────────────────────┤ +// │ ... │ +// └────────────────────────────────┘ + +namespace dxbc { + +struct Hash { + uint8_t Digest[16]; +}; + +enum class HashFlags : uint32_t { + None = 0, // No flags defined. + IncludesSource = 1, // This flag indicates that the shader hash was computed + // taking into account source information (-Zss) +}; + +struct ShaderHash { + uint32_t Flags; // DxilShaderHashFlags + uint8_t Digest[16]; + + void swapBytes() { sys::swapByteOrder(Flags); } +}; + +struct ContainerVersion { + uint16_t Major; + uint16_t Minor; + + void swapBytes() { + sys::swapByteOrder(Major); + sys::swapByteOrder(Minor); + } +}; + +struct Header { + uint8_t Magic[4]; // "DXBC" + Hash FileHash; + ContainerVersion Version; + uint32_t FileSize; + uint32_t PartCount; + + void swapBytes() { + Version.swapBytes(); + sys::swapByteOrder(FileSize); + sys::swapByteOrder(PartCount); + } + // Structure is followed by part offsets: uint32_t PartOffset[PartCount]; + // The offset is to a PartHeader, which is followed by the Part Data. +}; + +/// Use this type to describe the size and type of a DXIL container part. +struct PartHeader { + uint8_t Name[4]; + uint32_t Size; + + void swapBytes() { sys::swapByteOrder(Size); } + StringRef getName() const { + return StringRef(reinterpret_cast(&Name[0]), 4); + } + // Structure is followed directly by part data: uint8_t PartData[PartSize]. +}; + +struct BitcodeHeader { + uint8_t Magic[4]; // ACSII "DXIL". + uint8_t MajorVersion; // DXIL version. + uint8_t MinorVersion; // DXIL version. + uint16_t Unused; + uint32_t Offset; // Offset to LLVM bitcode (from start of header). + uint32_t Size; // Size of LLVM bitcode (in bytes). + // Followed by uint8_t[BitcodeHeader.Size] at &BitcodeHeader + Header.Offset + + void swapBytes() { + sys::swapByteOrder(MinorVersion); + sys::swapByteOrder(MajorVersion); + sys::swapByteOrder(Offset); + sys::swapByteOrder(Size); + } +}; + +struct ProgramHeader { + uint8_t MinorVersion : 4; + uint8_t MajorVersion : 4; + uint8_t Unused; + uint16_t ShaderKind; + uint32_t Size; // Size in uint32_t words including this header. 
+ BitcodeHeader Bitcode; + + void swapBytes() { + sys::swapByteOrder(ShaderKind); + sys::swapByteOrder(Size); + Bitcode.swapBytes(); + } +}; + +static_assert(sizeof(ProgramHeader) == 24, "ProgramHeader Size incorrect!"); + +} // namespace dxbc +} // namespace llvm + +#endif // LLVM_BINARYFORMAT_DXCONTAINER_H diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 4473f506d371..e288c5191bdb 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -320,6 +320,10 @@ inline bool isFortran(SourceLanguage S) { return result; } +inline TypeKind getArrayIndexTypeEncoding(SourceLanguage S) { + return isFortran(S) ? DW_ATE_signed : DW_ATE_unsigned; +} + enum CaseSensitivity { // Identifier case codes DW_ID_case_sensitive = 0x00, diff --git a/llvm/include/llvm/BinaryFormat/DynamicTags.def b/llvm/include/llvm/BinaryFormat/DynamicTags.def index 814d8b113ec4..ae25ec53813c 100644 --- a/llvm/include/llvm/BinaryFormat/DynamicTags.def +++ b/llvm/include/llvm/BinaryFormat/DynamicTags.def @@ -209,6 +209,7 @@ MIPS_DYNAMIC_TAG(MIPS_RWPLT, 0x70000034) // Points to the base // of a writable PLT. MIPS_DYNAMIC_TAG(MIPS_RLD_MAP_REL, 0x70000035) // Relative offset of run time loader // map, used for debugging. +MIPS_DYNAMIC_TAG(MIPS_XHASH, 0x70000036) // GNU-style hash table with xlat. // PPC specific dynamic table entries. PPC_DYNAMIC_TAG(PPC_GOT, 0x70000000) // Uses Secure PLT ABI. diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 5d3b1270b538..1e0ef613788d 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -319,6 +319,7 @@ enum { EM_BPF = 247, // Linux kernel bpf virtual machine EM_VE = 251, // NEC SX-Aurora VE EM_CSKY = 252, // C-SKY 32-bit processor + EM_LOONGARCH = 258, // LoongArch }; // Object file classes. @@ -563,6 +564,15 @@ enum : unsigned { EF_MIPS_ARCH = 0xf0000000 // Mask for applying EF_MIPS_ARCH_ variant }; +// MIPS-specific section indexes +enum { + SHN_MIPS_ACOMMON = 0xff00, // Common symbols which are defined and allocated + SHN_MIPS_TEXT = 0xff01, // Not ABI compliant + SHN_MIPS_DATA = 0xff02, // Not ABI compliant + SHN_MIPS_SCOMMON = 0xff03, // Common symbols for global data area + SHN_MIPS_SUNDEFINED = 0xff04 // Undefined symbols for global data area +}; + // ELF Relocation types for Mips enum { #include "ELFRelocs/Mips.def" @@ -753,16 +763,18 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X41 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X44 = 0x044, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X45 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_RESERVED_0X45, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. 
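Tying together the DXContainer structures introduced a few hunks back: the header is followed by PartCount little-endian uint32_t offsets, each locating a PartHeader that is in turn followed by its part data. A sketch of walking the parts over an already-validated buffer; this is an editorial illustration with bounds checks omitted, not the patch's reader.

#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/DXContainer.h"
#include <cstdint>
#include <cstring>

using namespace llvm;

// Hypothetical reader over a trusted little-endian DXContainer buffer.
static void walkParts(const uint8_t *Buf) {
  dxbc::Header H;
  std::memcpy(&H, Buf, sizeof(H));
  if (sys::IsBigEndianHost)
    H.swapBytes(); // The on-disk format is little-endian.
  // The part offset table sits immediately after the header.
  const uint8_t *Offsets = Buf + sizeof(dxbc::Header);
  for (uint32_t I = 0; I < H.PartCount; ++I) {
    uint32_t Offset;
    std::memcpy(&Offset, Offsets + I * sizeof(uint32_t), sizeof(uint32_t));
    if (sys::IsBigEndianHost)
      sys::swapByteOrder(Offset);
    dxbc::PartHeader PH;
    std::memcpy(&PH, Buf + Offset, sizeof(PH));
    if (sys::IsBigEndianHost)
      PH.swapBytes();
    // PH.getName() is the 4-byte part tag; PH.Size bytes of data follow.
    StringRef Name = PH.getName();
    (void)Name;
  }
}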
@@ -865,12 +877,34 @@ enum { #include "ELFRelocs/VE.def" }; +// CSKY Specific e_flags +enum : unsigned { + EF_CSKY_801 = 0xa, + EF_CSKY_802 = 0x10, + EF_CSKY_803 = 0x9, + EF_CSKY_805 = 0x11, + EF_CSKY_807 = 0x6, + EF_CSKY_810 = 0x8, + EF_CSKY_860 = 0xb, + EF_CSKY_800 = 0x1f, + EF_CSKY_FLOAT = 0x2000, + EF_CSKY_DSP = 0x4000, + EF_CSKY_ABIV2 = 0x20000000, + EF_CSKY_EFV1 = 0x1000000, + EF_CSKY_EFV2 = 0x2000000, + EF_CSKY_EFV3 = 0x3000000 +}; // ELF Relocation types for CSKY enum { #include "ELFRelocs/CSKY.def" }; +// ELF Relocation types for LoongArch +enum { +#include "ELFRelocs/LoongArch.def" +}; + #undef ELF_RELOC // Section header. @@ -947,12 +981,15 @@ enum : unsigned { SHT_LLVM_ADDRSIG = 0x6fff4c03, // List of address-significant symbols // for safe ICF. SHT_LLVM_DEPENDENT_LIBRARIES = - 0x6fff4c04, // LLVM Dependent Library Specifiers. - SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. - SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. - SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. - SHT_LLVM_BB_ADDR_MAP = 0x6fff4c08, // LLVM Basic Block Address Map. + 0x6fff4c04, // LLVM Dependent Library Specifiers. + SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. + SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. + SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. + SHT_LLVM_BB_ADDR_MAP_V0 = + 0x6fff4c08, // LLVM Basic Block Address Map (old version kept for + // backward-compatibility). SHT_LLVM_CALL_GRAPH_PROFILE = 0x6fff4c09, // LLVM Call Graph Profile. + SHT_LLVM_BB_ADDR_MAP = 0x6fff4c0a, // LLVM Basic Block Address Map. // Android's experimental support for SHT_RELR sections. // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512 SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets. @@ -985,6 +1022,8 @@ enum : unsigned { SHT_RISCV_ATTRIBUTES = 0x70000003U, + SHT_CSKY_ATTRIBUTES = 0x70000001U, + SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type. SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. SHT_HIUSER = 0xffffffff // Highest type reserved for applications. @@ -1036,6 +1075,9 @@ enum : unsigned { SHF_MASKOS = 0x0ff00000, + // Solaris equivalent of SHF_GNU_RETAIN. + SHF_SUNW_NODISCARD = 0x00100000, + // Bits indicating processor-specific flags. SHF_MASKPROC = 0xf0000000, @@ -1329,6 +1371,9 @@ enum { PT_MIPS_RTPROC = 0x70000001, // Runtime procedure table. PT_MIPS_OPTIONS = 0x70000002, // Options segment. PT_MIPS_ABIFLAGS = 0x70000003, // Abiflags segment. + + // RISCV program header types. + PT_RISCV_ATTRIBUTES = 0x70000003, }; // Segment flag bits. @@ -1531,6 +1576,31 @@ enum { NT_GNU_PROPERTY_TYPE_0 = 5, }; +// Android note types. +enum { + NT_ANDROID_TYPE_IDENT = 1, + NT_ANDROID_TYPE_KUSER = 3, + NT_ANDROID_TYPE_MEMTAG = 4, +}; + +// Memory tagging values used in NT_ANDROID_TYPE_MEMTAG notes. +enum { + // Enumeration to determine the tagging mode. In Android-land, 'SYNC' means + // running all threads in MTE Synchronous mode, and 'ASYNC' means to use the + // kernels auto-upgrade feature to allow for either MTE Asynchronous, + // Asymmetric, or Synchronous mode. This allows silicon vendors to specify, on + // a per-cpu basis what 'ASYNC' should mean. Generally, the expectation is + // "pick the most precise mode that's very fast". 
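The NT_MEMTAG_* constants that follow this comment pack the Android MTE policy into a single note word. A hedged decoding sketch (the note-parsing scaffolding is assumed; only the bit layout comes from the enum below):

#include "llvm/BinaryFormat/ELF.h"
#include <cstdint>

using namespace llvm;

// Hypothetical: Desc is the first word of an NT_ANDROID_TYPE_MEMTAG note.
struct MemtagPolicy {
  bool Sync, Async, Heap, Stack;
};

static MemtagPolicy decodeMemtagNote(uint32_t Desc) {
  // The low two bits select the tagging level; two more bits request MTE
  // on the heap and stack respectively.
  uint32_t Level = Desc & ELF::NT_MEMTAG_LEVEL_MASK;
  return {Level == ELF::NT_MEMTAG_LEVEL_SYNC,
          Level == ELF::NT_MEMTAG_LEVEL_ASYNC,
          (Desc & ELF::NT_MEMTAG_HEAP) != 0,
          (Desc & ELF::NT_MEMTAG_STACK) != 0};
}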
+ NT_MEMTAG_LEVEL_NONE = 0, + NT_MEMTAG_LEVEL_ASYNC = 1, + NT_MEMTAG_LEVEL_SYNC = 2, + NT_MEMTAG_LEVEL_MASK = 3, + // Bits indicating whether the loader should prepare for MTE to be enabled on + // the heap and/or stack. + NT_MEMTAG_HEAP = 4, + NT_MEMTAG_STACK = 8, +}; + // Property types used in GNU_PROPERTY_TYPE_0 notes. enum : unsigned { GNU_PROPERTY_STACK_SIZE = 1, diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def new file mode 100644 index 000000000000..8cbfe2fe4235 --- /dev/null +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def @@ -0,0 +1,62 @@ +#ifndef ELF_RELOC +#error "ELF_RELOC must be defined" +#endif + +// These types and values are from the LoongArch ELF psABI which can be found at +// https://github.com/loongson/LoongArch-Documentation +// and these definitions has been adopted by binutils (include/elf/loongarch.h). +// The commit hash (main branch) we reference is: +// 9b3bd9f4a497115913c22f1a2a47863798fbc02a + +ELF_RELOC(R_LARCH_NONE, 0) +ELF_RELOC(R_LARCH_32, 1) +ELF_RELOC(R_LARCH_64, 2) +ELF_RELOC(R_LARCH_RELATIVE, 3) +ELF_RELOC(R_LARCH_COPY, 4) +ELF_RELOC(R_LARCH_JUMP_SLOT, 5) +ELF_RELOC(R_LARCH_TLS_DTPMOD32, 6) +ELF_RELOC(R_LARCH_TLS_DTPMOD64, 7) +ELF_RELOC(R_LARCH_TLS_DTPREL32, 8) +ELF_RELOC(R_LARCH_TLS_DTPREL64, 9) +ELF_RELOC(R_LARCH_TLS_TPREL32, 10) +ELF_RELOC(R_LARCH_TLS_TPREL64, 11) +ELF_RELOC(R_LARCH_IRELATIVE, 12) +ELF_RELOC(R_LARCH_MARK_LA, 20) +ELF_RELOC(R_LARCH_MARK_PCREL, 21) +ELF_RELOC(R_LARCH_SOP_PUSH_PCREL, 22) +ELF_RELOC(R_LARCH_SOP_PUSH_ABSOLUTE, 23) +ELF_RELOC(R_LARCH_SOP_PUSH_DUP, 24) +ELF_RELOC(R_LARCH_SOP_PUSH_GPREL, 25) +ELF_RELOC(R_LARCH_SOP_PUSH_TLS_TPREL, 26) +ELF_RELOC(R_LARCH_SOP_PUSH_TLS_GOT, 27) +ELF_RELOC(R_LARCH_SOP_PUSH_TLS_GD, 28) +ELF_RELOC(R_LARCH_SOP_PUSH_PLT_PCREL, 29) +ELF_RELOC(R_LARCH_SOP_ASSERT, 30) +ELF_RELOC(R_LARCH_SOP_NOT, 31) +ELF_RELOC(R_LARCH_SOP_SUB, 32) +ELF_RELOC(R_LARCH_SOP_SL, 33) +ELF_RELOC(R_LARCH_SOP_SR, 34) +ELF_RELOC(R_LARCH_SOP_ADD, 35) +ELF_RELOC(R_LARCH_SOP_AND, 36) +ELF_RELOC(R_LARCH_SOP_IF_ELSE, 37) +ELF_RELOC(R_LARCH_SOP_POP_32_S_10_5, 38) +ELF_RELOC(R_LARCH_SOP_POP_32_U_10_12, 39) +ELF_RELOC(R_LARCH_SOP_POP_32_S_10_12, 40) +ELF_RELOC(R_LARCH_SOP_POP_32_S_10_16, 41) +ELF_RELOC(R_LARCH_SOP_POP_32_S_10_16_S2, 42) +ELF_RELOC(R_LARCH_SOP_POP_32_S_5_20, 43) +ELF_RELOC(R_LARCH_SOP_POP_32_S_0_5_10_16_S2, 44) +ELF_RELOC(R_LARCH_SOP_POP_32_S_0_10_10_16_S2, 45) +ELF_RELOC(R_LARCH_SOP_POP_32_U, 46) +ELF_RELOC(R_LARCH_ADD8, 47) +ELF_RELOC(R_LARCH_ADD16, 48) +ELF_RELOC(R_LARCH_ADD24, 49) +ELF_RELOC(R_LARCH_ADD32, 50) +ELF_RELOC(R_LARCH_ADD64, 51) +ELF_RELOC(R_LARCH_SUB8, 52) +ELF_RELOC(R_LARCH_SUB16, 53) +ELF_RELOC(R_LARCH_SUB24, 54) +ELF_RELOC(R_LARCH_SUB32, 55) +ELF_RELOC(R_LARCH_SUB64, 56) +ELF_RELOC(R_LARCH_GNU_VTINHERIT, 57) +ELF_RELOC(R_LARCH_GNU_VTENTRY, 58) diff --git a/llvm/include/llvm/BinaryFormat/GOFF.h b/llvm/include/llvm/BinaryFormat/GOFF.h new file mode 100644 index 000000000000..96992414c6cc --- /dev/null +++ b/llvm/include/llvm/BinaryFormat/GOFF.h @@ -0,0 +1,33 @@ +//===-- llvm/BinaryFormat/GOFF.h - GOFF definitions --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header contains common, non-processor-specific data structures and
+// constants for the GOFF file format.
+//
+// GOFF specifics can be found in MVS Program Management: Advanced Facilities.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINARYFORMAT_GOFF_H
+#define LLVM_BINARYFORMAT_GOFF_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+namespace GOFF {
+
+/// \brief Subsections of the primary C_CODE section in the object file.
+enum SubsectionKind : uint8_t {
+  SK_PPA1 = 2,
+};
+
+} // end namespace GOFF
+
+} // end namespace llvm
+
+#endif // LLVM_BINARYFORMAT_GOFF_H
diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h
index ce3a5c46e0d1..c05e79333d38 100644
--- a/llvm/include/llvm/BinaryFormat/MachO.h
+++ b/llvm/include/llvm/BinaryFormat/MachO.h
@@ -255,7 +255,8 @@ enum BindType {
 enum BindSpecialDylib {
   BIND_SPECIAL_DYLIB_SELF = 0,
   BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE = -1,
-  BIND_SPECIAL_DYLIB_FLAT_LOOKUP = -2
+  BIND_SPECIAL_DYLIB_FLAT_LOOKUP = -2,
+  BIND_SPECIAL_DYLIB_WEAK_LOOKUP = -3
 };
 
 enum {
@@ -1001,6 +1002,27 @@ struct nlist_64 {
   uint64_t n_value;
 };
 
+/// Structs for dyld chained fixups.
+/// dyld_chained_fixups_header is the data pointed to by LC_DYLD_CHAINED_FIXUPS
+/// load command.
+struct dyld_chained_fixups_header {
+  uint32_t fixups_version; ///< 0
+  uint32_t starts_offset;  ///< Offset of dyld_chained_starts_in_image.
+  uint32_t imports_offset; ///< Offset of imports table in chain_data.
+  uint32_t symbols_offset; ///< Offset of symbol strings in chain_data.
+  uint32_t imports_count;  ///< Number of imported symbol names.
+  uint32_t imports_format; ///< DYLD_CHAINED_IMPORT*
+  uint32_t symbols_format; ///< 0 => uncompressed, 1 => zlib compressed
+};
+
+/// dyld_chained_starts_in_image is embedded in LC_DYLD_CHAINED_FIXUPS payload.
+/// Each seg_info_offset entry is the offset into this struct for that
+/// segment, followed by a pool of dyld_chained_starts_in_segment data.
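A small editorial sketch of consuming dyld_chained_fixups_header as defined above: all of its offsets are relative to the start of the LC_DYLD_CHAINED_FIXUPS payload. The helper name and the trusted, host-byte-order buffer are assumptions made for illustration.

#include "llvm/BinaryFormat/MachO.h"
#include <cstdint>

using namespace llvm;

// Hypothetical: Payload points at mapped LC_DYLD_CHAINED_FIXUPS data that
// has already been validated and byte-swapped if necessary.
static const char *importSymbolName(const uint8_t *Payload, uint32_t NameOff) {
  const auto *H =
      reinterpret_cast<const MachO::dyld_chained_fixups_header *>(Payload);
  // symbols_offset locates the string pool inside the payload; with
  // symbols_format == 0 the pool is uncompressed, NUL-terminated names.
  const char *SymbolPool =
      reinterpret_cast<const char *>(Payload + H->symbols_offset);
  return SymbolPool + NameOff;
}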
+struct dyld_chained_starts_in_image { + uint32_t seg_count; + uint32_t seg_info_offset[1]; +}; + // Byte order swapping functions for MachO structs inline void swapStruct(fat_header &mh) { @@ -2008,6 +2030,16 @@ union alignas(4) macho_load_command { }; LLVM_PACKED_END +inline void swapStruct(dyld_chained_fixups_header &C) { + sys::swapByteOrder(C.fixups_version); + sys::swapByteOrder(C.starts_offset); + sys::swapByteOrder(C.imports_offset); + sys::swapByteOrder(C.symbols_offset); + sys::swapByteOrder(C.imports_count); + sys::swapByteOrder(C.imports_format); + sys::swapByteOrder(C.symbols_format); +} + /* code signing attributes of a process */ enum CodeSignAttrs { @@ -2205,6 +2237,17 @@ enum SecCSDigestAlgorithm { kSecCodeSignatureHashSHA512 = 5, /* SHA-512 */ }; +enum LinkerOptimizationHintKind { + LOH_ARM64_ADRP_ADRP = 1, + LOH_ARM64_ADRP_LDR = 2, + LOH_ARM64_ADRP_ADD_LDR = 3, + LOH_ARM64_ADRP_LDR_GOT_LDR = 4, + LOH_ARM64_ADRP_ADD_STR = 5, + LOH_ARM64_ADRP_LDR_GOT_STR = 6, + LOH_ARM64_ADRP_ADD = 7, + LOH_ARM64_ADRP_LDR_GOT = 8, +}; + } // end namespace MachO } // end namespace llvm diff --git a/llvm/include/llvm/BinaryFormat/Magic.h b/llvm/include/llvm/BinaryFormat/Magic.h index 6988b2dde656..c8e0dad42b0b 100644 --- a/llvm/include/llvm/BinaryFormat/Magic.h +++ b/llvm/include/llvm/BinaryFormat/Magic.h @@ -51,6 +51,9 @@ struct file_magic { wasm_object, ///< WebAssembly Object file pdb, ///< Windows PDB debug info file tapi_file, ///< Text-based Dynamic Library Stub file + cuda_fatbinary, ///< CUDA Fatbinary object file + offload_binary, ///< LLVM offload object file + dxcontainer_object, ///< DirectX container file }; bool is_object() const { return V != unknown; } diff --git a/llvm/include/llvm/BinaryFormat/Swift.def b/llvm/include/llvm/BinaryFormat/Swift.def index 6160e2551432..05b60e40632c 100644 --- a/llvm/include/llvm/BinaryFormat/Swift.def +++ b/llvm/include/llvm/BinaryFormat/Swift.def @@ -24,3 +24,10 @@ HANDLE_SWIFT_SECTION(builtin, "__swift5_builtin", "swift5_builtin", ".sw5bltn") HANDLE_SWIFT_SECTION(capture, "__swift5_capture", "swift5_capture", ".sw5cptr") HANDLE_SWIFT_SECTION(typeref, "__swift5_typeref", "swift5_typeref", ".sw5tyrf") HANDLE_SWIFT_SECTION(reflstr, "__swift5_reflstr", "swift5_reflstr", ".sw5rfst") +HANDLE_SWIFT_SECTION(conform, "__swift5_proto", "swift5_protocol_conformances", + ".sw5prtc$B") +HANDLE_SWIFT_SECTION(protocs, "__swift5_protos", "swift5_protocols", + ".sw5prt$B") +HANDLE_SWIFT_SECTION(acfuncs, "__swift5_acfuncs", "swift5_accessible_functions", + ".sw5acfn$B") +HANDLE_SWIFT_SECTION(mpenum, "__swift5_mpenum", "swift5_mpenum", ".sw5mpen$B") diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index 0bc8c4e167d8..62a6881ef36a 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -91,7 +91,7 @@ struct WasmTable { StringRef SymbolName; // from the "linking" section }; -struct WasmInitExpr { +struct WasmInitExprMVP { uint8_t Opcode; union { int32_t Int32; @@ -102,6 +102,13 @@ struct WasmInitExpr { } Value; }; +struct WasmInitExpr { + uint8_t Extended; // Set to non-zero if extended const is used (i.e. 
more than + // one instruction) + WasmInitExprMVP Inst; + ArrayRef Body; +}; + struct WasmGlobalType { uint8_t Type; bool Mutable; @@ -245,7 +252,8 @@ enum : unsigned { WASM_SEC_CODE = 10, // Function bodies (code) WASM_SEC_DATA = 11, // Data segments WASM_SEC_DATACOUNT = 12, // Data segment count - WASM_SEC_TAG = 13 // Tag declarations + WASM_SEC_TAG = 13, // Tag declarations + WASM_SEC_LAST_KNOWN = WASM_SEC_TAG, }; // Type immediate encodings used in various contexts. @@ -276,6 +284,7 @@ enum : unsigned { WASM_OPCODE_CALL = 0x10, WASM_OPCODE_LOCAL_GET = 0x20, WASM_OPCODE_LOCAL_SET = 0x21, + WASM_OPCODE_LOCAL_TEE = 0x22, WASM_OPCODE_GLOBAL_GET = 0x23, WASM_OPCODE_GLOBAL_SET = 0x24, WASM_OPCODE_I32_STORE = 0x36, @@ -285,7 +294,11 @@ enum : unsigned { WASM_OPCODE_F32_CONST = 0x43, WASM_OPCODE_F64_CONST = 0x44, WASM_OPCODE_I32_ADD = 0x6a, + WASM_OPCODE_I32_SUB = 0x6b, + WASM_OPCODE_I32_MUL = 0x6c, WASM_OPCODE_I64_ADD = 0x7c, + WASM_OPCODE_I64_SUB = 0x7d, + WASM_OPCODE_I64_MUL = 0x7e, WASM_OPCODE_REF_NULL = 0xd0, }; @@ -458,8 +471,9 @@ inline bool operator==(const WasmTableType &LHS, const WasmTableType &RHS) { return LHS.ElemType == RHS.ElemType && LHS.Limits == RHS.Limits; } -std::string toString(WasmSymbolType type); -std::string relocTypetoString(uint32_t type); +llvm::StringRef toString(WasmSymbolType type); +llvm::StringRef relocTypetoString(uint32_t type); +llvm::StringRef sectionTypeToString(uint32_t type); bool relocTypeHasAddend(uint32_t type); } // end namespace wasm diff --git a/llvm/include/llvm/BinaryFormat/XCOFF.h b/llvm/include/llvm/BinaryFormat/XCOFF.h index cffd8618f1e3..5d23ec5cd911 100644 --- a/llvm/include/llvm/BinaryFormat/XCOFF.h +++ b/llvm/include/llvm/BinaryFormat/XCOFF.h @@ -54,6 +54,34 @@ enum AuxHeaderFlags64 : uint16_t { ///< future use and should be set to 0. }; +enum XCOFFInterpret : uint16_t { + OLD_XCOFF_INTERPRET = 1, + NEW_XCOFF_INTERPRET = 2 +}; + +enum FileFlag : uint16_t { + F_RELFLG = 0x0001, ///< relocation info stripped from file + F_EXEC = 0x0002, ///< file is executable (i.e., it + ///< has a loader section) + F_LNNO = 0x0004, ///< line numbers stripped from file + F_LSYMS = 0x0008, ///< local symbols stripped from file + F_FDPR_PROF = 0x0010, ///< file was profiled with FDPR + F_FDPR_OPTI = 0x0020, ///< file was reordered with FDPR + F_DSA = 0x0040, ///< file uses Dynamic Segment Allocation (32-bit + ///< only) + F_DEP_1 = 0x0080, ///< Data Execution Protection bit 1 + F_VARPG = 0x0100, ///< executable requests using variable size pages + F_LPTEXT = 0x0400, ///< executable requires large pages for text + F_LPDATA = 0x0800, ///< executable requires large pages for data + F_DYNLOAD = 0x1000, ///< file is dynamically loadable and + ///< executable (equivalent to F_EXEC on AIX) + F_SHROBJ = 0x2000, ///< file is a shared object + F_LOADONLY = + 0x4000, ///< file can be loaded by the system loader, but it is + ///< ignored by the linker if it is a member of an archive. + F_DEP_2 = 0x8000 ///< Data Execution Protection bit 2 +}; + // x_smclas field of x_csect from system header: /usr/include/syms.h /// Storage Mapping Class definitions. enum StorageMappingClass : uint8_t { @@ -212,6 +240,8 @@ enum VisibilityType : uint16_t { SYM_V_EXPORTED = 0x4000 }; +constexpr uint16_t VISIBILITY_MASK = 0x7000; + // Relocation types, defined in `/usr/include/reloc.h`. enum RelocationType : uint8_t { R_POS = 0x00, ///< Positive relocation. 
Provides the address of the referenced diff --git a/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h b/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h index f6fc284da33f..102e2257abcc 100644 --- a/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h +++ b/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h @@ -18,12 +18,13 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" #include #include namespace llvm { +class raw_ostream; + /// CurStreamTypeType - A type for CurStreamType enum CurStreamTypeType { UnknownBitstream, diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h index a82791c8720b..39ea48c33fc3 100644 --- a/llvm/include/llvm/Bitcode/BitcodeReader.h +++ b/llvm/include/llvm/Bitcode/BitcodeReader.h @@ -15,12 +15,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Bitstream/BitCodes.h" -#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Bitstream/BitCodeEnums.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include #include #include @@ -30,6 +29,8 @@ namespace llvm { class LLVMContext; class Module; +class MemoryBuffer; +class ModuleSummaryIndex; typedef llvm::function_ref(StringRef)> DataLayoutCallbackTy; diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index 96f25fce8ddb..248d33f4502e 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -17,7 +17,7 @@ #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include #include #include diff --git a/llvm/include/llvm/Bitcode/BitcodeWriterPass.h b/llvm/include/llvm/Bitcode/BitcodeWriterPass.h index dda5b20973c1..3c2471237532 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriterPass.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriterPass.h @@ -14,7 +14,6 @@ #ifndef LLVM_BITCODE_BITCODEWRITERPASS_H #define LLVM_BITCODE_BITCODEWRITERPASS_H -#include "llvm/ADT/StringRef.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 6d0f51ce9c6d..5d96204ba42a 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -17,7 +17,10 @@ #ifndef LLVM_BITCODE_LLVMBITCODES_H #define LLVM_BITCODE_LLVMBITCODES_H -#include "llvm/Bitstream/BitCodes.h" +// This is the only file included, and it, in turn, is a leaf header. +// This allows external tools to dump the AST of this file and analyze it for +// changes without needing to fully or partially build LLVM itself. +#include "llvm/Bitstream/BitCodeEnums.h" namespace llvm { namespace bitc { @@ -582,14 +585,15 @@ enum FunctionCodes { 52, // CATCHSWITCH: [num,args...] or [num,args...,bb] // 53 is unused. // 54 is unused. - FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...] - FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] - FUNC_CODE_INST_CALLBR = 57, // CALLBR: [attr, cc, norm, transfs, - // fnty, fnid, args...] 
- FUNC_CODE_INST_FREEZE = 58, // FREEZE: [opty, opval] - FUNC_CODE_INST_ATOMICRMW = 59, // ATOMICRMW: [ptrty, ptr, valty, val, - // operation, align, vol, - // ordering, synchscope] + FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...] + FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] + FUNC_CODE_INST_CALLBR = 57, // CALLBR: [attr, cc, norm, transfs, + // fnty, fnid, args...] + FUNC_CODE_INST_FREEZE = 58, // FREEZE: [opty, opval] + FUNC_CODE_INST_ATOMICRMW = 59, // ATOMICRMW: [ptrty, ptr, valty, val, + // operation, align, vol, + // ordering, synchscope] + FUNC_CODE_BLOCKADDR_USERS = 60, // BLOCKADDR_USERS: [value...] }; enum UseListCodes { @@ -677,6 +681,11 @@ enum AttributeKindCodes { ATTR_KIND_NO_SANITIZE_COVERAGE = 76, ATTR_KIND_ELEMENTTYPE = 77, ATTR_KIND_DISABLE_SANITIZER_INSTRUMENTATION = 78, + ATTR_KIND_NO_SANITIZE_BOUNDS = 79, + ATTR_KIND_ALLOC_ALIGN = 80, + ATTR_KIND_ALLOCATED_POINTER = 81, + ATTR_KIND_ALLOC_KIND = 82, + ATTR_KIND_PRESPLIT_COROUTINE = 83, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/Bitstream/BitCodeEnums.h b/llvm/include/llvm/Bitstream/BitCodeEnums.h new file mode 100644 index 000000000000..4288bd3987ae --- /dev/null +++ b/llvm/include/llvm/Bitstream/BitCodeEnums.h @@ -0,0 +1,90 @@ +//===- BitCodeEnums.h - Core enums for the bitstream format -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines "core" bitstream enum values. +// It has been separated from the other header that defines bitstream enum +// values, BitCodes.h, to allow tools to track changes to the various +// bitstream and bitcode enums without needing to fully or partially build +// LLVM itself. +// +// The enum values defined in this file should be considered permanent. If +// new features are added, they should have values added at the end of the +// respective lists. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BITSTREAM_BITCODEENUMS_H +#define LLVM_BITSTREAM_BITCODEENUMS_H + +namespace llvm { +/// Offsets of the 32-bit fields of bitstream wrapper header. +enum BitstreamWrapperHeader : unsigned { + BWH_MagicField = 0 * 4, + BWH_VersionField = 1 * 4, + BWH_OffsetField = 2 * 4, + BWH_SizeField = 3 * 4, + BWH_CPUTypeField = 4 * 4, + BWH_HeaderSize = 5 * 4 +}; + +namespace bitc { +enum StandardWidths { + BlockIDWidth = 8, // We use VBR-8 for block IDs. + CodeLenWidth = 4, // Codelen are VBR-4. + BlockSizeWidth = 32 // BlockSize up to 2^32 32-bit words = 16GB per block. +}; + +// The standard abbrev namespace always has a way to exit a block, enter a +// nested block, define abbrevs, and define an unabbreviated record. +enum FixedAbbrevIDs { + END_BLOCK = 0, // Must be zero to guarantee termination for broken bitcode. + ENTER_SUBBLOCK = 1, + + /// DEFINE_ABBREV - Defines an abbrev for the current block. It consists + /// of a vbr5 for # operand infos. Each operand info is emitted with a + /// single bit to indicate if it is a literal encoding. If so, the value is + /// emitted with a vbr8. If not, the encoding is emitted as 3 bits followed + /// by the info value as a vbr5 if needed. 
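A worked illustration of the DEFINE_ABBREV encoding described in the comment above, using the BitCodeAbbrev/BitCodeAbbrevOp API that appears later in this patch (BitCodes.h). The record shape chosen here is hypothetical.

#include "llvm/Bitstream/BitCodes.h"
#include <memory>

using namespace llvm;

// Hypothetical abbreviation for records of the form [literal code 4, array
// of char6 elements]. When the writer emits the DEFINE_ABBREV for this, it
// writes: a vbr5 operand count (3), then for each operand one literal bit
// followed by either a vbr8 literal value or a 3-bit encoding (plus a vbr5
// of extra data when the encoding needs it).
static std::shared_ptr<BitCodeAbbrev> makeStringAbbrev() {
  auto Abbv = std::make_shared<BitCodeAbbrev>();
  Abbv->Add(BitCodeAbbrevOp(4));                      // [1, vbr8: 4]
  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // [0, enc: Array]
  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); // [0, enc: Char6]
  return Abbv;
}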
+ DEFINE_ABBREV = 2, + + // UNABBREV_RECORDs are emitted with a vbr6 for the record code, followed by + // a vbr6 for the # operands, followed by vbr6's for each operand. + UNABBREV_RECORD = 3, + + // This is not a code, this is a marker for the first abbrev assignment. + FIRST_APPLICATION_ABBREV = 4 +}; + +/// StandardBlockIDs - All bitcode files can optionally include a BLOCKINFO +/// block, which contains metadata about other blocks in the file. +enum StandardBlockIDs { + /// BLOCKINFO_BLOCK is used to define metadata about blocks, for example, + /// standard abbrevs that should be available to all blocks of a specified + /// ID. + BLOCKINFO_BLOCK_ID = 0, + + // Block IDs 1-7 are reserved for future expansion. + FIRST_APPLICATION_BLOCKID = 8 +}; + +/// BlockInfoCodes - The blockinfo block contains metadata about user-defined +/// blocks. +enum BlockInfoCodes { + // DEFINE_ABBREV has magic semantics here, applying to the current SETBID'd + // block, instead of the BlockInfo block. + + BLOCKINFO_CODE_SETBID = 1, // SETBID: [blockid#] + BLOCKINFO_CODE_BLOCKNAME = 2, // BLOCKNAME: [name] + BLOCKINFO_CODE_SETRECORDNAME = 3 // BLOCKINFO_CODE_SETRECORDNAME: + // [id, name] +}; + +} // namespace bitc +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Bitstream/BitCodes.h b/llvm/include/llvm/Bitstream/BitCodes.h index 9cd4e535a470..93888f7d3b33 100644 --- a/llvm/include/llvm/Bitstream/BitCodes.h +++ b/llvm/include/llvm/Bitstream/BitCodes.h @@ -19,75 +19,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Bitstream/BitCodeEnums.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include namespace llvm { -/// Offsets of the 32-bit fields of bitstream wrapper header. -enum BitstreamWrapperHeader : unsigned { - BWH_MagicField = 0 * 4, - BWH_VersionField = 1 * 4, - BWH_OffsetField = 2 * 4, - BWH_SizeField = 3 * 4, - BWH_CPUTypeField = 4 * 4, - BWH_HeaderSize = 5 * 4 -}; - -namespace bitc { - enum StandardWidths { - BlockIDWidth = 8, // We use VBR-8 for block IDs. - CodeLenWidth = 4, // Codelen are VBR-4. - BlockSizeWidth = 32 // BlockSize up to 2^32 32-bit words = 16GB per block. - }; - - // The standard abbrev namespace always has a way to exit a block, enter a - // nested block, define abbrevs, and define an unabbreviated record. - enum FixedAbbrevIDs { - END_BLOCK = 0, // Must be zero to guarantee termination for broken bitcode. - ENTER_SUBBLOCK = 1, - - /// DEFINE_ABBREV - Defines an abbrev for the current block. It consists - /// of a vbr5 for # operand infos. Each operand info is emitted with a - /// single bit to indicate if it is a literal encoding. If so, the value is - /// emitted with a vbr8. If not, the encoding is emitted as 3 bits followed - /// by the info value as a vbr5 if needed. - DEFINE_ABBREV = 2, - - // UNABBREV_RECORDs are emitted with a vbr6 for the record code, followed by - // a vbr6 for the # operands, followed by vbr6's for each operand. - UNABBREV_RECORD = 3, - - // This is not a code, this is a marker for the first abbrev assignment. - FIRST_APPLICATION_ABBREV = 4 - }; - - /// StandardBlockIDs - All bitcode files can optionally include a BLOCKINFO - /// block, which contains metadata about other blocks in the file. - enum StandardBlockIDs { - /// BLOCKINFO_BLOCK is used to define metadata about blocks, for example, - /// standard abbrevs that should be available to all blocks of a specified - /// ID. - BLOCKINFO_BLOCK_ID = 0, - - // Block IDs 1-7 are reserved for future expansion. 
- FIRST_APPLICATION_BLOCKID = 8 - }; - - /// BlockInfoCodes - The blockinfo block contains metadata about user-defined - /// blocks. - enum BlockInfoCodes { - // DEFINE_ABBREV has magic semantics here, applying to the current SETBID'd - // block, instead of the BlockInfo block. - - BLOCKINFO_CODE_SETBID = 1, // SETBID: [blockid#] - BLOCKINFO_CODE_BLOCKNAME = 2, // BLOCKNAME: [name] - BLOCKINFO_CODE_SETRECORDNAME = 3 // BLOCKINFO_CODE_SETRECORDNAME: - // [id, name] - }; - -} // End bitc namespace - /// BitCodeAbbrevOp - This describes one or more operands in an abbreviation. /// This is actually a union of two different things: /// 1. It could be a literal integer value ("the operand is always 17"). @@ -106,6 +43,10 @@ public: Blob = 5 // 32-bit aligned array of 8-bit characters. }; + static bool isValidEncoding(uint64_t E) { + return E >= 1 && E <= 5; + } + explicit BitCodeAbbrevOp(uint64_t V) : Val(V), IsLiteral(true) {} explicit BitCodeAbbrevOp(Encoding E, uint64_t Data = 0) : Val(Data), IsLiteral(false), Enc(E) {} @@ -179,6 +120,6 @@ public: OperandList.push_back(OpInfo); } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/include/llvm/Bitstream/BitstreamReader.h b/llvm/include/llvm/Bitstream/BitstreamReader.h index 37b7c4d73cff..10a0a4e0039e 100644 --- a/llvm/include/llvm/Bitstream/BitstreamReader.h +++ b/llvm/include/llvm/Bitstream/BitstreamReader.h @@ -19,7 +19,6 @@ #include "llvm/Bitstream/BitCodes.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBufferRef.h" #include #include @@ -97,8 +96,6 @@ private: unsigned BitsInCurWord = 0; public: - static const constexpr size_t MaxChunkSize = sizeof(word_t) * 8; - SimpleBitstreamCursor() = default; explicit SimpleBitstreamCursor(ArrayRef BitcodeBytes) : BitcodeBytes(BitcodeBytes) {} @@ -187,7 +184,7 @@ public: } Expected Read(unsigned NumBits) { - static const unsigned BitsInWord = MaxChunkSize; + static const unsigned BitsInWord = sizeof(word_t) * 8; assert(NumBits && NumBits <= BitsInWord && "Cannot return zero or more than BitsInWord bits!"); @@ -229,24 +226,32 @@ public: return R; } - Expected ReadVBR(unsigned NumBits) { + Expected ReadVBR(const unsigned NumBits) { Expected MaybeRead = Read(NumBits); if (!MaybeRead) return MaybeRead; uint32_t Piece = MaybeRead.get(); - if ((Piece & (1U << (NumBits-1))) == 0) + assert(NumBits <= 32 && NumBits >= 1 && "Invalid NumBits value"); + const uint32_t MaskBitOrder = (NumBits - 1); + const uint32_t Mask = 1UL << MaskBitOrder; + + if ((Piece & Mask) == 0) return Piece; uint32_t Result = 0; unsigned NextBit = 0; while (true) { - Result |= (Piece & ((1U << (NumBits-1))-1)) << NextBit; + Result |= (Piece & (Mask - 1)) << NextBit; - if ((Piece & (1U << (NumBits-1))) == 0) + if ((Piece & Mask) == 0) return Result; NextBit += NumBits-1; + if (NextBit >= 32) + return createStringError(std::errc::illegal_byte_sequence, + "Unterminated VBR"); + MaybeRead = Read(NumBits); if (!MaybeRead) return MaybeRead; @@ -256,24 +261,31 @@ public: // Read a VBR that may have a value up to 64-bits in size. The chunk size of // the VBR must still be <= 32 bits though. 
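A standalone worked example of the VBR scheme that ReadVBR and ReadVBR64 decode: with NumBits = 6 there are 5 payload bits per chunk, and the sixth (high) bit is the continuation flag; the new "Unterminated VBR" guards simply bound how many chunks a well-formed value may use. The encoder below is an editorial sketch mirroring the decode loop, not library code.

#include <cstdint>
#include <vector>

// Sketch of VBR-6 encoding: 5 payload bits per chunk, high bit set on every
// chunk except the last. Encoding 1000 (0b1111101000):
//   low 5 bits  01000 -> chunk 0b101000 (continuation bit set)
//   next 5 bits 11111 -> chunk 0b011111 (stop)
// Decoding recombines them as 8 + (31 << 5) = 1000.
static std::vector<uint8_t> encodeVBR6(uint64_t Val) {
  std::vector<uint8_t> Chunks;
  do {
    uint8_t Chunk = Val & 0x1F; // 5 payload bits
    Val >>= 5;
    if (Val != 0)
      Chunk |= 0x20; // continuation bit, i.e. 1 << (NumBits - 1)
    Chunks.push_back(Chunk);
  } while (Val != 0);
  return Chunks;
}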
- Expected ReadVBR64(unsigned NumBits) { + Expected ReadVBR64(const unsigned NumBits) { Expected MaybeRead = Read(NumBits); if (!MaybeRead) return MaybeRead; uint32_t Piece = MaybeRead.get(); + assert(NumBits <= 32 && NumBits >= 1 && "Invalid NumBits value"); + const uint32_t MaskBitOrder = (NumBits - 1); + const uint32_t Mask = 1UL << MaskBitOrder; - if ((Piece & (1U << (NumBits-1))) == 0) + if ((Piece & Mask) == 0) return uint64_t(Piece); uint64_t Result = 0; unsigned NextBit = 0; while (true) { - Result |= uint64_t(Piece & ((1U << (NumBits-1))-1)) << NextBit; + Result |= uint64_t(Piece & (Mask - 1)) << NextBit; - if ((Piece & (1U << (NumBits-1))) == 0) + if ((Piece & Mask) == 0) return Result; NextBit += NumBits-1; + if (NextBit >= 64) + return createStringError(std::errc::illegal_byte_sequence, + "Unterminated VBR"); + MaybeRead = Read(NumBits); if (!MaybeRead) return MaybeRead; @@ -299,6 +311,13 @@ public: /// Skip to the end of the file. void skipToEnd() { NextChar = BitcodeBytes.size(); } + + /// Check whether a reservation of Size elements is plausible. + bool isSizePlausible(size_t Size) const { + // Don't allow reserving more elements than the number of bits, assuming + // at least one bit is needed to encode an element. + return Size < BitcodeBytes.size() * 8; + } }; /// When advancing through a bitstream cursor, each advance can discover a few @@ -357,7 +376,7 @@ class BitstreamCursor : SimpleBitstreamCursor { BitstreamBlockInfo *BlockInfo = nullptr; public: - static const size_t MaxChunkSize = sizeof(word_t) * 8; + static const size_t MaxChunkSize = 32; BitstreamCursor() = default; explicit BitstreamCursor(ArrayRef BitcodeBytes) @@ -521,10 +540,11 @@ private: public: /// Return the abbreviation for the specified AbbrevId. - const BitCodeAbbrev *getAbbrev(unsigned AbbrevID) { + Expected getAbbrev(unsigned AbbrevID) { unsigned AbbrevNo = AbbrevID - bitc::FIRST_APPLICATION_ABBREV; if (AbbrevNo >= CurAbbrevs.size()) - report_fatal_error("Invalid abbrev number"); + return createStringError( + std::errc::illegal_byte_sequence, "Invalid abbrev number"); return CurAbbrevs[AbbrevNo].get(); } diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h index 21b260b7b9f3..be6bab5532bd 100644 --- a/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -74,16 +74,10 @@ class BitstreamWriter { }; std::vector BlockInfoRecords; - void WriteByte(unsigned char Value) { - Out.push_back(Value); - FlushToFile(); - } - void WriteWord(unsigned Value) { Value = support::endian::byte_swap(Value); Out.append(reinterpret_cast(&Value), reinterpret_cast(&Value + 1)); - FlushToFile(); } uint64_t GetNumOfFlushedBytes() const { return FS ? FS->tell() : 0; } @@ -114,7 +108,7 @@ public: /// null, \p O does not flush incrementially, but writes to disk at the end. /// /// \p FlushThreshold is the threshold (unit M) to flush \p O if \p FS is - /// valid. + /// valid. Flushing only occurs at (sub)block boundaries. BitstreamWriter(SmallVectorImpl &O, raw_fd_stream *FS = nullptr, uint32_t FlushThreshold = 512) : Out(O), FS(FS), FlushThreshold(FlushThreshold << 20), CurBit(0), @@ -249,8 +243,8 @@ public: // Emit the bits with VBR encoding, NumBits-1 bits at a time. 
while (Val >= Threshold) { - Emit(((uint32_t)Val & ((1 << (NumBits-1))-1)) | - (1 << (NumBits-1)), NumBits); + Emit(((uint32_t)Val & ((1 << (NumBits - 1)) - 1)) | (1 << (NumBits - 1)), + NumBits); Val >>= NumBits-1; } @@ -327,6 +321,7 @@ public: CurCodeSize = B.PrevCodeSize; CurAbbrevs = std::move(B.PrevAbbrevs); BlockScope.pop_back(); + FlushToFile(); } //===--------------------------------------------------------------------===// @@ -472,14 +467,12 @@ public: FlushToWord(); // Emit literal bytes. - for (const auto &B : Bytes) { - assert(isUInt<8>(B) && "Value too large to emit as byte"); - WriteByte((unsigned char)B); - } + assert(llvm::all_of(Bytes, [](UIntTy B) { return isUInt<8>(B); })); + Out.append(Bytes.begin(), Bytes.end()); // Align end to 32-bits. while (GetBufferOffset() & 3) - WriteByte(0); + Out.push_back(0); } void emitBlob(StringRef Bytes, bool ShouldEmitSize = true) { emitBlob(makeArrayRef((const uint8_t *)Bytes.data(), Bytes.size()), diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h index 1190d6061e45..c0e976317aef 100644 --- a/llvm/include/llvm/CodeGen/AccelTable.h +++ b/llvm/include/llvm/CodeGen/AccelTable.h @@ -14,19 +14,15 @@ #define LLVM_CODEGEN_ACCELTABLE_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/DwarfStringPoolEntry.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DJB.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include #include #include @@ -108,6 +104,8 @@ namespace llvm { class AsmPrinter; class DwarfCompileUnit; class DwarfDebug; +class MCSymbol; +class raw_ostream; /// Interface which the different types of accelerator table data have to /// conform. It serves as a base class for different values of the template diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h index 60442326d6c7..1a09820f80ef 100644 --- a/llvm/include/llvm/CodeGen/Analysis.h +++ b/llvm/include/llvm/CodeGen/Analysis.h @@ -15,14 +15,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/CodeGen.h" namespace llvm { +template class SmallVectorImpl; class GlobalValue; class LLT; class MachineBasicBlock; diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index d911bfd435ae..fb4627c029b0 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -22,9 +22,7 @@ #include "llvm/CodeGen/DwarfStringPoolEntry.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/IR/InlineAsm.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SourceMgr.h" #include #include #include @@ -32,6 +30,7 @@ namespace llvm { +class AddrLabelMap; class BasicBlock; class BlockAddress; class Constant; @@ -176,6 +175,10 @@ private: // function. This is used to calculate the size of the BB section. MCSymbol *CurrentSectionBeginSym = nullptr; + /// This map keeps track of which symbol is being used for the specified basic + /// block's address of label. 
+  std::unique_ptr<AddrLabelMap> AddrLabelSymbols;
+
   // The garbage collection metadata printer table.
   void *GCMetadataPrinters = nullptr; // Really a DenseMap.
 
@@ -212,6 +215,16 @@ private:
   /// CFISection type the module needs i.e. either .eh_frame or .debug_frame.
   CFISection ModuleCFISection = CFISection::None;
 
+  /// True if the module contains split-stack functions. This is used to
+  /// emit the .note.GNU-split-stack section, as the linker requires special
+  /// handling when a split-stack function calls a no-split-stack function.
+  bool HasSplitStack = false;
+
+  /// True if the module contains no-split-stack functions. This is used to
+  /// emit the .note.GNU-no-split-stack section when the module also contains
+  /// functions without a split-stack prologue.
+  bool HasNoSplitStack = false;
+
 protected:
   explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 
@@ -254,6 +267,25 @@ public:
   // given basic block.
   MCSymbol *getMBBExceptionSym(const MachineBasicBlock &MBB);
 
+  /// Return the symbol to be used for the specified basic block when its
+  /// address is taken. This cannot be its normal LBB label because the block
+  /// may be accessed outside its containing function.
+  MCSymbol *getAddrLabelSymbol(const BasicBlock *BB) {
+    return getAddrLabelSymbolToEmit(BB).front();
+  }
+
+  /// Return the symbol to be used for the specified basic block when its
+  /// address is taken. If other blocks were RAUW'd to this one, we may have
+  /// to emit them as well; return the whole set.
+  ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(const BasicBlock *BB);
+
+  /// If the specified function has had any references to address-taken blocks
+  /// generated, but the block got deleted, return the symbol now so we can
+  /// emit it. This prevents emitting a reference to a symbol that has no
+  /// definition.
+  void takeDeletedSymbolsForFunction(const Function *F,
+                                     std::vector<MCSymbol *> &Result);
+
   /// Return information about object file lowering.
   const TargetLoweringObjectFile &getObjFileLowering() const;
 
diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
new file mode 100644
index 000000000000..7ae1304cced9
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
@@ -0,0 +1,109 @@
+//===-- BasicBlockSectionsProfileReader.h - BB sections profile reader pass ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass creates the basic block cluster info by reading the basic block
+// sections profile. The cluster info will be used by the basic-block-sections
+// pass to arrange basic blocks in their sections.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_BASICBLOCKSECTIONSINFO_H
+#define LLVM_ANALYSIS_BASICBLOCKSECTIONSINFO_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+// The cluster information for a machine basic block.
+struct BBClusterInfo {
+  // MachineBasicBlock ID.
+  unsigned MBBNumber;
+  // Cluster ID this basic block belongs to.
+  unsigned ClusterID;
+  // Position of basic block within the cluster.
+  unsigned PositionInCluster;
+};
+
+using ProgramBBClusterInfoMapTy = StringMap<SmallVector<BBClusterInfo>>;
+
+class BasicBlockSectionsProfileReader : public ImmutablePass {
+public:
+  static char ID;
+
+  BasicBlockSectionsProfileReader(const MemoryBuffer *Buf)
+      : ImmutablePass(ID), MBuf(Buf) {
+    initializeBasicBlockSectionsProfileReaderPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  BasicBlockSectionsProfileReader() : ImmutablePass(ID) {
+    initializeBasicBlockSectionsProfileReaderPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Basic Block Sections Profile Reader";
+  }
+
+  // Returns true if a basic block sections profile exists for function \p
+  // FuncName.
+  bool isFunctionHot(StringRef FuncName) const;
+
+  // Returns a pair whose first element says whether a basic block sections
+  // profile exists for the function \p FuncName, and whose second element is
+  // the basic block sections profile (cluster info) for this function. If the
+  // first element is true and the second element is empty, it means unique
+  // basic block sections are desired for all basic blocks of the function.
+  std::pair<bool, SmallVector<BBClusterInfo>>
+  getBBClusterInfoForFunction(StringRef FuncName) const;
+
+  /// Read profiles of basic blocks if available here.
+  void initializePass() override;
+
+private:
+  StringRef getAliasName(StringRef FuncName) const {
+    auto R = FuncAliasMap.find(FuncName);
+    return R == FuncAliasMap.end() ? FuncName : R->second;
+  }
+
+  // This contains the basic-block-sections profile.
+  const MemoryBuffer *MBuf = nullptr;
+
+  // This encapsulates the BB cluster information for the whole program.
+  //
+  // For every function name, it contains the cluster information for (all or
+  // some of) its basic blocks. The cluster information for every basic block
+  // includes its cluster ID along with the position of the basic block in that
+  // cluster.
+  ProgramBBClusterInfoMapTy ProgramBBClusterInfo;
+
+  // Some functions have alias names. We use this map to find the main alias
+  // name for which we have a mapping in ProgramBBClusterInfo.
+  StringMap<StringRef> FuncAliasMap;
+};
+
+// Creates a BasicBlockSectionsProfileReader pass to parse the basic block
+// sections profile. \p Buf is a memory buffer that contains the list of
+// functions and basic block ids to selectively enable basic block sections.
+ImmutablePass *
+createBasicBlockSectionsProfileReaderPass(const MemoryBuffer *Buf);
+
+} // namespace llvm
+#endif // LLVM_ANALYSIS_BASICBLOCKSECTIONSINFO_H
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0b2737628923..46be8e030406 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -195,6 +195,10 @@ private:
                                          bool VariableMask,
                                          bool IsGatherScatter,
                                          TTI::TargetCostKind CostKind) {
+    // We cannot scalarize scalable vectors, so return Invalid.
+    if (isa<ScalableVectorType>(DataTy))
+      return InstructionCost::getInvalid();
+
     auto *VT = cast<FixedVectorType>(DataTy);
     // Assume the target does not have support for gather/scatter operations
     // and provide a rough estimate.
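The getInvalid() bail-out above only helps if cost-model clients test for the
invalid state rather than treating every InstructionCost as a number. A minimal
caller-side sketch (illustrative only, not part of the patch; TTI, DataTy, Ptr
and CostKind are assumed to exist in the caller, and the call shown is just one
of the entry points that can now return Invalid):

  // Guard against costs that cannot be computed for scalable vectors.
  InstructionCost Cost = TTI.getGatherScatterOpCost(
      Instruction::Load, DataTy, Ptr, /*VariableMask=*/false, Align(4),
      CostKind);
  if (!Cost.isValid())
    return false; // No scalarization estimate exists; bail out.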
@@ -312,6 +316,26 @@ public: return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); } + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const { + auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { + auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); + EVT VT = getTLI()->getValueType(DL, SrcTy); + if (getTLI()->isOperationLegal(ISD::STORE, VT) || + getTLI()->isOperationCustom(ISD::STORE, VT)) + return true; + + EVT ValVT = + getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); + EVT LegalizedVT = + getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); + return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); + }; + while (VF > 2 && IsSupportedByTarget(VF)) + VF /= 2; + return VF; + } + bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const { EVT VT = getTLI()->getValueType(DL, Ty); @@ -362,10 +386,9 @@ public: return getTLI()->isTypeLegal(VT); } - InstructionCost getRegUsageForType(Type *Ty) { - InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; - assert(Val >= 0 && "Negative cost!"); - return Val; + unsigned getRegUsageForType(Type *Ty) { + EVT ETy = getTLI()->getValueType(DL, Ty); + return getTLI()->getNumRegisters(Ty->getContext(), ETy); } InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, @@ -680,6 +703,8 @@ public: bool Insert, bool Extract) { /// FIXME: a bitfield is not a reasonable abstraction for talking about /// which elements are needed from a scalable vector + if (isa(InTy)) + return InstructionCost::getInvalid(); auto *Ty = cast(InTy); assert(DemandedElts.getBitWidth() == Ty->getNumElements() && @@ -702,6 +727,8 @@ public: /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract) { + if (isa(InTy)) + return InstructionCost::getInvalid(); auto *Ty = cast(InTy); APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); @@ -871,7 +898,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp) { + VectorType *SubTp, + ArrayRef Args = None) { switch (improveShuffleKindFromMask(Kind, Mask)) { case TTI::SK_Broadcast: @@ -1100,6 +1128,9 @@ public: // TODO: If one of the types get legalized by splitting, handle this // similarly to what getCastInstrCost() does. if (auto *ValVTy = dyn_cast(ValTy)) { + if (isa(ValTy)) + return InstructionCost::getInvalid(); + unsigned Num = cast(ValVTy)->getNumElements(); if (CondTy) CondTy = CondTy->getScalarType(); @@ -1172,11 +1203,12 @@ public: if (CostKind != TTI::TCK_RecipThroughput) return Cost; + const DataLayout &DL = this->getDataLayout(); if (Src->isVectorTy() && // In practice it's not currently possible to have a change in lane // length for extending loads or truncating stores so both types should // have the same scalable property. - TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), + TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), LT.second.getSizeInBits())) { // This is a vector load that legalizes to a larger type than the vector // itself. Unless the corresponding extending load or truncating store is @@ -1220,6 +1252,11 @@ public: unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false) { + + // We cannot scalarize scalable vectors, so return Invalid. 
+ if (isa(VecTy)) + return InstructionCost::getInvalid(); + auto *VT = cast(VecTy); unsigned NumElts = VT->getNumElements(); @@ -1274,8 +1311,7 @@ public: // Scale the cost of the load by the fraction of legal instructions that // will be used. - Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(), - NumLegalInsts); + Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); } // Then plus the cost of interleave operation. @@ -1382,6 +1418,26 @@ public: default: break; + case Intrinsic::powi: + if (auto *RHSC = dyn_cast(Args[1])) { + bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); + if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), + ShouldOptForSize)) { + // The cost is modeled on the expansion performed by ExpandPowI in + // SelectionDAGBuilder. + APInt Exponent = RHSC->getValue().abs(); + unsigned ActiveBits = Exponent.getActiveBits(); + unsigned PopCount = Exponent.countPopulation(); + InstructionCost Cost = (ActiveBits + PopCount - 2) * + thisT()->getArithmeticInstrCost( + Instruction::FMul, RetTy, CostKind); + if (RHSC->getSExtValue() < 0) + Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, + CostKind); + return Cost; + } + } + break; case Intrinsic::cttz: // FIXME: If necessary, this should go in target-specific overrides. if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) @@ -1418,7 +1474,7 @@ public: // The cost of materialising a constant integer vector. return TargetTransformInfo::TCC_Basic; } - case Intrinsic::experimental_vector_extract: { + case Intrinsic::vector_extract: { // FIXME: Handle case where a scalable vector is extracted from a scalable // vector if (isa(RetTy)) @@ -1428,7 +1484,7 @@ public: cast(Args[0]->getType()), None, Index, cast(RetTy)); } - case Intrinsic::experimental_vector_insert: { + case Intrinsic::vector_insert: { // FIXME: Handle case where a scalable vector is inserted into a scalable // vector if (isa(Args[1]->getType())) @@ -1471,8 +1527,6 @@ public: } case Intrinsic::fshl: case Intrinsic::fshr: { - if (isa(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); const Value *X = Args[0]; const Value *Y = Args[1]; const Value *Z = Args[2]; @@ -1512,6 +1566,29 @@ public: } return Cost; } + case Intrinsic::get_active_lane_mask: { + EVT ResVT = getTLI()->getValueType(DL, RetTy, true); + EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); + + // If we're not expanding the intrinsic then we assume this is cheap + // to implement. + if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { + std::pair LT = + getTLI()->getTypeLegalizationCost(DL, RetTy); + return LT.first; + } + + // Create the expanded types that will be used to calculate the uadd_sat + // operation. + Type *ExpRetTy = VectorType::get( + ICA.getArgTypes()[0], cast(RetTy)->getElementCount()); + IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); + InstructionCost Cost = + thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, + CmpInst::ICMP_ULT, CostKind); + return Cost; + } } // Assume that we need to scalarize this intrinsic. @@ -1560,7 +1637,7 @@ public: // Library call cost - other than size, make it expensive. unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; - SmallVector ISDs; + unsigned ISD = 0; switch (IID) { default: { // Scalable vectors cannot be scalarized, so return Invalid. 
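As a concrete check of the powi cost model above (an illustration, not text
from the patch): for powi(x, 13) the exponent is 0b1101, so ActiveBits = 4 and
PopCount = 3, giving (4 + 3 - 2) = 5 FMul operations. That matches the
repeated-squaring expansion in SelectionDAGBuilder: three squarings (x^2, x^4,
x^8) plus two multiplies to combine the set bits, and a negative exponent adds
one FDiv for the final reciprocal.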
@@ -1605,82 +1682,82 @@ public: // Look for intrinsics that can be lowered directly or turned into a scalar // intrinsic call. case Intrinsic::sqrt: - ISDs.push_back(ISD::FSQRT); + ISD = ISD::FSQRT; break; case Intrinsic::sin: - ISDs.push_back(ISD::FSIN); + ISD = ISD::FSIN; break; case Intrinsic::cos: - ISDs.push_back(ISD::FCOS); + ISD = ISD::FCOS; break; case Intrinsic::exp: - ISDs.push_back(ISD::FEXP); + ISD = ISD::FEXP; break; case Intrinsic::exp2: - ISDs.push_back(ISD::FEXP2); + ISD = ISD::FEXP2; break; case Intrinsic::log: - ISDs.push_back(ISD::FLOG); + ISD = ISD::FLOG; break; case Intrinsic::log10: - ISDs.push_back(ISD::FLOG10); + ISD = ISD::FLOG10; break; case Intrinsic::log2: - ISDs.push_back(ISD::FLOG2); + ISD = ISD::FLOG2; break; case Intrinsic::fabs: - ISDs.push_back(ISD::FABS); + ISD = ISD::FABS; break; case Intrinsic::canonicalize: - ISDs.push_back(ISD::FCANONICALIZE); + ISD = ISD::FCANONICALIZE; break; case Intrinsic::minnum: - ISDs.push_back(ISD::FMINNUM); + ISD = ISD::FMINNUM; break; case Intrinsic::maxnum: - ISDs.push_back(ISD::FMAXNUM); + ISD = ISD::FMAXNUM; break; case Intrinsic::minimum: - ISDs.push_back(ISD::FMINIMUM); + ISD = ISD::FMINIMUM; break; case Intrinsic::maximum: - ISDs.push_back(ISD::FMAXIMUM); + ISD = ISD::FMAXIMUM; break; case Intrinsic::copysign: - ISDs.push_back(ISD::FCOPYSIGN); + ISD = ISD::FCOPYSIGN; break; case Intrinsic::floor: - ISDs.push_back(ISD::FFLOOR); + ISD = ISD::FFLOOR; break; case Intrinsic::ceil: - ISDs.push_back(ISD::FCEIL); + ISD = ISD::FCEIL; break; case Intrinsic::trunc: - ISDs.push_back(ISD::FTRUNC); + ISD = ISD::FTRUNC; break; case Intrinsic::nearbyint: - ISDs.push_back(ISD::FNEARBYINT); + ISD = ISD::FNEARBYINT; break; case Intrinsic::rint: - ISDs.push_back(ISD::FRINT); + ISD = ISD::FRINT; break; case Intrinsic::round: - ISDs.push_back(ISD::FROUND); + ISD = ISD::FROUND; break; case Intrinsic::roundeven: - ISDs.push_back(ISD::FROUNDEVEN); + ISD = ISD::FROUNDEVEN; break; case Intrinsic::pow: - ISDs.push_back(ISD::FPOW); + ISD = ISD::FPOW; break; case Intrinsic::fma: - ISDs.push_back(ISD::FMA); + ISD = ISD::FMA; break; case Intrinsic::fmuladd: - ISDs.push_back(ISD::FMA); + ISD = ISD::FMA; break; case Intrinsic::experimental_constrained_fmuladd: - ISDs.push_back(ISD::STRICT_FMA); + ISD = ISD::STRICT_FMA; break; // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: @@ -1897,23 +1974,49 @@ public: BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); return Cost; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (Tys.empty()) + break; + Type *FromTy = Tys[0]; + bool IsSigned = IID == Intrinsic::fptosi_sat; + + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); + Cost += thisT()->getCastInstrCost( + IsSigned ? 
Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, + TTI::CastContextHint::None, CostKind); + if (IsSigned) { + Type *CondTy = RetTy->getWithNewBitWidth(1); + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); + } + return Cost; + } case Intrinsic::ctpop: - ISDs.push_back(ISD::CTPOP); + ISD = ISD::CTPOP; // In case of legalization use TCC_Expensive. This is cheaper than a // library call but still not a cheap instruction. SingleCallCost = TargetTransformInfo::TCC_Expensive; break; case Intrinsic::ctlz: - ISDs.push_back(ISD::CTLZ); + ISD = ISD::CTLZ; break; case Intrinsic::cttz: - ISDs.push_back(ISD::CTTZ); + ISD = ISD::CTTZ; break; case Intrinsic::bswap: - ISDs.push_back(ISD::BSWAP); + ISD = ISD::BSWAP; break; case Intrinsic::bitreverse: - ISDs.push_back(ISD::BITREVERSE); + ISD = ISD::BITREVERSE; break; } @@ -1921,38 +2024,25 @@ public: std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); - SmallVector LegalCost; - SmallVector CustomCost; - for (unsigned ISD : ISDs) { - if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { - if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && - TLI->isFAbsFree(LT.second)) { - return 0; - } - - // The operation is legal. Assume it costs 1. - // If the type is split to multiple registers, assume that there is some - // overhead to this. - // TODO: Once we have extract/insert subvector cost we need to use them. - if (LT.first > 1) - LegalCost.push_back(LT.first * 2); - else - LegalCost.push_back(LT.first * 1); - } else if (!TLI->isOperationExpand(ISD, LT.second)) { - // If the operation is custom lowered then assume - // that the code is twice as expensive. - CustomCost.push_back(LT.first * 2); + if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { + if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && + TLI->isFAbsFree(LT.second)) { + return 0; } - } - auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); - if (MinLegalCostI != LegalCost.end()) - return *MinLegalCostI; - - auto MinCustomCostI = - std::min_element(CustomCost.begin(), CustomCost.end()); - if (MinCustomCostI != CustomCost.end()) - return *MinCustomCostI; + // The operation is legal. Assume it costs 1. + // If the type is split to multiple registers, assume that there is some + // overhead to this. + // TODO: Once we have extract/insert subvector cost we need to use them. + if (LT.first > 1) + return (LT.first * 2); + else + return (LT.first * 1); + } else if (!TLI->isOperationExpand(ISD, LT.second)) { + // If the operation is custom lowered then assume + // that the code is twice as expensive. + return (LT.first * 2); + } // If we can't lower fmuladd into an FMA estimate the cost as a floating // point mul followed by an add. @@ -2061,6 +2151,11 @@ public: /// vector is reduced on each iteration. InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) { + // Targets must implement a default value for the scalable case, since + // we don't know how many lanes the vector has. 
+ if (isa(Ty)) + return InstructionCost::getInvalid(); + Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); if ((Opcode == Instruction::Or || Opcode == Instruction::And) && @@ -2159,6 +2254,11 @@ public: InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { + // Targets must implement a default value for the scalable case, since + // we don't know how many lanes the vector has. + if (isa(Ty)) + return InstructionCost::getInvalid(); + Type *ScalarTy = Ty->getElementType(); Type *ScalarCondTy = CondTy->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); diff --git a/llvm/include/llvm/CodeGen/CFIFixup.h b/llvm/include/llvm/CodeGen/CFIFixup.h new file mode 100644 index 000000000000..40e535106751 --- /dev/null +++ b/llvm/include/llvm/CodeGen/CFIFixup.h @@ -0,0 +1,38 @@ +//===-- CFIFixup.h - Insert CFI remember/restore instructions ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Contains definition of the base CFIFixup pass. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_CFIFIXUP_H +#define LLVM_CODEGEN_CFIFIXUP_H + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +namespace llvm { +class CFIFixup : public MachineFunctionPass { +public: + static char ID; + + CFIFixup() : MachineFunctionPass(ID) { + initializeCFIFixupPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // namespace llvm + +#endif // LLVM_CODEGEN_CFIFIXUP_H diff --git a/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/llvm/include/llvm/CodeGen/CalcSpillWeights.h index bfd5bab3d1c0..41b7f10cfc38 100644 --- a/llvm/include/llvm/CodeGen/CalcSpillWeights.h +++ b/llvm/include/llvm/CodeGen/CalcSpillWeights.h @@ -9,7 +9,6 @@ #ifndef LLVM_CODEGEN_CALCSPILLWEIGHTS_H #define LLVM_CODEGEN_CALCSPILLWEIGHTS_H -#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/SlotIndexes.h" namespace llvm { @@ -65,17 +64,6 @@ class VirtRegMap; /// (re)compute li's spill weight and allocation hint. void calculateSpillWeightAndHint(LiveInterval &LI); - /// Compute future expected spill weight of a split artifact of LI - /// that will span between start and end slot indexes. - /// \param LI The live interval to be split. - /// \param Start The expected beginning of the split artifact. Instructions - /// before start will not affect the weight. - /// \param End The expected end of the split artifact. Instructions - /// after end will not affect the weight. - /// \return The expected spill weight of the split artifact. Returns - /// negative weight for unspillable LI. - float futureWeight(LiveInterval &LI, SlotIndex Start, SlotIndex End); - /// Compute spill weights and allocation hints for all virtual register /// live intervals. 
void calculateSpillWeightsAndHints(); diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h index 8dbcd6b8ab7d..90afbfc32a4e 100644 --- a/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -15,11 +15,9 @@ #define LLVM_CODEGEN_CALLINGCONVLOWER_H #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/IR/CallingConv.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Alignment.h" namespace llvm { diff --git a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h index 270f935b6738..ce278468dffc 100644 --- a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h +++ b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h @@ -19,7 +19,6 @@ namespace llvm { class BasicBlock; -class MachineBasicBlock; /// Encapsulates all of the information needed to generate a stack protector /// check, and signals to isel when initialized that one needs to be generated. /// @@ -213,6 +212,13 @@ private: MachineBasicBlock::iterator findSplitPointForStackProtector(MachineBasicBlock *BB, const TargetInstrInfo &TII); +/// Evaluates if the specified FP class test is an inversion of a simpler test. +/// An example is the test "inf|normal|subnormal|zero", which is an inversion +/// of "nan". +/// \param Test The test as specified in 'is_fpclass' intrinsic invocation. +/// \returns The inverted test, or zero, if inversion does not produce simpler +/// test. +unsigned getInvertedFPClassTest(unsigned Test); } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index f6563971f981..f4b1980b9ede 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -15,7 +15,6 @@ #ifndef LLVM_CODEGEN_CODEGENPASSBUILDER_H #define LLVM_CODEGEN_CODEGENPASSBUILDER_H -#include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -26,7 +25,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/ExpandReductions.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" @@ -35,7 +33,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" @@ -43,7 +40,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/CGPassBuilderOption.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/LoopStrengthReduce.h" @@ -51,7 +47,6 @@ #include "llvm/Transforms/Scalar/MergeICmps.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" #include "llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include @@ -668,6 +663,10 @@ void CodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // Expand reduction intrinsics 
into shuffle sequences if the target wants to. addPass(ExpandReductionsPass()); + + // Convert conditional moves to conditional jumps when profitable. + if (getOptLevel() != CodeGenOpt::None && !Opt.DisableSelectOptimize) + addPass(SelectOptimizePass()); } /// Turn exception handling constructs into something the code generators can @@ -751,7 +750,7 @@ template Error CodeGenPassBuilder::addCoreISelPasses( AddMachinePass &addPass) const { // Enable FastISel with -fast-isel, but allow that to be overridden. - TM.setO0WantsFastISel(Opt.EnableFastISelOption.getValueOr(true)); + TM.setO0WantsFastISel(Opt.EnableFastISelOption.value_or(true)); // Determine an instruction selector. enum class SelectorType { SelectionDAG, FastISel, GlobalISel }; diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index 73d39fecc268..9281ed723854 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -16,11 +16,6 @@ #define LLVM_CODEGEN_COMMANDFLAGS_H #include "llvm/ADT/FloatingPointMode.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/Triple.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/MC/MCTargetOptionsCommandFlags.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetOptions.h" #include @@ -29,6 +24,9 @@ namespace llvm { class Module; +class AttrBuilder; +class Function; +class Triple; namespace codegen { @@ -62,6 +60,8 @@ bool getEnableNoNaNsFPMath(); bool getEnableNoSignedZerosFPMath(); +bool getEnableApproxFuncFPMath(); + bool getEnableNoTrappingFPMath(); DenormalMode::DenormalModeKind getDenormalFPMath(); @@ -93,6 +93,8 @@ std::string getTrapFuncName(); bool getUseCtors(); +bool getLowerGlobalDtorsViaCxaAtExit(); + bool getRelaxELFRelocations(); bool getDataSections(); @@ -140,6 +142,8 @@ bool getDebugStrictDwarf(); unsigned getAlignLoops(); +bool getJMCInstrument(); + /// Create this object with static storage to register codegen-related command /// line options. 
struct RegisterCodeGenFlags { diff --git a/llvm/include/llvm/CodeGen/DFAPacketizer.h b/llvm/include/llvm/CodeGen/DFAPacketizer.h index 9cdaedc9e861..aba6503a6a1f 100644 --- a/llvm/include/llvm/CodeGen/DFAPacketizer.h +++ b/llvm/include/llvm/CodeGen/DFAPacketizer.h @@ -25,9 +25,7 @@ #ifndef LLVM_CODEGEN_DFAPACKETIZER_H #define LLVM_CODEGEN_DFAPACKETIZER_H -#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/Support/Automaton.h" #include #include @@ -38,6 +36,7 @@ namespace llvm { class DefaultVLIWScheduler; +class ScheduleDAGMutation; class InstrItineraryData; class MachineFunction; class MachineInstr; diff --git a/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h b/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h index 2ac9d938d281..465829159e42 100644 --- a/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h +++ b/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h @@ -12,12 +12,12 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LexicalScopes.h" #include namespace llvm { class DILocation; +class LexicalScopes; class DINode; class MachineFunction; class MachineInstr; diff --git a/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h b/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h index abeba62707c1..f19d321793e9 100644 --- a/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h +++ b/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h @@ -9,7 +9,7 @@ #ifndef LLVM_CODEGEN_DWARFSTRINGPOOLENTRY_H #define LLVM_CODEGEN_DWARFSTRINGPOOLENTRY_H -#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/StringMap.h" namespace llvm { @@ -20,49 +20,91 @@ class MCSymbol; struct DwarfStringPoolEntry { static constexpr unsigned NotIndexed = -1; - MCSymbol *Symbol; - uint64_t Offset; - unsigned Index; + MCSymbol *Symbol = nullptr; + uint64_t Offset = 0; + unsigned Index = 0; bool isIndexed() const { return Index != NotIndexed; } }; -/// String pool entry reference. +/// DwarfStringPoolEntryRef: Dwarf string pool entry reference. +/// +/// Dwarf string pool entry keeps string value and its data. +/// There are two variants how data are represented: +/// +/// 1. By value - StringMapEntry. +/// 2. By pointer - StringMapEntry. +/// +/// The "By pointer" variant allows for reducing memory usage for the case +/// when string pool entry does not have data: it keeps the null pointer +/// and so no need to waste space for the full DwarfStringPoolEntry. +/// It is recommended to use "By pointer" variant if not all entries +/// of dwarf string pool have corresponding DwarfStringPoolEntry. + class DwarfStringPoolEntryRef { - PointerIntPair *, 1, bool> - MapEntryAndIndexed; + /// Pointer type for "By value" string entry. + using ByValStringEntryPtr = const StringMapEntry *; - const StringMapEntry *getMapEntry() const { - return MapEntryAndIndexed.getPointer(); - } + /// Pointer type for "By pointer" string entry. + using ByPtrStringEntryPtr = const StringMapEntry *; + + /// Pointer to the dwarf string pool Entry. + PointerUnion MapEntry = nullptr; public: DwarfStringPoolEntryRef() = default; - DwarfStringPoolEntryRef(const StringMapEntry &Entry, - bool Indexed) - : MapEntryAndIndexed(&Entry, Indexed) {} - explicit operator bool() const { return getMapEntry(); } + /// ASSUMPTION: DwarfStringPoolEntryRef keeps pointer to \p Entry, + /// thus specified entry mustn`t be reallocated. 
+ DwarfStringPoolEntryRef(const StringMapEntry &Entry) + : MapEntry(&Entry) {} + + /// ASSUMPTION: DwarfStringPoolEntryRef keeps pointer to \p Entry, + /// thus specified entry mustn`t be reallocated. + DwarfStringPoolEntryRef(const StringMapEntry &Entry) + : MapEntry(&Entry) { + assert(MapEntry.get()->second != nullptr); + } + + explicit operator bool() const { return !MapEntry.isNull(); } + + /// \returns symbol for the dwarf string. MCSymbol *getSymbol() const { - assert(getMapEntry()->second.Symbol && "No symbol available!"); - return getMapEntry()->second.Symbol; + assert(getEntry().Symbol && "No symbol available!"); + return getEntry().Symbol; } - uint64_t getOffset() const { return getMapEntry()->second.Offset; } - bool isIndexed() const { return MapEntryAndIndexed.getInt(); } + + /// \returns offset for the dwarf string. + uint64_t getOffset() const { return getEntry().Offset; } + + /// \returns index for the dwarf string. unsigned getIndex() const { - assert(isIndexed()); - assert(getMapEntry()->getValue().isIndexed()); - return getMapEntry()->second.Index; + assert(getEntry().isIndexed() && "Index is not set!"); + return getEntry().Index; + } + + /// \returns string. + StringRef getString() const { + if (MapEntry.is()) + return MapEntry.get()->first(); + + return MapEntry.get()->first(); + } + + /// \returns the entire string pool entry for convenience. + const DwarfStringPoolEntry &getEntry() const { + if (MapEntry.is()) + return MapEntry.get()->second; + + return *MapEntry.get()->second; } - StringRef getString() const { return getMapEntry()->first(); } - /// Return the entire string pool entry for convenience. - DwarfStringPoolEntry getEntry() const { return getMapEntry()->getValue(); } bool operator==(const DwarfStringPoolEntryRef &X) const { - return getMapEntry() == X.getMapEntry(); + return MapEntry.getOpaqueValue() == X.MapEntry.getOpaqueValue(); } + bool operator!=(const DwarfStringPoolEntryRef &X) const { - return getMapEntry() != X.getMapEntry(); + return MapEntry.getOpaqueValue() != X.MapEntry.getOpaqueValue(); } }; diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h index 775698a66ada..8be97d2c2095 100644 --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -24,15 +24,15 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/MachineValueType.h" -#include #include #include namespace llvm { class AllocaInst; +class Instruction; +class IntrinsicInst; class BasicBlock; class CallInst; class Constant; @@ -212,6 +212,7 @@ protected: const TargetRegisterInfo &TRI; const TargetLibraryInfo *LibInfo; bool SkipTargetIndependentISel; + bool UseInstrRefDebugInfo = false; /// The position of the last instruction for materializing constants /// for use in the current block. It resets to EmitStartPt when it makes sense @@ -318,6 +319,12 @@ public: /// Reset InsertPt to the given old insert position. void leaveLocalValueArea(SavePoint Old); + /// Signal whether instruction referencing variable locations are desired for + /// this function's debug-info. 
+ void useInstrRefDebugInfo(bool Flag) { + UseInstrRefDebugInfo = Flag; + } + protected: explicit FastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo, diff --git a/llvm/include/llvm/CodeGen/FaultMaps.h b/llvm/include/llvm/CodeGen/FaultMaps.h index 8a8b1d2e6008..c228bb895edd 100644 --- a/llvm/include/llvm/CodeGen/FaultMaps.h +++ b/llvm/include/llvm/CodeGen/FaultMaps.h @@ -10,7 +10,6 @@ #define LLVM_CODEGEN_FAULTMAPS_H #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Endian.h" #include #include diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index 524730d53694..f8156ce73196 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -101,6 +101,10 @@ public: // Value was lowered to tied def and gc.relocate should be replaced with // copy from vreg. VReg, + // Value was lowered to tied def and gc.relocate should be replaced with + // SDValue kept in StatepointLoweringInfo structure. This valid for local + // relocates only. + SDValueNode, } type = NoRelocate; // Payload contains either frame index of the stack slot in which the value // was spilled, or virtual register which contains the re-definition. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CSEMIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/CSEMIRBuilder.h index 4f95335db74b..4d9694347f17 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CSEMIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CSEMIRBuilder.h @@ -13,10 +13,10 @@ #define LLVM_CODEGEN_GLOBALISEL_CSEMIRBUILDER_H #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" namespace llvm { +class GISelInstProfileBuilder; /// Defines a builder that does CSE of MachineInstructions using GISelCSEInfo. /// Eg usage. 
/// diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index f9663fadb868..9bf1c134618c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -17,25 +17,26 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetCallingConv.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MachineValueType.h" #include #include namespace llvm { +class AttributeList; class CallBase; class DataLayout; class Function; class FunctionLoweringInfo; class MachineIRBuilder; +class MachineFunction; struct MachinePointerInfo; class MachineRegisterInfo; class TargetLowering; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h index 795686980842..8c295428afe8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h @@ -15,7 +15,6 @@ #define LLVM_CODEGEN_GLOBALISEL_COMBINER_H #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { class MachineRegisterInfo; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 45c27c25aea0..73edc3c37970 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -17,16 +17,20 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_COMBINERHELPER_H #define LLVM_CODEGEN_GLOBALISEL_COMBINERHELPER_H -#include "llvm/ADT/APFloat.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Register.h" -#include "llvm/Support/Alignment.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include namespace llvm { class GISelChangeObserver; +class APFloat; +class APInt; +class GPtrAdd; +class GStore; +class GZExtLoad; class MachineIRBuilder; class MachineInstrBuilder; class MachineRegisterInfo; @@ -124,10 +128,20 @@ public: const TargetLowering &getTargetLowering() const; + /// \returns true if the combiner is running pre-legalization. + bool isPreLegalize() const; + + /// \returns true if \p Query is legal on the target. + bool isLegal(const LegalityQuery &Query) const; + /// \return true if the combine is running prior to legalization, or if \p /// Query is legal on the target. bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const; + /// \return true if the combine is running prior to legalization, or if \p Ty + /// is a legal integer constant type on the target. + bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const; + /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const; @@ -529,6 +543,13 @@ public: /// Combine G_UREM x, (known power of 2) to an add and bitmasking. void applySimplifyURemByPow2(MachineInstr &MI); + /// Push a binary operator through a select on constants. 
+ /// + /// binop (select cond, K0, K1), K2 -> + /// select cond, (binop K0, K2), (binop K1, K2) + bool matchFoldBinOpIntoSelect(MachineInstr &MI, unsigned &SelectOpNo); + bool applyFoldBinOpIntoSelect(MachineInstr &MI, const unsigned &SelectOpNo); + bool matchCombineInsertVecElts(MachineInstr &MI, SmallVectorImpl &MatchInfo); @@ -645,6 +666,14 @@ public: /// (G_SMULO x, 2) -> (G_SADDO x, x) bool matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Match: + /// (G_*MULO x, 0) -> 0 + no carry out + bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo); + + /// Match: + /// (G_*ADDO x, 0) -> x + no carry out + bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Transform (fadd x, fneg(y)) -> (fsub x, y) /// (fadd fneg(x), y) -> (fsub y, x) /// (fsub x, fneg(y)) -> (fadd x, y) @@ -702,6 +731,15 @@ public: bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Fold boolean selects to logical operations. + bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo); + + bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info); + + /// Transform G_ADD(x, G_SUB(y, x)) to y. + /// Transform G_ADD(G_SUB(y, x), x) to y. + bool matchAddSubSameReg(MachineInstr &MI, Register &Src); + private: /// Given a non-indexed load or store instruction \p MI, find an offset that /// can be usefully and legally folded into it as a post-indexing operation. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h index 7d198fada411..3ec6a1da201e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h @@ -28,7 +28,7 @@ class GISelWorkList { SmallVector Worklist; DenseMap WorklistMap; -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS bool Finalized = true; #endif @@ -49,7 +49,7 @@ public: // of most passes. void deferred_insert(MachineInstr *I) { Worklist.push_back(I); -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS Finalized = false; #endif } @@ -65,21 +65,25 @@ public: for (unsigned i = 0; i < Worklist.size(); ++i) if (!WorklistMap.try_emplace(Worklist[i], i).second) llvm_unreachable("Duplicate elements in the list"); -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS Finalized = true; #endif } /// Add the specified instruction to the worklist if it isn't already in it. void insert(MachineInstr *I) { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS assert(Finalized && "GISelWorkList used without finalizing"); +#endif if (WorklistMap.try_emplace(I, Worklist.size()).second) Worklist.push_back(I); } /// Remove I from the worklist if it exists. void remove(const MachineInstr *I) { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS assert((Finalized || WorklistMap.empty()) && "Neither finalized nor empty"); +#endif auto It = WorklistMap.find(I); if (It == WorklistMap.end()) return; // Not in worklist. 
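The Finalized flag guarded above enforces a two-phase protocol: bulk
deferred_insert() calls, a single finalize(), and only then the map-backed
insert()/remove()/pop_back_val(). A minimal usage sketch (hypothetical driver
code, not part of the patch; MF and the per-instruction visit() helper are
assumed):

  GISelWorkList<512> WorkList;
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      WorkList.deferred_insert(&MI); // cheap appends; dedup map not built yet
  WorkList.finalize();               // builds WorklistMap, enables insert()
  while (!WorkList.empty())
    visit(*WorkList.pop_back_val());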
@@ -96,7 +100,9 @@ public: } MachineInstr *pop_back_val() { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS assert(Finalized && "GISelWorkList used without finalizing"); +#endif MachineInstr *I; do { I = Worklist.pop_back_val(); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 7103656365b1..58fe48200e73 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H #define LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H +#include "llvm/IR/Instructions.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -226,6 +227,37 @@ public: } }; +/// Represent a G_ICMP or G_FCMP. +class GAnyCmp : public GenericMachineInstr { +public: + CmpInst::Predicate getCond() const { + return static_cast(getOperand(1).getPredicate()); + } + Register getLHSReg() const { return getReg(2); } + Register getRHSReg() const { return getReg(3); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP || + MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + +/// Represent a G_ICMP. +class GICmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP; + } +}; + +/// Represent a G_FCMP. +class GFCmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index ebe16cd4f58c..5e7428a5edc5 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -22,11 +22,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/SwiftErrorValueTracking.h" #include "llvm/CodeGen/SwitchLoweringUtils.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" #include @@ -248,12 +247,6 @@ private: bool translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder); - /// Returns true if the value should be split into multiple LLTs. - /// If \p Offsets is given then the split type's offsets will be stored in it. - /// If \p Offsets is not empty it will be cleared first. - bool valueIsSplit(const Value &V, - SmallVectorImpl *Offsets = nullptr); - /// Common code for translating normal calls or invokes. 
bool translateCallBase(const CallBase &CB, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h index 4a72621ec61e..60c7694725a5 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h @@ -13,8 +13,10 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_INSTRUCTIONSELECT_H #define LLVM_CODEGEN_GLOBALISEL_INSTRUCTIONSELECT_H -#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CodeGen.h" namespace llvm { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index 03f4f3bf0b19..8ea45e576e4d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -18,12 +18,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Support/CodeGenCoverage.h" +#include "llvm/IR/Function.h" #include "llvm/Support/LowLevelTypeImpl.h" #include #include @@ -34,6 +31,10 @@ namespace llvm { +class BlockFrequencyInfo; +class CodeGenCoverage; +class MachineBasicBlock; +class ProfileSummaryInfo; class APInt; class APFloat; class GISelKnownBits; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index bc9f952146c2..c06b33d11170 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -17,16 +17,17 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/CodeGenCoverage.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -673,7 +674,7 @@ bool InstructionSelector::executeMatchTable( ComplexRendererFns Renderer = (ISel.*ISelInfo.ComplexPredicates[ComplexPredicateID])( State.MIs[InsnID]->getOperand(OpIdx)); - if (Renderer.hasValue()) + if (Renderer) State.Renderers[RendererID] = Renderer.getValue(); else if (handleReject() == RejectAndGiveUp) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index 38d2fe28063a..6802591b6350 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -24,10 +24,10 @@ #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Register.h" 
+#include "llvm/IR/Constants.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "legalizer" -using namespace llvm::MIPatternMatch; namespace llvm { class LegalizationArtifactCombiner { @@ -56,6 +56,7 @@ public: SmallVectorImpl &DeadInsts, SmallVectorImpl &UpdatedDefs, GISelObserverWrapper &Observer) { + using namespace llvm::MIPatternMatch; assert(MI.getOpcode() == TargetOpcode::G_ANYEXT); Builder.setInstrAndDebugLoc(MI); @@ -109,6 +110,7 @@ public: SmallVectorImpl &DeadInsts, SmallVectorImpl &UpdatedDefs, GISelObserverWrapper &Observer) { + using namespace llvm::MIPatternMatch; assert(MI.getOpcode() == TargetOpcode::G_ZEXT); Builder.setInstrAndDebugLoc(MI); @@ -170,6 +172,7 @@ public: bool tryCombineSExt(MachineInstr &MI, SmallVectorImpl &DeadInsts, SmallVectorImpl &UpdatedDefs) { + using namespace llvm::MIPatternMatch; assert(MI.getOpcode() == TargetOpcode::G_SEXT); Builder.setInstrAndDebugLoc(MI); @@ -227,6 +230,7 @@ public: SmallVectorImpl &DeadInsts, SmallVectorImpl &UpdatedDefs, GISelObserverWrapper &Observer) { + using namespace llvm::MIPatternMatch; assert(MI.getOpcode() == TargetOpcode::G_TRUNC); Builder.setInstr(MI); @@ -1281,6 +1285,8 @@ private: /// Looks through copy instructions and returns the actual /// source register. Register lookThroughCopyInstrs(Register Reg) { + using namespace llvm::MIPatternMatch; + Register TmpReg; while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg)))) { if (MRI.getType(TmpReg).isValid()) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h index c19f1d5330ba..7884b3f2ea6e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h @@ -20,11 +20,17 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_LEGALIZER_H #define LLVM_CODEGEN_GLOBALISEL_LEGALIZER_H -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { +class LegalizerInfo; +class MachineIRBuilder; +class MachineInstr; +class GISelChangeObserver; class LostDebugLocObserver; class Legalizer : public MachineFunctionPass { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 3b2f937375eb..c6c57ac07f0e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -21,14 +21,22 @@ #define LLVM_CODEGEN_GLOBALISEL_LEGALIZERHELPER_H #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/LowLevelType.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/TargetOpcodes.h" namespace llvm { // Forward declarations. +class APInt; +class GAnyLoad; +class GLoadStore; +class GStore; +class GenericMachineInstr; +class MachineFunction; +class MachineIRBuilder; +class MachineInstr; +class MachineInstrBuilder; +struct MachinePointerInfo; +template class SmallVectorImpl; class LegalizerInfo; class MachineRegisterInfo; class GISelChangeObserver; @@ -159,10 +167,6 @@ public: /// def by inserting a G_BITCAST from \p CastTy void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx); - /// Widen \p OrigReg to \p WideTy by merging to a wider type, padding with - /// G_IMPLICIT_DEF, and producing dead results. 
- Register widenWithUnmerge(LLT WideTy, Register OrigReg); - private: LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 17cb53dd2d5b..c0cad8ff675d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -14,26 +14,26 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_LEGALIZERINFO_H #define LLVM_CODEGEN_GLOBALISEL_LEGALIZERINFO_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/LowLevelTypeImpl.h" -#include "llvm/Support/raw_ostream.h" #include #include #include -#include #include namespace llvm { extern cl::opt DisableGISelLegalityCheck; +class MachineFunction; +class raw_ostream; class LegalizerHelper; class MachineInstr; class MachineRegisterInfo; @@ -327,8 +327,14 @@ LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1); /// index. LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1); -/// True iff the specified MMO index has a size that is not a power of 2 +/// True iff the specified MMO index has a size (rounded to bytes) that is not a +/// power of 2. LegalityPredicate memSizeInBytesNotPow2(unsigned MMOIdx); + +/// True iff the specified MMO index has a size that is not an even byte size, +/// or that even byte size is not a power of 2. +LegalityPredicate memSizeNotByteSizePow2(unsigned MMOIdx); + /// True iff the specified type index is a vector whose element count is not a /// power of 2. LegalityPredicate numElementsNotPow2(unsigned TypeIdx); @@ -351,6 +357,14 @@ LegalizeMutation changeElementTo(unsigned TypeIdx, unsigned FromTypeIdx); /// Keep the same scalar or element type as the given type. LegalizeMutation changeElementTo(unsigned TypeIdx, LLT Ty); +/// Keep the same scalar or element type as \p TypeIdx, but take the number of +/// elements from \p FromTypeIdx. +LegalizeMutation changeElementCountTo(unsigned TypeIdx, unsigned FromTypeIdx); + +/// Keep the same scalar or element type as \p TypeIdx, but take the number of +/// elements from \p Ty. +LegalizeMutation changeElementCountTo(unsigned TypeIdx, LLT Ty); + /// Change the scalar size or element size to have the same scalar size as type /// index \p FromIndex. Unlike changeElementTo, this discards pointer types and /// only changes the size. @@ -800,11 +814,23 @@ public: return actionIf(LegalizeAction::Unsupported, LegalityPredicates::memSizeInBytesNotPow2(0)); } + + /// Lower a memory operation if the memory size, rounded to bytes, is not a + /// power of 2. For example, this will not trigger for s1 or s7, but will for + /// s24. LegalizeRuleSet &lowerIfMemSizeNotPow2() { return actionIf(LegalizeAction::Lower, LegalityPredicates::memSizeInBytesNotPow2(0)); } + /// Lower a memory operation if the memory access size is not a round power of + /// 2 byte size. This is stricter than lowerIfMemSizeNotPow2, and more likely + /// what you want (e.g. this will lower s1, s7 and s24). 
+ LegalizeRuleSet &lowerIfMemSizeNotByteSizePow2() { + return actionIf(LegalizeAction::Lower, + LegalityPredicates::memSizeNotByteSizePow2(0)); + } + LegalizeRuleSet &customIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a custom action with a // free-form user provided Predicate properly handles all type indices: diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h index 0845c001abdb..6efe7c7c9bbd 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h @@ -17,18 +17,19 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" namespace llvm { // Forward declarations. +class AnalysisUsage; +class GStore; +class LegalizerInfo; +class MachineBasicBlock; +class MachineInstr; +class TargetLowering; +struct LegalityQuery; class MachineRegisterInfo; namespace GISelAddressing { /// Helper struct to store a base, index and offset that forms an address diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h index 1d1afff7f934..9ea0d095eeb1 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -22,11 +22,14 @@ #define LLVM_CODEGEN_GLOBALISEL_LOCALIZER_H #include "llvm/ADT/SetVector.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { // Forward declarations. 
+class AnalysisUsage; +class MachineBasicBlock; +class MachineInstr; +class MachineOperand; class MachineRegisterInfo; class TargetTransformInfo; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index daf1ff052983..1cacf96620f0 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -94,6 +94,48 @@ inline ConstantMatch m_ICst(int64_t &Cst) { return ConstantMatch(Cst); } +template +inline Optional matchConstantSplat(Register, + const MachineRegisterInfo &); + +template <> +inline Optional matchConstantSplat(Register Reg, + const MachineRegisterInfo &MRI) { + return getIConstantSplatVal(Reg, MRI); +} + +template <> +inline Optional matchConstantSplat(Register Reg, + const MachineRegisterInfo &MRI) { + return getIConstantSplatSExtVal(Reg, MRI); +} + +template struct ICstOrSplatMatch { + ConstT &CR; + ICstOrSplatMatch(ConstT &C) : CR(C) {} + bool match(const MachineRegisterInfo &MRI, Register Reg) { + if (auto MaybeCst = matchConstant(Reg, MRI)) { + CR = *MaybeCst; + return true; + } + + if (auto MaybeCstSplat = matchConstantSplat(Reg, MRI)) { + CR = *MaybeCstSplat; + return true; + } + + return false; + }; +}; + +inline ICstOrSplatMatch m_ICstOrSplat(APInt &Cst) { + return ICstOrSplatMatch(Cst); +} + +inline ICstOrSplatMatch m_ICstOrSplat(int64_t &Cst) { + return ICstOrSplatMatch(Cst); +} + struct GCstAndRegMatch { Optional &ValReg; GCstAndRegMatch(Optional &ValReg) : ValReg(ValReg) {} diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index c4c2fc076dd8..16ba568c1be9 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -13,19 +13,26 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_MACHINEIRBUILDER_H #define LLVM_CODEGEN_GLOBALISEL_MACHINEIRBUILDER_H -#include "llvm/CodeGen/GlobalISel/CSEInfo.h" -#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Module.h" namespace llvm { // Forward declarations. +class APInt; +class BlockAddress; +class Constant; +class ConstantFP; +class ConstantInt; +class DataLayout; +class GISelCSEInfo; +class GlobalValue; +class TargetRegisterClass; class MachineFunction; class MachineInstr; class TargetInstrInfo; @@ -942,22 +949,6 @@ public: /// Build and insert \p Res = IMPLICIT_DEF. MachineInstrBuilder buildUndef(const DstOp &Res); - /// Build and insert instructions to put \p Ops together at the specified \p - /// Indices to form a larger register. - /// - /// If the types of the input registers are uniform and cover the entirety of - /// \p Res then a G_MERGE_VALUES will be produced. Otherwise, an IMPLICIT_DEF - /// followed by a sequence of G_INSERT instructions is produced. - /// - /// \pre setBasicBlock or setMI must have been called. - /// \pre The final element of the sequence must not extend past the end of the - /// destination register. - /// \pre The bits defined by each Op (derived from index and scalar size) must - /// not overlap. - /// \pre \p Indices must be in ascending order of bit position.
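A minimal usage sketch for the new m_ICstOrSplat matcher added above (hypothetical helper; assumes the usual GlobalISel setup):

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// True if Reg is a G_CONSTANT, or a constant-splat build vector, whose
// value is a power of 2.
static bool isPow2CstOrSplat(llvm::Register Reg,
                             const llvm::MachineRegisterInfo &MRI) {
  using namespace llvm::MIPatternMatch;
  llvm::APInt Cst;
  return mi_match(Reg, MRI, m_ICstOrSplat(Cst)) && Cst.isPowerOf2();
}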
- void buildSequence(Register Res, ArrayRef Ops, - ArrayRef Indices); - /// Build and insert \p Res = G_MERGE_VALUES \p Op0, ... /// G_MERGE_VALUES combines the input elements contiguously into a larger /// register. @@ -1001,6 +992,11 @@ public: MachineInstrBuilder buildBuildVector(const DstOp &Res, ArrayRef Ops); + /// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ... where each OpN is + /// built with G_CONSTANT. + MachineInstrBuilder buildBuildVectorConstant(const DstOp &Res, + ArrayRef Ops); + /// Build and insert \p Res = G_BUILD_VECTOR with \p Src replicated to fill /// the number of elements MachineInstrBuilder buildSplatVector(const DstOp &Res, @@ -1442,8 +1438,8 @@ public: /// Build and insert \p Res = G_SUB \p Op0, \p Op1 /// - /// G_SUB sets \p Res to the sum of integer parameters \p Op0 and \p Op1, - /// truncated to their width. + /// G_SUB sets \p Res to the difference of integer parameters \p Op0 and + /// \p Op1, truncated to their width. /// /// \pre setBasicBlock or setMI must have been called. /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers @@ -1459,7 +1455,7 @@ public: /// Build and insert \p Res = G_MUL \p Op0, \p Op1 /// - /// G_MUL sets \p Res to the sum of integer parameters \p Op0 and \p Op1, + /// G_MUL sets \p Res to the product of integer parameters \p Op0 and \p Op1, /// truncated to their width. /// /// \pre setBasicBlock or setMI must have been called. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h index 45006eecfce6..d0918485249d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h @@ -66,10 +66,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include #include #include diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h deleted file mode 100644 index 5440d97728b4..000000000000 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h +++ /dev/null @@ -1,98 +0,0 @@ -//==-- llvm/CodeGen/GlobalISel/RegisterBank.h - Register Bank ----*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file declares the API of register banks. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_GLOBALISEL_REGISTERBANK_H -#define LLVM_CODEGEN_GLOBALISEL_REGISTERBANK_H - -#include "llvm/ADT/BitVector.h" - -namespace llvm { -// Forward declarations. -class RegisterBankInfo; -class raw_ostream; -class TargetRegisterClass; -class TargetRegisterInfo; - -/// This class implements the register bank concept. -/// Two instances of RegisterBank must have different IDs. -/// This property is enforced by the RegisterBankInfo class. -class RegisterBank { -private: - unsigned ID; - const char *Name; - unsigned Size; - BitVector ContainedRegClasses; - - /// Sentinel value used to recognize a register bank not properly - /// initialized yet.
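A short sketch of the new buildBuildVectorConstant introduced above (hypothetical snippet; assumes an initialized MachineIRBuilder):

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

// Materialize <2 x s32> <7, 7>: each element is emitted as a G_CONSTANT
// feeding a single G_BUILD_VECTOR.
static llvm::Register buildSplatOf7(llvm::MachineIRBuilder &B) {
  llvm::LLT V2S32 = llvm::LLT::fixed_vector(2, 32);
  llvm::APInt Seven(32, 7);
  return B.buildBuildVectorConstant(V2S32, {Seven, Seven}).getReg(0);
}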
- static const unsigned InvalidID; - - /// Only the RegisterBankInfo can initialize RegisterBank properly. - friend RegisterBankInfo; - -public: - RegisterBank(unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses, unsigned NumRegClasses); - - /// Get the identifier of this register bank. - unsigned getID() const { return ID; } - - /// Get a user-friendly name of this register bank. - /// Should be used only for debugging purposes. - const char *getName() const { return Name; } - - /// Get the maximal size in bits that fits in this register bank. - unsigned getSize() const { return Size; } - - /// Check whether this instance is ready to be used. - bool isValid() const; - - /// Check if this register bank is valid. In other words, - /// if it has been properly constructed. - /// - /// \note This method does not check anything when assertions are disabled. - /// - /// \return True if the check was successful. - bool verify(const TargetRegisterInfo &TRI) const; - - /// Check whether this register bank covers \p RC. - /// In other words, check if this register bank fully covers - /// the registers that \p RC contains. - /// \pre isValid() - bool covers(const TargetRegisterClass &RC) const; - - /// Check whether \p OtherRB is the same as this. - bool operator==(const RegisterBank &OtherRB) const; - bool operator!=(const RegisterBank &OtherRB) const { - return !this->operator==(OtherRB); - } - - /// Dump the register mask on dbgs() stream. - /// The dump is verbose. - void dump(const TargetRegisterInfo *TRI = nullptr) const; - - /// Print the register mask on OS. - /// If IsForDebug is false, then only the name of the register bank - /// is printed. Otherwise, all the fields are printed. - /// TRI is then used to print the name of the register classes that - /// this register bank covers. - void print(raw_ostream &OS, bool IsForDebug = false, - const TargetRegisterInfo *TRI = nullptr) const; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const RegisterBank &RegBank) { - RegBank.print(OS); - return OS; -} -} // End namespace llvm. - -#endif diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h deleted file mode 100644 index da785406bc31..000000000000 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ /dev/null @@ -1,775 +0,0 @@ -//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.h ---------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file declares the API for the register bank info. -/// This API is responsible for handling the register banks.
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_GLOBALISEL_REGISTERBANKINFO_H -#define LLVM_CODEGEN_GLOBALISEL_REGISTERBANKINFO_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/Register.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LowLevelTypeImpl.h" -#include -#include -#include - -namespace llvm { - -class MachineInstr; -class MachineRegisterInfo; -class raw_ostream; -class RegisterBank; -class TargetInstrInfo; -class TargetRegisterClass; -class TargetRegisterInfo; - -/// Holds all the information related to register banks. -class RegisterBankInfo { -public: - /// Helper struct that represents how a value is partially mapped - /// into a register. - /// The StartIdx and Length represent what region of the original - /// value this partial mapping covers. - /// This can be represented as a Mask of contiguous bits starting - /// at StartIdx bit and spanning Length bits. - /// StartIdx is the number of bits from the least significant bits. - struct PartialMapping { - /// Number of bits at which this partial mapping starts in the - /// original value. The bits are counted from least significant - /// bits to most significant bits. - unsigned StartIdx; - - /// Length of this mapping in bits. This is how many bits this - /// partial mapping covers in the original value: - /// from StartIdx to StartIdx + Length -1. - unsigned Length; - - /// Register bank where the partial value lives. - const RegisterBank *RegBank; - - PartialMapping() = default; - - /// Provide a shortcut for quickly building PartialMapping. - PartialMapping(unsigned StartIdx, unsigned Length, - const RegisterBank &RegBank) - : StartIdx(StartIdx), Length(Length), RegBank(&RegBank) {} - - /// \return the index, in the original value, of the most - /// significant bit that this partial mapping covers. - unsigned getHighBitIdx() const { return StartIdx + Length - 1; } - - /// Print this partial mapping on dbgs() stream. - void dump() const; - - /// Print this partial mapping on \p OS; - void print(raw_ostream &OS) const; - - /// Check that the Mask is compatible with the RegBank. - /// Indeed, if the RegBank cannot accommodate the "active bits" of the mask, - /// there is no way this mapping is valid. - /// - /// \note This method does not check anything when assertions are disabled. - /// - /// \return True if the check was successful. - bool verify() const; - }; - - /// Helper struct that represents how a value is mapped through - /// different register banks. - /// - /// \note: So far we do not have any users of the complex mappings - /// (mappings with more than one partial mapping), but when we do, - /// we would need to duplicate partial mappings. - /// The alternative could be to use an array of pointers of partial - /// mapping (i.e., PartialMapping **BreakDown) and duplicate the - /// pointers instead. - /// - /// E.g., - /// Let's say we have a 32-bit add and a <2 x 32-bit> vadd. We - /// can expand the - /// <2 x 32-bit> add into 2 x 32-bit add. - /// - /// Currently the TableGen-like file would look like: - /// \code - /// PartialMapping[] = { - /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first - /// // vec elt. - /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, - /// /*<2x32-bit> vadd*/ {0, 64, VPR} - /// }; // PartialMapping duplicated.
- /// - /// ValueMapping[] { - /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, - /// /*expanded vadd on 2xadd*/ {&PartialMapping[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} - /// }; - /// \endcode - /// - /// With the array of pointers, we would have: - /// \code - /// PartialMapping[] = { - /// /*32-bit add lower */ { 0, 32, GPR}, - /// /*32-bit add upper */ {32, 32, GPR}, - /// /*<2x32-bit> vadd */ { 0, 64, VPR} - /// }; // No more duplication. - /// - /// BreakDowns[] = { - /// /*AddBreakDown*/ &PartialMapping[0], - /// /*2xAddBreakDown*/ &PartialMapping[0], &PartialMapping[1], - /// /*VAddBreakDown*/ &PartialMapping[2] - /// }; // Addresses of PartialMapping duplicated (smaller). - /// - /// ValueMapping[] { - /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, - /// /*expanded vadd on 2xadd*/ {&BreakDowns[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} - /// }; - /// \endcode - /// - /// Given that a PartialMapping is actually small, the code size - /// impact is actually a degradation. Moreover the compile time will - /// be hit by the additional indirection. - /// If PartialMapping gets bigger we may reconsider. - struct ValueMapping { - /// How the value is broken down between the different register banks. - const PartialMapping *BreakDown; - - /// Number of partial mappings used to break down this value. - unsigned NumBreakDowns; - - /// The default constructor creates an invalid (isValid() == false) - /// instance. - ValueMapping() : ValueMapping(nullptr, 0) {} - - /// Initialize a ValueMapping with the given parameters. - /// \p BreakDown needs to have a lifetime at least as long - /// as this instance. - ValueMapping(const PartialMapping *BreakDown, unsigned NumBreakDowns) - : BreakDown(BreakDown), NumBreakDowns(NumBreakDowns) {} - - /// Iterators through the PartialMappings. - const PartialMapping *begin() const { return BreakDown; } - const PartialMapping *end() const { return BreakDown + NumBreakDowns; } - - /// \return true if all partial mappings are the same size and register - /// bank. - bool partsAllUniform() const; - - /// Check if this ValueMapping is valid. - bool isValid() const { return BreakDown && NumBreakDowns; } - - /// Verify that this mapping makes sense for a value of - /// \p MeaningfulBitWidth. - /// \note This method does not check anything when assertions are disabled. - /// - /// \return True if the check was successful. - bool verify(unsigned MeaningfulBitWidth) const; - - /// Print this on dbgs() stream. - void dump() const; - - /// Print this on \p OS; - void print(raw_ostream &OS) const; - }; - - /// Helper class that represents how the value of an instruction may be - /// mapped and what is the related cost of such mapping. - class InstructionMapping { - /// Identifier of the mapping. - /// This is used to communicate between the target and the optimizers - /// which mapping should be realized. - unsigned ID = InvalidMappingID; - - /// Cost of this mapping. - unsigned Cost = 0; - - /// Mapping of all the operands. - const ValueMapping *OperandsMapping = nullptr; - - /// Number of operands. - unsigned NumOperands = 0; - - const ValueMapping &getOperandMapping(unsigned i) { - assert(i < getNumOperands() && "Out of bound operand"); - return OperandsMapping[i]; - } - - public: - /// Constructor for the mapping of an instruction. - /// \p NumOperands must be equal to the number of all the operands of - /// the related instruction.
- /// The rationale is that it is more efficient for the optimizers - /// to be able to assume that the mapping of the ith operand is - /// at the index i. - InstructionMapping(unsigned ID, unsigned Cost, - const ValueMapping *OperandsMapping, - unsigned NumOperands) - : ID(ID), Cost(Cost), OperandsMapping(OperandsMapping), - NumOperands(NumOperands) { - } - - /// Default constructor. - /// Use this constructor to express that the mapping is invalid. - InstructionMapping() = default; - - /// Get the cost. - unsigned getCost() const { return Cost; } - - /// Get the ID. - unsigned getID() const { return ID; } - - /// Get the number of operands. - unsigned getNumOperands() const { return NumOperands; } - - /// Get the value mapping of the ith operand. - /// \pre The mapping for the ith operand has been set. - /// \pre The ith operand is a register. - const ValueMapping &getOperandMapping(unsigned i) const { - const ValueMapping &ValMapping = - const_cast(this)->getOperandMapping(i); - return ValMapping; - } - - /// Set the mapping for all the operands. - /// In other words, OpdsMapping should hold at least getNumOperands - /// ValueMapping. - void setOperandsMapping(const ValueMapping *OpdsMapping) { - OperandsMapping = OpdsMapping; - } - - /// Check whether this object is valid. - /// This is a lightweight check for an obviously wrong instance. - bool isValid() const { - return getID() != InvalidMappingID && OperandsMapping; - } - - /// Verify that this mapping makes sense for \p MI. - /// \pre \p MI must be connected to a MachineFunction. - /// - /// \note This method does not check anything when assertions are disabled. - /// - /// \return True if the check was successful. - bool verify(const MachineInstr &MI) const; - - /// Print this on dbgs() stream. - void dump() const; - - /// Print this on \p OS; - void print(raw_ostream &OS) const; - }; - - /// Convenient type to represent the alternatives for mapping an - /// instruction. - /// \todo When we move to TableGen this should be an array ref. - using InstructionMappings = SmallVector; - - /// Helper class used to get/create the virtual registers that will be used - /// to replace the MachineOperand when applying a mapping. - class OperandsMapper { - /// The OpIdx-th cell contains the index in NewVRegs where the VRegs of the - /// OpIdx-th operand starts. -1 means we do not have such a mapping yet. - /// Note: We use a SmallVector to avoid heap allocation for most cases. - SmallVector OpToNewVRegIdx; - - /// Hold the registers that will be used to map MI with InstrMapping. - SmallVector NewVRegs; - - /// Current MachineRegisterInfo, used to create new virtual registers. - MachineRegisterInfo &MRI; - - /// Instruction being remapped. - MachineInstr &MI; - - /// New mapping of the instruction. - const InstructionMapping &InstrMapping; - - /// Constant value identifying that the index in OpToNewVRegIdx - /// for an operand has not been set yet. - static const int DontKnowIdx; - - /// Get the range in NewVRegs to store all the partial - /// values for the \p OpIdx-th operand. - /// - /// \return The iterator range for the space created. - // - /// \pre getMI().getOperand(OpIdx).isReg() - iterator_range::iterator> - getVRegsMem(unsigned OpIdx); - - /// Get the end iterator for a range starting at \p StartIdx and - /// spanning \p NumVal in NewVRegs.
- /// \pre StartIdx + NumVal <= NewVRegs.size() - SmallVectorImpl::const_iterator - getNewVRegsEnd(unsigned StartIdx, unsigned NumVal) const; - SmallVectorImpl::iterator getNewVRegsEnd(unsigned StartIdx, - unsigned NumVal); - - public: - /// Create an OperandsMapper that will hold the information to apply \p - /// InstrMapping to \p MI. - /// \pre InstrMapping.verify(MI) - OperandsMapper(MachineInstr &MI, const InstructionMapping &InstrMapping, - MachineRegisterInfo &MRI); - - /// \name Getters. - /// @{ - /// The MachineInstr being remapped. - MachineInstr &getMI() const { return MI; } - - /// The final mapping of the instruction. - const InstructionMapping &getInstrMapping() const { return InstrMapping; } - - /// The MachineRegisterInfo we used to realize the mapping. - MachineRegisterInfo &getMRI() const { return MRI; } - /// @} - - /// Create as many new virtual registers as needed for the mapping of the \p - /// OpIdx-th operand. - /// The number of registers is determined by the number of breakdowns for the - /// related operand in the instruction mapping. - /// The type of the new registers is a plain scalar of the right size. - /// The proper type is expected to be set when the mapping is applied to - /// the instruction(s) that realizes the mapping. - /// - /// \pre getMI().getOperand(OpIdx).isReg() - /// - /// \post All the partial mappings of the \p OpIdx-th operand have been - /// assigned a new virtual register. - void createVRegs(unsigned OpIdx); - - /// Set the virtual register of the \p PartialMapIdx-th partial mapping of - /// the OpIdx-th operand to \p NewVReg. - /// - /// \pre getMI().getOperand(OpIdx).isReg() - /// \pre getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() > - /// PartialMapIdx - /// \pre NewReg != 0 - /// - /// \post the \p PartialMapIdx-th register of the value mapping of the \p - /// OpIdx-th operand has been set. - void setVRegs(unsigned OpIdx, unsigned PartialMapIdx, Register NewVReg); - - /// Get all the virtual registers required to map the \p OpIdx-th operand of - /// the instruction. - /// - /// This returns an empty range when createVRegs or setVRegs has not been - /// called. - /// The iterator may be invalidated by a call to setVRegs or createVRegs. - /// - /// When \p ForDebug is true, we will not check that the list of new virtual - /// registers does not contain uninitialized values. - /// - /// \pre getMI().getOperand(OpIdx).isReg() - /// \pre ForDebug || All partial mappings have been assigned a register - iterator_range::const_iterator> - getVRegs(unsigned OpIdx, bool ForDebug = false) const; - - /// Print this operands mapper on dbgs() stream. - void dump() const; - - /// Print this operands mapper on \p OS stream. - void print(raw_ostream &OS, bool ForDebug = false) const; - }; - -protected: - /// Hold the set of supported register banks. - RegisterBank **RegBanks; - - /// Total number of register banks. - unsigned NumRegBanks; - - /// Keep dynamically allocated PartialMapping in a separate map. - /// This shouldn't be needed when everything gets TableGen'ed. - mutable DenseMap> - MapOfPartialMappings; - - /// Keep dynamically allocated ValueMapping in a separate map. - /// This shouldn't be needed when everything gets TableGen'ed. - mutable DenseMap> - MapOfValueMappings; - - /// Keep dynamically allocated array of ValueMapping in a separate map. - /// This shouldn't be needed when everything gets TableGen'ed. - mutable DenseMap> - MapOfOperandsMappings; - - /// Keep dynamically allocated InstructionMapping in a separate map.
- /// This shouldn't be needed when everything gets TableGen'ed. - mutable DenseMap> - MapOfInstructionMappings; - - /// Getting the minimal register class of a physreg is expensive. - /// Cache this information as we get it. - mutable DenseMap PhysRegMinimalRCs; - - /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks - /// RegisterBank instances. - RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); - - /// This constructor is meaningless. - /// It just provides a default constructor that can be used at link time - /// when GlobalISel is not built. - /// That way, targets can still inherit from this class without doing - /// crazy gymnastics to avoid link time failures. - /// \note That works because the constructor is inlined. - RegisterBankInfo() { - llvm_unreachable("This constructor should not be executed"); - } - - /// Get the register bank identified by \p ID. - RegisterBank &getRegBank(unsigned ID) { - assert(ID < getNumRegBanks() && "Accessing an unknown register bank"); - return *RegBanks[ID]; - } - - /// Get the MinimalPhysRegClass for Reg. - /// \pre Reg is a physical register. - const TargetRegisterClass & - getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI) const; - - /// Try to get the mapping of \p MI. - /// See getInstrMapping for more details on what a mapping represents. - /// - /// Unlike getInstrMapping the returned InstructionMapping may be invalid - /// (isValid() == false). - /// This means that the target independent code is not smart enough - /// to get the mapping of \p MI and thus, the target has to provide the - /// information for \p MI. - /// - /// This implementation is able to get the mapping of: - /// - Target specific instructions by looking at the encoding constraints. - /// - Any instruction if all the register operands have already been assigned - /// a register, a register class, or a register bank. - /// - Copies and phis if at least one of the operands has been assigned a - /// register, a register class, or a register bank. - /// In other words, this method will likely fail to find a mapping for - /// any generic opcode that has not been lowered by target specific code. - const InstructionMapping &getInstrMappingImpl(const MachineInstr &MI) const; - - /// Get the uniquely generated PartialMapping for the - /// given arguments. - const PartialMapping &getPartialMapping(unsigned StartIdx, unsigned Length, - const RegisterBank &RegBank) const; - - /// \name Methods to get a uniquely generated ValueMapping. - /// @{ - - /// The most common ValueMapping consists of a single PartialMapping. - /// Feature a method for that. - const ValueMapping &getValueMapping(unsigned StartIdx, unsigned Length, - const RegisterBank &RegBank) const; - - /// Get the ValueMapping for the given arguments. - const ValueMapping &getValueMapping(const PartialMapping *BreakDown, - unsigned NumBreakDowns) const; - /// @} - - /// \name Methods to get a uniquely generated array of ValueMapping. - /// @{ - - /// Get the uniquely generated array of ValueMapping for the - /// elements between \p Begin and \p End. - /// - /// Elements that are nullptr will be replaced by - /// invalid ValueMapping (ValueMapping::isValid == false). - /// - /// \pre The pointers on ValueMapping between \p Begin and \p End - /// must uniquely identify a ValueMapping. Otherwise, there is no - /// guarantee that the returned instance will be unique, i.e., another - /// OperandsMapping could have the same content.
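A minimal sketch (hypothetical target, illustration only) of how the uniquing getters above typically combine inside getInstrMapping, declared next: one ValueMapping per operand, then one uniqued InstructionMapping, here for a 32-bit G_ADD whose three operands all live in a placeholder GPR bank:

const RegisterBankInfo::InstructionMapping &
MyRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  // MyGPRBankID is a placeholder for a target-defined bank ID.
  const ValueMapping *GPRValueMapping =
      &getValueMapping(/*StartIdx=*/0, /*Length=*/32, getRegBank(MyGPRBankID));
  return getInstructionMapping(
      DefaultMappingID, /*Cost=*/1,
      getOperandsMapping({GPRValueMapping, GPRValueMapping, GPRValueMapping}),
      /*NumOperands=*/3);
}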
- template - const ValueMapping *getOperandsMapping(Iterator Begin, Iterator End) const; - - /// Get the uniquely generated array of ValueMapping for the - /// elements of \p OpdsMapping. - /// - /// Elements of \p OpdsMapping that are nullptr will be replaced by - /// invalid ValueMapping (ValueMapping::isValid == false). - const ValueMapping *getOperandsMapping( - const SmallVectorImpl &OpdsMapping) const; - - /// Get the uniquely generated array of ValueMapping for the - /// given arguments. - /// - /// Arguments that are nullptr will be replaced by invalid - /// ValueMapping (ValueMapping::isValid == false). - const ValueMapping *getOperandsMapping( - std::initializer_list OpdsMapping) const; - /// @} - - /// \name Methods to get a uniquely generated InstructionMapping. - /// @{ - -private: - /// Method to get a uniquely generated InstructionMapping. - const InstructionMapping & - getInstructionMappingImpl(bool IsInvalid, unsigned ID = InvalidMappingID, - unsigned Cost = 0, - const ValueMapping *OperandsMapping = nullptr, - unsigned NumOperands = 0) const; - -public: - /// Method to get a uniquely generated InstructionMapping. - const InstructionMapping & - getInstructionMapping(unsigned ID, unsigned Cost, - const ValueMapping *OperandsMapping, - unsigned NumOperands) const { - return getInstructionMappingImpl(/*IsInvalid*/ false, ID, Cost, - OperandsMapping, NumOperands); - } - - /// Method to get a uniquely generated invalid InstructionMapping. - const InstructionMapping &getInvalidInstructionMapping() const { - return getInstructionMappingImpl(/*IsInvalid*/ true); - } - /// @} - - /// Get the register bank for the \p OpIdx-th operand of \p MI from - /// the encoding constraints, if any. - /// - /// \return A register bank that covers the register class of the - /// related encoding constraints or nullptr if \p MI did not provide - /// enough information to deduce it. - const RegisterBank * - getRegBankFromConstraints(const MachineInstr &MI, unsigned OpIdx, - const TargetInstrInfo &TII, - const MachineRegisterInfo &MRI) const; - - /// Helper method to apply something that is like the default mapping. - /// Basically, that means that \p OpdMapper.getMI() is left untouched - /// aside from the reassignment of the register operands that have been - /// remapped. - /// - /// The types of all the new registers that have been created by the - /// mapper are properly remapped to the types of the original registers - /// they replace. In other words, the semantics of the instruction do - /// not change, only the register banks. - /// - /// If the mapping of one of the operands spans several registers, this - /// method will abort as this is not like a default mapping anymore. - /// - /// \pre For OpIdx in {0..\p OpdMapper.getMI().getNumOperands()) - /// the range OpdMapper.getVRegs(OpIdx) is empty or of size 1. - static void applyDefaultMapping(const OperandsMapper &OpdMapper); - - /// See ::applyMapping. - virtual void applyMappingImpl(const OperandsMapper &OpdMapper) const { - llvm_unreachable("The target has to implement that part"); - } - -public: - virtual ~RegisterBankInfo() = default; - - /// Get the register bank identified by \p ID. - const RegisterBank &getRegBank(unsigned ID) const { - return const_cast(this)->getRegBank(ID); - } - - /// Get the register bank of \p Reg. - /// If Reg has not been assigned a register, a register class, - /// or a register bank, then this returns nullptr.
- /// - /// \pre Reg != 0 (NoRegister) - const RegisterBank *getRegBank(Register Reg, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const; - - /// Get the total number of register banks. - unsigned getNumRegBanks() const { return NumRegBanks; } - - /// Get a register bank that covers \p RC. - /// - /// \pre \p RC is a user-defined register class (as opposed to one - /// generated by TableGen). - /// - /// \note The mapping RC -> RegBank could be built while adding the - /// coverage for the register banks. However, we do not do it, because, - /// at least for now, we only need this information for register classes - /// that are used in the description of instructions. In other words, - /// there are just a handful of them and we do not want to waste space. - /// - /// \todo This should be TableGen'ed. - virtual const RegisterBank & - getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - llvm_unreachable("The target must override this method"); - } - - /// Get the cost of a copy from \p B to \p A, or put differently, - /// get the cost of A = COPY B. Since register banks may cover - /// different sizes, \p Size specifies the size in bits - /// that will be copied around. - /// - /// \note Since this is a copy, both registers have the same size. - virtual unsigned copyCost(const RegisterBank &A, const RegisterBank &B, - unsigned Size) const { - // Optimistically assume that copies are coalesced. I.e., when - // they are on the same bank, they are free. - // Otherwise assume a non-zero cost of 1. The targets are supposed - // to override that properly anyway if they care. - return &A != &B; - } - - /// \returns true if emitting a copy from \p Src to \p Dst is impossible. - bool cannotCopy(const RegisterBank &Dst, const RegisterBank &Src, - unsigned Size) const { - return copyCost(Dst, Src, Size) == std::numeric_limits::max(); - } - - /// Get the cost of using \p ValMapping to decompose a register. This is - /// similar to ::copyCost, except for cases where multiple copy-like - /// operations need to be inserted. If the register is used as a source - /// operand and already has a bank assigned, \p CurBank is non-null. - virtual unsigned getBreakDownCost(const ValueMapping &ValMapping, - const RegisterBank *CurBank = nullptr) const { - return std::numeric_limits::max(); - } - - /// Constrain the (possibly generic) virtual register \p Reg to \p RC. - /// - /// \pre \p Reg is a virtual register that either has a bank or a class. - /// \returns The constrained register class, or nullptr if there is none. - /// \note This is a generic variant of MachineRegisterInfo::constrainRegClass - /// \note Use MachineRegisterInfo::constrainRegAttrs instead for any non-isel - /// purpose, including non-select passes of GlobalISel - static const TargetRegisterClass * - constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, - MachineRegisterInfo &MRI); - - /// Identifier used when the related instruction mapping instance - /// is generated by target independent code. - /// Make sure not to use that identifier to avoid possible collisions. - static const unsigned DefaultMappingID; - - /// Identifier used when the related instruction mapping instance - /// is generated by the default constructor. - /// Make sure not to use that identifier. - static const unsigned InvalidMappingID; - - /// Get the mapping of the different operands of \p MI - /// on the register bank. - /// This mapping should be the direct translation of \p MI.
- /// In other words, when \p MI is mapped with the returned mapping, - /// only the register banks of the operands of \p MI need to be updated. - /// In particular, neither the opcode nor the type of \p MI needs to be - /// updated for this direct mapping. - /// - /// The target independent implementation gives a mapping based on - /// the register classes for the target specific opcode. - /// It uses the ID RegisterBankInfo::DefaultMappingID for that mapping. - /// Make sure you do not use that ID for the alternative mapping - /// for MI. See getInstrAlternativeMappings for the alternative - /// mappings. - /// - /// For instance, if \p MI is a vector add, the mapping should - /// not be a scalarization of the add. - /// - /// \post returnedVal.verify(MI). - /// - /// \note If returnedVal does not verify MI, this would probably mean - /// that the target does not support that instruction. - virtual const InstructionMapping & - getInstrMapping(const MachineInstr &MI) const; - - /// Get the alternative mappings for \p MI. - /// Alternative in the sense different from getInstrMapping. - virtual InstructionMappings - getInstrAlternativeMappings(const MachineInstr &MI) const; - - /// Get the possible mappings for \p MI. - /// A mapping defines where the different operands may live and at what cost. - /// For instance, let us consider: - /// v0(16) = G_ADD <2 x i8> v1, v2 - /// The possible mappings could be: - /// - /// {/*ID*/VectorAdd, /*Cost*/1, /*v0*/{(0xFFFF, VPR)}, /*v1*/{(0xFFFF, VPR)}, - /// /*v2*/{(0xFFFF, VPR)}} - /// {/*ID*/ScalarAddx2, /*Cost*/2, /*v0*/{(0x00FF, GPR),(0xFF00, GPR)}, - /// /*v1*/{(0x00FF, GPR),(0xFF00, GPR)}, - /// /*v2*/{(0x00FF, GPR),(0xFF00, GPR)}} - /// - /// \note The first alternative of the returned mapping should be the - /// direct translation of \p MI's current form. - /// - /// \post !returnedVal.empty(). - InstructionMappings getInstrPossibleMappings(const MachineInstr &MI) const; - - /// Apply \p OpdMapper.getInstrMapping() to \p OpdMapper.getMI(). - /// After this call \p OpdMapper.getMI() may not be valid anymore. - /// \p OpdMapper.getInstrMapping().getID() carries the information of - /// what has been chosen to map \p OpdMapper.getMI(). This ID is set - /// by the various getInstrXXXMapping methods. - /// - /// Therefore, getting the mapping and applying it should be kept in - /// sync. - void applyMapping(const OperandsMapper &OpdMapper) const { - // The only mapping we know how to handle is the default mapping. - if (OpdMapper.getInstrMapping().getID() == DefaultMappingID) - return applyDefaultMapping(OpdMapper); - // For other mappings, the target needs to do the right thing. - // If that means calling applyDefaultMapping, fine, but this - // must be explicitly stated. - applyMappingImpl(OpdMapper); - } - - /// Get the size in bits of \p Reg. - /// Utility method to get the size of any register. Unlike - /// MachineRegisterInfo::getSize, the register does not need to be a - /// virtual register. - /// - /// \pre \p Reg != 0 (NoRegister). - unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const; - - /// Check that the information held by this instance makes sense for the - /// given \p TRI. - /// - /// \note This method does not check anything when assertions are disabled. - /// - /// \return True if the check was successful.
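A minimal sketch (hypothetical target) of overriding the copyCost hook shown earlier, keeping same-bank copies free and charging cross-bank copies a flat, made-up cost:

unsigned MyRegisterBankInfo::copyCost(const RegisterBank &A,
                                      const RegisterBank &B,
                                      unsigned Size) const {
  if (A == B)
    return 0; // Same bank: assume the copy gets coalesced away.
  return 2;   // Hypothetical cross-bank copy cost for this target.
}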
- bool verify(const TargetRegisterInfo &TRI) const; -}; - -inline raw_ostream & -operator<<(raw_ostream &OS, - const RegisterBankInfo::PartialMapping &PartMapping) { - PartMapping.print(OS); - return OS; -} - -inline raw_ostream & -operator<<(raw_ostream &OS, const RegisterBankInfo::ValueMapping &ValMapping) { - ValMapping.print(OS); - return OS; -} - -inline raw_ostream & -operator<<(raw_ostream &OS, - const RegisterBankInfo::InstructionMapping &InstrMapping) { - InstrMapping.print(OS); - return OS; -} - -inline raw_ostream & -operator<<(raw_ostream &OS, const RegisterBankInfo::OperandsMapper &OpdMapper) { - OpdMapper.print(OS, /*ForDebug*/ false); - return OS; -} - -/// Hashing function for PartialMapping. -/// It is required for the hashing of ValueMapping. -hash_code hash_value(const RegisterBankInfo::PartialMapping &PartMapping); - -} // end namespace llvm - -#endif // LLVM_CODEGEN_GLOBALISEL_REGISTERBANKINFO_H diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index aed915d2cc4b..78f1b49da822 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -15,18 +15,20 @@ #define LLVM_CODEGEN_GLOBALISEL_UTILS_H #include "GISelWorkList.h" -#include "LostDebugLocObserver.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/LowLevelTypeImpl.h" #include namespace llvm { class AnalysisUsage; +class LostDebugLocObserver; +class MachineBasicBlock; class BlockFrequencyInfo; class GISelKnownBits; class MachineFunction; @@ -267,13 +269,10 @@ Optional ConstantFoldFPBinOp(unsigned Opcode, const Register Op1, const MachineRegisterInfo &MRI); /// Tries to constant fold a vector binop with sources \p Op1 and \p Op2. -/// If successful, returns the G_BUILD_VECTOR representing the folded vector -/// constant. \p MIB should have an insertion point already set to create new -/// G_CONSTANT instructions as needed. -Register ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, - const Register Op2, - const MachineRegisterInfo &MRI, - MachineIRBuilder &MIB); +/// Returns an empty vector on failure. +SmallVector ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, + const Register Op2, + const MachineRegisterInfo &MRI); Optional ConstantFoldExtOp(unsigned Opcode, const Register Op1, uint64_t Imm, const MachineRegisterInfo &MRI); @@ -374,9 +373,23 @@ public: /// If \p MI is not a splat, returns None. Optional getSplatIndex(MachineInstr &MI); -/// Returns a scalar constant of a G_BUILD_VECTOR splat if it exists. -Optional getBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI); +/// \returns the scalar integral splat value of \p Reg if possible. +Optional getIConstantSplatVal(const Register Reg, + const MachineRegisterInfo &MRI); + +/// \returns the scalar integral splat value defined by \p MI if possible. +Optional getIConstantSplatVal(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + +/// \returns the scalar sign extended integral splat value of \p Reg if +/// possible. +Optional getIConstantSplatSExtVal(const Register Reg, + const MachineRegisterInfo &MRI); + +/// \returns the scalar sign extended integral splat value defined by \p MI if +/// possible. 
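A minimal sketch (assumed usage) of the splat helper declared here: checking whether a G_SHL's shift amount is a constant-splat vector whose value exceeds the shifted width:

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

static bool isOversizedShift(const llvm::MachineInstr &Shl,
                             const llvm::MachineRegisterInfo &MRI) {
  llvm::Register Amt = Shl.getOperand(2).getReg();
  unsigned Width =
      MRI.getType(Shl.getOperand(0).getReg()).getScalarSizeInBits();
  // Returns the splat value of a constant G_BUILD_VECTOR, None otherwise.
  if (llvm::Optional<llvm::APInt> Cst = llvm::getIConstantSplatVal(Amt, MRI))
    return Cst->uge(Width);
  return false;
}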
+Optional getIConstantSplatSExtVal(const MachineInstr &MI, + const MachineRegisterInfo &MRI); /// Returns a floating point scalar constant of a build vector splat if it /// exists. When \p AllowUndef == true some elements can be undef but not all. @@ -408,6 +421,30 @@ bool isBuildVectorAllOnes(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef = false); +/// Return true if the specified instruction is known to be a constant, or a +/// vector of constants. +/// +/// If \p AllowFP is true, this will consider G_FCONSTANT in addition to +/// G_CONSTANT. If \p AllowOpaqueConstants is true, constant-like instructions +/// such as G_GLOBAL_VALUE will also be considered. +bool isConstantOrConstantVector(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + bool AllowFP = true, + bool AllowOpaqueConstants = true); + +/// Return true if the value is a constant 0 integer or a splatted vector of a +/// constant 0 integer (with no undefs if \p AllowUndefs is false). This will +/// handle G_BUILD_VECTOR and G_BUILD_VECTOR_TRUNC as truncation is not an issue +/// for null values. +bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, + bool AllowUndefs = false); + +/// Return true if the value is a constant -1 integer or a splatted vector of a +/// constant -1 integer (with no undefs if \p AllowUndefs is false). +bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + bool AllowUndefs = false); + /// \returns a value when \p MI is a vector splat. The splat can be either a /// Register or a constant. /// diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index b07c7cd3db3a..120f89952a95 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -281,12 +281,25 @@ enum NodeType { /// Carry-using nodes for multiple precision addition and subtraction. /// These nodes take three operands: The first two are the normal lhs and - /// rhs to the add or sub, and the third is a boolean indicating if there - /// is an incoming carry. These nodes produce two results: the normal - /// result of the add or sub, and the output carry so they can be chained - /// together. The use of this opcode is preferable to adde/sube if the - /// target supports it, as the carry is a regular value rather than a - /// glue, which allows further optimisation. + /// rhs to the add or sub, and the third is a boolean value that is 1 if and + /// only if there is an incoming carry/borrow. These nodes produce two + /// results: the normal result of the add or sub, and a boolean value that is + /// 1 if and only if there is an outgoing carry/borrow. + /// + /// Care must be taken if these opcodes are lowered to hardware instructions + /// that use the inverse logic -- 0 if and only if there is an + /// incoming/outgoing carry/borrow. In such cases, you must preserve the + /// semantics of these opcodes by inverting the incoming carry/borrow, feeding + /// it to the add/sub hardware instruction, and then inverting the outgoing + /// carry/borrow. + /// + /// The use of these opcodes is preferable to adde/sube if the target supports + /// it, as the carry is a regular value rather than a glue, which allows + /// further optimisation. + /// + /// These opcodes are different from [US]{ADD,SUB}O in that ADDCARRY/SUBCARRY + /// consume and produce a carry/borrow, whereas [US]{ADD,SUB}O produce an + /// overflow. 
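A small worked model of the carry chain described above (plain C++ sketch, not LLVM API): a 64-bit add split into two 32-bit pieces, where the low half produces the carry that the high half consumes.

#include <cstdint>

// Models a 32-bit ADDCARRY: Sum = A + B + CarryIn, and CarryOut is 1 if and
// only if the addition produces an outgoing carry.
static uint32_t addCarry32(uint32_t A, uint32_t B, uint32_t CarryIn,
                           uint32_t &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn & 1u);
  CarryOut = uint32_t(Wide >> 32); // outgoing carry bit
  return uint32_t(Wide);           // result truncated to the operand width
}

// 64-bit add from 32-bit halves: UADDO would produce the first carry,
// ADDCARRY consumes it for the high half.
static uint64_t add64(uint64_t X, uint64_t Y) {
  uint32_t CarryLo, CarryHi;
  uint32_t Lo = addCarry32(uint32_t(X), uint32_t(Y), /*CarryIn=*/0, CarryLo);
  uint32_t Hi =
      addCarry32(uint32_t(X >> 32), uint32_t(Y >> 32), CarryLo, CarryHi);
  (void)CarryHi; // would feed the next limb in a wider expansion
  return (uint64_t(Hi) << 32) | Lo;
}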
ADDCARRY, SUBCARRY, @@ -294,7 +307,7 @@ enum NodeType { /// subtraction. These nodes take three operands: The first two are normal lhs /// and rhs to the add or sub, and the third is a boolean indicating if there /// is an incoming carry. They produce two results: the normal result of the - /// add or sub, and a boolean that indicates if an overflow occured (*not* + /// add or sub, and a boolean that indicates if an overflow occurred (*not* /// flag, because it may be a store to memory, etc.). If the type of the /// boolean is not i1 then the high bits conform to getBooleanContents. SADDO_CARRY, @@ -462,6 +475,9 @@ enum NodeType { STRICT_FSETCC, STRICT_FSETCCS, + // FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic. + FPTRUNC_ROUND, + /// FMA - Perform a * b + c with no intermediate rounding step. FMA, @@ -482,6 +498,13 @@ enum NodeType { /// Returns platform specific canonical encoding of a floating point number. FCANONICALIZE, + /// Performs a check of floating point class property, defined by IEEE-754. + /// The first operand is the floating point value to check. The second operand + /// specifies the checked property and is a TargetConstant which specifies the + /// test in the same way as the intrinsic 'is_fpclass'. + /// Returns a boolean value. + IS_FPCLASS, + /// BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector /// with the specified, possibly variable, elements. The types of the /// operands must match the vector element type, except that integer types @@ -614,6 +637,17 @@ enum NodeType { MULHU, MULHS, + /// AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of + /// type i[N+1], halving the result by shifting it one bit right. + /// shr(add(ext(X), ext(Y)), 1) + AVGFLOORS, + AVGFLOORU, + /// AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an + /// integer of type i[N+2], add 1 and halve the result by shifting it one bit + /// right. shr(add(ext(X), ext(Y), 1), 1) + AVGCEILS, + AVGCEILU, + // ABDS/ABDU - Absolute difference - Return the absolute difference between // two numbers interpreted as signed/unsigned. // i.e. trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1) @@ -864,6 +898,13 @@ enum NodeType { STRICT_FP16_TO_FP, STRICT_FP_TO_FP16, + /// BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions + /// and truncation for bfloat16. These nodes form a semi-softened interface + /// for dealing with bf16 (as an i16), which is often a storage-only type but + /// has native conversions. + BF16_TO_FP, + FP_TO_BF16, + /// Perform various unary floating-point operations inspired by libm. For /// FPOWI, the result is undefined if the integer operand doesn't fit into /// sizeof(int). @@ -1324,18 +1365,18 @@ static const int LAST_INDEXED_MODE = POST_DEC + 1; /// MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's /// index parameter when calculating addresses. /// -/// SIGNED_SCALED Addr = Base + ((signed)Index * sizeof(element)) -/// SIGNED_UNSCALED Addr = Base + (signed)Index -/// UNSIGNED_SCALED Addr = Base + ((unsigned)Index * sizeof(element)) -/// UNSIGNED_UNSCALED Addr = Base + (unsigned)Index -enum MemIndexType { - SIGNED_SCALED = 0, - SIGNED_UNSCALED, - UNSIGNED_SCALED, - UNSIGNED_UNSCALED -}; +/// SIGNED_SCALED Addr = Base + ((signed)Index * Scale) +/// UNSIGNED_SCALED Addr = Base + ((unsigned)Index * Scale) +/// +/// NOTE: The value of Scale is typically only known to the node owning the +/// IndexType, with a value of 1 the equivalent of being unscaled.
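A small model of the two addressing modes above (plain C++ sketch, not LLVM API), computing one gather/scatter lane address from a 32-bit index:

#include <cstdint>

// SIGNED_SCALED:   Addr = Base + (signed)Index * Scale
// UNSIGNED_SCALED: Addr = Base + (unsigned)Index * Scale
// With Scale == 1 both degenerate to the old unscaled forms.
static uint64_t laneAddress(uint64_t Base, uint32_t IndexBits, uint64_t Scale,
                            bool IsSigned) {
  int64_t SIdx = int32_t(IndexBits);   // sign-extend the index
  uint64_t UIdx = uint32_t(IndexBits); // zero-extend the index
  return IsSigned ? Base + uint64_t(SIdx) * Scale : Base + UIdx * Scale;
}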
+enum MemIndexType { SIGNED_SCALED = 0, UNSIGNED_SCALED }; -static const int LAST_MEM_INDEX_TYPE = UNSIGNED_UNSCALED + 1; +static const int LAST_MEM_INDEX_TYPE = UNSIGNED_SCALED + 1; + +inline bool isIndexTypeSigned(MemIndexType IndexType) { + return IndexType == SIGNED_SCALED; +} //===--------------------------------------------------------------------===// /// LoadExtType enum - This enum defines the three variants of LOADEXT diff --git a/llvm/include/llvm/CodeGen/IntrinsicLowering.h b/llvm/include/llvm/CodeGen/IntrinsicLowering.h index 06512f2dc560..0b327a34ca09 100644 --- a/llvm/include/llvm/CodeGen/IntrinsicLowering.h +++ b/llvm/include/llvm/CodeGen/IntrinsicLowering.h @@ -15,8 +15,6 @@ #ifndef LLVM_CODEGEN_INTRINSICLOWERING_H #define LLVM_CODEGEN_INTRINSICLOWERING_H -#include "llvm/IR/Intrinsics.h" - namespace llvm { class CallInst; class DataLayout; diff --git a/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h index c692dbc2199e..e5794966ce63 100644 --- a/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h +++ b/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h @@ -17,8 +17,8 @@ #define LLVM_CODEGEN_LAZYMACHINEBLOCKFREQUENCYINFO_H #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" namespace llvm { diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h index 51ffe2807434..92e35c9a4ab9 100644 --- a/llvm/include/llvm/CodeGen/LiveInterval.h +++ b/llvm/include/llvm/CodeGen/LiveInterval.h @@ -227,6 +227,14 @@ namespace llvm { const_vni_iterator vni_begin() const { return valnos.begin(); } const_vni_iterator vni_end() const { return valnos.end(); } + iterator_range vnis() { + return make_range(vni_begin(), vni_end()); + } + + iterator_range vnis() const { + return make_range(vni_begin(), vni_end()); + } + /// Constructs a new LiveRange object. LiveRange(bool UseSegmentSet = false) : segmentSet(UseSegmentSet ? std::make_unique() @@ -625,10 +633,8 @@ namespace llvm { // if the Seg is lower find first segment that is above Idx using binary // search if (Seg->end <= *Idx) { - Seg = std::upper_bound( - ++Seg, EndSeg, *Idx, - [=](std::remove_reference_t V, - const std::remove_reference_t &S) { + Seg = + std::upper_bound(++Seg, EndSeg, *Idx, [=](auto V, const auto &S) { return V < S.end; }); if (Seg == EndSeg) diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h index 3b6a4a379d72..81003455da42 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h @@ -43,7 +43,7 @@ class LiveIntervalUnion { // A set of live virtual register segments that supports fast insertion, // intersection, and removal. // Mapping SlotIndex intervals to virtual register numbers. - using LiveSegments = IntervalMap; + using LiveSegments = IntervalMap; public: // SegmentIter can advance to the next segment ordered by starting position @@ -88,10 +88,10 @@ public: bool changedSince(unsigned tag) const { return tag != Tag; } // Add a live virtual register to this union and merge its segments. - void unify(LiveInterval &VirtReg, const LiveRange &Range); + void unify(const LiveInterval &VirtReg, const LiveRange &Range); // Remove a live virtual register's segments from this union. 
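A minimal sketch (assumed usage) of the vnis() ranges added above, which replace the explicit vni_begin()/vni_end() iterator pairs:

#include "llvm/CodeGen/LiveInterval.h"

// Count the PHI-defined value numbers of a live range.
static unsigned countPHIDefs(const llvm::LiveRange &LR) {
  unsigned N = 0;
  for (const llvm::VNInfo *VNI : LR.vnis())
    if (!VNI->isUnused() && VNI->isPHIDef())
      ++N;
  return N;
}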
- void extract(LiveInterval &VirtReg, const LiveRange &Range); + void extract(const LiveInterval &VirtReg, const LiveRange &Range); // Remove all inserted virtual registers. void clear() { Segments.clear(); ++Tag; } @@ -105,7 +105,7 @@ public: #endif // Get any virtual register that is assigned to this physical unit - LiveInterval *getOneVReg() const; + const LiveInterval *getOneVReg() const; /// Query interferences between a single live virtual register and a live /// interval union. @@ -114,7 +114,7 @@ public: const LiveRange *LR = nullptr; LiveRange::const_iterator LRI; ///< current position in LR ConstSegmentIter LiveUnionI; ///< current position in LiveUnion - SmallVector InterferingVRegs; + SmallVector InterferingVRegs; bool CheckedFirstInterference = false; bool SeenAllInterferences = false; unsigned Tag = 0; @@ -125,7 +125,7 @@ public: unsigned collectInterferingVRegs(unsigned MaxInterferingRegs); // Was this virtual register visited during collectInterferingVRegs? - bool isSeenInterference(LiveInterval *VirtReg) const; + bool isSeenInterference(const LiveInterval *VirtReg) const; public: Query() = default; @@ -159,7 +159,7 @@ public: bool checkInterference() { return collectInterferingVRegs(1); } // Vector generated by collectInterferingVRegs. - const SmallVectorImpl &interferingVRegs( + const SmallVectorImpl &interferingVRegs( unsigned MaxInterferingRegs = std::numeric_limits::max()) { if (!SeenAllInterferences || MaxInterferingRegs < InterferingVRegs.size()) collectInterferingVRegs(MaxInterferingRegs); diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h index fa08166791b0..b832eaa37305 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -374,7 +374,7 @@ class VirtRegMap; /// /// Returns false if \p LI doesn't cross any register mask instructions. In /// that case, the bit vector is not filled in. - bool checkRegMaskInterference(LiveInterval &LI, + bool checkRegMaskInterference(const LiveInterval &LI, BitVector &UsableRegs); // Register unit functions.
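A minimal sketch (assumed usage) of the now const-qualified checkRegMaskInterference: querying which physical registers remain usable across every regmask the interval crosses:

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/MC/MCRegister.h"

static bool isUsableAcrossRegMasks(llvm::LiveIntervals &LIS,
                                   const llvm::LiveInterval &LI,
                                   llvm::MCRegister PhysReg) {
  llvm::BitVector UsableRegs;
  if (!LIS.checkRegMaskInterference(LI, UsableRegs))
    return true; // LI crosses no register mask instruction.
  return UsableRegs.test(PhysReg);
}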
diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 99ba1a28c934..27285d63aa83 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -32,6 +32,7 @@ #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" #include #include @@ -39,6 +40,7 @@ namespace llvm { class MachineInstr; +class MachineFunction; class MachineOperand; class MachineRegisterInfo; class raw_ostream; diff --git a/llvm/include/llvm/CodeGen/LiveRangeCalc.h b/llvm/include/llvm/CodeGen/LiveRangeCalc.h index 31efd6e37e01..895ecff18f89 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeCalc.h +++ b/llvm/include/llvm/CodeGen/LiveRangeCalc.h @@ -31,7 +31,6 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/MC/LaneBitmask.h" #include namespace llvm { diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h index d80522f5bdac..c6efa7b30d71 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -66,7 +66,7 @@ public: }; private: - LiveInterval *Parent; + const LiveInterval *const Parent; SmallVectorImpl &NewRegs; MachineRegisterInfo &MRI; LiveIntervals &LIS; @@ -129,7 +129,7 @@ public: /// be done. This could be the case if called before Regalloc. /// @param deadRemats The collection of all the instructions defining an /// original reg and are dead after remat. - LiveRangeEdit(LiveInterval *parent, SmallVectorImpl &newRegs, + LiveRangeEdit(const LiveInterval *parent, SmallVectorImpl &newRegs, MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm, Delegate *delegate = nullptr, SmallPtrSet *deadRemats = nullptr) @@ -141,7 +141,7 @@ public: ~LiveRangeEdit() override { MRI.resetDelegate(this); } - LiveInterval &getParent() const { + const LiveInterval &getParent() const { assert(Parent && "No parent LiveInterval"); return *Parent; } @@ -193,11 +193,11 @@ public: /// Remat - Information needed to rematerialize at a specific location. struct Remat { - VNInfo *ParentVNI; // parent_'s value at the remat location. + const VNInfo *const ParentVNI; // parent_'s value at the remat location. MachineInstr *OrigMI = nullptr; // Instruction defining OrigVNI. It contains // the real expr for remat. - explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} + explicit Remat(const VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} }; /// allUsesAvailableAt - Return true if all registers used by OrigMI at diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h index fc67bce329ab..9e28e4d243c2 100644 --- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -104,7 +104,8 @@ public: /// If this function returns IK_Free, it is legal to assign(VirtReg, PhysReg). /// When there is more than one kind of interference, the InterferenceKind /// with the highest enum value is returned. - InterferenceKind checkInterference(LiveInterval &VirtReg, MCRegister PhysReg); + InterferenceKind checkInterference(const LiveInterval &VirtReg, + MCRegister PhysReg); /// Check for interference in the segment [Start, End) that may prevent /// assignment to PhysReg. If this function returns true, there is @@ -116,12 +117,12 @@ public: /// Assign VirtReg to PhysReg. 
   /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
   /// update VirtRegMap. The live range is expected to be available in PhysReg.
-  void assign(LiveInterval &VirtReg, MCRegister PhysReg);
+  void assign(const LiveInterval &VirtReg, MCRegister PhysReg);

   /// Unassign VirtReg from its PhysReg.
   /// Assuming that VirtReg was previously assigned to a PhysReg, this undoes
   /// the assignment and updates VirtRegMap accordingly.
-  void unassign(LiveInterval &VirtReg);
+  void unassign(const LiveInterval &VirtReg);

   /// Returns true if the given \p PhysReg has any live intervals assigned.
   bool isPhysRegUsed(MCRegister PhysReg) const;
@@ -136,13 +137,14 @@ public:
   /// Check for regmask interference only.
   /// Return true if VirtReg crosses a regmask operand that clobbers PhysReg.
   /// If PhysReg is null, check if VirtReg crosses any regmask operands.
-  bool checkRegMaskInterference(LiveInterval &VirtReg,
+  bool checkRegMaskInterference(const LiveInterval &VirtReg,
                                 MCRegister PhysReg = MCRegister::NoRegister);

   /// Check for regunit interference only.
   /// Return true if VirtReg overlaps a fixed assignment of one of PhysRegs's
   /// register units.
-  bool checkRegUnitInterference(LiveInterval &VirtReg, MCRegister PhysReg);
+  bool checkRegUnitInterference(const LiveInterval &VirtReg,
+                                MCRegister PhysReg);

   /// Query a line of the assigned virtual register matrix directly.
   /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
diff --git a/llvm/include/llvm/CodeGen/LiveStacks.h b/llvm/include/llvm/CodeGen/LiveStacks.h
index 1cbdb8bd86bd..26f30fb4d088 100644
--- a/llvm/include/llvm/CodeGen/LiveStacks.h
+++ b/llvm/include/llvm/CodeGen/LiveStacks.h
@@ -18,13 +18,17 @@
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
 #include <cassert>
 #include <map>
 #include <unordered_map>

 namespace llvm {

+class AnalysisUsage;
+class MachineFunction;
+class Module;
+class raw_ostream;
 class TargetRegisterClass;
 class TargetRegisterInfo;
diff --git a/llvm/include/llvm/CodeGen/LiveVariables.h b/llvm/include/llvm/CodeGen/LiveVariables.h
index dee316677b25..aa198527415d 100644
--- a/llvm/include/llvm/CodeGen/LiveVariables.h
+++ b/llvm/include/llvm/CodeGen/LiveVariables.h
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"

 namespace llvm {
diff --git a/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h b/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
index deb6b37a9bcf..3bbcfd63e3aa 100644
--- a/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
+++ b/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
@@ -17,29 +17,16 @@
 #ifndef LLVM_CODEGEN_MIRFSDISCRIMINATOR_H
 #define LLVM_CODEGEN_MIRFSDISCRIMINATOR_H

-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
"llvm/InitializePasses.h" -#include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/Support/Discriminator.h" #include +#include namespace llvm { +class MachineFunction; using namespace sampleprof; class MIRAddFSDiscriminators : public MachineFunctionPass { diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h index a7c69e2d43ef..aa9891a80a32 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h @@ -17,13 +17,20 @@ #ifndef LLVM_CODEGEN_MIRPARSER_MIRPARSER_H #define LLVM_CODEGEN_MIRPARSER_MIRPARSER_H -#include "llvm/IR/Module.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLForwardCompat.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include #include namespace llvm { class Function; +class LLVMContext; +class MemoryBuffer; +class Module; class MIRParserImpl; class MachineModuleInfo; class SMDiagnostic; diff --git a/llvm/include/llvm/CodeGen/MIRSampleProfile.h b/llvm/include/llvm/CodeGen/MIRSampleProfile.h index 2503524ccfdf..f54c4b5891be 100644 --- a/llvm/include/llvm/CodeGen/MIRSampleProfile.h +++ b/llvm/include/llvm/CodeGen/MIRSampleProfile.h @@ -14,29 +14,17 @@ #ifndef LLVM_CODEGEN_MIRSAMPLEPROFILE_H #define LLVM_CODEGEN_MIRSAMPLEPROFILE_H -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" -#include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" - -#include +#include "llvm/Support/Discriminator.h" +#include +#include namespace llvm { +class AnalysisUsage; +class MachineBlockFrequencyInfo; +class MachineFunction; +class Module; using namespace sampleprof; diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 02eb5d24271d..25247437b641 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -605,7 +605,7 @@ struct MachineFrameInfo { bool AdjustsStack = false; bool HasCalls = false; StringValue StackProtector; - // TODO: Serialize FunctionContextIdx + StringValue FunctionContext; unsigned MaxCallFrameSize = ~0u; ///< ~0u means: not computed yet. 
   unsigned CVBytesOfCalleeSavedRegisters = 0;
   bool HasOpaqueSPAdjustment = false;
@@ -626,6 +626,7 @@ struct MachineFrameInfo {
            MaxAlignment == Other.MaxAlignment &&
            AdjustsStack == Other.AdjustsStack && HasCalls == Other.HasCalls &&
            StackProtector == Other.StackProtector &&
+           FunctionContext == Other.FunctionContext &&
            MaxCallFrameSize == Other.MaxCallFrameSize &&
            CVBytesOfCalleeSavedRegisters ==
                Other.CVBytesOfCalleeSavedRegisters &&
@@ -651,6 +652,8 @@ template <> struct MappingTraits<MachineFrameInfo> {
     YamlIO.mapOptional("hasCalls", MFI.HasCalls, false);
     YamlIO.mapOptional("stackProtector", MFI.StackProtector,
                        StringValue()); // Don't print it out when it's empty.
+    YamlIO.mapOptional("functionContext", MFI.FunctionContext,
+                       StringValue()); // Don't print it out when it's empty.
     YamlIO.mapOptional("maxCallFrameSize", MFI.MaxCallFrameSize, (unsigned)~0);
     YamlIO.mapOptional("cvBytesOfCalleeSavedRegisters",
                        MFI.CVBytesOfCalleeSavedRegisters, 0U);
@@ -694,6 +697,13 @@ struct MachineFunction {
   // Register information
   bool TracksRegLiveness = false;
   bool HasWinCFI = false;
+
+  bool CallsEHReturn = false;
+  bool CallsUnwindInit = false;
+  bool HasEHCatchret = false;
+  bool HasEHScopes = false;
+  bool HasEHFunclets = false;
+
   bool FailsVerification = false;
   bool TracksDebugUserValues = false;
   std::vector<VirtualRegisterDefinition> VirtualRegisters;
@@ -724,6 +734,13 @@ template <> struct MappingTraits<MachineFunction> {
     YamlIO.mapOptional("failedISel", MF.FailedISel, false);
     YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false);
     YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false);
+
+    YamlIO.mapOptional("callsEHReturn", MF.CallsEHReturn, false);
+    YamlIO.mapOptional("callsUnwindInit", MF.CallsUnwindInit, false);
+    YamlIO.mapOptional("hasEHCatchret", MF.HasEHCatchret, false);
+    YamlIO.mapOptional("hasEHScopes", MF.HasEHScopes, false);
+    YamlIO.mapOptional("hasEHFunclets", MF.HasEHFunclets, false);
+
     YamlIO.mapOptional("failsVerification", MF.FailsVerification, false);
     YamlIO.mapOptional("tracksDebugUserValues", MF.TracksDebugUserValues,
                        false);
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 638b6732a543..ddfbd4018590 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -14,9 +14,9 @@
 #define LLVM_CODEGEN_MACHINEBASICBLOCK_H

 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundleIterator.h"
 #include "llvm/IR/DebugLoc.h"
@@ -24,7 +24,6 @@
 #include "llvm/Support/BranchProbability.h"
 #include <cassert>
 #include <cstdint>
-#include <functional>
 #include <iterator>
 #include <string>
 #include <vector>
@@ -110,10 +109,10 @@ public:
 private:
   using Instructions = ilist<MachineInstr, ilist_sentinel_tracking<true>>;

-  Instructions Insts;
   const BasicBlock *BB;
   int Number;
   MachineFunction *xParent;
+  Instructions Insts;

   /// Keep track of the predecessor / successor basic blocks.
   std::vector<MachineBasicBlock *> Predecessors;
@@ -205,6 +204,12 @@ public:
   /// to an LLVM basic block.
   const BasicBlock *getBasicBlock() const { return BB; }

+  /// Remove the reference to the underlying IR BasicBlock. This is for
+  /// reduction tools and should generally not be used.
+  void clearBasicBlock() {
+    BB = nullptr;
+  }
+
   /// Return the name of the corresponding LLVM basic block, or an empty string.
   StringRef getName() const;
@@ -241,6 +246,7 @@ public:
       MachineInstrBundleIterator<const MachineInstr, true>;

   unsigned size() const { return (unsigned)Insts.size(); }
+  bool sizeWithoutDebugLargerThan(unsigned Limit) const;
   bool empty() const { return Insts.empty(); }

   MachineInstr &instr_front() { return Insts.front(); }
@@ -400,7 +406,7 @@ public:
   // Iteration support for live in sets.  These sets are kept in sorted
   // order by their register number.
   using livein_iterator = LiveInVector::const_iterator;
-#ifndef NDEBUG
+
   /// Unlike livein_begin, this method does not check that the liveness
   /// information is accurate. Still for debug purposes it may be useful
   /// to have iterators that won't assert if the liveness information
@@ -409,7 +415,7 @@ public:
   iterator_range<livein_iterator> liveins_dbg() const {
     return make_range(livein_begin_dbg(), livein_end());
   }
-#endif
+
   livein_iterator livein_begin() const;
   livein_iterator livein_end() const { return LiveIns.end(); }
   bool livein_empty() const { return LiveIns.empty(); }
@@ -731,6 +737,15 @@ public:
   /// other block.
   bool isLayoutSuccessor(const MachineBasicBlock *MBB) const;

+  /// Return the successor of this block if it has a single successor.
+  /// Otherwise return a null pointer.
+  ///
+  const MachineBasicBlock *getSingleSuccessor() const;
+  MachineBasicBlock *getSingleSuccessor() {
+    return const_cast<MachineBasicBlock *>(
+        static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
+  }
+
   /// Return the fallthrough block if the block can implicitly
   /// transfer control to the block after it by falling off the end of
   /// it.  This should return null if it can reach the block after
@@ -1087,6 +1102,11 @@ public:
     IrrLoopHeaderWeight = Weight;
   }

+  /// Return probability of the edge from this block to MBB. This method should
+  /// NOT be called directly, but by using getEdgeProbability method from
+  /// MachineBranchProbabilityInfo class.
+  BranchProbability getSuccProbability(const_succ_iterator Succ) const;
+
 private:
   /// Return probability iterator corresponding to the I successor iterator.
   probability_iterator getProbabilityIterator(succ_iterator I);
@@ -1096,11 +1116,6 @@ private:
   friend class MachineBranchProbabilityInfo;
   friend class MIPrinter;

-  /// Return probability of the edge from this block to MBB. This method should
-  /// NOT be called directly, but by using getEdgeProbability method from
-  /// MachineBranchProbabilityInfo class.
-  BranchProbability getSuccProbability(const_succ_iterator Succ) const;
-
   // Methods used to maintain doubly linked list of blocks...
   friend struct ilist_callback_traits<MachineBasicBlock>;

diff --git a/llvm/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/llvm/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index 7e7e0a9c477a..bd544421bc0f 100644
--- a/llvm/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -16,8 +16,6 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/BranchProbability.h"
-#include <climits>
-#include <numeric>

 namespace llvm {
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index 67544779f34c..68c95679d466 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -34,6 +34,10 @@ enum class MachineCombinerPattern {
   REASSOC_XY_BCA,
   REASSOC_XY_BAC,

+  // These are patterns used to reduce the length of dependence chain.
+  SUBADD_OP1,
+  SUBADD_OP2,
+
   // These are multiply-add patterns matched by the AArch64 machine combiner.
   MULADDW_OP1,
   MULADDW_OP2,
diff --git a/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h b/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
index d3816bbc0780..3f89f2076d50 100644
--- a/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineCycleAnalysis.h
@@ -15,8 +15,9 @@
 #define LLVM_CODEGEN_MACHINECYCLEANALYSIS_H

 #include "llvm/ADT/GenericCycleInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineSSAContext.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"

 namespace llvm {

@@ -26,6 +27,29 @@ extern template class GenericCycle<MachineSSAContext>;
 using MachineCycleInfo = GenericCycleInfo<MachineSSAContext>;
 using MachineCycle = MachineCycleInfo::CycleT;

+/// Legacy analysis pass which computes a \ref MachineCycleInfo.
+class MachineCycleInfoWrapperPass : public MachineFunctionPass {
+  MachineFunction *F = nullptr;
+  MachineCycleInfo CI;
+
+public:
+  static char ID;
+
+  MachineCycleInfoWrapperPass();
+
+  MachineCycleInfo &getCycleInfo() { return CI; }
+  const MachineCycleInfo &getCycleInfo() const { return CI; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void releaseMemory() override;
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+};
+
+// TODO: add this function to GenericCycle template after implementing IR
+//       version.
+bool isCycleInvariant(const MachineCycle *Cycle, MachineInstr &I);
+
 } // end namespace llvm

 #endif // LLVM_CODEGEN_MACHINECYCLEANALYSIS_H
diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h
index f749e9ff7e0a..30c18ef410fa 100644
--- a/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -19,12 +19,17 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
 #include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/GenericDomTreeConstruction.h"
 #include <cassert>
 #include <memory>

 namespace llvm {
+class AnalysisUsage;
+class MachineFunction;
+class Module;
+class raw_ostream;

 template <>
 inline void DominatorTreeBase<MachineBasicBlock, false>::addRoot(
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 864ca73180af..7ea731b46655 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -16,7 +16,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/Support/Alignment.h"
-#include "llvm/Support/DataTypes.h"
 #include <cassert>
 #include <vector>

@@ -335,10 +334,13 @@ private:
   /// Not null, if shrink-wrapping found a better place for the epilogue.
   MachineBasicBlock *Restore = nullptr;

+  /// Size of the UnsafeStack Frame
+  uint64_t UnsafeStackSize = 0;
+
 public:
-  explicit MachineFrameInfo(unsigned StackAlignment, bool StackRealignable,
+  explicit MachineFrameInfo(Align StackAlignment, bool StackRealignable,
                             bool ForcedRealign)
-      : StackAlignment(assumeAligned(StackAlignment)),
+      : StackAlignment(StackAlignment),
         StackRealignable(StackRealignable), ForcedRealign(ForcedRealign) {}

   MachineFrameInfo(const MachineFrameInfo &) = delete;
@@ -360,6 +362,7 @@ public:
   /// This object is used for SjLj exceptions.
   int getFunctionContextIndex() const { return FunctionContextIdx; }
   void setFunctionContextIndex(int I) { FunctionContextIdx = I; }
+  bool hasFunctionContextIndex() const { return FunctionContextIdx != -1; }

   /// This method may be called any time after instruction
   /// selection is complete to determine if there is a call to
@@ -385,6 +388,20 @@ public:
   bool hasPatchPoint() const { return HasPatchPoint; }
   void setHasPatchPoint(bool s = true) { HasPatchPoint = s; }

+  /// Return true if this function requires a split stack prolog, even if it
+  /// uses no stack space. This is only meaningful for functions where
+  /// MachineFunction::shouldSplitStack() returns true.
+  //
+  // For non-leaf functions we have to allow for the possibility that the call
+  // is to a non-split function, as in PR37807. This function could also take
+  // the address of a non-split function. When the linker tries to adjust its
+  // non-existent prologue, it would fail with an error. Mark the object file so
+  // that such failures are not errors. See this Go language bug-report
+  // https://go-review.googlesource.com/c/go/+/148819/
+  bool needsSplitStackProlog() const {
+    return getStackSize() != 0 || hasTailCall();
+  }
+
   /// Return the minimum frame object index.
   int getObjectIndexBegin() const { return -NumFixedObjects; }

@@ -488,6 +505,14 @@ public:
     return Objects[ObjectIdx+NumFixedObjects].Alloca;
   }

+  /// Remove the underlying Alloca of the specified stack object if it
+  /// exists. This generally should not be used and is for reduction tooling.
+  void clearObjectAllocation(int ObjectIdx) {
+    assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx + NumFixedObjects].Alloca = nullptr;
+  }
+
   /// Return the assigned stack offset of the specified object
   /// from the incoming stack pointer.
   int64_t getObjectOffset(int ObjectIdx) const {
@@ -773,6 +798,9 @@ public:
   MachineBasicBlock *getRestorePoint() const { return Restore; }
   void setRestorePoint(MachineBasicBlock *NewRestore) { Restore = NewRestore; }

+  uint64_t getUnsafeStackSize() const { return UnsafeStackSize; }
+  void setUnsafeStackSize(uint64_t Size) { UnsafeStackSize = Size; }
+
   /// Return a set of physical registers that are pristine.
   ///
   /// Pristine registers hold a value that is useless to the current function,
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index c4767a51b094..fc1188186ac4 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -103,6 +103,22 @@ struct MachineFunctionInfo {
   static Ty *create(BumpPtrAllocator &Allocator, MachineFunction &MF) {
     return new (Allocator.Allocate<Ty>()) Ty(MF);
   }
+
+  template <typename Ty>
+  static Ty *create(BumpPtrAllocator &Allocator, const Ty &MFI) {
+    return new (Allocator.Allocate<Ty>()) Ty(MFI);
+  }
+
+  /// Make a functionally equivalent copy of this MachineFunctionInfo in \p MF.
+  /// This requires remapping MachineBasicBlock references from the original
+  /// parent to values in the new function. Targets may assume that virtual
+  /// register and frame index values are preserved in the new function.
+  virtual MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const {
+    return nullptr;
+  }
 };

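A hedged sketch of what a target-side override of the new clone() hook might look like. MyTargetFunctionInfo and its field are hypothetical; the body leans on the cloneInfo() helper that MachineFunction gains in the hunks just below, matching the contract stated in the comment above (frame indices and virtual registers are preserved, so only MachineBasicBlock references would need remapping through Src2DstMBB).

    #include "llvm/CodeGen/MachineFunction.h"
    using namespace llvm;

    // Hypothetical target function info carrying one piece of per-function
    // state. A memberwise copy suffices here because frame indices survive
    // cloning; a target holding MachineBasicBlock pointers would remap them
    // through Src2DstMBB instead.
    struct MyTargetFunctionInfo : public MachineFunctionInfo {
      int VarArgsFrameIndex = 0;

      MachineFunctionInfo *
      clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
            const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
          const override {
        return DestMF.cloneInfo<MyTargetFunctionInfo>(*this);
      }
    };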
 /// Properties which a MachineFunction may have at a given point in time.
@@ -277,12 +293,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   // numbered and this vector keeps track of the mapping from ID's to MBB's.
   std::vector<MachineBasicBlock*> MBBNumbering;

-  // Unary encoding of basic block symbols is used to reduce size of ".strtab".
-  // Basic block number 'i' gets a prefix of length 'i'. The ith character also
-  // denotes the type of basic block number 'i'. Return blocks are marked with
-  // 'r', landing pads with 'l' and regular blocks with 'a'.
-  std::vector<char> BBSectionsSymbolPrefix;
-
   // Pool-allocate MachineFunction-lifetime and IR objects.
   BumpPtrAllocator Allocator;

@@ -537,8 +547,13 @@ public:
   /// the copied value; or for parameters, creates a DBG_PHI on entry.
   /// May insert instructions into the entry block!
   /// \p MI The copy-like instruction to salvage.
+  /// \p DbgPHICache A container to cache already-solved COPYs.
   /// \returns An instruction/operand pair identifying the defining value.
-  DebugInstrOperandPair salvageCopySSA(MachineInstr &MI);
+  DebugInstrOperandPair
+  salvageCopySSA(MachineInstr &MI,
+                 DenseMap<Register, DebugInstrOperandPair> &DbgPHICache);
+
+  DebugInstrOperandPair salvageCopySSAImpl(MachineInstr &MI);

   /// Finalise any partially emitted debug instructions. These are DBG_INSTR_REF
   /// instructions where we only knew the vreg of the value they use, not the
@@ -747,6 +762,21 @@ public:
     return const_cast<MachineFunction *>(this)->getInfo<Ty>();
   }

+  template <typename Ty> Ty *cloneInfo(const Ty &Old) {
+    assert(!MFInfo);
+    MFInfo = Ty::template create<Ty>(Allocator, Old);
+    return static_cast<Ty *>(MFInfo);
+  }
+
+  MachineFunctionInfo *cloneInfoFrom(
+      const MachineFunction &OrigMF,
+      const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) {
+    assert(!MFInfo && "new function already has MachineFunctionInfo");
+    if (!OrigMF.MFInfo)
+      return nullptr;
+    return OrigMF.MFInfo->clone(Allocator, *this, Src2DstMBB);
+  }
+
   /// Returns the denormal handling type for the default rounding mode of the
   /// function.
   DenormalMode getDenormalMode(const fltSemantics &FPType) const;
@@ -1101,12 +1131,6 @@ public:
   /// Add a cleanup action for a landing pad.
   void addCleanup(MachineBasicBlock *LandingPad);

-  void addSEHCatchHandler(MachineBasicBlock *LandingPad, const Function *Filter,
-                          const BlockAddress *RecoverBA);
-
-  void addSEHCleanupHandler(MachineBasicBlock *LandingPad,
-                            const Function *Cleanup);
-
   /// Return the type id for the specified typeinfo.  This is function wide.
   unsigned getTypeIDFor(const GlobalValue *TI);

@@ -1116,6 +1140,11 @@ public:
   /// Map the landing pad's EH symbol to the call site indexes.
   void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);

+  /// Return if there is any wasm exception handling.
+  bool hasAnyWasmLandingPadIndex() const {
+    return !WasmLPadToIndexMap.empty();
+  }
+
   /// Map the landing pad to its index. Used for Wasm exception handling.
   void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) {
     WasmLPadToIndexMap[LPad] = Index;
@@ -1132,6 +1161,10 @@ public:
     return WasmLPadToIndexMap.lookup(LPad);
   }

+  bool hasAnyCallSiteLandingPad() const {
+    return !LPadToCallSiteMap.empty();
+  }
+
   /// Get the call site indexes for a landing pad EH symbol.
   SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
     assert(hasCallSiteLandingPad(Sym) &&
@@ -1144,6 +1177,10 @@ public:
     return !LPadToCallSiteMap[Sym].empty();
   }

+  bool hasAnyCallSiteLabel() const {
+    return !CallSiteMap.empty();
+  }
+
   /// Map the begin label for a call site.
   void setCallSiteBeginLabel(MCSymbol *BeginLabel, unsigned Site) {
     CallSiteMap[BeginLabel] = Site;
@@ -1220,10 +1257,6 @@ public:

   void copyCallSiteInfo(const MachineInstr *Old, const MachineInstr *New);

-  const std::vector<char> &getBBSectionsSymbolPrefix() const {
-    return BBSectionsSymbolPrefix;
-  }
-
   /// Move the call site info from \p Old to \p New call site info. This
   /// function is used when we are replacing one call instruction with another
   /// one to the same callee.
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 2893e138a95c..acc4c9a24c01 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -26,7 +26,6 @@
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/PseudoProbe.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ArrayRecycler.h"
@@ -38,6 +37,9 @@

 namespace llvm {

+class DILabel;
+class Instruction;
+class MDNode;
 class AAResults;
 template <typename T> class ArrayRef;
 class DIExpression;
@@ -96,7 +98,7 @@ public:
     FmContract = 1 << 8,  // Instruction supports Fast math
                           // contraction operations like fma.
     FmAfn = 1 << 9,       // Instruction may map to Fast math
-                          // instrinsic approximation.
+                          // intrinsic approximation.
     FmReassoc = 1 << 10,  // Instruction supports Fast math
                           // reassociation of operand order.
     NoUWrap = 1 << 11,    // Instruction supports binary operator
@@ -586,8 +588,7 @@ public:

   /// Return true if operand \p OpIdx is a subregister index.
   bool isOperandSubregIdx(unsigned OpIdx) const {
-    assert(getOperand(OpIdx).getType() == MachineOperand::MO_Immediate &&
-           "Expected MO_Immediate operand type.");
+    assert(getOperand(OpIdx).isImm() && "Expected MO_Immediate operand type.");
     if (isExtractSubreg() && OpIdx == 2)
       return true;
     if (isInsertSubreg() && OpIdx == 3)
@@ -810,6 +811,12 @@ public:
     return hasProperty(MCID::Pseudo, Type);
   }

+  /// Return true if this instruction doesn't produce any output in the form of
+  /// executable instructions.
+  bool isMetaInstruction(QueryType Type = IgnoreBundle) const {
+    return hasProperty(MCID::Meta, Type);
+  }
+
   bool isReturn(QueryType Type = AnyInBundle) const {
     return hasProperty(MCID::Return, Type);
   }
@@ -1306,30 +1313,6 @@ public:
            getOperand(0).getSubReg() == getOperand(1).getSubReg();
   }

-  /// Return true if this instruction doesn't produce any output in the form of
-  /// executable instructions.
-  bool isMetaInstruction() const {
-    switch (getOpcode()) {
-    default:
-      return false;
-    case TargetOpcode::IMPLICIT_DEF:
-    case TargetOpcode::KILL:
-    case TargetOpcode::CFI_INSTRUCTION:
-    case TargetOpcode::EH_LABEL:
-    case TargetOpcode::GC_LABEL:
-    case TargetOpcode::DBG_VALUE:
-    case TargetOpcode::DBG_VALUE_LIST:
-    case TargetOpcode::DBG_INSTR_REF:
-    case TargetOpcode::DBG_PHI:
-    case TargetOpcode::DBG_LABEL:
-    case TargetOpcode::LIFETIME_START:
-    case TargetOpcode::LIFETIME_END:
-    case TargetOpcode::PSEUDO_PROBE:
-    case TargetOpcode::ARITH_FENCE:
-      return true;
-    }
-  }
-
   /// Return true if this is a transient instruction that is either very likely
   /// to be eliminated during register allocation (such as copy-like
   /// instructions), or if this instruction doesn't have an execution-time cost.
@@ -1744,7 +1727,7 @@ public:

   /// Erase an operand from an instruction, leaving it with one
   /// fewer operand than it started with.
-  void RemoveOperand(unsigned OpNo);
+  void removeOperand(unsigned OpNo);

   /// Clear this MachineInstr's memory reference descriptor list.  This resets
   /// the memrefs to their most conservative state.  This should be used only
@@ -1863,12 +1846,12 @@ private:
   /// Unlink all of the register operands in this instruction from their
   /// respective use lists.  This requires that the operands already be on their
   /// use lists.
-  void RemoveRegOperandsFromUseLists(MachineRegisterInfo&);
+  void removeRegOperandsFromUseLists(MachineRegisterInfo&);

   /// Add all of the register operands in this instruction from their
   /// respective use lists.  This requires that the operands not be on their
   /// use lists yet.
-  void AddRegOperandsToUseLists(MachineRegisterInfo&);
+  void addRegOperandsToUseLists(MachineRegisterInfo&);

   /// Slow path for hasProperty when we're dealing with a bundle.
   bool hasPropertyInBundle(uint64_t Mask, QueryType Type) const;
diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index c90f07096d02..daf0f18a7518 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -33,7 +33,6 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/IR/DebugLoc.h"
-#include "llvm/Pass.h"

 namespace llvm {
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 00080b171974..41574d8d556a 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -31,14 +31,13 @@ class MDNode;
 class raw_ostream;
 class MachineFunction;
 class ModuleSlotTracker;
+class TargetInstrInfo;

 /// This class contains a discriminated union of information about pointers in
 /// memory operands, relating them back to LLVM IR or to virtual locations (such
 /// as frame indices) that are exposed during codegen.
 struct MachinePointerInfo {
   /// This is the IR pointer value for the access, or it is null if unknown.
-  /// If this is null, then the access is to a pointer in the default address
-  /// space.
   PointerUnion<const Value *, const PseudoSourceValue *> V;

   /// Offset - This is an offset from the base Value*.
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index c07606e89374..cdd0073749d3 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -30,12 +30,10 @@
 #ifndef LLVM_CODEGEN_MACHINEMODULEINFO_H
 #define LLVM_CODEGEN_MACHINEMODULEINFO_H

-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSymbol.h"
 #include "llvm/Pass.h"
 #include <memory>
 #include <utility>
@@ -46,9 +44,9 @@ namespace llvm {

 class BasicBlock;
 class Function;
 class LLVMTargetMachine;
-class MMIAddrLabelMap;
 class MachineFunction;
 class Module;
+class MCSymbol;

 //===----------------------------------------------------------------------===//
 /// This class can be derived from and used by targets to hold private
@@ -106,10 +104,6 @@ class MachineModuleInfo {

   /// \}

-  /// This map keeps track of which symbol is being used for the specified
-  /// basic block's address of label.
-  MMIAddrLabelMap *AddrLabelSymbols;
-
   // TODO: Ideally, what we'd like is to have a switch that allows emitting
   // synchronous (precise at call-sites only) CFA into .eh_frame. However,
   // even under this switch, we'd like .debug_frame to be precise when using
@@ -123,22 +117,6 @@ class MachineModuleInfo {
   /// point. This is used to emit an undefined reference to _fltused.
   bool UsesMSVCFloatingPoint;

-  /// True if the module calls the __morestack function indirectly, as is
-  /// required under the large code model on x86. This is used to emit
-  /// a definition of a symbol, __morestack_addr, containing the address. See
-  /// comments in lib/Target/X86/X86FrameLowering.cpp for more details.
-  bool UsesMorestackAddr;
-
-  /// True if the module contains split-stack functions. This is used to
-  /// emit .note.GNU-split-stack section as required by the linker for
-  /// special handling split-stack function calling no-split-stack function.
-  bool HasSplitStack;
-
-  /// True if the module contains no-split-stack functions. This is used to
-  /// emit .note.GNU-no-split-stack section when it also contains split-stack
-  /// functions.
-  bool HasNosplitStack;
-
   /// Maps IR Functions to their corresponding MachineFunctions.
   DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
   /// Next unique number available for a MachineFunction.
@@ -184,6 +162,9 @@ public:
   /// Machine Function map.
   void deleteMachineFunctionFor(Function &F);

+  /// Add an externally created MachineFunction \p MF for \p F.
+  void insertFunction(const Function &F, std::unique_ptr<MachineFunction> &&MF);
+
   /// Keep track of various per-module pieces of information for backends
   /// that would like to do so.
   template <typename Ty>
@@ -200,55 +181,11 @@ public:
   /// Returns true if valid debug info is present.
   bool hasDebugInfo() const { return DbgInfoAvailable; }
-  void setDebugInfoAvailability(bool avail) { DbgInfoAvailable = avail; }

   bool usesMSVCFloatingPoint() const { return UsesMSVCFloatingPoint; }

   void setUsesMSVCFloatingPoint(bool b) { UsesMSVCFloatingPoint = b; }

-  bool usesMorestackAddr() const {
-    return UsesMorestackAddr;
-  }
-
-  void setUsesMorestackAddr(bool b) {
-    UsesMorestackAddr = b;
-  }
-
-  bool hasSplitStack() const {
-    return HasSplitStack;
-  }
-
-  void setHasSplitStack(bool b) {
-    HasSplitStack = b;
-  }
-
-  bool hasNosplitStack() const {
-    return HasNosplitStack;
-  }
-
-  void setHasNosplitStack(bool b) {
-    HasNosplitStack = b;
-  }
-
-  /// Return the symbol to be used for the specified basic block when its
-  /// address is taken. This cannot be its normal LBB label because the block
-  /// may be accessed outside its containing function.
-  MCSymbol *getAddrLabelSymbol(const BasicBlock *BB) {
-    return getAddrLabelSymbolToEmit(BB).front();
-  }
-
-  /// Return the symbol to be used for the specified basic block when its
-  /// address is taken. If other blocks were RAUW'd to this one, we may have
-  /// to emit them as well, return the whole set.
-  ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(const BasicBlock *BB);
-
-  /// If the specified function has had any references to address-taken blocks
-  /// generated, but the block got deleted, return the symbol now so we can
-  /// emit it. This prevents emitting a reference to a symbol that has no
-  /// definition.
-  void takeDeletedSymbolsForFunction(const Function *F,
-                                     std::vector<MCSymbol *> &Result);
-
   /// \name Exception Handling
   /// \{
diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h
index eded28183ea2..c88e72cdc1d9 100644
--- a/llvm/include/llvm/CodeGen/MachineOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineOperand.h
@@ -13,15 +13,14 @@
 #ifndef LLVM_CODEGEN_MACHINEOPERAND_H
 #define LLVM_CODEGEN_MACHINEOPERAND_H

-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/LowLevelTypeImpl.h"
 #include <cassert>

 namespace llvm {

+class LLT;
 class BlockAddress;
 class Constant;
 class ConstantFP;
@@ -460,6 +459,16 @@ public:
     return !isUndef() && !isInternalRead() && (isUse() || getSubReg());
   }

+  /// Return true if this operand can validly be appended to an arbitrary
+  /// operand list. i.e. this behaves like an implicit operand.
+  bool isValidExcessOperand() const {
+    if ((isReg() && isImplicit()) || isRegMask())
+      return true;
+
+    // Debug operands
+    return isMetadata() || isMCSymbol();
+  }
+
   //===--------------------------------------------------------------------===//
   // Mutators for Register Operands
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
index 285b858c96cb..cb0998984dfb 100644
--- a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -15,8 +15,9 @@
 #ifndef LLVM_CODEGEN_MACHINEOPTIMIZATIONREMARKEMITTER_H
 #define LLVM_CODEGEN_MACHINEOPTIMIZATIONREMARKEMITTER_H

-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"

 namespace llvm {
 class MachineBasicBlock;
diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index 08b76295dbf2..f968089e0de0 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -15,11 +15,10 @@
 #ifndef LLVM_CODEGEN_MACHINEOUTLINER_H
 #define LLVM_CODEGEN_MACHINEOUTLINER_H

-#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include <initializer_list>

 namespace llvm {
 namespace outliner {
@@ -56,6 +55,55 @@ private:
   /// target.
   unsigned CallOverhead = 0;

+  /// Liveness information for this Candidate. Tracks from the end of the
+  /// block containing this Candidate to the beginning of its sequence.
+  ///
+  /// Optional. Can be used to fine-tune the cost model, or fine-tune legality
+  /// decisions.
+  LiveRegUnits FromEndOfBlockToStartOfSeq;
+
+  /// Liveness information restricted to this Candidate's instruction sequence.
+  ///
+  /// Optional. Can be used to fine-tune the cost model, or fine-tune legality
+  /// decisions.
+  LiveRegUnits InSeq;
+
+  /// True if FromEndOfBlockToStartOfSeq has been initialized.
+  bool FromEndOfBlockToStartOfSeqWasSet = false;
+
+  /// True if InSeq has been initialized.
+  bool InSeqWasSet = false;
+
+  /// Populate FromEndOfBlockToStartOfSeq with liveness information.
+  void initFromEndOfBlockToStartOfSeq(const TargetRegisterInfo &TRI) {
+    assert(MBB->getParent()->getRegInfo().tracksLiveness() &&
+           "Candidate's Machine Function must track liveness");
+    // Only initialize once.
+    if (FromEndOfBlockToStartOfSeqWasSet)
+      return;
+    FromEndOfBlockToStartOfSeqWasSet = true;
+    FromEndOfBlockToStartOfSeq.init(TRI);
+    FromEndOfBlockToStartOfSeq.addLiveOuts(*MBB);
+    // Compute liveness from the end of the block up to the beginning of the
+    // outlining candidate.
+    for (auto &MI : make_range(MBB->rbegin(),
+                               (MachineBasicBlock::reverse_iterator)front()))
+      FromEndOfBlockToStartOfSeq.stepBackward(MI);
+  }
+
+  /// Populate InSeq with liveness information.
+  void initInSeq(const TargetRegisterInfo &TRI) {
+    assert(MBB->getParent()->getRegInfo().tracksLiveness() &&
+           "Candidate's Machine Function must track liveness");
+    // Only initialize once.
+    if (InSeqWasSet)
+      return;
+    InSeqWasSet = true;
+    InSeq.init(TRI);
+    for (auto &MI : make_range(front(), std::next(back())))
+      InSeq.accumulate(MI);
+  }
+
 public:
   /// The index of this \p Candidate's \p OutlinedFunction in the list of
   /// \p OutlinedFunctions.
@@ -65,26 +113,9 @@ public:
   /// from this point. Defined by the target.
   unsigned CallConstructionID = 0;

-  /// Contains physical register liveness information for the MBB containing
-  /// this \p Candidate.
-  ///
-  /// This is optionally used by the target to calculate more fine-grained
-  /// cost model information.
-  LiveRegUnits LRU;
-
-  /// Contains the accumulated register liveness information for the
-  /// instructions in this \p Candidate.
-  ///
-  /// This is optionally used by the target to determine which registers have
-  /// been used across the sequence.
-  LiveRegUnits UsedInSequence;
-
   /// Target-specific flags for this Candidate's MBB.
   unsigned Flags = 0x0;

-  /// True if initLRU has been called on this Candidate.
-  bool LRUWasSet = false;
-
   /// Return the number of instructions in this Candidate.
   unsigned getLength() const { return Len; }

@@ -109,6 +140,50 @@ public:
   MachineFunction *getMF() const { return MBB->getParent(); }
   MachineBasicBlock *getMBB() const { return MBB; }

+  /// \returns True if \p Reg is available from the end of the block to the
+  /// beginning of the sequence.
+  ///
+  /// This query considers the following range:
+  ///
+  /// in_seq_1
+  /// in_seq_2
+  /// ...
+  /// in_seq_n
+  /// not_in_seq_1
+  /// ...
+  ///
+  bool isAvailableAcrossAndOutOfSeq(Register Reg,
+                                    const TargetRegisterInfo &TRI) {
+    if (!FromEndOfBlockToStartOfSeqWasSet)
+      initFromEndOfBlockToStartOfSeq(TRI);
+    return FromEndOfBlockToStartOfSeq.available(Reg);
+  }
+
+  /// \returns True if `isAvailableAcrossAndOutOfSeq` fails for any register
+  /// in \p Regs.
+  bool isAnyUnavailableAcrossOrOutOfSeq(std::initializer_list<Register> Regs,
+                                        const TargetRegisterInfo &TRI) {
+    if (!FromEndOfBlockToStartOfSeqWasSet)
+      initFromEndOfBlockToStartOfSeq(TRI);
+    return any_of(Regs, [&](Register Reg) {
+      return !FromEndOfBlockToStartOfSeq.available(Reg);
+    });
+  }
+
+  /// \returns True if \p Reg is available within the sequence itself.
+  ///
+  /// This query considers the following range:
+  ///
+  /// in_seq_1
+  /// in_seq_2
+  /// ...
+  /// in_seq_n
+  bool isAvailableInsideSeq(Register Reg, const TargetRegisterInfo &TRI) {
+    if (!InSeqWasSet)
+      initInSeq(TRI);
+    return InSeq.available(Reg);
+  }
+
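A sketch of how a target outliner's cost model might consult the two availability queries above; the helper, the LinkReg parameter, and the returned variant IDs are invented for illustration, and only the methods declared in this hunk are used.

    #include "llvm/CodeGen/MachineOutliner.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    using namespace llvm;

    // Pick a call-construction variant based on whether a link-style register
    // (e.g. a return-address register) is free around and inside the sequence.
    static unsigned chooseCallVariant(outliner::Candidate &C, Register LinkReg,
                                      const TargetRegisterInfo &TRI) {
      // Free from the end of the block back through the candidate: no save
      // of LinkReg is needed at all.
      if (C.isAvailableAcrossAndOutOfSeq(LinkReg, TRI))
        return 0; // Hypothetical "no save" CallConstructionID.
      // Free only inside the sequence: LinkReg may be clobbered by the call
      // as long as it is preserved around it.
      if (C.isAvailableInsideSeq(LinkReg, TRI))
        return 1; // Hypothetical "save/restore around call" ID.
      return 2;   // Hypothetical most conservative ID.
    }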
   /// The number of instructions that would be saved by outlining every
   /// candidate of this type.
   ///
@@ -132,31 +207,6 @@ public:
     return getStartIdx() > RHS.getStartIdx();
   }

-  /// Compute the registers that are live across this Candidate.
-  /// Used by targets that need this information for cost model calculation.
-  /// If a target does not need this information, then this should not be
-  /// called.
-  void initLRU(const TargetRegisterInfo &TRI) {
-    assert(MBB->getParent()->getRegInfo().tracksLiveness() &&
-           "Candidate's Machine Function must track liveness");
-    // Only initialize once.
-    if (LRUWasSet)
-      return;
-    LRUWasSet = true;
-    LRU.init(TRI);
-    LRU.addLiveOuts(*MBB);
-
-    // Compute liveness from the end of the block up to the beginning of the
-    // outlining candidate.
-    std::for_each(MBB->rbegin(), (MachineBasicBlock::reverse_iterator)front(),
-                  [this](MachineInstr &MI) { LRU.stepBackward(MI); });
-
-    // Walk over the sequence itself and figure out which registers were used
-    // in the sequence.
-    UsedInSequence.init(TRI);
-    std::for_each(front(), std::next(back()),
-                  [this](MachineInstr &MI) { UsedInSequence.accumulate(MI); });
-  }
 };

 /// The information necessary to create an outlined function for some
diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h
index 75b8a89c812e..6089339c7f5a 100644
--- a/llvm/include/llvm/CodeGen/MachinePassManager.h
+++ b/llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -25,13 +25,15 @@
 #include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/type_traits.h"
+
+#include <map>

 namespace llvm {
 class Module;
+class Function;
+class MachineFunction;

 extern template class AnalysisManager<MachineFunction>;

diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index e6763899a083..7748055f5d35 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -47,6 +47,7 @@ FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
 FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
 FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ())
+FUNCTION_PASS("tlshoist", TLSVariableHoistPass, ())
 FUNCTION_PASS("verify", VerifierPass, ())
 #undef FUNCTION_PASS

@@ -119,6 +120,7 @@ DUMMY_FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, ())
 DUMMY_FUNCTION_PASS("cfguard-dispatch", CFGuardDispatchPass, ())
 DUMMY_FUNCTION_PASS("cfguard-check", CFGuardCheckPass, ())
 DUMMY_FUNCTION_PASS("gc-info-printer", GCInfoPrinterPass, ())
+DUMMY_FUNCTION_PASS("select-optimize", SelectOptimizePass, ())
 #undef DUMMY_FUNCTION_PASS

 #ifndef DUMMY_MODULE_PASS
@@ -197,6 +199,5 @@ DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-cycles", MachineCycleInfoWrapperPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass, ())
 #undef DUMMY_MACHINE_FUNCTION_PASS
diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 7e7fa57d80da..4559f7a9bde7 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -40,13 +40,17 @@
 #ifndef LLVM_CODEGEN_MACHINEPIPELINER_H
 #define LLVM_CODEGEN_MACHINEPIPELINER_H

+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/InitializePasses.h"

+#include <deque>
+
 namespace llvm {

 class AAResults;
@@ -80,6 +84,8 @@ public:
     SmallVector<MachineOperand, 4> BrCond;
     MachineInstr *LoopInductionVar = nullptr;
     MachineInstr *LoopCompare = nullptr;
+    std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo =
+        nullptr;
   };
   LoopInfo LI;
@@ -115,6 +121,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   LiveIntervals &LIS;
   const RegisterClassInfo &RegClassInfo;
   unsigned II_setByPragma = 0;
+  TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr;

   /// A topological ordering of the SUnits, which is needed for changing
   /// dependences and iterating over the SUnits.
@@ -192,9 +199,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {

 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci, unsigned II)
+                    const RegisterClassInfo &rci, unsigned II,
+                    TargetInstrInfo::PipelinerLoopInfo *PLI)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) {
+        RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
+        Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
@@ -585,6 +594,13 @@ public:
     return ScheduledInstrs[cycle];
   }

+  SmallSet<SUnit *, 8>
+  computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
+                             TargetInstrInfo::PipelinerLoopInfo *PLI);
+
+  bool
+  normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
+                                    TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 94ae6fe02e9c..b2c5f12106af 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -15,18 +15,16 @@

 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/LowLevelType.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/RegisterBank.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/MC/LaneBitmask.h"
@@ -229,6 +227,16 @@ public:
   /// Returns true if the updated CSR list was initialized and false otherwise.
   bool isUpdatedCSRsInitialized() const { return IsUpdatedCSRsInitialized; }

+  /// Returns true if a register can be used as an argument to a function.
+  bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const;
+
+  /// Returns true if a register is a fixed register.
+  bool isFixedRegister(const MachineFunction &MF, MCRegister Reg) const;
+
+  /// Returns true if a register is a general purpose register.
+  bool isGeneralPurposeRegister(const MachineFunction &MF,
+                                MCRegister Reg) const;
+
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
   /// \see UpdatedCalleeSavedRegs.
@@ -825,23 +833,12 @@ public:
   /// to refer to the designated register.
   void updateDbgUsersToReg(MCRegister OldReg, MCRegister NewReg,
                            ArrayRef<MachineInstr *> Users) const {
-    SmallSet<unsigned, 4> OldRegUnits;
-    for (MCRegUnitIterator RUI(OldReg, getTargetRegisterInfo()); RUI.isValid();
-         ++RUI)
-      OldRegUnits.insert(*RUI);
-
     // If this operand is a register, check whether it overlaps with OldReg.
     // If it does, replace with NewReg.
-    auto UpdateOp = [this, &NewReg, &OldReg, &OldRegUnits](MachineOperand &Op) {
-      if (Op.isReg()) {
-        for (MCRegUnitIterator RUI(OldReg, getTargetRegisterInfo());
-             RUI.isValid(); ++RUI) {
-          if (OldRegUnits.contains(*RUI)) {
-            Op.setReg(NewReg);
-            break;
-          }
-        }
-      }
+    auto UpdateOp = [this, &NewReg, &OldReg](MachineOperand &Op) {
+      if (Op.isReg() &&
+          getTargetRegisterInfo()->regsOverlap(Op.getReg(), OldReg))
+        Op.setReg(NewReg);
     };

     // Iterate through (possibly several) operands to DBG_VALUEs and update
diff --git a/llvm/include/llvm/CodeGen/MachineSSAContext.h b/llvm/include/llvm/CodeGen/MachineSSAContext.h
index 6dbf321bdeaa..f59d7cf8a522 100644
--- a/llvm/include/llvm/CodeGen/MachineSSAContext.h
+++ b/llvm/include/llvm/CodeGen/MachineSSAContext.h
@@ -15,21 +15,21 @@
 #ifndef LLVM_CODEGEN_MACHINESSACONTEXT_H
 #define LLVM_CODEGEN_MACHINESSACONTEXT_H

-#include "llvm/ADT/GenericSSAContext.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/Support/Printable.h"

-#include <memory>
-
 namespace llvm {
+class MachineRegisterInfo;
 class MachineInstr;
-class MachineBasicBlock;
 class MachineFunction;
 class Register;
+template <typename _FunctionT> class GenericSSAContext;
 template <typename, bool> class DominatorTreeBase;

 inline auto successors(MachineBasicBlock *BB) { return BB->successors(); }
 inline auto predecessors(MachineBasicBlock *BB) { return BB->predecessors(); }
+inline unsigned succ_size(MachineBasicBlock *BB) { return BB->succ_size(); }
+inline unsigned pred_size(MachineBasicBlock *BB) { return BB->pred_size(); }

 template <> class GenericSSAContext<MachineFunction> {
   const MachineRegisterInfo *RegInfo = nullptr;
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 267c4b595eec..0554eb1ab77e 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -287,7 +287,7 @@ protected:
   const SUnit *NextClusterPred = nullptr;
   const SUnit *NextClusterSucc = nullptr;

-#ifndef NDEBUG
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
   unsigned NumInstrsScheduled = 0;
@@ -679,7 +679,7 @@ private:
   // For each PIdx, stores the resource group IDs of its subunits
   SmallVector ResourceGroupSubUnitMasks;

-#ifndef NDEBUG
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
   // Remember the greatest possible stall as an upper bound on the number of
   // times we should retry the pending queue because of a hazard.
   unsigned MaxObservedStall;
diff --git a/llvm/include/llvm/CodeGen/MachineStableHash.h b/llvm/include/llvm/CodeGen/MachineStableHash.h
index 8423b2da1c78..43571b7b8afd 100644
--- a/llvm/include/llvm/CodeGen/MachineStableHash.h
+++ b/llvm/include/llvm/CodeGen/MachineStableHash.h
@@ -17,6 +17,8 @@
 #include "llvm/CodeGen/StableHashing.h"

 namespace llvm {
+class MachineBasicBlock;
+class MachineFunction;
 class MachineInstr;
 class MachineOperand;

@@ -24,6 +26,8 @@ stable_hash stableHashValue(const MachineOperand &MO);
 stable_hash stableHashValue(const MachineInstr &MI, bool HashVRegs = false,
                             bool HashConstantPoolIndices = false,
                             bool HashMemOperands = false);
+stable_hash stableHashValue(const MachineBasicBlock &MBB);
+stable_hash stableHashValue(const MachineFunction &MF);

 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index e8dbf49994bb..c515101e80fd 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -61,7 +61,6 @@
 #define LLVM_CODEGEN_MODULOSCHEDULE_H

 #include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineLoopUtils.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -70,6 +69,8 @@
 namespace llvm {

 class MachineBasicBlock;
+class MachineLoop;
+class MachineRegisterInfo;
 class MachineInstr;
 class LiveIntervals;

@@ -190,8 +191,8 @@ private:
   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
-                      ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
-                      MBBVectorTy &PrologBBs);
+                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
diff --git a/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h b/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
index 51822d082bad..043b6b120632 100644
--- a/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
+++ b/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
@@ -190,7 +190,7 @@ namespace PBQP {

       RawVector v = G.getNodeCosts(NId);

-#ifndef NDEBUG
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
       // Although a conservatively allocatable node can be allocated to a register,
       // spilling it may provide a lower cost solution. Assert here that spilling
       // is done by choice, not because there were no register available.
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 616ab1034133..6e37d42f0d29 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -51,10 +51,8 @@ namespace llvm {
   FunctionPass *createUnreachableBlockEliminationPass();

   /// createBasicBlockSections Pass - This pass assigns sections to machine
-  /// basic blocks and is enabled with -fbasic-block-sections. Buf is a memory
-  /// buffer that contains the list of functions and basic block ids to
-  /// selectively enable basic block sections.
-  MachineFunctionPass *createBasicBlockSectionsPass(const MemoryBuffer *Buf);
+  /// basic blocks and is enabled with -fbasic-block-sections.
+  MachineFunctionPass *createBasicBlockSectionsPass();

   /// createMachineFunctionSplitterPass - This pass splits machine functions
   /// using profile information.
@@ -331,6 +329,8 @@ namespace llvm {
   /// machine instructions.
   extern char &MachineCopyPropagationID;

+  MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr);
+
   /// PeepholeOptimizer - This pass performs peephole optimizations -
   /// like extension and comparison eliminations.
   extern char &PeepholeOptimizerID;
@@ -494,6 +494,9 @@ namespace llvm {
   // This pass expands indirectbr instructions.
   FunctionPass *createIndirectBrExpandPass();

+  /// Creates CFI Fixup pass. \see CFIFixup.cpp
+  FunctionPass *createCFIFixup();
+
   /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
   FunctionPass *createCFIInstrInserter();

@@ -554,6 +557,12 @@ namespace llvm {
   /// When learning an eviction policy, extract score(reward) information,
   /// otherwise this does nothing
   FunctionPass *createRegAllocScoringPass();
+
+  /// JMC instrument pass.
+  ModulePass *createJMCInstrumenterPass();
+
+  /// This pass converts conditional moves to conditional jumps when profitable.
+  FunctionPass *createSelectOptimizePass();
 } // End llvm namespace

 #endif
diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h
index f1487017f205..07b7ba321566 100644
--- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h
@@ -25,7 +25,7 @@ class MachineMemOperand;
 class MIRFormatter;
 class PseudoSourceValue;
 class raw_ostream;
-class TargetInstrInfo;
+class TargetMachine;

 raw_ostream &operator<<(raw_ostream &OS, const PseudoSourceValue* PSV);

@@ -59,7 +59,7 @@ private:
   virtual void printCustom(raw_ostream &O) const;

 public:
-  explicit PseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII);
+  explicit PseudoSourceValue(unsigned Kind, const TargetMachine &TM);

   virtual ~PseudoSourceValue();

@@ -95,8 +95,8 @@ class FixedStackPseudoSourceValue : public PseudoSourceValue {
   const int FI;

 public:
-  explicit FixedStackPseudoSourceValue(int FI, const TargetInstrInfo &TII)
-      : PseudoSourceValue(FixedStack, TII), FI(FI) {}
+  explicit FixedStackPseudoSourceValue(int FI, const TargetMachine &TM)
+      : PseudoSourceValue(FixedStack, TM), FI(FI) {}

   static bool classof(const PseudoSourceValue *V) {
     return V->kind() == FixedStack;
@@ -115,7 +115,7 @@ public:

 class CallEntryPseudoSourceValue : public PseudoSourceValue {
 protected:
-  CallEntryPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII);
+  CallEntryPseudoSourceValue(unsigned Kind, const TargetMachine &TM);

 public:
   bool isConstant(const MachineFrameInfo *) const override;
@@ -128,8 +128,7 @@ class GlobalValuePseudoSourceValue : public CallEntryPseudoSourceValue {
   const GlobalValue *GV;

 public:
-  GlobalValuePseudoSourceValue(const GlobalValue *GV,
-                               const TargetInstrInfo &TII);
+  GlobalValuePseudoSourceValue(const GlobalValue *GV, const TargetMachine &TM);

   static bool classof(const PseudoSourceValue *V) {
     return V->kind() == GlobalValueCallEntry;
@@ -143,7 +142,7 @@ class ExternalSymbolPseudoSourceValue : public CallEntryPseudoSourceValue {
   const char *ES;

 public:
-  ExternalSymbolPseudoSourceValue(const char *ES, const TargetInstrInfo &TII);
+  ExternalSymbolPseudoSourceValue(const char *ES, const TargetMachine &TM);

   static bool classof(const PseudoSourceValue *V) {
     return V->kind() == ExternalSymbolCallEntry;
@@ -154,7 +153,7 @@ public:

 /// Manages creation of pseudo source values.
class PseudoSourceValueManager { - const TargetInstrInfo &TII; + const TargetMachine &TM; const PseudoSourceValue StackPSV, GOTPSV, JumpTablePSV, ConstantPoolPSV; std::map> FSValues; StringMap> @@ -164,7 +163,7 @@ class PseudoSourceValueManager { GlobalCallEntries; public: - PseudoSourceValueManager(const TargetInstrInfo &TII); + PseudoSourceValueManager(const TargetMachine &TM); /// Return a pseudo source value referencing the area below the stack frame of /// a function, e.g., the argument space. diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h index e0205d7c92c8..a323ee9dc396 100644 --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -749,7 +749,6 @@ namespace rdf { RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const; RegisterRef makeRegRef(const MachineOperand &Op) const; - RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const; NodeAddr getNextRelated(NodeAddr IA, NodeAddr RA) const; diff --git a/llvm/include/llvm/CodeGen/RegAllocPBQP.h b/llvm/include/llvm/CodeGen/RegAllocPBQP.h index 1ed55082e32c..1ea8840947bc 100644 --- a/llvm/include/llvm/CodeGen/RegAllocPBQP.h +++ b/llvm/include/llvm/CodeGen/RegAllocPBQP.h @@ -183,11 +183,12 @@ public: NodeMetadata() = default; NodeMetadata(const NodeMetadata &Other) - : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts), - OptUnsafeEdges(new unsigned[NumOpts]), VReg(Other.VReg), - AllowedRegs(Other.AllowedRegs) -#ifndef NDEBUG - , everConservativelyAllocatable(Other.everConservativelyAllocatable) + : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts), + OptUnsafeEdges(new unsigned[NumOpts]), VReg(Other.VReg), + AllowedRegs(Other.AllowedRegs) +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + , + everConservativelyAllocatable(Other.everConservativelyAllocatable) #endif { if (NumOpts > 0) { @@ -217,7 +218,7 @@ public: assert(RS >= this->RS && "A node's reduction state can not be downgraded"); this->RS = RS; -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS // Remember this state to assert later that a non-infinite register // option was available. if (RS == ConservativelyAllocatable) @@ -247,7 +248,7 @@ public: &OptUnsafeEdges[NumOpts]); } -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS bool wasConservativelyAllocatable() const { return everConservativelyAllocatable; } @@ -261,7 +262,7 @@ private: Register VReg; GraphMetadata::AllowedRegVecRef AllowedRegs; -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS bool everConservativelyAllocatable = false; #endif }; diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h index a683223b5a4a..9dc3e98fe837 100644 --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -69,7 +69,7 @@ public: /// Return true if the specified register number is in /// the virtual register namespace. static bool isVirtualRegister(unsigned Reg) { - return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); + return Reg & MCRegister::VirtualRegFlag; } /// Convert a virtual register number to a 0-based index. diff --git a/llvm/include/llvm/CodeGen/RegisterBank.h b/llvm/include/llvm/CodeGen/RegisterBank.h new file mode 100644 index 000000000000..66885f113e8e --- /dev/null +++ b/llvm/include/llvm/CodeGen/RegisterBank.h @@ -0,0 +1,98 @@ +//==-- llvm/CodeGen/RegisterBank.h - Register Bank ---------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file declares the API of register banks. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGISTERBANK_H +#define LLVM_CODEGEN_REGISTERBANK_H + +#include "llvm/ADT/BitVector.h" + +namespace llvm { +// Forward declarations. +class RegisterBankInfo; +class raw_ostream; +class TargetRegisterClass; +class TargetRegisterInfo; + +/// This class implements the register bank concept. +/// Two instances of RegisterBank must have different IDs. +/// This property is enforced by the RegisterBankInfo class. +class RegisterBank { +private: + unsigned ID; + const char *Name; + unsigned Size; + BitVector ContainedRegClasses; + + /// Sentinel value used to recognize a register bank that has not been + /// properly initialized yet. + static const unsigned InvalidID; + + /// Only the RegisterBankInfo can initialize RegisterBank properly. + friend RegisterBankInfo; + +public: + RegisterBank(unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses, unsigned NumRegClasses); + + /// Get the identifier of this register bank. + unsigned getID() const { return ID; } + + /// Get a user-friendly name of this register bank. + /// Should be used only for debugging purposes. + const char *getName() const { return Name; } + + /// Get the maximal size in bits that fits in this register bank. + unsigned getSize() const { return Size; } + + /// Check whether this instance is ready to be used. + bool isValid() const; + + /// Check if this register bank is valid. In other words, + /// check if it has been properly constructed. + /// + /// \note This method does not check anything when assertions are disabled. + /// + /// \return True if the check was successful. + bool verify(const TargetRegisterInfo &TRI) const; + + /// Check whether this register bank covers \p RC. + /// In other words, check if this register bank fully covers + /// the registers that \p RC contains. + /// \pre isValid() + bool covers(const TargetRegisterClass &RC) const; + + /// Check whether \p OtherRB is the same as this. + bool operator==(const RegisterBank &OtherRB) const; + bool operator!=(const RegisterBank &OtherRB) const { + return !this->operator==(OtherRB); + } + + /// Dump the register mask on dbgs() stream. + /// The dump is verbose. + void dump(const TargetRegisterInfo *TRI = nullptr) const; + + /// Print the register mask on OS. + /// If IsForDebug is false, then only the name of the register bank + /// is printed. Otherwise, all the fields are printed. + /// TRI is then used to print the names of the register classes that + /// this register bank covers. + void print(raw_ostream &OS, bool IsForDebug = false, + const TargetRegisterInfo *TRI = nullptr) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const RegisterBank &RegBank) { + RegBank.print(OS); + return OS; +} +} // End namespace llvm. + +#endif
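// [Editorial sketch, not part of the patch] Typical queries against the
// RegisterBank API declared above; the bank and register class arguments
// stand in for values a target's RegisterBankInfo would hand out.
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/Support/Debug.h"
static void describeBank(const llvm::RegisterBank &RB,
                         const llvm::TargetRegisterClass &RC) {
  if (RB.covers(RC)) // requires RB.isValid()
    llvm::dbgs() << RB.getName() << " (ID " << RB.getID() << ", "
                 << RB.getSize() << " bits) covers the class\n";
}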
diff --git a/llvm/include/llvm/CodeGen/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/RegisterBankInfo.h new file mode 100644 index 000000000000..bba4f1f025a0 --- /dev/null +++ b/llvm/include/llvm/CodeGen/RegisterBankInfo.h @@ -0,0 +1,775 @@ +//===- llvm/CodeGen/RegisterBankInfo.h --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file declares the API for the register bank info. +/// This API is responsible for handling the register banks. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGISTERBANKINFO_H +#define LLVM_CODEGEN_REGISTERBANKINFO_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include +#include +#include + +namespace llvm { + +class MachineInstr; +class MachineRegisterInfo; +class raw_ostream; +class RegisterBank; +class TargetInstrInfo; +class TargetRegisterClass; +class TargetRegisterInfo; + +/// Holds all the information related to register banks. +class RegisterBankInfo { +public: + /// Helper struct that represents how a value is partially mapped + /// into a register. + /// The StartIdx and Length represent what region of the original + /// value this partial mapping covers. + /// This can be represented as a mask of contiguous bits starting + /// at bit StartIdx and spanning Length bits. + /// StartIdx is counted from the least significant bit. + struct PartialMapping { + /// Number of bits at which this partial mapping starts in the + /// original value. The bits are counted from the least significant + /// bit to the most significant bit. + unsigned StartIdx; + + /// Length of this mapping in bits. This is how many bits this + /// partial mapping covers in the original value: + /// from StartIdx to StartIdx + Length - 1. + unsigned Length; + + /// Register bank where the partial value lives. + const RegisterBank *RegBank; + + PartialMapping() = default; + + /// Provide a shortcut for quickly building PartialMapping. + PartialMapping(unsigned StartIdx, unsigned Length, + const RegisterBank &RegBank) + : StartIdx(StartIdx), Length(Length), RegBank(&RegBank) {} + + /// \return the index, in the original value, of the most + /// significant bit that this partial mapping covers. + unsigned getHighBitIdx() const { return StartIdx + Length - 1; } + + /// Print this partial mapping on dbgs() stream. + void dump() const; + + /// Print this partial mapping on \p OS. + void print(raw_ostream &OS) const; + + /// Check that the Mask is compatible with the RegBank. + /// Indeed, if the RegBank cannot accommodate the "active bits" of the mask, + /// there is no way this mapping is valid. + /// + /// \note This method does not check anything when assertions are disabled. + /// + /// \return True if the check was successful. + bool verify() const; + };
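// [Editorial sketch, not part of the patch] A concrete instance of the mask
// view described above, assuming a 32-bit register bank GPRBank: one 64-bit
// value represented as two contiguous 32-bit partial mappings.
#include "llvm/CodeGen/RegisterBankInfo.h"
#include <cassert>
static void splitInHalves(const llvm::RegisterBank &GPRBank) {
  using PM = llvm::RegisterBankInfo::PartialMapping;
  PM Lo(/*StartIdx=*/0, /*Length=*/32, GPRBank);  // covers bits [0, 31]
  PM Hi(/*StartIdx=*/32, /*Length=*/32, GPRBank); // covers bits [32, 63]
  assert(Lo.getHighBitIdx() == 31 && Hi.getHighBitIdx() == 63);
}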
+ /// Helper struct that represents how a value is mapped through + /// different register banks. + /// + /// \note: So far we do not have any users of the complex mappings + /// (mappings with more than one partial mapping), but when we do, + /// we will need to duplicate the partial mappings. + /// The alternative could be to use an array of pointers to partial + /// mappings (i.e., PartialMapping **BreakDown) and duplicate the + /// pointers instead. + /// + /// E.g., + /// Let's say we have a 32-bit add and a <2 x 32-bit> vadd. We + /// can expand the + /// <2 x 32-bit> vadd into 2 x 32-bit adds. + /// + /// Currently the TableGen-like file would look like: + /// \code + /// PartialMapping[] = { + /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first + /// // vec elt. + /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, + /// /*<2x32-bit> vadd*/ {0, 64, VPR} + /// }; // PartialMapping duplicated. + /// + /// ValueMapping[] { + /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, + /// /*expanded vadd on 2xadd*/ {&PartialMapping[1], 2}, + /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} + /// }; + /// \endcode + /// + /// With the array of pointers, we would have: + /// \code + /// PartialMapping[] = { + /// /*32-bit add lower */ { 0, 32, GPR}, + /// /*32-bit add upper */ {32, 32, GPR}, + /// /*<2x32-bit> vadd */ { 0, 64, VPR} + /// }; // No more duplication. + /// + /// BreakDowns[] = { + /// /*AddBreakDown*/ &PartialMapping[0], + /// /*2xAddBreakDown*/ &PartialMapping[0], &PartialMapping[1], + /// /*VAddBreakDown*/ &PartialMapping[2] + /// }; // Addresses of PartialMapping duplicated (smaller). + /// + /// ValueMapping[] { + /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, + /// /*expanded vadd on 2xadd*/ {&BreakDowns[1], 2}, + /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} + /// }; + /// \endcode + /// + /// Given that a PartialMapping is small, the code size + /// impact is actually a degradation. Moreover, the compile time will + /// be hit by the additional indirection. + /// If PartialMapping gets bigger we may reconsider. + struct ValueMapping { + /// How the value is broken down between the different register banks. + const PartialMapping *BreakDown; + + /// Number of partial mappings used to break down this value. + unsigned NumBreakDowns; + + /// The default constructor creates an invalid (isValid() == false) + /// instance. + ValueMapping() : ValueMapping(nullptr, 0) {} + + /// Initialize a ValueMapping with the given parameters. + /// \p BreakDown needs to have a lifetime at least as long + /// as this instance. + ValueMapping(const PartialMapping *BreakDown, unsigned NumBreakDowns) + : BreakDown(BreakDown), NumBreakDowns(NumBreakDowns) {} + + /// Iterators through the PartialMappings. + const PartialMapping *begin() const { return BreakDown; } + const PartialMapping *end() const { return BreakDown + NumBreakDowns; } + + /// \return true if all partial mappings are the same size and register + /// bank. + bool partsAllUniform() const; + + /// Check if this ValueMapping is valid. + bool isValid() const { return BreakDown && NumBreakDowns; } + + /// Verify that this mapping makes sense for a value of + /// \p MeaningfulBitWidth. + /// \note This method does not check anything when assertions are disabled. + /// + /// \return True if the check was successful. + bool verify(unsigned MeaningfulBitWidth) const; + + /// Print this on dbgs() stream. + void dump() const; + + /// Print this on \p OS. + void print(raw_ostream &OS) const; + }; + + /// Helper class that represents how the value of an instruction may be + /// mapped and what the related cost of such a mapping is. + class InstructionMapping { + /// Identifier of the mapping. + /// This is used to communicate between the target and the optimizers + /// which mapping should be realized. + unsigned ID = InvalidMappingID; + + /// Cost of this mapping. + unsigned Cost = 0; + + /// Mapping of all the operands. + const ValueMapping *OperandsMapping = nullptr;
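// [Editorial sketch, not part of the patch] Walking a ValueMapping's
// breakdown through the begin()/end() iterators declared above; VM stands
// for any valid mapping.
#include "llvm/CodeGen/RegisterBankInfo.h"
static unsigned coveredBits(const llvm::RegisterBankInfo::ValueMapping &VM) {
  unsigned Bits = 0;
  for (const llvm::RegisterBankInfo::PartialMapping &PM : VM)
    Bits += PM.Length; // the partial mappings tile the original value
  return Bits;
}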
+ /// Number of operands. + unsigned NumOperands = 0; + + const ValueMapping &getOperandMapping(unsigned i) { + assert(i < getNumOperands() && "Out of bound operand"); + return OperandsMapping[i]; + } + + public: + /// Constructor for the mapping of an instruction. + /// \p NumOperands must be equal to the number of operands of + /// the related instruction. + /// The rationale is that it is more efficient for the optimizers + /// to be able to assume that the mapping of the ith operand is + /// at the index i. + InstructionMapping(unsigned ID, unsigned Cost, + const ValueMapping *OperandsMapping, + unsigned NumOperands) + : ID(ID), Cost(Cost), OperandsMapping(OperandsMapping), + NumOperands(NumOperands) {} + + /// Default constructor. + /// Use this constructor to express that the mapping is invalid. + InstructionMapping() = default; + + /// Get the cost. + unsigned getCost() const { return Cost; } + + /// Get the ID. + unsigned getID() const { return ID; } + + /// Get the number of operands. + unsigned getNumOperands() const { return NumOperands; } + + /// Get the value mapping of the ith operand. + /// \pre The mapping for the ith operand has been set. + /// \pre The ith operand is a register. + const ValueMapping &getOperandMapping(unsigned i) const { + const ValueMapping &ValMapping = + const_cast(this)->getOperandMapping(i); + return ValMapping; + } + + /// Set the mapping for all the operands. + /// In other words, OpdsMapping should hold at least getNumOperands + /// ValueMappings. + void setOperandsMapping(const ValueMapping *OpdsMapping) { + OperandsMapping = OpdsMapping; + } + + /// Check whether this object is valid. + /// This is a lightweight check that catches obviously wrong instances. + bool isValid() const { + return getID() != InvalidMappingID && OperandsMapping; + } + + /// Verify that this mapping makes sense for \p MI. + /// \pre \p MI must be connected to a MachineFunction. + /// + /// \note This method does not check anything when assertions are disabled. + /// + /// \return True if the check was successful. + bool verify(const MachineInstr &MI) const; + + /// Print this on dbgs() stream. + void dump() const; + + /// Print this on \p OS. + void print(raw_ostream &OS) const; + }; + + /// Convenient type to represent the alternatives for mapping an + /// instruction. + /// \todo When we move to TableGen this should be an array ref. + using InstructionMappings = SmallVector; + + /// Helper class used to get/create the virtual registers that will be used + /// to replace the MachineOperand when applying a mapping. + class OperandsMapper { + /// The OpIdx-th cell contains the index in NewVRegs where the VRegs of the + /// OpIdx-th operand start. -1 means we do not have such a mapping yet. + /// Note: We use a SmallVector to avoid heap allocation in most cases. + SmallVector OpToNewVRegIdx; + + /// Hold the registers that will be used to map MI with InstrMapping. + SmallVector NewVRegs; + + /// Current MachineRegisterInfo, used to create new virtual registers. + MachineRegisterInfo &MRI; + + /// Instruction being remapped. + MachineInstr &MI; + + /// New mapping of the instruction. + const InstructionMapping &InstrMapping; + + /// Constant value identifying that the index in OpToNewVRegIdx + /// for an operand has not been set yet. + static const int DontKnowIdx; + + /// Get the range in NewVRegs to store all the partial + /// values for the \p OpIdx-th operand. + /// + /// \return The iterator range for the space created.
+ /// + /// \pre getMI().getOperand(OpIdx).isReg() + iterator_range::iterator> + getVRegsMem(unsigned OpIdx); + + /// Get the end iterator for a range starting at \p StartIdx and + /// spanning \p NumVal in NewVRegs. + /// \pre StartIdx + NumVal <= NewVRegs.size() + SmallVectorImpl::const_iterator + getNewVRegsEnd(unsigned StartIdx, unsigned NumVal) const; + SmallVectorImpl::iterator getNewVRegsEnd(unsigned StartIdx, + unsigned NumVal); + + public: + /// Create an OperandsMapper that will hold the information to apply \p + /// InstrMapping to \p MI. + /// \pre InstrMapping.verify(MI) + OperandsMapper(MachineInstr &MI, const InstructionMapping &InstrMapping, + MachineRegisterInfo &MRI); + + /// \name Getters. + /// @{ + /// The MachineInstr being remapped. + MachineInstr &getMI() const { return MI; } + + /// The final mapping of the instruction. + const InstructionMapping &getInstrMapping() const { return InstrMapping; } + + /// The MachineRegisterInfo we used to realize the mapping. + MachineRegisterInfo &getMRI() const { return MRI; } + /// @} + + /// Create as many new virtual registers as needed for the mapping of the \p + /// OpIdx-th operand. + /// The number of registers is determined by the number of breakdowns for the + /// related operand in the instruction mapping. + /// The type of the new registers is a plain scalar of the right size. + /// The proper type is expected to be set when the mapping is applied to + /// the instruction(s) that realize the mapping. + /// + /// \pre getMI().getOperand(OpIdx).isReg() + /// + /// \post All the partial mappings of the \p OpIdx-th operand have been + /// assigned a new virtual register. + void createVRegs(unsigned OpIdx); + + /// Set the virtual register of the \p PartialMapIdx-th partial mapping of + /// the OpIdx-th operand to \p NewVReg. + /// + /// \pre getMI().getOperand(OpIdx).isReg() + /// \pre getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() > + /// PartialMapIdx + /// \pre NewVReg != 0 + /// + /// \post the \p PartialMapIdx-th register of the value mapping of the \p + /// OpIdx-th operand has been set. + void setVRegs(unsigned OpIdx, unsigned PartialMapIdx, Register NewVReg); + + /// Get all the virtual registers required to map the \p OpIdx-th operand of + /// the instruction. + /// + /// This returns an empty range when createVRegs or setVRegs has not been + /// called. + /// The iterator may be invalidated by a call to setVRegs or createVRegs. + /// + /// When \p ForDebug is true, we will not check that the list of new virtual + /// registers does not contain uninitialized values. + /// + /// \pre getMI().getOperand(OpIdx).isReg() + /// \pre ForDebug || All partial mappings have been assigned a register + iterator_range::const_iterator> + getVRegs(unsigned OpIdx, bool ForDebug = false) const; + + /// Print this operands mapper on dbgs() stream. + void dump() const; + + /// Print this operands mapper on \p OS stream. + void print(raw_ostream &OS, bool ForDebug = false) const; + }; + +protected: + /// Hold the set of supported register banks. + RegisterBank **RegBanks; + + /// Total number of register banks. + unsigned NumRegBanks; + + /// Keep dynamically allocated PartialMapping in a separate map. + /// This shouldn't be needed when everything gets TableGen'ed. + mutable DenseMap> + MapOfPartialMappings;
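// [Editorial sketch, not part of the patch] The OperandsMapper workflow
// declared above: create one virtual register per partial mapping of an
// operand, then read them back. Operand index 0 is illustrative.
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
static void materializeOperand(
    llvm::MachineInstr &MI,
    const llvm::RegisterBankInfo::InstructionMapping &IM,
    llvm::MachineRegisterInfo &MRI) {
  llvm::RegisterBankInfo::OperandsMapper OpdMapper(MI, IM, MRI);
  OpdMapper.createVRegs(/*OpIdx=*/0); // plain scalars of the mapped sizes
  for (llvm::Register R : OpdMapper.getVRegs(/*OpIdx=*/0))
    (void)R; // one register per PartialMapping of operand 0
}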
+ /// Keep dynamically allocated ValueMapping in a separate map. + /// This shouldn't be needed when everything gets TableGen'ed. + mutable DenseMap> + MapOfValueMappings; + + /// Keep dynamically allocated array of ValueMapping in a separate map. + /// This shouldn't be needed when everything gets TableGen'ed. + mutable DenseMap> + MapOfOperandsMappings; + + /// Keep dynamically allocated InstructionMapping in a separate map. + /// This shouldn't be needed when everything gets TableGen'ed. + mutable DenseMap> + MapOfInstructionMappings; + + /// Getting the minimal register class of a physreg is expensive. + /// Cache this information as we get it. + mutable DenseMap PhysRegMinimalRCs; + + /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks + /// RegisterBank instances. + RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); + + /// This constructor is meaningless. + /// It just provides a default constructor that can be used at link time + /// when GlobalISel is not built. + /// That way, targets can still inherit from this class without doing + /// crazy gymnastics to avoid link-time failures. + /// \note That works because the constructor is inlined. + RegisterBankInfo() { + llvm_unreachable("This constructor should not be executed"); + } + + /// Get the register bank identified by \p ID. + RegisterBank &getRegBank(unsigned ID) { + assert(ID < getNumRegBanks() && "Accessing an unknown register bank"); + return *RegBanks[ID]; + } + + /// Get the MinimalPhysRegClass for Reg. + /// \pre Reg is a physical register. + const TargetRegisterClass & + getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI) const; + + /// Try to get the mapping of \p MI. + /// See getInstrMapping for more details on what a mapping represents. + /// + /// Unlike getInstrMapping the returned InstructionMapping may be invalid + /// (isValid() == false). + /// This means that the target independent code is not smart enough + /// to get the mapping of \p MI and thus, the target has to provide the + /// information for \p MI. + /// + /// This implementation is able to get the mapping of: + /// - Target specific instructions by looking at the encoding constraints. + /// - Any instruction if all the register operands have already been assigned + /// a register, a register class, or a register bank. + /// - Copies and phis if at least one of the operands has been assigned a + /// register, a register class, or a register bank. + /// In other words, this method will likely fail to find a mapping for + /// any generic opcode that has not been lowered by target specific code. + const InstructionMapping &getInstrMappingImpl(const MachineInstr &MI) const; + + /// Get the uniquely generated PartialMapping for the + /// given arguments. + const PartialMapping &getPartialMapping(unsigned StartIdx, unsigned Length, + const RegisterBank &RegBank) const; + + /// \name Methods to get a uniquely generated ValueMapping. + /// @{ + + /// The most common ValueMapping consists of a single PartialMapping. + /// Feature a method for that. + const ValueMapping &getValueMapping(unsigned StartIdx, unsigned Length, + const RegisterBank &RegBank) const; + + /// Get the ValueMapping for the given arguments. + const ValueMapping &getValueMapping(const PartialMapping *BreakDown, + unsigned NumBreakDowns) const; + /// @}
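// [Editorial sketch, not part of the patch] Combining the uniquing helpers
// of this section from a hypothetical target subclass: all three operands
// of a 32-bit instruction placed in one bank. getOperandsMapping and
// getInstructionMapping are declared just below; the ID and cost values
// are illustrative.
#include "llvm/CodeGen/RegisterBankInfo.h"
class ExampleTargetRBI : public llvm::RegisterBankInfo {
  const InstructionMapping &mapAllTo(const llvm::RegisterBank &Bank) const {
    const ValueMapping &VM =
        getValueMapping(/*StartIdx=*/0, /*Length=*/32, Bank);
    return getInstructionMapping(/*ID=*/1, /*Cost=*/1,
                                 getOperandsMapping({&VM, &VM, &VM}),
                                 /*NumOperands=*/3);
  }
};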
+ /// \name Methods to get a uniquely generated array of ValueMapping. + /// @{ + + /// Get the uniquely generated array of ValueMapping for the + /// elements between \p Begin and \p End. + /// + /// Elements that are nullptr will be replaced by + /// invalid ValueMapping (ValueMapping::isValid == false). + /// + /// \pre The pointers to ValueMapping between \p Begin and \p End + /// must uniquely identify a ValueMapping. Otherwise, there is no + /// guarantee that the returned instance will be unique, i.e., another + /// OperandsMapping could have the same content. + template + const ValueMapping *getOperandsMapping(Iterator Begin, Iterator End) const; + + /// Get the uniquely generated array of ValueMapping for the + /// elements of \p OpdsMapping. + /// + /// Elements of \p OpdsMapping that are nullptr will be replaced by + /// invalid ValueMapping (ValueMapping::isValid == false). + const ValueMapping *getOperandsMapping( + const SmallVectorImpl &OpdsMapping) const; + + /// Get the uniquely generated array of ValueMapping for the + /// given arguments. + /// + /// Arguments that are nullptr will be replaced by invalid + /// ValueMapping (ValueMapping::isValid == false). + const ValueMapping *getOperandsMapping( + std::initializer_list OpdsMapping) const; + /// @} + + /// \name Methods to get a uniquely generated InstructionMapping. + /// @{ + +private: + /// Method to get a uniquely generated InstructionMapping. + const InstructionMapping & + getInstructionMappingImpl(bool IsInvalid, unsigned ID = InvalidMappingID, + unsigned Cost = 0, + const ValueMapping *OperandsMapping = nullptr, + unsigned NumOperands = 0) const; + +public: + /// Method to get a uniquely generated InstructionMapping. + const InstructionMapping & + getInstructionMapping(unsigned ID, unsigned Cost, + const ValueMapping *OperandsMapping, + unsigned NumOperands) const { + return getInstructionMappingImpl(/*IsInvalid*/ false, ID, Cost, + OperandsMapping, NumOperands); + } + + /// Method to get a uniquely generated invalid InstructionMapping. + const InstructionMapping &getInvalidInstructionMapping() const { + return getInstructionMappingImpl(/*IsInvalid*/ true); + } + /// @} + + /// Get the register bank for the \p OpIdx-th operand of \p MI from + /// the encoding constraints, if any. + /// + /// \return A register bank that covers the register class of the + /// related encoding constraints or nullptr if \p MI did not provide + /// enough information to deduce it. + const RegisterBank * + getRegBankFromConstraints(const MachineInstr &MI, unsigned OpIdx, + const TargetInstrInfo &TII, + const MachineRegisterInfo &MRI) const; + + /// Helper method to apply something that is like the default mapping. + /// Basically, that means that \p OpdMapper.getMI() is left untouched + /// aside from the reassignment of the register operands that have been + /// remapped. + /// + /// The types of all the new registers that have been created by the + /// mapper are properly remapped to the types of the original registers + /// they replace. In other words, the semantics of the instruction do + /// not change, only the register banks. + /// + /// If the mapping of one of the operands spans several registers, this + /// method will abort as this is not like a default mapping anymore. + /// + /// \pre For OpIdx in {0..\p OpdMapper.getMI().getNumOperands()) + /// the range OpdMapper.getVRegs(OpIdx) is empty or of size 1. + static void applyDefaultMapping(const OperandsMapper &OpdMapper); + + /// See ::applyMapping. + virtual void applyMappingImpl(const OperandsMapper &OpdMapper) const { + llvm_unreachable("The target has to implement that part"); + } + +public: + virtual ~RegisterBankInfo() = default;
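// [Editorial sketch, not part of the patch] The select-then-apply sequence
// these hooks feed into; getInstrMapping and applyMapping are declared
// further down in this class.
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
static void assignBanks(const llvm::RegisterBankInfo &RBI,
                        llvm::MachineInstr &MI,
                        llvm::MachineRegisterInfo &MRI) {
  const llvm::RegisterBankInfo::InstructionMapping &IM =
      RBI.getInstrMapping(MI);
  if (!IM.isValid())
    return; // the target could not map this instruction
  llvm::RegisterBankInfo::OperandsMapper OpdMapper(MI, IM, MRI);
  RBI.applyMapping(OpdMapper); // DefaultMappingID -> applyDefaultMapping
}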
+ + /// Get the register bank identified by \p ID. + const RegisterBank &getRegBank(unsigned ID) const { + return const_cast(this)->getRegBank(ID); + } + + /// Get the register bank of \p Reg. + /// If Reg has not been assigned a register, a register class, + /// or a register bank, then this returns nullptr. + /// + /// \pre Reg != 0 (NoRegister) + const RegisterBank *getRegBank(Register Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Get the total number of register banks. + unsigned getNumRegBanks() const { return NumRegBanks; } + + /// Get a register bank that covers \p RC. + /// + /// \pre \p RC is a user-defined register class (as opposed to one + /// generated by TableGen). + /// + /// \note The mapping RC -> RegBank could be built while adding the + /// coverage for the register banks. However, we do not do it, because, + /// at least for now, we only need this information for register classes + /// that are used in the description of instructions. In other words, + /// there are just a handful of them and we do not want to waste space. + /// + /// \todo This should be TableGen'ed. + virtual const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { + llvm_unreachable("The target must override this method"); + } + + /// Get the cost of a copy from \p B to \p A, or put differently, + /// get the cost of A = COPY B. Since register banks may cover + /// different sizes, \p Size specifies the size in bits + /// that will be copied around. + /// + /// \note Since this is a copy, both registers have the same size. + virtual unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const { + // Optimistically assume that copies are coalesced. I.e., when + // they are on the same bank, they are free. + // Otherwise assume a non-zero cost of 1. The targets are supposed + // to override that properly anyway if they care. + return &A != &B; + } + + /// \returns true if emitting a copy from \p Src to \p Dst is impossible. + bool cannotCopy(const RegisterBank &Dst, const RegisterBank &Src, + unsigned Size) const { + return copyCost(Dst, Src, Size) == std::numeric_limits::max(); + } + + /// Get the cost of using \p ValMapping to decompose a register. This is + /// similar to ::copyCost, except for cases where multiple copy-like + /// operations need to be inserted. If the register is used as a source + /// operand and already has a bank assigned, \p CurBank is non-null. + virtual unsigned + getBreakDownCost(const ValueMapping &ValMapping, + const RegisterBank *CurBank = nullptr) const { + return std::numeric_limits::max(); + } + + /// Constrain the (possibly generic) virtual register \p Reg to \p RC. + /// + /// \pre \p Reg is a virtual register that either has a bank or a class. + /// \returns The constrained register class, or nullptr if there is none. + /// \note This is a generic variant of MachineRegisterInfo::constrainRegClass + /// \note Use MachineRegisterInfo::constrainRegAttrs instead for any non-isel + /// purpose, including non-select passes of GlobalISel + static const TargetRegisterClass * + constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, + MachineRegisterInfo &MRI); + + /// Identifier used when the related instruction mapping instance + /// is generated by target independent code. + /// Make sure not to use that identifier to avoid possible collision.
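// [Editorial sketch, not part of the patch] Overriding the copy-cost hook
// above: the default returns &A != &B, and a target can refine it. The
// cost values are illustrative.
#include "llvm/CodeGen/RegisterBankInfo.h"
class ExampleTargetRBIWithCosts : public llvm::RegisterBankInfo {
  unsigned copyCost(const llvm::RegisterBank &A, const llvm::RegisterBank &B,
                    unsigned Size) const override {
    if (&A == &B)
      return 0; // same bank: expect the copy to be coalesced away
    return Size <= 32 ? 2 : 4; // cross-bank copies get costlier with width
  }
};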
+ static const unsigned DefaultMappingID; + + /// Identifier used when the related instruction mapping instance + /// is generated by the default constructor. + /// Make sure not to use that identifier. + static const unsigned InvalidMappingID; + + /// Get the mapping of the different operands of \p MI + /// on the register bank. + /// This mapping should be the direct translation of \p MI. + /// In other words, when \p MI is mapped with the returned mapping, + /// only the register banks of the operands of \p MI need to be updated. + /// In particular, neither the opcode nor the type of \p MI needs to be + /// updated for this direct mapping. + /// + /// The target independent implementation gives a mapping based on + /// the register classes for the target specific opcode. + /// It uses the ID RegisterBankInfo::DefaultMappingID for that mapping. + /// Make sure you do not use that ID for the alternative mapping + /// for MI. See getInstrAlternativeMappings for the alternative + /// mappings. + /// + /// For instance, if \p MI is a vector add, the mapping should + /// not be a scalarization of the add. + /// + /// \post returnedVal.verify(MI). + /// + /// \note If returnedVal does not verify MI, this would probably mean + /// that the target does not support that instruction. + virtual const InstructionMapping & + getInstrMapping(const MachineInstr &MI) const; + + /// Get the alternative mappings for \p MI. + /// Alternative in the sense different from getInstrMapping. + virtual InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const; + + /// Get the possible mapping for \p MI. + /// A mapping defines where the different operands may live and at what cost. + /// For instance, let us consider: + /// v0(16) = G_ADD <2 x i8> v1, v2 + /// The possible mapping could be: + /// + /// {/*ID*/VectorAdd, /*Cost*/1, /*v0*/{(0xFFFF, VPR)}, /*v1*/{(0xFFFF, VPR)}, + /// /*v2*/{(0xFFFF, VPR)}} + /// {/*ID*/ScalarAddx2, /*Cost*/2, /*v0*/{(0x00FF, GPR),(0xFF00, GPR)}, + /// /*v1*/{(0x00FF, GPR),(0xFF00, GPR)}, + /// /*v2*/{(0x00FF, GPR),(0xFF00, GPR)}} + /// + /// \note The first alternative of the returned mapping should be the + /// direct translation of \p MI current form. + /// + /// \post !returnedVal.empty(). + InstructionMappings getInstrPossibleMappings(const MachineInstr &MI) const; + + /// Apply \p OpdMapper.getInstrMapping() to \p OpdMapper.getMI(). + /// After this call \p OpdMapper.getMI() may not be valid anymore. + /// \p OpdMapper.getInstrMapping().getID() carries the information of + /// what has been chosen to map \p OpdMapper.getMI(). This ID is set + /// by the various getInstrXXXMapping method. + /// + /// Therefore, getting the mapping and applying it should be kept in + /// sync. + void applyMapping(const OperandsMapper &OpdMapper) const { + // The only mapping we know how to handle is the default mapping. + if (OpdMapper.getInstrMapping().getID() == DefaultMappingID) + return applyDefaultMapping(OpdMapper); + // For other mapping, the target needs to do the right thing. + // If that means calling applyDefaultMapping, fine, but this + // must be explicitly stated. + applyMappingImpl(OpdMapper); + } + + /// Get the size in bits of \p Reg. + /// Utility method to get the size of any registers. Unlike + /// MachineRegisterInfo::getSize, the register does not need to be a + /// virtual register. + /// + /// \pre \p Reg != 0 (NoRegister). 
+ unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Check that the information held by this instance makes sense for the + /// given \p TRI. + /// + /// \note This method does not check anything when assertions are disabled. + /// + /// \return True if the check was successful. + bool verify(const TargetRegisterInfo &TRI) const; +}; + +inline raw_ostream & +operator<<(raw_ostream &OS, + const RegisterBankInfo::PartialMapping &PartMapping) { + PartMapping.print(OS); + return OS; +} + +inline raw_ostream & +operator<<(raw_ostream &OS, const RegisterBankInfo::ValueMapping &ValMapping) { + ValMapping.print(OS); + return OS; +} + +inline raw_ostream & +operator<<(raw_ostream &OS, + const RegisterBankInfo::InstructionMapping &InstrMapping) { + InstrMapping.print(OS); + return OS; +} + +inline raw_ostream & +operator<<(raw_ostream &OS, const RegisterBankInfo::OperandsMapper &OpdMapper) { + OpdMapper.print(OS, /*ForDebug*/ false); + return OS; +} + +/// Hashing function for PartialMapping. +/// It is required for the hashing of ValueMapping. +hash_code hash_value(const RegisterBankInfo::PartialMapping &PartMapping); + +} // end namespace llvm + +#endif // LLVM_CODEGEN_REGISTERBANKINFO_H diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h index d82f1db60d8b..39c72a42c433 100644 --- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h @@ -20,8 +20,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include +#include "llvm/MC/MCRegister.h" #include #include @@ -61,6 +60,10 @@ class RegisterClassInfo { // Map register alias to the callee saved Register. SmallVector CalleeSavedAliases; + // Indicate whether a specified callee-saved register should be in the + // allocation order exactly as written in the tablegen descriptions, or be + // listed later. + BitVector IgnoreCSRForAllocOrder; + // Reserved registers in the current MF. BitVector Reserved; diff --git a/llvm/include/llvm/CodeGen/RegisterPressure.h b/llvm/include/llvm/CodeGen/RegisterPressure.h index 1deeb4d41511..c40c0eec80ec 100644 --- a/llvm/include/llvm/CodeGen/RegisterPressure.h +++ b/llvm/include/llvm/CodeGen/RegisterPressure.h @@ -22,7 +22,6 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include -#include #include #include #include diff --git a/llvm/include/llvm/CodeGen/RegisterScavenging.h b/llvm/include/llvm/CodeGen/RegisterScavenging.h index 218e05f6eb6b..1f0cd273bf61 100644 --- a/llvm/include/llvm/CodeGen/RegisterScavenging.h +++ b/llvm/include/llvm/CodeGen/RegisterScavenging.h @@ -70,6 +70,26 @@ class RegScavenger { public: RegScavenger() = default; + /// Record that \p Reg is in use at scavenging index \p FI. This is for + /// targets which need to directly manage the spilling process, and need to + /// update the scavenger's internal state. It is expected that this will be + /// called a second time with \p Restore set to a non-null value, so that the + /// externally inserted restore instruction resets the scavenged slot + /// liveness when encountered.
+ void assignRegToScavengingIndex(int FI, Register Reg, + MachineInstr *Restore = nullptr) { + for (ScavengedInfo &Slot : Scavenged) { + if (Slot.FrameIndex == FI) { + assert(!Slot.Reg || Slot.Reg == Reg); + Slot.Reg = Reg; + Slot.Restore = Restore; + return; + } + } + + llvm_unreachable("did not find scavenging index"); + } + /// Start tracking liveness from the begin of basic block \p MBB. void enterBasicBlock(MachineBasicBlock &MBB); diff --git a/llvm/include/llvm/CodeGen/RegisterUsageInfo.h b/llvm/include/llvm/CodeGen/RegisterUsageInfo.h index bf347c0753e5..8b406a275025 100644 --- a/llvm/include/llvm/CodeGen/RegisterUsageInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterUsageInfo.h @@ -20,9 +20,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include #include diff --git a/llvm/include/llvm/CodeGen/ReplaceWithVeclib.h b/llvm/include/llvm/CodeGen/ReplaceWithVeclib.h index 7c0ebe7191e4..c71aca0c992b 100644 --- a/llvm/include/llvm/CodeGen/ReplaceWithVeclib.h +++ b/llvm/include/llvm/CodeGen/ReplaceWithVeclib.h @@ -1,4 +1,4 @@ -//===- ReplaceWithVeclib.h - Replace vector instrinsics with veclib calls -===// +//===- ReplaceWithVeclib.h - Replace vector intrinsics with veclib calls --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -17,8 +17,10 @@ #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" namespace llvm { +class Function; struct ReplaceWithVeclib : public PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h index af8c0cd8756e..f1c377f76d02 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAG.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h @@ -16,7 +16,6 @@ #define LLVM_CODEGEN_SCHEDULEDAG_H #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" @@ -31,6 +30,7 @@ namespace llvm { +template struct GraphTraits; template class GraphWriter; class LLVMTargetMachine; class MachineFunction; diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 50b186de2b05..fb3900b4a9c1 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -16,10 +16,10 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseMultiSet.h" #include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/identity.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ScheduleDAG.h" diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index e31719bcff0b..bcbd7ebcc0c9 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -20,7 +20,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/ilist.h" @@ -33,17 +32,13 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include 
"llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" -#include "llvm/Support/AtomicOrdering.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/RecyclingAllocator.h" -#include #include #include #include @@ -55,6 +50,15 @@ namespace llvm { +class DIExpression; +class DILabel; +class DIVariable; +class Function; +class Pass; +class Type; +template struct GraphTraits; +template class SmallSetVector; +template struct FoldingSetTrait; class AAResults; class BlockAddress; class BlockFrequencyInfo; @@ -276,8 +280,16 @@ class SelectionDAG { DenseMap SDCallSiteDbgInfo; + /// PersistentId counter to be used when inserting the next + /// SDNode to this SelectionDAG. We do not place that under + /// `#if LLVM_ENABLE_ABI_BREAKING_CHECKS` intentionally because + /// it adds unneeded complexity without noticeable + /// benefits (see discussion with @thakis in D120714). uint16_t NextPersistentId = 0; + /// Are instruction referencing variable locations desired for this function? + bool UseInstrRefDebugInfo = false; + public: /// Clients of various APIs that cause global effects on /// the DAG can optionally implement this interface. This allows the clients @@ -440,6 +452,9 @@ public: const DataLayout &getDataLayout() const { return MF->getDataLayout(); } const TargetMachine &getTarget() const { return TM; } const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); } + template const STC &getSubtarget() const { + return MF->getSubtarget(); + } const TargetLowering &getTargetLoweringInfo() const { return *TLI; } const TargetLibraryInfo &getLibInfo() const { return *LibInfo; } const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; } @@ -467,7 +482,7 @@ public: void viewGraph(const std::string &Title); void viewGraph(); -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS std::map NodeGraphAttrs; #endif @@ -893,6 +908,11 @@ public: /// Create a logical NOT operation as (XOR Val, BooleanOne). SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT); + /// Create a vector-predicated logical NOT operation as (VP_XOR Val, + /// BooleanOne, Mask, EVL). + SDValue getVPLogicalNOT(const SDLoc &DL, SDValue Val, SDValue Mask, + SDValue EVL, EVT VT); + /// Returns sum of the base pointer and offset. /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. 
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, @@ -1032,25 +1052,26 @@ public: const AAMDNodes &AAInfo = AAMDNodes()); SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVol, bool isTailCall, + SDValue Size, Align Alignment, bool isVol, + bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, const AAMDNodes &AAInfo = AAMDNodes()); SDValue getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, - unsigned DstAlign, SDValue Src, unsigned SrcAlign, - SDValue Size, Type *SizeTy, unsigned ElemSz, - bool isTailCall, MachinePointerInfo DstPtrInfo, + SDValue Src, SDValue Size, Type *SizeTy, + unsigned ElemSz, bool isTailCall, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo); SDValue getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, - unsigned DstAlign, SDValue Src, unsigned SrcAlign, - SDValue Size, Type *SizeTy, unsigned ElemSz, - bool isTailCall, MachinePointerInfo DstPtrInfo, + SDValue Src, SDValue Size, Type *SizeTy, + unsigned ElemSz, bool isTailCall, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo); SDValue getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, - unsigned DstAlign, SDValue Value, SDValue Size, - Type *SizeTy, unsigned ElemSz, bool isTailCall, + SDValue Value, SDValue Size, Type *SizeTy, + unsigned ElemSz, bool isTailCall, MachinePointerInfo DstPtrInfo); /// Helper function to make it easier to build SetCC's if you just have an @@ -1070,14 +1091,24 @@ public: return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond)); } + /// Helper function to make it easier to build VP_SETCCs if you just have an + /// ISD::CondCode instead of an SDValue. + SDValue getSetCCVP(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, + ISD::CondCode Cond, SDValue Mask, SDValue EVL) { + assert(LHS.getValueType().isVector() && RHS.getValueType().isVector() && + "Cannot compare scalars"); + assert(Cond != ISD::SETCC_INVALID && + "Cannot create a setCC of an invalid node."); + return getNode(ISD::VP_SETCC, DL, VT, LHS, RHS, getCondCode(Cond), Mask, + EVL); + } + /// Helper function to make it easier to build Select's if you just have /// operands and don't want to check for vector. SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS) { - assert(LHS.getValueType() == RHS.getValueType() && + assert(LHS.getValueType() == VT && RHS.getValueType() == VT && "Cannot use select on differing types"); - assert(VT.isVector() == LHS.getValueType().isVector() && - "Cannot mix vectors and scalars"); auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT; return getNode(Opcode, DL, VT, Cond, LHS, RHS); } @@ -1149,7 +1180,7 @@ public: uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) { // Ensure that codegen never sees alignment 0 return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo, - Alignment.getValueOr(getEVTAlign(MemVT)), Flags, + Alignment.value_or(getEVTAlign(MemVT)), Flags, Size, AAInfo); } @@ -1230,7 +1261,7 @@ public: const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { // Ensures that codegen never sees a None Alignment. return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, PtrInfo, MemVT, - Alignment.getValueOr(getEVTAlign(MemVT)), MMOFlags, AAInfo, + Alignment.value_or(getEVTAlign(MemVT)), MMOFlags, AAInfo, Ranges); } /// FIXME: Remove once transition to Align is over. 
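// [Editorial sketch, not part of the patch] The getValueOr -> value_or
// renames in the hunks above track the std::optional spelling; the idiom
// itself is unchanged:
#include "llvm/Support/Alignment.h"
static llvm::Align alignOrDefault(llvm::MaybeAlign A, llvm::Align Default) {
  return A.value_or(Default); // previously A.getValueOr(Default)
}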
@@ -1264,7 +1295,7 @@ public: MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getStore(Chain, dl, Val, Ptr, PtrInfo, - Alignment.getValueOr(getEVTAlign(Val.getValueType())), + Alignment.value_or(getEVTAlign(Val.getValueType())), MMOFlags, AAInfo); } /// FIXME: Remove once transition to Align is over. @@ -1290,7 +1321,7 @@ public: MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getTruncStore(Chain, dl, Val, Ptr, PtrInfo, SVT, - Alignment.getValueOr(getEVTAlign(SVT)), MMOFlags, + Alignment.value_or(getEVTAlign(SVT)), MMOFlags, AAInfo); } /// FIXME: Remove once transition to Align is over. @@ -1323,7 +1354,7 @@ public: const MDNode *Ranges = nullptr, bool IsExpanding = false) { // Ensures that codegen never sees a None Alignment. return getLoadVP(AM, ExtType, VT, dl, Chain, Ptr, Offset, Mask, EVL, - PtrInfo, MemVT, Alignment.getValueOr(getEVTAlign(MemVT)), + PtrInfo, MemVT, Alignment.value_or(getEVTAlign(MemVT)), MMOFlags, AAInfo, Ranges, IsExpanding); } SDValue getLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, @@ -1364,6 +1395,77 @@ public: SDValue getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); + SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, + SDValue Offset, SDValue Stride, SDValue Mask, + SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, + Align Alignment, MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo, + const MDNode *Ranges = nullptr, + bool IsExpanding = false); + inline SDValue getStridedLoadVP( + ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, + SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, + SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, + MaybeAlign Alignment = MaybeAlign(), + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr, + bool IsExpanding = false) { + // Ensures that codegen never sees a None Alignment. 
+ return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, + Mask, EVL, PtrInfo, MemVT, + Alignment.value_or(getEVTAlign(MemVT)), MMOFlags, + AAInfo, Ranges, IsExpanding); + } + SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, + SDValue Offset, SDValue Stride, SDValue Mask, + SDValue EVL, EVT MemVT, MachineMemOperand *MMO, + bool IsExpanding = false); + SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, + SDValue Stride, SDValue Mask, SDValue EVL, + MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo, + const MDNode *Ranges = nullptr, + bool IsExpanding = false); + SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, + SDValue Stride, SDValue Mask, SDValue EVL, + MachineMemOperand *MMO, bool IsExpanding = false); + SDValue + getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, + SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, + SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, + MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo, bool IsExpanding = false); + SDValue getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, + SDValue Chain, SDValue Ptr, SDValue Stride, + SDValue Mask, SDValue EVL, EVT MemVT, + MachineMemOperand *MMO, bool IsExpanding = false); + SDValue getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM); + SDValue getStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, + SDValue Ptr, SDValue Offset, SDValue Stride, + SDValue Mask, SDValue EVL, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + bool IsTruncating = false, + bool IsCompressing = false); + SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, + SDValue Ptr, SDValue Stride, SDValue Mask, + SDValue EVL, MachinePointerInfo PtrInfo, + EVT SVT, Align Alignment, + MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo, + bool IsCompressing = false); + SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, + SDValue Ptr, SDValue Stride, SDValue Mask, + SDValue EVL, EVT SVT, MachineMemOperand *MMO, + bool IsCompressing = false); + SDValue getIndexedStridedStoreVP(SDValue OrigStore, const SDLoc &DL, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM); + SDValue getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType); @@ -1412,6 +1514,11 @@ public: /// Return an AssertAlignSDNode. SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A); + /// Swap N1 and N2 if Opcode is a commutative binary opcode + /// and the canonical form expects the opposite order. + void canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1, + SDValue &N2) const; + /// Return the specified value casted to /// the target's desired shift amount type. SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); @@ -1702,6 +1809,16 @@ public: /// function mirrors \c llvm::salvageDebugInfo. void salvageDebugInfo(SDNode &N); + /// Signal whether instruction referencing variable locations are desired for + /// this function's debug-info. 
+ void useInstrRefDebugInfo(bool Flag) { + UseInstrRefDebugInfo = Flag; + } + + bool getUseInstrRefDebugInfo() const { + return UseInstrRefDebugInfo; + } + void dump() const; /// In most cases this function returns the ABI alignment for a given type, @@ -1745,16 +1862,6 @@ public: /// simplify nodes with multiple uses more aggressively.) SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits); - /// See if the specified operand can be simplified with the knowledge that - /// only the bits specified by DemandedBits are used in the elements specified - /// by DemandedElts. If so, return the simpler operand, otherwise return a - /// null SDValue. - /// - /// (This exists alongside SimplifyDemandedBits because GetDemandedBits can - /// simplify nodes with multiple uses more aggressively.) - SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits, - const APInt &DemandedElts); - /// Return true if the sign bit of Op is known to be zero. /// We use this predicate to simplify operations downstream. bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const; @@ -1771,6 +1878,11 @@ public: bool MaskedValueIsZero(SDValue Op, const APInt &Mask, const APInt &DemandedElts, unsigned Depth = 0) const; + /// Return true if 'Op' is known to be zero in DemandedElts. We + /// use this predicate to simplify operations downstream. + bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, + unsigned Depth = 0) const; + /// Return true if '(Op & Mask) == Mask'. /// Op and Mask are known to be the same type. bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask, @@ -2020,11 +2132,6 @@ public: /// Compute the default alignment value for the given type. Align getEVTAlign(EVT MemoryVT) const; - /// Compute the default alignment value for the given type. - /// FIXME: Remove once transition to Align is over. - inline unsigned getEVTAlignment(EVT MemoryVT) const { - return getEVTAlign(MemoryVT).value(); - } /// Test whether the given value is a constant int or similar node. SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const; @@ -2039,39 +2146,34 @@ public: isConstantFPBuildVectorOrConstantFP(N); } - void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo) { - SDCallSiteDbgInfo[CallNode].CSInfo = std::move(CallInfo); + /// Set CallSiteInfo to be associated with Node. + void addCallSiteInfo(const SDNode *Node, CallSiteInfoImpl &&CallInfo) { + SDCallSiteDbgInfo[Node].CSInfo = std::move(CallInfo); } - - CallSiteInfo getSDCallSiteInfo(const SDNode *CallNode) { - auto I = SDCallSiteDbgInfo.find(CallNode); - if (I != SDCallSiteDbgInfo.end()) - return std::move(I->second).CSInfo; - return CallSiteInfo(); + /// Return CallSiteInfo associated with Node, or a default if none exists. + CallSiteInfo getCallSiteInfo(const SDNode *Node) { + auto I = SDCallSiteDbgInfo.find(Node); + return I != SDCallSiteDbgInfo.end() ? std::move(I->second).CSInfo + : CallSiteInfo(); } - + /// Set HeapAllocSite to be associated with Node. void addHeapAllocSite(const SDNode *Node, MDNode *MD) { SDCallSiteDbgInfo[Node].HeapAllocSite = MD; } - - /// Return the HeapAllocSite type associated with the SDNode, if it exists. - MDNode *getHeapAllocSite(const SDNode *Node) { - auto It = SDCallSiteDbgInfo.find(Node); - if (It == SDCallSiteDbgInfo.end()) - return nullptr; - return It->second.HeapAllocSite; + /// Return HeapAllocSite associated with Node, or nullptr if none exists. + MDNode *getHeapAllocSite(const SDNode *Node) const { + auto I = SDCallSiteDbgInfo.find(Node); + return I != SDCallSiteDbgInfo.end() ? 
I->second.HeapAllocSite : nullptr; } - + /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true. void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) { if (NoMerge) SDCallSiteDbgInfo[Node].NoMerge = NoMerge; } - - bool getNoMergeSiteInfo(const SDNode *Node) { + /// Return NoMerge info associated with Node. + bool getNoMergeSiteInfo(const SDNode *Node) const { auto I = SDCallSiteDbgInfo.find(Node); - if (I == SDCallSiteDbgInfo.end()) - return false; - return I->second.NoMerge; + return I != SDCallSiteDbgInfo.end() ? I->second.NoMerge : false; } /// Return the current function's default denormal handling kind for the given diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h index 0f3af915da64..e23eebec81db 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h @@ -49,7 +49,7 @@ public: SDValue getBase() const { return Base; } SDValue getIndex() { return Index; } SDValue getIndex() const { return Index; } - bool hasValidOffset() const { return Offset.hasValue(); } + bool hasValidOffset() const { return Offset.has_value(); } int64_t getOffset() const { return *Offset; } // Returns true if `Other` and `*this` are both some offset from the same base diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 9cea197724cc..35fb0bc80593 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -16,12 +16,13 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" #include namespace llvm { class AAResults; +class TargetInstrInfo; +class TargetMachine; class SelectionDAGBuilder; class SDValue; class MachineRegisterInfo; @@ -53,6 +54,7 @@ public: const TargetLowering *TLI; bool FastISelFailed; SmallPtrSet ElidedArgCopyInstrs; + bool UseInstrRefDebugInfo = false; /// Current optimization remark emitter. /// Used to report things like combines and FastISel failures. diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 04c6b50197d4..5974f13a296b 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -508,7 +508,7 @@ BEGIN_TWO_BYTE_PACK() class LSBaseSDNodeBitfields { friend class LSBaseSDNode; - friend class VPLoadStoreSDNode; + friend class VPBaseLoadStoreSDNode; friend class MaskedLoadStoreSDNode; friend class MaskedGatherScatterSDNode; friend class VPGatherScatterSDNode; @@ -529,6 +529,7 @@ BEGIN_TWO_BYTE_PACK() class LoadSDNodeBitfields { friend class LoadSDNode; friend class VPLoadSDNode; + friend class VPStridedLoadSDNode; friend class MaskedLoadSDNode; friend class MaskedGatherSDNode; friend class VPGatherSDNode; @@ -542,6 +543,7 @@ BEGIN_TWO_BYTE_PACK() class StoreSDNodeBitfields { friend class StoreSDNode; friend class VPStoreSDNode; + friend class VPStridedStoreSDNode; friend class MaskedStoreSDNode; friend class MaskedScatterSDNode; friend class VPScatterSDNode; @@ -613,8 +615,10 @@ private: SDNodeFlags Flags; public: - /// Unique and persistent id per SDNode in the DAG. - /// Used for debug printing. + /// Unique and persistent id per SDNode in the DAG. Used for debug printing. 
+ /// We do not place that under `#if LLVM_ENABLE_ABI_BREAKING_CHECKS` + /// intentionally because it adds unneeded complexity without noticeable + /// benefits (see discussion with @thakis in D120714). uint16_t PersistentId; //===--------------------------------------------------------------------===// @@ -1191,12 +1195,13 @@ inline void SDValue::dumpr(const SelectionDAG *G) const { inline void SDUse::set(const SDValue &V) { if (Val.getNode()) removeFromList(); Val = V; - if (V.getNode()) V.getNode()->addUse(*this); + if (V.getNode()) + V->addUse(*this); } inline void SDUse::setInitial(const SDValue &V) { Val = V; - V.getNode()->addUse(*this); + V->addUse(*this); } inline void SDUse::setNode(SDNode *N) { @@ -1364,6 +1369,7 @@ public: case ISD::VP_STORE: case ISD::MSTORE: case ISD::VP_SCATTER: + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: return getOperand(2); case ISD::MGATHER: case ISD::MSCATTER: @@ -1407,6 +1413,8 @@ public: case ISD::VP_STORE: case ISD::VP_GATHER: case ISD::VP_SCATTER: + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: return true; default: return N->isMemIntrinsic() || N->isTargetMemoryOpcode(); @@ -1661,6 +1669,9 @@ bool isAllOnesConstant(SDValue V); /// Returns true if \p V is a constant integer one. bool isOneConstant(SDValue V); +/// Returns true if \p V is a constant min signed integer value. +bool isMinSignedConstant(SDValue V); + /// Return the non-bitcasted source operand of \p V if it exists. /// If \p V is not a bitcasted value, it is returned as-is. SDValue peekThroughBitcasts(SDValue V); @@ -1677,6 +1688,11 @@ SDValue peekThroughExtractSubvectors(SDValue V); /// constant is canonicalized to be operand 1. bool isBitwiseNot(SDValue V, bool AllowUndefs = false); +/// If \p V is a bitwise not, returns the inverted operand. Otherwise returns +/// an empty SDValue. Only bits set in \p Mask are required to be inverted, +/// other bits may be arbitrary. +SDValue getBitwiseNotOperand(SDValue V, SDValue Mask, bool AllowUndefs); + /// Returns the SDNode if it is a constant splat BuildVector or constant int. ConstantSDNode *isConstOrConstSplat(SDValue N, bool AllowUndefs = false, bool AllowTruncation = false); @@ -2353,34 +2369,64 @@ public: } }; -/// This base class is used to represent VP_LOAD and VP_STORE nodes -class VPLoadStoreSDNode : public MemSDNode { +/// This base class is used to represent VP_LOAD, VP_STORE, +/// EXPERIMENTAL_VP_STRIDED_LOAD and EXPERIMENTAL_VP_STRIDED_STORE nodes +class VPBaseLoadStoreSDNode : public MemSDNode { public: friend class SelectionDAG; - VPLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, const DebugLoc &dl, - SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT, - MachineMemOperand *MMO) - : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + VPBaseLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, + const DebugLoc &DL, SDVTList VTs, + ISD::MemIndexedMode AM, EVT MemVT, + MachineMemOperand *MMO) + : MemSDNode(NodeTy, Order, DL, VTs, MemVT, MMO) { LSBaseSDNodeBits.AddressingMode = AM; assert(getAddressingMode() == AM && "Value truncated"); } - // VPLoadSDNode (Chain, Ptr, Offset, Mask, EVL) - // VPStoreSDNode (Chain, Data, Ptr, Offset, Mask, EVL) + // VPStridedStoreSDNode (Chain, Data, Ptr, Offset, Stride, Mask, EVL) + // VPStoreSDNode (Chain, Data, Ptr, Offset, Mask, EVL) + // VPStridedLoadSDNode (Chain, Ptr, Offset, Stride, Mask, EVL) + // VPLoadSDNode (Chain, Ptr, Offset, Mask, EVL) // Mask is a vector of i1 elements; // the type of EVL is TLI.getVPExplicitVectorLengthTy(). 
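For orientation, the operand layouts in the comment above are what drive the index arithmetic in the accessors that follow. A minimal illustrative helper, not part of the patch (the function name is invented), that recovers the stride operand of the two new strided opcodes:

// Sketch only: the strided opcodes carry their stride immediately after the
// offset operand; plain VP_LOAD/VP_STORE have no stride operand at all.
static SDValue getVPStride(const SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
    return N->getOperand(3); // (Chain, Ptr, Offset, Stride, Mask, EVL)
  case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
    return N->getOperand(4); // (Chain, Data, Ptr, Offset, Stride, Mask, EVL)
  default:
    return SDValue();        // not a strided access
  }
}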
const SDValue &getOffset() const { - return getOperand(getOpcode() == ISD::VP_LOAD ? 2 : 3); + return getOperand((getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD || + getOpcode() == ISD::VP_LOAD) + ? 2 + : 3); } const SDValue &getBasePtr() const { - return getOperand(getOpcode() == ISD::VP_LOAD ? 1 : 2); + return getOperand((getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD || + getOpcode() == ISD::VP_LOAD) + ? 1 + : 2); } const SDValue &getMask() const { - return getOperand(getOpcode() == ISD::VP_LOAD ? 3 : 4); + switch (getOpcode()) { + default: + llvm_unreachable("Invalid opcode"); + case ISD::VP_LOAD: + return getOperand(3); + case ISD::VP_STORE: + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + return getOperand(4); + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + return getOperand(5); + } } const SDValue &getVectorLength() const { - return getOperand(getOpcode() == ISD::VP_LOAD ? 4 : 5); + switch (getOpcode()) { + default: + llvm_unreachable("Invalid opcode"); + case ISD::VP_LOAD: + return getOperand(4); + case ISD::VP_STORE: + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + return getOperand(5); + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + return getOperand(6); + } } /// Return the addressing mode for this load or store: @@ -2396,19 +2442,21 @@ public: bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; } static bool classof(const SDNode *N) { - return N->getOpcode() == ISD::VP_LOAD || N->getOpcode() == ISD::VP_STORE; + return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD || + N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE || + N->getOpcode() == ISD::VP_LOAD || N->getOpcode() == ISD::VP_STORE; } }; /// This class is used to represent a VP_LOAD node -class VPLoadSDNode : public VPLoadStoreSDNode { +class VPLoadSDNode : public VPBaseLoadStoreSDNode { public: friend class SelectionDAG; VPLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, ISD::MemIndexedMode AM, ISD::LoadExtType ETy, bool isExpanding, EVT MemVT, MachineMemOperand *MMO) - : VPLoadStoreSDNode(ISD::VP_LOAD, Order, dl, VTs, AM, MemVT, MMO) { + : VPBaseLoadStoreSDNode(ISD::VP_LOAD, Order, dl, VTs, AM, MemVT, MMO) { LoadSDNodeBits.ExtTy = ETy; LoadSDNodeBits.IsExpanding = isExpanding; } @@ -2428,15 +2476,45 @@ public: bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; } }; +/// This class is used to represent an EXPERIMENTAL_VP_STRIDED_LOAD node. 
+class VPStridedLoadSDNode : public VPBaseLoadStoreSDNode { +public: + friend class SelectionDAG; + + VPStridedLoadSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs, + ISD::MemIndexedMode AM, ISD::LoadExtType ETy, + bool IsExpanding, EVT MemVT, MachineMemOperand *MMO) + : VPBaseLoadStoreSDNode(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, Order, DL, VTs, + AM, MemVT, MMO) { + LoadSDNodeBits.ExtTy = ETy; + LoadSDNodeBits.IsExpanding = IsExpanding; + } + + ISD::LoadExtType getExtensionType() const { + return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy); + } + + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getOffset() const { return getOperand(2); } + const SDValue &getStride() const { return getOperand(3); } + const SDValue &getMask() const { return getOperand(4); } + const SDValue &getVectorLength() const { return getOperand(5); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD; + } + bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; } +}; + /// This class is used to represent a VP_STORE node -class VPStoreSDNode : public VPLoadStoreSDNode { +class VPStoreSDNode : public VPBaseLoadStoreSDNode { public: friend class SelectionDAG; VPStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing, EVT MemVT, MachineMemOperand *MMO) - : VPLoadStoreSDNode(ISD::VP_STORE, Order, dl, VTs, AM, MemVT, MMO) { + : VPBaseLoadStoreSDNode(ISD::VP_STORE, Order, dl, VTs, AM, MemVT, MMO) { StoreSDNodeBits.IsTruncating = isTrunc; StoreSDNodeBits.IsCompressing = isCompressing; } @@ -2463,6 +2541,43 @@ public: } }; +/// This class is used to represent an EXPERIMENTAL_VP_STRIDED_STORE node. +class VPStridedStoreSDNode : public VPBaseLoadStoreSDNode { +public: + friend class SelectionDAG; + + VPStridedStoreSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs, + ISD::MemIndexedMode AM, bool IsTrunc, bool IsCompressing, + EVT MemVT, MachineMemOperand *MMO) + : VPBaseLoadStoreSDNode(ISD::EXPERIMENTAL_VP_STRIDED_STORE, Order, DL, + VTs, AM, MemVT, MMO) { + StoreSDNodeBits.IsTruncating = IsTrunc; + StoreSDNodeBits.IsCompressing = IsCompressing; + } + + /// Return true if this is a truncating store. + /// For integers this is the same as doing a TRUNCATE and storing the result. + /// For floats, it is the same as doing an FP_ROUND and storing the result. + bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; } + + /// Returns true if the op does a compression to the vector before storing. + /// The node contiguously stores the active elements (integers or floats) + /// in src (those with their respective bit set in writemask k) to unaligned + /// memory at base_addr.
+ bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; } + + const SDValue &getValue() const { return getOperand(1); } + const SDValue &getBasePtr() const { return getOperand(2); } + const SDValue &getOffset() const { return getOperand(3); } + const SDValue &getStride() const { return getOperand(4); } + const SDValue &getMask() const { return getOperand(5); } + const SDValue &getVectorLength() const { return getOperand(6); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE; + } +}; + /// This base class is used to represent MLOAD and MSTORE nodes class MaskedLoadStoreSDNode : public MemSDNode { public: @@ -2588,13 +2703,9 @@ public: return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode); } bool isIndexScaled() const { - return (getIndexType() == ISD::SIGNED_SCALED) || - (getIndexType() == ISD::UNSIGNED_SCALED); - } - bool isIndexSigned() const { - return (getIndexType() == ISD::SIGNED_SCALED) || - (getIndexType() == ISD::SIGNED_UNSCALED); + return !cast<ConstantSDNode>(getScale())->isOne(); } + bool isIndexSigned() const { return isIndexTypeSigned(getIndexType()); } // In the both nodes address is Op1, mask is Op2: // VPGatherSDNode (Chain, base, index, scale, mask, vlen) @@ -2675,17 +2786,10 @@ public: ISD::MemIndexType getIndexType() const { return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode); } - void setIndexType(ISD::MemIndexType IndexType) { - LSBaseSDNodeBits.AddressingMode = IndexType; - } bool isIndexScaled() const { - return (getIndexType() == ISD::SIGNED_SCALED) || - (getIndexType() == ISD::UNSIGNED_SCALED); - } - bool isIndexSigned() const { - return (getIndexType() == ISD::SIGNED_SCALED) || - (getIndexType() == ISD::SIGNED_UNSCALED); + return !cast<ConstantSDNode>(getScale())->isOne(); } + bool isIndexSigned() const { return isIndexTypeSigned(getIndexType()); } // In the both nodes address is Op1, mask is Op2: // MaskedGatherSDNode (Chain, passthru, mask, base, index, scale) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h index 722c3275fd06..e7d608969124 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -76,11 +76,13 @@ public: /// that don't fit the target's parameters for simple stores and can be more /// efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different - /// lowering strategy should be used. + /// lowering strategy should be used. Note that if AlwaysInline is true the + /// function has to return a valid SDValue.
virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, SDValue Op3, Align Alignment, bool isVolatile, + bool AlwaysInline, MachinePointerInfo DstPtrInfo) const { return SDValue(); } diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h index e8d618a24f9b..942a47c6cc7d 100644 --- a/llvm/include/llvm/CodeGen/SlotIndexes.h +++ b/llvm/include/llvm/CodeGen/SlotIndexes.h @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" -#include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include #include diff --git a/llvm/include/llvm/CodeGen/StackMaps.h b/llvm/include/llvm/CodeGen/StackMaps.h index 928d7cc6cc04..01cc9bc37931 100644 --- a/llvm/include/llvm/CodeGen/StackMaps.h +++ b/llvm/include/llvm/CodeGen/StackMaps.h @@ -13,7 +13,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/CallingConv.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include #include @@ -23,6 +22,7 @@ namespace llvm { class AsmPrinter; +class MCSymbol; class MCExpr; class MCStreamer; class raw_ostream; diff --git a/llvm/include/llvm/CodeGen/StackProtector.h b/llvm/include/llvm/CodeGen/StackProtector.h index 57456b3f6c16..b96c0c74fabc 100644 --- a/llvm/include/llvm/CodeGen/StackProtector.h +++ b/llvm/include/llvm/CodeGen/StackProtector.h @@ -20,7 +20,6 @@ #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" namespace llvm { diff --git a/llvm/include/llvm/CodeGen/SwiftErrorValueTracking.h b/llvm/include/llvm/CodeGen/SwiftErrorValueTracking.h index 08ab2abbdd5b..a374736347f6 100644 --- a/llvm/include/llvm/CodeGen/SwiftErrorValueTracking.h +++ b/llvm/include/llvm/CodeGen/SwiftErrorValueTracking.h @@ -20,8 +20,6 @@ #include "llvm/CodeGen/Register.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DebugLoc.h" -#include -#include #include diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h index daaa27f72d52..94e8092319d7 100644 --- a/llvm/include/llvm/CodeGen/TailDuplicator.h +++ b/llvm/include/llvm/CodeGen/TailDuplicator.h @@ -16,15 +16,16 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include #include namespace llvm { +template class SmallSetVector; +template class function_ref; +class MBFIWrapper; class MachineBasicBlock; class MachineBranchProbabilityInfo; class MachineFunction; diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h index 62365330379d..1333f2d98973 100644 --- a/llvm/include/llvm/CodeGen/TargetCallingConv.h +++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h @@ -46,7 +46,8 @@ namespace ISD { unsigned IsHvaStart : 1; ///< HVA structure start unsigned IsSecArgPass : 1; ///< Second argument unsigned MemAlign : 4; ///< Log 2 of alignment when arg is passed in memory - ///< (including byval/byref) + ///< (including byval/byref). The max alignment is + ///< verified in IR verification. 
unsigned OrigAlign : 5; ///< Log 2 of original alignment unsigned IsInConsecutiveRegsLast : 1; unsigned IsInConsecutiveRegs : 1; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index f2ca1590fc39..fbce5d7a9102 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -213,12 +213,24 @@ public: virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const = 0; + /// emitZeroCallUsedRegs - Zeros out call used registers. + virtual void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const {} + /// With basic block sections, emit callee saved frame moves for basic blocks /// that are in a different section. virtual void emitCalleeSavedFrameMovesFullCFA(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {} + /// Returns true if we may need to fix the unwind information for the + /// function. + virtual bool enableCFIFixup(MachineFunction &MF) const; + + /// Emit CFI instructions that recreate the state of the unwind information + /// upon function entry. + virtual void resetCFIToInitialState(MachineBasicBlock &MBB) const {} + /// Replace a StackProbe stub (if any) with the actual probe code inline virtual void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {} diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 411811d08c18..f9183e0a9c66 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -382,6 +382,17 @@ public: /// to which instructions should be sunk. virtual bool shouldSink(const MachineInstr &MI) const { return true; } + /// Return false if the instruction should not be hoisted by MachineLICM. + /// + /// MachineLICM determines on its own whether the instruction is safe to + /// hoist; this gives the target a hook to extend this assessment and prevent + /// an instruction being hoisted from a given loop for target specific + /// reasons. + virtual bool shouldHoist(const MachineInstr &MI, + const MachineLoop *FromLoop) const { + return true; + } + /// Re-issue the specified 'original' instruction at the /// specific location targeting a new destination register. /// The register in Orig->getOperand(0).getReg() will be substituted by @@ -723,12 +734,16 @@ public: virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const = 0; /// Create a condition to determine if the trip count of the loop is greater - /// than TC. + /// than TC, where TC is always one more than for the previous prologue or + /// 0 if this is being called for the outermost prologue. /// /// If the trip count is statically known to be greater than TC, return /// true. If the trip count is statically known to be not greater than TC, /// return false. Otherwise return nullopt and fill out Cond with the test /// condition. + /// + /// Note: This hook is guaranteed to be called from the innermost to the + /// outermost prologue of the loop being software pipelined. virtual Optional<bool> createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond) = 0; @@ -1268,13 +1283,6 @@ protected: } public: - /// getAddressSpaceForPseudoSourceKind - Given the kind of memory - /// (e.g. stack) the target returns the corresponding address space.
- virtual unsigned - getAddressSpaceForPseudoSourceKind(unsigned Kind) const { - return 0; - } - /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. If this is /// possible, returns true as well as the new instructions by reference. @@ -1942,7 +1950,7 @@ public: virtual MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const { + outliner::Candidate &C) const { llvm_unreachable( "Target didn't implement TargetInstrInfo::insertOutlinedCall!"); } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3861648a5feb..98b9a416ea59 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -25,7 +25,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLArrayExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" @@ -248,12 +248,21 @@ public: /// w.r.t. what they should expand to. enum class AtomicExpansionKind { None, // Don't expand the instruction. + CastToInteger, // Cast the atomic instruction to another type, e.g. from + // floating-point to integer type. LLSC, // Expand the instruction into loadlinked/storeconditional; used // by ARM/AArch64. LLOnly, // Expand the (load) instruction into just a load-linked, which has // greater atomic guarantees than a normal load. CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. - MaskedIntrinsic, // Use a target-specific intrinsic for the LL/SC loop. + MaskedIntrinsic, // Use a target-specific intrinsic for the LL/SC loop. + BitTestIntrinsic, // Use a target-specific intrinsic for special bit + // operations; used by X86. + Expand, // Generic expansion in terms of other atomic operations. + + // Rewrite to a non-atomic form for use in a known non-preemptible + // environment. + NotAtomic }; /// Enum that specifies when a multiplication should be expanded. @@ -1071,6 +1080,11 @@ public: return false; } + /// How to legalize this custom operation? + virtual LegalizeAction getCustomOperationAction(SDNode &Op) const { + return Legal; + } + /// Return how this operation should be treated: either it is legal, needs to /// be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. @@ -1210,6 +1224,10 @@ public: uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const; + /// Returns preferred type for switch condition. 
+ virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const; + /// Return true if lowering to a bit test is suitable for a set of case /// clusters which contains \p NumDests unique destinations, \p Low and /// \p High as its lowest and highest case values, and expects \p NumCmps /// case value comparisons. @@ -1372,7 +1390,9 @@ public: // Returns true if VT is a legal index type for masked gathers/scatters // on this target - virtual bool shouldRemoveExtendFromGSIndex(EVT VT) const { return false; } + virtual bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const { + return false; + } /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom @@ -1871,7 +1891,7 @@ public: /// minimum size the object must be to be aligned and PrefAlign is set to the /// preferred alignment. virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/, - unsigned & /*PrefAlign*/) const { + Align & /*PrefAlign*/) const { return false; } @@ -1946,6 +1966,14 @@ public: llvm_unreachable("Masked atomicrmw expansion unimplemented on this target"); } + /// Perform a bit test atomicrmw using a target-specific intrinsic. This + /// represents the combined bit test intrinsic which will be lowered at a late + /// stage by the backend. + virtual void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { + llvm_unreachable( + "Bit test atomicrmw expansion unimplemented on this target"); + } + /// Perform a masked cmpxchg using a target-specific intrinsic. This + /// represents the core LL/SC loop which will be lowered at a late stage by /// the backend. @@ -2005,12 +2033,6 @@ public: // be unnecessarily held, except if clrex, inserted by this hook, is executed. virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const {} - /// Returns true if the given (atomic) store should be expanded by the - /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input. - virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { - return false; - } - /// Returns true if arguments should be sign-extended in lib calls. virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { return IsSigned; } @@ -2027,6 +2049,30 @@ public: return AtomicExpansionKind::None; } + /// Returns how the given (atomic) load should be cast by the IR-level + /// AtomicExpand pass. + virtual AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const { + if (LI->getType()->isFloatingPointTy()) + return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; + } + + /// Returns how the given (atomic) store should be expanded by the IR-level + /// AtomicExpand pass. For instance AtomicExpansionKind::Expand will try + /// to use an atomicrmw xchg. + virtual AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return AtomicExpansionKind::None; + } + + /// Returns how the given (atomic) store should be cast by the IR-level + /// AtomicExpand pass. For instance AtomicExpansionKind::CastToInteger + /// will try to cast the operands to integer values. + virtual AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const { + if (SI->getValueOperand()->getType()->isFloatingPointTy()) + return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; + } + /// Returns how the given atomic cmpxchg should be expanded by the IR-level /// AtomicExpand pass.
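The shouldCast*/shouldExpand* hooks above let a backend steer the AtomicExpand pass per instruction. A hypothetical override (the FictionalTargetLowering class is invented, not part of this patch) that keeps the default float-to-integer casting but rewrites every atomic store:

// Sketch only: ask AtomicExpand to lower every atomic store through an
// "atomicrmw xchg" by returning the new Expand kind, as the comment above
// describes; atomic loads keep the behaviour inherited from TargetLowering.
TargetLowering::AtomicExpansionKind
FictionalTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return AtomicExpansionKind::Expand;
}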
virtual AtomicExpansionKind @@ -2041,6 +2087,18 @@ public: AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } + /// Returns how the given atomicrmw should be cast by the IR-level + /// AtomicExpand pass. + virtual AtomicExpansionKind + shouldCastAtomicRMWIInIR(AtomicRMWInst *RMWI) const { + if (RMWI->getOperation() == AtomicRMWInst::Xchg && + (RMWI->getValOperand()->getType()->isFloatingPointTy() || + RMWI->getValOperand()->getType()->isPointerTy())) + return AtomicExpansionKind::CastToInteger; + + return AtomicExpansionKind::None; + } + /// On some platforms, an AtomicRMW that never actually modifies the value /// (such as fetch_add of 0) can be turned into a fence followed by an /// atomic load. This may sound useless, but it makes it possible for the @@ -2123,8 +2181,8 @@ public: /// about some cases, a default true can be returned to let the DAGCombiner /// decide. /// AddNode is (add x, c1), and ConstNode is c2. - virtual bool isMulAddWithConstProfitable(const SDValue &AddNode, - const SDValue &ConstNode) const { + virtual bool isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const { return true; } @@ -2138,6 +2196,18 @@ public: return false; } + /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic. + /// If not optimizing for size, expanding @llvm.powi.* intrinsics is always + /// considered beneficial. + /// If optimizing for size, expansion is only considered beneficial for up to + /// 5 multiplies and a divide (if the exponent is negative). + bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const { + if (Exponent < 0) + Exponent = -Exponent; + return !OptForSize || + (countPopulation((unsigned int)Exponent) + Log2_32(Exponent) < 7); + } + //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by // the derived class constructor to configure this object for the target. @@ -2232,6 +2302,16 @@ protected: assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!"); OpActions[(unsigned)VT.SimpleTy][Op] = Action; } + void setOperationAction(ArrayRef<unsigned> Ops, MVT VT, + LegalizeAction Action) { + for (auto Op : Ops) + setOperationAction(Op, VT, Action); + } + void setOperationAction(ArrayRef<unsigned> Ops, ArrayRef<MVT> VTs, + LegalizeAction Action) { + for (auto VT : VTs) + setOperationAction(Ops, VT, Action); + } /// Indicate that the specified load with extension does not work with the /// specified type and indicate what to do about it. @@ -2244,6 +2324,16 @@ protected: LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] &= ~((uint16_t)0xF << Shift); LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] |= (uint16_t)Action << Shift; } + void setLoadExtAction(ArrayRef<unsigned> ExtTypes, MVT ValVT, MVT MemVT, + LegalizeAction Action) { + for (auto ExtType : ExtTypes) + setLoadExtAction(ExtType, ValVT, MemVT, Action); + } + void setLoadExtAction(ArrayRef<unsigned> ExtTypes, MVT ValVT, + ArrayRef<MVT> MemVTs, LegalizeAction Action) { + for (auto MemVT : MemVTs) + setLoadExtAction(ExtTypes, ValVT, MemVT, Action); + } /// Indicate that the specified truncating store does not work with the /// specified type and indicate what to do about it.
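To make the isBeneficialToExpandPowI cost test above concrete: square-and-multiply needs Log2_32(Exponent) squarings plus countPopulation(Exponent) - 1 extra multiplies, so requiring countPopulation + Log2_32 < 7 caps the size-optimized expansion at five multiplies (plus the divide for a negative exponent). For example, an exponent of 8 costs 1 + 3 = 4 and is expanded even at -Os, while 15 costs 4 + 3 = 7 and stays a libcall.

The ArrayRef overloads of setOperationAction and setLoadExtAction added above exist to batch legalization boilerplate. A hypothetical target constructor (the opcode and type choices are placeholders, not taken from any in-tree target) might now write:

// Sketch only: one call per group instead of one call per opcode/type pair.
setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL}, MVT::v4i32, Legal);
setOperationAction({ISD::FADD, ISD::FSUB}, {MVT::f32, MVT::f64}, Expand);
setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i32, MVT::i16, Legal);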
@@ -2257,8 +2347,16 @@ protected: /// /// NOTE: All indexed mode loads are initialized to Expand in /// TargetLowering.cpp - void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { - setIndexedModeAction(IdxMode, VT, IMAB_Load, Action); + void setIndexedLoadAction(ArrayRef IdxModes, MVT VT, + LegalizeAction Action) { + for (auto IdxMode : IdxModes) + setIndexedModeAction(IdxMode, VT, IMAB_Load, Action); + } + + void setIndexedLoadAction(ArrayRef IdxModes, ArrayRef VTs, + LegalizeAction Action) { + for (auto VT : VTs) + setIndexedLoadAction(IdxModes, VT, Action); } /// Indicate that the specified indexed store does or does not work with the @@ -2266,8 +2364,16 @@ protected: /// /// NOTE: All indexed mode stores are initialized to Expand in /// TargetLowering.cpp - void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { - setIndexedModeAction(IdxMode, VT, IMAB_Store, Action); + void setIndexedStoreAction(ArrayRef IdxModes, MVT VT, + LegalizeAction Action) { + for (auto IdxMode : IdxModes) + setIndexedModeAction(IdxMode, VT, IMAB_Store, Action); + } + + void setIndexedStoreAction(ArrayRef IdxModes, ArrayRef VTs, + LegalizeAction Action) { + for (auto VT : VTs) + setIndexedStoreAction(IdxModes, VT, Action); } /// Indicate that the specified indexed masked load does or does not work with @@ -2292,17 +2398,24 @@ protected: /// Indicate that the specified condition code is or isn't supported on the /// target and indicate what to do about it. - void setCondCodeAction(ISD::CondCode CC, MVT VT, + void setCondCodeAction(ArrayRef CCs, MVT VT, LegalizeAction Action) { - assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) && - "Table isn't big enough!"); - assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); - /// The lower 3 bits of the SimpleTy index into Nth 4bit set from the 32-bit - /// value and the upper 29 bits index into the second dimension of the array - /// to select what 32-bit value to use. - uint32_t Shift = 4 * (VT.SimpleTy & 0x7); - CondCodeActions[CC][VT.SimpleTy >> 3] &= ~((uint32_t)0xF << Shift); - CondCodeActions[CC][VT.SimpleTy >> 3] |= (uint32_t)Action << Shift; + for (auto CC : CCs) { + assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) && + "Table isn't big enough!"); + assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); + /// The lower 3 bits of the SimpleTy index into Nth 4bit set from the + /// 32-bit value and the upper 29 bits index into the second dimension of + /// the array to select what 32-bit value to use. + uint32_t Shift = 4 * (VT.SimpleTy & 0x7); + CondCodeActions[CC][VT.SimpleTy >> 3] &= ~((uint32_t)0xF << Shift); + CondCodeActions[CC][VT.SimpleTy >> 3] |= (uint32_t)Action << Shift; + } + } + void setCondCodeAction(ArrayRef CCs, ArrayRef VTs, + LegalizeAction Action) { + for (auto VT : VTs) + setCondCodeAction(CCs, VT, Action); } /// If Opc/OrigVT is specified as being promoted, the promotion code defaults @@ -2323,9 +2436,11 @@ protected: /// Targets should invoke this method for each target independent node that /// they want to provide a custom DAG combiner for by implementing the /// PerformDAGCombine virtual method. 
- void setTargetDAGCombine(ISD::NodeType NT) { - assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); - TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7); + void setTargetDAGCombine(ArrayRef NTs) { + for (auto NT : NTs) { + assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); + TargetDAGCombineArray[NT >> 3] |= 1 << (NT & 7); + } } /// Set the target's minimum function alignment. @@ -2510,6 +2625,10 @@ public: case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: return true; default: return false; } @@ -2653,6 +2772,10 @@ public: return false; } + /// Return true if this constant should be sign extended when promoting to + /// a larger type. + virtual bool signExtendConstant(const ConstantInt *C) const { return false; } + /// Return true if sinking I's operands to the same basic block as I is /// profitable, e.g. because the operands can be folded into a target /// instruction during instruction selection. After calling the function @@ -2851,6 +2974,14 @@ public: return false; } + /// Return true if pulling a binary operation into a select with an identity + /// constant is profitable. This is the inverse of an IR transform. + /// Example: X + (Cond ? Y : 0) --> Cond ? (X + Y) : X + virtual bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, + EVT VT) const { + return false; + } + /// Return true if it is beneficial to convert a load of a constant to /// just the constant itself. /// On some targets it might be more efficient to use a combination of @@ -2940,6 +3071,10 @@ public: void setLibcallName(RTLIB::Libcall Call, const char *Name) { LibcallRoutineNames[Call] = Name; } + void setLibcallName(ArrayRef Calls, const char *Name) { + for (auto Call : Calls) + setLibcallName(Call, Name); + } /// Get the libcall routine name for the specified libcall. const char *getLibcallName(RTLIB::Libcall Call) const { @@ -3421,11 +3556,13 @@ public: /// Determines the optimal series of memory ops to replace the memset / memcpy. /// Return true if the number of memory ops is below the threshold (Limit). + /// Note that this is always the case when Limit is ~0. /// It returns the types of the sequence of memory ops to perform /// memset / memcpy by reference. - bool findOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, - const MemOp &Op, unsigned DstAS, unsigned SrcAS, - const AttributeList &FuncAttributes) const; + virtual bool + findOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, + const MemOp &Op, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes) const; /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the @@ -3534,9 +3671,16 @@ public: /// Helper wrapper around SimplifyDemandedVectorElts. /// Adds Op back to the worklist upon success. bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, - APInt &KnownUndef, APInt &KnownZero, DAGCombinerInfo &DCI) const; + /// Return true if the target supports simplifying demanded vector elements by + /// converting them to undefs. + virtual bool + shouldSimplifyDemandedVectorElts(SDValue Op, + const TargetLoweringOpt &TLO) const { + return true; + } + /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. 
The DemandedElts /// argument allows us to only collect the known bits that are shared by the @@ -3653,6 +3797,12 @@ public: APInt &UndefElts, unsigned Depth = 0) const; + /// Returns true if the given Opc is considered a canonical constant for the + /// target, which should not be transformed back into a BUILD_VECTOR. + virtual bool isTargetCanonicalConstantNode(SDValue Op) const { + return Op.getOpcode() == ISD::SPLAT_VECTOR; + } + struct DAGCombinerInfo { void *DC; // The DAG Combiner object. CombineLevel Level; @@ -3805,7 +3955,7 @@ public: if (Neg && Cost == NegatibleCost::Cheaper) return Neg; // Remove the new created node to avoid the side effect to the DAG. - if (Neg && Neg.getNode()->use_empty()) + if (Neg && Neg->use_empty()) DAG.RemoveDeadNode(Neg.getNode()); return SDValue(); } @@ -4270,6 +4420,7 @@ public: C_Register, // Constraint represents specific register(s). C_RegisterClass, // Constraint represents any of register(s) in class. C_Memory, // Memory constraint. + C_Address, // Address constraint. C_Immediate, // Requires an immediate. C_Other, // Something else. C_Unknown // Unsupported constraint. @@ -4374,6 +4525,8 @@ public: return InlineAsm::Constraint_o; if (ConstraintCode == "X") return InlineAsm::Constraint_X; + if (ConstraintCode == "p") + return InlineAsm::Constraint_p; return InlineAsm::Constraint_Unknown; } @@ -4410,6 +4563,14 @@ public: SelectionDAG &DAG, SmallVectorImpl &Created) const; + /// Targets may override this function to provide custom SREM lowering for + /// power-of-2 denominators. If the target returns an empty SDValue, LLVM + /// assumes SREM is expensive and replaces it with a series of other integer + /// operations. + virtual SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const; + /// Indicate whether this target prefers to combine FDIVs with the same /// divisor. If the transform should never be done, return zero. If the /// transform should be done, return the minimum number of divisor uses @@ -4442,6 +4603,13 @@ public: return SDValue(); } + /// Try to convert the fminnum/fmaxnum to a compare/select sequence. This is + /// required for correctness since InstCombine might have canonicalized a + /// fcmp+select sequence to a FMINNUM/FMAXNUM intrinsic. If we were to fall + /// through to the default expansion/soften to libcall, we might introduce a + /// link-time dependency on libm into a file that originally did not have one. + SDValue createSelectForFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const; + /// Return a reciprocal estimate value for the input operand. /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or /// 'Enabled' as set by a potential default override attribute. @@ -4554,6 +4722,16 @@ public: /// \returns The expansion result SDValue expandFP_TO_INT_SAT(SDNode *N, SelectionDAG &DAG) const; + /// Expand check for floating point class. + /// \param ResultVT The type of intrinsic call result. + /// \param Op The tested value. + /// \param Test The test to perform. + /// \param Flags The optimization flags. + /// \returns The expansion result or SDValue() if it fails. + SDValue expandIS_FPCLASS(EVT ResultVT, SDValue Op, unsigned Test, + SDNodeFlags Flags, const SDLoc &DL, + SelectionDAG &DAG) const; + /// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes, /// vector nodes can only succeed if all operations are legal/custom. /// \param N Node to expand @@ -4693,28 +4871,32 @@ public: /// method accepts vectors as its arguments. 
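Among the additions above, the new C_Address constraint class gives the "p" inline-asm constraint its own bucket instead of folding it into the memory constraints, as the getConstraintType mapping shows. A hedged example of the kind of source that exercises it (the x86 mnemonic and the %a operand-print modifier are illustrative, compiler- and target-dependent):

// Sketch only: "p" constrains the operand to be a valid memory address.
void touch(const void *Ptr) {
  asm volatile("prefetcht0 %a0" : : "p"(Ptr));
}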
SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const; - /// Legalize a SETCC with given LHS and RHS and condition code CC on the - /// current target. + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC + /// on the current target. A VP_SETCC will additionally be given a Mask + /// and/or EVL not equal to SDValue(). /// /// If the SETCC has been legalized using AND / OR, then the legalized node /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert - /// will be set to false. + /// will be set to false. This will also hold if the VP_SETCC has been + /// legalized using VP_AND / VP_OR. /// - /// If the SETCC has been legalized by using getSetCCSwappedOperands(), - /// then the values of LHS and RHS will be swapped, CC will be set to the - /// new condition, and NeedInvert will be set to false. + /// If the SETCC / VP_SETCC has been legalized by using + /// getSetCCSwappedOperands(), then the values of LHS and RHS will be + /// swapped, CC will be set to the new condition, and NeedInvert will be set + /// to false. /// - /// If the SETCC has been legalized using the inverse condcode, then LHS and - /// RHS will be unchanged, CC will set to the inverted condcode, and - /// NeedInvert will be set to true. The caller must invert the result of the - /// SETCC with SelectionDAG::getLogicalNOT() or take equivalent action to swap - /// the effect of a true/false result. + /// If the SETCC / VP_SETCC has been legalized using the inverse condcode, + /// then LHS and RHS will be unchanged, CC will be set to the inverted condcode, + /// and NeedInvert will be set to true. The caller must invert the result of + /// the SETCC with SelectionDAG::getLogicalNOT() or take equivalent action to + /// swap the effect of a true/false result. /// - /// \returns true if the SetCC has been legalized, false if it hasn't. + /// \returns true if the SETCC / VP_SETCC has been legalized, false if it + /// hasn't. bool LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, - SDValue &RHS, SDValue &CC, bool &NeedInvert, - const SDLoc &dl, SDValue &Chain, - bool IsSignaling = false) const; + SDValue &RHS, SDValue &CC, SDValue Mask, + SDValue EVL, bool &NeedInvert, const SDLoc &dl, + SDValue &Chain, bool IsSignaling = false) const; //===--------------------------------------------------------------------===// // Instruction Emitting Hooks @@ -4766,10 +4948,6 @@ public: // combiner can fold the new nodes. SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; - /// Give targets the chance to reduce the number of distinct addresing modes.
- ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType, - EVT MemVT, SDValue Offsets) const; - private: SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, DAGCombinerInfo &DCI) const; diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 2c8b17807f7c..08267d70906a 100644 --- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { @@ -118,6 +119,9 @@ public: void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + MCSection *getStaticDtorSection(unsigned Priority, + const MCSymbol *KeySym) const override; + /// Emit the module flags that specify the garbage collection information. void emitModuleMetadata(MCStreamer &Streamer, Module &M) const override; @@ -282,6 +286,13 @@ public: MCSymbol *getFunctionEntryPointSymbol(const GlobalValue *Func, const TargetMachine &TM) const override; + + /// For functions, this will return the LSDA section. If option + /// -ffunction-sections is on, this will return a unique csect with the + /// function name appended to .gcc_except_table as a suffix of the LSDA + /// section name. + MCSection *getSectionForLSDA(const Function &F, const MCSymbol &FnSym, + const TargetMachine &TM) const override; }; class TargetLoweringObjectFileGOFF : public TargetLoweringObjectFile { diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h index 9b13b61fc9de..8d7086d02c8a 100644 --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -345,6 +345,9 @@ protected: // Helper to verify the analysis is really immutable. void setOpt(bool &Opt, bool Val); + /// Return true if register allocator is specified by -regalloc=override. + bool isCustomizedRegAlloc(); + /// Methods with trivial inline returns are convenient points in the common /// codegen pass pipeline where targets may insert passes. Methods with /// out-of-line standard implementations are major CodeGen stages called by diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index c3b842052ef5..04369a5bfe0d 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -29,7 +29,6 @@ #include "llvm/Support/Printable.h" #include #include -#include namespace llvm { @@ -56,6 +55,8 @@ public: const LaneBitmask LaneMask; /// Classes with a higher priority value are assigned first by register /// allocators using a greedy heuristic. The value is in the range [0,63]. + /// Values >= 32 should be used with care since they may overlap with other + /// fields in the allocator's priority heuristics. const uint8_t AllocationPriority; /// Configurable target specific flags. const uint8_t TSFlags; @@ -415,19 +416,11 @@ public: /// Returns true if the two registers are equal or alias each other. /// The registers may be virtual registers. - bool regsOverlap(Register regA, Register regB) const { - if (regA == regB) return true; - if (!regA.isPhysical() || !regB.isPhysical()) - return false; - - // Regunits are numerically ordered. Find a common unit. 
- MCRegUnitIterator RUA(regA.asMCReg(), this); - MCRegUnitIterator RUB(regB.asMCReg(), this); - do { - if (*RUA == *RUB) return true; - if (*RUA < *RUB) ++RUA; - else ++RUB; - } while (RUA.isValid() && RUB.isValid()); + bool regsOverlap(Register RegA, Register RegB) const { + if (RegA == RegB) + return true; + if (RegA.isPhysical() && RegB.isPhysical()) + return MCRegisterInfo::regsOverlap(RegA.asMCReg(), RegB.asMCReg()); return false; } @@ -567,6 +560,24 @@ public: virtual bool isCalleeSavedPhysReg(MCRegister PhysReg, const MachineFunction &MF) const; + /// Returns true if PhysReg can be used as an argument to a function. + virtual bool isArgumentRegister(const MachineFunction &MF, + MCRegister PhysReg) const { + return false; + } + + /// Returns true if PhysReg is a fixed register. + virtual bool isFixedRegister(const MachineFunction &MF, + MCRegister PhysReg) const { + return false; + } + + /// Returns true if PhysReg is a general purpose register. + virtual bool isGeneralPurposeRegister(const MachineFunction &MF, + MCRegister PhysReg) const { + return false; + } + /// Prior to adding the live-out mask to a stackmap or patchpoint /// instruction, provide the target the opportunity to adjust it (mainly to /// remove pseudo-registers that should be ignored). @@ -1067,6 +1078,14 @@ public: return false; } + /// When prioritizing live ranges in register allocation, if this hook returns + /// true then the AllocationPriority of the register class will be treated as + /// more important than whether the range is local to a basic block or global. + virtual bool + regClassPriorityTrumpsGlobalness(const MachineFunction &MF) const { + return false; + } + //===--------------------------------------------------------------------===// /// Debug information queries. diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 3fac2f688dd8..dbd678b75d05 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -13,12 +13,10 @@ #ifndef LLVM_CODEGEN_TARGETSUBTARGETINFO_H #define LLVM_CODEGEN_TARGETSUBTARGETINFO_H -#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/PBQPRAConstraint.h" -#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CodeGen.h" @@ -27,6 +25,9 @@ namespace llvm { +class APInt; +class MachineFunction; +class ScheduleDAGMutation; class CallLowering; class InlineAsmLowering; class InstrItineraryData; @@ -272,11 +273,6 @@ public: /// a finer grain to tune the register allocator. virtual bool enableRALocalReassignment(CodeGenOpt::Level OptLevel) const; - /// True if the subtarget should consider the cost of local intervals - /// created by a split candidate when choosing the best split candidate. This - /// heuristic may be compile time intensive. - virtual bool enableAdvancedRASplitCost() const; - /// Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). 
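The isArgumentRegister/isFixedRegister/isGeneralPurposeRegister hooks added above all default to false; they exist so register-level features such as zeroing call-used registers can query the target. A hypothetical override (the target and its register names are invented, not part of this patch):

// Sketch only: classify the fictional target's first eight GPRs as argument
// registers; a real target must derive this from its calling convention.
bool FictionalRegisterInfo::isArgumentRegister(const MachineFunction &MF,
                                               MCRegister PhysReg) const {
  return PhysReg >= Fictional::R0 && PhysReg <= Fictional::R7;
}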
virtual bool useAA() const; diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h index 4e574bd96cca..1b5f902139fb 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -38,7 +38,7 @@ public: ShapeT() : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), ColImm(InvalidImmShape) {} - bool operator==(const ShapeT &Shape) { + bool operator==(const ShapeT &Shape) const { MachineOperand *R = Shape.Row; MachineOperand *C = Shape.Col; if (!R || !C) @@ -52,7 +52,7 @@ public: return false; } - bool operator!=(const ShapeT &Shape) { return !(*this == Shape); } + bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); } MachineOperand *getRow() const { return Row; } diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h index 7b17b98d5c55..48d265476ca8 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/llvm/include/llvm/CodeGen/ValueTypes.h @@ -19,7 +19,6 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TypeSize.h" -#include "llvm/Support/WithColor.h" #include #include #include @@ -365,6 +364,12 @@ namespace llvm { return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()}; } + // Return the number of bytes overwritten by a store of this value type or + // this value type's element type in the case of a vector. + uint64_t getScalarStoreSize() const { + return getScalarType().getStoreSize().getFixedSize(); + } + /// Return the number of bits overwritten by a store of the specified value /// type. /// diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 7f989e08e9bf..2194800b7ba9 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -20,204 +20,211 @@ class ValueType { def OtherVT : ValueType<0, 1>; // "Other" value def i1 : ValueType<1, 2>; // One bit boolean value -def i8 : ValueType<8, 3>; // 8-bit integer value -def i16 : ValueType<16, 4>; // 16-bit integer value -def i32 : ValueType<32, 5>; // 32-bit integer value -def i64 : ValueType<64, 6>; // 64-bit integer value -def i128 : ValueType<128, 7>; // 128-bit integer value - -def bf16 : ValueType<16, 8>; // 16-bit brain floating point value -def f16 : ValueType<16, 9>; // 16-bit floating point value -def f32 : ValueType<32, 10>; // 32-bit floating point value -def f64 : ValueType<64, 11>; // 64-bit floating point value -def f80 : ValueType<80, 12>; // 80-bit floating point value -def f128 : ValueType<128, 13>; // 128-bit floating point value -def ppcf128 : ValueType<128, 14>; // PPC 128-bit floating point value - -def v1i1 : ValueType<1, 15>; // 1 x i1 vector value -def v2i1 : ValueType<2, 16>; // 2 x i1 vector value -def v4i1 : ValueType<4, 17>; // 4 x i1 vector value -def v8i1 : ValueType<8, 18>; // 8 x i1 vector value -def v16i1 : ValueType<16, 19>; // 16 x i1 vector value -def v32i1 : ValueType<32, 20>; // 32 x i1 vector value -def v64i1 : ValueType<64, 21>; // 64 x i1 vector value -def v128i1 : ValueType<128, 22>; // 128 x i1 vector value -def v256i1 : ValueType<256, 23>; // 256 x i1 vector value -def v512i1 : ValueType<512, 24>; // 512 x i1 vector value -def v1024i1 : ValueType<1024, 25>; // 1024 x i1 vector value - -def v1i8 : ValueType<8, 26>; // 1 x i8 vector value -def v2i8 : ValueType<16, 27>; // 2 x i8 vector value -def v4i8 : ValueType<32, 28>; // 4 x i8 vector value -def v8i8 : ValueType<64, 29>; // 8 x i8 vector value 
-def v16i8 : ValueType<128, 30>; // 16 x i8 vector value -def v32i8 : ValueType<256, 31>; // 32 x i8 vector value -def v64i8 : ValueType<512, 32>; // 64 x i8 vector value -def v128i8 : ValueType<1024, 33>; // 128 x i8 vector value -def v256i8 : ValueType<2048, 34>; // 256 x i8 vector value -def v512i8 : ValueType<4096, 35>; // 512 x i8 vector value -def v1024i8 : ValueType<8192, 36>; // 1024 x i8 vector value - -def v1i16 : ValueType<16, 37>; // 1 x i16 vector value -def v2i16 : ValueType<32, 38>; // 2 x i16 vector value -def v3i16 : ValueType<48, 39>; // 3 x i16 vector value -def v4i16 : ValueType<64, 40>; // 4 x i16 vector value -def v8i16 : ValueType<128, 41>; // 8 x i16 vector value -def v16i16 : ValueType<256, 42>; // 16 x i16 vector value -def v32i16 : ValueType<512, 43>; // 32 x i16 vector value -def v64i16 : ValueType<1024, 44>; // 64 x i16 vector value -def v128i16 : ValueType<2048, 45>; // 128 x i16 vector value -def v256i16 : ValueType<4096, 46>; // 256 x i16 vector value -def v512i16 : ValueType<8192, 47>; // 512 x i16 vector value - -def v1i32 : ValueType<32, 48>; // 1 x i32 vector value -def v2i32 : ValueType<64, 49>; // 2 x i32 vector value -def v3i32 : ValueType<96, 50>; // 3 x i32 vector value -def v4i32 : ValueType<128, 51>; // 4 x i32 vector value -def v5i32 : ValueType<160, 52>; // 5 x i32 vector value -def v6i32 : ValueType<192, 53>; // 6 x f32 vector value -def v7i32 : ValueType<224, 54>; // 7 x f32 vector value -def v8i32 : ValueType<256, 55>; // 8 x i32 vector value -def v16i32 : ValueType<512, 56>; // 16 x i32 vector value -def v32i32 : ValueType<1024, 57>; // 32 x i32 vector value -def v64i32 : ValueType<2048, 58>; // 64 x i32 vector value -def v128i32 : ValueType<4096, 59>; // 128 x i32 vector value -def v256i32 : ValueType<8192, 60>; // 256 x i32 vector value -def v512i32 : ValueType<16384, 61>; // 512 x i32 vector value -def v1024i32 : ValueType<32768, 62>; // 1024 x i32 vector value -def v2048i32 : ValueType<65536, 63>; // 2048 x i32 vector value - -def v1i64 : ValueType<64, 64>; // 1 x i64 vector value -def v2i64 : ValueType<128, 65>; // 2 x i64 vector value -def v3i64 : ValueType<192, 66>; // 3 x i64 vector value -def v4i64 : ValueType<256, 67>; // 4 x i64 vector value -def v8i64 : ValueType<512, 68>; // 8 x i64 vector value -def v16i64 : ValueType<1024, 69>; // 16 x i64 vector value -def v32i64 : ValueType<2048, 70>; // 32 x i64 vector value -def v64i64 : ValueType<4096, 71>; // 64 x i64 vector value -def v128i64 : ValueType<8192, 72>; // 128 x i64 vector value -def v256i64 : ValueType<16384, 73>; // 256 x i64 vector value - -def v1i128 : ValueType<128, 74>; // 1 x i128 vector value - -def v1f16 : ValueType<16, 75>; // 1 x f16 vector value -def v2f16 : ValueType<32, 76>; // 2 x f16 vector value -def v3f16 : ValueType<48, 77>; // 3 x f16 vector value -def v4f16 : ValueType<64, 78>; // 4 x f16 vector value -def v8f16 : ValueType<128, 79>; // 8 x f16 vector value -def v16f16 : ValueType<256, 80>; // 16 x f16 vector value -def v32f16 : ValueType<512, 81>; // 32 x f16 vector value -def v64f16 : ValueType<1024, 82>; // 64 x f16 vector value -def v128f16 : ValueType<2048, 83>; // 128 x f16 vector value -def v256f16 : ValueType<4096, 84>; // 256 x f16 vector value -def v512f16 : ValueType<8192, 85>; // 512 x f16 vector value - -def v2bf16 : ValueType<32, 86>; // 2 x bf16 vector value -def v3bf16 : ValueType<48, 87>; // 3 x bf16 vector value -def v4bf16 : ValueType<64, 88>; // 4 x bf16 vector value -def v8bf16 : ValueType<128, 89>; // 8 x bf16 vector value -def 
v16bf16 : ValueType<256, 90>; // 16 x bf16 vector value -def v32bf16 : ValueType<512, 91>; // 32 x bf16 vector value -def v64bf16 : ValueType<1024, 92>; // 64 x bf16 vector value -def v128bf16 : ValueType<2048, 93>; // 128 x bf16 vector value - -def v1f32 : ValueType<32, 94>; // 1 x f32 vector value -def v2f32 : ValueType<64, 95>; // 2 x f32 vector value -def v3f32 : ValueType<96, 96>; // 3 x f32 vector value -def v4f32 : ValueType<128, 97>; // 4 x f32 vector value -def v5f32 : ValueType<160, 98>; // 5 x f32 vector value -def v6f32 : ValueType<192, 99>; // 6 x f32 vector value -def v7f32 : ValueType<224, 100>; // 7 x f32 vector value -def v8f32 : ValueType<256, 101>; // 8 x f32 vector value -def v16f32 : ValueType<512, 102>; // 16 x f32 vector value -def v32f32 : ValueType<1024, 103>; // 32 x f32 vector value -def v64f32 : ValueType<2048, 104>; // 64 x f32 vector value -def v128f32 : ValueType<4096, 105>; // 128 x f32 vector value -def v256f32 : ValueType<8192, 106>; // 256 x f32 vector value -def v512f32 : ValueType<16384, 107>; // 512 x f32 vector value -def v1024f32 : ValueType<32768, 108>; // 1024 x f32 vector value -def v2048f32 : ValueType<65536, 109>; // 2048 x f32 vector value - -def v1f64 : ValueType<64, 110>; // 1 x f64 vector value -def v2f64 : ValueType<128, 111>; // 2 x f64 vector value -def v3f64 : ValueType<192, 112>; // 3 x f64 vector value -def v4f64 : ValueType<256, 113>; // 4 x f64 vector value -def v8f64 : ValueType<512, 114>; // 8 x f64 vector value -def v16f64 : ValueType<1024, 115>; // 16 x f64 vector value -def v32f64 : ValueType<2048, 116>; // 32 x f64 vector value -def v64f64 : ValueType<4096, 117>; // 64 x f64 vector value -def v128f64 : ValueType<8192, 118>; // 128 x f64 vector value -def v256f64 : ValueType<16384, 119>; // 256 x f64 vector value - -def nxv1i1 : ValueType<1, 120>; // n x 1 x i1 vector value -def nxv2i1 : ValueType<2, 121>; // n x 2 x i1 vector value -def nxv4i1 : ValueType<4, 122>; // n x 4 x i1 vector value -def nxv8i1 : ValueType<8, 123>; // n x 8 x i1 vector value -def nxv16i1 : ValueType<16, 124>; // n x 16 x i1 vector value -def nxv32i1 : ValueType<32, 125>; // n x 32 x i1 vector value -def nxv64i1 : ValueType<64, 126>; // n x 64 x i1 vector value - -def nxv1i8 : ValueType<8, 127>; // n x 1 x i8 vector value -def nxv2i8 : ValueType<16, 128>; // n x 2 x i8 vector value -def nxv4i8 : ValueType<32, 129>; // n x 4 x i8 vector value -def nxv8i8 : ValueType<64, 130>; // n x 8 x i8 vector value -def nxv16i8 : ValueType<128, 131>; // n x 16 x i8 vector value -def nxv32i8 : ValueType<256, 132>; // n x 32 x i8 vector value -def nxv64i8 : ValueType<512, 133>; // n x 64 x i8 vector value - -def nxv1i16 : ValueType<16, 134>; // n x 1 x i16 vector value -def nxv2i16 : ValueType<32, 135>; // n x 2 x i16 vector value -def nxv4i16 : ValueType<64, 136>; // n x 4 x i16 vector value -def nxv8i16 : ValueType<128, 137>; // n x 8 x i16 vector value -def nxv16i16 : ValueType<256, 138>; // n x 16 x i16 vector value -def nxv32i16 : ValueType<512, 139>; // n x 32 x i16 vector value - -def nxv1i32 : ValueType<32, 140>; // n x 1 x i32 vector value -def nxv2i32 : ValueType<64, 141>; // n x 2 x i32 vector value -def nxv4i32 : ValueType<128, 142>; // n x 4 x i32 vector value -def nxv8i32 : ValueType<256, 143>; // n x 8 x i32 vector value -def nxv16i32 : ValueType<512, 144>; // n x 16 x i32 vector value -def nxv32i32 : ValueType<1024, 145>; // n x 32 x i32 vector value - -def nxv1i64 : ValueType<64, 146>; // n x 1 x i64 vector value -def nxv2i64 : ValueType<128, 147>; // n 
x 2 x i64 vector value -def nxv4i64 : ValueType<256, 148>; // n x 4 x i64 vector value -def nxv8i64 : ValueType<512, 149>; // n x 8 x i64 vector value -def nxv16i64 : ValueType<1024, 150>; // n x 16 x i64 vector value -def nxv32i64 : ValueType<2048, 151>; // n x 32 x i64 vector value - -def nxv1f16 : ValueType<16, 152>; // n x 1 x f16 vector value -def nxv2f16 : ValueType<32, 153>; // n x 2 x f16 vector value -def nxv4f16 : ValueType<64, 154>; // n x 4 x f16 vector value -def nxv8f16 : ValueType<128, 155>; // n x 8 x f16 vector value -def nxv16f16 : ValueType<256, 156>; // n x 16 x f16 vector value -def nxv32f16 : ValueType<512, 157>; // n x 32 x f16 vector value - -def nxv1bf16 : ValueType<16, 158>; // n x 1 x bf16 vector value -def nxv2bf16 : ValueType<32, 159>; // n x 2 x bf16 vector value -def nxv4bf16 : ValueType<64, 160>; // n x 4 x bf16 vector value -def nxv8bf16 : ValueType<128, 161>; // n x 8 x bf16 vector value - -def nxv1f32 : ValueType<32, 162>; // n x 1 x f32 vector value -def nxv2f32 : ValueType<64, 163>; // n x 2 x f32 vector value -def nxv4f32 : ValueType<128, 164>; // n x 4 x f32 vector value -def nxv8f32 : ValueType<256, 165>; // n x 8 x f32 vector value -def nxv16f32 : ValueType<512, 166>; // n x 16 x f32 vector value - -def nxv1f64 : ValueType<64, 167>; // n x 1 x f64 vector value -def nxv2f64 : ValueType<128, 168>; // n x 2 x f64 vector value -def nxv4f64 : ValueType<256, 169>; // n x 4 x f64 vector value -def nxv8f64 : ValueType<512, 170>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64, 171>; // X86 MMX value -def FlagVT : ValueType<0, 172>; // Pre-RA sched glue -def isVoid : ValueType<0, 173>; // Produces no value -def untyped : ValueType<8, 174>; // Produces an untyped value -def funcref : ValueType<0, 175>; // WebAssembly's funcref type -def externref : ValueType<0, 176>; // WebAssembly's externref type -def x86amx : ValueType<8192, 177>; // X86 AMX value -def i64x8 : ValueType<512, 178>; // 8 Consecutive GPRs (AArch64) - +def i2 : ValueType<2, 3>; // 2-bit integer value +def i4 : ValueType<4, 4>; // 4-bit integer value +def i8 : ValueType<8, 5>; // 8-bit integer value +def i16 : ValueType<16, 6>; // 16-bit integer value +def i32 : ValueType<32, 7>; // 32-bit integer value +def i64 : ValueType<64, 8>; // 64-bit integer value +def i128 : ValueType<128, 9>; // 128-bit integer value + +def bf16 : ValueType<16, 10>; // 16-bit brain floating point value +def f16 : ValueType<16, 11>; // 16-bit floating point value +def f32 : ValueType<32, 12>; // 32-bit floating point value +def f64 : ValueType<64, 13>; // 64-bit floating point value +def f80 : ValueType<80, 14>; // 80-bit floating point value +def f128 : ValueType<128, 15>; // 128-bit floating point value +def ppcf128 : ValueType<128, 16>; // PPC 128-bit floating point value + +def v1i1 : ValueType<1, 17>; // 1 x i1 vector value +def v2i1 : ValueType<2, 18>; // 2 x i1 vector value +def v4i1 : ValueType<4, 19>; // 4 x i1 vector value +def v8i1 : ValueType<8, 20>; // 8 x i1 vector value +def v16i1 : ValueType<16, 21>; // 16 x i1 vector value +def v32i1 : ValueType<32, 22>; // 32 x i1 vector value +def v64i1 : ValueType<64, 23>; // 64 x i1 vector value +def v128i1 : ValueType<128, 24>; // 128 x i1 vector value +def v256i1 : ValueType<256, 25>; // 256 x i1 vector value +def v512i1 : ValueType<512, 26>; // 512 x i1 vector value +def v1024i1 : ValueType<1024, 27>; // 1024 x i1 vector value + +def v128i2 : ValueType<256, 28>; // 128 x i2 vector value + +def v64i4 : ValueType<256, 29>; // 64 x i4 vector value + +def 
v1i8 : ValueType<8, 30>; // 1 x i8 vector value +def v2i8 : ValueType<16, 31>; // 2 x i8 vector value +def v4i8 : ValueType<32, 32>; // 4 x i8 vector value +def v8i8 : ValueType<64, 33>; // 8 x i8 vector value +def v16i8 : ValueType<128, 34>; // 16 x i8 vector value +def v32i8 : ValueType<256, 35>; // 32 x i8 vector value +def v64i8 : ValueType<512, 36>; // 64 x i8 vector value +def v128i8 : ValueType<1024, 37>; // 128 x i8 vector value +def v256i8 : ValueType<2048, 38>; // 256 x i8 vector value +def v512i8 : ValueType<4096, 39>; // 512 x i8 vector value +def v1024i8 : ValueType<8192, 40>; // 1024 x i8 vector value + +def v1i16 : ValueType<16, 41>; // 1 x i16 vector value +def v2i16 : ValueType<32, 42>; // 2 x i16 vector value +def v3i16 : ValueType<48, 43>; // 3 x i16 vector value +def v4i16 : ValueType<64, 44>; // 4 x i16 vector value +def v8i16 : ValueType<128, 45>; // 8 x i16 vector value +def v16i16 : ValueType<256, 46>; // 16 x i16 vector value +def v32i16 : ValueType<512, 47>; // 32 x i16 vector value +def v64i16 : ValueType<1024, 48>; // 64 x i16 vector value +def v128i16 : ValueType<2048, 49>; // 128 x i16 vector value +def v256i16 : ValueType<4096, 50>; // 256 x i16 vector value +def v512i16 : ValueType<8192, 51>; // 512 x i16 vector value + +def v1i32 : ValueType<32, 52>; // 1 x i32 vector value +def v2i32 : ValueType<64, 53>; // 2 x i32 vector value +def v3i32 : ValueType<96, 54>; // 3 x i32 vector value +def v4i32 : ValueType<128, 55>; // 4 x i32 vector value +def v5i32 : ValueType<160, 56>; // 5 x i32 vector value +def v6i32 : ValueType<192, 57>; // 6 x i32 vector value +def v7i32 : ValueType<224, 58>; // 7 x i32 vector value +def v8i32 : ValueType<256, 59>; // 8 x i32 vector value +def v16i32 : ValueType<512, 60>; // 16 x i32 vector value +def v32i32 : ValueType<1024, 61>; // 32 x i32 vector value +def v64i32 : ValueType<2048, 62>; // 64 x i32 vector value +def v128i32 : ValueType<4096, 63>; // 128 x i32 vector value +def v256i32 : ValueType<8192, 64>; // 256 x i32 vector value +def v512i32 : ValueType<16384, 65>; // 512 x i32 vector value +def v1024i32 : ValueType<32768, 66>; // 1024 x i32 vector value +def v2048i32 : ValueType<65536, 67>; // 2048 x i32 vector value + +def v1i64 : ValueType<64, 68>; // 1 x i64 vector value +def v2i64 : ValueType<128, 69>; // 2 x i64 vector value +def v3i64 : ValueType<192, 70>; // 3 x i64 vector value +def v4i64 : ValueType<256, 71>; // 4 x i64 vector value +def v8i64 : ValueType<512, 72>; // 8 x i64 vector value +def v16i64 : ValueType<1024, 73>; // 16 x i64 vector value +def v32i64 : ValueType<2048, 74>; // 32 x i64 vector value +def v64i64 : ValueType<4096, 75>; // 64 x i64 vector value +def v128i64 : ValueType<8192, 76>; // 128 x i64 vector value +def v256i64 : ValueType<16384, 77>; // 256 x i64 vector value + +def v1i128 : ValueType<128, 78>; // 1 x i128 vector value + +def v1f16 : ValueType<16, 79>; // 1 x f16 vector value +def v2f16 : ValueType<32, 80>; // 2 x f16 vector value +def v3f16 : ValueType<48, 81>; // 3 x f16 vector value +def v4f16 : ValueType<64, 82>; // 4 x f16 vector value +def v8f16 : ValueType<128, 83>; // 8 x f16 vector value +def v16f16 : ValueType<256, 84>; // 16 x f16 vector value +def v32f16 : ValueType<512, 85>; // 32 x f16 vector value +def v64f16 : ValueType<1024, 86>; // 64 x f16 vector value +def v128f16 : ValueType<2048, 87>; // 128 x f16 vector value +def v256f16 : ValueType<4096, 88>; // 256 x f16 vector value +def v512f16 : ValueType<8192, 89>; // 512 x f16 vector value + +def v2bf16 : ValueType<32, 90>;
// 2 x bf16 vector value +def v3bf16 : ValueType<48, 91>; // 3 x bf16 vector value +def v4bf16 : ValueType<64, 92>; // 4 x bf16 vector value +def v8bf16 : ValueType<128, 93>; // 8 x bf16 vector value +def v16bf16 : ValueType<256, 94>; // 16 x bf16 vector value +def v32bf16 : ValueType<512, 95>; // 32 x bf16 vector value +def v64bf16 : ValueType<1024, 96>; // 64 x bf16 vector value +def v128bf16 : ValueType<2048, 97>; // 128 x bf16 vector value + +def v1f32 : ValueType<32, 98>; // 1 x f32 vector value +def v2f32 : ValueType<64, 99>; // 2 x f32 vector value +def v3f32 : ValueType<96, 100>; // 3 x f32 vector value +def v4f32 : ValueType<128, 101>; // 4 x f32 vector value +def v5f32 : ValueType<160, 102>; // 5 x f32 vector value +def v6f32 : ValueType<192, 103>; // 6 x f32 vector value +def v7f32 : ValueType<224, 104>; // 7 x f32 vector value +def v8f32 : ValueType<256, 105>; // 8 x f32 vector value +def v16f32 : ValueType<512, 106>; // 16 x f32 vector value +def v32f32 : ValueType<1024, 107>; // 32 x f32 vector value +def v64f32 : ValueType<2048, 108>; // 64 x f32 vector value +def v128f32 : ValueType<4096, 109>; // 128 x f32 vector value +def v256f32 : ValueType<8192, 110>; // 256 x f32 vector value +def v512f32 : ValueType<16384, 111>; // 512 x f32 vector value +def v1024f32 : ValueType<32768, 112>; // 1024 x f32 vector value +def v2048f32 : ValueType<65536, 113>; // 2048 x f32 vector value + +def v1f64 : ValueType<64, 114>; // 1 x f64 vector value +def v2f64 : ValueType<128, 115>; // 2 x f64 vector value +def v3f64 : ValueType<192, 116>; // 3 x f64 vector value +def v4f64 : ValueType<256, 117>; // 4 x f64 vector value +def v8f64 : ValueType<512, 118>; // 8 x f64 vector value +def v16f64 : ValueType<1024, 119>; // 16 x f64 vector value +def v32f64 : ValueType<2048, 120>; // 32 x f64 vector value +def v64f64 : ValueType<4096, 121>; // 64 x f64 vector value +def v128f64 : ValueType<8192, 122>; // 128 x f64 vector value +def v256f64 : ValueType<16384, 123>; // 256 x f64 vector value + +def nxv1i1 : ValueType<1, 124>; // n x 1 x i1 vector value +def nxv2i1 : ValueType<2, 125>; // n x 2 x i1 vector value +def nxv4i1 : ValueType<4, 126>; // n x 4 x i1 vector value +def nxv8i1 : ValueType<8, 127>; // n x 8 x i1 vector value +def nxv16i1 : ValueType<16, 128>; // n x 16 x i1 vector value +def nxv32i1 : ValueType<32, 129>; // n x 32 x i1 vector value +def nxv64i1 : ValueType<64, 130>; // n x 64 x i1 vector value + +def nxv1i8 : ValueType<8, 131>; // n x 1 x i8 vector value +def nxv2i8 : ValueType<16, 132>; // n x 2 x i8 vector value +def nxv4i8 : ValueType<32, 133>; // n x 4 x i8 vector value +def nxv8i8 : ValueType<64, 134>; // n x 8 x i8 vector value +def nxv16i8 : ValueType<128, 135>; // n x 16 x i8 vector value +def nxv32i8 : ValueType<256, 136>; // n x 32 x i8 vector value +def nxv64i8 : ValueType<512, 137>; // n x 64 x i8 vector value + +def nxv1i16 : ValueType<16, 138>; // n x 1 x i16 vector value +def nxv2i16 : ValueType<32, 139>; // n x 2 x i16 vector value +def nxv4i16 : ValueType<64, 140>; // n x 4 x i16 vector value +def nxv8i16 : ValueType<128, 141>; // n x 8 x i16 vector value +def nxv16i16 : ValueType<256, 142>; // n x 16 x i16 vector value +def nxv32i16 : ValueType<512, 143>; // n x 32 x i16 vector value + +def nxv1i32 : ValueType<32, 144>; // n x 1 x i32 vector value +def nxv2i32 : ValueType<64, 145>; // n x 2 x i32 vector value +def nxv4i32 : ValueType<128, 146>; // n x 4 x i32 vector value +def nxv8i32 : ValueType<256, 147>; // n x 8 x i32 vector value +def nxv16i32 : 
ValueType<512, 148>; // n x 16 x i32 vector value +def nxv32i32 : ValueType<1024, 149>; // n x 32 x i32 vector value + +def nxv1i64 : ValueType<64, 150>; // n x 1 x i64 vector value +def nxv2i64 : ValueType<128, 151>; // n x 2 x i64 vector value +def nxv4i64 : ValueType<256, 152>; // n x 4 x i64 vector value +def nxv8i64 : ValueType<512, 153>; // n x 8 x i64 vector value +def nxv16i64 : ValueType<1024, 154>; // n x 16 x i64 vector value +def nxv32i64 : ValueType<2048, 155>; // n x 32 x i64 vector value + +def nxv1f16 : ValueType<16, 156>; // n x 1 x f16 vector value +def nxv2f16 : ValueType<32, 157>; // n x 2 x f16 vector value +def nxv4f16 : ValueType<64, 158>; // n x 4 x f16 vector value +def nxv8f16 : ValueType<128, 159>; // n x 8 x f16 vector value +def nxv16f16 : ValueType<256, 160>; // n x 16 x f16 vector value +def nxv32f16 : ValueType<512, 161>; // n x 32 x f16 vector value + +def nxv1bf16 : ValueType<16, 162>; // n x 1 x bf16 vector value +def nxv2bf16 : ValueType<32, 163>; // n x 2 x bf16 vector value +def nxv4bf16 : ValueType<64, 164>; // n x 4 x bf16 vector value +def nxv8bf16 : ValueType<128, 165>; // n x 8 x bf16 vector value +def nxv16bf16 : ValueType<256, 166>; // n x 16 x bf16 vector value +def nxv32bf16 : ValueType<512, 167>; // n x 32 x bf16 vector value + +def nxv1f32 : ValueType<32, 168>; // n x 1 x f32 vector value +def nxv2f32 : ValueType<64, 169>; // n x 2 x f32 vector value +def nxv4f32 : ValueType<128, 170>; // n x 4 x f32 vector value +def nxv8f32 : ValueType<256, 171>; // n x 8 x f32 vector value +def nxv16f32 : ValueType<512, 172>; // n x 16 x f32 vector value + +def nxv1f64 : ValueType<64, 173>; // n x 1 x f64 vector value +def nxv2f64 : ValueType<128, 174>; // n x 2 x f64 vector value +def nxv4f64 : ValueType<256, 175>; // n x 4 x f64 vector value +def nxv8f64 : ValueType<512, 176>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64, 177>; // X86 MMX value +def FlagVT : ValueType<0, 178>; // Pre-RA sched glue +def isVoid : ValueType<0, 179>; // Produces no value +def untyped : ValueType<8, 180>; // Produces an untyped value +def funcref : ValueType<0, 181>; // WebAssembly's funcref type +def externref : ValueType<0, 182>; // WebAssembly's externref type +def x86amx : ValueType<8192, 183>; // X86 AMX value +def i64x8 : ValueType<512, 184>; // 8 Consecutive GPRs (AArch64) def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249>; // Metadata diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/DWARFLinker.h index 4f1c666df35f..0b2e033bd97a 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinker.h @@ -11,18 +11,26 @@ #include "llvm/CodeGen/AccelTable.h" #include "llvm/CodeGen/NonRelocatableStringpool.h" -#include "llvm/DWARFLinker/DWARFLinkerDeclContext.h" -#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/MC/MCDwarf.h" +#include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" #include namespace llvm { +class DWARFContext; +class DWARFExpression; +class DWARFUnit; +class DataExtractor; +class DeclContextTree; +struct MCDwarfLineTableParams; +template class SmallVectorImpl; enum class DwarfLinkerClient { Dsymutil, LLD, General }; /// The kind of accelerator tables we should emit. 
-enum class AccelTableKind { +enum class DwarfLinkerAccelTableKind : uint8_t { + None, Apple, ///< .apple_names, .apple_namespaces, .apple_types, .apple_objc. Dwarf, ///< DWARF v5 .debug_names. Default, ///< Dwarf for DWARF5 or later, Apple otherwise. @@ -56,28 +64,21 @@ class AddressesMap { public: virtual ~AddressesMap(); - /// Returns true if represented addresses are from linked file. - /// Returns false if represented addresses are from not-linked - /// object file. - virtual bool areRelocationsResolved() const = 0; - /// Checks that there are valid relocations against a .debug_info /// section. virtual bool hasValidRelocs() = 0; - /// Checks that the specified DIE has a DW_AT_Location attribute - /// that references into a live code section. - /// + /// Checks that the specified variable \p DIE references a live code section. + /// Allowed kinds of input DIE: DW_TAG_variable, DW_TAG_constant. /// \returns true and sets Info.InDebugMap if it is the case. - virtual bool hasLiveMemoryLocation(const DWARFDie &DIE, - CompileUnit::DIEInfo &Info) = 0; + virtual bool isLiveVariable(const DWARFDie &DIE, + CompileUnit::DIEInfo &Info) = 0; - /// Checks that the specified DIE has a DW_AT_Low_pc attribute - /// that references into a live code section. - /// + /// Checks that the specified subprogram \p DIE references a live code section. + /// Allowed kinds of input DIE: DW_TAG_subprogram, DW_TAG_label. /// \returns true and sets Info.InDebugMap if it is the case. - virtual bool hasLiveAddressRange(const DWARFDie &DIE, - CompileUnit::DIEInfo &Info) = 0; + virtual bool isLiveSubprogram(const DWARFDie &DIE, + CompileUnit::DIEInfo &Info) = 0; /// Apply the valid relocations to the buffer \p Data, taking into /// account that Data is at \p BaseOffset in the .debug_info section. @@ -272,6 +273,9 @@ public: /// Print statistics to standard output. void setStatistics(bool Statistics) { Options.Statistics = Statistics; } + /// Verify the input DWARF. + void setVerifyInputDWARF(bool Verify) { Options.VerifyInputDWARF = Verify; } + /// Do not emit linked dwarf info. void setNoOutput(bool NoOut) { Options.NoOutput = NoOut; } @@ -290,7 +294,7 @@ public: void setNumThreads(unsigned NumThreads) { Options.Threads = NumThreads; } /// Set kind of accelerator tables to be generated. - void setAccelTableKind(AccelTableKind Kind) { + void setAccelTableKind(DwarfLinkerAccelTableKind Kind) { Options.TheAccelTableKind = Kind; } @@ -361,6 +365,8 @@ private: /// Given a DIE, update its incompleteness based on whether the DIEs it /// references are incomplete. UpdateRefIncompleteness, + /// Given a DIE, mark it as ODR canonical, if applicable. + MarkODRCanonicalDie, }; /// This class represents an item in the work list. The type defines what kind @@ -389,6 +395,9 @@ private: AncestorIdx(AncestorIdx) {} }; + /// Verify the given DWARF file. + bool verify(const DWARFFile &File); + /// Returns true if we need to translate strings. bool needToTranslateStrings() { return StringsTranslator != nullptr; } @@ -457,6 +466,10 @@ private: const DWARFFile &File, SmallVectorImpl &Worklist); + /// Mark the context corresponding to the specified \p Die as having a + /// canonical DIE, if applicable. + void markODRCanonicalDie(const DWARFDie &Die, CompileUnit &CU); + /// \defgroup FindRootDIEs Find DIEs corresponding to Address map entries. /// /// @{ @@ -778,6 +791,9 @@ private: /// Print statistics. bool Statistics = false; + /// Verify the input DWARF.
+ bool VerifyInputDWARF = false; + /// Skip emitting output bool NoOutput = false; @@ -795,7 +811,8 @@ private: unsigned Threads = 1; /// The accelerator table kind - AccelTableKind TheAccelTableKind = AccelTableKind::Default; + DwarfLinkerAccelTableKind TheAccelTableKind = + DwarfLinkerAccelTableKind::Default; /// Prepend path for the clang modules. std::string PrependPath; diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h index afba19ac7d42..788275782235 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h @@ -9,10 +9,10 @@ #ifndef LLVM_DWARFLINKER_DWARFLINKERCOMPILEUNIT_H #define LLVM_DWARFLINKER_DWARFLINKERCOMPILEUNIT_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntervalMap.h" #include "llvm/CodeGen/DIE.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/Support/DataExtractor.h" namespace llvm { @@ -74,6 +74,12 @@ public: /// Does DIE transitively refer an incomplete decl? bool Incomplete : 1; + + /// Is DIE in the clang module scope? + bool InModuleScope : 1; + + /// Is ODR marking done? + bool ODRMarkingDone : 1; }; CompileUnit(DWARFUnit &OrigUnit, unsigned ID, bool CanUseODR, diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h index d2274488e85f..fb02b0fc1b4d 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h @@ -14,14 +14,15 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/NonRelocatableStringpool.h" -#include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include namespace llvm { +class CompileUnit; struct DeclMapInfo; /// Small helper that resolves and caches file paths. 
This helps reduce the @@ -91,6 +92,10 @@ public: bool setLastSeenDIE(CompileUnit &U, const DWARFDie &Die); + void setHasCanonicalDIE() { HasCanonicalDIE = true; } + + bool hasCanonicalDIE() const { return HasCanonicalDIE; } + uint32_t getCanonicalDIEOffset() const { return CanonicalDIEOffset; } void setCanonicalDIEOffset(uint32_t Offset) { CanonicalDIEOffset = Offset; } @@ -112,7 +117,8 @@ private: const DeclContext &Parent; DWARFDie LastSeenDIE; uint32_t LastSeenCompileUnitID = 0; - uint32_t CanonicalDIEOffset = 0; + std::atomic<uint32_t> CanonicalDIEOffset = {0}; + bool HasCanonicalDIE = false; }; /// This class gives a tree-like API to the DenseMap that stores the diff --git a/llvm/include/llvm/DWARFLinker/DWARFStreamer.h b/llvm/include/llvm/DWARFLinker/DWARFStreamer.h index fc8c59904cfb..003fe548252a 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFStreamer.h +++ b/llvm/include/llvm/DWARFLinker/DWARFStreamer.h @@ -10,7 +10,6 @@ #define LLVM_DWARFLINKER_DWARFSTREAMER_H #include "llvm/BinaryFormat/Swift.h" -#include "llvm/CodeGen/AccelTable.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/DWARFLinker/DWARFLinker.h" #include "llvm/MC/MCAsmInfo.h" @@ -18,9 +17,11 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Target/TargetMachine.h" namespace llvm { +template <typename DataT> class AccelTable; enum class OutputFileType { Object, diff --git a/llvm/include/llvm/DWP/DWPStringPool.h b/llvm/include/llvm/DWP/DWPStringPool.h index 9f69851f0055..1354b46f156b 100644 --- a/llvm/include/llvm/DWP/DWPStringPool.h +++ b/llvm/include/llvm/DWP/DWPStringPool.h @@ -43,7 +43,7 @@ public: auto Pair = Pool.insert(std::make_pair(Str, Offset)); if (Pair.second) { - Out.SwitchSection(Sec); + Out.switchSection(Sec); Out.emitBytes(StringRef(Str, Length)); Offset += Length; } diff --git a/llvm/include/llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h b/llvm/include/llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h index 5a91682e9bd4..d474173973b5 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h +++ b/llvm/include/llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h @@ -11,7 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" diff --git a/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h b/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h index 82ef8c173bee..ef44b622d955 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h @@ -10,7 +10,7 @@ #define LLVM_DEBUGINFO_CODEVIEW_CVSYMBOLVISITOR_H #include "llvm/DebugInfo/CodeView/CVRecord.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Error.h" namespace llvm { namespace codeview { @@ -18,12 +18,20 @@ class SymbolVisitorCallbacks; class CVSymbolVisitor { public: + struct FilterOptions { + llvm::Optional<uint32_t> SymbolOffset; + llvm::Optional<uint32_t> ParentRecursiveDepth; + llvm::Optional<uint32_t> ChildRecursiveDepth; + }; + CVSymbolVisitor(SymbolVisitorCallbacks &Callbacks); Error visitSymbolRecord(CVSymbol &Record); Error visitSymbolRecord(CVSymbol &Record, uint32_t Offset); Error visitSymbolStream(const CVSymbolArray &Symbols); Error visitSymbolStream(const CVSymbolArray &Symbols, uint32_t InitialOffset); +
Error visitSymbolStreamFiltered(const CVSymbolArray &Symbols, + const FilterOptions &Filter); private: SymbolVisitorCallbacks &Callbacks; diff --git a/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 7538cb2c2548..7780e233cab3 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -9,14 +9,17 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H #define LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/Support/Error.h" namespace llvm { namespace codeview { +class TypeIndex; class TypeCollection; class TypeVisitorCallbacks; +struct CVMemberRecord; enum VisitorDataSource { VDS_BytesPresent, // The record bytes are passed into the visitation diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index d851dea0a27f..4fbe7e835a8a 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -9,14 +9,11 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_CODEVIEWRECORDIO_H #define LLVM_DEBUGINFO_CODEVIEW_CODEVIEWRECORDIO_H -#include "llvm/ADT/APSInt.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/CodeView/GUID.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Error.h" @@ -26,7 +23,12 @@ namespace llvm { +template class ArrayRef; +class APSInt; + namespace codeview { +class TypeIndex; +struct GUID; class CodeViewRecordStreamer { public: @@ -246,7 +248,7 @@ private: Optional MaxLength; Optional bytesRemaining(uint32_t CurrentOffset) const { - if (!MaxLength.hasValue()) + if (!MaxLength) return None; assert(CurrentOffset >= BeginOffset); diff --git a/llvm/include/llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h b/llvm/include/llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h index 0e2f5d90e243..0f83ae370a1e 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h +++ b/llvm/include/llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h @@ -12,22 +12,16 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/RecordSerialization.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Error.h" -#include #include -#include #include namespace llvm { namespace codeview { +class TypeIndex; enum class ContinuationRecordKind { FieldList, MethodOverloadList }; class ContinuationRecordBuilder { diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h index 01f83676afdf..615fd216e655 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h +++ 
b/llvm/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h @@ -16,7 +16,6 @@ #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" #include @@ -24,6 +23,9 @@ namespace llvm { +class BinaryStreamReader; +class BinaryStreamWriter; + namespace codeview { class DebugStringTableSubsection; diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugCrossExSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugCrossExSubsection.h index 64a78a7cef21..e21873a3af8f 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugCrossExSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugCrossExSubsection.h @@ -12,13 +12,14 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" #include #include namespace llvm { +class BinaryStreamReader; +class BinaryStreamWriter; namespace codeview { class DebugCrossModuleExportsSubsectionRef final : public DebugSubsectionRef { diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h index e7683cb2a9c4..198ce4a8b4e4 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" @@ -22,6 +21,8 @@ #include namespace llvm { +class BinaryStreamReader; +class BinaryStreamWriter; namespace codeview { diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h index d5cd640231f9..f2c5bf9d7c95 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h @@ -11,11 +11,15 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" -#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamArray.h" +#include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" namespace llvm { +class BinaryStreamReader; +class BinaryStreamWriter; + namespace codeview { class DebugFrameDataSubsectionRef final : public DebugSubsectionRef { public: diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h index 9fd88a64873a..f9d1507af5f3 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h @@ -12,7 +12,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" -#include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamReader.h" diff --git 
a/llvm/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h index 1f8e56c5311f..68eb9e1af3bd 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" @@ -22,6 +21,8 @@ #include namespace llvm { +class BinaryStreamReader; +class BinaryStreamWriter; namespace codeview { class DebugChecksumsSubsection; diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugSubsection.h index 2e1cd15a3956..39413bb73b58 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugSubsection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugSubsection.h @@ -10,10 +10,12 @@ #define LLVM_DEBUGINFO_CODEVIEW_DEBUGSUBSECTION_H #include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" + +#include namespace llvm { +class BinaryStreamWriter; namespace codeview { class DebugSubsectionRef { diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h b/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h index 151930d6d43d..fdca2ad063a1 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h +++ b/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h @@ -9,7 +9,6 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSUBSECTIONVISITOR_H #define LLVM_DEBUGINFO_CODEVIEW_DEBUGSUBSECTIONVISITOR_H -#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" #include "llvm/Support/Error.h" diff --git a/llvm/include/llvm/DebugInfo/CodeView/EnumTables.h b/llvm/include/llvm/DebugInfo/CodeView/EnumTables.h index 270cd4b8330c..ec874b7ca114 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/EnumTables.h +++ b/llvm/include/llvm/DebugInfo/CodeView/EnumTables.h @@ -12,10 +12,10 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/Support/ScopedPrinter.h" #include namespace llvm { +template struct EnumEntry; namespace codeview { ArrayRef> getSymbolTypeNames(); diff --git a/llvm/include/llvm/DebugInfo/CodeView/Formatters.h b/llvm/include/llvm/DebugInfo/CodeView/Formatters.h index 7d04a6a89bef..10683c289224 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/Formatters.h +++ b/llvm/include/llvm/DebugInfo/CodeView/Formatters.h @@ -22,6 +22,8 @@ namespace llvm { namespace codeview { +struct GUID; + namespace detail { class GuidAdapter final : public FormatAdapter> { diff --git a/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h b/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h index 465c26ec2ce6..d592bde18bae 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h +++ b/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h @@ -10,9 +10,9 @@ #define LLVM_DEBUGINFO_CODEVIEW_GLOBALTYPETABLEBUILDER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include 
"llvm/DebugInfo/CodeView/SimpleTypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeHashing.h" diff --git a/llvm/include/llvm/DebugInfo/CodeView/Line.h b/llvm/include/llvm/DebugInfo/CodeView/Line.h index eb2aa154df1b..6918645b94d2 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/Line.h +++ b/llvm/include/llvm/DebugInfo/CodeView/Line.h @@ -9,7 +9,6 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_LINE_H #define LLVM_DEBUGINFO_CODEVIEW_LINE_H -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Endian.h" #include diff --git a/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h b/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h index 0f9d5e476075..1965aab9b5cc 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h +++ b/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h @@ -10,18 +10,18 @@ #define LLVM_DEBUGINFO_CODEVIEW_MERGINGTYPETABLEBUILDER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" -#include "llvm/DebugInfo/CodeView/TypeHashing.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Allocator.h" #include namespace llvm { namespace codeview { +struct LocallyHashedType; class ContinuationRecordBuilder; diff --git a/llvm/include/llvm/DebugInfo/CodeView/RecordName.h b/llvm/include/llvm/DebugInfo/CodeView/RecordName.h index 8e06be9e41e8..9078ed38d2f1 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/RecordName.h +++ b/llvm/include/llvm/DebugInfo/CodeView/RecordName.h @@ -9,11 +9,14 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H #define LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H -#include "llvm/DebugInfo/CodeView/TypeCollection.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include namespace llvm { namespace codeview { +class TypeCollection; +class TypeIndex; std::string computeTypeName(TypeCollection &Types, TypeIndex Index); StringRef getSymbolName(CVSymbol Sym); } // namespace codeview diff --git a/llvm/include/llvm/DebugInfo/CodeView/RecordSerialization.h b/llvm/include/llvm/DebugInfo/CodeView/RecordSerialization.h index 36c0f2fbd8fa..10248dbf646b 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/RecordSerialization.h +++ b/llvm/include/llvm/DebugInfo/CodeView/RecordSerialization.h @@ -9,7 +9,6 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_RECORDSERIALIZATION_H #define LLVM_DEBUGINFO_CODEVIEW_RECORDSERIALIZATION_H -#include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" @@ -18,9 +17,9 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include -#include namespace llvm { +class APSInt; namespace codeview { using llvm::support::little32_t; using llvm::support::ulittle16_t; diff --git a/llvm/include/llvm/DebugInfo/CodeView/StringsAndChecksums.h b/llvm/include/llvm/DebugInfo/CodeView/StringsAndChecksums.h index 22a283e785e1..50e745e5c2ab 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/StringsAndChecksums.h +++ b/llvm/include/llvm/DebugInfo/CodeView/StringsAndChecksums.h @@ -10,13 +10,15 @@ #define LLVM_DEBUGINFO_CODEVIEW_STRINGSANDCHECKSUMS_H #include "llvm/DebugInfo/CodeView/CodeView.h" -#include 
"llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" #include namespace llvm { namespace codeview { +class DebugChecksumsSubsection; +class DebugChecksumsSubsectionRef; +class DebugStringTableSubsection; +class DebugStringTableSubsectionRef; class StringsAndChecksumsRef { public: diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h index aaeffb2446ad..c674700fac59 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h +++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h @@ -9,11 +9,13 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_SYMBOLDUMPER_H #define LLVM_DEBUGINFO_CODEVIEW_SYMBOLDUMPER_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/Support/Error.h" + +#include +#include namespace llvm { class ScopedPrinter; diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h index c37f6b4d5fa7..9513e19a330a 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -196,7 +196,7 @@ struct BinaryAnnotationIterator const DecodedAnnotation &operator*() { ParseCurrentAnnotation(); - return Current.getValue(); + return *Current; } private: @@ -249,7 +249,7 @@ private: } bool ParseCurrentAnnotation() { - if (Current.hasValue()) + if (Current) return true; Next = Data; diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolSerializer.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolSerializer.h index fb806c692cfd..53986f9a6db6 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/SymbolSerializer.h +++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolSerializer.h @@ -10,15 +10,17 @@ #define LLVM_DEBUGINFO_CODEVIEW_SYMBOLSERIALIZER_H #include "llvm/ADT/Optional.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolRecordMapping.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include #include namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h b/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h index bde5a8b3ab2f..f643bc4d7451 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h @@ -34,7 +34,7 @@ public: template void ForEachRecord(TFunc Func) { Optional Next = getFirst(); - while (Next.hasValue()) { + while (Next) { TypeIndex N = *Next; Func(N, getType(N)); Next = getNext(N); diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/llvm/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h index 41a219ae5a7b..1fad50343e3a 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -9,16 +9,18 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPVISITOR_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPVISITOR_H 
-#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" namespace llvm { class ScopedPrinter; namespace codeview { +class TypeIndex; +struct CVMemberRecord; +struct MemberAttributes; class TypeCollection; diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h b/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h index 9f34d026b1ba..f49bc9b8e790 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h @@ -9,10 +9,11 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H -#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" @@ -21,6 +22,7 @@ #include namespace llvm { +class raw_ostream; namespace codeview { /// A locally hashed type represents a straightforward hash code of a serialized diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h index 226a436c0930..653eafa04e0a 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h @@ -13,7 +13,6 @@ #include "llvm/Support/Endian.h" #include #include -#include namespace llvm { @@ -36,6 +35,7 @@ enum class SimpleTypeKind : uint32_t { WideCharacter = 0x0071, // wide char Character16 = 0x007a, // char16_t Character32 = 0x007b, // char32_t + Character8 = 0x007c, // char8_t SByte = 0x0068, // 8 bit signed int Byte = 0x0069, // 8 bit unsigned int diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h b/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h index f4f5835d8b57..7ef8521604fb 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h @@ -9,13 +9,13 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/Support/Error.h" namespace llvm { +template class SmallVectorImpl; namespace codeview { +class TypeIndex; enum class TiRefKind { TypeRef, IndexRef }; struct TiReference { TiRefKind Kind; diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h b/llvm/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h index c6044d5138a8..ed4fc7a75624 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h @@ -10,7 +10,8 @@ #define LLVM_DEBUGINFO_CODEVIEW_TYPERECORDMAPPING_H #include "llvm/ADT/Optional.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/Error.h" @@ -20,6 +21,8 @@ class BinaryStreamReader; class BinaryStreamWriter; namespace 
codeview { +class TypeIndex; +struct CVMemberRecord; class TypeRecordMapping : public TypeVisitorCallbacks { public: explicit TypeRecordMapping(BinaryStreamReader &Reader) : IO(Reader) {} diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 04d7c7b0420a..04a1e44dd809 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -10,11 +10,12 @@ #define LLVM_DEBUGINFO_CODEVIEW_TYPESTREAMMERGER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/Support/Error.h" namespace llvm { +template class Optional; +template class SmallVectorImpl; namespace codeview { class TypeIndex; diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index d029556c9d89..9b278b696073 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -90,6 +90,8 @@ class DIInliningInfo { public: DIInliningInfo() = default; + /// Returns the frame at `Index`. Frames are stored in bottom-up + /// (leaf-to-root) order with increasing index. const DILineInfo &getFrame(unsigned Index) const { assert(Index < Frames.size()); return Frames[Index]; @@ -112,6 +114,8 @@ struct DIGlobal { std::string Name; uint64_t Start = 0; uint64_t Size = 0; + std::string DeclFile; + uint64_t DeclLine = 0; DIGlobal() : Name(DILineInfo::BadString) {} }; @@ -151,6 +155,10 @@ struct DILineInfoSpecifier { DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::RawValue, FunctionNameKind FNKind = FunctionNameKind::None) : FLIKind(FLIKind), FNKind(FNKind) {} + + inline bool operator==(const DILineInfoSpecifier &RHS) const { + return FLIKind == RHS.FLIKind && FNKind == RHS.FNKind; + } }; /// This is just a helper to programmatically construct DIDumpType. 
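The `operator==` added to DILineInfoSpecifier above makes lookup specifiers cheap to compare, which is what lets a caller reuse a previous symbolization result only when it was produced under the same rendering rules. A minimal sketch of that caching pattern, using hypothetical stand-in types rather than LLVM's actual classes:

```cpp
#include <cstdint>
#include <optional>
#include <string>
#include <tuple>

// Stand-ins for DILineInfoSpecifier / DILineInfo; names and fields here are
// illustrative only, not LLVM's real definitions.
enum class FileLineInfoKind { RawValue, AbsoluteFilePath };
enum class FunctionNameKind { None, LinkageName };

struct LineInfoSpecifier {
  FileLineInfoKind FLIKind = FileLineInfoKind::RawValue;
  FunctionNameKind FNKind = FunctionNameKind::None;

  // Mirrors the equality the patch adds: two specifiers request the same
  // rendering exactly when both kinds match.
  bool operator==(const LineInfoSpecifier &RHS) const {
    return FLIKind == RHS.FLIKind && FNKind == RHS.FNKind;
  }
};

struct LineInfo {
  std::string FileName;
  uint32_t Line = 0;
};

// Toy consumer: redo the (expensive) query only when the address or the
// requested rendering differs from the previous call.
class CachingSymbolizer {
  std::optional<std::tuple<uint64_t, LineInfoSpecifier, LineInfo>> Last;

public:
  LineInfo lookup(uint64_t Addr, const LineInfoSpecifier &Spec) {
    if (Last && std::get<0>(*Last) == Addr && std::get<1>(*Last) == Spec)
      return std::get<2>(*Last); // cache hit: same address, same rules

    LineInfo Result{"demo.c", 42}; // placeholder for a real DWARF query
    Last = std::make_tuple(Addr, Spec, Result);
    return Result;
  }
};
```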
@@ -233,6 +241,8 @@ public: virtual DILineInfo getLineInfoForAddress( object::SectionedAddress Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0; + virtual DILineInfo + getLineInfoForDataAddress(object::SectionedAddress Address) = 0; virtual DILineInfoTable getLineInfoForAddressRange( object::SectionedAddress Address, uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h index cdf3f60f88be..3887656ceef6 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h @@ -13,13 +13,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/Support/DataExtractor.h" #include #include #include namespace llvm { +class DataExtractor; class DWARFFormValue; class DWARFUnit; class raw_ostream; @@ -34,7 +34,7 @@ public: AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional ByteSize) : Attr(A), Form(F) { assert(!isImplicitConst()); - this->ByteSize.HasByteSize = ByteSize.hasValue(); + this->ByteSize.HasByteSize = ByteSize.has_value(); if (this->ByteSize.HasByteSize) this->ByteSize.ByteSize = *ByteSize; } diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h index 537a03ec11fc..f4d6c451cbe1 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h @@ -10,6 +10,9 @@ #define LLVM_DEBUGINFO_DWARF_DWARFADDRESSRANGE_H #include "llvm/DebugInfo/DIContext.h" +#include "llvm/Object/ObjectFile.h" +#include +#include #include #include #include diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h index ec5a3cd85266..d449b7bed796 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h @@ -10,10 +10,15 @@ #define LLVM_DEBUGINFO_DWARF_DWARFCOMPILEUNIT_H #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" namespace llvm { +class DWARFContext; +class DWARFDebugAbbrev; +class raw_ostream; +struct DIDumpOptions; +struct DWARFSection; + class DWARFCompileUnit : public DWARFUnit { public: DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section, diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h index e82faf6eeb24..bf591ed554c6 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -9,43 +9,37 @@ #ifndef LLVM_DEBUGINFO_DWARF_DWARFCONTEXT_H #define LLVM_DEBUGINFO_DWARF_DWARFCONTEXT_H -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" -#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h" #include 
"llvm/DebugInfo/DWARF/DWARFDie.h" -#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h" #include "llvm/DebugInfo/DWARF/DWARFObject.h" -#include "llvm/DebugInfo/DWARF/DWARFSection.h" -#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" #include "llvm/Support/Host.h" #include -#include -#include #include namespace llvm { class MCRegisterInfo; class MemoryBuffer; -class raw_ostream; +class AppleAcceleratorTable; +class DWARFCompileUnit; +class DWARFDebugAbbrev; +class DWARFDebugAranges; +class DWARFDebugFrame; +class DWARFDebugLoc; +class DWARFDebugMacro; +class DWARFDebugNames; +class DWARFGdbIndex; +class DWARFTypeUnit; +class DWARFUnitIndex; /// DWARFContext /// This data structure is the top level entity that deals with dwarf debug @@ -124,7 +118,7 @@ public: WithColor::defaultErrorHandler, std::function WarningHandler = WithColor::defaultWarningHandler); - ~DWARFContext(); + ~DWARFContext() override; DWARFContext(DWARFContext &) = delete; DWARFContext &operator=(DWARFContext &) = delete; @@ -339,6 +333,10 @@ public: getLineTableForUnit(DWARFUnit *U, function_ref RecoverableErrorHandler); + // Clear the line table object corresponding to a compile unit for memory + // management purpose. When it's referred to again, it'll be re-populated. + void clearLineTableForUnit(DWARFUnit *U); + DataExtractor getStringExtractor() const { return DataExtractor(DObj->getStrSection(), false, 0); } @@ -366,6 +364,8 @@ public: DILineInfo getLineInfoForAddress( object::SectionedAddress Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override; + DILineInfo + getLineInfoForDataAddress(object::SectionedAddress Address) override; DILineInfoTable getLineInfoForAddressRange( object::SectionedAddress Address, uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h index e1407ddd89eb..67d9ce1476dd 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h @@ -11,17 +11,14 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include -#include #include namespace llvm { -class Error; class raw_ostream; +class DWARFDataExtractor; /// A class representing an address table as specified in DWARF v5. 
/// The table consists of a header followed by an array of address values from diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h index 65334b4a4976..760d8826771c 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h @@ -10,7 +10,7 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGARANGESET_H #include "llvm/ADT/iterator_range.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Support/Error.h" #include #include @@ -18,6 +18,7 @@ namespace llvm { class raw_ostream; +class DWARFDataExtractor; class DWARFDebugArangeSet { public: diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h index 216dd1e4defc..068674cfae5c 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h @@ -10,11 +10,13 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGARANGES_H #include "llvm/ADT/DenseSet.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include #include namespace llvm { +class DWARFDataExtractor; +class Error; class DWARFContext; @@ -26,7 +28,8 @@ public: private: void clear(); void extract(DWARFDataExtractor DebugArangesData, - function_ref RecoverableErrorHandler); + function_ref RecoverableErrorHandler, + function_ref WarningHandler); /// Call appendRange multiple times and then call construct. void appendRange(uint64_t CUOffset, uint64_t LowPC, uint64_t HighPC); diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h index 8167aaaeffb5..48df091412bf 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -13,7 +13,6 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/iterator.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/Support/Error.h" #include @@ -23,6 +22,9 @@ namespace llvm { class raw_ostream; +class DWARFDataExtractor; +class MCRegisterInfo; +struct DIDumpOptions; namespace dwarf { @@ -130,7 +132,7 @@ public: uint32_t getRegister() const { return RegNum; } int32_t getOffset() const { return Offset; } uint32_t getAddressSpace() const { - assert(Kind == RegPlusOffset && AddrSpace.hasValue()); + assert(Kind == RegPlusOffset && AddrSpace); return *AddrSpace; } int32_t getConstant() const { return Offset; } @@ -259,7 +261,7 @@ public: UnwindRow() : CFAValue(UnwindLocation::createUnspecified()) {} /// Returns true if the address is valid in this object. - bool hasAddress() const { return Address.hasValue(); } + bool hasAddress() const { return Address.has_value(); } /// Get the address for this row. 
/// diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h index 6bdd23900182..9befcc0c4182 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h @@ -11,12 +11,12 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include namespace llvm { class DWARFUnit; +class DWARFDataExtractor; /// DWARFDebugInfoEntry - A DIE with only the minimum required data. class DWARFDebugInfoEntry { diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h index ee15b6d4112d..86f90135f8d4 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h @@ -11,12 +11,10 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" -#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Support/MD5.h" #include "llvm/Support/Path.h" #include @@ -26,7 +24,6 @@ namespace llvm { -class DWARFUnit; class raw_ostream; class DWARFDebugLine { @@ -307,6 +304,7 @@ public: getOrParseLineTable(DWARFDataExtractor &DebugLineData, uint64_t Offset, const DWARFContext &Ctx, const DWARFUnit *U, function_ref RecoverableErrorHandler); + void clearLineTable(uint64_t Offset); /// Helper to allow for parsing of an entire .debug_line section in sequence. class SectionParser { diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h index 1794f6649827..90e009e514d4 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h @@ -11,10 +11,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Errc.h" #include @@ -22,6 +19,12 @@ namespace llvm { class DWARFUnit; class MCRegisterInfo; class raw_ostream; +class DWARFObject; +struct DIDumpOptions; +struct DWARFLocationExpression; +namespace object { +struct SectionedAddress; +} /// A single location within a location list. Entries are stored in the DWARF5 /// form even if they originally come from a DWARF<=4 location list. 
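Several hunks in this section mechanically migrate callers from llvm::Optional's old accessors (hasValue, getValue, getValueOr) to the std::optional-style spellings. A small sketch of the before/after idioms; std::optional is used here as a stand-in for llvm::Optional, which was converging on the same interface:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

int main() {
  std::optional<uint32_t> AddrSpace = 7;

  // hasValue() becomes has_value(), or just contextual conversion to bool:
  assert(AddrSpace.has_value());
  assert(AddrSpace); // the shorter form the patch prefers inside asserts

  // getValue() becomes operator* (or value()):
  uint32_t AS = *AddrSpace;

  // getValueOr(Default) becomes value_or(Default):
  std::optional<uint64_t> Form; // empty
  uint64_t V = Form.value_or(0);

  return (AS == 7 && V == 0) ? 0 : 1;
}
```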
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h index f1768a1ddab5..d98cf9a6045a 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h @@ -12,7 +12,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h index cb347615868b..6c82bbfe74f7 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h @@ -10,16 +10,17 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGPUBTABLE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include "llvm/DebugInfo/DWARF/DWARFObject.h" #include #include namespace llvm { class raw_ostream; +class DWARFDataExtractor; +class Error; /// Represents structure for holding and parsing .debug_pub* tables. class DWARFDebugPubTable { diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index 0d9f37c5610b..f4aeac1bb9db 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -10,14 +10,16 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include #include #include namespace llvm { class raw_ostream; +class DWARFDataExtractor; +namespace object { +struct SectionedAddress; +} class DWARFDebugRangeList { public: diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h index 2baa6493f709..13f018f53fa1 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h @@ -10,11 +10,9 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRNGLISTS_H #include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" +#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFListTable.h" #include @@ -23,6 +21,11 @@ namespace llvm { class Error; class raw_ostream; class DWARFUnit; +class DWARFDataExtractor; +struct DIDumpOptions; +namespace object { +struct SectionedAddress; +} /// A class representing a single range list entry. 
struct RangeListEntry : public DWARFListEntryBase { diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index f731d440a35b..149c5ef4e493 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -18,7 +18,7 @@ #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFAttribute.h" #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" #include #include #include @@ -280,6 +280,13 @@ public: /// \returns an iterator range for the attributes of the current DIE. iterator_range<attribute_iterator> attributes() const; + /// Gets the type size (in bytes) for this DIE. + /// + /// \param PointerSize the pointer size of the containing CU. + /// \returns the size of the type if this is a type DIE, or if this DIE + /// contains a DW_AT_type; None otherwise. + Optional<uint64_t> getTypeSize(uint64_t PointerSize); + class iterator; iterator begin() const; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h index b694eeacfd9d..c4d81047a4dc 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h @@ -9,16 +9,15 @@ #ifndef LLVM_DEBUGINFO_DWARF_DWARFEXPRESSION_H #define LLVM_DEBUGINFO_DWARF_DWARFEXPRESSION_H -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/DataExtractor.h" namespace llvm { class DWARFUnit; +struct DIDumpOptions; class MCRegisterInfo; class raw_ostream; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 130cdb8800a9..c2c1df5b590b 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -14,12 +14,14 @@ #include "llvm/ADT/Optional.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/Support/DataExtractor.h" #include namespace llvm { class DWARFContext; +class DWARFObject; +class DWARFDataExtractor; class DWARFUnit; class raw_ostream; @@ -234,7 +236,7 @@ inline Optional<uint64_t> toUnsigned(const Optional<DWARFFormValue> &V) { /// value or the form value's encoding wasn't an unsigned constant form. inline uint64_t toUnsigned(const Optional<DWARFFormValue> &V, uint64_t Default) { - return toUnsigned(V).getValueOr(Default); + return toUnsigned(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract a reference. @@ -256,7 +258,7 @@ inline Optional<uint64_t> toReference(const Optional<DWARFFormValue> &V) { /// value or the form value's encoding wasn't a reference form. inline uint64_t toReference(const Optional<DWARFFormValue> &V, uint64_t Default) { - return toReference(V).getValueOr(Default); + return toReference(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract a signed constant. @@ -277,7 +279,7 @@ inline Optional<int64_t> toSigned(const Optional<DWARFFormValue> &V) { /// \returns the extracted signed integer value or Default if the V doesn't /// have a value or the form value's encoding wasn't a signed integer form.
inline int64_t toSigned(const Optional<DWARFFormValue> &V, int64_t Default) { - return toSigned(V).getValueOr(Default); + return toSigned(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract an address. @@ -305,7 +307,7 @@ toSectionedAddress(const Optional<DWARFFormValue> &V) { /// \returns the extracted address value or Default if the V doesn't have a /// value or the form value's encoding wasn't an address form. inline uint64_t toAddress(const Optional<DWARFFormValue> &V, uint64_t Default) { - return toAddress(V).getValueOr(Default); + return toAddress(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract a section offset. @@ -327,7 +329,7 @@ inline Optional<uint64_t> toSectionOffset(const Optional<DWARFFormValue> &V) { /// have a value or the form value's encoding wasn't a section offset form. inline uint64_t toSectionOffset(const Optional<DWARFFormValue> &V, uint64_t Default) { - return toSectionOffset(V).getValueOr(Default); + return toSectionOffset(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract block data. diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h index 38cd42ddb883..6b23c4e57d95 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h @@ -11,13 +11,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/DataExtractor.h" #include #include namespace llvm { class raw_ostream; +class DataExtractor; class DWARFGdbIndex { uint32_t Version; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h index 515623cedc94..84c8d71b04fc 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h index 3add711943d0..fef59c5e95f8 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h @@ -10,6 +10,7 @@ #define LLVM_DEBUGINFO_DWARF_DWARFRELOCMAP_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Object/RelocationResolver.h" #include diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h new file mode 100644 index 000000000000..e05271740e61 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h @@ -0,0 +1,67 @@ +//===- DWARFTypePrinter.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_DWARF_DWARFTYPEPRINTER_H +#define LLVM_DEBUGINFO_DWARF_DWARFTYPEPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" + +#include + +namespace llvm { + +class raw_ostream; + +// FIXME: We should have pretty printers per language. Currently we print +// everything as if it were C++ and fall back to the TAG type name.
+struct DWARFTypePrinter { + raw_ostream &OS; + bool Word = true; + bool EndedWithTemplate = false; + + DWARFTypePrinter(raw_ostream &OS) : OS(OS) {} + + /// Dump the name encoded in the type tag. + void appendTypeTagName(dwarf::Tag T); + + void appendArrayType(const DWARFDie &D); + + DWARFDie skipQualifiers(DWARFDie D); + + bool needsParens(DWARFDie D); + + void appendPointerLikeTypeBefore(DWARFDie D, DWARFDie Inner, StringRef Ptr); + + DWARFDie appendUnqualifiedNameBefore(DWARFDie D, + std::string *OriginalFullName = nullptr); + + void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner, + bool SkipFirstParamIfArtificial = false); + void appendQualifiedName(DWARFDie D); + DWARFDie appendQualifiedNameBefore(DWARFDie D); + bool appendTemplateParameters(DWARFDie D, bool *FirstParameter = nullptr); + void decomposeConstVolatile(DWARFDie &N, DWARFDie &T, DWARFDie &C, + DWARFDie &V); + void appendConstVolatileQualifierAfter(DWARFDie N); + void appendConstVolatileQualifierBefore(DWARFDie N); + + /// Recursively append the DIE type name when applicable. + void appendUnqualifiedName(DWARFDie D, + std::string *OriginalFullName = nullptr); + + void appendSubroutineNameAfter(DWARFDie D, DWARFDie Inner, + bool SkipFirstParamIfArtificial, bool Const, + bool Volatile); + void appendScopes(DWARFDie D); +}; + +} // namespace llvm + +#endif // LLVM_DEBUGINFO_DWARF_DWARFTYPEPRINTER_H diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h index c95bdcbd8a43..85ec6fd86ade 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h @@ -11,12 +11,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" -#include "llvm/Support/DataExtractor.h" #include namespace llvm { +struct DIDumpOptions; class DWARFContext; class DWARFDebugAbbrev; struct DWARFSection; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index b96a4c19758f..9188865b4d77 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -9,28 +9,26 @@ #ifndef LLVM_DEBUGINFO_DWARF_DWARFUNIT_H #define LLVM_DEBUGINFO_DWARF_DWARFUNIT_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" -#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" -#include "llvm/DebugInfo/DWARF/DWARFSection.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Support/DataExtractor.h" -#include #include #include #include #include #include +#include #include #include @@ -40,6 +38,12 @@ class DWARFAbbreviationDeclarationSet; class DWARFContext; class DWARFDebugAbbrev; class DWARFUnit; +class DWARFDebugRangeList; +class DWARFLocationTable; +class DWARFObject; +class raw_ostream; +struct DIDumpOptions; +struct DWARFSection; /// Base 
class describing the header of any kind of "unit." Some information /// is specific to certain unit types. We separate this class out so we can @@ -238,6 +242,11 @@ class DWARFUnit { /// std::map::upper_bound for address range lookup. std::map<uint64_t, std::pair<uint64_t, DWARFDie>> AddrDieMap; + /// Map from the location (interpreted DW_AT_location) of a DW_TAG_variable, + /// to the end address and the corresponding DIE. + std::map<uint64_t, std::pair<uint64_t, DWARFDie>> VariableDieMap; + DenseSet<uint64_t> RootsParsedForVariables; + using die_iterator_range = iterator_range<std::vector<DWARFDebugInfoEntry>::iterator>; @@ -320,6 +329,9 @@ public: /// Recursively update address to Die map. void updateAddressDieMap(DWARFDie Die); + /// Recursively update address to variable Die map. + void updateVariableDieMap(DWARFDie Die); + void setRangesSection(const DWARFSection *RS, uint64_t Base) { RangeSection = RS; RangeSectionBase = Base; @@ -434,6 +446,10 @@ public: /// cleared. DWARFDie getSubroutineForAddress(uint64_t Address); + /// Returns variable DIE for the address provided. The pointer is alive as + /// long as parsed compile unit DIEs are not cleared. + DWARFDie getVariableForAddress(uint64_t Address); + /// getInlinedChainForAddress - fetches inlined chain for a given address. /// Returns empty chain if there is no subprogram containing address. The /// chain is valid as long as parsed compile unit DIEs are not cleared. diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h index edea59e474cf..b5e191ba7def 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h @@ -11,13 +11,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/DataExtractor.h" #include #include namespace llvm { class raw_ostream; +class DataExtractor; /// The enum of section identifiers to be used in internal interfaces. /// @@ -64,6 +64,25 @@ enum DWARFSectionKind { DW_SECT_EXT_MACINFO = 10, }; +inline const char *toString(DWARFSectionKind Kind) { + switch (Kind) { + case DW_SECT_EXT_unknown: + return "Unknown DW_SECT value 0"; +#define STRINGIZE(X) #X +#define HANDLE_DW_SECT(ID, NAME) \ + case DW_SECT_##NAME: \ + return "DW_SECT_" STRINGIZE(NAME); +#include "llvm/BinaryFormat/Dwarf.def" + case DW_SECT_EXT_TYPES: + return "DW_SECT_TYPES"; + case DW_SECT_EXT_LOC: + return "DW_SECT_LOC"; + case DW_SECT_EXT_MACINFO: + return "DW_SECT_MACINFO"; + } + llvm_unreachable("unknown DWARFSectionKind"); +} + /// Convert the internal value for a section kind to an on-disk value. /// /// The conversion depends on the version of the index section.
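The new toString(DWARFSectionKind) above gets its cases from the HANDLE_DW_SECT X-macro in Dwarf.def. A self-contained sketch of the same pattern; the FOR_EACH_SECT list and section names here are stand-ins, not the real Dwarf.def contents:

#include <iostream>

// Stand-in for llvm/BinaryFormat/Dwarf.def: the X-macro expands once per
// section kind it lists.
#define FOR_EACH_SECT(X) X(INFO) X(ABBREV) X(LINE)

enum SectionKind {
#define HANDLE_SECT(NAME) SECT_##NAME,
  FOR_EACH_SECT(HANDLE_SECT)
#undef HANDLE_SECT
};

// The same stringizing trick the new toString(DWARFSectionKind) uses.
#define STRINGIZE(X) #X
const char *toString(SectionKind K) {
  switch (K) {
#define HANDLE_SECT(NAME) case SECT_##NAME: return "SECT_" STRINGIZE(NAME);
  FOR_EACH_SECT(HANDLE_SECT)
#undef HANDLE_SECT
  }
  return "unknown";
}

int main() { std::cout << toString(SECT_LINE) << '\n'; } // prints SECT_LINE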
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 505686bfbf59..1f1ebe943238 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -12,9 +12,9 @@ #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" +#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" -#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include #include #include @@ -22,13 +22,14 @@ namespace llvm { class raw_ostream; struct DWARFAddressRange; +class DWARFUnit; +class DWARFUnitVector; struct DWARFAttribute; class DWARFContext; class DWARFDataExtractor; class DWARFDebugAbbrev; class DataExtractor; struct DWARFSection; -class DWARFUnit; /// A class that verifies DWARF debug information given a DWARF Context. class DWARFVerifier { @@ -151,12 +152,15 @@ private: /// section. /// /// \param S The DWARF Section to verify. - /// \param SectionKind The object-file section kind that S comes from. /// /// \returns The number of errors that occurred during verification. unsigned verifyUnitSection(const DWARFSection &S); unsigned verifyUnits(const DWARFUnitVector &Units); + unsigned verifyIndexes(const DWARFObject &DObj); + unsigned verifyIndex(StringRef Name, DWARFSectionKind SectionKind, + StringRef Index); + /// Verifies that a call site entry is nested within a subprogram with a /// DW_AT_call attribute. /// @@ -301,6 +305,24 @@ public: /// \returns true if all sections verify successfully, false otherwise. bool handleDebugInfo(); + /// Verify the information in the .debug_cu_index section. + /// + /// Any errors are reported to the stream that this object was + /// constructed with. + /// + /// \returns true if the .debug_cu_index verifies successfully, false + /// otherwise. + bool handleDebugCUIndex(); + + /// Verify the information in the .debug_tu_index section. + /// + /// Any errors are reported to the stream that this object was + /// constructed with. + /// + /// \returns true if the .debug_tu_index verifies successfully, false + /// otherwise. + bool handleDebugTUIndex(); + /// Verify the information in the .debug_line section. /// /// Any errors are reported to the stream that this object was diff --git a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h index 32fc54b14796..b8d7199f2d87 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h +++ b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h @@ -10,7 +10,7 @@ #define LLVM_DEBUGINFO_GSYM_DWARFTRANSFORMER_H #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/GSYM/ExtractRanges.h b/llvm/include/llvm/DebugInfo/GSYM/ExtractRanges.h new file mode 100644 index 000000000000..9a6568719875 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/GSYM/ExtractRanges.h @@ -0,0 +1,81 @@ +//===- ExtractRanges.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_EXTRACTRANGES_H +#define LLVM_DEBUGINFO_GSYM_EXTRACTRANGES_H + +#include "llvm/ADT/AddressRanges.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +#define HEX8(v) llvm::format_hex(v, 4) +#define HEX16(v) llvm::format_hex(v, 6) +#define HEX32(v) llvm::format_hex(v, 10) +#define HEX64(v) llvm::format_hex(v, 18) + +namespace llvm { +class DataExtractor; +class raw_ostream; + +namespace gsym { + +class FileWriter; + +/// AddressRange objects are encoded and decoded to be relative to a base +/// address. This will be the FunctionInfo's start address if the AddressRange +/// is directly contained in a FunctionInfo, or a base address of the +/// containing parent AddressRange or AddressRanges. This allows address +/// ranges to be efficiently encoded using ULEB128 encodings as we encode the +/// offset and size of each range instead of full addresses. This also makes +/// encoded addresses easy to relocate as we just need to relocate one base +/// address. +/// @{ +AddressRange decodeRange(DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset); +void encodeRange(const AddressRange &Range, FileWriter &O, uint64_t BaseAddr); +/// @} + +/// Skip an address range object in the specified data at the specified +/// offset. +/// +/// \param Data The binary stream to read the data from. +/// +/// \param Offset The byte offset within \a Data. +void skipRange(DataExtractor &Data, uint64_t &Offset); + +/// Address ranges are decoded and encoded to be relative to a base address. +/// See the AddressRange comment for the encode and decode methods for full +/// details. +/// @{ +void decodeRanges(AddressRanges &Ranges, DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset); +void encodeRanges(const AddressRanges &Ranges, FileWriter &O, + uint64_t BaseAddr); +/// @} + +/// Skip a set of address ranges in the specified data at the specified +/// offset. +/// +/// \param Data The binary stream to read the data from. +/// +/// \param Offset The byte offset within \a Data. +/// +/// \returns The number of address ranges that were skipped. +uint64_t skipRanges(DataExtractor &Data, uint64_t &Offset); + +} // namespace gsym + +raw_ostream &operator<<(raw_ostream &OS, const AddressRange &R); + +raw_ostream &operator<<(raw_ostream &OS, const AddressRanges &AR); + +} // namespace llvm + +#endif // LLVM_DEBUGINFO_GSYM_EXTRACTRANGES_H diff --git a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h index 552337f54390..fb48f7f9a93c 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h +++ b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h @@ -10,10 +10,10 @@ #define LLVM_DEBUGINFO_GSYM_FUNCTIONINFO_H #include "llvm/ADT/Optional.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include "llvm/DebugInfo/GSYM/InlineInfo.h" #include "llvm/DebugInfo/GSYM/LineTable.h" #include "llvm/DebugInfo/GSYM/LookupResult.h" -#include "llvm/DebugInfo/GSYM/Range.h" #include "llvm/DebugInfo/GSYM/StringTable.h" #include #include @@ -102,9 +102,7 @@ struct FunctionInfo { /// debug info, we might end up with multiple FunctionInfo objects for the /// same range and we need to be able to tell which one is the better object /// to use.
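// A minimal sketch (not the gsym::FileWriter API) of why the base-relative
// ULEB128 scheme described in ExtractRanges.h above is compact: we emit
// (start - base) and (end - start), which are small integers, instead of two
// full 64-bit addresses. encodeRangeRelative is a hypothetical stand-in for
// encodeRange(Range, O, BaseAddr).
#include <cstdint>
#include <vector>

static void emitULEB128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V)
      Byte |= 0x80; // high bit set: more bytes follow
    Out.push_back(Byte);
  } while (V);
}

static void encodeRangeRelative(uint64_t Start, uint64_t End, uint64_t BaseAddr,
                                std::vector<uint8_t> &Out) {
  emitULEB128(Start - BaseAddr, Out); // small offset, usually 1-2 bytes
  emitULEB128(End - Start, Out);      // size, also small
}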
- bool hasRichInfo() const { - return OptLineTable.hasValue() || Inline.hasValue(); - } + bool hasRichInfo() const { return OptLineTable || Inline; } /// Query if a FunctionInfo object is valid. /// @@ -170,12 +168,9 @@ struct FunctionInfo { uint64_t FuncAddr, uint64_t Addr); - uint64_t startAddress() const { return Range.Start; } - uint64_t endAddress() const { return Range.End; } + uint64_t startAddress() const { return Range.start(); } + uint64_t endAddress() const { return Range.end(); } uint64_t size() const { return Range.size(); } - void setStartAddress(uint64_t Addr) { Range.Start = Addr; } - void setEndAddress(uint64_t Addr) { Range.End = Addr; } - void setSize(uint64_t Size) { Range.End = Range.Start + Size; } void clear() { Range = {0, 0}; @@ -203,8 +198,8 @@ inline bool operator<(const FunctionInfo &LHS, const FunctionInfo &RHS) { return LHS.Range < RHS.Range; // Then sort by inline - if (LHS.Inline.hasValue() != RHS.Inline.hasValue()) - return RHS.Inline.hasValue(); + if (LHS.Inline.has_value() != RHS.Inline.has_value()) + return RHS.Inline.has_value(); return LHS.OptLineTable < RHS.OptLineTable; } diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h index 872ccd4a0b6a..29ad1c18e295 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h @@ -14,11 +14,11 @@ #include #include +#include "llvm/ADT/AddressRanges.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/GSYM/FileEntry.h" #include "llvm/DebugInfo/GSYM/FunctionInfo.h" -#include "llvm/DebugInfo/GSYM/Range.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" diff --git a/llvm/include/llvm/DebugInfo/GSYM/InlineInfo.h b/llvm/include/llvm/DebugInfo/GSYM/InlineInfo.h index 9bcfa5935180..80385116598a 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/InlineInfo.h +++ b/llvm/include/llvm/DebugInfo/GSYM/InlineInfo.h @@ -10,14 +10,13 @@ #define LLVM_DEBUGINFO_GSYM_INLINEINFO_H #include "llvm/ADT/Optional.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include "llvm/DebugInfo/GSYM/LineEntry.h" #include "llvm/DebugInfo/GSYM/LookupResult.h" -#include "llvm/DebugInfo/GSYM/Range.h" #include "llvm/Support/Error.h" #include #include - namespace llvm { class raw_ostream; diff --git a/llvm/include/llvm/DebugInfo/GSYM/LineEntry.h b/llvm/include/llvm/DebugInfo/GSYM/LineEntry.h index b4e7587fc5ee..e68624b21929 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/LineEntry.h +++ b/llvm/include/llvm/DebugInfo/GSYM/LineEntry.h @@ -9,7 +9,7 @@ #ifndef LLVM_DEBUGINFO_GSYM_LINEENTRY_H #define LLVM_DEBUGINFO_GSYM_LINEENTRY_H -#include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" namespace llvm { namespace gsym { diff --git a/llvm/include/llvm/DebugInfo/GSYM/LookupResult.h b/llvm/include/llvm/DebugInfo/GSYM/LookupResult.h index 3dabbce32bb2..44e58f522002 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/LookupResult.h +++ b/llvm/include/llvm/DebugInfo/GSYM/LookupResult.h @@ -9,8 +9,8 @@ #ifndef LLVM_DEBUGINFO_GSYM_LOOKUPRESULT_H #define LLVM_DEBUGINFO_GSYM_LOOKUPRESULT_H +#include "llvm/ADT/AddressRanges.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/GSYM/Range.h" #include #include diff --git a/llvm/include/llvm/DebugInfo/GSYM/Range.h b/llvm/include/llvm/DebugInfo/GSYM/Range.h deleted file mode 100644 index 36ad95602d14..000000000000 --- a/llvm/include/llvm/DebugInfo/GSYM/Range.h +++ /dev/null @@ -1,130 +0,0 @@ 
-//===- Range.h --------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_GSYM_RANGE_H -#define LLVM_DEBUGINFO_GSYM_RANGE_H - -#include "llvm/ADT/Optional.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -#define HEX8(v) llvm::format_hex(v, 4) -#define HEX16(v) llvm::format_hex(v, 6) -#define HEX32(v) llvm::format_hex(v, 10) -#define HEX64(v) llvm::format_hex(v, 18) - -namespace llvm { -class DataExtractor; -class raw_ostream; - -namespace gsym { - -class FileWriter; - -/// A class that represents an address range. The range is specified using -/// a start and an end address. -struct AddressRange { - uint64_t Start; - uint64_t End; - AddressRange() : Start(0), End(0) {} - AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) {} - uint64_t size() const { return End - Start; } - bool contains(uint64_t Addr) const { return Start <= Addr && Addr < End; } - bool intersects(const AddressRange &R) const { - return Start < R.End && R.Start < End; - } - - bool operator==(const AddressRange &R) const { - return Start == R.Start && End == R.End; - } - bool operator!=(const AddressRange &R) const { - return !(*this == R); - } - bool operator<(const AddressRange &R) const { - return std::make_pair(Start, End) < std::make_pair(R.Start, R.End); - } - /// AddressRange objects are encoded and decoded to be relative to a base - /// address. This will be the FunctionInfo's start address if the AddressRange - /// is directly contained in a FunctionInfo, or a base address of the - /// containing parent AddressRange or AddressRanges. This allows address - /// ranges to be efficiently encoded using ULEB128 encodings as we encode the - /// offset and size of each range instead of full addresses. This also makes - /// encoded addresses easy to relocate as we just need to relocate one base - /// address. - /// @{ - void decode(DataExtractor &Data, uint64_t BaseAddr, uint64_t &Offset); - void encode(FileWriter &O, uint64_t BaseAddr) const; - /// @} - - /// Skip an address range object in the specified data a the specified - /// offset. - /// - /// \param Data The binary stream to read the data from. - /// - /// \param Offset The byte offset within \a Data. - static void skip(DataExtractor &Data, uint64_t &Offset); -}; - -raw_ostream &operator<<(raw_ostream &OS, const AddressRange &R); - -/// The AddressRanges class helps normalize address range collections. -/// This class keeps a sorted vector of AddressRange objects and can perform -/// insertions and searches efficiently. The address ranges are always sorted -/// and never contain any invalid or empty address ranges. This allows us to -/// emit address ranges into the GSYM file efficiently. Intersecting address -/// ranges are combined during insertion so that we can emit the most compact -/// representation for address ranges when writing to disk. 
-class AddressRanges { -protected: - using Collection = std::vector; - Collection Ranges; -public: - void clear() { Ranges.clear(); } - bool empty() const { return Ranges.empty(); } - bool contains(uint64_t Addr) const; - bool contains(AddressRange Range) const; - Optional getRangeThatContains(uint64_t Addr) const; - void insert(AddressRange Range); - size_t size() const { return Ranges.size(); } - bool operator==(const AddressRanges &RHS) const { - return Ranges == RHS.Ranges; - } - const AddressRange &operator[](size_t i) const { - assert(i < Ranges.size()); - return Ranges[i]; - } - Collection::const_iterator begin() const { return Ranges.begin(); } - Collection::const_iterator end() const { return Ranges.end(); } - - /// Address ranges are decoded and encoded to be relative to a base address. - /// See the AddressRange comment for the encode and decode methods for full - /// details. - /// @{ - void decode(DataExtractor &Data, uint64_t BaseAddr, uint64_t &Offset); - void encode(FileWriter &O, uint64_t BaseAddr) const; - /// @} - - /// Skip an address range object in the specified data a the specified - /// offset. - /// - /// \param Data The binary stream to read the data from. - /// - /// \param Offset The byte offset within \a Data. - /// - /// \returns The number of address ranges that were skipped. - static uint64_t skip(DataExtractor &Data, uint64_t &Offset); -}; - -raw_ostream &operator<<(raw_ostream &OS, const AddressRanges &AR); - -} // namespace gsym -} // namespace llvm - -#endif // LLVM_DEBUGINFO_GSYM_RANGE_H diff --git a/llvm/include/llvm/DebugInfo/GSYM/StringTable.h b/llvm/include/llvm/DebugInfo/GSYM/StringTable.h index d920335d373e..d9c9ede91be5 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/StringTable.h +++ b/llvm/include/llvm/DebugInfo/GSYM/StringTable.h @@ -10,7 +10,7 @@ #define LLVM_DEBUGINFO_GSYM_STRINGTABLE_H #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h b/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h index 1a03d42ded92..2ac18a8efaba 100644 --- a/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h +++ b/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h @@ -11,7 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" -#include "llvm/DebugInfo/MSF/MSFCommon.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" #include @@ -22,6 +22,8 @@ namespace llvm { class FileBufferByteStream; namespace msf { +struct MSFLayout; + class MSFBuilder { public: /// Create a new `MSFBuilder`. 
diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h b/llvm/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h index bfa67d39bc76..6cd5c8d1d668 100644 --- a/llvm/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h +++ b/llvm/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h @@ -9,6 +9,7 @@ #ifndef LLVM_DEBUGINFO_PDB_IPDBENUMCHILDREN_H #define LLVM_DEBUGINFO_PDB_IPDBENUMCHILDREN_H +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include #include #include diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h index 70ef4d058082..1ecae5c32509 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h @@ -10,16 +10,16 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_DBIMODULEDESCRIPTOR_H #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" #include namespace llvm { +template struct VarStreamArrayExtractor; namespace pdb { - +struct ModuleInfoHeader; +struct SectionContrib; class DbiModuleDescriptor { friend class DbiStreamBuilder; diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h index 8a49f46320b0..287f319e01b0 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h @@ -9,13 +9,12 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_DBIMODULEDESCRIPTORBUILDER_H #define LLVM_DEBUGINFO_PDB_NATIVE_DBIMODULEDESCRIPTORBUILDER_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" +#include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" #include #include @@ -23,9 +22,8 @@ namespace llvm { class BinaryStreamWriter; - namespace codeview { -class DebugSubsectionRecordBuilder; +class DebugSubsection; } namespace msf { diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h index 0bdb27a0a991..3f60130f5752 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h @@ -9,14 +9,10 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_DBISTREAM_H #define LLVM_DEBUGINFO_PDB_NATIVE_DBISTREAM_H -#include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h" -#include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamRef.h" @@ -24,13 +20,19 @@ #include "llvm/Support/Error.h" namespace llvm { +class BinaryStream; namespace object { struct FpoData; struct coff_section; } - +namespace msf 
{ +class MappedBlockStream; +} namespace pdb { -class DbiStreamBuilder; +struct DbiStreamHeader; +struct SecMapEntry; +struct SectionContrib2; +struct SectionContrib; class PDBFile; class ISectionContribVisitor; diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h index ef441d433040..2f99aa942a05 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h @@ -10,35 +10,33 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_DBISTREAMBUILDER_H #include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" #include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamReader.h" -#include "llvm/Support/Endian.h" +#include "llvm/Support/BinaryStreamRef.h" namespace llvm { + +class BinaryStreamWriter; namespace codeview { struct FrameData; } namespace msf { class MSFBuilder; -} -namespace object { -struct coff_section; -struct FpoData; +struct MSFLayout; } namespace pdb { -class DbiStream; -struct DbiStreamHeader; class DbiModuleDescriptorBuilder; -class PDBFile; class DbiStreamBuilder { public: @@ -134,7 +132,7 @@ private: std::vector SectionMap; std::array, (int)DbgHeaderType::Max> DbgStreams; }; -} +} // namespace pdb } #endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/EnumTables.h b/llvm/include/llvm/DebugInfo/PDB/Native/EnumTables.h index 60cd494639c1..dcc67f1e4a8c 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/EnumTables.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/EnumTables.h @@ -10,9 +10,9 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_ENUMTABLES_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/Support/ScopedPrinter.h" namespace llvm { +template struct EnumEntry; namespace pdb { ArrayRef> getOMFSegMapDescFlagNames(); } diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h new file mode 100644 index 000000000000..ed745eaf9727 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h @@ -0,0 +1,133 @@ +//===- FormatUtil.h ------------------------------------------- *- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_PDB_NATIVE_FORMATUTIL_H +#define LLVM_DEBUGINFO_PDB_NATIVE_FORMATUTIL_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/FormatAdapters.h" +#include "llvm/Support/FormatVariadic.h" + +#include +#include + +namespace llvm { +namespace pdb { + +#define PUSH_MASKED_FLAG(Enum, Mask, TheOpt, Value, Text) \ + if (Enum::TheOpt == (Value & Mask)) \ + Opts.push_back(Text); + +#define PUSH_FLAG(Enum, TheOpt, Value, Text) \ + PUSH_MASKED_FLAG(Enum, Enum::TheOpt, TheOpt, Value, Text) + +#define RETURN_CASE(Enum, X, Ret) \ + case Enum::X: \ + return Ret; + +template std::string formatUnknownEnum(T Value) { + return formatv("unknown ({0})", static_cast>(Value)) + .str(); +} + +std::string formatSegmentOffset(uint16_t Segment, uint32_t Offset); + +enum class CharacteristicStyle { + HeaderDefinition, // format as windows header definition + Descriptive, // format as human readable words +}; +std::string formatSectionCharacteristics( + uint32_t IndentLevel, uint32_t C, uint32_t FlagsPerLine, + StringRef Separator, + CharacteristicStyle Style = CharacteristicStyle::HeaderDefinition); + +std::string typesetItemList(ArrayRef Opts, uint32_t IndentLevel, + uint32_t GroupSize, StringRef Sep); + +std::string typesetStringList(uint32_t IndentLevel, + ArrayRef Strings); + +std::string formatChunkKind(codeview::DebugSubsectionKind Kind, + bool Friendly = true); +std::string formatSymbolKind(codeview::SymbolKind K); +std::string formatTypeLeafKind(codeview::TypeLeafKind K); + +/// Returns the number of digits in the given integer. 
+inline int NumDigits(uint64_t N) { + if (N < 10ULL) + return 1; + if (N < 100ULL) + return 2; + if (N < 1000ULL) + return 3; + if (N < 10000ULL) + return 4; + if (N < 100000ULL) + return 5; + if (N < 1000000ULL) + return 6; + if (N < 10000000ULL) + return 7; + if (N < 100000000ULL) + return 8; + if (N < 1000000000ULL) + return 9; + if (N < 10000000000ULL) + return 10; + if (N < 100000000000ULL) + return 11; + if (N < 1000000000000ULL) + return 12; + if (N < 10000000000000ULL) + return 13; + if (N < 100000000000000ULL) + return 14; + if (N < 1000000000000000ULL) + return 15; + if (N < 10000000000000000ULL) + return 16; + if (N < 100000000000000000ULL) + return 17; + if (N < 1000000000000000000ULL) + return 18; + if (N < 10000000000000000000ULL) + return 19; + return 20; +} + +namespace detail { +template +struct EndianAdapter final + : public FormatAdapter> { + using EndianType = + support::detail::packed_endian_specific_integral; + + explicit EndianAdapter(EndianType &&Item) + : FormatAdapter(std::move(Item)) {} + + void format(llvm::raw_ostream &Stream, StringRef Style) override { + format_provider::format(static_cast(this->Item), Stream, Style); + } +}; +} // namespace detail + +template +detail::EndianAdapter +fmtle(support::detail::packed_endian_specific_integral + Value) { + return detail::EndianAdapter(std::move(Value)); +} +} // namespace pdb +} // namespace llvm +#endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h index 9530a15849d5..28a72c887f25 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h @@ -10,18 +10,20 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_GSISTREAMBUILDER_H #include "llvm/ADT/DenseSet.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryItemStream.h" #include "llvm/Support/BinaryStreamRef.h" -#include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" namespace llvm { +namespace codeview { +class ConstantSym; +class DataSym; +class ProcRefSym; +} // namespace codeview +template struct BinaryItemTraits; template <> struct BinaryItemTraits { static size_t length(const codeview::CVSymbol &Item) { diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h index 2b74babd6ab9..2988bef4a75b 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h @@ -10,18 +10,18 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_GLOBALSSTREAM_H #include "llvm/ADT/iterator.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/RawConstants.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/BinaryStreamArray.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" namespace llvm { +class BinaryStreamReader; +namespace msf { +class MappedBlockStream; +} namespace pdb { -class DbiStream; -class PDBFile; class SymbolStream; /// 
Iterator over hash records producing symbol record offsets. Abstracts away @@ -81,7 +81,7 @@ private: GSIHashTable GlobalsTable; std::unique_ptr Stream; }; -} +} // namespace pdb } #endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h b/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h index 474bd796b2b3..7924cffd640f 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h @@ -23,9 +23,6 @@ namespace llvm { -class BinaryStreamReader; -class BinaryStreamWriter; - namespace pdb { Error readSparseBitVector(BinaryStreamReader &Stream, SparseBitVector<> &V); diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h index 67db92b64913..625bab6a4378 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h @@ -9,22 +9,18 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_INFOSTREAM_H #define LLVM_DEBUGINFO_PDB_NATIVE_INFOSTREAM_H -#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringMap.h" #include "llvm/DebugInfo/CodeView/GUID.h" -#include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" -#include "llvm/DebugInfo/PDB/PDBTypes.h" +#include "llvm/Support/BinaryStream.h" +#include "llvm/Support/BinaryStreamRef.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" namespace llvm { namespace pdb { -class InfoStreamBuilder; -class PDBFile; - +struct InfoStreamHeader; class InfoStream { friend class InfoStreamBuilder; diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h index 4952173c5873..2d5088a3bd42 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h @@ -12,19 +12,17 @@ #include "llvm/ADT/Optional.h" #include "llvm/Support/Error.h" -#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" -#include "llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { class WritableBinaryStreamRef; namespace msf { class MSFBuilder; +struct MSFLayout; } namespace pdb { -class PDBFile; class NamedStreamMap; class InfoStreamBuilder { @@ -70,7 +68,7 @@ private: NamedStreamMap &NamedStreams; }; -} +} // namespace pdb } #endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/InjectedSourceStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/InjectedSourceStream.h index b2ba81a88254..259c924d9d7c 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/InjectedSourceStream.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/InjectedSourceStream.h @@ -9,15 +9,14 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_INJECTEDSOURCESTREAM_H #define LLVM_DEBUGINFO_PDB_NATIVE_INJECTEDSOURCESTREAM_H +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/HashTable.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/Support/Error.h" namespace llvm { -namespace msf { -class MappedBlockStream; -} namespace pdb { +struct SrcHeaderBlockEntry; +struct SrcHeaderBlockHeader; class PDBStringTable; class InjectedSourceStream { @@ -38,6 +37,6 @@ private: HashTable InjectedSourceTable; }; } -} +} // namespace llvm #endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/InputFile.h 
b/llvm/include/llvm/DebugInfo/PDB/Native/InputFile.h new file mode 100644 index 000000000000..c0d722960540 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/PDB/Native/InputFile.h @@ -0,0 +1,231 @@ +//===- InputFile.h -------------------------------------------- *- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_PDB_NATIVE_INPUTFILE_H +#define LLVM_DEBUGINFO_PDB_NATIVE_INPUTFILE_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/iterator.h" +#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" +#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" +#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace codeview { +class LazyRandomTypeCollection; +} +namespace object { +class COFFObjectFile; +} // namespace object + +namespace pdb { +class InputFile; +class LinePrinter; +class PDBFile; +class NativeSession; +class SymbolGroupIterator; +class SymbolGroup; + +class InputFile { + InputFile(); + + std::unique_ptr PdbSession; + object::OwningBinary CoffObject; + std::unique_ptr UnknownFile; + PointerUnion PdbOrObj; + + using TypeCollectionPtr = std::unique_ptr; + + TypeCollectionPtr Types; + TypeCollectionPtr Ids; + + enum TypeCollectionKind { kTypes, kIds }; + codeview::LazyRandomTypeCollection & + getOrCreateTypeCollection(TypeCollectionKind Kind); + +public: + InputFile(PDBFile *Pdb) { PdbOrObj = Pdb; } + InputFile(object::COFFObjectFile *Obj) { PdbOrObj = Obj; } + InputFile(MemoryBuffer *Buffer) { PdbOrObj = Buffer; } + ~InputFile(); + InputFile(InputFile &&Other) = default; + + static Expected open(StringRef Path, + bool AllowUnknownFile = false); + + PDBFile &pdb(); + const PDBFile &pdb() const; + object::COFFObjectFile &obj(); + const object::COFFObjectFile &obj() const; + MemoryBuffer &unknown(); + const MemoryBuffer &unknown() const; + + StringRef getFilePath() const; + + bool hasTypes() const; + bool hasIds() const; + + codeview::LazyRandomTypeCollection &types(); + codeview::LazyRandomTypeCollection &ids(); + + iterator_range symbol_groups(); + SymbolGroupIterator symbol_groups_begin(); + SymbolGroupIterator symbol_groups_end(); + + bool isPdb() const; + bool isObj() const; + bool isUnknown() const; +}; + +class SymbolGroup { + friend class SymbolGroupIterator; + +public: + explicit SymbolGroup(InputFile *File, uint32_t GroupIndex = 0); + + Expected getNameFromStringTable(uint32_t Offset) const; + Expected getNameFromChecksums(uint32_t Offset) const; + + void formatFromFileName(LinePrinter &Printer, StringRef File, + bool Append = false) const; + + void formatFromChecksumsOffset(LinePrinter &Printer, uint32_t Offset, + bool Append = false) const; + + StringRef name() const; + + codeview::DebugSubsectionArray getDebugSubsections() const { + return Subsections; + } + const ModuleDebugStreamRef &getPdbModuleStream() const; + + const InputFile &getFile() const { return *File; } + InputFile &getFile() { return *File; } + + bool hasDebugStream() const { return DebugStream != nullptr; } + +private: + void initializeForPdb(uint32_t Modi); + void 
updatePdbModi(uint32_t Modi); + void updateDebugS(const codeview::DebugSubsectionArray &SS); + + void rebuildChecksumMap(); + InputFile *File = nullptr; + StringRef Name; + codeview::DebugSubsectionArray Subsections; + std::shared_ptr DebugStream; + codeview::StringsAndChecksumsRef SC; + StringMap ChecksumsByFile; +}; + +class SymbolGroupIterator + : public iterator_facade_base { +public: + SymbolGroupIterator(); + explicit SymbolGroupIterator(InputFile &File); + SymbolGroupIterator(const SymbolGroupIterator &Other) = default; + SymbolGroupIterator &operator=(const SymbolGroupIterator &R) = default; + + const SymbolGroup &operator*() const; + SymbolGroup &operator*(); + + bool operator==(const SymbolGroupIterator &R) const; + SymbolGroupIterator &operator++(); + +private: + void scanToNextDebugS(); + bool isEnd() const; + + uint32_t Index = 0; + Optional SectionIter; + SymbolGroup Value; +}; + +Expected +getModuleDebugStream(PDBFile &File, StringRef &ModuleName, uint32_t Index); +Expected getModuleDebugStream(PDBFile &File, + uint32_t Index); + +bool shouldDumpSymbolGroup(uint32_t Idx, const SymbolGroup &Group, + const FilterOptions &Filters); + +// TODO: Change these callbacks to be function_refs (de-templatify them). +template +Error iterateOneModule(InputFile &File, const PrintScope &HeaderScope, + const SymbolGroup &SG, uint32_t Modi, + CallbackT Callback) { + HeaderScope.P.formatLine( + "Mod {0:4} | `{1}`: ", + fmt_align(Modi, AlignStyle::Right, HeaderScope.LabelWidth), SG.name()); + + AutoIndent Indent(HeaderScope); + return Callback(Modi, SG); +} + +template +Error iterateSymbolGroups(InputFile &Input, const PrintScope &HeaderScope, + CallbackT Callback) { + AutoIndent Indent(HeaderScope); + + FilterOptions Filters = HeaderScope.P.getFilters(); + if (Filters.DumpModi) { + uint32_t Modi = *Filters.DumpModi; + SymbolGroup SG(&Input, Modi); + return iterateOneModule(Input, withLabelWidth(HeaderScope, NumDigits(Modi)), + SG, Modi, Callback); + } + + uint32_t I = 0; + + for (const auto &SG : Input.symbol_groups()) { + if (shouldDumpSymbolGroup(I, SG, Filters)) + if (auto Err = + iterateOneModule(Input, withLabelWidth(HeaderScope, NumDigits(I)), + SG, I, Callback)) + return Err; + + ++I; + } + return Error::success(); +} + +template +Error iterateModuleSubsections( + InputFile &File, const PrintScope &HeaderScope, + llvm::function_ref + Callback) { + + return iterateSymbolGroups( + File, HeaderScope, [&](uint32_t Modi, const SymbolGroup &SG) -> Error { + for (const auto &SS : SG.getDebugSubsections()) { + SubsectionT Subsection; + + if (SS.kind() != Subsection.kind()) + continue; + + BinaryStreamReader Reader(SS.getRecordData()); + if (auto Err = Subsection.initialize(Reader)) + continue; + if (auto Err = Callback(Modi, SG, Subsection)) + return Err; + } + return Error::success(); + }); +} + +} // namespace pdb +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/LinePrinter.h b/llvm/include/llvm/DebugInfo/PDB/Native/LinePrinter.h new file mode 100644 index 000000000000..0db21309f593 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/PDB/Native/LinePrinter.h @@ -0,0 +1,185 @@ +//===- LinePrinter.h ------------------------------------------ *- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_PDB_NATIVE_LINEPRINTER_H +#define LLVM_DEBUGINFO_PDB_NATIVE_LINEPRINTER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" +#include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" + +#include + +// Container for filter options to control which elements will be printed. +struct FilterOptions { + std::list ExcludeTypes; + std::list ExcludeSymbols; + std::list ExcludeCompilands; + std::list IncludeTypes; + std::list IncludeSymbols; + std::list IncludeCompilands; + uint32_t PaddingThreshold; + uint32_t SizeThreshold; + llvm::Optional DumpModi; + llvm::Optional ParentRecurseDepth; + llvm::Optional ChildrenRecurseDepth; + llvm::Optional SymbolOffset; + bool JustMyCode; +}; + +namespace llvm { +namespace msf { +class MSFStreamLayout; +} // namespace msf +namespace pdb { + +class ClassLayout; +class PDBFile; +class SymbolGroup; + +class LinePrinter { + friend class WithColor; + +public: + LinePrinter(int Indent, bool UseColor, raw_ostream &Stream, + const FilterOptions &Filters); + + void Indent(uint32_t Amount = 0); + void Unindent(uint32_t Amount = 0); + void NewLine(); + + void printLine(const Twine &T); + void print(const Twine &T); + template void formatLine(const char *Fmt, Ts &&...Items) { + printLine(formatv(Fmt, std::forward(Items)...)); + } + template void format(const char *Fmt, Ts &&...Items) { + print(formatv(Fmt, std::forward(Items)...)); + } + + void formatBinary(StringRef Label, ArrayRef Data, + uint64_t StartOffset); + void formatBinary(StringRef Label, ArrayRef Data, uint64_t BaseAddr, + uint64_t StartOffset); + + void formatMsfStreamData(StringRef Label, PDBFile &File, uint32_t StreamIdx, + StringRef StreamPurpose, uint64_t Offset, + uint64_t Size); + void formatMsfStreamData(StringRef Label, PDBFile &File, + const msf::MSFStreamLayout &Stream, + BinarySubstreamRef Substream); + void formatMsfStreamBlocks(PDBFile &File, const msf::MSFStreamLayout &Stream); + + bool hasColor() const { return UseColor; } + raw_ostream &getStream() { return OS; } + int getIndentLevel() const { return CurrentIndent; } + + bool IsClassExcluded(const ClassLayout &Class); + bool IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size); + bool IsSymbolExcluded(llvm::StringRef SymbolName); + bool IsCompilandExcluded(llvm::StringRef CompilandName); + + const FilterOptions &getFilters() const { return Filters; } + +private: + template + void SetFilters(std::list &List, Iter Begin, Iter End) { + List.clear(); + for (; Begin != End; ++Begin) + List.emplace_back(StringRef(*Begin)); + } + + raw_ostream &OS; + int IndentSpaces; + int CurrentIndent; + bool UseColor; + const FilterOptions &Filters; + + std::list ExcludeCompilandFilters; + std::list ExcludeTypeFilters; + std::list ExcludeSymbolFilters; + + std::list IncludeCompilandFilters; + std::list IncludeTypeFilters; + std::list IncludeSymbolFilters; +}; + +struct PrintScope { + explicit PrintScope(LinePrinter &P, uint32_t IndentLevel) + : P(P), IndentLevel(IndentLevel) {} + explicit PrintScope(const PrintScope &Other, uint32_t LabelWidth) + : P(Other.P), IndentLevel(Other.IndentLevel), LabelWidth(LabelWidth) {} + + LinePrinter &P; + uint32_t IndentLevel; + uint32_t LabelWidth = 0; +}; + +inline 
+inline PrintScope withLabelWidth(const PrintScope &Scope, uint32_t W) {
+  return PrintScope{Scope, W};
+}
+
+struct AutoIndent {
+  explicit AutoIndent(LinePrinter &L, uint32_t Amount = 0)
+      : L(&L), Amount(Amount) {
+    L.Indent(Amount);
+  }
+  explicit AutoIndent(const PrintScope &Scope) {
+    L = &Scope.P;
+    Amount = Scope.IndentLevel;
+  }
+  ~AutoIndent() {
+    if (L)
+      L->Unindent(Amount);
+  }
+
+  LinePrinter *L = nullptr;
+  uint32_t Amount = 0;
+};
+
+template <typename T>
+inline raw_ostream &operator<<(LinePrinter &Printer, const T &Item) {
+  return Printer.getStream() << Item;
+}
+
+enum class PDB_ColorItem {
+  None,
+  Address,
+  Type,
+  Comment,
+  Padding,
+  Keyword,
+  Offset,
+  Identifier,
+  Path,
+  SectionHeader,
+  LiteralValue,
+  Register,
+};
+
+class WithColor {
+public:
+  WithColor(LinePrinter &P, PDB_ColorItem C);
+  ~WithColor();
+
+  raw_ostream &get() { return OS; }
+
+private:
+  void applyColor(PDB_ColorItem C);
+  raw_ostream &OS;
+  bool UseColor;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif
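As a quick orientation, the printer, filters, and RAII helpers in this new header compose roughly as follows. This is an illustrative sketch, not code from the patch; the stream and filter values are supplied by the caller, and the output text is made up.

// Sketch: indentation and color are scoped, so formatting code stays flat.
#include "llvm/DebugInfo/PDB/Native/LinePrinter.h"

void printSketch(llvm::raw_ostream &OS, const FilterOptions &Filters) {
  llvm::pdb::LinePrinter P(/*Indent=*/2, /*UseColor=*/true, OS, Filters);
  P.formatLine("Mod {0} | `{1}`", 1, "example.obj");
  {
    llvm::pdb::AutoIndent Indent(P);     // indented until end of this scope
    llvm::pdb::WithColor Color(P, llvm::pdb::PDB_ColorItem::Keyword);
    Color.get() << "struct";             // colored via the printer's stream
    P.NewLine();
  }
  P.printLine("done");                   // back at the outer indent level
}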
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
index cb1ffc729512..0caf9fffbad6 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
@@ -10,10 +10,8 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_MODULEDEBUGSTREAM_H
 
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
@@ -21,10 +19,15 @@
 #include <memory>
 
 namespace llvm {
+class BinaryStreamReader;
+namespace codeview {
+class DebugChecksumsSubsectionRef;
+}
+namespace msf {
+class MappedBlockStream;
+}
 namespace pdb {
 
-class DbiModuleDescriptor;
-
 class ModuleDebugStreamRef {
   using DebugSubsectionIterator = codeview::DebugSubsectionArray::Iterator;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h b/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
index f110e90b3f90..18fbab0dd38c 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
@@ -11,7 +11,6 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
index 073878afd129..c10e652efa8d 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMGLOBALS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMGLOBALS_H
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
index 32a4515d557e..a936b769d688 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
@@ -9,16 +9,13 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMLINENUMBERS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMLINENUMBERS_H
 
-#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
-#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
-#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+#include <vector>
 
 namespace llvm {
 namespace pdb {
-class IPDBLineNumber;
 
 class NativeEnumLineNumbers : public IPDBEnumChildren<IPDBLineNumber> {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
index 480b3fb11419..5fc91675f209 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
@@ -9,9 +9,9 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H
 
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 #include <vector>
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
index 25c56567384f..2ca000c1c0fe 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
@@ -9,14 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMTYPES_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMTYPES_H
 
-#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 
 #include <vector>
 
 namespace llvm {
+namespace codeview {
+class LazyRandomTypeCollection;
+}
 namespace pdb {
 
 class NativeSession;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
index 280358d02305..82fdff130c4f 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
@@ -9,12 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
 
+#include "llvm/DebugInfo/CodeView/GUID.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
 
+class NativeSession;
+
 class DbiStream;
 
 class NativeExeSymbol : public NativeRawSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
index b219055d2153..c15e22f61077 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
@@ -9,14 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEFUNCTIONSYMBOL_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEFUNCTIONSYMBOL_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { +class raw_ostream; namespace pdb { +class NativeSession; + class NativeFunctionSymbol : public NativeRawSymbol { public: NativeFunctionSymbol(NativeSession &Session, SymIndexId Id, diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h index 2f6aba038ae8..3467ac912162 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h @@ -9,14 +9,16 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEINLINESITESYMBOL_H #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEINLINESITESYMBOL_H -#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { namespace pdb { +class NativeSession; + class NativeInlineSiteSymbol : public NativeRawSymbol { public: NativeInlineSiteSymbol(NativeSession &Session, SymIndexId Id, diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h index be0ddf0a063a..53f2985833fd 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h @@ -11,10 +11,12 @@ #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/PDB/IPDBLineNumber.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" namespace llvm { namespace pdb { + +class NativeSession; + class NativeLineNumber : public IPDBLineNumber { public: explicit NativeLineNumber(const NativeSession &Session, diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h index 9f410e27f4cb..43de80507d02 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h @@ -9,13 +9,14 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEPUBLICSYMBOL_H #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEPUBLICSYMBOL_H -#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" namespace llvm { + +class raw_ostream; namespace pdb { +class NativeSession; class NativePublicSymbol : public NativeRawSymbol { public: diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h index 5f8fc587e546..95be7d09aae9 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h @@ -9,13 +9,11 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVESESSION_H #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVESESSION_H -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" -#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" #include "llvm/DebugInfo/PDB/Native/SymbolCache.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" @@ -24,6 +22,12 @@ class MemoryBuffer; namespace pdb { class PDBFile; class NativeExeSymbol; +class 
+class IPDBSourceFile;
+class ModuleDebugStreamRef;
+class PDBSymbol;
+class PDBSymbolCompiland;
+class PDBSymbolExe;
+template <typename ChildType> class IPDBEnumChildren;
 
 class NativeSession : public IPDBSession {
   struct PdbSearchOptions {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
index eb6336f268e8..c6653368bc0c 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
@@ -11,11 +11,12 @@
 
 #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
+class PDBSymbolCompiland;
+template <typename ChildType> class IPDBEnumChildren;
 class NativeSession;
 
 class NativeSourceFile : public IPDBSourceFile {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
index d6a3125ee40b..ab4abc4d3c2c 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
@@ -9,12 +9,16 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVESYMBOLENUMERATOR_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVESYMBOLENUMERATOR_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+
+class raw_ostream;
 namespace pdb {
+class NativeSession;
 class NativeTypeEnum;
 
 class NativeSymbolEnumerator : public NativeRawSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
index 2068c88fc74a..429c06f29ac7 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
@@ -10,12 +10,14 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEENUM_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+class raw_ostream;
 namespace pdb {
 
 class NativeTypeBuiltin;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
index 90b5d8068959..47ea722313c3 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
@@ -9,17 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
 
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
"llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { namespace pdb { -class NativeTypeUDT; - class NativeTypeFunctionSig : public NativeRawSymbol { protected: void initialize() override; diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h index 7a3dfaecefeb..1f357754ac0f 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h @@ -10,10 +10,11 @@ #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEPOINTER_H #include "llvm/ADT/Optional.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { namespace pdb { diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h index 292fc48e7b6d..ce4ebcd00c4a 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h @@ -9,14 +9,19 @@ #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPETYPEDEF_H #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPETYPEDEF_H -#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" namespace llvm { + +class raw_ostream; + namespace pdb { +class NativeSession; + class NativeTypeTypedef : public NativeRawSymbol { public: // Create a pointer record for a non-simple type. 
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
index e1b31a256c12..a1dd39c0b4be 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
@@ -10,13 +10,17 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEUDT_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+
+class raw_ostream;
 namespace pdb {
+class NativeSession;
 
 class NativeTypeUDT : public NativeRawSymbol {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
index 21995ca665c1..92d51706c1da 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
@@ -9,13 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
+class NativeSession;
 
 class NativeTypeVTShape : public NativeRawSymbol {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFile.h b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFile.h
index c5ee73280c46..1ea92ed4bf21 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFile.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFile.h
@@ -9,14 +9,12 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_PDBFILE_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_PDBFILE_H
 
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/DebugInfo/MSF/IMSFFile.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/MathExtras.h"
 #include <memory>
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
index 004d005280d4..c23d958f8ed0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
@@ -9,24 +9,28 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_PDBFILEBUILDER_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_PDBFILEBUILDER_H
 
-#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
-#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <memory>
 
 namespace llvm {
+class WritableBinaryStream;
+namespace codeview {
+struct GUID;
+}
+
 namespace msf {
 class MSFBuilder;
+struct MSFLayout;
 }
 namespace pdb {
+struct SrcHeaderBlockEntry;
 class DbiStreamBuilder;
 class InfoStreamBuilder;
 class GSIStreamBuilder;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h b/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
index 5cb749c8a747..4336cd398baf 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
@@ -9,11 +9,9 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_PDBSTRINGTABLE_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_PDBSTRINGTABLE_H
 
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/Support/BinaryStreamArray.h"
-#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -21,10 +19,6 @@
 namespace llvm {
 class BinaryStreamReader;
 
-namespace msf {
-class MappedBlockStream;
-}
-
 namespace pdb {
 
 struct PDBStringTableHeader;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
index bf6da3ea2920..a59a752ff911 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
@@ -9,20 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_PUBLICSSTREAM_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_PUBLICSSTREAM_H
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
-#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
+namespace msf {
+class MappedBlockStream;
+}
 namespace pdb {
-class DbiStream;
-struct GSIHashHeader;
-class PDBFile;
+struct PublicsStreamHeader;
+struct SectionOffset;
 
 class PublicsStream {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
index 1ff6ca173b2b..7c5b6b9e1bdf 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
@@ -10,23 +10,29 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_SYMBOLCACHE_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/IntervalMap.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include <memory>
 #include <vector>
 
 namespace llvm {
+namespace codeview {
+class InlineSiteSym;
+struct FileChecksumEntry;
+} // namespace codeview
 namespace pdb {
+class IPDBSourceFile;
+class NativeSession;
+class PDBSymbol;
+class PDBSymbolCompiland;
 class DbiStream;
-class PDBFile;
 
 class SymbolCache {
   NativeSession &Session;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolStream.h
index 839cc8d2c503..c2f7eb04d16e 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolStream.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolStream.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_SYMBOLSTREAM_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_SYMBOLSTREAM_H
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 
 #include "llvm/Support/Error.h"
@@ -18,7 +18,6 @@
 namespace msf {
 class MappedBlockStream;
 }
 namespace pdb {
-class PDBFile;
 
 class SymbolStream {
 public:
@@ -41,7 +40,7 @@ private:
   codeview::CVSymbolArray SymbolRecords;
   std::unique_ptr<msf::MappedBlockStream> Stream;
 };
-}
+} // namespace pdb
 }
 
 #endif
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index e49d58af4421..4c413abb2bf0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -12,22 +12,23 @@
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamRef.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
+class BinaryStream;
 namespace codeview {
+class TypeIndex;
+struct TypeIndexOffset;
 class LazyRandomTypeCollection;
 }
 namespace msf {
 class MappedBlockStream;
 }
 namespace pdb {
+struct TpiStreamHeader;
 class PDBFile;
 
 class TpiStream {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
index f18d38ae0b31..9f320358144c 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
@@ -10,12 +10,10 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_TPISTREAMBUILDER_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/BinaryByteStream.h"
-#include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 
@@ -23,7 +21,7 @@
 namespace llvm {
 class BinaryByteStream;
-class WritableBinaryStreamRef;
+template <typename T> struct BinaryItemTraits;
 
 template <> struct BinaryItemTraits<codeview::CVType> {
   static size_t length(const codeview::CVType &Item) { return Item.length(); }
@@ -32,16 +30,11 @@ template <> struct BinaryItemTraits<codeview::CVType> {
   }
 };
 
-namespace codeview {
-class TypeRecord;
-}
 namespace msf {
 class MSFBuilder;
 struct MSFLayout;
 }
 namespace pdb {
-class PDBFile;
-class TpiStream;
 struct TpiStreamHeader;
 
 class TpiStreamBuilder {
@@ -88,7 +81,7 @@
   const TpiStreamHeader *Header;
   uint32_t Idx;
 };
-}
+} // namespace pdb
 }
 
 #endif
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBContext.h b/llvm/include/llvm/DebugInfo/PDB/PDBContext.h
index 7b6793f0a639..3163c0a1dae0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBContext.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBContext.h
@@ -45,6 +45,8 @@ namespace pdb {
     DILineInfo getLineInfoForAddress(
         object::SectionedAddress Address,
        DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
+    DILineInfo
+    getLineInfoForDataAddress(object::SectionedAddress Address) override;
     DILineInfoTable getLineInfoForAddressRange(
         object::SectionedAddress Address, uint64_t Size,
         DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
index 24cf1e459f92..4e34b75b6117 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
@@ -9,11 +9,9 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOL_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOL_H
 
-#include "ConcreteSymbolEnumerator.h"
 #include "IPDBRawSymbol.h"
 #include "PDBExtras.h"
 #include "PDBTypes.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 
 #define FORWARD_SYMBOL_METHOD(MethodName) \
@@ -43,6 +41,9 @@
 class raw_ostream;
 namespace pdb {
 
 class IPDBSession;
+class PDBSymDumper;
+class PDBSymbol;
+template <typename ChildType> class ConcreteSymbolEnumerator;
 
 #define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue) \
 private: \
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
index c76466a97b66..c8d3d0b7bb96 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
@@ -13,7 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolAnnotation : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
index cf471450d989..09142227b017 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
@@ -13,8 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
 class PDBSymbolBlock : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
index dbd8ba5a63ff..46c159268533 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolCompilandDetails : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
index 61607a03593d..cba082f2ff19 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolCompilandEnv : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::CompilandEnv)
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
index 75a86411643a..c78b47ce9924 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
@@ -15,8 +15,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
 /// PDBSymbolCustom represents symbols that are compiler-specific and do not
 /// fit anywhere else in the lexical hierarchy.
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
index 7e9b69d7cf4b..61e67d1368a8 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
@@ -9,16 +9,16 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
 
-#include "IPDBLineNumber.h"
 #include "PDBSymbol.h"
 #include "PDBTypes.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolData : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Data)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
index f50057c68406..bfc7f7689718 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
@@ -9,17 +9,20 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
 
-#include "IPDBLineNumber.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeFunctionSig.h"
 #include "PDBTypes.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+class PDBSymbolData;
+class PDBSymbolTypeFunctionSig;
+template <typename ChildType> class IPDBEnumChildren;
+
 class PDBSymbolFunc : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Function)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
index 1cdc1811bb1a..09c6f4728960 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
@@ -14,8 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
 class PDBSymbolFuncDebugEnd : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
index 021f27c7f0f7..843a8348a2f0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolFuncDebugStart : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
index 33eb36696cc2..148802a47cbc 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolLabel : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
index f8dcb2ba9d5f..a757cc02624b 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolPublicSymbol : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
index a5f795cc1303..2b81a63995e6 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolThunk : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
index d4cd6e71423e..496141e5fa68 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeArray : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
index bd2dbc914725..c74ac3fb9cce 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
@@ -12,14 +12,14 @@
 
 #include "PDBSymbol.h"
 #include "PDBTypes.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolTypeBaseClass : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::BaseClass)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
index df6309b1545c..b923983095f3 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeBuiltin : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
index 7bf0317ff1ca..b15abf7bedfd 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeCustom : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
index 5d742237bac4..e7570b41dd21 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeDimension : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
index 0aab91039509..ee1f736c17a0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
@@ -9,16 +9,18 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
 
-#include "IPDBLineNumber.h"
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeBuiltin.h"
 #include "PDBTypes.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
+class PDBSymDumper;
+class PDBSymbolTypeBuiltin;
+
 class PDBSymbolTypeEnum : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Enum)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
index d56a90662dae..9fde42116261 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeFriend : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
index 559ceec5aace..71decff722a5 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeFunctionArg : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
index 5e7b83ce8004..866bf520a3b2 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeManaged : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
index da25eab50f9b..1b43ef9a21bd 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypePointer : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
index 8dc29ca26192..3f37730cf1df 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeTypedef : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
index 3e73ad7ac85a..a3a49a4b619a 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
@@ -9,18 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
 
-#include "IPDBLineNumber.h"
-#include "IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeBaseClass.h"
 #include "PDBTypes.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolTypeUDT : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::UDT)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
index d08728dafa76..6223bee98670 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeVTable : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
index c7e2ac148503..bec0a9970a9f 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeVTableShape : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
index 5b4909b800b9..a53af49bc9e0 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
@@ -13,7 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolUnknown : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
index 19a8f414eb43..dde25a023d00 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolUsingNamespace : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h b/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
index e7c2ded1bee1..b6a794ad7e76 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -352,7 +352,8 @@ enum class PDB_BuiltinType {
   BSTR = 30,
   HResult = 31,
   Char16 = 32,
-  Char32 = 33
+  Char32 = 33,
+  Char8 = 34,
 };
 
 /// These values correspond to the flags that can be combined to control the
diff --git a/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h b/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
index c67b093b63c0..8631c412f114 100644
--- a/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
+++ b/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
@@ -18,7 +18,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include <cstdint>
 #include <memory>
 #include <string>
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h b/llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h
new file mode 100644
index 000000000000..c5340b5f0460
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h
@@ -0,0 +1,51 @@
+//===-- llvm/DebugInfo/Symbolize/DIFetcher.h --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares a DIFetcher abstraction for obtaining debug info from an
+/// arbitrary outside source.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_DIFETCHER_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_DIFETCHER_H
+
+#include <cstdint>
+#include <string>
+
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// The DIFetcher interface provides arbitrary mechanisms for obtaining debug
+/// info from an outside source.
+class DIFetcher {
+public:
+  virtual ~DIFetcher() = default;
+  virtual Optional<std::string>
+  fetchBuildID(ArrayRef<uint8_t> BuildID) const = 0;
+};
+
+/// LocalDIFetcher searches local cache directories for debug info.
+class LocalDIFetcher : public DIFetcher {
+public:
+  LocalDIFetcher(ArrayRef<std::string> DebugFileDirectory)
+      : DebugFileDirectory(DebugFileDirectory){};
+  virtual ~LocalDIFetcher() = default;
+
+  Optional<std::string> fetchBuildID(ArrayRef<uint8_t> BuildID) const override;
+
+private:
+  const ArrayRef<std::string> DebugFileDirectory;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_DIFETCHER_H
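To make the new interface concrete, a client could implement a fetcher along the following lines. This is a hypothetical sketch, not part of the patch: the cache directory layout is invented, and the fetcher would be registered with LLVMSymbolizer::addDIFetcher (declared later in this patch).

// Sketch: a hypothetical DIFetcher that probes a local directory keyed by the
// hex build ID, returning None so the symbolizer can try the next fetcher.
#include "llvm/ADT/StringExtras.h"
#include "llvm/DebugInfo/Symbolize/DIFetcher.h"
#include "llvm/Support/FileSystem.h"

class ExampleDIFetcher : public llvm::symbolize::DIFetcher {
public:
  llvm::Optional<std::string>
  fetchBuildID(llvm::ArrayRef<uint8_t> BuildID) const override {
    // "/var/cache/debuginfo" is an assumed layout for this example only.
    std::string Path = "/var/cache/debuginfo/" +
                       llvm::toHex(BuildID, /*LowerCase=*/true) + ".debug";
    if (llvm::sys::fs::exists(Path))
      return Path;
    return llvm::None;
  }
};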
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
new file mode 100644
index 000000000000..2628b47cf6d3
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -0,0 +1,120 @@
+//===- Markup.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the log symbolizer markup data model and parser.
+///
+/// See https://llvm.org/docs/SymbolizerMarkupFormat.html
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+
+#include <string>
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Regex.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// A node of symbolizer markup.
+///
+/// If only the Text field is set, this represents a region of text outside a
+/// markup element. ANSI SGR control codes are also reported this way; if
+/// detected, then the control code will be the entirety of the Text field, and
+/// any surrounding text will be reported as preceding and following nodes.
+struct MarkupNode {
+  /// The full text of this node in the input.
+  StringRef Text;
+
+  /// If this represents an element, the tag. Otherwise, empty.
+  StringRef Tag;
+
+  /// If this represents an element with fields, a list of the field contents.
+  /// Otherwise, empty.
+  SmallVector<StringRef> Fields;
+
+  bool operator==(const MarkupNode &Other) const {
+    return Text == Other.Text && Tag == Other.Tag && Fields == Other.Fields;
+  }
+  bool operator!=(const MarkupNode &Other) const { return !(*this == Other); }
+};
+
+/// Parses a log containing symbolizer markup into a sequence of nodes.
+class MarkupParser {
+public:
+  MarkupParser(StringSet<> MultilineTags = {});
+
+  /// Parses an individual \p Line of input.
+  ///
+  /// Nodes from the previous parseLine() call that haven't yet been extracted
+  /// by nextNode() are discarded. The nodes returned by nextNode() may
+  /// reference the input string, so it must be retained by the caller until
+  /// its last use.
+  ///
+  /// Note that some elements may span multiple lines. If a line ends with the
+  /// start of one of these elements, then no nodes will be produced until
+  /// either the end or something that cannot be part of an element is
+  /// encountered. This may only occur after multiple calls to parseLine(),
+  /// corresponding to the lines of the multi-line element.
+  void parseLine(StringRef Line);
+
+  /// Informs the parser that the input stream has ended.
+  ///
+  /// This allows the parser to finish any deferred processing (e.g., an
+  /// in-progress multi-line element) and may cause nextNode() to return
+  /// additional nodes.
+  void flush();
+
+  /// Returns the next node in the input sequence.
+  ///
+  /// Calling nextNode() may invalidate the contents of the node returned by
+  /// the previous call.
+  ///
+  /// \returns the next markup node or None if none remain.
+  Optional<MarkupNode> nextNode();
+
+private:
+  Optional<MarkupNode> parseElement(StringRef Line);
+  void parseTextOutsideMarkup(StringRef Text);
+  Optional<StringRef> parseMultiLineBegin(StringRef Line);
+  Optional<StringRef> parseMultiLineEnd(StringRef Line);
+
+  // Tags of elements that can span multiple lines.
+  const StringSet<> MultilineTags;
+
+  // Contents of a multi-line element that has finished being parsed. Retained
+  // to keep returned StringRefs for the contents valid.
+  std::string FinishedMultiline;
+
+  // Contents of a multi-line element that is still in the process of receiving
+  // lines.
+  std::string InProgressMultiline;
+
+  // The line currently being parsed.
+  StringRef Line;
+
+  // Buffer for nodes parsed from the current line.
+  SmallVector<MarkupNode> Buffer;
+
+  // Next buffer index to return.
+  size_t NextIdx;
+
+  // Regular expression matching supported ANSI SGR escape sequences.
+  const Regex SGRSyntax;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/MarkupFilter.h b/llvm/include/llvm/DebugInfo/Symbolize/MarkupFilter.h
new file mode 100644
index 000000000000..b7d70ccafe66
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/MarkupFilter.h
@@ -0,0 +1,76 @@
+//===- MarkupFilter.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares a filter that replaces symbolizer markup with
+/// human-readable expressions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUPFILTER_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_MARKUPFILTER_H
+
+#include "Markup.h"
+
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// Filter to convert parsed log symbolizer markup elements into human-readable
+/// text.
+class MarkupFilter {
+public:
+  MarkupFilter(raw_ostream &OS, Optional<bool> ColorsEnabled = llvm::None);
+
+  /// Begins a logical \p Line of markup.
+  ///
+  /// This must be called for each line of the input stream before calls to
+  /// filter() for elements of that line. The provided \p Line must be the same
+  /// one that was passed to parseLine() to produce the elements to be later
+  /// passed to filter().
+  ///
+  /// This informs the filter that a new line is beginning and establishes a
+  /// context for error location reporting.
+  void beginLine(StringRef Line);
+
+  /// Handle a \p Node of symbolizer markup.
+  ///
+  /// If the node is a recognized, valid markup element, it is replaced with a
+  /// human-readable string. If the node isn't an element or the element isn't
+  /// recognized, it is output verbatim. If the element is recognized but isn't
+  /// valid, it is omitted from the output.
+  void filter(const MarkupNode &Node);
+
+private:
+  bool trySGR(const MarkupNode &Node);
+
+  void highlight();
+  void restoreColor();
+  void resetColor();
+
+  bool checkTag(const MarkupNode &Node) const;
+  bool checkNumFields(const MarkupNode &Node, size_t Size) const;
+
+  void reportTypeError(StringRef Str, StringRef TypeName) const;
+  void reportLocation(StringRef::iterator Loc) const;
+
+  raw_ostream &OS;
+  const bool ColorsEnabled;
+
+  StringRef Line;
+
+  Optional<raw_ostream::Colors> Color;
+  bool Bold = false;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUPFILTER_H
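The parser and filter above are designed to be driven together, one log line at a time. A minimal driver loop, sketched here with the line source left abstract and not taken from the patch itself, might look like this:

// Sketch: convert symbolizer markup in a log to human-readable text.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/DebugInfo/Symbolize/Markup.h"
#include "llvm/DebugInfo/Symbolize/MarkupFilter.h"
#include "llvm/Support/raw_ostream.h"

void filterLog(llvm::ArrayRef<std::string> Lines) {
  llvm::symbolize::MarkupParser Parser;
  llvm::symbolize::MarkupFilter Filter(llvm::outs());
  for (const std::string &Line : Lines) { // Lines must outlive the nodes
    Parser.parseLine(Line);               // may defer a multi-line element
    Filter.beginLine(Line);               // same line given to parseLine()
    while (llvm::Optional<llvm::symbolize::MarkupNode> Node =
               Parser.nextNode())
      Filter.filter(*Node);
  }
  Parser.flush();                         // finish any in-progress element
  while (llvm::Optional<llvm::symbolize::MarkupNode> Node = Parser.nextNode())
    Filter.filter(*Node);
}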
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
new file mode 100644
index 000000000000..075dbe3e0e37
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -0,0 +1,103 @@
+//===- SymbolizableObjectFile.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SymbolizableObjectFile class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class DataExtractor;
+
+namespace symbolize {
+
+class SymbolizableObjectFile : public SymbolizableModule {
+public:
+  static Expected<std::unique_ptr<SymbolizableObjectFile>>
+  create(const object::ObjectFile *Obj, std::unique_ptr<DIContext> DICtx,
+         bool UntagAddresses);
+
+  DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset,
+                           DILineInfoSpecifier LineInfoSpecifier,
+                           bool UseSymbolTable) const override;
+  DIInliningInfo symbolizeInlinedCode(object::SectionedAddress ModuleOffset,
+                                      DILineInfoSpecifier LineInfoSpecifier,
+                                      bool UseSymbolTable) const override;
+  DIGlobal symbolizeData(object::SectionedAddress ModuleOffset) const override;
+  std::vector<DILocal>
+  symbolizeFrame(object::SectionedAddress ModuleOffset) const override;
+
+  // Return true if this is a 32-bit x86 PE COFF module.
+  bool isWin32Module() const override;
+
+  // Returns the preferred base of the module, i.e. where the loader would
+  // place it in memory assuming there were no conflicts.
+  uint64_t getModulePreferredBase() const override;
+
+private:
+  bool shouldOverrideWithSymbolTable(FunctionNameKind FNKind,
+                                     bool UseSymbolTable) const;
+
+  bool getNameFromSymbolTable(uint64_t Address, std::string &Name,
+                              uint64_t &Addr, uint64_t &Size,
+                              std::string &FileName) const;
+  // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd
+  // (function descriptor) section and OpdExtractor refers to its contents.
+  Error addSymbol(const object::SymbolRef &Symbol, uint64_t SymbolSize,
+                  DataExtractor *OpdExtractor = nullptr,
+                  uint64_t OpdAddress = 0);
+  Error addCoffExportSymbols(const object::COFFObjectFile *CoffObj);
+
+  /// Search for the first occurrence of the specified Address in ObjectFile.
+  uint64_t getModuleSectionIndexForAddress(uint64_t Address) const;
+
+  const object::ObjectFile *Module;
+  std::unique_ptr<DIContext> DebugInfoContext;
+  bool UntagAddresses;
+
+  struct SymbolDesc {
+    uint64_t Addr;
+    // If size is 0, assume that symbol occupies the whole memory range up to
+    // the following symbol.
+    uint64_t Size;
+
+    StringRef Name;
+    // Non-zero if this is an ELF local symbol. See the comment in
+    // getNameFromSymbolTable.
+    uint32_t ELFLocalSymIdx;
+
+    bool operator<(const SymbolDesc &RHS) const {
+      return Addr != RHS.Addr ? Addr < RHS.Addr : Size < RHS.Size;
+    }
+  };
+  std::vector<SymbolDesc> Symbols;
+  // (index, filename) pairs of ELF STT_FILE symbols.
+  std::vector<std::pair<uint32_t, StringRef>> FileSymbols;
+
+  SymbolizableObjectFile(const object::ObjectFile *Obj,
+                         std::unique_ptr<DIContext> DICtx,
+                         bool UntagAddresses);
+};
+
+} // end namespace symbolize
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
index 4ec333422c4b..00c4bf0a615f 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
@@ -13,10 +13,12 @@
 #ifndef LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZE_H
 #define LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZE_H
 
-#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/simple_ilist.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/Symbolize/DIFetcher.h"
 #include "llvm/Object/Binary.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstdint>
@@ -27,13 +29,24 @@
 #include <vector>
 
 namespace llvm {
+namespace object {
+class ELFObjectFileBase;
+class MachOObjectFile;
+class ObjectFile;
+struct SectionedAddress;
+} // namespace object
+
 namespace symbolize {
 
+class SymbolizableModule;
+
 using namespace object;
 using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind;
 using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
 
+class CachedBinary;
+
 class LLVMSymbolizer {
 public:
   struct Options {
@@ -49,40 +62,63 @@ public:
     std::string FallbackDebugPath;
     std::string DWPName;
     std::vector<std::string> DebugFileDirectory;
+    size_t MaxCacheSize =
+        sizeof(size_t) == 4
+            ? 512 * 1024 * 1024 /* 512 MiB */
+            : static_cast<size_t>(4ULL * 1024 * 1024 * 1024) /* 4 GiB */;
   };
 
-  LLVMSymbolizer() = default;
-  LLVMSymbolizer(const Options &Opts) : Opts(Opts) {}
+  LLVMSymbolizer();
+  LLVMSymbolizer(const Options &Opts);
 
-  ~LLVMSymbolizer() { flush(); }
+  ~LLVMSymbolizer();
 
-  // Overloads accepting ObjectFile does not support COFF currently
+  // Overloads accepting ObjectFile do not support COFF currently
   Expected<DILineInfo> symbolizeCode(const ObjectFile &Obj,
                                      object::SectionedAddress ModuleOffset);
   Expected<DILineInfo> symbolizeCode(const std::string &ModuleName,
                                      object::SectionedAddress ModuleOffset);
+  Expected<DILineInfo> symbolizeCode(ArrayRef<uint8_t> BuildID,
+                                     object::SectionedAddress ModuleOffset);
   Expected<DIInliningInfo>
  symbolizeInlinedCode(const ObjectFile &Obj,
                       object::SectionedAddress ModuleOffset);
  Expected<DIInliningInfo>
  symbolizeInlinedCode(const std::string &ModuleName,
                       object::SectionedAddress ModuleOffset);
+  Expected<DIInliningInfo>
+  symbolizeInlinedCode(ArrayRef<uint8_t> BuildID,
+                       object::SectionedAddress ModuleOffset);
  Expected<DIGlobal> symbolizeData(const ObjectFile &Obj,
                                   object::SectionedAddress ModuleOffset);
  Expected<DIGlobal> symbolizeData(const std::string &ModuleName,
                                   object::SectionedAddress ModuleOffset);
+  Expected<DIGlobal> symbolizeData(ArrayRef<uint8_t> BuildID,
+                                   object::SectionedAddress ModuleOffset);
  Expected<std::vector<DILocal>>
  symbolizeFrame(const ObjectFile &Obj, object::SectionedAddress ModuleOffset);
  Expected<std::vector<DILocal>>
  symbolizeFrame(const std::string &ModuleName,
                 object::SectionedAddress ModuleOffset);
+  Expected<std::vector<DILocal>>
+  symbolizeFrame(ArrayRef<uint8_t> BuildID,
+                 object::SectionedAddress ModuleOffset);
  void flush();
+
+  // Evict entries from the binary cache until it is under the maximum size
+  // given in the options. Calling this invalidates references in the DI...
+ void pruneCache(); + static std::string DemangleName(const std::string &Name, const SymbolizableModule *DbiModuleDescriptor); + void addDIFetcher(std::unique_ptr Fetcher) { + DIFetchers.push_back(std::move(Fetcher)); + } + private: // Bundles together object file with code/data and object file with // corresponding debug info. These objects can be the same. @@ -112,6 +148,12 @@ private: getOrCreateModuleInfo(const std::string &ModuleName); Expected getOrCreateModuleInfo(const ObjectFile &Obj); + /// Returns a SymbolizableModule or an error if loading debug info failed. + /// Unlike the above, errors are reported each time, since they are more + /// likely to be transient. + Expected + getOrCreateModuleInfo(ArrayRef BuildID); + Expected createModuleInfo(const ObjectFile *Obj, std::unique_ptr Context, StringRef ModuleName); @@ -126,6 +168,13 @@ private: const ELFObjectFileBase *Obj, const std::string &ArchName); + bool findDebugBinary(const std::string &OrigPath, + const std::string &DebuglinkName, uint32_t CRCHash, + std::string &Result); + + bool getOrFindDebugBinary(const ArrayRef BuildID, + std::string &Result); + /// Returns pair of pointers to object and debug object. Expected getOrCreateObjectPair(const std::string &Path, const std::string &ArchName); @@ -136,15 +185,24 @@ private: Expected getOrCreateObject(const std::string &Path, const std::string &ArchName); + /// Update the LRU cache order when a binary is accessed. + void recordAccess(CachedBinary &Bin); + std::map, std::less<>> Modules; + StringMap BuildIDPaths; /// Contains cached results of getOrCreateObjectPair(). std::map, ObjectPair> ObjectPairForPathArch; /// Contains parsed binary for each path, or parsing error. - std::map> BinaryForPath; + std::map BinaryForPath; + + /// A list of cached binaries in LRU order. + simple_ilist LRUBinaries; + /// Sum of the sizes of the cached binaries. + size_t CacheSize = 0; /// Parsed object file for path/architecture pair, where "path" refers /// to Mach-O universal binary. @@ -152,6 +210,37 @@ private: ObjectForUBPathAndArch; Options Opts; + + SmallVector> DIFetchers; +}; + +// A binary intrusively linked into a LRU cache list. If the binary is empty, +// then the entry marks that an error occurred, and it is not part of the LRU +// list. +class CachedBinary : public ilist_node { +public: + CachedBinary() = default; + CachedBinary(OwningBinary Bin) : Bin(std::move(Bin)) {} + + OwningBinary &operator*() { return Bin; } + OwningBinary *operator->() { return &Bin; } + + // Add an action to be performed when the binary is evicted, before all + // previously registered evictors. + void pushEvictor(std::function Evictor); + + // Run all registered evictors in the reverse of the order in which they were + // added. + void evict() { + if (Evictor) + Evictor(); + } + + size_t size() { return Bin.getBinary()->getData().size(); } + +private: + OwningBinary Bin; + std::function Evictor; }; } // end namespace symbolize diff --git a/llvm/include/llvm/Debuginfod/DIFetcher.h b/llvm/include/llvm/Debuginfod/DIFetcher.h new file mode 100644 index 000000000000..d398fd900051 --- /dev/null +++ b/llvm/include/llvm/Debuginfod/DIFetcher.h @@ -0,0 +1,34 @@ +//===- llvm/DebugInfod/DIFetcher.h - Debug info fetcher----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares a DIFetcher implementation for obtaining debug info from +/// debuginfod. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFOD_DIFETCHER_H +#define LLVM_DEBUGINFOD_DIFETCHER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/Symbolize/DIFetcher.h" + +namespace llvm { + +class DebuginfodDIFetcher : public symbolize::DIFetcher { +public: + virtual ~DebuginfodDIFetcher() = default; + + /// Fetches the given Build ID using debuginfod and returns a local path to + /// the resulting debug binary. + Optional fetchBuildID(ArrayRef BuildID) const override; +}; + +} // namespace llvm + +#endif // LLVM_DEBUGINFOD_DIFETCHER_H diff --git a/llvm/include/llvm/Debuginfod/HTTPClient.h b/llvm/include/llvm/Debuginfod/HTTPClient.h index ca3b76ca9f3f..6c94961032e7 100644 --- a/llvm/include/llvm/Debuginfod/HTTPClient.h +++ b/llvm/include/llvm/Debuginfod/HTTPClient.h @@ -7,9 +7,8 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file contains the declarations of the HTTPClient, HTTPMethod, -/// HTTPResponseHandler, and BufferedHTTPResponseHandler classes, as well as -/// the HTTPResponseBuffer and HTTPRequest structs. +/// This file contains the declarations of the HTTPClient library for issuing +/// HTTP requests and handling the responses. /// //===----------------------------------------------------------------------===// @@ -40,43 +39,13 @@ bool operator==(const HTTPRequest &A, const HTTPRequest &B); /// of its methods. class HTTPResponseHandler { public: - /// Processes one line of HTTP response headers. - virtual Error handleHeaderLine(StringRef HeaderLine) = 0; - /// Processes an additional chunk of bytes of the HTTP response body. virtual Error handleBodyChunk(StringRef BodyChunk) = 0; - /// Processes the HTTP response status code. - virtual Error handleStatusCode(unsigned Code) = 0; - protected: ~HTTPResponseHandler(); }; -/// An HTTP response status code bundled with a buffer to store the body. -struct HTTPResponseBuffer { - unsigned Code = 0; - std::unique_ptr Body; -}; - -/// A simple handler which writes returned data to an HTTPResponseBuffer. -/// Ignores all headers except the Content-Length, which it uses to -/// allocate an appropriately-sized Body buffer. -class BufferedHTTPResponseHandler final : public HTTPResponseHandler { - size_t Offset = 0; - -public: - /// Stores the data received from the HTTP server. - HTTPResponseBuffer ResponseBuffer; - - /// These callbacks store the body and status code in an HTTPResponseBuffer - /// allocated based on Content-Length. The Content-Length header must be - /// handled by handleHeaderLine before any calls to handleBodyChunk. - Error handleHeaderLine(StringRef HeaderLine) override; - Error handleBodyChunk(StringRef BodyChunk) override; - Error handleStatusCode(unsigned Code) override; -}; - /// A reusable client that can perform HTTPRequests through a network socket. class HTTPClient { #ifdef LLVM_ENABLE_CURL @@ -107,13 +76,8 @@ public: /// Handler method. Error perform(const HTTPRequest &Request, HTTPResponseHandler &Handler); - /// Performs the Request with the default BufferedHTTPResponseHandler, and - /// returns its HTTPResponseBuffer or an Error. 
- Expected perform(const HTTPRequest &Request); - - /// Performs an HTTPRequest with the default configuration to make a GET - /// request to the given Url. Returns an HTTPResponseBuffer or an Error. - Expected get(StringRef Url); + /// Returns the last received response code or zero if none. + unsigned responseCode(); }; } // end namespace llvm diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h index 3150e049320b..6133d0b95bbf 100644 --- a/llvm/include/llvm/Demangle/Demangle.h +++ b/llvm/include/llvm/Demangle/Demangle.h @@ -57,8 +57,8 @@ char *microsoftDemangle(const char *mangled_name, size_t *n_read, char *buf, size_t *n_buf, int *status, MSDemangleFlags Flags = MSDF_None); -// Demangles a Rust v0 mangled symbol. The API follows that of __cxa_demangle. -char *rustDemangle(const char *MangledName, char *Buf, size_t *N, int *Status); +// Demangles a Rust v0 mangled symbol. +char *rustDemangle(const char *MangledName); // Demangles a D mangled symbol. char *dlangDemangle(const char *MangledName); diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 760319544a02..959632f13e1e 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -16,10 +16,6 @@ #ifndef DEMANGLE_ITANIUMDEMANGLE_H #define DEMANGLE_ITANIUMDEMANGLE_H -// FIXME: (possibly) incomplete list of features that clang mangles that this -// file does not yet support: -// - C++ modules TS - #include "DemangleConfig.h" #include "StringView.h" #include "Utility.h" @@ -32,85 +28,6 @@ #include #include -#define FOR_EACH_NODE_KIND(X) \ - X(NodeArrayNode) \ - X(DotSuffix) \ - X(VendorExtQualType) \ - X(QualType) \ - X(ConversionOperatorType) \ - X(PostfixQualifiedType) \ - X(ElaboratedTypeSpefType) \ - X(NameType) \ - X(AbiTagAttr) \ - X(EnableIfAttr) \ - X(ObjCProtoName) \ - X(PointerType) \ - X(ReferenceType) \ - X(PointerToMemberType) \ - X(ArrayType) \ - X(FunctionType) \ - X(NoexceptSpec) \ - X(DynamicExceptionSpec) \ - X(FunctionEncoding) \ - X(LiteralOperator) \ - X(SpecialName) \ - X(CtorVtableSpecialName) \ - X(QualifiedName) \ - X(NestedName) \ - X(LocalName) \ - X(VectorType) \ - X(PixelVectorType) \ - X(BinaryFPType) \ - X(SyntheticTemplateParamName) \ - X(TypeTemplateParamDecl) \ - X(NonTypeTemplateParamDecl) \ - X(TemplateTemplateParamDecl) \ - X(TemplateParamPackDecl) \ - X(ParameterPack) \ - X(TemplateArgumentPack) \ - X(ParameterPackExpansion) \ - X(TemplateArgs) \ - X(ForwardTemplateReference) \ - X(NameWithTemplateArgs) \ - X(GlobalQualifiedName) \ - X(StdQualifiedName) \ - X(ExpandedSpecialSubstitution) \ - X(SpecialSubstitution) \ - X(CtorDtorName) \ - X(DtorName) \ - X(UnnamedTypeName) \ - X(ClosureTypeName) \ - X(StructuredBindingName) \ - X(BinaryExpr) \ - X(ArraySubscriptExpr) \ - X(PostfixExpr) \ - X(ConditionalExpr) \ - X(MemberExpr) \ - X(SubobjectExpr) \ - X(EnclosingExpr) \ - X(CastExpr) \ - X(SizeofParamPackExpr) \ - X(CallExpr) \ - X(NewExpr) \ - X(DeleteExpr) \ - X(PrefixExpr) \ - X(FunctionParam) \ - X(ConversionExpr) \ - X(PointerToMemberConversionExpr) \ - X(InitListExpr) \ - X(FoldExpr) \ - X(ThrowExpr) \ - X(BoolExpr) \ - X(StringLiteral) \ - X(LambdaExpr) \ - X(EnumLiteral) \ - X(IntegerLiteral) \ - X(FloatLiteral) \ - X(DoubleLiteral) \ - X(LongDoubleLiteral) \ - X(BracedExpr) \ - X(BracedRangeExpr) - DEMANGLE_NAMESPACE_BEGIN template class PODSmallVector { @@ -238,37 +155,68 @@ public: class Node { public: enum Kind : unsigned char { -#define 
ENUMERATOR(NodeKind) K ## NodeKind, - FOR_EACH_NODE_KIND(ENUMERATOR) -#undef ENUMERATOR +#define NODE(NodeKind) K##NodeKind, +#include "ItaniumNodes.def" }; /// Three-way bool to track a cached value. Unknown is possible if this node /// has an unexpanded parameter pack below it that may affect this cache. enum class Cache : unsigned char { Yes, No, Unknown, }; + /// Operator precedence for expression nodes. Used to determine required + /// parens in expression emission. + enum class Prec { + Primary, + Postfix, + Unary, + Cast, + PtrMem, + Multiplicative, + Additive, + Shift, + Spaceship, + Relational, + Equality, + And, + Xor, + Ior, + AndIf, + OrIf, + Conditional, + Assign, + Comma, + Default, + }; + private: Kind K; + Prec Precedence : 6; + // FIXME: Make these protected. public: /// Tracks if this node has a component on its right side, in which case we /// need to call printRight. - Cache RHSComponentCache; + Cache RHSComponentCache : 2; /// Track if this node is a (possibly qualified) array type. This can affect /// how we format the output string. - Cache ArrayCache; + Cache ArrayCache : 2; /// Track if this node is a (possibly qualified) function type. This can /// affect how we format the output string. - Cache FunctionCache; + Cache FunctionCache : 2; public: - Node(Kind K_, Cache RHSComponentCache_ = Cache::No, - Cache ArrayCache_ = Cache::No, Cache FunctionCache_ = Cache::No) - : K(K_), RHSComponentCache(RHSComponentCache_), ArrayCache(ArrayCache_), - FunctionCache(FunctionCache_) {} + Node(Kind K_, Prec Precedence_ = Prec::Primary, + Cache RHSComponentCache_ = Cache::No, Cache ArrayCache_ = Cache::No, + Cache FunctionCache_ = Cache::No) + : K(K_), Precedence(Precedence_), RHSComponentCache(RHSComponentCache_), + ArrayCache(ArrayCache_), FunctionCache(FunctionCache_) {} + Node(Kind K_, Cache RHSComponentCache_, Cache ArrayCache_ = Cache::No, + Cache FunctionCache_ = Cache::No) + : Node(K_, Prec::Primary, RHSComponentCache_, ArrayCache_, + FunctionCache_) {} /// Visit the most-derived object corresponding to this object. template void visit(Fn F) const; @@ -299,6 +247,8 @@ public: Kind getKind() const { return K; } + Prec getPrecedence() const { return Precedence; } + virtual bool hasRHSComponentSlow(OutputBuffer &) const { return false; } virtual bool hasArraySlow(OutputBuffer &) const { return false; } virtual bool hasFunctionSlow(OutputBuffer &) const { return false; } @@ -307,6 +257,19 @@ public: // get at a node that actually represents some concrete syntax. virtual const Node *getSyntaxNode(OutputBuffer &) const { return this; } + // Print this node as an expression operand, surrounding it in parentheses if + // its precedence is [Strictly] weaker than P. + void printAsOperand(OutputBuffer &OB, Prec P = Prec::Default, + bool StrictlyWorse = false) const { + bool Paren = + unsigned(getPrecedence()) >= unsigned(P) + unsigned(StrictlyWorse); + if (Paren) + OB.printOpen(); + print(OB); + if (Paren) + OB.printClose(); + } + void print(OutputBuffer &OB) const { printLeft(OB); if (RHSComponentCache != Cache::No) @@ -356,7 +319,7 @@ public: if (!FirstElement) OB += ", "; size_t AfterComma = OB.getCurrentPosition(); - Elements[Idx]->print(OB); + Elements[Idx]->printAsOperand(OB, Node::Prec::Comma); // Elements[Idx] is an empty parameter pack expansion, we should erase the // comma we just printed. 
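
The Prec enum and printAsOperand above are the heart of this change: every expression node now carries its operator precedence, and an operand is wrapped in parentheses only when it binds more weakly than its context. The following minimal standalone sketch shows the same comparison in isolation; Expr, Lit, and Bin are illustrative stand-ins, not LLVM's classes, and only the `prec >= P + StrictlyWorse` test mirrors the logic in the hunk above.

#include <iostream>
#include <string>

// Sketch of precedence-driven parenthesization (not LLVM code).
enum class Prec { Primary, Multiplicative, Additive, Comma, Default };

struct Expr {
  virtual ~Expr() = default;
  virtual Prec prec() const { return Prec::Primary; }
  virtual void print(std::ostream &OS) const = 0;
  // Parenthesize when this node binds [strictly] weaker than the context P,
  // mirroring Node::printAsOperand above.
  void printAsOperand(std::ostream &OS, Prec P = Prec::Default,
                      bool StrictlyWorse = false) const {
    bool Paren = static_cast<unsigned>(prec()) >=
                 static_cast<unsigned>(P) + unsigned(StrictlyWorse);
    if (Paren)
      OS << '(';
    print(OS);
    if (Paren)
      OS << ')';
  }
};

struct Lit : Expr {
  std::string Val;
  explicit Lit(std::string V) : Val(std::move(V)) {}
  void print(std::ostream &OS) const override { OS << Val; }
};

struct Bin : Expr {
  const Expr &L, &R;
  const char *Op;
  Prec P;
  Bin(const Expr &L_, const char *Op_, const Expr &R_, Prec P_)
      : L(L_), R(R_), Op(Op_), P(P_) {}
  Prec prec() const override { return P; }
  void print(std::ostream &OS) const override {
    // Left-associative: an equal-precedence LHS needs no parens (strict
    // comparison), but an equal-precedence RHS does (non-strict).
    L.printAsOperand(OS, P, /*StrictlyWorse=*/true);
    OS << ' ' << Op << ' ';
    R.printAsOperand(OS, P, /*StrictlyWorse=*/false);
  }
};

int main() {
  Lit A("a"), B("b"), C("c");
  Bin Sum(A, "+", B, Prec::Additive);
  Bin Prod(Sum, "*", C, Prec::Multiplicative);
  Prod.printAsOperand(std::cout); // prints "(a + b) * c"
  std::cout << '\n';
  Bin Sum2(Sum, "+", C, Prec::Additive);
  Sum2.printAsOperand(std::cout); // prints "a + b + c", no redundant parens
  std::cout << '\n';
}

Because Prec::Default is the weakest context, printing a complete expression never adds an outer pair of parentheses, and left-associative chains such as a + b + c come out without the redundant grouping the old always-parenthesize code emitted.
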
@@ -494,7 +457,7 @@ class PostfixQualifiedType final : public Node { const StringView Postfix; public: - PostfixQualifiedType(Node *Ty_, StringView Postfix_) + PostfixQualifiedType(const Node *Ty_, StringView Postfix_) : Node(KPostfixQualifiedType), Ty(Ty_), Postfix(Postfix_) {} template void match(Fn F) const { F(Ty, Postfix); } @@ -519,6 +482,26 @@ public: void printLeft(OutputBuffer &OB) const override { OB += Name; } }; +class BitIntType final : public Node { + const Node *Size; + bool Signed; + +public: + BitIntType(const Node *Size_, bool Signed_) + : Node(KBitIntType), Size(Size_), Signed(Signed_) {} + + template void match(Fn F) const { F(Size, Signed); } + + void printLeft(OutputBuffer &OB) const override { + if (!Signed) + OB += "unsigned "; + OB += "_BitInt"; + OB.printOpen(); + Size->printAsOperand(OB); + OB.printClose(); + } +}; + class ElaboratedTypeSpefType : public Node { StringView Kind; Node *Child; @@ -693,7 +676,7 @@ public: void printLeft(OutputBuffer &OB) const override { if (Printing) return; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); std::pair Collapsed = collapse(OB); if (!Collapsed.second) return; @@ -708,7 +691,7 @@ public: void printRight(OutputBuffer &OB) const override { if (Printing) return; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); std::pair Collapsed = collapse(OB); if (!Collapsed.second) return; @@ -815,9 +798,9 @@ public: } void printRight(OutputBuffer &OB) const override { - OB += "("; + OB.printOpen(); Params.printWithComma(OB); - OB += ")"; + OB.printClose(); Ret->printRight(OB); if (CVQuals & QualConst) @@ -847,9 +830,10 @@ public: template void match(Fn F) const { F(E); } void printLeft(OutputBuffer &OB) const override { - OB += "noexcept("; - E->print(OB); - OB += ")"; + OB += "noexcept"; + OB.printOpen(); + E->printAsOperand(OB); + OB.printClose(); } }; @@ -862,9 +846,10 @@ public: template void match(Fn F) const { F(Types); } void printLeft(OutputBuffer &OB) const override { - OB += "throw("; + OB += "throw"; + OB.printOpen(); Types.printWithComma(OB); - OB += ')'; + OB.printClose(); } }; @@ -910,9 +895,9 @@ public: } void printRight(OutputBuffer &OB) const override { - OB += "("; + OB.printOpen(); Params.printWithComma(OB); - OB += ")"; + OB.printClose(); if (Ret) Ret->printRight(OB); @@ -1001,6 +986,46 @@ struct NestedName : Node { } }; +struct ModuleName : Node { + ModuleName *Parent; + Node *Name; + bool IsPartition; + + ModuleName(ModuleName *Parent_, Node *Name_, bool IsPartition_ = false) + : Node(KModuleName), Parent(Parent_), Name(Name_), + IsPartition(IsPartition_) {} + + template void match(Fn F) const { + F(Parent, Name, IsPartition); + } + + void printLeft(OutputBuffer &OB) const override { + if (Parent) + Parent->print(OB); + if (Parent || IsPartition) + OB += IsPartition ? 
':' : '.'; + Name->print(OB); + } +}; + +struct ModuleEntity : Node { + ModuleName *Module; + Node *Name; + + ModuleEntity(ModuleName *Module_, Node *Name_) + : Node(KModuleEntity), Module(Module_), Name(Name_) {} + + template void match(Fn F) const { F(Module, Name); } + + StringView getBaseName() const override { return Name->getBaseName(); } + + void printLeft(OutputBuffer &OB) const override { + Name->print(OB); + OB += '@'; + Module->print(OB); + } +}; + struct LocalName : Node { Node *Encoding; Node *Entity; @@ -1042,9 +1067,8 @@ class VectorType final : public Node { const Node *Dimension; public: - VectorType(const Node *BaseType_, Node *Dimension_) - : Node(KVectorType), BaseType(BaseType_), - Dimension(Dimension_) {} + VectorType(const Node *BaseType_, const Node *Dimension_) + : Node(KVectorType), BaseType(BaseType_), Dimension(Dimension_) {} template void match(Fn F) const { F(BaseType, Dimension); } @@ -1176,6 +1200,7 @@ public: template void match(Fn F) const { F(Name, Params); } void printLeft(OutputBuffer &OB) const override { + ScopedOverride LT(OB.GtIsGt, 0); OB += "template<"; Params.printWithComma(OB); OB += "> typename "; @@ -1311,8 +1336,8 @@ public: void printLeft(OutputBuffer &OB) const override { constexpr unsigned Max = std::numeric_limits::max(); - SwapAndRestore SavePackIdx(OB.CurrentPackIndex, Max); - SwapAndRestore SavePackMax(OB.CurrentPackMax, Max); + ScopedOverride SavePackIdx(OB.CurrentPackIndex, Max); + ScopedOverride SavePackMax(OB.CurrentPackMax, Max); size_t StreamPos = OB.getCurrentPosition(); // Print the first element in the pack. If Child contains a ParameterPack, @@ -1353,10 +1378,9 @@ public: NodeArray getParams() { return Params; } void printLeft(OutputBuffer &OB) const override { + ScopedOverride LT(OB.GtIsGt, 0); OB += "<"; Params.printWithComma(OB); - if (OB.back() == '>') - OB += " "; OB += ">"; } }; @@ -1402,38 +1426,38 @@ struct ForwardTemplateReference : Node { bool hasRHSComponentSlow(OutputBuffer &OB) const override { if (Printing) return false; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); return Ref->hasRHSComponent(OB); } bool hasArraySlow(OutputBuffer &OB) const override { if (Printing) return false; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); return Ref->hasArray(OB); } bool hasFunctionSlow(OutputBuffer &OB) const override { if (Printing) return false; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); return Ref->hasFunction(OB); } const Node *getSyntaxNode(OutputBuffer &OB) const override { if (Printing) return this; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); return Ref->getSyntaxNode(OB); } void printLeft(OutputBuffer &OB) const override { if (Printing) return; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); Ref->printLeft(OB); } void printRight(OutputBuffer &OB) const override { if (Printing) return; - SwapAndRestore SavePrinting(Printing, true); + ScopedOverride SavePrinting(Printing, true); Ref->printRight(OB); } }; @@ -1473,21 +1497,6 @@ public: } }; -struct StdQualifiedName : Node { - Node *Child; - - StdQualifiedName(Node *Child_) : Node(KStdQualifiedName), Child(Child_) {} - - template void match(Fn F) const { F(Child); } - - StringView getBaseName() const override { return Child->getBaseName(); } - - void printLeft(OutputBuffer &OB) const override { - OB += "std::"; - Child->print(OB); - 
} -}; - enum class SpecialSubKind { allocator, basic_string, @@ -1497,15 +1506,25 @@ enum class SpecialSubKind { iostream, }; -class ExpandedSpecialSubstitution final : public Node { +class SpecialSubstitution; +class ExpandedSpecialSubstitution : public Node { +protected: SpecialSubKind SSK; + ExpandedSpecialSubstitution(SpecialSubKind SSK_, Kind K_) + : Node(K_), SSK(SSK_) {} public: ExpandedSpecialSubstitution(SpecialSubKind SSK_) - : Node(KExpandedSpecialSubstitution), SSK(SSK_) {} + : ExpandedSpecialSubstitution(SSK_, KExpandedSpecialSubstitution) {} + inline ExpandedSpecialSubstitution(SpecialSubstitution const *); template void match(Fn F) const { F(SSK); } +protected: + bool isInstantiation() const { + return unsigned(SSK) >= unsigned(SpecialSubKind::string); + } + StringView getBaseName() const override { switch (SSK) { case SpecialSubKind::allocator: @@ -1524,82 +1543,44 @@ public: DEMANGLE_UNREACHABLE; } +private: void printLeft(OutputBuffer &OB) const override { - switch (SSK) { - case SpecialSubKind::allocator: - OB += "std::allocator"; - break; - case SpecialSubKind::basic_string: - OB += "std::basic_string"; - break; - case SpecialSubKind::string: - OB += "std::basic_string, " - "std::allocator >"; - break; - case SpecialSubKind::istream: - OB += "std::basic_istream >"; - break; - case SpecialSubKind::ostream: - OB += "std::basic_ostream >"; - break; - case SpecialSubKind::iostream: - OB += "std::basic_iostream >"; - break; + OB << "std::" << getBaseName(); + if (isInstantiation()) { + OB << ""; + if (SSK == SpecialSubKind::string) + OB << ", std::allocator"; + OB << ">"; } } }; -class SpecialSubstitution final : public Node { +class SpecialSubstitution final : public ExpandedSpecialSubstitution { public: - SpecialSubKind SSK; - SpecialSubstitution(SpecialSubKind SSK_) - : Node(KSpecialSubstitution), SSK(SSK_) {} + : ExpandedSpecialSubstitution(SSK_, KSpecialSubstitution) {} template void match(Fn F) const { F(SSK); } StringView getBaseName() const override { - switch (SSK) { - case SpecialSubKind::allocator: - return StringView("allocator"); - case SpecialSubKind::basic_string: - return StringView("basic_string"); - case SpecialSubKind::string: - return StringView("string"); - case SpecialSubKind::istream: - return StringView("istream"); - case SpecialSubKind::ostream: - return StringView("ostream"); - case SpecialSubKind::iostream: - return StringView("iostream"); + auto SV = ExpandedSpecialSubstitution::getBaseName (); + if (isInstantiation()) { + // The instantiations are typedefs that drop the "basic_" prefix. 
+ assert(SV.startsWith("basic_")); + SV = SV.dropFront(sizeof("basic_") - 1); } - DEMANGLE_UNREACHABLE; + return SV; } void printLeft(OutputBuffer &OB) const override { - switch (SSK) { - case SpecialSubKind::allocator: - OB += "std::allocator"; - break; - case SpecialSubKind::basic_string: - OB += "std::basic_string"; - break; - case SpecialSubKind::string: - OB += "std::string"; - break; - case SpecialSubKind::istream: - OB += "std::istream"; - break; - case SpecialSubKind::ostream: - OB += "std::ostream"; - break; - case SpecialSubKind::iostream: - OB += "std::iostream"; - break; - } + OB << "std::" << getBaseName(); } }; +inline ExpandedSpecialSubstitution::ExpandedSpecialSubstitution( + SpecialSubstitution const *SS) + : ExpandedSpecialSubstitution(SS->SSK) {} + class CtorDtorName final : public Node { const Node *Basename; const bool IsDtor; @@ -1665,13 +1646,14 @@ public: void printDeclarator(OutputBuffer &OB) const { if (!TemplateParams.empty()) { + ScopedOverride LT(OB.GtIsGt, 0); OB += "<"; TemplateParams.printWithComma(OB); OB += ">"; } - OB += "("; + OB.printOpen(); Params.printWithComma(OB); - OB += ")"; + OB.printClose(); } void printLeft(OutputBuffer &OB) const override { @@ -1691,9 +1673,9 @@ public: template void match(Fn F) const { F(Bindings); } void printLeft(OutputBuffer &OB) const override { - OB += '['; + OB.printOpen('['); Bindings.printWithComma(OB); - OB += ']'; + OB.printClose(']'); } }; @@ -1705,28 +1687,31 @@ class BinaryExpr : public Node { const Node *RHS; public: - BinaryExpr(const Node *LHS_, StringView InfixOperator_, const Node *RHS_) - : Node(KBinaryExpr), LHS(LHS_), InfixOperator(InfixOperator_), RHS(RHS_) { - } + BinaryExpr(const Node *LHS_, StringView InfixOperator_, const Node *RHS_, + Prec Prec_) + : Node(KBinaryExpr, Prec_), LHS(LHS_), InfixOperator(InfixOperator_), + RHS(RHS_) {} - template void match(Fn F) const { F(LHS, InfixOperator, RHS); } + template void match(Fn F) const { + F(LHS, InfixOperator, RHS, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - // might be a template argument expression, then we need to disambiguate - // with parens. - if (InfixOperator == ">") - OB += "("; - - OB += "("; - LHS->print(OB); - OB += ") "; + bool ParenAll = OB.isGtInsideTemplateArgs() && + (InfixOperator == ">" || InfixOperator == ">>"); + if (ParenAll) + OB.printOpen(); + // Assignment is right associative, with special LHS precedence. + bool IsAssign = getPrecedence() == Prec::Assign; + LHS->printAsOperand(OB, IsAssign ? 
Prec::OrIf : getPrecedence(), !IsAssign); + // No space before comma operator + if (!(InfixOperator == ",")) + OB += " "; OB += InfixOperator; - OB += " ("; - RHS->print(OB); - OB += ")"; - - if (InfixOperator == ">") - OB += ")"; + OB += " "; + RHS->printAsOperand(OB, getPrecedence(), IsAssign); + if (ParenAll) + OB.printClose(); } }; @@ -1735,17 +1720,18 @@ class ArraySubscriptExpr : public Node { const Node *Op2; public: - ArraySubscriptExpr(const Node *Op1_, const Node *Op2_) - : Node(KArraySubscriptExpr), Op1(Op1_), Op2(Op2_) {} + ArraySubscriptExpr(const Node *Op1_, const Node *Op2_, Prec Prec_) + : Node(KArraySubscriptExpr, Prec_), Op1(Op1_), Op2(Op2_) {} - template void match(Fn F) const { F(Op1, Op2); } + template void match(Fn F) const { + F(Op1, Op2, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - OB += "("; - Op1->print(OB); - OB += ")["; - Op2->print(OB); - OB += "]"; + Op1->printAsOperand(OB, getPrecedence()); + OB.printOpen('['); + Op2->printAsOperand(OB); + OB.printClose(']'); } }; @@ -1754,15 +1740,15 @@ class PostfixExpr : public Node { const StringView Operator; public: - PostfixExpr(const Node *Child_, StringView Operator_) - : Node(KPostfixExpr), Child(Child_), Operator(Operator_) {} + PostfixExpr(const Node *Child_, StringView Operator_, Prec Prec_) + : Node(KPostfixExpr, Prec_), Child(Child_), Operator(Operator_) {} - template void match(Fn F) const { F(Child, Operator); } + template void match(Fn F) const { + F(Child, Operator, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - OB += "("; - Child->print(OB); - OB += ")"; + Child->printAsOperand(OB, getPrecedence(), true); OB += Operator; } }; @@ -1773,19 +1759,20 @@ class ConditionalExpr : public Node { const Node *Else; public: - ConditionalExpr(const Node *Cond_, const Node *Then_, const Node *Else_) - : Node(KConditionalExpr), Cond(Cond_), Then(Then_), Else(Else_) {} + ConditionalExpr(const Node *Cond_, const Node *Then_, const Node *Else_, + Prec Prec_) + : Node(KConditionalExpr, Prec_), Cond(Cond_), Then(Then_), Else(Else_) {} - template void match(Fn F) const { F(Cond, Then, Else); } + template void match(Fn F) const { + F(Cond, Then, Else, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - OB += "("; - Cond->print(OB); - OB += ") ? ("; - Then->print(OB); - OB += ") : ("; - Else->print(OB); - OB += ")"; + Cond->printAsOperand(OB, getPrecedence()); + OB += " ? 
"; + Then->printAsOperand(OB); + OB += " : "; + Else->printAsOperand(OB, Prec::Assign, true); } }; @@ -1795,15 +1782,17 @@ class MemberExpr : public Node { const Node *RHS; public: - MemberExpr(const Node *LHS_, StringView Kind_, const Node *RHS_) - : Node(KMemberExpr), LHS(LHS_), Kind(Kind_), RHS(RHS_) {} + MemberExpr(const Node *LHS_, StringView Kind_, const Node *RHS_, Prec Prec_) + : Node(KMemberExpr, Prec_), LHS(LHS_), Kind(Kind_), RHS(RHS_) {} - template void match(Fn F) const { F(LHS, Kind, RHS); } + template void match(Fn F) const { + F(LHS, Kind, RHS, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - LHS->print(OB); + LHS->printAsOperand(OB, getPrecedence(), true); OB += Kind; - RHS->print(OB); + RHS->printAsOperand(OB, getPrecedence(), false); } }; @@ -1847,15 +1836,19 @@ class EnclosingExpr : public Node { const StringView Postfix; public: - EnclosingExpr(StringView Prefix_, Node *Infix_, StringView Postfix_) - : Node(KEnclosingExpr), Prefix(Prefix_), Infix(Infix_), - Postfix(Postfix_) {} + EnclosingExpr(StringView Prefix_, const Node *Infix_, + Prec Prec_ = Prec::Primary) + : Node(KEnclosingExpr, Prec_), Prefix(Prefix_), Infix(Infix_) {} - template void match(Fn F) const { F(Prefix, Infix, Postfix); } + template void match(Fn F) const { + F(Prefix, Infix, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { OB += Prefix; + OB.printOpen(); Infix->print(OB); + OB.printClose(); OB += Postfix; } }; @@ -1867,18 +1860,24 @@ class CastExpr : public Node { const Node *From; public: - CastExpr(StringView CastKind_, const Node *To_, const Node *From_) - : Node(KCastExpr), CastKind(CastKind_), To(To_), From(From_) {} + CastExpr(StringView CastKind_, const Node *To_, const Node *From_, Prec Prec_) + : Node(KCastExpr, Prec_), CastKind(CastKind_), To(To_), From(From_) {} - template void match(Fn F) const { F(CastKind, To, From); } + template void match(Fn F) const { + F(CastKind, To, From, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { OB += CastKind; - OB += "<"; - To->printLeft(OB); - OB += ">("; - From->printLeft(OB); - OB += ")"; + { + ScopedOverride LT(OB.GtIsGt, 0); + OB += "<"; + To->printLeft(OB); + OB += ">"; + } + OB.printOpen(); + From->printAsOperand(OB); + OB.printClose(); } }; @@ -1892,10 +1891,11 @@ public: template void match(Fn F) const { F(Pack); } void printLeft(OutputBuffer &OB) const override { - OB += "sizeof...("; + OB += "sizeof..."; + OB.printOpen(); ParameterPackExpansion PPE(Pack); PPE.printLeft(OB); - OB += ")"; + OB.printClose(); } }; @@ -1904,16 +1904,18 @@ class CallExpr : public Node { NodeArray Args; public: - CallExpr(const Node *Callee_, NodeArray Args_) - : Node(KCallExpr), Callee(Callee_), Args(Args_) {} + CallExpr(const Node *Callee_, NodeArray Args_, Prec Prec_) + : Node(KCallExpr, Prec_), Callee(Callee_), Args(Args_) {} - template void match(Fn F) const { F(Callee, Args); } + template void match(Fn F) const { + F(Callee, Args, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { Callee->print(OB); - OB += "("; + OB.printOpen(); Args.printWithComma(OB); - OB += ")"; + OB.printClose(); } }; @@ -1926,31 +1928,31 @@ class NewExpr : public Node { bool IsArray; // new[] ? 
public: NewExpr(NodeArray ExprList_, Node *Type_, NodeArray InitList_, bool IsGlobal_, - bool IsArray_) - : Node(KNewExpr), ExprList(ExprList_), Type(Type_), InitList(InitList_), - IsGlobal(IsGlobal_), IsArray(IsArray_) {} + bool IsArray_, Prec Prec_) + : Node(KNewExpr, Prec_), ExprList(ExprList_), Type(Type_), + InitList(InitList_), IsGlobal(IsGlobal_), IsArray(IsArray_) {} template void match(Fn F) const { - F(ExprList, Type, InitList, IsGlobal, IsArray); + F(ExprList, Type, InitList, IsGlobal, IsArray, getPrecedence()); } void printLeft(OutputBuffer &OB) const override { if (IsGlobal) - OB += "::operator "; + OB += "::"; OB += "new"; if (IsArray) OB += "[]"; - OB += ' '; if (!ExprList.empty()) { - OB += "("; + OB.printOpen(); ExprList.printWithComma(OB); - OB += ")"; + OB.printClose(); } + OB += " "; Type->print(OB); if (!InitList.empty()) { - OB += "("; + OB.printOpen(); InitList.printWithComma(OB); - OB += ")"; + OB.printClose(); } } }; @@ -1961,17 +1963,21 @@ class DeleteExpr : public Node { bool IsArray; public: - DeleteExpr(Node *Op_, bool IsGlobal_, bool IsArray_) - : Node(KDeleteExpr), Op(Op_), IsGlobal(IsGlobal_), IsArray(IsArray_) {} + DeleteExpr(Node *Op_, bool IsGlobal_, bool IsArray_, Prec Prec_) + : Node(KDeleteExpr, Prec_), Op(Op_), IsGlobal(IsGlobal_), + IsArray(IsArray_) {} - template void match(Fn F) const { F(Op, IsGlobal, IsArray); } + template void match(Fn F) const { + F(Op, IsGlobal, IsArray, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { if (IsGlobal) OB += "::"; OB += "delete"; if (IsArray) - OB += "[] "; + OB += "[]"; + OB += ' '; Op->print(OB); } }; @@ -1981,16 +1987,16 @@ class PrefixExpr : public Node { Node *Child; public: - PrefixExpr(StringView Prefix_, Node *Child_) - : Node(KPrefixExpr), Prefix(Prefix_), Child(Child_) {} + PrefixExpr(StringView Prefix_, Node *Child_, Prec Prec_) + : Node(KPrefixExpr, Prec_), Prefix(Prefix_), Child(Child_) {} - template void match(Fn F) const { F(Prefix, Child); } + template void match(Fn F) const { + F(Prefix, Child, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { OB += Prefix; - OB += "("; - Child->print(OB); - OB += ")"; + Child->printAsOperand(OB, getPrecedence()); } }; @@ -2013,17 +2019,20 @@ class ConversionExpr : public Node { NodeArray Expressions; public: - ConversionExpr(const Node *Type_, NodeArray Expressions_) - : Node(KConversionExpr), Type(Type_), Expressions(Expressions_) {} + ConversionExpr(const Node *Type_, NodeArray Expressions_, Prec Prec_) + : Node(KConversionExpr, Prec_), Type(Type_), Expressions(Expressions_) {} - template void match(Fn F) const { F(Type, Expressions); } + template void match(Fn F) const { + F(Type, Expressions, getPrecedence()); + } void printLeft(OutputBuffer &OB) const override { - OB += "("; + OB.printOpen(); Type->print(OB); - OB += ")("; + OB.printClose(); + OB.printOpen(); Expressions.printWithComma(OB); - OB += ")"; + OB.printClose(); } }; @@ -2034,18 +2043,21 @@ class PointerToMemberConversionExpr : public Node { public: PointerToMemberConversionExpr(const Node *Type_, const Node *SubExpr_, - StringView Offset_) - : Node(KPointerToMemberConversionExpr), Type(Type_), SubExpr(SubExpr_), - Offset(Offset_) {} + StringView Offset_, Prec Prec_) + : Node(KPointerToMemberConversionExpr, Prec_), Type(Type_), + SubExpr(SubExpr_), Offset(Offset_) {} - template void match(Fn F) const { F(Type, SubExpr, Offset); } + template void match(Fn F) const { + F(Type, SubExpr, Offset, getPrecedence()); + } void printLeft(OutputBuffer &OB) const 
override { - OB += "("; + OB.printOpen(); Type->print(OB); - OB += ")("; + OB.printClose(); + OB.printOpen(); SubExpr->print(OB); - OB += ")"; + OB.printClose(); } }; @@ -2131,41 +2143,33 @@ public: void printLeft(OutputBuffer &OB) const override { auto PrintPack = [&] { - OB += '('; + OB.printOpen(); ParameterPackExpansion(Pack).print(OB); - OB += ')'; + OB.printClose(); }; - OB += '('; - - if (IsLeftFold) { - // init op ... op pack - if (Init != nullptr) { - Init->print(OB); - OB += ' '; - OB += OperatorName; - OB += ' '; - } - // ... op pack - OB += "... "; - OB += OperatorName; - OB += ' '; - PrintPack(); - } else { // !IsLeftFold - // pack op ... - PrintPack(); - OB += ' '; - OB += OperatorName; - OB += " ..."; - // pack op ... op init - if (Init != nullptr) { - OB += ' '; - OB += OperatorName; - OB += ' '; - Init->print(OB); - } + OB.printOpen(); + // Either '[init op ]... op pack' or 'pack op ...[ op init]' + // Refactored to '[(init|pack) op ]...[ op (pack|init)]' + // Fold expr operands are cast-expressions + if (!IsLeftFold || Init != nullptr) { + // '(init|pack) op ' + if (IsLeftFold) + Init->printAsOperand(OB, Prec::Cast, true); + else + PrintPack(); + OB << " " << OperatorName << " "; + } + OB << "..."; + if (IsLeftFold || Init != nullptr) { + // ' op (init|pack)' + OB << " " << OperatorName << " "; + if (IsLeftFold) + PrintPack(); + else + Init->printAsOperand(OB, Prec::Cast, true); } - OB += ')'; + OB.printClose(); } }; @@ -2239,9 +2243,9 @@ public: template void match(Fn F) const { F(Ty, Integer); } void printLeft(OutputBuffer &OB) const override { - OB << "("; + OB.printOpen(); Ty->print(OB); - OB << ")"; + OB.printClose(); if (Integer[0] == 'n') OB << "-" << Integer.dropFront(1); @@ -2262,13 +2266,13 @@ public: void printLeft(OutputBuffer &OB) const override { if (Type.size() > 3) { - OB += "("; + OB.printOpen(); OB += Type; - OB += ")"; + OB.printClose(); } if (Value[0] == 'n') { - OB += "-"; + OB += '-'; OB += Value.dropFront(1); } else OB += Value; @@ -2344,24 +2348,22 @@ using LongDoubleLiteral = FloatLiteralImpl; template void Node::visit(Fn F) const { switch (K) { -#define CASE(X) case K ## X: return F(static_cast(this)); - FOR_EACH_NODE_KIND(CASE) -#undef CASE +#define NODE(X) \ + case K##X: \ + return F(static_cast(this)); +#include "ItaniumNodes.def" } assert(0 && "unknown mangling node kind"); } /// Determine the kind of a node from its type. template struct NodeKind; -#define SPECIALIZATION(X) \ - template<> struct NodeKind { \ - static constexpr Node::Kind Kind = Node::K##X; \ - static constexpr const char *name() { return #X; } \ +#define NODE(X) \ + template <> struct NodeKind { \ + static constexpr Node::Kind Kind = Node::K##X; \ + static constexpr const char *name() { return #X; } \ }; -FOR_EACH_NODE_KIND(SPECIALIZATION) -#undef SPECIALIZATION - -#undef FOR_EACH_NODE_KIND +#include "ItaniumNodes.def" template struct AbstractManglingParser { const char *First; @@ -2499,17 +2501,16 @@ template struct AbstractManglingParser { /// Parse the production. 
Node *parseExpr(); - Node *parsePrefixExpr(StringView Kind); - Node *parseBinaryExpr(StringView Kind); + Node *parsePrefixExpr(StringView Kind, Node::Prec Prec); + Node *parseBinaryExpr(StringView Kind, Node::Prec Prec); Node *parseIntegerLiteral(StringView Lit); Node *parseExprPrimary(); template Node *parseFloatingLiteral(); Node *parseFunctionParam(); - Node *parseNewExpr(); Node *parseConversionExpr(); Node *parseBracedExpr(); Node *parseFoldExpr(); - Node *parsePointerToMemberConversionExpr(); + Node *parsePointerToMemberConversionExpr(Node::Prec Prec); Node *parseSubobjectExpr(); /// Parse the production. @@ -2557,17 +2558,80 @@ template struct AbstractManglingParser { Node *parseName(NameState *State = nullptr); Node *parseLocalName(NameState *State); Node *parseOperatorName(NameState *State); - Node *parseUnqualifiedName(NameState *State); + bool parseModuleNameOpt(ModuleName *&Module); + Node *parseUnqualifiedName(NameState *State, Node *Scope, ModuleName *Module); Node *parseUnnamedTypeName(NameState *State); Node *parseSourceName(NameState *State); - Node *parseUnscopedName(NameState *State); + Node *parseUnscopedName(NameState *State, bool *isSubstName); Node *parseNestedName(NameState *State); Node *parseCtorDtorName(Node *&SoFar, NameState *State); Node *parseAbiTags(Node *N); + struct OperatorInfo { + enum OIKind : unsigned char { + Prefix, // Prefix unary: @ expr + Postfix, // Postfix unary: expr @ + Binary, // Binary: lhs @ rhs + Array, // Array index: lhs [ rhs ] + Member, // Member access: lhs @ rhs + New, // New + Del, // Delete + Call, // Function call: expr (expr*) + CCast, // C cast: (type)expr + Conditional, // Conditional: expr ? expr : expr + NameOnly, // Overload only, not allowed in expression. + // Below do not have operator names + NamedCast, // Named cast, @(expr) + OfIdOp, // alignof, sizeof, typeid + + Unnameable = NamedCast, + }; + char Enc[2]; // Encoding + OIKind Kind; // Kind of operator + bool Flag : 1; // Entry-specific flag + Node::Prec Prec : 7; // Precedence + const char *Name; // Spelling + + public: + constexpr OperatorInfo(const char (&E)[3], OIKind K, bool F, Node::Prec P, + const char *N) + : Enc{E[0], E[1]}, Kind{K}, Flag{F}, Prec{P}, Name{N} {} + + public: + bool operator<(const OperatorInfo &Other) const { + return *this < Other.Enc; + } + bool operator<(const char *Peek) const { + return Enc[0] < Peek[0] || (Enc[0] == Peek[0] && Enc[1] < Peek[1]); + } + bool operator==(const char *Peek) const { + return Enc[0] == Peek[0] && Enc[1] == Peek[1]; + } + bool operator!=(const char *Peek) const { return !this->operator==(Peek); } + + public: + StringView getSymbol() const { + StringView Res = Name; + if (Kind < Unnameable) { + assert(Res.startsWith("operator") && + "operator name does not start with 'operator'"); + Res = Res.dropFront(sizeof("operator") - 1); + Res.consumeFront(' '); + } + return Res; + } + StringView getName() const { return Name; } + OIKind getKind() const { return Kind; } + bool getFlag() const { return Flag; } + Node::Prec getPrecedence() const { return Prec; } + }; + static const OperatorInfo Ops[]; + static const size_t NumOps; + const OperatorInfo *parseOperatorEncoding(); + /// Parse the production. 
- Node *parseUnresolvedName(); + Node *parseUnresolvedName(bool Global); Node *parseSimpleId(); Node *parseBaseUnresolvedName(); Node *parseUnresolvedType(); @@ -2588,26 +2652,16 @@ const char* parse_discriminator(const char* first, const char* last); // ::= template Node *AbstractManglingParser::parseName(NameState *State) { - consumeIf('L'); // extension - if (look() == 'N') return getDerived().parseNestedName(State); if (look() == 'Z') return getDerived().parseLocalName(State); Node *Result = nullptr; - bool IsSubst = look() == 'S' && look(1) != 't'; - if (IsSubst) { - // A substitution must lead to: - // ::= - Result = getDerived().parseSubstitution(); - } else { - // An unscoped name can be one of: - // ::= - // ::= - Result = getDerived().parseUnscopedName(State); - } - if (Result == nullptr) + bool IsSubst = false; + + Result = getDerived().parseUnscopedName(State, &IsSubst); + if (!Result) return nullptr; if (look() == 'I') { @@ -2667,38 +2721,63 @@ Node *AbstractManglingParser::parseLocalName(NameState *State) { // ::= // ::= St # ::std:: -// extension ::= StL +// [*] extension template Node * -AbstractManglingParser::parseUnscopedName(NameState *State) { - bool IsStd = consumeIf("St"); - if (IsStd) - consumeIf('L'); +AbstractManglingParser::parseUnscopedName(NameState *State, + bool *IsSubst) { - Node *Result = getDerived().parseUnqualifiedName(State); - if (Result == nullptr) - return nullptr; - if (IsStd) - Result = make(Result); + Node *Std = nullptr; + if (consumeIf("St")) { + Std = make("std"); + if (Std == nullptr) + return nullptr; + } - return Result; + Node *Res = nullptr; + ModuleName *Module = nullptr; + if (look() == 'S') { + Node *S = getDerived().parseSubstitution(); + if (!S) + return nullptr; + if (S->getKind() == Node::KModuleName) + Module = static_cast(S); + else if (IsSubst && Std == nullptr) { + Res = S; + *IsSubst = true; + } else { + return nullptr; + } + } + + if (Res == nullptr || Std != nullptr) { + Res = getDerived().parseUnqualifiedName(State, Std, Module); + } + + return Res; } -// ::= [abi-tags] -// ::= -// ::= -// ::= -// ::= DC + E # structured binding declaration +// ::= [] L? [] +// ::= [] [] +// ::= [] L? [] +// ::= [] L? [] +// # structured binding declaration +// ::= [] L? DC + E template -Node * -AbstractManglingParser::parseUnqualifiedName(NameState *State) { - // s are special-cased in parseNestedName(). +Node *AbstractManglingParser::parseUnqualifiedName( + NameState *State, Node *Scope, ModuleName *Module) { + if (getDerived().parseModuleNameOpt(Module)) + return nullptr; + + consumeIf('L'); + Node *Result; - if (look() == 'U') - Result = getDerived().parseUnnamedTypeName(State); - else if (look() >= '1' && look() <= '9') + if (look() >= '1' && look() <= '9') { Result = getDerived().parseSourceName(State); - else if (consumeIf("DC")) { + } else if (look() == 'U') { + Result = getDerived().parseUnnamedTypeName(State); + } else if (consumeIf("DC")) { + // Structured binding size_t BindingsBegin = Names.size(); do { Node *Binding = getDerived().parseSourceName(State); @@ -2707,13 +2786,46 @@ AbstractManglingParser::parseUnqualifiedName(NameState *State) { Names.push_back(Binding); } while (!consumeIf('E')); Result = make(popTrailingNodeArray(BindingsBegin)); - } else + } else if (look() == 'C' || look() == 'D') { + // A . 
+ if (Scope == nullptr || Module != nullptr) + return nullptr; + Result = getDerived().parseCtorDtorName(Scope, State); + } else { Result = getDerived().parseOperatorName(State); + } + + if (Result != nullptr && Module != nullptr) + Result = make(Module, Result); if (Result != nullptr) Result = getDerived().parseAbiTags(Result); + if (Result != nullptr && Scope != nullptr) + Result = make(Scope, Result); + return Result; } +// ::= +// ::= +// ::= # passed in by caller +// ::= W +// ::= W P +template +bool AbstractManglingParser::parseModuleNameOpt( + ModuleName *&Module) { + while (consumeIf('W')) { + bool IsPartition = consumeIf('P'); + Node *Sub = getDerived().parseSourceName(nullptr); + if (!Sub) + return true; + Module = + static_cast(make(Module, Sub, IsPartition)); + Subs.push_back(Module); + } + + return false; +} + // ::= Ut [] _ // ::= // @@ -2735,7 +2847,7 @@ AbstractManglingParser::parseUnnamedTypeName(NameState *State) { return make(Count); } if (consumeIf("Ul")) { - SwapAndRestore SwapParams(ParsingLambdaParamsAtLevel, + ScopedOverride SwapParams(ParsingLambdaParamsAtLevel, TemplateParams.size()); ScopedTemplateParamList LambdaTemplateParams(this); @@ -2813,97 +2925,124 @@ Node *AbstractManglingParser::parseSourceName(NameState *) { return make(Name); } -// ::= aa # && -// ::= ad # & (unary) -// ::= an # & -// ::= aN # &= -// ::= aS # = -// ::= cl # () -// ::= cm # , -// ::= co # ~ -// ::= cv # (cast) -// ::= da # delete[] -// ::= de # * (unary) -// ::= dl # delete -// ::= dv # / -// ::= dV # /= -// ::= eo # ^ -// ::= eO # ^= -// ::= eq # == -// ::= ge # >= -// ::= gt # > -// ::= ix # [] -// ::= le # <= +// Operator encodings +template +const typename AbstractManglingParser< + Derived, Alloc>::OperatorInfo AbstractManglingParser::Ops[] = { + // Keep ordered by encoding + {"aN", OperatorInfo::Binary, false, Node::Prec::Assign, "operator&="}, + {"aS", OperatorInfo::Binary, false, Node::Prec::Assign, "operator="}, + {"aa", OperatorInfo::Binary, false, Node::Prec::AndIf, "operator&&"}, + {"ad", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator&"}, + {"an", OperatorInfo::Binary, false, Node::Prec::And, "operator&"}, + {"at", OperatorInfo::OfIdOp, /*Type*/ true, Node::Prec::Unary, "alignof "}, + {"aw", OperatorInfo::NameOnly, false, Node::Prec::Primary, + "operator co_await"}, + {"az", OperatorInfo::OfIdOp, /*Type*/ false, Node::Prec::Unary, "alignof "}, + {"cc", OperatorInfo::NamedCast, false, Node::Prec::Postfix, "const_cast"}, + {"cl", OperatorInfo::Call, false, Node::Prec::Postfix, "operator()"}, + {"cm", OperatorInfo::Binary, false, Node::Prec::Comma, "operator,"}, + {"co", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator~"}, + {"cv", OperatorInfo::CCast, false, Node::Prec::Cast, "operator"}, // C Cast + {"dV", OperatorInfo::Binary, false, Node::Prec::Assign, "operator/="}, + {"da", OperatorInfo::Del, /*Ary*/ true, Node::Prec::Unary, + "operator delete[]"}, + {"dc", OperatorInfo::NamedCast, false, Node::Prec::Postfix, "dynamic_cast"}, + {"de", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator*"}, + {"dl", OperatorInfo::Del, /*Ary*/ false, Node::Prec::Unary, + "operator delete"}, + {"ds", OperatorInfo::Member, /*Named*/ false, Node::Prec::PtrMem, + "operator.*"}, + {"dt", OperatorInfo::Member, /*Named*/ false, Node::Prec::Postfix, + "operator."}, + {"dv", OperatorInfo::Binary, false, Node::Prec::Assign, "operator/"}, + {"eO", OperatorInfo::Binary, false, Node::Prec::Assign, "operator^="}, + {"eo", OperatorInfo::Binary, false, Node::Prec::Xor, 
"operator^"}, + {"eq", OperatorInfo::Binary, false, Node::Prec::Equality, "operator=="}, + {"ge", OperatorInfo::Binary, false, Node::Prec::Relational, "operator>="}, + {"gt", OperatorInfo::Binary, false, Node::Prec::Relational, "operator>"}, + {"ix", OperatorInfo::Array, false, Node::Prec::Postfix, "operator[]"}, + {"lS", OperatorInfo::Binary, false, Node::Prec::Assign, "operator<<="}, + {"le", OperatorInfo::Binary, false, Node::Prec::Relational, "operator<="}, + {"ls", OperatorInfo::Binary, false, Node::Prec::Shift, "operator<<"}, + {"lt", OperatorInfo::Binary, false, Node::Prec::Relational, "operator<"}, + {"mI", OperatorInfo::Binary, false, Node::Prec::Assign, "operator-="}, + {"mL", OperatorInfo::Binary, false, Node::Prec::Assign, "operator*="}, + {"mi", OperatorInfo::Binary, false, Node::Prec::Additive, "operator-"}, + {"ml", OperatorInfo::Binary, false, Node::Prec::Multiplicative, + "operator*"}, + {"mm", OperatorInfo::Postfix, false, Node::Prec::Postfix, "operator--"}, + {"na", OperatorInfo::New, /*Ary*/ true, Node::Prec::Unary, + "operator new[]"}, + {"ne", OperatorInfo::Binary, false, Node::Prec::Equality, "operator!="}, + {"ng", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator-"}, + {"nt", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator!"}, + {"nw", OperatorInfo::New, /*Ary*/ false, Node::Prec::Unary, "operator new"}, + {"oR", OperatorInfo::Binary, false, Node::Prec::Assign, "operator|="}, + {"oo", OperatorInfo::Binary, false, Node::Prec::OrIf, "operator||"}, + {"or", OperatorInfo::Binary, false, Node::Prec::Ior, "operator|"}, + {"pL", OperatorInfo::Binary, false, Node::Prec::Assign, "operator+="}, + {"pl", OperatorInfo::Binary, false, Node::Prec::Additive, "operator+"}, + {"pm", OperatorInfo::Member, /*Named*/ false, Node::Prec::PtrMem, + "operator->*"}, + {"pp", OperatorInfo::Postfix, false, Node::Prec::Postfix, "operator++"}, + {"ps", OperatorInfo::Prefix, false, Node::Prec::Unary, "operator+"}, + {"pt", OperatorInfo::Member, /*Named*/ true, Node::Prec::Postfix, + "operator->"}, + {"qu", OperatorInfo::Conditional, false, Node::Prec::Conditional, + "operator?"}, + {"rM", OperatorInfo::Binary, false, Node::Prec::Assign, "operator%="}, + {"rS", OperatorInfo::Binary, false, Node::Prec::Assign, "operator>>="}, + {"rc", OperatorInfo::NamedCast, false, Node::Prec::Postfix, + "reinterpret_cast"}, + {"rm", OperatorInfo::Binary, false, Node::Prec::Multiplicative, + "operator%"}, + {"rs", OperatorInfo::Binary, false, Node::Prec::Shift, "operator>>"}, + {"sc", OperatorInfo::NamedCast, false, Node::Prec::Postfix, "static_cast"}, + {"ss", OperatorInfo::Binary, false, Node::Prec::Spaceship, "operator<=>"}, + {"st", OperatorInfo::OfIdOp, /*Type*/ true, Node::Prec::Unary, "sizeof "}, + {"sz", OperatorInfo::OfIdOp, /*Type*/ false, Node::Prec::Unary, "sizeof "}, + {"te", OperatorInfo::OfIdOp, /*Type*/ false, Node::Prec::Postfix, + "typeid "}, + {"ti", OperatorInfo::OfIdOp, /*Type*/ true, Node::Prec::Postfix, "typeid "}, +}; +template +const size_t AbstractManglingParser::NumOps = sizeof(Ops) / + sizeof(Ops[0]); + +// If the next 2 chars are an operator encoding, consume them and return their +// OperatorInfo. Otherwise return nullptr. 
+template +const typename AbstractManglingParser::OperatorInfo * +AbstractManglingParser::parseOperatorEncoding() { + if (numLeft() < 2) + return nullptr; + + auto Op = std::lower_bound( + &Ops[0], &Ops[NumOps], First, + [](const OperatorInfo &Op_, const char *Enc_) { return Op_ < Enc_; }); + if (Op == &Ops[NumOps] || *Op != First) + return nullptr; + + First += 2; + return Op; +} + +// ::= See parseOperatorEncoding() // ::= li # operator "" -// ::= ls # << -// ::= lS # <<= -// ::= lt # < -// ::= mi # - -// ::= mI # -= -// ::= ml # * -// ::= mL # *= -// ::= mm # -- (postfix in context) -// ::= na # new[] -// ::= ne # != -// ::= ng # - (unary) -// ::= nt # ! -// ::= nw # new -// ::= oo # || -// ::= or # | -// ::= oR # |= -// ::= pm # ->* -// ::= pl # + -// ::= pL # += -// ::= pp # ++ (postfix in context) -// ::= ps # + (unary) -// ::= pt # -> -// ::= qu # ? -// ::= rm # % -// ::= rM # %= -// ::= rs # >> -// ::= rS # >>= -// ::= ss # <=> C++2a -// ::= v # vendor extended operator +// ::= v # vendor extended operator template Node * AbstractManglingParser::parseOperatorName(NameState *State) { - switch (look()) { - case 'a': - switch (look(1)) { - case 'a': - First += 2; - return make("operator&&"); - case 'd': - case 'n': - First += 2; - return make("operator&"); - case 'N': - First += 2; - return make("operator&="); - case 'S': - First += 2; - return make("operator="); - } - return nullptr; - case 'c': - switch (look(1)) { - case 'l': - First += 2; - return make("operator()"); - case 'm': - First += 2; - return make("operator,"); - case 'o': - First += 2; - return make("operator~"); - // ::= cv # (cast) - case 'v': { - First += 2; - SwapAndRestore SaveTemplate(TryToParseTemplateArgs, false); + if (const auto *Op = parseOperatorEncoding()) { + if (Op->getKind() == OperatorInfo::CCast) { + // ::= cv # (cast) + ScopedOverride SaveTemplate(TryToParseTemplateArgs, false); // If we're parsing an encoding, State != nullptr and the conversion // operators' could have a that refers to some // s further ahead in the mangled name. - SwapAndRestore SavePermit(PermitForwardTemplateReferences, + ScopedOverride SavePermit(PermitForwardTemplateReferences, PermitForwardTemplateReferences || State != nullptr); Node *Ty = getDerived().parseType(); @@ -2912,185 +3051,29 @@ AbstractManglingParser::parseOperatorName(NameState *State) { if (State) State->CtorDtorConversion = true; return make(Ty); } - } - return nullptr; - case 'd': - switch (look(1)) { - case 'a': - First += 2; - return make("operator delete[]"); - case 'e': - First += 2; - return make("operator*"); - case 'l': - First += 2; - return make("operator delete"); - case 'v': - First += 2; - return make("operator/"); - case 'V': - First += 2; - return make("operator/="); - } - return nullptr; - case 'e': - switch (look(1)) { - case 'o': - First += 2; - return make("operator^"); - case 'O': - First += 2; - return make("operator^="); - case 'q': - First += 2; - return make("operator=="); - } - return nullptr; - case 'g': - switch (look(1)) { - case 'e': - First += 2; - return make("operator>="); - case 't': - First += 2; - return make("operator>"); - } - return nullptr; - case 'i': - if (look(1) == 'x') { - First += 2; - return make("operator[]"); - } - return nullptr; - case 'l': - switch (look(1)) { - case 'e': - First += 2; - return make("operator<="); + + if (Op->getKind() >= OperatorInfo::Unnameable) + /* Not a nameable operator. 
*/ + return nullptr; + if (Op->getKind() == OperatorInfo::Member && !Op->getFlag()) + /* Not a nameable MemberExpr */ + return nullptr; + + return make(Op->getName()); + } + + if (consumeIf("li")) { // ::= li # operator "" - case 'i': { - First += 2; - Node *SN = getDerived().parseSourceName(State); - if (SN == nullptr) - return nullptr; - return make(SN); - } - case 's': - First += 2; - return make("operator<<"); - case 'S': - First += 2; - return make("operator<<="); - case 't': - First += 2; - return make("operator<"); - } - return nullptr; - case 'm': - switch (look(1)) { - case 'i': - First += 2; - return make("operator-"); - case 'I': - First += 2; - return make("operator-="); - case 'l': - First += 2; - return make("operator*"); - case 'L': - First += 2; - return make("operator*="); - case 'm': - First += 2; - return make("operator--"); - } - return nullptr; - case 'n': - switch (look(1)) { - case 'a': - First += 2; - return make("operator new[]"); - case 'e': - First += 2; - return make("operator!="); - case 'g': - First += 2; - return make("operator-"); - case 't': - First += 2; - return make("operator!"); - case 'w': - First += 2; - return make("operator new"); - } - return nullptr; - case 'o': - switch (look(1)) { - case 'o': - First += 2; - return make("operator||"); - case 'r': - First += 2; - return make("operator|"); - case 'R': - First += 2; - return make("operator|="); - } - return nullptr; - case 'p': - switch (look(1)) { - case 'm': - First += 2; - return make("operator->*"); - case 'l': - First += 2; - return make("operator+"); - case 'L': - First += 2; - return make("operator+="); - case 'p': - First += 2; - return make("operator++"); - case 's': - First += 2; - return make("operator+"); - case 't': - First += 2; - return make("operator->"); - } - return nullptr; - case 'q': - if (look(1) == 'u') { - First += 2; - return make("operator?"); - } - return nullptr; - case 'r': - switch (look(1)) { - case 'm': - First += 2; - return make("operator%"); - case 'M': - First += 2; - return make("operator%="); - case 's': - First += 2; - return make("operator>>"); - case 'S': - First += 2; - return make("operator>>="); - } - return nullptr; - case 's': - if (look(1) == 's') { - First += 2; - return make("operator<=>"); - } - return nullptr; - // ::= v # vendor extended operator - case 'v': - if (std::isdigit(look(1))) { - First += 2; + Node *SN = getDerived().parseSourceName(State); + if (SN == nullptr) + return nullptr; + return make(SN); + } + + if (consumeIf('v')) { + // ::= v # vendor extended operator + if (look() >= '0' && look() <= '9') { + First++; Node *SN = getDerived().parseSourceName(State); if (SN == nullptr) return nullptr; @@ -3098,6 +3081,7 @@ AbstractManglingParser::parseOperatorName(NameState *State) { } return nullptr; } + return nullptr; } @@ -3116,19 +3100,11 @@ Node * AbstractManglingParser::parseCtorDtorName(Node *&SoFar, NameState *State) { if (SoFar->getKind() == Node::KSpecialSubstitution) { - auto SSK = static_cast(SoFar)->SSK; - switch (SSK) { - case SpecialSubKind::string: - case SpecialSubKind::istream: - case SpecialSubKind::ostream: - case SpecialSubKind::iostream: - SoFar = make(SSK); - if (!SoFar) - return nullptr; - break; - default: - break; - } + // Expand the special substitution. 
+ SoFar = make( + static_cast(SoFar)); + if (!SoFar) + return nullptr; } if (consumeIf('C')) { @@ -3157,8 +3133,10 @@ AbstractManglingParser::parseCtorDtorName(Node *&SoFar, return nullptr; } -// ::= N [] [] E -// ::= N [] [] E +// ::= N [] [] +// E +// ::= N [] [] +// E // // ::= // ::= @@ -3167,7 +3145,7 @@ AbstractManglingParser::parseCtorDtorName(Node *&SoFar, // ::= # empty // ::= // ::= -// extension ::= L +// [*] extension // // := [] M // @@ -3187,90 +3165,76 @@ AbstractManglingParser::parseNestedName(NameState *State) { if (State) State->ReferenceQualifier = FrefQualRValue; } else if (consumeIf('R')) { if (State) State->ReferenceQualifier = FrefQualLValue; - } else + } else { if (State) State->ReferenceQualifier = FrefQualNone; - - Node *SoFar = nullptr; - auto PushComponent = [&](Node *Comp) { - if (!Comp) return false; - if (SoFar) SoFar = make(SoFar, Comp); - else SoFar = Comp; - if (State) State->EndsWithTemplateArgs = false; - return SoFar != nullptr; - }; - - if (consumeIf("St")) { - SoFar = make("std"); - if (!SoFar) - return nullptr; } + Node *SoFar = nullptr; while (!consumeIf('E')) { - consumeIf('L'); // extension - - // := [] M - if (consumeIf('M')) { - if (SoFar == nullptr) - return nullptr; - continue; - } + if (State) + // Only set end-with-template on the case that does that. + State->EndsWithTemplateArgs = false; - // ::= if (look() == 'T') { - if (!PushComponent(getDerived().parseTemplateParam())) - return nullptr; - Subs.push_back(SoFar); - continue; - } - - // ::= - if (look() == 'I') { + // ::= + if (SoFar != nullptr) + return nullptr; // Cannot have a prefix. + SoFar = getDerived().parseTemplateParam(); + } else if (look() == 'I') { + // ::= + if (SoFar == nullptr) + return nullptr; // Must have a prefix. Node *TA = getDerived().parseTemplateArgs(State != nullptr); - if (TA == nullptr || SoFar == nullptr) - return nullptr; - SoFar = make(SoFar, TA); - if (!SoFar) - return nullptr; - if (State) State->EndsWithTemplateArgs = true; - Subs.push_back(SoFar); - continue; - } - - // ::= - if (look() == 'D' && (look(1) == 't' || look(1) == 'T')) { - if (!PushComponent(getDerived().parseDecltype())) + if (TA == nullptr) return nullptr; - Subs.push_back(SoFar); - continue; - } - - // ::= - if (look() == 'S' && look(1) != 't') { - Node *S = getDerived().parseSubstitution(); - if (!PushComponent(S)) + if (SoFar->getKind() == Node::KNameWithTemplateArgs) + // Semantically cannot be generated by a + // C++ entity. There will always be [something like] a name between + // them. return nullptr; - if (SoFar != S) - Subs.push_back(S); - continue; - } + if (State) + State->EndsWithTemplateArgs = true; + SoFar = make(SoFar, TA); + } else if (look() == 'D' && (look(1) == 't' || look(1) == 'T')) { + // ::= + if (SoFar != nullptr) + return nullptr; // Cannot have a prefix. + SoFar = getDerived().parseDecltype(); + } else { + ModuleName *Module = nullptr; + + if (look() == 'S') { + // ::= + Node *S = nullptr; + if (look(1) == 't') { + First += 2; + S = make("std"); + } else { + S = getDerived().parseSubstitution(); + } + if (!S) + return nullptr; + if (S->getKind() == Node::KModuleName) { + Module = static_cast(S); + } else if (SoFar != nullptr) { + return nullptr; // Cannot have a prefix. + } else { + SoFar = S; + continue; // Do not push a new substitution. + } + } - // Parse an thats actually a . 
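
Stepping back to parseOperatorEncoding at the top of this hunk: all of the deleted per-character switch cascades in this region collapse into one binary search over a statically sorted OperatorInfo table keyed on the two-character encoding. A minimal standalone sketch of that scheme follows; MiniOperatorInfo and its three-entry table are illustrative stand-ins, not the demangler's real types.

#include <algorithm>
#include <cassert>
#include <cstring>

// Entries are kept sorted by their two-character encoding so a binary
// search can replace a switch-on-two-characters cascade.
struct MiniOperatorInfo {
  char Enc[2];      // mangled encoding, e.g. "pl"
  const char *Name; // printed name, e.g. "operator+"
  bool operator<(const char *Peek) const {
    return Enc[0] < Peek[0] || (Enc[0] == Peek[0] && Enc[1] < Peek[1]);
  }
};

// Must be sorted by Enc (the real table is checked statically).
static const MiniOperatorInfo Ops[] = {
    {{'a', 'a'}, "operator&&"},
    {{'m', 'i'}, "operator-"},
    {{'p', 'l'}, "operator+"},
};

static const MiniOperatorInfo *lookupOperator(const char *First) {
  const auto *End = Ops + sizeof(Ops) / sizeof(Ops[0]);
  const auto *Op = std::lower_bound(
      Ops, End, First,
      [](const MiniOperatorInfo &O, const char *Peek) { return O < Peek; });
  if (Op == End || Op->Enc[0] != First[0] || Op->Enc[1] != First[1])
    return nullptr;
  return Op;
}

int main() {
  assert(std::strcmp(lookupOperator("pl")->Name, "operator+") == 0);
  assert(lookupOperator("zz") == nullptr);
  return 0;
}
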
- if (look() == 'C' || (look() == 'D' && look(1) != 'C')) { - if (SoFar == nullptr) - return nullptr; - if (!PushComponent(getDerived().parseCtorDtorName(SoFar, State))) - return nullptr; - SoFar = getDerived().parseAbiTags(SoFar); - if (SoFar == nullptr) - return nullptr; - Subs.push_back(SoFar); - continue; + // ::= [] + SoFar = getDerived().parseUnqualifiedName(State, SoFar, Module); } - // ::= - if (!PushComponent(getDerived().parseUnqualifiedName(State))) + if (SoFar == nullptr) return nullptr; Subs.push_back(SoFar); + + // No longer used. + // := [] M + consumeIf('M'); } if (SoFar == nullptr || Subs.empty()) @@ -3365,6 +3329,7 @@ Node *AbstractManglingParser::parseBaseUnresolvedName() { // ::= [gs] # x or (with "gs") ::x // ::= [gs] sr + E // # A::x, N::y, A::z; "gs" means leading "::" +// [gs] has been parsed by caller. // ::= sr # T::x / decltype(p)::x // extension ::= sr // # T::N::x /decltype(p)::N::x @@ -3372,7 +3337,7 @@ Node *AbstractManglingParser::parseBaseUnresolvedName() { // // ::= template -Node *AbstractManglingParser::parseUnresolvedName() { +Node *AbstractManglingParser::parseUnresolvedName(bool Global) { Node *SoFar = nullptr; // srN [] * E @@ -3406,8 +3371,6 @@ Node *AbstractManglingParser::parseUnresolvedName() { return make(SoFar, Base); } - bool Global = consumeIf("gs"); - // [gs] # x or (with "gs") ::x if (!consumeIf("sr")) { SoFar = getDerived().parseBaseUnresolvedName(); @@ -3637,7 +3600,7 @@ Node *AbstractManglingParser::parseDecltype() { return nullptr; if (!consumeIf('E')) return nullptr; - return make("decltype(", E, ")"); + return make("decltype", E); } // ::= A _ @@ -3723,8 +3686,8 @@ Node *AbstractManglingParser::parseQualifiedType() { StringView ProtoSourceName = Qual.dropFront(std::strlen("objcproto")); StringView Proto; { - SwapAndRestore SaveFirst(First, ProtoSourceName.begin()), - SaveLast(Last, ProtoSourceName.end()); + ScopedOverride SaveFirst(First, ProtoSourceName.begin()), + SaveLast(Last, ProtoSourceName.end()); Proto = parseBareSourceName(); } if (Proto.empty()) @@ -3929,6 +3892,22 @@ Node *AbstractManglingParser::parseType() { return nullptr; return make(DimensionNumber); } + // ::= DB _ # C23 signed _BitInt(N) + // ::= DB _ # C23 signed _BitInt(N) + // ::= DU _ # C23 unsigned _BitInt(N) + // ::= DU _ # C23 unsigned _BitInt(N) + case 'B': + case 'U': { + bool Signed = look(1) == 'B'; + First += 2; + Node *Size = std::isdigit(look()) ? make(parseNumber()) + : getDerived().parseExpr(); + if (!Size) + return nullptr; + if (!consumeIf('_')) + return nullptr; + return make(Size, Signed); + } // ::= Di # char32_t case 'i': First += 2; @@ -4077,8 +4056,9 @@ Node *AbstractManglingParser::parseType() { // ::= # See Compression below case 'S': { if (look(1) != 't') { - Result = getDerived().parseSubstitution(); - if (Result == nullptr) + bool IsSubst = false; + Result = getDerived().parseUnscopedName(nullptr, &IsSubst); + if (!Result) return nullptr; // Sub could be either of: @@ -4091,12 +4071,14 @@ Node *AbstractManglingParser::parseType() { // If this is followed by some , and we're permitted to // parse them, take the second production. - if (TryToParseTemplateArgs && look() == 'I') { + if (look() == 'I' && (!IsSubst || TryToParseTemplateArgs)) { + if (!IsSubst) + Subs.push_back(Result); Node *TA = getDerived().parseTemplateArgs(); if (TA == nullptr) return nullptr; Result = make(Result, TA); - } else { + } else if (IsSubst) { // If all we parsed was a substitution, don't re-insert into the // substitution table. 
return Result; @@ -4121,22 +4103,24 @@ Node *AbstractManglingParser::parseType() { } template -Node *AbstractManglingParser::parsePrefixExpr(StringView Kind) { +Node *AbstractManglingParser::parsePrefixExpr(StringView Kind, + Node::Prec Prec) { Node *E = getDerived().parseExpr(); if (E == nullptr) return nullptr; - return make(Kind, E); + return make(Kind, E, Prec); } template -Node *AbstractManglingParser::parseBinaryExpr(StringView Kind) { +Node *AbstractManglingParser::parseBinaryExpr(StringView Kind, + Node::Prec Prec) { Node *LHS = getDerived().parseExpr(); if (LHS == nullptr) return nullptr; Node *RHS = getDerived().parseExpr(); if (RHS == nullptr) return nullptr; - return make(LHS, Kind, RHS); + return make(LHS, Kind, RHS, Prec); } template @@ -4191,43 +4175,6 @@ Node *AbstractManglingParser::parseFunctionParam() { return nullptr; } -// [gs] nw * _ E # new (expr-list) type -// [gs] nw * _ # new (expr-list) type (init) -// [gs] na * _ E # new[] (expr-list) type -// [gs] na * _ # new[] (expr-list) type (init) -// ::= pi * E # parenthesized initialization -template -Node *AbstractManglingParser::parseNewExpr() { - bool Global = consumeIf("gs"); - bool IsArray = look(1) == 'a'; - if (!consumeIf("nw") && !consumeIf("na")) - return nullptr; - size_t Exprs = Names.size(); - while (!consumeIf('_')) { - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return nullptr; - Names.push_back(Ex); - } - NodeArray ExprList = popTrailingNodeArray(Exprs); - Node *Ty = getDerived().parseType(); - if (Ty == nullptr) - return Ty; - if (consumeIf("pi")) { - size_t InitsBegin = Names.size(); - while (!consumeIf('E')) { - Node *Init = getDerived().parseExpr(); - if (Init == nullptr) - return Init; - Names.push_back(Init); - } - NodeArray Inits = popTrailingNodeArray(InitsBegin); - return make(ExprList, Ty, Inits, Global, IsArray); - } else if (!consumeIf('E')) - return nullptr; - return make(ExprList, Ty, NodeArray(), Global, IsArray); -} - // cv # conversion with one argument // cv _ * E # conversion with a different number of arguments template @@ -4236,7 +4183,7 @@ Node *AbstractManglingParser::parseConversionExpr() { return nullptr; Node *Ty; { - SwapAndRestore SaveTemp(TryToParseTemplateArgs, false); + ScopedOverride SaveTemp(TryToParseTemplateArgs, false); Ty = getDerived().parseType(); } @@ -4353,7 +4300,7 @@ Node *AbstractManglingParser::parseExprPrimary() { return nullptr; } case 'D': - if (consumeIf("DnE")) + if (consumeIf("Dn") && (consumeIf('0'), consumeIf('E'))) return make("nullptr"); return nullptr; case 'T': @@ -4440,55 +4387,38 @@ Node *AbstractManglingParser::parseFoldExpr() { if (!consumeIf('f')) return nullptr; - char FoldKind = look(); - bool IsLeftFold, HasInitializer; - HasInitializer = FoldKind == 'L' || FoldKind == 'R'; - if (FoldKind == 'l' || FoldKind == 'L') - IsLeftFold = true; - else if (FoldKind == 'r' || FoldKind == 'R') - IsLeftFold = false; - else + bool IsLeftFold = false, HasInitializer = false; + switch (look()) { + default: return nullptr; + case 'L': + IsLeftFold = true; + HasInitializer = true; + break; + case 'R': + HasInitializer = true; + break; + case 'l': + IsLeftFold = true; + break; + case 'r': + break; + } ++First; - // FIXME: This map is duplicated in parseOperatorName and parseExpr. 
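
The FIXME above is one of the things the shared OperatorInfo table eliminates: parseFoldExpr now consults the same encoding table as parseOperatorName and parseExpr instead of a third hand-written operator map. The fold-kind character itself decodes as in this standalone sketch; the type and function names here are illustrative only.

#include <cassert>

// fl: (... op pack)           fr: (pack op ...)
// fL: (init op ... op pack)   fR: (pack op ... op init)
struct FoldKind {
  bool IsLeftFold;
  bool HasInitializer;
  bool Valid;
};

inline FoldKind decodeFoldKind(char C) {
  switch (C) {
  case 'l': return {true, false, true};
  case 'r': return {false, false, true};
  case 'L': return {true, true, true};
  case 'R': return {false, true, true};
  default:  return {false, false, false};
  }
}

int main() {
  assert(decodeFoldKind('L').IsLeftFold && decodeFoldKind('L').HasInitializer);
  assert(decodeFoldKind('r').Valid && !decodeFoldKind('r').HasInitializer);
  assert(!decodeFoldKind('x').Valid);
  return 0;
}
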
- StringView OperatorName; - if (consumeIf("aa")) OperatorName = "&&"; - else if (consumeIf("an")) OperatorName = "&"; - else if (consumeIf("aN")) OperatorName = "&="; - else if (consumeIf("aS")) OperatorName = "="; - else if (consumeIf("cm")) OperatorName = ","; - else if (consumeIf("ds")) OperatorName = ".*"; - else if (consumeIf("dv")) OperatorName = "/"; - else if (consumeIf("dV")) OperatorName = "/="; - else if (consumeIf("eo")) OperatorName = "^"; - else if (consumeIf("eO")) OperatorName = "^="; - else if (consumeIf("eq")) OperatorName = "=="; - else if (consumeIf("ge")) OperatorName = ">="; - else if (consumeIf("gt")) OperatorName = ">"; - else if (consumeIf("le")) OperatorName = "<="; - else if (consumeIf("ls")) OperatorName = "<<"; - else if (consumeIf("lS")) OperatorName = "<<="; - else if (consumeIf("lt")) OperatorName = "<"; - else if (consumeIf("mi")) OperatorName = "-"; - else if (consumeIf("mI")) OperatorName = "-="; - else if (consumeIf("ml")) OperatorName = "*"; - else if (consumeIf("mL")) OperatorName = "*="; - else if (consumeIf("ne")) OperatorName = "!="; - else if (consumeIf("oo")) OperatorName = "||"; - else if (consumeIf("or")) OperatorName = "|"; - else if (consumeIf("oR")) OperatorName = "|="; - else if (consumeIf("pl")) OperatorName = "+"; - else if (consumeIf("pL")) OperatorName = "+="; - else if (consumeIf("rm")) OperatorName = "%"; - else if (consumeIf("rM")) OperatorName = "%="; - else if (consumeIf("rs")) OperatorName = ">>"; - else if (consumeIf("rS")) OperatorName = ">>="; - else return nullptr; - - Node *Pack = getDerived().parseExpr(), *Init = nullptr; + const auto *Op = parseOperatorEncoding(); + if (!Op) + return nullptr; + if (!(Op->getKind() == OperatorInfo::Binary + || (Op->getKind() == OperatorInfo::Member + && Op->getName().back() == '*'))) + return nullptr; + + Node *Pack = getDerived().parseExpr(); if (Pack == nullptr) return nullptr; + + Node *Init = nullptr; if (HasInitializer) { Init = getDerived().parseExpr(); if (Init == nullptr) @@ -4498,14 +4428,16 @@ Node *AbstractManglingParser::parseFoldExpr() { if (IsLeftFold && Init) std::swap(Pack, Init); - return make(IsLeftFold, OperatorName, Pack, Init); + return make(IsLeftFold, Op->getSymbol(), Pack, Init); } // ::= mc [] E // // Not yet in the spec: https://github.com/itanium-cxx-abi/cxx-abi/issues/47 template -Node *AbstractManglingParser::parsePointerToMemberConversionExpr() { +Node * +AbstractManglingParser::parsePointerToMemberConversionExpr( + Node::Prec Prec) { Node *Ty = getDerived().parseType(); if (!Ty) return nullptr; @@ -4515,7 +4447,7 @@ Node *AbstractManglingParser::parsePointerToMemberConversionExpr StringView Offset = getDerived().parseNumber(true); if (!consumeIf('E')) return nullptr; - return make(Ty, Expr, Offset); + return make(Ty, Expr, Offset, Prec); } // ::= so [] * [p] E @@ -4592,316 +4524,127 @@ Node *AbstractManglingParser::parseSubobjectExpr() { template Node *AbstractManglingParser::parseExpr() { bool Global = consumeIf("gs"); - if (numLeft() < 2) - return nullptr; - switch (*First) { - case 'L': - return getDerived().parseExprPrimary(); - case 'T': - return getDerived().parseTemplateParam(); - case 'f': { - // Disambiguate a fold expression from a . 
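
Stepping back to the parsePrefixExpr/parseBinaryExpr signatures above: the reason a Node::Prec now rides along is so the printer can decide parenthesization structurally, instead of baking defensive parentheses into every operator string. A toy model of that rule, with made-up precedence levels rather than the demangler's actual Node::Prec values:

#include <cassert>
#include <string>

enum class Prec { Primary = 0, Multiplicative, Additive, Assign };

struct Expr {
  std::string Text;
  Prec P;
};

// Parenthesize only when the child binds more loosely than its context.
static std::string printChild(const Expr &Child, Prec Parent) {
  if (Child.P > Parent)
    return "(" + Child.Text + ")";
  return Child.Text;
}

int main() {
  Expr Sum{"a + b", Prec::Additive};
  // "a + b" as an operand of "*" must be wrapped; as an operand of "="
  // it need not be.
  assert(printChild(Sum, Prec::Multiplicative) == "(a + b)");
  assert(printChild(Sum, Prec::Assign) == "a + b");
  return 0;
}
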
- if (look(1) == 'p' || (look(1) == 'L' && std::isdigit(look(2)))) - return getDerived().parseFunctionParam(); - return getDerived().parseFoldExpr(); - } - case 'a': - switch (First[1]) { - case 'a': - First += 2; - return getDerived().parseBinaryExpr("&&"); - case 'd': - First += 2; - return getDerived().parsePrefixExpr("&"); - case 'n': - First += 2; - return getDerived().parseBinaryExpr("&"); - case 'N': - First += 2; - return getDerived().parseBinaryExpr("&="); - case 'S': - First += 2; - return getDerived().parseBinaryExpr("="); - case 't': { - First += 2; - Node *Ty = getDerived().parseType(); - if (Ty == nullptr) + const auto *Op = parseOperatorEncoding(); + if (Op) { + auto Sym = Op->getSymbol(); + switch (Op->getKind()) { + case OperatorInfo::Binary: + // Binary operator: lhs @ rhs + return getDerived().parseBinaryExpr(Sym, Op->getPrecedence()); + case OperatorInfo::Prefix: + // Prefix unary operator: @ expr + return getDerived().parsePrefixExpr(Sym, Op->getPrecedence()); + case OperatorInfo::Postfix: { + // Postfix unary operator: expr @ + if (consumeIf('_')) + return getDerived().parsePrefixExpr(Sym, Op->getPrecedence()); + Node *Ex = getDerived().parseExpr(); + if (Ex == nullptr) return nullptr; - return make("alignof (", Ty, ")"); + return make(Ex, Sym, Op->getPrecedence()); } - case 'z': { - First += 2; - Node *Ty = getDerived().parseExpr(); - if (Ty == nullptr) + case OperatorInfo::Array: { + // Array Index: lhs [ rhs ] + Node *Base = getDerived().parseExpr(); + if (Base == nullptr) return nullptr; - return make("alignof (", Ty, ")"); - } - } - return nullptr; - case 'c': - switch (First[1]) { - // cc # const_cast(expression) - case 'c': { - First += 2; - Node *Ty = getDerived().parseType(); - if (Ty == nullptr) - return Ty; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make("const_cast", Ty, Ex); + Node *Index = getDerived().parseExpr(); + if (Index == nullptr) + return nullptr; + return make(Base, Index, Op->getPrecedence()); } - // cl + E # call - case 'l': { - First += 2; - Node *Callee = getDerived().parseExpr(); - if (Callee == nullptr) - return Callee; - size_t ExprsBegin = Names.size(); - while (!consumeIf('E')) { - Node *E = getDerived().parseExpr(); - if (E == nullptr) - return E; - Names.push_back(E); - } - return make(Callee, popTrailingNodeArray(ExprsBegin)); - } - case 'm': - First += 2; - return getDerived().parseBinaryExpr(","); - case 'o': - First += 2; - return getDerived().parsePrefixExpr("~"); - case 'v': - return getDerived().parseConversionExpr(); - } - return nullptr; - case 'd': - switch (First[1]) { - case 'a': { - First += 2; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make(Ex, Global, /*is_array=*/true); - } - case 'c': { - First += 2; - Node *T = getDerived().parseType(); - if (T == nullptr) - return T; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make("dynamic_cast", T, Ex); - } - case 'e': - First += 2; - return getDerived().parsePrefixExpr("*"); - case 'l': { - First += 2; - Node *E = getDerived().parseExpr(); - if (E == nullptr) - return E; - return make(E, Global, /*is_array=*/false); - } - case 'n': - return getDerived().parseUnresolvedName(); - case 's': { - First += 2; + case OperatorInfo::Member: { + // Member access lhs @ rhs Node *LHS = getDerived().parseExpr(); if (LHS == nullptr) return nullptr; Node *RHS = getDerived().parseExpr(); if (RHS == nullptr) return nullptr; - return make(LHS, ".*", RHS); - } - case 't': { - First 
+= 2; - Node *LHS = getDerived().parseExpr(); - if (LHS == nullptr) - return LHS; - Node *RHS = getDerived().parseExpr(); - if (RHS == nullptr) - return nullptr; - return make(LHS, ".", RHS); - } - case 'v': - First += 2; - return getDerived().parseBinaryExpr("/"); - case 'V': - First += 2; - return getDerived().parseBinaryExpr("/="); - } - return nullptr; - case 'e': - switch (First[1]) { - case 'o': - First += 2; - return getDerived().parseBinaryExpr("^"); - case 'O': - First += 2; - return getDerived().parseBinaryExpr("^="); - case 'q': - First += 2; - return getDerived().parseBinaryExpr("=="); - } - return nullptr; - case 'g': - switch (First[1]) { - case 'e': - First += 2; - return getDerived().parseBinaryExpr(">="); - case 't': - First += 2; - return getDerived().parseBinaryExpr(">"); - } - return nullptr; - case 'i': - switch (First[1]) { - case 'x': { - First += 2; - Node *Base = getDerived().parseExpr(); - if (Base == nullptr) + return make(LHS, Sym, RHS, Op->getPrecedence()); + } + case OperatorInfo::New: { + // New + // # new (expr-list) type [(init)] + // [gs] nw * _ [pi *] E + // # new[] (expr-list) type [(init)] + // [gs] na * _ [pi *] E + size_t Exprs = Names.size(); + while (!consumeIf('_')) { + Node *Ex = getDerived().parseExpr(); + if (Ex == nullptr) + return nullptr; + Names.push_back(Ex); + } + NodeArray ExprList = popTrailingNodeArray(Exprs); + Node *Ty = getDerived().parseType(); + if (Ty == nullptr) return nullptr; - Node *Index = getDerived().parseExpr(); - if (Index == nullptr) - return Index; - return make(Base, Index); - } - case 'l': { - First += 2; + bool HaveInits = consumeIf("pi"); size_t InitsBegin = Names.size(); while (!consumeIf('E')) { - Node *E = getDerived().parseBracedExpr(); - if (E == nullptr) + if (!HaveInits) return nullptr; - Names.push_back(E); + Node *Init = getDerived().parseExpr(); + if (Init == nullptr) + return Init; + Names.push_back(Init); } - return make(nullptr, popTrailingNodeArray(InitsBegin)); + NodeArray Inits = popTrailingNodeArray(InitsBegin); + return make(ExprList, Ty, Inits, Global, + /*IsArray=*/Op->getFlag(), Op->getPrecedence()); } - } - return nullptr; - case 'l': - switch (First[1]) { - case 'e': - First += 2; - return getDerived().parseBinaryExpr("<="); - case 's': - First += 2; - return getDerived().parseBinaryExpr("<<"); - case 'S': - First += 2; - return getDerived().parseBinaryExpr("<<="); - case 't': - First += 2; - return getDerived().parseBinaryExpr("<"); - } - return nullptr; - case 'm': - switch (First[1]) { - case 'c': - First += 2; - return parsePointerToMemberConversionExpr(); - case 'i': - First += 2; - return getDerived().parseBinaryExpr("-"); - case 'I': - First += 2; - return getDerived().parseBinaryExpr("-="); - case 'l': - First += 2; - return getDerived().parseBinaryExpr("*"); - case 'L': - First += 2; - return getDerived().parseBinaryExpr("*="); - case 'm': - First += 2; - if (consumeIf('_')) - return getDerived().parsePrefixExpr("--"); + case OperatorInfo::Del: { + // Delete Node *Ex = getDerived().parseExpr(); if (Ex == nullptr) return nullptr; - return make(Ex, "--"); - } - return nullptr; - case 'n': - switch (First[1]) { - case 'a': - case 'w': - return getDerived().parseNewExpr(); - case 'e': - First += 2; - return getDerived().parseBinaryExpr("!="); - case 'g': - First += 2; - return getDerived().parsePrefixExpr("-"); - case 't': - First += 2; - return getDerived().parsePrefixExpr("!"); - case 'x': - First += 2; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return 
make("noexcept (", Ex, ")"); - } - return nullptr; - case 'o': - switch (First[1]) { - case 'n': - return getDerived().parseUnresolvedName(); - case 'o': - First += 2; - return getDerived().parseBinaryExpr("||"); - case 'r': - First += 2; - return getDerived().parseBinaryExpr("|"); - case 'R': - First += 2; - return getDerived().parseBinaryExpr("|="); + return make(Ex, Global, /*IsArray=*/Op->getFlag(), + Op->getPrecedence()); } - return nullptr; - case 'p': - switch (First[1]) { - case 'm': - First += 2; - return getDerived().parseBinaryExpr("->*"); - case 'l': - First += 2; - return getDerived().parseBinaryExpr("+"); - case 'L': - First += 2; - return getDerived().parseBinaryExpr("+="); - case 'p': { - First += 2; - if (consumeIf('_')) - return getDerived().parsePrefixExpr("++"); - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make(Ex, "++"); + case OperatorInfo::Call: { + // Function Call + Node *Callee = getDerived().parseExpr(); + if (Callee == nullptr) + return nullptr; + size_t ExprsBegin = Names.size(); + while (!consumeIf('E')) { + Node *E = getDerived().parseExpr(); + if (E == nullptr) + return nullptr; + Names.push_back(E); + } + return make(Callee, popTrailingNodeArray(ExprsBegin), + Op->getPrecedence()); } - case 's': - First += 2; - return getDerived().parsePrefixExpr("+"); - case 't': { - First += 2; - Node *L = getDerived().parseExpr(); - if (L == nullptr) + case OperatorInfo::CCast: { + // C Cast: (type)expr + Node *Ty; + { + ScopedOverride SaveTemp(TryToParseTemplateArgs, false); + Ty = getDerived().parseType(); + } + if (Ty == nullptr) return nullptr; - Node *R = getDerived().parseExpr(); - if (R == nullptr) + + size_t ExprsBegin = Names.size(); + bool IsMany = consumeIf('_'); + while (!consumeIf('E')) { + Node *E = getDerived().parseExpr(); + if (E == nullptr) + return E; + Names.push_back(E); + if (!IsMany) + break; + } + NodeArray Exprs = popTrailingNodeArray(ExprsBegin); + if (!IsMany && Exprs.size() != 1) return nullptr; - return make(L, "->", R); + return make(Ty, Exprs, Op->getPrecedence()); } - } - return nullptr; - case 'q': - if (First[1] == 'u') { - First += 2; + case OperatorInfo::Conditional: { + // Conditional operator: expr ? 
expr : expr Node *Cond = getDerived().parseExpr(); if (Cond == nullptr) return nullptr; @@ -4911,147 +4654,120 @@ Node *AbstractManglingParser::parseExpr() { Node *RHS = getDerived().parseExpr(); if (RHS == nullptr) return nullptr; - return make(Cond, LHS, RHS); - } - return nullptr; - case 'r': - switch (First[1]) { - case 'c': { - First += 2; - Node *T = getDerived().parseType(); - if (T == nullptr) - return T; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make("reinterpret_cast", T, Ex); - } - case 'm': - First += 2; - return getDerived().parseBinaryExpr("%"); - case 'M': - First += 2; - return getDerived().parseBinaryExpr("%="); - case 's': - First += 2; - return getDerived().parseBinaryExpr(">>"); - case 'S': - First += 2; - return getDerived().parseBinaryExpr(">>="); - } - return nullptr; - case 's': - switch (First[1]) { - case 'c': { - First += 2; - Node *T = getDerived().parseType(); - if (T == nullptr) - return T; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make("static_cast", T, Ex); - } - case 'o': - First += 2; - return parseSubobjectExpr(); - case 'p': { - First += 2; - Node *Child = getDerived().parseExpr(); - if (Child == nullptr) - return nullptr; - return make(Child); + return make(Cond, LHS, RHS, Op->getPrecedence()); } - case 'r': - return getDerived().parseUnresolvedName(); - case 't': { - First += 2; + case OperatorInfo::NamedCast: { + // Named cast operation, @(expr) Node *Ty = getDerived().parseType(); if (Ty == nullptr) - return Ty; - return make("sizeof (", Ty, ")"); - } - case 'z': { - First += 2; + return nullptr; Node *Ex = getDerived().parseExpr(); if (Ex == nullptr) - return Ex; - return make("sizeof (", Ex, ")"); + return nullptr; + return make(Sym, Ty, Ex, Op->getPrecedence()); } - case 'Z': - First += 2; - if (look() == 'T') { - Node *R = getDerived().parseTemplateParam(); - if (R == nullptr) - return nullptr; - return make(R); - } else if (look() == 'f') { - Node *FP = getDerived().parseFunctionParam(); - if (FP == nullptr) - return nullptr; - return make("sizeof... (", FP, ")"); - } - return nullptr; - case 'P': { - First += 2; - size_t ArgsBegin = Names.size(); - while (!consumeIf('E')) { - Node *Arg = getDerived().parseTemplateArg(); - if (Arg == nullptr) - return nullptr; - Names.push_back(Arg); - } - auto *Pack = make(popTrailingNodeArray(ArgsBegin)); - if (!Pack) + case OperatorInfo::OfIdOp: { + // [sizeof/alignof/typeid] ( | ) + Node *Arg = + Op->getFlag() ? getDerived().parseType() : getDerived().parseExpr(); + if (!Arg) return nullptr; - return make("sizeof... (", Pack, ")"); + return make(Sym, Arg, Op->getPrecedence()); } + case OperatorInfo::NameOnly: { + // Not valid as an expression operand. + return nullptr; } - return nullptr; - case 't': - switch (First[1]) { - case 'e': { - First += 2; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) - return Ex; - return make("typeid (", Ex, ")"); } - case 'i': { - First += 2; - Node *Ty = getDerived().parseType(); - if (Ty == nullptr) - return Ty; - return make("typeid (", Ty, ")"); + DEMANGLE_UNREACHABLE; + } + + if (numLeft() < 2) + return nullptr; + + if (look() == 'L') + return getDerived().parseExprPrimary(); + if (look() == 'T') + return getDerived().parseTemplateParam(); + if (look() == 'f') { + // Disambiguate a fold expression from a . 
+ if (look(1) == 'p' || (look(1) == 'L' && std::isdigit(look(2)))) + return getDerived().parseFunctionParam(); + return getDerived().parseFoldExpr(); + } + if (consumeIf("il")) { + size_t InitsBegin = Names.size(); + while (!consumeIf('E')) { + Node *E = getDerived().parseBracedExpr(); + if (E == nullptr) + return nullptr; + Names.push_back(E); } - case 'l': { - First += 2; - Node *Ty = getDerived().parseType(); - if (Ty == nullptr) + return make(nullptr, popTrailingNodeArray(InitsBegin)); + } + if (consumeIf("mc")) + return parsePointerToMemberConversionExpr(Node::Prec::Unary); + if (consumeIf("nx")) { + Node *Ex = getDerived().parseExpr(); + if (Ex == nullptr) + return Ex; + return make("noexcept ", Ex, Node::Prec::Unary); + } + if (consumeIf("so")) + return parseSubobjectExpr(); + if (consumeIf("sp")) { + Node *Child = getDerived().parseExpr(); + if (Child == nullptr) + return nullptr; + return make(Child); + } + if (consumeIf("sZ")) { + if (look() == 'T') { + Node *R = getDerived().parseTemplateParam(); + if (R == nullptr) return nullptr; - size_t InitsBegin = Names.size(); - while (!consumeIf('E')) { - Node *E = getDerived().parseBracedExpr(); - if (E == nullptr) - return nullptr; - Names.push_back(E); - } - return make(Ty, popTrailingNodeArray(InitsBegin)); + return make(R); } - case 'r': - First += 2; - return make("throw"); - case 'w': { - First += 2; - Node *Ex = getDerived().parseExpr(); - if (Ex == nullptr) + Node *FP = getDerived().parseFunctionParam(); + if (FP == nullptr) + return nullptr; + return make("sizeof... ", FP); + } + if (consumeIf("sP")) { + size_t ArgsBegin = Names.size(); + while (!consumeIf('E')) { + Node *Arg = getDerived().parseTemplateArg(); + if (Arg == nullptr) return nullptr; - return make(Ex); + Names.push_back(Arg); } + auto *Pack = make(popTrailingNodeArray(ArgsBegin)); + if (!Pack) + return nullptr; + return make("sizeof... ", Pack); + } + if (consumeIf("tl")) { + Node *Ty = getDerived().parseType(); + if (Ty == nullptr) + return nullptr; + size_t InitsBegin = Names.size(); + while (!consumeIf('E')) { + Node *E = getDerived().parseBracedExpr(); + if (E == nullptr) + return nullptr; + Names.push_back(E); } - return nullptr; - case 'u': { - ++First; + return make(Ty, popTrailingNodeArray(InitsBegin)); + } + if (consumeIf("tr")) + return make("throw"); + if (consumeIf("tw")) { + Node *Ex = getDerived().parseExpr(); + if (Ex == nullptr) + return nullptr; + return make(Ex); + } + if (consumeIf('u')) { Node *Name = getDerived().parseSourceName(/*NameState=*/nullptr); if (!Name) return nullptr; @@ -5060,45 +4776,36 @@ Node *AbstractManglingParser::parseExpr() { // interpreted as node 'short' or 'ellipsis'. However, neither // __uuidof(short) nor __uuidof(...) can actually appear, so there is no // actual conflict here. 
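
A practical way to sanity-check any of these expression productions is to round-trip a mangled name through an existing demangler entry point. A minimal example using the portable abi::__cxa_demangle; the simple mangling _Z3fooi, i.e. foo(int), stands in for a more interesting symbol:

#include <cstdio>
#include <cstdlib>
#include <cxxabi.h>

int main() {
  int Status = 0;
  // Demangler allocates the output buffer when none is supplied.
  char *Demangled = abi::__cxa_demangle("_Z3fooi", nullptr, nullptr, &Status);
  if (Status == 0) {
    std::printf("%s\n", Demangled); // prints: foo(int)
    std::free(Demangled);
  }
  return Status;
}
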
+ bool IsUUID = false; + Node *UUID = nullptr; if (Name->getBaseName() == "__uuidof") { - if (numLeft() < 2) - return nullptr; - if (*First == 't') { - ++First; - Node *Ty = getDerived().parseType(); - if (!Ty) - return nullptr; - return make(Name, makeNodeArray(&Ty, &Ty + 1)); - } - if (*First == 'z') { - ++First; - Node *Ex = getDerived().parseExpr(); - if (!Ex) - return nullptr; - return make(Name, makeNodeArray(&Ex, &Ex + 1)); + if (consumeIf('t')) { + UUID = getDerived().parseType(); + IsUUID = true; + } else if (consumeIf('z')) { + UUID = getDerived().parseExpr(); + IsUUID = true; } } size_t ExprsBegin = Names.size(); - while (!consumeIf('E')) { - Node *E = getDerived().parseTemplateArg(); - if (E == nullptr) - return E; - Names.push_back(E); + if (IsUUID) { + if (UUID == nullptr) + return nullptr; + Names.push_back(UUID); + } else { + while (!consumeIf('E')) { + Node *E = getDerived().parseTemplateArg(); + if (E == nullptr) + return E; + Names.push_back(E); + } } - return make(Name, popTrailingNodeArray(ExprsBegin)); - } - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return getDerived().parseUnresolvedName(); + return make(Name, popTrailingNodeArray(ExprsBegin), + Node::Prec::Postfix); } - return nullptr; + + // Only unresolved names remain. + return getDerived().parseUnresolvedName(Global); } // ::= h _ @@ -5131,14 +4838,17 @@ bool AbstractManglingParser::parseCallOffset() { // # second call-offset is result adjustment // ::= T // # base is the nominal target function of thunk -// ::= GV # Guard variable for one-time initialization +// # Guard variable for one-time initialization +// ::= GV // # No // ::= TW # Thread-local wrapper // ::= TH # Thread-local initialization // ::= GR _ # First temporary // ::= GR _ # Subsequent temporaries -// extension ::= TC _ # construction vtable for second-in-first +// # construction vtable for second-in-first +// extension ::= TC _ // extension ::= GR # reference temporary for object +// extension ::= GI # module global initializer template Node *AbstractManglingParser::parseSpecialName() { switch (look()) { @@ -5265,6 +4975,16 @@ Node *AbstractManglingParser::parseSpecialName() { return nullptr; return make("reference temporary for ", Name); } + // GI v + case 'I': { + First += 2; + ModuleName *Module = nullptr; + if (getDerived().parseModuleNameOpt(Module)) + return nullptr; + if (Module == nullptr) + return nullptr; + return make("initializer for module ", Module); + } } } return nullptr; @@ -5379,7 +5099,7 @@ template <> struct FloatData { #if defined(__mips__) && defined(__mips_n64) || defined(__aarch64__) || \ - defined(__wasm__) + defined(__wasm__) || defined(__riscv) static const size_t mangled_size = 32; #elif defined(__arm__) || defined(__mips__) || defined(__hexagon__) static const size_t mangled_size = 16; @@ -5444,6 +5164,7 @@ bool AbstractManglingParser::parseSeqId(size_t *Out) { // ::= Si # ::std::basic_istream > // ::= So # ::std::basic_ostream > // ::= Sd # ::std::basic_iostream > +// The St case is handled specially in parseNestedName. template Node *AbstractManglingParser::parseSubstitution() { if (!consumeIf('S')) diff --git a/llvm/include/llvm/Demangle/ItaniumNodes.def b/llvm/include/llvm/Demangle/ItaniumNodes.def new file mode 100644 index 000000000000..c0e277d554cc --- /dev/null +++ b/llvm/include/llvm/Demangle/ItaniumNodes.def @@ -0,0 +1,95 @@ +//===--- ItaniumNodes.def ------------*- mode:c++;eval:(read-only-mode) -*-===// +// Do not edit! See README.txt. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Define the demangler's node names + +#ifndef NODE +#error Define NODE to handle nodes +#endif + +NODE(NodeArrayNode) +NODE(DotSuffix) +NODE(VendorExtQualType) +NODE(QualType) +NODE(ConversionOperatorType) +NODE(PostfixQualifiedType) +NODE(ElaboratedTypeSpefType) +NODE(NameType) +NODE(AbiTagAttr) +NODE(EnableIfAttr) +NODE(ObjCProtoName) +NODE(PointerType) +NODE(ReferenceType) +NODE(PointerToMemberType) +NODE(ArrayType) +NODE(FunctionType) +NODE(NoexceptSpec) +NODE(DynamicExceptionSpec) +NODE(FunctionEncoding) +NODE(LiteralOperator) +NODE(SpecialName) +NODE(CtorVtableSpecialName) +NODE(QualifiedName) +NODE(NestedName) +NODE(LocalName) +NODE(ModuleName) +NODE(ModuleEntity) +NODE(VectorType) +NODE(PixelVectorType) +NODE(BinaryFPType) +NODE(BitIntType) +NODE(SyntheticTemplateParamName) +NODE(TypeTemplateParamDecl) +NODE(NonTypeTemplateParamDecl) +NODE(TemplateTemplateParamDecl) +NODE(TemplateParamPackDecl) +NODE(ParameterPack) +NODE(TemplateArgumentPack) +NODE(ParameterPackExpansion) +NODE(TemplateArgs) +NODE(ForwardTemplateReference) +NODE(NameWithTemplateArgs) +NODE(GlobalQualifiedName) +NODE(ExpandedSpecialSubstitution) +NODE(SpecialSubstitution) +NODE(CtorDtorName) +NODE(DtorName) +NODE(UnnamedTypeName) +NODE(ClosureTypeName) +NODE(StructuredBindingName) +NODE(BinaryExpr) +NODE(ArraySubscriptExpr) +NODE(PostfixExpr) +NODE(ConditionalExpr) +NODE(MemberExpr) +NODE(SubobjectExpr) +NODE(EnclosingExpr) +NODE(CastExpr) +NODE(SizeofParamPackExpr) +NODE(CallExpr) +NODE(NewExpr) +NODE(DeleteExpr) +NODE(PrefixExpr) +NODE(FunctionParam) +NODE(ConversionExpr) +NODE(PointerToMemberConversionExpr) +NODE(InitListExpr) +NODE(FoldExpr) +NODE(ThrowExpr) +NODE(BoolExpr) +NODE(StringLiteral) +NODE(LambdaExpr) +NODE(EnumLiteral) +NODE(IntegerLiteral) +NODE(FloatLiteral) +NODE(DoubleLiteral) +NODE(LongDoubleLiteral) +NODE(BracedExpr) +NODE(BracedRangeExpr) + +#undef NODE diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h index 1cf7e8f1df45..ca7e44b948c7 100644 --- a/llvm/include/llvm/Demangle/Utility.h +++ b/llvm/include/llvm/Demangle/Utility.h @@ -33,43 +33,50 @@ class OutputBuffer { size_t CurrentPosition = 0; size_t BufferCapacity = 0; - // Ensure there is at least n more positions in buffer. + // Ensure there are at least N more positions in the buffer. void grow(size_t N) { - if (N + CurrentPosition >= BufferCapacity) { + size_t Need = N + CurrentPosition; + if (Need > BufferCapacity) { + // Reduce the number of reallocations, with a bit of hysteresis. The + // number here is chosen so the first allocation will more-than-likely not + // allocate more than 1K. + Need += 1024 - 32; BufferCapacity *= 2; - if (BufferCapacity < N + CurrentPosition) - BufferCapacity = N + CurrentPosition; + if (BufferCapacity < Need) + BufferCapacity = Need; Buffer = static_cast(std::realloc(Buffer, BufferCapacity)); if (Buffer == nullptr) std::terminate(); } } - void writeUnsigned(uint64_t N, bool isNeg = false) { - // Handle special case... - if (N == 0) { - *this << '0'; - return; - } - + OutputBuffer &writeUnsigned(uint64_t N, bool isNeg = false) { std::array Temp; char *TempPtr = Temp.data() + Temp.size(); - while (N) { + // Output at least one character. 
+ do { *--TempPtr = char('0' + N % 10); N /= 10; - } + } while (N); - // Add negative sign... + // Add negative sign. if (isNeg) *--TempPtr = '-'; - this->operator<<(StringView(TempPtr, Temp.data() + Temp.size())); + + return operator+=(StringView(TempPtr, Temp.data() + Temp.size())); } public: OutputBuffer(char *StartBuf, size_t Size) : Buffer(StartBuf), CurrentPosition(0), BufferCapacity(Size) {} OutputBuffer() = default; + // Non-copyable + OutputBuffer(const OutputBuffer &) = delete; + OutputBuffer &operator=(const OutputBuffer &) = delete; + + operator StringView() const { return StringView(Buffer, CurrentPosition); } + void reset(char *Buffer_, size_t BufferCapacity_) { CurrentPosition = 0; Buffer = Buffer_; @@ -81,13 +88,27 @@ public: unsigned CurrentPackIndex = std::numeric_limits::max(); unsigned CurrentPackMax = std::numeric_limits::max(); + /// When zero, we're printing template args and '>' needs to be parenthesized. + /// Use a counter so we can simply increment inside parentheses. + unsigned GtIsGt = 1; + + bool isGtInsideTemplateArgs() const { return GtIsGt == 0; } + + void printOpen(char Open = '(') { + GtIsGt++; + *this += Open; + } + void printClose(char Close = ')') { + GtIsGt--; + *this += Close; + } + OutputBuffer &operator+=(StringView R) { - size_t Size = R.size(); - if (Size == 0) - return *this; - grow(Size); - std::memmove(Buffer + CurrentPosition, R.begin(), Size); - CurrentPosition += Size; + if (size_t Size = R.size()) { + grow(Size); + std::memcpy(Buffer + CurrentPosition, R.begin(), Size); + CurrentPosition += Size; + } return *this; } @@ -97,9 +118,7 @@ public: return *this; } - OutputBuffer &operator<<(StringView R) { return (*this += R); } - - OutputBuffer prepend(StringView R) { + OutputBuffer &prepend(StringView R) { size_t Size = R.size(); grow(Size); @@ -110,19 +129,16 @@ public: return *this; } + OutputBuffer &operator<<(StringView R) { return (*this += R); } + OutputBuffer &operator<<(char C) { return (*this += C); } OutputBuffer &operator<<(long long N) { - if (N < 0) - writeUnsigned(static_cast(-N), true); - else - writeUnsigned(static_cast(N)); - return *this; + return writeUnsigned(static_cast(std::abs(N)), N < 0); } OutputBuffer &operator<<(unsigned long long N) { - writeUnsigned(N, false); - return *this; + return writeUnsigned(N, false); } OutputBuffer &operator<<(long N) { @@ -155,7 +171,8 @@ public: void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; } char back() const { - return CurrentPosition ? 
Buffer[CurrentPosition - 1] : '\0'; + assert(CurrentPosition); + return Buffer[CurrentPosition - 1]; } bool empty() const { return CurrentPosition == 0; } @@ -165,35 +182,20 @@ public: size_t getBufferCapacity() const { return BufferCapacity; } }; -template class SwapAndRestore { - T &Restore; - T OriginalValue; - bool ShouldRestore = true; +template class ScopedOverride { + T &Loc; + T Original; public: - SwapAndRestore(T &Restore_) : SwapAndRestore(Restore_, Restore_) {} - - SwapAndRestore(T &Restore_, T NewVal) - : Restore(Restore_), OriginalValue(Restore) { - Restore = std::move(NewVal); - } - ~SwapAndRestore() { - if (ShouldRestore) - Restore = std::move(OriginalValue); - } - - void shouldRestore(bool ShouldRestore_) { ShouldRestore = ShouldRestore_; } - - void restoreNow(bool Force) { - if (!Force && !ShouldRestore) - return; + ScopedOverride(T &Loc_) : ScopedOverride(Loc_, Loc_) {} - Restore = std::move(OriginalValue); - ShouldRestore = false; + ScopedOverride(T &Loc_, T NewVal) : Loc(Loc_), Original(Loc_) { + Loc_ = std::move(NewVal); } + ~ScopedOverride() { Loc = std::move(Original); } - SwapAndRestore(const SwapAndRestore &) = delete; - SwapAndRestore &operator=(const SwapAndRestore &) = delete; + ScopedOverride(const ScopedOverride &) = delete; + ScopedOverride &operator=(const ScopedOverride &) = delete; }; inline bool initializeOutputBuffer(char *Buf, size_t *N, OutputBuffer &OB, diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h b/llvm/include/llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h new file mode 100644 index 000000000000..d748d4b0fa59 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h @@ -0,0 +1,35 @@ +//===--------- DWARFRecordSectionSplitter.h - JITLink -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_DWARFRECORDSECTIONSPLITTER_H +#define LLVM_EXECUTIONENGINE_JITLINK_DWARFRECORDSECTIONSPLITTER_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// A LinkGraph pass that splits blocks in a section that follows the DWARF +/// Record format into sub-blocks where each header gets its own block. +/// When splitting EHFrames, DWARFRecordSectionSplitter should not be run +/// without EHFrameEdgeFixer, which is responsible for adding FDE-to-CIE edges. +class DWARFRecordSectionSplitter { +public: + DWARFRecordSectionSplitter(StringRef SectionName); + Error operator()(LinkGraph &G); + +private: + Error processBlock(LinkGraph &G, Block &B, LinkGraph::SplitBlockCache &Cache); + + StringRef SectionName; +}; + +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_DWARFRECORDSECTIONSPLITTER_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 25f1349f15f2..897808c0ee83 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -223,6 +223,11 @@ public: /// Returns the size of this defined addressable. size_t getSize() const { return Size; } + /// Returns the address range of this defined addressable. 
+ orc::ExecutorAddrRange getRange() const { + return orc::ExecutorAddrRange(getAddress(), getSize()); + } + /// Get the content for this block. Block must not be a zero-fill block. ArrayRef getContent() const { assert(Data && "Block does not contain content"); @@ -576,6 +581,11 @@ public: this->Size = Size; } + /// Returns the address range of this symbol. + orc::ExecutorAddrRange getRange() const { + return orc::ExecutorAddrRange(getAddress(), getSize()); + } + /// Returns true if this symbol is backed by a zero-fill block. /// This method may only be called on defined symbols. bool isSymbolZeroFill() const { return getBlock().isZeroFill(); } @@ -1215,8 +1225,11 @@ public: /// Make the given symbol an absolute with the given address (must not already /// be absolute). /// - /// Symbol size, linkage, scope, and callability, and liveness will be left - /// unchanged. Symbol offset will be reset to 0. + /// The symbol's size, linkage, and callability, and liveness will be left + /// unchanged, and its offset will be reset to 0. + /// + /// If the symbol was external then its scope will be set to local, otherwise + /// it will be left unchanged. void makeAbsolute(Symbol &Sym, orc::ExecutorAddr Address) { assert(!Sym.isAbsolute() && "Symbol is already absolute"); if (Sym.isExternal()) { @@ -1225,6 +1238,7 @@ public: assert(Sym.getOffset() == 0 && "External is not at offset 0"); ExternalSymbols.erase(&Sym); Sym.getAddressable().setAbsolute(true); + Sym.setScope(Scope::Local); } else { assert(Sym.isDefined() && "Sym is not a defined symbol"); Section &Sec = Sym.getBlock().getSection(); @@ -1733,6 +1747,9 @@ Error markAllSymbolsLive(LinkGraph &G); Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B, const Edge &E); +Error makeAlignmentError(llvm::orc::ExecutorAddr Loc, uint64_t Value, int N, + const Edge &E); + /// Base case for edge-visitors where the visitor-list is empty. inline void visitEdge(LinkGraph &G, Block *B, Edge &E) {} diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h index aee14c0d1fe5..6f2ff012697d 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h @@ -18,30 +18,6 @@ namespace llvm { namespace jitlink { -namespace MachO_arm64_Edges { - -enum MachOARM64RelocationKind : Edge::Kind { - Branch26 = Edge::FirstRelocation, - Pointer32, - Pointer64, - Pointer64Anon, - Page21, - PageOffset12, - GOTPage21, - GOTPageOffset12, - TLVPage21, - TLVPageOffset12, - PointerToGOT, - PairedAddend, - LDRLiteral19, - Delta32, - Delta64, - NegDelta32, - NegDelta64, -}; - -} // namespace MachO_arm64_Edges - /// Create a LinkGraph from a MachO/arm64 relocatable object. /// /// Note: The graph does not take ownership of the underlying buffer, nor copy @@ -62,9 +38,6 @@ createLinkGraphFromMachOObject_arm64(MemoryBufferRef ObjectBuffer); void link_MachO_arm64(std::unique_ptr G, std::unique_ptr Ctx); -/// Return the string name of the given MachO arm64 edge kind. 
-const char *getMachOARM64RelocationKindName(Edge::Kind R);
-
 } // end namespace jitlink
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h
index e9771319ef06..a18098e5a1a9 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h
@@ -152,13 +152,9 @@ public:
   using iterator = typename VectorTy::iterator;
 
   AllocGroupSmallMap() = default;
-  AllocGroupSmallMap(std::initializer_list<std::pair<AllocGroup, T>> Inits) {
-    Elems.reserve(Inits.size());
-    for (const auto &E : Inits)
-      Elems.push_back(E);
-    llvm::sort(Elems, [](const ElemT &LHS, const ElemT &RHS) {
-      return LHS.first < RHS.first;
-    });
+  AllocGroupSmallMap(std::initializer_list<std::pair<AllocGroup, T>> Inits)
+      : Elems(Inits) {
+    llvm::sort(Elems, llvm::less_first());
   }
 
   iterator begin() { return Elems.begin(); }
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
index 994ce783b058..53ff6c7a219e 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
@@ -13,24 +13,353 @@
 #ifndef LLVM_EXECUTIONENGINE_JITLINK_AARCH64_H
 #define LLVM_EXECUTIONENGINE_JITLINK_AARCH64_H
 
+#include "TableManager.h"
 #include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/JITLink/MemoryFlags.h"
 
 namespace llvm {
 namespace jitlink {
 namespace aarch64 {
 
-/// Represets aarch64 fixups
 enum EdgeKind_aarch64 : Edge::Kind {
-
-  /// Set a CALL immediate field to bits [27:2] of X = Target - Fixup + Addend
-  R_AARCH64_CALL26 = Edge::FirstRelocation,
-
+  Branch26 = Edge::FirstRelocation,
+  Pointer32,
+  Pointer64,
+  Pointer64Anon,
+  Page21,
+  PageOffset12,
+  MoveWide16,
+  GOTPage21,
+  GOTPageOffset12,
+  TLVPage21,
+  TLVPageOffset12,
+  PointerToGOT,
+  PairedAddend,
+  LDRLiteral19,
+  Delta32,
+  Delta64,
+  NegDelta32,
+  NegDelta64,
 };
 
 /// Returns a string name for the given aarch64 edge. For debugging purposes
 /// only
 const char *getEdgeKindName(Edge::Kind K);
 
+// Returns whether the Instr is LD/ST (imm12)
+inline bool isLoadStoreImm12(uint32_t Instr) {
+  constexpr uint32_t LoadStoreImm12Mask = 0x3b000000;
+  return (Instr & LoadStoreImm12Mask) == 0x39000000;
+}
+
+// Returns the amount by which the address operand of an LD/ST (imm12)
+// instruction should be shifted right.
+//
+// The shift value varies with the data size of the LD/ST instruction.
+// For instance, an LDH instruction needs the address to be shifted
+// right by 1.
+inline unsigned getPageOffset12Shift(uint32_t Instr) {
+  constexpr uint32_t Vec128Mask = 0x04800000;
+
+  if (isLoadStoreImm12(Instr)) {
+    uint32_t ImplicitShift = Instr >> 30;
+    if (ImplicitShift == 0)
+      if ((Instr & Vec128Mask) == Vec128Mask)
+        ImplicitShift = 4;
+
+    return ImplicitShift;
+  }
+
+  return 0;
+}
+
+// Returns whether the Instr is MOVK/MOVZ (imm16) with a zero immediate field
+inline bool isMoveWideImm16(uint32_t Instr) {
+  constexpr uint32_t MoveWideImm16Mask = 0x5f9fffe0;
+  return (Instr & MoveWideImm16Mask) == 0x52800000;
+}
+
+// Returns the amount by which the address operand of a MOVK/MOVZ (imm16)
+// instruction should be shifted right.
+//
+// The shift value is specified in the assembly as LSL #<shift>.
+inline unsigned getMoveWide16Shift(uint32_t Instr) {
+  if (isMoveWideImm16(Instr)) {
+    uint32_t ImplicitShift = (Instr >> 21) & 0b11;
+    return ImplicitShift << 4;
+  }
+
+  return 0;
+}
+
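To make the implicit-shift rules concrete, here is a small self-contained check. The helper bodies are copied from the header for illustration, and the two instruction words are the standard encodings of ldr x0, [x1] and ldrh w0, [x1]:

#include <cassert>
#include <cstdint>

inline bool isLoadStoreImm12(uint32_t Instr) {
  constexpr uint32_t LoadStoreImm12Mask = 0x3b000000;
  return (Instr & LoadStoreImm12Mask) == 0x39000000;
}

inline unsigned getPageOffset12Shift(uint32_t Instr) {
  constexpr uint32_t Vec128Mask = 0x04800000;
  if (isLoadStoreImm12(Instr)) {
    uint32_t ImplicitShift = Instr >> 30; // size field, bits [31:30]
    if (ImplicitShift == 0 && (Instr & Vec128Mask) == Vec128Mask)
      ImplicitShift = 4; // 128-bit vector access
    return ImplicitShift;
  }
  return 0;
}

int main() {
  // ldr x0, [x1] == 0xf9400020: size is 0b11, so the imm12 offset counts
  // 8-byte units and the address operand is shifted right by 3.
  assert(getPageOffset12Shift(0xf9400020) == 3);
  // ldrh w0, [x1] == 0x79400020: 16-bit access, shift by 1.
  assert(getPageOffset12Shift(0x79400020) == 1);
  return 0;
}

+/// Apply fixup expression for edge to block content.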
+inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) { + using namespace support; + + char *BlockWorkingMem = B.getAlreadyMutableContent().data(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); + + switch (E.getKind()) { + case Branch26: { + assert((FixupAddress.getValue() & 0x3) == 0 && + "Branch-inst is not 32-bit aligned"); + + int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + + if (static_cast(Value) & 0x3) + return make_error("Branch26 target is not 32-bit " + "aligned"); + + if (Value < -(1 << 27) || Value > ((1 << 27) - 1)) + return makeTargetOutOfRangeError(G, B, E); + + uint32_t RawInstr = *(little32_t *)FixupPtr; + assert((RawInstr & 0x7fffffff) == 0x14000000 && + "RawInstr isn't a B or BR immediate instruction"); + uint32_t Imm = (static_cast(Value) & ((1 << 28) - 1)) >> 2; + uint32_t FixedInstr = RawInstr | Imm; + *(little32_t *)FixupPtr = FixedInstr; + break; + } + case Pointer32: { + uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); + if (Value > std::numeric_limits::max()) + return makeTargetOutOfRangeError(G, B, E); + *(ulittle32_t *)FixupPtr = Value; + break; + } + case Pointer64: + case Pointer64Anon: { + uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); + *(ulittle64_t *)FixupPtr = Value; + break; + } + case Page21: { + assert((E.getKind() != GOTPage21 || E.getAddend() == 0) && + "GOTPAGE21 with non-zero addend"); + uint64_t TargetPage = + (E.getTarget().getAddress().getValue() + E.getAddend()) & + ~static_cast(4096 - 1); + uint64_t PCPage = + FixupAddress.getValue() & ~static_cast(4096 - 1); + + int64_t PageDelta = TargetPage - PCPage; + if (!isInt<33>(PageDelta)) + return makeTargetOutOfRangeError(G, B, E); + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert((RawInstr & 0xffffffe0) == 0x90000000 && + "RawInstr isn't an ADRP instruction"); + uint32_t ImmLo = (static_cast(PageDelta) >> 12) & 0x3; + uint32_t ImmHi = (static_cast(PageDelta) >> 14) & 0x7ffff; + uint32_t FixedInstr = RawInstr | (ImmLo << 29) | (ImmHi << 5); + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case PageOffset12: { + uint64_t TargetOffset = + (E.getTarget().getAddress() + E.getAddend()).getValue() & 0xfff; + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + unsigned ImmShift = getPageOffset12Shift(RawInstr); + + if (TargetOffset & ((1 << ImmShift) - 1)) + return make_error("PAGEOFF12 target is not aligned"); + + uint32_t EncodedImm = (TargetOffset >> ImmShift) << 10; + uint32_t FixedInstr = RawInstr | EncodedImm; + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case MoveWide16: { + uint64_t TargetOffset = + (E.getTarget().getAddress() + E.getAddend()).getValue(); + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert(isMoveWideImm16(RawInstr) && + "RawInstr isn't a MOVK/MOVZ instruction"); + + unsigned ImmShift = getMoveWide16Shift(RawInstr); + uint32_t Imm = (TargetOffset >> ImmShift) & 0xffff; + uint32_t FixedInstr = RawInstr | (Imm << 5); + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case LDRLiteral19: { + assert((FixupAddress.getValue() & 0x3) == 0 && "LDR is not 32-bit aligned"); + assert(E.getAddend() == 0 && "LDRLiteral19 with non-zero addend"); + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert(RawInstr == 0x58000010 && "RawInstr isn't a 64-bit LDR literal"); + int64_t Delta = E.getTarget().getAddress() - FixupAddress; + if (Delta & 0x3) + return make_error("LDR literal target is not 32-bit " + 
"aligned"); + if (Delta < -(1 << 20) || Delta > ((1 << 20) - 1)) + return makeTargetOutOfRangeError(G, B, E); + + uint32_t EncodedImm = ((static_cast(Delta) >> 2) & 0x7ffff) << 5; + uint32_t FixedInstr = RawInstr | EncodedImm; + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case Delta32: + case Delta64: + case NegDelta32: + case NegDelta64: { + int64_t Value; + if (E.getKind() == Delta32 || E.getKind() == Delta64) + Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + else + Value = FixupAddress - E.getTarget().getAddress() + E.getAddend(); + + if (E.getKind() == Delta32 || E.getKind() == NegDelta32) { + if (Value < std::numeric_limits::min() || + Value > std::numeric_limits::max()) + return makeTargetOutOfRangeError(G, B, E); + *(little32_t *)FixupPtr = Value; + } else + *(little64_t *)FixupPtr = Value; + break; + } + case TLVPage21: + case GOTPage21: + case TLVPageOffset12: + case GOTPageOffset12: + case PointerToGOT: { + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + "GOT/TLV edge kinds not lowered: " + getEdgeKindName(E.getKind())); + } + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + "unsupported edge kind" + getEdgeKindName(E.getKind())); + } + + return Error::success(); +} + +/// AArch64 null pointer content. +extern const uint8_t NullGOTEntryContent[8]; + +/// AArch64 PLT stub content. +extern const uint8_t StubContent[8]; + +/// Global Offset Table Builder. +class GOTTableManager : public TableManager { +public: + static StringRef getSectionName() { return "$__GOT"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + Edge::Kind KindToSet = Edge::Invalid; + const char *BlockWorkingMem = B->getContent().data(); + const char *FixupPtr = BlockWorkingMem + E.getOffset(); + + switch (E.getKind()) { + case aarch64::GOTPage21: + case aarch64::TLVPage21: { + KindToSet = aarch64::Page21; + break; + } + case aarch64::GOTPageOffset12: + case aarch64::TLVPageOffset12: { + KindToSet = aarch64::PageOffset12; + uint32_t RawInstr = *(const support::ulittle32_t *)FixupPtr; + (void)RawInstr; + assert(E.getAddend() == 0 && + "GOTPageOffset12/TLVPageOffset12 with non-zero addend"); + assert((RawInstr & 0xfffffc00) == 0xf9400000 && + "RawInstr isn't a 64-bit LDR immediate"); + break; + } + case aarch64::PointerToGOT: { + KindToSet = aarch64::Delta64; + break; + } + default: + return false; + } + assert(KindToSet != Edge::Invalid && + "Fell through switch, but no new kind to set"); + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + auto &GOTEntryBlock = G.createContentBlock( + getGOTSection(G), getGOTEntryBlockContent(), orc::ExecutorAddr(), 8, 0); + GOTEntryBlock.addEdge(aarch64::Pointer64, 0, Target, 0); + return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); + } + +private: + Section &getGOTSection(LinkGraph &G) { + if (!GOTSection) + GOTSection = + &G.createSection(getSectionName(), MemProt::Read | MemProt::Exec); + return *GOTSection; + } + + ArrayRef getGOTEntryBlockContent() { + return {reinterpret_cast(NullGOTEntryContent), + sizeof(NullGOTEntryContent)}; + } + + Section *GOTSection = nullptr; +}; + +/// Procedure Linkage Table Builder. 
+class PLTTableManager : public TableManager { +public: + PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} + + static StringRef getSectionName() { return "$__STUBS"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getKind() == aarch64::Branch26 && !E.getTarget().isDefined()) { + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + return false; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + auto &StubContentBlock = G.createContentBlock( + getStubsSection(G), getStubBlockContent(), orc::ExecutorAddr(), 1, 0); + // Re-use GOT entries for stub targets. + auto &GOTEntrySymbol = GOT.getEntryForTarget(G, Target); + StubContentBlock.addEdge(aarch64::LDRLiteral19, 0, GOTEntrySymbol, 0); + return G.addAnonymousSymbol(StubContentBlock, 0, 8, true, false); + } + +public: + Section &getStubsSection(LinkGraph &G) { + if (!StubsSection) + StubsSection = + &G.createSection(getSectionName(), MemProt::Read | MemProt::Exec); + return *StubsSection; + } + + ArrayRef getStubBlockContent() { + return {reinterpret_cast(StubContent), sizeof(StubContent)}; + } + + GOTTableManager &GOT; + Section *StubsSection = nullptr; +}; + } // namespace aarch64 } // namespace jitlink } // namespace llvm diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h index 5abd4cf11dea..95f45fae91e4 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h @@ -37,13 +37,20 @@ enum EdgeKind_riscv : Edge::Kind { /// R_RISCV_64, - /// Low 12 bits of PC-relative branch pointer value relocation + /// PC-relative branch pointer value relocation /// /// Fixup expression: - /// Fixup <- (Target - Fixup + Addend) & 0xFFF + /// Fixup <- (Target - Fixup + Addend) /// R_RISCV_BRANCH, + /// High 20 bits of PC-relative jump pointer value relocation + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend + /// + R_RISCV_JAL, + /// High 20 bits of 32-bit pointer value relocation /// /// Fixup expression @@ -145,6 +152,12 @@ enum EdgeKind_riscv : Edge::Kind { /// Fixup <- (Target - *{1}Fixup - Addend) R_RISCV_SUB8, + /// 6 bits label subtraction + /// + /// Fixup expression + /// Fixup <- (Target - *{1}Fixup - Addend) + R_RISCV_SUB6, + /// Local label assignment /// /// Fixup expression: diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 4a4e8d15be66..9a2bc9b09350 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -447,11 +447,10 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, break; } - default: { - // If you hit this you should check that *constructor and other non-fixup - // edges have been removed prior to applying fixups. 
-    llvm_unreachable("Graph contains edge kind with no fixup expression");
-  }
+  default:
+    return make_error<JITLinkError>(
+        "In graph " + G.getName() + ", section " + B.getSection().getName() +
+        ": unsupported edge kind: " + getEdgeKindName(E.getKind()));
   }
 
   return Error::success();
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index c4647148f287..df2826b50784 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -339,11 +339,7 @@ public:
   /// Sort the lookup set by pointer value. This sort is fast but sensitive to
   /// allocation order and so should not be used where a consistent order is
   /// required.
-  void sortByAddress() {
-    llvm::sort(Symbols, [](const value_type &LHS, const value_type &RHS) {
-      return LHS.first < RHS.first;
-    });
-  }
+  void sortByAddress() { llvm::sort(Symbols, llvm::less_first()); }
 
   /// Sort the lookup set lexicographically. This sort is slow but the order
   /// is unaffected by allocation order.
@@ -420,12 +416,15 @@ class FailedToMaterialize : public ErrorInfo<FailedToMaterialize> {
 public:
   static char ID;
 
-  FailedToMaterialize(std::shared_ptr<SymbolDependenceMap> Symbols);
+  FailedToMaterialize(std::shared_ptr<SymbolStringPool> SSP,
+                      std::shared_ptr<SymbolDependenceMap> Symbols);
+  ~FailedToMaterialize();
   std::error_code convertToErrorCode() const override;
   void log(raw_ostream &OS) const override;
   const SymbolDependenceMap &getSymbols() const { return *Symbols; }
 
 private:
+  std::shared_ptr<SymbolStringPool> SSP;
   std::shared_ptr<SymbolDependenceMap> Symbols;
 };
 
@@ -1331,7 +1330,7 @@ public:
   lookupInitSymbols(ExecutionSession &ES,
                     const DenseMap<JITDylib *, SymbolLookupSet> &InitSyms);
 
-  /// Performs an async lookup for the the given symbols in each of the given
+  /// Performs an async lookup for the given symbols in each of the given
   /// JITDylibs, calling the given handler once all lookups have completed.
   static void lookupInitSymbolsAsync(unique_function<void(Error)> OnComplete,
@@ -1389,8 +1388,12 @@ public:
   /// object.
   ExecutionSession(std::unique_ptr<ExecutorProcessControl> EPC);
 
+  /// Destroy an ExecutionSession. Verifies that endSession was called prior to
+  /// destruction.
+  ~ExecutionSession();
+
   /// End the session. Closes all JITDylibs and disconnects from the
-  /// executor.
+  /// executor. Clients must call this method before destroying the session.
   Error endSession();
 
   /// Get the ExecutorProcessControl object associated with this
@@ -1523,7 +1526,7 @@ public:
   /// after resolution, the function will return a success value, but the
   /// error will be reported via reportErrors.
   Expected<SymbolMap> lookup(const JITDylibSearchOrder &SearchOrder,
-                             const SymbolLookupSet &Symbols,
+                             SymbolLookupSet Symbols,
                              LookupKind K = LookupKind::Static,
                              SymbolState RequiredState = SymbolState::Ready,
                              RegisterDependenciesFunction RegisterDependencies =
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
index 7eb98dfc741e..c4ef06f1fbc6 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
@@ -92,6 +92,9 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S);
 /// Render a LookupKind.
 raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K);
 
+/// Dump a SymbolStringPool. Useful for debugging dangling-pointer crashes.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPool &SSP);
+
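A minimal usage sketch of the session-lifetime contract introduced above (editorial, not part of the patch; assumes an existing std::unique_ptr<ExecutorProcessControl> named EPC):

    ExecutionSession ES(std::move(EPC));
    // ... create JITDylibs, add programs, run JIT'd code ...
    if (auto Err = ES.endSession()) // required before ~ExecutionSession() runs
      ES.reportError(std::move(Err));

 /// A function object that can be used as an ObjectTransformLayer transform
 /// to dump object files to disk at a specified path.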
class DumpObjects { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h index 6b12fe990a8a..3804b6dda91f 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h @@ -109,7 +109,8 @@ public: /// Returns an AliasMap containing the default aliases for the ELFNixPlatform. /// This can be modified by clients when constructing the platform to add /// or remove aliases. - static SymbolAliasMap standardPlatformAliases(ExecutionSession &ES); + static Expected standardPlatformAliases(ExecutionSession &ES, + JITDylib &PlatformJD); /// Returns the array of required CXX aliases. static ArrayRef> requiredCXXAliases(); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h index ac7051b5b75c..241453320ad5 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h @@ -23,8 +23,6 @@ #include #include -using namespace llvm::orc::shared; - namespace llvm { namespace orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h index 92de5882bafe..354984b540a9 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h @@ -148,7 +148,7 @@ private: std::mutex EPCUIMutex; ExecutorProcessControl &EPC; std::unique_ptr ABI; - JITTargetAddress ResolverBlockAddr; + JITTargetAddress ResolverBlockAddr = 0; FinalizedAlloc ResolverBlock; std::unique_ptr TP; std::unique_ptr LCTM; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h index 2cc8c29b2813..e6a63707653a 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h @@ -125,7 +125,7 @@ public: /// Set TargetOptions. /// /// Note: This operation will overwrite any previously configured options, - /// including EmulatedTLS and ExplicitEmulatedTLS which + /// including EmulatedTLS, ExplicitEmulatedTLS, and UseInitArray which /// the JITTargetMachineBuilder sets by default. Clients are responsible /// for re-enabling these overwritten options. JITTargetMachineBuilder &setOptions(TargetOptions Options) { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index d76e6a21a9bb..d67a7f2bfeb2 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -56,7 +56,7 @@ public: /// Destruct this instance. If a multi-threaded instance, waits for all /// compile threads to complete. - ~LLJIT(); + virtual ~LLJIT(); /// Returns the ExecutionSession for this instance. ExecutionSession &getExecutionSession() { return *ES; } @@ -110,30 +110,30 @@ public: /// Look up a symbol in JITDylib JD by the symbol's linker-mangled name (to /// look up symbols based on their IR name use the lookup function instead). - Expected lookupLinkerMangled(JITDylib &JD, - SymbolStringPtr Name); + Expected lookupLinkerMangled(JITDylib &JD, + SymbolStringPtr Name); /// Look up a symbol in JITDylib JD by the symbol's linker-mangled name (to /// look up symbols based on their IR name use the lookup function instead). 
- Expected lookupLinkerMangled(JITDylib &JD, - StringRef Name) { + Expected lookupLinkerMangled(JITDylib &JD, + StringRef Name) { return lookupLinkerMangled(JD, ES->intern(Name)); } /// Look up a symbol in the main JITDylib by the symbol's linker-mangled name /// (to look up symbols based on their IR name use the lookup function /// instead). - Expected lookupLinkerMangled(StringRef Name) { + Expected lookupLinkerMangled(StringRef Name) { return lookupLinkerMangled(*Main, Name); } /// Look up a symbol in JITDylib JD based on its IR symbol name. - Expected lookup(JITDylib &JD, StringRef UnmangledName) { + Expected lookup(JITDylib &JD, StringRef UnmangledName) { return lookupLinkerMangled(JD, mangle(UnmangledName)); } /// Look up a symbol in the main JITDylib based on its IR symbol name. - Expected lookup(StringRef UnmangledName) { + Expected lookup(StringRef UnmangledName) { return lookup(*Main, UnmangledName); } @@ -401,7 +401,7 @@ public: std::function()>; Triple TT; - JITTargetAddress LazyCompileFailureAddr = 0; + ExecutorAddr LazyCompileFailureAddr; std::unique_ptr LCTMgr; IndirectStubsManagerBuilderFunction ISMBuilder; @@ -415,7 +415,7 @@ public: /// Set the address in the target address to call if a lazy compile fails. /// /// If this method is not called then the value will default to 0. - SetterImpl &setLazyCompileFailureAddr(JITTargetAddress Addr) { + SetterImpl &setLazyCompileFailureAddr(ExecutorAddr Addr) { this->impl().LazyCompileFailureAddr = Addr; return this->impl(); } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index 01f3f1b2ab63..141dd73548c8 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -26,30 +26,19 @@ namespace llvm { namespace orc { -struct MachOJITDylibInitializers { - using SectionList = std::vector; - - MachOJITDylibInitializers(std::string Name, ExecutorAddr MachOHeaderAddress) - : Name(std::move(Name)), - MachOHeaderAddress(std::move(MachOHeaderAddress)) {} - - std::string Name; - ExecutorAddr MachOHeaderAddress; - ExecutorAddr ObjCImageInfoAddress; - - StringMap InitSections; -}; - -class MachOJITDylibDeinitializers {}; - -using MachOJITDylibInitializerSequence = std::vector; - -using MachOJITDylibDeinitializerSequence = - std::vector; - /// Mediates between MachO initialization and ExecutionSession state. class MachOPlatform : public Platform { public: + // Used internally by MachOPlatform, but made public to enable serialization. + struct MachOJITDylibDepInfo { + bool Sealed = false; + std::vector DepHeaders; + }; + + // Used internally by MachOPlatform, but made public to enable serialization. + using MachOJITDylibDepInfoMap = + std::vector>; + /// Try to create a MachOPlatform instance, adding the ORC runtime to the /// given JITDylib. /// @@ -161,26 +150,28 @@ private: Error processObjCImageInfo(jitlink::LinkGraph &G, MaterializationResponsibility &MR); - Error registerInitSections(jitlink::LinkGraph &G, JITDylib &JD); - Error fixTLVSectionsAndEdges(jitlink::LinkGraph &G, JITDylib &JD); - Error registerEHAndTLVSections(jitlink::LinkGraph &G); + Error registerObjectPlatformSections(jitlink::LinkGraph &G, JITDylib &JD); Error registerEHSectionsPhase1(jitlink::LinkGraph &G); std::mutex PluginMutex; MachOPlatform &MP; + + // FIXME: ObjCImageInfos and HeaderAddrs need to be cleared when + // JITDylibs are removed. 
DenseMap> ObjCImageInfos; + DenseMap HeaderAddrs; InitSymbolDepMap InitSymbolDeps; }; - using SendInitializerSequenceFn = - unique_function)>; - - using SendDeinitializerSequenceFn = - unique_function)>; - + using GetJITDylibHeaderSendResultFn = + unique_function)>; + using GetJITDylibNameSendResultFn = + unique_function)>; + using PushInitializersSendResultFn = + unique_function)>; using SendSymbolAddressFn = unique_function)>; static bool supportedTarget(const Triple &TT); @@ -193,28 +184,24 @@ private: // Associate MachOPlatform JIT-side runtime support functions with handlers. Error associateRuntimeSupportFunctions(JITDylib &PlatformJD); - void getInitializersBuildSequencePhase(SendInitializerSequenceFn SendResult, - JITDylib &JD, - std::vector DFSLinkOrder); + // Implements rt_pushInitializers by making repeat async lookups for + // initializer symbols (each lookup may spawn more initializer symbols if + // it pulls in new materializers, e.g. from objects in a static library). + void pushInitializersLoop(PushInitializersSendResultFn SendResult, + JITDylibSP JD); - void getInitializersLookupPhase(SendInitializerSequenceFn SendResult, - JITDylib &JD); - - void rt_getInitializers(SendInitializerSequenceFn SendResult, - StringRef JDName); - - void rt_getDeinitializers(SendDeinitializerSequenceFn SendResult, - ExecutorAddr Handle); + // Handle requests from the ORC runtime to push MachO initializer info. + void rt_pushInitializers(PushInitializersSendResultFn SendResult, + ExecutorAddr JDHeaderAddr); + // Handle requests for symbol addresses from the ORC runtime. void rt_lookupSymbol(SendSymbolAddressFn SendResult, ExecutorAddr Handle, StringRef SymbolName); // Records the addresses of runtime symbols used by the platform. Error bootstrapMachORuntime(JITDylib &PlatformJD); - Error registerInitInfo(JITDylib &JD, ExecutorAddr ObjCImageInfoAddr, - ArrayRef InitSections); - + // Call the ORC runtime to create a pthread key. Expected createPThreadKey(); enum PlatformState { BootstrapPhase1, BootstrapPhase2, Initialized }; @@ -229,81 +216,24 @@ private: ExecutorAddr orc_rt_macho_platform_shutdown; ExecutorAddr orc_rt_macho_register_ehframe_section; ExecutorAddr orc_rt_macho_deregister_ehframe_section; - ExecutorAddr orc_rt_macho_register_thread_data_section; - ExecutorAddr orc_rt_macho_deregister_thread_data_section; + ExecutorAddr orc_rt_macho_register_jitdylib; + ExecutorAddr orc_rt_macho_deregister_jitdylib; + ExecutorAddr orc_rt_macho_register_object_platform_sections; + ExecutorAddr orc_rt_macho_deregister_object_platform_sections; ExecutorAddr orc_rt_macho_create_pthread_key; DenseMap RegisteredInitSymbols; - // InitSeqs gets its own mutex to avoid locking the whole session when - // aggregating data from the jitlink. std::mutex PlatformMutex; - DenseMap InitSeqs; - + DenseMap JITDylibToHeaderAddr; DenseMap HeaderAddrToJITDylib; DenseMap JITDylibToPThreadKey; }; namespace shared { -using SPSNamedExecutorAddrRangeSequenceMap = - SPSSequence>; - -using SPSMachOJITDylibInitializers = - SPSTuple; - -using SPSMachOJITDylibInitializerSequence = - SPSSequence; - -/// Serialization traits for MachOJITDylibInitializers. 
-template <>
-class SPSSerializationTraits<SPSMachOJITDylibInitializers,
-                             MachOJITDylibInitializers> {
-public:
-  static size_t size(const MachOJITDylibInitializers &MOJDIs) {
-    return SPSMachOJITDylibInitializers::AsArgList::size(
-        MOJDIs.Name, MOJDIs.MachOHeaderAddress, MOJDIs.ObjCImageInfoAddress,
-        MOJDIs.InitSections);
-  }
-
-  static bool serialize(SPSOutputBuffer &OB,
-                        const MachOJITDylibInitializers &MOJDIs) {
-    return SPSMachOJITDylibInitializers::AsArgList::serialize(
-        OB, MOJDIs.Name, MOJDIs.MachOHeaderAddress, MOJDIs.ObjCImageInfoAddress,
-        MOJDIs.InitSections);
-  }
-
-  static bool deserialize(SPSInputBuffer &IB,
-                          MachOJITDylibInitializers &MOJDIs) {
-    return SPSMachOJITDylibInitializers::AsArgList::deserialize(
-        IB, MOJDIs.Name, MOJDIs.MachOHeaderAddress, MOJDIs.ObjCImageInfoAddress,
-        MOJDIs.InitSections);
-  }
-};
-
-using SPSMachOJITDylibDeinitializers = SPSEmpty;
-
-using SPSMachOJITDylibDeinitializerSequence =
-    SPSSequence<SPSMachOJITDylibDeinitializers>;
-
-template <>
-class SPSSerializationTraits<SPSMachOJITDylibDeinitializers,
-                             MachOJITDylibDeinitializers> {
-public:
-  static size_t size(const MachOJITDylibDeinitializers &MOJDDs) { return 0; }
-
-  static bool serialize(SPSOutputBuffer &OB,
-                        const MachOJITDylibDeinitializers &MOJDDs) {
-    return true;
-  }
-
-  static bool deserialize(SPSInputBuffer &IB,
-                          MachOJITDylibDeinitializers &MOJDDs) {
-    MOJDDs = MachOJITDylibDeinitializers();
-    return true;
-  }
-};
+using SPSNamedExecutorAddrRangeSequence =
+    SPSSequence<SPSTuple<SPSString, SPSExecutorAddrRange>>;
 
 } // end namespace shared
 } // end namespace orc
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
new file mode 100644
index 000000000000..d023bfbdb5b6
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
@@ -0,0 +1,115 @@
+//===- MemoryMapper.h - Cross-process memory mapper -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Cross-process (and in-process) memory mapping and transfer
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_MEMORYMAPPER_H
+#define LLVM_EXECUTIONENGINE_ORC_MEMORYMAPPER_H
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+
+#include <mutex>
+
+namespace llvm {
+namespace orc {
+
+/// Manages mapping, content transfer and protections for JIT memory
+class MemoryMapper {
+public:
+  /// Represents a single allocation containing multiple segments and
+  /// initialization and deinitialization actions
+  struct AllocInfo {
+    struct SegInfo {
+      ExecutorAddrDiff Offset;
+      const char *WorkingMem;
+      size_t ContentSize;
+      size_t ZeroFillSize;
+      unsigned Prot;
+    };
+
+    ExecutorAddr MappingBase;
+    std::vector<SegInfo> Segments;
+    shared::AllocActions Actions;
+  };
+
+  using OnReservedFunction = unique_function<void(Expected<ExecutorAddrRange>)>;
+
+  /// Reserves address space in executor process
+  virtual void reserve(size_t NumBytes, OnReservedFunction OnReserved) = 0;
+
+  /// Provides working memory
+  virtual char *prepare(ExecutorAddr Addr, size_t ContentSize) = 0;
+
+  using OnInitializedFunction = unique_function<void(Expected<ExecutorAddr>)>;
+
+  /// Ensures executor memory is synchronized with working copy memory, sends
+  /// functions to be called after initialization and before deinitialization,
+  /// and applies memory protections.
+  /// Returns a unique address identifying the allocation.
This address should + /// be passed to deinitialize to run deallocation actions (and reset + /// permissions where possible). + virtual void initialize(AllocInfo &AI, + OnInitializedFunction OnInitialized) = 0; + + using OnDeinitializedFunction = unique_function; + + /// Runs previously specified deinitialization actions + /// Executor addresses returned by initialize should be passed + virtual void deinitialize(ArrayRef Allocations, + OnDeinitializedFunction OnDeInitialized) = 0; + + using OnReleasedFunction = unique_function; + + /// Release address space acquired through reserve() + virtual void release(ArrayRef Reservations, + OnReleasedFunction OnRelease) = 0; + + virtual ~MemoryMapper(); +}; + +class InProcessMemoryMapper final : public MemoryMapper { +public: + InProcessMemoryMapper() {} + + void reserve(size_t NumBytes, OnReservedFunction OnReserved) override; + + void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override; + + char *prepare(ExecutorAddr Addr, size_t ContentSize) override; + + void deinitialize(ArrayRef Allocations, + OnDeinitializedFunction OnDeInitialized) override; + + void release(ArrayRef Reservations, + OnReleasedFunction OnRelease) override; + + ~InProcessMemoryMapper() override; + +private: + struct Allocation { + std::vector DeinitializationActions; + }; + using AllocationMap = DenseMap; + + struct Reservation { + size_t Size; + std::vector Allocations; + }; + using ReservationMap = DenseMap; + + std::mutex Mutex; + ReservationMap Reservations; + AllocationMap Allocations; +}; + +} // namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_MEMORYMAPPER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h index 82dfdc270128..c5c2780bc9ee 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h @@ -330,6 +330,45 @@ public: JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs); }; +// @brief riscv64 support. +// +// RISC-V 64 supports lazy JITing. +class OrcRiscv64 { +public: + static constexpr unsigned PointerSize = 8; + static constexpr unsigned TrampolineSize = 16; + static constexpr unsigned StubSize = 16; + static constexpr unsigned StubToPointerMaxDisplacement = 1 << 31; + static constexpr unsigned ResolverCodeSize = 0x148; + + /// Write the resolver code into the given memory. The user is + /// responsible for allocating the memory and setting permissions. + /// + /// ReentryFnAddr should be the address of a function whose signature matches + /// void* (*)(void *TrampolineAddr, void *ReentryCtxAddr). The ReentryCtxAddr + /// argument of writeResolverCode will be passed as the second argument to + /// the function at ReentryFnAddr. + static void writeResolverCode(char *ResolverWorkingMem, + JITTargetAddress ResolverTargetAddress, + JITTargetAddress ReentryFnAddr, + JITTargetAddress ReentryCtxAddr); + + /// Write the requested number of trampolines into the given memory, + /// which must be big enough to hold 1 pointer, plus NumTrampolines + /// trampolines. + static void writeTrampolines(char *TrampolineBlockWorkingMem, + JITTargetAddress TrampolineBlockTargetAddress, + JITTargetAddress ResolverFnAddr, + unsigned NumTrampolines); + /// Write NumStubs indirect stubs to working memory at StubsBlockWorkingMem. 
+ /// Stubs will be written as if linked at StubsBlockTargetAddress, with the + /// Nth stub using the Nth pointer in memory starting at + /// PointersBlockTargetAddress. + static void writeIndirectStubsBlock( + char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress, + JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs); +}; + } // end namespace orc } // end namespace llvm diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h index dc080cfc79d1..5d545f8abdb9 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h @@ -43,13 +43,22 @@ public: /// Cast this ExecutorAddr to a pointer of the given type. /// Warning: This should only be used when JITing in-process. - template T toPtr() const { - static_assert(std::is_pointer::value, "T must be a pointer type"); + template + std::enable_if_t::value, T> toPtr() const { uintptr_t IntPtr = static_cast(Addr); assert(IntPtr == Addr && "ExecutorAddr value out of range for uintptr_t"); return reinterpret_cast(IntPtr); } + /// Cast this ExecutorAddr to a pointer of the given function type. + /// Warning: This should only be used when JITing in-process. + template + std::enable_if_t::value, T *> toPtr() const { + uintptr_t IntPtr = static_cast(Addr); + assert(IntPtr == Addr && "ExecutorAddr value out of range for uintptr_t"); + return reinterpret_cast(IntPtr); + } + uint64_t getValue() const { return Addr; } void setValue(uint64_t Addr) { this->Addr = Addr; } bool isNull() const { return Addr == 0; } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h index 302b60b80fd0..9be58e9f0fa9 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h @@ -586,7 +586,7 @@ SPSSerializableExpected toSPSSerializable(Expected E) { if (E) return {true, std::move(*E), {}}; else - return {false, {}, toString(E.takeError())}; + return {false, T(), toString(E.takeError())}; } template diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index a138f60a7756..b7bba7a48786 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -88,7 +88,7 @@ private: for (auto &Callee : CandidateSet) { auto ImplSymbol = AliaseeImplTable.getImplFor(Callee); // try to distinguish already compiled & library symbols - if (!ImplSymbol.hasValue()) + if (!ImplSymbol) continue; const auto &ImplSymbolName = ImplSymbol.getPointer()->first; JITDylib *ImplJD = ImplSymbol.getPointer()->second; @@ -175,9 +175,8 @@ public: using ResultEval = std::function; using TargetAndLikelies = DenseMap; - IRSpeculationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer, - Speculator &Spec, MangleAndInterner &Mangle, - ResultEval Interpreter) + IRSpeculationLayer(ExecutionSession &ES, IRLayer &BaseLayer, Speculator &Spec, + MangleAndInterner &Mangle, ResultEval Interpreter) : IRLayer(ES, BaseLayer.getManglingOptions()), NextLayer(BaseLayer), S(Spec), Mangle(Mangle), QueryAnalysis(Interpreter) {} @@ -198,7 +197,7 @@ private: return InternedNames; } - IRCompileLayer &NextLayer; + IRLayer &NextLayer; Speculator &S; MangleAndInterner &Mangle; ResultEval QueryAnalysis; diff --git 
a/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h index 63abb196ba49..7e433965c922 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h @@ -19,6 +19,9 @@ #include namespace llvm { + +class raw_ostream; + namespace orc { class SymbolStringPtr; @@ -26,6 +29,10 @@ class SymbolStringPtr; /// String pool for symbol names used by the JIT. class SymbolStringPool { friend class SymbolStringPtr; + + // Implemented in DebugUtils.h. + friend raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPool &SSP); + public: /// Destroy a SymbolStringPool. ~SymbolStringPool(); diff --git a/llvm/include/llvm/FileCheck/FileCheck.h b/llvm/include/llvm/FileCheck/FileCheck.h index 7a6c98db3029..d6d8dc531e10 100644 --- a/llvm/include/llvm/FileCheck/FileCheck.h +++ b/llvm/include/llvm/FileCheck/FileCheck.h @@ -14,14 +14,17 @@ #define LLVM_FILECHECK_FILECHECK_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/SMLoc.h" #include +#include #include #include namespace llvm { +class MemoryBuffer; +class SourceMgr; +template class SmallVectorImpl; /// Contains info about various FileCheck options. struct FileCheckRequest { @@ -45,6 +48,7 @@ namespace Check { enum FileCheckKind { CheckNone = 0, + CheckMisspelled, CheckPlain, CheckNext, CheckSame, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index c5abb16dd9e5..5f1d335ef04f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -122,13 +122,12 @@ def OMPC_ProcBind : Clause<"proc_bind"> { ]; } -// static and auto are C++ keywords so need a capital to disambiguate. 
-def OMP_SCHEDULE_Static : ClauseVal<"Static", 2, 1> {}
-def OMP_SCHEDULE_Dynamic : ClauseVal<"Dynamic", 3, 1> {}
-def OMP_SCHEDULE_Guided : ClauseVal<"Guided", 4, 1> {}
-def OMP_SCHEDULE_Auto : ClauseVal<"Auto", 5, 1> {}
-def OMP_SCHEDULE_Runtime : ClauseVal<"Runtime", 6, 1> {}
-def OMP_SCHEDULE_Default : ClauseVal<"Default", 7, 0> { let isDefault = 1; }
+def OMP_SCHEDULE_Static : ClauseVal<"static", 2, 1> {}
+def OMP_SCHEDULE_Dynamic : ClauseVal<"dynamic", 3, 1> {}
+def OMP_SCHEDULE_Guided : ClauseVal<"guided", 4, 1> {}
+def OMP_SCHEDULE_Auto : ClauseVal<"auto", 5, 1> {}
+def OMP_SCHEDULE_Runtime : ClauseVal<"runtime", 6, 1> {}
+def OMP_SCHEDULE_Default : ClauseVal<"default", 7, 0> { let isDefault = 1; }
 
 def OMPC_Schedule : Clause<"schedule"> {
   let clangClass = "OMPScheduleClause";
@@ -164,6 +163,25 @@ def OMPC_MemoryOrder : Clause<"memory_order"> {
   ];
 }
 
+def OMP_CANCELLATION_CONSTRUCT_Parallel : ClauseVal<"parallel", 1, 1> {}
+def OMP_CANCELLATION_CONSTRUCT_Loop : ClauseVal<"loop", 2, 1> {}
+def OMP_CANCELLATION_CONSTRUCT_Sections : ClauseVal<"sections", 3, 1> {}
+def OMP_CANCELLATION_CONSTRUCT_Taskgroup : ClauseVal<"taskgroup", 4, 1> {}
+def OMP_CANCELLATION_CONSTRUCT_None : ClauseVal<"none", 5, 0> {
+  let isDefault = 1;
+}
+
+def OMPC_CancellationConstructType : Clause<"cancellation_construct_type"> {
+  let enumClauseValue = "CancellationConstructType";
+  let allowedClauseValues = [
+    OMP_CANCELLATION_CONSTRUCT_Parallel,
+    OMP_CANCELLATION_CONSTRUCT_Loop,
+    OMP_CANCELLATION_CONSTRUCT_Sections,
+    OMP_CANCELLATION_CONSTRUCT_Taskgroup,
+    OMP_CANCELLATION_CONSTRUCT_None
+  ];
+}
+
 def OMPC_Ordered : Clause<"ordered"> {
   let clangClass = "OMPOrderedClause";
   let flangClass = "ScalarIntConstantExpr";
@@ -254,12 +272,18 @@ def OMPC_IsDevicePtr : Clause<"is_device_ptr"> {
   let flangClass = "Name";
   let isValueList = true;
 }
+def OMPC_HasDeviceAddr : Clause<"has_device_addr"> {
+  let clangClass = "OMPHasDeviceAddrClause";
+  let flangClass = "Name";
+  let isValueList = true;
+}
 def OMPC_TaskReduction : Clause<"task_reduction"> {
   let clangClass = "OMPTaskReductionClause";
   let flangClass = "OmpReductionClause";
 }
 def OMPC_InReduction : Clause<"in_reduction"> {
   let clangClass = "OMPInReductionClause";
+  let flangClass = "OmpInReductionClause";
 }
 def OMPC_UnifiedAddress : Clause<"unified_address"> {
   let clangClass = "OMPUnifiedAddressClause";
@@ -557,7 +581,9 @@ def OMP_Target : Directive<"target"> {
     VersionedClause,
     VersionedClause,
     VersionedClause,
+    VersionedClause,
     VersionedClause,
+    VersionedClause,
     VersionedClause,
     VersionedClause
   ];
@@ -590,11 +616,20 @@ def OMP_Requires : Directive<"requires"> {
   let allowedClauses = [
     VersionedClause,
     VersionedClause,
-    VersionedClause,
+    // OpenMP 5.2 Spec: If an implementation does not support a requirement
+    // (reverse offload in this case) then it should terminate with a
+    // compile-time error.
+    // Setting the supported version for reverse_offload to a distant future
+    // version 9.9 so that its partial support can be tested in the meantime.
+    //
+    // TODO: Correct this supported version number whenever the complete
+    // implementation of reverse_offload is available.
+ VersionedClause, VersionedClause, VersionedClause ]; } +def OMP_Nothing : Directive<"nothing"> {} def OMP_TargetData : Directive<"target data"> { let allowedClauses = [ VersionedClause, @@ -645,6 +680,7 @@ def OMP_TargetParallel : Directive<"target parallel"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause ]; @@ -677,6 +713,7 @@ def OMP_TargetParallelFor : Directive<"target parallel for"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause @@ -693,6 +730,7 @@ def OMP_TargetParallelDo : Directive<"target parallel do"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -825,6 +863,21 @@ def OMP_ParallelMaster : Directive<"parallel master"> { VersionedClause ]; } +def OMP_ParallelMasked : Directive<"parallel masked"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; +} def OMP_ParallelSections : Directive<"parallel sections"> { let allowedClauses = [ VersionedClause, @@ -1126,6 +1179,7 @@ def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1156,6 +1210,7 @@ def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1169,6 +1224,7 @@ def OMP_TargetSimd : Directive<"target simd"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1342,6 +1398,7 @@ def OMP_TargetTeams : Directive<"target teams"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1365,6 +1422,7 @@ def OMP_TargetTeamsDistribute : Directive<"target teams distribute"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1395,6 +1453,7 @@ def OMP_TargetTeamsDistributeParallelFor : VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1420,6 +1479,7 @@ def OMP_TargetTeamsDistributeParallelDo : VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1456,6 +1516,7 @@ def OMP_TargetTeamsDistributeParallelForSimd : VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1485,6 +1546,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd : VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1523,6 +1585,7 @@ def OMP_TargetTeamsDistributeSimd : VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1581,6 +1644,28 @@ def OMP_MasterTaskloop : Directive<"master taskloop"> { VersionedClause ]; } +def OMP_MaskedTaskloop : Directive<"masked taskloop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + 
VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; +} def OMP_ParallelMasterTaskloop : Directive<"parallel master taskloop"> { let allowedClauses = [ @@ -1605,6 +1690,31 @@ def OMP_ParallelMasterTaskloop : VersionedClause ]; } +def OMP_ParallelMaskedTaskloop : + Directive<"parallel masked taskloop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; +} def OMP_MasterTaskloopSimd : Directive<"master taskloop simd"> { let allowedClauses = [ VersionedClause, @@ -1632,6 +1742,34 @@ def OMP_MasterTaskloopSimd : Directive<"master taskloop simd"> { VersionedClause ]; } +def OMP_MaskedTaskloopSimd : Directive<"masked taskloop simd"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; +} def OMP_ParallelMasterTaskloopSimd : Directive<"parallel master taskloop simd"> { let allowedClauses = [ @@ -1662,6 +1800,37 @@ def OMP_ParallelMasterTaskloopSimd : VersionedClause ]; } +def OMP_ParallelMaskedTaskloopSimd : + Directive<"parallel masked taskloop simd"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; +} def OMP_Depobj : Directive<"depobj"> { let allowedClauses = [ VersionedClause, @@ -1734,6 +1903,7 @@ def OMP_dispatch : Directive<"dispatch"> { let allowedClauses = [ VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1757,6 +1927,99 @@ def OMP_loop : Directive<"loop"> { VersionedClause, ]; } +def OMP_teams_loop : Directive<"teams loop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; + let allowedOnceClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; +} +def OMP_target_teams_loop : Directive<"target teams loop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause + ]; + let allowedOnceClauses = [ + VersionedClause, + VersionedClause, + 
VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; +} +def OMP_parallel_loop : Directive<"parallel loop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; + let allowedOnceClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; +} +def OMP_target_parallel_loop : Directive<"target parallel loop"> { + let allowedClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; + let allowedOnceClauses = [ + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + ]; +} def OMP_Metadirective : Directive<"metadirective"> { let allowedClauses = [VersionedClause]; let allowedOnceClauses = [VersionedClause]; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index bee90281e086..76104f6bc9cf 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -74,26 +74,114 @@ enum class IdentFlag { /// \note This needs to be kept in sync with kmp.h enum sched_type. /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h -/// To complete this, more enum values will need to be moved here. enum class OMPScheduleType { - StaticChunked = 33, - Static = 34, // static unspecialized - DistributeChunked = 91, - Distribute = 92, - DynamicChunked = 35, - GuidedChunked = 36, // guided unspecialized - Runtime = 37, - Auto = 38, // auto - - StaticBalancedChunked = 45, // static with chunk adjustment (e.g., simd) - GuidedSimd = 46, // guided with chunk adjustment - RuntimeSimd = 47, // runtime with chunk adjustment - - ModifierMonotonic = - (1 << 29), // Set if the monotonic schedule modifier was present - ModifierNonmonotonic = - (1 << 30), // Set if the nonmonotonic schedule modifier was present - ModifierMask = ModifierMonotonic | ModifierNonmonotonic, + // For typed comparisons, not a valid schedule + None = 0, + + // Schedule algorithms + BaseStaticChunked = 1, + BaseStatic = 2, + BaseDynamicChunked = 3, + BaseGuidedChunked = 4, + BaseRuntime = 5, + BaseAuto = 6, + BaseTrapezoidal = 7, + BaseGreedy = 8, + BaseBalanced = 9, + BaseGuidedIterativeChunked = 10, + BaseGuidedAnalyticalChunked = 11, + BaseSteal = 12, + + // with chunk adjustment (e.g., simd) + BaseStaticBalancedChunked = 13, + BaseGuidedSimd = 14, + BaseRuntimeSimd = 15, + + // static schedules algorithims for distribute + BaseDistributeChunked = 27, + BaseDistribute = 28, + + // Modifier flags to be combined with schedule algorithms + ModifierUnordered = (1 << 5), + ModifierOrdered = (1 << 6), + ModifierNomerge = (1 << 7), + ModifierMonotonic = (1 << 29), + ModifierNonmonotonic = (1 << 30), + + // Masks combining multiple flags + OrderingMask = ModifierUnordered | ModifierOrdered | ModifierNomerge, + MonotonicityMask = ModifierMonotonic | ModifierNonmonotonic, + ModifierMask = OrderingMask | MonotonicityMask, + + // valid schedule type values, without monotonicity flags + UnorderedStaticChunked = BaseStaticChunked | ModifierUnordered, // 
33 + UnorderedStatic = BaseStatic | ModifierUnordered, // 34 + UnorderedDynamicChunked = BaseDynamicChunked | ModifierUnordered, // 35 + UnorderedGuidedChunked = BaseGuidedChunked | ModifierUnordered, // 36 + UnorderedRuntime = BaseRuntime | ModifierUnordered, // 37 + UnorderedAuto = BaseAuto | ModifierUnordered, // 38 + UnorderedTrapezoidal = BaseTrapezoidal | ModifierUnordered, // 39 + UnorderedGreedy = BaseGreedy | ModifierUnordered, // 40 + UnorderedBalanced = BaseBalanced | ModifierUnordered, // 41 + UnorderedGuidedIterativeChunked = + BaseGuidedIterativeChunked | ModifierUnordered, // 42 + UnorderedGuidedAnalyticalChunked = + BaseGuidedAnalyticalChunked | ModifierUnordered, // 43 + UnorderedSteal = BaseSteal | ModifierUnordered, // 44 + + UnorderedStaticBalancedChunked = + BaseStaticBalancedChunked | ModifierUnordered, // 45 + UnorderedGuidedSimd = BaseGuidedSimd | ModifierUnordered, // 46 + UnorderedRuntimeSimd = BaseRuntimeSimd | ModifierUnordered, // 47 + + OrderedStaticChunked = BaseStaticChunked | ModifierOrdered, // 65 + OrderedStatic = BaseStatic | ModifierOrdered, // 66 + OrderedDynamicChunked = BaseDynamicChunked | ModifierOrdered, // 67 + OrderedGuidedChunked = BaseGuidedChunked | ModifierOrdered, // 68 + OrderedRuntime = BaseRuntime | ModifierOrdered, // 69 + OrderedAuto = BaseAuto | ModifierOrdered, // 70 + OrderdTrapezoidal = BaseTrapezoidal | ModifierOrdered, // 71 + + OrderedDistributeChunked = BaseDistributeChunked | ModifierOrdered, // 91 + OrderedDistribute = BaseDistribute | ModifierOrdered, // 92 + + NomergeUnorderedStaticChunked = + BaseStaticChunked | ModifierUnordered | ModifierNomerge, // 161 + NomergeUnorderedStatic = + BaseStatic | ModifierUnordered | ModifierNomerge, // 162 + NomergeUnorderedDynamicChunked = + BaseDynamicChunked | ModifierUnordered | ModifierNomerge, // 163 + NomergeUnorderedGuidedChunked = + BaseGuidedChunked | ModifierUnordered | ModifierNomerge, // 164 + NomergeUnorderedRuntime = + BaseRuntime | ModifierUnordered | ModifierNomerge, // 165 + NomergeUnorderedAuto = BaseAuto | ModifierUnordered | ModifierNomerge, // 166 + NomergeUnorderedTrapezoidal = + BaseTrapezoidal | ModifierUnordered | ModifierNomerge, // 167 + NomergeUnorderedGreedy = + BaseGreedy | ModifierUnordered | ModifierNomerge, // 168 + NomergeUnorderedBalanced = + BaseBalanced | ModifierUnordered | ModifierNomerge, // 169 + NomergeUnorderedGuidedIterativeChunked = + BaseGuidedIterativeChunked | ModifierUnordered | ModifierNomerge, // 170 + NomergeUnorderedGuidedAnalyticalChunked = + BaseGuidedAnalyticalChunked | ModifierUnordered | ModifierNomerge, // 171 + NomergeUnorderedSteal = + BaseSteal | ModifierUnordered | ModifierNomerge, // 172 + + NomergeOrderedStaticChunked = + BaseStaticChunked | ModifierOrdered | ModifierNomerge, // 193 + NomergeOrderedStatic = BaseStatic | ModifierOrdered | ModifierNomerge, // 194 + NomergeOrderedDynamicChunked = + BaseDynamicChunked | ModifierOrdered | ModifierNomerge, // 195 + NomergeOrderedGuidedChunked = + BaseGuidedChunked | ModifierOrdered | ModifierNomerge, // 196 + NomergeOrderedRuntime = + BaseRuntime | ModifierOrdered | ModifierNomerge, // 197 + NomergeOrderedAuto = BaseAuto | ModifierOrdered | ModifierNomerge, // 198 + NomergeOrderedTrapezoidal = + BaseTrapezoidal | ModifierOrdered | ModifierNomerge, // 199 + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask) }; @@ -116,6 +204,9 @@ enum class AddressSpace : unsigned { /// \note This needs to be kept in sync with interop.h enum kmp_interop_type_t.: enum class OMPInteropType { 
Unknown, Target, TargetSync };
 
+/// Atomic compare operations. Currently OpenMP only supports ==, >, and <.
+enum class OMPAtomicCompareOp : unsigned { EQ, MIN, MAX };
+
 } // end namespace omp
 } // end namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h
index 544f698655a4..b13b74ceab86 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h
@@ -15,14 +15,14 @@
 #ifndef LLVM_FRONTEND_OPENMP_OMPCONTEXT_H
 #define LLVM_FRONTEND_OPENMP_OMPCONTEXT_H
 
-#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 
 namespace llvm {
+class Triple;
 namespace omp {
 
 /// OpenMP Context related IDs and helpers
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index f60debe8411c..8a6b1c7d412d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -23,6 +23,52 @@
 namespace llvm {
 class CanonicalLoopInfo;
 
+/// Move the instruction after an InsertPoint to the beginning of another
+/// BasicBlock.
+///
+/// The instructions after \p IP are moved to the beginning of \p New which must
+/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
+/// \p New will be added such that there is no semantic change. Otherwise, the
+/// \p IP insert block remains degenerate and it is up to the caller to insert a
+/// terminator.
+void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
+              bool CreateBranch);
+
+/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
+/// insert location will stick to after the instruction before the insertion
+/// point (instead of moving with the instruction the InsertPoint stores
+/// internally).
+void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
+
+/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
+/// (missing the terminator).
+///
+/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
+/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
+/// is true, a branch to the new successor will be created such that
+/// semantically there is no change; otherwise the block of the insertion point
+/// remains degenerate and it is the caller's responsibility to insert a
+/// terminator. Returns the new successor block.
+BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
+                    llvm::Twine Name = {});
+
+/// Split a BasicBlock at \p Builder's insertion point, even if the block is
+/// degenerate (missing the terminator). Its new insert location will stick to
+/// after the instruction before the insertion point (instead of moving with the
+/// instruction the InsertPoint stores internally).
+BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
+                    llvm::Twine Name = {});
+
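A short hedged sketch of how these split helpers are typically driven (editorial, not part of the patch; Builder names an existing IRBuilder):

    // Split at the current insertion point; the branch keeps semantics
    // unchanged. Builder keeps inserting into the predecessor block, before
    // the new branch, so runtime calls can be emitted there without
    // disturbing the split-off tail.
    BasicBlock *ContBB = splitBB(Builder, /*CreateBranch=*/true, "cont");

+/// Split a BasicBlock at \p Builder's insertion point, even if the block is
+/// degenerate (missing the terminator). Its new insert location will stick to
+/// after the instruction before the insertion point (instead of moving with the
+/// instruction the InsertPoint stores internally).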
+BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
+
+/// Like splitBB, but reuses the current block's name for the new name.
+BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
+                              llvm::Twine Suffix = ".split");
+
 /// An interface to create LLVM-IR for OpenMP directives.
 ///
 /// Each OpenMP directive has a corresponding public generator method.
@@ -87,27 +133,36 @@ public:
   /// Callback type for body (=inner region) code generation
   ///
   /// The callback takes code locations as arguments, each describing a
-  /// location at which code might need to be generated or a location that is
-  /// the target of control transfer.
+  /// location where additional instructions can be inserted.
+  ///
+  /// The CodeGenIP may be in the middle of a basic block or point to the end of
+  /// it. The basic block may have a terminator or be degenerate. The callback
+  /// function may just insert instructions at that position, but may also split
+  /// the block (without the Before argument of BasicBlock::splitBasicBlock,
+  /// such that the identity of the split predecessor block is preserved) and
+  /// insert additional control flow, including branches that do not lead back
+  /// to what follows the CodeGenIP. Note that since the callback is allowed to
+  /// split the block, callers must assume that InsertPoints to positions in the
+  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
+  /// such InsertPoints need to be preserved, the caller can split the block
+  /// itself before calling the callback.
+  ///
+  /// AllocaIP and CodeGenIP must not point to the same position.
+  ///
   /// \param AllocaIP is the insertion point at which new alloca instructions
-  ///        should be placed.
+  ///        should be placed. The BasicBlock it is pointing to must
+  ///        not be split.
   /// \param CodeGenIP is the insertion point at which the body code should be
   ///        placed.
-  /// \param ContinuationBB is the basic block target to leave the body.
-  ///
-  /// Note that all blocks pointed to by the arguments have terminators.
   using BodyGenCallbackTy =
-      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                        BasicBlock &ContinuationBB)>;
+      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
   // This is created primarily for sections construct as llvm::function_ref
   // (BodyGenCallbackTy) is not storable (as described in the comments of
   // function_ref class - function_ref contains non-ownable reference
   // to the callable.
   using StorableBodyGenCallbackTy =
-      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                         BasicBlock &ContinuationBB)>;
+      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
   /// Callback type for loop body code generation.
   ///
@@ -145,8 +200,7 @@ public:
   /// Description of a LLVM-IR insertion point (IP) and a debug/source location
   /// (filename, line, column, ...).
   struct LocationDescription {
-    template <typename T, typename U>
-    LocationDescription(const IRBuilder<T, U> &IRB)
+    LocationDescription(const IRBuilderBase &IRB)
         : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
     LocationDescription(const InsertPointTy &IP) : IP(IP) {}
     LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
@@ -345,6 +399,7 @@ public:
                                          ArrayRef<CanonicalLoopInfo *> Loops,
                                          InsertPointTy ComputeIP);
 
+private:
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
   /// This takes a \p LoopInfo representing a canonical loop, such as the one
   /// created by createCanonicalLoop and emits additional instructions to
   /// turn it into a workshare loop. In particular, it calls to an OpenMP
   /// runtime function in the preheader to obtain the loop bounds to be used in
   /// the current thread, updates the relevant instructions in the canonical
   /// loop and calls to an OpenMP runtime finalization function after the loop.
   ///
-  /// TODO: Workshare loops with static scheduling may contain up to two loops
-  /// that fulfill the requirements of an OpenMP canonical loop. One for
-  /// iterating over all iterations of a chunk and another one for iterating
-  /// over all chunks that are executed on the same thread. Returning
-  /// CanonicalLoopInfo objects representing them may eventually be useful for
-  /// the apply clause planned in OpenMP 6.0, but currently whether these are
-  /// canonical loops is irrelevant.
-  ///
   /// \param DL Debug location for instructions added for the
   ///           workshare-loop construct itself.
   /// \param CLI A descriptor of the canonical loop to workshare.
   /// \param AllocaIP An insertion point for Alloca instructions usable in the
   ///                 preheader of the loop.
   /// \param NeedsBarrier Indicates whether a barrier must be inserted after
   ///                     the loop.
-  /// \param Chunk The size of loop chunk considered as a unit when
-  ///              scheduling. If \p nullptr, defaults to 1.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
-                                         bool NeedsBarrier,
-                                         Value *Chunk = nullptr);
+                                         bool NeedsBarrier);
+
+  /// Modifies the canonical loop to be a statically-scheduled workshare loop
+  /// with a user-specified chunk size.
+  ///
+  /// \param DL Debug location for instructions added for the
+  ///           workshare-loop construct itself.
+  /// \param CLI A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in
+  ///                 the preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
+  ///                     loop.
+  /// \param ChunkSize The user-specified chunk size.
+  ///
+  /// \returns Point where to insert code after the workshare construct.
+  InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
+                                                CanonicalLoopInfo *CLI,
+                                                InsertPointTy AllocaIP,
+                                                bool NeedsBarrier,
+                                                Value *ChunkSize);
 
   /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
   ///
@@ -404,6 +467,7 @@ public:
                                           bool NeedsBarrier,
                                           Value *Chunk = nullptr);
 
+public:
   /// Modifies the canonical loop to be a workshare loop.
   ///
   /// This takes a \p LoopInfo representing a canonical loop, such as the one
   /// created by createCanonicalLoop and emits additional instructions to
   /// turn it into a workshare loop. In particular, it calls to an OpenMP
   /// runtime function in the preheader to obtain the loop bounds to be used in
   /// the current thread, updates the relevant instructions in the canonical
   /// loop and calls to an OpenMP runtime finalization function after the loop.
   ///
+  /// The concrete transformation is done by applyStaticWorkshareLoop,
+  /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
+  /// on the value of \p SchedKind and \p ChunkSize.
+  ///
   /// \param DL Debug location for instructions added for the
   ///           workshare-loop construct itself.
   /// \param CLI A descriptor of the canonical loop to workshare.
   /// \param AllocaIP An insertion point for Alloca instructions usable in the
   ///                 preheader of the loop.
   /// \param NeedsBarrier Indicates whether a barrier must be inserted after
   ///                     the loop.
+  /// \param SchedKind Scheduling algorithm to use.
+  /// \param ChunkSize The chunk size for the inner loop.
+  /// \param HasSimdModifier Whether the simd modifier is present in the
+  ///                        schedule clause.
+  /// \param HasMonotonicModifier Whether the monotonic modifier is present in
+  ///                             the schedule clause.
+  /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
+  ///                                present in the schedule clause.
+  /// \param HasOrderedClause Whether the (parameterless) ordered clause is
+  ///                         present.
   ///
   /// \returns Point where to insert code after the workshare construct.
-  InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
-                                   InsertPointTy AllocaIP, bool NeedsBarrier);
+  InsertPointTy applyWorkshareLoop(
+      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+      bool NeedsBarrier,
+      llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
+      Value *ChunkSize = nullptr, bool HasSimdModifier = false,
+      bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
+      bool HasOrderedClause = false);
 
   /// Tile a loop nest.
   ///
@@ -535,6 +618,18 @@ public:
   /// \param Loc The location where the taskyield directive was encountered.
   void createTaskyield(const LocationDescription &Loc);
 
+  /// Generator for `#omp task`
+  ///
+  /// \param Loc The location where the task construct was encountered.
+  /// \param AllocaIP The insertion point to be used for alloca instructions.
+  /// \param BodyGenCB Callback that will generate the region code.
+  /// \param Tied True if the task is tied, false if the task is untied.
+  /// \param Final i1 value which is `true` if the task is final, `false` if the
+  ///              task is not final.
+  InsertPointTy createTask(const LocationDescription &Loc,
+                           InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
+                           bool Tied = true, Value *Final = nullptr);
+
   /// Functions used to generate reductions. Such functions take two Values
   /// representing LHS and RHS of the reduction, respectively, and a reference
   /// to the value that is updated to refer to the reduction result.
@@ -696,6 +791,27 @@ public:
   /// Value.
   GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
 
+  /// Create an offloading section struct used to register this global at
+  /// runtime.
+  ///
+  /// Type struct __tgt_offload_entry{
+  ///   void    *addr;      // Pointer to the offload entry info.
+  ///                       // (function or global)
+  ///   char    *name;      // Name of the function or global.
+  ///   size_t  size;       // Size of the entry info (0 if it is a function).
+  ///   int32_t flags;
+  ///   int32_t reserved;
+  /// };
+  ///
+  /// \param Addr The pointer to the global being registered.
+  /// \param Name The symbol name associated with the global.
+  /// \param Size The size in bytes of the global (0 for functions).
+  /// \param Flags Flags associated with the entry.
+  /// \param SectionName The section this entry will be placed at.
+  void emitOffloadingEntry(Constant *Addr, StringRef Name, uint64_t Size,
+                           int32_t Flags,
+                           StringRef SectionName = "omp_offloading_entries");
+
   /// Generate control flow and cleanup for cancellation.
   ///
   /// \param CancelFlag Flag indicating if the cancellation is performed.
@@ -768,7 +884,7 @@ public:
   struct OutlineInfo {
     using PostOutlineCBTy = std::function<void(Function &)>;
     PostOutlineCBTy PostOutlineCB;
-    BasicBlock *EntryBB, *ExitBB;
+    BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
     SmallVector<Value *> ExcludeArgsFromAggregate;
 
     /// Collect all blocks in between EntryBB and ExitBB in both the given
@@ -851,12 +967,14 @@ public:
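A hedged illustration of the createTask generator declared above (editorial, not part of the patch; OMPB, Builder, and AllocaIP stand for an existing OpenMPIRBuilder, its IRBuilder, and an alloca insertion point):

    auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
                         OpenMPIRBuilder::InsertPointTy CodeGenIP) {
      Builder.restoreIP(CodeGenIP);
      // ... emit the body of the task region here ...
    };
    Builder.restoreIP(OMPB.createTask(
        OpenMPIRBuilder::LocationDescription(Builder), AllocaIP, BodyGenCB,
        /*Tied=*/true));

   /// \param Loc The source location description.
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param FiniCB Callback to finalize variable copies.
+  /// \param IsNowait If false, a barrier is emitted.
   /// \param DidIt Local variable used as a flag to indicate 'single' thread
   ///
   /// \returns The insertion position *after* the single call.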
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, - FinalizeCallbackTy FiniCB, llvm::Value *DidIt); + FinalizeCallbackTy FiniCB, bool IsNowait, + llvm::Value *DidIt); /// Generator for '#omp master' /// @@ -1198,7 +1316,7 @@ private: const function_ref &IRB)>; private: - enum AtomicKind { Read, Write, Update, Capture }; + enum AtomicKind { Read, Write, Update, Capture, Compare }; /// Determine whether to emit flush or not /// @@ -1214,7 +1332,8 @@ private: /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X) /// Only Scalar data types. /// - /// \param AllocIP Instruction to create AllocaInst before. + /// \param AllocaIP The insertion point to be used for alloca + /// instructions. /// \param X The target atomic pointer to be updated /// \param XElemTy The element type of the atomic pointer. /// \param Expr The value to update X with. @@ -1234,7 +1353,7 @@ private: /// \returns A pair of the old value of X before the update, and the value /// used for the update. std::pair - emitAtomicUpdate(Instruction *AllocIP, Value *X, Type *XElemTy, Value *Expr, + emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr); @@ -1286,7 +1405,7 @@ public: /// Only Scalar data types. /// /// \param Loc The insert and source location description. - /// \param AllocIP Instruction to create AllocaInst before. + /// \param AllocaIP The insertion point to be used for alloca instructions. /// \param X The target atomic pointer to be updated /// \param Expr The value to update X with. /// \param AO Atomic ordering of the generated atomic instructions. @@ -1302,7 +1421,7 @@ public: /// /// \return Insertion point after generated atomic update IR. InsertPointTy createAtomicUpdate(const LocationDescription &Loc, - Instruction *AllocIP, AtomicOpValue &X, + InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, @@ -1317,7 +1436,7 @@ public: /// X = UpdateOp(X); V = X, /// /// \param Loc The insert and source location description. - /// \param AllocIP Instruction to create AllocaInst before. + /// \param AllocaIP The insertion point to be used for alloca instructions. /// \param X The target atomic pointer to be updated /// \param V Memory address where to store captured value /// \param Expr The value to update X with. @@ -1338,12 +1457,63 @@ public: /// /// \return Insertion point after generated atomic capture IR. InsertPointTy - createAtomicCapture(const LocationDescription &Loc, Instruction *AllocIP, + createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr); + /// Emit atomic compare for constructs: --- Only scalar data types + /// cond-expr-stmt: + /// x = x ordop expr ? expr : x; + /// x = expr ordop x ? expr : x; + /// x = x == e ? d : x; + /// x = e == x ? 
d : x; (this one is not in the spec) + /// cond-update-stmt: + /// if (x ordop expr) { x = expr; } + /// if (expr ordop x) { x = expr; } + /// if (x == e) { x = d; } + /// if (e == x) { x = d; } (this one is not in the spec) + /// conditional-update-capture-atomic: + /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false) + /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false) + /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false, + /// IsFailOnly=true) + /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false) + /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false, + /// IsFailOnly=true) + /// + /// \param Loc The insert and source location description. + /// \param X The target atomic pointer to be updated. + /// \param V Memory address where to store captured value (for + /// compare capture only). + /// \param R Memory address where to store comparison result + /// (for compare capture with '==' only). + /// \param E The expected value ('e') for forms that use an + /// equality comparison or an expression ('expr') for + /// forms that use 'ordop' (logically an atomic maximum or + /// minimum). + /// \param D The desired value for forms that use an equality + /// comparison. For forms that use 'ordop', it should be + /// \p nullptr. + /// \param AO Atomic ordering of the generated atomic instructions. + /// \param Op Atomic compare operation. It can only be ==, <, or >. + /// \param IsXBinopExpr True if the conditional statement is in the form where + /// x is on LHS. It only matters for < or >. + /// \param IsPostfixUpdate True if the original value of 'x' must be stored in + /// 'v', not an updated one (for compare capture + /// only). + /// \param IsFailOnly True if the original value of 'x' is stored to 'v' + /// only when the comparison fails. This is only valid for + /// the case where the comparison is '=='. + /// + /// \return Insertion point after generated atomic capture IR. + InsertPointTy + createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, + AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, + AtomicOrdering AO, omp::OMPAtomicCompareOp Op, + bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly); + /// Create the control flow structure of a canonical OpenMP loop. /// /// The emitted loop will be disconnected, i.e. no edge to the loop's @@ -1484,6 +1654,27 @@ private: /// Re-evaluate whether this makes sense. void collectControlBlocks(SmallVectorImpl &BBs); + /// Sets the number of loop iterations to the given value. This value must be + /// valid in the condition block (i.e., defined in the preheader) and is + /// interpreted as an unsigned integer. + void setTripCount(Value *TripCount); + + /// Replace all uses of the canonical induction variable in the loop body with + /// a new one. + /// + /// The intended use case is to update the induction variable for an updated + /// iteration space such that it can stay normalized in the 0...tripcount-1 + /// range. + /// + /// The \p Updater is called with the (presumably updated) current normalized + /// induction variable and is expected to return the value that uses of the + /// pre-updated induction values should use instead, typically dependent on + /// the new induction variable. This is a lambda (instead of e.g. just passing + /// the new value) to be able to distinguish the uses of the pre-updated + /// induction variable and uses of the induction variable to compute the + /// updated induction variable value.
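Two minimal usage sketches, not part of the vendored hunks; OMPBuilder, Builder, Loc, X, V, R, E, D, AO, and CLI are invented names, and the function_ref parameter elided in this hunk is assumed to be Value *(Instruction *). The first maps the simplest cond-update-stmt form above; the second shows the Updater callback expected by the mapIndVar() declaration that follows:

    // 'if (x == e) { x = d; }' with no capture: '==' comparison, no postfix
    // update, no fail-only store.
    // OMPBuilder.createAtomicCompare(Loc, X, V, R, E, D, AO,
    //                                omp::OMPAtomicCompareOp::EQ,
    //                                /*IsXBinopExpr=*/false,
    //                                /*IsPostfixUpdate=*/false,
    //                                /*IsFailOnly=*/false);

    // Keep the IV normalized after setTripCount() halved the trip count for
    // an unroll-by-two: uses of the pre-update IV become '2 * NewIV'.
    // CLI->mapIndVar([&](Instruction *NewIV) -> Value * {
    //   return Builder.CreateMul(NewIV, ConstantInt::get(NewIV->getType(), 2));
    // });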
+ void mapIndVar(llvm::function_ref Updater); + public: /// Returns whether this object currently represents the IR of a loop. If /// returning false, it may have been consumed by a loop transformation or not diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 0c3cb3f43105..14aa53a6b08d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -86,6 +86,8 @@ __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8) OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__) __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr) +__OMP_STRUCT_TYPE(OffloadEntry, __tgt_offload_entry, Int8Ptr, Int8Ptr, SizeTy, + Int32, Int32) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr) #undef __OMP_STRUCT_TYPE @@ -475,6 +477,7 @@ __OMP_RTL(__last, false, Void, ) #define ParamAttrs(...) ArrayRef({__VA_ARGS__}) #define EnumAttr(Kind) Attribute::get(Ctx, Attribute::AttrKind::Kind) #define EnumAttrInt(Kind, N) Attribute::get(Ctx, Attribute::AttrKind::Kind, N) +#define AllocSizeAttr(N, M) Attribute::getWithAllocSizeArgs(Ctx, N, M) #define AttributeSet(...) \ AttributeSet::get(Ctx, ArrayRef({__VA_ARGS__})) @@ -908,8 +911,10 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(), __OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs)) -__OMP_RTL_ATTRS(__kmpc_alloc_shared, DeviceAllocAttrs, ReturnPtrAttrs, - ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_alloc_shared, AttributeSet( + EnumAttr(NoUnwind), + EnumAttr(NoSync), + AllocSizeAttr(0, None)), ReturnPtrAttrs, ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(), ParamAttrs(NoCaptureAttrs)) @@ -962,6 +967,7 @@ __OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(), #undef EnumAttr #undef EnumAttrInt #undef ParamAttrs +#undef AllocSizeAttr ///} @@ -1026,6 +1032,7 @@ __OMP_CANCEL_KIND(taskgroup, 4) __OMP_DEFAULT_KIND(none) __OMP_DEFAULT_KIND(shared) +__OMP_DEFAULT_KIND(private) __OMP_DEFAULT_KIND(firstprivate) __OMP_DEFAULT_KIND(unknown) @@ -1153,6 +1160,7 @@ __OMP_TRAIT_PROPERTY(implementation, extension, match_any) __OMP_TRAIT_PROPERTY(implementation, extension, match_none) __OMP_TRAIT_PROPERTY(implementation, extension, disable_implicit_base) __OMP_TRAIT_PROPERTY(implementation, extension, allow_templates) +__OMP_TRAIT_PROPERTY(implementation, extension, bind_to_declaration) __OMP_TRAIT_SET(user) diff --git a/llvm/include/llvm/FuzzMutate/FuzzerCLI.h b/llvm/include/llvm/FuzzMutate/FuzzerCLI.h index 473277396a90..db0168d3e675 100644 --- a/llvm/include/llvm/FuzzMutate/FuzzerCLI.h +++ b/llvm/include/llvm/FuzzMutate/FuzzerCLI.h @@ -14,8 +14,8 @@ #ifndef LLVM_FUZZMUTATE_FUZZERCLI_H #define LLVM_FUZZMUTATE_FUZZERCLI_H -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/DataTypes.h" +#include namespace llvm { @@ -51,29 +51,6 @@ using FuzzerInitFun = int (*)(int *argc, char ***argv); int runFuzzerOnInputs(int ArgC, char *ArgV[], FuzzerTestFun TestOne, FuzzerInitFun Init = [](int *, char ***) { return 0; }); -/// Fuzzer friendly interface for the llvm bitcode parser. -/// -/// \param Data Bitcode we are going to parse -/// \param Size Size of the 'Data' in bytes -/// \return New module or nullptr in case of error -std::unique_ptr parseModule(const uint8_t *Data, size_t Size, - LLVMContext &Context); - -/// Fuzzer friendly interface for the llvm bitcode printer. 
-/// -/// \param M Module to print -/// \param Dest Location to store serialized module -/// \param MaxSize Size of the destination buffer -/// \return Number of bytes that were written. When module size exceeds MaxSize -/// returns 0 and leaves Dest unchanged. -size_t writeModule(const Module &M, uint8_t *Dest, size_t MaxSize); - -/// Try to parse module and verify it. May output verification errors to the -/// errs(). -/// \return New module or nullptr in case of error. -std::unique_ptr parseAndVerify(const uint8_t *Data, size_t Size, - LLVMContext &Context); - -} // end llvm namespace +} // namespace llvm #endif // LLVM_FUZZMUTATE_FUZZERCLI_H diff --git a/llvm/include/llvm/FuzzMutate/IRMutator.h b/llvm/include/llvm/FuzzMutate/IRMutator.h index 423582eace9b..ade76f1b5845 100644 --- a/llvm/include/llvm/FuzzMutate/IRMutator.h +++ b/llvm/include/llvm/FuzzMutate/IRMutator.h @@ -10,6 +10,9 @@ // configurable set of strategies. Some common strategies are also included // here. // +// Fuzzer-friendly (de)serialization functions are also provided, as these +// are usually needed when mutating IR. +// //===----------------------------------------------------------------------===// #ifndef LLVM_FUZZMUTATE_IRMUTATOR_H @@ -113,6 +116,29 @@ public: void mutate(Instruction &Inst, RandomIRBuilder &IB) override; }; +/// Fuzzer friendly interface for the llvm bitcode parser. +/// +/// \param Data Bitcode we are going to parse +/// \param Size Size of the 'Data' in bytes +/// \return New module or nullptr in case of error +std::unique_ptr parseModule(const uint8_t *Data, size_t Size, + LLVMContext &Context); + +/// Fuzzer friendly interface for the llvm bitcode printer. +/// +/// \param M Module to print +/// \param Dest Location to store serialized module +/// \param MaxSize Size of the destination buffer +/// \return Number of bytes that were written. When module size exceeds MaxSize +/// returns 0 and leaves Dest unchanged. +size_t writeModule(const Module &M, uint8_t *Dest, size_t MaxSize); + +/// Try to parse module and verify it. May output verification errors to the +/// errs(). +/// \return New module or nullptr in case of error. 
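A rough round-trip sketch for these helpers, assuming the template arguments elided in this hunk are std::unique_ptr<Module> and that Data, Size, and Ctx come from a fuzzer harness; parseAndVerify() is declared immediately below:

    // std::unique_ptr<Module> M = parseAndVerify(Data, Size, Ctx);
    // if (!M)
    //   return 0;                      // reject inputs that fail to parse/verify
    // mutateModule(*M);                // hypothetical mutation step
    // uint8_t Buf[64 << 10];
    // size_t Written = writeModule(*M, Buf, sizeof(Buf));
    // // Written == 0 means the module exceeded MaxSize; Buf is left unchanged.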
+std::unique_ptr parseAndVerify(const uint8_t *Data, size_t Size, + LLVMContext &Context); + } // end llvm namespace #endif // LLVM_FUZZMUTATE_IRMUTATOR_H diff --git a/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/llvm/include/llvm/FuzzMutate/OpDescriptor.h index 43c810920766..847f975571bc 100644 --- a/llvm/include/llvm/FuzzMutate/OpDescriptor.h +++ b/llvm/include/llvm/FuzzMutate/OpDescriptor.h @@ -15,16 +15,15 @@ #define LLVM_FUZZMUTATE_OPDESCRIPTOR_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include namespace llvm { +class Instruction; namespace fuzzerop { /// @{ @@ -146,7 +145,8 @@ static inline SourcePred sizedPtrType() { return false; if (const auto *PtrT = dyn_cast(V->getType())) - return PtrT->getPointerElementType()->isSized(); + return PtrT->isOpaque() || + PtrT->getNonOpaquePointerElementType()->isSized(); return false; }; auto Make = [](ArrayRef, ArrayRef Ts) { diff --git a/llvm/include/llvm/FuzzMutate/RandomIRBuilder.h b/llvm/include/llvm/FuzzMutate/RandomIRBuilder.h index f3b609702e9d..aeb41baa8e07 100644 --- a/llvm/include/llvm/FuzzMutate/RandomIRBuilder.h +++ b/llvm/include/llvm/FuzzMutate/RandomIRBuilder.h @@ -13,12 +13,19 @@ #ifndef LLVM_FUZZMUTATE_RANDOMIRBUILDER_H #define LLVM_FUZZMUTATE_RANDOMIRBUILDER_H -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/FuzzMutate/IRMutator.h" -#include "llvm/FuzzMutate/Random.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include namespace llvm { +class BasicBlock; +class Instruction; +class LLVMContext; +class Type; +class Value; +namespace fuzzerop { +class SourcePred; +} using RandomEngine = std::mt19937; diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h index 69048554a05c..50afe016f0d6 100644 --- a/llvm/include/llvm/IR/AbstractCallSite.h +++ b/llvm/include/llvm/IR/AbstractCallSite.h @@ -14,17 +14,17 @@ #ifndef LLVM_IR_ABSTRACTCALLSITE_H #define LLVM_IR_ABSTRACTCALLSITE_H -#include "llvm/IR/Argument.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Use.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" #include namespace llvm { +class Argument; +class Use; + /// AbstractCallSite /// /// An abstract call site is a wrapper that allows to treat direct, diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h index 7cbfa2a7b6ce..3b74853cdafa 100644 --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -14,7 +14,6 @@ #define LLVM_IR_ARGUMENT_H #include "llvm/ADT/Twine.h" -#include "llvm/ADT/ilist_node.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Value.h" diff --git a/llvm/include/llvm/IR/Assumptions.h b/llvm/include/llvm/IR/Assumptions.h index 08e6c8b6f1e0..2d2ecfbde6e6 100644 --- a/llvm/include/llvm/IR/Assumptions.h +++ b/llvm/include/llvm/IR/Assumptions.h @@ -34,6 +34,10 @@ extern StringSet<> KnownAssumptionStrings; /// Helper that allows to insert a new assumption string in the known assumption /// set by creating a (static) object. 
struct KnownAssumptionString { + KnownAssumptionString(const char *AssumptionStr) + : AssumptionStr(AssumptionStr) { + KnownAssumptionStrings.insert(AssumptionStr); + } KnownAssumptionString(StringRef AssumptionStr) : AssumptionStr(AssumptionStr) { KnownAssumptionStrings.insert(AssumptionStr); diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 74b60f1e3d05..6a4e6d63a973 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -17,11 +17,13 @@ #include "llvm-c/Types.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/PointerLikeTypeTraits.h" #include #include @@ -42,6 +44,18 @@ class Function; class LLVMContext; class Type; +enum class AllocFnKind : uint64_t { + Unknown = 0, + Alloc = 1 << 0, // Allocator function returns a new allocation + Realloc = 1 << 1, // Allocator function resizes the `allocptr` argument + Free = 1 << 2, // Allocator function frees the `allocptr` argument + Uninitialized = 1 << 3, // Allocator function returns uninitialized memory + Zeroed = 1 << 4, // Allocator function returns zeroed memory + Aligned = 1 << 5, // Allocator function aligns allocations per the + // `allocalign` argument + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ Aligned) +}; + //===----------------------------------------------------------------------===// /// \class /// Functions, function parameters, and return types can have attributes @@ -130,6 +144,7 @@ public: static Attribute getWithByRefType(LLVMContext &Context, Type *Ty); static Attribute getWithPreallocatedType(LLVMContext &Context, Type *Ty); static Attribute getWithInAllocaType(LLVMContext &Context, Type *Ty); + static Attribute getWithUWTableKind(LLVMContext &Context, UWTableKind Kind); /// For a typed attribute, return the equivalent attribute with the type /// changed to \p ReplacementTy. @@ -223,6 +238,12 @@ public: /// unknown. Optional getVScaleRangeMax() const; + // Returns the unwind table kind. + UWTableKind getUWTableKind() const; + + // Returns the allocator function kind. + AllocFnKind getAllocKind() const; + /// The Attribute is converted to a string of equivalent mnemonic. This /// is, presumably, for writing out the mnemonics for the assembly writer. std::string getAsString(bool InAttrGrp = false) const; @@ -353,6 +374,8 @@ public: std::pair> getAllocSizeArgs() const; unsigned getVScaleRangeMin() const; Optional getVScaleRangeMax() const; + UWTableKind getUWTableKind() const; + AllocFnKind getAllocKind() const; std::string getAsString(bool InAttrGrp = false) const; /// Return true if this attribute set belongs to the LLVMContext. @@ -841,6 +864,11 @@ public: /// arg. uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const; + /// Get the unwind table kind requested for the function. + UWTableKind getUWTableKind() const; + + AllocFnKind getAllocKind() const; + /// Return the attributes at the index as a string. std::string getAsString(unsigned Index, bool InAttrGrp = false) const; @@ -1190,6 +1218,13 @@ public: /// Attribute.getIntValue(). AttrBuilder &addVScaleRangeAttrFromRawRepr(uint64_t RawVScaleRangeRepr); + /// This turns the unwind table kind into the form used internally in + /// Attribute. 
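A minimal sketch of the two builder hooks whose declarations follow below, assuming an invented AttrBuilder B and Function *F; the bitwise composition relies on the LLVM_MARK_AS_BITMASK_ENUM marker given for AllocFnKind earlier in this hunk:

    // AttrBuilder B(Ctx);
    // B.addUWTableAttr(UWTableKind::Default);
    // B.addAllocKindAttr(AllocFnKind::Alloc | AllocFnKind::Uninitialized);
    // F->addFnAttrs(B); // mark F as a malloc-like allocator with unwind info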
+ AttrBuilder &addUWTableAttr(UWTableKind Kind); + + // This turns the allocator kind into the form used internally in Attribute. + AttrBuilder &addAllocKindAttr(AllocFnKind Kind); + ArrayRef attrs() const { return Attrs; } bool operator==(const AttrBuilder &B) const; @@ -1198,8 +1233,17 @@ public: namespace AttributeFuncs { -/// Which attributes cannot be applied to a type. -AttributeMask typeIncompatible(Type *Ty); +enum AttributeSafetyKind : uint8_t { + ASK_SAFE_TO_DROP = 1, + ASK_UNSAFE_TO_DROP = 2, + ASK_ALL = ASK_SAFE_TO_DROP | ASK_UNSAFE_TO_DROP, +}; + +/// Which attributes cannot be applied to a type. The argument \p ASK indicates, +/// if only attributes that are known to be safely droppable are contained in +/// the mask; only attributes that might be unsafe to drop (e.g., ABI-related +/// attributes) are in the mask; or both. +AttributeMask typeIncompatible(Type *Ty, AttributeSafetyKind ASK = ASK_ALL); /// Get param/return attributes which imply immediate undefined behavior if an /// invalid value is passed. For example, this includes noundef (where undef @@ -1230,6 +1274,9 @@ void mergeAttributesForInlining(Function &Caller, const Function &Callee); /// \param [in] ToMerge - The function to merge attributes from. void mergeAttributesForOutlining(Function &Base, const Function &ToMerge); +/// Update min-legal-vector-width if it is in Attribute and less than Width. +void updateMinLegalVectorWidthAttr(Function &Fn, uint64_t Width); + } // end namespace AttributeFuncs } // end namespace llvm diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 40c554c269ca..7b955b40b0a8 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -47,6 +47,16 @@ class StrBoolAttr : Attr; /// 0 means unaligned (different from align(1)). def Alignment : IntAttr<"align", [ParamAttr, RetAttr]>; +/// Parameter of a function that tells us the alignment of an allocation, as in +/// aligned_alloc and aligned ::operator::new. +def AllocAlign: EnumAttr<"allocalign", [ParamAttr]>; + +/// Describes behavior of an allocator function in terms of known properties. +def AllocKind: IntAttr<"allockind", [FnAttr]>; + +/// Parameter is the pointer to be manipulated by the allocator function. +def AllocatedPointer : EnumAttr<"allocptr", [ParamAttr]>; + /// The result of the function is guaranteed to point to a number of bytes that /// we can determine if we know the value of the function's arguments. def AllocSize : IntAttr<"allocsize", [FnAttr]>; @@ -175,6 +185,9 @@ def NoProfile : EnumAttr<"noprofile", [FnAttr]>; /// Function doesn't unwind stack. def NoUnwind : EnumAttr<"nounwind", [FnAttr]>; +/// No SanitizeBounds instrumentation. +def NoSanitizeBounds : EnumAttr<"nosanitize_bounds", [FnAttr]>; + /// No SanitizeCoverage instrumentation. def NoSanitizeCoverage : EnumAttr<"nosanitize_coverage", [FnAttr]>; @@ -273,7 +286,7 @@ def SwiftSelf : EnumAttr<"swiftself", [ParamAttr]>; def SwiftAsync : EnumAttr<"swiftasync", [ParamAttr]>; /// Function must be in a unwind table. -def UWTable : EnumAttr<"uwtable", [FnAttr]>; +def UWTable : IntAttr<"uwtable", [FnAttr]>; /// Minimum/Maximum vscale value for function. def VScaleRange : IntAttr<"vscale_range", [FnAttr]>; @@ -290,10 +303,14 @@ def ZExt : EnumAttr<"zeroext", [ParamAttr, RetAttr]>; /// Function is required to make Forward Progress. def MustProgress : EnumAttr<"mustprogress", [FnAttr]>; +/// Function is a presplit coroutine. 
+def PresplitCoroutine : EnumAttr<"presplitcoroutine", [FnAttr]>; + /// Target-independent string attributes. def LessPreciseFPMAD : StrBoolAttr<"less-precise-fpmad">; def NoInfsFPMath : StrBoolAttr<"no-infs-fp-math">; def NoNansFPMath : StrBoolAttr<"no-nans-fp-math">; +def ApproxFuncFPMath : StrBoolAttr<"approx-func-fp-math">; def NoSignedZerosFPMath : StrBoolAttr<"no-signed-zeros-fp-math">; def UnsafeFPMath : StrBoolAttr<"unsafe-fp-math">; def NoJumpTables : StrBoolAttr<"no-jump-tables">; @@ -333,6 +350,7 @@ class MergeRule { def : MergeRule<"setAND">; def : MergeRule<"setAND">; def : MergeRule<"setAND">; +def : MergeRule<"setAND">; def : MergeRule<"setAND">; def : MergeRule<"setAND">; def : MergeRule<"setOR">; @@ -345,6 +363,3 @@ def : MergeRule<"adjustCallerStackProbeSize">; def : MergeRule<"adjustMinLegalVectorWidth">; def : MergeRule<"adjustNullPointerValidAttr">; def : MergeRule<"setAND">; - -// Target dependent attributes -include "llvm/IR/AttributesAMDGPU.td" diff --git a/llvm/include/llvm/IR/AttributesAMDGPU.td b/llvm/include/llvm/IR/AttributesAMDGPU.td deleted file mode 100644 index e2a0f045b656..000000000000 --- a/llvm/include/llvm/IR/AttributesAMDGPU.td +++ /dev/null @@ -1,14 +0,0 @@ -//===- AttributesAMDGPU.td - Defines AMDGPU attributes -----*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines AMDGPU specific attributes. -// -//===----------------------------------------------------------------------===// - -def AMDGPUUnsafeFPAtomics : StrBoolAttr<"amdgpu-unsafe-fp-atomics">; -def : MergeRule<"setAND">; diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index f331fc3c413f..12952f25cbda 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -14,19 +14,24 @@ #define LLVM_IR_AUTOUPGRADE_H #include "llvm/ADT/StringRef.h" +#include namespace llvm { class AttrBuilder; - class CallInst; + class CallBase; class Constant; class Function; class Instruction; + class GlobalVariable; class MDNode; class Module; - class GlobalVariable; + class StringRef; class Type; class Value; + template class OperandBundleDefT; + using OperandBundleDef = OperandBundleDefT; + /// This is a more granular function that simply checks an intrinsic function /// for upgrading, and returns true if it requires upgrading. It may return /// null in NewFn if the all calls to the original intrinsic function @@ -35,7 +40,7 @@ namespace llvm { /// This is the complement to the above, replacing a specific call to an /// intrinsic function with a call to the specified new function. - void UpgradeIntrinsicCall(CallInst *CI, Function *NewFn); + void UpgradeIntrinsicCall(CallBase *CB, Function *NewFn); // This upgrades the comment for objc retain release markers in inline asm // calls @@ -77,7 +82,7 @@ namespace llvm { /// This is an auto-upgrade for bitcast constant expression between pointers /// with different address spaces: the instruction is replaced by a pair /// ptrtoint+inttoptr. - Value *UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy); + Constant *UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy); /// Check the debug info version number, if it is out-dated, drop the debug /// info. Return true if module is modified. 
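The UpgradeIntrinsicCall() change above widens the call-site type from CallInst to CallBase, so invoked intrinsics can be upgraded too. A driver sketch of the header's two-step protocol, assuming UpgradeIntrinsicFunction() (the checker the doc comment above refers to, declared elsewhere in this header) and an invented Function *F:

    // Function *NewFn;
    // if (UpgradeIntrinsicFunction(F, NewFn))
    //   for (User *U : make_early_inc_range(F->users()))
    //     if (auto *CB = dyn_cast<CallBase>(U))
    //       UpgradeIntrinsicCall(CB, NewFn);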
@@ -98,6 +103,9 @@ namespace llvm { /// Upgrade attributes that changed format or kind. void UpgradeAttributes(AttrBuilder &B); + /// Upgrade operand bundles (without knowing about their user instruction). + void UpgradeOperandBundles(std::vector &OperandBundles); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 184ddfc01c29..d487223eca02 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -22,9 +22,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/SymbolTableListTraits.h" #include "llvm/IR/Value.h" -#include "llvm/Support/CBindingWrapping.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include #include #include @@ -119,7 +116,11 @@ public: /// Returns the terminator instruction if the block is well formed or null /// if the block is not well formed. - const Instruction *getTerminator() const LLVM_READONLY; + const Instruction *getTerminator() const LLVM_READONLY { + if (InstList.empty() || !InstList.back().isTerminator()) + return nullptr; + return &InstList.back(); + } Instruction *getTerminator() { return const_cast( static_cast(this)->getTerminator()); diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h index 0ee584f8af7e..28a8d31a4cc6 100644 --- a/llvm/include/llvm/IR/CFG.h +++ b/llvm/include/llvm/IR/CFG.h @@ -25,7 +25,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" #include #include #include diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h new file mode 100644 index 000000000000..d637a180b0ba --- /dev/null +++ b/llvm/include/llvm/IR/ConstantFold.h @@ -0,0 +1,60 @@ +//==-- ConstantFold.h - DL-independent Constant Folding Interface -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the DataLayout-independent constant folding interface. +// When possible, the DataLayout-aware constant folding interface in +// Analysis/ConstantFolding.h should be preferred. +// +// These interfaces are used by the ConstantExpr::get* methods to automatically +// fold constants when possible. +// +// These operators may return a null object if they don't know how to perform +// the specified operation on the specified constant types. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_CONSTANTFOLD_H +#define LLVM_IR_CONSTANTFOLD_H + +#include "llvm/ADT/Optional.h" +#include "llvm/IR/InstrTypes.h" + +namespace llvm { + template class ArrayRef; + class Value; + class Constant; + class Type; + + // Constant fold various types of instruction... 
+ Constant *ConstantFoldCastInstruction( + unsigned opcode, ///< The opcode of the cast + Constant *V, ///< The source constant + Type *DestTy ///< The destination type + ); + Constant *ConstantFoldSelectInstruction(Constant *Cond, + Constant *V1, Constant *V2); + Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx); + Constant *ConstantFoldInsertElementInstruction(Constant *Val, Constant *Elt, + Constant *Idx); + Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, + ArrayRef Mask); + Constant *ConstantFoldExtractValueInstruction(Constant *Agg, + ArrayRef Idxs); + Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, + ArrayRef Idxs); + Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); + Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1, + Constant *V2); + Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, + Constant *C1, Constant *C2); + Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool InBounds, + Optional InRangeIndex, + ArrayRef Idxs); +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/IR/ConstantFolder.h b/llvm/include/llvm/IR/ConstantFolder.h index 28dc63a5886e..5e7ddb9aa673 100644 --- a/llvm/include/llvm/IR/ConstantFolder.h +++ b/llvm/include/llvm/IR/ConstantFolder.h @@ -19,9 +19,10 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/IR/IRBuilderFolder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Operator.h" namespace llvm { @@ -38,31 +39,46 @@ public: // Return an existing value or a constant if the operation can be simplified. // Otherwise return nullptr. //===--------------------------------------------------------------------===// - Value *FoldAdd(Value *LHS, Value *RHS, bool HasNUW = false, - bool HasNSW = false) const override { + + Value *FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); if (LC && RC) - return ConstantExpr::getAdd(LC, RC, HasNUW, HasNSW); + return ConstantExpr::get(Opc, LC, RC); return nullptr; } - Value *FoldAnd(Value *LHS, Value *RHS) const override { + Value *FoldExactBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool IsExact) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); if (LC && RC) - return ConstantExpr::getAnd(LC, RC); + return ConstantExpr::get(Opc, LC, RC, + IsExact ? 
PossiblyExactOperator::IsExact : 0); return nullptr; } - Value *FoldOr(Value *LHS, Value *RHS) const override { + Value *FoldNoWrapBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool HasNUW, bool HasNSW) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); - if (LC && RC) - return ConstantExpr::getOr(LC, RC); + if (LC && RC) { + unsigned Flags = 0; + if (HasNUW) + Flags |= OverflowingBinaryOperator::NoUnsignedWrap; + if (HasNSW) + Flags |= OverflowingBinaryOperator::NoSignedWrap; + return ConstantExpr::get(Opc, LC, RC, Flags); + } return nullptr; } + Value *FoldBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + FastMathFlags FMF) const override { + return FoldBinOp(Opc, LHS, RHS); + } + Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override { auto *LC = dyn_cast(LHS); auto *RC = dyn_cast(RHS); @@ -95,103 +111,57 @@ public: return nullptr; } - //===--------------------------------------------------------------------===// - // Binary Operators - //===--------------------------------------------------------------------===// - - Constant *CreateFAdd(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getFAdd(LHS, RHS); - } - - Constant *CreateSub(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return ConstantExpr::getSub(LHS, RHS, HasNUW, HasNSW); - } - - Constant *CreateFSub(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getFSub(LHS, RHS); - } - - Constant *CreateMul(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return ConstantExpr::getMul(LHS, RHS, HasNUW, HasNSW); - } - - Constant *CreateFMul(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getFMul(LHS, RHS); - } - - Constant *CreateUDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstantExpr::getUDiv(LHS, RHS, isExact); - } - - Constant *CreateSDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstantExpr::getSDiv(LHS, RHS, isExact); - } - - Constant *CreateFDiv(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getFDiv(LHS, RHS); - } - - Constant *CreateURem(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getURem(LHS, RHS); - } - - Constant *CreateSRem(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getSRem(LHS, RHS); - } - - Constant *CreateFRem(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getFRem(LHS, RHS); - } - - Constant *CreateShl(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const override { - return ConstantExpr::getShl(LHS, RHS, HasNUW, HasNSW); - } - - Constant *CreateLShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstantExpr::getLShr(LHS, RHS, isExact); - } - - Constant *CreateAShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - return ConstantExpr::getAShr(LHS, RHS, isExact); + Value *FoldExtractValue(Value *Agg, + ArrayRef IdxList) const override { + if (auto *CAgg = dyn_cast(Agg)) + return ConstantFoldExtractValueInstruction(CAgg, IdxList); + return nullptr; + }; + + Value *FoldInsertValue(Value *Agg, Value *Val, + ArrayRef IdxList) const override { + auto *CAgg = dyn_cast(Agg); + auto *CVal = dyn_cast(Val); + if (CAgg && CVal) + return ConstantFoldInsertValueInstruction(CAgg, CVal, IdxList); + return nullptr; } - Constant *CreateOr(Constant *LHS, Constant *RHS) 
const { - return ConstantExpr::getOr(LHS, RHS); + Value *FoldExtractElement(Value *Vec, Value *Idx) const override { + auto *CVec = dyn_cast(Vec); + auto *CIdx = dyn_cast(Idx); + if (CVec && CIdx) + return ConstantExpr::getExtractElement(CVec, CIdx); + return nullptr; } - Constant *CreateXor(Constant *LHS, Constant *RHS) const override { - return ConstantExpr::getXor(LHS, RHS); + Value *FoldInsertElement(Value *Vec, Value *NewElt, + Value *Idx) const override { + auto *CVec = dyn_cast(Vec); + auto *CNewElt = dyn_cast(NewElt); + auto *CIdx = dyn_cast(Idx); + if (CVec && CNewElt && CIdx) + return ConstantExpr::getInsertElement(CVec, CNewElt, CIdx); + return nullptr; } - Constant *CreateBinOp(Instruction::BinaryOps Opc, - Constant *LHS, Constant *RHS) const override { - return ConstantExpr::get(Opc, LHS, RHS); + Value *FoldShuffleVector(Value *V1, Value *V2, + ArrayRef Mask) const override { + auto *C1 = dyn_cast(V1); + auto *C2 = dyn_cast(V2); + if (C1 && C2) + return ConstantExpr::getShuffleVector(C1, C2, Mask); + return nullptr; } //===--------------------------------------------------------------------===// // Unary Operators //===--------------------------------------------------------------------===// - Constant *CreateNeg(Constant *C, - bool HasNUW = false, bool HasNSW = false) const override { - return ConstantExpr::getNeg(C, HasNUW, HasNSW); - } - Constant *CreateFNeg(Constant *C) const override { return ConstantExpr::getFNeg(C); } - Constant *CreateNot(Constant *C) const override { - return ConstantExpr::getNot(C); - } - Constant *CreateUnOp(Instruction::UnaryOps Opc, Constant *C) const override { return ConstantExpr::get(Opc, C); } @@ -255,34 +225,6 @@ public: Constant *RHS) const override { return ConstantExpr::getCompare(P, LHS, RHS); } - - //===--------------------------------------------------------------------===// - // Other Instructions - //===--------------------------------------------------------------------===// - - Constant *CreateExtractElement(Constant *Vec, Constant *Idx) const override { - return ConstantExpr::getExtractElement(Vec, Idx); - } - - Constant *CreateInsertElement(Constant *Vec, Constant *NewElt, - Constant *Idx) const override { - return ConstantExpr::getInsertElement(Vec, NewElt, Idx); - } - - Constant *CreateShuffleVector(Constant *V1, Constant *V2, - ArrayRef Mask) const override { - return ConstantExpr::getShuffleVector(V1, V2, Mask); - } - - Constant *CreateExtractValue(Constant *Agg, - ArrayRef IdxList) const override { - return ConstantExpr::getExtractValue(Agg, IdxList); - } - - Constant *CreateInsertValue(Constant *Agg, Constant *Val, - ArrayRef IdxList) const override { - return ConstantExpr::getInsertValue(Agg, Val, IdxList); - } }; } // end namespace llvm diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h index fea4d0da1d0d..68abf4ef555d 100644 --- a/llvm/include/llvm/IR/ConstantRange.h +++ b/llvm/include/llvm/IR/ConstantRange.h @@ -553,6 +553,9 @@ public: /// Return whether unsigned mul of the two ranges always/never overflows. OverflowResult unsignedMulMayOverflow(const ConstantRange &Other) const; + /// Return known bits for values in this range. + KnownBits toKnownBits() const; + /// Print out the bounds to a stream. 
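Two small sketches for the folder and range additions above, with invented LHS, RHS, and bounds; ConstantFolder now funnels binary operators through one opcode-based hook, and toKnownBits() conservatively summarizes a range:

    // ConstantFolder Folder;
    // Value *Sum = Folder.FoldBinOp(Instruction::Add, LHS, RHS);
    // // Sum is nullptr unless both operands are Constants.

    // ConstantRange CR(APInt(8, 16), APInt(8, 32)); // values 16..31: 0001xxxx
    // KnownBits KB = CR.toKnownBits();              // top four bits are known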
void print(raw_ostream &OS) const; diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index fb884912b318..b5445ff71b74 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -289,7 +289,8 @@ public: APInt *Payload = nullptr); static Constant *getSNaN(Type *Ty, bool Negative = false, APInt *Payload = nullptr); - static Constant *getNegativeZero(Type *Ty); + static Constant *getZero(Type *Ty, bool Negative = false); + static Constant *getNegativeZero(Type *Ty) { return getZero(Ty, true); } static Constant *getInfinity(Type *Ty, bool Negative = false); /// Return true if Ty is big enough to represent V. @@ -1120,9 +1121,12 @@ public: /// commutative, callers can acquire the operand 1 identity constant by /// setting AllowRHSConstant to true. For example, any shift has a zero /// identity constant for operand 1: X shift 0 = X. + /// If this is a fadd/fsub operation and we don't care about signed zeros, + /// then setting NSZ to true returns the identity +0.0 instead of -0.0. /// Return nullptr if the operator does not have an identity constant. static Constant *getBinOpIdentity(unsigned Opcode, Type *Ty, - bool AllowRHSConstant = false); + bool AllowRHSConstant = false, + bool NSZ = false); /// Return the absorbing element for the given binary /// operation, i.e. a constant C such that X op C = C and C op X = C for @@ -1160,6 +1164,11 @@ public: Type *Ty ///< The type to trunc or bitcast C to ); + /// Create either an sext, trunc or nothing, depending on whether Ty is + /// wider, narrower or the same as C->getType(). This only works with + /// integer or vector of integer types. + static Constant *getSExtOrTrunc(Constant *C, Type *Ty); + /// Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant /// expression. static Constant * @@ -1285,8 +1294,6 @@ public: static Constant *getShuffleVector(Constant *V1, Constant *V2, ArrayRef Mask, Type *OnlyIfReducedTy = nullptr); - static Constant *getExtractValue(Constant *Agg, ArrayRef Idxs, - Type *OnlyIfReducedTy = nullptr); static Constant *getInsertValue(Constant *Agg, Constant *Val, ArrayRef Idxs, Type *OnlyIfReducedTy = nullptr); diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index fc461fc3f49f..9afa715b650c 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -21,6 +21,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/TrackingMDRef.h" #include "llvm/Support/Casting.h" @@ -220,6 +221,23 @@ namespace llvm { /// \param SizeInBits Size of the type. DIStringType *createStringType(StringRef Name, uint64_t SizeInBits); + /// Create debugging information entry for Fortran + /// assumed length string type. + /// \param Name Type name. + /// \param StringLength String length expressed as DIVariable *. + /// \param StrLocationExp Optional memory location of the string. + DIStringType *createStringType(StringRef Name, DIVariable *StringLength, + DIExpression *StrLocationExp = nullptr); + + /// Create debugging information entry for Fortran + /// assumed length string type. + /// \param Name Type name. + /// \param StringLengthExp String length expressed in DIExpression form. + /// \param StrLocationExp Optional memory location of the string. 
+ DIStringType *createStringType(StringRef Name, + DIExpression *StringLengthExp, + DIExpression *StrLocationExp = nullptr); + /// Create debugging information entry for a qualified /// type, e.g. 'const int'. /// \param Tag Tag identifing type, e.g. dwarf::TAG_volatile_type @@ -734,6 +752,8 @@ namespace llvm { /// \param TParams Function template parameters. /// \param ThrownTypes Exception types this function may throw. /// \param Annotations Attribute Annotations. + /// \param TargetFuncName The name of the target function if this is + /// a trampoline. DISubprogram * createFunction(DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File, unsigned LineNo, DISubroutineType *Ty, @@ -742,7 +762,8 @@ namespace llvm { DITemplateParameterArray TParams = nullptr, DISubprogram *Decl = nullptr, DITypeArray ThrownTypes = nullptr, - DINodeArray Annotations = nullptr); + DINodeArray Annotations = nullptr, + StringRef TargetFuncName = ""); /// Identical to createFunction, /// except that the resulting DbgNode is meant to be RAUWed. diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 36438fc4f4e0..a6621c963d85 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -26,10 +26,10 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/Alignment.h" #include "llvm/Support/TrailingObjects.h" #include "llvm/Support/TypeSize.h" #include diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 96569179060f..db1d031a062d 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -22,7 +22,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" @@ -61,6 +60,10 @@ namespace llvm { +namespace dwarf { +enum Tag : uint16_t; +} + extern cl::opt EnableFSDiscriminator; class DITypeRefArray { @@ -156,7 +159,7 @@ protected: void setTag(unsigned Tag) { SubclassData16 = Tag; } public: - dwarf::Tag getTag() const { return (dwarf::Tag)SubclassData16; } + dwarf::Tag getTag() const; /// Debug info flags. /// @@ -267,7 +270,7 @@ public: /// Return a (temporary) clone of this. 
TempGenericDINode clone() const { return cloneImpl(); } - dwarf::Tag getTag() const { return (dwarf::Tag)SubclassData16; } + dwarf::Tag getTag() const; StringRef getHeader() const { return getStringOperand(0); } MDString *getRawHeader() const { return getOperandAs(0); } @@ -298,8 +301,7 @@ class DISubrange : public DINode { friend class LLVMContextImpl; friend class MDNode; - DISubrange(LLVMContext &C, StorageType Storage, ArrayRef Ops) - : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, Ops) {} + DISubrange(LLVMContext &C, StorageType Storage, ArrayRef Ops); ~DISubrange() = default; @@ -363,9 +365,7 @@ class DIGenericSubrange : public DINode { friend class MDNode; DIGenericSubrange(LLVMContext &C, StorageType Storage, - ArrayRef Ops) - : DINode(C, DIGenericSubrangeKind, Storage, - dwarf::DW_TAG_generic_subrange, Ops) {} + ArrayRef Ops); ~DIGenericSubrange() = default; @@ -414,11 +414,7 @@ class DIEnumerator : public DINode { APInt Value; DIEnumerator(LLVMContext &C, StorageType Storage, const APInt &Value, - bool IsUnsigned, ArrayRef Ops) - : DINode(C, DIEnumeratorKind, Storage, dwarf::DW_TAG_enumerator, Ops), - Value(Value) { - SubclassData32 = IsUnsigned; - } + bool IsUnsigned, ArrayRef Ops); DIEnumerator(LLVMContext &C, StorageType Storage, int64_t Value, bool IsUnsigned, ArrayRef Ops) : DIEnumerator(C, Storage, APInt(64, Value, !IsUnsigned), IsUnsigned, @@ -568,9 +564,7 @@ private: DIFile(LLVMContext &C, StorageType Storage, Optional> CS, Optional Src, - ArrayRef Ops) - : DIScope(C, DIFileKind, Storage, dwarf::DW_TAG_file_type, Ops), - Checksum(CS), Source(Src) {} + ArrayRef Ops); ~DIFile() = default; static DIFile *getImpl(LLVMContext &Context, StringRef Filename, @@ -1021,42 +1015,19 @@ public: /// Get casted version of extra data. 
/// @{ - DIType *getClassType() const { - assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); - return cast_or_null(getExtraData()); - } + DIType *getClassType() const; DIObjCProperty *getObjCProperty() const { return dyn_cast_or_null(getExtraData()); } - uint32_t getVBPtrOffset() const { - assert(getTag() == dwarf::DW_TAG_inheritance); - if (auto *CM = cast_or_null(getExtraData())) - if (auto *CI = dyn_cast_or_null(CM->getValue())) - return static_cast(CI->getZExtValue()); - return 0; - } + uint32_t getVBPtrOffset() const; - Constant *getStorageOffsetInBits() const { - assert(getTag() == dwarf::DW_TAG_member && isBitField()); - if (auto *C = cast_or_null(getExtraData())) - return C->getValue(); - return nullptr; - } + Constant *getStorageOffsetInBits() const; - Constant *getConstant() const { - assert(getTag() == dwarf::DW_TAG_member && isStaticMember()); - if (auto *C = cast_or_null(getExtraData())) - return C->getValue(); - return nullptr; - } - Constant *getDiscriminantValue() const { - assert(getTag() == dwarf::DW_TAG_member && !isStaticMember()); - if (auto *C = cast_or_null(getExtraData())) - return C->getValue(); - return nullptr; - } + Constant *getConstant() const; + + Constant *getDiscriminantValue() const; /// @} static bool classof(const Metadata *MD) { @@ -1300,10 +1271,7 @@ class DISubroutineType : public DIType { uint8_t CC; DISubroutineType(LLVMContext &C, StorageType Storage, DIFlags Flags, - uint8_t CC, ArrayRef Ops) - : DIType(C, DISubroutineTypeKind, Storage, dwarf::DW_TAG_subroutine_type, - 0, 0, 0, 0, Flags, Ops), - CC(CC) {} + uint8_t CC, ArrayRef Ops); ~DISubroutineType() = default; static DISubroutineType *getImpl(LLVMContext &Context, DIFlags Flags, @@ -1330,6 +1298,12 @@ public: (Flags, CC, TypeArray)) TempDISubroutineType clone() const { return cloneImpl(); } + // Returns a new temporary DISubroutineType with updated CC + TempDISubroutineType cloneWithCC(uint8_t CC) const { + auto NewTy = clone(); + NewTy->CC = CC; + return NewTy; + } uint8_t getCC() const { return CC; } @@ -1385,15 +1359,7 @@ private: bool IsOptimized, unsigned RuntimeVersion, unsigned EmissionKind, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, unsigned NameTableKind, - bool RangesBaseAddress, ArrayRef Ops) - : DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops), - SourceLanguage(SourceLanguage), IsOptimized(IsOptimized), - RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind), - DWOId(DWOId), SplitDebugInlining(SplitDebugInlining), - DebugInfoForProfiling(DebugInfoForProfiling), - NameTableKind(NameTableKind), RangesBaseAddress(RangesBaseAddress) { - assert(Storage != Uniqued); - } + bool RangesBaseAddress, ArrayRef Ops); ~DICompileUnit() = default; static DICompileUnit * @@ -1872,19 +1838,7 @@ public: static DISPFlags toSPFlags(bool IsLocalToUnit, bool IsDefinition, bool IsOptimized, unsigned Virtuality = SPFlagNonvirtual, - bool IsMainSubprogram = false) { - // We're assuming virtuality is the low-order field. - static_assert(int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) && - int(SPFlagPureVirtual) == - int(dwarf::DW_VIRTUALITY_pure_virtual), - "Virtuality constant mismatch"); - return static_cast( - (Virtuality & SPFlagVirtuality) | - (IsLocalToUnit ? SPFlagLocalToUnit : SPFlagZero) | - (IsDefinition ? SPFlagDefinition : SPFlagZero) | - (IsOptimized ? SPFlagOptimized : SPFlagZero) | - (IsMainSubprogram ? 
SPFlagMainSubprogram : SPFlagZero)); - } + bool IsMainSubprogram = false); private: DIFlags Flags; @@ -1892,13 +1846,7 @@ private: DISubprogram(LLVMContext &C, StorageType Storage, unsigned Line, unsigned ScopeLine, unsigned VirtualIndex, int ThisAdjustment, - DIFlags Flags, DISPFlags SPFlags, ArrayRef Ops) - : DILocalScope(C, DISubprogramKind, Storage, dwarf::DW_TAG_subprogram, - Ops), - Line(Line), ScopeLine(ScopeLine), VirtualIndex(VirtualIndex), - ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags) { - static_assert(dwarf::DW_VIRTUALITY_max < 4, "Virtuality out of range"); - } + DIFlags Flags, DISPFlags SPFlags, ArrayRef Ops); ~DISubprogram() = default; static DISubprogram * @@ -1909,13 +1857,14 @@ private: DISPFlags SPFlags, DICompileUnit *Unit, DITemplateParameterArray TemplateParams, DISubprogram *Declaration, DINodeArray RetainedNodes, DITypeArray ThrownTypes, - DINodeArray Annotations, StorageType Storage, - bool ShouldCreate = true) { + DINodeArray Annotations, StringRef TargetFuncName, + StorageType Storage, bool ShouldCreate = true) { return getImpl(Context, Scope, getCanonicalMDString(Context, Name), getCanonicalMDString(Context, LinkageName), File, Line, Type, ScopeLine, ContainingType, VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams.get(), Declaration, RetainedNodes.get(), ThrownTypes.get(), Annotations.get(), + getCanonicalMDString(Context, TargetFuncName), Storage, ShouldCreate); } static DISubprogram * @@ -1925,7 +1874,8 @@ private: int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, Metadata *ThrownTypes, Metadata *Annotations, - StorageType Storage, bool ShouldCreate = true); + MDString *TargetFuncName, StorageType Storage, + bool ShouldCreate = true); TempDISubprogram cloneImpl() const { return getTemporary(getContext(), getScope(), getName(), getLinkageName(), @@ -1933,7 +1883,8 @@ private: getContainingType(), getVirtualIndex(), getThisAdjustment(), getFlags(), getSPFlags(), getUnit(), getTemplateParams(), getDeclaration(), - getRetainedNodes(), getThrownTypes(), getAnnotations()); + getRetainedNodes(), getThrownTypes(), getAnnotations(), + getTargetFuncName()); } public: @@ -1945,10 +1896,11 @@ public: DIFlags Flags, DISPFlags SPFlags, DICompileUnit *Unit, DITemplateParameterArray TemplateParams = nullptr, DISubprogram *Declaration = nullptr, DINodeArray RetainedNodes = nullptr, - DITypeArray ThrownTypes = nullptr, DINodeArray Annotations = nullptr), + DITypeArray ThrownTypes = nullptr, DINodeArray Annotations = nullptr, + StringRef TargetFuncName = ""), (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType, VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams, - Declaration, RetainedNodes, ThrownTypes, Annotations)) + Declaration, RetainedNodes, ThrownTypes, Annotations, TargetFuncName)) DEFINE_MDNODE_GET( DISubprogram, @@ -1958,10 +1910,10 @@ public: DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, Metadata *TemplateParams = nullptr, Metadata *Declaration = nullptr, Metadata *RetainedNodes = nullptr, Metadata *ThrownTypes = nullptr, - Metadata *Annotations = nullptr), + Metadata *Annotations = nullptr, MDString *TargetFuncName = nullptr), (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType, VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams, - Declaration, RetainedNodes, ThrownTypes, Annotations)) + Declaration, RetainedNodes, ThrownTypes, Annotations, TargetFuncName)) 
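A sketch of the new DISubprogram hooks in this hunk, assuming an invented distinct DISubprogram *SP and assuming the usual MDNode::replaceWithUniqued() step to make the temporary returned by cloneWithCC() permanent:

    // if (SP->isDistinct()) {
    //   TempDISubroutineType NewTy =
    //       SP->getType()->cloneWithCC(dwarf::DW_CC_nocall);
    //   SP->replaceType(MDNode::replaceWithUniqued(std::move(NewTy)));
    // }
    // StringRef Target = SP->getTargetFuncName(); // "" unless a trampoline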
TempDISubprogram clone() const { return cloneImpl(); } @@ -2050,6 +2002,10 @@ public: DIType *getContainingType() const { return cast_or_null(getRawContainingType()); } + void replaceType(DISubroutineType *Ty) { + assert(isDistinct() && "Only distinct nodes can mutate"); + replaceOperandWith(4, Ty); + } DICompileUnit *getUnit() const { return cast_or_null(getRawUnit()); @@ -2070,6 +2026,9 @@ public: DINodeArray getAnnotations() const { return cast_or_null(getRawAnnotations()); } + StringRef getTargetFuncName() const { + return (getRawTargetFuncName()) ? getStringOperand(12) : StringRef(); + } Metadata *getRawScope() const { return getOperand(1); } MDString *getRawName() const { return getOperandAs(2); } @@ -2090,6 +2049,9 @@ public: Metadata *getRawAnnotations() const { return getNumOperands() > 11 ? getOperandAs(11) : nullptr; } + MDString *getRawTargetFuncName() const { + return getNumOperands() > 12 ? getOperandAs(12) : nullptr; + } void replaceRawLinkageName(MDString *LinkageName) { replaceOperandWith(3, LinkageName); @@ -2108,8 +2070,7 @@ public: class DILexicalBlockBase : public DILocalScope { protected: DILexicalBlockBase(LLVMContext &C, unsigned ID, StorageType Storage, - ArrayRef Ops) - : DILocalScope(C, ID, Storage, dwarf::DW_TAG_lexical_block, Ops) {} + ArrayRef Ops); ~DILexicalBlockBase() = default; public: @@ -2301,10 +2262,7 @@ class DINamespace : public DIScope { unsigned ExportSymbols : 1; DINamespace(LLVMContext &Context, StorageType Storage, bool ExportSymbols, - ArrayRef Ops) - : DIScope(Context, DINamespaceKind, Storage, dwarf::DW_TAG_namespace, - Ops), - ExportSymbols(ExportSymbols) {} + ArrayRef Ops); ~DINamespace() = default; static DINamespace *getImpl(LLVMContext &Context, DIScope *Scope, @@ -2353,9 +2311,7 @@ class DIModule : public DIScope { bool IsDecl; DIModule(LLVMContext &Context, StorageType Storage, unsigned LineNo, - bool IsDecl, ArrayRef Ops) - : DIScope(Context, DIModuleKind, Storage, dwarf::DW_TAG_module, Ops), - LineNo(LineNo), IsDecl(IsDecl) {} + bool IsDecl, ArrayRef Ops); ~DIModule() = default; static DIModule *getImpl(LLVMContext &Context, DIFile *File, DIScope *Scope, @@ -2449,10 +2405,7 @@ class DITemplateTypeParameter : public DITemplateParameter { friend class MDNode; DITemplateTypeParameter(LLVMContext &Context, StorageType Storage, - bool IsDefault, ArrayRef Ops) - : DITemplateParameter(Context, DITemplateTypeParameterKind, Storage, - dwarf::DW_TAG_template_type_parameter, IsDefault, - Ops) {} + bool IsDefault, ArrayRef Ops); ~DITemplateTypeParameter() = default; static DITemplateTypeParameter *getImpl(LLVMContext &Context, StringRef Name, @@ -2541,10 +2494,8 @@ class DIVariable : public DINode { uint32_t AlignInBits; protected: - DIVariable(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Line, - ArrayRef Ops, uint32_t AlignInBits = 0) - : DINode(C, ID, Storage, dwarf::DW_TAG_variable, Ops), Line(Line), - AlignInBits(AlignInBits) {} + DIVariable(LLVMContext &C, unsigned ID, StorageType Storage, signed Line, + ArrayRef Ops, uint32_t AlignInBits = 0); ~DIVariable() = default; public: @@ -2763,9 +2714,7 @@ public: } /// Return whether the first element a DW_OP_deref. - bool startsWithDeref() const { - return getNumElements() > 0 && getElement(0) == dwarf::DW_OP_deref; - } + bool startsWithDeref() const; /// Holds the characteristics of one fragment of a larger variable. struct FragmentInfo { @@ -2783,7 +2732,7 @@ public: } /// Return whether this is a piece of an aggregate variable. 
- bool isFragment() const { return getFragmentInfo().hasValue(); } + bool isFragment() const { return getFragmentInfo().has_value(); } /// Return whether this is an implicit location description. bool isImplicit() const; @@ -2923,10 +2872,7 @@ public: /// Check if the expression consists of exactly one entry value operand. /// (This is the only configuration of entry values that is supported.) - bool isEntryValue() const { - return getNumElements() > 0 && - getElement(0) == dwarf::DW_OP_LLVM_entry_value; - } + bool isEntryValue() const; /// Try to shorten an expression with an initial constant operand. /// Returns a new expression and constant on success, or the original @@ -3057,10 +3003,7 @@ class DICommonBlock : public DIScope { friend class MDNode; DICommonBlock(LLVMContext &Context, StorageType Storage, unsigned LineNo, - ArrayRef Ops) - : DIScope(Context, DICommonBlockKind, Storage, dwarf::DW_TAG_common_block, - Ops), - LineNo(LineNo) {} + ArrayRef Ops); static DICommonBlock *getImpl(LLVMContext &Context, DIScope *Scope, DIGlobalVariable *Decl, StringRef Name, @@ -3209,8 +3152,7 @@ class DILabel : public DINode { unsigned Line; DILabel(LLVMContext &C, StorageType Storage, unsigned Line, - ArrayRef Ops) - : DINode(C, DILabelKind, Storage, dwarf::DW_TAG_label, Ops), Line(Line) {} + ArrayRef Ops); ~DILabel() = default; static DILabel *getImpl(LLVMContext &Context, DIScope *Scope, StringRef Name, @@ -3276,10 +3218,7 @@ class DIObjCProperty : public DINode { unsigned Attributes; DIObjCProperty(LLVMContext &C, StorageType Storage, unsigned Line, - unsigned Attributes, ArrayRef Ops) - : DINode(C, DIObjCPropertyKind, Storage, dwarf::DW_TAG_APPLE_property, - Ops), - Line(Line), Attributes(Attributes) {} + unsigned Attributes, ArrayRef Ops); ~DIObjCProperty() = default; static DIObjCProperty * @@ -3705,7 +3644,7 @@ public: const DILocation *getInlinedAt() const { return InlinedAt; } FragmentInfo getFragmentOrDefault() const { - return Fragment.getValueOr(DefaultFragment); + return Fragment.value_or(DefaultFragment); } static bool isDefaultFragment(const FragmentInfo F) { diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index f52ce3cde318..f505fd3f3e32 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -659,7 +659,7 @@ public: } /// This constructs a pointer type with the same pointee type as input - /// PointerType (or opaque pointer is the input PointerType is opaque) and the + /// PointerType (or opaque pointer if the input PointerType is opaque) and the /// given address space. This is only useful during the opaque pointer /// transition. /// TODO: remove after opaque pointer transition is complete. @@ -670,13 +670,6 @@ public: return get(PT->PointeeTy, AddressSpace); } - [[deprecated("Pointer element types are deprecated. You can *temporarily* " - "use Type::getPointerElementType() instead")]] - Type *getElementType() const { - assert(!isOpaque() && "Attempting to get element type of opaque pointer"); - return PointeeTy; - } - bool isOpaque() const { return !PointeeTy; } /// Return true if the specified type is valid as a element type. 
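With getElementType() removed, code on the opaque-pointer transition path cannot assume a pointee type; the OpDescriptor.h hunk earlier in this patch shows the resulting pattern, repeated here as the canonical shape (V is an invented Value *):

    // if (const auto *PtrT = dyn_cast<PointerType>(V->getType()))
    //   return PtrT->isOpaque() ||
    //          PtrT->getNonOpaquePointerElementType()->isSized();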
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index 1ea1d9787d61..da37801b6d19 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -85,6 +85,7 @@ enum DiagnosticKind { DK_Unsupported, DK_SrcMgr, DK_DontCall, + DK_MisExpect, DK_FirstPluginKind // Must be last value to work with // getNextAvailablePluginDiagnosticKind }; @@ -1032,6 +1033,25 @@ public: void print(DiagnosticPrinter &DP) const override; }; +/// Diagnostic information for MisExpect analysis. +class DiagnosticInfoMisExpect : public DiagnosticInfoWithLocationBase { +public: + DiagnosticInfoMisExpect(const Instruction *Inst, Twine &Msg); + + /// \see DiagnosticInfo::print. + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_MisExpect; + } + + const Twine &getMsg() const { return Msg; } + +private: + /// Message to report. + const Twine &Msg; +}; + static DiagnosticSeverity getDiagnosticSeverity(SourceMgr::DiagKind DK) { switch (DK) { case llvm::SourceMgr::DK_Error: diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index d13a5856df3b..a381c075d77b 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -14,6 +14,7 @@ #ifndef LLVM_IR_DOMINATORS_H #define LLVM_IR_DOMINATORS_H +#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" @@ -22,6 +23,8 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/PassManager.h" @@ -31,6 +34,7 @@ #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/GenericDomTreeConstruction.h" +#include #include #include diff --git a/llvm/include/llvm/IR/FMF.h b/llvm/include/llvm/IR/FMF.h new file mode 100644 index 000000000000..a49feb5a8946 --- /dev/null +++ b/llvm/include/llvm/IR/FMF.h @@ -0,0 +1,121 @@ +//===-- llvm/FMF.h - Fast math flags subclass -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the fast math flags. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_FMF_H +#define LLVM_IR_FMF_H + +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +/// Convenience struct for specifying and reasoning about fast-math flags. +class FastMathFlags { +private: + friend class FPMathOperator; + + unsigned Flags = 0; + + FastMathFlags(unsigned F) { + // If all 7 bits are set, turn this into -1. If the number of bits grows, + // this must be updated. This is intended to provide some forward binary + // compatibility insurance for the meaning of 'fast' in case bits are added. + if (F == 0x7F) Flags = ~0U; + else Flags = F; + } + +public: + // This is how the bits are used in Value::SubclassOptionalData so they + // should fit there too. + // WARNING: We're out of space. SubclassOptionalData only has 7 bits. New + // functionality will require a change in how this information is stored. 
+ enum { + AllowReassoc = (1 << 0), + NoNaNs = (1 << 1), + NoInfs = (1 << 2), + NoSignedZeros = (1 << 3), + AllowReciprocal = (1 << 4), + AllowContract = (1 << 5), + ApproxFunc = (1 << 6) + }; + + FastMathFlags() = default; + + static FastMathFlags getFast() { + FastMathFlags FMF; + FMF.setFast(); + return FMF; + } + + bool any() const { return Flags != 0; } + bool none() const { return Flags == 0; } + bool all() const { return Flags == ~0U; } + + void clear() { Flags = 0; } + void set() { Flags = ~0U; } + + /// Flag queries + bool allowReassoc() const { return 0 != (Flags & AllowReassoc); } + bool noNaNs() const { return 0 != (Flags & NoNaNs); } + bool noInfs() const { return 0 != (Flags & NoInfs); } + bool noSignedZeros() const { return 0 != (Flags & NoSignedZeros); } + bool allowReciprocal() const { return 0 != (Flags & AllowReciprocal); } + bool allowContract() const { return 0 != (Flags & AllowContract); } + bool approxFunc() const { return 0 != (Flags & ApproxFunc); } + /// 'Fast' means all bits are set. + bool isFast() const { return all(); } + + /// Flag setters + void setAllowReassoc(bool B = true) { + Flags = (Flags & ~AllowReassoc) | B * AllowReassoc; + } + void setNoNaNs(bool B = true) { + Flags = (Flags & ~NoNaNs) | B * NoNaNs; + } + void setNoInfs(bool B = true) { + Flags = (Flags & ~NoInfs) | B * NoInfs; + } + void setNoSignedZeros(bool B = true) { + Flags = (Flags & ~NoSignedZeros) | B * NoSignedZeros; + } + void setAllowReciprocal(bool B = true) { + Flags = (Flags & ~AllowReciprocal) | B * AllowReciprocal; + } + void setAllowContract(bool B = true) { + Flags = (Flags & ~AllowContract) | B * AllowContract; + } + void setApproxFunc(bool B = true) { + Flags = (Flags & ~ApproxFunc) | B * ApproxFunc; + } + void setFast(bool B = true) { B ? set() : clear(); } + + void operator&=(const FastMathFlags &OtherFlags) { + Flags &= OtherFlags.Flags; + } + void operator|=(const FastMathFlags &OtherFlags) { + Flags |= OtherFlags.Flags; + } + bool operator!=(const FastMathFlags &OtherFlags) const { + return Flags != OtherFlags.Flags; + } + + /// Print fast-math flags to \p O. + void print(raw_ostream &O) const; +}; + +inline raw_ostream &operator<<(raw_ostream &O, FastMathFlags FMF) { + FMF.print(O); + return O; +} + +} // end namespace llvm + +#endif // LLVM_IR_FMF_H diff --git a/llvm/include/llvm/IR/FPEnv.h b/llvm/include/llvm/IR/FPEnv.h index bf435ec6d109..e598db224211 100644 --- a/llvm/include/llvm/IR/FPEnv.h +++ b/llvm/include/llvm/IR/FPEnv.h @@ -17,10 +17,17 @@ #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/Optional.h" +#include "llvm/IR/FMF.h" namespace llvm { class StringRef; +namespace Intrinsic { +typedef unsigned ID; +} + +class Instruction; + namespace fp { /// Exception behavior used for floating point operations. @@ -59,10 +66,22 @@ inline bool isDefaultFPEnvironment(fp::ExceptionBehavior EB, RoundingMode RM) { return EB == fp::ebIgnore && RM == RoundingMode::NearestTiesToEven; } +/// Returns constrained intrinsic id to represent the given instruction in +/// strictfp function. If the instruction is already a constrained intrinsic or +/// does not have a constrained intrinsic counterpart, the function returns +/// zero. +Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr); + /// Returns true if the rounding mode RM may be QRM at compile time or /// at run time. 
inline bool canRoundingModeBe(RoundingMode RM, RoundingMode QRM) { return RM == QRM || RM == RoundingMode::Dynamic; } + +/// Returns true if the possibility of a signaling NaN can be safely +/// ignored. +inline bool canIgnoreSNaN(fp::ExceptionBehavior EB, FastMathFlags FMF) { + return (EB == fp::ebIgnore || FMF.noNaNs()); +} } #endif diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def index 31979cd2f9db..7c32c5d13760 100644 --- a/llvm/include/llvm/IR/FixedMetadataKinds.def +++ b/llvm/include/llvm/IR/FixedMetadataKinds.def @@ -42,3 +42,5 @@ LLVM_FIXED_MD_KIND(MD_preserve_access_index, "llvm.preserve.access.index", 27) LLVM_FIXED_MD_KIND(MD_vcall_visibility, "vcall_visibility", 28) LLVM_FIXED_MD_KIND(MD_noundef, "noundef", 29) LLVM_FIXED_MD_KIND(MD_annotation, "annotation", 30) +LLVM_FIXED_MD_KIND(MD_nosanitize, "nosanitize", 31) +LLVM_FIXED_MD_KIND(MD_func_sanitize, "func_sanitize", 32) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 90095cd1bc77..7945c64c8610 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -32,8 +32,6 @@ #include "llvm/IR/OperandTraits.h" #include "llvm/IR/SymbolTableListTraits.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include #include #include @@ -290,7 +288,7 @@ public: /// profile annotations. If IncludeSynthetic is false, only return true /// when the profile data is real. bool hasProfileData(bool IncludeSynthetic = false) const { - return getEntryCount(IncludeSynthetic).hasValue(); + return getEntryCount(IncludeSynthetic).has_value(); } /// Returns the set of GUIDs that needs to be imported to the function for @@ -486,11 +484,12 @@ public: return AttributeSets.getParamDereferenceableOrNullBytes(ArgNo); } - /// A function will have the "coroutine.presplit" attribute if it's - /// a coroutine and has not gone through full CoroSplit pass. + /// Determine if the function is presplit coroutine. bool isPresplitCoroutine() const { - return hasFnAttribute("coroutine.presplit"); + return hasFnAttribute(Attribute::PresplitCoroutine); } + void setPresplitCoroutine() { addFnAttr(Attribute::PresplitCoroutine); } + void setSplittedCoroutine() { removeFnAttr(Attribute::PresplitCoroutine); } /// Determine if the function does not access memory. bool doesNotAccessMemory() const { @@ -623,15 +622,19 @@ public: bool willReturn() const { return hasFnAttribute(Attribute::WillReturn); } void setWillReturn() { addFnAttr(Attribute::WillReturn); } + /// Get what kind of unwind table entry to generate for this function. + UWTableKind getUWTableKind() const { + return AttributeSets.getUWTableKind(); + } + /// True if the ABI mandates (or the user requested) that this /// function be in a unwind table. bool hasUWTable() const { - return hasFnAttribute(Attribute::UWTable); + return getUWTableKind() != UWTableKind::None; } - void setHasUWTable() { - addFnAttr(Attribute::UWTable); + void setUWTableKind(UWTableKind K) { + addFnAttr(Attribute::getWithUWTableKind(getContext(), K)); } - /// True if this function needs an unwind table. 
bool needsUnwindTableEntry() const { return hasUWTable() || !doesNotThrow() || hasPersonalityFn(); diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h index 4fa8e3a8dcf4..41024469044f 100644 --- a/llvm/include/llvm/IR/GCStrategy.h +++ b/llvm/include/llvm/IR/GCStrategy.h @@ -38,9 +38,7 @@ // When used with gc.statepoint, information about safepoint and roots can be // found in the binary StackMap section after code generation. Safepoint // placement is currently the responsibility of the frontend, though late -// insertion support is planned. gc.statepoint does not currently support -// custom stack map formats; such can be generated by parsing the standard -// stack map section if desired. +// insertion support is planned. // // The read and write barrier support can be used with either implementation. // @@ -101,6 +99,11 @@ public: } ///@} + /// If set, appropriate metadata tables must be emitted by the back-end + /// (assembler, JIT, or otherwise). The default stackmap information can be + /// found in the StackMap section as described in the documentation. + bool usesMetadata() const { return UsesMetadata; } + /** @name GCRoot Specific Properties * These properties and overrides only apply to collector strategies using * GCRoot. */ @@ -110,12 +113,6 @@ public: /// True if safe points need to be inferred on call sites bool needsSafePoints() const { return NeededSafePoints; } - /// If set, appropriate metadata tables must be emitted by the back-end - /// (assembler, JIT, or otherwise). For statepoint, this method is - /// currently unsupported. The stackmap information can be found in the - /// StackMap section as described in the documentation. - bool usesMetadata() const { return UsesMetadata; } - ///@} }; @@ -126,7 +123,7 @@ public: /// static GCRegistry::Add X("custom-name", /// "my custom super fancy gc strategy"); /// -/// Note that to use a custom GCMetadataPrinter w/gc.roots, you must also +/// Note that to use a custom GCMetadataPrinter, you must also /// register your GCMetadataPrinter subclass with the /// GCMetadataPrinterRegistry as well.
using GCRegistry = Registry; diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h index 10088ee2fff4..976772b343fd 100644 --- a/llvm/include/llvm/IR/GlobalIFunc.h +++ b/llvm/include/llvm/IR/GlobalIFunc.h @@ -84,6 +84,11 @@ public: return FunctionType::get(IFuncValTy->getPointerTo(), false); } + static bool isValidLinkage(LinkageTypes L) { + return isExternalLinkage(L) || isLocalLinkage(L) || isWeakLinkage(L) || + isLinkOnceLinkage(L); + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == Value::GlobalIFuncVal; diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h index 0bb9fd730059..96a270316686 100644 --- a/llvm/include/llvm/IR/GlobalObject.h +++ b/llvm/include/llvm/IR/GlobalObject.h @@ -43,13 +43,12 @@ protected: GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace = 0) - : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace), - ObjComdat(nullptr) { + : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace) { setGlobalValueSubClassData(0); } ~GlobalObject(); - Comdat *ObjComdat; + Comdat *ObjComdat = nullptr; enum { LastAlignmentBit = 5, HasSectionHashEntryBit, diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h index 1818f2a8f3cc..a17423dd965b 100644 --- a/llvm/include/llvm/IR/GlobalValue.h +++ b/llvm/include/llvm/IR/GlobalValue.h @@ -80,14 +80,14 @@ protected: UnnamedAddrVal(unsigned(UnnamedAddr::None)), DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal), HasLLVMReservedName(false), IsDSOLocal(false), HasPartition(false), - IntID((Intrinsic::ID)0U), Parent(nullptr) { + HasSanitizerMetadata(false) { setLinkage(Linkage); setName(Name); } Type *ValueType; - static const unsigned GlobalValueSubClassDataBits = 16; + static const unsigned GlobalValueSubClassDataBits = 15; // All bitfields use unsigned as the underlying type so that MSVC will pack // them. @@ -112,9 +112,14 @@ protected: /// https://lld.llvm.org/Partitions.html). unsigned HasPartition : 1; + /// True if this symbol has sanitizer metadata available. Should only happen + /// if sanitizers were enabled when building the translation unit which + /// contains this GV. + unsigned HasSanitizerMetadata : 1; + private: // Give subclasses access to what otherwise would be wasted padding. - // (16 + 4 + 2 + 2 + 2 + 3 + 1 + 1 + 1) == 32. + // (15 + 4 + 2 + 2 + 2 + 3 + 1 + 1 + 1 + 1) == 32. unsigned SubClassData : GlobalValueSubClassDataBits; friend class Constant; @@ -153,7 +158,7 @@ protected: /// Subclasses can use it to store their intrinsic ID, if they have one. /// /// This is stored here to save space in Function on 64-bit hosts. - Intrinsic::ID IntID; + Intrinsic::ID IntID = (Intrinsic::ID)0U; unsigned getGlobalValueSubClassData() const { return SubClassData; @@ -163,7 +168,7 @@ protected: SubClassData = V; } - Module *Parent; // The containing module. + Module *Parent = nullptr; // The containing module. // Used by SymbolTableListTraits. void setParent(Module *parent) { @@ -289,6 +294,43 @@ public: StringRef getPartition() const; void setPartition(StringRef Part); + // ASan, HWASan and Memtag sanitizers have some instrumentation that applies + // specifically to global variables. This instrumentation is implicitly + // applied to all global variables when built with -fsanitize=*. 
What we need + // is a way to persist the information that a certain global variable should + // *not* have sanitizers applied, which occurs if: + // 1. The global variable is in the sanitizer ignore list, or + // 2. The global variable is created by the sanitizers itself for internal + // usage, or + // 3. The global variable has __attribute__((no_sanitize("..."))) or + // __attribute__((disable_sanitizer_instrumentation)). + // + // This is important, as some IR passes like GlobalMerge can delete global + // variables and replace them with new ones. If the old variables were marked + // to be unsanitized, then the new ones should also be. + struct SanitizerMetadata { + SanitizerMetadata() + : NoAddress(false), NoHWAddress(false), NoMemtag(false), + IsDynInit(false) {} + unsigned NoAddress : 1; + unsigned NoHWAddress : 1; + unsigned NoMemtag : 1; + + // ASan-specific metadata. Is this global variable dynamically initialized + // (from a C++ language perspective), and should therefore be checked for + // ODR violations. + unsigned IsDynInit : 1; + }; + + bool hasSanitizerMetadata() const { return HasSanitizerMetadata; } + const SanitizerMetadata &getSanitizerMetadata() const; + // Note: Not byref as it's a POD and otherwise it's too easy to call + // G.setSanitizerMetadata(G2.getSanitizerMetadata()), and the argument becomes + // dangling when the backing storage allocates the metadata for `G`, as the + // storage is shared between `G` and `G2`. + void setSanitizerMetadata(SanitizerMetadata Meta); + void removeSanitizerMetadata(); + static LinkageTypes getLinkOnceLinkage(bool ODR) { return ODR ? LinkOnceODRLinkage : LinkOnceAnyLinkage; } diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index a1789759960d..d8f08934b3d6 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -25,7 +25,6 @@ #include "llvm/IR/ConstantFolder.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/FPEnv.h" @@ -77,7 +76,7 @@ class IRBuilderCallbackInserter : public IRBuilderDefaultInserter { std::function<void(Instruction *)> Callback; public: - virtual ~IRBuilderCallbackInserter(); + ~IRBuilderCallbackInserter() override; IRBuilderCallbackInserter(std::function<void(Instruction *)> Callback) : Callback(std::move(Callback)) {} @@ -125,21 +124,18 @@ protected: MDNode *DefaultFPMathTag; FastMathFlags FMF; - bool IsFPConstrained; - fp::ExceptionBehavior DefaultConstrainedExcept; - RoundingMode DefaultConstrainedRounding; + bool IsFPConstrained = false; + fp::ExceptionBehavior DefaultConstrainedExcept = fp::ebStrict; + RoundingMode DefaultConstrainedRounding = RoundingMode::Dynamic; ArrayRef<OperandBundleDef> DefaultOperandBundles; public: IRBuilderBase(LLVMContext &context, const IRBuilderFolder &Folder, - const IRBuilderDefaultInserter &Inserter, - MDNode *FPMathTag, ArrayRef<OperandBundleDef> OpBundles) + const IRBuilderDefaultInserter &Inserter, MDNode *FPMathTag, + ArrayRef<OperandBundleDef> OpBundles) : Context(context), Folder(Folder), Inserter(Inserter), - DefaultFPMathTag(FPMathTag), IsFPConstrained(false), - DefaultConstrainedExcept(fp::ebStrict), - DefaultConstrainedRounding(RoundingMode::Dynamic), - DefaultOperandBundles(OpBundles) { + DefaultFPMathTag(FPMathTag), DefaultOperandBundles(OpBundles) { ClearInsertionPoint(); } @@ -218,23 +214,11 @@ public: } /// Get location information used by debugging information.
- DebugLoc getCurrentDebugLocation() const { - for (auto &KV : MetadataToCopy) - if (KV.first == LLVMContext::MD_dbg) - return {cast(KV.second)}; - - return {}; - } + DebugLoc getCurrentDebugLocation() const; /// If this builder has a current debug location, set it on the /// specified instruction. - void SetInstDebugLocation(Instruction *I) const { - for (const auto &KV : MetadataToCopy) - if (KV.first == LLVMContext::MD_dbg) { - I->setDebugLoc(DebugLoc(KV.second)); - return; - } - } + void SetInstDebugLocation(Instruction *I) const; /// Add all entries in MetadataToCopy to \p I. void AddMetadataToInst(Instruction *I) const { @@ -316,7 +300,7 @@ public: void setDefaultConstrainedExcept(fp::ExceptionBehavior NewExcept) { #ifndef NDEBUG Optional ExceptStr = convertExceptionBehaviorToStr(NewExcept); - assert(ExceptStr.hasValue() && "Garbage strict exception behavior!"); + assert(ExceptStr && "Garbage strict exception behavior!"); #endif DefaultConstrainedExcept = NewExcept; } @@ -325,7 +309,7 @@ public: void setDefaultConstrainedRounding(RoundingMode NewRounding) { #ifndef NDEBUG Optional RoundingStr = convertRoundingModeToStr(NewRounding); - assert(RoundingStr.hasValue() && "Garbage strict rounding mode!"); + assert(RoundingStr && "Garbage strict rounding mode!"); #endif DefaultConstrainedRounding = NewRounding; } @@ -556,6 +540,11 @@ public: return Type::getVoidTy(Context); } + /// Fetch the type representing a pointer. + PointerType *getPtrTy(unsigned AddrSpace = 0) { + return PointerType::get(Context, AddrSpace); + } + /// Fetch the type representing a pointer to an 8-bit integer value. PointerType *getInt8PtrTy(unsigned AddrSpace = 0) { return Type::getInt8PtrTy(Context, AddrSpace); @@ -589,6 +578,12 @@ public: MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val, + Value *Size, bool IsVolatile = false, + MDNode *TBAATag = nullptr, + MDNode *ScopeTag = nullptr, + MDNode *NoAliasTag = nullptr); + /// Create and insert an element unordered-atomic memset of the region of /// memory starting at the given pointer to the given value. /// @@ -789,7 +784,7 @@ public: /// Create a call to the experimental.gc.statepoint intrinsic to /// start a new statepoint sequence. CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, + FunctionCallee ActualCallee, ArrayRef CallArgs, Optional> DeoptArgs, ArrayRef GCArgs, @@ -798,7 +793,7 @@ public: /// Create a call to the experimental.gc.statepoint intrinsic to /// start a new statepoint sequence. CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, uint32_t Flags, + FunctionCallee ActualCallee, uint32_t Flags, ArrayRef CallArgs, Optional> TransitionArgs, Optional> DeoptArgs, @@ -809,7 +804,8 @@ public: /// in using makeArrayRef(CS.arg_begin(), CS.arg_end()); Use needs to be /// .get()'ed to get the Value pointer. CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, ArrayRef CallArgs, + FunctionCallee ActualCallee, + ArrayRef CallArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name = ""); @@ -818,7 +814,7 @@ public: /// start a new statepoint sequence. 
InvokeInst * CreateGCStatepointInvoke(uint64_t ID, uint32_t NumPatchBytes, - Value *ActualInvokee, BasicBlock *NormalDest, + FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name = ""); @@ -826,7 +822,7 @@ public: /// Create an invoke to the experimental.gc.statepoint intrinsic to /// start a new statepoint sequence. InvokeInst *CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, ArrayRef InvokeArgs, Optional> TransitionArgs, Optional> DeoptArgs, ArrayRef GCArgs, @@ -837,7 +833,7 @@ public: // get the Value *. InvokeInst * CreateGCStatepointInvoke(uint64_t ID, uint32_t NumPatchBytes, - Value *ActualInvokee, BasicBlock *NormalDest, + FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name = ""); @@ -918,18 +914,18 @@ public: Name); } - /// Create a call to the experimental.vector.extract intrinsic. + /// Create a call to the vector.extract intrinsic. CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name = "") { - return CreateIntrinsic(Intrinsic::experimental_vector_extract, + return CreateIntrinsic(Intrinsic::vector_extract, {DstType, SrcVec->getType()}, {SrcVec, Idx}, nullptr, Name); } - /// Create a call to the experimental.vector.insert intrinsic. + /// Create a call to the vector.insert intrinsic. CallInst *CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name = "") { - return CreateIntrinsic(Intrinsic::experimental_vector_insert, + return CreateIntrinsic(Intrinsic::vector_insert, {DstType, SubVec->getType()}, {SrcVec, SubVec, Idx}, nullptr, Name); } @@ -1162,21 +1158,14 @@ private: return I; } - Value *foldConstant(Instruction::BinaryOps Opc, Value *L, - Value *R, const Twine &Name) const { - auto *LC = dyn_cast(L); - auto *RC = dyn_cast(R); - return (LC && RC) ? 
Insert(Folder.CreateBinOp(Opc, LC, RC), Name) : nullptr; - } - Value *getConstrainedFPRounding(Optional Rounding) { RoundingMode UseRounding = DefaultConstrainedRounding; - if (Rounding.hasValue()) + if (Rounding) UseRounding = Rounding.getValue(); Optional RoundingStr = convertRoundingModeToStr(UseRounding); - assert(RoundingStr.hasValue() && "Garbage strict rounding mode!"); + assert(RoundingStr && "Garbage strict rounding mode!"); auto *RoundingMDS = MDString::get(Context, RoundingStr.getValue()); return MetadataAsValue::get(Context, RoundingMDS); @@ -1185,11 +1174,11 @@ private: Value *getConstrainedFPExcept(Optional Except) { fp::ExceptionBehavior UseExcept = DefaultConstrainedExcept; - if (Except.hasValue()) + if (Except) UseExcept = Except.getValue(); Optional ExceptStr = convertExceptionBehaviorToStr(UseExcept); - assert(ExceptStr.hasValue() && "Garbage strict exception behavior!"); + assert(ExceptStr && "Garbage strict exception behavior!"); auto *ExceptMDS = MDString::get(Context, ExceptStr.getValue()); return MetadataAsValue::get(Context, ExceptMDS); @@ -1210,10 +1199,11 @@ private: public: Value *CreateAdd(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) { - if (auto *V = Folder.FoldAdd(LHS, RHS, HasNUW, HasNSW)) + if (Value *V = + Folder.FoldNoWrapBinOp(Instruction::Add, LHS, RHS, HasNUW, HasNSW)) return V; - return CreateInsertNUWNSWBinOp(Instruction::Add, LHS, RHS, Name, - HasNUW, HasNSW); + return CreateInsertNUWNSWBinOp(Instruction::Add, LHS, RHS, Name, HasNUW, + HasNSW); } Value *CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name = "") { @@ -1226,11 +1216,11 @@ public: Value *CreateSub(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateSub(LC, RC, HasNUW, HasNSW), Name); - return CreateInsertNUWNSWBinOp(Instruction::Sub, LHS, RHS, Name, - HasNUW, HasNSW); + if (Value *V = + Folder.FoldNoWrapBinOp(Instruction::Sub, LHS, RHS, HasNUW, HasNSW)) + return V; + return CreateInsertNUWNSWBinOp(Instruction::Sub, LHS, RHS, Name, HasNUW, + HasNSW); } Value *CreateNSWSub(Value *LHS, Value *RHS, const Twine &Name = "") { @@ -1243,11 +1233,11 @@ public: Value *CreateMul(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateMul(LC, RC, HasNUW, HasNSW), Name); - return CreateInsertNUWNSWBinOp(Instruction::Mul, LHS, RHS, Name, - HasNUW, HasNSW); + if (Value *V = + Folder.FoldNoWrapBinOp(Instruction::Mul, LHS, RHS, HasNUW, HasNSW)) + return V; + return CreateInsertNUWNSWBinOp(Instruction::Mul, LHS, RHS, Name, HasNUW, + HasNSW); } Value *CreateNSWMul(Value *LHS, Value *RHS, const Twine &Name = "") { @@ -1260,9 +1250,8 @@ public: Value *CreateUDiv(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateUDiv(LC, RC, isExact), Name); + if (Value *V = Folder.FoldExactBinOp(Instruction::UDiv, LHS, RHS, isExact)) + return V; if (!isExact) return Insert(BinaryOperator::CreateUDiv(LHS, RHS), Name); return Insert(BinaryOperator::CreateExactUDiv(LHS, RHS), Name); @@ -1274,9 +1263,8 @@ public: Value *CreateSDiv(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateSDiv(LC, RC, 
isExact), Name); + if (Value *V = Folder.FoldExactBinOp(Instruction::SDiv, LHS, RHS, isExact)) + return V; if (!isExact) return Insert(BinaryOperator::CreateSDiv(LHS, RHS), Name); return Insert(BinaryOperator::CreateExactSDiv(LHS, RHS), Name); @@ -1287,20 +1275,22 @@ public: } Value *CreateURem(Value *LHS, Value *RHS, const Twine &Name = "") { - if (Value *V = foldConstant(Instruction::URem, LHS, RHS, Name)) return V; + if (Value *V = Folder.FoldBinOp(Instruction::URem, LHS, RHS)) + return V; return Insert(BinaryOperator::CreateURem(LHS, RHS), Name); } Value *CreateSRem(Value *LHS, Value *RHS, const Twine &Name = "") { - if (Value *V = foldConstant(Instruction::SRem, LHS, RHS, Name)) return V; + if (Value *V = Folder.FoldBinOp(Instruction::SRem, LHS, RHS)) + return V; return Insert(BinaryOperator::CreateSRem(LHS, RHS), Name); } Value *CreateShl(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateShl(LC, RC, HasNUW, HasNSW), Name); + if (Value *V = + Folder.FoldNoWrapBinOp(Instruction::Shl, LHS, RHS, HasNUW, HasNSW)) + return V; return CreateInsertNUWNSWBinOp(Instruction::Shl, LHS, RHS, Name, HasNUW, HasNSW); } @@ -1319,9 +1309,8 @@ public: Value *CreateLShr(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateLShr(LC, RC, isExact), Name); + if (Value *V = Folder.FoldExactBinOp(Instruction::LShr, LHS, RHS, isExact)) + return V; if (!isExact) return Insert(BinaryOperator::CreateLShr(LHS, RHS), Name); return Insert(BinaryOperator::CreateExactLShr(LHS, RHS), Name); @@ -1339,9 +1328,8 @@ public: Value *CreateAShr(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) { - if (auto *LC = dyn_cast(LHS)) - if (auto *RC = dyn_cast(RHS)) - return Insert(Folder.CreateAShr(LC, RC, isExact), Name); + if (Value *V = Folder.FoldExactBinOp(Instruction::AShr, LHS, RHS, isExact)) + return V; if (!isExact) return Insert(BinaryOperator::CreateAShr(LHS, RHS), Name); return Insert(BinaryOperator::CreateExactAShr(LHS, RHS), Name); @@ -1358,7 +1346,7 @@ public: } Value *CreateAnd(Value *LHS, Value *RHS, const Twine &Name = "") { - if (auto *V = Folder.FoldAnd(LHS, RHS)) + if (auto *V = Folder.FoldBinOp(Instruction::And, LHS, RHS)) return V; return Insert(BinaryOperator::CreateAnd(LHS, RHS), Name); } @@ -1380,7 +1368,7 @@ public: } Value *CreateOr(Value *LHS, Value *RHS, const Twine &Name = "") { - if (auto *V = Folder.FoldOr(LHS, RHS)) + if (auto *V = Folder.FoldBinOp(Instruction::Or, LHS, RHS)) return V; return Insert(BinaryOperator::CreateOr(LHS, RHS), Name); } @@ -1402,7 +1390,8 @@ public: } Value *CreateXor(Value *LHS, Value *RHS, const Twine &Name = "") { - if (Value *V = foldConstant(Instruction::Xor, LHS, RHS, Name)) return V; + if (Value *V = Folder.FoldBinOp(Instruction::Xor, LHS, RHS)) + return V; return Insert(BinaryOperator::CreateXor(LHS, RHS), Name); } @@ -1420,7 +1409,8 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fadd, L, R, nullptr, Name, FPMD); - if (Value *V = foldConstant(Instruction::FAdd, L, R, Name)) return V; + if (Value *V = Folder.FoldBinOpFMF(Instruction::FAdd, L, R, FMF)) + return V; Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), FPMD, FMF); return Insert(I, Name); } @@ -1433,9 +1423,10 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fadd, L, R, 
FMFSource, Name); - if (Value *V = foldConstant(Instruction::FAdd, L, R, Name)) return V; - Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), nullptr, - FMFSource->getFastMathFlags()); + FastMathFlags FMF = FMFSource->getFastMathFlags(); + if (Value *V = Folder.FoldBinOpFMF(Instruction::FAdd, L, R, FMF)) + return V; + Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), nullptr, FMF); return Insert(I, Name); } @@ -1445,7 +1436,8 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fsub, L, R, nullptr, Name, FPMD); - if (Value *V = foldConstant(Instruction::FSub, L, R, Name)) return V; + if (Value *V = Folder.FoldBinOpFMF(Instruction::FSub, L, R, FMF)) + return V; Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), FPMD, FMF); return Insert(I, Name); } @@ -1458,9 +1450,10 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fsub, L, R, FMFSource, Name); - if (Value *V = foldConstant(Instruction::FSub, L, R, Name)) return V; - Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), nullptr, - FMFSource->getFastMathFlags()); + FastMathFlags FMF = FMFSource->getFastMathFlags(); + if (Value *V = Folder.FoldBinOpFMF(Instruction::FSub, L, R, FMF)) + return V; + Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), nullptr, FMF); return Insert(I, Name); } @@ -1470,7 +1463,8 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fmul, L, R, nullptr, Name, FPMD); - if (Value *V = foldConstant(Instruction::FMul, L, R, Name)) return V; + if (Value *V = Folder.FoldBinOpFMF(Instruction::FMul, L, R, FMF)) + return V; Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), FPMD, FMF); return Insert(I, Name); } @@ -1483,9 +1477,10 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fmul, L, R, FMFSource, Name); - if (Value *V = foldConstant(Instruction::FMul, L, R, Name)) return V; - Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), nullptr, - FMFSource->getFastMathFlags()); + FastMathFlags FMF = FMFSource->getFastMathFlags(); + if (Value *V = Folder.FoldBinOpFMF(Instruction::FMul, L, R, FMF)) + return V; + Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), nullptr, FMF); return Insert(I, Name); } @@ -1495,7 +1490,8 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fdiv, L, R, nullptr, Name, FPMD); - if (Value *V = foldConstant(Instruction::FDiv, L, R, Name)) return V; + if (Value *V = Folder.FoldBinOpFMF(Instruction::FDiv, L, R, FMF)) + return V; Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), FPMD, FMF); return Insert(I, Name); } @@ -1508,9 +1504,9 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fdiv, L, R, FMFSource, Name); - if (Value *V = foldConstant(Instruction::FDiv, L, R, Name)) return V; - Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), nullptr, - FMFSource->getFastMathFlags()); + if (Value *V = Folder.FoldBinOpFMF(Instruction::FDiv, L, R, FMF)) + return V; + Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), nullptr, FMF); return Insert(I, Name); } @@ -1520,7 +1516,7 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_frem, L, R, nullptr, Name, FPMD); - if (Value *V = foldConstant(Instruction::FRem, L, R, Name)) return V; + if (Value *V = Folder.FoldBinOpFMF(Instruction::FRem, L, R, FMF)) return V; Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), FPMD, FMF); return Insert(I, Name); } 
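The FRem builders above complete the pattern used throughout this hunk: every arithmetic CreateXxx() now consults the folder's FoldBinOp/FoldBinOpFMF hooks instead of the removed foldConstant() helper, so folding can also honor fast-math flags. A minimal caller-side sketch, assuming a builder already positioned at an insertion point (the function name is illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitFastFAdd(IRBuilderBase &B, Value *L, Value *R) {
  FastMathFlags FMF;
  FMF.setNoNaNs();
  FMF.setAllowContract();
  B.setFastMathFlags(FMF); // picked up by FoldBinOpFMF before insertion
  // CreateFAdd asks the folder first; only if it returns nullptr is a real
  // fadd instruction created and inserted.
  return B.CreateFAdd(L, R);
}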
@@ -1533,16 +1529,16 @@ public: return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_frem, L, R, FMFSource, Name); - if (Value *V = foldConstant(Instruction::FRem, L, R, Name)) return V; - Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), nullptr, - FMFSource->getFastMathFlags()); + FastMathFlags FMF = FMFSource->getFastMathFlags(); + if (Value *V = Folder.FoldBinOpFMF(Instruction::FRem, L, R, FMF)) return V; + Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), nullptr, FMF); return Insert(I, Name); } Value *CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) { - if (Value *V = foldConstant(Opc, LHS, RHS, Name)) return V; + if (Value *V = Folder.FoldBinOp(Opc, LHS, RHS)) return V; Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS); if (isa(BinOp)) setFPAttrs(BinOp, FPMathTag, FMF); @@ -1576,14 +1572,10 @@ public: Optional Rounding = None, Optional Except = None); - Value *CreateNeg(Value *V, const Twine &Name = "", - bool HasNUW = false, bool HasNSW = false) { - if (auto *VC = dyn_cast(V)) - return Insert(Folder.CreateNeg(VC, HasNUW, HasNSW), Name); - BinaryOperator *BO = Insert(BinaryOperator::CreateNeg(V), Name); - if (HasNUW) BO->setHasNoUnsignedWrap(); - if (HasNSW) BO->setHasNoSignedWrap(); - return BO; + Value *CreateNeg(Value *V, const Twine &Name = "", bool HasNUW = false, + bool HasNSW = false) { + return CreateSub(Constant::getNullValue(V->getType()), V, Name, HasNUW, + HasNSW); } Value *CreateNSWNeg(Value *V, const Twine &Name = "") { @@ -1614,9 +1606,7 @@ public: } Value *CreateNot(Value *V, const Twine &Name = "") { - if (auto *VC = dyn_cast(V)) - return Insert(Folder.CreateNot(VC), Name); - return Insert(BinaryOperator::CreateNot(V), Name); + return CreateXor(V, Constant::getAllOnesValue(V->getType()), Name); } Value *CreateUnOp(Instruction::UnaryOps Opc, @@ -1733,30 +1723,18 @@ public: } Value *CreateGEP(Type *Ty, Value *Ptr, ArrayRef IdxList, - const Twine &Name = "") { - if (auto *V = Folder.FoldGEP(Ty, Ptr, IdxList, /*IsInBounds=*/false)) + const Twine &Name = "", bool IsInBounds = false) { + if (auto *V = Folder.FoldGEP(Ty, Ptr, IdxList, IsInBounds)) return V; - return Insert(GetElementPtrInst::Create(Ty, Ptr, IdxList), Name); + return Insert(IsInBounds + ? 
GetElementPtrInst::CreateInBounds(Ty, Ptr, IdxList) + : GetElementPtrInst::Create(Ty, Ptr, IdxList), + Name); } Value *CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef IdxList, const Twine &Name = "") { - if (auto *V = Folder.FoldGEP(Ty, Ptr, IdxList, /*IsInBounds=*/true)) - return V; - return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, IdxList), Name); - } - - Value *CreateGEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "") { - if (auto *V = Folder.FoldGEP(Ty, Ptr, {Idx}, /*IsInBounds=*/false)) - return V; - return Insert(GetElementPtrInst::Create(Ty, Ptr, Idx), Name); - } - - Value *CreateInBoundsGEP(Type *Ty, Value *Ptr, Value *Idx, - const Twine &Name = "") { - if (auto *V = Folder.FoldGEP(Ty, Ptr, {Idx}, /*IsInBounds=*/true)) - return V; - return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idx), Name); + return CreateGEP(Ty, Ptr, IdxList, Name, /* IsInBounds */ true); } Value *CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, @@ -2297,9 +2275,8 @@ public: Value *CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name = "") { - if (auto *VC = dyn_cast(Vec)) - if (auto *IC = dyn_cast(Idx)) - return Insert(Folder.CreateExtractElement(VC, IC), Name); + if (Value *V = Folder.FoldExtractElement(Vec, Idx)) + return V; return Insert(ExtractElementInst::Create(Vec, Idx), Name); } @@ -2320,10 +2297,8 @@ public: Value *CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = "") { - if (auto *VC = dyn_cast(Vec)) - if (auto *NC = dyn_cast(NewElt)) - if (auto *IC = dyn_cast(Idx)) - return Insert(Folder.CreateInsertElement(VC, NC, IC), Name); + if (Value *V = Folder.FoldInsertElement(Vec, NewElt, Idx)) + return V; return Insert(InsertElementInst::Create(Vec, NewElt, Idx), Name); } @@ -2339,21 +2314,11 @@ public: return CreateShuffleVector(V1, V2, IntMask, Name); } - LLVM_ATTRIBUTE_DEPRECATED(Value *CreateShuffleVector(Value *V1, Value *V2, - ArrayRef Mask, - const Twine &Name = ""), - "Pass indices as 'int' instead") { - SmallVector IntMask; - IntMask.assign(Mask.begin(), Mask.end()); - return CreateShuffleVector(V1, V2, IntMask, Name); - } - /// See class ShuffleVectorInst for a description of the mask representation. 
Value *CreateShuffleVector(Value *V1, Value *V2, ArrayRef Mask, const Twine &Name = "") { - if (auto *V1C = dyn_cast(V1)) - if (auto *V2C = dyn_cast(V2)) - return Insert(Folder.CreateShuffleVector(V1C, V2C, Mask), Name); + if (Value *V = Folder.FoldShuffleVector(V1, V2, Mask)) + return V; return Insert(new ShuffleVectorInst(V1, V2, Mask), Name); } @@ -2364,20 +2329,17 @@ public: return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name); } - Value *CreateExtractValue(Value *Agg, - ArrayRef Idxs, + Value *CreateExtractValue(Value *Agg, ArrayRef Idxs, const Twine &Name = "") { - if (auto *AggC = dyn_cast(Agg)) - return Insert(Folder.CreateExtractValue(AggC, Idxs), Name); + if (auto *V = Folder.FoldExtractValue(Agg, Idxs)) + return V; return Insert(ExtractValueInst::Create(Agg, Idxs), Name); } - Value *CreateInsertValue(Value *Agg, Value *Val, - ArrayRef Idxs, + Value *CreateInsertValue(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name = "") { - if (auto *AggC = dyn_cast(Agg)) - if (auto *ValC = dyn_cast(Val)) - return Insert(Folder.CreateInsertValue(AggC, ValC, Idxs), Name); + if (auto *V = Folder.FoldInsertValue(Agg, Val, Idxs)) + return V; return Insert(InsertValueInst::Create(Agg, Val, Idxs), Name); } @@ -2394,16 +2356,25 @@ public: // Utility creation methods //===--------------------------------------------------------------------===// - /// Return an i1 value testing if \p Arg is null. + /// Return a boolean value testing if \p Arg == 0. Value *CreateIsNull(Value *Arg, const Twine &Name = "") { - return CreateICmpEQ(Arg, Constant::getNullValue(Arg->getType()), - Name); + return CreateICmpEQ(Arg, ConstantInt::getNullValue(Arg->getType()), Name); } - /// Return an i1 value testing if \p Arg is not null. + /// Return a boolean value testing if \p Arg != 0. Value *CreateIsNotNull(Value *Arg, const Twine &Name = "") { - return CreateICmpNE(Arg, Constant::getNullValue(Arg->getType()), - Name); + return CreateICmpNE(Arg, ConstantInt::getNullValue(Arg->getType()), Name); + } + + /// Return a boolean value testing if \p Arg < 0. + Value *CreateIsNeg(Value *Arg, const Twine &Name = "") { + return CreateICmpSLT(Arg, ConstantInt::getNullValue(Arg->getType()), Name); + } + + /// Return a boolean value testing if \p Arg > -1. + Value *CreateIsNotNeg(Value *Arg, const Twine &Name = "") { + return CreateICmpSGT(Arg, ConstantInt::getAllOnesValue(Arg->getType()), + Name); } /// Return the i64 difference between two pointer values, dividing out diff --git a/llvm/include/llvm/IR/IRBuilderFolder.h b/llvm/include/llvm/IR/IRBuilderFolder.h index 2827ab553adc..9505f1e3be2a 100644 --- a/llvm/include/llvm/IR/IRBuilderFolder.h +++ b/llvm/include/llvm/IR/IRBuilderFolder.h @@ -31,12 +31,19 @@ public: // Return an existing value or a constant if the operation can be simplified. // Otherwise return nullptr. 
//===--------------------------------------------------------------------===// - virtual Value *FoldAdd(Value *LHS, Value *RHS, bool HasNUW = false, - bool HasNSW = false) const = 0; - virtual Value *FoldAnd(Value *LHS, Value *RHS) const = 0; + virtual Value *FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS) const = 0; - virtual Value *FoldOr(Value *LHS, Value *RHS) const = 0; + virtual Value *FoldExactBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS, bool IsExact) const = 0; + + virtual Value *FoldNoWrapBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS, bool HasNUW, + bool HasNSW) const = 0; + + virtual Value *FoldBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS, FastMathFlags FMF) const = 0; virtual Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const = 0; @@ -46,43 +53,25 @@ public: virtual Value *FoldSelect(Value *C, Value *True, Value *False) const = 0; - //===--------------------------------------------------------------------===// - // Binary Operators - //===--------------------------------------------------------------------===// + virtual Value *FoldExtractValue(Value *Agg, + ArrayRef IdxList) const = 0; + + virtual Value *FoldInsertValue(Value *Agg, Value *Val, + ArrayRef IdxList) const = 0; + + virtual Value *FoldExtractElement(Value *Vec, Value *Idx) const = 0; - virtual Value *CreateFAdd(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateSub(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const = 0; - virtual Value *CreateFSub(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateMul(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const = 0; - virtual Value *CreateFMul(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateUDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const = 0; - virtual Value *CreateSDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const = 0; - virtual Value *CreateFDiv(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateURem(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateSRem(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateFRem(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateShl(Constant *LHS, Constant *RHS, - bool HasNUW = false, bool HasNSW = false) const = 0; - virtual Value *CreateLShr(Constant *LHS, Constant *RHS, - bool isExact = false) const = 0; - virtual Value *CreateAShr(Constant *LHS, Constant *RHS, - bool isExact = false) const = 0; - virtual Value *CreateXor(Constant *LHS, Constant *RHS) const = 0; - virtual Value *CreateBinOp(Instruction::BinaryOps Opc, - Constant *LHS, Constant *RHS) const = 0; + virtual Value *FoldInsertElement(Value *Vec, Value *NewElt, + Value *Idx) const = 0; + + virtual Value *FoldShuffleVector(Value *V1, Value *V2, + ArrayRef Mask) const = 0; //===--------------------------------------------------------------------===// // Unary Operators //===--------------------------------------------------------------------===// - virtual Value *CreateNeg(Constant *C, - bool HasNUW = false, bool HasNSW = false) const = 0; virtual Value *CreateFNeg(Constant *C) const = 0; - virtual Value *CreateNot(Constant *C) const = 0; virtual Value *CreateUnOp(Instruction::UnaryOps Opc, Constant *C) const = 0; //===--------------------------------------------------------------------===// @@ -110,20 +99,6 @@ public: virtual Value *CreateFCmp(CmpInst::Predicate P, Constant *LHS, Constant *RHS) const = 0; - - 
//===--------------------------------------------------------------------===// - // Other Instructions - //===--------------------------------------------------------------------===// - - virtual Value *CreateExtractElement(Constant *Vec, Constant *Idx) const = 0; - virtual Value *CreateInsertElement(Constant *Vec, Constant *NewElt, - Constant *Idx) const = 0; - virtual Value *CreateShuffleVector(Constant *V1, Constant *V2, - ArrayRef<int> Mask) const = 0; - virtual Value *CreateExtractValue(Constant *Agg, - ArrayRef<unsigned> IdxList) const = 0; - virtual Value *CreateInsertValue(Constant *Agg, Constant *Val, - ArrayRef<unsigned> IdxList) const = 0; }; } // end namespace llvm diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h index cf6b7af96980..57f2da27e04e 100644 --- a/llvm/include/llvm/IR/InlineAsm.h +++ b/llvm/include/llvm/IR/InlineAsm.h @@ -240,12 +240,15 @@ public: Kind_RegDefEarlyClobber = 3, // Early-clobber output register, "=&r". Kind_Clobber = 4, // Clobbered register, "~r". Kind_Imm = 5, // Immediate. - Kind_Mem = 6, // Memory operand, "m". + Kind_Mem = 6, // Memory operand, "m", or an address, "p". // Memory constraint codes. // These could be tablegenerated but there's little need to do that since // there's plenty of space in the encoding to support the union of all // constraint codes for all targets. + // Addresses are included here as they need to be treated the same by the + // backend; the only difference is that they are not used to actually + // access memory by the instruction. Constraint_Unknown = 0, Constraint_es, Constraint_i, @@ -268,7 +271,15 @@ public: Constraint_Z, Constraint_ZC, Constraint_Zy, - Constraints_Max = Constraint_Zy, + + // Address constraints + Constraint_p, + Constraint_ZQ, + Constraint_ZR, + Constraint_ZS, + Constraint_ZT, + + Constraints_Max = Constraint_ZT, Constraints_ShiftAmount = 16, Flag_MatchingOperand = 0x80000000 @@ -453,6 +464,16 @@ public: return "ZC"; case InlineAsm::Constraint_Zy: return "Zy"; + case InlineAsm::Constraint_p: + return "p"; + case InlineAsm::Constraint_ZQ: + return "ZQ"; + case InlineAsm::Constraint_ZR: + return "ZR"; + case InlineAsm::Constraint_ZS: + return "ZS"; + case InlineAsm::Constraint_ZT: + return "ZT"; default: llvm_unreachable("Unknown memory constraint"); } diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index 585129904dd4..7fec081d8155 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -15,7 +15,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/Support/ErrorHandling.h" namespace llvm { @@ -200,7 +199,7 @@ public: RetTy visitCatchPadInst(CatchPadInst &I) { DELEGATE(FuncletPadInst); } RetTy visitFreezeInst(FreezeInst &I) { DELEGATE(Instruction); } - // Handle the special instrinsic instruction classes. + // Handle the special intrinsic instruction classes.
RetTy visitDbgDeclareInst(DbgDeclareInst &I) { DELEGATE(DbgVariableIntrinsic);} RetTy visitDbgValueInst(DbgValueInst &I) { DELEGATE(DbgVariableIntrinsic);} RetTy visitDbgVariableIntrinsic(DbgVariableIntrinsic &I) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 589926c0faf1..eb6f89d740c6 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -21,22 +21,16 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/OperandTraits.h" -#include "llvm/IR/Type.h" #include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" #include #include #include @@ -47,6 +41,10 @@ namespace llvm { +class StringRef; +class Type; +class Value; + namespace Intrinsic { typedef unsigned ID; } @@ -1615,12 +1613,18 @@ public: /// Get the attribute of a given kind for the function. Attribute getFnAttr(StringRef Kind) const { - return getAttributes().getFnAttr(Kind); + Attribute Attr = getAttributes().getFnAttr(Kind); + if (Attr.isValid()) + return Attr; + return getFnAttrOnCalledFunction(Kind); } /// Get the attribute of a given kind for the function. Attribute getFnAttr(Attribute::AttrKind Kind) const { - return getAttributes().getFnAttr(Kind); + Attribute A = getAttributes().getFnAttr(Kind); + if (A.isValid()) + return A; + return getFnAttrOnCalledFunction(Kind); } /// Get the attribute of a given kind from a given arg @@ -1761,7 +1765,7 @@ public: return nullptr; } - /// Extract the preallocated type for a call or parameter. + /// Extract the inalloca type for a call or parameter. Type *getParamInAllocaType(unsigned ArgNo) const { if (auto *Ty = Attrs.getParamInAllocaType(ArgNo)) return Ty; @@ -1770,6 +1774,22 @@ public: return nullptr; } + /// Extract the sret type for a call or parameter. + Type *getParamStructRetType(unsigned ArgNo) const { + if (auto *Ty = Attrs.getParamStructRetType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamStructRetType(ArgNo); + return nullptr; + } + + /// Extract the elementtype type for a parameter. + /// Note that elementtype() can only be applied to call arguments, not + /// function declaration parameters. + Type *getParamElementType(unsigned ArgNo) const { + return Attrs.getParamElementType(ArgNo); + } + /// Extract the number of dereferenceable bytes for a call or /// parameter (0=unknown). uint64_t getRetDereferenceableBytes() const { @@ -1806,7 +1826,13 @@ public: /// If one of the arguments has the 'returned' attribute, returns its /// operand value. Otherwise, return nullptr. - Value *getReturnedArgOperand() const; + Value *getReturnedArgOperand() const { + return getArgOperandWithAttribute(Attribute::Returned); + } + + /// If one of the arguments has the specified attribute, returns its + /// operand value. Otherwise, return nullptr. + Value *getArgOperandWithAttribute(Attribute::AttrKind Kind) const; /// Return true if the call should not be treated as a call to a /// builtin. 
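With the getFnAttr() changes above, attribute queries on a call site now fall back to the attribute list of the statically called function when the call site itself carries nothing. A hedged sketch of what a client observes (the helper name is illustrative):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

bool callOrCalleeIsCold(const CallBase &CB) {
  // Checks call-site attributes first; if absent, the lookup continues on
  // the called function. Indirect calls simply yield an invalid attribute.
  return CB.getFnAttr(Attribute::Cold).isValid();
}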
@@ -2052,7 +2078,8 @@ public: bool hasClobberingOperandBundles() const { for (auto &BOI : bundle_op_infos()) { if (BOI.Tag->second == LLVMContext::OB_deopt || - BOI.Tag->second == LLVMContext::OB_funclet) + BOI.Tag->second == LLVMContext::OB_funclet || + BOI.Tag->second == LLVMContext::OB_ptrauth) continue; // This instruction has an operand bundle that is not known to us. @@ -2296,6 +2323,7 @@ private: return hasFnAttrOnCalledFunction(Kind); } + template Attribute getFnAttrOnCalledFunction(AK Kind) const; /// A specialized version of hasFnAttrImpl for when the caller wants to /// know if an attribute's semantics are implied, not whether the attribute diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index 1937ffd36f7b..8d0a8363cdfb 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -24,7 +24,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Support/AtomicOrdering.h" -#include "llvm/Support/Casting.h" #include #include diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 5929cff3b4fb..d152e86488e1 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -21,24 +21,18 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/OperandTraits.h" -#include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" -#include "llvm/IR/Value.h" #include "llvm/Support/AtomicOrdering.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -47,9 +41,14 @@ namespace llvm { +class APFloat; class APInt; +class BasicBlock; class ConstantInt; class DataLayout; +class StringRef; +class Type; +class Value; //===----------------------------------------------------------------------===// // AllocaInst Class @@ -127,9 +126,6 @@ public: setSubclassData(Log2(Align)); } - // FIXME: Remove this one transition to Align is over. - uint64_t getAlignment() const { return getAlign().value(); } - /// Return true if this alloca is in the entry block of the function and is a /// constant size. If so, the code generator will fold it into the /// prolog/epilog code, so it is basically free. @@ -216,11 +212,6 @@ public: /// Specify whether this is a volatile load or not. void setVolatile(bool V) { setSubclassData(V); } - /// Return the alignment of the access that is being performed. - /// FIXME: Remove this function once transition to Align is over. - /// Use getAlign() instead. - uint64_t getAlignment() const { return getAlign().value(); } - /// Return the alignment of the access that is being performed. Align getAlign() const { return Align(1ULL << (getSubclassData())); @@ -347,11 +338,6 @@ public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - /// Return the alignment of the access that is being performed - /// FIXME: Remove this function once transition to Align is over. - /// Use getAlign() instead. 
- uint64_t getAlignment() const { return getAlign().value(); } - Align getAlign() const { return Align(1ULL << (getSubclassData())); } @@ -2138,6 +2124,12 @@ public: static bool isIdentityMask(ArrayRef Mask); static bool isIdentityMask(const Constant *Mask) { assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant."); + + // Not possible to express a shuffle mask for a scalable vector for this + // case. + if (isa(Mask->getType())) + return false; + SmallVector MaskAsInts; getShuffleMask(Mask, MaskAsInts); return isIdentityMask(MaskAsInts); @@ -2148,6 +2140,11 @@ public: /// from its input vectors. /// Example: shufflevector <4 x n> A, <4 x n> B, <4,undef,6,undef> bool isIdentity() const { + // Not possible to express a shuffle mask for a scalable vector for this + // case. + if (isa(getType())) + return false; + return !changesLength() && isIdentityMask(ShuffleMask); } @@ -5311,6 +5308,10 @@ public: } }; +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + /// A helper function that returns the pointer operand of a load or store /// instruction. Returns nullptr if not load or store. inline const Value *getLoadStorePointerOperand(const Value *V) { @@ -5366,6 +5367,24 @@ inline Type *getLoadStoreType(Value *I) { return cast(I)->getValueOperand()->getType(); } +/// A helper function that returns an atomic operation's sync scope; returns +/// None if it is not an atomic operation. +inline Optional getAtomicSyncScopeID(const Instruction *I) { + if (!I->isAtomic()) + return None; + if (auto *AI = dyn_cast(I)) + return AI->getSyncScopeID(); + if (auto *AI = dyn_cast(I)) + return AI->getSyncScopeID(); + if (auto *AI = dyn_cast(I)) + return AI->getSyncScopeID(); + if (auto *AI = dyn_cast(I)) + return AI->getSyncScopeID(); + if (auto *AI = dyn_cast(I)) + return AI->getSyncScopeID(); + llvm_unreachable("unhandled atomic operation"); +} + //===----------------------------------------------------------------------===// // FreezeInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 01dada25a285..06d2335821d3 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -31,7 +31,6 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include @@ -39,6 +38,8 @@ namespace llvm { +class Metadata; + /// A wrapper class for inspecting calls to intrinsic functions. /// This allows the standard isa/dyncast/cast functionality to work with calls /// to intrinsic functions. 
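The getAtomicSyncScopeID() helper added above folds the five atomic instruction kinds (load, store, fence, cmpxchg, atomicrmw) behind a single query. A minimal sketch of its use (the wrapper function name is illustrative):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

bool isSingleThreadAtomic(const Instruction &I) {
  // Returns None for non-atomic instructions, so callers need no isa<>
  // dispatch of their own.
  if (Optional<SyncScope::ID> SSID = getAtomicSyncScopeID(&I))
    return *SSID == SyncScope::SingleThread;
  return false;
}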
@@ -472,6 +473,38 @@ public: /// @} }; +class VPCastIntrinsic : public VPIntrinsic { +public: + static bool isVPCast(Intrinsic::ID ID); + + /// Methods for support type inquiry through isa, cast, and dyn_cast: + /// @{ + static bool classof(const IntrinsicInst *I) { + return VPCastIntrinsic::isVPCast(I->getIntrinsicID()); + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + /// @} +}; + +class VPCmpIntrinsic : public VPIntrinsic { +public: + static bool isVPCmp(Intrinsic::ID ID); + + CmpInst::Predicate getPredicate() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast: + /// @{ + static bool classof(const IntrinsicInst *I) { + return VPCmpIntrinsic::isVPCmp(I->getIntrinsicID()); + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + /// @} +}; + /// This is the common base class for constrained floating point intrinsics. class ConstrainedFPIntrinsic : public IntrinsicInst { public: @@ -492,6 +525,9 @@ public: class ConstrainedFPCmpIntrinsic : public ConstrainedFPIntrinsic { public: FCmpInst::Predicate getPredicate() const; + bool isSignaling() const { + return getIntrinsicID() == Intrinsic::experimental_constrained_fcmps; + } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -723,11 +759,6 @@ public: setArgOperand(ARG_DEST, Ptr); } - /// FIXME: Remove this function once transition to Align is over. - /// Use the version that takes MaybeAlign instead of this one. - void setDestAlignment(unsigned Alignment) { - setDestAlignment(MaybeAlign(Alignment)); - } void setDestAlignment(MaybeAlign Alignment) { removeParamAttr(ARG_DEST, Attribute::Alignment); if (Alignment) @@ -942,6 +973,7 @@ public: case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + case Intrinsic::memset_inline: case Intrinsic::memcpy_inline: return true; default: @@ -953,12 +985,33 @@ public: } }; -/// This class wraps the llvm.memset intrinsic. +/// This class wraps the llvm.memset and llvm.memset.inline intrinsics. class MemSetInst : public MemSetBase { public: // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { - return I->getIntrinsicID() == Intrinsic::memset; + switch (I->getIntrinsicID()) { + case Intrinsic::memset: + case Intrinsic::memset_inline: + return true; + default: + return false; + } + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This class wraps the llvm.memset.inline intrinsic. 
+class MemSetInlineInst : public MemSetInst {
+public:
+  ConstantInt *getLength() const {
+    return cast<ConstantInt>(MemSetInst::getLength());
+  }
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_inline;
   }
   static bool classof(const Value *V) {
     return isa<CallInst>(V) && classof(cast<CallInst>(V));
@@ -1043,6 +1096,7 @@ public:
     case Intrinsic::memcpy_inline:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memcpy_element_unordered_atomic:
     case Intrinsic::memmove_element_unordered_atomic:
     case Intrinsic::memset_element_unordered_atomic:
@@ -1064,6 +1118,7 @@ public:
   static bool classof(const IntrinsicInst *I) {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 2ff48380ac28..a3db2fa59399 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -104,8 +104,8 @@ namespace Intrinsic {
   int lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
                                 StringRef Name);

-  /// Map a GCC builtin name to an intrinsic ID.
-  ID getIntrinsicForGCCBuiltin(const char *Prefix, StringRef BuiltinName);
+  /// Map a Clang builtin name to an intrinsic ID.
+  ID getIntrinsicForClangBuiltin(const char *Prefix, StringRef BuiltinName);

   /// Map a MS builtin name to an intrinsic ID.
   ID getIntrinsicForMSBuiltin(const char *Prefix, StringRef BuiltinName);
@@ -142,6 +142,7 @@ namespace Intrinsic {
       VecOfBitcastsToInt,
       AMX,
       PPCQuad,
+      AnyPtrToElt,
     } Kind;

     union {
@@ -180,14 +181,15 @@ namespace Intrinsic {
       return (ArgKind)(Argument_Info & 7);
     }

-    // VecOfAnyPtrsToElt uses both an overloaded argument (for address space)
-    // and a reference argument (for matching vector width and element types)
+    // VecOfAnyPtrsToElt and AnyPtrToElt use both an overloaded argument (for
+    // address space) and a reference argument (for matching vector width and
+    // element types)
     unsigned getOverloadArgNumber() const {
-      assert(Kind == VecOfAnyPtrsToElt);
+      assert(Kind == VecOfAnyPtrsToElt || Kind == AnyPtrToElt);
       return Argument_Info >> 16;
     }
     unsigned getRefArgNumber() const {
-      assert(Kind == VecOfAnyPtrsToElt);
+      assert(Kind == VecOfAnyPtrsToElt || Kind == AnyPtrToElt);
       return Argument_Info & 0xFFFF;
     }
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f5248e82ad21..0dceea13ea36 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -120,6 +120,9 @@ class ReadNone : IntrinsicProperty {

 def IntrNoReturn : IntrinsicProperty;

+// Applied by default.
+def IntrNoCallback : IntrinsicProperty<1>;
+
 // IntrNoSync - Threads executing the intrinsic will not synchronize using
 // memory or other means. Applied by default.
def IntrNoSync : IntrinsicProperty<1>; @@ -212,6 +215,7 @@ class LLVMScalarOrSameVectorWidth class LLVMPointerTo : LLVMMatchType; class LLVMPointerToElt : LLVMMatchType; +class LLVMAnyPointerToElt : LLVMMatchType; class LLVMVectorOfAnyPointersToElt : LLVMMatchType; class LLVMVectorElementType : LLVMMatchType; @@ -241,6 +245,7 @@ def llvm_i8_ty : LLVMType; def llvm_i16_ty : LLVMType; def llvm_i32_ty : LLVMType; def llvm_i64_ty : LLVMType; +def llvm_i128_ty : LLVMType; def llvm_half_ty : LLVMType; def llvm_bfloat_ty : LLVMType; def llvm_float_ty : LLVMType; @@ -380,11 +385,11 @@ class DefaultAttrsIntrinsic ret_types, intr_properties, name, sd_properties, /*disable_default_attributes*/ 0> {} -/// GCCBuiltin - If this intrinsic exactly corresponds to a GCC builtin, this +/// ClangBuiltin - If this intrinsic exactly corresponds to a Clang builtin, this /// specifies the name of the builtin. This provides automatic CBE and CFE /// support. -class GCCBuiltin { - string GCCBuiltinName = name; +class ClangBuiltin { + string ClangBuiltinName = name; } class MSBuiltin { @@ -540,14 +545,14 @@ def int_seh_scope_end : Intrinsic<[], [], [IntrNoMem]>; // Note: we treat stacksave/stackrestore as writemem because we don't otherwise // model their dependencies on allocas. def int_stacksave : DefaultAttrsIntrinsic<[llvm_ptr_ty]>, - GCCBuiltin<"__builtin_stack_save">; + ClangBuiltin<"__builtin_stack_save">; def int_stackrestore : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>, - GCCBuiltin<"__builtin_stack_restore">; + ClangBuiltin<"__builtin_stack_restore">; def int_get_dynamic_area_offset : DefaultAttrsIntrinsic<[llvm_anyint_ty]>; def int_thread_pointer : DefaultAttrsIntrinsic<[llvm_ptr_ty], [], [IntrNoMem]>, - GCCBuiltin<"__builtin_thread_pointer">; + ClangBuiltin<"__builtin_thread_pointer">; // IntrInaccessibleMemOrArgMemOnly is a little more pessimistic than strictly // necessary for prefetch, however it does conveniently prevent the prefetch @@ -647,6 +652,17 @@ def int_memset : Intrinsic<[], NoCapture>, WriteOnly>, ImmArg>]>; +// Memset version that is guaranteed to be inlined. +// In particular this means that the generated code is not allowed to call any +// external function. +// The third argument (specifying the size) must be a constant. +def int_memset_inline + : Intrinsic<[], + [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty], + [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, + NoCapture>, WriteOnly>, + ImmArg>, ImmArg>]>; + // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. 
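
Since llvm.memset.inline above is guaranteed to be expanded inline, its size operand must be a compile-time constant, and the expansion may not call any external function. A short sketch of how a pass can rely on that through the MemSetInlineInst wrapper added earlier in this patch; getKnownMemSetSize is illustrative, not part of the patch:

    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // For llvm.memset.inline the length operand is always a ConstantInt, so
    // MemSetInlineInst::getLength returns it directly, without a dyn_cast.
    static uint64_t getKnownMemSetSize(const IntrinsicInst &II) {
      const auto &MSI = cast<MemSetInlineInst>(II); // asserts memset.inline
      return MSI.getLength()->getZExtValue();       // constant byte count
    }
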
@@ -715,7 +731,7 @@ def int_objectsize : DefaultAttrsIntrinsic<[llvm_anyint_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>]>, - GCCBuiltin<"__builtin_object_size">; + ClangBuiltin<"__builtin_object_size">; //===--------------- Access to Floating Point Environment -----------------===// // @@ -725,6 +741,14 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { def int_set_rounding : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>; } +//===--------------- Floating Point Properties ----------------------------===// +// + +def int_is_fpclass + : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem, IntrWillReturn, ImmArg>]>; + //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -909,6 +933,12 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { } // FIXME: Consider maybe adding intrinsics for sitofp, uitofp. + +// Truncate a floating point number with a specific rounding mode +def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyfloat_ty, llvm_metadata_ty ], + [ IntrNoMem, IntrWillReturn ]>; + //===------------------------- Expect Intrinsics --------------------------===// // def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty], @@ -984,12 +1014,12 @@ def int_eh_exceptioncode : Intrinsic<[llvm_i32_ty], [llvm_token_ty], [IntrNoMem] // callee-saved registers to be saved and restored (regardless of whether they // are used) in the calling function. It is used by libgcc_eh. def int_eh_unwind_init: Intrinsic<[]>, - GCCBuiltin<"__builtin_unwind_init">; + ClangBuiltin<"__builtin_unwind_init">; def int_eh_dwarf_cfa : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty]>; def int_eh_sjlj_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; -def int_eh_sjlj_callsite : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; +def int_eh_sjlj_callsite : Intrinsic<[], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_eh_sjlj_functioncontext : Intrinsic<[], [llvm_ptr_ty]>; def int_eh_sjlj_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; @@ -1025,11 +1055,11 @@ def int_init_trampoline : DefaultAttrsIntrinsic< [], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [IntrArgMemOnly, NoCapture>, WriteOnly>, ReadNone>, ReadNone>]>, - GCCBuiltin<"__builtin_init_trampoline">; + ClangBuiltin<"__builtin_init_trampoline">; def int_adjust_trampoline : DefaultAttrsIntrinsic< [llvm_ptr_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>, - GCCBuiltin<"__builtin_adjust_trampoline">; + ClangBuiltin<"__builtin_adjust_trampoline">; //===------------------------ Overflow Intrinsics -------------------------===// // @@ -1309,9 +1339,9 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty], ///===-------------------------- Other Intrinsics --------------------------===// // def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold]>, - GCCBuiltin<"__builtin_trap">; + ClangBuiltin<"__builtin_trap">; def int_debugtrap : Intrinsic<[]>, - GCCBuiltin<"__builtin_debugtrap">; + ClangBuiltin<"__builtin_debugtrap">; def int_ubsantrap : Intrinsic<[], [llvm_i8_ty], [IntrNoReturn, IntrCold, ImmArg>]>; @@ -1397,14 +1427,31 @@ def int_vp_gather: DefaultAttrsIntrinsic<[ llvm_anyvector_ty], [ IntrReadMem, IntrNoSync, IntrWillReturn, IntrArgMemOnly ]>; def int_vp_scatter: DefaultAttrsIntrinsic<[], - [ llvm_anyvector_ty, - LLVMVectorOfAnyPointersToElt<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty], - [ IntrArgMemOnly, IntrNoSync, IntrWillReturn ]>; // TODO allow 
IntrNoCapture for vectors of pointers - -// Speculatable Binary operators -let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + [ llvm_anyvector_ty, + LLVMVectorOfAnyPointersToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ IntrArgMemOnly, IntrNoSync, IntrWillReturn ]>; // TODO allow IntrNoCapture for vectors of pointers + +// Experimental strided memory accesses +def int_experimental_vp_strided_store : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, + LLVMAnyPointerToElt<0>, + llvm_anyint_ty, // Stride in bytes + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrWriteMem, IntrArgMemOnly, IntrWillReturn ]>; + +def int_experimental_vp_strided_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [ LLVMAnyPointerToElt<0>, + llvm_anyint_ty, // Stride in bytes + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>; + +// Operators +let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { + // Integer arithmetic def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1416,30 +1463,30 @@ let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] i LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_mul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_ashr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_lshr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_shl : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_or : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_and : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1450,35 +1497,28 @@ let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] i LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; -} - -// Non-speculatable binary operators. 
-let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -} + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; -// Floating-point arithmetic. -let IntrProperties = - [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + // Floating-point arithmetic def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1490,101 +1530,169 @@ let IntrProperties = LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_fmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_fdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_frem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -} -// Shuffles. 
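
The regrouping above is not purely cosmetic: IntrSpeculatable is dropped from the whole VP block, since with an explicit mask and vector length even the previously speculatable operations are only well defined on enabled lanes. Every llvm.vp.* intrinsic here still ends with the mask/evl operand pair, whose positions the existing VPIntrinsic helpers expose; a small C++ sketch (the diagnostic printout is illustrative):

    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Query where a VP intrinsic keeps its mask and explicit-vector-length
    // operands; both helpers return None for non-VP intrinsic IDs.
    static void printVPParamPositions(Intrinsic::ID ID) {
      if (Optional<unsigned> MaskPos = VPIntrinsic::getMaskParamPos(ID))
        errs() << "mask operand at index " << *MaskPos << "\n";
      if (Optional<unsigned> EVLPos = VPIntrinsic::getVectorLengthParamPos(ID))
        errs() << "evl operand at index " << *EVLPos << "\n";
    }
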
-def int_vp_select : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>, - LLVMMatchType<0>, - llvm_i32_ty]>; - -def int_vp_merge : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>, - LLVMMatchType<0>, - llvm_i32_ty]>; - -// Reductions -let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fneg : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fma : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + + // Casts + def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_zext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptrunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fpext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptoui : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptosi : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_uitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ptrtoint : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_inttoptr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + + // Shuffles + def int_vp_select : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_i32_ty]>; + def int_vp_merge : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_i32_ty]>; + + // Comparisons + def int_vp_fcmp : DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [ llvm_anyvector_ty, + LLVMMatchType<0>, + llvm_metadata_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_icmp : DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [ llvm_anyvector_ty, + LLVMMatchType<0>, + llvm_metadata_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + + // Reductions def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - 
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_add : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmin : 
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; } def int_get_active_lane_mask: @@ -1840,28 +1948,26 @@ def int_preserve_struct_access_index : DefaultAttrsIntrinsic<[llvm_anyptr_ty], //===------------ Intrinsics to perform common vector shuffles ------------===// def int_experimental_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>], + [IntrNoMem]>; -//===---------- Intrinsics to query properties of scalable vectors --------===// -def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; - -//===---------- Intrinsics to perform subvector insertion/extraction ------===// -def int_experimental_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyvector_ty, llvm_i64_ty], - [IntrNoMem, ImmArg>]>; - -def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_i64_ty], - [IntrNoMem, ImmArg>]>; - -//===---------- Named shufflevector intrinsics ------===// def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +//===---------- Intrinsics to query properties of scalable vectors --------===// +def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; + +//===---------- Intrinsics to perform subvector insertion/extraction ------===// +def int_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; + +def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; //===----------------- Pointer Authentication Intrinsics ------------------===// // @@ -1936,4 +2042,6 @@ include "llvm/IR/IntrinsicsBPF.td" include "llvm/IR/IntrinsicsSystemZ.td" include "llvm/IR/IntrinsicsWebAssembly.td" include "llvm/IR/IntrinsicsRISCV.td" +include "llvm/IR/IntrinsicsSPIRV.td" include "llvm/IR/IntrinsicsVE.td" +include "llvm/IR/IntrinsicsDirectX.td" diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index a65ddff07a29..1256ab2c9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -62,14 +62,17 @@ def int_aarch64_frint64x def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>; +def int_aarch64_break : Intrinsic<[], [llvm_i32_ty], + [IntrNoMem, IntrHasSideEffects, IntrNoReturn, IntrCold, ImmArg>]>; + //===----------------------------------------------------------------------===// // Data Barrier Instructions -def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, +def int_aarch64_dmb : ClangBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intrinsic<[], [llvm_i32_ty], [IntrNoFree, IntrWillReturn]>; -def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, +def int_aarch64_dsb : ClangBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty], [IntrNoFree, IntrWillReturn]>; -def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, +def int_aarch64_isb : ClangBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty], [IntrNoFree, IntrWillReturn]>; // A space-consuming 
intrinsic primarily for testing block and jump table @@ -907,15 +910,15 @@ let TargetPrefix = "aarch64" in { // Transactional Memory Extension (TME) Intrinsics let TargetPrefix = "aarch64" in { -def int_aarch64_tstart : GCCBuiltin<"__builtin_arm_tstart">, +def int_aarch64_tstart : ClangBuiltin<"__builtin_arm_tstart">, Intrinsic<[llvm_i64_ty], [], [IntrWillReturn]>; -def int_aarch64_tcommit : GCCBuiltin<"__builtin_arm_tcommit">, Intrinsic<[], [], [IntrWillReturn]>; +def int_aarch64_tcommit : ClangBuiltin<"__builtin_arm_tcommit">, Intrinsic<[], [], [IntrWillReturn]>; -def int_aarch64_tcancel : GCCBuiltin<"__builtin_arm_tcancel">, +def int_aarch64_tcancel : ClangBuiltin<"__builtin_arm_tcancel">, Intrinsic<[], [llvm_i64_ty], [IntrWillReturn, ImmArg>]>; -def int_aarch64_ttest : GCCBuiltin<"__builtin_arm_ttest">, +def int_aarch64_ttest : ClangBuiltin<"__builtin_arm_ttest">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; @@ -1759,10 +1762,10 @@ def int_aarch64_sve_cntp : AdvSIMD_SVE_CNTP_Intrinsic; // FFR manipulation // -def int_aarch64_sve_rdffr : GCCBuiltin<"__builtin_sve_svrdffr">, DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [], [IntrReadMem, IntrInaccessibleMemOnly]>; -def int_aarch64_sve_rdffr_z : GCCBuiltin<"__builtin_sve_svrdffr_z">, DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [llvm_nxv16i1_ty], [IntrReadMem, IntrInaccessibleMemOnly]>; -def int_aarch64_sve_setffr : GCCBuiltin<"__builtin_sve_svsetffr">, DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>; -def int_aarch64_sve_wrffr : GCCBuiltin<"__builtin_sve_svwrffr">, DefaultAttrsIntrinsic<[], [llvm_nxv16i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]>; +def int_aarch64_sve_rdffr : ClangBuiltin<"__builtin_sve_svrdffr">, DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [], [IntrReadMem, IntrInaccessibleMemOnly]>; +def int_aarch64_sve_rdffr_z : ClangBuiltin<"__builtin_sve_svrdffr_z">, DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [llvm_nxv16i1_ty], [IntrReadMem, IntrInaccessibleMemOnly]>; +def int_aarch64_sve_setffr : ClangBuiltin<"__builtin_sve_svsetffr">, DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>; +def int_aarch64_sve_wrffr : ClangBuiltin<"__builtin_sve_svwrffr">, DefaultAttrsIntrinsic<[], [llvm_nxv16i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]>; // // Saturating scalar arithmetic @@ -2493,31 +2496,31 @@ def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic; // SVE2 - Optional AES, SHA-3 and SM4 // -def int_aarch64_sve_aesd : GCCBuiltin<"__builtin_sve_svaesd_u8">, +def int_aarch64_sve_aesd : ClangBuiltin<"__builtin_sve_svaesd_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], [IntrNoMem]>; -def int_aarch64_sve_aesimc : GCCBuiltin<"__builtin_sve_svaesimc_u8">, +def int_aarch64_sve_aesimc : ClangBuiltin<"__builtin_sve_svaesimc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], [IntrNoMem]>; -def int_aarch64_sve_aese : GCCBuiltin<"__builtin_sve_svaese_u8">, +def int_aarch64_sve_aese : ClangBuiltin<"__builtin_sve_svaese_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], [IntrNoMem]>; -def int_aarch64_sve_aesmc : GCCBuiltin<"__builtin_sve_svaesmc_u8">, +def int_aarch64_sve_aesmc : ClangBuiltin<"__builtin_sve_svaesmc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], [IntrNoMem]>; -def int_aarch64_sve_rax1 : GCCBuiltin<"__builtin_sve_svrax1_u64">, +def int_aarch64_sve_rax1 : ClangBuiltin<"__builtin_sve_svrax1_u64">, DefaultAttrsIntrinsic<[llvm_nxv2i64_ty], [llvm_nxv2i64_ty, 
llvm_nxv2i64_ty], [IntrNoMem]>; -def int_aarch64_sve_sm4e : GCCBuiltin<"__builtin_sve_svsm4e_u32">, +def int_aarch64_sve_sm4e : ClangBuiltin<"__builtin_sve_svsm4e_u32">, DefaultAttrsIntrinsic<[llvm_nxv4i32_ty], [llvm_nxv4i32_ty, llvm_nxv4i32_ty], [IntrNoMem]>; -def int_aarch64_sve_sm4ekey : GCCBuiltin<"__builtin_sve_svsm4ekey_u32">, +def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, DefaultAttrsIntrinsic<[llvm_nxv4i32_ty], [llvm_nxv4i32_ty, llvm_nxv4i32_ty], [IntrNoMem]>; @@ -2580,3 +2583,130 @@ def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic; def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic; def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic; def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic; + +// Scalable Matrix Extension (SME) Intrinsics +let TargetPrefix = "aarch64" in { + class SME_Load_Store_Intrinsic + : DefaultAttrsIntrinsic<[], + [pred_ty, llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty], []>; + + // Loads + def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1h_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1w_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1d_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1b_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1h_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1w_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1d_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1q_vert : SME_Load_Store_Intrinsic; + + // Stores + def int_aarch64_sme_st1b_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1h_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1w_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1d_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1q_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1b_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1h_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1w_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1d_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; + + // Spill + fill + def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< + [], [llvm_i32_ty, llvm_ptr_ty]>; + def int_aarch64_sme_str : DefaultAttrsIntrinsic< + [], [llvm_i32_ty, llvm_ptr_ty]>; + + class SME_TileToVector_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i64_ty, llvm_i32_ty]>; + class SME_VectorToTile_Intrinsic + : DefaultAttrsIntrinsic<[], + [llvm_i64_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty]>; + + def int_aarch64_sme_read_horiz : SME_TileToVector_Intrinsic; + def int_aarch64_sme_read_vert : SME_TileToVector_Intrinsic; + def int_aarch64_sme_write_horiz : SME_VectorToTile_Intrinsic; + def int_aarch64_sme_write_vert : SME_VectorToTile_Intrinsic; + + def int_aarch64_sme_readq_horiz : SME_TileToVector_Intrinsic; + def int_aarch64_sme_readq_vert : SME_TileToVector_Intrinsic; + def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic; + def int_aarch64_sme_writeq_vert : SME_VectorToTile_Intrinsic; + + def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i64_ty]>; + + class SME_OuterProduct_Intrinsic + : DefaultAttrsIntrinsic<[], + [llvm_i64_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + 
llvm_anyvector_ty]>; + + def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic; + + def int_aarch64_sme_mopa_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_mops_wide : SME_OuterProduct_Intrinsic; + + def int_aarch64_sme_smopa_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_smops_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_umopa_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_umops_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_sumopa_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_sumops_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic; + + // + // Counting elements + // + + class AdvSIMD_SME_CNTSB_Intrinsic + : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem]>; + + def int_aarch64_sme_cntsb : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_cntsh : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_cntsw : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_cntsd : AdvSIMD_SME_CNTSB_Intrinsic; + + // + // PSTATE Functions + // + + def int_aarch64_sme_get_pstatesm + : DefaultAttrsIntrinsic<[llvm_i64_ty], [], + [IntrReadMem, IntrInaccessibleMemOnly]>; + + def int_aarch64_sme_get_tpidr2 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [], + [IntrNoMem, IntrHasSideEffects]>; + def int_aarch64_sme_set_tpidr2 + : DefaultAttrsIntrinsic<[], [llvm_i64_ty], + [IntrNoMem, IntrHasSideEffects]>; + // Clamp + // + + def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic; + def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic; + + // + // Reversal + // + + def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic; + + // + // Predicate selection + // + + def int_aarch64_sve_psel + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, llvm_i32_ty]>; +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index c5d266eb57ec..c2dcfc254568 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -14,7 +14,7 @@ class AMDGPUReadPreloadRegisterIntrinsic : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; class AMDGPUReadPreloadRegisterIntrinsicNamed - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>, GCCBuiltin; + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>, ClangBuiltin; // Used to tag image and resource intrinsics with information used to generate // mem operands. 
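
The GCCBuiltin-to-ClangBuiltin rename that runs through this file (and through the AArch64 definitions above) pairs with the renamed lookup entry point declared in Intrinsics.h earlier in this patch. A sketch of resolving a builtin name to an intrinsic ID with the new name; the prefix and builtin strings are just examples:

    #include "llvm/IR/Intrinsics.h"

    using namespace llvm;

    // Map a Clang builtin name to its target intrinsic; the lookup yields
    // Intrinsic::not_intrinsic when no intrinsic declares that builtin name.
    static Intrinsic::ID lookupAMDGCNBarrier() {
      return Intrinsic::getIntrinsicForClangBuiltin(
          "amdgcn", "__builtin_amdgcn_s_barrier");
    }
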
@@ -47,12 +47,12 @@ defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; -def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">, +def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; // AS 7 is PARAM_I_ADDRESS, used for kernel arguments def int_r600_implicitarg_ptr : - GCCBuiltin<"__builtin_r600_implicitarg_ptr">, + ClangBuiltin<"__builtin_r600_implicitarg_ptr">, Intrinsic<[LLVMQualPointerType], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; @@ -61,7 +61,7 @@ def int_r600_rat_store_typed : // 2nd parameter: Index // 3rd parameter: Constant RAT ID Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrWillReturn]>, - GCCBuiltin<"__builtin_r600_rat_store_typed">; + ClangBuiltin<"__builtin_r600_rat_store_typed">; def int_r600_recipsqrt_ieee : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] @@ -145,30 +145,30 @@ def int_amdgcn_dispatch_ptr : [Align, IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_queue_ptr : - GCCBuiltin<"__builtin_amdgcn_queue_ptr">, + ClangBuiltin<"__builtin_amdgcn_queue_ptr">, Intrinsic<[LLVMQualPointerType], [], [Align, IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_kernarg_segment_ptr : - GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, + ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, Intrinsic<[LLVMQualPointerType], [], [Align, IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_implicitarg_ptr : - GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">, + ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, Intrinsic<[LLVMQualPointerType], [], [Align, IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_groupstaticsize : - GCCBuiltin<"__builtin_amdgcn_groupstaticsize">, + ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_dispatch_id : - GCCBuiltin<"__builtin_amdgcn_dispatch_id">, + ClangBuiltin<"__builtin_amdgcn_dispatch_id">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_amdgcn_implicit_buffer_ptr : - GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, + ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, Intrinsic<[LLVMQualPointerType], [], [Align, IntrNoMem, IntrSpeculatable, IntrWillReturn]>; @@ -190,7 +190,7 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[], [IntrConvergent, ImmArg>]>; def int_amdgcn_wavefrontsize : - GCCBuiltin<"__builtin_amdgcn_wavefrontsize">, + ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; @@ -200,20 +200,44 @@ def int_amdgcn_wavefrontsize : // The first parameter is s_sendmsg immediate (i16), // the second one is copied to m0 -def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">, +def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; -def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">, +def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; -def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">, + +// gfx11 intrinsic 
+// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
+def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
+  [ImmArg>, IntrNoMem, IntrHasSideEffects]>;
+
+def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

-def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
+def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

-def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
+// The 1st parameter is a mask for the types of instructions that may be allowed
+// to cross the SCHED_BARRIER during scheduling.
+// MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+//                     scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+// MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
+def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, + Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn]>; + +def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_div_scale : Intrinsic< @@ -255,7 +279,7 @@ def int_amdgcn_log_clamp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">, +def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; @@ -274,7 +298,7 @@ def int_amdgcn_rcp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">, +def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; @@ -287,7 +311,7 @@ def int_amdgcn_rsq : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">, +def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, Intrinsic< [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; @@ -316,31 +340,31 @@ def int_amdgcn_fract : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_cvt_pkrtz : GCCBuiltin<"__builtin_amdgcn_cvt_pkrtz">, +def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_cvt_pknorm_i16 : - GCCBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, + ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_cvt_pknorm_u16 : - GCCBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, + ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_cvt_pk_i16 : - GCCBuiltin<"__builtin_amdgcn_cvt_pk_i16">, + ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, Intrinsic< [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_cvt_pk_u16 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u16">, +def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; @@ -350,31 +374,31 @@ def int_amdgcn_class : Intrinsic< [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">, +def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">, +def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def 
int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">, +def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">, +def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; -def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">, +def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] @@ -838,6 +862,13 @@ defset list AMDGPUImageDimIntrinsics = { [IntrReadMem], [SDNPMemOperand]>; } + foreach dim = AMDGPUDims.Msaa in { + def int_amdgcn_image_msaa_load # _ # dim.Name: + AMDGPUImageDimIntrinsic< + AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>, + [IntrReadMem], [SDNPMemOperand]>; + } + ////////////////////////////////////////////////////////////////////////// // sample and getlod intrinsics ////////////////////////////////////////////////////////////////////////// @@ -949,10 +980,12 @@ class AMDGPUBufferLoad : Intrinsic < def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; +// Generate a buffer_load instruction that may be optimized to s_buffer_load if +// the offset argument is uniform. def int_amdgcn_s_buffer_load : Intrinsic < [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // byte offset(SGPR/imm) + llvm_i32_ty, // byte offset llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 2 = dlc) [IntrNoMem, IntrWillReturn, ImmArg>]>, AMDGPURsrcIntrinsic<0>; @@ -1259,6 +1292,40 @@ class AMDGPUBufferAtomicFP : Intrinsic < // Legacy form of the intrinsic. raw and struct forms should be preferred. 
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; + +class AMDGPURawBufferLoadLDS : Intrinsic < + [], + [llvm_v4i32_ty, // rsrc(SGPR) + LLVMQualPointerType, // LDS base offset + llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+)) + // swizzled buffer (bit 3 = swz)) + [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; +def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; + +class AMDGPUStructBufferLoadLDS : Intrinsic < + [], + [llvm_v4i32_ty, // rsrc(SGPR) + LLVMQualPointerType, // LDS base offset + llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+)) + // swizzled buffer (bit 3 = swz)) + [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; +def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; + } // defset AMDGPUBufferIntrinsics // Uses that do not set the done bit should set IntrWriteMem on the @@ -1278,7 +1345,21 @@ def int_amdgcn_exp : Intrinsic <[], [ IntrWillReturn] >; -// exp with compr bit set. +// exp with row_en bit set. Only supported on GFX11+. +def int_amdgcn_exp_row : Intrinsic <[], [ + llvm_i32_ty, // tgt, + llvm_i32_ty, // en + llvm_any_ty, // src0 (f32 or i32) + LLVMMatchType<0>, // src1 + LLVMMatchType<0>, // src2 + LLVMMatchType<0>, // src3 + llvm_i1_ty, // done + llvm_i32_ty], // row number + [ImmArg>, ImmArg>, ImmArg>, + IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn] +>; + +// exp with compr bit set. Not supported on GFX11+. 
def int_amdgcn_exp_compr : Intrinsic <[], [ llvm_i32_ty, // tgt, llvm_i32_ty, // en @@ -1292,35 +1373,35 @@ def int_amdgcn_exp_compr : Intrinsic <[], [ >; def int_amdgcn_buffer_wbinvl1_sc : - GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, + ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_buffer_wbinvl1 : - GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, + ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_dcache_inv : - GCCBuiltin<"__builtin_amdgcn_s_dcache_inv">, + ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_memtime : - GCCBuiltin<"__builtin_amdgcn_s_memtime">, + ClangBuiltin<"__builtin_amdgcn_s_memtime">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_sleep : - GCCBuiltin<"__builtin_amdgcn_s_sleep">, + ClangBuiltin<"__builtin_amdgcn_s_sleep">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { } def int_amdgcn_s_incperflevel : - GCCBuiltin<"__builtin_amdgcn_s_incperflevel">, + ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { } def int_amdgcn_s_decperflevel : - GCCBuiltin<"__builtin_amdgcn_s_decperflevel">, + ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { } @@ -1329,11 +1410,16 @@ def int_amdgcn_s_sethalt : Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; +def int_amdgcn_s_setprio : + ClangBuiltin<"__builtin_amdgcn_s_setprio">, + Intrinsic<[], [llvm_i16_ty], [ImmArg>, IntrNoMem, + IntrHasSideEffects, IntrWillReturn]>; + +// This is IntrHasSideEffects so it can be used to read cycle counters. def int_amdgcn_s_getreg : - GCCBuiltin<"__builtin_amdgcn_s_getreg">, + ClangBuiltin<"__builtin_amdgcn_s_getreg">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, - IntrWillReturn, ImmArg>] + [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg>] >; // Note this can be used to set FP environment properties that are @@ -1341,7 +1427,7 @@ def int_amdgcn_s_getreg : // available (and value required to access them) may differ per // subtarget. llvm.amdgcn.s.setreg(hwmode, value) def int_amdgcn_s_setreg : - GCCBuiltin<"__builtin_amdgcn_s_setreg">, + ClangBuiltin<"__builtin_amdgcn_s_setreg">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg>] >; @@ -1353,14 +1439,14 @@ def int_amdgcn_s_setreg : // produce the desired results as optimizations may cause code movement, // especially as we explicitly use IntrNoMem to allow optimizations. 
def int_amdgcn_s_getpc : - GCCBuiltin<"__builtin_amdgcn_s_getpc">, + ClangBuiltin<"__builtin_amdgcn_s_getpc">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; // __builtin_amdgcn_interp_mov , , , // param values: 0 = P10, 1 = P20, 2 = P0 def int_amdgcn_interp_mov : - GCCBuiltin<"__builtin_amdgcn_interp_mov">, + ClangBuiltin<"__builtin_amdgcn_interp_mov">, Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, @@ -1370,7 +1456,7 @@ def int_amdgcn_interp_mov : // This intrinsic reads from lds, but the memory values are constant, // so it behaves like IntrNoMem. def int_amdgcn_interp_p1 : - GCCBuiltin<"__builtin_amdgcn_interp_p1">, + ClangBuiltin<"__builtin_amdgcn_interp_p1">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, @@ -1378,7 +1464,7 @@ def int_amdgcn_interp_p1 : // __builtin_amdgcn_interp_p2 , , , , def int_amdgcn_interp_p2 : - GCCBuiltin<"__builtin_amdgcn_interp_p2">, + ClangBuiltin<"__builtin_amdgcn_interp_p2">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, @@ -1388,7 +1474,7 @@ def int_amdgcn_interp_p2 : // __builtin_amdgcn_interp_p1_f16 , , , , // high selects whether high or low 16-bits are loaded from LDS def int_amdgcn_interp_p1_f16 : - GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">, + ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, @@ -1397,12 +1483,57 @@ def int_amdgcn_interp_p1_f16 : // __builtin_amdgcn_interp_p2_f16 , , , , , // high selects whether high or low 16-bits are loaded from LDS def int_amdgcn_interp_p2_f16 : - GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">, + ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, Intrinsic<[llvm_half_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>]>; +// llvm.amdgcn.lds.direct.load +// The input argument is m0, which contains a packed combination of address +// offset and flags describing the data type. +def int_amdgcn_lds_direct_load : + Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 + [llvm_i32_ty], + [IntrReadMem, IntrSpeculatable, IntrWillReturn]>; + +// llvm.amdgcn.lds.param.load , , +// Like interp intrinsics, this reads from lds, but the memory values are constant, +// so it behaves like IntrNoMem. +def int_amdgcn_lds_param_load : + Intrinsic<[llvm_float_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>, ImmArg>]>; + +// llvm.amdgcn.interp.inreg.p10
 <p>, <i>, <p0>
+def int_amdgcn_interp_inreg_p10 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
+def int_amdgcn_interp_inreg_p2 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
+// high selects whether high or low 16-bits are used for p and p0 operands
+def int_amdgcn_interp_inreg_p10_f16:
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+             ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.interp.inreg.p2.f16 <p>
, , , +// high selects whether high or low 16-bits are used for p operand +def int_amdgcn_interp_inreg_p2_f16 : + Intrinsic<[llvm_half_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>]>; + // Deprecated: use llvm.amdgcn.live.mask instead. def int_amdgcn_ps_live : Intrinsic < [llvm_i1_ty], @@ -1416,18 +1547,18 @@ def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty], >; def int_amdgcn_mbcnt_lo : - GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, + ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; def int_amdgcn_mbcnt_hi : - GCCBuiltin<"__builtin_amdgcn_mbcnt_hi">, + ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : - GCCBuiltin<"__builtin_amdgcn_ds_swizzle">, + ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>]>; @@ -1443,55 +1574,55 @@ def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty], >; def int_amdgcn_lerp : - GCCBuiltin<"__builtin_amdgcn_lerp">, + ClangBuiltin<"__builtin_amdgcn_lerp">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_sad_u8 : - GCCBuiltin<"__builtin_amdgcn_sad_u8">, + ClangBuiltin<"__builtin_amdgcn_sad_u8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_msad_u8 : - GCCBuiltin<"__builtin_amdgcn_msad_u8">, + ClangBuiltin<"__builtin_amdgcn_msad_u8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_sad_hi_u8 : - GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">, + ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_sad_u16 : - GCCBuiltin<"__builtin_amdgcn_sad_u16">, + ClangBuiltin<"__builtin_amdgcn_sad_u16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_qsad_pk_u16_u8 : - GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, + ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_mqsad_pk_u16_u8 : - GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, + ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_mqsad_u32_u8 : - GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, + ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; def int_amdgcn_cvt_pk_u8_f32 : - GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, + ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; @@ -1511,14 +1642,14 @@ def int_amdgcn_ballot : [IntrNoMem, IntrConvergent, IntrWillReturn]>; def int_amdgcn_readfirstlane : - GCCBuiltin<"__builtin_amdgcn_readfirstlane">, + ClangBuiltin<"__builtin_amdgcn_readfirstlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 
[IntrNoMem, IntrConvergent, IntrWillReturn]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. def int_amdgcn_readlane : - GCCBuiltin<"__builtin_amdgcn_readlane">, + ClangBuiltin<"__builtin_amdgcn_readlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; @@ -1526,7 +1657,7 @@ def int_amdgcn_readlane : // currently active threads of the current wave. Otherwise, the result is // undefined. def int_amdgcn_writelane : - GCCBuiltin<"__builtin_amdgcn_writelane">, + ClangBuiltin<"__builtin_amdgcn_writelane">, Intrinsic<[llvm_i32_ty], [ llvm_i32_ty, // uniform value to write: returned by the selected lane llvm_i32_ty, // uniform lane select @@ -1535,7 +1666,7 @@ def int_amdgcn_writelane : [IntrNoMem, IntrConvergent, IntrWillReturn] >; -def int_amdgcn_alignbyte : GCCBuiltin<"__builtin_amdgcn_alignbyte">, +def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; @@ -1565,7 +1696,7 @@ def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty], // bar_val is the total number of waves that will wait on this // barrier, minus 1. def int_amdgcn_ds_gws_init : - GCCBuiltin<"__builtin_amdgcn_ds_gws_init">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrWriteMem, @@ -1577,7 +1708,7 @@ def int_amdgcn_ds_gws_init : // bar_val is the total number of waves that will wait on this // barrier, minus 1. def int_amdgcn_ds_gws_barrier : - GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", @@ -1586,7 +1717,7 @@ def int_amdgcn_ds_gws_barrier : // llvm.amdgcn.ds.gws.sema.v(i32 resource_id) def int_amdgcn_ds_gws_sema_v : - GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", @@ -1595,7 +1726,7 @@ def int_amdgcn_ds_gws_sema_v : // llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) def int_amdgcn_ds_gws_sema_br : - GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", @@ -1604,7 +1735,7 @@ def int_amdgcn_ds_gws_sema_br : // llvm.amdgcn.ds.gws.sema.p(i32 resource_id) def int_amdgcn_ds_gws_sema_p : - GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", @@ -1613,7 +1744,7 @@ def int_amdgcn_ds_gws_sema_p : // llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) def int_amdgcn_ds_gws_sema_release_all : - GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, + ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", @@ -1644,7 +1775,7 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], // FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; -def int_amdgcn_endpgm : GCCBuiltin<"__builtin_amdgcn_endpgm">, +def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects] >; @@ -1683,13 +1814,13 @@ def int_amdgcn_set_inactive : [IntrNoMem, IntrConvergent, IntrWillReturn]>; // Return if the given flat pointer points to a local memory address. -def int_amdgcn_is_shared : GCCBuiltin<"__builtin_amdgcn_is_shared">, +def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture>, IntrWillReturn] >; // Return if the given flat pointer points to a private memory address. -def int_amdgcn_is_private : GCCBuiltin<"__builtin_amdgcn_is_private">, +def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture>, IntrWillReturn] >; @@ -1699,11 +1830,11 @@ def int_amdgcn_is_private : GCCBuiltin<"__builtin_amdgcn_is_private">, //===----------------------------------------------------------------------===// def int_amdgcn_s_dcache_inv_vol : - GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, + ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_buffer_wbinvl1_vol : - GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, + ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; //===----------------------------------------------------------------------===// @@ -1732,48 +1863,67 @@ def int_amdgcn_update_dpp : ImmArg>, ImmArg>]>; def int_amdgcn_s_dcache_wb : - GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">, + ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_dcache_wb_vol : - GCCBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, + ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_memrealtime : - GCCBuiltin<"__builtin_amdgcn_s_memrealtime">, + ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; // llvm.amdgcn.ds.permute def int_amdgcn_ds_permute : - GCCBuiltin<"__builtin_amdgcn_ds_permute">, + ClangBuiltin<"__builtin_amdgcn_ds_permute">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; // llvm.amdgcn.ds.bpermute def int_amdgcn_ds_bpermute : - GCCBuiltin<"__builtin_amdgcn_ds_bpermute">, + ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; // llvm.amdgcn.perm def int_amdgcn_perm : - GCCBuiltin<"__builtin_amdgcn_perm">, + ClangBuiltin<"__builtin_amdgcn_perm">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +//===----------------------------------------------------------------------===// +// GFX9 Intrinsics +//===----------------------------------------------------------------------===// + +class AMDGPUGlobalLoadLDS : Intrinsic < + [], + [LLVMQualPointerType, // Base global pointer to load from + LLVMQualPointerType, // LDS base pointer to store to + llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // imm offset (applied to both global and LDS address) + llvm_i32_ty], //
auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, + // bit 1 = slc/sc1, + // bit 2 = dlc on gfx10+, + // bit 4 = scc/nt on gfx90a+)) + [IntrWillReturn, NoCapture>, NoCapture>, + ImmArg>, ImmArg>, ImmArg>, ImmArg>], + "", [SDNPMemOperand]>; +def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; + //===----------------------------------------------------------------------===// // GFX10 Intrinsics //===----------------------------------------------------------------------===// // llvm.amdgcn.permlane16 -def int_amdgcn_permlane16 : GCCBuiltin<"__builtin_amdgcn_permlane16">, +def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>]>; // llvm.amdgcn.permlanex16 -def int_amdgcn_permlanex16 : GCCBuiltin<"__builtin_amdgcn_permlanex16">, +def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, @@ -1789,9 +1939,9 @@ def int_amdgcn_mov_dpp8 : ImmArg>]>; def int_amdgcn_s_get_waveid_in_workgroup : - GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, + ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, Intrinsic<[llvm_i32_ty], [], - [IntrReadMem, IntrInaccessibleMemOnly, IntrWillReturn]>; + [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; class AMDGPUGlobalAtomicRtn : Intrinsic < [vt], @@ -1812,6 +1962,75 @@ def int_amdgcn_image_bvh_intersect_ray : LLVMMatchType<1>, llvm_v4i32_ty], [IntrReadMem, IntrWillReturn]>; +//===----------------------------------------------------------------------===// +// GFX11 Intrinsics +//===----------------------------------------------------------------------===// + +// llvm.amdgcn.permlane64 +def int_amdgcn_permlane64 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn]>; + +def int_amdgcn_ds_add_gs_reg_rtn : + ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, + Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], + [ImmArg>, IntrHasSideEffects, IntrWillReturn]>; + +def int_amdgcn_ds_sub_gs_reg_rtn : + ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, + Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], + [ImmArg>, IntrHasSideEffects, IntrWillReturn]>; + +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C.
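As a scalar reference for the D = A * B + C semantics just stated, the following minimal C sketch spells out the computation for the f32 16x16x16 variant. It is illustrative only: it assumes square M = N = K = 16 tiles and plain float arithmetic, and it ignores how the real instruction distributes the tile across the lanes of a wave.

    /* Scalar model of D = A * B + C for one 16x16x16 WMMA tile (sketch only). */
    enum { M = 16, N = 16, K = 16 };

    static void wmma_ref(const float A[M][K], const float B[K][N],
                         const float C[M][N], float D[M][N]) {
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) {
          float acc = C[i][j];          /* start from the accumulator tile */
          for (int k = 0; k < K; ++k)
            acc += A[i][k] * B[k][j];   /* multiply-accumulate */
          D[i][j] = acc;
        }
    }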
+ +class AMDGPUWmmaIntrinsic : + Intrinsic< + [CD], // %D + [ + AB, // %A + AB, // %B + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUWmmaIntrinsicOPSEL : + Intrinsic< + [CD], // %D + [ + AB, // %A + AB, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // %high + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>] +>; + +class AMDGPUWmmaIntrinsicIU : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + AB, // %A + llvm_i1_ty, // %B_sign + AB, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// @@ -1819,7 +2038,7 @@ def int_amdgcn_image_bvh_intersect_ray : // f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_fdot2 : - GCCBuiltin<"__builtin_amdgcn_fdot2">, + ClangBuiltin<"__builtin_amdgcn_fdot2">, Intrinsic< [llvm_float_ty], // %r [ @@ -1831,10 +2050,53 @@ def int_amdgcn_fdot2 : [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_f16_f16 : + ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, + Intrinsic< + [llvm_half_ty], // %r + [ + llvm_v2f16_ty, // %a + llvm_v2f16_ty, // %b + llvm_half_ty // %c + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn] + >; + +// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_bf16_bf16 : + ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, + Intrinsic< + [llvm_i16_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_i16_ty // %c + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn] + >; + +// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_f32_bf16 : + ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, + Intrinsic< + [llvm_float_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_float_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] + >; + // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_sdot2 : - GCCBuiltin<"__builtin_amdgcn_sdot2">, + ClangBuiltin<"__builtin_amdgcn_sdot2">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1849,7 +2111,7 @@ def int_amdgcn_sdot2 : // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_udot2 : - GCCBuiltin<"__builtin_amdgcn_udot2">, + ClangBuiltin<"__builtin_amdgcn_udot2">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1864,7 +2126,7 @@ def int_amdgcn_udot2 : // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c def 
int_amdgcn_sdot4 : - GCCBuiltin<"__builtin_amdgcn_sdot4">, + ClangBuiltin<"__builtin_amdgcn_sdot4">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1879,7 +2141,7 @@ // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c def int_amdgcn_udot4 : - GCCBuiltin<"__builtin_amdgcn_udot4">, + ClangBuiltin<"__builtin_amdgcn_udot4">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1891,11 +2153,32 @@ def int_amdgcn_udot4 : [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) +// Treat input as signed (_sign = 1) or unsigned (_sign = 0). +// a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); +// b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c +def int_amdgcn_sudot4 : + ClangBuiltin<"__builtin_amdgcn_sudot4">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i1_ty, // %a_sign + llvm_i32_ty, // %a + llvm_i1_ty, // %b_sign + llvm_i32_ty, // %b + llvm_i32_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>, ImmArg>, ImmArg>] + >; + // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c def int_amdgcn_sdot8 : - GCCBuiltin<"__builtin_amdgcn_sdot8">, + ClangBuiltin<"__builtin_amdgcn_sdot8">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1911,7 +2194,7 @@ def int_amdgcn_sdot8 : // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c def int_amdgcn_udot8 : - GCCBuiltin<"__builtin_amdgcn_udot8">, + ClangBuiltin<"__builtin_amdgcn_udot8">, Intrinsic< [llvm_i32_ty], // %r [ @@ -1923,6 +2206,28 @@ def int_amdgcn_udot8 : [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) +// Treat input as signed (_sign = 1) or unsigned (_sign = 0). +// a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); +// b[i in 0...7] = (%b_sign ?
b.i4[i] : promoteToSigned(b.u4[i])); +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + +// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c + def int_amdgcn_sudot8 : + ClangBuiltin<"__builtin_amdgcn_sudot8">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i1_ty, // %a_sign + llvm_i32_ty, // %a + llvm_i1_ty, // %b_sign + llvm_i32_ty, // %b + llvm_i32_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>, ImmArg>, ImmArg>] + >; + //===----------------------------------------------------------------------===// // gfx908 intrinsics // ===----------------------------------------------------------------------===// @@ -1931,7 +2236,7 @@ def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn; // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp class AMDGPUMfmaIntrinsic : - GCCBuiltin, + ClangBuiltin, Intrinsic<[DestTy], [SrcABTy, SrcABTy, DestTy, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], @@ -1975,9 +2280,46 @@ def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic; +// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. +// Three bits corresponding to the neg modifier applied to the respective +// source operand. def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic; +//===----------------------------------------------------------------------===// +// gfx940 intrinsics +// ===----------------------------------------------------------------------===// + +// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. +def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn; +def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn; +def int_amdgcn_ds_fadd_v2bf16 : Intrinsic< + [llvm_v2i16_ty], + [LLVMQualPointerType, llvm_v2i16_ty], + [IntrArgMemOnly, IntrWillReturn, NoCapture>]>, + ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; + +def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic; + +// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid +class AMDGPUMSmfmacIntrinsic : + ClangBuiltin, + Intrinsic<[DestTy], + [SrcA, SrcB, DestTy, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, IntrWillReturn, + ImmArg>, ImmArg>]>; + +def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index a42484757592..3d905dbca6b9 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -22,199 +22,199 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". 
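The ARM definitions that follow are again mechanical GCCBuiltin-to-ClangBuiltin renames. As a reminder of what the smul{b,t}{b,t} family defined just below computes (B selects the bottom signed halfword of an operand, T the top), here is a minimal C sketch of the reference semantics; the helper names are illustrative and not part of the ACLE:

    #include <stdint.h>

    /* Select the bottom (b) or top (t) signed halfword of a 32-bit value. */
    static int32_t half_b(int32_t x) { return (int16_t)x; }
    static int32_t half_t(int32_t x) { return (int16_t)(x >> 16); }

    /* SMULBB/SMULBT/SMULTB/SMULTT: signed 16 x 16 -> 32-bit multiply of the
     * selected halfwords (reference model only). */
    static int32_t smulbb_ref(int32_t a, int32_t b) { return half_b(a) * half_b(b); }
    static int32_t smulbt_ref(int32_t a, int32_t b) { return half_b(a) * half_t(b); }
    static int32_t smultb_ref(int32_t a, int32_t b) { return half_t(a) * half_b(b); }
    static int32_t smultt_ref(int32_t a, int32_t b) { return half_t(a) * half_t(b); }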
def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg>]>; // 16-bit multiplications -def int_arm_smulbb : GCCBuiltin<"__builtin_arm_smulbb">, +def int_arm_smulbb : ClangBuiltin<"__builtin_arm_smulbb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smulbt : GCCBuiltin<"__builtin_arm_smulbt">, +def int_arm_smulbt : ClangBuiltin<"__builtin_arm_smulbt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smultb : GCCBuiltin<"__builtin_arm_smultb">, +def int_arm_smultb : ClangBuiltin<"__builtin_arm_smultb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smultt : GCCBuiltin<"__builtin_arm_smultt">, +def int_arm_smultt : ClangBuiltin<"__builtin_arm_smultt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smulwb : GCCBuiltin<"__builtin_arm_smulwb">, +def int_arm_smulwb : ClangBuiltin<"__builtin_arm_smulwb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smulwt : GCCBuiltin<"__builtin_arm_smulwt">, +def int_arm_smulwt : ClangBuiltin<"__builtin_arm_smulwt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Saturating Arithmetic -def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">, +def int_arm_qadd : ClangBuiltin<"__builtin_arm_qadd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative, IntrNoMem]>; -def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">, +def int_arm_qsub : ClangBuiltin<"__builtin_arm_qsub">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">, +def int_arm_ssat : ClangBuiltin<"__builtin_arm_ssat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">, +def int_arm_usat : ClangBuiltin<"__builtin_arm_usat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Accumulating multiplications -def int_arm_smlabb : GCCBuiltin<"__builtin_arm_smlabb">, +def int_arm_smlabb : ClangBuiltin<"__builtin_arm_smlabb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlabt : GCCBuiltin<"__builtin_arm_smlabt">, +def int_arm_smlabt : ClangBuiltin<"__builtin_arm_smlabt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlatb : GCCBuiltin<"__builtin_arm_smlatb">, +def int_arm_smlatb : ClangBuiltin<"__builtin_arm_smlatb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlatt : GCCBuiltin<"__builtin_arm_smlatt">, +def int_arm_smlatt : ClangBuiltin<"__builtin_arm_smlatt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlawb : GCCBuiltin<"__builtin_arm_smlawb">, +def int_arm_smlawb : ClangBuiltin<"__builtin_arm_smlawb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlawt : GCCBuiltin<"__builtin_arm_smlawt">, +def int_arm_smlawt : ClangBuiltin<"__builtin_arm_smlawt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit saturation -def int_arm_ssat16 : GCCBuiltin<"__builtin_arm_ssat16">, +def int_arm_ssat16 : ClangBuiltin<"__builtin_arm_ssat16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_usat16 : GCCBuiltin<"__builtin_arm_usat16">, +def 
int_arm_usat16 : ClangBuiltin<"__builtin_arm_usat16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Packing and unpacking -def int_arm_sxtab16 : GCCBuiltin<"__builtin_arm_sxtab16">, +def int_arm_sxtab16 : ClangBuiltin<"__builtin_arm_sxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_sxtb16 : GCCBuiltin<"__builtin_arm_sxtb16">, +def int_arm_sxtb16 : ClangBuiltin<"__builtin_arm_sxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_uxtab16 : GCCBuiltin<"__builtin_arm_uxtab16">, +def int_arm_uxtab16 : ClangBuiltin<"__builtin_arm_uxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uxtb16 : GCCBuiltin<"__builtin_arm_uxtb16">, +def int_arm_uxtb16 : ClangBuiltin<"__builtin_arm_uxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; // Parallel selection, reads the GE flags. -def int_arm_sel : GCCBuiltin<"__builtin_arm_sel">, +def int_arm_sel : ClangBuiltin<"__builtin_arm_sel">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; // Parallel 8-bit addition and subtraction -def int_arm_qadd8 : GCCBuiltin<"__builtin_arm_qadd8">, +def int_arm_qadd8 : ClangBuiltin<"__builtin_arm_qadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_qsub8 : GCCBuiltin<"__builtin_arm_qsub8">, +def int_arm_qsub8 : ClangBuiltin<"__builtin_arm_qsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. -def int_arm_sadd8 : GCCBuiltin<"__builtin_arm_sadd8">, +def int_arm_sadd8 : ClangBuiltin<"__builtin_arm_sadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; -def int_arm_shadd8 : GCCBuiltin<"__builtin_arm_shadd8">, +def int_arm_shadd8 : ClangBuiltin<"__builtin_arm_shadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_shsub8 : GCCBuiltin<"__builtin_arm_shsub8">, +def int_arm_shsub8 : ClangBuiltin<"__builtin_arm_shsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. -def int_arm_ssub8 : GCCBuiltin<"__builtin_arm_ssub8">, +def int_arm_ssub8 : ClangBuiltin<"__builtin_arm_ssub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. -def int_arm_uadd8 : GCCBuiltin<"__builtin_arm_uadd8">, +def int_arm_uadd8 : ClangBuiltin<"__builtin_arm_uadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; -def int_arm_uhadd8 : GCCBuiltin<"__builtin_arm_uhadd8">, +def int_arm_uhadd8 : ClangBuiltin<"__builtin_arm_uhadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uhsub8 : GCCBuiltin<"__builtin_arm_uhsub8">, +def int_arm_uhsub8 : ClangBuiltin<"__builtin_arm_uhsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqadd8 : GCCBuiltin<"__builtin_arm_uqadd8">, +def int_arm_uqadd8 : ClangBuiltin<"__builtin_arm_uqadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqsub8 : GCCBuiltin<"__builtin_arm_uqsub8">, +def int_arm_uqsub8 : ClangBuiltin<"__builtin_arm_uqsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. 
-def int_arm_usub8 : GCCBuiltin<"__builtin_arm_usub8">, +def int_arm_usub8 : ClangBuiltin<"__builtin_arm_usub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Sum of 8-bit absolute differences -def int_arm_usad8 : GCCBuiltin<"__builtin_arm_usad8">, +def int_arm_usad8 : ClangBuiltin<"__builtin_arm_usad8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_usada8 : GCCBuiltin<"__builtin_arm_usada8">, +def int_arm_usada8 : ClangBuiltin<"__builtin_arm_usada8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit addition and subtraction -def int_arm_qadd16 : GCCBuiltin<"__builtin_arm_qadd16">, +def int_arm_qadd16 : ClangBuiltin<"__builtin_arm_qadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_qasx : GCCBuiltin<"__builtin_arm_qasx">, +def int_arm_qasx : ClangBuiltin<"__builtin_arm_qasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_qsax : GCCBuiltin<"__builtin_arm_qsax">, +def int_arm_qsax : ClangBuiltin<"__builtin_arm_qsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_qsub16 : GCCBuiltin<"__builtin_arm_qsub16">, +def int_arm_qsub16 : ClangBuiltin<"__builtin_arm_qsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. -def int_arm_sadd16 : GCCBuiltin<"__builtin_arm_sadd16">, +def int_arm_sadd16 : ClangBuiltin<"__builtin_arm_sadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. -def int_arm_sasx : GCCBuiltin<"__builtin_arm_sasx">, +def int_arm_sasx : ClangBuiltin<"__builtin_arm_sasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; -def int_arm_shadd16 : GCCBuiltin<"__builtin_arm_shadd16">, +def int_arm_shadd16 : ClangBuiltin<"__builtin_arm_shadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_shasx : GCCBuiltin<"__builtin_arm_shasx">, +def int_arm_shasx : ClangBuiltin<"__builtin_arm_shasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_shsax : GCCBuiltin<"__builtin_arm_shsax">, +def int_arm_shsax : ClangBuiltin<"__builtin_arm_shsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_shsub16 : GCCBuiltin<"__builtin_arm_shsub16">, +def int_arm_shsub16 : ClangBuiltin<"__builtin_arm_shsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. -def int_arm_ssax : GCCBuiltin<"__builtin_arm_ssax">, +def int_arm_ssax : ClangBuiltin<"__builtin_arm_ssax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. -def int_arm_ssub16 : GCCBuiltin<"__builtin_arm_ssub16">, +def int_arm_ssub16 : ClangBuiltin<"__builtin_arm_ssub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. -def int_arm_uadd16 : GCCBuiltin<"__builtin_arm_uadd16">, +def int_arm_uadd16 : ClangBuiltin<"__builtin_arm_uadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. 
-def int_arm_uasx : GCCBuiltin<"__builtin_arm_uasx">, +def int_arm_uasx : ClangBuiltin<"__builtin_arm_uasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; -def int_arm_uhadd16 : GCCBuiltin<"__builtin_arm_uhadd16">, +def int_arm_uhadd16 : ClangBuiltin<"__builtin_arm_uhadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uhasx : GCCBuiltin<"__builtin_arm_uhasx">, +def int_arm_uhasx : ClangBuiltin<"__builtin_arm_uhasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uhsax : GCCBuiltin<"__builtin_arm_uhsax">, +def int_arm_uhsax : ClangBuiltin<"__builtin_arm_uhsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uhsub16 : GCCBuiltin<"__builtin_arm_uhsub16">, +def int_arm_uhsub16 : ClangBuiltin<"__builtin_arm_uhsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqadd16 : GCCBuiltin<"__builtin_arm_uqadd16">, +def int_arm_uqadd16 : ClangBuiltin<"__builtin_arm_uqadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqasx : GCCBuiltin<"__builtin_arm_uqasx">, +def int_arm_uqasx : ClangBuiltin<"__builtin_arm_uqasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqsax : GCCBuiltin<"__builtin_arm_uqsax">, +def int_arm_uqsax : ClangBuiltin<"__builtin_arm_uqsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_uqsub16 : GCCBuiltin<"__builtin_arm_uqsub16">, +def int_arm_uqsub16 : ClangBuiltin<"__builtin_arm_uqsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. -def int_arm_usax : GCCBuiltin<"__builtin_arm_usax">, +def int_arm_usax : ClangBuiltin<"__builtin_arm_usax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. 
-def int_arm_usub16 : GCCBuiltin<"__builtin_arm_usub16">, +def int_arm_usub16 : ClangBuiltin<"__builtin_arm_usub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Parallel 16-bit multiplication -def int_arm_smlad : GCCBuiltin<"__builtin_arm_smlad">, +def int_arm_smlad : ClangBuiltin<"__builtin_arm_smlad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smladx : GCCBuiltin<"__builtin_arm_smladx">, +def int_arm_smladx : ClangBuiltin<"__builtin_arm_smladx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlald : GCCBuiltin<"__builtin_arm_smlald">, +def int_arm_smlald : ClangBuiltin<"__builtin_arm_smlald">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; -def int_arm_smlaldx : GCCBuiltin<"__builtin_arm_smlaldx">, +def int_arm_smlaldx : ClangBuiltin<"__builtin_arm_smlaldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; -def int_arm_smlsd : GCCBuiltin<"__builtin_arm_smlsd">, +def int_arm_smlsd : ClangBuiltin<"__builtin_arm_smlsd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlsdx : GCCBuiltin<"__builtin_arm_smlsdx">, +def int_arm_smlsdx : ClangBuiltin<"__builtin_arm_smlsdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smlsld : GCCBuiltin<"__builtin_arm_smlsld">, +def int_arm_smlsld : ClangBuiltin<"__builtin_arm_smlsld">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; -def int_arm_smlsldx : GCCBuiltin<"__builtin_arm_smlsldx">, +def int_arm_smlsldx : ClangBuiltin<"__builtin_arm_smlsldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; -def int_arm_smuad : GCCBuiltin<"__builtin_arm_smuad">, +def int_arm_smuad : ClangBuiltin<"__builtin_arm_smuad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smuadx : GCCBuiltin<"__builtin_arm_smuadx">, +def int_arm_smuadx : ClangBuiltin<"__builtin_arm_smuadx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smusd : GCCBuiltin<"__builtin_arm_smusd">, +def int_arm_smusd : ClangBuiltin<"__builtin_arm_smusd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_smusdx : GCCBuiltin<"__builtin_arm_smusdx">, +def int_arm_smusdx : ClangBuiltin<"__builtin_arm_smusdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; @@ -239,19 +239,19 @@ def int_arm_ldaexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>; //===----------------------------------------------------------------------===// // Data barrier instructions -def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, +def int_arm_dmb : ClangBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intrinsic<[], [llvm_i32_ty]>; -def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, +def int_arm_dsb : ClangBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>; -def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, +def int_arm_isb : ClangBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // VFP -def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">, +def int_arm_get_fpscr : ClangBuiltin<"__builtin_arm_get_fpscr">, Intrinsic<[llvm_i32_ty], [], []>; -def int_arm_set_fpscr : 
GCCBuiltin<"__builtin_arm_set_fpscr">, +def int_arm_set_fpscr : ClangBuiltin<"__builtin_arm_set_fpscr">, Intrinsic<[], [llvm_i32_ty], []>; def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], [IntrNoMem]>; @@ -261,47 +261,47 @@ def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], //===----------------------------------------------------------------------===// // Coprocessor -def int_arm_ldc : GCCBuiltin<"__builtin_arm_ldc">, +def int_arm_ldc : ClangBuiltin<"__builtin_arm_ldc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_ldcl : GCCBuiltin<"__builtin_arm_ldcl">, +def int_arm_ldcl : ClangBuiltin<"__builtin_arm_ldcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_ldc2 : GCCBuiltin<"__builtin_arm_ldc2">, +def int_arm_ldc2 : ClangBuiltin<"__builtin_arm_ldc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_ldc2l : GCCBuiltin<"__builtin_arm_ldc2l">, +def int_arm_ldc2l : ClangBuiltin<"__builtin_arm_ldc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_stc : GCCBuiltin<"__builtin_arm_stc">, +def int_arm_stc : ClangBuiltin<"__builtin_arm_stc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_stcl : GCCBuiltin<"__builtin_arm_stcl">, +def int_arm_stcl : ClangBuiltin<"__builtin_arm_stcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_stc2 : GCCBuiltin<"__builtin_arm_stc2">, +def int_arm_stc2 : ClangBuiltin<"__builtin_arm_stc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; -def int_arm_stc2l : GCCBuiltin<"__builtin_arm_stc2l">, +def int_arm_stc2l : ClangBuiltin<"__builtin_arm_stc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg>, ImmArg>]>; // Move to coprocessor -def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">, +def int_arm_mcr : ClangBuiltin<"__builtin_arm_mcr">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>]>; -def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">, +def int_arm_mcr2 : ClangBuiltin<"__builtin_arm_mcr2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>]>; // Move from coprocessor -def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">, +def int_arm_mrc : ClangBuiltin<"__builtin_arm_mrc">, MSBuiltin<"_MoveFromCoprocessor">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>]>; -def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">, +def int_arm_mrc2 : ClangBuiltin<"__builtin_arm_mrc2">, MSBuiltin<"_MoveFromCoprocessor2">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>]>; // Coprocessor data processing -def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">, +def int_arm_cdp : ClangBuiltin<"__builtin_arm_cdp">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>]>; -def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">, +def int_arm_cdp2 : ClangBuiltin<"__builtin_arm_cdp2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, 
ImmArg>]>; @@ -335,13 +335,13 @@ def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], //===----------------------------------------------------------------------===// // CMSE -def int_arm_cmse_tt : GCCBuiltin<"__builtin_arm_cmse_TT">, +def int_arm_cmse_tt : ClangBuiltin<"__builtin_arm_cmse_TT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; -def int_arm_cmse_ttt : GCCBuiltin<"__builtin_arm_cmse_TTT">, +def int_arm_cmse_ttt : ClangBuiltin<"__builtin_arm_cmse_TTT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; -def int_arm_cmse_tta : GCCBuiltin<"__builtin_arm_cmse_TTA">, +def int_arm_cmse_tta : ClangBuiltin<"__builtin_arm_cmse_TTA">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; -def int_arm_cmse_ttat : GCCBuiltin<"__builtin_arm_cmse_TTAT">, +def int_arm_cmse_ttat : ClangBuiltin<"__builtin_arm_cmse_TTAT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// @@ -1158,7 +1158,7 @@ defm int_arm_mve_vabav: MVEPredicated< [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty>; -// The following 3 instrinsics are MVE vector reductions with two vector +// The following 3 intrinsics are MVE vector reductions with two vector // operands. // The first 3 operands are boolean flags (must be compile-time constants): // * unsigned - the instruction operates on vectors of unsigned values and diff --git a/llvm/include/llvm/IR/IntrinsicsBPF.td b/llvm/include/llvm/IR/IntrinsicsBPF.td index a6bd6f841aab..8916b60d2be3 100644 --- a/llvm/include/llvm/IR/IntrinsicsBPF.td +++ b/llvm/include/llvm/IR/IntrinsicsBPF.td @@ -12,29 +12,29 @@ // Specialized loads from packet let TargetPrefix = "bpf" in { // All intrinsics start with "llvm.bpf." 
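The BPF renames below follow the same pattern. For context, the llvm.bpf.load.{byte,half,word} intrinsics model the classic absolute packet loads: read 1, 2, or 4 bytes at an offset into the packet and zero-extend the value, which is in network byte order, to 64 bits. A minimal sketch of that semantics, assuming an illustrative packet struct that is not any real kernel API:

    #include <stdint.h>

    /* Illustrative packet view; real BPF programs receive a context pointer. */
    struct pkt { const uint8_t *data; uint64_t len; };

    /* Reference model of llvm.bpf.load.word: 4 bytes at 'off', read in network
     * byte order and zero-extended. The real instruction aborts the program on
     * an out-of-bounds access; this sketch just returns 0. */
    static uint64_t bpf_load_word_ref(const struct pkt *p, uint64_t off) {
      if (off + 4 > p->len)
        return 0;
      return ((uint64_t)p->data[off] << 24) | ((uint64_t)p->data[off + 1] << 16) |
             ((uint64_t)p->data[off + 2] << 8) | (uint64_t)p->data[off + 3];
    }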
- def int_bpf_load_byte : GCCBuiltin<"__builtin_bpf_load_byte">, + def int_bpf_load_byte : ClangBuiltin<"__builtin_bpf_load_byte">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>; - def int_bpf_load_half : GCCBuiltin<"__builtin_bpf_load_half">, + def int_bpf_load_half : ClangBuiltin<"__builtin_bpf_load_half">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>; - def int_bpf_load_word : GCCBuiltin<"__builtin_bpf_load_word">, + def int_bpf_load_word : ClangBuiltin<"__builtin_bpf_load_word">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>; - def int_bpf_pseudo : GCCBuiltin<"__builtin_bpf_pseudo">, + def int_bpf_pseudo : ClangBuiltin<"__builtin_bpf_pseudo">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]>; - def int_bpf_preserve_field_info : GCCBuiltin<"__builtin_bpf_preserve_field_info">, + def int_bpf_preserve_field_info : ClangBuiltin<"__builtin_bpf_preserve_field_info">, Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i64_ty], [IntrNoMem, ImmArg>]>; - def int_bpf_btf_type_id : GCCBuiltin<"__builtin_bpf_btf_type_id">, + def int_bpf_btf_type_id : ClangBuiltin<"__builtin_bpf_btf_type_id">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; - def int_bpf_preserve_type_info : GCCBuiltin<"__builtin_bpf_preserve_type_info">, + def int_bpf_preserve_type_info : ClangBuiltin<"__builtin_bpf_preserve_type_info">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; - def int_bpf_preserve_enum_value : GCCBuiltin<"__builtin_bpf_preserve_enum_value">, + def int_bpf_preserve_enum_value : ClangBuiltin<"__builtin_bpf_preserve_enum_value">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_ptr_ty, llvm_i64_ty], [IntrNoMem]>; - def int_bpf_passthrough : GCCBuiltin<"__builtin_bpf_passthrough">, + def int_bpf_passthrough : ClangBuiltin<"__builtin_bpf_passthrough">, Intrinsic<[llvm_any_ty], [llvm_i32_ty, llvm_any_ty], [IntrNoMem]>; - def int_bpf_compare : GCCBuiltin<"__builtin_bpf_compare">, + def int_bpf_compare : ClangBuiltin<"__builtin_bpf_compare">, Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_anyint_ty, llvm_anyint_ty], [IntrNoMem]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td new file mode 100644 index 000000000000..4a21cf1eb7fc --- /dev/null +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -0,0 +1,20 @@ +//===- IntrinsicsDirectX.td - Defines DirectX intrinsics ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the DirectX-specific intrinsics. 
+// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "dxil" in { + +def int_dxil_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; +def int_dxil_group_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; +def int_dxil_thread_id_in_group : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; +def int_dxil_flattened_thread_id_in_group : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrWillReturn]>; + +} diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td index 212262c28706..52c29ef31f0a 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagon.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td @@ -18,7 +18,7 @@ let TargetPrefix = "hexagon" in { class Hexagon_Intrinsic ret_types, list param_types, list properties> - : GCCBuiltin, + : ClangBuiltin, Intrinsic; /// Hexagon_NonGCC_Intrinsic - Base class for bitcode convertible Hexagon @@ -404,4 +404,15 @@ def int_hexagon_V6_vmaskedstorenq_128B: Hexagon_custom_vms_Intrinsic_128B; def int_hexagon_V6_vmaskedstorentq_128B: Hexagon_custom_vms_Intrinsic_128B; def int_hexagon_V6_vmaskedstorentnq_128B: Hexagon_custom_vms_Intrinsic_128B; + +// Intrinsic for instrumentation based profiling using a custom handler. The +// name of the handler is passed as the first operand to the intrinsic. The +// handler can take only one int32 input which is passed as the second +// operand to the intrinsic. +def int_hexagon_instrprof_custom + : Hexagon_NonGCC_Intrinsic<[], + [llvm_ptr_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly]>; + + include "llvm/IR/IntrinsicsHexagonDep.td" diff --git a/llvm/include/llvm/IR/IntrinsicsMips.td b/llvm/include/llvm/IR/IntrinsicsMips.td index 271142ca7788..3056f37b9d87 100644 --- a/llvm/include/llvm/IR/IntrinsicsMips.td +++ b/llvm/include/llvm/IR/IntrinsicsMips.td @@ -24,370 +24,370 @@ let TargetPrefix = "mips" in { // All intrinsics start with "llvm.mips.". 
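The MIPS DSP block below is one more pure rename. As a reference for the naming convention in these definitions (the _qb suffix means four packed unsigned bytes, and an extra _s_ means each lane saturates instead of wrapping), here is a minimal per-lane C sketch contrasting addu.qb with addu_s.qb:

    #include <stdint.h>

    /* Per-lane model of ADDU.QB (wrapping) and ADDU_S.QB (saturating) on four
     * packed unsigned bytes (sketch only). */
    static void addu_qb_ref(const uint8_t a[4], const uint8_t b[4], uint8_t r[4]) {
      for (int i = 0; i < 4; ++i)
        r[i] = (uint8_t)(a[i] + b[i]);            /* wraps modulo 256 */
    }

    static void addu_s_qb_ref(const uint8_t a[4], const uint8_t b[4], uint8_t r[4]) {
      for (int i = 0; i < 4; ++i) {
        unsigned sum = (unsigned)a[i] + b[i];
        r[i] = (uint8_t)(sum > 255 ? 255 : sum);  /* clamps at 255 */
      }
    }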
//===----------------------------------------------------------------------===// // Addition/subtraction -def int_mips_addu_qb : GCCBuiltin<"__builtin_mips_addu_qb">, +def int_mips_addu_qb : ClangBuiltin<"__builtin_mips_addu_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative, IntrNoMem]>; -def int_mips_addu_s_qb : GCCBuiltin<"__builtin_mips_addu_s_qb">, +def int_mips_addu_s_qb : ClangBuiltin<"__builtin_mips_addu_s_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative, IntrNoMem]>; -def int_mips_subu_qb : GCCBuiltin<"__builtin_mips_subu_qb">, +def int_mips_subu_qb : ClangBuiltin<"__builtin_mips_subu_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_subu_s_qb : GCCBuiltin<"__builtin_mips_subu_s_qb">, +def int_mips_subu_s_qb : ClangBuiltin<"__builtin_mips_subu_s_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_addq_ph : GCCBuiltin<"__builtin_mips_addq_ph">, +def int_mips_addq_ph : ClangBuiltin<"__builtin_mips_addq_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative, IntrNoMem]>; -def int_mips_addq_s_ph : GCCBuiltin<"__builtin_mips_addq_s_ph">, +def int_mips_addq_s_ph : ClangBuiltin<"__builtin_mips_addq_s_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative, IntrNoMem]>; -def int_mips_subq_ph : GCCBuiltin<"__builtin_mips_subq_ph">, +def int_mips_subq_ph : ClangBuiltin<"__builtin_mips_subq_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_subq_s_ph : GCCBuiltin<"__builtin_mips_subq_s_ph">, +def int_mips_subq_s_ph : ClangBuiltin<"__builtin_mips_subq_s_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_madd: GCCBuiltin<"__builtin_mips_madd">, +def int_mips_madd: ClangBuiltin<"__builtin_mips_madd">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; -def int_mips_maddu: GCCBuiltin<"__builtin_mips_maddu">, +def int_mips_maddu: ClangBuiltin<"__builtin_mips_maddu">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; -def int_mips_msub: GCCBuiltin<"__builtin_mips_msub">, +def int_mips_msub: ClangBuiltin<"__builtin_mips_msub">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_msubu: GCCBuiltin<"__builtin_mips_msubu">, +def int_mips_msubu: ClangBuiltin<"__builtin_mips_msubu">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_addq_s_w: GCCBuiltin<"__builtin_mips_addq_s_w">, +def int_mips_addq_s_w: ClangBuiltin<"__builtin_mips_addq_s_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>; -def int_mips_subq_s_w: GCCBuiltin<"__builtin_mips_subq_s_w">, +def int_mips_subq_s_w: ClangBuiltin<"__builtin_mips_subq_s_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], []>; -def int_mips_addsc: GCCBuiltin<"__builtin_mips_addsc">, +def int_mips_addsc: ClangBuiltin<"__builtin_mips_addsc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative]>; -def int_mips_addwc: GCCBuiltin<"__builtin_mips_addwc">, +def int_mips_addwc: ClangBuiltin<"__builtin_mips_addwc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative]>; -def int_mips_modsub: GCCBuiltin<"__builtin_mips_modsub">, +def int_mips_modsub: ClangBuiltin<"__builtin_mips_modsub">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_raddu_w_qb: 
GCCBuiltin<"__builtin_mips_raddu_w_qb">, +def int_mips_raddu_w_qb: ClangBuiltin<"__builtin_mips_raddu_w_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Absolute value -def int_mips_absq_s_ph: GCCBuiltin<"__builtin_mips_absq_s_ph">, +def int_mips_absq_s_ph: ClangBuiltin<"__builtin_mips_absq_s_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty], []>; -def int_mips_absq_s_w: GCCBuiltin<"__builtin_mips_absq_s_w">, +def int_mips_absq_s_w: ClangBuiltin<"__builtin_mips_absq_s_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty], []>; //===----------------------------------------------------------------------===// // Precision reduce/expand -def int_mips_precrq_qb_ph: GCCBuiltin<"__builtin_mips_precrq_qb_ph">, +def int_mips_precrq_qb_ph: ClangBuiltin<"__builtin_mips_precrq_qb_ph">, Intrinsic<[llvm_v4i8_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_precrqu_s_qb_ph: GCCBuiltin<"__builtin_mips_precrqu_s_qb_ph">, +def int_mips_precrqu_s_qb_ph: ClangBuiltin<"__builtin_mips_precrqu_s_qb_ph">, Intrinsic<[llvm_v4i8_ty], [mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_precrq_ph_w: GCCBuiltin<"__builtin_mips_precrq_ph_w">, +def int_mips_precrq_ph_w: ClangBuiltin<"__builtin_mips_precrq_ph_w">, Intrinsic<[mips_v2q15_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>; -def int_mips_precrq_rs_ph_w: GCCBuiltin<"__builtin_mips_precrq_rs_ph_w">, +def int_mips_precrq_rs_ph_w: ClangBuiltin<"__builtin_mips_precrq_rs_ph_w">, Intrinsic<[mips_v2q15_ty], [mips_q31_ty, mips_q31_ty], []>; -def int_mips_preceq_w_phl: GCCBuiltin<"__builtin_mips_preceq_w_phl">, +def int_mips_preceq_w_phl: ClangBuiltin<"__builtin_mips_preceq_w_phl">, Intrinsic<[mips_q31_ty], [mips_v2q15_ty], [IntrNoMem]>; -def int_mips_preceq_w_phr: GCCBuiltin<"__builtin_mips_preceq_w_phr">, +def int_mips_preceq_w_phr: ClangBuiltin<"__builtin_mips_preceq_w_phr">, Intrinsic<[mips_q31_ty], [mips_v2q15_ty], [IntrNoMem]>; -def int_mips_precequ_ph_qbl: GCCBuiltin<"__builtin_mips_precequ_ph_qbl">, +def int_mips_precequ_ph_qbl: ClangBuiltin<"__builtin_mips_precequ_ph_qbl">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_precequ_ph_qbr: GCCBuiltin<"__builtin_mips_precequ_ph_qbr">, +def int_mips_precequ_ph_qbr: ClangBuiltin<"__builtin_mips_precequ_ph_qbr">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_precequ_ph_qbla: GCCBuiltin<"__builtin_mips_precequ_ph_qbla">, +def int_mips_precequ_ph_qbla: ClangBuiltin<"__builtin_mips_precequ_ph_qbla">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_precequ_ph_qbra: GCCBuiltin<"__builtin_mips_precequ_ph_qbra">, +def int_mips_precequ_ph_qbra: ClangBuiltin<"__builtin_mips_precequ_ph_qbra">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_preceu_ph_qbl: GCCBuiltin<"__builtin_mips_preceu_ph_qbl">, +def int_mips_preceu_ph_qbl: ClangBuiltin<"__builtin_mips_preceu_ph_qbl">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_preceu_ph_qbr: GCCBuiltin<"__builtin_mips_preceu_ph_qbr">, +def int_mips_preceu_ph_qbr: ClangBuiltin<"__builtin_mips_preceu_ph_qbr">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_preceu_ph_qbla: GCCBuiltin<"__builtin_mips_preceu_ph_qbla">, +def int_mips_preceu_ph_qbla: ClangBuiltin<"__builtin_mips_preceu_ph_qbla">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_preceu_ph_qbra: GCCBuiltin<"__builtin_mips_preceu_ph_qbra">, +def int_mips_preceu_ph_qbra: 
ClangBuiltin<"__builtin_mips_preceu_ph_qbra">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Shift -def int_mips_shll_qb: GCCBuiltin<"__builtin_mips_shll_qb">, +def int_mips_shll_qb: ClangBuiltin<"__builtin_mips_shll_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], []>; -def int_mips_shrl_qb: GCCBuiltin<"__builtin_mips_shrl_qb">, +def int_mips_shrl_qb: ClangBuiltin<"__builtin_mips_shrl_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shll_ph: GCCBuiltin<"__builtin_mips_shll_ph">, +def int_mips_shll_ph: ClangBuiltin<"__builtin_mips_shll_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], []>; -def int_mips_shll_s_ph: GCCBuiltin<"__builtin_mips_shll_s_ph">, +def int_mips_shll_s_ph: ClangBuiltin<"__builtin_mips_shll_s_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], []>; -def int_mips_shra_ph: GCCBuiltin<"__builtin_mips_shra_ph">, +def int_mips_shra_ph: ClangBuiltin<"__builtin_mips_shra_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shra_r_ph: GCCBuiltin<"__builtin_mips_shra_r_ph">, +def int_mips_shra_r_ph: ClangBuiltin<"__builtin_mips_shra_r_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shll_s_w: GCCBuiltin<"__builtin_mips_shll_s_w">, +def int_mips_shll_s_w: ClangBuiltin<"__builtin_mips_shll_s_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, llvm_i32_ty], []>; -def int_mips_shra_r_w: GCCBuiltin<"__builtin_mips_shra_r_w">, +def int_mips_shra_r_w: ClangBuiltin<"__builtin_mips_shra_r_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shilo: GCCBuiltin<"__builtin_mips_shilo">, +def int_mips_shilo: ClangBuiltin<"__builtin_mips_shilo">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Multiplication -def int_mips_muleu_s_ph_qbl: GCCBuiltin<"__builtin_mips_muleu_s_ph_qbl">, +def int_mips_muleu_s_ph_qbl: ClangBuiltin<"__builtin_mips_muleu_s_ph_qbl">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty, mips_v2q15_ty], []>; -def int_mips_muleu_s_ph_qbr: GCCBuiltin<"__builtin_mips_muleu_s_ph_qbr">, +def int_mips_muleu_s_ph_qbr: ClangBuiltin<"__builtin_mips_muleu_s_ph_qbr">, Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty, mips_v2q15_ty], []>; -def int_mips_mulq_rs_ph: GCCBuiltin<"__builtin_mips_mulq_rs_ph">, +def int_mips_mulq_rs_ph: ClangBuiltin<"__builtin_mips_mulq_rs_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>; -def int_mips_muleq_s_w_phl: GCCBuiltin<"__builtin_mips_muleq_s_w_phl">, +def int_mips_muleq_s_w_phl: ClangBuiltin<"__builtin_mips_muleq_s_w_phl">, Intrinsic<[mips_q31_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>; -def int_mips_muleq_s_w_phr: GCCBuiltin<"__builtin_mips_muleq_s_w_phr">, +def int_mips_muleq_s_w_phr: ClangBuiltin<"__builtin_mips_muleq_s_w_phr">, Intrinsic<[mips_q31_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>; -def int_mips_mulsaq_s_w_ph: GCCBuiltin<"__builtin_mips_mulsaq_s_w_ph">, +def int_mips_mulsaq_s_w_ph: ClangBuiltin<"__builtin_mips_mulsaq_s_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_maq_s_w_phl: GCCBuiltin<"__builtin_mips_maq_s_w_phl">, +def int_mips_maq_s_w_phl: ClangBuiltin<"__builtin_mips_maq_s_w_phl">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def 
int_mips_maq_s_w_phr: GCCBuiltin<"__builtin_mips_maq_s_w_phr">, +def int_mips_maq_s_w_phr: ClangBuiltin<"__builtin_mips_maq_s_w_phr">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_maq_sa_w_phl: GCCBuiltin<"__builtin_mips_maq_sa_w_phl">, +def int_mips_maq_sa_w_phl: ClangBuiltin<"__builtin_mips_maq_sa_w_phl">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_maq_sa_w_phr: GCCBuiltin<"__builtin_mips_maq_sa_w_phr">, +def int_mips_maq_sa_w_phr: ClangBuiltin<"__builtin_mips_maq_sa_w_phr">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_mult: GCCBuiltin<"__builtin_mips_mult">, +def int_mips_mult: ClangBuiltin<"__builtin_mips_mult">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; -def int_mips_multu: GCCBuiltin<"__builtin_mips_multu">, +def int_mips_multu: ClangBuiltin<"__builtin_mips_multu">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; //===----------------------------------------------------------------------===// // Dot product with accumulate/subtract -def int_mips_dpau_h_qbl: GCCBuiltin<"__builtin_mips_dpau_h_qbl">, +def int_mips_dpau_h_qbl: ClangBuiltin<"__builtin_mips_dpau_h_qbl">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_dpau_h_qbr: GCCBuiltin<"__builtin_mips_dpau_h_qbr">, +def int_mips_dpau_h_qbr: ClangBuiltin<"__builtin_mips_dpau_h_qbr">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_dpsu_h_qbl: GCCBuiltin<"__builtin_mips_dpsu_h_qbl">, +def int_mips_dpsu_h_qbl: ClangBuiltin<"__builtin_mips_dpsu_h_qbl">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_dpsu_h_qbr: GCCBuiltin<"__builtin_mips_dpsu_h_qbr">, +def int_mips_dpsu_h_qbr: ClangBuiltin<"__builtin_mips_dpsu_h_qbr">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_dpaq_s_w_ph: GCCBuiltin<"__builtin_mips_dpaq_s_w_ph">, +def int_mips_dpaq_s_w_ph: ClangBuiltin<"__builtin_mips_dpaq_s_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_dpsq_s_w_ph: GCCBuiltin<"__builtin_mips_dpsq_s_w_ph">, +def int_mips_dpsq_s_w_ph: ClangBuiltin<"__builtin_mips_dpsq_s_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_dpaq_sa_l_w: GCCBuiltin<"__builtin_mips_dpaq_sa_l_w">, +def int_mips_dpaq_sa_l_w: ClangBuiltin<"__builtin_mips_dpaq_sa_l_w">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_q31_ty, mips_q31_ty], []>; -def int_mips_dpsq_sa_l_w: GCCBuiltin<"__builtin_mips_dpsq_sa_l_w">, +def int_mips_dpsq_sa_l_w: ClangBuiltin<"__builtin_mips_dpsq_sa_l_w">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_q31_ty, mips_q31_ty], []>; //===----------------------------------------------------------------------===// // Comparison -def int_mips_cmpu_eq_qb: GCCBuiltin<"__builtin_mips_cmpu_eq_qb">, +def int_mips_cmpu_eq_qb: ClangBuiltin<"__builtin_mips_cmpu_eq_qb">, Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>; -def int_mips_cmpu_lt_qb: GCCBuiltin<"__builtin_mips_cmpu_lt_qb">, +def int_mips_cmpu_lt_qb: ClangBuiltin<"__builtin_mips_cmpu_lt_qb">, Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def int_mips_cmpu_le_qb: GCCBuiltin<"__builtin_mips_cmpu_le_qb">, +def int_mips_cmpu_le_qb: ClangBuiltin<"__builtin_mips_cmpu_le_qb">, Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def 
int_mips_cmpgu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgu_eq_qb">, +def int_mips_cmpgu_eq_qb: ClangBuiltin<"__builtin_mips_cmpgu_eq_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>; -def int_mips_cmpgu_lt_qb: GCCBuiltin<"__builtin_mips_cmpgu_lt_qb">, +def int_mips_cmpgu_lt_qb: ClangBuiltin<"__builtin_mips_cmpgu_lt_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def int_mips_cmpgu_le_qb: GCCBuiltin<"__builtin_mips_cmpgu_le_qb">, +def int_mips_cmpgu_le_qb: ClangBuiltin<"__builtin_mips_cmpgu_le_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def int_mips_cmp_eq_ph: GCCBuiltin<"__builtin_mips_cmp_eq_ph">, +def int_mips_cmp_eq_ph: ClangBuiltin<"__builtin_mips_cmp_eq_ph">, Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>; -def int_mips_cmp_lt_ph: GCCBuiltin<"__builtin_mips_cmp_lt_ph">, +def int_mips_cmp_lt_ph: ClangBuiltin<"__builtin_mips_cmp_lt_ph">, Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_cmp_le_ph: GCCBuiltin<"__builtin_mips_cmp_le_ph">, +def int_mips_cmp_le_ph: ClangBuiltin<"__builtin_mips_cmp_le_ph">, Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], []>; //===----------------------------------------------------------------------===// // Extracting -def int_mips_extr_s_h: GCCBuiltin<"__builtin_mips_extr_s_h">, +def int_mips_extr_s_h: ClangBuiltin<"__builtin_mips_extr_s_h">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_extr_w: GCCBuiltin<"__builtin_mips_extr_w">, +def int_mips_extr_w: ClangBuiltin<"__builtin_mips_extr_w">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_extr_rs_w: GCCBuiltin<"__builtin_mips_extr_rs_w">, +def int_mips_extr_rs_w: ClangBuiltin<"__builtin_mips_extr_rs_w">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_extr_r_w: GCCBuiltin<"__builtin_mips_extr_r_w">, +def int_mips_extr_r_w: ClangBuiltin<"__builtin_mips_extr_r_w">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_extp: GCCBuiltin<"__builtin_mips_extp">, +def int_mips_extp: ClangBuiltin<"__builtin_mips_extp">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_extpdp: GCCBuiltin<"__builtin_mips_extpdp">, +def int_mips_extpdp: ClangBuiltin<"__builtin_mips_extpdp">, Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>; //===----------------------------------------------------------------------===// // Misc -def int_mips_wrdsp: GCCBuiltin<"__builtin_mips_wrdsp">, +def int_mips_wrdsp: ClangBuiltin<"__builtin_mips_wrdsp">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg>]>; -def int_mips_rddsp: GCCBuiltin<"__builtin_mips_rddsp">, +def int_mips_rddsp: ClangBuiltin<"__builtin_mips_rddsp">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem, ImmArg>]>; -def int_mips_insv: GCCBuiltin<"__builtin_mips_insv">, +def int_mips_insv: ClangBuiltin<"__builtin_mips_insv">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; -def int_mips_bitrev: GCCBuiltin<"__builtin_mips_bitrev">, +def int_mips_bitrev: ClangBuiltin<"__builtin_mips_bitrev">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_mips_packrl_ph: GCCBuiltin<"__builtin_mips_packrl_ph">, +def int_mips_packrl_ph: ClangBuiltin<"__builtin_mips_packrl_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_repl_qb: GCCBuiltin<"__builtin_mips_repl_qb">, +def int_mips_repl_qb: ClangBuiltin<"__builtin_mips_repl_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_mips_repl_ph: 
GCCBuiltin<"__builtin_mips_repl_ph">, +def int_mips_repl_ph: ClangBuiltin<"__builtin_mips_repl_ph">, Intrinsic<[mips_v2q15_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_mips_pick_qb: GCCBuiltin<"__builtin_mips_pick_qb">, +def int_mips_pick_qb: ClangBuiltin<"__builtin_mips_pick_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrReadMem]>; -def int_mips_pick_ph: GCCBuiltin<"__builtin_mips_pick_ph">, +def int_mips_pick_ph: ClangBuiltin<"__builtin_mips_pick_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrReadMem]>; -def int_mips_mthlip: GCCBuiltin<"__builtin_mips_mthlip">, +def int_mips_mthlip: ClangBuiltin<"__builtin_mips_mthlip">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], []>; -def int_mips_bposge32: GCCBuiltin<"__builtin_mips_bposge32">, +def int_mips_bposge32: ClangBuiltin<"__builtin_mips_bposge32">, Intrinsic<[llvm_i32_ty], [], [IntrReadMem]>; -def int_mips_lbux: GCCBuiltin<"__builtin_mips_lbux">, +def int_mips_lbux: ClangBuiltin<"__builtin_mips_lbux">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_lhx: GCCBuiltin<"__builtin_mips_lhx">, +def int_mips_lhx: ClangBuiltin<"__builtin_mips_lhx">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_lwx: GCCBuiltin<"__builtin_mips_lwx">, +def int_mips_lwx: ClangBuiltin<"__builtin_mips_lwx">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; //===----------------------------------------------------------------------===// // MIPS DSP Rev 2 -def int_mips_absq_s_qb: GCCBuiltin<"__builtin_mips_absq_s_qb">, +def int_mips_absq_s_qb: ClangBuiltin<"__builtin_mips_absq_s_qb">, Intrinsic<[mips_v4q7_ty], [mips_v4q7_ty], []>; -def int_mips_addqh_ph: GCCBuiltin<"__builtin_mips_addqh_ph">, +def int_mips_addqh_ph: ClangBuiltin<"__builtin_mips_addqh_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem, Commutative]>; -def int_mips_addqh_r_ph: GCCBuiltin<"__builtin_mips_addqh_r_ph">, +def int_mips_addqh_r_ph: ClangBuiltin<"__builtin_mips_addqh_r_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem, Commutative]>; -def int_mips_addqh_w: GCCBuiltin<"__builtin_mips_addqh_w">, +def int_mips_addqh_w: ClangBuiltin<"__builtin_mips_addqh_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem, Commutative]>; -def int_mips_addqh_r_w: GCCBuiltin<"__builtin_mips_addqh_r_w">, +def int_mips_addqh_r_w: ClangBuiltin<"__builtin_mips_addqh_r_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem, Commutative]>; -def int_mips_addu_ph: GCCBuiltin<"__builtin_mips_addu_ph">, +def int_mips_addu_ph: ClangBuiltin<"__builtin_mips_addu_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>; -def int_mips_addu_s_ph: GCCBuiltin<"__builtin_mips_addu_s_ph">, +def int_mips_addu_s_ph: ClangBuiltin<"__builtin_mips_addu_s_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>; -def int_mips_adduh_qb: GCCBuiltin<"__builtin_mips_adduh_qb">, +def int_mips_adduh_qb: ClangBuiltin<"__builtin_mips_adduh_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem, Commutative]>; -def int_mips_adduh_r_qb: GCCBuiltin<"__builtin_mips_adduh_r_qb">, +def int_mips_adduh_r_qb: ClangBuiltin<"__builtin_mips_adduh_r_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem, Commutative]>; -def int_mips_append: GCCBuiltin<"__builtin_mips_append">, +def int_mips_append: 
ClangBuiltin<"__builtin_mips_append">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_balign: GCCBuiltin<"__builtin_mips_balign">, +def int_mips_balign: ClangBuiltin<"__builtin_mips_balign">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_cmpgdu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgdu_eq_qb">, +def int_mips_cmpgdu_eq_qb: ClangBuiltin<"__builtin_mips_cmpgdu_eq_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>; -def int_mips_cmpgdu_lt_qb: GCCBuiltin<"__builtin_mips_cmpgdu_lt_qb">, +def int_mips_cmpgdu_lt_qb: ClangBuiltin<"__builtin_mips_cmpgdu_lt_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def int_mips_cmpgdu_le_qb: GCCBuiltin<"__builtin_mips_cmpgdu_le_qb">, +def int_mips_cmpgdu_le_qb: ClangBuiltin<"__builtin_mips_cmpgdu_le_qb">, Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>; -def int_mips_dpa_w_ph: GCCBuiltin<"__builtin_mips_dpa_w_ph">, +def int_mips_dpa_w_ph: ClangBuiltin<"__builtin_mips_dpa_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty], [IntrNoMem]>; -def int_mips_dps_w_ph: GCCBuiltin<"__builtin_mips_dps_w_ph">, +def int_mips_dps_w_ph: ClangBuiltin<"__builtin_mips_dps_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty], [IntrNoMem]>; -def int_mips_dpaqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_s_w_ph">, +def int_mips_dpaqx_s_w_ph: ClangBuiltin<"__builtin_mips_dpaqx_s_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_dpaqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_sa_w_ph">, +def int_mips_dpaqx_sa_w_ph: ClangBuiltin<"__builtin_mips_dpaqx_sa_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_dpax_w_ph: GCCBuiltin<"__builtin_mips_dpax_w_ph">, +def int_mips_dpax_w_ph: ClangBuiltin<"__builtin_mips_dpax_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty], [IntrNoMem]>; -def int_mips_dpsx_w_ph: GCCBuiltin<"__builtin_mips_dpsx_w_ph">, +def int_mips_dpsx_w_ph: ClangBuiltin<"__builtin_mips_dpsx_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty], [IntrNoMem]>; -def int_mips_dpsqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_s_w_ph">, +def int_mips_dpsqx_s_w_ph: ClangBuiltin<"__builtin_mips_dpsqx_s_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_dpsqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_sa_w_ph">, +def int_mips_dpsqx_sa_w_ph: ClangBuiltin<"__builtin_mips_dpsqx_sa_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>; -def int_mips_mul_ph: GCCBuiltin<"__builtin_mips_mul_ph">, +def int_mips_mul_ph: ClangBuiltin<"__builtin_mips_mul_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>; -def int_mips_mul_s_ph: GCCBuiltin<"__builtin_mips_mul_s_ph">, +def int_mips_mul_s_ph: ClangBuiltin<"__builtin_mips_mul_s_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>; -def int_mips_mulq_rs_w: GCCBuiltin<"__builtin_mips_mulq_rs_w">, +def int_mips_mulq_rs_w: ClangBuiltin<"__builtin_mips_mulq_rs_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>; -def int_mips_mulq_s_ph: GCCBuiltin<"__builtin_mips_mulq_s_ph">, +def int_mips_mulq_s_ph: ClangBuiltin<"__builtin_mips_mulq_s_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>; -def int_mips_mulq_s_w: 
GCCBuiltin<"__builtin_mips_mulq_s_w">, +def int_mips_mulq_s_w: ClangBuiltin<"__builtin_mips_mulq_s_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>; -def int_mips_mulsa_w_ph: GCCBuiltin<"__builtin_mips_mulsa_w_ph">, +def int_mips_mulsa_w_ph: ClangBuiltin<"__builtin_mips_mulsa_w_ph">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty], [IntrNoMem]>; -def int_mips_precr_qb_ph: GCCBuiltin<"__builtin_mips_precr_qb_ph">, +def int_mips_precr_qb_ph: ClangBuiltin<"__builtin_mips_precr_qb_ph">, Intrinsic<[llvm_v4i8_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>; -def int_mips_precr_sra_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_ph_w">, +def int_mips_precr_sra_ph_w: ClangBuiltin<"__builtin_mips_precr_sra_ph_w">, Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_precr_sra_r_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_r_ph_w">, +def int_mips_precr_sra_r_ph_w: ClangBuiltin<"__builtin_mips_precr_sra_r_ph_w">, Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_prepend: GCCBuiltin<"__builtin_mips_prepend">, +def int_mips_prepend: ClangBuiltin<"__builtin_mips_prepend">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_shra_qb: GCCBuiltin<"__builtin_mips_shra_qb">, +def int_mips_shra_qb: ClangBuiltin<"__builtin_mips_shra_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shra_r_qb: GCCBuiltin<"__builtin_mips_shra_r_qb">, +def int_mips_shra_r_qb: ClangBuiltin<"__builtin_mips_shra_r_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_shrl_ph: GCCBuiltin<"__builtin_mips_shrl_ph">, +def int_mips_shrl_ph: ClangBuiltin<"__builtin_mips_shrl_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_subqh_ph: GCCBuiltin<"__builtin_mips_subqh_ph">, +def int_mips_subqh_ph: ClangBuiltin<"__builtin_mips_subqh_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_subqh_r_ph: GCCBuiltin<"__builtin_mips_subqh_r_ph">, +def int_mips_subqh_r_ph: ClangBuiltin<"__builtin_mips_subqh_r_ph">, Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>; -def int_mips_subqh_w: GCCBuiltin<"__builtin_mips_subqh_w">, +def int_mips_subqh_w: ClangBuiltin<"__builtin_mips_subqh_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>; -def int_mips_subqh_r_w: GCCBuiltin<"__builtin_mips_subqh_r_w">, +def int_mips_subqh_r_w: ClangBuiltin<"__builtin_mips_subqh_r_w">, Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>; -def int_mips_subu_ph: GCCBuiltin<"__builtin_mips_subu_ph">, +def int_mips_subu_ph: ClangBuiltin<"__builtin_mips_subu_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>; -def int_mips_subu_s_ph: GCCBuiltin<"__builtin_mips_subu_s_ph">, +def int_mips_subu_s_ph: ClangBuiltin<"__builtin_mips_subu_s_ph">, Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>; -def int_mips_subuh_qb: GCCBuiltin<"__builtin_mips_subuh_qb">, +def int_mips_subuh_qb: ClangBuiltin<"__builtin_mips_subuh_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; -def int_mips_subuh_r_qb: GCCBuiltin<"__builtin_mips_subuh_r_qb">, +def int_mips_subuh_r_qb: ClangBuiltin<"__builtin_mips_subuh_r_qb">, Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>; 
 //===----------------------------------------------------------------------===//
@@ -396,1389 +396,1389 @@ def int_mips_subuh_r_qb: GCCBuiltin<"__builtin_mips_subuh_r_qb">,
 //===----------------------------------------------------------------------===//
 // Addition/subtraction
-def int_mips_add_a_b : GCCBuiltin<"__builtin_msa_add_a_b">,
+def int_mips_add_a_b : ClangBuiltin<"__builtin_msa_add_a_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_add_a_h : GCCBuiltin<"__builtin_msa_add_a_h">,
+def int_mips_add_a_h : ClangBuiltin<"__builtin_msa_add_a_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_add_a_w : GCCBuiltin<"__builtin_msa_add_a_w">,
+def int_mips_add_a_w : ClangBuiltin<"__builtin_msa_add_a_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_add_a_d : GCCBuiltin<"__builtin_msa_add_a_d">,
+def int_mips_add_a_d : ClangBuiltin<"__builtin_msa_add_a_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_a_b : GCCBuiltin<"__builtin_msa_adds_a_b">,
+def int_mips_adds_a_b : ClangBuiltin<"__builtin_msa_adds_a_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_a_h : GCCBuiltin<"__builtin_msa_adds_a_h">,
+def int_mips_adds_a_h : ClangBuiltin<"__builtin_msa_adds_a_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_a_w : GCCBuiltin<"__builtin_msa_adds_a_w">,
+def int_mips_adds_a_w : ClangBuiltin<"__builtin_msa_adds_a_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_a_d : GCCBuiltin<"__builtin_msa_adds_a_d">,
+def int_mips_adds_a_d : ClangBuiltin<"__builtin_msa_adds_a_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_s_b : GCCBuiltin<"__builtin_msa_adds_s_b">,
+def int_mips_adds_s_b : ClangBuiltin<"__builtin_msa_adds_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_s_h : GCCBuiltin<"__builtin_msa_adds_s_h">,
+def int_mips_adds_s_h : ClangBuiltin<"__builtin_msa_adds_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_s_w : GCCBuiltin<"__builtin_msa_adds_s_w">,
+def int_mips_adds_s_w : ClangBuiltin<"__builtin_msa_adds_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_s_d : GCCBuiltin<"__builtin_msa_adds_s_d">,
+def int_mips_adds_s_d : ClangBuiltin<"__builtin_msa_adds_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_u_b : GCCBuiltin<"__builtin_msa_adds_u_b">,
+def int_mips_adds_u_b : ClangBuiltin<"__builtin_msa_adds_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_u_h : GCCBuiltin<"__builtin_msa_adds_u_h">,
+def int_mips_adds_u_h : ClangBuiltin<"__builtin_msa_adds_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_u_w : GCCBuiltin<"__builtin_msa_adds_u_w">,
+def int_mips_adds_u_w : ClangBuiltin<"__builtin_msa_adds_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_adds_u_d : GCCBuiltin<"__builtin_msa_adds_u_d">,
+def int_mips_adds_u_d : ClangBuiltin<"__builtin_msa_adds_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_addv_b : GCCBuiltin<"__builtin_msa_addv_b">,
+def int_mips_addv_b : ClangBuiltin<"__builtin_msa_addv_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_addv_h : GCCBuiltin<"__builtin_msa_addv_h">,
+def int_mips_addv_h : ClangBuiltin<"__builtin_msa_addv_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_addv_w : GCCBuiltin<"__builtin_msa_addv_w">,
+def int_mips_addv_w : ClangBuiltin<"__builtin_msa_addv_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_addv_d : GCCBuiltin<"__builtin_msa_addv_d">,
+def int_mips_addv_d : ClangBuiltin<"__builtin_msa_addv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_addvi_b : GCCBuiltin<"__builtin_msa_addvi_b">,
+def int_mips_addvi_b : ClangBuiltin<"__builtin_msa_addvi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_addvi_h : GCCBuiltin<"__builtin_msa_addvi_h">,
+def int_mips_addvi_h : ClangBuiltin<"__builtin_msa_addvi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_addvi_w : GCCBuiltin<"__builtin_msa_addvi_w">,
+def int_mips_addvi_w : ClangBuiltin<"__builtin_msa_addvi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_addvi_d : GCCBuiltin<"__builtin_msa_addvi_d">,
+def int_mips_addvi_d : ClangBuiltin<"__builtin_msa_addvi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_and_v : GCCBuiltin<"__builtin_msa_and_v">,
+def int_mips_and_v : ClangBuiltin<"__builtin_msa_and_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_andi_b : GCCBuiltin<"__builtin_msa_andi_b">,
+def int_mips_andi_b : ClangBuiltin<"__builtin_msa_andi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_asub_s_b : GCCBuiltin<"__builtin_msa_asub_s_b">,
+def int_mips_asub_s_b : ClangBuiltin<"__builtin_msa_asub_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_asub_s_h : GCCBuiltin<"__builtin_msa_asub_s_h">,
+def int_mips_asub_s_h : ClangBuiltin<"__builtin_msa_asub_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_asub_s_w : GCCBuiltin<"__builtin_msa_asub_s_w">,
+def int_mips_asub_s_w : ClangBuiltin<"__builtin_msa_asub_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_asub_s_d : GCCBuiltin<"__builtin_msa_asub_s_d">,
+def int_mips_asub_s_d : ClangBuiltin<"__builtin_msa_asub_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_asub_u_b : GCCBuiltin<"__builtin_msa_asub_u_b">,
+def int_mips_asub_u_b : ClangBuiltin<"__builtin_msa_asub_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_asub_u_h : GCCBuiltin<"__builtin_msa_asub_u_h">,
+def int_mips_asub_u_h : ClangBuiltin<"__builtin_msa_asub_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_asub_u_w : GCCBuiltin<"__builtin_msa_asub_u_w">,
+def int_mips_asub_u_w : ClangBuiltin<"__builtin_msa_asub_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_asub_u_d : GCCBuiltin<"__builtin_msa_asub_u_d">,
+def int_mips_asub_u_d : ClangBuiltin<"__builtin_msa_asub_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_ave_s_b : GCCBuiltin<"__builtin_msa_ave_s_b">,
+def int_mips_ave_s_b : ClangBuiltin<"__builtin_msa_ave_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_s_h : GCCBuiltin<"__builtin_msa_ave_s_h">,
+def int_mips_ave_s_h : ClangBuiltin<"__builtin_msa_ave_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_s_w : GCCBuiltin<"__builtin_msa_ave_s_w">,
+def int_mips_ave_s_w : ClangBuiltin<"__builtin_msa_ave_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_s_d : GCCBuiltin<"__builtin_msa_ave_s_d">,
+def int_mips_ave_s_d : ClangBuiltin<"__builtin_msa_ave_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_u_b : GCCBuiltin<"__builtin_msa_ave_u_b">,
+def int_mips_ave_u_b : ClangBuiltin<"__builtin_msa_ave_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_u_h : GCCBuiltin<"__builtin_msa_ave_u_h">,
+def int_mips_ave_u_h : ClangBuiltin<"__builtin_msa_ave_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_u_w : GCCBuiltin<"__builtin_msa_ave_u_w">,
+def int_mips_ave_u_w : ClangBuiltin<"__builtin_msa_ave_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_ave_u_d : GCCBuiltin<"__builtin_msa_ave_u_d">,
+def int_mips_ave_u_d : ClangBuiltin<"__builtin_msa_ave_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_s_b : GCCBuiltin<"__builtin_msa_aver_s_b">,
+def int_mips_aver_s_b : ClangBuiltin<"__builtin_msa_aver_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_s_h : GCCBuiltin<"__builtin_msa_aver_s_h">,
+def int_mips_aver_s_h : ClangBuiltin<"__builtin_msa_aver_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_s_w : GCCBuiltin<"__builtin_msa_aver_s_w">,
+def int_mips_aver_s_w : ClangBuiltin<"__builtin_msa_aver_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_s_d : GCCBuiltin<"__builtin_msa_aver_s_d">,
+def int_mips_aver_s_d : ClangBuiltin<"__builtin_msa_aver_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_u_b : GCCBuiltin<"__builtin_msa_aver_u_b">,
+def int_mips_aver_u_b : ClangBuiltin<"__builtin_msa_aver_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_u_h : GCCBuiltin<"__builtin_msa_aver_u_h">,
+def int_mips_aver_u_h : ClangBuiltin<"__builtin_msa_aver_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_u_w : GCCBuiltin<"__builtin_msa_aver_u_w">,
+def int_mips_aver_u_w : ClangBuiltin<"__builtin_msa_aver_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [Commutative, IntrNoMem]>;
-def int_mips_aver_u_d : GCCBuiltin<"__builtin_msa_aver_u_d">,
+def int_mips_aver_u_d : ClangBuiltin<"__builtin_msa_aver_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [Commutative, IntrNoMem]>;
-def int_mips_bclr_b : GCCBuiltin<"__builtin_msa_bclr_b">,
+def int_mips_bclr_b : ClangBuiltin<"__builtin_msa_bclr_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bclr_h : GCCBuiltin<"__builtin_msa_bclr_h">,
+def int_mips_bclr_h : ClangBuiltin<"__builtin_msa_bclr_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_bclr_w : GCCBuiltin<"__builtin_msa_bclr_w">,
+def int_mips_bclr_w : ClangBuiltin<"__builtin_msa_bclr_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_bclr_d : GCCBuiltin<"__builtin_msa_bclr_d">,
+def int_mips_bclr_d : ClangBuiltin<"__builtin_msa_bclr_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_bclri_b : GCCBuiltin<"__builtin_msa_bclri_b">,
+def int_mips_bclri_b : ClangBuiltin<"__builtin_msa_bclri_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bclri_h : GCCBuiltin<"__builtin_msa_bclri_h">,
+def int_mips_bclri_h : ClangBuiltin<"__builtin_msa_bclri_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bclri_w : GCCBuiltin<"__builtin_msa_bclri_w">,
+def int_mips_bclri_w : ClangBuiltin<"__builtin_msa_bclri_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bclri_d : GCCBuiltin<"__builtin_msa_bclri_d">,
+def int_mips_bclri_d : ClangBuiltin<"__builtin_msa_bclri_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_binsl_b : GCCBuiltin<"__builtin_msa_binsl_b">,
+def int_mips_binsl_b : ClangBuiltin<"__builtin_msa_binsl_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_binsl_h : GCCBuiltin<"__builtin_msa_binsl_h">,
+def int_mips_binsl_h : ClangBuiltin<"__builtin_msa_binsl_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_binsl_w : GCCBuiltin<"__builtin_msa_binsl_w">,
+def int_mips_binsl_w : ClangBuiltin<"__builtin_msa_binsl_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_binsl_d : GCCBuiltin<"__builtin_msa_binsl_d">,
+def int_mips_binsl_d : ClangBuiltin<"__builtin_msa_binsl_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_binsli_b : GCCBuiltin<"__builtin_msa_binsli_b">,
+def int_mips_binsli_b : ClangBuiltin<"__builtin_msa_binsli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsli_h : GCCBuiltin<"__builtin_msa_binsli_h">,
+def int_mips_binsli_h : ClangBuiltin<"__builtin_msa_binsli_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsli_w : GCCBuiltin<"__builtin_msa_binsli_w">,
+def int_mips_binsli_w : ClangBuiltin<"__builtin_msa_binsli_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsli_d : GCCBuiltin<"__builtin_msa_binsli_d">,
+def int_mips_binsli_d : ClangBuiltin<"__builtin_msa_binsli_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsr_b : GCCBuiltin<"__builtin_msa_binsr_b">,
+def int_mips_binsr_b : ClangBuiltin<"__builtin_msa_binsr_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_binsr_h : GCCBuiltin<"__builtin_msa_binsr_h">,
+def int_mips_binsr_h : ClangBuiltin<"__builtin_msa_binsr_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_binsr_w : GCCBuiltin<"__builtin_msa_binsr_w">,
+def int_mips_binsr_w : ClangBuiltin<"__builtin_msa_binsr_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_binsr_d : GCCBuiltin<"__builtin_msa_binsr_d">,
+def int_mips_binsr_d : ClangBuiltin<"__builtin_msa_binsr_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_binsri_b : GCCBuiltin<"__builtin_msa_binsri_b">,
+def int_mips_binsri_b : ClangBuiltin<"__builtin_msa_binsri_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsri_h : GCCBuiltin<"__builtin_msa_binsri_h">,
+def int_mips_binsri_h : ClangBuiltin<"__builtin_msa_binsri_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsri_w : GCCBuiltin<"__builtin_msa_binsri_w">,
+def int_mips_binsri_w : ClangBuiltin<"__builtin_msa_binsri_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_binsri_d : GCCBuiltin<"__builtin_msa_binsri_d">,
+def int_mips_binsri_d : ClangBuiltin<"__builtin_msa_binsri_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_bmnz_v : GCCBuiltin<"__builtin_msa_bmnz_v">,
+def int_mips_bmnz_v : ClangBuiltin<"__builtin_msa_bmnz_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bmnzi_b : GCCBuiltin<"__builtin_msa_bmnzi_b">,
+def int_mips_bmnzi_b : ClangBuiltin<"__builtin_msa_bmnzi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_bmz_v : GCCBuiltin<"__builtin_msa_bmz_v">,
+def int_mips_bmz_v : ClangBuiltin<"__builtin_msa_bmz_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bmzi_b : GCCBuiltin<"__builtin_msa_bmzi_b">,
+def int_mips_bmzi_b : ClangBuiltin<"__builtin_msa_bmzi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_bneg_b : GCCBuiltin<"__builtin_msa_bneg_b">,
+def int_mips_bneg_b : ClangBuiltin<"__builtin_msa_bneg_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bneg_h : GCCBuiltin<"__builtin_msa_bneg_h">,
+def int_mips_bneg_h : ClangBuiltin<"__builtin_msa_bneg_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_bneg_w : GCCBuiltin<"__builtin_msa_bneg_w">,
+def int_mips_bneg_w : ClangBuiltin<"__builtin_msa_bneg_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_bneg_d : GCCBuiltin<"__builtin_msa_bneg_d">,
+def int_mips_bneg_d : ClangBuiltin<"__builtin_msa_bneg_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_bnegi_b : GCCBuiltin<"__builtin_msa_bnegi_b">,
+def int_mips_bnegi_b : ClangBuiltin<"__builtin_msa_bnegi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bnegi_h : GCCBuiltin<"__builtin_msa_bnegi_h">,
+def int_mips_bnegi_h : ClangBuiltin<"__builtin_msa_bnegi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bnegi_w : GCCBuiltin<"__builtin_msa_bnegi_w">,
+def int_mips_bnegi_w : ClangBuiltin<"__builtin_msa_bnegi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bnegi_d : GCCBuiltin<"__builtin_msa_bnegi_d">,
+def int_mips_bnegi_d : ClangBuiltin<"__builtin_msa_bnegi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bnz_b : GCCBuiltin<"__builtin_msa_bnz_b">,
+def int_mips_bnz_b : ClangBuiltin<"__builtin_msa_bnz_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bnz_h : GCCBuiltin<"__builtin_msa_bnz_h">,
+def int_mips_bnz_h : ClangBuiltin<"__builtin_msa_bnz_h">,
   Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_bnz_w : GCCBuiltin<"__builtin_msa_bnz_w">,
+def int_mips_bnz_w : ClangBuiltin<"__builtin_msa_bnz_w">,
   Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_bnz_d : GCCBuiltin<"__builtin_msa_bnz_d">,
+def int_mips_bnz_d : ClangBuiltin<"__builtin_msa_bnz_d">,
   Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_bnz_v : GCCBuiltin<"__builtin_msa_bnz_v">,
+def int_mips_bnz_v : ClangBuiltin<"__builtin_msa_bnz_v">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bsel_v : GCCBuiltin<"__builtin_msa_bsel_v">,
+def int_mips_bsel_v : ClangBuiltin<"__builtin_msa_bsel_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bseli_b : GCCBuiltin<"__builtin_msa_bseli_b">,
+def int_mips_bseli_b : ClangBuiltin<"__builtin_msa_bseli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-def int_mips_bset_b : GCCBuiltin<"__builtin_msa_bset_b">,
+def int_mips_bset_b : ClangBuiltin<"__builtin_msa_bset_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bset_h : GCCBuiltin<"__builtin_msa_bset_h">,
+def int_mips_bset_h : ClangBuiltin<"__builtin_msa_bset_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_bset_w : GCCBuiltin<"__builtin_msa_bset_w">,
+def int_mips_bset_w : ClangBuiltin<"__builtin_msa_bset_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_bset_d : GCCBuiltin<"__builtin_msa_bset_d">,
+def int_mips_bset_d : ClangBuiltin<"__builtin_msa_bset_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_bseti_b : GCCBuiltin<"__builtin_msa_bseti_b">,
+def int_mips_bseti_b : ClangBuiltin<"__builtin_msa_bseti_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bseti_h : GCCBuiltin<"__builtin_msa_bseti_h">,
+def int_mips_bseti_h : ClangBuiltin<"__builtin_msa_bseti_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bseti_w : GCCBuiltin<"__builtin_msa_bseti_w">,
+def int_mips_bseti_w : ClangBuiltin<"__builtin_msa_bseti_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bseti_d : GCCBuiltin<"__builtin_msa_bseti_d">,
+def int_mips_bseti_d : ClangBuiltin<"__builtin_msa_bseti_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_bz_b : GCCBuiltin<"__builtin_msa_bz_b">,
+def int_mips_bz_b : ClangBuiltin<"__builtin_msa_bz_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_bz_h : GCCBuiltin<"__builtin_msa_bz_h">,
+def int_mips_bz_h : ClangBuiltin<"__builtin_msa_bz_h">,
   Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_bz_w : GCCBuiltin<"__builtin_msa_bz_w">,
+def int_mips_bz_w : ClangBuiltin<"__builtin_msa_bz_w">,
   Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_bz_d : GCCBuiltin<"__builtin_msa_bz_d">,
+def int_mips_bz_d : ClangBuiltin<"__builtin_msa_bz_d">,
   Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_bz_v : GCCBuiltin<"__builtin_msa_bz_v">,
+def int_mips_bz_v : ClangBuiltin<"__builtin_msa_bz_v">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_ceq_b : GCCBuiltin<"__builtin_msa_ceq_b">,
+def int_mips_ceq_b : ClangBuiltin<"__builtin_msa_ceq_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_ceq_h : GCCBuiltin<"__builtin_msa_ceq_h">,
+def int_mips_ceq_h : ClangBuiltin<"__builtin_msa_ceq_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_ceq_w : GCCBuiltin<"__builtin_msa_ceq_w">,
+def int_mips_ceq_w : ClangBuiltin<"__builtin_msa_ceq_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_ceq_d : GCCBuiltin<"__builtin_msa_ceq_d">,
+def int_mips_ceq_d : ClangBuiltin<"__builtin_msa_ceq_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_ceqi_b : GCCBuiltin<"__builtin_msa_ceqi_b">,
+def int_mips_ceqi_b : ClangBuiltin<"__builtin_msa_ceqi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_ceqi_h : GCCBuiltin<"__builtin_msa_ceqi_h">,
+def int_mips_ceqi_h : ClangBuiltin<"__builtin_msa_ceqi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_ceqi_w : GCCBuiltin<"__builtin_msa_ceqi_w">,
+def int_mips_ceqi_w : ClangBuiltin<"__builtin_msa_ceqi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_ceqi_d : GCCBuiltin<"__builtin_msa_ceqi_d">,
+def int_mips_ceqi_d : ClangBuiltin<"__builtin_msa_ceqi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_cfcmsa : GCCBuiltin<"__builtin_msa_cfcmsa">,
+def int_mips_cfcmsa : ClangBuiltin<"__builtin_msa_cfcmsa">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
-def int_mips_cle_s_b : GCCBuiltin<"__builtin_msa_cle_s_b">,
+def int_mips_cle_s_b : ClangBuiltin<"__builtin_msa_cle_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_cle_s_h : GCCBuiltin<"__builtin_msa_cle_s_h">,
+def int_mips_cle_s_h : ClangBuiltin<"__builtin_msa_cle_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_cle_s_w : GCCBuiltin<"__builtin_msa_cle_s_w">,
+def int_mips_cle_s_w : ClangBuiltin<"__builtin_msa_cle_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_cle_s_d : GCCBuiltin<"__builtin_msa_cle_s_d">,
+def int_mips_cle_s_d : ClangBuiltin<"__builtin_msa_cle_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_cle_u_b : GCCBuiltin<"__builtin_msa_cle_u_b">,
+def int_mips_cle_u_b : ClangBuiltin<"__builtin_msa_cle_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_cle_u_h : GCCBuiltin<"__builtin_msa_cle_u_h">,
+def int_mips_cle_u_h : ClangBuiltin<"__builtin_msa_cle_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_cle_u_w : GCCBuiltin<"__builtin_msa_cle_u_w">,
+def int_mips_cle_u_w : ClangBuiltin<"__builtin_msa_cle_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_cle_u_d : GCCBuiltin<"__builtin_msa_cle_u_d">,
+def int_mips_cle_u_d : ClangBuiltin<"__builtin_msa_cle_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_clei_s_b : GCCBuiltin<"__builtin_msa_clei_s_b">,
+def int_mips_clei_s_b : ClangBuiltin<"__builtin_msa_clei_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_s_h : GCCBuiltin<"__builtin_msa_clei_s_h">,
+def int_mips_clei_s_h : ClangBuiltin<"__builtin_msa_clei_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_s_w : GCCBuiltin<"__builtin_msa_clei_s_w">,
+def int_mips_clei_s_w : ClangBuiltin<"__builtin_msa_clei_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_s_d : GCCBuiltin<"__builtin_msa_clei_s_d">,
+def int_mips_clei_s_d : ClangBuiltin<"__builtin_msa_clei_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_u_b : GCCBuiltin<"__builtin_msa_clei_u_b">,
+def int_mips_clei_u_b : ClangBuiltin<"__builtin_msa_clei_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_u_h : GCCBuiltin<"__builtin_msa_clei_u_h">,
+def int_mips_clei_u_h : ClangBuiltin<"__builtin_msa_clei_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_u_w : GCCBuiltin<"__builtin_msa_clei_u_w">,
+def int_mips_clei_u_w : ClangBuiltin<"__builtin_msa_clei_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clei_u_d : GCCBuiltin<"__builtin_msa_clei_u_d">,
+def int_mips_clei_u_d : ClangBuiltin<"__builtin_msa_clei_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clt_s_b : GCCBuiltin<"__builtin_msa_clt_s_b">,
+def int_mips_clt_s_b : ClangBuiltin<"__builtin_msa_clt_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_clt_s_h : GCCBuiltin<"__builtin_msa_clt_s_h">,
+def int_mips_clt_s_h : ClangBuiltin<"__builtin_msa_clt_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_clt_s_w : GCCBuiltin<"__builtin_msa_clt_s_w">,
+def int_mips_clt_s_w : ClangBuiltin<"__builtin_msa_clt_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_clt_s_d : GCCBuiltin<"__builtin_msa_clt_s_d">,
+def int_mips_clt_s_d : ClangBuiltin<"__builtin_msa_clt_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_clt_u_b : GCCBuiltin<"__builtin_msa_clt_u_b">,
+def int_mips_clt_u_b : ClangBuiltin<"__builtin_msa_clt_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_clt_u_h : GCCBuiltin<"__builtin_msa_clt_u_h">,
+def int_mips_clt_u_h : ClangBuiltin<"__builtin_msa_clt_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_clt_u_w : GCCBuiltin<"__builtin_msa_clt_u_w">,
+def int_mips_clt_u_w : ClangBuiltin<"__builtin_msa_clt_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_clt_u_d : GCCBuiltin<"__builtin_msa_clt_u_d">,
+def int_mips_clt_u_d : ClangBuiltin<"__builtin_msa_clt_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_clti_s_b : GCCBuiltin<"__builtin_msa_clti_s_b">,
+def int_mips_clti_s_b : ClangBuiltin<"__builtin_msa_clti_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_s_h : GCCBuiltin<"__builtin_msa_clti_s_h">,
+def int_mips_clti_s_h : ClangBuiltin<"__builtin_msa_clti_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_s_w : GCCBuiltin<"__builtin_msa_clti_s_w">,
+def int_mips_clti_s_w : ClangBuiltin<"__builtin_msa_clti_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_s_d : GCCBuiltin<"__builtin_msa_clti_s_d">,
+def int_mips_clti_s_d : ClangBuiltin<"__builtin_msa_clti_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_u_b : GCCBuiltin<"__builtin_msa_clti_u_b">,
+def int_mips_clti_u_b : ClangBuiltin<"__builtin_msa_clti_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_u_h : GCCBuiltin<"__builtin_msa_clti_u_h">,
+def int_mips_clti_u_h : ClangBuiltin<"__builtin_msa_clti_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_u_w : GCCBuiltin<"__builtin_msa_clti_u_w">,
+def int_mips_clti_u_w : ClangBuiltin<"__builtin_msa_clti_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_clti_u_d : GCCBuiltin<"__builtin_msa_clti_u_d">,
+def int_mips_clti_u_d : ClangBuiltin<"__builtin_msa_clti_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-def int_mips_copy_s_b : GCCBuiltin<"__builtin_msa_copy_s_b">,
+def int_mips_copy_s_b : ClangBuiltin<"__builtin_msa_copy_s_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_s_h : GCCBuiltin<"__builtin_msa_copy_s_h">,
+def int_mips_copy_s_h : ClangBuiltin<"__builtin_msa_copy_s_h">,
   Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_s_w : GCCBuiltin<"__builtin_msa_copy_s_w">,
+def int_mips_copy_s_w : ClangBuiltin<"__builtin_msa_copy_s_w">,
   Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_s_d : GCCBuiltin<"__builtin_msa_copy_s_d">,
+def int_mips_copy_s_d : ClangBuiltin<"__builtin_msa_copy_s_d">,
   Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_u_b : GCCBuiltin<"__builtin_msa_copy_u_b">,
+def int_mips_copy_u_b : ClangBuiltin<"__builtin_msa_copy_u_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_u_h : GCCBuiltin<"__builtin_msa_copy_u_h">,
+def int_mips_copy_u_h : ClangBuiltin<"__builtin_msa_copy_u_h">,
   Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_u_w : GCCBuiltin<"__builtin_msa_copy_u_w">,
+def int_mips_copy_u_w : ClangBuiltin<"__builtin_msa_copy_u_w">,
   Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_copy_u_d : GCCBuiltin<"__builtin_msa_copy_u_d">,
+def int_mips_copy_u_d : ClangBuiltin<"__builtin_msa_copy_u_d">,
   Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_ctcmsa : GCCBuiltin<"__builtin_msa_ctcmsa">,
+def int_mips_ctcmsa : ClangBuiltin<"__builtin_msa_ctcmsa">,
   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
-def int_mips_div_s_b : GCCBuiltin<"__builtin_msa_div_s_b">,
+def int_mips_div_s_b : ClangBuiltin<"__builtin_msa_div_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_div_s_h : GCCBuiltin<"__builtin_msa_div_s_h">,
+def int_mips_div_s_h : ClangBuiltin<"__builtin_msa_div_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_div_s_w : GCCBuiltin<"__builtin_msa_div_s_w">,
+def int_mips_div_s_w : ClangBuiltin<"__builtin_msa_div_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_div_s_d : GCCBuiltin<"__builtin_msa_div_s_d">,
+def int_mips_div_s_d : ClangBuiltin<"__builtin_msa_div_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_div_u_b : GCCBuiltin<"__builtin_msa_div_u_b">,
+def int_mips_div_u_b : ClangBuiltin<"__builtin_msa_div_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_div_u_h : GCCBuiltin<"__builtin_msa_div_u_h">,
+def int_mips_div_u_h : ClangBuiltin<"__builtin_msa_div_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_div_u_w : GCCBuiltin<"__builtin_msa_div_u_w">,
+def int_mips_div_u_w : ClangBuiltin<"__builtin_msa_div_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_div_u_d : GCCBuiltin<"__builtin_msa_div_u_d">,
+def int_mips_div_u_d : ClangBuiltin<"__builtin_msa_div_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 // This instruction is part of the MSA spec but it does not share the
 // __builtin_msa prefix because it operates on GP registers.
-def int_mips_dlsa : GCCBuiltin<"__builtin_mips_dlsa">,
+def int_mips_dlsa : ClangBuiltin<"__builtin_mips_dlsa">,
   Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_dotp_s_h : GCCBuiltin<"__builtin_msa_dotp_s_h">,
+def int_mips_dotp_s_h : ClangBuiltin<"__builtin_msa_dotp_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dotp_s_w : GCCBuiltin<"__builtin_msa_dotp_s_w">,
+def int_mips_dotp_s_w : ClangBuiltin<"__builtin_msa_dotp_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dotp_s_d : GCCBuiltin<"__builtin_msa_dotp_s_d">,
+def int_mips_dotp_s_d : ClangBuiltin<"__builtin_msa_dotp_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_dotp_u_h : GCCBuiltin<"__builtin_msa_dotp_u_h">,
+def int_mips_dotp_u_h : ClangBuiltin<"__builtin_msa_dotp_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dotp_u_w : GCCBuiltin<"__builtin_msa_dotp_u_w">,
+def int_mips_dotp_u_w : ClangBuiltin<"__builtin_msa_dotp_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dotp_u_d : GCCBuiltin<"__builtin_msa_dotp_u_d">,
+def int_mips_dotp_u_d : ClangBuiltin<"__builtin_msa_dotp_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_dpadd_s_h : GCCBuiltin<"__builtin_msa_dpadd_s_h">,
+def int_mips_dpadd_s_h : ClangBuiltin<"__builtin_msa_dpadd_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dpadd_s_w : GCCBuiltin<"__builtin_msa_dpadd_s_w">,
+def int_mips_dpadd_s_w : ClangBuiltin<"__builtin_msa_dpadd_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dpadd_s_d : GCCBuiltin<"__builtin_msa_dpadd_s_d">,
+def int_mips_dpadd_s_d : ClangBuiltin<"__builtin_msa_dpadd_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_dpadd_u_h : GCCBuiltin<"__builtin_msa_dpadd_u_h">,
+def int_mips_dpadd_u_h : ClangBuiltin<"__builtin_msa_dpadd_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dpadd_u_w : GCCBuiltin<"__builtin_msa_dpadd_u_w">,
+def int_mips_dpadd_u_w : ClangBuiltin<"__builtin_msa_dpadd_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dpadd_u_d : GCCBuiltin<"__builtin_msa_dpadd_u_d">,
+def int_mips_dpadd_u_d : ClangBuiltin<"__builtin_msa_dpadd_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_dpsub_s_h : GCCBuiltin<"__builtin_msa_dpsub_s_h">,
+def int_mips_dpsub_s_h : ClangBuiltin<"__builtin_msa_dpsub_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dpsub_s_w : GCCBuiltin<"__builtin_msa_dpsub_s_w">,
+def int_mips_dpsub_s_w : ClangBuiltin<"__builtin_msa_dpsub_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dpsub_s_d : GCCBuiltin<"__builtin_msa_dpsub_s_d">,
+def int_mips_dpsub_s_d : ClangBuiltin<"__builtin_msa_dpsub_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_dpsub_u_h : GCCBuiltin<"__builtin_msa_dpsub_u_h">,
+def int_mips_dpsub_u_h : ClangBuiltin<"__builtin_msa_dpsub_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_dpsub_u_w : GCCBuiltin<"__builtin_msa_dpsub_u_w">,
+def int_mips_dpsub_u_w : ClangBuiltin<"__builtin_msa_dpsub_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_dpsub_u_d : GCCBuiltin<"__builtin_msa_dpsub_u_d">,
+def int_mips_dpsub_u_d : ClangBuiltin<"__builtin_msa_dpsub_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_fadd_w : GCCBuiltin<"__builtin_msa_fadd_w">,
+def int_mips_fadd_w : ClangBuiltin<"__builtin_msa_fadd_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fadd_d : GCCBuiltin<"__builtin_msa_fadd_d">,
+def int_mips_fadd_d : ClangBuiltin<"__builtin_msa_fadd_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcaf_w : GCCBuiltin<"__builtin_msa_fcaf_w">,
+def int_mips_fcaf_w : ClangBuiltin<"__builtin_msa_fcaf_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcaf_d : GCCBuiltin<"__builtin_msa_fcaf_d">,
+def int_mips_fcaf_d : ClangBuiltin<"__builtin_msa_fcaf_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fceq_w : GCCBuiltin<"__builtin_msa_fceq_w">,
+def int_mips_fceq_w : ClangBuiltin<"__builtin_msa_fceq_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fceq_d : GCCBuiltin<"__builtin_msa_fceq_d">,
+def int_mips_fceq_d : ClangBuiltin<"__builtin_msa_fceq_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcle_w : GCCBuiltin<"__builtin_msa_fcle_w">,
+def int_mips_fcle_w : ClangBuiltin<"__builtin_msa_fcle_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcle_d : GCCBuiltin<"__builtin_msa_fcle_d">,
+def int_mips_fcle_d : ClangBuiltin<"__builtin_msa_fcle_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fclt_w : GCCBuiltin<"__builtin_msa_fclt_w">,
+def int_mips_fclt_w : ClangBuiltin<"__builtin_msa_fclt_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fclt_d : GCCBuiltin<"__builtin_msa_fclt_d">,
+def int_mips_fclt_d : ClangBuiltin<"__builtin_msa_fclt_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fclass_w : GCCBuiltin<"__builtin_msa_fclass_w">,
+def int_mips_fclass_w : ClangBuiltin<"__builtin_msa_fclass_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fclass_d : GCCBuiltin<"__builtin_msa_fclass_d">,
+def int_mips_fclass_d : ClangBuiltin<"__builtin_msa_fclass_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcne_w : GCCBuiltin<"__builtin_msa_fcne_w">,
+def int_mips_fcne_w : ClangBuiltin<"__builtin_msa_fcne_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcne_d : GCCBuiltin<"__builtin_msa_fcne_d">,
+def int_mips_fcne_d : ClangBuiltin<"__builtin_msa_fcne_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcor_w : GCCBuiltin<"__builtin_msa_fcor_w">,
+def int_mips_fcor_w : ClangBuiltin<"__builtin_msa_fcor_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcor_d : GCCBuiltin<"__builtin_msa_fcor_d">,
+def int_mips_fcor_d : ClangBuiltin<"__builtin_msa_fcor_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcueq_w : GCCBuiltin<"__builtin_msa_fcueq_w">,
+def int_mips_fcueq_w : ClangBuiltin<"__builtin_msa_fcueq_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcueq_d : GCCBuiltin<"__builtin_msa_fcueq_d">,
+def int_mips_fcueq_d : ClangBuiltin<"__builtin_msa_fcueq_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcule_w : GCCBuiltin<"__builtin_msa_fcule_w">,
+def int_mips_fcule_w : ClangBuiltin<"__builtin_msa_fcule_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcule_d : GCCBuiltin<"__builtin_msa_fcule_d">,
+def int_mips_fcule_d : ClangBuiltin<"__builtin_msa_fcule_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcult_w : GCCBuiltin<"__builtin_msa_fcult_w">,
+def int_mips_fcult_w : ClangBuiltin<"__builtin_msa_fcult_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcult_d : GCCBuiltin<"__builtin_msa_fcult_d">,
+def int_mips_fcult_d : ClangBuiltin<"__builtin_msa_fcult_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcun_w : GCCBuiltin<"__builtin_msa_fcun_w">,
+def int_mips_fcun_w : ClangBuiltin<"__builtin_msa_fcun_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcun_d : GCCBuiltin<"__builtin_msa_fcun_d">,
+def int_mips_fcun_d : ClangBuiltin<"__builtin_msa_fcun_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fcune_w : GCCBuiltin<"__builtin_msa_fcune_w">,
+def int_mips_fcune_w : ClangBuiltin<"__builtin_msa_fcune_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fcune_d : GCCBuiltin<"__builtin_msa_fcune_d">,
+def int_mips_fcune_d : ClangBuiltin<"__builtin_msa_fcune_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fdiv_w : GCCBuiltin<"__builtin_msa_fdiv_w">,
+def int_mips_fdiv_w : ClangBuiltin<"__builtin_msa_fdiv_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fdiv_d : GCCBuiltin<"__builtin_msa_fdiv_d">,
+def int_mips_fdiv_d : ClangBuiltin<"__builtin_msa_fdiv_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fexdo_h : GCCBuiltin<"__builtin_msa_fexdo_h">,
+def int_mips_fexdo_h : ClangBuiltin<"__builtin_msa_fexdo_h">,
   Intrinsic<[llvm_v8f16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fexdo_w : GCCBuiltin<"__builtin_msa_fexdo_w">,
+def int_mips_fexdo_w : ClangBuiltin<"__builtin_msa_fexdo_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fexp2_w : GCCBuiltin<"__builtin_msa_fexp2_w">,
+def int_mips_fexp2_w : ClangBuiltin<"__builtin_msa_fexp2_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_fexp2_d : GCCBuiltin<"__builtin_msa_fexp2_d">,
+def int_mips_fexp2_d : ClangBuiltin<"__builtin_msa_fexp2_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_fexupl_w : GCCBuiltin<"__builtin_msa_fexupl_w">,
+def int_mips_fexupl_w : ClangBuiltin<"__builtin_msa_fexupl_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v8f16_ty], [IntrNoMem]>;
-def int_mips_fexupl_d : GCCBuiltin<"__builtin_msa_fexupl_d">,
+def int_mips_fexupl_d : ClangBuiltin<"__builtin_msa_fexupl_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fexupr_w : GCCBuiltin<"__builtin_msa_fexupr_w">,
+def int_mips_fexupr_w : ClangBuiltin<"__builtin_msa_fexupr_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v8f16_ty], [IntrNoMem]>;
-def int_mips_fexupr_d : GCCBuiltin<"__builtin_msa_fexupr_d">,
+def int_mips_fexupr_d : ClangBuiltin<"__builtin_msa_fexupr_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_ffint_s_w : GCCBuiltin<"__builtin_msa_ffint_s_w">,
+def int_mips_ffint_s_w : ClangBuiltin<"__builtin_msa_ffint_s_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_ffint_s_d : GCCBuiltin<"__builtin_msa_ffint_s_d">,
+def int_mips_ffint_s_d : ClangBuiltin<"__builtin_msa_ffint_s_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_ffint_u_w : GCCBuiltin<"__builtin_msa_ffint_u_w">,
+def int_mips_ffint_u_w : ClangBuiltin<"__builtin_msa_ffint_u_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_ffint_u_d : GCCBuiltin<"__builtin_msa_ffint_u_d">,
+def int_mips_ffint_u_d : ClangBuiltin<"__builtin_msa_ffint_u_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_ffql_w : GCCBuiltin<"__builtin_msa_ffql_w">,
+def int_mips_ffql_w : ClangBuiltin<"__builtin_msa_ffql_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_ffql_d : GCCBuiltin<"__builtin_msa_ffql_d">,
+def int_mips_ffql_d : ClangBuiltin<"__builtin_msa_ffql_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_ffqr_w : GCCBuiltin<"__builtin_msa_ffqr_w">,
+def int_mips_ffqr_w : ClangBuiltin<"__builtin_msa_ffqr_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_ffqr_d : GCCBuiltin<"__builtin_msa_ffqr_d">,
+def int_mips_ffqr_d : ClangBuiltin<"__builtin_msa_ffqr_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_fill_b : GCCBuiltin<"__builtin_msa_fill_b">,
+def int_mips_fill_b : ClangBuiltin<"__builtin_msa_fill_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_mips_fill_h : GCCBuiltin<"__builtin_msa_fill_h">,
+def int_mips_fill_h : ClangBuiltin<"__builtin_msa_fill_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_mips_fill_w : GCCBuiltin<"__builtin_msa_fill_w">,
+def int_mips_fill_w : ClangBuiltin<"__builtin_msa_fill_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_mips_fill_d : GCCBuiltin<"__builtin_msa_fill_d">,
+def int_mips_fill_d : ClangBuiltin<"__builtin_msa_fill_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_i64_ty], [IntrNoMem]>;
-def int_mips_flog2_w : GCCBuiltin<"__builtin_msa_flog2_w">,
+def int_mips_flog2_w : ClangBuiltin<"__builtin_msa_flog2_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_flog2_d : GCCBuiltin<"__builtin_msa_flog2_d">,
+def int_mips_flog2_d : ClangBuiltin<"__builtin_msa_flog2_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmadd_w : GCCBuiltin<"__builtin_msa_fmadd_w">,
+def int_mips_fmadd_w : ClangBuiltin<"__builtin_msa_fmadd_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmadd_d : GCCBuiltin<"__builtin_msa_fmadd_d">,
+def int_mips_fmadd_d : ClangBuiltin<"__builtin_msa_fmadd_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmax_w : GCCBuiltin<"__builtin_msa_fmax_w">,
+def int_mips_fmax_w : ClangBuiltin<"__builtin_msa_fmax_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmax_d : GCCBuiltin<"__builtin_msa_fmax_d">,
+def int_mips_fmax_d : ClangBuiltin<"__builtin_msa_fmax_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmax_a_w : GCCBuiltin<"__builtin_msa_fmax_a_w">,
+def int_mips_fmax_a_w : ClangBuiltin<"__builtin_msa_fmax_a_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmax_a_d : GCCBuiltin<"__builtin_msa_fmax_a_d">,
+def int_mips_fmax_a_d : ClangBuiltin<"__builtin_msa_fmax_a_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmin_w : GCCBuiltin<"__builtin_msa_fmin_w">,
+def int_mips_fmin_w : ClangBuiltin<"__builtin_msa_fmin_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmin_d : GCCBuiltin<"__builtin_msa_fmin_d">,
+def int_mips_fmin_d : ClangBuiltin<"__builtin_msa_fmin_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmin_a_w : GCCBuiltin<"__builtin_msa_fmin_a_w">,
+def int_mips_fmin_a_w : ClangBuiltin<"__builtin_msa_fmin_a_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmin_a_d : GCCBuiltin<"__builtin_msa_fmin_a_d">,
+def int_mips_fmin_a_d : ClangBuiltin<"__builtin_msa_fmin_a_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmsub_w : GCCBuiltin<"__builtin_msa_fmsub_w">,
+def int_mips_fmsub_w : ClangBuiltin<"__builtin_msa_fmsub_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmsub_d : GCCBuiltin<"__builtin_msa_fmsub_d">,
+def int_mips_fmsub_d : ClangBuiltin<"__builtin_msa_fmsub_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fmul_w : GCCBuiltin<"__builtin_msa_fmul_w">,
+def int_mips_fmul_w : ClangBuiltin<"__builtin_msa_fmul_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fmul_d : GCCBuiltin<"__builtin_msa_fmul_d">,
+def int_mips_fmul_d : ClangBuiltin<"__builtin_msa_fmul_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_frint_w : GCCBuiltin<"__builtin_msa_frint_w">,
+def int_mips_frint_w : ClangBuiltin<"__builtin_msa_frint_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_frint_d : GCCBuiltin<"__builtin_msa_frint_d">,
+def int_mips_frint_d : ClangBuiltin<"__builtin_msa_frint_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_frcp_w : GCCBuiltin<"__builtin_msa_frcp_w">,
+def int_mips_frcp_w : ClangBuiltin<"__builtin_msa_frcp_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_frcp_d : GCCBuiltin<"__builtin_msa_frcp_d">,
+def int_mips_frcp_d : ClangBuiltin<"__builtin_msa_frcp_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_frsqrt_w : GCCBuiltin<"__builtin_msa_frsqrt_w">,
+def int_mips_frsqrt_w : ClangBuiltin<"__builtin_msa_frsqrt_w">,
   Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_frsqrt_d : GCCBuiltin<"__builtin_msa_frsqrt_d">,
+def int_mips_frsqrt_d : ClangBuiltin<"__builtin_msa_frsqrt_d">,
   Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-def int_mips_fsaf_w : GCCBuiltin<"__builtin_msa_fsaf_w">,
+def int_mips_fsaf_w : ClangBuiltin<"__builtin_msa_fsaf_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-def int_mips_fsaf_d :
ClangBuiltin<"__builtin_msa_fsaf_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fseq_w : GCCBuiltin<"__builtin_msa_fseq_w">, +def int_mips_fseq_w : ClangBuiltin<"__builtin_msa_fseq_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fseq_d : GCCBuiltin<"__builtin_msa_fseq_d">, +def int_mips_fseq_d : ClangBuiltin<"__builtin_msa_fseq_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsle_w : GCCBuiltin<"__builtin_msa_fsle_w">, +def int_mips_fsle_w : ClangBuiltin<"__builtin_msa_fsle_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsle_d : GCCBuiltin<"__builtin_msa_fsle_d">, +def int_mips_fsle_d : ClangBuiltin<"__builtin_msa_fsle_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fslt_w : GCCBuiltin<"__builtin_msa_fslt_w">, +def int_mips_fslt_w : ClangBuiltin<"__builtin_msa_fslt_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fslt_d : GCCBuiltin<"__builtin_msa_fslt_d">, +def int_mips_fslt_d : ClangBuiltin<"__builtin_msa_fslt_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsne_w : GCCBuiltin<"__builtin_msa_fsne_w">, +def int_mips_fsne_w : ClangBuiltin<"__builtin_msa_fsne_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsne_d : GCCBuiltin<"__builtin_msa_fsne_d">, +def int_mips_fsne_d : ClangBuiltin<"__builtin_msa_fsne_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsor_w : GCCBuiltin<"__builtin_msa_fsor_w">, +def int_mips_fsor_w : ClangBuiltin<"__builtin_msa_fsor_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsor_d : GCCBuiltin<"__builtin_msa_fsor_d">, +def int_mips_fsor_d : ClangBuiltin<"__builtin_msa_fsor_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsqrt_w : GCCBuiltin<"__builtin_msa_fsqrt_w">, +def int_mips_fsqrt_w : ClangBuiltin<"__builtin_msa_fsqrt_w">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsqrt_d : GCCBuiltin<"__builtin_msa_fsqrt_d">, +def int_mips_fsqrt_d : ClangBuiltin<"__builtin_msa_fsqrt_d">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsub_w : GCCBuiltin<"__builtin_msa_fsub_w">, +def int_mips_fsub_w : ClangBuiltin<"__builtin_msa_fsub_w">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsub_d : GCCBuiltin<"__builtin_msa_fsub_d">, +def int_mips_fsub_d : ClangBuiltin<"__builtin_msa_fsub_d">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsueq_w : GCCBuiltin<"__builtin_msa_fsueq_w">, +def int_mips_fsueq_w : ClangBuiltin<"__builtin_msa_fsueq_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsueq_d : GCCBuiltin<"__builtin_msa_fsueq_d">, +def int_mips_fsueq_d : ClangBuiltin<"__builtin_msa_fsueq_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsule_w : GCCBuiltin<"__builtin_msa_fsule_w">, +def int_mips_fsule_w : ClangBuiltin<"__builtin_msa_fsule_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsule_d : GCCBuiltin<"__builtin_msa_fsule_d">, +def int_mips_fsule_d : ClangBuiltin<"__builtin_msa_fsule_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, 
llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsult_w : GCCBuiltin<"__builtin_msa_fsult_w">, +def int_mips_fsult_w : ClangBuiltin<"__builtin_msa_fsult_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsult_d : GCCBuiltin<"__builtin_msa_fsult_d">, +def int_mips_fsult_d : ClangBuiltin<"__builtin_msa_fsult_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsun_w : GCCBuiltin<"__builtin_msa_fsun_w">, +def int_mips_fsun_w : ClangBuiltin<"__builtin_msa_fsun_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsun_d : GCCBuiltin<"__builtin_msa_fsun_d">, +def int_mips_fsun_d : ClangBuiltin<"__builtin_msa_fsun_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_fsune_w : GCCBuiltin<"__builtin_msa_fsune_w">, +def int_mips_fsune_w : ClangBuiltin<"__builtin_msa_fsune_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_fsune_d : GCCBuiltin<"__builtin_msa_fsune_d">, +def int_mips_fsune_d : ClangBuiltin<"__builtin_msa_fsune_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_ftint_s_w : GCCBuiltin<"__builtin_msa_ftint_s_w">, +def int_mips_ftint_s_w : ClangBuiltin<"__builtin_msa_ftint_s_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_ftint_s_d : GCCBuiltin<"__builtin_msa_ftint_s_d">, +def int_mips_ftint_s_d : ClangBuiltin<"__builtin_msa_ftint_s_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_ftint_u_w : GCCBuiltin<"__builtin_msa_ftint_u_w">, +def int_mips_ftint_u_w : ClangBuiltin<"__builtin_msa_ftint_u_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_ftint_u_d : GCCBuiltin<"__builtin_msa_ftint_u_d">, +def int_mips_ftint_u_d : ClangBuiltin<"__builtin_msa_ftint_u_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_ftq_h : GCCBuiltin<"__builtin_msa_ftq_h">, +def int_mips_ftq_h : ClangBuiltin<"__builtin_msa_ftq_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_ftq_w : GCCBuiltin<"__builtin_msa_ftq_w">, +def int_mips_ftq_w : ClangBuiltin<"__builtin_msa_ftq_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_ftrunc_s_w : GCCBuiltin<"__builtin_msa_ftrunc_s_w">, +def int_mips_ftrunc_s_w : ClangBuiltin<"__builtin_msa_ftrunc_s_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_ftrunc_s_d : GCCBuiltin<"__builtin_msa_ftrunc_s_d">, +def int_mips_ftrunc_s_d : ClangBuiltin<"__builtin_msa_ftrunc_s_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_ftrunc_u_w : GCCBuiltin<"__builtin_msa_ftrunc_u_w">, +def int_mips_ftrunc_u_w : ClangBuiltin<"__builtin_msa_ftrunc_u_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_mips_ftrunc_u_d : GCCBuiltin<"__builtin_msa_ftrunc_u_d">, +def int_mips_ftrunc_u_d : ClangBuiltin<"__builtin_msa_ftrunc_u_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; -def int_mips_hadd_s_h : GCCBuiltin<"__builtin_msa_hadd_s_h">, +def int_mips_hadd_s_h : ClangBuiltin<"__builtin_msa_hadd_s_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_hadd_s_w : GCCBuiltin<"__builtin_msa_hadd_s_w">, +def int_mips_hadd_s_w : ClangBuiltin<"__builtin_msa_hadd_s_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_hadd_s_d : 
GCCBuiltin<"__builtin_msa_hadd_s_d">, +def int_mips_hadd_s_d : ClangBuiltin<"__builtin_msa_hadd_s_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_hadd_u_h : GCCBuiltin<"__builtin_msa_hadd_u_h">, +def int_mips_hadd_u_h : ClangBuiltin<"__builtin_msa_hadd_u_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_hadd_u_w : GCCBuiltin<"__builtin_msa_hadd_u_w">, +def int_mips_hadd_u_w : ClangBuiltin<"__builtin_msa_hadd_u_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_hadd_u_d : GCCBuiltin<"__builtin_msa_hadd_u_d">, +def int_mips_hadd_u_d : ClangBuiltin<"__builtin_msa_hadd_u_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_hsub_s_h : GCCBuiltin<"__builtin_msa_hsub_s_h">, +def int_mips_hsub_s_h : ClangBuiltin<"__builtin_msa_hsub_s_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_hsub_s_w : GCCBuiltin<"__builtin_msa_hsub_s_w">, +def int_mips_hsub_s_w : ClangBuiltin<"__builtin_msa_hsub_s_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_hsub_s_d : GCCBuiltin<"__builtin_msa_hsub_s_d">, +def int_mips_hsub_s_d : ClangBuiltin<"__builtin_msa_hsub_s_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_hsub_u_h : GCCBuiltin<"__builtin_msa_hsub_u_h">, +def int_mips_hsub_u_h : ClangBuiltin<"__builtin_msa_hsub_u_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_hsub_u_w : GCCBuiltin<"__builtin_msa_hsub_u_w">, +def int_mips_hsub_u_w : ClangBuiltin<"__builtin_msa_hsub_u_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_hsub_u_d : GCCBuiltin<"__builtin_msa_hsub_u_d">, +def int_mips_hsub_u_d : ClangBuiltin<"__builtin_msa_hsub_u_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_ilvev_b : GCCBuiltin<"__builtin_msa_ilvev_b">, +def int_mips_ilvev_b : ClangBuiltin<"__builtin_msa_ilvev_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_ilvev_h : GCCBuiltin<"__builtin_msa_ilvev_h">, +def int_mips_ilvev_h : ClangBuiltin<"__builtin_msa_ilvev_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_ilvev_w : GCCBuiltin<"__builtin_msa_ilvev_w">, +def int_mips_ilvev_w : ClangBuiltin<"__builtin_msa_ilvev_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_ilvev_d : GCCBuiltin<"__builtin_msa_ilvev_d">, +def int_mips_ilvev_d : ClangBuiltin<"__builtin_msa_ilvev_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; -def int_mips_ilvl_b : GCCBuiltin<"__builtin_msa_ilvl_b">, +def int_mips_ilvl_b : ClangBuiltin<"__builtin_msa_ilvl_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_ilvl_h : GCCBuiltin<"__builtin_msa_ilvl_h">, +def int_mips_ilvl_h : ClangBuiltin<"__builtin_msa_ilvl_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_ilvl_w : GCCBuiltin<"__builtin_msa_ilvl_w">, +def int_mips_ilvl_w : ClangBuiltin<"__builtin_msa_ilvl_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_ilvl_d : GCCBuiltin<"__builtin_msa_ilvl_d">, +def int_mips_ilvl_d : ClangBuiltin<"__builtin_msa_ilvl_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; -def 
int_mips_ilvod_b : GCCBuiltin<"__builtin_msa_ilvod_b">, +def int_mips_ilvod_b : ClangBuiltin<"__builtin_msa_ilvod_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_ilvod_h : GCCBuiltin<"__builtin_msa_ilvod_h">, +def int_mips_ilvod_h : ClangBuiltin<"__builtin_msa_ilvod_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_ilvod_w : GCCBuiltin<"__builtin_msa_ilvod_w">, +def int_mips_ilvod_w : ClangBuiltin<"__builtin_msa_ilvod_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_ilvod_d : GCCBuiltin<"__builtin_msa_ilvod_d">, +def int_mips_ilvod_d : ClangBuiltin<"__builtin_msa_ilvod_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; -def int_mips_ilvr_b : GCCBuiltin<"__builtin_msa_ilvr_b">, +def int_mips_ilvr_b : ClangBuiltin<"__builtin_msa_ilvr_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_ilvr_h : GCCBuiltin<"__builtin_msa_ilvr_h">, +def int_mips_ilvr_h : ClangBuiltin<"__builtin_msa_ilvr_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_ilvr_w : GCCBuiltin<"__builtin_msa_ilvr_w">, +def int_mips_ilvr_w : ClangBuiltin<"__builtin_msa_ilvr_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_ilvr_d : GCCBuiltin<"__builtin_msa_ilvr_d">, +def int_mips_ilvr_d : ClangBuiltin<"__builtin_msa_ilvr_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; -def int_mips_insert_b : GCCBuiltin<"__builtin_msa_insert_b">, +def int_mips_insert_b : ClangBuiltin<"__builtin_msa_insert_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_insert_h : GCCBuiltin<"__builtin_msa_insert_h">, +def int_mips_insert_h : ClangBuiltin<"__builtin_msa_insert_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_insert_w : GCCBuiltin<"__builtin_msa_insert_w">, +def int_mips_insert_w : ClangBuiltin<"__builtin_msa_insert_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_mips_insert_d : GCCBuiltin<"__builtin_msa_insert_d">, +def int_mips_insert_d : ClangBuiltin<"__builtin_msa_insert_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; -def int_mips_insve_b : GCCBuiltin<"__builtin_msa_insve_b">, +def int_mips_insve_b : ClangBuiltin<"__builtin_msa_insve_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty], [IntrNoMem, ImmArg>]>; -def int_mips_insve_h : GCCBuiltin<"__builtin_msa_insve_h">, +def int_mips_insve_h : ClangBuiltin<"__builtin_msa_insve_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty], [IntrNoMem, ImmArg>]>; -def int_mips_insve_w : GCCBuiltin<"__builtin_msa_insve_w">, +def int_mips_insve_w : ClangBuiltin<"__builtin_msa_insve_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_insve_d : GCCBuiltin<"__builtin_msa_insve_d">, +def int_mips_insve_d : ClangBuiltin<"__builtin_msa_insve_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty], [IntrNoMem, ImmArg>]>; -def int_mips_ld_b : GCCBuiltin<"__builtin_msa_ld_b">, +def int_mips_ld_b : ClangBuiltin<"__builtin_msa_ld_b">, Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ld_h : GCCBuiltin<"__builtin_msa_ld_h">, +def int_mips_ld_h : 
ClangBuiltin<"__builtin_msa_ld_h">, Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ld_w : GCCBuiltin<"__builtin_msa_ld_w">, +def int_mips_ld_w : ClangBuiltin<"__builtin_msa_ld_w">, Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ld_d : GCCBuiltin<"__builtin_msa_ld_d">, +def int_mips_ld_d : ClangBuiltin<"__builtin_msa_ld_d">, Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ldr_d : GCCBuiltin<"__builtin_msa_ldr_d">, +def int_mips_ldr_d : ClangBuiltin<"__builtin_msa_ldr_d">, Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ldr_w : GCCBuiltin<"__builtin_msa_ldr_w">, +def int_mips_ldr_w : ClangBuiltin<"__builtin_msa_ldr_w">, Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_mips_ldi_b : GCCBuiltin<"__builtin_msa_ldi_b">, +def int_mips_ldi_b : ClangBuiltin<"__builtin_msa_ldi_b">, Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_ldi_h : GCCBuiltin<"__builtin_msa_ldi_h">, +def int_mips_ldi_h : ClangBuiltin<"__builtin_msa_ldi_h">, Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_ldi_w : GCCBuiltin<"__builtin_msa_ldi_w">, +def int_mips_ldi_w : ClangBuiltin<"__builtin_msa_ldi_w">, Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; -def int_mips_ldi_d : GCCBuiltin<"__builtin_msa_ldi_d">, +def int_mips_ldi_d : ClangBuiltin<"__builtin_msa_ldi_d">, Intrinsic<[llvm_v2i64_ty], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; // This instruction is part of the MSA spec but it does not share the // __builtin_msa prefix because it operates on the GPR registers. 
-def int_mips_lsa : GCCBuiltin<"__builtin_mips_lsa">,
+def int_mips_lsa : ClangBuiltin<"__builtin_mips_lsa">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_madd_q_h : GCCBuiltin<"__builtin_msa_madd_q_h">,
+def int_mips_madd_q_h : ClangBuiltin<"__builtin_msa_madd_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_madd_q_w : GCCBuiltin<"__builtin_msa_madd_q_w">,
+def int_mips_madd_q_w : ClangBuiltin<"__builtin_msa_madd_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_maddr_q_h : GCCBuiltin<"__builtin_msa_maddr_q_h">,
+def int_mips_maddr_q_h : ClangBuiltin<"__builtin_msa_maddr_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_maddr_q_w : GCCBuiltin<"__builtin_msa_maddr_q_w">,
+def int_mips_maddr_q_w : ClangBuiltin<"__builtin_msa_maddr_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_maddv_b : GCCBuiltin<"__builtin_msa_maddv_b">,
+def int_mips_maddv_b : ClangBuiltin<"__builtin_msa_maddv_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_maddv_h : GCCBuiltin<"__builtin_msa_maddv_h">,
+def int_mips_maddv_h : ClangBuiltin<"__builtin_msa_maddv_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_maddv_w : GCCBuiltin<"__builtin_msa_maddv_w">,
+def int_mips_maddv_w : ClangBuiltin<"__builtin_msa_maddv_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_maddv_d : GCCBuiltin<"__builtin_msa_maddv_d">,
+def int_mips_maddv_d : ClangBuiltin<"__builtin_msa_maddv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_max_a_b : GCCBuiltin<"__builtin_msa_max_a_b">,
+def int_mips_max_a_b : ClangBuiltin<"__builtin_msa_max_a_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_max_a_h : GCCBuiltin<"__builtin_msa_max_a_h">,
+def int_mips_max_a_h : ClangBuiltin<"__builtin_msa_max_a_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_max_a_w : GCCBuiltin<"__builtin_msa_max_a_w">,
+def int_mips_max_a_w : ClangBuiltin<"__builtin_msa_max_a_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_max_a_d : GCCBuiltin<"__builtin_msa_max_a_d">,
+def int_mips_max_a_d : ClangBuiltin<"__builtin_msa_max_a_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_max_s_b : GCCBuiltin<"__builtin_msa_max_s_b">,
+def int_mips_max_s_b : ClangBuiltin<"__builtin_msa_max_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_max_s_h : GCCBuiltin<"__builtin_msa_max_s_h">,
+def int_mips_max_s_h : ClangBuiltin<"__builtin_msa_max_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_max_s_w : GCCBuiltin<"__builtin_msa_max_s_w">,
+def int_mips_max_s_w : ClangBuiltin<"__builtin_msa_max_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_max_s_d : GCCBuiltin<"__builtin_msa_max_s_d">,
+def int_mips_max_s_d : ClangBuiltin<"__builtin_msa_max_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_max_u_b : GCCBuiltin<"__builtin_msa_max_u_b">,
+def int_mips_max_u_b : ClangBuiltin<"__builtin_msa_max_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_max_u_h : GCCBuiltin<"__builtin_msa_max_u_h">,
+def int_mips_max_u_h : ClangBuiltin<"__builtin_msa_max_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_max_u_w : GCCBuiltin<"__builtin_msa_max_u_w">,
+def int_mips_max_u_w : ClangBuiltin<"__builtin_msa_max_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_max_u_d : GCCBuiltin<"__builtin_msa_max_u_d">,
+def int_mips_max_u_d : ClangBuiltin<"__builtin_msa_max_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_maxi_s_b : GCCBuiltin<"__builtin_msa_maxi_s_b">,
+def int_mips_maxi_s_b : ClangBuiltin<"__builtin_msa_maxi_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_s_h : GCCBuiltin<"__builtin_msa_maxi_s_h">,
+def int_mips_maxi_s_h : ClangBuiltin<"__builtin_msa_maxi_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_s_w : GCCBuiltin<"__builtin_msa_maxi_s_w">,
+def int_mips_maxi_s_w : ClangBuiltin<"__builtin_msa_maxi_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_s_d : GCCBuiltin<"__builtin_msa_maxi_s_d">,
+def int_mips_maxi_s_d : ClangBuiltin<"__builtin_msa_maxi_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_u_b : GCCBuiltin<"__builtin_msa_maxi_u_b">,
+def int_mips_maxi_u_b : ClangBuiltin<"__builtin_msa_maxi_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_u_h : GCCBuiltin<"__builtin_msa_maxi_u_h">,
+def int_mips_maxi_u_h : ClangBuiltin<"__builtin_msa_maxi_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_u_w : GCCBuiltin<"__builtin_msa_maxi_u_w">,
+def int_mips_maxi_u_w : ClangBuiltin<"__builtin_msa_maxi_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_maxi_u_d : GCCBuiltin<"__builtin_msa_maxi_u_d">,
+def int_mips_maxi_u_d : ClangBuiltin<"__builtin_msa_maxi_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_min_a_b : GCCBuiltin<"__builtin_msa_min_a_b">,
+def int_mips_min_a_b : ClangBuiltin<"__builtin_msa_min_a_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_min_a_h : GCCBuiltin<"__builtin_msa_min_a_h">,
+def int_mips_min_a_h : ClangBuiltin<"__builtin_msa_min_a_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_min_a_w : GCCBuiltin<"__builtin_msa_min_a_w">,
+def int_mips_min_a_w : ClangBuiltin<"__builtin_msa_min_a_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_min_a_d : GCCBuiltin<"__builtin_msa_min_a_d">,
+def int_mips_min_a_d : ClangBuiltin<"__builtin_msa_min_a_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_min_s_b : GCCBuiltin<"__builtin_msa_min_s_b">,
+def int_mips_min_s_b : ClangBuiltin<"__builtin_msa_min_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_min_s_h : GCCBuiltin<"__builtin_msa_min_s_h">,
+def int_mips_min_s_h : ClangBuiltin<"__builtin_msa_min_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_min_s_w : GCCBuiltin<"__builtin_msa_min_s_w">,
+def int_mips_min_s_w : ClangBuiltin<"__builtin_msa_min_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_min_s_d : GCCBuiltin<"__builtin_msa_min_s_d">,
+def int_mips_min_s_d : ClangBuiltin<"__builtin_msa_min_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_min_u_b : GCCBuiltin<"__builtin_msa_min_u_b">,
+def int_mips_min_u_b : ClangBuiltin<"__builtin_msa_min_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_min_u_h : GCCBuiltin<"__builtin_msa_min_u_h">,
+def int_mips_min_u_h : ClangBuiltin<"__builtin_msa_min_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_min_u_w : GCCBuiltin<"__builtin_msa_min_u_w">,
+def int_mips_min_u_w : ClangBuiltin<"__builtin_msa_min_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_min_u_d : GCCBuiltin<"__builtin_msa_min_u_d">,
+def int_mips_min_u_d : ClangBuiltin<"__builtin_msa_min_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_mini_s_b : GCCBuiltin<"__builtin_msa_mini_s_b">,
+def int_mips_mini_s_b : ClangBuiltin<"__builtin_msa_mini_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_s_h : GCCBuiltin<"__builtin_msa_mini_s_h">,
+def int_mips_mini_s_h : ClangBuiltin<"__builtin_msa_mini_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_s_w : GCCBuiltin<"__builtin_msa_mini_s_w">,
+def int_mips_mini_s_w : ClangBuiltin<"__builtin_msa_mini_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_s_d : GCCBuiltin<"__builtin_msa_mini_s_d">,
+def int_mips_mini_s_d : ClangBuiltin<"__builtin_msa_mini_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_u_b : GCCBuiltin<"__builtin_msa_mini_u_b">,
+def int_mips_mini_u_b : ClangBuiltin<"__builtin_msa_mini_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_u_h : GCCBuiltin<"__builtin_msa_mini_u_h">,
+def int_mips_mini_u_h : ClangBuiltin<"__builtin_msa_mini_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_u_w : GCCBuiltin<"__builtin_msa_mini_u_w">,
+def int_mips_mini_u_w : ClangBuiltin<"__builtin_msa_mini_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mini_u_d : GCCBuiltin<"__builtin_msa_mini_u_d">,
+def int_mips_mini_u_d : ClangBuiltin<"__builtin_msa_mini_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_mod_s_b : GCCBuiltin<"__builtin_msa_mod_s_b">,
+def int_mips_mod_s_b : ClangBuiltin<"__builtin_msa_mod_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_mod_s_h : GCCBuiltin<"__builtin_msa_mod_s_h">,
+def int_mips_mod_s_h : ClangBuiltin<"__builtin_msa_mod_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_mod_s_w : GCCBuiltin<"__builtin_msa_mod_s_w">,
+def int_mips_mod_s_w : ClangBuiltin<"__builtin_msa_mod_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_mod_s_d : GCCBuiltin<"__builtin_msa_mod_s_d">,
+def int_mips_mod_s_d : ClangBuiltin<"__builtin_msa_mod_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_mod_u_b : GCCBuiltin<"__builtin_msa_mod_u_b">,
+def int_mips_mod_u_b : ClangBuiltin<"__builtin_msa_mod_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_mod_u_h : GCCBuiltin<"__builtin_msa_mod_u_h">,
+def int_mips_mod_u_h : ClangBuiltin<"__builtin_msa_mod_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_mod_u_w : GCCBuiltin<"__builtin_msa_mod_u_w">,
+def int_mips_mod_u_w : ClangBuiltin<"__builtin_msa_mod_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_mod_u_d : GCCBuiltin<"__builtin_msa_mod_u_d">,
+def int_mips_mod_u_d : ClangBuiltin<"__builtin_msa_mod_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_move_v : GCCBuiltin<"__builtin_msa_move_v">,
+def int_mips_move_v : ClangBuiltin<"__builtin_msa_move_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_msub_q_h : GCCBuiltin<"__builtin_msa_msub_q_h">,
+def int_mips_msub_q_h : ClangBuiltin<"__builtin_msa_msub_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_msub_q_w : GCCBuiltin<"__builtin_msa_msub_q_w">,
+def int_mips_msub_q_w : ClangBuiltin<"__builtin_msa_msub_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_msubr_q_h : GCCBuiltin<"__builtin_msa_msubr_q_h">,
+def int_mips_msubr_q_h : ClangBuiltin<"__builtin_msa_msubr_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_msubr_q_w : GCCBuiltin<"__builtin_msa_msubr_q_w">,
+def int_mips_msubr_q_w : ClangBuiltin<"__builtin_msa_msubr_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_msubv_b : GCCBuiltin<"__builtin_msa_msubv_b">,
+def int_mips_msubv_b : ClangBuiltin<"__builtin_msa_msubv_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_msubv_h : GCCBuiltin<"__builtin_msa_msubv_h">,
+def int_mips_msubv_h : ClangBuiltin<"__builtin_msa_msubv_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_msubv_w : GCCBuiltin<"__builtin_msa_msubv_w">,
+def int_mips_msubv_w : ClangBuiltin<"__builtin_msa_msubv_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_msubv_d : GCCBuiltin<"__builtin_msa_msubv_d">,
+def int_mips_msubv_d : ClangBuiltin<"__builtin_msa_msubv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_mul_q_h : GCCBuiltin<"__builtin_msa_mul_q_h">,
+def int_mips_mul_q_h : ClangBuiltin<"__builtin_msa_mul_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_mul_q_w : GCCBuiltin<"__builtin_msa_mul_q_w">,
+def int_mips_mul_q_w : ClangBuiltin<"__builtin_msa_mul_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_mulr_q_h : GCCBuiltin<"__builtin_msa_mulr_q_h">,
+def int_mips_mulr_q_h : ClangBuiltin<"__builtin_msa_mulr_q_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_mulr_q_w : GCCBuiltin<"__builtin_msa_mulr_q_w">,
+def int_mips_mulr_q_w : ClangBuiltin<"__builtin_msa_mulr_q_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_mulv_b : GCCBuiltin<"__builtin_msa_mulv_b">,
+def int_mips_mulv_b : ClangBuiltin<"__builtin_msa_mulv_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_mulv_h : GCCBuiltin<"__builtin_msa_mulv_h">,
+def int_mips_mulv_h : ClangBuiltin<"__builtin_msa_mulv_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_mulv_w : GCCBuiltin<"__builtin_msa_mulv_w">,
+def int_mips_mulv_w : ClangBuiltin<"__builtin_msa_mulv_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_mulv_d : GCCBuiltin<"__builtin_msa_mulv_d">,
+def int_mips_mulv_d : ClangBuiltin<"__builtin_msa_mulv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_nloc_b : GCCBuiltin<"__builtin_msa_nloc_b">,
+def int_mips_nloc_b : ClangBuiltin<"__builtin_msa_nloc_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_nloc_h : GCCBuiltin<"__builtin_msa_nloc_h">,
+def int_mips_nloc_h : ClangBuiltin<"__builtin_msa_nloc_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_nloc_w : GCCBuiltin<"__builtin_msa_nloc_w">,
+def int_mips_nloc_w : ClangBuiltin<"__builtin_msa_nloc_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_nloc_d : GCCBuiltin<"__builtin_msa_nloc_d">,
+def int_mips_nloc_d : ClangBuiltin<"__builtin_msa_nloc_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_nlzc_b : GCCBuiltin<"__builtin_msa_nlzc_b">,
+def int_mips_nlzc_b : ClangBuiltin<"__builtin_msa_nlzc_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_nlzc_h : GCCBuiltin<"__builtin_msa_nlzc_h">,
+def int_mips_nlzc_h : ClangBuiltin<"__builtin_msa_nlzc_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_nlzc_w : GCCBuiltin<"__builtin_msa_nlzc_w">,
+def int_mips_nlzc_w : ClangBuiltin<"__builtin_msa_nlzc_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_nlzc_d : GCCBuiltin<"__builtin_msa_nlzc_d">,
+def int_mips_nlzc_d : ClangBuiltin<"__builtin_msa_nlzc_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_nor_v : GCCBuiltin<"__builtin_msa_nor_v">,
+def int_mips_nor_v : ClangBuiltin<"__builtin_msa_nor_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_nori_b : GCCBuiltin<"__builtin_msa_nori_b">,
+def int_mips_nori_b : ClangBuiltin<"__builtin_msa_nori_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_or_v : GCCBuiltin<"__builtin_msa_or_v">,
+def int_mips_or_v : ClangBuiltin<"__builtin_msa_or_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_ori_b : GCCBuiltin<"__builtin_msa_ori_b">,
+def int_mips_ori_b : ClangBuiltin<"__builtin_msa_ori_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_pckev_b : GCCBuiltin<"__builtin_msa_pckev_b">,
+def int_mips_pckev_b : ClangBuiltin<"__builtin_msa_pckev_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_pckev_h : GCCBuiltin<"__builtin_msa_pckev_h">,
+def int_mips_pckev_h : ClangBuiltin<"__builtin_msa_pckev_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_pckev_w : GCCBuiltin<"__builtin_msa_pckev_w">,
+def int_mips_pckev_w : ClangBuiltin<"__builtin_msa_pckev_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_pckev_d : GCCBuiltin<"__builtin_msa_pckev_d">,
+def int_mips_pckev_d : ClangBuiltin<"__builtin_msa_pckev_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_pckod_b : GCCBuiltin<"__builtin_msa_pckod_b">,
+def int_mips_pckod_b : ClangBuiltin<"__builtin_msa_pckod_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_pckod_h : GCCBuiltin<"__builtin_msa_pckod_h">,
+def int_mips_pckod_h : ClangBuiltin<"__builtin_msa_pckod_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_pckod_w : GCCBuiltin<"__builtin_msa_pckod_w">,
+def int_mips_pckod_w : ClangBuiltin<"__builtin_msa_pckod_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_pckod_d : GCCBuiltin<"__builtin_msa_pckod_d">,
+def int_mips_pckod_d : ClangBuiltin<"__builtin_msa_pckod_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_pcnt_b : GCCBuiltin<"__builtin_msa_pcnt_b">,
+def int_mips_pcnt_b : ClangBuiltin<"__builtin_msa_pcnt_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_pcnt_h : GCCBuiltin<"__builtin_msa_pcnt_h">,
+def int_mips_pcnt_h : ClangBuiltin<"__builtin_msa_pcnt_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_pcnt_w : GCCBuiltin<"__builtin_msa_pcnt_w">,
+def int_mips_pcnt_w : ClangBuiltin<"__builtin_msa_pcnt_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_pcnt_d : GCCBuiltin<"__builtin_msa_pcnt_d">,
+def int_mips_pcnt_d : ClangBuiltin<"__builtin_msa_pcnt_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_sat_s_b : GCCBuiltin<"__builtin_msa_sat_s_b">,
+def int_mips_sat_s_b : ClangBuiltin<"__builtin_msa_sat_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_s_h : GCCBuiltin<"__builtin_msa_sat_s_h">,
+def int_mips_sat_s_h : ClangBuiltin<"__builtin_msa_sat_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_s_w : GCCBuiltin<"__builtin_msa_sat_s_w">,
+def int_mips_sat_s_w : ClangBuiltin<"__builtin_msa_sat_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_s_d : GCCBuiltin<"__builtin_msa_sat_s_d">,
+def int_mips_sat_s_d : ClangBuiltin<"__builtin_msa_sat_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_u_b : GCCBuiltin<"__builtin_msa_sat_u_b">,
+def int_mips_sat_u_b : ClangBuiltin<"__builtin_msa_sat_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_u_h : GCCBuiltin<"__builtin_msa_sat_u_h">,
+def int_mips_sat_u_h : ClangBuiltin<"__builtin_msa_sat_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_u_w : GCCBuiltin<"__builtin_msa_sat_u_w">,
+def int_mips_sat_u_w : ClangBuiltin<"__builtin_msa_sat_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sat_u_d : GCCBuiltin<"__builtin_msa_sat_u_d">,
+def int_mips_sat_u_d : ClangBuiltin<"__builtin_msa_sat_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_shf_b : GCCBuiltin<"__builtin_msa_shf_b">,
+def int_mips_shf_b : ClangBuiltin<"__builtin_msa_shf_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_shf_h : GCCBuiltin<"__builtin_msa_shf_h">,
+def int_mips_shf_h : ClangBuiltin<"__builtin_msa_shf_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_shf_w : GCCBuiltin<"__builtin_msa_shf_w">,
+def int_mips_shf_w : ClangBuiltin<"__builtin_msa_shf_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sld_b : GCCBuiltin<"__builtin_msa_sld_b">,
+def int_mips_sld_b : ClangBuiltin<"__builtin_msa_sld_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_sld_h : GCCBuiltin<"__builtin_msa_sld_h">,
+def int_mips_sld_h : ClangBuiltin<"__builtin_msa_sld_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_sld_w : GCCBuiltin<"__builtin_msa_sld_w">,
+def int_mips_sld_w : ClangBuiltin<"__builtin_msa_sld_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_sld_d : GCCBuiltin<"__builtin_msa_sld_d">,
+def int_mips_sld_d : ClangBuiltin<"__builtin_msa_sld_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_sldi_b : GCCBuiltin<"__builtin_msa_sldi_b">,
+def int_mips_sldi_b : ClangBuiltin<"__builtin_msa_sldi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sldi_h : GCCBuiltin<"__builtin_msa_sldi_h">,
+def int_mips_sldi_h : ClangBuiltin<"__builtin_msa_sldi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sldi_w : GCCBuiltin<"__builtin_msa_sldi_w">,
+def int_mips_sldi_w : ClangBuiltin<"__builtin_msa_sldi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sldi_d : GCCBuiltin<"__builtin_msa_sldi_d">,
+def int_mips_sldi_d : ClangBuiltin<"__builtin_msa_sldi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sll_b : GCCBuiltin<"__builtin_msa_sll_b">,
+def int_mips_sll_b : ClangBuiltin<"__builtin_msa_sll_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_sll_h : GCCBuiltin<"__builtin_msa_sll_h">,
+def int_mips_sll_h : ClangBuiltin<"__builtin_msa_sll_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_sll_w : GCCBuiltin<"__builtin_msa_sll_w">,
+def int_mips_sll_w : ClangBuiltin<"__builtin_msa_sll_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_sll_d : GCCBuiltin<"__builtin_msa_sll_d">,
+def int_mips_sll_d : ClangBuiltin<"__builtin_msa_sll_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_slli_b : GCCBuiltin<"__builtin_msa_slli_b">,
+def int_mips_slli_b : ClangBuiltin<"__builtin_msa_slli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_slli_h : GCCBuiltin<"__builtin_msa_slli_h">,
+def int_mips_slli_h : ClangBuiltin<"__builtin_msa_slli_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_slli_w : GCCBuiltin<"__builtin_msa_slli_w">,
+def int_mips_slli_w : ClangBuiltin<"__builtin_msa_slli_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_slli_d : GCCBuiltin<"__builtin_msa_slli_d">,
+def int_mips_slli_d : ClangBuiltin<"__builtin_msa_slli_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_splat_b : GCCBuiltin<"__builtin_msa_splat_b">,
+def int_mips_splat_b : ClangBuiltin<"__builtin_msa_splat_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_splat_h : GCCBuiltin<"__builtin_msa_splat_h">,
+def int_mips_splat_h : ClangBuiltin<"__builtin_msa_splat_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_splat_w : GCCBuiltin<"__builtin_msa_splat_w">,
+def int_mips_splat_w : ClangBuiltin<"__builtin_msa_splat_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_splat_d : GCCBuiltin<"__builtin_msa_splat_d">,
+def int_mips_splat_d : ClangBuiltin<"__builtin_msa_splat_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_mips_splati_b : GCCBuiltin<"__builtin_msa_splati_b">,
+def int_mips_splati_b : ClangBuiltin<"__builtin_msa_splati_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_splati_h : GCCBuiltin<"__builtin_msa_splati_h">,
+def int_mips_splati_h : ClangBuiltin<"__builtin_msa_splati_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_splati_w : GCCBuiltin<"__builtin_msa_splati_w">,
+def int_mips_splati_w : ClangBuiltin<"__builtin_msa_splati_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_splati_d : GCCBuiltin<"__builtin_msa_splati_d">,
+def int_mips_splati_d : ClangBuiltin<"__builtin_msa_splati_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_sra_b : GCCBuiltin<"__builtin_msa_sra_b">,
+def int_mips_sra_b : ClangBuiltin<"__builtin_msa_sra_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_sra_h : GCCBuiltin<"__builtin_msa_sra_h">,
+def int_mips_sra_h : ClangBuiltin<"__builtin_msa_sra_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_sra_w : GCCBuiltin<"__builtin_msa_sra_w">,
+def int_mips_sra_w : ClangBuiltin<"__builtin_msa_sra_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_sra_d : GCCBuiltin<"__builtin_msa_sra_d">,
+def int_mips_sra_d : ClangBuiltin<"__builtin_msa_sra_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_srai_b : GCCBuiltin<"__builtin_msa_srai_b">,
+def int_mips_srai_b : ClangBuiltin<"__builtin_msa_srai_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srai_h : GCCBuiltin<"__builtin_msa_srai_h">,
+def int_mips_srai_h : ClangBuiltin<"__builtin_msa_srai_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srai_w : GCCBuiltin<"__builtin_msa_srai_w">,
+def int_mips_srai_w : ClangBuiltin<"__builtin_msa_srai_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srai_d : GCCBuiltin<"__builtin_msa_srai_d">,
+def int_mips_srai_d : ClangBuiltin<"__builtin_msa_srai_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srar_b : GCCBuiltin<"__builtin_msa_srar_b">,
+def int_mips_srar_b : ClangBuiltin<"__builtin_msa_srar_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_srar_h : GCCBuiltin<"__builtin_msa_srar_h">,
+def int_mips_srar_h : ClangBuiltin<"__builtin_msa_srar_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_srar_w : GCCBuiltin<"__builtin_msa_srar_w">,
+def int_mips_srar_w : ClangBuiltin<"__builtin_msa_srar_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_srar_d : GCCBuiltin<"__builtin_msa_srar_d">,
+def int_mips_srar_d : ClangBuiltin<"__builtin_msa_srar_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_srari_b : GCCBuiltin<"__builtin_msa_srari_b">,
+def int_mips_srari_b : ClangBuiltin<"__builtin_msa_srari_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srari_h : GCCBuiltin<"__builtin_msa_srari_h">,
+def int_mips_srari_h : ClangBuiltin<"__builtin_msa_srari_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srari_w : GCCBuiltin<"__builtin_msa_srari_w">,
+def int_mips_srari_w : ClangBuiltin<"__builtin_msa_srari_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srari_d : GCCBuiltin<"__builtin_msa_srari_d">,
+def int_mips_srari_d : ClangBuiltin<"__builtin_msa_srari_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srl_b : GCCBuiltin<"__builtin_msa_srl_b">,
+def int_mips_srl_b : ClangBuiltin<"__builtin_msa_srl_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_srl_h : GCCBuiltin<"__builtin_msa_srl_h">,
+def int_mips_srl_h : ClangBuiltin<"__builtin_msa_srl_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_srl_w : GCCBuiltin<"__builtin_msa_srl_w">,
+def int_mips_srl_w : ClangBuiltin<"__builtin_msa_srl_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_srl_d : GCCBuiltin<"__builtin_msa_srl_d">,
+def int_mips_srl_d : ClangBuiltin<"__builtin_msa_srl_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_srli_b : GCCBuiltin<"__builtin_msa_srli_b">,
+def int_mips_srli_b : ClangBuiltin<"__builtin_msa_srli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srli_h : GCCBuiltin<"__builtin_msa_srli_h">,
+def int_mips_srli_h : ClangBuiltin<"__builtin_msa_srli_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srli_w : GCCBuiltin<"__builtin_msa_srli_w">,
+def int_mips_srli_w : ClangBuiltin<"__builtin_msa_srli_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srli_d : GCCBuiltin<"__builtin_msa_srli_d">,
+def int_mips_srli_d : ClangBuiltin<"__builtin_msa_srli_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srlr_b : GCCBuiltin<"__builtin_msa_srlr_b">,
+def int_mips_srlr_b : ClangBuiltin<"__builtin_msa_srlr_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_srlr_h : GCCBuiltin<"__builtin_msa_srlr_h">,
+def int_mips_srlr_h : ClangBuiltin<"__builtin_msa_srlr_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_srlr_w : GCCBuiltin<"__builtin_msa_srlr_w">,
+def int_mips_srlr_w : ClangBuiltin<"__builtin_msa_srlr_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_srlr_d : GCCBuiltin<"__builtin_msa_srlr_d">,
+def int_mips_srlr_d : ClangBuiltin<"__builtin_msa_srlr_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_srlri_b : GCCBuiltin<"__builtin_msa_srlri_b">,
+def int_mips_srlri_b : ClangBuiltin<"__builtin_msa_srlri_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srlri_h : GCCBuiltin<"__builtin_msa_srlri_h">,
+def int_mips_srlri_h : ClangBuiltin<"__builtin_msa_srlri_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srlri_w : GCCBuiltin<"__builtin_msa_srlri_w">,
+def int_mips_srlri_w : ClangBuiltin<"__builtin_msa_srlri_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_srlri_d : GCCBuiltin<"__builtin_msa_srlri_d">,
+def int_mips_srlri_d : ClangBuiltin<"__builtin_msa_srlri_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_st_b : GCCBuiltin<"__builtin_msa_st_b">,
+def int_mips_st_b : ClangBuiltin<"__builtin_msa_st_b">,
   Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_st_h : GCCBuiltin<"__builtin_msa_st_h">,
+def int_mips_st_h : ClangBuiltin<"__builtin_msa_st_h">,
   Intrinsic<[], [llvm_v8i16_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_st_w : GCCBuiltin<"__builtin_msa_st_w">,
+def int_mips_st_w : ClangBuiltin<"__builtin_msa_st_w">,
   Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_st_d : GCCBuiltin<"__builtin_msa_st_d">,
+def int_mips_st_d : ClangBuiltin<"__builtin_msa_st_d">,
   Intrinsic<[], [llvm_v2i64_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_str_d : GCCBuiltin<"__builtin_msa_str_d">,
+def int_mips_str_d : ClangBuiltin<"__builtin_msa_str_d">,
   Intrinsic<[], [llvm_v2i64_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_str_w : GCCBuiltin<"__builtin_msa_str_w">,
+def int_mips_str_w : ClangBuiltin<"__builtin_msa_str_w">,
   Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrArgMemOnly]>;
-def int_mips_subs_s_b : GCCBuiltin<"__builtin_msa_subs_s_b">,
+def int_mips_subs_s_b : ClangBuiltin<"__builtin_msa_subs_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_subs_s_h : GCCBuiltin<"__builtin_msa_subs_s_h">,
+def int_mips_subs_s_h : ClangBuiltin<"__builtin_msa_subs_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_subs_s_w : GCCBuiltin<"__builtin_msa_subs_s_w">,
+def int_mips_subs_s_w : ClangBuiltin<"__builtin_msa_subs_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_subs_s_d : GCCBuiltin<"__builtin_msa_subs_s_d">,
+def int_mips_subs_s_d : ClangBuiltin<"__builtin_msa_subs_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_subs_u_b : GCCBuiltin<"__builtin_msa_subs_u_b">,
+def int_mips_subs_u_b : ClangBuiltin<"__builtin_msa_subs_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_subs_u_h : GCCBuiltin<"__builtin_msa_subs_u_h">,
+def int_mips_subs_u_h : ClangBuiltin<"__builtin_msa_subs_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_subs_u_w : GCCBuiltin<"__builtin_msa_subs_u_w">,
+def int_mips_subs_u_w : ClangBuiltin<"__builtin_msa_subs_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_subs_u_d : GCCBuiltin<"__builtin_msa_subs_u_d">,
+def int_mips_subs_u_d : ClangBuiltin<"__builtin_msa_subs_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_subsus_u_b : GCCBuiltin<"__builtin_msa_subsus_u_b">,
+def int_mips_subsus_u_b : ClangBuiltin<"__builtin_msa_subsus_u_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_subsus_u_h : GCCBuiltin<"__builtin_msa_subsus_u_h">,
+def int_mips_subsus_u_h : ClangBuiltin<"__builtin_msa_subsus_u_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_subsus_u_w : GCCBuiltin<"__builtin_msa_subsus_u_w">,
+def int_mips_subsus_u_w : ClangBuiltin<"__builtin_msa_subsus_u_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_subsus_u_d : GCCBuiltin<"__builtin_msa_subsus_u_d">,
+def int_mips_subsus_u_d : ClangBuiltin<"__builtin_msa_subsus_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_subsuu_s_b : GCCBuiltin<"__builtin_msa_subsuu_s_b">,
+def int_mips_subsuu_s_b : ClangBuiltin<"__builtin_msa_subsuu_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_subsuu_s_h : GCCBuiltin<"__builtin_msa_subsuu_s_h">,
+def int_mips_subsuu_s_h : ClangBuiltin<"__builtin_msa_subsuu_s_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_subsuu_s_w : GCCBuiltin<"__builtin_msa_subsuu_s_w">,
+def int_mips_subsuu_s_w : ClangBuiltin<"__builtin_msa_subsuu_s_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_subsuu_s_d : GCCBuiltin<"__builtin_msa_subsuu_s_d">,
+def int_mips_subsuu_s_d : ClangBuiltin<"__builtin_msa_subsuu_s_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_subv_b : GCCBuiltin<"__builtin_msa_subv_b">,
+def int_mips_subv_b : ClangBuiltin<"__builtin_msa_subv_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-def int_mips_subv_h : GCCBuiltin<"__builtin_msa_subv_h">,
+def int_mips_subv_h : ClangBuiltin<"__builtin_msa_subv_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-def int_mips_subv_w : GCCBuiltin<"__builtin_msa_subv_w">,
+def int_mips_subv_w : ClangBuiltin<"__builtin_msa_subv_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-def int_mips_subv_d : GCCBuiltin<"__builtin_msa_subv_d">,
+def int_mips_subv_d : ClangBuiltin<"__builtin_msa_subv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-def int_mips_subvi_b : GCCBuiltin<"__builtin_msa_subvi_b">,
+def int_mips_subvi_b : ClangBuiltin<"__builtin_msa_subvi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_subvi_h : GCCBuiltin<"__builtin_msa_subvi_h">,
+def int_mips_subvi_h : ClangBuiltin<"__builtin_msa_subvi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_subvi_w : GCCBuiltin<"__builtin_msa_subvi_w">,
+def int_mips_subvi_w : ClangBuiltin<"__builtin_msa_subvi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_subvi_d : GCCBuiltin<"__builtin_msa_subvi_d">,
+def int_mips_subvi_d : ClangBuiltin<"__builtin_msa_subvi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>;
-def int_mips_vshf_b : GCCBuiltin<"__builtin_msa_vshf_b">,
+def int_mips_vshf_b : ClangBuiltin<"__builtin_msa_vshf_b">,
   Intrinsic<[llvm_v16i8_ty],
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_vshf_h : GCCBuiltin<"__builtin_msa_vshf_h">, +def int_mips_vshf_h : ClangBuiltin<"__builtin_msa_vshf_h">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; -def int_mips_vshf_w : GCCBuiltin<"__builtin_msa_vshf_w">, +def int_mips_vshf_w : ClangBuiltin<"__builtin_msa_vshf_w">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_mips_vshf_d : GCCBuiltin<"__builtin_msa_vshf_d">, +def int_mips_vshf_d : ClangBuiltin<"__builtin_msa_vshf_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; -def int_mips_xor_v : GCCBuiltin<"__builtin_msa_xor_v">, +def int_mips_xor_v : ClangBuiltin<"__builtin_msa_xor_v">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_mips_xori_b : GCCBuiltin<"__builtin_msa_xori_b">, +def int_mips_xori_b : ClangBuiltin<"__builtin_msa_xori_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 41b28db56c75..9c3813128364 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -556,95 +556,124 @@ class SHFL_INFO { } let TargetPrefix = "nvvm" in { - def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">, + def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, Commutative]>; + [IntrNoMem, IntrSpeculatable]>; // // Min Max // - def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + foreach operation = ["min", "max"] in { + def int_nvvm_f # operation # _d : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty] - , [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_ftz_f : GCCBuiltin<"__nvvm_fmax_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach variant = ["_f", "_ftz_f", "_nan_f", "_ftz_nan_f", + "_xorsign_abs_f", "_ftz_xorsign_abs_f", "_nan_xorsign_abs_f", + "_ftz_nan_xorsign_abs_f"] in { + def int_nvvm_f # operation # variant : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } - def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach variant = ["_f16", "_ftz_f16", "_nan_f16", "_ftz_nan_f16", + "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16", + "_ftz_nan_xorsign_abs_f16"] in { + def int_nvvm_f # operation # variant : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + 
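// Illustrative aside, not part of the vendored change: ClangBuiltin exposes
// each generated intrinsic to Clang under the quoted builtin name, so the
// records the foreach above produces are directly callable from CUDA device
// code. A minimal sketch, assuming only the (float, float) -> float
// signatures declared above; clamp01_ftz is a hypothetical helper name:
//
//   __device__ float clamp01_ftz(float x) {
//     // Lowers to llvm.nvvm.fmax.ftz.f / llvm.nvvm.fmin.ftz.f.
//     return __nvvm_fmax_ftz_f(0.0f, __nvvm_fmin_ftz_f(x, 1.0f));
//   }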
foreach variant = ["_f16x2", "_ftz_f16x2", "_nan_f16x2", + "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2", + "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in { + def int_nvvm_f # operation # variant : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach variant = ["_bf16", "_nan_bf16", "_xorsign_abs_bf16", + "_nan_xorsign_abs_bf16"] in { + def int_nvvm_f # operation # variant : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach variant = ["_bf16x2", "_nan_bf16x2", "_xorsign_abs_bf16x2", + "_nan_xorsign_abs_bf16x2"] in { + def int_nvvm_f # operation # variant : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + } // // Multiplication // - def int_nvvm_mulhi_i : GCCBuiltin<"__nvvm_mulhi_i">, + def int_nvvm_mulhi_i : ClangBuiltin<"__nvvm_mulhi_i">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mulhi_ui : GCCBuiltin<"__nvvm_mulhi_ui">, + def int_nvvm_mulhi_ui : ClangBuiltin<"__nvvm_mulhi_ui">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mulhi_ll : GCCBuiltin<"__nvvm_mulhi_ll">, + def int_nvvm_mulhi_ll : ClangBuiltin<"__nvvm_mulhi_ll">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mulhi_ull : GCCBuiltin<"__nvvm_mulhi_ull">, + def int_nvvm_mulhi_ull : ClangBuiltin<"__nvvm_mulhi_ull">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rn_ftz_f : GCCBuiltin<"__nvvm_mul_rn_ftz_f">, + def int_nvvm_mul_rn_ftz_f : ClangBuiltin<"__nvvm_mul_rn_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rn_f : GCCBuiltin<"__nvvm_mul_rn_f">, + def int_nvvm_mul_rn_f : ClangBuiltin<"__nvvm_mul_rn_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rz_ftz_f : GCCBuiltin<"__nvvm_mul_rz_ftz_f">, + def int_nvvm_mul_rz_ftz_f : ClangBuiltin<"__nvvm_mul_rz_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rz_f : GCCBuiltin<"__nvvm_mul_rz_f">, + def int_nvvm_mul_rz_f : ClangBuiltin<"__nvvm_mul_rz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rm_ftz_f : GCCBuiltin<"__nvvm_mul_rm_ftz_f">, + def int_nvvm_mul_rm_ftz_f : ClangBuiltin<"__nvvm_mul_rm_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rm_f : GCCBuiltin<"__nvvm_mul_rm_f">, + def int_nvvm_mul_rm_f : ClangBuiltin<"__nvvm_mul_rm_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rp_ftz_f : GCCBuiltin<"__nvvm_mul_rp_ftz_f">, + def int_nvvm_mul_rp_ftz_f : ClangBuiltin<"__nvvm_mul_rp_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def 
int_nvvm_mul_rp_f : GCCBuiltin<"__nvvm_mul_rp_f">, + def int_nvvm_mul_rp_f : ClangBuiltin<"__nvvm_mul_rp_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rn_d : GCCBuiltin<"__nvvm_mul_rn_d">, + def int_nvvm_mul_rn_d : ClangBuiltin<"__nvvm_mul_rn_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rz_d : GCCBuiltin<"__nvvm_mul_rz_d">, + def int_nvvm_mul_rz_d : ClangBuiltin<"__nvvm_mul_rz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rm_d : GCCBuiltin<"__nvvm_mul_rm_d">, + def int_nvvm_mul_rm_d : ClangBuiltin<"__nvvm_mul_rm_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul_rp_d : GCCBuiltin<"__nvvm_mul_rp_d">, + def int_nvvm_mul_rp_d : ClangBuiltin<"__nvvm_mul_rp_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul24_i : GCCBuiltin<"__nvvm_mul24_i">, + def int_nvvm_mul24_i : ClangBuiltin<"__nvvm_mul24_i">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_mul24_ui : GCCBuiltin<"__nvvm_mul24_ui">, + def int_nvvm_mul24_ui : ClangBuiltin<"__nvvm_mul24_ui">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; @@ -652,51 +681,51 @@ let TargetPrefix = "nvvm" in { // Div // - def int_nvvm_div_approx_ftz_f : GCCBuiltin<"__nvvm_div_approx_ftz_f">, + def int_nvvm_div_approx_ftz_f : ClangBuiltin<"__nvvm_div_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_approx_f : GCCBuiltin<"__nvvm_div_approx_f">, + def int_nvvm_div_approx_f : ClangBuiltin<"__nvvm_div_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rn_ftz_f : GCCBuiltin<"__nvvm_div_rn_ftz_f">, + def int_nvvm_div_rn_ftz_f : ClangBuiltin<"__nvvm_div_rn_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rn_f : GCCBuiltin<"__nvvm_div_rn_f">, + def int_nvvm_div_rn_f : ClangBuiltin<"__nvvm_div_rn_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rz_ftz_f : GCCBuiltin<"__nvvm_div_rz_ftz_f">, + def int_nvvm_div_rz_ftz_f : ClangBuiltin<"__nvvm_div_rz_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rz_f : GCCBuiltin<"__nvvm_div_rz_f">, + def int_nvvm_div_rz_f : ClangBuiltin<"__nvvm_div_rz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rm_ftz_f : GCCBuiltin<"__nvvm_div_rm_ftz_f">, + def int_nvvm_div_rm_ftz_f : ClangBuiltin<"__nvvm_div_rm_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rm_f : GCCBuiltin<"__nvvm_div_rm_f">, + def int_nvvm_div_rm_f : ClangBuiltin<"__nvvm_div_rm_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rp_ftz_f : GCCBuiltin<"__nvvm_div_rp_ftz_f">, + def int_nvvm_div_rp_ftz_f : ClangBuiltin<"__nvvm_div_rp_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], 
[llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rp_f : GCCBuiltin<"__nvvm_div_rp_f">, + def int_nvvm_div_rp_f : ClangBuiltin<"__nvvm_div_rp_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_div_rn_d : GCCBuiltin<"__nvvm_div_rn_d">, + def int_nvvm_div_rn_d : ClangBuiltin<"__nvvm_div_rn_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_nvvm_div_rz_d : GCCBuiltin<"__nvvm_div_rz_d">, + def int_nvvm_div_rz_d : ClangBuiltin<"__nvvm_div_rz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_nvvm_div_rm_d : GCCBuiltin<"__nvvm_div_rm_d">, + def int_nvvm_div_rm_d : ClangBuiltin<"__nvvm_div_rm_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_nvvm_div_rp_d : GCCBuiltin<"__nvvm_div_rp_d">, + def int_nvvm_div_rp_d : ClangBuiltin<"__nvvm_div_rp_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; @@ -704,10 +733,10 @@ let TargetPrefix = "nvvm" in { // Sad // - def int_nvvm_sad_i : GCCBuiltin<"__nvvm_sad_i">, + def int_nvvm_sad_i : ClangBuiltin<"__nvvm_sad_i">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; - def int_nvvm_sad_ui : GCCBuiltin<"__nvvm_sad_ui">, + def int_nvvm_sad_ui : ClangBuiltin<"__nvvm_sad_ui">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, Commutative]>; @@ -715,264 +744,286 @@ let TargetPrefix = "nvvm" in { // Floor Ceil // - def int_nvvm_floor_ftz_f : GCCBuiltin<"__nvvm_floor_ftz_f">, + def int_nvvm_floor_ftz_f : ClangBuiltin<"__nvvm_floor_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_floor_f : GCCBuiltin<"__nvvm_floor_f">, + def int_nvvm_floor_f : ClangBuiltin<"__nvvm_floor_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_floor_d : GCCBuiltin<"__nvvm_floor_d">, + def int_nvvm_floor_d : ClangBuiltin<"__nvvm_floor_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ceil_ftz_f : GCCBuiltin<"__nvvm_ceil_ftz_f">, + def int_nvvm_ceil_ftz_f : ClangBuiltin<"__nvvm_ceil_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ceil_f : GCCBuiltin<"__nvvm_ceil_f">, + def int_nvvm_ceil_f : ClangBuiltin<"__nvvm_ceil_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ceil_d : GCCBuiltin<"__nvvm_ceil_d">, + def int_nvvm_ceil_d : ClangBuiltin<"__nvvm_ceil_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; // // Abs // - def int_nvvm_fabs_ftz_f : GCCBuiltin<"__nvvm_fabs_ftz_f">, + def int_nvvm_fabs_ftz_f : ClangBuiltin<"__nvvm_fabs_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fabs_f : GCCBuiltin<"__nvvm_fabs_f">, + def int_nvvm_fabs_f : ClangBuiltin<"__nvvm_fabs_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">, + def int_nvvm_fabs_d : ClangBuiltin<"__nvvm_fabs_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; +// +// Abs, Neg bf16, bf16x2 +// + + foreach unary = ["abs", "neg"] in { + def int_nvvm_ # 
unary # _bf16 : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_nvvm_ # unary # _bf16x2 : + ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + } + // // Round // - def int_nvvm_round_ftz_f : GCCBuiltin<"__nvvm_round_ftz_f">, + def int_nvvm_round_ftz_f : ClangBuiltin<"__nvvm_round_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_round_f : GCCBuiltin<"__nvvm_round_f">, + def int_nvvm_round_f : ClangBuiltin<"__nvvm_round_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_round_d : GCCBuiltin<"__nvvm_round_d">, + def int_nvvm_round_d : ClangBuiltin<"__nvvm_round_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; // // Trunc // - def int_nvvm_trunc_ftz_f : GCCBuiltin<"__nvvm_trunc_ftz_f">, + def int_nvvm_trunc_ftz_f : ClangBuiltin<"__nvvm_trunc_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_trunc_f : GCCBuiltin<"__nvvm_trunc_f">, + def int_nvvm_trunc_f : ClangBuiltin<"__nvvm_trunc_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_trunc_d : GCCBuiltin<"__nvvm_trunc_d">, + def int_nvvm_trunc_d : ClangBuiltin<"__nvvm_trunc_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; // // Saturate // - def int_nvvm_saturate_ftz_f : GCCBuiltin<"__nvvm_saturate_ftz_f">, + def int_nvvm_saturate_ftz_f : ClangBuiltin<"__nvvm_saturate_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_saturate_f : GCCBuiltin<"__nvvm_saturate_f">, + def int_nvvm_saturate_f : ClangBuiltin<"__nvvm_saturate_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_saturate_d : GCCBuiltin<"__nvvm_saturate_d">, + def int_nvvm_saturate_d : ClangBuiltin<"__nvvm_saturate_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; // // Exp2 Log2 // - def int_nvvm_ex2_approx_ftz_f : GCCBuiltin<"__nvvm_ex2_approx_ftz_f">, + def int_nvvm_ex2_approx_ftz_f : ClangBuiltin<"__nvvm_ex2_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ex2_approx_f : GCCBuiltin<"__nvvm_ex2_approx_f">, + def int_nvvm_ex2_approx_f : ClangBuiltin<"__nvvm_ex2_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ex2_approx_d : GCCBuiltin<"__nvvm_ex2_approx_d">, + def int_nvvm_ex2_approx_d : ClangBuiltin<"__nvvm_ex2_approx_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + def int_nvvm_ex2_approx_f16 : ClangBuiltin<"__nvvm_ex2_approx_f16">, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; + def int_nvvm_ex2_approx_f16x2 : ClangBuiltin<"__nvvm_ex2_approx_f16x2">, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; - def int_nvvm_lg2_approx_ftz_f : GCCBuiltin<"__nvvm_lg2_approx_ftz_f">, + def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_lg2_approx_f : GCCBuiltin<"__nvvm_lg2_approx_f">, + def int_nvvm_lg2_approx_f : ClangBuiltin<"__nvvm_lg2_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_lg2_approx_d : 
GCCBuiltin<"__nvvm_lg2_approx_d">, + def int_nvvm_lg2_approx_d : ClangBuiltin<"__nvvm_lg2_approx_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; // // Sin Cos // - def int_nvvm_sin_approx_ftz_f : GCCBuiltin<"__nvvm_sin_approx_ftz_f">, + def int_nvvm_sin_approx_ftz_f : ClangBuiltin<"__nvvm_sin_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sin_approx_f : GCCBuiltin<"__nvvm_sin_approx_f">, + def int_nvvm_sin_approx_f : ClangBuiltin<"__nvvm_sin_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_cos_approx_ftz_f : GCCBuiltin<"__nvvm_cos_approx_ftz_f">, + def int_nvvm_cos_approx_ftz_f : ClangBuiltin<"__nvvm_cos_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">, + def int_nvvm_cos_approx_f : ClangBuiltin<"__nvvm_cos_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; // // Fma // - def int_nvvm_fma_rn_ftz_f : GCCBuiltin<"__nvvm_fma_rn_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rn_f : GCCBuiltin<"__nvvm_fma_rn_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rz_ftz_f : GCCBuiltin<"__nvvm_fma_rz_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rz_f : GCCBuiltin<"__nvvm_fma_rz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rm_ftz_f : GCCBuiltin<"__nvvm_fma_rm_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rm_f : GCCBuiltin<"__nvvm_fma_rm_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rp_ftz_f : GCCBuiltin<"__nvvm_fma_rp_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rp_f : GCCBuiltin<"__nvvm_fma_rp_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], + foreach variant = ["_rn_f16", "_rn_ftz_f16", "_rn_sat_f16", + "_rn_ftz_sat_f16", "_rn_relu_f16", "_rn_ftz_relu_f16"] in { + def int_nvvm_fma # variant : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_half_ty], + [llvm_half_ty, llvm_half_ty, llvm_half_ty], + [IntrNoMem, IntrSpeculatable]>; + } + + foreach variant = ["_rn_f16x2", "_rn_ftz_f16x2", "_rn_sat_f16x2", + "_rn_ftz_sat_f16x2", "_rn_relu_f16x2", "_rn_ftz_relu_f16x2"] in { + def int_nvvm_fma # variant : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], + [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty], [IntrNoMem, IntrSpeculatable]>; + } - def int_nvvm_fma_rn_d : GCCBuiltin<"__nvvm_fma_rn_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty], + foreach variant = ["_rn_bf16", "_rn_relu_bf16"] in { + def int_nvvm_fma # variant : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rz_d : GCCBuiltin<"__nvvm_fma_rz_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], - 
[llvm_double_ty, llvm_double_ty, llvm_double_ty], + } + + foreach variant = ["_rn_bf16x2", "_rn_relu_bf16x2"] in { + def int_nvvm_fma # variant : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rm_d : GCCBuiltin<"__nvvm_fma_rm_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty], + } + + foreach variant = ["_rn_ftz_f", "_rn_f", "_rz_ftz_f", "_rz_f", "_rm_ftz_f", + "_rm_f", "_rp_ftz_f", "_rp_f"] in { + def int_nvvm_fma # variant : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fma_rp_d : GCCBuiltin<"__nvvm_fma_rp_d">, + } + + foreach variant = ["_rn_d", "_rz_d", "_rm_d", "_rp_d"] in { + def int_nvvm_fma # variant : ClangBuiltin, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + } // // Rcp // - def int_nvvm_rcp_rn_ftz_f : GCCBuiltin<"__nvvm_rcp_rn_ftz_f">, + def int_nvvm_rcp_rn_ftz_f : ClangBuiltin<"__nvvm_rcp_rn_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rn_f : GCCBuiltin<"__nvvm_rcp_rn_f">, + def int_nvvm_rcp_rn_f : ClangBuiltin<"__nvvm_rcp_rn_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rz_ftz_f : GCCBuiltin<"__nvvm_rcp_rz_ftz_f">, + def int_nvvm_rcp_rz_ftz_f : ClangBuiltin<"__nvvm_rcp_rz_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rz_f : GCCBuiltin<"__nvvm_rcp_rz_f">, + def int_nvvm_rcp_rz_f : ClangBuiltin<"__nvvm_rcp_rz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rm_ftz_f : GCCBuiltin<"__nvvm_rcp_rm_ftz_f">, + def int_nvvm_rcp_rm_ftz_f : ClangBuiltin<"__nvvm_rcp_rm_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rm_f : GCCBuiltin<"__nvvm_rcp_rm_f">, + def int_nvvm_rcp_rm_f : ClangBuiltin<"__nvvm_rcp_rm_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rp_ftz_f : GCCBuiltin<"__nvvm_rcp_rp_ftz_f">, + def int_nvvm_rcp_rp_ftz_f : ClangBuiltin<"__nvvm_rcp_rp_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rp_f : GCCBuiltin<"__nvvm_rcp_rp_f">, + def int_nvvm_rcp_rp_f : ClangBuiltin<"__nvvm_rcp_rp_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_rn_d : GCCBuiltin<"__nvvm_rcp_rn_d">, + def int_nvvm_rcp_rn_d : ClangBuiltin<"__nvvm_rcp_rn_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_rcp_rz_d : GCCBuiltin<"__nvvm_rcp_rz_d">, + def int_nvvm_rcp_rz_d : ClangBuiltin<"__nvvm_rcp_rz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_rcp_rm_d : GCCBuiltin<"__nvvm_rcp_rm_d">, + def int_nvvm_rcp_rm_d : ClangBuiltin<"__nvvm_rcp_rm_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_rcp_rp_d : GCCBuiltin<"__nvvm_rcp_rp_d">, + def int_nvvm_rcp_rp_d : ClangBuiltin<"__nvvm_rcp_rp_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_rcp_approx_ftz_d : GCCBuiltin<"__nvvm_rcp_approx_ftz_d">, + def int_nvvm_rcp_approx_ftz_f : ClangBuiltin<"__nvvm_rcp_approx_ftz_f">, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def 
int_nvvm_rcp_approx_ftz_d : ClangBuiltin<"__nvvm_rcp_approx_ftz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; // // Sqrt // - def int_nvvm_sqrt_f : GCCBuiltin<"__nvvm_sqrt_f">, + def int_nvvm_sqrt_f : ClangBuiltin<"__nvvm_sqrt_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">, + def int_nvvm_sqrt_rn_ftz_f : ClangBuiltin<"__nvvm_sqrt_rn_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">, + def int_nvvm_sqrt_rn_f : ClangBuiltin<"__nvvm_sqrt_rn_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rz_ftz_f : GCCBuiltin<"__nvvm_sqrt_rz_ftz_f">, + def int_nvvm_sqrt_rz_ftz_f : ClangBuiltin<"__nvvm_sqrt_rz_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rz_f : GCCBuiltin<"__nvvm_sqrt_rz_f">, + def int_nvvm_sqrt_rz_f : ClangBuiltin<"__nvvm_sqrt_rz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rm_ftz_f : GCCBuiltin<"__nvvm_sqrt_rm_ftz_f">, + def int_nvvm_sqrt_rm_ftz_f : ClangBuiltin<"__nvvm_sqrt_rm_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rm_f : GCCBuiltin<"__nvvm_sqrt_rm_f">, + def int_nvvm_sqrt_rm_f : ClangBuiltin<"__nvvm_sqrt_rm_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rp_ftz_f : GCCBuiltin<"__nvvm_sqrt_rp_ftz_f">, + def int_nvvm_sqrt_rp_ftz_f : ClangBuiltin<"__nvvm_sqrt_rp_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rp_f : GCCBuiltin<"__nvvm_sqrt_rp_f">, + def int_nvvm_sqrt_rp_f : ClangBuiltin<"__nvvm_sqrt_rp_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_approx_ftz_f : GCCBuiltin<"__nvvm_sqrt_approx_ftz_f">, + def int_nvvm_sqrt_approx_ftz_f : ClangBuiltin<"__nvvm_sqrt_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_approx_f : GCCBuiltin<"__nvvm_sqrt_approx_f">, + def int_nvvm_sqrt_approx_f : ClangBuiltin<"__nvvm_sqrt_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rn_d : GCCBuiltin<"__nvvm_sqrt_rn_d">, + def int_nvvm_sqrt_rn_d : ClangBuiltin<"__nvvm_sqrt_rn_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rz_d : GCCBuiltin<"__nvvm_sqrt_rz_d">, + def int_nvvm_sqrt_rz_d : ClangBuiltin<"__nvvm_sqrt_rz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rm_d : GCCBuiltin<"__nvvm_sqrt_rm_d">, + def int_nvvm_sqrt_rm_d : ClangBuiltin<"__nvvm_sqrt_rm_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_sqrt_rp_d : GCCBuiltin<"__nvvm_sqrt_rp_d">, + def int_nvvm_sqrt_rp_d : ClangBuiltin<"__nvvm_sqrt_rp_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; // // Rsqrt // - def int_nvvm_rsqrt_approx_ftz_f : GCCBuiltin<"__nvvm_rsqrt_approx_ftz_f">, + def int_nvvm_rsqrt_approx_ftz_f : ClangBuiltin<"__nvvm_rsqrt_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rsqrt_approx_f : GCCBuiltin<"__nvvm_rsqrt_approx_f">, + def int_nvvm_rsqrt_approx_f : ClangBuiltin<"__nvvm_rsqrt_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], 
[IntrNoMem]>; - def int_nvvm_rsqrt_approx_d : GCCBuiltin<"__nvvm_rsqrt_approx_d">, + def int_nvvm_rsqrt_approx_d : ClangBuiltin<"__nvvm_rsqrt_approx_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; // // Add // - def int_nvvm_add_rn_ftz_f : GCCBuiltin<"__nvvm_add_rn_ftz_f">, + def int_nvvm_add_rn_ftz_f : ClangBuiltin<"__nvvm_add_rn_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rn_f : GCCBuiltin<"__nvvm_add_rn_f">, + def int_nvvm_add_rn_f : ClangBuiltin<"__nvvm_add_rn_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rz_ftz_f : GCCBuiltin<"__nvvm_add_rz_ftz_f">, + def int_nvvm_add_rz_ftz_f : ClangBuiltin<"__nvvm_add_rz_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rz_f : GCCBuiltin<"__nvvm_add_rz_f">, + def int_nvvm_add_rz_f : ClangBuiltin<"__nvvm_add_rz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rm_ftz_f : GCCBuiltin<"__nvvm_add_rm_ftz_f">, + def int_nvvm_add_rm_ftz_f : ClangBuiltin<"__nvvm_add_rm_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rm_f : GCCBuiltin<"__nvvm_add_rm_f">, + def int_nvvm_add_rm_f : ClangBuiltin<"__nvvm_add_rm_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rp_ftz_f : GCCBuiltin<"__nvvm_add_rp_ftz_f">, + def int_nvvm_add_rp_ftz_f : ClangBuiltin<"__nvvm_add_rp_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rp_f : GCCBuiltin<"__nvvm_add_rp_f">, + def int_nvvm_add_rp_f : ClangBuiltin<"__nvvm_add_rp_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rn_d : GCCBuiltin<"__nvvm_add_rn_d">, + def int_nvvm_add_rn_d : ClangBuiltin<"__nvvm_add_rn_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rz_d : GCCBuiltin<"__nvvm_add_rz_d">, + def int_nvvm_add_rz_d : ClangBuiltin<"__nvvm_add_rz_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rm_d : GCCBuiltin<"__nvvm_add_rm_d">, + def int_nvvm_add_rm_d : ClangBuiltin<"__nvvm_add_rm_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_add_rp_d : GCCBuiltin<"__nvvm_add_rp_d">, + def int_nvvm_add_rp_d : ClangBuiltin<"__nvvm_add_rp_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; @@ -980,278 +1031,278 @@ let TargetPrefix = "nvvm" in { // Convert // - def int_nvvm_d2f_rn_ftz : GCCBuiltin<"__nvvm_d2f_rn_ftz">, + def int_nvvm_d2f_rn_ftz : ClangBuiltin<"__nvvm_d2f_rn_ftz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rn : GCCBuiltin<"__nvvm_d2f_rn">, + def int_nvvm_d2f_rn : ClangBuiltin<"__nvvm_d2f_rn">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], 
[IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rz_ftz : GCCBuiltin<"__nvvm_d2f_rz_ftz">, + def int_nvvm_d2f_rz_ftz : ClangBuiltin<"__nvvm_d2f_rz_ftz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rz : GCCBuiltin<"__nvvm_d2f_rz">, + def int_nvvm_d2f_rz : ClangBuiltin<"__nvvm_d2f_rz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rm_ftz : GCCBuiltin<"__nvvm_d2f_rm_ftz">, + def int_nvvm_d2f_rm_ftz : ClangBuiltin<"__nvvm_d2f_rm_ftz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rm : GCCBuiltin<"__nvvm_d2f_rm">, + def int_nvvm_d2f_rm : ClangBuiltin<"__nvvm_d2f_rm">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rp_ftz : GCCBuiltin<"__nvvm_d2f_rp_ftz">, + def int_nvvm_d2f_rp_ftz : ClangBuiltin<"__nvvm_d2f_rp_ftz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2f_rp : GCCBuiltin<"__nvvm_d2f_rp">, + def int_nvvm_d2f_rp : ClangBuiltin<"__nvvm_d2f_rp">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_rn : GCCBuiltin<"__nvvm_d2i_rn">, + def int_nvvm_d2i_rn : ClangBuiltin<"__nvvm_d2i_rn">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_rz : GCCBuiltin<"__nvvm_d2i_rz">, + def int_nvvm_d2i_rz : ClangBuiltin<"__nvvm_d2i_rz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_rm : GCCBuiltin<"__nvvm_d2i_rm">, + def int_nvvm_d2i_rm : ClangBuiltin<"__nvvm_d2i_rm">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_rp : GCCBuiltin<"__nvvm_d2i_rp">, + def int_nvvm_d2i_rp : ClangBuiltin<"__nvvm_d2i_rp">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ui_rn : GCCBuiltin<"__nvvm_d2ui_rn">, + def int_nvvm_d2ui_rn : ClangBuiltin<"__nvvm_d2ui_rn">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ui_rz : GCCBuiltin<"__nvvm_d2ui_rz">, + def int_nvvm_d2ui_rz : ClangBuiltin<"__nvvm_d2ui_rz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ui_rm : GCCBuiltin<"__nvvm_d2ui_rm">, + def int_nvvm_d2ui_rm : ClangBuiltin<"__nvvm_d2ui_rm">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ui_rp : GCCBuiltin<"__nvvm_d2ui_rp">, + def int_nvvm_d2ui_rp : ClangBuiltin<"__nvvm_d2ui_rp">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2d_rn : GCCBuiltin<"__nvvm_i2d_rn">, + def int_nvvm_i2d_rn : ClangBuiltin<"__nvvm_i2d_rn">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2d_rz : GCCBuiltin<"__nvvm_i2d_rz">, + def int_nvvm_i2d_rz : ClangBuiltin<"__nvvm_i2d_rz">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2d_rm : GCCBuiltin<"__nvvm_i2d_rm">, + def int_nvvm_i2d_rm : ClangBuiltin<"__nvvm_i2d_rm">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2d_rp : GCCBuiltin<"__nvvm_i2d_rp">, + def int_nvvm_i2d_rp : ClangBuiltin<"__nvvm_i2d_rp">, 
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2d_rn : GCCBuiltin<"__nvvm_ui2d_rn">, + def int_nvvm_ui2d_rn : ClangBuiltin<"__nvvm_ui2d_rn">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2d_rz : GCCBuiltin<"__nvvm_ui2d_rz">, + def int_nvvm_ui2d_rz : ClangBuiltin<"__nvvm_ui2d_rz">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2d_rm : GCCBuiltin<"__nvvm_ui2d_rm">, + def int_nvvm_ui2d_rm : ClangBuiltin<"__nvvm_ui2d_rm">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2d_rp : GCCBuiltin<"__nvvm_ui2d_rp">, + def int_nvvm_ui2d_rp : ClangBuiltin<"__nvvm_ui2d_rp">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rn_ftz : GCCBuiltin<"__nvvm_f2i_rn_ftz">, + def int_nvvm_f2i_rn_ftz : ClangBuiltin<"__nvvm_f2i_rn_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rn : GCCBuiltin<"__nvvm_f2i_rn">, + def int_nvvm_f2i_rn : ClangBuiltin<"__nvvm_f2i_rn">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rz_ftz : GCCBuiltin<"__nvvm_f2i_rz_ftz">, + def int_nvvm_f2i_rz_ftz : ClangBuiltin<"__nvvm_f2i_rz_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rz : GCCBuiltin<"__nvvm_f2i_rz">, + def int_nvvm_f2i_rz : ClangBuiltin<"__nvvm_f2i_rz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rm_ftz : GCCBuiltin<"__nvvm_f2i_rm_ftz">, + def int_nvvm_f2i_rm_ftz : ClangBuiltin<"__nvvm_f2i_rm_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rm : GCCBuiltin<"__nvvm_f2i_rm">, + def int_nvvm_f2i_rm : ClangBuiltin<"__nvvm_f2i_rm">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rp_ftz : GCCBuiltin<"__nvvm_f2i_rp_ftz">, + def int_nvvm_f2i_rp_ftz : ClangBuiltin<"__nvvm_f2i_rp_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2i_rp : GCCBuiltin<"__nvvm_f2i_rp">, + def int_nvvm_f2i_rp : ClangBuiltin<"__nvvm_f2i_rp">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rn_ftz : GCCBuiltin<"__nvvm_f2ui_rn_ftz">, + def int_nvvm_f2ui_rn_ftz : ClangBuiltin<"__nvvm_f2ui_rn_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rn : GCCBuiltin<"__nvvm_f2ui_rn">, + def int_nvvm_f2ui_rn : ClangBuiltin<"__nvvm_f2ui_rn">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rz_ftz : GCCBuiltin<"__nvvm_f2ui_rz_ftz">, + def int_nvvm_f2ui_rz_ftz : ClangBuiltin<"__nvvm_f2ui_rz_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rz : GCCBuiltin<"__nvvm_f2ui_rz">, + def int_nvvm_f2ui_rz : ClangBuiltin<"__nvvm_f2ui_rz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rm_ftz : GCCBuiltin<"__nvvm_f2ui_rm_ftz">, + def int_nvvm_f2ui_rm_ftz : ClangBuiltin<"__nvvm_f2ui_rm_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def 
int_nvvm_f2ui_rm : GCCBuiltin<"__nvvm_f2ui_rm">, + def int_nvvm_f2ui_rm : ClangBuiltin<"__nvvm_f2ui_rm">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rp_ftz : GCCBuiltin<"__nvvm_f2ui_rp_ftz">, + def int_nvvm_f2ui_rp_ftz : ClangBuiltin<"__nvvm_f2ui_rp_ftz">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ui_rp : GCCBuiltin<"__nvvm_f2ui_rp">, + def int_nvvm_f2ui_rp : ClangBuiltin<"__nvvm_f2ui_rp">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2f_rn : GCCBuiltin<"__nvvm_i2f_rn">, + def int_nvvm_i2f_rn : ClangBuiltin<"__nvvm_i2f_rn">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2f_rz : GCCBuiltin<"__nvvm_i2f_rz">, + def int_nvvm_i2f_rz : ClangBuiltin<"__nvvm_i2f_rz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2f_rm : GCCBuiltin<"__nvvm_i2f_rm">, + def int_nvvm_i2f_rm : ClangBuiltin<"__nvvm_i2f_rm">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_i2f_rp : GCCBuiltin<"__nvvm_i2f_rp">, + def int_nvvm_i2f_rp : ClangBuiltin<"__nvvm_i2f_rp">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2f_rn : GCCBuiltin<"__nvvm_ui2f_rn">, + def int_nvvm_ui2f_rn : ClangBuiltin<"__nvvm_ui2f_rn">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2f_rz : GCCBuiltin<"__nvvm_ui2f_rz">, + def int_nvvm_ui2f_rz : ClangBuiltin<"__nvvm_ui2f_rz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2f_rm : GCCBuiltin<"__nvvm_ui2f_rm">, + def int_nvvm_ui2f_rm : ClangBuiltin<"__nvvm_ui2f_rm">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ui2f_rp : GCCBuiltin<"__nvvm_ui2f_rp">, + def int_nvvm_ui2f_rp : ClangBuiltin<"__nvvm_ui2f_rp">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_lohi_i2d : GCCBuiltin<"__nvvm_lohi_i2d">, + def int_nvvm_lohi_i2d : ClangBuiltin<"__nvvm_lohi_i2d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_d2i_lo : GCCBuiltin<"__nvvm_d2i_lo">, + def int_nvvm_d2i_lo : ClangBuiltin<"__nvvm_d2i_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_hi : GCCBuiltin<"__nvvm_d2i_hi">, + def int_nvvm_d2i_hi : ClangBuiltin<"__nvvm_d2i_hi">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rn_ftz : GCCBuiltin<"__nvvm_f2ll_rn_ftz">, + def int_nvvm_f2ll_rn_ftz : ClangBuiltin<"__nvvm_f2ll_rn_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rn : GCCBuiltin<"__nvvm_f2ll_rn">, + def int_nvvm_f2ll_rn : ClangBuiltin<"__nvvm_f2ll_rn">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rz_ftz : GCCBuiltin<"__nvvm_f2ll_rz_ftz">, + def int_nvvm_f2ll_rz_ftz : ClangBuiltin<"__nvvm_f2ll_rz_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rz : GCCBuiltin<"__nvvm_f2ll_rz">, + def int_nvvm_f2ll_rz : ClangBuiltin<"__nvvm_f2ll_rz">, 
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rm_ftz : GCCBuiltin<"__nvvm_f2ll_rm_ftz">, + def int_nvvm_f2ll_rm_ftz : ClangBuiltin<"__nvvm_f2ll_rm_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rm : GCCBuiltin<"__nvvm_f2ll_rm">, + def int_nvvm_f2ll_rm : ClangBuiltin<"__nvvm_f2ll_rm">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rp_ftz : GCCBuiltin<"__nvvm_f2ll_rp_ftz">, + def int_nvvm_f2ll_rp_ftz : ClangBuiltin<"__nvvm_f2ll_rp_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ll_rp : GCCBuiltin<"__nvvm_f2ll_rp">, + def int_nvvm_f2ll_rp : ClangBuiltin<"__nvvm_f2ll_rp">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rn_ftz : GCCBuiltin<"__nvvm_f2ull_rn_ftz">, + def int_nvvm_f2ull_rn_ftz : ClangBuiltin<"__nvvm_f2ull_rn_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rn : GCCBuiltin<"__nvvm_f2ull_rn">, + def int_nvvm_f2ull_rn : ClangBuiltin<"__nvvm_f2ull_rn">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rz_ftz : GCCBuiltin<"__nvvm_f2ull_rz_ftz">, + def int_nvvm_f2ull_rz_ftz : ClangBuiltin<"__nvvm_f2ull_rz_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rz : GCCBuiltin<"__nvvm_f2ull_rz">, + def int_nvvm_f2ull_rz : ClangBuiltin<"__nvvm_f2ull_rz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rm_ftz : GCCBuiltin<"__nvvm_f2ull_rm_ftz">, + def int_nvvm_f2ull_rm_ftz : ClangBuiltin<"__nvvm_f2ull_rm_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rm : GCCBuiltin<"__nvvm_f2ull_rm">, + def int_nvvm_f2ull_rm : ClangBuiltin<"__nvvm_f2ull_rm">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rp_ftz : GCCBuiltin<"__nvvm_f2ull_rp_ftz">, + def int_nvvm_f2ull_rp_ftz : ClangBuiltin<"__nvvm_f2ull_rp_ftz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2ull_rp : GCCBuiltin<"__nvvm_f2ull_rp">, + def int_nvvm_f2ull_rp : ClangBuiltin<"__nvvm_f2ull_rp">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ll_rn : GCCBuiltin<"__nvvm_d2ll_rn">, + def int_nvvm_d2ll_rn : ClangBuiltin<"__nvvm_d2ll_rn">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ll_rz : GCCBuiltin<"__nvvm_d2ll_rz">, + def int_nvvm_d2ll_rz : ClangBuiltin<"__nvvm_d2ll_rz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ll_rm : GCCBuiltin<"__nvvm_d2ll_rm">, + def int_nvvm_d2ll_rm : ClangBuiltin<"__nvvm_d2ll_rm">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ll_rp : GCCBuiltin<"__nvvm_d2ll_rp">, + def int_nvvm_d2ll_rp : ClangBuiltin<"__nvvm_d2ll_rp">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ull_rn : GCCBuiltin<"__nvvm_d2ull_rn">, + def int_nvvm_d2ull_rn : ClangBuiltin<"__nvvm_d2ull_rn">, DefaultAttrsIntrinsic<[llvm_i64_ty], 
[llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ull_rz : GCCBuiltin<"__nvvm_d2ull_rz">, + def int_nvvm_d2ull_rz : ClangBuiltin<"__nvvm_d2ull_rz">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ull_rm : GCCBuiltin<"__nvvm_d2ull_rm">, + def int_nvvm_d2ull_rm : ClangBuiltin<"__nvvm_d2ull_rm">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2ull_rp : GCCBuiltin<"__nvvm_d2ull_rp">, + def int_nvvm_d2ull_rp : ClangBuiltin<"__nvvm_d2ull_rp">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2f_rn : GCCBuiltin<"__nvvm_ll2f_rn">, + def int_nvvm_ll2f_rn : ClangBuiltin<"__nvvm_ll2f_rn">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2f_rz : GCCBuiltin<"__nvvm_ll2f_rz">, + def int_nvvm_ll2f_rz : ClangBuiltin<"__nvvm_ll2f_rz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2f_rm : GCCBuiltin<"__nvvm_ll2f_rm">, + def int_nvvm_ll2f_rm : ClangBuiltin<"__nvvm_ll2f_rm">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2f_rp : GCCBuiltin<"__nvvm_ll2f_rp">, + def int_nvvm_ll2f_rp : ClangBuiltin<"__nvvm_ll2f_rp">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2f_rn : GCCBuiltin<"__nvvm_ull2f_rn">, + def int_nvvm_ull2f_rn : ClangBuiltin<"__nvvm_ull2f_rn">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2f_rz : GCCBuiltin<"__nvvm_ull2f_rz">, + def int_nvvm_ull2f_rz : ClangBuiltin<"__nvvm_ull2f_rz">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2f_rm : GCCBuiltin<"__nvvm_ull2f_rm">, + def int_nvvm_ull2f_rm : ClangBuiltin<"__nvvm_ull2f_rm">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2f_rp : GCCBuiltin<"__nvvm_ull2f_rp">, + def int_nvvm_ull2f_rp : ClangBuiltin<"__nvvm_ull2f_rp">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2d_rn : GCCBuiltin<"__nvvm_ll2d_rn">, + def int_nvvm_ll2d_rn : ClangBuiltin<"__nvvm_ll2d_rn">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2d_rz : GCCBuiltin<"__nvvm_ll2d_rz">, + def int_nvvm_ll2d_rz : ClangBuiltin<"__nvvm_ll2d_rz">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2d_rm : GCCBuiltin<"__nvvm_ll2d_rm">, + def int_nvvm_ll2d_rm : ClangBuiltin<"__nvvm_ll2d_rm">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ll2d_rp : GCCBuiltin<"__nvvm_ll2d_rp">, + def int_nvvm_ll2d_rp : ClangBuiltin<"__nvvm_ll2d_rp">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2d_rn : GCCBuiltin<"__nvvm_ull2d_rn">, + def int_nvvm_ull2d_rn : ClangBuiltin<"__nvvm_ull2d_rn">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2d_rz : GCCBuiltin<"__nvvm_ull2d_rz">, + def int_nvvm_ull2d_rz : ClangBuiltin<"__nvvm_ull2d_rz">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2d_rm : GCCBuiltin<"__nvvm_ull2d_rm">, + def int_nvvm_ull2d_rm : 
ClangBuiltin<"__nvvm_ull2d_rm">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ull2d_rp : GCCBuiltin<"__nvvm_ull2d_rp">, + def int_nvvm_ull2d_rp : ClangBuiltin<"__nvvm_ull2d_rp">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2h_rn_ftz : GCCBuiltin<"__nvvm_f2h_rn_ftz">, + def int_nvvm_f2h_rn_ftz : ClangBuiltin<"__nvvm_f2h_rn_ftz">, DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_f2h_rn : GCCBuiltin<"__nvvm_f2h_rn">, + def int_nvvm_f2h_rn : ClangBuiltin<"__nvvm_f2h_rn">, DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ff2bf16x2_rn : GCCBuiltin<"__nvvm_ff2bf16x2_rn">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2bf16x2_rn_relu : GCCBuiltin<"__nvvm_ff2bf16x2_rn_relu">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2bf16x2_rz : GCCBuiltin<"__nvvm_ff2bf16x2_rz">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2bf16x2_rz_relu : GCCBuiltin<"__nvvm_ff2bf16x2_rz_relu">, + def int_nvvm_ff2bf16x2_rn : ClangBuiltin<"__nvvm_ff2bf16x2_rn">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2bf16x2_rn_relu : ClangBuiltin<"__nvvm_ff2bf16x2_rn_relu">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2bf16x2_rz : ClangBuiltin<"__nvvm_ff2bf16x2_rz">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2bf16x2_rz_relu : ClangBuiltin<"__nvvm_ff2bf16x2_rz_relu">, Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2f16x2_rn : GCCBuiltin<"__nvvm_ff2f16x2_rn">, - Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2f16x2_rn_relu : GCCBuiltin<"__nvvm_ff2f16x2_rn_relu">, - Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2f16x2_rz : GCCBuiltin<"__nvvm_ff2f16x2_rz">, - Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_nvvm_ff2f16x2_rz_relu : GCCBuiltin<"__nvvm_ff2f16x2_rz_relu">, - Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - - def int_nvvm_f2bf16_rn : GCCBuiltin<"__nvvm_f2bf16_rn">, - Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_f2bf16_rn_relu : GCCBuiltin<"__nvvm_f2bf16_rn_relu">, - Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_f2bf16_rz : GCCBuiltin<"__nvvm_f2bf16_rz">, - Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_f2bf16_rz_relu : GCCBuiltin<"__nvvm_f2bf16_rz_relu">, - Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_nvvm_f2tf32_rna : GCCBuiltin<"__nvvm_f2tf32_rna">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_ff2f16x2_rn : ClangBuiltin<"__nvvm_ff2f16x2_rn">, + Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2f16x2_rn_relu : ClangBuiltin<"__nvvm_ff2f16x2_rn_relu">, + Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2f16x2_rz : ClangBuiltin<"__nvvm_ff2f16x2_rz">, + Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2f16x2_rz_relu : 
ClangBuiltin<"__nvvm_ff2f16x2_rz_relu">, + Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_f2bf16_rn : ClangBuiltin<"__nvvm_f2bf16_rn">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2bf16_rn_relu : ClangBuiltin<"__nvvm_f2bf16_rn_relu">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2bf16_rz : ClangBuiltin<"__nvvm_f2bf16_rz">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2bf16_rz_relu : ClangBuiltin<"__nvvm_f2bf16_rz_relu">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_f2tf32_rna : ClangBuiltin<"__nvvm_f2tf32_rna">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; // // Bitcast // - def int_nvvm_bitcast_f2i : GCCBuiltin<"__nvvm_bitcast_f2i">, + def int_nvvm_bitcast_f2i : ClangBuiltin<"__nvvm_bitcast_f2i">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_bitcast_i2f : GCCBuiltin<"__nvvm_bitcast_i2f">, + def int_nvvm_bitcast_i2f : ClangBuiltin<"__nvvm_bitcast_i2f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_bitcast_ll2d : GCCBuiltin<"__nvvm_bitcast_ll2d">, + def int_nvvm_bitcast_ll2d : ClangBuiltin<"__nvvm_bitcast_ll2d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">, + def int_nvvm_bitcast_d2ll : ClangBuiltin<"__nvvm_bitcast_d2ll">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; // FNS - def int_nvvm_fns : GCCBuiltin<"__nvvm_fns">, + def int_nvvm_fns : ClangBuiltin<"__nvvm_fns">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Atomics not available as llvm intrinsics. def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty], [LLVMAnyPointerType, llvm_i32_ty], - [IntrArgMemOnly, NoCapture>]>; + [IntrArgMemOnly, IntrNoCallback, NoCapture>]>; def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty], [LLVMAnyPointerType, llvm_i32_ty], - [IntrArgMemOnly, NoCapture>]>; + [IntrArgMemOnly, IntrNoCallback, NoCapture>]>; class SCOPED_ATOMIC2_impl : Intrinsic<[elty], [LLVMAnyPointerType>, LLVMMatchType<0>], - [IntrArgMemOnly, NoCapture>]>; + [IntrArgMemOnly, IntrNoCallback, NoCapture>]>; class SCOPED_ATOMIC3_impl : Intrinsic<[elty], [LLVMAnyPointerType>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrArgMemOnly, NoCapture>]>; + [IntrArgMemOnly, IntrNoCallback, NoCapture>]>; multiclass PTXAtomicWithScope2 { def _cta : SCOPED_ATOMIC2_impl; @@ -1280,177 +1331,179 @@ let TargetPrefix = "nvvm" in { // The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the // intrinsics in this file, this one is a user-facing API. - def int_nvvm_barrier0 : GCCBuiltin<"__syncthreads">, - Intrinsic<[], [], [IntrConvergent]>; + def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">, + Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; // Synchronize all threads in the CTA at barrier 'n'. - def int_nvvm_barrier_n : GCCBuiltin<"__nvvm_bar_n">, - Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>; + def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">, + Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; // Synchronize 'm', a multiple of warp size, (arg 2) threads in // the CTA at barrier 'n' (arg 1). 
   // Synchronize 'm', a multiple of warp size, (arg 2) threads in
   // the CTA at barrier 'n' (arg 1).
-  def int_nvvm_barrier : GCCBuiltin<"__nvvm_bar">,
-      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>;
-  def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
-  def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
-  def int_nvvm_barrier0_or : GCCBuiltin<"__nvvm_bar0_or">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
+  def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
   def int_nvvm_bar_sync :
-      Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
-      GCCBuiltin<"__nvvm_bar_sync">;
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
+      ClangBuiltin<"__nvvm_bar_sync">;
   def int_nvvm_bar_warp_sync :
-      Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
-      GCCBuiltin<"__nvvm_bar_warp_sync">;
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
+      ClangBuiltin<"__nvvm_bar_warp_sync">;
   // barrier.sync id[, cnt]
   def int_nvvm_barrier_sync :
-      Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
-      GCCBuiltin<"__nvvm_barrier_sync">;
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
+      ClangBuiltin<"__nvvm_barrier_sync">;
   def int_nvvm_barrier_sync_cnt :
-      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>,
-      GCCBuiltin<"__nvvm_barrier_sync_cnt">;
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
+      ClangBuiltin<"__nvvm_barrier_sync_cnt">;
   // Membar
-  def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">,
-      Intrinsic<[], [], []>;
-  def int_nvvm_membar_gl : GCCBuiltin<"__nvvm_membar_gl">,
-      Intrinsic<[], [], []>;
-  def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
-      Intrinsic<[], [], []>;
+  def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
+      Intrinsic<[], [], [IntrNoCallback]>;
+  def int_nvvm_membar_gl : ClangBuiltin<"__nvvm_membar_gl">,
+      Intrinsic<[], [], [IntrNoCallback]>;
+  def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
+      Intrinsic<[], [], [IntrNoCallback]>;
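The bar0_popc/and/or forms above are aggregating barriers: every thread contributes a predicate and all threads receive the combined result. Clang's CUDA headers surface them as __syncthreads_count/_and/_or; a hedged sketch (that header mapping is an assumption here, only the __nvvm_bar0_popc builtin name comes from the patch):

    __global__ void count_positive(const float *v, int *n) {
      // All threads must reach the barrier; each passes a predicate, and the
      // popc variant returns, to every thread, how many predicates were true.
      int live = __syncthreads_count(v[threadIdx.x] > 0.0f);  // ~ __nvvm_bar0_popc
      if (threadIdx.x == 0)
        *n = live;
    }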
ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">, + Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>; def int_nvvm_cp_async_ca_shared_global_4 : - GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">, + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">, Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, NoAlias>, NoAlias>, + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.4">; def int_nvvm_cp_async_ca_shared_global_8 : - GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">, + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">, Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, NoAlias>, NoAlias>, + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.8">; def int_nvvm_cp_async_ca_shared_global_16 : - GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">, + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">, Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, NoAlias>, NoAlias>, + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.16">; def int_nvvm_cp_async_cg_shared_global_16 : - GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">, + ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">, Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, NoAlias>, NoAlias>, + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.cg.shared.global.16">; def int_nvvm_cp_async_commit_group : - GCCBuiltin<"__nvvm_cp_async_commit_group">, + ClangBuiltin<"__nvvm_cp_async_commit_group">, Intrinsic<[],[],[]>; def int_nvvm_cp_async_wait_group : - GCCBuiltin<"__nvvm_cp_async_wait_group">, + ClangBuiltin<"__nvvm_cp_async_wait_group">, Intrinsic<[],[llvm_i32_ty],[ImmArg>]>; def int_nvvm_cp_async_wait_all : - GCCBuiltin<"__nvvm_cp_async_wait_all">, + ClangBuiltin<"__nvvm_cp_async_wait_all">, Intrinsic<[],[],[]>; // mbarrier -def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">, - Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>; +def int_nvvm_mbarrier_init : ClangBuiltin<"__nvvm_mbarrier_init">, + Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>; def int_nvvm_mbarrier_init_shared : - GCCBuiltin<"__nvvm_mbarrier_init_shared">, - Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>; + ClangBuiltin<"__nvvm_mbarrier_init_shared">, + Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">, +def int_nvvm_mbarrier_inval : ClangBuiltin<"__nvvm_mbarrier_inval">, Intrinsic<[],[llvm_i64ptr_ty], - [IntrConvergent, IntrWriteMem, IntrArgMemOnly, + [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback, WriteOnly>, NoCapture>]>; def int_nvvm_mbarrier_inval_shared : - GCCBuiltin<"__nvvm_mbarrier_inval_shared">, + ClangBuiltin<"__nvvm_mbarrier_inval_shared">, Intrinsic<[],[llvm_shared_i64ptr_ty], - [IntrConvergent, IntrWriteMem, IntrArgMemOnly, + [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback, WriteOnly>, NoCapture>]>; -def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">, - Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>; +def int_nvvm_mbarrier_arrive : ClangBuiltin<"__nvvm_mbarrier_arrive">, + Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent, IntrNoCallback]>; def 
 // mbarrier
-def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">,
-    Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_init : ClangBuiltin<"__nvvm_mbarrier_init">,
+    Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_init_shared :
-    GCCBuiltin<"__nvvm_mbarrier_init_shared">,
-    Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_init_shared">,
+    Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
-def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">,
+def int_nvvm_mbarrier_inval : ClangBuiltin<"__nvvm_mbarrier_inval">,
     Intrinsic<[],[llvm_i64ptr_ty],
-              [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+              [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback,
                WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
 def int_nvvm_mbarrier_inval_shared :
-    GCCBuiltin<"__nvvm_mbarrier_inval_shared">,
+    ClangBuiltin<"__nvvm_mbarrier_inval_shared">,
     Intrinsic<[],[llvm_shared_i64ptr_ty],
-              [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+              [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback,
                WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
-def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">,
-    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive : ClangBuiltin<"__nvvm_mbarrier_arrive">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_shared :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_shared">,
-    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_noComplete :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
-    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_noComplete_shared :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
-    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty,
+              llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_drop :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_drop">,
-    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_drop">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_drop_shared :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
-    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_drop_noComplete :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
-    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_arrive_drop_noComplete_shared :
-    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
-    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty,
+              llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_test_wait :
-    GCCBuiltin<"__nvvm_mbarrier_test_wait">,
-    Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_test_wait">,
+    Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_test_wait_shared :
-    GCCBuiltin<"__nvvm_mbarrier_test_wait_shared">,
-    Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_test_wait_shared">,
+    Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent, IntrNoCallback]>;
 def int_nvvm_mbarrier_pending_count :
-    GCCBuiltin<"__nvvm_mbarrier_pending_count">,
-    Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent]>;
+    ClangBuiltin<"__nvvm_mbarrier_pending_count">,
+    Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent, IntrNoCallback]>;
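Per the signatures above, an mbarrier is an i64 object in memory: init takes an expected arrival count, arrive returns an i64 phase token, and test_wait polls that token. A hedged CUDA sketch (the builtin spellings are the ClangBuiltin names above; passing a __shared__ address straight to the *_shared variants is an assumption about clang's pointer handling):

    __global__ void mbar_demo() {
      __shared__ long long bar;                          // the i64 mbarrier object
      if (threadIdx.x == 0)
        __nvvm_mbarrier_init_shared(&bar, blockDim.x);   // expect one arrival per thread
      __syncthreads();
      long long token = __nvvm_mbarrier_arrive_shared(&bar);
      while (!__nvvm_mbarrier_test_wait_shared(&bar, token))
        ;                                                // spin until this phase completes
    }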
 // Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 // Generated within nvvm. Use for ldg on sm_35 or later. Second arg is the
 // pointer's alignment.
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.p">;
 // Use for generic pointers
@@ -1491,7 +1544,7 @@ def int_nvvm_ptr_gen_to_constant: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
 // This is for params that are passed to kernel functions by pointer by-val.
 def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
                                      [llvm_anyptr_ty],
-                                     [IntrNoMem, IntrSpeculatable],
+                                     [IntrNoMem, IntrSpeculatable, IntrNoCallback],
                                      "llvm.nvvm.ptr.gen.to.param">;
 // Move intrinsics, used in nvvm internally
@@ -1531,149 +1584,149 @@ def int_nvvm_reflect :
 def int_nvvm_isspacep_const :
     DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
                           "llvm.nvvm.isspacep.const">,
-    GCCBuiltin<"__nvvm_isspacep_const">;
+    ClangBuiltin<"__nvvm_isspacep_const">;
 def int_nvvm_isspacep_global :
     DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
                           "llvm.nvvm.isspacep.global">,
-    GCCBuiltin<"__nvvm_isspacep_global">;
+    ClangBuiltin<"__nvvm_isspacep_global">;
 def int_nvvm_isspacep_local :
     DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
                           "llvm.nvvm.isspacep.local">,
-    GCCBuiltin<"__nvvm_isspacep_local">;
+    ClangBuiltin<"__nvvm_isspacep_local">;
 def int_nvvm_isspacep_shared :
     DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
                           "llvm.nvvm.isspacep.shared">,
-    GCCBuiltin<"__nvvm_isspacep_shared">;
+    ClangBuiltin<"__nvvm_isspacep_shared">;
 // Environment register read
 def int_nvvm_read_ptx_sreg_envreg0 :
     DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
                           "llvm.nvvm.read.ptx.sreg.envreg0">,
-    GCCBuiltin<"__nvvm_read_ptx_sreg_envreg0">;
+    ClangBuiltin<"__nvvm_read_ptx_sreg_envreg0">;
 def int_nvvm_read_ptx_sreg_envreg1 :
    DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
                          "llvm.nvvm.read.ptx.sreg.envreg1">,
-    GCCBuiltin<"__nvvm_read_ptx_sreg_envreg1">;
+    ClangBuiltin<"__nvvm_read_ptx_sreg_envreg1">;
 def int_nvvm_read_ptx_sreg_envreg2 :
    DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
                          "llvm.nvvm.read.ptx.sreg.envreg2">,
-    GCCBuiltin<"__nvvm_read_ptx_sreg_envreg2">;
+
ClangBuiltin<"__nvvm_read_ptx_sreg_envreg2">; def int_nvvm_read_ptx_sreg_envreg3 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg3">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg3">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg3">; def int_nvvm_read_ptx_sreg_envreg4 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg4">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg4">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg4">; def int_nvvm_read_ptx_sreg_envreg5 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg5">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg5">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg5">; def int_nvvm_read_ptx_sreg_envreg6 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg6">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg6">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg6">; def int_nvvm_read_ptx_sreg_envreg7 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg7">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg7">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg7">; def int_nvvm_read_ptx_sreg_envreg8 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg8">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg8">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg8">; def int_nvvm_read_ptx_sreg_envreg9 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg9">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg9">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg9">; def int_nvvm_read_ptx_sreg_envreg10 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg10">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg10">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg10">; def int_nvvm_read_ptx_sreg_envreg11 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg11">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg11">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg11">; def int_nvvm_read_ptx_sreg_envreg12 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg12">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg12">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg12">; def int_nvvm_read_ptx_sreg_envreg13 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg13">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg13">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg13">; def int_nvvm_read_ptx_sreg_envreg14 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg14">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg14">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg14">; def int_nvvm_read_ptx_sreg_envreg15 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg15">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg15">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg15">; def int_nvvm_read_ptx_sreg_envreg16 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg16">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg16">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg16">; def int_nvvm_read_ptx_sreg_envreg17 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], 
"llvm.nvvm.read.ptx.sreg.envreg17">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg17">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg17">; def int_nvvm_read_ptx_sreg_envreg18 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg18">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg18">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg18">; def int_nvvm_read_ptx_sreg_envreg19 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg19">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg19">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg19">; def int_nvvm_read_ptx_sreg_envreg20 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg20">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg20">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg20">; def int_nvvm_read_ptx_sreg_envreg21 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg21">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg21">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg21">; def int_nvvm_read_ptx_sreg_envreg22 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg22">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg22">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg22">; def int_nvvm_read_ptx_sreg_envreg23 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg23">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg23">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg23">; def int_nvvm_read_ptx_sreg_envreg24 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg24">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg24">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg24">; def int_nvvm_read_ptx_sreg_envreg25 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg25">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg25">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg25">; def int_nvvm_read_ptx_sreg_envreg26 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg26">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg26">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg26">; def int_nvvm_read_ptx_sreg_envreg27 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg27">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg27">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg27">; def int_nvvm_read_ptx_sreg_envreg28 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg28">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg28">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg28">; def int_nvvm_read_ptx_sreg_envreg29 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg29">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg29">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg29">; def int_nvvm_read_ptx_sreg_envreg30 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg30">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg30">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg30">; def int_nvvm_read_ptx_sreg_envreg31 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.read.ptx.sreg.envreg31">, - GCCBuiltin<"__nvvm_read_ptx_sreg_envreg31">; + ClangBuiltin<"__nvvm_read_ptx_sreg_envreg31">; // Texture Fetch @@ 
-3161,62 +3214,62 @@ def int_nvvm_suld_3d_v4i32_zero def int_nvvm_txq_channel_order : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.channel.order">, - GCCBuiltin<"__nvvm_txq_channel_order">; + ClangBuiltin<"__nvvm_txq_channel_order">; def int_nvvm_txq_channel_data_type : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.channel.data.type">, - GCCBuiltin<"__nvvm_txq_channel_data_type">; + ClangBuiltin<"__nvvm_txq_channel_data_type">; def int_nvvm_txq_width : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.width">, - GCCBuiltin<"__nvvm_txq_width">; + ClangBuiltin<"__nvvm_txq_width">; def int_nvvm_txq_height : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.height">, - GCCBuiltin<"__nvvm_txq_height">; + ClangBuiltin<"__nvvm_txq_height">; def int_nvvm_txq_depth : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.depth">, - GCCBuiltin<"__nvvm_txq_depth">; + ClangBuiltin<"__nvvm_txq_depth">; def int_nvvm_txq_array_size : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.array.size">, - GCCBuiltin<"__nvvm_txq_array_size">; + ClangBuiltin<"__nvvm_txq_array_size">; def int_nvvm_txq_num_samples : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.num.samples">, - GCCBuiltin<"__nvvm_txq_num_samples">; + ClangBuiltin<"__nvvm_txq_num_samples">; def int_nvvm_txq_num_mipmap_levels : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.txq.num.mipmap.levels">, - GCCBuiltin<"__nvvm_txq_num_mipmap_levels">; + ClangBuiltin<"__nvvm_txq_num_mipmap_levels">; //===- Surface Query ------------------------------------------------------===// def int_nvvm_suq_channel_order : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.channel.order">, - GCCBuiltin<"__nvvm_suq_channel_order">; + ClangBuiltin<"__nvvm_suq_channel_order">; def int_nvvm_suq_channel_data_type : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.channel.data.type">, - GCCBuiltin<"__nvvm_suq_channel_data_type">; + ClangBuiltin<"__nvvm_suq_channel_data_type">; def int_nvvm_suq_width : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.width">, - GCCBuiltin<"__nvvm_suq_width">; + ClangBuiltin<"__nvvm_suq_width">; def int_nvvm_suq_height : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.height">, - GCCBuiltin<"__nvvm_suq_height">; + ClangBuiltin<"__nvvm_suq_height">; def int_nvvm_suq_depth : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.depth">, - GCCBuiltin<"__nvvm_suq_depth">; + ClangBuiltin<"__nvvm_suq_depth">; def int_nvvm_suq_array_size : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.suq.array.size">, - GCCBuiltin<"__nvvm_suq_array_size">; + ClangBuiltin<"__nvvm_suq_array_size">; //===- Handle Query -------------------------------------------------------===// @@ -3224,15 +3277,15 @@ def int_nvvm_suq_array_size def int_nvvm_istypep_sampler : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.istypep.sampler">, - GCCBuiltin<"__nvvm_istypep_sampler">; + ClangBuiltin<"__nvvm_istypep_sampler">; def int_nvvm_istypep_surface : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.istypep.surface">, - GCCBuiltin<"__nvvm_istypep_surface">; + ClangBuiltin<"__nvvm_istypep_surface">; def int_nvvm_istypep_texture : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], "llvm.nvvm.istypep.texture">, - GCCBuiltin<"__nvvm_istypep_texture">; + ClangBuiltin<"__nvvm_istypep_texture">; @@ -3243,810 
+3296,810 @@ def int_nvvm_istypep_texture def int_nvvm_sust_b_1d_i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_i8_clamp">; def int_nvvm_sust_b_1d_i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_i16_clamp">; def int_nvvm_sust_b_1d_i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_i32_clamp">; def int_nvvm_sust_b_1d_i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_i64_clamp">; def int_nvvm_sust_b_1d_v2i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i8_clamp">; def int_nvvm_sust_b_1d_v2i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i16_clamp">; def int_nvvm_sust_b_1d_v2i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v2i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i32_clamp">; def int_nvvm_sust_b_1d_v2i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.v2i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i64_clamp">; def int_nvvm_sust_b_1d_v4i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i8_clamp">; def int_nvvm_sust_b_1d_v4i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i16_clamp">; def int_nvvm_sust_b_1d_v4i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v4i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i32_clamp">; def int_nvvm_sust_b_1d_array_i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i8_clamp">; def int_nvvm_sust_b_1d_array_i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i16_clamp">; def int_nvvm_sust_b_1d_array_i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i32_clamp">; def int_nvvm_sust_b_1d_array_i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], 
"llvm.nvvm.sust.b.1d.array.i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i64_clamp">; def int_nvvm_sust_b_1d_array_v2i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i8_clamp">; def int_nvvm_sust_b_1d_array_v2i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i16_clamp">; def int_nvvm_sust_b_1d_array_v2i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v2i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i32_clamp">; def int_nvvm_sust_b_1d_array_v2i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.array.v2i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i64_clamp">; def int_nvvm_sust_b_1d_array_v4i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i8_clamp">; def int_nvvm_sust_b_1d_array_v4i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i16_clamp">; def int_nvvm_sust_b_1d_array_v4i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v4i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i32_clamp">; def int_nvvm_sust_b_2d_i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_i8_clamp">; def int_nvvm_sust_b_2d_i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_i16_clamp">; def int_nvvm_sust_b_2d_i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_i32_clamp">; def int_nvvm_sust_b_2d_i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_i64_clamp">; def int_nvvm_sust_b_2d_v2i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i8_clamp">; def int_nvvm_sust_b_2d_v2i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i16_clamp">; + 
ClangBuiltin<"__nvvm_sust_b_2d_v2i16_clamp">; def int_nvvm_sust_b_2d_v2i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v2i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i32_clamp">; def int_nvvm_sust_b_2d_v2i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.v2i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i64_clamp">; def int_nvvm_sust_b_2d_v4i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i8_clamp">; def int_nvvm_sust_b_2d_v4i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i16_clamp">; def int_nvvm_sust_b_2d_v4i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v4i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i32_clamp">; def int_nvvm_sust_b_2d_array_i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i8_clamp">; def int_nvvm_sust_b_2d_array_i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i16_clamp">; def int_nvvm_sust_b_2d_array_i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i32_clamp">; def int_nvvm_sust_b_2d_array_i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i64_clamp">; def int_nvvm_sust_b_2d_array_v2i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i8_clamp">; def int_nvvm_sust_b_2d_array_v2i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i16_clamp">; def int_nvvm_sust_b_2d_array_v2i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v2i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i32_clamp">; def int_nvvm_sust_b_2d_array_v2i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.v2i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_clamp">; + 
ClangBuiltin<"__nvvm_sust_b_2d_array_v2i64_clamp">; def int_nvvm_sust_b_2d_array_v4i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i8_clamp">; def int_nvvm_sust_b_2d_array_v4i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i16_clamp">; def int_nvvm_sust_b_2d_array_v4i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v4i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i32_clamp">; def int_nvvm_sust_b_3d_i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_i8_clamp">; def int_nvvm_sust_b_3d_i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_i16_clamp">; def int_nvvm_sust_b_3d_i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_i32_clamp">; def int_nvvm_sust_b_3d_i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_i64_clamp">; def int_nvvm_sust_b_3d_v2i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i8_clamp">; def int_nvvm_sust_b_3d_v2i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i16_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i16_clamp">; def int_nvvm_sust_b_3d_v2i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v2i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i32_clamp">; def int_nvvm_sust_b_3d_v2i64_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.v2i64.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i64_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i64_clamp">; def int_nvvm_sust_b_3d_v4i8_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v4i8.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i8_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i8_clamp">; def int_nvvm_sust_b_3d_v4i16_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v4i16.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i16_clamp">; + 
ClangBuiltin<"__nvvm_sust_b_3d_v4i16_clamp">; def int_nvvm_sust_b_3d_v4i32_clamp : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v4i32.clamp">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i32_clamp">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i32_clamp">; // .trap variant def int_nvvm_sust_b_1d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_i8_trap">; def int_nvvm_sust_b_1d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_i16_trap">; def int_nvvm_sust_b_1d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_i32_trap">; def int_nvvm_sust_b_1d_i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.i64.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_i64_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_i64_trap">; def int_nvvm_sust_b_1d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i8_trap">; def int_nvvm_sust_b_1d_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i16_trap">; def int_nvvm_sust_b_1d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i32_trap">; def int_nvvm_sust_b_1d_v2i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.v2i64.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i64_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i64_trap">; def int_nvvm_sust_b_1d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i8_trap">; def int_nvvm_sust_b_1d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i16_trap">; def int_nvvm_sust_b_1d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i32_trap">; def int_nvvm_sust_b_1d_array_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i8_trap">; def int_nvvm_sust_b_1d_array_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i16_trap">; def int_nvvm_sust_b_1d_array_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], 
"llvm.nvvm.sust.b.1d.array.i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i32_trap">; def int_nvvm_sust_b_1d_array_i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.array.i64.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i64_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i64_trap">; def int_nvvm_sust_b_1d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i8_trap">; def int_nvvm_sust_b_1d_array_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i16_trap">; def int_nvvm_sust_b_1d_array_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i32_trap">; def int_nvvm_sust_b_1d_array_v2i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.array.v2i64.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i64_trap">; def int_nvvm_sust_b_1d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i8_trap">; def int_nvvm_sust_b_1d_array_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i16_trap">; def int_nvvm_sust_b_1d_array_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i32_trap">; def int_nvvm_sust_b_2d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_i8_trap">; def int_nvvm_sust_b_2d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_i16_trap">; def int_nvvm_sust_b_2d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_i32_trap">; def int_nvvm_sust_b_2d_i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.i64.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_i64_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_i64_trap">; def int_nvvm_sust_b_2d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i8_trap">; def int_nvvm_sust_b_2d_v2i16_trap : 
Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i16_trap">; def int_nvvm_sust_b_2d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i32_trap">; def int_nvvm_sust_b_2d_v2i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.v2i64.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i64_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i64_trap">; def int_nvvm_sust_b_2d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i8_trap">; def int_nvvm_sust_b_2d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i16_trap">; def int_nvvm_sust_b_2d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i32_trap">; def int_nvvm_sust_b_2d_array_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i8_trap">; def int_nvvm_sust_b_2d_array_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i16_trap">; def int_nvvm_sust_b_2d_array_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i32_trap">; def int_nvvm_sust_b_2d_array_i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.i64.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i64_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i64_trap">; def int_nvvm_sust_b_2d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i8_trap">; def int_nvvm_sust_b_2d_array_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i16_trap">; def int_nvvm_sust_b_2d_array_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i32_trap">; def int_nvvm_sust_b_2d_array_v2i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, 
llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.v2i64.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i64_trap">; def int_nvvm_sust_b_2d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i8_trap">; def int_nvvm_sust_b_2d_array_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i16_trap">; def int_nvvm_sust_b_2d_array_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i32_trap">; def int_nvvm_sust_b_3d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i8.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_i8_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_i8_trap">; def int_nvvm_sust_b_3d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i16.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_i16_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_i16_trap">; def int_nvvm_sust_b_3d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.i32.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_i32_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_i32_trap">; def int_nvvm_sust_b_3d_i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.i64.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_i64_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_i64_trap">; def int_nvvm_sust_b_3d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i8_trap">; def int_nvvm_sust_b_3d_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i16_trap">; def int_nvvm_sust_b_3d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i32_trap">; def int_nvvm_sust_b_3d_v2i64_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.v2i64.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i64_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i64_trap">; def int_nvvm_sust_b_3d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i8_trap">; def int_nvvm_sust_b_3d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], 
"llvm.nvvm.sust.b.3d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i16_trap">; def int_nvvm_sust_b_3d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i32_trap">; // .zero variant def int_nvvm_sust_b_1d_i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_i8_zero">; def int_nvvm_sust_b_1d_i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_i16_zero">; def int_nvvm_sust_b_1d_i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_i32_zero">; def int_nvvm_sust_b_1d_i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.i64.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_i64_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_i64_zero">; def int_nvvm_sust_b_1d_v2i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i8_zero">; def int_nvvm_sust_b_1d_v2i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v2i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i16_zero">; def int_nvvm_sust_b_1d_v2i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v2i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i32_zero">; def int_nvvm_sust_b_1d_v2i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.v2i64.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v2i64_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v2i64_zero">; def int_nvvm_sust_b_1d_v4i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i8_zero">; def int_nvvm_sust_b_1d_v4i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.v4i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i16_zero">; def int_nvvm_sust_b_1d_v4i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.v4i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_v4i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_v4i32_zero">; def int_nvvm_sust_b_1d_array_i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i8_zero">; def int_nvvm_sust_b_1d_array_i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i16_zero">; def int_nvvm_sust_b_1d_array_i32_zero : Intrinsic<[], 
[llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i32_zero">; def int_nvvm_sust_b_1d_array_i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.array.i64.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_i64_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_i64_zero">; def int_nvvm_sust_b_1d_array_v2i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i8_zero">; def int_nvvm_sust_b_1d_array_v2i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v2i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i16_zero">; def int_nvvm_sust_b_1d_array_v2i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v2i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i32_zero">; def int_nvvm_sust_b_1d_array_v2i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.1d.array.v2i64.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v2i64_zero">; def int_nvvm_sust_b_1d_array_v4i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i8.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i8_zero">; def int_nvvm_sust_b_1d_array_v4i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.1d.array.v4i16.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i16_zero">; def int_nvvm_sust_b_1d_array_v4i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.1d.array.v4i32.zero">, - GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_zero">; + ClangBuiltin<"__nvvm_sust_b_1d_array_v4i32_zero">; def int_nvvm_sust_b_2d_i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_i8_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_i8_zero">; def int_nvvm_sust_b_2d_i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_i16_zero">; def int_nvvm_sust_b_2d_i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_i32_zero">; def int_nvvm_sust_b_2d_i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.i64.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_i64_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_i64_zero">; def int_nvvm_sust_b_2d_v2i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i8_zero">; + 
ClangBuiltin<"__nvvm_sust_b_2d_v2i8_zero">; def int_nvvm_sust_b_2d_v2i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v2i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i16_zero">; def int_nvvm_sust_b_2d_v2i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v2i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i32_zero">; def int_nvvm_sust_b_2d_v2i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.v2i64.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v2i64_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v2i64_zero">; def int_nvvm_sust_b_2d_v4i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i8_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i8_zero">; def int_nvvm_sust_b_2d_v4i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.v4i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i16_zero">; def int_nvvm_sust_b_2d_v4i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.v4i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_v4i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_v4i32_zero">; def int_nvvm_sust_b_2d_array_i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i8_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i8_zero">; def int_nvvm_sust_b_2d_array_i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i16_zero">; def int_nvvm_sust_b_2d_array_i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i32_zero">; def int_nvvm_sust_b_2d_array_i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.i64.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_i64_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_i64_zero">; def int_nvvm_sust_b_2d_array_v2i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i8_zero">; def int_nvvm_sust_b_2d_array_v2i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v2i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i16_zero">; def int_nvvm_sust_b_2d_array_v2i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v2i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i32_zero">; def int_nvvm_sust_b_2d_array_v2i64_zero : 
Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.2d.array.v2i64.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v2i64_zero">; def int_nvvm_sust_b_2d_array_v4i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i8.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i8_zero">; def int_nvvm_sust_b_2d_array_v4i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.2d.array.v4i16.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i16_zero">; def int_nvvm_sust_b_2d_array_v4i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.2d.array.v4i32.zero">, - GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_zero">; + ClangBuiltin<"__nvvm_sust_b_2d_array_v4i32_zero">; def int_nvvm_sust_b_3d_i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i8.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_i8_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_i8_zero">; def int_nvvm_sust_b_3d_i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.i16.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_i16_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_i16_zero">; def int_nvvm_sust_b_3d_i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.i32.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_i32_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_i32_zero">; def int_nvvm_sust_b_3d_i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.i64.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_i64_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_i64_zero">; def int_nvvm_sust_b_3d_v2i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i8.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i8_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i8_zero">; def int_nvvm_sust_b_3d_v2i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v2i16.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i16_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i16_zero">; def int_nvvm_sust_b_3d_v2i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v2i32.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i32_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i32_zero">; def int_nvvm_sust_b_3d_v2i64_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [], "llvm.nvvm.sust.b.3d.v2i64.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v2i64_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v2i64_zero">; def int_nvvm_sust_b_3d_v4i8_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v4i8.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i8_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i8_zero">; def int_nvvm_sust_b_3d_v4i16_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 
llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.b.3d.v4i16.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i16_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i16_zero">; def int_nvvm_sust_b_3d_v4i32_zero : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.b.3d.v4i32.zero">, - GCCBuiltin<"__nvvm_sust_b_3d_v4i32_zero">; + ClangBuiltin<"__nvvm_sust_b_3d_v4i32_zero">; @@ -4055,245 +4108,245 @@ def int_nvvm_sust_b_3d_v4i32_zero def int_nvvm_sust_p_1d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_i8_trap">; def int_nvvm_sust_p_1d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_i16_trap">; def int_nvvm_sust_p_1d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_i32_trap">; def int_nvvm_sust_p_1d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v2i8_trap">; def int_nvvm_sust_p_1d_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v2i16_trap">; def int_nvvm_sust_p_1d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v2i32_trap">; def int_nvvm_sust_p_1d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v4i8_trap">; def int_nvvm_sust_p_1d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v4i16_trap">; def int_nvvm_sust_p_1d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_v4i32_trap">; def int_nvvm_sust_p_1d_array_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.array.i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_i8_trap">; def int_nvvm_sust_p_1d_array_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.array.i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_i16_trap">; def int_nvvm_sust_p_1d_array_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.array.i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_i32_trap">; def int_nvvm_sust_p_1d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], 
"llvm.nvvm.sust.p.1d.array.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v2i8_trap">; def int_nvvm_sust_p_1d_array_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.array.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v2i16_trap">; def int_nvvm_sust_p_1d_array_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.array.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v2i32_trap">; def int_nvvm_sust_p_1d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.array.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v4i8_trap">; def int_nvvm_sust_p_1d_array_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.1d.array.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v4i16_trap">; def int_nvvm_sust_p_1d_array_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.1d.array.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_p_1d_array_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_p_1d_array_v4i32_trap">; def int_nvvm_sust_p_2d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_i8_trap">; def int_nvvm_sust_p_2d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_i16_trap">; def int_nvvm_sust_p_2d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_i32_trap">; def int_nvvm_sust_p_2d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v2i8_trap">; def int_nvvm_sust_p_2d_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v2i16_trap">; def int_nvvm_sust_p_2d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v2i32_trap">; def int_nvvm_sust_p_2d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v4i8_trap">; def int_nvvm_sust_p_2d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v4i16_trap">; def 
int_nvvm_sust_p_2d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_v4i32_trap">; def int_nvvm_sust_p_2d_array_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_i8_trap">; def int_nvvm_sust_p_2d_array_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_i16_trap">; def int_nvvm_sust_p_2d_array_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.array.i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_i32_trap">; def int_nvvm_sust_p_2d_array_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v2i8_trap">; def int_nvvm_sust_p_2d_array_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v2i16_trap">; def int_nvvm_sust_p_2d_array_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.array.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v2i32_trap">; def int_nvvm_sust_p_2d_array_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v4i8_trap">; def int_nvvm_sust_p_2d_array_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.2d.array.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v4i16_trap">; def int_nvvm_sust_p_2d_array_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.2d.array.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_p_2d_array_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_p_2d_array_v4i32_trap">; def int_nvvm_sust_p_3d_i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.i8.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_i8_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_i8_trap">; def int_nvvm_sust_p_3d_i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.i16.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_i16_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_i16_trap">; def int_nvvm_sust_p_3d_i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.3d.i32.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_i32_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_i32_trap">; 
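Every ClangBuiltin<> string in these entries is the name under which clang exposes the intrinsic when compiling for the NVPTX target, so the surface-store intrinsics can be exercised directly from C. A minimal sketch, assuming clang invoked with --target=nvptx64-nvidia-cuda and an opaque 64-bit surface handle obtained on the host side:

  /* Unformatted (.b) 1-D surface store; per the PTX ISA the .trap
     variants fault on out-of-range coordinates, while the .clamp and
     .zero variants clamp the coordinate or drop the access instead. */
  void store_word(long long surf, int byte_x, int value) {
    __nvvm_sust_b_1d_i32_trap(surf, byte_x, value);  /* llvm.nvvm.sust.b.1d.i32.trap */
  }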
def int_nvvm_sust_p_3d_v2i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.v2i8.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v2i8_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v2i8_trap">; def int_nvvm_sust_p_3d_v2i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.v2i16.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v2i16_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v2i16_trap">; def int_nvvm_sust_p_3d_v2i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.3d.v2i32.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v2i32_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v2i32_trap">; def int_nvvm_sust_p_3d_v4i8_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.v4i8.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v4i8_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v4i8_trap">; def int_nvvm_sust_p_3d_v4i16_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], "llvm.nvvm.sust.p.3d.v4i16.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v4i16_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v4i16_trap">; def int_nvvm_sust_p_3d_v4i32_trap : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], "llvm.nvvm.sust.p.3d.v4i32.trap">, - GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; + ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; def int_nvvm_rotate_b32 : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">, - GCCBuiltin<"__nvvm_rotate_b32">; + ClangBuiltin<"__nvvm_rotate_b32">; def int_nvvm_rotate_b64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">, - GCCBuiltin<"__nvvm_rotate_b64">; + ClangBuiltin<"__nvvm_rotate_b64">; def int_nvvm_rotate_right_b64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">, - GCCBuiltin<"__nvvm_rotate_right_b64">; + ClangBuiltin<"__nvvm_rotate_right_b64">; def int_nvvm_swap_lo_hi_b64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">, - GCCBuiltin<"__nvvm_swap_lo_hi_b64">; + ClangBuiltin<"__nvvm_swap_lo_hi_b64">; // Accessing special registers. @@ -4304,31 +4357,31 @@ multiclass PTXReadSRegIntrinsic_v4i32 { // FIXME: Enable this once v4i32 support is enabled in back-end. 
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>; - def _x : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_x">; - def _y : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_y">; - def _z : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_z">; - def _w : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_w">; + def _x : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_x">; + def _y : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_y">; + def _z : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_z">; + def _w : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_w">; } class PTXReadSRegIntrinsic_r32 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>; + ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; class PTXReadSRegIntrinsic_r64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>; + ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; // Intrinsics to read registers with non-constant values. E.g. the values that // do change over the kernel lifetime. Such reads should not be CSE'd. class PTXReadNCSRegIntrinsic_r32 - : Intrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>; + : Intrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; class PTXReadNCSRegIntrinsic_r64 - : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly]>, - GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>; + : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback]>, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">; defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">; @@ -4375,14 +4428,16 @@ foreach sync = [false, true] in { foreach return_pred = [false, true] in { foreach i = [SHFL_INFO] in { if i.withGccBuiltin then { - def i.Name : GCCBuiltin, + def i.Name : ClangBuiltin, Intrinsic; } if i.withoutGccBuiltin then { def i.Name : Intrinsic; + [IntrInaccessibleMemOnly, IntrConvergent, + IntrNoCallback], i.IntrName>; } } } @@ -4397,23 +4452,23 @@ foreach sync = [false, true] in { // vote.all pred def int_nvvm_vote_all : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all">, - GCCBuiltin<"__nvvm_vote_all">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.all">, + ClangBuiltin<"__nvvm_vote_all">; // vote.any pred def int_nvvm_vote_any : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any">, - GCCBuiltin<"__nvvm_vote_any">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.any">, + ClangBuiltin<"__nvvm_vote_any">; // vote.uni pred def int_nvvm_vote_uni : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni">, - 
GCCBuiltin<"__nvvm_vote_uni">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.uni">, + ClangBuiltin<"__nvvm_vote_uni">; // vote.ballot pred def int_nvvm_vote_ballot : Intrinsic<[llvm_i32_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot">, - GCCBuiltin<"__nvvm_vote_ballot">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot">, + ClangBuiltin<"__nvvm_vote_ballot">; // // VOTE.SYNC @@ -4422,23 +4477,23 @@ def int_nvvm_vote_ballot : // vote.sync.all mask, pred def int_nvvm_vote_all_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all.sync">, - GCCBuiltin<"__nvvm_vote_all_sync">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.all.sync">, + ClangBuiltin<"__nvvm_vote_all_sync">; // vote.sync.any mask, pred def int_nvvm_vote_any_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any.sync">, - GCCBuiltin<"__nvvm_vote_any_sync">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.any.sync">, + ClangBuiltin<"__nvvm_vote_any_sync">; // vote.sync.uni mask, pred def int_nvvm_vote_uni_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni.sync">, - GCCBuiltin<"__nvvm_vote_uni_sync">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.uni.sync">, + ClangBuiltin<"__nvvm_vote_uni_sync">; // vote.sync.ballot mask, pred def int_nvvm_vote_ballot_sync : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot.sync">, - GCCBuiltin<"__nvvm_vote_ballot_sync">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot.sync">, + ClangBuiltin<"__nvvm_vote_ballot_sync">; // // MATCH.SYNC @@ -4446,13 +4501,13 @@ def int_nvvm_vote_ballot_sync : // match.any.sync.b32 mask, value def int_nvvm_match_any_sync_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i32">, - GCCBuiltin<"__nvvm_match_any_sync_i32">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.any.sync.i32">, + ClangBuiltin<"__nvvm_match_any_sync_i32">; // match.any.sync.b64 mask, value def int_nvvm_match_any_sync_i64 : - Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">, - GCCBuiltin<"__nvvm_match_any_sync_i64">; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.any.sync.i64">, + ClangBuiltin<"__nvvm_match_any_sync_i64">; // match.all instruction have two variants -- one returns a single value, another // returns a pair {value, predicate}. 
We currently only implement the latter as @@ -4461,54 +4516,54 @@ def int_nvvm_match_any_sync_i64 : // match.all.sync.b32p mask, value def int_nvvm_match_all_sync_i32p : Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">; + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i32p">; // match.all.sync.b64p mask, value def int_nvvm_match_all_sync_i64p : - Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">; + Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">; // // REDUX.SYNC // // redux.sync.min.u32 dst, src, membermask; -def int_nvvm_redux_sync_umin : GCCBuiltin<"__nvvm_redux_sync_umin">, +def int_nvvm_redux_sync_umin : ClangBuiltin<"__nvvm_redux_sync_umin">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.max.u32 dst, src, membermask; -def int_nvvm_redux_sync_umax : GCCBuiltin<"__nvvm_redux_sync_umax">, +def int_nvvm_redux_sync_umax : ClangBuiltin<"__nvvm_redux_sync_umax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.add.s32 dst, src, membermask; -def int_nvvm_redux_sync_add : GCCBuiltin<"__nvvm_redux_sync_add">, +def int_nvvm_redux_sync_add : ClangBuiltin<"__nvvm_redux_sync_add">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.min.s32 dst, src, membermask; -def int_nvvm_redux_sync_min : GCCBuiltin<"__nvvm_redux_sync_min">, +def int_nvvm_redux_sync_min : ClangBuiltin<"__nvvm_redux_sync_min">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.max.s32 dst, src, membermask; -def int_nvvm_redux_sync_max : GCCBuiltin<"__nvvm_redux_sync_max">, +def int_nvvm_redux_sync_max : ClangBuiltin<"__nvvm_redux_sync_max">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.and.b32 dst, src, membermask; -def int_nvvm_redux_sync_and : GCCBuiltin<"__nvvm_redux_sync_and">, +def int_nvvm_redux_sync_and : ClangBuiltin<"__nvvm_redux_sync_and">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.xor.b32 dst, src, membermask; -def int_nvvm_redux_sync_xor : GCCBuiltin<"__nvvm_redux_sync_xor">, +def int_nvvm_redux_sync_xor : ClangBuiltin<"__nvvm_redux_sync_xor">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; // redux.sync.or.b32 dst, src, membermask; -def int_nvvm_redux_sync_or : GCCBuiltin<"__nvvm_redux_sync_or">, +def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly]>; + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; 
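The match.any.sync.i64 and match.all.sync.i64p changes in the hunks above are genuine fixes riding along with the rename: per the PTX ISA, match.sync returns a 32-bit lane mask even when matching 64-bit values, so the result type becomes llvm_i32_ty. As a rough illustration of how the vote/match/redux families surface in user code, a sketch assuming clang targeting NVPTX at sm_80 or newer (match.sync needs sm_70, redux.sync needs sm_80), where mask names the participating lanes:

  /* Convergent warp-level primitives; every lane named in `mask` must
     reach the call together. */
  unsigned warp_demo(unsigned mask, int v) {
    int all_pos    = __nvvm_vote_all_sync(mask, v > 0);   /* vote.all.sync      */
    unsigned peers = __nvvm_match_any_sync_i32(mask, v);  /* lanes with equal v */
    int sum        = __nvvm_redux_sync_add(v, mask);      /* warp-wide add      */
    return all_pos ? (unsigned)sum : peers;
  }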
// // WMMA instructions @@ -4517,7 +4572,7 @@ def int_nvvm_redux_sync_or : GCCBuiltin<"__nvvm_redux_sync_or">, class NVVM_WMMA_LD : Intrinsic>, NoCapture>], + [IntrReadMem, IntrArgMemOnly, IntrNoCallback, ReadOnly>, NoCapture>], WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.intr>; // WMMA.STORE.D @@ -4527,7 +4582,7 @@ class NVVM_WMMA_ST [llvm_anyptr_ty], Frag.regs, !if(WithStride, [llvm_i32_ty], [])), - [IntrWriteMem, IntrArgMemOnly, WriteOnly>, NoCapture>], + [IntrWriteMem, IntrArgMemOnly, IntrNoCallback, WriteOnly>, NoCapture>], WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.intr>; // Create all load/store variants @@ -4550,7 +4605,7 @@ class NVVM_WMMA_MMA : Intrinsic.llvm>; foreach layout_a = ["row", "col"] in { @@ -4577,7 +4632,7 @@ class NVVM_MMA : Intrinsic.llvm>; foreach layout_a = ["row", "col"] in { @@ -4598,7 +4653,7 @@ foreach layout_a = ["row", "col"] in { // LDMATRIX class NVVM_LDMATRIX : Intrinsic>, + [IntrReadMem, IntrArgMemOnly, IntrNoCallback, ReadOnly>, NoCapture>], LDMATRIX_NAME.intr>; diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index b01fa10763b8..577122328dd2 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -18,7 +18,7 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". // dcba/dcbf/dcbi/dcbst/dcbt/dcbz/dcbzl(PPC970) instructions. def int_ppc_dcba : Intrinsic<[], [llvm_ptr_ty], []>; - def int_ppc_dcbf : GCCBuiltin<"__builtin_dcbf">, + def int_ppc_dcbf : ClangBuiltin<"__builtin_dcbf">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly]>; def int_ppc_dcbfps : Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly]>; def int_ppc_dcbstps : Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly]>; @@ -30,136 +30,170 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". 
def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], []>; // Get content from current FPSCR register - def int_ppc_readflm : GCCBuiltin<"__builtin_readflm">, + def int_ppc_readflm : ClangBuiltin<"__builtin_readflm">, Intrinsic<[llvm_double_ty], [], [IntrNoMerge, IntrHasSideEffects]>; // Set FPSCR register, and return previous content - def int_ppc_setflm : GCCBuiltin<"__builtin_setflm">, + def int_ppc_setflm : ClangBuiltin<"__builtin_setflm">, Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrHasSideEffects]>; // Intrinsics for [double]word extended forms of divide instructions - def int_ppc_divwe : GCCBuiltin<"__builtin_divwe">, + def int_ppc_divwe : ClangBuiltin<"__builtin_divwe">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_divweu : GCCBuiltin<"__builtin_divweu">, + def int_ppc_divweu : ClangBuiltin<"__builtin_divweu">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_divde : GCCBuiltin<"__builtin_divde">, + def int_ppc_divde : ClangBuiltin<"__builtin_divde">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_ppc_divdeu : GCCBuiltin<"__builtin_divdeu">, + def int_ppc_divdeu : ClangBuiltin<"__builtin_divdeu">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_ppc_unpack_longdouble : GCCBuiltin<"__builtin_unpack_longdouble">, + def int_ppc_unpack_longdouble : ClangBuiltin<"__builtin_unpack_longdouble">, Intrinsic<[llvm_double_ty], [llvm_ppcf128_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_pack_longdouble : GCCBuiltin<"__builtin_pack_longdouble">, + def int_ppc_pack_longdouble : ClangBuiltin<"__builtin_pack_longdouble">, Intrinsic<[llvm_ppcf128_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; // Generate a random number - def int_ppc_darn : GCCBuiltin<"__builtin_darn">, - Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; - def int_ppc_darnraw : GCCBuiltin<"__builtin_darn_raw">, - Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; - def int_ppc_darn32 : GCCBuiltin<"__builtin_darn_32">, - Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + def int_ppc_darn : ClangBuiltin<"__builtin_darn">, + Intrinsic<[llvm_i64_ty], [], + [IntrNoMerge, IntrHasSideEffects]>; + def int_ppc_darnraw : ClangBuiltin<"__builtin_darn_raw">, + Intrinsic<[llvm_i64_ty], [], + [IntrNoMerge, IntrHasSideEffects]>; + def int_ppc_darn32 : ClangBuiltin<"__builtin_darn_32">, + Intrinsic<[llvm_i32_ty], [], + [IntrNoMerge, IntrHasSideEffects]>; // Bit permute doubleword - def int_ppc_bpermd : GCCBuiltin<"__builtin_bpermd">, + def int_ppc_bpermd : ClangBuiltin<"__builtin_bpermd">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; // Parallel Bits Deposit/Extract Doubleword Builtins. def int_ppc_pdepd - : GCCBuiltin<"__builtin_pdepd">, + : ClangBuiltin<"__builtin_pdepd">, Intrinsic <[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_pextd - : GCCBuiltin<"__builtin_pextd">, + : ClangBuiltin<"__builtin_pextd">, Intrinsic <[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; // Centrifuge Doubleword Builtin. def int_ppc_cfuged - : GCCBuiltin<"__builtin_cfuged">, + : ClangBuiltin<"__builtin_cfuged">, Intrinsic <[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; // Count Leading / Trailing Zeroes under bit Mask Builtins. 
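The darn hunk above is another semantic change hiding among the renames: the random-number builtins drop IntrNoMem in favour of IntrNoMerge and IntrHasSideEffects, so identical calls can no longer be CSE'd or merged into a single hardware read. A sketch of why that matters, assuming clang targeting powerpc64le with -mcpu=pwr9 or newer:

  #include <stdint.h>

  /* With the old IntrNoMem attribute the optimizer was allowed to fold
     the two calls into one `darn` instruction, making a == b always;
     IntrNoMerge/IntrHasSideEffects forces two distinct reads. */
  uint64_t random_pair_xor(void) {
    uint64_t a = (uint64_t)__builtin_darn();
    uint64_t b = (uint64_t)__builtin_darn();
    return a ^ b;
  }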
def int_ppc_cntlzdm - : GCCBuiltin<"__builtin_cntlzdm">, + : ClangBuiltin<"__builtin_cntlzdm">, Intrinsic <[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_cnttzdm - : GCCBuiltin<"__builtin_cnttzdm">, + : ClangBuiltin<"__builtin_cnttzdm">, Intrinsic <[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_truncf128_round_to_odd - : GCCBuiltin<"__builtin_truncf128_round_to_odd">, + : ClangBuiltin<"__builtin_truncf128_round_to_odd">, Intrinsic <[llvm_double_ty], [llvm_f128_ty], [IntrNoMem]>; def int_ppc_sqrtf128_round_to_odd - : GCCBuiltin<"__builtin_sqrtf128_round_to_odd">, + : ClangBuiltin<"__builtin_sqrtf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty], [IntrNoMem]>; def int_ppc_addf128_round_to_odd - : GCCBuiltin<"__builtin_addf128_round_to_odd">, + : ClangBuiltin<"__builtin_addf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>; def int_ppc_subf128_round_to_odd - : GCCBuiltin<"__builtin_subf128_round_to_odd">, + : ClangBuiltin<"__builtin_subf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>; def int_ppc_mulf128_round_to_odd - : GCCBuiltin<"__builtin_mulf128_round_to_odd">, + : ClangBuiltin<"__builtin_mulf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>; def int_ppc_divf128_round_to_odd - : GCCBuiltin<"__builtin_divf128_round_to_odd">, + : ClangBuiltin<"__builtin_divf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>; def int_ppc_fmaf128_round_to_odd - : GCCBuiltin<"__builtin_fmaf128_round_to_odd">, + : ClangBuiltin<"__builtin_fmaf128_round_to_odd">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>; def int_ppc_scalar_extract_expq - : GCCBuiltin<"__builtin_vsx_scalar_extract_expq">, + : ClangBuiltin<"__builtin_vsx_scalar_extract_expq">, Intrinsic <[llvm_i64_ty], [llvm_f128_ty], [IntrNoMem]>; def int_ppc_scalar_insert_exp_qp - : GCCBuiltin<"__builtin_vsx_scalar_insert_exp_qp">, + : ClangBuiltin<"__builtin_vsx_scalar_insert_exp_qp">, Intrinsic <[llvm_f128_ty], [llvm_f128_ty, llvm_i64_ty], [IntrNoMem]>; // Intrinsics defined to maintain XL compatibility def int_ppc_tdw - : GCCBuiltin<"__builtin_ppc_tdw">, + : ClangBuiltin<"__builtin_ppc_tdw">, Intrinsic <[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [ImmArg>]>; def int_ppc_tw - : GCCBuiltin<"__builtin_ppc_tw">, + : ClangBuiltin<"__builtin_ppc_tw">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>]>; def int_ppc_trapd - : GCCBuiltin<"__builtin_ppc_trapd">, + : ClangBuiltin<"__builtin_ppc_trapd">, Intrinsic <[], [llvm_i64_ty], []>; def int_ppc_trap - : GCCBuiltin<"__builtin_ppc_trap">, + : ClangBuiltin<"__builtin_ppc_trap">, Intrinsic <[], [llvm_i32_ty], []>; def int_ppc_fcfid - : GCCBuiltin<"__builtin_ppc_fcfid">, + : ClangBuiltin<"__builtin_ppc_fcfid">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fcfud - : GCCBuiltin<"__builtin_ppc_fcfud">, + : ClangBuiltin<"__builtin_ppc_fcfud">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctid - : GCCBuiltin<"__builtin_ppc_fctid">, + : ClangBuiltin<"__builtin_ppc_fctid">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctidz - : GCCBuiltin<"__builtin_ppc_fctidz">, + : ClangBuiltin<"__builtin_ppc_fctidz">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctiw - : GCCBuiltin<"__builtin_ppc_fctiw">, + : ClangBuiltin<"__builtin_ppc_fctiw">, Intrinsic 
<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctiwz - : GCCBuiltin<"__builtin_ppc_fctiwz">, + : ClangBuiltin<"__builtin_ppc_fctiwz">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctudz - : GCCBuiltin<"__builtin_ppc_fctudz">, + : ClangBuiltin<"__builtin_ppc_fctudz">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fctuwz - : GCCBuiltin<"__builtin_ppc_fctuwz">, + : ClangBuiltin<"__builtin_ppc_fctuwz">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + + // XL compatible select functions + // TODO: Add llvm_f128_ty support. + def int_ppc_maxfe + : Intrinsic< + [llvm_ppcf128_ty], + [llvm_ppcf128_ty, llvm_ppcf128_ty, llvm_ppcf128_ty, llvm_vararg_ty], + [IntrNoMem]>; + def int_ppc_maxfl + : Intrinsic< + [llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_vararg_ty], + [IntrNoMem]>; + def int_ppc_maxfs + : Intrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_vararg_ty], + [IntrNoMem]>; + def int_ppc_minfe + : Intrinsic< + [llvm_ppcf128_ty], + [llvm_ppcf128_ty, llvm_ppcf128_ty, llvm_ppcf128_ty, llvm_vararg_ty], + [IntrNoMem]>; + def int_ppc_minfl + : Intrinsic< + [llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_vararg_ty], + [IntrNoMem]>; + def int_ppc_minfs + : Intrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_vararg_ty], + [IntrNoMem]>; } let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". @@ -167,14 +201,14 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". class PowerPC_Vec_Intrinsic ret_types, list param_types, list properties> - : GCCBuiltin, + : ClangBuiltin, Intrinsic; /// PowerPC_VSX_Intrinsic - Base class for all VSX intrinsics. class PowerPC_VSX_Intrinsic ret_types, list param_types, list properties> - : GCCBuiltin, + : ClangBuiltin, Intrinsic; } @@ -289,31 +323,31 @@ class PowerPC_VSX_Sca_DDD_Intrinsic let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". // Data Stream Control. - def int_ppc_altivec_dss : GCCBuiltin<"__builtin_altivec_dss">, + def int_ppc_altivec_dss : ClangBuiltin<"__builtin_altivec_dss">, Intrinsic<[], [llvm_i32_ty], []>; - def int_ppc_altivec_dssall : GCCBuiltin<"__builtin_altivec_dssall">, + def int_ppc_altivec_dssall : ClangBuiltin<"__builtin_altivec_dssall">, Intrinsic<[], [], []>; - def int_ppc_altivec_dst : GCCBuiltin<"__builtin_altivec_dst">, + def int_ppc_altivec_dst : ClangBuiltin<"__builtin_altivec_dst">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_ppc_altivec_dstt : GCCBuiltin<"__builtin_altivec_dstt">, + def int_ppc_altivec_dstt : ClangBuiltin<"__builtin_altivec_dstt">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_ppc_altivec_dstst : GCCBuiltin<"__builtin_altivec_dstst">, + def int_ppc_altivec_dstst : ClangBuiltin<"__builtin_altivec_dstst">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_ppc_altivec_dststt : GCCBuiltin<"__builtin_altivec_dststt">, + def int_ppc_altivec_dststt : ClangBuiltin<"__builtin_altivec_dststt">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; // VSCR access. 
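The fcfid/fcti* block above models XL's direct access to the FP convert instructions: each takes and returns llvm_double_ty because, as in the hardware, the converted integer is produced in a floating-point register. A sketch of pulling the integer back out, assuming clang targeting powerpc64le (per the ISA, the converted word sits in the low 32 bits of the result's bit pattern):

  #include <stdint.h>
  #include <string.h>

  /* fctiwz: double -> int32 rounding toward zero, with the result left
     in the low word of an FPR image. */
  int32_t trunc_to_i32(double x) {
    double d = __builtin_ppc_fctiwz(x);
    uint64_t bits;
    memcpy(&bits, &d, sizeof bits);
    return (int32_t)(uint32_t)bits;
  }

The new maxfe/maxfl/maxfs and minfe/minfl/minfs definitions at the end of that hunk model XL's variadic min/max select functions; note that they carry no ClangBuiltin<> mapping in this file, so clang would have to reach them through custom lowering rather than a direct name match.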
- def int_ppc_altivec_mfvscr : GCCBuiltin<"__builtin_altivec_mfvscr">, + def int_ppc_altivec_mfvscr : ClangBuiltin<"__builtin_altivec_mfvscr">, Intrinsic<[llvm_v8i16_ty], [], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_mtvscr : GCCBuiltin<"__builtin_altivec_mtvscr">, + def int_ppc_altivec_mtvscr : ClangBuiltin<"__builtin_altivec_mtvscr">, Intrinsic<[], [llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; @@ -349,354 +383,354 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". [IntrWriteMem, IntrArgMemOnly]>; // Comparisons setting a vector. - def int_ppc_altivec_vcmpbfp : GCCBuiltin<"__builtin_altivec_vcmpbfp">, + def int_ppc_altivec_vcmpbfp : ClangBuiltin<"__builtin_altivec_vcmpbfp">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpeqfp : GCCBuiltin<"__builtin_altivec_vcmpeqfp">, + def int_ppc_altivec_vcmpeqfp : ClangBuiltin<"__builtin_altivec_vcmpeqfp">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgefp : GCCBuiltin<"__builtin_altivec_vcmpgefp">, + def int_ppc_altivec_vcmpgefp : ClangBuiltin<"__builtin_altivec_vcmpgefp">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtfp : GCCBuiltin<"__builtin_altivec_vcmpgtfp">, + def int_ppc_altivec_vcmpgtfp : ClangBuiltin<"__builtin_altivec_vcmpgtfp">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequd : GCCBuiltin<"__builtin_altivec_vcmpequd">, + def int_ppc_altivec_vcmpequd : ClangBuiltin<"__builtin_altivec_vcmpequd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsd : GCCBuiltin<"__builtin_altivec_vcmpgtsd">, + def int_ppc_altivec_vcmpgtsd : ClangBuiltin<"__builtin_altivec_vcmpgtsd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtud : GCCBuiltin<"__builtin_altivec_vcmpgtud">, + def int_ppc_altivec_vcmpgtud : ClangBuiltin<"__builtin_altivec_vcmpgtud">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequw : GCCBuiltin<"__builtin_altivec_vcmpequw">, + def int_ppc_altivec_vcmpequw : ClangBuiltin<"__builtin_altivec_vcmpequw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsw : GCCBuiltin<"__builtin_altivec_vcmpgtsw">, + def int_ppc_altivec_vcmpgtsw : ClangBuiltin<"__builtin_altivec_vcmpgtsw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuw : GCCBuiltin<"__builtin_altivec_vcmpgtuw">, + def int_ppc_altivec_vcmpgtuw : ClangBuiltin<"__builtin_altivec_vcmpgtuw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnew : GCCBuiltin<"__builtin_altivec_vcmpnew">, + def int_ppc_altivec_vcmpnew : ClangBuiltin<"__builtin_altivec_vcmpnew">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezw : GCCBuiltin<"__builtin_altivec_vcmpnezw">, + def int_ppc_altivec_vcmpnezw : ClangBuiltin<"__builtin_altivec_vcmpnezw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequh : GCCBuiltin<"__builtin_altivec_vcmpequh">, + def int_ppc_altivec_vcmpequh : ClangBuiltin<"__builtin_altivec_vcmpequh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsh : 
GCCBuiltin<"__builtin_altivec_vcmpgtsh">, + def int_ppc_altivec_vcmpgtsh : ClangBuiltin<"__builtin_altivec_vcmpgtsh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuh : GCCBuiltin<"__builtin_altivec_vcmpgtuh">, + def int_ppc_altivec_vcmpgtuh : ClangBuiltin<"__builtin_altivec_vcmpgtuh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpneh : GCCBuiltin<"__builtin_altivec_vcmpneh">, + def int_ppc_altivec_vcmpneh : ClangBuiltin<"__builtin_altivec_vcmpneh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezh : GCCBuiltin<"__builtin_altivec_vcmpnezh">, + def int_ppc_altivec_vcmpnezh : ClangBuiltin<"__builtin_altivec_vcmpnezh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequb : GCCBuiltin<"__builtin_altivec_vcmpequb">, + def int_ppc_altivec_vcmpequb : ClangBuiltin<"__builtin_altivec_vcmpequb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsb : GCCBuiltin<"__builtin_altivec_vcmpgtsb">, + def int_ppc_altivec_vcmpgtsb : ClangBuiltin<"__builtin_altivec_vcmpgtsb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtub : GCCBuiltin<"__builtin_altivec_vcmpgtub">, + def int_ppc_altivec_vcmpgtub : ClangBuiltin<"__builtin_altivec_vcmpgtub">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpneb : GCCBuiltin<"__builtin_altivec_vcmpneb">, + def int_ppc_altivec_vcmpneb : ClangBuiltin<"__builtin_altivec_vcmpneb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezb : GCCBuiltin<"__builtin_altivec_vcmpnezb">, + def int_ppc_altivec_vcmpnezb : ClangBuiltin<"__builtin_altivec_vcmpnezb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequq : GCCBuiltin<"__builtin_altivec_vcmpequq">, + def int_ppc_altivec_vcmpequq : ClangBuiltin<"__builtin_altivec_vcmpequq">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsq : GCCBuiltin<"__builtin_altivec_vcmpgtsq">, + def int_ppc_altivec_vcmpgtsq : ClangBuiltin<"__builtin_altivec_vcmpgtsq">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuq : GCCBuiltin<"__builtin_altivec_vcmpgtuq">, + def int_ppc_altivec_vcmpgtuq : ClangBuiltin<"__builtin_altivec_vcmpgtuq">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequq_p : GCCBuiltin<"__builtin_altivec_vcmpequq_p">, + def int_ppc_altivec_vcmpequq_p : ClangBuiltin<"__builtin_altivec_vcmpequq_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty,llvm_v1i128_ty,llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsq_p : GCCBuiltin<"__builtin_altivec_vcmpgtsq_p">, + def int_ppc_altivec_vcmpgtsq_p : ClangBuiltin<"__builtin_altivec_vcmpgtsq_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty,llvm_v1i128_ty,llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuq_p : GCCBuiltin<"__builtin_altivec_vcmpgtuq_p">, + def int_ppc_altivec_vcmpgtuq_p : ClangBuiltin<"__builtin_altivec_vcmpgtuq_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty,llvm_v1i128_ty,llvm_v1i128_ty], [IntrNoMem]>; // Predicate Comparisons. The first operand specifies interpretation of CR6. 
- def int_ppc_altivec_vcmpbfp_p : GCCBuiltin<"__builtin_altivec_vcmpbfp_p">, + def int_ppc_altivec_vcmpbfp_p : ClangBuiltin<"__builtin_altivec_vcmpbfp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpeqfp_p : GCCBuiltin<"__builtin_altivec_vcmpeqfp_p">, + def int_ppc_altivec_vcmpeqfp_p : ClangBuiltin<"__builtin_altivec_vcmpeqfp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgefp_p : GCCBuiltin<"__builtin_altivec_vcmpgefp_p">, + def int_ppc_altivec_vcmpgefp_p : ClangBuiltin<"__builtin_altivec_vcmpgefp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtfp_p : GCCBuiltin<"__builtin_altivec_vcmpgtfp_p">, + def int_ppc_altivec_vcmpgtfp_p : ClangBuiltin<"__builtin_altivec_vcmpgtfp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequd_p : GCCBuiltin<"__builtin_altivec_vcmpequd_p">, + def int_ppc_altivec_vcmpequd_p : ClangBuiltin<"__builtin_altivec_vcmpequd_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsd_p : GCCBuiltin<"__builtin_altivec_vcmpgtsd_p">, + def int_ppc_altivec_vcmpgtsd_p : ClangBuiltin<"__builtin_altivec_vcmpgtsd_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtud_p : GCCBuiltin<"__builtin_altivec_vcmpgtud_p">, + def int_ppc_altivec_vcmpgtud_p : ClangBuiltin<"__builtin_altivec_vcmpgtud_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequw_p : GCCBuiltin<"__builtin_altivec_vcmpequw_p">, + def int_ppc_altivec_vcmpequw_p : ClangBuiltin<"__builtin_altivec_vcmpequw_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsw_p : GCCBuiltin<"__builtin_altivec_vcmpgtsw_p">, + def int_ppc_altivec_vcmpgtsw_p : ClangBuiltin<"__builtin_altivec_vcmpgtsw_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuw_p : GCCBuiltin<"__builtin_altivec_vcmpgtuw_p">, + def int_ppc_altivec_vcmpgtuw_p : ClangBuiltin<"__builtin_altivec_vcmpgtuw_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnew_p : GCCBuiltin<"__builtin_altivec_vcmpnew_p">, + def int_ppc_altivec_vcmpnew_p : ClangBuiltin<"__builtin_altivec_vcmpnew_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezw_p : GCCBuiltin<"__builtin_altivec_vcmpnezw_p">, + def int_ppc_altivec_vcmpnezw_p : ClangBuiltin<"__builtin_altivec_vcmpnezw_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequh_p : GCCBuiltin<"__builtin_altivec_vcmpequh_p">, + def int_ppc_altivec_vcmpequh_p : ClangBuiltin<"__builtin_altivec_vcmpequh_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsh_p : GCCBuiltin<"__builtin_altivec_vcmpgtsh_p">, + def int_ppc_altivec_vcmpgtsh_p : ClangBuiltin<"__builtin_altivec_vcmpgtsh_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtuh_p : GCCBuiltin<"__builtin_altivec_vcmpgtuh_p">, + def int_ppc_altivec_vcmpgtuh_p : ClangBuiltin<"__builtin_altivec_vcmpgtuh_p">, 
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpneh_p : GCCBuiltin<"__builtin_altivec_vcmpneh_p">, + def int_ppc_altivec_vcmpneh_p : ClangBuiltin<"__builtin_altivec_vcmpneh_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezh_p : GCCBuiltin<"__builtin_altivec_vcmpnezh_p">, + def int_ppc_altivec_vcmpnezh_p : ClangBuiltin<"__builtin_altivec_vcmpnezh_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpequb_p : GCCBuiltin<"__builtin_altivec_vcmpequb_p">, + def int_ppc_altivec_vcmpequb_p : ClangBuiltin<"__builtin_altivec_vcmpequb_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtsb_p : GCCBuiltin<"__builtin_altivec_vcmpgtsb_p">, + def int_ppc_altivec_vcmpgtsb_p : ClangBuiltin<"__builtin_altivec_vcmpgtsb_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpgtub_p : GCCBuiltin<"__builtin_altivec_vcmpgtub_p">, + def int_ppc_altivec_vcmpgtub_p : ClangBuiltin<"__builtin_altivec_vcmpgtub_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpneb_p : GCCBuiltin<"__builtin_altivec_vcmpneb_p">, + def int_ppc_altivec_vcmpneb_p : ClangBuiltin<"__builtin_altivec_vcmpneb_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vcmpnezb_p : GCCBuiltin<"__builtin_altivec_vcmpnezb_p">, + def int_ppc_altivec_vcmpnezb_p : ClangBuiltin<"__builtin_altivec_vcmpnezb_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vclzlsbb : GCCBuiltin<"__builtin_altivec_vclzlsbb">, + def int_ppc_altivec_vclzlsbb : ClangBuiltin<"__builtin_altivec_vclzlsbb">, Intrinsic<[llvm_i32_ty],[llvm_v16i8_ty],[IntrNoMem]>; - def int_ppc_altivec_vctzlsbb : GCCBuiltin<"__builtin_altivec_vctzlsbb">, + def int_ppc_altivec_vctzlsbb : ClangBuiltin<"__builtin_altivec_vctzlsbb">, Intrinsic<[llvm_i32_ty],[llvm_v16i8_ty],[IntrNoMem]>; - def int_ppc_altivec_vprtybw : GCCBuiltin<"__builtin_altivec_vprtybw">, + def int_ppc_altivec_vprtybw : ClangBuiltin<"__builtin_altivec_vprtybw">, Intrinsic<[llvm_v4i32_ty],[llvm_v4i32_ty],[IntrNoMem]>; - def int_ppc_altivec_vprtybd : GCCBuiltin<"__builtin_altivec_vprtybd">, + def int_ppc_altivec_vprtybd : ClangBuiltin<"__builtin_altivec_vprtybd">, Intrinsic<[llvm_v2i64_ty],[llvm_v2i64_ty],[IntrNoMem]>; - def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">, + def int_ppc_altivec_vprtybq : ClangBuiltin<"__builtin_altivec_vprtybq">, Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>; // BCD intrinsics. 
- def int_ppc_bcdadd : GCCBuiltin<"__builtin_ppc_bcdadd">, Intrinsic< + def int_ppc_bcdadd : ClangBuiltin<"__builtin_ppc_bcdadd">, Intrinsic< [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_bcdadd_p : GCCBuiltin<"__builtin_ppc_bcdadd_p">, Intrinsic< + def int_ppc_bcdadd_p : ClangBuiltin<"__builtin_ppc_bcdadd_p">, Intrinsic< [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_bcdsub : GCCBuiltin<"__builtin_ppc_bcdsub">, Intrinsic< + def int_ppc_bcdsub : ClangBuiltin<"__builtin_ppc_bcdsub">, Intrinsic< [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_bcdsub_p : GCCBuiltin<"__builtin_ppc_bcdsub_p">, Intrinsic< + def int_ppc_bcdsub_p : ClangBuiltin<"__builtin_ppc_bcdsub_p">, Intrinsic< [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, ImmArg>]>; // P10 Vector Extract with Mask - def int_ppc_altivec_vextractbm : GCCBuiltin<"__builtin_altivec_vextractbm">, + def int_ppc_altivec_vextractbm : ClangBuiltin<"__builtin_altivec_vextractbm">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vextracthm : GCCBuiltin<"__builtin_altivec_vextracthm">, + def int_ppc_altivec_vextracthm : ClangBuiltin<"__builtin_altivec_vextracthm">, Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vextractwm : GCCBuiltin<"__builtin_altivec_vextractwm">, + def int_ppc_altivec_vextractwm : ClangBuiltin<"__builtin_altivec_vextractwm">, Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextractdm : GCCBuiltin<"__builtin_altivec_vextractdm">, + def int_ppc_altivec_vextractdm : ClangBuiltin<"__builtin_altivec_vextractdm">, Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vextractqm : GCCBuiltin<"__builtin_altivec_vextractqm">, + def int_ppc_altivec_vextractqm : ClangBuiltin<"__builtin_altivec_vextractqm">, Intrinsic<[llvm_i32_ty], [llvm_v1i128_ty], [IntrNoMem]>; // P10 Vector Expand with Mask - def int_ppc_altivec_vexpandbm : GCCBuiltin<"__builtin_altivec_vexpandbm">, + def int_ppc_altivec_vexpandbm : ClangBuiltin<"__builtin_altivec_vexpandbm">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vexpandhm : GCCBuiltin<"__builtin_altivec_vexpandhm">, + def int_ppc_altivec_vexpandhm : ClangBuiltin<"__builtin_altivec_vexpandhm">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vexpandwm : GCCBuiltin<"__builtin_altivec_vexpandwm">, + def int_ppc_altivec_vexpandwm : ClangBuiltin<"__builtin_altivec_vexpandwm">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vexpanddm : GCCBuiltin<"__builtin_altivec_vexpanddm">, + def int_ppc_altivec_vexpanddm : ClangBuiltin<"__builtin_altivec_vexpanddm">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vexpandqm : GCCBuiltin<"__builtin_altivec_vexpandqm">, + def int_ppc_altivec_vexpandqm : ClangBuiltin<"__builtin_altivec_vexpandqm">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty], [IntrNoMem]>; // P10 Vector Count with Mask intrinsics. 
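The P10 Extract-with-Mask family above gathers the most-significant bit of every element into a scalar mask, and Expand-with-Mask is its inverse; altivec.h wraps them as vec_extractm and vec_expandm. A sketch, assuming clang with -mcpu=pwr10:

  #include <altivec.h>

  /* Lowers to vextractbm (int_ppc_altivec_vextractbm above): bit i of
     the result is the most-significant bit of byte element i. */
  unsigned int byte_msb_mask(vector unsigned char v) {
    return vec_extractm(v);
  }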
- def int_ppc_altivec_vcntmbb : GCCBuiltin<"__builtin_altivec_vcntmbb">, + def int_ppc_altivec_vcntmbb : ClangBuiltin<"__builtin_altivec_vcntmbb">, Intrinsic<[llvm_i64_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vcntmbh : GCCBuiltin<"__builtin_altivec_vcntmbh">, + def int_ppc_altivec_vcntmbh : ClangBuiltin<"__builtin_altivec_vcntmbh">, Intrinsic<[llvm_i64_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vcntmbw : GCCBuiltin<"__builtin_altivec_vcntmbw">, + def int_ppc_altivec_vcntmbw : ClangBuiltin<"__builtin_altivec_vcntmbw">, Intrinsic<[llvm_i64_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vcntmbd : GCCBuiltin<"__builtin_altivec_vcntmbd">, + def int_ppc_altivec_vcntmbd : ClangBuiltin<"__builtin_altivec_vcntmbd">, Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; // P10 Move to VSR with Mask Intrinsics. - def int_ppc_altivec_mtvsrbm : GCCBuiltin<"__builtin_altivec_mtvsrbm">, + def int_ppc_altivec_mtvsrbm : ClangBuiltin<"__builtin_altivec_mtvsrbm">, Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_mtvsrhm : GCCBuiltin<"__builtin_altivec_mtvsrhm">, + def int_ppc_altivec_mtvsrhm : ClangBuiltin<"__builtin_altivec_mtvsrhm">, Intrinsic<[llvm_v8i16_ty], [llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_mtvsrwm : GCCBuiltin<"__builtin_altivec_mtvsrwm">, + def int_ppc_altivec_mtvsrwm : ClangBuiltin<"__builtin_altivec_mtvsrwm">, Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_mtvsrdm : GCCBuiltin<"__builtin_altivec_mtvsrdm">, + def int_ppc_altivec_mtvsrdm : ClangBuiltin<"__builtin_altivec_mtvsrdm">, Intrinsic<[llvm_v2i64_ty], [llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_mtvsrqm : GCCBuiltin<"__builtin_altivec_mtvsrqm">, + def int_ppc_altivec_mtvsrqm : ClangBuiltin<"__builtin_altivec_mtvsrqm">, Intrinsic<[llvm_v1i128_ty], [llvm_i64_ty], [IntrNoMem]>; // P10 Vector Parallel Bits Deposit/Extract Doubleword Builtins. - def int_ppc_altivec_vpdepd : GCCBuiltin<"__builtin_altivec_vpdepd">, + def int_ppc_altivec_vpdepd : ClangBuiltin<"__builtin_altivec_vpdepd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vpextd : GCCBuiltin<"__builtin_altivec_vpextd">, + def int_ppc_altivec_vpextd : ClangBuiltin<"__builtin_altivec_vpextd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // P10 Vector String Isolate Intrinsics. - def int_ppc_altivec_vstribr : GCCBuiltin<"__builtin_altivec_vstribr">, + def int_ppc_altivec_vstribr : ClangBuiltin<"__builtin_altivec_vstribr">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vstribl : GCCBuiltin<"__builtin_altivec_vstribl">, + def int_ppc_altivec_vstribl : ClangBuiltin<"__builtin_altivec_vstribl">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vstrihr : GCCBuiltin<"__builtin_altivec_vstrihr">, + def int_ppc_altivec_vstrihr : ClangBuiltin<"__builtin_altivec_vstrihr">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vstrihl : GCCBuiltin<"__builtin_altivec_vstrihl">, + def int_ppc_altivec_vstrihl : ClangBuiltin<"__builtin_altivec_vstrihl">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; // Predicate Intrinsics: The first operand specifies interpretation of CR6. 
- def int_ppc_altivec_vstribr_p : GCCBuiltin<"__builtin_altivec_vstribr_p">, + def int_ppc_altivec_vstribr_p : ClangBuiltin<"__builtin_altivec_vstribr_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vstribl_p : GCCBuiltin<"__builtin_altivec_vstribl_p">, + def int_ppc_altivec_vstribl_p : ClangBuiltin<"__builtin_altivec_vstribl_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vstrihr_p : GCCBuiltin<"__builtin_altivec_vstrihr_p">, + def int_ppc_altivec_vstrihr_p : ClangBuiltin<"__builtin_altivec_vstrihr_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vstrihl_p : GCCBuiltin<"__builtin_altivec_vstrihl_p">, + def int_ppc_altivec_vstrihl_p : ClangBuiltin<"__builtin_altivec_vstrihl_p">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_v8i16_ty], [IntrNoMem]>; // P10 Vector Centrifuge Builtin. - def int_ppc_altivec_vcfuged : GCCBuiltin<"__builtin_altivec_vcfuged">, + def int_ppc_altivec_vcfuged : ClangBuiltin<"__builtin_altivec_vcfuged">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // P10 Vector Gather Every Nth Bit Builtin. - def int_ppc_altivec_vgnb : GCCBuiltin<"__builtin_altivec_vgnb">, + def int_ppc_altivec_vgnb : ClangBuiltin<"__builtin_altivec_vgnb">, Intrinsic<[llvm_i64_ty], [llvm_v1i128_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; // P10 Vector Clear Bytes - def int_ppc_altivec_vclrlb : GCCBuiltin<"__builtin_altivec_vclrlb">, + def int_ppc_altivec_vclrlb : ClangBuiltin<"__builtin_altivec_vclrlb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vclrrb : GCCBuiltin<"__builtin_altivec_vclrrb">, + def int_ppc_altivec_vclrrb : ClangBuiltin<"__builtin_altivec_vclrrb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; // P10 Vector Shift Double Bit Immediate. - def int_ppc_altivec_vsldbi : GCCBuiltin<"__builtin_altivec_vsldbi">, + def int_ppc_altivec_vsldbi : ClangBuiltin<"__builtin_altivec_vsldbi">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vsrdbi : GCCBuiltin<"__builtin_altivec_vsrdbi">, + def int_ppc_altivec_vsrdbi : ClangBuiltin<"__builtin_altivec_vsrdbi">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; // P10 Vector Insert. 
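(An aside before the vector-insert group: several of the definitions above, vgnb and vsldbi/vsrdbi among them, attach an ImmArg attribute to their trailing llvm_i32_ty operand, which tells the IR verifier and the backend that the argument must be a compile-time constant, matching the bit field encoded directly in the instruction. Written out in full, the attribute names the operand index; the sketch below uses the upstream ImmArg<ArgIndex<...>> spelling and a hypothetical intrinsic name:

  // The third operand (index 2) must be an immediate; the verifier
  // rejects calls that pass it a runtime value.
  def int_ppc_example_imm : ClangBuiltin<"__builtin_ppc_example_imm">,
      Intrinsic<[llvm_v16i8_ty],
                [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
)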
- def int_ppc_altivec_vinsblx : GCCBuiltin<"__builtin_altivec_vinsblx">, + def int_ppc_altivec_vinsblx : ClangBuiltin<"__builtin_altivec_vinsblx">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinsbrx : GCCBuiltin<"__builtin_altivec_vinsbrx">, + def int_ppc_altivec_vinsbrx : ClangBuiltin<"__builtin_altivec_vinsbrx">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinshlx : GCCBuiltin<"__builtin_altivec_vinshlx">, + def int_ppc_altivec_vinshlx : ClangBuiltin<"__builtin_altivec_vinshlx">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinshrx : GCCBuiltin<"__builtin_altivec_vinshrx">, + def int_ppc_altivec_vinshrx : ClangBuiltin<"__builtin_altivec_vinshrx">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinswlx : GCCBuiltin<"__builtin_altivec_vinswlx">, + def int_ppc_altivec_vinswlx : ClangBuiltin<"__builtin_altivec_vinswlx">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinswrx : GCCBuiltin<"__builtin_altivec_vinswrx">, + def int_ppc_altivec_vinswrx : ClangBuiltin<"__builtin_altivec_vinswrx">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinsdlx : GCCBuiltin<"__builtin_altivec_vinsdlx">, + def int_ppc_altivec_vinsdlx : ClangBuiltin<"__builtin_altivec_vinsdlx">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vinsdrx : GCCBuiltin<"__builtin_altivec_vinsdrx">, + def int_ppc_altivec_vinsdrx : ClangBuiltin<"__builtin_altivec_vinsdrx">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vinsbvlx : GCCBuiltin<"__builtin_altivec_vinsbvlx">, + def int_ppc_altivec_vinsbvlx : ClangBuiltin<"__builtin_altivec_vinsbvlx">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vinsbvrx : GCCBuiltin<"__builtin_altivec_vinsbvrx">, + def int_ppc_altivec_vinsbvrx : ClangBuiltin<"__builtin_altivec_vinsbvrx">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vinshvlx : GCCBuiltin<"__builtin_altivec_vinshvlx">, + def int_ppc_altivec_vinshvlx : ClangBuiltin<"__builtin_altivec_vinshvlx">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vinshvrx : GCCBuiltin<"__builtin_altivec_vinshvrx">, + def int_ppc_altivec_vinshvrx : ClangBuiltin<"__builtin_altivec_vinshvrx">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vinswvlx : GCCBuiltin<"__builtin_altivec_vinswvlx">, + def int_ppc_altivec_vinswvlx : ClangBuiltin<"__builtin_altivec_vinswvlx">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vinswvrx : GCCBuiltin<"__builtin_altivec_vinswvrx">, + def int_ppc_altivec_vinswvrx : ClangBuiltin<"__builtin_altivec_vinswvrx">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; @@ -710,35 +744,35 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". [llvm_v2i64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; // P10 Vector Extract. 
- def int_ppc_altivec_vextdubvlx : GCCBuiltin<"__builtin_altivec_vextdubvlx">, + def int_ppc_altivec_vextdubvlx : ClangBuiltin<"__builtin_altivec_vextdubvlx">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextdubvrx : GCCBuiltin<"__builtin_altivec_vextdubvrx">, + def int_ppc_altivec_vextdubvrx : ClangBuiltin<"__builtin_altivec_vextdubvrx">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextduhvlx : GCCBuiltin<"__builtin_altivec_vextduhvlx">, + def int_ppc_altivec_vextduhvlx : ClangBuiltin<"__builtin_altivec_vextduhvlx">, Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextduhvrx : GCCBuiltin<"__builtin_altivec_vextduhvrx">, + def int_ppc_altivec_vextduhvrx : ClangBuiltin<"__builtin_altivec_vextduhvrx">, Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextduwvlx : GCCBuiltin<"__builtin_altivec_vextduwvlx">, + def int_ppc_altivec_vextduwvlx : ClangBuiltin<"__builtin_altivec_vextduwvlx">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextduwvrx : GCCBuiltin<"__builtin_altivec_vextduwvrx">, + def int_ppc_altivec_vextduwvrx : ClangBuiltin<"__builtin_altivec_vextduwvrx">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextddvlx : GCCBuiltin<"__builtin_altivec_vextddvlx">, + def int_ppc_altivec_vextddvlx : ClangBuiltin<"__builtin_altivec_vextddvlx">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextddvrx : GCCBuiltin<"__builtin_altivec_vextddvrx">, + def int_ppc_altivec_vextddvrx : ClangBuiltin<"__builtin_altivec_vextddvrx">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; @@ -796,229 +830,229 @@ def int_ppc_altivec_vsubcuq : PowerPC_Vec_QQQ_Intrinsic<"vsubcuq">; let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". // Saturating multiply-adds. - def int_ppc_altivec_vmhaddshs : GCCBuiltin<"__builtin_altivec_vmhaddshs">, + def int_ppc_altivec_vmhaddshs : ClangBuiltin<"__builtin_altivec_vmhaddshs">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vmhraddshs : GCCBuiltin<"__builtin_altivec_vmhraddshs">, + def int_ppc_altivec_vmhraddshs : ClangBuiltin<"__builtin_altivec_vmhraddshs">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vmaddfp : GCCBuiltin<"__builtin_altivec_vmaddfp">, + def int_ppc_altivec_vmaddfp : ClangBuiltin<"__builtin_altivec_vmaddfp">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vnmsubfp : GCCBuiltin<"__builtin_altivec_vnmsubfp">, + def int_ppc_altivec_vnmsubfp : ClangBuiltin<"__builtin_altivec_vnmsubfp">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; // Vector Multiply Sum Instructions. 
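(Before the multiply-sum group, a note on the attribute pairing above: the saturating definitions vmhaddshs and vmhraddshs combine IntrNoMem with IntrHasSideEffects. The conventional reading, hedged here since the file does not spell it out, is that these operations touch no memory but may set the saturation bit in the VSCR, so IntrHasSideEffects is needed to stop the optimizer from deleting or speculating them as pure computations. A sketch with a hypothetical name:

  // No memory access, but the instruction may update VSCR[SAT];
  // IntrHasSideEffects keeps it from being treated as removable.
  def int_ppc_example_sat : ClangBuiltin<"__builtin_ppc_example_sat">,
      Intrinsic<[llvm_v8i16_ty],
                [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                [IntrNoMem, IntrHasSideEffects]>;
)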
- def int_ppc_altivec_vmsummbm : GCCBuiltin<"__builtin_altivec_vmsummbm">, + def int_ppc_altivec_vmsummbm : ClangBuiltin<"__builtin_altivec_vmsummbm">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vmsumshm : GCCBuiltin<"__builtin_altivec_vmsumshm">, + def int_ppc_altivec_vmsumshm : ClangBuiltin<"__builtin_altivec_vmsumshm">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vmsumshs : GCCBuiltin<"__builtin_altivec_vmsumshs">, + def int_ppc_altivec_vmsumshs : ClangBuiltin<"__builtin_altivec_vmsumshs">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vmsumubm : GCCBuiltin<"__builtin_altivec_vmsumubm">, + def int_ppc_altivec_vmsumubm : ClangBuiltin<"__builtin_altivec_vmsumubm">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vmsumuhm : GCCBuiltin<"__builtin_altivec_vmsumuhm">, + def int_ppc_altivec_vmsumuhm : ClangBuiltin<"__builtin_altivec_vmsumuhm">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vmsumudm : GCCBuiltin<"__builtin_altivec_vmsumudm">, + def int_ppc_altivec_vmsumudm : ClangBuiltin<"__builtin_altivec_vmsumudm">, Intrinsic<[llvm_v1i128_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vmsumuhs : GCCBuiltin<"__builtin_altivec_vmsumuhs">, + def int_ppc_altivec_vmsumuhs : ClangBuiltin<"__builtin_altivec_vmsumuhs">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vmsumcud : GCCBuiltin<"__builtin_altivec_vmsumcud">, + def int_ppc_altivec_vmsumcud : ClangBuiltin<"__builtin_altivec_vmsumcud">, Intrinsic<[llvm_v1i128_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v1i128_ty], [IntrNoMem]>; // Vector Multiply Instructions. 
- def int_ppc_altivec_vmulesb : GCCBuiltin<"__builtin_altivec_vmulesb">, + def int_ppc_altivec_vmulesb : ClangBuiltin<"__builtin_altivec_vmulesb">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulesh : GCCBuiltin<"__builtin_altivec_vmulesh">, + def int_ppc_altivec_vmulesh : ClangBuiltin<"__builtin_altivec_vmulesh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulesw : GCCBuiltin<"__builtin_altivec_vmulesw">, + def int_ppc_altivec_vmulesw : ClangBuiltin<"__builtin_altivec_vmulesw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vmulesd : PowerPC_Vec_QDD_Intrinsic<"vmulesd">; - def int_ppc_altivec_vmuleub : GCCBuiltin<"__builtin_altivec_vmuleub">, + def int_ppc_altivec_vmuleub : ClangBuiltin<"__builtin_altivec_vmuleub">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vmuleuh : GCCBuiltin<"__builtin_altivec_vmuleuh">, + def int_ppc_altivec_vmuleuh : ClangBuiltin<"__builtin_altivec_vmuleuh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vmuleuw : GCCBuiltin<"__builtin_altivec_vmuleuw">, + def int_ppc_altivec_vmuleuw : ClangBuiltin<"__builtin_altivec_vmuleuw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vmuleud : PowerPC_Vec_QDD_Intrinsic<"vmuleud">; - def int_ppc_altivec_vmulosb : GCCBuiltin<"__builtin_altivec_vmulosb">, + def int_ppc_altivec_vmulosb : ClangBuiltin<"__builtin_altivec_vmulosb">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulosh : GCCBuiltin<"__builtin_altivec_vmulosh">, + def int_ppc_altivec_vmulosh : ClangBuiltin<"__builtin_altivec_vmulosh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulosw : GCCBuiltin<"__builtin_altivec_vmulosw">, + def int_ppc_altivec_vmulosw : ClangBuiltin<"__builtin_altivec_vmulosw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vmulosd : PowerPC_Vec_QDD_Intrinsic<"vmulosd">; - def int_ppc_altivec_vmuloub : GCCBuiltin<"__builtin_altivec_vmuloub">, + def int_ppc_altivec_vmuloub : ClangBuiltin<"__builtin_altivec_vmuloub">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulouh : GCCBuiltin<"__builtin_altivec_vmulouh">, + def int_ppc_altivec_vmulouh : ClangBuiltin<"__builtin_altivec_vmulouh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vmulouw : GCCBuiltin<"__builtin_altivec_vmulouw">, + def int_ppc_altivec_vmulouw : ClangBuiltin<"__builtin_altivec_vmulouw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vmuloud : PowerPC_Vec_QDD_Intrinsic<"vmuloud">; // Vector Sum Instructions. 
- def int_ppc_altivec_vsumsws : GCCBuiltin<"__builtin_altivec_vsumsws">, + def int_ppc_altivec_vsumsws : ClangBuiltin<"__builtin_altivec_vsumsws">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vsum2sws : GCCBuiltin<"__builtin_altivec_vsum2sws">, + def int_ppc_altivec_vsum2sws : ClangBuiltin<"__builtin_altivec_vsum2sws">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vsum4sbs : GCCBuiltin<"__builtin_altivec_vsum4sbs">, + def int_ppc_altivec_vsum4sbs : ClangBuiltin<"__builtin_altivec_vsum4sbs">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vsum4shs : GCCBuiltin<"__builtin_altivec_vsum4shs">, + def int_ppc_altivec_vsum4shs : ClangBuiltin<"__builtin_altivec_vsum4shs">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vsum4ubs : GCCBuiltin<"__builtin_altivec_vsum4ubs">, + def int_ppc_altivec_vsum4ubs : ClangBuiltin<"__builtin_altivec_vsum4ubs">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; // Vector Sign Extension Instructions - def int_ppc_altivec_vextsb2w : GCCBuiltin<"__builtin_altivec_vextsb2w">, + def int_ppc_altivec_vextsb2w : ClangBuiltin<"__builtin_altivec_vextsb2w">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vextsb2d : GCCBuiltin<"__builtin_altivec_vextsb2d">, + def int_ppc_altivec_vextsb2d : ClangBuiltin<"__builtin_altivec_vextsb2d">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vextsh2w : GCCBuiltin<"__builtin_altivec_vextsh2w">, + def int_ppc_altivec_vextsh2w : ClangBuiltin<"__builtin_altivec_vextsh2w">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vextsh2d : GCCBuiltin<"__builtin_altivec_vextsh2d">, + def int_ppc_altivec_vextsh2d : ClangBuiltin<"__builtin_altivec_vextsh2d">, Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vextsw2d : GCCBuiltin<"__builtin_altivec_vextsw2d">, + def int_ppc_altivec_vextsw2d : ClangBuiltin<"__builtin_altivec_vextsw2d">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vextsd2q : GCCBuiltin<"__builtin_altivec_vextsd2q">, + def int_ppc_altivec_vextsd2q : ClangBuiltin<"__builtin_altivec_vextsd2q">, Intrinsic<[llvm_v1i128_ty], [llvm_v2i64_ty], [IntrNoMem]>; // Other multiplies. - def int_ppc_altivec_vmladduhm : GCCBuiltin<"__builtin_altivec_vmladduhm">, + def int_ppc_altivec_vmladduhm : ClangBuiltin<"__builtin_altivec_vmladduhm">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; // Packs. 
- def int_ppc_altivec_vpkpx : GCCBuiltin<"__builtin_altivec_vpkpx">, + def int_ppc_altivec_vpkpx : ClangBuiltin<"__builtin_altivec_vpkpx">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vpkshss : GCCBuiltin<"__builtin_altivec_vpkshss">, + def int_ppc_altivec_vpkshss : ClangBuiltin<"__builtin_altivec_vpkshss">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vpkshus : GCCBuiltin<"__builtin_altivec_vpkshus">, + def int_ppc_altivec_vpkshus : ClangBuiltin<"__builtin_altivec_vpkshus">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">, + def int_ppc_altivec_vpkswss : ClangBuiltin<"__builtin_altivec_vpkswss">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">, + def int_ppc_altivec_vpkswus : ClangBuiltin<"__builtin_altivec_vpkswus">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vpksdss : GCCBuiltin<"__builtin_altivec_vpksdss">, + def int_ppc_altivec_vpksdss : ClangBuiltin<"__builtin_altivec_vpksdss">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem, IntrHasSideEffects]>; - def int_ppc_altivec_vpksdus : GCCBuiltin<"__builtin_altivec_vpksdus">, + def int_ppc_altivec_vpksdus : ClangBuiltin<"__builtin_altivec_vpksdus">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem, IntrHasSideEffects]>; // vpkuhum is lowered to a shuffle. - def int_ppc_altivec_vpkuhus : GCCBuiltin<"__builtin_altivec_vpkuhus">, + def int_ppc_altivec_vpkuhus : ClangBuiltin<"__builtin_altivec_vpkuhus">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrHasSideEffects]>; // vpkuwum is lowered to a shuffle. - def int_ppc_altivec_vpkuwus : GCCBuiltin<"__builtin_altivec_vpkuwus">, + def int_ppc_altivec_vpkuwus : ClangBuiltin<"__builtin_altivec_vpkuwus">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, IntrHasSideEffects]>; // vpkudum is lowered to a shuffle. - def int_ppc_altivec_vpkudus : GCCBuiltin<"__builtin_altivec_vpkudus">, + def int_ppc_altivec_vpkudus : ClangBuiltin<"__builtin_altivec_vpkudus">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem, IntrHasSideEffects]>; // Unpacks. 
- def int_ppc_altivec_vupkhpx : GCCBuiltin<"__builtin_altivec_vupkhpx">, + def int_ppc_altivec_vupkhpx : ClangBuiltin<"__builtin_altivec_vupkhpx">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vupkhsb : GCCBuiltin<"__builtin_altivec_vupkhsb">, + def int_ppc_altivec_vupkhsb : ClangBuiltin<"__builtin_altivec_vupkhsb">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vupkhsh : GCCBuiltin<"__builtin_altivec_vupkhsh">, + def int_ppc_altivec_vupkhsh : ClangBuiltin<"__builtin_altivec_vupkhsh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vupkhsw : GCCBuiltin<"__builtin_altivec_vupkhsw">, + def int_ppc_altivec_vupkhsw : ClangBuiltin<"__builtin_altivec_vupkhsw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vupklpx : GCCBuiltin<"__builtin_altivec_vupklpx">, + def int_ppc_altivec_vupklpx : ClangBuiltin<"__builtin_altivec_vupklpx">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vupklsb : GCCBuiltin<"__builtin_altivec_vupklsb">, + def int_ppc_altivec_vupklsb : ClangBuiltin<"__builtin_altivec_vupklsb">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vupklsh : GCCBuiltin<"__builtin_altivec_vupklsh">, + def int_ppc_altivec_vupklsh : ClangBuiltin<"__builtin_altivec_vupklsh">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_ppc_altivec_vupklsw : GCCBuiltin<"__builtin_altivec_vupklsw">, + def int_ppc_altivec_vupklsw : ClangBuiltin<"__builtin_altivec_vupklsw">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>; // FP <-> integer conversion. - def int_ppc_altivec_vcfsx : GCCBuiltin<"__builtin_altivec_vcfsx">, + def int_ppc_altivec_vcfsx : ClangBuiltin<"__builtin_altivec_vcfsx">, Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vcfux : GCCBuiltin<"__builtin_altivec_vcfux">, + def int_ppc_altivec_vcfux : ClangBuiltin<"__builtin_altivec_vcfux">, Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vctsxs : GCCBuiltin<"__builtin_altivec_vctsxs">, + def int_ppc_altivec_vctsxs : ClangBuiltin<"__builtin_altivec_vctsxs">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vctuxs : GCCBuiltin<"__builtin_altivec_vctuxs">, + def int_ppc_altivec_vctuxs : ClangBuiltin<"__builtin_altivec_vctuxs">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_altivec_vrfim : GCCBuiltin<"__builtin_altivec_vrfim">, + def int_ppc_altivec_vrfim : ClangBuiltin<"__builtin_altivec_vrfim">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vrfin : GCCBuiltin<"__builtin_altivec_vrfin">, + def int_ppc_altivec_vrfin : ClangBuiltin<"__builtin_altivec_vrfin">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vrfip : GCCBuiltin<"__builtin_altivec_vrfip">, + def int_ppc_altivec_vrfip : ClangBuiltin<"__builtin_altivec_vrfip">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_ppc_altivec_vrfiz : GCCBuiltin<"__builtin_altivec_vrfiz">, + def int_ppc_altivec_vrfiz : ClangBuiltin<"__builtin_altivec_vrfiz">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; // Add Extended Quadword - def int_ppc_altivec_vaddeuqm : GCCBuiltin<"__builtin_altivec_vaddeuqm">, + def int_ppc_altivec_vaddeuqm : ClangBuiltin<"__builtin_altivec_vaddeuqm">, 
Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vaddecuq : GCCBuiltin<"__builtin_altivec_vaddecuq">, + def int_ppc_altivec_vaddecuq : ClangBuiltin<"__builtin_altivec_vaddecuq">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; // Sub Extended Quadword - def int_ppc_altivec_vsubeuqm : GCCBuiltin<"__builtin_altivec_vsubeuqm">, + def int_ppc_altivec_vsubeuqm : ClangBuiltin<"__builtin_altivec_vsubeuqm">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; - def int_ppc_altivec_vsubecuq : GCCBuiltin<"__builtin_altivec_vsubecuq">, + def int_ppc_altivec_vsubecuq : ClangBuiltin<"__builtin_altivec_vsubecuq">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty], [IntrNoMem]>; // P10 Vector Count Leading / Trailing Zeroes under bit Mask Builtins. - def int_ppc_altivec_vclzdm : GCCBuiltin<"__builtin_altivec_vclzdm">, + def int_ppc_altivec_vclzdm : ClangBuiltin<"__builtin_altivec_vclzdm">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_ppc_altivec_vctzdm : GCCBuiltin<"__builtin_altivec_vctzdm">, + def int_ppc_altivec_vctzdm : ClangBuiltin<"__builtin_altivec_vctzdm">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } @@ -1056,18 +1090,18 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". def int_ppc_altivec_lvsr : Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrNoMem]>; - def int_ppc_altivec_vperm : GCCBuiltin<"__builtin_altivec_vperm_4si">, + def int_ppc_altivec_vperm : ClangBuiltin<"__builtin_altivec_vperm_4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vsel : GCCBuiltin<"__builtin_altivec_vsel_4si">, + def int_ppc_altivec_vsel : ClangBuiltin<"__builtin_altivec_vsel_4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_ppc_altivec_vgbbd : GCCBuiltin<"__builtin_altivec_vgbbd">, + def int_ppc_altivec_vgbbd : ClangBuiltin<"__builtin_altivec_vgbbd">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vbpermq : GCCBuiltin<"__builtin_altivec_vbpermq">, + def int_ppc_altivec_vbpermq : ClangBuiltin<"__builtin_altivec_vbpermq">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_ppc_altivec_vbpermd : GCCBuiltin<"__builtin_altivec_vbpermd">, + def int_ppc_altivec_vbpermd : ClangBuiltin<"__builtin_altivec_vbpermd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>; } @@ -1081,23 +1115,23 @@ def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">; // Crypto let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". 
def int_ppc_altivec_crypto_vsbox : - GCCBuiltin<"__builtin_altivec_crypto_vsbox">, + ClangBuiltin<"__builtin_altivec_crypto_vsbox">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_ppc_altivec_crypto_vpermxor : - GCCBuiltin<"__builtin_altivec_crypto_vpermxor">, + ClangBuiltin<"__builtin_altivec_crypto_vpermxor">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_ppc_altivec_crypto_vpermxor_be : - GCCBuiltin<"__builtin_altivec_crypto_vpermxor_be">, + ClangBuiltin<"__builtin_altivec_crypto_vpermxor_be">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_ppc_altivec_crypto_vshasigmad : - GCCBuiltin<"__builtin_altivec_crypto_vshasigmad">, + ClangBuiltin<"__builtin_altivec_crypto_vshasigmad">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_ppc_altivec_crypto_vshasigmaw : - GCCBuiltin<"__builtin_altivec_crypto_vshasigmaw">, + ClangBuiltin<"__builtin_altivec_crypto_vshasigmaw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; } @@ -1224,52 +1258,52 @@ def int_ppc_vsx_xvrdpip : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; // Vector reciprocal estimate -def int_ppc_vsx_xvresp : GCCBuiltin<"__builtin_vsx_xvresp">, +def int_ppc_vsx_xvresp : ClangBuiltin<"__builtin_vsx_xvresp">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_ppc_vsx_xvredp : GCCBuiltin<"__builtin_vsx_xvredp">, +def int_ppc_vsx_xvredp : ClangBuiltin<"__builtin_vsx_xvredp">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; // Vector rsqrte -def int_ppc_vsx_xvrsqrtesp : GCCBuiltin<"__builtin_vsx_xvrsqrtesp">, +def int_ppc_vsx_xvrsqrtesp : ClangBuiltin<"__builtin_vsx_xvrsqrtesp">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_ppc_vsx_xvrsqrtedp : GCCBuiltin<"__builtin_vsx_xvrsqrtedp">, +def int_ppc_vsx_xvrsqrtedp : ClangBuiltin<"__builtin_vsx_xvrsqrtedp">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; // Vector compare def int_ppc_vsx_xvcmpeqdp : PowerPC_VSX_Intrinsic<"xvcmpeqdp", [llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpeqdp_p : GCCBuiltin<"__builtin_vsx_xvcmpeqdp_p">, +def int_ppc_vsx_xvcmpeqdp_p : ClangBuiltin<"__builtin_vsx_xvcmpeqdp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty], [IntrNoMem]>; def int_ppc_vsx_xvcmpeqsp : PowerPC_VSX_Intrinsic<"xvcmpeqsp", [llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpeqsp_p : GCCBuiltin<"__builtin_vsx_xvcmpeqsp_p">, +def int_ppc_vsx_xvcmpeqsp_p : ClangBuiltin<"__builtin_vsx_xvcmpeqsp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; def int_ppc_vsx_xvcmpgedp : PowerPC_VSX_Intrinsic<"xvcmpgedp", [llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpgedp_p : GCCBuiltin<"__builtin_vsx_xvcmpgedp_p">, +def int_ppc_vsx_xvcmpgedp_p : ClangBuiltin<"__builtin_vsx_xvcmpgedp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty], [IntrNoMem]>; def int_ppc_vsx_xvcmpgesp : PowerPC_VSX_Intrinsic<"xvcmpgesp", [llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpgesp_p : GCCBuiltin<"__builtin_vsx_xvcmpgesp_p">, +def int_ppc_vsx_xvcmpgesp_p : ClangBuiltin<"__builtin_vsx_xvcmpgesp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; def int_ppc_vsx_xvcmpgtdp : 
PowerPC_VSX_Intrinsic<"xvcmpgtdp", [llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpgtdp_p : GCCBuiltin<"__builtin_vsx_xvcmpgtdp_p">, +def int_ppc_vsx_xvcmpgtdp_p : ClangBuiltin<"__builtin_vsx_xvcmpgtdp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty], [IntrNoMem]>; def int_ppc_vsx_xvcmpgtsp : PowerPC_VSX_Intrinsic<"xvcmpgtsp", [llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; -def int_ppc_vsx_xvcmpgtsp_p : GCCBuiltin<"__builtin_vsx_xvcmpgtsp_p">, +def int_ppc_vsx_xvcmpgtsp_p : ClangBuiltin<"__builtin_vsx_xvcmpgtsp_p">, Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; def int_ppc_vsx_xxleqv : @@ -1381,21 +1415,21 @@ def int_ppc_vsx_xxgenpcvdm : // P10 VSX Vector permute extended. def int_ppc_vsx_xxpermx : - GCCBuiltin<"__builtin_vsx_xxpermx">, + ClangBuiltin<"__builtin_vsx_xxpermx">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,llvm_v16i8_ty,llvm_v16i8_ty,llvm_i32_ty], [IntrNoMem, ImmArg>]>; // P10 VSX Vector Blend Variable. -def int_ppc_vsx_xxblendvb: GCCBuiltin<"__builtin_vsx_xxblendvb">, +def int_ppc_vsx_xxblendvb: ClangBuiltin<"__builtin_vsx_xxblendvb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; -def int_ppc_vsx_xxblendvh: GCCBuiltin<"__builtin_vsx_xxblendvh">, +def int_ppc_vsx_xxblendvh: ClangBuiltin<"__builtin_vsx_xxblendvh">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,llvm_v8i16_ty], [IntrNoMem]>; -def int_ppc_vsx_xxblendvw: GCCBuiltin<"__builtin_vsx_xxblendvw">, +def int_ppc_vsx_xxblendvw: ClangBuiltin<"__builtin_vsx_xxblendvw">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; -def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">, +def int_ppc_vsx_xxblendvd: ClangBuiltin<"__builtin_vsx_xxblendvd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } @@ -1405,64 +1439,68 @@ def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">, let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". 
-def int_ppc_tbegin : GCCBuiltin<"__builtin_tbegin">, +def int_ppc_tbegin : ClangBuiltin<"__builtin_tbegin">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg>]>; -def int_ppc_tend : GCCBuiltin<"__builtin_tend">, +def int_ppc_tend : ClangBuiltin<"__builtin_tend">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg>]>; -def int_ppc_tabort : GCCBuiltin<"__builtin_tabort">, +def int_ppc_tabort : ClangBuiltin<"__builtin_tabort">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>; -def int_ppc_tabortwc : GCCBuiltin<"__builtin_tabortwc">, +def int_ppc_tabortwc : ClangBuiltin<"__builtin_tabortwc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_ppc_tabortwci : GCCBuiltin<"__builtin_tabortwci">, +def int_ppc_tabortwci : ClangBuiltin<"__builtin_tabortwci">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_ppc_tabortdc : GCCBuiltin<"__builtin_tabortdc">, +def int_ppc_tabortdc : ClangBuiltin<"__builtin_tabortdc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_ppc_tabortdci : GCCBuiltin<"__builtin_tabortdci">, +def int_ppc_tabortdci : ClangBuiltin<"__builtin_tabortdci">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_ppc_tcheck : GCCBuiltin<"__builtin_tcheck">, +def int_ppc_tcheck : ClangBuiltin<"__builtin_tcheck">, Intrinsic<[llvm_i32_ty], [], []>; -def int_ppc_treclaim : GCCBuiltin<"__builtin_treclaim">, +def int_ppc_treclaim : ClangBuiltin<"__builtin_treclaim">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>; -def int_ppc_trechkpt : GCCBuiltin<"__builtin_trechkpt">, +def int_ppc_trechkpt : ClangBuiltin<"__builtin_trechkpt">, Intrinsic<[llvm_i32_ty], [], []>; -def int_ppc_tsr : GCCBuiltin<"__builtin_tsr">, +def int_ppc_tsr : ClangBuiltin<"__builtin_tsr">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>; -def int_ppc_get_texasr : GCCBuiltin<"__builtin_get_texasr">, +def int_ppc_get_texasr : ClangBuiltin<"__builtin_get_texasr">, Intrinsic<[llvm_i64_ty], [], []>; -def int_ppc_get_texasru : GCCBuiltin<"__builtin_get_texasru">, +def int_ppc_get_texasru : ClangBuiltin<"__builtin_get_texasru">, Intrinsic<[llvm_i64_ty], [], []>; -def int_ppc_get_tfhar : GCCBuiltin<"__builtin_get_tfhar">, +def int_ppc_get_tfhar : ClangBuiltin<"__builtin_get_tfhar">, Intrinsic<[llvm_i64_ty], [], []>; -def int_ppc_get_tfiar : GCCBuiltin<"__builtin_get_tfiar">, +def int_ppc_get_tfiar : ClangBuiltin<"__builtin_get_tfiar">, Intrinsic<[llvm_i64_ty], [], []>; -def int_ppc_set_texasr : GCCBuiltin<"__builtin_set_texasr">, +def int_ppc_set_texasr : ClangBuiltin<"__builtin_set_texasr">, Intrinsic<[], [llvm_i64_ty], []>; -def int_ppc_set_texasru : GCCBuiltin<"__builtin_set_texasru">, +def int_ppc_set_texasru : ClangBuiltin<"__builtin_set_texasru">, Intrinsic<[], [llvm_i64_ty], []>; -def int_ppc_set_tfhar : GCCBuiltin<"__builtin_set_tfhar">, +def int_ppc_set_tfhar : ClangBuiltin<"__builtin_set_tfhar">, Intrinsic<[], [llvm_i64_ty], []>; -def int_ppc_set_tfiar : GCCBuiltin<"__builtin_set_tfiar">, +def int_ppc_set_tfiar : ClangBuiltin<"__builtin_set_tfiar">, Intrinsic<[], [llvm_i64_ty], []>; // Extended mnemonics -def int_ppc_tendall : GCCBuiltin<"__builtin_tendall">, +def int_ppc_tendall : ClangBuiltin<"__builtin_tendall">, Intrinsic<[llvm_i32_ty], [], []>; -def int_ppc_tresume : GCCBuiltin<"__builtin_tresume">, +def int_ppc_tresume : ClangBuiltin<"__builtin_tresume">, Intrinsic<[llvm_i32_ty], [], []>; -def int_ppc_tsuspend : GCCBuiltin<"__builtin_tsuspend">, +def int_ppc_tsuspend : ClangBuiltin<"__builtin_tsuspend">, 
Intrinsic<[llvm_i32_ty], [], []>; -def int_ppc_ttest : GCCBuiltin<"__builtin_ttest">, +def int_ppc_ttest : ClangBuiltin<"__builtin_ttest">, Intrinsic<[llvm_i64_ty], [], []>; -def int_ppc_cfence : Intrinsic<[], [llvm_anyint_ty], []>; +// We currently use llvm.ppc.cfence in the context of atomic loads, which +// in LLVM IR require their type to be an integer, pointer, or +// floating-point type. So llvm_any_ty here refers to the types mentioned +// above. The backend is expected to lower these types to appropriate MVTs. +def int_ppc_cfence : Intrinsic<[], [llvm_any_ty], []>; // PowerPC set FPSCR Intrinsic Definitions. -def int_ppc_setrnd : GCCBuiltin<"__builtin_setrnd">, +def int_ppc_setrnd : ClangBuiltin<"__builtin_setrnd">, Intrinsic<[llvm_double_ty], [llvm_i32_ty], []>; } @@ -1552,218 +1590,212 @@ let TargetPrefix = "ppc" in { // XL Compat intrinsics. let TargetPrefix = "ppc" in { - def int_ppc_dcbfl : GCCBuiltin<"__builtin_ppc_dcbfl">, + def int_ppc_dcbfl : ClangBuiltin<"__builtin_ppc_dcbfl">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly]>; - def int_ppc_dcbflp : GCCBuiltin<"__builtin_ppc_dcbflp">, + def int_ppc_dcbflp : ClangBuiltin<"__builtin_ppc_dcbflp">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly]>; - def int_ppc_dcbst : GCCBuiltin<"__builtin_ppc_dcbst">, + def int_ppc_dcbst : ClangBuiltin<"__builtin_ppc_dcbst">, Intrinsic<[], [llvm_ptr_ty], []>; - def int_ppc_dcbt : GCCBuiltin<"__builtin_ppc_dcbt">, + def int_ppc_dcbt : ClangBuiltin<"__builtin_ppc_dcbt">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - def int_ppc_dcbtst : GCCBuiltin<"__builtin_ppc_dcbtst">, + def int_ppc_dcbtst : ClangBuiltin<"__builtin_ppc_dcbtst">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - def int_ppc_dcbz : GCCBuiltin<"__builtin_ppc_dcbz">, + def int_ppc_dcbz : ClangBuiltin<"__builtin_ppc_dcbz">, Intrinsic<[], [llvm_ptr_ty], []>; - def int_ppc_icbt : GCCBuiltin<"__builtin_ppc_icbt">, + def int_ppc_icbt : ClangBuiltin<"__builtin_ppc_icbt">, Intrinsic<[], [llvm_ptr_ty], []>; // Population Count in each Byte. def int_ppc_popcntb : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; // sync instruction (i.e.
sync 0, a.k.a hwsync) - def int_ppc_sync : GCCBuiltin<"__builtin_ppc_sync">, + def int_ppc_sync : ClangBuiltin<"__builtin_ppc_sync">, Intrinsic<[], [], []>; - def int_ppc_iospace_sync : GCCBuiltin<"__builtin_ppc_iospace_sync">, + def int_ppc_iospace_sync : ClangBuiltin<"__builtin_ppc_iospace_sync">, Intrinsic<[], [], []>; // isync instruction - def int_ppc_isync : GCCBuiltin<"__builtin_ppc_isync">, + def int_ppc_isync : ClangBuiltin<"__builtin_ppc_isync">, Intrinsic<[], [], []>; // lwsync is sync 1 - def int_ppc_lwsync : GCCBuiltin<"__builtin_ppc_lwsync">, + def int_ppc_lwsync : ClangBuiltin<"__builtin_ppc_lwsync">, Intrinsic<[], [], []>; - def int_ppc_iospace_lwsync : GCCBuiltin<"__builtin_ppc_iospace_lwsync">, + def int_ppc_iospace_lwsync : ClangBuiltin<"__builtin_ppc_iospace_lwsync">, Intrinsic<[], [], []>; // eieio instruction - def int_ppc_eieio : GCCBuiltin<"__builtin_ppc_eieio">, + def int_ppc_eieio : ClangBuiltin<"__builtin_ppc_eieio">, Intrinsic<[],[],[]>; - def int_ppc_iospace_eieio : GCCBuiltin<"__builtin_ppc_iospace_eieio">, + def int_ppc_iospace_eieio : ClangBuiltin<"__builtin_ppc_iospace_eieio">, Intrinsic<[],[],[]>; - def int_ppc_stdcx : GCCBuiltin<"__builtin_ppc_stdcx">, + def int_ppc_stdcx : ClangBuiltin<"__builtin_ppc_stdcx">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrWriteMem]>; - def int_ppc_stwcx : GCCBuiltin<"__builtin_ppc_stwcx">, + def int_ppc_stwcx : ClangBuiltin<"__builtin_ppc_stwcx">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrWriteMem]>; def int_ppc_sthcx : Intrinsic<[llvm_i32_ty], [ llvm_ptr_ty, llvm_i32_ty ], [IntrWriteMem]>; - def int_ppc_stbcx : GCCBuiltin<"__builtin_ppc_stbcx">, + def int_ppc_stbcx : ClangBuiltin<"__builtin_ppc_stbcx">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrWriteMem]>; - def int_ppc_dcbtstt : GCCBuiltin<"__builtin_ppc_dcbtstt">, + def int_ppc_dcbtstt : ClangBuiltin<"__builtin_ppc_dcbtstt">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - def int_ppc_dcbtt : GCCBuiltin<"__builtin_ppc_dcbtt">, + def int_ppc_dcbtt : ClangBuiltin<"__builtin_ppc_dcbtt">, Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - def int_ppc_mftbu : GCCBuiltin<"__builtin_ppc_mftbu">, + def int_ppc_mftbu : ClangBuiltin<"__builtin_ppc_mftbu">, Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; - def int_ppc_mfmsr : GCCBuiltin<"__builtin_ppc_mfmsr">, + def int_ppc_mfmsr : ClangBuiltin<"__builtin_ppc_mfmsr">, Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; def int_ppc_mfspr : Intrinsic<[llvm_anyint_ty], [llvm_i32_ty], [ImmArg>]>; def int_ppc_mtmsr - : GCCBuiltin<"__builtin_ppc_mtmsr">, Intrinsic<[], [llvm_i32_ty], []>; + : ClangBuiltin<"__builtin_ppc_mtmsr">, Intrinsic<[], [llvm_i32_ty], []>; def int_ppc_mtspr : Intrinsic<[], [llvm_i32_ty, llvm_anyint_ty], [ImmArg>]>; - def int_ppc_stfiw : GCCBuiltin<"__builtin_ppc_stfiw">, + def int_ppc_stfiw : ClangBuiltin<"__builtin_ppc_stfiw">, Intrinsic<[], [llvm_ptr_ty, llvm_double_ty], [IntrWriteMem]>; // compare def int_ppc_cmpeqb - : GCCBuiltin<"__builtin_ppc_cmpeqb">, + : ClangBuiltin<"__builtin_ppc_cmpeqb">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_cmprb - : GCCBuiltin<"__builtin_ppc_cmprb">, + : ClangBuiltin<"__builtin_ppc_cmprb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_ppc_setb - : GCCBuiltin<"__builtin_ppc_setb">, + : ClangBuiltin<"__builtin_ppc_setb">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_cmpb : Intrinsic<[llvm_anyint_ty], 
[llvm_anyint_ty, llvm_anyint_ty], [IntrNoMem]>; // multiply def int_ppc_mulhd - : GCCBuiltin<"__builtin_ppc_mulhd">, + : ClangBuiltin<"__builtin_ppc_mulhd">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_mulhdu - : GCCBuiltin<"__builtin_ppc_mulhdu">, + : ClangBuiltin<"__builtin_ppc_mulhdu">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_mulhw - : GCCBuiltin<"__builtin_ppc_mulhw">, + : ClangBuiltin<"__builtin_ppc_mulhw">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_ppc_mulhwu - : GCCBuiltin<"__builtin_ppc_mulhwu">, + : ClangBuiltin<"__builtin_ppc_mulhwu">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_ppc_maddhd - : GCCBuiltin<"__builtin_ppc_maddhd">, + : ClangBuiltin<"__builtin_ppc_maddhd">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_maddhdu - : GCCBuiltin<"__builtin_ppc_maddhdu">, + : ClangBuiltin<"__builtin_ppc_maddhdu">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_maddld - : GCCBuiltin<"__builtin_ppc_maddld">, + : ClangBuiltin<"__builtin_ppc_maddld">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; // load def int_ppc_load2r : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; def int_ppc_load4r - : GCCBuiltin<"__builtin_ppc_load4r">, + : ClangBuiltin<"__builtin_ppc_load4r">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; def int_ppc_load8r - : GCCBuiltin<"__builtin_ppc_load8r">, + : ClangBuiltin<"__builtin_ppc_load8r">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; // store def int_ppc_store2r - : GCCBuiltin<"__builtin_ppc_store2r">, + : ClangBuiltin<"__builtin_ppc_store2r">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_store4r - : GCCBuiltin<"__builtin_ppc_store4r">, + : ClangBuiltin<"__builtin_ppc_store4r">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_store8r - : GCCBuiltin<"__builtin_ppc_store8r">, + : ClangBuiltin<"__builtin_ppc_store8r">, Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_insert_exp - : GCCBuiltin<"__builtin_ppc_insert_exp">, + : ClangBuiltin<"__builtin_ppc_insert_exp">, Intrinsic <[llvm_double_ty], [llvm_double_ty, llvm_i64_ty], [IntrNoMem]>; def int_ppc_extract_exp - : GCCBuiltin<"__builtin_ppc_extract_exp">, + : ClangBuiltin<"__builtin_ppc_extract_exp">, Intrinsic <[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_extract_sig - : GCCBuiltin<"__builtin_ppc_extract_sig">, + : ClangBuiltin<"__builtin_ppc_extract_sig">, Intrinsic <[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_mtfsb0 - : GCCBuiltin<"__builtin_ppc_mtfsb0">, + : ClangBuiltin<"__builtin_ppc_mtfsb0">, Intrinsic <[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, ImmArg>]>; def int_ppc_mtfsb1 - : GCCBuiltin<"__builtin_ppc_mtfsb1">, + : ClangBuiltin<"__builtin_ppc_mtfsb1">, Intrinsic <[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, ImmArg>]>; def int_ppc_mtfsf : Intrinsic <[], [llvm_i32_ty, llvm_double_ty], [IntrNoMem, IntrHasSideEffects, ImmArg>]>; def int_ppc_mtfsfi - : GCCBuiltin<"__builtin_ppc_mtfsfi">, + : ClangBuiltin<"__builtin_ppc_mtfsfi">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, ImmArg>,ImmArg>]>; def int_ppc_fmsub - : GCCBuiltin<"__builtin_ppc_fmsub">, + : ClangBuiltin<"__builtin_ppc_fmsub">, Intrinsic <[llvm_double_ty], [llvm_double_ty, 
llvm_double_ty, llvm_double_ty], [IntrNoMem]>; def int_ppc_fmsubs - : GCCBuiltin<"__builtin_ppc_fmsubs">, + : ClangBuiltin<"__builtin_ppc_fmsubs">, Intrinsic <[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_ppc_fnmadd - : GCCBuiltin<"__builtin_ppc_fnmadd">, + : ClangBuiltin<"__builtin_ppc_fnmadd">, Intrinsic <[llvm_double_ty], [llvm_double_ty, llvm_double_ty, llvm_double_ty], [IntrNoMem]>; def int_ppc_fnmadds - : GCCBuiltin<"__builtin_ppc_fnmadds">, + : ClangBuiltin<"__builtin_ppc_fnmadds">, Intrinsic <[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_ppc_fnmsub - : GCCBuiltin<"__builtin_ppc_fnmsub">, - Intrinsic <[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty], - [IntrNoMem]>; - def int_ppc_fnmsubs - : GCCBuiltin<"__builtin_ppc_fnmsubs">, - Intrinsic <[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem]>; + : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; def int_ppc_fre - : GCCBuiltin<"__builtin_ppc_fre">, + : ClangBuiltin<"__builtin_ppc_fre">, Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; def int_ppc_fres - : GCCBuiltin<"__builtin_ppc_fres">, + : ClangBuiltin<"__builtin_ppc_fres">, Intrinsic <[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_ppc_addex - : GCCBuiltin<"__builtin_ppc_addex">, + : ClangBuiltin<"__builtin_ppc_addex">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, ImmArg>]>; - def int_ppc_fsel : GCCBuiltin<"__builtin_ppc_fsel">, + def int_ppc_fsel : ClangBuiltin<"__builtin_ppc_fsel">, Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_ppc_fsels : GCCBuiltin<"__builtin_ppc_fsels">, + def int_ppc_fsels : ClangBuiltin<"__builtin_ppc_fsels">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_ppc_frsqrte : GCCBuiltin<"__builtin_ppc_frsqrte">, + def int_ppc_frsqrte : ClangBuiltin<"__builtin_ppc_frsqrte">, Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_ppc_frsqrtes : GCCBuiltin<"__builtin_ppc_frsqrtes">, + def int_ppc_frsqrtes : ClangBuiltin<"__builtin_ppc_frsqrtes">, Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_ppc_compare_exp_uo : GCCBuiltin<"__builtin_ppc_compare_exp_uo">, + def int_ppc_compare_exp_uo : ClangBuiltin<"__builtin_ppc_compare_exp_uo">, Intrinsic<[llvm_i32_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_ppc_compare_exp_lt : GCCBuiltin<"__builtin_ppc_compare_exp_lt">, + def int_ppc_compare_exp_lt : ClangBuiltin<"__builtin_ppc_compare_exp_lt">, Intrinsic<[llvm_i32_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_ppc_compare_exp_gt : GCCBuiltin<"__builtin_ppc_compare_exp_gt">, + def int_ppc_compare_exp_gt : ClangBuiltin<"__builtin_ppc_compare_exp_gt">, Intrinsic<[llvm_i32_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; - def int_ppc_compare_exp_eq : GCCBuiltin<"__builtin_ppc_compare_exp_eq">, + def int_ppc_compare_exp_eq : ClangBuiltin<"__builtin_ppc_compare_exp_eq">, Intrinsic<[llvm_i32_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; @@ -1773,6 +1805,12 @@ let TargetPrefix = "ppc" in { def int_ppc_test_data_class_f : Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; + def int_ppc_fnabs + : ClangBuiltin<"__builtin_ppc_fnabs">, + Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + def int_ppc_fnabss + : 
ClangBuiltin<"__builtin_ppc_fnabss">, + Intrinsic <[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_ppc_convert_f128_to_ppcf128 : Intrinsic<[llvm_ppcf128_ty], [llvm_f128_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 6780436bd701..098ca1bc6cfb 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -140,7 +140,7 @@ let TargetPrefix = "riscv" in { // Vectors // The intrinsic does not have any operand that must be extended. -defvar NoSplatOperand = 0xF; +defvar NoScalarOperand = 0xF; // The intrinsic does not have a VL operand. // (e.g., riscv_vmv_x_s and riscv_vfmv_f_s) @@ -150,7 +150,7 @@ class RISCVVIntrinsic { // These intrinsics may accept illegal integer values in their llvm_any_ty // operand, so they have to be extended. Intrinsic IntrinsicID = !cast(NAME); - bits<4> SplatOperand = NoSplatOperand; + bits<4> ScalarOperand = NoScalarOperand; bits<5> VLOperand = NoVLOperand; } @@ -219,8 +219,8 @@ let TargetPrefix = "riscv" in { let VLOperand = 2; } // For unit stride load with mask - // Input: (maskedoff, pointer, mask, vl, ta) - class RISCVUSLoadMask + // Input: (maskedoff, pointer, mask, vl, policy) + class RISCVUSLoadMasked : Intrinsic<[llvm_anyvector_ty ], [LLVMMatchType<0>, LLVMPointerType>, @@ -231,11 +231,11 @@ let TargetPrefix = "riscv" in { let VLOperand = 3; } // For unit stride fault-only-first load with mask - // Input: (maskedoff, pointer, mask, vl, ta) + // Input: (maskedoff, pointer, mask, vl, policy) // Output: (data, vl) // NOTE: We model this with default memory properties since we model writing // VL as a side effect. IntrReadMem, IntrHasSideEffects does not work. - class RISCVUSLoadFFMask + class RISCVUSLoadFFMasked : Intrinsic<[llvm_anyvector_ty, llvm_anyint_ty], [LLVMMatchType<0>, LLVMPointerType>, @@ -255,8 +255,8 @@ let TargetPrefix = "riscv" in { let VLOperand = 3; } // For strided load with mask - // Input: (maskedoff, pointer, stride, mask, vl, ta) - class RISCVSLoadMask + // Input: (maskedoff, pointer, stride, mask, vl, policy) + class RISCVSLoadMasked : Intrinsic<[llvm_anyvector_ty ], [LLVMMatchType<0>, LLVMPointerType>, llvm_anyint_ty, @@ -277,8 +277,8 @@ let TargetPrefix = "riscv" in { let VLOperand = 3; } // For indexed load with mask - // Input: (maskedoff, pointer, index, mask, vl, ta) - class RISCVILoadMask + // Input: (maskedoff, pointer, index, mask, vl, policy) + class RISCVILoadMasked : Intrinsic<[llvm_anyvector_ty ], [LLVMMatchType<0>, LLVMPointerType>, llvm_anyvector_ty, @@ -300,7 +300,7 @@ let TargetPrefix = "riscv" in { } // For unit stride store with mask // Input: (vector_in, pointer, mask, vl) - class RISCVUSStoreMask + class RISCVUSStoreMasked : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerType>, @@ -321,7 +321,7 @@ let TargetPrefix = "riscv" in { } // For stride store with mask // Input: (vector_in, pointer, stirde, mask, vl) - class RISCVSStoreMask + class RISCVSStoreMasked : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerType>, llvm_anyint_ty, @@ -341,7 +341,7 @@ let TargetPrefix = "riscv" in { } // For indexed store with mask // Input: (vector_in, pointer, index, mask, vl) - class RISCVIStoreMask + class RISCVIStoreMasked : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerType>, llvm_anyvector_ty, @@ -350,16 +350,16 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } // For destination vector type is the same as source vector. 
- // Input: (vector_in, vl) - class RISCVUnaryAANoMask + // Input: (passthru, vector_in, vl) + class RISCVUnaryAAUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } // For destination vector type is the same as first source vector (with mask). - // Input: (vector_in, mask, vl, ta) - class RISCVUnaryAAMask + // Input: (vector_in, vector_in, mask, vl, policy) + class RISCVUnaryAAMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -367,7 +367,8 @@ let TargetPrefix = "riscv" in { [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 3; } - class RISCVUnaryAAMaskNoTA + // Input: (passthru, vector_in, vector_in, mask, vl) + class RISCVCompress : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], @@ -376,23 +377,24 @@ let TargetPrefix = "riscv" in { } // For destination vector type is the same as first and second source vector. // Input: (vector_in, vector_in, vl) - class RISCVBinaryAAANoMask + class RISCVBinaryAAAUnMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 2; } // For destination vector type is the same as first and second source vector. - // Input: (vector_in, int_vector_in, vl) - class RISCVRGatherVVNoMask + // Input: (passthru, vector_in, int_vector_in, vl) + class RISCVRGatherVVUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + LLVMVectorOfBitcastsToInt<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For destination vector type is the same as first and second source vector. - // Input: (vector_in, vector_in, int_vector_in, vl, ta) - class RISCVRGatherVVMask + // Input: (vector_in, vector_in, int_vector_in, vl, policy) + class RISCVRGatherVVMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -400,17 +402,18 @@ let TargetPrefix = "riscv" in { [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 4; } - // Input: (vector_in, int16_vector_in, vl) - class RISCVRGatherEI16VVNoMask + // Input: (passthru, vector_in, int16_vector_in, vl) + class RISCVRGatherEI16VVUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i16_ty>, + [LLVMMatchType<0>, LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i16_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For destination vector type is the same as first and second source vector. - // Input: (vector_in, vector_in, int16_vector_in, vl, ta) - class RISCVRGatherEI16VVMask + // Input: (vector_in, vector_in, int16_vector_in, vl, policy) + class RISCVRGatherEI16VVMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i16_ty>, @@ -421,17 +424,18 @@ let TargetPrefix = "riscv" in { } // For destination vector type is the same as first source vector, and the // second operand is XLen. 
- // Input: (vector_in, xlen_in, vl) - class RISCVGatherVXNoMask + // Input: (passthru, vector_in, xlen_in, vl) + class RISCVGatherVXUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyint_ty, LLVMMatchType<1>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty, + LLVMMatchType<1>], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For destination vector type is the same as first source vector (with mask). // Second operand is XLen. - // Input: (maskedoff, vector_in, xlen_in, mask, vl, ta) - class RISCVGatherVXMask + // Input: (maskedoff, vector_in, xlen_in, mask, vl, policy) + class RISCVGatherVXMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>, @@ -440,38 +444,40 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } // For destination vector type is the same as first source vector. - // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVBinaryAAXNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVBinaryAAXUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; - let VLOperand = 2; + let ScalarOperand = 2; + let VLOperand = 3; } // For destination vector type is the same as first source vector (with mask). - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVBinaryAAXMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVBinaryAAXMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, LLVMMatchType<2>], [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 2; + let ScalarOperand = 2; let VLOperand = 4; } // For destination vector type is the same as first source vector. The // second source operand must match the destination type or be an XLen scalar. - // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVBinaryAAShiftNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVBinaryAAShiftUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For destination vector type is the same as first source vector (with mask). // The second source operand must match the destination type or be an XLen scalar. - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVBinaryAAShiftMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVBinaryAAShiftMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -480,38 +486,40 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } // For destination vector type is NOT the same as first source vector. 
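The passthru change to the unmasked binary classes above is easiest to see in IR; a sketch, assuming a vadd.vv instantiation at nxv2i32 with an i64 XLEN:

    ; RISCVBinaryAAXUnMasked: (passthru, vector_in, vector_in/scalar_in, vl)
    declare <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32.i64(
        <vscale x 2 x i32>,  ; passthru (undef requests no merging)
        <vscale x 2 x i32>,  ; vector_in
        <vscale x 2 x i32>,  ; vector_in/scalar_in (vector form shown)
        i64)                 ; vl, hence VLOperand moving from 2 to 3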
- // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVBinaryABXNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVBinaryABXUnMasked : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; - let VLOperand = 2; + let ScalarOperand = 2; + let VLOperand = 3; } // For destination vector type is NOT the same as first source vector (with mask). - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVBinaryABXMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVBinaryABXMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, LLVMMatchType<3>], [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 2; + let ScalarOperand = 2; let VLOperand = 4; } // For destination vector type is NOT the same as first source vector. The // second source operand must match the destination type or be an XLen scalar. - // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVBinaryABShiftNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVBinaryABShiftUnMasked : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For destination vector type is NOT the same as first source vector (with mask). // The second source operand must match the destination type or be an XLen scalar. - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVBinaryABShiftMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVBinaryABShiftMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -520,15 +528,15 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } // For binary operations with V0 as input. - // Input: (vector_in, vector_in/scalar_in, V0, vl) + // Input: (passthru, vector_in, vector_in/scalar_in, V0, vl) class RISCVBinaryWithV0 : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; - let VLOperand = 3; + let ScalarOperand = 2; + let VLOperand = 4; } // For binary operations with mask type output and V0 as input. // Output: (mask type output) @@ -539,7 +547,7 @@ let TargetPrefix = "riscv" in { LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + let ScalarOperand = 1; let VLOperand = 3; } // For binary operations with mask type output. @@ -549,87 +557,91 @@ let TargetPrefix = "riscv" in { : Intrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyvector_ty, llvm_any_ty, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + let ScalarOperand = 1; let VLOperand = 2; } // For binary operations with mask type output without mask. 
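RISCVBinaryWithV0 gains the same leading passthru; sketched below for vmerge, which is defined through this class later in the file, with the mangled suffix assumed:

    ; RISCVBinaryWithV0: (passthru, vector_in, vector_in/scalar_in, V0, vl)
    declare <vscale x 1 x i8> @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64(
        <vscale x 1 x i8>,   ; passthru
        <vscale x 1 x i8>,   ; vector_in
        <vscale x 1 x i8>,   ; vector_in/scalar_in
        <vscale x 1 x i1>,   ; V0 (selector mask)
        i64)                 ; vl, hence VLOperand moving from 3 to 4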
// Output: (mask type output) // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVCompareNoMask + class RISCVCompareUnMasked : Intrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyvector_ty, llvm_any_ty, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + let ScalarOperand = 1; let VLOperand = 2; } // For binary operations with mask type output with mask. // Output: (mask type output) // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl) - class RISCVCompareMask + class RISCVCompareMasked : Intrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 2; + let ScalarOperand = 2; let VLOperand = 4; } // For FP classify operations. // Output: (bit mask type output) - // Input: (vector_in, vl) - class RISCVClassifyNoMask + // Input: (passthru, vector_in, vl) + class RISCVClassifyUnMasked : Intrinsic<[LLVMVectorOfBitcastsToInt<0>], - [llvm_anyvector_ty, llvm_anyint_ty], + [LLVMVectorOfBitcastsToInt<0>, llvm_anyvector_ty, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 1; } // For FP classify operations with mask. // Output: (bit mask type output) - // Input: (maskedoff, vector_in, mask, vl) - class RISCVClassifyMask + // Input: (maskedoff, vector_in, mask, vl, policy) + class RISCVClassifyMasked : Intrinsic<[LLVMVectorOfBitcastsToInt<0>], [LLVMVectorOfBitcastsToInt<0>, llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty, LLVMMatchType<1>], + [IntrNoMem, ImmArg>]>, RISCVVIntrinsic { let VLOperand = 3; } // For Saturating binary operations. // The destination vector type is the same as first source vector. - // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVSaturatingBinaryAAXNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVSaturatingBinaryAAXUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic { - let SplatOperand = 1; - let VLOperand = 2; + let ScalarOperand = 2; + let VLOperand = 3; } // For Saturating binary operations with mask. // The destination vector type is the same as first source vector. - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVSaturatingBinaryAAXMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVSaturatingBinaryAAXMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, LLVMMatchType<2>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic { - let SplatOperand = 2; + let ScalarOperand = 2; let VLOperand = 4; } // For Saturating binary operations. // The destination vector type is the same as first source vector. // The second source operand matches the destination type or is an XLen scalar. 
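At a call site the trailing policy operand must be a constant; a sketch of a masked call, where the vadd.mask instantiation and the bit encoding of the policy value (bit 0 = tail agnostic, bit 1 = mask agnostic) are assumptions:

    declare <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(
        <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>,
        <vscale x 2 x i1>, i64, i64 immarg)

    define <vscale x 2 x i32> @demo(<vscale x 2 x i32> %maskedoff,
                                    <vscale x 2 x i32> %a,
                                    <vscale x 2 x i32> %b,
                                    <vscale x 2 x i1> %m, i64 %vl) {
      ; policy = 1: tail agnostic, mask undisturbed (assumed encoding)
      %r = call <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(
              <vscale x 2 x i32> %maskedoff, <vscale x 2 x i32> %a,
              <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i64 %vl, i64 1)
      ret <vscale x 2 x i32> %r
    }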
- // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVSaturatingBinaryAAShiftNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVSaturatingBinaryAAShiftUnMasked : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For Saturating binary operations with mask. // The destination vector type is the same as first source vector. // The second source operand matches the destination type or is an XLen scalar. - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVSaturatingBinaryAAShiftMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVSaturatingBinaryAAShiftMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -640,18 +652,19 @@ let TargetPrefix = "riscv" in { // For Saturating binary operations. // The destination vector type is NOT the same as first source vector. // The second source operand matches the destination type or is an XLen scalar. - // Input: (vector_in, vector_in/scalar_in, vl) - class RISCVSaturatingBinaryABShiftNoMask + // Input: (passthru, vector_in, vector_in/scalar_in, vl) + class RISCVSaturatingBinaryABShiftUnMasked : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_any_ty, llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, + llvm_anyint_ty], [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic { - let VLOperand = 2; + let VLOperand = 3; } // For Saturating binary operations with mask. // The destination vector type is NOT the same as first source vector (with mask). // The second source operand matches the destination type or is an XLen scalar. - // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta) - class RISCVSaturatingBinaryABShiftMask + // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, policy) + class RISCVSaturatingBinaryABShiftMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -659,56 +672,69 @@ let TargetPrefix = "riscv" in { [ImmArg>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic { let VLOperand = 4; } - class RISCVTernaryAAAXNoMask + // Input: (vector_in, vector_in, scalar_in, vl, policy) + class RVVSlideUnMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty, - LLVMMatchType<1>], - [IntrNoMem]>, RISCVVIntrinsic { + LLVMMatchType<1>, LLVMMatchType<1>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 3; } - class RISCVTernaryAAAXMask + // Input: (vector_in, vector_in, vector_in/scalar_in, mask, vl, policy) + class RVVSlideMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>], - [IntrNoMem]>, RISCVVIntrinsic { + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<1>, LLVMMatchType<1>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 4; } - class RISCVTernaryAAXANoMask + // UnMasked Vector Multiply-Add operations; their first operand cannot be undef.
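The renamed RVVSlide classes above make the policy operand explicit for the slide intrinsics; a sketch assuming a vslideup instantiation:

    ; RVVSlideUnMasked: (vector_in, vector_in, scalar_in, vl, policy)
    declare <vscale x 1 x i8> @llvm.riscv.vslideup.nxv1i8.i64(
        <vscale x 1 x i8>,   ; vector_in (destination being merged into)
        <vscale x 1 x i8>,   ; vector_in (slide source)
        i64,                 ; scalar_in (slide offset)
        i64,                 ; vl
        i64 immarg)          ; policy (a new trailing immediate, per the added ImmArg)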
+ // Input: (vector_in, vector_in/scalar, vector_in, vl, policy) + class RISCVTernaryAAXAUnMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, LLVMMatchType<0>, - llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + llvm_anyint_ty, LLVMMatchType<2>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { + let ScalarOperand = 1; let VLOperand = 3; } - class RISCVTernaryAAXAMask + // Masked Vector Multiply-Add operations; their first operand cannot be undef. + // Input: (vector_in, vector_in/scalar, vector_in, mask, vl, policy) + class RISCVTernaryAAXAMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty, LLVMMatchType<2>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { + let ScalarOperand = 1; let VLOperand = 4; } - class RISCVTernaryWideNoMask + // UnMasked Widening Vector Multiply-Add operations; their first operand cannot be undef. + // Input: (vector_in, vector_in/scalar, vector_in, vl, policy) + class RISCVTernaryWideUnMasked : Intrinsic< [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, llvm_anyvector_ty, - llvm_anyint_ty], - [IntrNoMem] >, RISCVVIntrinsic { - let SplatOperand = 1; + llvm_anyint_ty, LLVMMatchType<3>], + [ImmArg>, IntrNoMem] >, RISCVVIntrinsic { + let ScalarOperand = 1; let VLOperand = 3; } - class RISCVTernaryWideMask + // Masked Widening Vector Multiply-Add operations; their first operand cannot be undef. + // Input: (vector_in, vector_in/scalar, vector_in, mask, vl, policy) + class RISCVTernaryWideMasked : Intrinsic< [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty, LLVMMatchType<3>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { + let ScalarOperand = 1; let VLOperand = 4; } // For Reduction ternary operations. // For destination vector type is the same as first and third source vector. // Input: (vector_in, vector_in, vector_in, vl) - class RISCVReductionNoMask + class RISCVReductionUnMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>, llvm_anyint_ty], @@ -719,7 +745,7 @@ let TargetPrefix = "riscv" in { // For destination vector type is the same as first and third source vector. // The mask type comes from the second source vector.
// Input: (maskedoff, vector_in, vector_in, vector_in, mask, vl) - class RISCVReductionMask + class RISCVReductionMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, llvm_anyint_ty], @@ -729,7 +755,7 @@ let TargetPrefix = "riscv" in { // For unary operations with scalar type output without mask // Output: (scalar type) // Input: (vector_in, vl) - class RISCVMaskUnarySOutNoMask + class RISCVMaskedUnarySOutUnMasked : Intrinsic<[LLVMMatchType<1>], [llvm_anyvector_ty, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { @@ -738,23 +764,23 @@ let TargetPrefix = "riscv" in { // For unary operations with scalar type output with mask // Output: (scalar type) // Input: (vector_in, mask, vl) - class RISCVMaskUnarySOutMask + class RISCVMaskedUnarySOutMasked : Intrinsic<[LLVMMatchType<1>], [llvm_anyvector_ty, LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 2; } // For destination vector type is NOT the same as source vector. - // Input: (vector_in, vl) - class RISCVUnaryABNoMask + // Input: (passthru, vector_in, vl) + class RISCVUnaryABUnMasked : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } // For destination vector type is NOT the same as source vector (with mask). - // Input: (maskedoff, vector_in, mask, vl, ta) - class RISCVUnaryABMask + // Input: (maskedoff, vector_in, mask, vl, policy) + class RISCVUnaryABMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, @@ -765,7 +791,7 @@ let TargetPrefix = "riscv" in { // For unary operations with the same vector type in/out without mask // Output: (vector) // Input: (vector_in, vl) - class RISCVUnaryNoMask + class RISCVUnaryUnMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { @@ -774,7 +800,7 @@ let TargetPrefix = "riscv" in { // For mask unary operations with mask type in/out with mask // Output: (mask type output) // Input: (mask type maskedoff, mask type vector_in, mask, vl) - class RISCVMaskUnaryMOutMask + class RISCVMaskedUnaryMOutMasked : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty], @@ -785,21 +811,28 @@ let TargetPrefix = "riscv" in { // Input: (vl) class RISCVNullaryIntrinsic : Intrinsic<[llvm_anyvector_ty], - [llvm_anyint_ty], + [llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { + let VLOperand = 1; + } + // Output: (vector) + // Input: (passthru, vl) + class RISCVID + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 0; + let VLOperand = 1; } // For Conversion unary operations. - // Input: (vector_in, vl) - class RISCVConversionNoMask + // Input: (passthru, vector_in, vl) + class RISCVConversionUnMasked : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } // For Conversion unary operations with mask. 
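The new RISCVID class gives vid.v a passthru as well; sketched assuming an nxv2i32/i64 instantiation:

    ; RISCVID: (passthru, vl)
    declare <vscale x 2 x i32> @llvm.riscv.vid.nxv2i32.i64(
        <vscale x 2 x i32>,  ; passthru
        i64)                 ; vl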
- // Input: (maskedoff, vector_in, mask, vl, ta) - class RISCVConversionMask + // Input: (maskedoff, vector_in, mask, vl, policy) + class RISCVConversionMasked : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, @@ -809,17 +842,18 @@ let TargetPrefix = "riscv" in { } // For unit stride segment load - // Input: (pointer, vl) + // Input: (passthru, pointer, vl) class RISCVUSSegLoad : Intrinsic, !add(nf, -1))), - [LLVMPointerToElt<0>, llvm_anyint_ty], - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { - let VLOperand = 1; + !listconcat(!listsplat(LLVMMatchType<0>, nf), + [LLVMPointerToElt<0>, llvm_anyint_ty]), + [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + let VLOperand = !add(nf, 1); } // For unit stride segment load with mask - // Input: (maskedoff, pointer, mask, vl, ta) - class RISCVUSSegLoadMask + // Input: (maskedoff, pointer, mask, vl, policy) + class RISCVUSSegLoadMasked : Intrinsic, !add(nf, -1))), !listconcat(!listsplat(LLVMMatchType<0>, nf), @@ -832,23 +866,24 @@ let TargetPrefix = "riscv" in { } // For unit stride fault-only-first segment load - // Input: (pointer, vl) + // Input: (passthru, pointer, vl) // Output: (data, vl) // NOTE: We model this with default memory properties since we model writing // VL as a side effect. IntrReadMem, IntrHasSideEffects does not work. class RISCVUSSegLoadFF : Intrinsic, !add(nf, -1)), [llvm_anyint_ty]), - [LLVMPointerToElt<0>, LLVMMatchType<1>], - [NoCapture>]>, RISCVVIntrinsic { - let VLOperand = 1; + !listconcat(!listsplat(LLVMMatchType<0>, nf), + [LLVMPointerToElt<0>, LLVMMatchType<1>]), + [NoCapture>]>, RISCVVIntrinsic { + let VLOperand = !add(nf, 1); } // For unit stride fault-only-first segment load with mask - // Input: (maskedoff, pointer, mask, vl, ta) + // Input: (maskedoff, pointer, mask, vl, policy) // Output: (data, vl) // NOTE: We model this with default memory properties since we model writing // VL as a side effect. IntrReadMem, IntrHasSideEffects does not work. 
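Segment loads now take one passthru per field, which is why VLOperand becomes !add(nf, 1); a sketch for nf = 2, with the vlseg2 name and mangled suffix assumed:

    ; RISCVUSSegLoad, nf = 2: (passthru x 2, pointer, vl)
    declare { <vscale x 1 x i8>, <vscale x 1 x i8> }
        @llvm.riscv.vlseg2.nxv1i8.i64(
        <vscale x 1 x i8>,   ; passthru for field 0
        <vscale x 1 x i8>,   ; passthru for field 1
        i8*,                 ; pointer (to the element type)
        i64)                 ; vl, at operand index nf + 1 = 3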
- class RISCVUSSegLoadFFMask + class RISCVUSSegLoadFFMasked : Intrinsic, !add(nf, -1)), [llvm_anyint_ty]), !listconcat(!listsplat(LLVMMatchType<0>, nf), @@ -861,17 +896,18 @@ let TargetPrefix = "riscv" in { } // For stride segment load - // Input: (pointer, offset, vl) + // Input: (passthru, pointer, offset, vl) class RISCVSSegLoad : Intrinsic, !add(nf, -1))), - [LLVMPointerToElt<0>, llvm_anyint_ty, LLVMMatchType<1>], - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { - let VLOperand = 2; + !listconcat(!listsplat(LLVMMatchType<0>, nf), + [LLVMPointerToElt<0>, llvm_anyint_ty, LLVMMatchType<1>]), + [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + let VLOperand = !add(nf, 2); } // For stride segment load with mask - // Input: (maskedoff, pointer, offset, mask, vl, ta) - class RISCVSSegLoadMask + // Input: (maskedoff, pointer, offset, mask, vl, policy) + class RISCVSSegLoadMasked : Intrinsic, !add(nf, -1))), !listconcat(!listsplat(LLVMMatchType<0>, nf), @@ -885,17 +921,18 @@ let TargetPrefix = "riscv" in { } // For indexed segment load - // Input: (pointer, index, vl) + // Input: (passthru, pointer, index, vl) class RISCVISegLoad : Intrinsic, !add(nf, -1))), - [LLVMPointerToElt<0>, llvm_anyvector_ty, llvm_anyint_ty], - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { - let VLOperand = 2; + !listconcat(!listsplat(LLVMMatchType<0>, nf), + [LLVMPointerToElt<0>, llvm_anyvector_ty, llvm_anyint_ty]), + [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + let VLOperand = !add(nf, 2); } // For indexed segment load with mask - // Input: (maskedoff, pointer, index, mask, vl, ta) - class RISCVISegLoadMask + // Input: (maskedoff, pointer, index, mask, vl, policy) + class RISCVISegLoadMasked : Intrinsic, !add(nf, -1))), !listconcat(!listsplat(LLVMMatchType<0>, nf), @@ -920,7 +957,7 @@ let TargetPrefix = "riscv" in { } // For unit stride segment store with mask // Input: (value, pointer, mask, vl) - class RISCVUSSegStoreMask + class RISCVUSSegStoreMasked : Intrinsic<[], !listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>, !add(nf, -1)), @@ -944,7 +981,7 @@ let TargetPrefix = "riscv" in { } // For stride segment store with mask // Input: (value, pointer, offset, mask, vl) - class RISCVSSegStoreMask + class RISCVSSegStoreMasked : Intrinsic<[], !listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>, !add(nf, -1)), @@ -968,7 +1005,7 @@ let TargetPrefix = "riscv" in { } // For indexed segment store with mask // Input: (value, pointer, offset, mask, vl) - class RISCVISegStoreMask + class RISCVISegStoreMasked : Intrinsic<[], !listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>, !add(nf, -1)), @@ -981,76 +1018,76 @@ let TargetPrefix = "riscv" in { multiclass RISCVUSLoad { def "int_riscv_" # NAME : RISCVUSLoad; - def "int_riscv_" # NAME # "_mask" : RISCVUSLoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVUSLoadMasked; } multiclass RISCVUSLoadFF { def "int_riscv_" # NAME : RISCVUSLoadFF; - def "int_riscv_" # NAME # "_mask" : RISCVUSLoadFFMask; + def "int_riscv_" # NAME # "_mask" : RISCVUSLoadFFMasked; } multiclass RISCVSLoad { def "int_riscv_" # NAME : RISCVSLoad; - def "int_riscv_" # NAME # "_mask" : RISCVSLoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVSLoadMasked; } multiclass RISCVILoad { def "int_riscv_" # NAME : RISCVILoad; - def "int_riscv_" # NAME # "_mask" : RISCVILoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVILoadMasked; } multiclass RISCVUSStore { def "int_riscv_" # NAME : RISCVUSStore; - def "int_riscv_" # NAME # "_mask" : RISCVUSStoreMask; + def "int_riscv_" # NAME # "_mask" : 
RISCVUSStoreMasked; } multiclass RISCVSStore { def "int_riscv_" # NAME : RISCVSStore; - def "int_riscv_" # NAME # "_mask" : RISCVSStoreMask; + def "int_riscv_" # NAME # "_mask" : RISCVSStoreMasked; } multiclass RISCVIStore { def "int_riscv_" # NAME : RISCVIStore; - def "int_riscv_" # NAME # "_mask" : RISCVIStoreMask; + def "int_riscv_" # NAME # "_mask" : RISCVIStoreMasked; } multiclass RISCVUnaryAA { - def "int_riscv_" # NAME : RISCVUnaryAANoMask; - def "int_riscv_" # NAME # "_mask" : RISCVUnaryAAMask; + def "int_riscv_" # NAME : RISCVUnaryAAUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVUnaryAAMasked; } multiclass RISCVUnaryAB { - def "int_riscv_" # NAME : RISCVUnaryABNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVUnaryABMask; + def "int_riscv_" # NAME : RISCVUnaryABUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVUnaryABMasked; } // AAX means the destination type(A) is the same as the first source // type(A). X means any type for the second source operand. multiclass RISCVBinaryAAX { - def "int_riscv_" # NAME : RISCVBinaryAAXNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVBinaryAAXMask; + def "int_riscv_" # NAME : RISCVBinaryAAXUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVBinaryAAXMasked; } // Like RISCVBinaryAAX, but the second operand is used as a shift amount so it // must be a vector or an XLen scalar. multiclass RISCVBinaryAAShift { - def "int_riscv_" # NAME : RISCVBinaryAAShiftNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVBinaryAAShiftMask; + def "int_riscv_" # NAME : RISCVBinaryAAShiftUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVBinaryAAShiftMasked; } multiclass RISCVRGatherVV { - def "int_riscv_" # NAME : RISCVRGatherVVNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVRGatherVVMask; + def "int_riscv_" # NAME : RISCVRGatherVVUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVRGatherVVMasked; } multiclass RISCVRGatherVX { - def "int_riscv_" # NAME : RISCVGatherVXNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVGatherVXMask; + def "int_riscv_" # NAME : RISCVGatherVXUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVGatherVXMasked; } multiclass RISCVRGatherEI16VV { - def "int_riscv_" # NAME : RISCVRGatherEI16VVNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVRGatherEI16VVMask; + def "int_riscv_" # NAME : RISCVRGatherEI16VVUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVRGatherEI16VVMasked; } // ABX means the destination type(A) is different from the first source // type(B). X means any type for the second source operand. multiclass RISCVBinaryABX { - def "int_riscv_" # NAME : RISCVBinaryABXNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVBinaryABXMask; + def "int_riscv_" # NAME : RISCVBinaryABXUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVBinaryABXMasked; } // Like RISCVBinaryABX, but the second operand is used as a shift amount so it // must be a vector or an XLen scalar.
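Each multiclass pairs an UnMasked class with its Masked counterpart under the same user-visible name plus a "_mask" suffix. For the shift flavors the second source keeps its vector-or-XLen form and needs no ScalarOperand extension; a sketch of the vv shape, with the vsll instantiation assumed:

    ; RISCVBinaryAAShiftUnMasked: (passthru, vector_in, vector_in/scalar_in, vl)
    declare <vscale x 2 x i32> @llvm.riscv.vsll.nxv2i32.nxv2i32.i64(
        <vscale x 2 x i32>,  ; passthru
        <vscale x 2 x i32>,  ; vector_in
        <vscale x 2 x i32>,  ; shift amounts (vector form; an XLen scalar is also allowed)
        i64)                 ; vl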
multiclass RISCVBinaryABShift { - def "int_riscv_" # NAME : RISCVBinaryABShiftNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVBinaryABShiftMask; + def "int_riscv_" # NAME : RISCVBinaryABShiftUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVBinaryABShiftMasked; } multiclass RISCVBinaryWithV0 { def "int_riscv_" # NAME : RISCVBinaryWithV0; @@ -1062,80 +1099,80 @@ let TargetPrefix = "riscv" in { def "int_riscv_" # NAME : RISCVBinaryMOut; } multiclass RISCVSaturatingBinaryAAX { - def "int_riscv_" # NAME : RISCVSaturatingBinaryAAXNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryAAXMask; + def "int_riscv_" # NAME : RISCVSaturatingBinaryAAXUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryAAXMasked; } multiclass RISCVSaturatingBinaryAAShift { - def "int_riscv_" # NAME : RISCVSaturatingBinaryAAShiftNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryAAShiftMask; + def "int_riscv_" # NAME : RISCVSaturatingBinaryAAShiftUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryAAShiftMasked; } multiclass RISCVSaturatingBinaryABShift { - def "int_riscv_" # NAME : RISCVSaturatingBinaryABShiftNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryABShiftMask; + def "int_riscv_" # NAME : RISCVSaturatingBinaryABShiftUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVSaturatingBinaryABShiftMasked; } - multiclass RISCVTernaryAAAX { - def "int_riscv_" # NAME : RISCVTernaryAAAXNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVTernaryAAAXMask; + multiclass RVVSlide { + def "int_riscv_" # NAME : RVVSlideUnMasked; + def "int_riscv_" # NAME # "_mask" : RVVSlideMasked; } multiclass RISCVTernaryAAXA { - def "int_riscv_" # NAME : RISCVTernaryAAXANoMask; - def "int_riscv_" # NAME # "_mask" : RISCVTernaryAAXAMask; + def "int_riscv_" # NAME : RISCVTernaryAAXAUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVTernaryAAXAMasked; } multiclass RISCVCompare { - def "int_riscv_" # NAME : RISCVCompareNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVCompareMask; + def "int_riscv_" # NAME : RISCVCompareUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVCompareMasked; } multiclass RISCVClassify { - def "int_riscv_" # NAME : RISCVClassifyNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVClassifyMask; + def "int_riscv_" # NAME : RISCVClassifyUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVClassifyMasked; } multiclass RISCVTernaryWide { - def "int_riscv_" # NAME : RISCVTernaryWideNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVTernaryWideMask; + def "int_riscv_" # NAME : RISCVTernaryWideUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVTernaryWideMasked; } multiclass RISCVReduction { - def "int_riscv_" # NAME : RISCVReductionNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVReductionMask; + def "int_riscv_" # NAME : RISCVReductionUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVReductionMasked; } - multiclass RISCVMaskUnarySOut { - def "int_riscv_" # NAME : RISCVMaskUnarySOutNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVMaskUnarySOutMask; + multiclass RISCVMaskedUnarySOut { + def "int_riscv_" # NAME : RISCVMaskedUnarySOutUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVMaskedUnarySOutMasked; } - multiclass RISCVMaskUnaryMOut { - def "int_riscv_" # NAME : RISCVUnaryNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVMaskUnaryMOutMask; + multiclass RISCVMaskedUnaryMOut { + def "int_riscv_" # NAME : RISCVUnaryUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVMaskedUnaryMOutMasked; } multiclass RISCVConversion { - def 
"int_riscv_" #NAME :RISCVConversionNoMask; - def "int_riscv_" # NAME # "_mask" : RISCVConversionMask; + def "int_riscv_" #NAME :RISCVConversionUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVConversionMasked; } multiclass RISCVUSSegLoad { def "int_riscv_" # NAME : RISCVUSSegLoad; - def "int_riscv_" # NAME # "_mask" : RISCVUSSegLoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVUSSegLoadMasked; } multiclass RISCVUSSegLoadFF { def "int_riscv_" # NAME : RISCVUSSegLoadFF; - def "int_riscv_" # NAME # "_mask" : RISCVUSSegLoadFFMask; + def "int_riscv_" # NAME # "_mask" : RISCVUSSegLoadFFMasked; } multiclass RISCVSSegLoad { def "int_riscv_" # NAME : RISCVSSegLoad; - def "int_riscv_" # NAME # "_mask" : RISCVSSegLoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVSSegLoadMasked; } multiclass RISCVISegLoad { def "int_riscv_" # NAME : RISCVISegLoad; - def "int_riscv_" # NAME # "_mask" : RISCVISegLoadMask; + def "int_riscv_" # NAME # "_mask" : RISCVISegLoadMasked; } multiclass RISCVUSSegStore { def "int_riscv_" # NAME : RISCVUSSegStore; - def "int_riscv_" # NAME # "_mask" : RISCVUSSegStoreMask; + def "int_riscv_" # NAME # "_mask" : RISCVUSSegStoreMasked; } multiclass RISCVSSegStore { def "int_riscv_" # NAME : RISCVSSegStore; - def "int_riscv_" # NAME # "_mask" : RISCVSSegStoreMask; + def "int_riscv_" # NAME # "_mask" : RISCVSSegStoreMasked; } multiclass RISCVISegStore { def "int_riscv_" # NAME : RISCVISegStore; - def "int_riscv_" # NAME # "_mask" : RISCVISegStoreMask; + def "int_riscv_" # NAME # "_mask" : RISCVISegStoreMasked; } defm vle : RISCVUSLoad; @@ -1242,20 +1279,29 @@ let TargetPrefix = "riscv" in { defm vmerge : RISCVBinaryWithV0; + // Output: (vector) + // Input: (passthru, vector_in, vl) def int_riscv_vmv_v_v : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } + // Output: (vector) + // Input: (passthru, scalar, vl) def int_riscv_vmv_v_x : Intrinsic<[llvm_anyint_ty], - [LLVMVectorElementType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMVectorElementType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } + // Output: (vector) + // Input: (passthru, scalar, vl) def int_riscv_vfmv_v_f : Intrinsic<[llvm_anyfloat_ty], - [LLVMVectorElementType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMVectorElementType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } def int_riscv_vmv_x_s : Intrinsic<[LLVMVectorElementType<0>], @@ -1313,8 +1359,8 @@ let TargetPrefix = "riscv" in { defm vfmerge : RISCVBinaryWithV0; - defm vslideup : RISCVTernaryAAAX; - defm vslidedown : RISCVTernaryAAAX; + defm vslideup : RVVSlide; + defm vslidedown : RVVSlide; defm vslide1up : RISCVBinaryAAX; defm vslide1down : RISCVBinaryAAX; @@ -1325,7 +1371,7 @@ let TargetPrefix = "riscv" in { defm vrgather_vx : RISCVRGatherVX; defm vrgatherei16_vv : RISCVRGatherEI16VV; - def "int_riscv_vcompress" : RISCVUnaryAAMaskNoTA; + def "int_riscv_vcompress" : RISCVCompress; defm vaaddu : RISCVSaturatingBinaryAAX; defm vaadd : RISCVSaturatingBinaryAAX; @@ -1367,22 +1413,22 @@ let TargetPrefix = "riscv" in { defm vfwredusum : RISCVReduction; defm vfwredosum : RISCVReduction; - def int_riscv_vmand: RISCVBinaryAAANoMask; - def int_riscv_vmnand: RISCVBinaryAAANoMask; - def int_riscv_vmandn: RISCVBinaryAAANoMask; - def int_riscv_vmxor: RISCVBinaryAAANoMask; - def int_riscv_vmor: RISCVBinaryAAANoMask; 
- def int_riscv_vmnor: RISCVBinaryAAANoMask; - def int_riscv_vmorn: RISCVBinaryAAANoMask; - def int_riscv_vmxnor: RISCVBinaryAAANoMask; + def int_riscv_vmand: RISCVBinaryAAAUnMasked; + def int_riscv_vmnand: RISCVBinaryAAAUnMasked; + def int_riscv_vmandn: RISCVBinaryAAAUnMasked; + def int_riscv_vmxor: RISCVBinaryAAAUnMasked; + def int_riscv_vmor: RISCVBinaryAAAUnMasked; + def int_riscv_vmnor: RISCVBinaryAAAUnMasked; + def int_riscv_vmorn: RISCVBinaryAAAUnMasked; + def int_riscv_vmxnor: RISCVBinaryAAAUnMasked; def int_riscv_vmclr : RISCVNullaryIntrinsic; def int_riscv_vmset : RISCVNullaryIntrinsic; - defm vcpop : RISCVMaskUnarySOut; - defm vfirst : RISCVMaskUnarySOut; - defm vmsbf : RISCVMaskUnaryMOut; - defm vmsof : RISCVMaskUnaryMOut; - defm vmsif : RISCVMaskUnaryMOut; + defm vcpop : RISCVMaskedUnarySOut; + defm vfirst : RISCVMaskedUnarySOut; + defm vmsbf : RISCVMaskedUnaryMOut; + defm vmsof : RISCVMaskedUnaryMOut; + defm vmsif : RISCVMaskedUnaryMOut; defm vfcvt_xu_f_v : RISCVConversion; defm vfcvt_x_f_v : RISCVConversion; @@ -1409,34 +1455,35 @@ let TargetPrefix = "riscv" in { defm vfncvt_rod_f_f_w : RISCVConversion; // Output: (vector) - // Input: (mask type input, vl) + // Input: (passthru, mask type input, vl) def int_riscv_viota : Intrinsic<[llvm_anyvector_ty], - [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } // Output: (vector) - // Input: (maskedoff, mask type vector_in, mask, vl) + // Input: (maskedoff, mask type vector_in, mask, vl, policy) def int_riscv_viota_mask : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { + llvm_anyint_ty, LLVMMatchType<1>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 3; } // Output: (vector) - // Input: (vl) - def int_riscv_vid : RISCVNullaryIntrinsic; + // Input: (passthru, vl) + def int_riscv_vid : RISCVID; // Output: (vector) - // Input: (maskedoff, mask, vl) + // Input: (maskedoff, mask, vl, policy) def int_riscv_vid_mask : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { + llvm_anyint_ty, LLVMMatchType<1>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 2; } @@ -1463,6 +1510,16 @@ let TargetPrefix = "riscv" in { [llvm_anyvector_ty, llvm_anyptr_ty, llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [NoCapture>, IntrWriteMem]>; + + // Segment loads for fixed vectors. 
+ foreach nf = [2, 3, 4, 5, 6, 7, 8] in { + def int_riscv_seg # nf # _load + : Intrinsic, + !add(nf, -1))), + [llvm_anyptr_ty, llvm_anyint_ty], + [NoCapture>, IntrReadMem]>; + } + } // TargetPrefix = "riscv" //===----------------------------------------------------------------------===// @@ -1503,7 +1560,7 @@ class ScalarCryptoByteSelectAny : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i8_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, - ImmArg>, Returned>]>; + ImmArg>]>; // Zknd def int_riscv_aes32dsi : ScalarCryptoByteSelect32; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td new file mode 100644 index 000000000000..14c628595d30 --- /dev/null +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -0,0 +1,31 @@ +//===- IntrinsicsSPIRV.td - Defines SPIRV intrinsics -------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the SPIRV-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "spv" in { + def int_spv_assign_type : Intrinsic<[], [llvm_any_ty, llvm_metadata_ty]>; + def int_spv_assign_name : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; + + def int_spv_track_constant : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty]>; + def int_spv_init_global : Intrinsic<[], [llvm_any_ty, llvm_any_ty]>; + def int_spv_unref_global : Intrinsic<[], [llvm_any_ty]>; + + def int_spv_gep : Intrinsic<[llvm_anyptr_ty], [llvm_i1_ty, llvm_any_ty, llvm_vararg_ty], [ImmArg>]>; + def int_spv_load : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i16_ty, llvm_i8_ty], [ImmArg>, ImmArg>]>; + def int_spv_store : Intrinsic<[], [llvm_i32_ty, llvm_anyptr_ty, llvm_i16_ty, llvm_i8_ty], [ImmArg>, ImmArg>]>; + def int_spv_extractv : Intrinsic<[llvm_any_ty], [llvm_i32_ty, llvm_vararg_ty]>; + def int_spv_insertv : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_any_ty, llvm_vararg_ty]>; + def int_spv_extractelt : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_anyint_ty]>; + def int_spv_insertelt : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_any_ty, llvm_anyint_ty]>; + def int_spv_const_composite : Intrinsic<[llvm_i32_ty], [llvm_vararg_ty]>; + def int_spv_bitcast : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; + def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; +} diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td index a149b571072c..d881a1126bf2 100644 --- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td +++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// class SystemZUnaryConv - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[result], [arg], [IntrNoMem]>; class SystemZUnary @@ -24,14 +24,14 @@ class SystemZUnaryCC : SystemZUnaryConvCC; class SystemZBinaryConv - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[result], [arg, arg], [IntrNoMem]>; class SystemZBinary : SystemZBinaryConv; class SystemZBinaryInt - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[type], [type, llvm_i32_ty], [IntrNoMem]>; class 
SystemZBinaryConvCC @@ -45,7 +45,7 @@ class SystemZBinaryCC : SystemZBinaryConvCC; class SystemZTernaryConv - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[result], [arg, arg, result], [IntrNoMem]>; class SystemZTernaryConvCC @@ -55,7 +55,7 @@ class SystemZTernary : SystemZTernaryConv; class SystemZTernaryInt - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[type], [type, type, llvm_i32_ty], [IntrNoMem, ImmArg>]>; class SystemZTernaryIntCC @@ -63,7 +63,7 @@ class SystemZTernaryIntCC [IntrNoMem, ImmArg>]>; class SystemZQuaternaryInt - : GCCBuiltin<"__builtin_s390_" # name>, + : ClangBuiltin<"__builtin_s390_" # name>, Intrinsic<[type], [type, type, type, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -216,16 +216,16 @@ let TargetPrefix = "s390" in { def int_s390_tabort : Intrinsic<[], [llvm_i64_ty], [IntrNoReturn, Throws, IntrWriteMem]>; - def int_s390_tend : GCCBuiltin<"__builtin_tend">, + def int_s390_tend : ClangBuiltin<"__builtin_tend">, Intrinsic<[llvm_i32_ty], []>; - def int_s390_etnd : GCCBuiltin<"__builtin_tx_nesting_depth">, + def int_s390_etnd : ClangBuiltin<"__builtin_tx_nesting_depth">, Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; def int_s390_ntstg : Intrinsic<[], [llvm_i64_ty, llvm_ptr64_ty], [IntrArgMemOnly, IntrWriteMem]>; - def int_s390_ppa_txassist : GCCBuiltin<"__builtin_tx_assist">, + def int_s390_ppa_txassist : ClangBuiltin<"__builtin_tx_assist">, Intrinsic<[], [llvm_i32_ty]>; } @@ -236,24 +236,24 @@ let TargetPrefix = "s390" in { //===----------------------------------------------------------------------===// let TargetPrefix = "s390" in { - def int_s390_lcbb : GCCBuiltin<"__builtin_s390_lcbb">, + def int_s390_lcbb : ClangBuiltin<"__builtin_s390_lcbb">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vlbb : GCCBuiltin<"__builtin_s390_vlbb">, + def int_s390_vlbb : ClangBuiltin<"__builtin_s390_vlbb">, Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, ImmArg>]>; - def int_s390_vll : GCCBuiltin<"__builtin_s390_vll">, + def int_s390_vll : ClangBuiltin<"__builtin_s390_vll">, Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_s390_vpdi : GCCBuiltin<"__builtin_s390_vpdi">, + def int_s390_vpdi : ClangBuiltin<"__builtin_s390_vpdi">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vperm : GCCBuiltin<"__builtin_s390_vperm">, + def int_s390_vperm : ClangBuiltin<"__builtin_s390_vperm">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; @@ -264,7 +264,7 @@ let TargetPrefix = "s390" in { defm int_s390_vpkls : SystemZBinaryTruncHFG<"vpkls">; defm int_s390_vpkls : SystemZBinaryTruncCCHFG; - def int_s390_vstl : GCCBuiltin<"__builtin_s390_vstl">, + def int_s390_vstl : ClangBuiltin<"__builtin_s390_vstl">, Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], [IntrArgMemOnly, IntrWriteMem]>; @@ -314,7 +314,7 @@ let TargetPrefix = "s390" in { def int_s390_vsrl : SystemZBinary<"vsrl", llvm_v16i8_ty>; def int_s390_vsrlb : SystemZBinary<"vsrlb", llvm_v16i8_ty>; - def int_s390_vsldb : GCCBuiltin<"__builtin_s390_vsldb">, + def int_s390_vsldb : ClangBuiltin<"__builtin_s390_vsldb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -382,7 +382,7 @@ let TargetPrefix = "s390" in { def int_s390_vbperm : SystemZBinaryConv<"vbperm", llvm_v2i64_ty, 
llvm_v16i8_ty>; - def int_s390_vmslg : GCCBuiltin<"__builtin_s390_vmslg">, + def int_s390_vmslg : ClangBuiltin<"__builtin_s390_vmslg">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -411,21 +411,21 @@ let TargetPrefix = "s390" in { [IntrNoMem, ImmArg>, ImmArg>]>; // Instructions from the Vector Packed Decimal Facility - def int_s390_vlrl : GCCBuiltin<"__builtin_s390_vlrl">, + def int_s390_vlrl : ClangBuiltin<"__builtin_s390_vlrl">, Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_s390_vstrl : GCCBuiltin<"__builtin_s390_vstrl">, + def int_s390_vstrl : ClangBuiltin<"__builtin_s390_vstrl">, Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], [IntrArgMemOnly, IntrWriteMem]>; // Instructions from the Vector Enhancements Facility 2 - def int_s390_vsld : GCCBuiltin<"__builtin_s390_vsld">, + def int_s390_vsld : ClangBuiltin<"__builtin_s390_vsld">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vsrd : GCCBuiltin<"__builtin_s390_vsrd">, + def int_s390_vsrd : ClangBuiltin<"__builtin_s390_vsrd">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -438,23 +438,23 @@ let TargetPrefix = "s390" in { def int_s390_vstrszf : SystemZTernaryConvCC; // Instructions from the NNP-assist Facility - def int_s390_vclfnhs : GCCBuiltin<"__builtin_s390_vclfnhs">, + def int_s390_vclfnhs : ClangBuiltin<"__builtin_s390_vclfnhs">, Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vclfnls : GCCBuiltin<"__builtin_s390_vclfnls">, + def int_s390_vclfnls : ClangBuiltin<"__builtin_s390_vclfnls">, Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vcrnfs : GCCBuiltin<"__builtin_s390_vcrnfs">, + def int_s390_vcrnfs : ClangBuiltin<"__builtin_s390_vcrnfs">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vcfn : GCCBuiltin<"__builtin_s390_vcfn">, + def int_s390_vcfn : ClangBuiltin<"__builtin_s390_vcfn">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_s390_vcnf : GCCBuiltin<"__builtin_s390_vcnf">, + def int_s390_vcnf : ClangBuiltin<"__builtin_s390_vcnf">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -467,9 +467,9 @@ let TargetPrefix = "s390" in { //===----------------------------------------------------------------------===// let TargetPrefix = "s390" in { - def int_s390_sfpc : GCCBuiltin<"__builtin_s390_sfpc">, + def int_s390_sfpc : ClangBuiltin<"__builtin_s390_sfpc">, Intrinsic<[], [llvm_i32_ty], []>; - def int_s390_efpc : GCCBuiltin<"__builtin_s390_efpc">, + def int_s390_efpc : ClangBuiltin<"__builtin_s390_efpc">, Intrinsic<[llvm_i32_ty], [], []>; def int_s390_tdc : Intrinsic<[llvm_i32_ty], [llvm_anyfloat_ty, llvm_i64_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsVE.td b/llvm/include/llvm/IR/IntrinsicsVE.td index be4bccef0cc1..15b828b320ea 100644 --- a/llvm/include/llvm/IR/IntrinsicsVE.td +++ b/llvm/include/llvm/IR/IntrinsicsVE.td @@ -2,31 +2,28 @@ // VEL Intrinsic instructions. 
let TargetPrefix = "ve" in { - def int_ve_vl_svob : GCCBuiltin<"__builtin_ve_vl_svob">, - Intrinsic<[], [], [IntrHasSideEffects]>; - - def int_ve_vl_pack_f32p : GCCBuiltin<"__builtin_ve_vl_pack_f32p">, + def int_ve_vl_pack_f32p : ClangBuiltin<"__builtin_ve_vl_pack_f32p">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_ptr_ty], [IntrReadMem]>; - def int_ve_vl_pack_f32a : GCCBuiltin<"__builtin_ve_vl_pack_f32a">, + def int_ve_vl_pack_f32a : ClangBuiltin<"__builtin_ve_vl_pack_f32a">, Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrReadMem]>; def int_ve_vl_extract_vm512u : - GCCBuiltin<"__builtin_ve_vl_extract_vm512u">, + ClangBuiltin<"__builtin_ve_vl_extract_vm512u">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; def int_ve_vl_extract_vm512l : - GCCBuiltin<"__builtin_ve_vl_extract_vm512l">, + ClangBuiltin<"__builtin_ve_vl_extract_vm512l">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; def int_ve_vl_insert_vm512u : - GCCBuiltin<"__builtin_ve_vl_insert_vm512u">, + ClangBuiltin<"__builtin_ve_vl_insert_vm512u">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; def int_ve_vl_insert_vm512l : - GCCBuiltin<"__builtin_ve_vl_insert_vm512l">, + ClangBuiltin<"__builtin_ve_vl_insert_vm512l">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td index 67cbd307903d..554dd8557200 100644 --- a/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td +++ b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td @@ -1,1213 +1,1257 @@ -let TargetPrefix = "ve" in def int_ve_vl_vld_vssl : GCCBuiltin<"__builtin_ve_vl_vld_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vld_vssvl : GCCBuiltin<"__builtin_ve_vl_vld_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu_vssl : GCCBuiltin<"__builtin_ve_vl_vldu_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssl : GCCBuiltin<"__builtin_ve_vl_vldunc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldunc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssl : GCCBuiltin<"__builtin_ve_vl_vldlsx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlsx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldlsxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssvl : 
GCCBuiltin<"__builtin_ve_vl_vldlsxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssl : GCCBuiltin<"__builtin_ve_vl_vldlzx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlzx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldlzxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlzxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssl : GCCBuiltin<"__builtin_ve_vl_vld2d_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssvl : GCCBuiltin<"__builtin_ve_vl_vld2d_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vld2dnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vld2dnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssl : GCCBuiltin<"__builtin_ve_vl_vldu2d_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu2d_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldu2dnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu2dnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dsx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dsx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dzx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dzx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], 
[IntrReadMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst_vssl : GCCBuiltin<"__builtin_ve_vl_vst_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst_vssml : GCCBuiltin<"__builtin_ve_vl_vst_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstnc_vssl : GCCBuiltin<"__builtin_ve_vl_vstnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstnc_vssml : GCCBuiltin<"__builtin_ve_vl_vstnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstot_vssl : GCCBuiltin<"__builtin_ve_vl_vstot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstot_vssml : GCCBuiltin<"__builtin_ve_vl_vstot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstncot_vssl : GCCBuiltin<"__builtin_ve_vl_vstncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstncot_vssml : GCCBuiltin<"__builtin_ve_vl_vstncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu_vssl : GCCBuiltin<"__builtin_ve_vl_vstu_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu_vssml : GCCBuiltin<"__builtin_ve_vl_vstu_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstunc_vssl : GCCBuiltin<"__builtin_ve_vl_vstunc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstunc_vssml : GCCBuiltin<"__builtin_ve_vl_vstunc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstuot_vssl : GCCBuiltin<"__builtin_ve_vl_vstuot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstuot_vssml : GCCBuiltin<"__builtin_ve_vl_vstuot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstuncot_vssl : GCCBuiltin<"__builtin_ve_vl_vstuncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstuncot_vssml : GCCBuiltin<"__builtin_ve_vl_vstuncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl_vssl : GCCBuiltin<"__builtin_ve_vl_vstl_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl_vssml : GCCBuiltin<"__builtin_ve_vl_vstl_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlnc_vssl : GCCBuiltin<"__builtin_ve_vl_vstlnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlnc_vssml : GCCBuiltin<"__builtin_ve_vl_vstlnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlot_vssl : GCCBuiltin<"__builtin_ve_vl_vstlot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlot_vssml : GCCBuiltin<"__builtin_ve_vl_vstlot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlncot_vssl : GCCBuiltin<"__builtin_ve_vl_vstlncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstlncot_vssml : GCCBuiltin<"__builtin_ve_vl_vstlncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2d_vssl : GCCBuiltin<"__builtin_ve_vl_vst2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2d_vssml : GCCBuiltin<"__builtin_ve_vl_vst2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vst2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dnc_vssml : GCCBuiltin<"__builtin_ve_vl_vst2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dot_vssl : GCCBuiltin<"__builtin_ve_vl_vst2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dot_vssml : GCCBuiltin<"__builtin_ve_vl_vst2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dncot_vssl : GCCBuiltin<"__builtin_ve_vl_vst2dncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vst2dncot_vssml : GCCBuiltin<"__builtin_ve_vl_vst2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2d_vssl : GCCBuiltin<"__builtin_ve_vl_vstu2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2d_vssml : GCCBuiltin<"__builtin_ve_vl_vstu2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vstu2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dnc_vssml : GCCBuiltin<"__builtin_ve_vl_vstu2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dot_vssl : GCCBuiltin<"__builtin_ve_vl_vstu2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dot_vssml : GCCBuiltin<"__builtin_ve_vl_vstu2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dncot_vssl : GCCBuiltin<"__builtin_ve_vl_vstu2dncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstu2dncot_vssml : GCCBuiltin<"__builtin_ve_vl_vstu2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2d_vssl : GCCBuiltin<"__builtin_ve_vl_vstl2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2d_vssml : GCCBuiltin<"__builtin_ve_vl_vstl2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vstl2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dnc_vssml : GCCBuiltin<"__builtin_ve_vl_vstl2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dot_vssl : GCCBuiltin<"__builtin_ve_vl_vstl2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dot_vssml : GCCBuiltin<"__builtin_ve_vl_vstl2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dncot_vssl : GCCBuiltin<"__builtin_ve_vl_vstl2dncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vstl2dncot_vssml : GCCBuiltin<"__builtin_ve_vl_vstl2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pfchv_ssl : GCCBuiltin<"__builtin_ve_vl_pfchv_ssl">, Intrinsic<[], [LLVMType, llvm_ptr_ty, LLVMType], [IntrInaccessibleMemOrArgMemOnly]>;
-let TargetPrefix = "ve" in def int_ve_vl_pfchvnc_ssl : GCCBuiltin<"__builtin_ve_vl_pfchvnc_ssl">, Intrinsic<[], [LLVMType, llvm_ptr_ty, LLVMType], [IntrInaccessibleMemOrArgMemOnly]>;
-let TargetPrefix = "ve" in def int_ve_vl_lsv_vvss : GCCBuiltin<"__builtin_ve_vl_lsv_vvss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_lvsl_svs : GCCBuiltin<"__builtin_ve_vl_lvsl_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_lvsd_svs : GCCBuiltin<"__builtin_ve_vl_lvsd_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_lvss_svs : GCCBuiltin<"__builtin_ve_vl_lvss_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_lvm_mmss : GCCBuiltin<"__builtin_ve_vl_lvm_mmss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_lvm_MMss : GCCBuiltin<"__builtin_ve_vl_lvm_MMss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_svm_sms : GCCBuiltin<"__builtin_ve_vl_svm_sms">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_svm_sMs : GCCBuiltin<"__builtin_ve_vl_svm_sMs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsl : GCCBuiltin<"__builtin_ve_vl_vbrdd_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsvl : GCCBuiltin<"__builtin_ve_vl_vbrdd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsmvl : GCCBuiltin<"__builtin_ve_vl_vbrdd_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsl : GCCBuiltin<"__builtin_ve_vl_vbrdl_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsvl : GCCBuiltin<"__builtin_ve_vl_vbrdl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsmvl : GCCBuiltin<"__builtin_ve_vl_vbrdl_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsl : GCCBuiltin<"__builtin_ve_vl_vbrds_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsvl : GCCBuiltin<"__builtin_ve_vl_vbrds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsmvl : GCCBuiltin<"__builtin_ve_vl_vbrds_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsl : GCCBuiltin<"__builtin_ve_vl_vbrdw_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsvl : GCCBuiltin<"__builtin_ve_vl_vbrdw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsmvl : GCCBuiltin<"__builtin_ve_vl_vbrdw_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsl : GCCBuiltin<"__builtin_ve_vl_pvbrd_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsvl : GCCBuiltin<"__builtin_ve_vl_pvbrd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsMvl : GCCBuiltin<"__builtin_ve_vl_pvbrd_vsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvl : GCCBuiltin<"__builtin_ve_vl_vmv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmv_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vaddul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vadduw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vsvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvaddu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vaddswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vaddswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvadds_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vaddsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vsubul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vsubuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvsubu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vsubswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vsubswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvsubs_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vsubsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmulul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmuluw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmulswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmulswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmulsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vvvl : GCCBuiltin<"__builtin_ve_vl_vmulslw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmulslw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vsvl : GCCBuiltin<"__builtin_ve_vl_vmulslw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmulslw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vdivul_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vdivuw_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vdivswsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vdivswzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vdivsl_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvcmpu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvcmps_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vcmpsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvmaxs_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vminswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vminswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvmins_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmaxsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vminsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vvvl : GCCBuiltin<"__builtin_ve_vl_vand_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vvvvl : GCCBuiltin<"__builtin_ve_vl_vand_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vsvl : GCCBuiltin<"__builtin_ve_vl_vand_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vsvvl : GCCBuiltin<"__builtin_ve_vl_vand_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vand_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vand_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vand_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvl : GCCBuiltin<"__builtin_ve_vl_pvand_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvand_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvl : GCCBuiltin<"__builtin_ve_vl_pvand_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvand_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvand_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvand_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vvvl : GCCBuiltin<"__builtin_ve_vl_vor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vvvvl : GCCBuiltin<"__builtin_ve_vl_vor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vsvl : GCCBuiltin<"__builtin_ve_vl_vor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vsvvl : GCCBuiltin<"__builtin_ve_vl_vor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vor_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vor_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vor_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvl : GCCBuiltin<"__builtin_ve_vl_pvor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvl : GCCBuiltin<"__builtin_ve_vl_pvor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvor_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvor_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvl : GCCBuiltin<"__builtin_ve_vl_vxor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvvl : GCCBuiltin<"__builtin_ve_vl_vxor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvl : GCCBuiltin<"__builtin_ve_vl_vxor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvvl : GCCBuiltin<"__builtin_ve_vl_vxor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vxor_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vxor_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvxor_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvl : GCCBuiltin<"__builtin_ve_vl_veqv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvvl : GCCBuiltin<"__builtin_ve_vl_veqv_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvl : GCCBuiltin<"__builtin_ve_vl_veqv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvvl : GCCBuiltin<"__builtin_ve_vl_veqv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvmvl : GCCBuiltin<"__builtin_ve_vl_veqv_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvmvl : GCCBuiltin<"__builtin_ve_vl_veqv_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pveqv_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vseq_vl : GCCBuiltin<"__builtin_ve_vl_vseq_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vseq_vvl : GCCBuiltin<"__builtin_ve_vl_vseq_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvseqlo_vl : GCCBuiltin<"__builtin_ve_vl_pvseqlo_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvseqlo_vvl : GCCBuiltin<"__builtin_ve_vl_pvseqlo_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsequp_vl : GCCBuiltin<"__builtin_ve_vl_pvsequp_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsequp_vvl : GCCBuiltin<"__builtin_ve_vl_pvsequp_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvseq_vl : GCCBuiltin<"__builtin_ve_vl_pvseq_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvseq_vvl : GCCBuiltin<"__builtin_ve_vl_pvseq_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvl : GCCBuiltin<"__builtin_ve_vl_vsll_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsll_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsl : GCCBuiltin<"__builtin_ve_vl_vsll_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsvl : GCCBuiltin<"__builtin_ve_vl_vsll_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsll_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vsll_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsMvl : GCCBuiltin<"__builtin_ve_vl_pvsll_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let
TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsvl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vsrl_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsMvl : GCCBuiltin<"__builtin_ve_vl_pvsrl_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vslawsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let 
TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vslawzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsMvl : GCCBuiltin<"__builtin_ve_vl_pvsla_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvl : GCCBuiltin<"__builtin_ve_vl_vslal_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvvl : GCCBuiltin<"__builtin_ve_vl_vslal_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsl : GCCBuiltin<"__builtin_ve_vl_vslal_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsvl : GCCBuiltin<"__builtin_ve_vl_vslal_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vslal_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vslal_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; 
-let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vsrawsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsvl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vsrawzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsMvl : GCCBuiltin<"__builtin_ve_vl_pvsra_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvl : GCCBuiltin<"__builtin_ve_vl_vsral_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvvl : GCCBuiltin<"__builtin_ve_vl_vsral_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsl : GCCBuiltin<"__builtin_ve_vl_vsral_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsvl : GCCBuiltin<"__builtin_ve_vl_vsral_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vsral_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsmvl : GCCBuiltin<"__builtin_ve_vl_vsral_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, 
LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssl : GCCBuiltin<"__builtin_ve_vl_vsfa_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssvl : GCCBuiltin<"__builtin_ve_vl_vsfa_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vsfa_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfaddd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfadds_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vvvMvl">, 
Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfadd_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfsubd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfsubs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfsub_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvl 
: GCCBuiltin<"__builtin_ve_vl_vfmuld_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmuld_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmuld_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmuld_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmuld_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmuld_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmuls_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmul_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_vfdivd_vsvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfdivd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfdivs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsqrtd_vvl : GCCBuiltin<"__builtin_ve_vl_vfsqrtd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsqrtd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfsqrtd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsqrts_vvl : GCCBuiltin<"__builtin_ve_vl_vfsqrts_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsqrts_vvvl : GCCBuiltin<"__builtin_ve_vl_vfsqrts_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfcmpd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvl : 
GCCBuiltin<"__builtin_ve_vl_vfcmps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfcmps_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvl : GCCBuiltin<"__builtin_ve_vl_vfcmps_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfcmps_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfcmps_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfcmps_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfcmp_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmaxd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_vfmaxs_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmaxs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmax_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmind_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmins_vvvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmins_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix 
= "ve" in def int_ve_vl_vfmins_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmins_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmin_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmadd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in 
def int_ve_vl_vfmads_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmads_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmad_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, 
LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfmsbs_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvvl : 
GCCBuiltin<"__builtin_ve_vl_pvfmsb_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfmsb_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmadd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, 
LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmads_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmad_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvmvl : GCCBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvMvl : GCCBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrcpd_vvl : GCCBuiltin<"__builtin_ve_vl_vrcpd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrcpd_vvvl : GCCBuiltin<"__builtin_ve_vl_vrcpd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrcps_vvl : GCCBuiltin<"__builtin_ve_vl_vrcps_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrcps_vvvl : GCCBuiltin<"__builtin_ve_vl_vrcps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrcp_vvl : GCCBuiltin<"__builtin_ve_vl_pvrcp_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrcp_vvvl : GCCBuiltin<"__builtin_ve_vl_pvrcp_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtd_vvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtd_vvvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrts_vvl : GCCBuiltin<"__builtin_ve_vl_vrsqrts_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrts_vvvl : GCCBuiltin<"__builtin_ve_vl_vrsqrts_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrsqrt_vvl : GCCBuiltin<"__builtin_ve_vl_pvrsqrt_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrsqrt_vvvl : GCCBuiltin<"__builtin_ve_vl_pvrsqrt_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtdnex_vvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtdnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtdnex_vvvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtdnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtsnex_vvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtsnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vrsqrtsnex_vvvl : GCCBuiltin<"__builtin_ve_vl_vrsqrtsnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrsqrtnex_vvl : GCCBuiltin<"__builtin_ve_vl_pvrsqrtnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_pvrsqrtnex_vvvl : GCCBuiltin<"__builtin_ve_vl_pvrsqrtnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzx_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzx_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
-let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvl : GCCBuiltin<"__builtin_ve_vl_pvcvtws_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvvl : GCCBuiltin<"__builtin_ve_vl_pvcvtws_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvMvl : GCCBuiltin<"__builtin_ve_vl_pvcvtws_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvl : GCCBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvvl : GCCBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvMvl : GCCBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtld_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtld_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtld_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtldrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtldrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcvtldrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtdw_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtdw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtdw_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtdw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtsw_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtsw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtsw_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtsw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], 
[IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtsw_vvl : GCCBuiltin<"__builtin_ve_vl_pvcvtsw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvcvtsw_vvvl : GCCBuiltin<"__builtin_ve_vl_pvcvtsw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtdl_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtdl_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtdl_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtdl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtds_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtds_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtds_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtsd_vvl : GCCBuiltin<"__builtin_ve_vl_vcvtsd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcvtsd_vvvl : GCCBuiltin<"__builtin_ve_vl_vcvtsd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrg_vvvml : GCCBuiltin<"__builtin_ve_vl_vmrg_vvvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrg_vvvmvl : GCCBuiltin<"__builtin_ve_vl_vmrg_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrg_vsvml : GCCBuiltin<"__builtin_ve_vl_vmrg_vsvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrg_vsvmvl : GCCBuiltin<"__builtin_ve_vl_vmrg_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vvvMl : GCCBuiltin<"__builtin_ve_vl_vmrgw_vvvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vvvMvl : GCCBuiltin<"__builtin_ve_vl_vmrgw_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vsvMl : GCCBuiltin<"__builtin_ve_vl_vmrgw_vsvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vsvMvl : GCCBuiltin<"__builtin_ve_vl_vmrgw_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vshf_vvvsl : GCCBuiltin<"__builtin_ve_vl_vshf_vvvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vshf_vvvsvl : GCCBuiltin<"__builtin_ve_vl_vshf_vvvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vcp_vvmvl : GCCBuiltin<"__builtin_ve_vl_vcp_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vex_vvmvl : GCCBuiltin<"__builtin_ve_vl_vex_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklat_ml : 
GCCBuiltin<"__builtin_ve_vl_vfmklat_ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklaf_ml : GCCBuiltin<"__builtin_ve_vl_vfmklaf_ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkat_Ml : GCCBuiltin<"__builtin_ve_vl_pvfmkat_Ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkaf_Ml : GCCBuiltin<"__builtin_ve_vl_pvfmkaf_Ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkllt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkllt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkllt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkllt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklne_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklne_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkleq_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkleq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkleq_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkleq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklge_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklge_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklle_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklle_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnum_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnum_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklltnan_mvl : 
GCCBuiltin<"__builtin_ve_vl_vfmklltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklltnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklnenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkleqnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkleqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkleqnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkleqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmklgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmklgenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmklgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkllenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkllenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkllenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkllenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwlt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwlt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwlt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwlt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwne_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwne_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkweq_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkweq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkweq_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkweq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwge_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwge_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwle_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwle_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwle_mvml">, Intrinsic<[LLVMType], [LLVMType, 
LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnum_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnum_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwltnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwltnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwnenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkweqnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkweqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkweqnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkweqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwgenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwlenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkwlenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkwlenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkwlenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlolt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], 
[IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwuplt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlolt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwuplt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlone_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlone_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupne_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlone_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlone_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupne_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeq_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwloeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeq_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeq_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwloeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeq_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloge_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwloge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupge_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloge_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwloge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupge_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlole_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlole_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuple_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwuple_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlole_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlole_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuple_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwuple_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonum_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnum_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], 
[IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonum_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnum_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogtnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogtnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloltnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwloltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupltnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloltnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwloltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupltnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlonenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeqnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwloeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeqnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeqnan_mvml : 
GCCBuiltin<"__builtin_ve_vl_pvfmkwloeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeqnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlogenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwupgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlolenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwuplenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwlolenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkwuplenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgt_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgt_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlt_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlt_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwne_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwne_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwne_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwne_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkweq_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkweq_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkweq_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkweq_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwge_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwge_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwge_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwge_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwle_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwle_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_pvfmkwle_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwle_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnum_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnum_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnum_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnum_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgtnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgtnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgtnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgtnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwltnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwltnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwltnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwltnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwnenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkweqnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkweqnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkweqnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkweqnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwgenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkwlenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdlt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdlt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdlt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdlt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_vfmkdne_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdne_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdeq_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdeq_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdge_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdge_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdle_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdle_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnum_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnum_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdltnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdltnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdnenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdeqnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdeqnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdgenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdgenan_mvml">, 
Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdlenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkdlenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkdlenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkdlenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkslt_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkslt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkslt_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkslt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksne_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksne_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkseq_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkseq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkseq_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkseq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksge_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksge_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksle_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksle_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnum_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnum_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksltnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_vfmksltnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksnenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkseqnan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkseqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkseqnan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkseqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmksgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmksgenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmksgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkslenan_mvl : GCCBuiltin<"__builtin_ve_vl_vfmkslenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfmkslenan_mvml : GCCBuiltin<"__builtin_ve_vl_vfmkslenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslogt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslogt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslolt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplt_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksuplt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslolt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplt_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksuplt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslone_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslone_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupne_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslone_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslone_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupne_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_pvfmksloeq_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksloeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeq_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeq_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksloeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeq_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloge_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksloge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupge_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloge_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksloge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupge_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslole_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslole_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuple_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksuple_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslole_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslole_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuple_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksuple_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonum_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslonum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnum_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonum_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslonum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnum_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslonan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslonan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogtnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslogtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = 
"ve" in def int_ve_vl_pvfmksupgtnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogtnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslogtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgtnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloltnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksloltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupltnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloltnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksloltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupltnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslonenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslonenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeqnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksloeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeqnan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeqnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksloeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeqnan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslogenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksupgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslogenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksupgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolenan_mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslolenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplenan_mvl : 
GCCBuiltin<"__builtin_ve_vl_pvfmksuplenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmkslolenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplenan_mvml : GCCBuiltin<"__builtin_ve_vl_pvfmksuplenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgt_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksgt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgt_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksgt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslt_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslt_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkslt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksne_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksne_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksne_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksne_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkseq_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkseq_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkseq_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkseq_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksge_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksge_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksge_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksge_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksle_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksle_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksle_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksle_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnum_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksnum_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnum_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksnum_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgtnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksgtnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgtnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksgtnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksltnan_Mvl : 
GCCBuiltin<"__builtin_ve_vl_pvfmksltnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksltnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksltnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksnenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksnenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksnenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkseqnan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkseqnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkseqnan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkseqnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmksgenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmksgenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmksgenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslenan_Mvl : GCCBuiltin<"__builtin_ve_vl_pvfmkslenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pvfmkslenan_MvMl : GCCBuiltin<"__builtin_ve_vl_pvfmkslenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsumwsx_vvl : GCCBuiltin<"__builtin_ve_vl_vsumwsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsumwsx_vvml : GCCBuiltin<"__builtin_ve_vl_vsumwsx_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsumwzx_vvl : GCCBuiltin<"__builtin_ve_vl_vsumwzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsumwzx_vvml : GCCBuiltin<"__builtin_ve_vl_vsumwzx_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsuml_vvl : GCCBuiltin<"__builtin_ve_vl_vsuml_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsuml_vvml : GCCBuiltin<"__builtin_ve_vl_vsuml_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsumd_vvl : GCCBuiltin<"__builtin_ve_vl_vfsumd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsumd_vvml : GCCBuiltin<"__builtin_ve_vl_vfsumd_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsums_vvl : GCCBuiltin<"__builtin_ve_vl_vfsums_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfsums_vvml : GCCBuiltin<"__builtin_ve_vl_vfsums_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstsx_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswfstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswfstsx_vvvl">, 
Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstsx_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswlstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswlstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstzx_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswfstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswfstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstzx_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswlstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxswlstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswfstsx_vvl : GCCBuiltin<"__builtin_ve_vl_vrminswfstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswfstsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminswfstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswlstsx_vvl : GCCBuiltin<"__builtin_ve_vl_vrminswlstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswlstsx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminswlstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswfstzx_vvl : GCCBuiltin<"__builtin_ve_vl_vrminswfstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswfstzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminswfstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswlstzx_vvl : GCCBuiltin<"__builtin_ve_vl_vrminswlstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminswlstzx_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminswlstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxslfst_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxslfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxslfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxslfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxsllst_vvl : GCCBuiltin<"__builtin_ve_vl_vrmaxsllst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrmaxsllst_vvvl : GCCBuiltin<"__builtin_ve_vl_vrmaxsllst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminslfst_vvl : GCCBuiltin<"__builtin_ve_vl_vrminslfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminslfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminslfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def 
int_ve_vl_vrminsllst_vvl : GCCBuiltin<"__builtin_ve_vl_vrminsllst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrminsllst_vvvl : GCCBuiltin<"__builtin_ve_vl_vrminsllst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdfst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxdfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxdfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdlst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxdlst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdlst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxdlst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxsfst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxsfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxsfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxsfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxslst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxslst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmaxslst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmaxslst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmindfst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmindfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmindfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmindfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmindlst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrmindlst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrmindlst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrmindlst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrminsfst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrminsfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrminsfst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrminsfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrminslst_vvl : GCCBuiltin<"__builtin_ve_vl_vfrminslst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vfrminslst_vvvl : GCCBuiltin<"__builtin_ve_vl_vfrminslst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrand_vvl : GCCBuiltin<"__builtin_ve_vl_vrand_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrand_vvml : GCCBuiltin<"__builtin_ve_vl_vrand_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vror_vvl : GCCBuiltin<"__builtin_ve_vl_vror_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vror_vvml : 
GCCBuiltin<"__builtin_ve_vl_vror_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrxor_vvl : GCCBuiltin<"__builtin_ve_vl_vrxor_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vrxor_vvml : GCCBuiltin<"__builtin_ve_vl_vrxor_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssl : GCCBuiltin<"__builtin_ve_vl_vgt_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgt_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssml : GCCBuiltin<"__builtin_ve_vl_vgt_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgt_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtu_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtu_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtu_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtu_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtunc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtunc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtunc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtunc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssl : 
GCCBuiltin<"__builtin_ve_vl_vgtlsx_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtlsx_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtlsx_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtlsx_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtlzx_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtlzx_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtlzx_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtlzx_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssl : GCCBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssvl : GCCBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssml : GCCBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssmvl : GCCBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsc_vvssl : GCCBuiltin<"__builtin_ve_vl_vsc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsc_vvssml : GCCBuiltin<"__builtin_ve_vl_vsc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscnc_vvssl : GCCBuiltin<"__builtin_ve_vl_vscnc_vvssl">, Intrinsic<[], [LLVMType, 
LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscnc_vvssml : GCCBuiltin<"__builtin_ve_vl_vscnc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscot_vvssl : GCCBuiltin<"__builtin_ve_vl_vscot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscot_vvssml : GCCBuiltin<"__builtin_ve_vl_vscot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscncot_vvssl : GCCBuiltin<"__builtin_ve_vl_vscncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscncot_vvssml : GCCBuiltin<"__builtin_ve_vl_vscncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscu_vvssl : GCCBuiltin<"__builtin_ve_vl_vscu_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscu_vvssml : GCCBuiltin<"__builtin_ve_vl_vscu_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscunc_vvssl : GCCBuiltin<"__builtin_ve_vl_vscunc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscunc_vvssml : GCCBuiltin<"__builtin_ve_vl_vscunc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscuot_vvssl : GCCBuiltin<"__builtin_ve_vl_vscuot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscuot_vvssml : GCCBuiltin<"__builtin_ve_vl_vscuot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscuncot_vvssl : GCCBuiltin<"__builtin_ve_vl_vscuncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscuncot_vvssml : GCCBuiltin<"__builtin_ve_vl_vscuncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscl_vvssl : GCCBuiltin<"__builtin_ve_vl_vscl_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vscl_vvssml : GCCBuiltin<"__builtin_ve_vl_vscl_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclnc_vvssl : GCCBuiltin<"__builtin_ve_vl_vsclnc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclnc_vvssml : GCCBuiltin<"__builtin_ve_vl_vsclnc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclot_vvssl : GCCBuiltin<"__builtin_ve_vl_vsclot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclot_vvssml : 
GCCBuiltin<"__builtin_ve_vl_vsclot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclncot_vvssl : GCCBuiltin<"__builtin_ve_vl_vsclncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_vsclncot_vvssml : GCCBuiltin<"__builtin_ve_vl_vsclncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; -let TargetPrefix = "ve" in def int_ve_vl_andm_mmm : GCCBuiltin<"__builtin_ve_vl_andm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_andm_MMM : GCCBuiltin<"__builtin_ve_vl_andm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_orm_mmm : GCCBuiltin<"__builtin_ve_vl_orm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_orm_MMM : GCCBuiltin<"__builtin_ve_vl_orm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_xorm_mmm : GCCBuiltin<"__builtin_ve_vl_xorm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_xorm_MMM : GCCBuiltin<"__builtin_ve_vl_xorm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_eqvm_mmm : GCCBuiltin<"__builtin_ve_vl_eqvm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_eqvm_MMM : GCCBuiltin<"__builtin_ve_vl_eqvm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_nndm_mmm : GCCBuiltin<"__builtin_ve_vl_nndm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_nndm_MMM : GCCBuiltin<"__builtin_ve_vl_nndm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_negm_mm : GCCBuiltin<"__builtin_ve_vl_negm_mm">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_negm_MM : GCCBuiltin<"__builtin_ve_vl_negm_MM">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_pcvm_sml : GCCBuiltin<"__builtin_ve_vl_pcvm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_lzvm_sml : GCCBuiltin<"__builtin_ve_vl_lzvm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; -let TargetPrefix = "ve" in def int_ve_vl_tovm_sml : GCCBuiltin<"__builtin_ve_vl_tovm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld_vssl : ClangBuiltin<"__builtin_ve_vl_vld_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld_vssvl : ClangBuiltin<"__builtin_ve_vl_vld_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu_vssl : 
ClangBuiltin<"__builtin_ve_vl_vldu_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu_vssvl : ClangBuiltin<"__builtin_ve_vl_vldu_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssl : ClangBuiltin<"__builtin_ve_vl_vldunc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldunc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssl : ClangBuiltin<"__builtin_ve_vl_vldlsx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssvl : ClangBuiltin<"__builtin_ve_vl_vldlsx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldlsxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldlsxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssl : ClangBuiltin<"__builtin_ve_vl_vldlzx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssvl : ClangBuiltin<"__builtin_ve_vl_vldlzx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldlzxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldlzxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssl : ClangBuiltin<"__builtin_ve_vl_vld2d_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssvl : ClangBuiltin<"__builtin_ve_vl_vld2d_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssl : ClangBuiltin<"__builtin_ve_vl_vld2dnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vld2dnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssl : ClangBuiltin<"__builtin_ve_vl_vldu2d_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssvl : ClangBuiltin<"__builtin_ve_vl_vldu2d_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldu2dnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldu2dnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], 
[IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssl : ClangBuiltin<"__builtin_ve_vl_vldl2dsx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssvl : ClangBuiltin<"__builtin_ve_vl_vldl2dsx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssl : ClangBuiltin<"__builtin_ve_vl_vldl2dzx_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssvl : ClangBuiltin<"__builtin_ve_vl_vldl2dzx_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssl : ClangBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssvl : ClangBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssvl">, Intrinsic<[LLVMType], [LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst_vssl : ClangBuiltin<"__builtin_ve_vl_vst_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst_vssml : ClangBuiltin<"__builtin_ve_vl_vst_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstnc_vssl : ClangBuiltin<"__builtin_ve_vl_vstnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstnc_vssml : ClangBuiltin<"__builtin_ve_vl_vstnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstot_vssl : ClangBuiltin<"__builtin_ve_vl_vstot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstot_vssml : ClangBuiltin<"__builtin_ve_vl_vstot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstncot_vssl : ClangBuiltin<"__builtin_ve_vl_vstncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstncot_vssml : ClangBuiltin<"__builtin_ve_vl_vstncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu_vssl : ClangBuiltin<"__builtin_ve_vl_vstu_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu_vssml : ClangBuiltin<"__builtin_ve_vl_vstu_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstunc_vssl : ClangBuiltin<"__builtin_ve_vl_vstunc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstunc_vssml : 
ClangBuiltin<"__builtin_ve_vl_vstunc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstuot_vssl : ClangBuiltin<"__builtin_ve_vl_vstuot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstuot_vssml : ClangBuiltin<"__builtin_ve_vl_vstuot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstuncot_vssl : ClangBuiltin<"__builtin_ve_vl_vstuncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstuncot_vssml : ClangBuiltin<"__builtin_ve_vl_vstuncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl_vssl : ClangBuiltin<"__builtin_ve_vl_vstl_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl_vssml : ClangBuiltin<"__builtin_ve_vl_vstl_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlnc_vssl : ClangBuiltin<"__builtin_ve_vl_vstlnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlnc_vssml : ClangBuiltin<"__builtin_ve_vl_vstlnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlot_vssl : ClangBuiltin<"__builtin_ve_vl_vstlot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlot_vssml : ClangBuiltin<"__builtin_ve_vl_vstlot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlncot_vssl : ClangBuiltin<"__builtin_ve_vl_vstlncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstlncot_vssml : ClangBuiltin<"__builtin_ve_vl_vstlncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2d_vssl : ClangBuiltin<"__builtin_ve_vl_vst2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2d_vssml : ClangBuiltin<"__builtin_ve_vl_vst2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dnc_vssl : ClangBuiltin<"__builtin_ve_vl_vst2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dnc_vssml : ClangBuiltin<"__builtin_ve_vl_vst2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dot_vssl : ClangBuiltin<"__builtin_ve_vl_vst2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dot_vssml : ClangBuiltin<"__builtin_ve_vl_vst2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dncot_vssl : ClangBuiltin<"__builtin_ve_vl_vst2dncot_vssl">, 
Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vst2dncot_vssml : ClangBuiltin<"__builtin_ve_vl_vst2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2d_vssl : ClangBuiltin<"__builtin_ve_vl_vstu2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2d_vssml : ClangBuiltin<"__builtin_ve_vl_vstu2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dnc_vssl : ClangBuiltin<"__builtin_ve_vl_vstu2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dnc_vssml : ClangBuiltin<"__builtin_ve_vl_vstu2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dot_vssl : ClangBuiltin<"__builtin_ve_vl_vstu2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dot_vssml : ClangBuiltin<"__builtin_ve_vl_vstu2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dncot_vssl : ClangBuiltin<"__builtin_ve_vl_vstu2dncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstu2dncot_vssml : ClangBuiltin<"__builtin_ve_vl_vstu2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2d_vssl : ClangBuiltin<"__builtin_ve_vl_vstl2d_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2d_vssml : ClangBuiltin<"__builtin_ve_vl_vstl2d_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dnc_vssl : ClangBuiltin<"__builtin_ve_vl_vstl2dnc_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dnc_vssml : ClangBuiltin<"__builtin_ve_vl_vstl2dnc_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dot_vssl : ClangBuiltin<"__builtin_ve_vl_vstl2dot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dot_vssml : ClangBuiltin<"__builtin_ve_vl_vstl2dot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dncot_vssl : ClangBuiltin<"__builtin_ve_vl_vstl2dncot_vssl">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vstl2dncot_vssml : ClangBuiltin<"__builtin_ve_vl_vstl2dncot_vssml">, Intrinsic<[], [LLVMType, LLVMType, llvm_ptr_ty, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pfchv_ssl : ClangBuiltin<"__builtin_ve_vl_pfchv_ssl">, Intrinsic<[], [LLVMType, llvm_ptr_ty, LLVMType], [IntrInaccessibleMemOrArgMemOnly]>; +let TargetPrefix = "ve" in def int_ve_vl_pfchvnc_ssl : ClangBuiltin<"__builtin_ve_vl_pfchvnc_ssl">, 
Intrinsic<[], [LLVMType, llvm_ptr_ty, LLVMType], [IntrInaccessibleMemOrArgMemOnly]>; +let TargetPrefix = "ve" in def int_ve_vl_lsv_vvss : ClangBuiltin<"__builtin_ve_vl_lsv_vvss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lvsl_svs : ClangBuiltin<"__builtin_ve_vl_lvsl_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lvsd_svs : ClangBuiltin<"__builtin_ve_vl_lvsd_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lvss_svs : ClangBuiltin<"__builtin_ve_vl_lvss_svs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lvm_mmss : ClangBuiltin<"__builtin_ve_vl_lvm_mmss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lvm_MMss : ClangBuiltin<"__builtin_ve_vl_lvm_MMss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_svm_sms : ClangBuiltin<"__builtin_ve_vl_svm_sms">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_svm_sMs : ClangBuiltin<"__builtin_ve_vl_svm_sMs">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsl : ClangBuiltin<"__builtin_ve_vl_vbrdd_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsvl : ClangBuiltin<"__builtin_ve_vl_vbrdd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdd_vsmvl : ClangBuiltin<"__builtin_ve_vl_vbrdd_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsl : ClangBuiltin<"__builtin_ve_vl_vbrdl_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsvl : ClangBuiltin<"__builtin_ve_vl_vbrdl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdl_vsmvl : ClangBuiltin<"__builtin_ve_vl_vbrdl_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsl : ClangBuiltin<"__builtin_ve_vl_vbrds_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsvl : ClangBuiltin<"__builtin_ve_vl_vbrds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrds_vsmvl : ClangBuiltin<"__builtin_ve_vl_vbrds_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsl : ClangBuiltin<"__builtin_ve_vl_vbrdw_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsvl : ClangBuiltin<"__builtin_ve_vl_vbrdw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vbrdw_vsmvl : ClangBuiltin<"__builtin_ve_vl_vbrdw_vsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsl : ClangBuiltin<"__builtin_ve_vl_pvbrd_vsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsvl : 
ClangBuiltin<"__builtin_ve_vl_pvbrd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvbrd_vsMvl : ClangBuiltin<"__builtin_ve_vl_pvbrd_vsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvl : ClangBuiltin<"__builtin_ve_vl_vmv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmv_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmv_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddul_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vaddul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vadduw_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vadduw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vsvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_pvaddu_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvaddu_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvaddu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vaddswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vaddswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvl : ClangBuiltin<"__builtin_ve_vl_pvadds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvadds_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvl : ClangBuiltin<"__builtin_ve_vl_pvadds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvadds_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vvvMvl : 
ClangBuiltin<"__builtin_ve_vl_pvadds_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvadds_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvadds_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vaddsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vaddsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubul_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vsubul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsubuw_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vsubuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, 
LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubu_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvsubu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vsubswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vsubswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsubs_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvsubs_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsubsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vsubsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulul_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmulul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmuluw_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmuluw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmulswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmulswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmulsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vvvl : ClangBuiltin<"__builtin_ve_vl_vmulslw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmulslw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vsvl : ClangBuiltin<"__builtin_ve_vl_vmulslw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmulslw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmulslw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivul_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vdivul_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivuw_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vdivuw_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswsx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vdivswsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivswzx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vdivswzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vdivsl_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vdivsl_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpul_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpul_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpuw_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpuw_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmpu_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvcmpu_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvcmps_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvcmps_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vcmpsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vcmpsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmaxs_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvmaxs_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswsx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vminswsx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminswzx_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vminswzx_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvmins_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvmins_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vmaxsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmaxsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vminsl_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vminsl_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vvvl : ClangBuiltin<"__builtin_ve_vl_vand_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vvvvl : ClangBuiltin<"__builtin_ve_vl_vand_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vsvl : ClangBuiltin<"__builtin_ve_vl_vand_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vsvvl : ClangBuiltin<"__builtin_ve_vl_vand_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vand_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vand_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vand_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvl : ClangBuiltin<"__builtin_ve_vl_pvand_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvand_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvl : ClangBuiltin<"__builtin_ve_vl_pvand_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvand_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvand_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvand_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvand_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vvvl : ClangBuiltin<"__builtin_ve_vl_vor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vvvvl : ClangBuiltin<"__builtin_ve_vl_vor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vsvl : ClangBuiltin<"__builtin_ve_vl_vor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vsvvl : ClangBuiltin<"__builtin_ve_vl_vor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vor_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vor_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vor_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvl : ClangBuiltin<"__builtin_ve_vl_pvor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvl : ClangBuiltin<"__builtin_ve_vl_pvor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvor_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvor_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvor_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvl : ClangBuiltin<"__builtin_ve_vl_vxor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvvl : ClangBuiltin<"__builtin_ve_vl_vxor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvl : ClangBuiltin<"__builtin_ve_vl_vxor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvvl : ClangBuiltin<"__builtin_ve_vl_vxor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vxor_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vxor_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vxor_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvxor_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvxor_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvl : ClangBuiltin<"__builtin_ve_vl_veqv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvvl : ClangBuiltin<"__builtin_ve_vl_veqv_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvl : ClangBuiltin<"__builtin_ve_vl_veqv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvvl : ClangBuiltin<"__builtin_ve_vl_veqv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vvvmvl : ClangBuiltin<"__builtin_ve_vl_veqv_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_veqv_vsvmvl : ClangBuiltin<"__builtin_ve_vl_veqv_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pveqv_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pveqv_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldz_vvl : ClangBuiltin<"__builtin_ve_vl_vldz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldz_vvvl : ClangBuiltin<"__builtin_ve_vl_vldz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vldz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzlo_vvl : ClangBuiltin<"__builtin_ve_vl_pvldzlo_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzlo_vvvl : ClangBuiltin<"__builtin_ve_vl_pvldzlo_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzlo_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvldzlo_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzup_vvl : ClangBuiltin<"__builtin_ve_vl_pvldzup_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzup_vvvl : ClangBuiltin<"__builtin_ve_vl_pvldzup_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldzup_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvldzup_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldz_vvl : ClangBuiltin<"__builtin_ve_vl_pvldz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldz_vvvl : ClangBuiltin<"__builtin_ve_vl_pvldz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvldz_vvMvl : ClangBuiltin<"__builtin_ve_vl_pvldz_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vpcnt_vvl : ClangBuiltin<"__builtin_ve_vl_vpcnt_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vpcnt_vvvl : ClangBuiltin<"__builtin_ve_vl_vpcnt_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vpcnt_vvmvl : ClangBuiltin<"__builtin_ve_vl_vpcnt_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntlo_vvl : ClangBuiltin<"__builtin_ve_vl_pvpcntlo_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntlo_vvvl : ClangBuiltin<"__builtin_ve_vl_pvpcntlo_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntlo_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvpcntlo_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntup_vvl : ClangBuiltin<"__builtin_ve_vl_pvpcntup_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntup_vvvl : ClangBuiltin<"__builtin_ve_vl_pvpcntup_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcntup_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvpcntup_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcnt_vvl : ClangBuiltin<"__builtin_ve_vl_pvpcnt_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcnt_vvvl : ClangBuiltin<"__builtin_ve_vl_pvpcnt_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvpcnt_vvMvl : ClangBuiltin<"__builtin_ve_vl_pvpcnt_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vbrv_vvl : ClangBuiltin<"__builtin_ve_vl_vbrv_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vbrv_vvvl : ClangBuiltin<"__builtin_ve_vl_vbrv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vbrv_vvmvl : ClangBuiltin<"__builtin_ve_vl_vbrv_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvlo_vvl : ClangBuiltin<"__builtin_ve_vl_pvbrvlo_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvlo_vvvl : ClangBuiltin<"__builtin_ve_vl_pvbrvlo_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvlo_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvbrvlo_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvup_vvl : ClangBuiltin<"__builtin_ve_vl_pvbrvup_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvup_vvvl : ClangBuiltin<"__builtin_ve_vl_pvbrvup_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrvup_vvmvl : ClangBuiltin<"__builtin_ve_vl_pvbrvup_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrv_vvl : ClangBuiltin<"__builtin_ve_vl_pvbrv_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrv_vvvl : ClangBuiltin<"__builtin_ve_vl_pvbrv_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvbrv_vvMvl : ClangBuiltin<"__builtin_ve_vl_pvbrv_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vseq_vl : ClangBuiltin<"__builtin_ve_vl_vseq_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vseq_vvl : ClangBuiltin<"__builtin_ve_vl_vseq_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvseqlo_vl : ClangBuiltin<"__builtin_ve_vl_pvseqlo_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvseqlo_vvl : ClangBuiltin<"__builtin_ve_vl_pvseqlo_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsequp_vl : ClangBuiltin<"__builtin_ve_vl_pvsequp_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsequp_vvl : ClangBuiltin<"__builtin_ve_vl_pvsequp_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvseq_vl : ClangBuiltin<"__builtin_ve_vl_pvseq_vl">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvseq_vvl : ClangBuiltin<"__builtin_ve_vl_pvseq_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvl : ClangBuiltin<"__builtin_ve_vl_vsll_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsll_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsl : ClangBuiltin<"__builtin_ve_vl_vsll_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsvl : ClangBuiltin<"__builtin_ve_vl_vsll_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsll_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsll_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vsll_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsll_vvsMvl : ClangBuiltin<"__builtin_ve_vl_pvsll_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsvl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrl_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vsrl_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsrl_vvsMvl : ClangBuiltin<"__builtin_ve_vl_pvsrl_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawsx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vslawsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslawzx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vslawzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsla_vvsMvl : ClangBuiltin<"__builtin_ve_vl_pvsla_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvl : ClangBuiltin<"__builtin_ve_vl_vslal_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvvl : ClangBuiltin<"__builtin_ve_vl_vslal_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsl : ClangBuiltin<"__builtin_ve_vl_vslal_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsvl : ClangBuiltin<"__builtin_ve_vl_vslal_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vslal_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vslal_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vslal_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawsx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vsrawsx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsvl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsrawzx_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vsrawzx_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvsra_vvsMvl : ClangBuiltin<"__builtin_ve_vl_pvsra_vvsMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvl : ClangBuiltin<"__builtin_ve_vl_vsral_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvvl : ClangBuiltin<"__builtin_ve_vl_vsral_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsl : ClangBuiltin<"__builtin_ve_vl_vsral_vvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsvl : ClangBuiltin<"__builtin_ve_vl_vsral_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vsral_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsral_vvsmvl : ClangBuiltin<"__builtin_ve_vl_vsral_vvsmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssl : ClangBuiltin<"__builtin_ve_vl_vsfa_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssvl : ClangBuiltin<"__builtin_ve_vl_vsfa_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vsfa_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vsfa_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfaddd_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfaddd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfadds_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfadds_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfadd_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfadd_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubd_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfsubd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfsubs_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfsubs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfsub_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfsub_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuld_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmuld_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vfmuls_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmuls_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvl :
ClangBuiltin<"__builtin_ve_vl_pvfmul_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmul_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfmul_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmul_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmul_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmul_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmul_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vsvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivd_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfdivd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfdivs_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfdivs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsqrtd_vvl : ClangBuiltin<"__builtin_ve_vl_vfsqrtd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsqrtd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfsqrtd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = 
"ve" in def int_ve_vl_vfsqrts_vvl : ClangBuiltin<"__builtin_ve_vl_vfsqrts_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsqrts_vvvl : ClangBuiltin<"__builtin_ve_vl_vfsqrts_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmpd_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfcmpd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfcmps_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfcmps_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfcmp_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfcmp_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], 
[IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxd_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmaxd_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmaxs_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmaxs_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmax_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmax_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vvvvl">, 
Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmind_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmind_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vvvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmins_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmins_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmin_vsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmin_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let 
TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmadd_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmadd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmads_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmads_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvvl : 
ClangBuiltin<"__builtin_ve_vl_pvfmad_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vsvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmad_vvsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmad_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbd_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, 
LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmsbs_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfmsbs_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vsvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmsb_vvsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfmsb_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix 
= "ve" in def int_ve_vl_vfnmadd_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmadd_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmadd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmads_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmads_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvl : 
ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vsvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmad_vvsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmad_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbd_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbd_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvl : 
ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vsvvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vsvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfnmsbs_vvsvmvl : ClangBuiltin<"__builtin_ve_vl_vfnmsbs_vvsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vsvvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vsvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfnmsb_vvsvMvl : ClangBuiltin<"__builtin_ve_vl_pvfnmsb_vvsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrcpd_vvl : ClangBuiltin<"__builtin_ve_vl_vrcpd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrcpd_vvvl : ClangBuiltin<"__builtin_ve_vl_vrcpd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrcps_vvl : ClangBuiltin<"__builtin_ve_vl_vrcps_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrcps_vvvl : ClangBuiltin<"__builtin_ve_vl_vrcps_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrcp_vvl : ClangBuiltin<"__builtin_ve_vl_pvrcp_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrcp_vvvl : 
ClangBuiltin<"__builtin_ve_vl_pvrcp_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtd_vvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtd_vvvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrts_vvl : ClangBuiltin<"__builtin_ve_vl_vrsqrts_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrts_vvvl : ClangBuiltin<"__builtin_ve_vl_vrsqrts_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrsqrt_vvl : ClangBuiltin<"__builtin_ve_vl_pvrsqrt_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrsqrt_vvvl : ClangBuiltin<"__builtin_ve_vl_pvrsqrt_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtdnex_vvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtdnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtdnex_vvvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtdnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtsnex_vvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtsnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrsqrtsnex_vvvl : ClangBuiltin<"__builtin_ve_vl_vrsqrtsnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrsqrtnex_vvl : ClangBuiltin<"__builtin_ve_vl_pvrsqrtnex_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvrsqrtnex_vvvl : ClangBuiltin<"__builtin_ve_vl_pvrsqrtnex_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsx_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdsxrz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdsxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzx_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_vcvtwdzx_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwdzxrz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwdzxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssx_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwssxrz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwssxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszx_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszx_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtwszxrz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtwszxrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvl : ClangBuiltin<"__builtin_ve_vl_pvcvtws_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvvl : ClangBuiltin<"__builtin_ve_vl_pvcvtws_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtws_vvMvl : ClangBuiltin<"__builtin_ve_vl_pvcvtws_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvl : ClangBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvvl : 
ClangBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtwsrz_vvMvl : ClangBuiltin<"__builtin_ve_vl_pvcvtwsrz_vvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtld_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtld_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtld_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtld_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtldrz_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtldrz_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtldrz_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcvtldrz_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtdw_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtdw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtdw_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtdw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtsw_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtsw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtsw_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtsw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtsw_vvl : ClangBuiltin<"__builtin_ve_vl_pvcvtsw_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvcvtsw_vvvl : ClangBuiltin<"__builtin_ve_vl_pvcvtsw_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtdl_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtdl_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtdl_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtdl_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtds_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtds_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtds_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtds_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtsd_vvl : ClangBuiltin<"__builtin_ve_vl_vcvtsd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcvtsd_vvvl : ClangBuiltin<"__builtin_ve_vl_vcvtsd_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrg_vvvml : ClangBuiltin<"__builtin_ve_vl_vmrg_vvvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrg_vvvmvl : 
ClangBuiltin<"__builtin_ve_vl_vmrg_vvvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrg_vsvml : ClangBuiltin<"__builtin_ve_vl_vmrg_vsvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrg_vsvmvl : ClangBuiltin<"__builtin_ve_vl_vmrg_vsvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vvvMl : ClangBuiltin<"__builtin_ve_vl_vmrgw_vvvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vvvMvl : ClangBuiltin<"__builtin_ve_vl_vmrgw_vvvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vsvMl : ClangBuiltin<"__builtin_ve_vl_vmrgw_vsvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vmrgw_vsvMvl : ClangBuiltin<"__builtin_ve_vl_vmrgw_vsvMvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vshf_vvvsl : ClangBuiltin<"__builtin_ve_vl_vshf_vvvsl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vshf_vvvsvl : ClangBuiltin<"__builtin_ve_vl_vshf_vvvsvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vcp_vvmvl : ClangBuiltin<"__builtin_ve_vl_vcp_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vex_vvmvl : ClangBuiltin<"__builtin_ve_vl_vex_vvmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklat_ml : ClangBuiltin<"__builtin_ve_vl_vfmklat_ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklaf_ml : ClangBuiltin<"__builtin_ve_vl_vfmklaf_ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkat_Ml : ClangBuiltin<"__builtin_ve_vl_pvfmkat_Ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkaf_Ml : ClangBuiltin<"__builtin_ve_vl_pvfmkaf_Ml">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkllt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkllt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkllt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkllt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklne_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklne_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = 
"ve" in def int_ve_vl_vfmkleq_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkleq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkleq_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkleq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklge_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklge_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklle_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklle_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnum_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnum_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklltnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklltnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklnenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkleqnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkleqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkleqnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkleqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmklgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmklgenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmklgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkllenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkllenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_vfmkllenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkllenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwlt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwlt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwlt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwlt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwne_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwne_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkweq_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkweq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkweq_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkweq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwge_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwge_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwle_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwle_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnum_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnum_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwltnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwltnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnenan_mvl : 
ClangBuiltin<"__builtin_ve_vl_vfmkwnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwnenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkweqnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkweqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkweqnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkweqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwgenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwlenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkwlenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkwlenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkwlenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlolt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwuplt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlolt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwuplt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlone_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlone_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupne_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlone_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlone_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupne_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeq_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwloeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in 
def int_ve_vl_pvfmkwupeq_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeq_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwloeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeq_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloge_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwloge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupge_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloge_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwloge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupge_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlole_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlole_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuple_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwuple_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlole_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlole_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuple_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwuple_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonum_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnum_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonum_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnum_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogtnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgtnan_mvl">, Intrinsic<[LLVMType], 
[LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogtnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloltnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwloltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupltnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloltnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwloltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupltnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlonenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlonenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupnenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeqnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwloeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeqnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwloeqnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwloeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupeqnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlogenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlogenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwupgenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwupgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlolenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwuplenan_mvl">, Intrinsic<[LLVMType], [LLVMType, 
LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlolenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwlolenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwuplenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkwuplenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgt_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgt_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlt_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlt_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwne_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwne_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwne_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwne_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkweq_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkweq_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkweq_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkweq_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwge_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwge_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwge_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwge_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwle_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwle_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwle_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwle_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnum_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnum_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnum_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnum_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgtnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgtnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgtnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgtnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwltnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwltnan_Mvl">, Intrinsic<[LLVMType], 
[LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwltnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwltnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwnenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwnenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkweqnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkweqnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkweqnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkweqnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwgenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwgenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkwlenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkwlenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdlt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdlt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdlt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdlt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdne_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdne_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdeq_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdeq_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdge_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdge_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdle_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdle_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, 
LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnum_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnum_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdltnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdltnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdnenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdeqnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdeqnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdgenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdlenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkdlenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkdlenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkdlenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkslt_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkslt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkslt_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkslt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksne_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], 
[IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksne_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkseq_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkseq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkseq_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkseq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksge_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksge_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksle_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksle_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksle_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksle_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnum_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnum_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksltnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksltnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksnenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkseqnan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkseqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkseqnan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkseqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmksgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmksgenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmksgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; 
+let TargetPrefix = "ve" in def int_ve_vl_vfmkslenan_mvl : ClangBuiltin<"__builtin_ve_vl_vfmkslenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfmkslenan_mvml : ClangBuiltin<"__builtin_ve_vl_vfmkslenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslogt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupgt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslogt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupgt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslolt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplt_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksuplt_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslolt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplt_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksuplt_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslone_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslone_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupne_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupne_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslone_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslone_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupne_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupne_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeq_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksloeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeq_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupeq_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeq_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksloeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeq_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupeq_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloge_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksloge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupge_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupge_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloge_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksloge_mvml">, Intrinsic<[LLVMType], [LLVMType, 
LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupge_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupge_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslole_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslole_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuple_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksuple_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslole_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslole_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuple_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksuple_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonum_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslonum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnum_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupnum_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonum_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslonum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnum_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupnum_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslonan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslonan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogtnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslogtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgtnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupgtnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogtnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslogtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgtnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupgtnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloltnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksloltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupltnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupltnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloltnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksloltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_pvfmksupltnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupltnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslonenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupnenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslonenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslonenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupnenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupnenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeqnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksloeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeqnan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupeqnan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksloeqnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksloeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupeqnan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupeqnan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslogenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksupgenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslogenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslogenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksupgenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksupgenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslolenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplenan_mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksuplenan_mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslolenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmkslolenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksuplenan_mvml : ClangBuiltin<"__builtin_ve_vl_pvfmksuplenan_mvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksgt_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksgt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksgt_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksgt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslt_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslt_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslt_MvMl : 
ClangBuiltin<"__builtin_ve_vl_pvfmkslt_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksne_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksne_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksne_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksne_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkseq_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkseq_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkseq_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkseq_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksge_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksge_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksge_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksge_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksle_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksle_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksle_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksle_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnum_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksnum_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnum_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksnum_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksgtnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksgtnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksgtnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksgtnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksltnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksltnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksltnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksltnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksnenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksnenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksnenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkseqnan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkseqnan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkseqnan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkseqnan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_pvfmksgenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmksgenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmksgenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmksgenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslenan_Mvl : ClangBuiltin<"__builtin_ve_vl_pvfmkslenan_Mvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pvfmkslenan_MvMl : ClangBuiltin<"__builtin_ve_vl_pvfmkslenan_MvMl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsumwsx_vvl : ClangBuiltin<"__builtin_ve_vl_vsumwsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsumwsx_vvml : ClangBuiltin<"__builtin_ve_vl_vsumwsx_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsumwzx_vvl : ClangBuiltin<"__builtin_ve_vl_vsumwzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsumwzx_vvml : ClangBuiltin<"__builtin_ve_vl_vsumwzx_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsuml_vvl : ClangBuiltin<"__builtin_ve_vl_vsuml_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsuml_vvml : ClangBuiltin<"__builtin_ve_vl_vsuml_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsumd_vvl : ClangBuiltin<"__builtin_ve_vl_vfsumd_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsumd_vvml : ClangBuiltin<"__builtin_ve_vl_vfsumd_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsums_vvl : ClangBuiltin<"__builtin_ve_vl_vfsums_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfsums_vvml : ClangBuiltin<"__builtin_ve_vl_vfsums_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstsx_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswfstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswfstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstsx_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswlstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswlstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstzx_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswfstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswfstzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswfstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxswlstzx_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswlstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_vrmaxswlstzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxswlstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswfstsx_vvl : ClangBuiltin<"__builtin_ve_vl_vrminswfstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswfstsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminswfstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswlstsx_vvl : ClangBuiltin<"__builtin_ve_vl_vrminswlstsx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswlstsx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminswlstsx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswfstzx_vvl : ClangBuiltin<"__builtin_ve_vl_vrminswfstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswfstzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminswfstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswlstzx_vvl : ClangBuiltin<"__builtin_ve_vl_vrminswlstzx_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminswlstzx_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminswlstzx_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxslfst_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxslfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxslfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxslfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxsllst_vvl : ClangBuiltin<"__builtin_ve_vl_vrmaxsllst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrmaxsllst_vvvl : ClangBuiltin<"__builtin_ve_vl_vrmaxsllst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminslfst_vvl : ClangBuiltin<"__builtin_ve_vl_vrminslfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminslfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminslfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminsllst_vvl : ClangBuiltin<"__builtin_ve_vl_vrminsllst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrminsllst_vvvl : ClangBuiltin<"__builtin_ve_vl_vrminsllst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdfst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxdfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxdfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdlst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxdlst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxdlst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxdlst_vvvl">, Intrinsic<[LLVMType], 
[LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxsfst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxsfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxsfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxsfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxslst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxslst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmaxslst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmaxslst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmindfst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmindfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmindfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmindfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmindlst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrmindlst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrmindlst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrmindlst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrminsfst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrminsfst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrminsfst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrminsfst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrminslst_vvl : ClangBuiltin<"__builtin_ve_vl_vfrminslst_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vfrminslst_vvvl : ClangBuiltin<"__builtin_ve_vl_vfrminslst_vvvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrand_vvl : ClangBuiltin<"__builtin_ve_vl_vrand_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrand_vvml : ClangBuiltin<"__builtin_ve_vl_vrand_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vror_vvl : ClangBuiltin<"__builtin_ve_vl_vror_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vror_vvml : ClangBuiltin<"__builtin_ve_vl_vror_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrxor_vvl : ClangBuiltin<"__builtin_ve_vl_vrxor_vvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vrxor_vvml : ClangBuiltin<"__builtin_ve_vl_vrxor_vvml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssl : ClangBuiltin<"__builtin_ve_vl_vgt_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgt_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssml : ClangBuiltin<"__builtin_ve_vl_vgt_vvssml">, Intrinsic<[LLVMType], 
[LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgt_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgt_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtnc_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtu_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtu_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtu_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtu_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtu_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtunc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtunc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtunc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtunc_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtunc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtlsx_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtlsx_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtlsx_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsx_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtlsx_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; 
+let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlsxnc_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtlsxnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtlzx_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtlzx_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtlzx_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzx_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtlzx_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssl : ClangBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssvl : ClangBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssml : ClangBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssml">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vgtlzxnc_vvssmvl : ClangBuiltin<"__builtin_ve_vl_vgtlzxnc_vvssmvl">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrReadMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsc_vvssl : ClangBuiltin<"__builtin_ve_vl_vsc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsc_vvssml : ClangBuiltin<"__builtin_ve_vl_vsc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscnc_vvssl : ClangBuiltin<"__builtin_ve_vl_vscnc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscnc_vvssml : ClangBuiltin<"__builtin_ve_vl_vscnc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscot_vvssl : ClangBuiltin<"__builtin_ve_vl_vscot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscot_vvssml : ClangBuiltin<"__builtin_ve_vl_vscot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscncot_vvssl : ClangBuiltin<"__builtin_ve_vl_vscncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def 
int_ve_vl_vscncot_vvssml : ClangBuiltin<"__builtin_ve_vl_vscncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscu_vvssl : ClangBuiltin<"__builtin_ve_vl_vscu_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscu_vvssml : ClangBuiltin<"__builtin_ve_vl_vscu_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscunc_vvssl : ClangBuiltin<"__builtin_ve_vl_vscunc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscunc_vvssml : ClangBuiltin<"__builtin_ve_vl_vscunc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscuot_vvssl : ClangBuiltin<"__builtin_ve_vl_vscuot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscuot_vvssml : ClangBuiltin<"__builtin_ve_vl_vscuot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscuncot_vvssl : ClangBuiltin<"__builtin_ve_vl_vscuncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscuncot_vvssml : ClangBuiltin<"__builtin_ve_vl_vscuncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscl_vvssl : ClangBuiltin<"__builtin_ve_vl_vscl_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vscl_vvssml : ClangBuiltin<"__builtin_ve_vl_vscl_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclnc_vvssl : ClangBuiltin<"__builtin_ve_vl_vsclnc_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclnc_vvssml : ClangBuiltin<"__builtin_ve_vl_vsclnc_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclot_vvssl : ClangBuiltin<"__builtin_ve_vl_vsclot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclot_vvssml : ClangBuiltin<"__builtin_ve_vl_vsclot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclncot_vvssl : ClangBuiltin<"__builtin_ve_vl_vsclncot_vvssl">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_vsclncot_vvssml : ClangBuiltin<"__builtin_ve_vl_vsclncot_vvssml">, Intrinsic<[], [LLVMType, LLVMType, LLVMType, LLVMType, LLVMType, LLVMType], [IntrWriteMem]>; +let TargetPrefix = "ve" in def int_ve_vl_andm_mmm : ClangBuiltin<"__builtin_ve_vl_andm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_andm_MMM : ClangBuiltin<"__builtin_ve_vl_andm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let 
TargetPrefix = "ve" in def int_ve_vl_orm_mmm : ClangBuiltin<"__builtin_ve_vl_orm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_orm_MMM : ClangBuiltin<"__builtin_ve_vl_orm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_xorm_mmm : ClangBuiltin<"__builtin_ve_vl_xorm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_xorm_MMM : ClangBuiltin<"__builtin_ve_vl_xorm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_eqvm_mmm : ClangBuiltin<"__builtin_ve_vl_eqvm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_eqvm_MMM : ClangBuiltin<"__builtin_ve_vl_eqvm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_nndm_mmm : ClangBuiltin<"__builtin_ve_vl_nndm_mmm">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_nndm_MMM : ClangBuiltin<"__builtin_ve_vl_nndm_MMM">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_negm_mm : ClangBuiltin<"__builtin_ve_vl_negm_mm">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_negm_MM : ClangBuiltin<"__builtin_ve_vl_negm_MM">, Intrinsic<[LLVMType], [LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_pcvm_sml : ClangBuiltin<"__builtin_ve_vl_pcvm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lzvm_sml : ClangBuiltin<"__builtin_ve_vl_lzvm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_tovm_sml : ClangBuiltin<"__builtin_ve_vl_tovm_sml">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_lcr_sss : ClangBuiltin<"__builtin_ve_vl_lcr_sss">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem]>; +let TargetPrefix = "ve" in def int_ve_vl_scr_sss : ClangBuiltin<"__builtin_ve_vl_scr_sss">, Intrinsic<[], [LLVMType, LLVMType, LLVMType], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_tscr_ssss : ClangBuiltin<"__builtin_ve_vl_tscr_ssss">, Intrinsic<[LLVMType], [LLVMType, LLVMType, LLVMType], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_fidcr_sss : ClangBuiltin<"__builtin_ve_vl_fidcr_sss">, Intrinsic<[LLVMType], [LLVMType, LLVMType], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_fencei : ClangBuiltin<"__builtin_ve_vl_fencei">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_fencem_s : ClangBuiltin<"__builtin_ve_vl_fencem_s">, Intrinsic<[], [LLVMType], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_fencec_s : ClangBuiltin<"__builtin_ve_vl_fencec_s">, Intrinsic<[], [LLVMType], [IntrNoMem, IntrHasSideEffects]>; +let TargetPrefix = "ve" in def int_ve_vl_svob : ClangBuiltin<"__builtin_ve_vl_svob">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index aecc3d91fae7..f313be1b2235 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -31,6 +31,10 @@ def int_wasm_memory_grow : 
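Each record above pairs a __builtin_ve_vl_* Clang builtin with an LLVM intrinsic; tblgen derives the IR name and enum from the record name, so int_ve_vl_vgt_vvssl becomes llvm.ve.vl.vgt.vvssl and Intrinsic::ve_vl_vgt_vvssl. A minimal sketch of emitting one of the gathers through the C++ API follows; it is illustrative only, and the operand types are an assumption, since the LLVMType<...> parameters were elided in the text above.

// Hedged sketch, not part of the patch: emit llvm.ve.vl.vgt.vvssl (vector
// gather) with IRBuilder.  The enum name follows the usual tblgen mapping;
// the operand order (vector of addresses, two scalars, vector length) is
// inferred from the vvssl suffix and is an assumption here.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsVE.h"
#include "llvm/IR/Module.h"

llvm::Value *emitVEGather(llvm::IRBuilder<> &B, llvm::Module &M,
                          llvm::Value *Addrs, llvm::Value *Sy, llvm::Value *Sz,
                          llvm::Value *VL) {
  llvm::Function *VGT =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::ve_vl_vgt_vvssl);
  return B.CreateCall(VGT, {Addrs, Sy, Sz, VL});
}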
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index aecc3d91fae7..f313be1b2235 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -31,6 +31,10 @@ def int_wasm_memory_grow :
   Intrinsic<[llvm_anyint_ty],
 //===----------------------------------------------------------------------===//
 def int_wasm_ref_null_extern : Intrinsic<[llvm_externref_ty], [], [IntrNoMem]>;
 def int_wasm_ref_null_func : Intrinsic<[llvm_funcref_ty], [], [IntrNoMem]>;
+def int_wasm_ref_is_null_extern : Intrinsic<[llvm_i32_ty], [llvm_externref_ty],
+                                  [IntrNoMem], "llvm.wasm.ref.is_null.extern">;
+def int_wasm_ref_is_null_func : Intrinsic<[llvm_i32_ty], [llvm_funcref_ty],
+                                [IntrNoMem], "llvm.wasm.ref.is_null.func">;
 
 //===----------------------------------------------------------------------===//
 // Table intrinsics
@@ -256,16 +260,30 @@ def int_wasm_relaxed_trunc_unsigned:
             [llvm_v4f32_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
-def int_wasm_relaxed_trunc_zero_signed:
+def int_wasm_relaxed_trunc_signed_zero:
   Intrinsic<[llvm_v4i32_ty],
             [llvm_v2f64_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
-def int_wasm_relaxed_trunc_zero_unsigned:
+def int_wasm_relaxed_trunc_unsigned_zero:
   Intrinsic<[llvm_v4i32_ty],
             [llvm_v2f64_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
+def int_wasm_relaxed_q15mulr_signed:
+  Intrinsic<[llvm_v8i16_ty],
+            [llvm_v8i16_ty, llvm_v8i16_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_dot_i8x16_i7x16_signed:
+  Intrinsic<[llvm_v8i16_ty],
+            [llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_dot_i8x16_i7x16_add_signed:
+  Intrinsic<[llvm_v4i32_ty],
+            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v4i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
 
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
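The hunks above rename the double-precision relaxed truncations (zero_signed becomes signed_zero, zero_unsigned becomes unsigned_zero) and add the q15mulr and i8x16.i7x16 dot-product intrinsics; the ref.is_null records spell their IR names out explicitly so that is_null keeps its underscore instead of being split at every '_' by the default name mangling. A minimal sketch, outside the patch proper, of emitting the renamed truncation through the C++ API, assuming the usual tblgen enum mapping:

// Hedged sketch, not part of the patch: emit
//   <4 x i32> @llvm.wasm.relaxed.trunc.signed.zero(<2 x double>)
// under its post-rename name.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/Module.h"

llvm::Value *emitRelaxedTruncSignedZero(llvm::IRBuilder<> &B, llvm::Module &M,
                                        llvm::Value *V2F64) {
  llvm::Function *F = llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::wasm_relaxed_trunc_signed_zero);
  return B.CreateCall(F, {V2F64});
}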
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 8de737a1c7a5..0930abcc0993 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -31,20 +31,20 @@ let TargetPrefix = "x86" in {
 //===----------------------------------------------------------------------===//
 // FLAGS.
 let TargetPrefix = "x86" in {
-  def int_x86_flags_read_u32 : GCCBuiltin<"__builtin_ia32_readeflags_u32">,
+  def int_x86_flags_read_u32 : ClangBuiltin<"__builtin_ia32_readeflags_u32">,
       Intrinsic<[llvm_i32_ty], [], []>;
-  def int_x86_flags_read_u64 : GCCBuiltin<"__builtin_ia32_readeflags_u64">,
+  def int_x86_flags_read_u64 : ClangBuiltin<"__builtin_ia32_readeflags_u64">,
       Intrinsic<[llvm_i64_ty], [], []>;
-  def int_x86_flags_write_u32 : GCCBuiltin<"__builtin_ia32_writeeflags_u32">,
+  def int_x86_flags_write_u32 : ClangBuiltin<"__builtin_ia32_writeeflags_u32">,
      Intrinsic<[], [llvm_i32_ty], []>;
-  def int_x86_flags_write_u64 : GCCBuiltin<"__builtin_ia32_writeeflags_u64">,
+  def int_x86_flags_write_u64 : ClangBuiltin<"__builtin_ia32_writeeflags_u64">,
      Intrinsic<[], [llvm_i64_ty], []>;
 }
 
 //===----------------------------------------------------------------------===//
 // Read Time Stamp Counter.
 let TargetPrefix = "x86" in {
-  def int_x86_rdtsc : GCCBuiltin<"__builtin_ia32_rdtsc">,
+  def int_x86_rdtsc : ClangBuiltin<"__builtin_ia32_rdtsc">,
       Intrinsic<[llvm_i64_ty], [], []>;
   def int_x86_rdtscp : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
@@ -52,42 +52,52 @@ let TargetPrefix = "x86" in {
 // Read Performance-Monitoring Counter.
 let TargetPrefix = "x86" in {
-  def int_x86_rdpmc : GCCBuiltin<"__builtin_ia32_rdpmc">,
+  def int_x86_rdpmc : ClangBuiltin<"__builtin_ia32_rdpmc">,
       Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>;
 }
 
 // Read processor ID.
 let TargetPrefix = "x86" in {
-  def int_x86_rdpid : GCCBuiltin<"__builtin_ia32_rdpid">,
+  def int_x86_rdpid : ClangBuiltin<"__builtin_ia32_rdpid">,
       Intrinsic<[llvm_i32_ty], [], []>;
 }
 
+// Lock bit test.
+let TargetPrefix = "x86" in {
+  def int_x86_atomic_bts : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg>]>;
+  def int_x86_atomic_btc : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg>]>;
+  def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg>]>;
+}
+
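The int_x86_atomic_bts/btc/btr records are new in this import and are overloaded on the integer width (llvm_anyint_ty), so a concrete type must be supplied when the declaration is materialized; the bit index is an i8 immediate (the ImmArg argument index is elided in the text above). A minimal sketch, outside the patch proper:

// Hedged sketch, not part of the patch: emit llvm.x86.atomic.bts.i32, the
// lock bit-test-and-set intrinsic declared above.  The intrinsic is
// overloaded on its anyint result, so the concrete type is passed to
// getDeclaration(); the bit index must be a constant because of ImmArg.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"

llvm::Value *emitAtomicBTS32(llvm::IRBuilder<> &B, llvm::Module &M,
                             llvm::Value *Ptr, unsigned Bit) {
  llvm::Function *BTS = llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::x86_atomic_bts, {B.getInt32Ty()});
  return B.CreateCall(BTS, {Ptr, B.getInt8(Bit)});
}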
 //===----------------------------------------------------------------------===//
 // CET SS
 let TargetPrefix = "x86" in {
-  def int_x86_incsspd : GCCBuiltin<"__builtin_ia32_incsspd">,
+  def int_x86_incsspd : ClangBuiltin<"__builtin_ia32_incsspd">,
       Intrinsic<[], [llvm_i32_ty], []>;
-  def int_x86_incsspq : GCCBuiltin<"__builtin_ia32_incsspq">,
+  def int_x86_incsspq : ClangBuiltin<"__builtin_ia32_incsspq">,
       Intrinsic<[], [llvm_i64_ty], []>;
-  def int_x86_rdsspd : GCCBuiltin<"__builtin_ia32_rdsspd">,
+  def int_x86_rdsspd : ClangBuiltin<"__builtin_ia32_rdsspd">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
-  def int_x86_rdsspq : GCCBuiltin<"__builtin_ia32_rdsspq">,
+  def int_x86_rdsspq : ClangBuiltin<"__builtin_ia32_rdsspq">,
       Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
-  def int_x86_saveprevssp : GCCBuiltin<"__builtin_ia32_saveprevssp">,
+  def int_x86_saveprevssp : ClangBuiltin<"__builtin_ia32_saveprevssp">,
       Intrinsic<[], [], []>;
-  def int_x86_rstorssp : GCCBuiltin<"__builtin_ia32_rstorssp">,
+  def int_x86_rstorssp : ClangBuiltin<"__builtin_ia32_rstorssp">,
       Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_wrssd : GCCBuiltin<"__builtin_ia32_wrssd">,
+  def int_x86_wrssd : ClangBuiltin<"__builtin_ia32_wrssd">,
       Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
-  def int_x86_wrssq : GCCBuiltin<"__builtin_ia32_wrssq">,
+  def int_x86_wrssq : ClangBuiltin<"__builtin_ia32_wrssq">,
       Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>;
-  def int_x86_wrussd : GCCBuiltin<"__builtin_ia32_wrussd">,
+  def int_x86_wrussd : ClangBuiltin<"__builtin_ia32_wrussd">,
       Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
-  def int_x86_wrussq : GCCBuiltin<"__builtin_ia32_wrussq">,
+  def int_x86_wrussq : ClangBuiltin<"__builtin_ia32_wrussq">,
       Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>;
-  def int_x86_setssbsy : GCCBuiltin<"__builtin_ia32_setssbsy">,
+  def int_x86_setssbsy : ClangBuiltin<"__builtin_ia32_setssbsy">,
       Intrinsic<[], [], []>;
-  def int_x86_clrssbsy : GCCBuiltin<"__builtin_ia32_clrssbsy">,
+  def int_x86_clrssbsy : ClangBuiltin<"__builtin_ia32_clrssbsy">,
       Intrinsic<[], [llvm_ptr_ty], []>;
 }
 
@@ -95,57 +105,57 @@ let TargetPrefix = "x86" in {
 // 3DNow!
 let TargetPrefix = "x86" in {
-  def int_x86_3dnow_pavgusb : GCCBuiltin<"__builtin_ia32_pavgusb">,
+  def int_x86_3dnow_pavgusb : ClangBuiltin<"__builtin_ia32_pavgusb">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pf2id : GCCBuiltin<"__builtin_ia32_pf2id">,
+  def int_x86_3dnow_pf2id : ClangBuiltin<"__builtin_ia32_pf2id">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfacc : GCCBuiltin<"__builtin_ia32_pfacc">,
+  def int_x86_3dnow_pfacc : ClangBuiltin<"__builtin_ia32_pfacc">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfadd : GCCBuiltin<"__builtin_ia32_pfadd">,
+  def int_x86_3dnow_pfadd : ClangBuiltin<"__builtin_ia32_pfadd">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpeq : GCCBuiltin<"__builtin_ia32_pfcmpeq">,
+  def int_x86_3dnow_pfcmpeq : ClangBuiltin<"__builtin_ia32_pfcmpeq">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpge : GCCBuiltin<"__builtin_ia32_pfcmpge">,
+  def int_x86_3dnow_pfcmpge : ClangBuiltin<"__builtin_ia32_pfcmpge">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpgt : GCCBuiltin<"__builtin_ia32_pfcmpgt">,
+  def int_x86_3dnow_pfcmpgt : ClangBuiltin<"__builtin_ia32_pfcmpgt">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfmax : GCCBuiltin<"__builtin_ia32_pfmax">,
+  def int_x86_3dnow_pfmax : ClangBuiltin<"__builtin_ia32_pfmax">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfmin : GCCBuiltin<"__builtin_ia32_pfmin">,
+  def int_x86_3dnow_pfmin : ClangBuiltin<"__builtin_ia32_pfmin">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfmul : GCCBuiltin<"__builtin_ia32_pfmul">,
+  def int_x86_3dnow_pfmul : ClangBuiltin<"__builtin_ia32_pfmul">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrcp : GCCBuiltin<"__builtin_ia32_pfrcp">,
+  def int_x86_3dnow_pfrcp : ClangBuiltin<"__builtin_ia32_pfrcp">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit1 : GCCBuiltin<"__builtin_ia32_pfrcpit1">,
+  def int_x86_3dnow_pfrcpit1 : ClangBuiltin<"__builtin_ia32_pfrcpit1">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit2 : GCCBuiltin<"__builtin_ia32_pfrcpit2">,
+  def int_x86_3dnow_pfrcpit2 : ClangBuiltin<"__builtin_ia32_pfrcpit2">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqrt : GCCBuiltin<"__builtin_ia32_pfrsqrt">,
+  def int_x86_3dnow_pfrsqrt : ClangBuiltin<"__builtin_ia32_pfrsqrt">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqit1 : GCCBuiltin<"__builtin_ia32_pfrsqit1">,
+  def int_x86_3dnow_pfrsqit1 : ClangBuiltin<"__builtin_ia32_pfrsqit1">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfsub : GCCBuiltin<"__builtin_ia32_pfsub">,
+  def int_x86_3dnow_pfsub : ClangBuiltin<"__builtin_ia32_pfsub">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfsubr : GCCBuiltin<"__builtin_ia32_pfsubr">,
+  def int_x86_3dnow_pfsubr : ClangBuiltin<"__builtin_ia32_pfsubr">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pi2fd : GCCBuiltin<"__builtin_ia32_pi2fd">,
+  def int_x86_3dnow_pi2fd : ClangBuiltin<"__builtin_ia32_pi2fd">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pmulhrw : GCCBuiltin<"__builtin_ia32_pmulhrw">,
+  def int_x86_3dnow_pmulhrw : ClangBuiltin<"__builtin_ia32_pmulhrw">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -154,15 +164,15 @@ let TargetPrefix = "x86" in {
 // 3DNow! extensions
 let TargetPrefix = "x86" in {
-  def int_x86_3dnowa_pf2iw : GCCBuiltin<"__builtin_ia32_pf2iw">,
+  def int_x86_3dnowa_pf2iw : ClangBuiltin<"__builtin_ia32_pf2iw">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pfnacc : GCCBuiltin<"__builtin_ia32_pfnacc">,
+  def int_x86_3dnowa_pfnacc : ClangBuiltin<"__builtin_ia32_pfnacc">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pfpnacc : GCCBuiltin<"__builtin_ia32_pfpnacc">,
+  def int_x86_3dnowa_pfpnacc : ClangBuiltin<"__builtin_ia32_pfpnacc">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pi2fw : GCCBuiltin<"__builtin_ia32_pi2fw">,
+  def int_x86_3dnowa_pi2fw : ClangBuiltin<"__builtin_ia32_pi2fw">,
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_3dnowa_pswapd :
       Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
@@ -173,35 +183,35 @@ let TargetPrefix = "x86" in {
 // Arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,
+  def int_x86_sse_rcp_ss : ClangBuiltin<"__builtin_ia32_rcpss">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_rcp_ps : GCCBuiltin<"__builtin_ia32_rcpps">,
+  def int_x86_sse_rcp_ps : ClangBuiltin<"__builtin_ia32_rcpps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_rsqrt_ss : GCCBuiltin<"__builtin_ia32_rsqrtss">,
+  def int_x86_sse_rsqrt_ss : ClangBuiltin<"__builtin_ia32_rsqrtss">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_rsqrt_ps : GCCBuiltin<"__builtin_ia32_rsqrtps">,
+  def int_x86_sse_rsqrt_ps : ClangBuiltin<"__builtin_ia32_rsqrtps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_min_ss : GCCBuiltin<"__builtin_ia32_minss">,
+  def int_x86_sse_min_ss : ClangBuiltin<"__builtin_ia32_minss">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_min_ps : GCCBuiltin<"__builtin_ia32_minps">,
+  def int_x86_sse_min_ps : ClangBuiltin<"__builtin_ia32_minps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_max_ss : GCCBuiltin<"__builtin_ia32_maxss">,
+  def int_x86_sse_max_ss : ClangBuiltin<"__builtin_ia32_maxss">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_max_ps : GCCBuiltin<"__builtin_ia32_maxps">,
+  def int_x86_sse_max_ps : ClangBuiltin<"__builtin_ia32_maxps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
 }
 
 // Comparison ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss">,
+  def int_x86_sse_cmp_ss : ClangBuiltin<"__builtin_ia32_cmpss">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg>]>;
   // NOTE: This comparison intrinsic is not used by clang as long as the
   def int_x86_sse_cmp_ps :
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
               [IntrNoMem, ImmArg>]>;
-  def int_x86_sse_comieq_ss : GCCBuiltin<"__builtin_ia32_comieq">,
+  def int_x86_sse_comieq_ss : ClangBuiltin<"__builtin_ia32_comieq">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_comilt_ss : GCCBuiltin<"__builtin_ia32_comilt">,
+  def int_x86_sse_comilt_ss : ClangBuiltin<"__builtin_ia32_comilt">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_comile_ss : GCCBuiltin<"__builtin_ia32_comile">,
+  def int_x86_sse_comile_ss : ClangBuiltin<"__builtin_ia32_comile">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_comigt_ss : GCCBuiltin<"__builtin_ia32_comigt">,
+  def int_x86_sse_comigt_ss : ClangBuiltin<"__builtin_ia32_comigt">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_comige_ss : GCCBuiltin<"__builtin_ia32_comige">,
+  def int_x86_sse_comige_ss : ClangBuiltin<"__builtin_ia32_comige">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_comineq_ss : GCCBuiltin<"__builtin_ia32_comineq">,
+  def int_x86_sse_comineq_ss : ClangBuiltin<"__builtin_ia32_comineq">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomieq_ss : GCCBuiltin<"__builtin_ia32_ucomieq">,
+  def int_x86_sse_ucomieq_ss : ClangBuiltin<"__builtin_ia32_ucomieq">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomilt_ss : GCCBuiltin<"__builtin_ia32_ucomilt">,
+  def int_x86_sse_ucomilt_ss : ClangBuiltin<"__builtin_ia32_ucomilt">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomile_ss : GCCBuiltin<"__builtin_ia32_ucomile">,
+  def int_x86_sse_ucomile_ss : ClangBuiltin<"__builtin_ia32_ucomile">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomigt_ss : GCCBuiltin<"__builtin_ia32_ucomigt">,
+  def int_x86_sse_ucomigt_ss : ClangBuiltin<"__builtin_ia32_ucomigt">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomige_ss : GCCBuiltin<"__builtin_ia32_ucomige">,
+  def int_x86_sse_ucomige_ss : ClangBuiltin<"__builtin_ia32_ucomige">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_ucomineq_ss : GCCBuiltin<"__builtin_ia32_ucomineq">,
+  def int_x86_sse_ucomineq_ss : ClangBuiltin<"__builtin_ia32_ucomineq">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
 }
 
@@ -250,27 +260,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Conversion ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_cvtss2si : GCCBuiltin<"__builtin_ia32_cvtss2si">,
+  def int_x86_sse_cvtss2si : ClangBuiltin<"__builtin_ia32_cvtss2si">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtss2si64 : GCCBuiltin<"__builtin_ia32_cvtss2si64">,
+  def int_x86_sse_cvtss2si64 : ClangBuiltin<"__builtin_ia32_cvtss2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttss2si : GCCBuiltin<"__builtin_ia32_cvttss2si">,
+  def int_x86_sse_cvttss2si : ClangBuiltin<"__builtin_ia32_cvttss2si">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttss2si64 : GCCBuiltin<"__builtin_ia32_cvttss2si64">,
+  def int_x86_sse_cvttss2si64 : ClangBuiltin<"__builtin_ia32_cvttss2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtps2pi : GCCBuiltin<"__builtin_ia32_cvtps2pi">,
+  def int_x86_sse_cvtps2pi : ClangBuiltin<"__builtin_ia32_cvtps2pi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttps2pi: GCCBuiltin<"__builtin_ia32_cvttps2pi">,
+  def int_x86_sse_cvttps2pi: ClangBuiltin<"__builtin_ia32_cvttps2pi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2ps : GCCBuiltin<"__builtin_ia32_cvtpi2ps">,
+  def int_x86_sse_cvtpi2ps : ClangBuiltin<"__builtin_ia32_cvtpi2ps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
 // Cacheability support ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
+  def int_x86_sse_sfence : ClangBuiltin<"__builtin_ia32_sfence">,
      Intrinsic<[], [], []>;
 }
 
@@ -291,7 +301,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_movmsk_ps : GCCBuiltin<"__builtin_ia32_movmskps">,
+  def int_x86_sse_movmsk_ps : ClangBuiltin<"__builtin_ia32_movmskps">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
 }
 
@@ -300,23 +310,23 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // FP arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,
+  def int_x86_sse2_min_sd : ClangBuiltin<"__builtin_ia32_minsd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_min_pd : GCCBuiltin<"__builtin_ia32_minpd">,
+  def int_x86_sse2_min_pd : ClangBuiltin<"__builtin_ia32_minpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_max_sd : GCCBuiltin<"__builtin_ia32_maxsd">,
+  def int_x86_sse2_max_sd : ClangBuiltin<"__builtin_ia32_maxsd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_max_pd : GCCBuiltin<"__builtin_ia32_maxpd">,
+  def int_x86_sse2_max_pd : ClangBuiltin<"__builtin_ia32_maxpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
 }
 
 // FP comparison ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd">,
+  def int_x86_sse2_cmp_sd : ClangBuiltin<"__builtin_ia32_cmpsd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
               [IntrNoMem, ImmArg>]>;
   // NOTE: This comparison intrinsic is not used by clang as long as the
@@ -324,176 +334,176 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 def int_x86_sse2_cmp_pd :
     Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
               [IntrNoMem, ImmArg>]>;
-  def int_x86_sse2_comieq_sd : GCCBuiltin<"__builtin_ia32_comisdeq">,
+  def int_x86_sse2_comieq_sd : ClangBuiltin<"__builtin_ia32_comisdeq">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_comilt_sd : GCCBuiltin<"__builtin_ia32_comisdlt">,
+  def int_x86_sse2_comilt_sd : ClangBuiltin<"__builtin_ia32_comisdlt">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_comile_sd : GCCBuiltin<"__builtin_ia32_comisdle">,
+  def int_x86_sse2_comile_sd : ClangBuiltin<"__builtin_ia32_comisdle">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_comigt_sd : GCCBuiltin<"__builtin_ia32_comisdgt">,
+  def int_x86_sse2_comigt_sd : ClangBuiltin<"__builtin_ia32_comisdgt">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_comige_sd : GCCBuiltin<"__builtin_ia32_comisdge">,
+  def int_x86_sse2_comige_sd : ClangBuiltin<"__builtin_ia32_comisdge">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_comineq_sd : GCCBuiltin<"__builtin_ia32_comisdneq">,
+  def int_x86_sse2_comineq_sd : ClangBuiltin<"__builtin_ia32_comisdneq">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomieq_sd : GCCBuiltin<"__builtin_ia32_ucomisdeq">,
+  def int_x86_sse2_ucomieq_sd : ClangBuiltin<"__builtin_ia32_ucomisdeq">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomilt_sd : GCCBuiltin<"__builtin_ia32_ucomisdlt">,
+  def int_x86_sse2_ucomilt_sd : ClangBuiltin<"__builtin_ia32_ucomisdlt">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomile_sd : GCCBuiltin<"__builtin_ia32_ucomisdle">,
+  def int_x86_sse2_ucomile_sd : ClangBuiltin<"__builtin_ia32_ucomisdle">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomigt_sd : GCCBuiltin<"__builtin_ia32_ucomisdgt">,
+  def int_x86_sse2_ucomigt_sd : ClangBuiltin<"__builtin_ia32_ucomisdgt">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomige_sd : GCCBuiltin<"__builtin_ia32_ucomisdge">,
+  def int_x86_sse2_ucomige_sd : ClangBuiltin<"__builtin_ia32_ucomisdge">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_ucomineq_sd : GCCBuiltin<"__builtin_ia32_ucomisdneq">,
+  def int_x86_sse2_ucomineq_sd : ClangBuiltin<"__builtin_ia32_ucomisdneq">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
 }
 
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
+  def int_x86_sse2_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
+  def int_x86_sse2_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
+  def int_x86_sse2_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">,
+  def int_x86_sse2_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb128">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">,
+  def int_x86_sse2_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
+  def int_x86_sse2_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw128">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
               [IntrNoMem, Commutative]>;
 }
 
 // Integer shift ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_psll_w : GCCBuiltin<"__builtin_ia32_psllw128">,
+  def int_x86_sse2_psll_w : ClangBuiltin<"__builtin_ia32_psllw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_psll_d : GCCBuiltin<"__builtin_ia32_pslld128">,
+  def int_x86_sse2_psll_d : ClangBuiltin<"__builtin_ia32_pslld128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psll_q : GCCBuiltin<"__builtin_ia32_psllq128">,
+  def int_x86_sse2_psll_q : ClangBuiltin<"__builtin_ia32_psllq128">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw128">,
+  def int_x86_sse2_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld128">,
+  def int_x86_sse2_psrl_d : ClangBuiltin<"__builtin_ia32_psrld128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq128">,
+  def int_x86_sse2_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq128">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_sse2_psra_w : GCCBuiltin<"__builtin_ia32_psraw128">,
+  def int_x86_sse2_psra_w : ClangBuiltin<"__builtin_ia32_psraw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_psra_d : GCCBuiltin<"__builtin_ia32_psrad128">,
+  def int_x86_sse2_psra_d : ClangBuiltin<"__builtin_ia32_psrad128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
 
   // Oddly these don't require an immediate due to a gcc compatibility issue.
-  def int_x86_sse2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi128">,
+  def int_x86_sse2_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi128">,
+  def int_x86_sse2_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi128">,
+  def int_x86_sse2_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi128">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi128">,
+  def int_x86_sse2_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi128">,
+  def int_x86_sse2_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi128">,
+  def int_x86_sse2_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi128">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi128">,
+  def int_x86_sse2_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi128">,
+  def int_x86_sse2_psrai_d : ClangBuiltin<"__builtin_ia32_psradi128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
 }
 
 // Conversion ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_cvtpd2dq : GCCBuiltin<"__builtin_ia32_cvtpd2dq">,
+  def int_x86_sse2_cvtpd2dq : ClangBuiltin<"__builtin_ia32_cvtpd2dq">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvttpd2dq : GCCBuiltin<"__builtin_ia32_cvttpd2dq">,
+  def int_x86_sse2_cvttpd2dq : ClangBuiltin<"__builtin_ia32_cvttpd2dq">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvtpd2ps : GCCBuiltin<"__builtin_ia32_cvtpd2ps">,
+  def int_x86_sse2_cvtpd2ps : ClangBuiltin<"__builtin_ia32_cvtpd2ps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
+  def int_x86_sse2_cvtps2dq : ClangBuiltin<"__builtin_ia32_cvtps2dq">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
+  def int_x86_sse2_cvttps2dq : ClangBuiltin<"__builtin_ia32_cvttps2dq">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
+  def int_x86_sse2_cvtsd2si : ClangBuiltin<"__builtin_ia32_cvtsd2si">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
+  def int_x86_sse2_cvtsd2si64 : ClangBuiltin<"__builtin_ia32_cvtsd2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvttsd2si : GCCBuiltin<"__builtin_ia32_cvttsd2si">,
+  def int_x86_sse2_cvttsd2si : ClangBuiltin<"__builtin_ia32_cvttsd2si">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_cvttsd2si64">,
+  def int_x86_sse2_cvttsd2si64 : ClangBuiltin<"__builtin_ia32_cvttsd2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_cvtsd2ss : GCCBuiltin<"__builtin_ia32_cvtsd2ss">,
+  def int_x86_sse2_cvtsd2ss : ClangBuiltin<"__builtin_ia32_cvtsd2ss">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpd2pi : GCCBuiltin<"__builtin_ia32_cvtpd2pi">,
+  def int_x86_sse_cvtpd2pi : ClangBuiltin<"__builtin_ia32_cvtpd2pi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttpd2pi: GCCBuiltin<"__builtin_ia32_cvttpd2pi">,
+  def int_x86_sse_cvttpd2pi: ClangBuiltin<"__builtin_ia32_cvttpd2pi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2pd : GCCBuiltin<"__builtin_ia32_cvtpi2pd">,
+  def int_x86_sse_cvtpi2pd : ClangBuiltin<"__builtin_ia32_cvtpi2pd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
+  def int_x86_sse2_packsswb_128 : ClangBuiltin<"__builtin_ia32_packsswb128">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">,
+  def int_x86_sse2_packssdw_128 : ClangBuiltin<"__builtin_ia32_packssdw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
+  def int_x86_sse2_packuswb_128 : ClangBuiltin<"__builtin_ia32_packuswb128">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
+  def int_x86_sse2_movmsk_pd : ClangBuiltin<"__builtin_ia32_movmskpd">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">,
+  def int_x86_sse2_pmovmskb_128 : ClangBuiltin<"__builtin_ia32_pmovmskb128">,
      Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse2_maskmov_dqu : GCCBuiltin<"__builtin_ia32_maskmovdqu">,
+  def int_x86_sse2_maskmov_dqu : ClangBuiltin<"__builtin_ia32_maskmovdqu">,
      Intrinsic<[], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_ptr_ty], []>;
-  def int_x86_sse2_clflush : GCCBuiltin<"__builtin_ia32_clflush">,
+  def int_x86_sse2_clflush : ClangBuiltin<"__builtin_ia32_clflush">,
      Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_sse2_lfence : GCCBuiltin<"__builtin_ia32_lfence">,
+  def int_x86_sse2_lfence : ClangBuiltin<"__builtin_ia32_lfence">,
      Intrinsic<[], [], []>;
-  def int_x86_sse2_mfence : GCCBuiltin<"__builtin_ia32_mfence">,
+  def int_x86_sse2_mfence : ClangBuiltin<"__builtin_ia32_mfence">,
      Intrinsic<[], [], []>;
-  def int_x86_sse2_pause : GCCBuiltin<"__builtin_ia32_pause">,
+  def int_x86_sse2_pause : ClangBuiltin<"__builtin_ia32_pause">,
      Intrinsic<[], [], []>;
 }
 
@@ -502,42 +512,42 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Addition / subtraction ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse3_addsub_ps : GCCBuiltin<"__builtin_ia32_addsubps">,
+  def int_x86_sse3_addsub_ps : ClangBuiltin<"__builtin_ia32_addsubps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse3_addsub_pd : GCCBuiltin<"__builtin_ia32_addsubpd">,
+  def int_x86_sse3_addsub_pd : ClangBuiltin<"__builtin_ia32_addsubpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
 }
 
 // Horizontal ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse3_hadd_ps : GCCBuiltin<"__builtin_ia32_haddps">,
+  def int_x86_sse3_hadd_ps : ClangBuiltin<"__builtin_ia32_haddps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse3_hadd_pd : GCCBuiltin<"__builtin_ia32_haddpd">,
+  def int_x86_sse3_hadd_pd : ClangBuiltin<"__builtin_ia32_haddpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse3_hsub_ps : GCCBuiltin<"__builtin_ia32_hsubps">,
+  def int_x86_sse3_hsub_ps : ClangBuiltin<"__builtin_ia32_hsubps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse3_hsub_pd : GCCBuiltin<"__builtin_ia32_hsubpd">,
+  def int_x86_sse3_hsub_pd : ClangBuiltin<"__builtin_ia32_hsubpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
 }
 
 // Specialized unaligned load.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse3_ldu_dq : GCCBuiltin<"__builtin_ia32_lddqu">,
+  def int_x86_sse3_ldu_dq : ClangBuiltin<"__builtin_ia32_lddqu">,
      Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
 // Thread synchronization ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse3_monitor : GCCBuiltin<"__builtin_ia32_monitor">,
+  def int_x86_sse3_monitor : ClangBuiltin<"__builtin_ia32_monitor">,
      Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
-  def int_x86_sse3_mwait : GCCBuiltin<"__builtin_ia32_mwait">,
+  def int_x86_sse3_mwait : ClangBuiltin<"__builtin_ia32_mwait">,
      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
 }
 
@@ -547,112 +557,112 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw">,
+  def int_x86_ssse3_phadd_w : ClangBuiltin<"__builtin_ia32_phaddw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_w_128 : GCCBuiltin<"__builtin_ia32_phaddw128">,
+  def int_x86_ssse3_phadd_w_128 : ClangBuiltin<"__builtin_ia32_phaddw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd">,
+  def int_x86_ssse3_phadd_d : ClangBuiltin<"__builtin_ia32_phaddd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_d_128 : GCCBuiltin<"__builtin_ia32_phaddd128">,
+  def int_x86_ssse3_phadd_d_128 : ClangBuiltin<"__builtin_ia32_phaddd128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw">,
+  def int_x86_ssse3_phadd_sw : ClangBuiltin<"__builtin_ia32_phaddsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_sw_128 : GCCBuiltin<"__builtin_ia32_phaddsw128">,
+  def int_x86_ssse3_phadd_sw_128 : ClangBuiltin<"__builtin_ia32_phaddsw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw">,
+  def int_x86_ssse3_phsub_w : ClangBuiltin<"__builtin_ia32_phsubw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_w_128 : GCCBuiltin<"__builtin_ia32_phsubw128">,
+  def int_x86_ssse3_phsub_w_128 : ClangBuiltin<"__builtin_ia32_phsubw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd">,
+  def int_x86_ssse3_phsub_d : ClangBuiltin<"__builtin_ia32_phsubd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_d_128 : GCCBuiltin<"__builtin_ia32_phsubd128">,
+  def int_x86_ssse3_phsub_d_128 : ClangBuiltin<"__builtin_ia32_phsubd128">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw">,
+  def int_x86_ssse3_phsub_sw : ClangBuiltin<"__builtin_ia32_phsubsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_sw_128 : GCCBuiltin<"__builtin_ia32_phsubsw128">,
+  def int_x86_ssse3_phsub_sw_128 : ClangBuiltin<"__builtin_ia32_phsubsw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw">,
+  def int_x86_ssse3_pmadd_ub_sw : ClangBuiltin<"__builtin_ia32_pmaddubsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pmadd_ub_sw_128 : GCCBuiltin<"__builtin_ia32_pmaddubsw128">,
+  def int_x86_ssse3_pmadd_ub_sw_128 : ClangBuiltin<"__builtin_ia32_pmaddubsw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
 }
 
 // Packed multiply high with round and scale
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw">,
+  def int_x86_ssse3_pmul_hr_sw : ClangBuiltin<"__builtin_ia32_pmulhrsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
               [IntrNoMem, Commutative]>;
-  def int_x86_ssse3_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128">,
+  def int_x86_ssse3_pmul_hr_sw_128 : ClangBuiltin<"__builtin_ia32_pmulhrsw128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
               [IntrNoMem, Commutative]>;
 }
 
 // Shuffle ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb">,
+  def int_x86_ssse3_pshuf_b : ClangBuiltin<"__builtin_ia32_pshufb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pshuf_b_128 : GCCBuiltin<"__builtin_ia32_pshufb128">,
+  def int_x86_ssse3_pshuf_b_128 : ClangBuiltin<"__builtin_ia32_pshufb128">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse_pshuf_w : GCCBuiltin<"__builtin_ia32_pshufw">,
+  def int_x86_sse_pshuf_w : ClangBuiltin<"__builtin_ia32_pshufw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
               [IntrNoMem, ImmArg>]>;
 }
 
 // Sign ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_psign_b : GCCBuiltin<"__builtin_ia32_psignb">,
+  def int_x86_ssse3_psign_b : ClangBuiltin<"__builtin_ia32_psignb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_b_128 : GCCBuiltin<"__builtin_ia32_psignb128">,
+  def int_x86_ssse3_psign_b_128 : ClangBuiltin<"__builtin_ia32_psignb128">,
     Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_w : GCCBuiltin<"__builtin_ia32_psignw">,
+  def int_x86_ssse3_psign_w : ClangBuiltin<"__builtin_ia32_psignw">,
     Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_w_128 : GCCBuiltin<"__builtin_ia32_psignw128">,
+  def int_x86_ssse3_psign_w_128 : ClangBuiltin<"__builtin_ia32_psignw128">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_d : GCCBuiltin<"__builtin_ia32_psignd">,
+  def int_x86_ssse3_psign_d : ClangBuiltin<"__builtin_ia32_psignd">,
    Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_d_128 : GCCBuiltin<"__builtin_ia32_psignd128">,
+  def int_x86_ssse3_psign_d_128 : ClangBuiltin<"__builtin_ia32_psignd128">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
 }
 
 // Absolute value ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pabs_b : GCCBuiltin<"__builtin_ia32_pabsb">,
+  def int_x86_ssse3_pabs_b : ClangBuiltin<"__builtin_ia32_pabsb">,
    Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pabs_w : GCCBuiltin<"__builtin_ia32_pabsw">,
+  def int_x86_ssse3_pabs_w : ClangBuiltin<"__builtin_ia32_pabsw">,
    Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pabs_d : GCCBuiltin<"__builtin_ia32_pabsd">,
+  def int_x86_ssse3_pabs_d : ClangBuiltin<"__builtin_ia32_pabsd">,
    Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -661,149 +671,149 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // FP rounding ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">,
+  def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">,
+  def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">,
+  def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">,
    Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">,
+  def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">,
    Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg>]>;
 }
 
 // Vector min element
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">,
+  def int_x86_sse41_phminposuw : ClangBuiltin<"__builtin_ia32_phminposuw128">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
 }
 
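The sse41 rounding intrinsics a few hunks up take their rounding control as an ImmArg operand, so it must be a compile-time constant (an immediate of 0 selects round-to-nearest; the exact ImmArg argument indices are elided in the text above). A minimal sketch, outside the patch proper:

// Hedged sketch, not part of the patch: emit llvm.x86.sse41.round.ps with a
// constant rounding-control immediate, as required by ImmArg.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"

llvm::Value *emitRoundPSNearest(llvm::IRBuilder<> &B, llvm::Module &M,
                                llvm::Value *V4F32) {
  llvm::Function *Round = llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::x86_sse41_round_ps);
  return B.CreateCall(Round, {V4F32, B.getInt32(0)});
}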
 // Advanced Encryption Standard (AES) Instructions
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_aesni_aesimc : GCCBuiltin<"__builtin_ia32_aesimc128">,
+  def int_x86_aesni_aesimc : ClangBuiltin<"__builtin_ia32_aesimc128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesenc : GCCBuiltin<"__builtin_ia32_aesenc128">,
+  def int_x86_aesni_aesenc : ClangBuiltin<"__builtin_ia32_aesenc128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesenc_256 : GCCBuiltin<"__builtin_ia32_aesenc256">,
+  def int_x86_aesni_aesenc_256 : ClangBuiltin<"__builtin_ia32_aesenc256">,
    Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesenc_512 : GCCBuiltin<"__builtin_ia32_aesenc512">,
+  def int_x86_aesni_aesenc_512 : ClangBuiltin<"__builtin_ia32_aesenc512">,
    Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesenclast : GCCBuiltin<"__builtin_ia32_aesenclast128">,
+  def int_x86_aesni_aesenclast : ClangBuiltin<"__builtin_ia32_aesenclast128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
   def int_x86_aesni_aesenclast_256 :
-      GCCBuiltin<"__builtin_ia32_aesenclast256">,
+      ClangBuiltin<"__builtin_ia32_aesenclast256">,
    Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
   def int_x86_aesni_aesenclast_512 :
-      GCCBuiltin<"__builtin_ia32_aesenclast512">,
+      ClangBuiltin<"__builtin_ia32_aesenclast512">,
    Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesdec : GCCBuiltin<"__builtin_ia32_aesdec128">,
+  def int_x86_aesni_aesdec : ClangBuiltin<"__builtin_ia32_aesdec128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesdec_256 : GCCBuiltin<"__builtin_ia32_aesdec256">,
+  def int_x86_aesni_aesdec_256 : ClangBuiltin<"__builtin_ia32_aesdec256">,
    Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesdec_512 : GCCBuiltin<"__builtin_ia32_aesdec512">,
+  def int_x86_aesni_aesdec_512 : ClangBuiltin<"__builtin_ia32_aesdec512">,
    Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_aesni_aesdeclast : GCCBuiltin<"__builtin_ia32_aesdeclast128">,
+  def int_x86_aesni_aesdeclast : ClangBuiltin<"__builtin_ia32_aesdeclast128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
   def int_x86_aesni_aesdeclast_256 :
-      GCCBuiltin<"__builtin_ia32_aesdeclast256">,
+      ClangBuiltin<"__builtin_ia32_aesdeclast256">,
    Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
   def int_x86_aesni_aesdeclast_512 :
-      GCCBuiltin<"__builtin_ia32_aesdeclast512">,
+      ClangBuiltin<"__builtin_ia32_aesdeclast512">,
    Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
   def int_x86_aesni_aeskeygenassist :
-      GCCBuiltin<"__builtin_ia32_aeskeygenassist128">,
+      ClangBuiltin<"__builtin_ia32_aeskeygenassist128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
 }
 
 // PCLMUL instructions
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">,
+  def int_x86_pclmulqdq : ClangBuiltin<"__builtin_ia32_pclmulqdq128">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_pclmulqdq_256 : GCCBuiltin<"__builtin_ia32_pclmulqdq256">,
+  def int_x86_pclmulqdq_256 : ClangBuiltin<"__builtin_ia32_pclmulqdq256">,
    Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_pclmulqdq_512 : GCCBuiltin<"__builtin_ia32_pclmulqdq512">,
+  def int_x86_pclmulqdq_512 : ClangBuiltin<"__builtin_ia32_pclmulqdq512">,
    Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
 }
 
 // Vector pack
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
+  def int_x86_sse41_packusdw : ClangBuiltin<"__builtin_ia32_packusdw128">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
 }
 
 // Vector insert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
+  def int_x86_sse41_insertps : ClangBuiltin<"__builtin_ia32_insertps128">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
 }
 
 // Vector blend
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb128">,
+  def int_x86_sse41_pblendvb : ClangBuiltin<"__builtin_ia32_pblendvb128">,
    Intrinsic<[llvm_v16i8_ty],
              [llvm_v16i8_ty, llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse41_blendvpd : GCCBuiltin<"__builtin_ia32_blendvpd">,
+  def int_x86_sse41_blendvpd : ClangBuiltin<"__builtin_ia32_blendvpd">,
    Intrinsic<[llvm_v2f64_ty],
              [llvm_v2f64_ty, llvm_v2f64_ty,llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse41_blendvps : GCCBuiltin<"__builtin_ia32_blendvps">,
+  def int_x86_sse41_blendvps : ClangBuiltin<"__builtin_ia32_blendvps">,
    Intrinsic<[llvm_v4f32_ty],
              [llvm_v4f32_ty, llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>;
 }
 
 // Vector dot product
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_dppd : GCCBuiltin<"__builtin_ia32_dppd">,
+  def int_x86_sse41_dppd : ClangBuiltin<"__builtin_ia32_dppd">,
    Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
              [IntrNoMem, Commutative, ImmArg>]>;
-  def int_x86_sse41_dpps : GCCBuiltin<"__builtin_ia32_dpps">,
+  def int_x86_sse41_dpps : ClangBuiltin<"__builtin_ia32_dpps">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
              [IntrNoMem, Commutative, ImmArg>]>;
 }
 
 // Vector sum of absolute differences
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
+  def int_x86_sse41_mpsadbw : ClangBuiltin<"__builtin_ia32_mpsadbw128">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
 }
 
 // Test instruction with bitwise comparison.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">,
+  def int_x86_sse41_ptestz : ClangBuiltin<"__builtin_ia32_ptestz128">,
    Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_sse41_ptestc : GCCBuiltin<"__builtin_ia32_ptestc128">,
+  def int_x86_sse41_ptestc : ClangBuiltin<"__builtin_ia32_ptestc128">,
    Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-  def int_x86_sse41_ptestnzc : GCCBuiltin<"__builtin_ia32_ptestnzc128">,
+  def int_x86_sse41_ptestnzc : ClangBuiltin<"__builtin_ia32_ptestnzc128">,
    Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 }
 
@@ -814,81 +824,81 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Miscellaneous
 // CRC Instruction
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse42_crc32_32_8 : GCCBuiltin<"__builtin_ia32_crc32qi">,
+  def int_x86_sse42_crc32_32_8 : ClangBuiltin<"__builtin_ia32_crc32qi">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_sse42_crc32_32_16 : GCCBuiltin<"__builtin_ia32_crc32hi">,
+  def int_x86_sse42_crc32_32_16 : ClangBuiltin<"__builtin_ia32_crc32hi">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_sse42_crc32_32_32 : GCCBuiltin<"__builtin_ia32_crc32si">,
+  def int_x86_sse42_crc32_32_32 : ClangBuiltin<"__builtin_ia32_crc32si">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse42_crc32_64_64 : GCCBuiltin<"__builtin_ia32_crc32di">,
+  def int_x86_sse42_crc32_64_64 : ClangBuiltin<"__builtin_ia32_crc32di">,
    Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
 }
 
 // String/text processing ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse42_pcmpistrm128 : GCCBuiltin<"__builtin_ia32_pcmpistrm128">,
+  def int_x86_sse42_pcmpistrm128 : ClangBuiltin<"__builtin_ia32_pcmpistrm128">,
    Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistri128 : GCCBuiltin<"__builtin_ia32_pcmpistri128">,
+  def int_x86_sse42_pcmpistri128 : ClangBuiltin<"__builtin_ia32_pcmpistri128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistria128 : GCCBuiltin<"__builtin_ia32_pcmpistria128">,
+  def int_x86_sse42_pcmpistria128 : ClangBuiltin<"__builtin_ia32_pcmpistria128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistric128 : GCCBuiltin<"__builtin_ia32_pcmpistric128">,
+  def int_x86_sse42_pcmpistric128 : ClangBuiltin<"__builtin_ia32_pcmpistric128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistrio128 : GCCBuiltin<"__builtin_ia32_pcmpistrio128">,
+  def int_x86_sse42_pcmpistrio128 : ClangBuiltin<"__builtin_ia32_pcmpistrio128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistris128 : GCCBuiltin<"__builtin_ia32_pcmpistris128">,
+  def int_x86_sse42_pcmpistris128 : ClangBuiltin<"__builtin_ia32_pcmpistris128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpistriz128 : GCCBuiltin<"__builtin_ia32_pcmpistriz128">,
+  def int_x86_sse42_pcmpistriz128 : ClangBuiltin<"__builtin_ia32_pcmpistriz128">,
    Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestrm128 : GCCBuiltin<"__builtin_ia32_pcmpestrm128">,
+  def int_x86_sse42_pcmpestrm128 : ClangBuiltin<"__builtin_ia32_pcmpestrm128">,
    Intrinsic<[llvm_v16i8_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestri128 : GCCBuiltin<"__builtin_ia32_pcmpestri128">,
+  def int_x86_sse42_pcmpestri128 : ClangBuiltin<"__builtin_ia32_pcmpestri128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestria128 : GCCBuiltin<"__builtin_ia32_pcmpestria128">,
+  def int_x86_sse42_pcmpestria128 : ClangBuiltin<"__builtin_ia32_pcmpestria128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestric128 : GCCBuiltin<"__builtin_ia32_pcmpestric128">,
+  def int_x86_sse42_pcmpestric128 : ClangBuiltin<"__builtin_ia32_pcmpestric128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
             [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestrio128 : GCCBuiltin<"__builtin_ia32_pcmpestrio128">,
+  def int_x86_sse42_pcmpestrio128 : ClangBuiltin<"__builtin_ia32_pcmpestrio128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestris128 : GCCBuiltin<"__builtin_ia32_pcmpestris128">,
+  def int_x86_sse42_pcmpestris128 : ClangBuiltin<"__builtin_ia32_pcmpestris128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_sse42_pcmpestriz128 : GCCBuiltin<"__builtin_ia32_pcmpestriz128">,
+  def int_x86_sse42_pcmpestriz128 : ClangBuiltin<"__builtin_ia32_pcmpestriz128">,
    Intrinsic<[llvm_i32_ty],
              [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty],
@@ -899,17 +909,17 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // SSE4A
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse4a_extrqi : GCCBuiltin<"__builtin_ia32_extrqi">,
+  def int_x86_sse4a_extrqi : ClangBuiltin<"__builtin_ia32_extrqi">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg>, ImmArg>]>;
-  def int_x86_sse4a_extrq : GCCBuiltin<"__builtin_ia32_extrq">,
+  def int_x86_sse4a_extrq : ClangBuiltin<"__builtin_ia32_extrq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse4a_insertqi : GCCBuiltin<"__builtin_ia32_insertqi">,
+  def int_x86_sse4a_insertqi : ClangBuiltin<"__builtin_ia32_insertqi">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty,
               llvm_i8_ty], [IntrNoMem, ImmArg>, ImmArg>]>;
-  def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">,
+  def int_x86_sse4a_insertq : ClangBuiltin<"__builtin_ia32_insertq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 }
 
@@ -918,177 +928,177 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx_addsub_pd_256 : GCCBuiltin<"__builtin_ia32_addsubpd256">,
+  def int_x86_avx_addsub_pd_256 : ClangBuiltin<"__builtin_ia32_addsubpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_addsub_ps_256 : GCCBuiltin<"__builtin_ia32_addsubps256">,
+  def int_x86_avx_addsub_ps_256 : ClangBuiltin<"__builtin_ia32_addsubps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256">,
+  def int_x86_avx_max_pd_256 : ClangBuiltin<"__builtin_ia32_maxpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256">,
+  def int_x86_avx_max_ps_256 : ClangBuiltin<"__builtin_ia32_maxps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256">,
+  def int_x86_avx_min_pd_256 : ClangBuiltin<"__builtin_ia32_minpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256">,
+  def int_x86_avx_min_ps_256 : ClangBuiltin<"__builtin_ia32_minps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
+  def int_x86_avx_rsqrt_ps_256 : ClangBuiltin<"__builtin_ia32_rsqrtps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_rcp_ps_256 : GCCBuiltin<"__builtin_ia32_rcpps256">,
+  def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">,
+  def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg>]>;
-  def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">,
+  def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // Horizontal ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_hadd_pd_256 : GCCBuiltin<"__builtin_ia32_haddpd256">, + def int_x86_avx_hadd_pd_256 : ClangBuiltin<"__builtin_ia32_haddpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_hsub_ps_256 : GCCBuiltin<"__builtin_ia32_hsubps256">, + def int_x86_avx_hsub_ps_256 : ClangBuiltin<"__builtin_ia32_hsubps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_hsub_pd_256 : GCCBuiltin<"__builtin_ia32_hsubpd256">, + def int_x86_avx_hsub_pd_256 : ClangBuiltin<"__builtin_ia32_hsubpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_hadd_ps_256 : GCCBuiltin<"__builtin_ia32_haddps256">, + def int_x86_avx_hadd_ps_256 : ClangBuiltin<"__builtin_ia32_haddps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; } // Vector permutation let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vpermilvar_pd : GCCBuiltin<"__builtin_ia32_vpermilvarpd">, + def int_x86_avx_vpermilvar_pd : ClangBuiltin<"__builtin_ia32_vpermilvarpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx_vpermilvar_ps : GCCBuiltin<"__builtin_ia32_vpermilvarps">, + def int_x86_avx_vpermilvar_ps : ClangBuiltin<"__builtin_ia32_vpermilvarps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx_vpermilvar_pd_256 : - GCCBuiltin<"__builtin_ia32_vpermilvarpd256">, + ClangBuiltin<"__builtin_ia32_vpermilvarpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx_vpermilvar_ps_256 : - GCCBuiltin<"__builtin_ia32_vpermilvarps256">, + ClangBuiltin<"__builtin_ia32_vpermilvarps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_128 : - GCCBuiltin<"__builtin_ia32_vpermi2vard128">, + ClangBuiltin<"__builtin_ia32_vpermi2vard128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_256 : - GCCBuiltin<"__builtin_ia32_vpermi2vard256">, + ClangBuiltin<"__builtin_ia32_vpermi2vard256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_512 : - GCCBuiltin<"__builtin_ia32_vpermi2vard512">, + ClangBuiltin<"__builtin_ia32_vpermi2vard512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_128 : - GCCBuiltin<"__builtin_ia32_vpermi2varhi128">, + ClangBuiltin<"__builtin_ia32_vpermi2varhi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_256 : - GCCBuiltin<"__builtin_ia32_vpermi2varhi256">, + ClangBuiltin<"__builtin_ia32_vpermi2varhi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_512 : - GCCBuiltin<"__builtin_ia32_vpermi2varhi512">, + ClangBuiltin<"__builtin_ia32_vpermi2varhi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_128 : - GCCBuiltin<"__builtin_ia32_vpermi2varpd128">, + ClangBuiltin<"__builtin_ia32_vpermi2varpd128">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, 
llvm_v2i64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_256 : - GCCBuiltin<"__builtin_ia32_vpermi2varpd256">, + ClangBuiltin<"__builtin_ia32_vpermi2varpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_512 : - GCCBuiltin<"__builtin_ia32_vpermi2varpd512">, + ClangBuiltin<"__builtin_ia32_vpermi2varpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_128 : - GCCBuiltin<"__builtin_ia32_vpermi2varps128">, + ClangBuiltin<"__builtin_ia32_vpermi2varps128">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_256 : - GCCBuiltin<"__builtin_ia32_vpermi2varps256">, + ClangBuiltin<"__builtin_ia32_vpermi2varps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_512 : - GCCBuiltin<"__builtin_ia32_vpermi2varps512">, + ClangBuiltin<"__builtin_ia32_vpermi2varps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_128 : - GCCBuiltin<"__builtin_ia32_vpermi2varq128">, + ClangBuiltin<"__builtin_ia32_vpermi2varq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_256 : - GCCBuiltin<"__builtin_ia32_vpermi2varq256">, + ClangBuiltin<"__builtin_ia32_vpermi2varq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_512 : - GCCBuiltin<"__builtin_ia32_vpermi2varq512">, + ClangBuiltin<"__builtin_ia32_vpermi2varq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_qi_128 : - GCCBuiltin<"__builtin_ia32_vpermi2varqi128">, + ClangBuiltin<"__builtin_ia32_vpermi2varqi128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_qi_256 : - GCCBuiltin<"__builtin_ia32_vpermi2varqi256">, + ClangBuiltin<"__builtin_ia32_vpermi2varqi256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_qi_512 : - GCCBuiltin<"__builtin_ia32_vpermi2varqi512">, + ClangBuiltin<"__builtin_ia32_vpermi2varqi512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_vpermilvar_pd_512 : - GCCBuiltin<"__builtin_ia32_vpermilvarpd512">, + ClangBuiltin<"__builtin_ia32_vpermilvarpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermilvar_ps_512 : - GCCBuiltin<"__builtin_ia32_vpermilvarps512">, + ClangBuiltin<"__builtin_ia32_vpermilvarps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_pshuf_b_512 : - GCCBuiltin<"__builtin_ia32_pshufb512">, + ClangBuiltin<"__builtin_ia32_pshufb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; @@ -1097,49 +1107,49 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // GFNI Instructions let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
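(The vpermi2var family above implements two-source permutes: an index vector selects elements from either of two source tables. Clang's <immintrin.h> wrapper for the 512-bit dword form is _mm512_permutex2var_epi32; a sketch assuming -mavx512f, not part of the patch:

    #include <immintrin.h>

    /* Each idx lane picks from a (table-select bit clear) or b (set). */
    __m512i pick2(__m512i a, __m512i idx, __m512i b) {
      return _mm512_permutex2var_epi32(a, idx, b);
    }
)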
def int_x86_vgf2p8affineinvqb_128 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v16qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v16qi">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8affineinvqb_256 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v32qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v32qi">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8affineinvqb_512 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v64qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v64qi">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8affineqb_128 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v16qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineqb_v16qi">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8affineqb_256 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v32qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineqb_v32qi">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8affineqb_512 : - GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v64qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8affineqb_v64qi">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_vgf2p8mulb_128 : - GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v16qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8mulb_v16qi">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_vgf2p8mulb_256 : - GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v32qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8mulb_v32qi">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_vgf2p8mulb_512 : - GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v64qi">, + ClangBuiltin<"__builtin_ia32_vgf2p8mulb_v64qi">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; @@ -1147,17 +1157,17 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector blend let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_blendv_pd_256 : GCCBuiltin<"__builtin_ia32_blendvpd256">, + def int_x86_avx_blendv_pd_256 : ClangBuiltin<"__builtin_ia32_blendvpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_blendv_ps_256 : GCCBuiltin<"__builtin_ia32_blendvps256">, + def int_x86_avx_blendv_ps_256 : ClangBuiltin<"__builtin_ia32_blendvps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; } // Vector dot product let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_dp_ps_256 : GCCBuiltin<"__builtin_ia32_dpps256">, + def int_x86_avx_dp_ps_256 : ClangBuiltin<"__builtin_ia32_dpps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, Commutative, ImmArg>]>; @@ -1175,63 +1185,63 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector convert let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
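(The GFNI defs above cover affine transforms and multiplies in GF(2^8) over the AES reduction polynomial. The vgf2p8mulb builtin named above is reachable through _mm_gf2p8mul_epi8; a sketch assuming -mgfni -msse2, not part of the patch:

    #include <immintrin.h>

    /* Lane-wise GF(2^8) multiply, polynomial x^8 + x^4 + x^3 + x + 1. */
    __m128i gf_mul(__m128i a, __m128i b) {
      return _mm_gf2p8mul_epi8(a, b);  /* llvm.x86.vgf2p8mulb.128 */
    }
)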
- def int_x86_avx_cvt_pd2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtpd2ps256">, + def int_x86_avx_cvt_pd2_ps_256 : ClangBuiltin<"__builtin_ia32_cvtpd2ps256">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">, + def int_x86_avx_cvt_ps2dq_256 : ClangBuiltin<"__builtin_ia32_cvtps2dq256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">, + def int_x86_avx_cvtt_pd2dq_256 : ClangBuiltin<"__builtin_ia32_cvttpd2dq256">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">, + def int_x86_avx_cvt_pd2dq_256 : ClangBuiltin<"__builtin_ia32_cvtpd2dq256">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">, + def int_x86_avx_cvtt_ps2dq_256 : ClangBuiltin<"__builtin_ia32_cvttps2dq256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; } // Vector bit test let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vtestz_pd : GCCBuiltin<"__builtin_ia32_vtestzpd">, + def int_x86_avx_vtestz_pd : ClangBuiltin<"__builtin_ia32_vtestzpd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestc_pd : GCCBuiltin<"__builtin_ia32_vtestcpd">, + def int_x86_avx_vtestc_pd : ClangBuiltin<"__builtin_ia32_vtestcpd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestnzc_pd : GCCBuiltin<"__builtin_ia32_vtestnzcpd">, + def int_x86_avx_vtestnzc_pd : ClangBuiltin<"__builtin_ia32_vtestnzcpd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestz_ps : GCCBuiltin<"__builtin_ia32_vtestzps">, + def int_x86_avx_vtestz_ps : ClangBuiltin<"__builtin_ia32_vtestzps">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx_vtestc_ps : GCCBuiltin<"__builtin_ia32_vtestcps">, + def int_x86_avx_vtestc_ps : ClangBuiltin<"__builtin_ia32_vtestcps">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx_vtestnzc_ps : GCCBuiltin<"__builtin_ia32_vtestnzcps">, + def int_x86_avx_vtestnzc_ps : ClangBuiltin<"__builtin_ia32_vtestnzcps">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx_vtestz_pd_256 : GCCBuiltin<"__builtin_ia32_vtestzpd256">, + def int_x86_avx_vtestz_pd_256 : ClangBuiltin<"__builtin_ia32_vtestzpd256">, Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestcpd256">, + def int_x86_avx_vtestc_pd_256 : ClangBuiltin<"__builtin_ia32_vtestcpd256">, Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestnzc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestnzcpd256">, + def int_x86_avx_vtestnzc_pd_256 : ClangBuiltin<"__builtin_ia32_vtestnzcpd256">, Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_vtestz_ps_256 : GCCBuiltin<"__builtin_ia32_vtestzps256">, + def int_x86_avx_vtestz_ps_256 : ClangBuiltin<"__builtin_ia32_vtestzps256">, Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_vtestc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestcps256">, + def int_x86_avx_vtestc_ps_256 : ClangBuiltin<"__builtin_ia32_vtestcps256">, Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, 
llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_vtestnzc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestnzcps256">, + def int_x86_avx_vtestnzc_ps_256 : ClangBuiltin<"__builtin_ia32_vtestnzcps256">, Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_ptestz_256 : GCCBuiltin<"__builtin_ia32_ptestz256">, + def int_x86_avx_ptestz_256 : ClangBuiltin<"__builtin_ia32_ptestz256">, Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx_ptestc_256 : GCCBuiltin<"__builtin_ia32_ptestc256">, + def int_x86_avx_ptestc_256 : ClangBuiltin<"__builtin_ia32_ptestc256">, Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx_ptestnzc_256 : GCCBuiltin<"__builtin_ia32_ptestnzc256">, + def int_x86_avx_ptestnzc_256 : ClangBuiltin<"__builtin_ia32_ptestnzc256">, Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; @@ -1254,67 +1264,67 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fpclass_sd : - GCCBuiltin<"__builtin_ia32_fpclasssd_mask">, + ClangBuiltin<"__builtin_ia32_fpclasssd_mask">, Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fpclass_ss : - GCCBuiltin<"__builtin_ia32_fpclassss_mask">, + ClangBuiltin<"__builtin_ia32_fpclassss_mask">, Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } // Vector extract sign mask let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_movmsk_pd_256 : GCCBuiltin<"__builtin_ia32_movmskpd256">, + def int_x86_avx_movmsk_pd_256 : ClangBuiltin<"__builtin_ia32_movmskpd256">, Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_avx_movmsk_ps_256 : GCCBuiltin<"__builtin_ia32_movmskps256">, + def int_x86_avx_movmsk_ps_256 : ClangBuiltin<"__builtin_ia32_movmskps256">, Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; } // Vector zero let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vzeroall : GCCBuiltin<"__builtin_ia32_vzeroall">, + def int_x86_avx_vzeroall : ClangBuiltin<"__builtin_ia32_vzeroall">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; - def int_x86_avx_vzeroupper : GCCBuiltin<"__builtin_ia32_vzeroupper">, + def int_x86_avx_vzeroupper : ClangBuiltin<"__builtin_ia32_vzeroupper">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; } // SIMD load ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_ldu_dq_256 : GCCBuiltin<"__builtin_ia32_lddqu256">, + def int_x86_avx_ldu_dq_256 : ClangBuiltin<"__builtin_ia32_lddqu256">, Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>; } // Conditional load ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
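(The vtest/ptest and movmsk defs above return scalar flags or sign masks, so they feed branches directly. A sketch with the usual Clang wrappers, assuming -mavx, not part of the patch:

    #include <immintrin.h>

    /* Nonzero result iff v has no bits set (llvm.x86.avx.ptestz.256). */
    int all_zero(__m256i v) {
      return _mm256_testz_si256(v, v);
    }

    /* One bit per double's sign bit (llvm.x86.avx.movmsk.pd.256). */
    int sign_mask(__m256d v) {
      return _mm256_movemask_pd(v);
    }
)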
- def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">, + def int_x86_avx_maskload_pd : ClangBuiltin<"__builtin_ia32_maskloadpd">, Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2i64_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx_maskload_ps : GCCBuiltin<"__builtin_ia32_maskloadps">, + def int_x86_avx_maskload_ps : ClangBuiltin<"__builtin_ia32_maskloadps">, Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4i32_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx_maskload_pd_256 : GCCBuiltin<"__builtin_ia32_maskloadpd256">, + def int_x86_avx_maskload_pd_256 : ClangBuiltin<"__builtin_ia32_maskloadpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">, + def int_x86_avx_maskload_ps_256 : ClangBuiltin<"__builtin_ia32_maskloadps256">, Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty], [IntrReadMem, IntrArgMemOnly]>; } // Conditional store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">, + def int_x86_avx_maskstore_pd : ClangBuiltin<"__builtin_ia32_maskstorepd">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty], [IntrArgMemOnly]>; - def int_x86_avx_maskstore_ps : GCCBuiltin<"__builtin_ia32_maskstoreps">, + def int_x86_avx_maskstore_ps : ClangBuiltin<"__builtin_ia32_maskstoreps">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty], [IntrArgMemOnly]>; def int_x86_avx_maskstore_pd_256 : - GCCBuiltin<"__builtin_ia32_maskstorepd256">, + ClangBuiltin<"__builtin_ia32_maskstorepd256">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty], [IntrArgMemOnly]>; def int_x86_avx_maskstore_ps_256 : - GCCBuiltin<"__builtin_ia32_maskstoreps256">, + ClangBuiltin<"__builtin_ia32_maskstoreps256">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty], [IntrArgMemOnly]>; } @@ -1334,229 +1344,229 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">, + def int_x86_avx2_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">, + def int_x86_avx2_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">, + def int_x86_avx2_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">, + def int_x86_avx2_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">, + def int_x86_avx2_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">, + def int_x86_avx2_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw256">, Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; } // Integer shift ops. 
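(The conditional load/store defs above touch only the lanes whose mask element has its sign bit set, which is why the loads carry IntrReadMem plus IntrArgMemOnly and the stores carry IntrArgMemOnly rather than IntrNoMem. A tail-handling sketch, assuming -mavx, not part of the patch:

    #include <immintrin.h>

    /* Scale up to four doubles in place, skipping masked-off lanes. */
    void scale_tail(double *p, __m256i mask, double s) {
      __m256d v = _mm256_maskload_pd(p, mask);
      _mm256_maskstore_pd(p, mask, _mm256_mul_pd(v, _mm256_set1_pd(s)));
    }
)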
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_psll_w : GCCBuiltin<"__builtin_ia32_psllw256">, + def int_x86_avx2_psll_w : ClangBuiltin<"__builtin_ia32_psllw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_psll_d : GCCBuiltin<"__builtin_ia32_pslld256">, + def int_x86_avx2_psll_d : ClangBuiltin<"__builtin_ia32_pslld256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_psll_q : GCCBuiltin<"__builtin_ia32_psllq256">, + def int_x86_avx2_psll_q : ClangBuiltin<"__builtin_ia32_psllq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw256">, + def int_x86_avx2_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld256">, + def int_x86_avx2_psrl_d : ClangBuiltin<"__builtin_ia32_psrld256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq256">, + def int_x86_avx2_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_psra_w : GCCBuiltin<"__builtin_ia32_psraw256">, + def int_x86_avx2_psra_w : ClangBuiltin<"__builtin_ia32_psraw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_psra_d : GCCBuiltin<"__builtin_ia32_psrad256">, + def int_x86_avx2_psra_d : ClangBuiltin<"__builtin_ia32_psrad256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
- def int_x86_avx2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi256">, + def int_x86_avx2_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi256">, + def int_x86_avx2_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi256">, + def int_x86_avx2_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi256">, + def int_x86_avx2_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi256">, + def int_x86_avx2_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi256">, + def int_x86_avx2_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi256">, + def int_x86_avx2_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi256">, + def int_x86_avx2_psrai_d : ClangBuiltin<"__builtin_ia32_psradi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psra_q_128 : GCCBuiltin<"__builtin_ia32_psraq128">, + def int_x86_avx512_psra_q_128 : ClangBuiltin<"__builtin_ia32_psraq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx512_psra_q_256 : GCCBuiltin<"__builtin_ia32_psraq256">, + def int_x86_avx512_psra_q_256 : ClangBuiltin<"__builtin_ia32_psraq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
- def int_x86_avx512_psrai_q_128 : GCCBuiltin<"__builtin_ia32_psraqi128">, + def int_x86_avx512_psrai_q_128 : ClangBuiltin<"__builtin_ia32_psraqi128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrai_q_256 : GCCBuiltin<"__builtin_ia32_psraqi256">, + def int_x86_avx512_psrai_q_256 : ClangBuiltin<"__builtin_ia32_psraqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psll_w_512 : GCCBuiltin<"__builtin_ia32_psllw512">, + def int_x86_avx512_psll_w_512 : ClangBuiltin<"__builtin_ia32_psllw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psll_d_512 : GCCBuiltin<"__builtin_ia32_pslld512">, + def int_x86_avx512_psll_d_512 : ClangBuiltin<"__builtin_ia32_pslld512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx512_psll_q_512 : GCCBuiltin<"__builtin_ia32_psllq512">, + def int_x86_avx512_psll_q_512 : ClangBuiltin<"__builtin_ia32_psllq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx512_psrl_w_512 : GCCBuiltin<"__builtin_ia32_psrlw512">, + def int_x86_avx512_psrl_w_512 : ClangBuiltin<"__builtin_ia32_psrlw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrl_d_512 : GCCBuiltin<"__builtin_ia32_psrld512">, + def int_x86_avx512_psrl_d_512 : ClangBuiltin<"__builtin_ia32_psrld512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrl_q_512 : GCCBuiltin<"__builtin_ia32_psrlq512">, + def int_x86_avx512_psrl_q_512 : ClangBuiltin<"__builtin_ia32_psrlq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx512_psra_w_512 : GCCBuiltin<"__builtin_ia32_psraw512">, + def int_x86_avx512_psra_w_512 : ClangBuiltin<"__builtin_ia32_psraw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psra_d_512 : GCCBuiltin<"__builtin_ia32_psrad512">, + def int_x86_avx512_psra_d_512 : ClangBuiltin<"__builtin_ia32_psrad512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx512_psra_q_512 : GCCBuiltin<"__builtin_ia32_psraq512">, + def int_x86_avx512_psra_q_512 : ClangBuiltin<"__builtin_ia32_psraq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
- def int_x86_avx512_pslli_w_512 : GCCBuiltin<"__builtin_ia32_psllwi512">, + def int_x86_avx512_pslli_w_512 : ClangBuiltin<"__builtin_ia32_psllwi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_pslli_d_512 : GCCBuiltin<"__builtin_ia32_pslldi512">, + def int_x86_avx512_pslli_d_512 : ClangBuiltin<"__builtin_ia32_pslldi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_pslli_q_512 : GCCBuiltin<"__builtin_ia32_psllqi512">, + def int_x86_avx512_pslli_q_512 : ClangBuiltin<"__builtin_ia32_psllqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrli_w_512 : GCCBuiltin<"__builtin_ia32_psrlwi512">, + def int_x86_avx512_psrli_w_512 : ClangBuiltin<"__builtin_ia32_psrlwi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrli_d_512 : GCCBuiltin<"__builtin_ia32_psrldi512">, + def int_x86_avx512_psrli_d_512 : ClangBuiltin<"__builtin_ia32_psrldi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrli_q_512 : GCCBuiltin<"__builtin_ia32_psrlqi512">, + def int_x86_avx512_psrli_q_512 : ClangBuiltin<"__builtin_ia32_psrlqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrai_w_512 : GCCBuiltin<"__builtin_ia32_psrawi512">, + def int_x86_avx512_psrai_w_512 : ClangBuiltin<"__builtin_ia32_psrawi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrai_d_512 : GCCBuiltin<"__builtin_ia32_psradi512">, + def int_x86_avx512_psrai_d_512 : ClangBuiltin<"__builtin_ia32_psradi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrai_q_512 : GCCBuiltin<"__builtin_ia32_psraqi512">, + def int_x86_avx512_psrai_q_512 : ClangBuiltin<"__builtin_ia32_psraqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_128: - GCCBuiltin<"__builtin_ia32_vpmultishiftqb128">, + ClangBuiltin<"__builtin_ia32_vpmultishiftqb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_256: - GCCBuiltin<"__builtin_ia32_vpmultishiftqb256">, + ClangBuiltin<"__builtin_ia32_vpmultishiftqb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_512: - GCCBuiltin<"__builtin_ia32_vpmultishiftqb512">, + ClangBuiltin<"__builtin_ia32_vpmultishiftqb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; } // Pack ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
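(The repeated note about gcc compatibility above is why the pslli/psrli/psrai defs take llvm_i32_ty with no ImmArg: those builtins historically accept a non-constant shift count, so the operand cannot be required to be an immediate. A sketch showing a runtime count, assuming -mavx2, not part of the patch:

    #include <immintrin.h>

    /* count may be a runtime value; llvm.x86.avx2.pslli.d permits it. */
    __m256i shl_all(__m256i v, int count) {
      return _mm256_slli_epi32(v, count);
    }
)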
- def int_x86_avx2_packsswb : GCCBuiltin<"__builtin_ia32_packsswb256">, + def int_x86_avx2_packsswb : ClangBuiltin<"__builtin_ia32_packsswb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_packssdw : GCCBuiltin<"__builtin_ia32_packssdw256">, + def int_x86_avx2_packssdw : ClangBuiltin<"__builtin_ia32_packssdw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">, + def int_x86_avx2_packuswb : ClangBuiltin<"__builtin_ia32_packuswb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">, + def int_x86_avx2_packusdw : ClangBuiltin<"__builtin_ia32_packusdw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Horizontal arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">, + def int_x86_avx2_phadd_w : ClangBuiltin<"__builtin_ia32_phaddw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd256">, + def int_x86_avx2_phadd_d : ClangBuiltin<"__builtin_ia32_phaddd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw256">, + def int_x86_avx2_phadd_sw : ClangBuiltin<"__builtin_ia32_phaddsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw256">, + def int_x86_avx2_phsub_w : ClangBuiltin<"__builtin_ia32_phsubw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd256">, + def int_x86_avx2_phsub_d : ClangBuiltin<"__builtin_ia32_phsubd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw256">, + def int_x86_avx2_phsub_sw : ClangBuiltin<"__builtin_ia32_phsubsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw256">, + def int_x86_avx2_pmadd_ub_sw : ClangBuiltin<"__builtin_ia32_pmaddubsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; } // Sign ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_psign_b : GCCBuiltin<"__builtin_ia32_psignb256">, + def int_x86_avx2_psign_b : ClangBuiltin<"__builtin_ia32_psignb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psign_w : GCCBuiltin<"__builtin_ia32_psignw256">, + def int_x86_avx2_psign_w : ClangBuiltin<"__builtin_ia32_psignw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_psign_d : GCCBuiltin<"__builtin_ia32_psignd256">, + def int_x86_avx2_psign_d : ClangBuiltin<"__builtin_ia32_psignd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Packed multiply high with round and scale let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
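(The pack and horizontal-arithmetic defs above operate within each 128-bit lane of the 256-bit vector, the usual AVX2 caveat. A sketch, assuming -mavx2, not part of the patch:

    #include <immintrin.h>

    /* Pairwise 16-bit sums of a then b, independently per 128-bit lane
       (llvm.x86.avx2.phadd.w). */
    __m256i pair_sums(__m256i a, __m256i b) {
      return _mm256_hadd_epi16(a, b);
    }
)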
- def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">, + def int_x86_avx2_pmul_hr_sw : ClangBuiltin<"__builtin_ia32_pmulhrsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx512_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512">, + def int_x86_avx512_pmul_hr_sw_512 : ClangBuiltin<"__builtin_ia32_pmulhrsw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; } // Vector blend let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">, + def int_x86_avx2_pblendvb : ClangBuiltin<"__builtin_ia32_pblendvb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; } @@ -1564,137 +1574,137 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector permutation let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_permd : GCCBuiltin<"__builtin_ia32_permvarsi256">, + def int_x86_avx2_permd : ClangBuiltin<"__builtin_ia32_permvarsi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">, + def int_x86_avx2_permps : ClangBuiltin<"__builtin_ia32_permvarsf256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Conditional load ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">, + def int_x86_avx2_maskload_d : ClangBuiltin<"__builtin_ia32_maskloadd">, Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx2_maskload_q : GCCBuiltin<"__builtin_ia32_maskloadq">, + def int_x86_avx2_maskload_q : ClangBuiltin<"__builtin_ia32_maskloadq">, Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx2_maskload_d_256 : GCCBuiltin<"__builtin_ia32_maskloadd256">, + def int_x86_avx2_maskload_d_256 : ClangBuiltin<"__builtin_ia32_maskloadd256">, Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty], [IntrReadMem, IntrArgMemOnly]>; - def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">, + def int_x86_avx2_maskload_q_256 : ClangBuiltin<"__builtin_ia32_maskloadq256">, Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadMem, IntrArgMemOnly]>; } // Conditional store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
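(pmul.hr.sw above is the rounded Q15 fixed-point multiply: per 16-bit lane it computes (a*b + 2^14) >> 15. A sketch, assuming -mavx2, not part of the patch:

    #include <immintrin.h>

    /* Fixed-point Q15 product with rounding (llvm.x86.avx2.pmul.hr.sw). */
    __m256i q15_mul(__m256i a, __m256i b) {
      return _mm256_mulhrs_epi16(a, b);
    }
)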
- def int_x86_avx2_maskstore_d : GCCBuiltin<"__builtin_ia32_maskstored">, + def int_x86_avx2_maskstore_d : ClangBuiltin<"__builtin_ia32_maskstored">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrArgMemOnly]>; - def int_x86_avx2_maskstore_q : GCCBuiltin<"__builtin_ia32_maskstoreq">, + def int_x86_avx2_maskstore_q : ClangBuiltin<"__builtin_ia32_maskstoreq">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrArgMemOnly]>; def int_x86_avx2_maskstore_d_256 : - GCCBuiltin<"__builtin_ia32_maskstored256">, + ClangBuiltin<"__builtin_ia32_maskstored256">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrArgMemOnly]>; def int_x86_avx2_maskstore_q_256 : - GCCBuiltin<"__builtin_ia32_maskstoreq256">, + ClangBuiltin<"__builtin_ia32_maskstoreq256">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrArgMemOnly]>; } // Variable bit shift ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_psllv_d : GCCBuiltin<"__builtin_ia32_psllv4si">, + def int_x86_avx2_psllv_d : ClangBuiltin<"__builtin_ia32_psllv4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_psllv_d_256 : GCCBuiltin<"__builtin_ia32_psllv8si">, + def int_x86_avx2_psllv_d_256 : ClangBuiltin<"__builtin_ia32_psllv8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_psllv_q : GCCBuiltin<"__builtin_ia32_psllv2di">, + def int_x86_avx2_psllv_q : ClangBuiltin<"__builtin_ia32_psllv2di">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_psllv_q_256 : GCCBuiltin<"__builtin_ia32_psllv4di">, + def int_x86_avx2_psllv_q_256 : ClangBuiltin<"__builtin_ia32_psllv4di">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx512_psllv_d_512 : GCCBuiltin<"__builtin_ia32_psllv16si">, + def int_x86_avx512_psllv_d_512 : ClangBuiltin<"__builtin_ia32_psllv16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; - def int_x86_avx512_psllv_q_512 : GCCBuiltin<"__builtin_ia32_psllv8di">, + def int_x86_avx512_psllv_q_512 : ClangBuiltin<"__builtin_ia32_psllv8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; - def int_x86_avx2_psrlv_d : GCCBuiltin<"__builtin_ia32_psrlv4si">, + def int_x86_avx2_psrlv_d : ClangBuiltin<"__builtin_ia32_psrlv4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrlv_d_256 : GCCBuiltin<"__builtin_ia32_psrlv8si">, + def int_x86_avx2_psrlv_d_256 : ClangBuiltin<"__builtin_ia32_psrlv8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrlv_q : GCCBuiltin<"__builtin_ia32_psrlv2di">, + def int_x86_avx2_psrlv_q : ClangBuiltin<"__builtin_ia32_psrlv2di">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_psrlv_q_256 : GCCBuiltin<"__builtin_ia32_psrlv4di">, + def int_x86_avx2_psrlv_q_256 : ClangBuiltin<"__builtin_ia32_psrlv4di">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx512_psrlv_d_512 : GCCBuiltin<"__builtin_ia32_psrlv16si">, + def int_x86_avx512_psrlv_d_512 : ClangBuiltin<"__builtin_ia32_psrlv16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrlv_q_512 : GCCBuiltin<"__builtin_ia32_psrlv8di">, + def int_x86_avx512_psrlv_q_512 : ClangBuiltin<"__builtin_ia32_psrlv8di">, Intrinsic<[llvm_v8i64_ty], 
[llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; - def int_x86_avx2_psrav_d : GCCBuiltin<"__builtin_ia32_psrav4si">, + def int_x86_avx2_psrav_d : ClangBuiltin<"__builtin_ia32_psrav4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_psrav_d_256 : GCCBuiltin<"__builtin_ia32_psrav8si">, + def int_x86_avx2_psrav_d_256 : ClangBuiltin<"__builtin_ia32_psrav8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_d_512 : GCCBuiltin<"__builtin_ia32_psrav16si">, + def int_x86_avx512_psrav_d_512 : ClangBuiltin<"__builtin_ia32_psrav16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_q_128 : GCCBuiltin<"__builtin_ia32_psravq128">, + def int_x86_avx512_psrav_q_128 : ClangBuiltin<"__builtin_ia32_psravq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_q_256 : GCCBuiltin<"__builtin_ia32_psravq256">, + def int_x86_avx512_psrav_q_256 : ClangBuiltin<"__builtin_ia32_psravq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_q_512 : GCCBuiltin<"__builtin_ia32_psrav8di">, + def int_x86_avx512_psrav_q_512 : ClangBuiltin<"__builtin_ia32_psrav8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; - def int_x86_avx512_psllv_w_128 : GCCBuiltin<"__builtin_ia32_psllv8hi">, + def int_x86_avx512_psllv_w_128 : ClangBuiltin<"__builtin_ia32_psllv8hi">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psllv_w_256 : GCCBuiltin<"__builtin_ia32_psllv16hi">, + def int_x86_avx512_psllv_w_256 : ClangBuiltin<"__builtin_ia32_psllv16hi">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx512_psllv_w_512 : GCCBuiltin<"__builtin_ia32_psllv32hi">, + def int_x86_avx512_psllv_w_512 : ClangBuiltin<"__builtin_ia32_psllv32hi">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrlv_w_128 : GCCBuiltin<"__builtin_ia32_psrlv8hi">, + def int_x86_avx512_psrlv_w_128 : ClangBuiltin<"__builtin_ia32_psrlv8hi">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrlv_w_256 : GCCBuiltin<"__builtin_ia32_psrlv16hi">, + def int_x86_avx512_psrlv_w_256 : ClangBuiltin<"__builtin_ia32_psrlv16hi">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrlv_w_512 : GCCBuiltin<"__builtin_ia32_psrlv32hi">, + def int_x86_avx512_psrlv_w_512 : ClangBuiltin<"__builtin_ia32_psrlv32hi">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_w_128 : GCCBuiltin<"__builtin_ia32_psrav8hi">, + def int_x86_avx512_psrav_w_128 : ClangBuiltin<"__builtin_ia32_psrav8hi">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_w_256 : GCCBuiltin<"__builtin_ia32_psrav16hi">, + def int_x86_avx512_psrav_w_256 : ClangBuiltin<"__builtin_ia32_psrav16hi">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx512_psrav_w_512 : GCCBuiltin<"__builtin_ia32_psrav32hi">, + def int_x86_avx512_psrav_w_512 : ClangBuiltin<"__builtin_ia32_psrav32hi">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; } @@ -1703,68 +1713,68 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
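(Unlike the shift-by-scalar ops earlier, the psllv/psrlv/psrav defs above take a full vector of per-lane shift counts. A sketch, assuming -mavx2, not part of the patch:

    #include <immintrin.h>

    /* Independent left shift per 32-bit lane; counts >= 32 yield zero
       (llvm.x86.avx2.psllv.d.256). */
    __m256i shl_lanes(__m256i v, __m256i counts) {
      return _mm256_sllv_epi32(v, counts);
    }
)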
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. - def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">, + def int_x86_avx2_gather_d_pd : ClangBuiltin<"__builtin_ia32_gatherd_pd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">, + def int_x86_avx2_gather_d_pd_256 : ClangBuiltin<"__builtin_ia32_gatherd_pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">, + def int_x86_avx2_gather_q_pd : ClangBuiltin<"__builtin_ia32_gatherq_pd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">, + def int_x86_avx2_gather_q_pd_256 : ClangBuiltin<"__builtin_ia32_gatherq_pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">, + def int_x86_avx2_gather_d_ps : ClangBuiltin<"__builtin_ia32_gatherd_ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">, + def int_x86_avx2_gather_d_ps_256 : ClangBuiltin<"__builtin_ia32_gatherd_ps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">, + def int_x86_avx2_gather_q_ps : ClangBuiltin<"__builtin_ia32_gatherq_ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">, + def int_x86_avx2_gather_q_ps_256 : ClangBuiltin<"__builtin_ia32_gatherq_ps256">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_q : GCCBuiltin<"__builtin_ia32_gatherd_q">, + def int_x86_avx2_gather_d_q : ClangBuiltin<"__builtin_ia32_gatherd_q">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_q_256 : GCCBuiltin<"__builtin_ia32_gatherd_q256">, + def int_x86_avx2_gather_d_q_256 : ClangBuiltin<"__builtin_ia32_gatherd_q256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_q : GCCBuiltin<"__builtin_ia32_gatherq_q">, + def int_x86_avx2_gather_q_q : ClangBuiltin<"__builtin_ia32_gatherq_q">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_q_256 : GCCBuiltin<"__builtin_ia32_gatherq_q256">, + def int_x86_avx2_gather_q_q_256 : ClangBuiltin<"__builtin_ia32_gatherq_q256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def 
int_x86_avx2_gather_d_d : GCCBuiltin<"__builtin_ia32_gatherd_d">, + def int_x86_avx2_gather_d_d : ClangBuiltin<"__builtin_ia32_gatherd_d">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">, + def int_x86_avx2_gather_d_d_256 : ClangBuiltin<"__builtin_ia32_gatherd_d256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_d : GCCBuiltin<"__builtin_ia32_gatherq_d">, + def int_x86_avx2_gather_q_d : ClangBuiltin<"__builtin_ia32_gatherq_d">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; - def int_x86_avx2_gather_q_d_256 : GCCBuiltin<"__builtin_ia32_gatherq_d256">, + def int_x86_avx2_gather_q_d_256 : ClangBuiltin<"__builtin_ia32_gatherq_d256">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadMem, ImmArg>]>; @@ -1772,12 +1782,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">, + def int_x86_avx2_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb256">, Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">, + def int_x86_avx2_pshuf_b : ClangBuiltin<"__builtin_ia32_pshufb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">, + def int_x86_avx2_mpsadbw : ClangBuiltin<"__builtin_ia32_mpsadbw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } @@ -1786,21 +1796,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // FMA3 and FMA4 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">, + def int_x86_fma_vfmaddsub_ps : ClangBuiltin<"__builtin_ia32_vfmaddsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_fma_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">, + def int_x86_fma_vfmaddsub_pd : ClangBuiltin<"__builtin_ia32_vfmaddsubpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_fma_vfmaddsub_ps_256 : - GCCBuiltin<"__builtin_ia32_vfmaddsubps256">, + ClangBuiltin<"__builtin_ia32_vfmaddsubps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_fma_vfmaddsub_pd_256 : - GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">, + ClangBuiltin<"__builtin_ia32_vfmaddsubpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; @@ -1835,27 +1845,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
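(The gather defs above take a base pointer, an index vector, a merge source, a mask, and a byte scale; per the NOTE they can only be IntrReadMem, not ArgMemOnly, because the indices can reach arbitrary memory. A sketch with the usual Clang wrapper, assuming -mavx2, not part of the patch:

    #include <immintrin.h>

    /* Loads base[idx[i]] where mask lane i's sign bit is set, else keeps
       src lane i; the scale (element stride in bytes) must be a constant,
       which is the ImmArg operand in the defs above. */
    __m256d gather4(const double *base, __m128i idx,
                    __m256d src, __m256d mask) {
      return _mm256_mask_i32gather_pd(src, base, idx, mask, 8);
    }
)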
[IntrNoMem, ImmArg>]>; def int_x86_avx512_vpmadd52h_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq128">, + ClangBuiltin<"__builtin_ia32_vpmadd52huq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52l_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq128">, + ClangBuiltin<"__builtin_ia32_vpmadd52luq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52h_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq256">, + ClangBuiltin<"__builtin_ia32_vpmadd52huq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52l_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq256">, + ClangBuiltin<"__builtin_ia32_vpmadd52luq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52h_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq512">, + ClangBuiltin<"__builtin_ia32_vpmadd52huq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52l_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq512">, + ClangBuiltin<"__builtin_ia32_vpmadd52luq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; } @@ -1863,54 +1873,54 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // VNNI let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vpdpbusd_128 : - GCCBuiltin<"__builtin_ia32_vpdpbusd128">, + ClangBuiltin<"__builtin_ia32_vpdpbusd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_256 : - GCCBuiltin<"__builtin_ia32_vpdpbusd256">, + ClangBuiltin<"__builtin_ia32_vpdpbusd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_512 : - GCCBuiltin<"__builtin_ia32_vpdpbusd512">, + ClangBuiltin<"__builtin_ia32_vpdpbusd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_128 : - GCCBuiltin<"__builtin_ia32_vpdpbusds128">, + ClangBuiltin<"__builtin_ia32_vpdpbusds128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_256 : - GCCBuiltin<"__builtin_ia32_vpdpbusds256">, + ClangBuiltin<"__builtin_ia32_vpdpbusds256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_512 : - GCCBuiltin<"__builtin_ia32_vpdpbusds512">, + ClangBuiltin<"__builtin_ia32_vpdpbusds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_128 : - GCCBuiltin<"__builtin_ia32_vpdpwssd128">, + ClangBuiltin<"__builtin_ia32_vpdpwssd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_256 : - GCCBuiltin<"__builtin_ia32_vpdpwssd256">, + ClangBuiltin<"__builtin_ia32_vpdpwssd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_512 : - GCCBuiltin<"__builtin_ia32_vpdpwssd512">, + ClangBuiltin<"__builtin_ia32_vpdpwssd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_128 : - GCCBuiltin<"__builtin_ia32_vpdpwssds128">, + 
ClangBuiltin<"__builtin_ia32_vpdpwssds128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_256 : - GCCBuiltin<"__builtin_ia32_vpdpwssds256">, + ClangBuiltin<"__builtin_ia32_vpdpwssds256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_512 : - GCCBuiltin<"__builtin_ia32_vpdpwssds512">, + ClangBuiltin<"__builtin_ia32_vpdpwssds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } @@ -1919,180 +1929,180 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // XOP let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">, + def int_x86_xop_vpermil2pd : ClangBuiltin<"__builtin_ia32_vpermil2pd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_xop_vpermil2pd_256 : - GCCBuiltin<"__builtin_ia32_vpermil2pd256">, + ClangBuiltin<"__builtin_ia32_vpermil2pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">, + def int_x86_xop_vpermil2ps : ClangBuiltin<"__builtin_ia32_vpermil2ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_xop_vpermil2ps_256 : - GCCBuiltin<"__builtin_ia32_vpermil2ps256">, + ClangBuiltin<"__builtin_ia32_vpermil2ps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">, + def int_x86_xop_vfrcz_pd : ClangBuiltin<"__builtin_ia32_vfrczpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_xop_vfrcz_ps : GCCBuiltin<"__builtin_ia32_vfrczps">, + def int_x86_xop_vfrcz_ps : ClangBuiltin<"__builtin_ia32_vfrczps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_xop_vfrcz_sd : GCCBuiltin<"__builtin_ia32_vfrczsd">, + def int_x86_xop_vfrcz_sd : ClangBuiltin<"__builtin_ia32_vfrczsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_xop_vfrcz_ss : GCCBuiltin<"__builtin_ia32_vfrczss">, + def int_x86_xop_vfrcz_ss : ClangBuiltin<"__builtin_ia32_vfrczss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_xop_vfrcz_pd_256 : GCCBuiltin<"__builtin_ia32_vfrczpd256">, + def int_x86_xop_vfrcz_pd_256 : ClangBuiltin<"__builtin_ia32_vfrczpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; - def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">, + def int_x86_xop_vfrcz_ps_256 : ClangBuiltin<"__builtin_ia32_vfrczps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; def int_x86_xop_vphaddbd : - GCCBuiltin<"__builtin_ia32_vphaddbd">, + ClangBuiltin<"__builtin_ia32_vphaddbd">, Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vphaddbq : - GCCBuiltin<"__builtin_ia32_vphaddbq">, + ClangBuiltin<"__builtin_ia32_vphaddbq">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vphaddbw : - GCCBuiltin<"__builtin_ia32_vphaddbw">, + ClangBuiltin<"__builtin_ia32_vphaddbw">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vphadddq : - GCCBuiltin<"__builtin_ia32_vphadddq">, + ClangBuiltin<"__builtin_ia32_vphadddq">, 
       Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddubd :
-      GCCBuiltin<"__builtin_ia32_vphaddubd">,
+      ClangBuiltin<"__builtin_ia32_vphaddubd">,
       Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddubq :
-      GCCBuiltin<"__builtin_ia32_vphaddubq">,
+      ClangBuiltin<"__builtin_ia32_vphaddubq">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddubw :
-      GCCBuiltin<"__builtin_ia32_vphaddubw">,
+      ClangBuiltin<"__builtin_ia32_vphaddubw">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddudq :
-      GCCBuiltin<"__builtin_ia32_vphaddudq">,
+      ClangBuiltin<"__builtin_ia32_vphaddudq">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
   def int_x86_xop_vphadduwd :
-      GCCBuiltin<"__builtin_ia32_vphadduwd">,
+      ClangBuiltin<"__builtin_ia32_vphadduwd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_xop_vphadduwq :
-      GCCBuiltin<"__builtin_ia32_vphadduwq">,
+      ClangBuiltin<"__builtin_ia32_vphadduwq">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddwd :
-      GCCBuiltin<"__builtin_ia32_vphaddwd">,
+      ClangBuiltin<"__builtin_ia32_vphaddwd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_xop_vphaddwq :
-      GCCBuiltin<"__builtin_ia32_vphaddwq">,
+      ClangBuiltin<"__builtin_ia32_vphaddwq">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_xop_vphsubbw :
-      GCCBuiltin<"__builtin_ia32_vphsubbw">,
+      ClangBuiltin<"__builtin_ia32_vphsubbw">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_xop_vphsubdq :
-      GCCBuiltin<"__builtin_ia32_vphsubdq">,
+      ClangBuiltin<"__builtin_ia32_vphsubdq">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
   def int_x86_xop_vphsubwd :
-      GCCBuiltin<"__builtin_ia32_vphsubwd">,
+      ClangBuiltin<"__builtin_ia32_vphsubwd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_xop_vpmacsdd :
-      GCCBuiltin<"__builtin_ia32_vpmacsdd">,
+      ClangBuiltin<"__builtin_ia32_vpmacsdd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacsdqh :
-      GCCBuiltin<"__builtin_ia32_vpmacsdqh">,
+      ClangBuiltin<"__builtin_ia32_vpmacsdqh">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacsdql :
-      GCCBuiltin<"__builtin_ia32_vpmacsdql">,
+      ClangBuiltin<"__builtin_ia32_vpmacsdql">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacssdd :
-      GCCBuiltin<"__builtin_ia32_vpmacssdd">,
+      ClangBuiltin<"__builtin_ia32_vpmacssdd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacssdqh :
-      GCCBuiltin<"__builtin_ia32_vpmacssdqh">,
+      ClangBuiltin<"__builtin_ia32_vpmacssdqh">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacssdql :
-      GCCBuiltin<"__builtin_ia32_vpmacssdql">,
+      ClangBuiltin<"__builtin_ia32_vpmacssdql">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacsswd :
-      GCCBuiltin<"__builtin_ia32_vpmacsswd">,
+      ClangBuiltin<"__builtin_ia32_vpmacsswd">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_xop_vpmacssww :
-      GCCBuiltin<"__builtin_ia32_vpmacssww">,
+      ClangBuiltin<"__builtin_ia32_vpmacssww">,
ClangBuiltin<"__builtin_ia32_vpmacssww">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacswd : - GCCBuiltin<"__builtin_ia32_vpmacswd">, + ClangBuiltin<"__builtin_ia32_vpmacswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacsww : - GCCBuiltin<"__builtin_ia32_vpmacsww">, + ClangBuiltin<"__builtin_ia32_vpmacsww">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmadcsswd : - GCCBuiltin<"__builtin_ia32_vpmadcsswd">, + ClangBuiltin<"__builtin_ia32_vpmadcsswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmadcswd : - GCCBuiltin<"__builtin_ia32_vpmadcswd">, + ClangBuiltin<"__builtin_ia32_vpmadcswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpperm : - GCCBuiltin<"__builtin_ia32_vpperm">, + ClangBuiltin<"__builtin_ia32_vpperm">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshab : - GCCBuiltin<"__builtin_ia32_vpshab">, + ClangBuiltin<"__builtin_ia32_vpshab">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshad : - GCCBuiltin<"__builtin_ia32_vpshad">, + ClangBuiltin<"__builtin_ia32_vpshad">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_xop_vpshaq : - GCCBuiltin<"__builtin_ia32_vpshaq">, + ClangBuiltin<"__builtin_ia32_vpshaq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_xop_vpshaw : - GCCBuiltin<"__builtin_ia32_vpshaw">, + ClangBuiltin<"__builtin_ia32_vpshaw">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_xop_vpshlb : - GCCBuiltin<"__builtin_ia32_vpshlb">, + ClangBuiltin<"__builtin_ia32_vpshlb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshld : - GCCBuiltin<"__builtin_ia32_vpshld">, + ClangBuiltin<"__builtin_ia32_vpshld">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_xop_vpshlq : - GCCBuiltin<"__builtin_ia32_vpshlq">, + ClangBuiltin<"__builtin_ia32_vpshlq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_xop_vpshlw : - GCCBuiltin<"__builtin_ia32_vpshlw">, + ClangBuiltin<"__builtin_ia32_vpshlw">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; } @@ -2101,25 +2111,25 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // LWP let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
   def int_x86_llwpcb :
-      GCCBuiltin<"__builtin_ia32_llwpcb">,
+      ClangBuiltin<"__builtin_ia32_llwpcb">,
      Intrinsic<[], [llvm_ptr_ty], []>;
   def int_x86_slwpcb :
-      GCCBuiltin<"__builtin_ia32_slwpcb">,
+      ClangBuiltin<"__builtin_ia32_slwpcb">,
      Intrinsic<[llvm_ptr_ty], [], []>;
   def int_x86_lwpins32 :
-      GCCBuiltin<"__builtin_ia32_lwpins32">,
+      ClangBuiltin<"__builtin_ia32_lwpins32">,
      Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpins64 :
-      GCCBuiltin<"__builtin_ia32_lwpins64">,
+      ClangBuiltin<"__builtin_ia32_lwpins64">,
      Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpval32 :
-      GCCBuiltin<"__builtin_ia32_lwpval32">,
+      ClangBuiltin<"__builtin_ia32_lwpval32">,
      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpval64 :
-      GCCBuiltin<"__builtin_ia32_lwpval64">,
+      ClangBuiltin<"__builtin_ia32_lwpval64">,
      Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
 }
@@ -2129,127 +2139,127 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Empty MMX state op.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_emms : GCCBuiltin<"__builtin_ia32_emms">,
+  def int_x86_mmx_emms : ClangBuiltin<"__builtin_ia32_emms">,
      Intrinsic<[], [], []>;
-  def int_x86_mmx_femms : GCCBuiltin<"__builtin_ia32_femms">,
+  def int_x86_mmx_femms : ClangBuiltin<"__builtin_ia32_femms">,
      Intrinsic<[], [], []>;
 }

 // Integer arithmetic ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   // Addition
-  def int_x86_mmx_padd_b : GCCBuiltin<"__builtin_ia32_paddb">,
+  def int_x86_mmx_padd_b : ClangBuiltin<"__builtin_ia32_paddb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_w : GCCBuiltin<"__builtin_ia32_paddw">,
+  def int_x86_mmx_padd_w : ClangBuiltin<"__builtin_ia32_paddw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_d : GCCBuiltin<"__builtin_ia32_paddd">,
+  def int_x86_mmx_padd_d : ClangBuiltin<"__builtin_ia32_paddd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_q : GCCBuiltin<"__builtin_ia32_paddq">,
+  def int_x86_mmx_padd_q : ClangBuiltin<"__builtin_ia32_paddq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_b : GCCBuiltin<"__builtin_ia32_paddsb">,
+  def int_x86_mmx_padds_b : ClangBuiltin<"__builtin_ia32_paddsb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_w : GCCBuiltin<"__builtin_ia32_paddsw">,
+  def int_x86_mmx_padds_w : ClangBuiltin<"__builtin_ia32_paddsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb">,
+  def int_x86_mmx_paddus_b : ClangBuiltin<"__builtin_ia32_paddusb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw">,
+  def int_x86_mmx_paddus_w : ClangBuiltin<"__builtin_ia32_paddusw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

   // Subtraction
-  def int_x86_mmx_psub_b : GCCBuiltin<"__builtin_ia32_psubb">,
+  def int_x86_mmx_psub_b : ClangBuiltin<"__builtin_ia32_psubb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem]>;
-  def int_x86_mmx_psub_w : GCCBuiltin<"__builtin_ia32_psubw">,
+  def int_x86_mmx_psub_w : ClangBuiltin<"__builtin_ia32_psubw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psub_d : GCCBuiltin<"__builtin_ia32_psubd">,
+  def int_x86_mmx_psub_d : ClangBuiltin<"__builtin_ia32_psubd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psub_q : GCCBuiltin<"__builtin_ia32_psubq">,
+  def int_x86_mmx_psub_q : ClangBuiltin<"__builtin_ia32_psubq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb">,
+  def int_x86_mmx_psubs_b : ClangBuiltin<"__builtin_ia32_psubsb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw">,
+  def int_x86_mmx_psubs_w : ClangBuiltin<"__builtin_ia32_psubsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb">,
+  def int_x86_mmx_psubus_b : ClangBuiltin<"__builtin_ia32_psubusb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw">,
+  def int_x86_mmx_psubus_w : ClangBuiltin<"__builtin_ia32_psubusw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;

   // Multiplication
-  def int_x86_mmx_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw">,
+  def int_x86_mmx_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmull_w : GCCBuiltin<"__builtin_ia32_pmullw">,
+  def int_x86_mmx_pmull_w : ClangBuiltin<"__builtin_ia32_pmullw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw">,
+  def int_x86_mmx_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq">,
+  def int_x86_mmx_pmulu_dq : ClangBuiltin<"__builtin_ia32_pmuludq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd">,
+  def int_x86_mmx_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

   // Bitwise operations
-  def int_x86_mmx_pand : GCCBuiltin<"__builtin_ia32_pand">,
+  def int_x86_mmx_pand : ClangBuiltin<"__builtin_ia32_pand">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pandn : GCCBuiltin<"__builtin_ia32_pandn">,
+  def int_x86_mmx_pandn : ClangBuiltin<"__builtin_ia32_pandn">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_por : GCCBuiltin<"__builtin_ia32_por">,
+  def int_x86_mmx_por : ClangBuiltin<"__builtin_ia32_por">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pxor : GCCBuiltin<"__builtin_ia32_pxor">,
+  def int_x86_mmx_pxor : ClangBuiltin<"__builtin_ia32_pxor">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

   // Averages
-  def int_x86_mmx_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb">,
+  def int_x86_mmx_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb">,
: ClangBuiltin<"__builtin_ia32_pavgb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; - def int_x86_mmx_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw">, + def int_x86_mmx_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Maximum - def int_x86_mmx_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub">, + def int_x86_mmx_pmaxu_b : ClangBuiltin<"__builtin_ia32_pmaxub">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; - def int_x86_mmx_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw">, + def int_x86_mmx_pmaxs_w : ClangBuiltin<"__builtin_ia32_pmaxsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Minimum - def int_x86_mmx_pminu_b : GCCBuiltin<"__builtin_ia32_pminub">, + def int_x86_mmx_pminu_b : ClangBuiltin<"__builtin_ia32_pminub">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; - def int_x86_mmx_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw">, + def int_x86_mmx_pmins_w : ClangBuiltin<"__builtin_ia32_pminsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Packed sum of absolute differences - def int_x86_mmx_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw">, + def int_x86_mmx_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; } @@ -2257,178 +2267,178 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Integer shift ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Shift left logical - def int_x86_mmx_psll_w : GCCBuiltin<"__builtin_ia32_psllw">, + def int_x86_mmx_psll_w : ClangBuiltin<"__builtin_ia32_psllw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psll_d : GCCBuiltin<"__builtin_ia32_pslld">, + def int_x86_mmx_psll_d : ClangBuiltin<"__builtin_ia32_pslld">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psll_q : GCCBuiltin<"__builtin_ia32_psllq">, + def int_x86_mmx_psll_q : ClangBuiltin<"__builtin_ia32_psllq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw">, + def int_x86_mmx_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psrl_d : GCCBuiltin<"__builtin_ia32_psrld">, + def int_x86_mmx_psrl_d : ClangBuiltin<"__builtin_ia32_psrld">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq">, + def int_x86_mmx_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psra_w : GCCBuiltin<"__builtin_ia32_psraw">, + def int_x86_mmx_psra_w : ClangBuiltin<"__builtin_ia32_psraw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_mmx_psra_d : GCCBuiltin<"__builtin_ia32_psrad">, + def int_x86_mmx_psra_d : ClangBuiltin<"__builtin_ia32_psrad">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
-  def int_x86_mmx_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi">,
+  def int_x86_mmx_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi">,
+  def int_x86_mmx_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi">,
+  def int_x86_mmx_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi">,
+  def int_x86_mmx_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi">,
+  def int_x86_mmx_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi">,
+  def int_x86_mmx_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi">,
+  def int_x86_mmx_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrai_d : GCCBuiltin<"__builtin_ia32_psradi">,
+  def int_x86_mmx_psrai_d : ClangBuiltin<"__builtin_ia32_psradi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
 }

 // Permute
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_permvar_df_256 : GCCBuiltin<"__builtin_ia32_permvardf256">,
+  def int_x86_avx512_permvar_df_256 : ClangBuiltin<"__builtin_ia32_permvardf256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_df_512 : GCCBuiltin<"__builtin_ia32_permvardf512">,
+  def int_x86_avx512_permvar_df_512 : ClangBuiltin<"__builtin_ia32_permvardf512">,
      Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_di_256 : GCCBuiltin<"__builtin_ia32_permvardi256">,
+  def int_x86_avx512_permvar_di_256 : ClangBuiltin<"__builtin_ia32_permvardi256">,
      Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_di_512 : GCCBuiltin<"__builtin_ia32_permvardi512">,
+  def int_x86_avx512_permvar_di_512 : ClangBuiltin<"__builtin_ia32_permvardi512">,
      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_hi_128 : GCCBuiltin<"__builtin_ia32_permvarhi128">,
+  def int_x86_avx512_permvar_hi_128 : ClangBuiltin<"__builtin_ia32_permvarhi128">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_hi_256 : GCCBuiltin<"__builtin_ia32_permvarhi256">,
+  def int_x86_avx512_permvar_hi_256 : ClangBuiltin<"__builtin_ia32_permvarhi256">,
      Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_hi_512 : GCCBuiltin<"__builtin_ia32_permvarhi512">,
+  def int_x86_avx512_permvar_hi_512 : ClangBuiltin<"__builtin_ia32_permvarhi512">,
      Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_qi_128 : GCCBuiltin<"__builtin_ia32_permvarqi128">,
+  def int_x86_avx512_permvar_qi_128 : ClangBuiltin<"__builtin_ia32_permvarqi128">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_qi_256 : GCCBuiltin<"__builtin_ia32_permvarqi256">,
+  def int_x86_avx512_permvar_qi_256 : ClangBuiltin<"__builtin_ia32_permvarqi256">,
      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_qi_512 : GCCBuiltin<"__builtin_ia32_permvarqi512">,
+  def int_x86_avx512_permvar_qi_512 : ClangBuiltin<"__builtin_ia32_permvarqi512">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_sf_512 : GCCBuiltin<"__builtin_ia32_permvarsf512">,
+  def int_x86_avx512_permvar_sf_512 : ClangBuiltin<"__builtin_ia32_permvarsf512">,
      Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_permvar_si_512 : GCCBuiltin<"__builtin_ia32_permvarsi512">,
+  def int_x86_avx512_permvar_si_512 : ClangBuiltin<"__builtin_ia32_permvarsi512">,
      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
 }

 // Pack ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_packsswb : GCCBuiltin<"__builtin_ia32_packsswb">,
+  def int_x86_mmx_packsswb : ClangBuiltin<"__builtin_ia32_packsswb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_packssdw : GCCBuiltin<"__builtin_ia32_packssdw">,
+  def int_x86_mmx_packssdw : ClangBuiltin<"__builtin_ia32_packssdw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_packuswb : GCCBuiltin<"__builtin_ia32_packuswb">,
+  def int_x86_mmx_packuswb : ClangBuiltin<"__builtin_ia32_packuswb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }

 // Unpacking ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_punpckhbw : GCCBuiltin<"__builtin_ia32_punpckhbw">,
+  def int_x86_mmx_punpckhbw : ClangBuiltin<"__builtin_ia32_punpckhbw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckhwd : GCCBuiltin<"__builtin_ia32_punpckhwd">,
+  def int_x86_mmx_punpckhwd : ClangBuiltin<"__builtin_ia32_punpckhwd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckhdq : GCCBuiltin<"__builtin_ia32_punpckhdq">,
+  def int_x86_mmx_punpckhdq : ClangBuiltin<"__builtin_ia32_punpckhdq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpcklbw : GCCBuiltin<"__builtin_ia32_punpcklbw">,
+  def int_x86_mmx_punpcklbw : ClangBuiltin<"__builtin_ia32_punpcklbw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpcklwd : GCCBuiltin<"__builtin_ia32_punpcklwd">,
+  def int_x86_mmx_punpcklwd : ClangBuiltin<"__builtin_ia32_punpcklwd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckldq : GCCBuiltin<"__builtin_ia32_punpckldq">,
+  def int_x86_mmx_punpckldq : ClangBuiltin<"__builtin_ia32_punpckldq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }

 // Integer comparison ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_pcmpeq_b : GCCBuiltin<"__builtin_ia32_pcmpeqb">,
+  def int_x86_mmx_pcmpeq_b : ClangBuiltin<"__builtin_ia32_pcmpeqb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_w : GCCBuiltin<"__builtin_ia32_pcmpeqw">,
+  def int_x86_mmx_pcmpeq_w : ClangBuiltin<"__builtin_ia32_pcmpeqw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_d : GCCBuiltin<"__builtin_ia32_pcmpeqd">,
+  def int_x86_mmx_pcmpeq_d : ClangBuiltin<"__builtin_ia32_pcmpeqd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpgt_b : GCCBuiltin<"__builtin_ia32_pcmpgtb">,
+  def int_x86_mmx_pcmpgt_b : ClangBuiltin<"__builtin_ia32_pcmpgtb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_w : GCCBuiltin<"__builtin_ia32_pcmpgtw">,
+  def int_x86_mmx_pcmpgt_w : ClangBuiltin<"__builtin_ia32_pcmpgtw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_d : GCCBuiltin<"__builtin_ia32_pcmpgtd">,
+  def int_x86_mmx_pcmpgt_d : ClangBuiltin<"__builtin_ia32_pcmpgtd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }

 // Misc.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_maskmovq : GCCBuiltin<"__builtin_ia32_maskmovq">,
+  def int_x86_mmx_maskmovq : ClangBuiltin<"__builtin_ia32_maskmovq">,
      Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;
-  def int_x86_mmx_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb">,
+  def int_x86_mmx_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb">,
      Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_movnt_dq : GCCBuiltin<"__builtin_ia32_movntq">,
+  def int_x86_mmx_movnt_dq : ClangBuiltin<"__builtin_ia32_movntq">,
      Intrinsic<[], [llvm_ptrx86mmx_ty, llvm_x86mmx_ty], []>;
-  def int_x86_mmx_palignr_b : GCCBuiltin<"__builtin_ia32_palignr">,
+  def int_x86_mmx_palignr_b : ClangBuiltin<"__builtin_ia32_palignr">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+  def int_x86_mmx_pextr_w : ClangBuiltin<"__builtin_ia32_vec_ext_v4hi">,
      Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">,
+  def int_x86_mmx_pinsr_w : ClangBuiltin<"__builtin_ia32_vec_set_v4hi">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
@@ -2437,21 +2447,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // BMI
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_bmi_bextr_32 : GCCBuiltin<"__builtin_ia32_bextr_u32">,
+  def int_x86_bmi_bextr_32 : ClangBuiltin<"__builtin_ia32_bextr_u32">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_bmi_bextr_64 : GCCBuiltin<"__builtin_ia32_bextr_u64">,
+  def int_x86_bmi_bextr_64 : ClangBuiltin<"__builtin_ia32_bextr_u64">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_bmi_bzhi_32 : GCCBuiltin<"__builtin_ia32_bzhi_si">,
+  def int_x86_bmi_bzhi_32 : ClangBuiltin<"__builtin_ia32_bzhi_si">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_bmi_bzhi_64 : GCCBuiltin<"__builtin_ia32_bzhi_di">,
+  def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_bmi_pdep_32 : GCCBuiltin<"__builtin_ia32_pdep_si">,
+  def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_bmi_pdep_64 : GCCBuiltin<"__builtin_ia32_pdep_di">,
+  def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_bmi_pext_32 : GCCBuiltin<"__builtin_ia32_pext_si">,
+  def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_bmi_pext_64 : GCCBuiltin<"__builtin_ia32_pext_di">,
+  def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
 }
@@ -2459,34 +2469,34 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // FS/GS Base
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_rdfsbase_32 : GCCBuiltin<"__builtin_ia32_rdfsbase32">,
+  def int_x86_rdfsbase_32 : ClangBuiltin<"__builtin_ia32_rdfsbase32">,
      Intrinsic<[llvm_i32_ty], []>;
-  def int_x86_rdgsbase_32 : GCCBuiltin<"__builtin_ia32_rdgsbase32">,
+  def int_x86_rdgsbase_32 : ClangBuiltin<"__builtin_ia32_rdgsbase32">,
      Intrinsic<[llvm_i32_ty], []>;
-  def int_x86_rdfsbase_64 : GCCBuiltin<"__builtin_ia32_rdfsbase64">,
+  def int_x86_rdfsbase_64 : ClangBuiltin<"__builtin_ia32_rdfsbase64">,
      Intrinsic<[llvm_i64_ty], []>;
-  def int_x86_rdgsbase_64 : GCCBuiltin<"__builtin_ia32_rdgsbase64">,
+  def int_x86_rdgsbase_64 : ClangBuiltin<"__builtin_ia32_rdgsbase64">,
      Intrinsic<[llvm_i64_ty], []>;
-  def int_x86_wrfsbase_32 : GCCBuiltin<"__builtin_ia32_wrfsbase32">,
+  def int_x86_wrfsbase_32 : ClangBuiltin<"__builtin_ia32_wrfsbase32">,
      Intrinsic<[], [llvm_i32_ty]>;
-  def int_x86_wrgsbase_32 : GCCBuiltin<"__builtin_ia32_wrgsbase32">,
+  def int_x86_wrgsbase_32 : ClangBuiltin<"__builtin_ia32_wrgsbase32">,
      Intrinsic<[], [llvm_i32_ty]>;
-  def int_x86_wrfsbase_64 : GCCBuiltin<"__builtin_ia32_wrfsbase64">,
+  def int_x86_wrfsbase_64 : ClangBuiltin<"__builtin_ia32_wrfsbase64">,
      Intrinsic<[], [llvm_i64_ty]>;
-  def int_x86_wrgsbase_64 : GCCBuiltin<"__builtin_ia32_wrgsbase64">,
+  def int_x86_wrgsbase_64 : ClangBuiltin<"__builtin_ia32_wrgsbase64">,
      Intrinsic<[], [llvm_i64_ty]>;
 }

 //===----------------------------------------------------------------------===//
 // FXSR
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_fxrstor : GCCBuiltin<"__builtin_ia32_fxrstor">,
+  def int_x86_fxrstor : ClangBuiltin<"__builtin_ia32_fxrstor">,
      Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_fxrstor64 : GCCBuiltin<"__builtin_ia32_fxrstor64">,
+  def int_x86_fxrstor64 : ClangBuiltin<"__builtin_ia32_fxrstor64">,
      Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_fxsave : GCCBuiltin<"__builtin_ia32_fxsave">,
+  def int_x86_fxsave : ClangBuiltin<"__builtin_ia32_fxsave">,
      Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_fxsave64 : GCCBuiltin<"__builtin_ia32_fxsave64">,
+  def int_x86_fxsave64 : ClangBuiltin<"__builtin_ia32_fxsave64">,
      Intrinsic<[], [llvm_ptr_ty], []>;
 }
@@ -2526,44 +2536,44 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 //===----------------------------------------------------------------------===//
 // CLFLUSHOPT and CLWB
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_clflushopt : GCCBuiltin<"__builtin_ia32_clflushopt">,
+  def int_x86_clflushopt : ClangBuiltin<"__builtin_ia32_clflushopt">,
      Intrinsic<[], [llvm_ptr_ty], []>;
-  def int_x86_clwb : GCCBuiltin<"__builtin_ia32_clwb">,
+  def int_x86_clwb : ClangBuiltin<"__builtin_ia32_clwb">,
      Intrinsic<[], [llvm_ptr_ty], []>;
 }

 //===----------------------------------------------------------------------===//
 // Support protection key
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_rdpkru : GCCBuiltin <"__builtin_ia32_rdpkru">,
+  def int_x86_rdpkru : ClangBuiltin <"__builtin_ia32_rdpkru">,
      Intrinsic<[llvm_i32_ty], [], []>;
-  def int_x86_wrpkru : GCCBuiltin<"__builtin_ia32_wrpkru">,
+  def int_x86_wrpkru : ClangBuiltin<"__builtin_ia32_wrpkru">,
      Intrinsic<[], [llvm_i32_ty], []>;
 }

 //===----------------------------------------------------------------------===//
 // Half float conversion
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">,
+  def int_x86_vcvtps2ph_128 : ClangBuiltin<"__builtin_ia32_vcvtps2ph">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">,
+  def int_x86_vcvtps2ph_256 : ClangBuiltin<"__builtin_ia32_vcvtps2ph256">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_vcvtph2ps_512 :
      Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
-  def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">,
+  def int_x86_avx512_mask_vcvtps2ph_512 : ClangBuiltin<"__builtin_ia32_vcvtps2ph512_mask">,
      Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_mask_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256_mask">,
+  def int_x86_avx512_mask_vcvtps2ph_256 : ClangBuiltin<"__builtin_ia32_vcvtps2ph256_mask">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_mask_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph_mask">,
+  def int_x86_avx512_mask_vcvtps2ph_128 : ClangBuiltin<"__builtin_ia32_vcvtps2ph_mask">,
      Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
@@ -2573,10 +2583,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // TBM
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_tbm_bextri_u32 : GCCBuiltin<"__builtin_ia32_bextri_u32">,
+  def int_x86_tbm_bextri_u32 : ClangBuiltin<"__builtin_ia32_bextri_u32">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_tbm_bextri_u64 : GCCBuiltin<"__builtin_ia32_bextri_u64">,
+  def int_x86_tbm_bextri_u64 : ClangBuiltin<"__builtin_ia32_bextri_u64">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
@@ -2619,13 +2629,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // RTM intrinsics. Transactional Memory support.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_xbegin : GCCBuiltin<"__builtin_ia32_xbegin">,
+  def int_x86_xbegin : ClangBuiltin<"__builtin_ia32_xbegin">,
      Intrinsic<[llvm_i32_ty], [], []>;
-  def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
+  def int_x86_xend : ClangBuiltin<"__builtin_ia32_xend">,
      Intrinsic<[], [], []>;
-  def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
+  def int_x86_xabort : ClangBuiltin<"__builtin_ia32_xabort">,
      Intrinsic<[], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
-  def int_x86_xtest : GCCBuiltin<"__builtin_ia32_xtest">,
+  def int_x86_xtest : ClangBuiltin<"__builtin_ia32_xtest">,
      Intrinsic<[llvm_i32_ty], [], []>;
 }
@@ -2664,86 +2674,86 @@ let TargetPrefix = "x86" in {
 // Conversion ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">,
+  def int_x86_avx512_cvttss2si : ClangBuiltin<"__builtin_ia32_vcvttss2si32">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvttss2si64 : GCCBuiltin<"__builtin_ia32_vcvttss2si64">,
+  def int_x86_avx512_cvttss2si64 : ClangBuiltin<"__builtin_ia32_vcvttss2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvttss2usi : GCCBuiltin<"__builtin_ia32_vcvttss2usi32">,
+  def int_x86_avx512_cvttss2usi : ClangBuiltin<"__builtin_ia32_vcvttss2usi32">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvttss2usi64 : GCCBuiltin<"__builtin_ia32_vcvttss2usi64">,
+  def int_x86_avx512_cvttss2usi64 : ClangBuiltin<"__builtin_ia32_vcvttss2usi64">,
      Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvtusi2ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss32">,
+  def int_x86_avx512_cvtusi2ss : ClangBuiltin<"__builtin_ia32_cvtusi2ss32">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cvtusi642ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss64">,
+  def int_x86_avx512_cvtusi642ss : ClangBuiltin<"__builtin_ia32_cvtusi2ss64">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cvttsd2si : GCCBuiltin<"__builtin_ia32_vcvttsd2si32">,
+  def int_x86_avx512_cvttsd2si : ClangBuiltin<"__builtin_ia32_vcvttsd2si32">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_vcvttsd2si64">,
+  def int_x86_avx512_cvttsd2si64 : ClangBuiltin<"__builtin_ia32_vcvttsd2si64">,
      Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_avx512_cvttsd2usi : GCCBuiltin<"__builtin_ia32_vcvttsd2usi32">,
+  def int_x86_avx512_cvttsd2usi : ClangBuiltin<"__builtin_ia32_vcvttsd2usi32">,
ClangBuiltin<"__builtin_ia32_vcvttsd2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvttsd2usi64">, + def int_x86_avx512_cvttsd2usi64 : ClangBuiltin<"__builtin_ia32_vcvttsd2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd64">, + def int_x86_avx512_cvtusi642sd : ClangBuiltin<"__builtin_ia32_cvtusi2sd64">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtss2usi32 : GCCBuiltin<"__builtin_ia32_vcvtss2usi32">, + def int_x86_avx512_vcvtss2usi32 : ClangBuiltin<"__builtin_ia32_vcvtss2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtss2usi64 : GCCBuiltin<"__builtin_ia32_vcvtss2usi64">, + def int_x86_avx512_vcvtss2usi64 : ClangBuiltin<"__builtin_ia32_vcvtss2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtss2si32 : GCCBuiltin<"__builtin_ia32_vcvtss2si32">, + def int_x86_avx512_vcvtss2si32 : ClangBuiltin<"__builtin_ia32_vcvtss2si32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtss2si64 : GCCBuiltin<"__builtin_ia32_vcvtss2si64">, + def int_x86_avx512_vcvtss2si64 : ClangBuiltin<"__builtin_ia32_vcvtss2si64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtsd2usi32 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi32">, + def int_x86_avx512_vcvtsd2usi32 : ClangBuiltin<"__builtin_ia32_vcvtsd2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi64">, + def int_x86_avx512_vcvtsd2usi64 : ClangBuiltin<"__builtin_ia32_vcvtsd2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtsd2si32 : GCCBuiltin<"__builtin_ia32_vcvtsd2si32">, + def int_x86_avx512_vcvtsd2si32 : ClangBuiltin<"__builtin_ia32_vcvtsd2si32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_vcvtsd2si64 : GCCBuiltin<"__builtin_ia32_vcvtsd2si64">, + def int_x86_avx512_vcvtsd2si64 : ClangBuiltin<"__builtin_ia32_vcvtsd2si64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">, + def int_x86_avx512_cvtsi2ss32 : ClangBuiltin<"__builtin_ia32_cvtsi2ss32">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">, + def int_x86_avx512_cvtsi2ss64 : ClangBuiltin<"__builtin_ia32_cvtsi2ss64">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">, + def int_x86_avx512_cvtsi2sd64 : ClangBuiltin<"__builtin_ia32_cvtsi2sd64">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // Pack ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
-  def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">,
+  def int_x86_avx512_packsswb_512 : ClangBuiltin<"__builtin_ia32_packsswb512">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">,
+  def int_x86_avx512_packssdw_512 : ClangBuiltin<"__builtin_ia32_packssdw512">,
      Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">,
+  def int_x86_avx512_packuswb_512 : ClangBuiltin<"__builtin_ia32_packuswb512">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">,
+  def int_x86_avx512_packusdw_512 : ClangBuiltin<"__builtin_ia32_packusdw512">,
      Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
 }
@@ -2759,380 +2769,380 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                [IntrNoMem, ImmArg>]>;
   def int_x86_avx512_mask_cvtpd2dq_128 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2dq128_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2dq128_mask">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cvtpd2dq_512 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2dq512_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2dq512_mask">,
      Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_cvtpd2ps_512 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2ps512_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2ps512_mask">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_cvtsd2ss_round :
-      GCCBuiltin<"__builtin_ia32_cvtsd2ss_round_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtsd2ss_round_mask">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_cvtss2sd_round :
-      GCCBuiltin<"__builtin_ia32_cvtss2sd_round_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtss2sd_round_mask">,
      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_cvtpd2ps :
-      GCCBuiltin<"__builtin_ia32_cvtpd2ps_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2ps_mask">,
      Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cvtpd2qq_128 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2qq128_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2qq128_mask">,
      Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cvtpd2qq_256 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2qq256_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2qq256_mask">,
      Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cvtpd2qq_512 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2qq512_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2qq512_mask">,
      Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_cvtpd2udq_128 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2udq128_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2udq128_mask">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cvtpd2udq_256 :
-      GCCBuiltin<"__builtin_ia32_cvtpd2udq256_mask">,
+      ClangBuiltin<"__builtin_ia32_cvtpd2udq256_mask">,
ClangBuiltin<"__builtin_ia32_cvtpd2udq256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2udq_512 : - GCCBuiltin<"__builtin_ia32_cvtpd2udq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtpd2udq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2uqq_128 : - GCCBuiltin<"__builtin_ia32_cvtpd2uqq128_mask">, + ClangBuiltin<"__builtin_ia32_cvtpd2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2uqq_256 : - GCCBuiltin<"__builtin_ia32_cvtpd2uqq256_mask">, + ClangBuiltin<"__builtin_ia32_cvtpd2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2uqq_512 : - GCCBuiltin<"__builtin_ia32_cvtpd2uqq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtpd2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2dq_128 : - GCCBuiltin<"__builtin_ia32_cvtps2dq128_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2dq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2dq_256 : - GCCBuiltin<"__builtin_ia32_cvtps2dq256_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2dq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2dq_512 : - GCCBuiltin<"__builtin_ia32_cvtps2dq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2dq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2pd_512 : - GCCBuiltin<"__builtin_ia32_cvtps2pd512_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2qq_128 : - GCCBuiltin<"__builtin_ia32_cvtps2qq128_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2qq_256 : - GCCBuiltin<"__builtin_ia32_cvtps2qq256_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2qq_512 : - GCCBuiltin<"__builtin_ia32_cvtps2qq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2udq_128 : - GCCBuiltin<"__builtin_ia32_cvtps2udq128_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2udq_256 : - GCCBuiltin<"__builtin_ia32_cvtps2udq256_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2udq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2udq_512 : - GCCBuiltin<"__builtin_ia32_cvtps2udq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2udq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2uqq_128 : - GCCBuiltin<"__builtin_ia32_cvtps2uqq128_mask">, + 
ClangBuiltin<"__builtin_ia32_cvtps2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2uqq_256 : - GCCBuiltin<"__builtin_ia32_cvtps2uqq256_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2uqq_512 : - GCCBuiltin<"__builtin_ia32_cvtps2uqq512_mask">, + ClangBuiltin<"__builtin_ia32_cvtps2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtqq2ps_128 : - GCCBuiltin<"__builtin_ia32_cvtqq2ps128_mask">, + ClangBuiltin<"__builtin_ia32_cvtqq2ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2dq_128 : - GCCBuiltin<"__builtin_ia32_cvttpd2dq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2dq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2dq_512 : - GCCBuiltin<"__builtin_ia32_cvttpd2dq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2dq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2qq_128 : - GCCBuiltin<"__builtin_ia32_cvttpd2qq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2qq_256 : - GCCBuiltin<"__builtin_ia32_cvttpd2qq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2qq_512 : - GCCBuiltin<"__builtin_ia32_cvttpd2qq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2udq_128 : - GCCBuiltin<"__builtin_ia32_cvttpd2udq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2udq_256 : - GCCBuiltin<"__builtin_ia32_cvttpd2udq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2udq256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2udq_512 : - GCCBuiltin<"__builtin_ia32_cvttpd2udq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2udq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2uqq_128 : - GCCBuiltin<"__builtin_ia32_cvttpd2uqq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2uqq_256 : - GCCBuiltin<"__builtin_ia32_cvttpd2uqq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2uqq_512 : - GCCBuiltin<"__builtin_ia32_cvttpd2uqq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttpd2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2dq_512 : - GCCBuiltin<"__builtin_ia32_cvttps2dq512_mask">, + 
ClangBuiltin<"__builtin_ia32_cvttps2dq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2qq_128 : - GCCBuiltin<"__builtin_ia32_cvttps2qq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2qq_256 : - GCCBuiltin<"__builtin_ia32_cvttps2qq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2qq_512 : - GCCBuiltin<"__builtin_ia32_cvttps2qq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2udq_128 : - GCCBuiltin<"__builtin_ia32_cvttps2udq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2udq_256 : - GCCBuiltin<"__builtin_ia32_cvttps2udq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2udq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2udq_512 : - GCCBuiltin<"__builtin_ia32_cvttps2udq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2udq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2uqq_128 : - GCCBuiltin<"__builtin_ia32_cvttps2uqq128_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2uqq_256 : - GCCBuiltin<"__builtin_ia32_cvttps2uqq256_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2uqq_512 : - GCCBuiltin<"__builtin_ia32_cvttps2uqq512_mask">, + ClangBuiltin<"__builtin_ia32_cvttps2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtuqq2ps_128 : - GCCBuiltin<"__builtin_ia32_cvtuqq2ps128_mask">, + ClangBuiltin<"__builtin_ia32_cvtuqq2ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">, + def int_x86_avx512_mask_rndscale_pd_128 : ClangBuiltin<"__builtin_ia32_rndscalepd_128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">, + def int_x86_avx512_mask_rndscale_pd_256 : ClangBuiltin<"__builtin_ia32_rndscalepd_256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">, + def int_x86_avx512_mask_rndscale_pd_512 : ClangBuiltin<"__builtin_ia32_rndscalepd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">, + def 
int_x86_avx512_mask_rndscale_ps_128 : ClangBuiltin<"__builtin_ia32_rndscaleps_128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">, + def int_x86_avx512_mask_rndscale_ps_256 : ClangBuiltin<"__builtin_ia32_rndscaleps_256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">, + def int_x86_avx512_mask_rndscale_ps_512 : ClangBuiltin<"__builtin_ia32_rndscaleps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">, + def int_x86_avx512_mask_reduce_pd_128 : ClangBuiltin<"__builtin_ia32_reducepd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_reduce_pd_256 : GCCBuiltin<"__builtin_ia32_reducepd256_mask">, + def int_x86_avx512_mask_reduce_pd_256 : ClangBuiltin<"__builtin_ia32_reducepd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_reduce_pd_512 : GCCBuiltin<"__builtin_ia32_reducepd512_mask">, + def int_x86_avx512_mask_reduce_pd_512 : ClangBuiltin<"__builtin_ia32_reducepd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_reduce_ps_128 : GCCBuiltin<"__builtin_ia32_reduceps128_mask">, + def int_x86_avx512_mask_reduce_ps_128 : ClangBuiltin<"__builtin_ia32_reduceps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_reduce_ps_256 : GCCBuiltin<"__builtin_ia32_reduceps256_mask">, + def int_x86_avx512_mask_reduce_ps_256 : ClangBuiltin<"__builtin_ia32_reduceps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_reduce_ps_512 : GCCBuiltin<"__builtin_ia32_reduceps512_mask">, + def int_x86_avx512_mask_reduce_ps_512 : ClangBuiltin<"__builtin_ia32_reduceps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; -def int_x86_avx512_mask_range_pd_128 : GCCBuiltin<"__builtin_ia32_rangepd128_mask">, +def int_x86_avx512_mask_range_pd_128 : ClangBuiltin<"__builtin_ia32_rangepd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; -def int_x86_avx512_mask_range_pd_256 : GCCBuiltin<"__builtin_ia32_rangepd256_mask">, +def int_x86_avx512_mask_range_pd_256 : ClangBuiltin<"__builtin_ia32_rangepd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; -def int_x86_avx512_mask_range_pd_512 : GCCBuiltin<"__builtin_ia32_rangepd512_mask">, +def int_x86_avx512_mask_range_pd_512 : ClangBuiltin<"__builtin_ia32_rangepd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; -def int_x86_avx512_mask_range_ps_128 : 
GCCBuiltin<"__builtin_ia32_rangeps128_mask">, +def int_x86_avx512_mask_range_ps_128 : ClangBuiltin<"__builtin_ia32_rangeps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; -def int_x86_avx512_mask_range_ps_256 : GCCBuiltin<"__builtin_ia32_rangeps256_mask">, +def int_x86_avx512_mask_range_ps_256 : ClangBuiltin<"__builtin_ia32_rangeps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; -def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mask">, +def int_x86_avx512_mask_range_ps_512 : ClangBuiltin<"__builtin_ia32_rangeps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; @@ -3141,152 +3151,152 @@ def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mas // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_broadcastmw_512 : - GCCBuiltin<"__builtin_ia32_broadcastmw512">, + ClangBuiltin<"__builtin_ia32_broadcastmw512">, Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmw_256 : - GCCBuiltin<"__builtin_ia32_broadcastmw256">, + ClangBuiltin<"__builtin_ia32_broadcastmw256">, Intrinsic<[llvm_v8i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmw_128 : - GCCBuiltin<"__builtin_ia32_broadcastmw128">, + ClangBuiltin<"__builtin_ia32_broadcastmw128">, Intrinsic<[llvm_v4i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_512 : - GCCBuiltin<"__builtin_ia32_broadcastmb512">, + ClangBuiltin<"__builtin_ia32_broadcastmb512">, Intrinsic<[llvm_v8i64_ty], [llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_256 : - GCCBuiltin<"__builtin_ia32_broadcastmb256">, + ClangBuiltin<"__builtin_ia32_broadcastmb256">, Intrinsic<[llvm_v4i64_ty], [llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_128 : - GCCBuiltin<"__builtin_ia32_broadcastmb128">, + ClangBuiltin<"__builtin_ia32_broadcastmb128">, Intrinsic<[llvm_v2i64_ty], [llvm_i8_ty], [IntrNoMem]>; } // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
- def int_x86_avx512_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512">, + def int_x86_avx512_add_ps_512 : ClangBuiltin<"__builtin_ia32_addps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512">, + def int_x86_avx512_add_pd_512 : ClangBuiltin<"__builtin_ia32_addpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512">, + def int_x86_avx512_sub_ps_512 : ClangBuiltin<"__builtin_ia32_subps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512">, + def int_x86_avx512_sub_pd_512 : ClangBuiltin<"__builtin_ia32_subpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512">, + def int_x86_avx512_mul_ps_512 : ClangBuiltin<"__builtin_ia32_mulps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512">, + def int_x86_avx512_mul_pd_512 : ClangBuiltin<"__builtin_ia32_mulpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512">, + def int_x86_avx512_div_ps_512 : ClangBuiltin<"__builtin_ia32_divps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512">, + def int_x86_avx512_div_pd_512 : ClangBuiltin<"__builtin_ia32_divpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">, + def int_x86_avx512_max_ps_512 : ClangBuiltin<"__builtin_ia32_maxps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">, + def int_x86_avx512_max_pd_512 : ClangBuiltin<"__builtin_ia32_maxpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">, + def int_x86_avx512_min_ps_512 : ClangBuiltin<"__builtin_ia32_minps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">, + def int_x86_avx512_min_pd_512 : ClangBuiltin<"__builtin_ia32_minpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">, + def int_x86_avx512_mask_add_ss_round : ClangBuiltin<"__builtin_ia32_addss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round_mask">, + def int_x86_avx512_mask_div_ss_round : ClangBuiltin<"__builtin_ia32_divss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def 
int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round_mask">, + def int_x86_avx512_mask_mul_ss_round : ClangBuiltin<"__builtin_ia32_mulss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round_mask">, + def int_x86_avx512_mask_sub_ss_round : ClangBuiltin<"__builtin_ia32_subss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round_mask">, + def int_x86_avx512_mask_max_ss_round : ClangBuiltin<"__builtin_ia32_maxss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round_mask">, + def int_x86_avx512_mask_min_ss_round : ClangBuiltin<"__builtin_ia32_minss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round_mask">, + def int_x86_avx512_mask_add_sd_round : ClangBuiltin<"__builtin_ia32_addsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round_mask">, + def int_x86_avx512_mask_div_sd_round : ClangBuiltin<"__builtin_ia32_divsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round_mask">, + def int_x86_avx512_mask_mul_sd_round : ClangBuiltin<"__builtin_ia32_mulsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round_mask">, + def int_x86_avx512_mask_sub_sd_round : ClangBuiltin<"__builtin_ia32_subsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round_mask">, + def int_x86_avx512_mask_max_sd_round : ClangBuiltin<"__builtin_ia32_maxsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round_mask">, + def int_x86_avx512_mask_min_sd_round : ClangBuiltin<"__builtin_ia32_minsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round_mask">, + def int_x86_avx512_mask_rndscale_ss : ClangBuiltin<"__builtin_ia32_rndscaless_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round_mask">, + def int_x86_avx512_mask_rndscale_sd : ClangBuiltin<"__builtin_ia32_rndscalesd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, 
llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round_mask">, + def int_x86_avx512_mask_range_ss : ClangBuiltin<"__builtin_ia32_rangess128_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round_mask">, + def int_x86_avx512_mask_range_sd : ClangBuiltin<"__builtin_ia32_rangesd128_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">, + def int_x86_avx512_mask_reduce_ss : ClangBuiltin<"__builtin_ia32_reducess_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">, + def int_x86_avx512_mask_reduce_sd : ClangBuiltin<"__builtin_ia32_reducesd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_scalef_sd : GCCBuiltin<"__builtin_ia32_scalefsd_round_mask">, + def int_x86_avx512_mask_scalef_sd : ClangBuiltin<"__builtin_ia32_scalefsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_scalef_ss : GCCBuiltin<"__builtin_ia32_scalefss_round_mask">, + def int_x86_avx512_mask_scalef_ss : ClangBuiltin<"__builtin_ia32_scalefss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_scalef_pd_128 : GCCBuiltin<"__builtin_ia32_scalefpd128_mask">, + def int_x86_avx512_mask_scalef_pd_128 : ClangBuiltin<"__builtin_ia32_scalefpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_scalef_pd_256 : GCCBuiltin<"__builtin_ia32_scalefpd256_mask">, + def int_x86_avx512_mask_scalef_pd_256 : ClangBuiltin<"__builtin_ia32_scalefpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],[IntrNoMem]>; - def int_x86_avx512_mask_scalef_pd_512 : GCCBuiltin<"__builtin_ia32_scalefpd512_mask">, + def int_x86_avx512_mask_scalef_pd_512 : ClangBuiltin<"__builtin_ia32_scalefpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_scalef_ps_128 : GCCBuiltin<"__builtin_ia32_scalefps128_mask">, + def int_x86_avx512_mask_scalef_ps_128 : ClangBuiltin<"__builtin_ia32_scalefps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_scalef_ps_256 : GCCBuiltin<"__builtin_ia32_scalefps256_mask">, + def int_x86_avx512_mask_scalef_ps_256 : ClangBuiltin<"__builtin_ia32_scalefps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_scalef_ps_512 : GCCBuiltin<"__builtin_ia32_scalefps512_mask">, + def int_x86_avx512_mask_scalef_ps_512 : 
ClangBuiltin<"__builtin_ia32_scalefps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -3307,290 +3317,290 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_128 : - GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_128 : - GCCBuiltin<"__builtin_ia32_fixupimmpd128_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmpd128_maskz">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_256 : - GCCBuiltin<"__builtin_ia32_fixupimmpd256_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_256 : - GCCBuiltin<"__builtin_ia32_fixupimmpd256_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmpd256_maskz">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_512 : - GCCBuiltin<"__builtin_ia32_fixupimmpd512_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_512 : - GCCBuiltin<"__builtin_ia32_fixupimmpd512_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmpd512_maskz">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_128 : - GCCBuiltin<"__builtin_ia32_fixupimmps128_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_128 : - GCCBuiltin<"__builtin_ia32_fixupimmps128_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmps128_maskz">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_256 : - GCCBuiltin<"__builtin_ia32_fixupimmps256_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_256 : - GCCBuiltin<"__builtin_ia32_fixupimmps256_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmps256_maskz">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_512 : - GCCBuiltin<"__builtin_ia32_fixupimmps512_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_512 : - GCCBuiltin<"__builtin_ia32_fixupimmps512_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmps512_maskz">, 
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_sd : - GCCBuiltin<"__builtin_ia32_fixupimmsd_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmsd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_sd : - GCCBuiltin<"__builtin_ia32_fixupimmsd_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmsd_maskz">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ss : - GCCBuiltin<"__builtin_ia32_fixupimmss_mask">, + ClangBuiltin<"__builtin_ia32_fixupimmss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ss : - GCCBuiltin<"__builtin_ia32_fixupimmss_maskz">, + ClangBuiltin<"__builtin_ia32_fixupimmss_maskz">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_mask_getexp_pd_128 : GCCBuiltin<"__builtin_ia32_getexppd128_mask">, + def int_x86_avx512_mask_getexp_pd_128 : ClangBuiltin<"__builtin_ia32_getexppd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_getexp_pd_256 : GCCBuiltin<"__builtin_ia32_getexppd256_mask">, + def int_x86_avx512_mask_getexp_pd_256 : ClangBuiltin<"__builtin_ia32_getexppd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_getexp_pd_512 : GCCBuiltin<"__builtin_ia32_getexppd512_mask">, + def int_x86_avx512_mask_getexp_pd_512 : ClangBuiltin<"__builtin_ia32_getexppd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_getexp_ps_128 : GCCBuiltin<"__builtin_ia32_getexpps128_mask">, + def int_x86_avx512_mask_getexp_ps_128 : ClangBuiltin<"__builtin_ia32_getexpps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_getexp_ps_256 : GCCBuiltin<"__builtin_ia32_getexpps256_mask">, + def int_x86_avx512_mask_getexp_ps_256 : ClangBuiltin<"__builtin_ia32_getexpps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_getexp_ps_512 : GCCBuiltin<"__builtin_ia32_getexpps512_mask">, + def int_x86_avx512_mask_getexp_ps_512 : ClangBuiltin<"__builtin_ia32_getexpps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_getexp_ss : GCCBuiltin<"__builtin_ia32_getexpss128_round_mask">, + def int_x86_avx512_mask_getexp_ss : ClangBuiltin<"__builtin_ia32_getexpss128_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_mask_getexp_sd : GCCBuiltin<"__builtin_ia32_getexpsd128_round_mask">, + def int_x86_avx512_mask_getexp_sd : ClangBuiltin<"__builtin_ia32_getexpsd128_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def 
int_x86_avx512_mask_getmant_pd_128 : - GCCBuiltin<"__builtin_ia32_getmantpd128_mask">, + ClangBuiltin<"__builtin_ia32_getmantpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_pd_256 : - GCCBuiltin<"__builtin_ia32_getmantpd256_mask">, + ClangBuiltin<"__builtin_ia32_getmantpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_pd_512 : - GCCBuiltin<"__builtin_ia32_getmantpd512_mask">, + ClangBuiltin<"__builtin_ia32_getmantpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty,llvm_i32_ty ], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_128 : - GCCBuiltin<"__builtin_ia32_getmantps128_mask">, + ClangBuiltin<"__builtin_ia32_getmantps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_256 : - GCCBuiltin<"__builtin_ia32_getmantps256_mask">, + ClangBuiltin<"__builtin_ia32_getmantps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_512 : - GCCBuiltin<"__builtin_ia32_getmantps512_mask">, + ClangBuiltin<"__builtin_ia32_getmantps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,llvm_i32_ty, llvm_v16f32_ty,llvm_i16_ty,llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_ss : - GCCBuiltin<"__builtin_ia32_getmantss_round_mask">, + ClangBuiltin<"__builtin_ia32_getmantss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_sd : - GCCBuiltin<"__builtin_ia32_getmantsd_round_mask">, + ClangBuiltin<"__builtin_ia32_getmantsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">, + def int_x86_avx512_rsqrt14_ss : ClangBuiltin<"__builtin_ia32_rsqrt14ss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd_mask">, + def int_x86_avx512_rsqrt14_sd : ClangBuiltin<"__builtin_ia32_rsqrt14sd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_pd_128 : GCCBuiltin<"__builtin_ia32_rsqrt14pd128_mask">, + def int_x86_avx512_rsqrt14_pd_128 : ClangBuiltin<"__builtin_ia32_rsqrt14pd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_pd_256 : GCCBuiltin<"__builtin_ia32_rsqrt14pd256_mask">, + def int_x86_avx512_rsqrt14_pd_256 : ClangBuiltin<"__builtin_ia32_rsqrt14pd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512_mask">, + def int_x86_avx512_rsqrt14_pd_512 : ClangBuiltin<"__builtin_ia32_rsqrt14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ps_128 : GCCBuiltin<"__builtin_ia32_rsqrt14ps128_mask">, + def int_x86_avx512_rsqrt14_ps_128 : 
ClangBuiltin<"__builtin_ia32_rsqrt14ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrt14ps256_mask">, + def int_x86_avx512_rsqrt14_ps_256 : ClangBuiltin<"__builtin_ia32_rsqrt14ps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, + def int_x86_avx512_rsqrt14_ps_512 : ClangBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss_mask">, + def int_x86_avx512_rcp14_ss : ClangBuiltin<"__builtin_ia32_rcp14ss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd_mask">, + def int_x86_avx512_rcp14_sd : ClangBuiltin<"__builtin_ia32_rcp14sd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_pd_128 : GCCBuiltin<"__builtin_ia32_rcp14pd128_mask">, + def int_x86_avx512_rcp14_pd_128 : ClangBuiltin<"__builtin_ia32_rcp14pd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_pd_256 : GCCBuiltin<"__builtin_ia32_rcp14pd256_mask">, + def int_x86_avx512_rcp14_pd_256 : ClangBuiltin<"__builtin_ia32_rcp14pd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512_mask">, + def int_x86_avx512_rcp14_pd_512 : ClangBuiltin<"__builtin_ia32_rcp14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ps_128 : GCCBuiltin<"__builtin_ia32_rcp14ps128_mask">, + def int_x86_avx512_rcp14_ps_128 : ClangBuiltin<"__builtin_ia32_rcp14ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ps_256 : GCCBuiltin<"__builtin_ia32_rcp14ps256_mask">, + def int_x86_avx512_rcp14_ps_256 : ClangBuiltin<"__builtin_ia32_rcp14ps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512_mask">, + def int_x86_avx512_rcp14_ps_512 : ClangBuiltin<"__builtin_ia32_rcp14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_rcp28_ps : GCCBuiltin<"__builtin_ia32_rcp28ps_mask">, + def int_x86_avx512_rcp28_ps : ClangBuiltin<"__builtin_ia32_rcp28ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">, + def int_x86_avx512_rcp28_pd : ClangBuiltin<"__builtin_ia32_rcp28pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">, + def int_x86_avx512_exp2_ps : ClangBuiltin<"__builtin_ia32_exp2ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">, + def int_x86_avx512_exp2_pd 
: ClangBuiltin<"__builtin_ia32_exp2pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round_mask">, + def int_x86_avx512_rcp28_ss : ClangBuiltin<"__builtin_ia32_rcp28ss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round_mask">, + def int_x86_avx512_rcp28_sd : ClangBuiltin<"__builtin_ia32_rcp28sd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rsqrt28_ps : GCCBuiltin<"__builtin_ia32_rsqrt28ps_mask">, + def int_x86_avx512_rsqrt28_ps : ClangBuiltin<"__builtin_ia32_rsqrt28ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rsqrt28_pd : GCCBuiltin<"__builtin_ia32_rsqrt28pd_mask">, + def int_x86_avx512_rsqrt28_pd : ClangBuiltin<"__builtin_ia32_rsqrt28pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">, + def int_x86_avx512_rsqrt28_ss : ClangBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">, + def int_x86_avx512_rsqrt28_sd : ClangBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">, + def int_x86_avx512_psad_bw_512 : ClangBuiltin<"__builtin_ia32_psadbw512">, Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem, Commutative]>; } // Integer arithmetic ops let TargetPrefix = "x86" in { - def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, + def int_x86_avx512_pmulhu_w_512 : ClangBuiltin<"__builtin_ia32_pmulhuw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">, + def int_x86_avx512_pmulh_w_512 : ClangBuiltin<"__builtin_ia32_pmulhw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx512_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512">, + def int_x86_avx512_pavg_b_512 : ClangBuiltin<"__builtin_ia32_pavgb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; - def int_x86_avx512_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512">, + def int_x86_avx512_pavg_w_512 : ClangBuiltin<"__builtin_ia32_pavgw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; - def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">, + def int_x86_avx512_pmaddw_d_512 : ClangBuiltin<"__builtin_ia32_pmaddwd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx512_pmaddubs_w_512 : GCCBuiltin<"__builtin_ia32_pmaddubsw512">, + def int_x86_avx512_pmaddubs_w_512 : ClangBuiltin<"__builtin_ia32_pmaddubsw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v64i8_ty, llvm_v64i8_ty], 
[IntrNoMem]>; def int_x86_avx512_dbpsadbw_128 : - GCCBuiltin<"__builtin_ia32_dbpsadbw128">, + ClangBuiltin<"__builtin_ia32_dbpsadbw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_dbpsadbw_256 : - GCCBuiltin<"__builtin_ia32_dbpsadbw256">, + ClangBuiltin<"__builtin_ia32_dbpsadbw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_dbpsadbw_512 : - GCCBuiltin<"__builtin_ia32_dbpsadbw512">, + ClangBuiltin<"__builtin_ia32_dbpsadbw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -3838,32 +3848,32 @@ let TargetPrefix = "x86" in { // gather prefetch // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. - def int_x86_avx512_gatherpf_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfdpd">, + def int_x86_avx512_gatherpf_dpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfdpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_gatherpf_dps_512 : GCCBuiltin<"__builtin_ia32_gatherpfdps">, + def int_x86_avx512_gatherpf_dps_512 : ClangBuiltin<"__builtin_ia32_gatherpfdps">, Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_gatherpf_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfqpd">, + def int_x86_avx512_gatherpf_qpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfqpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_gatherpf_qps_512 : GCCBuiltin<"__builtin_ia32_gatherpfqps">, + def int_x86_avx512_gatherpf_qps_512 : ClangBuiltin<"__builtin_ia32_gatherpfqps">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; // scatter prefetch // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. 
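[Illustrative aside, not part of the patch: the NOTE above is the reason the gather/scatter prefetch intrinsics cannot be ArgMemOnly: with a null base the effective addresses can come entirely from the index vector, so the pointer argument does not bound the memory touched. A minimal sketch of the C-level usage, assuming the AVX512PF wrapper from <immintrin.h>, whose unmasked form expands to __builtin_ia32_gatherpfdpd, the builtin bound to int_x86_avx512_gatherpf_dpd_512 above:

    #include <immintrin.h>

    /* Build with e.g. clang -O2 -mavx512pf. Prefetches the eight doubles at
       base + vindex[i] * 8 into L1 (_MM_HINT_T0); the addresses are formed
       from base plus a per-lane index, so base alone does not bound them. */
    void prefetch_gather(const double *base, __m256i vindex) {
      _mm512_prefetch_i32gather_pd(vindex, base, 8, _MM_HINT_T0);
    }
]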
- def int_x86_avx512_scatterpf_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfdpd">, + def int_x86_avx512_scatterpf_dpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfdpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_scatterpf_dps_512 : GCCBuiltin<"__builtin_ia32_scatterpfdps">, + def int_x86_avx512_scatterpf_dps_512 : ClangBuiltin<"__builtin_ia32_scatterpfdps">, Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_scatterpf_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfqpd">, + def int_x86_avx512_scatterpf_qpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfqpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; - def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">, + def int_x86_avx512_scatterpf_qps_512 : ClangBuiltin<"__builtin_ia32_scatterpfqps">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; } @@ -4109,34 +4119,34 @@ let TargetPrefix = "x86" in { // Instructions that count the number of leading zero bits let TargetPrefix = "x86" in { def int_x86_avx512_conflict_d_128 : - GCCBuiltin<"__builtin_ia32_vpconflictsi_128">, + ClangBuiltin<"__builtin_ia32_vpconflictsi_128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_d_256 : - GCCBuiltin<"__builtin_ia32_vpconflictsi_256">, + ClangBuiltin<"__builtin_ia32_vpconflictsi_256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_d_512 : - GCCBuiltin<"__builtin_ia32_vpconflictsi_512">, + ClangBuiltin<"__builtin_ia32_vpconflictsi_512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_128 : - GCCBuiltin<"__builtin_ia32_vpconflictdi_128">, + ClangBuiltin<"__builtin_ia32_vpconflictdi_128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_256 : - GCCBuiltin<"__builtin_ia32_vpconflictdi_256">, + ClangBuiltin<"__builtin_ia32_vpconflictdi_256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_512 : - GCCBuiltin<"__builtin_ia32_vpconflictdi_512">, + ClangBuiltin<"__builtin_ia32_vpconflictdi_512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty], [IntrNoMem]>; } // Compares let TargetPrefix = "x86" in { // 512-bit - def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">, + def int_x86_avx512_vcomi_sd : ClangBuiltin<"__builtin_ia32_vcomisd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; - def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">, + def int_x86_avx512_vcomi_ss : ClangBuiltin<"__builtin_ia32_vcomiss">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; @@ -4159,152 +4169,152 @@ let TargetPrefix = "x86" in { // truncate let TargetPrefix = "x86" in { def int_x86_avx512_mask_pmov_qb_128 : - GCCBuiltin<"__builtin_ia32_pmovqb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovqb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_128 : - 
GCCBuiltin<"__builtin_ia32_pmovsqb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovsqb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_128 : - GCCBuiltin<"__builtin_ia32_pmovusqb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovusqb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qb_256 : - GCCBuiltin<"__builtin_ia32_pmovqb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovqb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_256 : - GCCBuiltin<"__builtin_ia32_pmovsqb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovsqb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_256 : - GCCBuiltin<"__builtin_ia32_pmovusqb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovusqb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qb_512 : - GCCBuiltin<"__builtin_ia32_pmovqb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovqb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_512 : - GCCBuiltin<"__builtin_ia32_pmovsqb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovsqb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_512 : - GCCBuiltin<"__builtin_ia32_pmovusqb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovusqb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qw_128 : 
- GCCBuiltin<"__builtin_ia32_pmovqw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovqw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_128 : - GCCBuiltin<"__builtin_ia32_pmovsqw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovsqw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_128 : - GCCBuiltin<"__builtin_ia32_pmovusqw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovusqw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qw_256 : - GCCBuiltin<"__builtin_ia32_pmovqw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovqw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_256 : - GCCBuiltin<"__builtin_ia32_pmovsqw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovsqw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_256 : - GCCBuiltin<"__builtin_ia32_pmovusqw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovusqw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; @@ -4313,167 +4323,167 @@ let TargetPrefix = "x86" in { [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovqw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_512 : - GCCBuiltin<"__builtin_ia32_pmovsqw512_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw512_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovsqw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_512 : - GCCBuiltin<"__builtin_ia32_pmovusqw512_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw512_mask">, 
Intrinsic<[llvm_v8i16_ty], [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovusqw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_128 : - GCCBuiltin<"__builtin_ia32_pmovqd128_mask">, + ClangBuiltin<"__builtin_ia32_pmovqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qd_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovqd128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_128 : - GCCBuiltin<"__builtin_ia32_pmovsqd128_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovsqd128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_128 : - GCCBuiltin<"__builtin_ia32_pmovusqd128_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovusqd128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_256 : - GCCBuiltin<"__builtin_ia32_pmovsqd256_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovsqd256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_256 : - GCCBuiltin<"__builtin_ia32_pmovusqd256_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqd256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovusqd256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_512 : - GCCBuiltin<"__builtin_ia32_pmovsqd512_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovsqd512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_512 : - GCCBuiltin<"__builtin_ia32_pmovusqd512_mask">, + 
ClangBuiltin<"__builtin_ia32_pmovusqd512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovusqd512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_db_128 : - GCCBuiltin<"__builtin_ia32_pmovdb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovdb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_128 : - GCCBuiltin<"__builtin_ia32_pmovsdb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovsdb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_128 : - GCCBuiltin<"__builtin_ia32_pmovusdb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovusdb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_db_256 : - GCCBuiltin<"__builtin_ia32_pmovdb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovdb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_256 : - GCCBuiltin<"__builtin_ia32_pmovsdb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovsdb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_256 : - GCCBuiltin<"__builtin_ia32_pmovusdb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovusdb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; @@ -4482,87 +4492,87 @@ let TargetPrefix = "x86" in { [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovdb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_512 : - GCCBuiltin<"__builtin_ia32_pmovsdb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i32_ty, llvm_v16i8_ty, 
llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovsdb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_512 : - GCCBuiltin<"__builtin_ia32_pmovusdb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovusdb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_dw_128 : - GCCBuiltin<"__builtin_ia32_pmovdw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovdw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_128 : - GCCBuiltin<"__builtin_ia32_pmovsdw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovsdw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_128 : - GCCBuiltin<"__builtin_ia32_pmovusdw128_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovusdw128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_dw_256 : - GCCBuiltin<"__builtin_ia32_pmovdw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovdw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_256 : - GCCBuiltin<"__builtin_ia32_pmovsdw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovsdw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_256 : - GCCBuiltin<"__builtin_ia32_pmovusdw256_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovusdw256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; @@ -4571,107 +4581,107 @@ let TargetPrefix = "x86" in { [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_512 : - 
GCCBuiltin<"__builtin_ia32_pmovdw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_512 : - GCCBuiltin<"__builtin_ia32_pmovsdw512_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovsdw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovsdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_512 : - GCCBuiltin<"__builtin_ia32_pmovusdw512_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovusdw512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovusdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_wb_128 : - GCCBuiltin<"__builtin_ia32_pmovwb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovwb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_wb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovwb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovwb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_128 : - GCCBuiltin<"__builtin_ia32_pmovswb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovswb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_128 : - GCCBuiltin<"__builtin_ia32_pmovuswb128_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_wb_mem_128 : - GCCBuiltin<"__builtin_ia32_pmovuswb128mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_wb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovwb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_256 : - GCCBuiltin<"__builtin_ia32_pmovswb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovswb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_256 : - GCCBuiltin<"__builtin_ia32_pmovuswb256_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_wb_mem_256 : - GCCBuiltin<"__builtin_ia32_pmovuswb256mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def 
int_x86_avx512_mask_pmov_wb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovwb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_512 : - GCCBuiltin<"__builtin_ia32_pmovswb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb512_mask">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovswb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovswb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_512 : - GCCBuiltin<"__builtin_ia32_pmovuswb512_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb512_mask">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_wb_mem_512 : - GCCBuiltin<"__builtin_ia32_pmovuswb512mem_mask">, + ClangBuiltin<"__builtin_ia32_pmovuswb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; @@ -4680,37 +4690,37 @@ let TargetPrefix = "x86" in { // Bitwise ternary logic let TargetPrefix = "x86" in { def int_x86_avx512_pternlog_d_128 : - GCCBuiltin<"__builtin_ia32_pternlogd128">, + ClangBuiltin<"__builtin_ia32_pternlogd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_d_256 : - GCCBuiltin<"__builtin_ia32_pternlogd256">, + ClangBuiltin<"__builtin_ia32_pternlogd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_d_512 : - GCCBuiltin<"__builtin_ia32_pternlogd512">, + ClangBuiltin<"__builtin_ia32_pternlogd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_128 : - GCCBuiltin<"__builtin_ia32_pternlogq128">, + ClangBuiltin<"__builtin_ia32_pternlogq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_256 : - GCCBuiltin<"__builtin_ia32_pternlogq256">, + ClangBuiltin<"__builtin_ia32_pternlogq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_512 : - GCCBuiltin<"__builtin_ia32_pternlogq512">, + ClangBuiltin<"__builtin_ia32_pternlogq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; @@ -4770,12 +4780,12 @@ let TargetPrefix = "x86" in { llvm_i32_ty, llvm_v2i1_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cmp_ss : - GCCBuiltin<"__builtin_ia32_cmpss_mask">, + ClangBuiltin<"__builtin_ia32_cmpss_mask">, Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_cmp_sd : - GCCBuiltin<"__builtin_ia32_cmpsd_mask">, + ClangBuiltin<"__builtin_ia32_cmpsd_mask">, Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; @@ -4784,21 +4794,21 @@ let TargetPrefix = "x86" in { //===----------------------------------------------------------------------===// // SHA intrinsics let TargetPrefix = "x86" in { - def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">, + def int_x86_sha1rnds4 : 
ClangBuiltin<"__builtin_ia32_sha1rnds4">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_sha1nexte : GCCBuiltin<"__builtin_ia32_sha1nexte">, + def int_x86_sha1nexte : ClangBuiltin<"__builtin_ia32_sha1nexte">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_sha1msg1 : GCCBuiltin<"__builtin_ia32_sha1msg1">, + def int_x86_sha1msg1 : ClangBuiltin<"__builtin_ia32_sha1msg1">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_sha1msg2 : GCCBuiltin<"__builtin_ia32_sha1msg2">, + def int_x86_sha1msg2 : ClangBuiltin<"__builtin_ia32_sha1msg2">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_sha256rnds2 : GCCBuiltin<"__builtin_ia32_sha256rnds2">, + def int_x86_sha256rnds2 : ClangBuiltin<"__builtin_ia32_sha256rnds2">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_sha256msg1 : GCCBuiltin<"__builtin_ia32_sha256msg1">, + def int_x86_sha256msg1 : ClangBuiltin<"__builtin_ia32_sha256msg1">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_sha256msg2 : GCCBuiltin<"__builtin_ia32_sha256msg2">, + def int_x86_sha256msg2 : ClangBuiltin<"__builtin_ia32_sha256msg2">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; } @@ -4806,17 +4816,17 @@ let TargetPrefix = "x86" in { // Thread synchronization ops with timer. let TargetPrefix = "x86" in { def int_x86_monitorx - : GCCBuiltin<"__builtin_ia32_monitorx">, + : ClangBuiltin<"__builtin_ia32_monitorx">, Intrinsic<[], [ llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty ], []>; def int_x86_mwaitx - : GCCBuiltin<"__builtin_ia32_mwaitx">, + : ClangBuiltin<"__builtin_ia32_mwaitx">, Intrinsic<[], [ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], []>; } //===----------------------------------------------------------------------===// // Cache-line zero let TargetPrefix = "x86" in { - def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">, + def int_x86_clzero : ClangBuiltin<"__builtin_ia32_clzero">, Intrinsic<[], [llvm_ptr_ty], []>; } @@ -4825,11 +4835,11 @@ let TargetPrefix = "x86" in { let TargetPrefix = "x86" in { // Write back and invalidate - def int_x86_wbinvd : GCCBuiltin<"__builtin_ia32_wbinvd">, + def int_x86_wbinvd : ClangBuiltin<"__builtin_ia32_wbinvd">, Intrinsic<[], [], []>; // Write back no-invalidate - def int_x86_wbnoinvd : GCCBuiltin<"__builtin_ia32_wbnoinvd">, + def int_x86_wbnoinvd : ClangBuiltin<"__builtin_ia32_wbnoinvd">, Intrinsic<[], [], []>; } @@ -4837,18 +4847,18 @@ let TargetPrefix = "x86" in { // Cache-line demote let TargetPrefix = "x86" in { - def int_x86_cldemote : GCCBuiltin<"__builtin_ia32_cldemote">, + def int_x86_cldemote : ClangBuiltin<"__builtin_ia32_cldemote">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // Wait and pause enhancements let TargetPrefix = "x86" in { - def int_x86_umonitor : GCCBuiltin<"__builtin_ia32_umonitor">, + def int_x86_umonitor : ClangBuiltin<"__builtin_ia32_umonitor">, Intrinsic<[], [llvm_ptr_ty], []>; - def int_x86_umwait : GCCBuiltin<"__builtin_ia32_umwait">, + def int_x86_umwait : ClangBuiltin<"__builtin_ia32_umwait">, Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_x86_tpause : GCCBuiltin<"__builtin_ia32_tpause">, + def int_x86_tpause : ClangBuiltin<"__builtin_ia32_tpause">, Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 
[]>; } @@ -4856,11 +4866,11 @@ let TargetPrefix = "x86" in { // Direct Move Instructions let TargetPrefix = "x86" in { - def int_x86_directstore32 : GCCBuiltin<"__builtin_ia32_directstore_u32">, + def int_x86_directstore32 : ClangBuiltin<"__builtin_ia32_directstore_u32">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], []>; - def int_x86_directstore64 : GCCBuiltin<"__builtin_ia32_directstore_u64">, + def int_x86_directstore64 : ClangBuiltin<"__builtin_ia32_directstore_u64">, Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; - def int_x86_movdir64b : GCCBuiltin<"__builtin_ia32_movdir64b">, + def int_x86_movdir64b : ClangBuiltin<"__builtin_ia32_movdir64b">, Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; } @@ -4868,9 +4878,9 @@ let TargetPrefix = "x86" in { // PTWrite - Write data to a processor trace packet let TargetPrefix = "x86" in { - def int_x86_ptwrite32 : GCCBuiltin<"__builtin_ia32_ptwrite32">, + def int_x86_ptwrite32 : ClangBuiltin<"__builtin_ia32_ptwrite32">, Intrinsic<[], [llvm_i32_ty], []>; - def int_x86_ptwrite64 : GCCBuiltin<"__builtin_ia32_ptwrite64">, + def int_x86_ptwrite64 : ClangBuiltin<"__builtin_ia32_ptwrite64">, Intrinsic<[], [llvm_i64_ty], []>; } @@ -4878,21 +4888,21 @@ let TargetPrefix = "x86" in { // INVPCID - Invalidate Process-Context Identifier let TargetPrefix = "x86" in { - def int_x86_invpcid : GCCBuiltin<"__builtin_ia32_invpcid">, + def int_x86_invpcid : ClangBuiltin<"__builtin_ia32_invpcid">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>; } let TargetPrefix = "x86" in { def int_x86_avx512bf16_cvtne2ps2bf16_128: - GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_128">, + ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_128">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtne2ps2bf16_256: - GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_256">, + ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtne2ps2bf16_512: - GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_512">, + ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_512">, Intrinsic<[llvm_v32i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty], [IntrNoMem]>; // Intrinsic must be masked due to it producing less than 128 bits of results.
@@ -4901,21 +4911,21 @@ let TargetPrefix = "x86" in { [llvm_v4f32_ty, llvm_v8i16_ty, llvm_v4i1_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_256: - GCCBuiltin<"__builtin_ia32_cvtneps2bf16_256">, + ClangBuiltin<"__builtin_ia32_cvtneps2bf16_256">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_512: - GCCBuiltin<"__builtin_ia32_cvtneps2bf16_512">, + ClangBuiltin<"__builtin_ia32_cvtneps2bf16_512">, Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_128: - GCCBuiltin<"__builtin_ia32_dpbf16ps_128">, + ClangBuiltin<"__builtin_ia32_dpbf16ps_128">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_256: - GCCBuiltin<"__builtin_ia32_dpbf16ps_256">, + ClangBuiltin<"__builtin_ia32_dpbf16ps_256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_512: - GCCBuiltin<"__builtin_ia32_dpbf16ps_512">, + ClangBuiltin<"__builtin_ia32_dpbf16ps_512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } @@ -4924,9 +4934,9 @@ let TargetPrefix = "x86" in { // ENQCMD - Enqueue Stores Instructions let TargetPrefix = "x86" in { - def int_x86_enqcmd : GCCBuiltin<"__builtin_ia32_enqcmd">, + def int_x86_enqcmd : ClangBuiltin<"__builtin_ia32_enqcmd">, Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_ptr_ty], []>; - def int_x86_enqcmds : GCCBuiltin<"__builtin_ia32_enqcmds">, + def int_x86_enqcmds : ClangBuiltin<"__builtin_ia32_enqcmds">, Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_ptr_ty], []>; } @@ -4934,7 +4944,7 @@ let TargetPrefix = "x86" in { // SERIALIZE - Serialize instruction fetch and execution let TargetPrefix = "x86" in { - def int_x86_serialize : GCCBuiltin<"__builtin_ia32_serialize">, + def int_x86_serialize : ClangBuiltin<"__builtin_ia32_serialize">, Intrinsic<[], [], []>; } @@ -4942,16 +4952,16 @@ let TargetPrefix = "x86" in { // TSXLDTRK - TSX Suspend Load Address Tracking let TargetPrefix = "x86" in { - def int_x86_xsusldtrk : GCCBuiltin<"__builtin_ia32_xsusldtrk">, + def int_x86_xsusldtrk : ClangBuiltin<"__builtin_ia32_xsusldtrk">, Intrinsic<[], [], []>; - def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">, + def int_x86_xresldtrk : ClangBuiltin<"__builtin_ia32_xresldtrk">, Intrinsic<[], [], []>; } //===----------------------------------------------------------------------===// // Key Locker let TargetPrefix = "x86" in { - def int_x86_loadiwkey : GCCBuiltin<"__builtin_ia32_loadiwkey">, + def int_x86_loadiwkey : ClangBuiltin<"__builtin_ia32_loadiwkey">, Intrinsic<[], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], []>; def int_x86_encodekey128 : @@ -5004,91 +5014,91 @@ let TargetPrefix = "x86" in { // AMX - Intel AMX extensions let TargetPrefix = "x86" in { - def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">, + def int_x86_ldtilecfg : ClangBuiltin<"__builtin_ia32_tile_loadconfig">, Intrinsic<[], [llvm_ptr_ty], []>; - def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">, + def int_x86_sttilecfg : ClangBuiltin<"__builtin_ia32_tile_storeconfig">, Intrinsic<[], [llvm_ptr_ty], []>; - def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">, + def int_x86_tilerelease : ClangBuiltin<"__builtin_ia32_tilerelease">, Intrinsic<[], [], []>; - def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">, + def int_x86_tilezero : ClangBuiltin<"__builtin_ia32_tilezero">, Intrinsic<[], 
[llvm_i8_ty], [ImmArg>]>; - def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">, + def int_x86_tileloadd64 : ClangBuiltin<"__builtin_ia32_tileloadd64">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg>]>; - def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">, + def int_x86_tileloaddt164 : ClangBuiltin<"__builtin_ia32_tileloaddt164">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg>]>; - def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">, + def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg>]>; - def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">, + def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; - def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">, + def int_x86_tdpbsud : ClangBuiltin<"__builtin_ia32_tdpbsud">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; - def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">, + def int_x86_tdpbusd : ClangBuiltin<"__builtin_ia32_tdpbusd">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; - def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">, + def int_x86_tdpbuud : ClangBuiltin<"__builtin_ia32_tdpbuud">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; - def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">, + def int_x86_tdpbf16ps : ClangBuiltin<"__builtin_ia32_tdpbf16ps">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; // AMX - internal intrinsics def int_x86_ldtilecfg_internal : - GCCBuiltin<"__builtin_ia32_tile_loadconfig_internal">, + ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_tileloadd64_internal : - GCCBuiltin<"__builtin_ia32_tileloadd64_internal">, + ClangBuiltin<"__builtin_ia32_tileloadd64_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; def int_x86_tileloaddt164_internal : - GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">, + ClangBuiltin<"__builtin_ia32_tileloaddt164_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; def int_x86_tdpbssd_internal : - GCCBuiltin<"__builtin_ia32_tdpbssd_internal">, + ClangBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbsud_internal : - GCCBuiltin<"__builtin_ia32_tdpbsud_internal">, + ClangBuiltin<"__builtin_ia32_tdpbsud_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbusd_internal : - GCCBuiltin<"__builtin_ia32_tdpbusd_internal">, + ClangBuiltin<"__builtin_ia32_tdpbusd_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbuud_internal : - GCCBuiltin<"__builtin_ia32_tdpbuud_internal">, + ClangBuiltin<"__builtin_ia32_tdpbuud_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tilestored64_internal : - GCCBuiltin<"__builtin_ia32_tilestored64_internal">, + ClangBuiltin<"__builtin_ia32_tilestored64_internal">, 
Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty, llvm_x86amx_ty], []>; def int_x86_tilezero_internal : - GCCBuiltin<"__builtin_ia32_tilezero_internal">, + ClangBuiltin<"__builtin_ia32_tilezero_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty], []>; def int_x86_tdpbf16ps_internal : - GCCBuiltin<"__builtin_ia32_tdpbf16ps_internal">, + ClangBuiltin<"__builtin_ia32_tdpbf16ps_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, @@ -5103,13 +5113,13 @@ let TargetPrefix = "x86" in { // UINTR - User Level Interrupt let TargetPrefix = "x86" in { - def int_x86_clui : GCCBuiltin<"__builtin_ia32_clui">, + def int_x86_clui : ClangBuiltin<"__builtin_ia32_clui">, Intrinsic<[], [], []>; - def int_x86_stui : GCCBuiltin<"__builtin_ia32_stui">, + def int_x86_stui : ClangBuiltin<"__builtin_ia32_stui">, Intrinsic<[], [], []>; - def int_x86_testui : GCCBuiltin<"__builtin_ia32_testui">, + def int_x86_testui : ClangBuiltin<"__builtin_ia32_testui">, Intrinsic<[llvm_i8_ty], [], []>; - def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">, + def int_x86_senduipi : ClangBuiltin<"__builtin_ia32_senduipi">, Intrinsic<[], [llvm_i64_ty], []>; } @@ -5117,48 +5127,48 @@ let TargetPrefix = "x86" in { // avx512_fp16: vaddph let TargetPrefix = "x86" in { def int_x86_avx512fp16_add_ph_512 - : GCCBuiltin<"__builtin_ia32_addph512">, + : ClangBuiltin<"__builtin_ia32_addph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_sub_ph_512 - : GCCBuiltin<"__builtin_ia32_subph512">, + : ClangBuiltin<"__builtin_ia32_subph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mul_ph_512 - : GCCBuiltin<"__builtin_ia32_mulph512">, + : ClangBuiltin<"__builtin_ia32_mulph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_div_ph_512 - : GCCBuiltin<"__builtin_ia32_divph512">, + : ClangBuiltin<"__builtin_ia32_divph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_max_ph_128 - : GCCBuiltin<"__builtin_ia32_maxph128">, + : ClangBuiltin<"__builtin_ia32_maxph128">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_max_ph_256 - : GCCBuiltin<"__builtin_ia32_maxph256">, + : ClangBuiltin<"__builtin_ia32_maxph256">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_max_ph_512 - : GCCBuiltin<"__builtin_ia32_maxph512">, + : ClangBuiltin<"__builtin_ia32_maxph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_min_ph_128 - : GCCBuiltin<"__builtin_ia32_minph128">, + : ClangBuiltin<"__builtin_ia32_minph128">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_min_ph_256 - : GCCBuiltin<"__builtin_ia32_minph256">, + : ClangBuiltin<"__builtin_ia32_minph256">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_min_ph_512 - : GCCBuiltin<"__builtin_ia32_minph512">, + : ClangBuiltin<"__builtin_ia32_minph512">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; @@ -5178,367 +5188,367 @@ let TargetPrefix = 
"x86" in { [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_add_sh_round - : GCCBuiltin<"__builtin_ia32_addsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_addsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_sub_sh_round - : GCCBuiltin<"__builtin_ia32_subsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_subsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_mul_sh_round - : GCCBuiltin<"__builtin_ia32_mulsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_mulsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_div_sh_round - : GCCBuiltin<"__builtin_ia32_divsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_divsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_min_sh_round - : GCCBuiltin<"__builtin_ia32_minsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_minsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_max_sh_round - : GCCBuiltin<"__builtin_ia32_maxsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_maxsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_cmp_sh - : GCCBuiltin<"__builtin_ia32_cmpsh_mask">, + : ClangBuiltin<"__builtin_ia32_cmpsh_mask">, Intrinsic<[ llvm_i8_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_vcomi_sh - : GCCBuiltin<"__builtin_ia32_vcomish">, + : ClangBuiltin<"__builtin_ia32_vcomish">, Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2psx_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2psx128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2psx128_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2psx_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2psx256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2psx256_mask">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f16_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2psx_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2psx512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2psx512_mask">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtps2phx_128 - : GCCBuiltin<"__builtin_ia32_vcvtps2phx128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtps2phx128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtps2phx_256 - : GCCBuiltin<"__builtin_ia32_vcvtps2phx256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtps2phx256_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtps2phx_512 - : GCCBuiltin<"__builtin_ia32_vcvtps2phx512_mask">, + : 
ClangBuiltin<"__builtin_ia32_vcvtps2phx512_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f32_ty, llvm_v16f16_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtpd2ph_128 - : GCCBuiltin<"__builtin_ia32_vcvtpd2ph128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtpd2ph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtpd2ph_256 - : GCCBuiltin<"__builtin_ia32_vcvtpd2ph256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtpd2ph256_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtpd2ph_512 - : GCCBuiltin<"__builtin_ia32_vcvtpd2ph512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtpd2ph512_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f64_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2pd_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2pd128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2pd128_mask">, Intrinsic<[ llvm_v2f64_ty ], [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2pd_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2pd256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2pd256_mask">, Intrinsic<[ llvm_v4f64_ty ], [ llvm_v8f16_ty, llvm_v4f64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2pd_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2pd512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2pd512_mask">, Intrinsic<[ llvm_v8f64_ty ], [ llvm_v8f16_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtsh2ss_round - : GCCBuiltin<"__builtin_ia32_vcvtsh2ss_round_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2ss_round_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtss2sh_round - : GCCBuiltin<"__builtin_ia32_vcvtss2sh_round_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtss2sh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtsd2sh_round - : GCCBuiltin<"__builtin_ia32_vcvtsd2sh_round_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtsd2sh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtsh2sd_round - : GCCBuiltin<"__builtin_ia32_vcvtsh2sd_round_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2sd_round_mask">, Intrinsic<[ llvm_v2f64_ty ], [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2w_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2w128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2w128_mask">, Intrinsic<[ llvm_v8i16_ty ], [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2w_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2w256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2w256_mask">, Intrinsic<[ llvm_v16i16_ty ], [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2w_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2w512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2w512_mask">, Intrinsic<[ llvm_v32i16_ty ], [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def 
int_x86_avx512fp16_mask_vcvttph2w_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2w128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2w128_mask">, Intrinsic<[ llvm_v8i16_ty ], [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2w_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2w256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2w256_mask">, Intrinsic<[ llvm_v16i16_ty ], [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2w_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2w512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2w512_mask">, Intrinsic<[ llvm_v32i16_ty ], [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2uw_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2uw128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uw128_mask">, Intrinsic<[ llvm_v8i16_ty ], [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2uw_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2uw256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uw256_mask">, Intrinsic<[ llvm_v16i16_ty ], [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2uw_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2uw512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uw512_mask">, Intrinsic<[ llvm_v32i16_ty ], [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvttph2uw_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2uw128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uw128_mask">, Intrinsic<[ llvm_v8i16_ty ], [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2uw_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2uw256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uw256_mask">, Intrinsic<[ llvm_v16i16_ty ], [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2uw_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2uw512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uw512_mask">, Intrinsic<[ llvm_v32i16_ty ], [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2dq_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2dq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2dq128_mask">, Intrinsic<[ llvm_v4i32_ty ], [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2dq_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2dq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2dq256_mask">, Intrinsic<[ llvm_v8i32_ty ], [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2dq_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2dq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2dq512_mask">, Intrinsic<[ llvm_v16i32_ty ], [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtph2udq_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2udq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2udq128_mask">, Intrinsic<[ llvm_v4i32_ty ], [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2udq_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2udq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2udq256_mask">, Intrinsic<[ llvm_v8i32_ty ], [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2udq_512 - : 
GCCBuiltin<"__builtin_ia32_vcvtph2udq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2udq512_mask">, Intrinsic<[ llvm_v16i32_ty ], [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtdq2ph_128 - : GCCBuiltin<"__builtin_ia32_vcvtdq2ph128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtdq2ph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtudq2ph_128 - : GCCBuiltin<"__builtin_ia32_vcvtudq2ph128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtudq2ph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2dq_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2dq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2dq128_mask">, Intrinsic<[ llvm_v4i32_ty ], [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2dq_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2dq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2dq256_mask">, Intrinsic<[ llvm_v8i32_ty ], [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2dq_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2dq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2dq512_mask">, Intrinsic<[ llvm_v16i32_ty ], [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvttph2udq_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2udq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2udq128_mask">, Intrinsic<[ llvm_v4i32_ty ], [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2udq_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2udq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2udq256_mask">, Intrinsic<[ llvm_v8i32_ty ], [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2udq_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2udq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2udq512_mask">, Intrinsic<[ llvm_v16i32_ty ], [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtqq2ph_128 - : GCCBuiltin<"__builtin_ia32_vcvtqq2ph128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtqq2ph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtqq2ph_256 - : GCCBuiltin<"__builtin_ia32_vcvtqq2ph256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtqq2ph256_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2qq_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2qq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2qq128_mask">, Intrinsic<[ llvm_v2i64_ty ], [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2qq_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2qq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2qq256_mask">, Intrinsic<[ llvm_v4i64_ty ], [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2qq_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2qq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2qq512_mask">, Intrinsic<[ llvm_v8i64_ty ], [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvtuqq2ph_128 - : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph128_mask">, + : 
ClangBuiltin<"__builtin_ia32_vcvtuqq2ph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtuqq2ph_256 - : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtuqq2ph256_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2uqq_128 - : GCCBuiltin<"__builtin_ia32_vcvtph2uqq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uqq128_mask">, Intrinsic<[ llvm_v2i64_ty ], [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2uqq_256 - : GCCBuiltin<"__builtin_ia32_vcvtph2uqq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uqq256_mask">, Intrinsic<[ llvm_v4i64_ty ], [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvtph2uqq_512 - : GCCBuiltin<"__builtin_ia32_vcvtph2uqq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvtph2uqq512_mask">, Intrinsic<[ llvm_v8i64_ty ], [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvttph2qq_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2qq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2qq128_mask">, Intrinsic<[ llvm_v2i64_ty ], [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2qq_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2qq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2qq256_mask">, Intrinsic<[ llvm_v4i64_ty ], [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2qq_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2qq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2qq512_mask">, Intrinsic<[ llvm_v8i64_ty ], [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vcvttph2uqq_128 - : GCCBuiltin<"__builtin_ia32_vcvttph2uqq128_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uqq128_mask">, Intrinsic<[ llvm_v2i64_ty ], [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2uqq_256 - : GCCBuiltin<"__builtin_ia32_vcvttph2uqq256_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uqq256_mask">, Intrinsic<[ llvm_v4i64_ty ], [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vcvttph2uqq_512 - : GCCBuiltin<"__builtin_ia32_vcvttph2uqq512_mask">, + : ClangBuiltin<"__builtin_ia32_vcvttph2uqq512_mask">, Intrinsic<[ llvm_v8i64_ty ], [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsh2si32 - : GCCBuiltin<"__builtin_ia32_vcvtsh2si32">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2si32">, Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsh2usi32 - : GCCBuiltin<"__builtin_ia32_vcvtsh2usi32">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2usi32">, Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsh2si64 - : GCCBuiltin<"__builtin_ia32_vcvtsh2si64">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2si64">, Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsh2usi64 - : GCCBuiltin<"__builtin_ia32_vcvtsh2usi64">, + : ClangBuiltin<"__builtin_ia32_vcvtsh2usi64">, Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtusi2sh - : 
GCCBuiltin<"__builtin_ia32_vcvtusi2sh">, + : ClangBuiltin<"__builtin_ia32_vcvtusi2sh">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtusi642sh - : GCCBuiltin<"__builtin_ia32_vcvtusi642sh">, + : ClangBuiltin<"__builtin_ia32_vcvtusi642sh">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsi2sh - : GCCBuiltin<"__builtin_ia32_vcvtsi2sh">, + : ClangBuiltin<"__builtin_ia32_vcvtsi2sh">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvtsi642sh - : GCCBuiltin<"__builtin_ia32_vcvtsi642sh">, + : ClangBuiltin<"__builtin_ia32_vcvtsi642sh">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvttsh2si32 - : GCCBuiltin<"__builtin_ia32_vcvttsh2si32">, + : ClangBuiltin<"__builtin_ia32_vcvttsh2si32">, Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvttsh2si64 - : GCCBuiltin<"__builtin_ia32_vcvttsh2si64">, + : ClangBuiltin<"__builtin_ia32_vcvttsh2si64">, Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvttsh2usi32 - : GCCBuiltin<"__builtin_ia32_vcvttsh2usi32">, + : ClangBuiltin<"__builtin_ia32_vcvttsh2usi32">, Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vcvttsh2usi64 - : GCCBuiltin<"__builtin_ia32_vcvttsh2usi64">, + : ClangBuiltin<"__builtin_ia32_vcvttsh2usi64">, Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; @@ -5551,61 +5561,61 @@ let TargetPrefix = "x86" in { llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_rsqrt_ph_128 - : GCCBuiltin<"__builtin_ia32_rsqrtph128_mask">, + : ClangBuiltin<"__builtin_ia32_rsqrtph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rsqrt_ph_256 - : GCCBuiltin<"__builtin_ia32_rsqrtph256_mask">, + : ClangBuiltin<"__builtin_ia32_rsqrtph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rsqrt_ph_512 - : GCCBuiltin<"__builtin_ia32_rsqrtph512_mask">, + : ClangBuiltin<"__builtin_ia32_rsqrtph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rsqrt_sh - : GCCBuiltin<"__builtin_ia32_rsqrtsh_mask">, + : ClangBuiltin<"__builtin_ia32_rsqrtsh_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rcp_ph_128 - : GCCBuiltin<"__builtin_ia32_rcpph128_mask">, + : ClangBuiltin<"__builtin_ia32_rcpph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rcp_ph_256 - : GCCBuiltin<"__builtin_ia32_rcpph256_mask">, + : ClangBuiltin<"__builtin_ia32_rcpph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rcp_ph_512 - : GCCBuiltin<"__builtin_ia32_rcpph512_mask">, + : ClangBuiltin<"__builtin_ia32_rcpph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_rcp_sh - : GCCBuiltin<"__builtin_ia32_rcpsh_mask">, + 
: ClangBuiltin<"__builtin_ia32_rcpsh_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_reduce_ph_128 - : GCCBuiltin<"__builtin_ia32_reduceph128_mask">, + : ClangBuiltin<"__builtin_ia32_reduceph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_reduce_ph_256 - : GCCBuiltin<"__builtin_ia32_reduceph256_mask">, + : ClangBuiltin<"__builtin_ia32_reduceph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_reduce_ph_512 - : GCCBuiltin<"__builtin_ia32_reduceph512_mask">, + : ClangBuiltin<"__builtin_ia32_reduceph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_reduce_sh - : GCCBuiltin<"__builtin_ia32_reducesh_mask">, + : ClangBuiltin<"__builtin_ia32_reducesh_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty ], @@ -5620,91 +5630,91 @@ let TargetPrefix = "x86" in { : Intrinsic<[ llvm_v32i1_ty ], [ llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_fpclass_sh - : GCCBuiltin<"__builtin_ia32_fpclasssh_mask">, + : ClangBuiltin<"__builtin_ia32_fpclasssh_mask">, Intrinsic<[ llvm_i8_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_getexp_ph_128 - : GCCBuiltin<"__builtin_ia32_getexpph128_mask">, + : ClangBuiltin<"__builtin_ia32_getexpph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_getexp_ph_256 - : GCCBuiltin<"__builtin_ia32_getexpph256_mask">, + : ClangBuiltin<"__builtin_ia32_getexpph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_getexp_ph_512 - : GCCBuiltin<"__builtin_ia32_getexpph512_mask">, + : ClangBuiltin<"__builtin_ia32_getexpph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_getexp_sh - : GCCBuiltin<"__builtin_ia32_getexpsh128_round_mask">, + : ClangBuiltin<"__builtin_ia32_getexpsh128_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_getmant_ph_128 - : GCCBuiltin<"__builtin_ia32_getmantph128_mask">, + : ClangBuiltin<"__builtin_ia32_getmantph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_getmant_ph_256 - : GCCBuiltin<"__builtin_ia32_getmantph256_mask">, + : ClangBuiltin<"__builtin_ia32_getmantph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_getmant_ph_512 - : GCCBuiltin<"__builtin_ia32_getmantph512_mask">, + : ClangBuiltin<"__builtin_ia32_getmantph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_getmant_sh - : GCCBuiltin<"__builtin_ia32_getmantsh_round_mask">, + : 
ClangBuiltin<"__builtin_ia32_getmantsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_rndscale_ph_128 - : GCCBuiltin<"__builtin_ia32_rndscaleph_128_mask">, + : ClangBuiltin<"__builtin_ia32_rndscaleph_128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_rndscale_ph_256 - : GCCBuiltin<"__builtin_ia32_rndscaleph_256_mask">, + : ClangBuiltin<"__builtin_ia32_rndscaleph_256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_rndscale_ph_512 - : GCCBuiltin<"__builtin_ia32_rndscaleph_mask">, + : ClangBuiltin<"__builtin_ia32_rndscaleph_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_rndscale_sh - : GCCBuiltin<"__builtin_ia32_rndscalesh_round_mask">, + : ClangBuiltin<"__builtin_ia32_rndscalesh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; def int_x86_avx512fp16_mask_scalef_ph_128 - : GCCBuiltin<"__builtin_ia32_scalefph128_mask">, + : ClangBuiltin<"__builtin_ia32_scalefph128_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_scalef_ph_256 - : GCCBuiltin<"__builtin_ia32_scalefph256_mask">, + : ClangBuiltin<"__builtin_ia32_scalefph256_mask">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_scalef_ph_512 - : GCCBuiltin<"__builtin_ia32_scalefph512_mask">, + : ClangBuiltin<"__builtin_ia32_scalefph512_mask">, Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_scalef_sh - : GCCBuiltin<"__builtin_ia32_scalefsh_round_mask">, + : ClangBuiltin<"__builtin_ia32_scalefsh_round_mask">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], @@ -5715,12 +5725,12 @@ let TargetPrefix = "x86" in { [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_vfmaddsub_ph_128 - : GCCBuiltin<"__builtin_ia32_vfmaddsubph">, + : ClangBuiltin<"__builtin_ia32_vfmaddsubph">, Intrinsic<[ llvm_v8f16_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_vfmaddsub_ph_256 - : GCCBuiltin<"__builtin_ia32_vfmaddsubph256">, + : ClangBuiltin<"__builtin_ia32_vfmaddsubph256">, Intrinsic<[ llvm_v16f16_ty ], [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; @@ -5734,133 +5744,133 @@ let TargetPrefix = "x86" in { [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfcmadd_cph_128 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph128_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_maskz_vfcmadd_cph_128 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_maskz">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph128_maskz">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, 
llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfcmadd_cph_256 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph256_mask">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_maskz_vfcmadd_cph_256 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_maskz">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph256_maskz">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfcmadd_cph_512 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask3">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph512_mask3">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_maskz_vfcmadd_cph_512 - : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_maskz">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcph512_maskz">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfmadd_cph_128 - : GCCBuiltin<"__builtin_ia32_vfmaddcph128_mask">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph128_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_maskz_vfmadd_cph_128 - : GCCBuiltin<"__builtin_ia32_vfmaddcph128_maskz">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph128_maskz">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfmadd_cph_256 - : GCCBuiltin<"__builtin_ia32_vfmaddcph256_mask">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph256_mask">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_maskz_vfmadd_cph_256 - : GCCBuiltin<"__builtin_ia32_vfmaddcph256_maskz">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph256_maskz">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfmadd_cph_512 - : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask3">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph512_mask3">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_maskz_vfmadd_cph_512 - : GCCBuiltin<"__builtin_ia32_vfmaddcph512_maskz">, + : ClangBuiltin<"__builtin_ia32_vfmaddcph512_maskz">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfmadd_csh - : GCCBuiltin<"__builtin_ia32_vfmaddcsh_mask">, + : ClangBuiltin<"__builtin_ia32_vfmaddcsh_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_maskz_vfmadd_csh - : GCCBuiltin<"__builtin_ia32_vfmaddcsh_maskz">, + : ClangBuiltin<"__builtin_ia32_vfmaddcsh_maskz">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfcmadd_csh - : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcsh_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def 
int_x86_avx512fp16_maskz_vfcmadd_csh - : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_maskz">, + : ClangBuiltin<"__builtin_ia32_vfcmaddcsh_maskz">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfmul_cph_128 - : GCCBuiltin<"__builtin_ia32_vfmulcph128_mask">, + : ClangBuiltin<"__builtin_ia32_vfmulcph128_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfcmul_cph_128 - : GCCBuiltin<"__builtin_ia32_vfcmulcph128_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmulcph128_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfmul_cph_256 - : GCCBuiltin<"__builtin_ia32_vfmulcph256_mask">, + : ClangBuiltin<"__builtin_ia32_vfmulcph256_mask">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfcmul_cph_256 - : GCCBuiltin<"__builtin_ia32_vfcmulcph256_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmulcph256_mask">, Intrinsic<[ llvm_v8f32_ty ], [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; def int_x86_avx512fp16_mask_vfmul_cph_512 - : GCCBuiltin<"__builtin_ia32_vfmulcph512_mask">, + : ClangBuiltin<"__builtin_ia32_vfmulcph512_mask">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfcmul_cph_512 - : GCCBuiltin<"__builtin_ia32_vfcmulcph512_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmulcph512_mask">, Intrinsic<[ llvm_v16f32_ty ], [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfmul_csh - : GCCBuiltin<"__builtin_ia32_vfmulcsh_mask">, + : ClangBuiltin<"__builtin_ia32_vfmulcsh_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; def int_x86_avx512fp16_mask_vfcmul_csh - : GCCBuiltin<"__builtin_ia32_vfcmulcsh_mask">, + : ClangBuiltin<"__builtin_ia32_vfcmulcsh_mask">, Intrinsic<[ llvm_v4f32_ty ], [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty ], diff --git a/llvm/include/llvm/IR/IntrinsicsXCore.td b/llvm/include/llvm/IR/IntrinsicsXCore.td index 89dbc65fea44..d2afc3497833 100644 --- a/llvm/include/llvm/IR/IntrinsicsXCore.td +++ b/llvm/include/llvm/IR/IntrinsicsXCore.td @@ -13,7 +13,7 @@ let TargetPrefix = "xcore" in { // All intrinsics start with "llvm.xcore.". // Miscellaneous instructions. def int_xcore_bitrev : Intrinsic<[llvm_i32_ty],[llvm_i32_ty],[IntrNoMem]>, - GCCBuiltin<"__builtin_bitrev">; + ClangBuiltin<"__builtin_bitrev">; def int_xcore_crc8 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty], [IntrNoMem]>; @@ -25,11 +25,11 @@ let TargetPrefix = "xcore" in { // All intrinsics start with "llvm.xcore.". 
def int_xcore_zext : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_xcore_getid : Intrinsic<[llvm_i32_ty],[],[IntrNoMem]>, - GCCBuiltin<"__builtin_getid">; + ClangBuiltin<"__builtin_getid">; def int_xcore_getps : Intrinsic<[llvm_i32_ty],[llvm_i32_ty]>, - GCCBuiltin<"__builtin_getps">; + ClangBuiltin<"__builtin_getps">; def int_xcore_setps : Intrinsic<[],[llvm_i32_ty, llvm_i32_ty]>, - GCCBuiltin<"__builtin_setps">; + ClangBuiltin<"__builtin_setps">; def int_xcore_geted : Intrinsic<[llvm_i32_ty],[]>; def int_xcore_getet : Intrinsic<[llvm_i32_ty],[]>; def int_xcore_setsr : Intrinsic<[],[llvm_i32_ty]>; diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index 446bcecf1c64..91712df153a0 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -24,6 +24,7 @@ namespace llvm { +class Any; class DiagnosticInfo; enum DiagnosticSeverity : char; class Function; @@ -93,6 +94,7 @@ public: OB_preallocated = 4, // "preallocated" OB_gc_live = 5, // "gc-live" OB_clang_arc_attachedcall = 6, // "clang.arc.attachedcall" + OB_ptrauth = 7, // "ptrauth" }; /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. @@ -201,6 +203,11 @@ public: /// diagnostics. void setDiagnosticsHotnessRequested(bool Requested); + bool getMisExpectWarningRequested() const; + void setMisExpectWarningRequested(bool Requested); + void setDiagnosticsMisExpectTolerance(Optional<uint64_t> Tolerance); + uint64_t getDiagnosticsMisExpectTolerance() const; + /// Return the minimum hotness value a diagnostic would need in order /// to be included in optimization diagnostics. /// @@ -304,13 +311,22 @@ public: /// LLVMContext is used by compilation. void setOptPassGate(OptPassGate&); - /// Enable opaque pointers. Can only be called before creating the first - /// pointer type. - void enableOpaquePointers() const; + /// Whether we've decided on using opaque pointers or typed pointers yet. + bool hasSetOpaquePointersValue() const; + + /// Set whether opaque pointers are enabled. The method may be called multiple + /// times, but only with the same value. Note that creating a pointer type or + /// otherwise querying the opaque pointer mode performs an implicit set to + /// the default value. + void setOpaquePointers(bool Enable) const; /// Whether typed pointers are supported. If false, all pointers are opaque. bool supportsTypedPointers() const; + /// Optionally target-specific data can be attached to the context for lifetime + /// management and bypassing layering restrictions. + llvm::Any &getTargetData() const; + private: // Module needs access to the add/removeModule methods. friend class Module; diff --git a/llvm/include/llvm/IR/LegacyPassManagers.h b/llvm/include/llvm/IR/LegacyPassManagers.h index 311a407f1a19..41c11d26aa45 100644 --- a/llvm/include/llvm/IR/LegacyPassManagers.h +++ b/llvm/include/llvm/IR/LegacyPassManagers.h @@ -294,9 +294,7 @@ private: /// used by pass managers. class PMDataManager { public: - explicit PMDataManager() : TPM(nullptr), Depth(0) { - initializeAnalysisInfo(); - } + explicit PMDataManager() { initializeAnalysisInfo(); } virtual ~PMDataManager(); @@ -418,7 +416,7 @@ public: protected: // Top level manager. - PMTopLevelManager *TPM; + PMTopLevelManager *TPM = nullptr; // Collection of pass that are managed by this manager SmallVector PassVector; @@ -446,7 +444,7 @@ private: // this manager.
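The LLVMContext.h hunk above replaces the one-shot enableOpaquePointers() with an idempotent setOpaquePointers(bool). A minimal caller-side sketch under that API (the helper name and the guard are assumptions, not code from this patch):

    #include "llvm/IR/LLVMContext.h"

    // Opt into opaque pointers before any pointer type is created; creating
    // a pointer type or querying the mode first would lock in the default.
    void enableOpaquePtrMode(llvm::LLVMContext &Ctx) {
      if (!Ctx.hasSetOpaquePointersValue())
        Ctx.setOpaquePointers(true); // may repeat, but only with 'true' now
    }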
SmallVector HigherLevelAnalysis; - unsigned Depth; + unsigned Depth = 0; }; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h index 42829388b79a..21d7b8b6da71 100644 --- a/llvm/include/llvm/IR/MDBuilder.h +++ b/llvm/include/llvm/IR/MDBuilder.h @@ -108,6 +108,10 @@ public: /// Merge the new callback encoding \p NewCB into \p ExistingCallbacks. MDNode *mergeCallbackEncodings(MDNode *ExistingCallbacks, MDNode *NewCB); + /// Return metadata feeding to the CodeGen about how to generate a function + /// prologue for the "function" sanitizer. + MDNode *createRTTIPointerPrologue(Constant *PrologueSig, Constant *RTTI); + //===------------------------------------------------------------------===// // AA metadata. //===------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index 4c8286692ebf..dbf2cfb7c5e9 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -30,8 +30,8 @@ class Function; class Twine; class Module; -template <class IRBuilderTy> class MatrixBuilder { - IRBuilderTy &B; +class MatrixBuilder { + IRBuilderBase &B; Module *getModule() { return B.GetInsertBlock()->getParent()->getParent(); } std::pair<Value *, Value *> splatScalarOperandIfNeeded(Value *LHS, @@ -55,21 +55,17 @@ template <class IRBuilderTy> class MatrixBuilder { } public: - MatrixBuilder(IRBuilderTy &Builder) : B(Builder) {} + MatrixBuilder(IRBuilderBase &Builder) : B(Builder) {} /// Create a column major, strided matrix load. + /// \p EltTy - Matrix element type /// \p DataPtr - Start address of the matrix read /// \p Rows - Number of rows in matrix (must be a constant) /// \p Columns - Number of columns in matrix (must be a constant) /// \p Stride - Space between columns - CallInst *CreateColumnMajorLoad(Value *DataPtr, Align Alignment, + CallInst *CreateColumnMajorLoad(Type *EltTy, Value *DataPtr, Align Alignment, Value *Stride, bool IsVolatile, unsigned Rows, unsigned Columns, const Twine &Name = "") { - - // Deal with the pointer - PointerType *PtrTy = cast<PointerType>(DataPtr->getType()); - Type *EltTy = PtrTy->getPointerElementType(); - auto *RetType = FixedVectorType::get(EltTy, Rows * Columns); Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows), @@ -234,12 +230,11 @@ public: /// Create an assumption that \p Idx is less than \p NumElements. void CreateIndexAssumption(Value *Idx, unsigned NumElements, Twine const &Name = "") { - Value *NumElts = B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements); auto *Cmp = B.CreateICmpULT(Idx, NumElts); - if (auto *ConstCond = dyn_cast<ConstantInt>(Cmp)) - assert(ConstCond->isOne() && "Index must be valid!"); + if (isa<ConstantInt>(Cmp)) + assert(cast<ConstantInt>(Cmp)->isOne() && "Index must be valid!"); else B.CreateAssumption(Cmp); } @@ -248,7 +243,6 @@ public: /// a matrix with \p NumRows embedded in a vector.
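Since MatrixBuilder can no longer read the element type off the pointer operand, callers of CreateColumnMajorLoad shown above now supply it explicitly. A hedged sketch of the migrated call, with assumed names (DataPtr, Stride) and an assumed float element type:

    #include "llvm/IR/MatrixBuilder.h"

    // Sketch: load a 3x3 float matrix; EltTy is now an explicit argument
    // instead of DataPtr->getType()->getPointerElementType().
    llvm::CallInst *loadMat3x3(llvm::IRBuilderBase &Builder,
                               llvm::Value *DataPtr, llvm::Value *Stride) {
      llvm::MatrixBuilder MB(Builder); // a plain class after this change
      return MB.CreateColumnMajorLoad(
          llvm::Type::getFloatTy(Builder.getContext()), DataPtr,
          llvm::Align(16), Stride, /*IsVolatile=*/false, /*Rows=*/3,
          /*Columns=*/3, "mat");
    }

The CreateIndex helper documented just above continues below.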
Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows, Twine const &Name = "") { - unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(), ColumnIdx->getType()->getScalarSizeInBits()); Type *IntTy = IntegerType::get(RowIdx->getType()->getContext(), MaxWidth); diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 7965884990e5..be359d94f812 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -169,7 +169,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Metadata &MD) { /// Metadata wrapper in the Value hierarchy. /// /// A member of the \a Value hierarchy to represent a reference to metadata. -/// This allows, e.g., instrinsics to have metadata as operands. +/// This allows, e.g., intrinsics to have metadata as operands. /// /// Notably, this is the only thing in either hierarchy that is allowed to /// reference \a LocalAsMetadata. @@ -302,7 +302,8 @@ public: /// /// Replace all uses of this with \c MD, which is allowed to be null. void replaceAllUsesWith(Metadata *MD); - + /// Replace all uses of the constant with Undef in debug info metadata. + static void SalvageDebugInfo(const Constant &C); /// Returns the list of all DIArgList users of this. SmallVector getAllArgListUsers(); @@ -774,10 +775,21 @@ class MDOperand { public: MDOperand() = default; - MDOperand(MDOperand &&) = delete; MDOperand(const MDOperand &) = delete; - MDOperand &operator=(MDOperand &&) = delete; + MDOperand(MDOperand &&Op) { + MD = Op.MD; + if (MD) + (void)MetadataTracking::retrack(Op.MD, MD); + Op.MD = nullptr; + } MDOperand &operator=(const MDOperand &) = delete; + MDOperand &operator=(MDOperand &&Op) { + MD = Op.MD; + if (MD) + (void)MetadataTracking::retrack(Op.MD, MD); + Op.MD = nullptr; + return *this; + } ~MDOperand() { untrack(); } Metadata *get() const { return MD; } @@ -922,13 +934,109 @@ struct TempMDNodeDeleter { /// If an unresolved node is part of a cycle, \a resolveCycles() needs /// to be called on some member of the cycle once all temporary nodes have been /// replaced. +/// +/// MDNodes can be large or small, as well as resizable or non-resizable. +/// Large MDNodes' operands are allocated in a separate storage vector, +/// whereas small MDNodes' operands are co-allocated. Distinct and temporary +/// MDNodes are resizable, but only MDTuples support this capability. +/// +/// Clients can add operands to resizable MDNodes using push_back(). class MDNode : public Metadata { friend class ReplaceableMetadataImpl; friend class LLVMContextImpl; friend class DIArgList; - unsigned NumOperands; - unsigned NumUnresolved; + /// The header that is co-allocated with an MDNode along with its "small" + /// operands. It is located immediately before the main body of the node. + /// The operands are in turn located immediately before the header. + /// For resizable MDNodes, the space for the storage vector is also allocated + /// immediately before the header, overlapping with the operands.
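The co-allocation scheme described in the comment above can be pictured with a deliberately simplified model (illustrative only; the real Header below additionally packs bitfields, respects alignment, and switches to a heap-backed vector in large mode):

    // Toy model, not LLVM code: the header sits immediately before the
    // node, and the operands sit immediately before the header, so both
    // are reachable from 'this' by pointer arithmetic alone.
    struct ToyHeader { unsigned NumOps; };
    struct ToyNode {
      ToyHeader &header() {
        return *(reinterpret_cast<ToyHeader *>(this) - 1);
      }
      void **operands() {
        ToyHeader &H = header();
        return reinterpret_cast<void **>(&H) - H.NumOps;
      }
    };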
+ struct Header { + bool IsResizable : 1; + bool IsLarge : 1; + size_t SmallSize : 4; + size_t SmallNumOps : 4; + size_t : sizeof(size_t) * CHAR_BIT - 10; + + unsigned NumUnresolved = 0; + using LargeStorageVector = SmallVector<MDOperand, 0>; + + static constexpr size_t NumOpsFitInVector = + sizeof(LargeStorageVector) / sizeof(MDOperand); + static_assert( + NumOpsFitInVector * sizeof(MDOperand) == sizeof(LargeStorageVector), + "sizeof(LargeStorageVector) must be a multiple of sizeof(MDOperand)"); + + static constexpr size_t MaxSmallSize = 15; + + static constexpr size_t getOpSize(unsigned NumOps) { + return sizeof(MDOperand) * NumOps; + } + /// Returns the number of operands the node has space for based on its + /// allocation characteristics. + static size_t getSmallSize(size_t NumOps, bool IsResizable, bool IsLarge) { + return IsLarge ? NumOpsFitInVector + : std::max(NumOps, NumOpsFitInVector * IsResizable); + } + /// Returns the number of bytes allocated for operands and header. + static size_t getAllocSize(StorageType Storage, size_t NumOps) { + return getOpSize( + getSmallSize(NumOps, isResizable(Storage), isLarge(NumOps))) + + sizeof(Header); + } + + /// Only temporary and distinct nodes are resizable. + static bool isResizable(StorageType Storage) { return Storage != Uniqued; } + static bool isLarge(size_t NumOps) { return NumOps > MaxSmallSize; } + + size_t getAllocSize() const { + return getOpSize(SmallSize) + sizeof(Header); + } + void *getAllocation() { + return reinterpret_cast<char *>(this + 1) - + alignTo(getAllocSize(), alignof(uint64_t)); + } + + void *getLargePtr() const; + void *getSmallPtr(); + + LargeStorageVector &getLarge() { + assert(IsLarge); + return *reinterpret_cast<LargeStorageVector *>(getLargePtr()); + } + + const LargeStorageVector &getLarge() const { + assert(IsLarge); + return *reinterpret_cast<const LargeStorageVector *>(getLargePtr()); + } + + void resizeSmall(size_t NumOps); + void resizeSmallToLarge(size_t NumOps); + void resize(size_t NumOps); + + explicit Header(size_t NumOps, StorageType Storage); + ~Header(); + + MutableArrayRef<MDOperand> operands() { + if (IsLarge) + return getLarge(); + return makeMutableArrayRef( + reinterpret_cast<MDOperand *>(this) - SmallSize, SmallNumOps); + } + + ArrayRef<MDOperand> operands() const { + if (IsLarge) + return getLarge(); + return makeArrayRef(reinterpret_cast<const MDOperand *>(this) - SmallSize, + SmallNumOps); + } + }; + + Header &getHeader() { return *(reinterpret_cast<Header *>(this) - 1); } + + const Header &getHeader() const { + return *(reinterpret_cast<const Header *>(this) - 1); + } ContextAndReplaceableUses Context; @@ -937,7 +1045,7 @@ protected: ArrayRef<Metadata *> Ops1, ArrayRef<Metadata *> Ops2 = None); ~MDNode() = default; - void *operator new(size_t Size, unsigned NumOps); + void *operator new(size_t Size, size_t NumOps, StorageType Storage); void operator delete(void *Mem); /// Required by std, but never called. @@ -952,8 +1060,8 @@ protected: void dropAllReferences(); - MDOperand *mutable_begin() { return mutable_end() - NumOperands; } - MDOperand *mutable_end() { return reinterpret_cast<MDOperand *>(this); } + MDOperand *mutable_begin() { return getHeader().operands().begin(); } + MDOperand *mutable_end() { return getHeader().operands().end(); } using mutable_op_range = iterator_range<MDOperand *>; @@ -999,7 +1107,7 @@ public: /// As forward declarations are resolved, their containers should get /// resolved automatically. However, if this (or one of its operands) is /// involved in a cycle, \a resolveCycles() needs to be called explicitly. - bool isResolved() const { return !isTemporary() && !NumUnresolved; } + bool isResolved() const { return !isTemporary() && !getNumUnresolved(); } bool isUniqued() const { return Storage == Uniqued; } bool isDistinct() const { return Storage == Distinct; } @@ -1093,11 +1201,25 @@ protected: /// Sets the operand directly, without worrying about uniquing. void setOperand(unsigned I, Metadata *New); + unsigned getNumUnresolved() const { return getHeader().NumUnresolved; } + + void setNumUnresolved(unsigned N) { getHeader().NumUnresolved = N; } void storeDistinctInContext(); template <class T, class StoreT> static T *storeImpl(T *N, StorageType Storage, StoreT &Store); template <class T> static T *storeImpl(T *N, StorageType Storage); + /// Resize the node to hold \a NumOps operands. + /// + /// \pre \a isTemporary() or \a isDistinct() + /// \pre MetadataID == MDTupleKind + void resize(size_t NumOps) { + assert(!isUniqued() && "Resizing is not supported for uniqued nodes"); + assert(getMetadataID() == MDTupleKind && + "Resizing is not supported for this node kind"); + getHeader().resize(NumOps); + } + private: void handleChangedOperand(void *Ref, Metadata *New); @@ -1154,12 +1276,12 @@ public: op_range operands() const { return op_range(op_begin(), op_end()); } const MDOperand &getOperand(unsigned I) const { - assert(I < NumOperands && "Out of range"); - return op_begin()[I]; + assert(I < getNumOperands() && "Out of range"); + return getHeader().operands()[I]; } /// Return number of MDNode operands. - unsigned getNumOperands() const { return NumOperands; } + unsigned getNumOperands() const { return getHeader().operands().size(); } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Metadata *MD) { @@ -1244,6 +1366,16 @@ public: /// Return a (temporary) clone of this. TempMDTuple clone() const { return cloneImpl(); } + /// Append an element to the tuple. This will resize the node. + void push_back(Metadata *MD) { + size_t NumOps = getNumOperands(); + resize(NumOps + 1); + setOperand(NumOps, MD); + } + + /// Shrink the operands by 1.
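Together with pop_back(), declared next, this gives MDTuple a grow-and-shrink interface; only distinct and temporary (non-uniqued) tuples accept it, per the resize() preconditions above. A small usage sketch (assumed context):

    #include "llvm/IR/Metadata.h"

    void growTuple(llvm::LLVMContext &Ctx) {
      // Distinct nodes are resizable; a uniqued tuple would assert.
      llvm::MDTuple *T = llvm::MDTuple::getDistinct(Ctx, {});
      T->push_back(llvm::MDString::get(Ctx, "tag")); // one operand now
      T->pop_back();                                 // and back to zero
    }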
+ void pop_back() { resize(getNumOperands() - 1); } + static bool classof(const Metadata *MD) { return MD->getMetadataID() == MDTupleKind; } diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 7b834fbeeebf..fc2d60947118 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -58,9 +58,9 @@ class VersionTuple; /// other modules) this module depends on, a symbol table, and various data /// about the target's characteristics. /// -/// A module maintains a GlobalValRefMap object that is used to hold all +/// A module maintains a GlobalList object that is used to hold all /// constant references to global variables in the module. When a global -/// variable is destroyed, it should have no entries in the GlobalValueRefMap. +/// variable is destroyed, it should have no entries in the GlobalList. /// The main container class for the LLVM Intermediate Representation. class LLVM_EXTERNAL_VISIBILITY Module { /// @name Types And Enumerations @@ -146,9 +146,12 @@ public: /// Takes the max of the two values, which are required to be integers. Max = 7, + /// Takes the min of the two values, which are required to be integers. + Min = 8, + // Markers: ModFlagBehaviorFirstVal = Error, - ModFlagBehaviorLastVal = Max + ModFlagBehaviorLastVal = Min }; /// Checks if Metadata represents a valid ModFlagBehavior, and stores the @@ -360,6 +363,8 @@ public: /// In all cases, the returned value is a FunctionCallee wrapper around the /// 'FunctionType *T' passed in, as well as a 'Value*' either of the Function or /// the bitcast to the function. + /// + /// Note: For library calls getOrInsertLibFunc() should be used instead. FunctionCallee getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList); @@ -888,8 +893,8 @@ public: void setRtLibUseGOT(); /// Get/set whether synthesized functions should get the uwtable attribute. - bool getUwtable() const; - void setUwtable(); + UWTableKind getUwtable() const; + void setUwtable(UWTableKind Kind); /// Get/set whether synthesized functions should get the "frame-pointer" /// attribute. @@ -939,10 +944,17 @@ public: /// @returns a string containing the target variant triple. StringRef getDarwinTargetVariantTriple() const; + /// Set the target variant triple which is a string describing a variant of + /// the target host platform. + void setDarwinTargetVariantTriple(StringRef T); + /// Get the target variant version build SDK version metadata. /// /// An empty version is returned if no such metadata is attached. VersionTuple getDarwinTargetVariantSDKVersion() const; + + /// Set the target variant version build SDK version metadata. + void setDarwinTargetVariantSDKVersion(VersionTuple Version); }; /// Given "llvm.used" or "llvm.compiler.used" as a global name, collect the diff --git a/llvm/include/llvm/IR/NoFolder.h b/llvm/include/llvm/IR/NoFolder.h index ec149747e3f4..4e9f772dfdb6 100644 --- a/llvm/include/llvm/IR/NoFolder.h +++ b/llvm/include/llvm/IR/NoFolder.h @@ -23,10 +23,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilderFolder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IRBuilderFolder.h" namespace llvm { @@ -43,144 +44,72 @@ public: // Return an existing value or a constant if the operation can be simplified. // Otherwise return nullptr. 
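The comment above states the entire contract of the reworked folder interface: each Fold* hook either returns an existing simplified value or nullptr, leaving instruction creation to the caller. A caller-side sketch of that contract (assumed glue code, not part of this patch):

    #include "llvm/IR/IRBuilderFolder.h"
    #include "llvm/IR/InstrTypes.h"

    // Try the folder first; materialize a real instruction on nullptr.
    llvm::Value *foldOrCreateAdd(const llvm::IRBuilderFolder &Folder,
                                 llvm::Value *L, llvm::Value *R) {
      if (llvm::Value *V = Folder.FoldBinOp(llvm::Instruction::Add, L, R))
        return V; // folded to an existing value or constant
      return llvm::BinaryOperator::Create(llvm::Instruction::Add, L, R);
    }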
//===--------------------------------------------------------------------===// - Value *FoldAdd(Value *LHS, Value *RHS, bool HasNUW = false, - bool HasNSW = false) const override { - return nullptr; - } - Value *FoldAnd(Value *LHS, Value *RHS) const override { return nullptr; } - - Value *FoldOr(Value *LHS, Value *RHS) const override { return nullptr; } - - Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override { + Value *FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, + Value *RHS) const override { return nullptr; } - Value *FoldGEP(Type *Ty, Value *Ptr, ArrayRef IdxList, - bool IsInBounds = false) const override { + Value *FoldExactBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool IsExact) const override { return nullptr; } - Value *FoldSelect(Value *C, Value *True, Value *False) const override { + Value *FoldNoWrapBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + bool HasNUW, bool HasNSW) const override { return nullptr; } - //===--------------------------------------------------------------------===// - // Binary Operators - //===--------------------------------------------------------------------===// - - Instruction *CreateFAdd(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateFAdd(LHS, RHS); - } - - Instruction *CreateSub(Constant *LHS, Constant *RHS, - bool HasNUW = false, - bool HasNSW = false) const override { - BinaryOperator *BO = BinaryOperator::CreateSub(LHS, RHS); - if (HasNUW) BO->setHasNoUnsignedWrap(); - if (HasNSW) BO->setHasNoSignedWrap(); - return BO; - } - - Instruction *CreateFSub(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateFSub(LHS, RHS); - } - - Instruction *CreateMul(Constant *LHS, Constant *RHS, - bool HasNUW = false, - bool HasNSW = false) const override { - BinaryOperator *BO = BinaryOperator::CreateMul(LHS, RHS); - if (HasNUW) BO->setHasNoUnsignedWrap(); - if (HasNSW) BO->setHasNoSignedWrap(); - return BO; - } - - Instruction *CreateFMul(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateFMul(LHS, RHS); - } - - Instruction *CreateUDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - if (!isExact) - return BinaryOperator::CreateUDiv(LHS, RHS); - return BinaryOperator::CreateExactUDiv(LHS, RHS); - } - - Instruction *CreateSDiv(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - if (!isExact) - return BinaryOperator::CreateSDiv(LHS, RHS); - return BinaryOperator::CreateExactSDiv(LHS, RHS); - } - - Instruction *CreateFDiv(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateFDiv(LHS, RHS); + Value *FoldBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, + FastMathFlags FMF) const override { + return nullptr; } - Instruction *CreateURem(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateURem(LHS, RHS); + Value *FoldICmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override { + return nullptr; } - Instruction *CreateSRem(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateSRem(LHS, RHS); + Value *FoldGEP(Type *Ty, Value *Ptr, ArrayRef IdxList, + bool IsInBounds = false) const override { + return nullptr; } - Instruction *CreateFRem(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateFRem(LHS, RHS); + Value *FoldSelect(Value *C, Value *True, Value *False) const override { + return nullptr; } - Instruction *CreateShl(Constant *LHS, Constant *RHS, bool HasNUW = false, - 
bool HasNSW = false) const override { - BinaryOperator *BO = BinaryOperator::CreateShl(LHS, RHS); - if (HasNUW) BO->setHasNoUnsignedWrap(); - if (HasNSW) BO->setHasNoSignedWrap(); - return BO; + Value *FoldExtractValue(Value *Agg, + ArrayRef IdxList) const override { + return nullptr; } - Instruction *CreateLShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - if (!isExact) - return BinaryOperator::CreateLShr(LHS, RHS); - return BinaryOperator::CreateExactLShr(LHS, RHS); + Value *FoldInsertValue(Value *Agg, Value *Val, + ArrayRef IdxList) const override { + return nullptr; } - Instruction *CreateAShr(Constant *LHS, Constant *RHS, - bool isExact = false) const override { - if (!isExact) - return BinaryOperator::CreateAShr(LHS, RHS); - return BinaryOperator::CreateExactAShr(LHS, RHS); + Value *FoldExtractElement(Value *Vec, Value *Idx) const override { + return nullptr; } - Instruction *CreateXor(Constant *LHS, Constant *RHS) const override { - return BinaryOperator::CreateXor(LHS, RHS); + Value *FoldInsertElement(Value *Vec, Value *NewElt, + Value *Idx) const override { + return nullptr; } - Instruction *CreateBinOp(Instruction::BinaryOps Opc, - Constant *LHS, Constant *RHS) const override { - return BinaryOperator::Create(Opc, LHS, RHS); + Value *FoldShuffleVector(Value *V1, Value *V2, + ArrayRef Mask) const override { + return nullptr; } //===--------------------------------------------------------------------===// // Unary Operators //===--------------------------------------------------------------------===// - Instruction *CreateNeg(Constant *C, - bool HasNUW = false, - bool HasNSW = false) const override { - BinaryOperator *BO = BinaryOperator::CreateNeg(C); - if (HasNUW) BO->setHasNoUnsignedWrap(); - if (HasNSW) BO->setHasNoSignedWrap(); - return BO; - } - Instruction *CreateFNeg(Constant *C) const override { return UnaryOperator::CreateFNeg(C); } - Instruction *CreateNot(Constant *C) const override { - return BinaryOperator::CreateNot(C); - } - Instruction *CreateUnOp(Instruction::UnaryOps Opc, Constant *C) const override { return UnaryOperator::Create(Opc, C); @@ -245,35 +174,6 @@ public: Constant *LHS, Constant *RHS) const override { return new FCmpInst(P, LHS, RHS); } - - //===--------------------------------------------------------------------===// - // Other Instructions - //===--------------------------------------------------------------------===// - - Instruction *CreateExtractElement(Constant *Vec, - Constant *Idx) const override { - return ExtractElementInst::Create(Vec, Idx); - } - - Instruction *CreateInsertElement(Constant *Vec, Constant *NewElt, - Constant *Idx) const override { - return InsertElementInst::Create(Vec, NewElt, Idx); - } - - Instruction *CreateShuffleVector(Constant *V1, Constant *V2, - ArrayRef Mask) const override { - return new ShuffleVectorInst(V1, V2, Mask); - } - - Instruction *CreateExtractValue(Constant *Agg, - ArrayRef IdxList) const override { - return ExtractValueInst::Create(Agg, IdxList); - } - - Instruction *CreateInsertValue(Constant *Agg, Constant *Val, - ArrayRef IdxList) const override { - return InsertValueInst::Create(Agg, Val, IdxList); - } }; } // end namespace llvm diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index 7d232bba0864..1a234e273eff 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -18,6 +18,7 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/FMF.h" #include 
"llvm/IR/Instruction.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -161,105 +162,6 @@ public: } }; -/// Convenience struct for specifying and reasoning about fast-math flags. -class FastMathFlags { -private: - friend class FPMathOperator; - - unsigned Flags = 0; - - FastMathFlags(unsigned F) { - // If all 7 bits are set, turn this into -1. If the number of bits grows, - // this must be updated. This is intended to provide some forward binary - // compatibility insurance for the meaning of 'fast' in case bits are added. - if (F == 0x7F) Flags = ~0U; - else Flags = F; - } - -public: - // This is how the bits are used in Value::SubclassOptionalData so they - // should fit there too. - // WARNING: We're out of space. SubclassOptionalData only has 7 bits. New - // functionality will require a change in how this information is stored. - enum { - AllowReassoc = (1 << 0), - NoNaNs = (1 << 1), - NoInfs = (1 << 2), - NoSignedZeros = (1 << 3), - AllowReciprocal = (1 << 4), - AllowContract = (1 << 5), - ApproxFunc = (1 << 6) - }; - - FastMathFlags() = default; - - static FastMathFlags getFast() { - FastMathFlags FMF; - FMF.setFast(); - return FMF; - } - - bool any() const { return Flags != 0; } - bool none() const { return Flags == 0; } - bool all() const { return Flags == ~0U; } - - void clear() { Flags = 0; } - void set() { Flags = ~0U; } - - /// Flag queries - bool allowReassoc() const { return 0 != (Flags & AllowReassoc); } - bool noNaNs() const { return 0 != (Flags & NoNaNs); } - bool noInfs() const { return 0 != (Flags & NoInfs); } - bool noSignedZeros() const { return 0 != (Flags & NoSignedZeros); } - bool allowReciprocal() const { return 0 != (Flags & AllowReciprocal); } - bool allowContract() const { return 0 != (Flags & AllowContract); } - bool approxFunc() const { return 0 != (Flags & ApproxFunc); } - /// 'Fast' means all bits are set. - bool isFast() const { return all(); } - - /// Flag setters - void setAllowReassoc(bool B = true) { - Flags = (Flags & ~AllowReassoc) | B * AllowReassoc; - } - void setNoNaNs(bool B = true) { - Flags = (Flags & ~NoNaNs) | B * NoNaNs; - } - void setNoInfs(bool B = true) { - Flags = (Flags & ~NoInfs) | B * NoInfs; - } - void setNoSignedZeros(bool B = true) { - Flags = (Flags & ~NoSignedZeros) | B * NoSignedZeros; - } - void setAllowReciprocal(bool B = true) { - Flags = (Flags & ~AllowReciprocal) | B * AllowReciprocal; - } - void setAllowContract(bool B = true) { - Flags = (Flags & ~AllowContract) | B * AllowContract; - } - void setApproxFunc(bool B = true) { - Flags = (Flags & ~ApproxFunc) | B * ApproxFunc; - } - void setFast(bool B = true) { B ? set() : clear(); } - - void operator&=(const FastMathFlags &OtherFlags) { - Flags &= OtherFlags.Flags; - } - void operator|=(const FastMathFlags &OtherFlags) { - Flags |= OtherFlags.Flags; - } - bool operator!=(const FastMathFlags &OtherFlags) const { - return Flags != OtherFlags.Flags; - } - - /// Print fast-math flags to \p O. - void print(raw_ostream &O) const; -}; - -inline raw_ostream &operator<<(raw_ostream &O, FastMathFlags FMF) { - FMF.print(O); - return O; -} - /// Utility class for floating point operations which can have /// information about relaxed accuracy requirements attached to them. 
class FPMathOperator : public Operator { diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index f9f4f1603861..7f0695b552e1 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -136,7 +136,9 @@ struct undef_match { inline auto m_Undef() { return undef_match(); } /// Match an arbitrary poison constant. -inline class_match m_Poison() { return class_match(); } +inline class_match m_Poison() { + return class_match(); +} /// Match an arbitrary Constant and ignore it. inline class_match m_Constant() { return class_match(); } @@ -222,7 +224,7 @@ struct apint_match { bool AllowUndef; apint_match(const APInt *&Res, bool AllowUndef) - : Res(Res), AllowUndef(AllowUndef) {} + : Res(Res), AllowUndef(AllowUndef) {} template bool match(ITy *V) { if (auto *CI = dyn_cast(V)) { @@ -231,8 +233,8 @@ struct apint_match { } if (V->getType()->isVectorTy()) if (const auto *C = dyn_cast(V)) - if (auto *CI = dyn_cast_or_null( - C->getSplatValue(AllowUndef))) { + if (auto *CI = + dyn_cast_or_null(C->getSplatValue(AllowUndef))) { Res = &CI->getValue(); return true; } @@ -256,8 +258,8 @@ struct apfloat_match { } if (V->getType()->isVectorTy()) if (const auto *C = dyn_cast(V)) - if (auto *CI = dyn_cast_or_null( - C->getSplatValue(AllowUndef))) { + if (auto *CI = + dyn_cast_or_null(C->getSplatValue(AllowUndef))) { Res = &CI->getValueAPF(); return true; } @@ -467,9 +469,7 @@ struct is_negative { inline cst_pred_ty m_Negative() { return cst_pred_ty(); } -inline api_pred_ty m_Negative(const APInt *&V) { - return V; -} +inline api_pred_ty m_Negative(const APInt *&V) { return V; } struct is_nonnegative { bool isValue(const APInt &C) { return C.isNonNegative(); } @@ -479,9 +479,7 @@ struct is_nonnegative { inline cst_pred_ty m_NonNegative() { return cst_pred_ty(); } -inline api_pred_ty m_NonNegative(const APInt *&V) { - return V; -} +inline api_pred_ty m_NonNegative(const APInt *&V) { return V; } struct is_strictlypositive { bool isValue(const APInt &C) { return C.isStrictlyPositive(); } @@ -510,9 +508,7 @@ struct is_one { }; /// Match an integer 1 or a vector with all elements equal to 1. /// For vectors, this includes constants with undefined elements. -inline cst_pred_ty m_One() { - return cst_pred_ty(); -} +inline cst_pred_ty m_One() { return cst_pred_ty(); } struct is_zero_int { bool isValue(const APInt &C) { return C.isZero(); } @@ -532,21 +528,15 @@ struct is_zero { }; /// Match any null constant or a vector with all elements equal to 0. /// For vectors, this includes constants with undefined elements. -inline is_zero m_Zero() { - return is_zero(); -} +inline is_zero m_Zero() { return is_zero(); } struct is_power2 { bool isValue(const APInt &C) { return C.isPowerOf2(); } }; /// Match an integer or vector power-of-2. /// For vectors, this includes constants with undefined elements. 
-inline cst_pred_ty m_Power2() { - return cst_pred_ty(); -} -inline api_pred_ty m_Power2(const APInt *&V) { - return V; -} +inline cst_pred_ty m_Power2() { return cst_pred_ty(); } +inline api_pred_ty m_Power2(const APInt *&V) { return V; } struct is_negated_power2 { bool isValue(const APInt &C) { return C.isNegatedPowerOf2(); } @@ -589,9 +579,7 @@ struct is_lowbit_mask { inline cst_pred_ty m_LowBitMask() { return cst_pred_ty(); } -inline api_pred_ty m_LowBitMask(const APInt *&V) { - return V; -} +inline api_pred_ty m_LowBitMask(const APInt *&V) { return V; } struct icmp_pred_with_threshold { ICmpInst::Predicate Pred; @@ -613,9 +601,7 @@ struct is_nan { }; /// Match an arbitrary NaN constant. This includes quiet and signalling nans. /// For vectors, this includes constants with undefined elements. -inline cstfp_pred_ty m_NaN() { - return cstfp_pred_ty(); -} +inline cstfp_pred_ty m_NaN() { return cstfp_pred_ty(); } struct is_nonnan { bool isValue(const APFloat &C) { return !C.isNaN(); } @@ -631,9 +617,7 @@ struct is_inf { }; /// Match a positive or negative infinity FP constant. /// For vectors, this includes constants with undefined elements. -inline cstfp_pred_ty m_Inf() { - return cstfp_pred_ty(); -} +inline cstfp_pred_ty m_Inf() { return cstfp_pred_ty(); } struct is_noninf { bool isValue(const APFloat &C) { return !C.isInfinity(); } @@ -729,7 +713,9 @@ inline bind_ty m_UnOp(UnaryOperator *&I) { return I; } /// Match a binary operator, capturing it if we match. inline bind_ty m_BinOp(BinaryOperator *&I) { return I; } /// Match a with overflow intrinsic, capturing it if we match. -inline bind_ty m_WithOverflowInst(WithOverflowInst *&I) { return I; } +inline bind_ty m_WithOverflowInst(WithOverflowInst *&I) { + return I; +} inline bind_ty m_WithOverflowInst(const WithOverflowInst *&I) { return I; @@ -842,8 +828,7 @@ struct bind_const_intval_ty { /// Match a specified integer value or vector of all elements of that /// value. -template -struct specific_intval { +template struct specific_intval { APInt Val; specific_intval(APInt V) : Val(std::move(V)) {} @@ -1014,7 +999,8 @@ template struct FNeg_match { FNeg_match(const Op_t &Op) : X(Op) {} template bool match(OpTy *V) { auto *FPMO = dyn_cast(V); - if (!FPMO) return false; + if (!FPMO) + return false; if (FPMO->getOpcode() == Instruction::FNeg) return X.match(FPMO->getOperand(0)); @@ -1038,9 +1024,7 @@ template struct FNeg_match { }; /// Match 'fneg X' as 'fsub -0.0, X'. 
-template -inline FNeg_match -m_FNeg(const OpTy &X) { +template inline FNeg_match m_FNeg(const OpTy &X) { return FNeg_match(X); } @@ -1165,32 +1149,32 @@ inline OverflowingBinaryOp_match m_NSWAdd(const LHS &L, const RHS &R) { return OverflowingBinaryOp_match( - L, R); + OverflowingBinaryOperator::NoSignedWrap>(L, + R); } template inline OverflowingBinaryOp_match m_NSWSub(const LHS &L, const RHS &R) { return OverflowingBinaryOp_match( - L, R); + OverflowingBinaryOperator::NoSignedWrap>(L, + R); } template inline OverflowingBinaryOp_match m_NSWMul(const LHS &L, const RHS &R) { return OverflowingBinaryOp_match( - L, R); + OverflowingBinaryOperator::NoSignedWrap>(L, + R); } template inline OverflowingBinaryOp_match m_NSWShl(const LHS &L, const RHS &R) { return OverflowingBinaryOp_match( - L, R); + OverflowingBinaryOperator::NoSignedWrap>(L, + R); } template @@ -1384,7 +1368,7 @@ struct CmpClass_match { Predicate = I->getPredicate(); return true; } else if (Commutable && L.match(I->getOperand(1)) && - R.match(I->getOperand(0))) { + R.match(I->getOperand(0))) { Predicate = I->getSwappedPredicate(); return true; } @@ -2080,15 +2064,13 @@ template struct m_Intrinsic_Ty { }; template struct m_Intrinsic_Ty { - using Ty = - match_combine_and::Ty, - Argument_match>; + using Ty = match_combine_and::Ty, + Argument_match>; }; template struct m_Intrinsic_Ty { - using Ty = - match_combine_and::Ty, - Argument_match>; + using Ty = match_combine_and::Ty, + Argument_match>; }; template @@ -2097,7 +2079,8 @@ struct m_Intrinsic_Ty { Argument_match>; }; -template +template struct m_Intrinsic_Ty { using Ty = match_combine_and::Ty, Argument_match>; @@ -2117,6 +2100,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2, return m_Intrinsic(Op0, Op1, Op2, Op3); } +/// Matches MaskedGather Intrinsic. +template +inline typename m_Intrinsic_Ty::Ty +m_MaskedGather(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2, + const Opnd3 &Op3) { + return m_Intrinsic(Op0, Op1, Op2, Op3); +} + template inline typename m_Intrinsic_Ty::Ty m_Intrinsic(const T0 &Op0) { return m_CombineAnd(m_Intrinsic(), m_Argument<0>(Op0)); @@ -2204,6 +2195,11 @@ m_FShr(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2) { return m_Intrinsic(Op0, Op1, Op2); } +template +inline typename m_Intrinsic_Ty::Ty m_Sqrt(const Opnd0 &Op0) { + return m_Intrinsic(Op0); +} + //===----------------------------------------------------------------------===// // Matchers for two-operands operators with the operators in either order // @@ -2532,8 +2528,8 @@ struct LogicalOp_match { /// Matches L && R either in the form of L & R or L ? R : false. /// Note that the latter form is poison-blocking. template -inline LogicalOp_match -m_LogicalAnd(const LHS &L, const RHS &R) { +inline LogicalOp_match m_LogicalAnd(const LHS &L, + const RHS &R) { return LogicalOp_match(L, R); } @@ -2550,8 +2546,8 @@ m_c_LogicalAnd(const LHS &L, const RHS &R) { /// Matches L || R either in the form of L | R or L ? true : R. /// Note that the latter form is poison-blocking. 
template -inline LogicalOp_match -m_LogicalOr(const LHS &L, const RHS &R) { +inline LogicalOp_match m_LogicalOr(const LHS &L, + const RHS &R) { return LogicalOp_match(L, R); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 62d67308114f..39c11771ff41 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -47,6 +47,8 @@ HANDLE_LIBCALL(MUL_I16, "__mulhi3") HANDLE_LIBCALL(MUL_I32, "__mulsi3") HANDLE_LIBCALL(MUL_I64, "__muldi3") HANDLE_LIBCALL(MUL_I128, "__multi3") +HANDLE_LIBCALL(MUL_IEXT, nullptr) + HANDLE_LIBCALL(MULO_I32, "__mulosi4") HANDLE_LIBCALL(MULO_I64, "__mulodi4") HANDLE_LIBCALL(MULO_I128, "__muloti4") @@ -55,31 +57,43 @@ HANDLE_LIBCALL(SDIV_I16, "__divhi3") HANDLE_LIBCALL(SDIV_I32, "__divsi3") HANDLE_LIBCALL(SDIV_I64, "__divdi3") HANDLE_LIBCALL(SDIV_I128, "__divti3") +HANDLE_LIBCALL(SDIV_IEXT, "__divei4") + HANDLE_LIBCALL(UDIV_I8, "__udivqi3") HANDLE_LIBCALL(UDIV_I16, "__udivhi3") HANDLE_LIBCALL(UDIV_I32, "__udivsi3") HANDLE_LIBCALL(UDIV_I64, "__udivdi3") HANDLE_LIBCALL(UDIV_I128, "__udivti3") +HANDLE_LIBCALL(UDIV_IEXT, "__udivei4") + HANDLE_LIBCALL(SREM_I8, "__modqi3") HANDLE_LIBCALL(SREM_I16, "__modhi3") HANDLE_LIBCALL(SREM_I32, "__modsi3") HANDLE_LIBCALL(SREM_I64, "__moddi3") HANDLE_LIBCALL(SREM_I128, "__modti3") +HANDLE_LIBCALL(SREM_IEXT, "__modei4") + HANDLE_LIBCALL(UREM_I8, "__umodqi3") HANDLE_LIBCALL(UREM_I16, "__umodhi3") HANDLE_LIBCALL(UREM_I32, "__umodsi3") HANDLE_LIBCALL(UREM_I64, "__umoddi3") HANDLE_LIBCALL(UREM_I128, "__umodti3") +HANDLE_LIBCALL(UREM_IEXT, "__umodei4") + HANDLE_LIBCALL(SDIVREM_I8, nullptr) HANDLE_LIBCALL(SDIVREM_I16, nullptr) HANDLE_LIBCALL(SDIVREM_I32, nullptr) HANDLE_LIBCALL(SDIVREM_I64, nullptr) HANDLE_LIBCALL(SDIVREM_I128, nullptr) +HANDLE_LIBCALL(SDIVREM_IEXT, nullptr) + HANDLE_LIBCALL(UDIVREM_I8, nullptr) HANDLE_LIBCALL(UDIVREM_I16, nullptr) HANDLE_LIBCALL(UDIVREM_I32, nullptr) HANDLE_LIBCALL(UDIVREM_I64, nullptr) HANDLE_LIBCALL(UDIVREM_I128, nullptr) +HANDLE_LIBCALL(UDIVREM_IEXT, nullptr) + HANDLE_LIBCALL(NEG_I32, "__negsi2") HANDLE_LIBCALL(NEG_I64, "__negdi2") HANDLE_LIBCALL(CTLZ_I32, "__clzsi2") @@ -296,6 +310,8 @@ HANDLE_LIBCALL(FPROUND_F64_F16, "__truncdfhf2") HANDLE_LIBCALL(FPROUND_F80_F16, "__truncxfhf2") HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_PPCF128_F16, "__trunctfhf2") +HANDLE_LIBCALL(FPROUND_F32_BF16, "__truncsfbf2") +HANDLE_LIBCALL(FPROUND_F64_BF16, "__truncdfbf2") HANDLE_LIBCALL(FPROUND_F64_F32, "__truncdfsf2") HANDLE_LIBCALL(FPROUND_F80_F32, "__truncxfsf2") HANDLE_LIBCALL(FPROUND_F128_F32, "__trunctfsf2") diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h index da9c732ad818..ba8ffbbaf397 100644 --- a/llvm/include/llvm/IR/Statepoint.h +++ b/llvm/include/llvm/IR/Statepoint.h @@ -121,9 +121,8 @@ public: /// Return the type of the value returned by the call underlying the /// statepoint. Type *getActualReturnType() const { - auto *CalleeTy = - getActualCalledOperand()->getType()->getPointerElementType(); - return cast(CalleeTy)->getReturnType(); + auto *FT = cast(getParamElementType(CalledFunctionPos)); + return FT->getReturnType(); } diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index e4e8a5529c87..51263c6b8fcc 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -68,13 +68,14 @@ public: TokenTyID, ///< Tokens // Derived types... see DerivedTypes.h file. 
- IntegerTyID, ///< Arbitrary bit width integers - FunctionTyID, ///< Functions - PointerTyID, ///< Pointers - StructTyID, ///< Structures - ArrayTyID, ///< Arrays - FixedVectorTyID, ///< Fixed width SIMD vector type - ScalableVectorTyID ///< Scalable SIMD vector type + IntegerTyID, ///< Arbitrary bit width integers + FunctionTyID, ///< Functions + PointerTyID, ///< Pointers + StructTyID, ///< Structures + ArrayTyID, ///< Arrays + FixedVectorTyID, ///< Fixed width SIMD vector type + ScalableVectorTyID, ///< Scalable SIMD vector type + DXILPointerTyID, ///< DXIL typed pointer used by DirectX target }; private: @@ -368,6 +369,9 @@ public: /// This method is deprecated without replacement. Pointer element types are /// not available with opaque pointers. + [[deprecated("Deprecated without replacement, see " + "https://llvm.org/docs/OpaquePointers.html for context and " + "migration instructions")]] Type *getPointerElementType() const { return getNonOpaquePointerElementType(); } diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index 221bb5b2cb1c..a9cf60151e5d 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -304,8 +304,8 @@ public: /// Replace uses of one Value with another. /// /// Replaces all references to the "From" definition with references to the - /// "To" definition. - void replaceUsesOfWith(Value *From, Value *To); + /// "To" definition. Returns whether any uses were replaced. + bool replaceUsesOfWith(Value *From, Value *To); // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 1abcbb874a8d..1d639e8aeb01 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -54,6 +54,12 @@ #define END_REGISTER_VP_SDNODE(VPSD) #endif +// Helper macro to set up the mapping from VP intrinsic to ISD opcode. +// Note: More than one VP intrinsic may map to one ISD opcode. +#ifndef HELPER_MAP_VPID_TO_VPSD +#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) +#endif + // Helper macros for the common "1:1 - Intrinsic : SDNode" case. // // There is one VP intrinsic that maps directly to one SDNode that goes by the @@ -70,7 +76,8 @@ // the SDNode is used. #define BEGIN_REGISTER_VP(VPID, MASKPOS, EVLPOS, VPSD, LEGALPOS) \ BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, EVLPOS) \ - BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, VPID, MASKPOS, EVLPOS) + BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, VPID, MASKPOS, EVLPOS) \ + HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) #define END_REGISTER_VP(VPID, VPSD) \ END_REGISTER_VP_INTRINSIC(VPID) \ @@ -121,6 +128,18 @@ #define VP_PROPERTY_BINARYOP #endif +// A property to infer VP type casts automatically. +#ifndef VP_PROPERTY_CASTOP +#define VP_PROPERTY_CASTOP +#endif + +// This VP Intrinsic is a comparison operation +// The condition code arg is at CCPOS and accepts floating-point condition +// codes if ISFP is set, else it accepts integer condition codes. 
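Consumers of VPIntrinsics.def stamp out code by defining the macros they care about before including the file; anything left undefined, such as the VP_PROPERTY_CMP guard that follows, defaults to a no-op. A sketch in the style LLVM itself uses (assumed consumer code, not part of the patch):

    #include "llvm/ADT/Optional.h"
    #include "llvm/IR/Intrinsics.h"
    using llvm::None;

    // Recover the mask operand position of a VP intrinsic. Entries whose
    // MASKPOS is None (vp_select/vp_merge further down) work because the
    // return type is Optional<unsigned>.
    llvm::Optional<unsigned> vpMaskParamPos(llvm::Intrinsic::ID ID) {
      switch (ID) {
      default:
        return None;
    #define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, EVLPOS)                \
      case llvm::Intrinsic::VPID:                                             \
        return MASKPOS;
    #include "llvm/IR/VPIntrinsics.def"
      }
    }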
+#ifndef VP_PROPERTY_CMP +#define VP_PROPERTY_CMP(CCPOS, ISFP) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -211,22 +230,130 @@ HELPER_REGISTER_BINARY_FP_VP(frem, VP_FREM, FRem) #undef HELPER_REGISTER_BINARY_FP_VP +// llvm.vp.fneg(x,mask,vlen) +BEGIN_REGISTER_VP(vp_fneg, 1, 2, VP_FNEG, -1) +VP_PROPERTY_FUNCTIONAL_OPC(FNeg) +END_REGISTER_VP(vp_fneg, VP_FNEG) + +// llvm.vp.fma(x,y,z,mask,vlen) +BEGIN_REGISTER_VP(vp_fma, 3, 4, VP_FMA, -1) +VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_fma) +END_REGISTER_VP(vp_fma, VP_FMA) + ///// } Floating-Point Arithmetic +///// Type Casts { +// Specialized helper macro for type conversions. +// (%x, %mask, %evl). +#ifdef HELPER_REGISTER_FP_CAST_VP +#error \ + "The internal helper macro HELPER_REGISTER_FP_CAST_VP is already defined!" +#endif +#define HELPER_REGISTER_FP_CAST_VP(OPSUFFIX, VPSD, IROPC, HASROUND) \ + BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ + VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ + VP_PROPERTY_CONSTRAINEDFP(HASROUND, 1, experimental_constrained_##OPSUFFIX) \ + VP_PROPERTY_CASTOP \ + END_REGISTER_VP(vp_##OPSUFFIX, VPSD) + +// llvm.vp.fptoui(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(fptoui, VP_FPTOUI, FPToUI, 0) + +// llvm.vp.fptosi(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(fptosi, VP_FPTOSI, FPToSI, 0) + +// llvm.vp.uitofp(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(uitofp, VP_UITOFP, UIToFP, 1) + +// llvm.vp.sitofp(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(sitofp, VP_SITOFP, SIToFP, 1) + +// llvm.vp.fptrunc(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(fptrunc, VP_FP_ROUND, FPTrunc, 1) + +// llvm.vp.fpext(x,mask,vlen) +HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, 0) + +#undef HELPER_REGISTER_FP_CAST_VP + +// Specialized helper macro for integer type conversions. +// (%x, %mask, %evl). +#ifdef HELPER_REGISTER_INT_CAST_VP +#error \ + "The internal helper macro HELPER_REGISTER_INT_CAST_VP is already defined!" 
+#endif +#define HELPER_REGISTER_INT_CAST_VP(OPSUFFIX, VPSD, IROPC) \ + BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ + VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ + VP_PROPERTY_CASTOP \ + END_REGISTER_VP(vp_##OPSUFFIX, VPSD) + +// llvm.vp.trunc(x,mask,vlen) +HELPER_REGISTER_INT_CAST_VP(trunc, VP_TRUNCATE, Trunc) + +// llvm.vp.zext(x,mask,vlen) +HELPER_REGISTER_INT_CAST_VP(zext, VP_ZERO_EXTEND, ZExt) + +// llvm.vp.sext(x,mask,vlen) +HELPER_REGISTER_INT_CAST_VP(sext, VP_SIGN_EXTEND, SExt) + +// llvm.vp.ptrtoint(x,mask,vlen) +HELPER_REGISTER_INT_CAST_VP(ptrtoint, VP_PTRTOINT, PtrToInt) + +// llvm.vp.inttoptr(x,mask,vlen) +HELPER_REGISTER_INT_CAST_VP(inttoptr, VP_INTTOPTR, IntToPtr) + +#undef HELPER_REGISTER_INT_CAST_VP + +///// } Type Casts + +///// Comparisons { + +// VP_SETCC (ISel only) +BEGIN_REGISTER_VP_SDNODE(VP_SETCC, 0, vp_setcc, 3, 4) +END_REGISTER_VP_SDNODE(VP_SETCC) + +// llvm.vp.fcmp(x,y,cc,mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(vp_fcmp, 3, 4) +HELPER_MAP_VPID_TO_VPSD(vp_fcmp, VP_SETCC) +VP_PROPERTY_FUNCTIONAL_OPC(FCmp) +VP_PROPERTY_CMP(2, true) +VP_PROPERTY_CONSTRAINEDFP(0, 1, experimental_constrained_fcmp) +END_REGISTER_VP_INTRINSIC(vp_fcmp) + +// llvm.vp.icmp(x,y,cc,mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(vp_icmp, 3, 4) +HELPER_MAP_VPID_TO_VPSD(vp_icmp, VP_SETCC) +VP_PROPERTY_FUNCTIONAL_OPC(ICmp) +VP_PROPERTY_CMP(2, false) +END_REGISTER_VP_INTRINSIC(vp_icmp) + +///// } Comparisons + ///// Memory Operations { // llvm.vp.store(val,ptr,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(vp_store, 2, 3) // chain = VP_STORE chain,val,base,offset,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_STORE, 0, vp_store, 4, 5) +HELPER_MAP_VPID_TO_VPSD(vp_store, VP_STORE) VP_PROPERTY_FUNCTIONAL_OPC(Store) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_store) VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_store, VP_STORE) +// llvm.experimental.vp.strided.store(val,ptr,stride,mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_store, 3, 4) +// chain = EXPERIMENTAL_VP_STRIDED_STORE chain,val,base,offset,stride,mask,evl +BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_STORE, 0, experimental_vp_strided_store, 5, 6) +HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) +VP_PROPERTY_MEMOP(1, 0) +END_REGISTER_VP(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) + // llvm.vp.scatter(ptr,val,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3) // chain = VP_SCATTER chain,val,base,indices,scale,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, -1, vp_scatter, 5, 6) +HELPER_MAP_VPID_TO_VPSD(vp_scatter, VP_SCATTER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_scatter) VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_scatter, VP_SCATTER) @@ -235,15 +362,25 @@ END_REGISTER_VP(vp_scatter, VP_SCATTER) BEGIN_REGISTER_VP_INTRINSIC(vp_load, 1, 2) // val,chain = VP_LOAD chain,base,offset,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_LOAD, -1, vp_load, 3, 4) +HELPER_MAP_VPID_TO_VPSD(vp_load, VP_LOAD) VP_PROPERTY_FUNCTIONAL_OPC(Load) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) VP_PROPERTY_MEMOP(0, None) END_REGISTER_VP(vp_load, VP_LOAD) +// llvm.experimental.vp.strided.load(ptr,stride,mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3) +// chain = EXPERIMENTAL_VP_STRIDED_LOAD chain,base,offset,stride,mask,evl +BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_LOAD, -1, experimental_vp_strided_load, 4, 5) +HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_load, EXPERIMENTAL_VP_STRIDED_LOAD) +VP_PROPERTY_MEMOP(0, None) +END_REGISTER_VP(experimental_vp_strided_load, 
EXPERIMENTAL_VP_STRIDED_LOAD) + // llvm.vp.gather(ptr,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(vp_gather, 1, 2) // val,chain = VP_GATHER chain,base,indices,scale,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_GATHER, -1, vp_gather, 4, 5) +HELPER_MAP_VPID_TO_VPSD(vp_gather, VP_GATHER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_gather) VP_PROPERTY_MEMOP(0, None) END_REGISTER_VP(vp_gather, VP_GATHER) @@ -313,6 +450,8 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, // sequential and reassociative. These manifest as the presence of 'reassoc' // fast-math flags in the IR and as two distinct ISD opcodes in the // SelectionDAG. +// Note we by default map from the VP intrinsic to the SEQ ISD opcode, which +// can then be relaxed to the non-SEQ ISD opcode if the 'reassoc' flag is set. #ifdef HELPER_REGISTER_REDUCTION_SEQ_VP #error \ "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!" @@ -323,6 +462,7 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP_SDNODE(VPSD) \ BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, -1, VPID, 2, 3) \ + HELPER_MAP_VPID_TO_VPSD(VPID, SEQ_VPSD) \ VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP_SDNODE(SEQ_VPSD) \ VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ @@ -344,13 +484,18 @@ HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fmul, VP_REDUCE_FMUL, ///// Shuffles { -// llvm.vp.select(mask,on_true,on_false,vlen) -BEGIN_REGISTER_VP(vp_select, 0, 3, VP_SELECT, -1) +// The mask 'cond' operand of llvm.vp.select and llvm.vp.merge are not reported +// as masks with the BEGIN_REGISTER_VP_* macros. This is because, unlike other +// VP intrinsics, these two have a defined result on lanes where the mask is +// false. +// +// llvm.vp.select(cond,on_true,on_false,vlen) +BEGIN_REGISTER_VP(vp_select, None, 3, VP_SELECT, -1) VP_PROPERTY_FUNCTIONAL_OPC(Select) END_REGISTER_VP(vp_select, VP_SELECT) -// llvm.vp.merge(mask,on_true,on_false,pivot) -BEGIN_REGISTER_VP(vp_merge, 0, 3, VP_MERGE, -1) +// llvm.vp.merge(cond,on_true,on_false,pivot) +BEGIN_REGISTER_VP(vp_merge, None, 3, VP_MERGE, -1) END_REGISTER_VP(vp_merge, VP_MERGE) BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5, EXPERIMENTAL_VP_SPLICE, -1) @@ -364,7 +509,10 @@ END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE) #undef END_REGISTER_VP #undef END_REGISTER_VP_INTRINSIC #undef END_REGISTER_VP_SDNODE +#undef HELPER_MAP_VPID_TO_VPSD #undef VP_PROPERTY_BINARYOP +#undef VP_PROPERTY_CASTOP +#undef VP_PROPERTY_CMP #undef VP_PROPERTY_CONSTRAINEDFP #undef VP_PROPERTY_FUNCTIONAL_INTRINSIC #undef VP_PROPERTY_FUNCTIONAL_OPC diff --git a/llvm/include/llvm/IR/ValueMap.h b/llvm/include/llvm/IR/ValueMap.h index 67f275cc06d9..a4b6091cf115 100644 --- a/llvm/include/llvm/IR/ValueMap.h +++ b/llvm/include/llvm/IR/ValueMap.h @@ -104,8 +104,8 @@ public: : Map(NumInitBuckets), Data() {} explicit ValueMap(const ExtraData &Data, unsigned NumInitBuckets = 64) : Map(NumInitBuckets), Data(Data) {} - // ValueMap can't be copied nor moved, beucase the callbacks store pointer - // to it. + // ValueMap can't be copied nor moved, because the callbacks store pointer to + // it. ValueMap(const ValueMap &) = delete; ValueMap(ValueMap &&) = delete; ValueMap &operator=(const ValueMap &) = delete; @@ -141,7 +141,7 @@ public: size_type size() const { return Map.size(); } /// Grow the map so that it has at least Size buckets. 
Does not shrink - void resize(size_t Size) { Map.resize(Size); } + void reserve(size_t Size) { Map.reserve(Size); } void clear() { Map.clear(); diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h new file mode 100644 index 000000000000..301edaed70fe --- /dev/null +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -0,0 +1,99 @@ +//===- llvm/VectorBuilder.h - Builder for VP Intrinsics ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the VectorBuilder class, which is used as a convenient way +// to create VP intrinsics as if they were LLVM instructions with a consistent +// and simplified interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_VECTORBUILDER_H +#define LLVM_IR_VECTORBUILDER_H + +#include +#include +#include +#include + +namespace llvm { + +class VectorBuilder { +public: + enum class Behavior { + // Abort if the requested VP intrinsic could not be created. + // This is useful for strict consistency. + ReportAndAbort = 0, + + // Return a default-initialized value if the requested VP intrinsic could + // not be created. + // This is useful for a defensive fallback to non-VP code. + SilentlyReturnNone = 1, + }; + +private: + IRBuilderBase &Builder; + Behavior ErrorHandling; + + // Explicit mask parameter. + Value *Mask; + // Explicit vector length parameter. + Value *ExplicitVectorLength; + // Compile-time vector length. + ElementCount StaticVectorLength; + + // Get mask/evl value handles for the current configuration. + Value &requestMask(); + Value &requestEVL(); + + void handleError(const char *ErrorMsg) const; + template + RetType returnWithError(const char *ErrorMsg) const { + handleError(ErrorMsg); + return RetType(); + } + +public: + VectorBuilder(IRBuilderBase &Builder, + Behavior ErrorHandling = Behavior::ReportAndAbort) + : Builder(Builder), ErrorHandling(ErrorHandling), Mask(nullptr), + ExplicitVectorLength(nullptr), + StaticVectorLength(ElementCount::getFixed(0)) {} + + Module &getModule() const; + LLVMContext &getContext() const { return Builder.getContext(); } + + // All-true mask for the currently configured explicit vector length. + Value *getAllTrueMask(); + + VectorBuilder &setMask(Value *NewMask) { + Mask = NewMask; + return *this; + } + VectorBuilder &setEVL(Value *NewExplicitVectorLength) { + ExplicitVectorLength = NewExplicitVectorLength; + return *this; + } + VectorBuilder &setStaticVL(unsigned NewFixedVL) { + StaticVectorLength = ElementCount::getFixed(NewFixedVL); + return *this; + } + // TODO: setStaticVL(ElementCount) for scalable types. + + // Emit a VP intrinsic call that mimics a regular instruction. + // This operation behaves according to the VectorBuilderBehavior. + // \p Opcode The functional instruction opcode of the emitted intrinsic. + // \p ReturnTy The return type of the operation. + // \p VecOpArray The operand list. 
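Given the fluent setters above and the createVectorInstruction entry point declared immediately below, typical use looks like the following sketch (function name and operand choice are assumptions):

    #include "llvm/IR/VectorBuilder.h"

    // Emit llvm.vp.fadd(%a, %b, %mask, %evl) via the new builder.
    llvm::Value *emitVPFAdd(llvm::IRBuilderBase &B, llvm::Type *VecTy,
                            llvm::Value *A, llvm::Value *X,
                            llvm::Value *Mask, llvm::Value *EVL) {
      llvm::VectorBuilder VB(B); // ReportAndAbort error handling by default
      VB.setMask(Mask).setEVL(EVL);
      return VB.createVectorInstruction(llvm::Instruction::FAdd, VecTy,
                                        {A, X}, "vp.fadd");
    }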
+ Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); +}; + +} // namespace llvm + +#endif // LLVM_IR_VECTORBUILDER_H diff --git a/llvm/include/llvm/IRReader/IRReader.h b/llvm/include/llvm/IRReader/IRReader.h index a14e46e2edc8..3f2a01fdc54a 100644 --- a/llvm/include/llvm/IRReader/IRReader.h +++ b/llvm/include/llvm/IRReader/IRReader.h @@ -14,7 +14,9 @@ #ifndef LLVM_IRREADER_IRREADER_H #define LLVM_IRREADER_IRREADER_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLForwardCompat.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringRef.h" #include diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 489ef045796f..77f2c6330788 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -48,9 +48,6 @@ void initializeInstrumentation(PassRegistry&); /// Initialize all passes linked into the Analysis library. void initializeAnalysis(PassRegistry&); -/// Initialize all passes linked into the Coroutines library. -void initializeCoroutines(PassRegistry&); - /// Initialize all passes linked into the CodeGen library. void initializeCodeGen(PassRegistry&); @@ -65,9 +62,6 @@ void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&); void initializeAddFSDiscriminatorsPass(PassRegistry &); -void initializeModuleAddressSanitizerLegacyPassPass(PassRegistry &); -void initializeASanGlobalsMetadataWrapperPassPass(PassRegistry &); -void initializeAddressSanitizerLegacyPassPass(PassRegistry &); void initializeAggressiveInstCombinerLegacyPassPass(PassRegistry&); void initializeAliasSetPrinterPass(PassRegistry&); void initializeAlignmentFromAssumptionsPass(PassRegistry&); @@ -77,11 +71,11 @@ void initializeAssumeBuilderPassLegacyPassPass(PassRegistry &); void initializeAnnotation2MetadataLegacyPass(PassRegistry &); void initializeAnnotationRemarksLegacyPass(PassRegistry &); void initializeOpenMPOptCGSCCLegacyPassPass(PassRegistry &); -void initializeArgPromotionPass(PassRegistry&); void initializeAssumptionCacheTrackerPass(PassRegistry&); void initializeAtomicExpandPass(PassRegistry&); void initializeAttributorLegacyPassPass(PassRegistry&); void initializeAttributorCGSCCLegacyPassPass(PassRegistry &); +void initializeBasicBlockSectionsProfileReaderPass(PassRegistry &); void initializeBasicBlockSectionsPass(PassRegistry &); void initializeBDCELegacyPassPass(PassRegistry&); void initializeBarrierNoopPass(PassRegistry&); @@ -103,6 +97,7 @@ void initializeCFGSimplifyPassPass(PassRegistry&); void initializeCFGuardPass(PassRegistry&); void initializeCFGuardLongjmpPass(PassRegistry&); void initializeCFGViewerLegacyPassPass(PassRegistry&); +void initializeCFIFixupPass(PassRegistry&); void initializeCFIInstrInserterPass(PassRegistry&); void initializeCFLAndersAAWrapperPassPass(PassRegistry&); void initializeCFLSteensAAWrapperPassPass(PassRegistry&); @@ -137,10 +132,10 @@ void initializeDependenceAnalysisPass(PassRegistry&); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); -void initializeDomOnlyPrinterPass(PassRegistry&); -void initializeDomOnlyViewerPass(PassRegistry&); -void initializeDomPrinterPass(PassRegistry&); -void initializeDomViewerPass(PassRegistry&); +void 
initializeDomOnlyPrinterWrapperPassPass(PassRegistry &); +void initializeDomOnlyViewerWrapperPassPass(PassRegistry &); +void initializeDomPrinterWrapperPassPass(PassRegistry &); +void initializeDomViewerWrapperPassPass(PassRegistry &); void initializeDominanceFrontierWrapperPassPass(PassRegistry&); void initializeDominatorTreeWrapperPassPass(PassRegistry&); void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &); @@ -174,7 +169,6 @@ void initializeFunctionImportLegacyPassPass(PassRegistry&); void initializeFunctionSpecializationLegacyPassPass(PassRegistry &); void initializeGCMachineCodeAnalysisPass(PassRegistry&); void initializeGCModuleInfoPass(PassRegistry&); -void initializeGCOVProfilerLegacyPassPass(PassRegistry&); void initializeGVNHoistLegacyPassPass(PassRegistry&); void initializeGVNLegacyPassPass(PassRegistry&); void initializeGVNSinkLegacyPassPass(PassRegistry&); @@ -188,7 +182,6 @@ void initializeHardwareLoopsPass(PassRegistry&); void initializeMIRProfileLoaderPassPass(PassRegistry &); void initializeMemProfilerLegacyPassPass(PassRegistry &); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); -void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPSCCPLegacyPassPass(PassRegistry&); void initializeIRCELegacyPassPass(PassRegistry&); void initializeIROutlinerLegacyPassPass(PassRegistry&); @@ -215,6 +208,7 @@ void initializeInterleavedAccessPass(PassRegistry&); void initializeInterleavedLoadCombinePass(PassRegistry &); void initializeInternalizeLegacyPassPass(PassRegistry&); void initializeIntervalPartitionPass(PassRegistry&); +void initializeJMCInstrumenterPass(PassRegistry&); void initializeJumpThreadingPass(PassRegistry&); void initializeLCSSAVerificationPassPass(PassRegistry&); void initializeLCSSAWrapperPassPass(PassRegistry&); @@ -273,6 +267,7 @@ void initializeLowerAtomicLegacyPassPass(PassRegistry&); void initializeLowerConstantIntrinsicsPass(PassRegistry&); void initializeLowerEmuTLSPass(PassRegistry&); void initializeLowerExpectIntrinsicPass(PassRegistry&); +void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); void initializeLowerWidenableConditionLegacyPassPass(PassRegistry&); void initializeLowerIntrinsicsPass(PassRegistry&); @@ -316,7 +311,6 @@ void initializeMemDerefPrinterPass(PassRegistry&); void initializeMemoryDependenceWrapperPassPass(PassRegistry&); void initializeMemorySSAPrinterLegacyPassPass(PassRegistry&); void initializeMemorySSAWrapperPassPass(PassRegistry&); -void initializeMemorySanitizerLegacyPassPass(PassRegistry&); void initializeMergeFunctionsLegacyPassPass(PassRegistry&); void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); @@ -339,11 +333,6 @@ void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry&); void initializePEIPass(PassRegistry&); -void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&); -void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&); -void initializePGOInstrumentationUseLegacyPassPass(PassRegistry&); -void initializePGOInstrumentationGenCreateVarLegacyPassPass(PassRegistry&); -void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry&); void initializePHIEliminationPass(PassRegistry&); void initializePartialInlinerLegacyPassPass(PassRegistry&); void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&); @@ 
-353,10 +342,10 @@ void initializePhiValuesWrapperPassPass(PassRegistry&); void initializePhysicalRegisterUsageInfoPass(PassRegistry&); void initializePlaceBackedgeSafepointsImplPass(PassRegistry&); void initializePlaceSafepointsPass(PassRegistry&); -void initializePostDomOnlyPrinterPass(PassRegistry&); -void initializePostDomOnlyViewerPass(PassRegistry&); -void initializePostDomPrinterPass(PassRegistry&); -void initializePostDomViewerPass(PassRegistry&); +void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &); +void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &); +void initializePostDomPrinterWrapperPassPass(PassRegistry &); +void initializePostDomViewerWrapperPassPass(PassRegistry &); void initializePostDominatorTreeWrapperPassPass(PassRegistry&); void initializePostInlineEntryExitInstrumenterPass(PassRegistry&); void initializePostMachineSchedulerPass(PassRegistry&); @@ -405,6 +394,7 @@ void initializeSROALegacyPassPass(PassRegistry&); void initializeSafeStackLegacyPassPass(PassRegistry&); void initializeSafepointIRVerifierPass(PassRegistry&); void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&); +void initializeSelectOptimizePass(PassRegistry &); void initializeModuleSanitizerCoverageLegacyPassPass(PassRegistry &); void initializeScalarEvolutionWrapperPassPass(PassRegistry&); void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &); @@ -443,7 +433,7 @@ void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); -void initializeThreadSanitizerLegacyPassPass(PassRegistry&); +void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionPass(PassRegistry&); diff --git a/llvm/include/llvm/InterfaceStub/ELFObjHandler.h b/llvm/include/llvm/InterfaceStub/ELFObjHandler.h index 20a02c6d5445..c15838c4ae0a 100644 --- a/llvm/include/llvm/InterfaceStub/ELFObjHandler.h +++ b/llvm/include/llvm/InterfaceStub/ELFObjHandler.h @@ -13,16 +13,15 @@ #ifndef LLVM_INTERFACESTUB_ELFOBJHANDLER_H #define LLVM_INTERFACESTUB_ELFOBJHANDLER_H -#include "llvm/InterfaceStub/IFSStub.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/ELFTypes.h" -#include "llvm/Support/FileSystem.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBufferRef.h" +#include namespace llvm { -class MemoryBuffer; - namespace ifs { +struct IFSStub; /// Attempt to read a binary ELF file from a MemoryBuffer. Expected> readELFFile(MemoryBufferRef Buf); diff --git a/llvm/include/llvm/InterfaceStub/IFSHandler.h b/llvm/include/llvm/InterfaceStub/IFSHandler.h index 6ae6a421318e..bfa5692811d7 100644 --- a/llvm/include/llvm/InterfaceStub/IFSHandler.h +++ b/llvm/include/llvm/InterfaceStub/IFSHandler.h @@ -19,6 +19,8 @@ #include "llvm/Support/Error.h" #include "llvm/Support/VersionTuple.h" #include +#include +#include namespace llvm { @@ -51,8 +53,8 @@ Error validateIFSTarget(IFSStub &Stub, bool ParseTriple); void stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch, bool StripEndianness, bool StripBitWidth); -/// Strips symbols from IFS symbol table that are undefined. 
-void stripIFSUndefinedSymbols(IFSStub &Stub); +Error filterIFSSyms(IFSStub &Stub, bool StripUndefined, + const std::vector<std::string> &Exclude = {}); /// Parse llvm triple string into an IFSTarget struct. IFSTarget parseTriple(StringRef TripleStr); diff --git a/llvm/include/llvm/InterfaceStub/IFSStub.h b/llvm/include/llvm/InterfaceStub/IFSStub.h index 8c3cd171b1a2..0f935cd478d5 100644 --- a/llvm/include/llvm/InterfaceStub/IFSStub.h +++ b/llvm/include/llvm/InterfaceStub/IFSStub.h @@ -14,9 +14,8 @@ #ifndef LLVM_INTERFACESTUB_IFSSTUB_H #define LLVM_INTERFACESTUB_IFSSTUB_H -#include "llvm/Support/Error.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/VersionTuple.h" -#include #include namespace llvm { @@ -54,7 +53,7 @@ struct IFSSymbol { IFSSymbol() = default; explicit IFSSymbol(std::string SymbolName) : Name(std::move(SymbolName)) {} std::string Name; - uint64_t Size; + Optional<uint64_t> Size; IFSSymbolType Type; bool Undefined; bool Weak; diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index eb793d62907e..54bb82d84d96 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -57,8 +57,8 @@ struct Config { unsigned OptLevel = 2; bool DisableVerify = false; - /// Use the new pass manager - bool UseNewPM = LLVM_ENABLE_NEW_PASS_MANAGER; + /// Use the standard optimization pipeline. + bool UseDefaultPipeline = false; /// Flag to indicate that the optimizer should not assume builtins are present /// on the target. @@ -177,6 +177,10 @@ struct Config { /// Add FSAFDO discriminators. bool AddFSDiscriminator = false; + /// Use opaque pointer types. Used to call LLVMContext::setOpaquePointers + /// unless already set by the `-opaque-pointers` command-line option. + bool OpaquePointers = true; + /// If this field is set, LTO will write input file paths and symbol /// resolutions here in llvm-lto2 command line flag format. This can be /// used for testing and for running the LTO pipeline outside of the linker @@ -288,6 +292,8 @@ struct LTOLLVMContext : LLVMContext { enableDebugTypeODRUniquing(); setDiagnosticHandler( std::make_unique<LTOLLVMDiagnosticHandler>(&DiagHandler), true); + if (!hasSetOpaquePointersValue()) + setOpaquePointers(C.OpaquePointers); } DiagnosticHandlerFunction DiagHandler; }; diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 0d085a88a193..ea52226dca16 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -197,7 +197,17 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>( /// This ThinBackend runs the individual backend jobs in-process. /// The default value means to use one job per hardware core (not hyper-thread). -ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism); +/// OnWrite is a callback which receives a module identifier and notifies the +/// LTO user that the index file for the module (and optionally the imports +/// file) was created. +/// If ShouldEmitIndexFiles is true, sharded ThinLTO index files are written +/// to the same path as the input module, with the suffix ".thinlto.bc". +/// If ShouldEmitImportsFiles is true, a list of imported files is also written +/// to a similar path with ".imports" appended instead. +using IndexWriteCallback = std::function<void(const std::string &)>; +ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism, + IndexWriteCallback OnWrite = nullptr, + bool ShouldEmitIndexFiles = false, + bool ShouldEmitImportsFiles = false); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs.
This backend is for distributed builds @@ -212,7 +222,6 @@ ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism); /// the final ThinLTO linking. Can be nullptr. /// OnWrite is callback which receives module identifier and notifies LTO user /// that index file for the module (and optionally imports file) was created. -using IndexWriteCallback = std::function; ThinBackend createWriteIndexesThinBackend(std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles, diff --git a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h index 333f483f29c5..96f82a9276e0 100644 --- a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h +++ b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h @@ -184,7 +184,7 @@ struct LTOCodeGenerator { void setDisableVerify(bool Value) { Config.DisableVerify = Value; } - void setUseNewPM(bool Value) { Config.UseNewPM = Value; } + void setDebugPassManager(bool Enabled) { Config.DebugPassManager = Enabled; } void setDiagnosticHandler(lto_diagnostic_handler_t, void *); diff --git a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h index be1f3154029c..ab40d88af8c1 100644 --- a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h +++ b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h @@ -225,9 +225,6 @@ public: OptLevel = (NewOptLevel > 3) ? 3 : NewOptLevel; } - /// Enable or disable the new pass manager. - void setUseNewPM(unsigned Enabled) { UseNewPM = Enabled; } - /// Enable or disable debug output for the new pass manager. void setDebugPassManager(unsigned Enabled) { DebugPassManager = Enabled; } @@ -347,10 +344,6 @@ private: /// IR Optimization Level [0-3]. unsigned OptLevel = 3; - /// Flag to indicate whether the new pass manager should be used for IR - /// optimizations. - bool UseNewPM = LLVM_ENABLE_NEW_PASS_MANAGER; - /// Flag to indicate whether debug output should be enabled for the new pass /// manager. 
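Stepping back to the LTO.h hunk above: the extended createInProcessThinBackend can now emit the sharded index and imports files itself. A minimal sketch of wiring it up follows; the helper name makeBackend and the callback body are illustrative assumptions, not part of the patch.

// Sketch: an in-process ThinLTO backend that also writes ".thinlto.bc"
// shards and ".imports" lists, via the new optional parameters.
#include "llvm/LTO/LTO.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

llvm::lto::ThinBackend makeBackend() {
  // One backend job per hardware core, matching the documented default.
  llvm::ThreadPoolStrategy Strategy = llvm::hardware_concurrency();
  // Called once per module after its index (and imports) file is written.
  llvm::lto::IndexWriteCallback OnWrite = [](const std::string &ModuleId) {
    llvm::errs() << "index written for " << ModuleId << "\n";
  };
  return llvm::lto::createInProcessThinBackend(
      Strategy, OnWrite,
      /*ShouldEmitIndexFiles=*/true,
      /*ShouldEmitImportsFiles=*/true);
}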
bool DebugPassManager = false; diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index c8b9aaeed76a..af5926dcb38b 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -75,7 +75,6 @@ namespace { (void) llvm::createAggressiveInstCombinerPass(); (void) llvm::createBitTrackingDCEPass(); (void)llvm::createOpenMPOptCGSCCLegacyPass(); - (void) llvm::createArgumentPromotionPass(); (void) llvm::createAlignmentFromAssumptionsPass(); (void) llvm::createBasicAAWrapperPass(); (void) llvm::createSCEVAAWrapperPass(); @@ -98,16 +97,10 @@ namespace { (void) llvm::createDeadCodeEliminationPass(); (void) llvm::createDeadStoreEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); - (void) llvm::createDomOnlyPrinterPass(); - (void) llvm::createDomPrinterPass(); - (void) llvm::createDomOnlyViewerPass(); - (void) llvm::createDomViewerPass(); - (void) llvm::createGCOVProfilerPass(); - (void) llvm::createPGOInstrumentationGenLegacyPass(); - (void) llvm::createPGOInstrumentationUseLegacyPass(); - (void) llvm::createPGOInstrumentationGenCreateVarLegacyPass(); - (void) llvm::createPGOIndirectCallPromotionLegacyPass(); - (void) llvm::createPGOMemOPSizeOptLegacyPass(); + (void) llvm::createDomOnlyPrinterWrapperPassPass(); + (void) llvm::createDomPrinterWrapperPassPass(); + (void) llvm::createDomOnlyViewerWrapperPassPass(); + (void) llvm::createDomViewerWrapperPassPass(); (void) llvm::createInstrProfilingLegacyPass(); (void) llvm::createFunctionImportPass(); (void) llvm::createFunctionInliningPass(); @@ -123,6 +116,7 @@ namespace { (void) llvm::createInstSimplifyLegacyPass(); (void) llvm::createInstructionCombiningPass(); (void) llvm::createInternalizePass(); + (void) llvm::createJMCInstrumenterPass(); (void) llvm::createLCSSAPass(); (void) llvm::createLegacyDivergenceAnalysisPass(); (void) llvm::createLICMPass(); @@ -138,12 +132,12 @@ namespace { (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); (void) llvm::createLoopUnrollAndJamPass(); - (void) llvm::createLoopUnswitchPass(); (void) llvm::createLoopVersioningLICMPass(); (void) llvm::createLoopIdiomPass(); (void) llvm::createLoopRotatePass(); (void) llvm::createLowerConstantIntrinsicsPass(); (void) llvm::createLowerExpectIntrinsicPass(); + (void) llvm::createLowerGlobalDtorsLegacyPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); (void) llvm::createNaryReassociatePass(); @@ -156,10 +150,10 @@ namespace { (void) llvm::createPromoteMemoryToRegisterPass(); (void) llvm::createDemoteRegisterToMemoryPass(); (void) llvm::createPruneEHPass(); - (void) llvm::createPostDomOnlyPrinterPass(); - (void) llvm::createPostDomPrinterPass(); - (void) llvm::createPostDomOnlyViewerPass(); - (void) llvm::createPostDomViewerPass(); + (void)llvm::createPostDomOnlyPrinterWrapperPassPass(); + (void)llvm::createPostDomPrinterWrapperPassPass(); + (void)llvm::createPostDomOnlyViewerWrapperPassPass(); + (void)llvm::createPostDomViewerWrapperPassPass(); (void) llvm::createReassociatePass(); (void) llvm::createRedundantDbgInstEliminationPass(); (void) llvm::createRegionInfoPass(); @@ -176,6 +170,7 @@ namespace { (void) llvm::createStripDeadDebugInfoPass(); (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); + (void)llvm::createTLSVariableHoistPass(); (void) llvm::createJumpThreadingPass(); (void) llvm::createDFAJumpThreadingPass(); (void) llvm::createUnifyFunctionExitNodesPass(); @@ -236,6 +231,7 @@ namespace { (void) 
llvm::createUnifyLoopExitsPass(); (void) llvm::createFixIrreduciblePass(); (void)llvm::createFunctionSpecializationPass(); + (void)llvm::createSelectOptimizePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/llvm/include/llvm/Linker/IRMover.h b/llvm/include/llvm/Linker/IRMover.h index e5df83f01fe3..1e3c5394ffa2 100644 --- a/llvm/include/llvm/Linker/IRMover.h +++ b/llvm/include/llvm/Linker/IRMover.h @@ -11,6 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FunctionExtras.h" #include <functional> namespace llvm { @@ -62,6 +63,8 @@ public: IRMover(Module &M); typedef std::function<void(GlobalValue &)> ValueAdder; + using LazyCallback = + llvm::unique_function<void(GlobalValue &GV, ValueAdder Add)>; /// Move in the provided values in \p ValuesToLink from \p Src. /// @@ -70,11 +73,11 @@ public: /// not present in ValuesToLink. The GlobalValue and a ValueAdder callback /// are passed as an argument, and the callback is expected to be called /// if the GlobalValue needs to be added to the \p ValuesToLink and linked. + /// Pass nullptr if there's no work to be done in such cases. /// - \p IsPerformingImport is true when this IR link is to perform ThinLTO /// function importing from Src. Error move(std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink, - std::function<void(GlobalValue &GV, ValueAdder Add)> AddLazyFor, - bool IsPerformingImport); + LazyCallback AddLazyFor, bool IsPerformingImport); Module &getModule() { return Composite; } private: diff --git a/llvm/include/llvm/MC/ConstantPools.h b/llvm/include/llvm/MC/ConstantPools.h index 9fe0cce8d68c..7eac75362eff 100644 --- a/llvm/include/llvm/MC/ConstantPools.h +++ b/llvm/include/llvm/MC/ConstantPools.h @@ -43,7 +43,8 @@ struct ConstantPoolEntry { class ConstantPool { using EntryVecTy = SmallVector<ConstantPoolEntry, 4>; EntryVecTy Entries; - std::map<int64_t, const MCSymbolRefExpr *> CachedEntries; + std::map<int64_t, const MCSymbolRefExpr *> CachedConstantEntries; + DenseMap<const MCSymbol *, const MCSymbolRefExpr *> CachedSymbolEntries; public: // Initialize a new empty constant pool diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h index bb57c3453d10..a5e7b3f504f5 100644 --- a/llvm/include/llvm/MC/MCAsmBackend.h +++ b/llvm/include/llvm/MC/MCAsmBackend.h @@ -13,12 +13,17 @@ #include "llvm/ADT/Optional.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCFragment.h" #include "llvm/Support/Endian.h" #include namespace llvm { +class MCAlignFragment; +class MCDwarfCallFrameFragment; +class MCDwarfLineAddrFragment; +class MCFragment; +class MCRelaxableFragment; +class MCSymbol; class MCAsmLayout; class MCAssembler; class MCCFIInstruction; @@ -31,6 +36,7 @@ class MCSubtargetInfo; class MCValue; class raw_pwrite_stream; class StringRef; +class raw_ostream; /// Generic interface to target specific assembler backends. class MCAsmBackend { diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 355f569861d8..ec17131e17e8 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -430,6 +430,10 @@ protected: /// hidden visibility. Defaults to MCSA_Hidden. MCSymbolAttr HiddenVisibilityAttr = MCSA_Hidden; + /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having + /// exported visibility. Defaults to MCSA_Exported. + MCSymbolAttr ExportedVisibilityAttr = MCSA_Exported; + /// This attribute, if not MCSA_Invalid, is used to declare an undefined /// symbol as having hidden visibility. Defaults to MCSA_Hidden. MCSymbolAttr HiddenDeclarationVisibilityAttr = MCSA_Hidden; @@ -466,6 +470,10 @@ protected: /// the .loc/.file directives. Defaults to true.
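The IRMover.h hunk above turns AddLazyFor into a move-only llvm::unique_function (LazyCallback) that may also be passed as nullptr. A minimal sketch of a caller; linkValues is an illustrative name and the module setup is assumed to happen elsewhere.

// Sketch: calling IRMover::move with the new move-only callback type.
#include "llvm/IR/Module.h"
#include "llvm/Linker/IRMover.h"
#include "llvm/Support/Error.h"
#include <memory>

using namespace llvm;

Error linkValues(IRMover &Mover, std::unique_ptr<Module> Src,
                 ArrayRef<GlobalValue *> ValuesToLink) {
  // LazyCallback is a unique_function, so move-only captures are allowed;
  // pass nullptr instead when no lazy linking work is needed.
  IRMover::LazyCallback AddLazyFor =
      [](GlobalValue &GV, IRMover::ValueAdder Add) {
        // Decide here whether GV must be pulled in; illustrative no-op.
      };
  return Mover.move(std::move(Src), ValuesToLink, std::move(AddLazyFor),
                    /*IsPerformingImport=*/false);
}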
bool UsesDwarfFileAndLocDirectives = true; + /// True if DWARF `.file directory' directive syntax is used by + /// default. + bool EnableDwarfFileDirectoryDefault = true; + /// True if the target needs the DWARF section length in the header (if any) /// of the DWARF section in the assembly file. Defaults to true. bool DwarfSectionSizeRequired = true; @@ -478,6 +486,10 @@ protected: /// For example, foo(plt) instead of foo@plt. Defaults to false. bool UseParensForSymbolVariant = false; + /// True if the target uses parens for symbol names starting with + /// '$' character to distinguish them from absolute names. + bool UseParensForDollarSignNames = true; + /// True if the target supports flags in ".loc" directive, false if only /// location is allowed. bool SupportsExtendedDwarfLocDirective = true; @@ -671,6 +683,7 @@ public: const char *getCode64Directive() const { return Code64Directive; } unsigned getAssemblerDialect() const { return AssemblerDialect; } bool doesAllowAtInName() const { return AllowAtInName; } + void setAllowAtInName(bool V) { AllowAtInName = V; } bool doesAllowQuestionAtStartOfIdentifier() const { return AllowQuestionAtStartOfIdentifier; } @@ -749,6 +762,8 @@ public: MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr; } + MCSymbolAttr getExportedVisibilityAttr() const { return ExportedVisibilityAttr; } + MCSymbolAttr getHiddenDeclarationVisibilityAttr() const { return HiddenDeclarationVisibilityAttr; } @@ -788,6 +803,9 @@ public: bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; } bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; } bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; } + bool useParensForDollarSignNames() const { + return UseParensForDollarSignNames; + } bool supportsExtendedDwarfLocDirective() const { return SupportsExtendedDwarfLocDirective; } @@ -800,6 +818,10 @@ public: return DwarfSectionSizeRequired; } + bool enableDwarfFileDirectoryDefault() const { + return EnableDwarfFileDirectoryDefault; + } + void addInitialFrameState(const MCCFIInstruction &Inst); const std::vector &getInitialFrameState() const { diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 9d5cb620c9de..80aa97c315da 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -10,7 +10,6 @@ #define LLVM_MC_MCASSEMBLER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" @@ -18,20 +17,34 @@ #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCFragment.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/VersionTuple.h" +#include #include #include #include +#include #include +#include #include #include namespace llvm { +class MCBoundaryAlignFragment; +class MCCVDefRangeFragment; +class MCCVInlineLineTableFragment; +class MCDwarfCallFrameFragment; +class MCDwarfLineAddrFragment; +class MCEncodedFragment; +class MCFixup; +class MCLEBFragment; +class MCPseudoProbeAddrFragment; +class MCRelaxableFragment; +class MCSymbolRefExpr; +class raw_ostream; class MCAsmBackend; class MCAsmLayout; class MCContext; diff --git a/llvm/include/llvm/MC/MCCodeView.h b/llvm/include/llvm/MC/MCCodeView.h index 5770f370341d..3d15c4009e43 100644 --- 
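Several of the MCAsmInfo additions above are plain protected knobs that a target's AsmInfo subclass flips in its constructor. A hypothetical sketch, assuming such a subclass; ExampleAsmInfo is not a real target and the chosen values are illustrative.

// Sketch: a hypothetical target MCAsmInfo exercising the new knobs.
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDirectives.h"

namespace {
class ExampleAsmInfo : public llvm::MCAsmInfo {
public:
  ExampleAsmInfo() {
    // Declare exported symbols via the new visibility attribute hook.
    ExportedVisibilityAttr = llvm::MCSA_Exported;
    // Print $-prefixed names as ($name) to keep them distinct from
    // absolute expressions.
    UseParensForDollarSignNames = true;
    // Opt out of the DWARF `.file directory' directive syntax by default.
    EnableDwarfFileDirectoryDefault = false;
  }
};
} // namespace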
a/llvm/include/llvm/MC/MCCodeView.h +++ b/llvm/include/llvm/MC/MCCodeView.h @@ -13,18 +13,25 @@ #ifndef LLVM_MC_MCCODEVIEW_H #define LLVM_MC_MCCODEVIEW_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCFragment.h" -#include "llvm/MC/MCObjectStreamer.h" #include #include namespace llvm { +class MCAsmLayout; +class MCCVDefRangeFragment; +class MCCVInlineLineTableFragment; +class MCDataFragment; +class MCFragment; +class MCSection; +class MCSymbol; class MCContext; class MCObjectStreamer; class MCStreamer; -class CodeViewContext; /// Instances of this class represent the information from a /// .cv_loc directive. diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index d2307d692278..a0e18891ed90 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -13,18 +13,15 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCAsmMacro.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCPseudoProbe.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" @@ -44,798 +41,825 @@ namespace llvm { - class CodeViewContext; - class MCAsmInfo; - class MCLabel; - class MCObjectFileInfo; - class MCRegisterInfo; - class MCSection; - class MCSectionCOFF; - class MCSectionELF; - class MCSectionGOFF; - class MCSectionMachO; - class MCSectionWasm; - class MCSectionXCOFF; - class MCStreamer; - class MCSymbol; - class MCSymbolELF; - class MCSymbolWasm; - class MCSymbolXCOFF; - class MDNode; - class SMDiagnostic; - class SMLoc; - class SourceMgr; - - /// Context object for machine code objects. This class owns all of the - /// sections that it creates. - /// - class MCContext { - public: - using SymbolTable = StringMap; - using DiagHandlerTy = - std::function &)>; - enum Environment { IsMachO, IsELF, IsGOFF, IsCOFF, IsWasm, IsXCOFF }; - - private: - Environment Env; - - /// The name of the Segment where Swift5 Reflection Section data will be - /// outputted - StringRef Swift5ReflectionSegmentName; - - /// The triple for this object. - Triple TT; - - /// The SourceMgr for this object, if any. - const SourceMgr *SrcMgr; - - /// The SourceMgr for inline assembly, if any. - std::unique_ptr InlineSrcMgr; - std::vector LocInfos; - - DiagHandlerTy DiagHandler; - - /// The MCAsmInfo for this target. - const MCAsmInfo *MAI; - - /// The MCRegisterInfo for this target. - const MCRegisterInfo *MRI; - - /// The MCObjectFileInfo for this target. - const MCObjectFileInfo *MOFI; - - /// The MCSubtargetInfo for this target. - const MCSubtargetInfo *MSTI; - - std::unique_ptr CVContext; - - /// Allocator object used for creating machine code objects. - /// - /// We use a bump pointer allocator to avoid the need to track all allocated - /// objects. 
- BumpPtrAllocator Allocator; - - SpecificBumpPtrAllocator COFFAllocator; - SpecificBumpPtrAllocator ELFAllocator; - SpecificBumpPtrAllocator MachOAllocator; - SpecificBumpPtrAllocator GOFFAllocator; - SpecificBumpPtrAllocator WasmAllocator; - SpecificBumpPtrAllocator XCOFFAllocator; - SpecificBumpPtrAllocator MCInstAllocator; - - /// Bindings of names to symbols. - SymbolTable Symbols; - - /// A mapping from a local label number and an instance count to a symbol. - /// For example, in the assembly - /// 1: - /// 2: - /// 1: - /// We have three labels represented by the pairs (1, 0), (2, 0) and (1, 1) - DenseMap, MCSymbol *> LocalSymbols; - - /// Keeps tracks of names that were used both for used declared and - /// artificial symbols. The value is "true" if the name has been used for a - /// non-section symbol (there can be at most one of those, plus an unlimited - /// number of section symbols with the same name). - StringMap UsedNames; - - /// Keeps track of labels that are used in inline assembly. - SymbolTable InlineAsmUsedLabelNames; - - /// The next ID to dole out to an unnamed assembler temporary symbol with - /// a given prefix. - StringMap NextID; - - /// Instances of directional local labels. - DenseMap Instances; - /// NextInstance() creates the next instance of the directional local label - /// for the LocalLabelVal and adds it to the map if needed. - unsigned NextInstance(unsigned LocalLabelVal); - /// GetInstance() gets the current instance of the directional local label - /// for the LocalLabelVal and adds it to the map if needed. - unsigned GetInstance(unsigned LocalLabelVal); - - /// The file name of the log file from the environment variable - /// AS_SECURE_LOG_FILE. Which must be set before the .secure_log_unique - /// directive is used or it is an error. - char *SecureLogFile; - /// The stream that gets written to for the .secure_log_unique directive. - std::unique_ptr SecureLog; - /// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to - /// catch errors if .secure_log_unique appears twice without - /// .secure_log_reset appearing between them. - bool SecureLogUsed = false; - - /// The compilation directory to use for DW_AT_comp_dir. - SmallString<128> CompilationDir; - - /// Prefix replacement map for source file information. - std::map DebugPrefixMap; - - /// The main file name if passed in explicitly. - std::string MainFileName; - - /// The dwarf file and directory tables from the dwarf .file directive. - /// We now emit a line table for each compile unit. To reduce the prologue - /// size of each line table, the files and directories used by each compile - /// unit are separated. - std::map MCDwarfLineTablesCUMap; - - /// The current dwarf line information from the last dwarf .loc directive. - MCDwarfLoc CurrentDwarfLoc; - bool DwarfLocSeen = false; - - /// Generate dwarf debugging info for assembly source files. - bool GenDwarfForAssembly = false; - - /// The current dwarf file number when generate dwarf debugging info for - /// assembly source files. - unsigned GenDwarfFileNumber = 0; - - /// Sections for generating the .debug_ranges and .debug_aranges sections. - SetVector SectionsForRanges; - - /// The information gathered from labels that will have dwarf label - /// entries when generating dwarf assembly source files. - std::vector MCGenDwarfLabelEntries; - - /// The string to embed in the debug information for the compile unit, if - /// non-empty. 
- StringRef DwarfDebugFlags; - - /// The string to embed in as the dwarf AT_producer for the compile unit, if - /// non-empty. - StringRef DwarfDebugProducer; - - /// The maximum version of dwarf that we should emit. - uint16_t DwarfVersion = 4; - - /// The format of dwarf that we emit. - dwarf::DwarfFormat DwarfFormat = dwarf::DWARF32; - - /// Honor temporary labels, this is useful for debugging semantic - /// differences between temporary and non-temporary labels (primarily on - /// Darwin). - bool AllowTemporaryLabels = true; - bool UseNamesOnTempLabels = false; - - /// The Compile Unit ID that we are currently processing. - unsigned DwarfCompileUnitID = 0; - - /// A collection of MCPseudoProbe in the current module - MCPseudoProbeTable PseudoProbeTable; - - // Sections are differentiated by the quadruple (section_name, group_name, - // unique_id, link_to_symbol_name). Sections sharing the same quadruple are - // combined into one section. - struct ELFSectionKey { - std::string SectionName; - StringRef GroupName; - StringRef LinkedToName; - unsigned UniqueID; - - ELFSectionKey(StringRef SectionName, StringRef GroupName, - StringRef LinkedToName, unsigned UniqueID) - : SectionName(SectionName), GroupName(GroupName), - LinkedToName(LinkedToName), UniqueID(UniqueID) {} - - bool operator<(const ELFSectionKey &Other) const { - if (SectionName != Other.SectionName) - return SectionName < Other.SectionName; - if (GroupName != Other.GroupName) - return GroupName < Other.GroupName; - if (int O = LinkedToName.compare(Other.LinkedToName)) - return O < 0; - return UniqueID < Other.UniqueID; - } - }; - - struct COFFSectionKey { - std::string SectionName; - StringRef GroupName; - int SelectionKey; - unsigned UniqueID; - - COFFSectionKey(StringRef SectionName, StringRef GroupName, - int SelectionKey, unsigned UniqueID) - : SectionName(SectionName), GroupName(GroupName), - SelectionKey(SelectionKey), UniqueID(UniqueID) {} - - bool operator<(const COFFSectionKey &Other) const { - if (SectionName != Other.SectionName) - return SectionName < Other.SectionName; - if (GroupName != Other.GroupName) - return GroupName < Other.GroupName; - if (SelectionKey != Other.SelectionKey) - return SelectionKey < Other.SelectionKey; - return UniqueID < Other.UniqueID; - } - }; - - struct WasmSectionKey { - std::string SectionName; - StringRef GroupName; - unsigned UniqueID; - - WasmSectionKey(StringRef SectionName, StringRef GroupName, - unsigned UniqueID) - : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) { - } - - bool operator<(const WasmSectionKey &Other) const { - if (SectionName != Other.SectionName) - return SectionName < Other.SectionName; - if (GroupName != Other.GroupName) - return GroupName < Other.GroupName; - return UniqueID < Other.UniqueID; - } - }; - - struct XCOFFSectionKey { - // Section name. - std::string SectionName; - // Section property. - // For csect section, it is storage mapping class. - // For debug section, it is section type flags. 
- union { - XCOFF::StorageMappingClass MappingClass; - XCOFF::DwarfSectionSubtypeFlags DwarfSubtypeFlags; - }; - bool IsCsect; - - XCOFFSectionKey(StringRef SectionName, - XCOFF::StorageMappingClass MappingClass) - : SectionName(SectionName), MappingClass(MappingClass), - IsCsect(true) {} - - XCOFFSectionKey(StringRef SectionName, - XCOFF::DwarfSectionSubtypeFlags DwarfSubtypeFlags) - : SectionName(SectionName), DwarfSubtypeFlags(DwarfSubtypeFlags), - IsCsect(false) {} - - bool operator<(const XCOFFSectionKey &Other) const { - if (IsCsect && Other.IsCsect) - return std::tie(SectionName, MappingClass) < - std::tie(Other.SectionName, Other.MappingClass); - if (IsCsect != Other.IsCsect) - return IsCsect; - return std::tie(SectionName, DwarfSubtypeFlags) < - std::tie(Other.SectionName, Other.DwarfSubtypeFlags); - } - }; - - StringMap MachOUniquingMap; - std::map ELFUniquingMap; - std::map COFFUniquingMap; - std::map GOFFUniquingMap; - std::map WasmUniquingMap; - std::map XCOFFUniquingMap; - StringMap RelSecNames; - - SpecificBumpPtrAllocator MCSubtargetAllocator; +class CodeViewContext; +class MCAsmInfo; +class MCInst; +class MCLabel; +class MCObjectFileInfo; +class MCRegisterInfo; +class MCSection; +class MCSectionCOFF; +class MCSectionDXContainer; +class MCSectionELF; +class MCSectionGOFF; +class MCSectionMachO; +class MCSectionSPIRV; +class MCSectionWasm; +class MCSectionXCOFF; +class MCStreamer; +class MCSubtargetInfo; +class MCSymbol; +class MCSymbolELF; +class MCSymbolWasm; +class MCSymbolXCOFF; +class MCTargetOptions; +class MDNode; +template class SmallVectorImpl; +class SMDiagnostic; +class SMLoc; +class SourceMgr; +enum class EmitDwarfUnwindType; + +/// Context object for machine code objects. This class owns all of the +/// sections that it creates. +/// +class MCContext { +public: + using SymbolTable = StringMap; + using DiagHandlerTy = + std::function &)>; + enum Environment { + IsMachO, + IsELF, + IsGOFF, + IsCOFF, + IsSPIRV, + IsWasm, + IsXCOFF, + IsDXContainer + }; - /// Do automatic reset in destructor - bool AutoReset; +private: + Environment Env; - MCTargetOptions const *TargetOptions; + /// The name of the Segment where Swift5 Reflection Section data will be + /// outputted + StringRef Swift5ReflectionSegmentName; - bool HadError = false; + /// The triple for this object. + Triple TT; - void reportCommon(SMLoc Loc, - std::function); + /// The SourceMgr for this object, if any. + const SourceMgr *SrcMgr; - MCSymbol *createSymbolImpl(const StringMapEntry *Name, - bool CanBeUnnamed); - MCSymbol *createSymbol(StringRef Name, bool AlwaysAddSuffix, - bool IsTemporary); + /// The SourceMgr for inline assembly, if any. + std::unique_ptr InlineSrcMgr; + std::vector LocInfos; - MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal, - unsigned Instance); + DiagHandlerTy DiagHandler; - MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type, - unsigned Flags, SectionKind K, - unsigned EntrySize, - const MCSymbolELF *Group, bool IsComdat, - unsigned UniqueID, - const MCSymbolELF *LinkedToSym); + /// The MCAsmInfo for this target. + const MCAsmInfo *MAI; - MCSymbolXCOFF *createXCOFFSymbolImpl(const StringMapEntry *Name, - bool IsTemporary); + /// The MCRegisterInfo for this target. + const MCRegisterInfo *MRI; - /// Map of currently defined macros. - StringMap MacroMap; + /// The MCObjectFileInfo for this target. 
+ const MCObjectFileInfo *MOFI; - struct ELFEntrySizeKey { - std::string SectionName; - unsigned Flags; - unsigned EntrySize; + /// The MCSubtargetInfo for this target. + const MCSubtargetInfo *MSTI; - ELFEntrySizeKey(StringRef SectionName, unsigned Flags, unsigned EntrySize) - : SectionName(SectionName), Flags(Flags), EntrySize(EntrySize) {} + std::unique_ptr CVContext; - bool operator<(const ELFEntrySizeKey &Other) const { - if (SectionName != Other.SectionName) - return SectionName < Other.SectionName; - if (Flags != Other.Flags) - return Flags < Other.Flags; - return EntrySize < Other.EntrySize; - } - }; - - // Symbols must be assigned to a section with a compatible entry size and - // flags. This map is used to assign unique IDs to sections to distinguish - // between sections with identical names but incompatible entry sizes and/or - // flags. This can occur when a symbol is explicitly assigned to a section, - // e.g. via __attribute__((section("myname"))). - std::map ELFEntrySizeMap; - - // This set is used to record the generic mergeable section names seen. - // These are sections that are created as mergeable e.g. .debug_str. We need - // to avoid assigning non-mergeable symbols to these sections. It is used - // to prevent non-mergeable symbols being explicitly assigned to mergeable - // sections (e.g. via _attribute_((section("myname")))). - DenseSet ELFSeenGenericMergeableSections; - - public: - explicit MCContext(const Triple &TheTriple, const MCAsmInfo *MAI, - const MCRegisterInfo *MRI, const MCSubtargetInfo *MSTI, - const SourceMgr *Mgr = nullptr, - MCTargetOptions const *TargetOpts = nullptr, - bool DoAutoReset = true, - StringRef Swift5ReflSegmentName = {}); - MCContext(const MCContext &) = delete; - MCContext &operator=(const MCContext &) = delete; - ~MCContext(); - - Environment getObjectFileType() const { return Env; } - - const StringRef &getSwift5ReflectionSegmentName() const { - return Swift5ReflectionSegmentName; + /// Allocator object used for creating machine code objects. + /// + /// We use a bump pointer allocator to avoid the need to track all allocated + /// objects. + BumpPtrAllocator Allocator; + + SpecificBumpPtrAllocator COFFAllocator; + SpecificBumpPtrAllocator DXCAllocator; + SpecificBumpPtrAllocator ELFAllocator; + SpecificBumpPtrAllocator MachOAllocator; + SpecificBumpPtrAllocator GOFFAllocator; + SpecificBumpPtrAllocator SPIRVAllocator; + SpecificBumpPtrAllocator WasmAllocator; + SpecificBumpPtrAllocator XCOFFAllocator; + SpecificBumpPtrAllocator MCInstAllocator; + + /// Bindings of names to symbols. + SymbolTable Symbols; + + /// A mapping from a local label number and an instance count to a symbol. + /// For example, in the assembly + /// 1: + /// 2: + /// 1: + /// We have three labels represented by the pairs (1, 0), (2, 0) and (1, 1) + DenseMap, MCSymbol *> LocalSymbols; + + /// Keeps tracks of names that were used both for used declared and + /// artificial symbols. The value is "true" if the name has been used for a + /// non-section symbol (there can be at most one of those, plus an unlimited + /// number of section symbols with the same name). + StringMap UsedNames; + + /// Keeps track of labels that are used in inline assembly. + SymbolTable InlineAsmUsedLabelNames; + + /// The next ID to dole out to an unnamed assembler temporary symbol with + /// a given prefix. + StringMap NextID; + + /// Instances of directional local labels. 
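With IsSPIRV and IsDXContainer added to MCContext::Environment, clients that dispatch on the object format need two more cases. A small sketch; the helper name objectFormatName is illustrative.

// Sketch: exhaustive dispatch over the extended Environment enum.
#include "llvm/MC/MCContext.h"
#include "llvm/Support/ErrorHandling.h"

static const char *objectFormatName(const llvm::MCContext &Ctx) {
  switch (Ctx.getObjectFileType()) {
  case llvm::MCContext::IsMachO:       return "Mach-O";
  case llvm::MCContext::IsELF:         return "ELF";
  case llvm::MCContext::IsGOFF:        return "GOFF";
  case llvm::MCContext::IsCOFF:        return "COFF";
  case llvm::MCContext::IsSPIRV:       return "SPIR-V";
  case llvm::MCContext::IsWasm:        return "Wasm";
  case llvm::MCContext::IsXCOFF:       return "XCOFF";
  case llvm::MCContext::IsDXContainer: return "DXContainer";
  }
  llvm_unreachable("unknown object file type");
}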
+ DenseMap Instances; + /// NextInstance() creates the next instance of the directional local label + /// for the LocalLabelVal and adds it to the map if needed. + unsigned NextInstance(unsigned LocalLabelVal); + /// GetInstance() gets the current instance of the directional local label + /// for the LocalLabelVal and adds it to the map if needed. + unsigned GetInstance(unsigned LocalLabelVal); + + /// LLVM_BB_ADDR_MAP version to emit. + uint8_t BBAddrMapVersion = 1; + + /// The file name of the log file from the environment variable + /// AS_SECURE_LOG_FILE. Which must be set before the .secure_log_unique + /// directive is used or it is an error. + char *SecureLogFile; + /// The stream that gets written to for the .secure_log_unique directive. + std::unique_ptr SecureLog; + /// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to + /// catch errors if .secure_log_unique appears twice without + /// .secure_log_reset appearing between them. + bool SecureLogUsed = false; + + /// The compilation directory to use for DW_AT_comp_dir. + SmallString<128> CompilationDir; + + /// Prefix replacement map for source file information. + std::map DebugPrefixMap; + + /// The main file name if passed in explicitly. + std::string MainFileName; + + /// The dwarf file and directory tables from the dwarf .file directive. + /// We now emit a line table for each compile unit. To reduce the prologue + /// size of each line table, the files and directories used by each compile + /// unit are separated. + std::map MCDwarfLineTablesCUMap; + + /// The current dwarf line information from the last dwarf .loc directive. + MCDwarfLoc CurrentDwarfLoc; + bool DwarfLocSeen = false; + + /// Generate dwarf debugging info for assembly source files. + bool GenDwarfForAssembly = false; + + /// The current dwarf file number when generate dwarf debugging info for + /// assembly source files. + unsigned GenDwarfFileNumber = 0; + + /// Sections for generating the .debug_ranges and .debug_aranges sections. + SetVector SectionsForRanges; + + /// The information gathered from labels that will have dwarf label + /// entries when generating dwarf assembly source files. + std::vector MCGenDwarfLabelEntries; + + /// The string to embed in the debug information for the compile unit, if + /// non-empty. + StringRef DwarfDebugFlags; + + /// The string to embed in as the dwarf AT_producer for the compile unit, if + /// non-empty. + StringRef DwarfDebugProducer; + + /// The maximum version of dwarf that we should emit. + uint16_t DwarfVersion = 4; + + /// The format of dwarf that we emit. + dwarf::DwarfFormat DwarfFormat = dwarf::DWARF32; + + /// Honor temporary labels, this is useful for debugging semantic + /// differences between temporary and non-temporary labels (primarily on + /// Darwin). + bool AllowTemporaryLabels = true; + bool UseNamesOnTempLabels = false; + + /// The Compile Unit ID that we are currently processing. + unsigned DwarfCompileUnitID = 0; + + /// A collection of MCPseudoProbe in the current module + MCPseudoProbeTable PseudoProbeTable; + + // Sections are differentiated by the quadruple (section_name, group_name, + // unique_id, link_to_symbol_name). Sections sharing the same quadruple are + // combined into one section. 
+ struct ELFSectionKey { + std::string SectionName; + StringRef GroupName; + StringRef LinkedToName; + unsigned UniqueID; + + ELFSectionKey(StringRef SectionName, StringRef GroupName, + StringRef LinkedToName, unsigned UniqueID) + : SectionName(SectionName), GroupName(GroupName), + LinkedToName(LinkedToName), UniqueID(UniqueID) {} + + bool operator<(const ELFSectionKey &Other) const { + if (SectionName != Other.SectionName) + return SectionName < Other.SectionName; + if (GroupName != Other.GroupName) + return GroupName < Other.GroupName; + if (int O = LinkedToName.compare(Other.LinkedToName)) + return O < 0; + return UniqueID < Other.UniqueID; } - const Triple &getTargetTriple() const { return TT; } - const SourceMgr *getSourceManager() const { return SrcMgr; } + }; - void initInlineSourceManager(); - SourceMgr *getInlineSourceManager() { - return InlineSrcMgr.get(); - } - std::vector &getLocInfos() { return LocInfos; } - void setDiagnosticHandler(DiagHandlerTy DiagHandler) { - this->DiagHandler = DiagHandler; + struct COFFSectionKey { + std::string SectionName; + StringRef GroupName; + int SelectionKey; + unsigned UniqueID; + + COFFSectionKey(StringRef SectionName, StringRef GroupName, int SelectionKey, + unsigned UniqueID) + : SectionName(SectionName), GroupName(GroupName), + SelectionKey(SelectionKey), UniqueID(UniqueID) {} + + bool operator<(const COFFSectionKey &Other) const { + if (SectionName != Other.SectionName) + return SectionName < Other.SectionName; + if (GroupName != Other.GroupName) + return GroupName < Other.GroupName; + if (SelectionKey != Other.SelectionKey) + return SelectionKey < Other.SelectionKey; + return UniqueID < Other.UniqueID; } + }; - void setObjectFileInfo(const MCObjectFileInfo *Mofi) { MOFI = Mofi; } - - const MCAsmInfo *getAsmInfo() const { return MAI; } - - const MCRegisterInfo *getRegisterInfo() const { return MRI; } - - const MCObjectFileInfo *getObjectFileInfo() const { return MOFI; } - - const MCSubtargetInfo *getSubtargetInfo() const { return MSTI; } - - CodeViewContext &getCVContext(); - - void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; } - void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; } - - /// \name Module Lifetime Management - /// @{ - - /// reset - return object to right after construction state to prepare - /// to process a new module - void reset(); - - /// @} - - /// \name McInst Management - - /// Create and return a new MC instruction. - MCInst *createMCInst(); - - /// \name Symbol Management - /// @{ - - /// Create and return a new linker temporary symbol with a unique but - /// unspecified name. - MCSymbol *createLinkerPrivateTempSymbol(); - - /// Create a temporary symbol with a unique name. The name will be omitted - /// in the symbol table if UseNamesOnTempLabels is false (default except - /// MCAsmStreamer). The overload without Name uses an unspecified name. - MCSymbol *createTempSymbol(); - MCSymbol *createTempSymbol(const Twine &Name, bool AlwaysAddSuffix = true); - - /// Create a temporary symbol with a unique name whose name cannot be - /// omitted in the symbol table. This is rarely used. - MCSymbol *createNamedTempSymbol(); - MCSymbol *createNamedTempSymbol(const Twine &Name); - - /// Create the definition of a directional local symbol for numbered label - /// (used for "1:" definitions). - MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal); - - /// Create and return a directional local symbol for numbered label (used - /// for "1b" or 1f" references). 
- MCSymbol *getDirectionalLocalSymbol(unsigned LocalLabelVal, bool Before); - - /// Lookup the symbol inside with the specified \p Name. If it exists, - /// return it. If not, create a forward reference and return it. - /// - /// \param Name - The symbol name, which must be unique across all symbols. - MCSymbol *getOrCreateSymbol(const Twine &Name); + struct WasmSectionKey { + std::string SectionName; + StringRef GroupName; + unsigned UniqueID; + + WasmSectionKey(StringRef SectionName, StringRef GroupName, + unsigned UniqueID) + : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {} + + bool operator<(const WasmSectionKey &Other) const { + if (SectionName != Other.SectionName) + return SectionName < Other.SectionName; + if (GroupName != Other.GroupName) + return GroupName < Other.GroupName; + return UniqueID < Other.UniqueID; + } + }; - /// Gets a symbol that will be defined to the final stack offset of a local - /// variable after codegen. - /// - /// \param Idx - The index of a local variable passed to \@llvm.localescape. - MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx); + struct XCOFFSectionKey { + // Section name. + std::string SectionName; + // Section property. + // For csect section, it is storage mapping class. + // For debug section, it is section type flags. + union { + XCOFF::StorageMappingClass MappingClass; + XCOFF::DwarfSectionSubtypeFlags DwarfSubtypeFlags; + }; + bool IsCsect; + + XCOFFSectionKey(StringRef SectionName, + XCOFF::StorageMappingClass MappingClass) + : SectionName(SectionName), MappingClass(MappingClass), IsCsect(true) {} + + XCOFFSectionKey(StringRef SectionName, + XCOFF::DwarfSectionSubtypeFlags DwarfSubtypeFlags) + : SectionName(SectionName), DwarfSubtypeFlags(DwarfSubtypeFlags), + IsCsect(false) {} + + bool operator<(const XCOFFSectionKey &Other) const { + if (IsCsect && Other.IsCsect) + return std::tie(SectionName, MappingClass) < + std::tie(Other.SectionName, Other.MappingClass); + if (IsCsect != Other.IsCsect) + return IsCsect; + return std::tie(SectionName, DwarfSubtypeFlags) < + std::tie(Other.SectionName, Other.DwarfSubtypeFlags); + } + }; - MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName); + StringMap MachOUniquingMap; + std::map ELFUniquingMap; + std::map COFFUniquingMap; + std::map GOFFUniquingMap; + std::map WasmUniquingMap; + std::map XCOFFUniquingMap; + StringMap DXCUniquingMap; + StringMap RelSecNames; - MCSymbol *getOrCreateLSDASymbol(StringRef FuncName); + SpecificBumpPtrAllocator MCSubtargetAllocator; - /// Get the symbol for \p Name, or null. - MCSymbol *lookupSymbol(const Twine &Name) const; + /// Do automatic reset in destructor + bool AutoReset; - /// Set value for a symbol. - void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val); + MCTargetOptions const *TargetOptions; - /// getSymbols - Get a reference for the symbol table for clients that - /// want to, for example, iterate over all symbols. 'const' because we - /// still want any modifications to the table itself to use the MCContext - /// APIs. - const SymbolTable &getSymbols() const { return Symbols; } + bool HadError = false; - /// isInlineAsmLabel - Return true if the name is a label referenced in - /// inline assembly. - MCSymbol *getInlineAsmLabel(StringRef Name) const { - return InlineAsmUsedLabelNames.lookup(Name); - } + void reportCommon(SMLoc Loc, + std::function); - /// registerInlineAsmLabel - Records that the name is a label referenced in - /// inline assembly. 
- void registerInlineAsmLabel(MCSymbol *Sym); + MCSymbol *createSymbolImpl(const StringMapEntry *Name, + bool CanBeUnnamed); + MCSymbol *createSymbol(StringRef Name, bool AlwaysAddSuffix, + bool IsTemporary); - /// @} + MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal, + unsigned Instance); - /// \name Section Management - /// @{ + MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type, + unsigned Flags, SectionKind K, + unsigned EntrySize, + const MCSymbolELF *Group, bool IsComdat, + unsigned UniqueID, + const MCSymbolELF *LinkedToSym); - enum : unsigned { - /// Pass this value as the UniqueID during section creation to get the - /// generic section with the given name and characteristics. The usual - /// sections such as .text use this ID. - GenericSectionID = ~0U - }; + MCSymbolXCOFF *createXCOFFSymbolImpl(const StringMapEntry *Name, + bool IsTemporary); - /// Return the MCSection for the specified mach-o section. This requires - /// the operands to be valid. - MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section, - unsigned TypeAndAttributes, - unsigned Reserved2, SectionKind K, - const char *BeginSymName = nullptr); - - MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section, - unsigned TypeAndAttributes, SectionKind K, - const char *BeginSymName = nullptr) { - return getMachOSection(Segment, Section, TypeAndAttributes, 0, K, - BeginSymName); - } + /// Map of currently defined macros. + StringMap MacroMap; - MCSectionELF *getELFSection(const Twine &Section, unsigned Type, - unsigned Flags) { - return getELFSection(Section, Type, Flags, 0, "", false); - } + struct ELFEntrySizeKey { + std::string SectionName; + unsigned Flags; + unsigned EntrySize; - MCSectionELF *getELFSection(const Twine &Section, unsigned Type, - unsigned Flags, unsigned EntrySize) { - return getELFSection(Section, Type, Flags, EntrySize, "", false, - MCSection::NonUniqueID, nullptr); - } + ELFEntrySizeKey(StringRef SectionName, unsigned Flags, unsigned EntrySize) + : SectionName(SectionName), Flags(Flags), EntrySize(EntrySize) {} - MCSectionELF *getELFSection(const Twine &Section, unsigned Type, - unsigned Flags, unsigned EntrySize, - const Twine &Group, bool IsComdat) { - return getELFSection(Section, Type, Flags, EntrySize, Group, IsComdat, - MCSection::NonUniqueID, nullptr); + bool operator<(const ELFEntrySizeKey &Other) const { + if (SectionName != Other.SectionName) + return SectionName < Other.SectionName; + if (Flags != Other.Flags) + return Flags < Other.Flags; + return EntrySize < Other.EntrySize; } + }; - MCSectionELF *getELFSection(const Twine &Section, unsigned Type, - unsigned Flags, unsigned EntrySize, - const Twine &Group, bool IsComdat, - unsigned UniqueID, - const MCSymbolELF *LinkedToSym); - - MCSectionELF *getELFSection(const Twine &Section, unsigned Type, - unsigned Flags, unsigned EntrySize, - const MCSymbolELF *Group, bool IsComdat, - unsigned UniqueID, - const MCSymbolELF *LinkedToSym); - - /// Get a section with the provided group identifier. This section is - /// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type - /// describes the type of the section and \p Flags are used to further - /// configure this named section. 
- MCSectionELF *getELFNamedSection(const Twine &Prefix, const Twine &Suffix, - unsigned Type, unsigned Flags, - unsigned EntrySize = 0); - - MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type, - unsigned Flags, unsigned EntrySize, - const MCSymbolELF *Group, - const MCSectionELF *RelInfoSection); - - void renameELFSection(MCSectionELF *Section, StringRef Name); + // Symbols must be assigned to a section with a compatible entry size and + // flags. This map is used to assign unique IDs to sections to distinguish + // between sections with identical names but incompatible entry sizes and/or + // flags. This can occur when a symbol is explicitly assigned to a section, + // e.g. via __attribute__((section("myname"))). + std::map<std::tuple<std::string, unsigned, unsigned>, unsigned> ELFEntrySizeMap; - MCSectionELF *createELFGroupSection(const MCSymbolELF *Group, - bool IsComdat); + // This set is used to record the generic mergeable section names seen. + // These are sections that are created as mergeable e.g. .debug_str. We need + // to avoid assigning non-mergeable symbols to these sections. It is used + // to prevent non-mergeable symbols being explicitly assigned to mergeable + // sections (e.g. via __attribute__((section("myname")))). + DenseSet<StringRef> ELFSeenGenericMergeableSections; - void recordELFMergeableSectionInfo(StringRef SectionName, unsigned Flags, - unsigned UniqueID, unsigned EntrySize); +public: + explicit MCContext(const Triple &TheTriple, const MCAsmInfo *MAI, + const MCRegisterInfo *MRI, const MCSubtargetInfo *MSTI, + const SourceMgr *Mgr = nullptr, + MCTargetOptions const *TargetOpts = nullptr, + bool DoAutoReset = true, + StringRef Swift5ReflSegmentName = {}); + MCContext(const MCContext &) = delete; + MCContext &operator=(const MCContext &) = delete; + ~MCContext(); - bool isELFImplicitMergeableSectionNamePrefix(StringRef Name); + Environment getObjectFileType() const { return Env; } - bool isELFGenericMergeableSection(StringRef Name); + const StringRef &getSwift5ReflectionSegmentName() const { + return Swift5ReflectionSegmentName; + } + const Triple &getTargetTriple() const { return TT; } + const SourceMgr *getSourceManager() const { return SrcMgr; } - /// Return the unique ID of the section with the given name, flags and entry - /// size, if it exists. - Optional<unsigned> getELFUniqueIDForEntsize(StringRef SectionName, - unsigned Flags, - unsigned EntrySize); + void initInlineSourceManager(); + SourceMgr *getInlineSourceManager() { return InlineSrcMgr.get(); } + std::vector<const MDNode *> &getLocInfos() { return LocInfos; } + void setDiagnosticHandler(DiagHandlerTy DiagHandler) { + this->DiagHandler = DiagHandler; + } - MCSectionGOFF *getGOFFSection(StringRef Section, SectionKind Kind); + void setObjectFileInfo(const MCObjectFileInfo *Mofi) { MOFI = Mofi; } - MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, - SectionKind Kind, StringRef COMDATSymName, - int Selection, - unsigned UniqueID = GenericSectionID, - const char *BeginSymName = nullptr); + const MCAsmInfo *getAsmInfo() const { return MAI; } - MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, - SectionKind Kind, - const char *BeginSymName = nullptr); - - /// Gets or creates a section equivalent to Sec that is associated with the - /// section containing KeySym. For example, to create a debug info section - /// associated with an inline function, pass the normal debug info section - /// as Sec and the function symbol as KeySym.
- MCSectionCOFF * - getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym, - unsigned UniqueID = GenericSectionID); - - MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, - unsigned Flags = 0) { - return getWasmSection(Section, K, Flags, nullptr); - } + const MCRegisterInfo *getRegisterInfo() const { return MRI; } - MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, - unsigned Flags, const char *BeginSymName) { - return getWasmSection(Section, K, Flags, "", ~0, BeginSymName); - } + const MCObjectFileInfo *getObjectFileInfo() const { return MOFI; } - MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, - unsigned Flags, const Twine &Group, - unsigned UniqueID) { - return getWasmSection(Section, K, Flags, Group, UniqueID, nullptr); - } + const MCSubtargetInfo *getSubtargetInfo() const { return MSTI; } - MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, - unsigned Flags, const Twine &Group, - unsigned UniqueID, const char *BeginSymName); + CodeViewContext &getCVContext(); - MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, - unsigned Flags, const MCSymbolWasm *Group, - unsigned UniqueID, const char *BeginSymName); + void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; } + void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; } - MCSectionXCOFF *getXCOFFSection( - StringRef Section, SectionKind K, - Optional<XCOFF::CsectProperties> CsectProp = None, - bool MultiSymbolsAllowed = false, const char *BeginSymName = nullptr, - Optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSubtypeFlags = None); + /// \name Module Lifetime Management + /// @{ - // Create and save a copy of STI and return a reference to the copy. - MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI); + /// reset - return object to right after construction state to prepare + /// to process a new module + void reset(); - /// @} + /// @} - /// \name Dwarf Management - /// @{ + /// \name McInst Management - /// Get the compilation directory for DW_AT_comp_dir - /// The compilation directory should be set with \c setCompilationDir before - /// calling this function. If it is unset, an empty string will be returned. - StringRef getCompilationDir() const { return CompilationDir; } + /// Create and return a new MC instruction. + MCInst *createMCInst(); - /// Set the compilation directory for DW_AT_comp_dir - void setCompilationDir(StringRef S) { CompilationDir = S.str(); } + /// \name Symbol Management + /// @{ - /// Add an entry to the debug prefix map. - void addDebugPrefixMapEntry(const std::string &From, const std::string &To); + /// Create and return a new linker temporary symbol with a unique but + /// unspecified name. + MCSymbol *createLinkerPrivateTempSymbol(); - // Remaps all debug directory paths in-place as per the debug prefix map. - void RemapDebugPaths(); + /// Create a temporary symbol with a unique name. The name will be omitted + /// in the symbol table if UseNamesOnTempLabels is false (default except + /// MCAsmStreamer). The overload without Name uses an unspecified name. + MCSymbol *createTempSymbol(); + MCSymbol *createTempSymbol(const Twine &Name, bool AlwaysAddSuffix = true); - /// Get the main file name for use in error messages and debug - /// info. This can be set to ensure we've got the correct file name - /// after preprocessing or for -save-temps. - const std::string &getMainFileName() const { return MainFileName; } + /// Create a temporary symbol with a unique name whose name cannot be + /// omitted in the symbol table.
This is rarely used. + MCSymbol *createNamedTempSymbol(); + MCSymbol *createNamedTempSymbol(const Twine &Name); - /// Set the main file name and override the default. - void setMainFileName(StringRef S) { MainFileName = std::string(S); } + /// Create the definition of a directional local symbol for numbered label + /// (used for "1:" definitions). + MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal); - /// Creates an entry in the dwarf file and directory tables. - Expected<unsigned> getDwarfFile(StringRef Directory, StringRef FileName, - unsigned FileNumber, - Optional<MD5::MD5Hash> Checksum, - Optional<StringRef> Source, unsigned CUID); + /// Create and return a directional local symbol for numbered label (used + /// for "1b" or "1f" references). + MCSymbol *getDirectionalLocalSymbol(unsigned LocalLabelVal, bool Before); - bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0); + /// Lookup the symbol inside with the specified \p Name. If it exists, + /// return it. If not, create a forward reference and return it. + /// + /// \param Name - The symbol name, which must be unique across all symbols. + MCSymbol *getOrCreateSymbol(const Twine &Name); - const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const { - return MCDwarfLineTablesCUMap; - } + /// Gets a symbol that will be defined to the final stack offset of a local + /// variable after codegen. + /// + /// \param Idx - The index of a local variable passed to \@llvm.localescape. + MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx); - MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) { - return MCDwarfLineTablesCUMap[CUID]; - } + MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName); - const MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) const { - auto I = MCDwarfLineTablesCUMap.find(CUID); - assert(I != MCDwarfLineTablesCUMap.end()); - return I->second; - } + MCSymbol *getOrCreateLSDASymbol(StringRef FuncName); - const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) { - return getMCDwarfLineTable(CUID).getMCDwarfFiles(); - } + /// Get the symbol for \p Name, or null. + MCSymbol *lookupSymbol(const Twine &Name) const; - const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) { - return getMCDwarfLineTable(CUID).getMCDwarfDirs(); - } + /// Set value for a symbol. + void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val); - unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; } + /// getSymbols - Get a reference for the symbol table for clients that + /// want to, for example, iterate over all symbols. 'const' because we + /// still want any modifications to the table itself to use the MCContext + /// APIs. + const SymbolTable &getSymbols() const { return Symbols; } - void setDwarfCompileUnitID(unsigned CUIndex) { - DwarfCompileUnitID = CUIndex; - } + /// isInlineAsmLabel - Return true if the name is a label referenced in + /// inline assembly. + MCSymbol *getInlineAsmLabel(StringRef Name) const { + return InlineAsmUsedLabelNames.lookup(Name); + } - /// Specifies the "root" file and directory of the compilation unit. - /// These are "file 0" and "directory 0" in DWARF v5. - void setMCLineTableRootFile(unsigned CUID, StringRef CompilationDir, - StringRef Filename, - Optional<MD5::MD5Hash> Checksum, - Optional<StringRef> Source) { - getMCDwarfLineTable(CUID).setRootFile(CompilationDir, Filename, Checksum, - Source); - } + /// registerInlineAsmLabel - Records that the name is a label referenced in + /// inline assembly.
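As an illustrative sketch only (Ctx is assumed to be an initialized MCContext; none of this is part of the patch), the symbol-management interface above is typically used like so:

    MCSymbol *Tmp = Ctx.createTempSymbol();              // e.g. ".Ltmp0"
    MCSymbol *Fn = Ctx.getOrCreateSymbol("my_func");     // lookup or forward ref
    MCSymbol *Def = Ctx.createDirectionalLocalSymbol(1); // defines "1:"
    MCSymbol *Ref = Ctx.getDirectionalLocalSymbol(1, /*Before=*/true); // "1b"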
+ void registerInlineAsmLabel(MCSymbol *Sym); - /// Reports whether MD5 checksum usage is consistent (all-or-none). - bool isDwarfMD5UsageConsistent(unsigned CUID) const { - return getMCDwarfLineTable(CUID).isMD5UsageConsistent(); - } + /// @} - /// Saves the information from the currently parsed dwarf .loc directive - /// and sets DwarfLocSeen. When the next instruction is assembled an entry - /// in the line number table with this information and the address of the - /// instruction will be created. - void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column, - unsigned Flags, unsigned Isa, - unsigned Discriminator) { - CurrentDwarfLoc.setFileNum(FileNum); - CurrentDwarfLoc.setLine(Line); - CurrentDwarfLoc.setColumn(Column); - CurrentDwarfLoc.setFlags(Flags); - CurrentDwarfLoc.setIsa(Isa); - CurrentDwarfLoc.setDiscriminator(Discriminator); - DwarfLocSeen = true; - } + /// \name Section Management + /// @{ - void clearDwarfLocSeen() { DwarfLocSeen = false; } + enum : unsigned { + /// Pass this value as the UniqueID during section creation to get the + /// generic section with the given name and characteristics. The usual + /// sections such as .text use this ID. + GenericSectionID = ~0U + }; - bool getDwarfLocSeen() { return DwarfLocSeen; } - const MCDwarfLoc &getCurrentDwarfLoc() { return CurrentDwarfLoc; } + /// Return the MCSection for the specified mach-o section. This requires + /// the operands to be valid. + MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section, + unsigned TypeAndAttributes, + unsigned Reserved2, SectionKind K, + const char *BeginSymName = nullptr); - bool getGenDwarfForAssembly() { return GenDwarfForAssembly; } - void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; } - unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; } + MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section, + unsigned TypeAndAttributes, SectionKind K, + const char *BeginSymName = nullptr) { + return getMachOSection(Segment, Section, TypeAndAttributes, 0, K, + BeginSymName); + } + + MCSectionELF *getELFSection(const Twine &Section, unsigned Type, + unsigned Flags) { + return getELFSection(Section, Type, Flags, 0, "", false); + } + + MCSectionELF *getELFSection(const Twine &Section, unsigned Type, + unsigned Flags, unsigned EntrySize) { + return getELFSection(Section, Type, Flags, EntrySize, "", false, + MCSection::NonUniqueID, nullptr); + } + + MCSectionELF *getELFSection(const Twine &Section, unsigned Type, + unsigned Flags, unsigned EntrySize, + const Twine &Group, bool IsComdat) { + return getELFSection(Section, Type, Flags, EntrySize, Group, IsComdat, + MCSection::NonUniqueID, nullptr); + } + + MCSectionELF *getELFSection(const Twine &Section, unsigned Type, + unsigned Flags, unsigned EntrySize, + const Twine &Group, bool IsComdat, + unsigned UniqueID, + const MCSymbolELF *LinkedToSym); + + MCSectionELF *getELFSection(const Twine &Section, unsigned Type, + unsigned Flags, unsigned EntrySize, + const MCSymbolELF *Group, bool IsComdat, + unsigned UniqueID, + const MCSymbolELF *LinkedToSym); + + /// Get a section with the provided group identifier. This section is + /// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type + /// describes the type of the section and \p Flags are used to further + /// configure this named section. 
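A caller-side sketch of the getELFSection convenience overloads declared above (assumes an initialized MCContext Ctx and llvm/BinaryFormat/ELF.h; illustrative, not from the patch):

    // Resolves to the full overload with MCSection::NonUniqueID and no
    // linked-to symbol; "foo" becomes the COMDAT group signature.
    MCSectionELF *S = Ctx.getELFSection(
        ".text.foo", ELF::SHT_PROGBITS,
        ELF::SHF_ALLOC | ELF::SHF_EXECINSTR | ELF::SHF_GROUP,
        /*EntrySize=*/0, /*Group=*/"foo", /*IsComdat=*/true);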
+ MCSectionELF *getELFNamedSection(const Twine &Prefix, const Twine &Suffix, + unsigned Type, unsigned Flags, + unsigned EntrySize = 0); + + MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type, + unsigned Flags, unsigned EntrySize, + const MCSymbolELF *Group, + const MCSectionELF *RelInfoSection); + + void renameELFSection(MCSectionELF *Section, StringRef Name); + + MCSectionELF *createELFGroupSection(const MCSymbolELF *Group, bool IsComdat); + + void recordELFMergeableSectionInfo(StringRef SectionName, unsigned Flags, + unsigned UniqueID, unsigned EntrySize); + + bool isELFImplicitMergeableSectionNamePrefix(StringRef Name); + + bool isELFGenericMergeableSection(StringRef Name); + + /// Return the unique ID of the section with the given name, flags and entry + /// size, if it exists. + Optional<unsigned> getELFUniqueIDForEntsize(StringRef SectionName, + unsigned Flags, + unsigned EntrySize); + + MCSectionGOFF *getGOFFSection(StringRef Section, SectionKind Kind, + MCSection *Parent, const MCExpr *SubsectionId); + + MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, + SectionKind Kind, StringRef COMDATSymName, + int Selection, + unsigned UniqueID = GenericSectionID, + const char *BeginSymName = nullptr); + + MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, + SectionKind Kind, + const char *BeginSymName = nullptr); + + /// Gets or creates a section equivalent to Sec that is associated with the + /// section containing KeySym. For example, to create a debug info section + /// associated with an inline function, pass the normal debug info section + /// as Sec and the function symbol as KeySym. + MCSectionCOFF * + getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym, + unsigned UniqueID = GenericSectionID); + + MCSectionSPIRV *getSPIRVSection(); + + MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, + unsigned Flags = 0) { + return getWasmSection(Section, K, Flags, nullptr); + } + + MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, + unsigned Flags, const char *BeginSymName) { + return getWasmSection(Section, K, Flags, "", ~0, BeginSymName); + } + + MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, + unsigned Flags, const Twine &Group, + unsigned UniqueID) { + return getWasmSection(Section, K, Flags, Group, UniqueID, nullptr); + } + + MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, + unsigned Flags, const Twine &Group, + unsigned UniqueID, const char *BeginSymName); + + MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K, + unsigned Flags, const MCSymbolWasm *Group, + unsigned UniqueID, const char *BeginSymName); + + /// Get the section for the provided Section name + MCSectionDXContainer *getDXContainerSection(StringRef Section, SectionKind K); + + bool hasXCOFFSection(StringRef Section, + XCOFF::CsectProperties CsectProp) const; + + MCSectionXCOFF *getXCOFFSection( + StringRef Section, SectionKind K, + Optional<XCOFF::CsectProperties> CsectProp = None, + bool MultiSymbolsAllowed = false, const char *BeginSymName = nullptr, + Optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSubtypeFlags = None); + + // Create and save a copy of STI and return a reference to the copy.
+ MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI); + + uint8_t getBBAddrMapVersion() const { return BBAddrMapVersion; } + + /// @} + + /// \name Dwarf Management + /// @{ + + /// Get the compilation directory for DW_AT_comp_dir + /// The compilation directory should be set with \c setCompilationDir before + /// calling this function. If it is unset, an empty string will be returned. + StringRef getCompilationDir() const { return CompilationDir; } + + /// Set the compilation directory for DW_AT_comp_dir + void setCompilationDir(StringRef S) { CompilationDir = S.str(); } + + /// Add an entry to the debug prefix map. + void addDebugPrefixMapEntry(const std::string &From, const std::string &To); + + // Remaps all debug directory paths in-place as per the debug prefix map. + void RemapDebugPaths(); + + /// Get the main file name for use in error messages and debug + /// info. This can be set to ensure we've got the correct file name + /// after preprocessing or for -save-temps. + const std::string &getMainFileName() const { return MainFileName; } + + /// Set the main file name and override the default. + void setMainFileName(StringRef S) { MainFileName = std::string(S); } + + /// Creates an entry in the dwarf file and directory tables. + Expected<unsigned> getDwarfFile(StringRef Directory, StringRef FileName, + unsigned FileNumber, + Optional<MD5::MD5Hash> Checksum, + Optional<StringRef> Source, unsigned CUID); + + bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0); + + const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const { + return MCDwarfLineTablesCUMap; + } + + MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) { + return MCDwarfLineTablesCUMap[CUID]; + } + + const MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) const { + auto I = MCDwarfLineTablesCUMap.find(CUID); + assert(I != MCDwarfLineTablesCUMap.end()); + return I->second; + } + + const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) { + return getMCDwarfLineTable(CUID).getMCDwarfFiles(); + } + + const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) { + return getMCDwarfLineTable(CUID).getMCDwarfDirs(); + } - void setGenDwarfFileNumber(unsigned FileNumber) { - GenDwarfFileNumber = FileNumber; - } + unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; } - /// Specifies information about the "root file" for assembler clients - /// (e.g., llvm-mc). Assumes compilation dir etc. have been set up. - void setGenDwarfRootFile(StringRef FileName, StringRef Buffer); + void setDwarfCompileUnitID(unsigned CUIndex) { DwarfCompileUnitID = CUIndex; } + + /// Specifies the "root" file and directory of the compilation unit. + /// These are "file 0" and "directory 0" in DWARF v5. + void setMCLineTableRootFile(unsigned CUID, StringRef CompilationDir, + StringRef Filename, + Optional<MD5::MD5Hash> Checksum, + Optional<StringRef> Source) { + getMCDwarfLineTable(CUID).setRootFile(CompilationDir, Filename, Checksum, + Source); + } + + /// Reports whether MD5 checksum usage is consistent (all-or-none). + bool isDwarfMD5UsageConsistent(unsigned CUID) const { + return getMCDwarfLineTable(CUID).isMD5UsageConsistent(); + } - const SetVector<MCSection *> &getGenDwarfSectionSyms() { - return SectionsForRanges; - } + /// Saves the information from the currently parsed dwarf .loc directive + /// and sets DwarfLocSeen. When the next instruction is assembled an entry + /// in the line number table with this information and the address of the + /// instruction will be created.
+ void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column, + unsigned Flags, unsigned Isa, + unsigned Discriminator) { + CurrentDwarfLoc.setFileNum(FileNum); + CurrentDwarfLoc.setLine(Line); + CurrentDwarfLoc.setColumn(Column); + CurrentDwarfLoc.setFlags(Flags); + CurrentDwarfLoc.setIsa(Isa); + CurrentDwarfLoc.setDiscriminator(Discriminator); + DwarfLocSeen = true; + } - bool addGenDwarfSection(MCSection *Sec) { - return SectionsForRanges.insert(Sec); - } + void clearDwarfLocSeen() { DwarfLocSeen = false; } - void finalizeDwarfSections(MCStreamer &MCOS); + bool getDwarfLocSeen() { return DwarfLocSeen; } + const MCDwarfLoc &getCurrentDwarfLoc() { return CurrentDwarfLoc; } - const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const { - return MCGenDwarfLabelEntries; - } + bool getGenDwarfForAssembly() { return GenDwarfForAssembly; } + void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; } + unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; } + EmitDwarfUnwindType emitDwarfUnwindInfo() const; - void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) { - MCGenDwarfLabelEntries.push_back(E); - } + void setGenDwarfFileNumber(unsigned FileNumber) { + GenDwarfFileNumber = FileNumber; + } - void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; } - StringRef getDwarfDebugFlags() { return DwarfDebugFlags; } + /// Specifies information about the "root file" for assembler clients + /// (e.g., llvm-mc). Assumes compilation dir etc. have been set up. + void setGenDwarfRootFile(StringRef FileName, StringRef Buffer); - void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; } - StringRef getDwarfDebugProducer() { return DwarfDebugProducer; } + const SetVector<MCSection *> &getGenDwarfSectionSyms() { + return SectionsForRanges; + } - void setDwarfFormat(dwarf::DwarfFormat f) { DwarfFormat = f; } - dwarf::DwarfFormat getDwarfFormat() const { return DwarfFormat; } + bool addGenDwarfSection(MCSection *Sec) { + return SectionsForRanges.insert(Sec); + } - void setDwarfVersion(uint16_t v) { DwarfVersion = v; } - uint16_t getDwarfVersion() const { return DwarfVersion; } + void finalizeDwarfSections(MCStreamer &MCOS); + + const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const { + return MCGenDwarfLabelEntries; + } - /// @} + void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) { + MCGenDwarfLabelEntries.push_back(E); + } + + void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; } + StringRef getDwarfDebugFlags() { return DwarfDebugFlags; } - char *getSecureLogFile() { return SecureLogFile; } - raw_fd_ostream *getSecureLog() { return SecureLog.get(); } + void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; } + StringRef getDwarfDebugProducer() { return DwarfDebugProducer; } - void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) { - SecureLog = std::move(Value); - } + void setDwarfFormat(dwarf::DwarfFormat f) { DwarfFormat = f; } + dwarf::DwarfFormat getDwarfFormat() const { return DwarfFormat; } + + void setDwarfVersion(uint16_t v) { DwarfVersion = v; } + uint16_t getDwarfVersion() const { return DwarfVersion; } + + /// @} - bool getSecureLogUsed() { return SecureLogUsed; } - void setSecureLogUsed(bool Value) { SecureLogUsed = Value; } + char *getSecureLogFile() { return SecureLogFile; } + raw_fd_ostream *getSecureLog() { return SecureLog.get(); } + + void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) { + SecureLog = std::move(Value); + } - void *allocate(unsigned Size, unsigned Align = 8) { - return Allocator.Allocate(Size, Align); - } + bool getSecureLogUsed() {
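A sketch of how the DWARF file and directory accessors above fit together (illustrative only; Ctx is an assumed MCContext, paths invented):

    // DWARF v5: the root file is "file 0"; further files get explicit entries.
    Ctx.setMCLineTableRootFile(/*CUID=*/0, "/src", "main.c",
                               /*Checksum=*/None, /*Source=*/None);
    Expected<unsigned> FileNo =
        Ctx.getDwarfFile("/src", "util.c", /*FileNumber=*/0,
                         /*Checksum=*/None, /*Source=*/None, /*CUID=*/0);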
return SecureLogUsed; } + void setSecureLogUsed(bool Value) { SecureLogUsed = Value; } - void deallocate(void *Ptr) {} + void *allocate(unsigned Size, unsigned Align = 8) { + return Allocator.Allocate(Size, Align); + } - bool hadError() { return HadError; } - void diagnose(const SMDiagnostic &SMD); - void reportError(SMLoc L, const Twine &Msg); - void reportWarning(SMLoc L, const Twine &Msg); + void deallocate(void *Ptr) {} - const MCAsmMacro *lookupMacro(StringRef Name) { - StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name); - return (I == MacroMap.end()) ? nullptr : &I->getValue(); - } + bool hadError() { return HadError; } + void diagnose(const SMDiagnostic &SMD); + void reportError(SMLoc L, const Twine &Msg); + void reportWarning(SMLoc L, const Twine &Msg); + + const MCAsmMacro *lookupMacro(StringRef Name) { + StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name); + return (I == MacroMap.end()) ? nullptr : &I->getValue(); + } - void defineMacro(StringRef Name, MCAsmMacro Macro) { - MacroMap.insert(std::make_pair(Name, std::move(Macro))); - } + void defineMacro(StringRef Name, MCAsmMacro Macro) { + MacroMap.insert(std::make_pair(Name, std::move(Macro))); + } - void undefineMacro(StringRef Name) { MacroMap.erase(Name); } + void undefineMacro(StringRef Name) { MacroMap.erase(Name); } - MCPseudoProbeTable &getMCPseudoProbeTable() { return PseudoProbeTable; } - }; + MCPseudoProbeTable &getMCPseudoProbeTable() { return PseudoProbeTable; } +}; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCDXContainerStreamer.h b/llvm/include/llvm/MC/MCDXContainerStreamer.h new file mode 100644 index 000000000000..ef1a95f71778 --- /dev/null +++ b/llvm/include/llvm/MC/MCDXContainerStreamer.h @@ -0,0 +1,49 @@ +//===- MCDXContainerStreamer.h - MCDXContainerStreamer Interface ---*- C++ ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Overrides MCObjectStreamer to disable all unnecessary features with stubs. +// The DXContainer format isn't a fully featured object format. It doesn't +// support symbols, and initially it will not support instruction data since it +// is used as a bitcode container for DXIL.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCDXCONTAINERSTREAMER_H +#define LLVM_MC_MCDXCONTAINERSTREAMER_H + +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCObjectWriter.h" + +namespace llvm { +class MCAssembler; +class MCExpr; +class MCInst; +class raw_ostream; + +class MCDXContainerStreamer : public MCObjectStreamer { +public: + MCDXContainerStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCObjectStreamer(Context, std::move(TAB), std::move(OW), + std::move(Emitter)) {} + + bool emitSymbolAttribute(MCSymbol *, MCSymbolAttr) override { return false; } + void emitCommonSymbol(MCSymbol *, uint64_t, unsigned) override {} + void emitZerofill(MCSection *, MCSymbol *Symbol = nullptr, uint64_t Size = 0, + unsigned ByteAlignment = 0, SMLoc Loc = SMLoc()) override {} + +private: + void emitInstToData(const MCInst &, const MCSubtargetInfo &) override; +}; + +} // end namespace llvm + +#endif // LLVM_MC_MCDXCONTAINERSTREAMER_H diff --git a/llvm/include/llvm/MC/MCDXContainerWriter.h b/llvm/include/llvm/MC/MCDXContainerWriter.h new file mode 100644 index 000000000000..8ecb86c8a16f --- /dev/null +++ b/llvm/include/llvm/MC/MCDXContainerWriter.h @@ -0,0 +1,45 @@ +//===- llvm/MC/MCDXContainerWriter.h - DXContainer Writer -*- C++ -------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCDXCONTAINERWRITER_H +#define LLVM_MC_MCDXCONTAINERWRITER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCObjectWriter.h" + +namespace llvm { + +class raw_pwrite_stream; + +class MCDXContainerTargetWriter : public MCObjectTargetWriter { +protected: + MCDXContainerTargetWriter() {} + +public: + virtual ~MCDXContainerTargetWriter(); + + Triple::ObjectFormatType getFormat() const override { + return Triple::DXContainer; + } + static bool classof(const MCObjectTargetWriter *W) { + return W->getFormat() == Triple::DXContainer; + } +}; + +/// Construct a new DXContainer writer instance. +/// +/// \param MOTW - The target specific DXContainer writer subclass. +/// \param OS - The stream to write to. +/// \returns The constructed object writer. +std::unique_ptr<MCObjectWriter> +createDXContainerObjectWriter(std::unique_ptr<MCDXContainerTargetWriter> MOTW, + raw_pwrite_stream &OS); + +} // end namespace llvm + +#endif // LLVM_MC_MCDXCONTAINERWRITER_H diff --git a/llvm/include/llvm/MC/MCDecoderOps.h b/llvm/include/llvm/MC/MCDecoderOps.h new file mode 100644 index 000000000000..c1956993fca2 --- /dev/null +++ b/llvm/include/llvm/MC/MCDecoderOps.h @@ -0,0 +1,33 @@ +//===------------ llvm/MC/MCDecoderOps.h - Decoder driver -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Disassembler decoder state machine driver. +//===----------------------------------------------------------------------===// +#ifndef LLVM_MC_MCDECODEROPS_H +#define LLVM_MC_MCDECODEROPS_H + +namespace llvm { + +namespace MCD { +// Disassembler state machine opcodes.
+enum DecoderOps { + OPC_ExtractField = 1, // OPC_ExtractField(uint8_t Start, uint8_t Len) + OPC_FilterValue, // OPC_FilterValue(uleb128 Val, uint16_t NumToSkip) + OPC_CheckField, // OPC_CheckField(uint8_t Start, uint8_t Len, + // uleb128 Val, uint16_t NumToSkip) + OPC_CheckPredicate, // OPC_CheckPredicate(uleb128 PIdx, uint16_t NumToSkip) + OPC_Decode, // OPC_Decode(uleb128 Opcode, uleb128 DIdx) + OPC_TryDecode, // OPC_TryDecode(uleb128 Opcode, uleb128 DIdx, + // uint16_t NumToSkip) + OPC_SoftFail, // OPC_SoftFail(uleb128 PMask, uleb128 NMask) + OPC_Fail // OPC_Fail() +}; + +} // namespace MCD +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/MC/MCDirectives.h b/llvm/include/llvm/MC/MCDirectives.h index 51e57ad37021..d6ab29febeeb 100644 --- a/llvm/include/llvm/MC/MCDirectives.h +++ b/llvm/include/llvm/MC/MCDirectives.h @@ -31,6 +31,7 @@ enum MCSymbolAttr { MCSA_LGlobal, ///< .lglobl (XCOFF) MCSA_Extern, ///< .extern (XCOFF) MCSA_Hidden, ///< .hidden (ELF) + MCSA_Exported, ///< .globl _foo, exported (XCOFF) MCSA_IndirectSymbol, ///< .indirect_symbol (MachO) MCSA_Internal, ///< .internal (ELF) MCSA_LazyReference, ///< .lazy_reference (MachO) diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h index 10037cd66ef1..de069ff95c2f 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -40,26 +40,35 @@ struct SymbolInfoTy { private: bool IsXCOFF; + bool HasType; public: SymbolInfoTy(uint64_t Addr, StringRef Name, Optional<XCOFF::StorageMappingClass> Smc, Optional<uint32_t> Idx, bool Label) - : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true) {} - SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type) - : Addr(Addr), Name(Name), Type(Type), IsXCOFF(false) {} + : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true), + HasType(false) {} + SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type, + bool IsXCOFF = false) + : Addr(Addr), Name(Name), Type(Type), IsXCOFF(IsXCOFF), HasType(true) {} bool isXCOFF() const { return IsXCOFF; } private: friend bool operator<(const SymbolInfoTy &P1, const SymbolInfoTy &P2) { - assert(P1.IsXCOFF == P2.IsXCOFF && - "P1.IsXCOFF should be equal to P2.IsXCOFF."); + assert((P1.IsXCOFF == P2.IsXCOFF && P1.HasType == P2.HasType) && + "The value of IsXCOFF and HasType in P1 and P2 should be the same " + "respectively."); + + if (P1.IsXCOFF && P1.HasType) + return std::tie(P1.Addr, P1.Type, P1.Name) < + std::tie(P2.Addr, P2.Type, P2.Name); + + if (P1.IsXCOFF) return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) < std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name); return std::tie(P1.Addr, P1.Name, P1.Type) < - std::tie(P2.Addr, P2.Name, P2.Type); + std::tie(P2.Addr, P2.Name, P2.Type); } }; @@ -172,10 +181,9 @@ protected: public: // Helpers around MCSymbolizer - bool tryAddingSymbolicOperand(MCInst &Inst, - int64_t Value, - uint64_t Address, bool IsBranch, - uint64_t Offset, uint64_t InstSize) const; + bool tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t OpSize, + uint64_t InstSize) const; void tryAddingPcLoadReferenceComment(int64_t Value, uint64_t Address) const; diff --git a/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h b/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h index ffac5ee5cb1f..8af3bb2296ec 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h @@
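The DecoderOps enum that follows drives the TableGen-generated disassembler tables. Purely as an illustration of the table encoding (byte values invented, not from the patch):

    static const uint8_t DecoderTable32[] = {
        MCD::OPC_ExtractField, 28, 4,    // Field = Inst[31:28]
        MCD::OPC_FilterValue, 0xE, 3, 0, // uleb128 0xE; NumToSkip = 3 (uint16)
        MCD::OPC_Decode, 0x90, 0x02, 1,  // uleb128 opcode 0x110, decoder idx 1
        MCD::OPC_Fail,                   // reached on filter mismatch
    };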
-15,7 +15,7 @@ #ifndef LLVM_MC_MCDISASSEMBLER_MCEXTERNALSYMBOLIZER_H #define LLVM_MC_MCDISASSEMBLER_MCEXTERNALSYMBOLIZER_H -#include "llvm-c/Disassembler.h" +#include "llvm-c/DisassemblerTypes.h" #include "llvm/MC/MCDisassembler/MCSymbolizer.h" #include @@ -46,7 +46,8 @@ public: bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, bool IsBranch, - uint64_t Offset, uint64_t InstSize) override; + uint64_t Offset, uint64_t OpSize, + uint64_t InstSize) override; void tryAddingPcLoadReferenceComment(raw_ostream &CommentStream, int64_t Value, uint64_t Address) override; diff --git a/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h b/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h index b966106007db..1efb63f1a142 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h @@ -17,9 +17,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" -#include #include #include +#include namespace llvm { @@ -63,12 +63,13 @@ public: /// \param Address - Load address of the instruction. /// \param IsBranch - Is the instruction a branch? /// \param Offset - Byte offset of the operand inside the inst. + /// \param OpSize - Size of the operand in bytes. /// \param InstSize - Size of the instruction in bytes. /// \return Whether a symbolic operand was added. virtual bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream, int64_t Value, uint64_t Address, bool IsBranch, uint64_t Offset, - uint64_t InstSize) = 0; + uint64_t OpSize, uint64_t InstSize) = 0; /// Try to add a comment on the PC-relative load. /// For instance, in Mach-O, this is used to add annotations to instructions diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h index 7e72d56f3097..ce65b173b3d2 100644 --- a/llvm/include/llvm/MC/MCDwarf.h +++ b/llvm/include/llvm/MC/MCDwarf.h @@ -19,14 +19,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCSection.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Error.h" #include "llvm/Support/MD5.h" #include #include #include -#include #include #include @@ -36,6 +34,7 @@ template <typename T> class ArrayRef; class MCAsmBackend; class MCContext; class MCObjectStreamer; +class MCSection; class MCStreamer; class MCSymbol; class raw_ostream; @@ -63,6 +62,9 @@ public: /// Emit the .debug_line_str section if appropriate. void emitSection(MCStreamer *MCOS); + + /// Returns finalized section.
+ SmallString<0> getFinalizedData(); }; /// Instances of this class represent the name of the dwarf .file directive and @@ -294,8 +296,8 @@ public: RootFile.DirIndex = 0; RootFile.Checksum = Checksum; RootFile.Source = Source; - trackMD5Usage(Checksum.hasValue()); - HasSource = Source.hasValue(); + trackMD5Usage(Checksum.has_value()); + HasSource = Source.has_value(); } void resetFileTable() { @@ -686,6 +688,7 @@ struct MCDwarfFrameInfo { bool IsSimple = false; unsigned RAReg = static_cast<unsigned>(INT_MAX); bool IsBKeyFrame = false; + bool IsMTETaggedFrame = false; }; class MCDwarfFrameEmitter { diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index 8f2b176862c8..eac807aad908 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -10,12 +10,19 @@ #define LLVM_MC_MCELFSTREAMER_H #include "llvm/ADT/SmallVector.h" -#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCObjectStreamer.h" namespace llvm { +class MCContext; +class MCDataFragment; +class MCFragment; +class MCObjectWriter; +class MCSection; +class MCSubtargetInfo; +class MCSymbol; +class MCSymbolRefExpr; class MCAsmBackend; class MCCodeEmitter; class MCExpr; diff --git a/llvm/include/llvm/MC/MCFixedLenDisassembler.h b/llvm/include/llvm/MC/MCFixedLenDisassembler.h deleted file mode 100644 index 1edf3899c130..000000000000 --- a/llvm/include/llvm/MC/MCFixedLenDisassembler.h +++ /dev/null @@ -1,33 +0,0 @@ -//===-- llvm/MC/MCFixedLenDisassembler.h - Decoder driver -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// Fixed length disassembler decoder state machine driver. -//===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCFIXEDLENDISASSEMBLER_H -#define LLVM_MC_MCFIXEDLENDISASSEMBLER_H - -namespace llvm { - -namespace MCD { -// Disassembler state machine opcodes. -enum DecoderOps { - OPC_ExtractField = 1, // OPC_ExtractField(uint8_t Start, uint8_t Len) - OPC_FilterValue, // OPC_FilterValue(uleb128 Val, uint16_t NumToSkip) - OPC_CheckField, // OPC_CheckField(uint8_t Start, uint8_t Len, - // uleb128 Val, uint16_t NumToSkip) - OPC_CheckPredicate, // OPC_CheckPredicate(uleb128 PIdx, uint16_t NumToSkip) - OPC_Decode, // OPC_Decode(uleb128 Opcode, uleb128 DIdx) - OPC_TryDecode, // OPC_TryDecode(uleb128 Opcode, uleb128 DIdx, - // uint16_t NumToSkip) - OPC_SoftFail, // OPC_SoftFail(uleb128 PMask, uleb128 NMask) - OPC_Fail // OPC_Fail() -}; - -} // namespace MCD -} // namespace llvm - -#endif diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index 736fdd992063..b6329b131624 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -17,7 +17,6 @@ #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Alignment.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include #include @@ -294,7 +293,7 @@ public: class MCAlignFragment : public MCFragment { /// The alignment to ensure, in bytes. - unsigned Alignment; + Align Alignment; /// Flag to indicate that (optimal) NOPs should be emitted instead /// of using the provided value.
The exact interpretation of this flag is @@ -315,12 +314,12 @@ class MCAlignFragment : public MCFragment { const MCSubtargetInfo *STI; public: - MCAlignFragment(unsigned Alignment, int64_t Value, unsigned ValueSize, + MCAlignFragment(Align Alignment, int64_t Value, unsigned ValueSize, unsigned MaxBytesToEmit, MCSection *Sec = nullptr) : MCFragment(FT_Align, false, Sec), Alignment(Alignment), EmitNops(false), Value(Value), ValueSize(ValueSize), MaxBytesToEmit(MaxBytesToEmit) {} - unsigned getAlignment() const { return Alignment; } + Align getAlignment() const { return Alignment; } int64_t getValue() const { return Value; } diff --git a/llvm/include/llvm/MC/MCInstrAnalysis.h b/llvm/include/llvm/MC/MCInstrAnalysis.h index 632a7d8f820e..a937f8203a0d 100644 --- a/llvm/include/llvm/MC/MCInstrAnalysis.h +++ b/llvm/include/llvm/MC/MCInstrAnalysis.h @@ -14,10 +14,13 @@ #ifndef LLVM_MC_MCINSTRANALYSIS_H #define LLVM_MC_MCINSTRANALYSIS_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include +#include namespace llvm { diff --git a/llvm/include/llvm/MC/MCInstrDesc.h b/llvm/include/llvm/MC/MCInstrDesc.h index e8ffd29170e6..120c3482ce70 100644 --- a/llvm/include/llvm/MC/MCInstrDesc.h +++ b/llvm/include/llvm/MC/MCInstrDesc.h @@ -14,10 +14,11 @@ #ifndef LLVM_MC_MCINSTRDESC_H #define LLVM_MC_MCINSTRDESC_H -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/DataTypes.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/MC/MCRegister.h" namespace llvm { +class MCRegisterInfo; class MCInst; @@ -148,6 +149,7 @@ enum Flag { Variadic, HasOptionalDef, Pseudo, + Meta, Return, EHScopeReturn, Call, @@ -263,6 +265,10 @@ public: /// correspond to a real machine instruction. bool isPseudo() const { return Flags & (1ULL << MCID::Pseudo); } + /// Return true if this is a meta instruction that doesn't + /// produce any output in the form of executable instructions. + bool isMetaInstruction() const { return Flags & (1ULL << MCID::Meta); } + /// Return true if the instruction is a return. bool isReturn() const { return Flags & (1ULL << MCID::Return); } diff --git a/llvm/include/llvm/MC/MCInstrInfo.h b/llvm/include/llvm/MC/MCInstrInfo.h index 598e24257e5d..84995b1e93fe 100644 --- a/llvm/include/llvm/MC/MCInstrInfo.h +++ b/llvm/include/llvm/MC/MCInstrInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_MC_MCINSTRINFO_H #define LLVM_MC_MCINSTRINFO_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstrDesc.h" #include diff --git a/llvm/include/llvm/MC/MCLinkerOptimizationHint.h b/llvm/include/llvm/MC/MCLinkerOptimizationHint.h index 003491f32f75..b91fbc62aa75 100644 --- a/llvm/include/llvm/MC/MCLinkerOptimizationHint.h +++ b/llvm/include/llvm/MC/MCLinkerOptimizationHint.h @@ -19,7 +19,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/raw_ostream.h" #include #include @@ -28,6 +27,7 @@ namespace llvm { class MachObjectWriter; class MCAsmLayout; class MCSymbol; +class raw_ostream; /// Linker Optimization Hint Type. 
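The MCAlignFragment hunk above switches the stored alignment from a raw unsigned to llvm::Align; a caller-side sketch (Sec is an assumed MCSection*, not from the patch):

    // Request 16-byte alignment, padding with single zero bytes, and emit at
    // most 15 bytes of padding.
    auto *AF = new MCAlignFragment(Align(16), /*Value=*/0, /*ValueSize=*/1,
                                   /*MaxBytesToEmit=*/15, Sec);
    Align A = AF->getAlignment(); // now llvm::Align rather than unsigned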
enum MCLOHType { diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h index f4f9c474cdcd..149373dd2b54 100644 --- a/llvm/include/llvm/MC/MCMachObjectWriter.h +++ b/llvm/include/llvm/MC/MCMachObjectWriter.h @@ -264,6 +264,8 @@ public: bool IsPCRel) const override; uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + + void writeAddrsigSection(MCAssembler &Asm); }; /// Construct a new Mach-O writer instance. diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index 3c1d10c4e62f..ebc9b95d6d4e 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -13,13 +13,13 @@ #ifndef LLVM_MC_MCOBJECTFILEINFO_H #define LLVM_MC_MCOBJECTFILEINFO_H -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/Swift.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CodeGen.h" #include "llvm/Support/VersionTuple.h" +#include + namespace llvm { class MCContext; class MCSection; @@ -213,6 +213,7 @@ protected: MCSection *LazySymbolPointerSection = nullptr; MCSection *NonLazySymbolPointerSection = nullptr; MCSection *ThreadLocalPointerSection = nullptr; + MCSection *AddrSigSection = nullptr; /// COFF specific sections. MCSection *DrectveSection = nullptr; @@ -224,6 +225,9 @@ protected: MCSection *GIATsSection = nullptr; MCSection *GLJMPSection = nullptr; + // GOFF specific sections. + MCSection *PPA1Section = nullptr; + // XCOFF specific sections MCSection *TOCBaseSection = nullptr; MCSection *ReadOnly8Section = nullptr; @@ -410,6 +414,7 @@ public: MCSection *getThreadLocalPointerSection() const { return ThreadLocalPointerSection; } + MCSection *getAddrSigSection() const { return AddrSigSection; } // COFF specific sections. MCSection *getDrectveSection() const { return DrectveSection; } @@ -421,6 +426,9 @@ public: MCSection *getGIATsSection() const { return GIATsSection; } MCSection *getGLJMPSection() const { return GLJMPSection; } + // GOFF specific sections. 
+ MCSection *getPPA1Section() const { return PPA1Section; } + // XCOFF specific sections MCSection *getTOCBaseSection() const { return TOCBaseSection; } @@ -448,8 +456,10 @@ private: void initELFMCObjectFileInfo(const Triple &T, bool Large); void initGOFFMCObjectFileInfo(const Triple &T); void initCOFFMCObjectFileInfo(const Triple &T); + void initSPIRVMCObjectFileInfo(const Triple &T); void initWasmMCObjectFileInfo(const Triple &T); void initXCOFFMCObjectFileInfo(const Triple &T); + void initDXContainerObjectFileInfo(const Triple &T); MCSection *getDwarfComdatSection(const char *Name, uint64_t Hash) const; public: diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index 183fd79fb9fc..6536c81d4aac 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -11,11 +11,17 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFragment.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" namespace llvm { +class MCContext; +class MCInst; +class MCObjectWriter; +class MCSymbol; +struct MCDwarfFrameInfo; class MCAssembler; class MCCodeEmitter; class MCSubtargetInfo; diff --git a/llvm/include/llvm/MC/MCObjectWriter.h b/llvm/include/llvm/MC/MCObjectWriter.h index d2a2f1a13ff5..a8e24a0c56ba 100644 --- a/llvm/include/llvm/MC/MCObjectWriter.h +++ b/llvm/include/llvm/MC/MCObjectWriter.h @@ -10,6 +10,7 @@ #define LLVM_MC_MCOBJECTWRITER_H #include "llvm/ADT/Triple.h" +#include "llvm/MC/MCSymbol.h" #include namespace llvm { @@ -32,6 +33,9 @@ class MCValue; /// should be emitted as part of writeObject(). class MCObjectWriter { protected: + std::vector<const MCSymbol *> AddrsigSyms; + bool EmitAddrsigSection = false; + MCObjectWriter() = default; public: @@ -91,11 +95,15 @@ public: /// Tell the object writer to emit an address-significance table during /// writeObject(). If this function is not called, all symbols are treated as /// address-significant. - virtual void emitAddrsigSection() {} + void emitAddrsigSection() { EmitAddrsigSection = true; } + + bool getEmitAddrsigSection() { return EmitAddrsigSection; } /// Record the given symbol in the address-significance table to be written /// during writeObject(). - virtual void addAddrsigSymbol(const MCSymbol *Sym) {} + void addAddrsigSymbol(const MCSymbol *Sym) { AddrsigSyms.push_back(Sym); } + + std::vector<const MCSymbol *> &getAddrsigSyms() { return AddrsigSyms; } /// Write the object file and returns the number of bytes written.
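With the address-significance hooks above now concrete rather than virtual stubs, any client holding an MCObjectWriter can record symbols directly; a hypothetical helper as a sketch:

    // Request .llvm_addrsig emission and mark one symbol address-significant.
    void recordAddrsig(MCObjectWriter &W, const MCSymbol *Sym) {
      W.emitAddrsigSection();
      W.addAddrsigSymbol(Sym);
    }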
/// diff --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h index 06796979b4fc..850a9cffe73a 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h @@ -12,10 +12,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCAsmMacro.h" -#include #include #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index 29386ffc45ac..4a1291856a20 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -10,20 +10,20 @@ #define LLVM_MC_MCPARSER_MCASMPARSER_H #include "llvm/ADT/None.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCAsmMacro.h" #include "llvm/Support/SMLoc.h" #include -#include #include #include namespace llvm { +class MCAsmLexer; class MCAsmInfo; class MCAsmParserExtension; class MCContext; diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h b/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h index fc10e33bcf6b..cbabc2c9d69d 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h @@ -9,9 +9,8 @@ #ifndef LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H #define LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/Support/SMLoc.h" diff --git a/llvm/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/llvm/include/llvm/MC/MCParser/MCParsedAsmOperand.h index faf0a4474c8a..22f66a011ece 100644 --- a/llvm/include/llvm/MC/MCParser/MCParsedAsmOperand.h +++ b/llvm/include/llvm/MC/MCParser/MCParsedAsmOperand.h @@ -10,7 +10,6 @@ #define LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/SMLoc.h" #include @@ -63,6 +62,13 @@ public: /// isMem - Is this a memory operand? virtual bool isMem() const = 0; + /// isMemUseUpRegs - Is memory operand use up regs, for example, intel MS + /// inline asm may use ARR[baseReg + IndexReg + ...] which may use up regs + /// in [...] expr, so ARR[baseReg + IndexReg + ...] can not use extra reg + /// for ARR. For example, calculating ARR address to a reg or use another + /// base reg in PIC model. + virtual bool isMemUseUpRegs() const { return false; } + /// getStartLoc - Get the location of the first token of this operand. virtual SMLoc getStartLoc() const = 0; /// getEndLoc - Get the location of the last token of this operand. @@ -77,10 +83,6 @@ public: /// assembly. virtual bool isOffsetOfLocal() const { return false; } - /// isMemPlaceholder - Do we need to ignore the constraint, rather than emit - /// code? Only valid when parsing MS-style inline assembly. - virtual bool isMemPlaceholder(const MCInstrDesc &Desc) const { return false; } - /// getOffsetOfLoc - Get the location of the offset operator. 
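The isMemUseUpRegs hook above concerns MS-style inline-asm memory operands whose bracketed expression consumes the registers itself; an illustrative (invented) 32-bit example:

    // ARR[ebx + ecx] folds ebx/ecx into the memory operand, so no extra
    // register can be allocated to materialize ARR's address separately.
    __asm mov eax, ARR[ebx + ecx]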
virtual SMLoc getOffsetOfLoc() const { return SMLoc(); } diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h index 908ee30e4060..1d380c6a00b7 100644 --- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h @@ -11,10 +11,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/SMLoc.h" @@ -23,9 +21,12 @@ namespace llvm { +class MCContext; class MCInst; +class MCInstrInfo; class MCStreamer; class MCSubtargetInfo; +class MCSymbol; template <typename T> class SmallVectorImpl; using OperandVector = SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>>; @@ -100,10 +101,14 @@ struct AsmRewrite { int64_t Val; StringRef Label; IntelExpr IntelExp; + bool IntelExpRestricted; public: - AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len = 0, int64_t val = 0) - : Kind(kind), Loc(loc), Len(len), Done(false), Val(val) {} + AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len = 0, int64_t val = 0, + bool Restricted = false) + : Kind(kind), Loc(loc), Len(len), Done(false), Val(val) { + IntelExpRestricted = Restricted; + } AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len, StringRef label) : AsmRewrite(kind, loc, len) { Label = label; } AsmRewrite(SMLoc loc, unsigned len, IntelExpr exp) diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 9ff68f4236ca..d10d6015cd3c 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -55,6 +55,7 @@ #include #include #include +#include <unordered_set> #include namespace llvm { @@ -82,10 +83,9 @@ struct MCPseudoProbeFuncDesc { void print(raw_ostream &OS); }; -class MCPseudoProbe; class MCDecodedPseudoProbe; -// An inline frame has the form <Guid, ProbeID> +// An inline frame has the form <CalleeGuid, ProbeId> using InlineSite = std::tuple<uint64_t, uint32_t>; using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>; // GUID to PseudoProbeFuncDesc map using GUIDProbeFunctionMap = std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>; using AddressProbesMap = std::unordered_map<uint64_t, std::list<MCDecodedPseudoProbe>>; -class MCPseudoProbeInlineTree; class MCDecodedPseudoProbeInlineTree; class MCPseudoProbeBase { @@ -272,7 +271,7 @@ public: MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){}; // Return false if it's a dummy inline site - bool hasInlineSite() const { return std::get<0>(ISite) != 0; } + bool hasInlineSite() const { return !isRoot() && !Parent->isRoot(); } }; /// Instances of this class represent the pseudo probes inserted into a compile @@ -355,6 +354,15 @@ public: // Decode pseudo_probe section to build address to probes map. bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size); + // Decode pseudo_probe section to build address to probes map for specified + // functions only.
+ bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size, + std::unordered_set<uint64_t> &GuildFilter); + + bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur, + uint64_t &LastAddr, + std::unordered_set<uint64_t> &GuildFilter); + + // Print pseudo_probe_desc section info + void printGUID2FuncDescMap(raw_ostream &OS); diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index 65436dc74c3e..7165a2982d1b 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -580,6 +580,9 @@ public: bool isSuperOrSubRegisterEq(MCRegister RegA, MCRegister RegB) const { return isSubRegisterEq(RegA, RegB) || isSuperRegister(RegA, RegB); } + + /// Returns true if the two registers are equal or alias each other. + bool regsOverlap(MCRegister RegA, MCRegister RegB) const; }; //===----------------------------------------------------------------------===// @@ -698,6 +701,11 @@ public: // unit, we can allow a 0 differential here. advance(); } + + MCRegUnitIterator &operator++() { + MCRegisterInfo::DiffListIterator::operator++(); + return *this; + } }; /// MCRegUnitMaskIterator enumerates a list of register units and their diff --git a/llvm/include/llvm/MC/MCSPIRVObjectWriter.h b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h new file mode 100644 index 000000000000..a8baf96b8384 --- /dev/null +++ b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h @@ -0,0 +1,40 @@ +//===-- llvm/MC/MCSPIRVObjectWriter.h - SPIR-V Object Writer -----*- C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCSPIRVOBJECTWRITER_H +#define LLVM_MC_MCSPIRVOBJECTWRITER_H + +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/Support/raw_ostream.h" +#include <memory> + +namespace llvm { + +class MCSPIRVObjectTargetWriter : public MCObjectTargetWriter { +protected: + explicit MCSPIRVObjectTargetWriter() {} + +public: + Triple::ObjectFormatType getFormat() const override { return Triple::SPIRV; } + static bool classof(const MCObjectTargetWriter *W) { + return W->getFormat() == Triple::SPIRV; + } +}; + +/// Construct a new SPIR-V writer instance. +/// +/// \param MOTW - The target specific SPIR-V writer subclass. +/// \param OS - The stream to write to. +/// \returns The constructed object writer. +std::unique_ptr<MCObjectWriter> +createSPIRVObjectWriter(std::unique_ptr<MCSPIRVObjectTargetWriter> MOTW, + raw_pwrite_stream &OS); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/MC/MCSPIRVStreamer.h b/llvm/include/llvm/MC/MCSPIRVStreamer.h new file mode 100644 index 000000000000..7366e0a9d82c --- /dev/null +++ b/llvm/include/llvm/MC/MCSPIRVStreamer.h @@ -0,0 +1,50 @@ +//===- MCSPIRVStreamer.h - MCStreamer SPIR-V Object File Interface -*- C++ ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Overrides MCObjectStreamer to disable all unnecessary features with stubs.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCSPIRVSTREAMER_H +#define LLVM_MC_MCSPIRVSTREAMER_H + +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCObjectWriter.h" + +namespace llvm { +class MCAssembler; +class MCExpr; +class MCInst; +class raw_ostream; + +class MCSPIRVStreamer : public MCObjectStreamer { +public: + MCSPIRVStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCObjectStreamer(Context, std::move(TAB), std::move(OW), + std::move(Emitter)) {} + + bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override { + return false; + } + void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override {} + void emitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr, + uint64_t Size = 0, unsigned ByteAlignment = 0, + SMLoc Loc = SMLoc()) override {} + +private: + void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override; +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index 4335092f0920..2f7e17123c19 100644 --- a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -46,7 +46,9 @@ public: SV_GOFF, SV_MachO, SV_Wasm, - SV_XCOFF + SV_XCOFF, + SV_SPIRV, + SV_DXContainer, }; /// Express the state of bundle locked groups while emitting code. @@ -184,13 +186,13 @@ public: void dump() const; - virtual void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + virtual void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const = 0; /// Return true if a .align directive should use "optimized nops" to fill /// instead of 0s. - virtual bool UseCodeAlign() const = 0; + virtual bool useCodeAlign() const = 0; /// Check whether this section is "virtual", that is has no actual object /// file contents. diff --git a/llvm/include/llvm/MC/MCSectionCOFF.h b/llvm/include/llvm/MC/MCSectionCOFF.h index 3ece6eb904bc..373863e21ff0 100644 --- a/llvm/include/llvm/MC/MCSectionCOFF.h +++ b/llvm/include/llvm/MC/MCSectionCOFF.h @@ -61,7 +61,7 @@ private: public: /// Decides whether a '.section' directive should be printed before the /// section name - bool ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; + bool shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; unsigned getCharacteristics() const { return Characteristics; } MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; } @@ -69,10 +69,10 @@ public: void setSelection(int Selection) const; - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override; - bool UseCodeAlign() const override; + bool useCodeAlign() const override; bool isVirtualSection() const override; StringRef getVirtualSectionKind() const override; diff --git a/llvm/include/llvm/MC/MCSectionDXContainer.h b/llvm/include/llvm/MC/MCSectionDXContainer.h new file mode 100644 index 000000000000..014684a93529 --- /dev/null +++ b/llvm/include/llvm/MC/MCSectionDXContainer.h @@ -0,0 +1,38 @@ +//===- MCSectionDXContainer.h - DXContainer MC Sections ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the MCSectionDXContainer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCSECTIONDXCONTAINER_H +#define LLVM_MC_MCSECTIONDXCONTAINER_H + +#include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" + +namespace llvm { + +class MCSymbol; + +class MCSectionDXContainer final : public MCSection { + friend class MCContext; + + MCSectionDXContainer(StringRef Name, SectionKind K, MCSymbol *Begin) + : MCSection(SV_DXContainer, Name, K, Begin) {} + +public: + void printSwitchToSection(const MCAsmInfo &, const Triple &, raw_ostream &, + const MCExpr *) const override; + bool useCodeAlign() const override { return false; } + bool isVirtualSection() const override { return false; } +}; + +} // end namespace llvm + +#endif // LLVM_MC_MCSECTIONDXCONTAINER_H diff --git a/llvm/include/llvm/MC/MCSectionELF.h b/llvm/include/llvm/MC/MCSectionELF.h index 8b17df25a158..3b5239394493 100644 --- a/llvm/include/llvm/MC/MCSectionELF.h +++ b/llvm/include/llvm/MC/MCSectionELF.h @@ -21,8 +21,6 @@ namespace llvm { -class MCSymbol; - /// This represents a section on linux, lots of unix variants and some bare /// metal systems. class MCSectionELF final : public MCSection { @@ -69,7 +67,7 @@ private: public: /// Decides whether a '.section' directive should be printed before the /// section name - bool ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; + bool shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; unsigned getType() const { return Type; } unsigned getFlags() const { return Flags; } @@ -78,10 +76,10 @@ public: const MCSymbolELF *getGroup() const { return Group.getPointer(); } bool isComdat() const { return Group.getInt(); } - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override; - bool UseCodeAlign() const override; + bool useCodeAlign() const override; bool isVirtualSection() const override; StringRef getVirtualSectionKind() const override; diff --git a/llvm/include/llvm/MC/MCSectionGOFF.h b/llvm/include/llvm/MC/MCSectionGOFF.h index 4ba7f79f9696..d866329461ce 100644 --- a/llvm/include/llvm/MC/MCSectionGOFF.h +++ b/llvm/include/llvm/MC/MCSectionGOFF.h @@ -15,6 +15,7 @@ #ifndef LLVM_MC_MCSECTIONGOFF_H #define LLVM_MC_MCSECTIONGOFF_H +#include "llvm/BinaryFormat/GOFF.h" #include "llvm/MC/MCSection.h" #include "llvm/Support/raw_ostream.h" @@ -24,21 +25,27 @@ class MCExpr; class MCSectionGOFF final : public MCSection { private: + MCSection *Parent; + const MCExpr *SubsectionId; + friend class MCContext; - MCSectionGOFF(StringRef Name, SectionKind K) - : MCSection(SV_GOFF, Name, K, nullptr) {} + MCSectionGOFF(StringRef Name, SectionKind K, MCSection *P, const MCExpr *Sub) + : MCSection(SV_GOFF, Name, K, nullptr), Parent(P), SubsectionId(Sub) {} public: - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override { OS << "\t.section\t\"" << getName() << "\"\n"; } - bool UseCodeAlign() const override { return false; } + bool useCodeAlign() const override { return false; } bool isVirtualSection() const override { return false; } + MCSection *getParent() const { return Parent; 
} + const MCExpr *getSubsectionId() const { return SubsectionId; } + static bool classof(const MCSection *S) { return S->getVariant() == SV_GOFF; } }; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCSectionMachO.h b/llvm/include/llvm/MC/MCSectionMachO.h index bf8940524e5a..fdf1773d4002 100644 --- a/llvm/include/llvm/MC/MCSectionMachO.h +++ b/llvm/include/llvm/MC/MCSectionMachO.h @@ -68,10 +68,10 @@ public: bool &TAAParsed, // Out. unsigned &StubSize); // Out. - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override; - bool UseCodeAlign() const override; + bool useCodeAlign() const override; bool isVirtualSection() const override; static bool classof(const MCSection *S) { diff --git a/llvm/include/llvm/MC/MCSectionSPIRV.h b/llvm/include/llvm/MC/MCSectionSPIRV.h new file mode 100644 index 000000000000..6534599d2091 --- /dev/null +++ b/llvm/include/llvm/MC/MCSectionSPIRV.h @@ -0,0 +1,41 @@ +//===- MCSectionSPIRV.h - SPIR-V Machine Code Sections ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the MCSectionSPIRV class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCSECTIONSPIRV_H +#define LLVM_MC_MCSECTIONSPIRV_H + +#include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" + +namespace llvm { + +class MCSymbol; + +class MCSectionSPIRV final : public MCSection { + friend class MCContext; + + MCSectionSPIRV(SectionKind K, MCSymbol *Begin) + : MCSection(SV_SPIRV, "", K, Begin) {} + // TODO: Add StringRef Name to MCSectionSPIRV. 
+ +public: + ~MCSectionSPIRV() = default; + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + raw_ostream &OS, + const MCExpr *Subsection) const override {} + bool useCodeAlign() const override { return false; } + bool isVirtualSection() const override { return false; } +}; + +} // end namespace llvm + +#endif // LLVM_MC_MCSECTIONSPIRV_H diff --git a/llvm/include/llvm/MC/MCSectionWasm.h b/llvm/include/llvm/MC/MCSectionWasm.h index f34dd6b3507c..579f92a75056 100644 --- a/llvm/include/llvm/MC/MCSectionWasm.h +++ b/llvm/include/llvm/MC/MCSectionWasm.h @@ -58,10 +58,10 @@ public: const MCSymbolWasm *getGroup() const { return Group; } unsigned getSegmentFlags() const { return SegmentFlags; } - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override; - bool UseCodeAlign() const override; + bool useCodeAlign() const override; bool isVirtualSection() const override; bool isWasmData() const { diff --git a/llvm/include/llvm/MC/MCSectionXCOFF.h b/llvm/include/llvm/MC/MCSectionXCOFF.h index 1dafdd3ac500..95332647c9be 100644 --- a/llvm/include/llvm/MC/MCSectionXCOFF.h +++ b/llvm/include/llvm/MC/MCSectionXCOFF.h @@ -38,6 +38,7 @@ class MCSectionXCOFF final : public MCSection { Optional DwarfSubtypeFlags; bool MultiSymbolsAllowed; static constexpr unsigned DefaultAlignVal = 4; + static constexpr unsigned DefaultTextAlignVal = 32; MCSectionXCOFF(StringRef Name, XCOFF::StorageMappingClass SMC, XCOFF::SymbolType ST, SectionKind K, MCSymbolXCOFF *QualName, @@ -57,9 +58,14 @@ class MCSectionXCOFF final : public MCSection { QualName->setRepresentedCsect(this); QualName->setStorageClass(XCOFF::C_HIDEXT); - // A csect is 4 byte aligned by default, except for undefined symbol csects. - if (ST != XCOFF::XTY_ER) - setAlignment(Align(DefaultAlignVal)); + if (ST != XCOFF::XTY_ER) { + // For a csect for program code, set the alignment to 32 bytes by default. + // For other csects, set the alignment to 4 bytes by default. + if (SMC == XCOFF::XMC_PR) + setAlignment(Align(DefaultTextAlignVal)); + else + setAlignment(Align(DefaultAlignVal)); + } } MCSectionXCOFF(StringRef Name, SectionKind K, MCSymbolXCOFF *QualName, @@ -74,9 +80,8 @@ class MCSectionXCOFF final : public MCSection { // FIXME: use a more meaningful name for non csect sections. QualName->setRepresentedCsect(this); - // Set default alignment 4 for all non csect sections for now. - // FIXME: set different alignments according to section types. - setAlignment(Align(DefaultAlignVal)); + // Use default text alignment as the alignment for DWARF sections. 
+ setAlignment(Align(DefaultTextAlignVal)); } void printCsectDirective(raw_ostream &OS) const; @@ -95,24 +100,28 @@ public: XCOFF::StorageClass getStorageClass() const { return QualName->getStorageClass(); } + XCOFF::VisibilityType getVisibilityType() const { + return QualName->getVisibilityType(); + } XCOFF::SymbolType getCSectType() const { assert(isCsect() && "Only csect section has symbol type property!"); return CsectProp->Type; } MCSymbolXCOFF *getQualNameSymbol() const { return QualName; } - void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, + void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const override; - bool UseCodeAlign() const override; + bool useCodeAlign() const override; bool isVirtualSection() const override; StringRef getSymbolTableName() const { return SymbolTableName; } bool isMultiSymbolsAllowed() const { return MultiSymbolsAllowed; } - bool isCsect() const { return CsectProp.hasValue(); } - bool isDwarfSect() const { return DwarfSubtypeFlags.hasValue(); } + bool isCsect() const { return CsectProp.has_value(); } + bool isDwarfSect() const { return DwarfSubtypeFlags.has_value(); } Optional getDwarfSubtypeFlags() const { return DwarfSubtypeFlags; } + Optional getCsectProp() const { return CsectProp; } }; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 3d6c512bfe73..e71014b8cccf 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -13,22 +13,20 @@ #ifndef LLVM_MC_MCSTREAMER_H #define LLVM_MC_MCSTREAMER_H -#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCPseudoProbe.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCWinEH.h" +#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/Error.h" #include "llvm/Support/MD5.h" #include "llvm/Support/SMLoc.h" -#include "llvm/Support/ARMTargetParser.h" -#include "llvm/Support/TargetParser.h" #include "llvm/Support/VersionTuple.h" #include #include @@ -39,20 +37,24 @@ namespace llvm { +class APInt; class AssemblerConstantPools; class MCAsmBackend; +class MCAssembler; class MCContext; -struct MCDwarfFrameInfo; class MCExpr; +class MCFragment; class MCInst; class MCInstPrinter; class MCRegister; class MCSection; class MCStreamer; -class MCSymbolRefExpr; class MCSubtargetInfo; -class raw_ostream; +class MCSymbol; +class MCSymbolRefExpr; +class Triple; class Twine; +class raw_ostream; namespace codeview { struct DefRangeRegisterRelHeader; @@ -111,7 +113,7 @@ public: /// Update streamer for a new active section. /// - /// This is called by PopSection and SwitchSection, if the current + /// This is called by popSection and switchSection, if the current /// section changes. 
virtual void changeSection(const MCSection *CurSection, MCSection *Section,
                           const MCExpr *SubSection, raw_ostream &OS);
@@ -163,12 +165,23 @@ public:
   virtual void finishAttributeSection();
   virtual void emitInst(uint32_t Inst, char Suffix = '\0');
 
-  virtual void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE);
+  virtual void annotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE);
 
   virtual void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value);
 
   void emitConstantPools() override;
 
+  virtual void emitARMWinCFIAllocStack(unsigned Size, bool Wide);
+  virtual void emitARMWinCFISaveRegMask(unsigned Mask, bool Wide);
+  virtual void emitARMWinCFISaveSP(unsigned Reg);
+  virtual void emitARMWinCFISaveFRegs(unsigned First, unsigned Last);
+  virtual void emitARMWinCFISaveLR(unsigned Offset);
+  virtual void emitARMWinCFIPrologEnd(bool Fragment);
+  virtual void emitARMWinCFINop(bool Wide);
+  virtual void emitARMWinCFIEpilogStart(unsigned Condition);
+  virtual void emitARMWinCFIEpilogEnd();
+  virtual void emitARMWinCFICustom(unsigned Opcode);
+
   /// Reset any state between object emissions, i.e. the equivalent of
   /// MCStreamer's reset method.
   virtual void reset();
@@ -215,7 +228,7 @@ class MCStreamer {
   DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
 
   /// This is a stack of current and previous section values saved by
-  /// PushSection.
+  /// pushSection.
   SmallVector<std::pair<MCSectionSubPair, MCSectionSubPair>, 4> SectionStack;
 
   /// Pointer to the parser's SMLoc if available. This is used to provide
@@ -247,9 +260,9 @@ protected:
     return CurrentWinFrameInfo;
   }
 
-  virtual void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame);
+  virtual void emitWindowsUnwindTables(WinEH::FrameInfo *Frame);
 
-  virtual void EmitWindowsUnwindTables();
+  virtual void emitWindowsUnwindTables();
 
   virtual void emitRawTextImpl(StringRef String);
 
@@ -344,7 +357,7 @@ public:
   /// Return a raw_ostream that comments can be written to. Unlike
   /// AddComment, you are required to terminate comments with \n if you use this
   /// method.
-  virtual raw_ostream &GetCommentOS();
+  virtual raw_ostream &getCommentOS();
 
   /// Print T and prefix it with the comment string (normally #) and
   /// optionally a tab. This prints the comment immediately, not at the end of
@@ -359,8 +372,8 @@ public:
   /// Emit added explicit comments.
   virtual void emitExplicitComments();
 
-  /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
-  virtual void AddBlankLine() {}
+  /// Emit a blank line to a .s file to pretty it up.
+  virtual void addBlankLine() {}
 
   /// @}
 
@@ -384,18 +397,18 @@ public:
   /// Returns an index to represent the order a symbol was emitted in.
   /// (zero if we did not emit that symbol)
-  unsigned GetSymbolOrder(const MCSymbol *Sym) const {
+  unsigned getSymbolOrder(const MCSymbol *Sym) const {
     return SymbolOrdering.lookup(Sym);
   }
 
   /// Update streamer for a new active section.
   ///
-  /// This is called by PopSection and SwitchSection, if the current
+  /// This is called by popSection and switchSection, if the current
   /// section changes.
   virtual void changeSection(MCSection *, const MCExpr *);
 
   /// Save the current and previous section on the section stack.
-  void PushSection() {
+  void pushSection() {
     SectionStack.push_back(
         std::make_pair(getCurrentSection(), getPreviousSection()));
   }
 
@@ -404,7 +417,7 @@ public:
   /// Calls changeSection as needed.
   ///
   /// Returns false if the stack was empty.
-  bool PopSection() {
+  bool popSection() {
     if (SectionStack.size() <= 1)
       return false;
     auto I = SectionStack.end();
@@ -419,11 +432,11 @@ public:
     return true;
   }
 
-  bool SubSection(const MCExpr *Subsection) {
+  bool subSection(const MCExpr *Subsection) {
     if (SectionStack.empty())
       return false;
 
-    SwitchSection(SectionStack.back().first.first, Subsection);
+    switchSection(SectionStack.back().first.first, Subsection);
     return true;
   }
 
@@ -431,13 +444,13 @@ public:
   /// is required to update CurSection.
   ///
   /// This corresponds to assembler directives like .section, .text, etc.
-  virtual void SwitchSection(MCSection *Section,
+  virtual void switchSection(MCSection *Section,
                              const MCExpr *Subsection = nullptr);
 
   /// Set the current section where code is being emitted to \p Section.
   /// This is required to update CurSection. This version does not call
   /// changeSection.
-  void SwitchSectionNoChange(MCSection *Section,
+  void switchSectionNoChange(MCSection *Section,
                              const MCExpr *Subsection = nullptr) {
     assert(Section && "Cannot switch to a null section!");
     MCSectionSubPair curSection = SectionStack.back().first;
@@ -455,7 +468,7 @@ public:
   ///
   /// Each emitted symbol will be tracked in the ordering table,
   /// so we can sort on them later.
-  void AssignFragment(MCSymbol *Symbol, MCFragment *Fragment);
+  void assignFragment(MCSymbol *Symbol, MCFragment *Fragment);
 
   /// Returns the mnemonic for \p MI, if the streamer has access to an
   /// instruction printer and returns an empty string otherwise.
@@ -550,40 +563,40 @@ public:
   /// Start emitting COFF symbol definition
   ///
   /// \param Symbol - The symbol to have its External & Type fields set.
-  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
+  virtual void beginCOFFSymbolDef(const MCSymbol *Symbol);
 
   /// Emit the storage class of the symbol.
   ///
   /// \param StorageClass - The storage class the symbol should have.
-  virtual void EmitCOFFSymbolStorageClass(int StorageClass);
+  virtual void emitCOFFSymbolStorageClass(int StorageClass);
 
   /// Emit the type of the symbol.
   ///
   /// \param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h)
-  virtual void EmitCOFFSymbolType(int Type);
+  virtual void emitCOFFSymbolType(int Type);
 
   /// Marks the end of the symbol definition.
-  virtual void EndCOFFSymbolDef();
+  virtual void endCOFFSymbolDef();
 
-  virtual void EmitCOFFSafeSEH(MCSymbol const *Symbol);
+  virtual void emitCOFFSafeSEH(MCSymbol const *Symbol);
 
   /// Emits the symbol table index of a Symbol into the current section.
-  virtual void EmitCOFFSymbolIndex(MCSymbol const *Symbol);
+  virtual void emitCOFFSymbolIndex(MCSymbol const *Symbol);
 
   /// Emits a COFF section index.
   ///
   /// \param Symbol - Symbol the section number relocation should point to.
-  virtual void EmitCOFFSectionIndex(MCSymbol const *Symbol);
+  virtual void emitCOFFSectionIndex(MCSymbol const *Symbol);
 
   /// Emits a COFF section relative relocation.
   ///
   /// \param Symbol - Symbol the section relative relocation should point to.
-  virtual void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset);
+  virtual void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset);
 
   /// Emits a COFF image relative relocation.
   ///
   /// \param Symbol - Symbol the image relative relocation should point to.
-  virtual void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset);
+  virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset);
 
   /// Emits an lcomm directive with XCOFF csect information.
   ///
@@ -615,6 +628,12 @@ public:
   /// changed at the end of assembly.
virtual void emitXCOFFRenameDirective(const MCSymbol *Name,
                                       StringRef Rename);
 
+  /// Emit an XCOFF .ref directive which creates an R_REF type entry in the
+  /// relocation table for one or more symbols.
+  ///
+  /// \param Sym - The symbol on the .ref directive.
+  virtual void emitXCOFFRefDirective(StringRef Sym);
+
   /// Emit an ELF .size directive.
   ///
   /// This corresponds to an assembler statement such as:
@@ -907,6 +926,7 @@ public:
                                unsigned CUID = 0);
 
   virtual void emitCFIBKeyFrame();
+  virtual void emitCFIMTETaggedFrame();
 
   /// This implements the DWARF2 '.loc fileno lineno ...' assembler
   /// directive.
@@ -918,16 +938,16 @@ public:
   /// Associate a filename with a specified logical file number, and also
   /// specify that file's checksum information. This implements the '.cv_file 4
   /// "foo.c"' assembler directive. Returns true on success.
-  virtual bool EmitCVFileDirective(unsigned FileNo, StringRef Filename,
+  virtual bool emitCVFileDirective(unsigned FileNo, StringRef Filename,
                                    ArrayRef<uint8_t> Checksum,
                                    unsigned ChecksumKind);
 
   /// Introduces a function id for use with .cv_loc.
-  virtual bool EmitCVFuncIdDirective(unsigned FunctionId);
+  virtual bool emitCVFuncIdDirective(unsigned FunctionId);
 
   /// Introduces an inline call site id for use with .cv_loc. Includes
   /// extra information for inline line table generation.
-  virtual bool EmitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc,
+  virtual bool emitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc,
                                            unsigned IAFile, unsigned IALine,
                                            unsigned IACol, SMLoc Loc);
 
@@ -983,7 +1003,7 @@ public:
   virtual void emitCVFileChecksumOffsetDirective(unsigned FileNo) {}
 
   /// This implements the CodeView '.cv_fpo_data' assembler directive.
-  virtual void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc = {}) {}
+  virtual void emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc = {}) {}
 
   /// Emit the absolute difference between two symbols.
   ///
@@ -1022,28 +1042,28 @@ public:
   virtual void emitCFIWindowSave();
   virtual void emitCFINegateRAState();
 
-  virtual void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc());
-  virtual void EmitWinCFIEndProc(SMLoc Loc = SMLoc());
+  virtual void emitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc());
+  virtual void emitWinCFIEndProc(SMLoc Loc = SMLoc());
 
   /// This is used on platforms, such as Windows on ARM64, that require function
   /// or funclet sizes to be emitted in .xdata before the End marker is emitted
   /// for the frame. We cannot use the End marker, as it is not set at the
   /// point of emitting .xdata, in order to indicate that the frame is active.
- virtual void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc = SMLoc()); - virtual void EmitWinCFIStartChained(SMLoc Loc = SMLoc()); - virtual void EmitWinCFIEndChained(SMLoc Loc = SMLoc()); - virtual void EmitWinCFIPushReg(MCRegister Register, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISetFrame(MCRegister Register, unsigned Offset, + virtual void emitWinCFIFuncletOrFuncEnd(SMLoc Loc = SMLoc()); + virtual void emitWinCFIStartChained(SMLoc Loc = SMLoc()); + virtual void emitWinCFIEndChained(SMLoc Loc = SMLoc()); + virtual void emitWinCFIPushReg(MCRegister Register, SMLoc Loc = SMLoc()); + virtual void emitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); - virtual void EmitWinCFIAllocStack(unsigned Size, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISaveReg(MCRegister Register, unsigned Offset, + virtual void emitWinCFIAllocStack(unsigned Size, SMLoc Loc = SMLoc()); + virtual void emitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, + virtual void emitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); - virtual void EmitWinCFIPushFrame(bool Code, SMLoc Loc = SMLoc()); - virtual void EmitWinCFIEndProlog(SMLoc Loc = SMLoc()); - virtual void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, + virtual void emitWinCFIPushFrame(bool Code, SMLoc Loc = SMLoc()); + virtual void emitWinCFIEndProlog(SMLoc Loc = SMLoc()); + virtual void emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc = SMLoc()); - virtual void EmitWinEHHandlerData(SMLoc Loc = SMLoc()); + virtual void emitWinEHHandlerData(SMLoc Loc = SMLoc()); virtual void emitCGProfileEntry(const MCSymbolRefExpr *From, const MCSymbolRefExpr *To, uint64_t Count); @@ -1099,7 +1119,7 @@ public: /// Streamer specific finalization. virtual void finishImpl(); /// Finish emission of machine code. 
- void Finish(SMLoc EndLoc = SMLoc()); + void finish(SMLoc EndLoc = SMLoc()); virtual bool mayHaveInstructions(MCSection &Sec) const { return true; } diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h index 839a3bd85829..e1f0a86141e3 100644 --- a/llvm/include/llvm/MC/MCSubtargetInfo.h +++ b/llvm/include/llvm/MC/MCSubtargetInfo.h @@ -14,12 +14,13 @@ #define LLVM_MC_MCSUBTARGETINFO_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCSchedule.h" #include "llvm/MC/SubtargetFeature.h" -#include #include #include #include diff --git a/llvm/include/llvm/MC/MCSymbol.h b/llvm/include/llvm/MC/MCSymbol.h index d8fc4505d446..91ef6ee31d8d 100644 --- a/llvm/include/llvm/MC/MCSymbol.h +++ b/llvm/include/llvm/MC/MCSymbol.h @@ -14,7 +14,7 @@ #define LLVM_MC_MCSYMBOL_H #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringMapEntry.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFragment.h" diff --git a/llvm/include/llvm/MC/MCSymbolWasm.h b/llvm/include/llvm/MC/MCSymbolWasm.h index 5a4852e0e895..5eab32cb5c12 100644 --- a/llvm/include/llvm/MC/MCSymbolWasm.h +++ b/llvm/include/llvm/MC/MCSymbolWasm.h @@ -86,9 +86,9 @@ public: bool omitFromLinkingSection() const { return OmitFromLinkingSection; } void setOmitFromLinkingSection() { OmitFromLinkingSection = true; } - bool hasImportModule() const { return ImportModule.hasValue(); } + bool hasImportModule() const { return ImportModule.has_value(); } StringRef getImportModule() const { - if (ImportModule.hasValue()) + if (ImportModule) return ImportModule.getValue(); // Use a default module name of "env" for now, for compatibility with // existing tools. 
@@ -98,15 +98,15 @@ public:
   }
   void setImportModule(StringRef Name) { ImportModule = Name; }
 
-  bool hasImportName() const { return ImportName.hasValue(); }
+  bool hasImportName() const { return ImportName.has_value(); }
   StringRef getImportName() const {
-    if (ImportName.hasValue())
+    if (ImportName)
       return ImportName.getValue();
     return getName();
   }
   void setImportName(StringRef Name) { ImportName = Name; }
 
-  bool hasExportName() const { return ExportName.hasValue(); }
+  bool hasExportName() const { return ExportName.has_value(); }
   StringRef getExportName() const { return ExportName.getValue(); }
   void setExportName(StringRef Name) { ExportName = Name; }
 
@@ -129,12 +129,12 @@ public:
   void setSignature(wasm::WasmSignature *Sig) { Signature = Sig; }
 
   const wasm::WasmGlobalType &getGlobalType() const {
-    assert(GlobalType.hasValue());
+    assert(GlobalType);
     return GlobalType.getValue();
   }
   void setGlobalType(wasm::WasmGlobalType GT) { GlobalType = GT; }
 
-  bool hasTableType() const { return TableType.hasValue(); }
+  bool hasTableType() const { return TableType.has_value(); }
   const wasm::WasmTableType &getTableType() const {
     assert(hasTableType());
     return TableType.getValue();
diff --git a/llvm/include/llvm/MC/MCSymbolXCOFF.h b/llvm/include/llvm/MC/MCSymbolXCOFF.h
index 752e1e7bba0f..2ec265e66300 100644
--- a/llvm/include/llvm/MC/MCSymbolXCOFF.h
+++ b/llvm/include/llvm/MC/MCSymbolXCOFF.h
@@ -39,8 +39,7 @@ public:
   };
 
   XCOFF::StorageClass getStorageClass() const {
-    assert(StorageClass.hasValue() &&
-           "StorageClass not set on XCOFF MCSymbol.");
+    assert(StorageClass && "StorageClass not set on XCOFF MCSymbol.");
     return StorageClass.getValue();
   }
 
diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h
index db50dc6749e2..9c906cdc90d0 100644
--- a/llvm/include/llvm/MC/MCTargetOptions.h
+++ b/llvm/include/llvm/MC/MCTargetOptions.h
@@ -31,6 +31,12 @@ enum class DebugCompressionType {
   Z,    ///< zlib style compression
 };
 
+enum class EmitDwarfUnwindType {
+  Always,          // Always emit dwarf unwind
+  NoCompactUnwind, // Only emit if compact unwind isn't available
+  Default,         // Default behavior is based on the target
+};
+
 class StringRef;
 
 class MCTargetOptions {
@@ -47,7 +53,6 @@ public:
   bool MCNoDeprecatedWarn : 1;
   bool MCNoTypeCheck : 1;
   bool MCSaveTempLabels : 1;
-  bool MCUseDwarfDirectory : 1;
   bool MCIncrementalLinkerCompatible : 1;
   bool ShowMCEncoding : 1;
   bool ShowMCInst : 1;
@@ -57,8 +62,22 @@ public:
   bool PreserveAsmComments : 1;
   bool Dwarf64 : 1;
 
+  EmitDwarfUnwindType EmitDwarfUnwind;
+
   int DwarfVersion = 0;
 
+  enum DwarfDirectory {
+    // Force disable
+    DisableDwarfDirectory,
+    // Force enable, for assemblers that support
+    // `.file fileno directory filename' syntax
+    EnableDwarfDirectory,
+    // Default is based on the target
+    DefaultDwarfDirectory
+  };
+  DwarfDirectory MCUseDwarfDirectory;
+
   std::string ABIName;
   std::string AssemblyLanguage;
   std::string SplitDwarfFile;
diff --git a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
index 189484198916..d51e740177f7 100644
--- a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
+++ b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
@@ -20,6 +20,7 @@
 namespace llvm {
 
 class MCTargetOptions;
+enum class EmitDwarfUnwindType;
 
 namespace mc {
 
@@ -32,6 +33,8 @@ int getDwarfVersion();
 
 bool getDwarf64();
 
+EmitDwarfUnwindType getEmitDwarfUnwind();
+
 bool getShowMCInst();
 
 bool getFatalWarnings();
 
diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h
index 37feee4c9ea8..37265d72c9df 100644
--- a/llvm/include/llvm/MC/MCValue.h
+++ b/llvm/include/llvm/MC/MCValue.h
@@ -15,7 +15,6 @@
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/DataTypes.h"
-#include
 
 namespace llvm {
 class raw_ostream;
diff --git a/llvm/include/llvm/MC/MCWin64EH.h b/llvm/include/llvm/MC/MCWin64EH.h
index 065161d1759e..622a666b78dd 100644
--- a/llvm/include/llvm/MC/MCWin64EH.h
+++ b/llvm/include/llvm/MC/MCWin64EH.h
@@ -57,13 +57,19 @@ public:
                       bool HandlerData) const override;
 };
 
-class ARM64UnwindEmitter : public WinEH::UnwindEmitter {
+class ARMUnwindEmitter : public WinEH::UnwindEmitter {
 public:
   void Emit(MCStreamer &Streamer) const override;
   void EmitUnwindInfo(MCStreamer &Streamer, WinEH::FrameInfo *FI,
                       bool HandlerData) const override;
 };
 
+class ARM64UnwindEmitter : public WinEH::UnwindEmitter {
+public:
+  void Emit(MCStreamer &Streamer) const override;
+  void EmitUnwindInfo(MCStreamer &Streamer, WinEH::FrameInfo *FI,
+                      bool HandlerData) const override;
+};
 }
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index af1ed6faf753..0778c4d52c5e 100644
--- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -45,15 +45,15 @@ public:
   void emitThumbFunc(MCSymbol *Func) override;
   bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
   void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
-  void BeginCOFFSymbolDef(MCSymbol const *Symbol) override;
-  void EmitCOFFSymbolStorageClass(int StorageClass) override;
-  void EmitCOFFSymbolType(int Type) override;
-  void EndCOFFSymbolDef() override;
-  void EmitCOFFSafeSEH(MCSymbol const *Symbol) override;
-  void EmitCOFFSymbolIndex(MCSymbol const *Symbol) override;
-  void EmitCOFFSectionIndex(MCSymbol const *Symbol) override;
-  void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
-  void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
+  void beginCOFFSymbolDef(MCSymbol const *Symbol) override;
+  void emitCOFFSymbolStorageClass(int StorageClass) override;
+  void emitCOFFSymbolType(int Type) override;
+  void endCOFFSymbolDef() override;
+  void emitCOFFSafeSEH(MCSymbol const *Symbol) override;
+  void emitCOFFSymbolIndex(MCSymbol const *Symbol) override;
+  void emitCOFFSectionIndex(MCSymbol const *Symbol) override;
+  void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
+  void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
   void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                         unsigned ByteAlignment) override;
   void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -64,7 +64,7 @@ public:
   void emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
                       unsigned ByteAlignment) override;
   void emitIdent(StringRef IdentString) override;
-  void EmitWinEHHandlerData(SMLoc Loc) override;
+  void emitWinEHHandlerData(SMLoc Loc) override;
   void emitCGProfileEntry(const MCSymbolRefExpr *From,
                           const MCSymbolRefExpr *To, uint64_t Count) override;
   void finishImpl() override;
diff --git a/llvm/include/llvm/MC/MCWinEH.h b/llvm/include/llvm/MC/MCWinEH.h
index 5688255810d0..c16396ea5e71 100644
--- a/llvm/include/llvm/MC/MCWinEH.h
+++ b/llvm/include/llvm/MC/MCWinEH.h
@@ -50,11 +50,17 @@ struct FrameInfo {
   bool HandlesUnwind = false;
   bool HandlesExceptions = false;
   bool EmitAttempted = false;
+  bool Fragment = false;
 
   int LastFrameInst = -1;
   const FrameInfo *ChainedParent = nullptr;
   std::vector<Instruction> Instructions;
-  MapVector<MCSymbol *, std::vector<Instruction>> EpilogMap;
+  struct Epilog {
+    std::vector<Instruction> Instructions;
+    unsigned Condition;
+    MCSymbol *End;
+  };
+  MapVector<MCSymbol *, Epilog> EpilogMap;
 
   FrameInfo() = default;
   FrameInfo(const MCSymbol *Function, const MCSymbol *BeginFuncEHLabel)
@@ -68,7 +74,7 @@ struct FrameInfo {
     if (!Instructions.empty())
       return false;
     for (const auto &E : EpilogMap)
-      if (!E.second.empty())
+      if (!E.second.Instructions.empty())
         return false;
     return true;
   }
diff --git a/llvm/include/llvm/MC/MCXCOFFStreamer.h b/llvm/include/llvm/MC/MCXCOFFStreamer.h
index 5fc2efbe5284..3faa03fa69e9 100644
--- a/llvm/include/llvm/MC/MCXCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCXCOFFStreamer.h
@@ -32,6 +32,10 @@ public:
   void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol,
                                             MCSymbolAttr Linkage,
                                             MCSymbolAttr Visibility) override;
+  void emitXCOFFRefDirective(StringRef Name) override {
+    report_fatal_error("emitXCOFFRefDirective is not implemented yet on "
+                       "object generation path");
+  }
   void emitXCOFFRenameDirective(const MCSymbol *Name,
                                 StringRef Rename) override {
     report_fatal_error("emitXCOFFRenameDirective is not implemented yet on "
diff --git a/llvm/include/llvm/MC/SectionKind.h b/llvm/include/llvm/MC/SectionKind.h
index 0fd86cc457de..61e400fe9ede 100644
--- a/llvm/include/llvm/MC/SectionKind.h
+++ b/llvm/include/llvm/MC/SectionKind.h
@@ -24,6 +24,10 @@ class SectionKind {
     /// Metadata - Debug info sections or other metadata.
     Metadata,
 
+    /// Exclude - This section will be excluded from the final executable or
+    /// shared library. Only valid for ELF / COFF targets.
+    Exclude,
+
     /// Text - Text section, used for functions and other executable code.
     Text,
 
@@ -118,6 +122,8 @@ public:
 
   bool isMetadata() const { return K == Metadata; }
 
+  bool isExclude() const { return K == Exclude; }
+
   bool isText() const { return K == Text || K == ExecuteOnly; }
 
   bool isExecuteOnly() const { return K == ExecuteOnly; }
@@ -180,6 +186,7 @@ private:
 
 public:
 
   static SectionKind getMetadata() { return get(Metadata); }
+  static SectionKind getExclude() { return get(Exclude); }
   static SectionKind getText() { return get(Text); }
   static SectionKind getExecuteOnly() { return get(ExecuteOnly); }
   static SectionKind getReadOnly() { return get(ReadOnly); }
diff --git a/llvm/include/llvm/MC/StringTableBuilder.h b/llvm/include/llvm/MC/StringTableBuilder.h
index 3f9c91be05d3..42133f3f7726 100644
--- a/llvm/include/llvm/MC/StringTableBuilder.h
+++ b/llvm/include/llvm/MC/StringTableBuilder.h
@@ -85,7 +85,6 @@ public:
   void write(raw_ostream &OS) const;
   void write(uint8_t *Buf) const;
 
-private:
   bool isFinalized() const { return Finalized; }
 };
 
diff --git a/llvm/include/llvm/MC/SubtargetFeature.h b/llvm/include/llvm/MC/SubtargetFeature.h
index 032e2a7df1f2..799912d4bacb 100644
--- a/llvm/include/llvm/MC/SubtargetFeature.h
+++ b/llvm/include/llvm/MC/SubtargetFeature.h
@@ -17,11 +17,10 @@
 #ifndef LLVM_MC_SUBTARGETFEATURE_H
 #define LLVM_MC_SUBTARGETFEATURE_H
 
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MathExtras.h"
 #include
-#include
 #include
 #include
 #include
diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index da9a9269edbf..eeac559f81b1 100644
--- a/llvm/include/llvm/MC/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -27,7 +27,6 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include
 #include
 #include
 #include
@@ -56,13 +55,12 @@ class MCTargetAsmParser;
 class MCTargetOptions;
class MCTargetStreamer; class raw_ostream; -class raw_pwrite_stream; class TargetMachine; class TargetOptions; namespace mca { class CustomBehaviour; class InstrPostProcess; -class SourceMgr; +struct SourceMgr; } // namespace mca MCStreamer *createNullStreamer(MCContext &Ctx); @@ -111,6 +109,16 @@ MCStreamer *createXCOFFStreamer(MCContext &Ctx, std::unique_ptr &&OW, std::unique_ptr &&CE, bool RelaxAll); +MCStreamer *createSPIRVStreamer(MCContext &Ctx, + std::unique_ptr &&TAB, + std::unique_ptr &&OW, + std::unique_ptr &&CE, + bool RelaxAll); +MCStreamer *createDXContainerStreamer(MCContext &Ctx, + std::unique_ptr &&TAB, + std::unique_ptr &&OW, + std::unique_ptr &&CE, + bool RelaxAll); MCRelocationInfo *createMCRelocationInfo(const Triple &TT, MCContext &Ctx); @@ -177,7 +185,6 @@ public: const MCInstrInfo &MII, const MCRegisterInfo &MRI); using MCCodeEmitterCtorTy = MCCodeEmitter *(*)(const MCInstrInfo &II, - const MCRegisterInfo &MRI, MCContext &Ctx); using ELFStreamerCtorTy = MCStreamer *(*)(const Triple &T, MCContext &Ctx, @@ -204,6 +211,17 @@ public: std::unique_ptr &&TAB, std::unique_ptr &&OW, std::unique_ptr &&Emitter, bool RelaxAll); + using SPIRVStreamerCtorTy = + MCStreamer *(*)(const Triple &T, MCContext &Ctx, + std::unique_ptr &&TAB, + std::unique_ptr &&OW, + std::unique_ptr &&Emitter, bool RelaxAll); + + using DXContainerStreamerCtorTy = + MCStreamer *(*)(const Triple &T, MCContext &Ctx, + std::unique_ptr &&TAB, + std::unique_ptr &&OW, + std::unique_ptr &&Emitter, bool RelaxAll); using NullTargetStreamerCtorTy = MCTargetStreamer *(*)(MCStreamer &S); using AsmTargetStreamerCtorTy = MCTargetStreamer *(*)( @@ -305,6 +323,8 @@ private: ELFStreamerCtorTy ELFStreamerCtorFn = nullptr; WasmStreamerCtorTy WasmStreamerCtorFn = nullptr; XCOFFStreamerCtorTy XCOFFStreamerCtorFn = nullptr; + SPIRVStreamerCtorTy SPIRVStreamerCtorFn = nullptr; + DXContainerStreamerCtorTy DXContainerStreamerCtorFn = nullptr; /// Construction function for this target's null TargetStreamer, if /// registered (default = nullptr). @@ -508,11 +528,10 @@ public: /// createMCCodeEmitter - Create a target specific code emitter. MCCodeEmitter *createMCCodeEmitter(const MCInstrInfo &II, - const MCRegisterInfo &MRI, MCContext &Ctx) const { if (!MCCodeEmitterCtorFn) return nullptr; - return MCCodeEmitterCtorFn(II, MRI, Ctx); + return MCCodeEmitterCtorFn(II, Ctx); } /// Create a target specific MCStreamer. 
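The TargetRegistry changes above drop the unused MCRegisterInfo parameter from MCCodeEmitterCtorTy and Target::createMCCodeEmitter, so every backend's emitter factory and every call site lose one argument. A minimal sketch of the new shape follows; HypotheticalMCCodeEmitter and getTheHypotheticalTarget() are illustrative stand-ins for a real backend, not part of this patch.

#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"

using namespace llvm;

namespace {
// Hypothetical do-nothing emitter, only here to give the factory a type.
class HypotheticalMCCodeEmitter : public MCCodeEmitter {
public:
  HypotheticalMCCodeEmitter(const MCInstrInfo &, MCContext &) {}
  void encodeInstruction(const MCInst &, raw_ostream &,
                         SmallVectorImpl<MCFixup> &,
                         const MCSubtargetInfo &) const override {}
};
} // namespace

// Assumed to exist in the backend's MCTargetDesc, as in any real target.
Target &getTheHypotheticalTarget();

// After this patch the factory matches the two-argument MCCodeEmitterCtorTy;
// the MCRegisterInfo parameter is gone.
static MCCodeEmitter *createHypotheticalMCCodeEmitter(const MCInstrInfo &MCII,
                                                      MCContext &Ctx) {
  return new HypotheticalMCCodeEmitter(MCII, Ctx);
}

extern "C" void LLVMInitializeHypotheticalTargetMC() {
  // Registration itself is unchanged; only the callback's arity differs.
  TargetRegistry::RegisterMCCodeEmitter(getTheHypotheticalTarget(),
                                        createHypotheticalMCCodeEmitter);
}

// Call sites shrink accordingly:
//   MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, Ctx);
//   (previously: TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx))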
@@ -576,6 +595,22 @@ public: S = createXCOFFStreamer(Ctx, std::move(TAB), std::move(OW), std::move(Emitter), RelaxAll); break; + case Triple::SPIRV: + if (SPIRVStreamerCtorFn) + S = SPIRVStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW), + std::move(Emitter), RelaxAll); + else + S = createSPIRVStreamer(Ctx, std::move(TAB), std::move(OW), + std::move(Emitter), RelaxAll); + break; + case Triple::DXContainer: + if (DXContainerStreamerCtorFn) + S = DXContainerStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW), + std::move(Emitter), RelaxAll); + else + S = createDXContainerStreamer(Ctx, std::move(TAB), std::move(OW), + std::move(Emitter), RelaxAll); + break; } if (ObjectTargetStreamerCtorFn) ObjectTargetStreamerCtorFn(*S, STI); @@ -956,6 +991,14 @@ struct TargetRegistry { T.ELFStreamerCtorFn = Fn; } + static void RegisterSPIRVStreamer(Target &T, Target::SPIRVStreamerCtorTy Fn) { + T.SPIRVStreamerCtorFn = Fn; + } + + static void RegisterDXContainerStreamer(Target &T, Target::DXContainerStreamerCtorTy Fn) { + T.DXContainerStreamerCtorFn = Fn; + } + static void RegisterWasmStreamer(Target &T, Target::WasmStreamerCtorTy Fn) { T.WasmStreamerCtorFn = Fn; } @@ -1362,7 +1405,6 @@ template struct RegisterMCCodeEmitter { private: static MCCodeEmitter *Allocator(const MCInstrInfo & /*II*/, - const MCRegisterInfo & /*MRI*/, MCContext & /*Ctx*/) { return new MCCodeEmitterImpl(); } diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h index c4be5312ea19..527dc766b739 100644 --- a/llvm/include/llvm/MCA/CustomBehaviour.h +++ b/llvm/include/llvm/MCA/CustomBehaviour.h @@ -49,6 +49,11 @@ public: /// scheduling model. virtual void postProcessInstruction(std::unique_ptr &Inst, const MCInst &MCI) {} + + // The resetState() method gets invoked at the beginning of each code region + // so that targets that override this function can clear any state that they + // have left from the previous code region. + virtual void resetState() {} }; /// Class which can be overriden by targets to enforce instruction diff --git a/llvm/include/llvm/MCA/IncrementalSourceMgr.h b/llvm/include/llvm/MCA/IncrementalSourceMgr.h new file mode 100644 index 000000000000..d91cc5f23311 --- /dev/null +++ b/llvm/include/llvm/MCA/IncrementalSourceMgr.h @@ -0,0 +1,92 @@ +//===---------------- IncrementalSourceMgr.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains IncrementalSourceMgr, an implementation of SourceMgr +/// that allows users to add new instructions incrementally / dynamically. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCA_INCREMENTALSOURCEMGR_H +#define LLVM_MCA_INCREMENTALSOURCEMGR_H + +#include "llvm/MCA/SourceMgr.h" +#include + +namespace llvm { +namespace mca { + +/// An implementation of \a SourceMgr that allows users to add new instructions +/// incrementally / dynamically. +/// Note that this SourceMgr takes ownership of all \a mca::Instruction. +class IncrementalSourceMgr : public SourceMgr { + /// Owner of all mca::Instruction instances. 
Note that we use std::deque here
+  /// to get better throughput than std::vector or llvm::SmallVector, which
+  /// usually pay a higher re-allocation cost when there is a large number
+  /// of instructions.
+  std::deque<UniqueInst> InstStorage;
+
+  /// Instructions that are ready to be used. Each of them is a pointer to an
+  /// \a UniqueInst inside InstStorage.
+  std::deque<Instruction *> Staging;
+
+  /// Current instruction index.
+  unsigned TotalCounter;
+
+  /// End-of-stream flag.
+  bool EOS;
+
+  /// Called when an instruction is no longer needed.
+  using InstFreedCallback = llvm::function_ref<void(Instruction *)>;
+  InstFreedCallback InstFreedCB;
+
+public:
+  IncrementalSourceMgr() : TotalCounter(0U), EOS(false) {}
+
+  void clear();
+
+  /// Set a callback that is invoked when a mca::Instruction is
+  /// no longer needed. This is usually used for recycling the
+  /// instruction.
+  void setOnInstFreedCallback(InstFreedCallback CB) { InstFreedCB = CB; }
+
+  ArrayRef<UniqueInst> getInstructions() const override {
+    llvm_unreachable("Not applicable");
+  }
+
+  bool hasNext() const override { return !Staging.empty(); }
+  bool isEnd() const override { return EOS; }
+
+  SourceRef peekNext() const override {
+    assert(hasNext());
+    return SourceRef(TotalCounter, *Staging.front());
+  }
+
+  /// Add a new instruction.
+  void addInst(UniqueInst &&Inst) {
+    InstStorage.emplace_back(std::move(Inst));
+    Staging.push_back(InstStorage.back().get());
+  }
+
+  /// Add a recycled instruction.
+  void addRecycledInst(Instruction *Inst) { Staging.push_back(Inst); }
+
+  void updateNext() override;
+
+  /// Mark the end of the instruction stream.
+  void endOfStream() { EOS = true; }
+
+#ifndef NDEBUG
+  /// Print statistics about instruction recycling.
+  void printStatistic(raw_ostream &OS);
+#endif
+};
+
+} // end namespace mca
+} // end namespace llvm
+
+#endif // LLVM_MCA_INCREMENTALSOURCEMGR_H
diff --git a/llvm/include/llvm/MCA/InstrBuilder.h b/llvm/include/llvm/MCA/InstrBuilder.h
index 04b5cf590d70..92b92a515db9 100644
--- a/llvm/include/llvm/MCA/InstrBuilder.h
+++ b/llvm/include/llvm/MCA/InstrBuilder.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_MCA_INSTRBUILDER_H
 #define LLVM_MCA_INSTRBUILDER_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -25,6 +26,27 @@
 namespace llvm {
 namespace mca {
 
+class RecycledInstErr : public ErrorInfo<RecycledInstErr> {
+  Instruction *RecycledInst;
+
+public:
+  static char ID;
+
+  explicit RecycledInstErr(Instruction *Inst) : RecycledInst(Inst) {}
+  // Always need to carry an Instruction
+  RecycledInstErr() = delete;
+
+  Instruction *getInst() const { return RecycledInst; }
+
+  void log(raw_ostream &OS) const override {
+    OS << "Instruction is recycled\n";
+  }
+
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
 /// A builder class that knows how to construct Instruction objects.
 ///
 /// Every llvm-mca Instruction is described by an object of class InstrDesc.
@@ -48,6 +70,10 @@ class InstrBuilder {
   bool FirstCallInst;
   bool FirstReturnInst;
 
+  using InstRecycleCallback =
+      llvm::function_ref<Instruction *(const InstrDesc &)>;
+  InstRecycleCallback InstRecycleCB;
+
   Expected<const InstrDesc &> createInstrDescImpl(const MCInst &MCI);
   Expected<const InstrDesc &> getOrCreateInstrDesc(const MCInst &MCI);
 
@@ -69,6 +95,10 @@ public:
     FirstReturnInst = true;
   }
 
+  /// Set a callback which is invoked to retrieve a recycled mca::Instruction
+  /// or null if there isn't any.
+ void setInstRecycleCallback(InstRecycleCallback CB) { InstRecycleCB = CB; } + Expected> createInstruction(const MCInst &MCI); }; } // namespace mca diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index 33e3c8a2e630..86f2d7ade161 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -472,17 +472,15 @@ struct InstrDesc { // subtarget when computing the reciprocal throughput. unsigned SchedClassID; - unsigned MayLoad : 1; - unsigned MayStore : 1; - unsigned HasSideEffects : 1; - unsigned BeginGroup : 1; - unsigned EndGroup : 1; - unsigned RetireOOO : 1; - // True if all buffered resources are in-order, and there is at least one // buffer which is a dispatch hazard (BufferSize = 0). unsigned MustIssueImmediately : 1; + // True if the corresponding mca::Instruction can be recycled. Currently only + // instructions that are neither variadic nor have any variant can be + // recycled. + unsigned IsRecyclable : 1; + // A zero latency instruction doesn't consume any scheduler resources. bool isZeroLatency() const { return !MaxLatency && Resources.empty(); } @@ -518,8 +516,16 @@ class InstructionBase { unsigned Opcode; // Flags used by the LSUnit. - bool IsALoadBarrier; - bool IsAStoreBarrier; + bool IsALoadBarrier : 1; + bool IsAStoreBarrier : 1; + // Flags copied from the InstrDesc and potentially modified by + // CustomBehaviour or (more likely) InstrPostProcess. + bool MayLoad : 1; + bool MayStore : 1; + bool HasSideEffects : 1; + bool BeginGroup : 1; + bool EndGroup : 1; + bool RetireOOO : 1; public: InstructionBase(const InstrDesc &D, const unsigned Opcode) @@ -568,7 +574,23 @@ public: // Returns true if this instruction is a candidate for move elimination. bool isOptimizableMove() const { return IsOptimizableMove; } void setOptimizableMove() { IsOptimizableMove = true; } - bool isMemOp() const { return Desc.MayLoad || Desc.MayStore; } + void clearOptimizableMove() { IsOptimizableMove = false; } + bool isMemOp() const { return MayLoad || MayStore; } + + // Getters and setters for general instruction flags. + void setMayLoad(bool newVal) { MayLoad = newVal; } + void setMayStore(bool newVal) { MayStore = newVal; } + void setHasSideEffects(bool newVal) { HasSideEffects = newVal; } + void setBeginGroup(bool newVal) { BeginGroup = newVal; } + void setEndGroup(bool newVal) { EndGroup = newVal; } + void setRetireOOO(bool newVal) { RetireOOO = newVal; } + + bool getMayLoad() const { return MayLoad; } + bool getMayStore() const { return MayStore; } + bool getHasSideEffects() const { return HasSideEffects; } + bool getBeginGroup() const { return BeginGroup; } + bool getEndGroup() const { return EndGroup; } + bool getRetireOOO() const { return RetireOOO; } }; /// An instruction propagated through the simulated instruction pipeline. 
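Taken together, these hooks form the new recycling loop: IncrementalSourceMgr::setOnInstFreedCallback announces instructions that are no longer needed, setInstRecycleCallback lets the builder ask for one of them before allocating, and RecycledInstErr carries the reused object back to the driver. A minimal sketch of the wiring, assuming the stripped function_ref signatures are void(Instruction *) and Instruction *(const InstrDesc &) as the surrounding comments suggest:

#include "llvm/MCA/IncrementalSourceMgr.h"
#include "llvm/MCA/InstrBuilder.h"
#include <deque>

using namespace llvm;
using namespace llvm::mca;

// Pool of retired instructions available for reuse. File-scope because both
// callbacks are stored as non-owning function_refs, so the callables and
// their state must outlive the source manager and the builder.
static std::deque<Instruction *> FreePool;

void wireRecycling(IncrementalSourceMgr &SM, InstrBuilder &IB) {
  // Retired instructions land in the pool instead of being thrown away.
  static auto OnInstFreed = [](Instruction *I) { FreePool.push_back(I); };
  SM.setOnInstFreedCallback(OnInstFreed);

  // Before allocating, the builder asks for a pooled instruction; returning
  // nullptr makes it fall back to a normal allocation.
  static auto GetRecycled = [](const InstrDesc &) -> Instruction * {
    if (FreePool.empty())
      return nullptr;
    Instruction *I = FreePool.front();
    FreePool.pop_front();
    return I;
  };
  IB.setInstRecycleCallback(GetRecycled);
}

// When createInstruction() reuses a pooled object, it signals this with a
// RecycledInstErr; the driver is expected to handle that error, take
// getInst(), and feed the instruction back via addRecycledInst().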
@@ -628,6 +650,8 @@ public:
         UsedBuffers(D.UsedBuffers), CriticalRegDep(), CriticalMemDep(),
         CriticalResourceMask(0), IsEliminated(false) {}
 
+  void reset();
+
   unsigned getRCUTokenID() const { return RCUTokenID; }
   unsigned getLSUTokenID() const { return LSUTokenID; }
   void setLSUTokenID(unsigned LSUTok) { LSUTokenID = LSUTok; }
@@ -657,6 +681,7 @@ public:
   bool updateDispatched();
   bool updatePending();
 
+  bool isInvalid() const { return Stage == IS_INVALID; }
   bool isDispatched() const { return Stage == IS_DISPATCHED; }
   bool isPending() const { return Stage == IS_PENDING; }
   bool isReady() const { return Stage == IS_READY; }
diff --git a/llvm/include/llvm/MCA/Pipeline.h b/llvm/include/llvm/MCA/Pipeline.h
index 0ac988c52dc1..92c3836124ad 100644
--- a/llvm/include/llvm/MCA/Pipeline.h
+++ b/llvm/include/llvm/MCA/Pipeline.h
@@ -51,6 +51,13 @@ class Pipeline {
   Pipeline(const Pipeline &P) = delete;
   Pipeline &operator=(const Pipeline &P) = delete;
 
+  enum class State {
+    Created, // Pipeline was just created. The default state.
+    Started, // Pipeline has started running.
+    Paused   // Pipeline is paused.
+  };
+  State CurrentState;
+
   /// An ordered list of stages that define this instruction pipeline.
   SmallVector<std::unique_ptr<Stage>, 8> Stages;
   std::set<HWEventListener *> Listeners;
@@ -62,13 +69,16 @@ class Pipeline {
   void notifyCycleEnd();
 
 public:
-  Pipeline() : Cycles(0) {}
+  Pipeline() : CurrentState(State::Created), Cycles(0) {}
   void appendStage(std::unique_ptr<Stage> S);
 
   /// Returns the total number of simulated cycles.
   Expected<unsigned> run();
 
   void addEventListener(HWEventListener *Listener);
+
+  /// Returns whether the pipeline is currently paused.
+  bool isPaused() const { return CurrentState == State::Paused; }
 };
 } // namespace mca
 } // namespace llvm
diff --git a/llvm/include/llvm/MCA/SourceMgr.h b/llvm/include/llvm/MCA/SourceMgr.h
index e844171bdcab..16a60d1116ad 100644
--- a/llvm/include/llvm/MCA/SourceMgr.h
+++ b/llvm/include/llvm/MCA/SourceMgr.h
@@ -6,9 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 /// \file
-/// This file implements class SourceMgr. Class SourceMgr abstracts the input
-/// code sequence (a sequence of MCInst), and assings unique identifiers to
-/// every instruction in the sequence.
+/// This file contains abstract class SourceMgr and the default implementation,
+/// CircularSourceMgr.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -25,30 +24,62 @@ namespace mca {
 // prevent compiler error C2139 about intrinsic type trait '__is_assignable'.
 typedef std::pair<unsigned, const Instruction &> SourceRef;
 
-class SourceMgr {
+/// Abstracting the input code sequence (a sequence of MCInst) and assigning
+/// unique identifiers to every instruction in the sequence.
+struct SourceMgr {
   using UniqueInst = std::unique_ptr<Instruction>;
+
+  /// Provides a fixed range of \a UniqueInst to iterate.
+  virtual ArrayRef<UniqueInst> getInstructions() const = 0;
+
+  /// (Fixed) Number of \a UniqueInst. Returns the size of
+  /// \a getInstructions by default.
+  virtual size_t size() const { return getInstructions().size(); }
+
+  /// Whether there is any \a SourceRef to inspect / peek next.
+  /// Note that returning false from this doesn't mean the instruction
+  /// stream has ended.
+  virtual bool hasNext() const = 0;
+
+  /// Whether the instruction stream has ended.
+  virtual bool isEnd() const = 0;
+
+  /// The next \a SourceRef.
+  virtual SourceRef peekNext() const = 0;
+
+  /// Advance to the next \a SourceRef.
+  virtual void updateNext() = 0;
+
+  virtual ~SourceMgr() {}
+};
+
+/// The default implementation of \a SourceMgr. It always takes a fixed number
+/// of instructions and provides an option to loop the given sequence for a
+/// certain number of iterations.
+class CircularSourceMgr : public SourceMgr {
   ArrayRef<UniqueInst> Sequence;
   unsigned Current;
   const unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
-  SourceMgr(ArrayRef<UniqueInst> S, unsigned Iter)
-      : Sequence(S), Current(0), Iterations(Iter ? Iter : DefaultIterations) {}
+  CircularSourceMgr(ArrayRef<UniqueInst> S, unsigned Iter)
+      : Sequence(S), Current(0U), Iterations(Iter ? Iter : DefaultIterations) {}
+
+  ArrayRef<UniqueInst> getInstructions() const override { return Sequence; }
 
   unsigned getNumIterations() const { return Iterations; }
-  unsigned size() const { return Sequence.size(); }
-  bool hasNext() const { return Current < (Iterations * Sequence.size()); }
-  void updateNext() { ++Current; }
+  bool hasNext() const override {
+    return Current < (Iterations * Sequence.size());
+  }
+  bool isEnd() const override { return !hasNext(); }
 
-  SourceRef peekNext() const {
+  SourceRef peekNext() const override {
     assert(hasNext() && "Already at end of sequence!");
     return SourceRef(Current, *Sequence[Current % Sequence.size()]);
   }
 
-  using const_iterator = ArrayRef<UniqueInst>::const_iterator;
-  const_iterator begin() const { return Sequence.begin(); }
-  const_iterator end() const { return Sequence.end(); }
+  void updateNext() override { ++Current; }
 };
 
 } // namespace mca
diff --git a/llvm/include/llvm/MCA/Stages/EntryStage.h b/llvm/include/llvm/MCA/Stages/EntryStage.h
index 4c50838bef4b..fb1244aa1933 100644
--- a/llvm/include/llvm/MCA/Stages/EntryStage.h
+++ b/llvm/include/llvm/MCA/Stages/EntryStage.h
@@ -30,7 +30,7 @@ class EntryStage final : public Stage {
   unsigned NumRetired;
 
   // Updates the program counter, and sets 'CurrentInstruction'.
-  void getNextInstruction();
+  Error getNextInstruction();
 
   EntryStage(const EntryStage &Other) = delete;
   EntryStage &operator=(const EntryStage &Other) = delete;
@@ -42,6 +42,7 @@ public:
   bool hasWorkToComplete() const override;
   Error execute(InstRef &IR) override;
   Error cycleStart() override;
+  Error cycleResume() override;
   Error cycleEnd() override;
 };
 
diff --git a/llvm/include/llvm/MCA/Stages/Stage.h b/llvm/include/llvm/MCA/Stages/Stage.h
index 84868e89ac29..2477b9b3d69c 100644
--- a/llvm/include/llvm/MCA/Stages/Stage.h
+++ b/llvm/include/llvm/MCA/Stages/Stage.h
@@ -48,6 +48,9 @@ public:
   /// phase to prepare for the executions during the cycle.
   virtual Error cycleStart() { return ErrorSuccess(); }
 
+  /// Called after the pipeline is resumed from a paused state.
+  virtual Error cycleResume() { return ErrorSuccess(); }
+
   /// Called once at the end of each cycle.
   virtual Error cycleEnd() { return ErrorSuccess(); }
 
@@ -82,6 +85,16 @@ public:
   }
 };
 
+/// This is actually not an error but a marker to indicate that
+/// the instruction stream is paused.
+struct InstStreamPause : public ErrorInfo<InstStreamPause> {
+  static char ID;
+
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+  void log(raw_ostream &OS) const override { OS << "Stream is paused"; }
+};
 } // namespace mca
 } // namespace llvm
 #endif // LLVM_MCA_STAGES_STAGE_H
diff --git a/llvm/include/llvm/ObjCopy/COFF/COFFConfig.h b/llvm/include/llvm/ObjCopy/COFF/COFFConfig.h
new file mode 100644
index 000000000000..29d56d75698b
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/COFF/COFFConfig.h
@@ -0,0 +1,27 @@
+//===- COFFConfig.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_COFF_COFFCONFIG_H
+#define LLVM_OBJCOPY_COFF_COFFCONFIG_H
+
+#include "llvm/ADT/Optional.h"
+
+namespace llvm {
+namespace objcopy {
+
+// COFF specific configuration for copying/stripping a single file.
+struct COFFConfig {
+  Optional Subsystem;
+  Optional MajorSubsystemVersion;
+  Optional MinorSubsystemVersion;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_COFF_COFFCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h
new file mode 100644
index 000000000000..d9043d6c5d01
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h
@@ -0,0 +1,36 @@
+//===- COFFObjcopy.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_COFF_COFFOBJCOPY_H
+#define LLVM_OBJCOPY_COFF_COFFOBJCOPY_H
+
+namespace llvm {
+class Error;
+class raw_ostream;
+
+namespace object {
+class COFFObjectFile;
+} // end namespace object
+
+namespace objcopy {
+struct CommonConfig;
+struct COFFConfig;
+
+namespace coff {
+
+/// Apply the transformations described by \p Config and \p COFFConfig
+/// to \p In and write the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const CommonConfig &Config, const COFFConfig &,
+                             object::COFFObjectFile &In, raw_ostream &Out);
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_OBJCOPY_COFF_COFFOBJCOPY_H
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
new file mode 100644
index 000000000000..24503caed342
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -0,0 +1,271 @@
+//===- CommonConfig.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJCOPY_COMMONCONFIG_H +#define LLVM_OBJCOPY_COMMONCONFIG_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/CachedHashString.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Support/GlobPattern.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +// Necessary for llvm::DebugCompressionType::None +#include "llvm/Target/TargetOptions.h" +#include + +namespace llvm { +namespace objcopy { + +enum class FileFormat { + Unspecified, + ELF, + Binary, + IHex, +}; + +// This type keeps track of the machine info for various architectures. This +// lets us map architecture names to ELF types and the e_machine value of the +// ELF file. +struct MachineInfo { + MachineInfo(uint16_t EM, uint8_t ABI, bool Is64, bool IsLittle) + : EMachine(EM), OSABI(ABI), Is64Bit(Is64), IsLittleEndian(IsLittle) {} + // Alternative constructor that defaults to NONE for OSABI. + MachineInfo(uint16_t EM, bool Is64, bool IsLittle) + : MachineInfo(EM, ELF::ELFOSABI_NONE, Is64, IsLittle) {} + // Default constructor for unset fields. + MachineInfo() : MachineInfo(0, 0, false, false) {} + uint16_t EMachine; + uint8_t OSABI; + bool Is64Bit; + bool IsLittleEndian; +}; + +// Flags set by --set-section-flags or --rename-section. Interpretation of these +// is format-specific and not all flags are meaningful for all object file +// formats. This is a bitmask; many section flags may be set. +enum SectionFlag { + SecNone = 0, + SecAlloc = 1 << 0, + SecLoad = 1 << 1, + SecNoload = 1 << 2, + SecReadonly = 1 << 3, + SecDebug = 1 << 4, + SecCode = 1 << 5, + SecData = 1 << 6, + SecRom = 1 << 7, + SecMerge = 1 << 8, + SecStrings = 1 << 9, + SecContents = 1 << 10, + SecShare = 1 << 11, + SecExclude = 1 << 12, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/SecExclude) +}; + +struct SectionRename { + StringRef OriginalName; + StringRef NewName; + Optional NewFlags; +}; + +struct SectionFlagsUpdate { + StringRef Name; + SectionFlag NewFlags; +}; + +enum class DiscardType { + None, // Default + All, // --discard-all (-x) + Locals, // --discard-locals (-X) +}; + +enum class MatchStyle { + Literal, // Default for symbols. + Wildcard, // Default for sections, or enabled with --wildcard (-w). + Regex, // Enabled with --regex. +}; + +class NameOrPattern { + StringRef Name; + // Regex is shared between multiple CommonConfig instances. + std::shared_ptr R; + std::shared_ptr G; + bool IsPositiveMatch = true; + + NameOrPattern(StringRef N) : Name(N) {} + NameOrPattern(std::shared_ptr R) : R(R) {} + NameOrPattern(std::shared_ptr G, bool IsPositiveMatch) + : G(G), IsPositiveMatch(IsPositiveMatch) {} + +public: + // ErrorCallback is used to handle recoverable errors. An Error returned + // by the callback aborts the parsing and is then returned by this function. + static Expected + create(StringRef Pattern, MatchStyle MS, + llvm::function_ref ErrorCallback); + + bool isPositiveMatch() const { return IsPositiveMatch; } + Optional getName() const { + if (!R && !G) + return Name; + return None; + } + bool operator==(StringRef S) const { + return R ? R->match(S) : G ? 
G->match(S) : Name == S; + } + bool operator!=(StringRef S) const { return !operator==(S); } +}; + +// Matcher that checks symbol or section names against the command line flags +// provided for that option. +class NameMatcher { + DenseSet PosNames; + std::vector PosPatterns; + std::vector NegMatchers; + +public: + Error addMatcher(Expected Matcher) { + if (!Matcher) + return Matcher.takeError(); + if (Matcher->isPositiveMatch()) { + if (Optional MaybeName = Matcher->getName()) + PosNames.insert(CachedHashStringRef(*MaybeName)); + else + PosPatterns.push_back(std::move(*Matcher)); + } else { + NegMatchers.push_back(std::move(*Matcher)); + } + return Error::success(); + } + bool matches(StringRef S) const { + return (PosNames.contains(CachedHashStringRef(S)) || + is_contained(PosPatterns, S)) && + !is_contained(NegMatchers, S); + } + bool empty() const { + return PosNames.empty() && PosPatterns.empty() && NegMatchers.empty(); + } +}; + +enum class SymbolFlag { + Global, + Local, + Weak, + Default, + Hidden, + Protected, + File, + Section, + Object, + Function, + IndirectFunction, + Debug, + Constructor, + Warning, + Indirect, + Synthetic, + UniqueObject, +}; + +// Symbol info specified by --add-symbol option. Symbol flags not supported +// by a concrete format should be ignored. +struct NewSymbolInfo { + StringRef SymbolName; + StringRef SectionName; + uint64_t Value = 0; + std::vector Flags; + std::vector BeforeSyms; +}; + +// Specify section name and section body for newly added or updated section. +struct NewSectionInfo { + NewSectionInfo() = default; + NewSectionInfo(StringRef Name, std::unique_ptr &&Buffer) + : SectionName(Name), SectionData(std::move(Buffer)) {} + + StringRef SectionName; + std::shared_ptr SectionData; +}; + +// Configuration for copying/stripping a single file. +struct CommonConfig { + // Main input/output options + StringRef InputFilename; + FileFormat InputFormat = FileFormat::Unspecified; + StringRef OutputFilename; + FileFormat OutputFormat = FileFormat::Unspecified; + + // Only applicable when --output-format!=binary (e.g. elf64-x86-64). + Optional OutputArch; + + // Advanced options + StringRef AddGnuDebugLink; + // Cached gnu_debuglink's target CRC + uint32_t GnuDebugLinkCRC32; + Optional ExtractPartition; + StringRef SplitDWO; + StringRef SymbolsPrefix; + StringRef AllocSectionsPrefix; + DiscardType DiscardMode = DiscardType::None; + + // Repeated options + std::vector AddSection; + std::vector DumpSection; + std::vector UpdateSection; + + // Section matchers + NameMatcher KeepSection; + NameMatcher OnlySection; + NameMatcher ToRemove; + + // Symbol matchers + NameMatcher SymbolsToGlobalize; + NameMatcher SymbolsToKeep; + NameMatcher SymbolsToLocalize; + NameMatcher SymbolsToRemove; + NameMatcher UnneededSymbolsToRemove; + NameMatcher SymbolsToWeaken; + NameMatcher SymbolsToKeepGlobal; + + // Map options + StringMap SectionsToRename; + StringMap SetSectionAlignment; + StringMap SetSectionFlags; + StringMap SymbolsToRename; + + // Symbol info specified by --add-symbol option. 
+
+enum class SymbolFlag {
+  Global,
+  Local,
+  Weak,
+  Default,
+  Hidden,
+  Protected,
+  File,
+  Section,
+  Object,
+  Function,
+  IndirectFunction,
+  Debug,
+  Constructor,
+  Warning,
+  Indirect,
+  Synthetic,
+  UniqueObject,
+};
+
+// Symbol info specified by --add-symbol option. Symbol flags not supported
+// by a concrete format should be ignored.
+struct NewSymbolInfo {
+  StringRef SymbolName;
+  StringRef SectionName;
+  uint64_t Value = 0;
+  std::vector<SymbolFlag> Flags;
+  std::vector<StringRef> BeforeSyms;
+};
+
+// Specify section name and section body for newly added or updated section.
+struct NewSectionInfo {
+  NewSectionInfo() = default;
+  NewSectionInfo(StringRef Name, std::unique_ptr<MemoryBuffer> &&Buffer)
+      : SectionName(Name), SectionData(std::move(Buffer)) {}
+
+  StringRef SectionName;
+  std::shared_ptr<MemoryBuffer> SectionData;
+};
+
+// Configuration for copying/stripping a single file.
+struct CommonConfig {
+  // Main input/output options
+  StringRef InputFilename;
+  FileFormat InputFormat = FileFormat::Unspecified;
+  StringRef OutputFilename;
+  FileFormat OutputFormat = FileFormat::Unspecified;
+
+  // Only applicable when --output-format!=binary (e.g. elf64-x86-64).
+  Optional<MachineInfo> OutputArch;
+
+  // Advanced options
+  StringRef AddGnuDebugLink;
+  // Cached gnu_debuglink's target CRC
+  uint32_t GnuDebugLinkCRC32;
+  Optional<StringRef> ExtractPartition;
+  StringRef SplitDWO;
+  StringRef SymbolsPrefix;
+  StringRef AllocSectionsPrefix;
+  DiscardType DiscardMode = DiscardType::None;
+
+  // Repeated options
+  std::vector<NewSectionInfo> AddSection;
+  std::vector<StringRef> DumpSection;
+  std::vector<NewSectionInfo> UpdateSection;
+
+  // Section matchers
+  NameMatcher KeepSection;
+  NameMatcher OnlySection;
+  NameMatcher ToRemove;
+
+  // Symbol matchers
+  NameMatcher SymbolsToGlobalize;
+  NameMatcher SymbolsToKeep;
+  NameMatcher SymbolsToLocalize;
+  NameMatcher SymbolsToRemove;
+  NameMatcher UnneededSymbolsToRemove;
+  NameMatcher SymbolsToWeaken;
+  NameMatcher SymbolsToKeepGlobal;
+
+  // Map options
+  StringMap<SectionRename> SectionsToRename;
+  StringMap<uint64_t> SetSectionAlignment;
+  StringMap<SectionFlagsUpdate> SetSectionFlags;
+  StringMap<StringRef> SymbolsToRename;
+
+  // Symbol info specified by --add-symbol option.
+  std::vector<NewSymbolInfo> SymbolsToAdd;
+
+  // Boolean options
+  bool DeterministicArchives = true;
+  bool ExtractDWO = false;
+  bool ExtractMainPartition = false;
+  bool OnlyKeepDebug = false;
+  bool PreserveDates = false;
+  bool StripAll = false;
+  bool StripAllGNU = false;
+  bool StripDWO = false;
+  bool StripDebug = false;
+  bool StripNonAlloc = false;
+  bool StripSections = false;
+  bool StripUnneeded = false;
+  bool Weaken = false;
+  bool DecompressDebugSections = false;
+
+  DebugCompressionType CompressionType = DebugCompressionType::None;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_COMMONCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/ConfigManager.h b/llvm/include/llvm/ObjCopy/ConfigManager.h
new file mode 100644
index 000000000000..2962cf99b270
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/ConfigManager.h
@@ -0,0 +1,50 @@
+//===- ConfigManager.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_CONFIGMANAGER_H
+#define LLVM_OBJCOPY_CONFIGMANAGER_H
+
+#include "llvm/ObjCopy/COFF/COFFConfig.h"
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/ObjCopy/ELF/ELFConfig.h"
+#include "llvm/ObjCopy/MachO/MachOConfig.h"
+#include "llvm/ObjCopy/MultiFormatConfig.h"
+#include "llvm/ObjCopy/wasm/WasmConfig.h"
+#include "llvm/ObjCopy/XCOFF/XCOFFConfig.h"
+
+namespace llvm {
+namespace objcopy {
+
+struct ConfigManager : public MultiFormatConfig {
+  virtual ~ConfigManager() {}
+
+  const CommonConfig &getCommonConfig() const override { return Common; }
+
+  Expected<const ELFConfig &> getELFConfig() const override { return ELF; }
+
+  Expected<const COFFConfig &> getCOFFConfig() const override;
+
+  Expected<const MachOConfig &> getMachOConfig() const override;
+
+  Expected<const WasmConfig &> getWasmConfig() const override;
+
+  Expected<const XCOFFConfig &> getXCOFFConfig() const override;
+
+  // All configs.
+  CommonConfig Common;
+  ELFConfig ELF;
+  COFFConfig COFF;
+  MachOConfig MachO;
+  WasmConfig Wasm;
+  XCOFFConfig XCOFF;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_CONFIGMANAGER_H
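ConfigManager is the in-tree implementation of MultiFormatConfig: Common holds the format-agnostic options and the per-format configs sit alongside it, with the out-of-line getters presumably rejecting option combinations the requested format cannot honor. A hedged sketch of driver-style use (option values are made up):

  ConfigManager CM;
  CM.Common.InputFilename = "in.o";
  CM.Common.OutputFilename = "out.o";
  CM.Common.StripDebug = true;
  CM.ELF.LocalizeHidden = true; // ELF-only knob
  if (Expected<const ELFConfig &> E = CM.getELFConfig())
    (void)*E; // use the validated ELF config
  else
    consumeError(E.takeError());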
diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
new file mode 100644
index 000000000000..52bc728e36ff
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
@@ -0,0 +1,38 @@
+//===- ELFConfig.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_ELF_ELFCONFIG_H
+#define LLVM_OBJCOPY_ELF_ELFCONFIG_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/ELFTypes.h"
+#include <functional>
+
+namespace llvm {
+namespace objcopy {
+
+// ELF specific configuration for copying/stripping a single file.
+struct ELFConfig {
+  uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT;
+
+  // ELF entry point address expression. The input parameter is an entry point
+  // address in the input ELF file. The entry address in the output file is
+  // calculated with EntryExpr(input_address), when either --set-start or
+  // --change-start is used.
+  std::function<uint64_t(uint64_t)> EntryExpr;
+
+  bool AllowBrokenLinks = false;
+  bool KeepFileSymbols = false;
+  bool LocalizeHidden = false;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_ELF_ELFCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h
new file mode 100644
index 000000000000..552b6fb655f1
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h
@@ -0,0 +1,53 @@
+//===- ELFObjcopy.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_ELF_ELFOBJCOPY_H
+#define LLVM_OBJCOPY_ELF_ELFOBJCOPY_H
+
+namespace llvm {
+class Error;
+class MemoryBuffer;
+class raw_ostream;
+
+namespace object {
+class ELFObjectFileBase;
+} // end namespace object
+
+namespace objcopy {
+struct CommonConfig;
+struct ELFConfig;
+
+namespace elf {
+/// Apply the transformations described by \p Config and \p ELFConfig to
+/// \p In, which must represent an IHex file, and writes the result
+/// into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnIHex(const CommonConfig &Config,
+                           const ELFConfig &ELFConfig, MemoryBuffer &In,
+                           raw_ostream &Out);
+
+/// Apply the transformations described by \p Config and \p ELFConfig to
+/// \p In, which is treated as a raw binary input, and writes the result
+/// into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnRawBinary(const CommonConfig &Config,
+                                const ELFConfig &ELFConfig, MemoryBuffer &In,
+                                raw_ostream &Out);
+
+/// Apply the transformations described by \p Config and \p ELFConfig to
+/// \p In and writes the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const CommonConfig &Config,
+                             const ELFConfig &ELFConfig,
+                             object::ELFObjectFileBase &In, raw_ostream &Out);
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_OBJCOPY_ELF_ELFOBJCOPY_H
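EntryExpr folds both entry-point flags into one hook: --set-start installs a constant function, while --change-start wraps an adjustment around the input address. A small sketch, with made-up values:

  ELFConfig ECfg;
  // --set-start 0x400000: ignore the old entry point entirely.
  ECfg.EntryExpr = [](uint64_t) { return UINT64_C(0x400000); };
  // --change-start 0x1000: bias the existing entry point instead.
  ECfg.EntryExpr = [](uint64_t A) { return A + 0x1000; };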
diff --git a/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
new file mode 100644
index 000000000000..c5f861363297
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
@@ -0,0 +1,46 @@
+//===- MachOConfig.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_MACHO_MACHOCONFIG_H
+#define LLVM_OBJCOPY_MACHO_MACHOCONFIG_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+
+// Mach-O specific configuration for copying/stripping a single file.
+struct MachOConfig {
+  // Repeated options
+  std::vector<StringRef> RPathToAdd;
+  std::vector<StringRef> RPathToPrepend;
+  DenseMap<StringRef, StringRef> RPathsToUpdate;
+  DenseMap<StringRef, StringRef> InstallNamesToUpdate;
+  DenseSet<StringRef> RPathsToRemove;
+
+  // install-name-tool's id option
+  Optional<StringRef> SharedLibId;
+
+  // Segments to remove if they are empty
+  DenseSet<StringRef> EmptySegmentsToRemove;
+
+  // Boolean options
+  bool StripSwiftSymbols = false;
+  bool KeepUndefined = false;
+
+  // install-name-tool's --delete_all_rpaths
+  bool RemoveAllRpaths = false;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_MACHO_MACHOCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h
new file mode 100644
index 000000000000..73690d7ace8a
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h
@@ -0,0 +1,45 @@
+//===- MachOObjcopy.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H
+#define LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H
+
+namespace llvm {
+class Error;
+class raw_ostream;
+
+namespace object {
+class MachOObjectFile;
+class MachOUniversalBinary;
+} // end namespace object
+
+namespace objcopy {
+struct CommonConfig;
+struct MachOConfig;
+class MultiFormatConfig;
+
+namespace macho {
+/// Apply the transformations described by \p Config and \p MachOConfig to
+/// \p In and writes the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const CommonConfig &Config,
+                             const MachOConfig &MachOConfig,
+                             object::MachOObjectFile &In, raw_ostream &Out);
+
+/// Apply the transformations described by \p Config and \p MachOConfig to
+/// \p In and writes the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnMachOUniversalBinary(
+    const MultiFormatConfig &Config, const object::MachOUniversalBinary &In,
+    raw_ostream &Out);
+
+} // end namespace macho
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H
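The MachOConfig members correspond closely to install_name_tool options, so a driver mostly copies parsed arguments into them. A hedged sketch with invented paths:

  MachOConfig MC;
  MC.RPathToAdd.push_back("@loader_path/../lib");         // -add_rpath
  MC.RPathsToUpdate.insert({"/old/rpath", "/new/rpath"}); // -rpath OLD NEW
  MC.RPathsToRemove.insert("/unwanted/rpath");            // -delete_rpath
  MC.SharedLibId = "@rpath/libfoo.dylib";                 // -id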
diff --git a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h
new file mode 100644
index 000000000000..180f2f82a908
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h
@@ -0,0 +1,39 @@
+//===- MultiFormatConfig.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_MULTIFORMATCONFIG_H
+#define LLVM_OBJCOPY_MULTIFORMATCONFIG_H
+
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace objcopy {
+
+struct CommonConfig;
+struct ELFConfig;
+struct COFFConfig;
+struct MachOConfig;
+struct WasmConfig;
+struct XCOFFConfig;
+
+class MultiFormatConfig {
+public:
+  virtual ~MultiFormatConfig() {}
+
+  virtual const CommonConfig &getCommonConfig() const = 0;
+  virtual Expected<const ELFConfig &> getELFConfig() const = 0;
+  virtual Expected<const COFFConfig &> getCOFFConfig() const = 0;
+  virtual Expected<const MachOConfig &> getMachOConfig() const = 0;
+  virtual Expected<const WasmConfig &> getWasmConfig() const = 0;
+  virtual Expected<const XCOFFConfig &> getXCOFFConfig() const = 0;
+};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_MULTIFORMATCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/ObjCopy.h b/llvm/include/llvm/ObjCopy/ObjCopy.h
new file mode 100644
index 000000000000..023814002c72
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/ObjCopy.h
@@ -0,0 +1,42 @@
+//===- ObjCopy.h ------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_OBJCOPY_H
+#define LLVM_OBJCOPY_OBJCOPY_H
+
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+class raw_ostream;
+
+namespace object {
+class Archive;
+class Binary;
+} // end namespace object
+
+namespace objcopy {
+class MultiFormatConfig;
+
+/// Applies the transformations described by \p Config to
+/// each member in archive \p Ar.
+/// Writes a result in a file specified by \p Config.OutputFilename.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnArchive(const MultiFormatConfig &Config,
+                              const object::Archive &Ar);
+
+/// Applies the transformations described by \p Config to \p In and writes
+/// the result into \p Out. This function does the dispatch based on the
+/// format of the input binary (COFF, ELF, MachO or wasm).
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const MultiFormatConfig &Config,
+                             object::Binary &In, raw_ostream &Out);
+
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_OBJCOPY_OBJCOPY_H
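Because executeObjcopyOnBinary dispatches on the dynamic type of \p In, a caller only needs to open the file and hand over a MultiFormatConfig. A hedged sketch, assuming the llvm and llvm::objcopy namespaces are in scope and the file names are invented:

  ConfigManager CM;
  CM.Common.StripAll = true;
  Expected<object::OwningBinary<object::Binary>> BinOrErr =
      object::createBinary("input.o");
  if (!BinOrErr)
    return BinOrErr.takeError();
  std::error_code EC;
  raw_fd_ostream Out("output.o", EC);
  if (EC)
    return errorCodeToError(EC);
  return executeObjcopyOnBinary(CM, *BinOrErr->getBinary(), Out);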
diff --git a/llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h b/llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h
new file mode 100644
index 000000000000..adaeedc82b73
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h
@@ -0,0 +1,21 @@
+//===- XCOFFConfig.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_XCOFF_XCOFFCONFIG_H
+#define LLVM_OBJCOPY_XCOFF_XCOFFCONFIG_H
+
+namespace llvm {
+namespace objcopy {
+
+// XCOFF specific configuration for copying/stripping a single file.
+struct XCOFFConfig {};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_XCOFF_XCOFFCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h b/llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h
new file mode 100644
index 000000000000..9fc85cb39fa5
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h
@@ -0,0 +1,35 @@
+//===- XCOFFObjcopy.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_XCOFF_XCOFFOBJCOPY_H
+#define LLVM_OBJCOPY_XCOFF_XCOFFOBJCOPY_H
+
+namespace llvm {
+class Error;
+class raw_ostream;
+
+namespace object {
+class XCOFFObjectFile;
+} // end namespace object
+
+namespace objcopy {
+struct CommonConfig;
+struct XCOFFConfig;
+
+namespace xcoff {
+/// Apply the transformations described by \p Config and \p XCOFFConfig
+/// to \p In and writes the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const CommonConfig &Config, const XCOFFConfig &,
+                             object::XCOFFObjectFile &In, raw_ostream &Out);
+
+} // end namespace xcoff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_OBJCOPY_XCOFF_XCOFFOBJCOPY_H
diff --git a/llvm/include/llvm/ObjCopy/wasm/WasmConfig.h b/llvm/include/llvm/ObjCopy/wasm/WasmConfig.h
new file mode 100644
index 000000000000..56a7055da9a7
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/wasm/WasmConfig.h
@@ -0,0 +1,21 @@
+//===- WasmConfig.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_WASM_WASMCONFIG_H
+#define LLVM_OBJCOPY_WASM_WASMCONFIG_H
+
+namespace llvm {
+namespace objcopy {
+
+// Wasm specific configuration for copying/stripping a single file.
+struct WasmConfig {};
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif // LLVM_OBJCOPY_WASM_WASMCONFIG_H
diff --git a/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h
new file mode 100644
index 000000000000..5b4181c22b97
--- /dev/null
+++ b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h
@@ -0,0 +1,35 @@
+//===- WasmObjcopy.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJCOPY_WASM_WASMOBJCOPY_H
+#define LLVM_OBJCOPY_WASM_WASMOBJCOPY_H
+
+namespace llvm {
+class Error;
+class raw_ostream;
+
+namespace object {
+class WasmObjectFile;
+} // end namespace object
+
+namespace objcopy {
+struct CommonConfig;
+struct WasmConfig;
+
+namespace wasm {
+/// Apply the transformations described by \p Config and \p WasmConfig
+/// to \p In and writes the result into \p Out.
+/// \returns any Error encountered whilst performing the operation.
+Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, + object::WasmObjectFile &In, raw_ostream &Out); + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_OBJCOPY_WASM_WASMOBJCOPY_H diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h index b792cbc3d9ac..a36c9bd6163b 100644 --- a/llvm/include/llvm/Object/Archive.h +++ b/llvm/include/llvm/Object/Archive.h @@ -13,7 +13,6 @@ #ifndef LLVM_OBJECT_ARCHIVE_H #define LLVM_OBJECT_ARCHIVE_H -#include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/fallible_iterator.h" #include "llvm/ADT/iterator_range.h" @@ -22,7 +21,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include #include #include #include @@ -30,6 +28,9 @@ #include namespace llvm { + +template class Optional; + namespace object { const char ArchiveMagic[] = "!\n"; @@ -339,6 +340,7 @@ public: Kind kind() const { return (Kind)Format; } bool isThin() const { return IsThin; } + static object::Archive::Kind getDefaultKindForHost(); child_iterator child_begin(Error &Err, bool SkipInternal = true) const; child_iterator child_end() const; @@ -358,7 +360,7 @@ public: // check if a symbol is in the archive Expected> findSym(StringRef name) const; - bool isEmpty() const; + virtual bool isEmpty() const; bool hasSymbolTable() const; StringRef getSymbolTable() const { return SymbolTable; } StringRef getStringTable() const { return StringTable; } @@ -390,6 +392,7 @@ private: }; class BigArchive : public Archive { +public: /// Fixed-Length Header. struct FixLenHdr { char Magic[sizeof(BigArchiveMagic) - 1]; ///< Big archive magic string. @@ -410,6 +413,9 @@ public: BigArchive(MemoryBufferRef Source, Error &Err); uint64_t getFirstChildOffset() const override { return FirstChildOffset; } uint64_t getLastChildOffset() const { return LastChildOffset; } + bool isEmpty() const override { + return Data.getBufferSize() == sizeof(FixLenHdr); + }; }; } // end namespace object diff --git a/llvm/include/llvm/Object/ArchiveWriter.h b/llvm/include/llvm/Object/ArchiveWriter.h index 7eaf13e8fb22..6acab45215da 100644 --- a/llvm/include/llvm/Object/ArchiveWriter.h +++ b/llvm/include/llvm/Object/ArchiveWriter.h @@ -26,6 +26,11 @@ struct NewArchiveMember { NewArchiveMember() = default; NewArchiveMember(MemoryBufferRef BufRef); + // Detect the archive format from the object or bitcode file. This helps + // assume the archive format when creating or editing archives in the case + // one isn't explicitly set. + object::Archive::Kind detectKindFromObject() const; + static Expected getOldMember(const object::Archive::Child &OldMember, bool Deterministic); diff --git a/llvm/include/llvm/Object/Binary.h b/llvm/include/llvm/Object/Binary.h index a8f4437d5dbb..53b299ae8612 100644 --- a/llvm/include/llvm/Object/Binary.h +++ b/llvm/include/llvm/Object/Binary.h @@ -16,9 +16,9 @@ #include "llvm-c/Types.h" #include "llvm/ADT/Triple.h" #include "llvm/Object/Error.h" +#include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" -#include #include #include @@ -50,6 +50,8 @@ protected: ID_WinRes, // Windows resource (.res) file. + ID_Offload, // Offloading binary file. + // Object and children. 
ID_StartObjects, ID_COFF, @@ -133,6 +135,8 @@ public: bool isWasm() const { return TypeID == ID_Wasm; } + bool isOffloadFile() const { return TypeID == ID_Offload; } + bool isCOFFImportFile() const { return TypeID == ID_COFFImportFile; } diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index 3add3811069b..0b6975b9590f 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -1079,13 +1079,15 @@ public: uint64_t getImageBase() const; Error getVaPtr(uint64_t VA, uintptr_t &Res) const; - Error getRvaPtr(uint32_t Rva, uintptr_t &Res) const; + Error getRvaPtr(uint32_t Rva, uintptr_t &Res, + const char *ErrorContext = nullptr) const; /// Given an RVA base and size, returns a valid array of bytes or an error /// code if the RVA and size is not contained completely within a valid /// section. Error getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size, - ArrayRef &Contents) const; + ArrayRef &Contents, + const char *ErrorContext = nullptr) const; Error getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const; @@ -1296,6 +1298,12 @@ struct FpoData { frame_type getFP() const { return static_cast(Attributes >> 14); } }; +class SectionStrippedError + : public ErrorInfo { +public: + SectionStrippedError() { setErrorCode(object_error::section_stripped); } +}; + } // end namespace object } // end namespace llvm diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h index 0da0d8fa70c9..f8f0e0343b22 100644 --- a/llvm/include/llvm/Object/COFFImportFile.h +++ b/llvm/include/llvm/Object/COFFImportFile.h @@ -18,10 +18,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Object/COFF.h" -#include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/llvm/include/llvm/Object/COFFModuleDefinition.h b/llvm/include/llvm/Object/COFFModuleDefinition.h index fb3d0952e3a3..8e14dd61472d 100644 --- a/llvm/include/llvm/Object/COFFModuleDefinition.h +++ b/llvm/include/llvm/Object/COFFModuleDefinition.h @@ -18,7 +18,7 @@ #ifndef LLVM_OBJECT_COFFMODULEDEFINITION_H #define LLVM_OBJECT_COFFMODULEDEFINITION_H -#include "llvm/Object/COFF.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/COFFImportFile.h" namespace llvm { diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h new file mode 100644 index 000000000000..7aa7d8ecf4c7 --- /dev/null +++ b/llvm/include/llvm/Object/DXContainer.h @@ -0,0 +1,124 @@ +//===- DXContainer.h - DXContainer file implementation ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DXContainerFile class, which implements the ObjectFile +// interface for DXContainer files. 
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_DXCONTAINER_H
+#define LLVM_OBJECT_DXCONTAINER_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBufferRef.h"
+
+namespace llvm {
+namespace object {
+class DXContainer {
+public:
+  using DXILData = std::pair<dxbc::ProgramHeader, const char *>;
+
+private:
+  DXContainer(MemoryBufferRef O);
+
+  MemoryBufferRef Data;
+  dxbc::Header Header;
+  SmallVector<uint32_t, 4> PartOffsets;
+  Optional<DXILData> DXIL;
+
+  Error parseHeader();
+  Error parsePartOffsets();
+  Error parseDXILHeader(uint32_t Offset);
+  friend class PartIterator;
+
+public:
+  // The PartIterator is a wrapper around the iterator for the PartOffsets
+  // member of the DXContainer. It contains a reference to the container, and
+  // the current iterator value, as well as storage for a parsed part header.
+  class PartIterator {
+    const DXContainer &Container;
+    SmallVectorImpl<uint32_t>::const_iterator OffsetIt;
+    struct PartData {
+      dxbc::PartHeader Part;
+      uint32_t Offset;
+      StringRef Data;
+    } IteratorState;
+
+    friend class DXContainer;
+
+    PartIterator(const DXContainer &C,
+                 SmallVectorImpl<uint32_t>::const_iterator It)
+        : Container(C), OffsetIt(It) {
+      if (OffsetIt == Container.PartOffsets.end())
+        updateIteratorImpl(Container.PartOffsets.back());
+      else
+        updateIterator();
+    }
+
+    // Updates the iterator's state data. This results in copying the part
+    // header into the iterator and handling any required byte swapping. This
+    // is called when incrementing or decrementing the iterator.
+    void updateIterator() {
+      if (OffsetIt != Container.PartOffsets.end())
+        updateIteratorImpl(*OffsetIt);
+    }
+
+    // Implementation for updating the iterator state based on a specified
+    // offset.
+    void updateIteratorImpl(const uint32_t Offset);
+
+  public:
+    PartIterator &operator++() {
+      if (OffsetIt == Container.PartOffsets.end())
+        return *this;
+      ++OffsetIt;
+      updateIterator();
+      return *this;
+    }
+
+    PartIterator operator++(int) {
+      PartIterator Tmp = *this;
+      ++(*this);
+      return Tmp;
+    }
+
+    bool operator==(const PartIterator &RHS) const {
+      return OffsetIt == RHS.OffsetIt;
+    }
+
+    bool operator!=(const PartIterator &RHS) const {
+      return OffsetIt != RHS.OffsetIt;
+    }
+
+    const PartData &operator*() { return IteratorState; }
+    const PartData *operator->() { return &IteratorState; }
+  };
+
+  PartIterator begin() const {
+    return PartIterator(*this, PartOffsets.begin());
+  }
+
+  PartIterator end() const { return PartIterator(*this, PartOffsets.end()); }
+
+  StringRef getData() const { return Data.getBuffer(); }
+  static Expected<DXContainer> create(MemoryBufferRef Object);
+
+  const dxbc::Header &getHeader() const { return Header; }
+
+  Optional<DXILData> getDXIL() const { return DXIL; }
+};
+
+} // namespace object
+} // namespace llvm
+
+#endif // LLVM_OBJECT_DXCONTAINER_H
diff --git a/llvm/include/llvm/Object/Decompressor.h b/llvm/include/llvm/Object/Decompressor.h
index cc918481b308..e04ee3c3e4c0 100644
--- a/llvm/include/llvm/Object/Decompressor.h
+++ b/llvm/include/llvm/Object/Decompressor.h
@@ -9,13 +9,15 @@
 #ifndef LLVM_OBJECT_DECOMPRESSOR_H
 #define LLVM_OBJECT_DECOMPRESSOR_H
 
-#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace object {
 
+class SectionRef;
+
 /// Decompressor helps to handle decompression of compressed sections.
 class Decompressor {
 public:
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 37f23c435ae1..1a59ba94098f 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -855,7 +855,7 @@ Expected<StringRef> ELFFile<ELFT>::getSymbolVersionByIndex(
   const VersionEntry &Entry = *VersionMap[VersionIndex];
 
   // A default version (@@) is only available for defined symbols.
-  if (!Entry.IsVerDef || IsSymHidden.getValueOr(false))
+  if (!Entry.IsVerDef || IsSymHidden.value_or(false))
     IsDefault = false;
   else
     IsDefault = !(SymbolVersionIndex & llvm::ELF::VERSYM_HIDDEN);
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index e2d2784d4f23..c449a3dafc0c 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -15,7 +15,6 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/iterator_range.h"
@@ -27,19 +26,21 @@
 #include "llvm/Object/Error.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/SymbolicFile.h"
-#include "llvm/Support/ARMAttributeParser.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/ELFAttributeParser.h"
 #include "llvm/Support/ELFAttributes.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include <cassert>
 #include <cstdint>
-#include <cstring>
 
 namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
 namespace object {
 
 constexpr int NumElfSymbolTypes = 16;
@@ -101,6 +102,12 @@ public:
   /// Returns a vector containing a symbol version for each dynamic symbol.
/// Returns an empty vector if version sections do not exist. Expected> readDynsymVersions() const; + + /// Returns a vector of all BB address maps in the object file. When + // `TextSectionIndex` is specified, only returns the BB address maps + // corresponding to the section with that index. + Expected> + readBBAddrMap(Optional TextSectionIndex = None) const; }; class ELFSectionRef : public SectionRef { @@ -1167,7 +1174,7 @@ uint8_t ELFObjectFile::getBytesInAddress() const { template StringRef ELFObjectFile::getFileFormatName() const { - bool IsLittleEndian = ELFT::TargetEndianness == support::little; + constexpr bool IsLittleEndian = ELFT::TargetEndianness == support::little; switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: switch (EF.getHeader().e_machine) { @@ -1202,6 +1209,8 @@ StringRef ELFObjectFile::getFileFormatName() const { return "elf32-sparc"; case ELF::EM_AMDGPU: return "elf32-amdgpu"; + case ELF::EM_LOONGARCH: + return "elf32-loongarch"; default: return "elf32-unknown"; } @@ -1229,6 +1238,8 @@ StringRef ELFObjectFile::getFileFormatName() const { return "elf64-bpf"; case ELF::EM_VE: return "elf64-ve"; + case ELF::EM_LOONGARCH: + return "elf64-loongarch"; default: return "elf64-unknown"; } @@ -1313,6 +1324,17 @@ template Triple::ArchType ELFObjectFile::getArch() const { return Triple::ve; case ELF::EM_CSKY: return Triple::csky; + + case ELF::EM_LOONGARCH: + switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { + case ELF::ELFCLASS32: + return Triple::loongarch32; + case ELF::ELFCLASS64: + return Triple::loongarch64; + default: + report_fatal_error("Invalid ELFCLASS!"); + } + default: return Triple::UnknownArch; } diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index c674b80c814d..5942b6f1d0a1 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ b/llvm/include/llvm/Object/ELFTypes.h @@ -812,8 +812,20 @@ struct BBAddrMap { : Offset(Offset), Size(Size), HasReturn(Metadata & 1), HasTailCall(Metadata & (1 << 1)), IsEHPad(Metadata & (1 << 2)), CanFallThrough(Metadata & (1 << 3)){}; + + bool operator==(const BBEntry &Other) const { + return Offset == Other.Offset && Size == Other.Size && + HasReturn == Other.HasReturn && HasTailCall == Other.HasTailCall && + IsEHPad == Other.IsEHPad && CanFallThrough == Other.CanFallThrough; + } }; std::vector BBEntries; // Basic block entries for this function. + + // Equality operator for unit testing. + bool operator==(const BBAddrMap &Other) const { + return Addr == Other.Addr && std::equal(BBEntries.begin(), BBEntries.end(), + Other.BBEntries.begin()); + } }; } // end namespace object. 
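The readBBAddrMap entry point added above (whose stripped template arguments read as Expected<std::vector<BBAddrMap>> and Optional<unsigned> upstream) pairs naturally with the new BBAddrMap equality operators. A hedged sketch of dumping the decoded maps, assuming Obj is an ELFObjectFileBase obtained elsewhere:

  Expected<std::vector<object::BBAddrMap>> MapsOrErr = Obj.readBBAddrMap();
  if (!MapsOrErr)
    return MapsOrErr.takeError();
  for (const object::BBAddrMap &Fn : *MapsOrErr)
    outs() << "func @ 0x" << Twine::utohexstr(Fn.Addr) << " has "
           << Fn.BBEntries.size() << " blocks\n";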
diff --git a/llvm/include/llvm/Object/Error.h b/llvm/include/llvm/Object/Error.h index af334fc42658..8875fb6e1a20 100644 --- a/llvm/include/llvm/Object/Error.h +++ b/llvm/include/llvm/Object/Error.h @@ -34,6 +34,7 @@ enum class object_error { invalid_section_index, bitcode_section_not_found, invalid_symbol_index, + section_stripped, }; inline std::error_code make_error_code(object_error e) { diff --git a/llvm/include/llvm/Object/IRObjectFile.h b/llvm/include/llvm/Object/IRObjectFile.h index db47960237a0..6b3f2cd5671c 100644 --- a/llvm/include/llvm/Object/IRObjectFile.h +++ b/llvm/include/llvm/Object/IRObjectFile.h @@ -13,7 +13,6 @@ #ifndef LLVM_OBJECT_IROBJECTFILE_H #define LLVM_OBJECT_IROBJECTFILE_H -#include "llvm/ADT/PointerUnion.h" #include "llvm/Object/IRSymtab.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 49a0706b84be..4ec366055db6 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -260,6 +260,124 @@ private: }; using bind_iterator = content_iterator; +/// ChainedFixupTarget holds all the information about an external symbol +/// necessary to bind this binary to that symbol. These values are referenced +/// indirectly by chained fixup binds. This structure captures values from all +/// import and symbol formats. +/// +/// Be aware there are two notions of weak here: +/// WeakImport == true +/// The associated bind may be set to 0 if this symbol is missing from its +/// parent library. This is called a "weak import." +/// LibOrdinal == BIND_SPECIAL_DYLIB_WEAK_LOOKUP +/// This symbol may be coalesced with other libraries vending the same +/// symbol. E.g., C++'s "operator new". This is called a "weak bind." +struct ChainedFixupTarget { +public: + ChainedFixupTarget(int LibOrdinal, StringRef Symbol, uint64_t Addend, + bool WeakImport) + : LibOrdinal(LibOrdinal), SymbolName(Symbol), Addend(Addend), + WeakImport(WeakImport) {} + + int libOrdinal() { return LibOrdinal; } + StringRef symbolName() { return SymbolName; } + uint64_t addend() { return Addend; } + bool weakImport() { return WeakImport; } + bool weakBind() { + return LibOrdinal == MachO::BIND_SPECIAL_DYLIB_WEAK_LOOKUP; + } + +private: + int LibOrdinal; + StringRef SymbolName; + uint64_t Addend; + bool WeakImport; +}; + +/// MachOAbstractFixupEntry is an abstract class representing a fixup in a +/// MH_DYLDLINK file. Fixups generally represent rebases and binds. Binds also +/// subdivide into additional subtypes (weak, lazy, reexport). +/// +/// The two concrete subclasses of MachOAbstractFixupEntry are: +/// +/// MachORebaseBindEntry - for dyld opcode-based tables, including threaded- +/// rebase, where rebases are mixed in with other +/// bind opcodes. +/// MachOChainedFixupEntry - for pointer chains embedded in data pages. +class MachOAbstractFixupEntry { +public: + MachOAbstractFixupEntry(Error *Err, const MachOObjectFile *O); + + int32_t segmentIndex() const; + uint64_t segmentOffset() const; + uint64_t segmentAddress() const; + StringRef segmentName() const; + StringRef sectionName() const; + StringRef typeName() const; + StringRef symbolName() const; + uint32_t flags() const; + int64_t addend() const; + int ordinal() const; + + /// \return the location of this fixup as a VM Address. For the VM + /// Address this fixup is pointing to, use pointerValue(). + uint64_t address() const; + + /// \return the VM Address pointed to by this fixup. 
Use + /// pointerValue() to compare against other VM Addresses, such as + /// section addresses or segment vmaddrs. + uint64_t pointerValue() const { return PointerValue; } + + /// \return the raw "on-disk" representation of the fixup. For + /// Threaded rebases and Chained pointers these values are generally + /// encoded into various different pointer formats. This value is + /// exposed in API for tools that want to display and annotate the + /// raw bits. + uint64_t rawValue() const { return RawValue; } + + void moveNext(); + +protected: + Error *E; + const MachOObjectFile *O; + uint64_t SegmentOffset = 0; + int32_t SegmentIndex = -1; + StringRef SymbolName; + int32_t Ordinal = 0; + uint32_t Flags = 0; + int64_t Addend = 0; + uint64_t PointerValue = 0; + uint64_t RawValue = 0; + bool Done = false; + + void moveToFirst(); + void moveToEnd(); + + /// \return the vm address of the start of __TEXT segment. + uint64_t textAddress() const { return TextAddress; } + +private: + uint64_t TextAddress; +}; + +class MachOChainedFixupEntry : public MachOAbstractFixupEntry { +public: + enum class FixupKind { All, Bind, WeakBind, Rebase }; + + MachOChainedFixupEntry(Error *Err, const MachOObjectFile *O, bool Parse); + + bool operator==(const MachOChainedFixupEntry &) const; + + void moveNext(); + void moveToFirst(); + void moveToEnd(); + +private: + std::vector FixupTargets; + uint32_t FixupIndex = 0; +}; +using fixup_iterator = content_iterator; + class MachOObjectFile : public ObjectFile { public: struct LoadCommandInfo { @@ -273,6 +391,8 @@ public: create(MemoryBufferRef Object, bool IsLittleEndian, bool Is64Bits, uint32_t UniversalCputype = 0, uint32_t UniversalIndex = 0); + static bool isMachOPairedReloc(uint64_t RelocType, uint64_t Arch); + void moveSymbolNext(DataRefImpl &Symb) const override; uint64_t getNValue(DataRefImpl Sym) const; @@ -402,6 +522,9 @@ public: /// For use iterating over all bind table entries. iterator_range bindTable(Error &Err); + /// For iterating over all chained fixups. + iterator_range fixupTable(Error &Err); + /// For use iterating over all lazy bind table entries. iterator_range lazyBindTable(Error &Err); @@ -562,7 +685,12 @@ public: ArrayRef getDyldInfoBindOpcodes() const; ArrayRef getDyldInfoWeakBindOpcodes() const; ArrayRef getDyldInfoLazyBindOpcodes() const; + /// If the optional is None, no header was found, but the object was well-formed. 
+ Expected> + getChainedFixupsHeader() const; + Expected> getDyldChainedFixupTargets() const; ArrayRef getDyldInfoExportsTrie() const; + SmallVector getFunctionStarts() const; ArrayRef getUuid() const; StringRef getStringTableData() const; @@ -689,6 +817,8 @@ private: const char *DataInCodeLoadCmd = nullptr; const char *LinkOptHintsLoadCmd = nullptr; const char *DyldInfoLoadCmd = nullptr; + const char *FuncStartsLoadCmd = nullptr; + const char *DyldChainedFixupsLoadCmd = nullptr; const char *UuidLoadCmd = nullptr; bool HasPageZeroSegment = false; }; diff --git a/llvm/include/llvm/Object/MachOUniversal.h b/llvm/include/llvm/Object/MachOUniversal.h index e87eb31aad4e..4fe7a68d9680 100644 --- a/llvm/include/llvm/Object/MachOUniversal.h +++ b/llvm/include/llvm/Object/MachOUniversal.h @@ -16,7 +16,6 @@ #include "llvm/ADT/Triple.h" #include "llvm/ADT/iterator_range.h" #include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/Archive.h" #include "llvm/Object/Binary.h" #include "llvm/Object/MachO.h" @@ -25,6 +24,7 @@ class StringRef; class LLVMContext; namespace object { +class Archive; class IRObjectFile; class MachOUniversalBinary : public Binary { diff --git a/llvm/include/llvm/Object/MachOUniversalWriter.h b/llvm/include/llvm/Object/MachOUniversalWriter.h index 8d095766cf48..4004f25f3fb7 100644 --- a/llvm/include/llvm/Object/MachOUniversalWriter.h +++ b/llvm/include/llvm/Object/MachOUniversalWriter.h @@ -14,15 +14,22 @@ #ifndef LLVM_OBJECT_MACHOUNIVERSALWRITER_H #define LLVM_OBJECT_MACHOUNIVERSALWRITER_H -#include "llvm/Object/Archive.h" -#include "llvm/Object/Binary.h" -#include "llvm/Object/MachO.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Error.h" +#include +#include namespace llvm { class LLVMContext; namespace object { +class Archive; +class Binary; class IRObjectFile; +class MachOObjectFile; class Slice { const Binary *B; diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h index 950c38a599d5..8754c229bd4b 100644 --- a/llvm/include/llvm/Object/ObjectFile.h +++ b/llvm/include/llvm/Object/ObjectFile.h @@ -13,7 +13,8 @@ #ifndef LLVM_OBJECT_OBJECTFILE_H #define LLVM_OBJECT_OBJECTFILE_H -#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/iterator_range.h" @@ -24,11 +25,10 @@ #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include #include #include -#include namespace llvm { @@ -170,11 +170,11 @@ class SymbolRef : public BasicSymbolRef { public: enum Type { ST_Unknown, // Type not specified + ST_Other, ST_Data, ST_Debug, ST_File, ST_Function, - ST_Other }; SymbolRef() = default; @@ -350,6 +350,11 @@ public: /// True if this is a relocatable object (.o/.obj). virtual bool isRelocatableObject() const = 0; + /// True if the reflection section can be stripped by the linker. + bool isReflectionSectionStrippable( + llvm::binaryformat::Swift5ReflectionSectionKind ReflectionSectionKind) + const; + /// @returns Pointer to ObjectFile subclass to handle this type of object. /// @param ObjectPath The path to the object file. ObjectPath.isObject must /// return true. 
diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h
new file mode 100644
index 000000000000..5afc3ed295ae
--- /dev/null
+++ b/llvm/include/llvm/Object/OffloadBinary.h
@@ -0,0 +1,169 @@
+//===--- Offloading.h - Utilities for handling offloading code -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the binary format used for bundling device metadata with
+// an associated device image. The data can then be stored inside a host object
+// file to create a fat binary and read by the linker. This is intended to be a
+// thin wrapper around the image itself. If this format becomes sufficiently
+// complex it should be moved to a standard binary format like msgpack or ELF.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINARYFORMAT_OFFLOADING_H
+#define LLVM_BINARYFORMAT_OFFLOADING_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace llvm {
+
+namespace object {
+
+/// The producer of the associated offloading image.
+enum OffloadKind : uint16_t {
+  OFK_None = 0,
+  OFK_OpenMP,
+  OFK_Cuda,
+  OFK_HIP,
+  OFK_LAST,
+};
+
+/// The type of contents the offloading image contains.
+enum ImageKind : uint16_t {
+  IMG_None = 0,
+  IMG_Object,
+  IMG_Bitcode,
+  IMG_Cubin,
+  IMG_Fatbinary,
+  IMG_PTX,
+  IMG_LAST,
+};
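Together with the OffloadBinary class that follows, these enums support a simple write/parse round trip. A hedged sketch (triple, arch, and payload are invented):

  OffloadBinary::OffloadingImage Img;
  Img.TheImageKind = IMG_Cubin;
  Img.TheOffloadKind = OFK_Cuda;
  Img.Flags = 0;
  Img.StringData["triple"] = "nvptx64-nvidia-cuda";
  Img.StringData["arch"] = "sm_70";
  Img.Image = MemoryBuffer::getMemBuffer("<device image bytes>");
  std::unique_ptr<MemoryBuffer> Buf = OffloadBinary::write(Img);
  Expected<std::unique_ptr<OffloadBinary>> OB =
      OffloadBinary::create(Buf->getMemBufferRef());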
+
+/// A simple binary serialization of an offloading file. We use this format to
+/// embed the offloading image into the host executable so it can be extracted
+/// and used by the linker.
+///
+/// Many of these could be stored in the same section by the time the linker
+/// sees it so we mark this information with a header. The version is used to
+/// detect ABI stability and the size is used to find other offloading entries
+/// that may exist in the same section. All offsets are given as absolute byte
+/// offsets from the beginning of the file.
+class OffloadBinary : public Binary {
+public:
+  using string_iterator = StringMap<StringRef>::const_iterator;
+  using string_iterator_range = iterator_range<string_iterator>;
+
+  /// The current version of the binary used for backwards compatibility.
+  static const uint32_t Version = 1;
+
+  /// The offloading metadata that will be serialized to a memory buffer.
+  struct OffloadingImage {
+    ImageKind TheImageKind;
+    OffloadKind TheOffloadKind;
+    uint32_t Flags;
+    StringMap<StringRef> StringData;
+    std::unique_ptr<MemoryBuffer> Image;
+  };
+
+  /// Attempt to parse the offloading binary stored in \p Data.
+  static Expected<std::unique_ptr<OffloadBinary>> create(MemoryBufferRef);
+
+  /// Serialize the contents of \p File to a binary buffer to be read later.
+  static std::unique_ptr<MemoryBuffer> write(const OffloadingImage &);
+
+  static uint64_t getAlignment() { return alignof(Header); }
+
+  ImageKind getImageKind() const { return TheEntry->TheImageKind; }
+  OffloadKind getOffloadKind() const { return TheEntry->TheOffloadKind; }
+  uint32_t getVersion() const { return TheHeader->Version; }
+  uint32_t getFlags() const { return TheEntry->Flags; }
+  uint64_t getSize() const { return TheHeader->Size; }
+
+  StringRef getTriple() const { return getString("triple"); }
+  StringRef getArch() const { return getString("arch"); }
+  StringRef getImage() const {
+    return StringRef(&Buffer[TheEntry->ImageOffset], TheEntry->ImageSize);
+  }
+
+  // Iterator over all the key and value pairs in the binary.
+  string_iterator_range strings() const {
+    return string_iterator_range(StringData.begin(), StringData.end());
+  }
+
+  StringRef getString(StringRef Key) const { return StringData.lookup(Key); }
+
+  static bool classof(const Binary *V) { return V->isOffloadFile(); }
+
+  struct Header {
+    uint8_t Magic[4] = {0x10, 0xFF, 0x10, 0xAD}; // 0x10FF10AD magic bytes.
+    uint32_t Version = OffloadBinary::Version;   // Version identifier.
+    uint64_t Size;        // Size in bytes of this entire binary.
+    uint64_t EntryOffset; // Offset of the metadata entry in bytes.
+    uint64_t EntrySize;   // Size of the metadata entry in bytes.
+  };
+
+  struct Entry {
+    ImageKind TheImageKind;     // The kind of the image stored.
+    OffloadKind TheOffloadKind; // The producer of this image.
+    uint32_t Flags;             // Additional flags associated with the image.
+    uint64_t StringOffset;      // Offset in bytes to the string map.
+    uint64_t NumStrings;        // Number of entries in the string map.
+    uint64_t ImageOffset;       // Offset in bytes of the actual binary image.
+    uint64_t ImageSize;         // Size in bytes of the binary image.
+  };
+
+  struct StringEntry {
+    uint64_t KeyOffset;
+    uint64_t ValueOffset;
+  };
+
+private:
+  OffloadBinary(MemoryBufferRef Source, const Header *TheHeader,
+                const Entry *TheEntry)
+      : Binary(Binary::ID_Offload, Source), Buffer(Source.getBufferStart()),
+        TheHeader(TheHeader), TheEntry(TheEntry) {
+    const StringEntry *StringMapBegin =
+        reinterpret_cast<const StringEntry *>(&Buffer[TheEntry->StringOffset]);
+    for (uint64_t I = 0, E = TheEntry->NumStrings; I != E; ++I) {
+      StringRef Key = &Buffer[StringMapBegin[I].KeyOffset];
+      StringData[Key] = &Buffer[StringMapBegin[I].ValueOffset];
+    }
+  }
+
+  OffloadBinary(const OffloadBinary &Other) = delete;
+
+  /// Map from keys to offsets in the binary.
+  StringMap<StringRef> StringData;
+  /// Raw pointer to the MemoryBufferRef for convenience.
+  const char *Buffer;
+  /// Location of the header within the binary.
+  const Header *TheHeader;
+  /// Location of the metadata entries within the binary.
+  const Entry *TheEntry;
+};
+
+/// Convert a string \p Name to an image kind.
+ImageKind getImageKind(StringRef Name);
+
+/// Convert an image kind to its string representation.
+StringRef getImageKindName(ImageKind Name);
+
+/// Convert a string \p Name to an offload kind.
+OffloadKind getOffloadKind(StringRef Name);
+
+/// Convert an offload kind to its string representation.
+StringRef getOffloadKindName(OffloadKind Name); + +} // namespace object + +} // namespace llvm +#endif diff --git a/llvm/include/llvm/Object/RelocationResolver.h b/llvm/include/llvm/Object/RelocationResolver.h index d3b604018e89..2acdf5ed2fe1 100644 --- a/llvm/include/llvm/Object/RelocationResolver.h +++ b/llvm/include/llvm/Object/RelocationResolver.h @@ -15,22 +15,15 @@ #ifndef LLVM_OBJECT_RELOCATIONRESOLVER_H #define LLVM_OBJECT_RELOCATIONRESOLVER_H -#include "llvm/ADT/Triple.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/COFF.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Object/Wasm.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" #include -#include +#include namespace llvm { namespace object { +class ObjectFile; +class RelocationRef; + using SupportsRelocation = bool (*)(uint64_t); using RelocationResolver = uint64_t (*)(uint64_t Type, uint64_t Offset, uint64_t S, uint64_t LocData, diff --git a/llvm/include/llvm/Object/SymbolicFile.h b/llvm/include/llvm/Object/SymbolicFile.h index 284302c5e042..ea51afce5d2a 100644 --- a/llvm/include/llvm/Object/SymbolicFile.h +++ b/llvm/include/llvm/Object/SymbolicFile.h @@ -13,21 +13,23 @@ #ifndef LLVM_OBJECT_SYMBOLICFILE_H #define LLVM_OBJECT_SYMBOLICFILE_H -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/Binary.h" #include "llvm/Support/Error.h" #include "llvm/Support/Format.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include #include #include #include #include -#include namespace llvm { + +class LLVMContext; +class raw_ostream; + namespace object { union DataRefImpl { diff --git a/llvm/include/llvm/Object/TapiFile.h b/llvm/include/llvm/Object/TapiFile.h index ffa27fdf9654..410e58dceaf4 100644 --- a/llvm/include/llvm/Object/TapiFile.h +++ b/llvm/include/llvm/Object/TapiFile.h @@ -14,13 +14,22 @@ #define LLVM_OBJECT_TAPIFILE_H #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" +#include "llvm/Object/Binary.h" #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/TextAPI/InterfaceFile.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/TextAPI/Architecture.h" namespace llvm { + +class raw_ostream; + +namespace MachO { + +class InterfaceFile; + +} + namespace object { class TapiFile : public SymbolicFile { diff --git a/llvm/include/llvm/Object/TapiUniversal.h b/llvm/include/llvm/Object/TapiUniversal.h index ab548aa5bb2a..fff66c28c1a4 100644 --- a/llvm/include/llvm/Object/TapiUniversal.h +++ b/llvm/include/llvm/Object/TapiUniversal.h @@ -13,16 +13,18 @@ #ifndef LLVM_OBJECT_TAPIUNIVERSAL_H #define LLVM_OBJECT_TAPIUNIVERSAL_H +#include "llvm/ADT/StringRef.h" #include "llvm/Object/Binary.h" -#include "llvm/Object/TapiFile.h" #include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include "llvm/TextAPI/Architecture.h" #include "llvm/TextAPI/InterfaceFile.h" namespace llvm { namespace object { +class TapiFile; + class TapiUniversal : public Binary { public: class ObjectForArch { diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h index e4802c087b8b..abe0f6f528cc 100644 --- a/llvm/include/llvm/Object/Wasm.h +++ b/llvm/include/llvm/Object/Wasm.h @@ -287,7 +287,6 @@ private: uint32_t 
StartFunction = -1; bool HasLinkingSection = false; bool HasDylinkSection = false; - bool SeenCodeSection = false; bool HasMemory64 = false; wasm::WasmLinkingData LinkingData; uint32_t NumImportedGlobals = 0; diff --git a/llvm/include/llvm/Object/WindowsResource.h b/llvm/include/llvm/Object/WindowsResource.h index b8fad299c693..acda9e2659b1 100644 --- a/llvm/include/llvm/Object/WindowsResource.h +++ b/llvm/include/llvm/Object/WindowsResource.h @@ -31,7 +31,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/Binary.h" -#include "llvm/Object/COFF.h" #include "llvm/Object/Error.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" @@ -50,6 +49,7 @@ namespace object { class WindowsResource; class ResourceSectionRef; +struct coff_resource_dir_table; const size_t WIN_RES_MAGIC_SIZE = 16; const size_t WIN_RES_NULL_ENTRY_SIZE = 16; diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h index ac911e534f34..68d9afff887c 100644 --- a/llvm/include/llvm/Object/XCOFFObjectFile.h +++ b/llvm/include/llvm/Object/XCOFFObjectFile.h @@ -60,10 +60,13 @@ public: return static_cast(this)->FlagAndTDataAlignment & AuxiHeaderFlagMask; } + uint8_t getTDataAlignment() const { return static_cast(this)->FlagAndTDataAlignment & AuxiHeaderTDataAlignmentMask; } + + uint16_t getVersion() const { return static_cast(this)->Version; } }; struct XCOFFAuxiliaryHeader32 : XCOFFAuxiliaryHeader { @@ -113,7 +116,7 @@ struct XCOFFAuxiliaryHeader32 : XCOFFAuxiliaryHeader { support::ubig16_t SecNumOfTBSS; }; -struct XCOFFAuxiliaryHeader64 : XCOFFAuxiliaryHeader { +struct XCOFFAuxiliaryHeader64 : XCOFFAuxiliaryHeader { support::ubig16_t AuxMagic; support::ubig16_t Version; support::ubig32_t ReservedForDebugger; @@ -448,9 +451,6 @@ private: const void *SymbolTblPtr = nullptr; XCOFFStringTable StringTable = {0, nullptr}; - const XCOFFFileHeader32 *fileHeader32() const; - const XCOFFFileHeader64 *fileHeader64() const; - const XCOFFSectionHeader32 *sectionHeaderTable32() const; const XCOFFSectionHeader64 *sectionHeaderTable64() const; template const T *sectionHeaderTable() const; @@ -548,6 +548,8 @@ public: // Below here is the non-inherited interface. bool is64Bit() const; + Expected getRawData(const char *Start, uint64_t Size, + StringRef Name) const; const XCOFFAuxiliaryHeader32 *auxiliaryHeader32() const; const XCOFFAuxiliaryHeader64 *auxiliaryHeader64() const; @@ -559,6 +561,8 @@ public: XCOFFSymbolRef toSymbolRef(DataRefImpl Ref) const; // File header related interfaces. + const XCOFFFileHeader32 *fileHeader32() const; + const XCOFFFileHeader64 *fileHeader64() const; uint16_t getMagic() const; uint16_t getNumberOfSections() const; int32_t getTimeStamp() const; @@ -687,6 +691,9 @@ public: Entry32 = reinterpret_cast(SymEntDataRef.p); } + const XCOFFSymbolEntry32 *getSymbol32() { return Entry32; } + const XCOFFSymbolEntry64 *getSymbol64() { return Entry64; } + uint64_t getValue() const { return Entry32 ? getValue32() : getValue64(); } uint32_t getValue32() const { return Entry32->Value; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h new file mode 100644 index 000000000000..d1c0cd912d97 --- /dev/null +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -0,0 +1,101 @@ +//===- DXContainerYAML.h - DXContainer YAMLIO implementation ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares classes for handling the YAML representation +/// of DXContainer. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_DXCONTAINERYAML_H +#define LLVM_OBJECTYAML_DXCONTAINERYAML_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/YAMLTraits.h" +#include +#include +#include + +namespace llvm { +namespace DXContainerYAML { + +struct VersionTuple { + uint16_t Major; + uint16_t Minor; +}; + +// The optional header fields are required in the binary and will be populated +// when reading from binary, but can be omitted in the YAML text because the +// emitter can calculate them. +struct FileHeader { + std::vector Hash; + VersionTuple Version; + Optional FileSize; + uint32_t PartCount; + Optional> PartOffsets; +}; + +struct DXILProgram { + uint8_t MajorVersion; + uint8_t MinorVersion; + uint16_t ShaderKind; + Optional Size; + uint16_t DXILMajorVersion; + uint16_t DXILMinorVersion; + Optional DXILOffset; + Optional DXILSize; + Optional> DXIL; +}; + +struct Part { + std::string Name; + uint32_t Size; + Optional Program; +}; + +struct Object { + FileHeader Header; + std::vector Parts; +}; + +} // namespace DXContainerYAML +} // namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::Part) +namespace llvm { + +class raw_ostream; + +namespace yaml { + +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::FileHeader &Header); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::Part &Version); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::Object &Obj); +}; + +} // namespace yaml + +} // namespace llvm + +#endif // LLVM_OBJECTYAML_DXCONTAINERYAML_H diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index 92a9f78ce7bf..ddd5dd9cf3c9 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -161,6 +161,8 @@ struct BBAddrMapEntry { llvm::yaml::Hex64 Size; llvm::yaml::Hex64 Metadata; }; + uint8_t Version; + llvm::yaml::Hex8 Feature; llvm::yaml::Hex64 Address; Optional NumBlocks; Optional> BBEntries; @@ -317,7 +319,7 @@ struct BBAddrMapSection : Section { BBAddrMapSection() : Section(ChunkKind::BBAddrMap) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -331,7 +333,7 @@ struct StackSizesSection : Section { StackSizesSection() : Section(ChunkKind::StackSizes) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -349,7 +351,7 @@ struct DynamicSection : Section { DynamicSection() : Section(ChunkKind::Dynamic) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { return 
S->Kind == ChunkKind::Dynamic; } @@ -380,7 +382,7 @@ struct NoteSection : Section { NoteSection() : Section(ChunkKind::Note) {} std::vector> getEntries() const override { - return {{"Notes", Notes.hasValue()}}; + return {{"Notes", Notes.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Note; } @@ -391,7 +393,7 @@ struct HashSection : Section { Optional> Chain; std::vector> getEntries() const override { - return {{"Bucket", Bucket.hasValue()}, {"Chain", Chain.hasValue()}}; + return {{"Bucket", Bucket.has_value()}, {"Chain", Chain.has_value()}}; }; // The following members are used to override section fields. @@ -433,10 +435,10 @@ struct GnuHashSection : Section { GnuHashSection() : Section(ChunkKind::GnuHash) {} std::vector> getEntries() const override { - return {{"Header", Header.hasValue()}, - {"BloomFilter", BloomFilter.hasValue()}, - {"HashBuckets", HashBuckets.hasValue()}, - {"HashValues", HashValues.hasValue()}}; + return {{"Header", Header.has_value()}, + {"BloomFilter", BloomFilter.has_value()}, + {"HashBuckets", HashBuckets.has_value()}, + {"HashValues", HashValues.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::GnuHash; } @@ -462,7 +464,7 @@ struct VerneedSection : Section { VerneedSection() : Section(ChunkKind::Verneed) {} std::vector> getEntries() const override { - return {{"Dependencies", VerneedV.hasValue()}}; + return {{"Dependencies", VerneedV.has_value()}}; }; static bool classof(const Chunk *S) { @@ -476,7 +478,7 @@ struct AddrsigSection : Section { AddrsigSection() : Section(ChunkKind::Addrsig) {} std::vector> getEntries() const override { - return {{"Symbols", Symbols.hasValue()}}; + return {{"Symbols", Symbols.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Addrsig; } @@ -493,7 +495,7 @@ struct LinkerOptionsSection : Section { LinkerOptionsSection() : Section(ChunkKind::LinkerOptions) {} std::vector> getEntries() const override { - return {{"Options", Options.hasValue()}}; + return {{"Options", Options.has_value()}}; }; static bool classof(const Chunk *S) { @@ -507,7 +509,7 @@ struct DependentLibrariesSection : Section { DependentLibrariesSection() : Section(ChunkKind::DependentLibraries) {} std::vector> getEntries() const override { - return {{"Libraries", Libs.hasValue()}}; + return {{"Libraries", Libs.has_value()}}; }; static bool classof(const Chunk *S) { @@ -527,7 +529,7 @@ struct CallGraphProfileSection : Section { CallGraphProfileSection() : Section(ChunkKind::CallGraphProfile) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -541,7 +543,7 @@ struct SymverSection : Section { SymverSection() : Section(ChunkKind::Symver) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Symver; } @@ -562,7 +564,7 @@ struct VerdefSection : Section { VerdefSection() : Section(ChunkKind::Verdef) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Verdef; } @@ -577,7 +579,7 @@ struct GroupSection : Section { GroupSection() : Section(ChunkKind::Group) {} std::vector> getEntries() const override { - return {{"Members", Members.hasValue()}}; + return 
{{"Members", Members.has_value()}}; }; static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Group; } @@ -597,7 +599,7 @@ struct RelocationSection : Section { RelocationSection() : Section(ChunkKind::Relocation) {} std::vector> getEntries() const override { - return {{"Relocations", Relocations.hasValue()}}; + return {{"Relocations", Relocations.has_value()}}; }; static bool classof(const Chunk *S) { @@ -611,7 +613,7 @@ struct RelrSection : Section { RelrSection() : Section(ChunkKind::Relr) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -625,7 +627,7 @@ struct SymtabShndxSection : Section { SymtabShndxSection() : Section(ChunkKind::SymtabShndxSection) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -644,7 +646,7 @@ struct ARMIndexTableSection : Section { ARMIndexTableSection() : Section(ChunkKind::ARMIndexTable) {} std::vector> getEntries() const override { - return {{"Entries", Entries.hasValue()}}; + return {{"Entries", Entries.has_value()}}; }; static bool classof(const Chunk *S) { @@ -720,6 +722,7 @@ struct Object { llvm_unreachable("the section header table chunk must always be present"); } + ELF_ELFOSABI getOSAbi() const; unsigned getMachine() const; }; diff --git a/llvm/include/llvm/ObjectYAML/MachOYAML.h b/llvm/include/llvm/ObjectYAML/MachOYAML.h index 38a7de3d6131..095377c1b824 100644 --- a/llvm/include/llvm/ObjectYAML/MachOYAML.h +++ b/llvm/include/llvm/ObjectYAML/MachOYAML.h @@ -122,6 +122,7 @@ struct LinkEditData { std::vector NameList; std::vector StringTable; std::vector IndirectSymbols; + std::vector FunctionStarts; bool isEmpty() const; }; diff --git a/llvm/include/llvm/ObjectYAML/ObjectYAML.h b/llvm/include/llvm/ObjectYAML/ObjectYAML.h index 312777aadd4c..b63607e6796b 100644 --- a/llvm/include/llvm/ObjectYAML/ObjectYAML.h +++ b/llvm/include/llvm/ObjectYAML/ObjectYAML.h @@ -11,9 +11,11 @@ #include "llvm/ObjectYAML/ArchiveYAML.h" #include "llvm/ObjectYAML/COFFYAML.h" +#include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ObjectYAML/ELFYAML.h" #include "llvm/ObjectYAML/MachOYAML.h" #include "llvm/ObjectYAML/MinidumpYAML.h" +#include "llvm/ObjectYAML/OffloadYAML.h" #include "llvm/ObjectYAML/WasmYAML.h" #include "llvm/ObjectYAML/XCOFFYAML.h" #include "llvm/Support/YAMLTraits.h" @@ -31,8 +33,10 @@ struct YamlObjectFile { std::unique_ptr MachO; std::unique_ptr FatMachO; std::unique_ptr Minidump; + std::unique_ptr Offload; std::unique_ptr Wasm; std::unique_ptr Xcoff; + std::unique_ptr DXContainer; }; template <> struct MappingTraits { diff --git a/llvm/include/llvm/ObjectYAML/OffloadYAML.h b/llvm/include/llvm/ObjectYAML/OffloadYAML.h new file mode 100644 index 000000000000..a4fdbce0b320 --- /dev/null +++ b/llvm/include/llvm/ObjectYAML/OffloadYAML.h @@ -0,0 +1,79 @@ +//===- OffloadYAML.h - Offload Binary YAMLIO implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares classes for handling the YAML representation of +/// offloading binaries. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_OFFLOADYAML_H +#define LLVM_OBJECTYAML_OFFLOADYAML_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace OffloadYAML { + +struct Binary { + struct StringEntry { + StringRef Key; + StringRef Value; + }; + + struct Member { + Optional ImageKind; + Optional OffloadKind; + Optional Flags; + Optional> StringEntries; + Optional Content; + }; + + Optional Version; + Optional Size; + Optional EntryOffset; + Optional EntrySize; + std::vector Members; +}; + +} // end namespace OffloadYAML +} // end namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::OffloadYAML::Binary::Member) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::OffloadYAML::Binary::StringEntry) + +namespace llvm { +namespace yaml { + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, object::ImageKind &Value); +}; + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, object::OffloadKind &Value); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, OffloadYAML::Binary &O); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, OffloadYAML::Binary::StringEntry &M); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, OffloadYAML::Binary::Member &M); +}; + +} // end namespace yaml +} // end namespace llvm + +#endif // LLVM_OBJECTYAML_ARCHIVEYAML_H diff --git a/llvm/include/llvm/ObjectYAML/WasmYAML.h b/llvm/include/llvm/ObjectYAML/WasmYAML.h index e3a1ba0d58a6..0f6c4f06665f 100644 --- a/llvm/include/llvm/ObjectYAML/WasmYAML.h +++ b/llvm/include/llvm/ObjectYAML/WasmYAML.h @@ -62,11 +62,20 @@ struct Export { uint32_t Index; }; +struct InitExpr { + InitExpr() {} + bool Extended; + union { + wasm::WasmInitExprMVP Inst; + yaml::BinaryRef Body; + }; +}; + struct ElemSegment { uint32_t Flags; uint32_t TableNumber; ValueType ElemKind; - wasm::WasmInitExpr Offset; + InitExpr Offset; std::vector Functions; }; @@ -74,19 +83,20 @@ struct Global { uint32_t Index; ValueType Type; bool Mutable; - wasm::WasmInitExpr InitExpr; + InitExpr Init; }; struct Import { + Import() {} StringRef Module; StringRef Field; ExportKind Kind; union { uint32_t SigIndex; - Global GlobalImport; Table TableImport; Limits Memory; uint32_t TagIndex; + Global GlobalImport; }; }; @@ -114,7 +124,7 @@ struct DataSegment { uint32_t SectionOffset; uint32_t InitFlags; uint32_t MemoryIndex; - wasm::WasmInitExpr Offset; + InitExpr Offset; yaml::BinaryRef Content; }; @@ -526,8 +536,8 @@ template <> struct MappingTraits { static void mapping(IO &IO, WasmYAML::LocalDecl &LocalDecl); }; -template <> struct MappingTraits { - static void mapping(IO &IO, wasm::WasmInitExpr &Expr); +template <> struct MappingTraits { + static void mapping(IO &IO, WasmYAML::InitExpr &Expr); }; template <> struct MappingTraits { diff --git a/llvm/include/llvm/ObjectYAML/yaml2obj.h b/llvm/include/llvm/ObjectYAML/yaml2obj.h index 468f673fd451..000da077bb18 100644 --- a/llvm/include/llvm/ObjectYAML/yaml2obj.h +++ b/llvm/include/llvm/ObjectYAML/yaml2obj.h @@ -36,6 +36,10 @@ namespace MinidumpYAML { struct Object; } +namespace OffloadYAML { +struct Binary; +} + namespace WasmYAML { struct Object; } @@ -48,6 +52,10 @@ namespace ArchYAML { struct Archive; } +namespace DXContainerYAML { +struct Object; +} // namespace DXContainerYAML + namespace yaml { class Input; struct 
YamlObjectFile; @@ -61,8 +69,11 @@ bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH, bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH); bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); +bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out, ErrorHandler EH); bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); +bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX); diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index 74897de52a93..6a07e1c657dc 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -298,14 +298,24 @@ public: /// true if the option is present, false if the negation is present, and /// \p Default if neither option is given. If both the option and its /// negation are present, the last one wins. - bool hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default=true) const; + bool hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const; /// hasFlag - Given an option \p Pos, an alias \p PosAlias and its negative /// form \p Neg, return true if the option or its alias is present, false if /// the negation is present, and \p Default if none of the options are /// given. If multiple options are present, the last one wins. bool hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg, - bool Default = true) const; + bool Default) const; + + /// Given an option Pos and its negative form Neg, render the option if Pos is + /// present. + void addOptInFlag(ArgStringList &Output, OptSpecifier Pos, + OptSpecifier Neg) const; + /// Render the option if Neg is present. + void addOptOutFlag(ArgStringList &Output, OptSpecifier Pos, + OptSpecifier Neg) const { + addOptInFlag(Output, Neg, Pos); + } /// Render only the last argument match \p Id0, if present. template diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h index 8aa9ba90a9ca..6445e16ab68f 100644 --- a/llvm/include/llvm/Pass.h +++ b/llvm/include/llvm/Pass.h @@ -228,6 +228,16 @@ public: template AnalysisType &getAnalysisID(AnalysisID PI, Function &F, bool *Changed = nullptr); + +#ifdef EXPENSIVE_CHECKS + /// Hash a module in order to detect when a module (or more specific) pass has + /// modified it. + uint64_t structuralHash(Module &M) const; + + /// Hash a function in order to detect when a function (or more specific) pass + /// has modified it. + virtual uint64_t structuralHash(Function &F) const; +#endif }; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 66b0b149fa25..0cbbdf7f3ce8 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -215,8 +215,9 @@ public: /// only intended for use when attempting to optimize code. If frontends /// require some transformations for semantic reasons, they should explicitly /// build them. - ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level, - bool LTOPreLink = false); + ModulePassManager + buildModuleOptimizationPipeline(OptimizationLevel Level, + ThinOrFullLTOPhase LTOPhase); /// Build a per-module default optimization pipeline. 
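(Aside: the ArgList hunk above folds the common "query hasFlag, then render by hand" pattern into addOptInFlag/addOptOutFlag, and makes the Default parameter explicit at every hasFlag call site. A hedged sketch of both styles; OPT_ffoo and OPT_fno_foo are placeholder option IDs, a real driver takes these from its generated Options.inc table.)

#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptSpecifier.h"

// Placeholder option IDs for illustration only.
const llvm::opt::OptSpecifier OPT_ffoo(1);
const llvm::opt::OptSpecifier OPT_fno_foo(2);

// Before: query the flag pair, then render the positive flag by hand.
void renderManually(const llvm::opt::ArgList &Args,
                    llvm::opt::ArgStringList &CmdArgs) {
  if (Args.hasFlag(OPT_ffoo, OPT_fno_foo, /*Default=*/false))
    CmdArgs.push_back("-ffoo");
}

// After: one call renders the option when the positive form is in effect;
// addOptOutFlag mirrors it for "render only when the negative form wins".
void renderWithHelper(const llvm::opt::ArgList &Args,
                      llvm::opt::ArgStringList &CmdArgs) {
  Args.addOptInFlag(CmdArgs, OPT_ffoo, OPT_fno_foo);
}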
/// @@ -468,6 +469,15 @@ public: PipelineEarlySimplificationEPCallbacks.push_back(C); } + /// Register a callback for a default optimizer pipeline extension point + /// + /// This extension point allows adding optimizations before the function + /// optimization pipeline. + void registerOptimizerEarlyEPCallback( + const std::function &C) { + OptimizerEarlyEPCallbacks.push_back(C); + } + /// Register a callback for a default optimizer pipeline extension point /// /// This extension point allows adding optimizations at the very end of the @@ -477,6 +487,24 @@ public: OptimizerLastEPCallbacks.push_back(C); } + /// Register a callback for a default optimizer pipeline extension point + /// + /// This extension point allows adding optimizations at the start of the full + /// LTO pipeline. + void registerFullLinkTimeOptimizationEarlyEPCallback( + const std::function &C) { + FullLinkTimeOptimizationEarlyEPCallbacks.push_back(C); + } + + /// Register a callback for a default optimizer pipeline extension point + /// + /// This extension point allows adding optimizations at the end of the full + /// LTO pipeline. + void registerFullLinkTimeOptimizationLastEPCallback( + const std::function &C) { + FullLinkTimeOptimizationLastEPCallbacks.push_back(C); + } + /// Register a callback for parsing an AliasAnalysis Name to populate /// the given AAManager \p AA void registerParseAACallback( @@ -582,7 +610,8 @@ private: void addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level, bool RunProfileGen, bool IsCS, std::string ProfileFile, - std::string ProfileRemappingFile); + std::string ProfileRemappingFile, + ThinOrFullLTOPhase LTOPhase); void invokePeepholeEPCallbacks(FunctionPassManager &, OptimizationLevel); // Extension Point callbacks @@ -598,9 +627,15 @@ private: CGSCCOptimizerLateEPCallbacks; SmallVector, 2> VectorizerStartEPCallbacks; + // Module callbacks + SmallVector, 2> + OptimizerEarlyEPCallbacks; SmallVector, 2> OptimizerLastEPCallbacks; - // Module callbacks + SmallVector, 2> + FullLinkTimeOptimizationEarlyEPCallbacks; + SmallVector, 2> + FullLinkTimeOptimizationLastEPCallbacks; SmallVector, 2> PipelineStartEPCallbacks; SmallVector, 2> diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 561cd54fa998..32ecc9ec5fb0 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -187,17 +187,6 @@ protected: // Register required callbacks. void registerRequiredCallbacks(PassInstrumentationCallbacks &PIC); - // Return true when this is a defined function for which printing - // of changes is desired. - bool isInterestingFunction(const Function &F); - - // Return true when this is a pass for which printing of changes is desired. - bool isInterestingPass(StringRef PassID); - - // Return true when this is a pass on IR for which printing - // of changes is desired. - bool isInteresting(Any IR, StringRef PassID); - // Called on the first IR processed. virtual void handleInitialIR(Any IR) = 0; // Called before and after a pass to get the representation of the IR. @@ -491,6 +480,25 @@ protected: std::unique_ptr HTML; }; +// Print IR on crash. 
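(Aside: the new extension points above follow the existing registration idiom; each callback receives the pass manager for that point plus the optimization level. A minimal sketch of hooking the new OptimizerEarlyEP point from a plugin, assuming LLVM 15 headers; VerifierPass stands in for any module pass.)

#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"

void hookOptimizerEarly(llvm::PassBuilder &PB) {
  PB.registerOptimizerEarlyEPCallback(
      [](llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level) {
        // Queued to run before the function optimization pipeline at every -O level.
        MPM.addPass(llvm::VerifierPass());
      });
}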
+class PrintCrashIRInstrumentation { +public: + PrintCrashIRInstrumentation() + : SavedIR("*** Dump of IR Before Last Pass Unknown ***") {} + ~PrintCrashIRInstrumentation(); + void registerCallbacks(PassInstrumentationCallbacks &PIC); + void reportCrashIR(); + +protected: + std::string SavedIR; + +private: + // The crash reporter that will report on a crash. + static PrintCrashIRInstrumentation *CrashReporter; + // Crash handler registered when print-on-crash is specified. + static void SignalHandler(void *); +}; + /// This class provides an interface to register all the standard pass /// instrumentations and manages their state (if any). class StandardInstrumentations { @@ -504,6 +512,7 @@ class StandardInstrumentations { PseudoProbeVerifier PseudoProbeVerification; InLineChangePrinter PrintChangedDiff; DotCfgChangeReporter WebsiteChangeReporter; + PrintCrashIRInstrumentation PrintCrashIR; VerifyInstrumentation Verify; bool VerifyEach; diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index e1f45019b1a9..e35751512245 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -195,11 +195,11 @@ public: ArrayRef getExpressions() const { return Expressions; } /// Return a counter that represents the expression that adds LHS and RHS. - Counter add(Counter LHS, Counter RHS); + Counter add(Counter LHS, Counter RHS, bool Simplify = true); /// Return a counter that represents the expression that subtracts RHS from /// LHS. - Counter subtract(Counter LHS, Counter RHS); + Counter subtract(Counter LHS, Counter RHS, bool Simplify = true); }; using LineColPair = std::pair; diff --git a/llvm/include/llvm/ProfileData/GCOV.h b/llvm/include/llvm/ProfileData/GCOV.h index ef6515d39144..fe56f84f28b6 100644 --- a/llvm/include/llvm/ProfileData/GCOV.h +++ b/llvm/include/llvm/ProfileData/GCOV.h @@ -14,9 +14,7 @@ #ifndef LLVM_PROFILEDATA_GCOV_H #define LLVM_PROFILEDATA_GCOV_H -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -26,10 +24,8 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include -#include #include #include -#include #include #include #include diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a416eb28906e..401d278cbd06 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -281,13 +281,21 @@ bool needsComdatForCounter(const Function &F, const Module &M); /// An enum describing the attributes of an instrumented profile. enum class InstrProfKind { Unknown = 0x0, - FE = 0x1, // A frontend clang profile, incompatible with other attrs. - IR = 0x2, // An IR-level profile (default when -fprofile-generate is used). - BB = 0x4, // A profile with entry basic block instrumentation. - CS = 0x8, // A context sensitive IR-level profile. - SingleByteCoverage = 0x10, // Use single byte probes for coverage. - FunctionEntryOnly = 0x20, // Only instrument the function entry basic block. - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionEntryOnly) + // A frontend clang profile, incompatible with other attrs. + FrontendInstrumentation = 0x1, + // An IR-level profile (default when -fprofile-generate is used). 
+ IRInstrumentation = 0x2, + // A profile with entry basic block instrumentation. + FunctionEntryInstrumentation = 0x4, + // A context sensitive IR-level profile. + ContextSensitive = 0x8, + // Use single byte probes for coverage. + SingleByteCoverage = 0x10, + // Only instrument the function entry basic block. + FunctionEntryOnly = 0x20, + // A memory profile collected using -fprofile=memory. + MemProf = 0x40, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/MemProf) }; const std::error_category &instrprof_category(); @@ -1011,7 +1019,9 @@ enum ProfVersion { Version6 = 6, // An additional counter is added around logical operators. Version7 = 7, - // The current version is 7. + // An additional (optional) memory profile type is added. + Version8 = 8, + // The current version is 8. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1028,6 +1038,21 @@ struct Header { uint64_t Unused; // Becomes unused since version 4 uint64_t HashType; uint64_t HashOffset; + uint64_t MemProfOffset; + // New fields should only be added at the end to ensure that the size + // computation is correct. The methods below need to be updated to ensure that + // the new field is read correctly. + + // Reads a header struct from the buffer. + static Expected
readFromBuffer(const unsigned char *Buffer); + + // Returns the size of the header in bytes for all valid fields based on the + // version. I.e a older version header will return a smaller size. + size_t size() const; + + // Returns the format version in little endian. The header retains the version + // in native endian of the compiler runtime. + uint64_t formatVersion() const; }; // Profile summary data recorded in the profile data file in indexed diff --git a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h index 3d0076fd9035..79995c813266 100644 --- a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h +++ b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h @@ -13,16 +13,17 @@ #define LLVM_PROFILEDATA_INSTRPROFCORRELATOR_H #include "llvm/ADT/DenseSet.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/Object/Binary.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include namespace llvm { +class DWARFContext; +class DWARFDie; +namespace object { +class ObjectFile; +} /// InstrProfCorrelator - A base class used to create raw instrumentation data /// to their functions. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 62054a6a3df5..282620d8b5dc 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 7 +#define INSTR_PROF_INDEX_VERSION 8 /* Coverage mapping format version (start from 0). */ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,6 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. + * The 62nd bit indicates whether memory profile information is present. */ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -671,6 +672,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) +#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index e9dd19a69792..3a25de05bbf1 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -19,6 +19,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" @@ -39,25 +40,36 @@ namespace llvm { class InstrProfReader; /// A file format agnostic iterator over profiling data. 
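(Aside: a sketch of how tools consume the templated iterator defined just below, through the reader's begin()/end() pair; assumes the LLVM 15 reader API and abbreviates error handling.)

#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

llvm::Error dumpFunctionCounts(const llvm::Twine &Path) {
  auto ReaderOrErr = llvm::InstrProfReader::create(Path);
  if (!ReaderOrErr)
    return ReaderOrErr.takeError();
  std::unique_ptr<llvm::InstrProfReader> Reader = std::move(*ReaderOrErr);
  // Range-for uses InstrProfIterator<> and readNextRecord() under the hood.
  for (llvm::NamedInstrProfRecord &Record : *Reader)
    llvm::outs() << Record.Name << ": " << Record.Counts.size()
                 << " counters\n";
  return llvm::Error::success();
}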
+template <class record_type = NamedInstrProfRecord,
+          class reader_type = InstrProfReader>
 class InstrProfIterator {
 public:
   using iterator_category = std::input_iterator_tag;
-  using value_type = NamedInstrProfRecord;
+  using value_type = record_type;
   using difference_type = std::ptrdiff_t;
   using pointer = value_type *;
   using reference = value_type &;

 private:
-  InstrProfReader *Reader = nullptr;
+  reader_type *Reader = nullptr;
   value_type Record;

-  void Increment();
+  void increment() {
+    if (Error E = Reader->readNextRecord(Record)) {
+      // Handle errors in the reader.
+      InstrProfError::take(std::move(E));
+      *this = InstrProfIterator();
+    }
+  }

 public:
   InstrProfIterator() = default;
-  InstrProfIterator(InstrProfReader *Reader) : Reader(Reader) { Increment(); }
+  InstrProfIterator(reader_type *Reader) : Reader(Reader) { increment(); }

-  InstrProfIterator &operator++() { Increment(); return *this; }
+  InstrProfIterator &operator++() {
+    increment();
+    return *this;
+  }
   bool operator==(const InstrProfIterator &RHS) const {
     return Reader == RHS.Reader;
   }
@@ -88,8 +100,8 @@ public:
   virtual Error printBinaryIds(raw_ostream &OS) { return Error::success(); };

   /// Iterator over profile data.
-  InstrProfIterator begin() { return InstrProfIterator(this); }
-  InstrProfIterator end() { return InstrProfIterator(); }
+  InstrProfIterator<> begin() { return InstrProfIterator<>(this); }
+  InstrProfIterator<> end() { return InstrProfIterator<>(); }

   virtual bool isIRLevelProfile() const = 0;
@@ -201,15 +213,16 @@ public:
   static bool hasFormat(const MemoryBuffer &Buffer);

   bool isIRLevelProfile() const override {
-    return static_cast<bool>(ProfileKind & InstrProfKind::IR);
+    return static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation);
   }

   bool hasCSIRLevelProfile() const override {
-    return static_cast<bool>(ProfileKind & InstrProfKind::CS);
+    return static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive);
   }

   bool instrEntryBBEnabled() const override {
-    return static_cast<bool>(ProfileKind & InstrProfKind::BB);
+    return static_cast<bool>(ProfileKind &
+                             InstrProfKind::FunctionEntryInstrumentation);
   }

   bool hasSingleByteCoverage() const override {
@@ -460,6 +473,11 @@ struct InstrProfReaderIndexBase {
 using OnDiskHashTableImplV3 =
     OnDiskIterableChainedHashTable<InstrProfLookupTrait>;

+using MemProfRecordHashTable =
+    OnDiskIterableChainedHashTable<memprof::RecordLookupTrait>;
+using MemProfFrameHashTable =
+    OnDiskIterableChainedHashTable<memprof::FrameLookupTrait>;
+
 template <typename HashTableImpl>
 class InstrProfReaderItaniumRemapper;
@@ -545,6 +563,13 @@ private:
   std::unique_ptr<ProfileSummary> Summary;
   /// Context sensitive profile summary data.
   std::unique_ptr<ProfileSummary> CS_Summary;
+  /// MemProf profile schema (if available).
+  memprof::MemProfSchema Schema;
+  /// MemProf record profile data on-disk indexed via llvm::md5(FunctionName).
+  std::unique_ptr<MemProfRecordHashTable> MemProfRecordTable;
+  /// MemProf frame profile data on-disk indexed via frame id.
+  std::unique_ptr<MemProfFrameHashTable> MemProfFrameTable;
+
   // Index to the current record in the record array.
   unsigned RecordIndex;
@@ -598,6 +623,10 @@ public:
   Expected<InstrProfRecord> getInstrProfRecord(StringRef FuncName,
                                                uint64_t FuncHash);

+  /// Return the memprof record for the function identified by
+  /// llvm::md5(Name).
+  Expected<memprof::MemProfRecord> getMemProfRecord(uint64_t FuncNameHash);
+
   /// Fill Counts with the profile data for the given function name.
Error getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts); diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index af1e46cf4fc2..29e07961a2f4 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -15,11 +15,13 @@ #define LLVM_PROFILEDATA_INSTRPROFWRITER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" #include #include @@ -28,6 +30,7 @@ namespace llvm { /// Writer for instrumentation based profile data. class InstrProfRecordWriterTrait; class ProfOStream; +class MemoryBuffer; class raw_fd_ostream; class InstrProfWriter { @@ -37,6 +40,16 @@ public: private: bool Sparse; StringMap FunctionData; + + // A map to hold memprof data per function. The lower 64 bits obtained from + // the md5 hash of the function name is used to index into the map. + llvm::MapVector + MemProfRecordData; + // A map to hold frame id to frame mappings. The mappings are used to + // convert IndexedMemProfRecord to MemProfRecords with frame information + // inline. + llvm::MapVector MemProfFrameData; + // An enum describing the attributes of the profile. InstrProfKind ProfileKind = InstrProfKind::Unknown; // Use raw pointer here for the incomplete type object. @@ -57,6 +70,15 @@ public: addRecord(std::move(I), 1, Warn); } + /// Add a memprof record for a function identified by its \p Id. + void addMemProfRecord(const GlobalValue::GUID Id, + const memprof::IndexedMemProfRecord &Record); + + /// Add a memprof frame identified by the hash of the contents of the frame in + /// \p FrameId. + bool addMemProfFrame(const memprof::FrameId, const memprof::Frame &F, + function_ref Warn); + /// Merge existing function counts from the given writer. void mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn); @@ -97,11 +119,13 @@ public: // Check if the profiles are in-compatible. Clang frontend profiles can't be // merged with other profile types. - if (static_cast((ProfileKind & InstrProfKind::FE) ^ - (Other & InstrProfKind::FE))) { + if (static_cast( + (ProfileKind & InstrProfKind::FrontendInstrumentation) ^ + (Other & InstrProfKind::FrontendInstrumentation))) { return make_error(instrprof_error::unsupported_version); } - if (testIncompatible(InstrProfKind::FunctionEntryOnly, InstrProfKind::BB)) { + if (testIncompatible(InstrProfKind::FunctionEntryOnly, + InstrProfKind::FunctionEntryInstrumentation)) { return make_error( instrprof_error::unsupported_version, "cannot merge FunctionEntryOnly profiles and BB profiles together"); @@ -112,6 +136,8 @@ public: return Error::success(); } + InstrProfKind getProfileKind() const { return ProfileKind; } + // Internal interface for testing purpose only. void setValueProfDataEndianness(support::endianness Endianness); void setOutputSparse(bool Sparse); diff --git a/llvm/include/llvm/ProfileData/MIBEntryDef.inc b/llvm/include/llvm/ProfileData/MIBEntryDef.inc new file mode 100644 index 000000000000..f5c6f0e4924b --- /dev/null +++ b/llvm/include/llvm/ProfileData/MIBEntryDef.inc @@ -0,0 +1,47 @@ +/*===-- MemEntryDef.inc - MemProf profiling runtime macros -*- C++ -*-======== *\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ +/* + * This file defines the macros for memprof profiling data structures. + * Eg. usage to define the memprof meminfoblock struct: + * + * struct MemInfoBlock { + * #define MIBEntryDef(NameTag, Name, Type) Type Name; + * #include MIBEntryDef.inc + * #undef MIBEntryDef + * }; + * + * This file has two identical copies. The primary copy lives in LLVM and + * the other one sits in compiler-rt/include/profile directory. To make changes + * in this file, first modify the primary copy and copy it over to compiler-rt. + * Testing of any change in this file can start only after the two copies are + * synced up. + * +\*===----------------------------------------------------------------------===*/ +#ifndef MIBEntryDef +#define MIBEntryDef(NameTag, Name, Type) +#endif + +MIBEntryDef(AllocCount = 1, AllocCount, uint32_t) +MIBEntryDef(TotalAccessCount = 2, TotalAccessCount, uint64_t) +MIBEntryDef(MinAccessCount = 3, MinAccessCount, uint64_t) +MIBEntryDef(MaxAccessCount = 4, MaxAccessCount, uint64_t) +MIBEntryDef(TotalSize = 5, TotalSize, uint64_t) +MIBEntryDef(MinSize = 6, MinSize, uint32_t) +MIBEntryDef(MaxSize = 7, MaxSize, uint32_t) +MIBEntryDef(AllocTimestamp = 8, AllocTimestamp, uint32_t) +MIBEntryDef(DeallocTimestamp = 9, DeallocTimestamp, uint32_t) +MIBEntryDef(TotalLifetime = 10, TotalLifetime, uint64_t) +MIBEntryDef(MinLifetime = 11, MinLifetime, uint32_t) +MIBEntryDef(MaxLifetime = 12, MaxLifetime, uint32_t) +MIBEntryDef(AllocCpuId = 13, AllocCpuId, uint32_t) +MIBEntryDef(DeallocCpuId = 14, DeallocCpuId, uint32_t) +MIBEntryDef(NumMigratedCpu = 15, NumMigratedCpu, uint32_t) +MIBEntryDef(NumLifetimeOverlaps = 16, NumLifetimeOverlaps, uint32_t) +MIBEntryDef(NumSameAllocCpu = 17, NumSameAllocCpu, uint32_t) +MIBEntryDef(NumSameDeallocCpu = 18, NumSameDeallocCpu, uint32_t) +MIBEntryDef(DataTypeId = 19, DataTypeId, uint64_t) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h new file mode 100644 index 000000000000..bcee3b25bf87 --- /dev/null +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -0,0 +1,613 @@ +#ifndef LLVM_PROFILEDATA_MEMPROF_H_ +#define LLVM_PROFILEDATA_MEMPROF_H_ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/ProfileData/MemProfData.inc" +#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" + +#include + +namespace llvm { +namespace memprof { + +enum class Meta : uint64_t { + Start = 0, +#define MIBEntryDef(NameTag, Name, Type) NameTag, +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + Size +}; + +using MemProfSchema = llvm::SmallVector(Meta::Size)>; + +// Holds the actual MemInfoBlock data with all fields. Contents may be read or +// written partially by providing an appropriate schema to the serialize and +// deserialize methods. 
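(Aside: MIBEntryDef.inc is a classic X-macro; every include site defines MIBEntryDef to stamp out one line per field, which keeps the struct, the Meta enum, the getters, and the (de)serializers in sync from a single list. A sketch of the struct-field expansion for the first two entries; the real expansion covers all 19.)

#include <cstdint>

// Equivalent to:
//   #define MIBEntryDef(NameTag, Name, Type) Type Name;
//   #include "MIBEntryDef.inc"
//   #undef MIBEntryDef
struct MemInfoBlockSketch {
  uint32_t AllocCount;       // from MIBEntryDef(AllocCount = 1, AllocCount, uint32_t)
  uint64_t TotalAccessCount; // from MIBEntryDef(TotalAccessCount = 2, TotalAccessCount, uint64_t)
  // ...remaining entries expand the same way.
};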
+struct PortableMemInfoBlock { + PortableMemInfoBlock() = default; + explicit PortableMemInfoBlock(const MemInfoBlock &Block) { +#define MIBEntryDef(NameTag, Name, Type) Name = Block.Name; +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + } + + PortableMemInfoBlock(const MemProfSchema &Schema, const unsigned char *Ptr) { + deserialize(Schema, Ptr); + } + + // Read the contents of \p Ptr based on the \p Schema to populate the + // MemInfoBlock member. + void deserialize(const MemProfSchema &Schema, const unsigned char *Ptr) { + using namespace support; + + for (const Meta Id : Schema) { + switch (Id) { +#define MIBEntryDef(NameTag, Name, Type) \ + case Meta::Name: { \ + Name = endian::readNext(Ptr); \ + } break; +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + default: + llvm_unreachable("Unknown meta type id, is the profile collected from " + "a newer version of the runtime?"); + } + } + } + + // Write the contents of the MemInfoBlock based on the \p Schema provided to + // the raw_ostream \p OS. + void serialize(const MemProfSchema &Schema, raw_ostream &OS) const { + using namespace support; + + endian::Writer LE(OS, little); + for (const Meta Id : Schema) { + switch (Id) { +#define MIBEntryDef(NameTag, Name, Type) \ + case Meta::Name: { \ + LE.write(Name); \ + } break; +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + default: + llvm_unreachable("Unknown meta type id, invalid input?"); + } + } + } + + // Print out the contents of the MemInfoBlock in YAML format. + void printYAML(raw_ostream &OS) const { + OS << " MemInfoBlock:\n"; +#define MIBEntryDef(NameTag, Name, Type) \ + OS << " " << #Name << ": " << Name << "\n"; +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + } + + // Define getters for each type which can be called by analyses. +#define MIBEntryDef(NameTag, Name, Type) \ + Type get##Name() const { return Name; } +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + + void clear() { *this = PortableMemInfoBlock(); } + + // Returns the full schema currently in use. + static MemProfSchema getSchema() { + MemProfSchema List; +#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name); +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + return List; + } + + bool operator==(const PortableMemInfoBlock &Other) const { +#define MIBEntryDef(NameTag, Name, Type) \ + if (Other.get##Name() != get##Name()) \ + return false; +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + return true; + } + + bool operator!=(const PortableMemInfoBlock &Other) const { + return !operator==(Other); + } + + static constexpr size_t serializedSize() { + size_t Result = 0; +#define MIBEntryDef(NameTag, Name, Type) Result += sizeof(Type); +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef + return Result; + } + +private: +#define MIBEntryDef(NameTag, Name, Type) Type Name = Type(); +#include "llvm/ProfileData/MIBEntryDef.inc" +#undef MIBEntryDef +}; + +// A type representing the id generated by hashing the contents of the Frame. +using FrameId = uint64_t; +// Describes a call frame for a dynamic allocation context. The contents of +// the frame are populated by symbolizing the stack depot call frame from the +// compiler runtime. +struct Frame { + // A uuid (uint64_t) identifying the function. It is obtained by + // llvm::md5(FunctionName) which returns the lower 64 bits. + GlobalValue::GUID Function; + // The symbol name for the function. 
Only populated in the Frame by the reader + // if requested during initialization. This field should not be serialized. + llvm::Optional SymbolName; + // The source line offset of the call from the beginning of parent function. + uint32_t LineOffset; + // The source column number of the call to help distinguish multiple calls + // on the same line. + uint32_t Column; + // Whether the current frame is inlined. + bool IsInlineFrame; + + Frame(const Frame &Other) { + Function = Other.Function; + SymbolName = Other.SymbolName; + LineOffset = Other.LineOffset; + Column = Other.Column; + IsInlineFrame = Other.IsInlineFrame; + } + + Frame(uint64_t Hash, uint32_t Off, uint32_t Col, bool Inline) + : Function(Hash), LineOffset(Off), Column(Col), IsInlineFrame(Inline) {} + + bool operator==(const Frame &Other) const { + // Ignore the SymbolName field to avoid a string compare. Comparing the + // function hash serves the same purpose. + return Other.Function == Function && Other.LineOffset == LineOffset && + Other.Column == Column && Other.IsInlineFrame == IsInlineFrame; + } + + Frame &operator=(const Frame &Other) { + Function = Other.Function; + SymbolName = Other.SymbolName; + LineOffset = Other.LineOffset; + Column = Other.Column; + IsInlineFrame = Other.IsInlineFrame; + return *this; + } + + bool operator!=(const Frame &Other) const { return !operator==(Other); } + + // Write the contents of the frame to the ostream \p OS. + void serialize(raw_ostream &OS) const { + using namespace support; + + endian::Writer LE(OS, little); + + // If the type of the GlobalValue::GUID changes, then we need to update + // the reader and the writer. + static_assert(std::is_same::value, + "Expect GUID to be uint64_t."); + LE.write(Function); + + LE.write(LineOffset); + LE.write(Column); + LE.write(IsInlineFrame); + } + + // Read a frame from char data which has been serialized as little endian. + static Frame deserialize(const unsigned char *Ptr) { + using namespace support; + + const uint64_t F = endian::readNext(Ptr); + const uint32_t L = endian::readNext(Ptr); + const uint32_t C = endian::readNext(Ptr); + const bool I = endian::readNext(Ptr); + return Frame(/*Function=*/F, /*LineOffset=*/L, /*Column=*/C, + /*IsInlineFrame=*/I); + } + + // Returns the size of the frame information. + static constexpr size_t serializedSize() { + return sizeof(Frame::Function) + sizeof(Frame::LineOffset) + + sizeof(Frame::Column) + sizeof(Frame::IsInlineFrame); + } + + // Print the frame information in YAML format. + void printYAML(raw_ostream &OS) const { + OS << " -\n" + << " Function: " << Function << "\n" + << " SymbolName: " << SymbolName.value_or("") << "\n" + << " LineOffset: " << LineOffset << "\n" + << " Column: " << Column << "\n" + << " Inline: " << IsInlineFrame << "\n"; + } + + // Return a hash value based on the contents of the frame. Here we don't use + // hashing from llvm ADT since we are going to persist the hash id, the hash + // combine algorithm in ADT uses a new randomized seed each time. + inline FrameId hash() const { + auto HashCombine = [](auto Value, size_t Seed) { + std::hash Hasher; + // The constant used below is the 64 bit representation of the fractional + // part of the golden ratio. Used here for the randomness in their bit + // pattern. 
+ return Hasher(Value) + 0x9e3779b97f4a7c15 + (Seed << 6) + (Seed >> 2); + }; + + size_t Result = 0; + Result ^= HashCombine(Function, Result); + Result ^= HashCombine(LineOffset, Result); + Result ^= HashCombine(Column, Result); + Result ^= HashCombine(IsInlineFrame, Result); + return static_cast(Result); + } +}; + +// Holds allocation information in a space efficient format where frames are +// represented using unique identifiers. +struct IndexedAllocationInfo { + // The dynamic calling context for the allocation in bottom-up (leaf-to-root) + // order. Frame contents are stored out-of-line. + llvm::SmallVector CallStack; + // The statistics obtained from the runtime for the allocation. + PortableMemInfoBlock Info; + + IndexedAllocationInfo() = default; + IndexedAllocationInfo(ArrayRef CS, const MemInfoBlock &MB) + : CallStack(CS.begin(), CS.end()), Info(MB) {} + + // Returns the size in bytes when this allocation info struct is serialized. + size_t serializedSize() const { + return sizeof(uint64_t) + // The number of frames to serialize. + sizeof(FrameId) * CallStack.size() + // The callstack frame ids. + PortableMemInfoBlock::serializedSize(); // The size of the payload. + } + + bool operator==(const IndexedAllocationInfo &Other) const { + if (Other.Info != Info) + return false; + + if (Other.CallStack.size() != CallStack.size()) + return false; + + for (size_t J = 0; J < Other.CallStack.size(); J++) { + if (Other.CallStack[J] != CallStack[J]) + return false; + } + return true; + } + + bool operator!=(const IndexedAllocationInfo &Other) const { + return !operator==(Other); + } +}; + +// Holds allocation information with frame contents inline. The type should +// be used for temporary in-memory instances. +struct AllocationInfo { + // Same as IndexedAllocationInfo::CallStack with the frame contents inline. + llvm::SmallVector CallStack; + // Same as IndexedAllocationInfo::Info; + PortableMemInfoBlock Info; + + AllocationInfo() = default; + AllocationInfo( + const IndexedAllocationInfo &IndexedAI, + llvm::function_ref IdToFrameCallback) { + for (const FrameId &Id : IndexedAI.CallStack) { + CallStack.push_back(IdToFrameCallback(Id)); + } + Info = IndexedAI.Info; + } + + void printYAML(raw_ostream &OS) const { + OS << " -\n"; + OS << " Callstack:\n"; + // TODO: Print out the frame on one line with to make it easier for deep + // callstacks once we have a test to check valid YAML is generated. + for (const Frame &F : CallStack) { + F.printYAML(OS); + } + Info.printYAML(OS); + } +}; + +// Holds the memprof profile information for a function. The internal +// representation stores frame ids for efficiency. This representation should +// be used in the profile conversion and manipulation tools. +struct IndexedMemProfRecord { + // Memory allocation sites in this function for which we have memory + // profiling data. + llvm::SmallVector AllocSites; + // Holds call sites in this function which are part of some memory + // allocation context. We store this as a list of locations, each with its + // list of inline locations in bottom-up order i.e. from leaf to root. The + // inline location list may include additional entries, users should pick + // the last entry in the list with the same function GUID. + llvm::SmallVector> CallSites; + + void clear() { + AllocSites.clear(); + CallSites.clear(); + } + + void merge(const IndexedMemProfRecord &Other) { + // TODO: Filter out duplicates which may occur if multiple memprof + // profiles are merged together using llvm-profdata. 
+ AllocSites.append(Other.AllocSites); + CallSites.append(Other.CallSites); + } + + size_t serializedSize() const { + size_t Result = sizeof(GlobalValue::GUID); + for (const IndexedAllocationInfo &N : AllocSites) + Result += N.serializedSize(); + + // The number of callsites we have information for. + Result += sizeof(uint64_t); + for (const auto &Frames : CallSites) { + // The number of frame ids to serialize. + Result += sizeof(uint64_t); + Result += Frames.size() * sizeof(FrameId); + } + return Result; + } + + bool operator==(const IndexedMemProfRecord &Other) const { + if (Other.AllocSites.size() != AllocSites.size()) + return false; + + if (Other.CallSites.size() != CallSites.size()) + return false; + + for (size_t I = 0; I < AllocSites.size(); I++) { + if (AllocSites[I] != Other.AllocSites[I]) + return false; + } + + for (size_t I = 0; I < CallSites.size(); I++) { + if (CallSites[I] != Other.CallSites[I]) + return false; + } + return true; + } + + // Serializes the memprof records in \p Records to the ostream \p OS based + // on the schema provided in \p Schema. + void serialize(const MemProfSchema &Schema, raw_ostream &OS); + + // Deserializes memprof records from the Buffer. + static IndexedMemProfRecord deserialize(const MemProfSchema &Schema, + const unsigned char *Buffer); + + // Returns the GUID for the function name after canonicalization. For + // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are + // mapped to functions using this GUID. + static GlobalValue::GUID getGUID(const StringRef FunctionName); +}; + +// Holds the memprof profile information for a function. The internal +// representation stores frame contents inline. This representation should +// be used for small amount of temporary, in memory instances. +struct MemProfRecord { + // Same as IndexedMemProfRecord::AllocSites with frame contents inline. + llvm::SmallVector AllocSites; + // Same as IndexedMemProfRecord::CallSites with frame contents inline. + llvm::SmallVector> CallSites; + + MemProfRecord() = default; + MemProfRecord( + const IndexedMemProfRecord &Record, + llvm::function_ref IdToFrameCallback) { + for (const IndexedAllocationInfo &IndexedAI : Record.AllocSites) { + AllocSites.emplace_back(IndexedAI, IdToFrameCallback); + } + for (const ArrayRef Site : Record.CallSites) { + llvm::SmallVector Frames; + for (const FrameId Id : Site) { + Frames.push_back(IdToFrameCallback(Id)); + } + CallSites.push_back(Frames); + } + } + + // Prints out the contents of the memprof record in YAML. + void print(llvm::raw_ostream &OS) const { + if (!AllocSites.empty()) { + OS << " AllocSites:\n"; + for (const AllocationInfo &N : AllocSites) + N.printYAML(OS); + } + + if (!CallSites.empty()) { + OS << " CallSites:\n"; + for (const llvm::SmallVector &Frames : CallSites) { + for (const Frame &F : Frames) { + OS << " -\n"; + F.printYAML(OS); + } + } + } + } +}; + +// Reads a memprof schema from a buffer. All entries in the buffer are +// interpreted as uint64_t. The first entry in the buffer denotes the number of +// ids in the schema. Subsequent entries are integers which map to memprof::Meta +// enum class entries. After successfully reading the schema, the pointer is one +// byte past the schema contents. +Expected readMemProfSchema(const unsigned char *&Buffer); + +// Trait for reading IndexedMemProfRecord data from the on-disk hash table. 
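(Aside: the schema-driven serialize/deserialize pair above is what makes the indexed format self-describing; a reader walks only the fields its schema names, in order. A round-trip sketch using only declarations from this file.)

#include "llvm/ProfileData/MemProf.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <string>

void roundTrip(const llvm::memprof::MemInfoBlock &Block) {
  using namespace llvm::memprof;
  const MemProfSchema Schema = PortableMemInfoBlock::getSchema();

  // Serialize exactly the fields named by the (here, full) schema.
  std::string Buffer;
  llvm::raw_string_ostream OS(Buffer);
  PortableMemInfoBlock(Block).serialize(Schema, OS);
  OS.flush();

  // Deserialize with the same schema the writer used.
  PortableMemInfoBlock Copy(
      Schema, reinterpret_cast<const unsigned char *>(Buffer.data()));
  assert(Copy == PortableMemInfoBlock(Block) && "round trip should be lossless");
}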
+class RecordLookupTrait { +public: + using data_type = const IndexedMemProfRecord &; + using internal_key_type = uint64_t; + using external_key_type = uint64_t; + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + RecordLookupTrait() = delete; + RecordLookupTrait(const MemProfSchema &S) : Schema(S) {} + + static bool EqualKey(uint64_t A, uint64_t B) { return A == B; } + static uint64_t GetInternalKey(uint64_t K) { return K; } + static uint64_t GetExternalKey(uint64_t K) { return K; } + + hash_value_type ComputeHash(uint64_t K) { return K; } + + static std::pair + ReadKeyDataLength(const unsigned char *&D) { + using namespace support; + + offset_type KeyLen = endian::readNext(D); + offset_type DataLen = endian::readNext(D); + return std::make_pair(KeyLen, DataLen); + } + + uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { + using namespace support; + return endian::readNext(D); + } + + data_type ReadData(uint64_t K, const unsigned char *D, + offset_type /*Unused*/) { + Record = IndexedMemProfRecord::deserialize(Schema, D); + return Record; + } + +private: + // Holds the memprof schema used to deserialize records. + MemProfSchema Schema; + // Holds the records from one function deserialized from the indexed format. + IndexedMemProfRecord Record; +}; + +// Trait for writing IndexedMemProfRecord data to the on-disk hash table. +class RecordWriterTrait { +public: + using key_type = uint64_t; + using key_type_ref = uint64_t; + + using data_type = IndexedMemProfRecord; + using data_type_ref = IndexedMemProfRecord &; + + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + // Pointer to the memprof schema to use for the generator. Unlike the reader + // we must use a default constructor with no params for the writer trait so we + // have a public member which must be initialized by the user. + MemProfSchema *Schema = nullptr; + + RecordWriterTrait() = default; + + static hash_value_type ComputeHash(key_type_ref K) { return K; } + + static std::pair + EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { + using namespace support; + + endian::Writer LE(Out, little); + offset_type N = sizeof(K); + LE.write(N); + offset_type M = V.serializedSize(); + LE.write(M); + return std::make_pair(N, M); + } + + void EmitKey(raw_ostream &Out, key_type_ref K, offset_type /*Unused*/) { + using namespace support; + endian::Writer LE(Out, little); + LE.write(K); + } + + void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, + offset_type /*Unused*/) { + assert(Schema != nullptr && "MemProf schema is not initialized!"); + V.serialize(*Schema, Out); + } +}; + +// Trait for writing frame mappings to the on-disk hash table. 
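(Aside: these traits plug into the generic on-disk hash table machinery in llvm/Support/OnDiskHashTable.h. A sketch of emitting records keyed by function GUID through the RecordWriterTrait above; this mirrors how an indexed writer would use it, but is illustrative only.)

#include "llvm/ADT/MapVector.h"
#include "llvm/ProfileData/MemProf.h"
#include "llvm/Support/OnDiskHashTable.h"
#include "llvm/Support/raw_ostream.h"

uint64_t writeRecords(
    llvm::raw_ostream &OS,
    llvm::MapVector<llvm::GlobalValue::GUID,
                    llvm::memprof::IndexedMemProfRecord> &Records,
    llvm::memprof::MemProfSchema &Schema) {
  llvm::OnDiskChainedHashTableGenerator<llvm::memprof::RecordWriterTrait>
      Generator;
  llvm::memprof::RecordWriterTrait Trait;
  Trait.Schema = &Schema; // the writer trait needs the schema injected
  for (auto &[GUID, Record] : Records)
    Generator.insert(GUID, Record, Trait);
  // Emit returns the table offset; a reader seeks here to find the header.
  return Generator.Emit(OS, Trait);
}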
+class FrameWriterTrait { +public: + using key_type = FrameId; + using key_type_ref = FrameId; + + using data_type = Frame; + using data_type_ref = Frame &; + + using hash_value_type = FrameId; + using offset_type = uint64_t; + + static hash_value_type ComputeHash(key_type_ref K) { return K; } + + static std::pair + EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { + using namespace support; + endian::Writer LE(Out, little); + offset_type N = sizeof(K); + LE.write(N); + offset_type M = V.serializedSize(); + LE.write(M); + return std::make_pair(N, M); + } + + void EmitKey(raw_ostream &Out, key_type_ref K, offset_type /*Unused*/) { + using namespace support; + endian::Writer LE(Out, little); + LE.write(K); + } + + void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, + offset_type /*Unused*/) { + V.serialize(Out); + } +}; + +// Trait for reading frame mappings from the on-disk hash table. +class FrameLookupTrait { +public: + using data_type = const Frame; + using internal_key_type = FrameId; + using external_key_type = FrameId; + using hash_value_type = FrameId; + using offset_type = uint64_t; + + static bool EqualKey(internal_key_type A, internal_key_type B) { + return A == B; + } + static uint64_t GetInternalKey(internal_key_type K) { return K; } + static uint64_t GetExternalKey(external_key_type K) { return K; } + + hash_value_type ComputeHash(internal_key_type K) { return K; } + + static std::pair + ReadKeyDataLength(const unsigned char *&D) { + using namespace support; + + offset_type KeyLen = endian::readNext(D); + offset_type DataLen = endian::readNext(D); + return std::make_pair(KeyLen, DataLen); + } + + uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { + using namespace support; + return endian::readNext(D); + } + + data_type ReadData(uint64_t K, const unsigned char *D, + offset_type /*Unused*/) { + return Frame::deserialize(D); + } +}; +} // namespace memprof +} // namespace llvm + +#endif // LLVM_PROFILEDATA_MEMPROF_H_ diff --git a/llvm/include/llvm/ProfileData/MemProfData.inc b/llvm/include/llvm/ProfileData/MemProfData.inc index ff22a697965c..6433cef84865 100644 --- a/llvm/include/llvm/ProfileData/MemProfData.inc +++ b/llvm/include/llvm/ProfileData/MemProfData.inc @@ -1,5 +1,5 @@ -#ifndef LLVM_PROFILEDATA_MEMPROFDATA_INC -#define LLVM_PROFILEDATA_MEMPROFDATA_INC +#ifndef MEMPROF_DATA_INC +#define MEMPROF_DATA_INC /*===-- MemProfData.inc - MemProf profiling runtime structures -*- C++ -*-=== *\ |* |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -80,71 +80,90 @@ PACKED(struct SegmentEntry { } }); +// Packed struct definition for MSVC. We can't use the PACKED macro defined in +// MemProfData.inc since it would mean we are embedding a directive (the +// #include for MIBEntryDef) into the macros which is undefined behaviour. +#ifdef _MSC_VER +__pragma(pack(push,1)) +#endif + // A struct representing the heap allocation characteristics of a particular // runtime context. This struct is shared between the compiler-rt runtime and // the raw profile reader. The indexed format uses a separate, self-describing // backwards compatible format. 
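(Aside: the PACKED wrapper MemProfData.inc applies to these structs expands roughly as below; the macro body is reconstructed from its visible MSVC/GCC usage, so treat it as a sketch. Packing matters because the raw profile bytes are shared between the compiler-rt runtime and this reader, so layouts must match byte for byte.)

#include <cstdint>

#ifdef _MSC_VER
#define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop))
#else
#define PACKED(...) __VA_ARGS__ __attribute__((__packed__))
#endif

PACKED(struct Example {
  uint32_t A; // without packing, 4 padding bytes would follow this field
  uint64_t B;
});

static_assert(sizeof(Example) == 12, "4 + 8 bytes, no padding");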
-PACKED(struct MemInfoBlock { - uint32_t alloc_count; - uint64_t total_access_count, min_access_count, max_access_count; - uint64_t total_size; - uint32_t min_size, max_size; - uint32_t alloc_timestamp, dealloc_timestamp; - uint64_t total_lifetime; - uint32_t min_lifetime, max_lifetime; - uint32_t alloc_cpu_id, dealloc_cpu_id; - uint32_t num_migrated_cpu; - - // Only compared to prior deallocated object currently. - uint32_t num_lifetime_overlaps; - uint32_t num_same_alloc_cpu; - uint32_t num_same_dealloc_cpu; - - uint64_t data_type_id; // TODO: hash of type name - - MemInfoBlock() : alloc_count(0) {} - - MemInfoBlock(uint32_t size, uint64_t access_count, uint32_t alloc_timestamp, - uint32_t dealloc_timestamp, uint32_t alloc_cpu, uint32_t dealloc_cpu) - : alloc_count(1), total_access_count(access_count), - min_access_count(access_count), max_access_count(access_count), - total_size(size), min_size(size), max_size(size), - alloc_timestamp(alloc_timestamp), dealloc_timestamp(dealloc_timestamp), - total_lifetime(dealloc_timestamp - alloc_timestamp), - min_lifetime(total_lifetime), max_lifetime(total_lifetime), - alloc_cpu_id(alloc_cpu), dealloc_cpu_id(dealloc_cpu), - num_lifetime_overlaps(0), num_same_alloc_cpu(0), - num_same_dealloc_cpu(0) { - num_migrated_cpu = alloc_cpu_id != dealloc_cpu_id; - } - - void Merge(const MemInfoBlock &newMIB) { - alloc_count += newMIB.alloc_count; - - total_access_count += newMIB.total_access_count; - min_access_count = newMIB.min_access_count < min_access_count ? newMIB.min_access_count : min_access_count; - max_access_count = newMIB.max_access_count < max_access_count ? newMIB.max_access_count : max_access_count; - - total_size += newMIB.total_size; - min_size = newMIB.min_size < min_size ? newMIB.min_size : min_size; - max_size = newMIB.max_size < max_size ? newMIB.max_size : max_size; +struct MemInfoBlock{ + +#define MIBEntryDef(NameTag, Name, Type) Type Name; +#include "MIBEntryDef.inc" +#undef MIBEntryDef + +bool operator==(const MemInfoBlock& Other) const { + bool IsEqual = true; +#define MIBEntryDef(NameTag, Name, Type) \ + IsEqual = (IsEqual && Name == Other.Name); +#include "MIBEntryDef.inc" +#undef MIBEntryDef + return IsEqual; +} + +MemInfoBlock() { +#define MIBEntryDef(NameTag, Name, Type) Name = Type(); +#include "MIBEntryDef.inc" +#undef MIBEntryDef +} + +MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs, + uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu) + : MemInfoBlock() { + AllocCount = 1U; + TotalAccessCount = AccessCount; + MinAccessCount = AccessCount; + MaxAccessCount = AccessCount; + TotalSize = Size; + MinSize = Size; + MaxSize = Size; + AllocTimestamp = AllocTs; + DeallocTimestamp = DeallocTs; + TotalLifetime = DeallocTimestamp - AllocTimestamp; + MinLifetime = TotalLifetime; + MaxLifetime = TotalLifetime; + AllocCpuId = AllocCpu; + DeallocCpuId = DeallocCpu; + NumMigratedCpu = AllocCpuId != DeallocCpuId; +} + +void Merge(const MemInfoBlock &newMIB) { + AllocCount += newMIB.AllocCount; + + TotalAccessCount += newMIB.TotalAccessCount; + MinAccessCount = newMIB.MinAccessCount < MinAccessCount ? newMIB.MinAccessCount : MinAccessCount; + MaxAccessCount = newMIB.MaxAccessCount < MaxAccessCount ? newMIB.MaxAccessCount : MaxAccessCount; + + TotalSize += newMIB.TotalSize; + MinSize = newMIB.MinSize < MinSize ? newMIB.MinSize : MinSize; + MaxSize = newMIB.MaxSize < MaxSize ? newMIB.MaxSize : MaxSize; + + TotalLifetime += newMIB.TotalLifetime; + MinLifetime = newMIB.MinLifetime < MinLifetime ? 
newMIB.MinLifetime : MinLifetime;
+  MaxLifetime = newMIB.MaxLifetime > MaxLifetime ? newMIB.MaxLifetime : MaxLifetime;
+
+  // We know newMIB was deallocated later, so just need to check if it was
+  // allocated before last one deallocated.
+  NumLifetimeOverlaps += newMIB.AllocTimestamp < DeallocTimestamp;
+  AllocTimestamp = newMIB.AllocTimestamp;
+  DeallocTimestamp = newMIB.DeallocTimestamp;
+
+  NumSameAllocCpu += AllocCpuId == newMIB.AllocCpuId;
+  NumSameDeallocCpu += DeallocCpuId == newMIB.DeallocCpuId;
+  AllocCpuId = newMIB.AllocCpuId;
+  DeallocCpuId = newMIB.DeallocCpuId;
+}
-    total_lifetime += newMIB.total_lifetime;
-    min_lifetime = newMIB.min_lifetime < min_lifetime ? newMIB.min_lifetime : min_lifetime;
-    max_lifetime = newMIB.max_lifetime > max_lifetime ? newMIB.max_lifetime : max_lifetime;
-
-    // We know newMIB was deallocated later, so just need to check if it was
-    // allocated before last one deallocated.
-    num_lifetime_overlaps += newMIB.alloc_timestamp < dealloc_timestamp;
-    alloc_timestamp = newMIB.alloc_timestamp;
-    dealloc_timestamp = newMIB.dealloc_timestamp;
-
-    num_same_alloc_cpu += alloc_cpu_id == newMIB.alloc_cpu_id;
-    num_same_dealloc_cpu += dealloc_cpu_id == newMIB.dealloc_cpu_id;
-    alloc_cpu_id = newMIB.alloc_cpu_id;
-    dealloc_cpu_id = newMIB.dealloc_cpu_id;
-  }
-});
+#ifdef _MSC_VER
+} __pragma(pack(pop));
+#else
+} __attribute__((__packed__));
+#endif
 } // namespace memprof
 } // namespace llvm
diff --git a/llvm/include/llvm/ProfileData/RawMemProfReader.h b/llvm/include/llvm/ProfileData/RawMemProfReader.h
index 45544927a86f..34f78063aa42 100644
--- a/llvm/include/llvm/ProfileData/RawMemProfReader.h
+++ b/llvm/include/llvm/ProfileData/RawMemProfReader.h
@@ -12,31 +12,142 @@
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/DebugInfo/Symbolize/Symbolize.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/MemProf.h"
+#include "llvm/ProfileData/MemProfData.inc"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include
+
 namespace llvm {
 namespace memprof {
+// Map from id (recorded from sanitizer stack depot) to virtual addresses for
+// each program counter address in the callstack.
+using CallStackMap = llvm::DenseMap<uint64_t, llvm::SmallVector<uint64_t>>;
+
 class RawMemProfReader {
 public:
-  RawMemProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)) {}
-  // Prints aggregate counts for each raw profile parsed from the DataBuffer.
-  void printSummaries(raw_ostream &OS) const;
+  RawMemProfReader(const RawMemProfReader &) = delete;
+  RawMemProfReader &operator=(const RawMemProfReader &) = delete;
+
+  // Prints the contents of the profile in YAML format.
+  void printYAML(raw_ostream &OS);
   // Return true if the \p DataBuffer starts with magic bytes indicating it is
   // a raw binary memprof profile.
   static bool hasFormat(const MemoryBuffer &DataBuffer);
+  // Return true if the file at \p Path starts with magic bytes indicating it is
+  // a raw binary memprof profile.
+  static bool hasFormat(const StringRef Path);
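// Illustrative sketch (not from the vendored commit): the two hasFormat
// overloads let callers cheaply dispatch on the profile kind before they
// construct a reader. Assuming `Path` names a profile on disk:
// \code
//   if (memprof::RawMemProfReader::hasFormat(Path)) {
//     // Treat as a raw binary memprof profile (see create() below).
//   } else {
//     // Fall back to another profile reader.
//   }
// \endcode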
   // Create a RawMemProfReader after sanity checking the contents of the file at
-  // \p Path.
-  static Expected<std::unique_ptr<RawMemProfReader>> create(const Twine &Path);
+  // \p Path. The binary from which the profile has been collected is specified
+  // via a path in \p ProfiledBinary.
+  static Expected<std::unique_ptr<RawMemProfReader>>
+  create(const Twine &Path, const StringRef ProfiledBinary,
+         bool KeepName = false);
+
+  using GuidMemProfRecordPair = std::pair<GlobalValue::GUID, MemProfRecord>;
+  using Iterator = InstrProfIterator<GuidMemProfRecordPair, RawMemProfReader>;
+  Iterator end() { return Iterator(); }
+  Iterator begin() {
+    Iter = FunctionProfileData.begin();
+    return Iterator(this);
+  }
+
+  Error readNextRecord(GuidMemProfRecordPair &GuidRecord);
+
+  // The RawMemProfReader only holds memory profile information.
+  InstrProfKind getProfileKind() const { return InstrProfKind::MemProf; }
+
+  // Constructor for unittests only.
+  RawMemProfReader(std::unique_ptr<llvm::symbolize::SymbolizableModule> Sym,
+                   llvm::SmallVectorImpl<SegmentEntry> &Seg,
+                   llvm::MapVector<uint64_t, MemInfoBlock> &Prof,
+                   CallStackMap &SM, bool KeepName = false)
+      : Symbolizer(std::move(Sym)), SegmentInfo(Seg.begin(), Seg.end()),
+        CallstackProfileData(Prof), StackMap(SM), KeepSymbolName(KeepName) {
+    // We don't call initialize here since there is no raw profile to read. The
+    // test should pass in the raw profile as structured data.
+
+    // If there is an error here then the mock symbolizer has not been
+    // initialized properly.
+    if (Error E = symbolizeAndFilterStackFrames())
+      report_fatal_error(std::move(E));
+    if (Error E = mapRawProfileToRecords())
+      report_fatal_error(std::move(E));
+  }
+
+  // Return a const reference to the internal Id to Frame mappings.
+  const llvm::DenseMap<FrameId, Frame> &getFrameMapping() const {
+    return IdToFrame;
+  }
+
+  // Return a const reference to the internal function profile data.
+  const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord> &
+  getProfileData() const {
+    return FunctionProfileData;
+  }
 private:
-  std::unique_ptr<MemoryBuffer> DataBuffer;
-};
+  RawMemProfReader(object::OwningBinary<object::Binary> &&Bin, bool KeepName)
+      : Binary(std::move(Bin)), KeepSymbolName(KeepName) {}
+  // Initializes the RawMemProfReader with the contents in `DataBuffer`.
+  Error initialize(std::unique_ptr<MemoryBuffer> DataBuffer);
+  // Read and parse the contents of the `DataBuffer` as a binary format profile.
+  Error readRawProfile(std::unique_ptr<MemoryBuffer> DataBuffer);
+  // Symbolize and cache all the virtual addresses we encounter in the
+  // callstacks from the raw profile. Also prune callstack frames which we can't
+  // symbolize or those that belong to the runtime. For profile entries where
+  // the entire callstack is pruned, we drop the entry from the profile.
+  Error symbolizeAndFilterStackFrames();
+  // Construct memprof records for each function and store it in the
+  // `FunctionProfileData` map. A function may have allocation profile data or
+  // callsite data or both.
+  Error mapRawProfileToRecords();
+
+  // A helper method to extract the frame from the IdToFrame map.
+  const Frame &idToFrame(const FrameId Id) const {
+    auto It = IdToFrame.find(Id);
+    assert(It != IdToFrame.end() && "Id not found in map.");
+    return It->getSecond();
+  }
+
+  object::SectionedAddress getModuleOffset(uint64_t VirtualAddress);
+
+  object::OwningBinary<object::Binary> Binary;
+  std::unique_ptr<llvm::symbolize::SymbolizableModule> Symbolizer;
+  // The contents of the raw profile.
+  llvm::SmallVector<SegmentEntry, 16> SegmentInfo;
+  // A map from callstack id (same as key in CallStackMap below) to the heap
+  // information recorded for that allocation context.
+  llvm::MapVector<uint64_t, MemInfoBlock> CallstackProfileData;
+  CallStackMap StackMap;
+
+  // Cached symbolization from PC to Frame.
+  llvm::DenseMap<uint64_t, llvm::SmallVector<FrameId>> SymbolizedFrame;
+  llvm::DenseMap<FrameId, Frame> IdToFrame;
+
+  llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord> FunctionProfileData;
+  llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>::iterator Iter;
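// Illustrative sketch (not from the vendored commit): with begin()/end()
// returning InstrProfIterator, a reader can be consumed with a range-for.
// Assuming `ProfilePath` and `BinaryPath` are valid paths:
// \code
//   auto ReaderOr = memprof::RawMemProfReader::create(ProfilePath, BinaryPath);
//   if (!ReaderOr)
//     report_fatal_error(ReaderOr.takeError());
//   for (const auto &KV : **ReaderOr) {
//     // KV.first is the function GUID, KV.second the MemProfRecord with its
//     // allocation site and callsite information.
//   }
// \endcode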
+
+  // Whether to keep the symbol name for each frame after hashing.
+  bool KeepSymbolName = false;
+  // A mapping of the hash to symbol name, only used if KeepSymbolName is true.
+  llvm::DenseMap<GlobalValue::GUID, std::string> GuidToSymbolName;
+};
 } // namespace memprof
 } // namespace llvm
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index bad2139fe8f0..f11392c05318 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -18,15 +18,12 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Module.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
 #include
 #include
 #include
@@ -40,6 +37,9 @@
 namespace llvm {
+class DILocation;
+class raw_ostream;
+
 const std::error_category &sampleprof_category();
 enum class sampleprof_error {
@@ -55,7 +55,6 @@ enum class sampleprof_error {
   not_implemented,
   counter_overflow,
   ostream_seek_unsupported,
-  compress_failed,
   uncompress_failed,
   zlib_unavailable,
   hash_mismatch
 }
@@ -201,9 +200,9 @@ enum class SecProfSummaryFlags : uint32_t {
   /// SecFlagFSDiscriminator means this profile uses flow-sensitive
   /// discriminators.
   SecFlagFSDiscriminator = (1 << 2),
-  /// SecFlagIsCSNested means this is context-sensitive nested profile for
-  /// CSSPGO
-  SecFlagIsCSNested = (1 << 4),
+  /// SecFlagIsPreInlined means this profile contains ShouldBeInlined
+  /// contexts thus this is CS preinliner computed.
+  SecFlagIsPreInlined = (1 << 4),
 };
 enum class SecFuncMetadataFlags : uint32_t {
@@ -343,6 +342,15 @@ public:
                : sampleprof_error::success;
   }
+  /// Decrease the number of samples for this record by \p S. Return the amount
+  /// of samples actually decreased.
+  uint64_t removeSamples(uint64_t S) {
+    if (S > NumSamples)
+      S = NumSamples;
+    NumSamples -= S;
+    return S;
+  }
+
   /// Add called function \p F with samples \p S.
   /// Optionally scale sample count \p S by \p Weight.
   ///
@@ -358,6 +366,18 @@ public:
                : sampleprof_error::success;
   }
+  /// Remove called function from the call target map. Return the target sample
+  /// count of the called function.
+  uint64_t removeCalledTarget(StringRef F) {
+    uint64_t Count = 0;
+    auto I = CallTargets.find(F);
+    if (I != CallTargets.end()) {
+      Count = I->second;
+      CallTargets.erase(I);
+    }
+    return Count;
+  }
+
   /// Return true if this sample record contains function calls.
   bool hasCalls() const { return !CallTargets.empty(); }
@@ -367,6 +387,13 @@ public:
     return SortCallTargets(CallTargets);
   }
+  uint64_t getCallTargetSum() const {
+    uint64_t Sum = 0;
+    for (const auto &I : CallTargets)
+      Sum += I.second;
+    return Sum;
+  }
+
   /// Sort call targets in descending order of call frequency.
   static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets) {
     SortedCallTargetSet SortedTargets;
@@ -413,6 +440,8 @@ enum ContextAttributeMask {
   ContextNone = 0x0,
   ContextWasInlined = 0x1,      // Leaf of context was inlined in previous build
   ContextShouldBeInlined = 0x2, // Leaf of context should be inlined
+  ContextDuplicatedIntoBase =
+      0x4, // Leaf of context is duplicated into the base profile
 };
 // Represents a context frame with function name and line location
@@ -524,16 +553,6 @@ public:
     }
   }
-  // Promote context by removing top frames with the length of
-  // `ContextFramesToRemove`.
Note that with array representation of context, - // the promotion is effectively a slice operation with first - // `ContextFramesToRemove` elements removed from left. - void promoteOnPath(uint32_t ContextFramesToRemove) { - assert(ContextFramesToRemove <= FullContext.size() && - "Cannot remove more than the whole context"); - FullContext = FullContext.drop_front(ContextFramesToRemove); - } - // Decode context string for a frame to get function name and location. // `ContextStr` is in the form of `FuncName:StartLine.Discriminator`. static void decodeContextString(StringRef ContextStr, StringRef &FName, @@ -703,6 +722,13 @@ public: : sampleprof_error::success; } + void removeTotalSamples(uint64_t Num) { + if (TotalSamples < Num) + TotalSamples = 0; + else + TotalSamples -= Num; + } + void setTotalSamples(uint64_t Num) { TotalSamples = Num; } sampleprof_error addHeadSamples(uint64_t Num, uint64_t Weight = 1) { @@ -727,6 +753,22 @@ public: FName, Num, Weight); } + // Remove a call target and decrease the body sample correspondingly. Return + // the number of body samples actually decreased. + uint64_t removeCalledTargetAndBodySample(uint32_t LineOffset, + uint32_t Discriminator, + StringRef FName) { + uint64_t Count = 0; + auto I = BodySamples.find(LineLocation(LineOffset, Discriminator)); + if (I != BodySamples.end()) { + Count = I->second.removeCalledTarget(FName); + Count = I->second.removeSamples(Count); + if (!I->second.getSamples()) + BodySamples.erase(I); + } + return Count; + } + sampleprof_error addBodySamplesForProbe(uint32_t Index, uint64_t Num, uint64_t Weight = 1) { SampleRecord S; @@ -734,6 +776,19 @@ public: return BodySamples[LineLocation(Index, 0)].merge(S, Weight); } + // Accumulate all call target samples to update the body samples. + void updateCallsiteSamples() { + for (auto &I : BodySamples) { + uint64_t TargetSamples = I.second.getCallTargetSum(); + // It's possible that the body sample count can be greater than the call + // target sum. E.g, if some call targets are external targets, they won't + // be considered valid call targets, but the body sample count which is + // from lbr ranges can actually include them. + if (TargetSamples > I.second.getSamples()) + I.second.addSamples(TargetSamples - I.second.getSamples()); + } + } + // Accumulate all body samples to set total samples. void updateTotalSamples() { setTotalSamples(0); @@ -829,7 +884,7 @@ public: /// Return the sample count of the first instruction of the function. /// The function can be either a standalone symbol or an inlined function. uint64_t getEntrySamples() const { - if (FunctionSamples::ProfileIsCSFlat && getHeadSamples()) { + if (FunctionSamples::ProfileIsCS && getHeadSamples()) { // For CS profile, if we already have more accurate head samples // counted by branch sample from caller, use them as entry samples. return getHeadSamples(); @@ -1046,16 +1101,14 @@ public: static bool ProfileIsProbeBased; - static bool ProfileIsCSFlat; + static bool ProfileIsCS; - static bool ProfileIsCSNested; + static bool ProfileIsPreInlined; SampleContext &getContext() const { return Context; } void setContext(const SampleContext &FContext) { Context = FContext; } - static SampleProfileFormat Format; - /// Whether the profile uses MD5 to represent string. 
  static bool UseMD5;
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index a2caca246d93..7da336b9f61b 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -227,10 +227,8 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/GCOV.h"
@@ -240,7 +238,6 @@
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SymbolRemappingReader.h"
-#include
 #include
 #include
 #include
@@ -473,11 +470,11 @@ public:
   /// Whether input profile is based on pseudo probes.
   bool profileIsProbeBased() const { return ProfileIsProbeBased; }
-  /// Whether input profile is fully context-sensitive and flat.
-  bool profileIsCSFlat() const { return ProfileIsCSFlat; }
+  /// Whether input profile is fully context-sensitive.
+  bool profileIsCS() const { return ProfileIsCS; }
-  /// Whether input profile is fully context-sensitive and nested.
-  bool profileIsCSNested() const { return ProfileIsCSNested; }
+  /// Whether input profile contains ShouldBeInlined contexts.
+  bool profileIsPreInlined() const { return ProfileIsPreInlined; }
   virtual std::unique_ptr<ProfileSymbolList> getProfileSymbolList() {
     return nullptr;
@@ -537,10 +534,10 @@ protected:
   bool ProfileIsProbeBased = false;
   /// Whether function profiles are context-sensitive flat profiles.
-  bool ProfileIsCSFlat = false;
+  bool ProfileIsCS = false;
-  /// Whether function profiles are context-sensitive nested profiles.
-  bool ProfileIsCSNested = false;
+  /// Whether function profile contains ShouldBeInlined contexts.
+  bool ProfileIsPreInlined = false;
   /// Number of context-sensitive profiles.
uint32_t CSProfileCount = 0; diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h index 42decd255203..aa7f1cbdd7e8 100644 --- a/llvm/include/llvm/ProfileData/SampleProfWriter.h +++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h @@ -13,19 +13,15 @@ #define LLVM_PROFILEDATA_SAMPLEPROFWRITER_H #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include #include -#include namespace llvm { namespace sampleprof { diff --git a/llvm/include/llvm/Remarks/RemarkSerializer.h b/llvm/include/llvm/Remarks/RemarkSerializer.h index 6217bd98d1a5..b971173ad2c6 100644 --- a/llvm/include/llvm/Remarks/RemarkSerializer.h +++ b/llvm/include/llvm/Remarks/RemarkSerializer.h @@ -13,7 +13,6 @@ #ifndef LLVM_REMARKS_REMARKSERIALIZER_H #define LLVM_REMARKS_REMARKSERIALIZER_H -#include "llvm/Remarks/Remark.h" #include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkStringTable.h" diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def index a953e9439db4..e2f949856d9f 100644 --- a/llvm/include/llvm/Support/AArch64TargetParser.def +++ b/llvm/include/llvm/Support/AArch64TargetParser.def @@ -168,10 +168,10 @@ AARCH64_CPU_NAME("cortex-a510", ARMV9A, FK_NEON_FP_ARMV8, false, AARCH64_CPU_NAME("cortex-a57", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC)) AARCH64_CPU_NAME("cortex-a65", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)) AARCH64_CPU_NAME("cortex-a65ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)) AARCH64_CPU_NAME("cortex-a72", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC)) @@ -190,10 +190,11 @@ AARCH64_CPU_NAME("cortex-a77", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AArch64::AEK_SSBS)) AARCH64_CPU_NAME("cortex-a78", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | - AArch64::AEK_SSBS)) + AArch64::AEK_SSBS | AArch64::AEK_PROFILE)) AARCH64_CPU_NAME("cortex-a78c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | - AArch64::AEK_SSBS)) + AArch64::AEK_SSBS | AArch64::AEK_PROFILE | AArch64::AEK_FLAGM | + AArch64::AEK_PAUTH | AArch64::AEK_FP16FML)) AARCH64_CPU_NAME("cortex-a710", ARMV9A, FK_NEON_FP_ARMV8, false, (AArch64::AEK_MTE | AArch64::AEK_PAUTH | AArch64::AEK_FLAGM | AArch64::AEK_SB | AArch64::AEK_I8MM | AArch64::AEK_FP16FML | @@ -203,35 +204,37 @@ AARCH64_CPU_NAME("cortex-r82", ARMV8R, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_LSE)) AARCH64_CPU_NAME("cortex-x1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | - AArch64::AEK_SSBS)) + AArch64::AEK_SSBS | AArch64::AEK_PROFILE)) AARCH64_CPU_NAME("cortex-x1c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | - AArch64::AEK_SSBS | AArch64::AEK_PAUTH)) + AArch64::AEK_SSBS | AArch64::AEK_PAUTH | AArch64::AEK_PROFILE)) AARCH64_CPU_NAME("cortex-x2", 
ARMV9A, FK_NEON_FP_ARMV8, false, (AArch64::AEK_MTE | AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_PAUTH | AArch64::AEK_SSBS | AArch64::AEK_SB | AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | AArch64::AEK_FP16FML)) AARCH64_CPU_NAME("neoverse-e1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)) AARCH64_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | - AArch64::AEK_PROFILE | AArch64::AEK_RAS | AArch64::AEK_RCPC | + AArch64::AEK_PROFILE | AArch64::AEK_RCPC | AArch64::AEK_SSBS)) AARCH64_CPU_NAME("neoverse-n2", ARMV8_5A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_BF16 | AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | - AArch64::AEK_I8MM | AArch64::AEK_MTE | AArch64::AEK_RAS | - AArch64::AEK_RCPC | AArch64::AEK_SB | AArch64::AEK_SSBS | + AArch64::AEK_I8MM | AArch64::AEK_MTE | + AArch64::AEK_SB | AArch64::AEK_SSBS | AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM)) AARCH64_CPU_NAME("neoverse-512tvb", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_RAS | AArch64::AEK_SVE | AArch64::AEK_SSBS | - AArch64::AEK_RCPC | AArch64::AEK_FP16 | AArch64::AEK_BF16 | - AArch64::AEK_DOTPROD )) + (AArch64::AEK_SVE | AArch64::AEK_SSBS | + AArch64::AEK_FP16 | AArch64::AEK_BF16 | + AArch64::AEK_DOTPROD | AArch64::AEK_PROFILE | + AArch64::AEK_RAND | AArch64::AEK_FP16FML | AArch64::AEK_I8MM)) AARCH64_CPU_NAME("neoverse-v1", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_RAS | AArch64::AEK_SVE | AArch64::AEK_SSBS | - AArch64::AEK_RCPC | AArch64::AEK_FP16 | AArch64::AEK_BF16 | - AArch64::AEK_DOTPROD )) + (AArch64::AEK_SVE | AArch64::AEK_SSBS | + AArch64::AEK_FP16 | AArch64::AEK_BF16 | + AArch64::AEK_DOTPROD | AArch64::AEK_PROFILE | + AArch64::AEK_RAND | AArch64::AEK_FP16FML | AArch64::AEK_I8MM)) AARCH64_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_NONE)) AARCH64_CPU_NAME("apple-a7", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, @@ -247,11 +250,11 @@ AARCH64_CPU_NAME("apple-a11", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AARCH64_CPU_NAME("apple-a12", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16)) AARCH64_CPU_NAME("apple-a13", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_FP16 | AArch64::AEK_FP16FML)) + (AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)) AARCH64_CPU_NAME("apple-a14", ARMV8_5A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_FP16 | AArch64::AEK_FP16FML)) + (AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)) AARCH64_CPU_NAME("apple-m1", ARMV8_5A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_FP16 | AArch64::AEK_FP16FML)) + (AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)) AARCH64_CPU_NAME("apple-s4", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16)) AARCH64_CPU_NAME("apple-s5", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false, @@ -271,17 +274,15 @@ AARCH64_CPU_NAME("kryo", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AARCH64_CPU_NAME("thunderx2t99", ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_NONE)) AARCH64_CPU_NAME("thunderx3t110", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_CRC | AEK_CRYPTO | AEK_FP | AEK_SIMD | - AEK_LSE | AEK_RAND | AArch64::AEK_PROFILE | - AArch64::AEK_RAS)) + (AArch64::AEK_NONE)) AARCH64_CPU_NAME("thunderx", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) + 
(AArch64::AEK_CRC)) AARCH64_CPU_NAME("thunderxt88", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) + (AArch64::AEK_CRC)) AARCH64_CPU_NAME("thunderxt81", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) + (AArch64::AEK_CRC)) AARCH64_CPU_NAME("thunderxt83", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, - (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) + (AArch64::AEK_CRC)) AARCH64_CPU_NAME("tsv110", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_FP16FML | @@ -290,6 +291,8 @@ AARCH64_CPU_NAME("a64fx", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_SVE)) AARCH64_CPU_NAME("carmel", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AArch64::AEK_FP16) +AARCH64_CPU_NAME("ampere1", ARMV8_6A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_FP16 | AArch64::AEK_SB | AArch64::AEK_SSBS)) // Invalid CPU AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID) #undef AARCH64_CPU_NAME diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index aec80291f01f..41d144cfd5c4 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -136,13 +136,17 @@ enum : int32_t { // Compute program resource register 3 for GFX10+. Must match hardware // definition. -#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH) enum : int32_t { - COMPUTE_PGM_RSRC3_GFX10(SHARED_VGPR_COUNT, 0, 4), // GFX10+ - COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 28), + COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4), // GFX10+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(INST_PREF_SIZE, 4, 6), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_START, 10, 1), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_END, 11, 1), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED0, 12, 19), + COMPUTE_PGM_RSRC3_GFX10_PLUS(IMAGE_OP, 31, 1), // GFX11+ }; -#undef COMPUTE_PGM_RSRC3_GFX10 +#undef COMPUTE_PGM_RSRC3_GFX10_PLUS // Kernel code properties. Must be kept backwards compatible. #define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \ diff --git a/llvm/include/llvm/Support/ARMBuildAttributes.h b/llvm/include/llvm/Support/ARMBuildAttributes.h index b4405e7d4908..35f8992ca932 100644 --- a/llvm/include/llvm/Support/ARMBuildAttributes.h +++ b/llvm/include/llvm/Support/ARMBuildAttributes.h @@ -90,25 +90,26 @@ enum AttrType : unsigned { // Legal Values for CPU_arch, (=6), uleb128 enum CPUArch { - Pre_v4 = 0, - v4 = 1, // e.g. SA110 - v4T = 2, // e.g. ARM7TDMI - v5T = 3, // e.g. ARM9TDMI - v5TE = 4, // e.g. ARM946E_S - v5TEJ = 5, // e.g. ARM926EJ_S - v6 = 6, // e.g. ARM1136J_S - v6KZ = 7, // e.g. ARM1176JZ_S - v6T2 = 8, // e.g. ARM1156T2_S - v6K = 9, // e.g. ARM1176JZ_S - v7 = 10, // e.g. Cortex A8, Cortex M3 - v6_M = 11, // e.g. Cortex M1 - v6S_M = 12, // v6_M with the System extensions - v7E_M = 13, // v7_M with DSP extensions - v8_A = 14, // v8_A AArch32 - v8_R = 15, // e.g. Cortex R52 - v8_M_Base= 16, // v8_M_Base AArch32 - v8_M_Main= 17, // v8_M_Main AArch32 - v8_1_M_Main=21, // v8_1_M_Main AArch32 + Pre_v4 = 0, + v4 = 1, // e.g. SA110 + v4T = 2, // e.g. ARM7TDMI + v5T = 3, // e.g. ARM9TDMI + v5TE = 4, // e.g. ARM946E_S + v5TEJ = 5, // e.g. ARM926EJ_S + v6 = 6, // e.g. 
ARM1136J_S
+  v6KZ        = 7,  // e.g. ARM1176JZ_S
+  v6T2        = 8,  // e.g. ARM1156T2_S
+  v6K         = 9,  // e.g. ARM1176JZ_S
+  v7          = 10, // e.g. Cortex A8, Cortex M3
+  v6_M        = 11, // e.g. Cortex M1
+  v6S_M       = 12, // v6_M with the System extensions
+  v7E_M       = 13, // v7_M with DSP extensions
+  v8_A        = 14, // v8_A AArch32
+  v8_R        = 15, // e.g. Cortex R52
+  v8_M_Base   = 16, // v8_M_Base AArch32
+  v8_M_Main   = 17, // v8_M_Main AArch32
+  v8_1_M_Main = 21, // v8_1_M_Main AArch32
+  v9_A        = 22, // v9_A AArch32
 };
 enum CPUArchProfile { // (=7), uleb128
diff --git a/llvm/include/llvm/Support/ARMTargetParser.def b/llvm/include/llvm/Support/ARMTargetParser.def
index 80deeb2a6e9d..6a1ac7213dad 100644
--- a/llvm/include/llvm/Support/ARMTargetParser.def
+++ b/llvm/include/llvm/Support/ARMTargetParser.def
@@ -129,22 +129,22 @@ ARM_ARCH("armv8.8-a", ARMV8_8A, "8.8-A", "v8.8a",
           ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES |
           ARM::AEK_I8MM))
 ARM_ARCH("armv9-a", ARMV9A, "9-A", "v9a",
-         ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+         ARMBuildAttrs::CPUArch::v9_A, FK_NEON_FP_ARMV8,
          (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
           ARM::AEK_DOTPROD))
 ARM_ARCH("armv9.1-a", ARMV9_1A, "9.1-A", "v9.1a",
-         ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+         ARMBuildAttrs::CPUArch::v9_A, FK_NEON_FP_ARMV8,
          (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
          ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
 ARM_ARCH("armv9.2-a", ARMV9_2A, "9.2-A", "v9.2a",
-         ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+         ARMBuildAttrs::CPUArch::v9_A, FK_NEON_FP_ARMV8,
          (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
           ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
 ARM_ARCH("armv9.3-a", ARMV9_3A, "9.3-A", "v9.3a",
-         ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
+         ARMBuildAttrs::CPUArch::v9_A, FK_CRYPTO_NEON_FP_ARMV8,
          (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
           ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
diff --git a/llvm/include/llvm/Support/ARMWinEH.h b/llvm/include/llvm/Support/ARMWinEH.h
index 327aa9804849..dee2f31fb127 100644
--- a/llvm/include/llvm/Support/ARMWinEH.h
+++ b/llvm/include/llvm/Support/ARMWinEH.h
@@ -199,13 +199,14 @@ inline bool EpilogueFolding(const RuntimeFunction &RF) {
 inline uint16_t StackAdjustment(const RuntimeFunction &RF) {
   uint16_t Adjustment = RF.StackAdjust();
   if (Adjustment >= 0x3f4)
-    return (Adjustment & 0x3) ? ((Adjustment & 0x3) << 2) - 1 : 0;
+    return (Adjustment & 0x3) + 1;
   return Adjustment;
 }
 /// SavedRegisterMask - Utility function to calculate the set of saved general
 /// purpose (r0-r15) and VFP (d0-d31) registers.
-std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
+std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF,
+                                                bool Prologue = true);
 /// RuntimeFunctionARM64 - An entry in the table of procedure data (.pdata)
 ///
diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
index 1176c026ba99..1543a5713d73 100644
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -84,6 +84,14 @@ public:
   /// Needed to interact with C for instance.
   uint64_t value() const { return uint64_t(1) << ShiftValue; }
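// Illustrative note (not from the vendored commit): previous() below steps an
// alignment down one power of two, so e.g. Align(16).previous() == Align(8),
// and it asserts when called on Align(1), which has no predecessor.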
+  // Returns the previous alignment.
+  Align previous() const {
+    assert(ShiftValue != 0 && "Undefined operation");
+    Align Out;
+    Out.ShiftValue = ShiftValue - 1;
+    return Out;
+  }
+
   /// Allow constructions of constexpr Align.
   template <size_t kValue> constexpr static LogValue Constant() {
     return LogValue{static_cast<uint8_t>(CTLog2<kValue>())};
   }
@@ -131,7 +139,7 @@ public:
   }
   /// For convenience, returns a valid alignment or 1 if undefined.
-  Align valueOrOne() const { return hasValue() ? getValue() : Align(); }
+  Align valueOrOne() const { return value_or(Align()); }
 };
 /// Checks that SizeInBytes is a multiple of the alignment.
@@ -173,13 +181,7 @@ inline uint64_t alignTo(uint64_t Size, Align A) {
 inline uint64_t alignTo(uint64_t Size, Align A, uint64_t Skew) {
   const uint64_t Value = A.value();
   Skew %= Value;
-  return ((Size + Value - 1 - Skew) & ~(Value - 1U)) + Skew;
-}
-
-/// Returns a multiple of A needed to store `Size` bytes.
-/// Returns `Size` if current alignment is undefined.
-inline uint64_t alignTo(uint64_t Size, MaybeAlign A) {
-  return A ? alignTo(Size, A.getValue()) : Size;
+  return alignTo(Size - Skew, A) + Skew;
 }
 /// Aligns `Addr` to `Alignment` bytes, rounding up.
@@ -206,28 +208,12 @@ inline uint64_t offsetToAlignedAddr(const void *Addr, Align Alignment) {
 /// Returns the log2 of the alignment.
 inline unsigned Log2(Align A) { return A.ShiftValue; }
-/// Returns the alignment that satisfies both alignments.
-/// Same semantic as MinAlign.
-inline Align commonAlignment(Align A, Align B) { return std::min(A, B); }
-
 /// Returns the alignment that satisfies both alignments.
 /// Same semantic as MinAlign.
 inline Align commonAlignment(Align A, uint64_t Offset) {
   return Align(MinAlign(A.value(), Offset));
 }
-/// Returns the alignment that satisfies both alignments.
-/// Same semantic as MinAlign.
-inline MaybeAlign commonAlignment(MaybeAlign A, MaybeAlign B) {
-  return A && B ? commonAlignment(*A, *B) : A ? A : B;
-}
-
-/// Returns the alignment that satisfies both alignments.
-/// Same semantic as MinAlign.
-inline MaybeAlign commonAlignment(MaybeAlign A, uint64_t Offset) {
-  return MaybeAlign(MinAlign((*A).value(), Offset));
-}
-
 /// Returns a representation of the alignment that encodes undefined as 0.
 inline unsigned encode(MaybeAlign A) { return A ? A->ShiftValue + 1 : 0; }
@@ -270,14 +256,6 @@ inline bool operator>(Align Lhs, uint64_t Rhs) {
   return Lhs.value() > Rhs;
 }
-/// Comparisons between MaybeAlign and scalars.
-inline bool operator==(MaybeAlign Lhs, uint64_t Rhs) {
-  return Lhs ? (*Lhs).value() == Rhs : Rhs == 0;
-}
-inline bool operator!=(MaybeAlign Lhs, uint64_t Rhs) {
-  return Lhs ? (*Lhs).value() != Rhs : Rhs != 0;
-}
-
 /// Comparisons operators between Align.
 inline bool operator==(Align Lhs, Align Rhs) {
   return Lhs.ShiftValue == Rhs.ShiftValue;
 }
@@ -314,37 +292,6 @@ bool operator>=(MaybeAlign Lhs, MaybeAlign Rhs) = delete;
 bool operator<(MaybeAlign Lhs, MaybeAlign Rhs) = delete;
 bool operator>(MaybeAlign Lhs, MaybeAlign Rhs) = delete;
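// Worked example (editorial, not from the vendored commit): the rewritten
// skewed alignTo above agrees with the old bit-twiddling form. For Size = 10,
// A = Align(8), Skew = 3:
//   old: ((10 + 8 - 1 - 3) & ~7) + 3 == (14 & ~7) + 3 == 8 + 3 == 11
//   new: alignTo(10 - 3, Align(8)) + 3 == alignTo(7, Align(8)) + 3 == 8 + 3 == 11
// i.e. the result is the smallest value >= Size that is Skew more than a
// multiple of the alignment.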
-inline Align operator*(Align Lhs, uint64_t Rhs) {
-  assert(Rhs > 0 && "Rhs must be positive");
-  return Align(Lhs.value() * Rhs);
-}
-
-inline MaybeAlign operator*(MaybeAlign Lhs, uint64_t Rhs) {
-  assert(Rhs > 0 && "Rhs must be positive");
-  return Lhs ? Lhs.getValue() * Rhs : MaybeAlign();
-}
-
-inline Align operator/(Align Lhs, uint64_t Divisor) {
-  assert(llvm::isPowerOf2_64(Divisor) &&
-         "Divisor must be positive and a power of 2");
-  assert(Lhs != 1 && "Can't halve byte alignment");
-  return Align(Lhs.value() / Divisor);
-}
-
-inline MaybeAlign operator/(MaybeAlign Lhs, uint64_t Divisor) {
-  assert(llvm::isPowerOf2_64(Divisor) &&
-         "Divisor must be positive and a power of 2");
-  return Lhs ? Lhs.getValue() / Divisor : MaybeAlign();
-}
-
-inline Align max(MaybeAlign Lhs, Align Rhs) {
-  return Lhs && *Lhs > Rhs ? *Lhs : Rhs;
-}
-
-inline Align max(Align Lhs, MaybeAlign Rhs) {
-  return Rhs && *Rhs > Lhs ? *Rhs : Lhs;
-}
-
 #ifndef NDEBUG
 // For usage in LLVM_DEBUG macros.
 inline std::string DebugStr(const Align &A) {
diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h
index ec5ed06b7fa4..5ca0c9decac3 100644
--- a/llvm/include/llvm/Support/Allocator.h
+++ b/llvm/include/llvm/Support/Allocator.h
@@ -140,6 +140,9 @@ public:
   // This method is *not* marked noalias, because
   // SpecificBumpPtrAllocator::DestroyAll() loops over all allocations, and
   // that loop is not based on the Allocate() return value.
+  //
+  // Allocate(0, N) is valid, it returns a non-null pointer (which should not
+  // be dereferenced).
   LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, Align Alignment) {
     // Keep track of how many bytes we've allocated.
     BytesAllocated += Size;
@@ -154,7 +157,9 @@ public:
 #endif
     // Check if we have enough space.
-    if (Adjustment + SizeToAllocate <= size_t(End - CurPtr)) {
+    if (Adjustment + SizeToAllocate <= size_t(End - CurPtr)
+        // We can't return nullptr even for a zero-sized allocation!
+        && CurPtr != nullptr) {
       char *AlignedPtr = CurPtr + Adjustment;
       CurPtr = AlignedPtr + SizeToAllocate;
       // Update the allocation point of this memory block in MemorySanitizer.
diff --git a/llvm/include/llvm/Support/BLAKE3.h b/llvm/include/llvm/Support/BLAKE3.h
new file mode 100644
index 000000000000..7b30dbccd173
--- /dev/null
+++ b/llvm/include/llvm/Support/BLAKE3.h
@@ -0,0 +1,124 @@
+//==- BLAKE3.h - BLAKE3 C++ wrapper for LLVM ---------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a C++ wrapper of the BLAKE3 C interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BLAKE3_H
+#define LLVM_SUPPORT_BLAKE3_H
+
+#include "llvm-c/blake3.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+/// The constant \p LLVM_BLAKE3_OUT_LEN provides the default output length,
+/// 32 bytes, which is recommended for most callers.
+///
+/// Outputs shorter than the default length of 32 bytes (256 bits) provide
+/// less security. An N-bit BLAKE3 output is intended to provide N bits of
+/// first and second preimage resistance and N/2 bits of collision
+/// resistance, for any N up to 256. Longer outputs don't provide any
+/// additional security.
+///
+/// Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
+/// requesting a short output is equivalent to truncating the default-length
+/// output.
+template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+using BLAKE3Result = std::array<uint8_t, NumBytes>;
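// Illustrative sketch (not from the vendored commit): since shorter outputs
// are prefixes of longer ones, truncation really is just truncation. Using
// the hash() helper of the class defined below, with `Data` an assumed
// ArrayRef<uint8_t>:
// \code
//   BLAKE3Result<32> Full = BLAKE3::hash(Data);      // default length
//   BLAKE3Result<16> Short = BLAKE3::hash<16>(Data); // first 16 bytes of Full
//   assert(memcmp(Short.data(), Full.data(), 16) == 0);
// \endcode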
+
+/// A class that wraps the BLAKE3 algorithm.
+class BLAKE3 {
+public:
+  BLAKE3() { init(); }
+
+  /// Reinitialize the internal state
+  void init() { llvm_blake3_hasher_init(&Hasher); }
+
+  /// Digest more data.
+  void update(ArrayRef<uint8_t> Data) {
+    llvm_blake3_hasher_update(&Hasher, Data.data(), Data.size());
+  }
+
+  /// Digest more data.
+  void update(StringRef Str) {
+    llvm_blake3_hasher_update(&Hasher, Str.data(), Str.size());
+  }
+
+  /// Finalize the hasher and put the result in \p Result.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+  void final(BLAKE3Result<NumBytes> &Result) {
+    llvm_blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
+  }
+
+  /// Finalize the hasher and return an output of any length, given in bytes.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+  BLAKE3Result<NumBytes> final() {
+    BLAKE3Result<NumBytes> Result;
+    llvm_blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
+    return Result;
+  }
+
+  /// Return the current output for the digested data since the last call to
+  /// init().
+  ///
+  /// Other hash functions distinguish between \p result() and \p final(), with
+  /// \p result() allowing more calls into \p update(), but there's no
+  // difference for the BLAKE3 hash function.
+  template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+  BLAKE3Result<NumBytes> result() {
+    return final<NumBytes>();
+  }
+
+  /// Returns a BLAKE3 hash for the given data.
+  template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+  static BLAKE3Result<NumBytes> hash(ArrayRef<uint8_t> Data) {
+    BLAKE3 Hasher;
+    Hasher.update(Data);
+    return Hasher.final<NumBytes>();
+  }
+
+private:
+  llvm_blake3_hasher Hasher;
+};
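// Illustrative sketch (not from the vendored commit): typical incremental use
// of the wrapper above, assuming `Part1` and `Part2` are StringRefs and
// `Bytes` is an ArrayRef<uint8_t>:
// \code
//   BLAKE3 Hash;
//   Hash.update(Part1);
//   Hash.update(Part2);
//   BLAKE3Result<> Digest = Hash.final(); // 32-byte digest
//   auto Short = BLAKE3::hash<8>(Bytes);  // one-shot, 8-byte digest
// \endcode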
+
+/// Like \p BLAKE3 but using a class-level template parameter for specifying the
+/// hash size of the \p final() and \p result() functions.
+///
+/// This is useful for using BLAKE3 as the hasher type for \p HashBuilder with
+/// non-default hash sizes.
+template <size_t NumBytes> class TruncatedBLAKE3 : public BLAKE3 {
+public:
+  /// Finalize the hasher and put the result in \p Result.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  void final(BLAKE3Result<NumBytes> &Result) { return BLAKE3::final(Result); }
+
+  /// Finalize the hasher and return an output of any length, given in bytes.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  BLAKE3Result<NumBytes> final() { return BLAKE3::final<NumBytes>(); }
+
+  /// Return the current output for the digested data since the last call to
+  /// init().
+  ///
+  /// Other hash functions distinguish between \p result() and \p final(), with
+  /// \p result() allowing more calls into \p update(), but there's no
+  // difference for the BLAKE3 hash function.
+  BLAKE3Result<NumBytes> result() { return BLAKE3::result<NumBytes>(); }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/Base64.h b/llvm/include/llvm/Support/Base64.h
index 62064a35aa34..da4ae1688574 100644
--- a/llvm/include/llvm/Support/Base64.h
+++ b/llvm/include/llvm/Support/Base64.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_SUPPORT_BASE64_H
 #define LLVM_SUPPORT_BASE64_H
+#include
 #include
 namespace llvm {
diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h
index c3e0db4dcff0..ef2233c53ec2 100644
--- a/llvm/include/llvm/Support/BinaryStreamArray.h
+++ b/llvm/include/llvm/Support/BinaryStreamArray.h
@@ -111,6 +111,8 @@ public:
   bool valid() const { return Stream.valid(); }
+  bool isOffsetValid(uint32_t Offset) const { return at(Offset) != end(); }
+
   uint32_t skew() const { return Skew; }
   Iterator end() const { return Iterator(E); }
diff --git a/llvm/include/llvm/Support/BinaryStreamRef.h b/llvm/include/llvm/Support/BinaryStreamRef.h
index bc8c6a496ecf..46fc9fb293df 100644
--- a/llvm/include/llvm/Support/BinaryStreamRef.h
+++ b/llvm/include/llvm/Support/BinaryStreamRef.h
@@ -48,7 +48,7 @@ public:
   }
   uint64_t getLength() const {
-    if (Length.hasValue())
+    if (Length)
       return *Length;
     return BorrowedImpl ? (BorrowedImpl->getLength() - ViewOffset) : 0;
@@ -67,7 +67,7 @@ public:
       return Result;
     Result.ViewOffset += N;
-    if (Result.Length.hasValue())
+    if (Result.Length)
       *Result.Length -= N;
     return Result;
   }
@@ -87,7 +87,7 @@ public:
     // Since we're dropping non-zero bytes from the end, stop length-tracking
     // by setting the length of the resulting StreamRef to an explicit value.
-    if (!Result.Length.hasValue())
+    if (!Result.Length)
       Result.Length = getLength();
     *Result.Length -= N;
diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h
index 6f071c15421f..79d70cf611d4 100644
--- a/llvm/include/llvm/Support/BranchProbability.h
+++ b/llvm/include/llvm/Support/BranchProbability.h
@@ -16,6 +16,7 @@
 #include "llvm/Support/DataTypes.h"
 #include
 #include
+#include
 #include
 namespace llvm {
diff --git a/llvm/include/llvm/Support/CSKYAttributeParser.h b/llvm/include/llvm/Support/CSKYAttributeParser.h
new file mode 100644
index 000000000000..e926ebe5e306
--- /dev/null
+++ b/llvm/include/llvm/Support/CSKYAttributeParser.h
@@ -0,0 +1,43 @@
+//===---- CSKYAttributeParser.h - CSKY Attribute Parser ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_CSKYATTRIBUTEPARSER_H +#define LLVM_SUPPORT_CSKYATTRIBUTEPARSER_H + +#include "llvm/Support/CSKYAttributes.h" +#include "llvm/Support/ELFAttributeParser.h" + +namespace llvm { +class CSKYAttributeParser : public ELFAttributeParser { + struct DisplayHandler { + CSKYAttrs::AttrType attribute; + Error (CSKYAttributeParser::*routine)(unsigned); + }; + static const DisplayHandler displayRoutines[]; + + Error dspVersion(unsigned tag); + Error vdspVersion(unsigned tag); + Error fpuVersion(unsigned tag); + Error fpuABI(unsigned tag); + Error fpuRounding(unsigned tag); + Error fpuDenormal(unsigned tag); + Error fpuException(unsigned tag); + Error fpuHardFP(unsigned tag); + + Error handler(uint64_t tag, bool &handled) override; + +public: + CSKYAttributeParser(ScopedPrinter *sw) + : ELFAttributeParser(sw, CSKYAttrs::getCSKYAttributeTags(), "csky") {} + CSKYAttributeParser() + : ELFAttributeParser(CSKYAttrs::getCSKYAttributeTags(), "csky") {} +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Support/CSKYAttributes.h b/llvm/include/llvm/Support/CSKYAttributes.h new file mode 100644 index 000000000000..723f2ceee8fb --- /dev/null +++ b/llvm/include/llvm/Support/CSKYAttributes.h @@ -0,0 +1,95 @@ +//===---- CSKYAttributes.h - CSKY Attributes --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains enumerations for CSKY attributes. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_SUPPORT_CSKYATTRIBUTES_H +#define LLVM_SUPPORT_CSKYATTRIBUTES_H + +#include "llvm/Support/ELFAttributes.h" + +namespace llvm { +namespace CSKYAttrs { + +const TagNameMap &getCSKYAttributeTags(); + +enum AttrType { + CSKY_ARCH_NAME = 4, + CSKY_CPU_NAME = 5, + CSKY_ISA_FLAGS = 6, + CSKY_ISA_EXT_FLAGS = 7, + CSKY_DSP_VERSION = 8, + CSKY_VDSP_VERSION = 9, + CSKY_FPU_VERSION = 16, + CSKY_FPU_ABI = 17, + CSKY_FPU_ROUNDING = 18, + CSKY_FPU_DENORMAL = 19, + CSKY_FPU_EXCEPTION = 20, + CSKY_FPU_NUMBER_MODULE = 21, + CSKY_FPU_HARDFP = 22 +}; + +enum ISA_FLAGS { + V2_ISA_E1 = 1 << 1, + V2_ISA_1E2 = 1 << 2, + V2_ISA_2E3 = 1 << 3, + V2_ISA_3E7 = 1 << 4, + V2_ISA_7E10 = 1 << 5, + V2_ISA_3E3R1 = 1 << 6, + V2_ISA_3E3R2 = 1 << 7, + V2_ISA_10E60 = 1 << 8, + V2_ISA_3E3R3 = 1 << 9, + ISA_TRUST = 1 << 11, + ISA_CACHE = 1 << 12, + ISA_NVIC = 1 << 13, + ISA_CP = 1 << 14, + ISA_MP = 1 << 15, + ISA_MP_1E2 = 1 << 16, + ISA_JAVA = 1 << 17, + ISA_MAC = 1 << 18, + ISA_MAC_DSP = 1 << 19, + ISA_DSP = 1 << 20, + ISA_DSP_1E2 = 1 << 21, + ISA_DSP_ENHANCE = 1 << 22, + ISA_DSP_SILAN = 1 << 23, + ISA_VDSP = 1 << 24, + ISA_VDSP_2 = 1 << 25, + ISA_VDSP_2E3 = 1 << 26, + V2_ISA_DSPE60 = 1 << 27, + ISA_VDSP_2E60F = 1 << 28 +}; + +enum ISA_EXT_FLAGS { + ISA_FLOAT_E1 = 1 << 0, + ISA_FLOAT_1E2 = 1 << 1, + ISA_FLOAT_1E3 = 1 << 2, + ISA_FLOAT_3E4 = 1 << 3, + ISA_FLOAT_7E60 = 1 << 4 +}; + +enum { NONE = 0, NEEDED = 1 }; + +enum DSP_VERSION { DSP_VERSION_EXTENSION = 1, DSP_VERSION_2 = 2 }; + +enum VDSP_VERSION { VDSP_VERSION_1 = 1, VDSP_VERSION_2 = 2 }; + +enum FPU_VERSION { FPU_VERSION_1 = 1, FPU_VERSION_2 = 2, FPU_VERSION_3 = 3 }; + +enum FPU_ABI { FPU_ABI_SOFT = 1, FPU_ABI_SOFTFP = 2, FPU_ABI_HARD = 3 }; + +enum FPU_HARDFP { + FPU_HARDFP_HALF = 1, + FPU_HARDFP_SINGLE = 2, + FPU_HARDFP_DOUBLE = 4 +}; + +} // namespace CSKYAttrs +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Support/CSKYTargetParser.def b/llvm/include/llvm/Support/CSKYTargetParser.def new file mode 100644 index 000000000000..c93d6fdf8cce --- /dev/null +++ b/llvm/include/llvm/Support/CSKYTargetParser.def @@ -0,0 +1,524 @@ +//===- CSKYTargetParser.def - CSKY target parsing defines -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides defines to build up the CSKY target parser's logic. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
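// Illustrative sketch (not part of the vendored file): like the other LLVM
// target-parser .def files, this one is meant to be included repeatedly, each
// time with only the macros the consumer cares about defined, e.g.
// \code
//   #define CSKY_FPU(NAME, KIND, VERSION) FPUNames.push_back(NAME); // hypothetical consumer
//   #include "llvm/Support/CSKYTargetParser.def"
// \endcode
// Every block below #undefs its macro and supplies an empty fallback
// definition, which is why an include guard would defeat the design.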
+ +#ifndef CSKY_FPU +#define CSKY_FPU(NAME, KIND, VERSION) +#endif +CSKY_FPU("invalid", FK_INVALID, FPUVersion::NONE) +CSKY_FPU("auto", FK_AUTO, FPUVersion::FPV2) +CSKY_FPU("fpv2", FK_FPV2, FPUVersion::FPV2) +CSKY_FPU("fpv2_divd", FK_FPV2_DIVD, FPUVersion::FPV2) +CSKY_FPU("fpv2_sf", FK_FPV2_SF, FPUVersion::FPV2) +CSKY_FPU("fpv3", FK_FPV3, FPUVersion::FPV3) +CSKY_FPU("fpv3_hf", FK_FPV3_HF, FPUVersion::FPV3) +CSKY_FPU("fpv3_hsf", FK_FPV3_HSF, FPUVersion::FPV3) +CSKY_FPU("fpv3_sdf", FK_FPV3_SDF, FPUVersion::FPV3) + +#undef CSKY_FPU + +#ifndef CSKY_ARCH +#define CSKY_ARCH(NAME, ID, ARCH_BASE_EXT) +#endif +CSKY_ARCH("invalid", INVALID, CSKY::AEK_INVALID) +CSKY_ARCH("ck801", CK801, CSKY::MAEK_E1 | CSKY::AEK_TRUST) +CSKY_ARCH("ck802", CK802, CSKY::MAEK_E2 | CSKY::AEK_TRUST | CSKY::AEK_NVIC) +CSKY_ARCH("ck803", CK803, + CSKY::MAEK_2E3 | CSKY::AEK_MP | CSKY::AEK_TRUST | CSKY::AEK_NVIC | + CSKY::AEK_HWDIV) +CSKY_ARCH("ck803s", CK803S, + CSKY::MAEK_2E3 | CSKY::AEK_MP | CSKY::AEK_TRUST | CSKY::AEK_NVIC | + CSKY::AEK_HWDIV) +CSKY_ARCH("ck804", CK804, + CSKY::MAEK_2E3 | CSKY::AEK_MP | CSKY::AEK_TRUST | CSKY::AEK_NVIC | + CSKY::AEK_HWDIV | CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) +CSKY_ARCH("ck805", CK805, + CSKY::MAEK_2E3 | CSKY::AEK_MP | CSKY::AEK_TRUST | CSKY::AEK_NVIC | + CSKY::AEK_HWDIV | CSKY::AEK_HIGHREG | CSKY::MAEK_3E3R2 | + CSKY::AEK_3E3R3 | CSKY::AEK_VDSPV2 | CSKY::AEK_VDSP2E3) +CSKY_ARCH("ck807", CK807, + CSKY::MAEK_3E7 | CSKY::MAEK_MP | CSKY::MAEK_MP1E2 | CSKY::AEK_TRUST | + CSKY::AEK_HWDIV | CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | + CSKY::AEK_DSPE60 | CSKY::AEK_HIGHREG | CSKY::AEK_HARDTP | + CSKY::AEK_NVIC | CSKY::AEK_CACHE) +CSKY_ARCH("ck810", CK810, + CSKY::MAEK_7E10 | CSKY::MAEK_MP | CSKY::MAEK_MP1E2 | CSKY::AEK_TRUST | + CSKY::AEK_HWDIV | CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | + CSKY::AEK_DSPE60 | CSKY::AEK_HIGHREG | CSKY::AEK_HARDTP | + CSKY::AEK_NVIC | CSKY::AEK_CACHE) +CSKY_ARCH("ck810v", CK810V, + CSKY::MAEK_7E10 | CSKY::MAEK_MP | CSKY::MAEK_MP1E2 | CSKY::AEK_TRUST | + CSKY::AEK_HWDIV | CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | + CSKY::AEK_DSPE60 | CSKY::AEK_HIGHREG | CSKY::AEK_HARDTP | + CSKY::AEK_NVIC | CSKY::AEK_CACHE | CSKY::AEK_VDSPV1) +CSKY_ARCH("ck860", CK860, + CSKY::MAEK_10E60 | CSKY::MAEK_MP | CSKY::MAEK_MP1E2 | + CSKY::AEK_TRUST | CSKY::AEK_HWDIV | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG | CSKY::AEK_HARDTP | CSKY::AEK_NVIC | + CSKY::AEK_CACHE | CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) +CSKY_ARCH("ck860v", CK860V, + CSKY::MAEK_10E60 | CSKY::MAEK_MP | CSKY::MAEK_MP1E2 | + CSKY::AEK_TRUST | CSKY::AEK_HWDIV | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG | CSKY::AEK_HARDTP | CSKY::AEK_NVIC | + CSKY::AEK_CACHE | CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | + CSKY::AEK_VDSPV2 | CSKY::AEK_VDSP2E60F) +#undef CSKY_ARCH + +#ifndef CSKY_ARCH_EXT_NAME +#define CSKY_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) +#endif +CSKY_ARCH_EXT_NAME("invalid", CSKY::AEK_INVALID, nullptr, nullptr) +CSKY_ARCH_EXT_NAME("none", CSKY::AEK_NONE, nullptr, nullptr) +CSKY_ARCH_EXT_NAME("fpuv2_sf", CSKY::AEK_FPUV2SF, "+fpuv2_sf", "-fpuv2_sf") +CSKY_ARCH_EXT_NAME("fpuv2_df", CSKY::AEK_FPUV2DF, "+fpuv2_df", "-fpuv2_df") +CSKY_ARCH_EXT_NAME("fdivdu", CSKY::AEK_FDIVDU, "+fdivdu", "-fdivdu") +CSKY_ARCH_EXT_NAME("fpuv3_hi", CSKY::AEK_FPUV3HI, "+fpuv3_hi", "-fpuv3_hi") +CSKY_ARCH_EXT_NAME("fpuv3_hf", CSKY::AEK_FPUV3HF, "+fpuv3_hf", "-fpuv3_hf") +CSKY_ARCH_EXT_NAME("fpuv3_sf", CSKY::AEK_FPUV3SF, "+fpuv3_sf", "-fpuv3_sf") +CSKY_ARCH_EXT_NAME("fpuv3_df", CSKY::AEK_FPUV3DF, "+fpuv3_df", "-fpuv3_df") +CSKY_ARCH_EXT_NAME("floate1", CSKY::AEK_FLOATE1, 
"+floate1", "-floate1") +CSKY_ARCH_EXT_NAME("float1e2", CSKY::AEK_FLOAT1E2, "+float1e2", "-float1e2") +CSKY_ARCH_EXT_NAME("float1e3", CSKY::AEK_FLOAT1E3, "+float1e3", "-float1e3") +CSKY_ARCH_EXT_NAME("float3e4", CSKY::AEK_FLOAT3E4, "+float3e4", "-float3e4") +CSKY_ARCH_EXT_NAME("float7e60", CSKY::AEK_FLOAT7E60, "+float7e60", "-float7e60") +CSKY_ARCH_EXT_NAME("hwdiv", CSKY::AEK_HWDIV, "+hwdiv", "-hwdiv") +CSKY_ARCH_EXT_NAME("multiple_stld", CSKY::AEK_STLD, "+multiple_stld", + "-multiple_stld") +CSKY_ARCH_EXT_NAME("pushpop", CSKY::AEK_PUSHPOP, "+pushpop", "-pushpop") +CSKY_ARCH_EXT_NAME("edsp", CSKY::AEK_EDSP, "+edsp", "-edsp") +CSKY_ARCH_EXT_NAME("dsp1e2", CSKY::AEK_DSP1E2, "+dsp1e2", "-dsp1e2") +CSKY_ARCH_EXT_NAME("dspe60", CSKY::AEK_DSPE60, "+dspe60", "-dspe60") +CSKY_ARCH_EXT_NAME("dspv2", CSKY::AEK_DSPV2, "+dspv2", "-dspv2") +CSKY_ARCH_EXT_NAME("dsp_silan", CSKY::AEK_DSPSILAN, "+dsp_silan", "-dsp_silan") +CSKY_ARCH_EXT_NAME("elrw", CSKY::AEK_ELRW, "+elrw", "-elrw") +CSKY_ARCH_EXT_NAME("trust", CSKY::AEK_TRUST, "+trust", "-trust") +CSKY_ARCH_EXT_NAME("java", CSKY::AEK_JAVA, "+java", "-java") +CSKY_ARCH_EXT_NAME("cache", CSKY::AEK_CACHE, "+cache", "-cache") +CSKY_ARCH_EXT_NAME("nvic", CSKY::AEK_NVIC, "+nvic", "-nvic") +CSKY_ARCH_EXT_NAME("doloop", CSKY::AEK_DOLOOP, "+doloop", "-doloop") +CSKY_ARCH_EXT_NAME("high-registers", CSKY::AEK_HIGHREG, "+high-registers", + "-high-registers") +CSKY_ARCH_EXT_NAME("smart", CSKY::AEK_SMART, "+smart", "-smart") +CSKY_ARCH_EXT_NAME("vdsp2e3", CSKY::AEK_VDSP2E3, "+vdsp2e3", "-vdsp2e3") +CSKY_ARCH_EXT_NAME("vdsp2e60f", CSKY::AEK_VDSP2E60F, "+vdsp2e60f", "-vdsp2e60f") +CSKY_ARCH_EXT_NAME("vdspv2", CSKY::AEK_VDSPV2, "+vdspv2", "-vdspv2") +CSKY_ARCH_EXT_NAME("hard-tp", CSKY::AEK_HARDTP, "+hard-tp", "-hard-tp") +CSKY_ARCH_EXT_NAME("soft-tp", CSKY::AEK_SOFTTP, "+soft-tp", "-soft-tp") +CSKY_ARCH_EXT_NAME("istack", CSKY::AEK_ISTACK, "+istack", "-istack") +CSKY_ARCH_EXT_NAME("constpool", CSKY::AEK_CONSTPOOL, "+constpool", "-constpool") +CSKY_ARCH_EXT_NAME("stack-size", CSKY::AEK_STACKSIZE, "+stack-size", + "-stack-size") +CSKY_ARCH_EXT_NAME("ccrt", CSKY::AEK_CCRT, "+ccrt", "-ccrt") +CSKY_ARCH_EXT_NAME("vdspv1", CSKY::AEK_VDSPV1, "+vdspv1", "-vdspv1") + +CSKY_ARCH_EXT_NAME("e1", CSKY::AEK_E1, "+e1", "-e1") +CSKY_ARCH_EXT_NAME("e2", CSKY::AEK_E2, "+e2", "-e2") +CSKY_ARCH_EXT_NAME("2e3", CSKY::AEK_2E3, "+2e3", "-2e3") +CSKY_ARCH_EXT_NAME("mp", CSKY::AEK_MP, "+mp", "-mp") +CSKY_ARCH_EXT_NAME("3e3r1", CSKY::AEK_3E3R1, "+3e3r1", "-3e3r1") +CSKY_ARCH_EXT_NAME("3e3r2", CSKY::AEK_3E3R2, "+3e3r2", "-3e3r2") +CSKY_ARCH_EXT_NAME("3e3r3", CSKY::AEK_3E3R3, "+3e3r3", "-3e3r3") +CSKY_ARCH_EXT_NAME("3e7", CSKY::AEK_3E7, "+3e7", "-3e7") +CSKY_ARCH_EXT_NAME("mp1e2", CSKY::AEK_MP1E2, "+mp1e2", "-mp1e2") +CSKY_ARCH_EXT_NAME("7e10", CSKY::AEK_7E10, "+7e10", "-7e10") +CSKY_ARCH_EXT_NAME("10e60", CSKY::AEK_10E60, "+10e60", "-10e60") + +#undef CSKY_ARCH_EXT_NAME + +#ifndef CSKY_CPU_NAME +#define CSKY_CPU_NAME(NAME, ARCH_ID, DEFAULT_EXT) +#endif + +CSKY_CPU_NAME("ck801", CK801, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck801t", CK801, CSKY::AEK_NONE) +CSKY_CPU_NAME("e801", CK801, CSKY::AEK_NONE) + +CSKY_CPU_NAME("ck802", CK802, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck802t", CK802, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck802j", CK802, CSKY::AEK_JAVA) +CSKY_CPU_NAME("e802", CK802, CSKY::AEK_NONE) +CSKY_CPU_NAME("e802t", CK802, CSKY::AEK_NONE) +CSKY_CPU_NAME("s802", CK802, CSKY::AEK_NONE) +CSKY_CPU_NAME("s802t", CK802, CSKY::AEK_NONE) + +CSKY_CPU_NAME("ck803", CK803, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803h", CK803, 
CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803t", CK803, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803ht", CK803, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803f", CK803, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fh", CK803, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803e", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck803eh", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck803et", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck803eht", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck803ef", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803efh", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803ft", CK803, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803eft", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803efht", CK803, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803r1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803r2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803r3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803hr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803hr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803hr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803tr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803tr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803tr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803htr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803htr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803htr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2) +CSKY_CPU_NAME("ck803fr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fhr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fhr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803fhr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803er1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) 
+CSKY_CPU_NAME("ck803er2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803er3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803etr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803etr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803etr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehtr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehtr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ehtr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803ftr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803ftr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | 
CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803ftr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803eftr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803eftr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803eftr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhtr1", CK803, + CSKY::MAEK_3E3R1 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhtr2", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck803efhtr3", CK803, + CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3 | CSKY::AEK_DSPV2 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("s803", CK803, CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("s803t", CK803, CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("e803", CK803, CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("e803t", CK803, CSKY::MAEK_3E3R2 | CSKY::AEK_3E3R3) + +CSKY_CPU_NAME("ck803s", CK803S, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803st", CK803S, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck803se", CK803S, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck803sf", CK803S, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803sef", CK803S, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck803seft", CK803S, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) + +CSKY_CPU_NAME("ck804", CK804, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck804h", CK804, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck804t", CK804, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck804ht", CK804, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck804f", CK804, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck804fh", CK804, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck804e", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804eh", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804et", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804eht", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804ef", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804efh", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + 
CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804ft", CK804, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck804eft", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("ck804efht", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("e804d", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("e804dt", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("e804f", CK804, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("e804ft", CK804, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("e804df", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) +CSKY_CPU_NAME("e804dft", CK804, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_HIGHREG) + +CSKY_CPU_NAME("ck805", CK805, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck805e", CK805, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("ck805f", CK805, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck805t", CK805, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck805ef", CK805, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck805et", CK805, + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("ck805ft", CK805, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) +CSKY_CPU_NAME("ck805eft", CK805, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_DSPV2 | CSKY::AEK_3E3R1 | CSKY::AEK_3E3R3) +CSKY_CPU_NAME("i805", CK805, CSKY::AEK_NONE) +CSKY_CPU_NAME("i805f", CK805, + CSKY::AEK_FPUV2SF | CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E3) + +CSKY_CPU_NAME("ck807", CK807, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck807e", CK807, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60) +CSKY_CPU_NAME("ck807f", CK807, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_FLOAT3E4) +CSKY_CPU_NAME("ck807ef", CK807, + CSKY::AEK_EDSP | CSKY::AEK_DSP1E2 | CSKY::AEK_DSPE60 | + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_FLOAT3E4) +CSKY_CPU_NAME("c807", CK807, CSKY::AEK_NONE) +CSKY_CPU_NAME("c807f", CK807, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_FLOAT3E4) +CSKY_CPU_NAME("r807", CK807, CSKY::AEK_NONE) +CSKY_CPU_NAME("r807f", CK807, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2 | CSKY::AEK_FLOAT1E3 | + CSKY::AEK_FLOAT3E4) + +CSKY_CPU_NAME("ck810e", CK810, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810et", CK810, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810ef", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("ck810eft", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) 
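[Aside: the CSKY_CPU_NAME entries above and below follow LLVM's X-macro convention: the .def file is pure data, and each includer defines the macro to expand the rows into whatever structure it needs. A minimal self-contained sketch of that pattern follows; it is not part of the patch, DemoCpu/DEMO_CPU_NAME/lookup are invented stand-ins, and the real consumer is the CpuNames table in CSKYTargetParser.h further down.]

// Self-contained illustration of the X-macro pattern (hypothetical names).
#include <cstdint>
#include <cstdio>
#include <cstring>

struct DemoCpu {
  const char *Name; // CPU name, e.g. "ck803f"
  const char *Arch; // base architecture, stringized
  uint64_t Ext;     // default extension mask (CSKY::AEK_* bits)
};

// Stand-in for '#include "CSKYTargetParser.def"': two sample rows.
#define DEMO_CPU_NAME(NAME, ARCH, EXT) {NAME, #ARCH, EXT},
static const DemoCpu Cpus[] = {
    DEMO_CPU_NAME("ck801", CK801, 0x0)
    DEMO_CPU_NAME("ck803f", CK803, 0x502) // AEK_FPUV2SF | AEK_FLOATE1 | AEK_FLOAT1E3
};
#undef DEMO_CPU_NAME

// Linear scan, mirroring how -mcpu= names are resolved against the table.
static const DemoCpu *lookup(const char *Name) {
  for (const DemoCpu &C : Cpus)
    if (std::strcmp(C.Name, Name) == 0)
      return &C;
  return nullptr;
}

int main() {
  if (const DemoCpu *C = lookup("ck803f"))
    std::printf("%s -> %s, ext=0x%llx\n", C->Name, C->Arch,
                (unsigned long long)C->Ext);
  return 0;
}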
+CSKY_CPU_NAME("ck810", CK810, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810f", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("ck810t", CK810, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810ft", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("c810", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("c810t", CK810, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) + +CSKY_CPU_NAME("ck810v", CK810V, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810ev", CK810V, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810tv", CK810V, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck810etv", CK810V, CSKY::AEK_NONE) +CSKY_CPU_NAME("c810v", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("ck810fv", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("ck810efv", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("ck810ftv", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("c810tv", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) +CSKY_CPU_NAME("c810eftv", CK810V, + CSKY::AEK_FPUV2SF | CSKY::AEK_FPUV2DF | CSKY::AEK_FDIVDU | + CSKY::AEK_FLOATE1 | CSKY::AEK_FLOAT1E2) + +CSKY_CPU_NAME("ck860", CK860, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck860f", CK860, + CSKY::AEK_FPUV3HI | CSKY::AEK_FPUV3HF | CSKY::AEK_FPUV3SF | + CSKY::AEK_FPUV3DF | CSKY::AEK_FLOAT7E60) +CSKY_CPU_NAME("c860", CK860, + CSKY::AEK_FPUV3HI | CSKY::AEK_FPUV3HF | CSKY::AEK_FPUV3SF | + CSKY::AEK_FPUV3DF | CSKY::AEK_FLOAT7E60) + +CSKY_CPU_NAME("ck860v", CK860V, CSKY::AEK_NONE) +CSKY_CPU_NAME("ck860fv", CK860V, + CSKY::AEK_FPUV3HI | CSKY::AEK_FPUV3HF | CSKY::AEK_FPUV3SF | + CSKY::AEK_FPUV3DF | CSKY::AEK_FLOAT7E60) +CSKY_CPU_NAME("c860v", CK860V, + CSKY::AEK_FPUV3HI | CSKY::AEK_FPUV3HF | CSKY::AEK_FPUV3SF | + CSKY::AEK_FPUV3DF | CSKY::AEK_FLOAT7E60) +// Invalid CPU +CSKY_CPU_NAME("invalid", INVALID, CSKY::AEK_INVALID) +#undef CSKY_CPU_NAME diff --git a/llvm/include/llvm/Support/CSKYTargetParser.h b/llvm/include/llvm/Support/CSKYTargetParser.h new file mode 100644 index 000000000000..ca33a7ee406c --- /dev/null +++ b/llvm/include/llvm/Support/CSKYTargetParser.h @@ -0,0 +1,203 @@ +//===-- CSKYTargetParser - Parser for CSKY target features --------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise CSKY hardware features +// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_CSKYTARGETPARSER_H +#define LLVM_SUPPORT_CSKYTARGETPARSER_H + +#include "llvm/ADT/Triple.h" +#include + +namespace llvm { +class StringRef; + +namespace CSKY { + +// Arch extension modifiers for CPUs. 
+enum ArchExtKind : uint64_t { + AEK_INVALID = 0, + AEK_NONE = 1, + AEK_FPUV2SF = 1 << 1, + AEK_FPUV2DF = 1 << 2, + AEK_FDIVDU = 1 << 3, + AEK_FPUV3HI = 1 << 4, + AEK_FPUV3HF = 1 << 5, + AEK_FPUV3SF = 1 << 6, + AEK_FPUV3DF = 1 << 7, + AEK_FLOATE1 = 1 << 8, + AEK_FLOAT1E2 = 1 << 9, + AEK_FLOAT1E3 = 1 << 10, + AEK_FLOAT3E4 = 1 << 11, + AEK_FLOAT7E60 = 1 << 12, + AEK_HWDIV = 1 << 13, + AEK_STLD = 1 << 14, + AEK_PUSHPOP = 1 << 15, + AEK_EDSP = 1 << 16, + AEK_DSP1E2 = 1 << 17, + AEK_DSPE60 = 1 << 18, + AEK_DSPV2 = 1 << 19, + AEK_DSPSILAN = 1 << 20, + AEK_ELRW = 1 << 21, + AEK_TRUST = 1 << 22, + AEK_JAVA = 1 << 23, + AEK_CACHE = 1 << 24, + AEK_NVIC = 1 << 25, + AEK_DOLOOP = 1 << 26, + AEK_HIGHREG = 1 << 27, + AEK_SMART = 1 << 28, + AEK_VDSP2E3 = 1 << 29, + AEK_VDSP2E60F = 1 << 30, + AEK_VDSPV2 = 1ULL << 31, + AEK_HARDTP = 1ULL << 32, + AEK_SOFTTP = 1ULL << 33, + AEK_ISTACK = 1ULL << 34, + AEK_CONSTPOOL = 1ULL << 35, + AEK_STACKSIZE = 1ULL << 36, + AEK_CCRT = 1ULL << 37, + AEK_VDSPV1 = 1ULL << 38, + AEK_E1 = 1ULL << 39, + AEK_E2 = 1ULL << 40, + AEK_2E3 = 1ULL << 41, + AEK_MP = 1ULL << 42, + AEK_3E3R1 = 1ULL << 43, + AEK_3E3R2 = 1ULL << 44, + AEK_3E3R3 = 1ULL << 45, + AEK_3E7 = 1ULL << 46, + AEK_MP1E2 = 1ULL << 47, + AEK_7E10 = 1ULL << 48, + AEK_10E60 = 1ULL << 49 + +}; + +// Arch extension modifiers for CPUs. +enum MultiArchExtKind : uint64_t { + MAEK_E1 = CSKY::AEK_E1 | CSKY::AEK_ELRW, + MAEK_E2 = CSKY::AEK_E2 | CSKY::MAEK_E1, + MAEK_2E3 = CSKY::AEK_2E3 | CSKY::MAEK_E2, + MAEK_MP = CSKY::AEK_MP | CSKY::MAEK_2E3, + MAEK_3E3R1 = CSKY::AEK_3E3R1, + MAEK_3E3R2 = CSKY::AEK_3E3R1 | CSKY::AEK_3E3R2 | CSKY::AEK_DOLOOP, + MAEK_3E7 = CSKY::AEK_3E7 | CSKY::MAEK_2E3, + MAEK_MP1E2 = CSKY::AEK_MP1E2 | CSKY::MAEK_3E7, + MAEK_7E10 = CSKY::AEK_7E10 | CSKY::MAEK_3E7, + MAEK_10E60 = CSKY::AEK_10E60 | CSKY::MAEK_7E10, +}; +// FPU names. +enum CSKYFPUKind { +#define CSKY_FPU(NAME, KIND, VERSION) KIND, +#include "CSKYTargetParser.def" + FK_LAST +}; + +// FPU Version +enum class FPUVersion { + NONE, + FPV2, + FPV3, +}; + +// Arch names. +enum class ArchKind { +#define CSKY_ARCH(NAME, ID, ARCH_BASE_EXT) ID, +#include "CSKYTargetParser.def" +}; + +// List of Arch Extension names. +// FIXME: TableGen this. +struct ExtName { + const char *NameCStr; + size_t NameLength; + uint64_t ID; + const char *Feature; + const char *NegFeature; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +const CSKY::ExtName CSKYARCHExtNames[] = { +#define CSKY_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ + {NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE}, +#include "CSKYTargetParser.def" +}; + +// List of CPU names and their arches. +template <typename T> struct CpuNames { + const char *NameCStr; + size_t NameLength; + T ArchID; + uint64_t defaultExt; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; +const CpuNames<CSKY::ArchKind> CPUNames[] = { +#define CSKY_CPU_NAME(NAME, ARCH_ID, DEFAULT_EXT) \ + {NAME, sizeof(NAME) - 1, CSKY::ArchKind::ARCH_ID, DEFAULT_EXT}, +#include "llvm/Support/CSKYTargetParser.def" +}; + +// FIXME: TableGen this. +// The entries must appear in the order listed in CSKY::CSKYFPUKind for correct +// indexing +struct FPUName { + const char *NameCStr; + size_t NameLength; + CSKYFPUKind ID; + FPUVersion FPUVer; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +static const FPUName FPUNames[] = { +#define CSKY_FPU(NAME, KIND, VERSION) {NAME, sizeof(NAME) - 1, KIND, VERSION}, +#include "llvm/Support/CSKYTargetParser.def" +}; + +// List of canonical arch names.
+template <typename T> struct ArchNames { + const char *NameCStr; + size_t NameLength; + T ID; + uint64_t archBaseExt; + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; +const ArchNames<CSKY::ArchKind> ARCHNames[] = { +#define CSKY_ARCH(NAME, ID, ARCH_BASE_EXT) \ + {NAME, sizeof(NAME) - 1, CSKY::ArchKind::ID, ARCH_BASE_EXT}, +#include "llvm/Support/CSKYTargetParser.def" +}; + +StringRef getArchName(ArchKind AK); +StringRef getDefaultCPU(StringRef Arch); +StringRef getArchExtName(uint64_t ArchExtKind); +StringRef getArchExtFeature(StringRef ArchExt); +uint64_t getDefaultExtensions(StringRef CPU); +bool getExtensionFeatures(uint64_t Extensions, + std::vector<StringRef> &Features); + +// Information by ID +StringRef getFPUName(unsigned FPUKind); +FPUVersion getFPUVersion(unsigned FPUKind); + +bool getFPUFeatures(CSKYFPUKind Kind, std::vector<StringRef> &Features); + +// Parser +ArchKind parseArch(StringRef Arch); +ArchKind parseCPUArch(StringRef CPU); +uint64_t parseArchExt(StringRef ArchExt); +void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values); + +} // namespace CSKY + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index d6f7793d5df0..894c1f439b64 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -6,14 +6,15 @@ // //===----------------------------------------------------------------------===// // -// This file defines the isa<X>(), cast<X>(), dyn_cast<X>(), cast_or_null<X>(), -// and dyn_cast_or_null<X>() templates. +// This file defines the isa<X>(), cast<X>(), dyn_cast<X>(), +// cast_if_present<X>(), and dyn_cast_if_present<X>() templates. // //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_CASTING_H #define LLVM_SUPPORT_CASTING_H +#include "llvm/ADT/Optional.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/type_traits.h" #include <cassert> @@ -23,43 +24,47 @@ namespace llvm { //===----------------------------------------------------------------------===// -// isa Support Templates +// simplify_type //===----------------------------------------------------------------------===// -// Define a template that can be specialized by smart pointers to reflect the -// fact that they are automatically dereferenced, and are not involved with the -// template selection process... the default implementation is a noop. -// -template<typename From> struct simplify_type { +/// Define a template that can be specialized by smart pointers to reflect the +/// fact that they are automatically dereferenced, and are not involved with the +/// template selection process... the default implementation is a noop. +// TODO: rename this and/or replace it with other cast traits. +template <typename From> struct simplify_type { using SimpleType = From; // The real type this represents... // An accessor to get the real value... static SimpleType &getSimplifiedValue(From &Val) { return Val; } }; -template<typename From> struct simplify_type<const From> { +template <typename From> struct simplify_type<const From> { using NonConstSimpleType = typename simplify_type<From>::SimpleType; - using SimpleType = - typename add_const_past_pointer<NonConstSimpleType>::type; + using SimpleType = typename add_const_past_pointer<NonConstSimpleType>::type; using RetType = typename add_lvalue_reference_if_not_pointer<SimpleType>::type; - static RetType getSimplifiedValue(const From& Val) { - return simplify_type<From>::getSimplifiedValue(const_cast<From&>(Val)); + static RetType getSimplifiedValue(const From &Val) { + return simplify_type<From>::getSimplifiedValue(const_cast<From &>(Val)); } }; +// TODO: add this namespace once everyone is switched to using the new +// interface.
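[Aside: the simplify_type hunk above is the hook that lets the cast machinery see through wrapper types — a specialization reduces the wrapper to an underlying value before isa<>/cast<> dispatch on it. Below is a minimal sketch of such a specialization; demo::Node and demo::Handle are invented for illustration (in-tree, IntrusiveRefCntPtr ships essentially this specialization), and the usual classof-based checks still apply to the unwrapped Node.]

#include "llvm/Support/Casting.h"

namespace demo {
struct Node { int Kind; };
// A toy smart-pointer-like handle around Node.
struct Handle {
  Node *Ptr = nullptr;
  Node *get() const { return Ptr; }
};
} // namespace demo

namespace llvm {
// Teach the cast machinery to reduce a Handle to its Node* before any
// isa<>/cast<> logic runs.
template <> struct simplify_type<demo::Handle> {
  using SimpleType = demo::Node *;
  static SimpleType getSimplifiedValue(demo::Handle &H) { return H.get(); }
};
} // namespace llvm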
+// namespace detail { + +//===----------------------------------------------------------------------===// +// isa_impl +//===----------------------------------------------------------------------===// + // The core of the implementation of isa is here; To and From should be // the names of classes. This template can be specialized to customize the // implementation of isa<> without rewriting it from scratch. -template -struct isa_impl { - static inline bool doit(const From &Val) { - return To::classof(&Val); - } +template struct isa_impl { + static inline bool doit(const From &Val) { return To::classof(&Val); } }; -/// Always allow upcasts, and perform no dynamic check for them. +// Always allow upcasts, and perform no dynamic check for them. template struct isa_impl::value>> { static inline bool doit(const From &) { return true; } @@ -85,103 +90,78 @@ struct isa_impl_cl> { } }; -template struct isa_impl_cl { +template struct isa_impl_cl { static inline bool doit(const From *Val) { assert(Val && "isa<> used on a null pointer"); return isa_impl::doit(*Val); } }; -template struct isa_impl_cl { +template struct isa_impl_cl { static inline bool doit(const From *Val) { assert(Val && "isa<> used on a null pointer"); return isa_impl::doit(*Val); } }; -template struct isa_impl_cl { +template struct isa_impl_cl { static inline bool doit(const From *Val) { assert(Val && "isa<> used on a null pointer"); return isa_impl::doit(*Val); } }; -template struct isa_impl_cl { +template +struct isa_impl_cl { static inline bool doit(const From *Val) { assert(Val && "isa<> used on a null pointer"); return isa_impl::doit(*Val); } }; -template +template struct isa_impl_wrap { // When From != SimplifiedType, we can simplify the type some more by using // the simplify_type template. static bool doit(const From &Val) { return isa_impl_wrap::SimpleType>::doit( - simplify_type::getSimplifiedValue(Val)); + typename simplify_type::SimpleType>:: + doit(simplify_type::getSimplifiedValue(Val)); } }; -template +template struct isa_impl_wrap { // When From == SimpleType, we are as simple as we are going to get. static bool doit(const FromTy &Val) { - return isa_impl_cl::doit(Val); + return isa_impl_cl::doit(Val); } }; -// isa - Return true if the parameter to the template is an instance of one -// of the template type arguments. Used like this: -// -// if (isa(myVal)) { ... } -// if (isa(myVal)) { ... } -// -template LLVM_NODISCARD inline bool isa(const Y &Val) { - return isa_impl_wrap::SimpleType>::doit(Val); -} - -template -LLVM_NODISCARD inline bool isa(const Y &Val) { - return isa(Val) || isa(Val); -} - -// isa_and_nonnull - Functionally identical to isa, except that a null value -// is accepted. -// -template -LLVM_NODISCARD inline bool isa_and_nonnull(const Y &Val) { - if (!Val) - return false; - return isa(Val); -} - //===----------------------------------------------------------------------===// -// cast Support Templates +// cast_retty + cast_retty_impl //===----------------------------------------------------------------------===// -template struct cast_retty; +template struct cast_retty; // Calculate what type the 'cast' function should return, based on a requested // type of To and a source type of From. 
-template struct cast_retty_impl { - using ret_type = To &; // Normal case, return Ty& +template struct cast_retty_impl { + using ret_type = To &; // Normal case, return Ty& }; -template struct cast_retty_impl { +template struct cast_retty_impl { using ret_type = const To &; // Normal case, return Ty& }; -template struct cast_retty_impl { - using ret_type = To *; // Pointer arg case, return Ty* +template struct cast_retty_impl { + using ret_type = To *; // Pointer arg case, return Ty* }; -template struct cast_retty_impl { +template struct cast_retty_impl { using ret_type = const To *; // Constant pointer arg case, return const Ty* }; -template struct cast_retty_impl { +template struct cast_retty_impl { using ret_type = const To *; // Constant pointer arg case, return const Ty* }; @@ -195,187 +175,604 @@ public: using ret_type = std::unique_ptr; }; -template -struct cast_retty_wrap { +template struct cast_retty_wrap { // When the simplified type and the from type are not the same, use the type // simplifier to reduce the type, then reuse cast_retty_impl to get the // resultant type. using ret_type = typename cast_retty::ret_type; }; -template -struct cast_retty_wrap { +template struct cast_retty_wrap { // When the simplified type is equal to the from type, use it directly. - using ret_type = typename cast_retty_impl::ret_type; + using ret_type = typename cast_retty_impl::ret_type; }; -template -struct cast_retty { +template struct cast_retty { using ret_type = typename cast_retty_wrap< To, From, typename simplify_type::SimpleType>::ret_type; }; +//===----------------------------------------------------------------------===// +// cast_convert_val +//===----------------------------------------------------------------------===// + // Ensure the non-simple values are converted using the simplify_type template // that may be specialized by smart pointers... // -template struct cast_convert_val { +template struct cast_convert_val { // This is not a simple type, use the template to simplify it... - static typename cast_retty::ret_type doit(From &Val) { + static typename cast_retty::ret_type doit(const From &Val) { return cast_convert_val::SimpleType>::doit( - simplify_type::getSimplifiedValue(Val)); + typename simplify_type::SimpleType>:: + doit(simplify_type::getSimplifiedValue(const_cast(Val))); } }; -template struct cast_convert_val { - // This _is_ a simple type, just cast it. +template struct cast_convert_val { + // If it's a reference, switch to a pointer to do the cast and then deref it. static typename cast_retty::ret_type doit(const FromTy &Val) { - typename cast_retty::ret_type Res2 - = (typename cast_retty::ret_type)const_cast(Val); - return Res2; + return *(std::remove_reference_t::ret_type> + *)&const_cast(Val); + } +}; + +template +struct cast_convert_val { + // If it's a pointer, we can use c-style casting directly. + static typename cast_retty::ret_type doit(const FromTy *Val) { + return (typename cast_retty::ret_type) const_cast( + Val); } }; +//===----------------------------------------------------------------------===// +// is_simple_type +//===----------------------------------------------------------------------===// + template struct is_simple_type { static const bool value = std::is_same::SimpleType>::value; }; -// cast - Return the argument parameter cast to the specified type. This -// casting operator asserts that the type is correct, so it does not return null -// on failure. It does not allow a null argument (use cast_or_null for that). 
-// It is typically used like this: -// -// cast(myVal)->getParent() -// -template -inline std::enable_if_t::value, - typename cast_retty::ret_type> -cast(const Y &Val) { - assert(isa(Val) && "cast() argument of incompatible type!"); - return cast_convert_val< - X, const Y, typename simplify_type::SimpleType>::doit(Val); +// } // namespace detail + +//===----------------------------------------------------------------------===// +// CastIsPossible +//===----------------------------------------------------------------------===// + +/// This struct provides a way to check if a given cast is possible. It provides +/// a static function called isPossible that is used to check if a cast can be +/// performed. It should be overridden like this: +/// +/// template<> struct CastIsPossible { +/// static inline bool isPossible(const bar &b) { +/// return bar.isFoo(); +/// } +/// }; +template +struct CastIsPossible { + static inline bool isPossible(const From &f) { + return isa_impl_wrap< + To, const From, + typename simplify_type::SimpleType>::doit(f); + } +}; + +// Needed for optional unwrapping. This could be implemented with isa_impl, but +// we want to implement things in the new method and move old implementations +// over. In fact, some of the isa_impl templates should be moved over to +// CastIsPossible. +template +struct CastIsPossible> { + static inline bool isPossible(const Optional &f) { + assert(f.hasValue() && "CastIsPossible::isPossible called on a nullopt!"); + return isa_impl_wrap< + To, const From, + typename simplify_type::SimpleType>::doit(*f); + } +}; + +/// Upcasting (from derived to base) and casting from a type to itself should +/// always be possible. +template +struct CastIsPossible::value>> { + static inline bool isPossible(const From &f) { return true; } +}; + +//===----------------------------------------------------------------------===// +// Cast traits +//===----------------------------------------------------------------------===// + +/// All of these cast traits are meant to be implementations for useful casts +/// that users may want to use that are outside the standard behavior. An +/// example of how to use a special cast called `CastTrait` is: +/// +/// template<> struct CastInfo : public CastTrait {}; +/// +/// Essentially, if your use case falls directly into one of the use cases +/// supported by a given cast trait, simply inherit your special CastInfo +/// directly from one of these to avoid having to reimplement the boilerplate +/// `isPossible/castFailed/doCast/doCastIfPossible`. A cast trait can also +/// provide a subset of those functions. + +/// This cast trait just provides castFailed for the specified `To` type to make +/// CastInfo specializations more declarative. In order to use this, the target +/// result type must be `To` and `To` must be constructible from `nullptr`. +template struct NullableValueCastFailed { + static To castFailed() { return To(nullptr); } +}; + +/// This cast trait just provides the default implementation of doCastIfPossible +/// to make CastInfo specializations more declarative. The `Derived` template +/// parameter *must* be provided for forwarding castFailed and doCast. +template +struct DefaultDoCastIfPossible { + static To doCastIfPossible(From f) { + if (!Derived::isPossible(f)) + return Derived::castFailed(); + return Derived::doCast(f); + } +}; + +namespace detail { +/// A helper to derive the type to use with `Self` for cast traits, when the +/// provided CRTP derived type is allowed to be void. 
+template +using SelfType = std::conditional_t::value, + Default, OptionalDerived>; +} // namespace detail + +/// This cast trait provides casting for the specific case of casting to a +/// value-typed object from a pointer-typed object. Note that `To` must be +/// nullable/constructible from a pointer to `From` to use this cast. +template +struct ValueFromPointerCast + : public CastIsPossible, + public NullableValueCastFailed, + public DefaultDoCastIfPossible< + To, From *, + detail::SelfType>> { + static inline To doCast(From *f) { return To(f); } +}; + +/// This cast trait provides std::unique_ptr casting. It has the semantics of +/// moving the contents of the input unique_ptr into the output unique_ptr +/// during the cast. It's also a good example of how to implement a move-only +/// cast. +template +struct UniquePtrCast : public CastIsPossible { + using Self = detail::SelfType>; + using CastResultType = std::unique_ptr< + std::remove_reference_t::ret_type>>; + + static inline CastResultType doCast(std::unique_ptr &&f) { + return CastResultType((typename CastResultType::element_type *)f.release()); + } + + static inline CastResultType castFailed() { return CastResultType(nullptr); } + + static inline CastResultType doCastIfPossible(std::unique_ptr &&f) { + if (!Self::isPossible(f)) + return castFailed(); + return doCast(f); + } +}; + +/// This cast trait provides Optional casting. This means that if you have a +/// value type, you can cast it to another value type and have dyn_cast return +/// an Optional. +template +struct OptionalValueCast + : public CastIsPossible, + public DefaultDoCastIfPossible< + Optional, From, + detail::SelfType>> { + static inline Optional castFailed() { return Optional{}; } + + static inline Optional doCast(const From &f) { return To(f); } +}; + +/// Provides a cast trait that strips `const` from types to make it easier to +/// implement a const-version of a non-const cast. It just removes boilerplate +/// and reduces the amount of code you as the user need to implement. You can +/// use it like this: +/// +/// template<> struct CastInfo { +/// ...verbose implementation... +/// }; +/// +/// template<> struct CastInfo : public +/// ConstStrippingForwardingCast> {}; +/// +template +struct ConstStrippingForwardingCast { + // Remove the pointer if it exists, then we can get rid of consts/volatiles. + using DecayedFrom = std::remove_cv_t>; + // Now if it's a pointer, add it back. Otherwise, we want a ref. + using NonConstFrom = std::conditional_t::value, + DecayedFrom *, DecayedFrom &>; + + static inline bool isPossible(const From &f) { + return ForwardTo::isPossible(const_cast(f)); + } + + static inline decltype(auto) castFailed() { return ForwardTo::castFailed(); } + + static inline decltype(auto) doCast(const From &f) { + return ForwardTo::doCast(const_cast(f)); + } + + static inline decltype(auto) doCastIfPossible(const From &f) { + return ForwardTo::doCastIfPossible(const_cast(f)); + } +}; + +/// Provides a cast trait that uses a defined pointer to pointer cast as a base +/// for reference-to-reference casts. Note that it does not provide castFailed +/// and doCastIfPossible because a pointer-to-pointer cast would likely just +/// return `nullptr` which could cause nullptr dereference. You can use it like +/// this: +/// +/// template <> struct CastInfo { ... verbose implementation... 
}; +/// +/// template <> +/// struct CastInfo<foo, bar> +/// : public ForwardToPointerCast<foo, bar, CastInfo<foo, bar *>> {}; +/// +template <typename To, typename From, typename ForwardTo> +struct ForwardToPointerCast { + static inline bool isPossible(const From &f) { + return ForwardTo::isPossible(&f); + } + + static inline decltype(auto) doCast(const From &f) { + return *ForwardTo::doCast(&f); + } +}; + +//===----------------------------------------------------------------------===// +// CastInfo +//===----------------------------------------------------------------------===// + +/// This struct provides a method for customizing the way a cast is performed. +/// It inherits from CastIsPossible, to support the case of declaring many +/// CastIsPossible specializations without having to specialize the full +/// CastInfo. +/// +/// In order to specialize different behaviors, specify different functions in +/// your CastInfo specialization. +/// For isa<> customization, provide: +/// +/// `static bool isPossible(const From &f)` +/// +/// For cast<> customization, provide: +/// +/// `static To doCast(const From &f)` +/// +/// For dyn_cast<> and the *_if_present<> variants' customization, provide: +/// +/// `static To castFailed()` and `static To doCastIfPossible(const From &f)` +/// +/// Your specialization might look something like this: +/// +/// template<> struct CastInfo<foo, bar> : public CastIsPossible<foo, bar> { +/// static inline foo doCast(const bar &b) { +/// return foo(const_cast<bar &>(b)); +/// } +/// static inline foo castFailed() { return foo(); } +/// static inline foo doCastIfPossible(const bar &b) { +/// if (!CastInfo<foo, bar>::isPossible(b)) +/// return castFailed(); +/// return doCast(b); +/// } +/// }; + +// The default implementations of CastInfo don't use cast traits for now because +// we need to specify types all over the place due to the current expected +// casting behavior and the way cast_retty works. New use cases can and should +// take advantage of the cast traits whenever possible! + +template <typename To, typename From, typename Enable = void> +struct CastInfo : public CastIsPossible<To, From> { + using Self = CastInfo<To, From, Enable>; + + using CastReturnType = typename cast_retty<To, From>::ret_type; + + static inline CastReturnType doCast(const From &f) { + return cast_convert_val< + To, From, + typename simplify_type<From>::SimpleType>::doit(const_cast<From &>(f)); + } + + // This assumes that you can construct the cast return type from `nullptr`. + // This is largely to support legacy use cases - if you don't want this + // behavior you should specialize CastInfo for your use case. + static inline CastReturnType castFailed() { return CastReturnType(nullptr); } + + static inline CastReturnType doCastIfPossible(const From &f) { + if (!Self::isPossible(f)) + return castFailed(); + return doCast(f); + } +}; + +/// This struct provides an overload for CastInfo where From has simplify_type +/// defined. This simply forwards to the appropriate CastInfo with the +/// simplified type/value, so you don't have to implement both.
+template +struct CastInfo::value>> { + using Self = CastInfo; + using SimpleFrom = typename simplify_type::SimpleType; + using SimplifiedSelf = CastInfo; + + static inline bool isPossible(From &f) { + return SimplifiedSelf::isPossible( + simplify_type::getSimplifiedValue(f)); + } + + static inline decltype(auto) doCast(From &f) { + return SimplifiedSelf::doCast(simplify_type::getSimplifiedValue(f)); + } + + static inline decltype(auto) castFailed() { + return SimplifiedSelf::castFailed(); + } + + static inline decltype(auto) doCastIfPossible(From &f) { + return SimplifiedSelf::doCastIfPossible( + simplify_type::getSimplifiedValue(f)); + } +}; + +//===----------------------------------------------------------------------===// +// Pre-specialized CastInfo +//===----------------------------------------------------------------------===// + +/// Provide a CastInfo specialized for std::unique_ptr. +template +struct CastInfo> : public UniquePtrCast {}; + +/// Provide a CastInfo specialized for Optional. It's assumed that if the +/// input is Optional that the output can be Optional. If that's not +/// the case, specialize CastInfo for your use case. +template +struct CastInfo> : public OptionalValueCast {}; + +/// isa - Return true if the parameter to the template is an instance of one +/// of the template type arguments. Used like this: +/// +/// if (isa(myVal)) { ... } +/// if (isa(myVal)) { ... } +template +LLVM_NODISCARD inline bool isa(const From &Val) { + return CastInfo::isPossible(Val); } -template -inline typename cast_retty::ret_type cast(Y &Val) { - assert(isa(Val) && "cast() argument of incompatible type!"); - return cast_convert_val::SimpleType>::doit(Val); +template +LLVM_NODISCARD inline bool isa(const From &Val) { + return isa(Val) || isa(Val); } -template -inline typename cast_retty::ret_type cast(Y *Val) { - assert(isa(Val) && "cast() argument of incompatible type!"); - return cast_convert_val::SimpleType>::doit(Val); +/// cast - Return the argument parameter cast to the specified type. This +/// casting operator asserts that the type is correct, so it does not return +/// null on failure. It does not allow a null argument (use cast_if_present for +/// that). It is typically used like this: +/// +/// cast(myVal)->getParent() + +template +LLVM_NODISCARD inline decltype(auto) cast(const From &Val) { + assert(isa(Val) && "cast() argument of incompatible type!"); + return CastInfo::doCast(Val); } -template -inline typename cast_retty>::ret_type -cast(std::unique_ptr &&Val) { - assert(isa(Val.get()) && "cast() argument of incompatible type!"); - using ret_type = typename cast_retty>::ret_type; - return ret_type( - cast_convert_val::SimpleType>::doit( - Val.release())); +template +LLVM_NODISCARD inline decltype(auto) cast(From &Val) { + assert(isa(Val) && "cast() argument of incompatible type!"); + return CastInfo::doCast(Val); } -// cast_or_null - Functionally identical to cast, except that a null value is -// accepted. 
-// -template -LLVM_NODISCARD inline std::enable_if_t< - !is_simple_type::value, typename cast_retty::ret_type> -cast_or_null(const Y &Val) { - if (!Val) - return nullptr; - assert(isa(Val) && "cast_or_null() argument of incompatible type!"); - return cast(Val); +template +LLVM_NODISCARD inline decltype(auto) cast(From *Val) { + assert(isa(Val) && "cast() argument of incompatible type!"); + return CastInfo::doCast(Val); } -template -LLVM_NODISCARD inline std::enable_if_t::value, - typename cast_retty::ret_type> -cast_or_null(Y &Val) { - if (!Val) - return nullptr; - assert(isa(Val) && "cast_or_null() argument of incompatible type!"); - return cast(Val); +template +LLVM_NODISCARD inline decltype(auto) cast(std::unique_ptr &&Val) { + assert(isa(Val) && "cast() argument of incompatible type!"); + return CastInfo>::doCast(std::move(Val)); } -template -LLVM_NODISCARD inline typename cast_retty::ret_type -cast_or_null(Y *Val) { - if (!Val) return nullptr; - assert(isa(Val) && "cast_or_null() argument of incompatible type!"); - return cast(Val); +/// dyn_cast - Return the argument parameter cast to the specified type. This +/// casting operator returns null if the argument is of the wrong type, so it +/// can be used to test for a type as well as cast if successful. The value +/// passed in must be present, if not, use dyn_cast_if_present. This should be +/// used in the context of an if statement like this: +/// +/// if (const Instruction *I = dyn_cast(myVal)) { ... } + +template +LLVM_NODISCARD inline decltype(auto) dyn_cast(const From &Val) { + return CastInfo::doCastIfPossible(Val); } -template -inline typename cast_retty>::ret_type -cast_or_null(std::unique_ptr &&Val) { - if (!Val) - return nullptr; - return cast(std::move(Val)); +template +LLVM_NODISCARD inline decltype(auto) dyn_cast(From &Val) { + return CastInfo::doCastIfPossible(Val); } -// dyn_cast - Return the argument parameter cast to the specified type. This -// casting operator returns null if the argument is of the wrong type, so it can -// be used to test for a type as well as cast if successful. This should be -// used in the context of an if statement like this: -// -// if (const Instruction *I = dyn_cast(myVal)) { ... } -// +template +LLVM_NODISCARD inline decltype(auto) dyn_cast(From *Val) { + return CastInfo::doCastIfPossible(Val); +} -template -LLVM_NODISCARD inline std::enable_if_t< - !is_simple_type::value, typename cast_retty::ret_type> -dyn_cast(const Y &Val) { - return isa(Val) ? cast(Val) : nullptr; +template +LLVM_NODISCARD inline decltype(auto) dyn_cast(std::unique_ptr &&Val) { + return CastInfo>::doCastIfPossible(std::move(Val)); } -template -LLVM_NODISCARD inline typename cast_retty::ret_type dyn_cast(Y &Val) { - return isa(Val) ? cast(Val) : nullptr; +//===----------------------------------------------------------------------===// +// ValueIsPresent +//===----------------------------------------------------------------------===// + +template +constexpr bool IsNullable = std::is_pointer::value || + std::is_constructible::value; + +/// ValueIsPresent provides a way to check if a value is, well, present. For +/// pointers, this is the equivalent of checking against nullptr, for +/// Optionals this is the equivalent of checking hasValue(). It also +/// provides a method for unwrapping a value (think dereferencing a +/// pointer). + +// Generic values can't *not* be present. 
+template struct ValueIsPresent { + using UnwrappedType = T; + static inline bool isPresent(const T &t) { return true; } + static inline decltype(auto) unwrapValue(T &t) { return t; } +}; + +// Optional provides its own way to check if something is present. +template struct ValueIsPresent> { + using UnwrappedType = T; + static inline bool isPresent(const Optional &t) { return t.has_value(); } + static inline decltype(auto) unwrapValue(Optional &t) { + return t.getValue(); + } +}; + +// If something is "nullable" then we just compare it to nullptr to see if it +// exists. +template +struct ValueIsPresent>> { + using UnwrappedType = T; + static inline bool isPresent(const T &t) { return t != nullptr; } + static inline decltype(auto) unwrapValue(T &t) { return t; } +}; + +namespace detail { +// Convenience function we can use to check if a value is present. Because of +// simplify_type, we have to call it on the simplified type for now. +template inline bool isPresent(const T &t) { + return ValueIsPresent::SimpleType>::isPresent( + simplify_type::getSimplifiedValue(const_cast(t))); } -template -LLVM_NODISCARD inline typename cast_retty::ret_type dyn_cast(Y *Val) { - return isa(Val) ? cast(Val) : nullptr; +// Convenience function we can use to unwrap a value. +template inline decltype(auto) unwrapValue(T &t) { + return ValueIsPresent::unwrapValue(t); } +} // namespace detail -// dyn_cast_or_null - Functionally identical to dyn_cast, except that a null -// value is accepted. -// -template -LLVM_NODISCARD inline std::enable_if_t< - !is_simple_type::value, typename cast_retty::ret_type> -dyn_cast_or_null(const Y &Val) { - return (Val && isa(Val)) ? cast(Val) : nullptr; +/// isa_and_present - Functionally identical to isa, except that a null value +/// is accepted. +template +LLVM_NODISCARD inline bool isa_and_present(const Y &Val) { + if (!detail::isPresent(Val)) + return false; + return isa(Val); } +template +LLVM_NODISCARD inline bool isa_and_nonnull(const Y &Val) { + return isa_and_present(Val); +} + +/// cast_if_present - Functionally identical to cast, except that a null +/// value is accepted. template -LLVM_NODISCARD inline std::enable_if_t::value, - typename cast_retty::ret_type> -dyn_cast_or_null(Y &Val) { - return (Val && isa(Val)) ? cast(Val) : nullptr; +LLVM_NODISCARD inline auto cast_if_present(const Y &Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + assert(isa(Val) && "cast_if_present() argument of incompatible type!"); + return cast(detail::unwrapValue(Val)); +} + +template LLVM_NODISCARD inline auto cast_if_present(Y &Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + assert(isa(Val) && "cast_if_present() argument of incompatible type!"); + return cast(detail::unwrapValue(Val)); +} + +template LLVM_NODISCARD inline auto cast_if_present(Y *Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + assert(isa(Val) && "cast_if_present() argument of incompatible type!"); + return cast(detail::unwrapValue(Val)); } template -LLVM_NODISCARD inline typename cast_retty::ret_type -dyn_cast_or_null(Y *Val) { - return (Val && isa(Val)) ? cast(Val) : nullptr; +LLVM_NODISCARD inline auto cast_if_present(std::unique_ptr &&Val) { + if (!detail::isPresent(Val)) + return UniquePtrCast::castFailed(); + return UniquePtrCast::doCast(std::move(Val)); +} + +// Provide a forwarding from cast_or_null to cast_if_present for current +// users. This is deprecated and will be removed in a future patch, use +// cast_if_present instead. 
+template auto cast_or_null(const Y &Val) { + return cast_if_present(Val); +} + +template auto cast_or_null(Y &Val) { + return cast_if_present(Val); +} + +template auto cast_or_null(Y *Val) { + return cast_if_present(Val); +} + +template auto cast_or_null(std::unique_ptr &&Val) { + return cast_if_present(std::move(Val)); +} + +/// dyn_cast_if_present - Functionally identical to dyn_cast, except that a +/// null (or none in the case of optionals) value is accepted. +template auto dyn_cast_if_present(const Y &Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + return CastInfo::doCastIfPossible(detail::unwrapValue(Val)); +} + +template auto dyn_cast_if_present(Y &Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + return CastInfo::doCastIfPossible(detail::unwrapValue(Val)); +} + +template auto dyn_cast_if_present(Y *Val) { + if (!detail::isPresent(Val)) + return CastInfo::castFailed(); + return CastInfo::doCastIfPossible(detail::unwrapValue(Val)); +} + +// Forwards to dyn_cast_if_present to avoid breaking current users. This is +// deprecated and will be removed in a future patch, use +// cast_if_present instead. +template auto dyn_cast_or_null(const Y &Val) { + return dyn_cast_if_present(Val); +} + +template auto dyn_cast_or_null(Y &Val) { + return dyn_cast_if_present(Val); +} + +template auto dyn_cast_or_null(Y *Val) { + return dyn_cast_if_present(Val); } -// unique_dyn_cast - Given a unique_ptr, try to return a unique_ptr, -// taking ownership of the input pointer iff isa(Val) is true. If the -// cast is successful, From refers to nullptr on exit and the casted value -// is returned. If the cast is unsuccessful, the function returns nullptr -// and From is unchanged. +/// unique_dyn_cast - Given a unique_ptr, try to return a unique_ptr, +/// taking ownership of the input pointer iff isa(Val) is true. If the +/// cast is successful, From refers to nullptr on exit and the casted value +/// is returned. If the cast is unsuccessful, the function returns nullptr +/// and From is unchanged. template -LLVM_NODISCARD inline auto unique_dyn_cast(std::unique_ptr &Val) - -> decltype(cast(Val)) { +LLVM_NODISCARD inline typename CastInfo>::CastResultType +unique_dyn_cast(std::unique_ptr &Val) { if (!isa(Val)) return nullptr; return cast(std::move(Val)); @@ -386,11 +783,11 @@ LLVM_NODISCARD inline auto unique_dyn_cast(std::unique_ptr &&Val) { return unique_dyn_cast(Val); } -// dyn_cast_or_null - Functionally identical to unique_dyn_cast, except that -// a null value is accepted. +// unique_dyn_cast_or_null - Functionally identical to unique_dyn_cast, +// except that a null value is accepted. template -LLVM_NODISCARD inline auto unique_dyn_cast_or_null(std::unique_ptr &Val) - -> decltype(cast(Val)) { +LLVM_NODISCARD inline typename CastInfo>::CastResultType +unique_dyn_cast_or_null(std::unique_ptr &Val) { if (!Val) return nullptr; return unique_dyn_cast(Val); diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h index 9e66d84e185d..71d0ddbfe05e 100644 --- a/llvm/include/llvm/Support/CodeGen.h +++ b/llvm/include/llvm/Support/CodeGen.h @@ -69,6 +69,40 @@ namespace llvm { // Specify what functions should keep the frame pointer. enum class FramePointerKind { None, NonLeaf, All }; -} // end llvm namespace + // Specify what type of zeroing callee-used registers. 
+ namespace ZeroCallUsedRegs { + const unsigned ONLY_USED = 1U << 1; + const unsigned ONLY_GPR = 1U << 2; + const unsigned ONLY_ARG = 1U << 3; + + enum class ZeroCallUsedRegsKind : unsigned int { + // Don't zero any call-used regs. + Skip = 1U << 0, + // Only zeros call-used GPRs used in the fn and pass args. + UsedGPRArg = ONLY_USED | ONLY_GPR | ONLY_ARG, + // Only zeros call-used GPRs used in the fn. + UsedGPR = ONLY_USED | ONLY_GPR, + // Only zeros call-used regs used in the fn and pass args. + UsedArg = ONLY_USED | ONLY_ARG, + // Only zeros call-used regs used in the fn. + Used = ONLY_USED, + // Zeros all call-used GPRs that pass args. + AllGPRArg = ONLY_GPR | ONLY_ARG, + // Zeros all call-used GPRs. + AllGPR = ONLY_GPR, + // Zeros all call-used regs that pass args. + AllArg = ONLY_ARG, + // Zeros all call-used regs. + All = 0, + }; + } // namespace ZeroCallUsedRegs + + enum class UWTableKind { + None = 0, ///< No unwind table requested + Sync = 1, ///< "Synchronous" unwind tables + Async = 2, ///< "Asynchronous" unwind tables (instr precise) + Default = 2, + }; + } // namespace llvm #endif diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index c8e29ac42559..6461164fceff 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -49,13 +49,12 @@ class FileSystem; class StringSaver; -/// cl Namespace - This namespace contains all of the command line option -/// processing machinery. It is intentionally a short name to make qualified -/// usage concise. +/// This namespace contains all of the command line option processing machinery. +/// It is intentionally a short name to make qualified usage concise. namespace cl { //===----------------------------------------------------------------------===// -// ParseCommandLineOptions - Command line option processing entry point. +// Command line option processing entry point. // // Returns true on success. Otherwise, this will print the error message to // stderr and exit if \p Errs is not set (nullptr by default), or print the @@ -78,22 +77,19 @@ bool ParseCommandLineOptions(int argc, const char *const *argv, using VersionPrinterTy = std::function; ///===---------------------------------------------------------------------===// -/// SetVersionPrinter - Override the default (LLVM specific) version printer -/// used to print out the version when --version is given -/// on the command line. This allows other systems using the -/// CommandLine utilities to print their own version string. +/// Override the default (LLVM specific) version printer used to print out the +/// version when --version is given on the command line. This allows other +/// systems using the CommandLine utilities to print their own version string. void SetVersionPrinter(VersionPrinterTy func); ///===---------------------------------------------------------------------===// -/// AddExtraVersionPrinter - Add an extra printer to use in addition to the -/// default one. This can be called multiple times, -/// and each time it adds a new function to the list -/// which will be called after the basic LLVM version -/// printing is complete. Each can then add additional -/// information specific to the tool. +/// Add an extra printer to use in addition to the default one. This can be +/// called multiple times, and each time it adds a new function to the list +/// which will be called after the basic LLVM version printing is complete. 
+/// Each can then add additional information specific to the tool. void AddExtraVersionPrinter(VersionPrinterTy func); -// PrintOptionValues - Print option values. +// Print option values. // With -print-options print the difference between option values and defaults. // With -print-all-options print all option values. // (Currently not perfect, but best-effort.) @@ -121,9 +117,9 @@ enum NumOccurrencesFlag { // Flags for the number of occurrences allowed Required = 0x02, // One occurrence required OneOrMore = 0x03, // One or more occurrences required - // ConsumeAfter - Indicates that this option is fed anything that follows the - // last positional argument required by the application (it is an error if - // there are zero positional arguments, and a ConsumeAfter option is used). + // Indicates that this option is fed anything that follows the last positional + // argument required by the application (it is an error if there are zero + // positional arguments, and a ConsumeAfter option is used). // Thus, for example, all arguments to LLI are processed until a filename is // found. Once a filename is found, all of the succeeding arguments are // passed, unprocessed, to the ConsumeAfter option. @@ -144,8 +140,8 @@ enum OptionHidden { // Control whether -help shows this option ReallyHidden = 0x02 // Neither -help nor -help-hidden show this arg }; -// Formatting flags - This controls special features that the option might have -// that cause it to be parsed differently... +// This controls special features that the option might have that cause it to be +// parsed differently... // // Prefix - This option allows arguments that are otherwise unrecognized to be // matched by options that are a prefix of the actual value. This is useful for @@ -170,7 +166,7 @@ enum MiscFlags { // Miscellaneous flags to adjust argument PositionalEatsArgs = 0x02, // Should this positional cl::list eat -args? Sink = 0x04, // Should this cl::list eat all unknown options? - // Grouping - Can this option group with other options? + // Can this option group with other options? // If this is enabled, multiple letter options are allowed to bunch together // with only a single hyphen for the whole group. This allows emulation // of the behavior that ls uses for example: ls -la === ls -l -a @@ -181,7 +177,6 @@ enum MiscFlags { // Miscellaneous flags to adjust argument }; //===----------------------------------------------------------------------===// -// Option Category class // class OptionCategory { private: @@ -205,7 +200,6 @@ public: OptionCategory &getGeneralCategory(); //===----------------------------------------------------------------------===// -// SubCommand class // class SubCommand { private: @@ -244,14 +238,13 @@ extern ManagedStatic<SubCommand> TopLevelSubCommand; extern ManagedStatic<SubCommand> AllSubCommands; //===----------------------------------------------------------------------===// -// Option Base class // class Option { friend class alias; - // handleOccurrences - Overriden by subclasses to handle the value passed into - // an argument. Should return true if there was an error processing the - // argument and the program should exit. + // Overridden by subclasses to handle the value passed into an argument. Should + // return true if there was an error processing the argument and the program + // should exit.
// virtual bool handleOccurrence(unsigned pos, StringRef ArgName, StringRef Arg) = 0; @@ -305,7 +298,7 @@ public: inline unsigned getPosition() const { return Position; } inline unsigned getNumAdditionalVals() const { return AdditionalVals; } - // hasArgStr - Return true if the argstr != "" + // Return true if the argstr != "" bool hasArgStr() const { return !ArgStr.empty(); } bool isPositional() const { return getFormattingFlag() == cl::Positional; } bool isSink() const { return getMiscFlags() & cl::Sink; } @@ -348,7 +341,7 @@ protected: public: virtual ~Option() = default; - // addArgument - Register this argument with the commandline system. + // Register this argument with the commandline system. // void addArgument(); @@ -361,8 +354,8 @@ public: // Return the width of the option tag for printing... virtual size_t getOptionWidth() const = 0; - // printOptionInfo - Print out information about this option. The - // to-be-maintained width is specified. + // Print out information about this option. The to-be-maintained width is + // specified. // virtual void printOptionInfo(size_t GlobalWidth) const = 0; @@ -388,7 +381,7 @@ public: virtual void getExtraOptionNames(SmallVectorImpl &) {} - // addOccurrence - Wrapper around handleOccurrence that enforces Flags. + // Wrapper around handleOccurrence that enforces Flags. // virtual bool addOccurrence(unsigned pos, StringRef ArgName, StringRef Value, bool MultiArg = false); @@ -408,7 +401,7 @@ public: // command line option parsers... // -// desc - Modifier to set the description shown in the -help output... +// Modifier to set the description shown in the -help output... struct desc { StringRef Desc; @@ -417,8 +410,7 @@ struct desc { void apply(Option &O) const { O.setDescription(Desc); } }; -// value_desc - Modifier to set the value description shown in the -help -// output... +// Modifier to set the value description shown in the -help output... struct value_desc { StringRef Desc; @@ -427,10 +419,9 @@ struct value_desc { void apply(Option &O) const { O.setValueStr(Desc); } }; -// init - Specify a default (initial) value for the command line argument, if -// the default constructor for the argument type does not give you what you -// want. This is only valid on "opt" arguments, not on "list" arguments. -// +// Specify a default (initial) value for the command line argument, if the +// default constructor for the argument type does not give you what you want. +// This is only valid on "opt" arguments, not on "list" arguments. template struct initializer { const Ty &Init; initializer(const Ty &Val) : Init(Val) {} @@ -442,10 +433,9 @@ template initializer init(const Ty &Val) { return initializer(Val); } -// location - Allow the user to specify which external variable they want to -// store the results of the command line argument processing into, if they don't -// want to store it in the option itself. -// +// Allow the user to specify which external variable they want to store the +// results of the command line argument processing into, if they don't want to +// store it in the option itself. template struct LocationClass { Ty &Loc; @@ -458,8 +448,7 @@ template LocationClass location(Ty &L) { return LocationClass(L); } -// cat - Specifiy the Option category for the command line argument to belong -// to. +// Specify the Option category for the command line argument to belong to. 
struct cat { OptionCategory &Category; @@ -468,7 +457,7 @@ struct cat { template void apply(Opt &O) const { O.addCategory(Category); } }; -// sub - Specify the subcommand that this option belongs to. +// Specify the subcommand that this option belongs to. struct sub { SubCommand ⋐ @@ -514,7 +503,6 @@ callback(F CB) { } //===----------------------------------------------------------------------===// -// OptionValue class // Support value comparison outside the template. struct GenericOptionValue { @@ -672,8 +660,8 @@ struct OptionEnumValue { #define clEnumValN(ENUMVAL, FLAGNAME, DESC) \ llvm::cl::OptionEnumValue { FLAGNAME, int(ENUMVAL), DESC } -// values - For custom data types, allow specifying a group of values together -// as the values that go into the mapping that the option handler uses. +// For custom data types, allow specifying a group of values together as the +// values that go into the mapping that the option handler uses. // class ValuesClass { // Use a vector instead of a map, because the lists should be short, @@ -699,16 +687,16 @@ template ValuesClass values(OptsTy... Options) { } //===----------------------------------------------------------------------===// -// parser class - Parameterizable parser for different data types. By default, -// known data types (string, int, bool) have specialized parsers, that do what -// you would expect. The default parser, used for data types that are not -// built-in, uses a mapping table to map specific options to values, which is -// used, among other things, to handle enum types. +// Parameterizable parser for different data types. By default, known data types +// (string, int, bool) have specialized parsers, that do what you would expect. +// The default parser, used for data types that are not built-in, uses a mapping +// table to map specific options to values, which is used, among other things, +// to handle enum types. //-------------------------------------------------- -// generic_parser_base - This class holds all the non-generic code that we do -// not need replicated for every instance of the generic parser. This also -// allows us to put stuff into CommandLine.cpp +// This class holds all the non-generic code that we do not need replicated for +// every instance of the generic parser. This also allows us to put stuff into +// CommandLine.cpp // class generic_parser_base { protected: @@ -726,15 +714,15 @@ public: virtual ~generic_parser_base() = default; // Base class should have virtual-destructor - // getNumOptions - Virtual function implemented by generic subclass to - // indicate how many entries are in Values. + // Virtual function implemented by generic subclass to indicate how many + // entries are in Values. // virtual unsigned getNumOptions() const = 0; - // getOption - Return option name N. + // Return option name N. virtual StringRef getOption(unsigned N) const = 0; - // getDescription - Return description N + // Return description N virtual StringRef getDescription(unsigned N) const = 0; // Return the width of the option tag for printing... @@ -742,8 +730,8 @@ public: virtual const GenericOptionValue &getOptionValue(unsigned N) const = 0; - // printOptionInfo - Print out information about this option. The - // to-be-maintained width is specified. + // Print out information about this option. The to-be-maintained width is + // specified. 
//
virtual void printOptionInfo(const Option &O, size_t GlobalWidth) const;

@@ -751,7 +739,7 @@ public:
const GenericOptionValue &Default, size_t GlobalWidth) const;

- // printOptionDiff - print the value of an option and it's default.
+ // Print the value of an option and its default.
//
// Template definition ensures that the option and default have the same
// DataType (via the same AnyOptionValue).
@@ -791,7 +779,7 @@ public:
return ValueDisallowed;
}

- // findOption - Return the option number corresponding to the specified
+ // Return the option number corresponding to the specified
// argument string. If the option is not found, getNumOptions() is returned.
//
unsigned findOption(StringRef Name);
@@ -829,12 +817,12 @@ public:
return Values[N].HelpStr;
}

- // getOptionValue - Return the value of option name N.
+ // Return the value of option name N.
const GenericOptionValue &getOptionValue(unsigned N) const override {
return Values[N].V;
}

- // parse - Return true on error.
+ // Return true on error.
bool parse(Option &O, StringRef ArgName, StringRef Arg, DataType &V) {
StringRef ArgVal;
if (Owner.hasArgStr())
@@ -851,7 +839,7 @@ public:
return O.error("Cannot find option named '" + ArgVal + "'!");
}

- /// addLiteralOption - Add an entry to the mapping table.
+ /// Add an entry to the mapping table.
///
template <class DT>
void addLiteralOption(StringRef Name, const DT &V, StringRef HelpStr) {
@@ -861,7 +849,7 @@ public:
AddLiteralOption(Owner, Name);
}

- /// removeLiteralOption - Remove the specified option.
+ /// Remove the specified option.
///
void removeLiteralOption(StringRef Name) {
unsigned N = findOption(Name);
@@ -871,7 +859,7 @@ public:
};

//--------------------------------------------------
-// basic_parser - Super class of parsers to provide boilerplate code
+// Super class of parsers to provide boilerplate code
//
class basic_parser_impl { // non-template implementation of basic_parser
public:
@@ -890,16 +878,15 @@ public:
// Return the width of the option tag for printing...
size_t getOptionWidth(const Option &O) const;

- // printOptionInfo - Print out information about this option. The
- // to-be-maintained width is specified.
+ // Print out information about this option. The to-be-maintained width is
+ // specified.
//
void printOptionInfo(const Option &O, size_t GlobalWidth) const;

- // printOptionNoValue - Print a placeholder for options that don't yet support
- // printOptionDiff().
+ // Print a placeholder for options that don't yet support printOptionDiff().
void printOptionNoValue(const Option &O, size_t GlobalWidth) const;

- // getValueName - Overload in subclass to provide a better default value.
+ // Overload in subclass to provide a better default value.
virtual StringRef getValueName() const { return "value"; }

// An out-of-line virtual method to provide a 'home' for this class.
@@ -910,8 +897,8 @@ protected:
void printOptionName(const Option &O, size_t GlobalWidth) const;
};

-// basic_parser - The real basic parser is just a template wrapper that provides
-// a typedef for the provided data type.
+// The real basic parser is just a template wrapper that provides a typedef for
+// the provided data type.
//
template <class DataType> class basic_parser : public basic_parser_impl {
public:
@@ -922,8 +909,6 @@ public:
};

//--------------------------------------------------
-// parser<bool>
-//

extern template class basic_parser<bool>;

@@ -931,7 +916,7 @@ template <> class parser<bool> : public basic_parser<bool> {
public:
parser(Option &O) : basic_parser(O) {}

- // parse - Return true on error.
+ // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, bool &Val); void initialize() {} @@ -940,7 +925,7 @@ public: return ValueOptional; } - // getValueName - Do not print = at all. + // Do not print = at all. StringRef getValueName() const override { return StringRef(); } void printOptionDiff(const Option &O, bool V, OptVal Default, @@ -951,7 +936,6 @@ public: }; //-------------------------------------------------- -// parser extern template class basic_parser; @@ -959,14 +943,14 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, boolOrDefault &Val); enum ValueExpected getValueExpectedFlagDefault() const { return ValueOptional; } - // getValueName - Do not print = at all. + // Do not print = at all. StringRef getValueName() const override { return StringRef(); } void printOptionDiff(const Option &O, boolOrDefault V, OptVal Default, @@ -977,8 +961,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -986,10 +968,10 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, int &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "int"; } void printOptionDiff(const Option &O, int V, OptVal Default, @@ -1000,8 +982,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1009,10 +989,10 @@ template <> class parser final : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, long &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "long"; } void printOptionDiff(const Option &O, long V, OptVal Default, @@ -1023,8 +1003,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1032,10 +1010,10 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, long long &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "long"; } void printOptionDiff(const Option &O, long long V, OptVal Default, @@ -1046,8 +1024,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1055,10 +1031,10 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, unsigned &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. 
StringRef getValueName() const override { return "uint"; } void printOptionDiff(const Option &O, unsigned V, OptVal Default, @@ -1069,8 +1045,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1079,10 +1053,10 @@ class parser final : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, unsigned long &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "ulong"; } void printOptionDiff(const Option &O, unsigned long V, OptVal Default, @@ -1093,8 +1067,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1103,11 +1075,11 @@ class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, unsigned long long &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "ulong"; } void printOptionDiff(const Option &O, unsigned long long V, OptVal Default, @@ -1118,8 +1090,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1127,10 +1097,10 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, double &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "number"; } void printOptionDiff(const Option &O, double V, OptVal Default, @@ -1141,8 +1111,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1150,10 +1118,10 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &O, StringRef ArgName, StringRef Arg, float &Val); - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "number"; } void printOptionDiff(const Option &O, float V, OptVal Default, @@ -1164,8 +1132,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1173,13 +1139,13 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &, StringRef, StringRef Arg, std::string &Value) { Value = Arg.str(); return false; } - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. 
StringRef getValueName() const override { return "string"; } void printOptionDiff(const Option &O, StringRef V, const OptVal &Default, @@ -1190,8 +1156,6 @@ public: }; //-------------------------------------------------- -// parser -// extern template class basic_parser; @@ -1199,13 +1163,13 @@ template <> class parser : public basic_parser { public: parser(Option &O) : basic_parser(O) {} - // parse - Return true on error. + // Return true on error. bool parse(Option &, StringRef, StringRef Arg, char &Value) { Value = Arg[0]; return false; } - // getValueName - Overload in subclass to provide a better default value. + // Overload in subclass to provide a better default value. StringRef getValueName() const override { return "char"; } void printOptionDiff(const Option &O, char V, OptVal Default, @@ -1216,8 +1180,6 @@ public: }; //-------------------------------------------------- -// PrintOptionDiff -// // This collection of wrappers is the intermediary between class opt and class // parser to handle all the template nastiness. @@ -1261,10 +1223,10 @@ void printOptionDiff( } //===----------------------------------------------------------------------===// -// applicator class - This class is used because we must use partial -// specialization to handle literal string arguments specially (const char* does -// not correctly respond to the apply method). Because the syntax to use this -// is a pain, we have the 'apply' method below to handle the nastiness... +// This class is used because we must use partial specialization to handle +// literal string arguments specially (const char* does not correctly respond to +// the apply method). Because the syntax to use this is a pain, we have the +// 'apply' method below to handle the nastiness... // template struct applicator { template static void opt(const Mod &M, Opt &O) { M.apply(O); } @@ -1313,7 +1275,7 @@ template <> struct applicator { } }; -// apply method - Apply modifiers to an option in a type safe way. +// Apply modifiers to an option in a type safe way. template void apply(Opt *O, const Mod &M, const Mods &... Ms) { applicator::opt(M, *O); @@ -1325,8 +1287,6 @@ template void apply(Opt *O, const Mod &M) { } //===----------------------------------------------------------------------===// -// opt_storage class - // Default storage class definition: external storage. This implementation // assumes the user will specify a variable to store the data into with the // cl::location(x) modifier. @@ -1406,7 +1366,7 @@ public: // Make sure we initialize the value with the default constructor for the // type. - opt_storage() : Value(DataType()), Default(DataType()) {} + opt_storage() : Value(DataType()), Default() {} template void setValue(const T &V, bool initial = false) { Value = V; @@ -1425,7 +1385,7 @@ public: }; //===----------------------------------------------------------------------===// -// opt - A scalar command line option. +// A scalar command line option. // template > @@ -1476,6 +1436,8 @@ class opt : public Option, const OptionValue &V = this->getDefault(); if (V.hasValue()) this->setValue(V.getValue()); + else + this->setValue(T()); } template ; extern template class opt; //===----------------------------------------------------------------------===// -// list_storage class - // Default storage class definition: external storage. This implementation // assumes the user will specify a variable to store the data into with the // cl::location(x) modifier. 
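For orientation, the cl:: modifiers documented above (cl::desc, cl::value_desc, cl::init, cl::location, cl::cat) compose declaratively when an option is defined. A minimal usage sketch follows; it is not part of this patch, and the tool name, category, and option names are hypothetical:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical category that groups this tool's options in -help output.
static cl::OptionCategory MyToolCategory("my-tool options");

// Scalar option with a description, value name, default, and category.
static cl::opt<std::string> OutputFilename(
    "o", cl::desc("Output filename"), cl::value_desc("filename"),
    cl::init("-"), cl::cat(MyToolCategory));

// External storage via cl::location: parsed results land in VerboseFlag
// (the second template argument enables external storage).
static bool VerboseFlag;
static cl::opt<bool, true> Verbose("verbose",
                                   cl::desc("Enable verbose output"),
                                   cl::location(VerboseFlag),
                                   cl::cat(MyToolCategory));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv, "my-tool sketch\n");
  return 0;
}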
@@ -1634,7 +1594,7 @@ public: }; //===----------------------------------------------------------------------===// -// list - A list of command line options. +// A list of command line options. // template > @@ -1716,7 +1676,7 @@ public: [](const typename ParserClass::parser_data_type &) {}; }; -// multi_val - Modifier to set the number of additional values. +// Modifier to set the number of additional values. struct multi_val { unsigned AdditionalVals; explicit multi_val(unsigned N) : AdditionalVals(N) {} @@ -1728,8 +1688,6 @@ struct multi_val { }; //===----------------------------------------------------------------------===// -// bits_storage class - // Default storage class definition: external storage. This implementation // assumes the user will specify a variable to store the data into with the // cl::location(x) modifier. @@ -1738,7 +1696,7 @@ template class bits_storage { unsigned *Location = nullptr; // Where to store the bits... template static unsigned Bit(const T &V) { - unsigned BitPos = reinterpret_cast(V); + unsigned BitPos = static_cast(V); assert(BitPos < sizeof(unsigned) * CHAR_BIT && "enum exceeds width of bit vector!"); return 1 << BitPos; @@ -1763,6 +1721,11 @@ public: unsigned getBits() { return *Location; } + void clear() { + if (Location) + *Location = 0; + } + template bool isSet(const T &V) { return (*Location & Bit(V)) != 0; } @@ -1772,10 +1735,10 @@ public: // This makes us exactly compatible with the bits in all cases that it is used. // template class bits_storage { - unsigned Bits; // Where to store the bits... + unsigned Bits{0}; // Where to store the bits... template static unsigned Bit(const T &V) { - unsigned BitPos = (unsigned)V; + unsigned BitPos = static_cast(V); assert(BitPos < sizeof(unsigned) * CHAR_BIT && "enum exceeds width of bit vector!"); return 1 << BitPos; @@ -1786,11 +1749,13 @@ public: unsigned getBits() { return Bits; } + void clear() { Bits = 0; } + template bool isSet(const T &V) { return (Bits & Bit(V)) != 0; } }; //===----------------------------------------------------------------------===// -// bits - A bit vector of command options. +// A bit vector of command options. // template > @@ -1832,7 +1797,7 @@ class bits : public Option, public bits_storage { void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const override { } - void setDefault() override {} + void setDefault() override { bits_storage::clear(); } void done() { addArgument(); @@ -1929,7 +1894,7 @@ public: } }; -// aliasfor - Modifier to set the option an alias aliases. +// Modifier to set the option an alias aliases. struct aliasopt { Option &Opt; @@ -1938,10 +1903,9 @@ struct aliasopt { void apply(alias &A) const { A.setAliasFor(Opt); } }; -// extrahelp - provide additional help at the end of the normal help -// output. All occurrences of cl::extrahelp will be accumulated and -// printed to stderr at the end of the regular help, just before -// exit is called. +// Provide additional help at the end of the normal help output. All occurrences +// of cl::extrahelp will be accumulated and printed to stderr at the end of the +// regular help, just before exit is called. struct extrahelp { StringRef morehelp; @@ -2032,12 +1996,15 @@ void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver, SmallVectorImpl &NewArgv, bool MarkEOLs = false); -/// Tokenizes a Windows command line which may contain quotes and escaped -/// quotes. +/// Tokenizes a string of Windows command line arguments, which may contain +/// quotes and escaped quotes. 
/// /// See MSDN docs for CommandLineToArgvW for information on the quoting rules. /// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx /// +/// For handling a full Windows command line including the executable name at +/// the start, see TokenizeWindowsCommandLineFull below. +/// /// \param [in] Source The string to be split on whitespace with quotes. /// \param [in] Saver Delegates back to the caller for saving parsed strings. /// \param [in] MarkEOLs true if tokenizing a response file and you want end of @@ -2054,6 +2021,23 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver, void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver, SmallVectorImpl &NewArgv); +/// Tokenizes a Windows full command line, including command name at the start. +/// +/// This uses the same syntax rules as TokenizeWindowsCommandLine for all but +/// the first token. But the first token is expected to be parsed as the +/// executable file name in the way CreateProcess would do it, rather than the +/// way the C library startup code would do it: CreateProcess does not consider +/// that \ is ever an escape character (because " is not a valid filename char, +/// hence there's never a need to escape it to be used literally). +/// +/// Parameters are the same as for TokenizeWindowsCommandLine. In particular, +/// if you set MarkEOLs = true, then the first word of every line will be +/// parsed using the special rules for command names, making this function +/// suitable for parsing a file full of commands to execute. +void TokenizeWindowsCommandLineFull(StringRef Source, StringSaver &Saver, + SmallVectorImpl &NewArgv, + bool MarkEOLs = false); + /// String tokenization function type. Should be compatible with either /// Windows or Unix command line tokenizers. using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver, diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h index f3317049524f..6708b7cc95cc 100644 --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -39,6 +39,10 @@ # define __has_builtin(x) 0 #endif +#ifndef __has_include +# define __has_include(x) 0 +#endif + // Only use __has_cpp_attribute in C++ mode. GCC defines __has_cpp_attribute in // C mode, but the :: in __has_cpp_attribute(scoped::attribute) is invalid. #ifndef LLVM_HAS_CPP_ATTRIBUTE @@ -90,30 +94,14 @@ #define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version)) // We require at least VS 2019. +#if !defined(LLVM_FORCE_USE_OLD_TOOLCHAIN) #if !LLVM_MSC_PREREQ(1920) #error LLVM requires at least VS 2019. #endif - -#else -#define LLVM_MSC_PREREQ(version) 0 #endif -/// Does the compiler support ref-qualifiers for *this? -/// -/// Sadly, this is separate from just rvalue reference support because GCC -/// and MSVC implemented this later than everything else. This appears to be -/// corrected in MSVC 2019 but not MSVC 2017. -/// FIXME: Remove LLVM_HAS_RVALUE_REFERENCE_THIS macro -#define LLVM_HAS_RVALUE_REFERENCE_THIS 1 - -/// Expands to '&' if ref-qualifiers for *this are supported. -/// -/// This can be used to provide lvalue/rvalue overrides of member functions. 
-/// The rvalue override should be guarded by LLVM_HAS_RVALUE_REFERENCE_THIS -#if LLVM_HAS_RVALUE_REFERENCE_THIS -#define LLVM_LVALUE_FUNCTION & #else -#define LLVM_LVALUE_FUNCTION +#define LLVM_MSC_PREREQ(version) 0 #endif /// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked @@ -325,20 +313,17 @@ #define LLVM_EXTENSION #endif -// LLVM_ATTRIBUTE_DEPRECATED(decl, "message") -// This macro will be removed. -// Use C++14's attribute instead: [[deprecated("message")]] -#define LLVM_ATTRIBUTE_DEPRECATED(decl, message) [[deprecated(message)]] decl - /// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands /// to an expression which states that it is undefined behavior for the /// compiler to reach this point. Otherwise is not defined. +/// +/// '#else' is intentionally left out so that other macro logic (e.g., +/// LLVM_ASSUME_ALIGNED and llvm_unreachable()) can detect whether +/// LLVM_BUILTIN_UNREACHABLE has a definition. #if __has_builtin(__builtin_unreachable) || defined(__GNUC__) # define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable() #elif defined(_MSC_VER) # define LLVM_BUILTIN_UNREACHABLE __assume(false) -#else -# define LLVM_BUILTIN_UNREACHABLE #endif /// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression @@ -411,22 +396,6 @@ # define LLVM_PACKED_END _Pragma("pack(pop)") #endif -/// \macro LLVM_PTR_SIZE -/// A constant integer equivalent to the value of sizeof(void*). -/// Generally used in combination with alignas or when doing computation in the -/// preprocessor. -#ifdef __SIZEOF_POINTER__ -# define LLVM_PTR_SIZE __SIZEOF_POINTER__ -#elif defined(_WIN64) -# define LLVM_PTR_SIZE 8 -#elif defined(_WIN32) -# define LLVM_PTR_SIZE 4 -#elif defined(_MSC_VER) -# error "could not determine LLVM_PTR_SIZE as a constant int for MSVC" -#else -# define LLVM_PTR_SIZE sizeof(void *) -#endif - /// \macro LLVM_MEMORY_SANITIZER_BUILD /// Whether LLVM itself is built with MemorySanitizer instrumentation. #if __has_feature(memory_sanitizer) @@ -444,8 +413,21 @@ /// Whether LLVM itself is built with AddressSanitizer instrumentation. #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) # define LLVM_ADDRESS_SANITIZER_BUILD 1 +#if __has_include() # include #else +// These declarations exist to support ASan with MSVC. If MSVC eventually ships +// asan_interface.h in their headers, then we can remove this. 
+#ifdef __cplusplus +extern "C" { +#endif +void __asan_poison_memory_region(void const volatile *addr, size_t size); +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#ifdef __cplusplus +} // extern "C" +#endif +#endif +#else # define LLVM_ADDRESS_SANITIZER_BUILD 0 # define __asan_poison_memory_region(p, size) # define __asan_unpoison_memory_region(p, size) diff --git a/llvm/include/llvm/Support/Compression.h b/llvm/include/llvm/Support/Compression.h index 5bc0e56913fe..e6f898229412 100644 --- a/llvm/include/llvm/Support/Compression.h +++ b/llvm/include/llvm/Support/Compression.h @@ -29,8 +29,8 @@ static constexpr int BestSizeCompression = 9; bool isAvailable(); -Error compress(StringRef InputBuffer, SmallVectorImpl &CompressedBuffer, - int Level = DefaultCompression); +void compress(StringRef InputBuffer, SmallVectorImpl &CompressedBuffer, + int Level = DefaultCompression); Error uncompress(StringRef InputBuffer, char *UncompressedBuffer, size_t &UncompressedSize); diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index 374cdb907fdc..662f3aca5b54 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,6 +126,9 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE +#define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF +#define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 + typedef enum { conversionOK, /* conversion successful */ sourceExhausted, /* partial character in source, but hit end */ @@ -281,6 +284,24 @@ bool convertUTF16ToUTF8String(ArrayRef SrcBytes, std::string &Out); */ bool convertUTF16ToUTF8String(ArrayRef Src, std::string &Out); +/** + * Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string. + * + * \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text. + * \param [out] Out Converted UTF-8 is stored here on success. + * \returns true on success + */ +bool convertUTF32ToUTF8String(ArrayRef SrcBytes, std::string &Out); + +/** + * Converts a UTF32 string into a UTF8 std::string. + * + * \param [in] Src A buffer of UTF-32 encoded text. + * \param [out] Out Converted UTF-8 is stored here on success. + * \returns true on success + */ +bool convertUTF32ToUTF8String(ArrayRef Src, std::string &Out); + /** * Converts a UTF-8 string into a UTF-16 string with native endianness. * diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h index f60e7335e197..26ddf97b3ef0 100644 --- a/llvm/include/llvm/Support/CrashRecoveryContext.h +++ b/llvm/include/llvm/Support/CrashRecoveryContext.h @@ -101,6 +101,9 @@ public: /// return failure from RunSafely(). This function does not return. [[noreturn]] void HandleExit(int RetCode); + /// Return true if RetCode indicates that a signal or an exception occurred. + static bool isCrash(int RetCode); + /// Throw again a signal or an exception, after it was catched once by a /// CrashRecoveryContext. 
static bool throwIfCrash(int RetCode);
diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h
index 2ff978476c79..5788ab3b2138 100644
--- a/llvm/include/llvm/Support/Debug.h
+++ b/llvm/include/llvm/Support/Debug.h
@@ -67,8 +67,8 @@ void setCurrentDebugTypes(const char **Types, unsigned Count);
#else
#define isCurrentDebugType(X) (false)
-#define setCurrentDebugType(X)
-#define setCurrentDebugTypes(X, N)
+#define setCurrentDebugType(X) do { (void)(X); } while (false)
+#define setCurrentDebugTypes(X, N) do { (void)(X); (void)(N); } while (false)
#define DEBUG_WITH_TYPE(TYPE, X) do { } while (false)
#endif
diff --git a/llvm/include/llvm/Support/Errno.h b/llvm/include/llvm/Support/Errno.h
index 07df6765d9db..e095c66b9086 100644
--- a/llvm/include/llvm/Support/Errno.h
+++ b/llvm/include/llvm/Support/Errno.h
@@ -15,7 +15,6 @@
#include <cerrno>
#include <string>
-#include <type_traits>

namespace llvm {
namespace sys {
diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index 881049b15b0d..1a801b6f2c7a 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -1269,7 +1269,7 @@ public:
void log(raw_ostream &OS) const override {
assert(Err && "Trying to log after takeError().");
OS << "'" << FileName << "': ";
- if (Line.hasValue())
+ if (Line)
OS << "line " << Line.getValue() << ": ";
Err->log(OS);
}
@@ -1281,7 +1281,7 @@ public:
return OS.str();
}

- StringRef getFileName() { return FileName; }
+ StringRef getFileName() const { return FileName; }

Error takeError() { return Error(std::move(Err)); }
diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h
index f980510d37f0..004b3b7868fb 100644
--- a/llvm/include/llvm/Support/ErrorHandling.h
+++ b/llvm/include/llvm/Support/ErrorHandling.h
@@ -124,19 +124,30 @@
llvm_unreachable_internal(const char *msg = nullptr, const char *file = nullptr,

/// Marks that the current location is not supposed to be reachable.
/// In !NDEBUG builds, prints the message and location info to stderr.
-/// In NDEBUG builds, becomes an optimizer hint that the current location
-/// is not supposed to be reachable. On compilers that don't support
-/// such hints, prints a reduced message instead and aborts the program.
+/// In NDEBUG builds, if the platform does not support a builtin unreachable
+/// then we call an internal LLVM runtime function. Otherwise the behavior is
+/// controlled by the CMake flag
+///     -DLLVM_UNREACHABLE_OPTIMIZE
+/// * When "ON" (default) llvm_unreachable() becomes an optimizer hint
+///   that the current location is not supposed to be reachable: the hint
+///   turns such code path into undefined behavior. On compilers that don't
+///   support such hints, prints a reduced message instead and aborts the
+///   program.
+/// * When "OFF", a builtin_trap is emitted instead of an
+///   optimizer hint or printing a reduced message.
///
-/// Use this instead of assert(0). It conveys intent more clearly and
-/// allows compilers to omit some unnecessary code.
+/// Use this instead of assert(0). It conveys intent more clearly, suppresses
+/// diagnostics for unreachable code paths, and allows compilers to omit
+/// unnecessary code.
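A usage sketch, not part of this patch; the Color enum and toString function are hypothetical. After a switch that covers every enumerator, llvm_unreachable marks the fall-through path as impossible, which is the intended replacement for assert(0):

#include "llvm/Support/ErrorHandling.h"

enum class Color { Red, Green }; // hypothetical

static const char *toString(Color C) {
  switch (C) {
  case Color::Red:
    return "red";
  case Color::Green:
    return "green";
  }
  // All enumerators are handled above; in NDEBUG builds this line becomes an
  // optimizer hint (or a trap when built with -DLLVM_UNREACHABLE_OPTIMIZE=OFF).
  llvm_unreachable("unknown Color");
}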
#ifndef NDEBUG
#define llvm_unreachable(msg) \
  ::llvm::llvm_unreachable_internal(msg, __FILE__, __LINE__)
-#elif defined(LLVM_BUILTIN_UNREACHABLE)
+#elif !defined(LLVM_BUILTIN_UNREACHABLE)
+#define llvm_unreachable(msg) ::llvm::llvm_unreachable_internal()
+#elif LLVM_UNREACHABLE_OPTIMIZE
#define llvm_unreachable(msg) LLVM_BUILTIN_UNREACHABLE
#else
-#define llvm_unreachable(msg) ::llvm::llvm_unreachable_internal()
+#define llvm_unreachable(msg) LLVM_BUILTIN_TRAP, LLVM_BUILTIN_UNREACHABLE
#endif
#endif
diff --git a/llvm/include/llvm/Support/FileUtilities.h b/llvm/include/llvm/Support/FileUtilities.h
index f8a37fe1177d..0033638c6804 100644
--- a/llvm/include/llvm/Support/FileUtilities.h
+++ b/llvm/include/llvm/Support/FileUtilities.h
@@ -110,6 +110,27 @@ namespace llvm {
llvm::Error writeFileAtomically(StringRef TempPathModel, StringRef FinalPath,
std::function<llvm::Error(llvm::raw_ostream &)> Writer);
+
+ /// FilePermissionsApplier helps to copy permissions from an input file to
+ /// an output one. It memorizes the status of the input file and can apply
+ /// permissions and dates to the output file.
+ class FilePermissionsApplier {
+ public:
+ static Expected<FilePermissionsApplier> create(StringRef InputFilename);
+
+ /// Apply stored permissions to the \p OutputFilename.
+ /// Copy LastAccess and ModificationTime if \p CopyDates is true.
+ /// Overwrite stored permissions if \p OverwritePermissions is specified.
+ Error apply(StringRef OutputFilename, bool CopyDates = false,
+             Optional<sys::fs::perms> OverwritePermissions = None);
+
+ private:
+ FilePermissionsApplier(StringRef InputFilename, sys::fs::file_status Status)
+     : InputFilename(InputFilename), InputStatus(Status) {}
+
+ StringRef InputFilename;
+ sys::fs::file_status InputStatus;
+ };
} // End llvm namespace

#endif
diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h
index 3edd8844bc7a..8101ed7968ad 100644
--- a/llvm/include/llvm/Support/FormatProviders.h
+++ b/llvm/include/llvm/Support/FormatProviders.h
@@ -313,7 +313,7 @@ struct format_provider
Precision = parseNumericPrecision(Style);
- if (!Precision.hasValue())
+ if (!Precision)
Precision = getDefaultPrecision(S);

write_double(Stream, static_cast<double>(V), S, Precision);
diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h
index a872afb5e45e..c1707b4fe9cb 100644
--- a/llvm/include/llvm/Support/FormatVariadic.h
+++ b/llvm/include/llvm/Support/FormatVariadic.h
@@ -172,7 +172,7 @@ public:
// Formats textual output. `Fmt` is a string consisting of one or more
// replacement sequences with the following grammar:
//
-// rep_field ::= "{" [index] ["," layout] [":" format] "}"
+// rep_field ::= "{" index ["," layout] [":" format] "}"
// index ::= <non-negative integer>
// layout ::= [[[char]loc]width]
// format ::= <any string not containing "{" or "}">
diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h
index bf93a0d22da7..9d7680d2b667 100644
--- a/llvm/include/llvm/Support/HashBuilder.h
+++ b/llvm/include/llvm/Support/HashBuilder.h
@@ -39,6 +39,9 @@ struct IsHashableData
/// Declares the hasher member, and functions forwarding directly to the hasher.
template <typename HasherT> class HashBuilderBase {
public:
+ template <typename HasherT_ = HasherT>
+ using HashResultTy = decltype(std::declval<HasherT_ &>().final());
+
HasherT &getHasher() { return Hasher; }

/// Forward to `HasherT::update(ArrayRef<uint8_t>)`.
@@ -59,12 +62,12 @@ public:
}

/// Forward to `HasherT::final()` if available.
- template <typename HasherT_ = HasherT> StringRef final() {
+ template <typename HasherT_ = HasherT> HashResultTy<HasherT_> final() {
return this->getHasher().final();
}

/// Forward to `HasherT::result()` if available.
- template StringRef result() { + template HashResultTy result() { return this->getHasher().result(); } diff --git a/llvm/include/llvm/Support/Host.h b/llvm/include/llvm/Support/Host.h index b3c15f0683b9..f683371ad1d3 100644 --- a/llvm/include/llvm/Support/Host.h +++ b/llvm/include/llvm/Support/Host.h @@ -64,6 +64,7 @@ namespace sys { StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent); StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent); StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent); + StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent); StringRef getHostCPUNameForBPF(); /// Helper functions to extract CPU details from CPUID on x86. diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index 96b7753e9b20..84e095e2bbab 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -324,7 +324,7 @@ public: /// Compute known bits resulting from multiplying LHS and RHS. static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, - bool SelfMultiply = false); + bool NoUndefSelfMultiply = false); /// Compute known bits from sign-extended multiply-hi. static KnownBits mulhs(const KnownBits &LHS, const KnownBits &RHS); @@ -415,6 +415,12 @@ public: return KnownBits(Zero.reverseBits(), One.reverseBits()); } + bool operator==(const KnownBits &Other) const { + return Zero == Other.Zero && One == Other.One; + } + + bool operator!=(const KnownBits &Other) const { return !(*this == Other); } + void print(raw_ostream &OS) const; void dump() const; }; diff --git a/llvm/include/llvm/Support/LowLevelTypeImpl.h b/llvm/include/llvm/Support/LowLevelTypeImpl.h index dd286f5228fe..186a7e5930ec 100644 --- a/llvm/include/llvm/Support/LowLevelTypeImpl.h +++ b/llvm/include/llvm/Support/LowLevelTypeImpl.h @@ -207,6 +207,18 @@ public: return scalar(getScalarSizeInBits() / Factor); } + /// Produce a vector type that is \p Factor times bigger, preserving the + /// element type. For a scalar or pointer, this will produce a new vector with + /// \p Factor elements. + LLT multiplyElements(int Factor) const { + if (isVector()) { + return scalarOrVector(getElementCount().multiplyCoefficientBy(Factor), + getElementType()); + } + + return fixed_vector(Factor, *this); + } + bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); } unsigned getScalarSizeInBits() const { diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 70d046601346..fa2f477261dd 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -40,26 +40,19 @@ template class ArrayRef; class MD5 { public: - struct MD5Result { - std::array Bytes; - - operator std::array() const { return Bytes; } - - const uint8_t &operator[](size_t I) const { return Bytes[I]; } - uint8_t &operator[](size_t I) { return Bytes[I]; } - + struct MD5Result : public std::array { SmallString<32> digest() const; uint64_t low() const { // Our MD5 implementation returns the result in little endian, so the low // word is first. using namespace support; - return endian::read(Bytes.data()); + return endian::read(data()); } uint64_t high() const { using namespace support; - return endian::read(Bytes.data() + 8); + return endian::read(data() + 8); } std::pair words() const { using namespace support; @@ -78,20 +71,20 @@ public: /// Finishes off the hash and puts the result in result. void final(MD5Result &Result); - /// Finishes off the hash, and returns a reference to the 16-byte hash data. 
- StringRef final(); + /// Finishes off the hash, and returns the 16-byte hash data. + MD5Result final(); - /// Finishes off the hash, and returns a reference to the 16-byte hash data. + /// Finishes off the hash, and returns the 16-byte hash data. /// This is suitable for getting the MD5 at any time without invalidating the /// internal state, so that more calls can be made into `update`. - StringRef result(); + MD5Result result(); /// Translates the bytes in \p Res to a hex string that is /// deposited into \p Str. The result will be of length 32. static void stringifyResult(MD5Result &Result, SmallVectorImpl &Str); /// Computes the hash for a given bytes. - static std::array hash(ArrayRef Data); + static MD5Result hash(ArrayRef Data); private: // Any 32-bit or wider unsigned integer data type will do. @@ -109,15 +102,9 @@ private: MD5_u32plus block[16]; } InternalState; - MD5Result Result; - const uint8_t *body(ArrayRef Data); }; -inline bool operator==(const MD5::MD5Result &LHS, const MD5::MD5Result &RHS) { - return LHS.Bytes == RHS.Bytes; -} - /// Helper to compute and return lower 64 bits of the given string's MD5 hash. inline uint64_t MD5Hash(StringRef Str) { using namespace support; diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h index 643c2d8ce981..5355c50bb762 100644 --- a/llvm/include/llvm/Support/MachineValueType.h +++ b/llvm/include/llvm/Support/MachineValueType.h @@ -41,143 +41,149 @@ namespace llvm { // ValueTypes.td as well! Other = 1, // This is a non-standard value i1 = 2, // This is a 1 bit integer value - i8 = 3, // This is an 8 bit integer value - i16 = 4, // This is a 16 bit integer value - i32 = 5, // This is a 32 bit integer value - i64 = 6, // This is a 64 bit integer value - i128 = 7, // This is a 128 bit integer value + i2 = 3, // This is a 2 bit integer value + i4 = 4, // This is a 4 bit integer value + i8 = 5, // This is an 8 bit integer value + i16 = 6, // This is a 16 bit integer value + i32 = 7, // This is a 32 bit integer value + i64 = 8, // This is a 64 bit integer value + i128 = 9, // This is a 128 bit integer value FIRST_INTEGER_VALUETYPE = i1, LAST_INTEGER_VALUETYPE = i128, - bf16 = 8, // This is a 16 bit brain floating point value - f16 = 9, // This is a 16 bit floating point value - f32 = 10, // This is a 32 bit floating point value - f64 = 11, // This is a 64 bit floating point value - f80 = 12, // This is a 80 bit floating point value - f128 = 13, // This is a 128 bit floating point value - ppcf128 = 14, // This is a PPC 128-bit floating point value + bf16 = 10, // This is a 16 bit brain floating point value + f16 = 11, // This is a 16 bit floating point value + f32 = 12, // This is a 32 bit floating point value + f64 = 13, // This is a 64 bit floating point value + f80 = 14, // This is a 80 bit floating point value + f128 = 15, // This is a 128 bit floating point value + ppcf128 = 16, // This is a PPC 128-bit floating point value FIRST_FP_VALUETYPE = bf16, LAST_FP_VALUETYPE = ppcf128, - v1i1 = 15, // 1 x i1 - v2i1 = 16, // 2 x i1 - v4i1 = 17, // 4 x i1 - v8i1 = 18, // 8 x i1 - v16i1 = 19, // 16 x i1 - v32i1 = 20, // 32 x i1 - v64i1 = 21, // 64 x i1 - v128i1 = 22, // 128 x i1 - v256i1 = 23, // 256 x i1 - v512i1 = 24, // 512 x i1 - v1024i1 = 25, // 1024 x i1 - - v1i8 = 26, // 1 x i8 - v2i8 = 27, // 2 x i8 - v4i8 = 28, // 4 x i8 - v8i8 = 29, // 8 x i8 - v16i8 = 30, // 16 x i8 - v32i8 = 31, // 32 x i8 - v64i8 = 32, // 64 x i8 - v128i8 = 33, // 128 x i8 - v256i8 = 34, // 256 x i8 - v512i8 = 35, // 
512 x i8 - v1024i8 = 36, // 1024 x i8 - - v1i16 = 37, // 1 x i16 - v2i16 = 38, // 2 x i16 - v3i16 = 39, // 3 x i16 - v4i16 = 40, // 4 x i16 - v8i16 = 41, // 8 x i16 - v16i16 = 42, // 16 x i16 - v32i16 = 43, // 32 x i16 - v64i16 = 44, // 64 x i16 - v128i16 = 45, // 128 x i16 - v256i16 = 46, // 256 x i16 - v512i16 = 47, // 512 x i16 - - v1i32 = 48, // 1 x i32 - v2i32 = 49, // 2 x i32 - v3i32 = 50, // 3 x i32 - v4i32 = 51, // 4 x i32 - v5i32 = 52, // 5 x i32 - v6i32 = 53, // 6 x i32 - v7i32 = 54, // 7 x i32 - v8i32 = 55, // 8 x i32 - v16i32 = 56, // 16 x i32 - v32i32 = 57, // 32 x i32 - v64i32 = 58, // 64 x i32 - v128i32 = 59, // 128 x i32 - v256i32 = 60, // 256 x i32 - v512i32 = 61, // 512 x i32 - v1024i32 = 62, // 1024 x i32 - v2048i32 = 63, // 2048 x i32 - - v1i64 = 64, // 1 x i64 - v2i64 = 65, // 2 x i64 - v3i64 = 66, // 3 x i64 - v4i64 = 67, // 4 x i64 - v8i64 = 68, // 8 x i64 - v16i64 = 69, // 16 x i64 - v32i64 = 70, // 32 x i64 - v64i64 = 71, // 64 x i64 - v128i64 = 72, // 128 x i64 - v256i64 = 73, // 256 x i64 - - v1i128 = 74, // 1 x i128 + v1i1 = 17, // 1 x i1 + v2i1 = 18, // 2 x i1 + v4i1 = 19, // 4 x i1 + v8i1 = 20, // 8 x i1 + v16i1 = 21, // 16 x i1 + v32i1 = 22, // 32 x i1 + v64i1 = 23, // 64 x i1 + v128i1 = 24, // 128 x i1 + v256i1 = 25, // 256 x i1 + v512i1 = 26, // 512 x i1 + v1024i1 = 27, // 1024 x i1 + + v128i2 = 28, // 128 x i2 + + v64i4 = 29, // 64 x i4 + + v1i8 = 30, // 1 x i8 + v2i8 = 31, // 2 x i8 + v4i8 = 32, // 4 x i8 + v8i8 = 33, // 8 x i8 + v16i8 = 34, // 16 x i8 + v32i8 = 35, // 32 x i8 + v64i8 = 36, // 64 x i8 + v128i8 = 37, // 128 x i8 + v256i8 = 38, // 256 x i8 + v512i8 = 39, // 512 x i8 + v1024i8 = 40, // 1024 x i8 + + v1i16 = 41, // 1 x i16 + v2i16 = 42, // 2 x i16 + v3i16 = 43, // 3 x i16 + v4i16 = 44, // 4 x i16 + v8i16 = 45, // 8 x i16 + v16i16 = 46, // 16 x i16 + v32i16 = 47, // 32 x i16 + v64i16 = 48, // 64 x i16 + v128i16 = 49, // 128 x i16 + v256i16 = 50, // 256 x i16 + v512i16 = 51, // 512 x i16 + + v1i32 = 52, // 1 x i32 + v2i32 = 53, // 2 x i32 + v3i32 = 54, // 3 x i32 + v4i32 = 55, // 4 x i32 + v5i32 = 56, // 5 x i32 + v6i32 = 57, // 6 x i32 + v7i32 = 58, // 7 x i32 + v8i32 = 59, // 8 x i32 + v16i32 = 60, // 16 x i32 + v32i32 = 61, // 32 x i32 + v64i32 = 62, // 64 x i32 + v128i32 = 63, // 128 x i32 + v256i32 = 64, // 256 x i32 + v512i32 = 65, // 512 x i32 + v1024i32 = 66, // 1024 x i32 + v2048i32 = 67, // 2048 x i32 + + v1i64 = 68, // 1 x i64 + v2i64 = 69, // 2 x i64 + v3i64 = 70, // 3 x i64 + v4i64 = 71, // 4 x i64 + v8i64 = 72, // 8 x i64 + v16i64 = 73, // 16 x i64 + v32i64 = 74, // 32 x i64 + v64i64 = 75, // 64 x i64 + v128i64 = 76, // 128 x i64 + v256i64 = 77, // 256 x i64 + + v1i128 = 78, // 1 x i128 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1, LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128, - v1f16 = 75, // 1 x f16 - v2f16 = 76, // 2 x f16 - v3f16 = 77, // 3 x f16 - v4f16 = 78, // 4 x f16 - v8f16 = 79, // 8 x f16 - v16f16 = 80, // 16 x f16 - v32f16 = 81, // 32 x f16 - v64f16 = 82, // 64 x f16 - v128f16 = 83, // 128 x f16 - v256f16 = 84, // 256 x f16 - v512f16 = 85, // 256 x f16 - - v2bf16 = 86, // 2 x bf16 - v3bf16 = 87, // 3 x bf16 - v4bf16 = 88, // 4 x bf16 - v8bf16 = 89, // 8 x bf16 - v16bf16 = 90, // 16 x bf16 - v32bf16 = 91, // 32 x bf16 - v64bf16 = 92, // 64 x bf16 - v128bf16 = 93, // 128 x bf16 - - v1f32 = 94, // 1 x f32 - v2f32 = 95, // 2 x f32 - v3f32 = 96, // 3 x f32 - v4f32 = 97, // 4 x f32 - v5f32 = 98, // 5 x f32 - v6f32 = 99, // 6 x f32 - v7f32 = 100, // 7 x f32 - v8f32 = 101, // 8 x f32 - v16f32 = 102, // 16 x f32 - v32f32 = 
103, // 32 x f32 - v64f32 = 104, // 64 x f32 - v128f32 = 105, // 128 x f32 - v256f32 = 106, // 256 x f32 - v512f32 = 107, // 512 x f32 - v1024f32 = 108, // 1024 x f32 - v2048f32 = 109, // 2048 x f32 - - v1f64 = 110, // 1 x f64 - v2f64 = 111, // 2 x f64 - v3f64 = 112, // 3 x f64 - v4f64 = 113, // 4 x f64 - v8f64 = 114, // 8 x f64 - v16f64 = 115, // 16 x f64 - v32f64 = 116, // 32 x f64 - v64f64 = 117, // 64 x f64 - v128f64 = 118, // 128 x f64 - v256f64 = 119, // 256 x f64 + v1f16 = 79, // 1 x f16 + v2f16 = 80, // 2 x f16 + v3f16 = 81, // 3 x f16 + v4f16 = 82, // 4 x f16 + v8f16 = 83, // 8 x f16 + v16f16 = 84, // 16 x f16 + v32f16 = 85, // 32 x f16 + v64f16 = 86, // 64 x f16 + v128f16 = 87, // 128 x f16 + v256f16 = 88, // 256 x f16 + v512f16 = 89, // 256 x f16 + + v2bf16 = 90, // 2 x bf16 + v3bf16 = 91, // 3 x bf16 + v4bf16 = 92, // 4 x bf16 + v8bf16 = 93, // 8 x bf16 + v16bf16 = 94, // 16 x bf16 + v32bf16 = 95, // 32 x bf16 + v64bf16 = 96, // 64 x bf16 + v128bf16 = 97, // 128 x bf16 + + v1f32 = 98, // 1 x f32 + v2f32 = 99, // 2 x f32 + v3f32 = 100, // 3 x f32 + v4f32 = 101, // 4 x f32 + v5f32 = 102, // 5 x f32 + v6f32 = 103, // 6 x f32 + v7f32 = 104, // 7 x f32 + v8f32 = 105, // 8 x f32 + v16f32 = 106, // 16 x f32 + v32f32 = 107, // 32 x f32 + v64f32 = 108, // 64 x f32 + v128f32 = 109, // 128 x f32 + v256f32 = 110, // 256 x f32 + v512f32 = 111, // 512 x f32 + v1024f32 = 112, // 1024 x f32 + v2048f32 = 113, // 2048 x f32 + + v1f64 = 114, // 1 x f64 + v2f64 = 115, // 2 x f64 + v3f64 = 116, // 3 x f64 + v4f64 = 117, // 4 x f64 + v8f64 = 118, // 8 x f64 + v16f64 = 119, // 16 x f64 + v32f64 = 120, // 32 x f64 + v64f64 = 121, // 64 x f64 + v128f64 = 122, // 128 x f64 + v256f64 = 123, // 256 x f64 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v1f16, LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v256f64, @@ -185,68 +191,70 @@ namespace llvm { FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1, LAST_FIXEDLEN_VECTOR_VALUETYPE = v256f64, - nxv1i1 = 120, // n x 1 x i1 - nxv2i1 = 121, // n x 2 x i1 - nxv4i1 = 122, // n x 4 x i1 - nxv8i1 = 123, // n x 8 x i1 - nxv16i1 = 124, // n x 16 x i1 - nxv32i1 = 125, // n x 32 x i1 - nxv64i1 = 126, // n x 64 x i1 - - nxv1i8 = 127, // n x 1 x i8 - nxv2i8 = 128, // n x 2 x i8 - nxv4i8 = 129, // n x 4 x i8 - nxv8i8 = 130, // n x 8 x i8 - nxv16i8 = 131, // n x 16 x i8 - nxv32i8 = 132, // n x 32 x i8 - nxv64i8 = 133, // n x 64 x i8 - - nxv1i16 = 134, // n x 1 x i16 - nxv2i16 = 135, // n x 2 x i16 - nxv4i16 = 136, // n x 4 x i16 - nxv8i16 = 137, // n x 8 x i16 - nxv16i16 = 138, // n x 16 x i16 - nxv32i16 = 139, // n x 32 x i16 - - nxv1i32 = 140, // n x 1 x i32 - nxv2i32 = 141, // n x 2 x i32 - nxv4i32 = 142, // n x 4 x i32 - nxv8i32 = 143, // n x 8 x i32 - nxv16i32 = 144, // n x 16 x i32 - nxv32i32 = 145, // n x 32 x i32 - - nxv1i64 = 146, // n x 1 x i64 - nxv2i64 = 147, // n x 2 x i64 - nxv4i64 = 148, // n x 4 x i64 - nxv8i64 = 149, // n x 8 x i64 - nxv16i64 = 150, // n x 16 x i64 - nxv32i64 = 151, // n x 32 x i64 + nxv1i1 = 124, // n x 1 x i1 + nxv2i1 = 125, // n x 2 x i1 + nxv4i1 = 126, // n x 4 x i1 + nxv8i1 = 127, // n x 8 x i1 + nxv16i1 = 128, // n x 16 x i1 + nxv32i1 = 129, // n x 32 x i1 + nxv64i1 = 130, // n x 64 x i1 + + nxv1i8 = 131, // n x 1 x i8 + nxv2i8 = 132, // n x 2 x i8 + nxv4i8 = 133, // n x 4 x i8 + nxv8i8 = 134, // n x 8 x i8 + nxv16i8 = 135, // n x 16 x i8 + nxv32i8 = 136, // n x 32 x i8 + nxv64i8 = 137, // n x 64 x i8 + + nxv1i16 = 138, // n x 1 x i16 + nxv2i16 = 139, // n x 2 x i16 + nxv4i16 = 140, // n x 4 x i16 + nxv8i16 = 141, // n x 8 x i16 + nxv16i16 = 142, // n x 16 x i16 
+ nxv32i16 = 143, // n x 32 x i16 + + nxv1i32 = 144, // n x 1 x i32 + nxv2i32 = 145, // n x 2 x i32 + nxv4i32 = 146, // n x 4 x i32 + nxv8i32 = 147, // n x 8 x i32 + nxv16i32 = 148, // n x 16 x i32 + nxv32i32 = 149, // n x 32 x i32 + + nxv1i64 = 150, // n x 1 x i64 + nxv2i64 = 151, // n x 2 x i64 + nxv4i64 = 152, // n x 4 x i64 + nxv8i64 = 153, // n x 8 x i64 + nxv16i64 = 154, // n x 16 x i64 + nxv32i64 = 155, // n x 32 x i64 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1, LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64, - nxv1f16 = 152, // n x 1 x f16 - nxv2f16 = 153, // n x 2 x f16 - nxv4f16 = 154, // n x 4 x f16 - nxv8f16 = 155, // n x 8 x f16 - nxv16f16 = 156, // n x 16 x f16 - nxv32f16 = 157, // n x 32 x f16 - - nxv1bf16 = 158, // n x 1 x bf16 - nxv2bf16 = 159, // n x 2 x bf16 - nxv4bf16 = 160, // n x 4 x bf16 - nxv8bf16 = 161, // n x 8 x bf16 - - nxv1f32 = 162, // n x 1 x f32 - nxv2f32 = 163, // n x 2 x f32 - nxv4f32 = 164, // n x 4 x f32 - nxv8f32 = 165, // n x 8 x f32 - nxv16f32 = 166, // n x 16 x f32 - - nxv1f64 = 167, // n x 1 x f64 - nxv2f64 = 168, // n x 2 x f64 - nxv4f64 = 169, // n x 4 x f64 - nxv8f64 = 170, // n x 8 x f64 + nxv1f16 = 156, // n x 1 x f16 + nxv2f16 = 157, // n x 2 x f16 + nxv4f16 = 158, // n x 4 x f16 + nxv8f16 = 159, // n x 8 x f16 + nxv16f16 = 160, // n x 16 x f16 + nxv32f16 = 161, // n x 32 x f16 + + nxv1bf16 = 162, // n x 1 x bf16 + nxv2bf16 = 163, // n x 2 x bf16 + nxv4bf16 = 164, // n x 4 x bf16 + nxv8bf16 = 165, // n x 8 x bf16 + nxv16bf16 = 166, // n x 16 x bf16 + nxv32bf16 = 167, // n x 32 x bf16 + + nxv1f32 = 168, // n x 1 x f32 + nxv2f32 = 169, // n x 2 x f32 + nxv4f32 = 170, // n x 4 x f32 + nxv8f32 = 171, // n x 8 x f32 + nxv16f32 = 172, // n x 16 x f32 + + nxv1f64 = 173, // n x 1 x f64 + nxv2f64 = 174, // n x 2 x f64 + nxv4f64 = 175, // n x 4 x f64 + nxv8f64 = 176, // n x 8 x f64 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv1f16, LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64, @@ -257,20 +265,20 @@ namespace llvm { FIRST_VECTOR_VALUETYPE = v1i1, LAST_VECTOR_VALUETYPE = nxv8f64, - x86mmx = 171, // This is an X86 MMX value + x86mmx = 177, // This is an X86 MMX value - Glue = 172, // This glues nodes together during pre-RA sched + Glue = 178, // This glues nodes together during pre-RA sched - isVoid = 173, // This has no value + isVoid = 179, // This has no value - Untyped = 174, // This value takes a register, but has + Untyped = 180, // This value takes a register, but has // unspecified type. The register class // will be determined by the opcode. - funcref = 175, // WebAssembly's funcref type - externref = 176, // WebAssembly's externref type - x86amx = 177, // This is an X86 AMX value - i64x8 = 178, // 8 Consecutive GPRs (AArch64) + funcref = 181, // WebAssembly's funcref type + externref = 182, // WebAssembly's externref type + x86amx = 183, // This is an X86 AMX value + i64x8 = 184, // 8 Consecutive GPRs (AArch64) FIRST_VALUETYPE = 1, // This is always the beginning of the list. LAST_VALUETYPE = i64x8, // This always remains at the end of the list. @@ -415,10 +423,11 @@ namespace llvm { /// Return true if this is a 256-bit vector type. 
bool is256BitVector() const { return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v16bf16 || - SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 || - SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 || - SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64 || - SimpleTy == MVT::v256i1); + SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 || + SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 || + SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64 || + SimpleTy == MVT::v256i1 || SimpleTy == MVT::v128i2 || + SimpleTy == MVT::v64i4); } /// Return true if this is a 512-bit vector type. @@ -517,6 +526,7 @@ namespace llvm { } MVT getVectorElementType() const { + // clang-format off switch (SimpleTy) { default: llvm_unreachable("Not a vector MVT!"); @@ -538,6 +548,8 @@ namespace llvm { case nxv16i1: case nxv32i1: case nxv64i1: return i1; + case v128i2: return i2; + case v64i4: return i4; case v1i8: case v2i8: case v4i8: @@ -640,7 +652,9 @@ namespace llvm { case nxv1bf16: case nxv2bf16: case nxv4bf16: - case nxv8bf16: return bf16; + case nxv8bf16: + case nxv16bf16: + case nxv32bf16: return bf16; case v1f32: case v2f32: case v3f32: @@ -677,6 +691,7 @@ namespace llvm { case nxv4f64: case nxv8f64: return f64; } + // clang-format on } /// Given a vector type, return the minimum number of elements it contains. @@ -705,6 +720,7 @@ namespace llvm { case v256f32: case v256f64: return 256; case v128i1: + case v128i2: case v128i8: case v128i16: case v128i32: @@ -714,6 +730,7 @@ namespace llvm { case v128f32: case v128f64: return 128; case v64i1: + case v64i4: case v64i8: case v64i16: case v64i32: @@ -738,7 +755,8 @@ namespace llvm { case nxv32i16: case nxv32i32: case nxv32i64: - case nxv32f16: return 32; + case nxv32f16: + case nxv32bf16: return 32; case v16i1: case v16i8: case v16i16: @@ -754,6 +772,7 @@ namespace llvm { case nxv16i32: case nxv16i64: case nxv16f16: + case nxv16bf16: case nxv16f32: return 16; case v8i1: case v8i8: @@ -883,8 +902,10 @@ namespace llvm { case i1: case v1i1: return TypeSize::Fixed(1); case nxv1i1: return TypeSize::Scalable(1); + case i2: case v2i1: return TypeSize::Fixed(2); case nxv2i1: return TypeSize::Scalable(2); + case i4: case v4i1: return TypeSize::Fixed(4); case nxv4i1: return TypeSize::Scalable(4); case i8 : @@ -977,6 +998,8 @@ namespace llvm { case v7i32: case v7f32: return TypeSize::Fixed(224); case v256i1: + case v128i2: + case v64i4: case v32i8: case v16i16: case v8i32: @@ -990,6 +1013,7 @@ namespace llvm { case nxv8i32: case nxv4i64: case nxv16f16: + case nxv16bf16: case nxv8f32: case nxv4f64: return TypeSize::Scalable(256); case i64x8: @@ -1007,6 +1031,7 @@ namespace llvm { case nxv16i32: case nxv8i64: case nxv32f16: + case nxv32bf16: case nxv16f32: case nxv8f64: return TypeSize::Scalable(512); case v1024i1: @@ -1078,6 +1103,12 @@ namespace llvm { return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()}; } + // Return the number of bytes overwritten by a store of this value type or + // this value type's element type in the case of a vector. + uint64_t getScalarStoreSize() const { + return getScalarType().getStoreSize().getFixedSize(); + } + /// Return the number of bits overwritten by a store of the specified value /// type. 
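As a sanity check on the new i2/i4 vector types, here is a minimal sketch (not part of the patch; it only assumes the headers in this tree) of how they behave under the MVT query helpers, including the getScalarStoreSize() addition above:

#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  llvm::MVT VT = llvm::MVT::v128i2;               // one of the new i2 vectors
  assert(VT.getVectorElementType() == llvm::MVT::i2);
  assert(VT.getVectorNumElements() == 128);
  assert(VT.getSizeInBits() == 256);              // same bit width as v256i1
  assert(VT.getScalarStoreSize() == 1);           // a 2-bit scalar still stores as one byte
  return 0;
}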
/// @@ -1165,6 +1196,10 @@ namespace llvm { return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE); case 1: return MVT::i1; + case 2: + return MVT::i2; + case 4: + return MVT::i4; case 8: return MVT::i8; case 16: @@ -1179,6 +1214,7 @@ namespace llvm { } static MVT getVectorVT(MVT VT, unsigned NumElements) { + // clang-format off switch (VT.SimpleTy) { default: break; @@ -1195,6 +1231,12 @@ namespace llvm { if (NumElements == 512) return MVT::v512i1; if (NumElements == 1024) return MVT::v1024i1; break; + case MVT::i2: + if (NumElements == 128) return MVT::v128i2; + break; + case MVT::i4: + if (NumElements == 64) return MVT::v64i4; + break; case MVT::i8: if (NumElements == 1) return MVT::v1i8; if (NumElements == 2) return MVT::v2i8; @@ -1309,6 +1351,7 @@ namespace llvm { break; } return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE); + // clang-format on } static MVT getScalableVectorVT(MVT VT, unsigned NumElements) { @@ -1370,6 +1413,8 @@ namespace llvm { if (NumElements == 2) return MVT::nxv2bf16; if (NumElements == 4) return MVT::nxv4bf16; if (NumElements == 8) return MVT::nxv8bf16; + if (NumElements == 16) return MVT::nxv16bf16; + if (NumElements == 32) return MVT::nxv32bf16; break; case MVT::f32: if (NumElements == 1) return MVT::nxv1f32; diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 753b1998c40c..8079aa436933 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -571,6 +571,33 @@ inline unsigned countPopulation(T Value) { return detail::PopulationCounter::count(Value); } +/// Return true if the argument contains a non-empty sequence of ones with the +/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. +/// If true, \p MaskIdx will specify the index of the lowest set bit and \p +/// MaskLen is updated to specify the length of the mask, else neither are +/// updated. +inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx, + unsigned &MaskLen) { + if (!isShiftedMask_32(Value)) + return false; + MaskIdx = countTrailingZeros(Value); + MaskLen = countPopulation(Value); + return true; +} + +/// Return true if the argument contains a non-empty sequence of ones with the +/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index +/// of the lowest set bit and \p MaskLen is updated to specify the length of the +/// mask, else neither are updated. +inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx, + unsigned &MaskLen) { + if (!isShiftedMask_64(Value)) + return false; + MaskIdx = countTrailingZeros(Value); + MaskLen = countPopulation(Value); + return true; +} + /// Compile time Log2. /// Valid only for positive powers of two. template constexpr inline size_t CTLog2() { @@ -680,7 +707,7 @@ constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { /// Returns the next power of two (in 64-bits) that is strictly greater than A. /// Returns zero on overflow. -inline uint64_t NextPowerOf2(uint64_t A) { +constexpr inline uint64_t NextPowerOf2(uint64_t A) { A |= (A >> 1); A |= (A >> 2); A |= (A >> 4); @@ -708,27 +735,34 @@ inline uint64_t PowerOf2Ceil(uint64_t A) { /// Returns the next integer (mod 2**64) that is greater than or equal to /// \p Value and is a multiple of \p Align. \p Align must be non-zero. /// -/// If non-zero \p Skew is specified, the return value will be a minimal -/// integer that is greater than or equal to \p Value and equal to -/// \p Align * N + \p Skew for some integer N. 
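A small usage sketch, not from the patch itself, of the isShiftedMask_32 overload added above; the mask value is arbitrary:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  unsigned MaskIdx = 0, MaskLen = 0;
  // 0x0000FF00 has ones exactly in bits [8, 16).
  assert(llvm::isShiftedMask_32(0x0000FF00U, MaskIdx, MaskLen));
  assert(MaskIdx == 8 && MaskLen == 8);
  // A non-contiguous value fails and leaves MaskIdx/MaskLen untouched.
  assert(!llvm::isShiftedMask_32(0x0000FF0FU, MaskIdx, MaskLen));
  return 0;
}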
If \p Skew is larger than -/// \p Align, its value is adjusted to '\p Skew mod \p Align'. -/// /// Examples: /// \code /// alignTo(5, 8) = 8 /// alignTo(17, 8) = 24 /// alignTo(~0LL, 8) = 0 /// alignTo(321, 255) = 510 +/// \endcode +inline uint64_t alignTo(uint64_t Value, uint64_t Align) { + assert(Align != 0u && "Align can't be 0."); + return (Value + Align - 1) / Align * Align; +} + +/// If non-zero \p Skew is specified, the return value will be a minimal integer +/// that is greater than or equal to \p Size and equal to \p A * N + \p Skew for +/// some integer N. If \p Skew is larger than \p A, its value is adjusted to '\p +/// Skew mod \p A'. \p Align must be non-zero. /// +/// Examples: +/// \code /// alignTo(5, 8, 7) = 7 /// alignTo(17, 8, 1) = 17 /// alignTo(~0LL, 8, 3) = 3 /// alignTo(321, 255, 42) = 552 /// \endcode -inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { +inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) { assert(Align != 0u && "Align can't be 0."); Skew %= Align; - return (Value + Align - 1 - Skew) / Align * Align + Skew; + return alignTo(Value - Skew, Align) + Skew; } /// Returns the next integer (mod 2**64) that is greater than or equal to @@ -879,7 +913,7 @@ extern const float huge_valf; /// Add two signed integers, computing the two's complement truncated result, -/// returning true if overflow occured. +/// returning true if overflow occurred. template std::enable_if_t::value, T> AddOverflow(T X, T Y, T &Result) { #if __has_builtin(__builtin_add_overflow) diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h index 04caf5eac961..ff113f9b44c4 100644 --- a/llvm/include/llvm/Support/Parallel.h +++ b/llvm/include/llvm/Support/Parallel.h @@ -193,11 +193,11 @@ void parallelSort(RandomAccessIterator Start, RandomAccessIterator End, llvm::sort(Start, End, Comp); } -void parallelForEachN(size_t Begin, size_t End, function_ref Fn); +void parallelFor(size_t Begin, size_t End, function_ref Fn); template void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) { - parallelForEachN(0, End - Begin, [&](size_t I) { Fn(Begin[I]); }); + parallelFor(0, End - Begin, [&](size_t I) { Fn(Begin[I]); }); } template -#include namespace llvm { namespace sys { diff --git a/llvm/include/llvm/Support/PluginLoader.h b/llvm/include/llvm/Support/PluginLoader.h index 95c087f03d9b..bdd36366d1cf 100644 --- a/llvm/include/llvm/Support/PluginLoader.h +++ b/llvm/include/llvm/Support/PluginLoader.h @@ -31,9 +31,9 @@ namespace llvm { #ifndef DONT_GET_PLUGIN_LOADER_OPTION // This causes operator= above to be invoked for every -load option. - static cl::opt > - LoadOpt("load", cl::ZeroOrMore, cl::value_desc("pluginfilename"), - cl::desc("Load the specified plugin")); + static cl::opt> + LoadOpt("load", cl::value_desc("pluginfilename"), + cl::desc("Load the specified plugin")); #endif } diff --git a/llvm/include/llvm/Support/Printable.h b/llvm/include/llvm/Support/Printable.h index 6403c32aad67..8e76f01f6ba2 100644 --- a/llvm/include/llvm/Support/Printable.h +++ b/llvm/include/llvm/Support/Printable.h @@ -24,12 +24,12 @@ class raw_ostream; /// This class is useful to construct print helpers for raw_ostream. /// /// Example: -/// Printable PrintRegister(unsigned Register) { +/// Printable printRegister(unsigned Register) { /// return Printable([Register](raw_ostream &OS) { /// OS << getRegisterName(Register); -/// } +/// }); /// } -/// ... OS << PrintRegister(Register); ... +/// ... OS << printRegister(Register); ... 
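The skew arithmetic in the alignTo overloads above is easy to misread; this illustrative check reuses the exact values from the doc comment:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(17, 8) == 24);          // plain overload
  assert(llvm::alignTo(5, 8, 7) == 7);         // 8 * 0 + 7
  assert(llvm::alignTo(17, 8, 1) == 17);       // 8 * 2 + 1
  assert(llvm::alignTo(321, 255, 42) == 552);  // 255 * 2 + 42
  return 0;
}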
/// /// Implementation note: Ideally this would just be a typedef, but doing so /// leads to operator << being ambiguous as function has matching constructors @@ -47,6 +47,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Printable &P) { return OS; } -} +} // namespace llvm #endif diff --git a/llvm/include/llvm/Support/Process.h b/llvm/include/llvm/Support/Process.h index ee03efeed9b2..9f56bd9b6e61 100644 --- a/llvm/include/llvm/Support/Process.h +++ b/llvm/include/llvm/Support/Process.h @@ -25,7 +25,6 @@ #define LLVM_SUPPORT_PROCESS_H #include "llvm/ADT/Optional.h" -#include "llvm/Support/AllocatorBase.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Error.h" diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index f91fca1c4464..4cb55c42c377 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -14,7 +14,6 @@ #define LLVM_SUPPORT_PROGRAM_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" @@ -24,6 +23,7 @@ #include <system_error> namespace llvm { +class BitVector; namespace sys { /// This is the OS-specific separator for PATH like environment variables: diff --git a/llvm/include/llvm/Support/RISCVISAInfo.h b/llvm/include/llvm/Support/RISCVISAInfo.h index 7fa0e6ee3acf..eac6cc0925fb 100644 --- a/llvm/include/llvm/Support/RISCVISAInfo.h +++ b/llvm/include/llvm/Support/RISCVISAInfo.h @@ -66,6 +66,7 @@ public: bool hasExtension(StringRef Ext) const; std::string toString() const; std::vector<std::string> toFeatureVector() const; + StringRef computeDefaultABI() const; static bool isSupportedExtensionFeature(StringRef Ext); static bool isSupportedExtension(StringRef Ext); @@ -89,6 +90,7 @@ private: Error checkDependency(); void updateImplication(); + void updateCombination(); void updateFLen(); void updateMinVLen(); void updateMaxELen(); diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 33a5d3efffee..3dd962586c36 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -93,8 +93,8 @@ private: /// running in multithreaded mode. template <bool mt_only> class SmartRWMutex { // shared_mutex (C++17) is more efficient than shared_timed_mutex (C++14) - // on Windows and always available on MSVC. -#if defined(_MSC_VER) || __cplusplus > 201402L + // on Windows and always available on MSVC except with libc++. +#if (defined(_MSC_VER) && !defined(_LIBCPP_VERSION)) || __cplusplus > 201402L std::shared_mutex impl; #else #if !defined(LLVM_USE_RW_MUTEX_IMPL) diff --git a/llvm/include/llvm/Support/SHA1.h b/llvm/include/llvm/Support/SHA1.h index efd8513cc201..ae6d62aed723 100644 --- a/llvm/include/llvm/Support/SHA1.h +++ b/llvm/include/llvm/Support/SHA1.h @@ -36,17 +36,17 @@ public: /// Digest more data. void update(StringRef Str); - /// Return a reference to the current raw 160-bits SHA1 for the digested data + /// Return the current raw 160-bits SHA1 for the digested data /// since the last call to init(). This call will add data to the internal /// state and as such is not suited for getting an intermediate result /// (see result()). - StringRef final(); + std::array<uint8_t, 20> final(); - /// Return a reference to the current raw 160-bits SHA1 for the digested data + /// Return the current raw 160-bits SHA1 for the digested data /// since the last call to init().
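With final() now returning the digest by value, a caller looks roughly like the following sketch (not part of the patch; llvm::toHex is used only for display):

#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/SHA1.h"
#include <cstdio>

int main() {
  llvm::SHA1 Hasher;
  Hasher.update("abc");
  std::array<uint8_t, 20> Digest = Hasher.final();  // digest returned by value now
  std::printf("%s\n", llvm::toHex(Digest).c_str());
  return 0;
}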
This is suitable for getting the SHA1 at /// any time without invalidating the internal state so that more calls can be /// made into update. - StringRef result(); + std::array<uint8_t, 20> result(); /// Returns a raw 160-bit SHA1 hash for the given data. static std::array<uint8_t, 20> hash(ArrayRef<uint8_t> Data); @@ -68,14 +68,13 @@ private: uint8_t BufferOffset; } InternalState; - // Internal copy of the hash, populated and accessed on calls to result() - uint32_t HashResult[HASH_LENGTH / 4]; - // Helper void writebyte(uint8_t data); void hashBlock(); void addUncounted(uint8_t data); void pad(); + + void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult); }; } // end llvm namespace diff --git a/llvm/include/llvm/Support/SHA256.h b/llvm/include/llvm/Support/SHA256.h index 9e295b0b9fae..68b32c7b4834 100644 --- a/llvm/include/llvm/Support/SHA256.h +++ b/llvm/include/llvm/Support/SHA256.h @@ -43,17 +43,17 @@ public: /// Digest more data. void update(StringRef Str); - /// Return a reference to the current raw 256-bits SHA256 for the digested + /// Return the current raw 256-bits SHA256 for the digested /// data since the last call to init(). This call will add data to the /// internal state and as such is not suited for getting an intermediate /// result (see result()). - StringRef final(); + std::array<uint8_t, 32> final(); - /// Return a reference to the current raw 256-bits SHA256 for the digested + /// Return the current raw 256-bits SHA256 for the digested /// data since the last call to init(). This is suitable for getting the /// SHA256 at any time without invalidating the internal state so that more /// calls can be made into update. - StringRef result(); + std::array<uint8_t, 32> result(); /// Returns a raw 256-bit SHA256 hash for the given data. static std::array<uint8_t, 32> hash(ArrayRef<uint8_t> Data); @@ -75,14 +75,13 @@ private: uint8_t BufferOffset; } InternalState; - // Internal copy of the hash, populated and accessed on calls to result() - uint32_t HashResult[HASH_LENGTH / 4]; - // Helper void writebyte(uint8_t data); void hashBlock(); void addUncounted(uint8_t data); void pad(); + + void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult); }; } // namespace llvm diff --git a/llvm/include/llvm/Support/ScopedPrinter.h b/llvm/include/llvm/Support/ScopedPrinter.h index 6b5daf710c9f..c9eabfb3788c 100644 --- a/llvm/include/llvm/Support/ScopedPrinter.h +++ b/llvm/include/llvm/Support/ScopedPrinter.h @@ -81,7 +81,6 @@ struct FlagEntry { }; raw_ostream &operator<<(raw_ostream &OS, const HexNumber &Value); -std::string to_hexString(uint64_t Value, bool UpperCase = true); template <typename T> std::string to_string(const T &Value) { std::string number; @@ -95,7 +94,7 @@ std::string enumToString(T Value, ArrayRef<EnumEntry<TEnum>> EnumValues) { for (const EnumEntry<TEnum> &EnumItem : EnumValues) if (EnumItem.Value == Value) return std::string(EnumItem.AltName); - return to_hexString(Value, false); + return utohexstr(Value, true); } class ScopedPrinter { @@ -107,7 +106,7 @@ public: ScopedPrinter(raw_ostream &OS, ScopedPrinterKind Kind = ScopedPrinterKind::Base) - : OS(OS), IndentLevel(0), Kind(Kind) {} + : OS(OS), Kind(Kind) {} ScopedPrinterKind getKind() const { return Kind; } @@ -498,7 +497,7 @@ private: } raw_ostream &OS; - int IndentLevel; + int IndentLevel = 0; StringRef Prefix; ScopedPrinterKind Kind; }; diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h index 44f5a750ff5c..937e0572d4a7 100644 --- a/llvm/include/llvm/Support/Signals.h +++ b/llvm/include/llvm/Support/Signals.h @@ -14,6 +14,7 @@ #ifndef LLVM_SUPPORT_SIGNALS_H #define LLVM_SUPPORT_SIGNALS_H +#include <cstdint> #include <string> namespace llvm { diff --git
a/llvm/include/llvm/Support/Signposts.h b/llvm/include/llvm/Support/Signposts.h index dabbba6f89d1..37089bd1c17d 100644 --- a/llvm/include/llvm/Support/Signposts.h +++ b/llvm/include/llvm/Support/Signposts.h @@ -16,11 +16,11 @@ #ifndef LLVM_SUPPORT_SIGNPOSTS_H #define LLVM_SUPPORT_SIGNPOSTS_H -#include "llvm/ADT/StringRef.h" #include namespace llvm { class SignpostEmitterImpl; +class StringRef; /// Manages the emission of signposts into the recording method supported by /// the OS. diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h index 28716b42f4ab..eced4574c82e 100644 --- a/llvm/include/llvm/Support/SourceMgr.h +++ b/llvm/include/llvm/Support/SourceMgr.h @@ -100,6 +100,9 @@ public: SourceMgr &operator=(SourceMgr &&) = default; ~SourceMgr() = default; + /// Return the include directories of this source manager. + ArrayRef getIncludeDirs() const { return IncludeDirectories; } + void setIncludeDirs(const std::vector &Dirs) { IncludeDirectories = Dirs; } @@ -147,6 +150,22 @@ public: return Buffers.size(); } + /// Takes the source buffers from the given source manager and append them to + /// the current manager. `MainBufferIncludeLoc` is an optional include + /// location to attach to the main buffer of `SrcMgr` after it gets moved to + /// the current manager. + void takeSourceBuffersFrom(SourceMgr &SrcMgr, + SMLoc MainBufferIncludeLoc = SMLoc()) { + if (SrcMgr.Buffers.empty()) + return; + + size_t OldNumBuffers = getNumBuffers(); + std::move(SrcMgr.Buffers.begin(), SrcMgr.Buffers.end(), + std::back_inserter(Buffers)); + SrcMgr.Buffers.clear(); + Buffers[OldNumBuffers].IncludeLoc = MainBufferIncludeLoc; + } + /// Search for a file with the specified name in the current directory or in /// one of the IncludeDirs. /// @@ -156,6 +175,17 @@ public: unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc, std::string &IncludedFile); + /// Search for a file with the specified name in the current directory or in + /// one of the IncludeDirs, and try to open it **without** adding to the + /// SourceMgr. If the opened file is intended to be added to the source + /// manager, prefer `AddIncludeFile` instead. + /// + /// If no file is found, this returns an Error, otherwise it returns the + /// buffer of the stacked file. The full path to the included file can be + /// found in \p IncludedFile. + ErrorOr> + OpenIncludeFile(const std::string &Filename, std::string &IncludedFile); + /// Return the ID of the buffer containing the specified location. /// /// 0 is returned if the buffer is not found. diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 428cbb44705d..8df7ced0029d 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -322,6 +322,9 @@ HANDLE_TARGET_OPCODE(G_BITCAST) /// Generic freeze. HANDLE_TARGET_OPCODE(G_FREEZE) +// INTRINSIC fptrunc_round intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_FPTRUNC_ROUND) + /// INTRINSIC trunc intrinsic. HANDLE_TARGET_OPCODE(G_INTRINSIC_TRUNC) @@ -617,6 +620,9 @@ HANDLE_TARGET_OPCODE(G_FABS) /// f64) is allowed. HANDLE_TARGET_OPCODE(G_FCOPYSIGN) +/// Generic test for floating-point class. +HANDLE_TARGET_OPCODE(G_IS_FPCLASS) + /// Generic FP canonicalize value. 
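A hypothetical use of the new SourceMgr::takeSourceBuffersFrom above; the buffer contents here are invented:

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"

int main() {
  llvm::SourceMgr Inner, Outer;
  Inner.AddNewSourceBuffer(
      llvm::MemoryBuffer::getMemBuffer("some included text"), llvm::SMLoc());
  // Moves Inner's buffers to the back of Outer's list; Inner ends up empty.
  Outer.takeSourceBuffersFrom(Inner);
  return 0;
}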
HANDLE_TARGET_OPCODE(G_FCANONICALIZE) diff --git a/llvm/include/llvm/Support/TargetParser.h b/llvm/include/llvm/Support/TargetParser.h index 02a8d72483db..c3a6cceaee6b 100644 --- a/llvm/include/llvm/Support/TargetParser.h +++ b/llvm/include/llvm/Support/TargetParser.h @@ -14,14 +14,14 @@ #ifndef LLVM_SUPPORT_TARGETPARSER_H #define LLVM_SUPPORT_TARGETPARSER_H +#include "llvm/ADT/StringRef.h" +#include // FIXME: vector is used because that's what clang uses for subtarget feature // lists, but SmallVector would probably be better -#include "llvm/Support/RISCVISAInfo.h" #include namespace llvm { -class StringRef; template class SmallVectorImpl; class Triple; @@ -86,6 +86,7 @@ enum GPUKind : uint32_t { GK_GFX909 = 65, GK_GFX90A = 66, GK_GFX90C = 67, + GK_GFX940 = 68, GK_GFX1010 = 71, GK_GFX1011 = 72, @@ -97,9 +98,15 @@ enum GPUKind : uint32_t { GK_GFX1033 = 78, GK_GFX1034 = 79, GK_GFX1035 = 80, + GK_GFX1036 = 81, + + GK_GFX1100 = 90, + GK_GFX1101 = 91, + GK_GFX1102 = 92, + GK_GFX1103 = 93, GK_AMDGCN_FIRST = GK_GFX600, - GK_AMDGCN_LAST = GK_GFX1035, + GK_AMDGCN_LAST = GK_GFX1103, }; /// Instruction set architecture version. @@ -170,7 +177,6 @@ void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64); void fillValidTuneCPUArchList(SmallVectorImpl &Values, bool IsRV64); bool getCPUFeaturesExceptStdExt(CPUKind Kind, std::vector &Features); StringRef resolveTuneCPUAlias(StringRef TuneCPU, bool IsRV64); -StringRef computeDefaultABIFromArch(const llvm::RISCVISAInfo &ISAInfo); } // namespace RISCV diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index 868dd2819f83..5e67a312d5c7 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -13,26 +13,42 @@ #ifndef LLVM_SUPPORT_THREADPOOL_H #define LLVM_SUPPORT_THREADPOOL_H +#include "llvm/ADT/DenseMap.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/RWMutex.h" #include "llvm/Support/Threading.h" #include "llvm/Support/thread.h" #include #include +#include #include #include #include -#include #include namespace llvm { +class ThreadPoolTaskGroup; + /// A ThreadPool for asynchronous parallel execution on a defined number of /// threads. /// /// The pool keeps a vector of threads alive, waiting on a condition variable /// for some work to become available. +/// +/// It is possible to reuse one thread pool for different groups of tasks +/// by grouping tasks using ThreadPoolTaskGroup. All tasks are processed using +/// the same queue, but it is possible to wait only for a specific group of +/// tasks to finish. +/// +/// It is also possible for worker threads to submit new tasks and wait for +/// them. Note that this may result in a deadlock in cases such as when a task +/// (directly or indirectly) tries to wait for its own completion, or when all +/// available threads are used up by tasks waiting for a task that has no thread +/// left to run on (this includes waiting on the returned future). It should be +/// generally safe to wait() for a group as long as groups do not form a cycle. class ThreadPool { public: /// Construct a pool using the hardware strategy \p S for mapping hardware @@ -47,23 +63,47 @@ public: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. 
template - inline auto async(Function &&F, Args &&...ArgList) { + auto async(Function &&F, Args &&...ArgList) { auto Task = std::bind(std::forward(F), std::forward(ArgList)...); return async(std::move(Task)); } + /// Overload, task will be in the given task group. + template + auto async(ThreadPoolTaskGroup &Group, Function &&F, Args &&...ArgList) { + auto Task = + std::bind(std::forward(F), std::forward(ArgList)...); + return async(Group, std::move(Task)); + } + /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. template auto async(Func &&F) -> std::shared_future { - return asyncImpl(std::function(std::forward(F))); + return asyncImpl(std::function(std::forward(F)), + nullptr); + } + + template + auto async(ThreadPoolTaskGroup &Group, Func &&F) + -> std::shared_future { + return asyncImpl(std::function(std::forward(F)), + &Group); } /// Blocking wait for all the threads to complete and the queue to be empty. /// It is an error to try to add new tasks while blocking on this call. + /// Calling wait() from a task would deadlock waiting for itself. void wait(); + /// Blocking wait for only all the threads in the given group to complete. + /// It is possible to wait even inside a task, but waiting (directly or + /// indirectly) on itself will deadlock. If called from a task running on a + /// worker thread, the call may process pending tasks while waiting in order + /// not to waste the thread. + void wait(ThreadPoolTaskGroup &Group); + // TODO: misleading legacy name warning! // Returns the maximum number of worker threads in the pool, not the current // number of threads! @@ -98,12 +138,15 @@ private: std::move(F)}; } - bool workCompletedUnlocked() { return !ActiveThreads && Tasks.empty(); } + /// Returns true if all tasks in the given group have finished (nullptr means + /// all tasks regardless of their group). QueueLock must be locked. + bool workCompletedUnlocked(ThreadPoolTaskGroup *Group) const; /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. template - std::shared_future asyncImpl(std::function Task) { + std::shared_future asyncImpl(std::function Task, + ThreadPoolTaskGroup *Group) { #if LLVM_ENABLE_THREADS /// Wrap the Task in a std::function that sets the result of the @@ -117,7 +160,7 @@ private: // Don't allow enqueueing after disabling the pool assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); - Tasks.push(std::move(R.first)); + Tasks.emplace_back(std::make_pair(std::move(R.first), Group)); requestedThreads = ActiveThreads + Tasks.size(); } QueueCondition.notify_one(); @@ -130,7 +173,7 @@ private: auto Future = std::async(std::launch::deferred, std::move(Task)).share(); // Wrap the future so that both ThreadPool::wait() can operate and the // returned future can be sync'ed on. - Tasks.push([Future]() { Future.get(); }); + Tasks.emplace_back(std::make_pair([Future]() { Future.get(); }, Group)); return Future; #endif } @@ -139,25 +182,29 @@ private: // Grow to ensure that we have at least `requested` Threads, but do not go // over MaxThreadCount. void grow(int requested); + + void processTasks(ThreadPoolTaskGroup *WaitingForGroup); #endif /// Threads in flight std::vector Threads; /// Lock protecting access to the Threads vector. - mutable std::mutex ThreadsLock; + mutable llvm::sys::RWMutex ThreadsLock; /// Tasks waiting for execution in the pool. 
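For reference, a minimal sketch of the async/future flow documented above; the callable is invented, and tasks return void through this interface:

#include "llvm/Support/ThreadPool.h"
#include <atomic>

int main() {
  llvm::ThreadPool Pool;
  std::atomic<int> Result{0};
  auto Future = Pool.async([&Result] { Result = 42; });
  Future.wait();  // the returned future is non-blocking on destruction
  return Result == 42 ? 0 : 1;
}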
- std::queue> Tasks; + std::deque, ThreadPoolTaskGroup *>> Tasks; /// Locking and signaling for accessing the Tasks queue. std::mutex QueueLock; std::condition_variable QueueCondition; - /// Signaling for job completion + /// Signaling for job completion (all tasks or all tasks in a group). std::condition_variable CompletionCondition; /// Keep track of the number of thread actually busy unsigned ActiveThreads = 0; + /// Number of threads active for tasks in the given group (only non-zero). + DenseMap ActiveGroups; #if LLVM_ENABLE_THREADS // avoids warning for unused variable /// Signal for the destruction of the pool, asking thread to exit. @@ -169,6 +216,34 @@ private: /// Maximum number of threads to potentially grow this pool to. const unsigned MaxThreadCount; }; -} + +/// A group of tasks to be run on a thread pool. Thread pool tasks in different +/// groups can run on the same threadpool but can be waited for separately. +/// It is even possible for tasks of one group to submit and wait for tasks +/// of another group, as long as this does not form a loop. +class ThreadPoolTaskGroup { +public: + /// The ThreadPool argument is the thread pool to forward calls to. + ThreadPoolTaskGroup(ThreadPool &Pool) : Pool(Pool) {} + + /// Blocking destructor: will wait for all the tasks in the group to complete + /// by calling ThreadPool::wait(). + ~ThreadPoolTaskGroup() { wait(); } + + /// Calls ThreadPool::async() for this group. + template + inline auto async(Function &&F, Args &&...ArgList) { + return Pool.async(*this, std::forward(F), + std::forward(ArgList)...); + } + + /// Calls ThreadPool::wait() for this group. + void wait() { Pool.wait(*this); } + +private: + ThreadPool &Pool; +}; + +} // namespace llvm #endif // LLVM_SUPPORT_THREADPOOL_H diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 94de950d4470..1e7e5f7b8f50 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -15,13 +15,10 @@ #define LLVM_SUPPORT_THREADING_H #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/FunctionExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" #include // So we can check the C++ standard lib macros. -#include #if defined(_MSC_VER) // MSVC's call_once implementation worked since VS 2015, which is the minimum @@ -236,15 +233,20 @@ bool llvm_is_multithreaded(); unsigned get_cpus(); enum class ThreadPriority { + /// Lower the current thread's priority as much as possible. Can be used + /// for long-running tasks that are not time critical; more energy- + /// efficient than Low. Background = 0, - Default = 1, + + /// Lower the current thread's priority such that it does not affect + /// foreground tasks significantly. This is a good default for long- + /// running, latency-insensitive tasks to make sure cpu is not hogged + /// by this task. + Low = 1, + + /// Restore the current thread's priority to default scheduling priority. + Default = 2, }; - /// If priority is Background tries to lower current threads priority such - /// that it does not affect foreground tasks significantly. Can be used for - /// long-running, latency-insensitive tasks to make sure cpu is not hogged by - /// this task. - /// If the priority is default tries to restore current threads priority to - /// default scheduling priority. 
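A minimal sketch, not part of the patch, of the task-group mechanics described above: two groups share one pool but are waited on independently:

#include "llvm/Support/ThreadPool.h"

int main() {
  llvm::ThreadPool Pool;
  llvm::ThreadPoolTaskGroup GroupA(Pool);
  llvm::ThreadPoolTaskGroup GroupB(Pool);
  GroupA.async([] { /* work for A */ });
  GroupB.async([] { /* work for B */ });
  GroupA.wait();  // blocks only until A's tasks are done
  Pool.wait();    // drains everything, including B
  return 0;
}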
enum class SetThreadPriorityResult { FAILURE, SUCCESS }; SetThreadPriorityResult set_thread_priority(ThreadPriority Priority); } diff --git a/llvm/include/llvm/Support/TrigramIndex.h b/llvm/include/llvm/Support/TrigramIndex.h index f772deca0301..0bfac498393f 100644 --- a/llvm/include/llvm/Support/TrigramIndex.h +++ b/llvm/include/llvm/Support/TrigramIndex.h @@ -27,12 +27,12 @@ #define LLVM_SUPPORT_TRIGRAMINDEX_H #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include #include #include namespace llvm { +class StringRef; class TrigramIndex { public: diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 6bddb602e8c1..0b40e970e8c9 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -362,12 +362,31 @@ public: LinearPolySize::get(getKnownMinValue() / RHS, isScalable())); } + LeafTy multiplyCoefficientBy(ScalarTy RHS) const { + return static_cast( + LinearPolySize::get(getKnownMinValue() * RHS, isScalable())); + } + LeafTy coefficientNextPowerOf2() const { return static_cast(LinearPolySize::get( static_cast(llvm::NextPowerOf2(getKnownMinValue())), isScalable())); } + /// Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) + /// will result in a value whose size matches our own. + bool hasKnownScalarFactor(const LinearPolySize &RHS) const { + return isScalable() == RHS.isScalable() && + getKnownMinValue() % RHS.getKnownMinValue() == 0; + } + + /// Returns a value X where RHS.multiplyCoefficientBy(X) will result in a + /// value whose size matches our own. + ScalarTy getKnownScalarFactor(const LinearPolySize &RHS) const { + assert(hasKnownScalarFactor(RHS) && "Expected RHS to be a known factor!"); + return getKnownMinValue() / RHS.getKnownMinValue(); + } + /// Printing function. void print(raw_ostream &OS) const { if (isScalable()) diff --git a/llvm/include/llvm/Support/Unicode.h b/llvm/include/llvm/Support/Unicode.h index ca17bba2fbb4..729775431e16 100644 --- a/llvm/include/llvm/Support/Unicode.h +++ b/llvm/include/llvm/Support/Unicode.h @@ -14,6 +14,10 @@ #ifndef LLVM_SUPPORT_UNICODE_H #define LLVM_SUPPORT_UNICODE_H +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include + namespace llvm { class StringRef; @@ -30,19 +34,13 @@ enum ColumnWidthErrors { /// terminal, so we define the semantic that should be suitable for generic case /// of a terminal capable to output Unicode characters. /// -/// All characters from the Unicode code point range are considered printable -/// except for: -/// * C0 and C1 control character ranges; -/// * default ignorable code points as per 5.21 of -/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf -/// except for U+00AD SOFT HYPHEN, as it's actually displayed on most -/// terminals; -/// * format characters (category = Cf); -/// * surrogates (category = Cs); -/// * unassigned characters (category = Cn). +/// Printable codepoints are those in the categories L, M, N, P, S and Zs /// \return true if the character is considered printable. bool isPrintable(int UCS); +// Formatting codepoints are codepoints in the Cf category. +bool isFormatting(int UCS); + /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy /// when output on a terminal ("character width"). This depends on the /// implementation of the terminal, and there's no standard definition of @@ -63,6 +61,30 @@ int columnWidthUTF8(StringRef Text); /// rules. 
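The scalar-factor helpers added to LinearPolySize above answer "how many times does RHS fit in this size"; an illustrative check using ElementCount, one of its instantiations:

#include "llvm/Support/TypeSize.h"
#include <cassert>

int main() {
  auto EC8 = llvm::ElementCount::getScalable(8);  // vscale x 8
  auto EC2 = llvm::ElementCount::getScalable(2);  // vscale x 2
  assert(EC8.hasKnownScalarFactor(EC2));
  assert(EC8.getKnownScalarFactor(EC2) == 4);     // 8 == 4 * 2
  // Fixed and scalable quantities never share a known factor.
  assert(!EC8.hasKnownScalarFactor(llvm::ElementCount::getFixed(2)));
  return 0;
}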
int foldCharSimple(int C); +/// Maps the name or the alias of a Unicode character to its associated +/// codepoints. +/// The names and aliases are derived from UnicodeData.txt and NameAliases.txt +/// For compatibility with the semantics of named character escape sequences in +/// C++, this mapping does an exact match sensitive to casing and spacing. +/// \return The codepoint of the corresponding character, if any. +Optional<char32_t> nameToCodepointStrict(StringRef Name); + +struct LooseMatchingResult { + char32_t CodePoint; + SmallString<64> Name; +}; + +Optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name); + +struct MatchForCodepointName { + std::string Name; + uint32_t Distance = 0; + char32_t Value = 0; +}; + +SmallVector<MatchForCodepointName> +nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount); + } // namespace unicode } // namespace sys } // namespace llvm diff --git a/llvm/include/llvm/Support/VersionTuple.h b/llvm/include/llvm/Support/VersionTuple.h index 1a1072d228f1..2020a5c06f56 100644 --- a/llvm/include/llvm/Support/VersionTuple.h +++ b/llvm/include/llvm/Support/VersionTuple.h @@ -17,11 +17,13 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/Optional.h" -#include "llvm/Support/HashBuilder.h" +#include "llvm/Support/Endian.h" #include <string> #include <tuple> namespace llvm { +template <typename HasherT, support::endianness Endianness> +class HashBuilderImpl; class raw_ostream; class StringRef; @@ -97,6 +99,12 @@ public: return *this; } + /// Return a version tuple that contains a different major version but + /// everything else is the same. + VersionTuple withMajorReplaced(unsigned NewMajor) const { + return VersionTuple(NewMajor, Minor, Subminor, Build); + } + /// Return a version tuple that contains only components that are non-zero. VersionTuple normalize() const { VersionTuple Result = *this; @@ -161,8 +169,8 @@ public: return !(X < Y); } - friend llvm::hash_code hash_value(const VersionTuple &VT) { - return llvm::hash_combine(VT.Major, VT.Minor, VT.Subminor, VT.Build); + friend hash_code hash_value(const VersionTuple &VT) { + return hash_combine(VT.Major, VT.Minor, VT.Subminor, VT.Build); } template <typename HasherT, support::endianness Endianness> diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index f5dde334b0a7..3c99b0d8efdb 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -22,6 +22,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" @@ -58,6 +59,17 @@ public: // FIXME: remove when files support multiple names bool IsVFSMapped = false; + /// Whether this entity has an external path different from the virtual path, + /// and the external path is exposed by leaking it through the abstraction. + /// For example, a RedirectingFileSystem will set this for paths where + /// UseExternalName is true. + /// + /// FIXME: Currently the external path is exposed by replacing the virtual + /// path in this Status object. Instead, we should leave the path in the + /// Status intact (matching the requested virtual path) - see + /// FileManager::getFileRef for how we plan to fix this.
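A usage sketch for the Unicode name lookup declared above; "LATIN SMALL LETTER A" is the UnicodeData.txt name of U+0061, and the strict variant is case- and spacing-sensitive by design:

#include "llvm/Support/Unicode.h"
#include <cassert>

int main() {
  using namespace llvm::sys::unicode;
  auto CP = nameToCodepointStrict("LATIN SMALL LETTER A");
  assert(CP && *CP == U'a');
  assert(!nameToCodepointStrict("latin small letter a"));  // wrong casing
  return 0;
}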
+ bool ExposesExternalVFSPath = false; + Status() = default; Status(const llvm::sys::fs::file_status &Status); Status(const Twine &Name, llvm::sys::fs::UniqueID UID, @@ -306,6 +318,28 @@ public: /// \returns success if \a path has been made absolute, otherwise a /// platform-specific error_code. virtual std::error_code makeAbsolute(SmallVectorImpl &Path) const; + + enum class PrintType { Summary, Contents, RecursiveContents }; + void print(raw_ostream &OS, PrintType Type = PrintType::Contents, + unsigned IndentLevel = 0) const { + printImpl(OS, Type, IndentLevel); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif + +protected: + virtual void printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "FileSystem\n"; + } + + void printIndent(raw_ostream &OS, unsigned IndentLevel) const { + for (unsigned i = 0; i < IndentLevel; ++i) + OS << " "; + } }; /// Gets an \p vfs::FileSystem for the 'real' file system, as seen by @@ -357,6 +391,8 @@ public: using const_iterator = FileSystemList::const_reverse_iterator; using reverse_iterator = FileSystemList::iterator; using const_reverse_iterator = FileSystemList::const_iterator; + using range = iterator_range; + using const_range = iterator_range; /// Get an iterator pointing to the most recently added file system. iterator overlays_begin() { return FSList.rbegin(); } @@ -373,6 +409,13 @@ public: /// Get an iterator pointing one-past the most recently added file system. reverse_iterator overlays_rend() { return FSList.end(); } const_reverse_iterator overlays_rend() const { return FSList.end(); } + + range overlays_range() { return llvm::reverse(FSList); } + const_range overlays_range() const { return llvm::reverse(FSList); } + +protected: + void printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const override; }; /// By default, this delegates all calls to the underlying file system. This @@ -436,6 +479,24 @@ struct NewInMemoryNodeInfo { Status makeStatus() const; }; +class NamedNodeOrError { + ErrorOr, const detail::InMemoryNode *>> + Value; + +public: + NamedNodeOrError(llvm::SmallString<128> Name, + const detail::InMemoryNode *Node) + : Value(std::make_pair(Name, Node)) {} + NamedNodeOrError(std::error_code EC) : Value(EC) {} + NamedNodeOrError(llvm::errc EC) : Value(EC) {} + + StringRef getName() const { return (*Value).first; } + explicit operator bool() const { return static_cast(Value); } + operator std::error_code() const { return Value.getError(); } + std::error_code getError() const { return Value.getError(); } + const detail::InMemoryNode *operator*() const { return (*Value).second; } +}; + } // namespace detail /// An in-memory file system. @@ -454,6 +515,14 @@ class InMemoryFileSystem : public FileSystem { Optional Type, Optional Perms, MakeNodeFn MakeNode); + /// Looks up the in-memory node for the path \param P. + /// If \param FollowFinalSymlink is true, the returned node is guaranteed to + /// not be a symlink and its path may differ from \param P. + detail::NamedNodeOrError lookupNode(const Twine &P, bool FollowFinalSymlink, + size_t SymlinkDepth = 0) const; + + class DirIterator; + public: explicit InMemoryFileSystem(bool UseNormalizedPaths = true); ~InMemoryFileSystem() override; @@ -471,18 +540,32 @@ public: Optional Perms = None); /// Add a hard link to a file. + /// /// Here hard links are not intended to be fully equivalent to the classical /// filesystem. 
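A short sketch (paths invented) of the new FileSystem::print entry point above, dumping an in-memory VFS recursively:

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::vfs::InMemoryFileSystem FS;
  FS.addFile("/a.txt", /*ModificationTime=*/0,
             llvm::MemoryBuffer::getMemBuffer("hello"));
  FS.print(llvm::errs(), llvm::vfs::FileSystem::PrintType::RecursiveContents);
  return 0;
}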
Both the hard link and the file share the same buffer and /// status (and thus have the same UniqueID). Because of this there is no way /// to distinguish between the link and the file after the link has been /// added. /// - /// The To path must be an existing file or a hardlink. The From file must not - /// have been added before. The To Path must not be a directory. The From Node - /// is added as a hard link which points to the resolved file of To Node. + /// The \param Target path must be an existing file or a hardlink. The + /// \param NewLink file must not have been added before. The \param Target + /// path must not be a directory. The \param NewLink node is added as a hard + /// link which points to the resolved file of \param Target node. /// \return true if the above condition is satisfied and hardlink was /// successfully created, false otherwise. - bool addHardLink(const Twine &From, const Twine &To); + bool addHardLink(const Twine &NewLink, const Twine &Target); + + /// Arbitrary max depth to search through symlinks. We can get into problems + /// if a link links to a link that links back to the link, for example. + static constexpr size_t MaxSymlinkDepth = 16; + + /// Add a symbolic link. Unlike a HardLink, because \param Target doesn't need + /// to refer to a file (or refer to anything, as it happens). Also, an + /// in-memory directory for \param Target isn't automatically created. + bool addSymbolicLink(const Twine &NewLink, const Twine &Target, + time_t ModificationTime, Optional User = None, + Optional Group = None, + Optional Perms = None); /// Add a buffer to the VFS with a path. The VFS does not own the buffer. /// If present, User, Group, Type and Perms apply to the newly-created file @@ -520,6 +603,10 @@ public: SmallVectorImpl &Output) const override; std::error_code isLocal(const Twine &Path, bool &Result) override; std::error_code setCurrentWorkingDirectory(const Twine &Path) override; + +protected: + void printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const override; }; /// Get a globally unique ID for a virtual file or directory. @@ -571,7 +658,10 @@ class RedirectingFileSystemParser; /// 'case-sensitive': /// 'use-external-names': /// 'overlay-relative': -/// 'fallthrough': +/// 'fallthrough': +/// 'redirecting-with': /// /// Virtual directories that list their contents are represented as /// \verbatim @@ -642,6 +732,20 @@ public: enum EntryKind { EK_Directory, EK_DirectoryRemap, EK_File }; enum NameKind { NK_NotSet, NK_External, NK_Virtual }; + /// The type of redirection to perform. + enum class RedirectKind { + /// Lookup the redirected path first (ie. the one specified in + /// 'external-contents') and if that fails "fallthrough" to a lookup of the + /// originally provided path. + Fallthrough, + /// Lookup the provided path first and if that fails, "fallback" to a + /// lookup of the redirected path. + Fallback, + /// Only lookup the redirected path, do not lookup the originally provided + /// path. + RedirectOnly + }; + /// A single file or directory in the VFS. class Entry { EntryKind Kind; @@ -776,17 +880,11 @@ private: friend class RedirectingFSDirIterImpl; friend class RedirectingFileSystemParser; - bool shouldUseExternalFS() const { return IsFallthrough; } - /// Canonicalize path by removing ".", "..", "./", components. This is /// a VFS request, do not bother about symlinks in the path components /// but canonicalize in order to perform the correct entry search. 
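And a sketch of the new addSymbolicLink described above, with invented names; note the target does not have to exist when the link is created:

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/VirtualFileSystem.h"

int main() {
  llvm::vfs::InMemoryFileSystem FS;
  FS.addFile("/real.txt", /*ModificationTime=*/0,
             llvm::MemoryBuffer::getMemBuffer("data"));
  FS.addSymbolicLink("/link.txt", "/real.txt", /*ModificationTime=*/0);
  auto Buf = FS.getBufferForFile("/link.txt");  // resolves through the link
  return Buf ? 0 : 1;
}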
std::error_code makeCanonical(SmallVectorImpl &Path) const; - /// Whether to fall back to the external file system when an operation fails - /// with the given error code on a path associated with the provided Entry. - bool shouldFallBackToExternalFS(std::error_code EC, Entry *E = nullptr) const; - /// Get the File status, or error, from the underlying external file system. /// This returns the status with the originally requested name, while looking /// up the entry using the canonical path. @@ -834,9 +932,9 @@ private: /// names of files. This global value is overridable on a per-file basis. bool UseExternalNames = true; - /// Whether to attempt a file lookup in external file system after it wasn't - /// found in VFS. - bool IsFallthrough = true; + /// Determines the lookups to perform, as well as their order. See + /// \c RedirectKind for details. + RedirectKind Redirection = RedirectKind::Fallthrough; /// @} RedirectingFileSystem(IntrusiveRefCntPtr ExternalFS); @@ -891,15 +989,19 @@ public: StringRef getExternalContentsPrefixDir() const; + /// Sets the redirection kind to \c Fallthrough if true or \c RedirectOnly + /// otherwise. Will removed in the future, use \c setRedirection instead. void setFallthrough(bool Fallthrough); + void setRedirection(RedirectingFileSystem::RedirectKind Kind); + std::vector getRoots() const; - void dump(raw_ostream &OS) const; - void dumpEntry(raw_ostream &OS, Entry *E, int NumSpaces = 0) const; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - LLVM_DUMP_METHOD void dump() const; -#endif + void printEntry(raw_ostream &OS, Entry *E, unsigned IndentLevel = 0) const; + +protected: + void printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const override; }; /// Collect all pairs of entries from the diff --git a/llvm/include/llvm/Support/Win64EH.h b/llvm/include/llvm/Support/Win64EH.h index 9359fcb4286a..31345beaa66a 100644 --- a/llvm/include/llvm/Support/Win64EH.h +++ b/llvm/include/llvm/Support/Win64EH.h @@ -24,6 +24,9 @@ namespace Win64EH { /// UnwindOpcodes - Enumeration whose values specify a single operation in /// the prolog of a function. enum UnwindOpcodes { + // The following set of unwind opcodes is for x86_64. They are documented at + // https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64. + // Some generic values from this set are used for other architectures too. UOP_PushNonVol = 0, UOP_AllocLarge, UOP_AllocSmall, @@ -57,7 +60,38 @@ enum UnwindOpcodes { UOP_SaveNext, UOP_TrapFrame, UOP_Context, - UOP_ClearUnwoundToCall + UOP_ClearUnwoundToCall, + // The following set of unwind opcodes is for ARM. They are documented at + // https://docs.microsoft.com/en-us/cpp/build/arm-exception-handling + + // Stack allocations use UOP_AllocSmall, UOP_AllocLarge from above, plus + // the following. AllocSmall, AllocLarge and AllocHuge represent a 16 bit + // instruction, while the WideAlloc* opcodes represent a 32 bit instruction. + // Small can represent a stack offset of 0x7f*4 (252) bytes, Medium can + // represent up to 0x3ff*4 (4092) bytes, Large up to 0xffff*4 (262140) bytes, + // and Huge up to 0xffffff*4 (67108860) bytes. 
+ UOP_AllocHuge, + UOP_WideAllocMedium, + UOP_WideAllocLarge, + UOP_WideAllocHuge, + + UOP_WideSaveRegMask, + UOP_SaveSP, + UOP_SaveRegsR4R7LR, + UOP_WideSaveRegsR4R11LR, + UOP_SaveFRegD8D15, + UOP_SaveRegMask, + UOP_SaveLR, + UOP_SaveFRegD0D15, + UOP_SaveFRegD16D31, + // Using UOP_Nop from above + UOP_WideNop, + // Using UOP_End from above + UOP_EndNop, + UOP_WideEndNop, + // A custom unspecified opcode, consisting of one or more bytes. This + // allows producing opcodes in the implementation defined/reserved range. + UOP_Custom, }; /// UnwindCode - This union describes a single operation in a function prolog, diff --git a/llvm/include/llvm/Support/WithColor.h b/llvm/include/llvm/Support/WithColor.h index e772ea667f4f..b249f34da1fa 100644 --- a/llvm/include/llvm/Support/WithColor.h +++ b/llvm/include/llvm/Support/WithColor.h @@ -51,10 +51,9 @@ enum class ColorMode { /// An RAII object that temporarily switches an output stream to a specific /// color. class WithColor { - raw_ostream &OS; - ColorMode Mode; - public: + using AutoDetectFunctionType = bool (*)(const raw_ostream &OS); + /// To be used like this: WithColor(OS, HighlightColor::String) << "text"; /// @param OS The output stream /// @param S Symbolic name for syntax element to color @@ -132,6 +131,19 @@ public: /// Implement default handling for Warning. /// Print "warning: " to stderr. static void defaultWarningHandler(Error Warning); + + /// Retrieve the default color auto detection function. + static AutoDetectFunctionType defaultAutoDetectFunction(); + + /// Change the global auto detection function. + static void + setAutoDetectFunction(AutoDetectFunctionType NewAutoDetectFunction); + +private: + raw_ostream &OS; + ColorMode Mode; + + static AutoDetectFunctionType AutoDetectFunction; }; } // end namespace llvm diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index aca717a9f6cb..169b8e97986e 100644 --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -120,8 +120,6 @@ enum attributeBits { ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ - ENUM_ENTRY(IC_64BIT_VEX_OPSIZE, 4, "requires 64-bit mode and VEX") \ - ENUM_ENTRY(IC_64BIT_VEX_OPSIZE_ADSIZE, 5, "requires 64-bit mode, VEX, and AdSize")\ ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def index 4443d822d3e8..58fa3b3842e7 100644 --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -211,47 +211,47 @@ X86_FEATURE (LVI_LOAD_HARDENING, "lvi-load-hardening") #undef X86_FEATURE #ifndef CPU_SPECIFIC -#define CPU_SPECIFIC(NAME, MANGLING, FEATURES) +#define CPU_SPECIFIC(NAME, TUNE_NAME, MANGLING, FEATURES) #endif #ifndef CPU_SPECIFIC_ALIAS -#define CPU_SPECIFIC_ALIAS(NEW_NAME, NAME) +#define CPU_SPECIFIC_ALIAS(NEW_NAME, TUNE_NAME, NAME) #endif -CPU_SPECIFIC("generic", 'A', "") -CPU_SPECIFIC("pentium", 'B', "") -CPU_SPECIFIC("pentium_pro", 'C', "+cmov") -CPU_SPECIFIC("pentium_mmx", 'D', "+mmx") -CPU_SPECIFIC("pentium_ii", 'E', "+cmov,+mmx") -CPU_SPECIFIC("pentium_iii", 'H', "+cmov,+mmx,+sse") 
-CPU_SPECIFIC_ALIAS("pentium_iii_no_xmm_regs", "pentium_iii") -CPU_SPECIFIC("pentium_4", 'J', "+cmov,+mmx,+sse,+sse2") -CPU_SPECIFIC("pentium_m", 'K', "+cmov,+mmx,+sse,+sse2") -CPU_SPECIFIC("pentium_4_sse3", 'L', "+cmov,+mmx,+sse,+sse2,+sse3") -CPU_SPECIFIC("core_2_duo_ssse3", 'M', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3") -CPU_SPECIFIC("core_2_duo_sse4_1", 'N', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1") -CPU_SPECIFIC("atom", 'O', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+movbe") -CPU_SPECIFIC("atom_sse4_2", 'c', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") -CPU_SPECIFIC("core_i7_sse4_2", 'P', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") -CPU_SPECIFIC("core_aes_pclmulqdq", 'Q', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") -CPU_SPECIFIC("atom_sse4_2_movbe", 'd', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt") -CPU_SPECIFIC("goldmont", 'i', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt") -CPU_SPECIFIC("sandybridge", 'R', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+avx") -CPU_SPECIFIC_ALIAS("core_2nd_gen_avx", "sandybridge") -CPU_SPECIFIC("ivybridge", 'S', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+f16c,+avx") -CPU_SPECIFIC_ALIAS("core_3rd_gen_avx", "ivybridge") -CPU_SPECIFIC("haswell", 'V', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2") -CPU_SPECIFIC_ALIAS("core_4th_gen_avx", "haswell") -CPU_SPECIFIC("core_4th_gen_avx_tsx", 'W', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2") -CPU_SPECIFIC("broadwell", 'X', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx") -CPU_SPECIFIC_ALIAS("core_5th_gen_avx", "broadwell") -CPU_SPECIFIC("core_5th_gen_avx_tsx", 'Y', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx") -CPU_SPECIFIC("knl", 'Z', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd") -CPU_SPECIFIC_ALIAS("mic_avx512", "knl") -CPU_SPECIFIC("skylake", 'b', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx,+mpx") -CPU_SPECIFIC( "skylake_avx512", 'a', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512cd,+avx512bw,+avx512vl,+clwb") -CPU_SPECIFIC("cannonlake", 'e', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512ifma,+avx512cd,+avx512bw,+avx512vl,+avx512vbmi") -CPU_SPECIFIC("knm", 'j', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd,+avx5124fmaps,+avx5124vnniw,+avx512vpopcntdq") +CPU_SPECIFIC("generic", "generic", 'A', "") +CPU_SPECIFIC("pentium", "pentium", 'B', "") +CPU_SPECIFIC("pentium_pro", "pentiumpro", 'C', "+cmov") +CPU_SPECIFIC("pentium_mmx", "pentium-mmx", 'D', "+mmx") +CPU_SPECIFIC("pentium_ii", "pentium2", 'E', "+cmov,+mmx") +CPU_SPECIFIC("pentium_iii", "pentium3", 'H', "+cmov,+mmx,+sse") +CPU_SPECIFIC_ALIAS("pentium_iii_no_xmm_regs", "pentium3", "pentium_iii") +CPU_SPECIFIC("pentium_4", "pentium4", 'J', "+cmov,+mmx,+sse,+sse2") +CPU_SPECIFIC("pentium_m", "pentium-m", 'K', "+cmov,+mmx,+sse,+sse2") +CPU_SPECIFIC("pentium_4_sse3", "prescott", 'L', 
"+cmov,+mmx,+sse,+sse2,+sse3") +CPU_SPECIFIC("core_2_duo_ssse3", "core2", 'M', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3") +CPU_SPECIFIC("core_2_duo_sse4_1", "penryn", 'N', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1") +CPU_SPECIFIC("atom", "atom", 'O', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+movbe") +CPU_SPECIFIC("atom_sse4_2", "silvermont", 'c', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") +CPU_SPECIFIC("core_i7_sse4_2", "nehalem", 'P', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") +CPU_SPECIFIC("core_aes_pclmulqdq", "westmere", 'Q', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt") +CPU_SPECIFIC("atom_sse4_2_movbe", "silvermont", 'd', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt") +CPU_SPECIFIC("goldmont", "goldmont", 'i', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt") +CPU_SPECIFIC("sandybridge", "sandybridge", 'R', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+avx") +CPU_SPECIFIC_ALIAS("core_2nd_gen_avx", "sandybridge", "sandybridge") +CPU_SPECIFIC("ivybridge", "ivybridge", 'S', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+f16c,+avx") +CPU_SPECIFIC_ALIAS("core_3rd_gen_avx", "ivybridge", "ivybridge") +CPU_SPECIFIC("haswell", "haswell", 'V', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2") +CPU_SPECIFIC_ALIAS("core_4th_gen_avx", "haswell", "haswell") +CPU_SPECIFIC("core_4th_gen_avx_tsx", "haswell", 'W', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2") +CPU_SPECIFIC("broadwell", "broadwell", 'X', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx") +CPU_SPECIFIC_ALIAS("core_5th_gen_avx", "broadwell", "broadwell") +CPU_SPECIFIC("core_5th_gen_avx_tsx", "broadwell", 'Y', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx") +CPU_SPECIFIC("knl", "knl", 'Z', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd") +CPU_SPECIFIC_ALIAS("mic_avx512", "knl", "knl") +CPU_SPECIFIC("skylake", "skylake", 'b', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx,+mpx") +CPU_SPECIFIC( "skylake_avx512", "skylake-avx512", 'a', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512cd,+avx512bw,+avx512vl,+clwb") +CPU_SPECIFIC("cannonlake", "cannonlake", 'e', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512ifma,+avx512cd,+avx512bw,+avx512vl,+avx512vbmi") +CPU_SPECIFIC("knm", "knm", 'j', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd,+avx5124fmaps,+avx5124vnniw,+avx512vpopcntdq") #undef CPU_SPECIFIC_ALIAS #undef CPU_SPECIFIC diff --git a/llvm/include/llvm/Support/YAMLParser.h b/llvm/include/llvm/Support/YAMLParser.h index a4b2ab5e49ec..231cc1d28c9a 100644 --- a/llvm/include/llvm/Support/YAMLParser.h +++ b/llvm/include/llvm/Support/YAMLParser.h @@ -11,7 +11,6 @@ // See http://www.yaml.org/spec/1.2/spec.html for the full standard. // // This currently does not implement the following: -// * Multi-line literal folding. // * Tag resolution. // * UTF-16. 
// * BOMs anywhere other than the first Unicode scalar value in the file. diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 7ad73543fc6e..8ade9b15642b 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -24,7 +24,6 @@ #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" #include -#include #include #include #include @@ -63,6 +62,7 @@ struct MappingTraits { // static void mapping(IO &io, T &fields); // Optionally may provide: // static std::string validate(IO &io, T &fields); + // static void enumInput(IO &io, T &value); // // The optional flow flag will cause generated YAML to use a flow mapping // (e.g. { a: 0, b: 1 }): @@ -446,6 +446,31 @@ template struct has_MappingValidateTraits { static bool const value = (sizeof(test>(nullptr)) == 1); }; +// Test if MappingContextTraits::enumInput() is defined on type T. +template struct has_MappingEnumInputTraits { + using Signature_validate = void (*)(class IO &, T &); + + template + static char test(SameType *); + + template static double test(...); + + static bool const value = + (sizeof(test>(nullptr)) == 1); +}; + +// Test if MappingTraits::enumInput() is defined on type T. +template struct has_MappingEnumInputTraits { + using Signature_validate = void (*)(class IO &, T &); + + template + static char test(SameType *); + + template static double test(...); + + static bool const value = (sizeof(test>(nullptr)) == 1); +}; + // Test if SequenceTraits is defined on type T. template struct has_SequenceMethodTraits @@ -537,9 +562,8 @@ template struct has_PolymorphicTraits { }; inline bool isNumeric(StringRef S) { - const static auto skipDigits = [](StringRef Input) { - return Input.drop_front( - std::min(Input.find_first_not_of("0123456789"), Input.size())); + const auto skipDigits = [](StringRef Input) { + return Input.ltrim("0123456789"); }; // Make S.front() and S.drop_front().front() (if S.front() is [+-]) calls @@ -666,8 +690,7 @@ inline QuotingType needsQuotes(StringRef S) { // 7.3.3 Plain Style // Plain scalars must not begin with most indicators, as this would cause // ambiguity with other YAML constructs. 
- static constexpr char Indicators[] = R"(-?:\,[]{}#&*!|>'"%@`)"; - if (S.find_first_of(Indicators) == 0) + if (std::strchr(R"(-?:\,[]{}#&*!|>'"%@`)", S[0]) != nullptr) MaxQuotingNeeded = QuotingType::Single; for (unsigned char C : S) { @@ -1061,9 +1084,30 @@ yamlize(IO &io, T &Val, bool, Context &Ctx) { io.endMapping(); } +template +std::enable_if_t::value, bool> +yamlizeMappingEnumInput(IO &io, T &Val) { + return false; +} + +template +std::enable_if_t::value, bool> +yamlizeMappingEnumInput(IO &io, T &Val) { + if (io.outputting()) + return false; + + io.beginEnumScalar(); + MappingTraits::enumInput(io, Val); + bool Matched = !io.matchEnumFallback(); + io.endEnumScalar(); + return Matched; +} + template std::enable_if_t::value, void> yamlize(IO &io, T &Val, bool, Context &Ctx) { + if (yamlizeMappingEnumInput(io, Val)) + return; if (has_FlowTraits>::value) { io.beginFlowMapping(); detail::doMapping(io, Val, Ctx); @@ -1624,14 +1668,13 @@ template void IO::processKeyWithDefault(const char *Key, Optional &Val, const Optional &DefaultValue, bool Required, Context &Ctx) { - assert(DefaultValue.hasValue() == false && - "Optional shouldn't have a value!"); + assert(!DefaultValue && "Optional shouldn't have a value!"); void *SaveInfo; bool UseDefault = true; - const bool sameAsDefault = outputting() && !Val.hasValue(); - if (!outputting() && !Val.hasValue()) + const bool sameAsDefault = outputting() && !Val; + if (!outputting() && !Val) Val = T(); - if (Val.hasValue() && + if (Val && this->preflightKey(Key, Required, sameAsDefault, UseDefault, SaveInfo)) { // When reading an Optional key from a YAML description, we allow the @@ -1648,7 +1691,7 @@ void IO::processKeyWithDefault(const char *Key, Optional &Val, if (IsNone) Val = DefaultValue; else - yamlize(*this, Val.getValue(), Required, Ctx); + yamlize(*this, *Val, Required, Ctx); this->postflightKey(SaveInfo); } else { if (UseDefault) diff --git a/llvm/include/llvm/Support/circular_raw_ostream.h b/llvm/include/llvm/Support/circular_raw_ostream.h index d2f01ea6a7f2..17fb8fa0e476 100644 --- a/llvm/include/llvm/Support/circular_raw_ostream.h +++ b/llvm/include/llvm/Support/circular_raw_ostream.h @@ -38,7 +38,7 @@ namespace llvm { /// TheStream - The real stream we output to. We set it to be /// unbuffered, since we're already doing our own buffering. /// - raw_ostream *TheStream; + raw_ostream *TheStream = nullptr; /// OwnsStream - Are we responsible for managing the underlying /// stream? @@ -51,7 +51,7 @@ namespace llvm { /// BufferArray - The actual buffer storage. /// - char *BufferArray; + char *BufferArray = nullptr; /// Cur - Pointer to the current output point in BufferArray. /// @@ -60,7 +60,7 @@ namespace llvm { /// Filled - Indicate whether the buffer has been completely /// filled. This helps avoid garbage output. /// - bool Filled; + bool Filled = false; /// Banner - A pointer to a banner to print before dumping the /// log. 
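The MappingTraits<T>::enumInput hook introduced above gives a mapping-typed value an alternate scalar spelling on input: yamlize tries enumInput first and only falls through to the usual mapping() path when no enumCase matches (output always goes through mapping()). A minimal sketch of a specialization using the hook; the Access type and its YAML spellings are hypothetical, not part of this patch, and the enumCase-based body is an assumption inferred from the beginEnumScalar/matchEnumFallback machinery:

  #include "llvm/Support/YAMLTraits.h"
  #include <string>

  struct Access {
    std::string Kind = "custom";
    unsigned Flags = 0;
    // enumCase() compares candidate values when outputting, so the type
    // needs an equality operator to compile.
    bool operator==(const Access &O) const {
      return Kind == O.Kind && Flags == O.Flags;
    }
  };

  namespace llvm {
  namespace yaml {
  template <> struct MappingTraits<Access> {
    // Tried first on input: lets a bare scalar like `readonly` stand in
    // for the whole mapping. Unmatched scalars fall back to mapping().
    static void enumInput(IO &io, Access &A) {
      io.enumCase(A, "readonly", Access{"readonly", 1});
      io.enumCase(A, "readwrite", Access{"readwrite", 3});
    }
    static void mapping(IO &io, Access &A) {
      io.mapRequired("kind", A.Kind);
      io.mapOptional("flags", A.Flags);
    }
  };
  } // end namespace yaml
  } // end namespace llvm

With traits along these lines, both `access: readonly` and `access: { kind: custom, flags: 6 }` would parse into the same field.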
@@ -106,9 +106,8 @@ namespace llvm { /// circular_raw_ostream(raw_ostream &Stream, const char *Header, size_t BuffSize = 0, bool Owns = REFERENCE_ONLY) - : raw_ostream(/*unbuffered*/ true), TheStream(nullptr), - OwnsStream(Owns), BufferSize(BuffSize), BufferArray(nullptr), - Filled(false), Banner(Header) { + : raw_ostream(/*unbuffered*/ true), OwnsStream(Owns), + BufferSize(BuffSize), Banner(Header) { if (BufferSize != 0) BufferArray = new char[BufferSize]; Cur = BufferArray; diff --git a/llvm/include/llvm/Support/raw_sha1_ostream.h b/llvm/include/llvm/Support/raw_sha1_ostream.h index 3991691796b5..299f6e6b5e88 100644 --- a/llvm/include/llvm/Support/raw_sha1_ostream.h +++ b/llvm/include/llvm/Support/raw_sha1_ostream.h @@ -30,7 +30,7 @@ class raw_sha1_ostream : public raw_ostream { public: /// Return the current SHA1 hash for the content of the stream - StringRef sha1() { + std::array sha1() { flush(); return State.result(); } diff --git a/llvm/include/llvm/TableGen/Parser.h b/llvm/include/llvm/TableGen/Parser.h new file mode 100644 index 000000000000..411259e4033c --- /dev/null +++ b/llvm/include/llvm/TableGen/Parser.h @@ -0,0 +1,34 @@ +//===- llvm/TableGen/Parser.h - tblgen parser entry point -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares an entry point into the tablegen parser for use by tools. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TABLEGEN_PARSER_H +#define LLVM_TABLEGEN_PARSER_H + +#include "llvm/ADT/STLExtras.h" +#include +#include + +namespace llvm { +class RecordKeeper; +class SourceMgr; + +/// Parse the TableGen file defined within the main buffer of the given +/// SourceMgr. On success, populates the provided RecordKeeper with the parsed +/// records and returns false. On failure, returns true. +/// +/// NOTE: TableGen currently relies on global state within a given parser +/// invocation, so this function is not thread-safe. +bool TableGenParseFile(SourceMgr &InputSrcMgr, RecordKeeper &Records); + +} // end namespace llvm + +#endif // LLVM_TABLEGEN_PARSER_H diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index 1157487eced3..44daad976c12 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -28,7 +28,6 @@ #include "llvm/Support/Timer.h" #include "llvm/Support/TrailingObjects.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include @@ -40,7 +39,7 @@ namespace llvm { namespace detail { -struct RecordContext; +struct RecordKeeperImpl; } // namespace detail class ListRecTy; @@ -70,15 +69,20 @@ public: private: RecTyKind Kind; + /// The RecordKeeper that uniqued this Type. + RecordKeeper &RK; /// ListRecTy of the list that has elements of this type. ListRecTy *ListTy = nullptr; public: - RecTy(RecTyKind K) : Kind(K) {} + RecTy(RecTyKind K, RecordKeeper &RK) : Kind(K), RK(RK) {} virtual ~RecTy() = default; RecTyKind getRecTyKind() const { return Kind; } + /// Return the RecordKeeper that uniqued this Type. 
+ RecordKeeper &getRecordKeeper() const { return RK; } + virtual std::string getAsString() const = 0; void print(raw_ostream &OS) const { OS << getAsString(); } void dump() const; @@ -102,16 +106,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const RecTy &Ty) { /// 'bit' - Represent a single bit class BitRecTy : public RecTy { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; - BitRecTy() : RecTy(BitRecTyKind) {} + BitRecTy(RecordKeeper &RK) : RecTy(BitRecTyKind, RK) {} public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == BitRecTyKind; } - static BitRecTy *get(); + static BitRecTy *get(RecordKeeper &RK); std::string getAsString() const override { return "bit"; } @@ -122,14 +126,15 @@ public: class BitsRecTy : public RecTy { unsigned Size; - explicit BitsRecTy(unsigned Sz) : RecTy(BitsRecTyKind), Size(Sz) {} + explicit BitsRecTy(RecordKeeper &RK, unsigned Sz) + : RecTy(BitsRecTyKind, RK), Size(Sz) {} public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == BitsRecTyKind; } - static BitsRecTy *get(unsigned Sz); + static BitsRecTy *get(RecordKeeper &RK, unsigned Sz); unsigned getNumBits() const { return Size; } @@ -142,16 +147,16 @@ public: /// 'int' - Represent an integer value of no particular size class IntRecTy : public RecTy { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; - IntRecTy() : RecTy(IntRecTyKind) {} + IntRecTy(RecordKeeper &RK) : RecTy(IntRecTyKind, RK) {} public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == IntRecTyKind; } - static IntRecTy *get(); + static IntRecTy *get(RecordKeeper &RK); std::string getAsString() const override { return "int"; } @@ -160,16 +165,16 @@ public: /// 'string' - Represent an string value class StringRecTy : public RecTy { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; - StringRecTy() : RecTy(StringRecTyKind) {} + StringRecTy(RecordKeeper &RK) : RecTy(StringRecTyKind, RK) {} public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == StringRecTyKind; } - static StringRecTy *get(); + static StringRecTy *get(RecordKeeper &RK); std::string getAsString() const override; @@ -183,7 +188,8 @@ class ListRecTy : public RecTy { RecTy *ElementTy; - explicit ListRecTy(RecTy *T) : RecTy(ListRecTyKind), ElementTy(T) {} + explicit ListRecTy(RecTy *T) + : RecTy(ListRecTyKind, T->getRecordKeeper()), ElementTy(T) {} public: static bool classof(const RecTy *RT) { @@ -202,16 +208,16 @@ public: /// 'dag' - Represent a dag fragment class DagRecTy : public RecTy { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; - DagRecTy() : RecTy(DagRecTyKind) {} + DagRecTy(RecordKeeper &RK) : RecTy(DagRecTyKind, RK) {} public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == DagRecTyKind; } - static DagRecTy *get(); + static DagRecTy *get(RecordKeeper &RK); std::string getAsString() const override; }; @@ -223,12 +229,12 @@ public: class RecordRecTy final : public RecTy, public FoldingSetNode, public TrailingObjects { friend class Record; - friend detail::RecordContext; + friend detail::RecordKeeperImpl; unsigned NumClasses; - explicit RecordRecTy(unsigned Num) - : RecTy(RecordRecTyKind), NumClasses(Num) {} + explicit RecordRecTy(RecordKeeper &RK, unsigned Num) + : RecTy(RecordRecTyKind, RK), NumClasses(Num) {} public: RecordRecTy(const RecordRecTy &) = delete; @@ -242,7 +248,8 @@ public: } /// Get the record type with the given non-redundant list of superclasses. 
- static RecordRecTy *get(ArrayRef Classes); + static RecordRecTy *get(RecordKeeper &RK, ArrayRef Classes); + static RecordRecTy *get(Record *Class); void Profile(FoldingSetNodeID &ID) const; @@ -304,6 +311,7 @@ protected: IK_CondOpInit, IK_FoldOpInit, IK_IsAOpInit, + IK_ExistsOpInit, IK_AnonymousNameInit, IK_StringInit, IK_VarInit, @@ -327,6 +335,9 @@ public: /// Get the kind (type) of the value. InitKind getKind() const { return Kind; } + /// Get the record keeper that initialized this Init. + RecordKeeper &getRecordKeeper() const; + protected: explicit Init(InitKind K, uint8_t Opc = 0) : Kind(K), Opc(Opc) {} @@ -426,6 +437,9 @@ public: /// Get the type of the Init as a RecTy. RecTy *getType() const { return ValueTy; } + /// Get the record keeper that initialized this Init. + RecordKeeper &getRecordKeeper() const { return ValueTy->getRecordKeeper(); } + Init *getCastTo(RecTy *Ty) const override; Init *convertInitializerTo(RecTy *Ty) const override; @@ -440,9 +454,12 @@ public: /// '?' - Represents an uninitialized value. class UnsetInit : public Init { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; - UnsetInit() : Init(IK_UnsetInit) {} + /// The record keeper that initialized this Init. + RecordKeeper &RK; + + UnsetInit(RecordKeeper &RK) : Init(IK_UnsetInit), RK(RK) {} public: UnsetInit(const UnsetInit &) = delete; @@ -453,7 +470,10 @@ public: } /// Get the singleton unset Init. - static UnsetInit *get(); + static UnsetInit *get(RecordKeeper &RK); + + /// Get the record keeper that initialized this Init. + RecordKeeper &getRecordKeeper() const { return RK; } Init *getCastTo(RecTy *Ty) const override; Init *convertInitializerTo(RecTy *Ty) const override; @@ -473,7 +493,7 @@ public: /// 'true'/'false' - Represent a concrete initializer for a bit. 
class BitInit final : public TypedInit { - friend detail::RecordContext; + friend detail::RecordKeeperImpl; bool Value; @@ -487,7 +507,7 @@ public: return I->getKind() == IK_BitInit; } - static BitInit *get(bool V); + static BitInit *get(RecordKeeper &RK, bool V); bool getValue() const { return Value; } @@ -508,8 +528,8 @@ class BitsInit final : public TypedInit, public FoldingSetNode, public TrailingObjects { unsigned NumBits; - BitsInit(unsigned N) - : TypedInit(IK_BitsInit, BitsRecTy::get(N)), NumBits(N) {} + BitsInit(RecordKeeper &RK, unsigned N) + : TypedInit(IK_BitsInit, BitsRecTy::get(RK, N)), NumBits(N) {} public: BitsInit(const BitsInit &) = delete; @@ -522,7 +542,7 @@ public: return I->getKind() == IK_BitsInit; } - static BitsInit *get(ArrayRef Range); + static BitsInit *get(RecordKeeper &RK, ArrayRef Range); void Profile(FoldingSetNodeID &ID) const; @@ -558,8 +578,8 @@ public: class IntInit : public TypedInit { int64_t Value; - explicit IntInit(int64_t V) - : TypedInit(IK_IntInit, IntRecTy::get()), Value(V) {} + explicit IntInit(RecordKeeper &RK, int64_t V) + : TypedInit(IK_IntInit, IntRecTy::get(RK)), Value(V) {} public: IntInit(const IntInit &) = delete; @@ -569,7 +589,7 @@ public: return I->getKind() == IK_IntInit; } - static IntInit *get(int64_t V); + static IntInit *get(RecordKeeper &RK, int64_t V); int64_t getValue() const { return Value; } @@ -580,7 +600,7 @@ public: std::string getAsString() const override; Init *getBit(unsigned Bit) const override { - return BitInit::get((Value & (1ULL << Bit)) != 0); + return BitInit::get(getRecordKeeper(), (Value & (1ULL << Bit)) != 0); } }; @@ -588,8 +608,8 @@ public: class AnonymousNameInit : public TypedInit { unsigned Value; - explicit AnonymousNameInit(unsigned V) - : TypedInit(IK_AnonymousNameInit, StringRecTy::get()), Value(V) {} + explicit AnonymousNameInit(RecordKeeper &RK, unsigned V) + : TypedInit(IK_AnonymousNameInit, StringRecTy::get(RK)), Value(V) {} public: AnonymousNameInit(const AnonymousNameInit &) = delete; @@ -599,7 +619,7 @@ public: return I->getKind() == IK_AnonymousNameInit; } - static AnonymousNameInit *get(unsigned); + static AnonymousNameInit *get(RecordKeeper &RK, unsigned); unsigned getValue() const { return Value; } @@ -626,8 +646,8 @@ private: StringRef Value; StringFormat Format; - explicit StringInit(StringRef V, StringFormat Fmt) - : TypedInit(IK_StringInit, StringRecTy::get()), Value(V), Format(Fmt) {} + explicit StringInit(RecordKeeper &RK, StringRef V, StringFormat Fmt) + : TypedInit(IK_StringInit, StringRecTy::get(RK)), Value(V), Format(Fmt) {} public: StringInit(const StringInit &) = delete; @@ -637,7 +657,8 @@ public: return I->getKind() == IK_StringInit; } - static StringInit *get(StringRef, StringFormat Fmt = SF_String); + static StringInit *get(RecordKeeper &RK, StringRef, + StringFormat Fmt = SF_String); static StringFormat determineFormat(StringFormat Fmt1, StringFormat Fmt2) { return (Fmt1 == SF_Code || Fmt2 == SF_Code) ? 
SF_Code : SF_String; @@ -678,7 +699,7 @@ public: private: explicit ListInit(unsigned N, RecTy *EltTy) - : TypedInit(IK_ListInit, ListRecTy::get(EltTy)), NumValues(N) {} + : TypedInit(IK_ListInit, ListRecTy::get(EltTy)), NumValues(N) {} public: ListInit(const ListInit &) = delete; @@ -1049,8 +1070,8 @@ private: Init *Expr; IsAOpInit(RecTy *CheckType, Init *Expr) - : TypedInit(IK_IsAOpInit, IntRecTy::get()), CheckType(CheckType), - Expr(Expr) {} + : TypedInit(IK_IsAOpInit, IntRecTy::get(CheckType->getRecordKeeper())), + CheckType(CheckType), Expr(Expr) {} public: IsAOpInit(const IsAOpInit &) = delete; @@ -1075,6 +1096,40 @@ public: std::string getAsString() const override; }; +/// !exists(expr) - Dynamically determine if a record of `type` named +/// `expr` exists. +class ExistsOpInit : public TypedInit, public FoldingSetNode { +private: + RecTy *CheckType; + Init *Expr; + + ExistsOpInit(RecTy *CheckType, Init *Expr) + : TypedInit(IK_ExistsOpInit, IntRecTy::get(CheckType->getRecordKeeper())), + CheckType(CheckType), Expr(Expr) {} + +public: + ExistsOpInit(const ExistsOpInit &) = delete; + ExistsOpInit &operator=(const ExistsOpInit &) = delete; + + static bool classof(const Init *I) { return I->getKind() == IK_ExistsOpInit; } + + static ExistsOpInit *get(RecTy *CheckType, Init *Expr); + + void Profile(FoldingSetNodeID &ID) const; + + // Fold - If possible, fold this to a simpler init. Return this if not + // possible to fold. + Init *Fold(Record *CurRec, bool IsFinal = false) const; + + bool isComplete() const override { return false; } + + Init *resolveReferences(Resolver &R) const override; + + Init *getBit(unsigned Bit) const override; + + std::string getAsString() const override; +}; + /// 'Opcode' - Represent a reference to an entire variable object. class VarInit : public TypedInit { Init *VarName; @@ -1118,7 +1173,8 @@ class VarBitInit final : public TypedInit { unsigned Bit; VarBitInit(TypedInit *T, unsigned B) - : TypedInit(IK_VarBitInit, BitRecTy::get()), TI(T), Bit(B) { + : TypedInit(IK_VarBitInit, BitRecTy::get(T->getRecordKeeper())), TI(T), + Bit(B) { assert(T->getType() && (isa(T->getType()) || (isa(T->getType()) && @@ -1223,8 +1279,7 @@ class VarDefInit final : public TypedInit, public FoldingSetNode, DefInit *Def = nullptr; // after instantiation unsigned NumArgs; - explicit VarDefInit(Record *Class, unsigned N) - : TypedInit(IK_VarDefInit, RecordRecTy::get(Class)), Class(Class), NumArgs(N) {} + explicit VarDefInit(Record *Class, unsigned N); DefInit *instantiate(); @@ -1321,8 +1376,8 @@ class DagInit final : public TypedInit, public FoldingSetNode, unsigned NumArgNames; DagInit(Init *V, StringInit *VN, unsigned NumArgs, unsigned NumArgNames) - : TypedInit(IK_DagInit, DagRecTy::get()), Val(V), ValName(VN), - NumArgs(NumArgs), NumArgNames(NumArgNames) {} + : TypedInit(IK_DagInit, DagRecTy::get(V->getRecordKeeper())), Val(V), + ValName(VN), NumArgs(NumArgs), NumArgNames(NumArgNames) {} size_t numTrailingObjects(OverloadToken) const { return NumArgs; } @@ -1427,6 +1482,9 @@ public: RecordVal(Init *N, RecTy *T, FieldKind K); RecordVal(Init *N, SMLoc Loc, RecTy *T, FieldKind K); + /// Get the record keeper used to unique this value. + RecordKeeper &getRecordKeeper() const { return Name->getRecordKeeper(); } + /// Get the name of the field as a StringRef. 
StringRef getName() const; @@ -1527,13 +1585,14 @@ public: explicit Record(Init *N, ArrayRef locs, RecordKeeper &records, bool Anonymous = false, bool Class = false) : Name(N), Locs(locs.begin(), locs.end()), TrackedRecords(records), - ID(getNewUID()), IsAnonymous(Anonymous), IsClass(Class) { + ID(getNewUID(N->getRecordKeeper())), IsAnonymous(Anonymous), + IsClass(Class) { checkName(); } explicit Record(StringRef N, ArrayRef locs, RecordKeeper &records, bool Class = false) - : Record(StringInit::get(N), locs, records, false, Class) {} + : Record(StringInit::get(records, N), locs, records, false, Class) {} // When copy-constructing a Record, we must still guarantee a globally unique // ID number. Don't copy CorrespondingDefInit either, since it's owned by the @@ -1542,9 +1601,10 @@ public: : Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs), Values(O.Values), Assertions(O.Assertions), SuperClasses(O.SuperClasses), TrackedRecords(O.TrackedRecords), - ID(getNewUID()), IsAnonymous(O.IsAnonymous), IsClass(O.IsClass) {} + ID(getNewUID(O.getRecords())), IsAnonymous(O.IsAnonymous), + IsClass(O.IsClass) {} - static unsigned getNewUID(); + static unsigned getNewUID(RecordKeeper &RK); unsigned getID() const { return ID; } @@ -1600,7 +1660,7 @@ public: } const RecordVal *getValue(StringRef Name) const { - return getValue(StringInit::get(Name)); + return getValue(StringInit::get(getRecords(), Name)); } RecordVal *getValue(const Init *Name) { @@ -1631,7 +1691,7 @@ public: } void removeValue(StringRef Name) { - removeValue(StringInit::get(Name)); + removeValue(StringInit::get(getRecords(), Name)); } void addAssertion(SMLoc Loc, Init *Condition, Init *Message) { @@ -1671,11 +1731,11 @@ public: SuperClasses.push_back(std::make_pair(R, Range)); } - /// If there are any field references that refer to fields - /// that have been filled in, we can propagate the values now. + /// If there are any field references that refer to fields that have been + /// filled in, we can propagate the values now. /// - /// This is a final resolve: any error messages, e.g. due to undefined - /// !cast references, are generated now. + /// This is a final resolve: any error messages, e.g. due to undefined !cast + /// references, are generated now. void resolveReferences(Init *NewName = nullptr); /// Apply the resolver to the name of the record as well as to the @@ -1699,11 +1759,11 @@ public: // High-level methods useful to tablegen back-ends // - ///Return the source location for the named field. + /// Return the source location for the named field. SMLoc getFieldLoc(StringRef FieldName) const; - /// Return the initializer for a value with the specified name, - /// or throw an exception if the field does not exist. + /// Return the initializer for a value with the specified name, or throw an + /// exception if the field does not exist. Init *getValueInit(StringRef FieldName) const; /// Return true if the named field is unset. @@ -1711,96 +1771,85 @@ public: return isa(getValueInit(FieldName)); } - /// This method looks up the specified field and returns - /// its value as a string, throwing an exception if the field does not exist - /// or if the value is not a string. + /// This method looks up the specified field and returns its value as a + /// string, throwing an exception if the field does not exist or if the value + /// is not a string. 
StringRef getValueAsString(StringRef FieldName) const; - /// This method looks up the specified field and returns - /// its value as a string, throwing an exception if the field if the value is - /// not a string and llvm::Optional() if the field does not exist. + /// This method looks up the specified field and returns its value as a + /// string, throwing an exception if the value is not a string and + /// llvm::Optional() if the field does not exist. llvm::Optional getValueAsOptionalString(StringRef FieldName) const; - /// This method looks up the specified field and returns - /// its value as a BitsInit, throwing an exception if the field does not exist - /// or if the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// BitsInit, throwing an exception if the field does not exist or if the + /// value is not the right type. BitsInit *getValueAsBitsInit(StringRef FieldName) const; - /// This method looks up the specified field and returns - /// its value as a ListInit, throwing an exception if the field does not exist - /// or if the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// ListInit, throwing an exception if the field does not exist or if the + /// value is not the right type. ListInit *getValueAsListInit(StringRef FieldName) const; - /// This method looks up the specified field and - /// returns its value as a vector of records, throwing an exception if the - /// field does not exist or if the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// vector of records, throwing an exception if the field does not exist or + /// if the value is not the right type. std::vector getValueAsListOfDefs(StringRef FieldName) const; - /// This method looks up the specified field and - /// returns its value as a vector of integers, throwing an exception if the - /// field does not exist or if the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// vector of integers, throwing an exception if the field does not exist or + /// if the value is not the right type. std::vector getValueAsListOfInts(StringRef FieldName) const; - /// This method looks up the specified field and - /// returns its value as a vector of strings, throwing an exception if the - /// field does not exist or if the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// vector of strings, throwing an exception if the field does not exist or + /// if the value is not the right type. std::vector getValueAsListOfStrings(StringRef FieldName) const; - /// This method looks up the specified field and returns its - /// value as a Record, throwing an exception if the field does not exist or if - /// the value is not the right type. + /// This method looks up the specified field and returns its value as a + /// Record, throwing an exception if the field does not exist or if the value + /// is not the right type. Record *getValueAsDef(StringRef FieldName) const; /// This method looks up the specified field and returns its value as a - /// Record, returning null if the field exists but is "uninitialized" - /// (i.e. set to `?`), and throwing an exception if the field does not - /// exist or if its value is not the right type. + /// Record, returning null if the field exists but is "uninitialized" (i.e. 
+ /// set to `?`), and throwing an exception if the field does not exist or if + /// its value is not the right type. Record *getValueAsOptionalDef(StringRef FieldName) const; - /// This method looks up the specified field and returns its - /// value as a bit, throwing an exception if the field does not exist or if - /// the value is not the right type. + /// This method looks up the specified field and returns its value as a bit, + /// throwing an exception if the field does not exist or if the value is not + /// the right type. bool getValueAsBit(StringRef FieldName) const; - /// This method looks up the specified field and - /// returns its value as a bit. If the field is unset, sets Unset to true and - /// returns false. + /// This method looks up the specified field and returns its value as a bit. + /// If the field is unset, sets Unset to true and returns false. bool getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const; - /// This method looks up the specified field and returns its - /// value as an int64_t, throwing an exception if the field does not exist or - /// if the value is not the right type. + /// This method looks up the specified field and returns its value as an + /// int64_t, throwing an exception if the field does not exist or if the + /// value is not the right type. int64_t getValueAsInt(StringRef FieldName) const; - /// This method looks up the specified field and returns its - /// value as an Dag, throwing an exception if the field does not exist or if - /// the value is not the right type. + /// This method looks up the specified field and returns its value as an Dag, + /// throwing an exception if the field does not exist or if the value is not + /// the right type. DagInit *getValueAsDag(StringRef FieldName) const; }; raw_ostream &operator<<(raw_ostream &OS, const Record &R); class RecordKeeper { - friend class RecordRecTy; - using RecordMap = std::map, std::less<>>; using GlobalMap = std::map>; - std::string InputFilename; - RecordMap Classes, Defs; - mutable StringMap> ClassRecordsMap; - FoldingSet RecordTypePool; - std::map> ExtraGlobals; - unsigned AnonCounter = 0; +public: + RecordKeeper(); + ~RecordKeeper(); - // These members are for the phase timing feature. We need a timer group, - // the last timer started, and a flag to say whether the last timer - // is the special "backend overall timer." - TimerGroup *TimingGroup = nullptr; - Timer *LastTimer = nullptr; - bool BackendTimer = false; + /// Return the internal implementation of the RecordKeeper. + detail::RecordKeeperImpl &getImpl() { return *Impl; } -public: /// Get the main TableGen input file's name. const std::string getInputFilename() const { return InputFilename; } @@ -1896,7 +1945,33 @@ public: std::vector getAllDerivedDefinitions( ArrayRef ClassNames) const; + /// Get all the concrete records that inherit from specified class, if the + /// class is defined. Returns an empty vector if the class is not defined. + std::vector + getAllDerivedDefinitionsIfDefined(StringRef ClassName) const; + void dump() const; + +private: + RecordKeeper(RecordKeeper &&) = delete; + RecordKeeper(const RecordKeeper &) = delete; + RecordKeeper &operator=(RecordKeeper &&) = delete; + RecordKeeper &operator=(const RecordKeeper &) = delete; + + std::string InputFilename; + RecordMap Classes, Defs; + mutable StringMap> ClassRecordsMap; + GlobalMap ExtraGlobals; + + // These members are for the phase timing feature. 
We need a timer group, + // the last timer started, and a flag to say whether the last timer + // is the special "backend overall timer." + TimerGroup *TimingGroup = nullptr; + Timer *LastTimer = nullptr; + bool BackendTimer = false; + + /// The internal uniquer implementation of the RecordKeeper. + std::unique_ptr Impl; }; /// Sorting predicate to sort record pointers by name. diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h index f84889392d13..7a6d91061701 100644 --- a/llvm/include/llvm/Target/CGPassBuilderOption.h +++ b/llvm/include/llvm/Target/CGPassBuilderOption.h @@ -42,6 +42,7 @@ struct CGPassBuilderOption { bool DisableMergeICmps = false; bool DisablePartialLibcallInlining = false; bool DisableConstantHoisting = false; + bool DisableSelectOptimize = true; bool PrintISelInput = false; bool PrintGCInfo = false; bool RequiresCodeGenSCCOrder = false; diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 2af20ab6a53f..3e2f18b57d1e 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -745,6 +745,13 @@ def G_FCANONICALIZE : GenericInstruction { let hasSideEffects = false; } +// Generic opcode equivalent to the llvm.is_fpclass intrinsic. +def G_IS_FPCLASS: GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src, unknown:$test, unknown:$fpsem); + let hasSideEffects = false; +} + // FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two // values. // @@ -965,6 +972,12 @@ def G_FNEARBYINT : GenericInstruction { //------------------------------------------------------------------------------ // Opcodes for LLVM Intrinsics //------------------------------------------------------------------------------ +def G_INTRINSIC_FPTRUNC_ROUND : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, i32imm:$round_mode); + let hasSideEffects = false; +} + def G_INTRINSIC_TRUNC : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 4859cf6b57b7..89f08d200021 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -118,6 +118,7 @@ def int64_matchinfo: GIDefMatchData<"int64_t">; def apint_matchinfo : GIDefMatchData<"APInt">; def build_fn_matchinfo : GIDefMatchData<"std::function">; +def unsigned_matchinfo: GIDefMatchData<"unsigned">; def copy_prop : GICombineRule< (defs root:$d), @@ -234,6 +235,12 @@ def binop_left_undef_to_zero: GICombineRule< [{ return Helper.matchOperandIsUndef(*${root}, 1); }]), (apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>; +def binop_right_undef_to_undef: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root, + [{ return Helper.matchOperandIsUndef(*${root}, 2); }]), + (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + // Instructions where if any source operand is undef, the instruction can be // replaced with undef. 
def propagate_undef_any_op: GICombineRule< @@ -283,6 +290,13 @@ def select_constant_cmp: GICombineRule< (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }]) >; +def select_to_logical : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) +>; + // Fold x op 0 -> x def right_identity_zero: GICombineRule< (defs root:$root), @@ -323,6 +337,26 @@ def urem_pow2_to_mask : GICombineRule< (apply [{ Helper.applySimplifyURemByPow2(*${root}); }]) >; +// Push a binary operator through a select on constants. +// +// binop (select cond, K0, K1), K2 -> +// select cond, (binop K0, K2), (binop K1, K2) + +// Every binary operator that has constant folding. We currently do +// not have constant folding for G_FPOW, G_FMAXNUM_IEEE or +// G_FMINNUM_IEEE. +def fold_binop_into_select : GICombineRule< + (defs root:$root, unsigned_matchinfo:$select_op_no), + (match (wip_match_opcode + G_ADD, G_SUB, G_PTR_ADD, G_AND, G_OR, G_XOR, + G_SDIV, G_SREM, G_UDIV, G_UREM, G_LSHR, G_ASHR, G_SHL, + G_SMIN, G_SMAX, G_UMIN, G_UMAX, + G_FMUL, G_FADD, G_FSUB, G_FDIV, G_FREM, + G_FMINNUM, G_FMAXNUM, G_FMINIMUM, G_FMAXIMUM):$root, + [{ return Helper.matchFoldBinOpIntoSelect(*${root}, ${select_op_no}); }]), + (apply [{ return Helper.applyFoldBinOpIntoSelect(*${root}, ${select_op_no}); }]) +>; + // Transform d = [su]div(x, y) and r = [su]rem(x, y) - > d, r = [su]divrem(x, y) def div_rem_to_divrem_matchdata : GIDefMatchData<"MachineInstr *">; def div_rem_to_divrem : GICombineRule< @@ -753,6 +787,18 @@ def mulo_by_2: GICombineRule< [{ return Helper.matchMulOBy2(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>; +def mulo_by_0: GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_UMULO, G_SMULO):$root, + [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + +def addo_by_0: GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_UADDO, G_SADDO):$root, + [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def mulh_to_lshr : GICombineRule< (defs root:$root), (match (wip_match_opcode G_UMULH):$root, @@ -845,10 +891,26 @@ def combine_fsub_fpext_fneg_fmul_to_fmad_or_fma: GICombineRule< *${root}, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; +def combine_minmax_nan: GICombineRule< + (defs root:$root, unsigned_matchinfo:$info), + (match (wip_match_opcode G_FMINNUM, G_FMAXNUM, G_FMINIMUM, G_FMAXIMUM):$root, + [{ return Helper.matchCombineFMinMaxNaN(*${root}, ${info}); }]), + (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${info}); }])>; + +// Transform (add x, (sub y, x)) -> y +// Transform (add (sub y, x), x) -> y +def add_sub_reg: GICombineRule < + (defs root:$root, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_ADD):$root, + [{ return Helper.matchAddSubSameReg(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, + ${matchinfo}); }])>; + // FIXME: These should use the custom predicate feature once it lands. 
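The fold_binop_into_select rule above pushes a binary operator through a select whose arms are constants, so that both arms constant-fold and only the select survives. An illustrative source-level analogue of the rewrite (plain C++ with made-up constants; the combine itself operates on generic MachineIR, not on source code):

  // binop (select cond, K0, K1), K2
  int before(bool Cond) {
    const int K0 = 3, K1 = 7, K2 = 5;
    return (Cond ? K0 : K1) + K2;
  }

  // select cond, (binop K0, K2), (binop K1, K2), with both arms folded
  int after(bool Cond) { return Cond ? 8 : 12; }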
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, binop_left_undef_to_zero, + binop_right_undef_to_undef, propagate_undef_any_op, propagate_undef_all_ops, propagate_undef_shuffle_mask, @@ -859,10 +921,12 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, binop_same_val, binop_left_to_zero, binop_right_to_zero, p2i_to_i2p, i2p_to_p2i, anyext_trunc_fold, - fneg_fneg_fold, right_identity_one]>; + fneg_fneg_fold, right_identity_one, + add_sub_reg]>; def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p, - overlapping_and, mulo_by_2]>; + overlapping_and, mulo_by_2, mulo_by_0, + addo_by_0, combine_minmax_nan]>; def known_bits_simplifications : GICombineGroup<[ redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask, @@ -873,7 +937,8 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend, def phi_combines : GICombineGroup<[extend_through_phis]>; -def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; +def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp, + select_to_logical]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, mul_by_neg_one]>; @@ -900,7 +965,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, truncstore_merge, div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract, constant_fold, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, - and_or_disjoint_mask, fma_combines]>; + and_or_disjoint_mask, fma_combines, fold_binop_into_select]>; // A combine group used for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index d8faa63ee877..c5b2462dc868 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -279,6 +279,8 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment, // heuristic. Classes with higher priority values are assigned first. This is // useful as it is sometimes beneficial to assign registers to highly // constrained classes first. The value has to be in the range [0,63]. + // Values >= 32 should be used with care since they may overlap with other + // fields in the allocator's priority heuristics. int AllocationPriority = 0; // Generate register pressure set for this register class and any class @@ -389,6 +391,14 @@ class RegisterTuples<list<SubRegIndex> Indices, list<dag> Regs, list<string> RegAsmNames = RegNames; } +// RegisterCategory - This class is a list of RegisterClasses that belong to a +// general category --- e.g. "general purpose" or "fixed" registers. This is +// useful for identifying registers in a generic way instead of having +// information about a specific target's registers. +class RegisterCategory<list<RegisterClass> classes> { + // Classes - A list of register classes that fall within the category. + list<RegisterClass> Classes = classes; +} //===----------------------------------------------------------------------===// // DwarfRegNum - This class provides a mapping of the llvm register enumeration @@ -560,6 +570,9 @@ class Instruction : InstructionEncoding { bit isPseudo = false; // Is this instruction a pseudo-instruction? // If so, won't have encoding information for // the [MC]CodeEmitter stuff. + bit isMeta = false; // Is this instruction a meta-instruction?
+ // If so, won't produce any output in the form of + // executable instructions. bit isExtractSubreg = false; // Is this instruction a kind of extract subreg? // If so, make sure to override // TargetInstrInfo::getExtractSubregLikeInputs. @@ -748,6 +761,33 @@ def ins; /// of operands. def variable_ops; +/// variable-length instruction encoding utilities. /// The `ascend` operator should be used like this: /// (ascend 0b0010, 0b1101) /// This represents a sequence of encoding fragments placed from LSB to MSB. /// Thus, in this case the final encoding will be 0b1101_0010. /// The arguments for `ascend` can either be `bits` or another DAG. def ascend; /// In addition, we can use `descend` to describe an encoding that places /// its arguments (i.e. encoding fragments) from MSB to LSB. For instance: /// (descend 0b0010, 0b1101) /// This results in an encoding of 0b0010_1101. def descend; /// The `operand` operator should be used like this: /// (operand "$src", 4) /// This represents a 4-bit encoding for an instruction operand named `$src`. def operand; /// Similar to `operand`, we can reference only part of the operand's encoding: /// (slice "$src", 6, 8) /// (slice "$src", 8, 6) /// Both DAGs represent bits 6 to 8 (total of 3 bits) in the encoding of operand /// `$src`. def slice; /// You can use `encoder` to specify a custom encoder function for a specific /// `operand` or `slice` directive. For example: /// (operand "$src", 4, (encoder "encodeMyImm")) /// (slice "$src", 8, 6, (encoder "encodeMyReg")) def encoder; /// PointerLikeRegClass - Values that are designed to have pointer width are /// derived from this. TableGen treats the register class as having a symbolic @@ -1064,6 +1104,7 @@ def CFI_INSTRUCTION : StandardPseudoInstruction { let hasCtrlDep = true; let hasSideEffects = false; let isNotDuplicable = true; + let isMeta = true; } def EH_LABEL : StandardPseudoInstruction { let OutOperandList = (outs); @@ -1072,6 +1113,7 @@ def EH_LABEL : StandardPseudoInstruction { let hasCtrlDep = true; let hasSideEffects = false; let isNotDuplicable = true; + let isMeta = true; } def GC_LABEL : StandardPseudoInstruction { let OutOperandList = (outs); @@ -1080,6 +1122,7 @@ def GC_LABEL : StandardPseudoInstruction { let hasCtrlDep = true; let hasSideEffects = false; let isNotDuplicable = true; + let isMeta = true; } def ANNOTATION_LABEL : StandardPseudoInstruction { let OutOperandList = (outs); @@ -1094,6 +1137,7 @@ def KILL : StandardPseudoInstruction { let InOperandList = (ins variable_ops); let AsmString = ""; let hasSideEffects = false; + let isMeta = true; } def EXTRACT_SUBREG : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); @@ -1115,6 +1159,7 @@ def IMPLICIT_DEF : StandardPseudoInstruction { let hasSideEffects = false; let isReMaterializable = true; let isAsCheapAsAMove = true; + let isMeta = true; } def SUBREG_TO_REG : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); @@ -1134,30 +1179,35 @@ def DBG_VALUE : StandardPseudoInstruction { let InOperandList = (ins variable_ops); let AsmString = "DBG_VALUE"; let hasSideEffects = false; + let isMeta = true; } def DBG_VALUE_LIST : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); let AsmString = "DBG_VALUE_LIST"; let hasSideEffects = 0; + let isMeta = true; } def DBG_INSTR_REF : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); let AsmString = "DBG_INSTR_REF"; let hasSideEffects = false; +
let isMeta = true; } def DBG_PHI : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); let AsmString = "DBG_PHI"; let hasSideEffects = 0; + let isMeta = true; } def DBG_LABEL : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins unknown:$label); let AsmString = "DBG_LABEL"; let hasSideEffects = false; + let isMeta = true; } def REG_SEQUENCE : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); @@ -1185,18 +1235,21 @@ def LIFETIME_START : StandardPseudoInstruction { let InOperandList = (ins i32imm:$id); let AsmString = "LIFETIME_START"; let hasSideEffects = false; + let isMeta = true; } def LIFETIME_END : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins i32imm:$id); let AsmString = "LIFETIME_END"; let hasSideEffects = false; + let isMeta = true; } def PSEUDO_PROBE : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins i64imm:$guid, i64imm:$index, i8imm:$type, i32imm:$attr); let AsmString = "PSEUDO_PROBE"; let hasSideEffects = 1; + let isMeta = true; } def ARITH_FENCE : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); @@ -1204,6 +1257,7 @@ def ARITH_FENCE : StandardPseudoInstruction { let AsmString = ""; let hasSideEffects = false; let Constraints = "$src = $dst"; + let isMeta = true; } def STACKMAP : StandardPseudoInstruction { diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h index 392ee4334cb5..0c09cfe68478 100644 --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -20,6 +20,7 @@ namespace llvm { +struct Align; class Constant; class DataLayout; class Function; @@ -276,7 +277,7 @@ public: } /// If supported, return the function entry point symbol. - /// Otherwise, returns nulltpr. + /// Otherwise, returns nullptr. /// Func must be a function or an alias which has a function as base object. virtual MCSymbol *getFunctionEntryPointSymbol(const GlobalValue *Func, const TargetMachine &TM) const { diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index acfb265a9ff9..bf37ad7010ec 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -18,7 +18,6 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" @@ -30,8 +29,6 @@ namespace llvm { class AAManager; -template -class PassManager; using ModulePassManager = PassManager; class Function; @@ -225,7 +222,10 @@ public: /// Returns the code model. The choices are small, kernel, medium, large, and /// target default. - CodeModel::Model getCodeModel() const; + CodeModel::Model getCodeModel() const { return CMModel; } + + /// Set the code model. + void setCodeModel(CodeModel::Model CM) { CMModel = CM; } bool isPositionIndependent() const; @@ -260,6 +260,8 @@ public: Options.SupportsDebugEntryValues = Enable; } + void setCFIFixup(bool Enable) { Options.EnableCFIFixup = Enable; } + bool getAIXExtendedAltivecABI() const { return Options.EnableAIXExtendedAltivecABI; } @@ -337,13 +339,13 @@ public: /// This is used to construct the new pass manager's target IR analysis pass, /// set up appropriately for this target machine. Even the old pass manager /// uses this to answer queries about the IR. 
- TargetIRAnalysis getTargetIRAnalysis(); + TargetIRAnalysis getTargetIRAnalysis() const; /// Return a TargetTransformInfo for a given function. /// /// The returned TargetTransformInfo is specialized to the subtarget /// corresponding to \p F. - virtual TargetTransformInfo getTargetTransformInfo(const Function &F); + virtual TargetTransformInfo getTargetTransformInfo(const Function &F) const; /// Allow the target to modify the pass manager, e.g. by calling /// PassManagerBuilder::addExtension. @@ -398,6 +400,12 @@ public: virtual unsigned getSjLjDataSize() const { return DefaultSjLjDataSize; } static std::pair parseBinutilsVersion(StringRef Version); + + /// getAddressSpaceForPseudoSourceKind - Given the kind of memory + /// (e.g. stack) the target returns the corresponding address space. + virtual unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const { + return 0; + } }; /// This class describes a target machine that is implemented with the LLVM @@ -417,7 +425,7 @@ public: /// /// The TTI returned uses the common code generator to answer queries about /// the IR. - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; /// Create a pass configuration object to be used by addPassToEmitX methods /// for generating a pipeline of CodeGen passes. diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index a636c4822832..6083d18d96f7 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -130,19 +130,21 @@ namespace llvm { HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false), GuaranteedTailCallOpt(false), StackSymbolOrdering(true), EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false), - DisableIntegratedAS(false), RelaxELFRelocations(false), - FunctionSections(false), DataSections(false), - IgnoreXCOFFVisibility(false), XCOFFTracebackTable(true), - UniqueSectionNames(true), UniqueBasicBlockSectionNames(false), - TrapUnreachable(false), NoTrapAfterNoreturn(false), TLSSize(0), - EmulatedTLS(false), ExplicitEmulatedTLS(false), EnableIPRA(false), + LowerGlobalDtorsViaCxaAtExit(false), DisableIntegratedAS(false), + RelaxELFRelocations(false), FunctionSections(false), + DataSections(false), IgnoreXCOFFVisibility(false), + XCOFFTracebackTable(true), UniqueSectionNames(true), + UniqueBasicBlockSectionNames(false), TrapUnreachable(false), + NoTrapAfterNoreturn(false), TLSSize(0), EmulatedTLS(false), + ExplicitEmulatedTLS(false), EnableIPRA(false), EmitStackSizeSection(false), EnableMachineOutliner(false), EnableMachineFunctionSplitter(false), SupportsDefaultOutlining(false), EmitAddrsig(false), EmitCallSiteInfo(false), SupportsDebugEntryValues(false), EnableDebugEntryValues(false), ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false), XRayOmitFunctionIndex(false), DebugStrictDwarf(false), - Hotpatch(false), + Hotpatch(false), PPCGenScalarMASSEntries(false), JMCInstrument(false), + EnableCFIFixup(false), MisExpect(false), FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {} /// DisableFramePointerElim - This returns true if frame pointer elimination @@ -245,6 +247,10 @@ namespace llvm { /// constructors. unsigned UseInitArray : 1; + /// Use __cxa_atexit to register global destructors; determines how + /// llvm.global_dtors is lowered. + unsigned LowerGlobalDtorsViaCxaAtExit : 1; + /// Disable the integrated assembler. 
unsigned DisableIntegratedAS : 1; @@ -345,6 +351,19 @@ namespace llvm { /// Emit the hotpatch flag in CodeView debug. unsigned Hotpatch : 1; + /// Enables scalar MASS conversions + unsigned PPCGenScalarMASSEntries : 1; + + /// Enable JustMyCode instrumentation. + unsigned JMCInstrument : 1; + + /// Enable the CFIFixup pass. + unsigned EnableCFIFixup : 1; + + /// When set to true, enable MisExpect Diagnostics + /// By default, it is set to false + unsigned MisExpect : 1; + /// Name of the stack usage file (i.e., .su file) if user passes /// -fstack-usage. If empty, it can be implied that -fstack-usage is not /// passed on the command line. diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index d8ef7c49a5f9..47b686aca7b5 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -238,6 +238,16 @@ def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load SDTCisSameNumEltsAs<0, 3> ]>; +def SDTMaskedGather : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>, SDTCisPtrTy<3>, SDTCisVec<4>, + SDTCisSameNumEltsAs<0, 2>, SDTCisSameNumEltsAs<0, 4> +]>; + +def SDTMaskedScatter : SDTypeProfile<0, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, + SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3> +]>; + def SDTVecShuffle : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>; @@ -365,6 +375,10 @@ def mul : SDNode<"ISD::MUL" , SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; def mulhs : SDNode<"ISD::MULHS" , SDTIntBinOp, [SDNPCommutative]>; def mulhu : SDNode<"ISD::MULHU" , SDTIntBinOp, [SDNPCommutative]>; +def avgfloors : SDNode<"ISD::AVGFLOORS" , SDTIntBinOp, [SDNPCommutative]>; +def avgflooru : SDNode<"ISD::AVGFLOORU" , SDTIntBinOp, [SDNPCommutative]>; +def avgceils : SDNode<"ISD::AVGCEILS" , SDTIntBinOp, [SDNPCommutative]>; +def avgceilu : SDNode<"ISD::AVGCEILU" , SDTIntBinOp, [SDNPCommutative]>; def abds : SDNode<"ISD::ABDS" , SDTIntBinOp, [SDNPCommutative]>; def abdu : SDNode<"ISD::ABDU" , SDTIntBinOp, [SDNPCommutative]>; def smullohi : SDNode<"ISD::SMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>; @@ -648,6 +662,12 @@ def masked_st : SDNode<"ISD::MSTORE", SDTMaskedStore, def masked_ld : SDNode<"ISD::MLOAD", SDTMaskedLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). def ld : SDNode<"ISD::LOAD" , SDTLoad, @@ -1624,6 +1644,124 @@ def atomic_load_64 : let MemoryVT = i64; } +def nonext_masked_gather : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +// Any extending masked gather fragments. 
+def ext_masked_gather_i8 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::EXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def ext_masked_gather_i16 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::EXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def ext_masked_gather_i32 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::EXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +// Sign extending masked gather fragments. +def sext_masked_gather_i8 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::SEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def sext_masked_gather_i16 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::SEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def sext_masked_gather_i32 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::SEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +// Zero extending masked gather fragments. +def zext_masked_gather_i8 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::ZEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def zext_masked_gather_i16 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::ZEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def zext_masked_gather_i32 : + PatFrag<(ops node:$def, node:$pred, node:$ptr, node:$idx), + (masked_gather node:$def, node:$pred, node:$ptr, node:$idx), [{ + auto MGN = cast(N); + return MGN->getExtensionType() == ISD::ZEXTLOAD && + MGN->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +// Any/Zero extending masked gather fragments. 
+def azext_masked_gather_i8 : + PatFrags<(ops node:$def, node:$pred, node:$ptr, node:$idx), + [(ext_masked_gather_i8 node:$def, node:$pred, node:$ptr, node:$idx), + (zext_masked_gather_i8 node:$def, node:$pred, node:$ptr, node:$idx)]>; +def azext_masked_gather_i16 : + PatFrags<(ops node:$def, node:$pred, node:$ptr, node:$idx), + [(ext_masked_gather_i16 node:$def, node:$pred, node:$ptr, node:$idx), + (zext_masked_gather_i16 node:$def, node:$pred, node:$ptr, node:$idx)]>; +def azext_masked_gather_i32 : + PatFrags<(ops node:$def, node:$pred, node:$ptr, node:$idx), + [(ext_masked_gather_i32 node:$def, node:$pred, node:$ptr, node:$idx), + (zext_masked_gather_i32 node:$def, node:$pred, node:$ptr, node:$idx)]>; + +def nontrunc_masked_scatter : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (masked_scatter node:$val, node:$pred, node:$ptr, node:$idx), [{ + return !cast(N)->isTruncatingStore(); +}]>; + +// Truncating masked scatter fragments. +def trunc_masked_scatter_i8 : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (masked_scatter node:$val, node:$pred, node:$ptr, node:$idx), [{ + auto MSN = cast(N); + return MSN->isTruncatingStore() && + MSN->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def trunc_masked_scatter_i16 : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (masked_scatter node:$val, node:$pred, node:$ptr, node:$idx), [{ + auto MSN = cast(N); + return MSN->isTruncatingStore() && + MSN->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def trunc_masked_scatter_i32 : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (masked_scatter node:$val, node:$pred, node:$ptr, node:$idx), [{ + auto MSN = cast(N); + return MSN->isTruncatingStore() && + MSN->getMemoryVT().getScalarType() == MVT::i32; +}]>; + //===----------------------------------------------------------------------===// // Selection DAG Pattern Support. // diff --git a/llvm/include/llvm/Testing/Support/SupportHelpers.h b/llvm/include/llvm/Testing/Support/SupportHelpers.h index 2419fc95d817..b1c59cf97f7f 100644 --- a/llvm/include/llvm/Testing/Support/SupportHelpers.h +++ b/llvm/include/llvm/Testing/Support/SupportHelpers.h @@ -77,7 +77,7 @@ public: bool MatchAndExplain(const llvm::Optional &Input, testing::MatchResultListener *L) const override { - return Input && ValueMatcher.MatchAndExplain(Input.getValue(), L); + return Input && ValueMatcher.MatchAndExplain(*Input, L); } void DescribeTo(std::ostream *OS) const override { @@ -238,6 +238,12 @@ public: } } + TempFile(const TempFile &) = delete; + TempFile &operator=(const TempFile &) = delete; + + TempFile(TempFile &&) = default; + TempFile &operator=(TempFile &&) = default; + /// The path to the file. 
diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h
index dfc84908bba2..1c25295b299d 100644
--- a/llvm/include/llvm/TextAPI/Symbol.h
+++ b/llvm/include/llvm/TextAPI/Symbol.h
@@ -11,7 +11,6 @@

 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/Target.h"
diff --git a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
index 072ccf7320e8..3931c9c55c07 100644
--- a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
+++ b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
@@ -18,10 +18,12 @@
 #define LLVM_TRANSFORMS_AGGRESSIVEINSTCOMBINE_AGGRESSIVEINSTCOMBINE_H

 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"

 namespace llvm {

+class Function;
+class FunctionPass;
+
 class AggressiveInstCombinePass
     : public PassInfoMixin<AggressiveInstCombinePass> {
diff --git a/llvm/include/llvm/Transforms/Coroutines.h b/llvm/include/llvm/Transforms/Coroutines.h
deleted file mode 100644
index f68ef705fdef..000000000000
--- a/llvm/include/llvm/Transforms/Coroutines.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- Coroutines.h - Coroutine Transformations ----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Declare accessor functions for coroutine lowering passes.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_COROUTINES_H
-#define LLVM_TRANSFORMS_COROUTINES_H
-
-namespace llvm {
-
-class Pass;
-class PassManagerBuilder;
-
-/// Add all coroutine passes to appropriate extension points.
-void addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder);
-
-/// Lower coroutine intrinsics that are not needed by later passes.
-Pass *createCoroEarlyLegacyPass();
-
-/// Split up coroutines into multiple functions driving their state machines.
-Pass *createCoroSplitLegacyPass(bool IsOptimizing = false);
-
-/// Analyze coroutines use sites, devirtualize resume/destroy calls and elide
-/// heap allocation for coroutine frame where possible.
-Pass *createCoroElideLegacyPass();
-
-/// Lower all remaining coroutine intrinsics.
-Pass *createCoroCleanupLegacyPass();
-
-}
-
-#endif
diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h b/llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h
index 7ecdc050335d..3000a38258f4 100644
--- a/llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h
+++ b/llvm/include/llvm/Transforms/Coroutines/CoroCleanup.h
@@ -18,10 +18,10 @@

 namespace llvm {

-class Function;
+class Module;

 struct CoroCleanupPass : PassInfoMixin<CoroCleanupPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
   static bool isRequired() { return true; }
 };
 } // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroConditionalWrapper.h b/llvm/include/llvm/Transforms/Coroutines/CoroConditionalWrapper.h
new file mode 100644
index 000000000000..ea19ec533c4d
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Coroutines/CoroConditionalWrapper.h
@@ -0,0 +1,30 @@
+//===---- CoroConditionalWrapper.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_COROUTINES_COROCONDITIONALWRAPPER_H
+#define LLVM_TRANSFORMS_COROUTINES_COROCONDITIONALWRAPPER_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+
+// Only runs passes in the contained pass manager if the module contains any
+// coroutine intrinsic declarations.
+struct CoroConditionalWrapper : PassInfoMixin<CoroConditionalWrapper> {
+  CoroConditionalWrapper(ModulePassManager &&);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  static bool isRequired() { return true; }
+
+private:
+  ModulePassManager PM;
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_COROUTINES_COROCONDITIONALWRAPPER_H
diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroEarly.h b/llvm/include/llvm/Transforms/Coroutines/CoroEarly.h
index 3f5ec2abd172..d55dcc6dfa6d 100644
--- a/llvm/include/llvm/Transforms/Coroutines/CoroEarly.h
+++ b/llvm/include/llvm/Transforms/Coroutines/CoroEarly.h
@@ -21,10 +21,10 @@

 namespace llvm {

-class Function;
+class Module;

 struct CoroEarlyPass : PassInfoMixin<CoroEarlyPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
   static bool isRequired() { return true; }
 };
 } // end namespace llvm
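// Hedged illustration (not part of the patch): with CoroEarlyPass and
// CoroCleanupPass now running on whole modules, a coroutine sub-pipeline can
// be gated behind CoroConditionalWrapper, roughly:
//
//   ModulePassManager CoroPM;
//   CoroPM.addPass(CoroEarlyPass());
//   CoroPM.addPass(CoroCleanupPass());
//   ModulePassManager MPM;
//   MPM.addPass(CoroConditionalWrapper(std::move(CoroPM)));
//
// The wrapped passes then only run when the module declares any llvm.coro.*
// intrinsics, keeping coroutine lowering cheap for non-coroutine code.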
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index 67b9a93c47b2..6b7d4f4821f0 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -151,13 +151,6 @@ ModulePass *createDeadArgEliminationPass();
 /// bugpoint.
 ModulePass *createDeadArgHackingPass();

-//===----------------------------------------------------------------------===//
-/// createArgumentPromotionPass - This pass promotes "by reference" arguments to
-/// be passed by value if the number of elements passed is smaller or
-/// equal to maxElements (maxElements == 0 means always promote).
-///
-Pass *createArgumentPromotionPass(unsigned maxElements = 3);
-
 //===----------------------------------------------------------------------===//
 /// createOpenMPOptLegacyPass - OpenMP specific optimizations.
 Pass *createOpenMPOptCGSCCLegacyPass();
diff --git a/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h b/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
index 78b2f909f1c9..252cfd4dc5f3 100644
--- a/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
+++ b/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
@@ -15,10 +15,12 @@
 #define LLVM_TRANSFORMS_IPO_ALWAYSINLINER_H

 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"

 namespace llvm {

+class Module;
+class Pass;
+
 /// Inlines functions marked as "always_inline".
 ///
 /// Note that this does not inline call sites marked as always_inline and does
diff --git a/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h b/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
index 225def99678a..3865f098b8de 100644
--- a/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
+++ b/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
@@ -14,7 +14,6 @@
 #include "llvm/IR/PassManager.h"

 namespace llvm {
-class TargetTransformInfo;

 /// Argument promotion pass.
 ///
@@ -25,10 +24,7 @@ class ArgumentPromotionPass : public PassInfoMixin<ArgumentPromotionPass> {
   unsigned MaxElements;

 public:
-  ArgumentPromotionPass(unsigned MaxElements = 3u) : MaxElements(MaxElements) {}
-
-  /// Checks if a type could have padding bytes.
-  static bool isDenselyPacked(Type *type, const DataLayout &DL);
+  ArgumentPromotionPass(unsigned MaxElements = 2u) : MaxElements(MaxElements) {}

   PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
                         LazyCallGraph &CG, CGSCCUpdateResult &UR);
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 7eee16f71d64..17e29695ab73 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -116,15 +116,24 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/AbstractCallSite.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"

+#include
+
 namespace llvm {

+class DataLayout;
+class LLVMContext;
+class Pass;
+template <typename Fn> class function_ref;
 struct AADepGraphNode;
 struct AADepGraph;
 struct Attributor;
@@ -140,6 +149,24 @@ class Function;

 /// Abstract Attribute helper functions.
 namespace AA {
+/// Flags to distinguish intra-procedural queries from *potentially*
+/// inter-procedural queries. Note that information can be valid for both and
+/// therefore both bits might be set.
+enum ValueScope : uint8_t {
+  Intraprocedural = 1,
+  Interprocedural = 2,
+};
+
+struct ValueAndContext : public std::pair<Value *, const Instruction *> {
+  using Base = std::pair<Value *, const Instruction *>;
+  ValueAndContext(const Base &B) : Base(B) {}
+  ValueAndContext(Value &V, const Instruction *CtxI) : Base(&V, CtxI) {}
+  ValueAndContext(Value &V, const Instruction &CtxI) : Base(&V, &CtxI) {}
+
+  Value *getValue() const { return this->first; }
+  const Instruction *getCtxI() const { return this->second; }
+};
+
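// Hedged illustration (not part of the patch): since ValueAndContext derives
// from std::pair and a DenseMapInfo specialization is provided further down
// in this header, a value/context pair can key a DenseMap directly; V and
// CtxI are assumed locals:
//
//   DenseMap<AA::ValueAndContext, Constant *> SimplifiedByContext;
//   AA::ValueAndContext VAC(*V, CtxI);
//   SimplifiedByContext[VAC] = nullptr;   // cache "not simplifiable here"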
 /// Return true if \p I is a `nosync` instruction. Use generic reasoning and
 /// potentially the corresponding AANoSync.
 bool isNoSyncInst(Attributor &A, const Instruction &I,
@@ -147,18 +174,20 @@ bool isNoSyncInst(Attributor &A, const Instruction &I,

 /// Return true if \p V is dynamically unique, that is, there are no two
 /// "instances" of \p V at runtime with different values.
+/// Note: If \p ForAnalysisOnly is set we only check that the Attributor will
+/// never use \p V to represent two "instances" not that \p V could not
+/// technically represent them.
 bool isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA,
-                         const Value &V);
+                         const Value &V, bool ForAnalysisOnly = true);

 /// Return true if \p V is a valid value in \p Scope, that is a constant or an
 /// instruction/argument of \p Scope.
 bool isValidInScope(const Value &V, const Function *Scope);

-/// Return true if \p V is a valid value at position \p CtxI, that is a
-/// constant, an argument of the same function as \p CtxI, or an instruction in
-/// that function that dominates \p CtxI.
-bool isValidAtPosition(const Value &V, const Instruction &CtxI,
-                       InformationCache &InfoCache);
+/// Return true if the value of \p VAC is valid at the position of \p VAC,
+/// that is a constant, an argument of the same function, or an instruction in
+/// that function that dominates the position.
+bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache);

 /// Try to convert \p V to type \p Ty without introducing new instructions. If
 /// this is not possible return `nullptr`. Note: this function basically knows
@@ -192,11 +221,29 @@ bool getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr,
                                  SmallVectorImpl<Value *> &Objects,
                                  const AbstractAttribute &QueryingAA,
                                  const Instruction *CtxI,
-                                 bool Intraprocedural = false);
+                                 bool &UsedAssumedInformation,
+                                 AA::ValueScope VS = Interprocedural);
+
+/// Collect all potential values \p LI could read into \p PotentialValues. That
+/// is, the only values read by \p LI are assumed to be known and all are in
+/// \p PotentialValues. \p PotentialValueOrigins will contain all the
+/// instructions that might have put a potential value into \p PotentialValues.
+/// Dependences onto \p QueryingAA are properly tracked, \p
+/// UsedAssumedInformation will inform the caller if assumed information was
+/// used.
+///
+/// \returns True if the assumed potential copies are all in \p PotentialValues,
+///          false if something went wrong and the copies could not be
+///          determined.
+bool getPotentiallyLoadedValues(
+    Attributor &A, LoadInst &LI, SmallSetVector<Value *, 4> &PotentialValues,
+    SmallSetVector<Instruction *, 4> &PotentialValueOrigins,
+    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation,
+    bool OnlyExact = false);

 /// Collect all potential values of the one stored by \p SI into
 /// \p PotentialCopies. That is, the only copies that were made via the
-/// store are assumed to be known and all in \p PotentialCopies. Dependences
+/// store are assumed to be known and all are in \p PotentialCopies. Dependences
 /// onto \p QueryingAA are properly tracked, \p UsedAssumedInformation will
 /// inform the caller if assumed information was used.
 ///
@@ -205,7 +252,8 @@ bool getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr,
 ///          determined.
 bool getPotentialCopiesOfStoredValue(
     Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
-    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation);
+    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation,
+    bool OnlyExact = false);

 /// Return true if \p IRP is readonly. This will query respective AAs that
 /// deduce the information and introduce dependences for \p QueryingAA.
@@ -237,6 +285,26 @@ bool isPotentiallyReachable(

 } // namespace AA

+template <>
+struct DenseMapInfo<AA::ValueAndContext>
+    : public DenseMapInfo<AA::ValueAndContext::Base> {
+  using Base = DenseMapInfo<AA::ValueAndContext::Base>;
+  static inline AA::ValueAndContext getEmptyKey() {
+    return Base::getEmptyKey();
+  }
+  static inline AA::ValueAndContext getTombstoneKey() {
+    return Base::getTombstoneKey();
+  }
+  static unsigned getHashValue(const AA::ValueAndContext &VAC) {
+    return Base::getHashValue(VAC);
+  }
+
+  static bool isEqual(const AA::ValueAndContext &LHS,
+                      const AA::ValueAndContext &RHS) {
+    return Base::isEqual(LHS, RHS);
+  }
+};
+
 /// The value passed to the line option that defines the maximal initialization
 /// chain length.
 extern unsigned MaxInitializationChainLength;
@@ -1033,6 +1101,10 @@ struct InformationCache {
     return FI.CalledViaMustTail || FI.ContainsMustTailCall;
   }

+  bool isOnlyUsedByAssume(const Instruction &I) const {
+    return AssumeOnlyValues.contains(&I);
+  }
+
   /// Return the analysis result from a pass \p AP for function \p F.
   template <typename AP>
   typename AP::Result *getAnalysisResultForFunction(const Function &F) {
@@ -1125,6 +1197,9 @@ private:
   /// A map with knowledge retained in `llvm.assume` instructions.
   RetainedKnowledgeMap KnowledgeMap;

+  /// A container for all instructions that are only used by `llvm.assume`.
+  SetVector<const Instruction *> AssumeOnlyValues;
+
   /// Getters for analysis.
   AnalysisGetter &AG;

@@ -1143,6 +1218,53 @@ private:
   friend struct Attributor;
 };

+/// Configuration for the Attributor.
+struct AttributorConfig {
+
+  AttributorConfig(CallGraphUpdater &CGUpdater) : CGUpdater(CGUpdater) {}
+
+  /// Is the user of the Attributor a module pass or not. This determines what
+  /// IR we can look at and modify. If it is a module pass we might deduce facts
+  /// outside the initial function set and modify functions outside that set,
+  /// but only as part of the optimization of the functions in the initial
+  /// function set. For CGSCC passes we can look at the IR of the module slice
+  /// but never run any deduction, or perform any modification, outside the
+  /// initial function set (which we assume is the SCC).
+  bool IsModulePass = true;
+
+  /// Flag to determine if we can delete functions or keep dead ones around.
+  bool DeleteFns = true;
+
+  /// Flag to determine if we rewrite function signatures.
+  bool RewriteSignatures = true;
+
+  /// Flag to determine if we want to initialize all default AAs for an internal
+  /// function marked live.
+  /// TODO: This should probably be a callback, or maybe
+  /// identifyDefaultAbstractAttributes should be virtual, something to allow
+  /// customizable lazy initialization for internal functions.
+  bool DefaultInitializeLiveInternals = true;
+
+  /// Helper to update an underlying call graph and to delete functions.
+  CallGraphUpdater &CGUpdater;
+
+  /// If not null, a set limiting the attribute opportunities.
+  DenseSet<const char *> *Allowed = nullptr;
+
+  /// Maximum number of iterations to run until fixpoint.
+  Optional<unsigned> MaxFixpointIterations = None;
+
+  /// A callback function that returns an ORE object from a Function pointer.
+  ///{
+  using OptimizationRemarkGetter =
+      function_ref<OptimizationRemarkEmitter &(Function *)>;
+  OptimizationRemarkGetter OREGetter = nullptr;
+  ///}
+
+  /// The name of the pass running the attributor, used to emit remarks.
+  const char *PassName = nullptr;
+};
+
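// Hedged illustration (not part of the patch): a typical migration from the
// removed multi-argument Attributor constructors to the configuration object
// above; Functions and InfoCache are assumed to be set up as before:
//
//   CallGraphUpdater CGUpdater;
//   AttributorConfig AC(CGUpdater);
//   AC.IsModulePass = true;
//   AC.DeleteFns = false;             // keep dead functions around
//   AC.MaxFixpointIterations = 32;
//   Attributor A(Functions, InfoCache, AC);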
 /// The fixpoint analysis framework that orchestrates the attribute deduction.
 ///
 /// The Attributor provides a general abstract analysis framework (guided
@@ -1172,52 +1294,17 @@ private:
 /// described in the file comment.
 struct Attributor {

-  using OptimizationRemarkGetter =
-      function_ref<OptimizationRemarkEmitter &(Function *)>;
-
   /// Constructor
   ///
   /// \param Functions The set of functions we are deriving attributes for.
   /// \param InfoCache Cache to hold various information accessible for
   ///                  the abstract attributes.
-  /// \param CGUpdater Helper to update an underlying call graph.
-  /// \param Allowed If not null, a set limiting the attribute opportunities.
-  /// \param DeleteFns Whether to delete functions.
-  /// \param RewriteSignatures Whether to rewrite function signatures.
+  /// \param Configuration The Attributor configuration which determines what
+  ///                      generic features to use.
   Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
-             CallGraphUpdater &CGUpdater,
-             DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
-             bool RewriteSignatures = true)
+             AttributorConfig Configuration)
       : Allocator(InfoCache.Allocator), Functions(Functions),
-        InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
-        DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures),
-        MaxFixpointIterations(None), OREGetter(None), PassName("") {}
-
-  /// Constructor
-  ///
-  /// \param Functions The set of functions we are deriving attributes for.
-  /// \param InfoCache Cache to hold various information accessible for
-  ///                  the abstract attributes.
-  /// \param CGUpdater Helper to update an underlying call graph.
-  /// \param Allowed If not null, a set limiting the attribute opportunities.
-  /// \param DeleteFns Whether to delete functions
-  /// \param RewriteSignatures Whether to rewrite function signatures.
-  /// \param MaxFixpointIterations Maximum number of iterations to run until
-  ///                              fixpoint.
-  /// \param OREGetter A callback function that returns an ORE object from a
-  ///                  Function pointer.
-  /// \param PassName  The name of the pass emitting remarks.
-  Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
-             CallGraphUpdater &CGUpdater, DenseSet<const char *> *Allowed,
-             bool DeleteFns, bool RewriteSignatures,
-             Optional<unsigned> MaxFixpointIterations,
-             OptimizationRemarkGetter OREGetter, const char *PassName)
-      : Allocator(InfoCache.Allocator), Functions(Functions),
-        InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
-        DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures),
-        MaxFixpointIterations(MaxFixpointIterations),
-        OREGetter(Optional<OptimizationRemarkGetter>(OREGetter)),
-        PassName(PassName) {}
+        InfoCache(InfoCache), Configuration(Configuration) {}

   ~Attributor();

@@ -1301,11 +1388,15 @@ struct Attributor {
     registerAA(AA);

     // For now we ignore naked and optnone functions.
-    bool Invalidate = Allowed && !Allowed->count(&AAType::ID);
-    const Function *FnScope = IRP.getAnchorScope();
-    if (FnScope)
-      Invalidate |= FnScope->hasFnAttribute(Attribute::Naked) ||
-                    FnScope->hasFnAttribute(Attribute::OptimizeNone);
+    bool Invalidate =
+        Configuration.Allowed && !Configuration.Allowed->count(&AAType::ID);
+    const Function *AnchorFn = IRP.getAnchorScope();
+    if (AnchorFn) {
+      Invalidate |=
+          AnchorFn->hasFnAttribute(Attribute::Naked) ||
+          AnchorFn->hasFnAttribute(Attribute::OptimizeNone) ||
+          (!isModulePass() && !getInfoCache().isInModuleSlice(*AnchorFn));
+    }

     // Avoid too many nested initializations to prevent a stack overflow.
     Invalidate |= InitializationChainLength > MaxInitializationChainLength;
@@ -1325,15 +1416,12 @@ struct Attributor {
       --InitializationChainLength;
     }

-    // Initialize and update is allowed for code outside of the current function
-    // set, but only if it is part of module slice we are allowed to look at.
-    // Only exception is AAIsDeadFunction whose initialization is prevented
-    // directly, since we don't to compute it twice.
-    if (FnScope && !Functions.count(const_cast<Function *>(FnScope))) {
-      if (!getInfoCache().isInModuleSlice(*FnScope)) {
-        AA.getState().indicatePessimisticFixpoint();
-        return AA;
-      }
+    // We update only AAs associated with functions in the Functions set or
+    // call sites of them.
+    if ((AnchorFn && !Functions.count(const_cast<Function *>(AnchorFn))) &&
+        !Functions.count(IRP.getAssociatedFunction())) {
+      AA.getState().indicatePessimisticFixpoint();
+      return AA;
     }

     // If this is queried in the manifest stage, we force the AA to indicate
@@ -1443,10 +1531,7 @@ struct Attributor {
   InformationCache &getInfoCache() { return InfoCache; }

   /// Return true if this is a module pass, false otherwise.
-  bool isModulePass() const {
-    return !Functions.empty() &&
-           Functions.size() == Functions.front()->getParent()->size();
-  }
+  bool isModulePass() const { return Configuration.IsModulePass; }

   /// Return true if we derive attributes for \p Fn
   bool isRunOn(Function &Fn) const {
@@ -1481,7 +1566,8 @@ struct Attributor {
     assert(F.hasLocalLinkage() &&
            "Only local linkage is assumed dead initially.");

-    identifyDefaultAbstractAttributes(const_cast<Function &>(F));
+    if (Configuration.DefaultInitializeLiveInternals)
+      identifyDefaultAbstractAttributes(const_cast<Function &>(F));
   }

   /// Helper function to remove callsite.
@@ -1489,7 +1575,7 @@ struct Attributor {
     if (!CI)
       return;

-    CGUpdater.removeCallSite(*CI);
+    Configuration.CGUpdater.removeCallSite(*CI);
   }

   /// Record that \p U is to be replaced with \p NV after information was
@@ -1505,11 +1591,17 @@ struct Attributor {
     return true;
   }

-  /// Helper function to replace all uses of \p V with \p NV. Return true if
-  /// there is any change. The flag \p ChangeDroppable indicates if dropppable
-  /// uses should be changed too.
-  bool changeValueAfterManifest(Value &V, Value &NV,
-                                bool ChangeDroppable = true) {
+  /// Helper function to replace all uses associated with \p IRP with \p NV.
+  /// Return true if there is any change. The flag \p ChangeDroppable indicates
+  /// if droppable uses should be changed too.
+  bool changeAfterManifest(const IRPosition IRP, Value &NV,
+                           bool ChangeDroppable = true) {
+    if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE_ARGUMENT) {
+      auto *CB = cast<CallBase>(IRP.getCtxI());
+      return changeUseAfterManifest(
+          CB->getArgOperandUse(IRP.getCallSiteArgNo()), NV);
+    }
+    Value &V = IRP.getAssociatedValue();
     auto &Entry = ToBeChangedValues[&V];
     Value *&CurNV = Entry.first;
     if (CurNV && (CurNV->stripPointerCasts() == NV.stripPointerCasts() ||
@@ -1532,7 +1624,7 @@ struct Attributor {
   /// is used, e.g., to replace \p II with a call, after information was
   /// manifested.
   void registerInvokeWithDeadSuccessor(InvokeInst &II) {
-    InvokeWithDeadSuccessor.push_back(&II);
+    InvokeWithDeadSuccessor.insert(&II);
   }

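// Hedged illustration (not part of the patch): using the position-based
// replacement helper shown above; CB and NewV are assumed locals. For a
// call-site argument position only that operand's use is rewritten, while any
// other position falls back to replacing all uses of the associated value:
//
//   IRPosition ArgPos = IRPosition::callsite_argument(*CB, /*ArgNo=*/0);
//   A.changeAfterManifest(ArgPos, *NewV);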
   /// Record that \p I is deleted after information was manifested. This also
@@ -1551,7 +1643,9 @@ struct Attributor {

   /// Record that \p F is deleted after information was manifested.
   void deleteAfterManifest(Function &F) {
-    if (DeleteFns)
+    if (Configuration.DeleteFns)
       ToBeDeletedFunctions.insert(&F);
   }

@@ -1668,6 +1762,7 @@ public:
                         const AbstractAttribute &QueryingAA, const Value &V,
                         bool CheckBBLivenessOnly = false,
                         DepClassTy LivenessDepClass = DepClassTy::OPTIONAL,
+                        bool IgnoreDroppableUses = true,
                         function_ref<bool(const Use &OldU, const Use &NewU)>
                             EquivalentUseCB = nullptr);

@@ -1685,37 +1780,41 @@ public:
   template <typename RemarkKind, typename RemarkCallBack>
   void emitRemark(Instruction *I, StringRef RemarkName,
                   RemarkCallBack &&RemarkCB) const {
-    if (!OREGetter)
+    if (!Configuration.OREGetter)
       return;

     Function *F = I->getFunction();
-    auto &ORE = OREGetter.getValue()(F);
+    auto &ORE = Configuration.OREGetter(F);

     if (RemarkName.startswith("OMP"))
       ORE.emit([&]() {
-        return RemarkCB(RemarkKind(PassName, RemarkName, I))
+        return RemarkCB(RemarkKind(Configuration.PassName, RemarkName, I))
                << " [" << RemarkName << "]";
       });
     else
-      ORE.emit([&]() { return RemarkCB(RemarkKind(PassName, RemarkName, I)); });
+      ORE.emit([&]() {
+        return RemarkCB(RemarkKind(Configuration.PassName, RemarkName, I));
+      });
   }

   /// Emit a remark on a function.
   template <typename RemarkKind, typename RemarkCallBack>
   void emitRemark(Function *F, StringRef RemarkName,
                   RemarkCallBack &&RemarkCB) const {
-    if (!OREGetter)
+    if (!Configuration.OREGetter)
       return;

-    auto &ORE = OREGetter.getValue()(F);
+    auto &ORE = Configuration.OREGetter(F);

     if (RemarkName.startswith("OMP"))
       ORE.emit([&]() {
-        return RemarkCB(RemarkKind(PassName, RemarkName, F))
+        return RemarkCB(RemarkKind(Configuration.PassName, RemarkName, F))
                << " [" << RemarkName << "]";
       });
     else
-      ORE.emit([&]() { return RemarkCB(RemarkKind(PassName, RemarkName, F)); });
+      ORE.emit([&]() {
+        return RemarkCB(RemarkKind(Configuration.PassName, RemarkName, F));
+      });
   }

   /// Helper struct used in the communication between an abstract attribute (AA)
@@ -1824,23 +1923,24 @@ public:
   /// This method will evaluate \p Pred on call sites and return
   /// true if \p Pred holds at every call site. However, this is only possible
   /// if all call sites are known, hence the function has internal linkage.
-  /// If true is returned, \p AllCallSitesKnown is set if all possible call
-  /// sites of the function have been visited.
+  /// If true is returned, \p UsedAssumedInformation is set if assumed
+  /// information was used to skip or simplify potential call sites.
   bool checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
                             const AbstractAttribute &QueryingAA,
-                            bool RequireAllCallSites, bool &AllCallSitesKnown);
+                            bool RequireAllCallSites,
+                            bool &UsedAssumedInformation);

   /// Check \p Pred on all call sites of \p Fn.
   ///
   /// This method will evaluate \p Pred on call sites and return
   /// true if \p Pred holds at every call site. However, this is only possible
   /// if all call sites are known, hence the function has internal linkage.
-  /// If true is returned, \p AllCallSitesKnown is set if all possible call
-  /// sites of the function have been visited.
+  /// If true is returned, \p UsedAssumedInformation is set if assumed
+  /// information was used to skip or simplify potential call sites.
   bool checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
                             const Function &Fn, bool RequireAllCallSites,
                             const AbstractAttribute *QueryingAA,
-                            bool &AllCallSitesKnown);
+                            bool &UsedAssumedInformation);

   /// Check \p Pred on all values potentially returned by \p F.
   ///
@@ -1859,6 +1959,19 @@ public:
   bool checkForAllReturnedValues(function_ref<bool(Value &)> Pred,
                                  const AbstractAttribute &QueryingAA);

+  /// Check \p Pred on all instructions in \p Fn with an opcode present in
+  /// \p Opcodes.
+  ///
+  /// This method will evaluate \p Pred on all instructions with an opcode
+  /// present in \p Opcode and return true if \p Pred holds on all of them.
+  bool checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+                               const Function *Fn,
+                               const AbstractAttribute &QueryingAA,
+                               const ArrayRef<unsigned> &Opcodes,
+                               bool &UsedAssumedInformation,
+                               bool CheckBBLivenessOnly = false,
+                               bool CheckPotentiallyDead = false);
+
   /// Check \p Pred on all instructions with an opcode present in \p Opcodes.
   ///
   /// This method will evaluate \p Pred on all instructions with an opcode
@@ -1987,7 +2100,7 @@ private:
   /// (\see registerFunctionSignatureRewrite) and return Changed if the module
   /// was altered.
   ChangeStatus
-  rewriteFunctionSignatures(SmallPtrSetImpl<Function *> &ModifiedFns);
+  rewriteFunctionSignatures(SmallSetVector<Function *, 8> &ModifiedFns);

   /// Check if the Attribute \p AA should be seeded.
   /// See getOrCreateAAFor.
@@ -2011,15 +2124,12 @@ private:
   /// The information cache that holds pre-processed (LLVM-IR) information.
   InformationCache &InfoCache;

-  /// Helper to update an underlying call graph.
-  CallGraphUpdater &CGUpdater;
-
   /// Abstract Attribute dependency graph
   AADepGraph DG;

   /// Set of functions for which we modified the content such that it might
   /// impact the call graph.
-  SmallPtrSet<Function *, 8> CGModifiedFunctions;
+  SmallSetVector<Function *, 8> CGModifiedFunctions;

   /// Information about a dependence. If FromAA is changed ToAA needs to be
   /// updated as well.
@@ -2039,34 +2149,22 @@ private:
   using DependenceVector = SmallVector<DepInfo, 8>;
   SmallVector<DependenceVector *, 16> DependenceStack;

-  /// If not null, a set limiting the attribute opportunities.
-  const DenseSet<const char *> *Allowed;
-
-  /// Whether to delete functions.
-  const bool DeleteFns;
-
-  /// Whether to rewrite signatures.
-  const bool RewriteSignatures;
-
-  /// Maximum number of fixedpoint iterations.
-  Optional<unsigned> MaxFixpointIterations;
-
   /// A set to remember the functions we already assume to be live and visited.
   DenseSet<const Function *> VisitedFunctions;

   /// Uses we replace with a new value after manifest is done. We will remove
   /// then trivially dead instructions as well.
-  DenseMap<Use *, Value *> ToBeChangedUses;
+  SmallMapVector<Use *, Value *, 32> ToBeChangedUses;

   /// Values we replace with a new value after manifest is done. We will remove
   /// then trivially dead instructions as well.
-  DenseMap<Value *, std::pair<Value *, bool>> ToBeChangedValues;
+  SmallMapVector<Value *, std::pair<Value *, bool>, 32> ToBeChangedValues;

   /// Instructions we replace with `unreachable` insts after manifest is done.
-  SmallDenseSet<WeakVH, 16> ToBeChangedToUnreachableInsts;
+  SmallSetVector<WeakVH, 16> ToBeChangedToUnreachableInsts;

   /// Invoke instructions with at least a single dead successor block.
-  SmallVector<WeakVH, 16> InvokeWithDeadSuccessor;
+  SmallSetVector<WeakVH, 16> InvokeWithDeadSuccessor;

   /// A flag that indicates which stage of the process we are in. Initially, the
   /// phase is SEEDING. Phase is changed in `Attributor::run()`
@@ -2083,21 +2181,18 @@ private:
   /// Functions, blocks, and instructions we delete after manifest is done.
   ///
   ///{
-  SmallPtrSet<Function *, 8> ToBeDeletedFunctions;
-  SmallPtrSet<BasicBlock *, 8> ToBeDeletedBlocks;
   SmallPtrSet<BasicBlock *, 8> ManifestAddedBlocks;
-  SmallDenseSet<WeakVH, 8> ToBeDeletedInsts;
+  SmallSetVector<Function *, 8> ToBeDeletedFunctions;
+  SmallSetVector<BasicBlock *, 8> ToBeDeletedBlocks;
+  SmallSetVector<WeakVH, 8> ToBeDeletedInsts;
   ///}

-  /// Callback to get an OptimizationRemarkEmitter from a Function *.
-  Optional<OptimizationRemarkGetter> OREGetter;
-
   /// Container with all the query AAs that requested an update via
   /// registerForUpdate.
   SmallSetVector<AbstractAttribute *, 16> QueryAAsAwaitingUpdate;

-  /// The name of the pass to emit remarks for.
-  const char *PassName = "";
+  /// User provided configuration for this Attributor instance.
+  const AttributorConfig Configuration;

   friend AADepGraph;
   friend AttributorCallGraph;
@@ -2515,16 +2610,6 @@ struct IntegerRangeState : public AbstractState {
     unionAssumed(R.getAssumed());
   }

-  /// Unite known range with the passed state.
-  void unionKnown(const ConstantRange &R) {
-    // Don't loose a known range.
-    Known = Known.unionWith(R);
-    Assumed = Assumed.unionWith(Known);
-  }
-
-  /// See IntegerRangeState::unionKnown(..).
-  void unionKnown(const IntegerRangeState &R) { unionKnown(R.getKnown()); }
-
   /// Intersect known range with the passed state.
   void intersectKnown(const ConstantRange &R) {
     Assumed = Assumed.intersectWith(R);
@@ -2554,8 +2639,8 @@ struct IntegerRangeState : public AbstractState {
   IntegerRangeState operator&=(const IntegerRangeState &R) {
     // NOTE: `&=` operator seems like `intersect` but in this case, we need to
     // take `union`.
-    unionKnown(R);
-    unionAssumed(R);
+    Known = Known.unionWith(R.getKnown());
+    Assumed = Assumed.unionWith(R.getAssumed());
     return *this;
   }
 };
@@ -3363,6 +3448,12 @@ protected:
   /// Returns true if \p I is known dead.
   virtual bool isKnownDead(const Instruction *I) const = 0;

+  /// Return true if the underlying value is a store that is known to be
+  /// removable. This is different from dead stores as the removable store
+  /// can have an effect on live values, especially loads, but that effect
+  /// is propagated which allows us to remove the store in turn.
+  virtual bool isRemovableStore() const { return false; }
+
   /// This method is used to check if at least one instruction in a collection
   /// of instructions is live.
   template <typename T> bool isLiveInstSet(T begin, T end) const {
@@ -3618,10 +3709,10 @@ struct AAAlign : public IRAttribute<
   AAAlign(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

   /// Return assumed alignment.
-  uint64_t getAssumedAlign() const { return getAssumed(); }
+  Align getAssumedAlign() const { return Align(getAssumed()); }

   /// Return known alignment.
-  uint64_t getKnownAlign() const { return getKnown(); }
+  Align getKnownAlign() const { return Align(getKnown()); }

   /// See AbstractAttribute::getName()
   const std::string getName() const override { return "AAAlign"; }
@@ -3641,6 +3732,46 @@ struct AAAlign : public IRAttribute<
   static const char ID;
 };

+/// An abstract interface to track if a value leaves its defining function
+/// instance.
+/// TODO: We should make it a ternary AA tracking uniqueness, and uniqueness
+/// wrt. the Attributor analysis separately.
+struct AAInstanceInfo : public StateWrapper<BooleanState, AbstractAttribute> {
+  AAInstanceInfo(const IRPosition &IRP, Attributor &A)
+      : StateWrapper<BooleanState, AbstractAttribute>(IRP) {}
+
+  /// Return true if we know that the underlying value is unique in its scope
+  /// wrt. the Attributor analysis. That means it might not be unique but we can
+  /// still use pointer equality without risking to represent two instances with
+  /// one `llvm::Value`.
+  bool isKnownUniqueForAnalysis() const { return isKnown(); }
+
+  /// Return true if we assume that the underlying value is unique in its scope
+  /// wrt. the Attributor analysis. That means it might not be unique but we can
+  /// still use pointer equality without risking to represent two instances with
+  /// one `llvm::Value`.
+  bool isAssumedUniqueForAnalysis() const { return isAssumed(); }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAInstanceInfo &createForPosition(const IRPosition &IRP,
+                                           Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  const std::string getName() const override { return "AAInstanceInfo"; }
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAInstanceInfo
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
 /// An abstract interface for all nocapture attributes.
 struct AANoCapture
     : public IRAttribute<
@@ -4150,13 +4281,14 @@ struct AAValueConstantRange

   /// Return an assumed constant for the associated value a program point \p
   /// CtxI.
-  Optional<ConstantInt *>
-  getAssumedConstantInt(Attributor &A,
-                        const Instruction *CtxI = nullptr) const {
+  Optional<Constant *>
+  getAssumedConstant(Attributor &A, const Instruction *CtxI = nullptr) const {
     ConstantRange RangeV = getAssumedConstantRange(A, CtxI);
-    if (auto *C = RangeV.getSingleElement())
-      return cast<ConstantInt>(
-          ConstantInt::get(getAssociatedValue().getType(), *C));
+    if (auto *C = RangeV.getSingleElement()) {
+      Type *Ty = getAssociatedValue().getType();
+      return cast_or_null<Constant>(
+          AA::getWithType(*ConstantInt::get(Ty->getContext(), *C), *Ty));
+    }
     if (RangeV.isEmptySet())
       return llvm::None;
     return nullptr;
@@ -4185,10 +4317,9 @@ struct AAValueConstantRange
 /// contains every possible value (i.e. we cannot in any way limit the value
 /// that the target position can take). That never happens naturally, we only
 /// force it. As for the conditions under which we force it, see
-/// AAPotentialValues.
-template <typename MemberTy, typename KeyInfo = DenseMapInfo<MemberTy>>
-struct PotentialValuesState : AbstractState {
-  using SetTy = DenseSet<MemberTy, KeyInfo>;
+/// AAPotentialConstantValues.
+template <typename MemberTy> struct PotentialValuesState : AbstractState {
+  using SetTy = SmallSetVector<MemberTy, 8>;

   PotentialValuesState() : IsValidState(true), UndefIsContained(false) {}

@@ -4247,7 +4378,7 @@ struct PotentialValuesState : AbstractState {
     return PotentialValuesState(true);
   }

-  static PotentialValuesState getBestState(PotentialValuesState &PVS) {
+  static PotentialValuesState getBestState(const PotentialValuesState &PVS) {
     return getBestState();
   }

@@ -4278,6 +4409,12 @@ struct PotentialValuesState : AbstractState {
     return *this;
   }

+protected:
+  SetTy &getAssumedSet() {
+    assert(isValidState() && "This set should not be used when it is invalid!");
+    return Set;
+  }
+
 private:
   /// Check the size of this set, and invalidate when the size is no
   /// less than \p MaxPotentialValues threshold.
@@ -4372,10 +4509,10 @@ raw_ostream &operator<<(raw_ostream &OS,
 /// operator we do not currently handle).
 ///
 /// TODO: Support values other than constant integers.
-struct AAPotentialValues
+struct AAPotentialConstantValues
     : public StateWrapper<PotentialConstantIntValuesState, AbstractAttribute> {
   using Base = StateWrapper<PotentialConstantIntValuesState, AbstractAttribute>;
-  AAPotentialValues(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+  AAPotentialConstantValues(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

   /// See AbstractAttribute::getState(...).
   PotentialConstantIntValuesState &getState() override { return *this; }
@@ -4384,22 +4521,23 @@ struct AAPotentialConstantValues
   }

   /// Create an abstract attribute view for the position \p IRP.
-  static AAPotentialValues &createForPosition(const IRPosition &IRP,
-                                              Attributor &A);
+  static AAPotentialConstantValues &createForPosition(const IRPosition &IRP,
+                                                      Attributor &A);

   /// Return assumed constant for the associated value
-  Optional<ConstantInt *>
-  getAssumedConstantInt(Attributor &A,
-                        const Instruction *CtxI = nullptr) const {
+  Optional<Constant *>
+  getAssumedConstant(Attributor &A, const Instruction *CtxI = nullptr) const {
     if (!isValidState())
       return nullptr;
-    if (getAssumedSet().size() == 1)
-      return cast<ConstantInt>(ConstantInt::get(getAssociatedValue().getType(),
-                                                *(getAssumedSet().begin())));
+    if (getAssumedSet().size() == 1) {
+      Type *Ty = getAssociatedValue().getType();
+      return cast_or_null<Constant>(AA::getWithType(
+          *ConstantInt::get(Ty->getContext(), *(getAssumedSet().begin())),
+          *Ty));
+    }
     if (getAssumedSet().size() == 0) {
       if (undefIsContained())
-        return cast<ConstantInt>(
-            ConstantInt::get(getAssociatedValue().getType(), 0));
+        return UndefValue::get(getAssociatedValue().getType());
       return llvm::None;
     }

@@ -4407,13 +4545,15 @@ struct AAPotentialConstantValues
   }

   /// See AbstractAttribute::getName()
-  const std::string getName() const override { return "AAPotentialValues"; }
+  const std::string getName() const override {
+    return "AAPotentialConstantValues";
+  }

   /// See AbstractAttribute::getIdAddr()
   const char *getIdAddr() const override { return &ID; }

   /// This function should return true if the type of the \p AA is
-  /// AAPotentialValues
+  /// AAPotentialConstantValues
   static bool classof(const AbstractAttribute *AA) {
     return (AA->getIdAddr() == &ID);
   }
@@ -4744,12 +4884,10 @@ struct AAPointerInfo : public AbstractAttribute {
     Instruction *getRemoteInst() const { return RemoteI; }

     /// Return true if the value written is not known yet.
-    bool isWrittenValueYetUndetermined() const { return !Content.hasValue(); }
+    bool isWrittenValueYetUndetermined() const { return !Content; }

     /// Return true if the value written cannot be determined at all.
-    bool isWrittenValueUnknown() const {
-      return Content.hasValue() && !*Content;
-    }
+    bool isWrittenValueUnknown() const { return Content && !*Content; }

     /// Return the type associated with the access, if known.
     Type *getType() const { return Ty; }
@@ -4792,21 +4930,55 @@ struct AAPointerInfo : public AbstractAttribute {
   /// See AbstractAttribute::getIdAddr()
   const char *getIdAddr() const override { return &ID; }

-  /// Call \p CB on all accesses that might interfere with \p LI and return true
-  /// if all such accesses were known and the callback returned true for all of
-  /// them, false otherwise.
-  virtual bool forallInterferingAccesses(
-      LoadInst &LI, function_ref<bool(const Access &, bool)> CB) const = 0;
+  /// Helper to represent an access offset and size, with logic to deal with
+  /// uncertainty and check for overlapping accesses.
+  struct OffsetAndSize : public std::pair<int64_t, int64_t> {
+    using BaseTy = std::pair<int64_t, int64_t>;
+    OffsetAndSize(int64_t Offset, int64_t Size) : BaseTy(Offset, Size) {}
+    OffsetAndSize(const BaseTy &P) : BaseTy(P) {}
+    int64_t getOffset() const { return first; }
+    int64_t getSize() const { return second; }
+    static OffsetAndSize getUnknown() {
+      return OffsetAndSize(Unknown, Unknown);
+    }
+
+    /// Return true if offset or size are unknown.
+    bool offsetOrSizeAreUnknown() const {
+      return getOffset() == OffsetAndSize::Unknown ||
+             getSize() == OffsetAndSize::Unknown;
+    }
+
+    /// Return true if this offset and size pair might describe an address that
+    /// overlaps with \p OAS.
+    bool mayOverlap(const OffsetAndSize &OAS) const {
+      // Any unknown value and we are giving up -> overlap.
+      if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
+        return true;
+
+      // Check if one offset point is in the other interval [offset,
+      // offset+size].
+      return OAS.getOffset() + OAS.getSize() > getOffset() &&
+             OAS.getOffset() < getOffset() + getSize();
+    }
+
+    /// Constant used to represent unknown offset or sizes.
+    static constexpr int64_t Unknown = 1 << 31;
+  };
+
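// Hedged worked example (not part of the patch): accesses [8, 8+8) and
// [12, 12+4) overlap because 12 < 8 + 8 and 12 + 4 > 8, while an unknown
// offset or size conservatively overlaps everything:
//
//   AAPointerInfo::OffsetAndSize A(/*Offset=*/8, /*Size=*/8);
//   AAPointerInfo::OffsetAndSize B(/*Offset=*/12, /*Size=*/4);
//   assert(A.mayOverlap(B) && B.mayOverlap(A));
//   assert(AAPointerInfo::OffsetAndSize::getUnknown().mayOverlap(A));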
+  /// Call \p CB on all accesses that might interfere with \p OAS and return
+  /// true if all such accesses were known and the callback returned true for
+  /// all of them, false otherwise. An access interferes with an offset-size
+  /// pair if it might read or write that memory region.
   virtual bool forallInterferingAccesses(
-      StoreInst &SI, function_ref<bool(const Access &, bool)> CB) const = 0;
+      OffsetAndSize OAS, function_ref<bool(const Access &, bool)> CB) const = 0;

-  /// Call \p CB on all write accesses that might interfere with \p LI and
+  /// Call \p CB on all accesses that might interfere with \p I and
   /// return true if all such accesses were known and the callback returned true
   /// for all of them, false otherwise. In contrast to forallInterferingAccesses
   /// this function will perform reasoning to exclude write accesses that cannot
   /// affect the load even if they on the surface look as if they would.
-  virtual bool forallInterferingWrites(
-      Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI,
+  virtual bool forallInterferingAccesses(
+      Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I,
       function_ref<bool(const Access &, bool)> CB) const = 0;

   /// This function should return true if the type of the \p AA is AAPointerInfo
diff --git a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
index 496ceea12bc9..a71fa3bf404d 100644
--- a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
+++ b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
@@ -66,25 +66,24 @@ public:
     }
   };

-  /// Liveness enum - During our initial pass over the program, we determine
-  /// that things are either alive or maybe alive. We don't mark anything
-  /// explicitly dead (even if we know they are), since anything not alive
-  /// with no registered uses (in Uses) will never be marked alive and will
-  /// thus become dead in the end.
+  /// During our initial pass over the program, we determine that things are
+  /// either alive or maybe alive. We don't mark anything explicitly dead (even
+  /// if we know they are), since anything not alive with no registered uses
+  /// (in Uses) will never be marked alive and will thus become dead in the end.
   enum Liveness { Live, MaybeLive };

-  DeadArgumentEliminationPass(bool ShouldHackArguments_ = false)
-      : ShouldHackArguments(ShouldHackArguments_) {}
+  DeadArgumentEliminationPass(bool ShouldHackArguments = false)
+      : ShouldHackArguments(ShouldHackArguments) {}

   PreservedAnalyses run(Module &M, ModuleAnalysisManager &);

   /// Convenience wrapper
-  RetOrArg CreateRet(const Function *F, unsigned Idx) {
+  RetOrArg createRet(const Function *F, unsigned Idx) {
     return RetOrArg(F, Idx, false);
   }

   /// Convenience wrapper
-  RetOrArg CreateArg(const Function *F, unsigned Idx) {
+  RetOrArg createArg(const Function *F, unsigned Idx) {
     return RetOrArg(F, Idx, true);
   }

@@ -122,21 +121,21 @@ public:
   bool ShouldHackArguments = false;

 private:
-  Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
-  Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses,
+  Liveness markIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
+  Liveness surveyUse(const Use *U, UseVector &MaybeLiveUses,
                      unsigned RetValNum = -1U);
-  Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
+  Liveness surveyUses(const Value *V, UseVector &MaybeLiveUses);

-  void SurveyFunction(const Function &F);
-  bool IsLive(const RetOrArg &RA);
-  void MarkValue(const RetOrArg &RA, Liveness L,
+  void surveyFunction(const Function &F);
+  bool isLive(const RetOrArg &RA);
+  void markValue(const RetOrArg &RA, Liveness L,
                  const UseVector &MaybeLiveUses);
-  void MarkLive(const RetOrArg &RA);
-  void MarkLive(const Function &F);
-  void PropagateLiveness(const RetOrArg &RA);
-  bool RemoveDeadStuffFromFunction(Function *F);
-  bool DeleteDeadVarargs(Function &Fn);
-  bool RemoveDeadArgumentsFromCallers(Function &Fn);
+  void markLive(const RetOrArg &RA);
+  void markLive(const Function &F);
+  void propagateLiveness(const RetOrArg &RA);
+  bool removeDeadStuffFromFunction(Function *F);
+  bool deleteDeadVarargs(Function &F);
+  bool removeDeadArgumentsFromCallers(Function &F);
 };

 } // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h
index a2b93f8aa30d..07c7cac77354 100644
--- a/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h
@@ -14,9 +14,10 @@
 #define LLVM_TRANSFORMS_IPO_FORCEFUNCTIONATTRS_H

 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"

 namespace llvm {
+class Module;
+class Pass;

 /// Pass which forces specific function attributes into the IR, primarily as
 /// a debugging tool.
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
index 0b6734a3929d..bcb75025f8e5 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
@@ -15,29 +15,22 @@
 #ifndef LLVM_TRANSFORMS_IPO_FUNCTIONATTRS_H
 #define LLVM_TRANSFORMS_IPO_FUNCTIONATTRS_H

+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/PassManager.h"

 namespace llvm {

-class AAResults;
+class GlobalValueSummary;
+class ModuleSummaryIndex;
 class Function;
 class Module;
 class Pass;

-/// The three kinds of memory access relevant to 'readonly' and
-/// 'readnone' attributes.
-enum MemoryAccessKind {
-  MAK_ReadNone = 0,
-  MAK_ReadOnly = 1,
-  MAK_MayWrite = 2,
-  MAK_WriteOnly = 3
-};
-
 /// Returns the memory access properties of this copy of the function.
-MemoryAccessKind computeFunctionBodyMemoryAccess(Function &F, AAResults &AAR);
+FunctionModRefBehavior computeFunctionBodyMemoryAccess(Function &F,
+                                                       AAResults &AAR);
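// Hedged illustration (not part of the patch): consuming the new return type;
// FMRB_DoesNotAccessMemory and AliasAnalysis::onlyReadsMemory are the
// pre-existing FunctionModRefBehavior helpers, and F/AAR are assumed locals:
//
//   FunctionModRefBehavior MRB = computeFunctionBodyMemoryAccess(F, AAR);
//   if (MRB == FMRB_DoesNotAccessMemory)
//     F.setDoesNotAccessMemory();                  // readnone
//   else if (AliasAnalysis::onlyReadsMemory(MRB))
//     F.setOnlyReadsMemory();                      // readonly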

 /// Propagate function attributes for function summaries along the index's
 /// callgraph during thinlink
diff --git a/llvm/include/llvm/Transforms/IPO/GlobalDCE.h b/llvm/include/llvm/Transforms/IPO/GlobalDCE.h
index 0a6851849e7e..a24196efb83b 100644
--- a/llvm/include/llvm/Transforms/IPO/GlobalDCE.h
+++ b/llvm/include/llvm/Transforms/IPO/GlobalDCE.h
@@ -19,11 +19,18 @@

 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/PassManager.h"
 #include <unordered_map>

 namespace llvm {
+class Comdat;
+class Constant;
+class Function;
+class GlobalVariable;
+class Metadata;
+class Module;
+class Value;

 /// Pass to remove unused function declarations.
 class GlobalDCEPass : public PassInfoMixin<GlobalDCEPass> {
diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
index e4807a1c9c65..315587e0f922 100644
--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -43,14 +43,13 @@

 #include "llvm/Analysis/IRSimilarityIdentifier.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/IR/ValueMap.h"
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
-#include

 struct OutlinableGroup;

 namespace llvm {
+using namespace CallingConv;
 using namespace IRSimilarity;

 class Module;
@@ -86,6 +85,13 @@ struct OutlinableRegion {
   DenseMap<unsigned, unsigned> ExtractedArgToAgg;
   DenseMap<unsigned, unsigned> AggArgToExtracted;

+  /// Values in the outlined functions will often be replaced by arguments. When
+  /// finding corresponding values from one region to another, the found value
+  /// will be the value the argument previously replaced. This structure maps
+  /// any replaced values for the region to the aggregate argument
+  /// in the overall function.
+  DenseMap<Value *, Value *> RemappedArguments;
+
   /// Marks whether we need to change the order of the arguments when mapping
   /// the old extracted function call to the new aggregate outlined function
   /// call.
@@ -168,6 +174,15 @@ struct OutlinableRegion {
   /// \return The corresponding Value to \p V if it exists, otherwise nullptr.
   Value *findCorrespondingValueIn(const OutlinableRegion &Other, Value *V);

+  /// Find a corresponding BasicBlock for \p BB in similar OutlinableRegion \p Other.
+  ///
+  /// \param Other [in] - The OutlinableRegion to find the corresponding
+  /// BasicBlock in.
+  /// \param BB [in] - The BasicBlock to look for in the other region.
+  /// \return The corresponding BasicBlock to \p BB if it exists, otherwise nullptr.
+  BasicBlock *findCorrespondingBlockIn(const OutlinableRegion &Other,
+                                       BasicBlock *BB);
+
   /// Get the size of the code removed from the region.
   ///
   /// \param [in] TTI - The TargetTransformInfo for the parent function.
@@ -372,6 +387,25 @@ private:
       // the call in outlined functions.
       if (CI.canReturnTwice())
        return false;
+      // TODO: Update the outliner to capture whether the outlined function
+      // needs these extra attributes.
+
+      // Functions marked with the swifttailcc and tailcc calling conventions
+      // require special handling when outlining musttail functions. The
+      // calling convention must be passed down to the outlined function as
+      // well. Further, there is special handling for musttail calls as well,
+      // requiring a return call directly after. For now, the outliner does not
+      // support this.
+      bool IsTailCC = CI.getCallingConv() == CallingConv::SwiftTail ||
+                      CI.getCallingConv() == CallingConv::Tail;
+      if (IsTailCC && !EnableMustTailCalls)
+        return false;
+      if (CI.isMustTailCall() && !EnableMustTailCalls)
+        return false;
+      // The outliner can only handle musttail items if it is also accompanied
+      // by the tailcc or swifttailcc calling convention.
+      if (CI.isMustTailCall() && !IsTailCC)
+        return false;
       return true;
     }
     // TODO: Handle FreezeInsts. Since a frozen value could be frozen inside
@@ -397,6 +431,9 @@ private:
   // The flag variable that marks whether we should allow intrinsics
   // instructions to be outlined.
   bool EnableIntrinsics = false;
+
+  // The flag variable that marks whether we should allow musttail calls.
+  bool EnableMustTailCalls = false;
 };

 /// An InstVisitor used to exclude certain instructions from being outlined.
diff --git a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
index 302695d96355..880af2b46d7f 100644
--- a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
@@ -15,11 +15,11 @@
 #ifndef LLVM_TRANSFORMS_IPO_INFERFUNCTIONATTRS_H
 #define LLVM_TRANSFORMS_IPO_INFERFUNCTIONATTRS_H

-#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"

 namespace llvm {
+class Module;
+class Pass;

 /// A pass which infers function attributes from the names and signatures of
 /// function declarations in a module.
diff --git a/llvm/include/llvm/Transforms/IPO/Inliner.h b/llvm/include/llvm/Transforms/IPO/Inliner.h
index a7060943c4c0..1e154eb8f5da 100644
--- a/llvm/include/llvm/Transforms/IPO/Inliner.h
+++ b/llvm/include/llvm/Transforms/IPO/Inliner.h
@@ -16,7 +16,6 @@
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
 #include "llvm/IR/PassManager.h"
-#include

 namespace llvm {

@@ -96,7 +95,9 @@ protected:
 /// passes be composed to achieve the same end result.
 class InlinerPass : public PassInfoMixin<InlinerPass> {
 public:
-  InlinerPass(bool OnlyMandatory = false) : OnlyMandatory(OnlyMandatory) {}
+  InlinerPass(bool OnlyMandatory = false,
+              ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
+      : OnlyMandatory(OnlyMandatory), LTOPhase(LTOPhase) {}
   InlinerPass(InlinerPass &&Arg) = default;

   PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
@@ -110,6 +111,7 @@ private:
                                    FunctionAnalysisManager &FAM, Module &M);
   std::unique_ptr<InlineAdvisor> OwnedAdvisor;
   const bool OnlyMandatory;
+  const ThinOrFullLTOPhase LTOPhase;
 };

 /// Module pass, wrapping the inliner pass. This works in conjunction with the
@@ -122,6 +124,7 @@ class ModuleInlinerWrapperPass
 public:
   ModuleInlinerWrapperPass(
       InlineParams Params = getInlineParams(), bool MandatoryFirst = true,
+      InlineContext IC = {},
       InliningAdvisorMode Mode = InliningAdvisorMode::Default,
       unsigned MaxDevirtIterations = 0);
   ModuleInlinerWrapperPass(ModuleInlinerWrapperPass &&Arg) = default;
@@ -147,6 +150,7 @@ public:

 private:
   const InlineParams Params;
+  const InlineContext IC;
   const InliningAdvisorMode Mode;
   const unsigned MaxDevirtIterations;
   // TODO: Clean this up so we only have one ModulePassManager.
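// Hedged illustration (not part of the patch): threading the LTO phase
// through the new InlinerPass constructor when building a CGSCC pipeline; the
// phase value here is illustrative:
//
//   CGSCCPassManager CGPM;
//   CGPM.addPass(InlinerPass(/*OnlyMandatory=*/false,
//                            ThinOrFullLTOPhase::ThinLTOPreLink));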
diff --git a/llvm/include/llvm/Transforms/IPO/Internalize.h b/llvm/include/llvm/Transforms/IPO/Internalize.h
index 41816df93360..adcf5a932be0 100644
--- a/llvm/include/llvm/Transforms/IPO/Internalize.h
+++ b/llvm/include/llvm/Transforms/IPO/Internalize.h
@@ -23,7 +23,6 @@

 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/PassManager.h"
 #include <functional>
diff --git a/llvm/include/llvm/Transforms/IPO/ModuleInliner.h b/llvm/include/llvm/Transforms/IPO/ModuleInliner.h
index 7474e48aafaf..24cfff6083ff 100644
--- a/llvm/include/llvm/Transforms/IPO/ModuleInliner.h
+++ b/llvm/include/llvm/Transforms/IPO/ModuleInliner.h
@@ -11,10 +11,7 @@

 #include "llvm/Analysis/InlineAdvisor.h"
 #include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ReplayInlineAdvisor.h"
-#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
 #include "llvm/IR/PassManager.h"
-#include

 namespace llvm {

@@ -30,8 +27,9 @@ namespace llvm {
 class ModuleInlinerPass : public PassInfoMixin<ModuleInlinerPass> {
 public:
   ModuleInlinerPass(InlineParams Params = getInlineParams(),
-                    InliningAdvisorMode Mode = InliningAdvisorMode::Default)
-      : Params(Params), Mode(Mode){};
+                    InliningAdvisorMode Mode = InliningAdvisorMode::Default,
+                    ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
+      : Params(Params), Mode(Mode), LTOPhase(LTOPhase){};
   ModuleInlinerPass(ModuleInlinerPass &&Arg) = default;

   PreservedAnalyses run(Module &, ModuleAnalysisManager &);
@@ -42,6 +40,7 @@ private:
   std::unique_ptr<InlineAdvisor> OwnedAdvisor;
   const InlineParams Params;
   const InliningAdvisorMode Mode;
+  const ThinOrFullLTOPhase LTOPhase;
 };

 } // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 3b944878a810..2676f2705424 100644
--- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -16,7 +16,6 @@

 #include "llvm-c/Transforms/PassManagerBuilder.h"
 #include <functional>
-#include <memory>
 #include <string>
 #include <vector>

@@ -214,7 +213,6 @@ private:
   void addInitialAliasAnalysisPasses(legacy::PassManagerBase &PM) const;
   void addLTOOptimizationPasses(legacy::PassManagerBase &PM);
   void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM);
-  void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
   void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
   void addVectorPasses(legacy::PassManagerBase &PM, bool IsFullLTO);

@@ -226,8 +224,6 @@ public:
   /// populateModulePassManager - This sets up the primary pass manager.
   void populateModulePassManager(legacy::PassManagerBase &MPM);
-  void populateLTOPassManager(legacy::PassManagerBase &PM);
-  void populateThinLTOPassManager(legacy::PassManagerBase &PM);
 };

 /// Registers a function for adding a standard set of passes. This should be
diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
index 893654650caa..fff06da22cf3 100644
--- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
+++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
@@ -18,9 +18,6 @@
 #include <queue>
 #include <set>

-using namespace llvm;
-using namespace sampleprof;
-
 namespace llvm {
 namespace sampleprof {

@@ -51,10 +48,10 @@ struct ProfiledCallGraphNode {
     }
   };

-  using iterator = std::set<ProfiledCallGraphEdge>::iterator;
-  using const_iterator = std::set<ProfiledCallGraphEdge>::const_iterator;
   using edge = ProfiledCallGraphEdge;
-  using edges = std::set<edge>;
+  using edges = std::set<edge, ProfiledCallGraphEdgeComparer>;
+  using iterator = edges::iterator;
+  using const_iterator = edges::const_iterator;

   ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {}

@@ -64,11 +61,11 @@ struct ProfiledCallGraphNode {

 class ProfiledCallGraph {
 public:
-  using iterator = std::set<ProfiledCallGraphEdge>::iterator;
+  using iterator = ProfiledCallGraphNode::iterator;

   // Constructor for non-CS profile.
   ProfiledCallGraph(SampleProfileMap &ProfileMap) {
-    assert(!FunctionSamples::ProfileIsCSFlat &&
+    assert(!FunctionSamples::ProfileIsCS &&
            "CS flat profile is not handled here");
     for (const auto &Samples : ProfileMap) {
       addProfiledCalls(Samples.second);
diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
index cf87d028600f..a97d5ee3d710 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -15,20 +15,18 @@
 #ifndef LLVM_TRANSFORMS_IPO_SAMPLECONTEXTTRACKER_H
 #define LLVM_TRANSFORMS_IPO_SAMPLECONTEXTTRACKER_H

-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/ProfileData/SampleProf.h"
-#include <list>
 #include <map>
+#include <queue>
 #include <vector>

-using namespace llvm;
-using namespace sampleprof;
-
 namespace llvm {
+class CallBase;
+class DILocation;
+class Function;
+class Instruction;

 // Internal trie tree representation used for tracking context tree and sample
 // profiles. The path from root node to a given node represents the context of
@@ -47,11 +45,6 @@ public:
   ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
                                            StringRef ChildName,
                                            bool AllowCreate = true);
-
-  ContextTrieNode &moveToChildContext(const LineLocation &CallSite,
-                                      ContextTrieNode &&NodeToMove,
-                                      uint32_t ContextFramesToRemove,
-                                      bool DeleteNode = true);
   void removeChildContext(const LineLocation &CallSite, StringRef ChildName);
   std::map<uint64_t, ContextTrieNode> &getAllChildContext();
   StringRef getFuncName() const;
@@ -62,6 +55,7 @@ public:
   LineLocation getCallSiteLoc() const;
   ContextTrieNode *getParentContext() const;
   void setParentContext(ContextTrieNode *Parent);
+  void setCallSiteLoc(const LineLocation &Loc);
   void dumpNode();
   void dumpTree();

@@ -94,22 +88,13 @@ private:
 // calling context and the context is identified by path from root to the node.
 class SampleContextTracker {
 public:
-  struct ProfileComparer {
-    bool operator()(FunctionSamples *A, FunctionSamples *B) const {
-      // Sort function profiles by the number of total samples and their
-      // contexts.
-      if (A->getTotalSamples() == B->getTotalSamples())
-        return A->getContext() < B->getContext();
-      return A->getTotalSamples() > B->getTotalSamples();
-    }
-  };
-
-  // Keep profiles of a function sorted so that they will be processed/promoted
-  // deterministically.
- using ContextSamplesTy = std::set<FunctionSamples *, ProfileComparer>; + using ContextSamplesTy = std::vector<FunctionSamples *>; + SampleContextTracker() = default; SampleContextTracker(SampleProfileMap &Profiles, const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap); + // Populate the FuncToCtxtProfiles map after the trie is built. + void populateFuncToCtxtMap(); // Query context profile for a specific callee with given name at a given // call-site. The full context is identified by location of call instruction. FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst, @@ -125,6 +110,8 @@ public: // Get all context profile for given function. ContextSamplesTy &getAllContextSamplesFor(const Function &Func); ContextSamplesTy &getAllContextSamplesFor(StringRef Name); + ContextTrieNode *getOrCreateContextPath(const SampleContext &Context, + bool AllowCreate); // Query base profile for a given function. A base profile is a merged view // of all context profiles for contexts that are not inlined. FunctionSamples *getBaseSamplesFor(const Function &Func, @@ -142,6 +129,64 @@ public: ContextTrieNode &getRootContext(); void promoteMergeContextSamplesTree(const Instruction &Inst, StringRef CalleeName); + + // Create a merged context-less profile map. + void createContextLessProfileMap(SampleProfileMap &ContextLessProfiles); + ContextTrieNode * + getContextNodeForProfile(const FunctionSamples *FSamples) const { + auto I = ProfileToNodeMap.find(FSamples); + if (I == ProfileToNodeMap.end()) + return nullptr; + return I->second; + } + StringMap<ContextSamplesTy> &getFuncToCtxtProfiles() { + return FuncToCtxtProfiles; + } + + class Iterator : public std::iterator<std::forward_iterator_tag, ContextTrieNode *> { + std::queue<ContextTrieNode *> NodeQueue; + + public: + explicit Iterator() = default; + explicit Iterator(ContextTrieNode *Node) { NodeQueue.push(Node); } + Iterator &operator++() { + assert(!NodeQueue.empty() && "Iterator already at the end"); + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + for (auto &It : Node->getAllChildContext()) + NodeQueue.push(&It.second); + return *this; + } + + Iterator operator++(int) { + assert(!NodeQueue.empty() && "Iterator already at the end"); + Iterator Ret = *this; + ++(*this); + return Ret; + } + bool operator==(const Iterator &Other) const { + if (NodeQueue.empty() && Other.NodeQueue.empty()) + return true; + if (NodeQueue.empty() || Other.NodeQueue.empty()) + return false; + return NodeQueue.front() == Other.NodeQueue.front(); + } + bool operator!=(const Iterator &Other) const { return !(*this == Other); } + ContextTrieNode *operator*() const { + assert(!NodeQueue.empty() && "Invalid access to end iterator"); + return NodeQueue.front(); + } + }; + + Iterator begin() { return Iterator(&RootContext); } + Iterator end() { return Iterator(); } + +#ifndef NDEBUG + // Get a context string from root to current node. + std::string getContextString(const FunctionSamples &FSamples) const; + std::string getContextString(ContextTrieNode *Node) const; +#endif // Dump the internal context profile trie.
void dump(); @@ -149,21 +194,26 @@ private: ContextTrieNode *getContextFor(const DILocation *DIL); ContextTrieNode *getCalleeContextFor(const DILocation *DIL, StringRef CalleeName); - ContextTrieNode *getOrCreateContextPath(const SampleContext &Context, - bool AllowCreate); ContextTrieNode *getTopLevelContextNode(StringRef FName); ContextTrieNode &addTopLevelContextNode(StringRef FName); ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo); - void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode, - uint32_t ContextFramesToRemove); + void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode); ContextTrieNode & promoteMergeContextSamplesTree(ContextTrieNode &FromNode, - ContextTrieNode &ToNodeParent, - uint32_t ContextFramesToRemove); - + ContextTrieNode &ToNodeParent); + ContextTrieNode &moveContextSamples(ContextTrieNode &ToNodeParent, + const LineLocation &CallSite, + ContextTrieNode &&NodeToMove); + void setContextNode(const FunctionSamples *FSample, ContextTrieNode *Node) { + ProfileToNodeMap[FSample] = Node; + } // Map from function name to context profiles (excluding base profile) StringMap<ContextSamplesTy> FuncToCtxtProfiles; + // Map from a FunctionSamples object to its node in the context trie. + std::unordered_map<const FunctionSamples *, ContextTrieNode *> + ProfileToNodeMap; + // Map from function guid to real function names. Only used in md5 mode. const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap; diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfile.h b/llvm/include/llvm/Transforms/IPO/SampleProfile.h index 704b793ab3ea..d838c8b8a83e 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfile.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfile.h @@ -36,7 +36,7 @@ public: private: std::string ProfileFileName; std::string ProfileRemappingFileName; - ThinOrFullLTOPhase LTOPhase; + const ThinOrFullLTOPhase LTOPhase; }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h index e73c36043cb2..ed296d2dd080 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h @@ -16,17 +16,19 @@ #define LLVM_TRANSFORMS_IPO_SAMPLEPROFILEPROBE_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LazyCallGraph.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/PassInstrumentation.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PseudoProbe.h" #include "llvm/ProfileData/SampleProf.h" -#include "llvm/Target/TargetMachine.h" #include namespace llvm { +class Any; +class BasicBlock; +class Function; +class Instruction; +class Loop; +class PassInstrumentationCallbacks; +class TargetMachine; class Module; diff --git a/llvm/include/llvm/Transforms/IPO/StripDeadPrototypes.h b/llvm/include/llvm/Transforms/IPO/StripDeadPrototypes.h index f4a15c36afc9..4a2eaad63113 100644 --- a/llvm/include/llvm/Transforms/IPO/StripDeadPrototypes.h +++ b/llvm/include/llvm/Transforms/IPO/StripDeadPrototypes.h @@ -16,11 +16,12 @@ #ifndef LLVM_TRANSFORMS_IPO_STRIPDEADPROTOTYPES_H #define LLVM_TRANSFORMS_IPO_STRIPDEADPROTOTYPES_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Module; + /// Pass to remove unused function declarations.
struct StripDeadPrototypesPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &); diff --git a/llvm/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h b/llvm/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h index 7acb922b37e1..469cf2bc5011 100644 --- a/llvm/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h +++ b/llvm/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h @@ -17,9 +17,10 @@ #define LLVM_TRANSFORMS_IPO_THINLTOBITCODEWRITER_H #include -#include namespace llvm { +class Module; +class raw_ostream; class ThinLTOBitcodeWriterPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h index 2e9744cfd524..47c137e70a7f 100644 --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -14,16 +14,17 @@ #ifndef LLVM_TRANSFORMS_IPO_WHOLEPROGRAMDEVIRT_H #define LLVM_TRANSFORMS_IPO_WHOLEPROGRAMDEVIRT_H -#include "llvm/IR/Module.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/IPO/FunctionImport.h" #include #include +#include #include #include #include namespace llvm { +class Module; template class ArrayRef; template class MutableArrayRef; diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h index 6dee38c83b36..35a3a8c3218b 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h @@ -18,6 +18,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" #define DEBUG_TYPE "instcombine" #include "llvm/Transforms/Utils/InstructionWorklist.h" diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index a288a3972c3d..9ff45fc29b06 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -15,6 +15,10 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include #include #include @@ -75,21 +79,6 @@ struct GCOVOptions { std::string Exclude; }; -ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = - GCOVOptions::getDefault()); - -// PGO Instrumention. Parameter IsCS indicates if this is the context sensitive -// instrumentation. -ModulePass *createPGOInstrumentationGenLegacyPass(bool IsCS = false); -ModulePass * -createPGOInstrumentationUseLegacyPass(StringRef Filename = StringRef(""), - bool IsCS = false); -ModulePass *createPGOInstrumentationGenCreateVarLegacyPass( - StringRef CSInstrName = StringRef("")); -ModulePass *createPGOIndirectCallPromotionLegacyPass(bool InLTO = false, - bool SamplePGO = false); -FunctionPass *createPGOMemOPSizeOptLegacyPass(); - ModulePass *createCGProfileLegacyPass(); // The pgo-specific indirect call promotion function declared below is used by @@ -194,6 +183,26 @@ static inline uint32_t scaleBranchCount(uint64_t Count, uint64_t Scale) { assert(Scaled <= std::numeric_limits::max() && "overflow 32-bits"); return Scaled; } + +// Use to ensure the inserted instrumentation has a DebugLocation; if none is +// attached to the source instruction, try to use a DILocation with offset 0 +// scoped to surrounding function (if it has a DebugLocation). 
+// +// Some non-call instructions may be missing debug info, but when inserting +// instrumentation calls, some builds (e.g. LTO) want calls to have debug info +// if the enclosing function does. +struct InstrumentationIRBuilder : IRBuilder<> { + static void ensureDebugInfo(IRBuilder<> &IRB, const Function &F) { + if (IRB.getCurrentDebugLocation()) + return; + if (DISubprogram *SP = F.getSubprogram()) + IRB.SetCurrentDebugLocation(DILocation::get(SP->getContext(), 0, 0, SP)); + } + + explicit InstrumentationIRBuilder(Instruction *IP) : IRBuilder<>(IP) { + ensureDebugInfo(*this, *IP->getFunction()); + } +}; } // end namespace llvm #endif // LLVM_TRANSFORMS_INSTRUMENTATION_H diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h index a0d8118c23f7..d12b2cf45825 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h @@ -13,82 +13,17 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZER_H #define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZER_H -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerOptions.h" namespace llvm { - -/// Frontend-provided metadata for source location. -struct LocationMetadata { - StringRef Filename; - int LineNo = 0; - int ColumnNo = 0; - - LocationMetadata() = default; - - bool empty() const { return Filename.empty(); } - void parse(MDNode *MDN); -}; - -/// Frontend-provided metadata for global variables. -class GlobalsMetadata { -public: - struct Entry { - LocationMetadata SourceLoc; - StringRef Name; - bool IsDynInit = false; - bool IsExcluded = false; - - Entry() = default; - }; - - /// Create a default uninitialized GlobalsMetadata instance. - GlobalsMetadata() = default; - - /// Create an initialized GlobalsMetadata instance. - GlobalsMetadata(Module &M); - - /// Returns metadata entry for a given global. - Entry get(GlobalVariable *G) const { - auto Pos = Entries.find(G); - return (Pos != Entries.end()) ? Pos->second : Entry(); - } - - /// Handle invalidation from the pass manager. - /// These results are never invalidated. - bool invalidate(Module &, const PreservedAnalyses &, - ModuleAnalysisManager::Invalidator &) { - return false; - } - bool invalidate(Function &, const PreservedAnalyses &, - FunctionAnalysisManager::Invalidator &) { - return false; - } - -private: - DenseMap Entries; -}; - -/// The ASanGlobalsMetadataAnalysis initializes and returns a GlobalsMetadata -/// object. More specifically, ASan requires looking at all globals registered -/// in 'llvm.asan.globals' before running, which only depends on reading module -/// level metadata. This analysis is required to run before running the -/// AddressSanitizerPass since it collects that metadata. -/// The legacy pass manager equivalent of this is ASanGlobalsMetadataLegacyPass. 
-class ASanGlobalsMetadataAnalysis - : public AnalysisInfoMixin { -public: - using Result = GlobalsMetadata; - - Result run(Module &, ModuleAnalysisManager &); - -private: - friend AnalysisInfoMixin; - static AnalysisKey Key; -}; +class Function; +class FunctionPass; +class GlobalVariable; +class MDNode; +class Module; +class ModulePass; +class raw_ostream; struct AddressSanitizerOptions { bool CompileKernel = false; @@ -98,26 +33,6 @@ struct AddressSanitizerOptions { AsanDetectStackUseAfterReturnMode::Runtime; }; -/// Public interface to the address sanitizer pass for instrumenting code to -/// check for various memory errors at runtime. -/// -/// The sanitizer itself is a function pass that works by inserting various -/// calls to the ASan runtime library functions. The runtime library essentially -/// replaces malloc() and free() with custom implementations that allow regions -/// surrounding requested memory to be checked for invalid accesses. -class AddressSanitizerPass : public PassInfoMixin { -public: - AddressSanitizerPass(const AddressSanitizerOptions &Options) - : Options(Options){}; - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - void printPipeline(raw_ostream &OS, - function_ref MapClassName2PassName); - static bool isRequired() { return true; } - -private: - AddressSanitizerOptions Options; -}; - /// Public interface to the address sanitizer module pass for instrumenting code /// to check for various memory errors. /// @@ -142,17 +57,6 @@ private: AsanDtorKind DestructorKind; }; -// Insert AddressSanitizer (address basic correctness checking) instrumentation -FunctionPass *createAddressSanitizerFunctionPass( - bool CompileKernel = false, bool Recover = false, - bool UseAfterScope = false, - AsanDetectStackUseAfterReturnMode UseAfterReturn = - AsanDetectStackUseAfterReturnMode::Runtime); -ModulePass *createModuleAddressSanitizerLegacyPassPass( - bool CompileKernel = false, bool Recover = false, bool UseGlobalsGC = true, - bool UseOdrIndicator = true, - AsanDtorKind DestructorKind = AsanDtorKind::Global); - struct ASanAccessInfo { const int32_t Packed; const uint8_t AccessSizeIndex; diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h index 0a5456c5956f..7858a1c4b2fd 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h @@ -47,51 +47,6 @@ public: Value *getPtr() { return PtrUse->get(); } }; -// For an alloca valid between lifetime markers Start and Ends, call the -// Callback for all possible exits out of the lifetime in the containing -// function, which can return from the instructions in RetVec. -// -// Returns whether Ends covered all possible exits. If they did not, -// the caller should remove Ends to ensure that work done at the other -// exits does not happen outside of the lifetime. 
-template <typename F> -bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT, - const Instruction *Start, - const SmallVectorImpl &Ends, - const SmallVectorImpl &RetVec, - F Callback) { - if (Ends.size() == 1 && PDT.dominates(Ends[0], Start)) { - Callback(Ends[0]); - return true; - } - SmallVector ReachableRetVec; - unsigned NumCoveredExits = 0; - for (auto *RI : RetVec) { - if (!isPotentiallyReachable(Start, RI, nullptr, &DT)) - continue; - ReachableRetVec.push_back(RI); - // TODO(fmayer): We don't support diamond shapes, where multiple lifetime - // ends together dominate the RI, but none of them does by itself. - // Check how often this happens and decide whether to support this here. - if (std::any_of(Ends.begin(), Ends.end(), - [&](Instruction *End) { return DT.dominates(End, RI); })) - ++NumCoveredExits; - } - // If there's a mix of covered and non-covered exits, just put the untag - // on exits, so we avoid the redundancy of untagging twice. - if (NumCoveredExits == ReachableRetVec.size()) { - for (auto *End : Ends) - Callback(End); - } else { - for (auto *RI : ReachableRetVec) - Callback(RI); - // We may have inserted untag outside of the lifetime interval. - // Signal the caller to remove the lifetime end call for this alloca. - return false; - } - return true; -} - // Get AddressSanitizer parameters. void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h index f019d1c00a35..187aaedb6000 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h @@ -17,14 +17,13 @@ enum class AsanDtorKind { None, ///< Do not emit any destructors for ASan Global, ///< Append to llvm.global_dtors Invalid, ///< Not a valid destructor Kind. - // TODO(dliew): Add more more kinds. }; /// Mode of ASan detect stack use after return enum class AsanDetectStackUseAfterReturnMode { Never, ///< Never detect stack use after return. - Runtime, ///< Detect stack use after return if runtime flag is enabled - ///< (ASAN_OPTIONS=detect_stack_use_after_return=1) + Runtime, ///< Detect stack use after return if not disabled at runtime with + ///< (ASAN_OPTIONS=detect_stack_use_after_return=0). Always, ///< Always detect stack use after return. Invalid, ///< Not a valid detect mode. }; diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h index 76d586252743..5e68141e3399 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h +++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h @@ -10,9 +10,10 @@ #define LLVM_TRANSFORMS_INSTRUMENTATION_BOUNDSCHECKING_H #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class Function; +class FunctionPass; /// A pass to instrument code and perform run-time bounds checking on loads, /// stores, and other memory intrinsics.
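One note on the Instrumentation.h hunk further up: the new InstrumentationIRBuilder guarantees that inserted instrumentation calls carry a DebugLocation, falling back to a line-0 DILocation scoped to the enclosing DISubprogram when the instrumented instruction has none. A minimal sketch of the intended usage follows; insertTraceCall and TraceFn are hypothetical, and only InstrumentationIRBuilder itself comes from the patch.

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

// Hypothetical helper: insert a call to a runtime hook before Inst. The
// builder reuses Inst's debug location, or synthesizes a line-0 location
// from the enclosing function's DISubprogram, so the inserted call keeps
// debug info whenever the surrounding function has it.
static void insertTraceCall(Instruction *Inst, FunctionCallee TraceFn) {
  InstrumentationIRBuilder IRB(Inst); // sets insert point and debug location
  IRB.CreateCall(TraceFn);
}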
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h index c56e4c78cad5..9f9ce42277a0 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h +++ b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h @@ -12,10 +12,10 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_CGPROFILE_H #define LLVM_TRANSFORMS_INSTRUMENTATION_CGPROFILE_H -#include "llvm/ADT/MapVector.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Module; class CGProfilePass : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/include/llvm/Transforms/Instrumentation/ControlHeightReduction.h b/llvm/include/llvm/Transforms/Instrumentation/ControlHeightReduction.h index 18b428582046..0bace514c361 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/ControlHeightReduction.h +++ b/llvm/include/llvm/Transforms/Instrumentation/ControlHeightReduction.h @@ -14,7 +14,6 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_CONTROLHEIGHTREDUCTION_H #define LLVM_TRANSFORMS_INSTRUMENTATION_CONTROLHEIGHTREDUCTION_H -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h index 9b57b1f9a9ea..41ba05cd67f0 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h @@ -8,12 +8,12 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DATAFLOWSANITIZER_H #define LLVM_TRANSFORMS_INSTRUMENTATION_DATAFLOWSANITIZER_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include #include namespace llvm { +class Module; class DataFlowSanitizerPass : public PassInfoMixin { private: diff --git a/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h index 70949026a892..d3b5b5ca5c25 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h @@ -13,11 +13,14 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HWADDRESSSANITIZER_H #define LLVM_TRANSFORMS_INSTRUMENTATION_HWADDRESSSANITIZER_H -#include "llvm/IR/Function.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class FunctionPass; +class Module; +class StringRef; +class raw_ostream; struct HWAddressSanitizerOptions { HWAddressSanitizerOptions() @@ -47,11 +50,6 @@ private: HWAddressSanitizerOptions Options; }; -FunctionPass * -createHWAddressSanitizerLegacyPassPass(bool CompileKernel = false, - bool Recover = false, - bool DisableOptimization = false); - namespace HWASanAccessInfo { // Bit field positions for the accessinfo parameter to diff --git a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h index 5873db22a5d1..90fc0670448b 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h +++ b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h @@ -19,7 +19,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Transforms/Instrumentation.h" -#include #include #include #include @@ -57,6 +56,9 @@ private: } }; DenseMap ProfileDataMap; + /// If runtime relocation is enabled, this maps functions to the load + /// instruction that 
produces the profile relocation bias. + DenseMap<const Function *, LoadInst *> FunctionToProfileBiasMap; std::vector CompilerUsedVars; std::vector UsedVars; std::vector ReferencedNames; diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h index b9ad56ba7509..b584b9984492 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h +++ b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h @@ -12,12 +12,13 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_MEMPROFILER_H #define LLVM_TRANSFORMS_INSTRUMENTATION_MEMPROFILER_H -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class Function; +class FunctionPass; +class Module; +class ModulePass; /// Public interface to the memory profiler pass for instrumenting code to /// profile memory accesses. diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h index e5779dc775ba..e4654a0fc7ef 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h @@ -13,10 +13,15 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_MEMORYSANITIZER_H #define LLVM_TRANSFORMS_INSTRUMENTATION_MEMORYSANITIZER_H +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class Function; +class FunctionPass; +class Module; +class StringRef; +class raw_ostream; struct MemorySanitizerOptions { MemorySanitizerOptions() : MemorySanitizerOptions(0, false, false, false){}; @@ -30,10 +35,6 @@ struct MemorySanitizerOptions { bool EagerChecks; }; -// Insert MemorySanitizer instrumentation (detection of uninitialized reads) -FunctionPass * -createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options = {}); - /// A function pass for msan instrumentation. /// /// Instruments functions to detect uninitialized reads. This function pass diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h index e3d268cb0781..9bacb7eb38a5 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h +++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h @@ -16,13 +16,14 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H #define LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Instrumentation.h" namespace llvm { +class Module; +class ModulePass; /// This is the ModuleSanitizerCoverage pass used in the new pass manager.
The /// pass instruments functions for coverage, adds initialization calls to the diff --git a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h index e795043630d5..b3a067ba59c2 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h @@ -14,11 +14,11 @@ #define LLVM_TRANSFORMS_INSTRUMENTATION_THREADSANITIZER_H #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { -// Insert ThreadSanitizer (race detection) instrumentation -FunctionPass *createThreadSanitizerLegacyPassPass(); +class Function; +class FunctionPass; +class Module; /// A function pass for tsan instrumentation. /// diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index d6228700aa9a..edd492b0343d 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -133,7 +133,8 @@ Pass *createIndVarSimplifyPass(); // Pass *createLICMPass(); Pass *createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool AllowSpeculation); //===----------------------------------------------------------------------===// // @@ -168,13 +169,6 @@ FunctionPass *createLoopFlattenPass(); // Pass *createLoopStrengthReducePass(); -//===----------------------------------------------------------------------===// -// -// LoopUnswitch - This pass is a simple loop unswitching pass. -// -Pass *createLoopUnswitchPass(bool OptimizeForSize = false, - bool hasBranchDivergence = false); - //===----------------------------------------------------------------------===// // // LoopInstSimplify - This pass simplifies instructions in a loop's body. @@ -246,12 +240,10 @@ FunctionPass *createReassociatePass(); //===----------------------------------------------------------------------===// // // JumpThreading - Thread control through multi-pred/multi-succ blocks where some -// preds always go to some succ. If FreezeSelectCond is true, unfold the -// condition of a select that unfolds to branch. Thresholds other than minus one +// preds always go to some succ. Thresholds other than minus one // override the internal BB duplication default threshold. // -FunctionPass *createJumpThreadingPass(bool FreezeSelectCond = false, - int Threshold = -1); +FunctionPass *createJumpThreadingPass(int Threshold = -1); //===----------------------------------------------------------------------===// // @@ -426,6 +418,12 @@ extern char &InferAddressSpacesID; // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); +//===----------------------------------------------------------------------===// +// +// TLSVariableHoist - This pass reduces duplicated TLS address calls. +// +FunctionPass *createTLSVariableHoistPass(); + //===----------------------------------------------------------------------===// // // LowerConstantIntrinsics - Expand any remaining llvm.objectsize and diff --git a/llvm/include/llvm/Transforms/Scalar/BDCE.h b/llvm/include/llvm/Transforms/Scalar/BDCE.h index 996622bccdba..0763f31dfad4 100644 --- a/llvm/include/llvm/Transforms/Scalar/BDCE.h +++ b/llvm/include/llvm/Transforms/Scalar/BDCE.h @@ -16,11 +16,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_BDCE_H #define LLVM_TRANSFORMS_SCALAR_BDCE_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + // The Bit-Tracking Dead Code Elimination pass.
struct BDCEPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); diff --git a/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h index ee2b6f264086..661340f4598f 100644 --- a/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h +++ b/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h @@ -9,11 +9,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING_H #define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + struct CallSiteSplittingPass : PassInfoMixin { /// Run the pass over the function. PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); diff --git a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h index 11379e59467f..e59734b92244 100644 --- a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h +++ b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -40,7 +40,6 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include diff --git a/llvm/include/llvm/Transforms/Scalar/DCE.h b/llvm/include/llvm/Transforms/Scalar/DCE.h index 4d83296b1d86..8d1616a7b75d 100644 --- a/llvm/include/llvm/Transforms/Scalar/DCE.h +++ b/llvm/include/llvm/Transforms/Scalar/DCE.h @@ -13,11 +13,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_DCE_H #define LLVM_TRANSFORMS_SCALAR_DCE_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + /// Basic Dead Code Elimination pass. class DCEPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h b/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h index afebd9bbc122..4e9fbf65e163 100644 --- a/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h @@ -13,11 +13,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H #define LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + struct DFAJumpThreadingPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; diff --git a/llvm/include/llvm/Transforms/Scalar/Float2Int.h b/llvm/include/llvm/Transforms/Scalar/Float2Int.h index 5fb47af6f795..f4bec228ea96 100644 --- a/llvm/include/llvm/Transforms/Scalar/Float2Int.h +++ b/llvm/include/llvm/Transforms/Scalar/Float2Int.h @@ -18,11 +18,17 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/ConstantRange.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class DominatorTree; +class Function; +class Instruction; +class LLVMContext; +template class Optional; +class Type; +class Value; + class Float2IntPass : public PassInfoMixin { public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); @@ -36,6 +42,7 @@ private: ConstantRange badRange(); ConstantRange unknownRange(); ConstantRange validateRange(ConstantRange R); + Optional calcRange(Instruction *I); void walkBackwards(); void walkForwards(); bool validateAndTransform(); diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index 9e660c92124e..16ab1a490162 100644 --- 
a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -17,10 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" @@ -42,6 +40,8 @@ class CallInst; class ExtractValueInst; class Function; class FunctionPass; +class GetElementPtrInst; +class ImplicitControlFlowTracking; class LoadInst; class LoopInfo; class MemDepResult; @@ -178,6 +178,7 @@ public: Expression createCmpExpr(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS); Expression createExtractvalueExpr(ExtractValueInst *EI); + Expression createGEPExpr(GetElementPtrInst *GEP); uint32_t lookupOrAddCall(CallInst *C); uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, uint32_t Num, GVNPass &Gvn); diff --git a/llvm/include/llvm/Transforms/Scalar/GuardWidening.h b/llvm/include/llvm/Transforms/Scalar/GuardWidening.h index d08d042ab055..fa03d5f678fd 100644 --- a/llvm/include/llvm/Transforms/Scalar/GuardWidening.h +++ b/llvm/include/llvm/Transforms/Scalar/GuardWidening.h @@ -15,12 +15,13 @@ #ifndef LLVM_TRANSFORMS_SCALAR_GUARDWIDENING_H #define LLVM_TRANSFORMS_SCALAR_GUARDWIDENING_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; class Function; struct GuardWideningPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h b/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h index a1f20d9ca983..4136c45e1905 100644 --- a/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h +++ b/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h @@ -9,11 +9,13 @@ #ifndef LLVM_TRANSFORMS_SCALAR_IVUSERSPRINTER_H #define LLVM_TRANSFORMS_SCALAR_IVUSERSPRINTER_H -#include "llvm/Analysis/IVUsers.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/IR/PassManager.h" namespace llvm { +class LPMUpdater; +class Loop; +class raw_ostream; /// Printer pass for the \c IVUsers for a loop. class IVUsersPrinterPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index 0ac7d7c62b7a..09d08bf423a6 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -16,14 +16,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/IR/ValueHandle.h" -#include #include namespace llvm { @@ -95,10 +92,9 @@ class JumpThreadingPass : public PassInfoMixin { unsigned BBDupThreshold; unsigned DefaultBBDupThreshold; - bool InsertFreezeWhenUnfoldingSelect; public: - JumpThreadingPass(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1); + JumpThreadingPass(int T = -1); // Glue for old PM. 
bool runImpl(Function &F, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, diff --git a/llvm/include/llvm/Transforms/Scalar/LICM.h b/llvm/include/llvm/Transforms/Scalar/LICM.h index 751f75c0ccb2..f7dd40be47e5 100644 --- a/llvm/include/llvm/Transforms/Scalar/LICM.h +++ b/llvm/include/llvm/Transforms/Scalar/LICM.h @@ -32,46 +32,70 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LICM_H #define LLVM_TRANSFORMS_SCALAR_LICM_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; +class LoopNest; + extern cl::opt SetLicmMssaOptCap; extern cl::opt SetLicmMssaNoAccForPromotionCap; +struct LICMOptions { + unsigned MssaOptCap; + unsigned MssaNoAccForPromotionCap; + bool AllowSpeculation; + + LICMOptions() + : MssaOptCap(SetLicmMssaOptCap), + MssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap), + AllowSpeculation(true) {} + + LICMOptions(unsigned MssaOptCap, unsigned MssaNoAccForPromotionCap, + bool AllowSpeculation) + : MssaOptCap(MssaOptCap), + MssaNoAccForPromotionCap(MssaNoAccForPromotionCap), + AllowSpeculation(AllowSpeculation) {} +}; + /// Performs Loop Invariant Code Motion Pass. class LICMPass : public PassInfoMixin { - unsigned LicmMssaOptCap; - unsigned LicmMssaNoAccForPromotionCap; + LICMOptions Opts; public: - LICMPass() - : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) - : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LICMPass(unsigned MssaOptCap, unsigned MssaNoAccForPromotionCap, + bool AllowSpeculation) + : LICMPass(LICMOptions(MssaOptCap, MssaNoAccForPromotionCap, + AllowSpeculation)) {} + LICMPass(LICMOptions Opts) : Opts(Opts) {} + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + + void printPipeline(raw_ostream &OS, + function_ref MapClassName2PassName); }; /// Performs LoopNest Invariant Code Motion Pass. 
class LNICMPass : public PassInfoMixin { - unsigned LicmMssaOptCap; - unsigned LicmMssaNoAccForPromotionCap; + LICMOptions Opts; public: - LNICMPass() - : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) - : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LNICMPass(unsigned MssaOptCap, unsigned MssaNoAccForPromotionCap, + bool AllowSpeculation) + : LNICMPass(LICMOptions(MssaOptCap, MssaNoAccForPromotionCap, + AllowSpeculation)) {} + LNICMPass(LICMOptions Opts) : Opts(Opts) {} + PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + + void printPipeline(raw_ostream &OS, + function_ref MapClassName2PassName); }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h b/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h index 3f250fc1ce8c..50a837acf4e3 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h @@ -8,12 +8,14 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPACCESSANALYSISPRINTER_H #define LLVM_TRANSFORMS_SCALAR_LOOPACCESSANALYSISPRINTER_H - -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/IR/PassManager.h" namespace llvm { +class LPMUpdater; +class Loop; +class raw_ostream; /// Printer pass for the \c LoopAccessInfo results. class LoopAccessInfoPrinterPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/LoopBoundSplit.h b/llvm/include/llvm/Transforms/Scalar/LoopBoundSplit.h index 306b6fa046df..0c597bf295b2 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopBoundSplit.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopBoundSplit.h @@ -10,11 +10,11 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPBOUNDSPLIT_H #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; /// This pass transforms loops that contain a conditional branch with induction /// variable. For example, it transforms left code to right code: diff --git a/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h b/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h index 9ebd5984cea9..d5e15ffff075 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h @@ -13,11 +13,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPDATAPREFETCH_H #define LLVM_TRANSFORMS_SCALAR_LOOPDATAPREFETCH_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + /// An optimization pass inserting data prefetches in loops. 
class LoopDataPrefetchPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LoopDeletion.h b/llvm/include/llvm/Transforms/Scalar/LoopDeletion.h index 557616e2e6ba..459a5cd3ece4 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopDeletion.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopDeletion.h @@ -14,13 +14,13 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPDELETION_H #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class Loop; +class LPMUpdater; + class LoopDeletionPass : public PassInfoMixin { public: LoopDeletionPass() = default; diff --git a/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h b/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h index 3d259bdbe986..311b843e83b5 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h @@ -14,11 +14,11 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class LoopNest; class LoopFlattenPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LoopInterchange.h b/llvm/include/llvm/Transforms/Scalar/LoopInterchange.h index c67a30293d2f..8fa14d747f5c 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopInterchange.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopInterchange.h @@ -9,11 +9,14 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPINTERCHANGE_H #define LLVM_TRANSFORMS_SCALAR_LOOPINTERCHANGE_H +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class LoopNest; + struct LoopInterchangePass : public PassInfoMixin { PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h index e83cc2b9bef0..1df510474ca7 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -40,8 +40,6 @@ #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/PassInstrumentation.h" #include "llvm/IR/PassManager.h" #include "llvm/Transforms/Utils/LCSSA.h" #include "llvm/Transforms/Utils/LoopSimplify.h" @@ -52,6 +50,7 @@ namespace llvm { // Forward declarations of an update tracking API used in the pass manager. 
class LPMUpdater; +class PassInstrumentation; namespace { diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPredication.h b/llvm/include/llvm/Transforms/Scalar/LoopPredication.h index 252daafab7a3..83f533603419 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopPredication.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopPredication.h @@ -14,12 +14,13 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPPREDICATION_H #define LLVM_TRANSFORMS_SCALAR_LOOPPREDICATION_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; /// Performs Loop Predication Pass. class LoopPredicationPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LoopRotation.h b/llvm/include/llvm/Transforms/Scalar/LoopRotation.h index f68ac70da324..c0e6f105a412 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopRotation.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopRotation.h @@ -13,11 +13,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPROTATION_H #define LLVM_TRANSFORMS_SCALAR_LOOPROTATION_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; /// A simple loop rotation transformation. class LoopRotatePass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h b/llvm/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h index 2d718592aef5..82c8a4406d00 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h @@ -16,12 +16,14 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPSIMPLIFYCFG_H #define LLVM_TRANSFORMS_SCALAR_LOOPSIMPLIFYCFG_H -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; + /// Performs basic CFG simplifications to assist other loop passes. class LoopSimplifyCFGPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LoopSink.h b/llvm/include/llvm/Transforms/Scalar/LoopSink.h index 234c48cbebc5..26e50590a625 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopSink.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopSink.h @@ -13,12 +13,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPSINK_H #define LLVM_TRANSFORMS_SCALAR_LOOPSINK_H -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class Function; + /// A pass that does profile-guided sinking of instructions into loops. /// /// This is a function pass as it shouldn't be composed into any kind of diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h index 72663d3d62a8..54f70d7ed4b3 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h @@ -9,10 +9,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class LoopNest; /// A simple loop rotation transformation. 
class LoopUnrollAndJamPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/LoopVersioningLICM.h b/llvm/include/llvm/Transforms/Scalar/LoopVersioningLICM.h index 87d6d6759db2..04e0012330da 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopVersioningLICM.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopVersioningLICM.h @@ -9,10 +9,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOOPVERSIONINGLICM_H #define LLVM_TRANSFORMS_SCALAR_LOOPVERSIONINGLICM_H +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; class LoopVersioningLICMPass : public PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LowerAtomic.h b/llvm/include/llvm/Transforms/Scalar/LowerAtomic.h deleted file mode 100644 index 87d945d06901..000000000000 --- a/llvm/include/llvm/Transforms/Scalar/LowerAtomic.h +++ /dev/null @@ -1,35 +0,0 @@ -//===- LowerAtomic.cpp - Lower atomic intrinsics ----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -// This pass lowers atomic intrinsics to non-atomic form for use in a known -// non-preemptible environment. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H -#define LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H - -#include "llvm/IR/PassManager.h" - -namespace llvm { - -/// A pass that lowers atomic intrinsic into non-atomic intrinsics. -class LowerAtomicPass : public PassInfoMixin { -public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &); - static bool isRequired() { return true; } -}; - -class AtomicRMWInst; -/// Convert the given RMWI into primitive load and stores, -/// assuming that doing so is legal. Return true if the lowering -/// succeeds. -bool lowerAtomicRMWInst(AtomicRMWInst *RMWI); -} - -#endif // LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H diff --git a/llvm/include/llvm/Transforms/Scalar/LowerAtomicPass.h b/llvm/include/llvm/Transforms/Scalar/LowerAtomicPass.h new file mode 100644 index 000000000000..60bbf916fced --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LowerAtomicPass.h @@ -0,0 +1,30 @@ +//===- LowerAtomicPass.h - Lower atomic intrinsics --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +// This pass lowers atomic intrinsics to non-atomic form for use in a known +// non-preemptible environment. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOWERATOMICPASS_H +#define LLVM_TRANSFORMS_SCALAR_LOWERATOMICPASS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// A pass that lowers atomic intrinsic into non-atomic intrinsics. 
+class LowerAtomicPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &); + static bool isRequired() { return true; } +}; + +} + +#endif // LLVM_TRANSFORMS_SCALAR_LOWERATOMICPASS_H diff --git a/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h b/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h index 61c7bf0454e1..e8e404bb93d6 100644 --- a/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h +++ b/llvm/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h @@ -15,11 +15,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOWERCONSTANTINTRINSICS_H #define LLVM_TRANSFORMS_SCALAR_LOWERCONSTANTINTRINSICS_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + struct LowerConstantIntrinsicsPass : PassInfoMixin { public: diff --git a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h index 4e47ff70d557..95ef0f73e8af 100644 --- a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h +++ b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h @@ -15,11 +15,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_LOWEREXPECTINTRINSIC_H #define LLVM_TRANSFORMS_SCALAR_LOWEREXPECTINTRINSIC_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + struct LowerExpectIntrinsicPass : PassInfoMixin { /// Run the pass over the function. /// diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h index 3a4db13d670a..8103b0a92489 100644 --- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h +++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h @@ -16,8 +16,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/PassManager.h" -#include -#include namespace llvm { @@ -63,7 +61,7 @@ private: bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore, Value *cpyDst, Value *cpySrc, TypeSize cpyLen, - Align cpyAlign, CallInst *C); + Align cpyAlign, std::function GetC); bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep); bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet); bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet); diff --git a/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h b/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h index 256d03675a07..71e11e59a471 100644 --- a/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h +++ b/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h @@ -23,10 +23,11 @@ #ifndef LLVM_TRANSFORMS_SCALAR_MERGEDLOADSTOREMOTION_H #define LLVM_TRANSFORMS_SCALAR_MERGEDLOADSTOREMOTION_H -#include "llvm/IR/Module.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; struct MergedLoadStoreMotionOptions { bool SplitFooterBB; MergedLoadStoreMotionOptions(bool SplitFooterBB = false) diff --git a/llvm/include/llvm/Transforms/Scalar/PartiallyInlineLibCalls.h b/llvm/include/llvm/Transforms/Scalar/PartiallyInlineLibCalls.h index fd5a06c5051d..b8a8fcc71e57 100644 --- a/llvm/include/llvm/Transforms/Scalar/PartiallyInlineLibCalls.h +++ b/llvm/include/llvm/Transforms/Scalar/PartiallyInlineLibCalls.h @@ -15,10 +15,10 @@ #ifndef LLVM_TRANSFORMS_SCALAR_PARTIALLYINLINELIBCALLS_H #define LLVM_TRANSFORMS_SCALAR_PARTIALLYINLINELIBCALLS_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { 
+class Function; class PartiallyInlineLibCallsPass : public PassInfoMixin<PartiallyInlineLibCallsPass> { public: diff --git a/llvm/include/llvm/Transforms/Scalar/SCCP.h b/llvm/include/llvm/Transforms/Scalar/SCCP.h index cd4100447880..032a9b15fc46 100644 --- a/llvm/include/llvm/Transforms/Scalar/SCCP.h +++ b/llvm/include/llvm/Transforms/Scalar/SCCP.h @@ -20,17 +20,19 @@ #ifndef LLVM_TRANSFORMS_SCALAR_SCCP_H #define LLVM_TRANSFORMS_SCALAR_SCCP_H -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Utils/PredicateInfo.h" -#include "llvm/Transforms/Utils/SCCPSolver.h" + +#include namespace llvm { +class AssumptionCache; +class DataLayout; +class Function; +class Module; +class TargetLibraryInfo; +class TargetTransformInfo; +struct AnalysisResultsForFn; /// This pass performs function-level constant propagation and merging. class SCCPPass : public PassInfoMixin<SCCPPass> { diff --git a/llvm/include/llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h b/llvm/include/llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h index e4002159edbd..5e876fc82ac1 100644 --- a/llvm/include/llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h +++ b/llvm/include/llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h @@ -1,5 +1,5 @@ //===- ScalarizeMaskedMemIntrin.h - Scalarize unsupported masked mem ----===// -// instrinsics +// intrinsics // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h index f4472e699295..5cc67f78e5a2 100644 --- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h +++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h @@ -17,14 +17,33 @@ #ifndef LLVM_TRANSFORMS_SCALAR_SCALARIZER_H #define LLVM_TRANSFORMS_SCALAR_SCALARIZER_H +#include "llvm/ADT/Optional.h" #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class Function; +class FunctionPass; + +struct ScalarizerPassOptions { + // These optional booleans correspond 1:1 to cl::opt<bool> options defined in + // Scalarizer.cpp. When the cl::opt are specified, they take precedence. + // When the cl::opt are not specified, the present optional booleans allow + // overriding the cl::opt's default values.
+ llvm::Optional<bool> ScalarizeVariableInsertExtract; + llvm::Optional<bool> ScalarizeLoadStore; +}; + class ScalarizerPass : public PassInfoMixin<ScalarizerPass> { + ScalarizerPassOptions Options; + public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + void setScalarizeVariableInsertExtract(bool Value) { + Options.ScalarizeVariableInsertExtract = Value; + } + void setScalarizeLoadStore(bool Value) { Options.ScalarizeLoadStore = Value; } }; /// Create a legacy pass manager instance of the Scalarizer pass diff --git a/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h b/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h index dfb1619c7f2a..68c121560b13 100644 --- a/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h +++ b/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h @@ -9,13 +9,18 @@ #ifndef LLVM_TRANSFORMS_SCALAR_SIMPLELOOPUNSWITCH_H #define LLVM_TRANSFORMS_SCALAR_SIMPLELOOPUNSWITCH_H +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { +class LPMUpdater; +class Loop; +class Pass; +class StringRef; +class raw_ostream; + /// This pass transforms loops that contain branches or switches on loop- /// invariant conditions to have multiple loops. For example, it turns the left /// into the right code: diff --git a/llvm/include/llvm/Transforms/Scalar/Sink.h b/llvm/include/llvm/Transforms/Scalar/Sink.h index 6cbe964d1580..759153f22853 100644 --- a/llvm/include/llvm/Transforms/Scalar/Sink.h +++ b/llvm/include/llvm/Transforms/Scalar/Sink.h @@ -14,11 +14,12 @@ #ifndef LLVM_TRANSFORMS_SCALAR_SINK_H #define LLVM_TRANSFORMS_SCALAR_SINK_H -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Function; + /// Move instructions into successor blocks when possible. class SinkingPass : public PassInfoMixin<SinkingPass> { public: diff --git a/llvm/include/llvm/Transforms/Scalar/SpeculativeExecution.h b/llvm/include/llvm/Transforms/Scalar/SpeculativeExecution.h index 41de544e7c9c..0ec2a395f875 100644 --- a/llvm/include/llvm/Transforms/Scalar/SpeculativeExecution.h +++ b/llvm/include/llvm/Transforms/Scalar/SpeculativeExecution.h @@ -62,10 +62,10 @@ #ifndef LLVM_TRANSFORMS_SCALAR_SPECULATIVEEXECUTION_H #define LLVM_TRANSFORMS_SCALAR_SPECULATIVEEXECUTION_H -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/PassManager.h" namespace llvm { +class TargetTransformInfo; class SpeculativeExecutionPass : public PassInfoMixin<SpeculativeExecutionPass> { public: diff --git a/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h new file mode 100644 index 000000000000..2a1b02b40eeb --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h @@ -0,0 +1,131 @@ +//==- TLSVariableHoist.h ------ Remove Redundant TLS Loads -------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates redundant TLS loads if the related option is set.
+// For example:
+//   static __thread int x;
+//   int g();
+//   int f(int c) {
+//     int *px = &x;
+//     while (c--)
+//       *px += g();
+//     return *px;
+//   }
+//
+// will generate redundant TLS loads when compiled with
+// clang++ -fPIC -ftls-model=global-dynamic -O2 -S
+//
+// .LBB0_2:                               # %while.body
+//                                        # =>This Inner Loop Header: Depth=1
+//   callq   _Z1gv@PLT
+//   movl    %eax, %ebp
+//   leaq    _ZL1x@TLSLD(%rip), %rdi
+//   callq   __tls_get_addr@PLT
+//   addl    _ZL1x@DTPOFF(%rax), %ebp
+//   movl    %ebp, _ZL1x@DTPOFF(%rax)
+//   addl    $-1, %ebx
+//   jne     .LBB0_2
+//   jmp     .LBB0_3
+// .LBB0_4:                               # %entry.while.end_crit_edge
+//   leaq    _ZL1x@TLSLD(%rip), %rdi
+//   callq   __tls_get_addr@PLT
+//   movl    _ZL1x@DTPOFF(%rax), %ebp
+//
+// The redundant TLS loads hurt performance, especially in loops.
+// So we try to eliminate/move them if required by customers, e.g.:
+//
+// # %bb.0:                               # %entry
+//   ...
+//   movl    %edi, %ebx
+//   leaq    _ZL1x@TLSLD(%rip), %rdi
+//   callq   __tls_get_addr@PLT
+//   leaq    _ZL1x@DTPOFF(%rax), %r14
+//   testl   %ebx, %ebx
+//   je      .LBB0_1
+// .LBB0_2:                               # %while.body
+//                                        # =>This Inner Loop Header: Depth=1
+//   callq   _Z1gv@PLT
+//   addl    (%r14), %eax
+//   movl    %eax, (%r14)
+//   addl    $-1, %ebx
+//   jne     .LBB0_2
+//   jmp     .LBB0_3
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
+#define LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Function;
+class GlobalVariable;
+class Instruction;
+
+/// A private "module" namespace for types and utilities used by
+/// TLSVariableHoist. These are implementation details and should
+/// not be used by clients.
+namespace tlshoist {
+
+/// Keeps track of the user of a TLS variable and the operand index
+/// where the variable is used.
+struct TLSUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  TLSUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) {}
+};
+
+/// Keeps track of a TLS variable candidate and its users.
+struct TLSCandidate {
+  SmallVector<TLSUser, 8> Users;
+
+  /// Add the user to the use list and update the cost.
+  void addUser(Instruction *Inst, unsigned Idx) {
+    Users.push_back(TLSUser(Inst, Idx));
+  }
+};
+
+} // end namespace tlshoist
+
+class TLSVariableHoistPass : public PassInfoMixin<TLSVariableHoistPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  // Glue for old PM.
+  bool runImpl(Function &F, DominatorTree &DT, LoopInfo &LI);
+
+private:
+  DominatorTree *DT;
+  LoopInfo *LI;
+
+  /// Keeps track of TLS variable candidates found in the function.
+  using TLSCandMapType = MapVector<GlobalVariable *, tlshoist::TLSCandidate>;
+  TLSCandMapType TLSCandMap;
+
+  void collectTLSCandidates(Function &Fn);
+  void collectTLSCandidate(Instruction *Inst);
+  Instruction *getNearestLoopDomInst(BasicBlock *BB, Loop *L);
+  Instruction *getDomInst(Instruction *I1, Instruction *I2);
+  BasicBlock::iterator findInsertPos(Function &Fn, GlobalVariable *GV,
+                                     BasicBlock *&PosBB);
+  Instruction *genBitCastInst(Function &Fn, GlobalVariable *GV);
+  bool tryReplaceTLSCandidates(Function &Fn);
+  bool tryReplaceTLSCandidate(Function &Fn, GlobalVariable *GV);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
diff --git a/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h b/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
index 906867644504..57b1ed9bf4fe 100644
--- a/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
+++ b/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
@@ -52,11 +52,12 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_TAILRECURSIONELIMINATION_H
 #define LLVM_TRANSFORMS_SCALAR_TAILRECURSIONELIMINATION_H
 
-#include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
 
+class Function;
+
 struct TailCallElimPass : PassInfoMixin<TailCallElimPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
diff --git a/llvm/include/llvm/Transforms/Scalar/WarnMissedTransforms.h b/llvm/include/llvm/Transforms/Scalar/WarnMissedTransforms.h
index 64691d68b1c4..80d098a1ea52 100644
--- a/llvm/include/llvm/Transforms/Scalar/WarnMissedTransforms.h
+++ b/llvm/include/llvm/Transforms/Scalar/WarnMissedTransforms.h
@@ -14,10 +14,11 @@
 #define LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H
 
 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
 
 namespace llvm {
 class Function;
+class Pass;
+class PassRegistry;
 
 // New pass manager boilerplate.
 class WarnMissedTransformationsPass
diff --git a/llvm/include/llvm/Transforms/Utils.h b/llvm/include/llvm/Transforms/Utils.h
index 1e9c0a040ad2..ebd4bd318573 100644
--- a/llvm/include/llvm/Transforms/Utils.h
+++ b/llvm/include/llvm/Transforms/Utils.h
@@ -155,6 +155,12 @@ FunctionPass *createAssumeSimplifyPass();
 // don't block SCEV.
 //
 Pass *createCanonicalizeFreezeInLoopsPass();
+
+//===----------------------------------------------------------------------===//
+// LowerGlobalDtorsLegacy - Lower @llvm.global_dtors by creating wrapper
+// functions that are registered in @llvm.global_ctors and which contain a call
+// to `__cxa_atexit` to register their destructor functions.
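// A sketch of the lowering's effect in C-like pseudocode (names are
// illustrative only; the actual IR is produced by the pass declared below):
//
//   /* given a @llvm.global_dtors entry { i32 65535, @dtor, null } */
//   static void call_dtor(void *unused) { dtor(); }
//   static void register_call_dtor(void) { /* appended to @llvm.global_ctors */
//     __cxa_atexit(call_dtor, /*arg=*/0, /*dso=*/&__dso_handle);
//   }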
+ModulePass *createLowerGlobalDtorsLegacyPass();
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h b/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h
index d679bca69510..991ecb8efbd0 100644
--- a/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h
+++ b/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h
@@ -17,12 +17,13 @@
 #define LLVM_TRANSFORMS_UTILS_ASSUMEBUNDLEBUILDER_H
 
 #include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
 
 namespace llvm {
+class AssumeInst;
+class Function;
+class FunctionPass;
+class Instruction;
 class AssumptionCache;
 class DominatorTree;
 
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index d99b2a56559d..fcdd2aa0e060 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -18,21 +18,20 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Dominators.h"
 #include <cassert>
 
 namespace llvm {
-
+class BranchInst;
+class LandingPadInst;
+class Loop;
+class PHINode;
+template <typename PtrType> class SmallPtrSetImpl;
 class BlockFrequencyInfo;
 class BranchProbabilityInfo;
-class DominatorTree;
 class DomTreeUpdater;
 class Function;
-class Instruction;
 class LoopInfo;
 class MDNode;
 class MemoryDependenceResults;
@@ -500,7 +499,9 @@ BranchInst *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
 // create the following structure:
 // A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
 // If BPI and BFI are non-null, BPI/BFI will be updated accordingly.
-bool SplitIndirectBrCriticalEdges(Function &F,
+// When `IgnoreBlocksWithoutPHI` is set to `true` critical edges leading to a
+// block without phi-instructions will not be split.
+bool SplitIndirectBrCriticalEdges(Function &F, bool IgnoreBlocksWithoutPHI,
                                   BranchProbabilityInfo *BPI = nullptr,
                                   BlockFrequencyInfo *BFI = nullptr);
 
diff --git a/llvm/include/llvm/Transforms/Utils/BreakCriticalEdges.h b/llvm/include/llvm/Transforms/Utils/BreakCriticalEdges.h
index 3644f1ed7a13..6de080ce3128 100644
--- a/llvm/include/llvm/Transforms/Utils/BreakCriticalEdges.h
+++ b/llvm/include/llvm/Transforms/Utils/BreakCriticalEdges.h
@@ -17,10 +17,11 @@
 #ifndef LLVM_TRANSFORMS_UTILS_BREAKCRITICALEDGES_H
 #define LLVM_TRANSFORMS_UTILS_BREAKCRITICALEDGES_H
 
-#include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
+
+class Function;
 struct BreakCriticalEdgesPass : public PassInfoMixin<BreakCriticalEdgesPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index 87d33b9b11b7..6ea195ce31ac 100644
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -22,23 +22,63 @@
 namespace llvm {
   class IRBuilderBase;
 
   /// Analyze the name and prototype of the given function and set any
-  /// applicable attributes.
+  /// applicable attributes. Note that this merely helps optimizations on an
+  /// already existing function but does not consider mandatory attributes.
+  ///
   /// If the library function is unavailable, this doesn't modify it.
  ///
  /// Returns true if any attributes were set and false otherwise.
-  bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI);
-  bool inferLibFuncAttributes(Module *M, StringRef Name,
-                              const TargetLibraryInfo &TLI);
+  bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name,
+                                     const TargetLibraryInfo &TLI);
+  bool inferNonMandatoryLibFuncAttrs(Function &F, const TargetLibraryInfo &TLI);
+
+  /// Calls getOrInsertFunction() and then makes sure to add mandatory
+  /// argument attributes.
+  FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+                                    LibFunc TheLibFunc, FunctionType *T,
+                                    AttributeList AttributeList);
+  FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+                                    LibFunc TheLibFunc, FunctionType *T);
+  template <typename... ArgsTy>
+  FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+                                    LibFunc TheLibFunc,
+                                    AttributeList AttributeList,
+                                    Type *RetTy, ArgsTy... Args) {
+    SmallVector<Type *> ArgTys{Args...};
+    return getOrInsertLibFunc(M, TLI, TheLibFunc,
+                              FunctionType::get(RetTy, ArgTys, false),
+                              AttributeList);
+  }
+  /// Same as above, but without the attributes.
+  template <typename... ArgsTy>
+  FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+                                    LibFunc TheLibFunc, Type *RetTy,
+                                    ArgsTy... Args) {
+    return getOrInsertLibFunc(M, TLI, TheLibFunc, AttributeList{}, RetTy,
+                              Args...);
+  }
+  // Avoid an incorrect ordering that'd otherwise compile incorrectly.
+  template <typename... ArgsTy>
+  FunctionCallee
+  getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+                     LibFunc TheLibFunc, AttributeList AttributeList,
+                     FunctionType *Invalid, ArgsTy... Args) = delete;
+
+  /// Check whether the library function is available on target and also that
+  /// it is declared in the current Module as a Function with the right type.
+  bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
+                          LibFunc TheLibFunc);
+  bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
+                          StringRef Name);
 
   /// Check whether the overloaded floating point function
   /// corresponding to \a Ty is available.
-  bool hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+  bool hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty,
                   LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn);
 
   /// Get the name of the overloaded floating point function
-  /// corresponding to \a Ty.
-  StringRef getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
-                           LibFunc DoubleFn, LibFunc FloatFn,
-                           LibFunc LongDoubleFn);
+  /// corresponding to \a Ty. Return the LibFunc in \a TheLibFunc.
+  StringRef getFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty,
+                       LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn,
+                       LibFunc &TheLibFunc);
 
   /// Return V if it is an i8*, otherwise cast it to i8*.
   Value *castToCStr(Value *V, IRBuilderBase &B);
@@ -99,6 +139,10 @@ namespace llvm {
   Value *emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
                     const DataLayout &DL, const TargetLibraryInfo *TLI);
 
+  /// Emit a call to the memrchr function, analogously to emitMemChr.
+  Value *emitMemRChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
+                     const DataLayout &DL, const TargetLibraryInfo *TLI);
+
   /// Emit a call to the memcmp function.
   Value *emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
                     const DataLayout &DL, const TargetLibraryInfo *TLI);
@@ -148,7 +192,8 @@ namespace llvm {
  /// function is known to take a single argument of type matching 'Op' and returns one
  /// value with the same type.
If 'Op' is a long double, 'l' is added as the /// suffix of name, if 'Op' is a float, we add a 'f' suffix. - Value *emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B, + Value *emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs); /// Emit a call to the unary function DoubleFn, FloatFn or LongDoubleFn, @@ -162,8 +207,10 @@ namespace llvm { /// function is known to take type matching 'Op1' and 'Op2' and return one /// value with the same type. If 'Op1/Op2' are long double, 'l' is added as /// the suffix of name, if 'Op1/Op2' are float, we add a 'f' suffix. - Value *emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilderBase &B, const AttributeList &Attrs); + Value *emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, + const AttributeList &Attrs); /// Emit a call to the binary function DoubleFn, FloatFn or LongDoubleFn, /// depending of the type of Op1. diff --git a/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h b/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h index e12d7e09aad6..7e6683fd0c8a 100644 --- a/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h @@ -16,12 +16,13 @@ #define LLVM_TRANSFORMS_UTILS_CALLGRAPHUPDATER_H #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LazyCallGraph.h" namespace llvm { +class CallGraph; +class CallGraphSCC; + /// Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph. This /// simplifies the interface and the call sites, e.g., new and old pass manager /// passes can share the same code. diff --git a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h index daa88981d3bf..fcb384ec3613 100644 --- a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h +++ b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h @@ -19,6 +19,7 @@ class CallBase; class CastInst; class Function; class MDNode; +class Value; /// Return true if the given indirect call site can be made to call \p Callee. /// @@ -73,6 +74,15 @@ CallBase &promoteCallWithIfThenElse(CallBase &CB, Function *Callee, /// bool tryPromoteCall(CallBase &CB); +/// Predicate and clone the given call site. +/// +/// This function creates an if-then-else structure at the location of the call +/// site. The "if" condition compares the call site's called value to the given +/// callee. The original call site is moved into the "else" block, and a clone +/// of the call site is placed in the "then" block. The cloned instruction is +/// returned. +CallBase &versionCallSite(CallBase &CB, Value *Callee, MDNode *BranchWeights); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_CALLPROMOTIONUTILS_H diff --git a/llvm/include/llvm/Transforms/Utils/CanonicalizeAliases.h b/llvm/include/llvm/Transforms/Utils/CanonicalizeAliases.h index fdb390db3aff..0bdc1a12d1fb 100644 --- a/llvm/include/llvm/Transforms/Utils/CanonicalizeAliases.h +++ b/llvm/include/llvm/Transforms/Utils/CanonicalizeAliases.h @@ -13,11 +13,12 @@ #ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEALIASES_H #define LLVM_TRANSFORMS_UTILS_CANONICALIZEALIASES_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { +class Module; + /// Simple pass that canonicalizes aliases. 
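/// For instance (a sketch of the transformation), given
///   @a1 = alias void (), void ()* @f
///   @a2 = alias void (), void ()* @a1
/// canonicalization rewrites @a2 to refer to @f directly, so that no alias
/// points at another alias.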
 class CanonicalizeAliasesPass
     : public PassInfoMixin<CanonicalizeAliasesPass> {
 public:
diff --git a/llvm/include/llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h b/llvm/include/llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h
index 9de032935f88..924b6cdf7ca0 100644
--- a/llvm/include/llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h
+++ b/llvm/include/llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h
@@ -14,10 +14,10 @@
 #define LLVM_TRANSFORMS_UTILS_CANONICALIZEFREEZEINLOOPS_H
 
 #include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
+class Loop;
 class LPMUpdater;
 
 /// A pass that canonicalizes freeze instructions in a loop.
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 8aed3d0e40d9..bb23cf4a9a3c 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -17,11 +17,11 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include <limits>
 
 namespace llvm {
 
+template <typename PtrType> class SmallPtrSetImpl;
 class AllocaInst;
 class BasicBlock;
 class BlockFrequency;
@@ -92,6 +92,11 @@ public:
   BranchProbabilityInfo *BPI;
   AssumptionCache *AC;
 
+  // A block outside of the extraction set where any intermediate
+  // allocations will be placed. If this is null, allocations
+  // will be placed in the entry block of the function.
+  BasicBlock *AllocationBlock;
+
   // If true, varargs functions can be extracted.
   bool AllowVarArgs;
 
@@ -120,11 +125,15 @@ public:
   /// code is extracted, including vastart. If AllowAlloca is true, then
   /// extraction of blocks containing alloca instructions would be possible,
   /// however code extractor won't validate whether extraction is legal.
+  /// Any new allocations will be placed in the AllocationBlock, unless
+  /// it is null, in which case they will be placed in the entry block of
+  /// the function from which the code is being extracted.
   CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
                 bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
                 BranchProbabilityInfo *BPI = nullptr,
-                AssumptionCache *AC = nullptr,
-                bool AllowVarArgs = false, bool AllowAlloca = false,
+                AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
+                bool AllowAlloca = false,
+                BasicBlock *AllocationBlock = nullptr,
                 std::string Suffix = "");
 
   /// Create a code extractor for a loop body.
diff --git a/llvm/include/llvm/Transforms/Utils/CtorUtils.h b/llvm/include/llvm/Transforms/Utils/CtorUtils.h
index 3ef3ba244b43..40b290a5a6f4 100644
--- a/llvm/include/llvm/Transforms/Utils/CtorUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/CtorUtils.h
@@ -13,7 +13,7 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CTORUTILS_H
 #define LLVM_TRANSFORMS_UTILS_CTORUTILS_H
 
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 
 namespace llvm {
 
@@ -22,9 +22,9 @@ class Module;
 
 /// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
 /// entries for which it returns true. Return true if anything changed.
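// Caller sketch (assuming the callback receives each entry's priority and
// constructor function, per the new signature below; the predicate is
// hypothetical):
//
//   bool Changed = optimizeGlobalCtorsList(M, [](uint32_t Prio, Function *F) {
//     return F && F->empty(); // e.g. drop ctors that are mere declarations
//   });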
-bool optimizeGlobalCtorsList(Module &M, - function_ref ShouldRemove); +bool optimizeGlobalCtorsList( + Module &M, function_ref ShouldRemove); -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/include/llvm/Transforms/Utils/Debugify.h b/llvm/include/llvm/Transforms/Utils/Debugify.h index 892e354cd9ed..405bbb8e0be8 100644 --- a/llvm/include/llvm/Transforms/Utils/Debugify.h +++ b/llvm/include/llvm/Transforms/Utils/Debugify.h @@ -23,7 +23,8 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -using DebugFnMap = llvm::MapVector; +using DebugFnMap = + llvm::MapVector; using DebugInstMap = llvm::MapVector; using DebugVarMap = llvm::MapVector; using WeakInstValueMap = @@ -42,9 +43,6 @@ struct DebugInfoPerPass { DebugVarMap DIVariables; }; -/// Map pass names to a per-pass DebugInfoPerPass instance. -using DebugInfoPerPassMap = llvm::MapVector; - namespace llvm { class DIBuilder; @@ -69,24 +67,24 @@ bool stripDebugifyMetadata(Module &M); /// /// \param M The module to collect debug information from. /// \param Functions A range of functions to collect debug information from. -/// \param DIPreservationMap A map to collect the DI metadata. +/// \param DebugInfoBeforePass DI metadata before a pass. /// \param Banner A prefix string to add to debug/error messages. /// \param NameOfWrappedPass A name of a pass to add to debug/error messages. bool collectDebugInfoMetadata(Module &M, iterator_range Functions, - DebugInfoPerPassMap &DIPreservationMap, + DebugInfoPerPass &DebugInfoBeforePass, StringRef Banner, StringRef NameOfWrappedPass); /// Check original debug information after a pass. /// /// \param M The module to collect debug information from. /// \param Functions A range of functions to collect debug information from. -/// \param DIPreservationMap A map used to check collected the DI metadata. +/// \param DebugInfoBeforePass DI metadata before a pass. /// \param Banner A prefix string to add to debug/error messages. /// \param NameOfWrappedPass A name of a pass to add to debug/error messages. 
bool checkDebugInfoMetadata(Module &M, iterator_range Functions, - DebugInfoPerPassMap &DIPreservationMap, + DebugInfoPerPass &DebugInfoBeforePass, StringRef Banner, StringRef NameOfWrappedPass, StringRef OrigDIVerifyBugsReportFilePath); } // namespace llvm @@ -97,11 +95,11 @@ enum class DebugifyMode { NoDebugify, SyntheticDebugInfo, OriginalDebugInfo }; llvm::ModulePass *createDebugifyModulePass( enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, llvm::StringRef NameOfWrappedPass = "", - DebugInfoPerPassMap *DIPreservationMap = nullptr); + DebugInfoPerPass *DebugInfoBeforePass = nullptr); llvm::FunctionPass *createDebugifyFunctionPass( enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, llvm::StringRef NameOfWrappedPass = "", - DebugInfoPerPassMap *DIPreservationMap = nullptr); + DebugInfoPerPass *DebugInfoBeforePass = nullptr); struct NewPMDebugifyPass : public llvm::PassInfoMixin { llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); @@ -140,14 +138,14 @@ llvm::ModulePass *createCheckDebugifyModulePass( bool Strip = false, llvm::StringRef NameOfWrappedPass = "", DebugifyStatsMap *StatsMap = nullptr, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, llvm::StringRef OrigDIVerifyBugsReportFilePath = ""); llvm::FunctionPass *createCheckDebugifyFunctionPass( bool Strip = false, llvm::StringRef NameOfWrappedPass = "", DebugifyStatsMap *StatsMap = nullptr, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, llvm::StringRef OrigDIVerifyBugsReportFilePath = ""); struct NewPMCheckDebugifyPass @@ -171,7 +169,7 @@ struct DebugifyEachInstrumentation { class DebugifyCustomPassManager : public legacy::PassManager { StringRef OrigDIVerifyBugsReportFilePath; DebugifyStatsMap *DIStatsMap = nullptr; - DebugInfoPerPassMap *DIPreservationMap = nullptr; + DebugInfoPerPass *DebugInfoBeforePass = nullptr; enum DebugifyMode Mode = DebugifyMode::NoDebugify; public: @@ -197,17 +195,17 @@ public: // TODO: Implement Debugify for LoopPass. switch (Kind) { case PT_Function: - super::add(createDebugifyFunctionPass(Mode, Name, DIPreservationMap)); + super::add(createDebugifyFunctionPass(Mode, Name, DebugInfoBeforePass)); super::add(P); super::add(createCheckDebugifyFunctionPass( - isSyntheticDebugInfo(), Name, DIStatsMap, Mode, DIPreservationMap, + isSyntheticDebugInfo(), Name, DIStatsMap, Mode, DebugInfoBeforePass, OrigDIVerifyBugsReportFilePath)); break; case PT_Module: - super::add(createDebugifyModulePass(Mode, Name, DIPreservationMap)); + super::add(createDebugifyModulePass(Mode, Name, DebugInfoBeforePass)); super::add(P); super::add(createCheckDebugifyModulePass( - isSyntheticDebugInfo(), Name, DIStatsMap, Mode, DIPreservationMap, + isSyntheticDebugInfo(), Name, DIStatsMap, Mode, DebugInfoBeforePass, OrigDIVerifyBugsReportFilePath)); break; default: @@ -219,8 +217,8 @@ public: // Used within DebugifyMode::SyntheticDebugInfo mode. void setDIStatsMap(DebugifyStatsMap &StatMap) { DIStatsMap = &StatMap; } // Used within DebugifyMode::OriginalDebugInfo mode. 
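// Wiring sketch for OriginalDebugInfo mode (assumes a
// DebugifyCustomPassManager `DPM` and the renamed setter below):
//
//   DebugInfoPerPass DIBeforePass;
//   DPM.setDebugifyMode(DebugifyMode::OriginalDebugInfo);
//   DPM.setDebugInfoBeforePass(DIBeforePass);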
- void setDIPreservationMap(DebugInfoPerPassMap &PerPassMap) { - DIPreservationMap = &PerPassMap; + void setDebugInfoBeforePass(DebugInfoPerPass &PerPassDI) { + DebugInfoBeforePass = &PerPassDI; } void setOrigDIVerifyBugsReportFilePath(StringRef BugsReportFilePath) { OrigDIVerifyBugsReportFilePath = BugsReportFilePath; @@ -239,7 +237,7 @@ public: } const DebugifyStatsMap &getDebugifyStatsMap() const { return *DIStatsMap; } - DebugInfoPerPassMap &getDebugInfoPerPassMap() { return *DIPreservationMap; } + DebugInfoPerPass &getDebugInfoPerPass() { return *DebugInfoBeforePass; } }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/EscapeEnumerator.h b/llvm/include/llvm/Transforms/Utils/EscapeEnumerator.h index bb5c6f04dd0c..3d8447e9bf23 100644 --- a/llvm/include/llvm/Transforms/Utils/EscapeEnumerator.h +++ b/llvm/include/llvm/Transforms/Utils/EscapeEnumerator.h @@ -32,7 +32,7 @@ class EscapeEnumerator { Function::iterator StateBB, StateE; IRBuilder<> Builder; - bool Done; + bool Done = false; bool HandleExceptions; DomTreeUpdater *DTU; @@ -41,8 +41,7 @@ public: EscapeEnumerator(Function &F, const char *N = "cleanup", bool HandleExceptions = true, DomTreeUpdater *DTU = nullptr) : F(F), CleanupBBName(N), StateBB(F.begin()), StateE(F.end()), - Builder(F.getContext()), Done(false), - HandleExceptions(HandleExceptions), DTU(DTU) {} + Builder(F.getContext()), HandleExceptions(HandleExceptions), DTU(DTU) {} IRBuilder<> *Next(); }; diff --git a/llvm/include/llvm/Transforms/Utils/Evaluator.h b/llvm/include/llvm/Transforms/Utils/Evaluator.h index 99e826bf855f..2b8384897c6b 100644 --- a/llvm/include/llvm/Transforms/Utils/Evaluator.h +++ b/llvm/include/llvm/Transforms/Utils/Evaluator.h @@ -18,8 +18,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include #include @@ -27,6 +25,7 @@ namespace llvm { +class CallBase; class DataLayout; class Function; class TargetLibraryInfo; @@ -139,6 +138,8 @@ private: SmallVectorImpl &Formals); Constant *ComputeLoadResult(Constant *P, Type *Ty); + Constant *ComputeLoadResult(GlobalVariable *GV, Type *Ty, + const APInt &Offset); /// As we compute SSA register values, we store their contents here. The back /// of the deque contains the current function and the stack contains the diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h index 964fdce45744..b6b53d0f10cb 100644 --- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h +++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h @@ -16,7 +16,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueMap.h" @@ -28,6 +27,7 @@ namespace llvm { class APFloat; +class AttributeList; class APInt; class BasicBlock; class Constant; diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h index 775dd23d8f23..60c91fc30174 100644 --- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h +++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h @@ -35,6 +35,9 @@ struct GlobalStatus { /// can be deleted. bool IsLoaded = false; + /// Number of stores to the global. + unsigned NumStores = 0; + /// Keep track of what stores to the global look like. enum StoredType { /// There is no store to this global. 
It can thus be marked constant.
diff --git a/llvm/include/llvm/Transforms/Utils/InjectTLIMappings.h b/llvm/include/llvm/Transforms/Utils/InjectTLIMappings.h
index af9cdb9fd619..d2ce0c5d3988 100644
--- a/llvm/include/llvm/Transforms/Utils/InjectTLIMappings.h
+++ b/llvm/include/llvm/Transforms/Utils/InjectTLIMappings.h
@@ -18,6 +18,7 @@
 #include "llvm/Pass.h"
 
 namespace llvm {
+class Function;
 class InjectTLIMappings : public PassInfoMixin<InjectTLIMappings> {
 public:
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 873127554b47..946fc84b9a2c 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -15,26 +15,18 @@
 #define LLVM_TRANSFORMS_UTILS_LOCAL_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/Utils/Local.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
 #include <cstdint>
-#include <limits>
 
 namespace llvm {
+class DataLayout;
+class Value;
+class WeakTrackingVH;
+class WeakVH;
+template <typename T> class SmallVectorImpl;
 class AAResults;
 class AllocaInst;
 class AssumptionCache;
@@ -343,7 +335,7 @@ bool replaceAllDbgUsesWith(Instruction &From, Value &To, Instruction &DomPoint,
 
 /// Remove all instructions from a basic block other than its terminator
 /// and any present EH pad instructions. Returns a pair where the first element
-/// is the number of instructions (excluding debug info instrinsics) that have
+/// is the number of instructions (excluding debug info intrinsics) that have
 /// been removed, and the second element is the number of debug info intrinsics
 /// that have been removed.
 std::pair<unsigned, unsigned>
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 3a712d78df67..676c0c1487db 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -13,18 +13,18 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
 #define LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
 
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 namespace llvm {
 
 template <class N> class DomTreeNodeBase;
 using DomTreeNode = DomTreeNodeBase<BasicBlock>;
+class StringRef;
+class AnalysisUsage;
+class TargetTransformInfo;
 class AAResults;
-class AliasSet;
-class AliasSetTracker;
 class BasicBlock;
 class BlockFrequencyInfo;
 class ICFLoopSafetyInfo;
@@ -49,8 +49,6 @@ typedef std::pair<const RuntimeCheckingPtrGroup *,
 template <typename T> class Optional;
 template <typename T, unsigned N> class SmallSetVector;
-template <typename T, unsigned N> class SmallVector;
-template <typename T> class SmallVectorImpl;
 template <typename T, unsigned N> class SmallPriorityWorklist;
 
 BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
@@ -150,7 +148,7 @@ protected:
 /// this function is called by \p sinkRegionForLoopNest.
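// Call-site sketch for sinkRegion below; the MemorySSAUpdater is now taken by
// reference, so callers must have MemorySSA available (surrounding variables
// assumed in scope):
//
//   MemorySSAUpdater MSSAU(MSSA);
//   Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI,
//                         TTI, L, MSSAU, &SafetyInfo, Flags, ORE);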
 bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
                 BlockFrequencyInfo *, TargetLibraryInfo *,
-                TargetTransformInfo *, Loop *CurLoop, MemorySSAUpdater *,
+                TargetTransformInfo *, Loop *CurLoop, MemorySSAUpdater &,
                 ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
                 OptimizationRemarkEmitter *, Loop *OutermostLoop = nullptr);
@@ -159,7 +157,7 @@ bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
 bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *,
                            DominatorTree *, BlockFrequencyInfo *,
                            TargetLibraryInfo *, TargetTransformInfo *, Loop *,
-                           MemorySSAUpdater *, ICFLoopSafetyInfo *,
+                           MemorySSAUpdater &, ICFLoopSafetyInfo *,
                            SinkAndHoistLICMFlags &,
                            OptimizationRemarkEmitter *);
@@ -171,10 +169,13 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *,
 /// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all
 /// instructions of the loop and loop safety information as arguments.
 /// Diagnostics is emitted via \p ORE. It returns changed status.
+/// \p AllowSpeculation is whether values should be hoisted even if they are not
+/// guaranteed to execute in the loop, but are safe to speculatively execute.
 bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
                  BlockFrequencyInfo *, TargetLibraryInfo *, Loop *,
-                 MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *,
-                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool);
+                 MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *,
+                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool,
+                 bool AllowSpeculation);
 
 /// This function deletes dead loops. The caller of this function needs to
 /// guarantee that the loop is in fact dead.
@@ -204,12 +205,14 @@ void breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
 /// LoopInfo, DominatorTree, Loop, AliasSet information for all instructions
 /// of the loop and loop safety information as arguments.
 /// Diagnostics is emitted via \p ORE. It returns changed status.
+/// \p AllowSpeculation is whether values should be hoisted even if they are not
+/// guaranteed to execute in the loop, but are safe to speculatively execute.
 bool promoteLoopAccessesToScalars(
     const SmallSetVector<Value *, 8> &, SmallVectorImpl<BasicBlock *> &,
     SmallVectorImpl<Instruction *> &, SmallVectorImpl<MemoryAccess *> &,
     PredIteratorCache &, LoopInfo *, DominatorTree *, const TargetLibraryInfo *,
-    Loop *, MemorySSAUpdater *, ICFLoopSafetyInfo *,
-    OptimizationRemarkEmitter *);
+    Loop *, MemorySSAUpdater &, ICFLoopSafetyInfo *,
+    OptimizationRemarkEmitter *, bool AllowSpeculation);
 
 /// Does a BFS from a given node to all of its children inside a given loop.
 /// The returned vector of nodes includes the starting point.
@@ -342,9 +345,9 @@ void getLoopAnalysisUsage(AnalysisUsage &AU);
 /// true when moving out of loop and not true when moving into loops.
 /// If \p ORE is set use it to emit optimization remarks.
 bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
-                        Loop *CurLoop, AliasSetTracker *CurAST,
-                        MemorySSAUpdater *MSSAU, bool TargetExecutesOncePerLoop,
-                        SinkAndHoistLICMFlags *LICMFlags = nullptr,
+                        Loop *CurLoop, MemorySSAUpdater &MSSAU,
+                        bool TargetExecutesOncePerLoop,
+                        SinkAndHoistLICMFlags &LICMFlags,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
 /// Returns the comparison predicate used when expanding a min/max reduction.
@@ -410,8 +413,10 @@ Value *createOrderedReduction(IRBuilderBase &B,
 /// of each scalar operation (VL) that will be converted into a vector (I).
 /// If OpValue is non-null, we only consider operations similar to OpValue
 /// when intersecting.
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-void propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue = nullptr);
+/// Flag set: NSW, NUW (if IncludeWrapFlags is true), exact, and all of
+/// fast-math.
+void propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue = nullptr,
+                      bool IncludeWrapFlags = true);
 
 /// Returns true if we can prove that \p S is defined and always negative in
 /// loop \p L.
@@ -497,6 +502,12 @@
 addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
                  const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
                  SCEVExpander &Expander);
 
+Value *
+addDiffRuntimeChecks(Instruction *Loc, Loop *TheLoop,
+                     ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
+                     function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
+                     unsigned IC);
+
 /// Struct to hold information about a partially invariant condition.
 struct IVConditionInfo {
   /// Instructions that need to be duplicated and checked for the unswitching
diff --git a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
index 4a8831ed45b2..eeab98c56b66 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
@@ -15,7 +15,6 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOOPVERSIONING_H
 #define LLVM_TRANSFORMS_UTILS_LOOPVERSIONING_H
 
-#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
@@ -23,6 +22,8 @@
 namespace llvm {
 
 class Loop;
+class SCEVPredicate;
+class ScalarEvolution;
 class LoopAccessInfo;
 class LoopInfo;
 struct RuntimeCheckingPtrGroup;
@@ -113,7 +114,7 @@ private:
   Loop *VersionedLoop;
   /// The fall-back loop. I.e. control flows here if pointers in the
   /// loop may alias (memchecks failed).
-  Loop *NonVersionedLoop;
+  Loop *NonVersionedLoop = nullptr;
 
   /// This maps the instructions from VersionedLoop to their counterpart
   /// in NonVersionedLoop.
@@ -123,7 +124,7 @@
   SmallVector<RuntimePointerCheck, 4> AliasChecks;
 
   /// The set of SCEV checks that we are versioning for.
-  const SCEVUnionPredicate &Preds;
+  const SCEVPredicate &Preds;
 
   /// Maps a pointer to the pointer checking group that the pointer
   /// belongs to.
diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
new file mode 100644
index 000000000000..c85f8e3a5646
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
@@ -0,0 +1,37 @@
+//===- LowerAtomic.h - Lower atomic intrinsics ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass lowers atomic intrinsics to non-atomic form for use in a known
+/// non-preemptible environment.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H
+#define LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H
+
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+class IRBuilderBase;
+
+/// Convert the given Cmpxchg into primitive load and compare.
+bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
+
+/// Convert the given RMWI into primitive load and stores,
+/// assuming that doing so is legal. Return true if the lowering
+/// succeeds.
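/// For example (sketch), "%old = atomicrmw add i32* %p, i32 %v seq_cst"
/// lowers to the non-atomic sequence
///   %old = load i32, i32* %p
///   %new = add i32 %old, %v
///   store i32 %new, i32* %p
/// which is only sound in a known non-preemptible environment.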
+bool lowerAtomicRMWInst(AtomicRMWInst *RMWI);
+
+/// Emit IR to implement the given atomicrmw operation on values in registers,
+/// returning the new value.
+Value *buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder,
+                           Value *Loaded, Value *Inc);
+}
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOWERATOMIC_H
diff --git a/llvm/include/llvm/Transforms/Utils/LowerGlobalDtors.h b/llvm/include/llvm/Transforms/Utils/LowerGlobalDtors.h
new file mode 100644
index 000000000000..993a6f57361c
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/LowerGlobalDtors.h
@@ -0,0 +1,28 @@
+//===- LowerGlobalDtors.h - Lower @llvm.global_dtors ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers @llvm.global_dtors by creating wrapper functions that are
+// registered in @llvm.global_ctors and which contain a call to `__cxa_atexit`
+// to register their destructor functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_UTILS_LOWERGLOBALDTORS_H
+#define LLVM_TRANSFORMS_UTILS_LOWERGLOBALDTORS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class LowerGlobalDtorsPass : public PassInfoMixin<LowerGlobalDtorsPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_LOWERGLOBALDTORS_H
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 8d0956033d9f..acf59ff580a4 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -14,13 +14,17 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
 #define LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
 
+#include "llvm/ADT/Optional.h"
+
 namespace llvm {
 
+class AtomicMemCpyInst;
 class ConstantInt;
 class Instruction;
 class MemCpyInst;
 class MemMoveInst;
 class MemSetInst;
+class ScalarEvolution;
 class TargetTransformInfo;
 class Value;
 struct Align;
@@ -28,10 +32,11 @@ struct Align;
 /// Emit a loop implementing the semantics of llvm.memcpy where the size is not
 /// a compile-time constant. Loop will be inserted at \p InsertBefore.
 void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr,
-                                 Value *DstAddr, Value *CopyLen,
-                                 Align SrcAlign, Align DestAlign,
-                                 bool SrcIsVolatile, bool DstIsVolatile,
-                                 const TargetTransformInfo &TTI);
+                                 Value *DstAddr, Value *CopyLen, Align SrcAlign,
+                                 Align DestAlign, bool SrcIsVolatile,
+                                 bool DstIsVolatile, bool CanOverlap,
+                                 const TargetTransformInfo &TTI,
+                                 Optional<uint32_t> AtomicSize = None);
 
 /// Emit a loop implementing the semantics of an llvm.memcpy whose size is a
 /// compile time constant. Loop is inserted at \p InsertBefore.
@@ -39,10 +44,12 @@ void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
                                Value *DstAddr, ConstantInt *CopyLen,
                                Align SrcAlign, Align DestAlign,
                                bool SrcIsVolatile, bool DstIsVolatile,
-                               const TargetTransformInfo &TTI);
+                               bool CanOverlap, const TargetTransformInfo &TTI,
+                               Optional<uint32_t> AtomicCpySize = None);
 
 /// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
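/// Conceptually (sketch), the emitted loop is
///   for (i = 0; i != len; ++i)
///     dst[i] = src[i];
/// possibly widened to larger element types when size and alignment permit.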
-void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI); +void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI, + ScalarEvolution *SE = nullptr); /// Expand \p MemMove as a loop. \p MemMove is not deleted. void expandMemMoveAsLoop(MemMoveInst *MemMove); @@ -50,6 +57,11 @@ void expandMemMoveAsLoop(MemMoveInst *MemMove); /// Expand \p MemSet as a loop. \p MemSet is not deleted. void expandMemSetAsLoop(MemSetInst *MemSet); +/// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted. +void expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemCpy, + const TargetTransformInfo &TTI, + ScalarEvolution *SE); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h new file mode 100644 index 000000000000..a2b85e03897b --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -0,0 +1,82 @@ +//===- MemoryTaggingSupport.h - helpers for memory tagging implementations ===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares common infrastructure for HWAddressSanitizer and +// Aarch64StackTagging. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_UTILS_MEMORYTAGGINGSUPPORT_H +#define LLVM_TRANSFORMS_UTILS_MEMORYTAGGINGSUPPORT_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/Alignment.h" + +namespace llvm { +class DominatorTree; +class DbgVariableIntrinsic; +class IntrinsicInst; +class PostDominatorTree; +class AllocaInst; +class Instruction; +namespace memtag { +// For an alloca valid between lifetime markers Start and Ends, call the +// Callback for all possible exits out of the lifetime in the containing +// function, which can return from the instructions in RetVec. +// +// Returns whether Ends covered all possible exits. If they did not, +// the caller should remove Ends to ensure that work done at the other +// exits does not happen outside of the lifetime. 
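// Caller sketch for forAllReachableExits below (untagMemory is a hypothetical
// helper of the caller):
//
//   bool Covered = forAllReachableExits(DT, PDT, LI, Start, Ends, RetVec,
//       [&](Instruction *Exit) { untagMemory(AI, Exit); });
//   if (!Covered)
//     Ends.clear(); // work at other exits may outlive Ends; drop the markers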
+bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT,
+                          const LoopInfo &LI, const Instruction *Start,
+                          const SmallVectorImpl<IntrinsicInst *> &Ends,
+                          const SmallVectorImpl<Instruction *> &RetVec,
+                          llvm::function_ref<void(Instruction *)> Callback);
+
+bool isStandardLifetime(const SmallVectorImpl<IntrinsicInst *> &LifetimeStart,
+                        const SmallVectorImpl<IntrinsicInst *> &LifetimeEnd,
+                        const DominatorTree *DT, const LoopInfo *LI,
+                        size_t MaxLifetimes);
+
+Instruction *getUntagLocationIfFunctionExit(Instruction &Inst);
+
+struct AllocaInfo {
+  AllocaInst *AI;
+  SmallVector<IntrinsicInst *, 2> LifetimeStart;
+  SmallVector<IntrinsicInst *, 2> LifetimeEnd;
+  SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
+};
+
+struct StackInfo {
+  MapVector<AllocaInst *, AllocaInfo> AllocasToInstrument;
+  SmallVector<Instruction *, 4> UnrecognizedLifetimes;
+  SmallVector<Instruction *, 8> RetVec;
+  bool CallsReturnTwice = false;
+};
+
+class StackInfoBuilder {
+public:
+  StackInfoBuilder(std::function<bool(const AllocaInst &)> IsInterestingAlloca)
+      : IsInterestingAlloca(IsInterestingAlloca) {}
+
+  void visit(Instruction &Inst);
+  StackInfo &get() { return Info; };
+
+private:
+  StackInfo Info;
+  std::function<bool(const AllocaInst &)> IsInterestingAlloca;
+};
+
+uint64_t getAllocaSizeInBytes(const AllocaInst &AI);
+void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align);
+
+} // namespace memtag
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Transforms/Utils/MisExpect.h b/llvm/include/llvm/Transforms/Utils/MisExpect.h
new file mode 100644
index 000000000000..064eeac4c669
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/MisExpect.h
@@ -0,0 +1,77 @@
+//===--- MisExpect.h - Check the use of llvm.expect with PGO data ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code to emit diagnostic messages for potentially incorrect
+// usage of the llvm.expect intrinsic. This utility extracts the threshold
+// values from metadata associated with the instrumented Branch or Switch
+// instruction. The threshold values are then used to determine if a diagnostic
+// should be emitted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+
+namespace llvm {
+namespace misexpect {
+
+/// checkBackendInstrumentation - compares PGO counters to the thresholds used
+/// for llvm.expect and warns if the PGO counters are outside of the expected
+/// range. It extracts the expected weights from the MD_prof weights attached
+/// to the instruction, which are assumed to come from lowered llvm.expect
+/// intrinsics. The RealWeights parameter and the extracted expected weights
+/// are then passed to verifyMisexpect() for verification.
+///
+/// \param I The Instruction being checked
+/// \param RealWeights A vector of profile weights for each target block
+void checkBackendInstrumentation(Instruction &I,
+                                 const llvm::ArrayRef<uint32_t> RealWeights);
+
+/// checkFrontendInstrumentation - compares PGO counters to the thresholds used
+/// for llvm.expect and warns if the PGO counters are outside of the expected
+/// range. It extracts the expected weights from the MD_prof weights attached
+/// to the instruction, which are assumed to come from profiling data
+/// attached by the frontend prior to llvm.expect intrinsic lowering.
+/// The ExpectedWeights parameter and the extracted real weights are then
+/// passed to verifyMisexpect() for verification.
+///
+/// \param I The Instruction being checked
+/// \param ExpectedWeights A vector of the expected weights for each target
+/// block; this determines the threshold values used when emitting diagnostics
+void checkFrontendInstrumentation(Instruction &I,
+                                  const ArrayRef<uint32_t> ExpectedWeights);
+
+/// verifyMisExpect - compares RealWeights to the thresholds used
+/// for llvm.expect and warns if the PGO counters are outside of the expected
+/// range.
+///
+/// \param I The Instruction being checked
+/// \param RealWeights A vector of profile weights from the profile data
+/// \param ExpectedWeights A vector of the weights attached by llvm.expect
+void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
+                     const ArrayRef<uint32_t> ExpectedWeights);
+
+/// checkExpectAnnotations - compares PGO counters to the thresholds used
+/// for llvm.expect and warns if the PGO counters are outside of the expected
+/// range. It extracts the expected weights from the MD_prof weights attached
+/// to the instruction, which are assumed to come from lowered llvm.expect
+/// intrinsics. The RealWeights parameter and the extracted expected weights
+/// are then passed to verifyMisexpect() for verification. It is a thin wrapper
+/// around the checkFrontendInstrumentation and checkBackendInstrumentation APIs
+///
+/// \param I The Instruction being checked
+/// \param ExistingWeights A vector of profile weights for each target block
+/// \param IsFrontend A boolean describing whether this is frontend
+/// instrumentation
+void checkExpectAnnotations(Instruction &I,
+                            const ArrayRef<uint32_t> ExistingWeights,
+                            bool IsFrontend);
+
+} // namespace misexpect
+} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
index 8d459972336b..85263fc00bc3 100644
--- a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
@@ -13,12 +13,13 @@
 #ifndef LLVM_TRANSFORMS_UTILS_MODULEUTILS_H
 #define LLVM_TRANSFORMS_UTILS_MODULEUTILS_H
 
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/MemoryBufferRef.h"
 #include <utility> // for std::pair
 
 namespace llvm {
+template <typename T> class SmallVectorImpl;
 
 template <typename T> class ArrayRef;
 class Module;
@@ -109,14 +110,14 @@ std::string getUniqueModuleId(Module *M);
 
 /// Embed the memory buffer \p Buf into the module \p M as a global using the
 /// specified section name.
-void embedBufferInModule(Module &M, MemoryBufferRef Buf, StringRef SectionName);
+void embedBufferInModule(Module &M, MemoryBufferRef Buf, StringRef SectionName,
+                         Align Alignment = Align(1));
 
 class CallInst;
 
 namespace VFABI {
 /// Overwrite the Vector Function ABI variants attribute with the names provided
 /// in \p VariantMappings.
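// Mapping sketch in vector-function-ABI mangling (names hypothetical): an
// entry such as "_ZGV_LLVM_N4v_foo(foo_vec)" declares foo_vec as a 4-lane
// variant of foo taking one vector parameter.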
-void setVectorVariantNames(CallInst *CI,
-                           const SmallVector<std::string, 8> &VariantMappings);
+void setVectorVariantNames(CallInst *CI, ArrayRef<std::string> VariantMappings);
 } // End VFABI namespace
 } // End llvm namespace
diff --git a/llvm/include/llvm/Transforms/Utils/NameAnonGlobals.h b/llvm/include/llvm/Transforms/Utils/NameAnonGlobals.h
index 03d8840a22d2..a59f9bc3ebfb 100644
--- a/llvm/include/llvm/Transforms/Utils/NameAnonGlobals.h
+++ b/llvm/include/llvm/Transforms/Utils/NameAnonGlobals.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_TRANSFORMS_UTILS_NAMEANONGLOBALS_H
 #define LLVM_TRANSFORMS_UTILS_NAMEANONGLOBALS_H
 
-#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
index c922476ac79d..e57e598b6918 100644
--- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
+++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -56,7 +56,6 @@
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 
@@ -65,6 +64,7 @@ namespace llvm {
 class AssumptionCache;
 class DominatorTree;
 class Function;
+class Value;
 class IntrinsicInst;
 class raw_ostream;
 
diff --git a/llvm/include/llvm/Transforms/Utils/RelLookupTableConverter.h b/llvm/include/llvm/Transforms/Utils/RelLookupTableConverter.h
index 54c257383fb5..0992a4456c9d 100644
--- a/llvm/include/llvm/Transforms/Utils/RelLookupTableConverter.h
+++ b/llvm/include/llvm/Transforms/Utils/RelLookupTableConverter.h
@@ -51,11 +51,12 @@
 #ifndef LLVM_TRANSFORMS_UTILS_RELLOOKUPTABLECONVERTER_H
 #define LLVM_TRANSFORMS_UTILS_RELLOOKUPTABLECONVERTER_H
 
-#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
 
+class Module;
+
 // Pass that converts lookup tables to relative lookup tables.
 class RelLookupTableConverterPass
     : public PassInfoMixin<RelLookupTableConverterPass> {
diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
index bf418e659a04..17bd072598ee 100644
--- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
+++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
@@ -16,16 +16,25 @@
 
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueLattice.h"
-#include "llvm/Analysis/ValueLatticeUtils.h"
-#include "llvm/IR/InstVisitor.h"
 #include "llvm/Transforms/Utils/PredicateInfo.h"
-#include <cassert>
-#include <utility>
 #include <vector>
 
 namespace llvm {
+class Argument;
+class BasicBlock;
+class CallInst;
+class Constant;
+class DataLayout;
+class DominatorTree;
+class Function;
+class GlobalVariable;
+class Instruction;
+class LLVMContext;
+class PostDominatorTree;
+class StructType;
+class TargetLibraryInfo;
+class Value;
+class ValueLatticeElement;
 
 /// Helper struct for bundling up the analysis results per function for IPSCCP.
 struct AnalysisResultsForFn {
@@ -34,6 +43,14 @@ struct AnalysisResultsForFn {
   PostDominatorTree *PDT;
 };
 
+/// Helper struct shared between Function Specialization and SCCP Solver.
+struct ArgInfo {
+  Argument *Formal; // The Formal argument being analysed.
+  Constant *Actual; // A corresponding actual constant argument.
+
+  ArgInfo(Argument *F, Constant *A) : Formal(F), Actual(A){};
+};
+
 class SCCPInstVisitor;
 
 //===----------------------------------------------------------------------===//
@@ -134,11 +151,14 @@ public:
   /// Return a reference to the set of argument tracked functions.
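// Specialization flow sketch (hypothetical names; see the declarations below):
// a clone of the original function is seeded with its known-constant arguments
// as {formal, actual} pairs:
//
//   SmallVector<ArgInfo, 4> Args;
//   Args.push_back(ArgInfo(/*Formal=*/FormalArg, /*Actual=*/ConstActual));
//   Solver.markArgInFuncSpecialization(Clone, Args);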
  SmallPtrSetImpl<Function *> &getArgumentTrackedFunctions();
 
-  /// Mark argument \p A constant with value \p C in a new function
-  /// specialization. The argument's parent function is a specialization of the
-  /// original function \p F. All other arguments of the specialization inherit
-  /// the lattice state of their corresponding values in the original function.
-  void markArgInFuncSpecialization(Function *F, Argument *A, Constant *C);
+  /// Mark the constant arguments of a new function specialization. \p F points
+  /// to the cloned function and \p Args contains a list of constant arguments
+  /// represented as pairs of {formal,actual} values (the formal argument is
+  /// associated with the original function definition). All other arguments of
+  /// the specialization inherit the lattice state of their corresponding values
+  /// in the original function.
+  void markArgInFuncSpecialization(Function *F,
+                                   const SmallVectorImpl<ArgInfo> &Args);
 
   /// Mark all of the blocks in function \p F non-executable. Clients can use
  /// this method to erase a function from the module (e.g., if it has been
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index ee06893ca660..a3e5ac3ac19d 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -323,6 +323,28 @@ public:
     } while (Changed);
   }
 
+  /// Check all predecessors and if all of them have the same AvailableVal use
+  /// it as value for block represented by Info. Return true if a singular
+  /// value is found.
+  bool FindSingularVal(BBInfo *Info) {
+    if (!Info->NumPreds)
+      return false;
+    ValT Singular = Info->Preds[0]->DefBB->AvailableVal;
+    if (!Singular)
+      return false;
+    for (unsigned Idx = 1; Idx < Info->NumPreds; ++Idx) {
+      ValT PredVal = Info->Preds[Idx]->DefBB->AvailableVal;
+      if (!PredVal || Singular != PredVal)
+        return false;
+    }
+    // Record Singular value.
+    (*AvailableVals)[Info->BB] = Singular;
+    assert(BBMap[Info->BB] == Info && "Info missed in BBMap?");
+    Info->AvailableVal = Singular;
+    Info->DefBB = Info->Preds[0]->DefBB;
+    return true;
+  }
+
   /// FindAvailableVal - If this block requires a PHI, first check if an
   /// existing PHI matches the PHI placement and reaching definitions computed
   /// earlier, and if not, create a new PHI. Visit all the block's
@@ -339,6 +361,10 @@ public:
       if (Info->DefBB != Info)
         continue;
 
+      // Look for a singular value.
+      if (FindSingularVal(Info))
+        continue;
+
       // Look for an existing PHI.
FindExistingPHI(Info->BB, BlockList); if (Info->AvailableVal) diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h index e1f681bbd367..5a4c28063a1d 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -24,7 +24,6 @@ namespace llvm { -class BasicBlock; class Function; class MachineBasicBlock; class MachineFunction; diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h index 175bdde7fd05..2250e928d1e6 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -76,6 +76,7 @@ template <> struct IRTraits { } // end namespace afdo_detail extern cl::opt SampleProfileUseProfi; +extern cl::opt SampleProfileInferEntryCount; template class SampleProfileLoaderBaseImpl { public: @@ -920,7 +921,9 @@ void SampleProfileLoaderBaseImpl::finalizeWeightPropagation( // Samples->getHeadSamples() + 1 to avoid functions with zero count. if (SampleProfileUseProfi) { const BasicBlockT *EntryBB = getEntryBB(&F); - if (BlockWeights[EntryBB] > 0) { + ErrorOr EntryWeight = getBlockWeight(EntryBB); + if (BlockWeights[EntryBB] > 0 && + (SampleProfileInferEntryCount || !EntryWeight)) { getFunction(F).setEntryCount( ProfileCount(BlockWeights[EntryBB], Function::PCT_Real), &InlinedGUIDs); diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h index a621cb3078c5..bd7175aa96ff 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h @@ -16,20 +16,14 @@ #define LLVM_TRANSFORMS_UTILS_SAMPLEPROFILELOADERBASEUTIL_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/Function.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" namespace llvm { using namespace sampleprof; class ProfileSummaryInfo; +class Module; extern cl::opt SampleProfileMaxPropagateIterations; extern cl::opt SampleProfileRecordCoverage; diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 277eb7acf238..260ed1a97831 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -15,13 +15,10 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" -#include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/ValueHandle.h" @@ -293,8 +290,9 @@ public: Value *expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc); /// A specialized variant of expandCodeForPredicate, handling the case when - /// we are expanding code for a SCEVEqualPredicate. 
- Value *expandEqualPredicate(const SCEVEqualPredicate *Pred, Instruction *Loc); + /// we are expanding code for a SCEVComparePredicate. + Value *expandComparePredicate(const SCEVComparePredicate *Pred, + Instruction *Loc); /// Generates code that evaluates if the \p AR expression will overflow. Value *generateOverflowCheck(const SCEVAddRecExpr *AR, Instruction *Loc, @@ -384,8 +382,8 @@ public: /// Note that this function does not perform an exhaustive search. I.e if it /// didn't find any value it does not mean that there is no such value. /// - Optional - getRelatedExistingExpansion(const SCEV *S, const Instruction *At, Loop *L); + Value *getRelatedExistingExpansion(const SCEV *S, const Instruction *At, + Loop *L); /// Returns a suitable insert point after \p I, that dominates \p /// MustDominate. Skips instructions inserted by the expander. @@ -443,21 +441,15 @@ private: Value *expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty, Value *V); /// Find a previous Value in ExprValueMap for expand. - ScalarEvolution::ValueOffsetPair - FindValueInExprValueMap(const SCEV *S, const Instruction *InsertPt); + Value *FindValueInExprValueMap(const SCEV *S, const Instruction *InsertPt); Value *expand(const SCEV *S); /// Determine the most "relevant" loop for the given SCEV. const Loop *getRelevantLoop(const SCEV *); - Value *expandSMaxExpr(const SCEVNAryExpr *S); - - Value *expandUMaxExpr(const SCEVNAryExpr *S); - - Value *expandSMinExpr(const SCEVNAryExpr *S); - - Value *expandUMinExpr(const SCEVNAryExpr *S); + Value *expandMinMaxExpr(const SCEVNAryExpr *S, Intrinsic::ID IntrinID, + Twine Name, bool IsSequential = false); Value *visitConstant(const SCEVConstant *S) { return S->getValue(); } diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index fb3a7490346f..7af879638a4d 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -23,6 +23,7 @@ class AssumptionCache; struct SimplifyCFGOptions { int BonusInstThreshold = 1; bool ForwardSwitchCondToPhi = false; + bool ConvertSwitchRangeToICmp = false; bool ConvertSwitchToLookupTable = false; bool NeedCanonicalLoop = true; bool HoistCommonInsts = false; @@ -41,6 +42,10 @@ struct SimplifyCFGOptions { ForwardSwitchCondToPhi = B; return *this; } + SimplifyCFGOptions &convertSwitchRangeToICmp(bool B) { + ConvertSwitchRangeToICmp = B; + return *this; + } SimplifyCFGOptions &convertSwitchToLookupTable(bool B) { ConvertSwitchToLookupTable = B; return *this; diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h b/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h index 4ba56fb45afa..ff60811b6168 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h @@ -15,12 +15,11 @@ #ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYINDVAR_H #define LLVM_TRANSFORMS_UTILS_SIMPLIFYINDVAR_H -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/ConstantRange.h" -#include "llvm/IR/ValueHandle.h" - namespace llvm { +class Type; +class WeakTrackingVH; +template class SmallVectorImpl; class CastInst; class DominatorTree; class Loop; diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index a88e72fc9ba8..79a44b667445 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -14,7 +14,7 @@ #ifndef 
LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H #define LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Analysis/TargetLibraryInfo.h" namespace llvm { @@ -105,7 +105,7 @@ private: OptimizationRemarkEmitter &ORE; BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; - bool UnsafeFPShrink; + bool UnsafeFPShrink = false; function_ref Replacer; function_ref Eraser; @@ -163,6 +163,7 @@ private: Value *optimizeStpCpy(CallInst *CI, IRBuilderBase &B); Value *optimizeStrNCpy(CallInst *CI, IRBuilderBase &B); Value *optimizeStrLen(CallInst *CI, IRBuilderBase &B); + Value *optimizeStrNLen(CallInst *CI, IRBuilderBase &B); Value *optimizeStrPBrk(CallInst *CI, IRBuilderBase &B); Value *optimizeStrTo(CallInst *CI, IRBuilderBase &B); Value *optimizeStrSpn(CallInst *CI, IRBuilderBase &B); @@ -234,10 +235,11 @@ private: /// hasFloatVersion - Checks if there is a float version of the specified /// function by checking for an existing function with name FuncName + f - bool hasFloatVersion(StringRef FuncName); + bool hasFloatVersion(const Module *M, StringRef FuncName); - /// Shared code to optimize strlen+wcslen. - Value *optimizeStringLength(CallInst *CI, IRBuilderBase &B, unsigned CharSize); + /// Shared code to optimize strlen+wcslen and strnlen+wcsnlen. + Value *optimizeStringLength(CallInst *CI, IRBuilderBase &B, unsigned CharSize, + Value *Bound = nullptr); }; } // End llvm namespace diff --git a/llvm/include/llvm/Transforms/Utils/SizeOpts.h b/llvm/include/llvm/Transforms/Utils/SizeOpts.h index 11bf5501598f..aa9e9bd6c69b 100644 --- a/llvm/include/llvm/Transforms/Utils/SizeOpts.h +++ b/llvm/include/llvm/Transforms/Utils/SizeOpts.h @@ -13,7 +13,6 @@ #ifndef LLVM_TRANSFORMS_UTILS_SIZEOPTS_H #define LLVM_TRANSFORMS_UTILS_SIZEOPTS_H -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/include/llvm/Transforms/Utils/SplitModule.h b/llvm/include/llvm/Transforms/Utils/SplitModule.h index 42b3784db417..a5450738060a 100644 --- a/llvm/include/llvm/Transforms/Utils/SplitModule.h +++ b/llvm/include/llvm/Transforms/Utils/SplitModule.h @@ -15,7 +15,7 @@ #ifndef LLVM_TRANSFORMS_UTILS_SPLITMODULE_H #define LLVM_TRANSFORMS_UTILS_SPLITMODULE_H -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 320c36b36924..65fe8eff6442 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/InstructionCost.h" namespace llvm { @@ -123,11 +124,9 @@ TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserFullUnrollMaxCount); -unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, bool &Convergent, - const TargetTransformInfo &TTI, - const SmallPtrSetImpl &EphValues, - unsigned BEInsns); +InstructionCost ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, + const SmallPtrSetImpl &EphValues, unsigned BEInsns); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h 
b/llvm/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h index 3636285e38f5..15a46baa190d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h @@ -10,9 +10,10 @@ #define LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H #include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" namespace llvm { +class Pass; +class Function; class LoadStoreVectorizerPass : public PassInfoMixin<LoadStoreVectorizerPass> { public: diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 32d295a2dd16..b01bd222b252 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -28,12 +28,26 @@ #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/LoopUtils.h" namespace llvm { +class AAResults; +class AssumptionCache; +class BasicBlock; +class BlockFrequencyInfo; +class DemandedBits; +class DominatorTree; +class Function; +class Loop; +class LoopInfo; +class Metadata; +class OptimizationRemarkEmitter; +class PredicatedScalarEvolution; +class ProfileSummaryInfo; +class TargetLibraryInfo; +class TargetTransformInfo; +class Type; /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. @@ -207,7 +221,6 @@ public: void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - Instruction *getExactFPInst() { return ExactFPMathInst; } unsigned getNumRuntimePointerChecks() const { @@ -294,6 +307,14 @@ public: /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if the given store is a final invariant store of one of the + /// reductions found in the loop. + bool isInvariantStoreOfReduction(StoreInst *SI); + + /// Returns True if the given address is invariant and is used to store a + /// recurrent expression. + bool isInvariantAddressOfReduction(Value *V); + /// Returns True if V is a Phi node of an induction variable in this loop. bool isInductionPhi(const Value *V) const; @@ -301,6 +322,10 @@ public: /// floating point induction. const InductionDescriptor *getIntOrFpInductionDescriptor(PHINode *Phi) const; + /// Returns a pointer to the induction descriptor, if \p Phi is a pointer + /// induction. + const InductionDescriptor *getPointerInductionDescriptor(PHINode *Phi) const; + /// Returns True if V is a cast that is part of an induction def-use chain, /// and had been proven to be redundant under a runtime guard (in other /// words, the cast has the same SCEV expression as the induction phi).
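The "invariant store of a reduction" that the new LoopVectorizationLegality hooks above describe is a store of the running value of a reduction to a loop-invariant address; when that address does not alias anything else read or written in the loop, only the last store is observable, so the vectorizer may sink it out of the loop as a single final store. A minimal C++ sketch of the pattern (a hypothetical illustration, not code from this patch; 'out' is assumed not to alias 'a'):

    // 'sum' is a reduction; '*out' is a loop-invariant address storing the
    // recurrent expression. Only the final value of '*out' is observable
    // after the loop, so the reduction can stay in a register (or a vector
    // of registers) and a single store can be emitted after the loop.
    void reduce(const int *a, int n, int *out) {
      int sum = 0;
      for (int i = 0; i < n; ++i) {
        sum += a[i];
        *out = sum; // invariant address of a reduction
      }
    }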
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index cd605aacb52d..b41f3efc5b55 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -20,7 +20,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" @@ -30,7 +29,6 @@ class AAResults; class AssumptionCache; class BasicBlock; class CmpInst; -class DataLayout; class DemandedBits; class DominatorTree; class Function; @@ -135,7 +133,7 @@ private: bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R); bool vectorizeStoreChain(ArrayRef Chain, slpvectorizer::BoUpSLP &R, - unsigned Idx); + unsigned Idx, unsigned MinVF); bool vectorizeStores(ArrayRef Stores, slpvectorizer::BoUpSLP &R); diff --git a/llvm/include/llvm/WindowsDriver/MSVCPaths.h b/llvm/include/llvm/WindowsDriver/MSVCPaths.h new file mode 100644 index 000000000000..7256a4f66eaa --- /dev/null +++ b/llvm/include/llvm/WindowsDriver/MSVCPaths.h @@ -0,0 +1,107 @@ +//===-- MSVCPaths.h - MSVC path-parsing helpers -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_MSVCPATHS_H +#define LLVM_SUPPORT_MSVCPATHS_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include + +namespace llvm { + +namespace vfs { +class FileSystem; +} + +enum class SubDirectoryType { + Bin, + Include, + Lib, +}; + +enum class ToolsetLayout { + OlderVS, + VS2017OrNewer, + DevDivInternal, +}; + +// Windows SDKs and VC Toolchains group their contents into subdirectories based +// on the target architecture. This function converts an llvm::Triple::ArchType +// to the corresponding subdirectory name. +const char *archToWindowsSDKArch(llvm::Triple::ArchType Arch); + +// Similar to the above function, but for Visual Studios before VS2017. +const char *archToLegacyVCArch(llvm::Triple::ArchType Arch); + +// Similar to the above function, but for DevDiv internal builds. +const char *archToDevDivInternalArch(llvm::Triple::ArchType Arch); + +bool appendArchToWindowsSDKLibPath(int SDKMajor, llvm::SmallString<128> LibPath, + llvm::Triple::ArchType Arch, + std::string &path); + +// Get the path to a specific subdirectory in the current toolchain for +// a given target architecture. +// VS2017 changed the VC toolchain layout, so this should be used instead +// of hardcoding paths. +std::string getSubDirectoryPath(SubDirectoryType Type, ToolsetLayout VSLayout, + const std::string &VCToolChainPath, + llvm::Triple::ArchType TargetArch, + llvm::StringRef SubdirParent = ""); + +// Check if the Include path of a specified version of Visual Studio contains +// specific header files. If not, they are probably shipped with Universal CRT. +bool useUniversalCRT(ToolsetLayout VSLayout, const std::string &VCToolChainPath, + llvm::Triple::ArchType TargetArch, + llvm::vfs::FileSystem &VFS); + +/// Get Windows SDK installation directory. 
+bool getWindowsSDKDir(vfs::FileSystem &VFS, + llvm::Optional WinSdkDir, + llvm::Optional WinSdkVersion, + llvm::Optional WinSysRoot, + std::string &Path, int &Major, + std::string &WindowsSDKIncludeVersion, + std::string &WindowsSDKLibVersion); + +bool getUniversalCRTSdkDir(vfs::FileSystem &VFS, + llvm::Optional WinSdkDir, + llvm::Optional WinSdkVersion, + llvm::Optional WinSysRoot, + std::string &Path, + std::string &UCRTVersion); + +// Check command line arguments to try and find a toolchain. +bool findVCToolChainViaCommandLine( + vfs::FileSystem &VFS, llvm::Optional VCToolsDir, + llvm::Optional VCToolsVersion, + llvm::Optional WinSysRoot, std::string &Path, + ToolsetLayout &VSLayout); + +// Check various environment variables to try and find a toolchain. +bool findVCToolChainViaEnvironment(vfs::FileSystem &VFS, std::string &Path, + ToolsetLayout &VSLayout); + +// Query the Setup Config server for installs, then pick the newest version +// and find its default VC toolchain. +// This is the preferred way to discover new Visual Studios, as they're no +// longer listed in the registry. +bool findVCToolChainViaSetupConfig(vfs::FileSystem &VFS, std::string &Path, + ToolsetLayout &VSLayout); + +// Look in the registry for Visual Studio installs, and use that to get +// a toolchain path. VS2017 and newer don't get added to the registry. +// So if we find something here, we know that it's an older version. +bool findVCToolChainViaRegistry(std::string &Path, ToolsetLayout &VSLayout); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/WindowsDriver/MSVCSetupApi.h b/llvm/include/llvm/WindowsDriver/MSVCSetupApi.h new file mode 100644 index 000000000000..28e6e3e08e37 --- /dev/null +++ b/llvm/include/llvm/WindowsDriver/MSVCSetupApi.h @@ -0,0 +1,523 @@ +// +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. +// +// +// The MIT License (MIT) +// +// Copyright (C) Microsoft Corporation. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// + +#pragma once + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" +#endif + +// Constants +// +#ifndef E_NOTFOUND +#define E_NOTFOUND HRESULT_FROM_WIN32(ERROR_NOT_FOUND) +#endif + +#ifndef E_FILENOTFOUND +#define E_FILENOTFOUND HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND) +#endif + +// Enumerations +// +/// +/// The state of an instance. 
+/// +enum InstanceState : unsigned { + /// + /// The instance state has not been determined. + /// + eNone = 0, + + /// + /// The instance installation path exists. + /// + eLocal = 1, + + /// + /// A product is registered to the instance. + /// + eRegistered = 2, + + /// + /// No reboot is required for the instance. + /// + eNoRebootRequired = 4, + + /// + /// The instance represents a complete install. + /// + eComplete = MAXUINT, +}; + +// Forward interface declarations +// +#ifndef __ISetupInstance_FWD_DEFINED__ +#define __ISetupInstance_FWD_DEFINED__ +typedef struct ISetupInstance ISetupInstance; +#endif + +#ifndef __ISetupInstance2_FWD_DEFINED__ +#define __ISetupInstance2_FWD_DEFINED__ +typedef struct ISetupInstance2 ISetupInstance2; +#endif + +#ifndef __IEnumSetupInstances_FWD_DEFINED__ +#define __IEnumSetupInstances_FWD_DEFINED__ +typedef struct IEnumSetupInstances IEnumSetupInstances; +#endif + +#ifndef __ISetupConfiguration_FWD_DEFINED__ +#define __ISetupConfiguration_FWD_DEFINED__ +typedef struct ISetupConfiguration ISetupConfiguration; +#endif + +#ifndef __ISetupConfiguration2_FWD_DEFINED__ +#define __ISetupConfiguration2_FWD_DEFINED__ +typedef struct ISetupConfiguration2 ISetupConfiguration2; +#endif + +#ifndef __ISetupPackageReference_FWD_DEFINED__ +#define __ISetupPackageReference_FWD_DEFINED__ +typedef struct ISetupPackageReference ISetupPackageReference; +#endif + +#ifndef __ISetupHelper_FWD_DEFINED__ +#define __ISetupHelper_FWD_DEFINED__ +typedef struct ISetupHelper ISetupHelper; +#endif + +// Forward class declarations +// +#ifndef __SetupConfiguration_FWD_DEFINED__ +#define __SetupConfiguration_FWD_DEFINED__ + +#ifdef __cplusplus +typedef class SetupConfiguration SetupConfiguration; +#endif + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Interface definitions +// +EXTERN_C const IID IID_ISetupInstance; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// Information about an instance of a product. +/// +struct DECLSPEC_UUID("B41463C3-8866-43B5-BC33-2B0676F7F42E") + DECLSPEC_NOVTABLE ISetupInstance : public IUnknown { + /// + /// Gets the instance identifier (should match the name of the parent instance + /// directory). + /// + /// The instance identifier. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist. + STDMETHOD(GetInstanceId)(_Out_ BSTR *pbstrInstanceId) = 0; + + /// + /// Gets the local date and time when the installation was originally + /// installed. + /// + /// The local date and time when the installation + /// was originally installed. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetInstallDate)(_Out_ LPFILETIME pInstallDate) = 0; + + /// + /// Gets the unique name of the installation, often indicating the branch and + /// other information used for telemetry. + /// + /// The unique name of the installation, + /// often indicating the branch and other information used for + /// telemetry. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetInstallationName)(_Out_ BSTR *pbstrInstallationName) = 0; + + /// + /// Gets the path to the installation root of the product. + /// + /// The path to the installation root of + /// the product. 
+ /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetInstallationPath)(_Out_ BSTR *pbstrInstallationPath) = 0; + + /// + /// Gets the version of the product installed in this instance. + /// + /// The version of the product + /// installed in this instance. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetInstallationVersion)(_Out_ BSTR *pbstrInstallationVersion) = 0; + + /// + /// Gets the display name (title) of the product installed in this instance. + /// + /// The LCID for the display name. + /// The display name (title) of the product + /// installed in this instance. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetDisplayName)(_In_ LCID lcid, _Out_ BSTR *pbstrDisplayName) = 0; + + /// + /// Gets the description of the product installed in this instance. + /// + /// The LCID for the description. + /// The description of the product installed in + /// this instance. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(GetDescription)(_In_ LCID lcid, _Out_ BSTR *pbstrDescription) = 0; + + /// + /// Resolves the optional relative path to the root path of the instance. + /// + /// A relative path within the instance to + /// resolve, or NULL to get the root path. + /// The full path to the optional relative + /// path within the instance. If the relative path is NULL, the root path will + /// always terminate in a backslash. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// property is not defined. + STDMETHOD(ResolvePath) + (_In_opt_z_ LPCOLESTR pwszRelativePath, _Out_ BSTR *pbstrAbsolutePath) = 0; +}; +#endif + +EXTERN_C const IID IID_ISetupInstance2; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// Information about an instance of a product. +/// +struct DECLSPEC_UUID("89143C9A-05AF-49B0-B717-72E218A2185C") + DECLSPEC_NOVTABLE ISetupInstance2 : public ISetupInstance { + /// + /// Gets the state of the instance. + /// + /// The state of the instance. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist. + STDMETHOD(GetState)(_Out_ InstanceState *pState) = 0; + + /// + /// Gets an array of package references registered to the instance. + /// + /// Pointer to an array of . + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// packages property is not defined. + STDMETHOD(GetPackages)(_Out_ LPSAFEARRAY *ppsaPackages) = 0; + + /// + /// Gets a pointer to the that represents + /// the registered product. + /// + /// Pointer to an instance of . This may be NULL if does not return . + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist and E_NOTFOUND if the + /// packages property is not defined. 
+ STDMETHOD(GetProduct) + (_Outptr_result_maybenull_ ISetupPackageReference **ppPackage) = 0; + + /// + /// Gets the relative path to the product application, if available. + /// + /// The relative path to the product + /// application, if available. + /// Standard HRESULT indicating success or failure, including + /// E_FILENOTFOUND if the instance state does not exist. + STDMETHOD(GetProductPath) + (_Outptr_result_maybenull_ BSTR *pbstrProductPath) = 0; +}; +#endif + +EXTERN_C const IID IID_IEnumSetupInstances; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// An enumerator of installed objects. +/// +struct DECLSPEC_UUID("6380BCFF-41D3-4B2E-8B2E-BF8A6810C848") + DECLSPEC_NOVTABLE IEnumSetupInstances : public IUnknown { + /// + /// Retrieves the next set of product instances in the enumeration sequence. + /// + /// The number of product instances to retrieve. + /// A pointer to an array of ISetupInstance. + /// A pointer to the number of product instances + /// retrieved. If celt is 1 this parameter may be NULL. + /// S_OK if the number of elements was fetched, S_FALSE if nothing + /// was fetched (at end of enumeration), E_INVALIDARG if celt is greater than + /// 1 and pceltFetched is NULL, or E_OUTOFMEMORY if an ISetupInstance could not be allocated. + STDMETHOD(Next) + (_In_ ULONG celt, _Out_writes_to_(celt, *pceltFetched) ISetupInstance **rgelt, + _Out_opt_ _Deref_out_range_(0, celt) ULONG *pceltFetched) = 0; + + /// + /// Skips the next set of product instances in the enumeration sequence. + /// + /// The number of product instances to skip. + /// S_OK if the number of elements could be skipped; otherwise, + /// S_FALSE; + STDMETHOD(Skip)(_In_ ULONG celt) = 0; + + /// + /// Resets the enumeration sequence to the beginning. + /// + /// Always returns S_OK; + STDMETHOD(Reset)(void) = 0; + + /// + /// Creates a new enumeration object in the same state as the current + /// enumeration object: the new object points to the same place in the + /// enumeration sequence. + /// + /// A pointer to a pointer to a new interface. If the method fails, this + /// parameter is undefined. + /// S_OK if a clone was returned; otherwise, E_OUTOFMEMORY. + STDMETHOD(Clone)(_Deref_out_opt_ IEnumSetupInstances **ppenum) = 0; +}; +#endif + +EXTERN_C const IID IID_ISetupConfiguration; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// Gets information about product instances set up on the machine. +/// +struct DECLSPEC_UUID("42843719-DB4C-46C2-8E7C-64F1816EFD5B") + DECLSPEC_NOVTABLE ISetupConfiguration : public IUnknown { + /// + /// Enumerates all completed product instances installed. + /// + /// An enumeration of completed, installed + /// product instances. + /// Standard HRESULT indicating success or failure. + STDMETHOD(EnumInstances)(_Out_ IEnumSetupInstances **ppEnumInstances) = 0; + + /// + /// Gets the instance for the current process path. + /// + /// The instance for the current process + /// path. + /// The instance for the current process path, or E_NOTFOUND if not + /// found. + STDMETHOD(GetInstanceForCurrentProcess) + (_Out_ ISetupInstance **ppInstance) = 0; + + /// + /// Gets the instance for the given path. + /// + /// The instance for the given path. + /// The instance for the given path, or E_NOTFOUND if not + /// found. + STDMETHOD(GetInstanceForPath) + (_In_z_ LPCWSTR wzPath, _Out_ ISetupInstance **ppInstance) = 0; +}; +#endif + +EXTERN_C const IID IID_ISetupConfiguration2; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// Gets information about product instances.
+/// +struct DECLSPEC_UUID("26AAB78C-4A60-49D6-AF3B-3C35BC93365D") + DECLSPEC_NOVTABLE ISetupConfiguration2 : public ISetupConfiguration { + /// + /// Enumerates all product instances. + /// + /// An enumeration of all product + /// instances. + /// Standard HRESULT indicating success or failure. + STDMETHOD(EnumAllInstances)(_Out_ IEnumSetupInstances **ppEnumInstances) = 0; +}; +#endif + +EXTERN_C const IID IID_ISetupPackageReference; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// A reference to a package. +/// +struct DECLSPEC_UUID("da8d8a16-b2b6-4487-a2f1-594ccccd6bf5") + DECLSPEC_NOVTABLE ISetupPackageReference : public IUnknown { + /// + /// Gets the general package identifier. + /// + /// The general package identifier. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetId)(_Out_ BSTR *pbstrId) = 0; + + /// + /// Gets the version of the package. + /// + /// The version of the package. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetVersion)(_Out_ BSTR *pbstrVersion) = 0; + + /// + /// Gets the target process architecture of the package. + /// + /// The target process architecture of the + /// package. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetChip)(_Out_ BSTR *pbstrChip) = 0; + + /// + /// Gets the language and optional region identifier. + /// + /// The language and optional region + /// identifier. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetLanguage)(_Out_ BSTR *pbstrLanguage) = 0; + + /// + /// Gets the build branch of the package. + /// + /// The build branch of the package. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetBranch)(_Out_ BSTR *pbstrBranch) = 0; + + /// + /// Gets the type of the package. + /// + /// The type of the package. + /// Standard HRESULT indicating success or failure. + STDMETHOD(GetType)(_Out_ BSTR *pbstrType) = 0; + + /// + /// Gets the unique identifier consisting of all defined tokens. + /// + /// The unique identifier consisting of all + /// defined tokens. + /// Standard HRESULT indicating success or failure, including + /// E_UNEXPECTED if no Id was defined (required). + STDMETHOD(GetUniqueId)(_Out_ BSTR *pbstrUniqueId) = 0; +}; +#endif + +EXTERN_C const IID IID_ISetupHelper; + +#if defined(__cplusplus) && !defined(CINTERFACE) +/// +/// Helper functions. +/// +/// +/// You can query for this interface from the +/// class. +/// +struct DECLSPEC_UUID("42b21b78-6192-463e-87bf-d577838f1d5c") + DECLSPEC_NOVTABLE ISetupHelper : public IUnknown { + /// + /// Parses a dotted quad version string into a 64-bit unsigned integer. + /// + /// The dotted quad version string to parse, e.g. + /// 1.2.3.4. + /// A 64-bit unsigned integer representing the + /// version. You can compare this to other versions. + /// Standard HRESULT indicating success or failure. + STDMETHOD(ParseVersion) + (_In_ LPCOLESTR pwszVersion, _Out_ PULONGLONG pullVersion) = 0; + + /// + /// Parses a dotted quad version string into a 64-bit unsigned integer. + /// + /// The string containing 1 or 2 dotted quad + /// version strings to parse, e.g. [1.0,) that means 1.0.0.0 or newer. + /// A 64-bit unsigned integer representing the + /// minimum version, which may be 0. You can compare this to other + /// versions. + /// A 64-bit unsigned integer representing the + /// maximum version, which may be MAXULONGLONG. You can compare this to other + /// versions. + /// Standard HRESULT indicating success or failure. 
+ STDMETHOD(ParseVersionRange) + (_In_ LPCOLESTR pwszVersionRange, _Out_ PULONGLONG pullMinVersion, + _Out_ PULONGLONG pullMaxVersion) = 0; +}; +#endif + +// Class declarations +// +EXTERN_C const CLSID CLSID_SetupConfiguration; + +#ifdef __cplusplus +/// +/// This class implements , , and . +/// +class DECLSPEC_UUID("177F0C4A-1CD3-4DE7-A32C-71DBBB9FA36D") SetupConfiguration; +#endif + +// Function declarations +// +/// +/// Gets an that provides information about +/// product instances installed on the machine. +/// +/// The that +/// provides information about product instances installed on the +/// machine. +/// Reserved for future use. +/// Standard HRESULT indicating success or failure. +STDMETHODIMP GetSetupConfiguration(_Out_ ISetupConfiguration **ppConfiguration, + _Reserved_ LPVOID pReserved); + +#ifdef __cplusplus +} +#endif + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap index d0693ccfd8f6..76b10621541c 100644 --- a/llvm/include/llvm/module.modulemap +++ b/llvm/include/llvm/module.modulemap @@ -4,6 +4,7 @@ module LLVM_Analysis { module * { export * } // This is intended for (repeated) textual inclusion. + textual header "Analysis/ScalarFuncs.def" textual header "Analysis/TargetLibraryInfo.def" textual header "Analysis/VecFuncs.def" } @@ -71,6 +72,7 @@ module LLVM_BinaryFormat { textual header "BinaryFormat/ELFRelocs/Hexagon.def" textual header "BinaryFormat/ELFRelocs/i386.def" textual header "BinaryFormat/ELFRelocs/Lanai.def" + textual header "BinaryFormat/ELFRelocs/LoongArch.def" textual header "BinaryFormat/ELFRelocs/M68k.def" textual header "BinaryFormat/ELFRelocs/Mips.def" textual header "BinaryFormat/ELFRelocs/MSP430.def" @@ -242,6 +244,7 @@ module LLVM_intrinsic_gen { export * } module IR_AbstractCallSite { header "IR/AbstractCallSite.h" export * } + module IR_ConstantFold { header "IR/ConstantFold.h" export * } module IR_ConstantFolder { header "IR/ConstantFolder.h" export * } module IR_GlobalVariable { header "IR/GlobalVariable.h" export * } module IR_NoFolder { header "IR/NoFolder.h" export * } @@ -253,6 +256,7 @@ module LLVM_intrinsic_gen { module IR_InstrTypes { header "IR/InstrTypes.h" export * } module IR_Instructions { header "IR/Instructions.h" export * } module IR_TypeFinder { header "IR/TypeFinder.h" export * } + module IR_VectorBuilder { header "IR/VectorBuilder.h" export * } // Intrinsics.h @@ -331,7 +335,6 @@ module LLVM_MC { module LLVM_MC_TableGen { requires cplusplus module MC_LaneBitmask { header "MC/LaneBitmask.h" export * } - module MC_FixedLenDisassembler { header "MC/MCFixedLenDisassembler.h" export * } module MC_InstrItineraries { header "MC/MCInstrItineraries.h" export * } module MC_Schedule { header "MC/MCSchedule.h" export * } module MC_SubtargetFeature { header "MC/SubtargetFeature.h" export * } @@ -357,6 +360,7 @@ module LLVM_ProfileData { textual header "ProfileData/InstrProfData.inc" textual header "ProfileData/MemProfData.inc" + textual header "ProfileData/MIBEntryDef.inc" } // FIXME: Mislayered? @@ -410,6 +414,7 @@ module LLVM_Utils { // These are intended for textual inclusion. 
textual header "Support/AArch64TargetParser.def" textual header "Support/ARMTargetParser.def" + textual header "Support/CSKYTargetParser.def" textual header "Support/RISCVTargetParser.def" textual header "Support/TargetOpcodes.def" textual header "Support/X86TargetParser.def" diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index a8132e5abf54..e249c38ecd34 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -42,7 +42,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -680,7 +679,7 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, } } - const MemoryLocation &Loc = OptLoc.getValueOr(MemoryLocation()); + const MemoryLocation &Loc = OptLoc.value_or(MemoryLocation()); switch (I->getOpcode()) { case Instruction::VAArg: @@ -988,6 +987,28 @@ bool llvm::isIdentifiedFunctionLocal(const Value *V) { return isa(V) || isNoAliasCall(V) || isNoAliasOrByValArgument(V); } +bool llvm::isEscapeSource(const Value *V) { + if (auto *CB = dyn_cast(V)) + return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, + true); + + // The load case works because isNonEscapingLocalObject considers all + // stores to be escapes (it passes true for the StoreCaptures argument + // to PointerMayBeCaptured). + if (isa(V)) + return true; + + // The inttoptr case works because isNonEscapingLocalObject considers all + // means of converting or equating a pointer to an int (ptrtoint, ptr store + // which could be followed by an integer load, ptr<->int compare) as + // escaping, and objects located at well-known addresses via platform-specific + // means cannot be considered non-escaping local objects. 
+ if (isa(V)) + return true; + + return false; +} + bool llvm::isNotVisibleOnUnwind(const Value *Object, bool &RequiresNoCaptureBeforeUnwind) { RequiresNoCaptureBeforeUnwind = false; diff --git a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp index 1577f1eb70b1..e3446a1f3130 100644 --- a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -9,9 +9,7 @@ #include "llvm/Analysis/AliasAnalysisEvaluator.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -19,7 +17,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -41,30 +38,48 @@ static cl::opt PrintMustModRef("print-mustmodref", cl::ReallyHidden); static cl::opt EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden); -static void PrintResults(AliasResult AR, bool P, const Value *V1, - const Value *V2, const Module *M) { +static void PrintResults(AliasResult AR, bool P, + std::pair Loc1, + std::pair Loc2, + const Module *M) { if (PrintAll || P) { + Type *Ty1 = Loc1.second, *Ty2 = Loc2.second; + unsigned AS1 = Loc1.first->getType()->getPointerAddressSpace(); + unsigned AS2 = Loc2.first->getType()->getPointerAddressSpace(); std::string o1, o2; { raw_string_ostream os1(o1), os2(o2); - V1->printAsOperand(os1, true, M); - V2->printAsOperand(os2, true, M); + Loc1.first->printAsOperand(os1, false, M); + Loc2.first->printAsOperand(os2, false, M); } if (o2 < o1) { std::swap(o1, o2); + std::swap(Ty1, Ty2); + std::swap(AS1, AS2); // Change offset sign for the local AR, for printing only. AR.swap(); } - errs() << " " << AR << ":\t" << o1 << ", " << o2 << "\n"; + errs() << " " << AR << ":\t"; + Ty1->print(errs(), false, /* NoDetails */ true); + if (AS1 != 0) + errs() << " addrspace(" << AS1 << ")"; + errs() << "* " << o1 << ", "; + Ty2->print(errs(), false, /* NoDetails */ true); + if (AS2 != 0) + errs() << " addrspace(" << AS2 << ")"; + errs() << "* " << o2 << "\n"; } } -static inline void PrintModRefResults(const char *Msg, bool P, Instruction *I, - Value *Ptr, Module *M) { +static inline void PrintModRefResults( + const char *Msg, bool P, Instruction *I, + std::pair Loc, Module *M) { if (PrintAll || P) { errs() << " " << Msg << ": Ptr: "; - Ptr->printAsOperand(errs(), true, M); + Loc.second->print(errs(), false, /* NoDetails */ true); + errs() << "* "; + Loc.first->printAsOperand(errs(), false, M); errs() << "\t<->" << *I << '\n'; } } @@ -84,11 +99,6 @@ static inline void PrintLoadStoreResults(AliasResult AR, bool P, } } -static inline bool isInterestingPointer(Value *V) { - return V->getType()->isPointerTy() - && !isa(V); -} - PreservedAnalyses AAEvaluator::run(Function &F, FunctionAnalysisManager &AM) { runInternal(F, AM.getResult(F)); return PreservedAnalyses::all(); @@ -99,38 +109,21 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) { ++FunctionCount; - SetVector Pointers; + SetVector> Pointers; SmallSetVector Calls; SetVector Loads; SetVector Stores; - for (auto &I : F.args()) - if (I.getType()->isPointerTy()) // Add all pointer arguments. - Pointers.insert(&I); - for (Instruction &Inst : instructions(F)) { - if (Inst.getType()->isPointerTy()) // Add all pointer instructions. 
- Pointers.insert(&Inst); - if (EvalAAMD && isa(&Inst)) - Loads.insert(&Inst); - if (EvalAAMD && isa(&Inst)) - Stores.insert(&Inst); - if (auto *Call = dyn_cast(&Inst)) { - Value *Callee = Call->getCalledOperand(); - // Skip actual functions for direct function calls. - if (!isa(Callee) && isInterestingPointer(Callee)) - Pointers.insert(Callee); - // Consider formals. - for (Use &DataOp : Call->data_ops()) - if (isInterestingPointer(DataOp)) - Pointers.insert(DataOp); - Calls.insert(Call); - } else { - // Consider all operands. - for (Use &Op : Inst.operands()) - if (isInterestingPointer(Op)) - Pointers.insert(Op); - } + if (auto *LI = dyn_cast(&Inst)) { + Pointers.insert({LI->getPointerOperand(), LI->getType()}); + Loads.insert(LI); + } else if (auto *SI = dyn_cast(&Inst)) { + Pointers.insert({SI->getPointerOperand(), + SI->getValueOperand()->getType()}); + Stores.insert(SI); + } else if (auto *CB = dyn_cast(&Inst)) + Calls.insert(CB); } if (PrintAll || PrintNoAlias || PrintMayAlias || PrintPartialAlias || @@ -139,20 +132,12 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) { << " pointers, " << Calls.size() << " call sites\n"; // iterate over the worklist, and run the full (n^2)/2 disambiguations - for (SetVector::iterator I1 = Pointers.begin(), E = Pointers.end(); - I1 != E; ++I1) { - auto I1Size = LocationSize::afterPointer(); - Type *I1ElTy = (*I1)->getType()->getPointerElementType(); - if (I1ElTy->isSized()) - I1Size = LocationSize::precise(DL.getTypeStoreSize(I1ElTy)); - - for (SetVector::iterator I2 = Pointers.begin(); I2 != I1; ++I2) { - auto I2Size = LocationSize::afterPointer(); - Type *I2ElTy = (*I2)->getType()->getPointerElementType(); - if (I2ElTy->isSized()) - I2Size = LocationSize::precise(DL.getTypeStoreSize(I2ElTy)); - - AliasResult AR = AA.alias(*I1, I1Size, *I2, I2Size); + for (auto I1 = Pointers.begin(), E = Pointers.end(); I1 != E; ++I1) { + LocationSize Size1 = LocationSize::precise(DL.getTypeStoreSize(I1->second)); + for (auto I2 = Pointers.begin(); I2 != I1; ++I2) { + LocationSize Size2 = + LocationSize::precise(DL.getTypeStoreSize(I2->second)); + AliasResult AR = AA.alias(I1->first, Size1, I2->first, Size2); switch (AR) { case AliasResult::NoAlias: PrintResults(AR, PrintNoAlias, *I1, *I2, F.getParent()); @@ -231,13 +216,10 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) { // Mod/ref alias analysis: compare all pairs of calls and values for (CallBase *Call : Calls) { - for (auto Pointer : Pointers) { - auto Size = LocationSize::afterPointer(); - Type *ElTy = Pointer->getType()->getPointerElementType(); - if (ElTy->isSized()) - Size = LocationSize::precise(DL.getTypeStoreSize(ElTy)); - - switch (AA.getModRefInfo(Call, Pointer, Size)) { + for (const auto &Pointer : Pointers) { + LocationSize Size = + LocationSize::precise(DL.getTypeStoreSize(Pointer.second)); + switch (AA.getModRefInfo(Call, Pointer.first, Size)) { case ModRefInfo::NoModRef: PrintModRefResults("NoModRef", PrintNoModRef, Call, Pointer, F.getParent()); diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp index 5dc6c7780a0c..234a73bff6a8 100644 --- a/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/llvm/lib/Analysis/AliasSetTracker.cpp @@ -13,16 +13,12 @@ #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/Constants.h" -#include 
"llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" @@ -237,8 +233,8 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, if (AliasAny) return true; - assert(Inst->mayReadOrWriteMemory() && - "Instruction must either read or write memory."); + if (!Inst->mayReadOrWriteMemory()) + return false; for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) { if (auto *UnknownInst = getUnknownInst(i)) { @@ -258,31 +254,6 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, return false; } -Instruction* AliasSet::getUniqueInstruction() { - if (AliasAny) - // May have collapses alias set - return nullptr; - if (begin() != end()) { - if (!UnknownInsts.empty()) - // Another instruction found - return nullptr; - if (std::next(begin()) != end()) - // Another instruction found - return nullptr; - Value *Addr = begin()->getValue(); - assert(!Addr->user_empty() && - "where's the instruction which added this pointer?"); - if (std::next(Addr->user_begin()) != Addr->user_end()) - // Another instruction found -- this is really restrictive - // TODO: generalize! - return nullptr; - return cast(*(Addr->user_begin())); - } - if (1 != UnknownInsts.size()) - return nullptr; - return cast(UnknownInsts[0]); -} - void AliasSetTracker::clear() { // Delete all the PointerRec entries. for (auto &I : PointerMap) diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index 177f38af13d8..460dddceaf17 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -40,14 +40,14 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); - initializeDomViewerPass(Registry); - initializeDomPrinterPass(Registry); - initializeDomOnlyViewerPass(Registry); - initializePostDomViewerPass(Registry); - initializeDomOnlyPrinterPass(Registry); - initializePostDomPrinterPass(Registry); - initializePostDomOnlyViewerPass(Registry); - initializePostDomOnlyPrinterPass(Registry); + initializeDomViewerWrapperPassPass(Registry); + initializeDomPrinterWrapperPassPass(Registry); + initializeDomOnlyViewerWrapperPassPass(Registry); + initializePostDomViewerWrapperPassPass(Registry); + initializeDomOnlyPrinterWrapperPassPass(Registry); + initializePostDomPrinterWrapperPassPass(Registry); + initializePostDomOnlyViewerWrapperPassPass(Registry); + initializePostDomOnlyPrinterWrapperPassPass(Registry); initializeAAResultsWrapperPassPass(Registry); initializeGlobalsAAWrapperPassPass(Registry); initializeIVUsersWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp index 9d4fe1225b33..7440dbd29ccf 100644 --- a/llvm/lib/Analysis/AssumeBundleQueries.cpp +++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp @@ -10,8 +10,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/DebugCounter.h" diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp 
index 3e0214e21ecd..e7e476dfb572 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -11,18 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" @@ -31,7 +30,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include #include #include diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 0a0b53796add..c78f822b8bcf 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/PhiValues.h" @@ -45,7 +44,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -105,29 +103,6 @@ bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, // Useful predicates //===----------------------------------------------------------------------===// -/// Returns true if the pointer is one which would have been considered an -/// escape by isNonEscapingLocalObject. -static bool isEscapeSource(const Value *V) { - if (isa(V)) - return true; - - // The load case works because isNonEscapingLocalObject considers all - // stores to be escapes (it passes true for the StoreCaptures argument - // to PointerMayBeCaptured). - if (isa(V)) - return true; - - // The inttoptr case works because isNonEscapingLocalObject considers all - // means of converting or equating a pointer to an int (ptrtoint, ptr store - // which could be followed by an integer load, ptr<->int compare) as - // escaping, and objects located at well-known addresses via platform-specific - // means cannot be considered non-escaping local objects. - if (isa(V)) - return true; - - return false; -} - /// Returns the size of the object specified by V or UnknownSize if unknown. 
static uint64_t getObjectSize(const Value *V, const DataLayout &DL, const TargetLibraryInfo &TLI, @@ -234,7 +209,7 @@ bool EarliestEscapeInfo::isNotCapturedBeforeOrAt(const Value *Object, if (Iter.second) { Instruction *EarliestCapture = FindEarliestCapture( Object, *const_cast(I->getFunction()), - /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT); + /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT, EphValues); if (EarliestCapture) { auto Ins = Inst2Obj.insert({EarliestCapture, {}}); Ins.first->second.push_back(Object); @@ -661,8 +636,8 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, unsigned TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize(); LE = LE.mul(APInt(IndexSize, TypeSize), GEPOp->isInBounds()); - Decomposed.Offset += LE.Offset.sextOrSelf(MaxIndexSize); - APInt Scale = LE.Scale.sextOrSelf(MaxIndexSize); + Decomposed.Offset += LE.Offset.sext(MaxIndexSize); + APInt Scale = LE.Scale.sext(MaxIndexSize); // If we already had an occurrence of this index variable, merge this // scale into it. For example, we want to handle: @@ -1299,8 +1274,31 @@ AliasResult BasicAAResult::aliasGEP( const VariableGEPIndex &Var = DecompGEP1.VarIndices[0]; if (Var.Val.TruncBits == 0 && isKnownNonZero(Var.Val.V, DL, 0, &AC, Var.CxtI, DT)) { - // If V != 0 then abs(VarIndex) >= abs(Scale). - MinAbsVarIndex = Var.Scale.abs(); + // If V != 0, then abs(VarIndex) > 0. + MinAbsVarIndex = APInt(Var.Scale.getBitWidth(), 1); + + // Check if abs(V*Scale) >= abs(Scale) holds in the presence of + // potentially wrapping math. + auto MultiplyByScaleNoWrap = [](const VariableGEPIndex &Var) { + if (Var.IsNSW) + return true; + + int ValOrigBW = Var.Val.V->getType()->getPrimitiveSizeInBits(); + // If Scale is small enough so that abs(V*Scale) >= abs(Scale) holds. + // The max value of abs(V) is 2^ValOrigBW - 1. Multiplying with a + // constant smaller than 2^(bitwidth(Val) - ValOrigBW) won't wrap. + int MaxScaleValueBW = Var.Val.getBitWidth() - ValOrigBW; + if (MaxScaleValueBW <= 0) + return false; + return Var.Scale.ule( + APInt::getMaxValue(MaxScaleValueBW).zext(Var.Scale.getBitWidth())); + }; + // Refine MinAbsVarIndex, if abs(Scale*V) >= abs(Scale) holds in the + // presence of potentially wrapping math. + if (MultiplyByScaleNoWrap(Var)) { + // If V != 0 then abs(VarIndex) >= abs(Scale). + MinAbsVarIndex = Var.Scale.abs(); + } } } else if (DecompGEP1.VarIndices.size() == 2) { // VarIndex = Scale*V0 + (-Scale)*V1. @@ -1370,15 +1368,15 @@ BasicAAResult::aliasSelect(const SelectInst *SI, LocationSize SISize, // If both arms of the Select node NoAlias or MustAlias V2, then returns // NoAlias / MustAlias. Otherwise, returns MayAlias. - AliasResult Alias = getBestAAResults().alias( - MemoryLocation(V2, V2Size), - MemoryLocation(SI->getTrueValue(), SISize), AAQI); + AliasResult Alias = + getBestAAResults().alias(MemoryLocation(SI->getTrueValue(), SISize), + MemoryLocation(V2, V2Size), AAQI); if (Alias == AliasResult::MayAlias) return AliasResult::MayAlias; - AliasResult ThisAlias = getBestAAResults().alias( - MemoryLocation(V2, V2Size), - MemoryLocation(SI->getFalseValue(), SISize), AAQI); + AliasResult ThisAlias = + getBestAAResults().alias(MemoryLocation(SI->getFalseValue(), SISize), + MemoryLocation(V2, V2Size), AAQI); return MergeAliasResults(ThisAlias, Alias); } @@ -1500,8 +1498,7 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, AAQueryInfo *UseAAQI = BlockInserted ? 
&NewAAQI : &AAQI; AliasResult Alias = getBestAAResults().alias( - MemoryLocation(V2, V2Size), - MemoryLocation(V1Srcs[0], PNSize), *UseAAQI); + MemoryLocation(V1Srcs[0], PNSize), MemoryLocation(V2, V2Size), *UseAAQI); // Early exit if the check of the first PHI source against V2 is MayAlias. // Other results are not possible. @@ -1518,7 +1515,7 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, Value *V = V1Srcs[i]; AliasResult ThisAlias = getBestAAResults().alias( - MemoryLocation(V2, V2Size), MemoryLocation(V, PNSize), *UseAAQI); + MemoryLocation(V, PNSize), MemoryLocation(V2, V2Size), *UseAAQI); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == AliasResult::MayAlias) break; diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp index b464071a33e6..436b01764033 100644 --- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp +++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include #include #include diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index 2a5e1f65d731..ec8d318b675b 100644 --- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -13,7 +13,6 @@ #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/None.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/Config/llvm-config.h" @@ -22,8 +21,8 @@ #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ScaledNumber.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScaledNumber.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -48,7 +47,7 @@ cl::opt CheckBFIUnknownBlockQueries( "for debugging missed BFI updates")); cl::opt UseIterativeBFIInference( - "use-iterative-bfi-inference", cl::init(false), cl::Hidden, cl::ZeroOrMore, + "use-iterative-bfi-inference", cl::Hidden, cl::desc("Apply an iterative post-processing to infer correct BFI counts")); cl::opt IterativeBFIMaxIterationsPerBlock( diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index ffb80134749a..1d880424e55c 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -414,8 +414,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { const LoopBlock DstLoopBB = getLoopBlock(TI->getSuccessor(I - 1)); auto EstimatedWeight = getEstimatedEdgeWeight({SrcLoopBB, DstLoopBB}); if (EstimatedWeight && - EstimatedWeight.getValue() <= - static_cast(BlockExecWeight::UNREACHABLE)) + *EstimatedWeight <= static_cast(BlockExecWeight::UNREACHABLE)) UnreachableIdxs.push_back(I - 1); else ReachableIdxs.push_back(I - 1); @@ -688,7 +687,7 @@ Optional BranchProbabilityInfo::getMaxEstimatedEdgeWeight( if (!Weight) return None; - if (!MaxWeight || MaxWeight.getValue() < Weight.getValue()) + if (!MaxWeight || *MaxWeight < *Weight) MaxWeight = Weight; } @@ -852,8 +851,7 @@ void BranchProbabilityInfo::computeEestimateBlockWeight( if (LoopWeight <= static_cast(BlockExecWeight::UNREACHABLE)) LoopWeight = static_cast(BlockExecWeight::LOWEST_NON_ZERO); - EstimatedLoopWeight.insert( - {LoopBB.getLoopData(), LoopWeight.getValue()}); + 
EstimatedLoopWeight.insert({LoopBB.getLoopData(), *LoopWeight}); // Add all blocks entering the loop into working list. getLoopEnterBlocks(LoopBB, BlockWorkList); } @@ -875,7 +873,7 @@ void BranchProbabilityInfo::computeEestimateBlockWeight( auto MaxWeight = getMaxEstimatedEdgeWeight(LoopBB, successors(BB)); if (MaxWeight) - propagateEstimatedBlockWeight(LoopBB, DT, PDT, MaxWeight.getValue(), + propagateEstimatedBlockWeight(LoopBB, DT, PDT, *MaxWeight, BlockWorkList, LoopWorkList); } } while (!BlockWorkList.empty() || !LoopWorkList.empty()); @@ -913,7 +911,7 @@ bool BranchProbabilityInfo::calcEstimatedHeuristics(const BasicBlock *BB) { // Scale down loop exiting weight by trip count. Weight = std::max( static_cast(BlockExecWeight::LOWEST_NON_ZERO), - Weight.getValueOr(static_cast(BlockExecWeight::DEFAULT)) / + Weight.value_or(static_cast(BlockExecWeight::DEFAULT)) / TC); } bool IsUnlikelyEdge = LoopBB.getLoop() && UnlikelyBlocks.contains(SuccBB); @@ -923,15 +921,14 @@ bool BranchProbabilityInfo::calcEstimatedHeuristics(const BasicBlock *BB) { // 'Unlikely' blocks have twice lower weight. Weight = std::max( static_cast(BlockExecWeight::LOWEST_NON_ZERO), - Weight.getValueOr(static_cast(BlockExecWeight::DEFAULT)) / - 2); + Weight.value_or(static_cast(BlockExecWeight::DEFAULT)) / 2); } if (Weight) FoundEstimatedWeight = true; auto WeightVal = - Weight.getValueOr(static_cast(BlockExecWeight::DEFAULT)); + Weight.value_or(static_cast(BlockExecWeight::DEFAULT)); TotalWeight += WeightVal; SuccWeights.push_back(WeightVal); } diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp index ec25ee161e2c..1902d72f2f89 100644 --- a/llvm/lib/Analysis/CFG.cpp +++ b/llvm/lib/Analysis/CFG.cpp @@ -127,11 +127,7 @@ bool llvm::isCriticalEdge(const Instruction *TI, const BasicBlock *Dest, // the outermost loop in the loop nest that contains BB. static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) { const Loop *L = LI->getLoopFor(BB); - if (L) { - while (const Loop *Parent = L->getParentLoop()) - L = Parent; - } - return L; + return L ? 
L->getOutermostLoop() : nullptr; } bool llvm::isPotentiallyReachableFromMany( diff --git a/llvm/lib/Analysis/CFGPrinter.cpp b/llvm/lib/Analysis/CFGPrinter.cpp index 04ccdc590845..f8eba1a00f28 100644 --- a/llvm/lib/Analysis/CFGPrinter.cpp +++ b/llvm/lib/Analysis/CFGPrinter.cpp @@ -23,7 +23,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" -#include +#include "llvm/Support/GraphWriter.h" using namespace llvm; diff --git a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp index 1216d03e448b..602a01867f3b 100644 --- a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp +++ b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp @@ -831,14 +831,14 @@ CFLAndersAAResult::ensureCached(const Function &Fn) { scan(Fn); Iter = Cache.find(&Fn); assert(Iter != Cache.end()); - assert(Iter->second.hasValue()); + assert(Iter->second); } return Iter->second; } const AliasSummary *CFLAndersAAResult::getAliasSummary(const Function &Fn) { auto &FunInfo = ensureCached(Fn); - if (FunInfo.hasValue()) + if (FunInfo) return &FunInfo->getAliasSummary(); else return nullptr; diff --git a/llvm/lib/Analysis/CFLGraph.h b/llvm/lib/Analysis/CFLGraph.h index 02a13d673f40..60fc8d18678c 100644 --- a/llvm/lib/Analysis/CFLGraph.h +++ b/llvm/lib/Analysis/CFLGraph.h @@ -403,7 +403,7 @@ template class CFLGraphBuilder { auto &RetParamRelations = Summary->RetParamRelations; for (auto &Relation : RetParamRelations) { auto IRelation = instantiateExternalRelation(Relation, Call); - if (IRelation.hasValue()) { + if (IRelation) { Graph.addNode(IRelation->From); Graph.addNode(IRelation->To); Graph.addEdge(IRelation->From, IRelation->To); @@ -413,7 +413,7 @@ template class CFLGraphBuilder { auto &RetParamAttributes = Summary->RetParamAttributes; for (auto &Attribute : RetParamAttributes) { auto IAttr = instantiateExternalAttribute(Attribute, Call); - if (IAttr.hasValue()) + if (IAttr) Graph.addNode(IAttr->IValue, IAttr->Attr); } } diff --git a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp index 090dccc53b6e..f92869c2ec63 100644 --- a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp +++ b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp @@ -165,7 +165,7 @@ CFLSteensAAResult::FunctionInfo::FunctionInfo( assert(RetVal != nullptr); assert(RetVal->getType()->isPointerTy()); auto RetInfo = Sets.find(InstantiatedValue{RetVal, 0}); - if (RetInfo.hasValue()) + if (RetInfo) AddToRetParamRelations(0, RetInfo->Index); } @@ -174,7 +174,7 @@ CFLSteensAAResult::FunctionInfo::FunctionInfo( for (auto &Param : Fn.args()) { if (Param.getType()->isPointerTy()) { auto ParamInfo = Sets.find(InstantiatedValue{&Param, 0}); - if (ParamInfo.hasValue()) + if (ParamInfo) AddToRetParamRelations(I + 1, ParamInfo->Index); } ++I; @@ -250,14 +250,14 @@ CFLSteensAAResult::ensureCached(Function *Fn) { scan(Fn); Iter = Cache.find(Fn); assert(Iter != Cache.end()); - assert(Iter->second.hasValue()); + assert(Iter->second); } return Iter->second; } const AliasSummary *CFLSteensAAResult::getAliasSummary(Function &Fn) { auto &FunInfo = ensureCached(&Fn); - if (FunInfo.hasValue()) + if (FunInfo) return &FunInfo->getAliasSummary(); else return nullptr; @@ -293,15 +293,15 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, assert(Fn != nullptr); auto &MaybeInfo = ensureCached(Fn); - assert(MaybeInfo.hasValue()); + assert(MaybeInfo); auto &Sets = MaybeInfo->getStratifiedSets(); auto MaybeA = Sets.find(InstantiatedValue{ValA, 0}); - if 
(!MaybeA.hasValue()) + if (!MaybeA) return AliasResult::MayAlias; auto MaybeB = Sets.find(InstantiatedValue{ValB, 0}); - if (!MaybeB.hasValue()) + if (!MaybeB) return AliasResult::MayAlias; auto SetA = *MaybeA; diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp index c60b70ae5b69..b2e7422bbf8b 100644 --- a/llvm/lib/Analysis/CGSCCPassManager.cpp +++ b/llvm/lib/Analysis/CGSCCPassManager.cpp @@ -9,6 +9,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -27,7 +28,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" -#include #include #include @@ -164,9 +164,9 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { InlinedInternalEdges; CGSCCUpdateResult UR = { - RCWorklist, CWorklist, InvalidRefSCCSet, InvalidSCCSet, - nullptr, nullptr, PreservedAnalyses::all(), InlinedInternalEdges, - {}}; + RCWorklist, CWorklist, InvalidRefSCCSet, + InvalidSCCSet, nullptr, PreservedAnalyses::all(), + InlinedInternalEdges, {}}; // Request PassInstrumentation from analysis manager, will use it to run // instrumenting callbacks for the passes later. @@ -174,9 +174,8 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { PreservedAnalyses PA = PreservedAnalyses::all(); CG.buildRefSCCs(); - for (auto RCI = CG.postorder_ref_scc_begin(), - RCE = CG.postorder_ref_scc_end(); - RCI != RCE;) { + for (LazyCallGraph::RefSCC &RC : + llvm::make_early_inc_range(CG.postorder_ref_sccs())) { assert(RCWorklist.empty() && "Should always start with an empty RefSCC worklist"); // The postorder_ref_sccs range we are walking is lazily constructed, so @@ -190,7 +189,7 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { // // We also eagerly increment the iterator to the next position because // the CGSCC passes below may delete the current RefSCC. - RCWorklist.insert(&*RCI++); + RCWorklist.insert(&RC); do { LazyCallGraph::RefSCC *RC = RCWorklist.pop_back_val(); @@ -230,11 +229,15 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { LLVM_DEBUG(dbgs() << "Skipping redundant run on SCC: " << *C << "\n"); continue; } - if (&C->getOuterRefSCC() != RC) { - LLVM_DEBUG(dbgs() << "Skipping an SCC that is now part of some other " - "RefSCC...\n"); - continue; - } + // We used to also check if the current SCC is part of the current + // RefSCC and bail if it wasn't, since it should be in RCWorklist. + // However, this can cause compile time explosions in some cases on + // modules with a huge RefSCC. If a non-trivial amount of SCCs in the + // huge RefSCC can become their own child RefSCC, we create one child + // RefSCC, bail on the current RefSCC, visit the child RefSCC, revisit + // the huge RefSCC, and repeat. By visiting all SCCs in the original + // RefSCC we create all the child RefSCCs in one pass of the RefSCC, + // rather one pass of the RefSCC creating one child RefSCC at a time. // Ensure we can proxy analysis updates from the CGSCC analysis manager // into the the Function analysis manager by getting a proxy here. @@ -264,11 +267,8 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { // Check that we didn't miss any update scenario. 
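The rewritten loop header above uses llvm::make_early_inc_range, the standard LLVM idiom for walking a range whose current element may be deleted by the loop body: the iterator is advanced before the body runs. A minimal sketch of the same idiom, assuming a hypothetical helper (not part of this patch) that strips trivially dead instructions from a block:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Erasing the current instruction is safe: the range iterator has already
// moved past it by the time the loop body executes.
static void removeDeadInsts(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}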
assert(!InvalidSCCSet.count(C) && "Processing an invalid SCC!"); assert(C->begin() != C->end() && "Cannot have an empty SCC!"); - assert(&C->getOuterRefSCC() == RC && - "Processing an SCC in a different RefSCC!"); LastUpdatedC = UR.UpdatedC; - UR.UpdatedRC = nullptr; UR.UpdatedC = nullptr; // Check the PassInstrumentation's BeforePass callbacks before @@ -290,7 +290,6 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { // Update the SCC and RefSCC if necessary. C = UR.UpdatedC ? UR.UpdatedC : C; - RC = UR.UpdatedRC ? UR.UpdatedRC : RC; if (UR.UpdatedC) { // If we're updating the SCC, also update the FAM inside the proxy's @@ -1213,10 +1212,8 @@ static LazyCallGraph::SCC &updateCGAndAnalysisManagerForPass( assert(!UR.InvalidatedRefSCCs.count(RC) && "Invalidated the current RefSCC!"); assert(&C->getOuterRefSCC() == RC && "Current SCC not in current RefSCC!"); - // Record the current RefSCC and SCC for higher layers of the CGSCC pass - // manager now that all the updates have been applied. - if (RC != &InitialRC) - UR.UpdatedRC = RC; + // Record the current SCC for higher layers of the CGSCC pass manager now that + // all the updates have been applied. if (C != &InitialC) UR.UpdatedC = C; diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index dfbd29b7d636..f85527122b2a 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -21,7 +21,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace llvm; @@ -70,8 +69,7 @@ bool CallGraph::invalidate(Module &, const PreservedAnalyses &PA, // Check whether the analysis, all analyses on functions, or the function's // CFG have been preserved. auto PAC = PA.getChecker(); - return !(PAC.preserved() || PAC.preservedSet>() || - PAC.preservedSet()); + return !(PAC.preserved() || PAC.preservedSet>()); } void CallGraph::addToCallGraph(Function *F) { diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp index 930cb13c0cb3..8438f33f4712 100644 --- a/llvm/lib/Analysis/CallGraphSCCPass.cpp +++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/PrintPasses.h" -#include "llvm/IR/StructuralHash.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -271,7 +270,7 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG, Calls.count(Call) || // If the call edge is not from a call or invoke, or it is a - // instrinsic call, then the function pass RAUW'd a call with + // intrinsic call, then the function pass RAUW'd a call with // another value. This can happen when constant folding happens // of well known functions etc. (Call->getCalledFunction() && @@ -470,7 +469,7 @@ bool CGPassManager::RunAllPassesOnSCC(CallGraphSCC &CurSCC, CallGraph &CG, initializeAnalysisImpl(P); #ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(CG.getModule()); + uint64_t RefHash = P->structuralHash(CG.getModule()); #endif // Actually run this pass on the current SCC. 
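The EXPENSIVE_CHECKS hunks above and below implement one invariant: a pass that reports "no change" must leave the module structurally identical. A minimal sketch of that pattern, assuming a hypothetical driver around a legacy ModulePass (the real code applies it per contained SCC pass):

#include <cstdint>
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"

// Snapshot a structural hash before the pass runs, then verify that an
// unchanged report really means unchanged IR.
static bool runWithHashCheck(llvm::ModulePass &P, llvm::Module &M) {
  uint64_t RefHash = P.structuralHash(M); // hash before the pass
  bool Changed = P.runOnModule(M);        // pass reports its own change bit
  if (!Changed && RefHash != P.structuralHash(M))
    llvm::report_fatal_error("Pass modifies its input and doesn't report it");
  return Changed;
}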
@@ -480,7 +479,7 @@ bool CGPassManager::RunAllPassesOnSCC(CallGraphSCC &CurSCC, CallGraph &CG, Changed |= LocalChanged; #ifdef EXPENSIVE_CHECKS - if (!LocalChanged && (RefHash != StructuralHash(CG.getModule()))) { + if (!LocalChanged && (RefHash != P->structuralHash(CG.getModule()))) { llvm::errs() << "Pass modifies its input and doesn't report it: " << P->getPassName() << "\n"; llvm_unreachable("Pass modifies its input and doesn't report it"); diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp index 829532a0fa10..65e3184fad91 100644 --- a/llvm/lib/Analysis/CallPrinter.cpp +++ b/llvm/lib/Analysis/CallPrinter.cpp @@ -14,18 +14,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CallPrinter.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/DOTGraphTraitsPass.h" #include "llvm/Analysis/HeatUtils.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/GraphWriter.h" using namespace llvm; +namespace llvm { +template struct GraphTraits; +} + // This option shows static (relative) call counts. // FIXME: // Need to show real counts when profile data is available @@ -212,6 +217,71 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { } // end llvm namespace +namespace { +void doCallGraphDOTPrinting( + Module &M, function_ref LookupBFI) { + std::string Filename; + if (!CallGraphDotFilenamePrefix.empty()) + Filename = (CallGraphDotFilenamePrefix + ".callgraph.dot"); + else + Filename = (std::string(M.getModuleIdentifier()) + ".callgraph.dot"); + errs() << "Writing '" << Filename << "'..."; + + std::error_code EC; + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); + + CallGraph CG(M); + CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI); + + if (!EC) + WriteGraph(File, &CFGInfo); + else + errs() << " error opening file for writing!"; + errs() << "\n"; +} + +void viewCallGraph(Module &M, + function_ref LookupBFI) { + CallGraph CG(M); + CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI); + + std::string Title = + DOTGraphTraits::getGraphName(&CFGInfo); + ViewGraph(&CFGInfo, "callgraph", true, Title); +} +} // namespace + +namespace llvm { +PreservedAnalyses CallGraphDOTPrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + + auto LookupBFI = [&FAM](Function &F) { + return &FAM.getResult(F); + }; + + doCallGraphDOTPrinting(M, LookupBFI); + + return PreservedAnalyses::all(); +} + +PreservedAnalyses CallGraphViewerPass::run(Module &M, + ModuleAnalysisManager &AM) { + + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + + auto LookupBFI = [&FAM](Function &F) { + return &FAM.getResult(F); + }; + + viewCallGraph(M, LookupBFI); + + return PreservedAnalyses::all(); +} +} // namespace llvm + namespace { // Viewer class CallGraphViewer : public ModulePass { @@ -234,12 +304,7 @@ bool CallGraphViewer::runOnModule(Module &M) { return &this->getAnalysis(F).getBFI(); }; - CallGraph CG(M); - CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI); - - std::string Title = - DOTGraphTraits::getGraphName(&CFGInfo); - ViewGraph(&CFGInfo, "callgraph", true, Title); + 
viewCallGraph(M, LookupBFI); return false; } @@ -266,24 +331,7 @@ bool CallGraphDOTPrinter::runOnModule(Module &M) { return &this->getAnalysis(F).getBFI(); }; - std::string Filename; - if (!CallGraphDotFilenamePrefix.empty()) - Filename = (CallGraphDotFilenamePrefix + ".callgraph.dot"); - else - Filename = (std::string(M.getModuleIdentifier()) + ".callgraph.dot"); - errs() << "Writing '" << Filename << "'..."; - - std::error_code EC; - raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); - - CallGraph CG(M); - CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI); - - if (!EC) - WriteGraph(File, &CFGInfo); - else - errs() << " error opening file for writing!"; - errs() << "\n"; + doCallGraphDOTPrinting(M, LookupBFI); return false; } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index ba8462e659d5..f4fd660ac7e0 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -16,6 +16,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -44,15 +45,15 @@ STATISTIC(NumNotCapturedBefore, "Number of pointers not captured before"); /// use it where possible. The caching version can use much higher limit or /// don't have this cap at all. static cl::opt -DefaultMaxUsesToExplore("capture-tracking-max-uses-to-explore", cl::Hidden, - cl::desc("Maximal number of uses to explore."), - cl::init(20)); + DefaultMaxUsesToExplore("capture-tracking-max-uses-to-explore", cl::Hidden, + cl::desc("Maximal number of uses to explore."), + cl::init(100)); unsigned llvm::getDefaultMaxUsesToExploreForCaptureTracking() { return DefaultMaxUsesToExplore; } -CaptureTracker::~CaptureTracker() {} +CaptureTracker::~CaptureTracker() = default; bool CaptureTracker::shouldExplore(const Use *U) { return true; } @@ -74,8 +75,10 @@ bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) { namespace { struct SimpleCaptureTracker : public CaptureTracker { - explicit SimpleCaptureTracker(bool ReturnCaptures) - : ReturnCaptures(ReturnCaptures) {} + explicit SimpleCaptureTracker( + + const SmallPtrSetImpl &EphValues, bool ReturnCaptures) + : EphValues(EphValues), ReturnCaptures(ReturnCaptures) {} void tooManyUses() override { Captured = true; } @@ -83,10 +86,15 @@ namespace { if (isa(U->getUser()) && !ReturnCaptures) return false; + if (EphValues.contains(U->getUser())) + return false; + Captured = true; return true; } + const SmallPtrSetImpl &EphValues; + bool ReturnCaptures; bool Captured = false; @@ -154,8 +162,9 @@ namespace { // escape are not in a cycle. 
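For orientation, the tracker structs in this file all follow the CaptureTracker callback protocol: PointerMayBeCaptured walks the uses of a pointer, captured() decides whether a given use escapes (returning true aborts the walk), and tooManyUses() fires when the exploration budget is exhausted. A minimal client-side sketch, with a hypothetical tracker that merely counts capturing uses:

#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/Value.h"

namespace {
// Counts capturing uses instead of stopping at the first one.
struct CountingTracker : public llvm::CaptureTracker {
  unsigned CapturingUses = 0;
  void tooManyUses() override { CapturingUses = ~0u; } // budget blown: give up
  bool captured(const llvm::Use *U) override {
    ++CapturingUses; // record this capturing use...
    return false;    // ...and keep scanning for more
  }
};
} // namespace

static unsigned countCapturingUses(const llvm::Value *V) {
  CountingTracker T;
  llvm::PointerMayBeCaptured(V, &T, /*MaxUsesToExplore=*/100);
  return T.CapturingUses;
}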
struct EarliestCaptures : public CaptureTracker { - EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT) - : DT(DT), ReturnCaptures(ReturnCaptures), F(F) {} + EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT, + const SmallPtrSetImpl &EphValues) + : EphValues(EphValues), DT(DT), ReturnCaptures(ReturnCaptures), F(F) {} void tooManyUses() override { Captured = true; @@ -167,6 +176,9 @@ namespace { if (isa(I) && !ReturnCaptures) return false; + if (EphValues.contains(I)) + return false; + if (!EarliestCapture) { EarliestCapture = I; } else if (EarliestCapture->getParent() == I->getParent()) { @@ -193,6 +205,8 @@ namespace { return false; } + const SmallPtrSetImpl &EphValues; + Instruction *EarliestCapture = nullptr; const DominatorTree &DT; @@ -212,8 +226,18 @@ namespace { /// counts as capturing it or not. The boolean StoreCaptures specified whether /// storing the value (or part of it) into memory anywhere automatically /// counts as capturing it or not. -bool llvm::PointerMayBeCaptured(const Value *V, - bool ReturnCaptures, bool StoreCaptures, +bool llvm::PointerMayBeCaptured(const Value *V, bool ReturnCaptures, + bool StoreCaptures, unsigned MaxUsesToExplore) { + SmallPtrSet Empty; + return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures, Empty, + MaxUsesToExplore); +} + +/// Variant of the above function which accepts a set of Values that are +/// ephemeral and cannot cause pointers to escape. +bool llvm::PointerMayBeCaptured(const Value *V, bool ReturnCaptures, + bool StoreCaptures, + const SmallPtrSetImpl &EphValues, unsigned MaxUsesToExplore) { assert(!isa(V) && "It doesn't make sense to ask whether a global is captured."); @@ -224,7 +248,7 @@ bool llvm::PointerMayBeCaptured(const Value *V, // take advantage of this. (void)StoreCaptures; - SimpleCaptureTracker SCT(ReturnCaptures); + SimpleCaptureTracker SCT(EphValues, ReturnCaptures); PointerMayBeCaptured(V, &SCT, MaxUsesToExplore); if (SCT.Captured) ++NumCaptured; @@ -266,14 +290,16 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, return CB.Captured; } -Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, - bool ReturnCaptures, bool StoreCaptures, - const DominatorTree &DT, - unsigned MaxUsesToExplore) { +Instruction * +llvm::FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures, + bool StoreCaptures, const DominatorTree &DT, + + const SmallPtrSetImpl &EphValues, + unsigned MaxUsesToExplore) { assert(!isa(V) && "It doesn't make sense to ask whether a global is captured."); - EarliestCaptures CB(ReturnCaptures, F, DT); + EarliestCaptures CB(ReturnCaptures, F, DT, EphValues); PointerMayBeCaptured(V, &CB, MaxUsesToExplore); if (CB.Captured) ++NumCapturedBefore; @@ -282,6 +308,132 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, return CB.EarliestCapture; } +UseCaptureKind llvm::DetermineUseCaptureKind( + const Use &U, + function_ref IsDereferenceableOrNull) { + Instruction *I = cast(U.getUser()); + + switch (I->getOpcode()) { + case Instruction::Call: + case Instruction::Invoke: { + auto *Call = cast(I); + // Not captured if the callee is readonly, doesn't return a copy through + // its return value and doesn't unwind (a readonly function can leak bits + // by throwing an exception or not depending on the input value). 
+ if (Call->onlyReadsMemory() && Call->doesNotThrow() && + Call->getType()->isVoidTy()) + return UseCaptureKind::NO_CAPTURE; + + // The pointer is not captured if returned pointer is not captured. + // NOTE: CaptureTracking users should not assume that only functions + // marked with nocapture do not capture. This means that places like + // getUnderlyingObject in ValueTracking or DecomposeGEPExpression + // in BasicAA also need to know about this property. + if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) + return UseCaptureKind::PASSTHROUGH; + + // Volatile operations effectively capture the memory location that they + // load and store to. + if (auto *MI = dyn_cast(Call)) + if (MI->isVolatile()) + return UseCaptureKind::MAY_CAPTURE; + + // Calling a function pointer does not in itself cause the pointer to + // be captured. This is a subtle point considering that (for example) + // the callee might return its own address. It is analogous to saying + // that loading a value from a pointer does not cause the pointer to be + // captured, even though the loaded value might be the pointer itself + // (think of self-referential objects). + if (Call->isCallee(&U)) + return UseCaptureKind::NO_CAPTURE; + + // Not captured if only passed via 'nocapture' arguments. + if (Call->isDataOperand(&U) && + !Call->doesNotCapture(Call->getDataOperandNo(&U))) { + // The parameter is not marked 'nocapture' - captured. + return UseCaptureKind::MAY_CAPTURE; + } + return UseCaptureKind::NO_CAPTURE; + } + case Instruction::Load: + // Volatile loads make the address observable. + if (cast(I)->isVolatile()) + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; + case Instruction::VAArg: + // "va-arg" from a pointer does not cause it to be captured. + return UseCaptureKind::NO_CAPTURE; + case Instruction::Store: + // Stored the pointer - conservatively assume it may be captured. + // Volatile stores make the address observable. + if (U.getOperandNo() == 0 || cast(I)->isVolatile()) + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; + case Instruction::AtomicRMW: { + // atomicrmw conceptually includes both a load and store from + // the same location. + // As with a store, the location being accessed is not captured, + // but the value being stored is. + // Volatile stores make the address observable. + auto *ARMWI = cast(I); + if (U.getOperandNo() == 1 || ARMWI->isVolatile()) + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; + } + case Instruction::AtomicCmpXchg: { + // cmpxchg conceptually includes both a load and store from + // the same location. + // As with a store, the location being accessed is not captured, + // but the value being stored is. + // Volatile stores make the address observable. + auto *ACXI = cast(I); + if (U.getOperandNo() == 1 || U.getOperandNo() == 2 || ACXI->isVolatile()) + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; + } + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + // The original value is not captured via this if the new value isn't. + return UseCaptureKind::PASSTHROUGH; + case Instruction::ICmp: { + unsigned Idx = U.getOperandNo(); + unsigned OtherIdx = 1 - Idx; + if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { + // Don't count comparisons of a no-alias return value against null as + // captures. 
This allows us to ignore comparisons of malloc results + // with null, for example. + if (CPN->getType()->getAddressSpace() == 0) + if (isNoAliasCall(U.get()->stripPointerCasts())) + return UseCaptureKind::NO_CAPTURE; + if (!I->getFunction()->nullPointerIsDefined()) { + auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); + // Comparing a dereferenceable_or_null pointer against null cannot + // lead to pointer escapes, because if it is not null it must be a + // valid (in-bounds) pointer. + const DataLayout &DL = I->getModule()->getDataLayout(); + if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) + return UseCaptureKind::NO_CAPTURE; + } + } + // Comparison against value stored in global variable. Given the pointer + // does not escape, its value cannot be guessed and stored separately in a + // global variable. + auto *LI = dyn_cast(I->getOperand(OtherIdx)); + if (LI && isa(LI->getPointerOperand())) + return UseCaptureKind::NO_CAPTURE; + // Otherwise, be conservative. There are crazy ways to capture pointers + // using comparisons. + return UseCaptureKind::MAY_CAPTURE; + } + default: + // Something else - be conservative and say it is captured. + return UseCaptureKind::MAY_CAPTURE; + } +} + void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, unsigned MaxUsesToExplore) { assert(V->getType()->isPointerTy() && "Capture is for pointers only!"); @@ -293,11 +445,10 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, SmallSet Visited; auto AddUses = [&](const Value *V) { - unsigned Count = 0; for (const Use &U : V->uses()) { // If there are lots of uses, conservatively say that the value // is captured to avoid taking too much compile time. - if (Count++ >= MaxUsesToExplore) { + if (Visited.size() >= MaxUsesToExplore) { Tracker->tooManyUses(); return false; } @@ -312,144 +463,22 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, if (!AddUses(V)) return; + auto IsDereferenceableOrNull = [Tracker](Value *V, const DataLayout &DL) { + return Tracker->isDereferenceableOrNull(V, DL); + }; while (!Worklist.empty()) { const Use *U = Worklist.pop_back_val(); - Instruction *I = cast(U->getUser()); - - switch (I->getOpcode()) { - case Instruction::Call: - case Instruction::Invoke: { - auto *Call = cast(I); - // Not captured if the callee is readonly, doesn't return a copy through - // its return value and doesn't unwind (a readonly function can leak bits - // by throwing an exception or not depending on the input value). - if (Call->onlyReadsMemory() && Call->doesNotThrow() && - Call->getType()->isVoidTy()) - break; - - // The pointer is not captured if returned pointer is not captured. - // NOTE: CaptureTracking users should not assume that only functions - // marked with nocapture do not capture. This means that places like - // getUnderlyingObject in ValueTracking or DecomposeGEPExpression - // in BasicAA also need to know about this property. - if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, - true)) { - if (!AddUses(Call)) - return; - break; - } - - // Volatile operations effectively capture the memory location that they - // load and store to. - if (auto *MI = dyn_cast(Call)) - if (MI->isVolatile()) - if (Tracker->captured(U)) - return; - - // Calling a function pointer does not in itself cause the pointer to - // be captured. This is a subtle point considering that (for example) - // the callee might return its own address. 
It is analogous to saying - // that loading a value from a pointer does not cause the pointer to be - // captured, even though the loaded value might be the pointer itself - // (think of self-referential objects). - if (Call->isCallee(U)) - break; - - // Not captured if only passed via 'nocapture' arguments. - if (Call->isDataOperand(U) && - !Call->doesNotCapture(Call->getDataOperandNo(U))) { - // The parameter is not marked 'nocapture' - captured. - if (Tracker->captured(U)) - return; - } - break; - } - case Instruction::Load: - // Volatile loads make the address observable. - if (cast(I)->isVolatile()) - if (Tracker->captured(U)) - return; - break; - case Instruction::VAArg: - // "va-arg" from a pointer does not cause it to be captured. - break; - case Instruction::Store: - // Stored the pointer - conservatively assume it may be captured. - // Volatile stores make the address observable. - if (U->getOperandNo() == 0 || cast(I)->isVolatile()) - if (Tracker->captured(U)) - return; - break; - case Instruction::AtomicRMW: { - // atomicrmw conceptually includes both a load and store from - // the same location. - // As with a store, the location being accessed is not captured, - // but the value being stored is. - // Volatile stores make the address observable. - auto *ARMWI = cast(I); - if (U->getOperandNo() == 1 || ARMWI->isVolatile()) - if (Tracker->captured(U)) - return; - break; - } - case Instruction::AtomicCmpXchg: { - // cmpxchg conceptually includes both a load and store from - // the same location. - // As with a store, the location being accessed is not captured, - // but the value being stored is. - // Volatile stores make the address observable. - auto *ACXI = cast(I); - if (U->getOperandNo() == 1 || U->getOperandNo() == 2 || - ACXI->isVolatile()) - if (Tracker->captured(U)) - return; - break; - } - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::PHI: - case Instruction::Select: - case Instruction::AddrSpaceCast: - // The original value is not captured via this if the new value isn't. - if (!AddUses(I)) - return; - break; - case Instruction::ICmp: { - unsigned Idx = U->getOperandNo(); - unsigned OtherIdx = 1 - Idx; - if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { - // Don't count comparisons of a no-alias return value against null as - // captures. This allows us to ignore comparisons of malloc results - // with null, for example. - if (CPN->getType()->getAddressSpace() == 0) - if (isNoAliasCall(U->get()->stripPointerCasts())) - break; - if (!I->getFunction()->nullPointerIsDefined()) { - auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); - // Comparing a dereferenceable_or_null pointer against null cannot - // lead to pointer escapes, because if it is not null it must be a - // valid (in-bounds) pointer. - if (Tracker->isDereferenceableOrNull(O, I->getModule()->getDataLayout())) - break; - } - } - // Comparison against value stored in global variable. Given the pointer - // does not escape, its value cannot be guessed and stored separately in a - // global variable. - auto *LI = dyn_cast(I->getOperand(OtherIdx)); - if (LI && isa(LI->getPointerOperand())) - break; - // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. + switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + continue; + case UseCaptureKind::MAY_CAPTURE: if (Tracker->captured(U)) return; - break; - } - default: - // Something else - be conservative and say it is captured. 
- if (Tracker->captured(U)) + continue; + case UseCaptureKind::PASSTHROUGH: + if (!AddUses(U->getUser())) return; - break; + continue; } } diff --git a/llvm/lib/Analysis/CmpInstAnalysis.cpp b/llvm/lib/Analysis/CmpInstAnalysis.cpp index 5b951980a0aa..20b1df6e1495 100644 --- a/llvm/lib/Analysis/CmpInstAnalysis.cpp +++ b/llvm/lib/Analysis/CmpInstAnalysis.cpp @@ -18,9 +18,7 @@ using namespace llvm; -unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) { - ICmpInst::Predicate Pred = InvertPred ? ICI->getInversePredicate() - : ICI->getPredicate(); +unsigned llvm::getICmpCode(CmpInst::Predicate Pred) { switch (Pred) { // False -> 0 case ICmpInst::ICMP_UGT: return 1; // 001 @@ -63,6 +61,18 @@ bool llvm::predicatesFoldable(ICmpInst::Predicate P1, ICmpInst::Predicate P2) { (CmpInst::isSigned(P2) && ICmpInst::isEquality(P1)); } +Constant *llvm::getPredForFCmpCode(unsigned Code, Type *OpTy, + CmpInst::Predicate &Pred) { + Pred = static_cast(Code); + assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE && + "Unexpected FCmp predicate!"); + if (Pred == FCmpInst::FCMP_FALSE) + return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 0); + if (Pred == FCmpInst::FCMP_TRUE) + return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 1); + return nullptr; +} + bool llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThruTrunc) { diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp index 27c52506352f..6d9084215dee 100644 --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -15,7 +15,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstructionCost.h" @@ -118,13 +117,6 @@ void CodeMetrics::analyzeBasicBlock( const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, bool PrepareForLTO) { ++NumBlocks; - // Use a proxy variable for NumInsts of type InstructionCost, so that it can - // use InstructionCost's arithmetic properties such as saturation when this - // feature is added to InstructionCost. - // When storing the value back to NumInsts, we can assume all costs are Valid - // because the IR should not contain any nodes that cannot be costed. If that - // happens the cost-model is broken. - InstructionCost NumInstsProxy = NumInsts; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { // Skip ephemeral values. @@ -184,8 +176,7 @@ void CodeMetrics::analyzeBasicBlock( if (InvI->cannotDuplicate()) notDuplicatable = true; - NumInstsProxy += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); - NumInsts = *NumInstsProxy.getValue(); + NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); } if (isa(BB->getTerminator())) @@ -205,6 +196,6 @@ void CodeMetrics::analyzeBasicBlock( notDuplicatable |= isa(BB->getTerminator()); // Remember NumInsts for this BB. 
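Stepping back to the CmpInstAnalysis change above: getICmpCode now takes the predicate directly, and the 3-bit code it returns (bit 2 = "less", bit 1 = "equal", bit 0 = "greater") is what lets two comparisons of the same operands be merged with plain bitwise operators. A small sketch of that property:

#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/IR/InstrTypes.h"

// (X u< Y) | (X == Y) is (X u<= Y): the codes compose by bitwise OR.
static bool icmpCodesComposeByOr() {
  using namespace llvm;
  unsigned ULT = getICmpCode(CmpInst::ICMP_ULT); // 100
  unsigned EQ = getICmpCode(CmpInst::ICMP_EQ);   // 010
  unsigned ULE = getICmpCode(CmpInst::ICMP_ULE); // 110
  return (ULT | EQ) == ULE; // true by construction of the encoding
}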
- InstructionCost NumInstsThisBB = NumInstsProxy - NumInstsBeforeThisBB; - NumBBInsts[BB] = *NumInstsThisBB.getValue(); + InstructionCost NumInstsThisBB = NumInsts - NumInstsBeforeThisBB; + NumBBInsts[BB] = NumInstsThisBB; } diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 7cf69f613c66..a81041845052 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -57,7 +57,6 @@ #include #include #include -#include #include using namespace llvm; @@ -92,7 +91,7 @@ static Constant *foldConstVectorToAPInt(APInt &Result, Type *DestTy, return ConstantExpr::getBitCast(C, DestTy); Result <<= BitShift; - Result |= ElementCI->getValue().zextOrSelf(Result.getBitWidth()); + Result |= ElementCI->getValue().zext(Result.getBitWidth()); } return nullptr; @@ -589,14 +588,17 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, if (BytesLoaded > 32 || BytesLoaded == 0) return nullptr; - int64_t InitializerSize = DL.getTypeAllocSize(C->getType()).getFixedSize(); - // If we're not accessing anything in this constant, the result is undefined. if (Offset <= -1 * static_cast(BytesLoaded)) return UndefValue::get(IntType); + // TODO: We should be able to support scalable types. + TypeSize InitializerSize = DL.getTypeAllocSize(C->getType()); + if (InitializerSize.isScalable()) + return nullptr; + // If we're not accessing anything in this constant, the result is undefined. - if (Offset >= InitializerSize) + if (Offset >= (int64_t)InitializerSize.getFixedValue()) return UndefValue::get(IntType); unsigned char RawBytes[32] = {0}; @@ -631,6 +633,39 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, return ConstantInt::get(IntType->getContext(), ResultVal); } +} // anonymous namespace + +// If GV is a constant with an initializer read its representation starting +// at Offset and return it as a constant array of unsigned char. Otherwise +// return null. +Constant *llvm::ReadByteArrayFromGlobal(const GlobalVariable *GV, + uint64_t Offset) { + if (!GV->isConstant() || !GV->hasDefinitiveInitializer()) + return nullptr; + + const DataLayout &DL = GV->getParent()->getDataLayout(); + Constant *Init = const_cast(GV->getInitializer()); + TypeSize InitSize = DL.getTypeAllocSize(Init->getType()); + if (InitSize < Offset) + return nullptr; + + uint64_t NBytes = InitSize - Offset; + if (NBytes > UINT16_MAX) + // Bail for large initializers in excess of 64K to avoid allocating + // too much memory. + // Offset is assumed to be less than or equal than InitSize (this + // is enforced in ReadDataFromGlobal). + return nullptr; + + SmallVector RawBytes(static_cast(NBytes)); + unsigned char *CurPtr = RawBytes.data(); + + if (!ReadDataFromGlobal(Init, Offset, CurPtr, NBytes, DL)) + return nullptr; + + return ConstantDataArray::get(GV->getContext(), RawBytes); +} + /// If this Offset points exactly to the start of an aggregate element, return /// that element, otherwise return nullptr. 
Constant *getConstantAtOffset(Constant *Base, APInt Offset, @@ -659,8 +694,6 @@ Constant *getConstantAtOffset(Constant *Base, APInt Offset, return C; } -} // end anonymous namespace - Constant *llvm::ConstantFoldLoadFromConst(Constant *C, Type *Ty, const APInt &Offset, const DataLayout &DL) { @@ -864,21 +897,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, Type *IntIdxTy = DL.getIndexType(Ptr->getType()); - // If this is "gep i8* Ptr, (sub 0, V)", fold this as: - // "inttoptr (sub (ptrtoint Ptr), V)" - if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) { - auto *CE = dyn_cast(Ops[1]); - assert((!CE || CE->getType() == IntIdxTy) && - "CastGEPIndices didn't canonicalize index types!"); - if (CE && CE->getOpcode() == Instruction::Sub && - CE->getOperand(0)->isNullValue()) { - Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType()); - Res = ConstantExpr::getSub(Res, CE->getOperand(1)); - Res = ConstantExpr::getIntToPtr(Res, ResTy); - return ConstantFoldConstant(Res, DL, TLI); - } - } - for (unsigned i = 1, e = Ops.size(); i != e; ++i) if (!isa(Ops[i])) return nullptr; @@ -1012,8 +1030,24 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, if (Instruction::isUnaryOp(Opcode)) return ConstantFoldUnaryOpOperand(Opcode, Ops[0], DL); - if (Instruction::isBinaryOp(Opcode)) + if (Instruction::isBinaryOp(Opcode)) { + switch (Opcode) { + default: + break; + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Handle floating point instructions separately to account for denormals + // TODO: If a constant expression is being folded rather than an + // instruction, denormals will not be flushed/treated as zero + if (const auto *I = dyn_cast(InstOrCE)) { + return ConstantFoldFPInstOperands(Opcode, Ops[0], Ops[1], DL, I); + } + } return ConstantFoldBinaryOpOperands(Opcode, Ops[0], Ops[1], DL); + } if (Instruction::isCast(Opcode)) return ConstantFoldCastOperand(Opcode, Ops[0], DestTy, DL); @@ -1027,13 +1061,21 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, GEP->getInRangeIndex()); } - if (auto *CE = dyn_cast(InstOrCE)) + if (auto *CE = dyn_cast(InstOrCE)) { + if (CE->isCompare()) + return ConstantFoldCompareInstOperands(CE->getPredicate(), Ops[0], Ops[1], + DL, TLI); return CE->getWithOperands(Ops); + } switch (Opcode) { default: return nullptr; case Instruction::ICmp: - case Instruction::FCmp: llvm_unreachable("Invalid for compares"); + case Instruction::FCmp: { + auto *C = cast(InstOrCE); + return ConstantFoldCompareInstOperands(C->getPredicate(), Ops[0], Ops[1], + DL, TLI, C); + } case Instruction::Freeze: return isGuaranteedNotToBeUndefOrPoison(Ops[0]) ? 
Ops[0] : nullptr; case Instruction::Call: @@ -1048,13 +1090,22 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, case Instruction::ExtractElement: return ConstantExpr::getExtractElement(Ops[0], Ops[1]); case Instruction::ExtractValue: - return ConstantExpr::getExtractValue( + return ConstantFoldExtractValueInstruction( Ops[0], cast(InstOrCE)->getIndices()); case Instruction::InsertElement: return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]); + case Instruction::InsertValue: + return ConstantFoldInsertValueInstruction( + Ops[0], Ops[1], cast(InstOrCE)->getIndices()); case Instruction::ShuffleVector: return ConstantExpr::getShuffleVector( Ops[0], Ops[1], cast(InstOrCE)->getShuffleMask()); + case Instruction::Load: { + const auto *LI = dyn_cast(InstOrCE); + if (LI->isVolatile()) + return nullptr; + return ConstantFoldLoadFromConstPtr(Ops[0], LI->getType(), DL); + } } } @@ -1091,13 +1142,8 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL, Ops.push_back(NewC); } - if (auto *CE = dyn_cast(C)) { - if (CE->isCompare()) - return ConstantFoldCompareInstOperands(CE->getPredicate(), Ops[0], Ops[1], - DL, TLI); - + if (auto *CE = dyn_cast(C)) return ConstantFoldInstOperandsImpl(CE, CE->getOpcode(), Ops, DL, TLI); - } assert(isa(C)); return ConstantVector::get(Ops); @@ -1150,22 +1196,6 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL, Ops.push_back(Op); } - if (const auto *CI = dyn_cast(I)) - return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], - DL, TLI); - - if (const auto *LI = dyn_cast(I)) { - if (LI->isVolatile()) - return nullptr; - return ConstantFoldLoadFromConstPtr(Ops[0], LI->getType(), DL); - } - - if (auto *IVI = dyn_cast(I)) - return ConstantExpr::getInsertValue(Ops[0], Ops[1], IVI->getIndices()); - - if (auto *EVI = dyn_cast(I)) - return ConstantExpr::getExtractValue(Ops[0], EVI->getIndices()); - return ConstantFoldInstOperands(I, Ops, DL, TLI); } @@ -1182,10 +1212,9 @@ Constant *llvm::ConstantFoldInstOperands(Instruction *I, return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI); } -Constant *llvm::ConstantFoldCompareInstOperands(unsigned IntPredicate, - Constant *Ops0, Constant *Ops1, - const DataLayout &DL, - const TargetLibraryInfo *TLI) { +Constant *llvm::ConstantFoldCompareInstOperands( + unsigned IntPredicate, Constant *Ops0, Constant *Ops1, const DataLayout &DL, + const TargetLibraryInfo *TLI, const Instruction *I) { CmpInst::Predicate Predicate = (CmpInst::Predicate)IntPredicate; // fold: icmp (inttoptr x), null -> icmp x, 0 // fold: icmp null, (inttoptr x) -> icmp 0, x @@ -1287,6 +1316,11 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned IntPredicate, return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI); } + // Flush any denormal constant float input according to denormal handling + // mode. 
+ Ops0 = FlushFPConstant(Ops0, I, /* IsOutput */ false); + Ops1 = FlushFPConstant(Ops1, I, /* IsOutput */ false); + return ConstantExpr::getCompare(Predicate, Ops0, Ops1); } @@ -1308,6 +1342,63 @@ Constant *llvm::ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, return ConstantExpr::get(Opcode, LHS, RHS); } +Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *I, + bool IsOutput) { + if (!I || !I->getParent() || !I->getFunction()) + return Operand; + + ConstantFP *CFP = dyn_cast(Operand); + if (!CFP) + return Operand; + + const APFloat &APF = CFP->getValueAPF(); + Type *Ty = CFP->getType(); + DenormalMode DenormMode = + I->getFunction()->getDenormalMode(Ty->getFltSemantics()); + DenormalMode::DenormalModeKind Mode = + IsOutput ? DenormMode.Output : DenormMode.Input; + switch (Mode) { + default: + llvm_unreachable("unknown denormal mode"); + return Operand; + case DenormalMode::IEEE: + return Operand; + case DenormalMode::PreserveSign: + if (APF.isDenormal()) { + return ConstantFP::get( + Ty->getContext(), + APFloat::getZero(Ty->getFltSemantics(), APF.isNegative())); + } + return Operand; + case DenormalMode::PositiveZero: + if (APF.isDenormal()) { + return ConstantFP::get(Ty->getContext(), + APFloat::getZero(Ty->getFltSemantics(), false)); + } + return Operand; + } + return Operand; +} + +Constant *llvm::ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, + Constant *RHS, const DataLayout &DL, + const Instruction *I) { + if (Instruction::isBinaryOp(Opcode)) { + // Flush denormal inputs if needed. + Constant *Op0 = FlushFPConstant(LHS, I, /* IsOutput */ false); + Constant *Op1 = FlushFPConstant(RHS, I, /* IsOutput */ false); + + // Calculate constant result. + Constant *C = ConstantFoldBinaryOpOperands(Opcode, Op0, Op1, DL); + + // Flush denormal output if needed. + return FlushFPConstant(C, I, /* IsOutput */ true); + } + // If instruction lacks a parent/function and the denormal mode cannot be + // determined, use the default (IEEE). + return ConstantFoldBinaryOpOperands(Opcode, LHS, RHS, DL); +} + Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL) { assert(Instruction::isCast(Opcode)); @@ -1334,6 +1425,19 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, DL, BaseOffset, /*AllowNonInbounds=*/true)); if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); + } else { + // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + if (GEP->getNumIndices() == 1 && + GEP->getSourceElementType()->isIntegerTy(8)) { + auto *Ptr = cast(GEP->getPointerOperand()); + auto *Sub = dyn_cast(GEP->getOperand(1)); + Type *IntIdxTy = DL.getIndexType(Ptr->getType()); + if (Sub && Sub->getType() == IntIdxTy && + Sub->getOpcode() == Instruction::Sub && + Sub->getOperand(0)->isNullValue()) + FoldedValue = ConstantExpr::getSub( + ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + } } } if (FoldedValue) { @@ -1386,6 +1490,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { if (Call->isNoBuiltin()) return false; + if (Call->getFunctionType() != F->getFunctionType()) + return false; switch (F->getIntrinsicID()) { // Operations that do not operate floating-point numbers and do not depend on // FP environment can be folded even in strictfp functions. 
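The FlushFPConstant/ConstantFoldFPInstOperands additions above mean constant folding now respects the function's denormal-fp-math mode. What "preserve-sign" flushing does to a single value, as a minimal APFloat sketch:

#include "llvm/ADT/APFloat.h"

// A denormal input is replaced by a zero carrying the original sign;
// normal values, zeros, infinities and NaNs pass through untouched.
static llvm::APFloat flushPreserveSign(const llvm::APFloat &V) {
  if (V.isDenormal())
    return llvm::APFloat::getZero(V.getSemantics(), V.isNegative());
  return V;
}

Under "positive-zero" mode the sign is dropped instead, which is the other branch FlushFPConstant handles.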
@@ -1527,6 +1633,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::experimental_constrained_trunc: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_fcmp: + case Intrinsic::experimental_constrained_fcmps: return true; default: return false; @@ -1798,12 +1906,12 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI, // If evaluation raised FP exception, the result can depend on rounding // mode. If the latter is unknown, folding is not possible. - if (!ORM || *ORM == RoundingMode::Dynamic) + if (ORM && *ORM == RoundingMode::Dynamic) return false; // If FP exceptions are ignored, fold the call, even if such exception is // raised. - if (!EB || *EB != fp::ExceptionBehavior::ebStrict) + if (EB && *EB != fp::ExceptionBehavior::ebStrict) return true; // Leave the calculation for runtime so that exception flags be correctly set @@ -1979,7 +2087,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::experimental_constrained_rint: { auto CI = cast(Call); RM = CI->getRoundingMode(); - if (!RM || RM.getValue() == RoundingMode::Dynamic) + if (!RM || *RM == RoundingMode::Dynamic) return nullptr; break; } @@ -2301,6 +2409,24 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return nullptr; } +static Constant *evaluateCompare(const APFloat &Op1, const APFloat &Op2, + const ConstrainedFPIntrinsic *Call) { + APFloat::opStatus St = APFloat::opOK; + auto *FCmp = cast(Call); + FCmpInst::Predicate Cond = FCmp->getPredicate(); + if (FCmp->isSignaling()) { + if (Op1.isNaN() || Op2.isNaN()) + St = APFloat::opInvalidOp; + } else { + if (Op1.isSignaling() || Op2.isSignaling()) + St = APFloat::opInvalidOp; + } + bool Result = FCmpInst::compare(Op1, Op2, Cond); + if (mayFoldConstrained(const_cast(FCmp), St)) + return ConstantInt::get(Call->getType()->getScalarType(), Result); + return nullptr; +} + static Constant *ConstantFoldScalarCall2(StringRef Name, Intrinsic::ID IntrinsicID, Type *Ty, @@ -2329,8 +2455,6 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, } if (const auto *Op1 = dyn_cast(Operands[0])) { - if (!Ty->isFloatingPointTy()) - return nullptr; const APFloat &Op1V = Op1->getValueAPF(); if (const auto *Op2 = dyn_cast(Operands[1])) { @@ -2360,6 +2484,9 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, case Intrinsic::experimental_constrained_frem: St = Res.mod(Op2V); break; + case Intrinsic::experimental_constrained_fcmp: + case Intrinsic::experimental_constrained_fcmps: + return evaluateCompare(Op1V, Op2V, ConstrIntr); } if (mayFoldConstrained(const_cast(ConstrIntr), St)) @@ -2484,6 +2611,11 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, case Intrinsic::smin: case Intrinsic::umax: case Intrinsic::umin: + // This is the same as for binary ops - poison propagates. + // TODO: Poison handling should be consolidated. + if (isa(Operands[0]) || isa(Operands[1])) + return PoisonValue::get(Ty); + if (!C0 && !C1) return UndefValue::get(Ty); if (!C0 || !C1) @@ -2550,6 +2682,11 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, } case Intrinsic::uadd_sat: case Intrinsic::sadd_sat: + // This is the same as for binary ops - poison propagates. + // TODO: Poison handling should be consolidated. 
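The poison checks added in this hunk and the two below encode one rule: for these intrinsics, as for ordinary binary operators, a poison operand makes the whole result poison, and this is tested before the undef handling. A sketch of the guard in isolation, assuming constant operands C0/C1 of type Ty:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

// Returns the folded poison result, or null to fall through to real folding.
static llvm::Constant *foldIfPoison(llvm::Constant *C0, llvm::Constant *C1,
                                    llvm::Type *Ty) {
  if (llvm::isa<llvm::PoisonValue>(C0) || llvm::isa<llvm::PoisonValue>(C1))
    return llvm::PoisonValue::get(Ty); // poison propagates
  return nullptr;
}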
+ if (isa(Operands[0]) || isa(Operands[1])) + return PoisonValue::get(Ty); + if (!C0 && !C1) return UndefValue::get(Ty); if (!C0 || !C1) @@ -2560,6 +2697,11 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, return ConstantInt::get(Ty, C0->sadd_sat(*C1)); case Intrinsic::usub_sat: case Intrinsic::ssub_sat: + // This is the same as for binary ops - poison propagates. + // TODO: Poison handling should be consolidated. + if (isa(Operands[0]) || isa(Operands[1])) + return PoisonValue::get(Ty); + if (!C0 && !C1) return UndefValue::get(Ty); if (!C0 || !C1) @@ -2840,11 +2982,11 @@ static Constant *ConstantFoldScalarCall3(StringRef Name, unsigned Width = C0->getBitWidth(); assert(Scale < Width && "Illegal scale."); unsigned ExtendedWidth = Width * 2; - APInt Product = (C0->sextOrSelf(ExtendedWidth) * - C1->sextOrSelf(ExtendedWidth)).ashr(Scale); + APInt Product = + (C0->sext(ExtendedWidth) * C1->sext(ExtendedWidth)).ashr(Scale); if (IntrinsicID == Intrinsic::smul_fix_sat) { - APInt Max = APInt::getSignedMaxValue(Width).sextOrSelf(ExtendedWidth); - APInt Min = APInt::getSignedMinValue(Width).sextOrSelf(ExtendedWidth); + APInt Max = APInt::getSignedMaxValue(Width).sext(ExtendedWidth); + APInt Min = APInt::getSignedMinValue(Width).sext(ExtendedWidth); Product = APIntOps::smin(Product, Max); Product = APIntOps::smax(Product, Min); } @@ -2998,7 +3140,7 @@ static Constant *ConstantFoldFixedVectorCall( // Gather a column of constants. for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) { // Some intrinsics use a scalar type for certain arguments. - if (hasVectorInstrinsicScalarOpd(IntrinsicID, J)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, J)) { Lane[J] = Operands[J]; continue; } diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp index 773f71ada0ee..dc774728ab3d 100644 --- a/llvm/lib/Analysis/ConstraintSystem.cpp +++ b/llvm/lib/Analysis/ConstraintSystem.cpp @@ -12,7 +12,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" -#include #include using namespace llvm; diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp index 326bacad01fe..52e424ae324b 100644 --- a/llvm/lib/Analysis/CostModel.cpp +++ b/llvm/lib/Analysis/CostModel.cpp @@ -17,7 +17,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CostModel.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" @@ -25,7 +24,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -119,7 +117,7 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const { PreservedAnalyses CostModelPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); - OS << "Cost Model for function '" << F.getName() << "'\n"; + OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n"; for (BasicBlock &B : F) { for (Instruction &Inst : B) { // TODO: Use a pass parameter instead of cl::opt CostKind to determine diff --git a/llvm/lib/Analysis/CycleAnalysis.cpp b/llvm/lib/Analysis/CycleAnalysis.cpp index 09c7ee67e05c..17998123fce7 100644 --- a/llvm/lib/Analysis/CycleAnalysis.cpp +++ b/llvm/lib/Analysis/CycleAnalysis.cpp @@ -8,11 +8,15 @@ #include "llvm/Analysis/CycleAnalysis.h" #include "llvm/ADT/GenericCycleImpl.h" 
-#include "llvm/IR/CFG.h" +#include "llvm/IR/CFG.h" // for successors found by ADL in GenericCycleImpl.h #include "llvm/InitializePasses.h" using namespace llvm; +namespace llvm { +class Module; +} + template class llvm::GenericCycleInfo; template class llvm::GenericCycle; diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp index 7e1357959a3f..998c888dd2d9 100644 --- a/llvm/lib/Analysis/DDG.cpp +++ b/llvm/lib/Analysis/DDG.cpp @@ -17,13 +17,12 @@ using namespace llvm; static cl::opt SimplifyDDG( - "ddg-simplify", cl::init(true), cl::Hidden, cl::ZeroOrMore, + "ddg-simplify", cl::init(true), cl::Hidden, cl::desc( "Simplify DDG by merging nodes that have less interesting edges.")); -static cl::opt - CreatePiBlocks("ddg-pi-blocks", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Create pi-block nodes.")); +static cl::opt CreatePiBlocks("ddg-pi-blocks", cl::init(true), cl::Hidden, + cl::desc("Create pi-block nodes.")); #define DEBUG_TYPE "ddg" @@ -34,7 +33,7 @@ template class llvm::DirectedGraph; //===--------------------------------------------------------------------===// // DDGNode implementation //===--------------------------------------------------------------------===// -DDGNode::~DDGNode() {} +DDGNode::~DDGNode() = default; bool DDGNode::collectInstructions( llvm::function_ref const &Pred, diff --git a/llvm/lib/Analysis/DDGPrinter.cpp b/llvm/lib/Analysis/DDGPrinter.cpp index 0d5a936723ce..6b5acd204ec7 100644 --- a/llvm/lib/Analysis/DDGPrinter.cpp +++ b/llvm/lib/Analysis/DDGPrinter.cpp @@ -18,8 +18,8 @@ using namespace llvm; -static cl::opt DotOnly("dot-ddg-only", cl::init(false), cl::Hidden, - cl::ZeroOrMore, cl::desc("simple ddg dot graph")); +static cl::opt DotOnly("dot-ddg-only", cl::Hidden, + cl::desc("simple ddg dot graph")); static cl::opt DDGDotFilenamePrefix( "dot-ddg-filename-prefix", cl::init("ddg"), cl::Hidden, cl::desc("The prefix used for the DDG dot file names.")); diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 670532c6d9a8..c36e1d922915 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -24,9 +24,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -523,6 +521,44 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, return !Subscripts.empty(); } +bool llvm::tryDelinearizeFixedSizeImpl( + ScalarEvolution *SE, Instruction *Inst, const SCEV *AccessFn, + SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes) { + Value *SrcPtr = getLoadStorePointerOperand(Inst); + + // Check the simple case where the array dimensions are fixed size. + auto *SrcGEP = dyn_cast(SrcPtr); + if (!SrcGEP) + return false; + + getIndexExpressionsFromGEP(*SE, SrcGEP, Subscripts, Sizes); + + // Check that the two size arrays are non-empty and equal in length and + // value. + // TODO: it would be better to let the caller to clear Subscripts, similar + // to how we handle Sizes. + if (Sizes.empty() || Subscripts.size() <= 1) { + Subscripts.clear(); + return false; + } + + // Check that for identical base pointers we do not miss index offsets + // that have been added before this GEP is applied. 
+ Value *SrcBasePtr = SrcGEP->getOperand(0)->stripPointerCasts(); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(AccessFn)); + if (!SrcBase || SrcBasePtr != SrcBase->getValue()) { + Subscripts.clear(); + return false; + } + + assert(Subscripts.size() == Sizes.size() + 1 && + "Expected equal number of entries in the list of size and " + "subscript."); + + return true; +} + namespace { class Delinearization : public FunctionPass { diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index 117b12fc0701..e01ed48be376 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -21,19 +21,13 @@ #include "llvm/Analysis/DemandedBits.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f827f74d5367..3d2d84ecadb4 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -50,7 +50,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/DependenceAnalysis.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Delinearization.h" @@ -58,10 +57,8 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/llvm-config.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -109,11 +106,10 @@ STATISTIC(BanerjeeIndependence, "Banerjee independence"); STATISTIC(BanerjeeSuccesses, "Banerjee successes"); static cl::opt - Delinearize("da-delinearize", cl::init(true), cl::Hidden, cl::ZeroOrMore, + Delinearize("da-delinearize", cl::init(true), cl::Hidden, cl::desc("Try to delinearize array references.")); static cl::opt DisableDelinearizationChecks( - "da-disable-delinearization-checks", cl::init(false), cl::Hidden, - cl::ZeroOrMore, + "da-disable-delinearization-checks", cl::Hidden, cl::desc( "Disable checks that try to statically verify validity of " "delinearized subscripts. 
Enabling this option may result in incorrect " @@ -121,7 +117,7 @@ static cl::opt<bool> DisableDelinearizationChecks( "dimension to underflow or overflow into another dimension.")); static cl::opt<unsigned> MIVMaxLevelThreshold( - "da-miv-max-level-threshold", cl::init(7), cl::Hidden, cl::ZeroOrMore, + "da-miv-max-level-threshold", cl::init(7), cl::Hidden, cl::desc("Maximum depth allowed for the recursive algorithm used to " "explore MIV direction vectors.")); @@ -787,6 +783,8 @@ unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const { unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { unsigned D = DstLoop->getLoopDepth(); if (D > CommonLevels) + // This tries to make sure that we assign unique numbers to src and dst when + // the memory accesses reside in different loops that have the same depth. return D - CommonLevels + SrcLevels; else return D; } @@ -796,10 +794,16 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { // Returns true if Expression is loop invariant in LoopNest. bool DependenceInfo::isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const { + // Unlike ScalarEvolution::isLoopInvariant() we consider an access outside of + // any loop as invariant, because we only consider expression evaluation at a + // specific position (where the array access takes place), and not across the + // entire function. if (!LoopNest) return true; - return SE->isLoopInvariant(Expression, LoopNest) && - isLoopInvariant(Expression, LoopNest->getParentLoop()); + + // If the expression is invariant in the outermost loop of the loop nest, it + // is invariant anywhere in the loop nest. + return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop()); } @@ -890,13 +894,25 @@ void DependenceInfo::removeMatchingExtensions(Subscript *Pair) { } } -// Examine the scev and return true iff it's linear. +// Examine the scev and return true iff it's affine. // Collect any loops mentioned in the set of "Loops". bool DependenceInfo::checkSubscript(const SCEV *Expr, const Loop *LoopNest, SmallBitVector &Loops, bool IsSrc) { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) return isLoopInvariant(Expr, LoopNest); + + // The AddRec must depend on one of the containing loops. Otherwise, + // mapSrcLoop and mapDstLoop return indices outside the intended range. This + // can happen when a subscript in one loop references an IV from a sibling + // loop that could not be replaced with a concrete exit value by + // getSCEVAtScope. + const Loop *L = LoopNest; + while (L && AddRec->getLoop() != L) + L = L->getParentLoop(); + if (!L) + return false; + const SCEV *Start = AddRec->getStart(); const SCEV *Step = AddRec->getStepRecurrence(*SE); const SCEV *UB = SE->getBackedgeTakenCount(AddRec->getLoop()); @@ -3318,59 +3334,45 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, return true; } +/// Try to delinearize \p SrcAccessFn and \p DstAccessFn if the underlying +/// arrays accessed are fixed-size arrays. Return true if delinearization was +/// successful.
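+/// Illustrative sketch (ours, not from this patch): for two accesses into a
+/// C array declared as `int A[100][50]`, say A[i][j] and A[i][j+1], a
+/// successful delinearization yields SrcSubscripts = {i, j} and
+/// DstSubscripts = {i, j+1}, with SrcSizes == DstSizes == {50}; each
+/// subscript list carries one more entry than its size list because the
+/// outermost dimension is unbounded.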
bool DependenceInfo::tryDelinearizeFixedSize( Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, SmallVectorImpl &DstSubscripts) { - - Value *SrcPtr = getLoadStorePointerOperand(Src); - Value *DstPtr = getLoadStorePointerOperand(Dst); - const SCEVUnknown *SrcBase = - dyn_cast(SE->getPointerBase(SrcAccessFn)); - const SCEVUnknown *DstBase = - dyn_cast(SE->getPointerBase(DstAccessFn)); - assert(SrcBase && DstBase && SrcBase == DstBase && - "expected src and dst scev unknowns to be equal"); - - // Check the simple case where the array dimensions are fixed size. - auto *SrcGEP = dyn_cast(SrcPtr); - auto *DstGEP = dyn_cast(DstPtr); - if (!SrcGEP || !DstGEP) + LLVM_DEBUG({ + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + assert(SrcBase && DstBase && SrcBase == DstBase && + "expected src and dst scev unknowns to be equal"); + }); + + SmallVector SrcSizes; + SmallVector DstSizes; + if (!tryDelinearizeFixedSizeImpl(SE, Src, SrcAccessFn, SrcSubscripts, + SrcSizes) || + !tryDelinearizeFixedSizeImpl(SE, Dst, DstAccessFn, DstSubscripts, + DstSizes)) return false; - SmallVector SrcSizes, DstSizes; - getIndexExpressionsFromGEP(*SE, SrcGEP, SrcSubscripts, SrcSizes); - getIndexExpressionsFromGEP(*SE, DstGEP, DstSubscripts, DstSizes); - // Check that the two size arrays are non-empty and equal in length and // value. - if (SrcSizes.empty() || SrcSubscripts.size() <= 1 || - SrcSizes.size() != DstSizes.size() || + if (SrcSizes.size() != DstSizes.size() || !std::equal(SrcSizes.begin(), SrcSizes.end(), DstSizes.begin())) { SrcSubscripts.clear(); DstSubscripts.clear(); return false; } - Value *SrcBasePtr = SrcGEP->getOperand(0); - Value *DstBasePtr = DstGEP->getOperand(0); - while (auto *PCast = dyn_cast(SrcBasePtr)) - SrcBasePtr = PCast->getOperand(0); - while (auto *PCast = dyn_cast(DstBasePtr)) - DstBasePtr = PCast->getOperand(0); - - // Check that for identical base pointers we do not miss index offsets - // that have been added before this GEP is applied. 
- if (SrcBasePtr != SrcBase->getValue() || DstBasePtr != DstBase->getValue()) { - SrcSubscripts.clear(); - DstSubscripts.clear(); - return false; - } - assert(SrcSubscripts.size() == DstSubscripts.size() && - SrcSubscripts.size() == SrcSizes.size() + 1 && - "Expected equal number of entries in the list of sizes and " - "subscripts."); + "Expected equal number of entries in the list of SrcSubscripts and " + "DstSubscripts."); + + Value *SrcPtr = getLoadStorePointerOperand(Src); + Value *DstPtr = getLoadStorePointerOperand(Dst); // In general we cannot safely assume that the subscripts recovered from GEPs // are in the range of values defined for their corresponding array @@ -3406,8 +3408,8 @@ bool DependenceInfo::tryDelinearizeFixedSize( } LLVM_DEBUG({ dbgs() << "Delinearized subscripts of fixed-size array\n" - << "SrcGEP:" << *SrcGEP << "\n" - << "DstGEP:" << *DstGEP << "\n"; + << "SrcGEP:" << *SrcPtr << "\n" + << "DstGEP:" << *DstPtr << "\n"; }); return true; } diff --git a/llvm/lib/Analysis/DependenceGraphBuilder.cpp b/llvm/lib/Analysis/DependenceGraphBuilder.cpp index 6b90db4bafe1..7ee2adf49ebb 100644 --- a/llvm/lib/Analysis/DependenceGraphBuilder.cpp +++ b/llvm/lib/Analysis/DependenceGraphBuilder.cpp @@ -12,6 +12,7 @@ #include "llvm/Analysis/DependenceGraphBuilder.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EnumeratedArray.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DDG.h" diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index 4a792fce51d1..79ea160afc22 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// #include "llvm/Config/config.h" -#include "llvm/Support/Casting.h" #if defined(LLVM_HAVE_TF_API) #include "llvm/ADT/BitVector.h" @@ -273,8 +272,8 @@ static const std::vector TrainingOnlyFeatures{ static const std::vector getInputFeatures() { std::vector InputSpecs; for (size_t I = 0; I < NumberOfFeatures; ++I) - InputSpecs.push_back( - TensorSpec::createSpec(TFFeedPrefix + FeatureNameMap[I], {1})); + InputSpecs.push_back(TensorSpec::createSpec( + TFFeedPrefix + FeatureMap[I].name(), FeatureMap[I].shape())); append_range(InputSpecs, TrainingOnlyFeatures); return InputSpecs; } @@ -290,8 +289,7 @@ TrainingLogger::TrainingLogger(StringRef LogFileName, std::vector FT; for (size_t I = 0; I < NumberOfFeatures; ++I) - FT.push_back( - {TensorSpec::createSpec(FeatureNameMap.at(I), {1}), None}); + FT.push_back({FeatureMap.at(I), None}); if (MUTR && MUTR->outputLoggedFeatureSpecs().size() > 1) append_range(FT, drop_begin(MUTR->outputLoggedFeatureSpecs())); diff --git a/llvm/lib/Analysis/DivergenceAnalysis.cpp b/llvm/lib/Analysis/DivergenceAnalysis.cpp index 39e80c2ad51c..1a4b09e0cac2 100644 --- a/llvm/lib/Analysis/DivergenceAnalysis.cpp +++ b/llvm/lib/Analysis/DivergenceAnalysis.cpp @@ -73,15 +73,14 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" 
-#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Analysis/DomPrinter.cpp b/llvm/lib/Analysis/DomPrinter.cpp index 6088de53028d..e9f5103e1276 100644 --- a/llvm/lib/Analysis/DomPrinter.cpp +++ b/llvm/lib/Analysis/DomPrinter.cpp @@ -24,74 +24,6 @@ using namespace llvm; -namespace llvm { -template<> -struct DOTGraphTraits : public DefaultDOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) - : DefaultDOTGraphTraits(isSimple) {} - - std::string getNodeLabel(DomTreeNode *Node, DomTreeNode *Graph) { - - BasicBlock *BB = Node->getBlock(); - - if (!BB) - return "Post dominance root node"; - - - if (isSimple()) - return DOTGraphTraits - ::getSimpleNodeLabel(BB, nullptr); - else - return DOTGraphTraits - ::getCompleteNodeLabel(BB, nullptr); - } -}; - -template<> -struct DOTGraphTraits : public DOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) - : DOTGraphTraits(isSimple) {} - - static std::string getGraphName(DominatorTree *DT) { - return "Dominator tree"; - } - - std::string getNodeLabel(DomTreeNode *Node, DominatorTree *G) { - return DOTGraphTraits::getNodeLabel(Node, G->getRootNode()); - } -}; - -template<> -struct DOTGraphTraits - : public DOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) - : DOTGraphTraits(isSimple) {} - - static std::string getGraphName(PostDominatorTree *DT) { - return "Post dominator tree"; - } - - std::string getNodeLabel(DomTreeNode *Node, PostDominatorTree *G ) { - return DOTGraphTraits::getNodeLabel(Node, G->getRootNode()); - } -}; -} - -PreservedAnalyses DomTreePrinterPass::run(Function &F, - FunctionAnalysisManager &AM) { - WriteDOTGraphToFile(F, &AM.getResult(F), "dom", false); - return PreservedAnalyses::all(); -} - -PreservedAnalyses DomTreeOnlyPrinterPass::run(Function &F, - FunctionAnalysisManager &AM) { - WriteDOTGraphToFile(F, &AM.getResult(F), "domonly", - true); - return PreservedAnalyses::all(); -} void DominatorTree::viewGraph(const Twine &Name, const Twine &Title) { #ifndef NDEBUG @@ -110,166 +42,167 @@ void DominatorTree::viewGraph() { } namespace { -struct DominatorTreeWrapperPassAnalysisGraphTraits { +struct LegacyDominatorTreeWrapperPassAnalysisGraphTraits { static DominatorTree *getGraph(DominatorTreeWrapperPass *DTWP) { return &DTWP->getDomTree(); } }; -struct DomViewer : public DOTGraphTraitsViewer< - DominatorTreeWrapperPass, false, DominatorTree *, - DominatorTreeWrapperPassAnalysisGraphTraits> { +struct DomViewerWrapperPass + : public DOTGraphTraitsViewerWrapperPass< + DominatorTreeWrapperPass, false, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - DomViewer() - : DOTGraphTraitsViewer( - "dom", ID) { - initializeDomViewerPass(*PassRegistry::getPassRegistry()); + DomViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass< + DominatorTreeWrapperPass, false, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits>("dom", ID) { + initializeDomViewerWrapperPassPass(*PassRegistry::getPassRegistry()); } }; -struct DomOnlyViewer : public DOTGraphTraitsViewer< - DominatorTreeWrapperPass, true, DominatorTree *, - DominatorTreeWrapperPassAnalysisGraphTraits> { +struct DomOnlyViewerWrapperPass + : public DOTGraphTraitsViewerWrapperPass< + DominatorTreeWrapperPass, true, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - DomOnlyViewer() - : DOTGraphTraitsViewer( - "domonly", ID) { - initializeDomOnlyViewerPass(*PassRegistry::getPassRegistry()); 
+ DomOnlyViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass< + DominatorTreeWrapperPass, true, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits>("domonly", ID) { + initializeDomOnlyViewerWrapperPassPass(*PassRegistry::getPassRegistry()); } }; -struct PostDominatorTreeWrapperPassAnalysisGraphTraits { +struct LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits { static PostDominatorTree *getGraph(PostDominatorTreeWrapperPass *PDTWP) { return &PDTWP->getPostDomTree(); } }; -struct PostDomViewer : public DOTGraphTraitsViewer< - PostDominatorTreeWrapperPass, false, - PostDominatorTree *, - PostDominatorTreeWrapperPassAnalysisGraphTraits> { +struct PostDomViewerWrapperPass + : public DOTGraphTraitsViewerWrapperPass< + PostDominatorTreeWrapperPass, false, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - PostDomViewer() : - DOTGraphTraitsViewer( - "postdom", ID){ - initializePostDomViewerPass(*PassRegistry::getPassRegistry()); - } + PostDomViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass< + PostDominatorTreeWrapperPass, false, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits>("postdom", + ID) { + initializePostDomViewerWrapperPassPass(*PassRegistry::getPassRegistry()); + } }; -struct PostDomOnlyViewer : public DOTGraphTraitsViewer< - PostDominatorTreeWrapperPass, true, - PostDominatorTree *, - PostDominatorTreeWrapperPassAnalysisGraphTraits> { +struct PostDomOnlyViewerWrapperPass + : public DOTGraphTraitsViewerWrapperPass< + PostDominatorTreeWrapperPass, true, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - PostDomOnlyViewer() : - DOTGraphTraitsViewer( - "postdomonly", ID){ - initializePostDomOnlyViewerPass(*PassRegistry::getPassRegistry()); - } + PostDomOnlyViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass< + PostDominatorTreeWrapperPass, true, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits>( + "postdomonly", ID) { + initializePostDomOnlyViewerWrapperPassPass( + *PassRegistry::getPassRegistry()); + } }; } // end anonymous namespace -char DomViewer::ID = 0; -INITIALIZE_PASS(DomViewer, "view-dom", +char DomViewerWrapperPass::ID = 0; +INITIALIZE_PASS(DomViewerWrapperPass, "view-dom", "View dominance tree of function", false, false) -char DomOnlyViewer::ID = 0; -INITIALIZE_PASS(DomOnlyViewer, "view-dom-only", +char DomOnlyViewerWrapperPass::ID = 0; +INITIALIZE_PASS(DomOnlyViewerWrapperPass, "view-dom-only", "View dominance tree of function (with no function bodies)", false, false) -char PostDomViewer::ID = 0; -INITIALIZE_PASS(PostDomViewer, "view-postdom", +char PostDomViewerWrapperPass::ID = 0; +INITIALIZE_PASS(PostDomViewerWrapperPass, "view-postdom", "View postdominance tree of function", false, false) -char PostDomOnlyViewer::ID = 0; -INITIALIZE_PASS(PostDomOnlyViewer, "view-postdom-only", +char PostDomOnlyViewerWrapperPass::ID = 0; +INITIALIZE_PASS(PostDomOnlyViewerWrapperPass, "view-postdom-only", "View postdominance tree of function " "(with no function bodies)", false, false) namespace { -struct DomPrinter : public DOTGraphTraitsPrinter< - DominatorTreeWrapperPass, false, DominatorTree *, - DominatorTreeWrapperPassAnalysisGraphTraits> { +struct DomPrinterWrapperPass + : public DOTGraphTraitsPrinterWrapperPass< + DominatorTreeWrapperPass, false, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - DomPrinter() - : DOTGraphTraitsPrinter( - "dom", ID) { - 
initializeDomPrinterPass(*PassRegistry::getPassRegistry()); + DomPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass< + DominatorTreeWrapperPass, false, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits>("dom", ID) { + initializeDomPrinterWrapperPassPass(*PassRegistry::getPassRegistry()); } }; -struct DomOnlyPrinter : public DOTGraphTraitsPrinter< - DominatorTreeWrapperPass, true, DominatorTree *, - DominatorTreeWrapperPassAnalysisGraphTraits> { +struct DomOnlyPrinterWrapperPass + : public DOTGraphTraitsPrinterWrapperPass< + DominatorTreeWrapperPass, true, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - DomOnlyPrinter() - : DOTGraphTraitsPrinter( - "domonly", ID) { - initializeDomOnlyPrinterPass(*PassRegistry::getPassRegistry()); + DomOnlyPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass< + DominatorTreeWrapperPass, true, DominatorTree *, + LegacyDominatorTreeWrapperPassAnalysisGraphTraits>("domonly", ID) { + initializeDomOnlyPrinterWrapperPassPass(*PassRegistry::getPassRegistry()); } }; -struct PostDomPrinter - : public DOTGraphTraitsPrinter< - PostDominatorTreeWrapperPass, false, - PostDominatorTree *, - PostDominatorTreeWrapperPassAnalysisGraphTraits> { +struct PostDomPrinterWrapperPass + : public DOTGraphTraitsPrinterWrapperPass< + PostDominatorTreeWrapperPass, false, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - PostDomPrinter() : - DOTGraphTraitsPrinter( - "postdom", ID) { - initializePostDomPrinterPass(*PassRegistry::getPassRegistry()); - } + PostDomPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass< + PostDominatorTreeWrapperPass, false, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits>("postdom", + ID) { + initializePostDomPrinterWrapperPassPass(*PassRegistry::getPassRegistry()); + } }; -struct PostDomOnlyPrinter - : public DOTGraphTraitsPrinter< - PostDominatorTreeWrapperPass, true, - PostDominatorTree *, - PostDominatorTreeWrapperPassAnalysisGraphTraits> { +struct PostDomOnlyPrinterWrapperPass + : public DOTGraphTraitsPrinterWrapperPass< + PostDominatorTreeWrapperPass, true, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits> { static char ID; - PostDomOnlyPrinter() : - DOTGraphTraitsPrinter( - "postdomonly", ID) { - initializePostDomOnlyPrinterPass(*PassRegistry::getPassRegistry()); - } + PostDomOnlyPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass< + PostDominatorTreeWrapperPass, true, PostDominatorTree *, + LegacyPostDominatorTreeWrapperPassAnalysisGraphTraits>( + "postdomonly", ID) { + initializePostDomOnlyPrinterWrapperPassPass( + *PassRegistry::getPassRegistry()); + } }; } // end anonymous namespace +char DomPrinterWrapperPass::ID = 0; +INITIALIZE_PASS(DomPrinterWrapperPass, "dot-dom", + "Print dominance tree of function to 'dot' file", false, false) - -char DomPrinter::ID = 0; -INITIALIZE_PASS(DomPrinter, "dot-dom", - "Print dominance tree of function to 'dot' file", - false, false) - -char DomOnlyPrinter::ID = 0; -INITIALIZE_PASS(DomOnlyPrinter, "dot-dom-only", +char DomOnlyPrinterWrapperPass::ID = 0; +INITIALIZE_PASS(DomOnlyPrinterWrapperPass, "dot-dom-only", "Print dominance tree of function to 'dot' file " "(with no function bodies)", false, false) -char PostDomPrinter::ID = 0; -INITIALIZE_PASS(PostDomPrinter, "dot-postdom", - "Print postdominance tree of function to 'dot' file", - false, false) +char PostDomPrinterWrapperPass::ID = 0; 
+INITIALIZE_PASS(PostDomPrinterWrapperPass, "dot-postdom", + "Print postdominance tree of function to 'dot' file", false, + false) -char PostDomOnlyPrinter::ID = 0; -INITIALIZE_PASS(PostDomOnlyPrinter, "dot-postdom-only", +char PostDomOnlyPrinterWrapperPass::ID = 0; +INITIALIZE_PASS(PostDomOnlyPrinterWrapperPass, "dot-postdom-only", "Print postdominance tree of function to 'dot' file " "(with no function bodies)", false, false) @@ -278,34 +211,34 @@ INITIALIZE_PASS(PostDomOnlyPrinter, "dot-postdom-only", // "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by // the link time optimization. -FunctionPass *llvm::createDomPrinterPass() { - return new DomPrinter(); +FunctionPass *llvm::createDomPrinterWrapperPassPass() { + return new DomPrinterWrapperPass(); } -FunctionPass *llvm::createDomOnlyPrinterPass() { - return new DomOnlyPrinter(); +FunctionPass *llvm::createDomOnlyPrinterWrapperPassPass() { + return new DomOnlyPrinterWrapperPass(); } -FunctionPass *llvm::createDomViewerPass() { - return new DomViewer(); +FunctionPass *llvm::createDomViewerWrapperPassPass() { + return new DomViewerWrapperPass(); } -FunctionPass *llvm::createDomOnlyViewerPass() { - return new DomOnlyViewer(); +FunctionPass *llvm::createDomOnlyViewerWrapperPassPass() { + return new DomOnlyViewerWrapperPass(); } -FunctionPass *llvm::createPostDomPrinterPass() { - return new PostDomPrinter(); +FunctionPass *llvm::createPostDomPrinterWrapperPassPass() { + return new PostDomPrinterWrapperPass(); } -FunctionPass *llvm::createPostDomOnlyPrinterPass() { - return new PostDomOnlyPrinter(); +FunctionPass *llvm::createPostDomOnlyPrinterWrapperPassPass() { + return new PostDomOnlyPrinterWrapperPass(); } -FunctionPass *llvm::createPostDomViewerPass() { - return new PostDomViewer(); +FunctionPass *llvm::createPostDomViewerWrapperPassPass() { + return new PostDomViewerWrapperPass(); } -FunctionPass *llvm::createPostDomOnlyViewerPass() { - return new PostDomOnlyViewer(); +FunctionPass *llvm::createPostDomOnlyViewerWrapperPassPass() { + return new PostDomOnlyViewerWrapperPass(); } diff --git a/llvm/lib/Analysis/DomTreeUpdater.cpp b/llvm/lib/Analysis/DomTreeUpdater.cpp index 6e299263e66d..888c16723208 100644 --- a/llvm/lib/Analysis/DomTreeUpdater.cpp +++ b/llvm/lib/Analysis/DomTreeUpdater.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/GenericDomTree.h" #include @@ -314,98 +315,6 @@ PostDominatorTree &DomTreeUpdater::getPostDomTree() { return *PDT; } -void DomTreeUpdater::insertEdge(BasicBlock *From, BasicBlock *To) { - -#ifndef NDEBUG - assert(isUpdateValid({DominatorTree::Insert, From, To}) && - "Inserted edge does not appear in the CFG"); -#endif - - if (!DT && !PDT) - return; - - // Won't affect DomTree and PostDomTree; discard update. 
- if (From == To) - return; - - if (Strategy == UpdateStrategy::Eager) { - if (DT) - DT->insertEdge(From, To); - if (PDT) - PDT->insertEdge(From, To); - return; - } - - PendUpdates.push_back({DominatorTree::Insert, From, To}); -} - -void DomTreeUpdater::insertEdgeRelaxed(BasicBlock *From, BasicBlock *To) { - if (From == To) - return; - - if (!DT && !PDT) - return; - - if (!isUpdateValid({DominatorTree::Insert, From, To})) - return; - - if (Strategy == UpdateStrategy::Eager) { - if (DT) - DT->insertEdge(From, To); - if (PDT) - PDT->insertEdge(From, To); - return; - } - - PendUpdates.push_back({DominatorTree::Insert, From, To}); -} - -void DomTreeUpdater::deleteEdge(BasicBlock *From, BasicBlock *To) { - -#ifndef NDEBUG - assert(isUpdateValid({DominatorTree::Delete, From, To}) && - "Deleted edge still exists in the CFG!"); -#endif - - if (!DT && !PDT) - return; - - // Won't affect DomTree and PostDomTree; discard update. - if (From == To) - return; - - if (Strategy == UpdateStrategy::Eager) { - if (DT) - DT->deleteEdge(From, To); - if (PDT) - PDT->deleteEdge(From, To); - return; - } - - PendUpdates.push_back({DominatorTree::Delete, From, To}); -} - -void DomTreeUpdater::deleteEdgeRelaxed(BasicBlock *From, BasicBlock *To) { - if (From == To) - return; - - if (!DT && !PDT) - return; - - if (!isUpdateValid({DominatorTree::Delete, From, To})) - return; - - if (Strategy == UpdateStrategy::Eager) { - if (DT) - DT->deleteEdge(From, To); - if (PDT) - PDT->deleteEdge(From, To); - return; - } - - PendUpdates.push_back({DominatorTree::Delete, From, To}); -} - void DomTreeUpdater::dropOutOfDateUpdates() { if (Strategy == DomTreeUpdater::UpdateStrategy::Eager) return; diff --git a/llvm/lib/Analysis/DominanceFrontier.cpp b/llvm/lib/Analysis/DominanceFrontier.cpp index a8806fe5a480..ccba913ccfe5 100644 --- a/llvm/lib/Analysis/DominanceFrontier.cpp +++ b/llvm/lib/Analysis/DominanceFrontier.cpp @@ -15,7 +15,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Analysis/EHPersonalities.cpp b/llvm/lib/Analysis/EHPersonalities.cpp index df8b7e12e8d7..277ff6ba735f 100644 --- a/llvm/lib/Analysis/EHPersonalities.cpp +++ b/llvm/lib/Analysis/EHPersonalities.cpp @@ -8,6 +8,7 @@ #include "llvm/Analysis/EHPersonalities.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -67,7 +68,10 @@ StringRef llvm::getEHPersonalityName(EHPersonality Pers) { } EHPersonality llvm::getDefaultEHPersonality(const Triple &T) { - return EHPersonality::GNU_C; + if (T.isPS5()) + return EHPersonality::GNU_CXX; + else + return EHPersonality::GNU_C; } bool llvm::canSimplifyInvokeNoUnwind(const Function *F) { diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 33519038e225..782c11937507 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -12,48 +12,87 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/FunctionPropertiesAnalysis.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include using namespace llvm; -FunctionPropertiesInfo 
-FunctionPropertiesInfo::getFunctionPropertiesInfo(const Function &F, - const LoopInfo &LI) { - - FunctionPropertiesInfo FPI; +namespace { +int64_t getNrBlocksFromCond(const BasicBlock &BB) { + int64_t Ret = 0; + if (const auto *BI = dyn_cast(BB.getTerminator())) { + if (BI->isConditional()) + Ret += BI->getNumSuccessors(); + } else if (const auto *SI = dyn_cast(BB.getTerminator())) { + Ret += (SI->getNumCases() + (nullptr != SI->getDefaultDest())); + } + return Ret; +} - FPI.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); +int64_t getUses(const Function &F) { + return ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); +} +} // namespace - for (const auto &BB : F) { - ++FPI.BasicBlockCount; +void FunctionPropertiesInfo::reIncludeBB(const BasicBlock &BB) { + updateForBB(BB, +1); +} - if (const auto *BI = dyn_cast(BB.getTerminator())) { - if (BI->isConditional()) - FPI.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors(); - } else if (const auto *SI = dyn_cast(BB.getTerminator())) { - FPI.BlocksReachedFromConditionalInstruction += - (SI->getNumCases() + (nullptr != SI->getDefaultDest())); +void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, + int64_t Direction) { + assert(Direction == 1 || Direction == -1); + BasicBlockCount += Direction; + BlocksReachedFromConditionalInstruction += + (Direction * getNrBlocksFromCond(BB)); + for (const auto &I : BB) { + if (auto *CS = dyn_cast(&I)) { + const auto *Callee = CS->getCalledFunction(); + if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) + DirectCallsToDefinedFunctions += Direction; } - - for (const auto &I : BB) { - if (auto *CS = dyn_cast(&I)) { - const auto *Callee = CS->getCalledFunction(); - if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) - ++FPI.DirectCallsToDefinedFunctions; - } - if (I.getOpcode() == Instruction::Load) { - ++FPI.LoadInstCount; - } else if (I.getOpcode() == Instruction::Store) { - ++FPI.StoreInstCount; - } + if (I.getOpcode() == Instruction::Load) { + LoadInstCount += Direction; + } else if (I.getOpcode() == Instruction::Store) { + StoreInstCount += Direction; } - // Loop Depth of the Basic Block - int64_t LoopDepth; - LoopDepth = LI.getLoopDepth(&BB); - if (FPI.MaxLoopDepth < LoopDepth) - FPI.MaxLoopDepth = LoopDepth; } - FPI.TopLevelLoopCount += llvm::size(LI); + TotalInstructionCount += Direction * BB.sizeWithoutDebug(); +} + +void FunctionPropertiesInfo::updateAggregateStats(const Function &F, + const LoopInfo &LI) { + + Uses = getUses(F); + TopLevelLoopCount = llvm::size(LI); + MaxLoopDepth = 0; + std::deque Worklist; + llvm::append_range(Worklist, LI); + while (!Worklist.empty()) { + const auto *L = Worklist.front(); + MaxLoopDepth = + std::max(MaxLoopDepth, static_cast(L->getLoopDepth())); + Worklist.pop_front(); + llvm::append_range(Worklist, L->getSubLoops()); + } +} + +FunctionPropertiesInfo FunctionPropertiesInfo::getFunctionPropertiesInfo( + const Function &F, FunctionAnalysisManager &FAM) { + + FunctionPropertiesInfo FPI; + // The const casts are due to the getResult API - there's no mutation of F. 
+ const auto &LI = FAM.getResult(const_cast(F)); + const auto &DT = + FAM.getResult(const_cast(F)); + for (const auto &BB : F) + if (DT.isReachableFromEntry(&BB)) + FPI.reIncludeBB(BB); + FPI.updateAggregateStats(F, LI); return FPI; } @@ -67,15 +106,15 @@ void FunctionPropertiesInfo::print(raw_ostream &OS) const { << "LoadInstCount: " << LoadInstCount << "\n" << "StoreInstCount: " << StoreInstCount << "\n" << "MaxLoopDepth: " << MaxLoopDepth << "\n" - << "TopLevelLoopCount: " << TopLevelLoopCount << "\n\n"; + << "TopLevelLoopCount: " << TopLevelLoopCount << "\n" + << "TotalInstructionCount: " << TotalInstructionCount << "\n\n"; } AnalysisKey FunctionPropertiesAnalysis::Key; FunctionPropertiesInfo FunctionPropertiesAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { - return FunctionPropertiesInfo::getFunctionPropertiesInfo( - F, FAM.getResult(F)); + return FunctionPropertiesInfo::getFunctionPropertiesInfo(F, FAM); } PreservedAnalyses @@ -86,3 +125,127 @@ FunctionPropertiesPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { AM.getResult(F).print(OS); return PreservedAnalyses::all(); } + +FunctionPropertiesUpdater::FunctionPropertiesUpdater( + FunctionPropertiesInfo &FPI, const CallBase &CB) + : FPI(FPI), CallSiteBB(*CB.getParent()), Caller(*CallSiteBB.getParent()) { + assert(isa(CB) || isa(CB)); + // For BBs that are likely to change, we subtract from feature totals their + // contribution. Some features, like max loop counts or depths, are left + // invalid, as they will be updated post-inlining. + SmallPtrSet LikelyToChangeBBs; + // The CB BB will change - it'll either be split or the callee's body (single + // BB) will be pasted in. + LikelyToChangeBBs.insert(&CallSiteBB); + + // The caller's entry BB may change due to new alloca instructions. + LikelyToChangeBBs.insert(&*Caller.begin()); + + // The successors may become unreachable in the case of `invoke` inlining. + // We track successors separately, too, because they form a boundary, together + // with the CB BB ('Entry') between which the inlined callee will be pasted. + Successors.insert(succ_begin(&CallSiteBB), succ_end(&CallSiteBB)); + + // Inlining only handles invoke and calls. If this is an invoke, and inlining + // it pulls another invoke, the original landing pad may get split, so as to + // share its content with other potential users. So the edge up to which we + // need to invalidate and then re-account BB data is the successors of the + // current landing pad. We can leave the current lp, too - if it doesn't get + // split, then it will be the place traversal stops. Either way, the + // discounted BBs will be checked if reachable and re-added. + if (const auto *II = dyn_cast(&CB)) { + const auto *UnwindDest = II->getUnwindDest(); + Successors.insert(succ_begin(UnwindDest), succ_end(UnwindDest)); + } + + // Exclude the CallSiteBB, if it happens to be its own successor (1-BB loop). + // We are only interested in BBs the graph moves past the callsite BB to + // define the frontier past which we don't want to re-process BBs. Including + // the callsite BB in this case would prematurely stop the traversal in + // finish(). + Successors.erase(&CallSiteBB); + + for (const auto *BB : Successors) + LikelyToChangeBBs.insert(BB); + + // Commit the change. While some of the BBs accounted for above may play dual + // role - e.g. caller's entry BB may be the same as the callsite BB - set + // insertion semantics make sure we account them once. This needs to be + // followed in `finish`, too. 
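+  // Hypothetical usage sketch (ours, not part of this patch): a caller is
+  // expected to bracket inlining with this updater,
+  //
+  //   FunctionPropertiesUpdater FPU(FPI, CB);
+  //   // ... inline the call site CB ...
+  //   FPU.finish(FAM);
+  //
+  // so that the per-BB discounts taken below are re-added in finish() from
+  // the post-inlining CFG.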
+ for (const auto *BB : LikelyToChangeBBs) + FPI.updateForBB(*BB, -1); +} + +void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { + // Update feature values from the BBs that were copied from the callee, or + // might have been modified because of inlining. The latter have been + // subtracted in the FunctionPropertiesUpdater ctor. + // There could be successors that were reached before but now are only + // reachable from elsewhere in the CFG. + // One example is the following diamond CFG (lines are arrows pointing down): + // A + // / \ + // B C + // | | + // | D + // | | + // | E + // \ / + // F + // There's a call site in C that is inlined. Upon doing that, it turns out + // it expands to + // call void @llvm.trap() + // unreachable + // F isn't reachable from C anymore, but we did discount it when we set up + // FunctionPropertiesUpdater, so we need to re-include it here. + // At the same time, D and E were reachable before, but now are not anymore, + // so we need to leave D out (we discounted it at setup), and explicitly + // remove E. + SetVector Reinclude; + SetVector Unreachable; + const auto &DT = + FAM.getResult(const_cast(Caller)); + + if (&CallSiteBB != &*Caller.begin()) + Reinclude.insert(&*Caller.begin()); + + // Distribute the successors to the 2 buckets. + for (const auto *Succ : Successors) + if (DT.isReachableFromEntry(Succ)) + Reinclude.insert(Succ); + else + Unreachable.insert(Succ); + + // For reinclusion, we want to stop at the reachable successors, who are at + // the beginning of the worklist; but, starting from the callsite bb and + // ending at those successors, we also want to perform a traversal. + // IncludeSuccessorsMark is the index after which we include successors. + const auto IncludeSuccessorsMark = Reinclude.size(); + bool CSInsertion = Reinclude.insert(&CallSiteBB); + (void)CSInsertion; + assert(CSInsertion); + for (size_t I = 0; I < Reinclude.size(); ++I) { + const auto *BB = Reinclude[I]; + FPI.reIncludeBB(*BB); + if (I >= IncludeSuccessorsMark) + Reinclude.insert(succ_begin(BB), succ_end(BB)); + } + + // For exclusion, we don't need to exclude the set of BBs that were successors + // before and are now unreachable, because we already did that at setup. For + // the rest, as long as a successor is unreachable, we want to explicitly + // exclude it. 
+ const auto AlreadyExcludedMark = Unreachable.size(); + for (size_t I = 0; I < Unreachable.size(); ++I) { + const auto *U = Unreachable[I]; + if (I >= AlreadyExcludedMark) + FPI.updateForBB(*U, -1); + for (const auto *Succ : successors(U)) + if (!DT.isReachableFromEntry(Succ)) + Unreachable.insert(Succ); + } + + const auto &LI = FAM.getResult<LoopAnalysis>(const_cast<Function &>(Caller)); + FPI.updateAggregateStats(Caller, LI); + assert(FPI == FunctionPropertiesInfo::getFunctionPropertiesInfo(Caller, FAM)); +} diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp index 6869530148c5..e82d2fae9356 100644 --- a/llvm/lib/Analysis/GlobalsModRef.cpp +++ b/llvm/lib/Analysis/GlobalsModRef.cpp @@ -21,11 +21,11 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -68,8 +68,8 @@ class GlobalsAAResult::FunctionInfo { /// should provide this much alignment at least, but this makes it clear we /// specifically rely on this amount of alignment. struct alignas(8) AlignedMap { - AlignedMap() {} - AlignedMap(const AlignedMap &Arg) : Map(Arg.Map) {} + AlignedMap() = default; + AlignedMap(const AlignedMap &Arg) = default; GlobalInfoMapType Map; }; @@ -102,7 +102,7 @@ class GlobalsAAResult::FunctionInfo { "Insufficient low bits to store our flag and ModRef info."); public: - FunctionInfo() {} + FunctionInfo() = default; ~FunctionInfo() { delete Info.getPointer(); } @@ -511,6 +511,18 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { Handles.front().I = Handles.begin(); bool KnowNothing = false; + // Intrinsics, like any other synchronizing function, can make the effects + // of other threads visible. Without nosync we know nothing, really. + // Similarly, if `nocallback` is missing, the function (or intrinsic) can + // call into the module arbitrarily. If both are set, the function has an + // effect but will not interact with accesses of internal globals inside + // the module. We are conservative here for optnone functions; this might + // not be necessary. + auto MaySyncOrCallIntoModule = [](const Function &F) { + return !F.isDeclaration() || !F.hasNoSync() || + !F.hasFnAttribute(Attribute::NoCallback); + }; + // Collect the mod/ref properties due to called functions. We only compute // one mod-ref set. for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) { @@ -525,7 +537,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { // Can't do better than that! } else if (F->onlyReadsMemory()) { FI.addModRefInfo(ModRefInfo::Ref); - if (!F->isIntrinsic() && !F->onlyAccessesArgMemory()) + if (!F->onlyAccessesArgMemory() && MaySyncOrCallIntoModule(*F)) // This function might call back into the module and read a global - // consider every global as possibly being read by this function.
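 // For example (our illustration, not part of this patch): an external
 // declaration such as `declare void @host_hook()` carries neither `nosync`
 // nor `nocallback`, so MaySyncOrCallIntoModule(*F) returns true and every
 // internal global must be treated as potentially read through the call.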
FI.setMayReadAnyGlobal(); @@ -533,7 +545,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { FI.addModRefInfo(ModRefInfo::ModRef); if (!F->onlyAccessesArgMemory()) FI.setMayReadAnyGlobal(); - if (!F->isIntrinsic()) { + if (MaySyncOrCallIntoModule(*F)) { KnowNothing = true; break; } @@ -585,12 +597,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { // We handle calls specially because the graph-relevant aspects are // handled above. if (auto *Call = dyn_cast(&I)) { - auto &TLI = GetTLI(*Node->getFunction()); - if (isAllocationFn(Call, &TLI) || isFreeCall(Call, &TLI)) { - // FIXME: It is completely unclear why this is necessary and not - // handled by the above graph code. - FI.addModRefInfo(ModRefInfo::ModRef); - } else if (Function *Callee = Call->getCalledFunction()) { + if (Function *Callee = Call->getCalledFunction()) { // The callgraph doesn't include intrinsic calls. if (Callee->isIntrinsic()) { if (isa(Call)) @@ -979,7 +986,7 @@ GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg) } } -GlobalsAAResult::~GlobalsAAResult() {} +GlobalsAAResult::~GlobalsAAResult() = default; /*static*/ GlobalsAAResult GlobalsAAResult::analyzeModule( Module &M, std::function GetTLI, @@ -1010,6 +1017,24 @@ GlobalsAAResult GlobalsAA::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M)); } +PreservedAnalyses RecomputeGlobalsAAPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (auto *G = AM.getCachedResult(M)) { + auto &CG = AM.getResult(M); + G->NonAddressTakenGlobals.clear(); + G->UnknownFunctionsWithLocalLinkage = false; + G->IndirectGlobals.clear(); + G->AllocsForIndirectGlobals.clear(); + G->FunctionInfos.clear(); + G->FunctionToSCCMap.clear(); + G->Handles.clear(); + G->CollectSCCMembership(CG); + G->AnalyzeGlobals(M); + G->AnalyzeCallGraph(CG, M); + } + return PreservedAnalyses::all(); +} + char GlobalsAAWrapperPass::ID = 0; INITIALIZE_PASS_BEGIN(GlobalsAAWrapperPass, "globals-aa", "Globals Alias Analysis", false, true) diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index 01681c47418a..3d51042f4da8 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -64,7 +64,7 @@ void IRInstructionData::initializeInstruction() { // Here we collect the operands and their types for determining whether // the structure of the operand use matches between two different candidates. for (Use &OI : Inst->operands()) { - if (isa(Inst) && RevisedPredicate.hasValue()) { + if (isa(Inst) && RevisedPredicate) { // If we have a CmpInst where the predicate is reversed, it means the // operands must be reversed as well. 
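 // Illustration (ours): `icmp sgt i32 %a, %b` may carry the revised
 // predicate `slt` so that it can match `icmp slt i32 %b, %a`; once the
 // predicate has been swapped, %a and %b must trade places as well, which
 // is why the operand is inserted at the front below.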
OperVals.insert(OperVals.begin(), OI.get()); @@ -183,7 +183,7 @@ CmpInst::Predicate IRInstructionData::getPredicate() const { assert(isa(Inst) && "Can only get a predicate from a compare instruction"); - if (RevisedPredicate.hasValue()) + if (RevisedPredicate) return RevisedPredicate.getValue(); return cast(Inst)->getPredicate(); @@ -193,7 +193,7 @@ StringRef IRInstructionData::getCalleeName() const { assert(isa(Inst) && "Can only get a name from a call instruction"); - assert(CalleeName.hasValue() && "CalleeName has not been set"); + assert(CalleeName && "CalleeName has not been set"); return *CalleeName; } @@ -289,14 +289,12 @@ void IRInstructionMapper::convertToUnsignedVec( } } - if (HaveLegalRange) { - if (AddedIllegalLastTime) - mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); - for (IRInstructionData *ID : InstrListForBB) - this->IDL->push_back(*ID); - llvm::append_range(InstrList, InstrListForBB); - llvm::append_range(IntegerMapping, IntegerMappingForBB); - } + if (AddedIllegalLastTime) + mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); + for (IRInstructionData *ID : InstrListForBB) + this->IDL->push_back(*ID); + llvm::append_range(InstrList, InstrListForBB); + llvm::append_range(IntegerMapping, IntegerMappingForBB); } // TODO: This is the same as the MachineOutliner, and should be consolidated @@ -461,6 +459,18 @@ IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len, // that both of these instructions are not nullptrs. FirstInst = FirstInstIt; LastInst = LastInstIt; + + // Add the basic blocks contained in the set into the global value numbering. + DenseSet BBSet; + getBasicBlocks(BBSet); + for (BasicBlock *BB : BBSet) { + if (ValueToNumber.find(BB) != ValueToNumber.end()) + continue; + + ValueToNumber.try_emplace(BB, LocalValNumber); + NumberToValue.try_emplace(LocalValNumber, BB); + LocalValNumber++; + } } bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A, @@ -516,19 +526,13 @@ static bool checkNumberingAndReplaceCommutative( for (Value *V : SourceOperands) { ArgVal = SourceValueToNumberMapping.find(V)->second; + // Instead of finding a current mapping, we attempt to insert a set. std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert( std::make_pair(ArgVal, TargetValueNumbers)); - // Instead of finding a current mapping, we inserted a set. This means a - // mapping did not exist for the source Instruction operand, it has no - // current constraints we need to check. - if (WasInserted) - continue; - - // If a mapping already exists for the source operand to the values in the - // other IRSimilarityCandidate we need to iterate over the items in other - // IRSimilarityCandidate's Instruction to determine whether there is a valid - // mapping of Value to Value. + // We need to iterate over the items in other IRSimilarityCandidate's + // Instruction to determine whether there is a valid mapping of + // Value to Value. DenseSet NewSet; for (unsigned &Curr : ValueMappingIt->second) // If we can find the value in the mapping, we add it to the new set. @@ -548,7 +552,6 @@ static bool checkNumberingAndReplaceCommutative( if (ValueMappingIt->second.size() != 1) continue; - unsigned ValToRemove = *ValueMappingIt->second.begin(); // When there is only one item left in the mapping for and operand, remove // the value from the other operands. 
If it results in there being no @@ -791,7 +794,8 @@ bool IRSimilarityCandidate::compareStructure( // We have different paths for commutative instructions and non-commutative // instructions since commutative instructions could allow multiple mappings // to certain values. - if (IA->isCommutative() && !isa(IA)) { + if (IA->isCommutative() && !isa(IA) && + !isa(IA)) { if (!compareCommutativeOperandMapping( {A, OperValsA, ValueNumberMappingA}, {B, OperValsB, ValueNumberMappingB})) @@ -1008,6 +1012,40 @@ void IRSimilarityCandidate::createCanonicalRelationFrom( CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN)); NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum)); } + + DenseSet BBSet; + getBasicBlocks(BBSet); + // Find canonical numbers for the BasicBlocks in the current candidate. + // This is done by finding the corresponding value for the first instruction + // in the block in the current candidate, finding the matching value in the + // source candidate. Then by finding the parent of this value, use the + // canonical number of the block in the source candidate for the canonical + // number in the current candidate. + for (BasicBlock *BB : BBSet) { + unsigned BBGVNForCurrCand = ValueToNumber.find(BB)->second; + + // We can skip the BasicBlock if the canonical numbering has already been + // found in a separate instruction. + if (NumberToCanonNum.find(BBGVNForCurrCand) != NumberToCanonNum.end()) + continue; + + // If the basic block is the starting block, then the shared instruction may + // not be the first instruction in the block, it will be the first + // instruction in the similarity region. + Value *FirstOutlineInst = BB == getStartBB() + ? frontInstruction() + : &*BB->instructionsWithoutDebug().begin(); + + unsigned FirstInstGVN = *getGVN(FirstOutlineInst); + unsigned FirstInstCanonNum = *getCanonicalNum(FirstInstGVN); + unsigned SourceGVN = *SourceCand.fromCanonicalNum(FirstInstCanonNum); + Value *SourceV = *SourceCand.fromGVN(SourceGVN); + BasicBlock *SourceBB = cast(SourceV)->getParent(); + unsigned SourceBBGVN = *SourceCand.getGVN(SourceBB); + unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(SourceBBGVN); + CanonNumToNumber.insert(std::make_pair(SourceCanonBBGVN, BBGVNForCurrCand)); + NumberToCanonNum.insert(std::make_pair(BBGVNForCurrCand, SourceCanonBBGVN)); + } } void IRSimilarityCandidate::createCanonicalMappingFor( @@ -1162,11 +1200,12 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity( Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls; Mapper.EnableMatchCallsByName = EnableMatchingCallsByName; Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics; + Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls; populateMapper(Modules, InstrList, IntegerMapping); findCandidates(InstrList, IntegerMapping); - return SimilarityCandidates.getValue(); + return *SimilarityCandidates; } SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { @@ -1175,6 +1214,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls; Mapper.EnableMatchCallsByName = EnableMatchingCallsByName; Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics; + Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls; std::vector InstrList; std::vector IntegerMapping; @@ -1182,7 +1222,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { populateMapper(M, InstrList, IntegerMapping); findCandidates(InstrList, 
IntegerMapping); - return SimilarityCandidates.getValue(); + return *SimilarityCandidates; } INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier", @@ -1196,7 +1236,8 @@ IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass() bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) { IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls, - MatchCallsByName, !DisableIntrinsics)); + MatchCallsByName, !DisableIntrinsics, + false)); return false; } @@ -1214,7 +1255,8 @@ AnalysisKey IRSimilarityAnalysis::Key; IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M, ModuleAnalysisManager &) { auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls, - MatchCallsByName, !DisableIntrinsics); + MatchCallsByName, !DisableIntrinsics, + false); IRSI.findSimilarity(M); return IRSI; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 44b1d94ebdc8..e4d706ab045c 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -11,26 +11,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/IVDescriptors.h" -#include "llvm/ADT/ScopeExit.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DemandedBits.h" -#include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -237,12 +227,10 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) { if (Phi->getNumIncomingValues() != 2) return false; @@ -259,6 +247,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep the last visited store instruction. By the end of the + // algorithm this variable is either null or holds the store of an + // intermediate reduction value to a loop invariant address. + StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -324,6 +318,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value.
+ // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * Final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. // * A PHI in the header other than the initial PHI. @@ -331,6 +329,43 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed iff they store the reduction value to + // the same loop invariant memory location. + if (auto *SI = dyn_cast<StoreInst>(Cur)) { + if (!SE) { + LLVM_DEBUG(dbgs() << "Store instructions are not processed without " + << "Scalar Evolution Analysis\n"); + return false; + } + + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check that it is the same address as previous stores. + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check that the pointer is loop invariant. + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -453,10 +488,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) + if (isa<PHINode>(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast<StoreInst>(UI); + if (SI && SI->getPointerOperand() == Cur) { + // The reduction variable chain can be stored somewhere, but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa<PHINode>(UI) && ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) && !isa<SelectInst>(UI)) || @@ -476,7 +518,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // This means we have seen one but not the other instruction of the // pattern or more than just a select and cmp. Zero implies that we saw a - // llvm.min/max instrinsic, which is always OK. + // llvm.min/max intrinsic, which is always OK. if (isMinMaxRecurrenceKind(Kind) && NumCmpSelectPatternInst != 2 && NumCmpSelectPatternInst != 0) return false; @@ -484,6 +526,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + + if (IntermediateStore) { + // Check that the stored value goes to the phi node again. This way we make + // sure that the value stored in IntermediateStore is indeed the final + // reduction value.
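+  // Illustrative source pattern (ours, not from this patch):
+  //
+  //   for (int i = 0; i < n; ++i) {
+  //     sum += a[i];
+  //     *p = sum; // p is loop invariant; the final value ends up in *p
+  //   }
+  //
+  // The store to *p is the IntermediateStore, and the checks below make
+  // sure that the value it stores feeds the reduction phi again.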
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) { + LLVM_DEBUG(dbgs() << "Not a final reduction value stored: " + << *IntermediateStore << '\n'); + return false; + } + + // If there is an exit instruction, its value should be stored in + // IntermediateStore. + if (ExitInstruction && + IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG(dbgs() << "Last store instruction of reduction value does not " + "store the last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction) + ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand()); + } + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -545,9 +613,9 @@ // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst, - RecurrenceType, IsSigned, IsOrdered, CastInsts, - MinWidthCastToRecurrenceType); + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ExactFPMathInst, RecurrenceType, IsSigned, + IsOrdered, CastInsts, MinWidthCastToRecurrenceType); RedDes = RD; return true; @@ -771,7 +839,8 @@ bool RecurrenceDescriptor::hasMultipleUsesOf( bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, - DominatorTree *DT) { + DominatorTree *DT, + ScalarEvolution *SE) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -780,72 +849,85 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI."
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; } @@ -917,12 +999,37 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence( SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator()) return false; - // Do not try to sink an instruction multiple times (if multiple operands - // are first order recurrences). - // TODO: We can support this case, by sinking the instruction after the - // 'deepest' previous instruction. - if (SinkAfter.find(SinkCandidate) != SinkAfter.end()) - return false; + // Avoid sinking an instruction multiple times (if multiple operands are + // first order recurrences) by sinking once - after the latest 'previous' + // instruction. + auto It = SinkAfter.find(SinkCandidate); + if (It != SinkAfter.end()) { + auto *OtherPrev = It->second; + // Find the earliest entry in the 'sink-after' chain. The last entry in + // the chain is the original 'Previous' for a recurrence handled earlier. 
+      auto EarlierIt = SinkAfter.find(OtherPrev);
+      while (EarlierIt != SinkAfter.end()) {
+        Instruction *EarlierInst = EarlierIt->second;
+        EarlierIt = SinkAfter.find(EarlierInst);
+        // Bail out if order has not been preserved.
+        if (EarlierIt != SinkAfter.end() &&
+            !DT->dominates(EarlierInst, OtherPrev))
+          return false;
+        OtherPrev = EarlierInst;
+      }
+      // Bail out if order has not been preserved.
+      if (OtherPrev != It->second && !DT->dominates(It->second, OtherPrev))
+        return false;
+
+      // SinkCandidate is already being sunk after an instruction after
+      // Previous. Nothing left to do.
+      if (DT->dominates(Previous, OtherPrev) || Previous == OtherPrev)
+        return true;
+      // Otherwise, Previous comes after OtherPrev and SinkCandidate needs to
+      // be re-sunk to Previous, instead of sinking to OtherPrev. Remove
+      // SinkCandidate from SinkAfter to ensure its insertion position is
+      // updated.
+      SinkAfter.erase(SinkCandidate);
+    }
 
     // If we reach a PHI node that is not dominated by Previous, we reached a
     // header PHI. No need for sinking.
@@ -1052,7 +1159,7 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
   // to check for a pair of icmp/select, for which we use getNextInstruction
   // and isCorrectOpcode functions to step the right number of instructions,
   // and check the icmp/select pair.
-  // FIXME: We also do not attempt to look through Phi/Select's yet, which might
+  // FIXME: We also do not attempt to look through Select's yet, which might
   // be part of the reduction chain, or attempt to look through And's to find a
   // smaller bitwidth. Subs are also currently not allowed (which are usually
   // treated as part of an add reduction) as they are expected to generally be
@@ -1062,16 +1169,21 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
   if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp)
     ExpectedUses = 2;
 
-  auto getNextInstruction = [&](Instruction *Cur) {
-    if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
-      // We are expecting an icmp/select pair, where we go to the next select
-      // instruction if we can. We already know that Cur has 2 uses.
-      if (isa<SelectInst>(*Cur->user_begin()))
-        return cast<Instruction>(*Cur->user_begin());
-      else
-        return cast<Instruction>(*std::next(Cur->user_begin()));
+  auto getNextInstruction = [&](Instruction *Cur) -> Instruction * {
+    for (auto User : Cur->users()) {
+      Instruction *UI = cast<Instruction>(User);
+      if (isa<PHINode>(UI))
+        continue;
+      if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
+        // We are expecting an icmp/select pair, where we go to the next
+        // select instruction if we can. We already know that Cur has 2 uses.
+        if (isa<SelectInst>(UI))
+          return UI;
+        continue;
+      }
+      return UI;
     }
-    return cast<Instruction>(*Cur->user_begin());
+    return nullptr;
   };
   auto isCorrectOpcode = [&](Instruction *Cur) {
     if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
@@ -1086,22 +1198,46 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
     return Cur->getOpcode() == RedOp;
   };
 
+  // Attempt to look through Phis which are part of the reduction chain.
+  unsigned ExtraPhiUses = 0;
+  Instruction *RdxInstr = LoopExitInstr;
+  if (auto ExitPhi = dyn_cast<PHINode>(LoopExitInstr)) {
+    if (ExitPhi->getNumIncomingValues() != 2)
+      return {};
+
+    Instruction *Inc0 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(0));
+    Instruction *Inc1 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(1));
+
+    Instruction *Chain = nullptr;
+    if (Inc0 == Phi)
+      Chain = Inc1;
+    else if (Inc1 == Phi)
+      Chain = Inc0;
+    else
+      return {};
+
+    RdxInstr = Chain;
+    ExtraPhiUses = 1;
+  }
+
   // We check the loop exit instruction first (as a quick test) but add it
   // last. We check that the opcode is correct (and don't allow it to be a
   // Sub) and that it has the expected number of uses. It will have one use
   // from the phi and one from an LCSSA value, no matter the type.
-  if (!isCorrectOpcode(LoopExitInstr) || !LoopExitInstr->hasNUses(2))
+  if (!isCorrectOpcode(RdxInstr) || !LoopExitInstr->hasNUses(2))
     return {};
 
-  // Check that the Phi has one (or two for min/max) uses.
-  if (!Phi->hasNUses(ExpectedUses))
+  // Check that the Phi has one (or two for min/max) uses, plus an extra use
+  // for conditional reductions.
+  if (!Phi->hasNUses(ExpectedUses + ExtraPhiUses))
     return {};
 
+  Instruction *Cur = getNextInstruction(Phi);
   // Each other instruction in the chain should have the expected number of uses
   // and be the correct opcode.
-  while (Cur != LoopExitInstr) {
-    if (!isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
+  while (Cur != RdxInstr) {
+    if (!Cur || !isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
       return {};
 
     ReductionOperations.push_back(Cur);
@@ -1428,10 +1564,14 @@ bool InductionDescriptor::isInductionPHI(
   ConstantInt *CV = ConstStep->getValue();
 
   const DataLayout &DL = Phi->getModule()->getDataLayout();
-  int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(ElementType));
-  if (!Size)
+  TypeSize TySize = DL.getTypeAllocSize(ElementType);
+  // TODO: We could potentially support this for scalable vectors if we can
+  // prove at compile time that the constant step is always a multiple of
+  // the scalable type.
+  if (TySize.isZero() || TySize.isScalable())
     return false;
 
+  int64_t Size = static_cast<int64_t>(TySize.getFixedSize());
   int64_t CVSize = CV->getSExtValue();
   if (CVSize % Size)
     return false;
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index 0f3929f45506..5bde947bd851 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -12,25 +12,21 @@ //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/IVUsers.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "iv-users"
diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index b112ed2e4439..ebfa1c8fc08e 100644
--- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -13,12 +13,7 @@ //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -31,7 +26,7 @@ using namespace llvm;
 // The percent threshold for the direct-call target (this call site vs the
 // remaining call count) for it to be considered as the promotion target.
 static cl::opt<unsigned> ICPRemainingPercentThreshold(
-    "icp-remaining-percent-threshold", cl::init(30), cl::Hidden, cl::ZeroOrMore,
+    "icp-remaining-percent-threshold", cl::init(30), cl::Hidden,
     cl::desc("The percentage threshold against remaining unpromoted indirect "
              "call count for the promotion"));
 
@@ -39,14 +34,14 @@ static cl::opt<unsigned> ICPRemainingPercentThreshold(
 // total call count) for it to be considered as the promotion target.
 static cl::opt<unsigned>
     ICPTotalPercentThreshold("icp-total-percent-threshold", cl::init(5),
-                             cl::Hidden, cl::ZeroOrMore,
+                             cl::Hidden,
                              cl::desc("The percentage threshold against total "
                                       "count for the promotion"));
 
 // Set the maximum number of targets to promote for a single indirect-call
 // callsite.
static cl::opt - MaxNumPromotions("icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, + MaxNumPromotions("icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite")); diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index f6e3dd354ff8..cf8592c41eda 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -13,14 +13,15 @@ #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -55,6 +56,11 @@ static cl::opt cl::desc("Scale to limit the cost of inline deferral"), cl::init(2), cl::Hidden); +static cl::opt AnnotateInlinePhase( + "annotate-inline-phase", cl::Hidden, cl::init(false), + cl::desc("If true, annotate inline advisor remarks " + "with LTO and pass information.")); + extern cl::opt InlinerFunctionImportStats; namespace { @@ -80,7 +86,8 @@ private: void recordUnsuccessfulInliningImpl(const InlineResult &Result) override { if (IsInliningRecommended) ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) + return OptimizationRemarkMissed(Advisor->getAnnotatedInlinePassName(), + "NotInlined", DLoc, Block) << "'" << NV("Callee", Callee) << "' is not AlwaysInline into '" << NV("Caller", Caller) << "': " << NV("Reason", Result.getFailureReason()); @@ -99,7 +106,8 @@ void DefaultInlineAdvice::recordUnsuccessfulInliningImpl( llvm::setInlineRemark(*OriginalCB, std::string(Result.getFailureReason()) + "; " + inlineCostStr(*OIC)); ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) + return OptimizationRemarkMissed(Advisor->getAnnotatedInlinePassName(), + "NotInlined", DLoc, Block) << "'" << NV("Callee", Callee) << "' is not inlined into '" << NV("Caller", Caller) << "': " << NV("Reason", Result.getFailureReason()); @@ -108,12 +116,16 @@ void DefaultInlineAdvice::recordUnsuccessfulInliningImpl( void DefaultInlineAdvice::recordInliningWithCalleeDeletedImpl() { if (EmitRemarks) - emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC); + emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC, + /* ForProfileContext= */ false, + Advisor->getAnnotatedInlinePassName()); } void DefaultInlineAdvice::recordInliningImpl() { if (EmitRemarks) - emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC); + emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC, + /* ForProfileContext= */ false, + Advisor->getAnnotatedInlinePassName()); } llvm::Optional static getDefaultInlineAdvice( @@ -146,7 +158,7 @@ llvm::Optional static getDefaultInlineAdvice( }; return llvm::shouldInline( CB, GetInlineCost, ORE, - Params.EnableDeferral.getValueOr(EnableInlineDeferral)); + Params.EnableDeferral.value_or(EnableInlineDeferral)); } std::unique_ptr @@ -185,18 +197,18 @@ AnalysisKey InlineAdvisorAnalysis::Key; bool InlineAdvisorAnalysis::Result::tryCreate( InlineParams Params, 
InliningAdvisorMode Mode, - const ReplayInlinerSettings &ReplaySettings) { + const ReplayInlinerSettings &ReplaySettings, InlineContext IC) { auto &FAM = MAM.getResult(M).getManager(); switch (Mode) { case InliningAdvisorMode::Default: LLVM_DEBUG(dbgs() << "Using default inliner heuristic.\n"); - Advisor.reset(new DefaultInlineAdvisor(M, FAM, Params)); + Advisor.reset(new DefaultInlineAdvisor(M, FAM, Params, IC)); // Restrict replay to default advisor, ML advisors are stateful so // replay will need augmentations to interleave with them correctly. if (!ReplaySettings.ReplayFile.empty()) { Advisor = llvm::getReplayInlineAdvisor(M, FAM, M.getContext(), std::move(Advisor), ReplaySettings, - /* EmitRemarks =*/true); + /* EmitRemarks =*/true, IC); } break; case InliningAdvisorMode::Development: @@ -442,7 +454,7 @@ std::string llvm::formatCallSiteLocation(DebugLoc DLoc, } void llvm::addLocationToRemarks(OptimizationRemark &Remark, DebugLoc DLoc) { - if (!DLoc.get()) { + if (!DLoc) { return; } @@ -499,8 +511,11 @@ void llvm::emitInlinedIntoBasedOnCost( PassName); } -InlineAdvisor::InlineAdvisor(Module &M, FunctionAnalysisManager &FAM) - : M(M), FAM(FAM) { +InlineAdvisor::InlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + Optional IC) + : M(M), FAM(FAM), IC(IC), + AnnotatedInlinePassName((IC && AnnotateInlinePhase) ? llvm::AnnotateInlinePassName(*IC) + : DEBUG_TYPE) { if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { ImportedFunctionsStats = std::make_unique(); @@ -522,6 +537,48 @@ std::unique_ptr InlineAdvisor::getMandatoryAdvice(CallBase &CB, Advice); } +static inline const char *getLTOPhase(ThinOrFullLTOPhase LTOPhase) { + switch (LTOPhase) { + case (ThinOrFullLTOPhase::None): + return "main"; + case (ThinOrFullLTOPhase::ThinLTOPreLink): + case (ThinOrFullLTOPhase::FullLTOPreLink): + return "prelink"; + case (ThinOrFullLTOPhase::ThinLTOPostLink): + case (ThinOrFullLTOPhase::FullLTOPostLink): + return "postlink"; + } + llvm_unreachable("unreachable"); +} + +static inline const char *getInlineAdvisorContext(InlinePass IP) { + switch (IP) { + case (InlinePass::AlwaysInliner): + return "always-inline"; + case (InlinePass::CGSCCInliner): + return "cgscc-inline"; + case (InlinePass::EarlyInliner): + return "early-inline"; + case (InlinePass::MLInliner): + return "ml-inline"; + case (InlinePass::ModuleInliner): + return "module-inline"; + case (InlinePass::ReplayCGSCCInliner): + return "replay-cgscc-inline"; + case (InlinePass::ReplaySampleProfileInliner): + return "replay-sample-profile-inline"; + case (InlinePass::SampleProfileInliner): + return "sample-profile-inline"; + } + + llvm_unreachable("unreachable"); +} + +std::string llvm::AnnotateInlinePassName(InlineContext IC) { + return std::string(getLTOPhase(IC.LTOPhase)) + "-" + + std::string(getInlineAdvisorContext(IC.Pass)); +} + InlineAdvisor::MandatoryInliningKind InlineAdvisor::getMandatoryKind(CallBase &CB, FunctionAnalysisManager &FAM, OptimizationRemarkEmitter &ORE) { @@ -536,7 +593,7 @@ InlineAdvisor::getMandatoryKind(CallBase &CB, FunctionAnalysisManager &FAM, auto TrivialDecision = llvm::getAttributeBasedInliningDecision(CB, &Callee, TIR, GetTLI); - if (TrivialDecision.hasValue()) { + if (TrivialDecision) { if (TrivialDecision->isSuccess()) return MandatoryInliningKind::Always; else @@ -568,3 +625,22 @@ InlineAdvisorAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { IA->getAdvisor()->print(OS); return PreservedAnalyses::all(); } + +PreservedAnalyses InlineAdvisorAnalysisPrinterPass::run( + 
LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, + CGSCCUpdateResult &UR) { + const auto &MAMProxy = + AM.getResult(InitialC, CG); + + if (InitialC.size() == 0) { + OS << "SCC is empty!\n"; + return PreservedAnalyses::all(); + } + Module &M = *InitialC.begin()->getFunction().getParent(); + const auto *IA = MAMProxy.getCachedResult(M); + if (!IA) + OS << "No Inline Advisor\n"; + else + IA->getAdvisor()->print(OS); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index d5411d916c77..e63497260e6e 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -18,11 +18,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -42,6 +42,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; @@ -51,24 +52,33 @@ STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed"); static cl::opt DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225), - cl::ZeroOrMore, cl::desc("Default amount of inlining to perform")); +// We introduce this option since there is a minor compile-time win by avoiding +// addition of TTI attributes (target-features in particular) to inline +// candidates when they are guaranteed to be the same as top level methods in +// some use cases. If we avoid adding the attribute, we need an option to avoid +// checking these attributes. 
+static cl::opt<bool> IgnoreTTIInlineCompatible(
+    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
+    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
+             "during inline cost calculation"));
+
 static cl::opt<bool> PrintInstructionComments(
     "print-instruction-comments", cl::Hidden, cl::init(false),
     cl::desc("Prints comments for instruction based on inline cost analysis"));
 
 static cl::opt<int> InlineThreshold(
-    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
+    "inline-threshold", cl::Hidden, cl::init(225),
     cl::desc("Control the amount of inlining to perform (default = 225)"));
 
 static cl::opt<int> HintThreshold(
-    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
+    "inlinehint-threshold", cl::Hidden, cl::init(325),
     cl::desc("Threshold for inlining functions with inline hint"));
 
 static cl::opt<int>
     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
-                          cl::init(45), cl::ZeroOrMore,
+                          cl::init(45),
                           cl::desc("Threshold for inlining cold callsites"));
 
 static cl::opt<bool> InlineEnableCostBenefitAnalysis(
@@ -76,12 +86,11 @@ static cl::opt<bool> InlineEnableCostBenefitAnalysis(
     cl::desc("Enable the cost-benefit analysis for the inliner"));
 
 static cl::opt<int> InlineSavingsMultiplier(
-    "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
+    "inline-savings-multiplier", cl::Hidden, cl::init(8),
     cl::desc("Multiplier to multiply cycle savings by during inlining"));
 
 static cl::opt<int>
     InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
-                        cl::ZeroOrMore,
                         cl::desc("The maximum size of a callee that gets "
                                  "inlined without sufficient cycle savings"));
 
@@ -89,26 +98,25 @@ static cl::opt<int>
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
 static cl::opt<int> ColdThreshold(
-    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
+    "inlinecold-threshold", cl::Hidden, cl::init(45),
     cl::desc("Threshold for inlining functions with cold attribute"));
 
 static cl::opt<int>
     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
-                         cl::ZeroOrMore,
                          cl::desc("Threshold for hot callsites "));
 
 static cl::opt<int> LocallyHotCallSiteThreshold(
-    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
+    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites "));
 
 static cl::opt<int> ColdCallSiteRelFreq(
-    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
     cl::desc("Maximum block frequency, expressed as a percentage of caller's "
              "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));
 
 static cl::opt<uint64_t> HotCallSiteRelFreq(
-    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
+    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
              "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));
 
@@ -117,14 +125,19 @@ static cl::opt<int> CallPenalty(
     "inline-call-penalty", cl::Hidden, cl::init(25),
     cl::desc("Call penalty that is applied per callsite when inlining"));
 
+static cl::opt<size_t>
+    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
+                       cl::init(std::numeric_limits<size_t>::max()),
+                       cl::desc("Do not inline functions with a stack size "
+                                "that exceeds the specified limit"));
+
 static cl::opt<bool> OptComputeFullInlineCost(
-    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
+    "inline-cost-full", cl::Hidden,
     cl::desc("Compute the full
inline cost of a call site even when the cost "
             "exceeds the threshold."));
 
 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
     "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
-    cl::ZeroOrMore,
     cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
              "attributes."));
 
@@ -132,33 +145,18 @@ static cl::opt<bool> DisableGEPConstOperand(
     "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
     cl::desc("Disables evaluation of GetElementPtr with constant operands"));
 
-namespace {
-class InlineCostCallAnalyzer;
-
-/// This function behaves more like CallBase::hasFnAttr: when it looks for the
-/// requested attribute, it check both the call instruction and the called
-/// function (if it's available and operand bundles don't prohibit that).
-Attribute getFnAttr(CallBase &CB, StringRef AttrKind) {
-  Attribute CallAttr = CB.getFnAttr(AttrKind);
-  if (CallAttr.isValid())
-    return CallAttr;
-
-  // Operand bundles override attributes on the called function, but don't
-  // override attributes directly present on the call instruction.
-  if (!CB.isFnAttrDisallowedByOpBundle(AttrKind))
-    if (const Function *F = CB.getCalledFunction())
-      return F->getFnAttribute(AttrKind);
-
-  return {};
-}
-
+namespace llvm {
 Optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
-  Attribute Attr = getFnAttr(CB, AttrKind);
+  Attribute Attr = CB.getFnAttr(AttrKind);
   int AttrValue;
   if (Attr.getValueAsString().getAsInteger(10, AttrValue))
     return None;
   return AttrValue;
 }
+} // namespace llvm
+
+namespace {
+class InlineCostCallAnalyzer;
 
 // This struct is used to store information about inline cost of a
 // particular instruction
@@ -198,7 +196,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   friend class InstVisitor<CallAnalyzer, bool>;
 
 protected:
-  virtual ~CallAnalyzer() {}
+  virtual ~CallAnalyzer() = default;
 
   /// The TargetTransformInfo available for this compilation.
   const TargetTransformInfo &TTI;
@@ -352,7 +350,7 @@ protected:
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
 
   /// Keep track of dead blocks due to the constant arguments.
-  SetVector<BasicBlock *> DeadBlocks;
+  SmallPtrSet<BasicBlock *, 16> DeadBlocks;
 
   /// The mapping of the blocks to their known unique successors due to the
   /// constant arguments.
@@ -385,8 +383,7 @@ protected:
   bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallBase &Call);
-  template <typename Callable>
-  bool simplifyInstruction(Instruction &I, Callable Evaluate);
+  bool simplifyInstruction(Instruction &I);
   bool simplifyIntrinsicCallIsConstant(CallBase &CB);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
@@ -704,7 +701,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       BlockFrequencyInfo *BFI = &(GetBFI(F));
       assert(BFI && "BFI must be available");
       auto ProfileCount = BFI->getBlockProfileCount(BB);
-      assert(ProfileCount.hasValue());
+      assert(ProfileCount);
       if (ProfileCount.getValue() == 0)
         ColdSize += Cost - CostAtBBStart;
     }
@@ -829,14 +826,14 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       }
 
       auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
-      assert(ProfileCount.hasValue());
+      assert(ProfileCount);
       CurrentSavings *= ProfileCount.getValue();
       CycleSavings += CurrentSavings;
     }
 
     // Compute the cycle savings per call.
auto EntryProfileCount = F.getEntryCount(); - assert(EntryProfileCount.hasValue() && EntryProfileCount->getCount()); + assert(EntryProfileCount && EntryProfileCount->getCount()); auto EntryCount = EntryProfileCount->getCount(); CycleSavings += EntryCount / 2; CycleSavings = CycleSavings.udiv(EntryCount); @@ -845,7 +842,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { auto *CallerBB = CandidateCall.getParent(); BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); CycleSavings += getCallsiteCost(this->CandidateCall, DL); - CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue(); + CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB); // Remove the cost of the cold basic blocks. int Size = Cost - ColdSize; @@ -904,13 +901,18 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { getStringFnAttrAsInt(CandidateCall, "function-inline-cost")) Cost = *AttrCost; + if (Optional AttrCostMult = getStringFnAttrAsInt( + CandidateCall, + InlineConstants::FunctionInlineCostMultiplierAttributeName)) + Cost *= *AttrCostMult; + if (Optional AttrThreshold = getStringFnAttrAsInt(CandidateCall, "function-inline-threshold")) Threshold = *AttrThreshold; if (auto Result = costBenefitAnalysis()) { DecidedByCostBenefit = true; - if (Result.getValue()) + if (*Result) return InlineResult::success(); else return InlineResult::failure("Cost over threshold."); @@ -978,6 +980,8 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { if (F.getCallingConv() == CallingConv::Cold) Cost += InlineConstants::ColdccPenalty; + LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n"); + // Check if we're done. This can happen due to bonuses and penalties. if (Cost >= Threshold && !ComputeFullInlineCost) return InlineResult::failure("high cost"); @@ -1002,7 +1006,7 @@ public: BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), Writer(this) { - AllowRecursiveCall = Params.AllowRecursiveCall.getValue(); + AllowRecursiveCall = *Params.AllowRecursiveCall; } /// Annotation Writer for instruction details @@ -1020,7 +1024,7 @@ public: return None; } - virtual ~InlineCostCallAnalyzer() {} + virtual ~InlineCostCallAnalyzer() = default; int getThreshold() const { return Threshold; } int getCost() const { return Cost; } Optional getCostBenefitPair() { return CostBenefit; } @@ -1203,6 +1207,10 @@ private: set(InlineCostFeatureIndex::ColdCcPenalty, (F.getCallingConv() == CallingConv::Cold)); + set(InlineCostFeatureIndex::LastCallToStaticBonus, + (F.hasLocalLinkage() && F.hasOneLiveUse() && + &F == CandidateCall.getCalledFunction())); + // FIXME: we shouldn't repeat this logic in both the Features and Cost // analyzer - instead, we should abstract it to a common method in the // CallAnalyzer @@ -1262,7 +1270,7 @@ void InlineCostAnnotationWriter::emitInstructionAnnot( auto C = ICCA->getSimplifiedValue(const_cast(I)); if (C) { OS << ", simplified to "; - C.getValue()->print(OS, true); + (*C)->print(OS, true); } OS << "\n"; } @@ -1501,13 +1509,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { }; if (!DisableGEPConstOperand) - if (simplifyInstruction(I, [&](SmallVectorImpl &COps) { - SmallVector Indices; - for (unsigned int Index = 1; Index < COps.size(); ++Index) - Indices.push_back(COps[Index]); - return ConstantExpr::getGetElementPtr( - I.getSourceElementType(), COps[0], Indices, I.isInBounds()); - })) + if (simplifyInstruction(I)) return true; if ((I.isInBounds() && canFoldInboundsGEP(I)) || 
IsGEPOffsetConstant(I)) {
@@ -1525,11 +1527,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
 }
 
 /// Simplify \p I if its operands are constants and update SimplifiedValues.
-/// \p Evaluate is a callable specific to instruction type that evaluates the
-/// instruction when all the operands are constants.
-template <typename Callable>
-bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
-  SmallVector<Constant *, 2> COps;
+bool CallAnalyzer::simplifyInstruction(Instruction &I) {
+  SmallVector<Constant *> COps;
   for (Value *Op : I.operands()) {
     Constant *COp = dyn_cast<Constant>(Op);
     if (!COp)
@@ -1538,7 +1537,7 @@ bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
       return false;
     COps.push_back(COp);
   }
-  auto *C = Evaluate(COps);
+  auto *C = ConstantFoldInstOperands(&I, COps, DL);
   if (!C)
     return false;
   SimplifiedValues[&I] = C;
@@ -1568,9 +1567,7 @@ bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
 
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
-  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
-        return ConstantExpr::getBitCast(COps[0], I.getType());
-      }))
+  if (simplifyInstruction(I))
     return true;
 
   // Track base/offsets through casts
@@ -1590,9 +1587,7 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
 
 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   // Propagate constants through ptrtoint.
-  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
-        return ConstantExpr::getPtrToInt(COps[0], I.getType());
-      }))
+  if (simplifyInstruction(I))
     return true;
 
   // Track base/offset pairs when converted to a plain integer provided the
@@ -1622,9 +1617,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
 
 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   // Propagate constants through inttoptr.
-  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
-        return ConstantExpr::getIntToPtr(COps[0], I.getType());
-      }))
+  if (simplifyInstruction(I))
     return true;
 
   // Track base/offset pairs when round-tripped through a pointer without
@@ -1647,9 +1640,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
   // Propagate constants through casts.
-  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
-        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
-      }))
+  if (simplifyInstruction(I))
     return true;
 
   // Disable SROA in the face of arbitrary casts we don't explicitly list
@@ -1855,7 +1846,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
       // current threshold, but AutoFDO + ThinLTO currently relies on this
       // behavior to prevent inlining of hot callsites during ThinLTO
       // compile phase.
-      Threshold = HotCallSiteThreshold.getValue();
+      Threshold = *HotCallSiteThreshold;
     } else if (isColdCallSite(Call, CallerBFI)) {
       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
       // Do not apply bonuses for a cold callsite including the
@@ -1906,9 +1897,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   // First try to handle simplified comparisons.
- if (simplifyInstruction(I, [&](SmallVectorImpl &COps) { - return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]); - })) + if (simplifyInstruction(I)) return true; if (I.getOpcode() == Instruction::FCmp) @@ -1984,11 +1973,11 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { Value *SimpleV = nullptr; if (auto FI = dyn_cast(&I)) - SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, + SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); else SimpleV = - SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); + simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); if (Constant *C = dyn_cast_or_null(SimpleV)) SimplifiedValues[&I] = C; @@ -2018,7 +2007,7 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) { if (!COp) COp = SimplifiedValues.lookup(Op); - Value *SimpleV = SimplifyFNegInst( + Value *SimpleV = simplifyFNegInst( COp ? COp : Op, cast(I).getFastMathFlags(), DL); if (Constant *C = dyn_cast_or_null(SimpleV)) @@ -2067,9 +2056,7 @@ bool CallAnalyzer::visitStore(StoreInst &I) { bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { // Constant folding for extract value is trivial. - if (simplifyInstruction(I, [&](SmallVectorImpl &COps) { - return ConstantExpr::getExtractValue(COps[0], I.getIndices()); - })) + if (simplifyInstruction(I)) return true; // SROA can't look through these, but they may be free. @@ -2078,11 +2065,7 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { // Constant folding for insert value is trivial. - if (simplifyInstruction(I, [&](SmallVectorImpl &COps) { - return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0], - /*InsertedValueOperand*/ COps[1], - I.getIndices()); - })) + if (simplifyInstruction(I)) return true; // SROA can't look through these, but they may be free. @@ -2136,14 +2119,14 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { if (isa(Call) && cast(Call).cannotDuplicate()) ContainsNoDuplicateCall = true; - Value *Callee = Call.getCalledOperand(); - Function *F = dyn_cast_or_null(Callee); + Function *F = Call.getCalledFunction(); bool IsIndirectCall = !F; if (IsIndirectCall) { // Check if this happens to be an indirect function call to a known function // in this inline context. If not, we've done all we can. + Value *Callee = Call.getCalledOperand(); F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); - if (!F) { + if (!F || F->getFunctionType() != Call.getFunctionType()) { onCallArgumentSetup(Call); if (!Call.onlyReadsMemory()) @@ -2552,7 +2535,7 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { NewDead.push_back(Succ); while (!NewDead.empty()) { BasicBlock *Dead = NewDead.pop_back_val(); - if (DeadBlocks.insert(Dead)) + if (DeadBlocks.insert(Dead).second) // Continue growing the dead block lists. for (BasicBlock *S : successors(Dead)) if (IsNewlyDead(S)) @@ -2707,6 +2690,11 @@ InlineResult CallAnalyzer::analyze() { if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) return InlineResult::failure("noduplicate"); + // If the callee's stack size exceeds the user-specified threshold, + // do not let it be inlined. + if (AllocatedSize > StackSizeThreshold) + return InlineResult::failure("stacksize"); + return finalizeAnalysis(); } @@ -2745,7 +2733,8 @@ static bool functionsHaveCompatibleAttributes( // object, and always returns the same object (which is overwritten on each // GetTLI call). 
Therefore we copy the first result. auto CalleeTLI = GetTLI(*Callee); - return TTI.areInlineCompatible(Caller, Callee) && + return (IgnoreTTIInlineCompatible || + TTI.areInlineCompatible(Caller, Callee)) && GetTLI(*Caller).areInlineCompatible(CalleeTLI, InlineCallerSupersetNoBuiltin) && AttributeFuncs::areInlineCompatible(*Caller, *Callee); @@ -2864,6 +2853,9 @@ Optional llvm::getAttributeBasedInliningDecision( // Calls to functions with always-inline attributes should be inlined // whenever possible. if (Call.hasFnAttr(Attribute::AlwaysInline)) { + if (Call.getAttributes().hasFnAttr(Attribute::NoInline)) + return InlineResult::failure("noinline call site attribute"); + auto IsViable = isInlineViable(*Callee); if (IsViable.isSuccess()) return InlineResult::success(); @@ -2911,7 +2903,7 @@ InlineCost llvm::getInlineCost( auto UserDecision = llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); - if (UserDecision.hasValue()) { + if (UserDecision) { if (UserDecision->isSuccess()) return llvm::InlineCost::getAlways("always inline attribute"); return llvm::InlineCost::getNever(UserDecision->getFailureReason()); diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp index a2e231e2d0f4..2371ecbba615 100644 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp @@ -15,33 +15,32 @@ #ifdef LLVM_HAVE_TF_API #include "llvm/Analysis/Utils/TFUtils.h" #endif +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +AnalysisKey InlineSizeEstimatorAnalysis::Key; + +#ifdef LLVM_HAVE_TF_API #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/PassManager.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" - #include #include -using namespace llvm; - -AnalysisKey InlineSizeEstimatorAnalysis::Key; - -#define DEBUG_TYPE "inline-size-estimator" - -#ifdef LLVM_HAVE_TF_API cl::opt TFIR2NativeModelPath( "ml-inliner-ir2native-model", cl::Hidden, cl::desc("Path to saved model evaluating native size from IR.")); +#define DEBUG_TYPE "inline-size-estimator" namespace { unsigned getMaxInstructionID() { #define LAST_OTHER_INST(NR) return NR; @@ -261,10 +260,10 @@ InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis( namespace llvm { class TFModelEvaluator {}; } // namespace llvm -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() {} +InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() = default; InlineSizeEstimatorAnalysis ::InlineSizeEstimatorAnalysis( InlineSizeEstimatorAnalysis &&) {} -InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() {} +InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() = default; InlineSizeEstimatorAnalysis::Result InlineSizeEstimatorAnalysis::run(const Function &F, FunctionAnalysisManager &FAM) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 4775340b3438..013e4d6489fa 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -20,7 +20,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" 
#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" @@ -36,13 +35,10 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Support/KnownBits.h" #include using namespace llvm; @@ -52,28 +48,30 @@ using namespace llvm::PatternMatch; enum { RecursionLimit = 3 }; -STATISTIC(NumExpand, "Number of expansions"); +STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumReassoc, "Number of reassociations"); -static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned); +static Value *simplifyAndInst(Value *, Value *, const SimplifyQuery &, + unsigned); static Value *simplifyUnOp(unsigned, Value *, const SimplifyQuery &, unsigned); static Value *simplifyFPUnOp(unsigned, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); -static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, +static Value *simplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); -static Value *SimplifyBinOp(unsigned, Value *, Value *, const FastMathFlags &, +static Value *simplifyBinOp(unsigned, Value *, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); -static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, +static Value *simplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); -static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse); -static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); -static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned); -static Value *SimplifyCastInst(unsigned, Value *, Type *, - const SimplifyQuery &, unsigned); -static Value *SimplifyGEPInst(Type *, Value *, ArrayRef, bool, +static Value *simplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); +static Value *simplifyXorInst(Value *, Value *, const SimplifyQuery &, + unsigned); +static Value *simplifyCastInst(unsigned, Value *, Type *, const SimplifyQuery &, + unsigned); +static Value *simplifyGEPInst(Type *, Value *, ArrayRef, bool, const SimplifyQuery &, unsigned); -static Value *SimplifySelectInst(Value *, Value *, Value *, +static Value *simplifySelectInst(Value *, Value *, Value *, const SimplifyQuery &, unsigned); static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, @@ -120,15 +118,11 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, /// For a boolean type or a vector of boolean type, return false or a vector /// with every element false. -static Constant *getFalse(Type *Ty) { - return ConstantInt::getFalse(Ty); -} +static Constant *getFalse(Type *Ty) { return ConstantInt::getFalse(Ty); } /// For a boolean type or a vector of boolean type, return true or a vector /// with every element true. -static Constant *getTrue(Type *Ty) { - return ConstantInt::getTrue(Ty); -} +static Constant *getTrue(Type *Ty) { return ConstantInt::getTrue(Ty); } /// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"? 
static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, @@ -141,7 +135,7 @@ static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, if (CPred == Pred && CLHS == LHS && CRHS == RHS) return true; return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS && - CRHS == LHS; + CRHS == LHS; } /// Simplify comparison with true or false branch of select: @@ -153,7 +147,7 @@ static Value *simplifyCmpSelCase(CmpInst::Predicate Pred, Value *LHS, Value *RHS, Value *Cond, const SimplifyQuery &Q, unsigned MaxRecurse, Constant *TrueOrFalse) { - Value *SimplifiedCmp = SimplifyCmpInst(Pred, LHS, RHS, Q, MaxRecurse); + Value *SimplifiedCmp = simplifyCmpInst(Pred, LHS, RHS, Q, MaxRecurse); if (SimplifiedCmp == Cond) { // %cmp simplified to the select condition (%cond). return TrueOrFalse; @@ -196,17 +190,17 @@ static Value *handleOtherCmpSelSimplifications(Value *TCmp, Value *FCmp, // checks whether folding it does not convert a well-defined value into // poison. if (match(FCmp, m_Zero()) && impliesPoison(TCmp, Cond)) - if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse)) + if (Value *V = simplifyAndInst(Cond, TCmp, Q, MaxRecurse)) return V; // If the true value simplified to true, then the result of the compare // is equal to "Cond || FCmp". if (match(TCmp, m_One()) && impliesPoison(FCmp, Cond)) - if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse)) + if (Value *V = simplifyOrInst(Cond, FCmp, Q, MaxRecurse)) return V; // Finally, if the false value simplified to true and the true value to // false, then the result of the compare is equal to "!Cond". if (match(FCmp, m_One()) && match(TCmp, m_Zero())) - if (Value *V = SimplifyXorInst( + if (Value *V = simplifyXorInst( Cond, Constant::getAllOnesValue(Cond->getType()), Q, MaxRecurse)) return V; return nullptr; @@ -248,12 +242,12 @@ static Value *expandBinOp(Instruction::BinaryOps Opcode, Value *V, if (!B || B->getOpcode() != OpcodeToExpand) return nullptr; Value *B0 = B->getOperand(0), *B1 = B->getOperand(1); - Value *L = SimplifyBinOp(Opcode, B0, OtherOp, Q.getWithoutUndef(), - MaxRecurse); + Value *L = + simplifyBinOp(Opcode, B0, OtherOp, Q.getWithoutUndef(), MaxRecurse); if (!L) return nullptr; - Value *R = SimplifyBinOp(Opcode, B1, OtherOp, Q.getWithoutUndef(), - MaxRecurse); + Value *R = + simplifyBinOp(Opcode, B1, OtherOp, Q.getWithoutUndef(), MaxRecurse); if (!R) return nullptr; @@ -265,7 +259,7 @@ static Value *expandBinOp(Instruction::BinaryOps Opcode, Value *V, } // Otherwise, return "L op' R" if it simplifies. - Value *S = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse); + Value *S = simplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse); if (!S) return nullptr; @@ -275,8 +269,8 @@ static Value *expandBinOp(Instruction::BinaryOps Opcode, Value *V, /// Try to simplify binops of form "A op (B op' C)" or the commuted variant by /// distributing op over op'. -static Value *expandCommutativeBinOp(Instruction::BinaryOps Opcode, - Value *L, Value *R, +static Value *expandCommutativeBinOp(Instruction::BinaryOps Opcode, Value *L, + Value *R, Instruction::BinaryOps OpcodeToExpand, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -293,7 +287,7 @@ static Value *expandCommutativeBinOp(Instruction::BinaryOps Opcode, /// Generic simplifications for associative binary operations. /// Returns the simpler value, or null if none was found. 
-static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, +static Value *simplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -313,12 +307,13 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *C = RHS; // Does "B op C" simplify? - if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { + if (Value *V = simplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // It does! Return "A op V" if it simplifies or is already available. // If V equals B then "A op V" is just the LHS. - if (V == B) return LHS; + if (V == B) + return LHS; // Otherwise return "A op V" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { + if (Value *W = simplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -332,12 +327,13 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *C = Op1->getOperand(1); // Does "A op B" simplify? - if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { + if (Value *V = simplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { // It does! Return "V op C" if it simplifies or is already available. // If V equals B then "V op C" is just the RHS. - if (V == B) return RHS; + if (V == B) + return RHS; // Otherwise return "V op C" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { + if (Value *W = simplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -355,12 +351,13 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *C = RHS; // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { + if (Value *V = simplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "V op B" if it simplifies or is already available. // If V equals A then "V op B" is just the LHS. - if (V == A) return LHS; + if (V == A) + return LHS; // Otherwise return "V op B" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { + if (Value *W = simplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -374,12 +371,13 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *C = Op1->getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { + if (Value *V = simplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "B op V" if it simplifies or is already available. // If V equals C then "B op V" is just the RHS. - if (V == C) return RHS; + if (V == C) + return RHS; // Otherwise return "B op V" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { + if (Value *W = simplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -393,7 +391,7 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, /// try to simplify the binop by seeing whether evaluating it on both branches /// of the select results in the same value. Returns the common value if so, /// otherwise returns null. -static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, +static Value *threadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. 
@@ -412,11 +410,11 @@ static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, Value *TV; Value *FV; if (SI == LHS) { - TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); - FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); + TV = simplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); + FV = simplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); } else { - TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); - FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); + TV = simplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); + FV = simplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); } // If they simplified to the same value, then return the common value. @@ -471,7 +469,7 @@ static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, /// We can simplify %cmp1 to true, because both branches of select are /// less than 3. We compose new comparison by substituting %tmp with both /// branches of select and see if it can be simplified. -static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, +static Value *threadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -517,7 +515,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, /// try to simplify the binop by seeing whether evaluating it on the incoming /// phi values yields the same result for every value. If so returns the common /// value, otherwise returns null. -static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, +static Value *threadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -542,10 +540,10 @@ static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, Value *CommonValue = nullptr; for (Value *Incoming : PI->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. - if (Incoming == PI) continue; - Value *V = PI == LHS ? - SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) : - SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse); + if (Incoming == PI) + continue; + Value *V = PI == LHS ? simplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) + : simplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) @@ -560,7 +558,7 @@ static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, /// comparison by seeing whether comparing with all of the incoming phi values /// yields the same result every time. If so returns the common result, /// otherwise returns null. -static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, +static Value *threadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -584,11 +582,12 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, Value *Incoming = PI->getIncomingValue(u); Instruction *InTI = PI->getIncomingBlock(u)->getTerminator(); // If the incoming value is the phi node itself, it can safely be skipped. 
- if (Incoming == PI) continue; + if (Incoming == PI) + continue; // Change the context instruction to the "edge" that flows into the phi. // This is important because that is where incoming is actually "evaluated" // even though it is used later somewhere else. - Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q.getWithInstruction(InTI), + Value *V = simplifyCmpInst(Pred, Incoming, RHS, Q.getWithInstruction(InTI), MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. @@ -604,8 +603,20 @@ static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, Value *&Op0, Value *&Op1, const SimplifyQuery &Q) { if (auto *CLHS = dyn_cast(Op0)) { - if (auto *CRHS = dyn_cast(Op1)) + if (auto *CRHS = dyn_cast(Op1)) { + switch (Opcode) { + default: + break; + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + if (Q.CxtI != nullptr) + return ConstantFoldFPInstOperands(Opcode, CLHS, CRHS, Q.DL, Q.CxtI); + } return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL); + } // Canonicalize the constant to the RHS if this is a commutative operation. if (Instruction::isCommutative(Opcode)) @@ -616,7 +627,7 @@ static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, /// Given operands for an Add, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, +static Value *simplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q)) return C; @@ -647,8 +658,7 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, // X + ~X -> -1 since ~X = -X-1 Type *Ty = Op0->getType(); - if (match(Op0, m_Not(m_Specific(Op1))) || - match(Op1, m_Not(m_Specific(Op0)))) + if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Ty); // add nsw/nuw (xor Y, signmask), signmask --> Y @@ -664,12 +674,12 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, /// i1 add -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) - if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) + if (Value *V = simplifyXorInst(Op0, Op1, Q, MaxRecurse - 1)) return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + simplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, MaxRecurse)) return V; // Threading Add over selects and phi nodes is pointless, so don't bother. @@ -684,45 +694,37 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, return nullptr; } -Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, +Value *llvm::simplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Query) { - return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit); + return ::simplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit); } /// Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and -/// accumulates the total constant offset applied in the returned constant. It -/// returns 0 if V is not a pointer, and returns the constant '0' if there are -/// no constant offsets applied. 
+/// accumulates the total constant offset applied in the returned constant. +/// It returns zero if there are no constant offsets applied. /// -/// This is very similar to GetPointerBaseWithConstantOffset except it doesn't -/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. -/// folding. -static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, - bool AllowNonInbounds = false) { +/// This is very similar to stripAndAccumulateConstantOffsets(), except it +/// normalizes the offset bitwidth to the stripped pointer type, not the +/// original pointer type. +static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, + bool AllowNonInbounds = false) { assert(V->getType()->isPtrOrPtrVectorTy()); APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType())); - V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds); // As that strip may trace through `addrspacecast`, need to sext or trunc // the offset calculated. - Type *IntIdxTy = DL.getIndexType(V->getType())->getScalarType(); - Offset = Offset.sextOrTrunc(IntIdxTy->getIntegerBitWidth()); - - Constant *OffsetIntPtr = ConstantInt::get(IntIdxTy, Offset); - if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) - return ConstantVector::getSplat(VecTy->getElementCount(), OffsetIntPtr); - return OffsetIntPtr; + return Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(V->getType())); } /// Compute the constant difference between two pointer values. /// If the difference is not a constant, returns zero. static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, Value *RHS) { - Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); - Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); + APInt LHSOffset = stripAndComputeConstantOffsets(DL, LHS); + APInt RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. @@ -733,12 +735,15 @@ static Value *computePointerDifference(const DataLayout &DL, Value *LHS, // LHS - RHS // = (LHSOffset + Base) - (RHSOffset + Base) // = LHSOffset - RHSOffset - return ConstantExpr::getSub(LHSOffset, RHSOffset); + Constant *Res = ConstantInt::get(LHS->getContext(), LHSOffset - RHSOffset); + if (auto *VecTy = dyn_cast<VectorType>(LHS->getType())) + Res = ConstantVector::getSplat(VecTy->getElementCount(), Res); + return Res; } /// Given operands for a Sub, see if we can fold the result. /// If not, this returns null. -static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, +static Value *simplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q)) return C; @@ -784,17 +789,17 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Value *X = nullptr, *Y = nullptr, *Z = Op1; if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z // See if "V === Y - Z" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse - 1)) // It does! Now see if "X + V" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) { + if (Value *W = simplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse - 1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies.
- if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse - 1)) // It does! Now see if "Y + V" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) { + if (Value *W = simplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse - 1)) { // It does, we successfully reassociated! ++NumReassoc; return W; @@ -806,17 +811,17 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, X = Op0; if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z) // See if "V === X - Y" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse - 1)) // It does! Now see if "V - Z" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) { + if (Value *W = simplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse - 1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse - 1)) // It does! Now see if "V - Y" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) { + if (Value *W = simplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse - 1)) { // It does, we successfully reassociated! ++NumReassoc; return W; @@ -828,9 +833,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Z = Op0; if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y) // See if "V === Z - X" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse - 1)) // It does! Now see if "V + Y" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) { + if (Value *W = simplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse - 1)) { // It does, we successfully reassociated! ++NumReassoc; return W; @@ -841,22 +846,21 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, match(Op1, m_Trunc(m_Value(Y)))) if (X->getType() == Y->getType()) // See if "V === X - Y" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) + if (Value *V = simplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse - 1)) // It does! Now see if "trunc V" simplifies. - if (Value *W = SimplifyCastInst(Instruction::Trunc, V, Op0->getType(), + if (Value *W = simplifyCastInst(Instruction::Trunc, V, Op0->getType(), Q, MaxRecurse - 1)) // It does, return the simplified "trunc V". return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). - if (match(Op0, m_PtrToInt(m_Value(X))) && - match(Op1, m_PtrToInt(m_Value(Y)))) + if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) - if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) + if (Value *V = simplifyXorInst(Op0, Op1, Q, MaxRecurse - 1)) return V; // Threading Sub over selects and phi nodes is pointless, so don't bother. 
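The computePointerDifference rewrite above hinges on the identity spelled out in its comment: once both pointers strip to the same base, (Base + LHSOffset) - (Base + RHSOffset) reduces to LHSOffset - RHSOffset. A standalone arithmetic check (plain integers standing in for APInt offsets; not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t Base : {0x1000LL, 0x2000LL})
    for (int64_t LHSOff : {0LL, 8LL, 24LL})
      for (int64_t RHSOff : {0LL, 16LL})
        // The base cancels, so only the constant offsets matter.
        assert((Base + LHSOff) - (Base + RHSOff) == LHSOff - RHSOff);
}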
@@ -871,14 +875,14 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, return nullptr; } -Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, +Value *llvm::simplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { - return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); + return ::simplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for a Mul, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q)) return C; @@ -906,12 +910,12 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // i1 mul -> and. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) - if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1)) + if (Value *V = simplifyAndInst(Op0, Op1, Q, MaxRecurse - 1)) return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + simplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. @@ -922,22 +926,22 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + threadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + threadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } -Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyMulInst(Op0, Op1, Q, RecursionLimit); } /// Check for common or similar folds of integer division or integer remainder. @@ -1026,7 +1030,7 @@ static Value *simplifyDivRem(Instruction::BinaryOps Opcode, Value *Op0, /// when we can prove a relationship between the operands. static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { - Value *V = SimplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); + Value *V = simplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); Constant *C = dyn_cast_or_null<Constant>(V); return (C && C->isAllOnesValue()); } @@ -1122,13 +1126,13 @@ static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; if (isDivZero(Op0, Op1, Q, MaxRecurse, IsSigned)) @@ -1164,13 +1168,13 @@ static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If X / Y == 0, then X % Y == X. @@ -1182,7 +1186,7 @@ static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, /// Given operands for an SDiv, see if we can fold the result. /// If not, this returns null. -static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If two operands are negated and no signed overflow, return -1. if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true)) @@ -1191,24 +1195,24 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse); } -Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifySDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a UDiv, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse); } -Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyUDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an SRem, see if we can fold the result. /// If not, this returns null. -static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If the divisor is 0, the result is undefined, so assume the divisor is -1.
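The SRem reasoning in the surrounding comments relies on two facts: sext of an i1 is always 0 or -1, and since srem by 0 is undefined the simplifier may assume the divisor is -1, where X srem -1 is 0 for every X. A quick standalone check of the -1 case (avoiding INT_MIN, where the C++ expression itself would overflow; not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {-100, -1, 0, 1, 7, 100})
    assert(X % -1 == 0); // remainder of division by -1 is always zero
}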
// srem Op0, (sext i1 X) --> srem Op0, -1 --> 0 @@ -1223,19 +1227,19 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse); } -Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifySRemInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a URem, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse); } -Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyURemInst(Op0, Op1, Q, RecursionLimit); } /// Returns true if a shift by \c Amount always yields poison. @@ -1268,7 +1272,7 @@ static bool isPoisonShift(Value *Amount, const SimplifyQuery &Q) { /// Given operands for an Shl, LShr or AShr, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, +static Value *simplifyShift(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, bool IsNSW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) @@ -1297,13 +1301,13 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If any bits in the shift amount make that value greater than or equal to @@ -1338,11 +1342,11 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, /// Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. -static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, - Value *Op1, bool isExact, const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *simplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, + Value *Op1, bool isExact, + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = - SimplifyShift(Opcode, Op0, Op1, /*IsNSW*/ false, Q, MaxRecurse)) + simplifyShift(Opcode, Op0, Op1, /*IsNSW*/ false, Q, MaxRecurse)) return V; // X >> X -> 0 @@ -1356,7 +1360,8 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, // The low bit cannot be shifted out of an exact shift if it is set.
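The exact-shift observation above can be checked in isolation: "exact" on a right shift asserts that no one-bits are shifted out, i.e. shifting back left reproduces the value, and if the low bit is set that can only hold for a zero shift amount. A standalone sketch of the underlying claim (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 1; X < 64; X += 2)   // values with the low bit set
    for (uint32_t Y = 0; Y < 6; ++Y)
      if (((X >> Y) << Y) == X)          // the "exact" guarantee
        assert(Y == 0 && (X >> Y) == X); // ...forces a zero shift amount
}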
if (isExact) { - KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + KnownBits Op0Known = + computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } @@ -1366,10 +1371,10 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, /// Given operands for an Shl, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, +static Value *simplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = - SimplifyShift(Instruction::Shl, Op0, Op1, isNSW, Q, MaxRecurse)) + simplifyShift(Instruction::Shl, Op0, Op1, isNSW, Q, MaxRecurse)) return V; // undef << X -> 0 @@ -1392,18 +1397,18 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, return nullptr; } -Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, +Value *llvm::simplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { - return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); + return ::simplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for an LShr, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, +static Value *simplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { - if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, + if (Value *V = simplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, MaxRecurse)) - return V; + return V; // (X << A) >> A -> X Value *X; @@ -1429,16 +1434,16 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, return nullptr; } -Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, +Value *llvm::simplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { - return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); + return ::simplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Given operands for an AShr, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, +static Value *simplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { - if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, + if (Value *V = simplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; @@ -1462,9 +1467,9 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, return nullptr; } -Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, +Value *llvm::simplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { - return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); + return ::simplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Commuted variants are assumed to be handled by calling this function again @@ -1581,7 +1586,7 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, /// with the parameters swapped. 
static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; - Value *A ,*B; + Value *A, *B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; @@ -1606,7 +1611,7 @@ static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { /// with the parameters swapped. static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; - Value *A ,*B; + Value *A, *B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; @@ -1812,6 +1817,27 @@ static Value *simplifyAndOrOfICmpsWithLimitConst(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; } +/// Try to simplify and/or of icmp with ctpop intrinsic. +static Value *simplifyAndOrOfICmpsWithCtpop(ICmpInst *Cmp0, ICmpInst *Cmp1, + bool IsAnd) { + ICmpInst::Predicate Pred0, Pred1; + Value *X; + const APInt *C; + if (!match(Cmp0, m_ICmp(Pred0, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)), + m_APInt(C))) || + !match(Cmp1, m_ICmp(Pred1, m_Specific(X), m_ZeroInt())) || C->isZero()) + return nullptr; + + // (ctpop(X) == C) || (X != 0) --> X != 0 where C > 0 + if (!IsAnd && Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_NE) + return Cmp1; + // (ctpop(X) != C) && (X == 0) --> X == 0 where C > 0 + if (IsAnd && Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_EQ) + return Cmp1; + + return nullptr; +} + static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1, const SimplifyQuery &Q) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true, Q)) @@ -1833,6 +1859,11 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1, if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true)) return X; + if (Value *X = simplifyAndOrOfICmpsWithCtpop(Op0, Op1, true)) + return X; + if (Value *X = simplifyAndOrOfICmpsWithCtpop(Op1, Op0, true)) + return X; + if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1, Q.IIQ)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0, Q.IIQ)) @@ -1909,6 +1940,11 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1, if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false)) return X; + if (Value *X = simplifyAndOrOfICmpsWithCtpop(Op0, Op1, false)) + return X; + if (Value *X = simplifyAndOrOfICmpsWithCtpop(Op1, Op0, false)) + return X; + if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1, Q.IIQ)) return X; if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0, Q.IIQ)) return X; @@ -1917,8 +1953,8 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1, return nullptr; } -static Value *simplifyAndOrOfFCmps(const TargetLibraryInfo *TLI, - FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { +static Value *simplifyAndOrOfFCmps(const TargetLibraryInfo *TLI, FCmpInst *LHS, + FCmpInst *RHS, bool IsAnd) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); if (LHS0->getType() != RHS0->getType()) return nullptr; @@ -1955,8 +1991,8 @@ static Value *simplifyAndOrOfFCmps(const TargetLibraryInfo *TLI, return nullptr; } -static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q, - Value *Op0, Value *Op1, bool IsAnd) { +static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q, Value *Op0, + Value *Op1, bool IsAnd) { // Look through casts of the 'and' operands to find compares.
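The new ctpop fold above rests on a simple implication: ctpop(X) == C with C != 0 forces X != 0, so OR-ing with X != 0 adds nothing, and dually for the AND form. A standalone check over all 8-bit values, using std::popcount (C++20) in place of the intrinsic (not part of the patch):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const int C = 3;                          // any non-zero constant
  for (unsigned v = 0; v < 256; ++v) {
    uint8_t X = (uint8_t)v;
    bool OrForm = (std::popcount(X) == C) || (X != 0);
    bool AndForm = (std::popcount(X) != C) && (X == 0);
    assert(OrForm == (X != 0));  // (ctpop(X)==C) | (X!=0) --> X!=0
    assert(AndForm == (X == 0)); // (ctpop(X)!=C) & (X==0) --> X==0
  }
}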
auto *Cast0 = dyn_cast<CastInst>(Op0); auto *Cast1 = dyn_cast<CastInst>(Op1); @@ -2017,7 +2053,7 @@ static Value *simplifyLogicOfAddSub(Value *Op0, Value *Op1, /// Given operands for an And, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q)) return C; @@ -2043,8 +2079,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Op0; // A & ~A = ~A & A = 0 - if (match(Op0, m_Not(m_Specific(Op1))) || - match(Op1, m_Not(m_Specific(Op0)))) + if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getNullValue(Op0->getType()); // (A | ?) & A = A @@ -2117,8 +2152,8 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + simplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. @@ -2142,16 +2177,16 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // If the operation is with the result of a select instruction, check // whether operating on either branch of the select always yields the same // value. - if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + threadBinOpOverSelect(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; } // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + threadBinOpOverPHI(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // Assuming the effective width of Y is not larger than A, i.e.
all bits @@ -2174,8 +2209,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned EffWidthY = YKnown.countMaxActiveBits(); if (EffWidthY <= ShftCnt) { - const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, - Q.DT); + const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned EffWidthX = XKnown.countMaxActiveBits(); const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY); const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt; @@ -2197,11 +2231,20 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_c_Xor(m_Specific(Or), m_Specific(Y)))) return Constant::getNullValue(Op0->getType()); + if (Op0->getType()->isIntOrIntVectorTy(1)) { + // Op0&Op1 -> Op0 where Op0 implies Op1 + if (isImpliedCondition(Op0, Op1, Q.DL).value_or(false)) + return Op0; + // Op0&Op1 -> Op1 where Op1 implies Op0 + if (isImpliedCondition(Op1, Op0, Q.DL).value_or(false)) + return Op1; + } + return nullptr; } -Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyAndInst(Op0, Op1, Q, RecursionLimit); } static Value *simplifyOrLogic(Value *X, Value *Y) { @@ -2289,7 +2332,7 @@ static Value *simplifyOrLogic(Value *X, Value *Y) { /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q)) return C; @@ -2334,6 +2377,31 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, } } + // A funnel shift (rotate) can be decomposed into simpler shifts. See if we + // are mixing in another shift that is redundant with the funnel shift. + + // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y + // (shl X, Y) | (fshl X, ?, Y) --> fshl X, ?, Y + if (match(Op0, + m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(), m_Value(Y))) && + match(Op1, m_Shl(m_Specific(X), m_Specific(Y)))) + return Op0; + if (match(Op1, + m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(), m_Value(Y))) && + match(Op0, m_Shl(m_Specific(X), m_Specific(Y)))) + return Op1; + + // (fshr ?, X, Y) | (lshr X, Y) --> fshr ?, X, Y + // (lshr X, Y) | (fshr ?, X, Y) --> fshr ?, X, Y + if (match(Op0, + m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X), m_Value(Y))) && + match(Op1, m_LShr(m_Specific(X), m_Specific(Y)))) + return Op0; + if (match(Op1, + m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X), m_Value(Y))) && + match(Op0, m_LShr(m_Specific(X), m_Specific(Y)))) + return Op1; + if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false)) return V; @@ -2346,8 +2414,8 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Op0; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + simplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this.
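The funnel-shift fold above follows from the intrinsic's definition, fshl(X, W, Y) = (X << Y) | (W >> (BitWidth - Y)) with Y taken modulo the bit width: the (X << Y) bits are already present in the funnel-shift result, so OR-ing them in again is a no-op. A standalone 8-bit check (fshl8 is an invented stand-in for the intrinsic; not part of the patch):

#include <cassert>
#include <cstdint>

static uint8_t fshl8(uint8_t X, uint8_t W, unsigned Y) {
  Y &= 7; // the intrinsic interprets the shift amount modulo the bit width
  return Y ? (uint8_t)((X << Y) | (W >> (8 - Y))) : X;
}

int main() {
  for (unsigned y = 0; y < 8; ++y)
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned w = 0; w < 256; w += 15) {
        uint8_t F = fshl8((uint8_t)x, (uint8_t)w, y);
        assert((uint8_t)(F | (uint8_t)(x << y)) == F); // shl adds no new bits
      }
}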
@@ -2366,8 +2434,8 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // If the operation is with the result of a select instruction, check // whether operating on either branch of the select always yields the same // value. - if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + threadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; } @@ -2389,8 +2457,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return A; } // Or commutes, try both ways. - if (C1->isMask() && - match(B, m_c_Add(m_Specific(A), m_Value(N)))) { + if (C1->isMask() && match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; @@ -2401,19 +2468,28 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) + if (Value *V = threadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; + if (Op0->getType()->isIntOrIntVectorTy(1)) { + // Op0|Op1 -> Op1 where Op0 implies Op1 + if (isImpliedCondition(Op0, Op1, Q.DL).value_or(false)) + return Op1; + // Op0|Op1 -> Op0 where Op1 implies Op0 + if (isImpliedCondition(Op1, Op0, Q.DL).value_or(false)) + return Op0; + } + return nullptr; } -Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyOrInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a Xor, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, +static Value *simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q)) return C; @@ -2435,8 +2511,7 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Constant::getNullValue(Op0->getType()); // A ^ ~A = ~A ^ A = -1 - if (match(Op0, m_Not(m_Specific(Op1))) || - match(Op1, m_Not(m_Specific(Op0)))) + if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); auto foldAndOrNot = [](Value *X, Value *Y) -> Value * { @@ -2467,8 +2542,8 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, - MaxRecurse)) + if (Value *V = + simplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, MaxRecurse)) return V; // Threading Xor over selects and phi nodes is pointless, so don't bother.
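The new i1 implication folds above (here for Or, with a matching pair added for And) follow directly from truth tables: when Op0 implies Op1, Op0 & Op1 collapses to Op0 and Op0 | Op1 collapses to Op1. A standalone check (not part of the patch):

#include <cassert>

int main() {
  for (bool A : {false, true})
    for (bool B : {false, true})
      if (!A || B) {            // "A implies B"
        assert((A && B) == A);  // A & B --> A
        assert((A || B) == B);  // A | B --> B
      }
}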
@@ -2483,19 +2558,18 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return nullptr; } -Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { - return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit); +Value *llvm::simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::simplifyXorInst(Op0, Op1, Q, RecursionLimit); } - -static Type *GetCompareTy(Value *Op) { +static Type *getCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } /// Rummage around inside V looking for something equivalent to the comparison /// "LHS Pred RHS". Return such a value if found, otherwise return null. /// Helper function for analyzing max/min idioms. -static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, +static Value *extractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast<SelectInst>(V); if (!SI) return nullptr; @@ -2512,6 +2586,70 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, return nullptr; } +/// Return true if the underlying object (storage) must be disjoint from +/// storage returned by any noalias return call. +static bool isAllocDisjoint(const Value *V) { + // For allocas, we consider only static ones (dynamic + // allocas might be transformed into calls to malloc not simultaneously + // live with the compared-to allocation). For globals, we exclude symbols + // that might be resolved lazily to symbols in another dynamically-loaded + // library (and, thus, could be malloc'ed by the implementation). + if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) + return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || + GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && + !GV->isThreadLocal(); + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + return false; +} + +/// Return true if V1 and V2 are each the base of some distinct storage region +/// [V, object_size(V)] which do not overlap. Note that zero sized regions +/// *are* possible, and that zero sized regions do not overlap with any other. +static bool haveNonOverlappingStorage(const Value *V1, const Value *V2) { + // Global variables always exist, so they always exist during the lifetime + // of each other and all allocas. Global variables themselves usually have + // non-overlapping storage, but since their addresses are constants, the + // case involving two globals does not reach here and is instead handled in + // constant folding. + // + // Two different allocas usually have different addresses... + // + // However, if there's an @llvm.stackrestore dynamically in between two + // allocas, they may have the same address. It's tempting to reduce the + // scope of the problem by only looking at *static* allocas here. That would + // cover the majority of allocas while significantly reducing the likelihood + // of having an @llvm.stackrestore pop up in the middle. However, it's not + // actually impossible for an @llvm.stackrestore to pop up in the middle of + // an entry block. Also, if we have a block that's not attached to a + // function, we can't tell if it's "static" under the current definition. + // Theoretically, this problem could be fixed by creating a new kind of + // instruction kind specifically for static allocas.
Such a new instruction + // could be required to be at the top of the entry block, thus preventing it + // from being subject to a @llvm.stackrestore. Instcombine could even + // convert regular allocas into these special allocas. It'd be nifty. + // However, until then, this problem remains open. + // + // So, we'll assume that two non-empty allocas have different addresses + // for now. + auto isByValArg = [](const Value *V) { + const Argument *A = dyn_cast<Argument>(V); + return A && A->hasByValAttr(); + }; + + // Byval args are backed by storage which does not overlap with each other, + // allocas, or globals. + if (isByValArg(V1)) + return isa<AllocaInst>(V2) || isa<GlobalVariable>(V2) || isByValArg(V2); + if (isByValArg(V2)) + return isa<AllocaInst>(V1) || isa<GlobalVariable>(V1) || isByValArg(V1); + + return isa<AllocaInst>(V1) && + (isa<AllocaInst>(V2) || isa<GlobalVariable>(V2)); +} + // A significant optimization not implemented here is assuming that alloca // addresses are not equal to incoming argument values. They don't *alias*, // as we say, but that doesn't mean they aren't equal, so we take a @@ -2540,9 +2678,8 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, // If the C and C++ standards are ever made sufficiently restrictive in this // area, it may be possible to update LLVM's semantics accordingly and reinstate // this optimization. -static Constant * -computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, - const SimplifyQuery &Q) { +static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, + Value *RHS, const SimplifyQuery &Q) { const DataLayout &DL = Q.DL; const TargetLibraryInfo *TLI = Q.TLI; const DominatorTree *DT = Q.DT; @@ -2557,8 +2694,7 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, if (isa<ConstantPointerNull>(RHS) && ICmpInst::isEquality(Pred) && llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr, IIQ.UseInstrInfo)) - return ConstantInt::get(GetCompareTy(LHS), - !CmpInst::isTrueWhenEqual(Pred)); + return ConstantInt::get(getCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // We can only fold certain predicates on pointer comparisons. switch (Pred) { @@ -2588,88 +2724,47 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // numerous hazards. AliasAnalysis and its utilities rely on special rules // governing loads and stores which don't apply to icmps. Also, AliasAnalysis // doesn't need to guarantee pointer inequality when it says NoAlias. - Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); - Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); + + // Even if a non-inbounds GEP occurs along the path we can still optimize + // equality comparisons concerning the result. + bool AllowNonInbounds = ICmpInst::isEquality(Pred); + APInt LHSOffset = stripAndComputeConstantOffsets(DL, LHS, AllowNonInbounds); + APInt RHSOffset = stripAndComputeConstantOffsets(DL, RHS, AllowNonInbounds); // If LHS and RHS are related via constant offsets to the same base // value, we can replace it with an icmp which just compares the offsets. if (LHS == RHS) - return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); + return ConstantInt::get(getCompareTy(LHS), + ICmpInst::compare(LHSOffset, RHSOffset, Pred)); // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { // Different non-empty allocations that exist at the same time have - // different addresses (if the program can tell). Global variables always - // exist, so they always exist during the lifetime of each other and all - // allocas.
Two different allocas usually have different addresses... - // - // However, if there's an @llvm.stackrestore dynamically in between two - // allocas, they may have the same address. It's tempting to reduce the - // scope of the problem by only looking at *static* allocas here. That would - // cover the majority of allocas while significantly reducing the likelihood - // of having an @llvm.stackrestore pop up in the middle. However, it's not - // actually impossible for an @llvm.stackrestore to pop up in the middle of - // an entry block. Also, if we have a block that's not attached to a - // function, we can't tell if it's "static" under the current definition. - // Theoretically, this problem could be fixed by creating a new kind of - // instruction kind specifically for static allocas. Such a new instruction - // could be required to be at the top of the entry block, thus preventing it - // from being subject to a @llvm.stackrestore. Instcombine could even - // convert regular allocas into these special allocas. It'd be nifty. - // However, until then, this problem remains open. - // - // So, we'll assume that two non-empty allocas have different addresses - // for now. - // - // With all that, if the offsets are within the bounds of their allocations - // (and not one-past-the-end! so we can't use inbounds!), and their - // allocations aren't the same, the pointers are not equal. - // - // Note that it's not necessary to check for LHS being a global variable - // address, due to canonicalization and constant folding. - if (isa<AllocaInst>(LHS) && - (isa<AllocaInst>(RHS) || isa<GlobalVariable>(RHS))) { - ConstantInt *LHSOffsetCI = dyn_cast<ConstantInt>(LHSOffset); - ConstantInt *RHSOffsetCI = dyn_cast<ConstantInt>(RHSOffset); + // different addresses (if the program can tell). If the offsets are + // within the bounds of their allocations (and not one-past-the-end! + // so we can't use inbounds!), and their allocations aren't the same, + // the pointers are not equal. if (haveNonOverlappingStorage(LHS, RHS)) { uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; - Opts.NullIsUnknownSize = - NullPointerIsDefined(cast<AllocaInst>(LHS)->getFunction()); - if (LHSOffsetCI && RHSOffsetCI && - getObjectSize(LHS, LHSSize, DL, TLI, Opts) && - getObjectSize(RHS, RHSSize, DL, TLI, Opts)) { - const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); - const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); - if (!LHSOffsetValue.isNegative() && - !RHSOffsetValue.isNegative() && - LHSOffsetValue.ult(LHSSize) && - RHSOffsetValue.ult(RHSSize)) { - return ConstantInt::get(GetCompareTy(LHS), - !CmpInst::isTrueWhenEqual(Pred)); - } - } - - // Repeat the above check but this time without depending on DataLayout - // or being able to compute a precise size. - if (!cast<PointerType>(LHS->getType())->isEmptyTy() && - !cast<PointerType>(RHS->getType())->isEmptyTy() && - LHSOffset->isNullValue() && - RHSOffset->isNullValue()) - return ConstantInt::get(GetCompareTy(LHS), + Opts.EvalMode = ObjectSizeOpts::Mode::Min; + auto *F = [](Value *V) -> Function * { + if (auto *I = dyn_cast<Instruction>(V)) + return I->getFunction(); + if (auto *A = dyn_cast<Argument>(V)) + return A->getParent(); + return nullptr; + }(LHS); + Opts.NullIsUnknownSize = F ?
NullPointerIsDefined(F) : true; + if (getObjectSize(LHS, LHSSize, DL, TLI, Opts) && + getObjectSize(RHS, RHSSize, DL, TLI, Opts) && + !LHSOffset.isNegative() && !RHSOffset.isNegative() && + LHSOffset.ult(LHSSize) && RHSOffset.ult(RHSSize)) { + return ConstantInt::get(getCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); + } } - // Even if an non-inbounds GEP occurs along the path we can still optimize - // equality comparisons concerning the result. We avoid walking the whole - // chain again by starting where the last calls to - // stripAndComputeConstantOffsets left off and accumulate the offsets. - Constant *LHSNoBound = stripAndComputeConstantOffsets(DL, LHS, true); - Constant *RHSNoBound = stripAndComputeConstantOffsets(DL, RHS, true); - if (LHS == RHS) - return ConstantExpr::getICmp(Pred, - ConstantExpr::getAdd(LHSOffset, LHSNoBound), - ConstantExpr::getAdd(RHSOffset, RHSNoBound)); - // If one side of the equality comparison must come from a noalias call // (meaning a system memory allocation function), and the other side must // come from a pointer that cannot overlap with dynamically-allocated @@ -2685,29 +2780,16 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, }; // Is the set of underlying objects all things which must be disjoint from - // noalias calls. For allocas, we consider only static ones (dynamic - // allocas might be transformed into calls to malloc not simultaneously - // live with the compared-to allocation). For globals, we exclude symbols - // that might be resolve lazily to symbols in another dynamically-loaded - // library (and, thus, could be malloc'ed by the implementation). + // noalias calls. We assume that indexing from such disjoint storage + // into the heap is undefined, and thus offsets can be safely ignored. auto IsAllocDisjoint = [](ArrayRef<const Value *> Objects) { - return all_of(Objects, [](const Value *V) { - if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || - GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && - !GV->isThreadLocal(); - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - return false; - }); + return all_of(Objects, ::isAllocDisjoint); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || (IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs))) - return ConstantInt::get(GetCompareTy(LHS), - !CmpInst::isTrueWhenEqual(Pred)); + return ConstantInt::get(getCompareTy(LHS), + !CmpInst::isTrueWhenEqual(Pred)); // Fold comparisons for non-escaping pointer even if the allocation call // cannot be elided. We cannot fold malloc comparison to null. Also, the @@ -2724,7 +2806,7 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // FIXME: We should also fold the compare when the pointer escapes, but the // compare dominates the pointer escape if (MI && !PointerMayBeCaptured(MI, true, true)) - return ConstantInt::get(GetCompareTy(LHS), + return ConstantInt::get(getCompareTy(LHS), CmpInst::isFalseWhenEqual(Pred)); } @@ -2735,7 +2817,7 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, /// Fold an icmp when its operands have i1 scalar type. static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { - Type *ITy = GetCompareTy(LHS); // The return type. + Type *ITy = getCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type.
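simplifyICmpOfBools above leans on the signed reading of i1, where true sign-extends to -1; under that reading, LHS >=s RHS coincides with "LHS implies RHS", exactly as the truth table in the code comments spells out. A standalone check with int modeling the sign-extended values (not part of the patch):

#include <cassert>

int main() {
  for (bool L : {false, true})
    for (bool R : {false, true}) {
      int LS = L ? -1 : 0, RS = R ? -1 : 0; // sext i1 -> int
      assert((LS >= RS) == (!L || R));      // sge on i1 == implication
    }
}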
if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; @@ -2773,7 +2855,8 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLE: // X <=s 0 -> true return getTrue(ITy); - default: break; + default: + break; } } else if (match(RHS, m_One())) { switch (Pred) { @@ -2797,7 +2880,8 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SGE: // X >=s -1 -> true return getTrue(ITy); - default: break; + default: + break; } } @@ -2805,7 +2889,7 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, default: break; case ICmpInst::ICMP_UGE: - if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false)) + if (isImpliedCondition(RHS, LHS, Q.DL).value_or(false)) return getTrue(ITy); break; case ICmpInst::ICMP_SGE: @@ -2816,11 +2900,11 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, /// 0 | 1 | 1 (0 >= -1) | 1 /// 1 | 0 | 0 (-1 >= 0) | 0 /// 1 | 1 | 1 (-1 >= -1) | 1 - if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) + if (isImpliedCondition(LHS, RHS, Q.DL).value_or(false)) return getTrue(ITy); break; case ICmpInst::ICMP_ULE: - if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) + if (isImpliedCondition(LHS, RHS, Q.DL).value_or(false)) return getTrue(ITy); break; } @@ -2834,7 +2918,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, if (!match(RHS, m_Zero())) return nullptr; - Type *ITy = GetCompareTy(LHS); // The return type. + Type *ITy = getCompareTy(LHS); // The return type. switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); @@ -2893,7 +2977,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const InstrInfoQuery &IIQ) { - Type *ITy = GetCompareTy(RHS); // The return type. + Type *ITy = getCompareTy(RHS); // The return type. Value *X; // Sign-bit checks can be optimized to true/false after unsigned @@ -2940,10 +3024,11 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -static Value *simplifyICmpWithBinOpOnLHS( - CmpInst::Predicate Pred, BinaryOperator *LBO, Value *RHS, - const SimplifyQuery &Q, unsigned MaxRecurse) { - Type *ITy = GetCompareTy(RHS); // The return type. +static Value *simplifyICmpWithBinOpOnLHS(CmpInst::Predicate Pred, + BinaryOperator *LBO, Value *RHS, + const SimplifyQuery &Q, + unsigned MaxRecurse) { + Type *ITy = getCompareTy(RHS); // The return type. Value *Y = nullptr; // icmp pred (or X, Y), X @@ -3078,7 +3163,6 @@ static Value *simplifyICmpWithBinOpOnLHS( return nullptr; } - // If only one of the icmp's operands has NSW flags, try to prove that: // // icmp slt (x + C1), (x +nsw C2) @@ -3113,7 +3197,6 @@ static bool trySimplifyICmpWithAdds(CmpInst::Predicate Pred, Value *LHS, (C2->slt(*C1) && C1->isNonPositive()); } - /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. @@ -3150,7 +3233,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. if ((A == RHS || B == RHS) && NoLHSWrapProblem) - if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A, + if (Value *V = simplifyICmpInst(Pred, A == RHS ? 
B : A, Constant::getNullValue(RHS->getType()), Q, MaxRecurse - 1)) return V; @@ -3158,7 +3241,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. if ((C == LHS || D == LHS) && NoRHSWrapProblem) if (Value *V = - SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), + simplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), C == LHS ? D : C, Q, MaxRecurse - 1)) return V; @@ -3186,7 +3269,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Y = A; Z = C; } - if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1)) + if (Value *V = simplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1)) return V; } } @@ -3206,15 +3289,15 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, if (match(RHS, m_APInt(C))) { if (C->isStrictlyPositive()) { if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_NE) - return ConstantInt::getTrue(GetCompareTy(RHS)); + return ConstantInt::getTrue(getCompareTy(RHS)); if (Pred == ICmpInst::ICMP_SGE || Pred == ICmpInst::ICMP_EQ) - return ConstantInt::getFalse(GetCompareTy(RHS)); + return ConstantInt::getFalse(getCompareTy(RHS)); } if (C->isNonNegative()) { if (Pred == ICmpInst::ICMP_SLE) - return ConstantInt::getTrue(GetCompareTy(RHS)); + return ConstantInt::getTrue(getCompareTy(RHS)); if (Pred == ICmpInst::ICMP_SGT) - return ConstantInt::getFalse(GetCompareTy(RHS)); + return ConstantInt::getFalse(getCompareTy(RHS)); } } } @@ -3237,9 +3320,9 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Q.IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(LBO)) || match(LHS, m_Shl(m_One(), m_Value())) || !C->isZero()) { if (Pred == ICmpInst::ICMP_EQ) - return ConstantInt::getFalse(GetCompareTy(RHS)); + return ConstantInt::getFalse(getCompareTy(RHS)); if (Pred == ICmpInst::ICMP_NE) - return ConstantInt::getTrue(GetCompareTy(RHS)); + return ConstantInt::getTrue(getCompareTy(RHS)); } } @@ -3248,9 +3331,9 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // (1 << X) <=u 0x8000 --> true if (match(LHS, m_Shl(m_One(), m_Value())) && match(RHS, m_SignMask())) { if (Pred == ICmpInst::ICMP_UGT) - return ConstantInt::getFalse(GetCompareTy(RHS)); + return ConstantInt::getFalse(getCompareTy(RHS)); if (Pred == ICmpInst::ICMP_ULE) - return ConstantInt::getTrue(GetCompareTy(RHS)); + return ConstantInt::getTrue(getCompareTy(RHS)); } if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() && @@ -3263,22 +3346,22 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, if (ICmpInst::isSigned(Pred) || !Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; - if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) - return V; + return V; break; case Instruction::SDiv: if (!ICmpInst::isEquality(Pred) || !Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; - if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::AShr: if (!Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; - if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; @@ -3289,7 +3372,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, break; if (!NSW &&
ICmpInst::isSigned(Pred)) break; - if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; @@ -3299,12 +3382,12 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -/// Simplify integer comparisons where at least one operand of the compare +/// simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { - Type *ITy = GetCompareTy(LHS); // The return type. + Type *ITy = getCompareTy(LHS); // The return type. Value *A, *B; CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE; CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B". @@ -3349,13 +3432,13 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. - if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) + if (Value *V = extractEquivalentCondition(LHS, EqP, A, B)) return V; - if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) + if (Value *V = extractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) + if (Value *V = simplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: @@ -3363,13 +3446,13 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. - if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) + if (Value *V = extractEquivalentCondition(LHS, InvEqP, A, B)) return V; - if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) + if (Value *V = extractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) + if (Value *V = simplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } @@ -3423,13 +3506,13 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_ULE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. - if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) + if (Value *V = extractEquivalentCondition(LHS, EqP, A, B)) return V; - if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) + if (Value *V = extractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) + if (Value *V = simplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: @@ -3437,13 +3520,13 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. 
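The min/max reasoning above reduces an equality like "A == smax(A, B)" to the relational fact "A >=s B" (and dually for min), which is why the helper can hand back the equivalent condition when one already exists. A standalone check over a small range (not part of the patch):

#include <algorithm>
#include <cassert>

int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B) {
      assert((A == std::max(A, B)) == (A >= B)); // A == smax(A,B) iff A >= B
      assert((A == std::min(A, B)) == (A <= B)); // A == smin(A,B) iff A <= B
    }
}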
- if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) + if (Value *V = extractEquivalentCondition(LHS, InvEqP, A, B)) return V; - if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) + if (Value *V = extractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) + if (Value *V = simplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } @@ -3499,11 +3582,10 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, continue; CallInst *Assume = cast<CallInst>(AssumeVH); - if (Optional<bool> Imp = - isImpliedCondition(Assume->getArgOperand(0), Predicate, LHS, RHS, - Q.DL)) + if (Optional<bool> Imp = isImpliedCondition(Assume->getArgOperand(0), + Predicate, LHS, RHS, Q.DL)) if (isValidAssumeForContext(Assume, Q.CxtI, Q.DT)) - return ConstantInt::get(GetCompareTy(LHS), *Imp); + return ConstantInt::get(getCompareTy(LHS), *Imp); } } @@ -3512,7 +3594,7 @@ static Value *simplifyICmpWithDominatingAssume(CmpInst::Predicate Predicate, /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); @@ -3527,7 +3609,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } assert(!isa<UndefValue>(LHS) && "Unexpected icmp undef,%X"); - Type *ITy = GetCompareTy(LHS); // The return type. + Type *ITy = getCompareTy(LHS); // The return type. // icmp poison, X -> poison if (isa<PoisonValue>(RHS)) return PoisonValue::get(ITy); @@ -3589,15 +3671,15 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, Q.DL.getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast<Constant>(RHS)) { // Transfer the cast to the constant. - if (Value *V = SimplifyICmpInst(Pred, SrcOp, + if (Value *V = simplifyICmpInst(Pred, SrcOp, ConstantExpr::getIntToPtr(RHSC, SrcTy), - Q, MaxRecurse-1)) + Q, MaxRecurse - 1)) return V; } else if (PtrToIntInst *RI = dyn_cast<PtrToIntInst>(RHS)) { if (RI->getOperand(0)->getType() == SrcTy) // Compare without the cast. - if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), - Q, MaxRecurse-1)) + if (Value *V = simplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, + MaxRecurse - 1)) return V; } } @@ -3608,9 +3690,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (ZExtInst *RI = dyn_cast<ZExtInst>(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that signed predicates become unsigned. - if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), - SrcOp, RI->getOperand(0), Q, - MaxRecurse-1)) + if (Value *V = + simplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, + RI->getOperand(0), Q, MaxRecurse - 1)) return V; } // Fold (zext X) ule (sext X), (zext X) sge (sext X) to true. @@ -3633,15 +3715,16 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the re-extended constant didn't change then this is effectively // also a case of comparing two zero-extended values.
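The zext handling above turns a compare of two zero-extended values into an unsigned compare of the narrow sources; signed predicates are safe to remap because zero-extended values are never negative. A standalone 8-to-32-bit check (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; y += 7) {
      int32_t ZX = (int32_t)(uint8_t)x, ZY = (int32_t)(uint8_t)y; // zext
      assert((ZX < ZY) == ((uint8_t)x < (uint8_t)y)); // slt becomes ult
    }
}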
if (RExt == CI && MaxRecurse) - if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), - SrcOp, Trunc, Q, MaxRecurse-1)) + if (Value *V = simplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), + SrcOp, Trunc, Q, MaxRecurse - 1)) return V; // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit // there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { - default: llvm_unreachable("Unknown ICmp predicate!"); + default: + llvm_unreachable("Unknown ICmp predicate!"); // LHS getValue().isNegative() ? - ConstantInt::getTrue(CI->getContext()) : - ConstantInt::getFalse(CI->getContext()); + return CI->getValue().isNegative() + ? ConstantInt::getTrue(CI->getContext()) + : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: - return CI->getValue().isNegative() ? - ConstantInt::getFalse(CI->getContext()) : - ConstantInt::getTrue(CI->getContext()); + return CI->getValue().isNegative() + ? ConstantInt::getFalse(CI->getContext()) + : ConstantInt::getTrue(CI->getContext()); } } } @@ -3677,8 +3760,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (SExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that the predicate does not change. - if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), - Q, MaxRecurse-1)) + if (Value *V = simplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, + MaxRecurse - 1)) return V; } // Fold (sext X) uge (zext X), (sext X) sle (zext X) to true. @@ -3701,14 +3784,16 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the re-extended constant didn't change then this is effectively // also a case of comparing two sign-extended values. if (RExt == CI && MaxRecurse) - if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1)) + if (Value *V = + simplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse - 1)) return V; // Otherwise the upper bits of LHS are all equal, while RHS has varying // bits there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { - default: llvm_unreachable("Unknown ICmp predicate!"); + default: + llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_EQ: return ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_NE: @@ -3718,14 +3803,14 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // LHS >s RHS. case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: - return CI->getValue().isNegative() ? - ConstantInt::getTrue(CI->getContext()) : - ConstantInt::getFalse(CI->getContext()); + return CI->getValue().isNegative() + ? ConstantInt::getTrue(CI->getContext()) + : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: - return CI->getValue().isNegative() ? - ConstantInt::getFalse(CI->getContext()) : - ConstantInt::getTrue(CI->getContext()); + return CI->getValue().isNegative() + ? ConstantInt::getFalse(CI->getContext()) + : ConstantInt::getTrue(CI->getContext()); // If LHS is non-negative then LHS u RHS. @@ -3733,18 +3818,18 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, case ICmpInst::ICMP_UGE: // Comparison is true iff the LHS =s 0. 
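// [Editor's note: illustrative sketch, not part of the patch.] The
// "re-extended constant didn't change" trick above: if a constant C survives
// trunc-then-zext unchanged, comparing (zext x) against C is the same as
// comparing x against trunc(C); if it does not survive, equality is decidable
// outright. Standalone check:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 200; // round-trips through uint8_t unchanged
  const uint8_t TruncC = static_cast<uint8_t>(C);
  for (int i = 0; i < 256; ++i) {
    uint8_t x = static_cast<uint8_t>(i);
    assert((static_cast<uint32_t>(x) == C) == (x == TruncC));
    assert(static_cast<uint32_t>(x) != 300u); // 300 does not round-trip:
                                              // equality is always false
  }
  return 0;
}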
if (MaxRecurse) - if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, - Constant::getNullValue(SrcTy), - Q, MaxRecurse-1)) + if (Value *V = simplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, + Constant::getNullValue(SrcTy), Q, + MaxRecurse - 1)) return V; break; } @@ -3788,26 +3873,26 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) - if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) + if (Value *V = threadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) - if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) + if (Value *V = threadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } -Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { - return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); + return ::simplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; @@ -3815,7 +3900,8 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) - return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); + return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI, + Q.CxtI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); @@ -3823,7 +3909,7 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, } // Fold trivial predicates. - Type *RetTy = GetCompareTy(LHS); + Type *RetTy = getCompareTy(LHS); if (Pred == FCmpInst::FCMP_FALSE) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_TRUE) @@ -3943,23 +4029,29 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, // The ordered relationship and minnum/maxnum guarantee that we do not // have NaN constants, so ordered/unordered preds are handled the same. 
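// [Editor's note: illustrative sketch, not part of the patch.] The
// threadCmpOverSelect rename above refers to the rewrite
// cmp (select c, x, y), z == select c, (cmp x, z), (cmp y, z); InstSimplify
// uses it only when comparing with either branch yields the same value, so
// the select disappears. Exhaustive check of the distribution law on a tiny
// domain:
#include <cassert>

int main() {
  for (int c = 0; c <= 1; ++c)
    for (int x = -2; x <= 2; ++x)
      for (int y = -2; y <= 2; ++y)
        for (int z = -2; z <= 2; ++z)
          assert(((c ? x : y) < z) == (c ? (x < z) : (y < z)));
  return 0;
}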
switch (Pred) { - case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_UEQ: + case FCmpInst::FCMP_OEQ: + case FCmpInst::FCMP_UEQ: // minnum(X, LesserC) == C --> false // maxnum(X, GreaterC) == C --> false return getFalse(RetTy); - case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_UNE: + case FCmpInst::FCMP_ONE: + case FCmpInst::FCMP_UNE: // minnum(X, LesserC) != C --> true // maxnum(X, GreaterC) != C --> true return getTrue(RetTy); - case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_UGE: - case FCmpInst::FCMP_OGT: case FCmpInst::FCMP_UGT: + case FCmpInst::FCMP_OGE: + case FCmpInst::FCMP_UGE: + case FCmpInst::FCMP_OGT: + case FCmpInst::FCMP_UGT: // minnum(X, LesserC) >= C --> false // minnum(X, LesserC) > C --> false // maxnum(X, GreaterC) >= C --> true // maxnum(X, GreaterC) > C --> true return ConstantInt::get(RetTy, IsMaxNum); - case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_ULE: - case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_ULT: + case FCmpInst::FCMP_OLE: + case FCmpInst::FCMP_ULE: + case FCmpInst::FCMP_OLT: + case FCmpInst::FCMP_ULT: // minnum(X, LesserC) <= C --> true // minnum(X, LesserC) < C --> true // maxnum(X, GreaterC) <= C --> false @@ -3997,21 +4089,21 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) - if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) + if (Value *V = threadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) - if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) + if (Value *V = threadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } -Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { - return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); + return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, @@ -4078,22 +4170,21 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, }; if (auto *B = dyn_cast(I)) - return PreventSelfSimplify(SimplifyBinOp(B->getOpcode(), NewOps[0], + return PreventSelfSimplify(simplifyBinOp(B->getOpcode(), NewOps[0], NewOps[1], Q, MaxRecurse - 1)); if (CmpInst *C = dyn_cast(I)) - return PreventSelfSimplify(SimplifyCmpInst(C->getPredicate(), NewOps[0], + return PreventSelfSimplify(simplifyCmpInst(C->getPredicate(), NewOps[0], NewOps[1], Q, MaxRecurse - 1)); if (auto *GEP = dyn_cast(I)) - return PreventSelfSimplify(SimplifyGEPInst( + return PreventSelfSimplify(simplifyGEPInst( GEP->getSourceElementType(), NewOps[0], makeArrayRef(NewOps).slice(1), GEP->isInBounds(), Q, MaxRecurse - 1)); if (isa(I)) - return PreventSelfSimplify( - SimplifySelectInst(NewOps[0], NewOps[1], NewOps[2], Q, - MaxRecurse - 1)); + return PreventSelfSimplify(simplifySelectInst( + NewOps[0], NewOps[1], NewOps[2], Q, MaxRecurse - 1)); // TODO: We could hand off more cases to instsimplify here. 
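// [Editor's note: illustrative sketch, not part of the patch.] The
// minnum/maxnum-vs-constant table in the switch above, checked standalone
// with std::fmin (minnum semantics for non-NaN inputs) and LesserC < C:
#include <cassert>
#include <cmath>

int main() {
  const double C = 1.0, LesserC = 0.5;
  for (double x = -2.0; x <= 2.0; x += 0.25) { // 0.25 is exact in binary
    double m = std::fmin(x, LesserC);          // minnum(X, LesserC)
    assert(!(m == C));             // minnum(X, LesserC) == C --> false
    assert(m != C);                // minnum(X, LesserC) != C --> true
    assert(!(m >= C) && !(m > C)); // >=, > --> false
    assert(m <= C && m < C);       // <=, < --> true
  }
  return 0;
}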
} @@ -4119,14 +4210,6 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (!AllowRefinement && canCreatePoison(cast(I))) return nullptr; - if (CmpInst *C = dyn_cast(I)) - return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], - ConstOps[1], Q.DL, Q.TLI); - - if (LoadInst *LI = dyn_cast(I)) - if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL); - return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); } @@ -4189,7 +4272,8 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, /// Try to simplify a select instruction when its condition operand is an /// integer comparison. static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, - Value *FalseVal, const SimplifyQuery &Q, + Value *FalseVal, + const SimplifyQuery &Q, unsigned MaxRecurse) { ICmpInst::Predicate Pred; Value *CmpLHS, *CmpRHS; @@ -4209,7 +4293,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, Value *X, *Y; SelectPatternFlavor SPF = matchDecomposedSelectPattern(cast(CondVal), TrueVal, FalseVal, - X, Y).Flavor; + X, Y) + .Flavor; if (SelectPatternResult::isMinOrMax(SPF) && Pred == getMinMaxPred(SPF)) { APInt LimitC = getMinMaxLimit(getInverseMinMaxFlavor(SPF), X->getType()->getScalarSizeInBits()); @@ -4261,8 +4346,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, } // Check for other compares that behave like bit test. - if (Value *V = simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, - TrueVal, FalseVal)) + if (Value *V = + simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, TrueVal, FalseVal)) return V; // If we have a scalar equality comparison, then we know the value in one of @@ -4272,18 +4357,18 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, // because each element of a vector select is chosen independently. if (Pred == ICmpInst::ICMP_EQ && !CondVal->getType()->isVectorTy()) { if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, - /* AllowRefinement */ false, MaxRecurse) == - TrueVal || + /* AllowRefinement */ false, + MaxRecurse) == TrueVal || simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, - /* AllowRefinement */ false, MaxRecurse) == - TrueVal) + /* AllowRefinement */ false, + MaxRecurse) == TrueVal) return FalseVal; if (simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, - /* AllowRefinement */ true, MaxRecurse) == - FalseVal || + /* AllowRefinement */ true, + MaxRecurse) == FalseVal || simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, - /* AllowRefinement */ true, MaxRecurse) == - FalseVal) + /* AllowRefinement */ true, + MaxRecurse) == FalseVal) return FalseVal; } @@ -4302,11 +4387,11 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, // This transform is safe if we do not have (do not care about) -0.0 or if // at least one operand is known to not be -0.0. Otherwise, the select can // change the sign of a zero operand. - bool HasNoSignedZeros = Q.CxtI && isa(Q.CxtI) && - Q.CxtI->hasNoSignedZeros(); + bool HasNoSignedZeros = + Q.CxtI && isa(Q.CxtI) && Q.CxtI->hasNoSignedZeros(); const APFloat *C; if (HasNoSignedZeros || (match(T, m_APFloat(C)) && C->isNonZero()) || - (match(F, m_APFloat(C)) && C->isNonZero())) { + (match(F, m_APFloat(C)) && C->isNonZero())) { // (T == F) ? T : F --> F // (F == T) ? 
T : F --> F if (Pred == FCmpInst::FCMP_OEQ) @@ -4323,7 +4408,7 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, +static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *CondC = dyn_cast(Cond)) { if (auto *TrueC = dyn_cast(TrueVal)) @@ -4439,14 +4524,14 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, return nullptr; } -Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, +Value *llvm::simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q) { - return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); + return ::simplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); } /// Given operands for an GetElementPtrInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr, +static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef Indices, bool InBounds, const SimplifyQuery &Q, unsigned) { // The type of the GEP pointer operand. @@ -4473,6 +4558,13 @@ static Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr, } } + // For opaque pointers an all-zero GEP is a no-op. For typed pointers, + // it may be equivalent to a bitcast. + if (Ptr->getType()->getScalarType()->isOpaquePointerTy() && + Ptr->getType() == GEPTy && + all_of(Indices, [](const auto *V) { return match(V, m_Zero()); })) + return Ptr; + // getelementptr poison, idx -> poison // getelementptr baseptr, poison -> poison if (isa(Ptr) || @@ -4577,16 +4669,16 @@ static Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr, return ConstantFoldConstant(CE, Q.DL); } -Value *llvm::SimplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef Indices, +Value *llvm::simplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef Indices, bool InBounds, const SimplifyQuery &Q) { - return ::SimplifyGEPInst(SrcTy, Ptr, Indices, InBounds, Q, RecursionLimit); + return ::simplifyGEPInst(SrcTy, Ptr, Indices, InBounds, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, - ArrayRef Idxs, const SimplifyQuery &Q, - unsigned) { +static Value *simplifyInsertValueInst(Value *Agg, Value *Val, + ArrayRef Idxs, + const SimplifyQuery &Q, unsigned) { if (Constant *CAgg = dyn_cast(Agg)) if (Constant *CVal = dyn_cast(Val)) return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs); @@ -4611,13 +4703,13 @@ static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, return nullptr; } -Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, +Value *llvm::simplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q) { - return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); + return ::simplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); } -Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, +Value *llvm::simplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, const SimplifyQuery &Q) { // Try to constant fold. auto *VecC = dyn_cast(Vec); @@ -4654,7 +4746,7 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, /// Given operands for an ExtractValueInst, see if we can fold the result. /// If not, this returns null. 
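// [Editor's note: illustrative sketch, not part of the patch.] The new
// opaque-pointer fold above: a GEP whose indices are all zero adds offset 0,
// and with opaque pointers there is no pointee type left to reinterpret, so
// the GEP is just its base pointer. The address-level fact it relies on:
#include <cassert>

int main() {
  int arr[4][5] = {};
  // All-zero indexing computes offset 0 from the base address. With typed
  // pointers the same GEP could still change the pointee type (a bitcast),
  // which is why the fold also requires Ptr->getType() == GEPTy.
  assert(static_cast<void *>(&arr[0][0]) == static_cast<void *>(arr));
  return 0;
}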
-static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, +static Value *simplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &, unsigned) { if (auto *CAgg = dyn_cast(Agg)) return ConstantFoldExtractValueInstruction(CAgg, Idxs); @@ -4677,14 +4769,14 @@ static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, return nullptr; } -Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, +Value *llvm::simplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &Q) { - return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); + return ::simplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); } /// Given operands for an ExtractElementInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, +static Value *simplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q, unsigned) { auto *VecVTy = cast(Vec->getType()); if (auto *CVec = dyn_cast(Vec)) { @@ -4721,13 +4813,13 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, return nullptr; } -Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, +Value *llvm::simplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q) { - return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); + return ::simplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); } /// See if we can fold the given phi. If not, returns null. -static Value *SimplifyPHINode(PHINode *PN, ArrayRef IncomingValues, +static Value *simplifyPHINode(PHINode *PN, ArrayRef IncomingValues, const SimplifyQuery &Q) { // WARNING: no matter how worthwhile it may seem, we can not perform PHI CSE // here, because the PHI we may succeed simplifying to was not @@ -4739,14 +4831,15 @@ static Value *SimplifyPHINode(PHINode *PN, ArrayRef IncomingValues, bool HasUndefInput = false; for (Value *Incoming : IncomingValues) { // If the incoming value is the phi node itself, it can safely be skipped. - if (Incoming == PN) continue; + if (Incoming == PN) + continue; if (Q.isUndefValue(Incoming)) { // Remember that we saw an undef value, but otherwise ignore them. HasUndefInput = true; continue; } if (CommonValue && Incoming != CommonValue) - return nullptr; // Not the same, bail out. + return nullptr; // Not the same, bail out. CommonValue = Incoming; } @@ -4755,17 +4848,24 @@ static Value *SimplifyPHINode(PHINode *PN, ArrayRef IncomingValues, if (!CommonValue) return UndefValue::get(PN->getType()); - // If we have a PHI node like phi(X, undef, X), where X is defined by some - // instruction, we cannot return X as the result of the PHI node unless it - // dominates the PHI block. - if (HasUndefInput) + if (HasUndefInput) { + // We cannot start executing a trapping constant expression on more control + // flow paths. + auto *C = dyn_cast(CommonValue); + if (C && C->canTrap()) + return nullptr; + + // If we have a PHI node like phi(X, undef, X), where X is defined by some + // instruction, we cannot return X as the result of the PHI node unless it + // dominates the PHI block. return valueDominatesPHI(CommonValue, PN, Q.DT) ? 
CommonValue : nullptr; + } return CommonValue; } -static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, - Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) { +static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, + const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *C = dyn_cast(Op)) return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL); @@ -4798,9 +4898,9 @@ static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, return nullptr; } -Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, +Value *llvm::simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q) { - return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); + return ::simplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); } /// For the given destination element of a shuffle, peek through shuffles to @@ -4854,7 +4954,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, return RootVec; } -static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, +static Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef Mask, Type *RetTy, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -4970,14 +5070,14 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, } /// Given operands for a ShuffleVectorInst, fold the result or return null. -Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, +Value *llvm::simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef Mask, Type *RetTy, const SimplifyQuery &Q) { - return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); + return ::simplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); } -static Constant *foldConstant(Instruction::UnaryOps Opcode, - Value *&Op, const SimplifyQuery &Q) { +static Constant *foldConstant(Instruction::UnaryOps Opcode, Value *&Op, + const SimplifyQuery &Q) { if (auto *C = dyn_cast(Op)) return ConstantFoldUnaryOpOperand(Opcode, C, Q.DL); return nullptr; @@ -4998,7 +5098,7 @@ static Value *simplifyFNegInst(Value *Op, FastMathFlags FMF, return nullptr; } -Value *llvm::SimplifyFNegInst(Value *Op, FastMathFlags FMF, +Value *llvm::simplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q) { return ::simplifyFNegInst(Op, FMF, Q, RecursionLimit); } @@ -5049,15 +5149,10 @@ static Constant *simplifyFPOp(ArrayRef Ops, FastMathFlags FMF, return nullptr; } -// TODO: Move this out to a header file: -static inline bool canIgnoreSNaN(fp::ExceptionBehavior EB, FastMathFlags FMF) { - return (EB == fp::ebIgnore || FMF.noNaNs()); -} - /// Given operands for an FAdd, see if we can fold the result. If not, this /// returns null. static Value * -SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, +simplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven) { @@ -5119,7 +5214,7 @@ SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, /// Given operands for an FSub, see if we can fold the result. If not, this /// returns null. 
static Value * -SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, +simplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven) { @@ -5130,24 +5225,28 @@ SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (Constant *C = simplifyFPOp({Op0, Op1}, FMF, Q, ExBehavior, Rounding)) return C; - if (!isDefaultFPEnvironment(ExBehavior, Rounding)) - return nullptr; - // fsub X, +0 ==> X - if (match(Op1, m_PosZeroFP())) - return Op0; + if (canIgnoreSNaN(ExBehavior, FMF) && + (!canRoundingModeBe(Rounding, RoundingMode::TowardNegative) || + FMF.noSignedZeros())) + if (match(Op1, m_PosZeroFP())) + return Op0; // fsub X, -0 ==> X, when we know X is not -0 - if (match(Op1, m_NegZeroFP()) && - (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) - return Op0; + if (canIgnoreSNaN(ExBehavior, FMF)) + if (match(Op1, m_NegZeroFP()) && + (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) + return Op0; // fsub -0.0, (fsub -0.0, X) ==> X // fsub -0.0, (fneg X) ==> X Value *X; - if (match(Op0, m_NegZeroFP()) && - match(Op1, m_FNeg(m_Value(X)))) - return X; + if (canIgnoreSNaN(ExBehavior, FMF)) + if (match(Op0, m_NegZeroFP()) && match(Op1, m_FNeg(m_Value(X)))) + return X; + + if (!isDefaultFPEnvironment(ExBehavior, Rounding)) + return nullptr; // fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored. // fsub 0.0, (fneg X) ==> X if signed zeros are ignored. @@ -5170,7 +5269,7 @@ SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, return nullptr; } -static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, +static Value *simplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { @@ -5201,8 +5300,8 @@ static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, // 2. Ignore non-zero negative numbers because sqrt would produce NAN. // 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0. Value *X; - if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X))) && - FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros()) + if (Op0 == Op1 && match(Op0, m_Sqrt(m_Value(X))) && FMF.allowReassoc() && + FMF.noNaNs() && FMF.noSignedZeros()) return X; return nullptr; @@ -5210,7 +5309,7 @@ static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, /// Given the operands for an FMul, see if we can fold the result static Value * -SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, +simplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven) { @@ -5219,43 +5318,43 @@ SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, return C; // Now apply simplifications that do not require rounding. 
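// [Editor's note: illustrative sketch, not part of the patch.] Why the fsub
// hunk above now gates "fsub X, +0 -> X" on the rounding mode: an exactly
// zero difference rounds to +0.0 in every mode except round-toward-negative,
// where it is -0.0, so the fold is wrong for X == +0.0 under FE_DOWNWARD
// unless signed zeros may be ignored. Standalone demo (may need
// -frounding-math so the compiler respects the runtime rounding mode):
#include <cassert>
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  volatile double x = 0.0, zero = 0.0; // volatile blocks constant folding
  std::fesetround(FE_DOWNWARD);
  double r = x - zero;                 // IEEE 754: exact zero rounds to -0.0
  std::fesetround(FE_TONEAREST);
  std::printf("0.0 - 0.0 under FE_DOWNWARD = %g, signbit = %d\n", r,
              (int)std::signbit(r));
  assert(std::signbit(r)); // r is -0.0, not the original +0.0 operand
  return 0;
}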
- return SimplifyFMAFMul(Op0, Op1, FMF, Q, MaxRecurse, ExBehavior, Rounding); + return simplifyFMAFMul(Op0, Op1, FMF, Q, MaxRecurse, ExBehavior, Rounding); } -Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } -Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } -Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } -Value *llvm::SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFMAFMul(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFMAFMul(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } static Value * -SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, +simplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven) { @@ -5301,16 +5400,16 @@ SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, return nullptr; } -Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } static Value * -SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, +simplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned, fp::ExceptionBehavior ExBehavior = fp::ebIgnore, RoundingMode Rounding = RoundingMode::NearestTiesToEven) { @@ -5339,11 +5438,11 @@ SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, return nullptr; } -Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, +Value *llvm::simplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, fp::ExceptionBehavior ExBehavior, RoundingMode Rounding) { - return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, + return ::simplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit, ExBehavior, Rounding); } @@ -5365,8 +5464,8 @@ static Value *simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q, /// If not, this returns null. /// Try to use FastMathFlags when folding the result. 
static Value *simplifyFPUnOp(unsigned Opcode, Value *Op, - const FastMathFlags &FMF, - const SimplifyQuery &Q, unsigned MaxRecurse) { + const FastMathFlags &FMF, const SimplifyQuery &Q, + unsigned MaxRecurse) { switch (Opcode) { case Instruction::FNeg: return simplifyFNegInst(Op, FMF, Q, MaxRecurse); @@ -5375,56 +5474,56 @@ static Value *simplifyFPUnOp(unsigned Opcode, Value *Op, } } -Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q) { +Value *llvm::simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q) { return ::simplifyUnOp(Opcode, Op, Q, RecursionLimit); } -Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, +Value *llvm::simplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, const SimplifyQuery &Q) { return ::simplifyFPUnOp(Opcode, Op, FMF, Q, RecursionLimit); } /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +static Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: - return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); + return simplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Sub: - return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse); + return simplifySubInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Mul: - return SimplifyMulInst(LHS, RHS, Q, MaxRecurse); + return simplifyMulInst(LHS, RHS, Q, MaxRecurse); case Instruction::SDiv: - return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); + return simplifySDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::UDiv: - return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); + return simplifyUDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::SRem: - return SimplifySRemInst(LHS, RHS, Q, MaxRecurse); + return simplifySRemInst(LHS, RHS, Q, MaxRecurse); case Instruction::URem: - return SimplifyURemInst(LHS, RHS, Q, MaxRecurse); + return simplifyURemInst(LHS, RHS, Q, MaxRecurse); case Instruction::Shl: - return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse); + return simplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::LShr: - return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse); + return simplifyLShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::AShr: - return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse); + return simplifyAShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::And: - return SimplifyAndInst(LHS, RHS, Q, MaxRecurse); + return simplifyAndInst(LHS, RHS, Q, MaxRecurse); case Instruction::Or: - return SimplifyOrInst(LHS, RHS, Q, MaxRecurse); + return simplifyOrInst(LHS, RHS, Q, MaxRecurse); case Instruction::Xor: - return SimplifyXorInst(LHS, RHS, Q, MaxRecurse); + return simplifyXorInst(LHS, RHS, Q, MaxRecurse); case Instruction::FAdd: - return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FSub: - return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FMul: - return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FDiv: - return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case 
Instruction::FRem: - return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); default: llvm_unreachable("Unexpected opcode"); } @@ -5433,49 +5532,50 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. /// Try to use FastMathFlags when folding the result. -static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +static Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: - return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); + return simplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FSub: - return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse); + return simplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FMul: - return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse); + return simplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FDiv: - return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse); + return simplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse); default: - return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse); + return simplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse); } } -Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +Value *llvm::simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q) { - return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); + return ::simplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); } -Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +Value *llvm::simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { - return ::SimplifyBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); + return ::simplifyBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); } /// Given operands for a CmpInst, see if we can fold the result. 
-static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) - return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); - return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); + return simplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); + return simplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); } -Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { - return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); + return ::simplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } -static bool IsIdempotent(Intrinsic::ID ID) { +static bool isIdempotent(Intrinsic::ID ID) { switch (ID) { - default: return false; + default: + return false; // Unary idempotent: f(f(x)) = f(x) case Intrinsic::fabs: @@ -5491,7 +5591,7 @@ static bool IsIdempotent(Intrinsic::ID ID) { } } -static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset, +static Value *simplifyRelativeLoad(Constant *Ptr, Constant *Offset, const DataLayout &DL) { GlobalValue *PtrSym; APInt PtrOffset; @@ -5551,7 +5651,7 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, const SimplifyQuery &Q) { // Idempotent functions return the same result when called repeatedly. Intrinsic::ID IID = F->getIntrinsicID(); - if (IsIdempotent(IID)) + if (isIdempotent(IID)) if (auto *II = dyn_cast(Op0)) if (II->getIntrinsicID() == IID) return II; @@ -5559,15 +5659,18 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, Value *X; switch (IID) { case Intrinsic::fabs: - if (SignBitMustBeZero(Op0, Q.TLI)) return Op0; + if (SignBitMustBeZero(Op0, Q.TLI)) + return Op0; break; case Intrinsic::bswap: // bswap(bswap(x)) -> x - if (match(Op0, m_BSwap(m_Value(X)))) return X; + if (match(Op0, m_BSwap(m_Value(X)))) + return X; break; case Intrinsic::bitreverse: // bitreverse(bitreverse(x)) -> x - if (match(Op0, m_BitReverse(m_Value(X)))) return X; + if (match(Op0, m_BitReverse(m_Value(X)))) + return X; break; case Intrinsic::ctpop: { // If everything but the lowest bit is zero, that bit is the pop-count. 
Ex: @@ -5581,30 +5684,34 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, case Intrinsic::exp: // exp(log(x)) -> x if (Q.CxtI->hasAllowReassoc() && - match(Op0, m_Intrinsic<Intrinsic::log>(m_Value(X)))) return X; + match(Op0, m_Intrinsic<Intrinsic::log>(m_Value(X)))) + return X; break; case Intrinsic::exp2: // exp2(log2(x)) -> x if (Q.CxtI->hasAllowReassoc() && - match(Op0, m_Intrinsic<Intrinsic::log2>(m_Value(X)))) return X; + match(Op0, m_Intrinsic<Intrinsic::log2>(m_Value(X)))) + return X; break; case Intrinsic::log: // log(exp(x)) -> x if (Q.CxtI->hasAllowReassoc() && - match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X)))) return X; + match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X)))) + return X; break; case Intrinsic::log2: // log2(exp2(x)) -> x if (Q.CxtI->hasAllowReassoc() && (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) || - match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(2.0), - m_Value(X))))) return X; + match(Op0, + m_Intrinsic<Intrinsic::pow>(m_SpecificFP(2.0), m_Value(X))))) + return X; break; case Intrinsic::log10: // log10(pow(10.0, x)) -> x if (Q.CxtI->hasAllowReassoc() && - match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), - m_Value(X)))) return X; + match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), m_Value(X)))) + return X; break; case Intrinsic::floor: case Intrinsic::trunc: @@ -5826,7 +5933,7 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, case Intrinsic::load_relative: if (auto *C0 = dyn_cast<Constant>(Op0)) if (auto *C1 = dyn_cast<Constant>(Op1)) - return SimplifyRelativeLoad(C0, C1, Q.DL); + return simplifyRelativeLoad(C0, C1, Q.DL); break; case Intrinsic::powi: if (auto *Power = dyn_cast<ConstantInt>(Op1)) { @@ -5853,7 +5960,8 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, case Intrinsic::maximum: case Intrinsic::minimum: { // If the arguments are the same, this is a no-op. - if (Op0 == Op1) return Op0; + if (Op0 == Op1) + return Op0; // Canonicalize constant operand as Op1.
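// [Editor's note: illustrative sketch, not part of the patch.] The exp/log
// cancellations earlier in this hunk (exp(log(x)) -> x and friends) fire only
// under the 'reassoc' fast-math flag, because the round trip is not exact in
// strict IEEE arithmetic. Standalone illustration:
#include <cmath>
#include <cstdio>

int main() {
  double x = 10.0;
  double roundTrip = std::exp(std::log(x));
  // roundTrip is extremely close to x but need not be bit-identical, which
  // is why strict FP code may not rewrite exp(log(x)) to x.
  std::printf("x           = %.17g\nexp(log(x)) = %.17g\n", x, roundTrip);
  return 0;
}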
if (isa(Op0)) @@ -5906,14 +6014,14 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, break; } - case Intrinsic::experimental_vector_extract: { + case Intrinsic::vector_extract: { Type *ReturnType = F->getReturnType(); // (extract_vector (insert_vector _, X, 0), 0) -> X unsigned IdxN = cast(Op1)->getZExtValue(); Value *X = nullptr; - if (match(Op0, m_Intrinsic( - m_Value(), m_Value(X), m_Zero())) && + if (match(Op0, m_Intrinsic(m_Value(), m_Value(X), + m_Zero())) && IdxN == 0 && X->getType() == ReturnType) return X; @@ -6054,7 +6162,7 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { return nullptr; } - case Intrinsic::experimental_vector_insert: { + case Intrinsic::vector_insert: { Value *Vec = Call->getArgOperand(0); Value *SubVec = Call->getArgOperand(1); Value *Idx = Call->getArgOperand(2); @@ -6064,8 +6172,8 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { // where: Y is X, or Y is undef unsigned IdxN = cast(Idx)->getZExtValue(); Value *X = nullptr; - if (match(SubVec, m_Intrinsic( - m_Value(X), m_Zero())) && + if (match(SubVec, + m_Intrinsic(m_Value(X), m_Zero())) && (Q.isUndefValue(Vec) || Vec == X) && IdxN == 0 && X->getType() == ReturnType) return X; @@ -6074,43 +6182,38 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } case Intrinsic::experimental_constrained_fadd: { auto *FPI = cast(Call); - return SimplifyFAddInst(FPI->getArgOperand(0), FPI->getArgOperand(1), + return simplifyFAddInst(FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), Q, FPI->getExceptionBehavior().getValue(), FPI->getRoundingMode().getValue()); - break; } case Intrinsic::experimental_constrained_fsub: { auto *FPI = cast(Call); - return SimplifyFSubInst(FPI->getArgOperand(0), FPI->getArgOperand(1), + return simplifyFSubInst(FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), Q, FPI->getExceptionBehavior().getValue(), FPI->getRoundingMode().getValue()); - break; } case Intrinsic::experimental_constrained_fmul: { auto *FPI = cast(Call); - return SimplifyFMulInst(FPI->getArgOperand(0), FPI->getArgOperand(1), + return simplifyFMulInst(FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), Q, FPI->getExceptionBehavior().getValue(), FPI->getRoundingMode().getValue()); - break; } case Intrinsic::experimental_constrained_fdiv: { auto *FPI = cast(Call); - return SimplifyFDivInst(FPI->getArgOperand(0), FPI->getArgOperand(1), + return simplifyFDivInst(FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), Q, FPI->getExceptionBehavior().getValue(), FPI->getRoundingMode().getValue()); - break; } case Intrinsic::experimental_constrained_frem: { auto *FPI = cast(Call); - return SimplifyFRemInst(FPI->getArgOperand(0), FPI->getArgOperand(1), + return simplifyFRemInst(FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), Q, FPI->getExceptionBehavior().getValue(), FPI->getRoundingMode().getValue()); - break; } default: return nullptr; @@ -6138,7 +6241,7 @@ static Value *tryConstantFoldCall(CallBase *Call, const SimplifyQuery &Q) { return ConstantFoldCall(Call, F, ConstantArgs, Q.TLI); } -Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { +Value *llvm::simplifyCall(CallBase *Call, const SimplifyQuery &Q) { // musttail calls can only be simplified if they are also DCEd. // As we can't guarantee this here, don't simplify them. 
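// [Editor's note: illustrative sketch, not part of the patch.] The renamed
// vector_extract/vector_insert folds above: extracting the length-N prefix
// immediately after inserting subvector X at index 0 yields X back,
// regardless of what the destination vector previously held. A scalar model
// of that prefix semantics using arrays:
#include <array>
#include <cassert>
#include <cstddef>
#include <cstring>

template <std::size_t N, std::size_t M>
std::array<int, N> extractPrefix(const std::array<int, M> &V) {
  static_assert(N <= M, "subvector must fit");
  std::array<int, N> R{};
  std::memcpy(R.data(), V.data(), N * sizeof(int));
  return R;
}

int main() {
  std::array<int, 8> Vec{9, 9, 9, 9, 9, 9, 9, 9};
  std::array<int, 4> X{1, 2, 3, 4};
  std::memcpy(Vec.data(), X.data(), sizeof(X)); // insert X at index 0
  assert((extractPrefix<4>(Vec) == X));         // extract at index 0 gives X
  return 0;
}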
if (Call->isMustTailCall()) @@ -6161,8 +6264,17 @@ Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { return nullptr; } +Value *llvm::simplifyConstrainedFPCall(CallBase *Call, const SimplifyQuery &Q) { + assert(isa(Call)); + if (Value *V = tryConstantFoldCall(Call, Q)) + return V; + if (Value *Ret = simplifyIntrinsic(Call, Q)) + return Ret; + return nullptr; +} + /// Given operands for a Freeze, see if we can fold the result. -static Value *SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { +static Value *simplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { // Use a utility function defined in ValueTracking. if (llvm::isGuaranteedNotToBeUndefOrPoison(Op0, Q.AC, Q.CxtI, Q.DT)) return Op0; @@ -6170,11 +6282,11 @@ static Value *SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { return nullptr; } -Value *llvm::SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { - return ::SimplifyFreezeInst(Op0, Q); +Value *llvm::simplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { + return ::simplifyFreezeInst(Op0, Q); } -static Value *SimplifyLoadInst(LoadInst *LI, Value *PtrOp, +static Value *simplifyLoadInst(LoadInst *LI, Value *PtrOp, const SimplifyQuery &Q) { if (LI->isVolatile()) return nullptr; @@ -6218,134 +6330,134 @@ static Value *simplifyInstructionWithOperands(Instruction *I, } break; case Instruction::FNeg: - Result = SimplifyFNegInst(NewOps[0], I->getFastMathFlags(), Q); + Result = simplifyFNegInst(NewOps[0], I->getFastMathFlags(), Q); break; case Instruction::FAdd: - Result = SimplifyFAddInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); + Result = simplifyFAddInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::Add: - Result = SimplifyAddInst( + Result = simplifyAddInst( NewOps[0], NewOps[1], Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::FSub: - Result = SimplifyFSubInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); + Result = simplifyFSubInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::Sub: - Result = SimplifySubInst( + Result = simplifySubInst( NewOps[0], NewOps[1], Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::FMul: - Result = SimplifyFMulInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); + Result = simplifyFMulInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::Mul: - Result = SimplifyMulInst(NewOps[0], NewOps[1], Q); + Result = simplifyMulInst(NewOps[0], NewOps[1], Q); break; case Instruction::SDiv: - Result = SimplifySDivInst(NewOps[0], NewOps[1], Q); + Result = simplifySDivInst(NewOps[0], NewOps[1], Q); break; case Instruction::UDiv: - Result = SimplifyUDivInst(NewOps[0], NewOps[1], Q); + Result = simplifyUDivInst(NewOps[0], NewOps[1], Q); break; case Instruction::FDiv: - Result = SimplifyFDivInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); + Result = simplifyFDivInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::SRem: - Result = SimplifySRemInst(NewOps[0], NewOps[1], Q); + Result = simplifySRemInst(NewOps[0], NewOps[1], Q); break; case Instruction::URem: - Result = SimplifyURemInst(NewOps[0], NewOps[1], Q); + Result = simplifyURemInst(NewOps[0], NewOps[1], Q); break; case Instruction::FRem: - Result = SimplifyFRemInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); + Result = simplifyFRemInst(NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::Shl: - Result = SimplifyShlInst( + Result = simplifyShlInst( 
NewOps[0], NewOps[1], Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::LShr: - Result = SimplifyLShrInst(NewOps[0], NewOps[1], + Result = simplifyLShrInst(NewOps[0], NewOps[1], Q.IIQ.isExact(cast(I)), Q); break; case Instruction::AShr: - Result = SimplifyAShrInst(NewOps[0], NewOps[1], + Result = simplifyAShrInst(NewOps[0], NewOps[1], Q.IIQ.isExact(cast(I)), Q); break; case Instruction::And: - Result = SimplifyAndInst(NewOps[0], NewOps[1], Q); + Result = simplifyAndInst(NewOps[0], NewOps[1], Q); break; case Instruction::Or: - Result = SimplifyOrInst(NewOps[0], NewOps[1], Q); + Result = simplifyOrInst(NewOps[0], NewOps[1], Q); break; case Instruction::Xor: - Result = SimplifyXorInst(NewOps[0], NewOps[1], Q); + Result = simplifyXorInst(NewOps[0], NewOps[1], Q); break; case Instruction::ICmp: - Result = SimplifyICmpInst(cast(I)->getPredicate(), NewOps[0], + Result = simplifyICmpInst(cast(I)->getPredicate(), NewOps[0], NewOps[1], Q); break; case Instruction::FCmp: - Result = SimplifyFCmpInst(cast(I)->getPredicate(), NewOps[0], + Result = simplifyFCmpInst(cast(I)->getPredicate(), NewOps[0], NewOps[1], I->getFastMathFlags(), Q); break; case Instruction::Select: - Result = SimplifySelectInst(NewOps[0], NewOps[1], NewOps[2], Q); + Result = simplifySelectInst(NewOps[0], NewOps[1], NewOps[2], Q); break; case Instruction::GetElementPtr: { auto *GEPI = cast(I); Result = - SimplifyGEPInst(GEPI->getSourceElementType(), NewOps[0], + simplifyGEPInst(GEPI->getSourceElementType(), NewOps[0], makeArrayRef(NewOps).slice(1), GEPI->isInBounds(), Q); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast(I); - Result = SimplifyInsertValueInst(NewOps[0], NewOps[1], IV->getIndices(), Q); + Result = simplifyInsertValueInst(NewOps[0], NewOps[1], IV->getIndices(), Q); break; } case Instruction::InsertElement: { - Result = SimplifyInsertElementInst(NewOps[0], NewOps[1], NewOps[2], Q); + Result = simplifyInsertElementInst(NewOps[0], NewOps[1], NewOps[2], Q); break; } case Instruction::ExtractValue: { auto *EVI = cast(I); - Result = SimplifyExtractValueInst(NewOps[0], EVI->getIndices(), Q); + Result = simplifyExtractValueInst(NewOps[0], EVI->getIndices(), Q); break; } case Instruction::ExtractElement: { - Result = SimplifyExtractElementInst(NewOps[0], NewOps[1], Q); + Result = simplifyExtractElementInst(NewOps[0], NewOps[1], Q); break; } case Instruction::ShuffleVector: { auto *SVI = cast(I); - Result = SimplifyShuffleVectorInst( + Result = simplifyShuffleVectorInst( NewOps[0], NewOps[1], SVI->getShuffleMask(), SVI->getType(), Q); break; } case Instruction::PHI: - Result = SimplifyPHINode(cast(I), NewOps, Q); + Result = simplifyPHINode(cast(I), NewOps, Q); break; case Instruction::Call: { // TODO: Use NewOps - Result = SimplifyCall(cast(I), Q); + Result = simplifyCall(cast(I), Q); break; } case Instruction::Freeze: - Result = llvm::SimplifyFreezeInst(NewOps[0], Q); + Result = llvm::simplifyFreezeInst(NewOps[0], Q); break; #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: #include "llvm/IR/Instruction.def" #undef HANDLE_CAST_INST - Result = SimplifyCastInst(I->getOpcode(), NewOps[0], I->getType(), Q); + Result = simplifyCastInst(I->getOpcode(), NewOps[0], I->getType(), Q); break; case Instruction::Alloca: // No simplifications for Alloca and it can't be constant folded. 
Result = nullptr; break; case Instruction::Load: - Result = SimplifyLoadInst(cast(I), NewOps[0], Q); + Result = simplifyLoadInst(cast(I), NewOps[0], Q); break; } @@ -6355,7 +6467,7 @@ static Value *simplifyInstructionWithOperands(Instruction *I, return Result == I ? UndefValue::get(I->getType()) : Result; } -Value *llvm::SimplifyInstructionWithOperands(Instruction *I, +Value *llvm::simplifyInstructionWithOperands(Instruction *I, ArrayRef NewOps, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { @@ -6364,7 +6476,7 @@ Value *llvm::SimplifyInstructionWithOperands(Instruction *I, return ::simplifyInstructionWithOperands(I, NewOps, SQ, ORE); } -Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, +Value *llvm::simplifyInstruction(Instruction *I, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { SmallVector Ops(I->operands()); return ::simplifyInstructionWithOperands(I, Ops, SQ, ORE); @@ -6415,7 +6527,7 @@ static bool replaceAndRecursivelySimplifyImpl( I = Worklist[Idx]; // See if this instruction simplifies. - SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC}); + SimpleV = simplifyInstruction(I, {DL, TLI, DT, AC}); if (!SimpleV) { if (UnsimplifiedUsers) UnsimplifiedUsers->insert(I); @@ -6478,6 +6590,6 @@ const SimplifyQuery getBestSimplifyQuery(AnalysisManager &AM, } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, Function &); -} +} // namespace llvm void InstSimplifyFolder::anchor() {} diff --git a/llvm/lib/Analysis/Interval.cpp b/llvm/lib/Analysis/Interval.cpp index e228ec4f2126..f7fffcb3d5e6 100644 --- a/llvm/lib/Analysis/Interval.cpp +++ b/llvm/lib/Analysis/Interval.cpp @@ -13,7 +13,6 @@ #include "llvm/Analysis/Interval.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp index e8e9593d7030..20a905e04a9d 100644 --- a/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/llvm/lib/Analysis/LazyCallGraph.cpp @@ -9,14 +9,13 @@ #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/VectorUtils.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstIterator.h" @@ -30,12 +29,15 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include #include #include +#ifdef EXPENSIVE_CHECKS +#include "llvm/ADT/ScopeExit.h" +#endif + using namespace llvm; #define DEBUG_TYPE "lcg" diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index e311b40ab25c..8a8e9e923b7c 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -38,7 +38,6 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; using namespace PatternMatch; @@ -919,7 +918,7 @@ Optional LazyValueInfoImpl::solveBlockValueCast( // transfer rule on the full set since we may be able to locally infer // interesting facts. Optional LHSRes = getRangeFor(CI->getOperand(0), CI, BB); - if (!LHSRes.hasValue()) + if (!LHSRes) // More work to do before applying this transfer rule. 
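// [Editor's note: illustrative sketch, not part of the patch.] The Optional
// cleanups in these LazyValueInfo hunks (the hasValue() -> contextual-bool
// change just above, and the value_or() switch a little further down) track
// the std::optional interface. The same idioms, standalone:
#include <cassert>
#include <optional>

int main() {
  std::optional<int> Res;           // no value yet: "more work to do"
  assert(!Res == !Res.has_value()); // contextual bool mirrors has_value()
  assert(Res.value_or(42) == 42);   // default when empty, like value_or above
  Res = 7;
  assert(Res && *Res == 7);
  return 0;
}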
return None; const ConstantRange &LHSRange = LHSRes.getValue(); @@ -943,7 +942,7 @@ Optional LazyValueInfoImpl::solveBlockValueBinaryOpImpl( // @foo()), 32" Optional LHSRes = getRangeFor(I->getOperand(0), I, BB); Optional RHSRes = getRangeFor(I->getOperand(1), I, BB); - if (!LHSRes.hasValue() || !RHSRes.hasValue()) + if (!LHSRes || !RHSRes) // More work to do before applying this transfer rule. return None; @@ -956,13 +955,6 @@ Optional LazyValueInfoImpl::solveBlockValueBinaryOp( BinaryOperator *BO, BasicBlock *BB) { assert(BO->getOperand(0)->getType()->isSized() && "all operands to binary operators are sized"); - if (BO->getOpcode() == Instruction::Xor) { - // Xor is the only operation not supported by ConstantRange::binaryOp(). - LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() - << "' - overdefined (unknown binary operator).\n"); - return ValueLatticeElement::getOverdefined(); - } - if (auto *OBO = dyn_cast(BO)) { unsigned NoWrapKind = 0; if (OBO->hasNoUnsignedWrap()) @@ -1020,7 +1012,7 @@ Optional LazyValueInfoImpl::solveBlockValueExtractValue( // Handle extractvalue of insertvalue to allow further simplification // based on replaced with.overflow intrinsics. - if (Value *V = SimplifyExtractValueInst( + if (Value *V = simplifyExtractValueInst( EVI->getAggregateOperand(), EVI->getIndices(), EVI->getModule()->getDataLayout())) return getBlockValue(V, BB, EVI); @@ -1141,7 +1133,7 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C); if (!CR.isEmptySet()) return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( - CR.getUnsignedMin().zextOrSelf(BitWidth), APInt(BitWidth, 0))); + CR.getUnsignedMin().zext(BitWidth), APInt(BitWidth, 0))); } return ValueLatticeElement::getOverdefined(); @@ -1278,7 +1270,7 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op, if (auto *CI = dyn_cast(Usr)) { assert(CI->getOperand(0) == Op && "Operand 0 isn't Op"); if (auto *C = dyn_cast_or_null( - SimplifyCastInst(CI->getOpcode(), OpConst, + simplifyCastInst(CI->getOpcode(), OpConst, CI->getDestTy(), DL))) { return ValueLatticeElement::getRange(ConstantRange(C->getValue())); } @@ -1290,7 +1282,7 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op, Value *LHS = Op0Match ? OpConst : BO->getOperand(0); Value *RHS = Op1Match ? 
OpConst : BO->getOperand(1); if (auto *C = dyn_cast_or_null( - SimplifyBinOp(BO->getOpcode(), LHS, RHS, DL))) { + simplifyBinOp(BO->getOpcode(), LHS, RHS, DL))) { return ValueLatticeElement::getRange(ConstantRange(C->getValue())); } } else if (isa(Usr)) { @@ -1361,7 +1353,7 @@ static Optional getEdgeValueLocal(Value *Val, ValueLatticeElement OpLatticeVal = getValueFromCondition(Op, Condition, isTrueDest); if (Optional OpConst = OpLatticeVal.asConstantInteger()) { - Result = constantFoldUser(Usr, Op, OpConst.getValue(), DL); + Result = constantFoldUser(Usr, Op, *OpConst, DL); break; } } @@ -1432,8 +1424,9 @@ Optional LazyValueInfoImpl::getEdgeValue( if (Constant *VC = dyn_cast(Val)) return ValueLatticeElement::get(VC); - ValueLatticeElement LocalResult = getEdgeValueLocal(Val, BBFrom, BBTo) - .getValueOr(ValueLatticeElement::getOverdefined()); + ValueLatticeElement LocalResult = + getEdgeValueLocal(Val, BBFrom, BBTo) + .value_or(ValueLatticeElement::getOverdefined()); if (hasSingleValue(LocalResult)) // Can't get any more precise here return LocalResult; @@ -1886,6 +1879,11 @@ void LazyValueInfo::eraseBlock(BasicBlock *BB) { } } +void LazyValueInfo::clear(const Module *M) { + if (PImpl) { + getImpl(PImpl, AC, M).clear(); + } +} void LazyValueInfo::printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) { if (PImpl) { diff --git a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp index 031bf3bae51d..491d44335f22 100644 --- a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp +++ b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -68,6 +68,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index f9a7a5bdf434..9cfb91a22b7d 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -44,7 +44,6 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" @@ -69,9 +68,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -169,8 +166,8 @@ public: }; } // end anonymous namespace -// Assert - We know that cond should be true, if not print an error message. -#define Assert(C, ...) \ +// Check - We know that cond should be true, if not print an error message. +#define Check(C, ...) \ do { \ if (!(C)) { \ CheckFailed(__VA_ARGS__); \ @@ -181,8 +178,8 @@ public: void Lint::visitFunction(Function &F) { // This isn't undefined behavior, it's just a little unusual, and it's a // fairly common mistake to neglect to name a function. - Assert(F.hasName() || F.hasLocalLinkage(), - "Unusual: Unnamed function with non-local linkage", &F); + Check(F.hasName() || F.hasLocalLinkage(), + "Unusual: Unnamed function with non-local linkage", &F); // TODO: Check for irreducible control flow. 
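// [Editor's note: illustrative sketch, not part of the patch.] The Assert ->
// Check rename above keeps Lint's do { ... } while (false) macro shape:
// evaluate the condition once, report on failure, and remain usable as a
// single statement (safe inside an unbraced if/else). Simplified standalone
// version with a stub CheckFailed; the real macro body is elided above and
// also returns early from the visitor:
#include <cstdio>

static void CheckFailed(const char *Msg) {
  std::fprintf(stderr, "lint: %s\n", Msg);
}

#define Check(C, Msg)                                                          \
  do {                                                                         \
    if (!(C))                                                                  \
      CheckFailed(Msg);                                                        \
  } while (false)

int main() {
  int CallerCC = 0, CalleeCC = 1; // hypothetical calling-convention ids
  Check(CallerCC == CalleeCC,
        "Undefined behavior: Caller and callee calling convention differ");
  return 0;
}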
} @@ -195,23 +192,23 @@ void Lint::visitCallBase(CallBase &I) { if (Function *F = dyn_cast(findValue(Callee, /*OffsetOk=*/false))) { - Assert(I.getCallingConv() == F->getCallingConv(), - "Undefined behavior: Caller and callee calling convention differ", - &I); + Check(I.getCallingConv() == F->getCallingConv(), + "Undefined behavior: Caller and callee calling convention differ", + &I); FunctionType *FT = F->getFunctionType(); unsigned NumActualArgs = I.arg_size(); - Assert(FT->isVarArg() ? FT->getNumParams() <= NumActualArgs - : FT->getNumParams() == NumActualArgs, - "Undefined behavior: Call argument count mismatches callee " - "argument count", - &I); + Check(FT->isVarArg() ? FT->getNumParams() <= NumActualArgs + : FT->getNumParams() == NumActualArgs, + "Undefined behavior: Call argument count mismatches callee " + "argument count", + &I); - Assert(FT->getReturnType() == I.getType(), - "Undefined behavior: Call return type mismatches " - "callee return type", - &I); + Check(FT->getReturnType() == I.getType(), + "Undefined behavior: Call return type mismatches " + "callee return type", + &I); // Check argument types (in case the callee was casted) and attributes. // TODO: Verify that caller and callee attributes are compatible. @@ -221,10 +218,10 @@ void Lint::visitCallBase(CallBase &I) { Value *Actual = *AI; if (PI != PE) { Argument *Formal = &*PI++; - Assert(Formal->getType() == Actual->getType(), - "Undefined behavior: Call argument type mismatches " - "callee parameter type", - &I); + Check(Formal->getType() == Actual->getType(), + "Undefined behavior: Call argument type mismatches " + "callee parameter type", + &I); // Check that noalias arguments don't alias other arguments. This is // not fully precise because we don't know the sizes of the dereferenced @@ -242,9 +239,9 @@ void Lint::visitCallBase(CallBase &I) { continue; if (AI != BI && (*BI)->getType()->isPointerTy()) { AliasResult Result = AA->alias(*AI, *BI); - Assert(Result != AliasResult::MustAlias && - Result != AliasResult::PartialAlias, - "Unusual: noalias argument aliases another argument", &I); + Check(Result != AliasResult::MustAlias && + Result != AliasResult::PartialAlias, + "Unusual: noalias argument aliases another argument", &I); } } } @@ -271,10 +268,10 @@ void Lint::visitCallBase(CallBase &I) { if (PAL.hasParamAttr(ArgNo++, Attribute::ByVal)) continue; Value *Obj = findValue(Arg, /*OffsetOk=*/true); - Assert(!isa(Obj), - "Undefined behavior: Call with \"tail\" keyword references " - "alloca", - &I); + Check(!isa(Obj), + "Undefined behavior: Call with \"tail\" keyword references " + "alloca", + &I); } } } @@ -302,9 +299,9 @@ void Lint::visitCallBase(CallBase &I) { /*OffsetOk=*/false))) if (Len->getValue().isIntN(32)) Size = LocationSize::precise(Len->getValue().getZExtValue()); - Assert(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) != - AliasResult::MustAlias, - "Undefined behavior: memcpy source and destination overlap", &I); + Check(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) != + AliasResult::MustAlias, + "Undefined behavior: memcpy source and destination overlap", &I); break; } case Intrinsic::memcpy_inline: { @@ -319,9 +316,9 @@ void Lint::visitCallBase(CallBase &I) { // isn't expressive enough for what we really want to do. Known partial // overlap is not distinguished from the case where nothing is known. 
const LocationSize LS = LocationSize::precise(Size); - Assert(AA->alias(MCII->getSource(), LS, MCII->getDest(), LS) != - AliasResult::MustAlias, - "Undefined behavior: memcpy source and destination overlap", &I); + Check(AA->alias(MCII->getSource(), LS, MCII->getDest(), LS) != + AliasResult::MustAlias, + "Undefined behavior: memcpy source and destination overlap", &I); break; } case Intrinsic::memmove: { @@ -338,11 +335,17 @@ void Lint::visitCallBase(CallBase &I) { MSI->getDestAlign(), nullptr, MemRef::Write); break; } + case Intrinsic::memset_inline: { + MemSetInlineInst *MSII = cast(&I); + visitMemoryReference(I, MemoryLocation::getForDest(MSII), + MSII->getDestAlign(), nullptr, MemRef::Write); + break; + } case Intrinsic::vastart: - Assert(I.getParent()->getParent()->isVarArg(), - "Undefined behavior: va_start called in a non-varargs function", - &I); + Check(I.getParent()->getParent()->isVarArg(), + "Undefined behavior: va_start called in a non-varargs function", + &I); visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), None, nullptr, MemRef::Read | MemRef::Write); @@ -367,20 +370,22 @@ void Lint::visitCallBase(CallBase &I) { break; case Intrinsic::get_active_lane_mask: if (auto *TripCount = dyn_cast(I.getArgOperand(1))) - Assert(!TripCount->isZero(), "get_active_lane_mask: operand #2 " - "must be greater than 0", &I); + Check(!TripCount->isZero(), + "get_active_lane_mask: operand #2 " + "must be greater than 0", + &I); break; } } void Lint::visitReturnInst(ReturnInst &I) { Function *F = I.getParent()->getParent(); - Assert(!F->doesNotReturn(), - "Unusual: Return statement in function with noreturn attribute", &I); + Check(!F->doesNotReturn(), + "Unusual: Return statement in function with noreturn attribute", &I); if (Value *V = I.getReturnValue()) { Value *Obj = findValue(V, /*OffsetOk=*/true); - Assert(!isa(Obj), "Unusual: Returning alloca value", &I); + Check(!isa(Obj), "Unusual: Returning alloca value", &I); } } @@ -395,39 +400,39 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, Value *Ptr = const_cast(Loc.Ptr); Value *UnderlyingObject = findValue(Ptr, /*OffsetOk=*/true); - Assert(!isa(UnderlyingObject), - "Undefined behavior: Null pointer dereference", &I); - Assert(!isa(UnderlyingObject), - "Undefined behavior: Undef pointer dereference", &I); - Assert(!isa(UnderlyingObject) || - !cast(UnderlyingObject)->isMinusOne(), - "Unusual: All-ones pointer dereference", &I); - Assert(!isa(UnderlyingObject) || - !cast(UnderlyingObject)->isOne(), - "Unusual: Address one pointer dereference", &I); + Check(!isa(UnderlyingObject), + "Undefined behavior: Null pointer dereference", &I); + Check(!isa(UnderlyingObject), + "Undefined behavior: Undef pointer dereference", &I); + Check(!isa(UnderlyingObject) || + !cast(UnderlyingObject)->isMinusOne(), + "Unusual: All-ones pointer dereference", &I); + Check(!isa(UnderlyingObject) || + !cast(UnderlyingObject)->isOne(), + "Unusual: Address one pointer dereference", &I); if (Flags & MemRef::Write) { if (const GlobalVariable *GV = dyn_cast(UnderlyingObject)) - Assert(!GV->isConstant(), "Undefined behavior: Write to read-only memory", - &I); - Assert(!isa(UnderlyingObject) && - !isa(UnderlyingObject), - "Undefined behavior: Write to text section", &I); + Check(!GV->isConstant(), "Undefined behavior: Write to read-only memory", + &I); + Check(!isa(UnderlyingObject) && + !isa(UnderlyingObject), + "Undefined behavior: Write to text section", &I); } if (Flags & MemRef::Read) { - Assert(!isa(UnderlyingObject), 
"Unusual: Load from function body", - &I); - Assert(!isa(UnderlyingObject), - "Undefined behavior: Load from block address", &I); + Check(!isa(UnderlyingObject), "Unusual: Load from function body", + &I); + Check(!isa(UnderlyingObject), + "Undefined behavior: Load from block address", &I); } if (Flags & MemRef::Callee) { - Assert(!isa(UnderlyingObject), - "Undefined behavior: Call to block address", &I); + Check(!isa(UnderlyingObject), + "Undefined behavior: Call to block address", &I); } if (Flags & MemRef::Branchee) { - Assert(!isa(UnderlyingObject) || - isa(UnderlyingObject), - "Undefined behavior: Branch to non-blockaddress", &I); + Check(!isa(UnderlyingObject) || + isa(UnderlyingObject), + "Undefined behavior: Branch to non-blockaddress", &I); } // Check for buffer overflows and misalignment. @@ -461,17 +466,17 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, // Accesses from before the start or after the end of the object are not // defined. - Assert(!Loc.Size.hasValue() || BaseSize == MemoryLocation::UnknownSize || - (Offset >= 0 && Offset + Loc.Size.getValue() <= BaseSize), - "Undefined behavior: Buffer overflow", &I); + Check(!Loc.Size.hasValue() || BaseSize == MemoryLocation::UnknownSize || + (Offset >= 0 && Offset + Loc.Size.getValue() <= BaseSize), + "Undefined behavior: Buffer overflow", &I); // Accesses that say that the memory is more aligned than it is are not // defined. if (!Align && Ty && Ty->isSized()) Align = DL->getABITypeAlign(Ty); if (BaseAlign && Align) - Assert(*Align <= commonAlignment(*BaseAlign, Offset), - "Undefined behavior: Memory reference address is misaligned", &I); + Check(*Align <= commonAlignment(*BaseAlign, Offset), + "Undefined behavior: Memory reference address is misaligned", &I); } } @@ -486,34 +491,34 @@ void Lint::visitStoreInst(StoreInst &I) { } void Lint::visitXor(BinaryOperator &I) { - Assert(!isa(I.getOperand(0)) || !isa(I.getOperand(1)), - "Undefined result: xor(undef, undef)", &I); + Check(!isa(I.getOperand(0)) || !isa(I.getOperand(1)), + "Undefined result: xor(undef, undef)", &I); } void Lint::visitSub(BinaryOperator &I) { - Assert(!isa(I.getOperand(0)) || !isa(I.getOperand(1)), - "Undefined result: sub(undef, undef)", &I); + Check(!isa(I.getOperand(0)) || !isa(I.getOperand(1)), + "Undefined result: sub(undef, undef)", &I); } void Lint::visitLShr(BinaryOperator &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(1), /*OffsetOk=*/false))) - Assert(CI->getValue().ult(cast(I.getType())->getBitWidth()), - "Undefined result: Shift count out of range", &I); + Check(CI->getValue().ult(cast(I.getType())->getBitWidth()), + "Undefined result: Shift count out of range", &I); } void Lint::visitAShr(BinaryOperator &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(1), /*OffsetOk=*/false))) - Assert(CI->getValue().ult(cast(I.getType())->getBitWidth()), - "Undefined result: Shift count out of range", &I); + Check(CI->getValue().ult(cast(I.getType())->getBitWidth()), + "Undefined result: Shift count out of range", &I); } void Lint::visitShl(BinaryOperator &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(1), /*OffsetOk=*/false))) - Assert(CI->getValue().ult(cast(I.getType())->getBitWidth()), - "Undefined result: Shift count out of range", &I); + Check(CI->getValue().ult(cast(I.getType())->getBitWidth()), + "Undefined result: Shift count out of range", &I); } static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, @@ -554,30 +559,30 @@ static bool isZero(Value *V, const DataLayout &DL, 
DominatorTree *DT, } void Lint::visitSDiv(BinaryOperator &I) { - Assert(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), - "Undefined behavior: Division by zero", &I); + Check(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), + "Undefined behavior: Division by zero", &I); } void Lint::visitUDiv(BinaryOperator &I) { - Assert(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), - "Undefined behavior: Division by zero", &I); + Check(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), + "Undefined behavior: Division by zero", &I); } void Lint::visitSRem(BinaryOperator &I) { - Assert(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), - "Undefined behavior: Division by zero", &I); + Check(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), + "Undefined behavior: Division by zero", &I); } void Lint::visitURem(BinaryOperator &I) { - Assert(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), - "Undefined behavior: Division by zero", &I); + Check(!isZero(I.getOperand(1), I.getModule()->getDataLayout(), DT, AC), + "Undefined behavior: Division by zero", &I); } void Lint::visitAllocaInst(AllocaInst &I) { if (isa(I.getArraySize())) // This isn't undefined behavior, it's just an obvious pessimization. - Assert(&I.getParent()->getParent()->getEntryBlock() == I.getParent(), - "Pessimization: Static alloca outside of entry block", &I); + Check(&I.getParent()->getParent()->getEntryBlock() == I.getParent(), + "Pessimization: Static alloca outside of entry block", &I); // TODO: Check for an unusual size (MSB set?) } @@ -591,14 +596,14 @@ void Lint::visitIndirectBrInst(IndirectBrInst &I) { visitMemoryReference(I, MemoryLocation::getAfter(I.getAddress()), None, nullptr, MemRef::Branchee); - Assert(I.getNumDestinations() != 0, - "Undefined behavior: indirectbr with no destinations", &I); + Check(I.getNumDestinations() != 0, + "Undefined behavior: indirectbr with no destinations", &I); } void Lint::visitExtractElementInst(ExtractElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getIndexOperand(), /*OffsetOk=*/false))) - Assert( + Check( CI->getValue().ult( cast(I.getVectorOperandType())->getNumElements()), "Undefined result: extractelement index out of range", &I); @@ -607,18 +612,18 @@ void Lint::visitExtractElementInst(ExtractElementInst &I) { void Lint::visitInsertElementInst(InsertElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(2), /*OffsetOk=*/false))) - Assert(CI->getValue().ult( - cast(I.getType())->getNumElements()), - "Undefined result: insertelement index out of range", &I); + Check(CI->getValue().ult( + cast(I.getType())->getNumElements()), + "Undefined result: insertelement index out of range", &I); } void Lint::visitUnreachableInst(UnreachableInst &I) { // This isn't undefined behavior, it's merely suspicious. 
- Assert(&I == &I.getParent()->front() || - std::prev(I.getIterator())->mayHaveSideEffects(), - "Unusual: unreachable immediately preceded by instruction without " - "side effects", - &I); + Check(&I == &I.getParent()->front() || + std::prev(I.getIterator())->mayHaveSideEffects(), + "Unusual: unreachable immediately preceded by instruction without " + "side effects", + &I); } /// findValue - Look through bitcasts and simple memory reference patterns @@ -681,17 +686,12 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, CE->getOperand(0)->getType(), CE->getType(), *DL)) return findValueImpl(CE->getOperand(0), OffsetOk, Visited); - } else if (CE->getOpcode() == Instruction::ExtractValue) { - ArrayRef Indices = CE->getIndices(); - if (Value *W = FindInsertedValue(CE->getOperand(0), Indices)) - if (W != V) - return findValueImpl(W, OffsetOk, Visited); } } // As a last resort, try SimplifyInstruction or constant folding. if (Instruction *Inst = dyn_cast(V)) { - if (Value *W = SimplifyInstruction(Inst, {*DL, TLI, DT, AC})) + if (Value *W = simplifyInstruction(Inst, {*DL, TLI, DT, AC})) return findValueImpl(W, OffsetOk, Visited); } else if (auto *C = dyn_cast(V)) { Value *W = ConstantFoldConstant(C, *DL, TLI); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index cd0d4d6b9ca8..bc1d82cf1480 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -13,19 +13,14 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -509,8 +504,8 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, if (CastInst::isBitOrNoopPointerCastable(Val->getType(), AccessTy, DL)) return Val; - TypeSize StoreSize = DL.getTypeStoreSize(Val->getType()); - TypeSize LoadSize = DL.getTypeStoreSize(AccessTy); + TypeSize StoreSize = DL.getTypeSizeInBits(Val->getType()); + TypeSize LoadSize = DL.getTypeSizeInBits(AccessTy); if (TypeSize::isKnownLE(LoadSize, StoreSize)) if (auto *C = dyn_cast(Val)) return ConstantFoldLoadFromConst(C, AccessTy, DL); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 2ab78d2b7ee2..79161db9b5e4 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -60,12 +61,12 @@ #include #include #include -#include #include #include #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-accesses" @@ -172,7 +173,8 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), AddressSpace(RtCheck.Pointers[Index] .PointerValue->getType() - ->getPointerAddressSpace()) { + 
->getPointerAddressSpace()), + NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { Members.push_back(Index); } @@ -189,21 +191,20 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( /// /// There is no conflict when the intervals are disjoint: /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End) -void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, +void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, + Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, - const ValueToValueMap &Strides, - PredicatedScalarEvolution &PSE) { - // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); + PredicatedScalarEvolution &PSE, + bool NeedsFreeze) { ScalarEvolution *SE = PSE.getSE(); const SCEV *ScStart; const SCEV *ScEnd; - if (SE->isLoopInvariant(Sc, Lp)) { - ScStart = ScEnd = Sc; + if (SE->isLoopInvariant(PtrExpr, Lp)) { + ScStart = ScEnd = PtrExpr; } else { - const SCEVAddRecExpr *AR = dyn_cast(Sc); + const SCEVAddRecExpr *AR = dyn_cast(PtrExpr); assert(AR && "Invalid addrec expression"); const SCEV *Ex = PSE.getBackedgeTakenCount(); @@ -227,15 +228,100 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, // Add the size of the pointed element to ScEnd. auto &DL = Lp->getHeader()->getModule()->getDataLayout(); Type *IdxTy = DL.getIndexType(Ptr->getType()); - const SCEV *EltSizeSCEV = - SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType()); + const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy); ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); - Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); + Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr, + NeedsFreeze); } -SmallVector -RuntimePointerChecking::generateChecks() const { +void RuntimePointerChecking::tryToCreateDiffCheck( + const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { + if (!CanUseDiffCheck) + return; + + // If either group contains multiple different pointers, bail out. + // TODO: Support multiple pointers by using the minimum or maximum pointer, + // depending on src & sink. + if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) { + CanUseDiffCheck = false; + return; + } + + PointerInfo *Src = &Pointers[CGI.Members[0]]; + PointerInfo *Sink = &Pointers[CGJ.Members[0]]; + + // If either pointer is read and written, multiple checks may be needed. Bail + // out. + if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() || + !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) { + CanUseDiffCheck = false; + return; + } + + ArrayRef AccSrc = + DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr); + ArrayRef AccSink = + DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr); + // If either pointer is accessed multiple times, there may not be a clear + // src/sink relation. Bail out for now. + if (AccSrc.size() != 1 || AccSink.size() != 1) { + CanUseDiffCheck = false; + return; + } + // If the sink is accessed before src, swap src/sink. 
+ if (AccSink[0] < AccSrc[0]) + std::swap(Src, Sink); + + auto *SrcAR = dyn_cast(Src->Expr); + auto *SinkAR = dyn_cast(Sink->Expr); + if (!SrcAR || !SinkAR) { + CanUseDiffCheck = false; + return; + } + + const DataLayout &DL = + SinkAR->getLoop()->getHeader()->getModule()->getDataLayout(); + SmallVector SrcInsts = + DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr); + SmallVector SinkInsts = + DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr); + Type *SrcTy = getLoadStoreType(SrcInsts[0]); + Type *DstTy = getLoadStoreType(SinkInsts[0]); + if (isa(SrcTy) || isa(DstTy)) + return; + unsigned AllocSize = + std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy)); + IntegerType *IntTy = + IntegerType::get(Src->PointerValue->getContext(), + DL.getPointerSizeInBits(CGI.AddressSpace)); + + // Only matching constant steps matching the AllocSize are supported at the + // moment. This simplifies the difference computation. Can be extended in the + // future. + auto *Step = dyn_cast(SinkAR->getStepRecurrence(*SE)); + if (!Step || Step != SrcAR->getStepRecurrence(*SE) || + Step->getAPInt().abs() != AllocSize) { + CanUseDiffCheck = false; + return; + } + + // When counting down, the dependence distance needs to be swapped. + if (Step->getValue()->isNegative()) + std::swap(SinkAR, SrcAR); + + const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy); + const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy); + if (isa(SinkStartInt) || + isa(SrcStartInt)) { + CanUseDiffCheck = false; + return; + } + DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize, + Src->NeedsFreeze || Sink->NeedsFreeze); +} + +SmallVector RuntimePointerChecking::generateChecks() { SmallVector Checks; for (unsigned I = 0; I < CheckingGroups.size(); ++I) { @@ -243,8 +329,10 @@ RuntimePointerChecking::generateChecks() const { const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; - if (needsChecking(CGI, CGJ)) + if (needsChecking(CGI, CGJ)) { + tryToCreateDiffCheck(CGI, CGJ); Checks.push_back(std::make_pair(&CGI, &CGJ)); + } } } return Checks; @@ -285,11 +373,12 @@ bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, return addPointer( Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End, RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), - *RtCheck.SE); + RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE); } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, const SCEV *End, unsigned AS, + bool NeedsFreeze, ScalarEvolution &SE) { assert(AddressSpace == AS && "all pointers in a checking group must be in the same address space"); @@ -314,6 +403,7 @@ bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, High = End; Members.push_back(Index); + this->NeedsFreeze |= NeedsFreeze; return true; } @@ -371,9 +461,11 @@ void RuntimePointerChecking::groupChecks( unsigned TotalComparisons = 0; - DenseMap PositionMap; - for (unsigned Index = 0; Index < Pointers.size(); ++Index) - PositionMap[Pointers[Index].PointerValue] = Index; + DenseMap> PositionMap; + for (unsigned Index = 0; Index < Pointers.size(); ++Index) { + auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}}); + Iter.first->second.push_back(Index); + } // We need to keep track of what pointers we've already seen so we // don't process them twice. 
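The tryToCreateDiffCheck path above trades the usual pair of interval-overlap comparisons for a single pointer-difference test whenever both checking groups contain exactly one pointer, each pointer is accessed exactly once, and both accesses are add-recurrences with the same constant step equal to the access size. A minimal standalone sketch of the two predicate shapes, assuming the emitted check tests that the sink trails the source by at least one vector's worth of bytes (the VF-based threshold and the exact comparison are assumptions for illustration; the emission itself is not part of this hunk):

    #include <cstdint>

    // Classic interval form: two comparisons per pointer pair.
    bool noConflictIntervals(uintptr_t Start1, uintptr_t End1,
                             uintptr_t Start2, uintptr_t End2) {
      return Start1 >= End2 || Start2 >= End1;
    }

    // Difference form recorded by tryToCreateDiffCheck: with matching
    // constant steps of AllocSize bytes, subtracting the ptrtoint'd starts
    // suffices (hypothetical VF threshold, not LLVM's actual emitted IR).
    bool noConflictDiff(uintptr_t SrcStart, uintptr_t SinkStart,
                        uint64_t AllocSize, uint64_t VF) {
      return SinkStart - SrcStart >= VF * AllocSize;
    }

The NeedsFreeze bit carried alongside each recorded DiffCheck flags starts derived from select operands, which may be undef or poison and therefore must be frozen before feeding a comparison like the one sketched here.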
@@ -404,34 +496,35 @@ void RuntimePointerChecking::groupChecks( auto PointerI = PositionMap.find(MI->getPointer()); assert(PointerI != PositionMap.end() && "pointer in equivalence class not found in PositionMap"); - unsigned Pointer = PointerI->second; - bool Merged = false; - // Mark this pointer as seen. - Seen.insert(Pointer); - - // Go through all the existing sets and see if we can find one - // which can include this pointer. - for (RuntimeCheckingPtrGroup &Group : Groups) { - // Don't perform more than a certain amount of comparisons. - // This should limit the cost of grouping the pointers to something - // reasonable. If we do end up hitting this threshold, the algorithm - // will create separate groups for all remaining pointers. - if (TotalComparisons > MemoryCheckMergeThreshold) - break; - - TotalComparisons++; - - if (Group.addPointer(Pointer, *this)) { - Merged = true; - break; + for (unsigned Pointer : PointerI->second) { + bool Merged = false; + // Mark this pointer as seen. + Seen.insert(Pointer); + + // Go through all the existing sets and see if we can find one + // which can include this pointer. + for (RuntimeCheckingPtrGroup &Group : Groups) { + // Don't perform more than a certain amount of comparisons. + // This should limit the cost of grouping the pointers to something + // reasonable. If we do end up hitting this threshold, the algorithm + // will create separate groups for all remaining pointers. + if (TotalComparisons > MemoryCheckMergeThreshold) + break; + + TotalComparisons++; + + if (Group.addPointer(Pointer, *this)) { + Merged = true; + break; + } } - } - if (!Merged) - // We couldn't add this pointer to any existing set or the threshold - // for the number of comparisons has been reached. Create a new group - // to hold the current pointer. - Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this)); + if (!Merged) + // We couldn't add this pointer to any existing set or the threshold + // for the number of comparisons has been reached. Create a new group + // to hold the current pointer. + Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this)); + } } // We've computed the grouped checks for this partition. @@ -522,19 +615,19 @@ public: : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {} /// Register a load and whether it is only read from. - void addLoad(MemoryLocation &Loc, bool IsReadOnly) { + void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { Value *Ptr = const_cast(Loc.Ptr); AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, false)); + Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); } /// Register a store. - void addStore(MemoryLocation &Loc) { + void addStore(MemoryLocation &Loc, Type *AccessTy) { Value *Ptr = const_cast(Loc.Ptr); AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, true)); + Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy); } /// Check if we can emit a run-time no-alias check for \p Access. @@ -545,12 +638,11 @@ public: /// we will attempt to use additional run-time checks in order to get /// the bounds of the pointer. 
bool createCheckForAccess(RuntimePointerChecking &RtCheck, - MemAccessInfo Access, + MemAccessInfo Access, Type *AccessTy, const ValueToValueMap &Strides, DenseMap &DepSetId, Loop *TheLoop, unsigned &RunningDepId, - unsigned ASId, bool ShouldCheckStride, - bool Assume); + unsigned ASId, bool ShouldCheckStride, bool Assume); /// Check whether we can check the pointers at runtime for /// non-intersection. @@ -559,7 +651,7 @@ public: /// (i.e. the pointers have computable bounds). bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &Strides, - bool ShouldCheckWrap = false); + Value *&UncomputablePtr, bool ShouldCheckWrap = false); /// Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. @@ -583,14 +675,15 @@ public: MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; } private: - typedef SetVector PtrAccessSet; + typedef MapVector> PtrAccessMap; /// Go over all memory access and check whether runtime pointer checks /// are needed and build sets of dependency check candidates. void processMemAccesses(); - /// Set of all accesses. - PtrAccessSet Accesses; + /// Map of all accesses. Values are the types used to access memory pointed to + /// by the pointer. + PtrAccessMap Accesses; /// The loop being checked. const Loop *TheLoop; @@ -630,11 +723,8 @@ private: /// Check whether a pointer can participate in a runtime bounds check. /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr /// by adding run-time checks (overflow checks) if necessary. -static bool hasComputableBounds(PredicatedScalarEvolution &PSE, - const ValueToValueMap &Strides, Value *Ptr, - Loop *L, bool Assume) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); - +static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, + const SCEV *PtrScev, Loop *L, bool Assume) { // The bounds for loop-invariant pointer is trivial. if (PSE.getSE()->isLoopInvariant(PtrScev, L)) return true; @@ -652,12 +742,12 @@ static bool hasComputableBounds(PredicatedScalarEvolution &PSE, /// Check whether a pointer address cannot wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, - const ValueToValueMap &Strides, Value *Ptr, Loop *L) { + const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy, + Loop *L) { const SCEV *PtrScev = PSE.getSCEV(Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, L)) return true; - Type *AccessTy = Ptr->getType()->getPointerElementType(); int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides); if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) return true; @@ -689,7 +779,7 @@ static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, } bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, - MemAccessInfo Access, + MemAccessInfo Access, Type *AccessTy, const ValueToValueMap &StridesMap, DenseMap &DepSetId, Loop *TheLoop, unsigned &RunningDepId, @@ -697,42 +787,75 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, bool Assume) { Value *Ptr = Access.getPointer(); - if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume)) - return false; + ScalarEvolution &SE = *PSE.getSE(); + SmallVector> TranslatedPtrs; + auto *SI = dyn_cast(Ptr); + // Look through selects in the current loop. 
+ if (SI && !TheLoop->isLoopInvariant(SI)) { + TranslatedPtrs = { + std::make_pair(SE.getSCEV(SI->getOperand(1)), + !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(1))), + std::make_pair(SE.getSCEV(SI->getOperand(2)), + !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(2)))}; + } else + TranslatedPtrs = { + std::make_pair(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false)}; - // When we run after a failing dependency check we have to make sure - // we don't have wrapping pointers. - if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) { - auto *Expr = PSE.getSCEV(Ptr); - if (!Assume || !isa(Expr)) + for (auto &P : TranslatedPtrs) { + const SCEV *PtrExpr = P.first; + if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume)) return false; - PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); + + // When we run after a failing dependency check we have to make sure + // we don't have wrapping pointers. + if (ShouldCheckWrap) { + // Skip wrap checking when translating pointers. + if (TranslatedPtrs.size() > 1) + return false; + + if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) { + auto *Expr = PSE.getSCEV(Ptr); + if (!Assume || !isa(Expr)) + return false; + PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); + } + } + // If there's only one option for Ptr, look it up after bounds and wrap + // checking, because assumptions might have been added to PSE. + if (TranslatedPtrs.size() == 1) + TranslatedPtrs[0] = std::make_pair( + replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false); } - // The id of the dependence set. - unsigned DepId; + for (auto &P : TranslatedPtrs) { + const SCEV *PtrExpr = P.first; - if (isDependencyCheckNeeded()) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; + // The id of the dependence set. + unsigned DepId; - bool IsWrite = Access.getInt(); - RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE); - LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); + if (isDependencyCheckNeeded()) { + Value *Leader = DepCands.getLeaderValue(Access).getPointer(); + unsigned &LeaderId = DepSetId[Leader]; + if (!LeaderId) + LeaderId = RunningDepId++; + DepId = LeaderId; + } else + // Each access has its own dependence set. + DepId = RunningDepId++; + + bool IsWrite = Access.getInt(); + RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE, + P.second); + LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); + } return true; - } +} bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap, - bool ShouldCheckWrap) { + Value *&UncomputablePtr, bool ShouldCheckWrap) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. 
bool CanDoRT = true; @@ -788,12 +911,15 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, } for (auto &Access : AccessInfos) { - if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop, - RunningDepId, ASId, ShouldCheckWrap, false)) { - LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" - << *Access.getPointer() << '\n'); - Retries.push_back(Access); - CanDoAliasSetRT = false; + for (auto &AccessTy : Accesses[Access]) { + if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, + DepSetId, TheLoop, RunningDepId, ASId, + ShouldCheckWrap, false)) { + LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" + << *Access.getPointer() << '\n'); + Retries.push_back(Access); + CanDoAliasSetRT = false; + } } } @@ -815,13 +941,17 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // We know that we need these checks, so we can now be more aggressive // and add further checks if required (overflow checks). CanDoAliasSetRT = true; - for (auto Access : Retries) - if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, - TheLoop, RunningDepId, ASId, - ShouldCheckWrap, /*Assume=*/true)) { - CanDoAliasSetRT = false; - break; + for (auto Access : Retries) { + for (auto &AccessTy : Accesses[Access]) { + if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, + DepSetId, TheLoop, RunningDepId, ASId, + ShouldCheckWrap, /*Assume=*/true)) { + CanDoAliasSetRT = false; + UncomputablePtr = Access.getPointer(); + break; + } } + } } CanDoRT &= CanDoAliasSetRT; @@ -886,9 +1016,12 @@ void AccessAnalysis::processMemAccesses() { LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n"); LLVM_DEBUG({ for (auto A : Accesses) - dbgs() << "\t" << *A.getPointer() << " (" << - (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? - "read-only" : "read")) << ")\n"; + dbgs() << "\t" << *A.first.getPointer() << " (" + << (A.first.getInt() + ? "write" + : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only" + : "read")) + << ")\n"; }); // The AliasSetTracker has nicely partitioned our pointers by metadata @@ -907,13 +1040,13 @@ void AccessAnalysis::processMemAccesses() { UnderlyingObjToAccessMap ObjToLastAccess; // Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; + PtrAccessMap DeferredAccesses; // Iterate over each alias set twice, once to process read/write pointers, // and then to process read-only pointers. for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { bool UseDeferred = SetIteration > 0; - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; + PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; for (const auto &AV : AS) { Value *Ptr = AV.getValue(); @@ -921,10 +1054,10 @@ void AccessAnalysis::processMemAccesses() { // For a single memory access in AliasSetTracker, Accesses may contain // both read and write, and they both need to be handled for CheckDeps. for (const auto &AC : S) { - if (AC.getPointer() != Ptr) + if (AC.first.getPointer() != Ptr) continue; - bool IsWrite = AC.getInt(); + bool IsWrite = AC.first.getInt(); // If we're using the deferred access set, then it contains only // reads. @@ -946,7 +1079,9 @@ void AccessAnalysis::processMemAccesses() { // consecutive as "read-only" pointers (so that we check // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); + // We only use the pointer keys, the types vector values don't + // matter. 
+ DeferredAccesses.insert({Access, {}}); continue; } @@ -1445,13 +1580,13 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV *CastedDist = &Dist; const SCEV *CastedProduct = Product; - uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType()); - uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType()); + uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType()); + uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType()); // The dependence distance can be positive/negative, so we sign extend Dist; // The multiplication of the absolute stride in bytes and the // backedgeTakenCount is non-negative, so we zero extend Product. - if (DistTypeSize > ProductTypeSize) + if (DistTypeSizeBits > ProductTypeSizeBits) CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType()); else CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType()); @@ -1518,8 +1653,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, Value *BPtr = B.getPointer(); bool AIsWrite = A.getInt(); bool BIsWrite = B.getInt(); - Type *ATy = APtr->getType()->getPointerElementType(); - Type *BTy = BPtr->getType()->getPointerElementType(); + Type *ATy = getLoadStoreType(InstMap[AIdx]); + Type *BTy = getLoadStoreType(InstMap[BIdx]); // Two reads are independent. if (!AIsWrite && !BIsWrite) @@ -1842,8 +1977,6 @@ bool LoopAccessInfo::canAnalyzeLoop() { void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, DominatorTree *DT) { - typedef SmallPtrSet ValueSet; - // Holds the Load and Store instructions. SmallVector Loads; SmallVector Stores; @@ -1975,22 +2108,26 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // for read and once for write, it will only appear once (on the write // list). This is okay, since we are going to check for conflicts between // writes and between reads and writes, but not between reads and reads. - ValueSet Seen; + SmallSet, 16> Seen; // Record uniform store addresses to identify if we have multiple stores // to the same address. - ValueSet UniformStores; + SmallPtrSet UniformStores; for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr).second) { + Type *AccessTy = getLoadStoreType(ST); + if (Seen.insert({Ptr, AccessTy}).second) { ++NumReadWrites; MemoryLocation Loc = MemoryLocation::get(ST); @@ -2001,9 +2138,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, Loc.AATags.TBAA = nullptr; visitPointers(const_cast(Loc.Ptr), *TheLoop, - [&Accesses, Loc](Value *Ptr) { + [&Accesses, AccessTy, Loc](Value *Ptr) { MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr); - Accesses.addStore(NewLoc); + Accesses.addStore(NewLoc, AccessTy); }); } } @@ -2027,7 +2164,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // read a few words, modify, and write a few words, and some of the // words may be written to the same address. 
bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || + Type *AccessTy = getLoadStoreType(LD); + if (Seen.insert({Ptr, AccessTy}).second || !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides)) { ++NumReads; IsReadOnlyPtr = true; @@ -2049,9 +2187,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, Loc.AATags.TBAA = nullptr; visitPointers(const_cast(Loc.Ptr), *TheLoop, - [&Accesses, Loc, IsReadOnlyPtr](Value *Ptr) { + [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr); - Accesses.addLoad(NewLoc, IsReadOnlyPtr); + Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr); }); } @@ -2069,10 +2207,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), - TheLoop, SymbolicStrides); + Value *UncomputablePtr = nullptr; + bool CanDoRTIfNeeded = + Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop, + SymbolicStrides, UncomputablePtr, false); if (!CanDoRTIfNeeded) { - recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds"; + auto *I = dyn_cast_or_null(UncomputablePtr); + recordAnalysis("CantIdentifyArrayBounds", I) + << "cannot identify array bounds"; LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " << "the array bounds.\n"); CanVecMem = false; @@ -2099,12 +2241,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, PtrRtChecking->Need = true; auto *SE = PSE->getSE(); - CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop, - SymbolicStrides, true); + UncomputablePtr = nullptr; + CanDoRTIfNeeded = Accesses.canCheckPtrAtRT( + *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true); // Check that we found the bounds for the pointer. if (!CanDoRTIfNeeded) { - recordAnalysis("CantCheckMemDepsAtRunTime") + auto *I = dyn_cast_or_null(UncomputablePtr); + recordAnalysis("CantCheckMemDepsAtRunTime", I) << "cannot check memory dependencies at runtime"; LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n"); CanVecMem = false; @@ -2129,13 +2273,61 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, dbgs() << "LAA: No unsafe dependent memory operations in loop. We" << (PtrRtChecking->Need ? "" : " don't") << " need runtime memory checks.\n"); - else { - recordAnalysis("UnsafeMemDep") - << "unsafe dependent memory operations in loop. Use " - "#pragma loop distribute(enable) to allow loop distribution " - "to attempt to isolate the offending operations into a separate " - "loop"; - LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n"); + else + emitUnsafeDependenceRemark(); +} + +void LoopAccessInfo::emitUnsafeDependenceRemark() { + auto Deps = getDepChecker().getDependences(); + if (!Deps) + return; + auto Found = std::find_if( + Deps->begin(), Deps->end(), [](const MemoryDepChecker::Dependence &D) { + return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) != + MemoryDepChecker::VectorizationSafetyStatus::Safe; + }); + if (Found == Deps->end()) + return; + MemoryDepChecker::Dependence Dep = *Found; + + LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n"); + + // Emit remark for first unsafe dependence + OptimizationRemarkAnalysis &R = + recordAnalysis("UnsafeDep", Dep.getDestination(*this)) + << "unsafe dependent memory operations in loop. 
Use " + "#pragma loop distribute(enable) to allow loop distribution " + "to attempt to isolate the offending operations into a separate " + "loop"; + + switch (Dep.Type) { + case MemoryDepChecker::Dependence::NoDep: + case MemoryDepChecker::Dependence::Forward: + case MemoryDepChecker::Dependence::BackwardVectorizable: + llvm_unreachable("Unexpected dependence"); + case MemoryDepChecker::Dependence::Backward: + R << "\nBackward loop carried data dependence."; + break; + case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: + R << "\nForward loop carried data dependence that prevents " + "store-to-load forwarding."; + break; + case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: + R << "\nBackward loop carried data dependence that prevents " + "store-to-load forwarding."; + break; + case MemoryDepChecker::Dependence::Unknown: + R << "\nUnknown data dependence."; + break; + } + + if (Instruction *I = Dep.getSource(*this)) { + DebugLoc SourceLoc = I->getDebugLoc(); + if (auto *DD = dyn_cast_or_null(getPointerOperand(I))) + SourceLoc = DD->getDebugLoc(); + if (SourceLoc) + R << " Memory location is the same as accessed at " + << ore::NV("Location", SourceLoc); } } @@ -2212,12 +2404,12 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { // The Stride can be positive/negative, so we sign extend Stride; // The backedgeTakenCount is non-negative, so we zero extend BETakenCount. const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); - uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType()); - uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType()); + uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType()); + uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType()); const SCEV *CastedStride = StrideExpr; const SCEV *CastedBECount = BETakenCount; ScalarEvolution *SE = PSE->getSE(); - if (BETypeSize >= StrideTypeSize) + if (BETypeSizeBits >= StrideTypeSizeBits) CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType()); else CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType()); @@ -2232,7 +2424,7 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { "at most once.\n"); return; } - LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version."); + LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n"); SymbolicStrides[Ptr] = Stride; StrideSet.insert(Stride); @@ -2242,10 +2434,12 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI) : PSE(std::make_unique(*SE, *L)), - PtrRtChecking(std::make_unique(SE)), + PtrRtChecking(nullptr), DepChecker(std::make_unique(*PSE, L)), TheLoop(L) { - if (canAnalyzeLoop()) + PtrRtChecking = std::make_unique(*DepChecker, SE); + if (canAnalyzeLoop()) { analyzeLoop(AA, LI, TLI, DT); + } } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -2283,7 +2477,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { << "found in loop.\n"; OS.indent(Depth) << "SCEV assumptions:\n"; - PSE->getUnionPredicate().print(OS, Depth); + PSE->getPredicate().print(OS, Depth); OS << "\n"; @@ -2301,7 +2495,7 @@ const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) { if (!LAI) LAI = std::make_unique(L, SE, TLI, AA, DT, LI); - return *LAI.get(); + return *LAI; } void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const { diff --git 
a/llvm/lib/Analysis/LoopAnalysisManager.cpp b/llvm/lib/Analysis/LoopAnalysisManager.cpp index 4d6f8a64329a..8d71b31ca393 100644 --- a/llvm/lib/Analysis/LoopAnalysisManager.cpp +++ b/llvm/lib/Analysis/LoopAnalysisManager.cpp @@ -8,12 +8,9 @@ #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManagerImpl.h" diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index ba014bd08c98..2cbf1f7f2d28 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize, return StepRec == &ElemSize; } -/// Compute the trip count for the given loop \p L. Return the SCEV expression -/// for the trip count or nullptr if it cannot be computed. -static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) { +/// Compute the trip count for the given loop \p L or assume a default value if +/// it is not a compile time constant. Return the SCEV expression for the trip +/// count. +static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize, + ScalarEvolution &SE) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); - if (isa(BackedgeTakenCount) || - !isa(BackedgeTakenCount)) - return nullptr; - return SE.getTripCountFromExitCount(BackedgeTakenCount); + const SCEV *TripCount = (!isa(BackedgeTakenCount) && + isa(BackedgeTakenCount)) + ? SE.getTripCountFromExitCount(BackedgeTakenCount) + : nullptr; + + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed, using DefaultTripCount\n"); + TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount); + } + + return TripCount; } //===----------------------------------------------------------------------===// @@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, return 1; } - const SCEV *TripCount = computeTripCount(L, SE); - if (!TripCount) { - LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() - << " could not be computed, using DefaultTripCount\n"); - const SCEV *ElemSize = Sizes.back(); - TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount); - } + const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE); + assert(TripCount && "Expecting valid TripCount"); LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); - // If the indexed reference is 'consecutive' the cost is - // (TripCount*Stride)/CLS, otherwise the cost is TripCount. - const SCEV *RefCost = TripCount; - + const SCEV *RefCost = nullptr; if (isConsecutive(L, CLS)) { + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS. 
const SCEV *Coeff = getLastCoefficient(); const SCEV *ElemSize = Sizes.back(); + assert(Coeff->getType() == ElemSize->getType() && + "Expecting the same type"); const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType()); const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS); @@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, LLVM_DEBUG(dbgs().indent(4) << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" << *RefCost << "\n"); - } else + } else { + // If the indexed reference is not 'consecutive' the cost is proportional to + // the trip count and the depth of the dimension which the subject loop + // subscript is accessing. We try to estimate this by multiplying the cost + // by the trip counts of loops corresponding to the inner dimensions. For + // example, given the indexed reference 'A[i][j][k]', and assuming the + // i-loop is in the innermost position, the cost would be equal to the + // iterations of the i-loop multiplied by iterations of the j-loop. + RefCost = TripCount; + + int Index = getSubscriptIndex(L); + assert(Index >= 0 && "Cound not locate a valid Index"); + + for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) { + const SCEVAddRecExpr *AR = dyn_cast(getSubscript(I)); + assert(AR && AR->getLoop() && "Expecting valid loop"); + const SCEV *TripCount = + computeTripCount(*AR->getLoop(), *Sizes.back(), SE); + Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); + RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), + SE.getNoopOrAnyExtend(TripCount, WiderType)); + } + LLVM_DEBUG(dbgs().indent(4) - << "Access is not consecutive: RefCost=TripCount=" << *RefCost - << "\n"); + << "Access is not consecutive: RefCost=" << *RefCost << "\n"); + } + assert(RefCost && "Expecting a valid RefCost"); // Attempt to fold RefCost into a constant. if (auto ConstantCost = dyn_cast(RefCost)) @@ -319,6 +348,26 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, return CacheCost::InvalidCost; } +bool IndexedReference::tryDelinearizeFixedSize( + const SCEV *AccessFn, SmallVectorImpl &Subscripts) { + SmallVector ArraySizes; + if (!tryDelinearizeFixedSizeImpl(&SE, &StoreOrLoadInst, AccessFn, Subscripts, + ArraySizes)) + return false; + + // Populate Sizes with scev expressions to be used in calculations later. + for (auto Idx : seq(1, Subscripts.size())) + Sizes.push_back( + SE.getConstant(Subscripts[Idx]->getType(), ArraySizes[Idx - 1])); + + LLVM_DEBUG({ + dbgs() << "Delinearized subscripts of fixed-size array\n" + << "GEP:" << *getLoadStorePointerOperand(&StoreOrLoadInst) + << "\n"; + }); + return true; +} + bool IndexedReference::delinearize(const LoopInfo &LI) { assert(Subscripts.empty() && "Subscripts should be empty"); assert(Sizes.empty() && "Sizes should be empty"); @@ -340,13 +389,25 @@ bool IndexedReference::delinearize(const LoopInfo &LI) { return false; } - AccessFn = SE.getMinusSCEV(AccessFn, BasePointer); + bool IsFixedSize = false; + // Try to delinearize fixed-size arrays. + if (tryDelinearizeFixedSize(AccessFn, Subscripts)) { + IsFixedSize = true; + // The last element of Sizes is the element size. 
+ Sizes.push_back(ElemSize); + LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName() + << "', AccessFn: " << *AccessFn << "\n"); + } - LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName() - << "', AccessFn: " << *AccessFn << "\n"); + AccessFn = SE.getMinusSCEV(AccessFn, BasePointer); - llvm::delinearize(SE, AccessFn, Subscripts, Sizes, - SE.getElementSize(&StoreOrLoadInst)); + // Try to delinearize parametric-size arrays. + if (!IsFixedSize) { + LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName() + << "', AccessFn: " << *AccessFn << "\n"); + llvm::delinearize(SE, AccessFn, Subscripts, Sizes, + SE.getElementSize(&StoreOrLoadInst)); + } if (Subscripts.empty() || Sizes.empty() || Subscripts.size() != Sizes.size()) { @@ -424,6 +485,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const { return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); } +int IndexedReference::getSubscriptIndex(const Loop &L) const { + for (auto Idx : seq(0, getNumSubscripts())) { + const SCEVAddRecExpr *AR = dyn_cast(getSubscript(Idx)); + if (AR && AR->getLoop() == &L) { + return Idx; + } + } + return -1; +} + const SCEV *IndexedReference::getLastCoefficient() const { const SCEV *LastSubscript = getLastSubscript(); auto *AR = cast(LastSubscript); @@ -550,7 +621,7 @@ bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const { bool Added = false; for (ReferenceGroupTy &RefGroup : RefGroups) { - const IndexedReference &Representative = *RefGroup.front().get(); + const IndexedReference &Representative = *RefGroup.front(); LLVM_DEBUG({ dbgs() << "References:\n"; dbgs().indent(2) << *R << "\n"; @@ -574,8 +645,8 @@ bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const { Optional HasSpacialReuse = R->hasSpacialReuse(Representative, CLS, AA); - if ((HasTemporalReuse.hasValue() && *HasTemporalReuse) || - (HasSpacialReuse.hasValue() && *HasSpacialReuse)) { + if ((HasTemporalReuse && *HasTemporalReuse) || + (HasSpacialReuse && *HasSpacialReuse)) { RefGroup.push_back(std::move(R)); Added = true; break; diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index b161c490a6bc..29c2437ff5ea 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/IVDescriptors.h" @@ -30,7 +29,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -38,9 +36,7 @@ #include "llvm/IR/PrintPasses.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; // Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops. @@ -740,6 +736,7 @@ void UnloopUpdater::updateBlockParents() { bool Changed = FoundIB; for (unsigned NIters = 0; Changed; ++NIters) { assert(NIters < Unloop.getNumBlocks() && "runaway iterative algorithm"); + (void) NIters; // Iterate over the postorder list of blocks, propagating the nearest loop // from successors to predecessors as before. 
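Alongside the functional changes, this import converts llvm::Optional call sites from hasValue()/getValue()/getValueOr() to the std::optional-style spellings, as in the getEdgeValue hunk earlier and the loop-attribute hunk that follows. A minimal sketch of the mapping, using a hypothetical helper (names here are illustrative only):

    #include "llvm/ADT/Optional.h"
    using llvm::Optional;

    static int pick(Optional<int> A, Optional<int> B) {
      if (!A)                 // was: !A.hasValue()
        return B.value_or(0); // was: B.getValueOr(0)
      return *A;              // was: A.getValue()
    }

The contextual-bool, operator*, and value_or forms all exist on llvm::Optional in this tree, as the converted call sites in this patch show, which keeps the code source-compatible with a later switch to std::optional.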
@@ -1085,13 +1082,13 @@ Optional llvm::getOptionalBoolLoopAttribute(const Loop *TheLoop, } bool llvm::getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) { - return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false); + return getOptionalBoolLoopAttribute(TheLoop, Name).value_or(false); } llvm::Optional llvm::getOptionalIntLoopAttribute(const Loop *TheLoop, StringRef Name) { const MDOperand *AttrMD = - findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr); + findStringMetadataForLoop(TheLoop, Name).value_or(nullptr); if (!AttrMD) return None; @@ -1104,7 +1101,7 @@ llvm::Optional llvm::getOptionalIntLoopAttribute(const Loop *TheLoop, int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default) { - return getOptionalIntLoopAttribute(TheLoop, Name).getValueOr(Default); + return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default); } bool llvm::isFinite(const Loop *L) { diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp index 675bb7a7749c..bff796f339ab 100644 --- a/llvm/lib/Analysis/LoopNestAnalysis.cpp +++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp @@ -13,8 +13,7 @@ #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/ADT/BreadthFirstIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/PostDominators.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Analysis/ValueTracking.h" using namespace llvm; diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp index b720bab454e9..5d824aece488 100644 --- a/llvm/lib/Analysis/LoopPass.cpp +++ b/llvm/lib/Analysis/LoopPass.cpp @@ -13,14 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/OptBisect.h" -#include "llvm/IR/PassManager.h" #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/PrintPasses.h" -#include "llvm/IR/StructuralHash.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" @@ -192,12 +190,12 @@ bool LPPassManager::runOnFunction(Function &F) { PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader()); TimeRegion PassTimer(getPassTimer(P)); #ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(F); + uint64_t RefHash = P->structuralHash(F); #endif LocalChanged = P->runOnLoop(CurrentLoop, *this); #ifdef EXPENSIVE_CHECKS - if (!LocalChanged && (RefHash != StructuralHash(F))) { + if (!LocalChanged && (RefHash != P->structuralHash(F))) { llvm::errs() << "Pass modifies its input and doesn't report it: " << P->getPassName() << "\n"; llvm_unreachable("Pass modifies its input and doesn't report it"); diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp index 15095d67d385..84f1eff9a732 100644 --- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -13,7 +13,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/Operator.h" using namespace llvm; @@ -84,9 +87,9 @@ bool UnrolledInstAnalyzer::visitBinaryOperator(BinaryOperator &I) { const DataLayout &DL = I.getModule()->getDataLayout(); if (auto FI 
= dyn_cast(&I)) SimpleV = - SimplifyBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); + simplifyBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); else - SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); + SimpleV = simplifyBinOp(I.getOpcode(), LHS, RHS, DL); if (SimpleV) { SimplifiedValues[&I] = SimpleV; @@ -155,7 +158,7 @@ bool UnrolledInstAnalyzer::visitCastInst(CastInst &I) { // i32 0). if (CastInst::castIsValid(I.getOpcode(), Op, I.getType())) { const DataLayout &DL = I.getModule()->getDataLayout(); - if (Value *V = SimplifyCastInst(I.getOpcode(), Op, I.getType(), DL)) { + if (Value *V = simplifyCastInst(I.getOpcode(), Op, I.getType(), DL)) { SimplifiedValues[&I] = V; return true; } @@ -192,7 +195,7 @@ bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) { } const DataLayout &DL = I.getModule()->getDataLayout(); - if (Value *V = SimplifyCmpInst(I.getPredicate(), LHS, RHS, DL)) { + if (Value *V = simplifyCmpInst(I.getPredicate(), LHS, RHS, DL)) { SimplifiedValues[&I] = V; return true; } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 0480c1cd2842..f55de71ea98a 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -13,30 +13,25 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/MLInlineAdvisor.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InlineModelFeatureMaps.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/ReleaseModeModelRunner.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Config/config.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Path.h" - -#include -#include -#include using namespace llvm; #if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL) +#include "llvm/Analysis/ReleaseModeModelRunner.h" // codegen-ed file #include "InlinerSizeModel.h" // NOLINT @@ -44,7 +39,7 @@ std::unique_ptr llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { auto AOTRunner = std::make_unique>( - M.getContext(), FeatureNameMap, DecisionName); + M.getContext(), FeatureMap, DecisionName); return std::make_unique(M, MAM, std::move(AOTRunner)); } #endif @@ -57,15 +52,21 @@ static cl::opt SizeIncreaseThreshold( "blocking any further inlining."), cl::init(2.0)); +static cl::opt KeepFPICache( + "ml-advisor-keep-fpi-cache", cl::Hidden, + cl::desc( + "For test - keep the ML Inline advisor's FunctionPropertiesInfo cache"), + cl::init(false)); + // clang-format off -const std::array llvm::FeatureNameMap{ +const std::array llvm::FeatureMap{ +#define POPULATE_NAMES(_, NAME) TensorSpec::createSpec(NAME, {1} ), // InlineCost features - these must come first -#define POPULATE_NAMES(INDEX_NAME, NAME) NAME, INLINE_COST_FEATURE_ITERATOR(POPULATE_NAMES) #undef POPULATE_NAMES // Non-cost features -#define POPULATE_NAMES(INDEX_NAME, NAME, COMMENT) NAME, +#define POPULATE_NAMES(_, NAME, __) TensorSpec::createSpec(NAME, {1} ), INLINE_FEATURE_ITERATOR(POPULATE_NAMES) #undef POPULATE_NAMES }; @@ -138,7 
+139,10 @@ unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const { return CG.lookup(F) ? FunctionLevels.at(CG.lookup(F)) : 0; } -void MLInlineAdvisor::onPassEntry() { +void MLInlineAdvisor::onPassEntry(LazyCallGraph::SCC *LastSCC) { + if (!LastSCC || ForceStop) + return; + FPICache.clear(); // Function passes executed between InlinerPass runs may have changed the // module-wide features. // The cgscc pass manager rules are such that: @@ -154,8 +158,8 @@ void MLInlineAdvisor::onPassEntry() { // care about the nature of the Edge (call or ref). NodeCount -= static_cast(NodesInLastSCC.size()); while (!NodesInLastSCC.empty()) { - const auto *N = NodesInLastSCC.front(); - NodesInLastSCC.pop_front(); + const auto *N = *NodesInLastSCC.begin(); + NodesInLastSCC.erase(N); // The Function wrapped by N could have been deleted since we last saw it. if (N->isDead()) { assert(!N->getFunction().isDeclaration()); @@ -168,34 +172,52 @@ void MLInlineAdvisor::onPassEntry() { assert(!AdjNode->isDead() && !AdjNode->getFunction().isDeclaration()); auto I = AllNodes.insert(AdjNode); if (I.second) - NodesInLastSCC.push_back(AdjNode); + NodesInLastSCC.insert(AdjNode); } } EdgeCount -= EdgesOfLastSeenNodes; EdgesOfLastSeenNodes = 0; + + // (Re)use NodesInLastSCC to remember the nodes in the SCC right now, + // in case the SCC is split before onPassExit and some nodes are split out + assert(NodesInLastSCC.empty()); + for (const auto &N : *LastSCC) + NodesInLastSCC.insert(&N); } void MLInlineAdvisor::onPassExit(LazyCallGraph::SCC *LastSCC) { - if (!LastSCC) + // No need to keep this around - function passes will invalidate it. + if (!KeepFPICache) + FPICache.clear(); + if (!LastSCC || ForceStop) return; // Keep track of the nodes and edges we last saw. Then, in onPassEntry, // we update the node count and edge count from the subset of these nodes that // survived. - assert(NodesInLastSCC.empty()); - assert(NodeCount >= LastSCC->size()); EdgesOfLastSeenNodes = 0; + + // Check on nodes that were in SCC onPassEntry + for (auto I = NodesInLastSCC.begin(); I != NodesInLastSCC.end();) { + if ((*I)->isDead()) + NodesInLastSCC.erase(*I++); + else + EdgesOfLastSeenNodes += getLocalCalls((*I++)->getFunction()); + } + + // Check on nodes that may have got added to SCC for (const auto &N : *LastSCC) { assert(!N.isDead()); - EdgesOfLastSeenNodes += getLocalCalls(N.getFunction()); - NodesInLastSCC.push_back(&N); + auto I = NodesInLastSCC.insert(&N); + if (I.second) + EdgesOfLastSeenNodes += getLocalCalls(N.getFunction()); } + assert(NodeCount >= NodesInLastSCC.size()); assert(EdgeCount >= EdgesOfLastSeenNodes); } int64_t MLInlineAdvisor::getLocalCalls(Function &F) { - return FAM.getResult(F) - .DirectCallsToDefinedFunctions; + return getCachedFPI(F).DirectCallsToDefinedFunctions; } // Update the internal state of the advisor, and force invalidate feature @@ -208,13 +230,15 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, assert(!ForceStop); Function *Caller = Advice.getCaller(); Function *Callee = Advice.getCallee(); - // The caller features aren't valid anymore. { PreservedAnalyses PA = PreservedAnalyses::all(); PA.abandon(); + PA.abandon(); + PA.abandon(); FAM.invalidate(*Caller, PA); } + Advice.updateCachedCallerFPI(FAM); int64_t IRSizeAfter = getIRSize(*Caller) + (CalleeWasDeleted ? 
0 : Advice.CalleeIRSize); CurrentIRSize += IRSizeAfter - (Advice.CallerIRSize + Advice.CalleeIRSize); @@ -227,15 +251,13 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, // For edges, we 'forget' the edges that the caller and callee used to have // before inlining, and add back what they currently have together. int64_t NewCallerAndCalleeEdges = - FAM.getResult(*Caller) - .DirectCallsToDefinedFunctions; + getCachedFPI(*Caller).DirectCallsToDefinedFunctions; if (CalleeWasDeleted) --NodeCount; else NewCallerAndCalleeEdges += - FAM.getResult(*Callee) - .DirectCallsToDefinedFunctions; + getCachedFPI(*Callee).DirectCallsToDefinedFunctions; EdgeCount += (NewCallerAndCalleeEdges - Advice.CallerAndCalleeEdges); assert(CurrentIRSize >= 0 && EdgeCount >= 0 && NodeCount >= 0); } @@ -248,7 +270,19 @@ int64_t MLInlineAdvisor::getModuleIRSize() const { return Ret; } +FunctionPropertiesInfo &MLInlineAdvisor::getCachedFPI(Function &F) const { + auto InsertPair = + FPICache.insert(std::make_pair(&F, FunctionPropertiesInfo())); + if (!InsertPair.second) + return InsertPair.first->second; + InsertPair.first->second = FAM.getResult(F); + return InsertPair.first->second; +} + std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { + if (auto Skip = getSkipAdviceIfUnreachableCallsite(CB)) + return Skip; + auto &Caller = *CB.getCaller(); auto &Callee = *CB.getCalledFunction(); @@ -307,8 +341,8 @@ std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { NrCtantParams += (isa(*I)); } - auto &CallerBefore = FAM.getResult(Caller); - auto &CalleeBefore = FAM.getResult(Callee); + auto &CallerBefore = getCachedFPI(Caller); + auto &CalleeBefore = getCachedFPI(Callee); *ModelRunner->getTensor(FeatureIndex::CalleeBasicBlockCount) = CalleeBefore.BasicBlockCount; @@ -348,9 +382,19 @@ MLInlineAdvisor::getAdviceFromModel(CallBase &CB, this, CB, ORE, static_cast(ModelRunner->evaluate())); } +std::unique_ptr +MLInlineAdvisor::getSkipAdviceIfUnreachableCallsite(CallBase &CB) { + if (!FAM.getResult(*CB.getCaller()) + .isReachableFromEntry(CB.getParent())) + return std::make_unique(this, CB, getCallerORE(CB), false); + return nullptr; +} + std::unique_ptr MLInlineAdvisor::getMandatoryAdvice(CallBase &CB, bool Advice) { // Make sure we track inlinings in all cases - mandatory or not. + if (auto Skip = getSkipAdviceIfUnreachableCallsite(CB)) + return Skip; if (Advice && !ForceStop) return getMandatoryAdviceImpl(CB); @@ -366,16 +410,47 @@ MLInlineAdvisor::getMandatoryAdviceImpl(CallBase &CB) { return std::make_unique(this, CB, getCallerORE(CB), true); } +void MLInlineAdvisor::print(raw_ostream &OS) const { + OS << "[MLInlineAdvisor] Nodes: " << NodeCount << " Edges: " << EdgeCount + << " EdgesOfLastSeenNodes: " << EdgesOfLastSeenNodes << "\n"; + OS << "[MLInlineAdvisor] FPI:\n"; + for (auto I : FPICache) { + OS << I.getFirst()->getName() << ":\n"; + I.getSecond().print(OS); + OS << "\n"; + } + OS << "\n"; +} + +MLInlineAdvice::MLInlineAdvice(MLInlineAdvisor *Advisor, CallBase &CB, + OptimizationRemarkEmitter &ORE, + bool Recommendation) + : InlineAdvice(Advisor, CB, ORE, Recommendation), + CallerIRSize(Advisor->isForcedToStop() ? 0 : Advisor->getIRSize(*Caller)), + CalleeIRSize(Advisor->isForcedToStop() ? 0 : Advisor->getIRSize(*Callee)), + CallerAndCalleeEdges(Advisor->isForcedToStop() + ? 
0 + : (Advisor->getLocalCalls(*Caller) + + Advisor->getLocalCalls(*Callee))), + PreInlineCallerFPI(Advisor->getCachedFPI(*Caller)) { + if (Recommendation) + FPU.emplace(Advisor->getCachedFPI(*getCaller()), CB); +} + void MLInlineAdvice::reportContextForRemark( DiagnosticInfoOptimizationBase &OR) { using namespace ore; OR << NV("Callee", Callee->getName()); for (size_t I = 0; I < NumberOfFeatures; ++I) - OR << NV(FeatureNameMap[I], + OR << NV(FeatureMap[I].name(), *getAdvisor()->getModelRunner().getTensor(I)); OR << NV("ShouldInline", isInliningRecommended()); } +void MLInlineAdvice::updateCachedCallerFPI(FunctionAnalysisManager &FAM) const { + FPU->finish(FAM); +} + void MLInlineAdvice::recordInliningImpl() { ORE.emit([&]() { OptimizationRemark R(DEBUG_TYPE, "InliningSuccess", DLoc, Block); @@ -397,6 +472,7 @@ void MLInlineAdvice::recordInliningWithCalleeDeletedImpl() { void MLInlineAdvice::recordUnsuccessfulInliningImpl( const InlineResult &Result) { + getAdvisor()->getCachedFPI(*Caller) = PreInlineCallerFPI; ORE.emit([&]() { OptimizationRemarkMissed R(DEBUG_TYPE, "InliningAttemptedAndUnsuccessful", DLoc, Block); @@ -405,6 +481,7 @@ void MLInlineAdvice::recordUnsuccessfulInliningImpl( }); } void MLInlineAdvice::recordUnattemptedInliningImpl() { + assert(!FPU); ORE.emit([&]() { OptimizationRemarkMissed R(DEBUG_TYPE, "IniningNotAttempted", DLoc, Block); reportContextForRemark(R); diff --git a/llvm/lib/Analysis/MemDepPrinter.cpp b/llvm/lib/Analysis/MemDepPrinter.cpp index 00642347102a..305ae3e2a992 100644 --- a/llvm/lib/Analysis/MemDepPrinter.cpp +++ b/llvm/lib/Analysis/MemDepPrinter.cpp @@ -15,7 +15,6 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/InitializePasses.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Analysis/MemDerefPrinter.cpp b/llvm/lib/Analysis/MemDerefPrinter.cpp index 82617c7256a5..4dd5c76cc604 100644 --- a/llvm/lib/Analysis/MemDerefPrinter.cpp +++ b/llvm/lib/Analysis/MemDerefPrinter.cpp @@ -9,14 +9,11 @@ #include "llvm/Analysis/MemDerefPrinter.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/Passes.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 208f93aa1ac6..91501b04448e 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/Utils/Local.h" @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include using namespace llvm; @@ -62,6 +64,42 @@ enum AllocType : uint8_t { AnyAlloc = AllocLike | ReallocLike }; +enum class MallocFamily { + Malloc, + CPPNew, // new(unsigned int) + CPPNewAligned, // new(unsigned int, align_val_t) + CPPNewArray, // new[](unsigned int) + CPPNewArrayAligned, // new[](unsigned long, align_val_t) + MSVCNew, // new(unsigned int) + MSVCArrayNew, // new[](unsigned int) + 
VecMalloc, + KmpcAllocShared, +}; + +StringRef mangledNameForMallocFamily(const MallocFamily &Family) { + switch (Family) { + case MallocFamily::Malloc: + return "malloc"; + case MallocFamily::CPPNew: + return "_Znwm"; + case MallocFamily::CPPNewAligned: + return "_ZnwmSt11align_val_t"; + case MallocFamily::CPPNewArray: + return "_Znam"; + case MallocFamily::CPPNewArrayAligned: + return "_ZnamSt11align_val_t"; + case MallocFamily::MSVCNew: + return "??2@YAPAXI@Z"; + case MallocFamily::MSVCArrayNew: + return "??_U@YAPAXI@Z"; + case MallocFamily::VecMalloc: + return "vec_malloc"; + case MallocFamily::KmpcAllocShared: + return "__kmpc_alloc_shared"; + } + llvm_unreachable("missing an alloc family"); +} + struct AllocFnsTy { AllocType AllocTy; unsigned NumParams; @@ -69,50 +107,55 @@ struct AllocFnsTy { int FstParam, SndParam; // Alignment parameter for aligned_alloc and aligned new int AlignParam; + // Name of default allocator function to group malloc/free calls by family + MallocFamily Family; }; +// clang-format off // FIXME: certain users need more information. E.g., SimplifyLibCalls needs to // know which functions are nounwind, noalias, nocapture parameters, etc. static const std::pair AllocationFnData[] = { - {LibFunc_malloc, {MallocLike, 1, 0, -1, -1}}, - {LibFunc_vec_malloc, {MallocLike, 1, 0, -1, -1}}, - {LibFunc_valloc, {MallocLike, 1, 0, -1, -1}}, - {LibFunc_Znwj, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int) - {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow) - {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned int, align_val_t) - {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned int, align_val_t, nothrow) - {LibFunc_Znwm, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long) - {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new(unsigned long, nothrow) - {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned long, align_val_t) - {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned long, align_val_t, nothrow) - {LibFunc_Znaj, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int) - {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow) - {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned int, align_val_t) - {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned int, align_val_t, nothrow) - {LibFunc_Znam, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long) - {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned long, nothrow) - {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned long, align_val_t) - {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned long, align_val_t, nothrow) - {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int) - {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow) - {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long long) - {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned long long, nothrow) - {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int) - {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow) - {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long long) - {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 
2, 0, -1, -1}}, // new[](unsigned long long, nothrow) - {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1, 0}}, - {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1, 0}}, - {LibFunc_calloc, {CallocLike, 2, 0, 1, -1}}, - {LibFunc_vec_calloc, {CallocLike, 2, 0, 1, -1}}, - {LibFunc_realloc, {ReallocLike, 2, 1, -1, -1}}, - {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1, -1}}, - {LibFunc_reallocf, {ReallocLike, 2, 1, -1, -1}}, - {LibFunc_strdup, {StrDupLike, 1, -1, -1, -1}}, - {LibFunc_strndup, {StrDupLike, 2, 1, -1, -1}}, - {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1, -1}}, - // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc_malloc, {MallocLike, 1, 0, -1, -1, MallocFamily::Malloc}}, + {LibFunc_vec_malloc, {MallocLike, 1, 0, -1, -1, MallocFamily::VecMalloc}}, + {LibFunc_valloc, {MallocLike, 1, 0, -1, -1, MallocFamily::Malloc}}, + {LibFunc_Znwj, {OpNewLike, 1, 0, -1, -1, MallocFamily::CPPNew}}, // new(unsigned int) + {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1, MallocFamily::CPPNew}}, // new(unsigned int, nothrow) + {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1, 1, MallocFamily::CPPNewAligned}}, // new(unsigned int, align_val_t) + {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1, MallocFamily::CPPNewAligned}}, // new(unsigned int, align_val_t, nothrow) + {LibFunc_Znwm, {OpNewLike, 1, 0, -1, -1, MallocFamily::CPPNew}}, // new(unsigned long) + {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1, MallocFamily::CPPNew}}, // new(unsigned long, nothrow) + {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1, 1, MallocFamily::CPPNewAligned}}, // new(unsigned long, align_val_t) + {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1, MallocFamily::CPPNewAligned}}, // new(unsigned long, align_val_t, nothrow) + {LibFunc_Znaj, {OpNewLike, 1, 0, -1, -1, MallocFamily::CPPNewArray}}, // new[](unsigned int) + {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1, MallocFamily::CPPNewArray}}, // new[](unsigned int, nothrow) + {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1, 1, MallocFamily::CPPNewArrayAligned}}, // new[](unsigned int, align_val_t) + {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1, MallocFamily::CPPNewArrayAligned}}, // new[](unsigned int, align_val_t, nothrow) + {LibFunc_Znam, {OpNewLike, 1, 0, -1, -1, MallocFamily::CPPNewArray}}, // new[](unsigned long) + {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1, MallocFamily::CPPNewArray}}, // new[](unsigned long, nothrow) + {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1, 1, MallocFamily::CPPNewArrayAligned}}, // new[](unsigned long, align_val_t) + {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1, MallocFamily::CPPNewArrayAligned}}, // new[](unsigned long, align_val_t, nothrow) + {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1, -1, MallocFamily::MSVCNew}}, // new(unsigned int) + {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCNew}}, // new(unsigned int, nothrow) + {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1, -1, MallocFamily::MSVCNew}}, // new(unsigned long long) + {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCNew}}, // new(unsigned long long, nothrow) + {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1, -1, MallocFamily::MSVCArrayNew}}, // new[](unsigned int) + {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCArrayNew}}, // new[](unsigned int, nothrow) + {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1, -1, 
MallocFamily::MSVCArrayNew}}, // new[](unsigned long long) + {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCArrayNew}}, // new[](unsigned long long, nothrow) + {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1, 0, MallocFamily::Malloc}}, + {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1, 0, MallocFamily::Malloc}}, + {LibFunc_calloc, {CallocLike, 2, 0, 1, -1, MallocFamily::Malloc}}, + {LibFunc_vec_calloc, {CallocLike, 2, 0, 1, -1, MallocFamily::VecMalloc}}, + {LibFunc_realloc, {ReallocLike, 2, 1, -1, -1, MallocFamily::Malloc}}, + {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1, -1, MallocFamily::VecMalloc}}, + {LibFunc_reallocf, {ReallocLike, 2, 1, -1, -1, MallocFamily::Malloc}}, + {LibFunc_strdup, {StrDupLike, 1, -1, -1, -1, MallocFamily::Malloc}}, + {LibFunc_dunder_strdup, {StrDupLike, 1, -1, -1, -1, MallocFamily::Malloc}}, + {LibFunc_strndup, {StrDupLike, 2, 1, -1, -1, MallocFamily::Malloc}}, + {LibFunc_dunder_strndup, {StrDupLike, 2, 1, -1, -1, MallocFamily::Malloc}}, + {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1, -1, MallocFamily::KmpcAllocShared}}, }; +// clang-format on static const Function *getCalledFunction(const Value *V, bool &IsNoBuiltin) { @@ -217,7 +260,7 @@ static Optional getAllocationSize(const Value *V, Result.AllocTy = MallocLike; Result.NumParams = Callee->getNumOperands(); Result.FstParam = Args.first; - Result.SndParam = Args.second.getValueOr(-1); + Result.SndParam = Args.second.value_or(-1); // Allocsize has no way to specify an alignment argument Result.AlignParam = -1; return Result; @@ -227,54 +270,53 @@ static Optional getAllocationSize(const Value *V, /// allocates or reallocates memory (either malloc, calloc, realloc, or strdup /// like). bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, AnyAlloc, TLI).hasValue(); + return getAllocationData(V, AnyAlloc, TLI).has_value(); } bool llvm::isAllocationFn( const Value *V, function_ref GetTLI) { - return getAllocationData(V, AnyAlloc, GetTLI).hasValue(); + return getAllocationData(V, AnyAlloc, GetTLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// allocates uninitialized memory (such as malloc). static bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, MallocOrOpNewLike, TLI).hasValue(); + return getAllocationData(V, MallocOrOpNewLike, TLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// allocates uninitialized memory with alignment (such as aligned_alloc). static bool isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, AlignedAllocLike, TLI) - .hasValue(); + return getAllocationData(V, AlignedAllocLike, TLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// allocates zero-filled memory (such as calloc). static bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, CallocLike, TLI).hasValue(); + return getAllocationData(V, CallocLike, TLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// allocates memory similar to malloc or calloc. 
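The rewritten AllocationFnData table above pairs each LibFunc with a record that now also carries its MallocFamily, so allocation and deallocation calls can be grouped by family; lookups stay a linear find_if over the array. A self-contained sketch of that table-driven pattern, with simplified, invented types (not the actual AllocFnsTy layout), shown here before the predicate the next doc comment introduces:

    #include <algorithm>
    #include <iterator>
    #include <optional>
    #include <utility>

    enum class LibFn { Malloc, Calloc, OpNew };  // stand-in for LibFunc
    enum class Family { Malloc, CPPNew };        // stand-in for MallocFamily

    struct AllocRecord {
      unsigned NumParams;
      Family Fam;
    };

    // Keyed array mirrors the shape of AllocationFnData: a linear scan is
    // fine for a table this small and keeps all metadata in one place.
    static const std::pair<LibFn, AllocRecord> AllocTable[] = {
        {LibFn::Malloc, {1, Family::Malloc}},
        {LibFn::Calloc, {2, Family::Malloc}},
        {LibFn::OpNew, {1, Family::CPPNew}},
    };

    static std::optional<AllocRecord> lookupAlloc(LibFn F) {
      const auto *It =
          std::find_if(std::begin(AllocTable), std::end(AllocTable),
                       [F](const auto &P) { return P.first == F; });
      if (It == std::end(AllocTable))
        return std::nullopt;
      return It->second;
    }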
bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, MallocOrCallocLike, TLI).hasValue(); + return getAllocationData(V, MallocOrCallocLike, TLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// allocates memory (either malloc, calloc, or strdup like). bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, AllocLike, TLI).hasValue(); + return getAllocationData(V, AllocLike, TLI).has_value(); } /// Tests if a value is a call or invoke to a library function that /// reallocates memory (e.g., realloc). bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, ReallocLike, TLI).hasValue(); + return getAllocationData(V, ReallocLike, TLI).has_value(); } /// Tests if a functions is a call or invoke to a library function that /// reallocates memory (e.g., realloc). bool llvm::isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI) { - return getAllocationDataForFunction(F, ReallocLike, TLI).hasValue(); + return getAllocationDataForFunction(F, ReallocLike, TLI).has_value(); } bool llvm::isAllocRemovable(const CallBase *CB, const TargetLibraryInfo *TLI) { @@ -291,13 +333,11 @@ bool llvm::isAllocRemovable(const CallBase *CB, const TargetLibraryInfo *TLI) { Value *llvm::getAllocAlignment(const CallBase *V, const TargetLibraryInfo *TLI) { - assert(isAllocationFn(V, TLI)); - const Optional FnData = getAllocationData(V, AnyAlloc, TLI); - if (!FnData.hasValue() || FnData->AlignParam < 0) { - return nullptr; + if (FnData && FnData->AlignParam >= 0) { + return V->getOperand(FnData->AlignParam); } - return V->getOperand(FnData->AlignParam); + return V->getArgOperandWithAttribute(Attribute::AllocAlign); } /// When we're compiling N-bit code, and the user uses parameters that are @@ -344,7 +384,7 @@ llvm::getAllocSize(const CallBase *CB, if (!Arg) return None; - APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits); + APInt MaxSize = Arg->getValue().zext(IntTyBits); if (Size.ugt(MaxSize)) Size = MaxSize + 1; } @@ -379,10 +419,12 @@ llvm::getAllocSize(const CallBase *CB, return Size; } -Constant *llvm::getInitialValueOfAllocation(const CallBase *Alloc, +Constant *llvm::getInitialValueOfAllocation(const Value *V, const TargetLibraryInfo *TLI, Type *Ty) { - assert(isAllocationFn(Alloc, TLI)); + auto *Alloc = dyn_cast(V); + if (!Alloc) + return nullptr; // malloc and aligned_alloc are uninitialized (undef) if (isMallocLikeFn(Alloc, TLI) || isAlignedAllocLikeFn(Alloc, TLI)) @@ -395,43 +437,81 @@ Constant *llvm::getInitialValueOfAllocation(const CallBase *Alloc, return nullptr; } +struct FreeFnsTy { + unsigned NumParams; + // Name of default allocator function to group malloc/free calls by family + MallocFamily Family; +}; + +// clang-format off +static const std::pair FreeFnData[] = { + {LibFunc_free, {1, MallocFamily::Malloc}}, + {LibFunc_vec_free, {1, MallocFamily::VecMalloc}}, + {LibFunc_ZdlPv, {1, MallocFamily::CPPNew}}, // operator delete(void*) + {LibFunc_ZdaPv, {1, MallocFamily::CPPNewArray}}, // operator delete[](void*) + {LibFunc_msvc_delete_ptr32, {1, MallocFamily::MSVCNew}}, // operator delete(void*) + {LibFunc_msvc_delete_ptr64, {1, MallocFamily::MSVCNew}}, // operator delete(void*) + {LibFunc_msvc_delete_array_ptr32, {1, MallocFamily::MSVCArrayNew}}, // operator delete[](void*) + {LibFunc_msvc_delete_array_ptr64, {1, MallocFamily::MSVCArrayNew}}, // operator delete[](void*) + {LibFunc_ZdlPvj, {2, 
MallocFamily::CPPNew}}, // delete(void*, uint) + {LibFunc_ZdlPvm, {2, MallocFamily::CPPNew}}, // delete(void*, ulong) + {LibFunc_ZdlPvRKSt9nothrow_t, {2, MallocFamily::CPPNew}}, // delete(void*, nothrow) + {LibFunc_ZdlPvSt11align_val_t, {2, MallocFamily::CPPNewAligned}}, // delete(void*, align_val_t) + {LibFunc_ZdaPvj, {2, MallocFamily::CPPNewArray}}, // delete[](void*, uint) + {LibFunc_ZdaPvm, {2, MallocFamily::CPPNewArray}}, // delete[](void*, ulong) + {LibFunc_ZdaPvRKSt9nothrow_t, {2, MallocFamily::CPPNewArray}}, // delete[](void*, nothrow) + {LibFunc_ZdaPvSt11align_val_t, {2, MallocFamily::CPPNewArrayAligned}}, // delete[](void*, align_val_t) + {LibFunc_msvc_delete_ptr32_int, {2, MallocFamily::MSVCNew}}, // delete(void*, uint) + {LibFunc_msvc_delete_ptr64_longlong, {2, MallocFamily::MSVCNew}}, // delete(void*, ulonglong) + {LibFunc_msvc_delete_ptr32_nothrow, {2, MallocFamily::MSVCNew}}, // delete(void*, nothrow) + {LibFunc_msvc_delete_ptr64_nothrow, {2, MallocFamily::MSVCNew}}, // delete(void*, nothrow) + {LibFunc_msvc_delete_array_ptr32_int, {2, MallocFamily::MSVCArrayNew}}, // delete[](void*, uint) + {LibFunc_msvc_delete_array_ptr64_longlong, {2, MallocFamily::MSVCArrayNew}}, // delete[](void*, ulonglong) + {LibFunc_msvc_delete_array_ptr32_nothrow, {2, MallocFamily::MSVCArrayNew}}, // delete[](void*, nothrow) + {LibFunc_msvc_delete_array_ptr64_nothrow, {2, MallocFamily::MSVCArrayNew}}, // delete[](void*, nothrow) + {LibFunc___kmpc_free_shared, {2, MallocFamily::KmpcAllocShared}}, // OpenMP Offloading RTL free + {LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t, {3, MallocFamily::CPPNewAligned}}, // delete(void*, align_val_t, nothrow) + {LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t, {3, MallocFamily::CPPNewArrayAligned}}, // delete[](void*, align_val_t, nothrow) + {LibFunc_ZdlPvjSt11align_val_t, {3, MallocFamily::CPPNewAligned}}, // delete(void*, unsigned int, align_val_t) + {LibFunc_ZdlPvmSt11align_val_t, {3, MallocFamily::CPPNewAligned}}, // delete(void*, unsigned long, align_val_t) + {LibFunc_ZdaPvjSt11align_val_t, {3, MallocFamily::CPPNewArrayAligned}}, // delete[](void*, unsigned int, align_val_t) + {LibFunc_ZdaPvmSt11align_val_t, {3, MallocFamily::CPPNewArrayAligned}}, // delete[](void*, unsigned long, align_val_t) +}; +// clang-format on + +Optional getFreeFunctionDataForFunction(const Function *Callee, + const LibFunc TLIFn) { + const auto *Iter = + find_if(FreeFnData, [TLIFn](const std::pair &P) { + return P.first == TLIFn; + }); + if (Iter == std::end(FreeFnData)) + return None; + return Iter->second; +} + +Optional llvm::getAllocationFamily(const Value *I, + const TargetLibraryInfo *TLI) { + bool IsNoBuiltin; + const Function *Callee = getCalledFunction(I, IsNoBuiltin); + if (Callee == nullptr || IsNoBuiltin) + return None; + LibFunc TLIFn; + if (!TLI || !TLI->getLibFunc(*Callee, TLIFn) || !TLI->has(TLIFn)) + return None; + const auto AllocData = getAllocationDataForFunction(Callee, AnyAlloc, TLI); + if (AllocData) + return mangledNameForMallocFamily(AllocData.getValue().Family); + const auto FreeData = getFreeFunctionDataForFunction(Callee, TLIFn); + if (FreeData) + return mangledNameForMallocFamily(FreeData.getValue().Family); + return None; +} + /// isLibFreeFunction - Returns true if the function is a builtin free() bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) { - unsigned ExpectedNumParams; - if (TLIFn == LibFunc_free || - TLIFn == LibFunc_ZdlPv || // operator delete(void*) - TLIFn == LibFunc_ZdaPv || // operator delete[](void*) - TLIFn == 
LibFunc_msvc_delete_ptr32 || // operator delete(void*) - TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*) - TLIFn == LibFunc_msvc_delete_array_ptr32 || // operator delete[](void*) - TLIFn == LibFunc_msvc_delete_array_ptr64) // operator delete[](void*) - ExpectedNumParams = 1; - else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint) - TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong) - TLIFn == LibFunc_ZdlPvRKSt9nothrow_t || // delete(void*, nothrow) - TLIFn == LibFunc_ZdlPvSt11align_val_t || // delete(void*, align_val_t) - TLIFn == LibFunc_ZdaPvj || // delete[](void*, uint) - TLIFn == LibFunc_ZdaPvm || // delete[](void*, ulong) - TLIFn == LibFunc_ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow) - TLIFn == LibFunc_ZdaPvSt11align_val_t || // delete[](void*, align_val_t) - TLIFn == LibFunc_msvc_delete_ptr32_int || // delete(void*, uint) - TLIFn == LibFunc_msvc_delete_ptr64_longlong || // delete(void*, ulonglong) - TLIFn == LibFunc_msvc_delete_ptr32_nothrow || // delete(void*, nothrow) - TLIFn == LibFunc_msvc_delete_ptr64_nothrow || // delete(void*, nothrow) - TLIFn == LibFunc_msvc_delete_array_ptr32_int || // delete[](void*, uint) - TLIFn == LibFunc_msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong) - TLIFn == LibFunc_msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow) - TLIFn == LibFunc_msvc_delete_array_ptr64_nothrow || // delete[](void*, nothrow) - TLIFn == LibFunc___kmpc_free_shared) // OpenMP Offloading RTL free - ExpectedNumParams = 2; - else if (TLIFn == LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t || // delete(void*, align_val_t, nothrow) - TLIFn == LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t || // delete[](void*, align_val_t, nothrow) - TLIFn == LibFunc_ZdlPvjSt11align_val_t || // delete(void*, unsigned long, align_val_t) - TLIFn == LibFunc_ZdlPvmSt11align_val_t || // delete(void*, unsigned long, align_val_t) - TLIFn == LibFunc_ZdaPvjSt11align_val_t || // delete[](void*, unsigned int, align_val_t) - TLIFn == LibFunc_ZdaPvmSt11align_val_t) // delete[](void*, unsigned long, align_val_t) - ExpectedNumParams = 3; - else + Optional FnData = getFreeFunctionDataForFunction(F, TLIFn); + if (!FnData) return false; // Check free prototype. @@ -440,7 +520,7 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) { FunctionType *FTy = F->getFunctionType(); if (!FTy->getReturnType()->isVoidTy()) return false; - if (FTy->getNumParams() != ExpectedNumParams) + if (FTy->getNumParams() != FnData->NumParams) return false; if (FTy->getParamType(0) != Type::getInt8PtrTy(F->getContext())) return false; @@ -491,11 +571,21 @@ Value *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, const TargetLibraryInfo *TLI, bool MustSucceed) { + return lowerObjectSizeCall(ObjectSize, DL, TLI, /*AAResults=*/nullptr, + MustSucceed); +} + +Value *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize, + const DataLayout &DL, + const TargetLibraryInfo *TLI, AAResults *AA, + bool MustSucceed) { assert(ObjectSize->getIntrinsicID() == Intrinsic::objectsize && "ObjectSize must be a call to llvm.objectsize!"); bool MaxVal = cast(ObjectSize->getArgOperand(1))->isZero(); ObjectSizeOpts EvalOptions; + EvalOptions.AA = AA; + // Unless we have to fold this to something, try to be as accurate as // possible. 
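As the hunk above shows, lowerObjectSizeCall now accepts an AAResults and threads it into ObjectSizeOpts, while the original signature survives as a one-line forwarder. A generic sketch of that API-evolution pattern, with hypothetical names rather than the LLVM signatures:

    // Stand-in for AAResults; in the patch an AAResults* rides along inside
    // ObjectSizeOpts so visitLoadInst can scan stores when it is present.
    struct AliasInfo {};

    struct Options {
      const AliasInfo *AA = nullptr; // optional capability, off by default
    };

    // New, more capable entry point: behavior degrades gracefully when the
    // extra analysis is absent.
    inline int computeSize(const void *Obj, const AliasInfo *AA) {
      Options Opts;
      Opts.AA = AA;
      return (Obj && Opts.AA) ? 2 : 1; // placeholder result
    }

    // Old signature kept as a forwarder so existing callers compile
    // unchanged while new callers opt in to alias analysis.
    inline int computeSize(const void *Obj) {
      return computeSize(Obj, /*AA=*/nullptr);
    }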
if (MustSucceed) @@ -559,7 +649,7 @@ STATISTIC(ObjectVisitorLoad, APInt ObjectSizeOffsetVisitor::align(APInt Size, MaybeAlign Alignment) { if (Options.RoundToAlign && Alignment) - return APInt(IntTyBits, alignTo(Size.getZExtValue(), Alignment)); + return APInt(IntTyBits, alignTo(Size.getZExtValue(), *Alignment)); return Size; } @@ -573,18 +663,48 @@ ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL, } SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { + unsigned InitialIntTyBits = DL.getIndexTypeSizeInBits(V->getType()); + + // Stripping pointer casts can strip address space casts which can change the + // index type size. The invariant is that we use the value type to determine + // the index type size and if we stripped address space casts we have to + // readjust the APInt as we pass it upwards in order for the APInt to match + // the type the caller passed in. + APInt Offset(InitialIntTyBits, 0); + V = V->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true, /* AllowInvariantGroup */ true); + + // Later we use the index type size and zero but it will match the type of the + // value that is passed to computeImpl. IntTyBits = DL.getIndexTypeSizeInBits(V->getType()); Zero = APInt::getZero(IntTyBits); - V = V->stripPointerCasts(); + bool IndexTypeSizeChanged = InitialIntTyBits != IntTyBits; + if (!IndexTypeSizeChanged && Offset.isZero()) + return computeImpl(V); + + // We stripped an address space cast that changed the index type size or we + // accumulated some constant offset (or both). Readjust the bit width to match + // the argument index type size and apply the offset, as required. + SizeOffsetType SOT = computeImpl(V); + if (IndexTypeSizeChanged) { + if (knownSize(SOT) && !::CheckedZextOrTrunc(SOT.first, InitialIntTyBits)) + SOT.first = APInt(); + if (knownOffset(SOT) && !::CheckedZextOrTrunc(SOT.second, InitialIntTyBits)) + SOT.second = APInt(); + } + // If the computed offset is "unknown" we cannot add the stripped offset. + return {SOT.first, + SOT.second.getBitWidth() > 1 ? SOT.second + Offset : SOT.second}; +} + +SizeOffsetType ObjectSizeOffsetVisitor::computeImpl(Value *V) { if (Instruction *I = dyn_cast(V)) { // If we have already seen this instruction, bail out. Cycles can happen in // unreachable code after constant propagation. 
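The width readjustment in the new compute() above leans on a checked zext-or-trunc over APInt: widening is always safe, narrowing only when no set bits are dropped. A minimal sketch of such a helper (the file's actual ::CheckedZextOrTrunc may differ in detail):

    #include "llvm/ADT/APInt.h"

    using llvm::APInt;

    // Adjust I to NewWidth in place. Returns false when truncation would
    // lose set bits, which callers treat as "size/offset unknown".
    static bool checkedZextOrTrunc(APInt &I, unsigned NewWidth) {
      if (NewWidth > I.getBitWidth()) {
        I = I.zext(NewWidth);
      } else if (NewWidth < I.getBitWidth()) {
        if (!I.isIntN(NewWidth)) // value would not survive the trunc
          return false;
        I = I.trunc(NewWidth);
      }
      return true;
    }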
if (!SeenInsts.insert(I).second) return unknown(); - if (GEPOperator *GEP = dyn_cast(V)) - return visitGEPOperator(*GEP); return visit(*I); } if (Argument *A = dyn_cast(V)) @@ -597,12 +717,6 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { return visitGlobalVariable(*GV); if (UndefValue *UV = dyn_cast(V)) return visitUndefValue(*UV); - if (ConstantExpr *CE = dyn_cast(V)) { - if (CE->getOpcode() == Instruction::IntToPtr) - return unknown(); // clueless - if (CE->getOpcode() == Instruction::GetElementPtr) - return visitGEPOperator(cast(*CE)); - } LLVM_DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V << '\n'); @@ -617,10 +731,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { if (!I.getAllocatedType()->isSized()) return unknown(); - if (isa(I.getAllocatedType())) + TypeSize ElemSize = DL.getTypeAllocSize(I.getAllocatedType()); + if (ElemSize.isScalable() && Options.EvalMode != ObjectSizeOpts::Mode::Min) return unknown(); - - APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType())); + APInt Size(IntTyBits, ElemSize.getKnownMinSize()); if (!I.isArrayAllocation()) return std::make_pair(align(Size, I.getAlign()), Zero); @@ -682,15 +796,6 @@ ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) { return unknown(); } -SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) { - SizeOffsetType PtrData = compute(GEP.getPointerOperand()); - APInt Offset(DL.getIndexTypeSizeInBits(GEP.getPointerOperand()->getType()), 0); - if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(DL, Offset)) - return unknown(); - - return std::make_pair(PtrData.first, PtrData.second + Offset); -} - SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalAlias(GlobalAlias &GA) { if (GA.isInterposable()) return unknown(); @@ -710,42 +815,161 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) { return unknown(); } -SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) { - ++ObjectVisitorLoad; - return unknown(); -} +SizeOffsetType ObjectSizeOffsetVisitor::findLoadSizeOffset( + LoadInst &Load, BasicBlock &BB, BasicBlock::iterator From, + SmallDenseMap &VisitedBlocks, + unsigned &ScannedInstCount) { + constexpr unsigned MaxInstsToScan = 128; + + auto Where = VisitedBlocks.find(&BB); + if (Where != VisitedBlocks.end()) + return Where->second; + + auto Unknown = [this, &BB, &VisitedBlocks]() { + return VisitedBlocks[&BB] = unknown(); + }; + auto Known = [&BB, &VisitedBlocks](SizeOffsetType SO) { + return VisitedBlocks[&BB] = SO; + }; + + do { + Instruction &I = *From; + + if (I.isDebugOrPseudoInst()) + continue; + + if (++ScannedInstCount > MaxInstsToScan) + return Unknown(); + + if (!I.mayWriteToMemory()) + continue; + + if (auto *SI = dyn_cast(&I)) { + AliasResult AR = + Options.AA->alias(SI->getPointerOperand(), Load.getPointerOperand()); + switch ((AliasResult::Kind)AR) { + case AliasResult::NoAlias: + continue; + case AliasResult::MustAlias: + if (SI->getValueOperand()->getType()->isPointerTy()) + return Known(compute(SI->getValueOperand())); + else + return Unknown(); // No handling of non-pointer values by `compute`. + default: + return Unknown(); + } + } -SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) { - // too complex to analyze statically. - return unknown(); + if (auto *CB = dyn_cast(&I)) { + Function *Callee = CB->getCalledFunction(); + // Bail out on indirect call. 
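+      // Without a resolvable callee there is no LibFunc to query, so the
+      // backward scan must conservatively treat this write as unknown.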
+ if (!Callee) + return Unknown(); + + LibFunc TLIFn; + if (!TLI || !TLI->getLibFunc(*CB->getCalledFunction(), TLIFn) || + !TLI->has(TLIFn)) + return Unknown(); + + // TODO: There's probably more interesting case to support here. + if (TLIFn != LibFunc_posix_memalign) + return Unknown(); + + AliasResult AR = + Options.AA->alias(CB->getOperand(0), Load.getPointerOperand()); + switch ((AliasResult::Kind)AR) { + case AliasResult::NoAlias: + continue; + case AliasResult::MustAlias: + break; + default: + return Unknown(); + } + + // Is the error status of posix_memalign correctly checked? If not it + // would be incorrect to assume it succeeds and load doesn't see the + // previous value. + Optional Checked = isImpliedByDomCondition( + ICmpInst::ICMP_EQ, CB, ConstantInt::get(CB->getType(), 0), &Load, DL); + if (!Checked || !*Checked) + return Unknown(); + + Value *Size = CB->getOperand(2); + auto *C = dyn_cast(Size); + if (!C) + return Unknown(); + + return Known({C->getValue(), APInt(C->getValue().getBitWidth(), 0)}); + } + + return Unknown(); + } while (From-- != BB.begin()); + + SmallVector PredecessorSizeOffsets; + for (auto *PredBB : predecessors(&BB)) { + PredecessorSizeOffsets.push_back(findLoadSizeOffset( + Load, *PredBB, BasicBlock::iterator(PredBB->getTerminator()), + VisitedBlocks, ScannedInstCount)); + if (!bothKnown(PredecessorSizeOffsets.back())) + return Unknown(); + } + + if (PredecessorSizeOffsets.empty()) + return Unknown(); + + return Known(std::accumulate(PredecessorSizeOffsets.begin() + 1, + PredecessorSizeOffsets.end(), + PredecessorSizeOffsets.front(), + [this](SizeOffsetType LHS, SizeOffsetType RHS) { + return combineSizeOffset(LHS, RHS); + })); } -SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) { - SizeOffsetType TrueSide = compute(I.getTrueValue()); - SizeOffsetType FalseSide = compute(I.getFalseValue()); - if (bothKnown(TrueSide) && bothKnown(FalseSide)) { - if (TrueSide == FalseSide) { - return TrueSide; - } +SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst &LI) { + if (!Options.AA) { + ++ObjectVisitorLoad; + return unknown(); + } - APInt TrueResult = getSizeWithOverflow(TrueSide); - APInt FalseResult = getSizeWithOverflow(FalseSide); + SmallDenseMap VisitedBlocks; + unsigned ScannedInstCount = 0; + SizeOffsetType SO = + findLoadSizeOffset(LI, *LI.getParent(), BasicBlock::iterator(LI), + VisitedBlocks, ScannedInstCount); + if (!bothKnown(SO)) + ++ObjectVisitorLoad; + return SO; +} - if (TrueResult == FalseResult) { - return TrueSide; - } - if (Options.EvalMode == ObjectSizeOpts::Mode::Min) { - if (TrueResult.slt(FalseResult)) - return TrueSide; - return FalseSide; - } - if (Options.EvalMode == ObjectSizeOpts::Mode::Max) { - if (TrueResult.sgt(FalseResult)) - return TrueSide; - return FalseSide; - } +SizeOffsetType ObjectSizeOffsetVisitor::combineSizeOffset(SizeOffsetType LHS, + SizeOffsetType RHS) { + if (!bothKnown(LHS) || !bothKnown(RHS)) + return unknown(); + + switch (Options.EvalMode) { + case ObjectSizeOpts::Mode::Min: + return (getSizeWithOverflow(LHS).slt(getSizeWithOverflow(RHS))) ? LHS : RHS; + case ObjectSizeOpts::Mode::Max: + return (getSizeWithOverflow(LHS).sgt(getSizeWithOverflow(RHS))) ? LHS : RHS; + case ObjectSizeOpts::Mode::Exact: + return (getSizeWithOverflow(LHS).eq(getSizeWithOverflow(RHS))) ? 
LHS + : unknown(); } - return unknown(); + llvm_unreachable("missing an eval mode"); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode &PN) { + auto IncomingValues = PN.incoming_values(); + return std::accumulate(IncomingValues.begin() + 1, IncomingValues.end(), + compute(*IncomingValues.begin()), + [this](SizeOffsetType LHS, Value *VRHS) { + return combineSizeOffset(LHS, compute(VRHS)); + }); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) { + return combineSizeOffset(compute(I.getTrueValue()), + compute(I.getFalseValue())); } SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) { @@ -790,7 +1014,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) { // Erase any instructions we inserted as part of the traversal. for (Instruction *I : InsertedInstructions) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } } @@ -919,7 +1143,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitIntToPtrInst(IntToPtrInst&) { return unknown(); } -SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) { +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst &LI) { return unknown(); } @@ -937,10 +1161,10 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) { SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i)); if (!bothKnown(EdgeData)) { - OffsetPHI->replaceAllUsesWith(UndefValue::get(IntTy)); + OffsetPHI->replaceAllUsesWith(PoisonValue::get(IntTy)); OffsetPHI->eraseFromParent(); InsertedInstructions.erase(OffsetPHI); - SizePHI->replaceAllUsesWith(UndefValue::get(IntTy)); + SizePHI->replaceAllUsesWith(PoisonValue::get(IntTy)); SizePHI->eraseFromParent(); InsertedInstructions.erase(SizePHI); return unknown(); diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 36df462c7a66..690d575ef979 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -27,11 +27,7 @@ #include "llvm/Analysis/PhiValues.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" @@ -44,7 +40,6 @@ #include "llvm/IR/PredIteratorCache.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -53,10 +48,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" #include #include -#include #include #include @@ -414,20 +407,17 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( isInvariantLoad = true; } - // Return "true" if and only if the instruction I is either a non-simple - // load or a non-simple store. - auto isNonSimpleLoadOrStore = [](Instruction *I) -> bool { + // True for volatile instruction. + // For Load/Store return true if atomic ordering is stronger than AO, + // for other instruction just true if it can read or write to memory. 
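The comment above introduces a single ordering-aware predicate (the lambda follows below) that replaces the old isNonSimpleLoadOrStore / isOtherMemAccess pair. The same logic as a standalone free function, a sketch assuming only the IR and AtomicOrdering headers:

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/AtomicOrdering.h"

    using namespace llvm;

    // True when I cannot be freely reordered relative to an access with
    // ordering AO: volatile always blocks; loads and stores block when their
    // ordering is stronger than AO; any other instruction blocks only if it
    // may touch memory at all.
    static bool isComplexForReordering(const Instruction *I,
                                       AtomicOrdering AO) {
      if (I->isVolatile())
        return true;
      if (const auto *LI = dyn_cast<LoadInst>(I))
        return isStrongerThan(LI->getOrdering(), AO);
      if (const auto *SI = dyn_cast<StoreInst>(I))
        return isStrongerThan(SI->getOrdering(), AO);
      return I->mayReadOrWriteMemory();
    }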
+ auto isComplexForReordering = [](Instruction * I, AtomicOrdering AO)->bool { + if (I->isVolatile()) + return true; if (auto *LI = dyn_cast(I)) - return !LI->isSimple(); + return isStrongerThan(LI->getOrdering(), AO); if (auto *SI = dyn_cast(I)) - return !SI->isSimple(); - return false; - }; - - // Return "true" if I is not a load and not a store, but it does access - // memory. - auto isOtherMemAccess = [](Instruction *I) -> bool { - return !isa(I) && !isa(I) && I->mayReadOrWriteMemory(); + return isStrongerThan(SI->getOrdering(), AO); + return I->mayReadOrWriteMemory(); }; // Walk backwards through the basic block, looking for dependencies. @@ -500,8 +490,8 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // atomic. // FIXME: This is overly conservative. if (LI->isAtomic() && isStrongerThanUnordered(LI->getOrdering())) { - if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) || - isOtherMemAccess(QueryInst)) + if (!QueryInst || + isComplexForReordering(QueryInst, AtomicOrdering::NotAtomic)) return MemDepResult::getClobber(LI); if (LI->getOrdering() != AtomicOrdering::Monotonic) return MemDepResult::getClobber(LI); @@ -512,10 +502,10 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // If we found a pointer, check if it could be the same as our pointer. AliasResult R = BatchAA.alias(LoadLoc, MemLoc); - if (isLoad) { - if (R == AliasResult::NoAlias) - continue; + if (R == AliasResult::NoAlias) + continue; + if (isLoad) { // Must aliased loads are defs of each other. if (R == AliasResult::MustAlias) return MemDepResult::getDef(Inst); @@ -532,10 +522,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( continue; } - // Stores don't depend on other no-aliased accesses. - if (R == AliasResult::NoAlias) - continue; - // Stores don't alias loads from read-only memory. if (BatchAA.pointsToConstantMemory(LoadLoc)) continue; @@ -549,20 +535,25 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // A Monotonic store is OK if the query inst is itself not atomic. // FIXME: This is overly conservative. if (!SI->isUnordered() && SI->isAtomic()) { - if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) || - isOtherMemAccess(QueryInst)) - return MemDepResult::getClobber(SI); - if (SI->getOrdering() != AtomicOrdering::Monotonic) + if (!QueryInst || + isComplexForReordering(QueryInst, AtomicOrdering::Unordered)) return MemDepResult::getClobber(SI); + // Ok, if we are here the guard above guarantee us that + // QueryInst is a non-atomic or unordered load/store. + // SI is atomic with monotonic or release semantic (seq_cst for store + // is actually a release semantic plus total order over other seq_cst + // instructions, as soon as QueryInst is not seq_cst we can consider it + // as simple release semantic). + // Monotonic and Release semantic allows re-ordering before store + // so we are safe to go further and check the aliasing. It will prohibit + // re-ordering in case locations are may or must alias. } - // FIXME: this is overly conservative. // While volatile access cannot be eliminated, they do not have to clobber // non-aliasing locations, as normal accesses can for example be reordered // with volatile accesses. 
if (SI->isVolatile()) - if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) || - isOtherMemAccess(QueryInst)) + if (!QueryInst || QueryInst->isVolatile()) return MemDepResult::getClobber(SI); // If alias analysis can tell that this store is guaranteed to not modify @@ -743,8 +734,6 @@ MemoryDependenceResults::getNonLocalCallDependency(CallBase *QueryCall) { llvm::sort(Cache); ++NumCacheDirtyNonLocal; - // cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: " - // << Cache.size() << " cached: " << *QueryInst; } else { // Seed DirtyBlocks with each of the preds of QueryInst's block. BasicBlock *QueryBB = QueryCall->getParent(); @@ -1204,7 +1193,6 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we do process a large number of blocks it becomes very expensive and // likely it isn't worth worrying about if (Result.size() > NumResultsLimit) { - Worklist.clear(); // Sort it now (if needed) so that recursive invocations of // getNonLocalPointerDepFromBB and other routines that could reuse the // cache value will only see properly sorted cache arrays. diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index a877b19df866..2ed32227bd9e 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -8,12 +8,10 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsARM.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" using namespace llvm; diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 57f431ec21f5..76371b88812e 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -36,8 +36,8 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/InitializePasses.h" @@ -49,10 +49,10 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include #include @@ -130,6 +130,12 @@ public: MemorySSAWalkerAnnotatedWriter(MemorySSA *M) : MSSA(M), Walker(M->getWalker()) {} + void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) override { + if (MemoryAccess *MA = MSSA->getMemoryAccess(BB)) + OS << "; " << *MA << "\n"; + } + void emitInstructionAnnot(const Instruction *I, formatted_raw_ostream &OS) override { if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) { @@ -732,7 +738,7 @@ template class ClobberWalker { struct generic_def_path_iterator : public iterator_facade_base, std::forward_iterator_tag, T *> { - generic_def_path_iterator() {} + generic_def_path_iterator() = default; generic_def_path_iterator(Walker *W, ListIndex N) : W(W), N(N) {} T &operator*() const { return curNode(); } @@ -743,9 +749,9 @@ template class ClobberWalker { } bool operator==(const generic_def_path_iterator &O) const { - if (N.hasValue() != O.N.hasValue()) + if (N.has_value() != O.N.has_value()) return false; - return !N.hasValue() || *N == *O.N; + return !N || *N == *O.N; } private: @@ -1397,6 +1403,9 @@ void 
MemorySSA::OptimizeUses::optimizeUsesInBlock( continue; } + if (MU->isOptimized()) + continue; + if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getMemoryInst())) { MU->setDefiningAccess(MSSA->getLiveOnEntryDef(), true, None); continue; @@ -1585,10 +1594,6 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) { SmallPtrSet Visited; renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited); - ClobberWalkerBase WalkerBase(this, &BAA, DT); - CachingWalker WalkerLocal(this, &WalkerBase); - OptimizeUses(this, &WalkerLocal, &BAA, DT).optimizeUses(); - // Mark the uses in unreachable blocks as live on entry, so that they go // somewhere. for (auto &BB : F) @@ -2178,6 +2183,17 @@ bool MemorySSA::dominates(const MemoryAccess *Dominator, return dominates(Dominator, cast(Dominatee.getUser())); } +void MemorySSA::ensureOptimizedUses() { + if (IsOptimized) + return; + + BatchAAResults BatchAA(*AA); + ClobberWalkerBase WalkerBase(this, &BatchAA, DT); + CachingWalker WalkerLocal(this, &WalkerBase); + OptimizeUses(this, &WalkerLocal, &BatchAA, DT).optimizeUses(); + IsOptimized = true; +} + void MemoryAccess::print(raw_ostream &OS) const { switch (getValueID()) { case MemoryPhiVal: return static_cast(this)->print(OS); @@ -2350,6 +2366,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) { auto &MSSA = getAnalysis().getMSSA(); + MSSA.ensureOptimizedUses(); if (DotCFGMSSA != "") { DOTFuncMSSAInfo CFGInfo(F, MSSA); WriteGraph(&CFGInfo, "", false, "MSSA", DotCFGMSSA); @@ -2382,6 +2399,7 @@ bool MemorySSAAnalysis::Result::invalidate( PreservedAnalyses MemorySSAPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { auto &MSSA = AM.getResult(F).getMSSA(); + MSSA.ensureOptimizedUses(); if (DotCFGMSSA != "") { DOTFuncMSSAInfo CFGInfo(F, MSSA); WriteGraph(&CFGInfo, "", false, "MSSA", DotCFGMSSA); diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index 9c841883de6d..eb75118210b9 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -10,22 +10,15 @@ // //===----------------------------------------------------------------===// #include "llvm/Analysis/MemorySSAUpdater.h" -#include "llvm/Analysis/LoopIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FormattedStream.h" #include #define DEBUG_TYPE "memoryssa" @@ -243,6 +236,7 @@ MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi, } void MemorySSAUpdater::insertUse(MemoryUse *MU, bool RenameUses) { + VisitedBlocks.clear(); InsertedPHIs.clear(); MU->setDefiningAccess(getPreviousDef(MU)); @@ -311,6 +305,13 @@ static void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB, // point to the correct new defs, to ensure we only have one variable, and no // disconnected stores. void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { + // Don't bother updating dead code. 
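+  // An unreachable block carries no useful dominance information; wiring
+  // the def to the live-on-entry access keeps MemorySSA well formed
+  // without running any renaming.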
+ if (!MSSA->DT->isReachableFromEntry(MD->getBlock())) { + MD->setDefiningAccess(MSSA->getLiveOnEntryDef()); + return; + } + + VisitedBlocks.clear(); InsertedPHIs.clear(); // See if we had a local def, and if not, go hunting. @@ -427,10 +428,10 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { if (NewPhiSize) tryRemoveTrivialPhis(ArrayRef(&InsertedPHIs[NewPhiIndex], NewPhiSize)); - // Now that all fixups are done, rename all uses if we are asked. Skip - // renaming for defs in unreachable blocks. + // Now that all fixups are done, rename all uses if we are asked. The defs are + // guaranteed to be in reachable code due to the check at the method entry. BasicBlock *StartBlock = MD->getBlock(); - if (RenameUses && MSSA->getDomTree().getNode(StartBlock)) { + if (RenameUses) { SmallPtrSet Visited; // We are guaranteed there is a def in the block, because we just got it // handed to us in this function. diff --git a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp index fab51d6a7aaf..dc149f326271 100644 --- a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp +++ b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp @@ -22,7 +22,7 @@ ModelUnderTrainingRunner::ModelUnderTrainingRunner( LLVMContext &Ctx, const std::string &ModelPath, const std::vector &InputSpecs, const std::vector &OutputSpecs) - : MLModelRunner(Ctx, MLModelRunner::Kind::Development), + : MLModelRunner(Ctx, MLModelRunner::Kind::Development, InputSpecs.size()), OutputSpecs(OutputSpecs) { Evaluator = std::make_unique( ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, @@ -32,6 +32,10 @@ ModelUnderTrainingRunner::ModelUnderTrainingRunner( Evaluator.reset(); return; } + + for (size_t I = 0, E = InputSpecs.size(); I < E; ++I) { + setUpBufferForTensor(I, InputSpecs[I], Evaluator->getUntypedInput(I)); + } } void *ModelUnderTrainingRunner::evaluateUntyped() { @@ -43,24 +47,31 @@ void *ModelUnderTrainingRunner::evaluateUntyped() { return LastEvaluationResult->getUntypedTensorValue(0); } -void *ModelUnderTrainingRunner::getTensorUntyped(size_t Index) { - return Evaluator->getUntypedInput(Index); -} - std::unique_ptr ModelUnderTrainingRunner::createAndEnsureValid( LLVMContext &Ctx, const std::string &ModelPath, StringRef DecisionName, const std::vector &InputSpecs, StringRef OutputSpecsPathOverride) { - std::unique_ptr MUTR; if (auto MaybeOutputSpecs = loadOutputSpecs(Ctx, DecisionName, ModelPath, OutputSpecsPathOverride)) - MUTR.reset(new ModelUnderTrainingRunner(Ctx, ModelPath, InputSpecs, - *MaybeOutputSpecs)); + return createAndEnsureValid(Ctx, ModelPath, DecisionName, InputSpecs, + *MaybeOutputSpecs); + Ctx.emitError("Could not load the policy model from the provided path"); + return nullptr; +} + +std::unique_ptr +ModelUnderTrainingRunner::createAndEnsureValid( + LLVMContext &Ctx, const std::string &ModelPath, StringRef DecisionName, + const std::vector &InputSpecs, + const std::vector &OutputSpecs) { + std::unique_ptr MUTR; + MUTR.reset( + new ModelUnderTrainingRunner(Ctx, ModelPath, InputSpecs, OutputSpecs)); if (MUTR && MUTR->isValid()) return MUTR; - Ctx.emitError("Could not load the policy model from the provided path"); + Ctx.emitError("Could not load or create model evaluator."); return nullptr; } diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp index 64fd5eb1acd4..373aaa48b1d1 100644 --- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -15,8 
+15,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/ModuleDebugInfoPrinter.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Passes.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 2880ca62a7f8..2b98634ef7bf 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -368,7 +367,7 @@ static void computeFunctionSummary( // We should have named any anonymous globals assert(CalledFunction->hasName()); auto ScaledCount = PSI->getProfileCount(*CB, BFI); - auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI) + auto Hotness = ScaledCount ? getHotness(*ScaledCount, PSI) : CalleeInfo::HotnessType::Unknown; if (ForceSummaryEdgesCold != FunctionSummary::FSHT_None) Hotness = CalleeInfo::HotnessType::Cold; @@ -490,8 +489,7 @@ static void computeFunctionSummary( HasIndirBranchToBlockAddress; GlobalValueSummary::GVFlags Flags( F.getLinkage(), F.getVisibility(), NotEligibleForImport, - /* Live = */ false, F.isDSOLocal(), - F.hasLinkOnceODRLinkage() && F.hasGlobalUnnamedAddr()); + /* Live = */ false, F.isDSOLocal(), F.canBeOmittedFromSymbolTable()); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -612,8 +610,7 @@ static void computeVariableSummary(ModuleSummaryIndex &Index, bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags( V.getLinkage(), V.getVisibility(), NonRenamableLocal, - /* Live = */ false, V.isDSOLocal(), - V.hasLinkOnceODRLinkage() && V.hasGlobalUnnamedAddr()); + /* Live = */ false, V.isDSOLocal(), V.canBeOmittedFromSymbolTable()); VTableFuncList VTableFuncs; // If splitting is not enabled, then we compute the summary information @@ -655,8 +652,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags( A.getLinkage(), A.getVisibility(), NonRenamableLocal, - /* Live = */ false, A.isDSOLocal(), - A.hasLinkOnceODRLinkage() && A.hasGlobalUnnamedAddr()); + /* Live = */ false, A.isDSOLocal(), A.canBeOmittedFromSymbolTable()); auto AS = std::make_unique(Flags); auto *Aliasee = A.getAliaseeObject(); auto AliaseeVI = Index.getValueInfo(Aliasee->getGUID()); @@ -733,8 +729,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( GlobalValue::InternalLinkage, GlobalValue::DefaultVisibility, /* NotEligibleToImport = */ true, /* Live = */ true, - /* Local */ GV->isDSOLocal(), - GV->hasLinkOnceODRLinkage() && GV->hasGlobalUnnamedAddr()); + /* Local */ GV->isDSOLocal(), GV->canBeOmittedFromSymbolTable()); CantBePromoted.insert(GV->getGUID()); // Create the appropriate summary type. 
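
Note on the MemorySSA hunks above: buildMemorySSA() no longer optimizes uses eagerly; that work moves into the new ensureOptimizedUses(), which the printer passes now call explicitly, and an IsOptimized flag makes repeated calls free. A minimal standalone sketch of that compute-on-first-request pattern (Analysis and runExpensiveOptimization are hypothetical names, not LLVM API):

  #include <iostream>
  #include <vector>

  // An analysis whose expensive refinement step is deferred until a client
  // actually asks for it, mirroring MemorySSA::ensureOptimizedUses.
  class Analysis {
    std::vector<int> Data;
    bool IsOptimized = false; // set once the refinement has run

    void runExpensiveOptimization() {
      // Stand-in for OptimizeUses(...).optimizeUses().
      for (int &V : Data)
        V *= 2;
    }

  public:
    explicit Analysis(std::vector<int> D) : Data(std::move(D)) {}

    // Cheap to call repeatedly; the work happens at most once.
    void ensureOptimized() {
      if (IsOptimized)
        return;
      runExpensiveOptimization();
      IsOptimized = true;
    }

    const std::vector<int> &get() const { return Data; }
  };

  int main() {
    Analysis A({1, 2, 3});
    A.ensureOptimized(); // first call does the work
    A.ensureOptimized(); // no-op
    for (int V : A.get())
      std::cout << V << ' ';
    std::cout << '\n'; // prints: 2 4 6
  }

The payoff is that clients which never query optimized uses no longer pay for the clobber walks at construction time.
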
if (Function *F = dyn_cast(GV)) { diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 5ca72f5f3623..5cff986245b9 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -16,14 +16,11 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/AssemblyAnnotationWriter.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -143,7 +140,7 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock, return false; auto DL = ExitBlock->getModule()->getDataLayout(); auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader()); - auto *SimpleValOrNull = SimplifyCmpInst(Cond->getPredicate(), + auto *SimpleValOrNull = simplifyCmpInst(Cond->getPredicate(), IVStart, RHS, {DL, /*TLI*/ nullptr, DT, /*AC*/ nullptr, BI}); @@ -494,7 +491,7 @@ template static V getOrCreateCachedOptional(K Key, DenseMap> &Map, FnTy &&Fn, ArgsTy&&... args) { Optional &OptVal = Map[Key]; - if (!OptVal.hasValue()) + if (!OptVal) OptVal = Fn(std::forward(args)...); return OptVal.getValue(); } diff --git a/llvm/lib/Analysis/NoInferenceModelRunner.cpp b/llvm/lib/Analysis/NoInferenceModelRunner.cpp index 7178120ebe4f..1914b22f5d71 100644 --- a/llvm/lib/Analysis/NoInferenceModelRunner.cpp +++ b/llvm/lib/Analysis/NoInferenceModelRunner.cpp @@ -10,24 +10,14 @@ // logs for the default policy, in 'development' mode, but never ask it to // 'run'. //===----------------------------------------------------------------------===// -#include "llvm/Config/config.h" -#if defined(LLVM_HAVE_TF_API) - #include "llvm/Analysis/NoInferenceModelRunner.h" -#include "llvm/Analysis/Utils/TFUtils.h" using namespace llvm; NoInferenceModelRunner::NoInferenceModelRunner( LLVMContext &Ctx, const std::vector &Inputs) - : MLModelRunner(Ctx, MLModelRunner::Kind::NoOp) { - ValuesBuffer.reserve(Inputs.size()); + : MLModelRunner(Ctx, MLModelRunner::Kind::NoOp, Inputs.size()) { + size_t Index = 0; for (const auto &TS : Inputs) - ValuesBuffer.push_back(std::make_unique(TS.getElementCount() * - TS.getElementByteSize())); -} - -void *NoInferenceModelRunner::getTensorUntyped(size_t Index) { - return ValuesBuffer[Index].get(); + setUpBufferForTensor(Index++, TS, nullptr); } -#endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp index 0826b3078672..6fe056d36668 100644 --- a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp +++ b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp @@ -26,8 +26,6 @@ #include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/Analysis/Passes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp b/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp index 6f3d4d536c40..17b40f03a5a5 100644 --- a/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp +++ b/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp @@ -47,7 +47,7 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F) bool OptimizationRemarkEmitter::invalidate( Function &F, const PreservedAnalyses &PA, 
FunctionAnalysisManager::Invalidator &Inv) { - if (OwnedBFI.get()) { + if (OwnedBFI) { OwnedBFI.reset(); BFI = nullptr; } @@ -80,7 +80,7 @@ void OptimizationRemarkEmitter::emit( computeHotness(OptDiag); // Only emit it if its hotness meets the threshold. - if (OptDiag.getHotness().getValueOr(0) < + if (OptDiag.getHotness().value_or(0) < F->getContext().getDiagnosticsHotnessThreshold()) { return; } diff --git a/llvm/lib/Analysis/OverflowInstAnalysis.cpp b/llvm/lib/Analysis/OverflowInstAnalysis.cpp index 87a85e6a7364..8bfd6642f760 100644 --- a/llvm/lib/Analysis/OverflowInstAnalysis.cpp +++ b/llvm/lib/Analysis/OverflowInstAnalysis.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/OverflowInstAnalysis.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp index 02d084937ccb..7571bd0059cc 100644 --- a/llvm/lib/Analysis/PHITransAddr.cpp +++ b/llvm/lib/Analysis/PHITransAddr.cpp @@ -17,7 +17,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -35,9 +34,6 @@ static bool CanPHITrans(Instruction *Inst) { isa(Inst->getOperand(1))) return true; - // cerr << "MEMDEP: Could not PHI translate: " << *Pointer; - // if (isa(PtrInst) || isa(PtrInst)) - // cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0); return false; } @@ -226,7 +222,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, return GEP; // Simplify the GEP to handle 'gep x, 0' -> x etc. - if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps[0], + if (Value *V = simplifyGEPInst(GEP->getSourceElementType(), GEPOps[0], ArrayRef(GEPOps).slice(1), GEP->isInBounds(), {DL, TLI, DT, AC})) { for (unsigned i = 0, e = GEPOps.size(); i != e; ++i) @@ -240,6 +236,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, for (User *U : APHIOp->users()) { if (GetElementPtrInst *GEPI = dyn_cast(U)) if (GEPI->getType() == GEP->getType() && + GEPI->getSourceElementType() == GEP->getSourceElementType() && GEPI->getNumOperands() == GEPOps.size() && GEPI->getParent()->getParent() == CurBB->getParent() && (!DT || DT->dominates(GEPI->getParent(), PredBB))) { @@ -277,7 +274,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, } // See if the add simplifies away. - if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, {DL, TLI, DT, AC})) { + if (Value *Res = simplifyAddInst(LHS, RHS, isNSW, isNUW, {DL, TLI, DT, AC})) { // If we simplified the operands, the LHS is no longer an input, but Res // is. 
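
The PHITransAddr hunk above now also requires GEPI->getSourceElementType() to equal GEP->getSourceElementType() before reusing an existing GEP. With opaque pointers the result type alone no longer identifies the computation, because the byte offset is scaled by the source element type. A standalone sketch of that scaling, using plain pointer arithmetic as a stand-in for GEP:

  #include <cstdint>
  #include <iostream>

  int main() {
    unsigned char Buf[64] = {};
    unsigned char *Base = Buf;

    // "gep i32, ptr %base, i64 3" -> offset 3 * sizeof(int32_t) = 12 bytes.
    unsigned char *AsI32 = Base + 3 * sizeof(int32_t);
    // "gep i64, ptr %base, i64 3" -> offset 3 * sizeof(int64_t) = 24 bytes.
    unsigned char *AsI64 = Base + 3 * sizeof(int64_t);

    // Same result type (a bare pointer), same operands, different address:
    std::cout << (AsI64 - Base) - (AsI32 - Base) << " byte difference\n"; // 12
  }
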
RemoveInstInputs(LHS, InstInputs); diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index 268ed9d04741..9d5fa6d0a41b 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -15,7 +15,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/InitializePasses.h" @@ -125,7 +124,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph( for (const auto &I : BB) if (isa(I) || isa(I)) if (auto CallCount = getProfileCount(cast(I), nullptr)) - TotalCallCount += CallCount.getValue(); + TotalCallCount += *CallCount; if (isHotCount(TotalCallCount)) return true; } @@ -154,7 +153,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph( for (const auto &I : BB) if (isa(I) || isa(I)) if (auto CallCount = getProfileCount(cast(I), nullptr)) - TotalCallCount += CallCount.getValue(); + TotalCallCount += *CallCount; if (!isColdCount(TotalCallCount)) return false; } @@ -166,7 +165,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph( bool ProfileSummaryInfo::isFunctionHotnessUnknown(const Function &F) const { assert(hasPartialSampleProfile() && "Expect partial sample profile"); - return !F.getEntryCount().hasValue(); + return !F.getEntryCount(); } template @@ -188,7 +187,7 @@ bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile( for (const auto &I : BB) if (isa(I) || isa(I)) if (auto CallCount = getProfileCount(cast(I), nullptr)) - TotalCallCount += CallCount.getValue(); + TotalCallCount += *CallCount; if (isHot && isHotCountNthPercentile(PercentileCutoff, TotalCallCount)) return true; if (!isHot && !isColdCountNthPercentile(PercentileCutoff, TotalCallCount)) @@ -316,11 +315,11 @@ bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff, } uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() const { - return HotCountThreshold.getValueOr(UINT64_MAX); + return HotCountThreshold.value_or(UINT64_MAX); } uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() const { - return ColdCountThreshold.getValueOr(0); + return ColdCountThreshold.value_or(0); } bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB, diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp index 9a834ba4866a..49304818d7ef 100644 --- a/llvm/lib/Analysis/PtrUseVisitor.cpp +++ b/llvm/lib/Analysis/PtrUseVisitor.cpp @@ -14,7 +14,6 @@ #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include using namespace llvm; diff --git a/llvm/lib/Analysis/RegionInfo.cpp b/llvm/lib/Analysis/RegionInfo.cpp index 3ba0bb9eaf2c..9be23a374eca 100644 --- a/llvm/lib/Analysis/RegionInfo.cpp +++ b/llvm/lib/Analysis/RegionInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominanceFrontier.h" #include "llvm/InitializePasses.h" #ifndef NDEBUG #include "llvm/Analysis/RegionPrinter.h" diff --git a/llvm/lib/Analysis/RegionPass.cpp b/llvm/lib/Analysis/RegionPass.cpp index 10c8569096c6..ddef3be8df37 100644 --- a/llvm/lib/Analysis/RegionPass.cpp +++ b/llvm/lib/Analysis/RegionPass.cpp @@ -12,14 +12,16 @@ // Most of this code has been COPIED from LoopPass.cpp // //===----------------------------------------------------------------------===// + #include "llvm/Analysis/RegionPass.h" +#include "llvm/Analysis/RegionInfo.h" 
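
Many hunks in this import (ProfileSummaryInfo, OptimizationRemarkEmitter, MustExecute, ScalarEvolution, ...) migrate llvm::Optional call sites from hasValue()/getValue()/getValueOr() to the std::optional-compatible has_value(), operator*, and value_or(). The same idioms on plain std::optional, shaped like the call-count accumulation above (getCount is a hypothetical stand-in for getProfileCount):

  #include <cstdint>
  #include <iostream>
  #include <optional>

  // Hypothetical profile source: odd call sites have no recorded count.
  static std::optional<uint64_t> getCount(unsigned Site) {
    if (Site % 2)
      return std::nullopt;
    return Site * 10;
  }

  int main() {
    uint64_t TotalCallCount = 0;
    for (unsigned Site = 0; Site < 6; ++Site)
      if (auto CallCount = getCount(Site)) // operator bool == has_value()
        TotalCallCount += *CallCount;      // operator* instead of getValue()
    std::cout << TotalCallCount << '\n';   // 0 + 20 + 40 = 60

    std::optional<uint64_t> Threshold;     // unset, like HotCountThreshold
    std::cout << Threshold.value_or(UINT64_MAX) << '\n'; // getValueOr analogue
  }
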
#include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/PrintPasses.h" -#include "llvm/IR/StructuralHash.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; #define DEBUG_TYPE "regionpassmgr" @@ -93,12 +95,12 @@ bool RGPassManager::runOnFunction(Function &F) { TimeRegion PassTimer(getPassTimer(P)); #ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(F); + uint64_t RefHash = P->structuralHash(F); #endif LocalChanged = P->runOnRegion(CurrentRegion, *this); #ifdef EXPENSIVE_CHECKS - if (!LocalChanged && (RefHash != StructuralHash(F))) { + if (!LocalChanged && (RefHash != P->structuralHash(F))) { llvm::errs() << "Pass modifies its input and doesn't report it: " << P->getPassName() << "\n"; llvm_unreachable("Pass modifies its input and doesn't report it"); diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp index 1fb5faaa6a71..fbd3d17febff 100644 --- a/llvm/lib/Analysis/RegionPrinter.cpp +++ b/llvm/lib/Analysis/RegionPrinter.cpp @@ -10,15 +10,11 @@ #include "llvm/Analysis/RegionPrinter.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DOTGraphTraitsPass.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #ifndef NDEBUG #include "llvm/IR/LegacyPassManager.h" @@ -35,28 +31,20 @@ onlySimpleRegions("only-simple-regions", cl::init(false)); namespace llvm { -template<> -struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits (bool isSimple=false) - : DefaultDOTGraphTraits(isSimple) {} +std::string DOTGraphTraits::getNodeLabel(RegionNode *Node, + RegionNode *Graph) { + if (!Node->isSubRegion()) { + BasicBlock *BB = Node->getNodeAs(); - std::string getNodeLabel(RegionNode *Node, RegionNode *Graph) { - - if (!Node->isSubRegion()) { - BasicBlock *BB = Node->getNodeAs(); - - if (isSimple()) - return DOTGraphTraits - ::getSimpleNodeLabel(BB, nullptr); - else - return DOTGraphTraits - ::getCompleteNodeLabel(BB, nullptr); - } - - return "Not implemented"; + if (isSimple()) + return DOTGraphTraits::getSimpleNodeLabel(BB, nullptr); + else + return DOTGraphTraits::getCompleteNodeLabel(BB, nullptr); } -}; + + return "Not implemented"; +} template <> struct DOTGraphTraits : public DOTGraphTraits { @@ -138,7 +126,7 @@ struct DOTGraphTraits : public DOTGraphTraits { printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; -} //end namespace llvm +} // end namespace llvm namespace { @@ -149,48 +137,49 @@ struct RegionInfoPassGraphTraits { }; struct RegionPrinter - : public DOTGraphTraitsPrinter { + : public DOTGraphTraitsPrinterWrapperPass< + RegionInfoPass, false, RegionInfo *, RegionInfoPassGraphTraits> { static char ID; RegionPrinter() - : DOTGraphTraitsPrinter("reg", ID) { + : DOTGraphTraitsPrinterWrapperPass("reg", ID) { initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); } }; char RegionPrinter::ID = 0; struct RegionOnlyPrinter - : public DOTGraphTraitsPrinter { + : public DOTGraphTraitsPrinterWrapperPass< + RegionInfoPass, true, RegionInfo *, RegionInfoPassGraphTraits> { static char ID; RegionOnlyPrinter() - : DOTGraphTraitsPrinter("reg", ID) { + : DOTGraphTraitsPrinterWrapperPass("reg", ID) { 
initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); } }; char RegionOnlyPrinter::ID = 0; struct RegionViewer - : public DOTGraphTraitsViewer { + : public DOTGraphTraitsViewerWrapperPass< + RegionInfoPass, false, RegionInfo *, RegionInfoPassGraphTraits> { static char ID; RegionViewer() - : DOTGraphTraitsViewer("reg", ID) { + : DOTGraphTraitsViewerWrapperPass("reg", ID) { initializeRegionViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionViewer::ID = 0; struct RegionOnlyViewer - : public DOTGraphTraitsViewer { + : public DOTGraphTraitsViewerWrapperPass { static char ID; RegionOnlyViewer() - : DOTGraphTraitsViewer("regonly", ID) { + : DOTGraphTraitsViewerWrapperPass("regonly", + ID) { initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry()); } }; diff --git a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp index 294bc38c17ad..afc3d7fc4c35 100644 --- a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp +++ b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp @@ -14,9 +14,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/ReplayInlineAdvisor.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Instructions.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" #include using namespace llvm; @@ -26,8 +26,9 @@ using namespace llvm; ReplayInlineAdvisor::ReplayInlineAdvisor( Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr OriginalAdvisor, - const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks) - : InlineAdvisor(M, FAM), OriginalAdvisor(std::move(OriginalAdvisor)), + const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, + InlineContext IC) + : InlineAdvisor(M, FAM, IC), OriginalAdvisor(std::move(OriginalAdvisor)), ReplaySettings(ReplaySettings), EmitRemarks(EmitRemarks) { auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ReplaySettings.ReplayFile); @@ -75,12 +76,15 @@ ReplayInlineAdvisor::ReplayInlineAdvisor( HasReplayRemarks = true; } -std::unique_ptr llvm::getReplayInlineAdvisor( - Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, - std::unique_ptr OriginalAdvisor, - const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks) { +std::unique_ptr +llvm::getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + LLVMContext &Context, + std::unique_ptr OriginalAdvisor, + const ReplayInlinerSettings &ReplaySettings, + bool EmitRemarks, InlineContext IC) { auto Advisor = std::make_unique( - M, FAM, Context, std::move(OriginalAdvisor), ReplaySettings, EmitRemarks); + M, FAM, Context, std::move(OriginalAdvisor), ReplaySettings, EmitRemarks, + IC); if (!Advisor->areReplayRemarksLoaded()) Advisor.reset(); return Advisor; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 977fc0911355..207f4df79e45 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -79,7 +79,6 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolutionDivision.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -96,7 +95,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" #include 
"llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" @@ -104,7 +102,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -125,7 +122,6 @@ #include #include #include -#include #include #include #include @@ -146,17 +142,21 @@ STATISTIC(NumTripCountsNotComputed, STATISTIC(NumBruteForceTripCountsComputed, "Number of loops with trip counts computed by force"); +#ifdef EXPENSIVE_CHECKS +bool llvm::VerifySCEV = true; +#else +bool llvm::VerifySCEV = false; +#endif + static cl::opt -MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, - cl::ZeroOrMore, - cl::desc("Maximum number of iterations SCEV will " - "symbolically execute a constant " - "derived loop"), - cl::init(100)); - -// FIXME: Enable this with EXPENSIVE_CHECKS when the test suite is clean. -static cl::opt VerifySCEV( - "verify-scev", cl::Hidden, + MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, + cl::desc("Maximum number of iterations SCEV will " + "symbolically execute a constant " + "derived loop"), + cl::init(100)); + +static cl::opt VerifySCEVOpt( + "verify-scev", cl::Hidden, cl::location(VerifySCEV), cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); static cl::opt VerifySCEVStrict( "verify-scev-strict", cl::Hidden, @@ -231,6 +231,17 @@ static cl::opt UseExpensiveRangeSharpening( cl::desc("Use more powerful methods of sharpening expression ranges. May " "be costly in terms of compile time")); +static cl::opt MaxPhiSCCAnalysisSize( + "scalar-evolution-max-scc-analysis-depth", cl::Hidden, + cl::desc("Maximum amount of nodes to process while searching SCEVUnknown " + "Phi strongly connected components"), + cl::init(8)); + +static cl::opt + EnableFiniteLoopControl("scalar-evolution-finite-loop", cl::Hidden, + cl::desc("Handle <= and >= in finite loops"), + cl::init(true)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -519,12 +530,13 @@ void SCEVUnknown::deleted() { } void SCEVUnknown::allUsesReplacedWith(Value *New) { + // Clear this SCEVUnknown from various maps. + SE->forgetMemoizedResults(this); + // Remove this SCEVUnknown from the uniquing map. SE->UniqueSCEVs.RemoveNode(this); - // Update this SCEVUnknown to point to the new value. This is needed - // because there may still be outstanding SCEVs which still point to - // this SCEVUnknown. + // Replace the value pointer in case someone is still using this SCEVUnknown. setValPtr(New); } @@ -1643,10 +1655,12 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. - if (AR->hasNoUnsignedWrap()) - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, Depth + 1), - getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); + if (AR->hasNoUnsignedWrap()) { + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getZeroExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); + } // Check whether the backedge-taken count is SCEVCouldNotCompute. 
// Note that this serves two purposes: It filters out loops that are @@ -1688,11 +1702,10 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // Cache knowledge of AR NUW, which is propagated to this AddRec. setNoWrapFlags(const_cast(AR), SCEV::FlagNUW); // Return the expression with the addrec on the outside. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getZeroExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = getExtendAddRecStart(AR, Ty, this, + Depth + 1); + Step = getZeroExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as signed. // This covers loops that count down. @@ -1707,11 +1720,10 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // Negative step causes unsigned wrap, but it still can't self-wrap. setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = getExtendAddRecStart(AR, Ty, this, + Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } @@ -1733,11 +1745,10 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // issue. It's not clear that the order of checks does matter, but // it's one of two issue possible causes for a change which was // reverted. Be conservative for the moment. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getZeroExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getZeroExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // For a negative step, we can extend the operands iff doing so only @@ -1752,11 +1763,10 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // still can't self-wrap. setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = getExtendAddRecStart(AR, Ty, this, + Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } @@ -1780,9 +1790,10 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { if (proveNoWrapByVaryingStart(Start, Step, L)) { setNoWrapFlags(const_cast(AR), SCEV::FlagNUW); - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, Depth + 1), - getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getZeroExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } @@ -1984,10 +1995,12 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. 
- if (AR->hasNoSignedWrap()) - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, SCEV::FlagNSW); + if (AR->hasNoSignedWrap()) { + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, SCEV::FlagNSW); + } // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are @@ -2030,11 +2043,10 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // Cache knowledge of AR NSW, which is propagated to this AddRec. setNoWrapFlags(const_cast(AR), SCEV::FlagNSW); // Return the expression with the addrec on the outside. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = getExtendAddRecStart(AR, Ty, this, + Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as unsigned. // This covers loops that count up with an unsigned step. @@ -2056,11 +2068,10 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, - Depth + 1), - getZeroExtendExpr(Step, Ty, Depth + 1), L, - AR->getNoWrapFlags()); + Start = getExtendAddRecStart(AR, Ty, this, + Depth + 1); + Step = getZeroExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } @@ -2072,9 +2083,10 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // issue. It's not clear that the order of checks does matter, but // it's one of two issue possible causes for a change which was // reverted. Be conservative for the moment. 
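
A recurring mechanical change in the ScalarEvolution hunks here, and again in createSCEV further down, hoists nested getExtendAddRecStart/getZeroExtendExpr/getSCEV calls into named locals (Start, Step, LHS, RHS) before passing them on. One plausible motivation besides readability: sibling function arguments are indeterminately sequenced in C++, so stateful recursive calls written inline as arguments run in an unspecified order. A small standalone illustration (Trace, makeStart, makeStep are hypothetical):

  #include <iostream>
  #include <vector>

  static std::vector<int> Trace;

  static int makeStart() { Trace.push_back(1); return 10; }
  static int makeStep()  { Trace.push_back(2); return 3; }

  static int combine(int Start, int Step) { return Start + Step; }

  int main() {
    // These two calls are indeterminately sequenced relative to each other;
    // a compiler may run makeStep() first. If the callees mutate shared
    // state (as SCEV construction does with its caches), the intermediate
    // state depends on that order.
    int R1 = combine(makeStart(), makeStep());

    Trace.clear();
    // Hoisting into locals pins the order explicitly:
    int Start = makeStart();
    int Step = makeStep();
    int R2 = combine(Start, Step);

    std::cout << R1 << ' ' << R2 << '\n'; // values agree; only the second
                                          // form guarantees the side-effect order
  }
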
- return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // sext({C,+,Step}) --> (sext(D) + sext({C-D,+,Step})) @@ -2096,9 +2108,10 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { if (proveNoWrapByVaryingStart(Start, Step, L)) { setNoWrapFlags(const_cast(AR), SCEV::FlagNSW); - return getAddRecExpr( - getExtendAddRecStart(AR, Ty, this, Depth + 1), - getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); + Start = + getExtendAddRecStart(AR, Ty, this, Depth + 1); + Step = getSignExtendExpr(Step, Ty, Depth + 1); + return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } @@ -2300,9 +2313,9 @@ bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, const SCEV *A = (this->*Extension)( (this->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0), WideTy, 0); - const SCEV *B = (this->*Operation)((this->*Extension)(LHS, WideTy, 0), - (this->*Extension)(RHS, WideTy, 0), - SCEV::FlagAnyWrap, 0); + const SCEV *LHSB = (this->*Extension)(LHS, WideTy, 0); + const SCEV *RHSB = (this->*Extension)(RHS, WideTy, 0); + const SCEV *B = (this->*Operation)(LHSB, RHSB, SCEV::FlagAnyWrap, 0); return A == B; } @@ -3106,12 +3119,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, // TODO: There are some cases where this transformation is not // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of // this transformation should be narrowed down. - if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) - return getAddExpr(getMulExpr(LHSC, Add->getOperand(0), - SCEV::FlagAnyWrap, Depth + 1), - getMulExpr(LHSC, Add->getOperand(1), - SCEV::FlagAnyWrap, Depth + 1), - SCEV::FlagAnyWrap, Depth + 1); + if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) { + const SCEV *LHS = getMulExpr(LHSC, Add->getOperand(0), + SCEV::FlagAnyWrap, Depth + 1); + const SCEV *RHS = getMulExpr(LHSC, Add->getOperand(1), + SCEV::FlagAnyWrap, Depth + 1); + return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1); + } if (Ops[0]->isAllOnesValue()) { // If we have a mul by -1 of an add, try distributing the -1 among the @@ -3466,12 +3480,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } // Fold if both operands are constant. - if (const SCEVConstant *LHSC = dyn_cast(LHS)) { - Constant *LHSCV = LHSC->getValue(); - Constant *RHSCV = RHSC->getValue(); - return getConstant(cast(ConstantExpr::getUDiv(LHSCV, - RHSCV))); - } + if (const SCEVConstant *LHSC = dyn_cast(LHS)) + return getConstant(LHSC->getAPInt().udiv(RHSC->getAPInt())); } } @@ -4002,6 +4012,59 @@ public: } // namespace +/// Return true if V is poison given that AssumedPoison is already poison. +static bool impliesPoison(const SCEV *AssumedPoison, const SCEV *S) { + // The only way poison may be introduced in a SCEV expression is from a + // poison SCEVUnknown (ConstantExprs are also represented as SCEVUnknown, + // not SCEVConstant). Notably, nowrap flags in SCEV nodes can *not* + // introduce poison -- they encode guaranteed, non-speculated knowledge. + // + // Additionally, all SCEV nodes propagate poison from inputs to outputs, + // with the notable exception of umin_seq, where only poison from the first + // operand is (unconditionally) propagated. 
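
The SCEVPoisonCollector in the next hunk implements this comment in two passes: collect every source of possible poison in AssumedPoison (looking through umin_seq), collect everything that must propagate poison into S (not looking through umin_seq), and require the first set to be contained in the second. A standalone sketch of that containment check over a toy expression tree, omitting the umin_seq special case (Expr and collectLeaves are hypothetical):

  #include <algorithm>
  #include <iostream>
  #include <set>
  #include <vector>

  // Toy expression: a node is either a named leaf or an interior node.
  struct Expr {
    const char *Leaf = nullptr;          // non-null for leaves
    std::vector<const Expr *> Operands;  // for interior nodes
  };

  // Gather the leaves that could introduce poison into E.
  static void collectLeaves(const Expr *E, std::set<const char *> &Out) {
    if (E->Leaf) {
      Out.insert(E->Leaf);
      return;
    }
    for (const Expr *Op : E->Operands)
      collectLeaves(Op, Out);
  }

  // "If AssumedPoison is poison, is S necessarily poison?" holds when every
  // possibly-poison source of AssumedPoison also feeds S.
  static bool impliesPoison(const Expr *AssumedPoison, const Expr *S) {
    std::set<const char *> A, B;
    collectLeaves(AssumedPoison, A);
    collectLeaves(S, B);
    return std::includes(B.begin(), B.end(), A.begin(), A.end());
  }

  int main() {
    Expr X{"x"}, Y{"y"};
    Expr XY{nullptr, {&X, &Y}}; // some expression over x and y
    std::cout << impliesPoison(&X, &XY) << '\n'; // 1: x poison -> x+y poison
    std::cout << impliesPoison(&Y, &X) << '\n';  // 0: y does not feed x
  }
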
+ struct SCEVPoisonCollector { + bool LookThroughSeq; + SmallPtrSet MaybePoison; + SCEVPoisonCollector(bool LookThroughSeq) : LookThroughSeq(LookThroughSeq) {} + + bool follow(const SCEV *S) { + // TODO: We can always follow the first operand, but the SCEVTraversal + // API doesn't support this. + if (!LookThroughSeq && isa(S)) + return false; + + if (auto *SU = dyn_cast(S)) { + if (!isGuaranteedNotToBePoison(SU->getValue())) + MaybePoison.insert(S); + } + return true; + } + bool isDone() const { return false; } + }; + + // First collect all SCEVs that might result in AssumedPoison to be poison. + // We need to look through umin_seq here, because we want to find all SCEVs + // that *might* result in poison, not only those that are *required* to. + SCEVPoisonCollector PC1(/* LookThroughSeq */ true); + visitAll(AssumedPoison, PC1); + + // AssumedPoison is never poison. As the assumption is false, the implication + // is true. Don't bother walking the other SCEV in this case. + if (PC1.MaybePoison.empty()) + return true; + + // Collect all SCEVs in S that, if poison, *will* result in S being poison + // as well. We cannot look through umin_seq here, as its argument only *may* + // make the result poison. + SCEVPoisonCollector PC2(/* LookThroughSeq */ false); + visitAll(S, PC2); + + // Make sure that no matter which SCEV in PC1.MaybePoison is actually poison, + // it will also make S poison by being part of PC2.MaybePoison. + return all_of(PC1.MaybePoison, + [&](const SCEV *S) { return PC2.MaybePoison.contains(S); }); +} + const SCEV * ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Ops) { @@ -4010,11 +4073,6 @@ ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!"); if (Ops.size() == 1) return Ops[0]; - if (Ops.size() == 2 && - any_of(Ops, [](const SCEV *Op) { return isa(Op); })) - return getMinMaxExpr( - SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(Kind), - Ops); #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) { @@ -4063,6 +4121,39 @@ ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, return getSequentialMinMaxExpr(Kind, Ops); } + const SCEV *SaturationPoint; + ICmpInst::Predicate Pred; + switch (Kind) { + case scSequentialUMinExpr: + SaturationPoint = getZero(Ops[0]->getType()); + Pred = ICmpInst::ICMP_ULE; + break; + default: + llvm_unreachable("Not a sequential min/max type."); + } + + for (unsigned i = 1, e = Ops.size(); i != e; ++i) { + // We can replace %x umin_seq %y with %x umin %y if either: + // * %y being poison implies %x is also poison. + // * %x cannot be the saturating value (e.g. zero for umin). + if (::impliesPoison(Ops[i], Ops[i - 1]) || + isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_NE, Ops[i - 1], + SaturationPoint)) { + SmallVector SeqOps = {Ops[i - 1], Ops[i]}; + Ops[i - 1] = getMinMaxExpr( + SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(Kind), + SeqOps); + Ops.erase(Ops.begin() + i); + return getSequentialMinMaxExpr(Kind, Ops); + } + // Fold %x umin_seq %y to %x if %x ule %y. + // TODO: We might be able to prove the predicate for a later operand. + if (isKnownViaNonRecursiveReasoning(Pred, Ops[i - 1], Ops[i])) { + Ops.erase(Ops.begin() + i); + return getSequentialMinMaxExpr(Kind, Ops); + } + } + // Okay, it looks like we really DO need an expr. Check to see if we // already have one, otherwise create a new one. 
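
The fold above replaces %x umin_seq %y with a plain umin when %y being poison would already make %x poison, or when %x is provably non-zero, zero being the saturation point at which umin_seq stops evaluating %y. A brute-force check of the second condition over small unsigned values, modeling poison as an empty std::optional:

  #include <algorithm>
  #include <iostream>
  #include <optional>

  using Val = std::optional<unsigned>; // nullopt models poison

  static Val uminSeq(Val X, Val Y) {
    if (!X)
      return std::nullopt;      // poison first operand always propagates
    if (*X == 0)
      return 0u;                // saturates without looking at Y
    if (!Y)
      return std::nullopt;      // otherwise Y's poison propagates too
    return std::min(*X, *Y);
  }

  static Val umin(Val X, Val Y) {
    if (!X || !Y)
      return std::nullopt;      // poison from either side propagates
    return std::min(*X, *Y);
  }

  int main() {
    // The two operations differ only when X == 0 and Y is poison...
    std::cout << uminSeq(0u, std::nullopt).has_value() << ' '
              << umin(0u, std::nullopt).has_value() << '\n'; // 1 0

    // ...so for provably non-zero X they agree on every Y, poison included.
    bool Agree = true;
    for (unsigned X = 1; X < 8; ++X) {
      for (unsigned Y = 0; Y < 8; ++Y)
        Agree &= uminSeq(X, Y) == umin(X, Y);
      Agree &= uminSeq(X, std::nullopt) == umin(X, std::nullopt);
    }
    std::cout << Agree << '\n'; // 1
  }

The first condition, poison implication, is exactly what impliesPoison in the hunk above decides.
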
FoldingSetNodeID ID; @@ -4265,39 +4356,20 @@ bool ScalarEvolution::containsAddRecurrence(const SCEV *S) { return FoundAddRec; } -/// Try to split a SCEVAddExpr into a pair of {SCEV, ConstantInt}. -/// If \p S is a SCEVAddExpr and is composed of a sub SCEV S' and an -/// offset I, then return {S', I}, else return {\p S, nullptr}. -static std::pair splitAddExpr(const SCEV *S) { - const auto *Add = dyn_cast(S); - if (!Add) - return {S, nullptr}; - - if (Add->getNumOperands() != 2) - return {S, nullptr}; - - auto *ConstOp = dyn_cast(Add->getOperand(0)); - if (!ConstOp) - return {S, nullptr}; - - return {Add->getOperand(1), ConstOp->getValue()}; -} - /// Return the ValueOffsetPair set for \p S. \p S can be represented /// by the value and offset from any ValueOffsetPair in the set. -ScalarEvolution::ValueOffsetPairSetVector * -ScalarEvolution::getSCEVValues(const SCEV *S) { +ArrayRef ScalarEvolution::getSCEVValues(const SCEV *S) { ExprValueMapType::iterator SI = ExprValueMap.find_as(S); if (SI == ExprValueMap.end()) - return nullptr; + return None; #ifndef NDEBUG if (VerifySCEVMap) { // Check there is no dangling Value in the set returned. - for (const auto &VE : SI->second) - assert(ValueExprMap.count(VE.first)); + for (Value *V : SI->second) + assert(ValueExprMap.count(V)); } #endif - return &SI->second; + return SI->second.getArrayRef(); } /// Erase Value from ValueExprMap and ExprValueMap. ValueExprMap.erase(V) @@ -4306,20 +4378,11 @@ ScalarEvolution::getSCEVValues(const SCEV *S) { void ScalarEvolution::eraseValueFromMap(Value *V) { ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { - const SCEV *S = I->second; - // Remove {V, 0} from the set of ExprValueMap[S] - if (auto *SV = getSCEVValues(S)) - SV->remove({V, nullptr}); - - // Remove {V, Offset} from the set of ExprValueMap[Stripped] - const SCEV *Stripped; - ConstantInt *Offset; - std::tie(Stripped, Offset) = splitAddExpr(S); - if (Offset != nullptr) { - if (auto *SV = getSCEVValues(Stripped)) - SV->remove({V, Offset}); - } - ValueExprMap.erase(V); + auto EVIt = ExprValueMap.find(I->second); + bool Removed = EVIt->second.remove(V); + (void) Removed; + assert(Removed && "Value not in ExprValueMap?"); + ValueExprMap.erase(I); } } @@ -4330,7 +4393,7 @@ void ScalarEvolution::insertValueToMap(Value *V, const SCEV *S) { auto It = ValueExprMap.find_as(V); if (It == ValueExprMap.end()) { ValueExprMap.insert({SCEVCallbackVH(V, this), S}); - ExprValueMap[S].insert({V, nullptr}); + ExprValueMap[S].insert(V); } } @@ -4339,33 +4402,9 @@ void ScalarEvolution::insertValueToMap(Value *V, const SCEV *S) { const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); - const SCEV *S = getExistingSCEV(V); - if (S == nullptr) { - S = createSCEV(V); - // During PHI resolution, it is possible to create two SCEVs for the same - // V, so it is needed to double check whether V->S is inserted into - // ValueExprMap before insert S->{V, 0} into ExprValueMap. - std::pair Pair = - ValueExprMap.insert({SCEVCallbackVH(V, this), S}); - if (Pair.second) { - ExprValueMap[S].insert({V, nullptr}); - - // If S == Stripped + Offset, add Stripped -> {V, Offset} into - // ExprValueMap. - const SCEV *Stripped = S; - ConstantInt *Offset = nullptr; - std::tie(Stripped, Offset) = splitAddExpr(S); - // If stripped is SCEVUnknown, don't bother to save - // Stripped -> {V, offset}. It doesn't simplify and sometimes even - // increase the complexity of the expansion code. 
- // If V is GetElementPtrInst, don't save Stripped -> {V, offset} - // because it may generate add/sub instead of GEP in SCEV expansion. - if (Offset != nullptr && !isa(Stripped) && - !isa(V)) - ExprValueMap[Stripped].insert({V, Offset}); - } - } - return S; + if (const SCEV *S = getExistingSCEV(V)) + return S; + return createSCEVIter(V); } const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { @@ -4795,7 +4834,7 @@ public: SelectInst *SI = cast(I); Optional Res = compareWithBackedgeCondition(SI->getCondition()); - if (Res.hasValue()) { + if (Res) { bool IsOne = cast(Res.getValue())->getValue()->isOne(); Result = SE.getSCEV(IsOne ? SI->getTrueValue() : SI->getFalseValue()); } @@ -4803,7 +4842,7 @@ public: } default: { Optional Res = compareWithBackedgeCondition(I); - if (Res.hasValue()) + if (Res) Result = Res.getValue(); break; } @@ -5067,6 +5106,9 @@ static Optional MatchBinaryOp(Value *V, DominatorTree &DT) { // Instcombine turns add of signmask into xor as a strength reduction step. if (RHSC->getValue().isSignMask()) return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1)); + // Binary `xor` is a bit-wise `add`. + if (V->getType()->isIntegerTy(1)) + return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1)); return BinaryOp(Op); case Instruction::LShr: @@ -5489,8 +5531,8 @@ bool PredicatedScalarEvolution::areAddRecsEqualWithPreds( return true; auto areExprsEqual = [&](const SCEV *Expr1, const SCEV *Expr2) -> bool { - if (Expr1 != Expr2 && !Preds.implies(SE.getEqualPredicate(Expr1, Expr2)) && - !Preds.implies(SE.getEqualPredicate(Expr2, Expr1))) + if (Expr1 != Expr2 && !Preds->implies(SE.getEqualPredicate(Expr1, Expr2)) && + !Preds->implies(SE.getEqualPredicate(Expr2, Expr1))) return false; return true; }; @@ -5872,31 +5914,53 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (const SCEV *S = createNodeFromSelectLikePHI(PN)) return S; - // If the PHI has a single incoming value, follow that value, unless the - // PHI's incoming blocks are in a different loop, in which case doing so - // risks breaking LCSSA form. Instcombine would normally zap these, but - // it doesn't have DominatorTree information, so it may miss cases. - if (Value *V = SimplifyInstruction(PN, {getDataLayout(), &TLI, &DT, &AC})) - if (LI.replacementPreservesLCSSAForm(PN, V)) - return getSCEV(V); + if (Value *V = simplifyInstruction(PN, {getDataLayout(), &TLI, &DT, &AC})) + return getSCEV(V); // If it's not a loop phi, we can't handle it yet. return getUnknown(PN); } -const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, - Value *Cond, - Value *TrueVal, - Value *FalseVal) { - // Handle "constant" branch or select. This can occur for instance when a - // loop pass transforms an inner loop and moves on to process the outer loop. - if (auto *CI = dyn_cast(Cond)) - return getSCEV(CI->isOne() ? TrueVal : FalseVal); +bool SCEVMinMaxExprContains(const SCEV *Root, const SCEV *OperandToFind, + SCEVTypes RootKind) { + struct FindClosure { + const SCEV *OperandToFind; + const SCEVTypes RootKind; // Must be a sequential min/max expression. + const SCEVTypes NonSequentialRootKind; // Non-seq variant of RootKind. + + bool Found = false; + + bool canRecurseInto(SCEVTypes Kind) const { + // We can only recurse into the SCEV expression of the same effective type + // as the type of our root SCEV expression, and into zero-extensions. 
+ return RootKind == Kind || NonSequentialRootKind == Kind || + scZeroExtend == Kind; + }; + + FindClosure(const SCEV *OperandToFind, SCEVTypes RootKind) + : OperandToFind(OperandToFind), RootKind(RootKind), + NonSequentialRootKind( + SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType( + RootKind)) {} + bool follow(const SCEV *S) { + Found = S == OperandToFind; + + return !isDone() && canRecurseInto(S->getSCEVType()); + } + + bool isDone() const { return Found; } + }; + + FindClosure FC(OperandToFind, RootKind); + visitAll(Root, FC); + return FC.Found; +} + +const SCEV *ScalarEvolution::createNodeForSelectOrPHIInstWithICmpInstCond( + Instruction *I, ICmpInst *Cond, Value *TrueVal, Value *FalseVal) { // Try to match some simple smax or umax patterns. - auto *ICI = dyn_cast(Cond); - if (!ICI) - return getUnknown(I); + auto *ICI = Cond; Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); @@ -5958,31 +6022,36 @@ const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, } break; case ICmpInst::ICMP_NE: - // n != 0 ? n+x : 1+x -> umax(n, 1)+x - if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && - isa(RHS) && cast(RHS)->isZero()) { - const SCEV *One = getOne(I->getType()); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); - const SCEV *LA = getSCEV(TrueVal); - const SCEV *RA = getSCEV(FalseVal); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, One); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); - } - break; + // x != 0 ? x+y : C+y -> x == 0 ? C+y : x+y + std::swap(TrueVal, FalseVal); + LLVM_FALLTHROUGH; case ICmpInst::ICMP_EQ: - // n == 0 ? 1+x : n+x -> umax(n, 1)+x + // x == 0 ? C+y : x+y -> umax(x, C)+y iff C u<= 1 if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && isa(RHS) && cast(RHS)->isZero()) { - const SCEV *One = getOne(I->getType()); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); - const SCEV *LA = getSCEV(TrueVal); - const SCEV *RA = getSCEV(FalseVal); - const SCEV *LDiff = getMinusSCEV(LA, One); - const SCEV *RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); + const SCEV *X = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *TrueValExpr = getSCEV(TrueVal); // C+y + const SCEV *FalseValExpr = getSCEV(FalseVal); // x+y + const SCEV *Y = getMinusSCEV(FalseValExpr, X); // y = (x+y)-x + const SCEV *C = getMinusSCEV(TrueValExpr, Y); // C = (C+y)-y + if (isa(C) && cast(C)->getAPInt().ule(1)) + return getAddExpr(getUMaxExpr(X, C), Y); + } + // x == 0 ? 0 : umin (..., x, ...) -> umin_seq(x, umin (...)) + // x == 0 ? 0 : umin_seq(..., x, ...) -> umin_seq(x, umin_seq(...)) + // x == 0 ? 0 : umin (..., umin_seq(..., x, ...), ...) 
+ // -> umin_seq(x, umin (..., umin_seq(...), ...)) + if (isa(RHS) && cast(RHS)->isZero() && + isa(TrueVal) && cast(TrueVal)->isZero()) { + const SCEV *X = getSCEV(LHS); + while (auto *ZExt = dyn_cast(X)) + X = ZExt->getOperand(); + if (getTypeSizeInBits(X->getType()) <= getTypeSizeInBits(I->getType())) { + const SCEV *FalseValExpr = getSCEV(FalseVal); + if (SCEVMinMaxExprContains(FalseValExpr, X, scSequentialUMinExpr)) + return getUMinExpr(getNoopOrZeroExtend(X, I->getType()), FalseValExpr, + /*Sequential=*/true); + } } break; default: @@ -5992,12 +6061,95 @@ const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, return getUnknown(I); } +static Optional +createNodeForSelectViaUMinSeq(ScalarEvolution *SE, const SCEV *CondExpr, + const SCEV *TrueExpr, const SCEV *FalseExpr) { + assert(CondExpr->getType()->isIntegerTy(1) && + TrueExpr->getType() == FalseExpr->getType() && + TrueExpr->getType()->isIntegerTy(1) && + "Unexpected operands of a select."); + + // i1 cond ? i1 x : i1 C --> C + (i1 cond ? (i1 x - i1 C) : i1 0) + // --> C + (umin_seq cond, x - C) + // + // i1 cond ? i1 C : i1 x --> C + (i1 cond ? i1 0 : (i1 x - i1 C)) + // --> C + (i1 ~cond ? (i1 x - i1 C) : i1 0) + // --> C + (umin_seq ~cond, x - C) + + // FIXME: while we can't legally model the case where both of the hands + // are fully variable, we only require that the *difference* is constant. + if (!isa(TrueExpr) && !isa(FalseExpr)) + return None; + + const SCEV *X, *C; + if (isa(TrueExpr)) { + CondExpr = SE->getNotSCEV(CondExpr); + X = FalseExpr; + C = TrueExpr; + } else { + X = TrueExpr; + C = FalseExpr; + } + return SE->getAddExpr(C, SE->getUMinExpr(CondExpr, SE->getMinusSCEV(X, C), + /*Sequential=*/true)); +} + +static Optional createNodeForSelectViaUMinSeq(ScalarEvolution *SE, + Value *Cond, + Value *TrueVal, + Value *FalseVal) { + if (!isa(TrueVal) && !isa(FalseVal)) + return None; + + const auto *SECond = SE->getSCEV(Cond); + const auto *SETrue = SE->getSCEV(TrueVal); + const auto *SEFalse = SE->getSCEV(FalseVal); + return createNodeForSelectViaUMinSeq(SE, SECond, SETrue, SEFalse); +} + +const SCEV *ScalarEvolution::createNodeForSelectOrPHIViaUMinSeq( + Value *V, Value *Cond, Value *TrueVal, Value *FalseVal) { + assert(Cond->getType()->isIntegerTy(1) && "Select condition is not an i1?"); + assert(TrueVal->getType() == FalseVal->getType() && + V->getType() == TrueVal->getType() && + "Types of select hands and of the result must match."); + + // For now, only deal with i1-typed `select`s. + if (!V->getType()->isIntegerTy(1)) + return getUnknown(V); + + if (Optional S = + createNodeForSelectViaUMinSeq(this, Cond, TrueVal, FalseVal)) + return *S; + + return getUnknown(V); +} + +const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Value *V, Value *Cond, + Value *TrueVal, + Value *FalseVal) { + // Handle "constant" branch or select. This can occur for instance when a + // loop pass transforms an inner loop and moves on to process the outer loop. + if (auto *CI = dyn_cast(Cond)) + return getSCEV(CI->isOne() ? TrueVal : FalseVal); + + if (auto *I = dyn_cast(V)) { + if (auto *ICI = dyn_cast(Cond)) { + const SCEV *S = createNodeForSelectOrPHIInstWithICmpInstCond( + I, ICI, TrueVal, FalseVal); + if (!isa(S)) + return S; + } + } + + return createNodeForSelectOrPHIViaUMinSeq(V, Cond, TrueVal, FalseVal); +} + /// Expand GEP instructions into add and multiply operations. This allows them /// to be analyzed by regular SCEV code. 
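
createNodeForSelectViaUMinSeq above encodes an i1 select arithmetically: i1 cond ? x : C becomes C + umin_seq(cond, x - C), where i1 addition and subtraction are both XOR and umin_seq keeps x's poison from leaking when cond is false. The identity itself, checked exhaustively over well-defined i1 inputs:

  #include <iostream>

  int main() {
    bool OK = true;
    // i1 arithmetic is mod 2: '+' and '-' are both XOR, umin is AND, and
    // umin_seq(c, z) evaluates to 0 when c == 0 without consulting z.
    for (unsigned Cond = 0; Cond <= 1; ++Cond)
      for (unsigned X = 0; X <= 1; ++X)
        for (unsigned C = 0; C <= 1; ++C) {
          unsigned Select = Cond ? X : C;
          unsigned Diff = X ^ C;                            // x - C (mod 2)
          unsigned UMinSeq = Cond == 0 ? 0 : (Cond & Diff); // umin_seq(cond, diff)
          unsigned Rewritten = C ^ UMinSeq;                 // C + umin_seq (mod 2)
          OK &= (Select == Rewritten);
        }
    std::cout << (OK ? "identity holds\n" : "mismatch\n"); // identity holds
  }
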
const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { - // Don't attempt to analyze GEPs over unsized objects. - if (!GEP->getSourceElementType()->isSized()) - return getUnknown(GEP); + assert(GEP->getSourceElementType()->isSized() && + "GEP source element type must be sized"); SmallVector IndexExprs; for (Value *Index : GEP->indices()) @@ -6430,7 +6582,7 @@ ScalarEvolution::getRangeRef(const SCEV *S, // Check if the IR explicitly contains !range metadata. Optional MDRange = GetRangeFromMetadata(U->getValue()); - if (MDRange.hasValue()) + if (MDRange) ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue(), RangeType); @@ -6719,7 +6871,7 @@ ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start, FalseValue = *FalseVal; // Re-apply the cast we peeled off earlier - if (CastOp.hasValue()) + if (CastOp) switch (*CastOp) { default: llvm_unreachable("Unknown SCEV cast type!"); @@ -7020,6 +7172,211 @@ bool ScalarEvolution::loopIsFiniteByAssumption(const Loop *L) { return isFinite(L) || (isMustProgress(L) && loopHasNoSideEffects(L)); } +const SCEV *ScalarEvolution::createSCEVIter(Value *V) { + // Worklist item with a Value and a bool indicating whether all operands have + // been visited already. + using PointerTy = PointerIntPair; + SmallVector Stack; + + Stack.emplace_back(V, true); + Stack.emplace_back(V, false); + while (!Stack.empty()) { + auto E = Stack.pop_back_val(); + Value *CurV = E.getPointer(); + + if (getExistingSCEV(CurV)) + continue; + + SmallVector Ops; + const SCEV *CreatedSCEV = nullptr; + // If all operands have been visited already, create the SCEV. + if (E.getInt()) { + CreatedSCEV = createSCEV(CurV); + } else { + // Otherwise get the operands we need to create SCEV's for before creating + // the SCEV for CurV. If the SCEV for CurV can be constructed trivially, + // just use it. + CreatedSCEV = getOperandsToCreate(CurV, Ops); + } + + if (CreatedSCEV) { + insertValueToMap(CurV, CreatedSCEV); + } else { + // Queue CurV for SCEV creation, followed by its's operands which need to + // be constructed first. + Stack.emplace_back(CurV, true); + for (Value *Op : Ops) + Stack.emplace_back(Op, false); + } + } + + return getExistingSCEV(V); +} + +const SCEV * +ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl &Ops) { + if (!isSCEVable(V->getType())) + return getUnknown(V); + + if (Instruction *I = dyn_cast(V)) { + // Don't attempt to analyze instructions in blocks that aren't + // reachable. Such instructions don't matter, and they aren't required + // to obey basic rules for definitions dominating uses which this + // analysis depends on. + if (!DT.isReachableFromEntry(I->getParent())) + return getUnknown(PoisonValue::get(V->getType())); + } else if (ConstantInt *CI = dyn_cast(V)) + return getConstant(CI); + else if (GlobalAlias *GA = dyn_cast(V)) { + if (!GA->isInterposable()) { + Ops.push_back(GA->getAliasee()); + return nullptr; + } + return getUnknown(V); + } else if (!isa(V)) + return getUnknown(V); + + Operator *U = cast(V); + if (auto BO = MatchBinaryOp(U, DT)) { + bool IsConstArg = isa(BO->RHS); + switch (U->getOpcode()) { + case Instruction::Add: { + // For additions and multiplications, traverse add/mul chains for which we + // can potentially create a single SCEV, to reduce the number of + // get{Add,Mul}Expr calls. 
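
createSCEVIter above trades recursion for an explicit stack of (Value, all-operands-visited) pairs: a value is pushed once to discover its operands and once more to be built after they are available, with a map acting as the memoization cache. The same two-phase worklist in a standalone sketch that evaluates an expression DAG iteratively (Node and evaluate are hypothetical):

  #include <iostream>
  #include <map>
  #include <utility>
  #include <vector>

  struct Node {
    int Leaf = 0;                 // used when Operands is empty
    std::vector<Node *> Operands; // interior nodes sum their operands
  };

  static int evaluate(Node *Root) {
    std::map<Node *, int> Cache;  // plays the role of ValueExprMap
    // Pair of (node, all operands already visited?).
    std::vector<std::pair<Node *, bool>> Stack;
    Stack.emplace_back(Root, false);
    while (!Stack.empty()) {
      auto [N, OperandsDone] = Stack.back();
      Stack.pop_back();
      if (Cache.count(N))
        continue;
      if (!OperandsDone) {
        // Revisit N after its operands have been computed.
        Stack.emplace_back(N, true);
        for (Node *Op : N->Operands)
          Stack.emplace_back(Op, false);
        continue;
      }
      int V = N->Leaf;
      for (Node *Op : N->Operands)
        V += Cache[Op]; // guaranteed present: operands were visited first
      Cache[N] = V;
    }
    return Cache[Root];
  }

  int main() {
    Node A{2}, B{3};
    Node Sum{0, {&A, &B}};
    Node Root{0, {&Sum, &A}}; // shared subexpression, computed once
    std::cout << evaluate(&Root) << '\n'; // (2+3) + 2 = 7
  }

Bounding stack depth this way is the usual cure for stack overflow on pathologically deep expression chains.
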
+ do { + if (BO->Op) { + if (BO->Op != V && getExistingSCEV(BO->Op)) { + Ops.push_back(BO->Op); + break; + } + } + Ops.push_back(BO->RHS); + auto NewBO = MatchBinaryOp(BO->LHS, DT); + if (!NewBO || (NewBO->Opcode != Instruction::Add && + NewBO->Opcode != Instruction::Sub)) { + Ops.push_back(BO->LHS); + break; + } + BO = NewBO; + } while (true); + return nullptr; + } + + case Instruction::Mul: { + do { + if (BO->Op) { + if (BO->Op != V && getExistingSCEV(BO->Op)) { + Ops.push_back(BO->Op); + break; + } + } + Ops.push_back(BO->RHS); + auto NewBO = MatchBinaryOp(BO->LHS, DT); + if (!NewBO || NewBO->Opcode != Instruction::Mul) { + Ops.push_back(BO->LHS); + break; + } + BO = NewBO; + } while (true); + return nullptr; + } + + case Instruction::AShr: + case Instruction::Shl: + case Instruction::Xor: + if (!IsConstArg) + return nullptr; + break; + case Instruction::And: + case Instruction::Or: + if (!IsConstArg && BO->LHS->getType()->isIntegerTy(1)) + return nullptr; + break; + default: + break; + } + + Ops.push_back(BO->LHS); + Ops.push_back(BO->RHS); + return nullptr; + } + + switch (U->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::PtrToInt: + Ops.push_back(U->getOperand(0)); + return nullptr; + + case Instruction::BitCast: + if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType())) { + Ops.push_back(U->getOperand(0)); + return nullptr; + } + return getUnknown(V); + + case Instruction::SDiv: + case Instruction::SRem: + Ops.push_back(U->getOperand(0)); + Ops.push_back(U->getOperand(1)); + return nullptr; + + case Instruction::GetElementPtr: + assert(cast(U)->getSourceElementType()->isSized() && + "GEP source element type must be sized"); + for (Value *Index : U->operands()) + Ops.push_back(Index); + return nullptr; + + case Instruction::IntToPtr: + return getUnknown(V); + + case Instruction::PHI: + // Keep constructing SCEVs' for phis recursively for now. + return nullptr; + + case Instruction::Select: + for (Value *Inc : U->operands()) + Ops.push_back(Inc); + return nullptr; + break; + + case Instruction::Call: + case Instruction::Invoke: + if (Value *RV = cast(U)->getReturnedArgOperand()) { + Ops.push_back(RV); + return nullptr; + } + + if (auto *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + case Intrinsic::abs: + Ops.push_back(II->getArgOperand(0)); + return nullptr; + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::usub_sat: + case Intrinsic::uadd_sat: + Ops.push_back(II->getArgOperand(0)); + Ops.push_back(II->getArgOperand(1)); + return nullptr; + case Intrinsic::start_loop_iterations: + Ops.push_back(II->getArgOperand(0)); + return nullptr; + default: + break; + } + } + break; + } + + return nullptr; +} + const SCEV *ScalarEvolution::createSCEV(Value *V) { if (!isSCEVable(V->getType())) return getUnknown(V); @@ -7030,7 +7387,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // to obey basic rules for definitions dominating uses which this // analysis depends on. 
if (!DT.isReachableFromEntry(I->getParent())) - return getUnknown(UndefValue::get(V->getType())); + return getUnknown(PoisonValue::get(V->getType())); } else if (ConstantInt *CI = dyn_cast(V)) return getConstant(CI); else if (GlobalAlias *GA = dyn_cast(V)) @@ -7038,6 +7395,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { else if (!isa(V)) return getUnknown(V); + const SCEV *LHS; + const SCEV *RHS; + Operator *U = cast(V); if (auto BO = MatchBinaryOp(U, DT)) { switch (BO->Opcode) { @@ -7103,8 +7463,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op); if (Flags != SCEV::FlagAnyWrap) { - MulOps.push_back( - getMulExpr(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags)); + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + MulOps.push_back(getMulExpr(LHS, RHS, Flags)); break; } } @@ -7121,14 +7482,20 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { return getMulExpr(MulOps); } case Instruction::UDiv: - return getUDivExpr(getSCEV(BO->LHS), getSCEV(BO->RHS)); + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + return getUDivExpr(LHS, RHS); case Instruction::URem: - return getURemExpr(getSCEV(BO->LHS), getSCEV(BO->RHS)); + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + return getURemExpr(LHS, RHS); case Instruction::Sub: { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BO->Op) Flags = getNoWrapFlagsFromUB(BO->Op); - return getMinusSCEV(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags); + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + return getMinusSCEV(LHS, RHS, Flags); } case Instruction::And: // For an expression like x&255 that merely masks off the high bits, @@ -7180,6 +7547,12 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { MulCount); } } + // Binary `and` is a bit-wise `umin`. + if (BO->LHS->getType()->isIntegerTy(1)) { + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + return getUMinExpr(LHS, RHS); + } break; case Instruction::Or: @@ -7199,6 +7572,12 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW)); } } + // Binary `or` is a bit-wise `umax`. + if (BO->LHS->getType()->isIntegerTy(1)) { + LHS = getSCEV(BO->LHS); + RHS = getSCEV(BO->RHS); + return getUMaxExpr(LHS, RHS); + } break; case Instruction::Xor: @@ -7266,9 +7645,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { Flags = (SCEV::NoWrapFlags)(Flags | SCEV::FlagNUW); } - Constant *X = ConstantInt::get( + ConstantInt *X = ConstantInt::get( getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); - return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags); + return getMulExpr(getSCEV(BO->LHS), getConstant(X), Flags); } break; @@ -7394,14 +7773,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { return createNodeForPHI(cast(U)); case Instruction::Select: - // U can also be a select constant expr, which let fall through. Since - // createNodeForSelect only works for a condition that is an `ICmpInst`, and - // constant expressions cannot have instructions as operands, we'd have - // returned getUnknown for a select constant expressions anyway. 
- if (isa(U)) - return createNodeForSelectOrPHI(cast(U), U->getOperand(0), - U->getOperand(1), U->getOperand(2)); - break; + return createNodeForSelectOrPHI(U, U->getOperand(0), U->getOperand(1), + U->getOperand(2)); case Instruction::Call: case Instruction::Invoke: @@ -7415,17 +7788,21 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { getSCEV(II->getArgOperand(0)), /*IsNSW=*/cast(II->getArgOperand(1))->isOne()); case Intrinsic::umax: - return getUMaxExpr(getSCEV(II->getArgOperand(0)), - getSCEV(II->getArgOperand(1))); + LHS = getSCEV(II->getArgOperand(0)); + RHS = getSCEV(II->getArgOperand(1)); + return getUMaxExpr(LHS, RHS); case Intrinsic::umin: - return getUMinExpr(getSCEV(II->getArgOperand(0)), - getSCEV(II->getArgOperand(1))); + LHS = getSCEV(II->getArgOperand(0)); + RHS = getSCEV(II->getArgOperand(1)); + return getUMinExpr(LHS, RHS); case Intrinsic::smax: - return getSMaxExpr(getSCEV(II->getArgOperand(0)), - getSCEV(II->getArgOperand(1))); + LHS = getSCEV(II->getArgOperand(0)); + RHS = getSCEV(II->getArgOperand(1)); + return getSMaxExpr(LHS, RHS); case Intrinsic::smin: - return getSMinExpr(getSCEV(II->getArgOperand(0)), - getSCEV(II->getArgOperand(1))); + LHS = getSCEV(II->getArgOperand(0)); + RHS = getSCEV(II->getArgOperand(1)); + return getSMinExpr(LHS, RHS); case Intrinsic::usub_sat: { const SCEV *X = getSCEV(II->getArgOperand(0)); const SCEV *Y = getSCEV(II->getArgOperand(1)); @@ -7640,7 +8017,7 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { Res = Multiple; Res = (unsigned)GreatestCommonDivisor64(*Res, Multiple); } - return Res.getValueOr(1); + return Res.value_or(1); } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, @@ -7708,7 +8085,7 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L, const SCEV * ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L, - SCEVUnionPredicate &Preds) { + SmallVector &Preds) { return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds); } @@ -7870,7 +8247,6 @@ void ScalarEvolution::forgetLoop(const Loop *L) { if (LoopUsersItr != LoopUsers.end()) { ToForget.insert(ToForget.end(), LoopUsersItr->second.begin(), LoopUsersItr->second.end()); - LoopUsers.erase(LoopUsersItr); } // Drop information about expressions based on loop-header PHIs. @@ -7900,9 +8276,7 @@ void ScalarEvolution::forgetLoop(const Loop *L) { } void ScalarEvolution::forgetTopmostLoop(const Loop *L) { - while (Loop *Parent = L->getParentLoop()) - L = Parent; - forgetLoop(L); + forgetLoop(L->getOutermostLoop()); } void ScalarEvolution::forgetValue(Value *V) { @@ -7944,7 +8318,7 @@ void ScalarEvolution::forgetLoopDispositions(const Loop *L) { /// the relevant loop exiting block using getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE, - SCEVUnionPredicate *Preds) const { + SmallVector *Preds) const { // If any exits were not computable, the loop is not computable. 
if (!isComplete() || ExitNotTaken.empty()) return SE->getCouldNotCompute(); @@ -7966,14 +8340,18 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE, Ops.push_back(BECount); - if (Preds && !ENT.hasAlwaysTruePredicate()) - Preds->add(ENT.Predicate.get()); + if (Preds) + for (auto *P : ENT.Predicates) + Preds->push_back(P); assert((Preds || ENT.hasAlwaysTruePredicate()) && "Predicate should be always true!"); } - return SE->getUMinFromMismatchedTypes(Ops); + // If an earlier exit exits on the first iteration (exit count zero), then + // a later poison exit count should not propagate into the result. These are + // exactly the semantics provided by umin_seq. + return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true); } /// Get the exact not taken count for this loop exit. @@ -8082,16 +8460,8 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( [&](const EdgeExitInfo &EEI) { BasicBlock *ExitBB = EEI.first; const ExitLimit &EL = EEI.second; - if (EL.Predicates.empty()) return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, EL.MaxNotTaken, - nullptr); - - std::unique_ptr<SCEVUnionPredicate> Predicate(new SCEVUnionPredicate); - for (auto *Pred : EL.Predicates) - Predicate->add(Pred); - return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, EL.MaxNotTaken, - std::move(Predicate)); + EL.Predicates); }); assert((isa<SCEVCouldNotCompute>(ConstantMax) || isa<SCEVConstant>(ConstantMax)) && @@ -8385,11 +8755,6 @@ ScalarEvolution::computeExitLimitFromCondFromBinOp( BECount = getUMinFromMismatchedTypes( EL0.ExactNotTaken, EL1.ExactNotTaken, /*Sequential=*/!isa<BinaryOperator>(ExitCond)); - - // If EL0.ExactNotTaken was zero and ExitCond was a short-circuit form, - // it should have been simplified to zero (see the condition (3) above) - assert(!isa<BinaryOperator>(ExitCond) || !EL0.ExactNotTaken->isZero() || - BECount->isZero()); } if (EL0.MaxNotTaken == getCouldNotCompute()) MaxBECount = EL1.MaxNotTaken; @@ -8470,7 +8835,8 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L, ControlsExit && loopHasNoAbnormalExits(L) && loopIsFiniteByAssumption(L); // Simplify the operands before analyzing them. (void)SimplifyICmpOperands(Pred, LHS, RHS, /*Depth=*/0, - ControllingFiniteLoop); + (EnableFiniteLoopControl ? ControllingFiniteLoop + : false)); // If we have a comparison of a chrec against a constant, try to use value // ranges to answer this query. @@ -8683,7 +9049,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( // and the kind of shift should match the kind of shift we peeled // off, if any.
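The Sequential form requested in getExact above is what makes the early-exit case sound: a zero earlier in the operand list must hide a poison count from a later exit. A minimal standalone sketch of that umin_seq property, modelling a poison exit count as std::nullopt (Count and uminSeq are illustrative names, not LLVM API):

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

using Count = std::optional<uint64_t>; // std::nullopt models a poison count

Count uminSeq(const std::vector<Count> &Ops) {
  uint64_t Min = UINT64_MAX;
  for (const Count &C : Ops) {
    if (Min == 0)
      return 0; // an earlier zero shields every later operand, even poison
    if (!C)
      return std::nullopt; // poison seen before a zero poisons the result
    Min = std::min(Min, *C);
  }
  return Min;
}

// uminSeq({0, std::nullopt}) == 0, whereas a plain (non-sequential) umin over
// the same operands would be poison.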
- (!PostShiftOpCode.hasValue() || *PostShiftOpCode == OpCodeOut); + (!PostShiftOpCode || *PostShiftOpCode == OpCodeOut); }; PHINode *PN; @@ -8871,13 +9237,6 @@ static Constant *EvaluateExpression(Value *V, const Loop *L, Operands[i] = C; } - if (CmpInst *CI = dyn_cast(I)) - return ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], - Operands[1], DL, TLI); - if (LoadInst *LI = dyn_cast(I)) { - if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL); - } return ConstantFoldInstOperands(I, Operands, DL, TLI); } @@ -9121,58 +9480,42 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) { } case scAddExpr: { const SCEVAddExpr *SA = cast(V); - if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) { - if (PointerType *PTy = dyn_cast(C->getType())) { - unsigned AS = PTy->getAddressSpace(); - Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS); - C = ConstantExpr::getBitCast(C, DestPtrTy); + Constant *C = nullptr; + for (const SCEV *Op : SA->operands()) { + Constant *OpC = BuildConstantFromSCEV(Op); + if (!OpC) + return nullptr; + if (!C) { + C = OpC; + continue; } - for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) { - Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i)); - if (!C2) - return nullptr; - - // First pointer! - if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) { - unsigned AS = C2->getType()->getPointerAddressSpace(); - std::swap(C, C2); - Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS); - // The offsets have been converted to bytes. We can add bytes to an - // i8* by GEP with the byte count in the first index. - C = ConstantExpr::getBitCast(C, DestPtrTy); - } - - // Don't bother trying to sum two pointers. We probably can't - // statically compute a load that results from it anyway. - if (C2->getType()->isPointerTy()) - return nullptr; - - if (C->getType()->isPointerTy()) { - C = ConstantExpr::getGetElementPtr(Type::getInt8Ty(C->getContext()), - C, C2); - } else { - C = ConstantExpr::getAdd(C, C2); - } + assert(!C->getType()->isPointerTy() && + "Can only have one pointer, and it must be last"); + if (auto *PT = dyn_cast(OpC->getType())) { + // The offsets have been converted to bytes. We can add bytes to an + // i8* by GEP with the byte count in the first index. + Type *DestPtrTy = + Type::getInt8PtrTy(PT->getContext(), PT->getAddressSpace()); + OpC = ConstantExpr::getBitCast(OpC, DestPtrTy); + C = ConstantExpr::getGetElementPtr(Type::getInt8Ty(C->getContext()), + OpC, C); + } else { + C = ConstantExpr::getAdd(C, OpC); } - return C; } - return nullptr; + return C; } case scMulExpr: { const SCEVMulExpr *SM = cast(V); - if (Constant *C = BuildConstantFromSCEV(SM->getOperand(0))) { - // Don't bother with pointers at all. - if (C->getType()->isPointerTy()) + Constant *C = nullptr; + for (const SCEV *Op : SM->operands()) { + assert(!Op->getType()->isPointerTy() && "Can't multiply pointers"); + Constant *OpC = BuildConstantFromSCEV(Op); + if (!OpC) return nullptr; - for (unsigned i = 1, e = SM->getNumOperands(); i != e; ++i) { - Constant *C2 = BuildConstantFromSCEV(SM->getOperand(i)); - if (!C2 || C2->getType()->isPointerTy()) - return nullptr; - C = ConstantExpr::getMul(C, C2); - } - return C; + C = C ? 
ConstantExpr::getMul(C, OpC) : OpC; } - return nullptr; + return C; } case scUDivExpr: { const SCEVUDivExpr *SU = cast(V); @@ -9297,15 +9640,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (MadeImprovement) { Constant *C = nullptr; const DataLayout &DL = getDataLayout(); - if (const CmpInst *CI = dyn_cast(I)) - C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], - Operands[1], DL, &TLI); - else if (const LoadInst *Load = dyn_cast(I)) { - if (!Load->isVolatile()) - C = ConstantFoldLoadFromConstPtr(Operands[0], Load->getType(), - DL); - } else - C = ConstantFoldInstOperands(I, Operands, DL, &TLI); + C = ConstantFoldInstOperands(I, Operands, DL, &TLI); if (!C) return V; return getSCEV(C); } @@ -9535,15 +9870,15 @@ GetQuadraticEquation(const SCEVAddRecExpr *AddRec) { /// (b) if neither X nor Y exist, return None, /// (c) if exactly one of X and Y exists, return that value. static Optional MinOptional(Optional X, Optional Y) { - if (X.hasValue() && Y.hasValue()) { + if (X && Y) { unsigned W = std::max(X->getBitWidth(), Y->getBitWidth()); - APInt XW = X->sextOrSelf(W); - APInt YW = Y->sextOrSelf(W); + APInt XW = X->sext(W); + APInt YW = Y->sext(W); return XW.slt(YW) ? *X : *Y; } - if (!X.hasValue() && !Y.hasValue()) + if (!X && !Y) return None; - return X.hasValue() ? *X : *Y; + return X ? *X : *Y; } /// Helper function to truncate an optional APInt to a given BitWidth. @@ -9558,7 +9893,7 @@ static Optional MinOptional(Optional X, Optional Y) { /// equation are BW+1 bits wide (to avoid truncation when converting from /// the addrec to the equation). static Optional TruncIfPossible(Optional X, unsigned BitWidth) { - if (!X.hasValue()) + if (!X) return None; unsigned W = X->getBitWidth(); if (BitWidth > 1 && BitWidth < W && X->isIntN(BitWidth)) @@ -9585,13 +9920,13 @@ SolveQuadraticAddRecExact(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { APInt A, B, C, M; unsigned BitWidth; auto T = GetQuadraticEquation(AddRec); - if (!T.hasValue()) + if (!T) return None; std::tie(A, B, C, M, BitWidth) = *T; LLVM_DEBUG(dbgs() << __func__ << ": solving for unsigned overflow\n"); Optional X = APIntOps::SolveQuadraticEquationWrap(A, B, C, BitWidth+1); - if (!X.hasValue()) + if (!X) return None; ConstantInt *CX = ConstantInt::get(SE.getContext(), *X); @@ -9627,7 +9962,7 @@ SolveQuadraticAddRecRange(const SCEVAddRecExpr *AddRec, APInt A, B, C, M; unsigned BitWidth; auto T = GetQuadraticEquation(AddRec); - if (!T.hasValue()) + if (!T) return None; // Be careful about the return value: there can be two reasons for not @@ -9672,7 +10007,7 @@ SolveQuadraticAddRecRange(const SCEVAddRecExpr *AddRec, // If SolveQuadraticEquationWrap returns None, it means that there can // be a solution, but the function failed to find it. We cannot treat it // as "no solution". - if (!SO.hasValue() || !UO.hasValue()) + if (!SO || !UO) return { None, false }; // Check the smaller value first to see if it leaves the range. @@ -9690,8 +10025,8 @@ SolveQuadraticAddRecRange(const SCEVAddRecExpr *AddRec, std::tie(A, B, C, M, BitWidth) = *T; // Lower bound is inclusive, subtract 1 to represent the exiting value. 
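Much of the surrounding churn is a mechanical migration of llvm::Optional call sites (hasValue, getValueOr, sextOrSelf) to the std::optional-style surface (operator bool, value_or, sext). A standalone sketch of the target idioms, mirroring MinOptional's structure with std::optional<int> for brevity (minOrDefault is a hypothetical name, not part of the patch):

#include <optional>

int minOrDefault(std::optional<int> X, std::optional<int> Y, int Default) {
  if (X && Y)       // was: X.hasValue() && Y.hasValue()
    return *X < *Y ? *X : *Y;
  if (!X && !Y)     // was: !X.hasValue() && !Y.hasValue()
    return Default; // the None case; X.value_or(Default) likewise replaces
                    // X.getValueOr(Default) at single-optional call sites
  return X ? *X : *Y; // exactly MinOptional's fallback case
}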
- APInt Lower = Range.getLower().sextOrSelf(A.getBitWidth()) - 1; - APInt Upper = Range.getUpper().sextOrSelf(A.getBitWidth()); + APInt Lower = Range.getLower().sext(A.getBitWidth()) - 1; + APInt Upper = Range.getUpper().sext(A.getBitWidth()); auto SL = SolveForBoundary(Lower); auto SU = SolveForBoundary(Upper); // If any of the solutions was unknown, no meaningful conclusions can @@ -9776,7 +10111,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, // value at this index. When solving for "X*X != 5", for example, we // should not accept a root of 2. if (auto S = SolveQuadraticAddRecExact(AddRec, *this)) { - const auto *R = cast<SCEVConstant>(getConstant(S.getValue())); + const auto *R = cast<SCEVConstant>(getConstant(*S)); return ExitLimit(R, R, false, Predicates); } return getCouldNotCompute(); @@ -10296,7 +10631,7 @@ ScalarEvolution::getMonotonicPredicateType(const SCEVAddRecExpr *LHS, auto ResultSwapped = getMonotonicPredicateTypeImpl(LHS, ICmpInst::getSwappedPredicate(Pred)); - assert(ResultSwapped.hasValue() && "should be able to analyze both!"); + assert(ResultSwapped && "should be able to analyze both!"); assert(ResultSwapped.getValue() != Result.getValue() && "monotonicity should flip as we flip the predicate"); } @@ -10479,17 +10814,27 @@ bool ScalarEvolution::isKnownPredicateViaConstantRanges( return false; if (Pred == CmpInst::ICMP_NE) { - if (CheckRanges(getSignedRange(LHS), getSignedRange(RHS)) || - CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS))) + auto SL = getSignedRange(LHS); + auto SR = getSignedRange(RHS); + if (CheckRanges(SL, SR)) + return true; + auto UL = getUnsignedRange(LHS); + auto UR = getUnsignedRange(RHS); + if (CheckRanges(UL, UR)) return true; auto *Diff = getMinusSCEV(LHS, RHS); return !isa<SCEVCouldNotCompute>(Diff) && isKnownNonZero(Diff); } - if (CmpInst::isSigned(Pred)) - return CheckRanges(getSignedRange(LHS), getSignedRange(RHS)); + if (CmpInst::isSigned(Pred)) { + auto SL = getSignedRange(LHS); + auto SR = getSignedRange(RHS); + return CheckRanges(SL, SR); + } - return CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS)); + auto UL = getUnsignedRange(LHS); + auto UR = getUnsignedRange(RHS); + return CheckRanges(UL, UR); } bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, @@ -12596,7 +12941,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range, if (isQuadratic()) { if (auto S = SolveQuadraticAddRecRange(this, Range, SE)) - return SE.getConstant(S.getValue()); + return SE.getConstant(*S); } return SE.getCouldNotCompute(); @@ -12636,6 +12981,15 @@ bool ScalarEvolution::containsUndefs(const SCEV *S) const { }); } +// Return true when S contains a value that is a nullptr. +bool ScalarEvolution::containsErasedValue(const SCEV *S) const { + return SCEVExprContains(S, [](const SCEV *S) { + if (const auto *SU = dyn_cast<SCEVUnknown>(S)) + return SU->getValue() == nullptr; + return false; + }); +} + /// Return the size of an element read or written by Inst.
const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { Type *Ty; @@ -12820,12 +13174,13 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; - SCEVUnionPredicate Pred; - auto PBT = SE->getPredicatedBackedgeTakenCount(L, Pred); + SmallVector Preds; + auto PBT = SE->getPredicatedBackedgeTakenCount(L, Preds); if (!isa(PBT)) { OS << "Predicated backedge-taken count is " << *PBT << "\n"; OS << " Predicates:\n"; - Pred.print(OS, 4); + for (auto *P : Preds) + P->print(OS, 4); } else { OS << "Unpredictable predicated backedge-taken count. "; } @@ -13202,12 +13557,10 @@ void ScalarEvolution::forgetMemoizedResultsImpl(const SCEV *S) { auto ExprIt = ExprValueMap.find(S); if (ExprIt != ExprValueMap.end()) { - for (auto &ValueAndOffset : ExprIt->second) { - if (ValueAndOffset.second == nullptr) { - auto ValueIt = ValueExprMap.find_as(ValueAndOffset.first); - if (ValueIt != ValueExprMap.end()) - ValueExprMap.erase(ValueIt); - } + for (Value *V : ExprIt->second) { + auto ValueIt = ValueExprMap.find_as(V); + if (ValueIt != ValueExprMap.end()) + ValueExprMap.erase(ValueIt); } ExprValueMap.erase(ExprIt); } @@ -13258,6 +13611,43 @@ ScalarEvolution::getUsedLoops(const SCEV *S, SCEVTraversal(F).visitAll(S); } +void ScalarEvolution::getReachableBlocks( + SmallPtrSetImpl &Reachable, Function &F) { + SmallVector Worklist; + Worklist.push_back(&F.getEntryBlock()); + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + if (!Reachable.insert(BB).second) + continue; + + Value *Cond; + BasicBlock *TrueBB, *FalseBB; + if (match(BB->getTerminator(), m_Br(m_Value(Cond), m_BasicBlock(TrueBB), + m_BasicBlock(FalseBB)))) { + if (auto *C = dyn_cast(Cond)) { + Worklist.push_back(C->isOne() ? TrueBB : FalseBB); + continue; + } + + if (auto *Cmp = dyn_cast(Cond)) { + const SCEV *L = getSCEV(Cmp->getOperand(0)); + const SCEV *R = getSCEV(Cmp->getOperand(1)); + if (isKnownPredicateViaConstantRanges(Cmp->getPredicate(), L, R)) { + Worklist.push_back(TrueBB); + continue; + } + if (isKnownPredicateViaConstantRanges(Cmp->getInversePredicate(), L, + R)) { + Worklist.push_back(FalseBB); + continue; + } + } + } + + append_range(Worklist, successors(BB)); + } +} + void ScalarEvolution::verify() const { ScalarEvolution &SE = *const_cast(this); ScalarEvolution SE2(F, TLI, AC, DT, LI); @@ -13282,13 +13672,44 @@ void ScalarEvolution::verify() const { }; SCEVMapper SCM(SE2); + SmallPtrSet ReachableBlocks; + SE2.getReachableBlocks(ReachableBlocks, F); + + auto GetDelta = [&](const SCEV *Old, const SCEV *New) -> const SCEV * { + if (containsUndefs(Old) || containsUndefs(New)) { + // SCEV treats "undef" as an unknown but consistent value (i.e. it does + // not propagate undef aggressively). This means we can (and do) fail + // verification in cases where a transform makes a value go from "undef" + // to "undef+1" (say). The transform is fine, since in both cases the + // result is "undef", but SCEV thinks the value increased by 1. + return nullptr; + } + + // Unless VerifySCEVStrict is set, we only compare constant deltas. + const SCEV *Delta = SE2.getMinusSCEV(Old, New); + if (!VerifySCEVStrict && !isa(Delta)) + return nullptr; + + return Delta; + }; while (!LoopStack.empty()) { auto *L = LoopStack.pop_back_val(); llvm::append_range(LoopStack, *L); - auto *CurBECount = SCM.visit( - const_cast(this)->getBackedgeTakenCount(L)); + // Only verify BECounts in reachable loops. For an unreachable loop, + // any BECount is legal. 
+ if (!ReachableBlocks.contains(L->getHeader())) + continue; + + // Only verify cached BECounts. Computing new BECounts may change the + // results of subsequent SCEV uses. + auto It = BackedgeTakenCounts.find(L); + if (It == BackedgeTakenCounts.end()) + continue; + + auto *CurBECount = + SCM.visit(It->second.getExact(L, const_cast(this))); auto *NewBECount = SE2.getBackedgeTakenCount(L); if (CurBECount == SE2.getCouldNotCompute() || @@ -13301,16 +13722,6 @@ void ScalarEvolution::verify() const { continue; } - if (containsUndefs(CurBECount) || containsUndefs(NewBECount)) { - // SCEV treats "undef" as an unknown but consistent value (i.e. it does - // not propagate undef aggressively). This means we can (and do) fail - // verification in cases where a transform makes the trip count of a loop - // go from "undef" to "undef+1" (say). The transform is fine, since in - // both cases the loop iterates "undef" times, but SCEV thinks we - // increased the trip count of the loop by 1 incorrectly. - continue; - } - if (SE.getTypeSizeInBits(CurBECount->getType()) > SE.getTypeSizeInBits(NewBECount->getType())) NewBECount = SE2.getZeroExtendExpr(NewBECount, CurBECount->getType()); @@ -13318,10 +13729,8 @@ void ScalarEvolution::verify() const { SE.getTypeSizeInBits(NewBECount->getType())) CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType()); - const SCEV *Delta = SE2.getMinusSCEV(CurBECount, NewBECount); - - // Unless VerifySCEVStrict is set, we only compare constant deltas. - if ((VerifySCEVStrict || isa(Delta)) && !Delta->isZero()) { + const SCEV *Delta = GetDelta(CurBECount, NewBECount); + if (Delta && !Delta->isZero()) { dbgs() << "Trip Count for " << *L << " Changed!\n"; dbgs() << "Old: " << *CurBECount << "\n"; dbgs() << "New: " << *NewBECount << "\n"; @@ -13335,10 +13744,8 @@ void ScalarEvolution::verify() const { SmallVector Worklist(LI.begin(), LI.end()); while (!Worklist.empty()) { Loop *L = Worklist.pop_back_val(); - if (ValidLoops.contains(L)) - continue; - ValidLoops.insert(L); - Worklist.append(L->begin(), L->end()); + if (ValidLoops.insert(L).second) + Worklist.append(L->begin(), L->end()); } for (auto &KV : ValueExprMap) { #ifndef NDEBUG @@ -13351,27 +13758,38 @@ void ScalarEvolution::verify() const { // Check that the value is also part of the reverse map. 
auto It = ExprValueMap.find(KV.second); - if (It == ExprValueMap.end() || !It->second.contains({KV.first, nullptr})) { + if (It == ExprValueMap.end() || !It->second.contains(KV.first)) { dbgs() << "Value " << *KV.first << " is in ValueExprMap but not in ExprValueMap\n"; std::abort(); } - } - for (const auto &KV : ExprValueMap) { - for (const auto &ValueAndOffset : KV.second) { - if (ValueAndOffset.second != nullptr) + if (auto *I = dyn_cast(&*KV.first)) { + if (!ReachableBlocks.contains(I->getParent())) continue; + const SCEV *OldSCEV = SCM.visit(KV.second); + const SCEV *NewSCEV = SE2.getSCEV(I); + const SCEV *Delta = GetDelta(OldSCEV, NewSCEV); + if (Delta && !Delta->isZero()) { + dbgs() << "SCEV for value " << *I << " changed!\n" + << "Old: " << *OldSCEV << "\n" + << "New: " << *NewSCEV << "\n" + << "Delta: " << *Delta << "\n"; + std::abort(); + } + } + } - auto It = ValueExprMap.find_as(ValueAndOffset.first); + for (const auto &KV : ExprValueMap) { + for (Value *V : KV.second) { + auto It = ValueExprMap.find_as(V); if (It == ValueExprMap.end()) { - dbgs() << "Value " << *ValueAndOffset.first + dbgs() << "Value " << *V << " is in ExprValueMap but not in ValueExprMap\n"; std::abort(); } if (It->second != KV.first) { - dbgs() << "Value " << *ValueAndOffset.first - << " mapped to " << *It->second + dbgs() << "Value " << *V << " mapped to " << *It->second << " rather than " << *KV.first << "\n"; std::abort(); } @@ -13537,18 +13955,25 @@ void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS, const SCEV *RHS) { + return getComparePredicate(ICmpInst::ICMP_EQ, LHS, RHS); +} + +const SCEVPredicate * +ScalarEvolution::getComparePredicate(const ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) { FoldingSetNodeID ID; assert(LHS->getType() == RHS->getType() && "Type mismatch between LHS and RHS"); // Unique this node based on the arguments - ID.AddInteger(SCEVPredicate::P_Equal); + ID.AddInteger(SCEVPredicate::P_Compare); + ID.AddInteger(Pred); ID.AddPointer(LHS); ID.AddPointer(RHS); void *IP = nullptr; if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) return S; - SCEVEqualPredicate *Eq = new (SCEVAllocator) - SCEVEqualPredicate(ID.Intern(SCEVAllocator), LHS, RHS); + SCEVComparePredicate *Eq = new (SCEVAllocator) + SCEVComparePredicate(ID.Intern(SCEVAllocator), Pred, LHS, RHS); UniquePreds.InsertNode(Eq, IP); return Eq; } @@ -13585,18 +14010,24 @@ public: /// \p NewPreds such that the result will be an AddRecExpr. 
static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE, SmallPtrSetImpl<const SCEVPredicate *> *NewPreds, - SCEVUnionPredicate *Pred) { + const SCEVPredicate *Pred) { SCEVPredicateRewriter Rewriter(L, SE, NewPreds, Pred); return Rewriter.visit(S); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { if (Pred) { - auto ExprPreds = Pred->getPredicatesForExpr(Expr); - for (auto *Pred : ExprPreds) - if (const auto *IPred = dyn_cast<SCEVEqualPredicate>(Pred)) - if (IPred->getLHS() == Expr) - return IPred->getRHS(); + if (auto *U = dyn_cast<SCEVUnionPredicate>(Pred)) { + for (auto *Pred : U->getPredicates()) + if (const auto *IPred = dyn_cast<SCEVComparePredicate>(Pred)) + if (IPred->getLHS() == Expr && + IPred->getPredicate() == ICmpInst::ICMP_EQ) + return IPred->getRHS(); + } else if (const auto *IPred = dyn_cast<SCEVComparePredicate>(Pred)) { + if (IPred->getLHS() == Expr && + IPred->getPredicate() == ICmpInst::ICMP_EQ) + return IPred->getRHS(); + } } return convertToAddRecWithPreds(Expr); } @@ -13636,7 +14067,7 @@ public: private: explicit SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE, SmallPtrSetImpl<const SCEVPredicate *> *NewPreds, - SCEVUnionPredicate *Pred) + const SCEVPredicate *Pred) : SCEVRewriteVisitor(SE), NewPreds(NewPreds), Pred(Pred), L(L) {} bool addOverflowAssumption(const SCEVPredicate *P) { @@ -13670,8 +14101,7 @@ private: for (auto *P : PredicatedRewrite->second) { // Wrap predicates from outer loops are not supported. if (auto *WP = dyn_cast<SCEVWrapPredicate>(P)) { - auto *AR = cast<SCEVAddRecExpr>(WP->getExpr()); - if (L != AR->getLoop()) + if (L != WP->getExpr()->getLoop()) return Expr; } if (!addOverflowAssumption(P)) @@ -13681,14 +14111,15 @@ } SmallPtrSetImpl<const SCEVPredicate *> *NewPreds; - SCEVUnionPredicate *Pred; + const SCEVPredicate *Pred; const Loop *L; }; } // end anonymous namespace -const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L, - SCEVUnionPredicate &Preds) { +const SCEV * +ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L, + const SCEVPredicate &Preds) { return SCEVPredicateRewriter::rewrite(S, L, *this, nullptr, &Preds); } @@ -13715,28 +14146,36 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, SCEVPredicateKind Kind) : FastID(ID), Kind(Kind) {} -SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, - const SCEV *LHS, const SCEV *RHS) - : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) { +SCEVComparePredicate::SCEVComparePredicate(const FoldingSetNodeIDRef ID, + const ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) + : SCEVPredicate(ID, P_Compare), Pred(Pred), LHS(LHS), RHS(RHS) { assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match"); assert(LHS != RHS && "LHS and RHS are the same SCEV"); } -bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const { - const auto *Op = dyn_cast<SCEVEqualPredicate>(N); +bool SCEVComparePredicate::implies(const SCEVPredicate *N) const { + const auto *Op = dyn_cast<SCEVComparePredicate>(N); if (!Op) return false; + if (Pred != ICmpInst::ICMP_EQ) + return false; + return Op->LHS == LHS && Op->RHS == RHS; } -bool SCEVEqualPredicate::isAlwaysTrue() const { return false; } +bool SCEVComparePredicate::isAlwaysTrue() const { return false; } -const SCEV *SCEVEqualPredicate::getExpr() const { return LHS; } +void SCEVComparePredicate::print(raw_ostream &OS, unsigned Depth) const { + if (Pred == ICmpInst::ICMP_EQ) + OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n"; + else + OS.indent(Depth) << "Compare predicate: " << *LHS + << " " << CmpInst::getPredicateName(Pred) << " " + << *RHS << "\n"; -void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const { -
OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n"; } SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID, @@ -13744,7 +14183,7 @@ SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID, IncrementWrapFlags Flags) : SCEVPredicate(ID, P_Wrap), AR(AR), Flags(Flags) {} -const SCEV *SCEVWrapPredicate::getExpr() const { return AR; } +const SCEVAddRecExpr *SCEVWrapPredicate::getExpr() const { return AR; } bool SCEVWrapPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); @@ -13793,38 +14232,26 @@ SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR, } /// Union predicates don't get cached so create a dummy set ID for it. -SCEVUnionPredicate::SCEVUnionPredicate() - : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {} +SCEVUnionPredicate::SCEVUnionPredicate(ArrayRef Preds) + : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) { + for (auto *P : Preds) + add(P); +} bool SCEVUnionPredicate::isAlwaysTrue() const { return all_of(Preds, [](const SCEVPredicate *I) { return I->isAlwaysTrue(); }); } -ArrayRef -SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) { - auto I = SCEVToPreds.find(Expr); - if (I == SCEVToPreds.end()) - return ArrayRef(); - return I->second; -} - bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { if (const auto *Set = dyn_cast(N)) return all_of(Set->Preds, [this](const SCEVPredicate *I) { return this->implies(I); }); - auto ScevPredsIt = SCEVToPreds.find(N->getExpr()); - if (ScevPredsIt == SCEVToPreds.end()) - return false; - auto &SCEVPreds = ScevPredsIt->second; - - return any_of(SCEVPreds, + return any_of(Preds, [N](const SCEVPredicate *I) { return I->implies(N); }); } -const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; } - void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const { for (auto Pred : Preds) Pred->print(OS, Depth); @@ -13837,20 +14264,15 @@ void SCEVUnionPredicate::add(const SCEVPredicate *N) { return; } - if (implies(N)) - return; - - const SCEV *Key = N->getExpr(); - assert(Key && "Only SCEVUnionPredicate doesn't have an " - " associated expression!"); - - SCEVToPreds[Key].push_back(N); Preds.push_back(N); } PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE, Loop &L) - : SE(SE), L(L) {} + : SE(SE), L(L) { + SmallVector Empty; + Preds = std::make_unique(Empty); +} void ScalarEvolution::registerUser(const SCEV *User, ArrayRef Ops) { @@ -13875,7 +14297,7 @@ const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) { if (Entry.second) Expr = Entry.second; - const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, Preds); + const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, *Preds); Entry = {Generation, NewSCEV}; return NewSCEV; @@ -13883,22 +14305,27 @@ const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) { const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { if (!BackedgeCount) { - SCEVUnionPredicate BackedgePred; - BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, BackedgePred); - addPredicate(BackedgePred); + SmallVector Preds; + BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, Preds); + for (auto *P : Preds) + addPredicate(*P); } return BackedgeCount; } void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { - if (Preds.implies(&Pred)) + if (Preds->implies(&Pred)) return; - Preds.add(&Pred); + + auto &OldPreds = Preds->getPredicates(); + SmallVector NewPreds(OldPreds.begin(), OldPreds.end()); + NewPreds.push_back(&Pred); + Preds = 
std::make_unique(NewPreds); updateGeneration(); } -const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const { - return Preds; +const SCEVPredicate &PredicatedScalarEvolution::getPredicate() const { + return *Preds; } void PredicatedScalarEvolution::updateGeneration() { @@ -13906,7 +14333,7 @@ void PredicatedScalarEvolution::updateGeneration() { if (++Generation == 0) { for (auto &II : RewriteMap) { const SCEV *Rewritten = II.second.second; - II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, Preds)}; + II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, *Preds)}; } } } @@ -13952,17 +14379,17 @@ const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) { return nullptr; for (auto *P : NewPreds) - Preds.add(P); + addPredicate(*P); - updateGeneration(); RewriteMap[SE.getSCEV(V)] = {Generation, New}; return New; } PredicatedScalarEvolution::PredicatedScalarEvolution( const PredicatedScalarEvolution &Init) - : RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), Preds(Init.Preds), - Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) { + : RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), + Preds(std::make_unique(Init.Preds->getPredicates())), + Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) { for (auto I : Init.FlagsMap) FlagsMap.insert(I); } @@ -14243,12 +14670,23 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { ExprsToRewrite.push_back(LHS); } }; - // First, collect conditions from dominating branches. Starting at the loop + + SmallVector> Terms; + // First, collect information from assumptions dominating the loop. + for (auto &AssumeVH : AC.assumptions()) { + if (!AssumeVH) + continue; + auto *AssumeI = cast(AssumeVH); + if (!DT.dominates(AssumeI, L->getHeader())) + continue; + Terms.emplace_back(AssumeI->getOperand(0), true); + } + + // Second, collect conditions from dominating branches. Starting at the loop // predecessor, climb up the predecessor chain, as long as there are // predecessors that can be found that have unique successors leading to the // original header. // TODO: share this logic with isLoopEntryGuardedByCond. - SmallVector> Terms; for (std::pair Pair( L->getLoopPredecessor(), L->getHeader()); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { @@ -14280,8 +14718,9 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { if (auto *Cmp = dyn_cast(Cond)) { auto Predicate = EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate(); - CollectCondition(Predicate, getSCEV(Cmp->getOperand(0)), - getSCEV(Cmp->getOperand(1)), RewriteMap); + const auto *LHS = getSCEV(Cmp->getOperand(0)); + const auto *RHS = getSCEV(Cmp->getOperand(1)); + CollectCondition(Predicate, LHS, RHS, RewriteMap); continue; } @@ -14294,18 +14733,6 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { } } - // Also collect information from assumptions dominating the loop. 
- for (auto &AssumeVH : AC.assumptions()) { - if (!AssumeVH) - continue; - auto *AssumeI = cast(AssumeVH); - auto *Cmp = dyn_cast(AssumeI->getOperand(0)); - if (!Cmp || !DT.dominates(AssumeI, L->getHeader())) - continue; - CollectCondition(Cmp->getPredicate(), getSCEV(Cmp->getOperand(0)), - getSCEV(Cmp->getOperand(1)), RewriteMap); - } - if (RewriteMap.empty()) return Expr; diff --git a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index f4fa159d1ec7..3d47dc6b30df 100644 --- a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp index 64e908bdf342..0619569bf816 100644 --- a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp +++ b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp @@ -15,9 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/IR/Constants.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp b/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp index 209ae66ca53e..22dff5efec5c 100644 --- a/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -13,6 +13,7 @@ #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" using namespace llvm; diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index e847bf8f0f6b..f510991b4463 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -36,7 +36,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index 9056cc01484d..52e8566aca3c 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -19,17 +19,12 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" #include -#include #include using namespace llvm; @@ -75,7 +70,7 @@ static const AllocaInst *findMatchingAlloca(const IntrinsicInst &II, auto AllocaSizeInBits = AI->getAllocationSizeInBits(DL); if (!AllocaSizeInBits) return nullptr; - int64_t AllocaSize = AllocaSizeInBits.getValue() / 8; + int64_t AllocaSize = *AllocaSizeInBits / 8; auto *Size = dyn_cast(II.getArgOperand(0)); if (!Size) diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 54f3605ee033..94b646ab7c06 100644 --- 
a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DerivedTypes.h" @@ -384,9 +383,9 @@ bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, const SCEV *Max = SE.getMinusSCEV(ToDiffTy(SE.getConstant(Size.getUpper())), ToDiffTy(AccessSize)); return SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SGE, Diff, Min, I) - .getValueOr(false) && + .value_or(false) && SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SLE, Diff, Max, I) - .getValueOr(false); + .value_or(false); } /// The function analyzes all local uses of Ptr (alloca or argument) and diff --git a/llvm/lib/Analysis/StratifiedSets.h b/llvm/lib/Analysis/StratifiedSets.h index 60ea2451b0ef..883ebd24efdc 100644 --- a/llvm/lib/Analysis/StratifiedSets.h +++ b/llvm/lib/Analysis/StratifiedSets.h @@ -340,10 +340,10 @@ public: return StratifiedSets(std::move(Values), std::move(StratLinks)); } - bool has(const T &Elem) const { return get(Elem).hasValue(); } + bool has(const T &Elem) const { return get(Elem).has_value(); } bool add(const T &Main) { - if (get(Main).hasValue()) + if (get(Main)) return false; auto NewIndex = getNewUnlinkedIndex(); @@ -560,7 +560,7 @@ private: Optional indexOf(const T &Val) { auto MaybeVal = get(Val); - if (!MaybeVal.hasValue()) + if (!MaybeVal) return None; auto *Info = *MaybeVal; auto &Link = linksAt(Info->Index); diff --git a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp index ff833b55bbce..3446e50a4344 100644 --- a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp @@ -116,18 +116,16 @@ // around from the latch. 
// //===----------------------------------------------------------------------===// + #include "llvm/Analysis/SyncDependenceAnalysis.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include -#include -#include #define DEBUG_TYPE "sync-dependence" @@ -257,7 +255,7 @@ SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT, [&](const BasicBlock &BB) { LoopPO.appendBlock(BB); }); } -SyncDependenceAnalysis::~SyncDependenceAnalysis() {} +SyncDependenceAnalysis::~SyncDependenceAnalysis() = default; // divergence propagator for reducible CFGs struct DivergencePropagator { diff --git a/llvm/lib/Analysis/SyntheticCountsUtils.cpp b/llvm/lib/Analysis/SyntheticCountsUtils.cpp index a3edce76cd88..29c41fda5e28 100644 --- a/llvm/lib/Analysis/SyntheticCountsUtils.cpp +++ b/llvm/lib/Analysis/SyntheticCountsUtils.cpp @@ -14,9 +14,6 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/ModuleSummaryIndex.h" using namespace llvm; @@ -57,7 +54,7 @@ void SyntheticCountsUtils::propagateFromSCC( if (!OptProfCount) continue; auto Callee = CGT::edge_dest(E.second); - AdditionalCounts[Callee] += OptProfCount.getValue(); + AdditionalCounts[Callee] += *OptProfCount; } // Update the counts for the nodes in the SCC. @@ -70,7 +67,7 @@ void SyntheticCountsUtils::propagateFromSCC( if (!OptProfCount) continue; auto Callee = CGT::edge_dest(E.second); - AddCount(Callee, OptProfCount.getValue()); + AddCount(Callee, *OptProfCount); } } diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp index 26bc63983b4e..203858c1cf06 100644 --- a/llvm/lib/Analysis/TFUtils.cpp +++ b/llvm/lib/Analysis/TFUtils.cpp @@ -82,6 +82,33 @@ void serialize(const Message &SE, std::string *OutStr) { *OutStr = SE.SerializeAsString(); } } + +int getTFTypeIndex(TensorType TType) { + switch (TType) { + case TensorType::Double: + return TF_DOUBLE; + case TensorType::Float: + return TF_FLOAT; + case TensorType::Int8: + return TF_INT8; + case TensorType::UInt8: + return TF_UINT8; + case TensorType::Int16: + return TF_INT16; + case TensorType::UInt16: + return TF_UINT16; + case TensorType::Int32: + return TF_INT32; + case TensorType::UInt32: + return TF_UINT32; + case TensorType::Int64: + return TF_INT64; + case TensorType::UInt64: + return TF_UINT64; + case TensorType::Invalid: + llvm_unreachable("Unknown tensor type"); + } +} } // namespace namespace llvm { @@ -105,116 +132,6 @@ private: std::vector Output; }; -size_t TensorSpec::getElementByteSize() const { - return TF_DataTypeSize(static_cast(TypeIndex)); -} - -TensorSpec::TensorSpec(const std::string &Name, int Port, int TypeIndex, - const std::vector &Shape) - : Name(Name), Port(Port), TypeIndex(TypeIndex), Shape(Shape), - ElementCount(std::accumulate(Shape.begin(), Shape.end(), 1, - std::multiplies())) {} - -Optional getTensorSpecFromJSON(LLVMContext &Ctx, - const json::Value &Value) { - auto EmitError = [&](const llvm::Twine &Message) -> Optional { - std::string S; - llvm::raw_string_ostream OS(S); - OS << Value; - Ctx.emitError("Unable to parse JSON Value as spec (" + Message + "): " + S); - return None; - }; - // FIXME: accept a Path as a parameter, and use it for error 
reporting. - json::Path::Root Root("tensor_spec"); - json::ObjectMapper Mapper(Value, Root); - if (!Mapper) - return EmitError("Value is not a dict"); - - std::string TensorName; - int TensorPort = -1; - std::string TensorType; - std::vector TensorShape; - - if (!Mapper.map("name", TensorName)) - return EmitError("'name' property not present or not a string"); - if (!Mapper.map("type", TensorType)) - return EmitError("'type' property not present or not a string"); - if (!Mapper.map("port", TensorPort)) - return EmitError("'port' property not present or not an int"); - if (!Mapper.map>("shape", TensorShape)) - return EmitError("'shape' property not present or not an int array"); - -#define PARSE_TYPE(T, E) \ - if (TensorType == #T) \ - return TensorSpec::createSpec(TensorName, TensorShape, TensorPort); - TFUTILS_SUPPORTED_TYPES(PARSE_TYPE) -#undef PARSE_TYPE - return None; -} - -Optional> -loadOutputSpecs(LLVMContext &Ctx, StringRef ExpectedDecisionName, - StringRef ModelPath, StringRef SpecFileOverride) { - SmallVector OutputSpecsPath; - StringRef FileName = SpecFileOverride; - if (FileName.empty()) { - llvm::sys::path::append(OutputSpecsPath, ModelPath, "output_spec.json"); - FileName = {OutputSpecsPath.data(), OutputSpecsPath.size()}; - } - - auto BufferOrError = MemoryBuffer::getFileOrSTDIN(FileName); - if (!BufferOrError) { - Ctx.emitError("Error opening output specs file: " + FileName + " : " + - BufferOrError.getError().message()); - return None; - } - auto ParsedJSONValues = json::parse(BufferOrError.get()->getBuffer()); - if (!ParsedJSONValues) { - Ctx.emitError("Could not parse specs file: " + FileName); - return None; - } - auto ValuesArray = ParsedJSONValues->getAsArray(); - if (!ValuesArray) { - Ctx.emitError("Expected an array of {tensor_spec:, " - "logging_name:} dictionaries"); - return None; - } - std::vector Ret; - for (const auto &Value : *ValuesArray) - if (const auto *Obj = Value.getAsObject()) - if (const auto *SpecPart = Obj->get("tensor_spec")) - if (auto TensorSpec = getTensorSpecFromJSON(Ctx, *SpecPart)) - if (auto LoggingName = Obj->getString("logging_name")) { - if (!TensorSpec->isElementType() && - !TensorSpec->isElementType() && - !TensorSpec->isElementType()) { - Ctx.emitError( - "Only int64, int32, and float tensors are supported. " - "Found unsupported type for tensor named " + - TensorSpec->name()); - return None; - } - Ret.push_back({*TensorSpec, LoggingName->str()}); - } - - if (ValuesArray->size() != Ret.size()) { - Ctx.emitError( - "Unable to parse output spec. It should be a json file containing an " - "array of dictionaries. 
Each dictionary must have a 'tensor_spec' key, " - "with a json object describing a TensorSpec; and a 'logging_name' key, " - "which is a string to use as name when logging this tensor in the " - "training log."); - return None; - } - if (Ret.empty() || *Ret[0].LoggingName != ExpectedDecisionName) { - Ctx.emitError("The first output spec must describe the decision tensor, " - "and must have the logging_name " + - StringRef(ExpectedDecisionName)); - return None; - } - return Ret; -} - class TFModelEvaluatorImpl { public: TFModelEvaluatorImpl(StringRef SavedModelPath, @@ -383,16 +300,29 @@ TFModelEvaluatorImpl::TFModelEvaluatorImpl( errs() << TF_Message(Status.get()); invalidate(); } + size_t NrSupported = 0; for (size_t I = 0; I < InputSpecs.size(); ++I) { auto &InputSpec = InputSpecs[I]; InputFeed[I] = { TF_GraphOperationByName(Graph.get(), (InputSpec.name()).c_str()), InputSpec.port()}; + if (!InputFeed[I].oper) { + continue; + } + if (NrSupported++ != I) { + errs() + << "Unsupported features must be placed at the end of the InputSpecs"; + invalidate(); + return; + } if (!checkReportAndInvalidate(InputFeed[I], InputSpec)) return; - initInput(I, static_cast(InputSpec.typeIndex()), + initInput(I, static_cast(getTFTypeIndex(InputSpec.type())), InputSpec.shape()); } + InputFeed.resize(NrSupported); + Input.resize(NrSupported); + for (size_t I = 0; I < OutputSpecsSize; ++I) { auto OutputSpec = GetOutputSpecs(I); OutputFeed[I] = { @@ -470,7 +400,9 @@ void TFModelEvaluatorImpl::initInput(size_t Index, TF_DataType Type, } void *TFModelEvaluator::getUntypedInput(size_t Index) { - return TF_TensorData(Impl->getInput()[Index]); + if (Index < Impl->getInput().size()) + return TF_TensorData(Impl->getInput()[Index]); + return nullptr; } TFModelEvaluator::EvaluationResult::EvaluationResult( @@ -495,13 +427,6 @@ TFModelEvaluator::EvaluationResult::getUntypedTensorValue(size_t Index) const { return TF_TensorData(Impl->getOutput()[Index]); } -#define TFUTILS_GETDATATYPE_IMPL(T, E) \ - template <> int TensorSpec::getDataType() { return E; } - -TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_IMPL) - -#undef TFUTILS_GETDATATYPE_IMPL - TFModelEvaluator::EvaluationResult::~EvaluationResult() {} TFModelEvaluator::~TFModelEvaluator() {} diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 02923c2c7eb1..8ebdb65e88dc 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -659,12 +659,12 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_stpncpy); } - if (T.isPS4()) { - // PS4 does have memalign. + if (T.isPS()) { + // PS4/PS5 do have memalign. TLI.setAvailable(LibFunc_memalign); - // PS4 does not have new/delete with "unsigned int" size parameter; - // it only has the "unsigned long" versions. + // PS4/PS5 do not have new/delete with "unsigned int" size parameter; + // they only have the "unsigned long" versions. 
TLI.setUnavailable(LibFunc_ZdaPvj); TLI.setUnavailable(LibFunc_ZdaPvjSt11align_val_t); TLI.setUnavailable(LibFunc_ZdlPvj); @@ -1110,9 +1110,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_system: return (NumParams == 1 && FTy.getParamType(0)->isPointerTy()); case LibFunc___kmpc_alloc_shared: + return NumParams == 1 && FTy.getReturnType()->isPointerTy(); case LibFunc_malloc: case LibFunc_vec_malloc: - return (NumParams == 1 && FTy.getReturnType()->isPointerTy()); + return NumParams == 1 && FTy.getParamType(0)->isIntegerTy(SizeTBits) && + FTy.getReturnType()->isPointerTy(); case LibFunc_memcmp: return NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) && FTy.getParamType(0)->isPointerTy() && diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 25e9dee98e13..66f61961d01b 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -11,7 +11,6 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/TargetTransformInfoImpl.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -21,7 +20,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorHandling.h" #include using namespace llvm; @@ -33,6 +31,11 @@ static cl::opt EnableReduxCost("costmodel-reduxcost", cl::init(false), cl::Hidden, cl::desc("Recognize reduction patterns.")); +static cl::opt CacheLineSize( + "cache-line-size", cl::init(0), cl::Hidden, + cl::desc("Use this to override the target cache line size when " + "specified by the user.")); + namespace { /// No-op implementation of the TTI interface using the utility base /// classes. 
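The new -cache-line-size flag added above must not clobber the target's value when it is merely defaulted, hence the getNumOccurrences() check in getCacheLineSize() in the next hunk. A standalone sketch of that pattern (Option is a hypothetical stand-in for cl::opt<unsigned>; it is not part of the imported patch):

struct Option {
  unsigned Value = 0;
  unsigned NumOccurrences = 0; // bumped once per command-line appearance
  unsigned getNumOccurrences() const { return NumOccurrences; }
  operator unsigned() const { return Value; }
};

static Option CacheLineSizeOpt;

// Mirrors TargetTransformInfo::getCacheLineSize after this change: an
// explicit user setting wins (even an explicit 0); otherwise defer to the
// target's own value.
unsigned getCacheLineSize(unsigned TargetDefault) {
  return CacheLineSizeOpt.getNumOccurrences() > 0 ? CacheLineSizeOpt
                                                  : TargetDefault;
}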
@@ -179,7 +182,7 @@ bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE, TargetTransformInfo::TargetTransformInfo(const DataLayout &DL) : TTIImpl(new Model(NoTTIImpl(DL))) {} -TargetTransformInfo::~TargetTransformInfo() {} +TargetTransformInfo::~TargetTransformInfo() = default; TargetTransformInfo::TargetTransformInfo(TargetTransformInfo &&Arg) : TTIImpl(std::move(Arg.TTIImpl)) {} @@ -350,7 +353,8 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, Scale, AddrSpace, I); } -bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const { +bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1, + const LSRCost &C2) const { return TTIImpl->isLSRCostLess(C1, C2); } @@ -398,11 +402,22 @@ bool TargetTransformInfo::isLegalNTLoad(Type *DataType, Align Alignment) const { return TTIImpl->isLegalNTLoad(DataType, Alignment); } +bool TargetTransformInfo::isLegalBroadcastLoad(Type *ElementTy, + ElementCount NumElements) const { + return TTIImpl->isLegalBroadcastLoad(ElementTy, NumElements); +} + bool TargetTransformInfo::isLegalMaskedGather(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedGather(DataType, Alignment); } +bool TargetTransformInfo::isLegalAltInstr( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask) const { + return TTIImpl->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask); +} + bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedScatter(DataType, Alignment); @@ -470,7 +485,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const { return TTIImpl->isTypeLegal(Ty); } -InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const { +unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const { return TTIImpl->getRegUsageForType(Ty); } @@ -507,6 +522,10 @@ bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const { return TTIImpl->supportsEfficientVectorElementLoadStore(); } +bool TargetTransformInfo::supportsTailCalls() const { + return TTIImpl->supportsTailCalls(); +} + bool TargetTransformInfo::enableAggressiveInterleaving( bool LoopHasReductions) const { return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); @@ -623,8 +642,9 @@ Optional TargetTransformInfo::getVScaleForTuning() const { return TTIImpl->getVScaleForTuning(); } -bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const { - return TTIImpl->shouldMaximizeVectorBandwidth(); +bool TargetTransformInfo::shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const { + return TTIImpl->shouldMaximizeVectorBandwidth(K); } ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, @@ -637,6 +657,11 @@ unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth, return TTIImpl->getMaximumVF(ElemWidth, Opcode); } +unsigned TargetTransformInfo::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const { + return TTIImpl->getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( @@ -644,7 +669,8 @@ bool TargetTransformInfo::shouldConsiderAddressTypePromotion( } unsigned TargetTransformInfo::getCacheLineSize() const { - return TTIImpl->getCacheLineSize(); + return CacheLineSize.getNumOccurrences() > 0 ? 
CacheLineSize + : TTIImpl->getCacheLineSize(); } llvm::Optional @@ -742,12 +768,11 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost( return Cost; } -InstructionCost TargetTransformInfo::getShuffleCost(ShuffleKind Kind, - VectorType *Ty, - ArrayRef Mask, - int Index, - VectorType *SubTp) const { - InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp); +InstructionCost TargetTransformInfo::getShuffleCost( + ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, int Index, + VectorType *SubTp, ArrayRef Args) const { + InstructionCost Cost = + TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -973,18 +998,21 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic( Type *TargetTransformInfo::getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, - unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const { + unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const { return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace, - DestAddrSpace, SrcAlign, DestAlign); + DestAddrSpace, SrcAlign, DestAlign, + AtomicElementSize); } void TargetTransformInfo::getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const { - TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes, - SrcAddrSpace, DestAddrSpace, - SrcAlign, DestAlign); + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const { + TTIImpl->getMemcpyLoopResidualLoweringType( + OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, + DestAlign, AtomicCpySize); } bool TargetTransformInfo::areInlineCompatible(const Function *Caller, @@ -1155,7 +1183,7 @@ TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { } } -TargetTransformInfo::Concept::~Concept() {} +TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Analysis/TensorSpec.cpp b/llvm/lib/Analysis/TensorSpec.cpp new file mode 100644 index 000000000000..f6a5882371a7 --- /dev/null +++ b/llvm/lib/Analysis/TensorSpec.cpp @@ -0,0 +1,144 @@ +//===- TensorSpec.cpp - tensor type abstraction ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation file for the abstraction of a tensor type, and JSON loading +// utils. 
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Config/config.h"
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/TensorSpec.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <numeric>
+
+using namespace llvm;
+
+namespace llvm {
+
+#define TFUTILS_GETDATATYPE_IMPL(T, E)                                         \
+  template <> TensorType TensorSpec::getDataType<T>() { return TensorType::E; }
+
+SUPPORTED_TENSOR_TYPES(TFUTILS_GETDATATYPE_IMPL)
+
+#undef TFUTILS_GETDATATYPE_IMPL
+
+TensorSpec::TensorSpec(const std::string &Name, int Port, TensorType Type,
+                       size_t ElementSize, const std::vector<int64_t> &Shape)
+    : Name(Name), Port(Port), Type(Type), Shape(Shape),
+      ElementCount(std::accumulate(Shape.begin(), Shape.end(), 1,
+                                   std::multiplies<int64_t>())),
+      ElementSize(ElementSize) {}
+
+Optional<TensorSpec> getTensorSpecFromJSON(LLVMContext &Ctx,
+                                           const json::Value &Value) {
+  auto EmitError = [&](const llvm::Twine &Message) -> Optional<TensorSpec> {
+    std::string S;
+    llvm::raw_string_ostream OS(S);
+    OS << Value;
+    Ctx.emitError("Unable to parse JSON Value as spec (" + Message + "): " + S);
+    return None;
+  };
+  // FIXME: accept a Path as a parameter, and use it for error reporting.
+  json::Path::Root Root("tensor_spec");
+  json::ObjectMapper Mapper(Value, Root);
+  if (!Mapper)
+    return EmitError("Value is not a dict");
+
+  std::string TensorName;
+  int TensorPort = -1;
+  std::string TensorType;
+  std::vector<int64_t> TensorShape;
+
+  if (!Mapper.map("name", TensorName))
+    return EmitError("'name' property not present or not a string");
+  if (!Mapper.map("type", TensorType))
+    return EmitError("'type' property not present or not a string");
+  if (!Mapper.map("port", TensorPort))
+    return EmitError("'port' property not present or not an int");
+  if (!Mapper.map<std::vector<int64_t>>("shape", TensorShape))
+    return EmitError("'shape' property not present or not an int array");
+
+#define PARSE_TYPE(T, E)                                                       \
+  if (TensorType == #T)                                                        \
+    return TensorSpec::createSpec<T>(TensorName, TensorShape, TensorPort);
+  SUPPORTED_TENSOR_TYPES(PARSE_TYPE)
+#undef PARSE_TYPE
+  return None;
+}
+
+Optional<std::vector<LoggedFeatureSpec>>
+loadOutputSpecs(LLVMContext &Ctx, StringRef ExpectedDecisionName,
+                StringRef ModelPath, StringRef SpecFileOverride) {
+  SmallVector<char, 128> OutputSpecsPath;
+  StringRef FileName = SpecFileOverride;
+  if (FileName.empty()) {
+    llvm::sys::path::append(OutputSpecsPath, ModelPath, "output_spec.json");
+    FileName = {OutputSpecsPath.data(), OutputSpecsPath.size()};
+  }
+
+  auto BufferOrError = MemoryBuffer::getFileOrSTDIN(FileName);
+  if (!BufferOrError) {
+    Ctx.emitError("Error opening output specs file: " + FileName + " : " +
+                  BufferOrError.getError().message());
+    return None;
+  }
+  auto ParsedJSONValues = json::parse(BufferOrError.get()->getBuffer());
+  if (!ParsedJSONValues) {
+    Ctx.emitError("Could not parse specs file: " + FileName);
+    return None;
+  }
+  auto ValuesArray = ParsedJSONValues->getAsArray();
+  if (!ValuesArray) {
+    Ctx.emitError("Expected an array of {tensor_spec:<TensorSpec>, "
+                  "logging_name:<name>} dictionaries");
+    return None;
+  }
+  std::vector<LoggedFeatureSpec> Ret;
+  for (const auto &Value : *ValuesArray)
+    if (const auto *Obj = Value.getAsObject())
+      if (const auto *SpecPart = Obj->get("tensor_spec"))
+        if (auto TensorSpec = getTensorSpecFromJSON(Ctx, *SpecPart))
+          if (auto LoggingName = Obj->getString("logging_name")) {
+            if (!TensorSpec->isElementType<int64_t>() &&
+                !TensorSpec->isElementType<int32_t>() &&
+                !TensorSpec->isElementType<float>()) {
+              Ctx.emitError(
+                  "Only int64, int32, and float tensors are supported. "
+                  "Found unsupported type for tensor named " +
+                  TensorSpec->name());
+              return None;
+            }
+            Ret.push_back({*TensorSpec, LoggingName->str()});
+          }
+
+  if (ValuesArray->size() != Ret.size()) {
+    Ctx.emitError(
+        "Unable to parse output spec. It should be a json file containing an "
+        "array of dictionaries. Each dictionary must have a 'tensor_spec' key, "
+        "with a json object describing a TensorSpec; and a 'logging_name' key, "
+        "which is a string to use as name when logging this tensor in the "
+        "training log.");
+    return None;
+  }
+  if (Ret.empty() || *Ret[0].LoggingName != ExpectedDecisionName) {
+    Ctx.emitError("The first output spec must describe the decision tensor, "
+                  "and must have the logging_name " +
+                  StringRef(ExpectedDecisionName));
+    return None;
+  }
+  return Ret;
+}
+} // namespace llvm
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 627a78a2a2fd..9bcbe4a4cc1e 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -112,7 +112,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/InitializePasses.h"
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index 80051fd5f7c1..201e64770766 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -16,7 +16,6 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 using namespace llvm;
diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index 7573975a3dd3..e6d297877b62 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/VectorUtils.h"
 using namespace llvm;
diff --git a/llvm/lib/Analysis/ValueLatticeUtils.cpp b/llvm/lib/Analysis/ValueLatticeUtils.cpp
index 53638c351f72..2bcb4d5b0e6b 100644
--- a/llvm/lib/Analysis/ValueLatticeUtils.cpp
+++ b/llvm/lib/Analysis/ValueLatticeUtils.cpp
@@ -29,12 +29,13 @@ bool llvm::canTrackGlobalVariableInterprocedurally(GlobalVariable *GV) {
       !GV->hasDefinitiveInitializer())
     return false;
   return all_of(GV->users(), [&](User *U) {
-    // Currently all users of a global variable have to be none-volatile loads
-    // or stores and the global cannot be stored itself.
+    // Currently all users of a global variable have to be non-volatile loads
+    // or stores of the global type, and the global cannot be stored itself.
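For reference, the output_spec.json file that loadOutputSpecs() above consumes would look like the following sketch. The key names come straight from the parser ("tensor_spec", "logging_name", "name", "port", "type", "shape"); the decision name and tensor values here are illustrative, not part of this patch. The first array entry must describe the decision tensor, and its logging_name must match ExpectedDecisionName:

    [
      {
        "logging_name": "inlining_decision",
        "tensor_spec": {
          "name": "StatefulPartitionedCall",
          "port": 0,
          "type": "int64_t",
          "shape": [1]
        }
      }
    ]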
if (auto *Store = dyn_cast(U)) - return Store->getValueOperand() != GV && !Store->isVolatile(); + return Store->getValueOperand() != GV && !Store->isVolatile() && + Store->getValueOperand()->getType() == GV->getValueType(); if (auto *Load = dyn_cast(U)) - return !Load->isVolatile(); + return !Load->isVolatile() && Load->getType() == GV->getValueType(); return false; }); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index c14bdb8bc262..05d5e47bb8d7 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -70,10 +71,8 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include -#include #include #include -#include #include using namespace llvm; @@ -86,13 +85,12 @@ static cl::opt DomConditionsMaxUses("dom-conditions-max-uses", // According to the LangRef, branching on a poison condition is absolutely // immediate full UB. However, historically we haven't implemented that -// consistently as we have an important transformation (non-trivial unswitch) -// which introduces instances of branch on poison/undef to otherwise well -// defined programs. This flag exists to let us test optimization benefit -// of exploiting the specified behavior (in combination with enabling the -// unswitch fix.) +// consistently as we had an important transformation (non-trivial unswitch) +// which introduced instances of branch on poison/undef to otherwise well +// defined programs. This issue has since been fixed, but the flag is +// temporarily retained to easily diagnose potential regressions. static cl::opt BranchOnPoisonAsUB("branch-on-poison-as-ub", - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(true)); /// Returns the bitwidth of the given scalar or pointer type. For vector types, @@ -275,13 +273,39 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, assert(LHS->getType()->isIntOrIntVectorTy() && "LHS and RHS should be integers"); // Look for an inverted mask: (X & ~M) op (Y & M). - Value *M; - if (match(LHS, m_c_And(m_Not(m_Value(M)), m_Value())) && - match(RHS, m_c_And(m_Specific(M), m_Value()))) + { + Value *M; + if (match(LHS, m_c_And(m_Not(m_Value(M)), m_Value())) && + match(RHS, m_c_And(m_Specific(M), m_Value()))) + return true; + if (match(RHS, m_c_And(m_Not(m_Value(M)), m_Value())) && + match(LHS, m_c_And(m_Specific(M), m_Value()))) + return true; + } + + // X op (Y & ~X) + if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())) || + match(LHS, m_c_And(m_Not(m_Specific(RHS)), m_Value()))) return true; - if (match(RHS, m_c_And(m_Not(m_Value(M)), m_Value())) && - match(LHS, m_c_And(m_Specific(M), m_Value()))) + + // X op ((X & Y) ^ Y) -- this is the canonical form of the previous pattern + // for constant Y. 
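These no-common-bits patterns are plain bit identities (the match code for the canonical Xor form follows just below, and the (A & B) versus ~(A | B) case is added right after it). As a sanity sketch independent of LLVM, all three can be checked exhaustively over 8-bit values:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y) {
          uint8_t A = X, B = Y;
          assert((A & (B & ~A)) == 0);       // X op (Y & ~X)
          assert((A & ((A & B) ^ B)) == 0);  // X op ((X & Y) ^ Y)
          assert(((A & B) & ~(A | B)) == 0); // (A & B) op ~(A | B)
        }
      return 0;
    }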
+ Value *Y; + if (match(RHS, + m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))) || + match(LHS, m_c_Xor(m_c_And(m_Specific(RHS), m_Value(Y)), m_Deferred(Y)))) return true; + + // Look for: (A & B) op ~(A | B) + { + Value *A, *B; + if (match(LHS, m_And(m_Value(A), m_Value(B))) && + match(RHS, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return true; + if (match(RHS, m_And(m_Value(A), m_Value(B))) && + match(LHS, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return true; + } IntegerType *IT = cast(LHS->getType()->getScalarType()); KnownBits LHSKnown(IT->getBitWidth()); KnownBits RHSKnown(IT->getBitWidth()); @@ -451,7 +475,12 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, } } - Known = KnownBits::mul(Known, Known2); + bool SelfMultiply = Op0 == Op1; + // TODO: SelfMultiply can be poison, but not undef. + if (SelfMultiply) + SelfMultiply &= + isGuaranteedNotToBeUndefOrPoison(Op0, Q.AC, Q.CxtI, Q.DT, Depth + 1); + Known = KnownBits::mul(Known, Known2, SelfMultiply); // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in @@ -656,7 +685,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, if (V->getType()->isPointerTy()) { if (RetainedKnowledge RK = getKnowledgeValidInContext( V, {Attribute::Alignment}, Q.CxtI, Q.DT, Q.AC)) { - Known.Zero.setLowBits(Log2_64(RK.ArgValue)); + if (isPowerOf2_64(RK.ArgValue)) + Known.Zero.setLowBits(Log2_64(RK.ArgValue)); } } @@ -1041,7 +1071,7 @@ static void computeKnownBitsFromShiftOperator( // bits. This check is sunk down as far as possible to avoid the expensive // call to isKnownNonZero if the cheaper checks above fail. if (ShiftAmt == 0) { - if (!ShifterOperandIsNonZero.hasValue()) + if (!ShifterOperandIsNonZero) ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q); if (*ShifterOperandIsNonZero) @@ -1726,8 +1756,7 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } - unsigned FirstZeroHighBit = - 32 - countLeadingZeros(VScaleMax.getValue()); + unsigned FirstZeroHighBit = 32 - countLeadingZeros(*VScaleMax); if (FirstZeroHighBit < BitWidth) Known.Zero.setBitsFrom(FirstZeroHighBit); @@ -2007,6 +2036,63 @@ void computeKnownBits(const Value *V, const APInt &DemandedElts, assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } +/// Try to detect a recurrence that the value of the induction variable is +/// always a power of two (or zero). +static bool isPowerOfTwoRecurrence(const PHINode *PN, bool OrZero, + unsigned Depth, Query &Q) { + BinaryOperator *BO = nullptr; + Value *Start = nullptr, *Step = nullptr; + if (!matchSimpleRecurrence(PN, BO, Start, Step)) + return false; + + // Initial value must be a power of two. + for (const Use &U : PN->operands()) { + if (U.get() == Start) { + // Initial value comes from a different BB, need to adjust context + // instruction for analysis. + Q.CxtI = PN->getIncomingBlock(U)->getTerminator(); + if (!isKnownToBeAPowerOfTwo(Start, OrZero, Depth, Q)) + return false; + } + } + + // Except for Mul, the induction variable must be on the left side of the + // increment expression, otherwise its value can be arbitrary. + if (BO->getOpcode() != Instruction::Mul && BO->getOperand(1) != Step) + return false; + + Q.CxtI = BO->getParent()->getTerminator(); + switch (BO->getOpcode()) { + case Instruction::Mul: + // Power of two is closed under multiplication. 
+ return (OrZero || Q.IIQ.hasNoUnsignedWrap(BO) || + Q.IIQ.hasNoSignedWrap(BO)) && + isKnownToBeAPowerOfTwo(Step, OrZero, Depth, Q); + case Instruction::SDiv: + // Start value must not be signmask for signed division, so simply being a + // power of two is not sufficient, and it has to be a constant. + if (!match(Start, m_Power2()) || match(Start, m_SignMask())) + return false; + LLVM_FALLTHROUGH; + case Instruction::UDiv: + // Divisor must be a power of two. + // If OrZero is false, cannot guarantee induction variable is non-zero after + // division, same for Shr, unless it is exact division. + return (OrZero || Q.IIQ.isExact(BO)) && + isKnownToBeAPowerOfTwo(Step, false, Depth, Q); + case Instruction::Shl: + return OrZero || Q.IIQ.hasNoUnsignedWrap(BO) || Q.IIQ.hasNoSignedWrap(BO); + case Instruction::AShr: + if (!match(Start, m_Power2()) || match(Start, m_SignMask())) + return false; + LLVM_FALLTHROUGH; + case Instruction::LShr: + return OrZero || Q.IIQ.isExact(BO); + default: + return false; + } +} + /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer @@ -2098,6 +2184,30 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, } } + // A PHI node is power of two if all incoming values are power of two, or if + // it is an induction variable where in each step its value is a power of two. + if (const PHINode *PN = dyn_cast(V)) { + Query RecQ = Q; + + // Check if it is an induction variable and always power of two. + if (isPowerOfTwoRecurrence(PN, OrZero, Depth, RecQ)) + return true; + + // Recursively check all incoming values. Limit recursion to 2 levels, so + // that search complexity is limited to number of operands^2. + unsigned NewDepth = std::max(Depth, MaxAnalysisRecursionDepth - 1); + return llvm::all_of(PN->operands(), [&](const Use &U) { + // Value is power of 2 if it is coming from PHI node itself by induction. + if (U.get() == PN) + return true; + + // Change the context instruction to the incoming block where it is + // evaluated. + RecQ.CxtI = PN->getIncomingBlock(U)->getTerminator(); + return isKnownToBeAPowerOfTwo(U.get(), OrZero, NewDepth, RecQ); + }); + } + // An exact divide or right shift can only shift off zero bits, so the result // is a power of two only if the first operand is a power of two and not // copying a sign bit (sdiv int_min, 2). 
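A concrete instance of the recurrence case handled by isPowerOfTwoRecurrence above (editor's sketch, not the LLVM API): an induction variable that starts at a power of two and is only scaled by non-wrapping shifts or multiplies by powers of two stays a power of two on every iteration, which is what lets isKnownToBeAPowerOfTwo answer for the PHI node itself.

    #include <cassert>
    #include <cstdint>

    static bool isPow2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }

    int main() {
      uint64_t X = 8; // Start: a power of two
      for (int I = 0; I < 50; ++I) {
        assert(isPow2(X)); // holds on every iteration of the recurrence
        X <<= 1;           // Step: shl with no unsigned wrap
      }
      return 0;
    }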
@@ -2588,6 +2698,9 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
     if (isKnownNonZero(Op, Depth, Q) &&
         isGuaranteedNotToBePoison(Op, Q.AC, Q.CxtI, Q.DT, Depth))
       return true;
+  } else if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
+    if (II->getIntrinsicID() == Intrinsic::vscale)
+      return true;
   }
 
   KnownBits Known(BitWidth);
@@ -2885,6 +2998,24 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
   return CLow->sle(*CHigh);
 }
 
+static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II,
+                                         const APInt *&CLow,
+                                         const APInt *&CHigh) {
+  assert((II->getIntrinsicID() == Intrinsic::smin ||
+          II->getIntrinsicID() == Intrinsic::smax) && "Must be smin/smax");
+
+  Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+  auto *InnerII = dyn_cast<IntrinsicInst>(II->getArgOperand(0));
+  if (!InnerII || InnerII->getIntrinsicID() != InverseID ||
+      !match(II->getArgOperand(1), m_APInt(CLow)) ||
+      !match(InnerII->getArgOperand(1), m_APInt(CHigh)))
+    return false;
+
+  if (II->getIntrinsicID() == Intrinsic::smin)
+    std::swap(CLow, CHigh);
+  return CLow->sle(*CHigh);
+}
+
 /// For vector constants, loop over the elements and find the constant with the
 /// minimum number of sign bits. Return 0 if the value is not a vector constant
 /// or if any element was not analyzed; otherwise, return the count for the
@@ -3225,6 +3356,12 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
         // Absolute value reduces number of sign bits by at most 1.
         return Tmp - 1;
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        const APInt *CLow, *CHigh;
+        if (isSignedMinMaxIntrinsicClamp(II, CLow, CHigh))
+          return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits());
+      }
       }
     }
   }
@@ -3358,9 +3495,6 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
 /// NOTE: Do not check 'nsz' here because that fast-math-flag does not guarantee
 ///       that a value is not -0.0. It only guarantees that -0.0 may be treated
 ///       the same as +0.0 in floating-point ops.
-///
-/// NOTE: this function will need to be revisited when we support non-default
-/// rounding modes!
 bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
                                 unsigned Depth) {
   if (auto *CFP = dyn_cast<ConstantFP>(V))
@@ -3390,9 +3524,21 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
     case Intrinsic::sqrt:
     case Intrinsic::canonicalize:
       return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1);
+    case Intrinsic::experimental_constrained_sqrt: {
+      // NOTE: This rounding mode restriction may be too strict.
+      const auto *CI = cast<ConstrainedFPIntrinsic>(Call);
+      if (CI->getRoundingMode() == RoundingMode::NearestTiesToEven)
+        return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1);
+      else
+        return false;
+    }
     // fabs(x) != -0.0
     case Intrinsic::fabs:
       return true;
+    // sitofp and uitofp turn into +0.0 for zero.
+    case Intrinsic::experimental_constrained_sitofp:
+    case Intrinsic::experimental_constrained_uitofp:
+      return true;
     }
   }
 
@@ -4032,69 +4178,83 @@ bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
   return true;
 }
 
+// If V refers to an initialized global constant, set Slice either to
+// its initializer if the size of its elements equals ElementSize, or,
+// for ElementSize == 8, to its representation as an array of unsigned
+// char. Return true on success.
 bool llvm::getConstantDataArrayInfo(const Value *V,
                                     ConstantDataArraySlice &Slice,
                                     unsigned ElementSize, uint64_t Offset) {
   assert(V);
 
-  // Look through bitcast instructions and geps.
- V = V->stripPointerCasts(); + // Drill down into the pointer expression V, ignoring any intervening + // casts, and determine the identity of the object it references along + // with the cumulative byte offset into it. + const GlobalVariable *GV = + dyn_cast(getUnderlyingObject(V)); + if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) + // Fail if V is not based on constant global object. + return false; - // If the value is a GEP instruction or constant expression, treat it as an - // offset. - if (const GEPOperator *GEP = dyn_cast(V)) { - // The GEP operator should be based on a pointer to string constant, and is - // indexing into the string constant. - if (!isGEPBasedOnPointerToString(GEP, ElementSize)) - return false; + const DataLayout &DL = GV->getParent()->getDataLayout(); + APInt Off(DL.getIndexTypeSizeInBits(V->getType()), 0); - // If the second index isn't a ConstantInt, then this is a variable index - // into the array. If this occurs, we can't say anything meaningful about - // the string. - uint64_t StartIdx = 0; - if (const ConstantInt *CI = dyn_cast(GEP->getOperand(2))) - StartIdx = CI->getZExtValue(); - else - return false; - return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize, - StartIdx + Offset); - } + if (GV != V->stripAndAccumulateConstantOffsets(DL, Off, + /*AllowNonInbounds*/ true)) + // Fail if a constant offset could not be determined. + return false; - // The GEP instruction, constant or instruction, must reference a global - // variable that is a constant and is initialized. The referenced constant - // initializer is the array that we'll use for optimization. - const GlobalVariable *GV = dyn_cast(V); - if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) + uint64_t StartIdx = Off.getLimitedValue(); + if (StartIdx == UINT64_MAX) + // Fail if the constant offset is excessive. return false; - const ConstantDataArray *Array; - ArrayType *ArrayTy; + Offset += StartIdx; + + ConstantDataArray *Array = nullptr; + ArrayType *ArrayTy = nullptr; + if (GV->getInitializer()->isNullValue()) { Type *GVTy = GV->getValueType(); - if ( (ArrayTy = dyn_cast(GVTy)) ) { - // A zeroinitializer for the array; there is no ConstantDataArray. - Array = nullptr; - } else { - const DataLayout &DL = GV->getParent()->getDataLayout(); - uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy).getFixedSize(); - uint64_t Length = SizeInBytes / (ElementSize / 8); - if (Length <= Offset) - return false; + uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy).getFixedSize(); + uint64_t Length = SizeInBytes / (ElementSize / 8); + + Slice.Array = nullptr; + Slice.Offset = 0; + // Return an empty Slice for undersized constants to let callers + // transform even undefined library calls into simpler, well-defined + // expressions. This is preferable to making the calls although it + // prevents sanitizers from detecting such calls. + Slice.Length = Length < Offset ? 0 : Length - Offset; + return true; + } - Slice.Array = nullptr; - Slice.Offset = 0; - Slice.Length = Length - Offset; - return true; + auto *Init = const_cast(GV->getInitializer()); + if (auto *ArrayInit = dyn_cast(Init)) { + Type *InitElTy = ArrayInit->getElementType(); + if (InitElTy->isIntegerTy(ElementSize)) { + // If Init is an initializer for an array of the expected type + // and size, use it as is. + Array = ArrayInit; + ArrayTy = ArrayInit->getType(); } - } else { - // This must be a ConstantDataArray. 
- Array = dyn_cast(GV->getInitializer()); - if (!Array) + } + + if (!Array) { + if (ElementSize != 8) + // TODO: Handle conversions to larger integral types. return false; - ArrayTy = Array->getType(); + + // Otherwise extract the portion of the initializer starting + // at Offset as an array of bytes, and reset Offset. + Init = ReadByteArrayFromGlobal(GV, Offset); + if (!Init) + return false; + + Offset = 0; + Array = dyn_cast(Init); + ArrayTy = dyn_cast(Init->getType()); } - if (!ArrayTy->getElementType()->isIntegerTy(ElementSize)) - return false; uint64_t NumElts = ArrayTy->getArrayNumElements(); if (Offset > NumElts) @@ -4117,6 +4277,12 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, if (Slice.Array == nullptr) { if (TrimAtNul) { + // Return a nul-terminated string even for an empty Slice. This is + // safe because all existing SimplifyLibcalls callers require string + // arguments and the behavior of the functions they fold is undefined + // otherwise. Folding the calls this way is preferable to making + // the undefined library calls, even though it prevents sanitizers + // from reporting such calls. Str = StringRef(); return true; } @@ -4196,9 +4362,13 @@ static uint64_t GetStringLengthH(const Value *V, return 0; if (Slice.Array == nullptr) + // Zeroinitializer (including an empty one). return 1; - // Search for nul characters + // Search for the first nul character. Return a conservative result even + // when there is no nul. This is safe since otherwise the string function + // being folded such as strlen is undefined, and can be preferable to + // making the undefined library call. unsigned NullIndex = 0; for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) { if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0) @@ -4517,13 +4687,40 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, const Operator *Inst = dyn_cast(V); if (!Inst) return false; + return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI, DT, TLI); +} + +bool llvm::isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, + const Operator *Inst, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { +#ifndef NDEBUG + if (Inst->getOpcode() != Opcode) { + // Check that the operands are actually compatible with the Opcode override. 
+    auto hasEqualReturnAndLeadingOperandTypes =
+        [](const Operator *Inst, unsigned NumLeadingOperands) {
+          if (Inst->getNumOperands() < NumLeadingOperands)
+            return false;
+          const Type *ExpectedType = Inst->getType();
+          for (unsigned ItOp = 0; ItOp < NumLeadingOperands; ++ItOp)
+            if (Inst->getOperand(ItOp)->getType() != ExpectedType)
+              return false;
+          return true;
+        };
+    assert(!Instruction::isBinaryOp(Opcode) ||
+           hasEqualReturnAndLeadingOperandTypes(Inst, 2));
+    assert(!Instruction::isUnaryOp(Opcode) ||
+           hasEqualReturnAndLeadingOperandTypes(Inst, 1));
+  }
+#endif
 
   for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
     if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i)))
      if (C->canTrap())
        return false;
 
-  switch (Inst->getOpcode()) {
+  switch (Opcode) {
   default:
     return true;
   case Instruction::UDiv:
@@ -4554,7 +4751,9 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
     return false;
   }
   case Instruction::Load: {
-    const LoadInst *LI = cast<LoadInst>(Inst);
+    const LoadInst *LI = dyn_cast<LoadInst>(Inst);
+    if (!LI)
+      return false;
     if (mustSuppressSpeculation(*LI))
       return false;
     const DataLayout &DL = LI->getModule()->getDataLayout();
@@ -4563,7 +4762,9 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
                                               TLI);
   }
   case Instruction::Call: {
-    auto *CI = cast<CallInst>(Inst);
+    auto *CI = dyn_cast<CallInst>(Inst);
+    if (!CI)
+      return false;
     const Function *Callee = CI->getCalledFunction();
 
     // The called function could have undefined behavior or side-effects, even
@@ -4595,8 +4796,20 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
   }
 }
 
-bool llvm::mayBeMemoryDependent(const Instruction &I) {
-  return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I);
+bool llvm::mayHaveNonDefUseDependency(const Instruction &I) {
+  if (I.mayReadOrWriteMemory())
+    // Memory dependency possible
+    return true;
+  if (!isSafeToSpeculativelyExecute(&I))
+    // Can't move above a maythrow call or infinite loop. Or if an
+    // inalloca alloca, above a stacksave call.
+    return true;
+  if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+    // 1) Can't reorder two inf-loop calls, even if readonly
+    // 2) Also can't reorder an inf-loop call below an instruction which isn't
+    // safe to speculatively execute. (Inverse of above)
+    return true;
+  return false;
 }
 
 /// Convert ConstantRange OverflowResult into ValueTracking OverflowResult.
@@ -4766,6 +4979,22 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
                                                    AssumptionCache *AC,
                                                    const Instruction *CxtI,
                                                    const DominatorTree *DT) {
+  // X - (X % ?)
+  // The remainder of a value can't have greater magnitude than itself,
+  // so the subtraction can't overflow.
+
+  // X - (X -nuw ?)
+  // In the minimal case, this would simplify to "?", so there's no subtract
+  // at all. But if this analysis is used to peek through casts, for example,
+  // then determining no-overflow may allow other transforms.
+
+  // TODO: There are other patterns like this.
+  // See simplifyICmpWithBinOpOnLHS() for candidates.
+  if (match(RHS, m_URem(m_Specific(LHS), m_Value())) ||
+      match(RHS, m_NUWSub(m_Specific(LHS), m_Value())))
+    if (isGuaranteedNotToBeUndefOrPoison(LHS, AC, CxtI, DT))
+      return OverflowResult::NeverOverflows;
+
   // Checking for conditions implied by dominating conditions may be expensive.
   // Limit it to usub_with_overflow calls for now.
   if (match(CxtI,
@@ -4789,6 +5018,19 @@ OverflowResult llvm::computeOverflowForSignedSub(const Value *LHS,
                                                  AssumptionCache *AC,
                                                  const Instruction *CxtI,
                                                  const DominatorTree *DT) {
+  // X - (X % ?)
+  // The remainder of a value can't have greater magnitude than itself,
+  // so the subtraction can't overflow.
+
+  // X - (X -nsw ?)
+  // In the minimal case, this would simplify to "?", so there's no subtract
+  // at all. But if this analysis is used to peek through casts, for example,
+  // then determining no-overflow may allow other transforms.
+  if (match(RHS, m_SRem(m_Specific(LHS), m_Value())) ||
+      match(RHS, m_NSWSub(m_Specific(LHS), m_Value())))
+    if (isGuaranteedNotToBeUndefOrPoison(LHS, AC, CxtI, DT))
+      return OverflowResult::NeverOverflows;
+
   // If LHS and RHS each have at least two sign bits, the subtraction
   // cannot overflow.
   if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
@@ -5100,7 +5342,9 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
   }
 
   if (auto *I = dyn_cast<Instruction>(V))
-    if (I->getMetadata(LLVMContext::MD_noundef))
+    if (I->hasMetadata(LLVMContext::MD_noundef) ||
+        I->hasMetadata(LLVMContext::MD_dereferenceable) ||
+        I->hasMetadata(LLVMContext::MD_dereferenceable_or_null))
       return true;
 
   if (programUndefinedIfUndefOrPoison(V, PoisonOnly))
@@ -5125,10 +5369,10 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
       auto *TI = Dominator->getBlock()->getTerminator();
 
       Value *Cond = nullptr;
-      if (auto BI = dyn_cast<BranchInst>(TI)) {
+      if (auto BI = dyn_cast_or_null<BranchInst>(TI)) {
         if (BI->isConditional())
           Cond = BI->getCondition();
-      } else if (auto SI = dyn_cast<SwitchInst>(TI)) {
+      } else if (auto SI = dyn_cast_or_null<SwitchInst>(TI)) {
         Cond = SI->getCondition();
       }
 
@@ -5763,20 +6007,6 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
   if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
     return {SPF_UNKNOWN, SPNB_NA, false};
 
-  // Z = X -nsw Y
-  // (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
-  // (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
-  if (match(TrueVal, m_Zero()) &&
-      match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
-    return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
-
-  // Z = X -nsw Y
-  // (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
-  // (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
-  if (match(FalseVal, m_Zero()) &&
-      match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
-    return {Pred == CmpInst::ICMP_SGT ?
SPF_SMAX : SPF_SMIN, SPNB_NA, false}; - const APInt *C1; if (!match(CmpRHS, m_APInt(C1))) return {SPF_UNKNOWN, SPNB_NA, false}; @@ -6576,11 +6806,38 @@ Optional llvm::isImpliedCondition(const Value *LHS, const Value *RHS, if (LHS == RHS) return LHSIsTrue; - const ICmpInst *RHSCmp = dyn_cast(RHS); - if (RHSCmp) + if (const ICmpInst *RHSCmp = dyn_cast(RHS)) return isImpliedCondition(LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0), RHSCmp->getOperand(1), DL, LHSIsTrue, Depth); + + if (Depth == MaxAnalysisRecursionDepth) + return None; + + // LHS ==> (RHS1 || RHS2) if LHS ==> RHS1 or LHS ==> RHS2 + // LHS ==> !(RHS1 && RHS2) if LHS ==> !RHS1 or LHS ==> !RHS2 + const Value *RHS1, *RHS2; + if (match(RHS, m_LogicalOr(m_Value(RHS1), m_Value(RHS2)))) { + if (Optional Imp = + isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1)) + if (*Imp == true) + return true; + if (Optional Imp = + isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1)) + if (*Imp == true) + return true; + } + if (match(RHS, m_LogicalAnd(m_Value(RHS1), m_Value(RHS2)))) { + if (Optional Imp = + isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1)) + if (*Imp == false) + return false; + if (Optional Imp = + isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1)) + if (*Imp == false) + return false; + } + return None; } @@ -7072,66 +7329,25 @@ getOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, const DataLayout &DL) { Optional llvm::isPointerOffset(const Value *Ptr1, const Value *Ptr2, const DataLayout &DL) { - Ptr1 = Ptr1->stripPointerCasts(); - Ptr2 = Ptr2->stripPointerCasts(); + APInt Offset1(DL.getIndexTypeSizeInBits(Ptr1->getType()), 0); + APInt Offset2(DL.getIndexTypeSizeInBits(Ptr2->getType()), 0); + Ptr1 = Ptr1->stripAndAccumulateConstantOffsets(DL, Offset1, true); + Ptr2 = Ptr2->stripAndAccumulateConstantOffsets(DL, Offset2, true); // Handle the trivial case first. - if (Ptr1 == Ptr2) { - return 0; - } + if (Ptr1 == Ptr2) + return Offset2.getSExtValue() - Offset1.getSExtValue(); const GEPOperator *GEP1 = dyn_cast(Ptr1); const GEPOperator *GEP2 = dyn_cast(Ptr2); - // If one pointer is a GEP see if the GEP is a constant offset from the base, - // as in "P" and "gep P, 1". - // Also do this iteratively to handle the the following case: - // Ptr_t1 = GEP Ptr1, c1 - // Ptr_t2 = GEP Ptr_t1, c2 - // Ptr2 = GEP Ptr_t2, c3 - // where we will return c1+c2+c3. - // TODO: Handle the case when both Ptr1 and Ptr2 are GEPs of some common base - // -- replace getOffsetFromBase with getOffsetAndBase, check that the bases - // are the same, and return the difference between offsets. - auto getOffsetFromBase = [&DL](const GEPOperator *GEP, - const Value *Ptr) -> Optional { - const GEPOperator *GEP_T = GEP; - int64_t OffsetVal = 0; - bool HasSameBase = false; - while (GEP_T) { - auto Offset = getOffsetFromIndex(GEP_T, 1, DL); - if (!Offset) - return None; - OffsetVal += *Offset; - auto Op0 = GEP_T->getOperand(0)->stripPointerCasts(); - if (Op0 == Ptr) { - HasSameBase = true; - break; - } - GEP_T = dyn_cast(Op0); - } - if (!HasSameBase) - return None; - return OffsetVal; - }; - - if (GEP1) { - auto Offset = getOffsetFromBase(GEP1, Ptr2); - if (Offset) - return -*Offset; - } - if (GEP2) { - auto Offset = getOffsetFromBase(GEP2, Ptr1); - if (Offset) - return Offset; - } - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical // base. After that base, they may have some number of common (and // potentially variable) indices. 
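Returning to the isImpliedCondition() extension earlier in this hunk: the two new rules (LHS implies RHS1 || RHS2 whenever it implies either disjunct, and LHS implies the failure of RHS1 && RHS2 whenever it implies the failure of either conjunct) are ordinary propositional facts. A minimal truth-table check (editor's sketch):

    #include <cassert>

    int main() {
      for (int L = 0; L <= 1; ++L)
        for (int R1 = 0; R1 <= 1; ++R1)
          for (int R2 = 0; R2 <= 1; ++R2) {
            bool Lhs = L, Rhs1 = R1, Rhs2 = R2;
            if (!Lhs || Rhs1)                  // assume LHS ==> RHS1
              assert(!Lhs || (Rhs1 || Rhs2));  // then LHS ==> (RHS1 || RHS2)
            if (!Lhs || !Rhs1)                 // assume LHS ==> !RHS1
              assert(!Lhs || !(Rhs1 && Rhs2)); // then LHS ==> !(RHS1 && RHS2)
          }
      return 0;
    }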
After that they handle some constant // offset, which determines their offset from each other. At this point, we // handle no other case. - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) + if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0) || + GEP1->getSourceElementType() != GEP2->getSourceElementType()) return None; // Skip any common indices and track the GEP types. @@ -7140,9 +7356,10 @@ Optional llvm::isPointerOffset(const Value *Ptr1, const Value *Ptr2, if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) break; - auto Offset1 = getOffsetFromIndex(GEP1, Idx, DL); - auto Offset2 = getOffsetFromIndex(GEP2, Idx, DL); - if (!Offset1 || !Offset2) + auto IOffset1 = getOffsetFromIndex(GEP1, Idx, DL); + auto IOffset2 = getOffsetFromIndex(GEP2, Idx, DL); + if (!IOffset1 || !IOffset2) return None; - return *Offset2 - *Offset1; + return *IOffset2 - *IOffset1 + Offset2.getSExtValue() - + Offset1.getSExtValue(); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 655c248907f6..f863a1ffad3a 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -40,7 +40,7 @@ static cl::opt MaxInterleaveGroupFactor( /// Return true if all of the intrinsic's arguments and return type are scalars /// for the scalar form of the intrinsic, and vectors for the vector form of the /// intrinsic (except operands that are marked as always being scalar by -/// hasVectorInstrinsicScalarOpd). +/// isVectorIntrinsicWithScalarOpAtArg). bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { switch (ID) { case Intrinsic::abs: // Begin integer bit-manipulation. @@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::fmuladd: case Intrinsic::powi: case Intrinsic::canonicalize: + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: return true; default: return false; @@ -96,8 +98,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { } /// Identifies if the vector form of the intrinsic has a scalar operand. -bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { switch (ID) { case Intrinsic::abs: case Intrinsic::ctlz: @@ -114,11 +116,14 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, } } -bool llvm::hasVectorInstrinsicOverloadedScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, + unsigned OpdIdx) { switch (ID) { + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: + return OpdIdx == 0; case Intrinsic::powi: - return (ScalarOpdIdx == 1); + return OpdIdx == 1; default: return false; } @@ -496,6 +501,116 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef Mask, return true; } +void llvm::processShuffleMasks( + ArrayRef Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, + unsigned NumOfUsedRegs, function_ref NoInputAction, + function_ref, unsigned, unsigned)> SingleInputAction, + function_ref, unsigned, unsigned)> ManyInputsAction) { + SmallVector>> Res(NumOfDestRegs); + // Try to perform better estimation of the permutation. + // 1. Split the source/destination vectors into real registers. + // 2. Do the mask analysis to identify which real registers are + // permuted. 
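Before the processShuffleMasks() implementation below, a worked example of the register split its first loop computes (editor's sketch using plain STL containers rather than the LLVM types): with two source and two destination registers and Mask = {0,1,4,5, 2,3,6,7}, destination register 0 reads lanes {0,1} of source 0 and lanes {0,1} of source 1, so its per-source sub-masks come out as {0,1,U,U} and {U,U,0,1}.

    #include <cstdio>
    #include <vector>

    int main() {
      const int U = -1; // undef lane marker
      std::vector<int> Mask = {0, 1, 4, 5, 2, 3, 6, 7};
      const unsigned NumSrc = 2, NumDest = 2;
      const unsigned Sz = Mask.size(), SzDest = Sz / NumDest, SzSrc = Sz / NumSrc;
      for (unsigned I = 0; I < NumDest; ++I) {
        std::vector<std::vector<int>> RegMasks(NumSrc);
        for (unsigned K = 0; K < SzDest; ++K) {
          int Elt = Mask[I * SzDest + K];
          if (Elt < 0)
            continue; // undef lane in the original mask
          unsigned SrcReg = Elt / SzSrc; // which source register feeds this lane
          if (RegMasks[SrcReg].empty())
            RegMasks[SrcReg].assign(SzDest, U);
          RegMasks[SrcReg][K] = Elt % SzSrc; // lane within that register
        }
        for (unsigned S = 0; S < NumSrc; ++S) {
          std::printf("dest %u <- src %u:", I, S);
          if (RegMasks[S].empty())
            std::printf(" unused");
          for (int M : RegMasks[S])
            std::printf(" %d", M);
          std::printf("\n");
        }
      }
      return 0;
    }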
+ int Sz = Mask.size(); + unsigned SzDest = Sz / NumOfDestRegs; + unsigned SzSrc = Sz / NumOfSrcRegs; + for (unsigned I = 0; I < NumOfDestRegs; ++I) { + auto &RegMasks = Res[I]; + RegMasks.assign(NumOfSrcRegs, {}); + // Check that the values in dest registers are in the one src + // register. + for (unsigned K = 0; K < SzDest; ++K) { + int Idx = I * SzDest + K; + if (Idx == Sz) + break; + if (Mask[Idx] >= Sz || Mask[Idx] == UndefMaskElem) + continue; + int SrcRegIdx = Mask[Idx] / SzSrc; + // Add a cost of PermuteTwoSrc for each new source register permute, + // if we have more than one source registers. + if (RegMasks[SrcRegIdx].empty()) + RegMasks[SrcRegIdx].assign(SzDest, UndefMaskElem); + RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc; + } + } + // Process split mask. + for (unsigned I = 0; I < NumOfUsedRegs; ++I) { + auto &Dest = Res[I]; + int NumSrcRegs = + count_if(Dest, [](ArrayRef Mask) { return !Mask.empty(); }); + switch (NumSrcRegs) { + case 0: + // No input vectors were used! + NoInputAction(); + break; + case 1: { + // Find the only mask with at least single undef mask elem. + auto *It = + find_if(Dest, [](ArrayRef Mask) { return !Mask.empty(); }); + unsigned SrcReg = std::distance(Dest.begin(), It); + SingleInputAction(*It, SrcReg, I); + break; + } + default: { + // The first mask is a permutation of a single register. Since we have >2 + // input registers to shuffle, we merge the masks for 2 first registers + // and generate a shuffle of 2 registers rather than the reordering of the + // first register and then shuffle with the second register. Next, + // generate the shuffles of the resulting register + the remaining + // registers from the list. + auto &&CombineMasks = [](MutableArrayRef FirstMask, + ArrayRef SecondMask) { + for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) { + if (SecondMask[Idx] != UndefMaskElem) { + assert(FirstMask[Idx] == UndefMaskElem && + "Expected undefined mask element."); + FirstMask[Idx] = SecondMask[Idx] + VF; + } + } + }; + auto &&NormalizeMask = [](MutableArrayRef Mask) { + for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { + if (Mask[Idx] != UndefMaskElem) + Mask[Idx] = Idx; + } + }; + int SecondIdx; + do { + int FirstIdx = -1; + SecondIdx = -1; + MutableArrayRef FirstMask, SecondMask; + for (unsigned I = 0; I < NumOfDestRegs; ++I) { + SmallVectorImpl &RegMask = Dest[I]; + if (RegMask.empty()) + continue; + + if (FirstIdx == SecondIdx) { + FirstIdx = I; + FirstMask = RegMask; + continue; + } + SecondIdx = I; + SecondMask = RegMask; + CombineMasks(FirstMask, SecondMask); + ManyInputsAction(FirstMask, FirstIdx, SecondIdx); + NormalizeMask(FirstMask); + RegMask.clear(); + SecondMask = FirstMask; + SecondIdx = FirstIdx; + } + if (FirstIdx != SecondIdx && SecondIdx >= 0) { + CombineMasks(SecondMask, FirstMask); + ManyInputsAction(SecondMask, SecondIdx, FirstIdx); + Dest[FirstIdx].clear(); + NormalizeMask(SecondMask); + } + } while (SecondIdx >= 0); + break; + } + } + } +} + MapVector llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, const TargetTransformInfo *TTI) { @@ -543,9 +658,8 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, Value *Val = Worklist.pop_back_val(); Value *Leader = ECs.getOrInsertLeaderValue(Val); - if (Visited.count(Val)) + if (!Visited.insert(Val).second) continue; - Visited.insert(Val); // Non-instructions terminate a chain successfully. 
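The Visited change just above is the standard set-insertion idiom: insert() already reports whether the element was new, so the separate count() lookup was redundant. A minimal illustration (editor's sketch):

    #include <cassert>
    #include <set>

    int main() {
      std::set<int> Visited;
      // insert() returns {iterator, bool}; .second is true only for new
      // elements, so one call both records the element and answers
      // "seen before?".
      assert(Visited.insert(42).second);  // first visit
      assert(!Visited.insert(42).second); // already visited
      return 0;
    }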
if (!isa(Val)) @@ -1387,7 +1501,7 @@ void VFABI::getVectorVariantNames( #ifndef NDEBUG LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << S << "'\n"); Optional Info = VFABI::tryDemangleForVFABI(S, *(CI.getModule())); - assert(Info.hasValue() && "Invalid name for a VFABI variant."); + assert(Info && "Invalid name for a VFABI variant."); assert(CI.getModule()->getFunction(Info.getValue().VectorName) && "Vector function is missing."); #endif diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index e3bf41c9721b..30e6f8599208 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -567,7 +567,6 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(exact); KEYWORD(inbounds); KEYWORD(inrange); - KEYWORD(align); KEYWORD(addrspace); KEYWORD(section); KEYWORD(partition); @@ -576,12 +575,16 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(module); KEYWORD(asm); KEYWORD(sideeffect); - KEYWORD(alignstack); KEYWORD(inteldialect); KEYWORD(gc); KEYWORD(prefix); KEYWORD(prologue); + KEYWORD(no_sanitize_address); + KEYWORD(no_sanitize_hwaddress); + KEYWORD(no_sanitize_memtag); + KEYWORD(sanitize_address_dyninit); + KEYWORD(ccc); KEYWORD(fastcc); KEYWORD(coldcc); @@ -632,82 +635,13 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(c); KEYWORD(attributes); + KEYWORD(sync); + KEYWORD(async); - KEYWORD(alwaysinline); - KEYWORD(allocsize); - KEYWORD(argmemonly); - KEYWORD(builtin); - KEYWORD(byval); - KEYWORD(inalloca); - KEYWORD(cold); - KEYWORD(convergent); - KEYWORD(dereferenceable); - KEYWORD(dereferenceable_or_null); - KEYWORD(disable_sanitizer_instrumentation); - KEYWORD(elementtype); - KEYWORD(inaccessiblememonly); - KEYWORD(inaccessiblemem_or_argmemonly); - KEYWORD(inlinehint); - KEYWORD(inreg); - KEYWORD(jumptable); - KEYWORD(minsize); - KEYWORD(naked); - KEYWORD(nest); - KEYWORD(noalias); - KEYWORD(nobuiltin); - KEYWORD(nocallback); - KEYWORD(nocapture); - KEYWORD(noduplicate); - KEYWORD(nofree); - KEYWORD(noimplicitfloat); - KEYWORD(noinline); - KEYWORD(norecurse); - KEYWORD(nonlazybind); - KEYWORD(nomerge); - KEYWORD(nonnull); - KEYWORD(noprofile); - KEYWORD(noredzone); - KEYWORD(noreturn); - KEYWORD(nosync); - KEYWORD(nocf_check); - KEYWORD(noundef); - KEYWORD(nounwind); - KEYWORD(nosanitize_coverage); - KEYWORD(null_pointer_is_valid); - KEYWORD(optforfuzzing); - KEYWORD(optnone); - KEYWORD(optsize); - KEYWORD(preallocated); - KEYWORD(readnone); - KEYWORD(readonly); - KEYWORD(returned); - KEYWORD(returns_twice); - KEYWORD(signext); - KEYWORD(speculatable); - KEYWORD(sret); - KEYWORD(ssp); - KEYWORD(sspreq); - KEYWORD(sspstrong); - KEYWORD(strictfp); - KEYWORD(safestack); - KEYWORD(shadowcallstack); - KEYWORD(sanitize_address); - KEYWORD(sanitize_hwaddress); - KEYWORD(sanitize_memtag); - KEYWORD(sanitize_thread); - KEYWORD(sanitize_memory); - KEYWORD(speculative_load_hardening); - KEYWORD(swifterror); - KEYWORD(swiftself); - KEYWORD(swiftasync); - KEYWORD(uwtable); - KEYWORD(vscale_range); - KEYWORD(willreturn); - KEYWORD(writeonly); - KEYWORD(zeroext); - KEYWORD(immarg); - KEYWORD(byref); - KEYWORD(mustprogress); +#define GET_ATTR_NAMES +#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) \ + KEYWORD(DISPLAY_NAME); +#include "llvm/IR/Attributes.inc" KEYWORD(type); KEYWORD(opaque); @@ -781,7 +715,6 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(param); KEYWORD(hotness); KEYWORD(unknown); - KEYWORD(hot); KEYWORD(critical); KEYWORD(relbf); KEYWORD(variable); @@ -856,7 +789,10 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("token", 
Type::getTokenTy(Context)); if (Keyword == "ptr") { - if (Context.supportsTypedPointers()) { + // setOpaquePointers() must be called before creating any pointer types. + if (!Context.hasSetOpaquePointersValue()) { + Context.setOpaquePointers(true); + } else if (Context.supportsTypedPointers()) { Warning("ptr type is only supported in -opaque-pointers mode"); return lltok::Error; } diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 432ec151cf8a..a1cdeac2b47f 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Casting.h" @@ -47,7 +48,6 @@ #include #include #include -#include #include using namespace llvm; @@ -59,9 +59,31 @@ static std::string getTypeString(Type *T) { return Tmp.str(); } +static void setContextOpaquePointers(LLLexer &L, LLVMContext &C) { + while (true) { + lltok::Kind K = L.Lex(); + // LLLexer will set the opaque pointers option in LLVMContext if it sees an + // explicit "ptr". + if (K == lltok::star || K == lltok::Error || K == lltok::Eof || + isa_and_nonnull(L.getTyVal())) { + if (K == lltok::star) + C.setOpaquePointers(false); + return; + } + } +} + /// Run: module ::= toplevelentity* bool LLParser::Run(bool UpgradeDebugInfo, DataLayoutCallbackTy DataLayoutCallback) { + // If we haven't decided on whether or not we're using opaque pointers, do a + // quick lex over the tokens to see if we explicitly construct any typed or + // opaque pointer types. + // Don't bail out on an error so we do the same work in the parsing below + // regardless of if --opaque-pointers is set. + if (!Context.hasSetOpaquePointersValue()) + setContextOpaquePointers(OPLex, Context); + // Prime the lexer. Lex.Lex(); @@ -248,7 +270,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { // remangle intrinsics names as well. 
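Returning to the opaque-pointer auto-detection above: setContextOpaquePointers() pre-lexes the module and lets the first explicit pointer spelling pick the mode, with "ptr" selecting opaque pointers and "*" selecting typed ones. The same decide-before-parse pattern in a standalone sketch (illustrative only, not the LLVM API):

    #include <cassert>
    #include <sstream>
    #include <string>

    // Scan tokens ahead of parsing; the first "ptr" or "*" decides the mode,
    // mirroring the intent of LLParser's setContextOpaquePointers pre-pass.
    static bool useOpaquePointers(const std::string &IR) {
      std::istringstream Tokens(IR);
      std::string Tok;
      while (Tokens >> Tok) {
        if (Tok == "ptr")
          return true;  // explicit opaque pointer type seen
        if (Tok.find('*') != std::string::npos)
          return false; // explicit typed pointer seen
      }
      return true;      // no pointers at all; the sketch defaults to opaque
    }

    int main() {
      assert(useOpaquePointers("define void @f( ptr %p ) { ret void }"));
      assert(!useOpaquePointers("define void @f( i8 * %p ) { ret void }"));
      return 0;
    }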
for (Function &F : llvm::make_early_inc_range(*M)) { if (auto Remangled = Intrinsic::remangleIntrinsicFunction(&F)) { - F.replaceAllUsesWith(Remangled.getValue()); + F.replaceAllUsesWith(*Remangled); F.eraseFromParent(); } } @@ -1081,6 +1103,45 @@ bool LLParser::parseAliasOrIFunc(const std::string &Name, LocTy NameLoc, return false; } +static bool isSanitizer(lltok::Kind Kind) { + switch (Kind) { + case lltok::kw_no_sanitize_address: + case lltok::kw_no_sanitize_hwaddress: + case lltok::kw_no_sanitize_memtag: + case lltok::kw_sanitize_address_dyninit: + return true; + default: + return false; + } +} + +bool LLParser::parseSanitizer(GlobalVariable *GV) { + using SanitizerMetadata = GlobalValue::SanitizerMetadata; + SanitizerMetadata Meta; + if (GV->hasSanitizerMetadata()) + Meta = GV->getSanitizerMetadata(); + + switch (Lex.getKind()) { + case lltok::kw_no_sanitize_address: + Meta.NoAddress = true; + break; + case lltok::kw_no_sanitize_hwaddress: + Meta.NoHWAddress = true; + break; + case lltok::kw_no_sanitize_memtag: + Meta.NoMemtag = true; + break; + case lltok::kw_sanitize_address_dyninit: + Meta.IsDynInit = true; + break; + default: + return tokError("non-sanitizer token passed to LLParser::parseSanitizer()"); + } + GV->setSanitizerMetadata(Meta); + Lex.Lex(); + return false; +} + /// parseGlobal /// ::= GlobalVar '=' OptionalLinkage OptionalPreemptionSpecifier /// OptionalVisibility OptionalDLLStorageClass @@ -1168,7 +1229,7 @@ bool LLParser::parseGlobal(const std::string &Name, LocTy NameLoc, GV->setUnnamedAddr(UnnamedAddr); if (GVal) { - if (!GVal->getType()->isOpaque() && GVal->getValueType() != Ty) + if (GVal->getType() != Ty->getPointerTo(AddrSpace)) return error( TyLoc, "forward reference and definition of global have different types"); @@ -1199,6 +1260,9 @@ bool LLParser::parseGlobal(const std::string &Name, LocTy NameLoc, } else if (Lex.getKind() == lltok::MetadataVar) { if (parseGlobalObjectMetadataAttachment(*GV)) return true; + } else if (isSanitizer(Lex.getKind())) { + if (parseSanitizer(GV)) + return true; } else { Comdat *C; if (parseOptionalComdat(Name, C)) @@ -1333,6 +1397,20 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B, B.addDereferenceableOrNullAttr(Bytes); return false; } + case Attribute::UWTable: { + UWTableKind Kind; + if (parseOptionalUWTableKind(Kind)) + return true; + B.addUWTableAttr(Kind); + return false; + } + case Attribute::AllocKind: { + AllocFnKind Kind = AllocFnKind::Unknown; + if (parseAllocKind(Kind)) + return true; + B.addAllocKindAttr(Kind); + return false; + } default: B.addAttribute(Attr); Lex.Lex(); @@ -1996,6 +2074,56 @@ bool LLParser::parseOptionalDerefAttrBytes(lltok::Kind AttrKind, return false; } +bool LLParser::parseOptionalUWTableKind(UWTableKind &Kind) { + Lex.Lex(); + Kind = UWTableKind::Default; + if (!EatIfPresent(lltok::lparen)) + return false; + LocTy KindLoc = Lex.getLoc(); + if (Lex.getKind() == lltok::kw_sync) + Kind = UWTableKind::Sync; + else if (Lex.getKind() == lltok::kw_async) + Kind = UWTableKind::Async; + else + return error(KindLoc, "expected unwind table kind"); + Lex.Lex(); + return parseToken(lltok::rparen, "expected ')'"); +} + +bool LLParser::parseAllocKind(AllocFnKind &Kind) { + Lex.Lex(); + LocTy ParenLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::lparen)) + return error(ParenLoc, "expected '('"); + LocTy KindLoc = Lex.getLoc(); + std::string Arg; + if (parseStringConstant(Arg)) + return error(KindLoc, "expected allockind value"); + for (StringRef A : llvm::split(Arg, ",")) { + if (A 
== "alloc") { + Kind |= AllocFnKind::Alloc; + } else if (A == "realloc") { + Kind |= AllocFnKind::Realloc; + } else if (A == "free") { + Kind |= AllocFnKind::Free; + } else if (A == "uninitialized") { + Kind |= AllocFnKind::Uninitialized; + } else if (A == "zeroed") { + Kind |= AllocFnKind::Zeroed; + } else if (A == "aligned") { + Kind |= AllocFnKind::Aligned; + } else { + return error(KindLoc, Twine("unknown allockind ") + A); + } + } + ParenLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::rparen)) + return error(ParenLoc, "expected ')'"); + if (Kind == AllocFnKind::Unknown) + return error(KindLoc, "expected allockind value"); + return false; +} + /// parseOptionalCommaAlign /// ::= /// ::= ',' align 4 @@ -3344,24 +3472,8 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.Kind = ValID::t_Constant; return false; } - case lltok::kw_extractvalue: { - Lex.Lex(); - Constant *Val; - SmallVector Indices; - if (parseToken(lltok::lparen, - "expected '(' in extractvalue constantexpr") || - parseGlobalTypeAndValue(Val) || parseIndexList(Indices) || - parseToken(lltok::rparen, "expected ')' in extractvalue constantexpr")) - return true; - - if (!Val->getType()->isAggregateType()) - return error(ID.Loc, "extractvalue operand must be aggregate type"); - if (!ExtractValueInst::getIndexedType(Val->getType(), Indices)) - return error(ID.Loc, "invalid indices for extractvalue"); - ID.ConstantVal = ConstantExpr::getExtractValue(Val, Indices); - ID.Kind = ValID::t_Constant; - return false; - } + case lltok::kw_extractvalue: + return error(ID.Loc, "extractvalue constexprs are no longer supported"); case lltok::kw_insertvalue: { Lex.Lex(); Constant *Val0, *Val1; @@ -3881,11 +3993,11 @@ struct MDAPSIntField : public MDFieldImpl { }; struct MDSignedField : public MDFieldImpl { - int64_t Min; - int64_t Max; + int64_t Min = INT64_MIN; + int64_t Max = INT64_MAX; MDSignedField(int64_t Default = 0) - : ImplTy(Default), Min(INT64_MIN), Max(INT64_MAX) {} + : ImplTy(Default) {} MDSignedField(int64_t Default, int64_t Min, int64_t Max) : ImplTy(Default), Min(Min), Max(Max) {} }; @@ -4144,8 +4256,8 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, DIFlagField &Result) { Val = DINode::getFlag(Lex.getStrVal()); if (!Val) - return tokError(Twine("invalid debug info flag flag '") + - Lex.getStrVal() + "'"); + return tokError(Twine("invalid debug info flag '") + Lex.getStrVal() + + "'"); Lex.Lex(); return false; }; @@ -4779,7 +4891,8 @@ bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) { OPTIONAL(declaration, MDField, ); \ OPTIONAL(retainedNodes, MDField, ); \ OPTIONAL(thrownTypes, MDField, ); \ - OPTIONAL(annotations, MDField, ); + OPTIONAL(annotations, MDField, ); \ + OPTIONAL(targetFuncName, MDStringField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS @@ -4798,7 +4911,8 @@ bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) { (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val, type.Val, scopeLine.Val, containingType.Val, virtualIndex.Val, thisAdjustment.Val, flags.Val, SPFlags, unit.Val, templateParams.Val, - declaration.Val, retainedNodes.Val, thrownTypes.Val, annotations.Val)); + declaration.Val, retainedNodes.Val, thrownTypes.Val, annotations.Val, + targetFuncName.Val)); return false; } @@ -4965,7 +5079,7 @@ bool LLParser::parseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) { /// declaration: !4, align: 8) bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ 
- REQUIRED(name, MDStringField, (/* AllowEmpty */ false)); \ + OPTIONAL(name, MDStringField, (/* AllowEmpty */ false)); \ OPTIONAL(scope, MDField, ); \ OPTIONAL(linkageName, MDStringField, ); \ OPTIONAL(file, MDField, ); \ @@ -5603,20 +5717,19 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { auto FRVI = ForwardRefVals.find(FunctionName); if (FRVI != ForwardRefVals.end()) { FwdFn = FRVI->second.first; - if (!FwdFn->getType()->isOpaque()) { - if (!FwdFn->getType()->getNonOpaquePointerElementType()->isFunctionTy()) - return error(FRVI->second.second, "invalid forward reference to " - "function as global value!"); - if (FwdFn->getType() != PFT) - return error(FRVI->second.second, - "invalid forward reference to " - "function '" + - FunctionName + - "' with wrong type: " - "expected '" + - getTypeString(PFT) + "' but was '" + - getTypeString(FwdFn->getType()) + "'"); - } + if (!FwdFn->getType()->isOpaque() && + !FwdFn->getType()->getNonOpaquePointerElementType()->isFunctionTy()) + return error(FRVI->second.second, "invalid forward reference to " + "function as global value!"); + if (FwdFn->getType() != PFT) + return error(FRVI->second.second, + "invalid forward reference to " + "function '" + + FunctionName + + "' with wrong type: " + "expected '" + + getTypeString(PFT) + "' but was '" + + getTypeString(FwdFn->getType()) + "'"); ForwardRefVals.erase(FRVI); } else if ((Fn = M->getFunction(FunctionName))) { // Reject redefinitions. @@ -5631,8 +5744,8 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { // types agree. auto I = ForwardRefValIDs.find(NumberedVals.size()); if (I != ForwardRefValIDs.end()) { - FwdFn = cast(I->second.first); - if (!FwdFn->getType()->isOpaque() && FwdFn->getType() != PFT) + FwdFn = I->second.first; + if (FwdFn->getType() != PFT) return error(NameLoc, "type of definition and forward reference of '@" + Twine(NumberedVals.size()) + "' disagree: " @@ -7322,9 +7435,9 @@ int LLParser::parseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { PFS.getFunction().getParent()->getDataLayout().getTypeStoreSize( Cmp->getType())); - AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst( - Ptr, Cmp, New, Alignment.getValueOr(DefaultAlignment), SuccessOrdering, - FailureOrdering, SSID); + AtomicCmpXchgInst *CXI = + new AtomicCmpXchgInst(Ptr, Cmp, New, Alignment.value_or(DefaultAlignment), + SuccessOrdering, FailureOrdering, SSID); CXI->setVolatile(isVolatile); CXI->setWeak(isWeak); @@ -7390,10 +7503,12 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { if (Operation == AtomicRMWInst::Xchg) { if (!Val->getType()->isIntegerTy() && - !Val->getType()->isFloatingPointTy()) { - return error(ValLoc, - "atomicrmw " + AtomicRMWInst::getOperationName(Operation) + - " operand must be an integer or floating point type"); + !Val->getType()->isFloatingPointTy() && + !Val->getType()->isPointerTy()) { + return error( + ValLoc, + "atomicrmw " + AtomicRMWInst::getOperationName(Operation) + + " operand must be an integer, floating point, or pointer type"); } } else if (IsFP) { if (!Val->getType()->isFloatingPointTy()) { @@ -7409,7 +7524,9 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { } } - unsigned Size = Val->getType()->getPrimitiveSizeInBits(); + unsigned Size = + PFS.getFunction().getParent()->getDataLayout().getTypeStoreSizeInBits( + Val->getType()); if (Size < 8 || (Size & (Size - 1))) return error(ValLoc, "atomicrmw operand must be power-of-two byte-sized" " integer"); @@ -7418,7 +7535,7 @@ int 
LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { Val->getType())); AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val, - Alignment.getValueOr(DefaultAlignment), Ordering, SSID); + Alignment.value_or(DefaultAlignment), Ordering, SSID); RMWI->setVolatile(isVolatile); Inst = RMWI; return AteExtraComma ? InstExtraComma : InstNormal; diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp index 156fbbe71adb..95b9079f0f9c 100644 --- a/llvm/lib/AsmParser/Parser.cpp +++ b/llvm/lib/AsmParser/Parser.cpp @@ -11,13 +11,11 @@ //===----------------------------------------------------------------------===// #include "llvm/AsmParser/Parser.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/AsmParser/LLParser.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" -#include #include using namespace llvm; diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index 0d28d93c93c0..1613e7e42a0a 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -106,8 +106,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) { return false; if (!verifyIntegerEntry(ArgsMap, ".offset", true)) return false; - if (!verifyScalarEntry(ArgsMap, ".value_kind", true, - msgpack::Type::String, + if (!verifyScalarEntry(ArgsMap, ".value_kind", true, msgpack::Type::String, [](msgpack::DocNode &SNode) { return StringSwitch(SNode.getString()) .Case("by_value", true) @@ -133,6 +132,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) { .Case("hidden_none", true) .Case("hidden_printf_buffer", true) .Case("hidden_hostcall_buffer", true) + .Case("hidden_heap_v1", true) .Case("hidden_default_queue", true) .Case("hidden_completion_action", true) .Case("hidden_multigrid_sync_arg", true) diff --git a/llvm/lib/BinaryFormat/COFF.cpp b/llvm/lib/BinaryFormat/COFF.cpp new file mode 100644 index 000000000000..8fbee0218b79 --- /dev/null +++ b/llvm/lib/BinaryFormat/COFF.cpp @@ -0,0 +1,57 @@ +//===- llvm/BinaryFormat/COFF.cpp - The COFF format -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" + +// Maximum offsets for different string table entry encodings. +enum : unsigned { Max7DecimalOffset = 9999999U }; +enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0 + +// Encode a string table entry offset in base 64, padded to 6 chars, and +// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ... +// Buffer must be at least 8 bytes large. No terminating null appended. 
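// Editorial sketch (not part of the patch): the two string-table encodings
// implemented below behave as follows (the base64 digits were computed by
// hand with the A-Z a-z 0-9 + / alphabet, so treat them as illustrative):
//   encodeSectionName(Out, 1234)       writes "/1234"    (ASCII decimal)
//   encodeSectionName(Out, 9999999)    writes "/9999999" (largest decimal)
//   encodeSectionName(Out, 10000000)   writes "//AAmJaA" (first base64 value)
//   encodeSectionName(Out, 1ULL << 36) returns false     (offset too large)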
+static void encodeBase64StringEntry(char *Buffer, uint64_t Value) { + assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset && + "Illegal section name encoding for value"); + + static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + Buffer[0] = '/'; + Buffer[1] = '/'; + + char *Ptr = Buffer + 7; + for (unsigned i = 0; i < 6; ++i) { + unsigned Rem = Value % 64; + Value /= 64; + *(Ptr--) = Alphabet[Rem]; + } +} + +bool llvm::COFF::encodeSectionName(char *Out, uint64_t Offset) { + if (Offset <= Max7DecimalOffset) { + // Offsets of 7 digits or less are encoded in ASCII. + SmallVector Buffer; + Twine('/').concat(Twine(Offset)).toVector(Buffer); + assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2); + std::memcpy(Out, Buffer.data(), Buffer.size()); + return true; + } + + if (Offset <= MaxBase64Offset) { + // Starting with 10,000,000, offsets are encoded as base64. + encodeBase64StringEntry(Out, Offset); + return true; + } + + // The offset is too large to be encoded. + return false; +} diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp index 044e4840cb3b..d45195fb95c5 100644 --- a/llvm/lib/BinaryFormat/Magic.cpp +++ b/llvm/lib/BinaryFormat/Magic.cpp @@ -74,6 +74,11 @@ file_magic llvm::identify_magic(StringRef Magic) { return file_magic::goff_object; break; + case 0x10: + if (startswith(Magic, "\x10\xFF\x10\xAD")) + return file_magic::offload_binary; + break; + case 0xDE: // 0x0B17C0DE = BC wraper if (startswith(Magic, "\xDE\xC0\x17\x0B")) return file_magic::bitcode; @@ -185,6 +190,10 @@ file_magic llvm::identify_magic(StringRef Magic) { case 0x84: // Alpha 64-bit case 0x66: // MPS R4000 Windows case 0x50: // mc68K + if (startswith(Magic, "\x50\xed\x55\xba")) + return file_magic::cuda_fatbinary; + LLVM_FALLTHROUGH; + case 0x4c: // 80386 Windows case 0xc4: // ARMNT Windows if (Magic[1] == 0x01) @@ -221,6 +230,11 @@ file_magic llvm::identify_magic(StringRef Magic) { if (startswith(Magic, "--- !tapi") || startswith(Magic, "---\narchs:")) return file_magic::tapi_file; break; + + case 'D': // DirectX container file - DXBC + if (startswith(Magic, "DXBC")) + return file_magic::dxcontainer_object; + break; default: break; diff --git a/llvm/lib/BinaryFormat/Wasm.cpp b/llvm/lib/BinaryFormat/Wasm.cpp index 55efe31f2669..babeb12e49ef 100644 --- a/llvm/lib/BinaryFormat/Wasm.cpp +++ b/llvm/lib/BinaryFormat/Wasm.cpp @@ -8,7 +8,7 @@ #include "llvm/BinaryFormat/Wasm.h" -std::string llvm::wasm::toString(wasm::WasmSymbolType Type) { +llvm::StringRef llvm::wasm::toString(wasm::WasmSymbolType Type) { switch (Type) { case wasm::WASM_SYMBOL_TYPE_FUNCTION: return "WASM_SYMBOL_TYPE_FUNCTION"; @@ -26,7 +26,7 @@ std::string llvm::wasm::toString(wasm::WasmSymbolType Type) { llvm_unreachable("unknown symbol type"); } -std::string llvm::wasm::relocTypetoString(uint32_t Type) { +llvm::StringRef llvm::wasm::relocTypetoString(uint32_t Type) { switch (Type) { #define WASM_RELOC(NAME, VALUE) \ case VALUE: \ @@ -38,6 +38,31 @@ std::string llvm::wasm::relocTypetoString(uint32_t Type) { } } +llvm::StringRef llvm::wasm::sectionTypeToString(uint32_t Type) { +#define ECase(X) \ + case wasm::WASM_SEC_##X: \ + return #X; + switch (Type) { + ECase(CUSTOM); + ECase(TYPE); + ECase(IMPORT); + ECase(FUNCTION); + ECase(TABLE); + ECase(MEMORY); + ECase(GLOBAL); + ECase(EXPORT); + ECase(START); + ECase(ELEM); + ECase(CODE); + ECase(DATA); + ECase(DATACOUNT); + ECase(TAG); + default: + llvm_unreachable("unknown section type"); + } +#undef ECase 
+} + bool llvm::wasm::relocTypeHasAddend(uint32_t Type) { switch (Type) { case R_WASM_MEMORY_ADDR_LEB: diff --git a/llvm/lib/Bitcode/Reader/BitReader.cpp b/llvm/lib/Bitcode/Reader/BitReader.cpp index 5ac893aef14e..da2cf0770ec5 100644 --- a/llvm/lib/Bitcode/Reader/BitReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitReader.cpp @@ -12,7 +12,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index ffef35299981..1d16211c65bf 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -267,6 +267,7 @@ static Optional GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(FUNC_CODE, INST_STOREATOMIC) STRINGIFY_CODE(FUNC_CODE, INST_CMPXCHG) STRINGIFY_CODE(FUNC_CODE, INST_CALLBR) + STRINGIFY_CODE(FUNC_CODE, BLOCKADDR_USERS) } case bitc::VALUE_SYMTAB_BLOCK_ID: switch (CodeID) { @@ -735,7 +736,7 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, BlockStats.NumInstances++; // BLOCKINFO is a special part of the stream. - bool DumpRecords = O.hasValue(); + bool DumpRecords = O.has_value(); if (BlockID == bitc::BLOCKINFO_BLOCK_ID) { if (O && !O->DumpBlockinfo) O->OS << Indent << "\n"; @@ -864,7 +865,10 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, O->OS << " codeid=" << Code; const BitCodeAbbrev *Abbv = nullptr; if (Entry.ID != bitc::UNABBREV_RECORD) { - Abbv = Stream.getAbbrev(Entry.ID); + Expected MaybeAbbv = Stream.getAbbrev(Entry.ID); + if (!MaybeAbbv) + return MaybeAbbv.takeError(); + Abbv = MaybeAbbv.get(); O->OS << " abbrevid=" << Entry.ID; } @@ -894,13 +898,13 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, // If we found a module hash, let's verify that it matches! 
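// Editorial sketch (not part of the patch): the hash check below assumes the
// MODULE_CODE_HASH record holds five 32-bit words. A hypothetical standalone
// helper that rebuilds the 20-byte digest the same way the write32be loop
// in the hunk does:
#include <array>
#include <cstdint>
static std::array<char, 20> recordToSha1Bytes(const uint64_t (&Record)[5]) {
  std::array<char, 20> Bytes;
  for (int I = 0; I != 5; ++I) {
    auto W = static_cast<uint32_t>(Record[I]); // high bits asserted clear
    Bytes[I * 4 + 0] = static_cast<char>(W >> 24); // big-endian order matches
    Bytes[I * 4 + 1] = static_cast<char>(W >> 16); // support::endian::write32be
    Bytes[I * 4 + 2] = static_cast<char>(W >> 8);
    Bytes[I * 4 + 3] = static_cast<char>(W);
  }
  return Bytes;
}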
if (BlockID == bitc::MODULE_BLOCK_ID && Code == bitc::MODULE_CODE_HASH && - CheckHash.hasValue()) { + CheckHash) { if (Record.size() != 5) O->OS << " (invalid)"; else { // Recompute the hash and compare it to the one in the bitcode SHA1 Hasher; - StringRef Hash; + std::array Hash; Hasher.update(*CheckHash); { int BlockSize = (CurrentRecordPos / 8) - BlockEntryPos; @@ -908,14 +912,14 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, Hasher.update(ArrayRef(Ptr, BlockSize)); Hash = Hasher.result(); } - std::array RecordedHash; + std::array RecordedHash; int Pos = 0; for (auto &Val : Record) { assert(!(Val >> 32) && "Unexpected high bits set"); support::endian::write32be(&RecordedHash[Pos], Val); Pos += 4; } - if (Hash == StringRef(RecordedHash.data(), RecordedHash.size())) + if (Hash == RecordedHash) O->OS << " (match)"; else O->OS << " (!mismatch!)"; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 720ab560f988..93b07fc0db30 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GVMaterializer.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalIFunc.h" #include "llvm/IR/GlobalObject.h" @@ -50,6 +51,8 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -91,6 +94,11 @@ static cl::opt PrintSummaryGUIDs( cl::desc( "Print the global id for each value when reading the module summary")); +static cl::opt ExpandConstantExprs( + "expand-constant-exprs", cl::Hidden, + cl::desc( + "Expand constant expressions to instructions for testing purposes")); + namespace { enum { @@ -282,7 +290,7 @@ static Expected hasObjCCategoryInModule(BitstreamCursor &Stream) { case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N] std::string S; if (convertToString(Record, 0, S)) - return error("Invalid record"); + return error("Invalid section name record"); // Check for the i386 and other (x86_64, ARM) conventions if (S.find("__DATA,__objc_catlist") != std::string::npos || S.find("__OBJC,__category") != std::string::npos) @@ -361,7 +369,7 @@ static Expected readModuleTriple(BitstreamCursor &Stream) { case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; if (convertToString(Record, 0, S)) - return error("Invalid record"); + return error("Invalid triple record"); Triple = S; break; } @@ -429,7 +437,7 @@ protected: std::pair> readNameFromStrtab(ArrayRef Record); - bool readBlockInfo(); + Error readBlockInfo(); // Contains an arbitrary and optional string identifying the bitcode producer std::string ProducerIdentification; @@ -450,7 +458,7 @@ Error BitcodeReaderBase::error(const Twine &Message) { Expected BitcodeReaderBase::parseVersionRecord(ArrayRef Record) { if (Record.empty()) - return error("Invalid record"); + return error("Invalid version record"); unsigned ModuleVersion = Record[0]; if (ModuleVersion > 2) return error("Invalid value"); @@ -470,6 +478,90 @@ BitcodeReaderBase::readNameFromStrtab(ArrayRef Record) { namespace { +/// This represents a constant expression or constant aggregate using a custom +/// structure internal to the bitcode reader. 
Later, this structure will be +/// expanded by materializeValue() either into a constant expression/aggregate, +/// or into an instruction sequence at the point of use. This allows us to +/// upgrade bitcode using constant expressions even if this kind of constant +/// expression is no longer supported. +class BitcodeConstant final : public Value, + TrailingObjects { + friend TrailingObjects; + + // Value subclass ID: Pick largest possible value to avoid any clashes. + static constexpr uint8_t SubclassID = 255; + +public: + // Opcodes used for non-expressions. This includes constant aggregates + // (struct, array, vector) that might need expansion, as well as non-leaf + // constants that don't need expansion (no_cfi, dso_local, blockaddress), + // but still go through BitcodeConstant to avoid different uselist orders + // between the two cases. + static constexpr uint8_t ConstantStructOpcode = 255; + static constexpr uint8_t ConstantArrayOpcode = 254; + static constexpr uint8_t ConstantVectorOpcode = 253; + static constexpr uint8_t NoCFIOpcode = 252; + static constexpr uint8_t DSOLocalEquivalentOpcode = 251; + static constexpr uint8_t BlockAddressOpcode = 250; + static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode; + + // Separate struct to make passing different number of parameters to + // BitcodeConstant::create() more convenient. + struct ExtraInfo { + uint8_t Opcode; + uint8_t Flags; + unsigned Extra; + Type *SrcElemTy; + + ExtraInfo(uint8_t Opcode, uint8_t Flags = 0, unsigned Extra = 0, + Type *SrcElemTy = nullptr) + : Opcode(Opcode), Flags(Flags), Extra(Extra), SrcElemTy(SrcElemTy) {} + }; + + uint8_t Opcode; + uint8_t Flags; + unsigned NumOperands; + unsigned Extra; // GEP inrange index or blockaddress BB id. + Type *SrcElemTy; // GEP source element type. + +private: + BitcodeConstant(Type *Ty, const ExtraInfo &Info, ArrayRef OpIDs) + : Value(Ty, SubclassID), Opcode(Info.Opcode), Flags(Info.Flags), + NumOperands(OpIDs.size()), Extra(Info.Extra), + SrcElemTy(Info.SrcElemTy) { + std::uninitialized_copy(OpIDs.begin(), OpIDs.end(), + getTrailingObjects()); + } + + BitcodeConstant &operator=(const BitcodeConstant &) = delete; + +public: + static BitcodeConstant *create(BumpPtrAllocator &A, Type *Ty, + const ExtraInfo &Info, + ArrayRef OpIDs) { + void *Mem = A.Allocate(totalSizeToAlloc(OpIDs.size()), + alignof(BitcodeConstant)); + return new (Mem) BitcodeConstant(Ty, Info, OpIDs); + } + + static bool classof(const Value *V) { return V->getValueID() == SubclassID; } + + ArrayRef getOperandIDs() const { + return makeArrayRef(getTrailingObjects(), NumOperands); + } + + Optional getInRangeIndex() const { + assert(Opcode == Instruction::GetElementPtr); + if (Extra == (unsigned)-1) + return None; + return Extra; + } + + const char *getOpcodeName() const { + return Instruction::getOpcodeName(Opcode); + } +}; + class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { LLVMContext &Context; Module *TheModule = nullptr; @@ -483,8 +575,23 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { std::vector SectionTable; std::vector GCTable; - std::vector TypeList; - DenseMap FunctionTypes; + std::vector TypeList; + /// Track type IDs of contained types. Order is the same as the contained + /// types of a Type*. This is used during upgrades of typed pointer IR in + /// opaque pointer mode. 
+ DenseMap> ContainedTypeIDs; + /// In some cases, we need to create a type ID for a type that was not + /// explicitly encoded in the bitcode, or we don't know about at the current + /// point. For example, a global may explicitly encode the value type ID, but + /// not have a type ID for the pointer to value type, for which we create a + /// virtual type ID instead. This map stores the new type ID that was created + /// for the given pair of Type and contained type ID. + DenseMap, unsigned> VirtualTypeIDs; + DenseMap FunctionTypeIDs; + /// Allocator for BitcodeConstants. This should come before ValueList, + /// because the ValueList might hold ValueHandles to these constants, so + /// ValueList must be destroyed before Alloc. + BumpPtrAllocator Alloc; BitcodeReaderValueList ValueList; Optional MDLoader; std::vector ComdatList; @@ -544,6 +651,13 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { DenseMap> BasicBlockFwdRefs; std::deque BasicBlockFwdRefQueue; + /// These are Functions that contain BlockAddresses which refer a different + /// Function. When parsing the different Function, queue Functions that refer + /// to the different Function. Those Functions must be materialized in order + /// to resolve their BlockAddress constants before the different Function + /// gets moved into another Module. + std::vector BackwardRefFunctions; + /// Indicates that we are using a new encoding for instruction operands where /// most operands in the current FUNCTION_BLOCK are encoded relative to the /// instruction number, for a more compact encoding. Some instruction @@ -575,8 +689,8 @@ public: /// Main interface to parsing a bitcode buffer. /// \returns true if an error occurred. Error parseBitcodeInto( - Module *M, bool ShouldLazyLoadMetadata = false, bool IsImporting = false, - DataLayoutCallbackTy DataLayoutCallback = [](StringRef) { return None; }); + Module *M, bool ShouldLazyLoadMetadata, bool IsImporting, + DataLayoutCallbackTy DataLayoutCallback); static uint64_t decodeSignRotatedValue(uint64_t V); @@ -590,12 +704,21 @@ private: StructType *createIdentifiedStructType(LLVMContext &Context, StringRef Name); StructType *createIdentifiedStructType(LLVMContext &Context); + static constexpr unsigned InvalidTypeID = ~0u; + Type *getTypeByID(unsigned ID); + Type *getPtrElementTypeByID(unsigned ID); + unsigned getContainedTypeID(unsigned ID, unsigned Idx = 0); + unsigned getVirtualTypeID(Type *Ty, ArrayRef ContainedTypeIDs = {}); + + Expected materializeValue(unsigned ValID, BasicBlock *InsertBB); + Expected getValueForInitializer(unsigned ID); - Value *getFnValueByID(unsigned ID, Type *Ty) { + Value *getFnValueByID(unsigned ID, Type *Ty, unsigned TyID, + BasicBlock *ConstExprInsertBB) { if (Ty && Ty->isMetadataTy()) return MetadataAsValue::get(Ty->getContext(), getFnMetadataByID(ID)); - return ValueList.getValueFwdRef(ID, Ty); + return ValueList.getValueFwdRef(ID, Ty, TyID, ConstExprInsertBB); } Metadata *getFnMetadataByID(unsigned ID) { @@ -617,7 +740,8 @@ private: /// Increment Slot past the number of slots used in the record. Return true on /// failure. bool getValueTypePair(const SmallVectorImpl &Record, unsigned &Slot, - unsigned InstNum, Value *&ResVal) { + unsigned InstNum, Value *&ResVal, unsigned &TypeID, + BasicBlock *ConstExprInsertBB) { if (Slot == Record.size()) return true; unsigned ValNo = (unsigned)Record[Slot++]; // Adjust the ValNo, if it was encoded relative to the InstNum. 
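// Editorial sketch (not part of the patch): with UseRelativeIDs an operand
// slot stores the distance back from the current instruction's value number:
//   decoded ValNo = InstNum - encoded ValNo
// so at InstNum 42 an encoded 1 denotes value #41, "the value defined just
// before me", which keeps most operands small for the VBR encoding.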
@@ -626,14 +750,18 @@ private: if (ValNo < InstNum) { // If this is not a forward reference, just return the value we already // have. - ResVal = getFnValueByID(ValNo, nullptr); + TypeID = ValueList.getTypeID(ValNo); + ResVal = getFnValueByID(ValNo, nullptr, TypeID, ConstExprInsertBB); + assert((!ResVal || ResVal->getType() == getTypeByID(TypeID)) && + "Incorrect type ID stored for value"); return ResVal == nullptr; } if (Slot == Record.size()) return true; - unsigned TypeNo = (unsigned)Record[Slot++]; - ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo)); + TypeID = (unsigned)Record[Slot++]; + ResVal = getFnValueByID(ValNo, getTypeByID(TypeID), TypeID, + ConstExprInsertBB); return ResVal == nullptr; } @@ -641,8 +769,9 @@ private: /// past the number of slots used by the value in the record. Return true if /// there is an error. bool popValue(const SmallVectorImpl &Record, unsigned &Slot, - unsigned InstNum, Type *Ty, Value *&ResVal) { - if (getValue(Record, Slot, InstNum, Ty, ResVal)) + unsigned InstNum, Type *Ty, unsigned TyID, Value *&ResVal, + BasicBlock *ConstExprInsertBB) { + if (getValue(Record, Slot, InstNum, Ty, TyID, ResVal, ConstExprInsertBB)) return true; // All values currently take a single record slot. ++Slot; @@ -651,38 +780,41 @@ private: /// Like popValue, but does not increment the Slot number. bool getValue(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty, Value *&ResVal) { - ResVal = getValue(Record, Slot, InstNum, Ty); + unsigned InstNum, Type *Ty, unsigned TyID, Value *&ResVal, + BasicBlock *ConstExprInsertBB) { + ResVal = getValue(Record, Slot, InstNum, Ty, TyID, ConstExprInsertBB); return ResVal == nullptr; } /// Version of getValue that returns ResVal directly, or 0 if there is an /// error. Value *getValue(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty) { + unsigned InstNum, Type *Ty, unsigned TyID, + BasicBlock *ConstExprInsertBB) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)Record[Slot]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; - return getFnValueByID(ValNo, Ty); + return getFnValueByID(ValNo, Ty, TyID, ConstExprInsertBB); } /// Like getValue, but decodes signed VBRs. Value *getValueSigned(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty) { + unsigned InstNum, Type *Ty, unsigned TyID, + BasicBlock *ConstExprInsertBB) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; - return getFnValueByID(ValNo, Ty); + return getFnValueByID(ValNo, Ty, TyID, ConstExprInsertBB); } /// Upgrades old-style typeless byval/sret/inalloca attributes by adding the /// corresponding argument's pointee type. Also upgrades intrinsics that now /// require an elementtype attribute. - void propagateAttributeTypes(CallBase *CB, ArrayRef ArgsTys); + Error propagateAttributeTypes(CallBase *CB, ArrayRef ArgsTys); /// Converts alignment exponent (i.e. power of two (or zero)) to the /// corresponding alignment to use. 
If alignment is too large, returns @@ -827,7 +959,10 @@ BitcodeReader::BitcodeReader(BitstreamCursor Stream, StringRef Strtab, StringRef ProducerIdentification, LLVMContext &Context) : BitcodeReaderBase(std::move(Stream), Strtab), Context(Context), - ValueList(Context, Stream.SizeInBytes()) { + ValueList(this->Stream.SizeInBytes(), + [this](unsigned ValID, BasicBlock *InsertBB) { + return materializeValue(ValID, InsertBB); + }) { this->ProducerIdentification = std::string(ProducerIdentification); } @@ -859,6 +994,11 @@ Error BitcodeReader::materializeForwardReferencedFunctions() { } assert(BasicBlockFwdRefs.empty() && "Function missing from queue"); + for (Function *F : BackwardRefFunctions) + if (Error Err = materialize(F)) + return Err; + BackwardRefFunctions.clear(); + // Reset state. WillMaterializeAllForwardRefs = false; return Error::success(); @@ -1176,6 +1316,324 @@ Type *BitcodeReader::getTypeByID(unsigned ID) { return TypeList[ID] = createIdentifiedStructType(Context); } +unsigned BitcodeReader::getContainedTypeID(unsigned ID, unsigned Idx) { + auto It = ContainedTypeIDs.find(ID); + if (It == ContainedTypeIDs.end()) + return InvalidTypeID; + + if (Idx >= It->second.size()) + return InvalidTypeID; + + return It->second[Idx]; +} + +Type *BitcodeReader::getPtrElementTypeByID(unsigned ID) { + if (ID >= TypeList.size()) + return nullptr; + + Type *Ty = TypeList[ID]; + if (!Ty->isPointerTy()) + return nullptr; + + Type *ElemTy = getTypeByID(getContainedTypeID(ID, 0)); + if (!ElemTy) + return nullptr; + + assert(cast(Ty)->isOpaqueOrPointeeTypeMatches(ElemTy) && + "Incorrect element type"); + return ElemTy; +} + +unsigned BitcodeReader::getVirtualTypeID(Type *Ty, + ArrayRef ChildTypeIDs) { + unsigned ChildTypeID = ChildTypeIDs.empty() ? InvalidTypeID : ChildTypeIDs[0]; + auto CacheKey = std::make_pair(Ty, ChildTypeID); + auto It = VirtualTypeIDs.find(CacheKey); + if (It != VirtualTypeIDs.end()) { + // The cmpxchg return value is the only place we need more than one + // contained type ID, however the second one will always be the same (i1), + // so we don't need to include it in the cache key. This asserts that the + // contained types are indeed as expected and there are no collisions. + assert((ChildTypeIDs.empty() || + ContainedTypeIDs[It->second] == ChildTypeIDs) && + "Incorrect cached contained type IDs"); + return It->second; + } + +#ifndef NDEBUG + if (!Ty->isOpaquePointerTy()) { + assert(Ty->getNumContainedTypes() == ChildTypeIDs.size() && + "Wrong number of contained types"); + for (auto Pair : zip(Ty->subtypes(), ChildTypeIDs)) { + assert(std::get<0>(Pair) == getTypeByID(std::get<1>(Pair)) && + "Incorrect contained type ID"); + } + } +#endif + + unsigned TypeID = TypeList.size(); + TypeList.push_back(Ty); + if (!ChildTypeIDs.empty()) + append_range(ContainedTypeIDs[TypeID], ChildTypeIDs); + VirtualTypeIDs.insert({CacheKey, TypeID}); + return TypeID; +} + +static bool isConstExprSupported(uint8_t Opcode) { + // These are not real constant expressions, always consider them supported. + if (Opcode >= BitcodeConstant::FirstSpecialOpcode) + return true; + + return !ExpandConstantExprs; +} + +Expected BitcodeReader::materializeValue(unsigned StartValID, + BasicBlock *InsertBB) { + // Quickly handle the case where there is no BitcodeConstant to resolve. 
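// Editorial sketch (not part of the patch): the loop that follows is an
// iterative post-order DAG walk. The same shape on a toy dependency graph,
// with a placeholder computation standing in for "build the constant":
#include <map>
#include <vector>
static int evaluate(const std::vector<std::vector<int>> &Nodes, int Start) {
  std::map<int, int> Memo; // node id -> materialized result
  std::vector<int> Worklist{Start};
  while (!Worklist.empty()) {
    int ID = Worklist.back();
    if (Memo.count(ID)) { Worklist.pop_back(); continue; } // already built
    bool Ready = true;
    for (int Op : Nodes[ID])
      if (!Memo.count(Op)) { Worklist.push_back(Op); Ready = false; }
    if (!Ready) continue; // operands first; this node is revisited later
    int Result = 1; // stand-in for creating the constant or instruction
    for (int Op : Nodes[ID]) Result += Memo[Op];
    Memo[ID] = Result;
    Worklist.pop_back();
  }
  return Memo[Start]; // shared subexpressions were built exactly once
}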
+ if (StartValID < ValueList.size() && ValueList[StartValID] && + !isa(ValueList[StartValID])) + return ValueList[StartValID]; + + SmallDenseMap MaterializedValues; + SmallVector Worklist; + Worklist.push_back(StartValID); + while (!Worklist.empty()) { + unsigned ValID = Worklist.back(); + if (MaterializedValues.count(ValID)) { + // Duplicate expression that was already handled. + Worklist.pop_back(); + continue; + } + + if (ValID >= ValueList.size() || !ValueList[ValID]) + return error("Invalid value ID"); + + Value *V = ValueList[ValID]; + auto *BC = dyn_cast(V); + if (!BC) { + MaterializedValues.insert({ValID, V}); + Worklist.pop_back(); + continue; + } + + // Iterate in reverse, so values will get popped from the worklist in + // expected order. + SmallVector Ops; + for (unsigned OpID : reverse(BC->getOperandIDs())) { + auto It = MaterializedValues.find(OpID); + if (It != MaterializedValues.end()) + Ops.push_back(It->second); + else + Worklist.push_back(OpID); + } + + // Some expressions have not been resolved yet, handle them first and then + // revisit this one. + if (Ops.size() != BC->getOperandIDs().size()) + continue; + std::reverse(Ops.begin(), Ops.end()); + + SmallVector ConstOps; + for (Value *Op : Ops) + if (auto *C = dyn_cast(Op)) + ConstOps.push_back(C); + + // Materialize as constant expression if possible. + if (isConstExprSupported(BC->Opcode) && ConstOps.size() == Ops.size()) { + Constant *C; + if (Instruction::isCast(BC->Opcode)) { + C = UpgradeBitCastExpr(BC->Opcode, ConstOps[0], BC->getType()); + if (!C) + C = ConstantExpr::getCast(BC->Opcode, ConstOps[0], BC->getType()); + } else if (Instruction::isUnaryOp(BC->Opcode)) { + C = ConstantExpr::get(BC->Opcode, ConstOps[0], BC->Flags); + } else if (Instruction::isBinaryOp(BC->Opcode)) { + C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags); + } else { + switch (BC->Opcode) { + case BitcodeConstant::NoCFIOpcode: { + auto *GV = dyn_cast(ConstOps[0]); + if (!GV) + return error("no_cfi operand must be GlobalValue"); + C = NoCFIValue::get(GV); + break; + } + case BitcodeConstant::DSOLocalEquivalentOpcode: { + auto *GV = dyn_cast(ConstOps[0]); + if (!GV) + return error("dso_local operand must be GlobalValue"); + C = DSOLocalEquivalent::get(GV); + break; + } + case BitcodeConstant::BlockAddressOpcode: { + Function *Fn = dyn_cast(ConstOps[0]); + if (!Fn) + return error("blockaddress operand must be a function"); + + // If the function is already parsed we can insert the block address + // right away. + BasicBlock *BB; + unsigned BBID = BC->Extra; + if (!BBID) + // Invalid reference to entry block. + return error("Invalid ID"); + if (!Fn->empty()) { + Function::iterator BBI = Fn->begin(), BBE = Fn->end(); + for (size_t I = 0, E = BBID; I != E; ++I) { + if (BBI == BBE) + return error("Invalid ID"); + ++BBI; + } + BB = &*BBI; + } else { + // Otherwise insert a placeholder and remember it so it can be + // inserted when the function is parsed. 
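// Editorial sketch (not part of the patch): for a blockaddress that is read
// before its function body, e.g.
//   @ba = global ptr blockaddress(@f, %bb)   ; @f defined further down
// the path below parks a detached placeholder BasicBlock in FwdBBs[BBID] so
// BlockAddress::get() has a target; when @f's body is parsed, the
// placeholders are adopted as the function's blocks at those indices.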
+ auto &FwdBBs = BasicBlockFwdRefs[Fn]; + if (FwdBBs.empty()) + BasicBlockFwdRefQueue.push_back(Fn); + if (FwdBBs.size() < BBID + 1) + FwdBBs.resize(BBID + 1); + if (!FwdBBs[BBID]) + FwdBBs[BBID] = BasicBlock::Create(Context); + BB = FwdBBs[BBID]; + } + C = BlockAddress::get(Fn, BB); + break; + } + case BitcodeConstant::ConstantStructOpcode: + C = ConstantStruct::get(cast(BC->getType()), ConstOps); + break; + case BitcodeConstant::ConstantArrayOpcode: + C = ConstantArray::get(cast(BC->getType()), ConstOps); + break; + case BitcodeConstant::ConstantVectorOpcode: + C = ConstantVector::get(ConstOps); + break; + case Instruction::ICmp: + case Instruction::FCmp: + C = ConstantExpr::getCompare(BC->Flags, ConstOps[0], ConstOps[1]); + break; + case Instruction::GetElementPtr: + C = ConstantExpr::getGetElementPtr( + BC->SrcElemTy, ConstOps[0], makeArrayRef(ConstOps).drop_front(), + BC->Flags, BC->getInRangeIndex()); + break; + case Instruction::Select: + C = ConstantExpr::getSelect(ConstOps[0], ConstOps[1], ConstOps[2]); + break; + case Instruction::ExtractElement: + C = ConstantExpr::getExtractElement(ConstOps[0], ConstOps[1]); + break; + case Instruction::InsertElement: + C = ConstantExpr::getInsertElement(ConstOps[0], ConstOps[1], + ConstOps[2]); + break; + case Instruction::ShuffleVector: { + SmallVector Mask; + ShuffleVectorInst::getShuffleMask(ConstOps[2], Mask); + C = ConstantExpr::getShuffleVector(ConstOps[0], ConstOps[1], Mask); + break; + } + default: + llvm_unreachable("Unhandled bitcode constant"); + } + } + + // Cache resolved constant. + ValueList.replaceValueWithoutRAUW(ValID, C); + MaterializedValues.insert({ValID, C}); + Worklist.pop_back(); + continue; + } + + if (!InsertBB) + return error(Twine("Value referenced by initializer is an unsupported " + "constant expression of type ") + + BC->getOpcodeName()); + + // Materialize as instructions if necessary. 
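// Editorial sketch (not part of the patch): this is the instruction-expansion
// path taken when a constant expression is unsupported (or when the hidden
// -expand-constant-exprs flag forces it). Schematically, a GEP operand
//   store i32 0, ptr getelementptr inbounds ([4 x i32], ptr @g, i64 0, i64 1)
// becomes an ordinary instruction named "constexpr" at the point of use:
//   %constexpr = getelementptr inbounds [4 x i32], ptr @g, i64 0, i64 1
//   store i32 0, ptr %constexpr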
+ Instruction *I; + if (Instruction::isCast(BC->Opcode)) { + I = CastInst::Create((Instruction::CastOps)BC->Opcode, Ops[0], + BC->getType(), "constexpr", InsertBB); + } else if (Instruction::isUnaryOp(BC->Opcode)) { + I = UnaryOperator::Create((Instruction::UnaryOps)BC->Opcode, Ops[0], + "constexpr", InsertBB); + } else if (Instruction::isBinaryOp(BC->Opcode)) { + I = BinaryOperator::Create((Instruction::BinaryOps)BC->Opcode, Ops[0], + Ops[1], "constexpr", InsertBB); + if (isa(I)) { + if (BC->Flags & OverflowingBinaryOperator::NoSignedWrap) + I->setHasNoSignedWrap(); + if (BC->Flags & OverflowingBinaryOperator::NoUnsignedWrap) + I->setHasNoUnsignedWrap(); + } + if (isa(I) && + (BC->Flags & PossiblyExactOperator::IsExact)) + I->setIsExact(); + } else { + switch (BC->Opcode) { + case BitcodeConstant::ConstantStructOpcode: + case BitcodeConstant::ConstantArrayOpcode: + case BitcodeConstant::ConstantVectorOpcode: { + Type *IdxTy = Type::getInt32Ty(BC->getContext()); + Value *V = PoisonValue::get(BC->getType()); + for (auto Pair : enumerate(Ops)) { + Value *Idx = ConstantInt::get(IdxTy, Pair.index()); + V = InsertElementInst::Create(V, Pair.value(), Idx, "constexpr.ins", + InsertBB); + } + I = cast(V); + break; + } + case Instruction::ICmp: + case Instruction::FCmp: + I = CmpInst::Create((Instruction::OtherOps)BC->Opcode, + (CmpInst::Predicate)BC->Flags, Ops[0], Ops[1], + "constexpr", InsertBB); + break; + case Instruction::GetElementPtr: + I = GetElementPtrInst::Create(BC->SrcElemTy, Ops[0], + makeArrayRef(Ops).drop_front(), + "constexpr", InsertBB); + if (BC->Flags) + cast(I)->setIsInBounds(); + break; + case Instruction::Select: + I = SelectInst::Create(Ops[0], Ops[1], Ops[2], "constexpr", InsertBB); + break; + case Instruction::ExtractElement: + I = ExtractElementInst::Create(Ops[0], Ops[1], "constexpr", InsertBB); + break; + case Instruction::InsertElement: + I = InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "constexpr", + InsertBB); + break; + case Instruction::ShuffleVector: + I = new ShuffleVectorInst(Ops[0], Ops[1], Ops[2], "constexpr", + InsertBB); + break; + default: + llvm_unreachable("Unhandled bitcode constant"); + } + } + + MaterializedValues.insert({ValID, I}); + Worklist.pop_back(); + } + + return MaterializedValues[StartValID]; +} + +Expected BitcodeReader::getValueForInitializer(unsigned ID) { + Expected MaybeV = materializeValue(ID, /* InsertBB */ nullptr); + if (!MaybeV) + return MaybeV.takeError(); + + // Result must be Constant if InsertBB is nullptr. + return cast(MaybeV.get()); +} + StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context, StringRef Name) { auto *Ret = StructType::create(Context, Name); @@ -1346,7 +1804,7 @@ Error BitcodeReader::parseAttributeBlock() { case bitc::PARAMATTR_CODE_ENTRY_OLD: // ENTRY: [paramidx0, attr0, ...] // Deprecated, but still needed to read old bitcode files. 
if (Record.size() & 1) - return error("Invalid record"); + return error("Invalid parameter attribute record"); for (unsigned i = 0, e = Record.size(); i != e; i += 2) { AttrBuilder B(Context); @@ -1437,8 +1895,14 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Dereferenceable; case bitc::ATTR_KIND_DEREFERENCEABLE_OR_NULL: return Attribute::DereferenceableOrNull; + case bitc::ATTR_KIND_ALLOC_ALIGN: + return Attribute::AllocAlign; + case bitc::ATTR_KIND_ALLOC_KIND: + return Attribute::AllocKind; case bitc::ATTR_KIND_ALLOC_SIZE: return Attribute::AllocSize; + case bitc::ATTR_KIND_ALLOCATED_POINTER: + return Attribute::AllocatedPointer; case bitc::ATTR_KIND_NO_RED_ZONE: return Attribute::NoRedZone; case bitc::ATTR_KIND_NO_RETURN: @@ -1451,6 +1915,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoProfile; case bitc::ATTR_KIND_NO_UNWIND: return Attribute::NoUnwind; + case bitc::ATTR_KIND_NO_SANITIZE_BOUNDS: + return Attribute::NoSanitizeBounds; case bitc::ATTR_KIND_NO_SANITIZE_COVERAGE: return Attribute::NoSanitizeCoverage; case bitc::ATTR_KIND_NULL_POINTER_IS_VALID: @@ -1529,6 +1995,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::MustProgress; case bitc::ATTR_KIND_HOT: return Attribute::Hot; + case bitc::ATTR_KIND_PRESPLIT_COROUTINE: + return Attribute::PresplitCoroutine; } } @@ -1586,7 +2054,7 @@ Error BitcodeReader::parseAttributeGroupBlock() { break; case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid grp record"); uint64_t GrpID = Record[0]; uint64_t Idx = Record[1]; // Index of the object this attribute refers to. @@ -1607,6 +2075,8 @@ Error BitcodeReader::parseAttributeGroupBlock() { B.addStructRetAttr(nullptr); else if (Kind == Attribute::InAlloca) B.addInAllocaAttr(nullptr); + else if (Kind == Attribute::UWTable) + B.addUWTableAttr(UWTableKind::Default); else if (Attribute::isEnumAttrKind(Kind)) B.addAttribute(Kind); else @@ -1629,6 +2099,10 @@ Error BitcodeReader::parseAttributeGroupBlock() { B.addAllocSizeAttrFromRawRepr(Record[++i]); else if (Kind == Attribute::VScaleRange) B.addVScaleRangeAttrFromRawRepr(Record[++i]); + else if (Kind == Attribute::UWTable) + B.addUWTableAttr(UWTableKind(Record[++i])); + else if (Kind == Attribute::AllocKind) + B.addAllocKindAttr(static_cast(Record[++i])); } else if (Record[i] == 3 || Record[i] == 4) { // String attribute bool HasValue = (Record[i++] == 4); SmallString<64> KindStr; @@ -1647,9 +2121,7 @@ Error BitcodeReader::parseAttributeGroupBlock() { } B.addAttribute(KindStr.str(), ValStr.str()); - } else { - assert((Record[i] == 5 || Record[i] == 6) && - "Invalid attribute group entry"); + } else if (Record[i] == 5 || Record[i] == 6) { bool HasType = Record[i] == 6; Attribute::AttrKind Kind; if (Error Err = parseAttrKind(Record[++i], &Kind)) @@ -1658,6 +2130,8 @@ Error BitcodeReader::parseAttributeGroupBlock() { return error("Not a type attribute"); B.addTypeAttr(Kind, HasType ? getTypeByID(Record[++i]) : nullptr); + } else { + return error("Invalid attribute group entry"); } } @@ -1708,6 +2182,7 @@ Error BitcodeReader::parseTypeTableBody() { // Read a record. 
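// Editorial sketch (not part of the patch): the attribute-group dispatch
// earlier in this hunk keys on the leading value of each entry:
//   0 -> enum attribute              [0, kind]
//   1 -> integer attribute           [1, kind, value]
//   3 -> string attribute            [3, kind chars..., 0]
//   4 -> string attribute with value [4, kind chars..., 0, value chars..., 0]
//   5 -> type attribute, no type     [5, kind]
//   6 -> type attribute with type    [6, kind, typeid]
// Any other tag is now a proper "Invalid attribute group entry" error rather
// than an assertion failure on malformed bitcode.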
Record.clear(); Type *ResultTy = nullptr; + SmallVector ContainedIDs; Expected MaybeRecord = Stream.readRecord(Entry.ID, Record); if (!MaybeRecord) return MaybeRecord.takeError(); @@ -1718,7 +2193,7 @@ Error BitcodeReader::parseTypeTableBody() { // TYPE_CODE_NUMENTRY contains a count of the number of types in the // type list. This allows us to reserve space. if (Record.empty()) - return error("Invalid record"); + return error("Invalid numentry record"); TypeList.resize(Record[0]); continue; case bitc::TYPE_CODE_VOID: // VOID @@ -1762,7 +2237,7 @@ Error BitcodeReader::parseTypeTableBody() { break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] if (Record.empty()) - return error("Invalid record"); + return error("Invalid integer record"); uint64_t NumBits = Record[0]; if (NumBits < IntegerType::MIN_INT_BITS || @@ -1774,7 +2249,7 @@ Error BitcodeReader::parseTypeTableBody() { case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or // [pointee type, address space] if (Record.empty()) - return error("Invalid record"); + return error("Invalid pointer record"); unsigned AddressSpace = 0; if (Record.size() == 2) AddressSpace = Record[1]; @@ -1782,13 +2257,18 @@ Error BitcodeReader::parseTypeTableBody() { if (!ResultTy || !PointerType::isValidElementType(ResultTy)) return error("Invalid type"); + if (LLVM_UNLIKELY(!Context.hasSetOpaquePointersValue())) + Context.setOpaquePointers(false); + ContainedIDs.push_back(Record[0]); ResultTy = PointerType::get(ResultTy, AddressSpace); break; } case bitc::TYPE_CODE_OPAQUE_POINTER: { // OPAQUE_POINTER: [addrspace] if (Record.size() != 1) - return error("Invalid record"); - if (Context.supportsTypedPointers()) + return error("Invalid opaque pointer record"); + if (LLVM_UNLIKELY(!Context.hasSetOpaquePointersValue())) { + Context.setOpaquePointers(true); + } else if (Context.supportsTypedPointers()) return error( "Opaque pointers are only supported in -opaque-pointers mode"); unsigned AddressSpace = Record[0]; @@ -1799,7 +2279,7 @@ Error BitcodeReader::parseTypeTableBody() { // Deprecated, but still needed to read old bitcode files. 
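// Editorial sketch (not part of the patch): the two pointer records above
// also pin the context's pointer mode on first use, schematically:
//   TYPE_CODE_POINTER        [pointee typeid, addrspace] -> typed mode (i8*)
//   TYPE_CODE_OPAQUE_POINTER [addrspace]                 -> opaque mode (ptr)
// An OPAQUE_POINTER record in a context already committed to typed pointers
// is the "Opaque pointers are only supported in -opaque-pointers mode" error;
// the reverse direction is upgraded silently, with the pointee type id kept
// in ContainedTypeIDs.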
// FUNCTION: [vararg, attrid, retty, paramty x N] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid function record"); SmallVector ArgTys; for (unsigned i = 3, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) @@ -1812,13 +2292,14 @@ Error BitcodeReader::parseTypeTableBody() { if (!ResultTy || ArgTys.size() < Record.size()-3) return error("Invalid type"); + ContainedIDs.append(Record.begin() + 2, Record.end()); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); break; } case bitc::TYPE_CODE_FUNCTION: { // FUNCTION: [vararg, retty, paramty x N] if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid function record"); SmallVector ArgTys; for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) { @@ -1834,12 +2315,13 @@ Error BitcodeReader::parseTypeTableBody() { if (!ResultTy || ArgTys.size() < Record.size()-2) return error("Invalid type"); + ContainedIDs.append(Record.begin() + 1, Record.end()); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); break; } case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] if (Record.empty()) - return error("Invalid record"); + return error("Invalid anon struct record"); SmallVector EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) @@ -1849,17 +2331,18 @@ Error BitcodeReader::parseTypeTableBody() { } if (EltTys.size() != Record.size()-1) return error("Invalid type"); + ContainedIDs.append(Record.begin() + 1, Record.end()); ResultTy = StructType::get(Context, EltTys, Record[0]); break; } case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N] if (convertToString(Record, 0, TypeName)) - return error("Invalid record"); + return error("Invalid struct name record"); continue; case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N] if (Record.empty()) - return error("Invalid record"); + return error("Invalid named struct record"); if (NumRecords >= TypeList.size()) return error("Invalid TYPE table"); @@ -1881,14 +2364,15 @@ Error BitcodeReader::parseTypeTableBody() { break; } if (EltTys.size() != Record.size()-1) - return error("Invalid record"); + return error("Invalid named struct record"); Res->setBody(EltTys, Record[0]); + ContainedIDs.append(Record.begin() + 1, Record.end()); ResultTy = Res; break; } case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: [] if (Record.size() != 1) - return error("Invalid record"); + return error("Invalid opaque type record"); if (NumRecords >= TypeList.size()) return error("Invalid TYPE table"); @@ -1906,22 +2390,24 @@ Error BitcodeReader::parseTypeTableBody() { } case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty] if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid array type record"); ResultTy = getTypeByID(Record[1]); if (!ResultTy || !ArrayType::isValidElementType(ResultTy)) return error("Invalid type"); + ContainedIDs.push_back(Record[1]); ResultTy = ArrayType::get(ResultTy, Record[0]); break; case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty] or // [numelts, eltty, scalable] if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid vector type record"); if (Record[0] == 0) return error("Invalid vector length"); ResultTy = getTypeByID(Record[1]); if (!ResultTy || !VectorType::isValidElementType(ResultTy)) return error("Invalid type"); bool Scalable = Record.size() > 2 ? 
Record[2] : false; + ContainedIDs.push_back(Record[1]); ResultTy = VectorType::get(ResultTy, Record[0], Scalable); break; } @@ -1932,7 +2418,10 @@ Error BitcodeReader::parseTypeTableBody() { return error( "Invalid TYPE table: Only named structs can be forward referenced"); assert(ResultTy && "Didn't read a type?"); - TypeList[NumRecords++] = ResultTy; + TypeList[NumRecords] = ResultTy; + if (!ContainedIDs.empty()) + ContainedTypeIDs[NumRecords] = std::move(ContainedIDs); + ++NumRecords; } } @@ -1968,12 +2457,12 @@ Error BitcodeReader::parseOperandBundleTags() { if (!MaybeRecord) return MaybeRecord.takeError(); if (MaybeRecord.get() != bitc::OPERAND_BUNDLE_TAG) - return error("Invalid record"); + return error("Invalid operand bundle record"); // OPERAND_BUNDLE_TAG: [strchr x N] BundleTags.emplace_back(); if (convertToString(Record, 0, BundleTags.back())) - return error("Invalid record"); + return error("Invalid operand bundle record"); Record.clear(); } } @@ -2012,11 +2501,11 @@ Error BitcodeReader::parseSyncScopeNames() { if (!MaybeRecord) return MaybeRecord.takeError(); if (MaybeRecord.get() != bitc::SYNC_SCOPE_NAME) - return error("Invalid record"); + return error("Invalid sync scope record"); SmallString<16> SSN; if (convertToString(Record, 0, SSN)) - return error("Invalid record"); + return error("Invalid sync scope record"); SSIDs.push_back(Context.getOrInsertSyncScopeID(SSN)); Record.clear(); @@ -2056,8 +2545,9 @@ static Expected jumpToValueSymbolTable(uint64_t Offset, Expected MaybeEntry = Stream.advance(); if (!MaybeEntry) return MaybeEntry.takeError(); - assert(MaybeEntry.get().Kind == BitstreamEntry::SubBlock); - assert(MaybeEntry.get().ID == bitc::VALUE_SYMTAB_BLOCK_ID); + if (MaybeEntry.get().Kind != BitstreamEntry::SubBlock || + MaybeEntry.get().ID != bitc::VALUE_SYMTAB_BLOCK_ID) + return error("Expected value symbol table subblock"); return CurrentBit; } @@ -2107,11 +2597,15 @@ Error BitcodeReader::parseGlobalValueSymbolTable() { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - case bitc::VST_CODE_FNENTRY: // [valueid, offset] + case bitc::VST_CODE_FNENTRY: { // [valueid, offset] + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size() || !ValueList[ValueID]) + return error("Invalid value reference in symbol table"); setDeferredFunctionInfo(FuncBitcodeOffsetDelta, - cast(ValueList[Record[0]]), Record); + cast(ValueList[ValueID]), Record); break; } + } } } @@ -2213,10 +2707,10 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) { } case bitc::VST_CODE_BBENTRY: { if (convertToString(Record, 1, ValueName)) - return error("Invalid record"); + return error("Invalid bbentry record"); BasicBlock *BB = getBasicBlock(Record[0]); if (!BB) - return error("Invalid record"); + return error("Invalid bbentry record"); BB->setName(StringRef(ValueName.data(), ValueName.size())); ValueName.clear(); @@ -2253,10 +2747,10 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() { // Not ready to resolve this yet, it requires something later in the file. 
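// Editorial sketch (not part of the patch): in the type-table hunk above,
// ContainedTypeIDs mirrors Type::subtypes() per table slot so that pointee
// information survives the opaque-pointer upgrade. For a hypothetical slot N
// holding { i32, i8* }:
//   TypeList[N]         = the struct type
//   ContainedTypeIDs[N] = { id(i32), id(i8*) }
// getContainedTypeID(N, 1) recovers id(i8*), and getPtrElementTypeByID() on
// that id recovers i8 even after the in-memory type has become a bare ptr.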
GlobalInits.push_back(GlobalInitWorklist.back()); } else { - if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - GlobalInitWorklist.back().first->setInitializer(C); - else - return error("Expected a constant"); + Expected MaybeC = getValueForInitializer(ValID); + if (!MaybeC) + return MaybeC.takeError(); + GlobalInitWorklist.back().first->setInitializer(MaybeC.get()); } GlobalInitWorklist.pop_back(); } @@ -2266,9 +2760,10 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() { if (ValID >= ValueList.size()) { IndirectSymbolInits.push_back(IndirectSymbolInitWorklist.back()); } else { - Constant *C = dyn_cast_or_null(ValueList[ValID]); - if (!C) - return error("Expected a constant"); + Expected MaybeC = getValueForInitializer(ValID); + if (!MaybeC) + return MaybeC.takeError(); + Constant *C = MaybeC.get(); GlobalValue *GV = IndirectSymbolInitWorklist.back().first; if (auto *GA = dyn_cast(GV)) { if (C->getType() != GV->getType()) @@ -2292,30 +2787,30 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() { if (Info.PersonalityFn) { unsigned ValID = Info.PersonalityFn - 1; if (ValID < ValueList.size()) { - if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - Info.F->setPersonalityFn(C); - else - return error("Expected a constant"); + Expected MaybeC = getValueForInitializer(ValID); + if (!MaybeC) + return MaybeC.takeError(); + Info.F->setPersonalityFn(MaybeC.get()); Info.PersonalityFn = 0; } } if (Info.Prefix) { unsigned ValID = Info.Prefix - 1; if (ValID < ValueList.size()) { - if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - Info.F->setPrefixData(C); - else - return error("Expected a constant"); + Expected MaybeC = getValueForInitializer(ValID); + if (!MaybeC) + return MaybeC.takeError(); + Info.F->setPrefixData(MaybeC.get()); Info.Prefix = 0; } } if (Info.Prologue) { unsigned ValID = Info.Prologue - 1; if (ValID < ValueList.size()) { - if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - Info.F->setPrologueData(C); - else - return error("Expected a constant"); + Expected MaybeC = getValueForInitializer(ValID); + if (!MaybeC) + return MaybeC.takeError(); + Info.F->setPrologueData(MaybeC.get()); Info.Prologue = 0; } } @@ -2343,26 +2838,11 @@ Error BitcodeReader::parseConstants() { // Read all the records for this value table. Type *CurTy = Type::getInt32Ty(Context); + unsigned Int32TyID = getVirtualTypeID(CurTy); + unsigned CurTyID = Int32TyID; + Type *CurElemTy = nullptr; unsigned NextCstNo = ValueList.size(); - struct DelayedShufTy { - VectorType *OpTy; - VectorType *RTy; - uint64_t Op0Idx; - uint64_t Op1Idx; - uint64_t Op2Idx; - unsigned CstNo; - }; - std::vector DelayedShuffles; - struct DelayedSelTy { - Type *OpTy; - uint64_t Op0Idx; - uint64_t Op1Idx; - uint64_t Op2Idx; - unsigned CstNo; - }; - std::vector DelayedSelectors; - while (true) { Expected MaybeEntry = Stream.advanceSkippingSubblocks(); if (!MaybeEntry) @@ -2374,57 +2854,8 @@ Error BitcodeReader::parseConstants() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Once all the constants have been read, go through and resolve forward - // references. - // - // We have to treat shuffles specially because they don't have three - // operands anymore. We need to convert the shuffle mask into an array, - // and we can't convert a forward reference. 
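// Editorial note (not part of the patch): the DelayedShuffles and
// DelayedSelectors bookkeeping deleted below is subsumed by BitcodeConstant,
// which records every constant expression uniformly as opcode plus operand
// value IDs, e.g.
//   V = BitcodeConstant::create(Alloc, CurTy, Instruction::ShuffleVector,
//                               {Op0ID, Op1ID, MaskID});
// and leaves all resolution, including forward references, to
// materializeValue().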
- for (auto &DelayedShuffle : DelayedShuffles) { - VectorType *OpTy = DelayedShuffle.OpTy; - VectorType *RTy = DelayedShuffle.RTy; - uint64_t Op0Idx = DelayedShuffle.Op0Idx; - uint64_t Op1Idx = DelayedShuffle.Op1Idx; - uint64_t Op2Idx = DelayedShuffle.Op2Idx; - uint64_t CstNo = DelayedShuffle.CstNo; - Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy); - Type *ShufTy = - VectorType::get(Type::getInt32Ty(Context), RTy->getElementCount()); - Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, ShufTy); - if (!ShuffleVectorInst::isValidOperands(Op0, Op1, Op2)) - return error("Invalid shufflevector operands"); - SmallVector Mask; - ShuffleVectorInst::getShuffleMask(Op2, Mask); - Value *V = ConstantExpr::getShuffleVector(Op0, Op1, Mask); - ValueList.assignValue(V, CstNo); - } - for (auto &DelayedSelector : DelayedSelectors) { - Type *OpTy = DelayedSelector.OpTy; - Type *SelectorTy = Type::getInt1Ty(Context); - uint64_t Op0Idx = DelayedSelector.Op0Idx; - uint64_t Op1Idx = DelayedSelector.Op1Idx; - uint64_t Op2Idx = DelayedSelector.Op2Idx; - uint64_t CstNo = DelayedSelector.CstNo; - Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy); - Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, OpTy); - // The selector might be an i1 or an - // Get the type from the ValueList before getting a forward ref. - if (VectorType *VTy = dyn_cast(OpTy)) { - Value *V = ValueList[Op0Idx]; - assert(V); - if (SelectorTy != V->getType()) - SelectorTy = VectorType::get(SelectorTy, VTy->getElementCount()); - } - Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, SelectorTy); - Value *V = ConstantExpr::getSelect(Op0, Op1, Op2); - ValueList.assignValue(V, CstNo); - } - if (NextCstNo != ValueList.size()) return error("Invalid constant reference"); - - ValueList.resolveConstantForwardRefs(); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -2448,12 +2879,14 @@ Error BitcodeReader::parseConstants() { break; case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid] if (Record.empty()) - return error("Invalid record"); + return error("Invalid settype record"); if (Record[0] >= TypeList.size() || !TypeList[Record[0]]) - return error("Invalid record"); + return error("Invalid settype record"); if (TypeList[Record[0]] == VoidType) return error("Invalid constant type"); - CurTy = TypeList[Record[0]]; + CurTyID = Record[0]; + CurTy = TypeList[CurTyID]; + CurElemTy = getPtrElementTypeByID(CurTyID); continue; // Skip the ValueList manipulation. 
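// Editorial sketch (not part of the patch): CST_CODE_SETTYPE switches the
// implicit type for the records that follow it, so a constants block decodes
// roughly as
//   SETTYPE(i32) INTEGER(1) INTEGER(2) SETTYPE(float) FLOAT(0x3F800000)
// with CurTy/CurTyID (and CurElemTy for pointer types) threaded through
// rather than a type being re-encoded on every constant.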
case bitc::CST_CODE_NULL: // NULL if (CurTy->isVoidTy() || CurTy->isFunctionTy() || CurTy->isLabelTy()) @@ -2462,12 +2895,12 @@ Error BitcodeReader::parseConstants() { break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] if (!CurTy->isIntegerTy() || Record.empty()) - return error("Invalid record"); + return error("Invalid integer const record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] if (!CurTy->isIntegerTy() || Record.empty()) - return error("Invalid record"); + return error("Invalid wide integer const record"); APInt VInt = readWideAPInt(Record, cast(CurTy)->getBitWidth()); @@ -2477,7 +2910,7 @@ Error BitcodeReader::parseConstants() { } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) - return error("Invalid record"); + return error("Invalid float const record"); if (CurTy->isHalfTy()) V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(), APInt(16, (uint16_t)Record[0]))); @@ -2510,26 +2943,22 @@ Error BitcodeReader::parseConstants() { case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number] if (Record.empty()) - return error("Invalid record"); + return error("Invalid aggregate record"); unsigned Size = Record.size(); - SmallVector Elts; - - if (StructType *STy = dyn_cast(CurTy)) { - for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], - STy->getElementType(i))); - V = ConstantStruct::get(STy, Elts); - } else if (ArrayType *ATy = dyn_cast(CurTy)) { - Type *EltTy = ATy->getElementType(); - for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); - V = ConstantArray::get(ATy, Elts); - } else if (VectorType *VTy = dyn_cast(CurTy)) { - Type *EltTy = VTy->getElementType(); - for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); - V = ConstantVector::get(Elts); + SmallVector Elts; + for (unsigned i = 0; i != Size; ++i) + Elts.push_back(Record[i]); + + if (isa(CurTy)) { + V = BitcodeConstant::create( + Alloc, CurTy, BitcodeConstant::ConstantStructOpcode, Elts); + } else if (isa(CurTy)) { + V = BitcodeConstant::create(Alloc, CurTy, + BitcodeConstant::ConstantArrayOpcode, Elts); + } else if (isa(CurTy)) { + V = BitcodeConstant::create( + Alloc, CurTy, BitcodeConstant::ConstantVectorOpcode, Elts); } else { V = UndefValue::get(CurTy); } @@ -2538,7 +2967,7 @@ Error BitcodeReader::parseConstants() { case bitc::CST_CODE_STRING: // STRING: [values] case bitc::CST_CODE_CSTRING: { // CSTRING: [values] if (Record.empty()) - return error("Invalid record"); + return error("Invalid string record"); SmallString<16> Elts(Record.begin(), Record.end()); V = ConstantDataArray::getString(Context, Elts, @@ -2547,7 +2976,7 @@ Error BitcodeReader::parseConstants() { } case bitc::CST_CODE_DATA: {// DATA: [n x value] if (Record.empty()) - return error("Invalid record"); + return error("Invalid data record"); Type *EltTy; if (auto *Array = dyn_cast(CurTy)) @@ -2609,27 +3038,23 @@ Error BitcodeReader::parseConstants() { } case bitc::CST_CODE_CE_UNOP: { // CE_UNOP: [opcode, opval] if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid unary op constexpr record"); int Opc = getDecodedUnaryOpcode(Record[0], CurTy); if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown unop. 
} else { - Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); - unsigned Flags = 0; - V = ConstantExpr::get(Opc, LHS, Flags); + V = BitcodeConstant::create(Alloc, CurTy, Opc, (unsigned)Record[1]); } break; } case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid binary op constexpr record"); int Opc = getDecodedBinaryOpcode(Record[0], CurTy); if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown binop. } else { - Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); - Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy); - unsigned Flags = 0; + uint8_t Flags = 0; if (Record.size() >= 4) { if (Opc == Instruction::Add || Opc == Instruction::Sub || @@ -2647,23 +3072,23 @@ Error BitcodeReader::parseConstants() { Flags |= SDivOperator::IsExact; } } - V = ConstantExpr::get(Opc, LHS, RHS, Flags); + V = BitcodeConstant::create(Alloc, CurTy, {(uint8_t)Opc, Flags}, + {(unsigned)Record[1], (unsigned)Record[2]}); } break; } case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid cast constexpr record"); int Opc = getDecodedCastOpcode(Record[0]); if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown cast. } else { - Type *OpTy = getTypeByID(Record[1]); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); if (!OpTy) - return error("Invalid record"); - Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy); - V = UpgradeBitCastExpr(Opc, Op, CurTy); - if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy); + return error("Invalid cast constexpr record"); + V = BitcodeConstant::create(Alloc, CurTy, Opc, (unsigned)Record[2]); } break; } @@ -2671,6 +3096,8 @@ Error BitcodeReader::parseConstants() { case bitc::CST_CODE_CE_GEP: // [ty, n x operands] case bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX: { // [ty, flags, n x // operands] + if (Record.size() < 2) + return error("Constant GEP record must have at least two elements"); unsigned OpNum = 0; Type *PointeeType = nullptr; if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX || @@ -2686,180 +3113,190 @@ Error BitcodeReader::parseConstants() { } else if (BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP) InBounds = true; - SmallVector Elts; - Type *Elt0FullTy = nullptr; + SmallVector Elts; + unsigned BaseTypeID = Record[OpNum]; while (OpNum != Record.size()) { - if (!Elt0FullTy) - Elt0FullTy = getTypeByID(Record[OpNum]); - Type *ElTy = getTypeByID(Record[OpNum++]); + unsigned ElTyID = Record[OpNum++]; + Type *ElTy = getTypeByID(ElTyID); if (!ElTy) - return error("Invalid record"); - Elts.push_back(ValueList.getConstantFwdRef(Record[OpNum++], ElTy)); + return error("Invalid getelementptr constexpr record"); + Elts.push_back(Record[OpNum++]); } if (Elts.size() < 1) return error("Invalid gep with no operands"); - PointerType *OrigPtrTy = cast(Elt0FullTy->getScalarType()); - if (!PointeeType) - PointeeType = OrigPtrTy->getPointerElementType(); - else if (!OrigPtrTy->isOpaqueOrPointeeTypeMatches(PointeeType)) + Type *BaseType = getTypeByID(BaseTypeID); + if (isa(BaseType)) { + BaseTypeID = getContainedTypeID(BaseTypeID, 0); + BaseType = getTypeByID(BaseTypeID); + } + + PointerType *OrigPtrTy = dyn_cast_or_null(BaseType); + if (!OrigPtrTy) + return error("GEP base operand must be pointer or vector of pointer"); + + if (!PointeeType) { + PointeeType = getPtrElementTypeByID(BaseTypeID); + if (!PointeeType) + return error("Missing element type for 
old-style constant GEP"); + } else if (!OrigPtrTy->isOpaqueOrPointeeTypeMatches(PointeeType)) return error("Explicit gep operator type does not match pointee type " "of pointer operand"); - ArrayRef Indices(Elts.begin() + 1, Elts.end()); - V = ConstantExpr::getGetElementPtr(PointeeType, Elts[0], Indices, - InBounds, InRangeIndex); + V = BitcodeConstant::create(Alloc, CurTy, + {Instruction::GetElementPtr, InBounds, + InRangeIndex.value_or(-1), PointeeType}, + Elts); break; } case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid select constexpr record"); - DelayedSelectors.push_back( - {CurTy, Record[0], Record[1], Record[2], NextCstNo}); - (void)ValueList.getConstantFwdRef(NextCstNo, CurTy); - ++NextCstNo; - continue; + V = BitcodeConstant::create( + Alloc, CurTy, Instruction::Select, + {(unsigned)Record[0], (unsigned)Record[1], (unsigned)Record[2]}); + break; } case bitc::CST_CODE_CE_EXTRACTELT : { // CE_EXTRACTELT: [opty, opval, opty, opval] if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid extractelement constexpr record"); + unsigned OpTyID = Record[0]; VectorType *OpTy = - dyn_cast_or_null(getTypeByID(Record[0])); + dyn_cast_or_null(getTypeByID(OpTyID)); if (!OpTy) - return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); - Constant *Op1 = nullptr; + return error("Invalid extractelement constexpr record"); + unsigned IdxRecord; if (Record.size() == 4) { - Type *IdxTy = getTypeByID(Record[2]); + unsigned IdxTyID = Record[2]; + Type *IdxTy = getTypeByID(IdxTyID); if (!IdxTy) - return error("Invalid record"); - Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy); + return error("Invalid extractelement constexpr record"); + IdxRecord = Record[3]; } else { // Deprecated, but still needed to read old bitcode files. - Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + IdxRecord = Record[2]; } - if (!Op1) - return error("Invalid record"); - V = ConstantExpr::getExtractElement(Op0, Op1); + V = BitcodeConstant::create(Alloc, CurTy, Instruction::ExtractElement, + {(unsigned)Record[1], IdxRecord}); break; } case bitc::CST_CODE_CE_INSERTELT : { // CE_INSERTELT: [opval, opval, opty, opval] VectorType *OpTy = dyn_cast(CurTy); if (Record.size() < 3 || !OpTy) - return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Record[1], - OpTy->getElementType()); - Constant *Op2 = nullptr; + return error("Invalid insertelement constexpr record"); + unsigned IdxRecord; if (Record.size() == 4) { - Type *IdxTy = getTypeByID(Record[2]); + unsigned IdxTyID = Record[2]; + Type *IdxTy = getTypeByID(IdxTyID); if (!IdxTy) - return error("Invalid record"); - Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy); + return error("Invalid insertelement constexpr record"); + IdxRecord = Record[3]; } else { // Deprecated, but still needed to read old bitcode files. 
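// The BitcodeConstant::create calls introduced throughout this function are
// the core of the rewrite: instead of eagerly building ConstantExprs through
// getConstantFwdRef, each CE_* record now just captures an opcode plus operand
// value IDs, and the expression is only expanded once the constant is actually
// materialized. A deliberately simplified model of the placeholder (the real
// node is a Value subclass allocated from the BumpPtrAllocator passed in as
// Alloc, with the operand IDs stored inline):
#include <cstdint>
#include <vector>

struct LazyConstantSketch {
  uint8_t Opcode;              // Instruction opcode, or a custom marker such
                               // as ConstantStructOpcode or BlockAddressOpcode.
  uint8_t Flags;               // e.g. nuw/nsw/exact bits for binops.
  int ExtraInfo;               // e.g. the GEP in-range index recorded above.
  std::vector<unsigned> OpIDs; // Operands by value ID; may point forward.
};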
- Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + IdxRecord = Record[2]; } - if (!Op2) - return error("Invalid record"); - V = ConstantExpr::getInsertElement(Op0, Op1, Op2); + V = BitcodeConstant::create( + Alloc, CurTy, Instruction::InsertElement, + {(unsigned)Record[0], (unsigned)Record[1], IdxRecord}); break; } case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval] VectorType *OpTy = dyn_cast<VectorType>(CurTy); if (Record.size() < 3 || !OpTy) - return error("Invalid record"); - DelayedShuffles.push_back( - {OpTy, OpTy, Record[0], Record[1], Record[2], NextCstNo}); - ++NextCstNo; - continue; + return error("Invalid shufflevector constexpr record"); + V = BitcodeConstant::create( + Alloc, CurTy, Instruction::ShuffleVector, + {(unsigned)Record[0], (unsigned)Record[1], (unsigned)Record[2]}); + break; } case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval] VectorType *RTy = dyn_cast<VectorType>(CurTy); VectorType *OpTy = dyn_cast_or_null<VectorType>(getTypeByID(Record[0])); if (Record.size() < 4 || !RTy || !OpTy) - return error("Invalid record"); - DelayedShuffles.push_back( - {OpTy, RTy, Record[1], Record[2], Record[3], NextCstNo}); - ++NextCstNo; - continue; + return error("Invalid shufflevector constexpr record"); + V = BitcodeConstant::create( + Alloc, CurTy, Instruction::ShuffleVector, + {(unsigned)Record[1], (unsigned)Record[2], (unsigned)Record[3]}); + break; } case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred] if (Record.size() < 4) - return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); + return error("Invalid cmp constexpr record"); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); if (!OpTy) - return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); - - if (OpTy->isFPOrFPVectorTy()) - V = ConstantExpr::getFCmp(Record[3], Op0, Op1); - else - V = ConstantExpr::getICmp(Record[3], Op0, Op1); + return error("Invalid cmp constexpr record"); + V = BitcodeConstant::create( + Alloc, CurTy, + {(uint8_t)(OpTy->isFPOrFPVectorTy() ? Instruction::FCmp + : Instruction::ICmp), + (uint8_t)Record[3]}, + {(unsigned)Record[1], (unsigned)Record[2]}); break; } // This maintains backward compatibility, pre-asm dialect keywords. // Deprecated, but still needed to read old bitcode files. case bitc::CST_CODE_INLINEASM_OLD: { if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid inlineasm record"); std::string AsmStr, ConstrStr; bool HasSideEffects = Record[0] & 1; bool IsAlignStack = Record[0] >> 1; unsigned AsmStrSize = Record[1]; if (2+AsmStrSize >= Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned ConstStrSize = Record[2+AsmStrSize]; if (3+AsmStrSize+ConstStrSize > Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[2+i]; for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; UpgradeInlineAsmString(&AsmStr); - // FIXME: support upgrading in opaque pointers mode.
- V = InlineAsm::get(cast(CurTy->getPointerElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack); + if (!CurElemTy) + return error("Missing element type for old-style inlineasm"); + V = InlineAsm::get(cast(CurElemTy), AsmStr, ConstrStr, + HasSideEffects, IsAlignStack); break; } // This version adds support for the asm dialect keywords (e.g., // inteldialect). case bitc::CST_CODE_INLINEASM_OLD2: { if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid inlineasm record"); std::string AsmStr, ConstrStr; bool HasSideEffects = Record[0] & 1; bool IsAlignStack = (Record[0] >> 1) & 1; unsigned AsmDialect = Record[0] >> 2; unsigned AsmStrSize = Record[1]; if (2+AsmStrSize >= Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned ConstStrSize = Record[2+AsmStrSize]; if (3+AsmStrSize+ConstStrSize > Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[2+i]; for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; UpgradeInlineAsmString(&AsmStr); - // FIXME: support upgrading in opaque pointers mode. - V = InlineAsm::get(cast(CurTy->getPointerElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack, + if (!CurElemTy) + return error("Missing element type for old-style inlineasm"); + V = InlineAsm::get(cast(CurElemTy), AsmStr, ConstrStr, + HasSideEffects, IsAlignStack, InlineAsm::AsmDialect(AsmDialect)); break; } // This version adds support for the unwind keyword. case bitc::CST_CODE_INLINEASM_OLD3: { if (Record.size() < 2) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned OpNum = 0; std::string AsmStr, ConstrStr; bool HasSideEffects = Record[OpNum] & 1; @@ -2870,10 +3307,10 @@ Error BitcodeReader::parseConstants() { unsigned AsmStrSize = Record[OpNum]; ++OpNum; if (OpNum + AsmStrSize >= Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned ConstStrSize = Record[OpNum + AsmStrSize]; if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[OpNum + i]; @@ -2881,21 +3318,22 @@ Error BitcodeReader::parseConstants() { for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[OpNum + AsmStrSize + i]; UpgradeInlineAsmString(&AsmStr); - // FIXME: support upgrading in opaque pointers mode. - V = InlineAsm::get(cast(CurTy->getPointerElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack, + if (!CurElemTy) + return error("Missing element type for old-style inlineasm"); + V = InlineAsm::get(cast(CurElemTy), AsmStr, ConstrStr, + HasSideEffects, IsAlignStack, InlineAsm::AsmDialect(AsmDialect), CanThrow); break; } // This version adds explicit function type. 
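// All four INLINEASM record flavors unpack the same packed flags word, with
// each newer flavor defining one more bit. The layout, as decoded by the
// handlers above (struct and function names here are illustrative only):
#include <cstdint>

struct InlineAsmFlagsSketch {
  bool HasSideEffects;  // bit 0
  bool IsAlignStack;    // bit 1
  unsigned AsmDialect;  // bit 2: 0 = AT&T, 1 = Intel (OLD2 and later)
  bool CanThrow;        // bit 3: the unwind keyword (OLD3 and later)
};

static InlineAsmFlagsSketch unpackInlineAsmFlags(uint64_t V) {
  return {bool(V & 1), bool((V >> 1) & 1), unsigned((V >> 2) & 1),
          bool((V >> 3) & 1)};
}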
case bitc::CST_CODE_INLINEASM: { if (Record.size() < 3) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned OpNum = 0; auto *FnTy = dyn_cast_or_null(getTypeByID(Record[OpNum])); ++OpNum; if (!FnTy) - return error("Invalid record"); + return error("Invalid inlineasm record"); std::string AsmStr, ConstrStr; bool HasSideEffects = Record[OpNum] & 1; bool IsAlignStack = (Record[OpNum] >> 1) & 1; @@ -2905,10 +3343,10 @@ Error BitcodeReader::parseConstants() { unsigned AsmStrSize = Record[OpNum]; ++OpNum; if (OpNum + AsmStrSize >= Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); unsigned ConstStrSize = Record[OpNum + AsmStrSize]; if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size()) - return error("Invalid record"); + return error("Invalid inlineasm record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[OpNum + i]; @@ -2922,75 +3360,44 @@ Error BitcodeReader::parseConstants() { } case bitc::CST_CODE_BLOCKADDRESS:{ if (Record.size() < 3) - return error("Invalid record"); - Type *FnTy = getTypeByID(Record[0]); + return error("Invalid blockaddress record"); + unsigned FnTyID = Record[0]; + Type *FnTy = getTypeByID(FnTyID); if (!FnTy) - return error("Invalid record"); - Function *Fn = - dyn_cast_or_null(ValueList.getConstantFwdRef(Record[1],FnTy)); - if (!Fn) - return error("Invalid record"); - - // If the function is already parsed we can insert the block address right - // away. - BasicBlock *BB; - unsigned BBID = Record[2]; - if (!BBID) - // Invalid reference to entry block. - return error("Invalid ID"); - if (!Fn->empty()) { - Function::iterator BBI = Fn->begin(), BBE = Fn->end(); - for (size_t I = 0, E = BBID; I != E; ++I) { - if (BBI == BBE) - return error("Invalid ID"); - ++BBI; - } - BB = &*BBI; - } else { - // Otherwise insert a placeholder and remember it so it can be inserted - // when the function is parsed. 
- auto &FwdBBs = BasicBlockFwdRefs[Fn]; - if (FwdBBs.empty()) - BasicBlockFwdRefQueue.push_back(Fn); - if (FwdBBs.size() < BBID + 1) - FwdBBs.resize(BBID + 1); - if (!FwdBBs[BBID]) - FwdBBs[BBID] = BasicBlock::Create(Context); - BB = FwdBBs[BBID]; - } - V = BlockAddress::get(Fn, BB); + return error("Invalid blockaddress record"); + V = BitcodeConstant::create( + Alloc, CurTy, + {BitcodeConstant::BlockAddressOpcode, 0, (unsigned)Record[2]}, + Record[1]); break; } case bitc::CST_CODE_DSO_LOCAL_EQUIVALENT: { if (Record.size() < 2) - return error("Invalid record"); - Type *GVTy = getTypeByID(Record[0]); + return error("Invalid dso_local record"); + unsigned GVTyID = Record[0]; + Type *GVTy = getTypeByID(GVTyID); if (!GVTy) - return error("Invalid record"); - GlobalValue *GV = dyn_cast_or_null<GlobalValue>( - ValueList.getConstantFwdRef(Record[1], GVTy)); - if (!GV) - return error("Invalid record"); - - V = DSOLocalEquivalent::get(GV); + return error("Invalid dso_local record"); + V = BitcodeConstant::create( + Alloc, CurTy, BitcodeConstant::DSOLocalEquivalentOpcode, Record[1]); break; } case bitc::CST_CODE_NO_CFI_VALUE: { if (Record.size() < 2) - return error("Invalid no_cfi record"); - Type *GVTy = getTypeByID(Record[0]); + return error("Invalid no_cfi record"); + unsigned GVTyID = Record[0]; + Type *GVTy = getTypeByID(GVTyID); if (!GVTy) - return error("Invalid record"); - GlobalValue *GV = dyn_cast_or_null<GlobalValue>( - ValueList.getConstantFwdRef(Record[1], GVTy)); - if (!GV) - return error("Invalid record"); - V = NoCFIValue::get(GV); + return error("Invalid no_cfi record"); + V = BitcodeConstant::create(Alloc, CurTy, BitcodeConstant::NoCFIOpcode, + Record[1]); break; } } - ValueList.assignValue(V, NextCstNo); + assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID"); + if (Error Err = ValueList.assignValue(NextCstNo, V, CurTyID)) + return Err; ++NextCstNo; } } @@ -3146,7 +3553,7 @@ Error BitcodeReader::globalCleanup() { // Some types could be renamed during loading if several modules are // loaded in the same LLVMContext (LTO scenario). In this case we should // remangle intrinsic names as well. - RemangledIntrinsics[&F] = Remangled.getValue(); + RemangledIntrinsics[&F] = *Remangled; // Look for functions that rely on old function attribute behavior. UpgradeFunctionAttributes(F); } @@ -3211,17 +3618,17 @@ Error BitcodeReader::rememberAndSkipFunctionBodies() { } } -bool BitcodeReaderBase::readBlockInfo() { +Error BitcodeReaderBase::readBlockInfo() { Expected<Optional<BitstreamBlockInfo>> MaybeNewBlockInfo = Stream.ReadBlockInfoBlock(); if (!MaybeNewBlockInfo) - return true; // FIXME Handle the error.
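// The `return true` just above is replaced below by the llvm::Error idiom, so
// the real diagnostic from the bitstream is propagated instead of being
// flattened to a bool. The pattern in isolation (mayFail is an illustrative
// stand-in for Stream.ReadBlockInfoBlock):
#include "llvm/Support/Error.h"

llvm::Expected<int> mayFail();

llvm::Error useAndPropagate() {
  llvm::Expected<int> V = mayFail();
  if (!V)
    return V.takeError();      // Forward the underlying error to the caller.
  // ... use *V here ...
  return llvm::Error::success();
}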
+ return MaybeNewBlockInfo.takeError(); Optional<BitstreamBlockInfo> NewBlockInfo = std::move(MaybeNewBlockInfo.get()); if (!NewBlockInfo) - return true; + return error("Malformed block"); BlockInfo = std::move(*NewBlockInfo); - return false; + return Error::success(); } Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) { @@ -3238,6 +3645,8 @@ Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) { if (Record.size() < 2) return error("Invalid record"); unsigned ComdatNameSize = Record[1]; + if (ComdatNameSize > Record.size() - 2) + return error("Comdat name size too large"); OldFormatName.reserve(ComdatNameSize); for (unsigned i = 0; i != ComdatNameSize; ++i) OldFormatName += (char)Record[2 + i]; @@ -3256,6 +3665,19 @@ static void inferDSOLocal(GlobalValue *GV) { GV->setDSOLocal(true); } +GlobalValue::SanitizerMetadata deserializeSanitizerMetadata(unsigned V) { + GlobalValue::SanitizerMetadata Meta; + if (V & (1 << 0)) + Meta.NoAddress = true; + if (V & (1 << 1)) + Meta.NoHWAddress = true; + if (V & (1 << 2)) + Meta.NoMemtag = true; + if (V & (1 << 3)) + Meta.IsDynInit = true; + return Meta; +} + Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { // v1: [pointer type, isconst, initid, linkage, alignment, section, // visibility, threadlocal, unnamed_addr, externally_initialized, @@ -3267,7 +3689,8 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { if (Record.size() < 6) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (!Ty) return error("Invalid record"); bool isConstant = Record[1] & 1; @@ -3279,7 +3702,10 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { if (!Ty->isPointerTy()) return error("Invalid type for value"); AddressSpace = cast<PointerType>(Ty)->getAddressSpace(); - Ty = Ty->getPointerElementType(); + TyID = getContainedTypeID(TyID); + Ty = getTypeByID(TyID); + if (!Ty) + return error("Missing element type for old-style global"); } uint64_t RawLinkage = Record[3]; @@ -3325,7 +3751,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { else upgradeDLLImportExportLinkage(NewGV, RawLinkage); - ValueList.push_back(NewGV); + ValueList.push_back(NewGV, getVirtualTypeID(NewGV->getType(), TyID)); // Remember which value to use for the global initializer.
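// deserializeSanitizerMetadata above consumes one bit per flag from the new
// optional global-variable record field (Record[16] below). For reference, a
// writer-side packing would be the exact mirror image (sketch only; the
// helper name is illustrative):
#include "llvm/IR/GlobalValue.h"

static unsigned
packSanitizerMetadata(const llvm::GlobalValue::SanitizerMetadata &M) {
  return (M.NoAddress ? 1u << 0 : 0) | (M.NoHWAddress ? 1u << 1 : 0) |
         (M.NoMemtag ? 1u << 2 : 0) | (M.IsDynInit ? 1u << 3 : 0);
}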
if (unsigned InitID = Record[2]) @@ -3355,6 +3781,12 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { if (Record.size() > 15) NewGV->setPartition(StringRef(Strtab.data() + Record[14], Record[15])); + if (Record.size() > 16 && Record[16]) { + llvm::GlobalValue::SanitizerMetadata Meta = + deserializeSanitizerMetadata(Record[16]); + NewGV->setSanitizerMetadata(Meta); + } + return Error::success(); } @@ -3368,11 +3800,16 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { if (Record.size() < 8) return error("Invalid record"); - Type *FTy = getTypeByID(Record[0]); + unsigned FTyID = Record[0]; + Type *FTy = getTypeByID(FTyID); if (!FTy) return error("Invalid record"); - if (auto *PTy = dyn_cast(FTy)) - FTy = PTy->getPointerElementType(); + if (isa(FTy)) { + FTyID = getContainedTypeID(FTyID, 0); + FTy = getTypeByID(FTyID); + if (!FTy) + return error("Missing element type for old-style function"); + } if (!isa(FTy)) return error("Invalid type for value"); @@ -3390,7 +3827,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { assert(Func->getFunctionType() == FTy && "Incorrect fully specified type provided for function"); - FunctionTypes[Func] = cast(FTy); + FunctionTypeIDs[Func] = FTyID; Func->setCallingConv(CC); bool isProto = Record[2]; @@ -3412,8 +3849,11 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { Func->removeParamAttr(i, Kind); - Type *PTy = cast(FTy)->getParamType(i); - Type *PtrEltTy = PTy->getPointerElementType(); + unsigned ParamTypeID = getContainedTypeID(FTyID, i + 1); + Type *PtrEltTy = getPtrElementTypeByID(ParamTypeID); + if (!PtrEltTy) + return error("Missing param element type for attribute upgrade"); + Attribute NewAttr; switch (Kind) { case Attribute::ByVal: @@ -3433,6 +3873,16 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { } } + if (Func->getCallingConv() == CallingConv::X86_INTR && + !Func->arg_empty() && !Func->hasParamAttribute(0, Attribute::ByVal)) { + unsigned ParamTypeID = getContainedTypeID(FTyID, 1); + Type *ByValTy = getPtrElementTypeByID(ParamTypeID); + if (!ByValTy) + return error("Missing param element type for x86_intrcc upgrade"); + Attribute NewAttr = Attribute::getWithByValType(Context, ByValTy); + Func->addParamAttr(0, NewAttr); + } + MaybeAlign Alignment; if (Error Err = parseAlignmentValue(Record[5], Alignment)) return Err; @@ -3495,7 +3945,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { Func->setPartition(StringRef(Strtab.data() + Record[17], Record[18])); } - ValueList.push_back(Func); + ValueList.push_back(Func, getVirtualTypeID(Func->getType(), FTyID)); if (OperandInfo.PersonalityFn || OperandInfo.Prefix || OperandInfo.Prologue) FunctionOperands.push_back(OperandInfo); @@ -3527,7 +3977,8 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( if (Record.size() < (3 + (unsigned)NewRecord)) return error("Invalid record"); unsigned OpNum = 0; - Type *Ty = getTypeByID(Record[OpNum++]); + unsigned TypeID = Record[OpNum++]; + Type *Ty = getTypeByID(TypeID); if (!Ty) return error("Invalid record"); @@ -3536,8 +3987,11 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( auto *PTy = dyn_cast(Ty); if (!PTy) return error("Invalid type for value"); - Ty = PTy->getPointerElementType(); AddrSpace = PTy->getAddressSpace(); + TypeID = getContainedTypeID(TypeID); + Ty = getTypeByID(TypeID); + if (!Ty) + return error("Missing element type for old-style indirect symbol"); } else { AddrSpace = Record[OpNum++]; } @@ -3582,7 +4036,7 @@ Error 
BitcodeReader::parseGlobalIndirectSymbolRecord( OpNum += 2; } - ValueList.push_back(NewGA); + ValueList.push_back(NewGA, getVirtualTypeID(NewGA->getType(), TypeID)); IndirectSymbolInits.push_back(std::make_pair(NewGA, Val)); return Error::success(); } @@ -3639,8 +4093,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, return Err; break; case bitc::BLOCKINFO_BLOCK_ID: - if (readBlockInfo()) - return error("Malformed block"); + if (Error Err = readBlockInfo()) + return Err; break; case bitc::PARAMATTR_BLOCK_ID: if (Error Err = parseAttributeBlock()) @@ -3796,7 +4250,10 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); - TheModule->setDataLayout(S); + Expected MaybeDL = DataLayout::parse(S); + if (!MaybeDL) + return MaybeDL.takeError(); + TheModule->setDataLayout(MaybeDL.get()); break; } case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N] @@ -3894,18 +4351,20 @@ Error BitcodeReader::typeCheckLoadStoreInst(Type *ValType, Type *PtrType) { return Error::success(); } -void BitcodeReader::propagateAttributeTypes(CallBase *CB, - ArrayRef ArgsTys) { +Error BitcodeReader::propagateAttributeTypes(CallBase *CB, + ArrayRef ArgTyIDs) { + AttributeList Attrs = CB->getAttributes(); for (unsigned i = 0; i != CB->arg_size(); ++i) { for (Attribute::AttrKind Kind : {Attribute::ByVal, Attribute::StructRet, Attribute::InAlloca}) { - if (!CB->paramHasAttr(i, Kind) || - CB->getParamAttr(i, Kind).getValueAsType()) + if (!Attrs.hasParamAttr(i, Kind) || + Attrs.getParamAttr(i, Kind).getValueAsType()) continue; - CB->removeParamAttr(i, Kind); + Type *PtrEltTy = getPtrElementTypeByID(ArgTyIDs[i]); + if (!PtrEltTy) + return error("Missing element type for typed attribute upgrade"); - Type *PtrEltTy = ArgsTys[i]->getPointerElementType(); Attribute NewAttr; switch (Kind) { case Attribute::ByVal: @@ -3921,7 +4380,7 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB, llvm_unreachable("not an upgraded type attribute"); } - CB->addParamAttr(i, NewAttr); + Attrs = Attrs.addParamAttribute(Context, i, NewAttr); } } @@ -3932,10 +4391,13 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB, if (!CI.hasArg()) continue; - if (CI.isIndirect && !CB->getAttributes().getParamElementType(ArgNo)) { - Type *ElemTy = ArgsTys[ArgNo]->getPointerElementType(); - CB->addParamAttr( - ArgNo, Attribute::get(Context, Attribute::ElementType, ElemTy)); + if (CI.isIndirect && !Attrs.getParamElementType(ArgNo)) { + Type *ElemTy = getPtrElementTypeByID(ArgTyIDs[ArgNo]); + if (!ElemTy) + return error("Missing element type for inline asm upgrade"); + Attrs = Attrs.addParamAttribute( + Context, ArgNo, + Attribute::get(Context, Attribute::ElementType, ElemTy)); } ArgNo++; @@ -3945,15 +4407,41 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB, switch (CB->getIntrinsicID()) { case Intrinsic::preserve_array_access_index: case Intrinsic::preserve_struct_access_index: - if (!CB->getAttributes().getParamElementType(0)) { - Type *ElTy = ArgsTys[0]->getPointerElementType(); + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: + case Intrinsic::arm_ldaex: + case Intrinsic::arm_ldrex: + case Intrinsic::arm_stlex: + case Intrinsic::arm_strex: { + unsigned ArgNo; + switch (CB->getIntrinsicID()) { + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: + case Intrinsic::arm_stlex: + case Intrinsic::arm_strex: + ArgNo = 1; + break; + default: + ArgNo = 0; + break; + } 
+ if (!Attrs.getParamElementType(ArgNo)) { + Type *ElTy = getPtrElementTypeByID(ArgTyIDs[ArgNo]); + if (!ElTy) + return error("Missing element type for elementtype upgrade"); Attribute NewAttr = Attribute::get(Context, Attribute::ElementType, ElTy); - CB->addParamAttr(0, NewAttr); + Attrs = Attrs.addParamAttribute(Context, ArgNo, NewAttr); } break; + } default: break; } + + CB->setAttributes(Attrs); + return Error::success(); } /// Lazily parse the specified function body block. @@ -3970,18 +4458,24 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned ModuleMDLoaderSize = MDLoader->size(); // Add all the function arguments to the value table. -#ifndef NDEBUG unsigned ArgNo = 0; - FunctionType *FTy = FunctionTypes[F]; -#endif + unsigned FTyID = FunctionTypeIDs[F]; for (Argument &I : F->args()) { - assert(I.getType() == FTy->getParamType(ArgNo++) && + unsigned ArgTyID = getContainedTypeID(FTyID, ArgNo + 1); + assert(I.getType() == getTypeByID(ArgTyID) && "Incorrect fully specified type for Function Argument"); - ValueList.push_back(&I); + ValueList.push_back(&I, ArgTyID); + ++ArgNo; } unsigned NextValueNo = ValueList.size(); BasicBlock *CurBB = nullptr; unsigned CurBBNo = 0; + // Block into which constant expressions from phi nodes are materialized. + BasicBlock *PhiConstExprBB = nullptr; + // Edge blocks for phi nodes into which constant expressions have been + // expanded. + SmallMapVector<std::pair<BasicBlock *, BasicBlock *>, BasicBlock *, 4> + ConstExprEdgeBBs; DebugLoc LastLoc; auto getLastInstruction = [&]() -> Instruction * { @@ -4050,6 +4544,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // Read a record. Record.clear(); Instruction *I = nullptr; + unsigned ResTypeID = InvalidTypeID; Expected<unsigned> MaybeBitCode = Stream.readRecord(Entry.ID, Record); if (!MaybeBitCode) return MaybeBitCode.takeError(); @@ -4091,6 +4586,31 @@ Error BitcodeReader::parseFunctionBody(Function *F) { continue; } + case bitc::FUNC_CODE_BLOCKADDR_USERS: // BLOCKADDR_USERS: [vals...] + // The record should not be emitted if it's an empty list. + if (Record.empty()) + return error("Invalid record"); + // When we have the RARE case of a BlockAddress Constant that is not + // scoped to the Function it refers to, we need to conservatively + // materialize the referred to Function, regardless of whether or not + // that Function will ultimately be linked, otherwise users of + // BitcodeReader might start splicing out Function bodies such that we + // might no longer be able to materialize the BlockAddress since the + // BasicBlock (and entire body of the Function) the BlockAddress refers + // to may have been moved. In the case that the user of BitcodeReader + // decides ultimately not to link the Function body, materializing here + // could be considered wasteful, but it's better than a deserialization + // failure as described. This keeps BitcodeReader unaware of complex + // linkage policy decisions such as those used by LTO, leaving those + // decisions "one layer up." + for (uint64_t ValID : Record) + if (auto *F = dyn_cast<Function>(ValueList[ValID])) + BackwardRefFunctions.push_back(F); + else + return error("Invalid record"); + + continue; + case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN // This record indicates that the last instruction is at the same // location as the previous instruction with a location.
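// From here on, the function-body parser threads a "type ID" alongside every
// Type pointer (ResTypeID, FTyID, ArgTyID, ...): with opaque pointers a bare
// Type* can no longer answer questions like "what is the pointee or param
// type?", so the IDs index a side table that still records containment. A toy
// model of the query used throughout these hunks (sketch only; the real table
// is populated while parsing the type block, and getVirtualTypeID mints new
// IDs for types that table has not seen):
#include <vector>

struct TypeIDTableSketch {
  // Per type ID, the IDs of its contained types: for a function type,
  // [return, param0, param1, ...]; for an old-style typed pointer, [pointee].
  std::vector<std::vector<unsigned>> ContainedIDs;

  unsigned getContainedTypeID(unsigned ID, unsigned Idx = 0) const {
    return ContainedIDs[ID][Idx];
  }
};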
@@ -4133,7 +4653,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode] unsigned OpNum = 0; Value *LHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || + unsigned TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, TypeID, CurBB) || OpNum+1 > Record.size()) return error("Invalid record"); @@ -4141,6 +4662,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Opc == -1) return error("Invalid record"); I = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS); + ResTypeID = TypeID; InstructionList.push_back(I); if (OpNum < Record.size()) { if (isa(I)) { @@ -4154,8 +4676,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] unsigned OpNum = 0; Value *LHS, *RHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || - popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) || + unsigned TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, TypeID, CurBB) || + popValue(Record, OpNum, NextValueNo, LHS->getType(), TypeID, RHS, + CurBB) || OpNum+1 > Record.size()) return error("Invalid record"); @@ -4163,6 +4687,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Opc == -1) return error("Invalid record"); I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); + ResTypeID = TypeID; InstructionList.push_back(I); if (OpNum < Record.size()) { if (Opc == Instruction::Add || @@ -4191,11 +4716,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB) || OpNum+2 != Record.size()) return error("Invalid record"); - Type *ResTy = getTypeByID(Record[OpNum]); + ResTypeID = Record[OpNum]; + Type *ResTy = getTypeByID(ResTypeID); int Opc = getDecodedCastOpcode(Record[OpNum + 1]); if (Opc == -1 || !ResTy) return error("Invalid record"); @@ -4220,23 +4747,31 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_GEP: { // GEP: type, [n x operands] unsigned OpNum = 0; + unsigned TyID; Type *Ty; bool InBounds; if (BitCode == bitc::FUNC_CODE_INST_GEP) { InBounds = Record[OpNum++]; - Ty = getTypeByID(Record[OpNum++]); + TyID = Record[OpNum++]; + Ty = getTypeByID(TyID); } else { InBounds = BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP_OLD; + TyID = InvalidTypeID; Ty = nullptr; } Value *BasePtr; - if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr)) + unsigned BasePtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr, BasePtrTypeID, + CurBB)) return error("Invalid record"); if (!Ty) { - Ty = BasePtr->getType()->getScalarType()->getPointerElementType(); + TyID = getContainedTypeID(BasePtrTypeID); + if (BasePtr->getType()->isVectorTy()) + TyID = getContainedTypeID(TyID); + Ty = getTypeByID(TyID); } else if (!cast(BasePtr->getType()->getScalarType()) ->isOpaqueOrPointeeTypeMatches(Ty)) { return error( @@ -4246,13 +4781,37 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SmallVector GEPIdx; while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); GEPIdx.push_back(Op); } I = GetElementPtrInst::Create(Ty, BasePtr, GEPIdx); + ResTypeID = 
TyID; + if (cast(I)->getNumIndices() != 0) { + auto GTI = std::next(gep_type_begin(I)); + for (Value *Idx : drop_begin(cast(I)->indices())) { + unsigned SubType = 0; + if (GTI.isStruct()) { + ConstantInt *IdxC = + Idx->getType()->isVectorTy() + ? cast(cast(Idx)->getSplatValue()) + : cast(Idx); + SubType = IdxC->getZExtValue(); + } + ResTypeID = getContainedTypeID(ResTypeID, SubType); + ++GTI; + } + } + + // At this point ResTypeID is the result element type. We need a pointer + // or vector of pointer to it. + ResTypeID = getVirtualTypeID(I->getType()->getScalarType(), ResTypeID); + if (I->getType()->isVectorTy()) + ResTypeID = getVirtualTypeID(I->getType(), ResTypeID); + InstructionList.push_back(I); if (InBounds) cast(I)->setIsInBounds(true); @@ -4263,7 +4822,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // EXTRACTVAL: [opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; - if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) + unsigned AggTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Agg, AggTypeID, CurBB)) return error("Invalid record"); Type *Ty = Agg->getType(); @@ -4272,6 +4832,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("EXTRACTVAL: Invalid instruction with 0 indices"); SmallVector EXTRACTVALIdx; + ResTypeID = AggTypeID; for (; OpNum != RecSize; ++OpNum) { bool IsArray = Ty->isArrayTy(); bool IsStruct = Ty->isStructTy(); @@ -4287,10 +4848,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("EXTRACTVAL: Invalid array index"); EXTRACTVALIdx.push_back((unsigned)Index); - if (IsStruct) + if (IsStruct) { Ty = Ty->getStructElementType(Index); - else + ResTypeID = getContainedTypeID(ResTypeID, Index); + } else { Ty = Ty->getArrayElementType(); + ResTypeID = getContainedTypeID(ResTypeID); + } } I = ExtractValueInst::Create(Agg, EXTRACTVALIdx); @@ -4302,10 +4866,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // INSERTVAL: [opty, opval, opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; - if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) + unsigned AggTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Agg, AggTypeID, CurBB)) return error("Invalid record"); Value *Val; - if (getValueTypePair(Record, OpNum, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID, CurBB)) return error("Invalid record"); unsigned RecSize = Record.size(); @@ -4339,6 +4905,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Inserted value type doesn't match aggregate type"); I = InsertValueInst::Create(Agg, Val, INSERTVALIdx); + ResTypeID = AggTypeID; InstructionList.push_back(I); break; } @@ -4348,12 +4915,18 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // handles select i1 ... 
in old bitcode unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; - if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || - popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || - popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond)) + unsigned TypeID; + Type *CondType = Type::getInt1Ty(Context); + if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal, TypeID, + CurBB) || + popValue(Record, OpNum, NextValueNo, TrueVal->getType(), TypeID, + FalseVal, CurBB) || + popValue(Record, OpNum, NextValueNo, CondType, + getVirtualTypeID(CondType), Cond, CurBB)) return error("Invalid record"); I = SelectInst::Create(Cond, TrueVal, FalseVal); + ResTypeID = TypeID; InstructionList.push_back(I); break; } @@ -4363,9 +4936,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // handles select i1 or select [N x i1] unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; - if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || - popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || - getValueTypePair(Record, OpNum, NextValueNo, Cond)) + unsigned ValTypeID, CondTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal, ValTypeID, + CurBB) || + popValue(Record, OpNum, NextValueNo, TrueVal->getType(), ValTypeID, + FalseVal, CurBB) || + getValueTypePair(Record, OpNum, NextValueNo, Cond, CondTypeID, CurBB)) return error("Invalid record"); // select condition can be either i1 or [N x i1] @@ -4381,6 +4957,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } I = SelectInst::Create(Cond, TrueVal, FalseVal); + ResTypeID = ValTypeID; InstructionList.push_back(I); if (OpNum < Record.size() && isa(I)) { FastMathFlags FMF = getDecodedFastMathFlags(Record[OpNum]); @@ -4393,12 +4970,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_EXTRACTELT: { // EXTRACTELT: [opty, opval, opval] unsigned OpNum = 0; Value *Vec, *Idx; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || - getValueTypePair(Record, OpNum, NextValueNo, Idx)) + unsigned VecTypeID, IdxTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Vec, VecTypeID, CurBB) || + getValueTypePair(Record, OpNum, NextValueNo, Idx, IdxTypeID, CurBB)) return error("Invalid record"); if (!Vec->getType()->isVectorTy()) return error("Invalid type for value"); I = ExtractElementInst::Create(Vec, Idx); + ResTypeID = getContainedTypeID(VecTypeID); InstructionList.push_back(I); break; } @@ -4406,15 +4985,18 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_INSERTELT: { // INSERTELT: [ty, opval,opval,opval] unsigned OpNum = 0; Value *Vec, *Elt, *Idx; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec)) + unsigned VecTypeID, IdxTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Vec, VecTypeID, CurBB)) return error("Invalid record"); if (!Vec->getType()->isVectorTy()) return error("Invalid type for value"); if (popValue(Record, OpNum, NextValueNo, - cast(Vec->getType())->getElementType(), Elt) || - getValueTypePair(Record, OpNum, NextValueNo, Idx)) + cast(Vec->getType())->getElementType(), + getContainedTypeID(VecTypeID), Elt, CurBB) || + getValueTypePair(Record, OpNum, NextValueNo, Idx, IdxTypeID, CurBB)) return error("Invalid record"); I = InsertElementInst::Create(Vec, Elt, Idx); + ResTypeID = VecTypeID; InstructionList.push_back(I); break; } @@ -4422,16 +5004,22 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval] unsigned OpNum 
= 0; Value *Vec1, *Vec2, *Mask; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) || - popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2)) + unsigned Vec1TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Vec1, Vec1TypeID, + CurBB) || + popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec1TypeID, + Vec2, CurBB)) return error("Invalid record"); - if (getValueTypePair(Record, OpNum, NextValueNo, Mask)) + unsigned MaskTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Mask, MaskTypeID, CurBB)) return error("Invalid record"); if (!Vec1->getType()->isVectorTy() || !Vec2->getType()->isVectorTy()) return error("Invalid type for value"); I = new ShuffleVectorInst(Vec1, Vec2, Mask); + ResTypeID = + getVirtualTypeID(I->getType(), getContainedTypeID(Vec1TypeID)); InstructionList.push_back(I); break; } @@ -4445,8 +5033,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *LHS, *RHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || - popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS)) + unsigned LHSTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, LHSTypeID, CurBB) || + popValue(Record, OpNum, NextValueNo, LHS->getType(), LHSTypeID, RHS, + CurBB)) return error("Invalid record"); if (OpNum >= Record.size()) @@ -4467,6 +5057,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { else I = new ICmpInst((ICmpInst::Predicate)PredVal, LHS, RHS); + ResTypeID = getVirtualTypeID(I->getType()->getScalarType()); + if (LHS->getType()->isVectorTy()) + ResTypeID = getVirtualTypeID(I->getType(), ResTypeID); + if (FMF.any()) I->setFastMathFlags(FMF); InstructionList.push_back(I); @@ -4484,7 +5078,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *Op = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); if (OpNum != Record.size()) return error("Invalid record"); @@ -4506,8 +5101,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } else { BasicBlock *FalseDest = getBasicBlock(Record[1]); - Value *Cond = getValue(Record, 2, NextValueNo, - Type::getInt1Ty(Context)); + Type *CondType = Type::getInt1Ty(Context); + Value *Cond = getValue(Record, 2, NextValueNo, CondType, + getVirtualTypeID(CondType), CurBB); if (!FalseDest || !Cond) return error("Invalid record"); I = BranchInst::Create(TrueDest, FalseDest, Cond); @@ -4519,8 +5115,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() != 1 && Record.size() != 2) return error("Invalid record"); unsigned Idx = 0; - Value *CleanupPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Type *TokenTy = Type::getTokenTy(Context); + Value *CleanupPad = getValue(Record, Idx++, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); if (!CleanupPad) return error("Invalid record"); BasicBlock *UnwindDest = nullptr; @@ -4538,8 +5135,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() != 2) return error("Invalid record"); unsigned Idx = 0; - Value *CatchPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Type *TokenTy = Type::getTokenTy(Context); + Value *CatchPad = getValue(Record, Idx++, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); if (!CatchPad) return error("Invalid record"); BasicBlock *BB = getBasicBlock(Record[Idx++]); @@ -4557,8 +5155,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { 
unsigned Idx = 0; - Value *ParentPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Type *TokenTy = Type::getTokenTy(Context); + Value *ParentPad = getValue(Record, Idx++, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); unsigned NumHandlers = Record[Idx++]; @@ -4585,6 +5184,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { for (BasicBlock *Handler : Handlers) CatchSwitch->addHandler(Handler); I = CatchSwitch; + ResTypeID = getVirtualTypeID(I->getType()); InstructionList.push_back(I); break; } @@ -4596,15 +5196,17 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned Idx = 0; - Value *ParentPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Type *TokenTy = Type::getTokenTy(Context); + Value *ParentPad = getValue(Record, Idx++, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); unsigned NumArgOperands = Record[Idx++]; SmallVector Args; for (unsigned Op = 0; Op != NumArgOperands; ++Op) { Value *Val; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID, nullptr)) return error("Invalid record"); Args.push_back(Val); } @@ -4616,6 +5218,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = CleanupPadInst::Create(ParentPad, Args); else I = CatchPadInst::Create(ParentPad, Args); + ResTypeID = getVirtualTypeID(I->getType()); InstructionList.push_back(I); break; } @@ -4627,10 +5230,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // Hopefully someday we will have support for case ranges and can use // this format again. - Type *OpTy = getTypeByID(Record[1]); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); unsigned ValueBitWidth = cast(OpTy)->getBitWidth(); - Value *Cond = getValue(Record, 2, NextValueNo, OpTy); + Value *Cond = getValue(Record, 2, NextValueNo, OpTy, OpTyID, CurBB); BasicBlock *Default = getBasicBlock(Record[3]); if (!OpTy || !Cond || !Default) return error("Invalid record"); @@ -4684,8 +5288,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < 3 || (Record.size() & 1) == 0) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Cond = getValue(Record, 1, NextValueNo, OpTy); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Cond = getValue(Record, 1, NextValueNo, OpTy, OpTyID, CurBB); BasicBlock *Default = getBasicBlock(Record[2]); if (!OpTy || !Cond || !Default) return error("Invalid record"); @@ -4693,8 +5298,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); InstructionList.push_back(SI); for (unsigned i = 0, e = NumCases; i != e; ++i) { - ConstantInt *CaseVal = - dyn_cast_or_null(getFnValueByID(Record[3+i*2], OpTy)); + ConstantInt *CaseVal = dyn_cast_or_null( + getFnValueByID(Record[3+i*2], OpTy, OpTyID, nullptr)); BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]); if (!CaseVal || !DestBB) { delete SI; @@ -4708,8 +5313,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...] 
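// Operand references in these FUNC_CODE records are relative value IDs: the
// record stores NextValueNo minus the absolute ID, so references to recently
// defined values are tiny VBRs and forward references come out negative
// (which is why the phi reader below switches to getValueSigned when
// UseRelativeIDs is set). Schematically (a sketch of the conversion hidden
// inside getValue/popValue):
#include <cstdint>

static uint64_t toAbsoluteValueID(int64_t RelID, uint64_t NextValueNo) {
  // RelID > 0: an already-defined value; RelID <= 0: a forward reference.
  return NextValueNo - RelID;
}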
if (Record.size() < 2) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Address = getValue(Record, 1, NextValueNo, OpTy); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Address = getValue(Record, 1, NextValueNo, OpTy, OpTyID, CurBB); if (!OpTy || !Address) return error("Invalid record"); unsigned NumDests = Record.size()-2; @@ -4737,23 +5343,27 @@ Error BitcodeReader::parseFunctionBody(Function *F) { BasicBlock *NormalBB = getBasicBlock(Record[OpNum++]); BasicBlock *UnwindBB = getBasicBlock(Record[OpNum++]); + unsigned FTyID = InvalidTypeID; FunctionType *FTy = nullptr; if ((CCInfo >> 13) & 1) { - FTy = dyn_cast(getTypeByID(Record[OpNum++])); + FTyID = Record[OpNum++]; + FTy = dyn_cast(getTypeByID(FTyID)); if (!FTy) return error("Explicit invoke type is not a function type"); } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID, + CurBB)) return error("Invalid record"); PointerType *CalleeTy = dyn_cast(Callee->getType()); if (!CalleeTy) return error("Callee is not a pointer"); if (!FTy) { - FTy = - dyn_cast(Callee->getType()->getPointerElementType()); + FTyID = getContainedTypeID(CalleeTypeID); + FTy = dyn_cast_or_null(getTypeByID(FTyID)); if (!FTy) return error("Callee is not of pointer to function type"); } else if (!CalleeTy->isOpaqueOrPointeeTypeMatches(FTy)) @@ -4763,11 +5373,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Insufficient operands to call"); SmallVector Ops; - SmallVector ArgsTys; + SmallVector ArgTyIDs; for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { - Ops.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); - ArgsTys.push_back(FTy->getParamType(i)); + unsigned ArgTyID = getContainedTypeID(FTyID, i + 1); + Ops.push_back(getValue(Record, OpNum, NextValueNo, FTy->getParamType(i), + ArgTyID, CurBB)); + ArgTyIDs.push_back(ArgTyID); if (!Ops.back()) return error("Invalid record"); } @@ -4779,28 +5390,38 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // Read type/value pairs for varargs params. while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); Ops.push_back(Op); - ArgsTys.push_back(Op->getType()); + ArgTyIDs.push_back(OpTypeID); } } + // Upgrade the bundles if needed. 
+ if (!OperandBundles.empty()) + UpgradeOperandBundles(OperandBundles); + I = InvokeInst::Create(FTy, Callee, NormalBB, UnwindBB, Ops, OperandBundles); + ResTypeID = getContainedTypeID(FTyID); OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( static_cast(CallingConv::MaxID & CCInfo)); cast(I)->setAttributes(PAL); - propagateAttributeTypes(cast(I), ArgsTys); + if (Error Err = propagateAttributeTypes(cast(I), ArgTyIDs)) { + I->deleteValue(); + return Err; + } break; } case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval] unsigned Idx = 0; Value *Val = nullptr; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID, CurBB)) return error("Invalid record"); I = ResumeInst::Create(Val); InstructionList.push_back(I); @@ -4818,23 +5439,27 @@ Error BitcodeReader::parseFunctionBody(Function *F) { for (unsigned i = 0, e = NumIndirectDests; i != e; ++i) IndirectDests.push_back(getBasicBlock(Record[OpNum++])); + unsigned FTyID = InvalidTypeID; FunctionType *FTy = nullptr; if ((CCInfo >> bitc::CALL_EXPLICIT_TYPE) & 1) { - FTy = dyn_cast(getTypeByID(Record[OpNum++])); + FTyID = Record[OpNum++]; + FTy = dyn_cast_or_null(getTypeByID(FTyID)); if (!FTy) return error("Explicit call type is not a function type"); } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID, + CurBB)) return error("Invalid record"); PointerType *OpTy = dyn_cast(Callee->getType()); if (!OpTy) return error("Callee is not a pointer type"); if (!FTy) { - FTy = - dyn_cast(Callee->getType()->getPointerElementType()); + FTyID = getContainedTypeID(CalleeTypeID); + FTy = dyn_cast_or_null(getTypeByID(FTyID)); if (!FTy) return error("Callee is not of pointer to function type"); } else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy)) @@ -4844,18 +5469,20 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Insufficient operands to call"); SmallVector Args; - SmallVector ArgsTys; + SmallVector ArgTyIDs; // Read the fixed params. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { Value *Arg; + unsigned ArgTyID = getContainedTypeID(FTyID, i + 1); if (FTy->getParamType(i)->isLabelTy()) Arg = getBasicBlock(Record[OpNum]); else - Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)); + Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i), + ArgTyID, CurBB); if (!Arg) return error("Invalid record"); Args.push_back(Arg); - ArgsTys.push_back(Arg->getType()); + ArgTyIDs.push_back(ArgTyID); } // Read type/value pairs for varargs params. @@ -4865,21 +5492,30 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } else { while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); Args.push_back(Op); - ArgsTys.push_back(Op->getType()); + ArgTyIDs.push_back(OpTypeID); } } + // Upgrade the bundles if needed. 
+ if (!OperandBundles.empty()) + UpgradeOperandBundles(OperandBundles); + I = CallBrInst::Create(FTy, Callee, DefaultDest, IndirectDests, Args, OperandBundles); + ResTypeID = getContainedTypeID(FTyID); OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( static_cast((0x7ff & CCInfo) >> bitc::CALL_CCONV)); cast(I)->setAttributes(PAL); - propagateAttributeTypes(cast(I), ArgsTys); + if (Error Err = propagateAttributeTypes(cast(I), ArgTyIDs)) { + I->deleteValue(); + return Err; + } break; } case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE @@ -4888,36 +5524,76 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] if (Record.empty()) - return error("Invalid record"); + return error("Invalid phi record"); // The first record specifies the type. - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (!Ty) - return error("Invalid record"); + return error("Invalid phi record"); // Phi arguments are pairs of records of [value, basic block]. // There is an optional final record for fast-math-flags if this phi has a // floating-point type. size_t NumArgs = (Record.size() - 1) / 2; PHINode *PN = PHINode::Create(Ty, NumArgs); - if ((Record.size() - 1) % 2 == 1 && !isa(PN)) - return error("Invalid record"); + if ((Record.size() - 1) % 2 == 1 && !isa(PN)) { + PN->deleteValue(); + return error("Invalid phi record"); + } InstructionList.push_back(PN); + SmallDenseMap Args; for (unsigned i = 0; i != NumArgs; i++) { - Value *V; + BasicBlock *BB = getBasicBlock(Record[i * 2 + 2]); + if (!BB) { + PN->deleteValue(); + return error("Invalid phi BB"); + } + + // Phi nodes may contain the same predecessor multiple times, in which + // case the incoming value must be identical. Directly reuse the already + // seen value here, to avoid expanding a constant expression multiple + // times. + auto It = Args.find(BB); + if (It != Args.end()) { + PN->addIncoming(It->second, BB); + continue; + } + + // If there already is a block for this edge (from a different phi), + // use it. + BasicBlock *EdgeBB = ConstExprEdgeBBs.lookup({BB, CurBB}); + if (!EdgeBB) { + // Otherwise, use a temporary block (that we will discard if it + // turns out to be unnecessary). + if (!PhiConstExprBB) + PhiConstExprBB = BasicBlock::Create(Context, "phi.constexpr", F); + EdgeBB = PhiConstExprBB; + } + // With the new function encoding, it is possible that operands have // negative IDs (for forward references). Use a signed VBR // representation to keep the encoding small. + Value *V; if (UseRelativeIDs) - V = getValueSigned(Record, i * 2 + 1, NextValueNo, Ty); + V = getValueSigned(Record, i * 2 + 1, NextValueNo, Ty, TyID, EdgeBB); else - V = getValue(Record, i * 2 + 1, NextValueNo, Ty); - BasicBlock *BB = getBasicBlock(Record[i * 2 + 2]); - if (!V || !BB) - return error("Invalid record"); + V = getValue(Record, i * 2 + 1, NextValueNo, Ty, TyID, EdgeBB); + if (!V) { + PN->deleteValue(); + PhiConstExprBB->eraseFromParent(); + return error("Invalid phi record"); + } + + if (EdgeBB == PhiConstExprBB && !EdgeBB->empty()) { + ConstExprEdgeBBs.insert({{BB, CurBB}, EdgeBB}); + PhiConstExprBB = nullptr; + } PN->addIncoming(V, BB); + Args.insert({BB, V}); } I = PN; + ResTypeID = TyID; // If there are an even number of records, the final record must be FMF. 
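// The PHI case above is where the lazily-recorded constants pay their cost:
// when such a constant has to be expanded into instructions, those
// instructions must dominate the incoming edge rather than the phi itself, so
// the reader materializes them in a synthetic "phi.constexpr" block and
// reuses one block per (predecessor, successor) edge via ConstExprEdgeBBs.
// In IR terms (an illustrative sketch):
//
//   %x = phi i64 [ ptrtoint (ptr @g to i64), %pred ]
//
// is read back roughly as
//
//   phi.constexpr:                 ; spliced into the %pred -> %cur edge
//     %0 = ptrtoint ptr @g to i64
//     br label %cur
//   ...
//   %x = phi i64 [ %0, %phi.constexpr ]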
if (Record.size() % 2 == 0) { @@ -4942,12 +5618,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < 4) return error("Invalid record"); } - Type *Ty = getTypeByID(Record[Idx++]); + ResTypeID = Record[Idx++]; + Type *Ty = getTypeByID(ResTypeID); if (!Ty) return error("Invalid record"); if (BitCode == bitc::FUNC_CODE_INST_LANDINGPAD_OLD) { Value *PersFn = nullptr; - if (getValueTypePair(Record, Idx, NextValueNo, PersFn)) + unsigned PersFnTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, PersFn, PersFnTypeID, + nullptr)) return error("Invalid record"); if (!F->hasPersonalityFn()) @@ -4964,8 +5643,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { LandingPadInst::ClauseType CT = LandingPadInst::ClauseType(Record[Idx++]); (void)CT; Value *Val; + unsigned ValTypeID; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) { + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID, + nullptr)) { delete LP; return error("Invalid record"); } @@ -4985,21 +5666,23 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] - if (Record.size() != 4) + if (Record.size() != 4 && Record.size() != 5) return error("Invalid record"); using APV = AllocaPackedValues; const uint64_t Rec = Record[3]; const bool InAlloca = Bitfield::get(Rec); const bool SwiftError = Bitfield::get(Rec); - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (!Bitfield::get(Rec)) { - auto *PTy = dyn_cast_or_null(Ty); - if (!PTy) - return error("Old-style alloca with a non-pointer type"); - Ty = PTy->getPointerElementType(); + TyID = getContainedTypeID(TyID); + Ty = getTypeByID(TyID); + if (!Ty) + return error("Missing element type for old-style alloca"); } - Type *OpTy = getTypeByID(Record[1]); - Value *Size = getFnValueByID(Record[2], OpTy); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); + Value *Size = getFnValueByID(Record[2], OpTy, OpTyID, CurBB); MaybeAlign Align; uint64_t AlignExp = Bitfield::get(Rec) | @@ -5010,9 +5693,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (!Ty || !Size) return error("Invalid record"); - // FIXME: Make this an optional field. const DataLayout &DL = TheModule->getDataLayout(); - unsigned AS = DL.getAllocaAddrSpace(); + unsigned AS = Record.size() == 5 ? 
Record[4] : DL.getAllocaAddrSpace(); SmallPtrSet Visited; if (!Align && !Ty->isSized(&Visited)) @@ -5024,13 +5706,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) { AI->setUsedWithInAlloca(InAlloca); AI->setSwiftError(SwiftError); I = AI; + ResTypeID = getVirtualTypeID(AI->getType(), TyID); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_LOAD: { // LOAD: [opty, op, align, vol] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB) || (OpNum + 2 != Record.size() && OpNum + 3 != Record.size())) return error("Invalid record"); @@ -5039,9 +5723,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 3 == Record.size()) { - Ty = getTypeByID(Record[OpNum++]); + ResTypeID = Record[OpNum++]; + Ty = getTypeByID(ResTypeID); } else { - Ty = Op->getType()->getPointerElementType(); + ResTypeID = getContainedTypeID(OpTypeID); + Ty = getTypeByID(ResTypeID); + if (!Ty) + return error("Missing element type for old-style load"); } if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType())) @@ -5063,7 +5751,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // LOADATOMIC: [opty, op, align, vol, ordering, ssid] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB) || (OpNum + 4 != Record.size() && OpNum + 5 != Record.size())) return error("Invalid record"); @@ -5072,9 +5761,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 5 == Record.size()) { - Ty = getTypeByID(Record[OpNum++]); + ResTypeID = Record[OpNum++]; + Ty = getTypeByID(ResTypeID); } else { - Ty = Op->getType()->getPointerElementType(); + ResTypeID = getContainedTypeID(OpTypeID); + Ty = getTypeByID(ResTypeID); + if (!Ty) + return error("Missing element type for old style atomic load"); } if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType())) @@ -5102,12 +5795,21 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol] unsigned OpNum = 0; Value *Val, *Ptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || - (BitCode == bitc::FUNC_CODE_INST_STORE - ? getValueTypePair(Record, OpNum, NextValueNo, Val) - : popValue(Record, OpNum, NextValueNo, - Ptr->getType()->getPointerElementType(), Val)) || - OpNum + 2 != Record.size()) + unsigned PtrTypeID, ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID, CurBB)) + return error("Invalid record"); + + if (BitCode == bitc::FUNC_CODE_INST_STORE) { + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID, CurBB)) + return error("Invalid record"); + } else { + ValTypeID = getContainedTypeID(PtrTypeID); + if (popValue(Record, OpNum, NextValueNo, getTypeByID(ValTypeID), + ValTypeID, Val, CurBB)) + return error("Invalid record"); + } + + if (OpNum + 2 != Record.size()) return error("Invalid record"); if (Error Err = typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) @@ -5129,13 +5831,21 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, ssid] unsigned OpNum = 0; Value *Val, *Ptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || - !isa(Ptr->getType()) || - (BitCode == bitc::FUNC_CODE_INST_STOREATOMIC - ? 
getValueTypePair(Record, OpNum, NextValueNo, Val) - : popValue(Record, OpNum, NextValueNo, - Ptr->getType()->getPointerElementType(), Val)) || - OpNum + 4 != Record.size()) + unsigned PtrTypeID, ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID, CurBB) || + !isa(Ptr->getType())) + return error("Invalid record"); + if (BitCode == bitc::FUNC_CODE_INST_STOREATOMIC) { + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID, CurBB)) + return error("Invalid record"); + } else { + ValTypeID = getContainedTypeID(PtrTypeID); + if (popValue(Record, OpNum, NextValueNo, getTypeByID(ValTypeID), + ValTypeID, Val, CurBB)) + return error("Invalid record"); + } + + if (OpNum + 4 != Record.size()) return error("Invalid record"); if (Error Err = typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) @@ -5164,20 +5874,22 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const size_t NumRecords = Record.size(); unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID, CurBB)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Cmpxchg operand is not a pointer type"); Value *Cmp = nullptr; - if (popValue(Record, OpNum, NextValueNo, - cast(Ptr->getType())->getPointerElementType(), - Cmp)) + unsigned CmpTypeID = getContainedTypeID(PtrTypeID); + if (popValue(Record, OpNum, NextValueNo, getTypeByID(CmpTypeID), + CmpTypeID, Cmp, CurBB)) return error("Invalid record"); Value *New = nullptr; - if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), New) || + if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), CmpTypeID, + New, CurBB) || NumRecords < OpNum + 3 || NumRecords > OpNum + 5) return error("Invalid record"); @@ -5214,8 +5926,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // expecting the first component of a modern cmpxchg. 
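// [Editor's note, annotation added during cleanup, not part of the patch]
// A modern cmpxchg yields a {T, i1} pair (the loaded value plus a success
// flag), whereas the legacy FUNC_CODE_INST_CMPXCHG_OLD encoding produced
// only the loaded value. The upgrade below therefore pushes the full
// instruction and immediately extracts field 0; in IR terms, roughly:
//   %pair = cmpxchg ptr %p, i32 %cmp, i32 %new seq_cst seq_cst ; {i32, i1}
//   %old  = extractvalue { i32, i1 } %pair, 0
// That is also why ResTypeID becomes the element type ID (CmpTypeID) on this
// path, but a synthesized {CmpTypeID, i1} struct type ID on the modern path.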
CurBB->getInstList().push_back(I); I = ExtractValueInst::Create(I, 0); + ResTypeID = CmpTypeID; } else { cast(I)->setWeak(Record[OpNum + 4]); + unsigned I1TypeID = getVirtualTypeID(Type::getInt1Ty(Context)); + ResTypeID = getVirtualTypeID(I->getType(), {CmpTypeID, I1TypeID}); } InstructionList.push_back(I); @@ -5227,18 +5942,21 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const size_t NumRecords = Record.size(); unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID, CurBB)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Cmpxchg operand is not a pointer type"); Value *Cmp = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Cmp)) + unsigned CmpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Cmp, CmpTypeID, CurBB)) return error("Invalid record"); Value *Val = nullptr; - if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), Val)) + if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), CmpTypeID, Val, + CurBB)) return error("Invalid record"); if (NumRecords < OpNum + 3 || NumRecords > OpNum + 6) @@ -5278,6 +5996,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { cast(I)->setVolatile(IsVol); cast(I)->setWeak(IsWeak); + unsigned I1TypeID = getVirtualTypeID(Type::getInt1Ty(Context)); + ResTypeID = getVirtualTypeID(I->getType(), {CmpTypeID, I1TypeID}); + InstructionList.push_back(I); break; } @@ -5289,20 +6010,22 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID, CurBB)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Invalid record"); Value *Val = nullptr; + unsigned ValTypeID = InvalidTypeID; if (BitCode == bitc::FUNC_CODE_INST_ATOMICRMW_OLD) { + ValTypeID = getContainedTypeID(PtrTypeID); if (popValue(Record, OpNum, NextValueNo, - cast(Ptr->getType())->getPointerElementType(), - Val)) + getTypeByID(ValTypeID), ValTypeID, Val, CurBB)) return error("Invalid record"); } else { - if (getValueTypePair(Record, OpNum, NextValueNo, Val)) + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID, CurBB)) return error("Invalid record"); } @@ -5336,6 +6059,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Align(TheModule->getDataLayout().getTypeStoreSize(Val->getType())); I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID); + ResTypeID = ValTypeID; cast(I)->setVolatile(IsVol); InstructionList.push_back(I); @@ -5370,23 +6094,27 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Fast math flags indicator set for call with no FMF"); } + unsigned FTyID = InvalidTypeID; FunctionType *FTy = nullptr; if ((CCInfo >> bitc::CALL_EXPLICIT_TYPE) & 1) { - FTy = dyn_cast(getTypeByID(Record[OpNum++])); + FTyID = Record[OpNum++]; + FTy = dyn_cast_or_null(getTypeByID(FTyID)); if (!FTy) return error("Explicit call type is not a function type"); } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID, + CurBB)) return error("Invalid record"); PointerType *OpTy = dyn_cast(Callee->getType()); if (!OpTy) return error("Callee is not a pointer type"); if (!FTy) { - FTy = - dyn_cast(Callee->getType()->getPointerElementType()); + FTyID = 
getContainedTypeID(CalleeTypeID); + FTy = dyn_cast_or_null(getTypeByID(FTyID)); if (!FTy) return error("Callee is not of pointer to function type"); } else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy)) @@ -5396,15 +6124,16 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Insufficient operands to call"); SmallVector Args; - SmallVector ArgsTys; + SmallVector ArgTyIDs; // Read the fixed params. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { + unsigned ArgTyID = getContainedTypeID(FTyID, i + 1); if (FTy->getParamType(i)->isLabelTy()) Args.push_back(getBasicBlock(Record[OpNum])); else Args.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); - ArgsTys.push_back(FTy->getParamType(i)); + FTy->getParamType(i), ArgTyID, CurBB)); + ArgTyIDs.push_back(ArgTyID); if (!Args.back()) return error("Invalid record"); } @@ -5416,14 +6145,20 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } else { while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); Args.push_back(Op); - ArgsTys.push_back(Op->getType()); + ArgTyIDs.push_back(OpTypeID); } } + // Upgrade the bundles if needed. + if (!OperandBundles.empty()) + UpgradeOperandBundles(OperandBundles); + I = CallInst::Create(FTy, Callee, Args, OperandBundles); + ResTypeID = getContainedTypeID(FTyID); OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( @@ -5437,7 +6172,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { TCK = CallInst::TCK_NoTail; cast(I)->setTailCallKind(TCK); cast(I)->setAttributes(PAL); - propagateAttributeTypes(cast(I), ArgsTys); + if (Error Err = propagateAttributeTypes(cast(I), ArgTyIDs)) { + I->deleteValue(); + return Err; + } if (FMF.any()) { if (!isa(I)) return error("Fast-math-flags specified for call without " @@ -5449,9 +6187,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty] if (Record.size() < 3) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Op = getValue(Record, 1, NextValueNo, OpTy); - Type *ResTy = getTypeByID(Record[2]); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Op = getValue(Record, 1, NextValueNo, OpTy, OpTyID, CurBB); + ResTypeID = Record[2]; + Type *ResTy = getTypeByID(ResTypeID); if (!OpTy || !Op || !ResTy) return error("Invalid record"); I = new VAArgInst(Op, ResTy); @@ -5472,7 +6212,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 1; while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); Inputs.push_back(Op); } @@ -5484,12 +6225,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_FREEZE: { // FREEZE: [opty,opval] unsigned OpNum = 0; Value *Op = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) return error("Invalid record"); if (OpNum != Record.size()) return error("Invalid record"); I = new FreezeInst(Op); + ResTypeID = OpTypeID; InstructionList.push_back(I); break; } @@ -5514,8 +6257,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } // Non-void values 
get registered in the value table for future use. - if (!I->getType()->isVoidTy()) - ValueList.assignValue(I, NextValueNo++); + if (!I->getType()->isVoidTy()) { + assert(I->getType() == getTypeByID(ResTypeID) && + "Incorrect result type ID"); + if (Error Err = ValueList.assignValue(NextValueNo++, I, ResTypeID)) + return Err; + } } OutOfRecordLoop: @@ -5541,6 +6288,19 @@ OutOfRecordLoop: if (MDLoader->hasFwdRefs()) return error("Invalid function metadata: outgoing forward refs"); + if (PhiConstExprBB) + PhiConstExprBB->eraseFromParent(); + + for (const auto &Pair : ConstExprEdgeBBs) { + BasicBlock *From = Pair.first.first; + BasicBlock *To = Pair.first.second; + BasicBlock *EdgeBB = Pair.second; + BranchInst::Create(To, EdgeBB); + From->getTerminator()->replaceSuccessorWith(To, EdgeBB); + To->replacePhiUsesWith(From, EdgeBB); + EdgeBB->moveBefore(To); + } + // Trim the value list down to the size it was before we parsed this function. ValueList.shrinkTo(ModuleValueListSize); MDLoader->shrinkTo(ModuleMDLoaderSize); @@ -5913,8 +6673,8 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() { break; case bitc::BLOCKINFO_BLOCK_ID: // Need to parse these to get abbrev ids (e.g. for VST) - if (readBlockInfo()) - return error("Malformed block"); + if (Error Err = readBlockInfo()) + return Err; break; case bitc::VALUE_SYMTAB_BLOCK_ID: // Should have been parsed earlier via VSTOffset, unless there diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 0f4111514057..0d57ae4ef9df 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -9,74 +9,60 @@ #include "MetadataLoader.h" #include "ValueList.h" -#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitstreamReader.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/Comdat.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GVMaterializer.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalIFunc.h" #include "llvm/IR/GlobalObject.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/IR/OperandTraits.h" #include "llvm/IR/TrackingMDRef.h" #include "llvm/IR/Type.h" -#include 
"llvm/IR/ValueHandle.h" -#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/type_traits.h" + #include #include #include #include #include +#include #include -#include #include -#include #include +#include #include #include +namespace llvm { +class Argument; +} using namespace llvm; @@ -678,8 +664,8 @@ public: bool hasSeenOldLoopTags() const { return HasSeenOldLoopTags; } - Error parseMetadataAttachment( - Function &F, const SmallVectorImpl &InstructionList); + Error parseMetadataAttachment(Function &F, + ArrayRef InstructionList); Error parseMetadataKinds(); @@ -1233,14 +1219,16 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (Ty->isMetadataTy() || Ty->isVoidTy()) { dropRecord(); break; } MetadataList.assignValue( - LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), + LocalAsMetadata::get(ValueList.getValueFwdRef( + Record[1], Ty, TyID, /*ConstExprInsertBB*/ nullptr)), NextMetadataNo); NextMetadataNo++; break; @@ -1253,14 +1241,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( unsigned Size = Record.size(); SmallVector Elts; for (unsigned i = 0; i != Size; i += 2) { - Type *Ty = getTypeByID(Record[i]); + unsigned TyID = Record[i]; + Type *Ty = getTypeByID(TyID); if (!Ty) return error("Invalid record"); if (Ty->isMetadataTy()) Elts.push_back(getMD(Record[i + 1])); else if (!Ty->isVoidTy()) { - auto *MD = - ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty)); + auto *MD = ValueAsMetadata::get(ValueList.getValueFwdRef( + Record[i + 1], Ty, TyID, /*ConstExprInsertBB*/ nullptr)); assert(isa(MD) && "Expected non-function-local metadata"); Elts.push_back(MD); @@ -1275,12 +1264,14 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() != 2) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (Ty->isMetadataTy() || Ty->isVoidTy()) return error("Invalid record"); MetadataList.assignValue( - ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), + ValueAsMetadata::get(ValueList.getValueFwdRef( + Record[1], Ty, TyID, /*ConstExprInsertBB*/ nullptr)), NextMetadataNo); NextMetadataNo++; break; @@ -1514,6 +1505,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)) { Flags = Flags | DINode::FlagFwdDecl; + if (Name) { + // This is a hack around preserving template parameters for simplified + // template names - it should probably be replaced with a + // DICompositeType flag specifying whether template parameters are + // required on declarations of this type. 
+ StringRef NameStr = Name->getString(); + if (!NameStr.contains('<') || NameStr.startswith("_STN|")) + TemplateParams = getMDOrNull(Record[14]); + } } else { BaseType = getDITypeRefOrNull(Record[6]); OffsetInBits = Record[9]; @@ -1700,6 +1700,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( bool HasThisAdj = true; bool HasThrownTypes = true; bool HasAnnotations = false; + bool HasTargetFuncName = false; unsigned OffsetA = 0; unsigned OffsetB = 0; if (!HasSPFlags) { @@ -1713,6 +1714,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( HasThrownTypes = Record.size() >= 21; } else { HasAnnotations = Record.size() >= 19; + HasTargetFuncName = Record.size() >= 20; } Metadata *CUorFn = getMDOrNull(Record[12 + OffsetB]); DISubprogram *SP = GET_OR_DISTINCT( @@ -1737,7 +1739,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( HasThrownTypes ? getMDOrNull(Record[17 + OffsetB]) : nullptr, // thrownTypes HasAnnotations ? getMDOrNull(Record[18 + OffsetB]) - : nullptr // annotations + : nullptr, // annotations + HasTargetFuncName ? getMDString(Record[19 + OffsetB]) + : nullptr // targetFuncName )); MetadataList.assignValue(SP, NextMetadataNo); NextMetadataNo++; @@ -2047,8 +2051,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_IMPORTED_ENTITY: { - if (Record.size() < 6 && Record.size() > 8) - return error("Invalid record"); + if (Record.size() < 6 || Record.size() > 8) + return error("Invalid DIImportedEntity record"); IsDistinct = Record[0]; bool HasFile = (Record.size() >= 7); @@ -2181,7 +2185,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseGlobalObjectAttachment( /// Parse metadata attachments. Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( - Function &F, const SmallVectorImpl &InstructionList) { + Function &F, ArrayRef InstructionList) { if (Error Err = Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID)) return Err; @@ -2357,7 +2361,7 @@ DISubprogram *MetadataLoader::lookupSubprogramForFunction(Function *F) { } Error MetadataLoader::parseMetadataAttachment( - Function &F, const SmallVectorImpl &InstructionList) { + Function &F, ArrayRef InstructionList) { return Pimpl->parseMetadataAttachment(F, InstructionList); } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.h b/llvm/lib/Bitcode/Reader/MetadataLoader.h index 709800850f0d..653f1402bead 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.h +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_BITCODE_READER_METADATALOADER_H #define LLVM_LIB_BITCODE_READER_METADATALOADER_H -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Error.h" #include @@ -28,6 +27,7 @@ class Instruction; class Metadata; class Module; class Type; +template class ArrayRef; /// Helper class that handles loading Metadatas and keeping them available. class MetadataLoader { @@ -66,8 +66,8 @@ public: DISubprogram *lookupSubprogramForFunction(Function *F); /// Parse a `METADATA_ATTACHMENT` block for a function. - Error parseMetadataAttachment( - Function &F, const SmallVectorImpl &InstructionList); + Error parseMetadataAttachment(Function &F, + ArrayRef InstructionList); /// Parse a `METADATA_KIND` block for the current module. 
Error parseMetadataKinds(); diff --git a/llvm/lib/Bitcode/Reader/ValueList.cpp b/llvm/lib/Bitcode/Reader/ValueList.cpp index 86ed664070f6..b9dbf904c89e 100644 --- a/llvm/lib/Bitcode/Reader/ValueList.cpp +++ b/llvm/lib/Bitcode/Reader/ValueList.cpp @@ -17,80 +17,44 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include #include -#include using namespace llvm; -namespace llvm { - -namespace { - -/// A class for maintaining the slot number definition -/// as a placeholder for the actual definition for forward constants defs. -class ConstantPlaceHolder : public ConstantExpr { -public: - explicit ConstantPlaceHolder(Type *Ty, LLVMContext &Context) - : ConstantExpr(Ty, Instruction::UserOp1, &Op<0>(), 1) { - Op<0>() = UndefValue::get(Type::getInt32Ty(Context)); - } - - ConstantPlaceHolder &operator=(const ConstantPlaceHolder &) = delete; - - // allocate space for exactly one operand - void *operator new(size_t s) { return User::operator new(s, 1); } - - /// Methods to support type inquiry through isa, cast, and dyn_cast. - static bool classof(const Value *V) { - return isa(V) && - cast(V)->getOpcode() == Instruction::UserOp1; - } - - /// Provide fast operand accessors - DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); -}; - -} // end anonymous namespace - -// FIXME: can we inherit this from ConstantExpr? -template <> -struct OperandTraits - : public FixedNumOperandTraits {}; -DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPlaceHolder, Value) - -} // end namespace llvm - -void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) { +Error BitcodeReaderValueList::assignValue(unsigned Idx, Value *V, + unsigned TypeID) { if (Idx == size()) { - push_back(V); - return; + push_back(V, TypeID); + return Error::success(); } if (Idx >= size()) resize(Idx + 1); - WeakTrackingVH &OldV = ValuePtrs[Idx]; - if (!OldV) { - OldV = V; - return; + auto &Old = ValuePtrs[Idx]; + if (!Old.first) { + Old.first = V; + Old.second = TypeID; + return Error::success(); } - // Handle constants and non-constants (e.g. instrs) differently for - // efficiency. - if (Constant *PHC = dyn_cast(&*OldV)) { - ResolveConstants.push_back(std::make_pair(PHC, Idx)); - OldV = V; - } else { - // If there was a forward reference to this value, replace it. - Value *PrevVal = OldV; - OldV->replaceAllUsesWith(V); - PrevVal->deleteValue(); - } + assert(!isa(&*Old.first) && "Shouldn't update constant"); + // If there was a forward reference to this value, replace it. + Value *PrevVal = Old.first; + if (PrevVal->getType() != V->getType()) + return createStringError( + std::errc::illegal_byte_sequence, + "Assigned value does not match type of forward declaration"); + Old.first->replaceAllUsesWith(V); + PrevVal->deleteValue(); + return Error::success(); } -Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { +Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty, + unsigned TyID, + BasicBlock *ConstExprInsertBB) { // Bail out for a clearly invalid value. if (Idx >= RefsUpperBound) return nullptr; @@ -98,31 +62,18 @@ Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { if (Idx >= size()) resize(Idx + 1); - if (Value *V = ValuePtrs[Idx]) { - if (Ty != V->getType()) - report_fatal_error("Type mismatch in constant table!"); - return cast(V); - } - - // Create and return a placeholder, which will later be RAUW'd. 
- Constant *C = new ConstantPlaceHolder(Ty, Context); - ValuePtrs[Idx] = C; - return C; -} - -Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { - // Bail out for a clearly invalid value. - if (Idx >= RefsUpperBound) - return nullptr; - - if (Idx >= size()) - resize(Idx + 1); - - if (Value *V = ValuePtrs[Idx]) { + if (Value *V = ValuePtrs[Idx].first) { // If the types don't match, it's invalid. if (Ty && Ty != V->getType()) return nullptr; - return V; + + Expected MaybeV = MaterializeValueFn(Idx, ConstExprInsertBB); + if (!MaybeV) { + // TODO: We might want to propagate the precise error message here. + consumeError(MaybeV.takeError()); + return nullptr; + } + return MaybeV.get(); } // No type specified, must be invalid reference. @@ -131,86 +82,6 @@ Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { // Create and return a placeholder, which will later be RAUW'd. Value *V = new Argument(Ty); - ValuePtrs[Idx] = V; + ValuePtrs[Idx] = {V, TyID}; return V; } - -/// Once all constants are read, this method bulk resolves any forward -/// references. The idea behind this is that we sometimes get constants (such -/// as large arrays) which reference *many* forward ref constants. Replacing -/// each of these causes a lot of thrashing when building/reuniquing the -/// constant. Instead of doing this, we look at all the uses and rewrite all -/// the place holders at once for any constant that uses a placeholder. -void BitcodeReaderValueList::resolveConstantForwardRefs() { - // Sort the values by-pointer so that they are efficient to look up with a - // binary search. - llvm::sort(ResolveConstants); - - SmallVector NewOps; - - while (!ResolveConstants.empty()) { - Value *RealVal = operator[](ResolveConstants.back().second); - Constant *Placeholder = ResolveConstants.back().first; - ResolveConstants.pop_back(); - - // Loop over all users of the placeholder, updating them to reference the - // new value. If they reference more than one placeholder, update them all - // at once. - while (!Placeholder->use_empty()) { - auto UI = Placeholder->user_begin(); - User *U = *UI; - - // If the using object isn't uniqued, just update the operands. This - // handles instructions and initializers for global variables. - if (!isa(U) || isa(U)) { - UI.getUse().set(RealVal); - continue; - } - - // Otherwise, we have a constant that uses the placeholder. Replace that - // constant with a new constant that has *all* placeholder uses updated. - Constant *UserC = cast(U); - for (User::op_iterator I = UserC->op_begin(), E = UserC->op_end(); I != E; - ++I) { - Value *NewOp; - if (!isa(*I)) { - // Not a placeholder reference. - NewOp = *I; - } else if (*I == Placeholder) { - // Common case is that it just references this one placeholder. - NewOp = RealVal; - } else { - // Otherwise, look up the placeholder in ResolveConstants. - ResolveConstantsTy::iterator It = llvm::lower_bound( - ResolveConstants, - std::pair(cast(*I), 0)); - assert(It != ResolveConstants.end() && It->first == *I); - NewOp = operator[](It->second); - } - - NewOps.push_back(cast(NewOp)); - } - - // Make the new constant. 
-      Constant *NewC;
-      if (ConstantArray *UserCA = dyn_cast<ConstantArray>(UserC)) {
-        NewC = ConstantArray::get(UserCA->getType(), NewOps);
-      } else if (ConstantStruct *UserCS = dyn_cast<ConstantStruct>(UserC)) {
-        NewC = ConstantStruct::get(UserCS->getType(), NewOps);
-      } else if (isa<ConstantVector>(UserC)) {
-        NewC = ConstantVector::get(NewOps);
-      } else {
-        assert(isa<ConstantExpr>(UserC) && "Must be a ConstantExpr.");
-        NewC = cast<ConstantExpr>(UserC)->getWithOperands(NewOps);
-      }
-
-      UserC->replaceAllUsesWith(NewC);
-      UserC->destroyConstant();
-      NewOps.clear();
-    }
-
-    // Update all ValueHandles, they should be the only users at this point.
-    Placeholder->replaceAllUsesWith(RealVal);
-    delete cast<ConstantPlaceHolder>(Placeholder);
-  }
-}
diff --git a/llvm/lib/Bitcode/Reader/ValueList.h b/llvm/lib/Bitcode/Reader/ValueList.h
index a39617018f42..995d46f01f75 100644
--- a/llvm/lib/Bitcode/Reader/ValueList.h
+++ b/llvm/lib/Bitcode/Reader/ValueList.h
@@ -14,6 +14,7 @@
 #define LLVM_LIB_BITCODE_READER_VALUELIST_H
 
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Error.h"
 #include <cassert>
 #include <cstddef>
 #include <vector>
@@ -21,56 +22,53 @@ namespace llvm {
 
 class Constant;
-class LLVMContext;
+class Error;
 class Type;
 class Value;
 
 class BitcodeReaderValueList {
-  std::vector<WeakTrackingVH> ValuePtrs;
-
-  /// As we resolve forward-referenced constants, we add information about them
-  /// to this vector. This allows us to resolve them in bulk instead of
-  /// resolving each reference at a time. See the code in
-  /// ResolveConstantForwardRefs for more information about this.
-  ///
-  /// The key of this vector is the placeholder constant, the value is the slot
-  /// number that holds the resolved value.
-  using ResolveConstantsTy = std::vector<std::pair<Constant *, unsigned>>;
-  ResolveConstantsTy ResolveConstants;
-  LLVMContext &Context;
+  /// Maps Value ID to pair of Value* and Type ID.
+  std::vector<std::pair<WeakTrackingVH, unsigned>> ValuePtrs;
 
   /// Maximum number of valid references. Forward references exceeding the
   /// maximum must be invalid.
   unsigned RefsUpperBound;
 
-public:
-  BitcodeReaderValueList(LLVMContext &C, size_t RefsUpperBound)
-      : Context(C),
-        RefsUpperBound(std::min((size_t)std::numeric_limits<unsigned>::max(),
-                                RefsUpperBound)) {}
+  using MaterializeValueFnTy =
+      std::function<Expected<Value *>(unsigned, BasicBlock *)>;
+  MaterializeValueFnTy MaterializeValueFn;
 
-  ~BitcodeReaderValueList() {
-    assert(ResolveConstants.empty() && "Constants not resolved?");
-  }
+public:
+  BitcodeReaderValueList(size_t RefsUpperBound,
+                         MaterializeValueFnTy MaterializeValueFn)
+      : RefsUpperBound(std::min((size_t)std::numeric_limits<unsigned>::max(),
+                                RefsUpperBound)),
+        MaterializeValueFn(MaterializeValueFn) {}
 
   // vector compatibility methods
   unsigned size() const { return ValuePtrs.size(); }
   void resize(unsigned N) { ValuePtrs.resize(N); }
-  void push_back(Value *V) { ValuePtrs.emplace_back(V); }
+  void push_back(Value *V, unsigned TypeID) {
+    ValuePtrs.emplace_back(V, TypeID);
+  }
 
   void clear() {
-    assert(ResolveConstants.empty() && "Constants not resolved?");
     ValuePtrs.clear();
   }
 
   Value *operator[](unsigned i) const {
     assert(i < ValuePtrs.size());
-    return ValuePtrs[i];
+    return ValuePtrs[i].first;
   }
 
-  Value *back() const { return ValuePtrs.back(); }
+  unsigned getTypeID(unsigned ValNo) const {
+    assert(ValNo < ValuePtrs.size());
+    return ValuePtrs[ValNo].second;
+  }
+
+  Value *back() const { return ValuePtrs.back().first; }
   void pop_back() { ValuePtrs.pop_back(); }
 
@@ -81,14 +79,15 @@ public:
     ValuePtrs.resize(N);
   }
 
-  Constant *getConstantFwdRef(unsigned Idx, Type *Ty);
-  Value *getValueFwdRef(unsigned Idx, Type *Ty);
+  void replaceValueWithoutRAUW(unsigned ValNo, Value *NewV) {
+    assert(ValNo < ValuePtrs.size());
+    ValuePtrs[ValNo].first = NewV;
+  }
 
-  void assignValue(Value *V, unsigned Idx);
+  Value *getValueFwdRef(unsigned Idx, Type *Ty, unsigned TyID,
+                        BasicBlock *ConstExprInsertBB);
 
-  /// Once all constants are read, this method bulk resolves any forward
-  /// references.
- void resolveConstantForwardRefs(); + Error assignValue(unsigned Idx, Value *V, unsigned TypeID); }; } // end namespace llvm diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 4bba0b356675..941ed808bab1 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -610,6 +612,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { switch (Kind) { case Attribute::Alignment: return bitc::ATTR_KIND_ALIGNMENT; + case Attribute::AllocAlign: + return bitc::ATTR_KIND_ALLOC_ALIGN; case Attribute::AllocSize: return bitc::ATTR_KIND_ALLOC_SIZE; case Attribute::AlwaysInline: @@ -644,6 +648,10 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_JUMP_TABLE; case Attribute::MinSize: return bitc::ATTR_KIND_MIN_SIZE; + case Attribute::AllocatedPointer: + return bitc::ATTR_KIND_ALLOCATED_POINTER; + case Attribute::AllocKind: + return bitc::ATTR_KIND_ALLOC_KIND; case Attribute::Naked: return bitc::ATTR_KIND_NAKED; case Attribute::Nest: @@ -688,6 +696,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_PROFILE; case Attribute::NoUnwind: return bitc::ATTR_KIND_NO_UNWIND; + case Attribute::NoSanitizeBounds: + return bitc::ATTR_KIND_NO_SANITIZE_BOUNDS; case Attribute::NoSanitizeCoverage: return bitc::ATTR_KIND_NO_SANITIZE_COVERAGE; case Attribute::NullPointerIsValid: @@ -764,6 +774,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_BYREF; case Attribute::MustProgress: return bitc::ATTR_KIND_MUSTPROGRESS; + case Attribute::PresplitCoroutine: + return bitc::ATTR_KIND_PRESPLIT_COROUTINE; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: @@ -1013,6 +1025,8 @@ void ModuleBitcodeWriter::writeTypeTable() { TypeVals.push_back(true); break; } + case Type::DXILPointerTyID: + llvm_unreachable("DXIL pointers cannot be added to IR modules"); } // Emit the finished record. @@ -1211,6 +1225,14 @@ static StringEncoding getStringEncoding(StringRef Str) { return SE_Fixed7; } +static_assert(sizeof(GlobalValue::SanitizerMetadata) <= sizeof(unsigned), + "Sanitizer Metadata is too large for naive serialization."); +static unsigned +serializeSanitizerMetadata(const GlobalValue::SanitizerMetadata &Meta) { + return Meta.NoAddress | (Meta.NoHWAddress << 1) | + (Meta.NoMemtag << 2) | (Meta.IsDynInit << 3); +} + /// Emit top-level description of module, including target triple, inline asm, /// descriptors for global variables, and function prototype info. /// Returns the bit offset to backpatch with the location of the real VST. 
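[Editor's note, annotation added during cleanup, not part of the patch] Below
is a minimal, self-contained sketch of the bit layout that
serializeSanitizerMetadata() above commits to, together with the inverse
mapping a reader needs. SanitizerMetaSketch is a stand-in for
GlobalValue::SanitizerMetadata; only the field order is taken from the code
above.

  struct SanitizerMetaSketch {
    bool NoAddress, NoHWAddress, NoMemtag, IsDynInit;
  };
  // Pack one flag per bit, in declaration order (bit 0 = NoAddress).
  static unsigned packSanitizerMeta(const SanitizerMetaSketch &M) {
    return unsigned(M.NoAddress) | (unsigned(M.NoHWAddress) << 1) |
           (unsigned(M.NoMemtag) << 2) | (unsigned(M.IsDynInit) << 3);
  }
  // Recover the flags from the serialized word.
  static SanitizerMetaSketch unpackSanitizerMeta(unsigned V) {
    return {(V & 1) != 0, (V & 2) != 0, (V & 4) != 0, (V & 8) != 0};
  }
  // Example: NoAddress + NoMemtag set packs to 0b0101 = 5; unpack(5)
  // restores exactly those two flags.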
@@ -1334,7 +1356,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { // GLOBALVAR: [strtab offset, strtab size, type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, externally_initialized, dllstorageclass, - // comdat, attributes, DSO_Local] + // comdat, attributes, DSO_Local, GlobalSanitizer] Vals.push_back(addToStrtab(GV.getName())); Vals.push_back(GV.getName().size()); Vals.push_back(VE.getTypeID(GV.getValueType())); @@ -1350,10 +1372,8 @@ void ModuleBitcodeWriter::writeModuleInfo() { GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None || GV.isExternallyInitialized() || GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass || - GV.hasComdat() || - GV.hasAttributes() || - GV.isDSOLocal() || - GV.hasPartition()) { + GV.hasComdat() || GV.hasAttributes() || GV.isDSOLocal() || + GV.hasPartition() || GV.hasSanitizerMetadata()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(getEncodedUnnamedAddr(GV)); @@ -1367,6 +1387,10 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(GV.isDSOLocal()); Vals.push_back(addToStrtab(GV.getPartition())); Vals.push_back(GV.getPartition().size()); + + Vals.push_back((GV.hasSanitizerMetadata() ? serializeSanitizerMetadata( + GV.getSanitizerMetadata()) + : 0)); } else { AbbrevToUse = SimpleGVarAbbrev; } @@ -1817,6 +1841,7 @@ void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N, Record.push_back(N->getThisAdjustment()); Record.push_back(VE.getMetadataOrNullID(N->getThrownTypes().get())); Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get())); + Record.push_back(VE.getMetadataOrNullID(N->getRawTargetFuncName())); Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev); Record.clear(); @@ -2649,6 +2674,9 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, Record.push_back(VE.getValueID(C->getOperand(1))); Record.push_back(CE->getPredicate()); break; + case Instruction::InsertValue: + report_fatal_error("insertvalue constexprs not supported"); + break; } } else if (const BlockAddress *BA = dyn_cast(C)) { Code = bitc::CST_CODE_BLOCKADDRESS; @@ -3068,6 +3096,10 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Bitfield::set(Record, true); Bitfield::set(Record, AI.isSwiftError()); Vals.push_back(Record); + + unsigned AS = AI.getAddressSpace(); + if (AS != M.getDataLayout().getAllocaAddrSpace()) + Vals.push_back(AS); break; } @@ -3347,8 +3379,10 @@ void ModuleBitcodeWriter::writeFunction( bool NeedsMetadataAttachment = F.hasMetadata(); DILocation *LastDL = nullptr; + SmallSetVector BlockAddressUsers; + // Finally, emit all the instructions, in order. 
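// [Editor's note, annotation added during cleanup, not part of the patch]
// The rewritten loop below does double duty: while emitting each
// instruction, it also chases users of any blockaddress taken of this
// function's blocks and collects the *other* functions referring to them.
// For example, given
//   define void @g() { call void @h(ptr blockaddress(@f, %bb)) ... }
// @g is collected while @f is being written, and the collected value IDs
// are emitted afterwards as a FUNC_CODE_BLOCKADDR_USERS record, so the
// reader knows up front which functions reference @f's blocks.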
- for (const BasicBlock &BB : F) + for (const BasicBlock &BB : F) { for (const Instruction &I : BB) { writeInstruction(I, InstID, Vals); @@ -3380,6 +3414,32 @@ void ModuleBitcodeWriter::writeFunction( LastDL = DL; } + if (BlockAddress *BA = BlockAddress::lookup(&BB)) { + SmallVector Worklist{BA}; + SmallPtrSet Visited{BA}; + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + for (User *U : V->users()) { + if (auto *I = dyn_cast(U)) { + Function *P = I->getFunction(); + if (P != &F) + BlockAddressUsers.insert(P); + } else if (isa(U) && !isa(U) && + Visited.insert(U).second) + Worklist.push_back(U); + } + } + } + } + + if (!BlockAddressUsers.empty()) { + Vals.resize(BlockAddressUsers.size()); + for (auto I : llvm::enumerate(BlockAddressUsers)) + Vals[I.index()] = VE.getValueID(I.value()); + Stream.EmitRecord(bitc::FUNC_CODE_BLOCKADDR_USERS, Vals); + Vals.clear(); + } + // Emit names for all the instructions etc. if (auto *Symtab = F.getValueSymbolTable()) writeFunctionLevelValueSymbolTable(*Symtab); @@ -4375,7 +4435,7 @@ void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) { uint32_t Vals[5]; Hasher.update(ArrayRef((const uint8_t *)&(Buffer)[BlockStartPos], Buffer.size() - BlockStartPos)); - StringRef Hash = Hasher.result(); + std::array Hash = Hasher.result(); for (int Pos = 0; Pos < 20; Pos += 4) { Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos); } @@ -4855,9 +4915,15 @@ static const char *getSectionNameForBitcode(const Triple &T) { case Triple::GOFF: llvm_unreachable("GOFF is not yet implemented"); break; + case Triple::SPIRV: + llvm_unreachable("SPIRV is not yet implemented"); + break; case Triple::XCOFF: llvm_unreachable("XCOFF is not yet implemented"); break; + case Triple::DXContainer: + llvm_unreachable("DXContainer is not yet implemented"); + break; } llvm_unreachable("Unimplemented ObjectFormatType"); } @@ -4874,9 +4940,15 @@ static const char *getSectionNameForCommandline(const Triple &T) { case Triple::GOFF: llvm_unreachable("GOFF is not yet implemented"); break; + case Triple::SPIRV: + llvm_unreachable("SPIRV is not yet implemented"); + break; case Triple::XCOFF: llvm_unreachable("XCOFF is not yet implemented"); break; + case Triple::DXContainer: + llvm_unreachable("DXC is not yet implemented"); + break; } llvm_unreachable("Unimplemented ObjectFormatType"); } @@ -4931,7 +5003,7 @@ void llvm::embedBitcodeInModule(llvm::Module &M, llvm::MemoryBufferRef Buf, ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType)); if (llvm::GlobalVariable *Old = M.getGlobalVariable("llvm.embedded.module", true)) { - assert(Old->hasOneUse() && + assert(Old->hasZeroLiveUses() && "llvm.embedded.module can only be used once in llvm.compiler.used"); GV->takeName(Old); Old->eraseFromParent(); @@ -4954,7 +5026,7 @@ void llvm::embedBitcodeInModule(llvm::Module &M, llvm::MemoryBufferRef Buf, UsedArray.push_back( ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType)); if (llvm::GlobalVariable *Old = M.getGlobalVariable("llvm.cmdline", true)) { - assert(Old->hasOneUse() && + assert(Old->hasZeroLiveUses() && "llvm.cmdline can only be used once in llvm.compiler.used"); GV->takeName(Old); Old->eraseFromParent(); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index d884415aafd5..536d04f2fe26 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -13,7 +13,6 @@ #include "llvm/Bitcode/BitcodeWriterPass.h" #include 
"llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 01f7e85bd60e..727ec2e02cc2 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -50,17 +50,12 @@ namespace { struct OrderMap { DenseMap> IDs; - unsigned LastGlobalConstantID = 0; unsigned LastGlobalValueID = 0; OrderMap() = default; - bool isGlobalConstant(unsigned ID) const { - return ID <= LastGlobalConstantID; - } - bool isGlobalValue(unsigned ID) const { - return ID <= LastGlobalValueID && !isGlobalConstant(ID); + return ID <= LastGlobalValueID; } unsigned size() const { return IDs.size(); } @@ -84,7 +79,7 @@ static void orderValue(const Value *V, OrderMap &OM) { return; if (const Constant *C = dyn_cast(V)) { - if (C->getNumOperands() && !isa(C)) { + if (C->getNumOperands()) { for (const Value *Op : C->operands()) if (!isa(Op) && !isa(Op)) orderValue(Op, OM); @@ -104,39 +99,40 @@ static OrderMap orderModule(const Module &M) { // and ValueEnumerator::incorporateFunction(). OrderMap OM; - // In the reader, initializers of GlobalValues are set *after* all the - // globals have been read. Rather than awkwardly modeling this behaviour - // directly in predictValueUseListOrderImpl(), just assign IDs to - // initializers of GlobalValues before GlobalValues themselves to model this - // implicitly. - for (const GlobalVariable &G : M.globals()) - if (G.hasInitializer()) - if (!isa(G.getInitializer())) - orderValue(G.getInitializer(), OM); - for (const GlobalAlias &A : M.aliases()) - if (!isa(A.getAliasee())) - orderValue(A.getAliasee(), OM); - for (const GlobalIFunc &I : M.ifuncs()) - if (!isa(I.getResolver())) - orderValue(I.getResolver(), OM); - for (const Function &F : M) { - for (const Use &U : F.operands()) - if (!isa(U.get())) - orderValue(U.get(), OM); - } + // Initializers of GlobalValues are processed in + // BitcodeReader::ResolveGlobalAndAliasInits(). Match the order there rather + // than ValueEnumerator, and match the code in predictValueUseListOrderImpl() + // by giving IDs in reverse order. + // + // Since GlobalValues never reference each other directly (just through + // initializers), their relative IDs only matter for determining order of + // uses in their initializers. + for (const GlobalVariable &G : reverse(M.globals())) + orderValue(&G, OM); + for (const GlobalAlias &A : reverse(M.aliases())) + orderValue(&A, OM); + for (const GlobalIFunc &I : reverse(M.ifuncs())) + orderValue(&I, OM); + for (const Function &F : reverse(M)) + orderValue(&F, OM); + OM.LastGlobalValueID = OM.size(); - // As constants used in metadata operands are emitted as module-level - // constants, we must order them before other operands. Also, we must order - // these before global values, as these will be read before setting the - // global values' initializers. The latter matters for constants which have - // uses towards other constants that are used as initializers. auto orderConstantValue = [&OM](const Value *V) { - if ((isa(V) && !isa(V)) || isa(V)) + if (isa(V) || isa(V)) orderValue(V, OM); }; + for (const Function &F : M) { if (F.isDeclaration()) continue; + // Here we need to match the union of ValueEnumerator::incorporateFunction() + // and WriteFunction(). 
Basic blocks are implicitly declared before + // anything else (by declaring their size). + for (const BasicBlock &BB : F) + orderValue(&BB, OM); + + // Metadata used by instructions is decoded before the actual instructions, + // so visit any constants used by it beforehand. for (const BasicBlock &BB : F) for (const Instruction &I : BB) for (const Value *V : I.operands()) { @@ -151,49 +147,17 @@ static OrderMap orderModule(const Module &M) { } } } - } - OM.LastGlobalConstantID = OM.size(); - - // Initializers of GlobalValues are processed in - // BitcodeReader::ResolveGlobalAndAliasInits(). Match the order there rather - // than ValueEnumerator, and match the code in predictValueUseListOrderImpl() - // by giving IDs in reverse order. - // - // Since GlobalValues never reference each other directly (just through - // initializers), their relative IDs only matter for determining order of - // uses in their initializers. - for (const Function &F : M) - orderValue(&F, OM); - for (const GlobalAlias &A : M.aliases()) - orderValue(&A, OM); - for (const GlobalIFunc &I : M.ifuncs()) - orderValue(&I, OM); - for (const GlobalVariable &G : M.globals()) - orderValue(&G, OM); - OM.LastGlobalValueID = OM.size(); - for (const Function &F : M) { - if (F.isDeclaration()) - continue; - // Here we need to match the union of ValueEnumerator::incorporateFunction() - // and WriteFunction(). Basic blocks are implicitly declared before - // anything else (by declaring their size). - for (const BasicBlock &BB : F) - orderValue(&BB, OM); for (const Argument &A : F.args()) orderValue(&A, OM); for (const BasicBlock &BB : F) for (const Instruction &I : BB) { for (const Value *Op : I.operands()) - if ((isa(*Op) && !isa(*Op)) || - isa(*Op)) - orderValue(Op, OM); + orderConstantValue(Op); if (auto *SVI = dyn_cast(&I)) orderValue(SVI->getShuffleMaskForBitcode(), OM); - } - for (const BasicBlock &BB : F) - for (const Instruction &I : BB) orderValue(&I, OM); + } } return OM; } @@ -223,18 +187,6 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F, auto LID = OM.lookup(LU->getUser()).first; auto RID = OM.lookup(RU->getUser()).first; - // Global values are processed in reverse order. - // - // Moreover, initializers of GlobalValues are set *after* all the globals - // have been read (despite having earlier IDs). Rather than awkwardly - // modeling this behaviour here, orderModule() has assigned IDs to - // initializers of GlobalValues before GlobalValues themselves. - if (OM.isGlobalValue(LID) && OM.isGlobalValue(RID)) { - if (LID == RID) - return LU->getOperandNo() > RU->getOperandNo(); - return LID < RID; - } - // If ID is 4, then expect: 7 6 5 1 2 3. if (LID < RID) { if (RID <= ID) @@ -257,9 +209,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F, return LU->getOperandNo() > RU->getOperandNo(); }); - if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) { - return L.second < R.second; - })) + if (llvm::is_sorted(List, llvm::less_second())) // Order is already correct. return; @@ -319,16 +269,25 @@ static UseListOrderStack predictUseListOrder(const Module &M) { predictValueUseListOrder(&A, &F, OM, Stack); for (const BasicBlock &BB : F) for (const Instruction &I : BB) { - for (const Value *Op : I.operands()) + for (const Value *Op : I.operands()) { if (isa(*Op) || isa(*Op)) // Visit GlobalValues. 
predictValueUseListOrder(Op, &F, OM, Stack); + if (const auto *MAV = dyn_cast(Op)) { + if (const auto *VAM = + dyn_cast(MAV->getMetadata())) { + predictValueUseListOrder(VAM->getValue(), &F, OM, Stack); + } else if (const auto *AL = + dyn_cast(MAV->getMetadata())) { + for (const auto *VAM : AL->getArgs()) + predictValueUseListOrder(VAM->getValue(), &F, OM, Stack); + } + } + } if (auto *SVI = dyn_cast(&I)) predictValueUseListOrder(SVI->getShuffleMaskForBitcode(), &F, OM, Stack); - } - for (const BasicBlock &BB : F) - for (const Instruction &I : BB) predictValueUseListOrder(&I, &F, OM, Stack); + } } // Visit globals last, since the module-level use-list block will be seen @@ -939,9 +898,12 @@ void ValueEnumerator::EnumerateValue(const Value *V) { I != E; ++I) if (!isa(*I)) // Don't enumerate BB operand to BlockAddress. EnumerateValue(*I); - if (auto *CE = dyn_cast(C)) + if (auto *CE = dyn_cast(C)) { if (CE->getOpcode() == Instruction::ShuffleVector) EnumerateValue(CE->getShuffleMaskForBitcode()); + if (auto *GEP = dyn_cast(CE)) + EnumerateType(GEP->getSourceElementType()); + } // Finally, add the value. Doing this could make the ValueID reference be // dangling, don't reuse it. diff --git a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp index 28adfe6268f9..c297e16bdfdf 100644 --- a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp +++ b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp @@ -16,6 +16,10 @@ using namespace llvm; //===----------------------------------------------------------------------===// // BitstreamCursor implementation //===----------------------------------------------------------------------===// +// +static Error error(const char *Message) { + return createStringError(std::errc::illegal_byte_sequence, Message); +} /// Having read the ENTER_SUBBLOCK abbrevid, enter the block. Error BitstreamCursor::EnterSubBlock(unsigned BlockID, unsigned *NumWordsP) { @@ -97,7 +101,7 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { unsigned Code = MaybeCode.get(); Expected MaybeVBR = ReadVBR(6); if (!MaybeVBR) - return MaybeVBR.get(); + return MaybeVBR.takeError(); unsigned NumElts = MaybeVBR.get(); for (unsigned i = 0; i != NumElts; ++i) if (Expected Res = ReadVBR64(6)) @@ -107,7 +111,11 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { return Code; } - const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + Expected MaybeAbbv = getAbbrev(AbbrevID); + if (!MaybeAbbv) + return MaybeAbbv.takeError(); + + const BitCodeAbbrev *Abbv = MaybeAbbv.get(); const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0); unsigned Code; if (CodeOp.isLiteral()) @@ -152,7 +160,7 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { // Decode the value as we are commanded. 
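// [Editor's note, annotation added during cleanup, not part of the patch]
// For reference when reading the cases below: a Fixed(n) element is n raw
// bits, while a VBR(n) element is a chain of n-bit chunks whose low n-1 bits
// carry payload (accumulated low-to-high) and whose top bit says whether
// another chunk follows. Worked VBR6 example: 1000 = 0b1111101000 is emitted
// as the chunks 40 (= 8 | 32, continue) then 31 (stop), and decodes back as
// 8 + 31 * 32 = 1000. Array and Blob are not permitted as the element
// encoding here, hence the error on the default path.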
switch (EltEnc.getEncoding()) { default: - report_fatal_error("Array element type can't be an Array or a Blob"); + return error("Array element type can't be an Array or a Blob"); case BitCodeAbbrevOp::Fixed: assert((unsigned)EltEnc.getEncodingData() <= MaxChunkSize); if (Error Err = @@ -212,8 +220,12 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, uint32_t Code = MaybeCode.get(); Expected MaybeNumElts = ReadVBR(6); if (!MaybeNumElts) - return MaybeNumElts.takeError(); + return error( + ("Failed to read size: " + toString(MaybeNumElts.takeError())) + .c_str()); uint32_t NumElts = MaybeNumElts.get(); + if (!isSizePlausible(NumElts)) + return error("Size is not plausible"); Vals.reserve(Vals.size() + NumElts); for (unsigned i = 0; i != NumElts; ++i) @@ -224,7 +236,10 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, return Code; } - const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + Expected MaybeAbbv = getAbbrev(AbbrevID); + if (!MaybeAbbv) + return MaybeAbbv.takeError(); + const BitCodeAbbrev *Abbv = MaybeAbbv.get(); // Read the record code first. assert(Abbv->getNumOperandInfos() != 0 && "no record code in abbreviation?"); @@ -235,7 +250,7 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, else { if (CodeOp.getEncoding() == BitCodeAbbrevOp::Array || CodeOp.getEncoding() == BitCodeAbbrevOp::Blob) - report_fatal_error("Abbreviation starts with an Array or a Blob"); + return error("Abbreviation starts with an Array or a Blob"); if (Expected MaybeCode = readAbbreviatedField(*this, CodeOp)) Code = MaybeCode.get(); else @@ -262,22 +277,26 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, // Array case. Read the number of elements as a vbr6. Expected MaybeNumElts = ReadVBR(6); if (!MaybeNumElts) - return MaybeNumElts.takeError(); + return error( + ("Failed to read size: " + toString(MaybeNumElts.takeError())) + .c_str()); uint32_t NumElts = MaybeNumElts.get(); + if (!isSizePlausible(NumElts)) + return error("Size is not plausible"); Vals.reserve(Vals.size() + NumElts); // Get the element encoding. if (i + 2 != e) - report_fatal_error("Array op not second to last"); + return error("Array op not second to last"); const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i); if (!EltEnc.isEncoding()) - report_fatal_error( + return error( "Array element type has to be an encoding of a type"); // Read all the elements. switch (EltEnc.getEncoding()) { default: - report_fatal_error("Array element type can't be an Array or a Blob"); + return error("Array element type can't be an Array or a Blob"); case BitCodeAbbrevOp::Fixed: for (; NumElts; --NumElts) if (Expected MaybeVal = @@ -316,13 +335,9 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, size_t CurBitPos = GetCurrentBitNo(); const size_t NewEnd = CurBitPos + alignTo(NumElts, 4) * 8; - // If this would read off the end of the bitcode file, just set the - // record to empty and return. - if (!canSkipToPos(NewEnd/8)) { - Vals.append(NumElts, 0); - skipToEnd(); - break; - } + // Make sure the bitstream is large enough to contain the blob. + if (!canSkipToPos(NewEnd/8)) + return error("Blob ends too soon"); // Otherwise, inform the streamer that we need these bytes in memory. 
Skip // over tail padding first, in case jumping to NewEnd invalidates the Blob @@ -366,6 +381,9 @@ Error BitstreamCursor::ReadAbbrevRecord() { Expected MaybeEncoding = Read(3); if (!MaybeEncoding) return MaybeEncoding.takeError(); + if (!BitCodeAbbrevOp::isValidEncoding(MaybeEncoding.get())) + return error("Invalid encoding"); + BitCodeAbbrevOp::Encoding E = (BitCodeAbbrevOp::Encoding)MaybeEncoding.get(); if (BitCodeAbbrevOp::hasEncodingData(E)) { @@ -385,8 +403,7 @@ Error BitstreamCursor::ReadAbbrevRecord() { if ((E == BitCodeAbbrevOp::Fixed || E == BitCodeAbbrevOp::VBR) && Data > MaxChunkSize) - report_fatal_error( - "Fixed or VBR abbrev record with size > MaxChunkData"); + return error("Fixed or VBR abbrev record with size > MaxChunkData"); Abbv->Add(BitCodeAbbrevOp(E, Data)); } else @@ -394,7 +411,7 @@ Error BitstreamCursor::ReadAbbrevRecord() { } if (Abbv->getNumOperandInfos() == 0) - report_fatal_error("Abbrev record with no operands"); + return error("Abbrev record with no operands"); CurAbbrevs.push_back(std::move(Abbv)); return Error::success(); diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index cdf5586766da..f5dbaccfcad5 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -21,12 +21,9 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/GlobalStatus.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp index 03e63321e3c4..1940f46232d3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp @@ -38,8 +38,19 @@ void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA, // unsigned long personality; /* Pointer to the personality routine */ // } - Asm->OutStreamer->SwitchSection( - Asm->getObjFileLowering().getCompactUnwindSection()); + auto *EHInfo = + cast(Asm->getObjFileLowering().getCompactUnwindSection()); + if (Asm->TM.getFunctionSections()) { + // If option -ffunction-sections is on, append the function name to the + // name of EH Info Table csect so that each function has its own EH Info + // Table csect. This helps the linker to garbage-collect EH info of unused + // functions. + SmallString<128> NameStr = EHInfo->getName(); + raw_svector_ostream(NameStr) << '.' 
<< Asm->MF->getFunction().getName(); + EHInfo = Asm->OutContext.getXCOFFSection(NameStr, EHInfo->getKind(), + EHInfo->getCsectProp()); + } + Asm->OutStreamer->switchSection(EHInfo); MCSymbol *EHInfoLabel = TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(Asm->MF); Asm->OutStreamer->emitLabel(EHInfoLabel); @@ -74,8 +85,8 @@ void AIXException::endFunction(const MachineFunction *MF) { const Function &F = MF->getFunction(); assert(F.hasPersonalityFn() && "Landingpads are presented, but no personality routine is found."); - const GlobalValue *Per = - dyn_cast(F.getPersonalityFn()->stripPointerCasts()); + const auto *Per = + cast(F.getPersonalityFn()->stripPointerCasts()); const MCSymbol *PerSym = Asm->TM.getSymbol(Per); emitExceptionInfoTable(LSDALabel, PerSym); diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp index 223840c21d8b..e04a29fbb42b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -14,21 +14,14 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetOptions.h" using namespace llvm; ARMException::ARMException(AsmPrinter *A) : DwarfCFIExceptionBase(A) {} -ARMException::~ARMException() {} +ARMException::~ARMException() = default; ARMTargetStreamer &ARMException::getTargetStreamer() { MCTargetStreamer &TS = *Asm->OutStreamer->getTargetStreamer(); @@ -101,7 +94,7 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding, // Emit the Catch TypeInfos. if (VerboseAsm && !TypeInfos.empty()) { Asm->OutStreamer->AddComment(">> Catch TypeInfos <<"); - Asm->OutStreamer->AddBlankLine(); + Asm->OutStreamer->addBlankLine(); Entry = TypeInfos.size(); } @@ -116,7 +109,7 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding, // Emit the Exception Specifications. 
if (VerboseAsm && !FilterIds.empty()) { Asm->OutStreamer->AddComment(">> Filter TypeInfos <<"); - Asm->OutStreamer->AddBlankLine(); + Asm->OutStreamer->addBlankLine(); Entry = 0; } for (std::vector::const_iterator diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index 65c45f73e965..b10d79f4b5a6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -18,7 +18,6 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/raw_ostream.h" @@ -563,7 +562,7 @@ void llvm::emitDWARF5AccelTable( if (CompUnits.empty()) return; - Asm->OutStreamer->SwitchSection( + Asm->OutStreamer->switchSection( Asm->getObjFileLowering().getDwarfDebugNamesSection()); Contents.finalize(Asm, "names"); diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 21da9d50efba..32d8dc793510 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -17,7 +17,7 @@ using namespace llvm; unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) { - HasBeenUsed = true; + resetUsedFlag(true); auto IterBool = Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS))); return IterBool.first->second.Number; @@ -44,7 +44,7 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) { return; // Start the dwarf addr section. - Asm.OutStreamer->SwitchSection(AddrSection); + Asm.OutStreamer->switchSection(AddrSection); MCSymbol *EndLabel = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 3e8e190eecc3..4a31bf85446b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" @@ -48,7 +49,6 @@ #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" @@ -82,33 +82,26 @@ #include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SectionKind.h" -#include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" -#include "llvm/Remarks/Remark.h" -#include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkStreamer.h" -#include "llvm/Remarks/RemarkStringTable.h" #include "llvm/Support/Casting.h" 
-#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -125,7 +118,6 @@ #include #include #include -#include #include #include #include @@ -135,11 +127,6 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" -// FIXME: this option currently only applies to DWARF, and not CodeView, tables -static cl::opt - DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, - cl::desc("Disable debug info printing")); - const char DWARFGroupName[] = "dwarf"; const char DWARFGroupDescription[] = "DWARF Emission"; const char DbgTimerName[] = "emit"; @@ -167,6 +154,178 @@ static gcp_map_type &getGCMap(void *&P) { return *(gcp_map_type*)P; } +namespace { +class AddrLabelMapCallbackPtr final : CallbackVH { + AddrLabelMap *Map = nullptr; + +public: + AddrLabelMapCallbackPtr() = default; + AddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {} + + void setPtr(BasicBlock *BB) { + ValueHandleBase::operator=(BB); + } + + void setMap(AddrLabelMap *map) { Map = map; } + + void deleted() override; + void allUsesReplacedWith(Value *V2) override; +}; +} // namespace + +class llvm::AddrLabelMap { + MCContext &Context; + struct AddrLabelSymEntry { + /// The symbols for the label. + TinyPtrVector Symbols; + + Function *Fn; // The containing function of the BasicBlock. + unsigned Index; // The index in BBCallbacks for the BasicBlock. + }; + + DenseMap, AddrLabelSymEntry> AddrLabelSymbols; + + /// Callbacks for the BasicBlock's that we have entries for. We use this so + /// we get notified if a block is deleted or RAUWd. + std::vector BBCallbacks; + + /// This is a per-function list of symbols whose corresponding BasicBlock got + /// deleted. These symbols need to be emitted at some point in the file, so + /// AsmPrinter emits them after the function body. + DenseMap, std::vector> + DeletedAddrLabelsNeedingEmission; + +public: + AddrLabelMap(MCContext &context) : Context(context) {} + + ~AddrLabelMap() { + assert(DeletedAddrLabelsNeedingEmission.empty() && + "Some labels for deleted blocks never got emitted"); + } + + ArrayRef getAddrLabelSymbolToEmit(BasicBlock *BB); + + void takeDeletedSymbolsForFunction(Function *F, + std::vector &Result); + + void UpdateForDeletedBlock(BasicBlock *BB); + void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New); +}; + +ArrayRef AddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { + assert(BB->hasAddressTaken() && + "Shouldn't get label for block without address taken"); + AddrLabelSymEntry &Entry = AddrLabelSymbols[BB]; + + // If we already had an entry for this block, just return it. + if (!Entry.Symbols.empty()) { + assert(BB->getParent() == Entry.Fn && "Parent changed"); + return Entry.Symbols; + } + + // Otherwise, this is a new entry, create a new symbol for it and add an + // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd. + BBCallbacks.emplace_back(BB); + BBCallbacks.back().setMap(this); + Entry.Index = BBCallbacks.size() - 1; + Entry.Fn = BB->getParent(); + MCSymbol *Sym = BB->hasAddressTaken() ? Context.createNamedTempSymbol() + : Context.createTempSymbol(); + Entry.Symbols.push_back(Sym); + return Entry.Symbols; +} + +/// If we have any deleted symbols for F, return them. +void AddrLabelMap::takeDeletedSymbolsForFunction( + Function *F, std::vector &Result) { + DenseMap, std::vector>::iterator I = + DeletedAddrLabelsNeedingEmission.find(F); + + // If there are no entries for the function, just return. 
+ if (I == DeletedAddrLabelsNeedingEmission.end()) + return; + + // Otherwise, take the list. + std::swap(Result, I->second); + DeletedAddrLabelsNeedingEmission.erase(I); +} + +//===- Address of Block Management ----------------------------------------===// + +ArrayRef<MCSymbol *> +AsmPrinter::getAddrLabelSymbolToEmit(const BasicBlock *BB) { + // Lazily create AddrLabelSymbols. + if (!AddrLabelSymbols) + AddrLabelSymbols = std::make_unique<AddrLabelMap>(OutContext); + return AddrLabelSymbols->getAddrLabelSymbolToEmit( + const_cast<BasicBlock *>(BB)); +} + +void AsmPrinter::takeDeletedSymbolsForFunction( + const Function *F, std::vector<MCSymbol *> &Result) { + // If no blocks have had their addresses taken, we're done. + if (!AddrLabelSymbols) + return; + return AddrLabelSymbols->takeDeletedSymbolsForFunction( + const_cast<Function *>(F), Result); +} + +void AddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) { + // If the block got deleted, there is no need for the symbol. If the symbol + // was already emitted, we can just forget about it, otherwise we need to + // queue it up for later emission when the function is output. + AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]); + AddrLabelSymbols.erase(BB); + assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?"); + BBCallbacks[Entry.Index] = nullptr; // Clear the callback. + +#if !LLVM_MEMORY_SANITIZER_BUILD + // BasicBlock is destroyed already, so this access is UB detectable by msan. + assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) && + "Block/parent mismatch"); +#endif + + for (MCSymbol *Sym : Entry.Symbols) { + if (Sym->isDefined()) + return; + + // If the block is not yet defined, we need to emit it at the end of the + // function. Add the symbol to the DeletedAddrLabelsNeedingEmission list + // for the containing Function. Since the block is being deleted, its + // parent may already be removed, we have to get the function from 'Entry'. + DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym); + } +} + +void AddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { + // Get the entry for the RAUW'd block and remove it from our map. + AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]); + AddrLabelSymbols.erase(Old); + assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?"); + + AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New]; + + // If New is not address taken, just move our symbol over to it. + if (NewEntry.Symbols.empty()) { + BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback. + NewEntry = std::move(OldEntry); // Set New's entry. + return; + } + + BBCallbacks[OldEntry.Index] = nullptr; // Update the callback. + + // Otherwise, we need to add the old symbols to the new block's set. + llvm::append_range(NewEntry.Symbols, OldEntry.Symbols); + } + +void AddrLabelMapCallbackPtr::deleted() { + Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr())); +} + +void AddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { + Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2)); +} + /// getGVAlignment - Return the alignment to use for the specified global /// value. This rounds up to the preferred alignment if possible and legal. Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL, @@ -271,6 +430,10 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { bool AsmPrinter::doInitialization(Module &M) { auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>(); MMI = MMIWP ?
&MMIWP->getMMI() : nullptr; + HasSplitStack = false; + HasNoSplitStack = false; + + AddrLabelSymbols = nullptr; // Initialize TargetLoweringObjectFile. const_cast<TargetLoweringObjectFile &>(getObjFileLowering()) @@ -281,9 +444,6 @@ bool AsmPrinter::doInitialization(Module &M) { OutStreamer->initSections(false, *TM.getMCSubtargetInfo()); - if (DisableDebugInfoPrinting) - MMI->setDebugInfoAvailability(false); - // Emit the version-min deployment target directive if needed. // // FIXME: If we end up with a collection of these sorts of Darwin-specific @@ -335,11 +495,11 @@ bool AsmPrinter::doInitialization(Module &M) { // Emit module-level inline asm if it exists. if (!M.getModuleInlineAsm().empty()) { OutStreamer->AddComment("Start of file scope inline assembly"); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); emitInlineAsm(M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(), TM.Options.MCOptions); OutStreamer->AddComment("End of file scope inline assembly"); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } if (MAI->doesSupportDebugInformation()) { @@ -351,7 +511,7 @@ bool AsmPrinter::doInitialization(Module &M) { CodeViewLineTablesGroupDescription); } if (!EmitCodeView || M.getDwarfVersion()) { - if (!DisableDebugInfoPrinting) { + if (MMI->hasDebugInfo()) { DD = new DwarfDebug(this); Handlers.emplace_back(std::unique_ptr<AsmPrinterHandler>(DD), DbgTimerName, DbgTimerDescription, DWARFGroupName, @@ -536,9 +696,9 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (isVerbose()) { // When printing the control variable __emutls_v.*, // we don't need to print the original TLS variable name. - GV->printAsOperand(OutStreamer->GetCommentOS(), - /*PrintType=*/false, GV->getParent()); - OutStreamer->GetCommentOS() << '\n'; + GV->printAsOperand(OutStreamer->getCommentOS(), - /*PrintType=*/false, GV->getParent()); + OutStreamer->getCommentOS() << '\n'; } } @@ -652,7 +812,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { TheSection = getObjFileLowering().getTLSBSSSection(); OutStreamer->emitTBSSSymbol(TheSection, MangSym, Size, Alignment.value()); } else if (GVKind.isThreadData()) { - OutStreamer->SwitchSection(TheSection); + OutStreamer->switchSection(TheSection); emitAlignment(Alignment, GV); OutStreamer->emitLabel(MangSym); @@ -661,12 +821,12 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { GV->getInitializer()); } - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); // Emit the variable struct for the runtime. MCSection *TLVSect = getObjFileLowering().getTLSExtraDataSection(); - OutStreamer->SwitchSection(TLVSect); + OutStreamer->switchSection(TLVSect); // Emit the linkage here.
emitLinkage(GV, GVSym); OutStreamer->emitLabel(GVSym); @@ -681,13 +841,13 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { OutStreamer->emitIntValue(0, PtrSize); OutStreamer->emitSymbolValue(MangSym, PtrSize); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); return; } MCSymbol *EmittedInitSym = GVSym; - OutStreamer->SwitchSection(TheSection); + OutStreamer->switchSection(TheSection); emitLinkage(GV, EmittedInitSym); emitAlignment(Alignment, GV); @@ -704,7 +864,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { OutStreamer->emitELFSize(EmittedInitSym, MCConstantExpr::create(Size, OutContext)); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } /// Emit the directive and value for debug thread local expression @@ -723,7 +883,7 @@ void AsmPrinter::emitFunctionHeader() { const Function &F = MF->getFunction(); if (isVerbose()) - OutStreamer->GetCommentOS() + OutStreamer->getCommentOS() << "-- Begin function " << GlobalValue::dropLLVMManglingEscape(F.getName()) << '\n'; @@ -737,7 +897,7 @@ void AsmPrinter::emitFunctionHeader() { MF->setSection(getObjFileLowering().getUniqueSectionForFunction(F, TM)); else MF->setSection(getObjFileLowering().SectionForGlobal(&F, TM)); - OutStreamer->SwitchSection(MF->getSection()); + OutStreamer->switchSection(MF->getSection()); if (!MAI->hasVisibilityOnlyWithLinkage()) emitVisibility(CurrentFnSym, F.getVisibility()); @@ -756,10 +916,10 @@ void AsmPrinter::emitFunctionHeader() { OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_Cold); if (isVerbose()) { - F.printAsOperand(OutStreamer->GetCommentOS(), - /*PrintType=*/false, F.getParent()); + F.printAsOperand(OutStreamer->getCommentOS(), + /*PrintType=*/false, F.getParent()); emitFunctionHeaderComment(); - OutStreamer->GetCommentOS() << '\n'; + OutStreamer->getCommentOS() << '\n'; } // Emit the prefix data. @@ -817,7 +977,7 @@ void AsmPrinter::emitFunctionHeader() { // references to the dangling symbols. Emit them at the start of the function // so that we don't get references to undefined symbols. std::vector<MCSymbol *> DeadBlockSyms; - MMI->takeDeletedSymbolsForFunction(&F, DeadBlockSyms); + takeDeletedSymbolsForFunction(&F, DeadBlockSyms); for (MCSymbol *DeadBlockSym : DeadBlockSyms) { OutStreamer->AddComment("Address taken block that was later removed"); OutStreamer->emitLabel(DeadBlockSym); @@ -844,6 +1004,24 @@ void AsmPrinter::emitFunctionHeader() { // Emit the prologue data. if (F.hasPrologueData()) emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrologueData()); + + // Emit the function prologue data for the indirect call sanitizer. + if (const MDNode *MD = F.getMetadata(LLVMContext::MD_func_sanitize)) { + assert(TM.getTargetTriple().getArch() == Triple::x86 || + TM.getTargetTriple().getArch() == Triple::x86_64); + assert(MD->getNumOperands() == 2); + + auto *PrologueSig = mdconst::extract<Constant>(MD->getOperand(0)); + auto *FTRTTIProxy = mdconst::extract<Constant>(MD->getOperand(1)); + assert(PrologueSig && FTRTTIProxy); + emitGlobalConstant(F.getParent()->getDataLayout(), PrologueSig); + + const MCExpr *Proxy = lowerConstant(FTRTTIProxy); + const MCExpr *FnExp = MCSymbolRefExpr::create(CurrentFnSym, OutContext); + const MCExpr *PCRel = MCBinaryExpr::createSub(Proxy, FnExp, OutContext); + // Use 32 bit since only small code model is supported.
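The new MD_func_sanitize block above lays down a signature constant followed by a 4-byte PC-relative offset to an RTTI proxy ahead of the function's entry label; the emitValue call just below writes that offset. A sketch of how a checker could turn the stored delta back into the proxy's address, assuming the 4 bytes sit immediately before the entry point with no padding (the reader function and its placement assumptions are illustrative, not a defined runtime contract):

#include <cstdint>
#include <cstring>

// Sketch: the emitted value is Proxy - FnEntry, so the proxy address is
// recovered by adding the stored int32 delta back to the entry address.
const void *loadRTTIProxy(const void *FnEntry) {
  int32_t Delta;
  std::memcpy(&Delta, static_cast<const char *>(FnEntry) - 4, sizeof(Delta));
  return static_cast<const char *>(FnEntry) + Delta;
}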
+ OutStreamer->emitValue(PCRel, 4u); + } } /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the @@ -912,7 +1090,7 @@ void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const { << printReg(RegNo, MF->getSubtarget().getRegisterInfo()); OutStreamer->AddComment(OS.str()); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { @@ -925,7 +1103,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo()); } AP.OutStreamer->AddComment(OS.str()); - AP.OutStreamer->AddBlankLine(); + AP.OutStreamer->addBlankLine(); } /// emitDebugValueComment - This method handles the target-independent form @@ -1147,32 +1325,42 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { const MCSymbol *FunctionSymbol = getFunctionBegin(); - OutStreamer->PushSection(); - OutStreamer->SwitchSection(BBAddrMapSection); + OutStreamer->pushSection(); + OutStreamer->switchSection(BBAddrMapSection); + OutStreamer->AddComment("version"); + OutStreamer->emitInt8(OutStreamer->getContext().getBBAddrMapVersion()); + OutStreamer->AddComment("feature"); + OutStreamer->emitInt8(0); + OutStreamer->AddComment("function address"); OutStreamer->emitSymbolValue(FunctionSymbol, getPointerSize()); - // Emit the total number of basic blocks in this function. + OutStreamer->AddComment("number of basic blocks"); OutStreamer->emitULEB128IntValue(MF.size()); + const MCSymbol *PrevMBBEndSymbol = FunctionSymbol; // Emit BB Information for each basic block in the funciton. for (const MachineBasicBlock &MBB : MF) { const MCSymbol *MBBSymbol = MBB.isEntryBlock() ? FunctionSymbol : MBB.getSymbol(); - // Emit the basic block offset. - emitLabelDifferenceAsULEB128(MBBSymbol, FunctionSymbol); + // Emit the basic block offset relative to the end of the previous block. + // This is zero unless the block is padded due to alignment. + emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol); // Emit the basic block size. When BBs have alignments, their size cannot // always be computed from their offsets. 
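Encoding each block's offset relative to the previous block's end keeps the ULEB128s tiny (zero unless alignment padding intervenes), at the cost of making the table sequential: a consumer must walk the blocks in order, carrying an address accumulator. A sketch of that recurrence over the fields emitted above; readULEB128 and recordBlock are hypothetical stand-ins for the consumer's decoder and sink:

// Sketch: rebuild [Start, End) per block from the entries emitted above:
// function address, block count, then (gap, size, metadata) triples.
uint64_t Addr = FuncAddress;
for (uint64_t I = 0; I != NumBlocks; ++I) {
  Addr += readULEB128();         // gap from the previous block's end
  uint64_t Size = readULEB128(); // block size, emitted next in the loop above
  uint64_t Meta = readULEB128(); // getBBAddrMapMetadata() flags
  recordBlock(Addr, Addr + Size, Meta);
  Addr += Size;                  // the next gap is relative to this end
}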
emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), MBBSymbol); OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB)); + PrevMBBEndSymbol = MBB.getEndSymbol(); } - OutStreamer->PopSection(); + OutStreamer->popSection(); } void AsmPrinter::emitPseudoProbe(const MachineInstr &MI) { - auto GUID = MI.getOperand(0).getImm(); - auto Index = MI.getOperand(1).getImm(); - auto Type = MI.getOperand(2).getImm(); - auto Attr = MI.getOperand(3).getImm(); - DILocation *DebugLoc = MI.getDebugLoc(); - PP->emitPseudoProbe(GUID, Index, Type, Attr, DebugLoc); + if (PP) { + auto GUID = MI.getOperand(0).getImm(); + auto Index = MI.getOperand(1).getImm(); + auto Type = MI.getOperand(2).getImm(); + auto Attr = MI.getOperand(3).getImm(); + DILocation *DebugLoc = MI.getDebugLoc(); + PP->emitPseudoProbe(GUID, Index, Type, Attr, DebugLoc); + } } void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) { @@ -1189,15 +1377,16 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) { if (FrameInfo.hasVarSizedObjects()) return; - OutStreamer->PushSection(); - OutStreamer->SwitchSection(StackSizeSection); + OutStreamer->pushSection(); + OutStreamer->switchSection(StackSizeSection); const MCSymbol *FunctionSymbol = getFunctionBegin(); - uint64_t StackSize = FrameInfo.getStackSize(); + uint64_t StackSize = + FrameInfo.getStackSize() + FrameInfo.getUnsafeStackSize(); OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); OutStreamer->emitULEB128IntValue(StackSize); - OutStreamer->PopSection(); + OutStreamer->popSection(); } void AsmPrinter::emitStackUsage(const MachineFunction &MF) { @@ -1208,7 +1397,8 @@ void AsmPrinter::emitStackUsage(const MachineFunction &MF) { return; const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - uint64_t StackSize = FrameInfo.getStackSize(); + uint64_t StackSize = + FrameInfo.getStackSize() + FrameInfo.getUnsafeStackSize(); if (StackUsageStream == nullptr) { std::error_code EC; @@ -1298,7 +1488,7 @@ void AsmPrinter::emitFunctionBody() { } if (isVerbose()) - emitComments(MI, OutStreamer->GetCommentOS()); + emitComments(MI, OutStreamer->getCommentOS()); switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: @@ -1460,7 +1650,7 @@ void AsmPrinter::emitFunctionBody() { } // Switch to the original section in case basic block sections was used. - OutStreamer->SwitchSection(MF->getSection()); + OutStreamer->switchSection(MF->getSection()); const Function &F = MF->getFunction(); for (const auto &BB : F) { @@ -1527,9 +1717,9 @@ void AsmPrinter::emitFunctionBody() { emitPatchableFunctionEntries(); if (isVerbose()) - OutStreamer->GetCommentOS() << "-- End function\n"; + OutStreamer->getCommentOS() << "-- End function\n"; - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } /// Compute the number of Global Variables that uses a Constant. @@ -1617,10 +1807,7 @@ void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) { // Treat bitcasts of functions as functions also. This is important at least // on WebAssembly where object and function addresses can't alias each other. if (!IsFunction) - if (auto *CE = dyn_cast<ConstantExpr>(GA.getAliasee())) - if (CE->getOpcode() == Instruction::BitCast) - IsFunction = - CE->getOperand(0)->getType()->getPointerElementType()->isFunctionTy(); + IsFunction = isa<Function>(GA.getAliasee()->stripPointerCasts()); // AIX's assembly directive `.set` is not usable for aliasing purpose, // so AIX has to use the extra-label-at-definition strategy.
At this @@ -1650,13 +1837,13 @@ void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) { if (IsFunction) { OutStreamer->emitSymbolAttribute(Name, MCSA_ELF_TypeFunction); if (TM.getTargetTriple().isOSBinFormatCOFF()) { - OutStreamer->BeginCOFFSymbolDef(Name); - OutStreamer->EmitCOFFSymbolStorageClass( + OutStreamer->beginCOFFSymbolDef(Name); + OutStreamer->emitCOFFSymbolStorageClass( GA.hasLocalLinkage() ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->endCOFFSymbolDef(); } } @@ -1734,7 +1921,7 @@ void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) { // Switch to the remarks section. MCSection *RemarksSection = OutContext.getObjectFileInfo()->getRemarksSection(); - OutStreamer->SwitchSection(RemarksSection); + OutStreamer->switchSection(RemarksSection); OutStreamer->emitBinaryData(OS.str()); } @@ -1805,7 +1992,7 @@ bool AsmPrinter::doFinalization(Module &M) { // Output stubs for external and common global variables. MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { - OutStreamer->SwitchSection(TLOF.getDataSection()); + OutStreamer->switchSection(TLOF.getDataSection()); const DataLayout &DL = M.getDataLayout(); emitAlignment(Align(DL.getPointerSize())); @@ -1829,7 +2016,7 @@ bool AsmPrinter::doFinalization(Module &M) { for (const auto &Stub : Stubs) { SmallString<256> SectionName = StringRef(".rdata$"); SectionName += Stub.first->getName(); - OutStreamer->SwitchSection(OutContext.getCOFFSection( + OutStreamer->switchSection(OutContext.getCOFFSection( SectionName, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_LNK_COMDAT, @@ -1920,31 +2107,14 @@ bool AsmPrinter::doFinalization(Module &M) { // Emit bytes for llvm.commandline metadata. emitModuleCommandLines(M); - // Emit __morestack address if needed for indirect calls. - if (MMI->usesMorestackAddr()) { - Align Alignment(1); - MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant( - getDataLayout(), SectionKind::getReadOnly(), - /*C=*/nullptr, Alignment); - OutStreamer->SwitchSection(ReadOnlySection); - - MCSymbol *AddrSymbol = - OutContext.getOrCreateSymbol(StringRef("__morestack_addr")); - OutStreamer->emitLabel(AddrSymbol); - - unsigned PtrSize = MAI->getCodePointerSize(); - OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("__morestack"), - PtrSize); - } - // Emit .note.GNU-split-stack and .note.GNU-no-split-stack sections if // split-stack is used. 
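The MMI queries go away because the printer now tracks split-stack state itself: SetupMachineFunction (further down in this file) accumulates two booleans while functions are emitted, and doFinalization only has to test them. The accumulation logic, restated as a sketch over a hypothetical list of machine functions:

// Sketch of the per-function flag accumulation that replaces
// MMI->hasSplitStack()/hasNosplitStack(); AllFunctions is hypothetical.
bool HasSplitStack = false, HasNoSplitStack = false;
for (const llvm::MachineFunction *MF : AllFunctions) {
  if (MF->shouldSplitStack()) {
    HasSplitStack = true;
    // A split-stack function that needs no prolog behaves like ordinary
    // code, so it also marks the module for the no-split-stack note.
    if (!MF->getFrameInfo().needsSplitStackProlog())
      HasNoSplitStack = true;
  } else {
    HasNoSplitStack = true;
  }
}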
- if (TM.getTargetTriple().isOSBinFormatELF() && MMI->hasSplitStack()) { - OutStreamer->SwitchSection( - OutContext.getELFSection(".note.GNU-split-stack", ELF::SHT_PROGBITS, 0)); - if (MMI->hasNosplitStack()) - OutStreamer->SwitchSection( - OutContext.getELFSection(".note.GNU-no-split-stack", ELF::SHT_PROGBITS, 0)); + if (TM.getTargetTriple().isOSBinFormatELF() && HasSplitStack) { + OutStreamer->switchSection(OutContext.getELFSection(".note.GNU-split-stack", + ELF::SHT_PROGBITS, 0)); + if (HasNoSplitStack) + OutStreamer->switchSection(OutContext.getELFSection( + ".note.GNU-no-split-stack", ELF::SHT_PROGBITS, 0)); } // If we don't have any trampolines, then we don't require stack memory @@ -1952,7 +2122,7 @@ bool AsmPrinter::doFinalization(Module &M) { Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline"); if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty()) if (MCSection *S = MAI->getNonexecutableStackSection(OutContext)) - OutStreamer->SwitchSection(S); + OutStreamer->switchSection(S); if (TM.Options.EmitAddrsig) { // Emit address-significance attributes for all globals. @@ -1973,7 +2143,7 @@ bool AsmPrinter::doFinalization(Module &M) { GV.getVisibility() != GlobalValue::DefaultVisibility) continue; - OutStreamer->SwitchSection( + OutStreamer->switchSection( OutContext.getELFSection(".llvm_sympart", ELF::SHT_LLVM_SYMPART, 0, 0, "", false, ++UniqueID, nullptr)); OutStreamer->emitBytes(GV.getPartition()); @@ -1989,8 +2159,9 @@ bool AsmPrinter::doFinalization(Module &M) { emitEndOfAsmFile(M); MMI = nullptr; + AddrLabelSymbols = nullptr; - OutStreamer->Finish(); + OutStreamer->finish(); OutStreamer->reset(); OwnedMLI.reset(); OwnedMDT.reset(); @@ -2009,6 +2180,16 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { this->MF = &MF; const Function &F = MF.getFunction(); + // Record that there are split-stack functions, so we will emit a special + // section to tell the linker. + if (MF.shouldSplitStack()) { + HasSplitStack = true; + + if (!MF.getFrameInfo().needsSplitStackProlog()) + HasNoSplitStack = true; + } else + HasNoSplitStack = true; + // Get the function symbol. if (!MAI->needsFunctionDescriptors()) { CurrentFnSym = getSymbol(&MF.getFunction()); @@ -2113,7 +2294,7 @@ void AsmPrinter::emitConstantPool() { continue; if (CurSection != CPSections[i].S) { - OutStreamer->SwitchSection(CPSections[i].S); + OutStreamer->switchSection(CPSections[i].S); emitAlignment(Align(CPSections[i].Alignment)); CurSection = CPSections[i].S; Offset = 0; @@ -2156,7 +2337,7 @@ void AsmPrinter::emitJumpTableInfo() { if (JTInDiffSection) { // Drop it in the readonly section. MCSection *ReadOnlySection = TLOF.getSectionForJumpTable(F, TM); - OutStreamer->SwitchSection(ReadOnlySection); + OutStreamer->switchSection(ReadOnlySection); } emitAlignment(Align(MJTI->getEntryAlignment(DL))); @@ -2392,7 +2573,7 @@ void AsmPrinter::emitXXStructorList(const DataLayout &DL, const Constant *List, MCSection *OutputSection = (IsCtor ? 
Obj.getStaticCtorSection(S.Priority, KeySym) : Obj.getStaticDtorSection(S.Priority, KeySym)); - OutStreamer->SwitchSection(OutputSection); + OutStreamer->switchSection(OutputSection); if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection()) emitAlignment(Align); emitXXStructor(DL, S.Func); @@ -2423,8 +2604,8 @@ void AsmPrinter::emitModuleCommandLines(Module &M) { if (!NMD || !NMD->getNumOperands()) return; - OutStreamer->PushSection(); - OutStreamer->SwitchSection(CommandLine); + OutStreamer->pushSection(); + OutStreamer->switchSection(CommandLine); OutStreamer->emitZeros(1); for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { const MDNode *N = NMD->getOperand(i); @@ -2434,7 +2615,7 @@ void AsmPrinter::emitModuleCommandLines(Module &M) { OutStreamer->emitBytes(S->getString()); OutStreamer->emitZeros(1); } - OutStreamer->PopSection(); + OutStreamer->popSection(); } //===--------------------------------------------------------------------===// @@ -2471,7 +2652,7 @@ void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, unsigned Size, bool IsSectionRelative) const { if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) { - OutStreamer->EmitCOFFSecRel32(Label, Offset); + OutStreamer->emitCOFFSecRel32(Label, Offset); if (Size > 4) OutStreamer->emitZeros(Size - 4); return; @@ -2541,6 +2722,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { llvm_unreachable("Unknown constant value to lower!"); } + // The constant expression opcodes are limited to those that are necessary + // to represent relocations on supported targets. Expressions involving only + // constant addresses are constant folded instead. switch (CE->getOpcode()) { case Instruction::AddrSpaceCast: { const Constant *Op = CE->getOperand(0); @@ -2658,34 +2842,17 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { return RelocExpr; } } + + const MCExpr *LHS = lowerConstant(CE->getOperand(0)); + const MCExpr *RHS = lowerConstant(CE->getOperand(1)); + return MCBinaryExpr::createSub(LHS, RHS, Ctx); + break; } - // else fallthrough - LLVM_FALLTHROUGH; - - // The MC library also has a right-shift operator, but it isn't consistently - // signed or unsigned between different targets. 
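After this hunk, lowerConstant builds MC binary expressions only for Add and Sub: relocatable object formats can generally express just symbol+addend and symbol-difference forms, so anything else (the Mul/SDiv/Shl/And/... cases removed below) is expected to have been constant-folded before reaching the asm printer. In miniature, the surviving lowering is:

#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"

// Sketch of the only binary ConstantExpr forms still lowered here, e.g.
// a label difference such as `.long .Lb - .La` comes from a Sub.
const llvm::MCExpr *lowerAddSub(unsigned Opcode, const llvm::MCExpr *LHS,
                                const llvm::MCExpr *RHS,
                                llvm::MCContext &Ctx) {
  using namespace llvm;
  switch (Opcode) {
  case Instruction::Add:
    return MCBinaryExpr::createAdd(LHS, RHS, Ctx); // symbol + addend
  case Instruction::Sub:
    return MCBinaryExpr::createSub(LHS, RHS, Ctx); // symbol - symbol
  default:
    llvm_unreachable("folded away before lowering");
  }
}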
- case Instruction::Add: - case Instruction::Mul: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::Shl: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { + + case Instruction::Add: { const MCExpr *LHS = lowerConstant(CE->getOperand(0)); const MCExpr *RHS = lowerConstant(CE->getOperand(1)); - switch (CE->getOpcode()) { - default: llvm_unreachable("Unknown binary operator constant cast expr"); - case Instruction::Add: return MCBinaryExpr::createAdd(LHS, RHS, Ctx); - case Instruction::Sub: return MCBinaryExpr::createSub(LHS, RHS, Ctx); - case Instruction::Mul: return MCBinaryExpr::createMul(LHS, RHS, Ctx); - case Instruction::SDiv: return MCBinaryExpr::createDiv(LHS, RHS, Ctx); - case Instruction::SRem: return MCBinaryExpr::createMod(LHS, RHS, Ctx); - case Instruction::Shl: return MCBinaryExpr::createShl(LHS, RHS, Ctx); - case Instruction::And: return MCBinaryExpr::createAnd(LHS, RHS, Ctx); - case Instruction::Or: return MCBinaryExpr::createOr (LHS, RHS, Ctx); - case Instruction::Xor: return MCBinaryExpr::createXor(LHS, RHS, Ctx); - } + return MCBinaryExpr::createAdd(LHS, RHS, Ctx); } } } @@ -2719,7 +2886,7 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { assert(Size % 8 == 0); // Extend the element to take zero padding into account. - APInt Value = CI->getValue().zextOrSelf(Size); + APInt Value = CI->getValue().zext(Size); if (!Value.isSplat(8)) return -1; @@ -2768,8 +2935,8 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, if (isa(CDS->getElementType())) { for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n", - CDS->getElementAsInteger(i)); + AP.OutStreamer->getCommentOS() + << format("0x%" PRIx64 "\n", CDS->getElementAsInteger(i)); AP.OutStreamer->emitIntValue(CDS->getElementAsInteger(i), ElementByteSize); } @@ -2855,8 +3022,8 @@ static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) { if (AP.isVerbose()) { SmallString<8> StrVal; APF.toString(StrVal); - ET->print(AP.OutStreamer->GetCommentOS()); - AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n'; + ET->print(AP.OutStreamer->getCommentOS()); + AP.OutStreamer->getCommentOS() << ' ' << StrVal << '\n'; } // Now iterate through the APInt chunks, emitting them in endian-correct @@ -3061,8 +3228,8 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, if (StoreSize <= 8) { if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n", - CI->getZExtValue()); + AP.OutStreamer->getCommentOS() + << format("0x%" PRIx64 "\n", CI->getZExtValue()); AP.OutStreamer->emitIntValue(CI->getZExtValue(), StoreSize); } else { emitGlobalConstantLargeInt(CI, AP); @@ -3163,11 +3330,12 @@ MCSymbol *AsmPrinter::createTempSymbol(const Twine &Name) const { } MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BlockAddress *BA) const { - return MMI->getAddrLabelSymbol(BA->getBasicBlock()); + return const_cast(this)->getAddrLabelSymbol( + BA->getBasicBlock()); } MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const { - return MMI->getAddrLabelSymbol(BB); + return const_cast(this)->getAddrLabelSymbol(BB); } /// GetCPISymbol - Return the symbol for the specified constant pool entry. @@ -3272,7 +3440,7 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, // Otherwise, it is a loop header. Print out information about child and // parent loops. 
- raw_ostream &OS = AP.OutStreamer->GetCommentOS(); + raw_ostream &OS = AP.OutStreamer->getCommentOS(); PrintParentLoopComment(OS, Loop->getParentLoop(), AP.getFunctionNumber()); @@ -3308,7 +3476,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { // entry block is always placed in the function section and is handled // separately. if (MBB.isBeginSection() && !MBB.isEntryBlock()) { - OutStreamer->SwitchSection( + OutStreamer->switchSection( getObjFileLowering().getSectionForMachineBasicBlock(MF->getFunction(), MBB, TM)); CurrentSectionBeginSym = MBB.getSymbol(); @@ -3326,7 +3494,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { // MBBs can have their address taken as part of CodeGen without having // their corresponding BB's address taken in IR if (BB && BB->hasAddressTaken()) - for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) + for (MCSymbol *Sym : getAddrLabelSymbolToEmit(BB)) OutStreamer->emitLabel(Sym); } @@ -3334,9 +3502,9 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { if (isVerbose()) { if (BB) { if (BB->hasName()) { - BB->printAsOperand(OutStreamer->GetCommentOS(), + BB->printAsOperand(OutStreamer->getCommentOS(), /*PrintType=*/false, BB->getModule()); - OutStreamer->GetCommentOS() << '\n'; + OutStreamer->getCommentOS() << '\n'; } } @@ -3563,7 +3731,7 @@ void AsmPrinter::emitXRayTable() { // range of sleds associated with a function. auto &Ctx = OutContext; MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true); - OutStreamer->SwitchSection(InstMap); + OutStreamer->switchSection(InstMap); OutStreamer->emitLabel(SledsStart); for (const auto &Sled : Sleds) { MCSymbol *Dot = Ctx.createTempSymbol(); @@ -3590,11 +3758,11 @@ void AsmPrinter::emitXRayTable() { // Each entry here will be 2 * word size aligned, as we're writing down two // pointers. This should work for both 32-bit and 64-bit platforms. 
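Each function-index record is just the two word-sized values emitted below, [SledsStart, SledsEnd), so the table costs two pointers per instrumented function and lets a runtime find every sled of a function without scanning the whole instrumentation map. As a sketch, the in-memory shape a reader might overlay on the section (field names hypothetical):

#include <cstddef>

// Sketch: one xray_fn_idx entry as laid out by the directives below,
// aligned to twice the pointer size.
struct alignas(2 * sizeof(void *)) XRayFnIdxEntry {
  const void *SledsStart; // first sled belonging to this function
  const void *SledsEnd;   // one past this function's last sled
};
static_assert(sizeof(XRayFnIdxEntry) == 2 * sizeof(void *),
              "entries are exactly two machine words");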
if (FnSledIndex) { - OutStreamer->SwitchSection(FnSledIndex); + OutStreamer->switchSection(FnSledIndex); OutStreamer->emitCodeAlignment(2 * WordSizeBytes, &getSubtargetInfo()); OutStreamer->emitSymbolValue(SledsStart, WordSizeBytes, false); OutStreamer->emitSymbolValue(SledsEnd, WordSizeBytes, false); - OutStreamer->SwitchSection(PrevSection); + OutStreamer->switchSection(PrevSection); } Sleds.clear(); } @@ -3639,7 +3807,7 @@ void AsmPrinter::emitPatchableFunctionEntries() { } LinkedToSym = cast(CurrentFnSym); } - OutStreamer->SwitchSection(OutContext.getELFSection( + OutStreamer->switchSection(OutContext.getELFSection( "__patchable_function_entries", ELF::SHT_PROGBITS, Flags, 0, GroupName, F.hasComdat(), MCSection::NonUniqueID, LinkedToSym)); emitAlignment(Align(PointerSize)); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index fc127f4cf9da..719fec06aa33 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "ByteStreamer.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -19,14 +18,11 @@ #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" #include using namespace llvm; @@ -162,7 +158,7 @@ void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label, if (MAI->needsDwarfSectionOffsetDirective()) { assert(!isDwarf64() && "emitting DWARF64 is not implemented for COFF targets"); - OutStreamer->EmitCOFFSecRel32(Label, /*Offset=*/0); + OutStreamer->emitCOFFSecRel32(Label, /*Offset=*/0); return; } @@ -277,6 +273,12 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpUndefined: OutStreamer->emitCFIUndefined(Inst.getRegister()); break; + case MCCFIInstruction::OpRememberState: + OutStreamer->emitCFIRememberState(); + break; + case MCCFIInstruction::OpRestoreState: + OutStreamer->emitCFIRestoreState(); + break; } } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 5d0cadefdbf7..88c82cbc958b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -17,8 +17,8 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -26,9 +26,10 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -115,7 +116,7 @@ void AsmPrinter::emitInlineAsm(StringRef Str, 
const MCSubtargetInfo &STI, report_fatal_error("Inline asm not supported by this streamer because" " we don't have an asm parser for this target\n"); Parser->setAssemblerDialect(Dialect); - Parser->setTargetParser(*TAP.get()); + Parser->setTargetParser(*TAP); // Enable lexing Masm binary and hex integer literals in intel inline // assembly. if (Dialect == InlineAsm::AD_Intel) @@ -398,9 +399,9 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const { if (!RestrRegs.empty()) { std::string Msg = "inline asm clobber list contains reserved registers: "; ListSeparator LS; - for (const Register &RR : RestrRegs) { + for (const Register RR : RestrRegs) { Msg += LS; - Msg += TRI->getName(RR); + Msg += TRI->getRegAsmName(RR); } const char *Note = "Reserved registers on the clobber list may not be " diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 52c74713551c..701c0affdfa6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "CodeViewDebug.h" -#include "DwarfExpression.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -29,7 +28,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -41,7 +39,6 @@ #include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableCollection.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" @@ -58,11 +55,8 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -230,7 +224,7 @@ unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) { break; } } - bool Success = OS.EmitCVFileDirective(NextId, FullPath, ChecksumAsBytes, + bool Success = OS.emitCVFileDirective(NextId, FullPath, ChecksumAsBytes, static_cast(CSKind)); (void)Success; assert(Success && ".cv_file directive failed"); @@ -251,7 +245,7 @@ CodeViewDebug::getInlineSite(const DILocation *InlinedAt, .SiteFuncId; Site->SiteFuncId = NextFuncId++; - OS.EmitCVInlineSiteIdDirective( + OS.emitCVInlineSiteIdDirective( Site->SiteFuncId, ParentFuncId, maybeRecordFile(InlinedAt->getFile()), InlinedAt->getLine(), InlinedAt->getColumn(), SMLoc()); Site->Inlinee = Inlinee; @@ -515,7 +509,7 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL, if (!DL || DL == PrevInstLoc) return; - const DIScope *Scope = DL.get()->getScope(); + const DIScope *Scope = DL->getScope(); if (!Scope) return; @@ -614,18 +608,16 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { void CodeViewDebug::beginModule(Module *M) { // If module doesn't have named metadata anchors or COFF debug section // is not available, skip 
any debug info related stuff. - NamedMDNode *CUs = M->getNamedMetadata("llvm.dbg.cu"); - if (!CUs || !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { + if (!MMI->hasDebugInfo() || + !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { Asm = nullptr; return; } - // Tell MMI that we have and need debug info. - MMI->setDebugInfoAvailability(true); TheCPU = mapArchToCVCPUType(Triple(M->getTargetTriple()).getArch()); // Get the current source language. - const MDNode *Node = *CUs->operands().begin(); + const MDNode *Node = *M->debug_compile_units_begin(); const auto *CU = cast<DICompileUnit>(Node); CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage()); @@ -727,7 +719,7 @@ void CodeViewDebug::emitTypeInformation() { return; // Start the .debug$T or .debug$P section with 0x4. - OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection()); + OS.switchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection()); emitCodeViewMagicVersion(); TypeTableCollection Table(TypeTable.records()); @@ -760,7 +752,7 @@ void CodeViewDebug::emitTypeGlobalHashes() { // Start the .debug$H section with the version and hash algorithm, currently // hardcoded to version 0, SHA1. - OS.SwitchSection(Asm->getObjFileLowering().getCOFFGlobalTypeHashesSection()); + OS.switchSection(Asm->getObjFileLowering().getCOFFGlobalTypeHashesSection()); OS.emitValueToAlignment(4); OS.AddComment("Magic"); @@ -826,6 +818,8 @@ static Version parseVersion(StringRef Name) { if (isdigit(C)) { V.Part[N] *= 10; V.Part[N] += C - '0'; + V.Part[N] = + std::min<int>(V.Part[N], std::numeric_limits<uint16_t>::max()); } else if (C == '.') { ++N; if (N >= 4) @@ -867,7 +861,6 @@ void CodeViewDebug::emitCompilerInformation() { Version FrontVer = parseVersion(CompilerVersion); OS.AddComment("Frontend version"); for (int N : FrontVer.Part) { - N = std::min<int>(N, std::numeric_limits<uint16_t>::max()); OS.emitInt16(N); } @@ -985,11 +978,11 @@ void CodeViewDebug::emitInlineeLinesSubsection() { assert(TypeIndices.count({SP, nullptr})); TypeIndex InlineeIdx = TypeIndices[{SP, nullptr}]; - OS.AddBlankLine(); + OS.addBlankLine(); unsigned FileId = maybeRecordFile(SP->getFile()); OS.AddComment("Inlined function " + SP->getName() + " starts at " + SP->getFilename() + Twine(':') + Twine(SP->getLine())); - OS.AddBlankLine(); + OS.addBlankLine(); OS.AddComment("Type index of inlined function"); OS.emitInt32(InlineeIdx.getIndex()); OS.AddComment("Offset into filechecksum table"); @@ -1051,7 +1044,7 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) { Asm->getObjFileLowering().getCOFFDebugSymbolsSection()); DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym); - OS.SwitchSection(DebugSec); + OS.switchSection(DebugSec); // Emit the magic version number if this is the first time we've switched to // this section. @@ -1080,9 +1073,9 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV, OS.AddComment("PtrNext"); OS.emitInt32(0); OS.AddComment("Thunk section relative address"); - OS.EmitCOFFSecRel32(Fn, /*Offset=*/0); + OS.emitCOFFSecRel32(Fn, /*Offset=*/0); OS.AddComment("Thunk section index"); - OS.EmitCOFFSectionIndex(Fn); + OS.emitCOFFSectionIndex(Fn); OS.AddComment("Code size"); OS.emitAbsoluteSymbolDiff(FI.End, Fn, 2); OS.AddComment("Ordinal"); @@ -1132,7 +1125,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, // Emit FPO data, but only on 32-bit x86. No other platforms use it.
if (Triple(MMI->getModule()->getTargetTriple()).getArch() == Triple::x86) - OS.EmitCVFPOData(Fn); + OS.emitCVFPOData(Fn); // Emit a symbol subsection, required by VS2012+ to find function boundaries. OS.AddComment("Symbol subsection for " + Twine(FuncName)); @@ -1160,9 +1153,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, OS.AddComment("Function type index"); OS.emitInt32(getFuncIdForSubprogram(GV->getSubprogram()).getIndex()); OS.AddComment("Function section relative address"); - OS.EmitCOFFSecRel32(Fn, /*Offset=*/0); + OS.emitCOFFSecRel32(Fn, /*Offset=*/0); OS.AddComment("Function section index"); - OS.EmitCOFFSectionIndex(Fn); + OS.emitCOFFSectionIndex(Fn); OS.AddComment("Flags"); OS.emitInt8(0); // Emit the function display name as a null-terminated string. @@ -1207,9 +1200,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, MCSymbol *Label = Annot.first; MDTuple *Strs = cast(Annot.second); MCSymbol *AnnotEnd = beginSymbolRecord(SymbolKind::S_ANNOTATION); - OS.EmitCOFFSecRel32(Label, /*Offset=*/0); + OS.emitCOFFSecRel32(Label, /*Offset=*/0); // FIXME: Make sure we don't overflow the max record size. - OS.EmitCOFFSectionIndex(Label); + OS.emitCOFFSectionIndex(Label); OS.emitInt16(Strs->getNumOperands()); for (Metadata *MD : Strs->operands()) { // MDStrings are null terminated, so we can do EmitBytes and get the @@ -1227,9 +1220,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, const DIType *DITy = std::get<2>(HeapAllocSite); MCSymbol *HeapAllocEnd = beginSymbolRecord(SymbolKind::S_HEAPALLOCSITE); OS.AddComment("Call site offset"); - OS.EmitCOFFSecRel32(BeginLabel, /*Offset=*/0); + OS.emitCOFFSecRel32(BeginLabel, /*Offset=*/0); OS.AddComment("Call site section index"); - OS.EmitCOFFSectionIndex(BeginLabel); + OS.emitCOFFSectionIndex(BeginLabel); OS.AddComment("Call instruction length"); OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2); OS.AddComment("Type index"); @@ -1249,9 +1242,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, OS.emitCVLinetableDirective(FI.FuncId, Fn, FI.End); } -CodeViewDebug::LocalVarDefRange +CodeViewDebug::LocalVarDef CodeViewDebug::createDefRangeMem(uint16_t CVRegister, int Offset) { - LocalVarDefRange DR; + LocalVarDef DR; DR.InMemory = -1; DR.DataOffset = Offset; assert(DR.DataOffset == Offset && "truncation"); @@ -1303,19 +1296,19 @@ void CodeViewDebug::collectVariableInfoFromMFTable( "Frame offsets with a scalable component are not supported"); // Calculate the label ranges. - LocalVarDefRange DefRange = + LocalVarDef DefRange = createDefRangeMem(CVReg, FrameOffset.getFixed() + ExprOffset); + LocalVariable Var; + Var.DIVar = VI.Var; + for (const InsnRange &Range : Scope->getRanges()) { const MCSymbol *Begin = getLabelBeforeInsn(Range.first); const MCSymbol *End = getLabelAfterInsn(Range.second); End = End ? End : Asm->getFunctionEnd(); - DefRange.Ranges.emplace_back(Begin, End); + Var.DefRanges[DefRange].emplace_back(Begin, End); } - LocalVariable Var; - Var.DIVar = VI.Var; - Var.DefRanges.emplace_back(std::move(DefRange)); if (Deref) Var.UseReferenceType = true; @@ -1374,24 +1367,18 @@ void CodeViewDebug::calculateRanges( // We can only handle a register or an offseted load of a register. if (Location->Register == 0 || Location->LoadChain.size() > 1) continue; - { - LocalVarDefRange DR; - DR.CVRegister = TRI->getCodeViewRegNum(Location->Register); - DR.InMemory = !Location->LoadChain.empty(); - DR.DataOffset = - !Location->LoadChain.empty() ? 
Location->LoadChain.back() : 0; + if (Location->FragmentInfo) { + DR.IsSubfield = true; + DR.StructOffset = Location->FragmentInfo->OffsetInBits / 8; + } else { + DR.IsSubfield = false; + DR.StructOffset = 0; } // Compute the label range. @@ -1408,7 +1395,7 @@ void CodeViewDebug::calculateRanges( // If the last range end is our begin, just extend the last range. // Otherwise make a new range. SmallVectorImpl<std::pair<const MCSymbol *, const MCSymbol *>> &R = - Var.DefRanges.back().Ranges; + Var.DefRanges[DR]; if (!R.empty() && R.back().second == Begin) R.back().second = End; else @@ -1525,7 +1512,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { // FIXME: Set GuardCfg when it is implemented. CurFn->FrameProcOpts = FPO; - OS.EmitCVFuncIdDirective(CurFn->FuncId); + OS.emitCVFuncIdDirective(CurFn->FuncId); // Find the end of the function prolog. First known non-DBG_VALUE and // non-frame setup location marks the beginning of the function body. @@ -1825,6 +1812,7 @@ TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) { break; case dwarf::DW_ATE_UTF: switch (ByteSize) { + case 1: STK = SimpleTypeKind::Character8; break; case 2: STK = SimpleTypeKind::Character16; break; case 4: STK = SimpleTypeKind::Character32; break; } @@ -2820,7 +2808,9 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, // records and on disk formats are described in SymbolRecords.h. BytePrefix // should be big enough to hold all forms without memory allocation.
SmallString<20> BytePrefix; - for (const LocalVarDefRange &DefRange : Var.DefRanges) { + for (const auto &Pair : Var.DefRanges) { + LocalVarDef DefRange = Pair.first; + const auto &Ranges = Pair.second; BytePrefix.clear(); if (DefRange.InMemory) { int Offset = DefRange.DataOffset; @@ -2844,7 +2834,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, : (EncFP == FI.EncodedLocalFramePtrReg))) { DefRangeFramePointerRelHeader DRHdr; DRHdr.Offset = Offset; - OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr); + OS.emitCVDefRangeDirective(Ranges, DRHdr); } else { uint16_t RegRelFlags = 0; if (DefRange.IsSubfield) { @@ -2856,7 +2846,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, DRHdr.Register = Reg; DRHdr.Flags = RegRelFlags; DRHdr.BasePointerOffset = Offset; - OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr); + OS.emitCVDefRangeDirective(Ranges, DRHdr); } } else { assert(DefRange.DataOffset == 0 && "unexpected offset into register"); @@ -2865,12 +2855,12 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, DRHdr.Register = DefRange.CVRegister; DRHdr.MayHaveNoName = 0; DRHdr.OffsetInParent = DefRange.StructOffset; - OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr); + OS.emitCVDefRangeDirective(Ranges, DRHdr); } else { DefRangeRegisterHeader DRHdr; DRHdr.Register = DefRange.CVRegister; DRHdr.MayHaveNoName = 0; - OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr); + OS.emitCVDefRangeDirective(Ranges, DRHdr); } } } @@ -2894,9 +2884,9 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block, OS.AddComment("Code size"); OS.emitAbsoluteSymbolDiff(Block.End, Block.Begin, 4); // Code Size OS.AddComment("Function section relative address"); - OS.EmitCOFFSecRel32(Block.Begin, /*Offset=*/0); // Func Offset + OS.emitCOFFSecRel32(Block.Begin, /*Offset=*/0); // Func Offset OS.AddComment("Function section index"); - OS.EmitCOFFSectionIndex(FI.Begin); // Func Symbol + OS.emitCOFFSectionIndex(FI.Begin); // Func Symbol OS.AddComment("Lexical block name"); emitNullTerminatedSymbolName(OS, Block.Name); // Name endSymbolRecord(RecordEnd); @@ -3181,6 +3171,11 @@ void CodeViewDebug::collectGlobalVariableInfo() { for (const auto *GVE : CU->getGlobalVariables()) { const DIGlobalVariable *DIGV = GVE->getVariable(); const DIExpression *DIE = GVE->getExpression(); + // Don't emit string literals in CodeView, as the only useful parts are + // generally the filename and line number, which isn't possible to output + // in CodeView. String literals should be the only unnamed GlobalVariable + // with debug info. + if (DIGV->getName().empty()) continue; if ((DIE->getNumElements() == 2) && (DIE->getElement(0) == dwarf::DW_OP_plus_uconst)) @@ -3380,10 +3375,10 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) { if (CVGlobalVariableOffsets.find(DIGV) != CVGlobalVariableOffsets.end()) // Use the offset seen while collecting info on globals. 
Offset = CVGlobalVariableOffsets[DIGV]; - OS.EmitCOFFSecRel32(GVSym, Offset); + OS.emitCOFFSecRel32(GVSym, Offset); OS.AddComment("Segment"); - OS.EmitCOFFSectionIndex(GVSym); + OS.emitCOFFSectionIndex(GVSym); OS.AddComment("Name"); const unsigned LengthOfDataRecord = 12; emitNullTerminatedSymbolName(OS, QualifiedName, LengthOfDataRecord); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index d1fc3cdccb20..16f0082723ed 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -50,18 +50,8 @@ class MachineFunction; /// Collects and handles line tables information in a CodeView format. class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { - MCStreamer &OS; - BumpPtrAllocator Allocator; - codeview::GlobalTypeTableBuilder TypeTable; - - /// Whether to emit type record hashes into .debug$H. - bool EmitDebugGlobalHashes = false; - - /// The codeview CPU type used by the translation unit. - codeview::CPUType TheCPU; - - /// Represents the most general definition range. - struct LocalVarDefRange { +public: + struct LocalVarDef { /// Indicates that variable data is stored in memory relative to the /// specified register. int InMemory : 1; @@ -79,23 +69,40 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// location containing the data. uint16_t CVRegister; - /// Compares all location fields. This includes all fields except the label - /// ranges. - bool isDifferentLocation(LocalVarDefRange &O) { - return InMemory != O.InMemory || DataOffset != O.DataOffset || - IsSubfield != O.IsSubfield || StructOffset != O.StructOffset || - CVRegister != O.CVRegister; + uint64_t static toOpaqueValue(const LocalVarDef DR) { + uint64_t Val = 0; + std::memcpy(&Val, &DR, sizeof(Val)); + return Val; } - SmallVector<std::pair<const MCSymbol *, const MCSymbol *>, 1> Ranges; + LocalVarDef static createFromOpaqueValue(uint64_t Val) { + LocalVarDef DR; + std::memcpy(&DR, &Val, sizeof(Val)); + return DR; + } }; - static LocalVarDefRange createDefRangeMem(uint16_t CVRegister, int Offset); + static_assert(sizeof(uint64_t) == sizeof(LocalVarDef), ""); + +private: + MCStreamer &OS; + BumpPtrAllocator Allocator; + codeview::GlobalTypeTableBuilder TypeTable; + + /// Whether to emit type record hashes into .debug$H. + bool EmitDebugGlobalHashes = false; + + /// The codeview CPU type used by the translation unit. + codeview::CPUType TheCPU; + + static LocalVarDef createDefRangeMem(uint16_t CVRegister, int Offset); /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
struct LocalVariable { const DILocalVariable *DIVar = nullptr; - SmallVector<LocalVarDefRange, 1> DefRanges; + MapVector<LocalVarDef, SmallVector<std::pair<const MCSymbol *, const MCSymbol *>, 1>> + DefRanges; bool UseReferenceType = false; }; @@ -493,6 +500,27 @@ public: void beginInstruction(const MachineInstr *MI) override; }; +template <> struct DenseMapInfo<CodeViewDebug::LocalVarDef> { + + static inline CodeViewDebug::LocalVarDef getEmptyKey() { + return CodeViewDebug::LocalVarDef::createFromOpaqueValue(~0ULL); + } + + static inline CodeViewDebug::LocalVarDef getTombstoneKey() { + return CodeViewDebug::LocalVarDef::createFromOpaqueValue(~0ULL - 1ULL); + } + + static unsigned getHashValue(const CodeViewDebug::LocalVarDef &DR) { + return CodeViewDebug::LocalVarDef::toOpaqueValue(DR) * 37ULL; + } + + static bool isEqual(const CodeViewDebug::LocalVarDef &LHS, + const CodeViewDebug::LocalVarDef &RHS) { + return CodeViewDebug::LocalVarDef::toOpaqueValue(LHS) == + CodeViewDebug::LocalVarDef::toOpaqueValue(RHS); + } +}; + } // end namespace llvm #endif // LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 396322c4979d..617ddbd66e4e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -13,21 +13,15 @@ #include "llvm/CodeGen/DIE.h" #include "DwarfCompileUnit.h" #include "DwarfDebug.h" -#include "DwarfUnit.h" -#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -170,7 +164,7 @@ DIEAbbrev &DIEAbbrevSet::uniqueAbbreviation(DIE &Die) { void DIEAbbrevSet::Emit(const AsmPrinter *AP, MCSection *Section) const { if (!Abbreviations.empty()) { // Start the debug abbrev section. - AP->OutStreamer->SwitchSection(Section); + AP->OutStreamer->switchSection(Section); AP->emitDwarfAbbrevs(Abbreviations); } } @@ -204,6 +198,7 @@ const DIE *DIE::getUnitDie() const { const DIE *p = this; while (p) { if (p->getTag() == dwarf::DW_TAG_compile_unit || + p->getTag() == dwarf::DW_TAG_skeleton_unit || p->getTag() == dwarf::DW_TAG_type_unit) return p; p = p->getParent(); @@ -378,7 +373,7 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this?
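The DenseMapInfo<CodeViewDebug::LocalVarDef> specialization above leans on LocalVarDef being a trivially copyable struct whose bitfields pack to exactly 64 bits: memcpy'ing it through a uint64_t yields a well-defined key, and two bit patterns no real def range produces (~0 and ~0 - 1) serve as the empty and tombstone keys. The same pattern in miniature, on a hypothetical 8-byte POD key:

#include "llvm/ADT/DenseMap.h"
#include <cstdint>
#include <cstring>

struct TinyKey { // hypothetical key: 8 bytes, no padding
  int32_t Offset;
  uint16_t Reg;
  uint16_t Flags;
  uint64_t toOpaque() const {
    uint64_t V = 0;
    std::memcpy(&V, this, sizeof(V)); // bit-identical, padding-safe copy
    return V;
  }
  static TinyKey fromOpaque(uint64_t V) {
    TinyKey K;
    std::memcpy(&K, &V, sizeof(K));
    return K;
  }
};
static_assert(sizeof(TinyKey) == sizeof(uint64_t), "key must fill the word");

namespace llvm {
template <> struct DenseMapInfo<TinyKey> {
  static TinyKey getEmptyKey() { return TinyKey::fromOpaque(~0ULL); }
  static TinyKey getTombstoneKey() { return TinyKey::fromOpaque(~0ULL - 1ULL); }
  static unsigned getHashValue(const TinyKey &K) {
    return unsigned(K.toOpaque() * 37ULL); // same cheap mix as above
  }
  static bool isEqual(const TinyKey &A, const TinyKey &B) {
    return A.toOpaque() == B.toOpaque();
  }
};
} // end namespace llvm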
- Asm->OutStreamer->AddBlankLine(); + Asm->OutStreamer->addBlankLine(); return; case dwarf::DW_FORM_flag: case dwarf::DW_FORM_ref1: diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index e175854f7b93..5da50d7aab9f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -19,7 +19,6 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index dd795079ac1a..1358f4d25990 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/DbgEntityHistoryCalculator.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -204,7 +203,7 @@ void DbgValueHistoryMap::trimLocationRanges( if (auto R = intersects(StartMI, EndMI, ScopeRanges, Ordering)) { // Adjust ScopeRanges to exclude ranges which subsequent location ranges // cannot possibly intersect. - ScopeRanges = ArrayRef(R.getValue(), ScopeRanges.end()); + ScopeRanges = ArrayRef(*R, ScopeRanges.end()); } else { // If the location range does not intersect any scope range then the // DBG_VALUE which opened this location range is usless, mark it for diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 18fc46c74eb4..660a064687d3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -13,7 +13,6 @@ #include "llvm/CodeGen/DebugHandlerBase.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 63343d2519f9..5f187acf13dc 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -11,23 +11,13 @@ //===----------------------------------------------------------------------===// #include "DwarfException.h" -#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MachineLocation.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -53,7 +43,7 @@ void DwarfCFIExceptionBase::endFragment() { DwarfCFIException::DwarfCFIException(AsmPrinter *A) : DwarfCFIExceptionBase(A) {} -DwarfCFIException::~DwarfCFIException() {} +DwarfCFIException::~DwarfCFIException() = default; /// endModule - Emit all exception information that 
should come after the /// content. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 5913c687db48..b3f99d346faa 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -67,13 +66,13 @@ DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, /// DW_FORM_addr or DW_FORM_GNU_addr_index. void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label) { + if ((Skeleton || !DD->useSplitDwarf()) && Label) + DD->addArangeLabel(SymbolCU(this, Label)); + // Don't use the address pool in non-fission or in the skeleton unit itself. if ((!DD->useSplitDwarf() || !Skeleton) && DD->getDwarfVersion() < 5) return addLocalLabelAddress(Die, Attribute, Label); - if (Label) - DD->addArangeLabel(SymbolCU(this, Label)); - bool UseAddrOffsetFormOrExpressions = DD->useAddrOffsetForm() || DD->useAddrOffsetExpressions(); @@ -108,9 +107,6 @@ void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute, void DwarfCompileUnit::addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label) { - if (Label) - DD->addArangeLabel(SymbolCU(this, Label)); - if (Label) addAttribute(Die, Attribute, dwarf::DW_FORM_addr, DIELabel(Label)); else @@ -169,7 +165,9 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( } else { DeclContext = GV->getScope(); // Add name and type. - addString(*VariableDIE, dwarf::DW_AT_name, GV->getDisplayName()); + StringRef DisplayName = GV->getDisplayName(); + if (!DisplayName.empty()) + addString(*VariableDIE, dwarf::DW_AT_name, GV->getDisplayName()); if (GTy) addType(*VariableDIE, GTy); @@ -303,8 +301,11 @@ void DwarfCompileUnit::addLocationAttribute( DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address : dwarf::DW_OP_form_tls_address); } - } else if (Asm->TM.getRelocationModel() == Reloc::RWPI || - Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) { + } else if ((Asm->TM.getRelocationModel() == Reloc::RWPI || + Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) && + !Asm->getObjFileLowering() + .getKindForGlobal(Global, Asm->TM) + .isReadOnly()) { auto FormAndOp = GetPointerSizedFormAndOp(); // Constant addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op); @@ -505,7 +506,7 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { // FIXME: when writing dwo, we need to avoid relocations. Probably // the "right" solution is to treat globals the way func and data // symbols are (with entries in .debug_addr). - // For now, since we only ever use index 0, this should work as-is. + // For now, since we only ever use index 0, this should work as-is. 
addUInt(*Loc, dwarf::DW_FORM_data4, FrameBase.Location.WasmLoc.Index); } addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index f2e1f6346803..61412cde34c8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -25,7 +25,6 @@ #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/Casting.h" -#include #include #include #include diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 609b568f28be..866338a949f3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -31,8 +31,8 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -45,14 +45,11 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MachineLocation.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MD5.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -360,7 +357,7 @@ DwarfDebug::DwarfDebug(AsmPrinter *A) DebuggerTuning = Asm->TM.Options.DebuggerTuning; else if (IsDarwin) DebuggerTuning = DebuggerKind::LLDB; - else if (TT.isPS4CPU()) + else if (TT.isPS()) DebuggerTuning = DebuggerKind::SCE; else if (TT.isOSAIX()) DebuggerTuning = DebuggerKind::DBX; @@ -2315,7 +2312,7 @@ void DwarfDebug::emitStringOffsetsTableHeader() { template void DwarfDebug::emitAccel(AccelTableT &Accel, MCSection *Section, StringRef TableName) { - Asm->OutStreamer->SwitchSection(Section); + Asm->OutStreamer->switchSection(Section); // Emit the full data. emitAppleAccelTable(Asm, Accel, TableName, Section->getBeginSymbol()); @@ -2434,12 +2431,12 @@ void DwarfDebug::emitDebugPubSections() { bool GnuStyle = TheU->getCUNode()->getNameTableKind() == DICompileUnit::DebugNameTableKind::GNU; - Asm->OutStreamer->SwitchSection( + Asm->OutStreamer->switchSection( GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection() : Asm->getObjFileLowering().getDwarfPubNamesSection()); emitDebugPubSection(GnuStyle, "Names", TheU, TheU->getGlobalNames()); - Asm->OutStreamer->SwitchSection( + Asm->OutStreamer->switchSection( GnuStyle ? 
Asm->getObjFileLowering().getDwarfGnuPubTypesSection() : Asm->getObjFileLowering().getDwarfPubTypesSection()); emitDebugPubSection(GnuStyle, "Types", TheU, TheU->getGlobalTypes()); @@ -2849,7 +2846,7 @@ void DwarfDebug::emitDebugLocImpl(MCSection *Sec) { if (DebugLocs.getLists().empty()) return; - Asm->OutStreamer->SwitchSection(Sec); + Asm->OutStreamer->switchSection(Sec); MCSymbol *TableEnd = nullptr; if (getDwarfVersion() >= 5) @@ -2880,7 +2877,7 @@ void DwarfDebug::emitDebugLocDWO() { } for (const auto &List : DebugLocs.getLists()) { - Asm->OutStreamer->SwitchSection( + Asm->OutStreamer->switchSection( Asm->getObjFileLowering().getDwarfLocDWOSection()); Asm->OutStreamer->emitLabel(List.Label); @@ -2953,8 +2950,8 @@ void DwarfDebug::emitDebugARanges() { // Sort the symbols by offset within the section. llvm::stable_sort(List, [&](const SymbolCU &A, const SymbolCU &B) { - unsigned IA = A.Sym ? Asm->OutStreamer->GetSymbolOrder(A.Sym) : 0; - unsigned IB = B.Sym ? Asm->OutStreamer->GetSymbolOrder(B.Sym) : 0; + unsigned IA = A.Sym ? Asm->OutStreamer->getSymbolOrder(A.Sym) : 0; + unsigned IB = B.Sym ? Asm->OutStreamer->getSymbolOrder(B.Sym) : 0; // Symbols with no order assigned should be placed at the end. // (e.g. section end labels) @@ -2987,7 +2984,7 @@ void DwarfDebug::emitDebugARanges() { } // Start the dwarf aranges section. - Asm->OutStreamer->SwitchSection( + Asm->OutStreamer->switchSection( Asm->getObjFileLowering().getDwarfARangesSection()); unsigned PtrSize = Asm->MAI->getCodePointerSize(); @@ -3045,15 +3042,22 @@ void DwarfDebug::emitDebugARanges() { for (const ArangeSpan &Span : List) { Asm->emitLabelReference(Span.Start, PtrSize); - // Calculate the size as being from the span start to it's end. - if (Span.End) { + // Calculate the size as being from the span start to its end. + // + // If the size is zero, then round it up to one byte. The DWARF + // specification requires that entries in this table have nonzero + // lengths. + auto SizeRef = SymSize.find(Span.Start); + if ((SizeRef == SymSize.end() || SizeRef->second != 0) && Span.End) { Asm->emitLabelDifference(Span.End, Span.Start, PtrSize); } else { // For symbols without an end marker (e.g. common), we // write a single arange entry containing just that one symbol. 
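// Aside: the span-size selection below reduces to "emit the label difference
// when the symbol has a known nonzero size, otherwise clamp to one byte",
// because DWARF forbids zero-length .debug_aranges entries. The same decision
// restated as a standalone helper (plain C++ sketch; `KnownSizes` stands in
// for the SymSize map used by this code):
#include <cstdint>
#include <map>
#include <string>

// Returns the length in bytes to emit for one arange entry.
uint64_t arangeEntrySize(const std::map<std::string, uint64_t> &KnownSizes,
                         const std::string &Sym) {
  auto It = KnownSizes.find(Sym);
  if (It == KnownSizes.end() || It->second == 0)
    return 1; // unknown or zero-sized: round up so the entry stays legal
  return It->second;
}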
- uint64_t Size = SymSize[Span.Start]; - if (Size == 0) + uint64_t Size; + if (SizeRef == SymSize.end() || SizeRef->second == 0) Size = 1; + else + Size = SizeRef->second; Asm->OutStreamer->emitIntValue(Size, PtrSize); } @@ -3087,7 +3091,7 @@ void DwarfDebug::emitDebugRangesImpl(const DwarfFile &Holder, MCSection *Section return !Pair.second->getCUNode()->isDebugDirectivesOnly(); })); - Asm->OutStreamer->SwitchSection(Section); + Asm->OutStreamer->switchSection(Section); MCSymbol *TableEnd = nullptr; if (getDwarfVersion() >= 5) @@ -3239,7 +3243,7 @@ void DwarfDebug::emitDebugMacinfoImpl(MCSection *Section) { DIMacroNodeArray Macros = CUNode->getMacros(); if (Macros.empty()) continue; - Asm->OutStreamer->SwitchSection(Section); + Asm->OutStreamer->switchSection(Section); Asm->OutStreamer->emitLabel(U.getMacroLabelBegin()); if (UseDebugMacroSection) emitMacroHeader(Asm, *this, U, getDwarfVersion()); @@ -3447,22 +3451,6 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, CU.addDIETypeSignature(RefDie, Signature); } -DwarfDebug::NonTypeUnitContext::NonTypeUnitContext(DwarfDebug *DD) - : DD(DD), - TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), AddrPoolUsed(DD->AddrPool.hasBeenUsed()) { - DD->TypeUnitsUnderConstruction.clear(); - DD->AddrPool.resetUsedFlag(); -} - -DwarfDebug::NonTypeUnitContext::~NonTypeUnitContext() { - DD->TypeUnitsUnderConstruction = std::move(TypeUnitsUnderConstruction); - DD->AddrPool.resetUsedFlag(AddrPoolUsed); -} - -DwarfDebug::NonTypeUnitContext DwarfDebug::enterNonTypeUnitContext() { - return NonTypeUnitContext(this); -} - // Add the Name along with its companion DIE to the appropriate accelerator // table (for AccelTableKind::Dwarf it's always AccelDebugNames, for // AccelTableKind::Apple, we use the table we got as an argument). If @@ -3555,6 +3543,6 @@ Optional DwarfDebug::getMD5AsBytes(const DIFile *File) const { // An MD5 checksum is 16 bytes. std::string ChecksumString = fromHex(Checksum->Value); MD5::MD5Result CKMem; - std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.Bytes.data()); + std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.data()); return CKMem; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 4e1a1b1e068d..31e4081b7141 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -14,14 +14,13 @@ #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H #include "AddressPool.h" -#include "DebugLocStream.h" #include "DebugLocEntry.h" +#include "DebugLocStream.h" #include "DwarfFile.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -31,7 +30,6 @@ #include "llvm/CodeGen/AccelTable.h" #include "llvm/CodeGen/DbgEntityHistoryCalculator.h" #include "llvm/CodeGen/DebugHandlerBase.h" -#include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" @@ -80,7 +78,7 @@ private: public: DbgEntity(const DINode *N, const DILocation *IA, DbgEntityKind ID) : Entity(N), InlinedAt(IA), SubclassID(ID) {} - virtual ~DbgEntity() {} + virtual ~DbgEntity() = default; /// Accessors. 
/// @{ @@ -667,19 +665,6 @@ public: void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier, DIE &Die, const DICompositeType *CTy); - class NonTypeUnitContext { - DwarfDebug *DD; - decltype(DwarfDebug::TypeUnitsUnderConstruction) TypeUnitsUnderConstruction; - bool AddrPoolUsed; - friend class DwarfDebug; - NonTypeUnitContext(DwarfDebug *DD); - public: - NonTypeUnitContext(NonTypeUnitContext&&) = default; - ~NonTypeUnitContext(); - }; - - NonTypeUnitContext enterNonTypeUnitContext(); - /// Add a label so that arange data can be generated for it. void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index fe438102ee98..1c21d5ee8bb1 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -329,7 +329,16 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, return false; } - assert(DwarfRegs.size() == 1); + // TODO: We should not give up here but the following code needs to be changed + // to deal with multiple (sub)registers first. + if (DwarfRegs.size() > 1) { + LLVM_DEBUG(dbgs() << "TODO: giving up on debug information due to " + "multi-register usage.\n"); + DwarfRegs.clear(); + LocationKind = Unknown; + return false; + } + auto Reg = DwarfRegs[0]; bool FBReg = isFrameRegister(TRI, MachineReg); int SignedOffset = 0; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp index a67d0f032cf6..a497aa07284e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -12,9 +12,7 @@ #include "DwarfUnit.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Metadata.h" #include "llvm/MC/MCStreamer.h" -#include #include using namespace llvm; @@ -47,7 +45,7 @@ void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) { if (llvm::empty(TheU->getUnitDie().values())) return; - Asm->OutStreamer->SwitchSection(S); + Asm->OutStreamer->switchSection(S); TheU->emitHeader(UseOffsets); Asm->emitDwarfDIE(TheU->getUnitDie()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index a876f8ccace9..67b72f0b455d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp @@ -39,7 +39,7 @@ DwarfStringPool::getEntryImpl(AsmPrinter &Asm, StringRef Str) { DwarfStringPool::EntryRef DwarfStringPool::getEntry(AsmPrinter &Asm, StringRef Str) { auto &MapEntry = getEntryImpl(Asm, Str); - return EntryRef(MapEntry, false); + return EntryRef(MapEntry); } DwarfStringPool::EntryRef DwarfStringPool::getIndexedEntry(AsmPrinter &Asm, @@ -47,7 +47,7 @@ DwarfStringPool::EntryRef DwarfStringPool::getIndexedEntry(AsmPrinter &Asm, auto &MapEntry = getEntryImpl(Asm, Str); if (!MapEntry.getValue().isIndexed()) MapEntry.getValue().Index = NumIndexedStrings++; - return EntryRef(MapEntry, true); + return EntryRef(MapEntry); } void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm, @@ -55,7 +55,7 @@ void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm, MCSymbol *StartSym) { if (getNumIndexedStrings() == 0) return; - Asm.OutStreamer->SwitchSection(Section); + Asm.OutStreamer->switchSection(Section); unsigned EntrySize = Asm.getDwarfOffsetByteSize(); // We are emitting the header for a contribution to the string offsets // table. 
The header consists of an entry with the contribution's @@ -78,7 +78,7 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection, return; // Start the dwarf str section. - Asm.OutStreamer->SwitchSection(StrSection); + Asm.OutStreamer->switchSection(StrSection); // Get all of the string pool entries and sort them by their offset. SmallVector *, 64> Entries; @@ -117,7 +117,7 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection, Entries[Entry.getValue().Index] = &Entry; } - Asm.OutStreamer->SwitchSection(OffsetSection); + Asm.OutStreamer->switchSection(OffsetSection); unsigned size = Asm.getDwarfOffsetByteSize(); for (const auto &Entry : Entries) if (UseRelativeOffsets) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 5a2bd479f277..81238b0fe0d2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -17,12 +17,8 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/None.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" @@ -32,9 +28,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include #include @@ -380,6 +374,8 @@ void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, CU = getUnitDie().getUnit(); if (!EntryCU) EntryCU = getUnitDie().getUnit(); + assert(EntryCU == CU || !DD->useSplitDwarf() || DD->shareAcrossDWOCUs() || + !static_cast(CU)->isDwoUnit()); addAttribute(Die, Attribute, EntryCU == CU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr, Entry); @@ -596,10 +592,8 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE, // Skip updating the accelerator tables since this is not the full type. if (MDString *TypeId = CTy->getRawIdentifier()) DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy); - else { - auto X = DD->enterNonTypeUnitContext(); + else finishNonUnitTypeDIE(TyDIE, CTy); - } return &TyDIE; } constructTypeDIE(TyDIE, CTy); @@ -805,7 +799,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // or reference types. 
if (DTy->getDWARFAddressSpace()) addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4, - DTy->getDWARFAddressSpace().getValue()); + *DTy->getDWARFAddressSpace()); } void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { @@ -1350,6 +1344,9 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (SP->isRecursive()) addFlag(SPDie, dwarf::DW_AT_recursive); + if (!SP->getTargetFuncName().empty()) + addString(SPDie, dwarf::DW_AT_trampoline, SP->getTargetFuncName()); + if (DD->getDwarfVersion() >= 5 && SP->isDeleted()) addFlag(SPDie, dwarf::DW_AT_deleted); } @@ -1442,7 +1439,8 @@ DIE *DwarfUnit::getIndexTyDie() { addString(*IndexTyDie, dwarf::DW_AT_name, Name); addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t)); addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, - dwarf::DW_ATE_unsigned); + dwarf::getArrayIndexTypeEncoding( + (dwarf::SourceLanguage)getLanguage())); DD->addAccelType(*CUNode, Name, *IndexTyDie, /*Flags*/ 0); return IndexTyDie; } @@ -1847,11 +1845,5 @@ void DwarfUnit::addRnglistsBase() { } void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) { - addFlag(D, dwarf::DW_AT_declaration); - StringRef Name = CTy->getName(); - if (!Name.empty()) - addString(D, dwarf::DW_AT_name, Name); - if (Name.startswith("_STN") || !Name.contains('<')) - addTemplateParams(D, CTy->getTemplateParams()); - getCU().createTypeDIE(CTy); + DD->getAddressPool().resetUsedFlag(true); } diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 39f40b172c1b..31644959bdca 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -458,7 +457,7 @@ MCSymbol *EHStreamer::emitExceptionTable() { // Sometimes we want not to emit the data into separate section (e.g. ARM // EHABI). In this case LSDASection will be NULL. if (LSDASection) - Asm->OutStreamer->SwitchSection(LSDASection); + Asm->OutStreamer->switchSection(LSDASection); Asm->emitAlignment(Align(4)); // Emit the LSDA. @@ -806,7 +805,7 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) { // Emit the Catch TypeInfos. if (VerboseAsm && !TypeInfos.empty()) { Asm->OutStreamer->AddComment(">> Catch TypeInfos <<"); - Asm->OutStreamer->AddBlankLine(); + Asm->OutStreamer->addBlankLine(); Entry = TypeInfos.size(); } @@ -821,7 +820,7 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) { // Emit the Exception Specifications. 
if (VerboseAsm && !FilterIds.empty()) { Asm->OutStreamer->AddComment(">> Filter TypeInfos <<"); - Asm->OutStreamer->AddBlankLine(); + Asm->OutStreamer->addBlankLine(); Entry = 0; } for (std::vector::const_iterator diff --git a/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 70777f07fc6c..62fd15d89512 100644 --- a/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -23,7 +23,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; @@ -46,9 +45,8 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, unsigned IntPtrSize = M.getDataLayout().getPointerSize(); // Put this in a custom .note section. - OS.SwitchSection( - AP.getObjFileLowering().getContext().getELFSection(".note.gc", - ELF::SHT_PROGBITS, 0)); + OS.switchSection(AP.getObjFileLowering().getContext().getELFSection( + ".note.gc", ELF::SHT_PROGBITS, 0)); // For each function... for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(), diff --git a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 3ade262d9af2..74fa30ab321b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -72,10 +72,10 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) { void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { - AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection()); + AP.OutStreamer->switchSection(AP.getObjFileLowering().getTextSection()); EmitCamlGlobal(M, AP, "code_begin"); - AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection()); + AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection()); EmitCamlGlobal(M, AP, "data_begin"); } @@ -99,16 +99,16 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { unsigned IntPtrSize = M.getDataLayout().getPointerSize(); - AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection()); + AP.OutStreamer->switchSection(AP.getObjFileLowering().getTextSection()); EmitCamlGlobal(M, AP, "code_end"); - AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection()); + AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection()); EmitCamlGlobal(M, AP, "data_end"); // FIXME: Why does ocaml emit this?? 
AP.OutStreamer->emitIntValue(0, IntPtrSize); - AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection()); + AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection()); EmitCamlGlobal(M, AP, "frametable"); int NumDescriptors = 0; @@ -147,7 +147,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.OutStreamer->AddComment("live roots for " + Twine(FI->getFunction().getName())); - AP.OutStreamer->AddBlankLine(); + AP.OutStreamer->addBlankLine(); for (GCFunctionInfo::iterator J = FI->begin(), JE = FI->end(); J != JE; ++J) { diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp index bab187f46535..135eabc34838 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp @@ -13,7 +13,7 @@ #include "PseudoProbePrinter.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/MC/MCPseudoProbe.h" #include "llvm/MC/MCStreamer.h" diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp index a17a2ca2790e..a514ff161cee 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "WasmException.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.h b/llvm/lib/CodeGen/AsmPrinter/WasmException.h index f06de786bd76..2abbe37cb6d9 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WasmException.h +++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.h @@ -15,9 +15,12 @@ #define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H #include "EHStreamer.h" -#include "llvm/CodeGen/AsmPrinter.h" namespace llvm { +class AsmPrinter; +class MachineFunction; +struct LandingPadInfo; +template class SmallVectorImpl; class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer { public: diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp index ad8432343a60..5d813b72c0b7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp @@ -15,11 +15,8 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Metadata.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" @@ -29,7 +26,7 @@ using namespace llvm; WinCFGuard::WinCFGuard(AsmPrinter *A) : Asm(A) {} -WinCFGuard::~WinCFGuard() {} +WinCFGuard::~WinCFGuard() = default; void WinCFGuard::endFunction(const MachineFunction *MF) { @@ -110,19 +107,19 @@ void WinCFGuard::endModule() { // Emit the symbol index of each GFIDs entry to form the .gfids section. 
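// Aside: the three Control Flow Guard tables emitted below (.gfids, .giats,
// .gljmp) all follow one recipe: switch to the target section, then emit a
// COFF symbol-table index per entry. A sketch of that recipe factored into a
// helper, using the lowercase MCStreamer API this patch migrates to
// (`emitGuardTable` itself is illustrative, not part of the patch):
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include <vector>

static void emitGuardTable(llvm::MCStreamer &OS, llvm::MCSection *Sec,
                           const std::vector<const llvm::MCSymbol *> &Syms) {
  OS.switchSection(Sec);
  // Each entry is a symbol index that the linker resolves when it builds
  // the final guard tables.
  for (const llvm::MCSymbol *S : Syms)
    OS.emitCOFFSymbolIndex(S);
}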
auto &OS = *Asm->OutStreamer; - OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGFIDsSection()); + OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGFIDsSection()); for (const MCSymbol *S : GFIDsEntries) - OS.EmitCOFFSymbolIndex(S); + OS.emitCOFFSymbolIndex(S); // Emit the symbol index of each GIATs entry to form the .giats section. - OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGIATsSection()); + OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGIATsSection()); for (const MCSymbol *S : GIATsEntries) { - OS.EmitCOFFSymbolIndex(S); + OS.emitCOFFSymbolIndex(S); } // Emit the symbol index of each longjmp target to form the .gljmp section. - OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGLJMPSection()); + OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGLJMPSection()); for (const MCSymbol *S : LongjmpTargets) { - OS.EmitCOFFSymbolIndex(S); + OS.emitCOFFSymbolIndex(S); } } diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index ef57031c7294..c3ca9c92bf71 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -23,19 +23,13 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" using namespace llvm; WinException::WinException(AsmPrinter *A) : EHStreamer(A) { @@ -46,7 +40,7 @@ WinException::WinException(AsmPrinter *A) : EHStreamer(A) { isThumb = Asm->TM.getTargetTriple().isThumb(); } -WinException::~WinException() {} +WinException::~WinException() = default; /// endModule - Emit all exception information that should come after the /// content. @@ -55,13 +49,13 @@ void WinException::endModule() { const Module *M = MMI->getModule(); for (const Function &F : *M) if (F.hasFnAttribute("safeseh")) - OS.EmitCOFFSafeSEH(Asm->getSymbol(&F)); + OS.emitCOFFSafeSEH(Asm->getSymbol(&F)); if (M->getModuleFlag("ehcontguard") && !EHContTargets.empty()) { // Emit the symbol index of each ehcont target. - OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGEHContSection()); + OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGEHContSection()); for (const MCSymbol *S : EHContTargets) { - OS.EmitCOFFSymbolIndex(S); + OS.emitCOFFSymbolIndex(S); } } } @@ -122,7 +116,7 @@ void WinException::beginFunction(const MachineFunction *MF) { void WinException::markFunctionEnd() { if (isAArch64 && CurrentFuncletEntry && (shouldEmitMoves || shouldEmitPersonality)) - Asm->OutStreamer->EmitWinCFIFuncletOrFuncEnd(); + Asm->OutStreamer->emitWinCFIFuncletOrFuncEnd(); } /// endFunction - Gather and emit post-function exception information. @@ -151,12 +145,12 @@ void WinException::endFunction(const MachineFunction *MF) { return; if (shouldEmitPersonality || shouldEmitLSDA) { - Asm->OutStreamer->PushSection(); + Asm->OutStreamer->pushSection(); // Just switch sections to the right xdata section. 
MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection( Asm->OutStreamer->getCurrentSectionOnly()); - Asm->OutStreamer->SwitchSection(XData); + Asm->OutStreamer->switchSection(XData); // Emit the tables appropriate to the personality function in use. If we // don't recognize the personality, assume it uses an Itanium-style LSDA. @@ -171,7 +165,7 @@ void WinException::endFunction(const MachineFunction *MF) { else emitExceptionTable(); - Asm->OutStreamer->PopSection(); + Asm->OutStreamer->popSection(); } if (!MF->getCatchretTargets().empty()) { @@ -211,11 +205,11 @@ void WinException::beginFunclet(const MachineBasicBlock &MBB, Sym = getMCSymbolForMBB(Asm, &MBB); // Describe our funclet symbol as a function with internal linkage. - Asm->OutStreamer->BeginCOFFSymbolDef(Sym); - Asm->OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - Asm->OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + Asm->OutStreamer->beginCOFFSymbolDef(Sym); + Asm->OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + Asm->OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT); - Asm->OutStreamer->EndCOFFSymbolDef(); + Asm->OutStreamer->endCOFFSymbolDef(); // We want our funclet's entry point to be aligned such that no nops will be // present after the label. @@ -229,7 +223,7 @@ void WinException::beginFunclet(const MachineBasicBlock &MBB, // Mark 'Sym' as starting our funclet. if (shouldEmitMoves || shouldEmitPersonality) { CurrentFuncletTextSection = Asm->OutStreamer->getCurrentSectionOnly(); - Asm->OutStreamer->EmitWinCFIStartProc(Sym); + Asm->OutStreamer->emitWinCFIStartProc(Sym); } if (shouldEmitPersonality) { @@ -248,15 +242,15 @@ void WinException::beginFunclet(const MachineBasicBlock &MBB, // inliner doesn't allow inlining them, this isn't a major problem in // practice. if (!CurrentFuncletEntry->isCleanupFuncletEntry()) - Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true); + Asm->OutStreamer->emitWinEHHandler(PersHandlerSym, true, true); } } void WinException::endFunclet() { if (isAArch64 && CurrentFuncletEntry && (shouldEmitMoves || shouldEmitPersonality)) { - Asm->OutStreamer->SwitchSection(CurrentFuncletTextSection); - Asm->OutStreamer->EmitWinCFIFuncletOrFuncEnd(); + Asm->OutStreamer->switchSection(CurrentFuncletTextSection); + Asm->OutStreamer->emitWinCFIFuncletOrFuncEnd(); } endFuncletImpl(); } @@ -276,7 +270,7 @@ void WinException::endFuncletImpl() { if (Per == EHPersonality::MSVC_CXX && shouldEmitPersonality && !CurrentFuncletEntry->isCleanupFuncletEntry()) { // Emit an UNWIND_INFO struct describing the prologue. - Asm->OutStreamer->EmitWinEHHandlerData(); + Asm->OutStreamer->emitWinEHHandlerData(); // If this is a C++ catch funclet (or the parent function), // emit a reference to the LSDA for the parent function. @@ -287,14 +281,14 @@ void WinException::endFuncletImpl() { } else if (Per == EHPersonality::MSVC_TableSEH && MF->hasEHFunclets() && !CurrentFuncletEntry->isEHFuncletEntry()) { // Emit an UNWIND_INFO struct describing the prologue. - Asm->OutStreamer->EmitWinEHHandlerData(); + Asm->OutStreamer->emitWinEHHandlerData(); // If this is the parent function in Win64 SEH, emit the LSDA immediately // following .seh_handlerdata. emitCSpecificHandlerTable(MF); } else if (shouldEmitPersonality || shouldEmitLSDA) { // Emit an UNWIND_INFO struct describing the prologue. 
- Asm->OutStreamer->EmitWinEHHandlerData(); + Asm->OutStreamer->emitWinEHHandlerData(); // In these cases, no further info is written to the .xdata section // right here, but is written by e.g. emitExceptionTable in endFunction() // above. @@ -307,8 +301,8 @@ void WinException::endFuncletImpl() { // Switch back to the funclet start .text section now that we are done // writing to .xdata, and emit an .seh_endproc directive to mark the end of // the function. - Asm->OutStreamer->SwitchSection(CurrentFuncletTextSection); - Asm->OutStreamer->EmitWinCFIEndProc(); + Asm->OutStreamer->switchSection(CurrentFuncletTextSection); + Asm->OutStreamer->emitWinCFIEndProc(); } // Let's make sure we don't try to end the same funclet twice. @@ -699,7 +693,12 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { } int UnwindHelpOffset = 0; - if (Asm->MAI->usesWindowsCFI()) + // TODO: The check for UnwindHelpFrameIdx against max() below (and the + // second check further below) can be removed if MS C++ unwinding is + // implemented for ARM, when test/CodeGen/ARM/Windows/wineh-basic.ll + // passes without the check. + if (Asm->MAI->usesWindowsCFI() && + FuncInfo.UnwindHelpFrameIdx != std::numeric_limits::max()) UnwindHelpOffset = getFrameIndexOffset(FuncInfo.UnwindHelpFrameIdx, FuncInfo); @@ -761,7 +760,8 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { AddComment("IPToStateXData"); OS.emitValue(create32bitRef(IPToStateXData), 4); - if (Asm->MAI->usesWindowsCFI()) { + if (Asm->MAI->usesWindowsCFI() && + FuncInfo.UnwindHelpFrameIdx != std::numeric_limits::max()) { AddComment("UnwindHelp"); OS.emitInt32(UnwindHelpOffset); } diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 4838f6da750d..5ce6fbb5f647 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -15,7 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" @@ -47,6 +47,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/LowerAtomic.h" #include #include #include @@ -57,71 +58,72 @@ using namespace llvm; namespace { - class AtomicExpand: public FunctionPass { - const TargetLowering *TLI = nullptr; +class AtomicExpand : public FunctionPass { + const TargetLowering *TLI = nullptr; - public: - static char ID; // Pass identification, replacement for typeid +public: + static char ID; // Pass identification, replacement for typeid - AtomicExpand() : FunctionPass(ID) { - initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); - } + AtomicExpand() : FunctionPass(ID) { + initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; - - private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); - IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); - LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); - bool tryExpandAtomicLoad(LoadInst *LI); - bool expandAtomicLoadToLL(LoadInst *LI); - bool expandAtomicLoadToCmpXchg(LoadInst *LI); - StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); - bool expandAtomicStore(StoreInst *SI); - bool 
tryExpandAtomicRMW(AtomicRMWInst *AI); - AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI); - Value * - insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, - Align AddrAlign, AtomicOrdering MemOpOrder, - function_ref &, Value *)> PerformOp); - void expandAtomicOpToLLSC( - Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign, - AtomicOrdering MemOpOrder, - function_ref &, Value *)> PerformOp); - void expandPartwordAtomicRMW( - AtomicRMWInst *I, - TargetLoweringBase::AtomicExpansionKind ExpansionKind); - AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI); - bool expandPartwordCmpXchg(AtomicCmpXchgInst *I); - void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI); - void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI); - - AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); - static Value *insertRMWCmpXchgLoop( - IRBuilder<> &Builder, Type *ResultType, Value *Addr, Align AddrAlign, - AtomicOrdering MemOpOrder, SyncScope::ID SSID, - function_ref &, Value *)> PerformOp, - CreateCmpXchgInstFun CreateCmpXchg); - bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); - - bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); - bool isIdempotentRMW(AtomicRMWInst *RMWI); - bool simplifyIdempotentRMW(AtomicRMWInst *RMWI); - - bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment, - Value *PointerOperand, Value *ValueOperand, - Value *CASExpected, AtomicOrdering Ordering, - AtomicOrdering Ordering2, - ArrayRef Libcalls); - void expandAtomicLoadToLibcall(LoadInst *LI); - void expandAtomicStoreToLibcall(StoreInst *LI); - void expandAtomicRMWToLibcall(AtomicRMWInst *I); - void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); - - friend bool - llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg); - }; + bool runOnFunction(Function &F) override; + +private: + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); + IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); + LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); + bool tryExpandAtomicLoad(LoadInst *LI); + bool expandAtomicLoadToLL(LoadInst *LI); + bool expandAtomicLoadToCmpXchg(LoadInst *LI); + StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); + bool tryExpandAtomicStore(StoreInst *SI); + void expandAtomicStore(StoreInst *SI); + bool tryExpandAtomicRMW(AtomicRMWInst *AI); + AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI); + Value * + insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, + Align AddrAlign, AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp); + void + expandAtomicOpToLLSC(Instruction *I, Type *ResultTy, Value *Addr, + Align AddrAlign, AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp); + void expandPartwordAtomicRMW( + AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind); + AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI); + bool expandPartwordCmpXchg(AtomicCmpXchgInst *I); + void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI); + void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI); + + AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); + static Value * + insertRMWCmpXchgLoop(IRBuilder<> &Builder, Type *ResultType, Value *Addr, + Align AddrAlign, AtomicOrdering MemOpOrder, + SyncScope::ID SSID, + function_ref &, Value *)> PerformOp, + CreateCmpXchgInstFun CreateCmpXchg); + bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); + + bool 
expandAtomicCmpXchg(AtomicCmpXchgInst *CI); + bool isIdempotentRMW(AtomicRMWInst *RMWI); + bool simplifyIdempotentRMW(AtomicRMWInst *RMWI); + + bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment, + Value *PointerOperand, Value *ValueOperand, + Value *CASExpected, AtomicOrdering Ordering, + AtomicOrdering Ordering2, + ArrayRef Libcalls); + void expandAtomicLoadToLibcall(LoadInst *LI); + void expandAtomicStoreToLibcall(StoreInst *LI); + void expandAtomicRMWToLibcall(AtomicRMWInst *I); + void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); + + friend bool + llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg); +}; } // end anonymous namespace @@ -129,8 +131,8 @@ char AtomicExpand::ID = 0; char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", - false, false) +INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, + false) FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } @@ -252,7 +254,8 @@ bool AtomicExpand::runOnFunction(Function &F) { } if (LI) { - if (LI->getType()->isFloatingPointTy()) { + if (TLI->shouldCastAtomicLoadInIR(LI) == + TargetLoweringBase::AtomicExpansionKind::CastToInteger) { // TODO: add a TLI hook to control this so that each target can // convert to lowering the original type one at a time. LI = convertAtomicLoadToIntegerType(LI); @@ -262,7 +265,8 @@ bool AtomicExpand::runOnFunction(Function &F) { MadeChange |= tryExpandAtomicLoad(LI); } else if (SI) { - if (SI->getValueOperand()->getType()->isFloatingPointTy()) { + if (TLI->shouldCastAtomicStoreInIR(SI) == + TargetLoweringBase::AtomicExpansionKind::CastToInteger) { // TODO: add a TLI hook to control this so that each target can // convert to lowering the original type one at a time. SI = convertAtomicStoreToIntegerType(SI); @@ -271,8 +275,8 @@ bool AtomicExpand::runOnFunction(Function &F) { MadeChange = true; } - if (TLI->shouldExpandAtomicStoreInIR(SI)) - MadeChange |= expandAtomicStore(SI); + if (tryExpandAtomicStore(SI)) + MadeChange = true; } else if (RMWI) { // There are two different ways of expanding RMW instructions: // - into a load if it is idempotent @@ -283,8 +287,8 @@ bool AtomicExpand::runOnFunction(Function &F) { MadeChange = true; } else { AtomicRMWInst::BinOp Op = RMWI->getOperation(); - if (Op == AtomicRMWInst::Xchg && - RMWI->getValOperand()->getType()->isFloatingPointTy()) { + if (TLI->shouldCastAtomicRMWIInIR(RMWI) == + TargetLoweringBase::AtomicExpansionKind::CastToInteger) { // TODO: add a TLI hook to control this so that each target can // convert to lowering the original type one at a time. RMWI = convertAtomicXchgToIntegerType(RMWI); @@ -308,7 +312,7 @@ bool AtomicExpand::runOnFunction(Function &F) { // extend convertCmpXchgToInteger for floating point too. assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() && "unimplemented - floating point not legal at IR level"); - if (CASI->getCompareOperand()->getType()->isPointerTy() ) { + if (CASI->getCompareOperand()->getType()->isPointerTy()) { // TODO: add a TLI hook to control this so that each target can // convert to lowering the original type one at a time. CASI = convertCmpXchgToIntegerType(CASI); @@ -351,14 +355,12 @@ IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, /// convertAtomicStoreToIntegerType for background. 
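// Aside: a sketch of what the CastToInteger expansion used above produces at
// the IR level; names and exact instruction order are illustrative only, the
// real output comes from convertAtomicLoadToIntegerType below:
//
//   %v = load atomic float, float* %p seq_cst, align 4
//
// is rewritten to
//
//   %p.int = bitcast float* %p to i32*
//   %v.int = load atomic i32, i32* %p.int seq_cst, align 4
//   %v     = bitcast i32 %v.int to float
//
// so backends only ever have to select integer-typed atomic loads.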
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { auto *M = LI->getModule(); - Type *NewTy = getCorrespondingIntegerType(LI->getType(), - M->getDataLayout()); + Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout()); IRBuilder<> Builder(LI); Value *Addr = LI->getPointerOperand(); - Type *PT = PointerType::get(NewTy, - Addr->getType()->getPointerAddressSpace()); + Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace()); Value *NewAddr = Builder.CreateBitCast(Addr, PT); auto *NewLI = Builder.CreateLoad(NewTy, NewAddr); @@ -385,7 +387,9 @@ AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { Value *Val = RMWI->getValOperand(); Type *PT = PointerType::get(NewTy, RMWI->getPointerAddressSpace()); Value *NewAddr = Builder.CreateBitCast(Addr, PT); - Value *NewVal = Builder.CreateBitCast(Val, NewTy); + Value *NewVal = Val->getType()->isPointerTy() + ? Builder.CreatePtrToInt(Val, NewTy) + : Builder.CreateBitCast(Val, NewTy); auto *NewRMWI = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, NewAddr, NewVal, @@ -393,7 +397,9 @@ AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { NewRMWI->setVolatile(RMWI->isVolatile()); LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n"); - Value *NewRVal = Builder.CreateBitCast(NewRMWI, RMWI->getType()); + Value *NewRVal = RMWI->getType()->isPointerTy() + ? Builder.CreateIntToPtr(NewRMWI, RMWI->getType()) + : Builder.CreateBitCast(NewRMWI, RMWI->getType()); RMWI->replaceAllUsesWith(NewRVal); RMWI->eraseFromParent(); return NewRMWI; @@ -413,11 +419,29 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { return expandAtomicLoadToLL(LI); case TargetLoweringBase::AtomicExpansionKind::CmpXChg: return expandAtomicLoadToCmpXchg(LI); + case TargetLoweringBase::AtomicExpansionKind::NotAtomic: + LI->setAtomic(AtomicOrdering::NotAtomic); + return true; default: llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); } } +bool AtomicExpand::tryExpandAtomicStore(StoreInst *SI) { + switch (TLI->shouldExpandAtomicStoreInIR(SI)) { + case TargetLoweringBase::AtomicExpansionKind::None: + return false; + case TargetLoweringBase::AtomicExpansionKind::Expand: + expandAtomicStore(SI); + return true; + case TargetLoweringBase::AtomicExpansionKind::NotAtomic: + SI->setAtomic(AtomicOrdering::NotAtomic); + return true; + default: + llvm_unreachable("Unhandled case in tryExpandAtomicStore"); + } +} + bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { IRBuilder<> Builder(LI); @@ -471,8 +495,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy); Value *Addr = SI->getPointerOperand(); - Type *PT = PointerType::get(NewTy, - Addr->getType()->getPointerAddressSpace()); + Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace()); Value *NewAddr = Builder.CreateBitCast(Addr, PT); StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); @@ -484,7 +507,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { return NewSI; } -bool AtomicExpand::expandAtomicStore(StoreInst *SI) { +void AtomicExpand::expandAtomicStore(StoreInst *SI) { // This function is only called on atomic stores that are too large to be // atomic if implemented as a native store. 
So we replace them by an // atomic swap, that can be implemented for example as a ldrex/strex on ARM @@ -498,7 +521,7 @@ bool AtomicExpand::expandAtomicStore(StoreInst *SI) { SI->eraseFromParent(); // Now we have an appropriate swap instruction, lower it as usual. - return tryExpandAtomicRMW(AI); + tryExpandAtomicRMW(AI); } static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, @@ -508,6 +531,7 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, Type *OrigTy = NewVal->getType(); // This code can go away when cmpxchg supports FP types. + assert(!OrigTy->isPointerTy()); bool NeedBitcast = OrigTy->isFloatingPointTy(); if (NeedBitcast) { IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits()); @@ -527,47 +551,6 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy); } -/// Emit IR to implement the given atomicrmw operation on values in registers, -/// returning the new value. -static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, - Value *Loaded, Value *Inc) { - Value *NewVal; - switch (Op) { - case AtomicRMWInst::Xchg: - return Inc; - case AtomicRMWInst::Add: - return Builder.CreateAdd(Loaded, Inc, "new"); - case AtomicRMWInst::Sub: - return Builder.CreateSub(Loaded, Inc, "new"); - case AtomicRMWInst::And: - return Builder.CreateAnd(Loaded, Inc, "new"); - case AtomicRMWInst::Nand: - return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); - case AtomicRMWInst::Or: - return Builder.CreateOr(Loaded, Inc, "new"); - case AtomicRMWInst::Xor: - return Builder.CreateXor(Loaded, Inc, "new"); - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::FAdd: - return Builder.CreateFAdd(Loaded, Inc, "new"); - case AtomicRMWInst::FSub: - return Builder.CreateFSub(Loaded, Inc, "new"); - default: - llvm_unreachable("Unknown atomic op"); - } -} - bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { LLVMContext &Ctx = AI->getModule()->getContext(); TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); @@ -582,8 +565,8 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { TargetLoweringBase::AtomicExpansionKind::LLSC); } else { auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) { - return performAtomicOp(AI->getOperation(), Builder, Loaded, - AI->getValOperand()); + return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded, + AI->getValOperand()); }; expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(), AI->getAlign(), AI->getOrdering(), PerformOp); @@ -621,6 +604,12 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandAtomicRMWToMaskedIntrinsic(AI); return true; } + case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: { + TLI->emitBitTestAtomicRMWIntrinsic(AI); + return true; + } + case TargetLoweringBase::AtomicExpansionKind::NotAtomic: + return lowerAtomicRMWInst(AI); default: llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); } @@ -703,7 +692,7 @@ static PartwordMaskValues 
createMaskInstrs(IRBuilder<> &Builder, Instruction *I, PMV.AlignedAddr = Addr; PMV.AlignedAddrAlignment = AddrAlign; PMV.ShiftAmt = ConstantInt::get(PMV.ValueType, 0); - PMV.Mask = ConstantInt::get(PMV.ValueType, ~0); + PMV.Mask = ConstantInt::get(PMV.ValueType, ~0, /*isSigned*/ true); return PMV; } @@ -787,7 +776,7 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, case AtomicRMWInst::Sub: case AtomicRMWInst::Nand: { // The other arithmetic ops need to be masked into place. - Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc); + Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Shifted_Inc); Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask); Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked); @@ -801,7 +790,7 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, // truncate down to the original size, and expand out again after // doing the operation. Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV); - Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc); + Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded_Extract, Inc); Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV); return FinalVal; } @@ -840,9 +829,8 @@ void AtomicExpand::expandPartwordAtomicRMW( Value *OldResult; if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, - PMV.AlignedAddrAlignment, MemOpOrder, - SSID, PerformPartwordOp, - createCmpXchgInstFun); + PMV.AlignedAddrAlignment, MemOpOrder, SSID, + PerformPartwordOp, createCmpXchgInstFun); } else { assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, @@ -1106,7 +1094,7 @@ Value *AtomicExpand::insertRMWLLSCLoop( // [...] BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place). @@ -1135,7 +1123,8 @@ Value *AtomicExpand::insertRMWLLSCLoop( /// IR. As a migration step, we convert back to what use to be the standard /// way to represent a pointer cmpxchg so that we can update backends one by /// one. 
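// Aside: the partword expansions earlier in this file (createMaskInstrs and
// performMaskedAtomicOp) all rest on the same mask arithmetic: align the
// address down to a word, compute the value's bit offset, and build the mask
// pair used to splice the narrow value into its containing word. A standalone
// sketch of that arithmetic for a little-endian target with a 4-byte word
// (plain C++; the field names mirror the pass but the helper is illustrative):
#include <cstdint>

struct PartwordMask {
  uintptr_t AlignedAddr; // address of the containing 32-bit word
  unsigned ShiftAmt;     // bit offset of the narrow value within the word
  uint32_t Mask;         // bits covered by the narrow value
  uint32_t InvMask;      // every other bit of the word
};

PartwordMask computeMask(uintptr_t Addr, unsigned ValueBytes) {
  PartwordMask PMV;
  PMV.AlignedAddr = Addr & ~uintptr_t(3); // round down to the word
  PMV.ShiftAmt = unsigned(Addr & 3) * 8;  // little-endian bit offset
  uint32_t Narrow = ValueBytes == 4 ? ~0u : ((1u << (ValueBytes * 8)) - 1);
  PMV.Mask = Narrow << PMV.ShiftAmt;
  PMV.InvMask = ~PMV.Mask;
  return PMV;
}
// E.g. a 2-byte value at Addr % 4 == 2 gives ShiftAmt == 16 and
// Mask == 0xFFFF0000, so NewWord = (Old & InvMask) | ((V << 16) & Mask).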
-AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { +AtomicCmpXchgInst * +AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { auto *M = CI->getModule(); Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(), M->getDataLayout()); @@ -1143,8 +1132,7 @@ AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst * IRBuilder<> Builder(CI); Value *Addr = CI->getPointerOperand(); - Type *PT = PointerType::get(NewTy, - Addr->getType()->getPointerAddressSpace()); + Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace()); Value *NewAddr = Builder.CreateBitCast(Addr, PT); Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy); @@ -1305,9 +1293,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB); Value *NewValueInsert = insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV); - Value *StoreSuccess = - TLI->emitStoreConditional(Builder, NewValueInsert, PMV.AlignedAddr, - MemOpOrder); + Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewValueInsert, + PMV.AlignedAddr, MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; @@ -1418,27 +1405,27 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; } -bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { +bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) { auto C = dyn_cast(RMWI->getValOperand()); - if(!C) + if (!C) return false; AtomicRMWInst::BinOp Op = RMWI->getOperation(); - switch(Op) { - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - case AtomicRMWInst::Or: - case AtomicRMWInst::Xor: - return C->isZero(); - case AtomicRMWInst::And: - return C->isMinusOne(); - // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... - default: - return false; + switch (Op) { + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + return C->isZero(); + case AtomicRMWInst::And: + return C->isMinusOne(); + // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... 
+ default: + return false; } } -bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { +bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) { if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { tryExpandAtomicLoad(ResultingLoad); return true; @@ -1524,6 +1511,8 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: expandAtomicCmpXchgToMaskedIntrinsic(CI); return true; + case TargetLoweringBase::AtomicExpansionKind::NotAtomic: + return lowerAtomicCmpXchgInst(CI); } } @@ -1535,8 +1524,8 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(), AI->getOrdering(), AI->getSyncScopeID(), [&](IRBuilder<> &Builder, Value *Loaded) { - return performAtomicOp(AI->getOperation(), Builder, Loaded, - AI->getValOperand()); + return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded, + AI->getValOperand()); }, CreateCmpXchg); @@ -1738,11 +1727,21 @@ bool AtomicExpand::expandAtomicOpToLibcall( RTLIB::Libcall RTLibType; if (UseSizedLibcall) { switch (Size) { - case 1: RTLibType = Libcalls[1]; break; - case 2: RTLibType = Libcalls[2]; break; - case 4: RTLibType = Libcalls[3]; break; - case 8: RTLibType = Libcalls[4]; break; - case 16: RTLibType = Libcalls[5]; break; + case 1: + RTLibType = Libcalls[1]; + break; + case 2: + RTLibType = Libcalls[2]; + break; + case 4: + RTLibType = Libcalls[3]; + break; + case 8: + RTLibType = Libcalls[4]; + break; + case 16: + RTLibType = Libcalls[5]; + break; } } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) { RTLibType = Libcalls[0]; @@ -1806,8 +1805,8 @@ bool AtomicExpand::expandAtomicOpToLibcall( // that property, we'd need to extend this mechanism to support AS-specific // families of atomic intrinsics. 
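// Aside: the sized-libcall table above indexes Libcalls[1..5] for sizes 1, 2,
// 4, 8 and 16 bytes, with Libcalls[0] holding the generic, size-parameterized
// fallback. The same selection over the __atomic_load family of libatomic
// entry points (a sketch; the real names come from RuntimeLibcalls):
#include <cstdint>

const char *atomicLoadLibcallName(uint64_t Size) {
  switch (Size) {
  case 1:  return "__atomic_load_1";
  case 2:  return "__atomic_load_2";
  case 4:  return "__atomic_load_4";
  case 8:  return "__atomic_load_8";
  case 16: return "__atomic_load_16";
  default: // generic form: __atomic_load(size, ptr, ret, order)
    return "__atomic_load";
  }
}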
auto PtrTypeAS = PointerOperand->getType()->getPointerAddressSpace(); - Value *PtrVal = Builder.CreateBitCast(PointerOperand, - Type::getInt8PtrTy(Ctx, PtrTypeAS)); + Value *PtrVal = + Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx, PtrTypeAS)); PtrVal = Builder.CreateAddrSpaceCast(PtrVal, Type::getInt8PtrTy(Ctx)); Args.push_back(PtrVal); @@ -1815,11 +1814,10 @@ bool AtomicExpand::expandAtomicOpToLibcall( if (CASExpected) { AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); AllocaCASExpected->setAlignment(AllocaAlignment); - unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace(); + unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace(); - AllocaCASExpected_i8 = - Builder.CreateBitCast(AllocaCASExpected, - Type::getInt8PtrTy(Ctx, AllocaAS)); + AllocaCASExpected_i8 = Builder.CreateBitCast( + AllocaCASExpected, Type::getInt8PtrTy(Ctx, AllocaAS)); Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64); Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment); Args.push_back(AllocaCASExpected_i8); @@ -1846,9 +1844,9 @@ bool AtomicExpand::expandAtomicOpToLibcall( if (!CASExpected && HasResult && !UseSizedLibcall) { AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); AllocaResult->setAlignment(AllocaAlignment); - unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace(); + unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace(); AllocaResult_i8 = - Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS)); + Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS)); Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64); Args.push_back(AllocaResult_i8); } diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index c1901bc46d72..f05f5b9f9947 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -60,7 +60,7 @@ // Basic Block Labels // ================== // -// With -fbasic-block-sections=labels, we emit the offsets of BB addresses of +// With -fbasic-block-sections=labels, we encode the offsets of BB addresses of // every function into the .llvm_bb_addr_map section. Along with the function // symbols, this allows for mapping of virtual addresses in PMU profiles back to // the corresponding basic blocks. This logic is implemented in AsmPrinter. This @@ -69,26 +69,17 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Optional.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/BasicBlockSectionUtils.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/LineIterator.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Target/TargetMachine.h" -using llvm::SmallSet; -using llvm::SmallVector; -using llvm::StringMap; -using llvm::StringRef; using namespace llvm; // Placing the cold clusters in a separate section mitigates against poor @@ -108,41 +99,11 @@ cl::opt BBSectionsDetectSourceDrift( namespace { -// This struct represents the cluster information for a machine basic block. 
-struct BBClusterInfo { - // MachineBasicBlock ID. - unsigned MBBNumber; - // Cluster ID this basic block belongs to. - unsigned ClusterID; - // Position of basic block within the cluster. - unsigned PositionInCluster; -}; - -using ProgramBBClusterInfoMapTy = StringMap<SmallVector<BBClusterInfo>>; - class BasicBlockSections : public MachineFunctionPass { public: static char ID; - // This contains the basic-block-sections profile. - const MemoryBuffer *MBuf = nullptr; - - // This encapsulates the BB cluster information for the whole program. - // - // For every function name, it contains the cluster information for (all or - // some of) its basic blocks. The cluster information for every basic block - // includes its cluster ID along with the position of the basic block in that - // cluster. - ProgramBBClusterInfoMapTy ProgramBBClusterInfo; - - // Some functions have alias names. We use this map to find the main alias - // name for which we have mapping in ProgramBBClusterInfo. - StringMap<StringRef> FuncAliasMap; - - BasicBlockSections(const MemoryBuffer *Buf) - : MachineFunctionPass(ID), MBuf(Buf) { - initializeBasicBlockSectionsPass(*PassRegistry::getPassRegistry()); - }; + BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; BasicBlockSections() : MachineFunctionPass(ID) { initializeBasicBlockSectionsPass(*PassRegistry::getPassRegistry()); @@ -154,9 +115,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; - /// Read profiles of basic blocks if available here. - bool doInitialization(Module &M) override; - /// Identify basic blocks that need separate sections and prepare to emit them /// accordingly. bool runOnMachineFunction(MachineFunction &MF) override; @@ -206,21 +164,18 @@ static void updateBranches( // This function provides the BBCluster information associated with a function. // Returns true if a valid association exists and false otherwise. -static bool getBBClusterInfoForFunction( - const MachineFunction &MF, const StringMap<StringRef> FuncAliasMap, - const ProgramBBClusterInfoMapTy &ProgramBBClusterInfo, +bool getBBClusterInfoForFunction( + const MachineFunction &MF, + BasicBlockSectionsProfileReader *BBSectionsProfileReader, std::vector<Optional<BBClusterInfo>> &V) { - // Get the main alias name for the function. - auto FuncName = MF.getName(); - auto R = FuncAliasMap.find(FuncName); - StringRef AliasName = R == FuncAliasMap.end() ? FuncName : R->second; // Find the associated cluster information. - auto P = ProgramBBClusterInfo.find(AliasName); - if (P == ProgramBBClusterInfo.end()) + std::pair<bool, SmallVector<BBClusterInfo>> P = + BBSectionsProfileReader->getBBClusterInfoForFunction(MF.getName()); + if (!P.first) return false; - if (P->second.empty()) { + if (P.second.empty()) { // This indicates that sections are desired for all basic blocks of this // function. We clear the BBClusterInfo vector to denote this. V.clear(); @@ -228,7 +183,7 @@ static bool getBBClusterInfoForFunction( V.resize(MF.getNumBlockIDs()); - for (auto bbClusterInfo : P->second) { + for (auto bbClusterInfo : P.second) { // Bail out if the cluster information contains invalid MBB numbers. if (bbClusterInfo.MBBNumber >= MF.getNumBlockIDs()) return false; @@ -266,7 +221,7 @@ assignSections(MachineFunction &MF, // set every basic block's section ID equal to its number (basic block // id). This further ensures that basic blocks are ordered canonically.
MBB.setSectionID({static_cast(MBB.getNumber())}); - } else if (FuncBBClusterInfo[MBB.getNumber()].hasValue()) + } else if (FuncBBClusterInfo[MBB.getNumber()]) MBB.setSectionID(FuncBBClusterInfo[MBB.getNumber()]->ClusterID); else { // BB goes into the special cold section if it is not specified in the @@ -279,9 +234,8 @@ assignSections(MachineFunction &MF, // If we already have one cluster containing eh_pads, this must be updated // to ExceptionSectionID. Otherwise, we set it equal to the current // section ID. - EHPadsSectionID = EHPadsSectionID.hasValue() - ? MBBSectionID::ExceptionSectionID - : MBB.getSectionID(); + EHPadsSectionID = EHPadsSectionID ? MBBSectionID::ExceptionSectionID + : MBB.getSectionID(); } } @@ -290,7 +244,7 @@ assignSections(MachineFunction &MF, if (EHPadsSectionID == MBBSectionID::ExceptionSectionID) for (auto &MBB : MF) if (MBB.isEHPad()) - MBB.setSectionID(EHPadsSectionID.getValue()); + MBB.setSectionID(*EHPadsSectionID); } void llvm::sortBasicBlocksAndUpdateBranches( @@ -377,9 +331,11 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { return true; } + BBSectionsProfileReader = &getAnalysis(); + std::vector> FuncBBClusterInfo; if (BBSectionsType == BasicBlockSection::List && - !getBBClusterInfoForFunction(MF, FuncAliasMap, ProgramBBClusterInfo, + !getBBClusterInfoForFunction(MF, BBSectionsProfileReader, FuncBBClusterInfo)) return true; MF.setBBSectionsType(BBSectionsType); @@ -427,107 +383,12 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { return true; } -// Basic Block Sections can be enabled for a subset of machine basic blocks. -// This is done by passing a file containing names of functions for which basic -// block sections are desired. Additionally, machine basic block ids of the -// functions can also be specified for a finer granularity. Moreover, a cluster -// of basic blocks could be assigned to the same section. -// A file with basic block sections for all of function main and three blocks -// for function foo (of which 1 and 2 are placed in a cluster) looks like this: -// ---------------------------- -// list.txt: -// !main -// !foo -// !!1 2 -// !!4 -static Error getBBClusterInfo(const MemoryBuffer *MBuf, - ProgramBBClusterInfoMapTy &ProgramBBClusterInfo, - StringMap &FuncAliasMap) { - assert(MBuf); - line_iterator LineIt(*MBuf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'); - - auto invalidProfileError = [&](auto Message) { - return make_error( - Twine("Invalid profile " + MBuf->getBufferIdentifier() + " at line " + - Twine(LineIt.line_number()) + ": " + Message), - inconvertibleErrorCode()); - }; - - auto FI = ProgramBBClusterInfo.end(); - - // Current cluster ID corresponding to this function. - unsigned CurrentCluster = 0; - // Current position in the current cluster. - unsigned CurrentPosition = 0; - - // Temporary set to ensure every basic block ID appears once in the clusters - // of a function. - SmallSet FuncBBIDs; - - for (; !LineIt.is_at_eof(); ++LineIt) { - StringRef S(*LineIt); - if (S[0] == '@') - continue; - // Check for the leading "!" - if (!S.consume_front("!") || S.empty()) - break; - // Check for second "!" which indicates a cluster of basic blocks. - if (S.consume_front("!")) { - if (FI == ProgramBBClusterInfo.end()) - return invalidProfileError( - "Cluster list does not follow a function name specifier."); - SmallVector BBIndexes; - S.split(BBIndexes, ' '); - // Reset current cluster position. 
- CurrentPosition = 0; - for (auto BBIndexStr : BBIndexes) { - unsigned long long BBIndex; - if (getAsUnsignedInteger(BBIndexStr, 10, BBIndex)) - return invalidProfileError(Twine("Unsigned integer expected: '") + - BBIndexStr + "'."); - if (!FuncBBIDs.insert(BBIndex).second) - return invalidProfileError(Twine("Duplicate basic block id found '") + - BBIndexStr + "'."); - if (!BBIndex && CurrentPosition) - return invalidProfileError("Entry BB (0) does not begin a cluster."); - - FI->second.emplace_back(BBClusterInfo{ - ((unsigned)BBIndex), CurrentCluster, CurrentPosition++}); - } - CurrentCluster++; - } else { // This is a function name specifier. - // Function aliases are separated using '/'. We use the first function - // name for the cluster info mapping and delegate all other aliases to - // this one. - SmallVector Aliases; - S.split(Aliases, '/'); - for (size_t i = 1; i < Aliases.size(); ++i) - FuncAliasMap.try_emplace(Aliases[i], Aliases.front()); - - // Prepare for parsing clusters of this function name. - // Start a new cluster map for this function name. - FI = ProgramBBClusterInfo.try_emplace(Aliases.front()).first; - CurrentCluster = 0; - FuncBBIDs.clear(); - } - } - return Error::success(); -} - -bool BasicBlockSections::doInitialization(Module &M) { - if (!MBuf) - return false; - if (auto Err = getBBClusterInfo(MBuf, ProgramBBClusterInfo, FuncAliasMap)) - report_fatal_error(std::move(Err)); - return false; -} - void BasicBlockSections::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -MachineFunctionPass * -llvm::createBasicBlockSectionsPass(const MemoryBuffer *Buf) { - return new BasicBlockSections(Buf); +MachineFunctionPass *llvm::createBasicBlockSectionsPass() { + return new BasicBlockSections(); } diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp new file mode 100644 index 000000000000..c2acf115998b --- /dev/null +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -0,0 +1,144 @@ +//===-- BasicBlockSectionsProfileReader.cpp -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the basic block sections profile reader pass. It parses +// and stores the basic block sections profile file (which is specified via the +// `-basic-block-sections` flag). 
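// Editor's usage note (flag spellings per the clang and llc documentation,
// not restated in this patch): the profile is typically supplied as
//   clang -O2 -fbasic-block-sections=list=profile.txt foo.cc
// or, when driving the backend directly,
//   llc -basic-block-sections=profile.txt foo.ll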
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; + +char BasicBlockSectionsProfileReader::ID = 0; +INITIALIZE_PASS(BasicBlockSectionsProfileReader, "bbsections-profile-reader", + "Reads and parses a basic block sections profile.", false, + false) + +bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { + return getBBClusterInfoForFunction(FuncName).first; +} + +std::pair> +BasicBlockSectionsProfileReader::getBBClusterInfoForFunction( + StringRef FuncName) const { + std::pair> cluster_info(false, {}); + auto R = ProgramBBClusterInfo.find(getAliasName(FuncName)); + if (R != ProgramBBClusterInfo.end()) { + cluster_info.second = R->second; + cluster_info.first = true; + } + return cluster_info; +} + +// Basic Block Sections can be enabled for a subset of machine basic blocks. +// This is done by passing a file containing names of functions for which basic +// block sections are desired. Additionally, machine basic block ids of the +// functions can also be specified for a finer granularity. Moreover, a cluster +// of basic blocks could be assigned to the same section. +// A file with basic block sections for all of function main and three blocks +// for function foo (of which 1 and 2 are placed in a cluster) looks like this: +// ---------------------------- +// list.txt: +// !main +// !foo +// !!1 2 +// !!4 +static Error getBBClusterInfo(const MemoryBuffer *MBuf, + ProgramBBClusterInfoMapTy &ProgramBBClusterInfo, + StringMap &FuncAliasMap) { + assert(MBuf); + line_iterator LineIt(*MBuf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'); + + auto invalidProfileError = [&](auto Message) { + return make_error( + Twine("Invalid profile " + MBuf->getBufferIdentifier() + " at line " + + Twine(LineIt.line_number()) + ": " + Message), + inconvertibleErrorCode()); + }; + + auto FI = ProgramBBClusterInfo.end(); + + // Current cluster ID corresponding to this function. + unsigned CurrentCluster = 0; + // Current position in the current cluster. + unsigned CurrentPosition = 0; + + // Temporary set to ensure every basic block ID appears once in the clusters + // of a function. + SmallSet FuncBBIDs; + + for (; !LineIt.is_at_eof(); ++LineIt) { + StringRef S(*LineIt); + if (S[0] == '@') + continue; + // Check for the leading "!" + if (!S.consume_front("!") || S.empty()) + break; + // Check for second "!" which indicates a cluster of basic blocks. + if (S.consume_front("!")) { + if (FI == ProgramBBClusterInfo.end()) + return invalidProfileError( + "Cluster list does not follow a function name specifier."); + SmallVector BBIndexes; + S.split(BBIndexes, ' '); + // Reset current cluster position. 
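// Editor's worked example (derived from the parse loop below): the list.txt
// shown above maps "foo" to the {MBBNumber, ClusterID, PositionInCluster}
// triples {1, 0, 0} and {2, 0, 1} from "!!1 2", and {4, 1, 0} from "!!4".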
+ CurrentPosition = 0; + for (auto BBIndexStr : BBIndexes) { + unsigned long long BBIndex; + if (getAsUnsignedInteger(BBIndexStr, 10, BBIndex)) + return invalidProfileError(Twine("Unsigned integer expected: '") + + BBIndexStr + "'."); + if (!FuncBBIDs.insert(BBIndex).second) + return invalidProfileError(Twine("Duplicate basic block id found '") + + BBIndexStr + "'."); + if (!BBIndex && CurrentPosition) + return invalidProfileError("Entry BB (0) does not begin a cluster."); + + FI->second.emplace_back(BBClusterInfo{ + ((unsigned)BBIndex), CurrentCluster, CurrentPosition++}); + } + CurrentCluster++; + } else { // This is a function name specifier. + // Function aliases are separated using '/'. We use the first function + // name for the cluster info mapping and delegate all other aliases to + // this one. + SmallVector Aliases; + S.split(Aliases, '/'); + for (size_t i = 1; i < Aliases.size(); ++i) + FuncAliasMap.try_emplace(Aliases[i], Aliases.front()); + + // Prepare for parsing clusters of this function name. + // Start a new cluster map for this function name. + FI = ProgramBBClusterInfo.try_emplace(Aliases.front()).first; + CurrentCluster = 0; + FuncBBIDs.clear(); + } + } + return Error::success(); +} + +void BasicBlockSectionsProfileReader::initializePass() { + if (!MBuf) + return; + if (auto Err = getBBClusterInfo(MBuf, ProgramBBClusterInfo, FuncAliasMap)) + report_fatal_error(std::move(Err)); +} + +ImmutablePass * +llvm::createBasicBlockSectionsProfileReaderPass(const MemoryBuffer *Buf) { + return new BasicBlockSectionsProfileReader(Buf); +} diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 0ff67f7ca00a..07be03d2dab9 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,11 +33,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSizeOpts.h" -#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -105,6 +104,11 @@ namespace { AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } }; } // end anonymous namespace diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h index 95d5dcfbbd0f..d0b6ed5ebe05 100644 --- a/llvm/lib/CodeGen/BranchFolding.h +++ b/llvm/lib/CodeGen/BranchFolding.h @@ -14,7 +14,6 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/Support/Compiler.h" -#include #include namespace llvm { diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp index eda0f37fdeb7..29508f8f35a6 100644 --- a/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -24,7 +24,6 @@ #include "llvm/Support/Compiler.h" #include
"llvm/Support/Debug.h" #include "llvm/Support/Format.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 558700bd9b3b..57170c58db14 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp @@ -19,11 +19,13 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegister.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp new file mode 100644 index 000000000000..837dbd77d073 --- /dev/null +++ b/llvm/lib/CodeGen/CFIFixup.cpp @@ -0,0 +1,225 @@ +//===------ CFIFixup.cpp - Insert CFI remember/restore instructions -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// This pass inserts the necessary instructions to adjust for the inconsistency +// of the call-frame information caused by final machine basic block layout. +// The pass relies on constraints LLVM imposes on the placement of +// save/restore points (cf. ShrinkWrap): +// * there is a single basic block, containing the function prologue +// * possibly multiple epilogue blocks, where each epilogue block is +// complete and self-contained, i.e. CSR restore instructions (and the +// corresponding CFI instructions) are not split across two or more blocks. +// * prologue and epilogue blocks are outside of any loops +// Thus, during execution, at the beginning and at the end of each basic block +// the function can be in one of two states: +// - "has a call frame", if the function has executed the prologue, and +// has not executed any epilogue +// - "does not have a call frame", if the function has not executed the +// prologue, or has executed an epilogue +// which can be computed by a single RPO traversal. + +// In order to accommodate backends which do not generate unwind info in +// epilogues we compute an additional property "strong no call frame on entry", +// which is set for the entry point of the function and for every block +// reachable from the entry along a path that does not execute the prologue. If +// this property holds, it takes precedence over the "has a call frame" +// property. + +// From the point of view of the unwind tables, the "has/does not have call +// frame" state at the beginning of each block is determined by the state at the end +// of the previous block, in layout order. Where these states differ, we insert +// compensating CFI instructions, which come in two flavours: + +// - CFI instructions, which reset the unwind table state to the initial one. +// This is done by a target specific hook and is expected to be trivial +// to implement, for example it could be: +// .cfi_def_cfa <sp>, 0 +// .cfi_same_value <r1> +// .cfi_same_value <r2> +// ... +// where <r1>, <r2>, ... are the callee-saved registers. +// - CFI instructions, which reset the unwind table state to the one +// created by the function prologue.
These are +// .cfi_restore_state +// .cfi_remember_state +// In this case we also insert a `.cfi_remember_state` after the last CFI +// instruction in the function prologue. +// +// Known limitations: +// * the pass cannot handle an epilogue preceding the prologue in the basic +// block layout +// * the pass does not handle functions where SP is used as a frame pointer and +// SP adjustments up and down are done in different basic blocks (TODO) +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/CFIFixup.h" + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "cfi-fixup" + +char CFIFixup::ID = 0; + +INITIALIZE_PASS(CFIFixup, "cfi-fixup", + "Insert CFI remember/restore state instructions", false, false) +FunctionPass *llvm::createCFIFixup() { return new CFIFixup(); } + +static bool isPrologueCFIInstruction(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::CFI_INSTRUCTION && + MI.getFlag(MachineInstr::FrameSetup); +} + +static bool containsPrologue(const MachineBasicBlock &MBB) { + return llvm::any_of(MBB.instrs(), isPrologueCFIInstruction); +} + +static bool containsEpilogue(const MachineBasicBlock &MBB) { + return llvm::any_of(llvm::reverse(MBB), [](const auto &MI) { + return MI.getOpcode() == TargetOpcode::CFI_INSTRUCTION && + MI.getFlag(MachineInstr::FrameDestroy); + }); +} + +bool CFIFixup::runOnMachineFunction(MachineFunction &MF) { + const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering(); + if (!TFL.enableCFIFixup(MF)) + return false; + + const unsigned NumBlocks = MF.getNumBlockIDs(); + if (NumBlocks < 2) + return false; + + struct BlockFlags { + bool Reachable : 1; + bool StrongNoFrameOnEntry : 1; + bool HasFrameOnEntry : 1; + bool HasFrameOnExit : 1; + }; + SmallVector<BlockFlags> BlockInfo(NumBlocks, {false, false, false, false}); + BlockInfo[0].Reachable = true; + BlockInfo[0].StrongNoFrameOnEntry = true; + + // Compute the presence/absence of frame at each basic block. + MachineBasicBlock *PrologueBlock = nullptr; + ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin()); + for (MachineBasicBlock *MBB : RPOT) { + BlockFlags &Info = BlockInfo[MBB->getNumber()]; + + // Set to true if the current block contains the prologue or the epilogue, + // respectively. + bool HasPrologue = false; + bool HasEpilogue = false; + + if (!PrologueBlock && !Info.HasFrameOnEntry && containsPrologue(*MBB)) { + PrologueBlock = MBB; + HasPrologue = true; + } + + if (Info.HasFrameOnEntry || HasPrologue) + HasEpilogue = containsEpilogue(*MBB); + + // If the function has a call frame at the entry of the current block or the + // current block contains the prologue, then the function has a call frame + // at the exit of the block, unless the block contains the epilogue. + Info.HasFrameOnExit = (Info.HasFrameOnEntry || HasPrologue) && !HasEpilogue; + + // Set the successors' state on entry.
+ for (MachineBasicBlock *Succ : MBB->successors()) { + BlockFlags &SuccInfo = BlockInfo[Succ->getNumber()]; + SuccInfo.Reachable = true; + SuccInfo.StrongNoFrameOnEntry |= + Info.StrongNoFrameOnEntry && !HasPrologue; + SuccInfo.HasFrameOnEntry = Info.HasFrameOnExit; + } + } + + if (!PrologueBlock) + return false; + + // Walk the blocks of the function in "physical" order. + // Every block inherits the frame state (as recorded in the unwind tables) + // of the previous block. If the intended frame state is different, insert + // compensating CFI instructions. + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + bool Change = false; + // `InsertPt` always points to the point in a preceding block where we have to + // insert a `.cfi_remember_state`, in the case that the current block needs a + // `.cfi_restore_state`. + MachineBasicBlock *InsertMBB = PrologueBlock; + MachineBasicBlock::iterator InsertPt = PrologueBlock->begin(); + for (MachineInstr &MI : *PrologueBlock) + if (isPrologueCFIInstruction(MI)) + InsertPt = std::next(MI.getIterator()); + + assert(InsertPt != PrologueBlock->begin() && + "Inconsistent notion of \"prologue block\""); + + // No point starting before the prologue block. + // TODO: the unwind tables will still be incorrect if an epilogue physically + // precedes the prologue. + MachineFunction::iterator CurrBB = std::next(PrologueBlock->getIterator()); + bool HasFrame = BlockInfo[PrologueBlock->getNumber()].HasFrameOnExit; + while (CurrBB != MF.end()) { + const BlockFlags &Info = BlockInfo[CurrBB->getNumber()]; + if (!Info.Reachable) { + ++CurrBB; + continue; + } + +#ifndef NDEBUG + if (!Info.StrongNoFrameOnEntry) { + for (auto *Pred : CurrBB->predecessors()) { + BlockFlags &PredInfo = BlockInfo[Pred->getNumber()]; + assert((!PredInfo.Reachable || + Info.HasFrameOnEntry == PredInfo.HasFrameOnExit) && + "Inconsistent call frame state"); + } + } +#endif + if (!Info.StrongNoFrameOnEntry && Info.HasFrameOnEntry && !HasFrame) { + // Reset to the "after prologue" state. + + // Insert a `.cfi_remember_state` into the last block known to have a + // stack frame. + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr)); + BuildMI(*InsertMBB, InsertPt, DebugLoc(), + TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + // Insert a `.cfi_restore_state` at the beginning of the current block. + CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr)); + InsertPt = BuildMI(*CurrBB, CurrBB->begin(), DebugLoc(), + TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + ++InsertPt; + InsertMBB = &*CurrBB; + Change = true; + } else if ((Info.StrongNoFrameOnEntry || !Info.HasFrameOnEntry) && + HasFrame) { + // Reset to the state upon function entry.
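// Editor's illustration (not part of the patch): given the layout
//   [P: prologue][A][E: epilogue + return][B: frame expected on entry]
// the branch above plants `.cfi_remember_state` after P's last FrameSetup
// CFI and `.cfi_restore_state` at the start of B, because E exits with no
// frame; the branch below instead handles a block that must not have a
// frame (e.g. strong-no-frame on entry) laid out after a frame-carrying
// block, by re-emitting the function-entry CFI state: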
+ TFL.resetCFIToInitialState(*CurrBB); + Change = true; + } + + HasFrame = Info.HasFrameOnExit; + ++CurrBB; + } + + return Change; +} diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp index de173a9dfd62..42523c47a671 100644 --- a/llvm/lib/CodeGen/CFIInstrInserter.cpp +++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp @@ -19,16 +19,14 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCDwarf.h" using namespace llvm; static cl::opt VerifyCFI("verify-cfiinstrs", diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 84a0e4142bb6..689e49978d43 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -145,11 +145,6 @@ void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &LI) { LI.setWeight(Weight); } -float VirtRegAuxInfo::futureWeight(LiveInterval &LI, SlotIndex Start, - SlotIndex End) { - return weightCalcHelper(LI, &Start, &End); -} - float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, SlotIndex *End) { MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp index c9246f6e8754..f74ff30ab2e1 100644 --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -14,16 +14,14 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; @@ -72,15 +70,9 @@ bool CCState::IsShadowAllocatedReg(MCRegister Reg) const { if (!isAllocated(Reg)) return false; - for (auto const &ValAssign : Locs) { - if (ValAssign.isRegLoc()) { - for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true); - AI.isValid(); ++AI) { - if (*AI == Reg) - return false; - } - } - } + for (auto const &ValAssign : Locs) + if (ValAssign.isRegLoc() && TRI.regsOverlap(ValAssign.getLocReg(), Reg)) + return false; return true; } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 7c236a9785d8..5050395fbc0f 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -24,6 +24,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeBranchFolderPassPass(Registry); initializeBranchRelaxationPass(Registry); initializeCFGuardLongjmpPass(Registry); + initializeCFIFixupPass(Registry); initializeCFIInstrInserterPass(Registry); initializeCheckDebugMachineModulePass(Registry); initializeCodeGenPreparePass(Registry); @@ -50,6 +51,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeIndirectBrExpandPassPass(Registry); initializeInterleavedLoadCombinePass(Registry); 
initializeInterleavedAccessPass(Registry); + initializeJMCInstrumenterPass(Registry); initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); @@ -57,6 +59,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); + initializeLowerGlobalDtorsLegacyPassPass(Registry); initializeLowerIntrinsicsPass(Registry); initializeMIRAddFSDiscriminatorsPass(Registry); initializeMIRCanonicalizerPass(Registry); @@ -104,6 +107,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRemoveRedundantDebugValuesPass(Registry); initializeRenameIndependentSubregsPass(Registry); initializeSafeStackLegacyPassPass(Registry); + initializeSelectOptimizePass(Registry); initializeShadowStackGCLoweringPass(Registry); initializeShrinkWrapPass(Registry); initializeSjLjEHPreparePass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index 877aa69c3e58..8f185a161bd0 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -129,7 +129,9 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, MachineBasicBlock::iterator Start = BB->begin(); MachineBasicBlock::iterator Previous = SplitPoint; - --Previous; + do { + --Previous; + } while (Previous != Start && Previous->isDebugInstr()); if (TII.isTailCall(*SplitPoint) && Previous->getOpcode() == TII.getCallFrameDestroyOpcode()) { @@ -142,7 +144,7 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, // ADJCALLSTACKUP ... // TAILJMP somewhere // On the other hand, it could be an unrelated call in which case this tail - // call has to register moves of its own and should be the split point. For + // call has no register moves of its own and should be the split point. 
For // example: // ADJCALLSTACKDOWN // CALL something_else @@ -167,3 +169,31 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, return SplitPoint; } + +unsigned llvm::getInvertedFPClassTest(unsigned Test) { + unsigned InvertedTest = ~Test & fcAllFlags; + switch (InvertedTest) { + default: + break; + case fcNan: + case fcSNan: + case fcQNan: + case fcInf: + case fcPosInf: + case fcNegInf: + case fcNormal: + case fcPosNormal: + case fcNegNormal: + case fcSubnormal: + case fcPosSubnormal: + case fcNegSubnormal: + case fcZero: + case fcPosZero: + case fcNegZero: + case fcFinite: + case fcPosFinite: + case fcNegFinite: + return InvertedTest; + } + return 0; +} diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index c888adeafca5..6778af22f532 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -23,16 +23,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" @@ -174,12 +173,11 @@ static cl::opt DisablePreheaderProtect( cl::desc("Disable protection against removing loop preheaders")); static cl::opt ProfileGuidedSectionPrefix( - "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore, + "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::desc("Use profile info to add section prefix for hot/cold functions")); static cl::opt ProfileUnknownInSpecialSection( - "profile-unknown-in-special-section", cl::Hidden, cl::init(false), - cl::ZeroOrMore, + "profile-unknown-in-special-section", cl::Hidden, cl::desc("In profiling mode like sampleFDO, if a function doesn't have " "profile, we cannot tell the function is cold for sure because " "it may be a function newly added without ever being sampled. " @@ -188,6 +186,15 @@ static cl::opt ProfileUnknownInSpecialSection( "to handle it in a different way than .text section, to save " "RAM for example. ")); +static cl::opt BBSectionsGuidedSectionPrefix( + "bbsections-guided-section-prefix", cl::Hidden, cl::init(true), + cl::desc("Use the basic-block-sections profile to determine the text " + "section prefix for hot functions. Functions with " + "basic-block-sections profile will be placed in `.text.hot` " + "regardless of their FDO profile info. 
Other functions won't be " "impacted, i.e., their prefixes will be decided by FDO/sampleFDO " "profiles.")); + static cl::opt FreqRatioToSkipMerge( "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), cl::desc("Skip merging empty blocks if (frequency of empty block) / " @@ -274,6 +281,7 @@ class TypePromotionTransaction; const TargetLowering *TLI = nullptr; const TargetRegisterInfo *TRI; const TargetTransformInfo *TTI = nullptr; + const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; const TargetLibraryInfo *TLInfo; const LoopInfo *LI; std::unique_ptr<BlockFrequencyInfo> BFI; @@ -349,6 +357,7 @@ class TypePromotionTransaction; AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addUsedIfAvailable<BasicBlockSectionsProfileReader>(); } private: @@ -401,6 +410,8 @@ class TypePromotionTransaction; bool optimizeFunnelShift(IntrinsicInst *Fsh); bool optimizeSelectInst(SelectInst *SI); bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI); + bool optimizeSwitchType(SwitchInst *SI); + bool optimizeSwitchPhiConstants(SwitchInst *SI); bool optimizeSwitchInst(SwitchInst *SI); bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT); @@ -442,6 +453,7 @@ char CodeGenPrepare::ID = 0; INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) +INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -473,8 +485,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + BBSectionsProfileReader = + getAnalysisIfAvailable<BasicBlockSectionsProfileReader>(); OptSize = F.hasOptSize(); - if (ProfileGuidedSectionPrefix) { + // Use the basic-block-sections profile to promote hot functions to .text.hot if requested. + if (BBSectionsGuidedSectionPrefix && BBSectionsProfileReader && + BBSectionsProfileReader->isFunctionHot(F.getName())) { + F.setSectionPrefix("hot"); + } else if (ProfileGuidedSectionPrefix) { // The hot attribute overwrites profile count based hotness while profile // counts based hotness overwrites the cold attribute. // This is a conservative behavior. @@ -524,7 +542,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Split some critical edges where one of the sources is an indirect branch, // to help generate sane code for PHIs involving such edges. - EverMadeChange |= SplitIndirectBrCriticalEdges(F); + EverMadeChange |= + SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true); bool MadeChange = true; while (MadeChange) { @@ -2037,7 +2056,8 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return false; // Bail if the value is never zero. - if (llvm::isKnownNonZero(CountZeros->getOperand(0), *DL)) + Use &Op = CountZeros->getOperandUse(0); + if (isKnownNonZero(Op, *DL)) return false; // The intrinsic will be sunk behind a compare against zero and branch. @@ -2058,7 +2078,10 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, // Replace the unconditional branch that was created by the first split with // a compare against zero and a conditional branch. Value *Zero = Constant::getNullValue(Ty); - Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); + // Avoid introducing branch on poison. This also replaces the ctz operand.
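// Editor's note: per the LangRef, `freeze` pins an undef/poison input to
// some arbitrary but fixed value, so the icmp and conditional branch
// created below are well-defined even if the original operand was poison.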
+ if (!isGuaranteedNotToBeUndefOrPoison(Op)) + Op = Builder.CreateFreeze(Op, Op->getName() + ".fr"); + Value *Cmp = Builder.CreateICmpEQ(Op, Zero, "cmpz"); Builder.CreateCondBr(Cmp, EndBlock, CallBlock); StartBlock->getTerminator()->eraseFromParent(); @@ -2101,7 +2124,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // Align the pointer arguments to this call if the target thinks it's a good // idea - unsigned MinSize, PrefAlign; + unsigned MinSize; + Align PrefAlign; if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { for (auto &Arg : CI->args()) { // We want to align both objects whose address is used directly and @@ -2115,12 +2139,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { 0); Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); uint64_t Offset2 = Offset.getLimitedValue(); - if ((Offset2 & (PrefAlign-1)) != 0) + if (!isAligned(PrefAlign, Offset2)) continue; AllocaInst *AI; - if ((AI = dyn_cast(Val)) && AI->getAlignment() < PrefAlign && + if ((AI = dyn_cast(Val)) && AI->getAlign() < PrefAlign && DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) - AI->setAlignment(Align(PrefAlign)); + AI->setAlignment(PrefAlign); // Global variables can only be aligned if they are defined in this // object (i.e. they are uniquely initialized in this object), and // over-aligning global variables that have an explicit section is @@ -2130,7 +2154,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { GV->getPointerAlignment(*DL) < PrefAlign && DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2) - GV->setAlignment(MaybeAlign(PrefAlign)); + GV->setAlignment(PrefAlign); } // If this is a memcpy (or similar) then we may be able to improve the // alignment @@ -3371,7 +3395,7 @@ public: if (!Visited.insert(P).second) continue; if (auto *PI = dyn_cast(P)) - if (Value *V = SimplifyInstruction(cast(PI), SQ)) { + if (Value *V = simplifyInstruction(cast(PI), SQ)) { for (auto *U : PI->users()) WorkList.push_back(cast(U)); Put(PI, V); @@ -3416,7 +3440,7 @@ public: void destroyNewNodes(Type *CommonType) { // For safe erasing, replace the uses with dummy value first. - auto *Dummy = UndefValue::get(CommonType); + auto *Dummy = PoisonValue::get(CommonType); for (auto *I : AllPhiNodes) { I->replaceAllUsesWith(Dummy); I->eraseFromParent(); @@ -3785,7 +3809,7 @@ private: SmallVector Worklist; assert((isa(Original) || isa(Original)) && "Address must be a Phi or Select node"); - auto *Dummy = UndefValue::get(CommonType); + auto *Dummy = PoisonValue::get(CommonType); Worklist.push_back(Original); while (!Worklist.empty()) { Value *Current = Worklist.pop_back_val(); @@ -4550,9 +4574,9 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, ConstantInt *RHS = dyn_cast(AddrInst->getOperand(1)); if (!RHS || RHS->getBitWidth() > 64) return false; - int64_t Scale = RHS->getSExtValue(); - if (Opcode == Instruction::Shl) - Scale = 1LL << Scale; + int64_t Scale = Opcode == Instruction::Shl + ? 1LL << RHS->getLimitedValue(RHS->getBitWidth() - 1) + : RHS->getSExtValue(); return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); } @@ -4783,7 +4807,6 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { } // It isn't profitable to do this, roll back. 
- //cerr << "NOT FOLDING: " << *I; AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); TPT.rollback(LastKnownGood); @@ -4836,7 +4859,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TLI.ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory - // operand, we can't fold it! + // operand, we can't fold it! TODO: Also handle C_Address? if (OpInfo.CallOperandVal == OpVal && (OpInfo.ConstraintType != TargetLowering::C_Memory || !OpInfo.isIndirect)) @@ -5158,8 +5181,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // GEP, collect the GEP. Skip the GEPs that are the new bases of // previously split data structures. LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP); - if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end()) - LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size(); + LargeOffsetGEPID.insert(std::make_pair(GEP, LargeOffsetGEPID.size())); } NewAddrMode.OriginalValue = V; @@ -5323,11 +5345,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); - ResultPtr = - AddrMode.InBounds - ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, - "sunkaddr") - : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); + ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, + "sunkaddr", AddrMode.InBounds); } ResultIndex = V; @@ -5338,11 +5357,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); - SunkAddr = - AddrMode.InBounds - ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, - "sunkaddr") - : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); + SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr", + AddrMode.InBounds); } if (SunkAddr->getType() != Addr->getType()) @@ -5619,6 +5635,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); + // TODO: Also handle C_Address? 
if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect) { Value *OpVal = CS->getArgOperand(ArgNo++); @@ -6002,31 +6019,25 @@ bool CodeGenPrepare::optimizePhiType( for (Value *V : Phi->incoming_values()) { if (auto *OpPhi = dyn_cast(V)) { if (!PhiNodes.count(OpPhi)) { - if (Visited.count(OpPhi)) + if (!Visited.insert(OpPhi).second) return false; PhiNodes.insert(OpPhi); - Visited.insert(OpPhi); Worklist.push_back(OpPhi); } } else if (auto *OpLoad = dyn_cast(V)) { if (!OpLoad->isSimple()) return false; - if (!Defs.count(OpLoad)) { - Defs.insert(OpLoad); + if (Defs.insert(OpLoad).second) Worklist.push_back(OpLoad); - } } else if (auto *OpEx = dyn_cast(V)) { - if (!Defs.count(OpEx)) { - Defs.insert(OpEx); + if (Defs.insert(OpEx).second) Worklist.push_back(OpEx); - } } else if (auto *OpBC = dyn_cast(V)) { if (!ConvertTy) ConvertTy = OpBC->getOperand(0)->getType(); if (OpBC->getOperand(0)->getType() != ConvertTy) return false; - if (!Defs.count(OpBC)) { - Defs.insert(OpBC); + if (Defs.insert(OpBC).second) { Worklist.push_back(OpBC); AnyAnchored |= !isa(OpBC->getOperand(0)) && !isa(OpBC->getOperand(0)); @@ -6127,7 +6138,7 @@ bool CodeGenPrepare::optimizePhiTypes(Function &F) { // Remove any old phi's that have been converted. for (auto *I : DeletedInstrs) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } @@ -6979,12 +6990,12 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { return Changed; } -bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { +bool CodeGenPrepare::optimizeSwitchType(SwitchInst *SI) { Value *Cond = SI->getCondition(); Type *OldType = Cond->getType(); LLVMContext &Context = Cond->getContext(); EVT OldVT = TLI->getValueType(*DL, OldType); - MVT RegType = TLI->getRegisterType(Context, OldVT); + MVT RegType = TLI->getPreferredSwitchConditionType(Context, OldVT); unsigned RegWidth = RegType.getSizeInBits(); if (RegWidth <= cast(OldType)->getBitWidth()) @@ -7019,7 +7030,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { ExtInst->setDebugLoc(SI->getDebugLoc()); SI->setCondition(ExtInst); for (auto Case : SI->cases()) { - APInt NarrowConst = Case.getCaseValue()->getValue(); + const APInt &NarrowConst = Case.getCaseValue()->getValue(); APInt WideConst = (ExtType == Instruction::ZExt) ? NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); Case.setValue(ConstantInt::get(Context, WideConst)); @@ -7028,6 +7039,89 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { return true; } +bool CodeGenPrepare::optimizeSwitchPhiConstants(SwitchInst *SI) { + // The SCCP optimization tends to produce code like this: + // switch(x) { case 42: phi(42, ...) } + // Materializing the constant for the phi-argument needs instructions; So we + // change the code to: + // switch(x) { case 42: phi(x, ...) } + + Value *Condition = SI->getCondition(); + // Avoid endless loop in degenerate case. + if (isa(*Condition)) + return false; + + bool Changed = false; + BasicBlock *SwitchBB = SI->getParent(); + Type *ConditionType = Condition->getType(); + + for (const SwitchInst::CaseHandle &Case : SI->cases()) { + ConstantInt *CaseValue = Case.getCaseValue(); + BasicBlock *CaseBB = Case.getCaseSuccessor(); + // Set to true if we previously checked that `CaseBB` is only reached by + // a single case from this switch. 
+ bool CheckedForSinglePred = false; + for (PHINode &PHI : CaseBB->phis()) { + Type *PHIType = PHI.getType(); + // If ZExt is free then we can also catch patterns like this: + // switch((i32)x) { case 42: phi((i64)42, ...); } + // and replace `(i64)42` with `zext i32 %x to i64`. + bool TryZExt = + PHIType->isIntegerTy() && + PHIType->getIntegerBitWidth() > ConditionType->getIntegerBitWidth() && + TLI->isZExtFree(ConditionType, PHIType); + if (PHIType == ConditionType || TryZExt) { + // Set to true to skip this case because of multiple preds. + bool SkipCase = false; + Value *Replacement = nullptr; + for (unsigned I = 0, E = PHI.getNumIncomingValues(); I != E; I++) { + Value *PHIValue = PHI.getIncomingValue(I); + if (PHIValue != CaseValue) { + if (!TryZExt) + continue; + ConstantInt *PHIValueInt = dyn_cast(PHIValue); + if (!PHIValueInt || + PHIValueInt->getValue() != + CaseValue->getValue().zext(PHIType->getIntegerBitWidth())) + continue; + } + if (PHI.getIncomingBlock(I) != SwitchBB) + continue; + // We cannot optimize if there are multiple case labels jumping to + // this block. This check may get expensive when there are many + // case labels so we test for it last. + if (!CheckedForSinglePred) { + CheckedForSinglePred = true; + if (SI->findCaseDest(CaseBB) == nullptr) { + SkipCase = true; + break; + } + } + + if (Replacement == nullptr) { + if (PHIValue == CaseValue) { + Replacement = Condition; + } else { + IRBuilder<> Builder(SI); + Replacement = Builder.CreateZExt(Condition, PHIType); + } + } + PHI.setIncomingValue(I, Replacement); + Changed = true; + } + if (SkipCase) + break; + } + } + } + return Changed; +} + +bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { + bool Changed = optimizeSwitchType(SI); + Changed |= optimizeSwitchPhiConstants(SI); + return Changed; +} namespace { @@ -7777,7 +7871,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. 
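// Editor's example: a trivial PHI here is one that folds to an existing
// value, e.g. %p = phi i32 [ %x, %bb1 ], [ %x, %bb2 ], which
// simplifyInstruction replaces with %x.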
-  if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
+  if (Value *V = simplifyInstruction(P, {*DL, TLInfo})) {
     LargeOffsetGEPMap.erase(P);
     P->replaceAllUsesWith(V);
     P->eraseFromParent();
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 1d50e1d22b95..fd52191882cb 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -13,7 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Host.h"
@@ -58,6 +63,7 @@ CGOPT(bool, EnableUnsafeFPMath)
 CGOPT(bool, EnableNoInfsFPMath)
 CGOPT(bool, EnableNoNaNsFPMath)
 CGOPT(bool, EnableNoSignedZerosFPMath)
+CGOPT(bool, EnableApproxFuncFPMath)
 CGOPT(bool, EnableNoTrappingFPMath)
 CGOPT(bool, EnableAIXExtendedAltivecABI)
 CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
@@ -73,6 +79,7 @@ CGOPT(bool, StackSymbolOrdering)
 CGOPT(bool, StackRealign)
 CGOPT(std::string, TrapFuncName)
 CGOPT(bool, UseCtors)
+CGOPT(bool, LowerGlobalDtorsViaCxaAtExit)
 CGOPT(bool, RelaxELFRelocations)
 CGOPT_EXP(bool, DataSections)
 CGOPT_EXP(bool, FunctionSections)
@@ -94,6 +101,7 @@ CGOPT(bool, ForceDwarfFrameSection)
 CGOPT(bool, XRayOmitFunctionIndex)
 CGOPT(bool, DebugStrictDwarf)
 CGOPT(unsigned, AlignLoops)
+CGOPT(bool, JMCInstrument)
 
 codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
 #define CGBINDOPT(NAME) \
@@ -218,6 +226,12 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
       cl::init(false));
   CGBINDOPT(EnableNoSignedZerosFPMath);
 
+  static cl::opt<bool> EnableApproxFuncFPMath(
+      "enable-approx-func-fp-math",
+      cl::desc("Enable FP math optimizations that assume approx func"),
+      cl::init(false));
+  CGBINDOPT(EnableApproxFuncFPMath);
+
   static cl::opt<bool> EnableNoTrappingFPMath(
       "enable-no-trapping-fp-math",
       cl::desc("Enable setting the FP exceptions build "
@@ -333,6 +347,12 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
       cl::init(false));
   CGBINDOPT(UseCtors);
 
+  static cl::opt<bool> LowerGlobalDtorsViaCxaAtExit(
+      "lower-global-dtors-via-cxa-atexit",
+      cl::desc("Lower llvm.global_dtors (global destructors) via __cxa_atexit"),
+      cl::init(true));
+  CGBINDOPT(LowerGlobalDtorsViaCxaAtExit);
+
   static cl::opt<bool> RelaxELFRelocations(
       "relax-elf-relocations",
       cl::desc(
@@ -457,6 +477,12 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
       cl::desc("Default alignment for loops"));
   CGBINDOPT(AlignLoops);
 
+  static cl::opt<bool> JMCInstrument(
+      "enable-jmc-instrument",
+      cl::desc("Instrument functions with a call to __CheckForDebuggerJustMyCode"),
+      cl::init(false));
+  CGBINDOPT(JMCInstrument);
+
 #undef CGBINDOPT
 
   mc::RegisterMCTargetOptionsFlags();
@@ -493,6 +519,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   Options.NoInfsFPMath = getEnableNoInfsFPMath();
   Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
   Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
+  Options.ApproxFuncFPMath = getEnableApproxFuncFPMath();
   Options.NoTrappingFPMath = getEnableNoTrappingFPMath();
 
   DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
@@ -509,9 +536,10 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   Options.GuaranteedTailCallOpt = getEnableGuaranteedTailCallOpt();
   Options.StackSymbolOrdering = getStackSymbolOrdering();
   Options.UseInitArray = !getUseCtors();
+  Options.LowerGlobalDtorsViaCxaAtExit = getLowerGlobalDtorsViaCxaAtExit();
   Options.RelaxELFRelocations = getRelaxELFRelocations();
   Options.DataSections =
-      getExplicitDataSections().getValueOr(TheTriple.hasDefaultDataSections());
+      getExplicitDataSections().value_or(TheTriple.hasDefaultDataSections());
   Options.FunctionSections = getFunctionSections();
   Options.IgnoreXCOFFVisibility = getIgnoreXCOFFVisibility();
   Options.XCOFFTracebackTable = getXCOFFTracebackTable();
@@ -531,6 +559,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex();
   Options.DebugStrictDwarf = getDebugStrictDwarf();
   Options.LoopAlignment = getAlignLoops();
+  Options.JMCInstrument = getJMCInstrument();
 
   Options.MCOptions = mc::InitMCTargetOptionsFromFlags();
 
@@ -643,6 +672,7 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
   HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
   HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
   HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
+  HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math");
 
   if (DenormalFPMathView->getNumOccurrences() > 0 &&
       !F.hasFnAttribute("denormal-fp-math")) {
@@ -684,4 +714,3 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
   for (Function &F : M)
     setFunctionAttributes(CPU, Features, F);
 }
-
diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp
index d38bacdb1aa7..42192f41dbda 100644
--- a/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -30,10 +30,10 @@
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 5579152f1ce0..ce00be634e9a 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
diff --git a/llvm/lib/CodeGen/DetectDeadLanes.cpp b/llvm/lib/CodeGen/DetectDeadLanes.cpp
index 1337e57f360b..565c8b405f82 100644
--- a/llvm/lib/CodeGen/DetectDeadLanes.cpp
+++ b/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -28,12 +28,9 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include
@@ -93,7 +90,7 @@ private:
   LaneBitmask transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes,
                                 const MachineOperand &MO) const;
 
-  bool runOnce(MachineFunction &MF);
+  std::pair<bool, bool> runOnce(MachineFunction &MF);
 
   LaneBitmask determineInitialDefinedLanes(unsigned Reg);
   LaneBitmask determineInitialUsedLanes(unsigned Reg);
@@ -487,7 +484,7 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO,
   return true;
 }
 
-bool DetectDeadLanes::runOnce(MachineFunction &MF) {
+std::pair<bool, bool> DetectDeadLanes::runOnce(MachineFunction &MF) {
   // First pass: Populate defs/uses of vregs with initial values
   unsigned NumVirtRegs = MRI->getNumVirtRegs();
   for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
@@ -528,6 +525,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
     dbgs() << "\n";
   });
 
+  bool Changed = false;
   bool Again = false;
   // Mark operands as dead/unused.
   for (MachineBasicBlock &MBB : MF) {
@@ -544,6 +542,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
           LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI);
           MO.setIsDead();
+          Changed = true;
         }
         if (MO.readsReg()) {
           bool CrossCopy = false;
@@ -551,10 +550,12 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
             LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in " << MI);
             MO.setIsUndef();
+            Changed = true;
           } else if (isUndefInput(MO, &CrossCopy)) {
             LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in " << MI);
             MO.setIsUndef();
+            Changed = true;
             if (CrossCopy)
               Again = true;
           }
@@ -563,7 +564,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
     }
   }
 
-  return Again;
+  return std::make_pair(Changed, Again);
 }
 
 bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
@@ -585,13 +586,16 @@ bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
   WorklistMembers.resize(NumVirtRegs);
   DefinedByCopy.resize(NumVirtRegs);
 
+  bool Changed = false;
   bool Again;
   do {
-    Again = runOnce(MF);
+    bool LocalChanged;
+    std::tie(LocalChanged, Again) = runOnce(MF);
+    Changed |= LocalChanged;
   } while(Again);
 
   DefinedByCopy.clear();
   WorklistMembers.clear();
   delete[] VRegInfos;
-  return true;
+  return Changed;
 }
diff --git a/llvm/lib/CodeGen/EHContGuardCatchret.cpp b/llvm/lib/CodeGen/EHContGuardCatchret.cpp
index c18532946bf9..b26aa792bb93 100644
--- a/llvm/lib/CodeGen/EHContGuardCatchret.cpp
+++ b/llvm/lib/CodeGen/EHContGuardCatchret.cpp
@@ -17,9 +17,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/InitializePasses.h"
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 6a0da4dad3c1..32858d043383 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -17,10 +17,10 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -664,8 +663,8 @@ void SSAIfConv::rewritePHIOperands() {
       PI.PHI->getOperand(i-1).setMBB(Head);
       PI.PHI->getOperand(i-2).setReg(DstReg);
     } else if (MBB == getFPred()) {
-      PI.PHI->RemoveOperand(i-1);
-      PI.PHI->RemoveOperand(i-2);
+      PI.PHI->removeOperand(i-1);
+      PI.PHI->removeOperand(i-2);
     }
   }
   LLVM_DEBUG(dbgs() << " --> " << *PI.PHI);
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 60ee1812ee2c..b2639636dda7 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -32,6 +31,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+class TargetLowering;
+}
+
 #define DEBUG_TYPE "expandmemcmp"
 
 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
@@ -737,7 +740,7 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
                          const TargetLowering *TLI, const DataLayout *DL,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
-                         DomTreeUpdater *DTU) {
+                         DomTreeUpdater *DTU, const bool IsBCmp) {
   NumMemCmpCalls++;
 
   // Early exit from expansion if -Oz.
@@ -757,7 +760,8 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
 
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const bool IsUsedForZeroCmp =
+      IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
   bool OptForSize = CI->getFunction()->hasOptSize() ||
                     llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
   auto Options = TTI->enableMemCmpExpansion(OptForSize,
@@ -861,7 +865,7 @@ bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
     LibFunc Func;
     if (TLI->getLibFunc(*CI, Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU)) {
+        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
       return true;
     }
   }
@@ -881,7 +885,7 @@ ExpandMemCmpPass::runImpl(Function &F, const TargetLibraryInfo *TLI,
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
     if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI,
-                   DTU.hasValue() ? DTU.getPointer() : nullptr)) {
+                   DTU ? DTU.getPointer() : nullptr)) {
       MadeChanges = true;
       // If changes were made, restart the function from the beginning, since
       // the structure of the function was changed.
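A note on the IsBCmp parameter threaded through expandMemCmp above: bcmp only has to report whether the two buffers differ, never an ordering, so every bcmp call can take the cheaper zero-equality expansion path even when its result is not solely compared against zero. A minimal standalone C++ sketch of the zero-equality shape for one fixed 8-byte block (differs8 is an illustrative name, not an LLVM API):

#include <cstdint>
#include <cstring>

// Zero-equality form: one wide load per buffer, XOR to collect the
// differing bits, then a single compare against zero. No byte-order or
// signedness handling is needed because only "equal or not" is observable.
static bool differs8(const void *P, const void *Q) {
  uint64_t A, B;
  std::memcpy(&A, P, sizeof(A)); // memcpy keeps the loads alignment-safe
  std::memcpy(&B, Q, sizeof(B));
  return (A ^ B) != 0;
}

For larger sizes the expansion ORs the per-block XOR results together before the final compare; the ordered form that memcmp otherwise needs is costlier because it must locate the first differing block and compare it in byte order.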
diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index d9caa8ad42d0..086b4a4dcc47 100644
--- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -13,8 +13,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -104,8 +102,8 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
 
   if (MI->allDefsAreDead()) {
     MI->setDesc(TII->get(TargetOpcode::KILL));
-    MI->RemoveOperand(3); // SubIdx
-    MI->RemoveOperand(1); // Imm
+    MI->removeOperand(3); // SubIdx
+    MI->removeOperand(1); // Imm
     LLVM_DEBUG(dbgs() << "subreg: replaced by: " << *MI);
     return true;
   }
@@ -117,8 +115,8 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
   // We must leave %rax live.
   if (DstReg != InsReg) {
     MI->setDesc(TII->get(TargetOpcode::KILL));
-    MI->RemoveOperand(3); // SubIdx
-    MI->RemoveOperand(1); // Imm
+    MI->removeOperand(3); // SubIdx
+    MI->removeOperand(1); // Imm
     LLVM_DEBUG(dbgs() << "subreg: replace by: " << *MI);
     return true;
   }
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index 2bcaf750911b..f08c47d220ea 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -14,12 +14,10 @@
 #include "llvm/CodeGen/ExpandReductions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index bb8d2b3e9a78..7883a48d121c 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -23,13 +23,11 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 
@@ -115,6 +113,17 @@ static void replaceOperation(Value &NewOp, VPIntrinsic &OldOp) {
   OldOp.eraseFromParent();
 }
 
+static bool maySpeculateLanes(VPIntrinsic &VPI) {
+  // The result of VP reductions depends on the mask and evl.
+  if (isa<VPReductionIntrinsic>(VPI))
+    return false;
+  // Fallback to whether the intrinsic is speculatable.
+  Optional<unsigned> OpcOpt = VPI.getFunctionalOpcode();
+  unsigned FunctionalOpc = OpcOpt.value_or((unsigned)Instruction::Call);
+  return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc,
+                                                cast<Operator>(&VPI));
+}
+
 //// } Helpers
 
 namespace {
@@ -218,8 +227,7 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
 Value *
 CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
                                                      VPIntrinsic &VPI) {
-  assert((isSafeToSpeculativelyExecute(&VPI) ||
-          VPI.canIgnoreVectorLengthParam()) &&
+  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
 
   auto OC = static_cast<Instruction::BinaryOps>(*VPI.getFunctionalOpcode());
@@ -298,8 +306,7 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
 Value *
 CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
                                                 VPReductionIntrinsic &VPI) {
-  assert((isSafeToSpeculativelyExecute(&VPI) ||
-          VPI.canIgnoreVectorLengthParam()) &&
+  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
 
   Value *Mask = VPI.getMaskParam();
@@ -473,9 +480,9 @@ struct TransformJob {
   bool isDone() const { return Strategy.shouldDoNothing(); }
 };
 
-void sanitizeStrategy(Instruction &I, VPLegalization &LegalizeStrat) {
-  // Speculatable instructions do not strictly need predication.
-  if (isSafeToSpeculativelyExecute(&I)) {
+void sanitizeStrategy(VPIntrinsic &VPI, VPLegalization &LegalizeStrat) {
+  // Operations with speculatable lanes do not strictly need predication.
+  if (maySpeculateLanes(VPI)) {
     // Converting a speculatable VP intrinsic means dropping %mask and %evl.
     // No need to expand %evl into the %mask only to ignore that code.
     if (LegalizeStrat.OpStrategy == VPLegalization::Convert)
@@ -520,7 +527,7 @@ bool CachingVPExpander::expandVectorPredication() {
       if (!VPI)
         continue;
       auto VPStrat = getVPLegalizationStrategy(*VPI);
-      sanitizeStrategy(I, VPStrat);
+      sanitizeStrategy(*VPI, VPStrat);
       if (!VPStrat.shouldDoNothing())
         Worklist.emplace_back(VPI, VPStrat);
     }
diff --git a/llvm/lib/CodeGen/FEntryInserter.cpp b/llvm/lib/CodeGen/FEntryInserter.cpp
index c2194929e2e7..68304dd41db0 100644
--- a/llvm/lib/CodeGen/FEntryInserter.cpp
+++ b/llvm/lib/CodeGen/FEntryInserter.cpp
@@ -13,12 +13,9 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/FaultMaps.cpp b/llvm/lib/CodeGen/FaultMaps.cpp
index 1d35b194f218..3ec666227651 100644
--- a/llvm/lib/CodeGen/FaultMaps.cpp
+++ b/llvm/lib/CodeGen/FaultMaps.cpp
@@ -52,7 +52,7 @@ void FaultMaps::serializeToFaultMapSection() {
   // Create the section.
   MCSection *FaultMapSection =
       OutContext.getObjectFileInfo()->getFaultMapSection();
-  OS.SwitchSection(FaultMapSection);
+  OS.switchSection(FaultMapSection);
 
   // Emit a dummy symbol to force section inclusion.
   OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
diff --git a/llvm/lib/CodeGen/FinalizeISel.cpp b/llvm/lib/CodeGen/FinalizeISel.cpp
index 00040e92a829..329c9587e321 100644
--- a/llvm/lib/CodeGen/FinalizeISel.cpp
+++ b/llvm/lib/CodeGen/FinalizeISel.cpp
@@ -16,11 +16,9 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "finalize-isel"
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
index ec6bf18b2769..252910fd9462 100644
--- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -24,10 +24,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/InitializePasses.h"
@@ -156,12 +153,17 @@ static Register performCopyPropagation(Register Reg,
   RI = ++MachineBasicBlock::iterator(Def);
   IsKill = DestSrc->Source->isKill();
 
-  // There are no uses of original register between COPY and STATEPOINT.
-  // There can't be any after STATEPOINT, so we can eliminate Def.
   if (!Use) {
+    // There are no uses of original register between COPY and STATEPOINT.
+    // There can't be any after STATEPOINT, so we can eliminate Def.
     LLVM_DEBUG(dbgs() << "spillRegisters: removing dead copy " << *Def);
     Def->eraseFromParent();
+  } else if (IsKill) {
+    // COPY will remain in place, spill will be inserted *after* it, so it is
+    // not a kill of source anymore.
+    const_cast<MachineOperand *>(DestSrc->Source)->setIsKill(false);
   }
+
   return SrcReg;
 }
diff --git a/llvm/lib/CodeGen/GCMetadata.cpp b/llvm/lib/CodeGen/GCMetadata.cpp
index af5515cc6bfd..4d27143c5298 100644
--- a/llvm/lib/CodeGen/GCMetadata.cpp
+++ b/llvm/lib/CodeGen/GCMetadata.cpp
@@ -11,16 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include
 #include
 #include
 #include
diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp
index 637a877810a1..80feb0045406 100644
--- a/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -14,7 +14,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -24,9 +23,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCContext.h"
 
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
index f9bfe8518083..ac140e745600 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
@@ -67,7 +67,8 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) {
 }
 
 bool CSEConfigConstantOnly::shouldCSEOpc(unsigned Opc) {
-  return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_IMPLICIT_DEF;
+  return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_FCONSTANT ||
+         Opc == TargetOpcode::G_IMPLICIT_DEF;
 }
 
 std::unique_ptr<CSEConfigBase>
@@ -88,7 +89,7 @@ void GISelCSEInfo::setMF(MachineFunction &MF) {
   this->MRI = &MF.getRegInfo();
 }
 
-GISelCSEInfo::~GISelCSEInfo() {}
+GISelCSEInfo::~GISelCSEInfo() = default;
 
 bool GISelCSEInfo::isUniqueMachineInstValid(
     const UniqueMachineInstr &UMI) const {
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 1a642e233a6a..a432e4ed7fb7 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -12,6 +12,7 @@
 //
 
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -174,6 +175,7 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
   default:
     break;
   case TargetOpcode::G_ADD:
+  case TargetOpcode::G_PTR_ADD:
   case TargetOpcode::G_AND:
   case TargetOpcode::G_ASHR:
   case TargetOpcode::G_LSHR:
@@ -185,23 +187,54 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_SDIV:
   case TargetOpcode::G_UREM:
-  case TargetOpcode::G_SREM: {
+  case TargetOpcode::G_SREM:
+  case TargetOpcode::G_SMIN:
+  case TargetOpcode::G_SMAX:
+  case TargetOpcode::G_UMIN:
+  case TargetOpcode::G_UMAX: {
     // Try to constant fold these.
     assert(SrcOps.size() == 2 && "Invalid sources");
     assert(DstOps.size() == 1 && "Invalid dsts");
-    if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
+    LLT SrcTy = SrcOps[0].getLLTTy(*getMRI());
+
+    if (Opc == TargetOpcode::G_PTR_ADD &&
+        getDataLayout().isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
+      break;
+
+    if (SrcTy.isVector()) {
       // Try to constant fold vector constants.
-      Register VecCst = ConstantFoldVectorBinop(
-          Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
-      if (VecCst)
-        return buildCopy(DstOps[0], VecCst);
+      SmallVector<APInt> VecCst = ConstantFoldVectorBinop(
+          Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI());
+      if (!VecCst.empty())
+        return buildBuildVectorConstant(DstOps[0], VecCst);
       break;
     }
+
     if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
                                                 SrcOps[1].getReg(), *getMRI()))
       return buildConstant(DstOps[0], *Cst);
     break;
   }
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+  case TargetOpcode::G_FREM:
+  case TargetOpcode::G_FMINNUM:
+  case TargetOpcode::G_FMAXNUM:
+  case TargetOpcode::G_FMINNUM_IEEE:
+  case TargetOpcode::G_FMAXNUM_IEEE:
+  case TargetOpcode::G_FMINIMUM:
+  case TargetOpcode::G_FMAXIMUM:
+  case TargetOpcode::G_FCOPYSIGN: {
+    // Try to constant fold these.
+    assert(SrcOps.size() == 2 && "Invalid sources");
+    assert(DstOps.size() == 1 && "Invalid dsts");
+    if (Optional<APFloat> Cst = ConstantFoldFPBinOp(
+            Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI()))
+      return buildFConstant(DstOps[0], *Cst);
+    break;
+  }
   case TargetOpcode::G_SEXT_INREG: {
     assert(DstOps.size() == 1 && "Invalid dst ops");
     assert(SrcOps.size() == 2 && "Invalid src ops");
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 1ec7868f2234..081c8b125f17 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -11,16 +11,16 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
@@ -698,10 +698,12 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
           ValTy, extendOpFromFlags(Args[i].Flags[0]));
     }
 
+    bool BigEndianPartOrdering = TLI->hasBigEndianPartOrdering(OrigVT, DL);
     for (unsigned Part = 0; Part < NumParts; ++Part) {
       Register ArgReg = Args[i].Regs[Part];
       // There should be Regs.size() ArgLocs per argument.
-      VA = ArgLocs[j + Part];
+      unsigned Idx = BigEndianPartOrdering ? NumParts - 1 - Part : Part;
+      CCValAssign &VA = ArgLocs[j + Idx];
       const ISD::ArgFlagsTy Flags = Args[i].Flags[Part];
 
       if (VA.isMemLoc() && !Flags.isByVal()) {
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index 30f8838805b5..1a5fe3e84c17 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -13,14 +13,13 @@
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
-#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "gi-combiner"
@@ -57,8 +56,7 @@ class WorkListMaintainer : public GISelChangeObserver {
 
 public:
   WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {}
-  virtual ~WorkListMaintainer() {
-  }
+  virtual ~WorkListMaintainer() = default;
 
   void erasingInstr(MachineInstr &MI) override {
     LLVM_DEBUG(dbgs() << "Erasing: " << MI << "\n");
@@ -115,7 +113,7 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF,
 
   bool MFChanged = false;
   bool Changed;
-  MachineIRBuilder &B = *Builder.get();
+  MachineIRBuilder &B = *Builder;
 
   do {
     // Collect all instructions. Do a post order traversal for basic blocks and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d6a009744161..2c94f87804ac 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -8,7 +8,6 @@
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
-#include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -16,23 +15,22 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/LowLevelType.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DivisionByConstantInfo.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
 #include
 
 #define DEBUG_TYPE "gi-combiner"
@@ -131,9 +129,27 @@ isBigEndian(const SmallDenseMap<int64_t, int64_t> &MemOffset2Idx,
   return BigEndian;
 }
 
+bool CombinerHelper::isPreLegalize() const { return !LI; }
+
+bool CombinerHelper::isLegal(const LegalityQuery &Query) const {
+  assert(LI && "Must have LegalizerInfo to query isLegal!");
+  return LI->getAction(Query).Action == LegalizeActions::Legal;
+}
+
 bool CombinerHelper::isLegalOrBeforeLegalizer(
     const LegalityQuery &Query) const {
-  return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
+  return isPreLegalize() || isLegal(Query);
+}
+
+bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
+  if (!Ty.isVector())
+    return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
+  // Vector constants are represented as a G_BUILD_VECTOR of scalar G_CONSTANTs.
+  if (isPreLegalize())
+    return true;
+  LLT EltTy = Ty.getElementType();
+  return isLegal({TargetOpcode::G_BUILD_VECTOR, {Ty, EltTy}}) &&
+         isLegal({TargetOpcode::G_CONSTANT, {EltTy}});
 }
 
 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
@@ -1275,12 +1291,12 @@ bool CombinerHelper::matchCombineConstantFoldFpUnary(MachineInstr &MI,
   Register SrcReg = MI.getOperand(1).getReg();
   LLT DstTy = MRI.getType(DstReg);
   Cst = constantFoldFpUnary(MI.getOpcode(), DstTy, SrcReg, MRI);
-  return Cst.hasValue();
+  return Cst.has_value();
 }
 
 void CombinerHelper::applyCombineConstantFoldFpUnary(MachineInstr &MI,
                                                      Optional<APFloat> &Cst) {
-  assert(Cst.hasValue() && "Optional is unexpectedly empty!");
+  assert(Cst && "Optional is unexpectedly empty!");
   Builder.setInstrAndDebugLoc(MI);
   MachineFunction &MF = Builder.getMF();
   auto *FPVal = ConstantFP::get(MF.getFunction().getContext(), *Cst);
@@ -2350,6 +2366,19 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
   if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad(nullptr))
     return false;
 
+  // If both instructions are loads or stores, they are equal only if both
+  // are dereferenceable invariant loads with the same number of bits.
+  if (I1->mayLoadOrStore() && I2->mayLoadOrStore()) {
+    GLoadStore *LS1 = dyn_cast<GLoadStore>(I1);
+    GLoadStore *LS2 = dyn_cast<GLoadStore>(I2);
+    if (!LS1 || !LS2)
+      return false;
+
+    if (!I2->isDereferenceableInvariantLoad(nullptr) ||
+        (LS1->getMemSizeInBits() != LS2->getMemSizeInBits()))
+      return false;
+  }
+
   // Check for physical registers on the instructions first to avoid cases
   // like this:
   //
@@ -2397,7 +2426,7 @@ bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) {
     return false;
   auto *MI = MRI.getVRegDef(MOP.getReg());
   auto MaybeCst = isConstantOrConstantSplatVector(*MI, MRI);
-  return MaybeCst.hasValue() && MaybeCst->getBitWidth() <= 64 &&
+  return MaybeCst && MaybeCst->getBitWidth() <= 64 &&
          MaybeCst->getSExtValue() == C;
 }
 
@@ -2916,7 +2945,7 @@ bool CombinerHelper::matchNotCmp(MachineInstr &MI,
   int64_t Cst;
   if (Ty.isVector()) {
     MachineInstr *CstDef = MRI.getVRegDef(CstReg);
-    auto MaybeCst = getBuildVectorConstantSplat(*CstDef, MRI);
+    auto MaybeCst = getIConstantSplatSExtVal(*CstDef, MRI);
     if (!MaybeCst)
       return false;
     if (!isConstValidTrue(TLI, Ty.getScalarSizeInBits(), *MaybeCst, true, IsFP))
@@ -3049,6 +3078,102 @@ void CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+bool CombinerHelper::matchFoldBinOpIntoSelect(MachineInstr &MI,
+                                              unsigned &SelectOpNo) {
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+
+  Register OtherOperandReg = RHS;
+  SelectOpNo = 1;
+  MachineInstr *Select = MRI.getVRegDef(LHS);
+
+  // Don't do this unless the old select is going away. We want to eliminate the
+  // binary operator, not replace a binop with a select.
+  if (Select->getOpcode() != TargetOpcode::G_SELECT ||
+      !MRI.hasOneNonDBGUse(LHS)) {
+    OtherOperandReg = LHS;
+    SelectOpNo = 2;
+    Select = MRI.getVRegDef(RHS);
+    if (Select->getOpcode() != TargetOpcode::G_SELECT ||
+        !MRI.hasOneNonDBGUse(RHS))
+      return false;
+  }
+
+  MachineInstr *SelectLHS = MRI.getVRegDef(Select->getOperand(2).getReg());
+  MachineInstr *SelectRHS = MRI.getVRegDef(Select->getOperand(3).getReg());
+
+  if (!isConstantOrConstantVector(*SelectLHS, MRI,
+                                  /*AllowFP*/ true,
+                                  /*AllowOpaqueConstants*/ false))
+    return false;
+  if (!isConstantOrConstantVector(*SelectRHS, MRI,
+                                  /*AllowFP*/ true,
+                                  /*AllowOpaqueConstants*/ false))
+    return false;
+
+  unsigned BinOpcode = MI.getOpcode();
+
+  // We now know one of the operands is a select of constants. Now verify that
+  // the other binary operator operand is either a constant, or we can handle a
+  // variable.
+  bool CanFoldNonConst =
+      (BinOpcode == TargetOpcode::G_AND || BinOpcode == TargetOpcode::G_OR) &&
+      (isNullOrNullSplat(*SelectLHS, MRI) ||
+       isAllOnesOrAllOnesSplat(*SelectLHS, MRI)) &&
+      (isNullOrNullSplat(*SelectRHS, MRI) ||
+       isAllOnesOrAllOnesSplat(*SelectRHS, MRI));
+  if (CanFoldNonConst)
+    return true;
+
+  return isConstantOrConstantVector(*MRI.getVRegDef(OtherOperandReg), MRI,
+                                    /*AllowFP*/ true,
+                                    /*AllowOpaqueConstants*/ false);
+}
+
+/// \p SelectOperand is the operand in binary operator \p MI that is the select
+/// to fold.
+bool CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI,
+                                              const unsigned &SelectOperand) {
+  Builder.setInstrAndDebugLoc(MI);
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  MachineInstr *Select = MRI.getVRegDef(MI.getOperand(SelectOperand).getReg());
+
+  Register SelectCond = Select->getOperand(1).getReg();
+  Register SelectTrue = Select->getOperand(2).getReg();
+  Register SelectFalse = Select->getOperand(3).getReg();
+
+  LLT Ty = MRI.getType(Dst);
+  unsigned BinOpcode = MI.getOpcode();
+
+  Register FoldTrue, FoldFalse;
+
+  // We have a select-of-constants followed by a binary operator with a
+  // constant. Eliminate the binop by pulling the constant math into the select.
+  // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
+  if (SelectOperand == 1) {
+    // TODO: SelectionDAG verifies this actually constant folds before
+    // committing to the combine.
+
+    FoldTrue = Builder.buildInstr(BinOpcode, {Ty}, {SelectTrue, RHS}).getReg(0);
+    FoldFalse =
+        Builder.buildInstr(BinOpcode, {Ty}, {SelectFalse, RHS}).getReg(0);
+  } else {
+    FoldTrue = Builder.buildInstr(BinOpcode, {Ty}, {LHS, SelectTrue}).getReg(0);
+    FoldFalse =
+        Builder.buildInstr(BinOpcode, {Ty}, {LHS, SelectFalse}).getReg(0);
+  }
+
+  Builder.buildSelect(Dst, SelectCond, FoldTrue, FoldFalse, MI.getFlags());
+  Observer.erasingInstr(*Select);
+  Select->eraseFromParent();
+  MI.eraseFromParent();
+
+  return true;
+}
+
 Optional<SmallDenseMap<int64_t, int64_t, 8>>
 CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
   assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
@@ -3340,7 +3465,7 @@ bool CombinerHelper::matchLoadOrCombine(
   // BSWAP.
   bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
   Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
-  if (!IsBigEndian.hasValue())
+  if (!IsBigEndian)
     return false;
   bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
   if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
@@ -3848,7 +3973,7 @@ bool CombinerHelper::matchExtractAllEltsFromBuildVector(
     auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI);
     if (!Cst)
       return false;
-    unsigned Idx = Cst.getValue().getZExtValue();
+    unsigned Idx = Cst->getZExtValue();
     if (Idx >= NumElts)
       return false; // Out of range.
     ExtractedElts.set(Idx);
@@ -3904,10 +4029,9 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI,
 
   // Given constants C0 and C1 such that C0 + C1 is bit-width:
   // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1)
-  // TODO: Match constant splat.
   int64_t CstShlAmt, CstLShrAmt;
-  if (mi_match(ShlAmt, MRI, m_ICst(CstShlAmt)) &&
-      mi_match(LShrAmt, MRI, m_ICst(CstLShrAmt)) &&
+  if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) &&
+      mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) &&
       CstShlAmt + CstLShrAmt == BitWidth) {
     FshOpc = TargetOpcode::G_FSHR;
     Amt = LShrAmt;
@@ -3958,7 +4082,7 @@ void CombinerHelper::applyFunnelShiftToRotate(MachineInstr &MI) {
   Observer.changingInstr(MI);
   MI.setDesc(Builder.getTII().get(IsFSHL ? TargetOpcode::G_ROTL
                                          : TargetOpcode::G_ROTR));
-  MI.RemoveOperand(2);
+  MI.removeOperand(2);
   Observer.changedInstr(MI);
 }
 
@@ -4100,18 +4224,23 @@ bool CombinerHelper::matchAndOrDisjointMask(
     return false;
 
   Register Src;
-  int64_t MaskAnd;
-  int64_t MaskOr;
+  Register AndMaskReg;
+  int64_t AndMaskBits;
+  int64_t OrMaskBits;
   if (!mi_match(MI, MRI,
-                m_GAnd(m_GOr(m_Reg(Src), m_ICst(MaskOr)), m_ICst(MaskAnd))))
+                m_GAnd(m_GOr(m_Reg(Src), m_ICst(OrMaskBits)),
+                       m_all_of(m_ICst(AndMaskBits), m_Reg(AndMaskReg)))))
    return false;

-  // Check if MaskOr could turn on any bits in Src.
-  if (MaskAnd & MaskOr)
+  // Check if OrMask could turn on any bits in Src.
+  if (AndMaskBits & OrMaskBits)
    return false;

  MatchInfo = [=, &MI](MachineIRBuilder &B) {
    Observer.changingInstr(MI);
+    // Canonicalize the result to have the constant on the RHS.
+    if (MI.getOperand(1).getReg() == AndMaskReg)
+      MI.getOperand(2).setReg(AndMaskReg);
    MI.getOperand(1).setReg(Src);
    Observer.changedInstr(MI);
  };
@@ -4259,6 +4388,14 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
   if (ShrAmt < 0 || ShrAmt >= Size)
     return false;
 
+  // If the shift subsumes the mask, emit the 0 directly.
+  if (0 == (SMask >> ShrAmt)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildConstant(Dst, 0);
+    };
+    return true;
+  }
+
   // Check that ubfx can do the extraction, with no holes in the mask.
   uint64_t UMask = SMask;
   UMask |= maskTrailingOnes<uint64_t>(ShrAmt);
@@ -4585,6 +4722,42 @@ bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
+bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  // (G_*MULO x, 0) -> 0 + no carry out
+  assert(MI.getOpcode() == TargetOpcode::G_UMULO ||
+         MI.getOpcode() == TargetOpcode::G_SMULO);
+  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+    return false;
+  Register Dst = MI.getOperand(0).getReg();
+  Register Carry = MI.getOperand(1).getReg();
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Dst)) ||
+      !isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildConstant(Dst, 0);
+    B.buildConstant(Carry, 0);
+  };
+  return true;
+}
+
+bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  // (G_*ADDO x, 0) -> x + no carry out
+  assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
+         MI.getOpcode() == TargetOpcode::G_SADDO);
+  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+    return false;
+  Register Carry = MI.getOperand(1).getReg();
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
+  Register Dst = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(2).getReg();
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildCopy(Dst, LHS);
+    B.buildConstant(Carry, 0);
+  };
+  return true;
+}
+
 MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UDIV);
   auto &UDiv = cast<GenericMachineInstr>(MI);
@@ -5376,6 +5549,106 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
   return false;
 }
 
+bool CombinerHelper::matchSelectToLogical(MachineInstr &MI,
+                                          BuildFnTy &MatchInfo) {
+  GSelect &Sel = cast<GSelect>(MI);
+  Register DstReg = Sel.getReg(0);
+  Register Cond = Sel.getCondReg();
+  Register TrueReg = Sel.getTrueReg();
+  Register FalseReg = Sel.getFalseReg();
+
+  auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI);
+  auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI);
+
+  const LLT CondTy = MRI.getType(Cond);
+  const LLT OpTy = MRI.getType(TrueReg);
+  if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1)
+    return false;
+
+  // We have a boolean select.
+
+  // select Cond, Cond, F --> or Cond, F
+  // select Cond, 1, F --> or Cond, F
+  auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI);
+  if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) {
+    MatchInfo = [=](MachineIRBuilder &MIB) {
+      MIB.buildOr(DstReg, Cond, FalseReg);
+    };
+    return true;
+  }
+
+  // select Cond, T, Cond --> and Cond, T
+  // select Cond, T, 0 --> and Cond, T
+  auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI);
+  if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) {
+    MatchInfo = [=](MachineIRBuilder &MIB) {
+      MIB.buildAnd(DstReg, Cond, TrueReg);
+    };
+    return true;
+  }
+
+  // select Cond, T, 1 --> or (not Cond), T
+  if (MaybeCstFalse && MaybeCstFalse->isOne()) {
+    MatchInfo = [=](MachineIRBuilder &MIB) {
+      MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg);
+    };
+    return true;
+  }
+
+  // select Cond, 0, F --> and (not Cond), F
+  if (MaybeCstTrue && MaybeCstTrue->isZero()) {
+    MatchInfo = [=](MachineIRBuilder &MIB) {
+      MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg);
+    };
+    return true;
+  }
+  return false;
+}
+
+bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
+                                            unsigned &IdxToPropagate) {
+  bool PropagateNaN;
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case TargetOpcode::G_FMINNUM:
+  case TargetOpcode::G_FMAXNUM:
+    PropagateNaN = false;
+    break;
+  case TargetOpcode::G_FMINIMUM:
+  case TargetOpcode::G_FMAXIMUM:
+    PropagateNaN = true;
+    break;
+  }
+
+  auto MatchNaN = [&](unsigned Idx) {
+    Register MaybeNaNReg = MI.getOperand(Idx).getReg();
+    const ConstantFP *MaybeCst = getConstantFPVRegVal(MaybeNaNReg, MRI);
+    if (!MaybeCst || !MaybeCst->getValueAPF().isNaN())
+      return false;
+    IdxToPropagate = PropagateNaN ? Idx : (Idx == 1 ? 2 : 1);
+    return true;
+  };
+
+  return MatchNaN(1) || MatchNaN(2);
+}
+
+bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) {
+  assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD");
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+
+  // Helper lambda to check for opportunities for
+  // A + (B - A) -> B
+  // (B - A) + A -> B
+  auto CheckFold = [&](Register MaybeSub, Register MaybeSameReg) {
+    Register Reg;
+    return mi_match(MaybeSub, MRI, m_GSub(m_Reg(Src), m_Reg(Reg))) &&
+           Reg == MaybeSameReg;
+  };
+  return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 64c2f0d5f8e4..4f03af0fce82 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -567,6 +567,26 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown);
     break;
   }
+  case TargetOpcode::G_UADDO:
+  case TargetOpcode::G_UADDE:
+  case TargetOpcode::G_SADDO:
+  case TargetOpcode::G_SADDE:
+  case TargetOpcode::G_USUBO:
+  case TargetOpcode::G_USUBE:
+  case TargetOpcode::G_SSUBO:
+  case TargetOpcode::G_SSUBE:
+  case TargetOpcode::G_UMULO:
+  case TargetOpcode::G_SMULO: {
+    if (MI.getOperand(1).getReg() == R) {
+      // If we know the result of a compare has the top bits zero, use this
+      // info.
+      if (TL.getBooleanContents(DstTy.isVector(), false) ==
+              TargetLowering::ZeroOrOneBooleanContent &&
+          BitWidth > 1)
+        Known.Zero.setBitsFrom(1);
+    }
+    break;
+  }
   }
 
   assert(!Known.hasConflict() && "Bits known to be one AND zero?");
@@ -673,6 +693,27 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
                                       MI.getOperand(3).getReg(), DemandedElts,
                                       Depth + 1);
   }
+  case TargetOpcode::G_SADDO:
+  case TargetOpcode::G_SADDE:
+  case TargetOpcode::G_UADDO:
+  case TargetOpcode::G_UADDE:
+  case TargetOpcode::G_SSUBO:
+  case TargetOpcode::G_SSUBE:
+  case TargetOpcode::G_USUBO:
+  case TargetOpcode::G_USUBE:
+  case TargetOpcode::G_SMULO:
+  case TargetOpcode::G_UMULO: {
+    // If compares returns 0/-1, all bits are sign bits.
+    // We know that we have an integer-based boolean since these operations
+    // are only available for integer.
+    if (MI.getOperand(1).getReg() == R) {
+      if (TL.getBooleanContents(DstTy.isVector(), false) ==
+          TargetLowering::ZeroOrNegativeOneBooleanContent)
+        return TyBits;
+    }
+
+    break;
+  }
   case TargetOpcode::G_INTRINSIC:
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
   default: {
diff --git a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
index 252b931602c6..efcc40641ea8 100644
--- a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
 
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 6d415c9c7f90..a2af66d28f4a 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -16,10 +16,11 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
@@ -47,7 +48,6 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -78,7 +78,6 @@
 #include "llvm/Transforms/Utils/MemoryOpRemark.h"
 #include
 #include
-#include
 #include
 #include
 #include
@@ -1818,7 +1817,7 @@ static unsigned getConstrainedOpcode(Intrinsic::ID ID) {
 bool IRTranslator::translateConstrainedFPIntrinsic(
     const ConstrainedFPIntrinsic &FPI, MachineIRBuilder &MIRBuilder) {
-  fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+  fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();
 
   unsigned Opcode = getConstrainedOpcode(FPI.getIntrinsicID());
   if (!Opcode)
@@ -2252,6 +2251,23 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
     return CLI->lowerCall(MIRBuilder, Info);
   }
+  case Intrinsic::fptrunc_round: {
+    unsigned Flags = MachineInstr::copyFlagsFromInstruction(CI);
+
+    // Convert the metadata argument to a constant integer
+    Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(1))->getMetadata();
+    Optional<RoundingMode> RoundMode =
+        convertStrToRoundingMode(cast<MDString>(MD)->getString());
+
+    // Add the Rounding mode as an integer
+    MIRBuilder
+        .buildInstr(TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND,
+                    {getOrCreateVReg(CI)},
+                    {getOrCreateVReg(*CI.getArgOperand(0))}, Flags)
+        .addImm((int)*RoundMode);
+
+    return true;
+  }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
@@ -2409,7 +2425,7 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   TargetLowering::IntrinsicInfo Info;
   // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
   if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
-    Align Alignment = Info.align.getValueOr(
+    Align Alignment = Info.align.value_or(
         DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
     LLT MemTy = Info.memVT.isSimple()
                     ? getLLTForMVT(Info.memVT.getSimpleVT())
@@ -2934,15 +2950,6 @@ void IRTranslator::finishPendingPhis() {
   }
 }
 
-bool IRTranslator::valueIsSplit(const Value &V,
-                                SmallVectorImpl<uint64_t> *Offsets) {
-  SmallVector<LLT, 4> SplitTys;
-  if (Offsets && !Offsets->empty())
-    Offsets->clear();
-  computeValueLLTs(*DL, *V.getType(), SplitTys, Offsets);
-  return SplitTys.size() > 1;
-}
-
 bool IRTranslator::translate(const Instruction &Inst) {
   CurBuilder->setDebugLoc(Inst.getDebugLoc());
 
@@ -2984,7 +2991,7 @@ bool IRTranslator::translate(const Constant &C, Register Reg) {
     // Return the scalar if it is a <1 x Ty> vector.
     unsigned NumElts = CAZ->getElementCount().getFixedValue();
     if (NumElts == 1)
-      return translateCopy(C, *CAZ->getElementValue(0u), *EntryBuilder.get());
+      return translateCopy(C, *CAZ->getElementValue(0u), *EntryBuilder);
     SmallVector<Register> Ops;
     for (unsigned I = 0; I < NumElts; ++I) {
       Constant &Elt = *CAZ->getElementValue(I);
@@ -2994,8 +3001,7 @@
   } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
     // Return the scalar if it is a <1 x Ty> vector.
     if (CV->getNumElements() == 1)
-      return translateCopy(C, *CV->getElementAsConstant(0),
-                           *EntryBuilder.get());
+      return translateCopy(C, *CV->getElementAsConstant(0), *EntryBuilder);
     SmallVector<Register> Ops;
     for (unsigned i = 0; i < CV->getNumElements(); ++i) {
       Constant &Elt = *CV->getElementAsConstant(i);
@@ -3013,7 +3019,7 @@
   } else if (auto CV = dyn_cast<ConstantVector>(&C)) {
     if (CV->getNumOperands() == 1)
-      return translateCopy(C, *CV->getOperand(0), *EntryBuilder.get());
+      return translateCopy(C, *CV->getOperand(0), *EntryBuilder);
     SmallVector<Register> Ops;
     for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
       Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
@@ -3255,14 +3261,13 @@ bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
     return false;
   }
 
-  // On PS4, the "return address" must still be within the calling function,
-  // even if it's at the very end, so emit an explicit TRAP here.
-  // Passing 'true' for doesNotReturn above won't generate the trap for us.
+  // On PS4/PS5, the "return address" must still be within the calling
+  // function, even if it's at the very end, so emit an explicit TRAP here.
   // WebAssembly needs an unreachable instruction after a non-returning call,
   // because the function return type can be different from __stack_chk_fail's
   // return type (void).
   const TargetMachine &TM = MF->getTarget();
-  if (TM.getTargetTriple().isPS4CPU() || TM.getTargetTriple().isWasm()) {
+  if (TM.getTargetTriple().isPS() || TM.getTargetTriple().isWasm()) {
     LLVM_DEBUG(dbgs() << "Unhandled trap emission for stack protector fail\n");
     return false;
   }
@@ -3413,7 +3418,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
     }
   }
 
-  if (!CLI->lowerFormalArguments(*EntryBuilder.get(), F, VRegArgs, FuncInfo)) {
+  if (!CLI->lowerFormalArguments(*EntryBuilder, F, VRegArgs, FuncInfo)) {
     OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
                                F.getSubprogram(), &F.getEntryBlock());
     R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
@@ -3469,8 +3474,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
       return false;
     }
 
-    if (!finalizeBasicBlock(*BB, MBB))
+    if (!finalizeBasicBlock(*BB, MBB)) {
+      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                 BB->getTerminator()->getDebugLoc(), BB);
+      R << "unable to translate basic block";
+      reportTranslationError(*MF, *TPC, *ORE, R);
       return false;
+    }
   }
 #ifndef NDEBUG
   WrapperObserver.removeObserver(&Verifier);
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index e5f95ca5aa73..95ae8383b6fa 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -12,15 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
-#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 
 #define DEBUG_TYPE "inline-asm-lowering"
@@ -150,6 +145,7 @@ static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
   case TargetLowering::C_RegisterClass:
     return 2;
   case TargetLowering::C_Memory:
+  case TargetLowering::C_Address:
     return 3;
   }
   llvm_unreachable("Invalid constraint type");
@@ -310,7 +306,7 @@ bool InlineAsmLowering::lowerInlineAsm(
     // If this is an indirect operand, the operand is a pointer to the
    // accessed type.
    if (OpInfo.isIndirect) {
-      OpTy = Call.getAttributes().getParamElementType(ArgNo);
+      OpTy = Call.getParamElementType(ArgNo);
      assert(OpTy && "Indirect operand must have elementtype attribute");
    }
@@ -649,6 +645,8 @@ bool InlineAsmLowering::lowerInlineAsm(
       return false;
     case TargetLowering::C_Memory:
       break; // Already handled.
+    case TargetLowering::C_Address:
+      break; // Silence warning.
     case TargetLowering::C_Unknown:
       LLVM_DEBUG(dbgs() << "Unexpected unknown constraint\n");
       return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 2bb5addefe48..28f3b425c67d 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,8 +12,6 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -23,14 +21,13 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Config/config.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGenCoverage.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 1d0c106fd5db..8959d215ecd1 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -13,16 +13,9 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include
 
 #define DEBUG_TYPE "instructionselector"
 
@@ -66,6 +59,10 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI,
       std::next(MI.getIterator()) == IntoMI.getIterator())
     return true;
 
+  // Convergent instructions cannot be moved in the CFG.
+  if (MI.isConvergent() && MI.getParent() != IntoMI.getParent())
+    return false;
+
   return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() &&
          !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty();
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 1f0738a8d9d2..54a82cac95d5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -188,6 +188,13 @@ LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
   };
 }
 
+LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT MemTy = Query.MMODescrs[MMOIdx].MemoryTy;
+    return !MemTy.isByteSized() || !isPowerOf2_32(MemTy.getSizeInBytes());
+  };
+}
+
 LegalityPredicate LegalityPredicates::numElementsNotPow2(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
index 75b7fcb5663a..25c1db91b05d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -43,6 +43,27 @@ LegalizeMutation LegalizeMutations::changeElementTo(unsigned TypeIdx,
   };
 }
 
+LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
+                                                         unsigned FromTypeIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT OldTy = Query.Types[TypeIdx];
+    const LLT NewTy = Query.Types[FromTypeIdx];
+    ElementCount NewEltCount =
+        NewTy.isVector() ? NewTy.getElementCount() : ElementCount::getFixed(1);
+    return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+  };
+}
+
+LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
+                                                         LLT NewEltTy) {
+  return [=](const LegalityQuery &Query) {
+    const LLT OldTy = Query.Types[TypeIdx];
+    ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount()
NewEltTy.getElementCount() + : ElementCount::getFixed(1); + return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount)); + }; +} + LegalizeMutation LegalizeMutations::changeElementSizeTo(unsigned TypeIdx, unsigned FromTypeIdx) { return [=](const LegalityQuery &Query) { diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index 0ab4a7f64840..f09e5b7ce783 100644 --- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -14,7 +14,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" @@ -24,15 +24,11 @@ #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" -#include "llvm/Target/TargetMachine.h" - -#include #define DEBUG_TYPE "legalizer" diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 37bc8a65dc7c..fb046d519ac8 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -15,10 +15,13 @@ #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -1611,40 +1614,6 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, return Legalized; } -Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) { - Register WideReg = MRI.createGenericVirtualRegister(WideTy); - LLT OrigTy = MRI.getType(OrigReg); - LLT LCMTy = getLCMType(WideTy, OrigTy); - - const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits(); - const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits(); - - Register UnmergeSrc = WideReg; - - // Create a merge to the LCM type, padding with undef - // %0:_(<3 x s32>) = G_FOO => <4 x s32> - // => - // %1:_(<4 x s32>) = G_FOO - // %2:_(<4 x s32>) = G_IMPLICIT_DEF - // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2 - // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3 - if (NumMergeParts > 1) { - Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0); - SmallVector MergeParts(NumMergeParts, Undef); - MergeParts[0] = WideReg; - UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0); - } - - // Unmerge to the original register and pad with dead defs. 
- SmallVector UnmergeResults(NumUnmergeParts); - UnmergeResults[0] = OrigReg; - for (int I = 1; I != NumUnmergeParts; ++I) - UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy); - - MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc); - return WideReg; -} - LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { @@ -1867,9 +1836,6 @@ LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { - if (TypeIdx == 1) - return UnableToLegalize; // TODO - unsigned Opcode; unsigned ExtOpcode; Optional CarryIn = None; @@ -1914,6 +1880,18 @@ LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx, break; } + if (TypeIdx == 1) { + unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false); + + Observer.changingInstr(MI); + widenScalarDst(MI, WideTy, 1); + if (CarryIn) + widenScalarSrc(MI, WideTy, 4, BoolExtOp); + + Observer.changedInstr(MI); + return Legalized; + } + auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)}); auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)}); // Do the arithmetic in the larger type. @@ -1985,8 +1963,12 @@ LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { - if (TypeIdx == 1) - return UnableToLegalize; + if (TypeIdx == 1) { + Observer.changingInstr(MI); + widenScalarDst(MI, WideTy, 1); + Observer.changedInstr(MI); + return Legalized; + } bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO; Register Result = MI.getOperand(0).getReg(); @@ -2992,7 +2974,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { if (isa(LoadMI)) { auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits); - } else if (isa(LoadMI) || WideMemTy == DstTy) { + } else if (isa(LoadMI) || WideMemTy == LoadTy) { auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); // The extra bits are guaranteed to be zero, since we stored them that // way. A zext load from Wide thus automatically gives zext from MemVT. @@ -3314,7 +3296,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { Observer.changingInstr(MI); const auto &TII = MIRBuilder.getTII(); MI.setDesc(TII.get(TargetOpcode::G_MUL)); - MI.RemoveOperand(1); + MI.removeOperand(1); Observer.changedInstr(MI); auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS}); @@ -4096,13 +4078,14 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, // is a load, return the new registers in ValRegs. For a store, each elements // of ValRegs should be PartTy. Returns the next offset that needs to be // handled. 
+ bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian(); auto MMO = LdStMI.getMMO(); auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl &ValRegs, - unsigned Offset) -> unsigned { + unsigned NumParts, unsigned Offset) -> unsigned { MachineFunction &MF = MIRBuilder.getMF(); unsigned PartSize = PartTy.getSizeInBits(); for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize; - Offset += PartSize, ++Idx) { + ++Idx) { unsigned ByteOffset = Offset / 8; Register NewAddrReg; @@ -4118,16 +4101,19 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, } else { MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO); } + Offset = isBigEndian ? Offset - PartSize : Offset + PartSize; } return Offset; }; - unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0); + unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0; + unsigned HandledOffset = + splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset); // Handle the rest of the register if this isn't an even type breakdown. if (LeftoverTy.isValid()) - splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset); + splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset); if (IsLoad) { insertParts(ValReg, ValTy, NarrowTy, NarrowRegs, @@ -4236,6 +4222,14 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTTOPTR: case G_PTRTOINT: case G_ADDRSPACE_CAST: + case G_UADDO: + case G_USUBO: + case G_UADDE: + case G_USUBE: + case G_SADDO: + case G_SSUBO: + case G_SADDE: + case G_SSUBE: return fewerElementsVectorMultiEltType(GMI, NumElts); case G_ICMP: case G_FCMP: @@ -4882,10 +4876,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_SELECT: - if (TypeIdx != 0) - return UnableToLegalize; - if (MRI.getType(MI.getOperand(1).getReg()).isVector()) + case TargetOpcode::G_SELECT: { + Register DstReg = MI.getOperand(0).getReg(); + Register CondReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT CondTy = MRI.getType(CondReg); + if (TypeIdx == 1) { + if (!CondTy.isScalar() || + DstTy.getElementCount() != MoreTy.getElementCount()) + return UnableToLegalize; + + // This is turning a scalar select of vectors into a vector + // select. Broadcast the select condition. + auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(ShufSplat.getReg(0)); + Observer.changedInstr(MI); + return Legalized; + } + + if (CondTy.isVector()) return UnableToLegalize; Observer.changingInstr(MI); @@ -4894,6 +4904,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; + } case TargetOpcode::G_UNMERGE_VALUES: return UnableToLegalize; case TargetOpcode::G_PHI: @@ -7229,25 +7240,32 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) { Register Op2Reg = MI.getOperand(3).getReg(); LLT DstTy = MRI.getType(DstReg); LLT MaskTy = MRI.getType(MaskReg); - LLT Op1Ty = MRI.getType(Op1Reg); if (!DstTy.isVector()) return UnableToLegalize; - // Vector selects can have a scalar predicate. If so, splat into a vector and - // finish for later legalization attempts to try again. if (MaskTy.isScalar()) { + // Turn the scalar condition into a vector condition mask. 
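The G_SELECT changes around here splat a scalar condition across the destination's lanes. The sign extension matters because a vector select wants an all-ones/all-zeros mask per lane; a self-contained model on 8-bit lanes (values are arbitrary):

#include <cstdint>
#include <cstdio>

int main() {
  // A boolean condition: only bit 0 is meaningful, as with an s1 in MIR.
  uint8_t Cond = 1;
  // Sign-extend bit 0 across the lane: 1 -> 0xFF, 0 -> 0x00. This is the
  // effect of the G_SEXT_INREG-from-width-1 that precedes the splat.
  uint8_t Mask = static_cast<uint8_t>(-(Cond & 1));
  uint8_t A = 0xAB, B = 0xCD;
  // With an all-ones/all-zeros mask, the per-lane select is pure bitwise math.
  uint8_t Sel = static_cast<uint8_t>((Mask & A) | (~Mask & B));
  std::printf("mask=0x%02X sel=0x%02X\n", Mask, Sel); // mask=0xFF sel=0xAB
}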
+ Register MaskElt = MaskReg; - if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits()) - MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0); - // Generate a vector splat idiom to be pattern matched later. + + // The condition was potentially zero extended before, but we want a sign + // extended boolean. + if (MaskTy.getSizeInBits() <= DstTy.getScalarSizeInBits() && + MaskTy != LLT::scalar(1)) { + MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0); + } + + // Continue the sign extension (or truncate) to match the data type. + MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(), + MaskElt).getReg(0); + + // Generate a vector splat idiom. auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt); - Observer.changingInstr(MI); - MI.getOperand(1).setReg(ShufSplat.getReg(0)); - Observer.changedInstr(MI); - return Legalized; + MaskReg = ShufSplat.getReg(0); + MaskTy = DstTy; } - if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) { + if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) { return UnableToLegalize; } @@ -7414,7 +7432,7 @@ static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { unsigned NumBits = Ty.getScalarSizeInBits(); auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI); if (!Ty.isVector() && ValVRegAndVal) { - APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8); + APInt Scalar = ValVRegAndVal->Value.trunc(8); APInt SplatVal = APInt::getSplat(NumBits, Scalar); return MIB.buildConstant(Ty, SplatVal).getReg(0); } @@ -7569,7 +7587,7 @@ LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) { // See if this is a constant length copy auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI); // FIXME: support dynamically sized G_MEMCPY_INLINE - assert(LenVRegAndVal.hasValue() && + assert(LenVRegAndVal && "inline memcpy with dynamic size is not yet supported"); uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue(); if (KnownLen == 0) { @@ -7609,7 +7627,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, bool DstAlignCanChange = false; MachineFrameInfo &MFI = MF.getFrameInfo(); - Align Alignment = commonAlignment(DstAlign, SrcAlign); + Align Alignment = std::min(DstAlign, SrcAlign); MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) @@ -7644,7 +7662,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign = NewAlign / 2; + NewAlign = NewAlign.previous(); if (NewAlign > Alignment) { Alignment = NewAlign; @@ -7717,7 +7735,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, bool DstAlignCanChange = false; MachineFrameInfo &MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); - Align Alignment = commonAlignment(DstAlign, SrcAlign); + Align Alignment = std::min(DstAlign, SrcAlign); MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) @@ -7752,7 +7770,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign = NewAlign / 
2; + NewAlign = NewAlign.previous(); if (NewAlign > Alignment) { Alignment = NewAlign; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 30697913a6a4..6adb7ddb5b66 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -13,7 +13,6 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -23,9 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LowLevelTypeImpl.h" -#include "llvm/Support/MathExtras.h" #include -#include using namespace llvm; using namespace LegalizeActions; @@ -132,15 +129,16 @@ static bool mutationIsSane(const LegalizeRule &Rule, LLVM_FALLTHROUGH; case MoreElements: { // MoreElements can go from scalar to vector. - const unsigned OldElts = OldTy.isVector() ? OldTy.getNumElements() : 1; + const ElementCount OldElts = OldTy.isVector() ? + OldTy.getElementCount() : ElementCount::getFixed(1); if (NewTy.isVector()) { if (Rule.getAction() == FewerElements) { // Make sure the element count really decreased. - if (NewTy.getNumElements() >= OldElts) + if (ElementCount::isKnownGE(NewTy.getElementCount(), OldElts)) return false; } else { // Make sure the element count really increased. - if (NewTy.getNumElements() <= OldElts) + if (ElementCount::isKnownLE(NewTy.getElementCount(), OldElts)) return false; } } else if (Rule.getAction() == MoreElements) diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index de8dbd456901..d4fbf7d15089 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -73,6 +73,7 @@ void LoadStoreOpt::init(MachineFunction &MF) { void LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AAResultsWrapperPass>(); + AU.setPreservesAll(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -508,6 +509,12 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI, if (StoreMI.getMemSizeInBits() != ValueTy.getSizeInBits()) return false; + // Avoid adding volatile or ordered stores to the candidate. We already have a + // check for this in instMayAlias(), but that only gets called later, between + // potential aliasing hazards. + if (!StoreMI.isSimple()) + return false; + Register StoreAddr = StoreMI.getPointerReg(); auto BIO = getPointerInfo(StoreAddr, *MRI); Register StoreBase = BIO.BaseReg; diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp index 328a278f3d68..c1287693e74d 100644 --- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c6720568b362..19ebf46191a9 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -9,8 +9,6 @@ /// This file implements the MachineIRBuilder class.
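The mutationIsSane hunk above replaces raw getNumElements() comparisons with ElementCount queries, which stay sound when scalable vectors are involved. A short sketch of those semantics, assuming nothing beyond llvm/Support/TypeSize.h:

#include "llvm/Support/TypeSize.h"
#include <cassert>

using llvm::ElementCount;

void elementCountDemo() {
  ElementCount Fixed4 = ElementCount::getFixed(4);
  ElementCount Scal4 = ElementCount::getScalable(4); // <vscale x 4 x ...>
  // Fixed counts compare like plain integers.
  assert(ElementCount::isKnownGE(ElementCount::getFixed(8), Fixed4));
  // Mixed comparisons are conservative: a fixed 4 is known <= vscale x 4
  // (vscale is at least 1), but it is not known >= it.
  assert(ElementCount::isKnownLE(Fixed4, Scal4));
  assert(!ElementCount::isKnownGE(Fixed4, Scal4));
}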
//===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/Analysis/MemoryLocation.h" -#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -19,7 +17,7 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" using namespace llvm; @@ -568,47 +566,6 @@ MachineInstrBuilder MachineIRBuilder::buildExtract(const DstOp &Dst, return Extract; } -void MachineIRBuilder::buildSequence(Register Res, ArrayRef Ops, - ArrayRef Indices) { -#ifndef NDEBUG - assert(Ops.size() == Indices.size() && "incompatible args"); - assert(!Ops.empty() && "invalid trivial sequence"); - assert(llvm::is_sorted(Indices) && - "sequence offsets must be in ascending order"); - - assert(getMRI()->getType(Res).isValid() && "invalid operand type"); - for (auto Op : Ops) - assert(getMRI()->getType(Op).isValid() && "invalid operand type"); -#endif - - LLT ResTy = getMRI()->getType(Res); - LLT OpTy = getMRI()->getType(Ops[0]); - unsigned OpSize = OpTy.getSizeInBits(); - bool MaybeMerge = true; - for (unsigned i = 0; i < Ops.size(); ++i) { - if (getMRI()->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) { - MaybeMerge = false; - break; - } - } - - if (MaybeMerge && Ops.size() * OpSize == ResTy.getSizeInBits()) { - buildMerge(Res, Ops); - return; - } - - Register ResIn = getMRI()->createGenericVirtualRegister(ResTy); - buildUndef(ResIn); - - for (unsigned i = 0; i < Ops.size(); ++i) { - Register ResOut = i + 1 == Ops.size() - ? 
Res - : getMRI()->createGenericVirtualRegister(ResTy); - buildInsert(ResOut, ResIn, Ops[i], Indices[i]); - ResIn = ResOut; - } -} - MachineInstrBuilder MachineIRBuilder::buildUndef(const DstOp &Res) { return buildInstr(TargetOpcode::G_IMPLICIT_DEF, {Res}, {}); } @@ -666,6 +623,17 @@ MachineInstrBuilder MachineIRBuilder::buildBuildVector(const DstOp &Res, return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec); } +MachineInstrBuilder +MachineIRBuilder::buildBuildVectorConstant(const DstOp &Res, + ArrayRef Ops) { + SmallVector TmpVec; + TmpVec.reserve(Ops.size()); + LLT EltTy = Res.getLLTTy(*getMRI()).getElementType(); + for (auto &Op : Ops) + TmpVec.push_back(buildConstant(EltTy, Op)); + return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec); +} + MachineInstrBuilder MachineIRBuilder::buildSplatVector(const DstOp &Res, const SrcOp &Src) { SmallVector TmpVec(Res.getLLTTy(*getMRI()).getNumElements(), Src); diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 01af6bb51bb7..bce850ee212c 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -14,8 +14,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -25,12 +23,13 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -631,7 +630,8 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { "Unexpected hint opcode!"); // The only correct mapping for these is to always use the source register // bank. - const RegisterBank *RB = MRI->getRegBankOrNull(MI.getOperand(1).getReg()); + const RegisterBank *RB = + RBI->getRegBank(MI.getOperand(1).getReg(), *MRI, *TRI); // We can assume every instruction above this one has a selected register // bank. assert(RB && "Expected source register to have a register bank?"); diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp deleted file mode 100644 index 5c4d18ad79c5..000000000000 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ /dev/null @@ -1,110 +0,0 @@ -//===- llvm/CodeGen/GlobalISel/RegisterBank.cpp - Register Bank --*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements the RegisterBank class. 
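The buildBuildVectorConstant helper added above emits one G_CONSTANT per element and a single G_BUILD_VECTOR over them; the extracted text dropped the template argument, but the operand list is an ArrayRef of APInt. A hedged usage sketch, assuming a MachineIRBuilder B with a valid insertion point:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

using namespace llvm;

// Sketch: build the constant vector <4 x s32> <1, 2, 3, 4>.
MachineInstrBuilder buildConstVec(MachineIRBuilder &B) {
  LLT V4S32 = LLT::fixed_vector(4, 32);
  SmallVector<APInt, 4> Elts;
  for (uint64_t V : {1, 2, 3, 4})
    Elts.push_back(APInt(32, V));
  // Emits four G_CONSTANTs plus one G_BUILD_VECTOR.
  return B.buildBuildVectorConstant(V4S32, Elts);
}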
-//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "registerbank" - -using namespace llvm; - -const unsigned RegisterBank::InvalidID = UINT_MAX; - -RegisterBank::RegisterBank( - unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses, unsigned NumRegClasses) - : ID(ID), Name(Name), Size(Size) { - ContainedRegClasses.resize(NumRegClasses); - ContainedRegClasses.setBitsInMask(CoveredClasses); -} - -bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { - assert(isValid() && "Invalid register bank"); - for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) { - const TargetRegisterClass &RC = *TRI.getRegClass(RCId); - - if (!covers(RC)) - continue; - // Verify that the register bank covers all the sub classes of the - // classes it covers. - - // Use a different (slow in that case) method than - // RegisterBankInfo to find the subclasses of RC, to make sure - // both agree on the covers. - for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) { - const TargetRegisterClass &SubRC = *TRI.getRegClass(RCId); - - if (!RC.hasSubClassEq(&SubRC)) - continue; - - // Verify that the Size of the register bank is big enough to cover - // all the register classes it covers. - assert(getSize() >= TRI.getRegSizeInBits(SubRC) && - "Size is not big enough for all the subclasses!"); - assert(covers(SubRC) && "Not all subclasses are covered"); - } - } - return true; -} - -bool RegisterBank::covers(const TargetRegisterClass &RC) const { - assert(isValid() && "RB hasn't been initialized yet"); - return ContainedRegClasses.test(RC.getID()); -} - -bool RegisterBank::isValid() const { - return ID != InvalidID && Name != nullptr && Size != 0 && - // A register bank that does not cover anything is useless. - !ContainedRegClasses.empty(); -} - -bool RegisterBank::operator==(const RegisterBank &OtherRB) const { - // There must be only one instance of a given register bank alive - // for the whole compilation. - // The RegisterBankInfo is supposed to enforce that. - assert((OtherRB.getID() != getID() || &OtherRB == this) && - "ID does not uniquely identify a RegisterBank"); - return &OtherRB == this; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const { - print(dbgs(), /* IsForDebug */ true, TRI); -} -#endif - -void RegisterBank::print(raw_ostream &OS, bool IsForDebug, - const TargetRegisterInfo *TRI) const { - OS << getName(); - if (!IsForDebug) - return; - OS << "(ID:" << getID() << ", Size:" << getSize() << ")\n" - << "isValid:" << isValid() << '\n' - << "Number of Covered register classes: " << ContainedRegClasses.count() - << '\n'; - // Print all the subclasses if we can. - // This register classes may not be properly initialized yet. 
- if (!TRI || ContainedRegClasses.empty()) - return; - assert(ContainedRegClasses.size() == TRI->getNumRegClasses() && - "TRI does not match the initialization process?"); - OS << "Covered register classes:\n"; - ListSeparator LS; - for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) { - const TargetRegisterClass &RC = *TRI->getRegClass(RCId); - - if (covers(RC)) - OS << LS << TRI->getRegClassName(&RC); - } -} diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp deleted file mode 100644 index 650500c7eb31..000000000000 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ /dev/null @@ -1,805 +0,0 @@ -//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.cpp --------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements the RegisterBankInfo class. -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#include // For std::max. - -#define DEBUG_TYPE "registerbankinfo" - -using namespace llvm; - -STATISTIC(NumPartialMappingsCreated, - "Number of partial mappings dynamically created"); -STATISTIC(NumPartialMappingsAccessed, - "Number of partial mappings dynamically accessed"); -STATISTIC(NumValueMappingsCreated, - "Number of value mappings dynamically created"); -STATISTIC(NumValueMappingsAccessed, - "Number of value mappings dynamically accessed"); -STATISTIC(NumOperandsMappingsCreated, - "Number of operands mappings dynamically created"); -STATISTIC(NumOperandsMappingsAccessed, - "Number of operands mappings dynamically accessed"); -STATISTIC(NumInstructionMappingsCreated, - "Number of instruction mappings dynamically created"); -STATISTIC(NumInstructionMappingsAccessed, - "Number of instruction mappings dynamically accessed"); - -const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX; -const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; - -//------------------------------------------------------------------------------ -// RegisterBankInfo implementation. 
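The deleted RegisterBankInfo::getRegBank below resolves a register through MRI's class-or-bank sum type: an already-assigned bank wins, an assigned class is translated via getRegBankFromRegClass, and an unassigned register yields nullptr. A standalone model of that three-way resolution (all types here are stand-ins):

#include <cstdio>
#include <variant>

struct RegisterBankTag { int ID; };
struct RegClassTag { int ID; };
using ClassOrBank = std::variant<std::monostate, RegClassTag, RegisterBankTag>;

// Mirrors the lookup order in the deleted RegisterBankInfo::getRegBank.
const char *resolve(const ClassOrBank &COB) {
  if (std::holds_alternative<RegisterBankTag>(COB))
    return "bank already assigned";
  if (std::holds_alternative<RegClassTag>(COB))
    return "derive bank from register class";
  return "unknown (nullptr)";
}

int main() {
  std::printf("%s\n", resolve(ClassOrBank{RegisterBankTag{0}}));
  std::printf("%s\n", resolve(ClassOrBank{RegClassTag{1}}));
  std::printf("%s\n", resolve(ClassOrBank{}));
}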
-//------------------------------------------------------------------------------ -RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, - unsigned NumRegBanks) - : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { -#ifndef NDEBUG - for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { - assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); - assert(RegBanks[Idx]->isValid() && "RegisterBank should be valid"); - } -#endif // NDEBUG -} - -bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { -#ifndef NDEBUG - for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { - const RegisterBank &RegBank = getRegBank(Idx); - assert(Idx == RegBank.getID() && - "ID does not match the index in the array"); - LLVM_DEBUG(dbgs() << "Verify " << RegBank << '\n'); - assert(RegBank.verify(TRI) && "RegBank is invalid"); - } -#endif // NDEBUG - return true; -} - -const RegisterBank * -RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const { - if (Register::isPhysicalRegister(Reg)) { - // FIXME: This was probably a copy to a virtual register that does have a - // type we could use. - return &getRegBankFromRegClass(getMinimalPhysRegClass(Reg, TRI), LLT()); - } - - assert(Reg && "NoRegister does not have a register bank"); - const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (auto *RB = RegClassOrBank.dyn_cast()) - return RB; - if (auto *RC = RegClassOrBank.dyn_cast()) - return &getRegBankFromRegClass(*RC, MRI.getType(Reg)); - return nullptr; -} - -const TargetRegisterClass & -RegisterBankInfo::getMinimalPhysRegClass(Register Reg, - const TargetRegisterInfo &TRI) const { - assert(Register::isPhysicalRegister(Reg) && "Reg must be a physreg"); - const auto &RegRCIt = PhysRegMinimalRCs.find(Reg); - if (RegRCIt != PhysRegMinimalRCs.end()) - return *RegRCIt->second; - const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClass(Reg); - PhysRegMinimalRCs[Reg] = PhysRC; - return *PhysRC; -} - -const RegisterBank *RegisterBankInfo::getRegBankFromConstraints( - const MachineInstr &MI, unsigned OpIdx, const TargetInstrInfo &TII, - const MachineRegisterInfo &MRI) const { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - - // The mapping of the registers may be available via the - // register class constraints. - const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx, &TII, TRI); - - if (!RC) - return nullptr; - - Register Reg = MI.getOperand(OpIdx).getReg(); - const RegisterBank &RegBank = getRegBankFromRegClass(*RC, MRI.getType(Reg)); - // Check that the target properly implemented getRegBankFromRegClass. - assert(RegBank.covers(*RC) && - "The mapping of the register bank does not make sense"); - return &RegBank; -} - -const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( - Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI) { - - // If the register already has a class, fallback to MRI::constrainRegClass. - auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (RegClassOrBank.is()) - return MRI.constrainRegClass(Reg, &RC); - - const RegisterBank *RB = RegClassOrBank.get(); - // Otherwise, all we can do is ensure the bank covers the class, and set it. - if (RB && !RB->covers(RC)) - return nullptr; - - // If nothing was set or the class is simply compatible, set it. - MRI.setRegClass(Reg, &RC); - return &RC; -} - -/// Check whether or not \p MI should be treated like a copy -/// for the mappings. 
-/// Copy like instruction are special for mapping because -/// they don't have actual register constraints. Moreover, -/// they sometimes have register classes assigned and we can -/// just use that instead of failing to provide a generic mapping. -static bool isCopyLike(const MachineInstr &MI) { - return MI.isCopy() || MI.isPHI() || - MI.getOpcode() == TargetOpcode::REG_SEQUENCE; -} - -const RegisterBankInfo::InstructionMapping & -RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { - // For copies we want to walk over the operands and try to find one - // that has a register bank since the instruction itself will not get - // us any constraint. - bool IsCopyLike = isCopyLike(MI); - // For copy like instruction, only the mapping of the definition - // is important. The rest is not constrained. - unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands(); - - const MachineFunction &MF = *MI.getMF(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - // We may need to query the instruction encoding to guess the mapping. - const TargetInstrInfo &TII = *STI.getInstrInfo(); - - // Before doing anything complicated check if the mapping is not - // directly available. - bool CompleteMapping = true; - - SmallVector OperandsMapping(NumOperandsForMapping); - for (unsigned OpIdx = 0, EndIdx = MI.getNumOperands(); OpIdx != EndIdx; - ++OpIdx) { - const MachineOperand &MO = MI.getOperand(OpIdx); - if (!MO.isReg()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - // The register bank of Reg is just a side effect of the current - // excution and in particular, there is no reason to believe this - // is the best default mapping for the current instruction. Keep - // it as an alternative register bank if we cannot figure out - // something. - const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI); - // For copy-like instruction, we want to reuse the register bank - // that is already set on Reg, if any, since those instructions do - // not have any constraints. - const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr; - if (!CurRegBank) { - // If this is a target specific instruction, we can deduce - // the register bank from the encoding constraints. - CurRegBank = getRegBankFromConstraints(MI, OpIdx, TII, MRI); - if (!CurRegBank) { - // All our attempts failed, give up. - CompleteMapping = false; - - if (!IsCopyLike) - // MI does not carry enough information to guess the mapping. - return getInvalidInstructionMapping(); - continue; - } - } - - unsigned Size = getSizeInBits(Reg, MRI, TRI); - const ValueMapping *ValMapping = &getValueMapping(0, Size, *CurRegBank); - if (IsCopyLike) { - if (!OperandsMapping[0]) { - if (MI.isRegSequence()) { - // For reg_sequence, the result size does not match the input. - unsigned ResultSize = getSizeInBits(MI.getOperand(0).getReg(), - MRI, TRI); - OperandsMapping[0] = &getValueMapping(0, ResultSize, *CurRegBank); - } else { - OperandsMapping[0] = ValMapping; - } - } - - // The default handling assumes any register bank can be copied to any - // other. If this isn't the case, the target should specially deal with - // reg_sequence/phi. There may also be unsatisfiable copies. 
- for (; OpIdx != EndIdx; ++OpIdx) { - const MachineOperand &MO = MI.getOperand(OpIdx); - if (!MO.isReg()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - - const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI); - if (AltRegBank && - cannotCopy(*CurRegBank, *AltRegBank, getSizeInBits(Reg, MRI, TRI))) - return getInvalidInstructionMapping(); - } - - CompleteMapping = true; - break; - } - - OperandsMapping[OpIdx] = ValMapping; - } - - if (IsCopyLike && !CompleteMapping) { - // No way to deduce the type from what we have. - return getInvalidInstructionMapping(); - } - - assert(CompleteMapping && "Setting an uncomplete mapping"); - return getInstructionMapping( - DefaultMappingID, /*Cost*/ 1, - /*OperandsMapping*/ getOperandsMapping(OperandsMapping), - NumOperandsForMapping); -} - -/// Hashing function for PartialMapping. -static hash_code hashPartialMapping(unsigned StartIdx, unsigned Length, - const RegisterBank *RegBank) { - return hash_combine(StartIdx, Length, RegBank ? RegBank->getID() : 0); -} - -/// Overloaded version of hash_value for a PartialMapping. -hash_code -llvm::hash_value(const RegisterBankInfo::PartialMapping &PartMapping) { - return hashPartialMapping(PartMapping.StartIdx, PartMapping.Length, - PartMapping.RegBank); -} - -const RegisterBankInfo::PartialMapping & -RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, - const RegisterBank &RegBank) const { - ++NumPartialMappingsAccessed; - - hash_code Hash = hashPartialMapping(StartIdx, Length, &RegBank); - const auto &It = MapOfPartialMappings.find(Hash); - if (It != MapOfPartialMappings.end()) - return *It->second; - - ++NumPartialMappingsCreated; - - auto &PartMapping = MapOfPartialMappings[Hash]; - PartMapping = std::make_unique(StartIdx, Length, RegBank); - return *PartMapping; -} - -const RegisterBankInfo::ValueMapping & -RegisterBankInfo::getValueMapping(unsigned StartIdx, unsigned Length, - const RegisterBank &RegBank) const { - return getValueMapping(&getPartialMapping(StartIdx, Length, RegBank), 1); -} - -static hash_code -hashValueMapping(const RegisterBankInfo::PartialMapping *BreakDown, - unsigned NumBreakDowns) { - if (LLVM_LIKELY(NumBreakDowns == 1)) - return hash_value(*BreakDown); - SmallVector Hashes(NumBreakDowns); - for (unsigned Idx = 0; Idx != NumBreakDowns; ++Idx) - Hashes.push_back(hash_value(BreakDown[Idx])); - return hash_combine_range(Hashes.begin(), Hashes.end()); -} - -const RegisterBankInfo::ValueMapping & -RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, - unsigned NumBreakDowns) const { - ++NumValueMappingsAccessed; - - hash_code Hash = hashValueMapping(BreakDown, NumBreakDowns); - const auto &It = MapOfValueMappings.find(Hash); - if (It != MapOfValueMappings.end()) - return *It->second; - - ++NumValueMappingsCreated; - - auto &ValMapping = MapOfValueMappings[Hash]; - ValMapping = std::make_unique(BreakDown, NumBreakDowns); - return *ValMapping; -} - -template -const RegisterBankInfo::ValueMapping * -RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { - - ++NumOperandsMappingsAccessed; - - // The addresses of the value mapping are unique. - // Therefore, we can use them directly to hash the operand mapping. - hash_code Hash = hash_combine_range(Begin, End); - auto &Res = MapOfOperandsMappings[Hash]; - if (Res) - return Res.get(); - - ++NumOperandsMappingsCreated; - - // Create the array of ValueMapping. 
- // Note: this array will not hash to this instance of operands - // mapping, because we use the pointer of the ValueMapping - // to hash and we expect them to uniquely identify an instance - // of value mapping. - Res = std::make_unique(std::distance(Begin, End)); - unsigned Idx = 0; - for (Iterator It = Begin; It != End; ++It, ++Idx) { - const ValueMapping *ValMap = *It; - if (!ValMap) - continue; - Res[Idx] = *ValMap; - } - return Res.get(); -} - -const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( - const SmallVectorImpl &OpdsMapping) - const { - return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); -} - -const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( - std::initializer_list OpdsMapping) - const { - return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); -} - -static hash_code -hashInstructionMapping(unsigned ID, unsigned Cost, - const RegisterBankInfo::ValueMapping *OperandsMapping, - unsigned NumOperands) { - return hash_combine(ID, Cost, OperandsMapping, NumOperands); -} - -const RegisterBankInfo::InstructionMapping & -RegisterBankInfo::getInstructionMappingImpl( - bool IsInvalid, unsigned ID, unsigned Cost, - const RegisterBankInfo::ValueMapping *OperandsMapping, - unsigned NumOperands) const { - assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 && - OperandsMapping == nullptr && NumOperands == 0) || - !IsInvalid) && - "Mismatch argument for invalid input"); - ++NumInstructionMappingsAccessed; - - hash_code Hash = - hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands); - const auto &It = MapOfInstructionMappings.find(Hash); - if (It != MapOfInstructionMappings.end()) - return *It->second; - - ++NumInstructionMappingsCreated; - - auto &InstrMapping = MapOfInstructionMappings[Hash]; - InstrMapping = std::make_unique( - ID, Cost, OperandsMapping, NumOperands); - return *InstrMapping; -} - -const RegisterBankInfo::InstructionMapping & -RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); - if (Mapping.isValid()) - return Mapping; - llvm_unreachable("The target must implement this"); -} - -RegisterBankInfo::InstructionMappings -RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const { - InstructionMappings PossibleMappings; - const auto &Mapping = getInstrMapping(MI); - if (Mapping.isValid()) { - // Put the default mapping first. - PossibleMappings.push_back(&Mapping); - } - - // Then the alternative mapping, if any. - InstructionMappings AltMappings = getInstrAlternativeMappings(MI); - append_range(PossibleMappings, AltMappings); -#ifndef NDEBUG - for (const InstructionMapping *Mapping : PossibleMappings) - assert(Mapping->verify(MI) && "Mapping is invalid"); -#endif - return PossibleMappings; -} - -RegisterBankInfo::InstructionMappings -RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { - // No alternative for MI. 
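The mapping accessors above all follow one hash-consing pattern: hash the key, return the cached object if present, otherwise create and memoize it so equal mappings share a single immutable instance. A generic runnable model of that pattern:

#include <cstdio>
#include <memory>
#include <unordered_map>

struct Mapping {
  unsigned Start, Length;
};

// Model of MapOfPartialMappings et al.: one object per distinct key, owned
// by the cache and handed out by reference.
const Mapping &getMapping(unsigned Start, unsigned Length) {
  static std::unordered_map<unsigned long long, std::unique_ptr<Mapping>> Cache;
  unsigned long long Key =
      (static_cast<unsigned long long>(Start) << 32) | Length;
  auto &Slot = Cache[Key];
  if (!Slot)
    Slot = std::make_unique<Mapping>(Mapping{Start, Length}); // first use
  return *Slot;
}

int main() {
  const Mapping &A = getMapping(0, 32);
  const Mapping &B = getMapping(0, 32);
  std::printf("same object: %d\n", &A == &B); // 1
}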
- return InstructionMappings(); -} - -void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { - MachineInstr &MI = OpdMapper.getMI(); - MachineRegisterInfo &MRI = OpdMapper.getMRI(); - LLVM_DEBUG(dbgs() << "Applying default-like mapping\n"); - for (unsigned OpIdx = 0, - EndIdx = OpdMapper.getInstrMapping().getNumOperands(); - OpIdx != EndIdx; ++OpIdx) { - LLVM_DEBUG(dbgs() << "OpIdx " << OpIdx); - MachineOperand &MO = MI.getOperand(OpIdx); - if (!MO.isReg()) { - LLVM_DEBUG(dbgs() << " is not a register, nothing to be done\n"); - continue; - } - if (!MO.getReg()) { - LLVM_DEBUG(dbgs() << " is $noreg, nothing to be done\n"); - continue; - } - assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns != - 0 && - "Invalid mapping"); - assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns == - 1 && - "This mapping is too complex for this function"); - iterator_range::const_iterator> NewRegs = - OpdMapper.getVRegs(OpIdx); - if (NewRegs.empty()) { - LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); - continue; - } - Register OrigReg = MO.getReg(); - Register NewReg = *NewRegs.begin(); - LLVM_DEBUG(dbgs() << " changed, replace " << printReg(OrigReg, nullptr)); - MO.setReg(NewReg); - LLVM_DEBUG(dbgs() << " with " << printReg(NewReg, nullptr)); - - // The OperandsMapper creates plain scalar, we may have to fix that. - // Check if the types match and if not, fix that. - LLT OrigTy = MRI.getType(OrigReg); - LLT NewTy = MRI.getType(NewReg); - if (OrigTy != NewTy) { - // The default mapping is not supposed to change the size of - // the storage. However, right now we don't necessarily bump all - // the types to storage size. For instance, we can consider - // s16 G_AND legal whereas the storage size is going to be 32. - assert(OrigTy.getSizeInBits() <= NewTy.getSizeInBits() && - "Types with difference size cannot be handled by the default " - "mapping"); - LLVM_DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to " - << OrigTy); - MRI.setType(NewReg, OrigTy); - } - LLVM_DEBUG(dbgs() << '\n'); - } -} - -unsigned RegisterBankInfo::getSizeInBits(Register Reg, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) const { - if (Register::isPhysicalRegister(Reg)) { - // The size is not directly available for physical registers. - // Instead, we need to access a register class that contains Reg and - // get the size of that register class. - // Because this is expensive, we'll cache the register class by calling - auto *RC = &getMinimalPhysRegClass(Reg, TRI); - assert(RC && "Expecting Register class"); - return TRI.getRegSizeInBits(*RC); - } - return TRI.getRegSizeInBits(Reg, MRI); -} - -//------------------------------------------------------------------------------ -// Helper classes implementation. -//------------------------------------------------------------------------------ -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { - print(dbgs()); - dbgs() << '\n'; -} -#endif - -bool RegisterBankInfo::PartialMapping::verify() const { - assert(RegBank && "Register bank not set"); - assert(Length && "Empty mapping"); - assert((StartIdx <= getHighBitIdx()) && "Overflow, switch to APInt?"); - // Check if the minimum width fits into RegBank. 
- assert(RegBank->getSize() >= Length && "Register bank too small for Mask"); - return true; -} - -void RegisterBankInfo::PartialMapping::print(raw_ostream &OS) const { - OS << "[" << StartIdx << ", " << getHighBitIdx() << "], RegBank = "; - if (RegBank) - OS << *RegBank; - else - OS << "nullptr"; -} - -bool RegisterBankInfo::ValueMapping::partsAllUniform() const { - if (NumBreakDowns < 2) - return true; - - const PartialMapping *First = begin(); - for (const PartialMapping *Part = First + 1; Part != end(); ++Part) { - if (Part->Length != First->Length || Part->RegBank != First->RegBank) - return false; - } - - return true; -} - -bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { - assert(NumBreakDowns && "Value mapped nowhere?!"); - unsigned OrigValueBitWidth = 0; - for (const RegisterBankInfo::PartialMapping &PartMap : *this) { - // Check that each register bank is big enough to hold the partial value: - // this check is done by PartialMapping::verify - assert(PartMap.verify() && "Partial mapping is invalid"); - // The original value should completely be mapped. - // Thus the maximum accessed index + 1 is the size of the original value. - OrigValueBitWidth = - std::max(OrigValueBitWidth, PartMap.getHighBitIdx() + 1); - } - assert(OrigValueBitWidth >= MeaningfulBitWidth && - "Meaningful bits not covered by the mapping"); - APInt ValueMask(OrigValueBitWidth, 0); - for (const RegisterBankInfo::PartialMapping &PartMap : *this) { - // Check that the union of the partial mappings covers the whole value, - // without overlaps. - // The high bit is exclusive in the APInt API, thus getHighBitIdx + 1. - APInt PartMapMask = APInt::getBitsSet(OrigValueBitWidth, PartMap.StartIdx, - PartMap.getHighBitIdx() + 1); - ValueMask ^= PartMapMask; - assert((ValueMask & PartMapMask) == PartMapMask && - "Some partial mappings overlap"); - } - assert(ValueMask.isAllOnes() && "Value is not fully mapped"); - return true; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const { - print(dbgs()); - dbgs() << '\n'; -} -#endif - -void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const { - OS << "#BreakDown: " << NumBreakDowns << " "; - bool IsFirst = true; - for (const PartialMapping &PartMap : *this) { - if (!IsFirst) - OS << ", "; - OS << '[' << PartMap << ']'; - IsFirst = false; - } -} - -bool RegisterBankInfo::InstructionMapping::verify( - const MachineInstr &MI) const { - // Check that all the register operands are properly mapped. - // Check the constructor invariant. - // For PHI, we only care about mapping the definition. - assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) && - "NumOperands must match, see constructor"); - assert(MI.getParent() && MI.getMF() && - "MI must be connected to a MachineFunction"); - const MachineFunction &MF = *MI.getMF(); - const RegisterBankInfo *RBI = MF.getSubtarget().getRegBankInfo(); - (void)RBI; - - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { - const MachineOperand &MO = MI.getOperand(Idx); - if (!MO.isReg()) { - assert(!getOperandMapping(Idx).isValid() && - "We should not care about non-reg mapping"); - continue; - } - Register Reg = MO.getReg(); - if (!Reg) - continue; - assert(getOperandMapping(Idx).isValid() && - "We must have a mapping for reg operands"); - const RegisterBankInfo::ValueMapping &MOMapping = getOperandMapping(Idx); - (void)MOMapping; - // Register size in bits. - // This size must match what the mapping expects. 
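ValueMapping::verify above proves the partial mappings tile the value exactly by XOR-accumulating per-part bit masks: an overlap trips the mask check, and a hole leaves the accumulated mask short of all-ones. The same trick on plain 64-bit masks:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// True if the (start, length) pieces cover [0, Width) exactly, with no
// overlap and no holes: the invariant ValueMapping::verify asserts.
bool tilesExactly(const std::vector<std::pair<unsigned, unsigned>> &Pieces,
                  unsigned Width) {
  uint64_t Mask = 0;
  for (auto [Start, Len] : Pieces) {
    uint64_t PieceMask = (Len == 64 ? ~0ull : (1ull << Len) - 1) << Start;
    if (Mask & PieceMask)
      return false; // overlapping partial mappings
    Mask ^= PieceMask;
  }
  return Mask == (Width == 64 ? ~0ull : (1ull << Width) - 1); // no holes
}

int main() {
  assert(tilesExactly({{0, 32}, {32, 32}}, 64));  // clean two-way split
  assert(!tilesExactly({{0, 16}, {32, 32}}, 64)); // bits [16, 32) unmapped
}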
- assert(MOMapping.verify(RBI->getSizeInBits( - Reg, MF.getRegInfo(), *MF.getSubtarget().getRegisterInfo())) && - "Value mapping is invalid"); - } - return true; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const { - print(dbgs()); - dbgs() << '\n'; -} -#endif - -void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const { - OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; - - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { - const ValueMapping &ValMapping = getOperandMapping(OpIdx); - if (OpIdx) - OS << ", "; - OS << "{ Idx: " << OpIdx << " Map: " << ValMapping << '}'; - } -} - -const int RegisterBankInfo::OperandsMapper::DontKnowIdx = -1; - -RegisterBankInfo::OperandsMapper::OperandsMapper( - MachineInstr &MI, const InstructionMapping &InstrMapping, - MachineRegisterInfo &MRI) - : MRI(MRI), MI(MI), InstrMapping(InstrMapping) { - unsigned NumOpds = InstrMapping.getNumOperands(); - OpToNewVRegIdx.resize(NumOpds, OperandsMapper::DontKnowIdx); - assert(InstrMapping.verify(MI) && "Invalid mapping for MI"); -} - -iterator_range::iterator> -RegisterBankInfo::OperandsMapper::getVRegsMem(unsigned OpIdx) { - assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); - unsigned NumPartialVal = - getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns; - int StartIdx = OpToNewVRegIdx[OpIdx]; - - if (StartIdx == OperandsMapper::DontKnowIdx) { - // This is the first time we try to access OpIdx. - // Create the cells that will hold all the partial values at the - // end of the list of NewVReg. - StartIdx = NewVRegs.size(); - OpToNewVRegIdx[OpIdx] = StartIdx; - for (unsigned i = 0; i < NumPartialVal; ++i) - NewVRegs.push_back(0); - } - SmallVectorImpl::iterator End = - getNewVRegsEnd(StartIdx, NumPartialVal); - - return make_range(&NewVRegs[StartIdx], End); -} - -SmallVectorImpl::const_iterator -RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx, - unsigned NumVal) const { - return const_cast(this)->getNewVRegsEnd(StartIdx, NumVal); -} -SmallVectorImpl::iterator -RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx, - unsigned NumVal) { - assert((NewVRegs.size() == StartIdx + NumVal || - NewVRegs.size() > StartIdx + NumVal) && - "NewVRegs too small to contain all the partial mapping"); - return NewVRegs.size() <= StartIdx + NumVal ? NewVRegs.end() - : &NewVRegs[StartIdx + NumVal]; -} - -void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) { - assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); - iterator_range::iterator> NewVRegsForOpIdx = - getVRegsMem(OpIdx); - const ValueMapping &ValMapping = getInstrMapping().getOperandMapping(OpIdx); - const PartialMapping *PartMap = ValMapping.begin(); - for (Register &NewVReg : NewVRegsForOpIdx) { - assert(PartMap != ValMapping.end() && "Out-of-bound access"); - assert(NewVReg == 0 && "Register has already been created"); - // The new registers are always bound to scalar with the right size. - // The actual type has to be set when the target does the mapping - // of the instruction. - // The rationale is that this generic code cannot guess how the - // target plans to split the input type. 
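OperandsMapper above keeps every operand's partial registers in one flat NewVRegs array, with OpToNewVRegIdx recording each operand's start cell lazily, on first touch. A compact model of that flat layout:

#include <cassert>
#include <vector>

struct FlatVRegStore {
  static constexpr int DontKnowIdx = -1;
  std::vector<int> OpToStart;   // one slot per operand
  std::vector<unsigned> Regs;   // all partial registers, back to back

  explicit FlatVRegStore(unsigned NumOperands)
      : OpToStart(NumOperands, DontKnowIdx) {}

  // Lazily reserve NumParts contiguous cells for operand OpIdx and return
  // the start index, mirroring OperandsMapper::getVRegsMem.
  unsigned getStart(unsigned OpIdx, unsigned NumParts) {
    if (OpToStart[OpIdx] == DontKnowIdx) {
      OpToStart[OpIdx] = static_cast<int>(Regs.size());
      Regs.resize(Regs.size() + NumParts, 0); // 0 == not created yet
    }
    return static_cast<unsigned>(OpToStart[OpIdx]);
  }
};

int main() {
  FlatVRegStore S(/*NumOperands=*/3);
  unsigned A = S.getStart(2, 2); // operand 2 touched first: cells 0..1
  unsigned B = S.getStart(0, 1); // operand 0 next: cell 2
  assert(A == 0 && B == 2 && S.getStart(2, 2) == 0);
}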
- NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length)); - MRI.setRegBank(NewVReg, *PartMap->RegBank); - ++PartMap; - } -} - -void RegisterBankInfo::OperandsMapper::setVRegs(unsigned OpIdx, - unsigned PartialMapIdx, - Register NewVReg) { - assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); - assert(getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns > - PartialMapIdx && - "Out-of-bound access for partial mapping"); - // Make sure the memory is initialized for that operand. - (void)getVRegsMem(OpIdx); - assert(NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] == 0 && - "This value is already set"); - NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] = NewVReg; -} - -iterator_range::const_iterator> -RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx, - bool ForDebug) const { - (void)ForDebug; - assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); - int StartIdx = OpToNewVRegIdx[OpIdx]; - - if (StartIdx == OperandsMapper::DontKnowIdx) - return make_range(NewVRegs.end(), NewVRegs.end()); - - unsigned PartMapSize = - getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns; - SmallVectorImpl::const_iterator End = - getNewVRegsEnd(StartIdx, PartMapSize); - iterator_range::const_iterator> Res = - make_range(&NewVRegs[StartIdx], End); -#ifndef NDEBUG - for (Register VReg : Res) - assert((VReg || ForDebug) && "Some registers are uninitialized"); -#endif - return Res; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const { - print(dbgs(), true); - dbgs() << '\n'; -} -#endif - -void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS, - bool ForDebug) const { - unsigned NumOpds = getInstrMapping().getNumOperands(); - if (ForDebug) { - OS << "Mapping for " << getMI() << "\nwith " << getInstrMapping() << '\n'; - // Print out the internal state of the index table. - OS << "Populated indices (CellNumber, IndexInNewVRegs): "; - bool IsFirst = true; - for (unsigned Idx = 0; Idx != NumOpds; ++Idx) { - if (OpToNewVRegIdx[Idx] != DontKnowIdx) { - if (!IsFirst) - OS << ", "; - OS << '(' << Idx << ", " << OpToNewVRegIdx[Idx] << ')'; - IsFirst = false; - } - } - OS << '\n'; - } else - OS << "Mapping ID: " << getInstrMapping().getID() << ' '; - - OS << "Operand Mapping: "; - // If we have a function, we can pretty print the name of the registers. - // Otherwise we will print the raw numbers. - const TargetRegisterInfo *TRI = - getMI().getParent() && getMI().getMF() - ? 
getMI().getMF()->getSubtarget().getRegisterInfo() - : nullptr; - bool IsFirst = true; - for (unsigned Idx = 0; Idx != NumOpds; ++Idx) { - if (OpToNewVRegIdx[Idx] == DontKnowIdx) - continue; - if (!IsFirst) - OS << ", "; - IsFirst = false; - OS << '(' << printReg(getMI().getOperand(Idx).getReg(), TRI) << ", ["; - bool IsFirstNewVReg = true; - for (Register VReg : getVRegs(Idx)) { - if (!IsFirstNewVReg) - OS << ", "; - IsFirstNewVReg = false; - OS << printReg(VReg, TRI); - } - OS << "])"; - } -} diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 544af9a2954f..7781761bc131 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -16,14 +16,14 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" -#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -31,6 +31,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Constants.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #define DEBUG_TYPE "globalisel-utils" @@ -56,6 +57,11 @@ Register llvm::constrainOperandRegClass( // Assume physical registers are properly constrained. assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented"); + // Save the old register class to check whether + // the change notifications will be required. + // TODO: A better approach would be to pass + // the observers to constrainRegToClass(). + auto *OldRegClass = MRI.getRegClassOrNull(Reg); Register ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass); // If we created a new virtual register because the class is not compatible // then create a copy between the new and the old register. 
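The constrainOperandRegClass change above snapshots the operand's register class before constraining it so that observers are notified only when the class actually changed. A minimal model of that notify-on-change guard (the observer callback and the int-valued "class" are stand-ins):

#include <cstdio>
#include <functional>

// Stand-in for the GISelChangeObserver notification in
// llvm::constrainOperandRegClass: notify only on a real change.
template <typename T>
void setWithNotify(T &Slot, const T &NewVal,
                   const std::function<void()> &NotifyChanged) {
  T Old = Slot;      // snapshot, like OldRegClass in the patch
  Slot = NewVal;     // the constraining step may or may not change it
  if (Old != Slot)
    NotifyChanged(); // skip spurious notifications
}

int main() {
  int RegClass = 1;
  auto Notify = [] { std::puts("observer: changingInstr/changedInstr"); };
  setWithNotify(RegClass, 2, Notify); // fires
  setWithNotify(RegClass, 2, Notify); // silent: nothing changed
}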
@@ -81,7 +87,7 @@ Register llvm::constrainOperandRegClass( if (GISelChangeObserver *Observer = MF.getObserver()) { Observer->changedInstr(*RegMO.getParent()); } - } else { + } else if (OldRegClass != MRI.getRegClassOrNull(Reg)) { if (GISelChangeObserver *Observer = MF.getObserver()) { if (!RegMO.isDef()) { MachineInstr *RegDef = MRI.getVRegDef(Reg); @@ -500,6 +506,7 @@ Optional llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1, default: break; case TargetOpcode::G_ADD: + case TargetOpcode::G_PTR_ADD: return C1 + C2; case TargetOpcode::G_AND: return C1 & C2; @@ -533,6 +540,14 @@ Optional llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1, if (!C2.getBoolValue()) break; return C1.srem(C2); + case TargetOpcode::G_SMIN: + return APIntOps::smin(C1, C2); + case TargetOpcode::G_SMAX: + return APIntOps::smax(C1, C2); + case TargetOpcode::G_UMIN: + return APIntOps::umin(C1, C2); + case TargetOpcode::G_UMAX: + return APIntOps::umax(C1, C2); } return None; @@ -592,33 +607,27 @@ Optional llvm::ConstantFoldFPBinOp(unsigned Opcode, const Register Op1, return None; } -Register llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, - const Register Op2, - const MachineRegisterInfo &MRI, - MachineIRBuilder &MIB) { +SmallVector +llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, + const Register Op2, + const MachineRegisterInfo &MRI) { auto *SrcVec2 = getOpcodeDef(Op2, MRI); if (!SrcVec2) - return Register(); + return SmallVector(); auto *SrcVec1 = getOpcodeDef(Op1, MRI); if (!SrcVec1) - return Register(); + return SmallVector(); - const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0)); - - SmallVector FoldedElements; + SmallVector FoldedElements; for (unsigned Idx = 0, E = SrcVec1->getNumSources(); Idx < E; ++Idx) { auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx), SrcVec2->getSourceReg(Idx), MRI); if (!MaybeCst) - return Register(); - auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0); - FoldedElements.emplace_back(FoldedCstReg); + return SmallVector(); + FoldedElements.push_back(*MaybeCst); } - // Create the new vector constant. 
- auto CstVec = - MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements); - return CstVec.getReg(0); + return FoldedElements; } bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, @@ -1061,15 +1070,38 @@ bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI, AllowUndef); } +Optional<APInt> llvm::getIConstantSplatVal(const Register Reg, + const MachineRegisterInfo &MRI) { + if (auto SplatValAndReg = + getAnyConstantSplat(Reg, MRI, /* AllowUndef */ false)) { + Optional<ValueAndVReg> ValAndVReg = + getIConstantVRegValWithLookThrough(SplatValAndReg->VReg, MRI); + return ValAndVReg->Value; + } + + return None; +} + +Optional<APInt> getIConstantSplatVal(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + return getIConstantSplatVal(MI.getOperand(0).getReg(), MRI); +} + Optional<int64_t> -llvm::getBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI) { +llvm::getIConstantSplatSExtVal(const Register Reg, + const MachineRegisterInfo &MRI) { if (auto SplatValAndReg = - getAnyConstantSplat(MI.getOperand(0).getReg(), MRI, false)) + getAnyConstantSplat(Reg, MRI, /* AllowUndef */ false)) return getIConstantVRegSExtVal(SplatValAndReg->VReg, MRI); return None; } +Optional<int64_t> +llvm::getIConstantSplatSExtVal(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + return getIConstantSplatSExtVal(MI.getOperand(0).getReg(), MRI); +} + Optional<FPValueAndVReg> llvm::getFConstantSplat(Register VReg, const MachineRegisterInfo &MRI, bool AllowUndef) { @@ -1095,7 +1127,7 @@ Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI, unsigned Opc = MI.getOpcode(); if (!isBuildVectorOp(Opc)) return None; - if (auto Splat = getBuildVectorConstantSplat(MI, MRI)) + if (auto Splat = getIConstantSplatSExtVal(MI, MRI)) return RegOrConstant(*Splat); auto Reg = MI.getOperand(1).getReg(); if (any_of(make_range(MI.operands_begin() + 2, MI.operands_end()), @@ -1104,6 +1136,26 @@ Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI, return RegOrConstant(Reg); } +static bool isConstantScalar(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + bool AllowFP = true, + bool AllowOpaqueConstants = true) { + switch (MI.getOpcode()) { + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_IMPLICIT_DEF: + return true; + case TargetOpcode::G_FCONSTANT: + return AllowFP; + case TargetOpcode::G_GLOBAL_VALUE: + case TargetOpcode::G_FRAME_INDEX: + case TargetOpcode::G_BLOCK_ADDR: + case TargetOpcode::G_JUMP_TABLE: + return AllowOpaqueConstants; + default: + return false; + } +} + bool llvm::isConstantOrConstantVector(MachineInstr &MI, const MachineRegisterInfo &MRI) { Register Def = MI.getOperand(0).getReg(); @@ -1121,19 +1173,71 @@ bool llvm::isConstantOrConstantVector(MachineInstr &MI, return true; } +bool llvm::isConstantOrConstantVector(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + bool AllowFP, bool AllowOpaqueConstants) { + if (isConstantScalar(MI, MRI, AllowFP, AllowOpaqueConstants)) + return true; + + if (!isBuildVectorOp(MI.getOpcode())) + return false; + + const unsigned NumOps = MI.getNumOperands(); + for (unsigned I = 1; I != NumOps; ++I) { + const MachineInstr *ElementDef = MRI.getVRegDef(MI.getOperand(I).getReg()); + if (!isConstantScalar(*ElementDef, MRI, AllowFP, AllowOpaqueConstants)) + return false; + } + + return true; +} + Optional<APInt> llvm::isConstantOrConstantSplatVector(MachineInstr &MI, const MachineRegisterInfo &MRI) { Register Def = MI.getOperand(0).getReg(); if (auto C = getIConstantVRegValWithLookThrough(Def, MRI)) return C->Value; - auto MaybeCst = 
getBuildVectorConstantSplat(MI, MRI); + auto MaybeCst = getIConstantSplatSExtVal(MI, MRI); if (!MaybeCst) return None; const unsigned ScalarSize = MRI.getType(Def).getScalarSizeInBits(); return APInt(ScalarSize, *MaybeCst, true); } +bool llvm::isNullOrNullSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, bool AllowUndefs) { + switch (MI.getOpcode()) { + case TargetOpcode::G_IMPLICIT_DEF: + return AllowUndefs; + case TargetOpcode::G_CONSTANT: + return MI.getOperand(1).getCImm()->isNullValue(); + case TargetOpcode::G_FCONSTANT: { + const ConstantFP *FPImm = MI.getOperand(1).getFPImm(); + return FPImm->isZero() && !FPImm->isNegative(); + } + default: + if (!AllowUndefs) // TODO: isBuildVectorAllZeros assumes undef is OK already + return false; + return isBuildVectorAllZeros(MI, MRI); + } +} + +bool llvm::isAllOnesOrAllOnesSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + bool AllowUndefs) { + switch (MI.getOpcode()) { + case TargetOpcode::G_IMPLICIT_DEF: + return AllowUndefs; + case TargetOpcode::G_CONSTANT: + return MI.getOperand(1).getCImm()->isAllOnesValue(); + default: + if (!AllowUndefs) // TODO: isBuildVectorAllOnes assumes undef is OK already + return false; + return isBuildVectorAllOnes(MI, MRI); + } +} + bool llvm::matchUnaryPredicate( const MachineRegisterInfo &MRI, Register Reg, std::function Match, bool AllowUndefs) { diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index bbd9006a5d8c..f5833d3b9086 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -592,6 +592,13 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) { if (const GlobalVariable *GV = dyn_cast(U->stripPointerCasts())) MustKeepGlobalVariables.insert(GV); + else if (const ConstantArray *CA = dyn_cast(U->stripPointerCasts())) { + for (const Use &Elt : CA->operands()) { + if (const GlobalVariable *GV = + dyn_cast(Elt->stripPointerCasts())) + MustKeepGlobalVariables.insert(GV); + } + } } } } @@ -609,6 +616,13 @@ bool GlobalMerge::doInitialization(Module &M) { bool Changed = false; setMustKeepGlobalVariables(M); + LLVM_DEBUG({ + dbgs() << "Number of GV that must be kept: " << + MustKeepGlobalVariables.size() << "\n"; + for (auto KeptGV = MustKeepGlobalVariables.begin(); + KeptGV != MustKeepGlobalVariables.end(); KeptGV++) + dbgs() << "Kept: " << **KeptGV << "\n"; + }); // Grab all non-const globals. 
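The new isNullOrNullSplat and isAllOnesOrAllOnesSplat predicates above accept +0.0 but not -0.0 as null, and treat all-ones as the value -1 at any width. A plain-integer model of the scalar cases:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  // All-ones means every bit set, i.e. the two's-complement value -1,
  // whatever the bit width.
  int8_t NarrowAllOnes = -1;
  int64_t WideAllOnes = -1;
  assert(static_cast<uint8_t>(NarrowAllOnes) == 0xFF);
  assert(static_cast<uint64_t>(WideAllOnes) == ~0ull);

  // For floats, the predicate requires positive zero: -0.0 compares equal
  // to 0.0 numerically but is rejected via the sign bit, as in the FPImm
  // check above.
  double PosZero = 0.0, NegZero = -0.0;
  assert(PosZero == NegZero);                              // numeric equality
  assert(std::signbit(NegZero) && !std::signbit(PosZero)); // distinguished
}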
for (auto &GV : M.globals()) { // Merge is safe for "normal" internal or external globals only diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index 83b8c2d0eacb..67d6a3df7807 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -23,10 +23,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -37,7 +35,6 @@ #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index 1b20d1da20ad..105ab908d3fa 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -28,16 +29,13 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/DebugLoc.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp index 2d38a44d5a33..5be98e114673 100644 --- a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp +++ b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp @@ -32,17 +32,13 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c975013db8c8..06c660807c5c 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -23,7 +23,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervalCalc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStacks.h" @@ -686,7 +685,7 @@ void 
InlineSpiller::reMaterializeAll() { // Remove any values that were completely rematted. for (Register Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); - for (VNInfo *VNI : llvm::make_range(LI.vni_begin(), LI.vni_end())) { + for (VNInfo *VNI : LI.vnis()) { if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI)) continue; MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); @@ -839,6 +838,13 @@ foldMemoryOperand(ArrayRef> Ops, unsigned Idx = OpPair.second; assert(MI == OpPair.first && "Instruction conflict during operand folding"); MachineOperand &MO = MI->getOperand(Idx); + + // No point restoring an undef read, and we'll produce an invalid live + // interval. + // TODO: Is this really the correct way to handle undef tied uses? + if (MO.isUse() && !MO.readsReg() && !MO.isTied()) + continue; + if (MO.isImplicit()) { ImpReg = MO.getReg(); continue; @@ -964,7 +970,7 @@ foldMemoryOperand(ArrayRef> Ops, if (!MO.isReg() || !MO.isImplicit()) break; if (MO.getReg() == ImpReg) - FoldMI->RemoveOperand(i - 1); + FoldMI->removeOperand(i - 1); } LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS, @@ -1608,7 +1614,7 @@ void HoistSpillHelper::hoistAllSpills() { for (unsigned i = RMEnt->getNumOperands(); i; --i) { MachineOperand &MO = RMEnt->getOperand(i - 1); if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead()) - RMEnt->RemoveOperand(i - 1); + RMEnt->removeOperand(i - 1); } } Edit.eliminateDeadDefs(SpillsToRm, None, AA); diff --git a/llvm/lib/CodeGen/InterferenceCache.h b/llvm/lib/CodeGen/InterferenceCache.h index ace1691c1363..97464da9f17b 100644 --- a/llvm/lib/CodeGen/InterferenceCache.h +++ b/llvm/lib/CodeGen/InterferenceCache.h @@ -37,7 +37,7 @@ class LLVM_LIBRARY_VISIBILITY InterferenceCache { SlotIndex First; SlotIndex Last; - BlockInterference() {} + BlockInterference() = default; }; /// Entry - A cache entry containing interference information for all aliases diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 5a20580e5479..b3f38a3b53f3 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -46,6 +46,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -57,7 +58,6 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp index 230c6846dde2..43858071025a 100644 --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -19,7 +19,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -31,9 +30,8 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include 
"llvm/Pass.h" @@ -173,10 +171,10 @@ class Polynomial { }; /// Number of Error Bits e - unsigned ErrorMSBs; + unsigned ErrorMSBs = (unsigned)-1; /// Value - Value *V; + Value *V = nullptr; /// Coefficient B SmallVector, 4> B; @@ -185,7 +183,7 @@ class Polynomial { APInt A; public: - Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V) { + Polynomial(Value *V) : V(V) { IntegerType *Ty = dyn_cast(V->getType()); if (Ty) { ErrorMSBs = 0; @@ -195,12 +193,12 @@ public: } Polynomial(const APInt &A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(nullptr), A(A) {} + : ErrorMSBs(ErrorMSBs), A(A) {} Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(nullptr), A(BitWidth, A) {} + : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {} - Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr) {} + Polynomial() = default; /// Increment and clamp the number of undefined bits. void incErrorMSBs(unsigned amt) { @@ -1206,9 +1204,7 @@ bool InterleavedLoadCombineImpl::combine(std::list &InterleavedLoad, ->getNumElements(); FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI); - SmallVector Indices; - for (unsigned i = 0; i < Factor; i++) - Indices.push_back(i); + auto Indices = llvm::to_vector<4>(llvm::seq(0, Factor)); InterleavedCost = TTI.getInterleavedMemoryOpCost( Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(), InsertionPoint->getPointerAddressSpace(), CostKind); @@ -1228,7 +1224,7 @@ bool InterleavedLoadCombineImpl::combine(std::list &InterleavedLoad, auto MSSAU = MemorySSAUpdater(&MSSA); MemoryUse *MSSALoad = cast(MSSAU.createMemoryAccessBefore( LI, nullptr, MSSA.getMemoryAccess(InsertionPoint))); - MSSAU.insertUse(MSSALoad); + MSSAU.insertUse(MSSALoad, /*RenameUses=*/ true); // Create the final SVIs and replace all uses. int i = 0; diff --git a/llvm/lib/CodeGen/JMCInstrumenter.cpp b/llvm/lib/CodeGen/JMCInstrumenter.cpp new file mode 100644 index 000000000000..23220872b532 --- /dev/null +++ b/llvm/lib/CodeGen/JMCInstrumenter.cpp @@ -0,0 +1,233 @@ +//===- JMCInstrumenter.cpp - JMC Instrumentation --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// JMCInstrumenter pass: +// - instrument each function with a call to __CheckForDebuggerJustMyCode. The +// sole argument should be defined in .msvcjmc. Each flag is 1 byte initilized +// to 1. +// - create the dummy COMDAT function __JustMyCode_Default to prevent linking +// error if __CheckForDebuggerJustMyCode is not available. +// - For MSVC: +// add "/alternatename:__CheckForDebuggerJustMyCode=__JustMyCode_Default" to +// "llvm.linker.options" +// For ELF: +// Rename __JustMyCode_Default to __CheckForDebuggerJustMyCode and mark it as +// weak symbol. 
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/DJB.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "jmc-instrument" + +namespace { +struct JMCInstrumenter : public ModulePass { + static char ID; + JMCInstrumenter() : ModulePass(ID) { + initializeJMCInstrumenterPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override; +}; +char JMCInstrumenter::ID = 0; +} // namespace + +INITIALIZE_PASS( + JMCInstrumenter, DEBUG_TYPE, + "Instrument function entry with call to __CheckForDebuggerJustMyCode", + false, false) + +ModulePass *llvm::createJMCInstrumenterPass() { return new JMCInstrumenter(); } + +namespace { +const char CheckFunctionName[] = "__CheckForDebuggerJustMyCode"; + +std::string getFlagName(DISubprogram &SP, bool UseX86FastCall) { + // absolute windows path: windows_backslash + // relative windows backslash path: windows_backslash + // relative windows slash path: posix + // absolute posix path: posix + // relative posix path: posix + sys::path::Style PathStyle = + has_root_name(SP.getDirectory(), sys::path::Style::windows_backslash) || + SP.getDirectory().contains("\\") || + SP.getFilename().contains("\\") + ? sys::path::Style::windows_backslash + : sys::path::Style::posix; + // Best effort path normalization. This is to guarantee a unique flag symbol + // is produced for the same directory. Some builds may want to use relative + // paths, or paths with a specific prefix (see the -fdebug-compilation-dir + // flag), so only hash paths in debuginfo. Don't expand them to absolute + // paths. + SmallString<256> FilePath(SP.getDirectory()); + sys::path::append(FilePath, PathStyle, SP.getFilename()); + sys::path::native(FilePath, PathStyle); + sys::path::remove_dots(FilePath, /*remove_dot_dot=*/true, PathStyle); + + // The naming convention for the flag name is __<hash>_<filename> with '.' in + // <filename> replaced with '@'. For example C:\file.any.c would have a flag + // __D032E919_file@any@c. The naming convention matches MSVC's format; + // however, the match is not required to make JMC work. The hashing function + // used here is different from MSVC's. + + std::string Suffix; + for (auto C : sys::path::filename(FilePath, PathStyle)) + Suffix.push_back(C == '.' ? '@' : C); + + sys::path::remove_filename(FilePath, PathStyle); + return (UseX86FastCall ?
"_" : "__") + + utohexstr(djbHash(FilePath), /*LowerCase=*/false, + /*Width=*/8) + + "_" + Suffix; +} + +void attachDebugInfo(GlobalVariable &GV, DISubprogram &SP) { + Module &M = *GV.getParent(); + DICompileUnit *CU = SP.getUnit(); + assert(CU); + DIBuilder DB(M, false, CU); + + auto *DType = + DB.createBasicType("unsigned char", 8, dwarf::DW_ATE_unsigned_char, + llvm::DINode::FlagArtificial); + + auto *DGVE = DB.createGlobalVariableExpression( + CU, GV.getName(), /*LinkageName=*/StringRef(), SP.getFile(), + /*LineNo=*/0, DType, /*IsLocalToUnit=*/true, /*IsDefined=*/true); + GV.addMetadata(LLVMContext::MD_dbg, *DGVE); + DB.finalize(); +} + +FunctionType *getCheckFunctionType(LLVMContext &Ctx) { + Type *VoidTy = Type::getVoidTy(Ctx); + PointerType *VoidPtrTy = Type::getInt8PtrTy(Ctx); + return FunctionType::get(VoidTy, VoidPtrTy, false); +} + +Function *createDefaultCheckFunction(Module &M, bool UseX86FastCall) { + LLVMContext &Ctx = M.getContext(); + const char *DefaultCheckFunctionName = + UseX86FastCall ? "_JustMyCode_Default" : "__JustMyCode_Default"; + // Create the function. + Function *DefaultCheckFunc = + Function::Create(getCheckFunctionType(Ctx), GlobalValue::ExternalLinkage, + DefaultCheckFunctionName, &M); + DefaultCheckFunc->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + DefaultCheckFunc->addParamAttr(0, Attribute::NoUndef); + if (UseX86FastCall) + DefaultCheckFunc->addParamAttr(0, Attribute::InReg); + + BasicBlock *EntryBB = BasicBlock::Create(Ctx, "", DefaultCheckFunc); + ReturnInst::Create(Ctx, EntryBB); + return DefaultCheckFunc; +} +} // namespace + +bool JMCInstrumenter::runOnModule(Module &M) { + bool Changed = false; + LLVMContext &Ctx = M.getContext(); + Triple ModuleTriple(M.getTargetTriple()); + bool IsMSVC = ModuleTriple.isKnownWindowsMSVCEnvironment(); + bool IsELF = ModuleTriple.isOSBinFormatELF(); + assert((IsELF || IsMSVC) && "Unsupported triple for JMC"); + bool UseX86FastCall = IsMSVC && ModuleTriple.getArch() == Triple::x86; + const char *const FlagSymbolSection = IsELF ? ".just.my.code" : ".msvcjmc"; + + GlobalValue *CheckFunction = nullptr; + DenseMap SavedFlags(8); + for (auto &F : M) { + if (F.isDeclaration()) + continue; + auto *SP = F.getSubprogram(); + if (!SP) + continue; + + Constant *&Flag = SavedFlags[SP]; + if (!Flag) { + std::string FlagName = getFlagName(*SP, UseX86FastCall); + IntegerType *FlagTy = Type::getInt8Ty(Ctx); + Flag = M.getOrInsertGlobal(FlagName, FlagTy, [&] { + // FIXME: Put the GV in comdat and have linkonce_odr linkage to save + // .msvcjmc section space? maybe not worth it. 
+ GlobalVariable *GV = new GlobalVariable( + M, FlagTy, /*isConstant=*/false, GlobalValue::InternalLinkage, + ConstantInt::get(FlagTy, 1), FlagName); + GV->setSection(FlagSymbolSection); + GV->setAlignment(Align(1)); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + attachDebugInfo(*GV, *SP); + return GV; + }); + } + + if (!CheckFunction) { + Function *DefaultCheckFunc = + createDefaultCheckFunction(M, UseX86FastCall); + if (IsELF) { + DefaultCheckFunc->setName(CheckFunctionName); + DefaultCheckFunc->setLinkage(GlobalValue::WeakAnyLinkage); + CheckFunction = DefaultCheckFunc; + } else { + assert(!M.getFunction(CheckFunctionName) && + "JMC instrument more than once?"); + auto *CheckFunc = cast( + M.getOrInsertFunction(CheckFunctionName, getCheckFunctionType(Ctx)) + .getCallee()); + CheckFunc->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + CheckFunc->addParamAttr(0, Attribute::NoUndef); + if (UseX86FastCall) { + CheckFunc->setCallingConv(CallingConv::X86_FastCall); + CheckFunc->addParamAttr(0, Attribute::InReg); + } + CheckFunction = CheckFunc; + + StringRef DefaultCheckFunctionName = DefaultCheckFunc->getName(); + appendToUsed(M, {DefaultCheckFunc}); + Comdat *C = M.getOrInsertComdat(DefaultCheckFunctionName); + C->setSelectionKind(Comdat::Any); + DefaultCheckFunc->setComdat(C); + // Add a linker option /alternatename to set the default implementation + // for the check function. + // https://devblogs.microsoft.com/oldnewthing/20200731-00/?p=104024 + std::string AltOption = std::string("/alternatename:") + + CheckFunctionName + "=" + + DefaultCheckFunctionName.str(); + llvm::Metadata *Ops[] = {llvm::MDString::get(Ctx, AltOption)}; + MDTuple *N = MDNode::get(Ctx, Ops); + M.getOrInsertNamedMetadata("llvm.linker.options")->addOperand(N); + } + } + // FIXME: it would be nice to make CI scheduling boundary, although in + // practice it does not matter much. 
+ auto *CI = CallInst::Create(getCheckFunctionType(Ctx), CheckFunction, + {Flag}, "", &*F.begin()->getFirstInsertionPt()); + CI->addParamAttr(0, Attribute::NoUndef); + if (UseX86FastCall) { + CI->setCallingConv(CallingConv::X86_FastCall); + CI->addParamAttr(0, Attribute::InReg); + } + + Changed = true; + } + return Changed; +} diff --git a/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/llvm/lib/CodeGen/LLVMTargetMachine.cpp index 0d3685d4141c..3192dcadb5f5 100644 --- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp +++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp @@ -23,20 +23,19 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; -static cl::opt EnableTrapUnreachable("trap-unreachable", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enable generating trap for unreachable")); +static cl::opt + EnableTrapUnreachable("trap-unreachable", cl::Hidden, + cl::desc("Enable generating trap for unreachable")); void LLVMTargetMachine::initAsmInfo() { MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str())); @@ -99,7 +98,7 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, } TargetTransformInfo -LLVMTargetMachine::getTargetTransformInfo(const Function &F) { +LLVMTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(BasicTTIImpl(this, F)); } @@ -164,22 +163,35 @@ Expected> LLVMTargetMachine::createMCStreamer( // Create a code emitter if asked to show the encoding. std::unique_ptr MCE; if (Options.MCOptions.ShowMCEncoding) - MCE.reset(getTarget().createMCCodeEmitter(MII, MRI, Context)); + MCE.reset(getTarget().createMCCodeEmitter(MII, Context)); + + bool UseDwarfDirectory = false; + switch (Options.MCOptions.MCUseDwarfDirectory) { + case MCTargetOptions::DisableDwarfDirectory: + UseDwarfDirectory = false; + break; + case MCTargetOptions::EnableDwarfDirectory: + UseDwarfDirectory = true; + break; + case MCTargetOptions::DefaultDwarfDirectory: + UseDwarfDirectory = MAI.enableDwarfFileDirectoryDefault(); + break; + } std::unique_ptr MAB( getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions)); auto FOut = std::make_unique(Out); MCStreamer *S = getTarget().createAsmStreamer( Context, std::move(FOut), Options.MCOptions.AsmVerbose, - Options.MCOptions.MCUseDwarfDirectory, InstPrinter, std::move(MCE), - std::move(MAB), Options.MCOptions.ShowMCInst); + UseDwarfDirectory, InstPrinter, std::move(MCE), std::move(MAB), + Options.MCOptions.ShowMCInst); AsmStreamer.reset(S); break; } case CGFT_ObjectFile: { // Create the code emitter for the target if it exists. If not, .o file // emission fails. 
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, Context); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, Context); if (!MCE) return make_error("createMCCodeEmitter failed", inconvertibleErrorCode()); @@ -252,6 +264,9 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, "Cannot emit MC with limited codegen pipeline"); Ctx = &MMIWP->getMMI().getContext(); + // libunwind is unable to load compact unwind dynamically, so we must generate + // DWARF unwind info for the JIT. + Options.MCOptions.EmitDwarfUnwind = EmitDwarfUnwindType::Always; if (Options.MCOptions.MCSaveTempLabels) Ctx->setAllowTemporaryLabels(false); @@ -259,8 +274,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, // emission fails. const MCSubtargetInfo &STI = *getMCSubtargetInfo(); const MCRegisterInfo &MRI = *getMCRegisterInfo(); - MCCodeEmitter *MCE = - getTarget().createMCCodeEmitter(*getMCInstrInfo(), MRI, *Ctx); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getMCInstrInfo(), *Ctx); MCAsmBackend *MAB = getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions); if (!MCE || !MAB) diff --git a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp index 63a0d0c1c43e..39b44b917d9e 100644 --- a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -14,6 +14,7 @@ ///===---------------------------------------------------------------------===// #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -87,7 +88,7 @@ LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { OwnedMBFI = std::make_unique(); OwnedMBFI->calculate(*MF, MBPI, *MLI); - return *OwnedMBFI.get(); + return *OwnedMBFI; } bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction( diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 0eb6100230bd..30ca8bd871e8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -84,21 +84,18 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -106,27 +103,23 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include 
"llvm/MC/MCRegisterInfo.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GenericIteratedDominanceFrontier.h" #include "llvm/Support/TypeSize.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SSAUpdaterImpl.h" #include #include +#include #include #include -#include -#include #include #include #include @@ -148,6 +141,20 @@ static cl::opt EmulateOldLDV("emulate-old-livedebugvalues", cl::Hidden, cl::desc("Act like old LiveDebugValues did"), cl::init(false)); +// Limit for the maximum number of stack slots we should track, past which we +// will ignore any spills. InstrRefBasedLDV gathers detailed information on all +// stack slots which leads to high memory consumption, and in some scenarios +// (such as asan with very many locals) the working set of the function can be +// very large, causing many spills. In these scenarios, it is very unlikely that +// the developer has hundreds of variables live at the same time that they're +// carefully thinking about -- instead, they probably autogenerated the code. +// When this happens, gracefully stop tracking excess spill slots, rather than +// consuming all the developer's memory. +static cl::opt + StackWorkingSetLimit("livedebugvalues-max-stack-slots", cl::Hidden, + cl::desc("livedebugvalues-stack-ws-limit"), + cl::init(250)); + /// Tracker for converting machine value locations and variable values into /// variable locations (the output of LiveDebugValues), recorded as DBG_VALUEs /// specifying block live-in locations and transfers within blocks. @@ -252,7 +259,7 @@ public: /// object fields to track variable locations as we step through the block. /// FIXME: could just examine mloctracker instead of passing in \p mlocs? void - loadInlocs(MachineBasicBlock &MBB, ValueIDNum *MLocs, + loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, const SmallVectorImpl> &VLocs, unsigned NumLocs) { ActiveMLocs.clear(); @@ -715,6 +722,20 @@ MLocTracker::MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII, StackSlotIdxes.insert({{Size, Offs}, Idx}); } + // There may also be strange register class sizes (think x86 fp80s). + for (const TargetRegisterClass *RC : TRI.regclasses()) { + unsigned Size = TRI.getRegSizeInBits(*RC); + + // We might see special reserved values as sizes, and classes for other + // stuff the machine tries to model. If it's more than 512 bits, then it + // is very unlikely to be a register than can be spilt. + if (Size > 512) + continue; + + unsigned Idx = StackSlotIdxes.size(); + StackSlotIdxes.insert({{Size, 0}, Idx}); + } + for (auto &Idx : StackSlotIdxes) StackIdxesToPos[Idx.second] = Idx.first; @@ -757,9 +778,15 @@ void MLocTracker::writeRegMask(const MachineOperand *MO, unsigned CurBB, Masks.push_back(std::make_pair(MO, InstID)); } -SpillLocationNo MLocTracker::getOrTrackSpillLoc(SpillLoc L) { +Optional MLocTracker::getOrTrackSpillLoc(SpillLoc L) { SpillLocationNo SpillID(SpillLocs.idFor(L)); + if (SpillID.id() == 0) { + // If there is no location, and we have reached the limit of how many stack + // slots to track, then don't track this one. + if (SpillLocs.size() >= StackWorkingSetLimit) + return None; + // Spill location is untracked: create record for this one, and all // subregister slots too. SpillID = SpillLocationNo(SpillLocs.insert(L)); @@ -843,19 +870,72 @@ MachineInstrBuilder MLocTracker::emitLoc(Optional MLoc, // the variable is. 
if (Offset == 0) { const SpillLoc &Spill = SpillLocs[SpillID.id()]; - Expr = TRI.prependOffsetExpression(Expr, DIExpression::ApplyOffset, - Spill.SpillOffset); unsigned Base = Spill.SpillBase; MIB.addReg(Base); - MIB.addImm(0); - // Being on the stack makes this location indirect; if it was _already_ - // indirect though, we need to add extra indirection. See this test for - // a scenario where this happens: - // llvm/test/DebugInfo/X86/spill-nontrivial-param.ll + // There are several ways we can dereference things, and several inputs + // to consider: + // * NRVO variables will appear with IsIndirect set, but should have + // nothing else in their DIExpressions, + // * Variables with DW_OP_stack_value in their expr already need an + // explicit dereference of the stack location, + // * Values that don't match the variable size need DW_OP_deref_size, + // * Everything else can just become a simple location expression. + + // We need to use deref_size whenever there's a mismatch between the + // size of value and the size of variable portion being read. + // Additionally, we should use it whenever dealing with stack_value + // fragments, to avoid the consumer having to determine the deref size + // from DW_OP_piece. + bool UseDerefSize = false; + unsigned ValueSizeInBits = getLocSizeInBits(*MLoc); + unsigned DerefSizeInBytes = ValueSizeInBits / 8; + if (auto Fragment = Var.getFragment()) { + unsigned VariableSizeInBits = Fragment->SizeInBits; + if (VariableSizeInBits != ValueSizeInBits || Expr->isComplex()) + UseDerefSize = true; + } else if (auto Size = Var.getVariable()->getSizeInBits()) { + if (*Size != ValueSizeInBits) { + UseDerefSize = true; + } + } + if (Properties.Indirect) { - std::vector<uint64_t> Elts = {dwarf::DW_OP_deref}; - Expr = DIExpression::append(Expr, Elts); + // This is something like an NRVO variable, where the pointer has been + // spilt to the stack, or a dbg.addr pointing at a coroutine frame + // field. It should end up being a memory location, with the pointer + // to the variable loaded off the stack with a deref. It can't be a + // DW_OP_stack_value expression. + assert(!Expr->isImplicit()); + Expr = TRI.prependOffsetExpression( + Expr, DIExpression::ApplyOffset | DIExpression::DerefAfter, + Spill.SpillOffset); + MIB.addImm(0); + } else if (UseDerefSize) { + // We're loading a value off the stack that's not the same size as the + // variable. Add / subtract stack offset, explicitly deref with a size, + // and add DW_OP_stack_value if not already present. + SmallVector<uint64_t, 2> Ops = {dwarf::DW_OP_deref_size, + DerefSizeInBytes}; + Expr = DIExpression::prependOpcodes(Expr, Ops, true); + unsigned Flags = DIExpression::StackValue | DIExpression::ApplyOffset; + Expr = TRI.prependOffsetExpression(Expr, Flags, Spill.SpillOffset); + MIB.addReg(0); + } else if (Expr->isComplex()) { + // A variable with no size ambiguity, but with extra elements in its + // expression. Manually dereference the stack location. + assert(Expr->isComplex()); + Expr = TRI.prependOffsetExpression( + Expr, DIExpression::ApplyOffset | DIExpression::DerefAfter, + Spill.SpillOffset); + MIB.addReg(0); + } else { + // A plain value that has been spilt to the stack, with no further + // context. Request a location expression, marking the DBG_VALUE as + // IsIndirect.
+ Expr = TRI.prependOffsetExpression(Expr, DIExpression::ApplyOffset, + Spill.SpillOffset); + MIB.addImm(0); } } else { // This is a stack location with a weird subregister offset: emit an undef @@ -879,7 +959,7 @@ MachineInstrBuilder MLocTracker::emitLoc(Optional<LocIdx> MLoc, } /// Default construct and initialize the pass. -InstrRefBasedLDV::InstrRefBasedLDV() {} +InstrRefBasedLDV::InstrRefBasedLDV() = default; bool InstrRefBasedLDV::isCalleeSaved(LocIdx L) const { unsigned Reg = MTracker->LocIdxToLocID[L]; @@ -898,7 +978,7 @@ bool InstrRefBasedLDV::isCalleeSaved(LocIdx L) const { // void InstrRefBasedLDV::printVarLocInMBB(..) #endif -SpillLocationNo +Optional<SpillLocationNo> InstrRefBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) { assert(MI.hasOneMemOperand() && "Spill instruction does not have exactly one memory operand?"); @@ -913,8 +993,11 @@ InstrRefBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) { return MTracker->getOrTrackSpillLoc({Reg, Offset}); } -Optional<LocIdx> InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) { - SpillLocationNo SpillLoc = extractSpillBaseRegAndOffset(MI); +Optional<LocIdx> +InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) { + Optional<SpillLocationNo> SpillLoc = extractSpillBaseRegAndOffset(MI); + if (!SpillLoc) + return None; // Where in the stack slot is this value defined -- i.e., what size of value // is this? An important question, because it could be loaded into a register // occur, but the safe action is to indicate the variable is optimised out. return None; - unsigned SpillID = MTracker->getSpillIDWithIdx(SpillLoc, IdxIt->second); + unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillLoc, IdxIt->second); return MTracker->getSpillMLoc(SpillID); } @@ -999,14 +1082,14 @@ bool InstrRefBasedLDV::transferDebugValue(const MachineInstr &MI) { } bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, - ValueIDNum **MLiveOuts, - ValueIDNum **MLiveIns) { + const ValueTable *MLiveOuts, + const ValueTable *MLiveIns) { if (!MI.isDebugRef()) return false; // Only handle this instruction when we are building the variable value // transfer function. - if (!VTracker) + if (!VTracker && !TTracker) return false; unsigned InstNo = MI.getOperand(0).getImm(); @@ -1068,15 +1151,25 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, if (L) NewID = ValueIDNum(BlockNo, InstrIt->second.second, *L); } else if (OpNo != MachineFunction::DebugOperandMemNumber) { - assert(OpNo < TargetInstr.getNumOperands()); - const MachineOperand &MO = TargetInstr.getOperand(OpNo); - - // Today, this can only be a register. - assert(MO.isReg() && MO.isDef()); + // Permit the debug-info to be completely wrong: identifying a nonexistent + // operand, or one that is not a register definition, means something + // unexpected happened during optimisation. Broken debug-info, however, + // shouldn't crash the compiler -- instead leave the variable value as + // None, which will make it appear "optimised out".
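+ // (Hypothetical example of such breakage: an optimisation rewrites the + // target instruction with fewer operands than it had when the DBG_INSTR_REF + // was created, so the reference names an operand index that no longer + // exists.)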
+ if (OpNo < TargetInstr.getNumOperands()) { + const MachineOperand &MO = TargetInstr.getOperand(OpNo); + + if (MO.isReg() && MO.isDef() && MO.getReg()) { + unsigned LocID = MTracker->getLocID(MO.getReg()); + LocIdx L = MTracker->LocIDToLocIdx[LocID]; + NewID = ValueIDNum(BlockNo, InstrIt->second.second, L); + } + } - unsigned LocID = MTracker->getLocID(MO.getReg()); - LocIdx L = MTracker->LocIDToLocIdx[LocID]; - NewID = ValueIDNum(BlockNo, InstrIt->second.second, L); + if (!NewID) { + LLVM_DEBUG( + { dbgs() << "Seen instruction reference to illegal operand\n"; }); + } } // else: NewID is left as None. } else if (PHIIt != DebugPHINumToValue.end() && PHIIt->InstrNum == InstNo) { @@ -1162,7 +1255,8 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, // for DBG_INSTR_REFs as DBG_VALUEs (just, the former can refer to values that // aren't immediately available). DbgValueProperties Properties(Expr, false); - VTracker->defVar(MI, Properties, NewID); + if (VTracker) + VTracker->defVar(MI, Properties, NewID); // If we're on the final pass through the function, decompose this INSTR_REF // into a plain DBG_VALUE. @@ -1225,7 +1319,16 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) { const MachineOperand &MO = MI.getOperand(0); unsigned InstrNum = MI.getOperand(1).getImm(); - if (MO.isReg()) { + auto EmitBadPHI = [this, &MI, InstrNum](void) -> bool { + // Helper lambda to do any accounting when we fail to find a location for + // a DBG_PHI. This can happen if DBG_PHIs are malformed, or refer to a + // dead stack slot, for example. + // Record a DebugPHIRecord with an empty value + location. + DebugPHINumToValue.push_back({InstrNum, MI.getParent(), None, None}); + return true; + }; + + if (MO.isReg() && MO.getReg()) { // The value is whatever's currently in the register. Read and record it, // to be analysed later. Register Reg = MO.getReg(); @@ -1237,57 +1340,45 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) { // Ensure this register is tracked. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) MTracker->lookupOrTrackRegister(*RAI); - } else { + } else if (MO.isFI()) { // The value is whatever's in this stack slot. - assert(MO.isFI()); unsigned FI = MO.getIndex(); // If the stack slot is dead, then this was optimized away. // FIXME: stack slot colouring should account for slots that get merged. if (MFI->isDeadObjectIndex(FI)) - return true; + return EmitBadPHI(); // Identify this spill slot, ensure it's tracked. Register Base; StackOffset Offs = TFI->getFrameIndexReference(*MI.getMF(), FI, Base); SpillLoc SL = {Base, Offs}; - SpillLocationNo SpillNo = MTracker->getOrTrackSpillLoc(SL); - - // Problem: what value should we extract from the stack? LLVM does not - // record what size the last store to the slot was, and it would become - // sketchy after stack slot colouring anyway. Take a look at what values - // are stored on the stack, and pick the largest one that wasn't def'd - // by a spill (i.e., the value most likely to have been def'd in a register - // and then spilt. - std::array CandidateSizes = {64, 32, 16, 8}; - Optional Result = None; - Optional SpillLoc = None; - for (unsigned CS : CandidateSizes) { - unsigned SpillID = MTracker->getLocID(SpillNo, {CS, 0}); - SpillLoc = MTracker->getSpillMLoc(SpillID); - ValueIDNum Val = MTracker->readMLoc(*SpillLoc); - // If this value was defined in it's own position, then it was probably - // an aliasing index of a small value that was spilt. 
- if (Val.getLoc() != SpillLoc->asU64()) { - Result = Val; - break; - } - } + Optional<SpillLocationNo> SpillNo = MTracker->getOrTrackSpillLoc(SL); - // If we didn't find anything, we're probably looking at a PHI, or a memory - // store folded into an instruction. FIXME: Take a guess that's it's 64 - // bits. This isn't ideal, but tracking the size that the spill is - // "supposed" to be is more complex, and benefits a small number of - // locations. - if (!Result) { - unsigned SpillID = MTracker->getLocID(SpillNo, {64, 0}); - SpillLoc = MTracker->getSpillMLoc(SpillID); - Result = MTracker->readMLoc(*SpillLoc); - } + // We might be able to find a value, but have chosen not to, to avoid + // tracking too much stack information. + if (!SpillNo) + return EmitBadPHI(); + + // Any stack location DBG_PHI should have an associated bit-size. + assert(MI.getNumOperands() == 3 && "Stack DBG_PHI with no size?"); + unsigned slotBitSize = MI.getOperand(2).getImm(); + + unsigned SpillID = MTracker->getLocID(*SpillNo, {slotBitSize, 0}); + LocIdx SpillLoc = MTracker->getSpillMLoc(SpillID); + ValueIDNum Result = MTracker->readMLoc(SpillLoc); // Record this DBG_PHI for later analysis. - auto DbgPHI = DebugPHIRecord({InstrNum, MI.getParent(), *Result, *SpillLoc}); + auto DbgPHI = DebugPHIRecord({InstrNum, MI.getParent(), Result, SpillLoc}); DebugPHINumToValue.push_back(DbgPHI); + } else { + // Else: if the operand is neither a legal register nor a stack slot, then + // we're being fed illegal debug-info. Record an empty PHI, so that any + // debug users trying to read this number will be put off trying to + // interpret the value. + LLVM_DEBUG( + { dbgs() << "Seen DBG_PHI with unrecognised operand format\n"; }); + return EmitBadPHI(); } return true; @@ -1357,11 +1448,12 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // If this instruction writes to a spill slot, def that slot. if (hasFoldedStackStore(MI)) { - SpillLocationNo SpillNo = extractSpillBaseRegAndOffset(MI); - for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) { - unsigned SpillID = MTracker->getSpillIDWithIdx(SpillNo, I); - LocIdx L = MTracker->getSpillMLoc(SpillID); - MTracker->setMLoc(L, ValueIDNum(CurBB, CurInst, L)); + if (Optional<SpillLocationNo> SpillNo = extractSpillBaseRegAndOffset(MI)) { + for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) { + unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillNo, I); + LocIdx L = MTracker->getSpillMLoc(SpillID); + MTracker->setMLoc(L, ValueIDNum(CurBB, CurInst, L)); + } } } @@ -1398,11 +1490,12 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // Tell TTracker about any folded stack store.
if (hasFoldedStackStore(MI)) { - SpillLocationNo SpillNo = extractSpillBaseRegAndOffset(MI); - for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) { - unsigned SpillID = MTracker->getSpillIDWithIdx(SpillNo, I); - LocIdx L = MTracker->getSpillMLoc(SpillID); - TTracker->clobberMloc(L, MI.getIterator(), true); + if (Optional SpillNo = extractSpillBaseRegAndOffset(MI)) { + for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) { + unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillNo, I); + LocIdx L = MTracker->getSpillMLoc(SpillID); + TTracker->clobberMloc(L, MI.getIterator(), true); + } } } } @@ -1438,23 +1531,24 @@ void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) { } } -bool InstrRefBasedLDV::isSpillInstruction(const MachineInstr &MI, - MachineFunction *MF) { +Optional +InstrRefBasedLDV::isSpillInstruction(const MachineInstr &MI, + MachineFunction *MF) { // TODO: Handle multiple stores folded into one. if (!MI.hasOneMemOperand()) - return false; + return None; // Reject any memory operand that's aliased -- we can't guarantee its value. auto MMOI = MI.memoperands_begin(); const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue(); if (PVal->isAliased(MFI)) - return false; + return None; if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII)) - return false; // This is not a spill instruction, since no valid size was - // returned from either function. + return None; // This is not a spill instruction, since no valid size was + // returned from either function. - return true; + return extractSpillBaseRegAndOffset(MI); } bool InstrRefBasedLDV::isLocationSpill(const MachineInstr &MI, @@ -1511,13 +1605,11 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { // First, if there are any DBG_VALUEs pointing at a spill slot that is // written to, terminate that variable location. The value in memory // will have changed. DbgEntityHistoryCalculator doesn't try to detect this. - if (isSpillInstruction(MI, MF)) { - SpillLocationNo Loc = extractSpillBaseRegAndOffset(MI); - + if (Optional Loc = isSpillInstruction(MI, MF)) { // Un-set this location and clobber, so that earlier locations don't // continue past this store. for (unsigned SlotIdx = 0; SlotIdx < MTracker->NumSlotIdxes; ++SlotIdx) { - unsigned SpillID = MTracker->getSpillIDWithIdx(Loc, SlotIdx); + unsigned SpillID = MTracker->getSpillIDWithIdx(*Loc, SlotIdx); Optional MLoc = MTracker->getSpillMLoc(SpillID); if (!MLoc) continue; @@ -1535,7 +1627,9 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { // Try to recognise spill and restore instructions that may transfer a value. if (isLocationSpill(MI, MF, Reg)) { - SpillLocationNo Loc = extractSpillBaseRegAndOffset(MI); + // isLocationSpill returning true should guarantee we can extract a + // location. + SpillLocationNo Loc = *extractSpillBaseRegAndOffset(MI); auto DoTransfer = [&](Register SrcReg, unsigned SpillID) { auto ReadValue = MTracker->readReg(SrcReg); @@ -1562,10 +1656,9 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { unsigned SpillID = MTracker->getLocID(Loc, {Size, 0}); DoTransfer(Reg, SpillID); } else { - Optional OptLoc = isRestoreInstruction(MI, MF, Reg); - if (!OptLoc) + Optional Loc = isRestoreInstruction(MI, MF, Reg); + if (!Loc) return false; - SpillLocationNo Loc = *OptLoc; // Assumption: we're reading from the base of the stack slot, not some // offset into it. 
It seems very unlikely LLVM would ever generate @@ -1583,22 +1676,17 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { LocIdx SrcIdx = MTracker->getSpillMLoc(SpillID); auto ReadValue = MTracker->readMLoc(SrcIdx); MTracker->setReg(DestReg, ReadValue); - - if (TTracker) { - LocIdx DstLoc = MTracker->getRegMLoc(DestReg); - TTracker->transferMlocs(SrcIdx, DstLoc, MI.getIterator()); - } }; for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) { unsigned Subreg = TRI->getSubRegIndex(Reg, *SRI); - unsigned SpillID = MTracker->getLocID(Loc, Subreg); + unsigned SpillID = MTracker->getLocID(*Loc, Subreg); DoTransfer(*SRI, SpillID); } // Directly look up this registers slot idx by size, and transfer. unsigned Size = TRI->getRegSizeInBits(Reg, *MRI); - unsigned SpillID = MTracker->getLocID(Loc, {Size, 0}); + unsigned SpillID = MTracker->getLocID(*Loc, {Size, 0}); DoTransfer(Reg, SpillID); } return true; @@ -1724,8 +1812,8 @@ void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) { AllSeenFragments.insert(ThisFragment); } -void InstrRefBasedLDV::process(MachineInstr &MI, ValueIDNum **MLiveOuts, - ValueIDNum **MLiveIns) { +void InstrRefBasedLDV::process(MachineInstr &MI, const ValueTable *MLiveOuts, + const ValueTable *MLiveIns) { // Try to interpret an MI as a debug or transfer instruction. Only if it's // none of these should we interpret it's register defs as new value // definitions. @@ -1775,7 +1863,10 @@ void InstrRefBasedLDV::produceMLocTransferFunction( // Step through each instruction in this block. for (auto &MI : MBB) { - process(MI); + // Pass in an empty unique_ptr for the value tables when accumulating the + // machine transfer function. + process(MI, nullptr, nullptr); + // Also accumulate fragment map. if (MI.isDebugValue() || MI.isDebugRef()) accumulateFragmentMap(MI); @@ -1864,7 +1955,7 @@ void InstrRefBasedLDV::produceMLocTransferFunction( bool InstrRefBasedLDV::mlocJoin( MachineBasicBlock &MBB, SmallPtrSet &Visited, - ValueIDNum **OutLocs, ValueIDNum *InLocs) { + FuncValueTable &OutLocs, ValueTable &InLocs) { LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n"); bool Changed = false; @@ -1965,7 +2056,7 @@ void InstrRefBasedLDV::findStackIndexInterference( void InstrRefBasedLDV::placeMLocPHIs( MachineFunction &MF, SmallPtrSetImpl &AllBlocks, - ValueIDNum **MInLocs, SmallVectorImpl &MLocTransfer) { + FuncValueTable &MInLocs, SmallVectorImpl &MLocTransfer) { SmallVector StackUnits; findStackIndexInterference(StackUnits); @@ -2094,7 +2185,7 @@ void InstrRefBasedLDV::placeMLocPHIs( } void InstrRefBasedLDV::buildMLocValueMap( - MachineFunction &MF, ValueIDNum **MInLocs, ValueIDNum **MOutLocs, + MachineFunction &MF, FuncValueTable &MInLocs, FuncValueTable &MOutLocs, SmallVectorImpl &MLocTransfer) { std::priority_queue, std::greater> @@ -2236,7 +2327,7 @@ void InstrRefBasedLDV::BlockPHIPlacement( Optional InstrRefBasedLDV::pickVPHILoc( const MachineBasicBlock &MBB, const DebugVariable &Var, - const LiveIdxT &LiveOuts, ValueIDNum **MOutLocs, + const LiveIdxT &LiveOuts, FuncValueTable &MOutLocs, const SmallVectorImpl &BlockOrders) { // Collect a set of locations from predecessor where its live-out value can // be found. 
@@ -2504,7 +2595,7 @@ void InstrRefBasedLDV::getBlocksForScope( void InstrRefBasedLDV::buildVLocValueMap( const DILocation *DILoc, const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, - ValueIDNum **MOutLocs, ValueIDNum **MInLocs, + FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs) { // This method is much like buildMLocValueMap: but focuses on a single // LexicalScope at a time. Pick out a set of blocks and variables that are @@ -2765,6 +2856,11 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( auto ValueIt = VLocs.Vars.find(Var); const DbgValue &Value = ValueIt->second; + // If it's an explicit assignment of "undef", that means there is no location + // anyway, anywhere. + if (Value.Kind == DbgValue::Undef) + return; + // Assign the variable value to entry to each dominated block that's in scope. // Skip the definition block -- it's assigned the variable value in the middle // of the block somewhere. @@ -2790,35 +2886,6 @@ void InstrRefBasedLDV::dump_mloc_transfer( } #endif -void InstrRefBasedLDV::emitLocations( - MachineFunction &MF, LiveInsT SavedLiveIns, ValueIDNum **MOutLocs, - ValueIDNum **MInLocs, DenseMap &AllVarsNumbering, - const TargetPassConfig &TPC) { - TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC); - unsigned NumLocs = MTracker->getNumLocs(); - - // For each block, load in the machine value locations and variable value - // live-ins, then step through each instruction in the block. New DBG_VALUEs - // to be inserted will be created along the way. - for (MachineBasicBlock &MBB : MF) { - unsigned bbnum = MBB.getNumber(); - MTracker->reset(); - MTracker->loadFromArray(MInLocs[bbnum], bbnum); - TTracker->loadInlocs(MBB, MInLocs[bbnum], SavedLiveIns[MBB.getNumber()], - NumLocs); - - CurBB = bbnum; - CurInst = 1; - for (auto &MI : MBB) { - process(MI, MOutLocs, MInLocs); - TTracker->checkInstForNewValues(CurInst, MI.getIterator()); - ++CurInst; - } - } - - emitTransfers(AllVarsNumbering); -} - void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { // Build some useful data structures. @@ -2861,8 +2928,172 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { #endif } +// Produce an "ejection map" for blocks, i.e., what's the highest-numbered +// lexical scope it's used in. When exploring in DFS order and we pass that +// scope, the block can be processed and any tracking information freed. +void InstrRefBasedLDV::makeDepthFirstEjectionMap( + SmallVectorImpl &EjectionMap, + const ScopeToDILocT &ScopeToDILocation, + ScopeToAssignBlocksT &ScopeToAssignBlocks) { + SmallPtrSet BlocksToExplore; + SmallVector, 4> WorkStack; + auto *TopScope = LS.getCurrentFunctionScope(); + + // Unlike lexical scope explorers, we explore in reverse order, to find the + // "last" lexical scope used for each block early. + WorkStack.push_back({TopScope, TopScope->getChildren().size() - 1}); + + while (!WorkStack.empty()) { + auto &ScopePosition = WorkStack.back(); + LexicalScope *WS = ScopePosition.first; + ssize_t ChildNum = ScopePosition.second--; + + const SmallVectorImpl &Children = WS->getChildren(); + if (ChildNum >= 0) { + // If ChildNum is positive, there are remaining children to explore. + // Push the child and its children-count onto the stack. 
+ auto &ChildScope = Children[ChildNum]; + WorkStack.push_back( + std::make_pair(ChildScope, ChildScope->getChildren().size() - 1)); + } else { + WorkStack.pop_back(); + + // We've explored all children and any later blocks: examine all blocks + // in our scope. If they haven't yet had an ejection number set, then + // this scope will be the last to use that block. + auto DILocationIt = ScopeToDILocation.find(WS); + if (DILocationIt != ScopeToDILocation.end()) { + getBlocksForScope(DILocationIt->second, BlocksToExplore, + ScopeToAssignBlocks.find(WS)->second); + for (auto *MBB : BlocksToExplore) { + unsigned BBNum = MBB->getNumber(); + if (EjectionMap[BBNum] == 0) + EjectionMap[BBNum] = WS->getDFSOut(); + } + + BlocksToExplore.clear(); + } + } + } +} + +bool InstrRefBasedLDV::depthFirstVLocAndEmit( + unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation, + const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks, + LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, + SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF, + DenseMap<DebugVariable, unsigned> &AllVarsNumbering, + const TargetPassConfig &TPC) { + TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC); + unsigned NumLocs = MTracker->getNumLocs(); + VTracker = nullptr; + + // No scopes? No variable locations. + if (!LS.getCurrentFunctionScope()) + return false; + + // Build map from block number to the last scope that uses the block. + SmallVector<unsigned> EjectionMap; + EjectionMap.resize(MaxNumBlocks, 0); + makeDepthFirstEjectionMap(EjectionMap, ScopeToDILocation, + ScopeToAssignBlocks); + + // Helper lambda for ejecting a block -- if nothing is going to use the block, + // we can translate the variable location information into DBG_VALUEs and then + // free all of InstrRefBasedLDV's data structures. + auto EjectBlock = [&](MachineBasicBlock &MBB) -> void { + unsigned BBNum = MBB.getNumber(); + AllTheVLocs[BBNum].clear(); + + // Prime the transfer-tracker, and then step through all the block + // instructions, installing transfers. + MTracker->reset(); + MTracker->loadFromArray(MInLocs[BBNum], BBNum); + TTracker->loadInlocs(MBB, MInLocs[BBNum], Output[BBNum], NumLocs); + + CurBB = BBNum; + CurInst = 1; + for (auto &MI : MBB) { + process(MI, MOutLocs.get(), MInLocs.get()); + TTracker->checkInstForNewValues(CurInst, MI.getIterator()); + ++CurInst; + } + + // Free machine-location tables for this block. + MInLocs[BBNum].reset(); + MOutLocs[BBNum].reset(); + // We don't need live-in variable values for this block either. + Output[BBNum].clear(); + AllTheVLocs[BBNum].clear(); + }; + + SmallPtrSet BlocksToExplore; + SmallVector<std::pair<LexicalScope *, ssize_t>, 4> WorkStack; + WorkStack.push_back({LS.getCurrentFunctionScope(), 0}); + unsigned HighestDFSIn = 0; + + // Proceed to explore in depth first order. + while (!WorkStack.empty()) { + auto &ScopePosition = WorkStack.back(); + LexicalScope *WS = ScopePosition.first; + ssize_t ChildNum = ScopePosition.second++; + + // We observe scopes with children twice here, once descending in, once + // ascending out of the scope nest. Use HighestDFSIn as a ratchet to ensure + // we don't process a scope twice. Additionally, ignore scopes that don't + // have a DILocation -- by proxy, this means we never tracked any variable + // assignments in that scope.
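+ // (How the ratchet works: DFSIn numbers grow as scopes are first entered, + // so after visiting a child scope, HighestDFSIn is at least the child's + // DFSIn; when we later pop back out to the parent, its smaller DFSIn fails + // the test below and the scope is not processed a second time.)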
+ auto DILocIt = ScopeToDILocation.find(WS); + if (HighestDFSIn <= WS->getDFSIn() && DILocIt != ScopeToDILocation.end()) { + const DILocation *DILoc = DILocIt->second; + auto &VarsWeCareAbout = ScopeToVars.find(WS)->second; + auto &BlocksInScope = ScopeToAssignBlocks.find(WS)->second; + + buildVLocValueMap(DILoc, VarsWeCareAbout, BlocksInScope, Output, MOutLocs, + MInLocs, AllTheVLocs); + } + + HighestDFSIn = std::max(HighestDFSIn, WS->getDFSIn()); + + // Descend into any scope nests. + const SmallVectorImpl &Children = WS->getChildren(); + if (ChildNum < (ssize_t)Children.size()) { + // There are children to explore -- push onto stack and continue. + auto &ChildScope = Children[ChildNum]; + WorkStack.push_back(std::make_pair(ChildScope, 0)); + } else { + WorkStack.pop_back(); + + // We've explored a leaf, or have explored all the children of a scope. + // Try to eject any blocks where this is the last scope it's relevant to. + auto DILocationIt = ScopeToDILocation.find(WS); + if (DILocationIt == ScopeToDILocation.end()) + continue; + + getBlocksForScope(DILocationIt->second, BlocksToExplore, + ScopeToAssignBlocks.find(WS)->second); + for (auto *MBB : BlocksToExplore) + if (WS->getDFSOut() == EjectionMap[MBB->getNumber()]) + EjectBlock(const_cast(*MBB)); + + BlocksToExplore.clear(); + } + } + + // Some artificial blocks may not have been ejected, meaning they're not + // connected to an actual legitimate scope. This can technically happen + // with things like the entry block. In theory, we shouldn't need to do + // anything for such out-of-scope blocks, but for the sake of being similar + // to VarLocBasedLDV, eject these too. + for (auto *MBB : ArtificialBlocks) + if (MOutLocs[MBB->getNumber()]) + EjectBlock(*MBB); + + return emitTransfers(AllVarsNumbering); +} + bool InstrRefBasedLDV::emitTransfers( - DenseMap &AllVarsNumbering) { + DenseMap &AllVarsNumbering) { // Go through all the transfers recorded in the TransferTracker -- this is // both the live-ins to a block, and any movements of values that happen // in the middle. @@ -2944,24 +3175,24 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, assert(MaxNumBlocks >= 0); ++MaxNumBlocks; + initialSetup(MF); + MLocTransfer.resize(MaxNumBlocks); vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); - initialSetup(MF); - produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks); // Allocate and initialize two array-of-arrays for the live-in and live-out // machine values. The outer dimension is the block number; while the inner // dimension is a LocIdx from MLocTracker. - ValueIDNum **MOutLocs = new ValueIDNum *[MaxNumBlocks]; - ValueIDNum **MInLocs = new ValueIDNum *[MaxNumBlocks]; + FuncValueTable MOutLocs = std::make_unique(MaxNumBlocks); + FuncValueTable MInLocs = std::make_unique(MaxNumBlocks); unsigned NumLocs = MTracker->getNumLocs(); for (int i = 0; i < MaxNumBlocks; ++i) { // These all auto-initialize to ValueIDNum::EmptyValue - MOutLocs[i] = new ValueIDNum[NumLocs]; - MInLocs[i] = new ValueIDNum[NumLocs]; + MOutLocs[i] = std::make_unique(NumLocs); + MInLocs[i] = std::make_unique(NumLocs); } // Solve the machine value dataflow problem using the MLocTransfer function, @@ -2974,7 +3205,10 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // either live-through machine values, or PHIs. for (auto &DBG_PHI : DebugPHINumToValue) { // Identify unresolved block-live-ins. 
- ValueIDNum &Num = DBG_PHI.ValueRead; + if (!DBG_PHI.ValueRead) + continue; + + ValueIDNum &Num = *DBG_PHI.ValueRead; if (!Num.isPHI()) continue; @@ -2995,7 +3229,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MTracker->loadFromArray(MInLocs[CurBB], CurBB); CurInst = 1; for (auto &MI : MBB) { - process(MI, MOutLocs, MInLocs); + process(MI, MOutLocs.get(), MInLocs.get()); ++CurInst; } MTracker->reset(); @@ -3051,32 +3285,13 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, << VarAssignCount << " variable assignments, exceeding limits.\n"); } else { - // Compute the extended ranges, iterating over scopes. There might be - // something to be said for ordering them by size/locality, but that's for - // the future. For each scope, solve the variable value problem, producing - // a map of variables to values in SavedLiveIns. - for (auto &P : ScopeToVars) { - buildVLocValueMap(ScopeToDILocation[P.first], P.second, - ScopeToAssignBlocks[P.first], SavedLiveIns, MOutLocs, MInLocs, - vlocs); - } - - // Using the computed value locations and variable values for each block, - // create the DBG_VALUE instructions representing the extended variable - // locations. - emitLocations(MF, SavedLiveIns, MOutLocs, MInLocs, AllVarsNumbering, *TPC); - - // Did we actually make any changes? If we created any DBG_VALUEs, then yes. - Changed = TTracker->Transfers.size() != 0; - } - - // Common clean-up of memory. - for (int Idx = 0; Idx < MaxNumBlocks; ++Idx) { - delete[] MOutLocs[Idx]; - delete[] MInLocs[Idx]; + // Optionally, solve the variable value problem and emit to blocks by using + // a lexical-scope-depth search. It should be functionally identical to + // the "else" block of this condition. + Changed = depthFirstVLocAndEmit( + MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks, + SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC); } - delete[] MOutLocs; - delete[] MInLocs; delete MTracker; delete TTracker; @@ -3092,6 +3307,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, DebugPHINumToValue.clear(); OverlapFragments.clear(); SeenFragments.clear(); + SeenDbgPHIs.clear(); return Changed; } @@ -3193,9 +3409,10 @@ public: /// Machine location where any PHI must occur. LocIdx Loc; /// Table of live-in machine value numbers for blocks / locations. - ValueIDNum **MLiveIns; + const ValueTable *MLiveIns; - LDVSSAUpdater(LocIdx L, ValueIDNum **MLiveIns) : Loc(L), MLiveIns(MLiveIns) {} + LDVSSAUpdater(LocIdx L, const ValueTable *MLiveIns) + : Loc(L), MLiveIns(MLiveIns) {} void reset() { for (auto &Block : BlockMap) @@ -3352,11 +3569,28 @@ public: } // end namespace llvm -Optional InstrRefBasedLDV::resolveDbgPHIs(MachineFunction &MF, - ValueIDNum **MLiveOuts, - ValueIDNum **MLiveIns, - MachineInstr &Here, - uint64_t InstrNum) { +Optional InstrRefBasedLDV::resolveDbgPHIs( + MachineFunction &MF, const ValueTable *MLiveOuts, + const ValueTable *MLiveIns, MachineInstr &Here, uint64_t InstrNum) { + assert(MLiveOuts && MLiveIns && + "Tried to resolve DBG_PHI before location " + "tables allocated?"); + + // This function will be called twice per DBG_INSTR_REF, and might end up + // computing lots of SSA information: memoize it. 
+ auto SeenDbgPHIIt = SeenDbgPHIs.find(&Here); + if (SeenDbgPHIIt != SeenDbgPHIs.end()) + return SeenDbgPHIIt->second; + + Optional Result = + resolveDbgPHIsImpl(MF, MLiveOuts, MLiveIns, Here, InstrNum); + SeenDbgPHIs.insert({&Here, Result}); + return Result; +} + +Optional InstrRefBasedLDV::resolveDbgPHIsImpl( + MachineFunction &MF, const ValueTable *MLiveOuts, + const ValueTable *MLiveIns, MachineInstr &Here, uint64_t InstrNum) { // Pick out records of DBG_PHI instructions that have been observed. If there // are none, then we cannot compute a value number. auto RangePair = std::equal_range(DebugPHINumToValue.begin(), @@ -3368,17 +3602,24 @@ Optional InstrRefBasedLDV::resolveDbgPHIs(MachineFunction &MF, if (LowerIt == UpperIt) return None; + // If any DBG_PHIs referred to a location we didn't understand, don't try to + // compute a value. There might be scenarios where we could recover a value + // for some range of DBG_INSTR_REFs, but at this point we can have high + // confidence that we've seen a bug. + auto DBGPHIRange = make_range(LowerIt, UpperIt); + for (const DebugPHIRecord &DBG_PHI : DBGPHIRange) + if (!DBG_PHI.ValueRead) + return None; + // If there's only one DBG_PHI, then that is our value number. if (std::distance(LowerIt, UpperIt) == 1) - return LowerIt->ValueRead; - - auto DBGPHIRange = make_range(LowerIt, UpperIt); + return *LowerIt->ValueRead; // Pick out the location (physreg, slot) where any PHIs must occur. It's // technically possible for us to merge values in different registers in each // block, but highly unlikely that LLVM will generate such code after register // allocation. - LocIdx Loc = LowerIt->ReadLoc; + LocIdx Loc = *LowerIt->ReadLoc; // We have several DBG_PHIs, and a use position (the Here inst). All each // DBG_PHI does is identify a value at a program position. We can treat each @@ -3397,7 +3638,7 @@ Optional InstrRefBasedLDV::resolveDbgPHIs(MachineFunction &MF, // for the SSAUpdater. for (const auto &DBG_PHI : DBGPHIRange) { LDVSSABlock *Block = Updater.getSSALDVBlock(DBG_PHI.MBB); - const ValueIDNum &Num = DBG_PHI.ValueRead; + const ValueIDNum &Num = *DBG_PHI.ValueRead; AvailableValues.insert(std::make_pair(Block, Num.asU64())); } @@ -3431,7 +3672,7 @@ Optional InstrRefBasedLDV::resolveDbgPHIs(MachineFunction &MF, // Define all the input DBG_PHI values in ValidatedValues. 
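// The memoization idiom above, in isolation: cache Optional results keyed by
// instruction, so both successes and failures (None) are computed only once.
// A minimal sketch with standard types; expensiveResolve and the int key are
// stand-ins, not InstrRefBasedLDV's real types.
#include <optional>
#include <unordered_map>

inline std::optional<int> expensiveResolve(int Key) {
  // Placeholder for the real SSA computation.
  return (Key % 2) ? std::optional<int>(Key * Key) : std::nullopt;
}

class CachingResolver {
  std::unordered_map<int, std::optional<int>> Seen;

public:
  std::optional<int> resolve(int Key) {
    auto It = Seen.find(Key);
    if (It != Seen.end())
      return It->second; // Cache hit, even when the cached answer is nullopt.
    std::optional<int> Result = expensiveResolve(Key);
    Seen.insert({Key, Result});
    return Result;
  }
};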
for (const auto &DBG_PHI : DBGPHIRange) { LDVSSABlock *Block = Updater.getSSALDVBlock(DBG_PHI.MBB); - const ValueIDNum &Num = DBG_PHI.ValueRead; + const ValueIDNum &Num = *DBG_PHI.ValueRead; ValidatedValues.insert(std::make_pair(Block, Num)); } @@ -3456,7 +3697,7 @@ Optional InstrRefBasedLDV::resolveDbgPHIs(MachineFunction &MF, return None; ValueIDNum ValueToCheck; - ValueIDNum *BlockLiveOuts = MLiveOuts[PHIIt.first->BB.getNumber()]; + const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB.getNumber()]; auto VVal = ValidatedValues.find(PHIIt.first); if (VVal == ValidatedValues.end()) { diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index e7383209c027..70aae47c8bdc 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -10,17 +10,14 @@ #define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "LiveDebugValues.h" @@ -171,6 +168,13 @@ public: static ValueIDNum TombstoneValue; }; +/// Type for a table of values in a block. +using ValueTable = std::unique_ptr; + +/// Type for a table-of-table-of-values, i.e., the collection of either +/// live-in or live-out values for each block in the function. +using FuncValueTable = std::unique_ptr; + /// Thin wrapper around an integer -- designed to give more type safety to /// spill location numbers. class SpillLocationNo { @@ -192,7 +196,7 @@ public: }; /// Meta qualifiers for a value. Pair of whatever expression is used to qualify -/// the the value, and Boolean of whether or not it's indirect. +/// the value, and Boolean of whether or not it's indirect. class DbgValueProperties { public: DbgValueProperties(const DIExpression *DIExpr, bool Indirect) @@ -507,7 +511,7 @@ public: /// Load values for each location from array of ValueIDNums. Take current /// bbnum just in case we read a value from a hitherto untouched register. - void loadFromArray(ValueIDNum *Locs, unsigned NewCurBB) { + void loadFromArray(ValueTable &Locs, unsigned NewCurBB) { CurBB = NewCurBB; // Iterate over all tracked locations, and load each locations live-in // value into our local index. @@ -616,7 +620,9 @@ public: void writeRegMask(const MachineOperand *MO, unsigned CurBB, unsigned InstID); /// Find LocIdx for SpillLoc \p L, creating a new one if it's not tracked. - SpillLocationNo getOrTrackSpillLoc(SpillLoc L); + /// Returns None when in scenarios where a spill slot could be tracked, but + /// we would likely run into resource limitations. + Optional getOrTrackSpillLoc(SpillLoc L); // Get LocIdx of a spill ID. LocIdx getSpillMLoc(unsigned SpillID) { @@ -627,6 +633,19 @@ public: /// Return true if Idx is a spill machine location. bool isSpill(LocIdx Idx) const { return LocIdxToLocID[Idx] >= NumRegs; } + /// How large is this location (aka, how wide is a value defined there?). 
+ unsigned getLocSizeInBits(LocIdx L) const { + unsigned ID = LocIdxToLocID[L]; + if (!isSpill(L)) { + return TRI.getRegSizeInBits(Register(ID), MF.getRegInfo()); + } else { + // The slot location on the stack is uninteresting, we care about the + // position of the value within the slot (which comes with a size). + StackSlotPos Pos = locIDToSpillIdx(ID); + return Pos.first; + } + } + MLocIterator begin() { return MLocIterator(LocIdxToIDNum, 0); } MLocIterator end() { @@ -678,7 +697,7 @@ public: /// movement of values between locations inside of a block is handled at a /// much later stage, in the TransferTracker class. MapVector Vars; - DenseMap Scopes; + SmallDenseMap Scopes; MachineBasicBlock *MBB = nullptr; const OverlapMap &OverlappingFragments; DbgValueProperties EmptyProperties; @@ -747,6 +766,11 @@ public: Scopes[Overlapped] = Loc; } } + + void clear() { + Vars.clear(); + Scopes.clear(); + } }; // XXX XXX docs @@ -844,10 +868,16 @@ private: /// Record of where we observed a DBG_PHI instruction. class DebugPHIRecord { public: - uint64_t InstrNum; ///< Instruction number of this DBG_PHI. - MachineBasicBlock *MBB; ///< Block where DBG_PHI occurred. - ValueIDNum ValueRead; ///< The value number read by the DBG_PHI. - LocIdx ReadLoc; ///< Register/Stack location the DBG_PHI reads. + /// Instruction number of this DBG_PHI. + uint64_t InstrNum; + /// Block where DBG_PHI occurred. + MachineBasicBlock *MBB; + /// The value number read by the DBG_PHI -- or None if it didn't refer to + /// a value. + Optional ValueRead; + /// Register/Stack location the DBG_PHI reads -- or None if it referred to + /// something unexpected. + Optional ReadLoc; operator unsigned() const { return InstrNum; } }; @@ -862,6 +892,12 @@ private: OverlapMap OverlapFragments; VarToFragments SeenFragments; + /// Mapping of DBG_INSTR_REF instructions to their values, for those + /// DBG_INSTR_REFs that call resolveDbgPHIs. These variable references solve + /// a mini SSA problem caused by DBG_PHIs being cloned, this collection caches + /// the result. + DenseMap> SeenDbgPHIs; + /// True if we need to examine call instructions for stack clobbers. We /// normally assume that they don't clobber SP, but stack probes on Windows /// do. @@ -873,7 +909,8 @@ private: StringRef StackProbeSymbolName; /// Tests whether this instruction is a spill to a stack slot. - bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF); + Optional isSpillInstruction(const MachineInstr &MI, + MachineFunction *MF); /// Decide if @MI is a spill instruction and return true if it is. We use 2 /// criteria to make this decision: @@ -891,11 +928,12 @@ private: /// Given a spill instruction, extract the spill slot information, ensure it's /// tracked, and return the spill number. - SpillLocationNo extractSpillBaseRegAndOffset(const MachineInstr &MI); + Optional + extractSpillBaseRegAndOffset(const MachineInstr &MI); /// Observe a single instruction while stepping through a block. - void process(MachineInstr &MI, ValueIDNum **MLiveOuts = nullptr, - ValueIDNum **MLiveIns = nullptr); + void process(MachineInstr &MI, const ValueTable *MLiveOuts, + const ValueTable *MLiveIns); /// Examines whether \p MI is a DBG_VALUE and notifies trackers. /// \returns true if MI was recognized and processed. @@ -903,8 +941,8 @@ private: /// Examines whether \p MI is a DBG_INSTR_REF and notifies trackers. /// \returns true if MI was recognized and processed. 
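// On the ValueTable / FuncValueTable typedefs above: the patch swaps manually
// new[]'d and delete[]'d tables for unique_ptr<T[]> arrays-of-arrays, which
// are value-initialized on allocation and freed automatically. A sketch of
// the ownership pattern with a stand-in Value type:
#include <memory>

struct Value { unsigned V = 0; }; // stand-in for ValueIDNum

using Table = std::unique_ptr<Value[]>;
using FuncTable = std::unique_ptr<Table[]>;

inline FuncTable makeTables(unsigned NumBlocks, unsigned NumLocs) {
  FuncTable Tables = std::make_unique<Table[]>(NumBlocks);
  for (unsigned I = 0; I < NumBlocks; ++I)
    Tables[I] = std::make_unique<Value[]>(NumLocs); // default-initialized
  return Tables; // no matching delete[] loops needed at any call site
}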
- bool transferDebugInstrRef(MachineInstr &MI, ValueIDNum **MLiveOuts, - ValueIDNum **MLiveIns); + bool transferDebugInstrRef(MachineInstr &MI, const ValueTable *MLiveOuts, + const ValueTable *MLiveIns); /// Stores value-information about where this PHI occurred, and what /// instruction number is associated with it. @@ -936,9 +974,15 @@ private: /// \p InstrNum Debug instruction number defined by DBG_PHI instructions. /// \returns The machine value number at position Here, or None. Optional resolveDbgPHIs(MachineFunction &MF, - ValueIDNum **MLiveOuts, - ValueIDNum **MLiveIns, MachineInstr &Here, - uint64_t InstrNum); + const ValueTable *MLiveOuts, + const ValueTable *MLiveIns, + MachineInstr &Here, uint64_t InstrNum); + + Optional resolveDbgPHIsImpl(MachineFunction &MF, + const ValueTable *MLiveOuts, + const ValueTable *MLiveIns, + MachineInstr &Here, + uint64_t InstrNum); /// Step through the function, recording register definitions and movements /// in an MLocTracker. Convert the observations into a per-block transfer @@ -954,8 +998,8 @@ private: /// live-out arrays to the (initialized to zero) multidimensional arrays in /// \p MInLocs and \p MOutLocs. The outer dimension is indexed by block /// number, the inner by LocIdx. - void buildMLocValueMap(MachineFunction &MF, ValueIDNum **MInLocs, - ValueIDNum **MOutLocs, + void buildMLocValueMap(MachineFunction &MF, FuncValueTable &MInLocs, + FuncValueTable &MOutLocs, SmallVectorImpl &MLocTransfer); /// Examine the stack indexes (i.e. offsets within the stack) to find the @@ -966,7 +1010,7 @@ private: /// the IDF of each register. void placeMLocPHIs(MachineFunction &MF, SmallPtrSetImpl &AllBlocks, - ValueIDNum **MInLocs, + FuncValueTable &MInLocs, SmallVectorImpl &MLocTransfer); /// Propagate variable values to blocks in the common case where there's @@ -997,7 +1041,7 @@ private: /// is true, revisiting this block is necessary. bool mlocJoin(MachineBasicBlock &MBB, SmallPtrSet &Visited, - ValueIDNum **OutLocs, ValueIDNum *InLocs); + FuncValueTable &OutLocs, ValueTable &InLocs); /// Produce a set of blocks that are in the current lexical scope. This means /// those blocks that contain instructions "in" the scope, blocks where @@ -1025,11 +1069,11 @@ private: /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks /// locations through. void buildVLocValueMap(const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, - SmallPtrSetImpl &AssignBlocks, - LiveInsT &Output, ValueIDNum **MOutLocs, - ValueIDNum **MInLocs, - SmallVectorImpl &AllTheVLocs); + const SmallSet &VarsWeCareAbout, + SmallPtrSetImpl &AssignBlocks, + LiveInsT &Output, FuncValueTable &MOutLocs, + FuncValueTable &MInLocs, + SmallVectorImpl &AllTheVLocs); /// Attempt to eliminate un-necessary PHIs on entry to a block. Examines the /// live-in values coming from predecessors live-outs, and replaces any PHIs @@ -1047,21 +1091,9 @@ private: /// \returns Value ID of a machine PHI if an appropriate one is available. Optional pickVPHILoc(const MachineBasicBlock &MBB, const DebugVariable &Var, - const LiveIdxT &LiveOuts, ValueIDNum **MOutLocs, + const LiveIdxT &LiveOuts, FuncValueTable &MOutLocs, const SmallVectorImpl &BlockOrders); - /// Given the solutions to the two dataflow problems, machine value locations - /// in \p MInLocs and live-in variable values in \p SavedLiveIns, runs the - /// TransferTracker class over the function to produce live-in and transfer - /// DBG_VALUEs, then inserts them. 
Groups of DBG_VALUEs are inserted in the
-  /// order given by AllVarsNumbering -- this could be any stable order, but
-  /// right now "order of appearence in function, when explored in RPO", so
-  /// that we can compare explictly against VarLocBasedImpl.
-  void emitLocations(MachineFunction &MF, LiveInsT SavedLiveIns,
-                     ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
-                     DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
-                     const TargetPassConfig &TPC);
-
   /// Take collections of DBG_VALUE instructions stored in TTracker, and
   /// install them into their output blocks. Preserves a stable order of
   /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through
   /// RPOT block ordering.
   void initialSetup(MachineFunction &MF);
 
+  /// Produce a map of the last lexical scope that uses a block, using the
+  /// scope's DFSOut number. Mapping is block-number to DFSOut.
+  /// \p EjectionMap Pre-allocated vector in which to install the built map.
+  /// \p ScopeToDILocation Mapping of LexicalScopes to their DILocations.
+  /// \p AssignBlocks Map of blocks where assignments happen for a scope.
+  void makeDepthFirstEjectionMap(SmallVectorImpl<unsigned> &EjectionMap,
+                                 const ScopeToDILocT &ScopeToDILocation,
+                                 ScopeToAssignBlocksT &AssignBlocks);
+
+  /// When determining per-block variable values and emitting to DBG_VALUEs,
+  /// this function explores by lexical scope depth. Doing so means that
+  /// per-block information can be fully computed before exploration finishes,
+  /// allowing us to emit it and free data structures earlier than otherwise.
+  /// It's also good for locality.
+  bool depthFirstVLocAndEmit(
+      unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation,
+      const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks,
+      LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
+      SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF,
+      DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
+      const TargetPassConfig &TPC);
+
   bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
                     TargetPassConfig *TPC, unsigned InputBBLimit,
                     unsigned InputDbgValLimit) override;
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
index 8f697611a82c..141008ac2296 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -8,14 +8,16 @@
 
 #include "LiveDebugValues.h"
 
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
 
 /// \file LiveDebugValues.cpp
 ///
@@ -65,7 +67,7 @@ public:
   static char ID;
 
   LiveDebugValues();
-  ~LiveDebugValues() {}
+  ~LiveDebugValues() = default;
 
   /// Calculate the liveness information for the given machine function.
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -123,6 +125,11 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
 }
 
 bool llvm::debuginfoShouldUseDebugInstrRef(const Triple &T) {
+  // Enable by default on x86_64, disable if explicitly turned off on cmdline.
+ if (T.getArch() == llvm::Triple::x86_64 && + ValueTrackingVariableLocations != cl::boolOrDefault::BOU_FALSE) + return true; + // Enable if explicitly requested on command line. return ValueTrackingVariableLocations == cl::boolOrDefault::BOU_TRUE; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h index 8f0b2ec3e1fc..6cc1685c0022 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h @@ -9,12 +9,11 @@ #ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H #define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/ADT/Triple.h" - namespace llvm { +class MachineDominatorTree; +class MachineFunction; +class TargetPassConfig; +class Triple; // Inline namespace for types / symbols shared between different // LiveDebugValues implementations. @@ -28,7 +27,7 @@ public: virtual bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree, TargetPassConfig *TPC, unsigned InputBBLimit, unsigned InputDbgValLimit) = 0; - virtual ~LDVImpl() {} + virtual ~LDVImpl() = default; }; } // namespace SharedLiveDebugValues diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 42a0967bce3f..24c00b8a10ec 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -118,18 +118,15 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/UniqueVector.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -137,16 +134,11 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TypeSize.h" #include "llvm/Support/raw_ostream.h" @@ -922,14 +914,14 @@ private: std::unique_ptr &VLS = Locs[MBB]; if (!VLS) VLS = std::make_unique(Alloc); - return *VLS.get(); + return *VLS; } const VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, const VarLocInMBB &Locs) const { auto It = Locs.find(MBB); assert(It != Locs.end() && "MBB not in map"); - return *It->second.get(); + return *It->second; } /// Tests whether this instruction is a spill to a stack location. 
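// The tri-state decision in debuginfoShouldUseDebugInstrRef above, reduced to
// its logic: unset behaves as "on" for x86_64 and "off" elsewhere, while an
// explicit command-line value always wins. A plain enum stands in for
// cl::boolOrDefault here:
enum class Tristate { Unset, True, False };

inline bool shouldUseInstrRef(bool IsX86_64, Tristate Flag) {
  if (IsX86_64 && Flag != Tristate::False)
    return true;                 // default-on unless explicitly disabled
  return Flag == Tristate::True; // other targets remain opt-in
}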
@@ -1035,9 +1027,9 @@ public: // Implementation //===----------------------------------------------------------------------===// -VarLocBasedLDV::VarLocBasedLDV() { } +VarLocBasedLDV::VarLocBasedLDV() = default; -VarLocBasedLDV::~VarLocBasedLDV() { } +VarLocBasedLDV::~VarLocBasedLDV() = default; /// Erase a variable from the set of open ranges, and additionally erase any /// fragments that may overlap it. If the VarLoc is a backup location, erase @@ -1948,7 +1940,7 @@ bool VarLocBasedLDV::join( // Just copy over the Out locs to incoming locs for the first visited // predecessor, and for all other predecessors join the Out locs. - VarLocSet &OutLocVLS = *OL->second.get(); + VarLocSet &OutLocVLS = *OL->second; if (!NumVisited) InLocsT = OutLocVLS; else @@ -2007,7 +1999,7 @@ void VarLocBasedLDV::flushPendingLocs(VarLocInMBB &PendingInLocs, for (auto &Iter : PendingInLocs) { // Map is keyed on a constant pointer, unwrap it so we can insert insts. auto &MBB = const_cast(*Iter.first); - VarLocSet &Pending = *Iter.second.get(); + VarLocSet &Pending = *Iter.second; SmallVector VarLocs; collectAllVarLocs(VarLocs, Pending, VarLocIDs); diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 6d806135240e..35cf25330186 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -38,11 +39,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" @@ -50,15 +49,12 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include #include #include @@ -976,7 +972,7 @@ void UserValue::extendDef( if (Segment->end < Stop) { Stop = Segment->end; Kills = {Stop, {LII.first}}; - } else if (Segment->end == Stop && Kills.hasValue()) { + } else if (Segment->end == Stop && Kills) { // If multiple locations end at the same place, track all of them in // Kills. Kills->second.push_back(LII.first); @@ -1854,16 +1850,33 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { const TargetRegisterClass *TRC = MRI.getRegClass(Reg); unsigned SpillSize, SpillOffset; - // Test whether this location is legal with the given subreg. + unsigned regSizeInBits = TRI->getRegSizeInBits(*TRC); + if (SubReg) + regSizeInBits = TRI->getSubRegIdxSize(SubReg); + + // Test whether this location is legal with the given subreg. If the + // subregister has a nonzero offset, drop this location, it's too complex + // to describe. (TODO: future work). 
bool Success = TII->getStackSlotRange(TRC, SubReg, SpillSize, SpillOffset, *MF); - if (Success) { + if (Success && SpillOffset == 0) { auto Builder = BuildMI(*OrigMBB, OrigMBB->begin(), DebugLoc(), TII->get(TargetOpcode::DBG_PHI)); Builder.addFrameIndex(VRM->getStackSlot(Reg)); Builder.addImm(InstNum); + // Record how large the original value is. The stack slot might be + // merged and altered during optimisation, but we will want to know how + // large the value is, at this DBG_PHI. + Builder.addImm(regSizeInBits); + } + + LLVM_DEBUG( + if (SpillOffset != 0) { + dbgs() << "DBG_PHI for Vreg " << Reg << " subreg " << SubReg << + " has nonzero offset\n"; } + ); } // If there was no mapping for a value ID, it's optimized out. Create no // DBG_PHI, and any variables using this value will become optimized out. diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp index 9ded0fb6ae0a..9378aaeb181c 100644 --- a/llvm/lib/CodeGen/LiveInterval.cpp +++ b/llvm/lib/CodeGen/LiveInterval.cpp @@ -348,23 +348,8 @@ private: //===----------------------------------------------------------------------===// LiveRange::iterator LiveRange::find(SlotIndex Pos) { - // This algorithm is basically std::upper_bound. - // Unfortunately, std::upper_bound cannot be used with mixed types until we - // adopt C++0x. Many libraries can do it, but not all. - if (empty() || Pos >= endIndex()) - return end(); - iterator I = begin(); - size_t Len = size(); - do { - size_t Mid = Len >> 1; - if (Pos < I[Mid].end) { - Len = Mid; - } else { - I += Mid + 1; - Len -= Mid + 1; - } - } while (Len); - return I; + return llvm::partition_point(*this, + [&](const Segment &X) { return X.end <= Pos; }); } VNInfo *LiveRange::createDeadDef(SlotIndex Def, VNInfo::Allocator &VNIAlloc) { diff --git a/llvm/lib/CodeGen/LiveIntervalCalc.cpp b/llvm/lib/CodeGen/LiveIntervalCalc.cpp index 2756086cb8b1..3176d73b35f6 100644 --- a/llvm/lib/CodeGen/LiveIntervalCalc.cpp +++ b/llvm/lib/CodeGen/LiveIntervalCalc.cpp @@ -11,13 +11,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveIntervalCalc.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -25,12 +21,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include #include -#include -#include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp index 50b31e1eb247..11a4ecf0bef9 100644 --- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp +++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp @@ -26,7 +26,8 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" // Merge a LiveInterval's segments. Guarantee no overlaps. -void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { +void LiveIntervalUnion::unify(const LiveInterval &VirtReg, + const LiveRange &Range) { if (Range.empty()) return; ++Tag; @@ -53,7 +54,8 @@ void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { } // Remove a live virtual register's segments from this union. 
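// LiveRange::find above collapses a hand-rolled binary search into
// llvm::partition_point. The same refactor with standard facilities, on a
// stand-in Segment type: find the first segment whose end lies after Pos,
// i.e. the partition point of the predicate "ends at or before Pos".
#include <algorithm>
#include <vector>

struct Segment { unsigned Start, End; }; // sorted, non-overlapping ranges

inline std::vector<Segment>::iterator findSegment(std::vector<Segment> &Segs,
                                                  unsigned Pos) {
  return std::partition_point(Segs.begin(), Segs.end(),
                              [&](const Segment &S) { return S.End <= Pos; });
}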
-void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) { +void LiveIntervalUnion::extract(const LiveInterval &VirtReg, + const LiveRange &Range) { if (Range.empty()) return; ++Tag; @@ -99,7 +101,7 @@ void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) { } #endif //!NDEBUG -LiveInterval *LiveIntervalUnion::getOneVReg() const { +const LiveInterval *LiveIntervalUnion::getOneVReg() const { if (empty()) return nullptr; for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) { @@ -111,7 +113,8 @@ LiveInterval *LiveIntervalUnion::getOneVReg() const { // Scan the vector of interfering virtual registers in this union. Assume it's // quite small. -bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const { +bool LiveIntervalUnion::Query::isSeenInterference( + const LiveInterval *VirtReg) const { return is_contained(InterferingVRegs, VirtReg); } @@ -147,14 +150,14 @@ LiveIntervalUnion::Query::collectInterferingVRegs(unsigned MaxInterferingRegs) { } LiveRange::const_iterator LREnd = LR->end(); - LiveInterval *RecentReg = nullptr; + const LiveInterval *RecentReg = nullptr; while (LiveUnionI.valid()) { assert(LRI != LREnd && "Reached end of LR"); // Check for overlapping interference. while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) { // This is an overlap, record the interfering register. - LiveInterval *VReg = LiveUnionI.value(); + const LiveInterval *VReg = LiveUnionI.value(); if (VReg != RecentReg && !isSeenInterference(VReg)) { RecentReg = VReg; InterferingVRegs.push_back(VReg); diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 9571afa434c1..7d825a8bf853 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -33,22 +33,20 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Config/llvm-config.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Statepoint.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" -#include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/StackMaps.h" #include #include #include @@ -149,7 +147,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { getRegUnit(i); } LLVM_DEBUG(dump()); - return true; + return false; } void LiveIntervals::print(raw_ostream &OS, const Module* ) const { @@ -500,7 +498,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, // Create new live ranges with only minimal live segments per def. LiveRange NewLR; - createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end())); + createSegmentsForValues(NewLR, li->vnis()); extendSegmentsToUses(NewLR, WorkList, Reg, LaneBitmask::getNone()); // Move the trimmed segments back. @@ -604,7 +602,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, Register Reg) { // Create a new live ranges with only minimal live segments per def. 
LiveRange NewLR; - createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end())); + createSegmentsForValues(NewLR, SR.vnis()); extendSegmentsToUses(NewLR, WorkList, Reg, SR.LaneMask); // Move the trimmed ranges back. @@ -913,11 +911,11 @@ static bool hasLiveThroughUse(const MachineInstr *MI, Register Reg) { return false; } -bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, +bool LiveIntervals::checkRegMaskInterference(const LiveInterval &LI, BitVector &UsableRegs) { if (LI.empty()) return false; - LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); + LiveInterval::const_iterator LiveI = LI.begin(), LiveE = LI.end(); // Use a smaller arrays for local live ranges. ArrayRef Slots; diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp index 3ef28042acb0..26f6e1ede1ad 100644 --- a/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -20,11 +20,9 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 05768140cbdf..58eb4110f153 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -371,7 +371,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, const MachineOperand &MO = MI->getOperand(i-1); if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) continue; - MI->RemoveOperand(i-1); + MI->removeOperand(i-1); } LLVM_DEBUG(dbgs() << "Converted physregs to:\t" << *MI); } else { diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp index 054f4370b609..8e56985246db 100644 --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index 4c0172a930b5..6ca7f00a7885 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -78,13 +78,13 @@ void LiveRegMatrix::releaseMemory() { template static bool foreachUnit(const TargetRegisterInfo *TRI, - LiveInterval &VRegInterval, MCRegister PhysReg, + const LiveInterval &VRegInterval, MCRegister PhysReg, Callable Func) { if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = (*Units).first; LaneBitmask Mask = (*Units).second; - for (LiveInterval::SubRange &S : VRegInterval.subranges()) { + for (const LiveInterval::SubRange &S : VRegInterval.subranges()) { if ((S.LaneMask & Mask).any()) { if (Func(Unit, S)) return true; @@ -101,7 +101,7 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, return false; } -void LiveRegMatrix::assign(LiveInterval &VirtReg, MCRegister PhysReg) { +void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) { LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg(), TRI) << " to " << 
printReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg()) && "Duplicate VirtReg assignment"); @@ -118,7 +118,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, MCRegister PhysReg) { LLVM_DEBUG(dbgs() << '\n'); } -void LiveRegMatrix::unassign(LiveInterval &VirtReg) { +void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { Register PhysReg = VRM->getPhys(VirtReg.reg()); LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg(), TRI) << " from " << printReg(PhysReg, TRI) << ':'); @@ -143,7 +143,7 @@ bool LiveRegMatrix::isPhysRegUsed(MCRegister PhysReg) const { return false; } -bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg, +bool LiveRegMatrix::checkRegMaskInterference(const LiveInterval &VirtReg, MCRegister PhysReg) { // Check if the cached information is valid. // The same BitVector can be reused for all PhysRegs. @@ -161,7 +161,7 @@ bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg, return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg)); } -bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, +bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, MCRegister PhysReg) { if (VirtReg.empty()) return false; @@ -183,7 +183,8 @@ LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR, } LiveRegMatrix::InterferenceKind -LiveRegMatrix::checkInterference(LiveInterval &VirtReg, MCRegister PhysReg) { +LiveRegMatrix::checkInterference(const LiveInterval &VirtReg, + MCRegister PhysReg) { if (VirtReg.empty()) return IK_Free; @@ -237,7 +238,7 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, } Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const { - LiveInterval *VRegInterval = nullptr; + const LiveInterval *VRegInterval = nullptr; for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { if ((VRegInterval = Matrix[*Unit].getOneVReg())) return VRegInterval->reg(); diff --git a/llvm/lib/CodeGen/LiveStacks.cpp b/llvm/lib/CodeGen/LiveStacks.cpp index 8df84ebf4f06..8fc5a929d77b 100644 --- a/llvm/lib/CodeGen/LiveStacks.cpp +++ b/llvm/lib/CodeGen/LiveStacks.cpp @@ -13,12 +13,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveStacks.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "livestacks" diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 37fd3e4853ac..5f54d7cc8472 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -118,7 +117,7 @@ bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { // If the target doesn't want/need this pass, or if there are no locals // to consider, early exit. if (LocalObjectCount == 0 || !TRI->requiresVirtualBaseRegisters(MF)) - return true; + return false; // Make sure we have enough space to store the local offsets. 
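// On the "unsigned BaseReg = 0" to "Register BaseReg" change above: LLVM's
// Register converts to bool, with zero meaning "no register", so validity
// checks read as assert(BaseReg && ...). A toy equivalent of that wrapper;
// the real llvm::Register carries much more:
#include <cassert>

class Reg {
  unsigned Id = 0; // zero is the conventional "no register" sentinel
public:
  Reg() = default;
  explicit Reg(unsigned Id) : Id(Id) {}
  explicit operator bool() const { return Id != 0; }
  unsigned id() const { return Id; }
};

inline unsigned useBase(Reg Base) {
  assert(Base && "Unable to allocate virtual base register!");
  return Base.id();
}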
LocalOffsets.resize(MFI.getObjectIndexEnd()); @@ -344,7 +343,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { MachineBasicBlock *Entry = &Fn.front(); - unsigned BaseReg = 0; + Register BaseReg; int64_t BaseOffset = 0; // Loop through the frame references and allocate for them as necessary. @@ -414,20 +413,14 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { continue; } - const MachineFunction *MF = MI.getMF(); - const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF); - BaseReg = Fn.getRegInfo().createVirtualRegister(RC); - - LLVM_DEBUG(dbgs() << " Materializing base register" - << " at frame local offset " - << LocalOffset + InstrOffset); - // Tell the target to insert the instruction to initialize // the base register. // MachineBasicBlock::iterator InsertionPt = Entry->begin(); BaseReg = TRI->materializeFrameBaseRegister(Entry, FrameIdx, InstrOffset); - LLVM_DEBUG(dbgs() << " into " << printReg(BaseReg, TRI) << '\n'); + LLVM_DEBUG(dbgs() << " Materialized base register at frame local offset " + << LocalOffset + InstrOffset + << " into " << printReg(BaseReg, TRI) << '\n'); // The base register already includes any offset specified // by the instruction, so account for that so it doesn't get @@ -437,7 +430,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { ++NumBaseRegisters; UsedBaseReg = true; } - assert(BaseReg != 0 && "Unable to allocate virtual base register!"); + assert(BaseReg && "Unable to allocate virtual base register!"); // Modify the instruction to use the new base register rather // than the frame index operand. diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp index dce64ab9f5ca..b47c96e50831 100644 --- a/llvm/lib/CodeGen/LowLevelType.cpp +++ b/llvm/lib/CodeGen/LowLevelType.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/APFloat.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { diff --git a/llvm/lib/CodeGen/LowerEmuTLS.cpp b/llvm/lib/CodeGen/LowerEmuTLS.cpp index a06d1d6255c7..984dc452fbfd 100644 --- a/llvm/lib/CodeGen/LowerEmuTLS.cpp +++ b/llvm/lib/CodeGen/LowerEmuTLS.cpp @@ -17,7 +17,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp index 3ec8c627f131..eea24d8e9353 100644 --- a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -27,15 +27,12 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include - using namespace llvm; #define DEBUG_TYPE "mir-canonicalizer" @@ -106,10 +103,7 @@ rescheduleLexographically(std::vector instructions, StringInstrMap.push_back({(i == std::string::npos) ? 
S : S.substr(i), II}); } - llvm::sort(StringInstrMap, - [](const StringInstrPair &a, const StringInstrPair &b) -> bool { - return (a.first < b.first); - }); + llvm::sort(StringInstrMap, llvm::less_first()); for (auto &II : StringInstrMap) { diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp index bf78594e9b23..3152102410d7 100644 --- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp +++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp @@ -15,12 +15,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" -#include using namespace llvm; using namespace sampleprof; @@ -68,6 +70,8 @@ static uint64_t getCallStackHash(const MachineBasicBlock &BB, bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { if (!EnableFSDiscriminator) return false; + if (!MF.getFunction().isDebugInfoForProfiling()) + return false; bool Changed = false; using LocationDiscriminator = std::tuple; @@ -131,6 +135,7 @@ bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { if (Changed) { createFSDiscriminatorVariable(MF.getFunction().getParent()); LLVM_DEBUG(dbgs() << "Num of FS Discriminators: " << NumNewD << "\n"); + (void) NumNewD; } return Changed; diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp index 9f61dd9ef243..bc65700aba06 100644 --- a/llvm/lib/CodeGen/MIRNamerPass.cpp +++ b/llvm/lib/CodeGen/MIRNamerPass.cpp @@ -18,11 +18,7 @@ #include "MIRVRegNamerUtils.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 0ca820f160aa..b0daa20913f5 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" -#include #include #include #include @@ -250,7 +249,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("dereferenceable", MIToken::kw_dereferenceable) .Case("invariant", MIToken::kw_invariant) .Case("align", MIToken::kw_align) - .Case("basealign", MIToken::kw_align) + .Case("basealign", MIToken::kw_basealign) .Case("addrspace", MIToken::kw_addrspace) .Case("stack", MIToken::kw_stack) .Case("got", MIToken::kw_got) diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 6477965bdc21..40ae7053ea09 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -26,8 +26,6 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -38,6 +36,8 @@ #include 
"llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -60,7 +60,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" @@ -69,10 +68,8 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" -#include #include #include #include @@ -744,7 +741,7 @@ bool MIParser::parseBasicBlockDefinition( MBB->setIsEHPad(IsLandingPad); MBB->setIsInlineAsmBrIndirectTarget(IsInlineAsmBrIndirectTarget); MBB->setIsEHFuncletEntry(IsEHFuncletEntry); - if (SectionID.hasValue()) { + if (SectionID) { MBB->setSectionID(SectionID.getValue()); MF.setBBSectionsType(BasicBlockSection::List); } @@ -1094,11 +1091,23 @@ bool MIParser::parse(MachineInstr *&MI) { return true; } - // TODO: Check for extraneous machine operands. MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true); MI->setFlags(Flags); - for (const auto &Operand : Operands) + + unsigned NumExplicitOps = 0; + for (const auto &Operand : Operands) { + bool IsImplicitOp = Operand.Operand.isReg() && Operand.Operand.isImplicit(); + if (!IsImplicitOp) { + if (!MCID.isVariadic() && NumExplicitOps >= MCID.getNumOperands() && + !Operand.Operand.isValidExcessOperand()) + return error(Operand.Begin, "too many operands for instruction"); + + ++NumExplicitOps; + } + MI->addOperand(MF, Operand.Operand); + } + if (assignRegisterTies(*MI, Operands)) return true; if (PreInstrSymbol) @@ -1609,7 +1618,7 @@ bool MIParser::assignRegisterTies(MachineInstr &MI, continue; // The parser ensures that this operand is a register use, so we just have // to check the tied-def operand. 
- unsigned DefIdx = Operands[I].TiedDefIdx.getValue(); + unsigned DefIdx = *Operands[I].TiedDefIdx; if (DefIdx >= E) return error(Operands[I].Begin, Twine("use of invalid tied-def operand index '" + @@ -1714,6 +1723,15 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, RegInfo->Kind == VRegInfo::REGBANK) return error("generic virtual registers must have a type"); } + + if (Flags & RegState::Define) { + if (Flags & RegState::Kill) + return error("cannot have a killed def operand"); + } else { + if (Flags & RegState::Dead) + return error("cannot have a dead use operand"); + } + Dest = MachineOperand::CreateReg( Reg, Flags & RegState::Define, Flags & RegState::Implicit, Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef, @@ -2689,19 +2707,19 @@ bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { return true; uint32_t *Mask = MF.allocateRegMask(); - while (true) { - if (Token.isNot(MIToken::NamedRegister)) - return error("expected a named register"); - Register Reg; - if (parseNamedRegister(Reg)) - return true; - lex(); - Mask[Reg / 32] |= 1U << (Reg % 32); + do { + if (Token.isNot(MIToken::rparen)) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + Register Reg; + if (parseNamedRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + } + // TODO: Report an error if the same register is used more than once. - if (Token.isNot(MIToken::comma)) - break; - lex(); - } + } while (consumeIfPresent(MIToken::comma)); if (expectAndConsume(MIToken::rparen)) return true; @@ -3269,11 +3287,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { MDNode *Range = nullptr; while (consumeIfPresent(MIToken::comma)) { switch (Token.kind()) { - case MIToken::kw_align: + case MIToken::kw_align: { // align is printed if it is different than size. - if (parseAlignment(BaseAlignment)) + uint64_t Alignment; + if (parseAlignment(Alignment)) return true; + if (Ptr.Offset & (Alignment - 1)) { + // MachineMemOperand::getAlign never returns a value greater than the + // alignment of offset, so this just guards against hand-written MIR + // that specifies a large "align" value when it should probably use + // "basealign" instead. + return error("specified alignment is more aligned than offset"); + } + BaseAlignment = Alignment; break; + } case MIToken::kw_basealign: // basealign is printed if it is different than align. 
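// The new parser check above rejects an "align" value stronger than the
// memory operand's offset permits, via the usual power-of-two mask test.
// That test in isolation:
#include <cassert>
#include <cstdint>

inline bool offsetFitsAlignment(int64_t Offset, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (Offset & int64_t(Align - 1)) == 0;
}
// e.g. offsetFitsAlignment(24, 8) is true; offsetFitsAlignment(24, 16) is not.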
if (parseAlignment(BaseAlignment)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index f144639770bc..4944cb46c5b5 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -13,13 +13,10 @@ #include "llvm/CodeGen/MIRParser/MIRParser.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -29,7 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" @@ -46,6 +43,8 @@ using namespace llvm; namespace llvm { +class MDNode; +class RegisterBank; /// This class implements the parsing of LLVM IR that's embedded inside a MIR /// file. @@ -459,6 +458,12 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); MF.setHasWinCFI(YamlMF.HasWinCFI); + MF.setCallsEHReturn(YamlMF.CallsEHReturn); + MF.setCallsUnwindInit(YamlMF.CallsUnwindInit); + MF.setHasEHCatchret(YamlMF.HasEHCatchret); + MF.setHasEHScopes(YamlMF.HasEHScopes); + MF.setHasEHFunclets(YamlMF.HasEHFunclets); + if (YamlMF.Legalized) MF.getProperties().set(MachineFunctionProperties::Property::Legalized); if (YamlMF.RegBankSelected) @@ -638,7 +643,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, // be saved for the caller). 
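// The new CallsEHReturn / HasEHFunclets style flags round-trip through MIR's
// YAML layer. The general llvm::yaml pattern for such optional boolean keys,
// shown with a hypothetical struct and key names (MIRYamlMapping.h declares
// the real ones):
#include "llvm/Support/YAMLTraits.h"

struct EHFlags {
  bool CallsEHReturn = false;
  bool HasEHFunclets = false;
};

template <> struct llvm::yaml::MappingTraits<EHFlags> {
  static void mapping(llvm::yaml::IO &YamlIO, EHFlags &F) {
    // mapOptional omits a key on output when the value equals its default.
    YamlIO.mapOptional("callsEHReturn", F.CallsEHReturn, false);
    YamlIO.mapOptional("hasEHFunclets", F.HasEHFunclets, false);
  }
};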
if (YamlMF.CalleeSavedRegisters) { SmallVector CalleeSavedRegisters; - for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + for (const auto &RegSource : *YamlMF.CalleeSavedRegisters) { Register Reg; if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) return error(Error, RegSource.SourceRange); @@ -809,7 +814,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, Object.CalleeSavedRestored, ObjectIdx)) return true; if (Object.LocalOffset) - MFI.mapLocalFrameObject(ObjectIdx, Object.LocalOffset.getValue()); + MFI.mapLocalFrameObject(ObjectIdx, *Object.LocalOffset); if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx)) return true; } @@ -826,6 +831,15 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, return error(Error, YamlMFI.StackProtector.SourceRange); MFI.setStackProtectorIndex(FI); } + + if (!YamlMFI.FunctionContext.Value.empty()) { + SMDiagnostic Error; + int FI; + if (parseStackObjectReference(PFS, FI, YamlMFI.FunctionContext.Value, Error)) + return error(Error, YamlMFI.FunctionContext.SourceRange); + MFI.setFunctionContextIndex(FI); + } + return false; } @@ -909,7 +923,7 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS, return error(Error, YamlConstant.Value.SourceRange); const Align PrefTypeAlign = M.getDataLayout().getPrefTypeAlign(Value->getType()); - const Align Alignment = YamlConstant.Alignment.getValueOr(PrefTypeAlign); + const Align Alignment = YamlConstant.Alignment.value_or(PrefTypeAlign); unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment); if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index)) .second) @@ -1023,7 +1037,7 @@ SMDiagnostic MIRParserImpl::diagFromBlockStringDiag(const SMDiagnostic &Error, MIRParser::MIRParser(std::unique_ptr Impl) : Impl(std::move(Impl)) {} -MIRParser::~MIRParser() {} +MIRParser::~MIRParser() = default; std::unique_ptr MIRParser::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) { diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index dc72f83ad0e4..25823b1567f7 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -13,14 +13,11 @@ #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -32,29 +29,19 @@ #include "llvm/CodeGen/MachineModuleSlotTracker.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/IR/Value.h" 
#include "llvm/MC/LaneBitmask.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -63,7 +50,6 @@ #include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" #include #include @@ -209,6 +195,12 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); YamlMF.HasWinCFI = MF.hasWinCFI(); + YamlMF.CallsEHReturn = MF.callsEHReturn(); + YamlMF.CallsUnwindInit = MF.callsUnwindInit(); + YamlMF.HasEHCatchret = MF.hasEHCatchret(); + YamlMF.HasEHScopes = MF.hasEHScopes(); + YamlMF.HasEHFunclets = MF.hasEHFunclets(); + YamlMF.Legalized = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Legalized); YamlMF.RegBankSelected = MF.getProperties().hasProperty( @@ -489,6 +481,12 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, .printStackObjectReference(MFI.getStackProtectorIndex()); } + if (MFI.hasFunctionContextIndex()) { + raw_string_ostream StrOS(YMF.FrameInfo.FunctionContext.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printStackObjectReference(MFI.getFunctionContextIndex()); + } + // Print the debug variable information. for (const MachineFunction::VariableDbgInfo &DebugVar : MF.getVariableDbgInfo()) { @@ -693,11 +691,11 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { // Print the live in registers. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - if (MRI.tracksLiveness() && !MBB.livein_empty()) { + if (!MBB.livein_empty()) { const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); OS.indent(2) << "liveins: "; bool First = true; - for (const auto &LI : MBB.liveins()) { + for (const auto &LI : MBB.liveins_dbg()) { if (!First) OS << ", "; First = false; diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index b742ad9823c9..a8996a586909 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -15,7 +15,15 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp index 5862504109f0..a2abe71a6bd7 100644 --- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp +++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp @@ -10,7 +10,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineStableHash.h" #include "llvm/IR/Constants.h" -#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp index 33782c755eb0..7daf9025d303 100644 --- a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp +++ 
b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp @@ -10,17 +10,19 @@ // //===----------------------------------------------------------------------===// +#include "AllocationOrder.h" #include "RegAllocEvictionAdvisor.h" #include "RegAllocGreedy.h" -#include "RegAllocScore.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MLModelRunner.h" +#include "llvm/Analysis/TensorSpec.h" +#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TF_API) #include "llvm/Analysis/ModelUnderTrainingRunner.h" #include "llvm/Analysis/NoInferenceModelRunner.h" +#endif #include "llvm/Analysis/ReleaseModeModelRunner.h" -#include "llvm/Analysis/Utils/TFUtils.h" #include "llvm/CodeGen/CalcSpillWeights.h" -#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -28,13 +30,11 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/Config/config.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetMachine.h" #include #include @@ -46,10 +46,16 @@ using namespace llvm; // Generated header in release (AOT) mode #if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) #include "RegallocEvictModel.h" +using CompiledModelType = RegallocEvictModel; +#else +using CompiledModelType = NoopSavedModelImpl; #endif // Options that only make sense in development mode #ifdef LLVM_HAVE_TF_API +#include "RegAllocScore.h" +#include "llvm/Analysis/Utils/TFUtils.h" + static cl::opt TrainingLog( "regalloc-training-log", cl::Hidden, cl::desc("Training log for the register allocator eviction model")); @@ -60,6 +66,8 @@ static cl::opt ModelUnderTraining( #endif // #ifdef LLVM_HAVE_TF_API +extern cl::opt EvictInterferenceCutoff; + /// The score injection pass. /// This pass calculates the score for a function and inserts it in the log, but /// this happens only in development mode. It's a no-op otherwise. @@ -240,8 +248,8 @@ using FeaturesListNormalizer = std::array; /// The ML evictor (commonalities between release and development mode) class MLEvictAdvisor : public RegAllocEvictionAdvisor { public: - MLEvictAdvisor(MachineFunction &MF, const RAGreedy &RA, MLModelRunner *Runner, - const MachineBlockFrequencyInfo &MBFI, + MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI, const MachineLoopInfo &Loops); protected: @@ -257,14 +265,16 @@ protected: /// if we're just capturing the log of the default advisor, it needs to call /// the latter instead, so we need to pass all the necessary parameters for /// it. In the development case, it will also log. - virtual int64_t tryFindEvictionCandidatePosition( - LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit, - uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const; + virtual int64_t + tryFindEvictionCandidatePosition(const LiveInterval &VirtReg, + const AllocationOrder &Order, + unsigned OrderLimit, uint8_t CostPerUseLimit, + const SmallVirtRegSet &FixedRegisters) const; /// Load the features of the given VirtReg (allocated or not) at column Pos, /// but if that can't be evicted, return false instead. 
bool - loadInterferenceFeatures(LiveInterval &VirtReg, MCRegister PhysReg, + loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, const SmallVirtRegSet &FixedRegisters, std::array &Largest, size_t Pos) const; @@ -273,24 +283,24 @@ private: static float getInitialQueueSize(const MachineFunction &MF); MCRegister tryFindEvictionCandidate( - LiveInterval &VirtReg, const AllocationOrder &Order, + const LiveInterval &VirtReg, const AllocationOrder &Order, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const override; - void extractFeatures(const SmallVectorImpl &Intervals, + void extractFeatures(const SmallVectorImpl &Intervals, std::array &Largest, size_t Pos, int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const; // Point-in-time: we didn't learn this, so we always delegate to the default. bool canEvictHintInterference( - LiveInterval &VirtReg, MCRegister PhysReg, + const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const override { return getDefaultAdvisor().canEvictHintInterference(VirtReg, PhysReg, FixedRegisters); } - const LIFeatureComponents + const LIFeatureComponents & getLIFeatureComponents(const LiveInterval &LI) const; // Hold on to a default advisor for: @@ -306,17 +316,21 @@ private: // This could be static and shared, but its initialization is non-trivial. std::bitset DoNotNormalize; const float InitialQSize; + + using RegID = unsigned; + mutable DenseMap CachedFeatures; }; +#define _DECL_FEATURES(type, name, shape, _) \ + TensorSpec::createSpec(#name, shape), + +static const std::vector InputFeatures{ + {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}, +}; +#undef _DECL_FEATURES // =================================== // Release (AOT) - specifics // =================================== -#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) -const std::array FeatureNames{ -#define _GETNAME(_, NAME, __, ___) #NAME, - RA_EVICT_FEATURES_LIST(_GETNAME) -#undef _GETNAME -}; class ReleaseModeEvictionAdvisorAnalysis final : public RegAllocEvictionAdvisorAnalysis { public: @@ -335,17 +349,16 @@ private: } std::unique_ptr - getAdvisor(MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { if (!Runner) - Runner = std::make_unique>( - MF.getFunction().getContext(), FeatureNames, DecisionName); + Runner = std::make_unique>( + MF.getFunction().getContext(), InputFeatures, DecisionName); return std::make_unique( MF, RA, Runner.get(), getAnalysis(), getAnalysis()); } - std::unique_ptr> Runner; + std::unique_ptr> Runner; }; -#endif // =================================== // Development mode-specifics @@ -353,13 +366,6 @@ private: // // Features we log #ifdef LLVM_HAVE_TF_API -#define _DECL_FEATURES(type, name, shape, _) \ - TensorSpec::createSpec(#name, shape), - -static const std::vector InputFeatures{ - {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}, -}; -#undef _DECL_FEATURES static const TensorSpec Output = TensorSpec::createSpec(DecisionName, {1}); static const TensorSpec Reward = TensorSpec::createSpec("reward", {1}); @@ -380,7 +386,7 @@ static const std::vector TrainingInputFeatures{ class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { public: - DevelopmentModeEvictAdvisor(MachineFunction &MF, const RAGreedy &RA, + DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI, const MachineLoopInfo &Loops, Logger *Log) @@ -388,8 +394,8 @@ public: private: int64_t 
tryFindEvictionCandidatePosition( - LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit, - uint8_t CostPerUseLimit, + const LiveInterval &VirtReg, const AllocationOrder &Order, + unsigned OrderLimit, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const override; Logger *const Log; @@ -436,7 +442,7 @@ private: } std::unique_ptr - getAdvisor(MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { LLVMContext &Ctx = MF.getFunction().getContext(); if (ModelUnderTraining.empty() && TrainingLog.empty()) { Ctx.emitError("Regalloc development mode should be requested with at " @@ -496,7 +502,7 @@ float MLEvictAdvisor::getInitialQueueSize(const MachineFunction &MF) { return Ret; } -MLEvictAdvisor::MLEvictAdvisor(MachineFunction &MF, const RAGreedy &RA, +MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI, const MachineLoopInfo &Loops) @@ -514,7 +520,7 @@ MLEvictAdvisor::MLEvictAdvisor(MachineFunction &MF, const RAGreedy &RA, } int64_t MLEvictAdvisor::tryFindEvictionCandidatePosition( - LiveInterval &, const AllocationOrder &, unsigned, uint8_t, + const LiveInterval &, const AllocationOrder &, unsigned, uint8_t, const SmallVirtRegSet &) const { int64_t Ret = Runner->evaluate(); assert(Ret >= 0); @@ -523,7 +529,7 @@ int64_t MLEvictAdvisor::tryFindEvictionCandidatePosition( } bool MLEvictAdvisor::loadInterferenceFeatures( - LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, + const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, const SmallVirtRegSet &FixedRegisters, FeaturesListNormalizer &Largest, size_t Pos) const { // It is only possible to evict virtual register interference. @@ -539,16 +545,18 @@ bool MLEvictAdvisor::loadInterferenceFeatures( // The cascade tracking is the same as in the default advisor unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); - SmallVector InterferingIntervals; + SmallVector InterferingIntervals; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); // Different from the default heuristic, we don't make any assumptions about // what having more than 10 results in the query may mean. - const auto &IFIntervals = Q.interferingVRegs(); + const auto &IFIntervals = Q.interferingVRegs(EvictInterferenceCutoff); if (IFIntervals.empty() && InterferingIntervals.empty()) continue; + if (IFIntervals.size() >= EvictInterferenceCutoff) + return false; InterferingIntervals.append(IFIntervals.begin(), IFIntervals.end()); - for (LiveInterval *Intf : reverse(IFIntervals)) { + for (const LiveInterval *Intf : reverse(IFIntervals)) { assert(Register::isVirtualRegister(Intf->reg()) && "Only expecting virtual register interference from query"); // This is the same set of legality checks as in the default case: don't @@ -587,7 +595,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( } MCRegister MLEvictAdvisor::tryFindEvictionCandidate( - LiveInterval &VirtReg, const AllocationOrder &Order, + const LiveInterval &VirtReg, const AllocationOrder &Order, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit); if (!MaybeOrderLimit) @@ -652,7 +660,7 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( // decision making process. 
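// The signature changes above thread const through the ML eviction advisor
// (LiveInterval & becomes const LiveInterval &). Because these are virtual
// methods, base and overrides must change in lockstep: a stale non-const
// override would silently become an unrelated overload. The "override"
// keyword turns that mistake into a compile error, as this simplified
// sketch shows (LiveInterval reduced to a stub):
struct LiveIntervalStub {};

struct AdvisorBase {
  virtual ~AdvisorBase() = default;
  virtual long tryFindEvictionCandidatePosition(const LiveIntervalStub &) const {
    return -1;
  }
};

struct MLAdvisorSketch : AdvisorBase {
  // Dropping either const here would fail to compile thanks to "override",
  // instead of quietly hiding the base method.
  long tryFindEvictionCandidatePosition(const LiveIntervalStub &) const override {
    return 0;
  }
};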
Regs[CandidateVirtRegPos].second = !MustFindEviction; if (!MustFindEviction) - extractFeatures(SmallVector(1, &VirtReg), Largest, + extractFeatures(SmallVector(1, &VirtReg), Largest, CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0, /*NrUrgent*/ 0.0); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " @@ -686,9 +694,15 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( return Regs[CandidatePos].first; } -const LIFeatureComponents +const LIFeatureComponents & MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { - LIFeatureComponents Ret; + RegID ID = LI.reg().id(); + LIFeatureComponents Empty; + auto I = CachedFeatures.insert(std::make_pair(ID, Empty)); + LIFeatureComponents &Ret = I.first->getSecond(); + if (!I.second) + return Ret; + SmallPtrSet Visited; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -733,7 +747,7 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { // Overall, this currently mimics what we do for weight calculation, but instead // of accummulating the various features, we keep them separate. void MLEvictAdvisor::extractFeatures( - const SmallVectorImpl &Intervals, + const SmallVectorImpl &Intervals, std::array &Largest, size_t Pos, int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const { int64_t NrDefsAndUses = 0; @@ -769,7 +783,7 @@ void MLEvictAdvisor::extractFeatures( if (LI.endIndex() > EndSI) EndSI = LI.endIndex(); - const LIFeatureComponents LIFC = getLIFeatureComponents(LI); + const LIFeatureComponents &LIFC = getLIFeatureComponents(LI); NrBrokenHints += VRM->hasPreferredPhys(LI.reg()); NrDefsAndUses += LIFC.NrDefsAndUses; @@ -831,8 +845,9 @@ RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() { } int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( - LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit, - uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { + const LiveInterval &VirtReg, const AllocationOrder &Order, + unsigned OrderLimit, uint8_t CostPerUseLimit, + const SmallVirtRegSet &FixedRegisters) const { int64_t Ret = 0; if (isa(getRunner())) { Ret = MLEvictAdvisor::tryFindEvictionCandidatePosition( @@ -885,11 +900,9 @@ bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) { } #endif // #ifdef LLVM_HAVE_TF_API -#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() { return new ReleaseModeEvictionAdvisorAnalysis(); } -#endif // In all cases except development mode, we don't need scoring. 
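// getLIFeatureComponents() above now returns a reference into a per-function
// cache keyed by virtual register: DenseMap::insert() hands back
// {iterator, inserted}, so a hit returns the previously computed entry and a
// miss fills the freshly inserted slot in place. The same pattern, with
// std::unordered_map standing in for llvm::DenseMap:
#include <unordered_map>

struct LIFeatureComponentsSketch {
  double ReadsWrites = 0.0;
  double Weight = 0.0;
};

class FeatureCache {
  using RegID = unsigned;
  mutable std::unordered_map<RegID, LIFeatureComponentsSketch> Cached;

public:
  const LIFeatureComponentsSketch &get(RegID ID) const {
    auto [It, Inserted] = Cached.insert({ID, LIFeatureComponentsSketch{}});
    LIFeatureComponentsSketch &Ret = It->second;
    if (!Inserted)
      return Ret; // cache hit: reuse the earlier computation
    // Cache miss: compute into the slot we just created (placeholder math).
    Ret.ReadsWrites = ID * 0.5;
    Ret.Weight = ID * 2.0;
    return Ret;
  }
};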
#if !defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 8c9d00d08c6a..c186d0ba9969 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -26,12 +26,10 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -53,8 +51,7 @@ MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight(); } -MachineBasicBlock::~MachineBasicBlock() { -} +MachineBasicBlock::~MachineBasicBlock() = default; /// Return the MCSymbol for this basic block. MCSymbol *MachineBasicBlock::getSymbol() const { @@ -135,7 +132,7 @@ void ilist_callback_traits::addNodeToList( // Make sure the instructions have their operands in the reginfo lists. MachineRegisterInfo &RegInfo = MF.getRegInfo(); for (MachineInstr &MI : N->instrs()) - MI.AddRegOperandsToUseLists(RegInfo); + MI.addRegOperandsToUseLists(RegInfo); } void ilist_callback_traits::removeNodeFromList( @@ -153,7 +150,7 @@ void ilist_traits::addNodeToList(MachineInstr *N) { // Add the instruction's register operands to their corresponding // use/def lists. MachineFunction *MF = Parent->getParent(); - N->AddRegOperandsToUseLists(MF->getRegInfo()); + N->addRegOperandsToUseLists(MF->getRegInfo()); MF->handleInsertion(*N); } @@ -165,7 +162,7 @@ void ilist_traits::removeNodeFromList(MachineInstr *N) { // Remove from the use/def lists. if (MachineFunction *MF = N->getMF()) { MF->handleRemoval(*N); - N->RemoveRegOperandsFromUseLists(MF->getRegInfo()); + N->removeRegOperandsFromUseLists(MF->getRegInfo()); } N->setParent(nullptr); @@ -918,6 +915,10 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { return std::next(I) == MachineFunction::const_iterator(MBB); } +const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const { + return Successors.size() == 1 ? 
Successors[0] : nullptr; +} + MachineBasicBlock *MachineBasicBlock::getFallThrough() { MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; @@ -1620,6 +1621,16 @@ MachineBasicBlock::liveout_iterator MachineBasicBlock::liveout_begin() const { return liveout_iterator(*this, ExceptionPointer, ExceptionSelector, false); } +bool MachineBasicBlock::sizeWithoutDebugLargerThan(unsigned Limit) const { + unsigned Cntr = 0; + auto R = instructionsWithoutDebug(begin(), end()); + for (auto I = R.begin(), E = R.end(); I != E; ++I) { + if (++Cntr > Limit) + return true; + } + return false; +} + const MBBSectionID MBBSectionID::ColdSectionID(MBBSectionID::SectionType::Cold); const MBBSectionID MBBSectionID::ExceptionSectionID(MBBSectionID::SectionType::Exception); diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index c93ffaabf74c..4cc84f22bdde 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -34,13 +34,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TailDuplicator.h" @@ -50,6 +50,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PrintPasses.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" @@ -200,10 +201,8 @@ static cl::opt TriangleChainCount( cl::init(2), cl::Hidden); -static cl::opt EnableExtTspBlockPlacement( - "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false), - cl::desc("Enable machine block placement based on the ext-tsp model, " - "optimizing I-cache utilization.")); +extern cl::opt EnableExtTspBlockPlacement; +extern cl::opt ApplyExtTspWithoutProfile; namespace llvm { extern cl::opt StaticLikelyProb; @@ -3422,7 +3421,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { } // Apply a post-processing optimizing block placement. - if (MF.size() >= 3 && EnableExtTspBlockPlacement) { + if (MF.size() >= 3 && EnableExtTspBlockPlacement && + (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData())) { // Find a new placement and modify the layout of the blocks in the function. 
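// The local cl::opt definition of EnableExtTspBlockPlacement is replaced by
// an extern declaration above, so this file and the ext-tsp implementation
// share one flag, and the guard now also requires real profile data unless
// ApplyExtTspWithoutProfile overrides that. The new gate, distilled into a
// standalone predicate (parameter names are illustrative):
bool shouldApplyExtTsp(unsigned NumBlocks, bool HasProfileData,
                       bool EnableExtTsp, bool ApplyWithoutProfile) {
  // At least three blocks, the feature enabled, and either a profile or an
  // explicit opt-in to run without one.
  return NumBlocks >= 3 && EnableExtTsp &&
         (ApplyWithoutProfile || HasProfileData);
}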
applyExtTsp(); @@ -3660,6 +3660,9 @@ bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) { if (std::next(F.begin()) == F.end()) return false; + if (!isFunctionInPrintList(F.getName())) + return false; + MBPI = &getAnalysis(); MBFI = &getAnalysis(); diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp index c9f762f9a6e7..a84377d70855 100644 --- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -12,10 +12,8 @@ #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 0fcb07252d0e..e60fd9f7883a 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -34,7 +34,6 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" @@ -91,6 +90,11 @@ namespace { AU.addPreserved(); } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA); + } + void releaseMemory() override { ScopeMap.clear(); PREMap.clear(); diff --git a/llvm/lib/CodeGen/MachineCheckDebugify.cpp b/llvm/lib/CodeGen/MachineCheckDebugify.cpp index bd7f0f862947..1e5b8dd0bbb0 100644 --- a/llvm/lib/CodeGen/MachineCheckDebugify.cpp +++ b/llvm/lib/CodeGen/MachineCheckDebugify.cpp @@ -11,13 +11,14 @@ /// DILocalVariable which mir-debugifiy generated before. //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Utils/Debugify.h" +#include "llvm/Pass.h" #define DEBUG_TYPE "mir-check-debugify" @@ -27,9 +28,6 @@ namespace { struct CheckDebugMachineModule : public ModulePass { bool runOnModule(Module &M) override { - MachineModuleInfo &MMI = - getAnalysis().getMMI(); - NamedMDNode *NMD = M.getNamedMetadata("llvm.mir.debugify"); if (!NMD) { errs() << "WARNING: Please run mir-debugify to generate " @@ -37,6 +35,9 @@ struct CheckDebugMachineModule : public ModulePass { return false; } + MachineModuleInfo &MMI = + getAnalysis().getMMI(); + auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { return mdconst::extract(NMD->getOperand(Idx)->getOperand(0)) ->getZExtValue(); @@ -106,8 +107,7 @@ struct CheckDebugMachineModule : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addPreserved(); - AU.setPreservesCFG(); + AU.setPreservesAll(); } static char ID; // Pass identification. 
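// Among the hunks above, MachineCSE now advertises through
// getRequiredProperties() that it must run on SSA-form machine IR; the pass
// manager checks a function's property bitset against each pass's
// requirements before running it. A generic sketch of that contract, with
// the property set reduced to a std::bitset and names invented for
// illustration:
#include <bitset>
#include <cassert>
#include <cstddef>

enum class MFProperty { IsSSA, NoVRegs, Count };

struct MFProperties {
  std::bitset<static_cast<std::size_t>(MFProperty::Count)> Bits;
  MFProperties &set(MFProperty P) {
    Bits.set(static_cast<std::size_t>(P));
    return *this;
  }
  bool covers(const MFProperties &Required) const {
    return (Required.Bits & ~Bits).none(); // every required bit is present
  }
};

struct MFPassSketch {
  virtual ~MFPassSketch() = default;
  virtual MFProperties requiredProperties() const { return {}; }
};

struct CSEPassSketch : MFPassSketch {
  MFProperties requiredProperties() const override {
    return MFProperties().set(MFProperty::IsSSA);
  }
};

void runOnFunction(const MFPassSketch &P, const MFProperties &FuncProps) {
  assert(FuncProps.covers(P.requiredProperties()) &&
         "function does not satisfy the pass's required properties");
  // ... run the pass ...
}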
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 72ab9ee4f388..722a709af240 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -278,6 +277,8 @@ static CombinerObjective getCombinerObjective(MachineCombinerPattern P) { case MachineCombinerPattern::REASSOC_XA_YB: case MachineCombinerPattern::REASSOC_XY_AMM_BMM: case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + case MachineCombinerPattern::SUBADD_OP1: + case MachineCombinerPattern::SUBADD_OP2: return CombinerObjective::MustReduceDepth; case MachineCombinerPattern::REASSOC_XY_BCA: case MachineCombinerPattern::REASSOC_XY_BAC: diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 57fbe4112e47..66f0eb83e57c 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -83,8 +83,24 @@ STATISTIC(NumCopyBackwardPropagated, "Number of copy defs backward propagated"); DEBUG_COUNTER(FwdCounter, "machine-cp-fwd", "Controls which register COPYs are forwarded"); +static cl::opt MCPUseCopyInstr("mcp-use-is-copy-instr", cl::init(false), + cl::Hidden); + namespace { +static Optional isCopyInstr(const MachineInstr &MI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { + if (UseCopyInstr) + return TII.isCopyInstr(MI); + + if (MI.isCopy()) + return Optional( + DestSourcePair{MI.getOperand(0), MI.getOperand(1)}); + + return None; +} + class CopyTracker { struct CopyInfo { MachineInstr *MI; @@ -110,7 +126,8 @@ public: } /// Remove register from copy maps. - void invalidateRegister(MCRegister Reg, const TargetRegisterInfo &TRI) { + void invalidateRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { // Since Reg might be a subreg of some registers, only invalidate Reg is not // enough. We have to find the COPY defines Reg or registers defined by Reg // and invalidate all of them. @@ -120,8 +137,13 @@ public: auto I = Copies.find(*RUI); if (I != Copies.end()) { if (MachineInstr *MI = I->second.MI) { - RegsToInvalidate.insert(MI->getOperand(0).getReg().asMCReg()); - RegsToInvalidate.insert(MI->getOperand(1).getReg().asMCReg()); + Optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + assert(CopyOperands && "Expect copy"); + + RegsToInvalidate.insert( + CopyOperands->Destination->getReg().asMCReg()); + RegsToInvalidate.insert(CopyOperands->Source->getReg().asMCReg()); } RegsToInvalidate.insert(I->second.DefRegs.begin(), I->second.DefRegs.end()); @@ -133,7 +155,8 @@ public: } /// Clobber a single register, removing it from the tracker's copy maps. - void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI) { + void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { auto I = Copies.find(*RUI); if (I != Copies.end()) { @@ -142,8 +165,12 @@ public: markRegsUnavailable(I->second.DefRegs, TRI); // When we clobber the destination of a copy, we need to clobber the // whole register it defined. 
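// The new file-local isCopyInstr() above is the pivot of this whole
// MachineCopyPropagation change: behind the mcp-use-is-copy-instr flag it
// consults the target's TII.isCopyInstr() hook, so target-specific
// copy-like instructions (for example an AArch64 "ORR Xd, XZR, Xs" move)
// are tracked just like plain COPYs, and callers receive an optional
// {Destination, Source} operand pair instead of poking at operand indices.
// A simplified model, with std::optional in place of llvm::Optional and
// stub operand/instruction types:
#include <optional>

struct OperandSketch { unsigned Reg = 0; };

struct InstrSketch {
  bool IsCopy = false; // a plain COPY: operand 0 = dest, operand 1 = source
  OperandSketch Ops[2];
};

struct DestSourcePairSketch {
  const OperandSketch *Destination;
  const OperandSketch *Source;
};

using TargetCopyHook =
    std::optional<DestSourcePairSketch> (*)(const InstrSketch &);

std::optional<DestSourcePairSketch>
isCopyLikeInstr(const InstrSketch &MI, bool UseCopyInstr,
                TargetCopyHook TargetHook) {
  if (UseCopyInstr)
    return TargetHook(MI); // the target decides what counts as a copy
  if (MI.IsCopy)
    return DestSourcePairSketch{&MI.Ops[0], &MI.Ops[1]};
  return std::nullopt;
}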
- if (MachineInstr *MI = I->second.MI) - markRegsUnavailable({MI->getOperand(0).getReg().asMCReg()}, TRI); + if (MachineInstr *MI = I->second.MI) { + Optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()}, + TRI); + } // Now we can erase the copy. Copies.erase(I); } @@ -151,11 +178,13 @@ public: } /// Add this copy's registers into the tracker's copy maps. - void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI) { - assert(MI->isCopy() && "Tracking non-copy?"); + void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + Optional CopyOperands = isCopyInstr(*MI, TII, UseCopyInstr); + assert(CopyOperands && "Tracking non-copy?"); - MCRegister Def = MI->getOperand(0).getReg().asMCReg(); - MCRegister Src = MI->getOperand(1).getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); // Remember Def is defined by the copy. for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI) @@ -198,15 +227,22 @@ public: } MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { MCRegUnitIterator RUI(Reg, &TRI); MachineInstr *AvailCopy = findCopyDefViaUnit(*RUI, TRI); - if (!AvailCopy || - !TRI.isSubRegisterEq(AvailCopy->getOperand(1).getReg(), Reg)) + + if (!AvailCopy) + return nullptr; + + Optional CopyOperands = + isCopyInstr(*AvailCopy, TII, UseCopyInstr); + Register AvailSrc = CopyOperands->Source->getReg(); + Register AvailDef = CopyOperands->Destination->getReg(); + if (!TRI.isSubRegisterEq(AvailSrc, Reg)) return nullptr; - Register AvailSrc = AvailCopy->getOperand(1).getReg(); - Register AvailDef = AvailCopy->getOperand(0).getReg(); for (const MachineInstr &MI : make_range(AvailCopy->getReverseIterator(), I.getReverseIterator())) for (const MachineOperand &MO : MI.operands()) @@ -219,20 +255,26 @@ public: } MachineInstr *findAvailCopy(MachineInstr &DestCopy, MCRegister Reg, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { // We check the first RegUnit here, since we'll only be interested in the // copy if it copies the entire register anyway. MCRegUnitIterator RUI(Reg, &TRI); MachineInstr *AvailCopy = findCopyForUnit(*RUI, TRI, /*MustBeAvailable=*/true); - if (!AvailCopy || - !TRI.isSubRegisterEq(AvailCopy->getOperand(0).getReg(), Reg)) + + if (!AvailCopy) + return nullptr; + + Optional CopyOperands = + isCopyInstr(*AvailCopy, TII, UseCopyInstr); + Register AvailSrc = CopyOperands->Source->getReg(); + Register AvailDef = CopyOperands->Destination->getReg(); + if (!TRI.isSubRegisterEq(AvailDef, Reg)) return nullptr; // Check that the available copy isn't clobbered by any regmasks between // itself and the destination. - Register AvailSrc = AvailCopy->getOperand(1).getReg(); - Register AvailDef = AvailCopy->getOperand(0).getReg(); for (const MachineInstr &MI : make_range(AvailCopy->getIterator(), DestCopy.getIterator())) for (const MachineOperand &MO : MI.operands()) @@ -253,10 +295,14 @@ class MachineCopyPropagation : public MachineFunctionPass { const TargetInstrInfo *TII; const MachineRegisterInfo *MRI; + // Return true if this is a copy instruction and false otherwise. 
+ bool UseCopyInstr; + public: static char ID; // Pass identification, replacement for typeid - MachineCopyPropagation() : MachineFunctionPass(ID) { + MachineCopyPropagation(bool CopyInstr = false) + : MachineFunctionPass(ID), UseCopyInstr(CopyInstr || MCPUseCopyInstr) { initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry()); } @@ -334,9 +380,13 @@ void MachineCopyPropagation::ReadRegister(MCRegister Reg, MachineInstr &Reader, /// isNopCopy("ecx = COPY eax", AX, CX) == true /// isNopCopy("ecx = COPY eax", AH, CL) == false static bool isNopCopy(const MachineInstr &PreviousCopy, MCRegister Src, - MCRegister Def, const TargetRegisterInfo *TRI) { - MCRegister PreviousSrc = PreviousCopy.getOperand(1).getReg().asMCReg(); - MCRegister PreviousDef = PreviousCopy.getOperand(0).getReg().asMCReg(); + MCRegister Def, const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, bool UseCopyInstr) { + + Optional CopyOperands = + isCopyInstr(PreviousCopy, *TII, UseCopyInstr); + MCRegister PreviousSrc = CopyOperands->Source->getReg().asMCReg(); + MCRegister PreviousDef = CopyOperands->Destination->getReg().asMCReg(); if (Src == PreviousSrc && Def == PreviousDef) return true; if (!TRI->isSubRegister(PreviousSrc, Src)) @@ -356,22 +406,26 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, return false; // Search for an existing copy. - MachineInstr *PrevCopy = Tracker.findAvailCopy(Copy, Def, *TRI); + MachineInstr *PrevCopy = + Tracker.findAvailCopy(Copy, Def, *TRI, *TII, UseCopyInstr); if (!PrevCopy) return false; + auto PrevCopyOperands = isCopyInstr(*PrevCopy, *TII, UseCopyInstr); // Check that the existing copy uses the correct sub registers. - if (PrevCopy->getOperand(0).isDead()) + if (PrevCopyOperands->Destination->isDead()) return false; - if (!isNopCopy(*PrevCopy, Src, Def, TRI)) + if (!isNopCopy(*PrevCopy, Src, Def, TRI, TII, UseCopyInstr)) return false; LLVM_DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump()); // Copy was redundantly redefining either Src or Def. Remove earlier kill // flags between Copy and PrevCopy because the value will be reused now. - assert(Copy.isCopy()); - Register CopyDef = Copy.getOperand(0).getReg(); + Optional CopyOperands = isCopyInstr(Copy, *TII, UseCopyInstr); + assert(CopyOperands); + + Register CopyDef = CopyOperands->Destination->getReg(); assert(CopyDef == Src || CopyDef == Def); for (MachineInstr &MI : make_range(PrevCopy->getIterator(), Copy.getIterator())) @@ -385,7 +439,9 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, bool MachineCopyPropagation::isBackwardPropagatableRegClassCopy( const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx) { - Register Def = Copy.getOperand(0).getReg(); + + Optional CopyOperands = isCopyInstr(Copy, *TII, UseCopyInstr); + Register Def = CopyOperands->Destination->getReg(); if (const TargetRegisterClass *URC = UseI.getRegClassConstraint(UseIdx, TII, TRI)) @@ -403,7 +459,8 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx) { - Register CopySrcReg = Copy.getOperand(1).getReg(); + Optional CopyOperands = isCopyInstr(Copy, *TII, UseCopyInstr); + Register CopySrcReg = CopyOperands->Source->getReg(); // If the new register meets the opcode register constraints, then allow // forwarding. 
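// forwardUses(), updated further below, substitutes the source of a still
// "available" copy for uses of its destination:
//   %ecx = COPY %eax
//   ...            ; nothing clobbers eax or ecx in between
//   use %ecx  ==>  use %eax
// A toy model of that availability tracking over a linear block, with one
// tracked copy and registers as plain unsigneds (the real pass tracks every
// copy per register unit):
#include <optional>
#include <utility>
#include <vector>

struct ToyInstr {
  std::optional<std::pair<unsigned, unsigned>> Copy; // {dest, src} if a copy
  std::vector<unsigned> Defs;
  std::vector<unsigned> Uses;
};

void forwardUsesToy(std::vector<ToyInstr> &Block) {
  std::optional<std::pair<unsigned, unsigned>> Avail; // last live copy
  for (ToyInstr &MI : Block) {
    // Uses read pre-def values, so forward before handling defs.
    if (Avail)
      for (unsigned &U : MI.Uses)
        if (U == Avail->first)
          U = Avail->second;
    // Any def of either register invalidates the tracked copy.
    for (unsigned D : MI.Defs)
      if (Avail && (D == Avail->first || D == Avail->second))
        Avail.reset();
    if (MI.Copy)
      Avail = MI.Copy; // start tracking the new copy
  }
}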
@@ -411,34 +468,10 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, UseI.getRegClassConstraint(UseIdx, TII, TRI)) return URC->contains(CopySrcReg); - if (!UseI.isCopy()) + auto UseICopyOperands = isCopyInstr(UseI, *TII, UseCopyInstr); + if (!UseICopyOperands) return false; - const TargetRegisterClass *CopySrcRC = - TRI->getMinimalPhysRegClass(CopySrcReg); - const TargetRegisterClass *UseDstRC = - TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg()); - const TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(CopySrcRC); - - // If cross copy register class is not the same as copy source register class - // then it is not possible to copy the register directly and requires a cross - // register class copy. Fowarding this copy without checking register class of - // UseDst may create additional cross register copies when expanding the copy - // instruction in later passes. - if (CopySrcRC != CrossCopyRC) { - const TargetRegisterClass *CopyDstRC = - TRI->getMinimalPhysRegClass(Copy.getOperand(0).getReg()); - - // Check if UseDstRC matches the necessary register class to copy from - // CopySrc's register class. If so then forwarding the copy will not - // introduce any cross-class copys. Else if CopyDstRC matches then keep the - // copy and do not forward. If neither UseDstRC or CopyDstRC matches then - // we may need a cross register copy later but we do not worry about it - // here. - if (UseDstRC != CrossCopyRC && CopyDstRC == CrossCopyRC) - return false; - } - /// COPYs don't have register class constraints, so if the user instruction /// is a COPY, we just try to avoid introducing additional cross-class /// COPYs. For example: @@ -455,12 +488,34 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, /// /// so we have reduced the number of cross-class COPYs and potentially /// introduced a nop COPY that can be removed. - const TargetRegisterClass *SuperRC = UseDstRC; - for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses(); - SuperRC; SuperRC = *SuperRCI++) - if (SuperRC->contains(CopySrcReg)) - return true; + // Allow forwarding if src and dst belong to any common class, so long as they + // don't belong to any (possibly smaller) common class that requires copies to + // go via a different class. + Register UseDstReg = UseICopyOperands->Destination->getReg(); + bool Found = false; + bool IsCrossClass = false; + for (const TargetRegisterClass *RC : TRI->regclasses()) { + if (RC->contains(CopySrcReg) && RC->contains(UseDstReg)) { + Found = true; + if (TRI->getCrossCopyRegClass(RC) != RC) { + IsCrossClass = true; + break; + } + } + } + if (!Found) + return false; + if (!IsCrossClass) + return true; + // The forwarded copy would be cross-class. Only do this if the original copy + // was also cross-class. 
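// The rewritten register-class check above (continued below) replaces the
// old minimal-class/super-class walk: scan every register class, and allow
// forwarding CopySrc into the user COPY's destination if they share a class,
// unless some shared class is "cross-copy restricted"
// (TRI->getCrossCopyRegClass(RC) != RC); a cross-class forward is then
// permitted only when the original copy was itself cross-class. A set-based
// model of that decision, with classes reduced to register lists plus a
// flag:
#include <algorithm>
#include <vector>

struct RegClassSketch {
  std::vector<unsigned> Regs;
  bool CrossRestricted = false; // models getCrossCopyRegClass(RC) != RC
  bool contains(unsigned R) const {
    return std::find(Regs.begin(), Regs.end(), R) != Regs.end();
  }
};

bool isForwardableSketch(const std::vector<RegClassSketch> &RCs,
                         unsigned CopySrc, unsigned CopyDst, unsigned UseDst) {
  bool Found = false, IsCrossClass = false;
  for (const RegClassSketch &RC : RCs) {
    if (RC.contains(CopySrc) && RC.contains(UseDst)) {
      Found = true;
      if (RC.CrossRestricted) {
        IsCrossClass = true;
        break;
      }
    }
  }
  if (!Found)
    return false; // no common class at all: cannot forward
  if (!IsCrossClass)
    return true; // a plain same-class forward is always fine
  // The forwarded copy would be cross-class; only allow it if the original
  // CopySrc -> CopyDst copy already was.
  for (const RegClassSketch &RC : RCs)
    if (RC.contains(CopySrc) && RC.contains(CopyDst) && RC.CrossRestricted)
      return true;
  return false;
}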
+ Register CopyDstReg = CopyOperands->Destination->getReg(); + for (const TargetRegisterClass *RC : TRI->regclasses()) { + if (RC->contains(CopySrcReg) && RC->contains(CopyDstReg) && + TRI->getCrossCopyRegClass(RC) != RC) + return true; + } return false; } @@ -527,13 +582,15 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { if (!MOUse.isRenamable()) continue; - MachineInstr *Copy = - Tracker.findAvailCopy(MI, MOUse.getReg().asMCReg(), *TRI); + MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg().asMCReg(), + *TRI, *TII, UseCopyInstr); if (!Copy) continue; - Register CopyDstReg = Copy->getOperand(0).getReg(); - const MachineOperand &CopySrc = Copy->getOperand(1); + Optional CopyOperands = + isCopyInstr(*Copy, *TII, UseCopyInstr); + Register CopyDstReg = CopyOperands->Destination->getReg(); + const MachineOperand &CopySrc = *CopyOperands->Source; Register CopySrcReg = CopySrc.getReg(); // FIXME: Don't handle partial uses of wider COPYs yet. @@ -557,7 +614,8 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { // Check that the instruction is not a copy that partially overwrites the // original copy source that we are about to use. The tracker mechanism // cannot cope with that. - if (MI.isCopy() && MI.modifiesRegister(CopySrcReg, TRI) && + if (isCopyInstr(MI, *TII, UseCopyInstr) && + MI.modifiesRegister(CopySrcReg, TRI) && !MI.definesRegister(CopySrcReg)) { LLVM_DEBUG(dbgs() << "MCP: Copy source overlap with dest in " << MI); continue; @@ -596,76 +654,82 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { // Analyze copies (which don't overlap themselves). - if (MI.isCopy() && !TRI->regsOverlap(MI.getOperand(0).getReg(), - MI.getOperand(1).getReg())) { - assert(MI.getOperand(0).getReg().isPhysical() && - MI.getOperand(1).getReg().isPhysical() && - "MachineCopyPropagation should be run after register allocation!"); - - MCRegister Def = MI.getOperand(0).getReg().asMCReg(); - MCRegister Src = MI.getOperand(1).getReg().asMCReg(); - - // The two copies cancel out and the source of the first copy - // hasn't been overridden, eliminate the second one. e.g. - // %ecx = COPY %eax - // ... nothing clobbered eax. - // %eax = COPY %ecx - // => - // %ecx = COPY %eax - // - // or - // - // %ecx = COPY %eax - // ... nothing clobbered eax. - // %ecx = COPY %eax - // => - // %ecx = COPY %eax - if (eraseIfRedundant(MI, Def, Src) || eraseIfRedundant(MI, Src, Def)) - continue; + Optional CopyOperands = isCopyInstr(MI, *TII, UseCopyInstr); + if (CopyOperands) { + + Register RegSrc = CopyOperands->Source->getReg(); + Register RegDef = CopyOperands->Destination->getReg(); + + if (!TRI->regsOverlap(RegDef, RegSrc)) { + assert(RegDef.isPhysical() && RegSrc.isPhysical() && + "MachineCopyPropagation should be run after register allocation!"); + + MCRegister Def = RegDef.asMCReg(); + MCRegister Src = RegSrc.asMCReg(); + + // The two copies cancel out and the source of the first copy + // hasn't been overridden, eliminate the second one. e.g. + // %ecx = COPY %eax + // ... nothing clobbered eax. + // %eax = COPY %ecx + // => + // %ecx = COPY %eax + // + // or + // + // %ecx = COPY %eax + // ... nothing clobbered eax. 
+ // %ecx = COPY %eax + // => + // %ecx = COPY %eax + if (eraseIfRedundant(MI, Def, Src) || eraseIfRedundant(MI, Src, Def)) + continue; - forwardUses(MI); + forwardUses(MI); + + // Src may have been changed by forwardUses() + CopyOperands = isCopyInstr(MI, *TII, UseCopyInstr); + Src = CopyOperands->Source->getReg().asMCReg(); + + // If Src is defined by a previous copy, the previous copy cannot be + // eliminated. + ReadRegister(Src, MI, RegularUse); + for (const MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + MCRegister Reg = MO.getReg().asMCReg(); + if (!Reg) + continue; + ReadRegister(Reg, MI, RegularUse); + } - // Src may have been changed by forwardUses() - Src = MI.getOperand(1).getReg().asMCReg(); + LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI.dump()); + + // Copy is now a candidate for deletion. + if (!MRI->isReserved(Def)) + MaybeDeadCopies.insert(&MI); + + // If 'Def' is previously source of another copy, then this earlier copy's + // source is no longer available. e.g. + // %xmm9 = copy %xmm2 + // ... + // %xmm2 = copy %xmm0 + // ... + // %xmm2 = copy %xmm9 + Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr); + for (const MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + MCRegister Reg = MO.getReg().asMCReg(); + if (!Reg) + continue; + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); + } - // If Src is defined by a previous copy, the previous copy cannot be - // eliminated. - ReadRegister(Src, MI, RegularUse); - for (const MachineOperand &MO : MI.implicit_operands()) { - if (!MO.isReg() || !MO.readsReg()) - continue; - MCRegister Reg = MO.getReg().asMCReg(); - if (!Reg) - continue; - ReadRegister(Reg, MI, RegularUse); - } + Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr); - LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI.dump()); - - // Copy is now a candidate for deletion. - if (!MRI->isReserved(Def)) - MaybeDeadCopies.insert(&MI); - - // If 'Def' is previously source of another copy, then this earlier copy's - // source is no longer available. e.g. - // %xmm9 = copy %xmm2 - // ... - // %xmm2 = copy %xmm0 - // ... - // %xmm2 = copy %xmm9 - Tracker.clobberRegister(Def, *TRI); - for (const MachineOperand &MO : MI.implicit_operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - MCRegister Reg = MO.getReg().asMCReg(); - if (!Reg) - continue; - Tracker.clobberRegister(Reg, *TRI); + continue; } - - Tracker.trackCopy(&MI, *TRI); - - continue; } // Clobber any earlyclobber regs first. @@ -677,7 +741,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // later. if (MO.isTied()) ReadRegister(Reg, MI, RegularUse); - Tracker.clobberRegister(Reg, *TRI); + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); } forwardUses(MI); @@ -713,7 +777,9 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { MaybeDeadCopies.begin(); DI != MaybeDeadCopies.end();) { MachineInstr *MaybeDead = *DI; - MCRegister Reg = MaybeDead->getOperand(0).getReg().asMCReg(); + Optional CopyOperands = + isCopyInstr(*MaybeDead, *TII, UseCopyInstr); + MCRegister Reg = CopyOperands->Destination->getReg().asMCReg(); assert(!MRI->isReserved(Reg)); if (!RegMask->clobbersPhysReg(Reg)) { @@ -726,7 +792,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Make sure we invalidate any entries in the copy maps before erasing // the instruction. 
- Tracker.clobberRegister(Reg, *TRI); + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); // erase() will return the next valid iterator pointing to the next // element after the erased one. @@ -739,7 +805,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Any previous copy definition or reading the Defs is no longer available. for (MCRegister Reg : Defs) - Tracker.clobberRegister(Reg, *TRI); + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); } // If MBB doesn't have successors, delete the copies whose defs are not used. @@ -749,12 +815,16 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { for (MachineInstr *MaybeDead : MaybeDeadCopies) { LLVM_DEBUG(dbgs() << "MCP: Removing copy due to no live-out succ: "; MaybeDead->dump()); - assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg())); + + Optional CopyOperands = + isCopyInstr(*MaybeDead, *TII, UseCopyInstr); + assert(CopyOperands); + + Register SrcReg = CopyOperands->Source->getReg(); + Register DestReg = CopyOperands->Destination->getReg(); + assert(!MRI->isReserved(DestReg)); // Update matching debug values, if any. - assert(MaybeDead->isCopy()); - Register SrcReg = MaybeDead->getOperand(1).getReg(); - Register DestReg = MaybeDead->getOperand(0).getReg(); SmallVector MaybeDeadDbgUsers( CopyDbgUsers[MaybeDead].begin(), CopyDbgUsers[MaybeDead].end()); MRI->updateDbgUsersToReg(DestReg.asMCReg(), SrcReg.asMCReg(), @@ -772,10 +842,14 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { } static bool isBackwardPropagatableCopy(MachineInstr &MI, - const MachineRegisterInfo &MRI) { - assert(MI.isCopy() && "MI is expected to be a COPY"); - Register Def = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); + const MachineRegisterInfo &MRI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { + Optional CopyOperands = isCopyInstr(MI, TII, UseCopyInstr); + assert(CopyOperands && "MI is expected to be a COPY"); + + Register Def = CopyOperands->Destination->getReg(); + Register Src = CopyOperands->Source->getReg(); if (!Def || !Src) return false; @@ -783,7 +857,7 @@ static bool isBackwardPropagatableCopy(MachineInstr &MI, if (MRI.isReserved(Def) || MRI.isReserved(Src)) return false; - return MI.getOperand(1).isRenamable() && MI.getOperand(1).isKill(); + return CopyOperands->Source->isRenamable() && CopyOperands->Source->isKill(); } void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { @@ -808,13 +882,15 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { if (!MODef.isRenamable()) continue; - MachineInstr *Copy = - Tracker.findAvailBackwardCopy(MI, MODef.getReg().asMCReg(), *TRI); + MachineInstr *Copy = Tracker.findAvailBackwardCopy( + MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr); if (!Copy) continue; - Register Def = Copy->getOperand(0).getReg(); - Register Src = Copy->getOperand(1).getReg(); + Optional CopyOperands = + isCopyInstr(*Copy, *TII, UseCopyInstr); + Register Def = CopyOperands->Destination->getReg(); + Register Src = CopyOperands->Source->getReg(); if (MODef.getReg() != Src) continue; @@ -833,7 +909,7 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { << MI << " from " << *Copy); MODef.setReg(Def); - MODef.setIsRenamable(Copy->getOperand(0).isRenamable()); + MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); MaybeDeadCopies.insert(Copy); @@ -849,20 +925,23 @@ void 
MachineCopyPropagation::BackwardCopyPropagateBlock( for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) { // Ignore non-trivial COPYs. - if (MI.isCopy() && MI.getNumOperands() == 2 && - !TRI->regsOverlap(MI.getOperand(0).getReg(), - MI.getOperand(1).getReg())) { - - MCRegister Def = MI.getOperand(0).getReg().asMCReg(); - MCRegister Src = MI.getOperand(1).getReg().asMCReg(); - - // Unlike forward cp, we don't invoke propagateDefs here, - // just let forward cp do COPY-to-COPY propagation. - if (isBackwardPropagatableCopy(MI, *MRI)) { - Tracker.invalidateRegister(Src, *TRI); - Tracker.invalidateRegister(Def, *TRI); - Tracker.trackCopy(&MI, *TRI); - continue; + Optional CopyOperands = isCopyInstr(MI, *TII, UseCopyInstr); + if (CopyOperands && MI.getNumOperands() == 2) { + Register DefReg = CopyOperands->Destination->getReg(); + Register SrcReg = CopyOperands->Source->getReg(); + + if (!TRI->regsOverlap(DefReg, SrcReg)) { + MCRegister Def = DefReg.asMCReg(); + MCRegister Src = SrcReg.asMCReg(); + + // Unlike forward cp, we don't invoke propagateDefs here, + // just let forward cp do COPY-to-COPY propagation. + if (isBackwardPropagatableCopy(MI, *MRI, *TII, UseCopyInstr)) { + Tracker.invalidateRegister(Src, *TRI, *TII, UseCopyInstr); + Tracker.invalidateRegister(Def, *TRI, *TII, UseCopyInstr); + Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr); + continue; + } } } @@ -872,7 +951,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( MCRegister Reg = MO.getReg().asMCReg(); if (!Reg) continue; - Tracker.invalidateRegister(Reg, *TRI); + Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr); } propagateDefs(MI); @@ -884,7 +963,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( continue; if (MO.isDef()) - Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); + Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII, + UseCopyInstr); if (MO.readsReg()) { if (MO.isDebug()) { @@ -898,7 +978,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( } } } else { - Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); + Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII, + UseCopyInstr); } } } @@ -906,8 +987,10 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( for (auto *Copy : MaybeDeadCopies) { - Register Src = Copy->getOperand(1).getReg(); - Register Def = Copy->getOperand(0).getReg(); + Optional CopyOperands = + isCopyInstr(*Copy, *TII, UseCopyInstr); + Register Src = CopyOperands->Source->getReg(); + Register Def = CopyOperands->Destination->getReg(); SmallVector MaybeDeadDbgUsers(CopyDbgUsers[Copy].begin(), CopyDbgUsers[Copy].end()); @@ -938,3 +1021,8 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +MachineFunctionPass * +llvm::createMachineCopyPropagationPass(bool UseCopyInstr = false) { + return new MachineCopyPropagation(UseCopyInstr); +} diff --git a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp index 42a5e2b7af01..6871ac35b300 100644 --- a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp @@ -8,50 +8,15 @@ #include "llvm/CodeGen/MachineCycleAnalysis.h" #include "llvm/ADT/GenericCycleImpl.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineSSAContext.h" -#include "llvm/InitializePasses.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" using namespace llvm; template class 
llvm::GenericCycleInfo; template class llvm::GenericCycle; -namespace { - -/// Legacy analysis pass which computes a \ref MachineCycleInfo. -class MachineCycleInfoWrapperPass : public MachineFunctionPass { - MachineFunction *F = nullptr; - MachineCycleInfo CI; - -public: - static char ID; - - MachineCycleInfoWrapperPass(); - - MachineCycleInfo &getCycleInfo() { return CI; } - const MachineCycleInfo &getCycleInfo() const { return CI; } - - bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - void print(raw_ostream &OS, const Module *M = nullptr) const override; - - // TODO: verify analysis -}; - -class MachineCycleInfoPrinterPass : public MachineFunctionPass { -public: - static char ID; - - MachineCycleInfoPrinterPass(); - - bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -} // namespace - char MachineCycleInfoWrapperPass::ID = 0; MachineCycleInfoWrapperPass::MachineCycleInfoWrapperPass() @@ -87,6 +52,16 @@ void MachineCycleInfoWrapperPass::releaseMemory() { F = nullptr; } +class MachineCycleInfoPrinterPass : public MachineFunctionPass { +public: + static char ID; + + MachineCycleInfoPrinterPass(); + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + char MachineCycleInfoPrinterPass::ID = 0; MachineCycleInfoPrinterPass::MachineCycleInfoPrinterPass() @@ -111,3 +86,62 @@ bool MachineCycleInfoPrinterPass::runOnMachineFunction(MachineFunction &F) { CI.print(errs()); return false; } + +bool llvm::isCycleInvariant(const MachineCycle *Cycle, MachineInstr &I) { + MachineFunction *MF = I.getParent()->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetSubtargetInfo &ST = MF->getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + + // The instruction is cycle invariant if all of its operands are. + for (const MachineOperand &MO : I.operands()) { + if (!MO.isReg()) + continue; + + Register Reg = MO.getReg(); + if (Reg == 0) + continue; + + // An instruction that uses or defines a physical register can't e.g. be + // hoisted, so mark this as not invariant. + if (Register::isPhysicalRegister(Reg)) { + if (MO.isUse()) { + // If the physreg has no defs anywhere, it's just an ambient register + // and we can freely move its uses. Alternatively, if it's allocatable, + // it could get allocated to something with a def during allocation. + // However, if the physreg is known to always be caller saved/restored + // then this use is safe to hoist. + if (!MRI->isConstantPhysReg(Reg) && + !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) && + !TII->isIgnorableUse(MO)) + return false; + // Otherwise it's safe to move. + continue; + } else if (!MO.isDead()) { + // A def that isn't dead can't be moved. + return false; + } else if (any_of(Cycle->getEntries(), + [&](const MachineBasicBlock *Block) { + return Block->isLiveIn(Reg); + })) { + // If the reg is live into any header of the cycle we can't hoist an + // instruction which would clobber it. + return false; + } + } + + if (!MO.isUse()) + continue; + + assert(MRI->getVRegDef(Reg) && "Machine instr not mapped for this vreg?!"); + + // If the cycle contains the definition of an operand, then the instruction + // isn't cycle invariant. 
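// isCycleInvariant(), newly housed in MachineCycleAnalysis above, applies
// the usual LICM-style legality ladder to physical-register operands: a
// physreg use is movable only if the register is constant, caller-preserved
// here, or an "ignorable" use; a physreg def must be dead and must not be
// live into any cycle header block. Just that ladder, distilled with the
// underlying MRI/TRI/TII queries stubbed as booleans:
struct PhysRegOperandSketch {
  bool IsUse = false;               // use vs. def of the operand
  bool IsDeadDef = false;           // defs only
  bool IsConstantPhysReg = false;   // e.g. an always-constant reserved reg
  bool IsCallerPreserved = false;   // saved/restored around calls here
  bool IsIgnorableUse = false;      // target says this use doesn't pin us
  bool LiveIntoCycleHeader = false; // live into some entry of the cycle
};

bool physRegOperandIsCycleInvariant(const PhysRegOperandSketch &Op) {
  if (Op.IsUse)
    return Op.IsConstantPhysReg || Op.IsCallerPreserved || Op.IsIgnorableUse;
  if (!Op.IsDeadDef)
    return false; // a live physreg def can never be hoisted
  // A dead def is still illegal if it would clobber a cycle live-in.
  return !Op.LiveIntoCycleHeader;
}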
+ if (Cycle->contains(MRI->getVRegDef(Reg)->getParent())) + return false; + } + + // If we got this far, the instruction is cycle invariant! + return true; +} diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 599a81847592..b726a032ca18 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -16,14 +16,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/Debugify.h" diff --git a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp index a39dc79baaa8..346cfedde390 100644 --- a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp @@ -7,10 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineDominanceFrontier.h" -#include "llvm/Analysis/DominanceFrontierImpl.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp index 28cff2a4f3f3..0632cde9c6f4 100644 --- a/llvm/lib/CodeGen/MachineDominators.cpp +++ b/llvm/lib/CodeGen/MachineDominators.cpp @@ -15,6 +15,8 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index fd5ea5cad072..f58996ea90c6 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -44,7 +44,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -61,7 +60,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DOTGraphTraits.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" @@ -109,6 +107,27 @@ static const char *getPropertyName(MachineFunctionProperties::Property Prop) { llvm_unreachable("Invalid machine function property"); } +void setUnsafeStackSize(const Function &F, MachineFrameInfo &FrameInfo) { + if (!F.hasFnAttribute(Attribute::SafeStack)) + return; + + auto *Existing = + dyn_cast_or_null(F.getMetadata(LLVMContext::MD_annotation)); + + if (!Existing || Existing->getNumOperands() != 2) + return; + + auto *MetadataName = "unsafe-stack-size"; + if (auto &N = Existing->getOperand(0)) { + if (cast(N.get())->getString() == MetadataName) { + if (auto &Op = Existing->getOperand(1)) { + auto Val = mdconst::extract(Op)->getZExtValue(); + FrameInfo.setUnsafeStackSize(Val); + } + } + } +} + // Pin the vtable to this file. 
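// setUnsafeStackSize() above forwards a precomputed "unsafe-stack-size"
// value from IR !annotation metadata into MachineFrameInfo, and only for
// functions carrying the safestack attribute; the node is expected to look
// like !{!"unsafe-stack-size", i64 N}. That shape check, modelled with the
// metadata reduced to a name/value pair:
#include <cstdint>
#include <optional>
#include <string>

struct AnnotationSketch {
  std::string Name;   // stands in for the leading MDString operand
  uint64_t Value = 0; // stands in for the ConstantInt operand
};

std::optional<uint64_t>
getUnsafeStackSize(bool HasSafeStackAttr,
                   const std::optional<AnnotationSketch> &Annotation) {
  if (!HasSafeStackAttr || !Annotation)
    return std::nullopt; // only safestack functions carry the annotation
  if (Annotation->Name != "unsafe-stack-size")
    return std::nullopt; // a different annotation: ignore it
  return Annotation->Value;
}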
void MachineFunction::Delegate::anchor() {} @@ -133,11 +152,11 @@ void ilist_alloc_traits::deleteNode(MachineBasicBlock *MBB) { MBB->getParent()->deleteMachineBasicBlock(MBB); } -static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI, +static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI, const Function &F) { if (auto MA = F.getFnStackAlign()) - return MA->value(); - return STI->getFrameLowering()->getStackAlign().value(); + return *MA; + return STI->getFrameLowering()->getStackAlign(); } MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target, @@ -177,6 +196,8 @@ void MachineFunction::init() { /*ForcedRealign=*/CanRealignSP && F.hasFnAttribute(Attribute::StackAlignment)); + setUnsafeStackSize(F, *FrameInfo); + if (F.hasFnAttribute(Attribute::StackAlignment)) FrameInfo->ensureMaxAlignment(*F.getFnStackAlign()); @@ -208,9 +229,7 @@ void MachineFunction::init() { "Can't create a MachineFunction using a Module with a " "Target-incompatible DataLayout attached\n"); - PSVManager = - std::make_unique(*(getSubtarget(). - getInstrInfo())); + PSVManager = std::make_unique(getTarget()); } MachineFunction::~MachineFunction() { @@ -837,25 +856,6 @@ void MachineFunction::addCleanup(MachineBasicBlock *LandingPad) { LP.TypeIds.push_back(0); } -void MachineFunction::addSEHCatchHandler(MachineBasicBlock *LandingPad, - const Function *Filter, - const BlockAddress *RecoverBA) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - SEHHandler Handler; - Handler.FilterOrFinally = Filter; - Handler.RecoverBA = RecoverBA; - LP.SEHHandlers.push_back(Handler); -} - -void MachineFunction::addSEHCleanupHandler(MachineBasicBlock *LandingPad, - const Function *Cleanup) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - SEHHandler Handler; - Handler.FilterOrFinally = Cleanup; - Handler.RecoverBA = nullptr; - LP.SEHHandlers.push_back(Handler); -} - void MachineFunction::setCallSiteLandingPad(MCSymbol *Sym, ArrayRef Sites) { LPadToCallSiteMap[Sym].append(Sites.begin(), Sites.end()); @@ -1012,7 +1012,32 @@ void MachineFunction::substituteDebugValuesForInst(const MachineInstr &Old, } } -auto MachineFunction::salvageCopySSA(MachineInstr &MI) +auto MachineFunction::salvageCopySSA( + MachineInstr &MI, DenseMap &DbgPHICache) + -> DebugInstrOperandPair { + const TargetInstrInfo &TII = *getSubtarget().getInstrInfo(); + + // Check whether this copy-like instruction has already been salvaged into + // an operand pair. + Register Dest; + if (auto CopyDstSrc = TII.isCopyInstr(MI)) { + Dest = CopyDstSrc->Destination->getReg(); + } else { + assert(MI.isSubregToReg()); + Dest = MI.getOperand(0).getReg(); + } + + auto CacheIt = DbgPHICache.find(Dest); + if (CacheIt != DbgPHICache.end()) + return CacheIt->second; + + // Calculate the instruction number to use, or install a DBG_PHI. + auto OperandPair = salvageCopySSAImpl(MI); + DbgPHICache.insert({Dest, OperandPair}); + return OperandPair; +} + +auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI) -> DebugInstrOperandPair { MachineRegisterInfo &MRI = getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); @@ -1141,26 +1166,13 @@ auto MachineFunction::salvageCopySSA(MachineInstr &MI) MachineBasicBlock &InsertBB = *CurInst->getParent(); // We reached the start of the block before finding a defining instruction. - // It could be from a constant register, otherwise it must be an argument. 
- if (TRI.isConstantPhysReg(State.first)) { - // We can produce a DBG_PHI that identifies the constant physreg. Doesn't - // matter where we put it, as it's constant valued. - assert(CurInst->isCopy()); - } else if (State.first == TRI.getFrameRegister(*this)) { - // LLVM IR is allowed to read the framepointer by calling a - // llvm.frameaddress.* intrinsic. We can support this by emitting a - // DBG_PHI $fp. This isn't ideal, because it extends the behaviours / - // position that DBG_PHIs appear at, limiting what can be done later. - // TODO: see if there's a better way of expressing these variable - // locations. - ; - } else { - // Assert that this is the entry block, or an EH pad. If it isn't, then - // there is some code construct we don't recognise that deals with physregs - // across blocks. - assert(!State.first.isVirtual()); - assert(&*InsertBB.getParent()->begin() == &InsertBB || InsertBB.isEHPad()); - } + // There are numerous scenarios where this can happen: + // * Constant physical registers, + // * Several intrinsics that allow LLVM-IR to read arbitrary registers, + // * Arguments in the entry block, + // * Exception handling landing pads. + // Validating all of them is too difficult, so just insert a DBG_PHI reading + // the variable value at this position, rather than checking it makes sense. // Create DBG_PHI for specified physreg. auto Builder = BuildMI(InsertBB, InsertBB.getFirstNonPHI(), DebugLoc(), @@ -1181,9 +1193,7 @@ void MachineFunction::finalizeDebugInstrRefs() { MI.getOperand(1).ChangeToRegister(0, false); }; - if (!useDebugInstrRef()) - return; - + DenseMap ArgDbgPHIs; for (auto &MBB : *this) { for (auto &MI : MBB) { if (!MI.isDebugRef() || !MI.getOperand(0).isReg()) @@ -1206,7 +1216,7 @@ void MachineFunction::finalizeDebugInstrRefs() { // instruction that defines the source value, see salvageCopySSA docs // for why this is important.
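The hunks above also thread a per-function cache (ArgDbgPHIs) through salvageCopySSA so that each copy-like instruction is salvaged at most once per destination register, rather than once per debug use. A minimal sketch of that memoization shape, assuming DebugInstrOperandPair is the instruction-number/operand pair used by instruction referencing (the helper name here is hypothetical):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/Register.h"
    #include <utility>
    using namespace llvm;

    using DebugInstrOperandPair = std::pair<unsigned, unsigned>;

    // Salvage each copy destination once; later queries for the same register
    // reuse the cached pair instead of re-walking the copy chain.
    static DebugInstrOperandPair
    salvageOnce(Register Dest, DenseMap<Register, DebugInstrOperandPair> &Cache,
                function_ref<DebugInstrOperandPair()> SalvageImpl) {
      auto It = Cache.find(Dest);
      if (It != Cache.end())
        return It->second;
      DebugInstrOperandPair Result = SalvageImpl(); // The expensive walk.
      Cache.insert({Dest, Result});
      return Result;
    }

The dispatch that consumes the cache continues in the hunk below.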
if (DefMI.isCopyLike() || TII->isCopyInstr(DefMI)) { - auto Result = salvageCopySSA(DefMI); + auto Result = salvageCopySSA(DefMI, ArgDbgPHIs); MI.getOperand(0).ChangeToImmediate(Result.first); MI.getOperand(1).setImm(Result.second); } else { diff --git a/llvm/lib/CodeGen/MachineFunctionPass.cpp b/llvm/lib/CodeGen/MachineFunctionPass.cpp index 16cde1f601f9..99494122d608 100644 --- a/llvm/lib/CodeGen/MachineFunctionPass.cpp +++ b/llvm/lib/CodeGen/MachineFunctionPass.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 0e0eb8b8e00f..81c97ba6a086 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -24,7 +24,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/BasicBlockSectionUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -34,7 +33,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -82,7 +80,7 @@ static bool isColdBlock(const MachineBasicBlock &MBB, const MachineBlockFrequencyInfo *MBFI, ProfileSummaryInfo *PSI) { Optional Count = MBFI->getBlockProfileCount(&MBB); - if (!Count.hasValue()) + if (!Count) return true; if (PercentileCutoff > 0) { @@ -108,9 +106,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { // We don't want to proceed further for cold functions // or functions of unknown hotness. Lukewarm functions have no prefix. 
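Several of the surrounding hunks are mechanical llvm::Optional cleanups: explicit hasValue() tests give way to the contextual bool conversion, and getValueOr() becomes value_or(), as in the isColdBlock and SectionPrefix changes here. A small illustration of the idiom (illustrative values only):

    #include "llvm/ADT/Optional.h"
    #include <cstdint>
    using namespace llvm;

    uint64_t blockCount(Optional<uint64_t> Count) {
      // Before: if (!Count.hasValue()) return 0; return Count.getValue();
      // After: the Optional itself converts to bool when tested, and
      // value_or folds in the default for the empty case.
      return Count.value_or(0);
    }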
Optional SectionPrefix = MF.getFunction().getSectionPrefix(); - if (SectionPrefix.hasValue() && - (SectionPrefix.getValue().equals("unlikely") || - SectionPrefix.getValue().equals("unknown"))) { + if (SectionPrefix && (SectionPrefix.getValue().equals("unlikely") || + SectionPrefix.getValue().equals("unknown"))) { return false; } diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 85b266afceef..31f45e194a97 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -11,19 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -38,42 +33,30 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LowLevelTypeImpl.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" #include #include -#include #include #include -#include #include using namespace llvm; @@ -163,19 +146,13 @@ MachineRegisterInfo *MachineInstr::getRegInfo() { return nullptr; } -/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in -/// this instruction from their respective use lists. This requires that the -/// operands already be on their use lists. -void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) { +void MachineInstr::removeRegOperandsFromUseLists(MachineRegisterInfo &MRI) { for (MachineOperand &MO : operands()) if (MO.isReg()) MRI.removeRegOperandFromUseList(&MO); } -/// AddRegOperandsToUseLists - Add all of the register operands in -/// this instruction from their respective use lists. This requires that the -/// operands not be on their use lists yet. 
-void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) { +void MachineInstr::addRegOperandsToUseLists(MachineRegisterInfo &MRI) { for (MachineOperand &MO : operands()) if (MO.isReg()) MRI.addRegOperandToUseList(&MO); @@ -232,16 +209,12 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) { } } -#ifndef NDEBUG - bool isDebugOp = Op.getType() == MachineOperand::MO_Metadata || - Op.getType() == MachineOperand::MO_MCSymbol; // OpNo now points as the desired insertion point. Unless this is a variadic // instruction, only implicit regs are allowed beyond MCID->getNumOperands(). // RegMask operands go between the explicit and implicit operands. - assert((isImpReg || Op.isRegMask() || MCID->isVariadic() || - OpNo < MCID->getNumOperands() || isDebugOp) && + assert((MCID->isVariadic() || OpNo < MCID->getNumOperands() || + Op.isValidExcessOperand()) && "Trying to add an operand to a machine instr that is already done!"); -#endif MachineRegisterInfo *MRI = getRegInfo(); @@ -300,10 +273,7 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) { } } -/// RemoveOperand - Erase an operand from an instruction, leaving it with one -/// fewer operand than it started with. -/// -void MachineInstr::RemoveOperand(unsigned OpNo) { +void MachineInstr::removeOperand(unsigned OpNo) { assert(OpNo < getNumOperands() && "Invalid operand number"); untieRegOperand(OpNo); @@ -1401,11 +1371,10 @@ bool MachineInstr::isDereferenceableInvariantLoad(AAResults *AA) const { continue; // A load from a constant PseudoSourceValue is invariant. - if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) + if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) { if (PSV->isConstant(&MFI)) continue; - - if (const Value *V = MMO->getValue()) { + } else if (const Value *V = MMO->getValue()) { // If we have an AliasAnalysis, ask it whether the memory is constant. 
if (AA && AA->pointsToConstantMemory( @@ -1904,7 +1873,7 @@ bool MachineInstr::addRegisterKilled(Register IncomingReg, unsigned OpIdx = DeadOps.back(); if (getOperand(OpIdx).isImplicit() && (!isInlineAsm() || findInlineAsmFlagIdx(OpIdx) < 0)) - RemoveOperand(OpIdx); + removeOperand(OpIdx); else getOperand(OpIdx).setIsKill(false); DeadOps.pop_back(); @@ -1969,7 +1938,7 @@ bool MachineInstr::addRegisterDead(Register Reg, unsigned OpIdx = DeadOps.back(); if (getOperand(OpIdx).isImplicit() && (!isInlineAsm() || findInlineAsmFlagIdx(OpIdx) < 0)) - RemoveOperand(OpIdx); + removeOperand(OpIdx); else getOperand(OpIdx).setIsDead(false); DeadOps.pop_back(); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 759cff179790..2f1d7b976264 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -16,7 +16,8 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include using namespace llvm; @@ -109,7 +110,7 @@ bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) { static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI) { for (auto MII = FirstMI; MII != LastMI; ++MII) - if (MII->getDebugLoc().get()) + if (MII->getDebugLoc()) return MII->getDebugLoc(); return DebugLoc(); } diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 500cf8e0b79b..00d75f8231c7 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -240,7 +240,7 @@ namespace { void ExitScopeIfDone( MachineDomTreeNode *Node, DenseMap &OpenChildren, - DenseMap &ParentMap); + const DenseMap &ParentMap); void HoistOutOfLoop(MachineDomTreeNode *HeaderN); @@ -696,19 +696,16 @@ void MachineLICMBase::ExitScope(MachineBasicBlock *MBB) { /// destroy ancestors which are now done. void MachineLICMBase::ExitScopeIfDone(MachineDomTreeNode *Node, DenseMap &OpenChildren, - DenseMap &ParentMap) { + const DenseMap &ParentMap) { if (OpenChildren[Node]) return; - // Pop scope. - ExitScope(Node->getBlock()); - - // Now traverse upwards to pop ancestors whose offsprings are all done. - while (MachineDomTreeNode *Parent = ParentMap[Node]) { - unsigned Left = --OpenChildren[Parent]; - if (Left != 0) + for(;;) { + ExitScope(Node->getBlock()); + // Now traverse upwards to pop ancestors whose offspring are all done.
+ MachineDomTreeNode *Parent = ParentMap.lookup(Node); + if (!Parent || --OpenChildren[Parent] != 0) break; - ExitScope(Parent->getBlock()); Node = Parent; } } @@ -999,6 +996,9 @@ bool MachineLICMBase::IsLICMCandidate(MachineInstr &I) { if (I.isConvergent()) return false; + if (!TII->shouldHoist(I, CurLoop)) + return false; + return true; } diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp index 9b96bc5e5e7f..5cbded4b9264 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -17,13 +17,12 @@ #include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp index fdcc8472f1c2..0e8335d4974d 100644 --- a/llvm/lib/CodeGen/MachineLoopUtils.cpp +++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -64,7 +63,11 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, if (Use.getParent()->getParent() != Loop) Uses.push_back(&Use); for (auto *Use : Uses) { - MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + const TargetRegisterClass *ConstrainRegClass = + MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + assert(ConstrainRegClass && + "Expected a valid constrained register class!"); + (void)ConstrainRegClass; Use->setReg(R); } } @@ -90,25 +93,24 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, if (Remaps.count(R)) R = Remaps[R]; OrigPhi.getOperand(InitRegIdx).setReg(R); - MI.RemoveOperand(LoopRegIdx + 1); - MI.RemoveOperand(LoopRegIdx + 0); + MI.removeOperand(LoopRegIdx + 1); + MI.removeOperand(LoopRegIdx + 0); } else { // When peeling back, the initial value is the loop-carried value from // the original loop. 
Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg(); MI.getOperand(LoopRegIdx).setReg(LoopReg); - MI.RemoveOperand(InitRegIdx + 1); - MI.RemoveOperand(InitRegIdx + 0); + MI.removeOperand(InitRegIdx + 1); + MI.removeOperand(InitRegIdx + 0); } } DebugLoc DL; if (Direction == LPD_Front) { - Preheader->replaceSuccessor(Loop, NewBB); + Preheader->ReplaceUsesOfBlockWith(Loop, NewBB); NewBB->addSuccessor(Loop); Loop->replacePhiUsesWith(Preheader, NewBB); - if (TII->removeBranch(*Preheader) > 0) - TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL); + Preheader->updateTerminator(Loop); TII->removeBranch(*NewBB); TII->insertBranch(*NewBB, Loop, nullptr, {}, DL); } else { diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp index 31d4fc7d02bf..23d55a5df9f5 100644 --- a/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -7,27 +7,18 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/TinyPtrVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Pass.h" -#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -40,174 +31,24 @@ using namespace llvm; using namespace llvm::dwarf; +static cl::opt + DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, + cl::desc("Disable debug info printing")); + // Out of line virtual method. MachineModuleInfoImpl::~MachineModuleInfoImpl() = default; -namespace llvm { - -class MMIAddrLabelMapCallbackPtr final : CallbackVH { - MMIAddrLabelMap *Map = nullptr; - -public: - MMIAddrLabelMapCallbackPtr() = default; - MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {} - - void setPtr(BasicBlock *BB) { - ValueHandleBase::operator=(BB); - } - - void setMap(MMIAddrLabelMap *map) { Map = map; } - - void deleted() override; - void allUsesReplacedWith(Value *V2) override; -}; - -class MMIAddrLabelMap { - MCContext &Context; - struct AddrLabelSymEntry { - /// The symbols for the label. - TinyPtrVector Symbols; - - Function *Fn; // The containing function of the BasicBlock. - unsigned Index; // The index in BBCallbacks for the BasicBlock. - }; - - DenseMap, AddrLabelSymEntry> AddrLabelSymbols; - - /// Callbacks for the BasicBlock's that we have entries for. We use this so - /// we get notified if a block is deleted or RAUWd. - std::vector BBCallbacks; - - /// This is a per-function list of symbols whose corresponding BasicBlock got - /// deleted. These symbols need to be emitted at some point in the file, so - /// AsmPrinter emits them after the function body. 
- DenseMap, std::vector> - DeletedAddrLabelsNeedingEmission; - -public: - MMIAddrLabelMap(MCContext &context) : Context(context) {} - - ~MMIAddrLabelMap() { - assert(DeletedAddrLabelsNeedingEmission.empty() && - "Some labels for deleted blocks never got emitted"); - } - - ArrayRef getAddrLabelSymbolToEmit(BasicBlock *BB); - - void takeDeletedSymbolsForFunction(Function *F, - std::vector &Result); - - void UpdateForDeletedBlock(BasicBlock *BB); - void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New); -}; - -} // end namespace llvm - -ArrayRef MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { - assert(BB->hasAddressTaken() && - "Shouldn't get label for block without address taken"); - AddrLabelSymEntry &Entry = AddrLabelSymbols[BB]; - - // If we already had an entry for this block, just return it. - if (!Entry.Symbols.empty()) { - assert(BB->getParent() == Entry.Fn && "Parent changed"); - return Entry.Symbols; - } - - // Otherwise, this is a new entry, create a new symbol for it and add an - // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd. - BBCallbacks.emplace_back(BB); - BBCallbacks.back().setMap(this); - Entry.Index = BBCallbacks.size() - 1; - Entry.Fn = BB->getParent(); - MCSymbol *Sym = BB->hasAddressTaken() ? Context.createNamedTempSymbol() - : Context.createTempSymbol(); - Entry.Symbols.push_back(Sym); - return Entry.Symbols; -} - -/// If we have any deleted symbols for F, return them. -void MMIAddrLabelMap:: -takeDeletedSymbolsForFunction(Function *F, std::vector &Result) { - DenseMap, std::vector>::iterator I = - DeletedAddrLabelsNeedingEmission.find(F); - - // If there are no entries for the function, just return. - if (I == DeletedAddrLabelsNeedingEmission.end()) return; - - // Otherwise, take the list. - std::swap(Result, I->second); - DeletedAddrLabelsNeedingEmission.erase(I); -} - -void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) { - // If the block got deleted, there is no need for the symbol. If the symbol - // was already emitted, we can just forget about it, otherwise we need to - // queue it up for later emission when the function is output. - AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]); - AddrLabelSymbols.erase(BB); - assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?"); - BBCallbacks[Entry.Index] = nullptr; // Clear the callback. - - assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) && - "Block/parent mismatch"); - - for (MCSymbol *Sym : Entry.Symbols) { - if (Sym->isDefined()) - return; - - // If the block is not yet defined, we need to emit it at the end of the - // function. Add the symbol to the DeletedAddrLabelsNeedingEmission list - // for the containing Function. Since the block is being deleted, its - // parent may already be removed, we have to get the function from 'Entry'. - DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym); - } -} - -void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { - // Get the entry for the RAUW'd block and remove it from our map. - AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]); - AddrLabelSymbols.erase(Old); - assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?"); - - AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New]; - - // If New is not address taken, just move our symbol over to it. - if (NewEntry.Symbols.empty()) { - BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback. - NewEntry = std::move(OldEntry); // Set New's entry. 
- return; - } - - BBCallbacks[OldEntry.Index] = nullptr; // Update the callback. - - // Otherwise, we need to add the old symbols to the new block's set. - llvm::append_range(NewEntry.Symbols, OldEntry.Symbols); -} - -void MMIAddrLabelMapCallbackPtr::deleted() { - Map->UpdateForDeletedBlock(cast(getValPtr())); -} - -void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { - Map->UpdateForRAUWBlock(cast(getValPtr()), cast(V2)); -} - void MachineModuleInfo::initialize() { ObjFileMMI = nullptr; CurCallSite = 0; NextFnNum = 0; - UsesMSVCFloatingPoint = UsesMorestackAddr = false; - HasSplitStack = HasNosplitStack = false; - AddrLabelSymbols = nullptr; + UsesMSVCFloatingPoint = false; + DbgInfoAvailable = false; } void MachineModuleInfo::finalize() { Personalities.clear(); - delete AddrLabelSymbols; - AddrLabelSymbols = nullptr; - Context.reset(); // We don't clear the ExternalContext. @@ -219,16 +60,11 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) : TM(std::move(MMI.TM)), Context(MMI.TM.getTargetTriple(), MMI.TM.getMCAsmInfo(), MMI.TM.getMCRegisterInfo(), MMI.TM.getMCSubtargetInfo(), nullptr, - nullptr, false), + &MMI.TM.Options.MCOptions, false), MachineFunctions(std::move(MMI.MachineFunctions)) { Context.setObjectFileInfo(MMI.TM.getObjFileLowering()); ObjFileMMI = MMI.ObjFileMMI; CurCallSite = MMI.CurCallSite; - UsesMSVCFloatingPoint = MMI.UsesMSVCFloatingPoint; - UsesMorestackAddr = MMI.UsesMorestackAddr; - HasSplitStack = MMI.HasSplitStack; - HasNosplitStack = MMI.HasNosplitStack; - AddrLabelSymbols = MMI.AddrLabelSymbols; ExternalContext = MMI.ExternalContext; TheModule = MMI.TheModule; } @@ -236,7 +72,7 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM) : TM(*TM), Context(TM->getTargetTriple(), TM->getMCAsmInfo(), TM->getMCRegisterInfo(), TM->getMCSubtargetInfo(), - nullptr, nullptr, false) { + nullptr, &TM->Options.MCOptions, false) { Context.setObjectFileInfo(TM->getObjFileLowering()); initialize(); } @@ -245,7 +81,7 @@ MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM, MCContext *ExtContext) : TM(*TM), Context(TM->getTargetTriple(), TM->getMCAsmInfo(), TM->getMCRegisterInfo(), TM->getMCSubtargetInfo(), - nullptr, nullptr, false), + nullptr, &TM->Options.MCOptions, false), ExternalContext(ExtContext) { Context.setObjectFileInfo(TM->getObjFileLowering()); initialize(); @@ -253,25 +89,6 @@ MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM, MachineModuleInfo::~MachineModuleInfo() { finalize(); } -//===- Address of Block Management ----------------------------------------===// - -ArrayRef -MachineModuleInfo::getAddrLabelSymbolToEmit(const BasicBlock *BB) { - // Lazily create AddrLabelSymbols. - if (!AddrLabelSymbols) - AddrLabelSymbols = new MMIAddrLabelMap(getContext()); - return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast(BB)); -} - -void MachineModuleInfo:: -takeDeletedSymbolsForFunction(const Function *F, - std::vector &Result) { - // If no blocks have had their addresses taken, we're done. 
- if (!AddrLabelSymbols) return; - return AddrLabelSymbols-> - takeDeletedSymbolsForFunction(const_cast(F), Result); -} - /// \name Exception Handling /// \{ @@ -318,6 +135,13 @@ void MachineModuleInfo::deleteMachineFunctionFor(Function &F) { LastResult = nullptr; } +void MachineModuleInfo::insertFunction(const Function &F, + std::unique_ptr &&MF) { + auto I = MachineFunctions.insert(std::make_pair(&F, std::move(MF))); + assert(I.second && "machine function already mapped"); + (void)I; +} + namespace { /// This pass frees the MachineFunction object associated with a Function. @@ -409,7 +233,8 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) { Ctx.diagnose( DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie)); }); - MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); + MMI.DbgInfoAvailable = !DisableDebugInfoPrinting && + !M.debug_compile_units().empty(); return false; } @@ -424,6 +249,7 @@ MachineModuleInfo MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) { MachineModuleInfo MMI(TM); MMI.TheModule = &M; - MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); + MMI.DbgInfoAvailable = !DisableDebugInfoPrinting && + !M.debug_compile_units().empty(); return MMI; } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 680dbe54ffaf..46ad1de78c46 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,9 +14,7 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/MIRFormatter.h" -#include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 5347a7b0d890..631768ec986c 100644 --- a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -53,10 +53,8 @@ void MachineOptimizationRemarkEmitter::emit( LLVMContext &Ctx = MF.getFunction().getContext(); // Only emit it if its hotness meets the threshold. - if (OptDiag.getHotness().getValueOr(0) < - Ctx.getDiagnosticsHotnessThreshold()) { + if (OptDiag.getHotness().value_or(0) < Ctx.getDiagnosticsHotnessThreshold()) return; - } Ctx.diagnose(OptDiag); } diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 7783b5e0d3cc..5da68abc8f6a 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -59,6 +59,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/Passes.h" @@ -82,9 +84,17 @@ using namespace llvm; using namespace ore; using namespace outliner; +// Statistics for outlined functions. STATISTIC(NumOutlined, "Number of candidates outlined"); STATISTIC(FunctionsCreated, "Number of functions created"); +// Statistics for instruction mapping. 
+STATISTIC(NumLegalInUnsignedVec, "Number of legal instrs in unsigned vector"); +STATISTIC(NumIllegalInUnsignedVec, + "Number of illegal instrs in unsigned vector"); +STATISTIC(NumInvisible, "Number of invisible instrs in unsigned vector"); +STATISTIC(UnsignedVecSize, "Size of unsigned vector"); + // Set to true if the user wants the outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. Since the outliner is confined to a single module (modulo LTO), @@ -188,6 +198,8 @@ struct InstructionMapper { assert(LegalInstrNumber != DenseMapInfo::getTombstoneKey() && "Tried to assign DenseMap tombstone or empty key to instruction."); + // Statistics. + ++NumLegalInUnsignedVec; return MINumber; } @@ -215,6 +227,8 @@ struct InstructionMapper { InstrListForMBB.push_back(It); UnsignedVecForMBB.push_back(IllegalInstrNumber); IllegalInstrNumber--; + // Statistics. + ++NumIllegalInUnsignedVec; assert(LegalInstrNumber < IllegalInstrNumber && "Instruction mapping overflow!"); @@ -293,6 +307,7 @@ struct InstructionMapper { case InstrType::Invisible: // Normally this is set by mapTo(Blah)Unsigned, but we just want to // skip this instruction. So, unset the flag here. + ++NumInvisible; AddedIllegalLastTime = false; break; } @@ -623,6 +638,15 @@ MachineFunction *MachineOutliner::createOutlinedFunction( TII.mergeOutliningCandidateAttributes(*F, OF.Candidates); + // Set uwtable, so we generate eh_frame. + UWTableKind UW = std::accumulate( + OF.Candidates.cbegin(), OF.Candidates.cend(), UWTableKind::None, + [](UWTableKind K, const outliner::Candidate &C) { + return std::max(K, C.getMF()->getFunction().getUWTableKind()); + }); + if (UW != UWTableKind::None) + F->setUWTableKind(UW); + BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); IRBuilder<> Builder(EntryBB); Builder.CreateRetVoid(); @@ -641,17 +665,20 @@ MachineFunction *MachineOutliner::createOutlinedFunction( ++I) { if (I->isDebugInstr()) continue; - MachineInstr *NewMI = MF.CloneMachineInstr(&*I); + + // Don't keep debug information for outlined instructions. + auto DL = DebugLoc(); if (I->isCFIInstruction()) { - unsigned CFIIndex = NewMI->getOperand(0).getCFIIndex(); + unsigned CFIIndex = I->getOperand(0).getCFIIndex(); MCCFIInstruction CFI = Instrs[CFIIndex]; - (void)MF.addFrameInst(CFI); + BuildMI(MBB, MBB.end(), DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(MF.addFrameInst(CFI)); + } else { + MachineInstr *NewMI = MF.CloneMachineInstr(&*I); + NewMI->dropMemRefs(MF); + NewMI->setDebugLoc(DL); + MBB.insert(MBB.end(), NewMI); } - NewMI->dropMemRefs(MF); - - // Don't keep debug information for outlined instructions. - NewMI->setDebugLoc(DebugLoc()); - MBB.insert(MBB.end(), NewMI); } // Set normal properties for a late MachineFunction. @@ -831,9 +858,10 @@ bool MachineOutliner::outline(Module &M, MBB.erase(std::next(StartIt), std::next(EndIt)); // Keep track of what we removed by marking them all as -1. - std::for_each(Mapper.UnsignedVec.begin() + C.getStartIdx(), - Mapper.UnsignedVec.begin() + C.getEndIdx() + 1, - [](unsigned &I) { I = static_cast(-1); }); + for (unsigned &I : + llvm::make_range(Mapper.UnsignedVec.begin() + C.getStartIdx(), + Mapper.UnsignedVec.begin() + C.getEndIdx() + 1)) + I = static_cast(-1); OutlinedSomething = true; // Statistics. @@ -896,6 +924,9 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M, // MBB is suitable for outlining. Map it to a list of unsigneds. 
Mapper.convertToUnsignedVec(MBB, *TII); } + + // Statistics. + UnsignedVecSize = Mapper.UnsignedVec.size(); } } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 762395542b40..8d500398f55e 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -29,6 +29,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" @@ -43,6 +44,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -55,7 +57,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ModuloSchedule.h" #include "llvm/CodeGen/RegisterPressure.h" @@ -66,7 +67,6 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCInstrDesc.h" @@ -109,7 +109,6 @@ STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages"); /// A command line option to turn software pipelining on or off. static cl::opt EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), - cl::ZeroOrMore, cl::desc("Enable Software Pipelining")); /// A command line option to enable SWP at -Os. @@ -147,8 +146,8 @@ static cl::opt SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1)); #endif static cl::opt SwpIgnoreRecMII("pipeliner-ignore-recmii", - cl::ReallyHidden, cl::init(false), - cl::ZeroOrMore, cl::desc("Ignore RecMII")); + cl::ReallyHidden, + cl::desc("Ignore RecMII")); static cl::opt SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false)); @@ -169,10 +168,9 @@ static cl::opt ExperimentalCodeGen( namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. 
-cl::opt - SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden, - cl::init(true), cl::ZeroOrMore, - cl::desc("Enable CopyToPhi DAG Mutation")); +cl::opt SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden, + cl::init(true), + cl::desc("Enable CopyToPhi DAG Mutation")); } // end namespace llvm @@ -255,6 +253,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) { << "Failed to pipeline loop"; }); + LI.LoopPipelinerInfo.reset(); return Changed; } @@ -262,6 +261,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) { Changed = swingModuloScheduler(L); + LI.LoopPipelinerInfo.reset(); return Changed; } @@ -354,7 +354,8 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { LI.LoopInductionVar = nullptr; LI.LoopCompare = nullptr; - if (!TII->analyzeLoopForPipelining(L.getTopBlock())) { + LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock()); + if (!LI.LoopPipelinerInfo) { LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n"); NumFailLoop++; ORE->emit([&]() { @@ -419,7 +420,7 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) { assert(L.getBlocks().size() == 1 && "SMS works on single blocks only."); SwingSchedulerDAG SMS(*this, L, getAnalysis(), RegClassInfo, - II_setByPragma); + II_setByPragma, LI.LoopPipelinerInfo.get()); MachineBasicBlock *MBB = L.getHeader(); // The kernel should not include any terminator instructions. These @@ -513,7 +514,7 @@ void SwingSchedulerDAG::schedule() { // Don't pipeline large loops. if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) { LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii - << ", we don't pipleline large loops\n"); + << ", we don't pipeline large loops\n"); NumFailLargeMaxMII++; Pass.ORE->emit([&]() { return MachineOptimizationRemarkAnalysis( @@ -1297,8 +1298,7 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, for (auto W : AdjK[V]) { if (W < S) continue; - if (B[W].count(SV) == 0) - B[W].insert(SV); + B[W].insert(SV); } } Stack.pop_back(); @@ -1422,7 +1422,7 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) { /// We ignore the back-edge recurrence in order to avoid unbounded recursion /// in the calculation of the ASAP, ALAP, etc functions. 
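The pipeliner hunks below repeatedly pair the existing isArtificial() test with a new D.getSUnit()->isBoundaryNode() check: edges that reach the DAG's boundary node (for example the exit SUnit) are bookkeeping, not loop-body dependences, so every traversal has to skip them consistently. A compact restatement of the filter, mirroring the ignoreDependence predicate that follows:

    #include "llvm/CodeGen/ScheduleDAG.h"
    using namespace llvm;

    // An edge is ignored when it is artificial, when it reaches a boundary
    // node, or when it is an anti-dependence viewed from the predecessor side.
    static bool shouldIgnore(const SDep &D, bool IsPred) {
      if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
        return true;
      return D.getKind() == SDep::Anti && IsPred;
    }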
static bool ignoreDependence(const SDep &D, bool isPred) { - if (D.isArtificial()) + if (D.isArtificial() || D.getSUnit()->isBoundaryNode()) return true; return D.getKind() == SDep::Anti && isPred; } @@ -1471,6 +1471,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { SUnit *SU = &SUnits[I]; for (const SDep &S : SU->Succs) { SUnit *succ = S.getSUnit(); + if (succ->isBoundaryNode()) + continue; if (S.getLatency() == 0) zeroLatencyHeight = std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1); @@ -1575,7 +1577,9 @@ static bool computePath(SUnit *Cur, SetVector &Path, return Path.contains(Cur); bool FoundPath = false; for (auto &SI : Cur->Succs) - FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited); + if (!ignoreDependence(SI, false)) + FoundPath |= + computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited); for (auto &PI : Cur->Preds) if (PI.getKind() == SDep::Anti) FoundPath |= @@ -1663,7 +1667,7 @@ void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) { LLVM_DEBUG( dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") " << TRI->getRegPressureSetName(RPDelta.Excess.getPSet()) - << ":" << RPDelta.Excess.getUnitInc()); + << ":" << RPDelta.Excess.getUnitInc() << "\n"); NS.setExceedPressure(SU); break; } @@ -1718,7 +1722,7 @@ void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) { } /// Add the nodes that do not belong to a recurrence set into groups -/// based upon connected componenets. +/// based upon connected components. void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { SetVector NodesAdded; SmallPtrSet Visited; @@ -1788,7 +1792,8 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet, NodesAdded.insert(SU); for (auto &SI : SU->Succs) { SUnit *Successor = SI.getSUnit(); - if (!SI.isArtificial() && NodesAdded.count(Successor) == 0) + if (!SI.isArtificial() && !Successor->isBoundaryNode() && + NodesAdded.count(Successor) == 0) addConnectedNodes(Successor, NewSet, NodesAdded); } for (auto &PI : SU->Preds) { @@ -1803,8 +1808,7 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet, static bool isIntersect(SmallSetVector &Set1, const NodeSet &Set2, SmallSetVector &Result) { Result.clear(); - for (unsigned i = 0, e = Set1.size(); i != e; ++i) { - SUnit *SU = Set1[i]; + for (SUnit *SU : Set1) { if (Set2.count(SU) != 0) Result.insert(SU); } @@ -2080,6 +2084,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { }); } while (++NI != NE && scheduleFound); + // If a schedule is found, ensure non-pipelined instructions are in stage 0 + if (scheduleFound) + scheduleFound = + Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo); + // If a schedule is found, check if it is a valid schedule too. 
if (scheduleFound) scheduleFound = Schedule.isValidSchedule(this); @@ -2263,7 +2272,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) { bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc) { if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) || - Dep.isArtificial()) + Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode()) return false; if (!SwpPruneLoopCarried) @@ -2430,7 +2439,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) { while (!Worklist.empty()) { const SDep &Cur = Worklist.pop_back_val(); SUnit *SuccSU = Cur.getSUnit(); - if (Visited.count(SuccSU)) + if (Visited.count(SuccSU) || SuccSU->isBoundaryNode()) continue; std::map::const_iterator it = InstrToCycle.find(SuccSU); if (it == InstrToCycle.end()) @@ -2697,21 +2706,91 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, return false; } +/// Determine transitive dependences of unpipelineable instructions +SmallSet SMSchedule::computeUnpipelineableNodes( + SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { + SmallSet DoNotPipeline; + SmallVector Worklist; + + for (auto &SU : SSD->SUnits) + if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr())) + Worklist.push_back(&SU); + + while (!Worklist.empty()) { + auto SU = Worklist.pop_back_val(); + if (DoNotPipeline.count(SU)) + continue; + LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n"); + DoNotPipeline.insert(SU); + for (auto &Dep : SU->Preds) + Worklist.push_back(Dep.getSUnit()); + if (SU->getInstr()->isPHI()) + for (auto &Dep : SU->Succs) + if (Dep.getKind() == SDep::Anti) + Worklist.push_back(Dep.getSUnit()); + } + return DoNotPipeline; +} + +// Determine all instructions upon which any unpipelineable instruction depends +// and ensure that they are in stage 0. If unable to do so, return false. +bool SMSchedule::normalizeNonPipelinedInstructions( + SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { + SmallSet DNP = computeUnpipelineableNodes(SSD, PLI); + + int NewLastCycle = INT_MIN; + for (SUnit &SU : SSD->SUnits) { + if (!SU.isInstr()) + continue; + if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) { + NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]); + continue; + } + + // Put the non-pipelined instruction as early as possible in the schedule + int NewCycle = getFirstCycle(); + for (auto &Dep : SU.Preds) + NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle); + + int OldCycle = InstrToCycle[&SU]; + if (OldCycle != NewCycle) { + InstrToCycle[&SU] = NewCycle; + auto &OldS = getInstructions(OldCycle); + llvm::erase_value(OldS, &SU); + getInstructions(NewCycle).emplace_back(&SU); + LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum + << ") is not pipelined; moving from cycle " << OldCycle + << " to " << NewCycle << " Instr:" << *SU.getInstr()); + } + NewLastCycle = std::max(NewLastCycle, NewCycle); + } + LastCycle = NewLastCycle; + return true; +} + // Check if the generated schedule is valid. This function checks if // an instruction that uses a physical register is scheduled in a // different stage than the definition. The pipeliner does not handle // physical register values that may cross a basic block boundary. +// Furthermore, if a physical def/use pair is assigned to the same +// cycle, orderDependence does not guarantee def/use ordering, so that +// case should be considered invalid. (The test checks for both +// earlier and same-cycle use to be more robust.) 
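In essence, the stricter physical-register validation described in the comment above requires that, for every assigned data edge inside the kernel, the use stay in the def's stage and issue in a strictly later cycle, since orderDependence gives no ordering guarantee between instructions placed in the same cycle. A compact restatement of that predicate (simplified; stage and cycle values as computed by SMSchedule):

    // A physical-register def/use pair survives modulo scheduling only when
    // the use is in the same stage as the def and strictly after its cycle.
    static bool isPhysRegPairSchedulable(int StageDef, int CycleDef,
                                         int StageUse, int CycleUse) {
      return StageUse == StageDef && CycleUse > CycleDef;
    }

The actual check over SU.Succs follows.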
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) { for (SUnit &SU : SSD->SUnits) { if (!SU.hasPhysRegDefs) continue; int StageDef = stageScheduled(&SU); + int CycleDef = InstrToCycle[&SU]; assert(StageDef != -1 && "Instruction should have been scheduled."); for (auto &SI : SU.Succs) - if (SI.isAssignedRegDep()) - if (Register::isPhysicalRegister(SI.getReg())) + if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode()) + if (Register::isPhysicalRegister(SI.getReg())) { if (stageScheduled(SI.getSUnit()) != StageDef) return false; + if (InstrToCycle[SI.getSUnit()] <= CycleDef) + return false; + } } return true; } @@ -2998,7 +3077,7 @@ bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const { if (!SCDesc->isValid()) { LLVM_DEBUG({ dbgs() << "No valid Schedule Class Desc for schedClass!\n"; - dbgs() << "isPseduo:" << MID->isPseudo() << "\n"; + dbgs() << "isPseudo:" << MID->isPseudo() << "\n"; }); return true; } @@ -3038,7 +3117,7 @@ void ResourceManager::reserveResources(const MCInstrDesc *MID) { if (!SCDesc->isValid()) { LLVM_DEBUG({ dbgs() << "No valid Schedule Class Desc for schedClass!\n"; - dbgs() << "isPseduo:" << MID->isPseudo() << "\n"; + dbgs() << "isPseudo:" << MID->isPseudo() << "\n"; }); return; } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 1a4ad53ddf81..511bb80052c2 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -12,7 +12,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -651,3 +650,18 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { } return false; } + +bool MachineRegisterInfo::isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const { + return getTargetRegisterInfo()->isArgumentRegister(MF, Reg); +} + +bool MachineRegisterInfo::isFixedRegister(const MachineFunction &MF, + MCRegister Reg) const { + return getTargetRegisterInfo()->isFixedRegister(MF, Reg); +} + +bool MachineRegisterInfo::isGeneralPurposeRegister(const MachineFunction &MF, + MCRegister Reg) const { + return getTargetRegisterInfo()->isGeneralPurposeRegister(MF, Reg); +} diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index 8db893535daf..01cea85ecc7c 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -14,7 +14,9 @@ #include "llvm/CodeGen/MachineSSAContext.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index b043d4c1b0c1..4e00a211713e 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -32,7 +32,6 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -752,7 +751,7 @@ void ScheduleDAGMI::moveInstruction( } bool ScheduleDAGMI::checkSchedLimit() { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { CurrentTop = CurrentBottom; return false; @@ -920,12 +919,10 @@ void ScheduleDAGMI::placeDebugValues() { MachineBasicBlock::iterator OrigPrevMI = P.second; if (&*RegionBegin == DbgValue) ++RegionBegin; - BB->splice(++OrigPrevMI, BB, DbgValue); - if (OrigPrevMI == std::prev(RegionEnd)) + BB->splice(std::next(OrigPrevMI), BB, DbgValue); + if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd) RegionEnd = DbgValue; } - DbgValues.clear(); - FirstDbgValue = nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2008,7 +2005,7 @@ void SchedBoundary::reset() { ReservedCycles.clear(); ReservedCyclesIndex.clear(); ResourceGroupSubUnitMasks.clear(); -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS // Track the maximum number of stall cycles that could arise either from the // latency of a DAG edge or the number of cycles that a processor resource is // reserved (SchedBoundary::ReservedCycles). @@ -2196,7 +2193,7 @@ bool SchedBoundary::checkHazard(SUnit *SU) { unsigned NRCycle, InstanceIdx; std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles); if (NRCycle > CurrCycle) { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS MaxObservedStall = std::max(Cycles, MaxObservedStall); #endif LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") " @@ -2263,7 +2260,7 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue, unsigned Idx) { assert(SU->getInstr() && "Scheduled SUnit must have instr"); -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS // ReadyCycle has been bumped up to the CurrCycle when this node was // scheduled, but CurrCycle may have been eagerly advanced immediately after // scheduling, so may now be greater than ReadyCycle.
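The MachineScheduler hunks above switch several guards from #ifndef NDEBUG to #if LLVM_ENABLE_ABI_BREAKING_CHECKS. The two conditions need not coincide: members such as SchedBoundary::MaxObservedStall are declared under the ABI-breaking-checks macro (in headers not shown in this diff), so their uses must be guarded by the same condition or some build configurations fail to compile. A minimal sketch of the pattern, using a hypothetical struct:

    #include "llvm/Config/abi-breaking.h"
    #include <algorithm>

    struct BoundaryLike {
    #if LLVM_ENABLE_ABI_BREAKING_CHECKS
      unsigned MaxObservedStall = 0; // Exists only with ABI-breaking checks.
    #endif
      void noteStall(unsigned Cycles) {
    #if LLVM_ENABLE_ABI_BREAKING_CHECKS
        // Guard the use with the same macro as the declaration above, not with
        // NDEBUG, so every build configuration stays self-consistent.
        MaxObservedStall = std::max(Cycles, MaxObservedStall);
    #endif
      }
    };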
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 0dbbc218e946..006ba9273dfb 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -16,17 +16,20 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineCycleAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -93,18 +96,18 @@ static cl::opt SinkLoadBlocksThreshold( cl::init(20), cl::Hidden); static cl::opt -SinkInstsIntoLoop("sink-insts-to-avoid-spills", - cl::desc("Sink instructions into loops to avoid " - "register spills"), - cl::init(false), cl::Hidden); - -static cl::opt SinkIntoLoopLimit( - "machine-sink-loop-limit", - cl::desc("The maximum number of instructions considered for loop sinking."), + SinkInstsIntoCycle("sink-insts-to-avoid-spills", + cl::desc("Sink instructions into cycles to avoid " + "register spills"), + cl::init(false), cl::Hidden); + +static cl::opt SinkIntoCycleLimit( + "machine-sink-cycle-limit", + cl::desc("The maximum number of instructions considered for cycle sinking."), cl::init(50), cl::Hidden); STATISTIC(NumSunk, "Number of machine instructions sunk"); -STATISTIC(NumLoopSunk, "Number of machine instructions sunk into a loop"); +STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle"); STATISTIC(NumSplit, "Number of critical edges split"); STATISTIC(NumCoalesces, "Number of copies coalesced"); STATISTIC(NumPostRACopySink, "Number of copies sunk after RA"); @@ -117,7 +120,7 @@ namespace { MachineRegisterInfo *MRI; // Machine register information MachineDominatorTree *DT; // Machine dominator tree MachinePostDominatorTree *PDT; // Machine post dominator tree - MachineLoopInfo *LI; + MachineCycleInfo *CI; MachineBlockFrequencyInfo *MBFI; const MachineBranchProbabilityInfo *MBPI; AliasAnalysis *AA; @@ -178,8 +181,9 @@ namespace { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); if (UseBlockFreqInfo) AU.addRequired(); @@ -230,9 +234,9 @@ namespace { MachineBasicBlock *FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, bool &BreakPHIEdge, AllSuccsCache &AllSuccessors); - void FindLoopSinkCandidates(MachineLoop *L, MachineBasicBlock *BB, - SmallVectorImpl &Candidates); - bool SinkIntoLoop(MachineLoop *L, MachineInstr &I); + void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB, + SmallVectorImpl &Candidates); + bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I); bool isProfitableToSinkTo(Register Reg, MachineInstr &MI, MachineBasicBlock *MBB, @@ -259,7 +263,7 @@ INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, "Machine code sinking", false, false) @@ -376,26 +380,27 @@ static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { return false; } -void MachineSinking::FindLoopSinkCandidates(MachineLoop *L, MachineBasicBlock *BB, +void MachineSinking::FindCycleSinkCandidates( + MachineCycle *Cycle, MachineBasicBlock *BB, SmallVectorImpl &Candidates) { for (auto &MI : *BB) { - LLVM_DEBUG(dbgs() << "LoopSink: Analysing candidate: " << MI); + LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI); if (!TII->shouldSink(MI)) { - LLVM_DEBUG(dbgs() << "LoopSink: Instruction not a candidate for this " + LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this " "target\n"); continue; } - if (!L->isLoopInvariant(MI)) { - LLVM_DEBUG(dbgs() << "LoopSink: Instruction is not loop invariant\n"); + if (!isCycleInvariant(Cycle, MI)) { + LLVM_DEBUG(dbgs() << "CycleSink: Instruction is not cycle invariant\n"); continue; } bool DontMoveAcrossStore = true; if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) { - LLVM_DEBUG(dbgs() << "LoopSink: Instruction not safe to move.\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n"); continue; } if (MI.mayLoad() && !mayLoadFromGOTOrConstantPool(MI)) { - LLVM_DEBUG(dbgs() << "LoopSink: Dont sink GOT or constant pool loads\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Don't sink GOT or constant pool loads\n"); continue; } if (MI.isConvergent()) continue; @@ -407,7 +412,7 @@ void MachineSinking::FindLoopSinkCandidates(MachineLoop *L, MachineBasicBlock *B if (!MRI->hasOneDef(MO.getReg())) continue; - LLVM_DEBUG(dbgs() << "LoopSink: Instruction added as candidate.\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Instruction added as candidate.\n"); Candidates.push_back(&MI); } } @@ -423,7 +428,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DT = &getAnalysis(); PDT = &getAnalysis(); - LI = &getAnalysis(); + CI = &getAnalysis().getCycleInfo(); MBFI = UseBlockFreqInfo ? &getAnalysis() : nullptr; MBPI = &getAnalysis(); AA = &getAnalysis().getAAResults(); @@ -461,32 +466,33 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { EverMadeChange = true; } - if (SinkInstsIntoLoop) { - SmallVector Loops(LI->begin(), LI->end()); - for (auto *L : Loops) { - MachineBasicBlock *Preheader = LI->findLoopPreheader(L); + if (SinkInstsIntoCycle) { + SmallVector Cycles(CI->toplevel_begin(), + CI->toplevel_end()); + for (auto *Cycle : Cycles) { + MachineBasicBlock *Preheader = Cycle->getCyclePreheader(); if (!Preheader) { - LLVM_DEBUG(dbgs() << "LoopSink: Can't find preheader\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n"); continue; } SmallVector Candidates; - FindLoopSinkCandidates(L, Preheader, Candidates); + FindCycleSinkCandidates(Cycle, Preheader, Candidates); // Walk the candidates in reverse order so that we start with the use // of a def-use chain, if there is any. // TODO: Sort the candidates using a cost-model.
unsigned i = 0; for (MachineInstr *I : llvm::reverse(Candidates)) { - if (i++ == SinkIntoLoopLimit) { - LLVM_DEBUG(dbgs() << "LoopSink: Limit reached of instructions to " + if (i++ == SinkIntoCycleLimit) { + LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to " "be analysed."); break; } - if (!SinkIntoLoop(L, *I)) + if (!SinkIntoCycle(Cycle, *I)) break; EverMadeChange = true; - ++NumLoopSunk; + ++NumCycleSunk; } } } @@ -508,12 +514,12 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an - // unreachable loop there may be nowhere to stop. + // unreachable cycle there may be nowhere to stop. if (!DT->isReachableFromEntry(&MBB)) return false; bool MadeChange = false; - // Cache all successors, sorted by frequency info and loop depth. + // Cache all successors, sorted by frequency info and cycle depth. AllSuccsCache AllSuccessors; // Walk the basic block bottom-up. Remember if we saw a store. @@ -632,13 +638,16 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI, if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB)) return false; - // Avoid breaking back edge. From == To means backedge for single BB loop. + // Avoid breaking back edge. From == To means backedge for single BB cycle. if (!SplitEdges || FromBB == ToBB) return false; - // Check for backedges of more "complex" loops. - if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) && - LI->isLoopHeader(ToBB)) + MachineCycle *FromCycle = CI->getCycle(FromBB); + MachineCycle *ToCycle = CI->getCycle(ToBB); + + // Check for backedges of more "complex" cycles. + if (FromCycle == ToCycle && FromCycle && + (!FromCycle->isReducible() || FromCycle->getHeader() == ToBB)) return false; // It's not always legal to break critical edges and sink the computation @@ -741,9 +750,9 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, if (!PDT->dominates(SuccToSinkTo, MBB)) return true; - // It is profitable to sink an instruction from a deeper loop to a shallower - // loop, even if the latter post-dominates the former (PR21115). - if (LI->getLoopDepth(MBB) > LI->getLoopDepth(SuccToSinkTo)) + // It is profitable to sink an instruction from a deeper cycle to a shallower + // cycle, even if the latter post-dominates the former (PR21115). + if (CI->getCycleDepth(MBB) > CI->getCycleDepth(SuccToSinkTo)) return true; // Check if only use in post dominated block is PHI instruction. @@ -764,11 +773,11 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge, AllSuccessors)) return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2, AllSuccessors); - MachineLoop *ML = LI->getLoopFor(MBB); + MachineCycle *MCycle = CI->getCycle(MBB); - // If the instruction is not inside a loop, it is not profitable to sink MI to + // If the instruction is not inside a cycle, it is not profitable to sink MI to // a post dominate block SuccToSinkTo. - if (!ML) + if (!MCycle) return false; auto isRegisterPressureSetExceedLimit = [&](const TargetRegisterClass *RC) { @@ -786,7 +795,7 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, return false; }; - // If this instruction is inside a loop and sinking this instruction can make + // If this instruction is inside a cycle and sinking this instruction can make // more registers live range shorten, it is still profitable.
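The per-operand walk below leans on the isRegisterPressureSetExceedLimit lambda defined just above: sinking is rejected when it would push any register pressure set past its target-defined limit. A hedged sketch of that style of test, assuming the current per-set pressure for the destination block has already been computed:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    using namespace llvm;

    // Walk the (-1)-terminated list of pressure sets this register class
    // feeds and compare the measured pressure against each set's limit.
    static bool exceedsPressureLimit(const TargetRegisterInfo &TRI,
                                     const MachineFunction &MF,
                                     const TargetRegisterClass &RC,
                                     ArrayRef<unsigned> CurrentSetPressure) {
      for (const int *PS = TRI.getRegClassPressureSets(&RC); *PS != -1; ++PS)
        if (CurrentSetPressure[*PS] >= TRI.getRegPressureSetLimit(MF, *PS))
          return true;
      return false;
    }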
   for (const MachineOperand &MO : MI.operands()) {
     // Ignore non-register operands.
@@ -814,14 +823,17 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
       return false;
     } else {
       MachineInstr *DefMI = MRI->getVRegDef(Reg);
-      // DefMI is defined outside of loop. There should be no live range
-      // impact for this operand. Defination outside of loop means:
-      // 1: defination is outside of loop.
-      // 2: defination is in this loop, but it is a PHI in the loop header.
-      if (LI->getLoopFor(DefMI->getParent()) != ML ||
-          (DefMI->isPHI() && LI->isLoopHeader(DefMI->getParent())))
+      if (!DefMI)
+        continue;
+      MachineCycle *Cycle = CI->getCycle(DefMI->getParent());
+      // DefMI is defined outside of the cycle. There should be no live range
+      // impact for this operand. Definition outside of the cycle means:
+      // 1: the definition is outside of the cycle.
+      // 2: the definition is in this cycle, but it is a PHI in the cycle header.
+      if (Cycle != MCycle || (DefMI->isPHI() && Cycle && Cycle->isReducible() &&
+                              Cycle->getHeader() == DefMI->getParent()))
         continue;
-      // The DefMI is defined inside the loop.
+      // The DefMI is defined inside the cycle.
       // If sinking this operand makes some register pressure set exceed limit,
       // it is not profitable.
       if (isRegisterPressureSetExceedLimit(MRI->getRegClass(Reg))) {
@@ -831,8 +843,8 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
     }
   }

-  // If MI is in loop and all its operands are alive across the whole loop or if
-  // no operand sinking make register pressure set exceed limit, it is
+  // If MI is in a cycle and all its operands are alive across the whole cycle
+  // or if no operand sinking makes a register pressure set exceed its limit, it is
   // profitable to sink MI.
   return true;
 }
@@ -864,14 +876,14 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
       AllSuccs.push_back(DTChild->getBlock());
   }

-  // Sort Successors according to their loop depth or block frequency info.
+  // Sort Successors according to their cycle depth or block frequency info.
   llvm::stable_sort(
       AllSuccs, [this](const MachineBasicBlock *L, const MachineBasicBlock *R) {
         uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
         uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
         bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0;
         return HasBlockFreq ? LHSFreq < RHSFreq
-                            : LI->getLoopDepth(L) < LI->getLoopDepth(R);
+                            : CI->getCycleDepth(L) < CI->getCycleDepth(R);
       });

   auto it = AllSuccessors.insert(std::make_pair(MBB, AllSuccs));
@@ -886,7 +898,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
                                  AllSuccsCache &AllSuccessors) {
   assert (MBB && "Invalid MachineBasicBlock!");

-  // Loop over all the operands of the specified instruction. If there is
+  // Visit all the operands of the specified instruction. If there is
   // anything we can't handle, bail out.

   // SuccToSinkTo - This is the successor to sink this instruction to, once we
@@ -933,7 +945,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
   // Otherwise, we should look at all the successors and decide which one
   // we should sink to. If we have reliable block frequency information
   // (frequency != 0) available, give successors with smaller frequencies
-  // higher priority, otherwise prioritize smaller loop depths.
+  // higher priority, otherwise prioritize smaller cycle depths.
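// ---- [Editorial sketch: begin; not part of the patch] ----
// The ordering installed by llvm::stable_sort above, reduced to a standalone
// example with stand-in types: successors where both sides have reliable
// (non-zero) block frequency sort by frequency; everything else falls back
// to cycle depth, exactly as in GetAllSortedSuccessors.

#include <algorithm>
#include <cstdint>
#include <vector>

struct Succ { uint64_t Freq; unsigned CycleDepth; };

void sortSuccessors(std::vector<Succ> &Succs) {
  std::stable_sort(Succs.begin(), Succs.end(),
                   [](const Succ &L, const Succ &R) {
                     bool HasFreq = L.Freq != 0 && R.Freq != 0;
                     return HasFreq ? L.Freq < R.Freq
                                    : L.CycleDepth < R.CycleDepth;
                   });
}

// (The loop that follows walks the successors in exactly this order.)
// ---- [Editorial sketch: end] ----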
for (MachineBasicBlock *SuccBlock : GetAllSortedSuccessors(MI, MBB, AllSuccessors)) { bool LocalUse = false; @@ -956,7 +968,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, } // It is not possible to sink an instruction into its own block. This can - // happen with loops. + // happen with cycles. if (MBB == SuccToSinkTo) return nullptr; @@ -1081,8 +1093,7 @@ using MIRegs = std::pair>; /// Sink an instruction and its associated debug instructions. static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo, MachineBasicBlock::iterator InsertPos, - SmallVectorImpl &DbgValuesToSink) { - + ArrayRef DbgValuesToSink) { // If we cannot find a location to use (merge with), then we erase the debug // location to prevent debug-info driven tools from potentially reporting // wrong location information. @@ -1101,7 +1112,7 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo, // DBG_VALUE location as 'undef', indicating that any earlier variable // location should be terminated as we've optimised away the value at this // point. - for (auto DbgValueToSink : DbgValuesToSink) { + for (const auto &DbgValueToSink : DbgValuesToSink) { MachineInstr *DbgMI = DbgValueToSink.first; MachineInstr *NewDbgMI = DbgMI->getMF()->CloneMachineInstr(DbgMI); SuccToSinkTo.insert(InsertPos, NewDbgMI); @@ -1166,7 +1177,7 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From, // If this BB is too big or the block number in straight line between From // and To is too big, stop searching to save compiling time. - if (BB->size() > SinkLoadInstsPerBlockThreshold || + if (BB->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold) || HandledDomBlocks.size() > SinkLoadBlocksThreshold) { for (auto *DomBB : HandledDomBlocks) { if (DomBB != BB && DT->dominates(DomBB, BB)) @@ -1211,69 +1222,78 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From, return HasAliasedStore; } -/// Sink instructions into loops if profitable. This especially tries to prevent -/// register spills caused by register pressure if there is little to no -/// overhead moving instructions into loops. -bool MachineSinking::SinkIntoLoop(MachineLoop *L, MachineInstr &I) { - LLVM_DEBUG(dbgs() << "LoopSink: Finding sink block for: " << I); - MachineBasicBlock *Preheader = L->getLoopPreheader(); - assert(Preheader && "Loop sink needs a preheader block"); +/// Sink instructions into cycles if profitable. This especially tries to +/// prevent register spills caused by register pressure if there is little to no +/// overhead moving instructions into cycles. 
+bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) { + LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I); + MachineBasicBlock *Preheader = Cycle->getCyclePreheader(); + assert(Preheader && "Cycle sink needs a preheader block"); MachineBasicBlock *SinkBlock = nullptr; bool CanSink = true; const MachineOperand &MO = I.getOperand(0); for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) { - LLVM_DEBUG(dbgs() << "LoopSink: Analysing use: " << MI); - if (!L->contains(&MI)) { - LLVM_DEBUG(dbgs() << "LoopSink: Use not in loop, can't sink.\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI); + if (!Cycle->contains(MI.getParent())) { + LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n"); CanSink = false; break; } // FIXME: Come up with a proper cost model that estimates whether sinking - // the instruction (and thus possibly executing it on every loop + // the instruction (and thus possibly executing it on every cycle // iteration) is more expensive than a register. // For now assumes that copies are cheap and thus almost always worth it. if (!MI.isCopy()) { - LLVM_DEBUG(dbgs() << "LoopSink: Use is not a copy\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n"); CanSink = false; break; } if (!SinkBlock) { SinkBlock = MI.getParent(); - LLVM_DEBUG(dbgs() << "LoopSink: Setting sink block to: " + LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: " << printMBBReference(*SinkBlock) << "\n"); continue; } SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent()); if (!SinkBlock) { - LLVM_DEBUG(dbgs() << "LoopSink: Can't find nearest dominator\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n"); CanSink = false; break; } - LLVM_DEBUG(dbgs() << "LoopSink: Setting nearest common dom block: " << + LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: " << printMBBReference(*SinkBlock) << "\n"); } if (!CanSink) { - LLVM_DEBUG(dbgs() << "LoopSink: Can't sink instruction.\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n"); return false; } if (!SinkBlock) { - LLVM_DEBUG(dbgs() << "LoopSink: Not sinking, can't find sink block.\n"); + LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n"); return false; } if (SinkBlock == Preheader) { - LLVM_DEBUG(dbgs() << "LoopSink: Not sinking, sink block is the preheader\n"); + LLVM_DEBUG( + dbgs() << "CycleSink: Not sinking, sink block is the preheader\n"); return false; } - if (SinkBlock->size() > SinkLoadInstsPerBlockThreshold) { - LLVM_DEBUG(dbgs() << "LoopSink: Not Sinking, block too large to analyse.\n"); + if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) { + LLVM_DEBUG( + dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n"); return false; } - LLVM_DEBUG(dbgs() << "LoopSink: Sinking instruction!\n"); - SinkBlock->splice(SinkBlock->getFirstNonPHI(), Preheader, I); + LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n"); + SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader, + I); + + // Conservatively clear any kill flags on uses of sunk instruction + for (MachineOperand &MO : I.operands()) { + if (MO.isReg() && MO.readsReg()) + RegsToClearKillFlags.insert(MO.getReg()); + } // The instruction is moved from its basic block, so do not retain the // debug information. 
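// ---- [Editorial sketch: begin; not part of the patch] ----
// SinkIntoCycle above folds every use block into a single sink block via
// nearest-common-dominator queries. A self-contained version over a toy
// dominator tree (immediate-dominator pointer plus depth); none of it is
// LLVM API:

#include <vector>

struct Block { Block *IDom = nullptr; unsigned Depth = 0; };

// Classic NCD walk: lift the deeper node until the two meet.
Block *nearestCommonDominator(Block *A, Block *B) {
  while (A && B && A != B) {
    if (A->Depth >= B->Depth)
      A = A->IDom;
    else
      B = B->IDom;
  }
  return A == B ? A : nullptr;
}

Block *chooseSinkBlock(const std::vector<Block *> &UseBlocks) {
  Block *Sink = nullptr;
  for (Block *B : UseBlocks) {
    Sink = Sink ? nearestCommonDominator(Sink, B) : B;
    if (!Sink)
      return nullptr;   // no common dominator: cannot sink
  }
  return Sink;          // the caller still rejects the preheader itself
}
// ---- [Editorial sketch: end] ----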
@@ -1282,6 +1302,45 @@ bool MachineSinking::SinkIntoLoop(MachineLoop *L, MachineInstr &I) { return true; } +/// Return true if a target defined block prologue instruction interferes +/// with a sink candidate. +static bool blockPrologueInterferes(MachineBasicBlock *BB, + MachineBasicBlock::iterator End, + MachineInstr &MI, + const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, + const MachineRegisterInfo *MRI) { + if (BB->begin() == End) + return false; // no prologue + for (MachineBasicBlock::iterator PI = BB->getFirstNonPHI(); PI != End; ++PI) { + // Only check target defined prologue instructions + if (!TII->isBasicBlockPrologue(*PI)) + continue; + for (auto &MO : MI.operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isUse()) { + if (Register::isPhysicalRegister(Reg) && + (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg)))) + continue; + if (PI->modifiesRegister(Reg, TRI)) + return true; + } else { + if (PI->readsRegister(Reg, TRI)) + return true; + // Check for interference with non-dead defs + auto *DefOp = PI->findRegisterDefOperand(Reg, false, true, TRI); + if (DefOp && !DefOp->isDead()) + return true; + } + } + } + return false; +} + /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, @@ -1356,9 +1415,11 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, TryBreak = true; } - // Don't sink instructions into a loop. - if (!TryBreak && LI->isLoopHeader(SuccToSinkTo)) { - LLVM_DEBUG(dbgs() << " *** NOTE: Loop header found\n"); + // Don't sink instructions into a cycle. + if (!TryBreak && CI->getCycle(SuccToSinkTo) && + (!CI->getCycle(SuccToSinkTo)->isReducible() || + CI->getCycle(SuccToSinkTo)->getHeader() == SuccToSinkTo)) { + LLVM_DEBUG(dbgs() << " *** NOTE: cycle header found\n"); TryBreak = true; } @@ -1393,9 +1454,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, } // Determine where to insert into. Skip phi nodes. - MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin(); - while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI()) - ++InsertPos; + MachineBasicBlock::iterator InsertPos = + SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin()); + if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI)) { + LLVM_DEBUG(dbgs() << " *** Not sinking: prologue interference\n"); + return false; + } // Collect debug users of any vreg that this inst defines. SmallVector DbgUsersToSink; @@ -1684,14 +1748,6 @@ static bool hasRegisterDependency(MachineInstr *MI, return HasRegDependency; } -static SmallSet getRegUnits(MCRegister Reg, - const TargetRegisterInfo *TRI) { - SmallSet RegUnits; - for (auto RI = MCRegUnitIterator(Reg, TRI); RI.isValid(); ++RI) - RegUnits.insert(*RI); - return RegUnits; -} - bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1737,14 +1793,15 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, } // Record debug use of each reg unit. 
- SmallSet RegUnits = getRegUnits(MO.getReg(), TRI); - for (MCRegister Reg : RegUnits) - MIUnits[Reg].push_back(MO.getReg()); + for (auto RI = MCRegUnitIterator(MO.getReg(), TRI); RI.isValid(); + ++RI) + MIUnits[*RI].push_back(MO.getReg()); } } if (IsValid) { - for (auto RegOps : MIUnits) - SeenDbgInstrs[RegOps.first].push_back({&MI, RegOps.second}); + for (auto &RegOps : MIUnits) + SeenDbgInstrs[RegOps.first].emplace_back(&MI, + std::move(RegOps.second)); } continue; } @@ -1791,22 +1848,29 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, if (!MO.isReg() || !MO.isDef()) continue; - SmallSet Units = getRegUnits(MO.getReg(), TRI); - for (MCRegister Reg : Units) { - for (auto MIRegs : SeenDbgInstrs.lookup(Reg)) { + for (auto RI = MCRegUnitIterator(MO.getReg(), TRI); RI.isValid(); ++RI) { + for (const auto &MIRegs : SeenDbgInstrs.lookup(*RI)) { auto &Regs = DbgValsToSinkMap[MIRegs.first]; for (unsigned Reg : MIRegs.second) Regs.push_back(Reg); } } } - SmallVector DbgValsToSink(DbgValsToSinkMap.begin(), - DbgValsToSinkMap.end()); + auto DbgValsToSink = DbgValsToSinkMap.takeVector(); + + LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccBB); + + MachineBasicBlock::iterator InsertPos = + SuccBB->SkipPHIsAndLabels(SuccBB->begin()); + if (blockPrologueInterferes(SuccBB, InsertPos, MI, TRI, TII, nullptr)) { + LLVM_DEBUG( + dbgs() << " *** Not sinking: prologue interference\n"); + continue; + } // Clear the kill flag if SrcReg is killed between MI and the end of the // block. clearKillFlags(&MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI); - MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI(); performSink(MI, *SuccBB, InsertPos, DbgValsToSink); updateLiveIn(&MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy); diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 0803c2b8b85a..a85dbf1de1ee 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -12,29 +12,30 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineStableHash.h" -#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryLocation.h" -#include "llvm/CodeGen/MIRFormatter.h" -#include "llvm/CodeGen/MIRPrinter.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineInstrBundleIterator.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/StableHashing.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/ModuleSlotTracker.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/Target/TargetIntrinsicInfo.h" -#include 
"llvm/Target/TargetMachine.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "machine-stable-hash" @@ -64,7 +65,10 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_Register: if (Register::isVirtualRegister(MO.getReg())) { const MachineRegisterInfo &MRI = MO.getParent()->getMF()->getRegInfo(); - return MRI.getVRegDef(MO.getReg())->getOpcode(); + SmallVector DefOpcodes; + for (auto &Def : MRI.def_instructions(MO.getReg())) + DefOpcodes.push_back(Def.getOpcode()); + return hash_combine_range(DefOpcodes.begin(), DefOpcodes.end()); } // Register operands don't have target flags. @@ -192,3 +196,21 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs, return stable_hash_combine_range(HashComponents.begin(), HashComponents.end()); } + +stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) { + SmallVector HashComponents; + // TODO: Hash more stuff like block alignment and branch probabilities. + for (auto &MI : MBB) + HashComponents.push_back(stableHashValue(MI)); + return stable_hash_combine_range(HashComponents.begin(), + HashComponents.end()); +} + +stable_hash llvm::stableHashValue(const MachineFunction &MF) { + SmallVector HashComponents; + // TODO: Hash lots more stuff like function alignment and stack objects. + for (auto &MBB : MF) + HashComponents.push_back(stableHashValue(MBB)); + return stable_hash_combine_range(HashComponents.begin(), + HashComponents.end()); +} diff --git a/llvm/lib/CodeGen/MachineStripDebug.cpp b/llvm/lib/CodeGen/MachineStripDebug.cpp index 86cf4999d4b0..6128248a028e 100644 --- a/llvm/lib/CodeGen/MachineStripDebug.cpp +++ b/llvm/lib/CodeGen/MachineStripDebug.cpp @@ -10,10 +10,10 @@ /// tests can be debugified without affecting the output MIR. 
//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Debugify.h" diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c9d3e473062b..db04f2bcc095 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -32,10 +32,10 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervalCalc.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -48,6 +48,8 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -55,12 +57,14 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCTargetOptions.h" @@ -95,6 +99,7 @@ namespace { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const MachineRegisterInfo *MRI; + const RegisterBankInfo *RBI; unsigned foundErrors; @@ -370,6 +375,7 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { TM = &MF.getTarget(); TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); + RBI = MF.getSubtarget().getRegBankInfo(); MRI = &MF.getRegInfo(); const bool isFunctionFailedISel = MF.getProperties().hasProperty( @@ -442,7 +448,7 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); if (Op.getParent() != &MI) { - // Make sure to use correct addOperand / RemoveOperand / ChangeTo + // Make sure to use correct addOperand / removeOperand / ChangeTo // functions when replacing operands of a MachineInstr. report("Instruction has operand with wrong parent set", &MI); } @@ -1000,17 +1006,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } - if (MRI->getRegBankOrNull(Src) != MRI->getRegBankOrNull(Dst)) { - report( - Twine(OpcName, " source and destination register banks must match"), - MI); + const RegisterBank *SrcRB = RBI->getRegBank(Src, *MRI, *TRI); + const RegisterBank *DstRB = RBI->getRegBank(Dst, *MRI, *TRI); + + // Allow only the source bank to be set. 
+ if ((SrcRB && DstRB && SrcRB != DstRB) || (DstRB && !SrcRB)) { + report(Twine(OpcName, " cannot change register bank"), MI); break; } - if (MRI->getRegClassOrNull(Src) != MRI->getRegClassOrNull(Dst)) + // Don't allow a class change. Do allow member class->regbank. + const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(Dst); + if (DstRC && DstRC != MRI->getRegClassOrNull(Src)) { report( Twine(OpcName, " source and destination register classes must match"), MI); + break; + } break; } @@ -1072,6 +1084,18 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (ValTy.getSizeInBytes() < MMO.getSize()) report("store memory size cannot exceed value size", MI); } + + const AtomicOrdering Order = MMO.getSuccessOrdering(); + if (Opc == TargetOpcode::G_STORE) { + if (Order == AtomicOrdering::Acquire || + Order == AtomicOrdering::AcquireRelease) + report("atomic store cannot use acquire ordering", MI); + + } else { + if (Order == AtomicOrdering::Release || + Order == AtomicOrdering::AcquireRelease) + report("atomic load cannot use release ordering", MI); + } } break; @@ -1628,6 +1652,43 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { verifyAllRegOpsScalar(*MI, *MRI); break; } + case TargetOpcode::G_IS_FPCLASS: { + LLT DestTy = MRI->getType(MI->getOperand(0).getReg()); + LLT DestEltTy = DestTy.getScalarType(); + if (!DestEltTy.isScalar()) { + report("Destination must be a scalar or vector of scalars", MI); + break; + } + LLT SrcTy = MRI->getType(MI->getOperand(1).getReg()); + LLT SrcEltTy = SrcTy.getScalarType(); + if (!SrcEltTy.isScalar()) { + report("Source must be a scalar or vector of scalars", MI); + break; + } + if (!verifyVectorElementMatch(DestTy, SrcTy, MI)) + break; + const MachineOperand &TestMO = MI->getOperand(2); + if (!TestMO.isImm()) { + report("floating-point class set (operand 2) must be an immediate", MI); + break; + } + int64_t Test = TestMO.getImm(); + if (Test < 0 || Test > fcAllFlags) { + report("Incorrect floating-point class set (operand 2)", MI); + break; + } + const MachineOperand &SemanticsMO = MI->getOperand(3); + if (!SemanticsMO.isImm()) { + report("floating-point semantics (operand 3) must be an immediate", MI); + break; + } + int64_t Semantics = SemanticsMO.getImm(); + if (Semantics < 0 || Semantics > APFloat::S_MaxSemantics) { + report("Incorrect floating-point semantics (operand 3)", MI); + break; + } + break; + } default: break; } @@ -1912,6 +1973,10 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MRI->tracksLiveness() && !MI->isDebugInstr()) checkLiveness(MO, MONum); + if (MO->isDef() && MO->isUndef() && !MO->getSubReg() && + MO->getReg().isVirtual()) // TODO: Apply to physregs too + report("Undef virtual register def operands require a subregister", MO, MONum); + // Verify the consistency of tied operands. 
if (MO->isTied()) { unsigned OtherIdx = MI->findTiedOperandIdx(MONum); @@ -2148,6 +2213,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } break; + case MachineOperand::MO_CFIIndex: + if (MO->getCFIIndex() >= MF->getFrameInstructions().size()) + report("CFI instruction has invalid index", MO, MONum); + break; + default: break; } diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index b0760322064c..fa5df68b8abc 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -12,11 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MacroFusion.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index f91a9d2c3a32..3245d9649be1 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCContext.h" @@ -157,7 +158,7 @@ void ModuloScheduleExpander::generatePipelinedLoop() { SmallVector EpilogBBs; // Generate the epilog instructions to complete the pipeline. - generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs); + generateEpilog(MaxStageCount, KernelBB, BB, VRMap, EpilogBBs, PrologBBs); // We need this step because the register allocation doesn't handle some // situations well, so we insert copies to help out. @@ -239,11 +240,9 @@ void ModuloScheduleExpander::generateProlog(unsigned LastStage, /// Generate the pipeline epilog code. The epilog code finishes the iterations /// that were started in either the prolog or the kernel. We create a basic /// block for each stage that needs to complete. -void ModuloScheduleExpander::generateEpilog(unsigned LastStage, - MachineBasicBlock *KernelBB, - ValueMapTy *VRMap, - MBBVectorTy &EpilogBBs, - MBBVectorTy &PrologBBs) { +void ModuloScheduleExpander::generateEpilog( + unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB, + ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs) { // We need to change the branch from the kernel to the first epilog block, so // this call to analyze branch uses the kernel rather than the original BB. MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -313,7 +312,12 @@ void ModuloScheduleExpander::generateEpilog(unsigned LastStage, // Create a branch to the new epilog from the kernel. // Remove the original branch and add a new branch to the epilog. TII->removeBranch(*KernelBB); - TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc()); + assert((OrigBB == TBB || OrigBB == FBB) && + "Unable to determine looping branch direction"); + if (OrigBB != TBB) + TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc()); + else + TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc()); // Add a branch to the loop exit. 
if (EpilogBBs.size() > 0) { MachineBasicBlock *LastEpilogBB = EpilogBBs.back(); @@ -813,8 +817,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { break; for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) if (MI.getOperand(i + 1).getMBB() == Incoming) { - MI.RemoveOperand(i + 1); - MI.RemoveOperand(i); + MI.removeOperand(i + 1); + MI.removeOperand(i); break; } } @@ -846,7 +850,7 @@ void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, Optional StaticallyGreater = LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond); unsigned numAdded = 0; - if (!StaticallyGreater.hasValue()) { + if (!StaticallyGreater) { Prolog->addSuccessor(Epilog); numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc()); } else if (*StaticallyGreater == false) { @@ -999,7 +1003,7 @@ MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr( } /// Update the machine instruction with new virtual registers. This -/// function may change the defintions and/or uses. +/// function may change the definitions and/or uses. void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, bool LastDef, unsigned CurStageNum, @@ -1159,8 +1163,17 @@ void ModuloScheduleExpander::rewriteScheduledInstr( if (!InProlog && !Phi->isPHI() && StagePhi < StageSched) ReplaceReg = NewReg; if (ReplaceReg) { - MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg)); - UseOp.setReg(ReplaceReg); + const TargetRegisterClass *NRC = + MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg)); + if (NRC) + UseOp.setReg(ReplaceReg); + else { + Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY), + SplitReg) + .addReg(ReplaceReg); + UseOp.setReg(SplitReg); + } } } } @@ -1205,8 +1218,12 @@ void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI, MI.eraseFromParent(); Changed = true; } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) { - MRI.constrainRegClass(MI.getOperand(1).getReg(), - MRI.getRegClass(MI.getOperand(0).getReg())); + const TargetRegisterClass *ConstrainRegClass = + MRI.constrainRegClass(MI.getOperand(1).getReg(), + MRI.getRegClass(MI.getOperand(0).getReg())); + assert(ConstrainRegClass && + "Expected a valid constrained register class!"); + (void)ConstrainRegClass; MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); if (LIS) @@ -1404,7 +1421,7 @@ Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) { while (DefaultI != Defaults.rend()) LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg)); - if (IllegalPhiDefault.hasValue()) { + if (IllegalPhiDefault) { // The consumer optionally consumes LoopProducer in the same iteration // (because the producer is scheduled at an earlier cycle than the consumer) // or the initial value. To facilitate this we create an illegal block here @@ -1414,7 +1431,7 @@ Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) { Register R = MRI.createVirtualRegister(RC); MachineInstr *IllegalPhi = BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R) - .addReg(IllegalPhiDefault.getValue()) + .addReg(*IllegalPhiDefault) .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect. .addReg(LoopReg) .addMBB(BB); // Block choice is arbitrary and has no effect. 
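// ---- [Editorial sketch: begin; not part of the patch] ----
// Several ModuloSchedule hunks above adopt the same fallback: try to
// constrain the replacement register to the class the use requires, and if
// the classes are incompatible, route the value through a fresh register of
// the required class via a COPY. A toy model (register classes as bitmasks,
// nothing here is LLVM API):

#include <deque>
#include <utility>
#include <vector>

using ClassMask = unsigned;                 // one bit per register class
struct VReg { ClassMask Classes; };

// Pool owns registers (deque keeps pointers stable); Copies records every
// (Src, Dst) COPY that had to be materialized, mirroring the
// BuildMI(..., COPY, SplitReg) fallback in rewriteScheduledInstr.
VReg *constrainOrCopy(VReg *Repl, ClassMask Required, std::deque<VReg> &Pool,
                      std::vector<std::pair<VReg *, VReg *>> &Copies) {
  if (Repl->Classes & Required) {           // ~ constrainRegClass succeeds
    Repl->Classes &= Required;
    return Repl;
  }
  Pool.push_back(VReg{Required});           // ~ createVirtualRegister(OldRC)
  Copies.push_back({Repl, &Pool.back()});
  return &Pool.back();                      // rewrite the use to the split reg
}
// ---- [Editorial sketch: end] ----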
@@ -1430,7 +1447,7 @@ Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) { Register KernelRewriter::phi(Register LoopReg, Optional InitReg, const TargetRegisterClass *RC) { // If the init register is not undef, try and find an existing phi. - if (InitReg.hasValue()) { + if (InitReg) { auto I = Phis.find({LoopReg, InitReg.getValue()}); if (I != Phis.end()) return I->second; @@ -1446,7 +1463,7 @@ Register KernelRewriter::phi(Register LoopReg, Optional InitReg, auto I = UndefPhis.find(LoopReg); if (I != UndefPhis.end()) { Register R = I->second; - if (!InitReg.hasValue()) + if (!InitReg) // Found a phi taking undef as input, and this input is undef so return // without any more changes. return R; @@ -1454,7 +1471,10 @@ Register KernelRewriter::phi(Register LoopReg, Optional InitReg, MachineInstr *MI = MRI.getVRegDef(R); MI->getOperand(1).setReg(InitReg.getValue()); Phis.insert({{LoopReg, InitReg.getValue()}, R}); - MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + const TargetRegisterClass *ConstrainRegClass = + MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + assert(ConstrainRegClass && "Expected a valid constrained register class!"); + (void)ConstrainRegClass; UndefPhis.erase(I); return R; } @@ -1463,14 +1483,18 @@ Register KernelRewriter::phi(Register LoopReg, Optional InitReg, if (!RC) RC = MRI.getRegClass(LoopReg); Register R = MRI.createVirtualRegister(RC); - if (InitReg.hasValue()) - MRI.constrainRegClass(R, MRI.getRegClass(*InitReg)); + if (InitReg) { + const TargetRegisterClass *ConstrainRegClass = + MRI.constrainRegClass(R, MRI.getRegClass(*InitReg)); + assert(ConstrainRegClass && "Expected a valid constrained register class!"); + (void)ConstrainRegClass; + } BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R) - .addReg(InitReg.hasValue() ? *InitReg : undef(RC)) + .addReg(InitReg ? *InitReg : undef(RC)) .addMBB(PreheaderBB) .addReg(LoopReg) .addMBB(BB); - if (!InitReg.hasValue()) + if (!InitReg) UndefPhis[LoopReg] = R; else Phis[{LoopReg, *InitReg}] = R; @@ -1793,10 +1817,10 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { // Iterate in reverse order over all instructions, remapping as we go. for (MachineBasicBlock *B : reverse(Blocks)) { - for (auto I = B->getFirstInstrTerminator()->getReverseIterator(); + for (auto I = B->instr_rbegin(); I != std::next(B->getFirstNonPHI()->getReverseIterator());) { - MachineInstr *MI = &*I++; - rewriteUsesOf(MI); + MachineBasicBlock::reverse_instr_iterator MI = I++; + rewriteUsesOf(&*MI); } } for (auto *MI : IllegalPhisToDelete) { @@ -1919,7 +1943,7 @@ void PeelingModuloScheduleExpander::fixupBranches() { TII->removeBranch(*Prolog); Optional StaticallyGreater = LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond); - if (!StaticallyGreater.hasValue()) { + if (!StaticallyGreater) { LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n"); // Dynamically branch based on Cond. TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc()); @@ -1929,8 +1953,8 @@ void PeelingModuloScheduleExpander::fixupBranches() { // blocks. Leave it to unreachable-block-elim to clean up. 
Prolog->removeSuccessor(Fallthrough); for (MachineInstr &P : Fallthrough->phis()) { - P.RemoveOperand(2); - P.RemoveOperand(1); + P.removeOperand(2); + P.removeOperand(1); } TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc()); KernelDisposed = true; @@ -1939,8 +1963,8 @@ void PeelingModuloScheduleExpander::fixupBranches() { // Prolog always falls through; remove incoming values in epilog. Prolog->removeSuccessor(Epilog); for (MachineInstr &P : Epilog->phis()) { - P.RemoveOperand(4); - P.RemoveOperand(3); + P.removeOperand(4); + P.removeOperand(3); } } } diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp index db5217469fba..7304bfef55cb 100644 --- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp +++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp @@ -25,7 +25,7 @@ DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) { Entry.Symbol = nullptr; CurrentEndOffset += S.size() + 1; } - return DwarfStringPoolEntryRef(*I.first, true); + return DwarfStringPoolEntryRef(*I.first); } StringRef NonRelocatableStringpool::internString(StringRef S) { @@ -44,7 +44,7 @@ NonRelocatableStringpool::getEntriesForEmission() const { Result.reserve(Strings.size()); for (const auto &E : Strings) if (E.getValue().isIndexed()) - Result.emplace_back(E, true); + Result.emplace_back(E); llvm::sort(Result, [](const DwarfStringPoolEntryRef A, const DwarfStringPoolEntryRef B) { return A.getIndex() < B.getIndex(); diff --git a/llvm/lib/CodeGen/OptimizePHIs.cpp b/llvm/lib/CodeGen/OptimizePHIs.cpp index 8a6cf47c0d89..d5d262e4047a 100644 --- a/llvm/lib/CodeGen/OptimizePHIs.cpp +++ b/llvm/lib/CodeGen/OptimizePHIs.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 7693ab417de9..7709095cd683 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -31,9 +31,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Pass.h" diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp index 3e32afaafa6e..43b23368ead2 100644 --- a/llvm/lib/CodeGen/ParallelCG.cpp +++ b/llvm/lib/CodeGen/ParallelCG.cpp @@ -16,8 +16,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SplitModule.h" diff --git a/llvm/lib/CodeGen/PatchableFunction.cpp b/llvm/lib/CodeGen/PatchableFunction.cpp index ca44b7a53982..0f9da0637ced 100644 --- a/llvm/lib/CodeGen/PatchableFunction.cpp +++ b/llvm/lib/CodeGen/PatchableFunction.cpp @@ -14,11 +14,11 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/Passes.h" -#include 
"llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index f9b16d2630d6..31e37c4cd7e3 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -90,7 +90,6 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -214,8 +213,9 @@ namespace { const SmallSet &TargetReg, RecurrenceCycle &RC); - /// If copy instruction \p MI is a virtual register copy, track it in - /// the set \p CopyMIs. If this virtual register was previously seen as a + /// If copy instruction \p MI is a virtual register copy or a copy of a + /// constant physical register to a virtual register, track it in the + /// set \p CopyMIs. If this virtual register was previously seen as a /// copy, replace the uses of this copy with the previously seen copy's /// destination register. bool foldRedundantCopy(MachineInstr &MI, @@ -810,7 +810,7 @@ protected: unsigned CurrentSrcIdx = 0; ///< The index of the source being rewritten. public: Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {} - virtual ~Rewriter() {} + virtual ~Rewriter() = default; /// Get the next rewritable source (SrcReg, SrcSubReg) and /// the related value that it affects (DstReg, DstSubReg). @@ -1022,7 +1022,7 @@ public: CurrentSrcIdx = -1; // Rewrite the operation as a COPY. // Get rid of the sub-register index. - CopyLike.RemoveOperand(2); + CopyLike.removeOperand(2); // Morph the operation into a COPY. CopyLike.setDesc(TII.get(TargetOpcode::COPY)); return true; @@ -1412,7 +1412,7 @@ bool PeepholeOptimizer::foldRedundantCopy( Register SrcReg = MI.getOperand(1).getReg(); unsigned SrcSubReg = MI.getOperand(1).getSubReg(); - if (!SrcReg.isVirtual()) + if (!SrcReg.isVirtual() && !MRI->isConstantPhysReg(SrcReg)) return false; Register DstReg = MI.getOperand(0).getReg(); @@ -1643,8 +1643,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { // without any intervening re-definition of $physreg. DenseMap NAPhysToVirtMIs; - // Set of pairs of virtual registers and their subregs that are copied - // from. + // Set of copies to virtual registers keyed by source register. Never + // holds any physreg which requires def tracking. 
DenseMap CopySrcMIs; bool IsLoopHeader = MLI->isLoopHeader(&MBB); diff --git a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp index 82ed386db827..97b1532300b1 100644 --- a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp +++ b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -28,14 +28,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" using namespace llvm; #define DEBUG_TYPE "post-RA-hazard-rec" @@ -72,10 +69,11 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { TII->CreateTargetPostRAHazardRecognizer(Fn)); // Return if the target has not implemented a hazard recognizer. - if (!HazardRec.get()) + if (!HazardRec) return false; // Loop over all of the basic blocks + bool Changed = false; for (auto &MBB : Fn) { // We do not call HazardRec->reset() here to make sure we are handling noop // hazards at the start of basic blocks. @@ -85,6 +83,8 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { HazardRec->EmitNoops(NumPreNoops); TII->insertNoops(MBB, MachineBasicBlock::iterator(MI), NumPreNoops); NumNoops += NumPreNoops; + if (NumPreNoops) + Changed = true; HazardRec->EmitInstruction(&MI); if (HazardRec->atIssueLimit()) { @@ -92,5 +92,5 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { } } } - return true; + return Changed; } diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index aac46cb22084..98fc7e07a1b4 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -25,18 +25,16 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -72,7 +70,7 @@ DebugMod("postra-sched-debugmod", cl::desc("Debug control MBBs that are scheduled"), cl::init(0), cl::Hidden); -AntiDepBreaker::~AntiDepBreaker() { } +AntiDepBreaker::~AntiDepBreaker() = default; namespace { class PostRAScheduler : public MachineFunctionPass { diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 74b903f99284..1115c2a27956 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -18,10 +18,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include 
"llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index d232ca3a69c3..7327f9e52efc 100644 --- a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -11,10 +11,11 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -45,6 +46,11 @@ public: void getAnalysisUsage(AnalysisUsage &au) const override; bool runOnMachineFunction(MachineFunction &MF) override; + + virtual MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // end anonymous namespace @@ -124,7 +130,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { // Using instr wasn't found, it could be in another block. // Leave the physreg IMPLICIT_DEF, but trim any extra operands. for (unsigned i = MI->getNumOperands() - 1; i; --i) - MI->RemoveOperand(i); + MI->removeOperand(i); LLVM_DEBUG(dbgs() << "Keeping physreg: " << *MI); } @@ -140,7 +146,6 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form."); assert(WorkList.empty() && "Inconsistent worklist state"); for (MachineBasicBlock &MBB : MF) { diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 8d8a6126dad0..1a0f296d5fdc 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -55,10 +55,8 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -130,6 +128,7 @@ private: void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, int &SPAdj); void insertPrologEpilogCode(MachineFunction &MF); + void insertZeroCallUsedRegs(MachineFunction &MF); }; } // end anonymous namespace @@ -284,6 +283,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { assert(!Failed && "Invalid warn-stack-size fn attr value"); (void)Failed; } + if (MF.getFunction().hasFnAttribute(Attribute::SafeStack)) { + StackSize += MFI.getUnsafeStackSize(); + } if (StackSize > Threshold) { DiagnosticInfoStackSize DiagStackSize(F, StackSize, Threshold, DS_Warning); F.getContext().diagnose(DiagStackSize); @@ -837,8 +839,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // Adjust 'Offset' to point to the end of last fixed sized preallocated // object. for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) { - if (MFI.getStackID(i) != - TargetStackID::Default) // Only allocate objects on the default stack. + // Only allocate objects on the default stack. 
+ if (MFI.getStackID(i) != TargetStackID::Default) continue; int64_t FixedOff; @@ -855,47 +857,34 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { if (FixedOff > Offset) Offset = FixedOff; } + Align MaxAlign = MFI.getMaxAlign(); // First assign frame offsets to stack objects that are used to spill // callee saved registers. - if (StackGrowsDown && MaxCSFrameIndex >= MinCSFrameIndex) { - for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) { - if (MFI.getStackID(i) != - TargetStackID::Default) // Only allocate objects on the default stack. - continue; + if (MaxCSFrameIndex >= MinCSFrameIndex) { + for (unsigned i = 0; i <= MaxCSFrameIndex - MinCSFrameIndex; ++i) { + unsigned FrameIndex = + StackGrowsDown ? MinCSFrameIndex + i : MaxCSFrameIndex - i; - // If the stack grows down, we need to add the size to find the lowest - // address of the object. - Offset += MFI.getObjectSize(i); - - // Adjust to alignment boundary - Offset = alignTo(Offset, MFI.getObjectAlign(i), Skew); - - LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n"); - MFI.setObjectOffset(i, -Offset); // Set the computed offset - } - } else if (MaxCSFrameIndex >= MinCSFrameIndex) { - // Be careful about underflow in comparisons agains MinCSFrameIndex. - for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) { - if (MFI.getStackID(i) != - TargetStackID::Default) // Only allocate objects on the default stack. + // Only allocate objects on the default stack. + if (MFI.getStackID(FrameIndex) != TargetStackID::Default) continue; - if (MFI.isDeadObjectIndex(i)) + // TODO: should this just be if (MFI.isDeadObjectIndex(FrameIndex)) + if (!StackGrowsDown && MFI.isDeadObjectIndex(FrameIndex)) continue; - // Adjust to alignment boundary - Offset = alignTo(Offset, MFI.getObjectAlign(i), Skew); - - LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n"); - MFI.setObjectOffset(i, Offset); - Offset += MFI.getObjectSize(i); + AdjustStackOffset(MFI, FrameIndex, StackGrowsDown, Offset, MaxAlign, + Skew); } } + assert(MaxAlign == MFI.getMaxAlign() && + "MFI.getMaxAlign should already account for all callee-saved " + "registers without a fixed stack slot"); + // FixedCSEnd is the stack offset to the end of the fixed and callee-save // stack area. int64_t FixedCSEnd = Offset; - Align MaxAlign = MFI.getMaxAlign(); // Make sure the special register scavenging spill slot is closest to the // incoming stack pointer if a frame pointer is required and is closer @@ -982,8 +971,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { continue; if (StackProtectorFI == (int)i || EHRegNodeFrameIndex == (int)i) continue; - if (MFI.getStackID(i) != - TargetStackID::Default) // Only allocate objects on the default stack. + // Only allocate objects on the default stack. + if (MFI.getStackID(i) != TargetStackID::Default) continue; switch (MFI.getObjectSSPLayout(i)) { @@ -1036,8 +1025,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { continue; if (ProtectedObjs.count(i)) continue; - if (MFI.getStackID(i) != - TargetStackID::Default) // Only allocate objects on the default stack. + // Only allocate objects on the default stack. + if (MFI.getStackID(i) != TargetStackID::Default) continue; // Add the objects that we need to allocate to our working set. 
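// ---- [Editorial sketch: begin; not part of the patch] ----
// The offset arithmetic the removed code above spelled out inline (and which
// AdjustStackOffset now centralizes), ignoring the stack-skew parameter:

#include <cassert>
#include <cstdint>

uint64_t alignTo(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power-of-2 alignment");
  return (Value + Align - 1) & ~(Align - 1);
}

// Downward-growing stack: bump past the object, round to its alignment, and
// the object's SP-relative offset is the negated running total.
int64_t allocateDownward(uint64_t &Offset, uint64_t Size, uint64_t Align) {
  Offset += Size;                  // reach the lowest address of the object
  Offset = alignTo(Offset, Align); // adjust to alignment boundary
  return -static_cast<int64_t>(Offset);
}
// ---- [Editorial sketch: end] ----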
@@ -1145,6 +1134,9 @@ void PEI::insertPrologEpilogCode(MachineFunction &MF) { for (MachineBasicBlock *RestoreBlock : RestoreBlocks) TFI.emitEpilogue(MF, *RestoreBlock); + // Zero call used registers before restoring callee-saved registers. + insertZeroCallUsedRegs(MF); + for (MachineBasicBlock *SaveBlock : SaveBlocks) TFI.inlineStackProbe(MF, *SaveBlock); @@ -1155,11 +1147,7 @@ void PEI::insertPrologEpilogCode(MachineFunction &MF) { if (MF.shouldSplitStack()) { for (MachineBasicBlock *SaveBlock : SaveBlocks) TFI.adjustForSegmentedStacks(MF, *SaveBlock); - // Record that there are split-stack functions, so we will emit a - // special section to tell the linker. - MF.getMMI().setHasSplitStack(true); - } else - MF.getMMI().setHasNosplitStack(true); + } // Emit additional code that is required to explicitly handle the stack in // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The @@ -1171,6 +1159,120 @@ void PEI::insertPrologEpilogCode(MachineFunction &MF) { TFI.adjustForHiPEPrologue(MF, *SaveBlock); } +/// insertZeroCallUsedRegs - Zero out call used registers. +void PEI::insertZeroCallUsedRegs(MachineFunction &MF) { + const Function &F = MF.getFunction(); + + if (!F.hasFnAttribute("zero-call-used-regs")) + return; + + using namespace ZeroCallUsedRegs; + + ZeroCallUsedRegsKind ZeroRegsKind = + StringSwitch( + F.getFnAttribute("zero-call-used-regs").getValueAsString()) + .Case("skip", ZeroCallUsedRegsKind::Skip) + .Case("used-gpr-arg", ZeroCallUsedRegsKind::UsedGPRArg) + .Case("used-gpr", ZeroCallUsedRegsKind::UsedGPR) + .Case("used-arg", ZeroCallUsedRegsKind::UsedArg) + .Case("used", ZeroCallUsedRegsKind::Used) + .Case("all-gpr-arg", ZeroCallUsedRegsKind::AllGPRArg) + .Case("all-gpr", ZeroCallUsedRegsKind::AllGPR) + .Case("all-arg", ZeroCallUsedRegsKind::AllArg) + .Case("all", ZeroCallUsedRegsKind::All); + + if (ZeroRegsKind == ZeroCallUsedRegsKind::Skip) + return; + + const bool OnlyGPR = static_cast(ZeroRegsKind) & ONLY_GPR; + const bool OnlyUsed = static_cast(ZeroRegsKind) & ONLY_USED; + const bool OnlyArg = static_cast(ZeroRegsKind) & ONLY_ARG; + + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const BitVector AllocatableSet(TRI.getAllocatableSet(MF)); + + // Mark all used registers. + BitVector UsedRegs(TRI.getNumRegs()); + if (OnlyUsed) + for (const MachineBasicBlock &MBB : MF) + for (const MachineInstr &MI : MBB) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + MCRegister Reg = MO.getReg(); + if (AllocatableSet[Reg] && !MO.isImplicit() && + (MO.isDef() || MO.isUse())) + UsedRegs.set(Reg); + } + + BitVector RegsToZero(TRI.getNumRegs()); + for (MCRegister Reg : AllocatableSet.set_bits()) { + // Skip over fixed registers. + if (TRI.isFixedRegister(MF, Reg)) + continue; + + // Want only general purpose registers. + if (OnlyGPR && !TRI.isGeneralPurposeRegister(MF, Reg)) + continue; + + // Want only used registers. + if (OnlyUsed && !UsedRegs[Reg]) + continue; + + // Want only registers used for arguments. + if (OnlyArg && !TRI.isArgumentRegister(MF, Reg)) + continue; + + RegsToZero.set(Reg); + } + + // Don't clear registers that are live when leaving the function. 
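// ---- [Editorial sketch: begin; not part of the patch] ----
// The StringSwitch above decodes "zero-call-used-regs" into a kind whose
// bits can be tested independently. A standalone model with assumed bit
// assignments (the real values live in the ZeroCallUsedRegs definitions):

#include <cstdint>

namespace sketch {
enum : uint64_t { ONLY_GPR = 1u << 0, ONLY_USED = 1u << 1, ONLY_ARG = 1u << 2 };

enum class Kind : uint64_t {
  Skip       = 1u << 3,                     // distinct non-bitmask value
  Used       = ONLY_USED,
  UsedGPR    = ONLY_USED | ONLY_GPR,
  UsedArg    = ONLY_USED | ONLY_ARG,
  UsedGPRArg = ONLY_USED | ONLY_GPR | ONLY_ARG,
  All        = 0,                           // no restriction bits set
  AllGPR     = ONLY_GPR,
  AllArg     = ONLY_ARG,
  AllGPRArg  = ONLY_GPR | ONLY_ARG,
};

// Mirrors the OnlyGPR/OnlyUsed/OnlyArg tests in insertZeroCallUsedRegs.
inline bool onlyGPR(Kind K)  { return uint64_t(K) & ONLY_GPR; }
inline bool onlyUsed(Kind K) { return uint64_t(K) & ONLY_USED; }
inline bool onlyArg(Kind K)  { return uint64_t(K) & ONLY_ARG; }
} // namespace sketch

// (The loops that follow implement the "don't clear live-out registers"
// carve-out announced in the comment above.)
// ---- [Editorial sketch: end] ----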
+ for (const MachineBasicBlock &MBB : MF) + for (const MachineInstr &MI : MBB.terminators()) { + if (!MI.isReturn()) + continue; + + for (const auto &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + for (MCPhysReg SReg : TRI.sub_and_superregs_inclusive(MO.getReg())) + RegsToZero.reset(SReg); + } + } + + // Don't need to clear registers that are used/clobbered by terminating + // instructions. + for (const MachineBasicBlock &MBB : MF) { + if (!MBB.isReturnBlock()) + continue; + + MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator(); + for (MachineBasicBlock::const_iterator I = MBBI, E = MBB.end(); I != E; + ++I) { + for (const MachineOperand &MO : I->operands()) { + if (!MO.isReg()) + continue; + + for (const MCPhysReg &Reg : + TRI.sub_and_superregs_inclusive(MO.getReg())) + RegsToZero.reset(Reg); + } + } + } + + // Don't clear registers that are reset before exiting. + for (const CalleeSavedInfo &CSI : MF.getFrameInfo().getCalleeSavedInfo()) + for (MCRegister Reg : TRI.sub_and_superregs_inclusive(CSI.getReg())) + RegsToZero.reset(Reg); + + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + for (MachineBasicBlock &MBB : MF) + if (MBB.isReturnBlock()) + TFI.emitZeroCallUsedRegs(RegsToZero, MBB); +} + /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical /// register references and actual offsets. void PEI::replaceFrameIndices(MachineFunction &MF) { diff --git a/llvm/lib/CodeGen/PseudoProbeInserter.cpp b/llvm/lib/CodeGen/PseudoProbeInserter.cpp index 5f69f9194125..86ea3ec67178 100644 --- a/llvm/lib/CodeGen/PseudoProbeInserter.cpp +++ b/llvm/lib/CodeGen/PseudoProbeInserter.cpp @@ -18,11 +18,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCPseudoProbe.h" -#include "llvm/Target/TargetMachine.h" -#include #define DEBUG_TYPE "pseudo-probe-inserter" diff --git a/llvm/lib/CodeGen/PseudoSourceValue.cpp b/llvm/lib/CodeGen/PseudoSourceValue.cpp index 74e721dbd138..40c52b9d9707 100644 --- a/llvm/lib/CodeGen/PseudoSourceValue.cpp +++ b/llvm/lib/CodeGen/PseudoSourceValue.cpp @@ -11,26 +11,23 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + using namespace llvm; static const char *const PSVNames[] = { "Stack", "GOT", "JumpTable", "ConstantPool", "FixedStack", "GlobalValueCallEntry", "ExternalSymbolCallEntry"}; -PseudoSourceValue::PseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII) +PseudoSourceValue::PseudoSourceValue(unsigned Kind, const TargetMachine &TM) : Kind(Kind) { - AddressSpace = TII.getAddressSpaceForPseudoSourceKind(Kind); + AddressSpace = TM.getAddressSpaceForPseudoSourceKind(Kind); } - -PseudoSourceValue::~PseudoSourceValue() {} +PseudoSourceValue::~PseudoSourceValue() = default; void PseudoSourceValue::printCustom(raw_ostream &O) const { if (Kind < TargetCustom) @@ -79,9 +76,9 @@ void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const { OS << "FixedStack" << FI; } -CallEntryPseudoSourceValue::CallEntryPseudoSourceValue( - unsigned Kind, 
const TargetInstrInfo &TII) - : PseudoSourceValue(Kind, TII) {} +CallEntryPseudoSourceValue::CallEntryPseudoSourceValue(unsigned Kind, + const TargetMachine &TM) + : PseudoSourceValue(Kind, TM) {} bool CallEntryPseudoSourceValue::isConstant(const MachineFrameInfo *) const { return false; @@ -96,20 +93,17 @@ bool CallEntryPseudoSourceValue::mayAlias(const MachineFrameInfo *) const { } GlobalValuePseudoSourceValue::GlobalValuePseudoSourceValue( - const GlobalValue *GV, - const TargetInstrInfo &TII) - : CallEntryPseudoSourceValue(GlobalValueCallEntry, TII), GV(GV) {} + const GlobalValue *GV, const TargetMachine &TM) + : CallEntryPseudoSourceValue(GlobalValueCallEntry, TM), GV(GV) {} ExternalSymbolPseudoSourceValue::ExternalSymbolPseudoSourceValue( - const char *ES, const TargetInstrInfo &TII) - : CallEntryPseudoSourceValue(ExternalSymbolCallEntry, TII), ES(ES) {} + const char *ES, const TargetMachine &TM) + : CallEntryPseudoSourceValue(ExternalSymbolCallEntry, TM), ES(ES) {} -PseudoSourceValueManager::PseudoSourceValueManager( - const TargetInstrInfo &TIInfo) - : TII(TIInfo), - StackPSV(PseudoSourceValue::Stack, TII), - GOTPSV(PseudoSourceValue::GOT, TII), - JumpTablePSV(PseudoSourceValue::JumpTable, TII), - ConstantPoolPSV(PseudoSourceValue::ConstantPool, TII) {} +PseudoSourceValueManager::PseudoSourceValueManager(const TargetMachine &TMInfo) + : TM(TMInfo), StackPSV(PseudoSourceValue::Stack, TM), + GOTPSV(PseudoSourceValue::GOT, TM), + JumpTablePSV(PseudoSourceValue::JumpTable, TM), + ConstantPoolPSV(PseudoSourceValue::ConstantPool, TM) {} const PseudoSourceValue *PseudoSourceValueManager::getStack() { return &StackPSV; @@ -129,7 +123,7 @@ const PseudoSourceValue * PseudoSourceValueManager::getFixedStack(int FI) { std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI]; if (!V) - V = std::make_unique<FixedStackPseudoSourceValue>(FI, TII); + V = std::make_unique<FixedStackPseudoSourceValue>(FI, TM); return V.get(); } @@ -138,7 +132,7 @@ PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) { std::unique_ptr<const GlobalValuePseudoSourceValue> &E = GlobalCallEntries[GV]; if (!E) - E = std::make_unique<GlobalValuePseudoSourceValue>(GV, TII); + E = std::make_unique<GlobalValuePseudoSourceValue>(GV, TM); return E.get(); } @@ -147,6 +141,6 @@ PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) { std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E = ExternalCallEntries[ES]; if (!E) - E = std::make_unique<ExternalSymbolPseudoSourceValue>(ES, TII); + E = std::make_unique<ExternalSymbolPseudoSourceValue>(ES, TM); return E.get(); } diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index 882f8e91bf1d..ec383b9b1c65 100644 --- a/llvm/lib/CodeGen/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -8,6 +8,7 @@ // // Target-independent, SSA-based data flow graph for register data flow (RDF).
// +#include "llvm/CodeGen/RDFGraph.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -18,7 +19,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RDFGraph.h" #include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -27,8 +27,6 @@ #include "llvm/IR/Function.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -979,18 +977,6 @@ RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const { return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll()); } -RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const { - if (AR.Reg == BR.Reg) { - LaneBitmask M = AR.Mask & BR.Mask; - return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef(); - } - // This isn't strictly correct, because the overlap may happen in the - // part masked out. - if (PRI.alias(AR, BR)) - return AR; - return RegisterRef(); -} - // For each stack in the map DefM, push the delimiter for block B on it. void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) { // Push block delimiters. diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index d704cf7b3213..2fd947086b4d 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -22,6 +22,7 @@ // and Embedded Architectures and Compilers", 8 (4), // <10.1145/2086696.2086706>. // +#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -32,14 +33,12 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/CodeGen/RDFGraph.h" #include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -341,9 +340,8 @@ Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA, if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef)) continue; NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG); - if (Visited.count(PA.Id)) + if (!Visited.insert(PA.Id).second) continue; - Visited.insert(PA.Id); // Go over all phi uses and get the reaching defs for each use.
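[Editor's note] The RDFLiveness change above merges a Visited.count() test and the following insert() into a single call, branching on the bool of the pair that insert() returns. A minimal standalone sketch of the idiom, using std::unordered_set in place of LLVM's set types (the node IDs and successor graph are invented for illustration):

#include <cstdint>
#include <unordered_set>
#include <vector>

using NodeId = uint32_t;

// Visit each node at most once. insert() returns {iterator, inserted}, so one
// hash lookup both tests membership and records the visit, instead of the two
// lookups that count() followed by insert() would perform.
void visitAll(const std::vector<std::vector<NodeId>> &Succs, NodeId Root) {
  std::unordered_set<NodeId> Visited;
  std::vector<NodeId> Work{Root};
  while (!Work.empty()) {
    NodeId N = Work.back();
    Work.pop_back();
    if (!Visited.insert(N).second)
      continue; // Already seen; skip without a second lookup.
    for (NodeId S : Succs[N])
      Work.push_back(S);
  }
}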
for (auto U : PA.Addr->members_if(DFG.IsRef, DFG)) { const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs, diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 1264e6021b6e..69db8bad54f9 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -34,12 +34,7 @@ static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg, const TargetRegisterInfo *TRI) { if (!isValidRegUse(MO)) return false; - if (MO.getReg() == PhysReg) - return true; - for (MCRegAliasIterator R(PhysReg, TRI, false); R.isValid(); ++R) - if (MO.getReg() == *R) - return true; - return false; + return TRI->regsOverlap(MO.getReg(), PhysReg); } static bool isValidRegDef(const MachineOperand &MO) { @@ -50,12 +45,7 @@ static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg, const TargetRegisterInfo *TRI) { if (!isValidRegDef(MO)) return false; - if (MO.getReg() == PhysReg) - return true; - for (MCRegAliasIterator R(PhysReg, TRI, false); R.isValid(); ++R) - if (MO.getReg() == *R) - return true; - return false; + return TRI->regsOverlap(MO.getReg(), PhysReg); } void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index d891d4c2ffbb..0c18814189eb 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -85,7 +85,7 @@ void RegAllocBase::allocatePhysRegs() { seedLiveRegs(); // Continue assigning vregs one at a time to available physical registers. - while (LiveInterval *VirtReg = dequeue()) { + while (const LiveInterval *VirtReg = dequeue()) { assert(!VRM->hasPhys(VirtReg->reg()) && "Register already assigned"); // Unused registers can appear when the spiller coalesces snippets. @@ -140,10 +140,7 @@ void RegAllocBase::allocatePhysRegs() { // Keep going after reporting the error. VRM->assignVirt2Phys(VirtReg->reg(), AllocOrder.front()); - continue; - } - - if (AvailablePhysReg) + } else if (AvailablePhysReg) Matrix->assign(*VirtReg, AvailablePhysReg); for (Register Reg : SplitVRegs) { @@ -176,7 +173,7 @@ void RegAllocBase::postOptimization() { DeadRemats.clear(); } -void RegAllocBase::enqueue(LiveInterval *LI) { +void RegAllocBase::enqueue(const LiveInterval *LI) { const Register Reg = LI->reg(); assert(Reg.isVirtual() && "Can only enqueue virtual registers"); diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h index 1fb56dbaebb7..a8bf305a50c9 100644 --- a/llvm/lib/CodeGen/RegAllocBase.h +++ b/llvm/lib/CodeGen/RegAllocBase.h @@ -96,19 +96,19 @@ protected: virtual Spiller &spiller() = 0; /// enqueue - Add VirtReg to the priority queue of unassigned registers. - virtual void enqueueImpl(LiveInterval *LI) = 0; + virtual void enqueueImpl(const LiveInterval *LI) = 0; /// enqueue - Add VirtReg to the priority queue of unassigned registers. - void enqueue(LiveInterval *LI); + void enqueue(const LiveInterval *LI); /// dequeue - Return the next unassigned register, or NULL. - virtual LiveInterval *dequeue() = 0; + virtual const LiveInterval *dequeue() = 0; // A RegAlloc pass should override this to provide the allocation heuristics. // Each call must guarantee forward progess by returning an available PhysReg // or new set of split live virtual registers. It is up to the splitter to // converge quickly toward fully spilled live ranges. 
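[Editor's note] The ReachingDefAnalysis hunks above fold a manual walk over MCRegAliasIterator into one TRI->regsOverlap(MO.getReg(), PhysReg) query. A toy model of why a single overlap test is equivalent, with registers represented as invented bitmasks of "register units" (real TargetRegisterInfo derives the unit sets from tablegen data):

#include <cstdint>

// Hypothetical register-unit masks: two registers alias iff they share a unit,
// e.g. a 16-bit register shares its units with the 32-bit register holding it.
using RegUnitMask = uint64_t;

constexpr RegUnitMask EAX = 0b0011; // invented layout: AX's units plus more
constexpr RegUnitMask AX  = 0b0001;
constexpr RegUnitMask EBX = 0b1100;

// Stand-in for TargetRegisterInfo::regsOverlap: one intersection test replaces
// iterating every alias of one register and comparing it against the other.
constexpr bool regsOverlap(RegUnitMask A, RegUnitMask B) { return (A & B) != 0; }

static_assert(regsOverlap(EAX, AX), "sub-register aliases its super-register");
static_assert(!regsOverlap(AX, EBX), "disjoint registers do not alias");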
- virtual MCRegister selectOrSplit(LiveInterval &VirtReg, + virtual MCRegister selectOrSplit(const LiveInterval &VirtReg, SmallVectorImpl<Register> &splitLVRs) = 0; // Use this group name for NamedRegionTimer. @@ -116,7 +116,7 @@ protected: static const char TimerGroupDescription[]; /// Method called when the allocator is about to remove a LiveInterval. - virtual void aboutToRemoveInterval(LiveInterval &LI) {} + virtual void aboutToRemoveInterval(const LiveInterval &LI) {} public: /// VerifyEnabled - True when -verify-regalloc is given. diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index a9816b13e798..7defdf04aec8 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -22,9 +22,7 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/Spiller.h" @@ -33,7 +31,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace llvm; @@ -45,7 +42,7 @@ static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator", namespace { struct CompSpillWeight { - bool operator()(LiveInterval *A, LiveInterval *B) const { + bool operator()(const LiveInterval *A, const LiveInterval *B) const { return A->weight() < B->weight(); } }; @@ -65,8 +62,9 @@ class RABasic : public MachineFunctionPass, // state std::unique_ptr<Spiller> SpillerInstance; - std::priority_queue<LiveInterval*, std::vector<LiveInterval*>, - CompSpillWeight> Queue; + std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>, + CompSpillWeight> + Queue; // Scratch space. Allocated here to avoid repeated malloc calls in // selectOrSplit(). @@ -88,19 +86,17 @@ public: Spiller &spiller() override { return *SpillerInstance; } - void enqueueImpl(LiveInterval *LI) override { - Queue.push(LI); - } + void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); } - LiveInterval *dequeue() override { + const LiveInterval *dequeue() override { if (Queue.empty()) return nullptr; - LiveInterval *LI = Queue.top(); + const LiveInterval *LI = Queue.top(); Queue.pop(); return LI; } - MCRegister selectOrSplit(LiveInterval &VirtReg, + MCRegister selectOrSplit(const LiveInterval &VirtReg, SmallVectorImpl<Register> &SplitVRegs) override; /// Perform register allocation. @@ -119,7 +115,7 @@ public: // Helper for spilling all live virtual registers currently unified under preg // that interfere with the most recently queried lvr. Return true if spilling // was successful, and append any new spilled/split intervals to splitLVRs. - bool spillInterferences(LiveInterval &VirtReg, MCRegister PhysReg, + bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg, SmallVectorImpl<Register> &SplitVRegs); static char ID; @@ -208,16 +204,17 @@ void RABasic::releaseMemory() { // Spill or split all live virtual registers currently unified under PhysReg // that interfere with VirtReg. The newly spilled or split live intervals are // returned by appending them to SplitVRegs. -bool RABasic::spillInterferences(LiveInterval &VirtReg, MCRegister PhysReg, +bool RABasic::spillInterferences(const LiveInterval &VirtReg, + MCRegister PhysReg, SmallVectorImpl<Register> &SplitVRegs) { // Record each interference and determine if all are spillable before mutating // either the union or live intervals.
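[Editor's note] RABasic's Queue, declared in the hunk above, hands out the unassigned live interval with the largest spill weight first: std::priority_queue is a max-heap, and CompSpillWeight is its "less" comparator. A self-contained sketch with an invented Interval type:

#include <cstdio>
#include <queue>
#include <vector>

struct Interval { unsigned Reg; float Weight; };

struct CompSpillWeight {
  bool operator()(const Interval *A, const Interval *B) const {
    return A->Weight < B->Weight; // "less" comparator => max-heap on weight
  }
};

int main() {
  Interval V0{0, 1.5f}, V1{1, 8.0f}, V2{2, 0.25f};
  std::priority_queue<const Interval *, std::vector<const Interval *>,
                      CompSpillWeight>
      Queue;
  for (const Interval *I : {&V0, &V1, &V2})
    Queue.push(I);
  // Pops in decreasing weight: hard-to-spill ranges get registers first.
  while (!Queue.empty()) {
    std::printf("vreg%u weight %g\n", Queue.top()->Reg, Queue.top()->Weight);
    Queue.pop();
  }
}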
- SmallVector<LiveInterval*, 8> Intfs; + SmallVector<const LiveInterval *, 8> Intfs; // Collect interferences assigned to any alias of the physical register. for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - for (auto *Intf : reverse(Q.interferingVRegs())) { + for (const auto *Intf : reverse(Q.interferingVRegs())) { if (!Intf->isSpillable() || Intf->weight() > VirtReg.weight()) return false; Intfs.push_back(Intf); @@ -229,7 +226,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, MCRegister PhysReg, // Spill each interfering vreg allocated to PhysReg or an alias. for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { - LiveInterval &Spill = *Intfs[i]; + const LiveInterval &Spill = *Intfs[i]; // Skip duplicates. if (!VRM->hasPhys(Spill.reg())) @@ -258,7 +255,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, MCRegister PhysReg, // |vregs| * |machineregs|. And since the number of interference tests is // minimal, there is no value in caching them outside the scope of // selectOrSplit(). -MCRegister RABasic::selectOrSplit(LiveInterval &VirtReg, +MCRegister RABasic::selectOrSplit(const LiveInterval &VirtReg, SmallVectorImpl<Register> &SplitVRegs) { // Populate a list of physical register spill candidates. SmallVector<MCRegister, 8> PhysRegSpillCands; diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index fc5d1104a999..ee03feda796f 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -11,13 +11,14 @@ //===----------------------------------------------------------------------===// #include "RegAllocEvictionAdvisor.h" +#include "AllocationOrder.h" #include "RegAllocGreedy.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" @@ -25,7 +26,7 @@ using namespace llvm; static cl::opt<RegAllocEvictionAdvisorAnalysis::AdvisorMode> Mode( - "regalloc-enable-advisor", cl::Hidden, cl::ZeroOrMore, + "regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values( @@ -42,6 +43,14 @@ static cl::opt<bool> EnableLocalReassignment( "may be compile time intensive"), cl::init(false)); +cl::opt<unsigned> EvictInterferenceCutoff( + "regalloc-eviction-max-interference-cutoff", cl::Hidden, + cl::desc("Number of interferences after which we declare " + "an interference unevictable and bail out. This " + "is a compilation cost-saving consideration. To " + "disable, pass a very large number."), + cl::init(10)); + #define DEBUG_TYPE "regalloc" #ifdef LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL #define LLVM_HAVE_TF_AOT @@ -66,7 +75,7 @@ public: private: std::unique_ptr<RegAllocEvictionAdvisor> - getAdvisor(MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { return std::make_unique<DefaultEvictionAdvisor>(MF, RA); } bool doInitialization(Module &M) override { @@ -113,7 +122,7 @@ StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { llvm_unreachable("Unknown advisor kind"); } -RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(MachineFunction &MF, +RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA) : MF(MF), RA(RA), Matrix(RA.getInterferenceMatrix()), LIS(RA.getLiveIntervals()), VRM(RA.getVirtRegMap()), @@ -136,8 +145,8 @@ RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(MachineFunction &MF, /// register. /// @param B The live range to be evicted. /// @param BreaksHint True when B is already assigned to its preferred register. -bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint, - LiveInterval &B, +bool DefaultEvictionAdvisor::shouldEvict(const LiveInterval &A, bool IsHint, + const LiveInterval &B, bool BreaksHint) const { bool CanSplit = RA.getExtraInfo().getStage(B) < RS_Spill; @@ -156,7 +165,7 @@ bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint, /// canEvictHintInterference - return true if the interference for VirtReg /// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg. bool DefaultEvictionAdvisor::canEvictHintInterference( - LiveInterval &VirtReg, MCRegister PhysReg, + const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const { EvictionCost MaxCost; MaxCost.setBrokenHints(1); @@ -174,7 +183,7 @@ bool DefaultEvictionAdvisor::canEvictHintInterference( /// when returning true. /// @returns True when interference can be evicted cheaper than MaxCost. bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( - LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, + const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const { // It is only possible to evict virtual register interference. if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) @@ -195,12 +204,12 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); // If there is 10 or more interferences, chances are one is heavier. - const auto &Interferences = Q.interferingVRegs(10); - if (Interferences.size() >= 10) + const auto &Interferences = Q.interferingVRegs(EvictInterferenceCutoff); + if (Interferences.size() >= EvictInterferenceCutoff) return false; // Check if any interfering live range is heavier than MaxWeight. - for (LiveInterval *Intf : reverse(Interferences)) { + for (const LiveInterval *Intf : reverse(Interferences)) { assert(Register::isVirtualRegister(Intf->reg()) && "Only expecting virtual register interference from query"); @@ -227,7 +236,10 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( MRI->getRegClass(Intf->reg()))); // Only evict older cascades or live ranges without a cascade.
unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg()); - if (Cascade <= IntfCascade) { + if (Cascade == IntfCascade) + return false; + + if (Cascade < IntfCascade) { if (!Urgent) return false; // We permit breaking cascades for urgent evictions. It should be the @@ -261,7 +273,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( } MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( - LiveInterval &VirtReg, const AllocationOrder &Order, + const LiveInterval &VirtReg, const AllocationOrder &Order, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { // Keep track of the cheapest interference seen so far. EvictionCost BestCost; diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index 1f40386db8da..d57b0ca6d53d 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -9,19 +9,25 @@ #ifndef LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H #define LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H -#include "AllocationOrder.h" -#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Register.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Config/llvm-config.h" +#include "llvm/MC/MCRegister.h" #include "llvm/Pass.h" namespace llvm { +class AllocationOrder; +class LiveInterval; +class LiveIntervals; +class LiveRegMatrix; +class MachineFunction; +class MachineRegisterInfo; +class RegisterClassInfo; +class TargetRegisterInfo; +class VirtRegMap; using SmallVirtRegSet = SmallSet; @@ -99,15 +105,14 @@ public: /// Find a physical register that can be freed by evicting the FixedRegisters, /// or return NoRegister. The eviction decision is assumed to be correct (i.e. /// no fixed live ranges are evicted) and profitable. - virtual MCRegister - tryFindEvictionCandidate(LiveInterval &VirtReg, const AllocationOrder &Order, - uint8_t CostPerUseLimit, - const SmallVirtRegSet &FixedRegisters) const = 0; + virtual MCRegister tryFindEvictionCandidate( + const LiveInterval &VirtReg, const AllocationOrder &Order, + uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const = 0; /// Find out if we can evict the live ranges occupying the given PhysReg, /// which is a hint (preferred register) for VirtReg. virtual bool - canEvictHintInterference(LiveInterval &VirtReg, MCRegister PhysReg, + canEvictHintInterference(const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const = 0; /// Returns true if the given \p PhysReg is a callee saved register and has @@ -115,9 +120,9 @@ public: bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; protected: - RegAllocEvictionAdvisor(MachineFunction &MF, const RAGreedy &RA); + RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA); - Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; + Register canReassign(const LiveInterval &VirtReg, Register PrevReg) const; // Get the upper limit of elements in the given Order we need to analize. // TODO: is this heuristic, we could consider learning it. @@ -173,7 +178,7 @@ public: /// Get an advisor for the given context (i.e. 
machine function, etc) virtual std::unique_ptr - getAdvisor(MachineFunction &MF, const RAGreedy &RA) = 0; + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; AdvisorMode getAdvisorMode() const { return Mode; } protected: @@ -200,19 +205,20 @@ RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); // out of RegAllocGreedy.cpp class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor { public: - DefaultEvictionAdvisor(MachineFunction &MF, const RAGreedy &RA) + DefaultEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA) : RegAllocEvictionAdvisor(MF, RA) {} private: - MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &, - uint8_t, + MCRegister tryFindEvictionCandidate(const LiveInterval &, + const AllocationOrder &, uint8_t, const SmallVirtRegSet &) const override; - bool canEvictHintInterference(LiveInterval &, MCRegister, + bool canEvictHintInterference(const LiveInterval &, MCRegister, const SmallVirtRegSet &) const override; - bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, + bool canEvictInterferenceBasedOnCost(const LiveInterval &, MCRegister, bool, EvictionCost &, const SmallVirtRegSet &) const; - bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; + bool shouldEvict(const LiveInterval &A, bool, const LiveInterval &B, + bool) const; }; } // namespace llvm diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 6653145d3d2a..72ceaa768803 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -35,14 +35,9 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -364,7 +359,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) { // If this block loops back to itself, it is necessary to check whether the // use comes after the def. if (MBB->isSuccessor(MBB)) { - SelfLoopDef = MRI->getUniqueVRegDef(VirtReg); + // Find the first def in the self loop MBB. + for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) { + if (DefInst.getParent() != MBB) { + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); + return true; + } else { + if (!SelfLoopDef || dominates(*MBB, DefInst.getIterator(), SelfLoopDef)) + SelfLoopDef = &DefInst; + } + } if (!SelfLoopDef) { MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); return true; @@ -1117,6 +1121,12 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { RegMasks.clear(); BundleVirtRegsMap.clear(); + auto TiedOpIsUndef = [&](const MachineOperand &MO, unsigned Idx) { + assert(MO.isTied()); + unsigned TiedIdx = MI.findTiedOperandIdx(Idx); + const MachineOperand &TiedMO = MI.getOperand(TiedIdx); + return TiedMO.isUndef(); + }; // Scan for special cases; Apply pre-assigned register defs to state. 
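[Editor's note] The TiedOpIsUndef lambda introduced above looks up the operand tied to a def (via MachineInstr::findTiedOperandIdx) and treats the def as not live-through when that tied use is undef. A standalone model of the lookup with an invented operand record:

#include <cassert>
#include <vector>

struct Operand {
  bool IsTied = false;
  unsigned TiedTo = 0; // Index of the partner operand when IsTied.
  bool IsUndef = false;
};

// Model of the lambda: a tied def whose tied *use* is undef carries no live
// value into the instruction, so it need not be allocated live-through.
bool tiedOpIsUndef(const std::vector<Operand> &Ops, unsigned Idx) {
  assert(Ops[Idx].IsTied && "query only makes sense for tied operands");
  return Ops[Ops[Idx].TiedTo].IsUndef;
}

int main() {
  // Operand 0 (def) tied to operand 1 (use); the use is undef.
  std::vector<Operand> Ops{{true, 1, false}, {true, 0, true}};
  return tiedOpIsUndef(Ops, 0) ? 0 : 1;
}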
bool HasPhysRegUse = false; bool HasRegMask = false; @@ -1124,7 +1134,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { bool HasDef = false; bool HasEarlyClobber = false; bool NeedToAssignLiveThroughs = false; - for (MachineOperand &MO : MI.operands()) { + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + MachineOperand &MO = MI.getOperand(I); if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { @@ -1135,7 +1146,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { HasEarlyClobber = true; NeedToAssignLiveThroughs = true; } - if (MO.isTied() || (MO.getSubReg() != 0 && !MO.isUndef())) + if ((MO.isTied() && !TiedOpIsUndef(MO, I)) || + (MO.getSubReg() != 0 && !MO.isUndef())) NeedToAssignLiveThroughs = true; } } else if (Reg.isPhysical()) { @@ -1235,7 +1247,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { MachineOperand &MO = MI.getOperand(OpIdx); LLVM_DEBUG(dbgs() << "Allocating " << MO << '\n'); unsigned Reg = MO.getReg(); - if (MO.isEarlyClobber() || MO.isTied() || + if (MO.isEarlyClobber() || + (MO.isTied() && !TiedOpIsUndef(MO, OpIdx)) || (MO.getSubReg() && !MO.isUndef())) { defineLiveThroughVirtReg(MI, OpIdx, Reg); } else { @@ -1258,7 +1271,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free registers occupied by defs. // Iterate operands in reverse order, so we see the implicit super register // defs first (we added them earlier in case of ). - for (MachineOperand &MO : llvm::reverse(MI.operands())) { + for (signed I = MI.getNumOperands() - 1; I >= 0; --I) { + MachineOperand &MO = MI.getOperand(I); if (!MO.isReg() || !MO.isDef()) continue; @@ -1273,7 +1287,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { "tied def assigned to clobbered register"); // Do not free tied operands and early clobbers. 
- if (MO.isTied() || MO.isEarlyClobber()) + if ((MO.isTied() && !TiedOpIsUndef(MO, I)) || MO.isEarlyClobber()) continue; Register Reg = MO.getReg(); if (!Reg) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 7870574df5b2..2efb98ae200d 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -21,9 +21,7 @@ #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -62,6 +60,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" @@ -71,13 +70,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include #include #include -#include -#include -#include #include using namespace llvm; @@ -127,11 +122,18 @@ CSRFirstTimeCost("regalloc-csr-first-time-cost", cl::desc("Cost for first time use of callee-saved register."), cl::init(0), cl::Hidden); -static cl::opt ConsiderLocalIntervalCost( - "consider-local-interval-cost", cl::Hidden, - cl::desc("Consider the cost of local intervals created by a split " - "candidate when choosing the best split candidate."), - cl::init(false)); +static cl::opt GrowRegionComplexityBudget( + "grow-region-complexity-budget", + cl::desc("growRegion() does not scale with the number of BB edges, so " + "limit its budget and bail out once we reach the limit."), + cl::init(10000), cl::Hidden); + +static cl::opt GreedyRegClassPriorityTrumpsGlobalness( + "greedy-regclass-priority-trumps-globalness", + cl::desc("Change the greedy register allocator's live range priority " + "calculation to make the AllocationPriority of the register class " + "more important then whether the range is global"), + cl::Hidden); static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", createGreedyRegisterAllocator); @@ -277,9 +279,9 @@ void RAGreedy::releaseMemory() { GlobalCand.clear(); } -void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); } +void RAGreedy::enqueueImpl(const LiveInterval *LI) { enqueue(Queue, LI); } -void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { +void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) { // Prioritize live ranges by size, assigning larger ranges first. // The queue holds (size, reg) pairs. const unsigned Size = LI->getSize(); @@ -308,8 +310,10 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // prevents excessive spilling in pathological cases. bool ReverseLocal = TRI->reverseLocalAssignment(); const TargetRegisterClass &RC = *MRI->getRegClass(Reg); - bool ForceGlobal = !ReverseLocal && - (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC)); + bool ForceGlobal = + !ReverseLocal && (Size / SlotIndex::InstrDist) > + (2 * RegClassInfo.getNumAllocatableRegs(&RC)); + unsigned GlobalBit = 0; if (Stage == RS_Assign && !ForceGlobal && !LI->empty() && LIS->intervalIsInOneMBB(*LI)) { @@ -324,15 +328,18 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // large blocks on targets with many physical registers. 
Prio = Indexes->getZeroIndex().getInstrDistance(LI->endIndex()); } - Prio |= RC.AllocationPriority << 24; } else { // Allocate global and split ranges in long->short order. Long ranges that // don't fit should be spilled (or split) ASAP so they don't create // interference. Mark a bit to prioritize global above local ranges. - Prio = (1u << 29) + Size; - - Prio |= RC.AllocationPriority << 24; + Prio = Size; + GlobalBit = 1; } + if (RegClassPriorityTrumpsGlobalness) + Prio |= RC.AllocationPriority << 25 | GlobalBit << 24; + else + Prio |= GlobalBit << 29 | RC.AllocationPriority << 24; + // Mark a higher bit to prioritize global and local above RS_Split. Prio |= (1u << 31); @@ -345,9 +352,9 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { CurQueue.push(std::make_pair(Prio, ~Reg)); } -LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); } +const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); } -LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { +const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { if (CurQueue.empty()) return nullptr; LiveInterval *LI = &LIS->getInterval(~CurQueue.top().second); @@ -360,10 +367,10 @@ LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { //===----------------------------------------------------------------------===// /// tryAssign - Try to assign VirtReg to an available register. -MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, - AllocationOrder &Order, - SmallVectorImpl &NewVRegs, - const SmallVirtRegSet &FixedRegisters) { +MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg, + AllocationOrder &Order, + SmallVectorImpl &NewVRegs, + const SmallVirtRegSet &FixedRegisters) { MCRegister PhysReg; for (auto I = Order.begin(), E = Order.end(); I != E && !PhysReg; ++I) { assert(*I); @@ -413,7 +420,7 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, // Interference eviction //===----------------------------------------------------------------------===// -Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg, +Register RegAllocEvictionAdvisor::canReassign(const LiveInterval &VirtReg, Register PrevReg) const { auto Order = AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix); @@ -440,94 +447,11 @@ Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg, return PhysReg; } -/// Return true if all interferences between VirtReg and PhysReg between -/// Start and End can be evicted. -/// -/// \param VirtReg Live range that is about to be assigned. -/// \param PhysReg Desired register for assignment. -/// \param Start Start of range to look for interferences. -/// \param End End of range to look for interferences. -/// \param MaxCost Only look for cheaper candidates and update with new cost -/// when returning true. -/// \return True when interference can be evicted cheaper than MaxCost. -bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg, - MCRegister PhysReg, SlotIndex Start, - SlotIndex End, - EvictionCost &MaxCost) const { - EvictionCost Cost; - - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - - // Check if any interfering live range is heavier than MaxWeight. - for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) { - // Check if interference overlast the segment in interest. - if (!Intf->overlaps(Start, End)) - continue; - - // Cannot evict non virtual reg interference. 
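[Editor's note] The enqueue() change above makes the layout of the priority word explicit: a GlobalBit marks global ranges, and the new greedy-regclass-priority-trumps-globalness flag decides whether that bit or the register class's AllocationPriority occupies the more significant position. A compile-time sketch of the two layouts; the shift amounts follow the hunk, while the field widths are otherwise invented:

#include <cstdint>

// Pack a priority word as in the diff: bit 31 marks allocatable stages, and
// the flag chooses which field dominates the unsigned comparison.
constexpr uint32_t packPrio(uint32_t SizePrio, uint32_t RCPrio,
                            bool GlobalBit, bool ClassTrumpsGlobalness) {
  uint32_t Prio = SizePrio;
  if (ClassTrumpsGlobalness)
    Prio |= RCPrio << 25 | uint32_t(GlobalBit) << 24; // class outranks global
  else
    Prio |= uint32_t(GlobalBit) << 29 | RCPrio << 24; // global outranks class
  return Prio | (1u << 31);
}

// With the flag set, a high-priority class beats a merely-global range;
// with it clear, globalness wins, matching the previous behavior.
static_assert(packPrio(0, 3, false, true) > packPrio(0, 1, true, true),
              "class priority dominates when the flag is set");
static_assert(packPrio(0, 1, true, false) > packPrio(0, 3, false, false),
              "globalness dominates when the flag is clear");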
- if (!Register::isVirtualRegister(Intf->reg())) - return false; - // Never evict spill products. They cannot split or spill. - if (ExtraInfo->getStage(*Intf) == RS_Done) - return false; - - // Would this break a satisfied hint? - bool BreaksHint = VRM->hasPreferredPhys(Intf->reg()); - // Update eviction cost. - Cost.BrokenHints += BreaksHint; - Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight()); - // Abort if this would be too expensive. - if (!(Cost < MaxCost)) - return false; - } - } - - if (Cost.MaxWeight == 0) - return false; - - MaxCost = Cost; - return true; -} - -/// Return the physical register that will be best -/// candidate for eviction by a local split interval that will be created -/// between Start and End. -/// -/// \param Order The allocation order -/// \param VirtReg Live range that is about to be assigned. -/// \param Start Start of range to look for interferences -/// \param End End of range to look for interferences -/// \param BestEvictweight The eviction cost of that eviction -/// \return The PhysReg which is the best candidate for eviction and the -/// eviction cost in BestEvictweight -MCRegister RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order, - const LiveInterval &VirtReg, - SlotIndex Start, SlotIndex End, - float *BestEvictweight) const { - EvictionCost BestEvictCost; - BestEvictCost.setMax(); - BestEvictCost.MaxWeight = VirtReg.weight(); - MCRegister BestEvicteePhys; - - // Go over all physical registers and find the best candidate for eviction - for (MCRegister PhysReg : Order.getOrder()) { - - if (!canEvictInterferenceInRange(VirtReg, PhysReg, Start, End, - BestEvictCost)) - continue; - - // Best so far. - BestEvicteePhys = PhysReg; - } - *BestEvictweight = BestEvictCost.MaxWeight; - return BestEvicteePhys; -} - /// evictInterference - Evict any interferring registers that prevent VirtReg /// from being assigned to Physreg. This assumes that canEvictInterference /// returned true. -void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, +void RAGreedy::evictInterference(const LiveInterval &VirtReg, + MCRegister PhysReg, SmallVectorImpl &NewVRegs) { // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be @@ -538,25 +462,23 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, << " interference: Cascade " << Cascade << '\n'); // Collect all interfering virtregs first. - SmallVector Intfs; + SmallVector Intfs; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); // We usually have the interfering VRegs cached so collectInterferingVRegs() // should be fast, we may need to recalculate if when different physregs // overlap the same register unit so we had different SubRanges queried // against it. - ArrayRef IVR = Q.interferingVRegs(); + ArrayRef IVR = Q.interferingVRegs(); Intfs.append(IVR.begin(), IVR.end()); } // Evict them second. This will invalidate the queries. - for (LiveInterval *Intf : Intfs) { + for (const LiveInterval *Intf : Intfs) { // The same VirtReg may be present in multiple RegUnits. Skip duplicates. 
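[Editor's note] The eviction queries in these hunks accumulate an EvictionCost (broken hints, then maximum evictee weight) and abort a candidate as soon as !(Cost < MaxCost). A reduced model of that ordering; the field names come from the calls shown here, but the lexicographic operator< and the setMax() sentinel values are assumptions about code defined elsewhere:

#include <tuple>

struct EvictionCost {
  unsigned BrokenHints = 0; // Broken hints are always the more serious cost.
  float MaxWeight = 0;      // Maximum spill weight among the evictees.

  void setMax() { BrokenHints = ~0u; MaxWeight = 1e30f; } // "infinite" cost
  bool operator<(const EvictionCost &O) const {
    // Lexicographic: prefer evictions that break fewer hints, then lighter ones.
    return std::tie(BrokenHints, MaxWeight) <
           std::tie(O.BrokenHints, O.MaxWeight);
  }
};

// Usage mirroring the removed canEvictInterferenceInRange: keep a candidate
// only while its accumulated cost stays strictly cheaper than the best so far.
inline bool cheaper(const EvictionCost &Cost, const EvictionCost &Best) {
  return Cost < Best;
}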
if (!VRM->hasPhys(Intf->reg())) continue; - LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); - Matrix->unassign(*Intf); assert((ExtraInfo->getCascade(Intf->reg()) < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && @@ -624,7 +546,8 @@ bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit, /// @param VirtReg Currently unassigned virtual register. /// @param Order Physregs to try. /// @return Physreg to assign VirtReg, or 0. -MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order, +MCRegister RAGreedy::tryEvict(const LiveInterval &VirtReg, + AllocationOrder &Order, SmallVectorImpl &NewVRegs, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) { @@ -782,12 +705,17 @@ bool RAGreedy::growRegion(GlobalSplitCandidate &Cand) { unsigned Visited = 0; #endif + unsigned long Budget = GrowRegionComplexityBudget; while (true) { ArrayRef NewBundles = SpillPlacer->getRecentPositive(); // Find new through blocks in the periphery of PrefRegBundles. for (unsigned Bundle : NewBundles) { // Look at all blocks connected to Bundle in the full graph. ArrayRef Blocks = Bundles->getBlocks(Bundle); + // Limit compilation time by bailing out after we use all our budget. + if (Blocks.size() >= Budget) + return false; + Budget -= Blocks.size(); for (unsigned Block : Blocks) { if (!Todo.test(Block)) continue; @@ -887,147 +815,14 @@ BlockFrequency RAGreedy::calcSpillCost() { return Cost; } -/// Check if splitting Evictee will create a local split interval in -/// basic block number BBNumber that may cause a bad eviction chain. This is -/// intended to prevent bad eviction sequences like: -/// movl %ebp, 8(%esp) # 4-byte Spill -/// movl %ecx, %ebp -/// movl %ebx, %ecx -/// movl %edi, %ebx -/// movl %edx, %edi -/// cltd -/// idivl %esi -/// movl %edi, %edx -/// movl %ebx, %edi -/// movl %ecx, %ebx -/// movl %ebp, %ecx -/// movl 16(%esp), %ebp # 4 - byte Reload -/// -/// Such sequences are created in 2 scenarios: -/// -/// Scenario #1: -/// %0 is evicted from physreg0 by %1. -/// Evictee %0 is intended for region splitting with split candidate -/// physreg0 (the reg %0 was evicted from). -/// Region splitting creates a local interval because of interference with the -/// evictor %1 (normally region splitting creates 2 interval, the "by reg" -/// and "by stack" intervals and local interval created when interference -/// occurs). -/// One of the split intervals ends up evicting %2 from physreg1. -/// Evictee %2 is intended for region splitting with split candidate -/// physreg1. -/// One of the split intervals ends up evicting %3 from physreg2, etc. -/// -/// Scenario #2 -/// %0 is evicted from physreg0 by %1. -/// %2 is evicted from physreg2 by %3 etc. -/// Evictee %0 is intended for region splitting with split candidate -/// physreg1. -/// Region splitting creates a local interval because of interference with the -/// evictor %1. -/// One of the split intervals ends up evicting back original evictor %1 -/// from physreg0 (the reg %0 was evicted from). -/// Another evictee %2 is intended for region splitting with split candidate -/// physreg1. -/// One of the split intervals ends up evicting %3 from physreg2, etc. -/// -/// \param Evictee The register considered to be split. -/// \param Cand The split candidate that determines the physical register -/// we are splitting for and the interferences. -/// \param BBNumber The number of a BB for which the region split process will -/// create a local split interval. 
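[Editor's note] growRegion(), patched above, now spends a GrowRegionComplexityBudget (default 10000) as it visits blocks and returns false once the budget is exhausted, since the loop does not otherwise scale with the number of basic-block edges. The shape of that guard, on an invented worklist:

#include <cstddef>
#include <vector>

// Expand a region from a worklist of block groups, but bail out (returning
// false, "no region found") once the configured budget of visited blocks is
// spent, bounding compile time instead of blowing up on dense CFGs.
bool growRegionBounded(const std::vector<std::vector<unsigned>> &Work,
                       unsigned long Budget /* e.g. 10000, as in the diff */) {
  for (const std::vector<unsigned> &Blocks : Work) {
    if (Blocks.size() >= Budget)
      return false; // Out of budget: give up on this candidate region.
    Budget -= Blocks.size();
    // ... process Blocks here ...
  }
  return true;
}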
-/// \param Order The physical registers that may get evicted by a split -/// artifact of Evictee. -/// \return True if splitting Evictee may cause a bad eviction chain, false -/// otherwise. -bool RAGreedy::splitCanCauseEvictionChain(Register Evictee, - GlobalSplitCandidate &Cand, - unsigned BBNumber, - const AllocationOrder &Order) { - EvictionTrack::EvictorInfo VregEvictorInfo = LastEvicted.getEvictor(Evictee); - unsigned Evictor = VregEvictorInfo.first; - MCRegister PhysReg = VregEvictorInfo.second; - - // No actual evictor. - if (!Evictor || !PhysReg) - return false; - - float MaxWeight = 0; - MCRegister FutureEvictedPhysReg = - getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee), - Cand.Intf.first(), Cand.Intf.last(), &MaxWeight); - - // The bad eviction chain occurs when either the split candidate is the - // evicting reg or one of the split artifact will evict the evicting reg. - if ((PhysReg != Cand.PhysReg) && (PhysReg != FutureEvictedPhysReg)) - return false; - - Cand.Intf.moveToBlock(BBNumber); - - // Check to see if the Evictor contains interference (with Evictee) in the - // given BB. If so, this interference caused the eviction of Evictee from - // PhysReg. This suggest that we will create a local interval during the - // region split to avoid this interference This local interval may cause a bad - // eviction chain. - if (!LIS->hasInterval(Evictor)) - return false; - LiveInterval &EvictorLI = LIS->getInterval(Evictor); - if (EvictorLI.FindSegmentContaining(Cand.Intf.first()) == EvictorLI.end()) - return false; - - // Now, check to see if the local interval we will create is going to be - // expensive enough to evict somebody If so, this may cause a bad eviction - // chain. - float splitArtifactWeight = - VRAI->futureWeight(LIS->getInterval(Evictee), - Cand.Intf.first().getPrevIndex(), Cand.Intf.last()); - if (splitArtifactWeight >= 0 && splitArtifactWeight < MaxWeight) - return false; - - return true; -} - -/// Check if splitting VirtRegToSplit will create a local split interval -/// in basic block number BBNumber that may cause a spill. -/// -/// \param VirtRegToSplit The register considered to be split. -/// \param Cand The split candidate that determines the physical -/// register we are splitting for and the interferences. -/// \param BBNumber The number of a BB for which the region split process -/// will create a local split interval. -/// \param Order The physical registers that may get evicted by a -/// split artifact of VirtRegToSplit. -/// \return True if splitting VirtRegToSplit may cause a spill, false -/// otherwise. -bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit, - GlobalSplitCandidate &Cand, - unsigned BBNumber, - const AllocationOrder &Order) { - Cand.Intf.moveToBlock(BBNumber); - - // Check if the local interval will find a non interfereing assignment. - for (auto PhysReg : Order.getOrder()) { - if (!Matrix->checkInterference(Cand.Intf.first().getPrevIndex(), - Cand.Intf.last(), PhysReg)) - return false; - } - - // The local interval is not able to find non interferencing assignment - // and not able to evict a less worthy interval, therfore, it can cause a - // spill. - return true; -} - /// calcGlobalSplitCost - Return the global split cost of following the split /// pattern in LiveBundles. This cost should be added to the local cost of the /// interference pattern in SplitConstraints. 
/// BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand, - const AllocationOrder &Order, - bool *CanCauseEvictionChain) { + const AllocationOrder &Order) { BlockFrequency GlobalCost = 0; const BitVector &LiveBundles = Cand.LiveBundles; - Register VirtRegToSplit = SA->getParent().reg(); ArrayRef UseBlocks = SA->getUseBlocks(); for (unsigned I = 0; I != UseBlocks.size(); ++I) { const SplitAnalysis::BlockInfo &BI = UseBlocks[I]; @@ -1037,29 +832,6 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand, unsigned Ins = 0; Cand.Intf.moveToBlock(BC.Number); - // Check wheather a local interval is going to be created during the region - // split. Calculate adavanced spilt cost (cost of local intervals) if option - // is enabled. - if (EnableAdvancedRASplitCost && Cand.Intf.hasInterference() && BI.LiveIn && - BI.LiveOut && RegIn && RegOut) { - - if (CanCauseEvictionChain && - splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) { - // This interference causes our eviction from this assignment, we might - // evict somebody else and eventually someone will spill, add that cost. - // See splitCanCauseEvictionChain for detailed description of scenarios. - GlobalCost += SpillPlacer->getBlockFrequency(BC.Number); - GlobalCost += SpillPlacer->getBlockFrequency(BC.Number); - - *CanCauseEvictionChain = true; - - } else if (splitCanCauseLocalSpill(VirtRegToSplit, Cand, BC.Number, - Order)) { - // This interference causes local interval to spill, add that cost. - GlobalCost += SpillPlacer->getBlockFrequency(BC.Number); - GlobalCost += SpillPlacer->getBlockFrequency(BC.Number); - } - } if (BI.LiveIn) Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg); @@ -1080,20 +852,6 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand, if (Cand.Intf.hasInterference()) { GlobalCost += SpillPlacer->getBlockFrequency(Number); GlobalCost += SpillPlacer->getBlockFrequency(Number); - - // Check wheather a local interval is going to be created during the - // region split. - if (EnableAdvancedRASplitCost && CanCauseEvictionChain && - splitCanCauseEvictionChain(VirtRegToSplit, Cand, Number, Order)) { - // This interference cause our eviction from this assignment, we might - // evict somebody else, add that cost. - // See splitCanCauseEvictionChain for detailed description of - // scenarios. - GlobalCost += SpillPlacer->getBlockFrequency(Number); - GlobalCost += SpillPlacer->getBlockFrequency(Number); - - *CanCauseEvictionChain = true; - } } continue; } @@ -1253,7 +1011,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, MF->verify(this, "After splitting live range around region"); } -MCRegister RAGreedy::tryRegionSplit(LiveInterval &VirtReg, +MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl &NewVRegs) { if (!TRI->shouldRegionSplitForVirtReg(*MF, VirtReg)) @@ -1276,19 +1034,8 @@ MCRegister RAGreedy::tryRegionSplit(LiveInterval &VirtReg, MBFI->printBlockFreq(dbgs(), BestCost) << '\n'); } - bool CanCauseEvictionChain = false; - unsigned BestCand = - calculateRegionSplitCost(VirtReg, Order, BestCost, NumCands, - false /*IgnoreCSR*/, &CanCauseEvictionChain); - - // Split candidates with compact regions can cause a bad eviction sequence. - // See splitCanCauseEvictionChain for detailed description of scenarios. - // To avoid it, we need to comapre the cost with the spill cost and not the - // current max frequency. 
- if (HasCompact && (BestCost > SpillCost) && (BestCand != NoCand) && - CanCauseEvictionChain) { - return MCRegister::NoRegister; - } + unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost, + NumCands, false /*IgnoreCSR*/); // No solutions found, fall back to single block splitting. if (!HasCompact && BestCand == NoCand) @@ -1297,11 +1044,11 @@ MCRegister RAGreedy::tryRegionSplit(LiveInterval &VirtReg, return doRegionSplit(VirtReg, BestCand, HasCompact, NewVRegs); } -unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, +unsigned RAGreedy::calculateRegionSplitCost(const LiveInterval &VirtReg, AllocationOrder &Order, BlockFrequency &BestCost, - unsigned &NumCands, bool IgnoreCSR, - bool *CanCauseEvictionChain) { + unsigned &NumCands, + bool IgnoreCSR) { unsigned BestCand = NoCand; for (MCPhysReg PhysReg : Order) { assert(PhysReg); @@ -1364,8 +1111,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, continue; } - bool HasEvictionChain = false; - Cost += calcGlobalSplitCost(Cand, Order, &HasEvictionChain); + Cost += calcGlobalSplitCost(Cand, Order); LLVM_DEBUG({ dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost) << " with bundles"; @@ -1376,28 +1122,14 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, if (Cost < BestCost) { BestCand = NumCands; BestCost = Cost; - // See splitCanCauseEvictionChain for detailed description of bad - // eviction chain scenarios. - if (CanCauseEvictionChain) - *CanCauseEvictionChain = HasEvictionChain; } ++NumCands; } - if (CanCauseEvictionChain && BestCand != NoCand) { - // See splitCanCauseEvictionChain for detailed description of bad - // eviction chain scenarios. - LLVM_DEBUG(dbgs() << "Best split candidate of vreg " - << printReg(VirtReg.reg(), TRI) << " may "); - if (!(*CanCauseEvictionChain)) - LLVM_DEBUG(dbgs() << "not "); - LLVM_DEBUG(dbgs() << "cause bad eviction chain\n"); - } - return BestCand; } -unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, +unsigned RAGreedy::doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand, bool HasCompact, SmallVectorImpl &NewVRegs) { SmallVector UsedCands; @@ -1444,7 +1176,8 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, /// tryBlockSplit - Split a global live range around every block with uses. This /// creates a lot of local live ranges, that will be split by tryLocalSplit if /// they don't allocate. -unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, +unsigned RAGreedy::tryBlockSplit(const LiveInterval &VirtReg, + AllocationOrder &Order, SmallVectorImpl &NewVRegs) { assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed"); Register Reg = VirtReg.reg(); @@ -1507,9 +1240,9 @@ static unsigned getNumAllocatableRegsForConstraints( /// be moved to a larger register class. /// /// This is similar to spilling to a larger register class. -unsigned -RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, - SmallVectorImpl &NewVRegs) { +unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg, + AllocationOrder &Order, + SmallVectorImpl &NewVRegs) { const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg()); // There is no point to this if there are no larger sub-classes. 
if (!RegClassInfo.isProperSubClass(CurRC)) @@ -1529,7 +1262,8 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(CurRC, *MF); - unsigned SuperRCNumAllocatableRegs = RCI.getNumAllocatableRegs(SuperRC); + unsigned SuperRCNumAllocatableRegs = + RegClassInfo.getNumAllocatableRegs(SuperRC); // Split around every non-copy instruction if this split will relax // the constraints on the virtual register. // Otherwise, splitting just inserts uncoalescable copies that do not help @@ -1539,7 +1273,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, if (MI->isFullCopy() || SuperRCNumAllocatableRegs == getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC, - TII, TRI, RCI)) { + TII, TRI, RegClassInfo)) { LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI); continue; } @@ -1649,7 +1383,8 @@ void RAGreedy::calcGapWeights(MCRegister PhysReg, /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only /// basic block. /// -unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, +unsigned RAGreedy::tryLocalSplit(const LiveInterval &VirtReg, + AllocationOrder &Order, SmallVectorImpl &NewVRegs) { // TODO: the function currently only handles a single UseBlock; it should be // possible to generalize. @@ -1879,7 +1614,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, /// trySplit - Try to split VirtReg or one of its interferences, making it /// assignable. /// @return Physreg when VirtReg may be assigned and/or new NewVRegs. -unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, +unsigned RAGreedy::trySplit(const LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl &NewVRegs, const SmallVirtRegSet &FixedRegisters) { // Ranges must be Split2 or less. @@ -1928,6 +1663,18 @@ static bool hasTiedDef(MachineRegisterInfo *MRI, unsigned reg) { return false; } +/// Return true if the existing assignment of \p Intf overlaps, but is not the +/// same, as \p PhysReg. +static bool assignedRegPartiallyOverlaps(const TargetRegisterInfo &TRI, + const VirtRegMap &VRM, + MCRegister PhysReg, + const LiveInterval &Intf) { + MCRegister AssignedReg = VRM.getPhys(Intf.reg()); + if (PhysReg == AssignedReg) + return false; + return TRI.regsOverlap(PhysReg, AssignedReg); +} + /// mayRecolorAllInterferences - Check if the virtual registers that /// interfere with \p VirtReg on \p PhysReg (or one of its aliases) may be /// recolored to free \p PhysReg. @@ -1937,8 +1684,8 @@ static bool hasTiedDef(MachineRegisterInfo *MRI, unsigned reg) { /// \p FixedRegisters contains all the virtual registers that cannot be /// recolored. bool RAGreedy::mayRecolorAllInterferences( - MCRegister PhysReg, LiveInterval &VirtReg, SmallLISet &RecoloringCandidates, - const SmallVirtRegSet &FixedRegisters) { + MCRegister PhysReg, const LiveInterval &VirtReg, + SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters) { const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg()); for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { @@ -1952,13 +1699,21 @@ bool RAGreedy::mayRecolorAllInterferences( CutOffInfo |= CO_Interf; return false; } - for (LiveInterval *Intf : reverse(Q.interferingVRegs())) { - // If Intf is done and sit on the same register class as VirtReg, - // it would not be recolorable as it is in the same state as VirtReg. 
- // However, if VirtReg has tied defs and Intf doesn't, then + for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) { + // If Intf is done and sits on the same register class as VirtReg, it + // would not be recolorable as it is in the same state as + // VirtReg. However there are at least two exceptions. + // + // If VirtReg has tied defs and Intf doesn't, then // there is still a point in examining if it can be recolorable. + // + // Additionally, if the register class has overlapping tuple members, it + // may still be recolorable using a different tuple. This is more likely + // if the existing assignment aliases with the candidate. + // if (((ExtraInfo->getStage(*Intf) == RS_Done && - MRI->getRegClass(Intf->reg()) == CurRC) && + MRI->getRegClass(Intf->reg()) == CurRC && + !assignedRegPartiallyOverlaps(*TRI, *VRM, PhysReg, *Intf)) && !(hasTiedDef(MRI, VirtReg.reg()) && !hasTiedDef(MRI, Intf->reg()))) || FixedRegisters.count(Intf->reg())) { @@ -2008,18 +1763,26 @@ bool RAGreedy::mayRecolorAllInterferences( /// (split, spill) during the process and that must be assigned. /// \p FixedRegisters contains all the virtual registers that cannot be /// recolored. +/// +/// \p RecolorStack tracks the original assignments of successfully recolored +/// registers. +/// /// \p Depth gives the current depth of the last chance recoloring. /// \return a physical register that can be used for VirtReg or ~0u if none /// exists. -unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, +unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, + RecoloringStack &RecolorStack, unsigned Depth) { if (!TRI->shouldUseLastChanceRecoloringForVirtReg(*MF, VirtReg)) return ~0u; LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n'); + + const ssize_t EntryStackSize = RecolorStack.size(); + // Ranges must be Done. assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && "Last chance recoloring should really be last chance"); @@ -2035,9 +1798,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, // Set of Live intervals that will need to be recolored. SmallLISet RecoloringCandidates; - // Record the original mapping virtual register to physical register in case - // the recoloring fails. - DenseMap VirtRegToPhysReg; + // Mark VirtReg as fixed, i.e., it will not be recolored pass this point in // this recoloring "session". assert(!FixedRegisters.count(VirtReg.reg())); @@ -2049,7 +1810,6 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to " << printReg(PhysReg, TRI) << '\n'); RecoloringCandidates.clear(); - VirtRegToPhysReg.clear(); CurrentNewVRegs.clear(); // It is only possible to recolor virtual register interference. @@ -2069,18 +1829,19 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, continue; } - // RecoloringCandidates contains all the virtual registers that interfer - // with VirtReg on PhysReg (or one of its aliases). - // Enqueue them for recoloring and perform the actual recoloring. + // RecoloringCandidates contains all the virtual registers that interfere + // with VirtReg on PhysReg (or one of its aliases). Enqueue them for + // recoloring and perform the actual recoloring. 
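[Editor's note] The RecolorStack introduced in this hunk records (interval, physreg) pairs so a failed recoloring can unwind nested attempts: everything past the entry mark is unassigned in reverse, and only then are the recorded assignments restored (the two rollback loops appear just below). A generic sketch of that two-phase rollback over an invented assignment table:

#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

using VReg = unsigned;
using PhysReg = unsigned;

struct Allocation {
  std::unordered_map<VReg, PhysReg> Assigned;
  std::vector<std::pair<VReg, PhysReg>> Stack; // saved (vreg, old preg) pairs

  void recolor(VReg V, PhysReg New) {
    Stack.emplace_back(V, Assigned.at(V)); // record before overwriting
    Assigned[V] = New;
  }

  // Roll back to a previously captured stack depth. Unassign everything
  // first, then reassign: nested recolorings may hold registers that the
  // entries being restored need to reclaim.
  void rollBackTo(std::size_t Mark) {
    for (std::size_t I = Stack.size(); I-- > Mark;)
      Assigned.erase(Stack[I].first);
    for (std::size_t I = Mark; I != Stack.size(); ++I)
      Assigned[Stack[I].first] = Stack[I].second;
    Stack.resize(Mark);
  }
};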
PQueue RecoloringQueue; - for (LiveInterval *RC : RecoloringCandidates) { + for (const LiveInterval *RC : RecoloringCandidates) { Register ItVirtReg = RC->reg(); enqueue(RecoloringQueue, RC); assert(VRM->hasPhys(ItVirtReg) && "Interferences are supposed to be with allocated variables"); // Record the current allocation. - VirtRegToPhysReg[ItVirtReg] = VRM->getPhys(ItVirtReg); + RecolorStack.push_back(std::make_pair(RC, VRM->getPhys(ItVirtReg))); + // unset the related struct. Matrix->unassign(*RC); } @@ -2095,7 +1856,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, // at this point for the next physical register. SmallVirtRegSet SaveFixedRegisters(FixedRegisters); if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs, - FixedRegisters, Depth)) { + FixedRegisters, RecolorStack, Depth)) { // Push the queued vregs into the main queue. for (Register NewVReg : CurrentNewVRegs) NewVRegs.push_back(NewVReg); @@ -2122,13 +1883,31 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, NewVRegs.push_back(R); } - for (LiveInterval *RC : RecoloringCandidates) { - Register ItVirtReg = RC->reg(); - if (VRM->hasPhys(ItVirtReg)) - Matrix->unassign(*RC); - MCRegister ItPhysReg = VirtRegToPhysReg[ItVirtReg]; - Matrix->assign(*RC, ItPhysReg); + // Roll back our unsuccessful recoloring. Also roll back any successful + // recolorings in any recursive recoloring attempts, since it's possible + // they would have introduced conflicts with assignments we will be + // restoring further up the stack. Perform all unassignments prior to + // reassigning, since sub-recolorings may have conflicted with the registers + // we are going to restore to their original assignments. + for (ssize_t I = RecolorStack.size() - 1; I >= EntryStackSize; --I) { + const LiveInterval *LI; + MCRegister PhysReg; + std::tie(LI, PhysReg) = RecolorStack[I]; + + if (VRM->hasPhys(LI->reg())) + Matrix->unassign(*LI); } + + for (size_t I = EntryStackSize; I != RecolorStack.size(); ++I) { + const LiveInterval *LI; + MCRegister PhysReg; + std::tie(LI, PhysReg) = RecolorStack[I]; + if (!LI->empty() && !MRI->reg_nodbg_empty(LI->reg())) + Matrix->assign(*LI, PhysReg); + } + + // Pop the stack of recoloring attempts. + RecolorStack.resize(EntryStackSize); } // Last chance recoloring did not worked either, give up. @@ -2146,12 +1925,13 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, + RecoloringStack &RecolorStack, unsigned Depth) { while (!RecoloringQueue.empty()) { - LiveInterval *LI = dequeue(RecoloringQueue); + const LiveInterval *LI = dequeue(RecoloringQueue); LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n'); - MCRegister PhysReg = - selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, Depth + 1); + MCRegister PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, + RecolorStack, Depth + 1); // When splitting happens, the live-range may actually be empty. // In that case, this is okay to continue the recoloring even // if we did not find an alternative color for it. 
Indeed, @@ -2178,12 +1958,14 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue, // Main Entry Point //===----------------------------------------------------------------------===// -MCRegister RAGreedy::selectOrSplit(LiveInterval &VirtReg, +MCRegister RAGreedy::selectOrSplit(const LiveInterval &VirtReg, SmallVectorImpl &NewVRegs) { CutOffInfo = CO_None; LLVMContext &Ctx = MF->getFunction().getContext(); SmallVirtRegSet FixedRegisters; - MCRegister Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters); + RecoloringStack RecolorStack; + MCRegister Reg = + selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack); if (Reg == ~0U && (CutOffInfo != CO_None)) { uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf); if (CutOffEncountered == CO_Depth) @@ -2208,10 +1990,9 @@ MCRegister RAGreedy::selectOrSplit(LiveInterval &VirtReg, /// Spilling a live range in the cold path can have lower cost than using /// the CSR for the first time. Returns the physical register if we decide /// to use the CSR; otherwise return 0. -MCRegister -RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, - MCRegister PhysReg, uint8_t &CostPerUseLimit, - SmallVectorImpl &NewVRegs) { +MCRegister RAGreedy::tryAssignCSRFirstTime( + const LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg, + uint8_t &CostPerUseLimit, SmallVectorImpl &NewVRegs) { if (ExtraInfo->getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) { // We choose spill over using the CSR for the first time if the spill cost // is lower than CSRCost. @@ -2243,7 +2024,7 @@ RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, return PhysReg; } -void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) { +void RAGreedy::aboutToRemoveInterval(const LiveInterval &LI) { // Do not keep invalid information around. SetOfBrokenHints.remove(&LI); } @@ -2317,7 +2098,7 @@ BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List, /// For a given live range, profitability is determined by the sum of the /// frequencies of the non-identity copies it would introduce with the old /// and new register. -void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { +void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // We have a broken hint, check if it is possible to fix it by // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted // some register and PhysReg may be available for the other live-ranges. @@ -2431,7 +2212,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { /// This is likely that we can assign the same register for b, c, and d, /// getting rid of 2 copies. void RAGreedy::tryHintsRecoloring() { - for (LiveInterval *LI : SetOfBrokenHints) { + for (const LiveInterval *LI : SetOfBrokenHints) { assert(Register::isVirtualRegister(LI->reg()) && "Recoloring is possible only for virtual registers"); // Some dead defs may be around (e.g., because of debug uses). @@ -2442,9 +2223,10 @@ void RAGreedy::tryHintsRecoloring() { } } -MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, +MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, + RecoloringStack &RecolorStack, unsigned Depth) { uint8_t CostPerUseLimit = uint8_t(~0u); // First try assigning a free register. 
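The rollback in tryLastChanceRecoloring above is deliberately two-phase: every assignment recorded since entry is first cleared, and only then are the original assignments restored, so an interim assignment made by a nested recoloring can never overlap a register that is about to be restored. A minimal stand-alone model of that ordering, with toy types rather than the patch's LLVM classes:

// Illustrative sketch only -- models the two rollback loops above.
#include <cstddef>
#include <utility>
#include <vector>

using Interval = int;
using PhysReg = unsigned;
using RecoloringStack = std::vector<std::pair<Interval, PhysReg>>;

void rollBack(RecoloringStack &Stack, size_t EntryStackSize,
              std::vector<PhysReg> &Assignment) {
  // Phase 1: clear every assignment recorded since entry, newest first.
  for (size_t I = Stack.size(); I-- > EntryStackSize;)
    Assignment[Stack[I].first] = 0; // unassign
  // Phase 2: restore the recorded assignments. Running this only after all
  // unassignments mirrors the patch's "unassign before reassigning" rule.
  for (size_t I = EntryStackSize; I != Stack.size(); ++I)
    Assignment[Stack[I].first] = Stack[I].second;
  // Pop this attempt's records.
  Stack.resize(EntryStackSize);
}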
@@ -2452,8 +2234,6 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix); if (MCRegister PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) { - // If VirtReg got an assignment, the eviction info is no longer relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg()); // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical // register. @@ -2488,9 +2268,6 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // copy-related live-ranges. if (Hint && Hint != PhysReg) SetOfBrokenHints.insert(&VirtReg); - // If VirtReg eviction someone, the eviction info for it as an evictee is - // no longer relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg()); return PhysReg; } @@ -2510,18 +2287,16 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Try splitting VirtReg or interferences. unsigned NewVRegSizeBefore = NewVRegs.size(); Register PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters); - if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore)) { - // If VirtReg got split, the eviction info is no longer relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg()); + if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore)) return PhysReg; - } } // If we couldn't allocate a register from spilling, there is probably some // invalid inline assembly. The base class will report it. - if (Stage >= RS_Done || !VirtReg.isSpillable()) + if (Stage >= RS_Done || !VirtReg.isSpillable()) { return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters, - Depth); + RecolorStack, Depth); + } // Finally spill VirtReg itself. if ((EnableDeferredSpilling || @@ -2713,19 +2488,27 @@ void RAGreedy::reportStats() { } } +bool RAGreedy::hasVirtRegAlloc() { + for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (MRI->reg_nodbg_empty(Reg)) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + if (!RC) + continue; + if (ShouldAllocateClass(*TRI, *RC)) + return true; + } + + return false; +} + bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n" << "********** Function: " << mf.getName() << '\n'); MF = &mf; - TRI = MF->getSubtarget().getRegisterInfo(); TII = MF->getSubtarget().getInstrInfo(); - RCI.runOnMachineFunction(mf); - - EnableAdvancedRASplitCost = - ConsiderLocalIntervalCost.getNumOccurrences() - ? ConsiderLocalIntervalCost - : MF->getSubtarget().enableAdvancedRASplitCost(); if (VerifyEnabled) MF->verify(this, "Before greedy register allocator"); @@ -2733,6 +2516,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { RegAllocBase::init(getAnalysis(), getAnalysis(), getAnalysis()); + + // Early return if there is no virtual register to be allocated to a + // physical register. + if (!hasVirtRegAlloc()) + return false; + Indexes = &getAnalysis(); MBFI = &getAnalysis(); DomTree = &getAnalysis(); @@ -2746,6 +2535,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { initializeCSRCost(); RegCosts = TRI->getRegisterCosts(*MF); + RegClassPriorityTrumpsGlobalness = + GreedyRegClassPriorityTrumpsGlobalness.getNumOccurrences() + ? 
GreedyRegClassPriorityTrumpsGlobalness
+          : TRI->regClassPriorityTrumpsGlobalness(*MF);
  ExtraInfo.emplace();
  EvictAdvisor =
@@ -2764,7 +2557,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
  IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
  GlobalCand.resize(32); // This will grow as needed.
  SetOfBrokenHints.clear();
- LastEvicted.clear();
  allocatePhysRegs();
  tryHintsRecoloring();
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index e9a5fe635f26..358e74541a54 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -12,9 +12,7 @@
 #ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_
 #define LLVM_CODEGEN_REGALLOCGREEDY_H_
-#include "AllocationOrder.h"
 #include "InterferenceCache.h"
-#include "LiveDebugVariables.h"
 #include "RegAllocBase.h"
 #include "RegAllocEvictionAdvisor.h"
 #include "SpillPlacement.h"
@@ -23,52 +21,44 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
-#include "llvm/CodeGen/EdgeBundles.h"
 #include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervalUnion.h"
-#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
-#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/LiveStacks.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/Spiller.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Target/TargetMachine.h"
 #include
-#include
 #include
 #include
 #include
-#include
 #include
 namespace llvm {
+class AllocationOrder;
+class AnalysisUsage;
+class EdgeBundles;
+class LiveDebugVariables;
+class LiveIntervals;
+class LiveRegMatrix;
+class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
+class MachineDominatorTree;
+class MachineLoop;
+class MachineLoopInfo;
+class MachineOptimizationRemarkEmitter;
+class MachineOptimizationRemarkMissed;
+class SlotIndex;
+class SlotIndexes;
+class TargetInstrInfo;
+class VirtRegMap;
+
 class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
                                          public RegAllocBase,
                                          private LiveRangeEdit::Delegate {
@@ -162,15 +152,18 @@ public:
 private:
  // Convenient shortcuts.
  using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
- using SmallLISet = SmallPtrSet<LiveInterval *, 4>;
+ using SmallLISet = SmallPtrSet<const LiveInterval *, 4>;
+
+ // We need to track all tentative recolorings so we can roll back any
+ // successful and unsuccessful recoloring attempts.
+ using RecoloringStack =
+     SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
  // context
  MachineFunction *MF;
  // Shortcuts to some useful interface.
const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RCI; // analyses SlotIndexes *Indexes; @@ -210,57 +203,6 @@ private: static const char *const StageName[]; #endif - /// EvictionTrack - Keeps track of past evictions in order to optimize region - /// split decision. - class EvictionTrack { - - public: - using EvictorInfo = - std::pair; - using EvicteeInfo = llvm::DenseMap; - - private: - /// Each Vreg that has been evicted in the last stage of selectOrSplit will - /// be mapped to the evictor Vreg and the PhysReg it was evicted from. - EvicteeInfo Evictees; - - public: - /// Clear all eviction information. - void clear() { Evictees.clear(); } - - /// Clear eviction information for the given evictee Vreg. - /// E.g. when Vreg get's a new allocation, the old eviction info is no - /// longer relevant. - /// \param Evictee The evictee Vreg for whom we want to clear collected - /// eviction info. - void clearEvicteeInfo(Register Evictee) { Evictees.erase(Evictee); } - - /// Track new eviction. - /// The Evictor vreg has evicted the Evictee vreg from Physreg. - /// \param PhysReg The physical register Evictee was evicted from. - /// \param Evictor The evictor Vreg that evicted Evictee. - /// \param Evictee The evictee Vreg. - void addEviction(MCRegister PhysReg, Register Evictor, Register Evictee) { - Evictees[Evictee].first = Evictor; - Evictees[Evictee].second = PhysReg; - } - - /// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg. - /// \param Evictee The evictee vreg. - /// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if - /// nobody has evicted Evictee from PhysReg. - EvictorInfo getEvictor(Register Evictee) { - if (Evictees.count(Evictee)) { - return Evictees[Evictee]; - } - - return EvictorInfo(0, 0); - } - }; - - // Keeps track of past evictions in order to optimize region split decision. - EvictionTrack LastEvicted; - // splitting state. std::unique_ptr SA; std::unique_ptr SE; @@ -320,17 +262,17 @@ private: /// Callee-save register cost, calculated once per machine function. BlockFrequency CSRCost; - /// Enable or not the consideration of the cost of local intervals created - /// by a split candidate when choosing the best split candidate. - bool EnableAdvancedRASplitCost; - /// Set of broken hints that may be reconciled later because of eviction. - SmallSetVector SetOfBrokenHints; + SmallSetVector SetOfBrokenHints; /// The register cost values. This list will be recreated for each Machine /// Function ArrayRef RegCosts; + /// Flags for the live range priority calculation, determined once per + /// machine function. + bool RegClassPriorityTrumpsGlobalness; + public: RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); @@ -341,11 +283,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; void releaseMemory() override; Spiller &spiller() override { return *SpillerInstance; } - void enqueueImpl(LiveInterval *LI) override; - LiveInterval *dequeue() override; - MCRegister selectOrSplit(LiveInterval &, + void enqueueImpl(const LiveInterval *LI) override; + const LiveInterval *dequeue() override; + MCRegister selectOrSplit(const LiveInterval &, SmallVectorImpl &) override; - void aboutToRemoveInterval(LiveInterval &) override; + void aboutToRemoveInterval(const LiveInterval &) override; /// Perform register allocation. 
bool runOnMachineFunction(MachineFunction &mf) override; @@ -363,81 +305,70 @@ public: static char ID; private: - MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl &, - SmallVirtRegSet &, unsigned = 0); + MCRegister selectOrSplitImpl(const LiveInterval &, + SmallVectorImpl &, SmallVirtRegSet &, + RecoloringStack &, unsigned = 0); bool LRE_CanEraseVirtReg(Register) override; void LRE_WillShrinkVirtReg(Register) override; void LRE_DidCloneVirtReg(Register, Register) override; - void enqueue(PQueue &CurQueue, LiveInterval *LI); - LiveInterval *dequeue(PQueue &CurQueue); + void enqueue(PQueue &CurQueue, const LiveInterval *LI); + const LiveInterval *dequeue(PQueue &CurQueue); + bool hasVirtRegAlloc(); BlockFrequency calcSpillCost(); bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency &); bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef); bool growRegion(GlobalSplitCandidate &Cand); - bool splitCanCauseEvictionChain(Register Evictee, GlobalSplitCandidate &Cand, - unsigned BBNumber, - const AllocationOrder &Order); - bool splitCanCauseLocalSpill(unsigned VirtRegToSplit, - GlobalSplitCandidate &Cand, unsigned BBNumber, - const AllocationOrder &Order); BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &, - const AllocationOrder &Order, - bool *CanCauseEvictionChain); + const AllocationOrder &Order); bool calcCompactRegion(GlobalSplitCandidate &); void splitAroundRegion(LiveRangeEdit &, ArrayRef); void calcGapWeights(MCRegister, SmallVectorImpl &); - bool canEvictInterferenceInRange(const LiveInterval &VirtReg, - MCRegister PhysReg, SlotIndex Start, - SlotIndex End, EvictionCost &MaxCost) const; - MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order, - const LiveInterval &VirtReg, - SlotIndex Start, SlotIndex End, - float *BestEvictWeight) const; - void evictInterference(LiveInterval &, MCRegister, + void evictInterference(const LiveInterval &, MCRegister, SmallVectorImpl &); - bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg, + bool mayRecolorAllInterferences(MCRegister PhysReg, + const LiveInterval &VirtReg, SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters); - MCRegister tryAssign(LiveInterval &, AllocationOrder &, + MCRegister tryAssign(const LiveInterval &, AllocationOrder &, SmallVectorImpl &, const SmallVirtRegSet &); - MCRegister tryEvict(LiveInterval &, AllocationOrder &, + MCRegister tryEvict(const LiveInterval &, AllocationOrder &, SmallVectorImpl &, uint8_t, const SmallVirtRegSet &); - MCRegister tryRegionSplit(LiveInterval &, AllocationOrder &, + MCRegister tryRegionSplit(const LiveInterval &, AllocationOrder &, SmallVectorImpl &); /// Calculate cost of region splitting. - unsigned calculateRegionSplitCost(LiveInterval &VirtReg, + unsigned calculateRegionSplitCost(const LiveInterval &VirtReg, AllocationOrder &Order, BlockFrequency &BestCost, - unsigned &NumCands, bool IgnoreCSR, - bool *CanCauseEvictionChain = nullptr); + unsigned &NumCands, bool IgnoreCSR); /// Perform region splitting. - unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, + unsigned doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand, bool HasCompact, SmallVectorImpl &NewVRegs); /// Check other options before using a callee-saved register for the first /// time. 
- MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg, + MCRegister tryAssignCSRFirstTime(const LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg, uint8_t &CostPerUseLimit, SmallVectorImpl &NewVRegs); void initializeCSRCost(); - unsigned tryBlockSplit(LiveInterval &, AllocationOrder &, + unsigned tryBlockSplit(const LiveInterval &, AllocationOrder &, SmallVectorImpl &); - unsigned tryInstructionSplit(LiveInterval &, AllocationOrder &, + unsigned tryInstructionSplit(const LiveInterval &, AllocationOrder &, SmallVectorImpl &); - unsigned tryLocalSplit(LiveInterval &, AllocationOrder &, + unsigned tryLocalSplit(const LiveInterval &, AllocationOrder &, SmallVectorImpl &); - unsigned trySplit(LiveInterval &, AllocationOrder &, + unsigned trySplit(const LiveInterval &, AllocationOrder &, SmallVectorImpl &, const SmallVirtRegSet &); - unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &, + unsigned tryLastChanceRecoloring(const LiveInterval &, AllocationOrder &, SmallVectorImpl &, - SmallVirtRegSet &, unsigned); + SmallVirtRegSet &, RecoloringStack &, + unsigned); bool tryRecoloringCandidates(PQueue &, SmallVectorImpl &, - SmallVirtRegSet &, unsigned); - void tryHintRecoloring(LiveInterval &); + SmallVirtRegSet &, RecoloringStack &, unsigned); + void tryHintRecoloring(const LiveInterval &); void tryHintsRecoloring(); /// Model the information carried by one end of a copy. diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index 93be8f689d57..8c262130fb70 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -847,6 +847,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { while (!PBQPAllocComplete) { LLVM_DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n"); + (void) Round; PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI)); initializeGraph(G, VRM, *VRegSpiller); diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index 740890831617..32fa5e07dd16 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -13,19 +13,19 @@ //===----------------------------------------------------------------------===// #include "RegAllocScore.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundleIterator.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include -#include -#include -#include +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; cl::opt CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); diff --git a/llvm/lib/CodeGen/RegAllocScore.h b/llvm/lib/CodeGen/RegAllocScore.h index 3c28bb61189d..2bcd0b5895bf 100644 --- a/llvm/lib/CodeGen/RegAllocScore.h +++ b/llvm/lib/CodeGen/RegAllocScore.h @@ -15,21 +15,16 @@ #ifndef LLVM_CODEGEN_REGALLOCSCORE_H_ #define LLVM_CODEGEN_REGALLOCSCORE_H_ -#include "llvm/ADT/DenseMap.h" -#include 
"llvm/ADT/SetVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Analysis/Utils/TFUtils.h" -#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/IR/Module.h" -#include -#include -#include +#include "llvm/ADT/STLFunctionalExtras.h" namespace llvm { +class AAResults; +class MachineBasicBlock; +class MachineBlockFrequencyInfo; +class MachineFunction; +class MachineInstr; + /// Regalloc score. class RegAllocScore final { double CopyCounts = 0.0; diff --git a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp index 5a79ac44dcf4..16afd15e29e4 100644 --- a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp @@ -17,16 +17,15 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterUsageInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/TargetFrameLowering.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp index 800d952469a5..d356962e0d78 100644 --- a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp +++ b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp @@ -19,8 +19,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -29,7 +29,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/RegisterBank.cpp b/llvm/lib/CodeGen/RegisterBank.cpp new file mode 100644 index 000000000000..512b21aeacaf --- /dev/null +++ b/llvm/lib/CodeGen/RegisterBank.cpp @@ -0,0 +1,110 @@ +//===- llvm/CodeGen/GlobalISel/RegisterBank.cpp - Register Bank --*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the RegisterBank class. 
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "registerbank"
+
+using namespace llvm;
+
+const unsigned RegisterBank::InvalidID = UINT_MAX;
+
+RegisterBank::RegisterBank(
+    unsigned ID, const char *Name, unsigned Size,
+    const uint32_t *CoveredClasses, unsigned NumRegClasses)
+    : ID(ID), Name(Name), Size(Size) {
+  ContainedRegClasses.resize(NumRegClasses);
+  ContainedRegClasses.setBitsInMask(CoveredClasses);
+}
+
+bool RegisterBank::verify(const TargetRegisterInfo &TRI) const {
+  assert(isValid() && "Invalid register bank");
+  for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) {
+    const TargetRegisterClass &RC = *TRI.getRegClass(RCId);
+
+    if (!covers(RC))
+      continue;
+    // Verify that the register bank covers all the sub classes of the
+    // classes it covers.
+
+    // Use a different (slow in that case) method than
+    // RegisterBankInfo to find the subclasses of RC, to make sure
+    // both agree on the covers.
+    for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) {
+      const TargetRegisterClass &SubRC = *TRI.getRegClass(SubRCId);
+
+      if (!RC.hasSubClassEq(&SubRC))
+        continue;
+
+      // Verify that the Size of the register bank is big enough to cover
+      // all the register classes it covers.
+      assert(getSize() >= TRI.getRegSizeInBits(SubRC) &&
+             "Size is not big enough for all the subclasses!");
+      assert(covers(SubRC) && "Not all subclasses are covered");
+    }
+  }
+  return true;
+}
+
+bool RegisterBank::covers(const TargetRegisterClass &RC) const {
+  assert(isValid() && "RB hasn't been initialized yet");
+  return ContainedRegClasses.test(RC.getID());
+}
+
+bool RegisterBank::isValid() const {
+  return ID != InvalidID && Name != nullptr && Size != 0 &&
+         // A register bank that does not cover anything is useless.
+         !ContainedRegClasses.empty();
+}
+
+bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
+  // There must be only one instance of a given register bank alive
+  // for the whole compilation.
+  // The RegisterBankInfo is supposed to enforce that.
+  assert((OtherRB.getID() != getID() || &OtherRB == this) &&
+         "ID does not uniquely identify a RegisterBank");
+  return &OtherRB == this;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
+  print(dbgs(), /* IsForDebug */ true, TRI);
+}
+#endif
+
+void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
+                         const TargetRegisterInfo *TRI) const {
+  OS << getName();
+  if (!IsForDebug)
+    return;
+  OS << "(ID:" << getID() << ", Size:" << getSize() << ")\n"
+     << "isValid:" << isValid() << '\n'
+     << "Number of Covered register classes: " << ContainedRegClasses.count()
+     << '\n';
+  // Print all the subclasses if we can.
+  // These register classes may not be properly initialized yet.
+ if (!TRI || ContainedRegClasses.empty()) + return; + assert(ContainedRegClasses.size() == TRI->getNumRegClasses() && + "TRI does not match the initialization process?"); + OS << "Covered register classes:\n"; + ListSeparator LS; + for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) { + const TargetRegisterClass &RC = *TRI->getRegClass(RCId); + + if (covers(RC)) + OS << LS << TRI->getRegClassName(&RC); + } +} diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp new file mode 100644 index 000000000000..de851ffc7fdc --- /dev/null +++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp @@ -0,0 +1,802 @@ +//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.cpp --------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the RegisterBankInfo class. +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/RegisterBankInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include // For std::max. + +#define DEBUG_TYPE "registerbankinfo" + +using namespace llvm; + +STATISTIC(NumPartialMappingsCreated, + "Number of partial mappings dynamically created"); +STATISTIC(NumPartialMappingsAccessed, + "Number of partial mappings dynamically accessed"); +STATISTIC(NumValueMappingsCreated, + "Number of value mappings dynamically created"); +STATISTIC(NumValueMappingsAccessed, + "Number of value mappings dynamically accessed"); +STATISTIC(NumOperandsMappingsCreated, + "Number of operands mappings dynamically created"); +STATISTIC(NumOperandsMappingsAccessed, + "Number of operands mappings dynamically accessed"); +STATISTIC(NumInstructionMappingsCreated, + "Number of instruction mappings dynamically created"); +STATISTIC(NumInstructionMappingsAccessed, + "Number of instruction mappings dynamically accessed"); + +const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX; +const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; + +//------------------------------------------------------------------------------ +// RegisterBankInfo implementation. 
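The created/accessed STATISTIC pairs above count a hash-consing scheme: each mapping is built once per distinct key, cached behind a stable pointer, and every later query returns the same immutable object. A minimal stand-alone sketch of the pattern, with a toy hash and a made-up PartialMappingModel in place of the real hash_combine-keyed PartialMapping cache:

// Illustrative sketch only -- the cache pattern behind the statistics above.
#include <cstdint>
#include <memory>
#include <unordered_map>

struct PartialMappingModel { // stand-in for the real PartialMapping
  unsigned StartIdx, Length;
};

static unsigned NumCreated = 0, NumAccessed = 0;

const PartialMappingModel &getPartialMappingModel(
    std::unordered_map<uint64_t, std::unique_ptr<PartialMappingModel>> &Map,
    unsigned StartIdx, unsigned Length) {
  ++NumAccessed;
  uint64_t Hash = (uint64_t(StartIdx) << 32) | Length; // toy hash
  auto &Slot = Map[Hash];
  if (!Slot) { // first time this key is seen: materialize and cache
    ++NumCreated;
    Slot.reset(new PartialMappingModel{StartIdx, Length});
  }
  // The unique_ptr indirection keeps the returned reference stable even
  // when the map rehashes.
  return *Slot;
}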
+//------------------------------------------------------------------------------ +RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, + unsigned NumRegBanks) + : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { + assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); + assert(RegBanks[Idx]->isValid() && "RegisterBank should be valid"); + } +#endif // NDEBUG +} + +bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { + const RegisterBank &RegBank = getRegBank(Idx); + assert(Idx == RegBank.getID() && + "ID does not match the index in the array"); + LLVM_DEBUG(dbgs() << "Verify " << RegBank << '\n'); + assert(RegBank.verify(TRI) && "RegBank is invalid"); + } +#endif // NDEBUG + return true; +} + +const RegisterBank * +RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + if (Register::isPhysicalRegister(Reg)) { + // FIXME: This was probably a copy to a virtual register that does have a + // type we could use. + return &getRegBankFromRegClass(getMinimalPhysRegClass(Reg, TRI), LLT()); + } + + assert(Reg && "NoRegister does not have a register bank"); + const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + if (auto *RB = RegClassOrBank.dyn_cast()) + return RB; + if (auto *RC = RegClassOrBank.dyn_cast()) + return &getRegBankFromRegClass(*RC, MRI.getType(Reg)); + return nullptr; +} + +const TargetRegisterClass & +RegisterBankInfo::getMinimalPhysRegClass(Register Reg, + const TargetRegisterInfo &TRI) const { + assert(Register::isPhysicalRegister(Reg) && "Reg must be a physreg"); + const auto &RegRCIt = PhysRegMinimalRCs.find(Reg); + if (RegRCIt != PhysRegMinimalRCs.end()) + return *RegRCIt->second; + const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClass(Reg); + PhysRegMinimalRCs[Reg] = PhysRC; + return *PhysRC; +} + +const RegisterBank *RegisterBankInfo::getRegBankFromConstraints( + const MachineInstr &MI, unsigned OpIdx, const TargetInstrInfo &TII, + const MachineRegisterInfo &MRI) const { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + + // The mapping of the registers may be available via the + // register class constraints. + const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx, &TII, TRI); + + if (!RC) + return nullptr; + + Register Reg = MI.getOperand(OpIdx).getReg(); + const RegisterBank &RegBank = getRegBankFromRegClass(*RC, MRI.getType(Reg)); + // Check that the target properly implemented getRegBankFromRegClass. + assert(RegBank.covers(*RC) && + "The mapping of the register bank does not make sense"); + return &RegBank; +} + +const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( + Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI) { + + // If the register already has a class, fallback to MRI::constrainRegClass. + auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + if (RegClassOrBank.is()) + return MRI.constrainRegClass(Reg, &RC); + + const RegisterBank *RB = RegClassOrBank.get(); + // Otherwise, all we can do is ensure the bank covers the class, and set it. + if (RB && !RB->covers(RC)) + return nullptr; + + // If nothing was set or the class is simply compatible, set it. + MRI.setRegClass(Reg, &RC); + return &RC; +} + +/// Check whether or not \p MI should be treated like a copy +/// for the mappings. 
+/// Copy like instruction are special for mapping because +/// they don't have actual register constraints. Moreover, +/// they sometimes have register classes assigned and we can +/// just use that instead of failing to provide a generic mapping. +static bool isCopyLike(const MachineInstr &MI) { + return MI.isCopy() || MI.isPHI() || + MI.getOpcode() == TargetOpcode::REG_SEQUENCE; +} + +const RegisterBankInfo::InstructionMapping & +RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { + // For copies we want to walk over the operands and try to find one + // that has a register bank since the instruction itself will not get + // us any constraint. + bool IsCopyLike = isCopyLike(MI); + // For copy like instruction, only the mapping of the definition + // is important. The rest is not constrained. + unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands(); + + const MachineFunction &MF = *MI.getMF(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + // We may need to query the instruction encoding to guess the mapping. + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + // Before doing anything complicated check if the mapping is not + // directly available. + bool CompleteMapping = true; + + SmallVector OperandsMapping(NumOperandsForMapping); + for (unsigned OpIdx = 0, EndIdx = MI.getNumOperands(); OpIdx != EndIdx; + ++OpIdx) { + const MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + // The register bank of Reg is just a side effect of the current + // excution and in particular, there is no reason to believe this + // is the best default mapping for the current instruction. Keep + // it as an alternative register bank if we cannot figure out + // something. + const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI); + // For copy-like instruction, we want to reuse the register bank + // that is already set on Reg, if any, since those instructions do + // not have any constraints. + const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr; + if (!CurRegBank) { + // If this is a target specific instruction, we can deduce + // the register bank from the encoding constraints. + CurRegBank = getRegBankFromConstraints(MI, OpIdx, TII, MRI); + if (!CurRegBank) { + // All our attempts failed, give up. + CompleteMapping = false; + + if (!IsCopyLike) + // MI does not carry enough information to guess the mapping. + return getInvalidInstructionMapping(); + continue; + } + } + + unsigned Size = getSizeInBits(Reg, MRI, TRI); + const ValueMapping *ValMapping = &getValueMapping(0, Size, *CurRegBank); + if (IsCopyLike) { + if (!OperandsMapping[0]) { + if (MI.isRegSequence()) { + // For reg_sequence, the result size does not match the input. + unsigned ResultSize = getSizeInBits(MI.getOperand(0).getReg(), + MRI, TRI); + OperandsMapping[0] = &getValueMapping(0, ResultSize, *CurRegBank); + } else { + OperandsMapping[0] = ValMapping; + } + } + + // The default handling assumes any register bank can be copied to any + // other. If this isn't the case, the target should specially deal with + // reg_sequence/phi. There may also be unsatisfiable copies. 
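Since a copy-like instruction carries no constraints of its own, the mapping code here simply reuses whichever register bank one of its operands already has, preferring the definition. A stand-alone sketch of that deduction rule, where Bank and CopyModel are invented stand-ins rather than LLVM types:

// Illustrative sketch only -- the bank-reuse rule for copy-like instructions.
#include <optional>

enum class Bank { GPR, FPR };

struct CopyModel {
  std::optional<Bank> DstBank, SrcBank; // empty = no bank assigned yet
};

// Returns the bank to use for both sides of the copy, if any is known.
std::optional<Bank> deduceCopyBank(const CopyModel &C) {
  if (C.DstBank)
    return C.DstBank; // the definition wins when already constrained
  return C.SrcBank;   // otherwise reuse whatever the source carries
}

If neither operand carries a bank, the result is empty, which corresponds to the invalid-mapping bail-out in the real code.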
+ for (; OpIdx != EndIdx; ++OpIdx) { + const MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + + const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI); + if (AltRegBank && + cannotCopy(*CurRegBank, *AltRegBank, getSizeInBits(Reg, MRI, TRI))) + return getInvalidInstructionMapping(); + } + + CompleteMapping = true; + break; + } + + OperandsMapping[OpIdx] = ValMapping; + } + + if (IsCopyLike && !CompleteMapping) { + // No way to deduce the type from what we have. + return getInvalidInstructionMapping(); + } + + assert(CompleteMapping && "Setting an uncomplete mapping"); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping(OperandsMapping), + NumOperandsForMapping); +} + +/// Hashing function for PartialMapping. +static hash_code hashPartialMapping(unsigned StartIdx, unsigned Length, + const RegisterBank *RegBank) { + return hash_combine(StartIdx, Length, RegBank ? RegBank->getID() : 0); +} + +/// Overloaded version of hash_value for a PartialMapping. +hash_code +llvm::hash_value(const RegisterBankInfo::PartialMapping &PartMapping) { + return hashPartialMapping(PartMapping.StartIdx, PartMapping.Length, + PartMapping.RegBank); +} + +const RegisterBankInfo::PartialMapping & +RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, + const RegisterBank &RegBank) const { + ++NumPartialMappingsAccessed; + + hash_code Hash = hashPartialMapping(StartIdx, Length, &RegBank); + const auto &It = MapOfPartialMappings.find(Hash); + if (It != MapOfPartialMappings.end()) + return *It->second; + + ++NumPartialMappingsCreated; + + auto &PartMapping = MapOfPartialMappings[Hash]; + PartMapping = std::make_unique(StartIdx, Length, RegBank); + return *PartMapping; +} + +const RegisterBankInfo::ValueMapping & +RegisterBankInfo::getValueMapping(unsigned StartIdx, unsigned Length, + const RegisterBank &RegBank) const { + return getValueMapping(&getPartialMapping(StartIdx, Length, RegBank), 1); +} + +static hash_code +hashValueMapping(const RegisterBankInfo::PartialMapping *BreakDown, + unsigned NumBreakDowns) { + if (LLVM_LIKELY(NumBreakDowns == 1)) + return hash_value(*BreakDown); + SmallVector Hashes(NumBreakDowns); + for (unsigned Idx = 0; Idx != NumBreakDowns; ++Idx) + Hashes.push_back(hash_value(BreakDown[Idx])); + return hash_combine_range(Hashes.begin(), Hashes.end()); +} + +const RegisterBankInfo::ValueMapping & +RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, + unsigned NumBreakDowns) const { + ++NumValueMappingsAccessed; + + hash_code Hash = hashValueMapping(BreakDown, NumBreakDowns); + const auto &It = MapOfValueMappings.find(Hash); + if (It != MapOfValueMappings.end()) + return *It->second; + + ++NumValueMappingsCreated; + + auto &ValMapping = MapOfValueMappings[Hash]; + ValMapping = std::make_unique(BreakDown, NumBreakDowns); + return *ValMapping; +} + +template +const RegisterBankInfo::ValueMapping * +RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { + + ++NumOperandsMappingsAccessed; + + // The addresses of the value mapping are unique. + // Therefore, we can use them directly to hash the operand mapping. + hash_code Hash = hash_combine_range(Begin, End); + auto &Res = MapOfOperandsMappings[Hash]; + if (Res) + return Res.get(); + + ++NumOperandsMappingsCreated; + + // Create the array of ValueMapping. 
+ // Note: this array will not hash to this instance of operands + // mapping, because we use the pointer of the ValueMapping + // to hash and we expect them to uniquely identify an instance + // of value mapping. + Res = std::make_unique(std::distance(Begin, End)); + unsigned Idx = 0; + for (Iterator It = Begin; It != End; ++It, ++Idx) { + const ValueMapping *ValMap = *It; + if (!ValMap) + continue; + Res[Idx] = *ValMap; + } + return Res.get(); +} + +const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( + const SmallVectorImpl &OpdsMapping) + const { + return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); +} + +const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( + std::initializer_list OpdsMapping) + const { + return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); +} + +static hash_code +hashInstructionMapping(unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) { + return hash_combine(ID, Cost, OperandsMapping, NumOperands); +} + +const RegisterBankInfo::InstructionMapping & +RegisterBankInfo::getInstructionMappingImpl( + bool IsInvalid, unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) const { + assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 && + OperandsMapping == nullptr && NumOperands == 0) || + !IsInvalid) && + "Mismatch argument for invalid input"); + ++NumInstructionMappingsAccessed; + + hash_code Hash = + hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands); + const auto &It = MapOfInstructionMappings.find(Hash); + if (It != MapOfInstructionMappings.end()) + return *It->second; + + ++NumInstructionMappingsCreated; + + auto &InstrMapping = MapOfInstructionMappings[Hash]; + InstrMapping = std::make_unique( + ID, Cost, OperandsMapping, NumOperands); + return *InstrMapping; +} + +const RegisterBankInfo::InstructionMapping & +RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); + if (Mapping.isValid()) + return Mapping; + llvm_unreachable("The target must implement this"); +} + +RegisterBankInfo::InstructionMappings +RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const { + InstructionMappings PossibleMappings; + const auto &Mapping = getInstrMapping(MI); + if (Mapping.isValid()) { + // Put the default mapping first. + PossibleMappings.push_back(&Mapping); + } + + // Then the alternative mapping, if any. + InstructionMappings AltMappings = getInstrAlternativeMappings(MI); + append_range(PossibleMappings, AltMappings); +#ifndef NDEBUG + for (const InstructionMapping *Mapping : PossibleMappings) + assert(Mapping->verify(MI) && "Mapping is invalid"); +#endif + return PossibleMappings; +} + +RegisterBankInfo::InstructionMappings +RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { + // No alternative for MI. 
+ return InstructionMappings(); +} + +void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + LLVM_DEBUG(dbgs() << "Applying default-like mapping\n"); + for (unsigned OpIdx = 0, + EndIdx = OpdMapper.getInstrMapping().getNumOperands(); + OpIdx != EndIdx; ++OpIdx) { + LLVM_DEBUG(dbgs() << "OpIdx " << OpIdx); + MachineOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + LLVM_DEBUG(dbgs() << " is not a register, nothing to be done\n"); + continue; + } + if (!MO.getReg()) { + LLVM_DEBUG(dbgs() << " is $noreg, nothing to be done\n"); + continue; + } + assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns != + 0 && + "Invalid mapping"); + assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns == + 1 && + "This mapping is too complex for this function"); + iterator_range::const_iterator> NewRegs = + OpdMapper.getVRegs(OpIdx); + if (NewRegs.empty()) { + LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); + continue; + } + Register OrigReg = MO.getReg(); + Register NewReg = *NewRegs.begin(); + LLVM_DEBUG(dbgs() << " changed, replace " << printReg(OrigReg, nullptr)); + MO.setReg(NewReg); + LLVM_DEBUG(dbgs() << " with " << printReg(NewReg, nullptr)); + + // The OperandsMapper creates plain scalar, we may have to fix that. + // Check if the types match and if not, fix that. + LLT OrigTy = MRI.getType(OrigReg); + LLT NewTy = MRI.getType(NewReg); + if (OrigTy != NewTy) { + // The default mapping is not supposed to change the size of + // the storage. However, right now we don't necessarily bump all + // the types to storage size. For instance, we can consider + // s16 G_AND legal whereas the storage size is going to be 32. + assert(OrigTy.getSizeInBits() <= NewTy.getSizeInBits() && + "Types with difference size cannot be handled by the default " + "mapping"); + LLVM_DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to " + << OrigTy); + MRI.setType(NewReg, OrigTy); + } + LLVM_DEBUG(dbgs() << '\n'); + } +} + +unsigned RegisterBankInfo::getSizeInBits(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + if (Register::isPhysicalRegister(Reg)) { + // The size is not directly available for physical registers. + // Instead, we need to access a register class that contains Reg and + // get the size of that register class. + // Because this is expensive, we'll cache the register class by calling + auto *RC = &getMinimalPhysRegClass(Reg, TRI); + assert(RC && "Expecting Register class"); + return TRI.getRegSizeInBits(*RC); + } + return TRI.getRegSizeInBits(Reg, MRI); +} + +//------------------------------------------------------------------------------ +// Helper classes implementation. +//------------------------------------------------------------------------------ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { + print(dbgs()); + dbgs() << '\n'; +} +#endif + +bool RegisterBankInfo::PartialMapping::verify() const { + assert(RegBank && "Register bank not set"); + assert(Length && "Empty mapping"); + assert((StartIdx <= getHighBitIdx()) && "Overflow, switch to APInt?"); + // Check if the minimum width fits into RegBank. 
+ assert(RegBank->getSize() >= Length && "Register bank too small for Mask"); + return true; +} + +void RegisterBankInfo::PartialMapping::print(raw_ostream &OS) const { + OS << "[" << StartIdx << ", " << getHighBitIdx() << "], RegBank = "; + if (RegBank) + OS << *RegBank; + else + OS << "nullptr"; +} + +bool RegisterBankInfo::ValueMapping::partsAllUniform() const { + if (NumBreakDowns < 2) + return true; + + const PartialMapping *First = begin(); + for (const PartialMapping *Part = First + 1; Part != end(); ++Part) { + if (Part->Length != First->Length || Part->RegBank != First->RegBank) + return false; + } + + return true; +} + +bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { + assert(NumBreakDowns && "Value mapped nowhere?!"); + unsigned OrigValueBitWidth = 0; + for (const RegisterBankInfo::PartialMapping &PartMap : *this) { + // Check that each register bank is big enough to hold the partial value: + // this check is done by PartialMapping::verify + assert(PartMap.verify() && "Partial mapping is invalid"); + // The original value should completely be mapped. + // Thus the maximum accessed index + 1 is the size of the original value. + OrigValueBitWidth = + std::max(OrigValueBitWidth, PartMap.getHighBitIdx() + 1); + } + assert(OrigValueBitWidth >= MeaningfulBitWidth && + "Meaningful bits not covered by the mapping"); + APInt ValueMask(OrigValueBitWidth, 0); + for (const RegisterBankInfo::PartialMapping &PartMap : *this) { + // Check that the union of the partial mappings covers the whole value, + // without overlaps. + // The high bit is exclusive in the APInt API, thus getHighBitIdx + 1. + APInt PartMapMask = APInt::getBitsSet(OrigValueBitWidth, PartMap.StartIdx, + PartMap.getHighBitIdx() + 1); + ValueMask ^= PartMapMask; + assert((ValueMask & PartMapMask) == PartMapMask && + "Some partial mappings overlap"); + } + assert(ValueMask.isAllOnes() && "Value is not fully mapped"); + return true; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const { + print(dbgs()); + dbgs() << '\n'; +} +#endif + +void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const { + OS << "#BreakDown: " << NumBreakDowns << " "; + bool IsFirst = true; + for (const PartialMapping &PartMap : *this) { + if (!IsFirst) + OS << ", "; + OS << '[' << PartMap << ']'; + IsFirst = false; + } +} + +bool RegisterBankInfo::InstructionMapping::verify( + const MachineInstr &MI) const { + // Check that all the register operands are properly mapped. + // Check the constructor invariant. + // For PHI, we only care about mapping the definition. + assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) && + "NumOperands must match, see constructor"); + assert(MI.getParent() && MI.getMF() && + "MI must be connected to a MachineFunction"); + const MachineFunction &MF = *MI.getMF(); + const RegisterBankInfo *RBI = MF.getSubtarget().getRegBankInfo(); + (void)RBI; + + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + const MachineOperand &MO = MI.getOperand(Idx); + if (!MO.isReg()) { + assert(!getOperandMapping(Idx).isValid() && + "We should not care about non-reg mapping"); + continue; + } + Register Reg = MO.getReg(); + if (!Reg) + continue; + assert(getOperandMapping(Idx).isValid() && + "We must have a mapping for reg operands"); + const RegisterBankInfo::ValueMapping &MOMapping = getOperandMapping(Idx); + (void)MOMapping; + // Register size in bits. + // This size must match what the mapping expects. 
+ assert(MOMapping.verify(RBI->getSizeInBits( + Reg, MF.getRegInfo(), *MF.getSubtarget().getRegisterInfo())) && + "Value mapping is invalid"); + } + return true; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const { + print(dbgs()); + dbgs() << '\n'; +} +#endif + +void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const { + OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; + + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const ValueMapping &ValMapping = getOperandMapping(OpIdx); + if (OpIdx) + OS << ", "; + OS << "{ Idx: " << OpIdx << " Map: " << ValMapping << '}'; + } +} + +const int RegisterBankInfo::OperandsMapper::DontKnowIdx = -1; + +RegisterBankInfo::OperandsMapper::OperandsMapper( + MachineInstr &MI, const InstructionMapping &InstrMapping, + MachineRegisterInfo &MRI) + : MRI(MRI), MI(MI), InstrMapping(InstrMapping) { + unsigned NumOpds = InstrMapping.getNumOperands(); + OpToNewVRegIdx.resize(NumOpds, OperandsMapper::DontKnowIdx); + assert(InstrMapping.verify(MI) && "Invalid mapping for MI"); +} + +iterator_range::iterator> +RegisterBankInfo::OperandsMapper::getVRegsMem(unsigned OpIdx) { + assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); + unsigned NumPartialVal = + getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns; + int StartIdx = OpToNewVRegIdx[OpIdx]; + + if (StartIdx == OperandsMapper::DontKnowIdx) { + // This is the first time we try to access OpIdx. + // Create the cells that will hold all the partial values at the + // end of the list of NewVReg. + StartIdx = NewVRegs.size(); + OpToNewVRegIdx[OpIdx] = StartIdx; + for (unsigned i = 0; i < NumPartialVal; ++i) + NewVRegs.push_back(0); + } + SmallVectorImpl::iterator End = + getNewVRegsEnd(StartIdx, NumPartialVal); + + return make_range(&NewVRegs[StartIdx], End); +} + +SmallVectorImpl::const_iterator +RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx, + unsigned NumVal) const { + return const_cast(this)->getNewVRegsEnd(StartIdx, NumVal); +} +SmallVectorImpl::iterator +RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx, + unsigned NumVal) { + assert((NewVRegs.size() == StartIdx + NumVal || + NewVRegs.size() > StartIdx + NumVal) && + "NewVRegs too small to contain all the partial mapping"); + return NewVRegs.size() <= StartIdx + NumVal ? NewVRegs.end() + : &NewVRegs[StartIdx + NumVal]; +} + +void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) { + assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); + iterator_range::iterator> NewVRegsForOpIdx = + getVRegsMem(OpIdx); + const ValueMapping &ValMapping = getInstrMapping().getOperandMapping(OpIdx); + const PartialMapping *PartMap = ValMapping.begin(); + for (Register &NewVReg : NewVRegsForOpIdx) { + assert(PartMap != ValMapping.end() && "Out-of-bound access"); + assert(NewVReg == 0 && "Register has already been created"); + // The new registers are always bound to scalar with the right size. + // The actual type has to be set when the target does the mapping + // of the instruction. + // The rationale is that this generic code cannot guess how the + // target plans to split the input type. 
+ NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length)); + MRI.setRegBank(NewVReg, *PartMap->RegBank); + ++PartMap; + } +} + +void RegisterBankInfo::OperandsMapper::setVRegs(unsigned OpIdx, + unsigned PartialMapIdx, + Register NewVReg) { + assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); + assert(getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns > + PartialMapIdx && + "Out-of-bound access for partial mapping"); + // Make sure the memory is initialized for that operand. + (void)getVRegsMem(OpIdx); + assert(NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] == 0 && + "This value is already set"); + NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] = NewVReg; +} + +iterator_range::const_iterator> +RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx, + bool ForDebug) const { + (void)ForDebug; + assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access"); + int StartIdx = OpToNewVRegIdx[OpIdx]; + + if (StartIdx == OperandsMapper::DontKnowIdx) + return make_range(NewVRegs.end(), NewVRegs.end()); + + unsigned PartMapSize = + getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns; + SmallVectorImpl::const_iterator End = + getNewVRegsEnd(StartIdx, PartMapSize); + iterator_range::const_iterator> Res = + make_range(&NewVRegs[StartIdx], End); +#ifndef NDEBUG + for (Register VReg : Res) + assert((VReg || ForDebug) && "Some registers are uninitialized"); +#endif + return Res; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const { + print(dbgs(), true); + dbgs() << '\n'; +} +#endif + +void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS, + bool ForDebug) const { + unsigned NumOpds = getInstrMapping().getNumOperands(); + if (ForDebug) { + OS << "Mapping for " << getMI() << "\nwith " << getInstrMapping() << '\n'; + // Print out the internal state of the index table. + OS << "Populated indices (CellNumber, IndexInNewVRegs): "; + bool IsFirst = true; + for (unsigned Idx = 0; Idx != NumOpds; ++Idx) { + if (OpToNewVRegIdx[Idx] != DontKnowIdx) { + if (!IsFirst) + OS << ", "; + OS << '(' << Idx << ", " << OpToNewVRegIdx[Idx] << ')'; + IsFirst = false; + } + } + OS << '\n'; + } else + OS << "Mapping ID: " << getInstrMapping().getID() << ' '; + + OS << "Operand Mapping: "; + // If we have a function, we can pretty print the name of the registers. + // Otherwise we will print the raw numbers. + const TargetRegisterInfo *TRI = + getMI().getParent() && getMI().getMF() + ? 
getMI().getMF()->getSubtarget().getRegisterInfo() + : nullptr; + bool IsFirst = true; + for (unsigned Idx = 0; Idx != NumOpds; ++Idx) { + if (OpToNewVRegIdx[Idx] == DontKnowIdx) + continue; + if (!IsFirst) + OS << ", "; + IsFirst = false; + OS << '(' << printReg(getMI().getOperand(Idx).getReg(), TRI) << ", ["; + bool IsFirstNewVReg = true; + for (Register VReg : getVRegs(Idx)) { + if (!IsFirstNewVReg) + OS << ", "; + IsFirstNewVReg = false; + OS << printReg(VReg, TRI); + } + OS << "])"; + } +} diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 65a65b9cae95..374fcc9a6014 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -44,9 +43,11 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { bool Update = false; MF = &mf; + auto &STI = MF->getSubtarget(); + // Allocate new array the first time we see a new target. - if (MF->getSubtarget().getRegisterInfo() != TRI) { - TRI = MF->getSubtarget().getRegisterInfo(); + if (STI.getRegisterInfo() != TRI) { + TRI = STI.getRegisterInfo(); RegClass.reset(new RCInfo[TRI->getNumRegClasses()]); Update = true; } @@ -68,6 +69,18 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { } CalleeSavedRegs = CSR; + // Even if CSR list is same, we could have had a different allocation order + // if ignoreCSRForAllocationOrder is evaluated differently. + BitVector CSRHintsForAllocOrder(TRI->getNumRegs()); + for (const MCPhysReg *I = CSR; *I; ++I) + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) + CSRHintsForAllocOrder[*AI] = STI.ignoreCSRForAllocationOrder(mf, *AI); + if (IgnoreCSRForAllocOrder.size() != CSRHintsForAllocOrder.size() || + IgnoreCSRForAllocOrder != CSRHintsForAllocOrder) { + Update = true; + IgnoreCSRForAllocOrder = CSRHintsForAllocOrder; + } + RegCosts = TRI->getRegisterCosts(*MF); // Different reserved registers? 
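The RegisterClassInfo hunk above invalidates the cached allocation orders
whenever the subtarget's ignoreCSRForAllocationOrder answer changes for any
callee-saved register alias. A minimal sketch of how a target might implement
that hook; the class and register names are hypothetical, only the hook's
shape follows the call site in the hunk:

    bool MyTargetSubtargetInfo::ignoreCSRForAllocationOrder(
        const MachineFunction &MF, unsigned PhysReg) const {
      // Order this callee-saved register as if it were a scratch register;
      // RegisterClassInfo recomputes allocation orders when this changes.
      return PhysReg == MyTarget::R9;
    }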
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index a917b0d27d4a..930d05324440 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1647,7 +1647,7 @@ MachineInstr *RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { for (unsigned i = CopyMI->getNumOperands(); i != 0; --i) { MachineOperand &MO = CopyMI->getOperand(i-1); if (MO.isReg() && MO.isUse()) - CopyMI->RemoveOperand(i-1); + CopyMI->removeOperand(i-1); } LLVM_DEBUG(dbgs() << "\tReplaced copy of value with an " "implicit def\n"); diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 424ad7419165..289d31be2d2d 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -37,11 +37,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/llvm/lib/CodeGen/RegisterUsageInfo.cpp index 6858d7233bc5..9d9cdf9edbb3 100644 --- a/llvm/lib/CodeGen/RegisterUsageInfo.cpp +++ b/llvm/lib/CodeGen/RegisterUsageInfo.cpp @@ -22,8 +22,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include -#include #include #include #include diff --git a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp index 49859aeec78b..01886e40a4a3 100644 --- a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp +++ b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp @@ -12,13 +12,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" /// \file RemoveRedundantDebugValues.cpp /// diff --git a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 0872ec303460..466022ae0ac1 100644 --- a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -33,9 +33,9 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 0ff045fa787e..87b8ac59bdba 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -1,4 +1,4 @@ -//=== ReplaceWithVeclib.cpp - Replace vector instrinsics with veclib calls ===// +//=== ReplaceWithVeclib.cpp - Replace vector intrinsics with veclib calls -===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -23,7 +23,6 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace llvm;
@@ -110,7 +109,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
     auto *ArgType = Arg.value()->getType();
     // Vector calls to intrinsics can still have
     // scalar operands for specific arguments.
-    if (hasVectorInstrinsicScalarOpd(IntrinsicID, Arg.index())) {
+    if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
       ScalarTypes.push_back(ArgType);
     } else {
       // The argument in this place should be a vector if
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 3d8a7eecce18..e7116ec3ea28 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -17,7 +17,6 @@
 #include "SafeStackLayout.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -49,10 +48,10 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -97,31 +96,12 @@ static cl::opt<bool>
     SafeStackUsePointerAddress("safestack-use-pointer-address",
                                cl::init(false), cl::Hidden);
 
-// Disabled by default due to PR32143.
 static cl::opt<bool> ClColoring("safe-stack-coloring",
                                 cl::desc("enable safe stack coloring"),
-                                cl::Hidden, cl::init(false));
+                                cl::Hidden, cl::init(true));
 
 namespace {
 
-/// Rewrite an SCEV expression for a memory access address to an expression that
-/// represents offset from the given alloca.
-///
-/// The implementation simply replaces all mentions of the alloca with zero.
-class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
-  const Value *AllocaPtr;
-
-public:
-  AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
-      : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
-
-  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
-    if (Expr->getValue() == AllocaPtr)
-      return SE.getZero(Expr->getType());
-    return Expr;
-  }
-};
-
 /// The SafeStack pass splits the stack of each function into the safe
 /// stack, which is only accessed through memory safe dereferences (as
 /// determined statically), and the unsafe stack, which contains all
@@ -147,7 +127,7 @@ class SafeStack {
   ///
   /// 16 seems like a reasonable upper bound on the alignment of objects that we
   /// might expect to appear on the stack on most common targets.
-  static constexpr uint64_t StackAlignment = 16;
+  static constexpr Align StackAlignment = Align::Constant<16>();
 
   /// Return the value of the stack canary.
 Value *getStackGuard(IRBuilder<> &IRB, Function &F);
@@ -221,7 +201,7 @@ public:
   bool run();
 };
 
-constexpr uint64_t SafeStack::StackAlignment;
+constexpr Align SafeStack::StackAlignment;
 
 uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
   uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
@@ -236,9 +216,18 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
 
 bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
                              const Value *AllocaPtr, uint64_t AllocaSize) {
-  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
-  const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
+  const SCEV *AddrExpr = SE.getSCEV(Addr);
+  const auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(AddrExpr));
+  if (!Base || Base->getValue() != AllocaPtr) {
+    LLVM_DEBUG(
+        dbgs() << "[SafeStack] "
+               << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+               << *AllocaPtr << "\n"
+               << "SCEV " << *AddrExpr << " not directly based on alloca\n");
+    return false;
+  }
 
+  const SCEV *Expr = SE.removePointerBase(AddrExpr);
   uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
   ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
   ConstantRange SizeRange =
@@ -645,6 +634,13 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
   // FIXME: no need to update BasePointer in leaf functions.
   unsigned FrameSize = alignTo(SSL.getFrameSize(), StackAlignment);
 
+  MDBuilder MDB(F.getContext());
+  SmallVector<Metadata *, 2> Data;
+  Data.push_back(MDB.createString("unsafe-stack-size"));
+  Data.push_back(MDB.createConstant(ConstantInt::get(Int32Ty, FrameSize)));
+  MDNode *MD = MDTuple::get(F.getContext(), Data);
+  F.setMetadata(LLVMContext::MD_annotation, MD);
+
   // Update shadow stack pointer in the function epilogue.
   IRB.SetInsertPoint(BasePointer->getNextNode());
 
@@ -677,13 +673,12 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
     SP = IRB.CreateSub(SP, Size);
 
     // Align the SP value to satisfy the AllocaInst, type and stack alignments.
-    uint64_t Align =
-        std::max(std::max(DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
-                 StackAlignment);
+    auto Align = std::max(std::max(DL.getPrefTypeAlign(Ty), AI->getAlign()),
+                          StackAlignment);
 
-    assert(isPowerOf2_32(Align));
     Value *NewTop = IRB.CreateIntToPtr(
-        IRB.CreateAnd(SP, ConstantInt::get(IntPtrTy, ~uint64_t(Align - 1))),
+        IRB.CreateAnd(SP,
+                      ConstantInt::get(IntPtrTy, ~uint64_t(Align.value() - 1))),
         StackPtrTy);
 
     // Save the stack pointer.
diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp
index 602afcfa9001..f821145f4b63 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.cpp
+++ b/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -11,7 +11,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include
 #include
diff --git a/llvm/lib/CodeGen/SafeStackLayout.h b/llvm/lib/CodeGen/SafeStackLayout.h
index 4ac7af2059f5..6126c7a67854 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.h
+++ b/llvm/lib/CodeGen/SafeStackLayout.h
@@ -52,7 +52,7 @@ class StackLayout {
   void layoutObject(StackObject &Obj);
 
 public:
-  StackLayout(uint64_t StackAlignment) : MaxAlignment(StackAlignment) {}
+  StackLayout(Align StackAlignment) : MaxAlignment(StackAlignment) {}
 
   /// Add an object to the stack frame. Value pointer is opaque and used as a
   /// handle to retrieve the object's offset in the frame later.
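The rewritten IsAccessSafe above asks SCEV for the pointer base and the
remaining offset range instead of rewriting the address expression; the access
is safe only when every feasible offset keeps the whole access inside the
alloca. A minimal self-contained sketch of that containment test, with plain
integers standing in for the pass's ConstantRange logic (illustrative only):

    #include <cassert>
    #include <cstdint>

    // An access [Start, Start + AccessSize) is safe for every feasible
    // unsigned Start in [MinStart, MaxStart] iff even the largest end
    // offset stays inside the alloca [0, AllocaSize).
    static bool accessIsSafe(uint64_t MinStart, uint64_t MaxStart,
                             uint64_t AccessSize, uint64_t AllocaSize) {
      return MinStart <= MaxStart && MaxStart + AccessSize <= AllocaSize;
    }

    int main() {
      assert(accessIsSafe(0, 8, 4, 16));   // worst case [8,12): fits
      assert(!accessIsSafe(0, 16, 4, 16)); // [16,20) runs past the end
    }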
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 0e8e8338b46d..07dcc34fbf15 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -14,7 +14,6 @@
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/ADT/IntEqClasses.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/iterator_range.h"
@@ -40,9 +39,6 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/LaneBitmask.h"
@@ -65,9 +61,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-scheduler"
 
-static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
-                                     cl::ZeroOrMore, cl::init(false),
-                                     cl::desc("Enable use of AA during MI DAG construction"));
+static cl::opt<bool>
+    EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+                    cl::desc("Enable use of AA during MI DAG construction"));
 
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, cl::init(true),
                              cl::desc("Enable use of TBAA during MI DAG construction"));
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index 05b2a3764cca..e7b14944acfe 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -10,13 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
new file mode 100644
index 000000000000..c199b6a6cca8
--- /dev/null
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -0,0 +1,989 @@
+//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts selects to conditional jumps when profitable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ScaledNumber.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <stack>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "select-optimize"
+
+STATISTIC(NumSelectOptAnalyzed,
+          "Number of select groups considered for conversion to branch");
+STATISTIC(NumSelectConvertedExpColdOperand,
+          "Number of select groups converted due to expensive cold operand");
+STATISTIC(NumSelectConvertedHighPred,
+          "Number of select groups converted due to high-predictability");
+STATISTIC(NumSelectUnPred,
+          "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectColdBB,
+          "Number of select groups not converted due to cold basic block");
+STATISTIC(NumSelectConvertedLoop,
+          "Number of select groups converted due to loop-level analysis");
+STATISTIC(NumSelectsConverted, "Number of selects converted");
+
+static cl::opt<unsigned> ColdOperandThreshold(
+    "cold-operand-threshold",
+    cl::desc("Maximum frequency of path for an operand to be considered cold."),
+    cl::init(20), cl::Hidden);
+
+static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
+    "cold-operand-max-cost-multiplier",
+    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
+             "slice of a cold operand to be considered inexpensive."),
+    cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned>
+    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
+                          cl::desc("Gradient gain threshold (%)."),
+                          cl::init(25), cl::Hidden);
+
+static cl::opt<unsigned>
+    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> GainRelativeThreshold(
+    "select-opti-loop-relative-gain-threshold",
+    cl::desc(
+        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
+    cl::init(8), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+    "mispredict-default-rate", cl::Hidden, cl::init(25),
+    cl::desc("Default mispredict rate (initialized to 25%)."));
+
+static cl::opt<bool>
+    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
+                               cl::init(false),
+                               cl::desc("Disable loop-level heuristics."));
+
+namespace {
+
+class SelectOptimize : public FunctionPass {
+  const TargetMachine *TM = nullptr;
+  const TargetSubtargetInfo *TSI;
+  const TargetLowering *TLI = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
+  const LoopInfo *LI;
+  DominatorTree *DT;
+  std::unique_ptr<BlockFrequencyInfo> BFI;
+  std::unique_ptr<BranchProbabilityInfo> BPI;
+  ProfileSummaryInfo *PSI;
+  OptimizationRemarkEmitter *ORE;
+  TargetSchedModel TSchedModel;
+
+public:
+  static char ID;
+
+  SelectOptimize() : FunctionPass(ID) {
+    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+
+private:
+  // Select groups consist of consecutive select instructions with the same
+  // condition.
+  using SelectGroup = SmallVector<SelectInst *, 2>;
+  using SelectGroups = SmallVector<SelectGroup, 2>;
+
+  using Scaled64 = ScaledNumber<uint64_t>;
+
+  struct CostInfo {
+    /// Predicated cost (with selects as conditional moves).
+    Scaled64 PredCost;
+    /// Non-predicated cost (with selects converted to branches).
+    Scaled64 NonPredCost;
+  };
+
+  // Converts select instructions of a function to conditional jumps when deemed
+  // profitable. Returns true if at least one select was converted.
+  bool optimizeSelects(Function &F);
+
+  // Heuristics for determining which select instructions can be profitably
+  // converted to branches. Separate heuristics for selects in inner-most loops
+  // and the rest of code regions (base heuristics for non-inner-most loop
+  // regions).
+  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
+  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
+
+  // Converts to branches the select groups that were deemed
+  // profitable-to-convert.
+  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
+
+  // Splits selects of a given basic block into select groups.
+  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
+
+  // Determines for which select groups it is profitable converting to branches
+  // (base and inner-most-loop heuristics).
+  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
+                                  SelectGroups &ProfSIGroups);
+  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+                                        SelectGroups &ProfSIGroups);
+
+  // Determines if a select group should be converted to a branch (base
+  // heuristics).
+  bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
+
+  // Returns true if there are expensive instructions in the cold value
+  // operand's (if any) dependence slice of any of the selects of the given
+  // group.
+  bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
+
+  // For a given source instruction, collect its backwards dependence slice
+  // consisting of instructions exclusively computed for producing the operands
+  // of the source instruction.
+  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
+                             bool ForSinking = false);
+
+  // Returns true if the condition of the select is highly predictable.
+  bool isSelectHighlyPredictable(const SelectInst *SI);
+
+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the given loop is more profitable than its predicated version.
+  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
+
+  // Computes instruction and loop-critical-path costs for both the predicated
+  // and non-predicated version of the given loop.
+  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
+                        CostInfo *LoopCost);
+
+  // Returns a set of all the select instructions in the given select groups.
+  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+
+  // Returns the latency cost of a given instruction.
+  Optional<uint64_t> computeInstCost(const Instruction *I);
+
+  // Returns the misprediction cost of a given select when converted to branch.
+  Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
+
+  // Returns the cost of a branch when the prediction is correct.
+  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+                                const SelectInst *SI);
+
+  // Returns true if the target architecture supports lowering a given select.
+  bool isSelectKindSupported(SelectInst *SI);
+};
+} // namespace
+
+char SelectOptimize::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
+                    false)
+
+FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
+
+bool SelectOptimize::runOnFunction(Function &F) {
+  TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  TSI = TM->getSubtargetImpl(F);
+  TLI = TSI->getTargetLowering();
+
+  // If none of the select types is supported then skip this pass.
+  // This is an optimization pass. Legality issues will be handled by
+  // instruction selection.
+  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
+      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
+      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
+    return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  BPI.reset(new BranchProbabilityInfo(F, *LI));
+  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+  TSchedModel.init(TSI);
+
+  // When optimizing for size, selects are preferable over branches.
+  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
+    return false;
+
+  return optimizeSelects(F);
+}
+
+bool SelectOptimize::optimizeSelects(Function &F) {
+  // Determine for which select groups it is profitable converting to branches.
+  SelectGroups ProfSIGroups;
+  // Base heuristics apply only to non-loops and outer loops.
+  optimizeSelectsBase(F, ProfSIGroups);
+  // Separate heuristics for inner-most loops.
+  optimizeSelectsInnerLoops(F, ProfSIGroups);
+
+  // Convert to branches the select groups that were deemed
+  // profitable-to-convert.
+  convertProfitableSIGroups(ProfSIGroups);
+
+  // Code modified if at least one select group was converted.
+  return !ProfSIGroups.empty();
+}
+
+void SelectOptimize::optimizeSelectsBase(Function &F,
+                                         SelectGroups &ProfSIGroups) {
+  // Collect all the select groups.
+  SelectGroups SIGroups;
+  for (BasicBlock &BB : F) {
+    // Base heuristics apply only to non-loops and outer loops.
+    Loop *L = LI->getLoopFor(&BB);
+    if (L && L->isInnermost())
+      continue;
+    collectSelectGroups(BB, SIGroups);
+  }
+
+  // Determine for which select groups it is profitable converting to branches.
+  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
+}
+
+void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
+                                               SelectGroups &ProfSIGroups) {
+  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+  // Need to check size on each iteration as we accumulate child loops.
+  for (unsigned long i = 0; i < Loops.size(); ++i)
+    for (Loop *ChildL : Loops[i]->getSubLoops())
+      Loops.push_back(ChildL);
+
+  for (Loop *L : Loops) {
+    if (!L->isInnermost())
+      continue;
+
+    SelectGroups SIGroups;
+    for (BasicBlock *BB : L->getBlocks())
+      collectSelectGroups(*BB, SIGroups);
+
+    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+  }
+}
+
+/// If \p isTrue is true, return the true value of \p SI, otherwise return
+/// false value of \p SI. If the true/false value of \p SI is defined by any
+/// select instructions in \p Selects, look through the defining select
+/// instruction until the true/false value is not defined in \p Selects.
+static Value *
+getTrueOrFalseValue(SelectInst *SI, bool isTrue,
+                    const SmallPtrSet<const Instruction *, 2> &Selects) {
+  Value *V = nullptr;
+  for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
+       DefSI = dyn_cast<SelectInst>(V)) {
+    assert(DefSI->getCondition() == SI->getCondition() &&
+           "The condition of DefSI does not match with SI");
+    V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+  }
+  assert(V && "Failed to get select true/false value");
+  return V;
+}
+
+void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
+  for (SelectGroup &ASI : ProfSIGroups) {
+    // The code transformation here is a modified version of the sinking
+    // transformation in CodeGenPrepare::optimizeSelectInst with a more
+    // aggressive strategy of which instructions to sink.
+    //
+    // TODO: eliminate the redundancy of logic transforming selects to branches
+    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
+    // selects for all cases (with and without profile information).
+
+    // Transform a sequence like this:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %sel = select i1 %cmp, i32 %c, i32 %d
+    //
+    // Into:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %cmp.frozen = freeze %cmp
+    //       br i1 %cmp.frozen, label %select.true, label %select.false
+    //    select.true:
+    //       br label %select.end
+    //    select.false:
+    //       br label %select.end
+    //    select.end:
+    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+    //
+    // %cmp should be frozen, otherwise it may introduce undefined behavior.
+    // In addition, we may sink instructions that produce %c or %d into the
+    // destination(s) of the new branch.
+    // If the true or false blocks do not contain a sunken instruction, that
+    // block and its branch may be optimized away. In that case, one side of the
+    // first branch will point directly to select.end, and the corresponding PHI
+    // predecessor block will be the start block.
+
+    // Find all the instructions that can be soundly sunk to the true/false
+    // blocks. These are instructions that are computed solely for producing the
+    // operands of the select instructions in the group and can be sunk without
+    // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
+    // with side effects).
+    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
+    typedef std::stack<Instruction *>::size_type StackSizeType;
+    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
+    for (SelectInst *SI : ASI) {
+      // For each select, compute the sinkable dependence chains of the true and
+      // false operands.
+      if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
+        std::stack<Instruction *> TrueSlice;
+        getExclBackwardsSlice(TI, TrueSlice, true);
+        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
+        TrueSlices.push_back(TrueSlice);
+      }
+      if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
+        std::stack<Instruction *> FalseSlice;
+        getExclBackwardsSlice(FI, FalseSlice, true);
+        maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
+        FalseSlices.push_back(FalseSlice);
+      }
+    }
+    // In the case of multiple select instructions in the same group, the order
+    // of non-dependent instructions (instructions of different dependence
+    // slices) in the true/false blocks appears to affect performance.
+    // Interleaving the slices seems to experimentally be the optimal approach.
+    // This interleaving scheduling allows for more ILP (with a natural downside
+    // of increasing a bit register pressure) compared to a simple ordering of
+    // one whole chain after another. One would expect that this ordering would
+    // not matter since the scheduling in the backend of the compiler would
+    // take care of it, but apparently the scheduler fails to deliver optimal
+    // ILP with a naive ordering here.
+    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
+    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
+      for (auto &S : TrueSlices) {
+        if (!S.empty()) {
+          TrueSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
+      for (auto &S : FalseSlices) {
+        if (!S.empty()) {
+          FalseSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+
+    // We split the block containing the select(s) into two blocks.
+    SelectInst *SI = ASI.front();
+    SelectInst *LastSI = ASI.back();
+    BasicBlock *StartBlock = SI->getParent();
+    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
+    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+    // Delete the unconditional branch that was just created by the split.
+    StartBlock->getTerminator()->eraseFromParent();
+
+    // Move any debug/pseudo instructions that were in-between the select
+    // group to the newly-created end block.
+    SmallVector<Instruction *, 2> DebugPseudoINS;
+    auto DIt = SI->getIterator();
+    while (&*DIt != LastSI) {
+      if (DIt->isDebugOrPseudoInst())
+        DebugPseudoINS.push_back(&*DIt);
+      DIt++;
+    }
+    for (auto DI : DebugPseudoINS) {
+      DI->moveBefore(&*EndBlock->getFirstInsertionPt());
+    }
+
+    // These are the new basic blocks for the conditional branch.
+    // At least one will become an actual new basic block.
+    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
+    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
+    if (!TrueSlicesInterleaved.empty()) {
+      TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
+                                     EndBlock->getParent(), EndBlock);
+      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+      TrueBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *TrueInst : TrueSlicesInterleaved)
+        TrueInst->moveBefore(TrueBranch);
+    }
+    if (!FalseSlicesInterleaved.empty()) {
+      FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
+                                      EndBlock->getParent(), EndBlock);
+      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *FalseInst : FalseSlicesInterleaved)
+        FalseInst->moveBefore(FalseBranch);
+    }
+    // If there was nothing to sink, then arbitrarily choose the 'false' side
+    // for a new input value to the PHI.
+    if (TrueBlock == FalseBlock) {
+      assert(TrueBlock == nullptr &&
+             "Unexpected basic block transform while optimizing select");
+
+      FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+                                      EndBlock->getParent(), EndBlock);
+      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(SI->getDebugLoc());
+    }
+
+    // Insert the real conditional branch based on the original condition.
+    // If we did not create a new block for one of the 'true' or 'false' paths
+    // of the condition, it means that side of the branch goes to the end block
+    // directly and the path originates from the start block from the point of
+    // view of the new PHI.
+    BasicBlock *TT, *FT;
+    if (TrueBlock == nullptr) {
+      TT = EndBlock;
+      FT = FalseBlock;
+      TrueBlock = StartBlock;
+    } else if (FalseBlock == nullptr) {
+      TT = TrueBlock;
+      FT = EndBlock;
+      FalseBlock = StartBlock;
+    } else {
+      TT = TrueBlock;
+      FT = FalseBlock;
+    }
+    IRBuilder<> IB(SI);
+    auto *CondFr =
+        IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+    IB.CreateCondBr(CondFr, TT, FT, SI);
+
+    SmallPtrSet<const Instruction *, 2> INS;
+    INS.insert(ASI.begin(), ASI.end());
+    // Use reverse iterator because later select may use the value of the
+    // earlier select, and we need to propagate value through earlier select
+    // to get the PHI operand.
+    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
+      SelectInst *SI = *It;
+      // The select itself is replaced with a PHI Node.
+      PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+      PN->takeName(SI);
+      PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
+      PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
+      PN->setDebugLoc(SI->getDebugLoc());
+
+      SI->replaceAllUsesWith(PN);
+      SI->eraseFromParent();
+      INS.erase(SI);
+      ++NumSelectsConverted;
+    }
+  }
+}
+
+void SelectOptimize::collectSelectGroups(BasicBlock &BB,
+                                         SelectGroups &SIGroups) {
+  BasicBlock::iterator BBIt = BB.begin();
+  while (BBIt != BB.end()) {
+    Instruction *I = &*BBIt++;
+    if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+      SelectGroup SIGroup;
+      SIGroup.push_back(SI);
+      while (BBIt != BB.end()) {
+        Instruction *NI = &*BBIt;
+        SelectInst *NSI = dyn_cast<SelectInst>(NI);
+        if (NSI && SI->getCondition() == NSI->getCondition()) {
+          SIGroup.push_back(NSI);
+        } else if (!NI->isDebugOrPseudoInst()) {
+          // Debug/pseudo instructions should be skipped and not prevent the
+          // formation of a select group.
+          break;
+        }
+        ++BBIt;
+      }
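+
+      // Illustrative example (not in the original file): consecutive selects
+      // that share a condition, e.g.
+      //   %x = select i1 %cond, i32 %a, i32 %b
+      //   %y = select i1 %cond, i32 %c, i32 %d
+      // end up in the same SIGroup and are analyzed and converted together.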
+
+      // If the select type is not supported, no point optimizing it.
+      // Instruction selection will take care of it.
+      if (!isSelectKindSupported(SI))
+        continue;
+
+      SIGroups.push_back(SIGroup);
+    }
+  }
+}
+
+void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
+                                                SelectGroups &ProfSIGroups) {
+  for (SelectGroup &ASI : SIGroups) {
+    ++NumSelectOptAnalyzed;
+    if (isConvertToBranchProfitableBase(ASI))
+      ProfSIGroups.push_back(ASI);
+  }
+}
+
+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+  NumSelectOptAnalyzed += SIGroups.size();
+  // For each select group in an inner-most loop,
+  // a branch is more preferable than a select/conditional-move if:
+  // i) conversion to branches for all the select groups of the loop satisfies
+  //    loop-level heuristics including reducing the loop's critical path by
+  //    some threshold (see SelectOptimize::checkLoopHeuristics); and
+  // ii) the total cost of the select group is cheaper with a branch compared
+  //     to its predicated version. The cost is in terms of latency and the cost
+  //     of a select group is the cost of its most expensive select instruction
+  //     (assuming infinite resources and thus fully leveraging available ILP).
+
+  DenseMap<const Instruction *, CostInfo> InstCostMap;
+  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
+                          {Scaled64::getZero(), Scaled64::getZero()}};
+  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+      !checkLoopHeuristics(L, LoopCost)) {
+    return;
+  }
+
+  for (SelectGroup &ASI : SIGroups) {
+    // Assuming infinite resources, the cost of a group of instructions is the
+    // cost of the most expensive instruction of the group.
+    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
+    for (SelectInst *SI : ASI) {
+      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+    }
+    if (BranchCost < SelectCost) {
+      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+      OR << "Profitable to convert to branch (loop analysis). BranchCost="
+         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
+         << ". ";
+      ORE->emit(OR);
+      ++NumSelectConvertedLoop;
+      ProfSIGroups.push_back(ASI);
+    } else {
+      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+      ORmiss << "Select is more profitable (loop analysis). BranchCost="
+             << BranchCost.toString()
+             << ", SelectCost=" << SelectCost.toString() << ". ";
+      ORE->emit(ORmiss);
+    }
+  }
+}
+
+bool SelectOptimize::isConvertToBranchProfitableBase(
+    const SmallVector<SelectInst *, 2> &ASI) {
+  SelectInst *SI = ASI.front();
+  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
+  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
+
+  // Skip cold basic blocks. Better to optimize for size for cold blocks.
+  if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
+    ++NumSelectColdBB;
+    ORmiss << "Not converted to branch because of cold basic block. ";
+    ORE->emit(ORmiss);
+    return false;
+  }
+
+  // If unpredictable, branch form is less profitable.
+  if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
+    ++NumSelectUnPred;
+    ORmiss << "Not converted to branch because of unpredictable branch. ";
+    ORE->emit(ORmiss);
+    return false;
+  }
+
+  // If highly predictable, branch form is more profitable, unless a
+  // predictable select is inexpensive in the target architecture.
+  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
+    ++NumSelectConvertedHighPred;
+    OR << "Converted to branch because of highly predictable branch. ";
"; + ORE->emit(OR); + return true; + } + + // Look for expensive instructions in the cold operand's (if any) dependence + // slice of any of the selects in the group. + if (hasExpensiveColdOperand(ASI)) { + ++NumSelectConvertedExpColdOperand; + OR << "Converted to branch because of expensive cold operand."; + ORE->emit(OR); + return true; + } + + ORmiss << "Not profitable to convert to branch (base heuristic)."; + ORE->emit(ORmiss); + return false; +} + +static InstructionCost divideNearest(InstructionCost Numerator, + uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + +bool SelectOptimize::hasExpensiveColdOperand( + const SmallVector &ASI) { + bool ColdOperand = false; + uint64_t TrueWeight, FalseWeight, TotalWeight; + if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t MinWeight = std::min(TrueWeight, FalseWeight); + TotalWeight = TrueWeight + FalseWeight; + // Is there a path with frequency 100 * MinWeight; + } else if (PSI->hasProfileSummary()) { + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + ORmiss << "Profile data available but missing branch-weights metadata for " + "select instruction. "; + ORE->emit(ORmiss); + } + if (!ColdOperand) + return false; + // Check if the cold path's dependence slice is expensive for any of the + // selects of the group. + for (SelectInst *SI : ASI) { + Instruction *ColdI = nullptr; + uint64_t HotWeight; + if (TrueWeight < FalseWeight) { + ColdI = dyn_cast(SI->getTrueValue()); + HotWeight = FalseWeight; + } else { + ColdI = dyn_cast(SI->getFalseValue()); + HotWeight = TrueWeight; + } + if (ColdI) { + std::stack ColdSlice; + getExclBackwardsSlice(ColdI, ColdSlice); + InstructionCost SliceCost = 0; + while (!ColdSlice.empty()) { + SliceCost += TTI->getInstructionCost(ColdSlice.top(), + TargetTransformInfo::TCK_Latency); + ColdSlice.pop(); + } + // The colder the cold value operand of the select is the more expensive + // the cmov becomes for computing the cold value operand every time. Thus, + // the colder the cold operand is the more its cost counts. + // Get nearest integer cost adjusted for coldness. + InstructionCost AdjSliceCost = + divideNearest(SliceCost * HotWeight, TotalWeight); + if (AdjSliceCost >= + ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) + return true; + } + } + return false; +} + +// For a given source instruction, collect its backwards dependence slice +// consisting of instructions exclusively computed for the purpose of producing +// the operands of the source instruction. As an approximation +// (sufficiently-accurate in practice), we populate this set with the +// instructions of the backwards dependence slice that only have one-use and +// form an one-use chain that leads to the source instruction. +void SelectOptimize::getExclBackwardsSlice(Instruction *I, + std::stack &Slice, + bool ForSinking) { + SmallPtrSet Visited; + std::queue Worklist; + Worklist.push(I); + while (!Worklist.empty()) { + Instruction *II = Worklist.front(); + Worklist.pop(); + + // Avoid cycles. + if (!Visited.insert(II).second) + continue; + + if (!II->hasOneUse()) + continue; + + // Cannot soundly sink instructions with side-effects. + // Terminator or phi instructions cannot be sunk. + // Avoid sinking other select instructions (should be handled separetely). 
+    if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
+                       isa<SelectInst>(II) || isa<PHINode>(II)))
+      continue;
+
+    // Avoid considering instructions with less frequency than the source
+    // instruction (i.e., avoid colder code regions of the dependence slice).
+    if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
+      continue;
+
+    // Eligible one-use instruction added to the dependence slice.
+    Slice.push(II);
+
+    // Explore all the operands of the current instruction to expand the slice.
+    for (unsigned k = 0; k < II->getNumOperands(); ++k)
+      if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
+        Worklist.push(OpI);
+  }
+}
+
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t Max = std::max(TrueWeight, FalseWeight);
+    uint64_t Sum = TrueWeight + FalseWeight;
+    if (Sum != 0) {
+      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+      if (Probability > TTI->getPredictableBranchThreshold())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool SelectOptimize::checkLoopHeuristics(const Loop *L,
+                                         const CostInfo LoopCost[2]) {
+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the loop is more profitable than its predicated version.
+
+  if (DisableLoopLevelHeuristics)
+    return true;
+
+  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+                                   L->getHeader()->getFirstNonPHI());
+
+  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
+      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
+    ORmissL << "No select conversion in the loop due to no reduction of loop's "
+               "critical path. ";
+    ORE->emit(ORmissL);
+    return false;
+  }
+
+  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+  // Profitably converting to branches needs to reduce the loop's critical path
+  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
+  // relative gain of 12.5%).
+  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
+      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
+    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
+    ORmissL << "No select conversion in the loop due to small reduction of "
+               "loop's critical path. Gain="
+            << Gain[1].toString()
+            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
+    ORE->emit(ORmissL);
+    return false;
+  }
+
+  // If the loop's critical path involves loop-carried dependences, the gradient
+  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
+  // This check ensures that the latency reduction for the loop's critical path
+  // keeps decreasing with sufficient rate beyond the two analyzed loop
+  // iterations.
+  if (Gain[1] > Gain[0]) {
+    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
+                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
+    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
+      ORmissL << "No select conversion in the loop due to small gradient gain. "
+                 "GradientGain="
+              << GradientGain.toString() << "%. ";
+      ORE->emit(ORmissL);
+      return false;
+    }
+  }
+  // If the gain decreases it is not profitable to convert.
+  else if (Gain[1] < Gain[0]) {
+    ORmissL
+        << "No select conversion in the loop due to negative gradient gain. ";
+    ORE->emit(ORmissL);
+    return false;
+  }
+
+  // Non-predicated version of the loop is more profitable than its
+  // predicated version.
+  return true;
+}
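+
+// Worked example for the thresholds above (illustrative numbers, not from the
+// source): with PredCost = {20, 40} and NonPredCost = {14, 30} across the two
+// analyzed iterations, Gain = {6, 10}. The absolute gain (10 >= 4 cycles) and
+// the relative gain (10/40 = 25% >= 12.5%) both pass, but the gradient
+// 100 * (10 - 6) / (40 - 20) = 20% misses the 25% threshold, so conversion is
+// rejected.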
+
+// Computes instruction and loop-critical-path costs for both the predicated
+// and non-predicated version of the given loop.
+// Returns false if unable to compute these costs due to invalid cost of loop
+// instruction(s).
+bool SelectOptimize::computeLoopCosts(
+    const Loop *L, const SelectGroups &SIGroups,
+    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
+  const auto &SIset = getSIset(SIGroups);
+  // Compute instruction and loop-critical-path costs across two iterations for
+  // both predicated and non-predicated version.
+  const unsigned Iterations = 2;
+  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
+    // Cost of the loop's critical path.
+    CostInfo &MaxCost = LoopCost[Iter];
+    for (BasicBlock *BB : L->getBlocks()) {
+      for (const Instruction &I : *BB) {
+        if (I.isDebugOrPseudoInst())
+          continue;
+        // Compute the predicated and non-predicated cost of the instruction.
+        Scaled64 IPredCost = Scaled64::getZero(),
+                 INonPredCost = Scaled64::getZero();
+
+        // Assume infinite resources that allow to fully exploit the available
+        // instruction-level parallelism.
+        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
+        for (const Use &U : I.operands()) {
+          auto UI = dyn_cast<Instruction>(U.get());
+          if (!UI)
+            continue;
+          if (InstCostMap.count(UI)) {
+            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
+            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
+          }
+        }
+        auto ILatency = computeInstCost(&I);
+        if (!ILatency) {
+          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
+          ORmissL << "Invalid instruction cost preventing analysis and "
+                     "optimization of the inner-most loop containing this "
+                     "instruction. ";
+          ORE->emit(ORmissL);
+          return false;
+        }
+        IPredCost += Scaled64::get(ILatency.getValue());
+        INonPredCost += Scaled64::get(ILatency.getValue());
+
+        // For a select that can be converted to branch,
+        // compute its cost as a branch (non-predicated cost).
+        //
+        // BranchCost = PredictedPathCost + MispredictCost
+        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+        if (SIset.contains(&I)) {
+          auto SI = dyn_cast<SelectInst>(&I);
+
+          Scaled64 TrueOpCost = Scaled64::getZero(),
+                   FalseOpCost = Scaled64::getZero();
+          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+            if (InstCostMap.count(TI))
+              TrueOpCost = InstCostMap[TI].NonPredCost;
+          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+            if (InstCostMap.count(FI))
+              FalseOpCost = InstCostMap[FI].NonPredCost;
+          Scaled64 PredictedPathCost =
+              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+          Scaled64 CondCost = Scaled64::getZero();
+          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+            if (InstCostMap.count(CI))
+              CondCost = InstCostMap[CI].NonPredCost;
+          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
+
+          INonPredCost = PredictedPathCost + MispredictCost;
+        }
+
+        InstCostMap[&I] = {IPredCost, INonPredCost};
+        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+      }
+    }
+  }
+  return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+  SmallPtrSet<const Instruction *, 2> SIset;
+  for (const SelectGroup &ASI : SIGroups)
+    for (const SelectInst *SI : ASI)
+      SIset.insert(SI);
+  return SIset;
+}
+
+Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+  InstructionCost ICost =
+      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  if (auto OC = ICost.getValue())
+    return Optional<uint64_t>(*OC);
+  return Optional<uint64_t>(None);
+}
+
+ScaledNumber<uint64_t>
+SelectOptimize::getMispredictionCost(const SelectInst *SI,
+                                     const Scaled64 CondCost) {
+  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+  // Account for the default misprediction rate when using a branch
+  // (conservatively set to 25% by default).
+  uint64_t MispredictRate = MispredictDefaultRate;
+  // If the select condition is obviously predictable, then the misprediction
+  // rate is zero.
+  if (isSelectHighlyPredictable(SI))
+    MispredictRate = 0;
+
+  // CondCost is included to account for cases where the computation of the
+  // condition is part of a long dependence chain (potentially loop-carried)
+  // that would delay detection of a misprediction and increase its cost.
+  Scaled64 MispredictCost =
+      std::max(Scaled64::get(MispredictPenalty), CondCost) *
+      Scaled64::get(MispredictRate);
+  MispredictCost /= Scaled64::get(100);
+
+  return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+ScaledNumber<uint64_t>
+SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+                                     const SelectInst *SI) {
+  Scaled64 PredPathCost;
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t SumWeight = TrueWeight + FalseWeight;
+    if (SumWeight != 0) {
+      PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
+                     FalseCost * Scaled64::get(FalseWeight);
+      PredPathCost /= Scaled64::get(SumWeight);
+      return PredPathCost;
+    }
+  }
+  // Without branch weight metadata, we assume 75% for the one path and 25% for
+  // the other, and pick the result with the biggest cost.
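+  // Illustrative arithmetic (not in the original file): TrueCost = 2 and
+  // FalseCost = 6 give max(3*2 + 6, 3*6 + 2) / 4 = max(12, 20) / 4 = 5.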
+ PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, + FalseCost * Scaled64::get(3) + TrueCost); + PredPathCost /= Scaled64::get(4); + return PredPathCost; +} + +bool SelectOptimize::isSelectKindSupported(SelectInst *SI) { + bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); + if (VectorCond) + return false; + TargetLowering::SelectSupportKind SelectKind; + if (SI->getType()->isVectorTy()) + SelectKind = TargetLowering::ScalarCondVectorVal; + else + SelectKind = TargetLowering::ScalarValSelect; + return TLI->isSelectSupported(SelectKind); +} diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 041d7e5b4a4a..aa688d9dda3c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -35,7 +35,6 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -52,7 +51,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" @@ -426,6 +424,7 @@ namespace { SDValue visitREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); + SDValue visitAVG(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitMULO(SDNode *N); @@ -511,6 +510,7 @@ namespace { SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); + SDValue visitFP_TO_BF16(SDNode *N); SDValue visitVECREDUCE(SDNode *N); SDValue visitVPOp(SDNode *N); @@ -520,7 +520,9 @@ namespace { SDValue XformToShuffleWithZero(SDNode *N); bool reassociationCanBreakAddressingModePattern(unsigned Opc, - const SDLoc &DL, SDValue N0, + const SDLoc &DL, + SDNode *N, + SDValue N0, SDValue N1); SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); @@ -570,6 +572,8 @@ namespace { SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); + SDValue BuildSREMPow2(SDNode *N); + SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N); SDValue BuildLogBase2(SDValue V, const SDLoc &DL); SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); @@ -583,11 +587,11 @@ namespace { bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, - SDValue InnerPos, SDValue InnerNeg, + SDValue InnerPos, SDValue InnerNeg, bool HasPos, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, - SDValue InnerPos, SDValue InnerNeg, + SDValue InnerPos, SDValue InnerNeg, bool HasPos, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); @@ -665,9 +669,8 @@ namespace { /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). /// MulNode is the original multiply, AddNode is (add x, c1), /// and ConstNode is c2. 
-  bool isMulAddWithConstProfitable(SDNode *MulNode,
-                                   SDValue &AddNode,
-                                   SDValue &ConstNode);
+  bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
+                                   SDValue ConstNode);
 
   /// This is a helper function for visitAND and visitZERO_EXTEND.  Returns
   /// true if the (and (load x) c) pattern matches an extload.  ExtVT returns
@@ -880,8 +883,8 @@ void DAGCombiner::deleteAndRecombine(SDNode *N) {
 // We provide an Offset so that we can create bitwidths that won't overflow.
 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
   unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth());
-  LHS = LHS.zextOrSelf(Bits);
-  RHS = RHS.zextOrSelf(Bits);
+  LHS = LHS.zext(Bits);
+  RHS = RHS.zext(Bits);
 }
 
 // Return true if this node is a setcc, or is a select_cc
@@ -926,7 +929,7 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
 /// it is profitable to do so.
 bool DAGCombiner::isOneUseSetCC(SDValue N) const {
   SDValue N0, N1, N2;
-  if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+  if (isSetCCEquivalent(N, N0, N1, N2) && N->hasOneUse())
     return true;
   return false;
 }
@@ -996,6 +999,7 @@ static bool canSplitIdx(LoadSDNode *LD) {
 
 bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                              const SDLoc &DL,
+                                                             SDNode *N,
                                                              SDValue N0,
                                                              SDValue N1) {
   // Currently this only tries to ensure we don't undo the GEP splits done by
@@ -1004,33 +1008,62 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
   // (load/store (add, (add, x, offset1), offset2)) ->
   // (load/store (add, x, offset1+offset2)).
 
-  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
-    return false;
+  // (load/store (add, (add, x, y), offset2)) ->
+  // (load/store (add, (add, x, offset2), y)).
 
-  if (N0.hasOneUse())
+  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
     return false;
 
-  auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   auto *C2 = dyn_cast<ConstantSDNode>(N1);
-  if (!C1 || !C2)
+  if (!C2)
     return false;
 
-  const APInt &C1APIntVal = C1->getAPIntValue();
   const APInt &C2APIntVal = C2->getAPIntValue();
-  if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
+  if (C2APIntVal.getSignificantBits() > 64)
     return false;
 
-  const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
-  if (CombinedValueIntVal.getBitWidth() > 64)
-    return false;
-  const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
-
-  for (SDNode *Node : N0->uses()) {
-    auto LoadStore = dyn_cast<MemSDNode>(Node);
-    if (LoadStore) {
-      // Is x[offset2] already not a legal addressing mode? If so then
-      // reassociating the constants breaks nothing (we test offset2 because
-      // that's the one we hope to fold into the load or store).
+  if (auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+    if (N0.hasOneUse())
+      return false;
+
+    const APInt &C1APIntVal = C1->getAPIntValue();
+    const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
+    if (CombinedValueIntVal.getSignificantBits() > 64)
+      return false;
+    const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
+
+    for (SDNode *Node : N->uses()) {
+      if (auto *LoadStore = dyn_cast<MemSDNode>(Node)) {
+        // Is x[offset2] already not a legal addressing mode? If so then
+        // reassociating the constants breaks nothing (we test offset2 because
+        // that's the one we hope to fold into the load or store).
+        TargetLoweringBase::AddrMode AM;
+        AM.HasBaseReg = true;
+        AM.BaseOffs = C2APIntVal.getSExtValue();
+        EVT VT = LoadStore->getMemoryVT();
+        unsigned AS = LoadStore->getAddressSpace();
+        Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+        if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+          continue;
+
+        // Would x[offset1+offset2] still be a legal addressing mode?
+        AM.BaseOffs = CombinedValue;
+        if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+          return true;
+      }
+    }
+  } else {
+    if (auto *GA = dyn_cast<GlobalAddressSDNode>(N0.getOperand(1)))
+      if (GA->getOpcode() == ISD::GlobalAddress && TLI.isOffsetFoldingLegal(GA))
+        return false;
+
+    for (SDNode *Node : N->uses()) {
+      auto *LoadStore = dyn_cast<MemSDNode>(Node);
+      if (!LoadStore)
+        return false;
+
+      // Is x[offset2] a legal addressing mode? If so then
+      // reassociating the constants breaks address pattern
       TargetLoweringBase::AddrMode AM;
       AM.HasBaseReg = true;
       AM.BaseOffs = C2APIntVal.getSExtValue();
@@ -1038,13 +1071,9 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
       unsigned AS = LoadStore->getAddressSpace();
       Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
       if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
-        continue;
-
-      // Would x[offset1+offset2] still be a legal addressing mode?
-      AM.BaseOffs = CombinedValue;
-      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
-        return true;
+        return false;
     }
+    return true;
   }
 
   return false;
@@ -1072,11 +1101,51 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
     if (TLI.isReassocProfitable(DAG, N0, N1)) {
       // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
       //              iff (op x, c1) has one use
-      if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))
-        return DAG.getNode(Opc, DL, VT, OpNode, N01);
-      return SDValue();
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1);
+      return DAG.getNode(Opc, DL, VT, OpNode, N01);
+    }
+  }
+
+  // Check for repeated operand logic simplifications.
+ if (Opc == ISD::AND || Opc == ISD::OR) { + // (N00 & N01) & N00 --> N00 & N01 + // (N00 & N01) & N01 --> N00 & N01 + // (N00 | N01) | N00 --> N00 | N01 + // (N00 | N01) | N01 --> N00 | N01 + if (N1 == N00 || N1 == N01) + return N0; + } + if (Opc == ISD::XOR) { + // (N00 ^ N01) ^ N00 --> N01 + if (N1 == N00) + return N01; + // (N00 ^ N01) ^ N01 --> N00 + if (N1 == N01) + return N00; + } + + if (TLI.isReassocProfitable(DAG, N0, N1)) { + if (N1 != N01) { + // Reassociate if (op N00, N1) already exists + if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N00, N1})) { + // if Op (Op N00, N1), N01 already exists + // we need to stop reassociating to avoid an infinite loop + if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N01})) + return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N01); + } + } + + if (N1 != N00) { + // Reassociate if (op N01, N1) already exists + if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N01, N1})) { + // if Op (Op N01, N1), N00 already exists + // we need to stop reassociating to avoid an infinite loop + if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N00})) + return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N00); + } } } + return SDValue(); } @@ -1103,7 +1172,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; - To[0].getNode()->dump(&DAG); + To[0].dump(&DAG); dbgs() << " and " << NumTo - 1 << " other values\n"); for (unsigned i = 0, e = NumTo; i != e; ++i) assert((!To[i].getNode() || @@ -1115,10 +1184,8 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { - if (To[i].getNode()) { - AddToWorklist(To[i].getNode()); - AddUsersToWorklist(To[i].getNode()); - } + if (To[i].getNode()) + AddToWorklistWithUsers(To[i].getNode()); } } @@ -1134,9 +1201,8 @@ void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace the old value with the new one. ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); - dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.dump(&DAG); + dbgs() << "\nWith: "; TLO.New.dump(&DAG); dbgs() << '\n'); // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. @@ -1149,7 +1215,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node.
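The doesNodeExist checks above guard against a rewrite cycle: reassociating toward a node that is already in the DAG is profitable for CSE, but if the rebuilt top-level node also already exists, the mirror rewrite would fire on the next worklist visit and undo this one forever. A toy model of the guard, with expression strings standing in for CSE'd nodes:

#include <cassert>
#include <set>
#include <string>

int main() {
  // Both association orders of the same expression are already present.
  std::set<std::string> DAG = {"((x+c)+y)", "((x+y)+c)"};
  // Proposed rewrite of ((x+c)+y); re-creating a form that already exists
  // would just ping-pong, so the combine must be skipped.
  std::string Candidate = "((x+y)+c)";
  bool ShouldRewrite = DAG.count(Candidate) == 0;
  assert(!ShouldRewrite);
}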
- if (TLO.Old.getNode()->use_empty()) + if (TLO.Old->use_empty()) deleteAndRecombine(TLO.Old.getNode()); } @@ -1196,7 +1262,7 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; - Trunc.getNode()->dump(&DAG); dbgs() << '\n'); + Trunc.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); @@ -1295,7 +1361,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); bool Replace0 = false; SDValue N0 = Op.getOperand(0); @@ -1322,7 +1388,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { // If operands have a use ordering, make sure we deal with // predecessor first. - if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { + if (Replace0 && Replace1 && N0->isPredecessorOf(N1.getNode())) { std::swap(N0, N1); std::swap(NN0, NN1); } @@ -1363,11 +1429,10 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); bool Replace = false; SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) @@ -1379,6 +1444,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { return SDValue(); SDLoc DL(Op); + SDValue N1 = Op.getOperand(1); SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); @@ -1414,7 +1480,7 @@ SDValue DAGCombiner::PromoteExtend(SDValue Op) { // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); } return SDValue(); @@ -1455,7 +1521,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; - Result.getNode()->dump(&DAG); dbgs() << '\n'); + Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); @@ -1569,9 +1635,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { RV.getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); - LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << " ... 
into: "; RV.dump(&DAG)); - if (N->getNumValues() == RV.getNode()->getNumValues()) + if (N->getNumValues() == RV->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && @@ -1635,6 +1701,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::UREM: return visitREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: return visitAVG(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: @@ -1724,6 +1794,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); + case ISD::FP_TO_BF16: return visitFP_TO_BF16(N); case ISD::FREEZE: return visitFREEZE(N); case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: @@ -2072,8 +2143,9 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, return false; VT = ST->getMemoryVT(); AS = ST->getAddressSpace(); - } else + } else { return false; + } TargetLowering::AddrMode AM; if (N->getOpcode() == ISD::ADD) { @@ -2094,17 +2166,100 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, else // [reg +/- reg] AM.Scale = 1; - } else + } else { return false; + } return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, VT.getTypeForEVT(*DAG.getContext()), AS); } +/// This inverts a canonicalization in IR that replaces a variable select arm +/// with an identity constant. Codegen improves if we re-use the variable +/// operand rather than load a constant. This can also be converted into a +/// masked vector operation if the target supports it. +static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG, + bool ShouldCommuteOperands) { + // Match a select as operand 1. The identity constant that we are looking for + // is only valid as operand 1 of a non-commutative binop. + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + if (ShouldCommuteOperands) + std::swap(N0, N1); + + // TODO: Should this apply to scalar select too? + if (!N1.hasOneUse() || N1.getOpcode() != ISD::VSELECT) + return SDValue(); + + unsigned Opcode = N->getOpcode(); + EVT VT = N->getValueType(0); + SDValue Cond = N1.getOperand(0); + SDValue TVal = N1.getOperand(1); + SDValue FVal = N1.getOperand(2); + + // TODO: The cases should match with IR's ConstantExpr::getBinOpIdentity(). + // TODO: Target-specific opcodes could be added. Ex: "isCommutativeBinOp()". + // TODO: With fast-math (NSZ), allow the opposite-sign form of zero? + auto isIdentityConstantForOpcode = [](unsigned Opcode, SDValue V) { + if (ConstantFPSDNode *C = isConstOrConstSplatFP(V)) { + switch (Opcode) { + case ISD::FADD: // X + -0.0 --> X + return C->isZero() && C->isNegative(); + case ISD::FSUB: // X - 0.0 --> X + return C->isZero() && !C->isNegative(); + case ISD::FMUL: // X * 1.0 --> X + case ISD::FDIV: // X / 1.0 --> X + return C->isExactlyValue(1.0); + } + } + if (ConstantSDNode *C = isConstOrConstSplat(V)) { + switch (Opcode) { + case ISD::ADD: // X + 0 --> X + case ISD::SUB: // X - 0 --> X + case ISD::SHL: // X << 0 --> X + case ISD::SRA: // X s>> 0 --> X + case ISD::SRL: // X u>> 0 --> X + return C->isZero(); + case ISD::MUL: // X * 1 --> X + return C->isOne(); + } + } + return false; + }; + + // This transform increases uses of N0, so freeze it to be safe. 
+ // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal) + if (isIdentityConstantForOpcode(Opcode, TVal)) { + SDValue F0 = DAG.getFreeze(N0); + SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags()); + return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO); + } + // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0 + if (isIdentityConstantForOpcode(Opcode, FVal)) { + SDValue F0 = DAG.getFreeze(N0); + SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags()); + return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0); + } + + return SDValue(); +} + SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 && "Unexpected binary operator"); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + auto BinOpcode = BO->getOpcode(); + EVT VT = BO->getValueType(0); + if (TLI.shouldFoldSelectWithIdentityConstant(BinOpcode, VT)) { + if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false)) + return Sel; + + if (TLI.isCommutativeBinOp(BO->getOpcode())) + if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true)) + return Sel; + } + // Don't do this unless the old select is going away. We want to eliminate the // binary operator, not replace a binop with a select. // TODO: Handle ISD::SELECT_CC. @@ -2133,7 +2288,6 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { // propagate non constant operands into select. I.e.: // and (select Cond, 0, -1), X --> select Cond, 0, X // or X, (select Cond, -1, 0) --> select Cond, -1, X - auto BinOpcode = BO->getOpcode(); bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) && @@ -2145,8 +2299,6 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { !DAG.isConstantFPBuildVectorOrConstantFP(CBO)) return SDValue(); - EVT VT = BO->getValueType(0); - // We have a select-of-constants followed by a binary operator with a // constant. Eliminate the binop by pulling the constant math into the select. // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO @@ -2249,6 +2401,15 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static bool isADDLike(SDValue V, const SelectionDAG &DAG) { + unsigned Opcode = V.getOpcode(); + if (Opcode == ISD::OR) + return DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)); + if (Opcode == ISD::XOR) + return isMinSignedConstant(V.getOperand(1)); + return false; +} + /// Try to fold a node that behaves like an ADD (note that N isn't necessarily /// an ISD::ADD here, it could for example be an ISD::OR if we know that there /// are no common bits set in the operands). 
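isADDLike captures two "this is really an addition" cases: an OR whose operands share no set bits, and an XOR against the minimum signed value, which can only flip the sign bit. Both are easy to verify exhaustively at 8 bits, a scaled-down model of the DAG-level reasoning:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned i = 0; i < 256; ++i) {
    uint8_t v = (uint8_t)i;
    // Disjoint-bits OR is ADD: a low-nibble constant into a high-nibble value.
    assert((uint8_t)((v & 0xF0) | 0x05) == (uint8_t)((v & 0xF0) + 0x05));
    // XOR with the min signed value (0x80) flips only the sign bit, which is
    // exactly what adding 0x80 does in two's complement (mod 256).
    assert((uint8_t)(v ^ 0x80) == (uint8_t)(v + 0x80));
  }
}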
@@ -2287,66 +2448,60 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { if (isNullConstant(N1)) return N0; - if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { + if (N0.getOpcode() == ISD::SUB) { + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + // fold ((A-c1)+c2) -> (A+(c2-c1)) - if (N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) { - SDValue Sub = - DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)}); - assert(Sub && "Constant folding failed"); + if (SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N01})) return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub); - } // fold ((c1-A)+c2) -> (c1+c2)-A - if (N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { - SDValue Add = - DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)}); - assert(Add && "Constant folding failed"); + if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N00})) return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); - } + } - // add (sext i1 X), 1 -> zext (not i1 X) - // We don't transform this pattern: - // add (zext i1 X), -1 -> sext (not i1 X) - // because most (?) targets generate better code for the zext form. - if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && - isOneOrOneSplat(N1)) { - SDValue X = N0.getOperand(0); - if ((!LegalOperations || - (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && - TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && - X.getScalarValueSizeInBits() == 1) { - SDValue Not = DAG.getNOT(DL, X, X.getValueType()); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); - } + // add (sext i1 X), 1 -> zext (not i1 X) + // We don't transform this pattern: + // add (zext i1 X), -1 -> sext (not i1 X) + // because most (?) targets generate better code for the zext form. + if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && + isOneOrOneSplat(N1)) { + SDValue X = N0.getOperand(0); + if ((!LegalOperations || + (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && + TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && + X.getScalarValueSizeInBits() == 1) { + SDValue Not = DAG.getNOT(DL, X, X.getValueType()); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } + } - // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is - // equivalent to (add x, c0). - if (N0.getOpcode() == ISD::OR && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) && - DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { - if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, - {N1, N0.getOperand(1)})) - return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); - } + // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) + // iff (or x, c0) is equivalent to (add x, c0). + // Fold (add (xor x, c0), c1) -> (add x, (c0 + c1)) + // iff (xor x, c0) is equivalent to (add x, c0). 
+ if (isADDLike(N0, DAG)) { + SDValue N01 = N0.getOperand(1); + if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N01})) + return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add); } if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate add - if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) { + if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) { if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) return RADD; // Reassociate (add (or x, c), y) -> (add add(x, y), c)) if (or x, c) is // equivalent to (add x, c). + // Reassociate (add (xor x, c), y) -> (add add(x, y), c)) if (xor x, c) is + // equivalent to (add x, c). auto ReassociateAddOr = [&](SDValue N0, SDValue N1) { - if (N0.getOpcode() == ISD::OR && N0.hasOneUse() && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) && - DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { + if (isADDLike(N0, DAG) && N0.hasOneUse() && + isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) { return DAG.getNode(ISD::ADD, DL, VT, DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), N0.getOperand(1)); @@ -2406,7 +2561,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { N1.getOperand(1)); // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant - if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { + if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && + N0->hasOneUse() && N1->hasOneUse()) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); @@ -2459,8 +2615,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { // add (add x, y), 1 // And if the target does not like this form then turn into: // sub y, (xor x, -1) - if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && - N0.getOpcode() == ISD::ADD) { + if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD && + N0.hasOneUse()) { SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not); @@ -2468,7 +2624,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { } // (x - y) + -1 -> add (xor y, -1), x - if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && + if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isAllOnesOrAllOnesSplat(N1)) { SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1); return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); @@ -2565,7 +2721,8 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) { // fold vector ops if (VT.isVector()) { - // TODO SimplifyVBinOp + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; // fold (add_sat x, 0) -> x, vector edition if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) @@ -2611,7 +2768,7 @@ static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) return SDValue(); - EVT VT = V.getNode()->getValueType(0); + EVT VT = V->getValueType(0); if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT)) return SDValue(); @@ -2664,27 +2821,27 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1, // add (add x, 1), y // And if the target does not like this form then turn into: // sub y, (xor x, -1) - if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && - N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) { + if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD && + N0.hasOneUse() && 
isOneOrOneSplat(N0.getOperand(1))) { SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N1, Not); } - // Hoist one-use subtraction by non-opaque constant: - // (x - C) + y -> (x + y) - C - // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. - if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1); - return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); - } - // Hoist one-use subtraction from non-opaque constant: - // (C - x) + y -> (y - x) + C - if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { - SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); - return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0)); + if (N0.getOpcode() == ISD::SUB && N0.hasOneUse()) { + // Hoist one-use subtraction by non-opaque constant: + // (x - C) + y -> (x + y) - C + // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. + if (isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1); + return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); + } + // Hoist one-use subtraction from non-opaque constant: + // (C - x) + y -> (y - x) + C + if (isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); + return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0)); + } } // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1' @@ -3060,21 +3217,26 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with // a single path for carry/borrow out propagation: static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, - SDValue Carry0, SDValue Carry1, SDNode *N) { - if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) + SDValue N0, SDValue N1, SDNode *N) { + SDValue Carry0 = getAsCarry(TLI, N0); + if (!Carry0) return SDValue(); + SDValue Carry1 = getAsCarry(TLI, N1); + if (!Carry1) + return SDValue(); + unsigned Opcode = Carry0.getOpcode(); if (Opcode != Carry1.getOpcode()) return SDValue(); if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) return SDValue(); - // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the - // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in - // the above ASCII art.) - if (Carry1.getOperand(0) != Carry0.getValue(0) && - Carry1.getOperand(1) != Carry0.getValue(0)) + // Canonicalize the add/sub of A and B (the top node in the above ASCII art) + // as Carry0 and the add/sub of the carry in as Carry1 (the middle node). + if (Carry1.getNode()->isOperandOf(Carry0.getNode())) std::swap(Carry0, Carry1); + + // Check if nodes are connected in expected way. if (Carry1.getOperand(0) != Carry0.getValue(0) && Carry1.getOperand(1) != Carry0.getValue(0)) return SDValue(); @@ -3254,9 +3416,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + auto PeekThroughFreeze = [](SDValue N) { + if (N->getOpcode() == ISD::FREEZE && N.hasOneUse()) + return N->getOperand(0); + return N; + }; + // fold (sub x, x) -> 0 // FIXME: Refactor this and xor and other similar operations together. 
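Several of the constant reassociations visitSUB performs below are pure modular-arithmetic identities, so they can be sanity-checked with wrapping unsigned math:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xDEADBEEFu, C1 = 123456u, C2 = 987654u;
  assert((A + C1) - C2 == A + (C1 - C2)); // (A+C1)-C2 -> A+(C1-C2)
  assert(C2 - (A + C1) == (C2 - C1) - A); // C2-(A+C1) -> (C2-C1)-A
  assert((A - C1) - C2 == A - (C1 + C2)); // (A-C1)-C2 -> A-(C1+C2)
  assert((C1 - A) - C2 == (C1 - C2) - A); // (c1-A)-c2 -> (c1-c2)-A
}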
- if (N0 == N1) + if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1)) return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); // fold (sub c1, c2) -> c3 @@ -3314,7 +3482,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } // Convert 0 - abs(x). - if (N1->getOpcode() == ISD::ABS && + if (N1.getOpcode() == ISD::ABS && N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::ABS, VT)) if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true)) return Result; @@ -3352,44 +3520,31 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0.getOperand(0); // fold (A+C1)-C2 -> A+(C1-C2) - if (N0.getOpcode() == ISD::ADD && - isConstantOrConstantVector(N1, /* NoOpaques */ true) && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { - SDValue NewC = - DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1}); - assert(NewC && "Constant folding failed"); - return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC); + if (N0.getOpcode() == ISD::ADD) { + SDValue N01 = N0.getOperand(1); + if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N01, N1})) + return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC); } // fold C2-(A+C1) -> (C2-C1)-A if (N1.getOpcode() == ISD::ADD) { SDValue N11 = N1.getOperand(1); - if (isConstantOrConstantVector(N0, /* NoOpaques */ true) && - isConstantOrConstantVector(N11, /* NoOpaques */ true)) { - SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}); - assert(NewC && "Constant folding failed"); + if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11})) return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0)); - } } // fold (A-C1)-C2 -> A-(C1+C2) - if (N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N1, /* NoOpaques */ true) && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { - SDValue NewC = - DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1}); - assert(NewC && "Constant folding failed"); - return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC); + if (N0.getOpcode() == ISD::SUB) { + SDValue N01 = N0.getOperand(1); + if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N01, N1})) + return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC); } // fold (c1-A)-c2 -> (c1-c2)-A - if (N0.getOpcode() == ISD::SUB && - isConstantOrConstantVector(N1, /* NoOpaques */ true) && - isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) { - SDValue NewC = - DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1}); - assert(NewC && "Constant folding failed"); - return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1)); + if (N0.getOpcode() == ISD::SUB) { + SDValue N00 = N0.getOperand(0); + if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N00, N1})) + return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1)); } // fold ((A+(B+or-C))-B) -> A+or-C @@ -3584,6 +3739,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } } + // As with the previous fold, prefer add for more folding potential. 
+ // Subtracting SMIN/0 is the same as adding SMIN/0: + // N0 - (X << BW-1) --> N0 + (X << BW-1) + if (N1.getOpcode() == ISD::SHL) { + ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1)); + if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1) + return DAG.getNode(ISD::ADD, DL, VT, N1, N0); + } + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) { // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry) if (SDValue Carry = getAsCarry(TLI, N0)) { @@ -3619,7 +3783,8 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) { // fold vector ops if (VT.isVector()) { - // TODO SimplifyVBinOp + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; // fold (sub_sat x, 0) -> x, vector edition if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) @@ -3770,19 +3935,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // fold (mul x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // fold (mul c1, c2) -> c1*c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, DL, VT, {N0, N1})) return C; // canonicalize constant to RHS (vector doesn't have to splat) if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); + return DAG.getNode(ISD::MUL, DL, VT, N1, N0); bool N1IsConst = false; bool N1IsOpaqueConst = false; @@ -3790,7 +3956,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // fold vector ops if (VT.isVector()) { - if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); @@ -3817,17 +3983,14 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { return NewSel; // fold (mul x, -1) -> 0-x - if (N1IsConst && ConstValue1.isAllOnes()) { - SDLoc DL(N); + if (N1IsConst && ConstValue1.isAllOnes()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); - } // fold (mul x, (1 << c)) -> x << c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1) && (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { - SDLoc DL(N); SDValue LogBase2 = BuildLogBase2(N1, DL); EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); @@ -3837,7 +4000,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) { unsigned Log2Val = (-ConstValue1).logBase2(); - SDLoc DL(N); // FIXME: If the input is something that is easily negated (e.g. a // single-use add), we should put the negate there. 
return DAG.getNode(ISD::SUB, DL, VT, @@ -3882,7 +4044,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { ShAmt += TZeros; assert(ShAmt < VT.getScalarSizeInBits() && "multiply-by-constant generated out of bounds shift"); - SDLoc DL(N); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT)); SDValue R = @@ -3897,12 +4058,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } // (mul (shl X, c1), c2) -> (mul X, c2 << c1) - if (N0.getOpcode() == ISD::SHL && - isConstantOrConstantVector(N1, /* NoOpaques */ true) && - isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { - SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1)); - if (isConstantOrConstantVector(C3)) - return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3); + if (N0.getOpcode() == ISD::SHL) { + SDValue N01 = N0.getOperand(1); + if (SDValue C3 = DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N1, N01})) + return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), C3); } // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one @@ -3912,18 +4071,17 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && - isConstantOrConstantVector(N0.getOperand(1)) && - N0.getNode()->hasOneUse()) { + isConstantOrConstantVector(N0.getOperand(1)) && N0->hasOneUse()) { Sh = N0; Y = N1; } else if (N1.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1.getOperand(1)) && - N1.getNode()->hasOneUse()) { + N1->hasOneUse()) { Sh = N1; Y = N0; } if (Sh.getNode()) { - SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y); - return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1)); + SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, Sh.getOperand(0), Y); + return DAG.getNode(ISD::SHL, DL, VT, Mul, Sh.getOperand(1)); } } @@ -3932,18 +4090,17 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { N0.getOpcode() == ISD::ADD && DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) && isMulAddWithConstProfitable(N, N0, N1)) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, - DAG.getNode(ISD::MUL, SDLoc(N0), VT, - N0.getOperand(0), N1), - DAG.getNode(ISD::MUL, SDLoc(N1), VT, - N0.getOperand(1), N1)); + return DAG.getNode( + ISD::ADD, DL, VT, + DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1), + DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1)); // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)). if (N0.getOpcode() == ISD::VSCALE) if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) { const APInt &C0 = N0.getConstantOperandAPInt(0); const APInt &C1 = NC1->getAPIntValue(); - return DAG.getVScale(SDLoc(N), VT, C0 * C1); + return DAG.getVScale(DL, VT, C0 * C1); } // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)). 
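visitMUL's strength reductions are likewise modular identities; on a wrapping 32-bit integer:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xCAFEF00Du;
  assert(x * 0xFFFFFFFFu == 0u - x);        // mul x, -1 -> 0-x
  assert(x * 8u == x << 3);                 // mul x, (1 << c) -> x << c
  assert(x * (0u - 8u) == 0u - (x << 3));   // mul x, -(1 << c) -> -(x << c)
  assert((x << 2) * 10u == x * (10u << 2)); // mul (shl X, c1), c2 -> mul X, c2 << c1
}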
@@ -3952,7 +4109,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if (ISD::isConstantSplatVector(N1.getNode(), MulVal)) { const APInt &C0 = N0.getConstantOperandAPInt(0); APInt NewStep = C0 * MulVal; - return DAG.getStepVector(SDLoc(N), VT, NewStep); + return DAG.getStepVector(DL, VT, NewStep); } // Fold ((mul x, 0/undef) -> 0, @@ -3974,7 +4131,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) && ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) { assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector"); - SDLoc DL(N); EVT LegalSVT = N1.getOperand(0).getValueType(); SDValue Zero = DAG.getConstant(0, DL, LegalSVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT); @@ -3987,7 +4143,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } // reassociate mul - if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags())) + if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags())) return RMUL; return SDValue(); @@ -4050,7 +4206,7 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue combined; - for (SDNode *User : Op0.getNode()->uses()) { + for (SDNode *User : Op0->uses()) { if (User == Node || User->getOpcode() == ISD::DELETED_NODE || User->use_empty()) continue; @@ -4190,12 +4346,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { - SDLoc DL(N); - EVT VT = N->getValueType(0); - EVT CCVT = getSetCCResultType(VT); - unsigned BitWidth = VT.getScalarSizeInBits(); - +static bool isDivisorPowerOfTwo(SDValue Divisor) { // Helper for determining whether a value is a power-2 constant scalar or a // vector of such elements. auto IsPowerOfTwo = [](ConstantSDNode *C) { @@ -4208,11 +4359,20 @@ SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { return false; }; + return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo); +} + +SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT CCVT = getSetCCResultType(VT); + unsigned BitWidth = VT.getScalarSizeInBits(); + // fold (sdiv X, pow2) -> simple ops after legalize // FIXME: We check for the exact bit here because the generic lowering gives // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. - if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) { + if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; @@ -4368,6 +4528,16 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { return SDValue(); } +SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) { + if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) && + !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) { + // Target-specific implementation of srem x, pow2. 
+ if (SDValue Res = BuildSREMPow2(N)) + return Res; + } + return SDValue(); +} + // handles ISD::SREM and ISD::UREM SDValue DAGCombiner::visitREM(SDNode *N) { unsigned Opcode = N->getOpcode(); @@ -4384,10 +4554,13 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) return C; - // fold (urem X, -1) -> select(X == -1, 0, x) - if (!isSigned && N1C && N1C->isAllOnes()) - return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), - DAG.getConstant(0, DL, VT), N0); + // fold (urem X, -1) -> select(FX == -1, 0, FX) + // Freeze the numerator to avoid a miscompile with an undefined value. + if (!isSigned && N1C && N1C->isAllOnes()) { + SDValue F0 = DAG.getFreeze(N0); + SDValue EqualsNeg1 = DAG.getSetCC(DL, CCVT, F0, N1, ISD::SETEQ); + return DAG.getSelect(DL, VT, EqualsNeg1, DAG.getConstant(0, DL, VT), F0); + } if (SDValue V = simplifyDivRem(N, DAG)) return V; @@ -4428,6 +4601,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) { // combine will not return a DIVREM. Regardless, checking cheapness here // makes sense since the simplification results in fatter code. if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { + if (isSigned) { + // check if we can build faster implementation for srem + if (SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N)) + return OptimizedRem; + } + SDValue OptimizedDiv = isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) { @@ -4587,6 +4766,46 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitAVG(SDNode *N) { + unsigned Opcode = N->getOpcode(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // fold (avg c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0); + + if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + + // fold (avgfloor x, 0) -> x >> 1 + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) { + if (Opcode == ISD::AVGFLOORS) + return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT)); + if (Opcode == ISD::AVGFLOORU) + return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT)); + } + } + + // fold (avg x, undef) -> x + if (N0.isUndef()) + return N1; + if (N1.isUndef()) + return N0; + + // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1 + + return SDValue(); +} + /// Perform optimizations common to nodes that compute two values. LoOp and HiOp /// give the opcodes for the two computations that are being performed. Return /// true if a simplification was made. @@ -4745,7 +4964,9 @@ SDValue DAGCombiner::visitMULO(SDNode *N) { DAG.getConstant(0, DL, CarryVT)); // (mulo x, 2) -> (addo x, x) - if (N1C && N1C->getAPIntValue() == 2) + // FIXME: This needs a freeze. + if (N1C && N1C->getAPIntValue() == 2 && + (!IsSigned || VT.getScalarSizeInBits() > 2)) return DAG.getNode(IsSigned ? 
ISD::SADDO : ISD::UADDO, DL, N->getVTList(), N0, N0); @@ -4802,8 +5023,7 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, return 0; const APInt &C1 = N1C->getAPIntValue(); const APInt &C2 = N3C->getAPIntValue(); - if (C1.getBitWidth() < C2.getBitWidth() || - C1 != C2.sextOrSelf(C1.getBitWidth())) + if (C1.getBitWidth() < C2.getBitWidth() || C1 != C2.sext(C1.getBitWidth())) return 0; return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0); }; @@ -4910,7 +5130,7 @@ static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, const APInt &C1 = N1C->getAPIntValue(); const APInt &C3 = N3C->getAPIntValue(); if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() || - C1 != C3.zextOrSelf(C1.getBitWidth())) + C1 != C3.zext(C1.getBitWidth())) return SDValue(); unsigned BW = (C1 + 1).exactLogBase2(); @@ -4940,6 +5160,10 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) return C; + // If the operands are the same, this is a no-op. + if (N0 == N1) + return N0; + // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) @@ -5245,29 +5469,27 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, } // Turn compare of constants whose difference is 1 bit into add+and+setcc. - // TODO - support non-uniform vector amounts. if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) { // Match a shared variable operand and 2 non-opaque constant operands. - ConstantSDNode *C0 = isConstOrConstSplat(LR); - ConstantSDNode *C1 = isConstOrConstSplat(RR); - if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) { + auto MatchDiffPow2 = [&](ConstantSDNode *C0, ConstantSDNode *C1) { + // The difference of the constants must be a single bit. const APInt &CMax = APIntOps::umax(C0->getAPIntValue(), C1->getAPIntValue()); const APInt &CMin = APIntOps::umin(C0->getAPIntValue(), C1->getAPIntValue()); - // The difference of the constants must be a single bit. 
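The power-of-two-difference fold in foldLogicOfSetCCs works because subtracting CMin maps the two matched constants to 0 and CMax-CMin, exactly the values annihilated by masking with ~(CMax-CMin). An exhaustive 8-bit check of the setne/setne form (constants chosen arbitrarily):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t CMin = 20, CMax = 28; // CMax - CMin == 8, a power of two
  const uint8_t NotDiff = (uint8_t)~(CMax - CMin);
  for (unsigned i = 0; i < 256; ++i) {
    uint8_t X = (uint8_t)i;
    bool BothNe = X != CMax && X != CMin;
    bool Folded = ((uint8_t)(X - CMin) & NotDiff) != 0;
    assert(BothNe == Folded);
  }
}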
- if ((CMax - CMin).isPowerOf2()) { - // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) --> - // setcc ((sub X, CMin), ~(CMax - CMin)), 0, ne/eq - SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR); - SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR); - SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min); - SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min); - SDValue Mask = DAG.getNOT(DL, Diff, OpVT); - SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask); - SDValue Zero = DAG.getConstant(0, DL, OpVT); - return DAG.getSetCC(DL, VT, And, Zero, CC0); - } + return !C0->isOpaque() && !C1->isOpaque() && (CMax - CMin).isPowerOf2(); + }; + if (LL == RL && ISD::matchBinaryPredicate(LR, RR, MatchDiffPow2)) { + // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) --> + // setcc ((sub X, CMin), ~(CMax - CMin)), 0, ne/eq + SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR); + SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR); + SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min); + SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min); + SDValue Mask = DAG.getNOT(DL, Diff, OpVT); + SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask); + SDValue Zero = DAG.getConstant(0, DL, OpVT); + return DAG.getSetCC(DL, VT, And, Zero, CC0); } } } @@ -5769,6 +5991,9 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) { if (ShiftAmt.uge(VTBitWidth)) return SDValue(); + if (!TLI.hasBitTest(Srl.getOperand(0), Srl.getOperand(1))) + return SDValue(); + // Turn this into a bit-test pattern using mask op + setcc: // and (not (srl X, C)), 1 --> (and X, 1< ((X0 | X1) << Y) | Z +static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp, + SelectionDAG &DAG) { + unsigned LogicOpcode = N->getOpcode(); + assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR || + LogicOpcode == ISD::XOR) + && "Expected bitwise logic operation"); + + if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse()) + return SDValue(); + + // Match another bitwise logic op and a shift. + unsigned ShiftOpcode = ShiftOp.getOpcode(); + if (LogicOp.getOpcode() != LogicOpcode || + !(ShiftOpcode == ISD::SHL || ShiftOpcode == ISD::SRL || + ShiftOpcode == ISD::SRA)) + return SDValue(); + + // Match another shift op inside the first logic operand. Handle both commuted + // possibilities. 
+ // LOGIC (LOGIC (SH X0, Y), Z), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z + // LOGIC (LOGIC Z, (SH X0, Y)), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z + SDValue X1 = ShiftOp.getOperand(0); + SDValue Y = ShiftOp.getOperand(1); + SDValue X0, Z; + if (LogicOp.getOperand(0).getOpcode() == ShiftOpcode && + LogicOp.getOperand(0).getOperand(1) == Y) { + X0 = LogicOp.getOperand(0).getOperand(0); + Z = LogicOp.getOperand(1); + } else if (LogicOp.getOperand(1).getOpcode() == ShiftOpcode && + LogicOp.getOperand(1).getOperand(1) == Y) { + X0 = LogicOp.getOperand(1).getOperand(0); + Z = LogicOp.getOperand(0); + } else { + return SDValue(); + } + + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue LogicX = DAG.getNode(LogicOpcode, DL, VT, X0, X1); + SDValue NewShift = DAG.getNode(ShiftOpcode, DL, VT, LogicX, Y); + return DAG.getNode(LogicOpcode, DL, VT, NewShift, Z); +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -5848,27 +6120,25 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) return N0; - // fold (and (masked_load) (build_vec (x, ...))) to zext_masked_load + // fold (and (masked_load) (splat_vec (x, ...))) to zext_masked_load auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0); - auto *BVec = dyn_cast<BuildVectorSDNode>(N1); - if (MLoad && BVec && MLoad->getExtensionType() == ISD::EXTLOAD && - N0.hasOneUse() && N1.hasOneUse()) { + ConstantSDNode *Splat = isConstOrConstSplat(N1, true, true); + if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && N0.hasOneUse() && + Splat && N1.hasOneUse()) { EVT LoadVT = MLoad->getMemoryVT(); EVT ExtVT = VT; if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) { // For this AND to be a zero extension of the masked load the elements // of the BuildVec must mask the bottom bits of the extended element // type - if (ConstantSDNode *Splat = BVec->getConstantSplatNode()) { - uint64_t ElementSize = - LoadVT.getVectorElementType().getScalarSizeInBits(); - if (Splat->getAPIntValue().isMask(ElementSize)) { - return DAG.getMaskedLoad( - ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(), - MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(), - LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(), - ISD::ZEXTLOAD, MLoad->isExpandingLoad()); - } + uint64_t ElementSize = + LoadVT.getVectorElementType().getScalarSizeInBits(); + if (Splat->getAPIntValue().isMask(ElementSize)) { + return DAG.getMaskedLoad( + ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(), + MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(), + LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(), + ISD::ZEXTLOAD, MLoad->isExpandingLoad()); } } } @@ -5944,7 +6214,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // This can be a pure constant or a vector splat, in which case we treat the // vector as a scalar and use the splat value.
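foldLogicOfShifts is justified by shifts distributing over bitwise logic when the shift amounts agree; for instance, checked over all 32-bit shift amounts:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X0 = 0x1234u, X1 = 0xABCDu, Z = 0xF0F0F0F0u;
  for (unsigned Y = 0; Y < 32; ++Y) {
    // LOGIC (LOGIC (SH X0, Y), Z), (SH X1, Y) == LOGIC (SH (LOGIC X0, X1), Y), Z
    assert((((X0 << Y) | Z) | (X1 << Y)) == (((X0 | X1) << Y) | Z));
    assert((((X0 >> Y) ^ Z) ^ (X1 >> Y)) == (((X0 ^ X1) >> Y) ^ Z));
    assert((((X0 << Y) & Z) & (X1 << Y)) == (((X0 & X1) << Y) & Z));
  }
}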
APInt Constant = APInt::getZero(1); - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + if (const ConstantSDNode *C = isConstOrConstSplat(N1)) { Constant = C->getAPIntValue(); } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) { APInt SplatValue, SplatUndef; @@ -6084,6 +6354,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) return V; + if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG)) + return R; + if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG)) + return R; + // Masking the negated extension of a boolean is just the zero-extended // boolean: // and (sub 0, zext(bool X)), 1 --> zext(bool X) @@ -6142,9 +6417,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) return Shifts; - if (TLI.hasBitTest(N0, N1)) - if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) - return V; + if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) + return V; // Recognize the following pattern: // @@ -6194,11 +6468,11 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool LookPassAnd0 = false; bool LookPassAnd1 = false; if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) - std::swap(N0, N1); + std::swap(N0, N1); if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) - std::swap(N0, N1); + std::swap(N0, N1); if (N0.getOpcode() == ISD::AND) { - if (!N0.getNode()->hasOneUse()) + if (!N0->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); // Also handle 0xffff since the LHS is guaranteed to have zeros there. @@ -6211,7 +6485,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, } if (N1.getOpcode() == ISD::AND) { - if (!N1.getNode()->hasOneUse()) + if (!N1->hasOneUse()) return SDValue(); ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); if (!N11C || N11C->getZExtValue() != 0xFF) @@ -6224,7 +6498,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); - if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) + if (!N0->hasOneUse() || !N1->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); @@ -6237,7 +6511,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) SDValue N00 = N0->getOperand(0); if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { - if (!N00.getNode()->hasOneUse()) + if (!N00->hasOneUse()) return SDValue(); ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1)); if (!N001C || N001C->getZExtValue() != 0xFF) @@ -6248,7 +6522,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, SDValue N10 = N1->getOperand(0); if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { - if (!N10.getNode()->hasOneUse()) + if (!N10->hasOneUse()) return SDValue(); ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1)); // Also allow 0xFFFF since the bits will be shifted out. This is needed @@ -6266,19 +6540,23 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, // Make sure everything beyond the low halfword gets set to zero since the SRL // 16 will clear the top bits. unsigned OpSizeInBits = VT.getSizeInBits(); - if (DemandHighBits && OpSizeInBits > 16) { + if (OpSizeInBits > 16) { // If the left-shift isn't masked out then the only way this is a bswap is // if all bits beyond the low 8 are 0.
In that case the entire pattern // reduces to a left shift anyway: leave it for other parts of the combiner. - if (!LookPassAnd0) + if (DemandHighBits && !LookPassAnd0) return SDValue(); // However, if the right shift isn't masked out then it might be because - // it's not needed. See if we can spot that too. - if (!LookPassAnd1 && - !DAG.MaskedValueIsZero( - N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) - return SDValue(); + // it's not needed. See if we can spot that too. If the high bits aren't + // demanded, we only need bits 23:16 to be zero. Otherwise, we need all + // upper bits to be zero. + if (!LookPassAnd1) { + unsigned HighBit = DemandHighBits ? OpSizeInBits : 24; + if (!DAG.MaskedValueIsZero(N10, + APInt::getBitsSet(OpSizeInBits, 16, HighBit))) + return SDValue(); + } } SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); @@ -6298,7 +6576,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { - if (!N.getNode()->hasOneUse()) + if (!N->hasOneUse()) return false; unsigned Opc = N.getOpcode(); @@ -6485,8 +6763,9 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) && !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts))) return SDValue(); - } else + } else { return SDValue(); + } // Make sure the parts are all coming from the same node. if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) @@ -6524,7 +6803,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && // Don't increase # computations. - (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { + (N0->hasOneUse() || N1->hasOneUse())) { // We can only do this xform if we know that bits from X that are set in C2 // but not in C1 are already zero. Likewise for Y. if (const ConstantSDNode *N0O1C = @@ -6552,7 +6831,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { N1.getOpcode() == ISD::AND && N0.getOperand(0) == N1.getOperand(0) && // Don't increase # computations. - (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { + (N0->hasOneUse() || N1->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); @@ -6567,14 +6846,38 @@ static SDValue visitORCommutative( EVT VT = N0.getValueType(); if (N0.getOpcode() == ISD::AND) { // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y) - if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1) + // TODO: Set AllowUndefs = true. 
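The fold implemented next, (or (and X, ~Y), Y) -> (or X, Y), is a standard absorption identity; exhaustively at 8 bits:

#include <cassert>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y)
      assert(((x & ~y) | y) == (x | y));
}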
+ if (getBitwiseNotOperand(N0.getOperand(1), N0.getOperand(0), + /* AllowUndefs */ false) == N1) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1); // fold (or (and (xor Y, -1), X), Y) -> (or X, Y) - if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1) + if (getBitwiseNotOperand(N0.getOperand(0), N0.getOperand(1), + /* AllowUndefs */ false) == N1) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1); } + if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG)) + return R; + + auto peekThroughZext = [](SDValue V) { + if (V->getOpcode() == ISD::ZERO_EXTEND) + return V->getOperand(0); + return V; + }; + + // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y + if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::SHL && + N0.getOperand(0) == N1.getOperand(0) && + peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1))) + return N0; + + // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y + if (N0.getOpcode() == ISD::FSHR && N1.getOpcode() == ISD::SRL && + N0.getOperand(1) == N1.getOperand(0) && + peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1))) + return N0; + return SDValue(); } @@ -6611,11 +6914,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) - // Do this only if the resulting shuffle is legal. - if (isa<ShuffleVectorSDNode>(N0) && - isa<ShuffleVectorSDNode>(N1) && - // Avoid folding a node with illegal type. - TLI.isTypeLegal(VT)) { + // Do this only if the resulting type / shuffle is legal. + auto *SV0 = dyn_cast<ShuffleVectorSDNode>(N0); + auto *SV1 = dyn_cast<ShuffleVectorSDNode>(N1); + if (SV0 && SV1 && TLI.isTypeLegal(VT)) { bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode()); bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()); bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); @@ -6624,11 +6926,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); - const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); - const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1); bool CanFold = true; int NumElts = VT.getVectorNumElements(); - SmallVector<int> Mask(NumElts); + SmallVector<int> Mask(NumElts, -1); for (int i = 0; i != NumElts; ++i) { int M0 = SV0->getMaskElt(i); @@ -6640,10 +6940,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // If one element is zero and the otherside is undef, keep undef. // This also handles the case that both are undef. - if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) { - Mask[i] = -1; + if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) continue; - } // Make sure only one of the elements is zero. if (M0Zero == M1Zero) { @@ -6711,7 +7009,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) { return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue()); }; - if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + if (N0.getOpcode() == ISD::AND && N0->hasOneUse() && ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) { if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, {N1, N0.getOperand(1)})) { @@ -7031,8 +7329,9 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, // Neg with outer conversions stripped away.
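MatchRotatePosNeg below recognizes (or (shl x, y), (srl x, bits-y)) as a rotate; the underlying identity, checked against an independent 64-bit formulation of rotl:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0x80000001u;
  for (unsigned y = 1; y < 32; ++y) {
    uint32_t Pattern = (x << y) | (x >> (32 - y));
    // Independent rotl: duplicate x into 64 bits, slide a 32-bit window.
    uint64_t Doubled = ((uint64_t)x << 32) | x;
    assert(Pattern == (uint32_t)(Doubled >> (32 - y)));
  }
}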
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, - SDValue InnerNeg, unsigned PosOpcode, - unsigned NegOpcode, const SDLoc &DL) { + SDValue InnerNeg, bool HasPos, + unsigned PosOpcode, unsigned NegOpcode, + const SDLoc &DL) { // fold (or (shl x, (*ext y)), // (srl x, (*ext (sub 32, y)))) -> // (rotl x, y) or (rotr x, (sub 32, y)) @@ -7043,7 +7342,6 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, EVT VT = Shifted.getValueType(); if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG, /*IsRotate*/ true)) { - bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg); } @@ -7059,8 +7357,9 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, // TODO: Merge with MatchRotatePosNeg. SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, SDValue InnerPos, - SDValue InnerNeg, unsigned PosOpcode, - unsigned NegOpcode, const SDLoc &DL) { + SDValue InnerNeg, bool HasPos, + unsigned PosOpcode, unsigned NegOpcode, + const SDLoc &DL) { EVT VT = N0.getValueType(); unsigned EltBits = VT.getScalarSizeInBits(); @@ -7072,7 +7371,6 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, // (srl x1, (*ext y))) -> // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y)) if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) { - bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1, HasPos ? Pos : Neg); } @@ -7134,6 +7432,16 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { bool HasROTR = hasOperation(ISD::ROTR, VT); bool HasFSHL = hasOperation(ISD::FSHL, VT); bool HasFSHR = hasOperation(ISD::FSHR, VT); + + // If the type is going to be promoted and the target has enabled custom + // lowering for rotate, allow matching rotate by non-constants. Only allow + // this for scalar types. + if (VT.isScalarInteger() && TLI.getTypeAction(*DAG.getContext(), VT) == + TargetLowering::TypePromoteInteger) { + HasROTL |= TLI.getOperationAction(ISD::ROTL, VT) == TargetLowering::Custom; + HasROTR |= TLI.getOperationAction(ISD::ROTR, VT) == TargetLowering::Custom; + } + if (LegalOperations && !HasROTL && !HasROTR && !HasFSHL && !HasFSHR) return SDValue(); @@ -7187,11 +7495,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { if (LHSShift.getOpcode() == RHSShift.getOpcode()) return SDValue(); // Shifts must disagree. - // TODO: Support pre-legalization funnel-shift by constant. - bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); - if (!IsRotate && !(HasFSHL || HasFSHR)) - return SDValue(); // Requires funnel shift support. - // Canonicalize shl to left side in a shl/srl pair. 
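The disguised-rotate case handled just below (where the common operand X hides inside an inner 'or') follows from distributing the shift over the OR and regrouping; with C1 + C2 equal to the bit width:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEFu, Y = 0x00F000F0u;
  const unsigned C1 = 5, C2 = 27; // C1 + C2 == 32
  // (shl (X | Y), C1) | (srl X, C2) == (rotl X, C1) | (shl Y, C1)
  uint32_t RotX = (X << C1) | (X >> C2);
  assert((((X | Y) << C1) | (X >> C2)) == (RotX | (Y << C1)));
}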
if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); @@ -7205,27 +7508,12 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { SDValue RHSShiftArg = RHSShift.getOperand(0); SDValue RHSShiftAmt = RHSShift.getOperand(1); - // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) - // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) - // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) - // iff C1+C2 == EltSizeInBits auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { - return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; - }; - if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { - SDValue Res; - if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) { - bool UseROTL = !LegalOperations || HasROTL; - Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, - UseROTL ? LHSShiftAmt : RHSShiftAmt); - } else { - bool UseFSHL = !LegalOperations || HasFSHL; - Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg, - RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt); - } + return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; + }; + auto ApplyMasks = [&](SDValue Res) { // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); @@ -7246,6 +7534,71 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { } return Res; + }; + + // TODO: Support pre-legalization funnel-shift by constant. + bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); + if (!IsRotate && !(HasFSHL || HasFSHR)) { + if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() && + ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { + // Look for a disguised rotate by constant. + // The common shifted operand X may be hidden inside another 'or'. + SDValue X, Y; + auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) { + if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR) + return false; + if (CommonOp == Or.getOperand(0)) { + X = CommonOp; + Y = Or.getOperand(1); + return true; + } + if (CommonOp == Or.getOperand(1)) { + X = CommonOp; + Y = Or.getOperand(0); + return true; + } + return false; + }; + + SDValue Res; + if (matchOr(LHSShiftArg, RHSShiftArg)) { + // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1) + SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt); + SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt); + Res = DAG.getNode(ISD::OR, DL, VT, RotX, ShlY); + } else if (matchOr(RHSShiftArg, LHSShiftArg)) { + // (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2) + SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt); + SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt); + Res = DAG.getNode(ISD::OR, DL, VT, RotX, SrlY); + } else { + return SDValue(); + } + + return ApplyMasks(Res); + } + + return SDValue(); // Requires funnel shift support. 
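The disguised-rotate matching added above distributes the shift over the inner 'or'. A small standalone check of the rewrite it performs, assuming 32-bit operands (plain C++, not part of the patch; rotl32 is an illustrative helper):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned S) {
  return S ? (V << S) | (V >> (32 - S)) : V;
}

int main() {
  // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1)
  // when C1 + C2 == 32 and the common operand X hides inside an 'or'.
  uint32_t X = 0xCAFEF00D, Y = 0x0F0F0F0F;
  for (unsigned C1 = 1; C1 < 32; ++C1) {
    unsigned C2 = 32 - C1;
    assert((((X | Y) << C1) | (X >> C2)) == (rotl32(X, C1) | (Y << C1)));
  }
  return 0;
}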
+ } + + // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) + // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) + // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) + // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) + // iff C1+C2 == EltSizeInBits + if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { + SDValue Res; + if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) { + bool UseROTL = !LegalOperations || HasROTL; + Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, + UseROTL ? LHSShiftAmt : RHSShiftAmt); + } else { + bool UseFSHL = !LegalOperations || HasFSHL; + Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg, + RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt); + } + + return ApplyMasks(Res); } // Even pre-legalization, we can't easily rotate/funnel-shift by a variable @@ -7276,26 +7629,26 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { if (IsRotate && (HasROTL || HasROTR)) { SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, - RExtOp0, ISD::ROTL, ISD::ROTR, DL); + RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL); if (TryL) return TryL; SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, - LExtOp0, ISD::ROTR, ISD::ROTL, DL); + LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL); if (TryR) return TryR; } SDValue TryL = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt, - LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL); + LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL); if (TryL) return TryL; SDValue TryR = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt, - RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL); + RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL); if (TryR) return TryR; @@ -7810,7 +8163,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // little endian value load Optional<bool> IsBigEndian = isBigEndian( makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); - if (!IsBigEndian.hasValue()) + if (!IsBigEndian) return SDValue(); assert(FirstByteProvider && "must be set"); @@ -8017,6 +8370,13 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) return RXOR; + // look for 'add-like' folds: + // XOR(N0,MIN_SIGNED_VALUE) == ADD(N0,MIN_SIGNED_VALUE) + if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) && + isMinSignedConstant(N1)) + if (SDValue Combined = visitADDLike(N)) + return Combined; + // fold !(x cc y) -> (x !cc y) unsigned N0Opcode = N0.getOpcode(); SDValue LHS, RHS, CC; @@ -8182,6 +8542,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) return V; + if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG)) + return R; + if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG)) + return R; + // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable if (SDValue MM = unfoldMaskedMerge(N)) return MM; @@ -8412,7 +8777,9 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { } unsigned NextOp = N0.getOpcode(); - // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) + + // fold (rot* (rot* x, c2), c1) + // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize)) % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); @@ -8420,14 +8787,19 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { EVT ShiftVT =
C1->getValueType(0); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; - if (SDValue CombinedShift = DAG.FoldConstantArithmetic( - CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) { - SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); - SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( - ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC}); - return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), - CombinedShiftNorm); - } + SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); + SDValue Norm1 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT, + {N1, BitsizeC}); + SDValue Norm2 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT, + {N0.getOperand(1), BitsizeC}); + if (Norm1 && Norm2) + if (SDValue CombinedShift = DAG.FoldConstantArithmetic( + CombineOp, dl, ShiftVT, {Norm1, Norm2})) { + SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( + ISD::UREM, dl, ShiftVT, {CombinedShift, BitsizeC}); + return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), + CombinedShiftNorm); + } } } return SDValue(); @@ -8587,52 +8959,63 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 - // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 - // TODO - support non-uniform vector shift amounts. - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && - N0->getFlags().hasExact()) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - uint64_t C1 = N0C1->getZExtValue(); - uint64_t C2 = N1C->getZExtValue(); - SDLoc DL(N); - if (C1 <= C2) - return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(C2 - C1, DL, ShiftVT)); - return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), - DAG.getConstant(C1 - C2, DL, ShiftVT)); + if (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) { + auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + const APInt &LHSC = LHS->getAPIntValue(); + const APInt &RHSC = RHS->getAPIntValue(); + return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && + LHSC.getZExtValue() <= RHSC.getZExtValue(); + }; + + SDLoc DL(N); + + // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 + // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2 + if (N0->getFlags().hasExact()) { + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + } + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff); + } } - } - // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or - // (and (srl x, (sub c1, c2), MASK) - // Only fold this if the inner shift has no other uses -- if it does, folding - // this will increase the total number of instructions. - // TODO - drop hasOneUse requirement if c1 == c2? - // TODO - support non-uniform vector shift amounts. 
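The rewritten rotate-of-rotate fold above now reduces each amount modulo the bit width before combining them, so oversized constants cannot misbehave in the shift-amount type. A standalone sketch of the identity being used (C++, outside the patch; rotl32 is an illustrative helper):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned S) {
  S %= 32;
  return S ? (V << S) | (V >> (32 - S)) : V;
}

int main() {
  // (rotl (rotl x, c2), c1) == (rotl x, ((c1 % 32) + (c2 % 32)) % 32),
  // including amounts that are already >= the bit width.
  uint32_t X = 0x87654321;
  for (unsigned C1 = 0; C1 < 64; ++C1)
    for (unsigned C2 = 0; C2 < 64; ++C2)
      assert(rotl32(rotl32(X, C2), C1) ==
             rotl32(X, ((C1 % 32) + (C2 % 32)) % 32));
  return 0;
}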
- if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - if (N0C1->getAPIntValue().ult(OpSizeInBits)) { - uint64_t c1 = N0C1->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); - SDValue Shift; - if (c2 > c1) { - Mask <<= c2 - c1; - SDLoc DL(N); - Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(c2 - c1, DL, ShiftVT)); - } else { - Mask.lshrInPlace(c1 - c2); - SDLoc DL(N); - Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), - DAG.getConstant(c1 - c2, DL, ShiftVT)); - } - SDLoc DL(N0); - return DAG.getNode(ISD::AND, DL, VT, Shift, - DAG.getConstant(Mask, DL, VT)); + // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or + // (and (srl x, (sub c1, c2), MASK) + // Only fold this if the inner shift has no other uses -- if it does, + // folding this will increase the total number of instructions. + if (N0.getOpcode() == ISD::SRL && + (N0.getOperand(1) == N1 || N0.hasOneUse()) && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); } } } @@ -8651,7 +9034,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. 
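The generalized shl-of-srl fold above replaces the shift pair with a single shift by the amount difference plus an all-ones mask shifted into place. The underlying scalar identity, checked standalone (C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (shl (srl x, c1), c2) is a single shift by |c2 - c1| plus a mask:
  //   c2 >= c1: ((x >> c1) << c2) == ((x << (c2 - c1)) & (~0u << c2))
  //   c1 >= c2: ((x >> c1) << c2) == ((x >> (c1 - c2)) & (~0u << c2))
  uint32_t X = 0xA5A5F00F;
  for (unsigned C1 = 0; C1 < 32; ++C1)
    for (unsigned C2 = 0; C2 < 32; ++C2) {
      uint32_t Got = C2 >= C1 ? (X << (C2 - C1)) & (~0u << C2)
                              : (X >> (C1 - C2)) & (~0u << C2);
      assert(Got == ((X >> C1) << C2));
    }
  return 0;
}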
if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && - N0.getNode()->hasOneUse() && + N0->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) && TLI.isDesirableToCommuteWithShift(N, Level)) { @@ -8663,14 +9046,14 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) - if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && - isConstantOrConstantVector(N1, /* No Opaques */ true) && - isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { - SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); - if (isConstantOrConstantVector(Shl)) + if (N0.getOpcode() == ISD::MUL && N0->hasOneUse()) { + SDValue N01 = N0.getOperand(1); + if (SDValue Shl = + DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1})) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); } + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && !N1C->isOpaque()) if (SDValue NewSHL = visitShiftByConstant(N)) return NewSHL; @@ -8956,8 +9339,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits; if (LargeShift->getAPIntValue() == TruncBits) { SDLoc DL(N); - SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL, - getShiftAmountTy(LargeVT)); + EVT LargeShiftVT = getShiftAmountTy(LargeVT); + SDValue Amt = DAG.getZExtOrTrunc(N1, DL, LargeShiftVT); + Amt = DAG.getNode(ISD::ADD, DL, LargeShiftVT, Amt, + DAG.getConstant(TruncBits, DL, LargeShiftVT)); SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt); return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA); @@ -8996,6 +9381,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return V; EVT VT = N0.getValueType(); + EVT ShiftVT = N1.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold (srl c1, c2) -> c1 >>u c2 @@ -9037,7 +9423,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); - EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum); } @@ -9081,15 +9466,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } } - // fold (srl (shl x, c), c) -> (and x, cst2) - // TODO - (srl (shl x, c1), c2). 
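The shl-of-mul rewrite above moves the whole shift into the multiplier, which is valid in the modular arithmetic the DAG uses. A quick standalone check (C++, outside the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (shl (mul x, c1), c2) == (mul x, (shl c1, c2)) modulo 2^32, so the
  // shift can be constant-folded into the multiplier.
  uint32_t X = 0x01234567;
  for (uint32_t C1 = 1; C1 < 100; C1 += 7)
    for (unsigned C2 = 0; C2 < 32; ++C2)
      assert(((X * C1) << C2) == X * (C1 << C2));
  return 0;
}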
- if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && - isConstantOrConstantVector(N1, /* NoOpaques */ true)) { - SDLoc DL(N); - SDValue Mask = - DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); - AddToWorklist(Mask.getNode()); - return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); + // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or + // (and (srl x, (sub c2, c1), MASK) + if (N0.getOpcode() == ISD::SHL && + (N0.getOperand(1) == N1 || N0->hasOneUse()) && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + const APInt &LHSC = LHS->getAPIntValue(); + const APInt &RHSC = RHS->getAPIntValue(); + return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && + LHSC.getZExtValue() <= RHSC.getZExtValue(); + }; + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) @@ -9345,6 +9756,21 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) { DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0, N1})) return C; + ConstantSDNode *N1C = isConstOrConstSplat(N1); + + if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) { + // fold (sshlsat x, c) -> (shl x, c) + if (N->getOpcode() == ISD::SSHLSAT && N1C && + N1C->getAPIntValue().ult(DAG.ComputeNumSignBits(N0))) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1); + + // fold (ushlsat x, c) -> (shl x, c) + if (N->getOpcode() == ISD::USHLSAT && N1C && + N1C->getAPIntValue().ule( + DAG.computeKnownBits(N0).countMinLeadingZeros())) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1); + } + return SDValue(); } @@ -9368,18 +9794,27 @@ static SDValue combineABSToABD(SDNode *N, SelectionDAG &DAG, (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) return SDValue(); + EVT VT = N->getValueType(0); EVT VT1 = Op0.getOperand(0).getValueType(); EVT VT2 = Op1.getOperand(0).getValueType(); - // Check if the operands are of same type and valid size. unsigned ABDOpcode = (Opc0 == ISD::SIGN_EXTEND) ? ISD::ABDS : ISD::ABDU; - if (VT1 != VT2 || !TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) - return SDValue(); - Op0 = Op0.getOperand(0); - Op1 = Op1.getOperand(0); - SDValue ABD = - DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD); + // fold abs(sext(x) - sext(y)) -> zext(abds(x, y)) + // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y)) + // NOTE: Extensions must be equivalent. 
+ if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) { + Op0 = Op0.getOperand(0); + Op1 = Op1.getOperand(0); + SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD); + } + + // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y)) + // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y)) + if (TLI.isOperationLegalOrCustom(ABDOpcode, VT)) + return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1); + + return SDValue(); } SDValue DAGCombiner::visitABS(SDNode *N) { @@ -9405,24 +9840,60 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (bswap c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::BSWAP, DL, VT, N0); // fold (bswap (bswap x)) -> x if (N0.getOpcode() == ISD::BSWAP) - return N0->getOperand(0); + return N0.getOperand(0); // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse // isn't supported, it will be expanded to bswap followed by a manual reversal // of bits in each byte. By placing bswaps before bitreverse, we can remove // the two bswaps if the bitreverse gets expanded. if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) { - SDLoc DL(N); SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap); } + // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2)))))) + // iff x >= bw/2 (i.e. lower half is known zero) + unsigned BW = VT.getScalarSizeInBits(); + if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) { + auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2); + if (ShAmt && ShAmt->getAPIntValue().ult(BW) && + ShAmt->getZExtValue() >= (BW / 2) && + (ShAmt->getZExtValue() % 16) == 0 && TLI.isTypeLegal(HalfVT) && + TLI.isTruncateFree(VT, HalfVT) && + (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) { + SDValue Res = N0.getOperand(0); + if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2))) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT))); + Res = DAG.getZExtOrTrunc(Res, DL, HalfVT); + Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res); + return DAG.getZExtOrTrunc(Res, DL, VT); + } + } + + // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as + // inverse-shift-of-bswap: + // bswap (X u<< C) --> (bswap X) u>> C + // bswap (X u>> C) --> (bswap X) u<< C + if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && + N0.hasOneUse()) { + auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (ShAmt && ShAmt->getAPIntValue().ult(BW) && + ShAmt->getZExtValue() % 8 == 0) { + SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0)); + unsigned InverseShift = N0.getOpcode() == ISD::SHL ?
ISD::SRL : ISD::SHL; + return DAG.getNode(InverseShift, DL, VT, NewSwap, N0.getOperand(1)); + } + } + return SDValue(); } @@ -9673,7 +10144,8 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { if (C1Val.isPowerOf2() && C2Val.isZero()) { if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); - SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT); + SDValue ShAmtC = + DAG.getShiftAmountConstant(C1Val.exactLogBase2(), VT, DL); return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC); } @@ -9956,7 +10428,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { // Any flags available in a select/setcc fold will be on the setcc as they // migrated from fcmp - Flags = N0.getNode()->getFlags(); + Flags = N0->getFlags(); SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2, N0.getOperand(2)); SelectNode->setFlags(Flags); @@ -10029,14 +10501,19 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { TopHalf->isZero() ? RHS->getOperand(1) : LHS->getOperand(1)); } -bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) { +bool refineUniformBase(SDValue &BasePtr, SDValue &Index, bool IndexIsScaled, + SelectionDAG &DAG) { if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD) return false; + // Only perform the transformation when existing operands can be reused. + if (IndexIsScaled) + return false; + // For now we check only the LHS of the add. SDValue LHS = Index.getOperand(0); SDValue SplatVal = DAG.getSplatValue(LHS); - if (!SplatVal) + if (!SplatVal || SplatVal.getValueType() != BasePtr.getValueType()) return false; BasePtr = SplatVal; @@ -10045,23 +10522,29 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) { } // Fold sext/zext of index into index type. -bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index, - bool Scaled, SelectionDAG &DAG) { +bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType, EVT DataVT, + SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // It's always safe to look through zero extends. if (Index.getOpcode() == ISD::ZERO_EXTEND) { SDValue Op = Index.getOperand(0); - MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED); - if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { + if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) { + IndexType = ISD::UNSIGNED_SCALED; Index = Op; return true; } + if (ISD::isIndexTypeSigned(IndexType)) { + IndexType = ISD::UNSIGNED_SCALED; + return true; + } } - if (Index.getOpcode() == ISD::SIGN_EXTEND) { + // It's only safe to look through sign extends when Index is signed. + if (Index.getOpcode() == ISD::SIGN_EXTEND && + ISD::isIndexTypeSigned(IndexType)) { SDValue Op = Index.getOperand(0); - MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED); - if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { + if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) { Index = Op; return true; } @@ -10078,24 +10561,25 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { SDValue Scale = MSC->getScale(); SDValue StoreVal = MSC->getValue(); SDValue BasePtr = MSC->getBasePtr(); + ISD::MemIndexType IndexType = MSC->getIndexType(); SDLoc DL(N); // Zap scatters with a zero mask. 
if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; - if (refineUniformBase(BasePtr, Index, DAG)) { + if (refineUniformBase(BasePtr, Index, MSC->isIndexScaled(), DAG)) { SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedScatter( - DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, Ops, - MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore()); + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), + DL, Ops, MSC->getMemOperand(), IndexType, + MSC->isTruncatingStore()); } - if (refineIndexType(MSC, Index, MSC->isIndexScaled(), DAG)) { + if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) { SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedScatter( - DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, Ops, - MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore()); + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), + DL, Ops, MSC->getMemOperand(), IndexType, + MSC->isTruncatingStore()); } return SDValue(); @@ -10150,7 +10634,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { // If this is a TRUNC followed by a masked store, fold this into a masked // truncating store. We can do this even if this is already a masked // truncstore. - if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && + if ((Value.getOpcode() == ISD::TRUNCATE) && Value->hasOneUse() && MST->isUnindexed() && TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), MST->getMemoryVT(), LegalOperations)) { @@ -10173,26 +10657,25 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { SDValue Scale = MGT->getScale(); SDValue PassThru = MGT->getPassThru(); SDValue BasePtr = MGT->getBasePtr(); + ISD::MemIndexType IndexType = MGT->getIndexType(); SDLoc DL(N); // Zap gathers with a zero mask. if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return CombineTo(N, PassThru, MGT->getChain()); - if (refineUniformBase(BasePtr, Index, DAG)) { + if (refineUniformBase(BasePtr, Index, MGT->isIndexScaled(), DAG)) { SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), - MGT->getMemoryVT(), DL, Ops, - MGT->getMemOperand(), MGT->getIndexType(), - MGT->getExtensionType()); + return DAG.getMaskedGather( + DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, + Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); } - if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) { + if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) { SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), - MGT->getMemoryVT(), DL, Ops, - MGT->getMemOperand(), MGT->getIndexType(), - MGT->getExtensionType()); + return DAG.getMaskedGather( + DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, + Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); } return SDValue(); @@ -10446,23 +10929,25 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { Other = N1; } + // zext(x) >= y ? trunc(zext(x) - y) : 0 + // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit))) + // zext(x) > y ? 
trunc(zext(x) - y) : 0 + // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit))) + if (Other && Other.getOpcode() == ISD::TRUNCATE && + Other.getOperand(0).getOpcode() == ISD::SUB && + (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT)) { + SDValue OpLHS = Other.getOperand(0).getOperand(0); + SDValue OpRHS = Other.getOperand(0).getOperand(1); + if (LHS == OpLHS && RHS == OpRHS && LHS.getOpcode() == ISD::ZERO_EXTEND) + if (SDValue R = getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS, + DAG, DL)) + return R; + } + if (Other && Other.getNumOperands() == 2) { SDValue CondRHS = RHS; SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); - if (Other.getOpcode() == ISD::SUB && - LHS.getOpcode() == ISD::ZERO_EXTEND && LHS.getOperand(0) == OpLHS && - OpRHS.getOpcode() == ISD::TRUNCATE && OpRHS.getOperand(0) == RHS) { - // Look for a general sub with unsigned saturation first. - // zext(x) >= y ? x - trunc(y) : 0 - // --> usubsat(x,trunc(umin(y,SatLimit))) - // zext(x) > y ? x - trunc(y) : 0 - // --> usubsat(x,trunc(umin(y,SatLimit))) - if (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) - return getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS, DAG, - DL); - } - if (OpLHS == LHS) { // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> usubsat x, y @@ -10493,8 +10978,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. - // FIXME: Would it be better to use computeKnownBits to determine - // whether it's safe to decanonicalize the xor? + // FIXME: Would it be better to use computeKnownBits to + // determine whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> usubsat x, C APInt SplatValue; if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR && @@ -10560,17 +11045,18 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { CC, SDLoc(N), false)) { AddToWorklist(SCC.getNode()); - if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) { - if (!SCCC->isZero()) - return N2; // cond always true -> true val - else - return N3; // cond always false -> false val - } else if (SCC->isUndef()) { - // When the condition is UNDEF, just return the first operand. This is - // coherent the DAG creation, no setcc node is created in this case + // cond always true -> true val + // cond always false -> false val + if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) + return SCCC->isZero() ? N3 : N2; + + // When the condition is UNDEF, just return the first operand.
This is + // coherent the DAG creation, no setcc node is created in this case + if (SCC->isUndef()) return N2; - } else if (SCC.getOpcode() == ISD::SETCC) { - // Fold to a simpler select_cc + + // Fold to a simpler select_cc + if (SCC.getOpcode() == ISD::SETCC) { SDValue SelectOp = DAG.getNode( ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); @@ -10853,9 +11339,8 @@ static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0, const TargetLowering &TLI) { bool HasCopyToRegUses = false; bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType()); - for (SDNode::use_iterator UI = N0.getNode()->use_begin(), - UE = N0.getNode()->use_end(); - UI != UE; ++UI) { + for (SDNode::use_iterator UI = N0->use_begin(), UE = N0->use_end(); UI != UE; + ++UI) { SDNode *User = *UI; if (User == N) continue; @@ -11187,9 +11672,12 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, bool LegalOperations, SDNode *N, SDValue N0, ISD::LoadExtType ExtLoadType, ISD::NodeType ExtOpc) { + // TODO: isFixedLengthVector() should be removed and any negative effects on + // code generation being the result of that target's implementation of + // isVectorLoadExtDesirable(). if (!ISD::isNON_EXTLoad(N0.getNode()) || !ISD::isUNINDEXEDLoad(N0.getNode()) || ((LegalOperations || VT.isFixedLengthVector() || !cast<LoadSDNode>(N0)->isSimple()) && !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType()))) return {}; @@ -11413,6 +11901,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // sext(undef) = 0 because the top bits will all be the same. + if (N0.isUndef()) + return DAG.getConstant(0, DL, VT); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) return Res; @@ -11582,10 +12074,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // Return SDValue here as the xor should have already been replaced in // this sext. return SDValue(); - } else { - // Return a new sext with the new xor. - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor); } + + // Return a new sext with the new xor.
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor); } SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); @@ -11658,6 +12150,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + // zext(undef) = 0 + if (N0.isUndef()) + return DAG.getConstant(0, SDLoc(N), VT); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) return Res; @@ -11917,6 +12413,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + // aext(undef) = undef + if (N0.isUndef()) + return DAG.getUNDEF(VT); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) return Res; @@ -11954,11 +12454,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType())) { SDLoc DL(N); - SDValue X = N0.getOperand(0).getOperand(0); - X = DAG.getAnyExtOrTrunc(X, DL, VT); - APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); - return DAG.getNode(ISD::AND, DL, VT, - X, DAG.getConstant(Mask, DL, VT)); + SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); + SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1)); + assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!"); + return DAG.getNode(ISD::AND, DL, VT, X, Y); } // fold (aext (load x)) -> (aext (truncate (extload x))) @@ -12086,13 +12585,9 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { // This eliminates the later assert: // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN + SDLoc DL(N); SDValue BigA = N0.getOperand(0); EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); - assert(BigA_AssertVT.bitsLE(N0.getValueType()) && - "Asserting zero/sign-extended bits to a type larger than the " - "truncated destination does not provide information"); - - SDLoc DL(N); EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT; SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT); SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), @@ -12108,10 +12603,6 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { Opcode == ISD::AssertZext) { SDValue BigA = N0.getOperand(0); EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); - assert(BigA_AssertVT.bitsLE(N0.getValueType()) && - "Asserting zero/sign-extended bits to a type larger than the " - "truncated destination does not provide information"); - if (AssertVT.bitsLT(BigA_AssertVT)) { SDLoc DL(N); SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), @@ -12229,13 +12720,11 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { unsigned ActiveBits = 0; if (Mask.isMask()) { ActiveBits = Mask.countTrailingOnes(); - } else if (Mask.isShiftedMask()) { - ShAmt = Mask.countTrailingZeros(); - APInt ShiftedMask = Mask.lshr(ShAmt); - ActiveBits = ShiftedMask.countTrailingOnes(); + } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) { HasShiftedOffset = true; - } else + } else { return SDValue(); + } ExtType = ISD::ZEXTLOAD; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -12852,21 +13341,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) - // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) - // When the adde's carry is not used.
- if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && - N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && - // We only do for addcarry before legalize operation - ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) || - TLI.isOperationLegal(N0.getOpcode(), VT))) { - SDLoc SL(N); - auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); - auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); - auto VTs = DAG.getVTList(VT, N0->getValueType(1)); - return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); - } - // fold (truncate (extract_subvector(ext x))) -> // (extract_subvector x) // TODO: This can be generalized to cover cases where the truncate and extract @@ -12911,6 +13385,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } break; + case ISD::ADDE: + case ISD::ADDCARRY: + // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) + // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) + // When the adde's carry is not used. + // We only do for addcarry before legalize operation + if (((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) || + TLI.isOperationLegal(N0.getOpcode(), VT)) && + N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) { + SDLoc DL(N); + SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); + SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1)); + SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1)); + return DAG.getNode(N0.getOpcode(), DL, VTs, X, Y, N0.getOperand(2)); + } + break; case ISD::USUBSAT: // Truncate the USUBSAT only if LHS is a known zero-extension, its not // enough to know that the upper bits are zero we must ensure that we don't @@ -13044,7 +13534,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { (!LegalTypes || (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() && TLI.isTypeLegal(VT.getVectorElementType()))) && - N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && + N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() && cast<BuildVectorSDNode>(N0)->isConstant()) return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), VT.getVectorElementType()); @@ -13112,8 +13602,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // This often reduces constant pool loads.
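The adde/addcarry rule relocated into the switch above keeps the same narrowing logic: when the carry-out is unused, truncation commutes with the add. The scalar fact behind it, checked standalone (C++, outside the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (trunc (addcarry X, Y, C)) == (addcarry (trunc X), (trunc Y), C) when
  // the carry-out is dead: the low 16 bits of a 32-bit add depend only on
  // the low 16 bits of its inputs.
  uint32_t X = 0x1234ABCD, Y = 0xFFFF0101;
  for (uint32_t C = 0; C <= 1; ++C)
    assert((uint16_t)(X + Y + C) ==
           (uint16_t)((uint16_t)X + (uint16_t)Y + C));
  return 0;
}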
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && - N0.getNode()->hasOneUse() && VT.isInteger() && - !VT.isVector() && !N0.getValueType().isVector()) { + N0->hasOneUse() && VT.isInteger() && !VT.isVector() && + !N0.getValueType().isVector()) { SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(NewConv.getNode()); @@ -13161,9 +13651,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // (xor (bitcast cst), (bitcast x)), 0), // signbit) // (xor (bitcast cst) (build_pair flipbit, flipbit)) - if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && - isa<ConstantFPSDNode>(N0.getOperand(0)) && - VT.isInteger() && !VT.isVector()) { + if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() && + isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() && + !VT.isVector()) { unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); if (isTypeLegal(IntXVT)) { @@ -13245,8 +13735,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); - if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || - ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) + if (Op.isUndef() || isAnyConstantBuildVector(Op)) return DAG.getBitcast(VT, Op); return SDValue(); }; @@ -13286,6 +13775,14 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false)) return N0; + // Fold freeze(bitcast(x)) -> bitcast(freeze(x)). + // TODO: Replace with pushFreezeToPreventPoisonFromPropagating fold. + if (N0.getOpcode() == ISD::BITCAST) + return DAG.getBitcast(N->getValueType(0), + DAG.getNode(ISD::FREEZE, SDLoc(N0), + N0.getOperand(0).getValueType(), + N0.getOperand(0))); + return SDValue(); } @@ -13377,7 +13874,7 @@ static bool isContractableFMUL(const TargetOptions &Options, SDValue N) { // Returns true if `N` can assume no infinities involved in its computation. static bool hasNoInfs(const TargetOptions &Options, SDValue N) { - return Options.NoInfsFPMath || N.getNode()->getFlags().hasNoInfs(); + return Options.NoInfsFPMath || N->getFlags().hasNoInfs(); } /// Try to perform FMA combining on a given FADD node. @@ -13431,7 +13928,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { - if (N0.getNode()->use_size() > N1.getNode()->use_size()) + if (N0->use_size() > N1->use_size()) std::swap(N0, N1); } @@ -13661,7 +14158,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses.
if (isContractableFMUL(N0) && isContractableFMUL(N1) && - (N0.getNode()->use_size() > N1.getNode()->use_size())) { + (N0->use_size() > N1->use_size())) { // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b)) if (SDValue V = tryToFoldXSubYZ(N0, N1)) return V; @@ -14784,7 +15281,7 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { // fold (frem c1, c2) -> fmod(c1,c2) if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1})) return C; - + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -15107,7 +15604,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { // This means this is also safe for a signed input and unsigned output, since // a negative input would lead to undefined behavior. unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; - unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; + unsigned OutputSize = (int)VT.getScalarSizeInBits(); unsigned ActualSize = std::min(InputSize, OutputSize); const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); @@ -15198,7 +15695,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) - if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { + if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); @@ -15642,7 +16139,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || - Ptr.getNode()->hasOneUse()) + Ptr->hasOneUse()) return false; // Ask the target to do addressing mode selection. @@ -15702,8 +16199,8 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // a copy of the original base pointer. SmallVector<SDNode *, 16> OtherUses; if (isa<ConstantSDNode>(Offset)) - for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(), - UE = BasePtr.getNode()->use_end(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); // Skip the use that is Ptr and uses of other results from BasePtr's @@ -15741,7 +16238,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Now check for #3 and #4.
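The FoldIntToFPToInt change above compares against the full output width because the transform is only an identity when every value survives the FP round-trip. A standalone illustration of why the significand width is the limit (C++, outside the patch; assumes IEEE-754 binary32 'float'):

#include <cassert>
#include <cstdint>

int main() {
  // sitofp followed by fptosi is the identity for i16 -> float -> i16,
  // since float's 24-bit significand represents all of i16 exactly...
  for (int32_t I = INT16_MIN; I <= INT16_MAX; ++I)
    assert((int16_t)(float)(int16_t)I == (int16_t)I);

  // ...but not for i32: this value needs 25 significand bits.
  int32_t Big = (1 << 24) + 1;
  assert((int32_t)(float)Big != Big);
  return 0;
}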
bool RealUse = false; - for (SDNode *Use : Ptr.getNode()->uses()) { + for (SDNode *Use : Ptr->uses()) { if (Use == N) continue; if (SDNode::hasPredecessorHelper(Use, Visited, Worklist)) @@ -15774,7 +16271,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { ++PreIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; - Result.getNode()->dump(&DAG); dbgs() << '\n'); + Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); @@ -15864,7 +16361,7 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, return false; SmallPtrSet<const SDNode *, 32> Visited; - for (SDNode *Use : BasePtr.getNode()->uses()) { + for (SDNode *Use : BasePtr->uses()) { if (Use == Ptr.getNode()) continue; @@ -15901,7 +16398,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, const TargetLowering &TLI) { if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, Ptr, TLI) || - Ptr.getNode()->hasOneUse()) + Ptr->hasOneUse()) return nullptr; // Try turning it into a post-indexed load / store except when @@ -15961,9 +16458,8 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); - dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; + Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); @@ -16204,7 +16700,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // Now we replace use of chain2 with chain1. This makes the second load // isomorphic to the one we are deleting, and thus makes this load live. LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); - dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); + dbgs() << "\nWith chain: "; Chain.dump(&DAG); dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); @@ -16235,7 +16731,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } else Index = DAG.getUNDEF(N->getValueType(1)); LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); - dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); + dbgs() << "\nWith: "; Undef.dump(&DAG); dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); @@ -16947,11 +17443,19 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type - // legalization (and the target doesn't explicitly think this is a bad idea). + // legalization. If the source type is legal, but the store type isn't, see + // if we can use a truncating store. MVT VT = MVT::getIntegerVT(NumBytes * 8); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!DC->isTypeLegal(VT)) + bool UseTruncStore; + if (DC->isTypeLegal(VT)) + UseTruncStore = false; + else if (TLI.isTypeLegal(IVal.getValueType()) && + TLI.isTruncStoreLegal(IVal.getValueType(), VT)) + UseTruncStore = true; + else return SDValue(); + // Check that the target doesn't think this is a bad idea.
if (St->getMemOperand() && !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, *St->getMemOperand())) return SDValue(); @@ -16979,10 +17483,15 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL); } + ++OpsNarrowed; + if (UseTruncStore) + return DAG.getTruncStore(St->getChain(), SDLoc(St), IVal, Ptr, + St->getPointerInfo().getWithOffset(StOffset), + VT, St->getOriginalAlign()); + // Truncate down to the new size. IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal); - ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, St->getPointerInfo().getWithOffset(StOffset), @@ -17003,11 +17512,15 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { SDValue Ptr = ST->getBasePtr(); EVT VT = Value.getValueType(); - if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse()) + if (ST->isTruncatingStore() || VT.isVector()) return SDValue(); unsigned Opc = Value.getOpcode(); + if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || + !Value.hasOneUse()) + return SDValue(); + // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst // is a byte mask indicating a consecutive number of bytes, check to see if // Y is known to provide just those bytes. If so, we try to replace the @@ -17032,8 +17545,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { if (!EnableReduceLoadOpStoreWidth) return SDValue(); - if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || - Value.getOperand(1).getOpcode() != ISD::Constant) + if (Value.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N0 = Value.getOperand(0); @@ -17189,14 +17701,13 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { // (A + c1) * c3 // (A + c2) * c3 // We're checking for cases where we have common "c3 * A" expressions. -bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, - SDValue &AddNode, - SDValue &ConstNode) { +bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode, + SDValue ConstNode) { APInt Val; // If the add only has one use, and the target thinks the folding is // profitable or does not lead to worse code, this would be OK to do. - if (AddNode.getNode()->hasOneUse() && + if (AddNode->hasOneUse() && TLI.isMulAddWithConstProfitable(AddNode, ConstNode)) return true; @@ -17330,7 +17841,9 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( if (isa<ConstantFPSDNode>(Val)) { // Not clear how to truncate FP values. return false; - } else if (auto *C = dyn_cast<ConstantSDNode>(Val)) + } + + if (auto *C = dyn_cast<ConstantSDNode>(Val)) Val = DAG.getConstant(C->getAPIntValue() .zextOrTrunc(Val.getValueSizeInBits()) .zextOrTrunc(ElementSizeBits), @@ -17424,7 +17937,7 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( if (!UseTrunc) { NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), - FirstInChain->getAlign(), Flags.getValue(), AAInfo); + FirstInChain->getAlign(), *Flags, AAInfo); } else { // Must be realized as a trunc store EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); @@ -17436,7 +17949,7 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, - FirstInChain->getAlign(), Flags.getValue(), AAInfo); + FirstInChain->getAlign(), *Flags, AAInfo); } // Replace all merged stores with the new store.
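ShrinkLoadReplaceStoreWithStore, extended above with a truncating-store fallback, narrows a full-width 'load; or/and; store' into a store of just the changed bytes. What that means at the memory level, sketched standalone (C++, outside the patch; assumes a little-endian host):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 'store (or (and (load p), ~0xFF00), 0x42 << 8), p' only changes byte 1,
  // so it can be narrowed to a one-byte store at p + 1 on little endian.
  uint32_t Mem = 0xAABBCCDD;
  uint32_t Wide = (Mem & ~0xFF00u) | (0x42u << 8); // full-width result

  uint8_t Byte = 0x42;
  std::memcpy(reinterpret_cast<uint8_t *>(&Mem) + 1, &Byte, 1); // narrowed
  assert(Mem == Wide);
  return 0;
}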
@@ -17604,11 +18117,9 @@ void DAGCombiner::getStoreMergeCandidates( } } -// We need to check that merging these stores does not cause a loop in -// the DAG. Any store candidate may depend on another candidate -// indirectly through its operand (we already consider dependencies -// through the chain). Check in parallel by searching up from -// non-chain operands of candidates. +// We need to check that merging these stores does not cause a loop in the +// DAG. Any store candidate may depend on another candidate indirectly through +// its operands. Check in parallel by searching up from operands of candidates. bool DAGCombiner::checkMergeStoreCandidatesForDependencies( SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores, SDNode *RootNode) { @@ -17642,8 +18153,13 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies( SDNode *N = StoreNodes[i].MemNode; // Of the 4 Store Operands: // * Chain (Op 0) -> We have already considered these - // in candidate selection and can be - // safely ignored + // in candidate selection, but only by following the + // chain dependencies. We could still have a chain + // dependency to a load, that has a non-chain dep to + // another load, that depends on a store, etc. So it is + // possible to have dependencies that consist of a mix + // of chain and non-chain deps, and we need to include + // chain operands in the analysis here.. // * Value (Op 1) -> Cycles may happen (e.g. through load chains) // * Address (Op 2) -> Merged addresses may only vary by a fixed constant, // but aren't necessarily fromt the same base node, so // * (Op 3) -> Represents the pre or post-indexing offset (or undef for // non-indexed stores). Not constant on all targets (e.g. ARM) // and so can participate in a cycle. - for (unsigned j = 1; j < N->getNumOperands(); ++j) + for (unsigned j = 0; j < N->getNumOperands(); ++j) Worklist.push_back(N->getOperand(j).getNode()); } // Search through DAG. We can stop early if we find a store node. @@ -17726,7 +18242,7 @@ bool DAGCombiner::tryStoreMergeOfConstants( while (NumConsecutiveStores >= 2) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); + Align FirstStoreAlign = FirstInChain->getAlign(); unsigned LastLegalType = 1; unsigned LastLegalVectorType = 1; bool LastIntegerTrunc = false; @@ -17814,7 +18330,7 @@ bool DAGCombiner::tryStoreMergeOfConstants( unsigned NumSkip = 1; while ((NumSkip < NumConsecutiveStores) && (NumSkip < FirstZeroAfterNonZero) && - (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) + (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); @@ -17853,7 +18369,7 @@ bool DAGCombiner::tryStoreMergeOfExtracts( while (NumConsecutiveStores >= 2) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); + Align FirstStoreAlign = FirstInChain->getAlign(); unsigned NumStoresToMerge = 1; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { // Find a legal type for the vector store. @@ -17884,7 +18400,7 @@ bool DAGCombiner::tryStoreMergeOfExtracts( // improved. Drop as many candidates as we can here.
unsigned NumSkip = 1; while ((NumSkip < NumConsecutiveStores) && - (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) + (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); @@ -18181,7 +18697,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, for (unsigned i = 0; i < NumElem; ++i) { SDValue Val = StoreNodes[i].MemNode->getOperand(1); CombineTo(StoreNodes[i].MemNode, NewStore); - if (Val.getNode()->use_empty()) + if (Val->use_empty()) recursivelyDeleteUnusedNodes(Val.getNode()); } @@ -18331,6 +18847,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { default: llvm_unreachable("Unknown FP type"); case MVT::f16: // We don't do this for these yet. + case MVT::bf16: case MVT::f80: case MVT::f128: case MVT::ppcf128: @@ -18338,7 +18855,6 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { case MVT::f32: if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { - ; Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). bitcastToAPInt().getZExtValue(), SDLoc(CFP), MVT::i32); @@ -18350,7 +18866,6 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && ST->isSimple()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { - ; Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). getZExtValue(), SDLoc(CFP), MVT::i64); return DAG.getStore(Chain, DL, Tmp, @@ -18544,7 +19059,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // truncating store. We can do this even if this is already a truncstore. if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) && - Value.getNode()->hasOneUse() && ST->isUnindexed() && + Value->hasOneUse() && ST->isUnindexed() && TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), ST->getMemoryVT(), LegalOperations)) { return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), @@ -18807,6 +19322,14 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { } } + // If we failed to find a match, see if we can replace an UNDEF shuffle + // operand. + if (ElementOffset == -1 && Y.isUndef() && + InsertVal0.getValueType() == Y.getValueType()) { + ElementOffset = Mask.size(); + Y = InsertVal0; + } + if (ElementOffset != -1) { SmallVector<int> NewMask(Mask.begin(), Mask.end()); @@ -18905,10 +19428,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) { if (VT.isScalableVector()) return DAG.getSplatVector(VT, DL, InVal); - else { - SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal); - return DAG.getBuildVector(VT, DL, Ops); - } + + SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal); + return DAG.getBuildVector(VT, DL, Ops); } return SDValue(); } @@ -18920,9 +19442,19 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // We must know which element is being inserted for folds below here. unsigned Elt = IndexC->getZExtValue(); + if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) return Shuf; + // Handle <1 x ???> vector insertion special cases. + if (VT.getVectorNumElements() == 1) { + // insert_vector_elt(x, extract_vector_elt(y, 0), 0) -> y + if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + InVal.getOperand(0).getValueType() == VT && + isNullConstant(InVal.getOperand(1))) + return InVal.getOperand(0); + } + // Canonicalize insert_vector_elt dag nodes.
  // Example:
  // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
@@ -18943,36 +19475,84 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  // If we can't generate a legal BUILD_VECTOR, exit
-  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
-    return SDValue();
+  // Attempt to fold the insertion into a legal BUILD_VECTOR.
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
+    auto UpdateBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
+      assert(Ops.size() == NumElts && "Unexpected vector size");
 
-  // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
-  // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
-  // vector elements.
-  SmallVector<SDValue, 8> Ops;
-  // Do not combine these two vectors if the output vector will not replace
-  // the input vector.
-  if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
-    Ops.append(InVec.getNode()->op_begin(),
-               InVec.getNode()->op_end());
-  } else if (InVec.isUndef()) {
-    Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
-  } else {
-    return SDValue();
-  }
-  assert(Ops.size() == NumElts && "Unexpected vector size");
+      // Insert the element
+      if (Elt < Ops.size()) {
+        // All the operands of BUILD_VECTOR must have the same type;
+        // we enforce that here.
+        EVT OpVT = Ops[0].getValueType();
+        Ops[Elt] =
+            OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
+      }
+
+      // Return the new vector
+      return DAG.getBuildVector(VT, DL, Ops);
+    };
+
+    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
+    // vector elements.
+    SmallVector<SDValue, 8> Ops;
+
+    // Do not combine these two vectors if the output vector will not replace
+    // the input vector.
+    if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
+      Ops.append(InVec->op_begin(), InVec->op_end());
+      return UpdateBuildVector(Ops);
+    }
+
+    if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.hasOneUse()) {
+      Ops.push_back(InVec.getOperand(0));
+      Ops.append(NumElts - 1, DAG.getUNDEF(InVec.getOperand(0).getValueType()));
+      return UpdateBuildVector(Ops);
+    }
+
+    if (InVec.isUndef()) {
+      Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
+      return UpdateBuildVector(Ops);
+    }
+
+    // If we're inserting into the end of a vector as part of a sequence, see
+    // if we can create a BUILD_VECTOR by following the sequence back up the
+    // chain.
+    if (Elt == (NumElts - 1)) {
+      SmallVector<SDValue, 8> ReverseInsertions;
+      ReverseInsertions.push_back(InVal);
+
+      EVT MaxEltVT = InVal.getValueType();
+      SDValue CurVec = InVec;
+      for (unsigned I = 1; I != NumElts; ++I) {
+        if (CurVec.getOpcode() != ISD::INSERT_VECTOR_ELT || !CurVec.hasOneUse())
+          break;
+
+        auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2));
+        if (!CurIdx || CurIdx->getAPIntValue() != ((NumElts - 1) - I))
+          break;
+        SDValue CurVal = CurVec.getOperand(1);
+        ReverseInsertions.push_back(CurVal);
+        if (VT.isInteger()) {
+          EVT CurValVT = CurVal.getValueType();
+          MaxEltVT = MaxEltVT.bitsGE(CurValVT) ? MaxEltVT : CurValVT;
+        }
+        CurVec = CurVec.getOperand(0);
+      }
 
-  // Insert the element
-  if (Elt < Ops.size()) {
-    // All the operands of BUILD_VECTOR must have the same type;
-    // we enforce that here.
-    EVT OpVT = Ops[0].getValueType();
-    Ops[Elt] = OpVT.isInteger() ?
DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; + if (ReverseInsertions.size() == NumElts) { + for (unsigned I = 0; I != NumElts; ++I) { + SDValue Val = ReverseInsertions[(NumElts - 1) - I]; + Val = VT.isInteger() ? DAG.getAnyExtOrTrunc(Val, DL, MaxEltVT) : Val; + Ops.push_back(Val); + } + return DAG.getBuildVector(VT, DL, Ops); + } + } } - // Return the new vector - return DAG.getBuildVector(VT, DL, Ops); + return SDValue(); } SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, @@ -19021,47 +19601,33 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); - // The replacement we need to do here is a little tricky: we need to - // replace an extractelement of a load with a load. - // Use ReplaceAllUsesOfValuesWith to do the replacement. - // Note that this replacement assumes that the extractvalue is the only - // use of the load; that's okay because we don't want to perform this - // transformation in other cases anyway. + // We are replacing a vector load with a scalar load. The new load must have + // identical memory op ordering to the original. SDValue Load; - SDValue Chain; if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, - VecEltVT) - ? ISD::ZEXTLOAD - : ISD::EXTLOAD; - Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, - OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, - Alignment, OriginalLoad->getMemOperand()->getFlags(), + ISD::LoadExtType ExtType = + TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD + : ISD::EXTLOAD; + Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(), + NewPtr, MPI, VecEltVT, Alignment, + OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); - Chain = Load.getValue(1); + DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); } else { - Load = DAG.getLoad( - VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment, - OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); - Chain = Load.getValue(1); + // The result type is narrower or the same width as the vector element + Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI, + Alignment, OriginalLoad->getMemOperand()->getFlags(), + OriginalLoad->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load); if (ResultVT.bitsLT(VecEltVT)) - Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); + Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load); else Load = DAG.getBitcast(ResultVT, Load); } - WorklistRemover DeadNodes(*this); - SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; - SDValue To[] = { Load, Chain }; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - // Make sure to revisit this node to clean it up; it will usually be dead. - AddToWorklist(EVE); - // Since we're explicitly calling ReplaceAllUses, add the new node to the - // worklist explicitly as well. 
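
The new code in visitINSERT_VECTOR_ELT above walks a one-use chain of insert_vector_elt nodes from the last lane back toward lane 0 and, if the chain covers every lane, emits a single BUILD_VECTOR. Here is that walk over plain data, with a hypothetical InsertElt record standing in for the DAG nodes; the real code additionally widens integer elements to the largest element type seen (MaxEltVT):

#include <vector>

struct InsertElt {
  unsigned Index;        // lane being written
  int Value;             // value inserted (int for simplicity)
  const InsertElt *Prev; // preceding insert in the chain, or null
};

// Rebuild the whole vector if the chain inserts lanes NumElts-1 .. 0 in order.
bool rebuildVector(const InsertElt &Last, unsigned NumElts,
                   std::vector<int> &Out) {
  Out.assign(NumElts, 0);
  const InsertElt *Cur = &Last;
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!Cur || Cur->Index != (NumElts - 1) - I)
      return false; // chain broken or out of order: give up
    Out[Cur->Index] = Cur->Value;
    Cur = Cur->Prev;
  }
  return true;
}
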
- AddToWorklistWithUsers(Load.getNode()); ++OpsNarrowed; - return SDValue(EVE, 0); + return Load; } /// Transform a vector binary operation into a scalar binary operation by moving @@ -19073,7 +19639,7 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, SDValue Index = ExtElt->getOperand(1); auto *IndexC = dyn_cast(Index); if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || - Vec.getNode()->getNumValues() != 1) + Vec->getNumValues() != 1) return SDValue(); // Targets may want to avoid this to prevent an expensive register transfer. @@ -19129,8 +19695,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // EXTRACT_VECTOR_ELT may widen the extracted vector. SDValue InOp = VecOp.getOperand(0); if (InOp.getValueType() != ScalarVT) { - assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); - return DAG.getSExtOrTrunc(InOp, DL, ScalarVT); + assert(InOp.getValueType().isInteger() && ScalarVT.isInteger() && + InOp.getValueType().bitsGT(ScalarVT)); + return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, InOp); } return InOp; } @@ -19588,7 +20155,7 @@ SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { if (!isa(ShiftAmtVal)) return SDValue(); - uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1); + uint64_t ShiftAmt = In.getConstantOperandVal(1); // The extracted value is not extracted at the right position if (ShiftAmt != i * ScalarTypeBitsize) @@ -20029,18 +20596,39 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { int Left = 2 * In; int Right = 2 * In + 1; SmallVector Mask(NumElems, -1); - for (unsigned i = 0; i != NumElems; ++i) { - if (VectorMask[i] == Left) { - Mask[i] = i; - VectorMask[i] = In; - } else if (VectorMask[i] == Right) { - Mask[i] = i + NumElems; - VectorMask[i] = In; + SDValue L = Shuffles[Left]; + ArrayRef LMask; + bool IsLeftShuffle = L.getOpcode() == ISD::VECTOR_SHUFFLE && + L.use_empty() && L.getOperand(1).isUndef() && + L.getOperand(0).getValueType() == L.getValueType(); + if (IsLeftShuffle) { + LMask = cast(L.getNode())->getMask(); + L = L.getOperand(0); + } + SDValue R = Shuffles[Right]; + ArrayRef RMask; + bool IsRightShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE && + R.use_empty() && R.getOperand(1).isUndef() && + R.getOperand(0).getValueType() == R.getValueType(); + if (IsRightShuffle) { + RMask = cast(R.getNode())->getMask(); + R = R.getOperand(0); + } + for (unsigned I = 0; I != NumElems; ++I) { + if (VectorMask[I] == Left) { + Mask[I] = I; + if (IsLeftShuffle) + Mask[I] = LMask[I]; + VectorMask[I] = In; + } else if (VectorMask[I] == Right) { + Mask[I] = I + NumElems; + if (IsRightShuffle) + Mask[I] = RMask[I] + NumElems; + VectorMask[I] = In; } } - Shuffles[In] = - DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask); + Shuffles[In] = DAG.getVectorShuffle(VT, DL, L, R, Mask); } } return Shuffles[0]; @@ -20628,7 +21216,7 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue BinOp = Extract->getOperand(0); unsigned BinOpcode = BinOp.getOpcode(); - if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1) + if (!TLI.isBinOp(BinOpcode) || BinOp->getNumValues() != 1) return SDValue(); EVT VecVT = BinOp.getValueType(); @@ -20677,7 +21265,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0)); unsigned BOpcode = BinOp.getOpcode(); - if 
(!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1) + if (!TLI.isBinOp(BOpcode) || BinOp->getNumValues() != 1) return SDValue(); // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be @@ -20736,8 +21324,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, BinOp.getOperand(0), NewExtIndex); SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(1), NewExtIndex); - SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, - BinOp.getNode()->getFlags()); + SDValue NarrowBinOp = + DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, BinOp->getFlags()); return DAG.getBitcast(VT, NarrowBinOp); } @@ -21018,6 +21606,12 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { } } + // ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V) + if (V.getOpcode() == ISD::SPLAT_VECTOR) + if (DAG.isConstantValueOfAnyType(V.getOperand(0)) || V.hasOneUse()) + if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT)) + return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0)); + // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') if (V.getOpcode() == ISD::BITCAST && @@ -21383,9 +21977,10 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); if (SVT != VT.getScalarType()) for (SDValue &Op : Ops) - Op = TLI.isZExtFree(Op.getValueType(), SVT) - ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) - : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); + Op = Op.isUndef() ? DAG.getUNDEF(SVT) + : (TLI.isZExtFree(Op.getValueType(), SVT) + ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) + : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT)); return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } @@ -21515,6 +22110,13 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { if (!Shuf->getOperand(1).isUndef()) return SDValue(); + + // If the inner operand is a known splat with no undefs, just return that directly. + // TODO: Create DemandedElts mask from Shuf's mask. + // TODO: Allow undef elements and merge with the shuffle code below. + if (DAG.isSplatValue(Shuf->getOperand(0), /*AllowUndefs*/ false)) + return Shuf->getOperand(0); + auto *Splat = dyn_cast(Shuf->getOperand(0)); if (!Splat || !Splat->isSplat()) return SDValue(); @@ -21561,6 +22163,53 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf, NewMask); } +// Combine shuffles of bitcasts into a shuffle of the bitcast type, providing +// the mask can be treated as a larger type. 
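
The widening step this new combine leans on (widenShuffleMaskElts) has to prove that each group of narrow-lane mask entries describes exactly one lane of the wider type. A stricter standalone variant of that check, using -1 for undef as shuffle masks do (the in-tree helper is more permissive about partially-undef groups):

#include <vector>

bool widenMask(int Factor, const std::vector<int> &Mask,
               std::vector<int> &Wide) {
  Wide.clear();
  if (Factor <= 0 || Mask.size() % Factor != 0)
    return false;
  for (size_t I = 0; I < Mask.size(); I += Factor) {
    int First = Mask[I];
    if (First == -1) {
      for (int J = 1; J < Factor; ++J)
        if (Mask[I + J] != -1)
          return false; // reject partially-undef groups
      Wide.push_back(-1);
      continue;
    }
    if (First % Factor != 0)
      return false; // group must start on a wide-lane boundary
    for (int J = 1; J < Factor; ++J)
      if (Mask[I + J] != First + J)
        return false; // and cover consecutive narrow lanes
    Wide.push_back(First / Factor);
  }
  return true;
}
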
+static SDValue combineShuffleOfBitcast(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + SDValue Op0 = SVN->getOperand(0); + SDValue Op1 = SVN->getOperand(1); + EVT VT = SVN->getValueType(0); + if (Op0.getOpcode() != ISD::BITCAST) + return SDValue(); + EVT InVT = Op0.getOperand(0).getValueType(); + if (!InVT.isVector() || + (!Op1.isUndef() && (Op1.getOpcode() != ISD::BITCAST || + Op1.getOperand(0).getValueType() != InVT))) + return SDValue(); + if (isAnyConstantBuildVector(Op0.getOperand(0)) && + (Op1.isUndef() || isAnyConstantBuildVector(Op1.getOperand(0)))) + return SDValue(); + + int VTLanes = VT.getVectorNumElements(); + int InLanes = InVT.getVectorNumElements(); + if (VTLanes <= InLanes || VTLanes % InLanes != 0 || + (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, InVT))) + return SDValue(); + int Factor = VTLanes / InLanes; + + // Check that each group of lanes in the mask are either undef or make a valid + // mask for the wider lane type. + ArrayRef Mask = SVN->getMask(); + SmallVector NewMask; + if (!widenShuffleMaskElts(Factor, Mask, NewMask)) + return SDValue(); + + if (!TLI.isShuffleMaskLegal(NewMask, InVT)) + return SDValue(); + + // Create the new shuffle with the new mask and bitcast it back to the + // original type. + SDLoc DL(SVN); + Op0 = Op0.getOperand(0); + Op1 = Op1.isUndef() ? DAG.getUNDEF(InVT) : Op1.getOperand(0); + SDValue NewShuf = DAG.getVectorShuffle(InVT, DL, Op0, Op1, NewMask); + return DAG.getBitcast(VT, NewShuf); +} + /// Combine shuffle of shuffle of the form: /// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf, @@ -21772,7 +22421,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { int SplatIndex = SVN->getSplatIndex(); if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) && - TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) { + TLI.isBinOp(N0.getOpcode()) && N0->getNumValues() == 1) { // splat (vector_bo L, R), Index --> // splat (scalar_bo (extelt L, Index), (extelt R, Index)) SDValue L = N0.getOperand(0), R = N0.getOperand(1); @@ -21781,13 +22430,26 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL); SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); - SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, - N0.getNode()->getFlags()); + SDValue NewBO = + DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, N0->getFlags()); SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); SmallVector ZeroMask(VT.getVectorNumElements(), 0); return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); } + // splat(scalar_to_vector(x), 0) -> build_vector(x,...,x) + // splat(insert_vector_elt(v, x, c), c) -> build_vector(x,...,x) + if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) && + N0.hasOneUse()) { + if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && SplatIndex == 0) + return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(0)); + + if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT) + if (auto *Idx = dyn_cast(N0.getOperand(2))) + if (Idx->getAPIntValue() == SplatIndex) + return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1)); + } + // If this is a bit convert that changes the element type of the 
vector but // not the number of vector elements, look through it. Be careful not to // look though conversions that change things like v4f32 to v2f64. @@ -22011,6 +22673,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } + // Match shuffles of bitcasts, so long as the mask can be treated as the + // larger type. + if (SDValue V = combineShuffleOfBitcast(SVN, DAG, TLI, LegalOperations)) + return V; + // Compute the combined shuffle mask for a shuffle with SV0 as the first // operand, and SV1 as the second operand. // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false @@ -22342,6 +23009,11 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) return N1.getOperand(0); + // Simplify scalar inserts into an undef vector: + // insert_subvector undef, (splat X), N2 -> splat X + if (N0.isUndef() && N1.getOpcode() == ISD::SPLAT_VECTOR) + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0)); + // If we are inserting a bitcast value into an undef, with the same // number of elements, just use the bitcast input of the extract. // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> @@ -22489,6 +23161,16 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) { + SDValue N0 = N->getOperand(0); + + // fold (fp_to_bf16 (bf16_to_fp op)) -> op + if (N0->getOpcode() == ISD::BF16_TO_FP) + return N0->getOperand(0); + + return SDValue(); +} + SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N0.getValueType(); @@ -22516,6 +23198,19 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0); } + // vecreduce_or(insert_subvector(zero or undef, val)) -> vecreduce_or(val) + // vecreduce_and(insert_subvector(ones or undef, val)) -> vecreduce_and(val) + if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && + TLI.isTypeLegal(N0.getOperand(1).getValueType())) { + SDValue Vec = N0.getOperand(0); + SDValue Subvec = N0.getOperand(1); + if ((Opcode == ISD::VECREDUCE_OR && + (N0.getOperand(0).isUndef() || isNullOrNullSplat(Vec))) || + (Opcode == ISD::VECREDUCE_AND && + (N0.getOperand(0).isUndef() || isAllOnesOrAllOnesSplat(Vec)))) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), Subvec); + } + return SDValue(); } @@ -22819,7 +23514,7 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, // Check to see if we got a select_cc back (to turn into setcc/select). // Otherwise, just return whatever node we got back, like fabs. if (SCC.getOpcode() == ISD::SELECT_CC) { - const SDNodeFlags Flags = N0.getNode()->getFlags(); + const SDNodeFlags Flags = N0->getFlags(); SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), @@ -23489,6 +24184,27 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return SDValue(); } +/// Given an ISD::SREM node expressing a remainder by constant power of 2, +/// return a DAG expression that will generate the same value. +SDValue DAGCombiner::BuildSREMPow2(SDNode *N) { + ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); + if (!C) + return SDValue(); + + // Avoid division by zero. 
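
The VECREDUCE_OR/VECREDUCE_AND fold a little further up works because zero lanes are the identity for OR and all-ones lanes are the identity for AND, so known padding around an inserted subvector contributes nothing to the reduction. The same identity in scalar form:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

uint32_t reduceOr(const std::vector<uint32_t> &Lanes) {
  return std::accumulate(Lanes.begin(), Lanes.end(), uint32_t{0},
                         std::bit_or<uint32_t>());
}
// reduceOr({a, b, 0, 0}) == reduceOr({a, b}): zero padding is an identity,
// so reducing the padded vector equals reducing just the subvector.
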
+ if (C->isZero()) + return SDValue(); + + SmallVector Built; + if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) { + for (SDNode *N : Built) + AddToWorklist(N); + return S; + } + + return SDValue(); +} + /// Determines the LogBase2 value for a non-null input value using the /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { @@ -23798,9 +24514,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { auto &Size0 = MUC0.NumBytes; auto &Size1 = MUC1.NumBytes; if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && - Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 && - OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 && - SrcValOffset1 % *Size1 == 0) { + Size0 && Size1 && *Size0 == *Size1 && OrigAlignment0 > *Size0 && + SrcValOffset0 % *Size0 == 0 && SrcValOffset1 % *Size1 == 0) { int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value(); int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value(); @@ -23819,8 +24534,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { UseAA = false; #endif - if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && - Size0.hasValue() && Size1.hasValue()) { + if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 && + Size1) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset; @@ -23853,7 +24568,7 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, unsigned Depth = 0; // Attempt to improve chain by a single step - std::function ImproveChain = [&](SDValue &C) -> bool { + auto ImproveChain = [&](SDValue &C) -> bool { switch (C.getOpcode()) { case ISD::EntryToken: // No need to mark EntryToken. diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index d8ef79fe9a7b..ff5779967e22 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -72,7 +72,6 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" @@ -94,7 +93,6 @@ #include "llvm/IR/Value.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -1265,7 +1263,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { // If using instruction referencing, mutate this into a DBG_INSTR_REF, // to be later patched up by finalizeDebugInstrRefs. Tack a deref onto // the expression, we don't have an "indirect" flag in DBG_INSTR_REF. - if (FuncInfo.MF->useDebugInstrRef() && Op->isReg()) { + if (UseInstrRefDebugInfo && Op->isReg()) { Builder->setDesc(TII.get(TargetOpcode::DBG_INSTR_REF)); Builder->getOperand(1).ChangeToImmediate(0); auto *NewExpr = @@ -1324,7 +1322,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { // If using instruction referencing, mutate this into a DBG_INSTR_REF, // to be later patched up by finalizeDebugInstrRefs. 
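
BuildSREMPow2 above only validates the constant and forwards to the target hook TLI.BuildSREMPow2. What such an expansion computes, for a power-of-two divisor, is the masked remainder corrected so the result keeps the dividend's sign (matching C and LLVM srem semantics). A scalar model:

#include <cstdint>

// Signed remainder by a power of two P (P > 0), without a divide.
int32_t sremPow2(int32_t X, int32_t P) {
  int32_t R = X & (P - 1); // unsigned-style remainder in [0, P)
  if (X < 0 && R != 0)
    R -= P;                // e.g. sremPow2(-7, 4) == -3, like -7 % 4
  return R;
}
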
- if (FuncInfo.MF->useDebugInstrRef()) { + if (UseInstrRefDebugInfo) { Builder->setDesc(TII.get(TargetOpcode::DBG_INSTR_REF)); Builder->getOperand(1).ChangeToImmediate(0); } @@ -1408,16 +1406,6 @@ bool FastISel::selectCast(const User *I, unsigned Opcode) { } bool FastISel::selectBitCast(const User *I) { - // If the bitcast doesn't change the type, just use the operand value. - if (I->getType() == I->getOperand(0)->getType()) { - Register Reg = getRegForValue(I->getOperand(0)); - if (!Reg) - return false; - updateValueMap(I, Reg); - return true; - } - - // Bitcasts of other values become reg-reg copies or BITCAST operators. EVT SrcEVT = TLI.getValueType(DL, I->getOperand(0)->getType()); EVT DstEVT = TLI.getValueType(DL, I->getType()); if (SrcEVT == MVT::Other || DstEVT == MVT::Other || @@ -1431,23 +1419,14 @@ bool FastISel::selectBitCast(const User *I) { if (!Op0) // Unhandled operand. Halt "fast" selection and bail. return false; - // First, try to perform the bitcast by inserting a reg-reg copy. - Register ResultReg; + // If the bitcast doesn't change the type, just use the operand value. if (SrcVT == DstVT) { - const TargetRegisterClass *SrcClass = TLI.getRegClassFor(SrcVT); - const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT); - // Don't attempt a cross-class copy. It will likely fail. - if (SrcClass == DstClass) { - ResultReg = createResultReg(DstClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(Op0); - } + updateValueMap(I, Op0); + return true; } - // If the reg-reg copy failed, select a BITCAST opcode. - if (!ResultReg) - ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0); - + // Otherwise, select a BITCAST opcode. + Register ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0); if (!ResultReg) return false; @@ -2251,6 +2230,11 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) { if (!MRI.hasOneUse(LoadReg)) return false; + // If the register has fixups, there may be additional uses through a + // different alias of the register. + if (FuncInfo.RegsWithFixups.contains(LoadReg)) + return false; + MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LoadReg); MachineInstr *User = RI->getParent(); diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 85c6eca5775e..aa9c77f9cabf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -31,13 +31,10 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" #include using namespace llvm; @@ -57,7 +54,7 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) { return false; } -static ISD::NodeType getPreferredExtendForValue(const Value *V) { +static ISD::NodeType getPreferredExtendForValue(const Instruction *I) { // For the users of the source value being used for compare instruction, if // the number of signed predicate is greater than unsigned predicate, we // prefer to use SIGN_EXTEND. @@ -67,7 +64,7 @@ static ISD::NodeType getPreferredExtendForValue(const Value *V) { // can be exposed. 
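
For context on the getPreferredExtendForValue change continued below: the heuristic is a simple vote over a value's users, preferring sign extension when more of them compare the value signed than unsigned. Sketched here with a hypothetical Cmp tag standing in for llvm::CmpInst:

#include <vector>

enum class Cmp { Signed, Unsigned, Other };

bool preferSignExtend(const std::vector<Cmp> &Users) {
  unsigned NumSigned = 0, NumUnsigned = 0;
  for (Cmp U : Users) {
    NumSigned += (U == Cmp::Signed);     // signed predicate user
    NumUnsigned += (U == Cmp::Unsigned); // unsigned predicate user
  }
  return NumSigned > NumUnsigned;
}
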
 ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
   unsigned NumOfSigned = 0, NumOfUnsigned = 0;
-  for (const User *U : V->users()) {
+  for (const User *U : I->users()) {
     if (const auto *CI = dyn_cast<CmpInst>(U)) {
       NumOfSigned += CI->isSigned();
       NumOfUnsigned += CI->isUnsigned();
@@ -448,9 +445,14 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
   IntVT = TLI->getTypeToTransformTo(PN->getContext(), IntVT);
   unsigned BitWidth = IntVT.getSizeInBits();
 
-  Register DestReg = ValueMap[PN];
-  if (!Register::isVirtualRegister(DestReg))
+  auto It = ValueMap.find(PN);
+  if (It == ValueMap.end())
     return;
+
+  Register DestReg = It->second;
+  if (DestReg == 0)
+    return;
+
+  assert(Register::isVirtualRegister(DestReg) && "Expected a virtual reg");
   LiveOutRegInfo.grow(DestReg);
   LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
@@ -462,7 +464,11 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
   }
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+    APInt Val;
+    if (TLI->signExtendConstant(CI))
+      Val = CI->getValue().sext(BitWidth);
+    else
+      Val = CI->getValue().zext(BitWidth);
     DestLOI.NumSignBits = Val.getNumSignBits();
     DestLOI.Known = KnownBits::makeConstant(Val);
   } else {
@@ -494,7 +500,11 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
   }
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+    APInt Val;
+    if (TLI->signExtendConstant(CI))
+      Val = CI->getValue().sext(BitWidth);
+    else
+      Val = CI->getValue().zext(BitWidth);
     DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
     DestLOI.Known.Zero &= ~Val;
     DestLOI.Known.One &= Val;
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 331e0325aea3..3d3b504c6abd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -14,22 +14,18 @@
 #include "InstrEmitter.h"
 #include "SDNodeDbgValue.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/PseudoProbe.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
@@ -321,8 +317,15 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
     OpRC = TII->getRegClass(*II, IIOpNum, TRI, *MF);
 
   if (OpRC) {
+    unsigned MinNumRegs = MinRCSize;
+    // Don't apply any RC size limit for IMPLICIT_DEF. Each use has a unique
+    // virtual register.
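
Stepping back to the signExtendConstant hunks above: the point of the hook is that a PHI's constant incoming value must be widened the same way the target will later materialize it, otherwise the cached known-bits and sign-bit counts are wrong. The choice itself is just this, shown for an 8-to-32-bit widening:

#include <cstdint>

// Widen an 8-bit immediate, honoring the target's preferred extension.
int32_t widenImm8(int8_t C, bool TargetSignExtends) {
  return TargetSignExtends ? int32_t(C)           // sext: -1 -> 0xffffffff
                           : int32_t(uint8_t(C)); // zext: -1 -> 0x000000ff
}
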
+ if (Op.isMachineOpcode() && + Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) + MinNumRegs = 0; + const TargetRegisterClass *ConstrainedRC - = MRI->constrainRegClass(VReg, OpRC, MinRCSize); + = MRI->constrainRegClass(VReg, OpRC, MinNumRegs); if (!ConstrainedRC) { OpRC = TRI->getAllocatableClass(OpRC); assert(OpRC && "Constraints cannot be fulfilled for allocation"); @@ -1341,11 +1344,12 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, /// InstrEmitter - Construct an InstrEmitter and set it to start inserting /// at the given position in the given block. InstrEmitter::InstrEmitter(const TargetMachine &TM, MachineBasicBlock *mbb, - MachineBasicBlock::iterator insertpos) + MachineBasicBlock::iterator insertpos, + bool UseInstrRefDebugInfo) : MF(mbb->getParent()), MRI(&MF->getRegInfo()), TII(MF->getSubtarget().getInstrInfo()), TRI(MF->getSubtarget().getRegisterInfo()), TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb), InsertPos(insertpos) { - EmitDebugInstrRefs = MF->useDebugInstrRef(); + EmitDebugInstrRefs = UseInstrRefDebugInfo; } diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h index ac8a70156522..ced8f064b9be 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -154,7 +154,8 @@ public: /// InstrEmitter - Construct an InstrEmitter and set it to start inserting /// at the given position in the given block. InstrEmitter(const TargetMachine &TM, MachineBasicBlock *mbb, - MachineBasicBlock::iterator insertpos); + MachineBasicBlock::iterator insertpos, + bool UseInstrRefDebugInfo); private: void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 54481b94fdd8..8bdc9410d131 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -45,7 +46,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include #include #include #include @@ -142,12 +142,10 @@ private: RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128, SmallVectorImpl &Results); - SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, - RTLIB::Libcall Call_I8, - RTLIB::Libcall Call_I16, - RTLIB::Libcall Call_I32, - RTLIB::Libcall Call_I64, - RTLIB::Libcall Call_I128); + SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8, + RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, + RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128, + RTLIB::Libcall Call_IEXT); void ExpandArgFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, @@ -1000,6 +998,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; case ISD::FP_TO_FP16: + case ISD::FP_TO_BF16: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::EXTRACT_VECTOR_ELT: @@ -1036,14 +1035,18 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: case ISD::SETCC: + case ISD::VP_SETCC: case ISD::BR_CC: { - unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 
4 : - Node->getOpcode() == ISD::STRICT_FSETCC ? 3 : - Node->getOpcode() == ISD::STRICT_FSETCCS ? 3 : - Node->getOpcode() == ISD::SETCC ? 2 : 1; - unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : - Node->getOpcode() == ISD::STRICT_FSETCC ? 1 : - Node->getOpcode() == ISD::STRICT_FSETCCS ? 1 : 0; + unsigned Opc = Node->getOpcode(); + unsigned CCOperand = Opc == ISD::SELECT_CC ? 4 + : Opc == ISD::STRICT_FSETCC ? 3 + : Opc == ISD::STRICT_FSETCCS ? 3 + : (Opc == ISD::SETCC || Opc == ISD::VP_SETCC) ? 2 + : 1; + unsigned CompareOperand = Opc == ISD::BR_CC ? 2 + : Opc == ISD::STRICT_FSETCC ? 1 + : Opc == ISD::STRICT_FSETCCS ? 1 + : 0; MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType(); ISD::CondCode CCCode = cast(Node->getOperand(CCOperand))->get(); @@ -1174,6 +1177,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Node->getOpcode(), cast(Node)->getValue().getValueType()); break; + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + Action = TLI.getOperationAction( + Node->getOpcode(), + cast(Node)->getValue().getValueType()); + break; case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_ADD: @@ -1187,6 +1195,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::IS_FPCLASS: Action = TLI.getOperationAction( Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -1212,7 +1221,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { - Action = TargetLowering::Legal; + Action = TLI.getCustomOperationAction(*Node); } else { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); } @@ -1723,16 +1732,14 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl, SDValue Chain) { - unsigned SrcSize = SrcOp.getValueSizeInBits(); - unsigned SlotSize = SlotVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); + EVT SrcVT = SrcOp.getValueType(); Type *DestType = DestVT.getTypeForEVT(*DAG.getContext()); Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType); // Don't convert with stack if the load/store is expensive. - if ((SrcSize > SlotSize && + if ((SrcVT.bitsGT(SlotVT) && !TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) || - (SlotSize < DestSize && + (SlotVT.bitsLT(DestVT) && !TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT))) return SDValue(); @@ -1750,20 +1757,19 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, // later than DestVT. SDValue Store; - if (SrcSize > SlotSize) + if (SrcVT.bitsGT(SlotVT)) Store = DAG.getTruncStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SlotVT, SrcAlign); else { - assert(SrcSize == SlotSize && "Invalid store"); - Store = - DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign); + assert(SrcVT.bitsEq(SlotVT) && "Invalid store"); + Store = DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign); } // Result is a load from the stack slot. 
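
EmitStackConvert above now compares EVTs instead of raw bit counts, but the mechanism is unchanged: push the value through a stack slot, truncating on the store side or extending on the load side. A little-endian scalar model of that round trip, assuming the value, slot, and result each fit in 8 bytes:

#include <cstdint>
#include <cstring>

uint64_t convertViaSlot(uint64_t Src, unsigned SrcBytes, unsigned SlotBytes,
                        unsigned DstBytes) {
  unsigned char Slot[8] = {0};
  // Truncating store when the slot is narrower than the source.
  std::memcpy(Slot, &Src, SlotBytes < SrcBytes ? SlotBytes : SrcBytes);
  uint64_t Dst = 0;
  // (Zero-)extending load when the destination is wider than the slot.
  std::memcpy(&Dst, Slot, DstBytes < SlotBytes ? DstBytes : SlotBytes);
  return Dst;
}
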
- if (SlotSize == DestSize) + if (SlotVT.bitsEq(DestVT)) return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign); - assert(SlotSize < DestSize && "Unknown extension!"); + assert(SlotVT.bitsLT(DestVT) && "Unknown extension!"); return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT, DestAlign); } @@ -2101,15 +2107,17 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, ExpandFPLibCall(Node, LC, Results); } -SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, - RTLIB::Libcall Call_I8, - RTLIB::Libcall Call_I16, - RTLIB::Libcall Call_I32, - RTLIB::Libcall Call_I64, - RTLIB::Libcall Call_I128) { +SDValue SelectionDAGLegalize::ExpandIntLibCall( + SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8, + RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64, + RTLIB::Libcall Call_I128, RTLIB::Libcall Call_IEXT) { RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); + + default: + LC = Call_IEXT; + break; + case MVT::i8: LC = Call_I8; break; case MVT::i16: LC = Call_I16; break; case MVT::i32: LC = Call_I32; break; @@ -2144,7 +2152,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); + + default: + LC = isSigned ? RTLIB::SDIVREM_IEXT : RTLIB::UDIVREM_IEXT; + break; + case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; @@ -2893,6 +2905,18 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Node->getValueType(0), dl))) Results.push_back(Tmp1); break; + case ISD::BF16_TO_FP: { + // Always expand bf16 to f32 casts, they lower to ext + shift. + SDValue Op = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Node->getOperand(0)); + Op = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op); + Op = DAG.getNode( + ISD::SHL, dl, MVT::i32, Op, + DAG.getConstant(16, dl, + TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op); + Results.push_back(Op); + break; + } case ISD::SIGN_EXTEND_INREG: { EVT ExtraVT = cast(Node->getOperand(1))->getVT(); EVT VT = Node->getValueType(0); @@ -2904,7 +2928,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // SIGN_EXTEND_INREG does not guarantee that the high bits are already zero. // TODO: Do this for vectors too? 
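
The new ISD::BF16_TO_FP expansion above (bitcast to i16, extend, shift left by 16, bitcast to f32) works because bfloat16 is exactly the upper half of an IEEE-754 single. The same sequence in plain C++:

#include <cstdint>
#include <cstring>

float bf16ToFloat(uint16_t Bits) {
  uint32_t Wide = uint32_t(Bits) << 16; // any_extend + shl 16
  float F;
  std::memcpy(&F, &Wide, sizeof(F));    // bitcast i32 -> f32
  return F;
}
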
- if (ExtraVT.getSizeInBits() == 1) { + if (ExtraVT.isScalarInteger() && ExtraVT.getSizeInBits() == 1) { SDValue One = DAG.getConstant(1, dl, VT); SDValue And = DAG.getNode(ISD::AND, dl, VT, Node->getOperand(0), One); SDValue Zero = DAG.getConstant(0, dl, VT); @@ -3135,6 +3159,15 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::FABS: Results.push_back(ExpandFABS(Node)); break; + case ISD::IS_FPCLASS: { + auto CNode = cast(Node->getOperand(1)); + auto Test = static_cast(CNode->getZExtValue()); + if (SDValue Expanded = + TLI.expandIS_FPCLASS(Node->getValueType(0), Node->getOperand(0), + Test, Node->getFlags(), SDLoc(Node), DAG)) + Results.push_back(Expanded); + break; + } case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -3577,18 +3610,26 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; case ISD::SETCC: + case ISD::VP_SETCC: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: { - bool IsStrict = Node->getOpcode() != ISD::SETCC; + bool IsVP = Node->getOpcode() == ISD::VP_SETCC; + bool IsStrict = Node->getOpcode() == ISD::STRICT_FSETCC || + Node->getOpcode() == ISD::STRICT_FSETCCS; bool IsSignaling = Node->getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); unsigned Offset = IsStrict ? 1 : 0; Tmp1 = Node->getOperand(0 + Offset); Tmp2 = Node->getOperand(1 + Offset); Tmp3 = Node->getOperand(2 + Offset); - bool Legalized = - TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), Tmp1, Tmp2, Tmp3, - NeedInvert, dl, Chain, IsSignaling); + SDValue Mask, EVL; + if (IsVP) { + Mask = Node->getOperand(3 + Offset); + EVL = Node->getOperand(4 + Offset); + } + bool Legalized = TLI.LegalizeSetCCCondCode( + DAG, Node->getValueType(0), Tmp1, Tmp2, Tmp3, Mask, EVL, NeedInvert, dl, + Chain, IsSignaling); if (Legalized) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the @@ -3598,6 +3639,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(), {Chain, Tmp1, Tmp2, Tmp3}, Node->getFlags()); Chain = Tmp1.getValue(1); + } else if (IsVP) { + Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), + {Tmp1, Tmp2, Tmp3, Mask, EVL}, Node->getFlags()); } else { Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Tmp1, Tmp2, Tmp3, Node->getFlags()); @@ -3606,8 +3650,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. - if (NeedInvert) - Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0)); + if (NeedInvert) { + if (!IsVP) + Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0)); + else + Tmp1 = + DAG.getVPLogicalNOT(dl, Tmp1, Mask, EVL, Tmp1->getValueType(0)); + } Results.push_back(Tmp1); if (IsStrict) @@ -3622,21 +3671,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // Otherwise, SETCC for the given comparison type must be completely // illegal; expand it into a SELECT_CC. + // FIXME: This drops the mask/evl for VP_SETCC. 
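
On the 1-bit SIGN_EXTEND_INREG special case above (now guarded to scalar integers): only bit 0 of the input matters, and the result is either 0 or all-ones, so the select between constants that the expansion builds reduces, in two's complement, to a negation:

#include <cstdint>

int32_t signExtendInreg1(uint32_t X) {
  return -int32_t(X & 1); // 0 -> 0, 1 -> 0xffffffff (i.e. -1)
}
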
EVT VT = Node->getValueType(0); - int TrueValue; - switch (TLI.getBooleanContents(Tmp1.getValueType())) { - case TargetLowering::ZeroOrOneBooleanContent: - case TargetLowering::UndefinedBooleanContent: - TrueValue = 1; - break; - case TargetLowering::ZeroOrNegativeOneBooleanContent: - TrueValue = -1; - break; - } + EVT Tmp1VT = Tmp1.getValueType(); Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2, - DAG.getConstant(TrueValue, dl, VT), - DAG.getConstant(0, dl, VT), - Tmp3); + DAG.getBoolConstant(true, dl, VT, Tmp1VT), + DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3); Tmp1->setFlags(Node->getFlags()); Results.push_back(Tmp1); break; @@ -3692,7 +3732,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (!Legalized) { Legalized = TLI.LegalizeSetCCCondCode( DAG, getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, - NeedInvert, dl, Chain); + /*Mask*/ SDValue(), /*EVL*/ SDValue(), NeedInvert, dl, Chain); assert(Legalized && "Can't legalize SELECT_CC with legal condition!"); @@ -3725,9 +3765,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp3 = Node->getOperand(3); // RHS Tmp4 = Node->getOperand(1); // CC - bool Legalized = - TLI.LegalizeSetCCCondCode(DAG, getSetCCResultType(Tmp2.getValueType()), - Tmp2, Tmp3, Tmp4, NeedInvert, dl, Chain); + bool Legalized = TLI.LegalizeSetCCCondCode( + DAG, getSetCCResultType(Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, + /*Mask*/ SDValue(), /*EVL*/ SDValue(), NeedInvert, dl, Chain); (void)Legalized; assert(Legalized && "Can't legalize BR_CC with legal condition!"); @@ -4068,12 +4108,25 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi."); if (!TLI.getLibcallName(LC)) { // Some targets don't have a powi libcall; use pow instead. - SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, SDLoc(Node), - Node->getValueType(0), - Node->getOperand(1)); - Results.push_back(DAG.getNode(ISD::FPOW, SDLoc(Node), - Node->getValueType(0), Node->getOperand(0), - Exponent)); + if (Node->isStrictFPOpcode()) { + SDValue Exponent = + DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(Node), + {Node->getValueType(0), Node->getValueType(1)}, + {Node->getOperand(0), Node->getOperand(2)}); + SDValue FPOW = + DAG.getNode(ISD::STRICT_FPOW, SDLoc(Node), + {Node->getValueType(0), Node->getValueType(1)}, + {Exponent.getValue(1), Node->getOperand(1), Exponent}); + Results.push_back(FPOW); + Results.push_back(FPOW.getValue(1)); + } else { + SDValue Exponent = + DAG.getNode(ISD::SINT_TO_FP, SDLoc(Node), Node->getValueType(0), + Node->getOperand(1)); + Results.push_back(DAG.getNode(ISD::FPOW, SDLoc(Node), + Node->getValueType(0), + Node->getOperand(0), Exponent)); + } break; } unsigned Offset = Node->isStrictFPOpcode() ? 
1 : 0; @@ -4176,6 +4229,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { Results.push_back(ExpandLibCall(LC, Node, false)); break; } + case ISD::FP_TO_BF16: { + RTLIB::Libcall LC = + RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::bf16); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_bf16"); + Results.push_back(ExpandLibCall(LC, Node, false)); + break; + } case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: @@ -4315,28 +4375,24 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::SUB_PPCF128, Results); break; case ISD::SREM: - Results.push_back(ExpandIntLibCall(Node, true, - RTLIB::SREM_I8, - RTLIB::SREM_I16, RTLIB::SREM_I32, - RTLIB::SREM_I64, RTLIB::SREM_I128)); + Results.push_back(ExpandIntLibCall( + Node, true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32, + RTLIB::SREM_I64, RTLIB::SREM_I128, RTLIB::SREM_IEXT)); break; case ISD::UREM: - Results.push_back(ExpandIntLibCall(Node, false, - RTLIB::UREM_I8, - RTLIB::UREM_I16, RTLIB::UREM_I32, - RTLIB::UREM_I64, RTLIB::UREM_I128)); + Results.push_back(ExpandIntLibCall( + Node, false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32, + RTLIB::UREM_I64, RTLIB::UREM_I128, RTLIB::UREM_IEXT)); break; case ISD::SDIV: - Results.push_back(ExpandIntLibCall(Node, true, - RTLIB::SDIV_I8, - RTLIB::SDIV_I16, RTLIB::SDIV_I32, - RTLIB::SDIV_I64, RTLIB::SDIV_I128)); + Results.push_back(ExpandIntLibCall( + Node, true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32, + RTLIB::SDIV_I64, RTLIB::SDIV_I128, RTLIB::SDIV_IEXT)); break; case ISD::UDIV: - Results.push_back(ExpandIntLibCall(Node, false, - RTLIB::UDIV_I8, - RTLIB::UDIV_I16, RTLIB::UDIV_I32, - RTLIB::UDIV_I64, RTLIB::UDIV_I128)); + Results.push_back(ExpandIntLibCall( + Node, false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32, + RTLIB::UDIV_I64, RTLIB::UDIV_I128, RTLIB::UDIV_IEXT)); break; case ISD::SDIVREM: case ISD::UDIVREM: @@ -4344,10 +4400,9 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandDivRemLibCall(Node, Results); break; case ISD::MUL: - Results.push_back(ExpandIntLibCall(Node, false, - RTLIB::MUL_I8, - RTLIB::MUL_I16, RTLIB::MUL_I32, - RTLIB::MUL_I64, RTLIB::MUL_I128)); + Results.push_back(ExpandIntLibCall( + Node, false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32, + RTLIB::MUL_I64, RTLIB::MUL_I128, RTLIB::MUL_IEXT)); break; case ISD::CTLZ_ZERO_UNDEF: switch (Node->getSimpleValueType(0).SimpleTy) { @@ -4700,6 +4755,12 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0, dl))); break; + case ISD::STRICT_FADD: + case ISD::STRICT_FSUB: + case ISD::STRICT_FMUL: + case ISD::STRICT_FDIV: + case ISD::STRICT_FMINNUM: + case ISD::STRICT_FMAXNUM: case ISD::STRICT_FREM: case ISD::STRICT_FPOW: Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, @@ -4724,6 +4785,22 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3), DAG.getIntPtrConstant(0, dl))); break; + case ISD::STRICT_FMA: + Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, + {Node->getOperand(0), Node->getOperand(1)}); + Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, + {Node->getOperand(0), Node->getOperand(2)}); + Tmp3 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, + {Node->getOperand(0), Node->getOperand(3)}); + Tmp4 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Tmp1.getValue(1), + Tmp2.getValue(1), 
Tmp3.getValue(1)); + Tmp4 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other}, + {Tmp4, Tmp1, Tmp2, Tmp3}); + Tmp4 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other}, + {Tmp4.getValue(1), Tmp4, DAG.getIntPtrConstant(0, dl)}); + Results.push_back(Tmp4); + Results.push_back(Tmp4.getValue(1)); + break; case ISD::FCOPYSIGN: case ISD::FPOWI: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); @@ -4740,6 +4817,16 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp3, DAG.getIntPtrConstant(isTrunc, dl))); break; } + case ISD::STRICT_FPOWI: + Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, + {Node->getOperand(0), Node->getOperand(1)}); + Tmp2 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other}, + {Tmp1.getValue(1), Tmp1, Node->getOperand(2)}); + Tmp3 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other}, + {Tmp2.getValue(1), Tmp2, DAG.getIntPtrConstant(0, dl)}); + Results.push_back(Tmp3); + Results.push_back(Tmp3.getValue(1)); + break; case ISD::FFLOOR: case ISD::FCEIL: case ISD::FRINT: @@ -4764,12 +4851,19 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { break; case ISD::STRICT_FFLOOR: case ISD::STRICT_FCEIL: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: case ISD::STRICT_FROUND: + case ISD::STRICT_FROUNDEVEN: + case ISD::STRICT_FTRUNC: + case ISD::STRICT_FSQRT: case ISD::STRICT_FSIN: case ISD::STRICT_FCOS: case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG2: case ISD::STRICT_FLOG10: case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, {Node->getOperand(0), Node->getOperand(1)}); Tmp2 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other}, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 6bf38d7296a8..f464208cd9dc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -273,6 +273,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { } SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { + if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG)) + return SoftenFloatRes_SELECT_CC(SelCC.getNode()); return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::FMIN_F32, RTLIB::FMIN_F64, @@ -282,6 +284,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { } SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { + if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG)) + return SoftenFloatRes_SELECT_CC(SelCC.getNode()); return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::FMAX_F32, RTLIB::FMAX_F64, @@ -830,6 +834,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; case ISD::STRICT_FP_TO_FP16: case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes + case ISD::FP_TO_BF16: case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; case ISD::STRICT_FP_TO_SINT: @@ -881,16 +886,19 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { // returns an i16 so doesn't meet the constraints necessary for FP_ROUND. assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_TO_FP16 || + N->getOpcode() == ISD::FP_TO_BF16 || N->getOpcode() == ISD::STRICT_FP_ROUND); bool IsStrict = N->isStrictFPOpcode(); SDValue Op = N->getOperand(IsStrict ? 
1 : 0); EVT SVT = Op.getValueType(); EVT RVT = N->getValueType(0); - EVT FloatRVT = (N->getOpcode() == ISD::FP_TO_FP16 || - N->getOpcode() == ISD::STRICT_FP_TO_FP16) - ? MVT::f16 - : RVT; + EVT FloatRVT = RVT; + if (N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16) + FloatRVT = MVT::f16; + else if (N->getOpcode() == ISD::FP_TO_BF16) + FloatRVT = MVT::bf16; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); @@ -2064,9 +2072,13 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_LLRINT(SDNode *N) { static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f16) { - return ISD::FP16_TO_FP; + return ISD::FP16_TO_FP; } else if (RetVT == MVT::f16) { - return ISD::FP_TO_FP16; + return ISD::FP_TO_FP16; + } else if (OpVT == MVT::bf16) { + return ISD::BF16_TO_FP; + } else if (RetVT == MVT::bf16) { + return ISD::FP_TO_BF16; } report_fatal_error("Attempt at an invalid promotion-related conversion"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8c7b90b6cd33..69fd83bcd7b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -78,6 +78,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: case ISD::VSELECT: case ISD::VP_SELECT: + case ISD::VP_MERGE: Res = PromoteIntRes_Select(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; @@ -97,6 +98,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break; case ISD::SRL: case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break; + case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break; case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; @@ -115,11 +117,12 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::INSERT_VECTOR_ELT: Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break; case ISD::BUILD_VECTOR: - Res = PromoteIntRes_BUILD_VECTOR(N); break; - case ISD::SCALAR_TO_VECTOR: - Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break; + Res = PromoteIntRes_BUILD_VECTOR(N); + break; case ISD::SPLAT_VECTOR: - Res = PromoteIntRes_SPLAT_VECTOR(N); break; + case ISD::SCALAR_TO_VECTOR: + Res = PromoteIntRes_ScalarOp(N); + break; case ISD::STEP_VECTOR: Res = PromoteIntRes_STEP_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntRes_CONCAT_VECTORS(N); break; @@ -133,6 +136,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; + case ISD::VP_FPTOSI: + case ISD::VP_FPTOUI: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: @@ -262,6 +267,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FSHR: Res = PromoteIntRes_FunnelShift(N); break; + + case ISD::IS_FPCLASS: + Res = PromoteIntRes_IS_FPCLASS(N); + break; } // If the result is null then the sub-method took care of registering it. @@ -435,10 +444,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { // interesting bits will end up at the wrong place. 
if (DAG.getDataLayout().isBigEndian()) { unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits(); - EVT ShiftAmtTy = TLI.getShiftAmountTy(NOutVT, DAG.getDataLayout()); assert(ShiftAmt < NOutVT.getSizeInBits() && "Too large shift amount!"); Res = DAG.getNode(ISD::SRL, dl, NOutVT, Res, - DAG.getConstant(ShiftAmt, dl, ShiftAmtTy)); + DAG.getShiftAmountConstant(ShiftAmt, NOutVT, dl)); } return Res; } @@ -446,13 +454,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { // as the widened input type would be a legal type, we can widen the bitcast // and handle the promotion after. if (NOutVT.isVector()) { - unsigned WidenInSize = NInVT.getSizeInBits(); - unsigned OutSize = OutVT.getSizeInBits(); - if (WidenInSize % OutSize == 0) { - unsigned Scale = WidenInSize / OutSize; - EVT WideOutVT = EVT::getVectorVT(*DAG.getContext(), - OutVT.getVectorElementType(), - OutVT.getVectorNumElements() * Scale); + TypeSize WidenInSize = NInVT.getSizeInBits(); + TypeSize OutSize = OutVT.getSizeInBits(); + if (WidenInSize.hasKnownScalarFactor(OutSize)) { + unsigned Scale = WidenInSize.getKnownScalarFactor(OutSize); + EVT WideOutVT = + EVT::getVectorVT(*DAG.getContext(), OutVT.getVectorElementType(), + OutVT.getVectorElementCount() * Scale); if (isTypeLegal(WideOutVT)) { InOp = DAG.getBitcast(WideOutVT, GetWidenedVector(InOp)); InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, InOp, @@ -490,9 +498,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { } unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); - EVT ShiftVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), - DAG.getConstant(DiffBits, dl, ShiftVT)); + DAG.getShiftAmountConstant(DiffBits, NVT, dl)); } SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { @@ -512,10 +519,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { } unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); - EVT ShiftVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), - DAG.getConstant(DiffBits, dl, ShiftVT)); + DAG.getShiftAmountConstant(DiffBits, NVT, dl)); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { @@ -666,6 +672,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT)) NewOpc = ISD::STRICT_FP_TO_SINT; + if (N->getOpcode() == ISD::VP_FPTOUI && + !TLI.isOperationLegal(ISD::VP_FPTOUI, NVT) && + TLI.isOperationLegalOrCustom(ISD::VP_FPTOSI, NVT)) + NewOpc = ISD::VP_FPTOSI; + SDValue Res; if (N->isStrictFPOpcode()) { Res = DAG.getNode(NewOpc, dl, {NVT, MVT::Other}, @@ -673,8 +684,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); - } else + } else if (NewOpc == ISD::VP_FPTOSI || NewOpc == ISD::VP_FPTOUI) { + Res = DAG.getNode(NewOpc, dl, NVT, {N->getOperand(0), N->getOperand(1), + N->getOperand(2)}); + } else { Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); + } // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the @@ -684,8 +699,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { // before legalization: fp-to-uint16, 65534. 
-> 0xfffe // after legalization: fp-to-sint32, 65534. -> 0x0000fffe return DAG.getNode((N->getOpcode() == ISD::FP_TO_UINT || - N->getOpcode() == ISD::STRICT_FP_TO_UINT) ? - ISD::AssertZext : ISD::AssertSext, dl, NVT, Res, + N->getOpcode() == ISD::STRICT_FP_TO_UINT || + N->getOpcode() == ISD::VP_FPTOUI) + ? ISD::AssertZext + : ISD::AssertSext, + dl, NVT, Res, DAG.getValueType(N->getValueType(0).getScalarType())); } @@ -889,8 +907,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { } unsigned SHLAmount = NewBits - OldBits; - EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); - SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT); + SDValue ShiftAmount = + DAG.getShiftAmountConstant(SHLAmount, PromotedType, dl); Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); if (!IsShift) @@ -939,14 +957,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { // which is extends the values that we clamp to on saturation. This could be // resolved by shifting one of the operands the same amount, which would // also shift the result we compare against, then shifting back. - EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); - Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, - DAG.getConstant(DiffSize, dl, ShiftTy)); + Op1Promoted = + DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, + DAG.getShiftAmountConstant(DiffSize, PromotedType, dl)); SDValue Result = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, N->getOperand(2)); unsigned ShiftOp = Signed ? ISD::SRA : ISD::SRL; return DAG.getNode(ShiftOp, dl, PromotedType, Result, - DAG.getConstant(DiffSize, dl, ShiftTy)); + DAG.getShiftAmountConstant(DiffSize, PromotedType, dl)); } return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, N->getOperand(2)); @@ -1043,17 +1061,17 @@ SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) { TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale); if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) { - EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); unsigned Diff = PromotedType.getScalarSizeInBits() - N->getValueType(0).getScalarSizeInBits(); if (Saturating) - Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, - DAG.getConstant(Diff, dl, ShiftTy)); + Op1Promoted = + DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, + DAG.getShiftAmountConstant(Diff, PromotedType, dl)); SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, N->getOperand(2)); if (Saturating) Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res, - DAG.getConstant(Diff, dl, ShiftTy)); + DAG.getShiftAmountConstant(Diff, PromotedType, dl)); return Res; } } @@ -1110,11 +1128,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Select(SDNode *N) { SDValue RHS = GetPromotedInteger(N->getOperand(2)); unsigned Opcode = N->getOpcode(); - return Opcode == ISD::VP_SELECT - ? 
DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS, - N->getOperand(3)) - : DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, - RHS); + if (Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE) + return DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS, + N->getOperand(3)); + return DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) { @@ -1167,6 +1184,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { return DAG.getSExtOrTrunc(SetCC, dl, NVT); } +SDValue DAGTypeLegalizer::PromoteIntRes_IS_FPCLASS(SDNode *N) { + SDLoc DL(N); + SDValue Arg = N->getOperand(0); + SDValue Test = N->getOperand(1); + EVT NResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + return DAG.getNode(ISD::IS_FPCLASS, DL, NResVT, Arg, Test); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); @@ -1265,7 +1290,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { SDValue Hi = GetPromotedInteger(N->getOperand(0)); SDValue Lo = GetPromotedInteger(N->getOperand(1)); - SDValue Amt = GetPromotedInteger(N->getOperand(2)); + SDValue Amt = N->getOperand(2); + if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger) + Amt = ZExtPromotedInteger(Amt); + EVT AmtVT = Amt.getValueType(); SDLoc DL(N); EVT OldVT = N->getOperand(0).getValueType(); @@ -1276,7 +1304,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { unsigned NewBits = VT.getScalarSizeInBits(); // Amount has to be interpreted modulo the old bit width. - Amt = DAG.getNode(ISD::UREM, DL, VT, Amt, DAG.getConstant(OldBits, DL, VT)); + Amt = DAG.getNode(ISD::UREM, DL, AmtVT, Amt, + DAG.getConstant(OldBits, DL, AmtVT)); // If the promoted type is twice the size (or more), then we use the // traditional funnel 'double' shift codegen. This isn't necessary if the @@ -1296,13 +1325,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { } // Shift Lo up to occupy the upper bits of the promoted type. - SDValue ShiftOffset = DAG.getConstant(NewBits - OldBits, DL, VT); + SDValue ShiftOffset = DAG.getConstant(NewBits - OldBits, DL, AmtVT); Lo = DAG.getNode(ISD::SHL, DL, VT, Lo, ShiftOffset); // Increase Amount to shift the result into the lower bits of the promoted // type. 
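// The funnel-shift promotion above interprets the amount modulo the *old*
// bit width, moves Lo into the upper half of the promoted type, and (for
// FSHR) biases the amount by the width difference. A plain-C++ check of
// that transformation, assuming a 16-bit fshr emulated in 32 bits:
#include <cassert>
#include <cstdint>

// Reference 16-bit funnel shift right: low 16 bits of (Hi:Lo) >> (S mod 16).
static uint16_t fshr16(uint16_t Hi, uint16_t Lo, unsigned S) {
  uint32_t Cat = (uint32_t(Hi) << 16) | Lo;
  return uint16_t(Cat >> (S % 16));
}

// Emulation in the promoted (32-bit) type, mirroring the legalizer's steps.
static uint16_t fshr16_promoted(uint16_t Hi, uint16_t Lo, unsigned S) {
  unsigned Amt = S % 16;             // amount is modulo the old width
  uint32_t LoP = uint32_t(Lo) << 16; // shift Lo into the upper bits
  Amt += 16;                         // FSHR: bias by NewBits - OldBits
  uint64_t Cat = (uint64_t(Hi) << 32) | LoP; // 32-bit funnel shift right
  return uint16_t(uint32_t(Cat >> (Amt % 32)));
}

int main() {
  for (unsigned S = 0; S < 40; ++S)
    assert(fshr16_promoted(0xBEEF, 0x1234, S) == fshr16(0xBEEF, 0x1234, S));
}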
   if (IsFSHR)
-    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, ShiftOffset);
+    Amt = DAG.getNode(ISD::ADD, DL, AmtVT, Amt, ShiftOffset);
 
   return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt);
 }
@@ -1336,11 +1365,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
       EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(),
                                      NumElts.divideCoefficientBy(2));
-      EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
-      EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
-
+      if (N->getOpcode() == ISD::TRUNCATE) {
+        EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
+        EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
+      } else {
+        assert(N->getOpcode() == ISD::VP_TRUNCATE &&
+               "Expected VP_TRUNCATE opcode");
+        SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+        std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+        std::tie(EVLLo, EVLHi) =
+            DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl);
+        EOp1 = DAG.getNode(ISD::VP_TRUNCATE, dl, HalfNVT, EOp1, MaskLo, EVLLo);
+        EOp2 = DAG.getNode(ISD::VP_TRUNCATE, dl, HalfNVT, EOp2, MaskHi, EVLHi);
+      }
       return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
     }
+    // TODO: VP_TRUNCATE still needs to handle the TypeWidenVector case that
+    // arises on some targets.
     case TargetLowering::TypeWidenVector: {
       SDValue WideInOp = GetWidenedVector(InOp);
@@ -1362,6 +1403,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
   }
 
   // Truncate to NVT instead of VT
+  if (N->getOpcode() == ISD::VP_TRUNCATE)
+    return DAG.getNode(ISD::VP_TRUNCATE, dl, NVT, Res, N->getOperand(1),
+                       N->getOperand(2));
   return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
 }
 
@@ -1432,6 +1476,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO_CARRY(SDNode *N,
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_ABS(SDNode *N) {
+  EVT OVT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+
+  // If a larger ABS or SMAX isn't supported by the target, try to expand now.
+  // If we expand later, we'll end up sign-extending more than just the sra
+  // input in the sra+xor+sub expansion.
+  if (!OVT.isVector() &&
+      !TLI.isOperationLegalOrCustomOrPromote(ISD::ABS, NVT) &&
+      !TLI.isOperationLegal(ISD::SMAX, NVT)) {
+    if (SDValue Res = TLI.expandABS(N, DAG))
+      return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Res);
+  }
+
   SDValue Op0 = SExtPromotedInteger(N->getOperand(0));
   return DAG.getNode(ISD::ABS, SDLoc(N), Op0.getValueType(), Op0);
 }
@@ -1466,9 +1523,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
   if (N->getOpcode() == ISD::UMULO) {
     // Unsigned overflow occurred if the high part is non-zero.
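// Elaborating the comment above: after widening an unsigned multiply,
// overflow in the original width occurred exactly when the high half of the
// wide product is non-zero. Plain-C++ illustration for a 16-bit UMULO
// performed in 32 bits:
#include <cassert>
#include <cstdint>

static bool umulo16(uint16_t A, uint16_t B, uint16_t &Res) {
  uint32_t Mul = uint32_t(A) * uint32_t(B); // promoted multiply
  Res = uint16_t(Mul);
  return (Mul >> 16) != 0;                  // high part non-zero => overflow
}

int main() {
  uint16_t R;
  assert(!umulo16(255, 255, R) && R == 65025);
  assert(umulo16(256, 256, R) && R == 0); // 65536 wraps to 0
}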
unsigned Shift = SmallVT.getScalarSizeInBits(); - EVT ShiftTy = TLI.getShiftAmountTy(Mul.getValueType(), DAG.getDataLayout()); - SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul, - DAG.getConstant(Shift, DL, ShiftTy)); + SDValue Hi = + DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul, + DAG.getShiftAmountConstant(Shift, Mul.getValueType(), DL)); Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi, DAG.getConstant(0, DL, Hi.getValueType()), ISD::SETNE); @@ -1498,7 +1555,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VSCALE(SDNode *N) { EVT VT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); APInt MulImm = cast(N->getOperand(0))->getAPIntValue(); - return DAG.getVScale(SDLoc(N), VT, MulImm.sextOrSelf(VT.getSizeInBits())); + return DAG.getVScale(SDLoc(N), VT, MulImm.sext(VT.getSizeInBits())); } SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { @@ -1578,16 +1635,19 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; case ISD::INSERT_VECTOR_ELT: - Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; - case ISD::SCALAR_TO_VECTOR: - Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break; + Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo); + break; case ISD::SPLAT_VECTOR: - Res = PromoteIntOp_SPLAT_VECTOR(N); break; + case ISD::SCALAR_TO_VECTOR: + Res = PromoteIntOp_ScalarOp(N); + break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; + case ISD::VP_SETCC: case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break; + case ISD::VP_SITOFP: case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), @@ -1600,8 +1660,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; + case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: + case ISD::VP_UITOFP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; @@ -1614,6 +1676,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; + case ISD::FSHL: + case ISD::FSHR: Res = PromoteIntOp_FunnelShift(N); break; + case ISD::SADDO_CARRY: case ISD::SSUBO_CARRY: case ISD::ADDCARRY: @@ -1848,20 +1913,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, N->getOperand(1), Idx), 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { - // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote - // the operand in place. +SDValue DAGTypeLegalizer::PromoteIntOp_ScalarOp(SDNode *N) { + // Integer SPLAT_VECTOR/SCALAR_TO_VECTOR operands are implicitly truncated, + // so just promote the operand in place. 
return SDValue(DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_SPLAT_VECTOR(SDNode *N) { - // Integer SPLAT_VECTOR operands are implicitly truncated, so just promote the - // operand in place. - return SDValue( - DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); -} - SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Only know how to promote the condition!"); SDValue Cond = N->getOperand(0); @@ -1900,7 +1958,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(2))->get()); // The CC (#2) is always legal. - return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); + if (N->getOpcode() == ISD::SETCC) + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); + + assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode"); + + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2), + N->getOperand(3), N->getOperand(4)), + 0); } SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { @@ -1908,6 +1973,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { ZExtPromotedInteger(N->getOperand(1))), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_FunnelShift(SDNode *N) { + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), + ZExtPromotedInteger(N->getOperand(2))), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); SDLoc dl(N); @@ -1917,6 +1987,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { + if (N->getOpcode() == ISD::VP_SITOFP) + return SDValue(DAG.UpdateNodeOperands(N, + SExtPromotedInteger(N->getOperand(0)), + N->getOperand(1), N->getOperand(2)), + 0); return SDValue(DAG.UpdateNodeOperands(N, SExtPromotedInteger(N->getOperand(0))), 0); } @@ -1980,8 +2055,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo) { - SmallVector NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { // The Mask EVT DataVT = N->getValueType(0); @@ -2010,6 +2085,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo) { bool TruncateStore = N->isTruncatingStore(); SmallVector NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { // The Mask EVT DataVT = N->getValue().getValueType(); @@ -2021,9 +2097,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); else NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); - - N->setIndexType(TLI.getCanonicalIndexType(N->getIndexType(), - N->getMemoryVT(), NewOps[OpNo])); } else { NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); TruncateStore = true; @@ -2036,10 +2109,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); + if (N->getOpcode() == ISD::VP_TRUNCATE) + return DAG.getNode(ISD::VP_TRUNCATE, SDLoc(N), N->getValueType(0), Op, + N->getOperand(1), N->getOperand(2)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { + if (N->getOpcode() == ISD::VP_UITOFP) + return 
SDValue(DAG.UpdateNodeOperands(N, + ZExtPromotedInteger(N->getOperand(0)), + N->getOperand(1), N->getOperand(2)), + 0); return SDValue(DAG.UpdateNodeOperands(N, ZExtPromotedInteger(N->getOperand(0))), 0); } @@ -2468,7 +2549,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, EVT ShTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::SHL) { - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getConstant(0, DL, NVT); @@ -2489,7 +2570,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, } if (N->getOpcode() == ISD::SRL) { - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRL, DL, @@ -2510,7 +2591,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, } assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt.ugt(NVTBits)) { @@ -3132,24 +3213,23 @@ void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) { GetExpandedInteger(N0, Lo, Hi); EVT NVT = Lo.getValueType(); - // If we have ADDCARRY, use the expanded form of the sra+add+xor sequence we - // use in LegalizeDAG. The ADD part of the expansion is based on - // ExpandIntRes_ADDSUB which also uses ADDCARRY/UADDO after checking that - // ADDCARRY is LegalOrCustom. Each of the pieces here can be further expanded + // If we have SUBCARRY, use the expanded form of the sra+xor+sub sequence we + // use in LegalizeDAG. The SUB part of the expansion is based on + // ExpandIntRes_ADDSUB which also uses SUBCARRY/USUBO after checking that + // SUBCARRY is LegalOrCustom. Each of the pieces here can be further expanded // if needed. Shift expansion has a special case for filling with sign bits // so that we will only end up with one SRA. 
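// A concrete rendering of the expansion described above: abs of a
// double-word integer using a single SRA to materialize the sign mask, XOR
// of both halves, then a borrow-propagating subtract (the USUBO/SUBCARRY
// pair). Sketch for an i128 split into two u64 halves:
#include <cassert>
#include <cstdint>

static void abs128(uint64_t &Lo, uint64_t &Hi) {
  uint64_t Sign = uint64_t(int64_t(Hi) >> 63); // SRA: all-ones if negative
  Lo ^= Sign;                                  // conditional bitwise NOT
  Hi ^= Sign;
  uint64_t Borrow = Lo < Sign ? 1 : 0;         // borrow out of the low USUBO
  Lo -= Sign;                                  // USUBO: subtract 0 or -1
  Hi = Hi - Sign - Borrow;                     // SUBCARRY on the high half
}

int main() {
  uint64_t Lo = ~uint64_t(4) + 1, Hi = ~uint64_t(0); // -5 as an i128
  abs128(Lo, Hi);
  assert(Lo == 5 && Hi == 0);
}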
- bool HasAddCarry = TLI.isOperationLegalOrCustom( - ISD::ADDCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); - if (HasAddCarry) { - EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); - SDValue Sign = - DAG.getNode(ISD::SRA, dl, NVT, Hi, - DAG.getConstant(NVT.getSizeInBits() - 1, dl, ShiftAmtTy)); + bool HasSubCarry = TLI.isOperationLegalOrCustom( + ISD::SUBCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + if (HasSubCarry) { + SDValue Sign = DAG.getNode( + ISD::SRA, dl, NVT, Hi, + DAG.getShiftAmountConstant(NVT.getSizeInBits() - 1, NVT, dl)); SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT)); - Lo = DAG.getNode(ISD::UADDO, dl, VTList, Lo, Sign); - Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Hi, Sign, Lo.getValue(1)); Lo = DAG.getNode(ISD::XOR, dl, NVT, Lo, Sign); Hi = DAG.getNode(ISD::XOR, dl, NVT, Hi, Sign); + Lo = DAG.getNode(ISD::USUBO, dl, VTList, Lo, Sign); + Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, Hi, Sign, Lo.getValue(1)); return; } @@ -3160,8 +3240,8 @@ void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue NegLo, NegHi; SplitInteger(Neg, NegLo, NegHi); - SDValue HiIsNeg = DAG.getSetCC(dl, getSetCCResultType(NVT), - DAG.getConstant(0, dl, NVT), Hi, ISD::SETGT); + SDValue HiIsNeg = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, + DAG.getConstant(0, dl, NVT), ISD::SETLT); Lo = DAG.getSelect(dl, NVT, HiIsNeg, NegLo, Lo); Hi = DAG.getSelect(dl, NVT, HiIsNeg, NegHi, Hi); } @@ -3223,12 +3303,11 @@ void DAGTypeLegalizer::ExpandIntRes_FLT_ROUNDS(SDNode *N, SDValue &Lo, EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); - EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); Lo = DAG.getNode(ISD::FLT_ROUNDS_, dl, {NVT, MVT::Other}, N->getOperand(0)); SDValue Chain = Lo.getValue(1); // The high part is the sign of Lo, as -1 is a valid value for FLT_ROUNDS Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, - DAG.getConstant(NBitWidth - 1, dl, ShiftAmtTy)); + DAG.getShiftAmountConstant(NBitWidth - 1, NVT, dl)); // Legalize the chain result - switch anything that used the old chain to // use the new one. @@ -3535,8 +3614,7 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL); SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask); - EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); - SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy); + SDValue Shift = DAG.getShiftAmountConstant(HalfBits, NVT, dl); SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift); SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift); SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift); @@ -3667,7 +3745,6 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, unsigned NVTSize = NVT.getScalarSizeInBits(); assert((VTSize == NVTSize * 2) && "Expected the new value type to be half " "the size of the current value type"); - EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); // After getting the multiplication result in 4 parts, we need to perform a // shift right by the amount of the scale to get the result in that scale. @@ -3690,7 +3767,7 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, // shifting. uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed. 
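// The fixed-point multiply above computes a double-width product and then
// shifts right by the scale to return to the operands' fixed-point format.
// The same idea in plain C++ for Q16.16 values (scale = 16):
#include <cassert>
#include <cstdint>

static int32_t mulfix_q16(int32_t A, int32_t B) {
  int64_t Wide = int64_t(A) * int64_t(B); // 32x32 -> 64-bit product
  return int32_t(Wide >> 16);             // rescale back to Q16.16
}

int main() {
  int32_t OneHalf = 1 << 15; // 0.5 in Q16.16
  int32_t Three = 3 << 16;   // 3.0 in Q16.16
  assert(mulfix_q16(Three, OneHalf) == (3 << 15)); // 1.5 in Q16.16
}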
   if (Scale % NVTSize) {
-    SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy);
+    SDValue ShiftAmount = DAG.getShiftAmountConstant(Scale % NVTSize, NVT, dl);
     Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0],
                      ShiftAmount);
     Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1],
@@ -3731,8 +3808,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
   if (!Signed) {
     if (Scale < NVTSize) {
       // Overflow happened if ((HH | (HL >> Scale)) != 0).
-      SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
-                                       DAG.getConstant(Scale, dl, ShiftTy));
+      SDValue HLAdjusted =
+          DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+                      DAG.getShiftAmountConstant(Scale, NVT, dl));
       SDValue Tmp = DAG.getNode(ISD::OR, dl, NVT, HLAdjusted, ResultHH);
       SatMax = DAG.getSetCC(dl, BoolNVT, Tmp, NVTZero, ISD::SETNE);
     } else if (Scale == NVTSize) {
       // Overflow happened if (HH != 0).
       SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETNE);
     } else if (Scale < VTSize) {
       // Overflow happened if ((HH >> (Scale - NVTSize)) != 0).
-      SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
-                                       DAG.getConstant(Scale - NVTSize, dl,
-                                                       ShiftTy));
+      SDValue HLAdjusted =
+          DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+                      DAG.getShiftAmountConstant(Scale - NVTSize, NVT, dl));
       SatMax = DAG.getSetCC(dl, BoolNVT, HLAdjusted, NVTZero, ISD::SETNE);
     } else
       llvm_unreachable("Scale must be less or equal to VTSize for UMULFIXSAT"
@@ -3901,6 +3979,70 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
   ReplaceValueWith(SDValue(Node, 1), Ovf);
 }
 
+// Emit a call to __udivei4 and friends, which require the arguments to be
+// passed on the stack, plus an extra argument that carries the number of
+// bits of the operands. Returns the result of the call operation.
+static SDValue ExpandExtIntRes_DIVREM(const TargetLowering &TLI, + const RTLIB::Libcall &LC, + SelectionDAG &DAG, SDNode *N, + const SDLoc &DL, const EVT &VT) { + + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + // The signature of __udivei4 is + // void __udivei4(unsigned int *quo, unsigned int *a, unsigned int *b, + // unsigned int bits) + EVT ArgVT = N->op_begin()->getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() > 128 && + "Unexpected argument type for lowering"); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + SDValue Output = DAG.CreateStackTemporary(ArgVT); + Entry.Node = Output; + Entry.Ty = ArgTy->getPointerTo(); + Entry.IsSExt = false; + Entry.IsZExt = false; + Args.push_back(Entry); + + for (const llvm::SDUse &Op : N->ops()) { + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT); + InChain = DAG.getStore(InChain, DL, Op, StackPtr, MachinePointerInfo()); + Entry.Node = StackPtr; + Entry.Ty = ArgTy->getPointerTo(); + Entry.IsSExt = false; + Entry.IsZExt = false; + Args.push_back(Entry); + } + + int Bits = N->getOperand(0) + .getValueType() + .getTypeForEVT(*DAG.getContext()) + ->getIntegerBitWidth(); + Entry.Node = DAG.getConstant(Bits, DL, TLI.getPointerTy(DAG.getDataLayout())); + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + Entry.IsSExt = false; + Entry.IsZExt = true; + Args.push_back(Entry); + + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), + Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)) + .setDiscardResult(); + + SDValue Chain = TLI.LowerCallTo(CLI).second; + + return DAG.getLoad(ArgVT, DL, Chain, Output, MachinePointerInfo()); +} + void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -3922,6 +4064,14 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, LC = RTLIB::SDIV_I64; else if (VT == MVT::i128) LC = RTLIB::SDIV_I128; + + else { + SDValue Result = + ExpandExtIntRes_DIVREM(TLI, RTLIB::SDIV_IEXT, DAG, N, dl, VT); + SplitInteger(Result, Lo, Hi); + return; + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); TargetLowering::MakeLibCallOptions CallOptions; @@ -4113,6 +4263,14 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, LC = RTLIB::SREM_I64; else if (VT == MVT::i128) LC = RTLIB::SREM_I128; + + else { + SDValue Result = + ExpandExtIntRes_DIVREM(TLI, RTLIB::SREM_IEXT, DAG, N, dl, VT); + SplitInteger(Result, Lo, Hi); + return; + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); TargetLowering::MakeLibCallOptions CallOptions; @@ -4288,6 +4446,14 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, LC = RTLIB::UDIV_I64; else if (VT == MVT::i128) LC = RTLIB::UDIV_I128; + + else { + SDValue Result = + ExpandExtIntRes_DIVREM(TLI, RTLIB::UDIV_IEXT, DAG, N, dl, VT); + SplitInteger(Result, Lo, Hi); + return; + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); TargetLowering::MakeLibCallOptions CallOptions; @@ -4315,6 +4481,14 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, LC = RTLIB::UREM_I64; else if (VT == MVT::i128) LC = RTLIB::UREM_I128; + + else { + SDValue Result = + ExpandExtIntRes_DIVREM(TLI, RTLIB::UREM_IEXT, DAG, N, dl, VT); + SplitInteger(Result, Lo, Hi); + return; + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); 
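// A usage sketch for the stack-based large-division libcalls routed to
// above (RTLIB::{S,U}{DIV,REM}_IEXT). The declaration matches the signature
// quoted in ExpandExtIntRes_DIVREM's comment; the little-endian 32-bit limb
// layout and the wrapper name below are illustrative assumptions, since the
// real in-memory layout is whatever the target stores for the wide integer.
#include <cstdint>

extern "C" void __udivei4(uint32_t *quo, uint32_t *a, uint32_t *b,
                          uint32_t bits); // provided by compiler-rt

// Hypothetical helper: divide two i256 values held as 8 x 32-bit words.
void udiv256(uint32_t Quo[8], uint32_t A[8], uint32_t B[8]) {
  __udivei4(Quo, A, B, /*bits=*/256);
}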
TargetLowering::MakeLibCallOptions CallOptions; @@ -5060,7 +5234,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { return DAG.getBuildVector(NOutVT, dl, Ops); } -SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_ScalarOp(SDNode *N) { SDLoc dl(N); @@ -5070,35 +5244,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); - EVT NOutVTElem = NOutVT.getVectorElementType(); - - SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0)); - - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); -} - -SDValue DAGTypeLegalizer::PromoteIntRes_SPLAT_VECTOR(SDNode *N) { - SDLoc dl(N); - - SDValue SplatVal = N->getOperand(0); - - assert(!SplatVal.getValueType().isVector() && "Input must be a scalar"); - - EVT OutVT = N->getValueType(0); - EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); - assert(NOutVT.isVector() && "Type must be promoted to a vector type"); EVT NOutElemVT = NOutVT.getVectorElementType(); - SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, SplatVal); + SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, N->getOperand(0)); - return DAG.getNode(ISD::SPLAT_VECTOR, dl, NOutVT, Op); + return DAG.getNode(N->getOpcode(), dl, NOutVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_STEP_VECTOR(SDNode *N) { SDLoc dl(N); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); - assert(NOutVT.isVector() && "Type must be promoted to a vector type"); + assert(NOutVT.isScalableVector() && + "Type must be promoted to a scalable vector type"); APInt StepVal = cast(N->getOperand(0))->getAPIntValue(); return DAG.getStepVector(dl, NOutVT, StepVal.sext(NOutVT.getScalarSizeInBits())); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 03dcd0f6d2c9..8fe9a83b9c3d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -13,10 +13,7 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" -#include "SDNodeDbgValue.h" #include "llvm/ADT/SetVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -86,46 +83,49 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { auto ResId = ValueToIdMap.lookup(Res); unsigned Mapped = 0; - if (ResId && (ReplacedValues.find(ResId) != ReplacedValues.end())) { - Mapped |= 1; - // Check that remapped values are only used by nodes marked NewNode. - for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end(); - UI != UE; ++UI) - if (UI.getUse().getResNo() == i) - assert(UI->getNodeId() == NewNode && - "Remapped value has non-trivial use!"); - - // Check that the final result of applying ReplacedValues is not - // marked NewNode. - auto NewValId = ReplacedValues[ResId]; - auto I = ReplacedValues.find(NewValId); - while (I != ReplacedValues.end()) { - NewValId = I->second; + if (ResId) { + auto I = ReplacedValues.find(ResId); + if (I != ReplacedValues.end()) { + Mapped |= 1; + // Check that remapped values are only used by nodes marked NewNode. 
+ for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end(); + UI != UE; ++UI) + if (UI.getUse().getResNo() == i) + assert(UI->getNodeId() == NewNode && + "Remapped value has non-trivial use!"); + + // Check that the final result of applying ReplacedValues is not + // marked NewNode. + auto NewValId = I->second; I = ReplacedValues.find(NewValId); + while (I != ReplacedValues.end()) { + NewValId = I->second; + I = ReplacedValues.find(NewValId); + } + SDValue NewVal = getSDValue(NewValId); + (void)NewVal; + assert(NewVal.getNode()->getNodeId() != NewNode && + "ReplacedValues maps to a new node!"); } - SDValue NewVal = getSDValue(NewValId); - (void)NewVal; - assert(NewVal.getNode()->getNodeId() != NewNode && - "ReplacedValues maps to a new node!"); + if (PromotedIntegers.count(ResId)) + Mapped |= 2; + if (SoftenedFloats.count(ResId)) + Mapped |= 4; + if (ScalarizedVectors.count(ResId)) + Mapped |= 8; + if (ExpandedIntegers.count(ResId)) + Mapped |= 16; + if (ExpandedFloats.count(ResId)) + Mapped |= 32; + if (SplitVectors.count(ResId)) + Mapped |= 64; + if (WidenedVectors.count(ResId)) + Mapped |= 128; + if (PromotedFloats.count(ResId)) + Mapped |= 256; + if (SoftPromotedHalfs.count(ResId)) + Mapped |= 512; } - if (ResId && PromotedIntegers.find(ResId) != PromotedIntegers.end()) - Mapped |= 2; - if (ResId && SoftenedFloats.find(ResId) != SoftenedFloats.end()) - Mapped |= 4; - if (ResId && ScalarizedVectors.find(ResId) != ScalarizedVectors.end()) - Mapped |= 8; - if (ResId && ExpandedIntegers.find(ResId) != ExpandedIntegers.end()) - Mapped |= 16; - if (ResId && ExpandedFloats.find(ResId) != ExpandedFloats.end()) - Mapped |= 32; - if (ResId && SplitVectors.find(ResId) != SplitVectors.end()) - Mapped |= 64; - if (ResId && WidenedVectors.find(ResId) != WidenedVectors.end()) - Mapped |= 128; - if (ResId && PromotedFloats.find(ResId) != PromotedFloats.end()) - Mapped |= 256; - if (ResId && SoftPromotedHalfs.find(ResId) != SoftPromotedHalfs.end()) - Mapped |= 512; if (Node.getNodeId() != Processed) { // Since we allow ReplacedValues to map deleted nodes, it may map nodes @@ -143,8 +143,16 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { } } else { if (Mapped == 0) { - dbgs() << "Processed value not in any map!"; - Failed = true; + SDValue NodeById = IdToValueMap.lookup(ResId); + // It is possible the node has been remapped to another node and had + // its Id updated in the Value to Id table. The node it remapped to + // may not have been processed yet. Look up the Id in the Id to Value + // table and re-check the Processed state. If the node hasn't been + // remapped we'll get the same state as we got earlier. 
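// The "Value in multiple maps!" check below relies on the classic
// power-of-two test: Mapped has more than one bit set exactly when
// (Mapped & (Mapped - 1)) != 0, i.e. the result id was recorded in two or
// more legalization maps at once. Standalone illustration:
#include <cassert>

static bool inMultipleMaps(unsigned Mapped) {
  return (Mapped & (Mapped - 1)) != 0; // true iff at least two bits are set
}

int main() {
  assert(!inMultipleMaps(0));      // in no map at all
  assert(!inMultipleMaps(16));     // exactly one map (ExpandedIntegers)
  assert(inMultipleMaps(2 | 256)); // PromotedIntegers and PromotedFloats
}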
+ if (NodeById->getNodeId() == Processed) { + dbgs() << "Processed value not in any map!"; + Failed = true; + } } else if (Mapped & (Mapped - 1)) { dbgs() << "Value in multiple maps!"; Failed = true; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4d8daa82d8c0..de320290bda9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -19,7 +19,6 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" namespace llvm { @@ -309,8 +308,7 @@ private: SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N); SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); - SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N); - SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N); + SDValue PromoteIntRes_ScalarOp(SDNode *N); SDValue PromoteIntRes_STEP_VECTOR(SDNode *N); SDValue PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N); SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N); @@ -362,6 +360,7 @@ private: SDValue PromoteIntRes_ABS(SDNode *N); SDValue PromoteIntRes_Rotate(SDNode *N); SDValue PromoteIntRes_FunnelShift(SDNode *N); + SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -377,12 +376,12 @@ private: SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_INSERT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); - SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N); - SDValue PromoteIntOp_SPLAT_VECTOR(SDNode *N); + SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); + SDValue PromoteIntOp_FunnelShift(SDNode *N); SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N); SDValue PromoteIntOp_SINT_TO_FP(SDNode *N); SDValue PromoteIntOp_STRICT_SINT_TO_FP(SDNode *N); @@ -784,6 +783,7 @@ private: SDValue ScalarizeVecRes_UNDEF(SDNode *N); SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); SDValue ScalarizeVecRes_FP_TO_XINT_SAT(SDNode *N); + SDValue ScalarizeVecRes_IS_FPCLASS(SDNode *N); SDValue ScalarizeVecRes_FIX(SDNode *N); @@ -850,6 +850,7 @@ private: void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); @@ -960,6 +961,7 @@ private: SDValue WidenVecRes_Convert_StrictFP(SDNode *N); SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N); SDValue WidenVecRes_FCOPYSIGN(SDNode *N); + SDValue WidenVecRes_IS_FPCLASS(SDNode *N); SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); @@ -985,6 +987,7 @@ private: SDValue WidenVecOp_Convert(SDNode *N); SDValue WidenVecOp_FP_TO_XINT_SAT(SDNode *N); SDValue WidenVecOp_FCOPYSIGN(SDNode *N); + SDValue WidenVecOp_IS_FPCLASS(SDNode *N); SDValue WidenVecOp_VECREDUCE(SDNode *N); SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N); SDValue WidenVecOp_VP_REDUCE(SDNode *N); diff --git 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index abf6a3ac6916..842ffa2aa23e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -26,11 +26,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" @@ -41,7 +39,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" -#include "llvm/Support/MathExtras.h" #include #include #include @@ -464,6 +461,12 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VPID: { \ EVT LegalizeVT = LEGALPOS < 0 ? Node->getValueType(-(1 + LEGALPOS)) \ : Node->getOperand(LEGALPOS).getValueType(); \ + if (ISD::VPID == ISD::VP_SETCC) { \ + ISD::CondCode CCCode = cast(Node->getOperand(2))->get(); \ + Action = TLI.getCondCodeAction(CCCode, LegalizeVT.getSimpleVT()); \ + if (Action != TargetLowering::Legal) \ + break; \ + } \ Action = TLI.getOperationAction(Node->getOpcode(), LegalizeVT); \ } break; #include "llvm/IR/VPIntrinsics.def" @@ -747,6 +750,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { ExpandFSUB(Node, Results); return; case ISD::SETCC: + case ISD::VP_SETCC: ExpandSETCC(Node, Results); return; case ISD::ABS: @@ -1050,10 +1054,7 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDNode *Node) { // Shuffle the incoming lanes into the correct position, and pull all other // lanes from the zero vector. - SmallVector ShuffleMask; - ShuffleMask.reserve(NumSrcElements); - for (int i = 0; i < NumSrcElements; ++i) - ShuffleMask.push_back(i); + auto ShuffleMask = llvm::to_vector<16>(llvm::seq(0, NumSrcElements)); int ExtLaneScale = NumSrcElements / NumElements; int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0; @@ -1423,6 +1424,7 @@ void VectorLegalizer::ExpandFSUB(SDNode *Node, void VectorLegalizer::ExpandSETCC(SDNode *Node, SmallVectorImpl &Results) { bool NeedInvert = false; + bool IsVP = Node->getOpcode() == ISD::VP_SETCC; SDLoc dl(Node); MVT OpVT = Node->getOperand(0).getSimpleValueType(); ISD::CondCode CCCode = cast(Node->getOperand(2))->get(); @@ -1436,20 +1438,36 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node, SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDValue CC = Node->getOperand(2); - bool Legalized = TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), LHS, - RHS, CC, NeedInvert, dl, Chain); + SDValue Mask, EVL; + if (IsVP) { + Mask = Node->getOperand(3); + EVL = Node->getOperand(4); + } + + bool Legalized = + TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), LHS, RHS, CC, Mask, + EVL, NeedInvert, dl, Chain); if (Legalized) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SETCC node. 
- if (CC.getNode()) - LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC, - Node->getFlags()); + if (CC.getNode()) { + if (!IsVP) + LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC, + Node->getFlags()); + else + LHS = DAG.getNode(ISD::VP_SETCC, dl, Node->getValueType(0), + {LHS, RHS, CC, Mask, EVL}, Node->getFlags()); + } // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. - if (NeedInvert) - LHS = DAG.getLogicalNOT(dl, LHS, LHS->getValueType(0)); + if (NeedInvert) { + if (!IsVP) + LHS = DAG.getLogicalNOT(dl, LHS, LHS->getValueType(0)); + else + LHS = DAG.getVPLogicalNOT(dl, LHS, Mask, EVL, LHS->getValueType(0)); + } } else { // Otherwise, SETCC for the given comparison type must be completely // illegal; expand it into a SELECT_CC. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 0bd44ce4c872..fa555be00ded 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -20,7 +20,9 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TypeSize.h" @@ -64,6 +66,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; + case ISD::IS_FPCLASS: R = ScalarizeVecRes_IS_FPCLASS(N); break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: @@ -231,9 +234,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) { // Now process the remaining operands. for (unsigned i = 1; i < NumOpers; ++i) { SDValue Oper = N->getOperand(i); + EVT OperVT = Oper.getValueType(); - if (Oper.getValueType().isVector()) - Oper = GetScalarizedVector(Oper); + if (OperVT.isVector()) { + if (getTypeAction(OperVT) == TargetLowering::TypeScalarizeVector) + Oper = GetScalarizedVector(Oper); + else + Oper = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + OperVT.getVectorElementType(), Oper, + DAG.getVectorIdxConstant(0, dl)); + } Opers[i] = Oper; } @@ -582,6 +592,29 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) { return DAG.getNode(ExtendCode, DL, NVT, Res); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_IS_FPCLASS(SDNode *N) { + SDLoc DL(N); + SDValue Arg = N->getOperand(0); + SDValue Test = N->getOperand(1); + EVT ArgVT = Arg.getValueType(); + EVT ResultVT = N->getValueType(0).getVectorElementType(); + + if (getTypeAction(ArgVT) == TargetLowering::TypeScalarizeVector) { + Arg = GetScalarizedVector(Arg); + } else { + EVT VT = ArgVT.getVectorElementType(); + Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Arg, + DAG.getVectorIdxConstant(0, DL)); + } + + SDValue Res = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, {Arg, Test}, N->getFlags()); + // Vectors may have a different boolean contents to scalars. Promote the + // value appropriately. 
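// Context for the promotion just below: a target declares how it represents
// a "true" boolean, and getExtendForContent picks ANY/ZERO/SIGN_EXTEND to
// widen an i1 result to match. The enum and function here are illustrative
// stand-ins for TargetLowering's BooleanContent machinery, not the real API.
#include <cassert>
#include <cstdint>

enum class BooleanContent { UndefinedHigh, ZeroOrOne, ZeroOrNegativeOne };

// Extend a 1-bit boolean to 32 bits per the target's convention.
static uint32_t extendBool(bool B, BooleanContent BC) {
  switch (BC) {
  case BooleanContent::UndefinedHigh: // ANY_EXTEND: upper bits unspecified
  case BooleanContent::ZeroOrOne:     // ZERO_EXTEND
    return B ? 1u : 0u;
  case BooleanContent::ZeroOrNegativeOne: // SIGN_EXTEND
    return B ? ~0u : 0u;
  }
  return 0;
}

int main() {
  assert(extendBool(true, BooleanContent::ZeroOrNegativeOne) == 0xFFFFFFFFu);
}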
+ ISD::NodeType ExtendCode = + TargetLowering::getExtendForContent(TLI.getBooleanContents(ArgVT)); + return DAG.getNode(ExtendCode, DL, ResultVT, Res); +} //===----------------------------------------------------------------------===// // Operand Vector Scalarization <1 x ty> -> ty. @@ -926,6 +959,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; + case ISD::IS_FPCLASS: SplitVecRes_IS_FPCLASS(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: @@ -949,6 +983,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_Gather(cast(N), Lo, Hi, /*SplitSETCC*/ true); break; case ISD::SETCC: + case ISD::VP_SETCC: SplitVecRes_SETCC(N, Lo, Hi); break; case ISD::VECTOR_REVERSE: @@ -988,13 +1023,17 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: - case ISD::FNEG: + case ISD::FNEG: case ISD::VP_FNEG: case ISD::FREEZE: case ISD::ARITH_FENCE: case ISD::FP_EXTEND: + case ISD::VP_FP_EXTEND: case ISD::FP_ROUND: + case ISD::VP_FP_ROUND: case ISD::FP_TO_SINT: + case ISD::VP_FPTOSI: case ISD::FP_TO_UINT: + case ISD::VP_FPTOUI: case ISD::FRINT: case ISD::FROUND: case ISD::FROUNDEVEN: @@ -1002,8 +1041,11 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: case ISD::FTRUNC: case ISD::SINT_TO_FP: + case ISD::VP_SITOFP: case ISD::TRUNCATE: + case ISD::VP_TRUNCATE: case ISD::UINT_TO_FP: + case ISD::VP_UITOFP: case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -1011,6 +1053,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: + case ISD::VP_SIGN_EXTEND: + case ISD::VP_ZERO_EXTEND: SplitVecRes_ExtendOp(N, Lo, Hi); break; @@ -1053,7 +1097,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::ROTR: SplitVecRes_BinOp(N, Lo, Hi); break; - case ISD::FMA: + case ISD::FMA: case ISD::VP_FMA: case ISD::FSHL: case ISD::FSHR: SplitVecRes_TernaryOp(N, Lo, Hi); @@ -1175,10 +1219,28 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi); SDLoc dl(N); - Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo, - Op2Lo, N->getFlags()); - Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi, - Op2Hi, N->getFlags()); + const SDNodeFlags Flags = N->getFlags(); + unsigned Opcode = N->getOpcode(); + if (N->getNumOperands() == 3) { + Lo = DAG.getNode(Opcode, dl, Op0Lo.getValueType(), Op0Lo, Op1Lo, Op2Lo, Flags); + Hi = DAG.getNode(Opcode, dl, Op0Hi.getValueType(), Op0Hi, Op1Hi, Op2Hi, Flags); + return; + } + + assert(N->getNumOperands() == 5 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3)); + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(4), N->getValueType(0), dl); + + Lo = DAG.getNode(Opcode, dl, Op0Lo.getValueType(), + {Op0Lo, Op1Lo, Op2Lo, MaskLo, EVLLo}, Flags); + Hi = DAG.getNode(Opcode, dl, Op0Hi.getValueType(), + {Op0Hi, Op1Hi, Op2Hi, MaskHi, EVLHi}, Flags); } void 
DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) { @@ -1398,6 +1460,19 @@ void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); } +void DAGTypeLegalizer::SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + SDValue ArgLo, ArgHi; + SDValue Test = N->getOperand(1); + GetSplitVector(N->getOperand(0), ArgLo, ArgHi); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + Lo = DAG.getNode(ISD::IS_FPCLASS, DL, LoVT, ArgLo, Test, N->getFlags()); + Hi = DAG.getNode(ISD::IS_FPCLASS, DL, HiVT, ArgHi, Test, N->getFlags()); +} + void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; @@ -2043,8 +2118,20 @@ void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { else std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + if (N->getOpcode() == ISD::SETCC) { + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + } else { + assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode"); + SDValue MaskLo, MaskHi, EVLLo, EVLHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3)); + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(4), N->getValueType(0), DL); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2), MaskLo, + EVLLo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2), MaskHi, + EVLHi); + } } void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, @@ -2056,22 +2143,37 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, // If the input also splits, handle it directly for a compile time speedup. // Otherwise split it by hand. - unsigned OpNo = N->isStrictFPOpcode() ? 
1 : 0; - EVT InVT = N->getOperand(OpNo).getValueType(); + EVT InVT = N->getOperand(0).getValueType(); if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) - GetSplitVector(N->getOperand(OpNo), Lo, Hi); + GetSplitVector(N->getOperand(0), Lo, Hi); else - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, OpNo); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - if (N->getOpcode() == ISD::FP_ROUND) { - Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1), - N->getFlags()); - Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1), - N->getFlags()); - } else { - Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getFlags()); - Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getFlags()); + const SDNodeFlags Flags = N->getFlags(); + unsigned Opcode = N->getOpcode(); + if (N->getNumOperands() <= 2) { + if (Opcode == ISD::FP_ROUND) { + Lo = DAG.getNode(Opcode, dl, LoVT, Lo, N->getOperand(1), Flags); + Hi = DAG.getNode(Opcode, dl, HiVT, Hi, N->getOperand(1), Flags); + } else { + Lo = DAG.getNode(Opcode, dl, LoVT, Lo, Flags); + Hi = DAG.getNode(Opcode, dl, HiVT, Hi, Flags); + } + return; } + + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1)); + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl); + + Lo = DAG.getNode(Opcode, dl, LoVT, {Lo, MaskLo, EVLLo}, Flags); + Hi = DAG.getNode(Opcode, dl, HiVT, {Hi, MaskHi, EVLHi}, Flags); } void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, @@ -2107,14 +2209,34 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) { LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:"; N->dump(&DAG); dbgs() << "\n"); + if (!N->isVPOpcode()) { + // Extend the source vector by one step. + SDValue NewSrc = + DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0)); + // Get the low and high halves of the new, extended one step, vector. + std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl); + // Extend those vector halves the rest of the way. + Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); + return; + } + // Extend the source vector by one step. SDValue NewSrc = - DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0)); + DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0), + N->getOperand(1), N->getOperand(2)); // Get the low and high halves of the new, extended one step, vector. std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl); + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1)); + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl); // Extend those vector halves the rest of the way. - Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); + Lo = DAG.getNode(N->getOpcode(), dl, LoVT, {Lo, MaskLo, EVLLo}); + Hi = DAG.getNode(N->getOpcode(), dl, HiVT, {Hi, MaskHi, EVLHi}); return; } } @@ -2126,108 +2248,352 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi) { // The low and high parts of the original input give four input vectors. 
SDValue Inputs[4]; - SDLoc dl(N); + SDLoc DL(N); GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]); GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]); EVT NewVT = Inputs[0].getValueType(); unsigned NewElts = NewVT.getVectorNumElements(); + auto &&IsConstant = [](const SDValue &N) { + APInt SplatValue; + return N.getResNo() == 0 && + (ISD::isConstantSplatVector(N.getNode(), SplatValue) || + ISD::isBuildVectorOfConstantSDNodes(N.getNode())); + }; + auto &&BuildVector = [NewElts, &DAG = DAG, NewVT, &DL](SDValue &Input1, + SDValue &Input2, + ArrayRef Mask) { + assert(Input1->getOpcode() == ISD::BUILD_VECTOR && + Input2->getOpcode() == ISD::BUILD_VECTOR && + "Expected build vector node."); + EVT EltVT = NewVT.getVectorElementType(); + SmallVector Ops(NewElts, DAG.getUNDEF(EltVT)); + for (unsigned I = 0; I < NewElts; ++I) { + if (Mask[I] == UndefMaskElem) + continue; + unsigned Idx = Mask[I]; + if (Idx >= NewElts) + Ops[I] = Input2.getOperand(Idx - NewElts); + else + Ops[I] = Input1.getOperand(Idx); + // Make the type of all elements the same as the element type. + if (Ops[I].getValueType().bitsGT(EltVT)) + Ops[I] = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Ops[I]); + } + return DAG.getBuildVector(NewVT, DL, Ops); + }; + // If Lo or Hi uses elements from at most two of the four input vectors, then // express it as a vector shuffle of those two inputs. Otherwise extract the // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR. - SmallVector Ops; - for (unsigned High = 0; High < 2; ++High) { - SDValue &Output = High ? Hi : Lo; - - // Build a shuffle mask for the output, discovering on the fly which - // input vectors to use as shuffle operands (recorded in InputUsed). - // If building a suitable shuffle vector proves too hard, then bail - // out with useBuildVector set. - unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered. - unsigned FirstMaskIdx = High * NewElts; - bool useBuildVector = false; - for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { - // The mask element. This indexes into the input. - int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset); - - // The input vector this mask element indexes into. - unsigned Input = (unsigned)Idx / NewElts; - - if (Input >= array_lengthof(Inputs)) { - // The mask element does not index into any input vector. - Ops.push_back(-1); + SmallVector OrigMask(N->getMask().begin(), N->getMask().end()); + // Try to pack incoming shuffles/inputs. + auto &&TryPeekThroughShufflesInputs = [&Inputs, &NewVT, this, NewElts, + &DL](SmallVectorImpl &Mask) { + // Check if all inputs are shuffles of the same operands or non-shuffles. + MapVector, SmallVector> ShufflesIdxs; + for (unsigned Idx = 0; Idx < array_lengthof(Inputs); ++Idx) { + SDValue Input = Inputs[Idx]; + auto *Shuffle = dyn_cast(Input.getNode()); + if (!Shuffle || + Input.getOperand(0).getValueType() != Input.getValueType()) + continue; + ShufflesIdxs[std::make_pair(Input.getOperand(0), Input.getOperand(1))] + .push_back(Idx); + ShufflesIdxs[std::make_pair(Input.getOperand(1), Input.getOperand(0))] + .push_back(Idx); + } + for (auto &P : ShufflesIdxs) { + if (P.second.size() < 2) continue; + // Use shuffles operands instead of shuffles themselves. + // 1. Adjust mask. 
+ for (int &Idx : Mask) { + if (Idx == UndefMaskElem) + continue; + unsigned SrcRegIdx = Idx / NewElts; + if (Inputs[SrcRegIdx].isUndef()) { + Idx = UndefMaskElem; + continue; + } + auto *Shuffle = + dyn_cast(Inputs[SrcRegIdx].getNode()); + if (!Shuffle || !is_contained(P.second, SrcRegIdx)) + continue; + int MaskElt = Shuffle->getMaskElt(Idx % NewElts); + if (MaskElt == UndefMaskElem) { + Idx = UndefMaskElem; + continue; + } + Idx = MaskElt % NewElts + + P.second[Shuffle->getOperand(MaskElt / NewElts) == P.first.first + ? 0 + : 1] * + NewElts; } - - // Turn the index into an offset from the start of the input vector. - Idx -= Input * NewElts; - - // Find or create a shuffle vector operand to hold this input. - unsigned OpNo; - for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { - if (InputUsed[OpNo] == Input) { - // This input vector is already an operand. - break; - } else if (InputUsed[OpNo] == -1U) { - // Create a new operand for this input vector. - InputUsed[OpNo] = Input; - break; + // 2. Update inputs. + Inputs[P.second[0]] = P.first.first; + Inputs[P.second[1]] = P.first.second; + // Clear the pair data. + P.second.clear(); + ShufflesIdxs[std::make_pair(P.first.second, P.first.first)].clear(); + } + // Check if any concat_vectors can be simplified. + SmallBitVector UsedSubVector(2 * array_lengthof(Inputs)); + for (int &Idx : Mask) { + if (Idx == UndefMaskElem) + continue; + unsigned SrcRegIdx = Idx / NewElts; + if (Inputs[SrcRegIdx].isUndef()) { + Idx = UndefMaskElem; + continue; + } + TargetLowering::LegalizeTypeAction TypeAction = + getTypeAction(Inputs[SrcRegIdx].getValueType()); + if (Inputs[SrcRegIdx].getOpcode() == ISD::CONCAT_VECTORS && + Inputs[SrcRegIdx].getNumOperands() == 2 && + !Inputs[SrcRegIdx].getOperand(1).isUndef() && + (TypeAction == TargetLowering::TypeLegal || + TypeAction == TargetLowering::TypeWidenVector)) + UsedSubVector.set(2 * SrcRegIdx + (Idx % NewElts) / (NewElts / 2)); + } + if (UsedSubVector.count() > 1) { + SmallVector, 2>> Pairs; + for (unsigned I = 0; I < array_lengthof(Inputs); ++I) { + if (UsedSubVector.test(2 * I) == UsedSubVector.test(2 * I + 1)) + continue; + if (Pairs.empty() || Pairs.back().size() == 2) + Pairs.emplace_back(); + if (UsedSubVector.test(2 * I)) { + Pairs.back().emplace_back(I, 0); + } else { + assert(UsedSubVector.test(2 * I + 1) && + "Expected to be used one of the subvectors."); + Pairs.back().emplace_back(I, 1); } } - - if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up on trying to create a - // shuffle vector. Insert all elements into a BUILD_VECTOR instead. - useBuildVector = true; - break; + if (!Pairs.empty() && Pairs.front().size() > 1) { + // Adjust mask. + for (int &Idx : Mask) { + if (Idx == UndefMaskElem) + continue; + unsigned SrcRegIdx = Idx / NewElts; + auto *It = find_if( + Pairs, [SrcRegIdx](ArrayRef> Idxs) { + return Idxs.front().first == SrcRegIdx || + Idxs.back().first == SrcRegIdx; + }); + if (It == Pairs.end()) + continue; + Idx = It->front().first * NewElts + (Idx % NewElts) % (NewElts / 2) + + (SrcRegIdx == It->front().first ? 0 : (NewElts / 2)); + } + // Adjust inputs. + for (ArrayRef> Idxs : Pairs) { + Inputs[Idxs.front().first] = DAG.getNode( + ISD::CONCAT_VECTORS, DL, + Inputs[Idxs.front().first].getValueType(), + Inputs[Idxs.front().first].getOperand(Idxs.front().second), + Inputs[Idxs.back().first].getOperand(Idxs.back().second)); + } } - - // Add the mask index for the new shuffle vector. 
- Ops.push_back(Idx + OpNo * NewElts); }
-
- if (useBuildVector) {
- EVT EltVT = NewVT.getVectorElementType();
- SmallVector<SDValue, 16> SVOps;
-
- // Extract the input elements by hand.
- for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
- // The mask element. This indexes into the input.
- int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
-
- // The input vector this mask element indexes into.
- unsigned Input = (unsigned)Idx / NewElts;
-
- if (Input >= array_lengthof(Inputs)) {
- // The mask element is "undef" or indexes off the end of the input.
- SVOps.push_back(DAG.getUNDEF(EltVT));
+ bool Changed;
+ do {
+ // Try to remove extra shuffles (except broadcasts) and shuffles with the
+ // reused operands.
+ Changed = false;
+ for (unsigned I = 0; I < array_lengthof(Inputs); ++I) {
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Inputs[I].getNode());
+ if (!Shuffle)
continue;
+ if (Shuffle->getOperand(0).getValueType() != NewVT)
+ continue;
+ int Op = -1;
+ if (!Inputs[I].hasOneUse() && Shuffle->getOperand(1).isUndef() &&
+ !Shuffle->isSplat()) {
+ Op = 0;
+ } else if (!Inputs[I].hasOneUse() &&
+ !Shuffle->getOperand(1).isUndef()) {
+ // Find the only used operand, if possible.
+ for (int &Idx : Mask) {
+ if (Idx == UndefMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ if (MaskElt == UndefMaskElem) {
+ Idx = UndefMaskElem;
+ continue;
+ }
+ int OpIdx = MaskElt / NewElts;
+ if (Op == -1) {
+ Op = OpIdx;
+ continue;
+ }
+ if (Op != OpIdx) {
+ Op = -1;
+ break;
+ }
+ }
+ }
+ if (Op < 0) {
+ // Try to check if one of the shuffle operands is used already.
+ for (int OpIdx = 0; OpIdx < 2; ++OpIdx) {
+ if (Shuffle->getOperand(OpIdx).isUndef())
+ continue;
+ auto *It = find(Inputs, Shuffle->getOperand(OpIdx));
+ if (It == std::end(Inputs))
+ continue;
+ int FoundOp = std::distance(std::begin(Inputs), It);
+ // Found that operand is used already.
+ // 1. Fix the mask for the reused operand.
+ for (int &Idx : Mask) {
+ if (Idx == UndefMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ if (MaskElt == UndefMaskElem) {
+ Idx = UndefMaskElem;
+ continue;
+ }
+ int MaskIdx = MaskElt / NewElts;
+ if (OpIdx == MaskIdx)
+ Idx = MaskElt % NewElts + FoundOp * NewElts;
+ }
+ // 2. Set Op to the unused OpIdx.
+ Op = (OpIdx + 1) % 2;
+ break;
+ }
+ }
+ if (Op >= 0) {
+ Changed = true;
+ Inputs[I] = Shuffle->getOperand(Op);
+ // Adjust mask.
+ for (int &Idx : Mask) {
+ if (Idx == UndefMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ int OpIdx = MaskElt / NewElts;
+ if (OpIdx != Op)
+ continue;
+ Idx = MaskElt % NewElts + SrcRegIdx * NewElts;
+ } }
-
- // Turn the index into an offset from the start of the input vector.
- Idx -= Input * NewElts;
-
- // Extract the vector element by hand.
- SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- Inputs[Input],
- DAG.getVectorIdxConstant(Idx, dl))); }
-
- // Construct the Lo/Hi output using a BUILD_VECTOR.
- Output = DAG.getBuildVector(NewVT, dl, SVOps);
- } else if (InputUsed[0] == -1U) {
- // No input vectors were used! The result is undefined.
- Output = DAG.getUNDEF(NewVT);
- } else {
- SDValue Op0 = Inputs[InputUsed[0]];
- // If only one input was used, use an undefined vector for the other.
- SDValue Op1 = InputUsed[1] == -1U ?
- DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
- // At least one input vector was used. Create a new shuffle vector.
- Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
+ } while (Changed);
+ };
+ TryPeekThroughShufflesInputs(OrigMask);
+ // Process unique inputs.
+ auto &&MakeUniqueInputs = [&Inputs, &IsConstant,
+ NewElts](SmallVectorImpl<int> &Mask) {
+ SetVector<SDValue> UniqueInputs;
+ SetVector<SDValue> UniqueConstantInputs;
+ for (unsigned I = 0; I < array_lengthof(Inputs); ++I) {
+ if (IsConstant(Inputs[I]))
+ UniqueConstantInputs.insert(Inputs[I]);
+ else if (!Inputs[I].isUndef())
+ UniqueInputs.insert(Inputs[I]);
+ }
+ // Adjust mask in case of reused inputs. Also, need to insert the constant
+ // inputs first, otherwise it affects the final outcome.
+ if (UniqueInputs.size() != array_lengthof(Inputs)) {
+ auto &&UniqueVec = UniqueInputs.takeVector();
+ auto &&UniqueConstantVec = UniqueConstantInputs.takeVector();
+ unsigned ConstNum = UniqueConstantVec.size();
+ for (int &Idx : Mask) {
+ if (Idx == UndefMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (Inputs[SrcRegIdx].isUndef()) {
+ Idx = UndefMaskElem;
+ continue;
+ }
+ const auto It = find(UniqueConstantVec, Inputs[SrcRegIdx]);
+ if (It != UniqueConstantVec.end()) {
+ Idx = (Idx % NewElts) +
+ NewElts * std::distance(UniqueConstantVec.begin(), It);
+ assert(Idx >= 0 && "Expected defined mask idx.");
+ continue;
+ }
+ const auto RegIt = find(UniqueVec, Inputs[SrcRegIdx]);
+ assert(RegIt != UniqueVec.end() && "Cannot find non-const value.");
+ Idx = (Idx % NewElts) +
+ NewElts * (std::distance(UniqueVec.begin(), RegIt) + ConstNum);
+ assert(Idx >= 0 && "Expected defined mask idx.");
+ }
+ copy(UniqueConstantVec, std::begin(Inputs));
+ copy(UniqueVec, std::next(std::begin(Inputs), ConstNum)); }
+ };
+ MakeUniqueInputs(OrigMask);
+ SDValue OrigInputs[4];
+ copy(Inputs, std::begin(OrigInputs));
for (unsigned High = 0; High < 2; ++High) {
SDValue &Output = High ? Hi : Lo;
- Ops.clear();
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands.
+ unsigned FirstMaskIdx = High * NewElts;
+ SmallVector<int> Mask(NewElts * array_lengthof(Inputs), UndefMaskElem);
+ copy(makeArrayRef(OrigMask).slice(FirstMaskIdx, NewElts), Mask.begin());
+ assert(!Output && "Expected default initialized initial value.");
+ TryPeekThroughShufflesInputs(Mask);
+ MakeUniqueInputs(Mask);
+ SDValue TmpInputs[4];
+ copy(Inputs, std::begin(TmpInputs));
+ // Track changes in the output registers.
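A note on this hunk: the rewritten SplitVecRes_VECTOR_SHUFFLE carries one wide mask (OrigMask) across the four split inputs, and every mask entry jointly encodes a source register (Idx / NewElts) and a lane within it (Idx % NewElts); processShuffleMasks then consumes per-half slices of that mask. The standalone sketch below illustrates only the decoding step; the values and names are illustrative, not part of the patch, which resumes right after with the output-tracking lambda.

    // Decode a wide shuffle mask into (register, lane) pairs, the way the
    // legalizer slices OrigMask above. -1 plays the role of UndefMaskElem.
    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned NewElts = 4; // elements per split register
      // Mask for a <8 x T> shuffle of two <8 x T> operands, i.e. four split
      // registers of four elements each.
      std::vector<int> OrigMask = {0, 9, 2, 11, 4, 13, -1, 15};
      for (unsigned High = 0; High < 2; ++High) {
        std::printf("%s half:\n", High ? "Hi" : "Lo");
        for (unsigned I = 0; I < NewElts; ++I) {
          int Idx = OrigMask[High * NewElts + I];
          if (Idx < 0) {
            std::printf("  lane %u: undef\n", I);
            continue;
          }
          unsigned SrcReg = Idx / NewElts; // which of Inputs[0..3]
          unsigned SrcElt = Idx % NewElts; // lane inside that register
          std::printf("  lane %u: Inputs[%u][%u]\n", I, SrcReg, SrcElt);
        }
      }
      return 0;
    }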
+ int UsedIdx = -1;
+ bool SecondIteration = false;
+ auto &&AccumulateResults = [&UsedIdx, &SecondIteration](unsigned Idx) {
+ if (UsedIdx < 0) {
+ UsedIdx = Idx;
+ return false;
+ }
+ if (UsedIdx >= 0 && static_cast<unsigned>(UsedIdx) == Idx)
+ SecondIteration = true;
+ return SecondIteration;
+ };
+ processShuffleMasks(
+ Mask, array_lengthof(Inputs), array_lengthof(Inputs),
+ /*NumOfUsedRegs=*/1,
+ [&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); },
+ [&Output, &DAG = DAG, NewVT, &DL, &Inputs,
+ &BuildVector](ArrayRef<int> Mask, unsigned Idx, unsigned /*Unused*/) {
+ if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, Inputs[Idx],
+ DAG.getUNDEF(NewVT), Mask);
+ Inputs[Idx] = Output;
+ },
+ [&AccumulateResults, &Output, &DAG = DAG, NewVT, &DL, &Inputs,
+ &TmpInputs,
+ &BuildVector](ArrayRef<int> Mask, unsigned Idx1, unsigned Idx2) {
+ if (AccumulateResults(Idx1)) {
+ if (Inputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR &&
+ Inputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(Inputs[Idx1], Inputs[Idx2], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, Inputs[Idx1],
+ Inputs[Idx2], Mask);
+ } else {
+ if (TmpInputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR &&
+ TmpInputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(TmpInputs[Idx1], TmpInputs[Idx2], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, TmpInputs[Idx1],
+ TmpInputs[Idx2], Mask);
+ }
+ Inputs[Idx1] = Output;
+ });
+ copy(OrigInputs, std::begin(Inputs)); } }
@@ -2268,6 +2634,32 @@ void DAGTypeLegalizer::SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, Hi = DAG.getNode(N->getOpcode(), dl, DstVTHi, SrcHi, N->getOperand(1)); }
+void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue InLo, InHi;
+ GetSplitVector(N->getOperand(0), InLo, InHi);
+ SDLoc DL(N);
+
+ Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
+ Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Expanded = TLI.expandVectorSplice(N, DAG);
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Expanded,
+ DAG.getVectorIdxConstant(0, DL));
+ Hi =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Expanded,
+ DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
+}
+
//===----------------------------------------------------------------------===//
// Operand Vector Splitting
//===----------------------------------------------------------------------===//
@@ -2294,16 +2686,19 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { report_fatal_error("Do not know how to split this operator's " "operand!\n");
+ case ISD::VP_SETCC:
case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::INSERT_SUBVECTOR: Res = SplitVecOp_INSERT_SUBVECTOR(N, OpNo); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
+ case ISD::VP_TRUNCATE:
case ISD::TRUNCATE: Res = SplitVecOp_TruncateHelper(N); break;
case ISD::STRICT_FP_ROUND:
+ case ISD::VP_FP_ROUND:
case ISD::FP_ROUND: Res =
SplitVecOp_FP_ROUND(N); break; case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break; case ISD::STORE: @@ -2543,6 +2938,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Ch); + } else if (N->getNumOperands() == 3) { + assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue MaskLo, MaskHi, EVLLo, EVLHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1)); + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl); + Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo, MaskLo, EVLLo); + Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi, MaskHi, EVLHi); } else { Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi); @@ -3128,8 +3531,20 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) { EVT PartResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt); EVT WideResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt*2); - LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2)); - HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2)); + if (N->getOpcode() == ISD::SETCC) { + LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2)); + HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2)); + } else { + assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode"); + SDValue MaskLo, MaskHi, EVLLo, EVLHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3)); + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(4), N->getValueType(0), DL); + LoRes = DAG.getNode(ISD::VP_SETCC, DL, PartResVT, Lo0, Lo1, + N->getOperand(2), MaskLo, EVLLo); + HiRes = DAG.getNode(ISD::VP_SETCC, DL, PartResVT, Hi0, Hi1, + N->getOperand(2), MaskHi, EVLHi); + } SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes); EVT OpVT = N->getOperand(0).getValueType(); @@ -3160,6 +3575,13 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); ReplaceValueWith(SDValue(N, 1), NewChain); + } else if (N->getOpcode() == ISD::VP_FP_ROUND) { + SDValue MaskLo, MaskHi, EVLLo, EVLHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1)); + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(2), N->getValueType(0), DL); + Lo = DAG.getNode(ISD::VP_FP_ROUND, DL, OutVT, Lo, MaskLo, EVLLo); + Hi = DAG.getNode(ISD::VP_FP_ROUND, DL, OutVT, Hi, MaskHi, EVLHi); } else { Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); @@ -3204,6 +3626,22 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { return; SDValue Res = SDValue(); + + auto unrollExpandedOp = [&]() { + // We're going to widen this vector op to a legal type by padding with undef + // elements. If the wide vector op is eventually going to be expanded to + // scalar libcalls, then unroll into scalar ops now to avoid unnecessary + // libcalls on the undef elements. 
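The VP paths added in the hunks above split a vector-predicated node by splitting its mask like any other vector operand and dividing the explicit vector length between the halves with DAG.SplitEVL. A plausible scalar model of that division is the clamped split sketched below; this illustrates the intended semantics under that assumption and is not SelectionDAG's implementation. The patch resumes afterwards with the body of the unrollExpandedOp lambda.

    // Model: divide an explicit vector length across two halves of
    // NumElts / 2 lanes each; Lo is clamped, Hi takes the remainder
    // (effectively a saturating subtract).
    #include <algorithm>
    #include <cassert>
    #include <cstdio>
    #include <utility>

    static std::pair<unsigned, unsigned> splitEVL(unsigned EVL,
                                                  unsigned NumElts) {
      unsigned Half = NumElts / 2;
      unsigned Lo = std::min(EVL, Half);
      unsigned Hi = EVL - Lo; // 0 when all active lanes fit in Lo
      return {Lo, Hi};
    }

    int main() {
      auto [Lo, Hi] = splitEVL(5, 8);
      assert(Lo == 4 && Hi == 1);
      std::printf("EVLLo=%u EVLHi=%u\n", Lo, Hi);
      return 0;
    }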
+ EVT VT = N->getValueType(0); + EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) && + TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) { + Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements()); + return true; + } + return false; + }; + switch (N->getOpcode()) { default: #ifndef NDEBUG @@ -3223,6 +3661,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; + case ISD::STEP_VECTOR: case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_ScalarOp(N); @@ -3235,6 +3674,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Select(N); break; case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break; + case ISD::VP_SETCC: case ISD::SETCC: Res = WidenVecRes_SETCC(N); break; case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: @@ -3280,6 +3720,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: // Vector-predicated binary op widening. Note that -- unlike the // unpredicated versions -- we don't have to worry about trapping on // operations like UDIV, FADD, etc., as we pass on the original vector @@ -3297,12 +3741,19 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Binary(N); break; + case ISD::FPOW: + case ISD::FREM: + if (unrollExpandedOp()) + break; + // If the target has custom/legal support for the scalar FP intrinsic ops + // (they are probably not destined to become libcalls), then widen those + // like any other binary ops. + LLVM_FALLTHROUGH; + case ISD::FADD: case ISD::FMUL: - case ISD::FPOW: case ISD::FSUB: case ISD::FDIV: - case ISD::FREM: case ISD::SDIV: case ISD::UDIV: case ISD::SREM: @@ -3338,6 +3789,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_FCOPYSIGN(N); break; + case ISD::IS_FPCLASS: + Res = WidenVecRes_IS_FPCLASS(N); + break; + case ISD::FPOWI: Res = WidenVecRes_POWI(N); break; @@ -3350,14 +3805,23 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::ANY_EXTEND: case ISD::FP_EXTEND: + case ISD::VP_FP_EXTEND: case ISD::FP_ROUND: + case ISD::VP_FP_ROUND: case ISD::FP_TO_SINT: + case ISD::VP_FPTOSI: case ISD::FP_TO_UINT: + case ISD::VP_FPTOUI: case ISD::SIGN_EXTEND: + case ISD::VP_SIGN_EXTEND: case ISD::SINT_TO_FP: + case ISD::VP_SITOFP: + case ISD::VP_TRUNCATE: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::VP_UITOFP: case ISD::ZERO_EXTEND: + case ISD::VP_ZERO_EXTEND: Res = WidenVecRes_Convert(N); break; @@ -3381,23 +3845,13 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: - case ISD::FTRUNC: { - // We're going to widen this vector op to a legal type by padding with undef - // elements. If the wide vector op is eventually going to be expanded to - // scalar libcalls, then unroll into scalar ops now to avoid unnecessary - // libcalls on the undef elements. 
- EVT VT = N->getValueType(0); - EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) && - TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) { - Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements()); + case ISD::FTRUNC: + if (unrollExpandedOp()) break; - } - } - // If the target has custom/legal support for the scalar FP intrinsic ops - // (they are probably not destined to become libcalls), then widen those like - // any other unary ops. - LLVM_FALLTHROUGH; + // If the target has custom/legal support for the scalar FP intrinsic ops + // (they are probably not destined to become libcalls), then widen those + // like any other unary ops. + LLVM_FALLTHROUGH; case ISD::ABS: case ISD::BITREVERSE: @@ -3407,13 +3861,13 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::CTPOP: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: - case ISD::FNEG: + case ISD::FNEG: case ISD::VP_FNEG: case ISD::FREEZE: case ISD::ARITH_FENCE: case ISD::FCANONICALIZE: Res = WidenVecRes_Unary(N); break; - case ISD::FMA: + case ISD::FMA: case ISD::VP_FMA: case ISD::FSHL: case ISD::FSHR: Res = WidenVecRes_Ternary(N); @@ -3432,7 +3886,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) { SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); SDValue InOp3 = GetWidenedVector(N->getOperand(2)); - return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3); + if (N->getNumOperands() == 3) + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3); + + assert(N->getNumOperands() == 5 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue Mask = + GetWidenedMask(N->getOperand(3), WidenVT.getVectorElementCount()); + return DAG.getNode(N->getOpcode(), dl, WidenVT, + {InOp1, InOp2, InOp3, Mask, N->getOperand(4)}); } SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { @@ -3552,7 +4015,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; - unsigned NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorMinNumElements(); const SDNodeFlags Flags = N->getFlags(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; @@ -3566,6 +4029,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags); } + // FIXME: Improve support for scalable vectors. + assert(!VT.isScalableVector() && "Scalable vectors not handled yet."); + // No legal vector version so unroll the vector operation and then widen. 
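The unrollExpandedOp hook above, now shared by FPOW/FREM and the FP unary cases, exists so that widening never manufactures libcalls for padding lanes: if the wide op would be expanded to scalar calls anyway, unrolling first makes one call per real element and leaves the padding undef. Below is a standalone model of that accounting; expensiveOp stands in for a scalar libcall and every name here is hypothetical. The patch continues after the sketch.

    // Widening a 3-element op to 4 lanes: unroll first, so the expensive
    // scalar call runs only on real elements, never on the padding lane.
    #include <cmath>
    #include <cstdio>
    #include <optional>
    #include <vector>

    static double expensiveOp(double A, double B) {
      return std::fmod(A, B); // stand-in for a scalar libcall such as fmod
    }

    int main() {
      std::vector<double> A = {7.0, 8.5, 9.0}, B = {2.0, 3.0, 4.0};
      const unsigned WideElts = 4; // the legal, widened vector length
      std::vector<std::optional<double>> Wide(WideElts); // empty == undef
      for (unsigned I = 0; I < A.size(); ++I)
        Wide[I] = expensiveOp(A[I], B[I]); // three calls, not four
      for (unsigned I = 0; I < WideElts; ++I)
        std::printf("lane %u: %s\n", I, Wide[I] ? "computed" : "undef");
      return 0;
    }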
if (NumElts == 1) return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); @@ -3826,6 +4292,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (InVTEC == WidenEC) { if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InOp); + if (N->getNumOperands() == 3) { + assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue Mask = + GetWidenedMask(N->getOperand(1), WidenVT.getVectorElementCount()); + return DAG.getNode(Opcode, DL, WidenVT, InOp, Mask, N->getOperand(2)); + } return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) { @@ -4007,6 +4479,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); } +SDValue DAGTypeLegalizer::WidenVecRes_IS_FPCLASS(SDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Arg = GetWidenedVector(N->getOperand(0)); + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, {Arg, N->getOperand(1)}, + N->getFlags()); +} + SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); @@ -4018,7 +4497,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { // Unary op widening. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); - return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp); + if (N->getNumOperands() == 1) + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp); + + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue Mask = + GetWidenedMask(N->getOperand(1), WidenVT.getVectorElementCount()); + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, + {InOp, Mask, N->getOperand(2)}); } SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) { @@ -4243,11 +4731,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_SUBVECTOR(SDNode *N) { } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { - EVT VT = N->getValueType(0); - EVT EltVT = VT.getVectorElementType(); - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - SDValue InOp = N->getOperand(0); - SDValue Idx = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue InOp = N->getOperand(0); + SDValue Idx = N->getOperand(1); SDLoc dl(N); auto InOpTypeAction = getTypeAction(InOp.getValueType()); @@ -4264,6 +4752,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { // Check if we can extract from the vector. 
unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
unsigned InNumElts = InVT.getVectorMinNumElements();
+ unsigned VTNumElts = VT.getVectorMinNumElements();
+ assert(IdxVal % VTNumElts == 0 &&
+ "Expected Idx to be a multiple of subvector minimum vector length");
if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
@@ -4277,8 +4768,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
// nxv2i64 extract_subvector(nxv16i64, 8)
// nxv2i64 extract_subvector(nxv16i64, 10)
// undef)
- unsigned VTNElts = VT.getVectorMinNumElements();
- unsigned GCD = greatestCommonDivisor(VTNElts, WidenNumElts);
+ unsigned GCD = greatestCommonDivisor(VTNumElts, WidenNumElts);
assert((IdxVal % GCD) == 0 && "Expected Idx to be a multiple of the broken " "down type's element count");
EVT PartVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
@@ -4287,7 +4777,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
if (getTypeAction(PartVT) != TargetLowering::TypeWidenVector) {
SmallVector<SDValue> Parts;
unsigned I = 0;
- for (; I < VTNElts / GCD; ++I)
+ for (; I < VTNumElts / GCD; ++I)
Parts.push_back( DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, PartVT, InOp, DAG.getVectorIdxConstant(IdxVal + I * GCD, dl)));
@@ -4304,9 +4794,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
// We could try widening the input to the right length but for now, extract
// the original elements, fill the rest with undefs and build a vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
- unsigned NumElts = VT.getVectorNumElements();
unsigned i;
- for (i = 0; i < NumElts; ++i)
+ for (i = 0; i < VTNumElts; ++i)
Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getVectorIdxConstant(IdxVal + i, dl));
@@ -4783,10 +5272,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_Select(SDNode *N) {
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
- return Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE
- ? DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2,
- N->getOperand(3))
- : DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2);
+ if (Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE)
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2,
+ N->getOperand(3));
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
@@ -4832,13 +5321,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
N->getOperand(0).getValueType().isVector() && "Operands must be vectors");
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
SDValue InOp1 = N->getOperand(0);
EVT InVT = InOp1.getValueType();
assert(InVT.isVector() && "can not widen non-vector type");
- EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(), WidenNumElts);
+ EVT WidenInVT =
+ EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), WidenEC);
// The input and output types often differ here, and it could be that while
// we'd prefer to widen the result type, the input operands have been split.
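The GCD decomposition used by WidenVecRes_EXTRACT_SUBVECTOR above can be checked with plain arithmetic: pieces of greatestCommonDivisor(VTNumElts, WidenNumElts) elements tile both the extracted type and the widened type exactly. A small sketch with assumed example counts (a 6-element scalable extract widened to 8 elements, starting index 8); the patch resumes after it.

    #include <cstdio>
    #include <numeric>

    int main() {
      unsigned VTNumElts = 6, WidenNumElts = 8, IdxVal = 8;
      unsigned GCD = std::gcd(VTNumElts, WidenNumElts); // 2
      for (unsigned I = 0; I < VTNumElts / GCD; ++I)    // real pieces
        std::printf("extract %u elts at index %u\n", GCD, IdxVal + I * GCD);
      for (unsigned I = VTNumElts / GCD; I < WidenNumElts / GCD; ++I)
        std::printf("piece %u: undef padding\n", I);    // pad to widened VT
      return 0;
    }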
@@ -4865,8 +5354,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) { InOp2.getValueType() == WidenInVT && "Input not widened to expected type!"); (void)WidenInVT; - return DAG.getNode(ISD::SETCC, SDLoc(N), - WidenVT, InOp1, InOp2, N->getOperand(2)); + if (N->getOpcode() == ISD::VP_SETCC) { + SDValue Mask = + GetWidenedMask(N->getOperand(3), WidenVT.getVectorElementCount()); + return DAG.getNode(ISD::VP_SETCC, SDLoc(N), WidenVT, InOp1, InOp2, + N->getOperand(2), Mask, N->getOperand(4)); + } + return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT, InOp1, InOp2, + N->getOperand(2)); } SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) { @@ -4946,6 +5441,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break; case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break; case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; + case ISD::IS_FPCLASS: Res = WidenVecOp_IS_FPCLASS(N); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -5098,6 +5594,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { return DAG.UnrollVectorOp(N); } +SDValue DAGTypeLegalizer::WidenVecOp_IS_FPCLASS(SDNode *N) { + SDLoc DL(N); + EVT ResultVT = N->getValueType(0); + SDValue Test = N->getOperand(1); + SDValue WideArg = GetWidenedVector(N->getOperand(0)); + + // Process this node similarly to SETCC. + EVT WideResultVT = getSetCCResultType(WideArg.getValueType()); + if (ResultVT.getScalarType() == MVT::i1) + WideResultVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WideResultVT.getVectorNumElements()); + + SDValue WideNode = DAG.getNode(ISD::IS_FPCLASS, DL, WideResultVT, + {WideArg, Test}, N->getFlags()); + + // Extract the needed results from the result vector. + EVT ResVT = + EVT::getVectorVT(*DAG.getContext(), WideResultVT.getVectorElementType(), + ResultVT.getVectorNumElements()); + SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, WideNode, + DAG.getVectorIdxConstant(0, DL)); + + EVT OpVT = N->getOperand(0).getValueType(); + ISD::NodeType ExtendCode = + TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); + return DAG.getNode(ExtendCode, DL, ResultVT, CC); +} + SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // Since the result is legal and the input is illegal. EVT VT = N->getValueType(0); @@ -5192,11 +5716,12 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { SDLoc dl(N); // Check if we can convert between two legal vector types and extract. - unsigned InWidenSize = InWidenVT.getSizeInBits(); - unsigned Size = VT.getSizeInBits(); + TypeSize InWidenSize = InWidenVT.getSizeInBits(); + TypeSize Size = VT.getSizeInBits(); // x86mmx is not an acceptable vector element type, so don't try. - if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) { - unsigned NewNumElts = InWidenSize / Size; + if (!VT.isVector() && VT != MVT::x86mmx && + InWidenSize.hasKnownScalarFactor(Size)) { + unsigned NewNumElts = InWidenSize.getKnownScalarFactor(Size); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts); if (TLI.isTypeLegal(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp); @@ -5211,9 +5736,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { // having to copy via memory. 
if (VT.isVector()) { EVT EltVT = VT.getVectorElementType(); - unsigned EltSize = EltVT.getSizeInBits(); - if (InWidenSize % EltSize == 0) { - unsigned NewNumElts = InWidenSize / EltSize; + unsigned EltSize = EltVT.getFixedSizeInBits(); + if (InWidenSize.isKnownMultipleOf(EltSize)) { + ElementCount NewNumElts = + (InWidenVT.getVectorElementCount() * InWidenVT.getScalarSizeInBits()) + .divideCoefficientBy(EltSize); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts); if (TLI.isTypeLegal(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp); @@ -5266,18 +5793,17 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { } SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) { + EVT VT = N->getValueType(0); SDValue SubVec = N->getOperand(1); SDValue InVec = N->getOperand(0); - if (getTypeAction(InVec.getValueType()) == TargetLowering::TypeWidenVector) - InVec = GetWidenedVector(InVec); - if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector) SubVec = GetWidenedVector(SubVec); - if (SubVec.getValueType() == InVec.getValueType() && InVec.isUndef() && + if (SubVec.getValueType().knownBitsLE(VT) && InVec.isUndef() && N->getConstantOperandVal(2) == 0) - return SubVec; + return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, InVec, SubVec, + N->getOperand(2)); report_fatal_error("Don't know how to widen the operands for " "INSERT_SUBVECTOR"); @@ -5500,11 +6026,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_VP_SCATTER(SDNode *N, unsigned OpNo) { Mask = GetWidenedMask(Mask, WideEC); WideMemVT = EVT::getVectorVT(*DAG.getContext(), VPSC->getMemoryVT().getScalarType(), WideEC); - } else if (OpNo == 4) { + } else if (OpNo == 3) { // Just widen the index. It's allowed to have extra elements. Index = GetWidenedVector(Index); } else - llvm_unreachable("Can't widen this operand of mscatter"); + llvm_unreachable("Can't widen this operand of VP_SCATTER"); SDValue Ops[] = { VPSC->getChain(), DataOp, VPSC->getBasePtr(), Index, Scale, Mask, @@ -5597,8 +6123,20 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { assert(NeutralElem && "Neutral element must exist"); // Pad the vector with the neutral element. - unsigned OrigElts = OrigVT.getVectorNumElements(); - unsigned WideElts = WideVT.getVectorNumElements(); + unsigned OrigElts = OrigVT.getVectorMinNumElements(); + unsigned WideElts = WideVT.getVectorMinNumElements(); + + if (WideVT.isScalableVector()) { + unsigned GCD = greatestCommonDivisor(OrigElts, WideElts); + EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + ElementCount::getScalable(GCD)); + SDValue SplatNeutral = DAG.getSplatVector(SplatVT, dl, NeutralElem); + for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD) + Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral, + DAG.getVectorIdxConstant(Idx, dl)); + return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags); + } + for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl)); @@ -5622,8 +6160,20 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, dl, ElemVT, Flags); // Pad the vector with the neutral element. 
- unsigned OrigElts = OrigVT.getVectorNumElements();
- unsigned WideElts = WideVT.getVectorNumElements();
+ unsigned OrigElts = OrigVT.getVectorMinNumElements();
+ unsigned WideElts = WideVT.getVectorMinNumElements();
+
+ if (WideVT.isScalableVector()) {
+ unsigned GCD = greatestCommonDivisor(OrigElts, WideElts);
+ EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ ElementCount::getScalable(GCD));
+ SDValue SplatNeutral = DAG.getSplatVector(SplatVT, dl, NeutralElem);
+ for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD)
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral,
+ DAG.getVectorIdxConstant(Idx, dl));
+ return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+ }
+
for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl));
@@ -5795,7 +6345,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
// Allow wider loads if they are sufficiently aligned to avoid memory faults
// and if the original load is simple.
unsigned LdAlign =
- (!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlignment();
+ (!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlign().value();
// Find the vector type that can load from.
Optional<EVT> FirstVT =
@@ -6103,7 +6653,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
EVT InVT = InOp.getValueType();
assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match");
- assert(!InVT.isScalableVector() && !NVT.isScalableVector() &&
+ assert(InVT.isScalableVector() == NVT.isScalableVector() &&
"cannot modify scalable vectors in this way");
SDLoc dl(InOp);
@@ -6111,10 +6661,10 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
if (InVT == NVT) return InOp;
- unsigned InNumElts = InVT.getVectorNumElements();
- unsigned WidenNumElts = NVT.getVectorNumElements();
- if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
- unsigned NumConcat = WidenNumElts / InNumElts;
+ ElementCount InEC = InVT.getVectorElementCount();
+ ElementCount WidenEC = NVT.getVectorElementCount();
+ if (WidenEC.hasKnownScalarFactor(InEC)) {
+ unsigned NumConcat = WidenEC.getKnownScalarFactor(InEC);
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) : DAG.getUNDEF(InVT);
@@ -6125,10 +6675,16 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
}
- if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+ if (InEC.hasKnownScalarFactor(WidenEC))
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp, DAG.getVectorIdxConstant(0, dl));
+ assert(!InVT.isScalableVector() && !NVT.isScalableVector() &&
+ "Scalable vectors should have been handled already.");
+
+ unsigned InNumElts = InEC.getFixedValue();
+ unsigned WidenNumElts = WidenEC.getFixedValue();
+
// Fall back to extract and build.
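Both VECREDUCE widenings above pad the input with the reduction's neutral element, which is exactly what leaves the reduced value unchanged (0 for add, 1 for mul, the identity element in general). A quick scalar check of that invariant; the extract-and-build code follows the sketch.

    #include <cassert>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<int> Orig = {3, 1, 4};
      std::vector<int> Widened = Orig;
      Widened.resize(8, /*NeutralElem=*/0); // pad to the widened length
      int A = std::accumulate(Orig.begin(), Orig.end(), 0);
      int B = std::accumulate(Widened.begin(), Widened.end(), 0);
      assert(A == B); // padding with the identity preserves the reduction
      return 0;
    }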
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = NVT.getVectorElementType();
@@ -6144,29 +6700,3 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, Ops[Idx] = FillVal; return DAG.getBuildVector(NVT, dl, Ops); }
-
-void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
- SDValue InLo, InHi;
- GetSplitVector(N->getOperand(0), InLo, InHi);
- SDLoc DL(N);
-
- Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
- Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
-}
-
-void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-
- SDValue Expanded = TLI.expandVectorSplice(N, DAG);
- Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Expanded,
- DAG.getVectorIdxConstant(0, DL));
- Hi =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Expanded,
- DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
-}
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 3d5c4c5b1cae..e0e8d503ca92 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/ResourcePriorityQueue.h"
#include "llvm/CodeGen/DFAPacketizer.h"
-#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -28,21 +27,18 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "scheduler"
-static cl::opt<bool> DisableDFASched("disable-dfa-sched", cl::Hidden,
- cl::ZeroOrMore, cl::init(false),
- cl::desc("Disable use of DFA during scheduling"));
+static cl::opt<bool>
+ DisableDFASched("disable-dfa-sched", cl::Hidden,
+ cl::desc("Disable use of DFA during scheduling"));
static cl::opt<int> RegPressureThreshold(
- "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5),
- cl::desc("Track reg pressure and switch priority to in-depth"));
+ "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::init(5),
+ cl::desc("Track reg pressure and switch priority to in-depth"));
ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) : Picker(this), InstrItins(IS->MF->getSubtarget().getInstrItineraryData()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index f64b332a7fef..9fcf692babdc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/DataTypes.h"
#include <utility>
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 1b89864116cb..78fc407e9573 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -11,16 +11,14 @@
//===----------------------------------------------------------------------===//
#include "InstrEmitter.h"
-#include "ScheduleDAGSDNodes.h"
#include "SDNodeDbgValue.h"
-#include "llvm/ADT/STLExtras.h" +#include "ScheduleDAGSDNodes.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -442,17 +440,29 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg, - std::vector &LiveRegDefs, + std::vector &LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const SDNode *Node = nullptr) { bool Added = false; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) { - if (RegAdded.insert(*AI).second) { - LRegs.push_back(*AI); - Added = true; - } + // Check if Ref is live. + if (!LiveRegDefs[*AI]) + continue; + + // Allow multiple uses of the same def. + if (LiveRegDefs[*AI] == SU) + continue; + + // Allow multiple uses of same def + if (Node && LiveRegDefs[*AI]->getNode() == Node) + continue; + + // Add Reg to the set of interfering live regs. + if (RegAdded.insert(*AI).second) { + LRegs.push_back(*AI); + Added = true; } } return Added; @@ -504,6 +514,15 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, } continue; } + + if (Node->getOpcode() == ISD::CopyToReg) { + Register Reg = cast(Node->getOperand(1))->getReg(); + if (Reg.isPhysical()) { + SDNode *SrcNode = Node->getOperand(2).getNode(); + CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI, SrcNode); + } + } + if (!Node->isMachineOpcode()) continue; const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); @@ -758,7 +777,8 @@ void ScheduleDAGLinearize::Schedule() { MachineBasicBlock* ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) { - InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos); + InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos, + DAG->getUseInstrRefDebugInfo()); DenseMap VRBaseMap; LLVM_DEBUG({ dbgs() << "\n*** Final schedule ***\n"; }); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 7a5e8ac6075e..8a04ce7535a1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1294,11 +1294,11 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. -static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, - SUnit **LiveRegDefs, +static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, SUnit **LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const SDNode *Node = nullptr) { for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) { // Check if Ref is live. @@ -1307,6 +1307,10 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, // Allow multiple uses of the same def. 
if (LiveRegDefs[*AliasI] == SU) continue;
+ // Allow multiple uses of same def
+ if (Node && LiveRegDefs[*AliasI]->getNode() == Node)
+ continue;
+
// Add Reg to the set of interfering live regs.
if (RegAdded.insert(*AliasI).second) { LRegs.push_back(*AliasI);
@@ -1387,6 +1391,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { continue; }
+ if (Node->getOpcode() == ISD::CopyToReg) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ if (Reg.isPhysical()) {
+ SDNode *SrcNode = Node->getOperand(2).getNode();
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI,
+ SrcNode);
+ }
+ }
+
if (!Node->isMachineOpcode()) continue;
// If we're in the middle of scheduling a call, don't begin scheduling
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 55f6f288f3e3..2a10157b404e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -843,7 +843,8 @@ EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, Register> &VRBaseMap,
/// not necessarily refer to returned BB. The emitter may split blocks.
MachineBasicBlock *ScheduleDAGSDNodes:: EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
- InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos);
+ InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos,
+ DAG->getUseInstrRefDebugInfo());
DenseMap<SDValue, Register> VRBaseMap;
DenseMap<SUnit*, Register> CopyVRBaseMap;
SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
@@ -883,7 +884,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
if (MI->isCandidateForCallSiteEntry() && DAG->getTarget().Options.EmitCallSiteInfo)
- MF.addCallArgsForwardingRegs(MI, DAG->getSDCallSiteInfo(Node));
+ MF.addCallArgsForwardingRegs(MI, DAG->getCallSiteInfo(Node));
if (DAG->getNoMergeSiteInfo(Node)) { MI->setFlag(MachineInstr::MIFlag::NoMerge);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 10940478010e..1ba1fd65b8c9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -19,19 +19,15 @@
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/ResourcePriorityQueue.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include <climits>
using namespace llvm;
#define DEBUG_TYPE "pre-RA-sched"
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d5998d166d25..b3b8756ae9ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -24,9 +24,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -55,7 +53,6 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
#include
"llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" @@ -144,11 +141,11 @@ bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) { unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (auto *Op0 = dyn_cast(N->getOperand(0))) { - SplatVal = Op0->getAPIntValue().truncOrSelf(EltSize); + SplatVal = Op0->getAPIntValue().trunc(EltSize); return true; } if (auto *Op0 = dyn_cast(N->getOperand(0))) { - SplatVal = Op0->getValueAPF().bitcastToAPInt().truncOrSelf(EltSize); + SplatVal = Op0->getValueAPF().bitcastToAPInt().trunc(EltSize); return true; } } @@ -714,6 +711,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(LD->getMemoryVT().getRawBits()); ID.AddInteger(LD->getRawSubclassData()); ID.AddInteger(LD->getPointerInfo().getAddrSpace()); + ID.AddInteger(LD->getMemOperand()->getFlags()); break; } case ISD::STORE: { @@ -721,6 +719,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); + ID.AddInteger(ST->getMemOperand()->getFlags()); break; } case ISD::VP_LOAD: { @@ -728,6 +727,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ELD->getMemoryVT().getRawBits()); ID.AddInteger(ELD->getRawSubclassData()); ID.AddInteger(ELD->getPointerInfo().getAddrSpace()); + ID.AddInteger(ELD->getMemOperand()->getFlags()); break; } case ISD::VP_STORE: { @@ -735,6 +735,21 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(EST->getMemoryVT().getRawBits()); ID.AddInteger(EST->getRawSubclassData()); ID.AddInteger(EST->getPointerInfo().getAddrSpace()); + ID.AddInteger(EST->getMemOperand()->getFlags()); + break; + } + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: { + const VPStridedLoadSDNode *SLD = cast(N); + ID.AddInteger(SLD->getMemoryVT().getRawBits()); + ID.AddInteger(SLD->getRawSubclassData()); + ID.AddInteger(SLD->getPointerInfo().getAddrSpace()); + break; + } + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: { + const VPStridedStoreSDNode *SST = cast(N); + ID.AddInteger(SST->getMemoryVT().getRawBits()); + ID.AddInteger(SST->getRawSubclassData()); + ID.AddInteger(SST->getPointerInfo().getAddrSpace()); break; } case ISD::VP_GATHER: { @@ -742,6 +757,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(EG->getMemoryVT().getRawBits()); ID.AddInteger(EG->getRawSubclassData()); ID.AddInteger(EG->getPointerInfo().getAddrSpace()); + ID.AddInteger(EG->getMemOperand()->getFlags()); break; } case ISD::VP_SCATTER: { @@ -749,6 +765,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ES->getMemoryVT().getRawBits()); ID.AddInteger(ES->getRawSubclassData()); ID.AddInteger(ES->getPointerInfo().getAddrSpace()); + ID.AddInteger(ES->getMemOperand()->getFlags()); break; } case ISD::MLOAD: { @@ -756,6 +773,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(MLD->getMemoryVT().getRawBits()); ID.AddInteger(MLD->getRawSubclassData()); ID.AddInteger(MLD->getPointerInfo().getAddrSpace()); + ID.AddInteger(MLD->getMemOperand()->getFlags()); break; } case ISD::MSTORE: { @@ -763,6 +781,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(MST->getMemoryVT().getRawBits()); ID.AddInteger(MST->getRawSubclassData()); ID.AddInteger(MST->getPointerInfo().getAddrSpace()); + 
ID.AddInteger(MST->getMemOperand()->getFlags());
break; }
case ISD::MGATHER: {
ID.AddInteger(MG->getMemoryVT().getRawBits());
ID.AddInteger(MG->getRawSubclassData());
ID.AddInteger(MG->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MG->getMemOperand()->getFlags());
break; }
case ISD::MSCATTER: {
ID.AddInteger(MS->getMemoryVT().getRawBits());
ID.AddInteger(MS->getRawSubclassData());
ID.AddInteger(MS->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MS->getMemOperand()->getFlags());
break; }
case ISD::ATOMIC_CMP_SWAP:
@@ -799,11 +820,13 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(AT->getMemoryVT().getRawBits());
ID.AddInteger(AT->getRawSubclassData());
ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+ ID.AddInteger(AT->getMemOperand()->getFlags());
break; }
case ISD::PREFETCH: {
const MemSDNode *PF = cast<MemSDNode>(N);
ID.AddInteger(PF->getPointerInfo().getAddrSpace());
+ ID.AddInteger(PF->getMemOperand()->getFlags());
break; }
case ISD::VECTOR_SHUFFLE: {
@@ -821,11 +844,18 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(BA->getTargetFlags());
break; }
+ case ISD::AssertAlign:
+ ID.AddInteger(cast<AssertAlignSDNode>(N)->getAlign().value());
+ break;
} // end switch (N->getOpcode())
- // Target specific memory nodes could also have address spaces to check.
- if (N->isTargetMemoryOpcode())
- ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
+ // Target specific memory nodes could also have address spaces and flags
+ // to check.
+ if (N->isTargetMemoryOpcode()) {
+ const MemSDNode *MN = cast<MemSDNode>(N);
+ ID.AddInteger(MN->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MN->getMemOperand()->getFlags());
+ }
}
/// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID
@@ -1395,6 +1425,12 @@ SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) { return getNode(ISD::XOR, DL, VT, Val, TrueValue); }
+SDValue SelectionDAG::getVPLogicalNOT(const SDLoc &DL, SDValue Val,
+ SDValue Mask, SDValue EVL, EVT VT) {
+ SDValue TrueValue = getBoolConstant(true, DL, VT, VT);
+ return getNode(ISD::VP_XOR, DL, VT, Val, TrueValue, Mask, EVL);
+}
+
SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT) { if (!V)
@@ -2433,23 +2469,9 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits) { if (VT.isScalableVector()) return SDValue();
- APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnes(VT.getVectorNumElements())
- : APInt(1, 1);
- return GetDemandedBits(V, DemandedBits, DemandedElts);
-}
-
-/// See if the specified operand can be simplified with the knowledge that only
-/// the bits specified by DemandedBits are used in the elements specified by
-/// DemandedElts.
-/// TODO: really we should be making this into the DAG equivalent of
-/// SimplifyMultipleUseDemandedBits and not generate any new nodes.
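The AddNodeIDCustom additions in the hunk above fold the MachineMemOperand flags into each memory node's FoldingSet profile, so two nodes that differ only in those flags (one volatile, say) no longer CSE to a single node. Below is a loose standalone model of that keying; the fields and values are made up for illustration and these are not SelectionDAG's data structures. The patch resumes after the sketch.

    // Model: include the memory-operand flags in the CSE key, so loads that
    // differ only in flags stay distinct.
    #include <cstdint>
    #include <cstdio>
    #include <set>
    #include <tuple>

    using NodeKey = std::tuple<unsigned /*MemVT*/, unsigned /*AddrSpace*/,
                               uint64_t /*MMOFlags*/>;

    int main() {
      std::set<NodeKey> Nodes;
      Nodes.insert({1, 0, 0});      // plain load
      Nodes.insert({1, 0, 1 << 2}); // same load, hypothetical volatile flag
      std::printf("distinct nodes: %zu\n", Nodes.size()); // 2, not merged
      return 0;
    }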
-SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
- const APInt &DemandedElts) {
switch (V.getOpcode()) {
default:
- return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, DemandedElts,
- *this);
+ return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, *this);
case ISD::Constant: {
const APInt &CVal = cast<ConstantSDNode>(V)->getAPIntValue();
APInt NewVal = CVal & DemandedBits;
@@ -2469,8 +2491,8 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits, if (Amt >= DemandedBits.getBitWidth()) break;
APInt SrcDemandedBits = DemandedBits << Amt;
- if (SDValue SimplifyLHS =
- GetDemandedBits(V.getOperand(0), SrcDemandedBits))
+ if (SDValue SimplifyLHS = TLI->SimplifyMultipleUseDemandedBits(
+ V.getOperand(0), SrcDemandedBits, *this))
return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); }
@@ -2503,6 +2525,14 @@ bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask, return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero); }
+/// MaskedVectorIsZero - Return true if 'Op' is known to be zero in
+/// DemandedElts. We use this predicate to simplify operations downstream.
+bool SelectionDAG::MaskedVectorIsZero(SDValue V, const APInt &DemandedElts,
+ unsigned Depth /* = 0 */) const {
+ APInt Mask = APInt::getAllOnes(V.getScalarValueSizeInBits());
+ return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero);
+}
+
/// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask'.
bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask, unsigned Depth) const {
@@ -2587,9 +2617,9 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, return true; }
case ISD::VECTOR_SHUFFLE: {
- // Check if this is a shuffle node doing a splat.
- // TODO: Do we need to handle shuffle(splat, undef, mask)?
- int SplatIndex = -1;
+ // Check if this is a shuffle node doing a splat or a shuffle of a splat.
+ APInt DemandedLHS = APInt::getNullValue(NumElts);
+ APInt DemandedRHS = APInt::getNullValue(NumElts);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask();
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
@@ -2599,11 +2629,30 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, }
if (!DemandedElts[i]) continue;
- if (0 <= SplatIndex && SplatIndex != M)
- return false;
- SplatIndex = M;
+ if (M < (int)NumElts)
+ DemandedLHS.setBit(M);
+ else
+ DemandedRHS.setBit(M - NumElts);
}
- return true;
+
+ // If we aren't demanding either op, assume there's no splat.
+ // If we are demanding both ops, assume there's no splat.
+ if ((DemandedLHS.isZero() && DemandedRHS.isZero()) ||
+ (!DemandedLHS.isZero() && !DemandedRHS.isZero()))
+ return false;
+
+ // See if the demanded elts of the source op is a splat or we only demand
+ // one element, which should always be a splat.
+ // TODO: Handle source ops splats with undefs.
+ auto CheckSplatSrc = [&](SDValue Src, const APInt &SrcElts) {
+ APInt SrcUndefs;
+ return (SrcElts.countPopulation() == 1) ||
+ (isSplatValue(Src, SrcElts, SrcUndefs, Depth + 1) &&
+ (SrcElts & SrcUndefs).isZero());
+ };
+ if (!DemandedLHS.isZero())
+ return CheckSplatSrc(V.getOperand(0), DemandedLHS);
+ return CheckSplatSrc(V.getOperand(1), DemandedRHS);
}
case ISD::EXTRACT_SUBVECTOR: {
// Offset the demanded elts by the subvector index.
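The isSplatValue change above classifies each demanded shuffle lane by the operand it reads; a splat remains plausible only when exactly one operand ends up demanded. A simplified standalone sketch of that classification (undef handling reduced to skipping the lane); the EXTRACT_SUBVECTOR hunk continues after it.

    #include <cstdio>
    #include <vector>

    int main() {
      const int NumElts = 4;
      std::vector<int> Mask = {5, 5, -1, 5}; // demanded lanes all read RHS[1]
      std::vector<bool> Demanded = {true, true, false, true};
      std::vector<bool> DemandedLHS(NumElts), DemandedRHS(NumElts);
      for (int I = 0; I < NumElts; ++I) {
        if (!Demanded[I] || Mask[I] < 0)
          continue;
        if (Mask[I] < NumElts)
          DemandedLHS[Mask[I]] = true;
        else
          DemandedRHS[Mask[I] - NumElts] = true;
      }
      auto Any = [](const std::vector<bool> &V) {
        for (bool B : V)
          if (B)
            return true;
        return false;
      };
      // Exactly one side is demanded here, so a splat is still possible.
      std::printf("LHS demanded: %d, RHS demanded: %d\n", Any(DemandedLHS),
                  Any(DemandedRHS));
      return 0;
    }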
@@ -2614,7 +2663,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, uint64_t Idx = V.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); APInt UndefSrcElts; - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) { UndefElts = UndefSrcElts.extractBits(NumElts, Idx); return true; @@ -2631,9 +2680,49 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, return false; unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); APInt UndefSrcElts; - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts); if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) { - UndefElts = UndefSrcElts.truncOrSelf(NumElts); + UndefElts = UndefSrcElts.trunc(NumElts); + return true; + } + break; + } + case ISD::BITCAST: { + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + unsigned SrcBitWidth = SrcVT.getScalarSizeInBits(); + unsigned BitWidth = VT.getScalarSizeInBits(); + + // Ignore bitcasts from unsupported types. + // TODO: Add fp support? + if (!SrcVT.isVector() || !SrcVT.isInteger() || !VT.isInteger()) + break; + + // Bitcast 'small element' vector to 'large element' vector. + if ((BitWidth % SrcBitWidth) == 0) { + // See if each sub element is a splat. + unsigned Scale = BitWidth / SrcBitWidth; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + APInt ScaledDemandedElts = + APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + for (unsigned I = 0; I != Scale; ++I) { + APInt SubUndefElts; + APInt SubDemandedElt = APInt::getOneBitSet(Scale, I); + APInt SubDemandedElts = APInt::getSplat(NumSrcElts, SubDemandedElt); + SubDemandedElts &= ScaledDemandedElts; + if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1)) + return false; + + // Here we can't do "MatchAnyBits" operation merge for undef bits. + // Because some operation only use part value of the source. + // Take llvm.fshl.* for example: + // t1: v4i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32 + // t2: v2i64 = bitcast t1 + // t5: v2i64 = fshl t3, t4, t2 + // We can not convert t2 to {i64 undef, i64 undef} + UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts, + /*MatchAllBits=*/true); + } return true; } break; @@ -2978,7 +3067,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; uint64_t Idx = Op.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); Known = computeKnownBits(Src, DemandedSrcElts, Depth + 1); break; } @@ -3083,9 +3172,18 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1); // TODO: SelfMultiply can be poison, but not undef. - SelfMultiply &= isGuaranteedNotToBeUndefOrPoison( - Op.getOperand(0), DemandedElts, false, Depth + 1); + if (SelfMultiply) + SelfMultiply &= isGuaranteedNotToBeUndefOrPoison( + Op.getOperand(0), DemandedElts, false, Depth + 1); Known = KnownBits::mul(Known, Known2, SelfMultiply); + + // If the multiplication is known not to overflow, the product of a number + // with itself is non-negative. 
Only do this if we didn't already compute + // the opposite value for the sign bit. + if (Op->getFlags().hasNoSignedWrap() && + Op.getOperand(0) == Op.getOperand(1) && + !Known.isNegative()) + Known.makeNonNegative(); break; } case ISD::MULHU: { @@ -3128,6 +3226,16 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::udiv(Known, Known2); break; } + case ISD::AVGCEILU: { + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known = Known.zext(BitWidth + 1); + Known2 = Known2.zext(BitWidth + 1); + KnownBits One = KnownBits::makeConstant(APInt(1, 1)); + Known = KnownBits::computeForAddCarry(Known, Known2, One); + Known = Known.extractBits(BitWidth, 1); + break; + } case ISD::SELECT: case ISD::VSELECT: Known = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1); @@ -3330,7 +3438,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } case ISD::ZERO_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); - APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements()); + APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements()); Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1); Known = Known.zext(BitWidth); break; @@ -3342,7 +3450,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } case ISD::SIGN_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); - APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements()); + APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements()); Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1); // If the sign bit is known to be zero or one, then sext will extend // it to the top bits, else it will just zext. @@ -3358,7 +3466,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } case ISD::ANY_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); - APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements()); + APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements()); Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1); Known = Known.anyext(BitWidth); break; @@ -3605,6 +3713,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::smax(Known, Known2); else Known = KnownBits::smin(Known, Known2); + + // For SMAX, if CstLow is non-negative we know the result will be + // non-negative and thus all sign bits are 0. + // TODO: There's an equivalent of this for smin with negative constant for + // known ones. 
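// A standalone check, not part of the patch: the ISD::AVGCEILU known-bits
// case above models the unsigned rounding-up average exactly as defined,
// (a + b + 1) >> 1 computed in BitWidth + 1 bits so the intermediate sum
// cannot wrap. The identity on 8-bit values:
#include <cassert>
#include <cstdint>

int main() {
  uint8_t A = 250, B = 251;
  uint16_t Wide = uint16_t(A) + uint16_t(B) + 1; // 9-bit math, no wraparound
  uint8_t Avg = uint8_t(Wide >> 1);              // extract bits [1, 8]
  assert(Avg == 251);
  // Doing the same in 8 bits would wrap: (250 + 251 + 1) mod 256 = 246,
  // and 246 >> 1 = 123 is the wrong answer.
  return 0;
}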
+ if (IsMax && CstLow) { + const APInt &ValueLow = CstLow->getAPIntValue(); + if (ValueLow.isNonNegative()) { + unsigned SignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1); + Known.Zero.setHighBits(std::min(SignBits, ValueLow.getNumSignBits())); + } + } + break; } case ISD::FP_TO_UINT_SAT: { @@ -3905,7 +4026,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::SIGN_EXTEND_VECTOR_INREG: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(SrcVT.getVectorNumElements()); + APInt DemandedSrcElts = DemandedElts.zext(SrcVT.getVectorNumElements()); Tmp = VTBits - SrcVT.getScalarSizeInBits(); return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp; } @@ -4192,7 +4313,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, break; uint64_t Idx = Op.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); return ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1); } case ISD::CONCAT_VECTORS: { @@ -4585,26 +4706,54 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } +// Only bits set in Mask must be negated, other bits may be arbitrary. +SDValue llvm::getBitwiseNotOperand(SDValue V, SDValue Mask, bool AllowUndefs) { + if (isBitwiseNot(V, AllowUndefs)) + return V.getOperand(0); + + // Handle any_extend (not (truncate X)) pattern, where Mask only sets + // bits in the non-extended part. + ConstantSDNode *MaskC = isConstOrConstSplat(Mask); + if (!MaskC || V.getOpcode() != ISD::ANY_EXTEND) + return SDValue(); + SDValue ExtArg = V.getOperand(0); + if (ExtArg.getScalarValueSizeInBits() >= + MaskC->getAPIntValue().getActiveBits() && + isBitwiseNot(ExtArg, AllowUndefs) && + ExtArg.getOperand(0).getOpcode() == ISD::TRUNCATE && + ExtArg.getOperand(0).getOperand(0).getValueType() == V.getValueType()) + return ExtArg.getOperand(0).getOperand(0); + return SDValue(); +} + +static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) { + // Match masked merge pattern (X & ~M) op (Y & M) + // Including degenerate case (X & ~M) op M + auto MatchNoCommonBitsPattern = [&](SDValue Not, SDValue Mask, + SDValue Other) { + if (SDValue NotOperand = + getBitwiseNotOperand(Not, Mask, /* AllowUndefs */ true)) { + if (Other == NotOperand) + return true; + if (Other->getOpcode() == ISD::AND) + return NotOperand == Other->getOperand(0) || + NotOperand == Other->getOperand(1); + } + return false; + }; + if (A->getOpcode() == ISD::AND) + return MatchNoCommonBitsPattern(A->getOperand(0), A->getOperand(1), B) || + MatchNoCommonBitsPattern(A->getOperand(1), A->getOperand(0), B); + return false; +} + // FIXME: unify with llvm::haveNoCommonBitsSet. 
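// A standalone check, not part of the patch: the masked-merge pattern matched
// by haveNoCommonBitsSetCommutative above is disjoint by construction, since
// (X & ~M) and (Y & M) can never share a set bit, so an ADD of the two halves
// is equivalent to an OR:
#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEF, Y = 0x12345678, M = 0x00FF00FF;
  uint32_t Lo = X & ~M, Hi = Y & M;
  assert((Lo & Hi) == 0);       // no common bits, for any X, Y, M
  assert((Lo | Hi) == Lo + Hi); // hence OR and ADD agree
  return 0;
}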
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); - // Match masked merge pattern (X & ~M) op (Y & M) - if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) { - auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) { - if (isBitwiseNot(NotM, true)) { - SDValue NotOperand = NotM->getOperand(0); - return NotOperand == And->getOperand(0) || - NotOperand == And->getOperand(1); - } - return false; - }; - if (MatchNoCommonBitsPattern(A->getOperand(0), B) || - MatchNoCommonBitsPattern(A->getOperand(1), B) || - MatchNoCommonBitsPattern(B->getOperand(0), A) || - MatchNoCommonBitsPattern(B->getOperand(1), A)) - return true; - } + if (haveNoCommonBitsSetCommutative(A, B) || + haveNoCommonBitsSetCommutative(B, A)) + return true; return KnownBits::haveNoCommonBitsSet(computeKnownBits(A), computeKnownBits(B)); } @@ -4833,9 +4982,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); - case ISD::FP16_TO_FP: { + case ISD::FP16_TO_FP: + case ISD::BF16_TO_FP: { bool Ignored; - APFloat FPV(APFloat::IEEEhalf(), + APFloat FPV(Opcode == ISD::FP16_TO_FP ? APFloat::IEEEhalf() + : APFloat::BFloat(), (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); // This can return overflow, underflow, or inexact; we don't care. @@ -4909,11 +5060,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; - case ISD::FP_TO_FP16: { + case ISD::FP_TO_FP16: + case ISD::FP_TO_BF16: { bool Ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. - (void)V.convert(APFloat::IEEEhalf(), + (void)V.convert(Opcode == ISD::FP_TO_FP16 ? 
APFloat::IEEEhalf() + : APFloat::BFloat(), APFloat::rmNearestTiesToEven, &Ignored); return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); } @@ -4965,6 +5118,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::FREEZE: assert(VT == Operand.getValueType() && "Unexpected VT!"); + if (isGuaranteedNotToBeUndefOrPoison(Operand)) + return Operand; break; case ISD::TokenFactor: case ISD::MERGE_VALUES: @@ -5114,7 +5269,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid ABS!"); if (OpOpcode == ISD::UNDEF) - return getUNDEF(VT); + return getConstant(0, DL, VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && @@ -5182,6 +5337,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (Operand.getValueType().getScalarType() == MVT::i1) return getNOT(DL, Operand, Operand.getValueType()); break; + case ISD::VECREDUCE_ADD: + if (Operand.getValueType().getScalarType() == MVT::i1) + return getNode(ISD::VECREDUCE_XOR, DL, VT, Operand); + break; case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: if (Operand.getValueType().getScalarType() == MVT::i1) @@ -5273,6 +5432,30 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1, APInt C2Ext = C2.zext(FullWidth); return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth()); } + case ISD::AVGFLOORS: { + unsigned FullWidth = C1.getBitWidth() + 1; + APInt C1Ext = C1.sext(FullWidth); + APInt C2Ext = C2.sext(FullWidth); + return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1); + } + case ISD::AVGFLOORU: { + unsigned FullWidth = C1.getBitWidth() + 1; + APInt C1Ext = C1.zext(FullWidth); + APInt C2Ext = C2.zext(FullWidth); + return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1); + } + case ISD::AVGCEILS: { + unsigned FullWidth = C1.getBitWidth() + 1; + APInt C1Ext = C1.sext(FullWidth); + APInt C2Ext = C2.sext(FullWidth); + return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1); + } + case ISD::AVGCEILU: { + unsigned FullWidth = C1.getBitWidth() + 1; + APInt C1Ext = C1.zext(FullWidth); + APInt C2Ext = C2.zext(FullWidth); + return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1); + } } return llvm::None; } @@ -5355,7 +5538,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (!FoldAttempt) return SDValue(); - SDValue Folded = getConstant(FoldAttempt.getValue(), DL, VT); + SDValue Folded = getConstant(*FoldAttempt, DL, VT); assert((!Folded || !VT.isVector()) && "Can't fold vectors ops with scalar operands"); return Folded; @@ -5400,7 +5583,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, Optional<APInt> Fold = FoldValue(Opcode, RawBits1[I], RawBits2[I]); if (!Fold) break; - RawBits.push_back(Fold.getValue()); + RawBits.push_back(*Fold); } if (RawBits.size() == NumElts.getFixedValue()) { // We have constant folded, but we need to cast this again back to @@ -5416,7 +5599,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, for (unsigned I = 0, E = DstBits.size(); I != E; ++I) { if (DstUndefs[I]) continue; - Ops[I] = getConstant(DstBits[I].sextOrSelf(BVEltBits), DL, BVEltVT); + Ops[I] = getConstant(DstBits[I].sext(BVEltBits), DL, BVEltVT); } return getBitcast(VT, getBuildVector(BVVT, DL, Ops)); } @@ -5455,9 +5638,14 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, !llvm::all_of(Ops, IsScalarOrSameVectorSize) return SDValue(); 
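// A standalone check, not part of the patch: all four AVG cases added to
// FoldValue above share one recipe, extend both constants by one bit (zero-
// or sign-extend to match the unsigned/signed flavor), add (plus one for the
// CEIL forms), then extract BitWidth bits starting at bit 1, which drops the
// low bit, i.e. divides by two. The same arithmetic on plain integers:
#include <cassert>
#include <cstdint>

int main() {
  auto AvgFloorU8 = [](uint8_t A, uint8_t B) {
    return uint8_t((uint16_t(A) + uint16_t(B)) >> 1);
  };
  auto AvgCeilU8 = [](uint8_t A, uint8_t B) {
    return uint8_t((uint16_t(A) + uint16_t(B) + 1) >> 1);
  };
  assert(AvgFloorU8(200, 101) == 150); // floor(301 / 2)
  assert(AvgCeilU8(200, 101) == 151);  // ceil(301 / 2)
  return 0;
}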
- // If we are comparing vectors, then the result needs to be a i1 boolean - // that is then sign-extended back to the legal result type. + // If we are comparing vectors, then the result needs to be a i1 boolean that + // is then extended back to the legal result type depending on how booleans + // are represented. EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); + ISD::NodeType ExtendCode = + (Opcode == ISD::SETCC && SVT != VT.getScalarType()) + ? TargetLowering::getExtendForContent(TLI->getBooleanContents(VT)) + : ISD::SIGN_EXTEND; // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. @@ -5494,8 +5682,18 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // Build vector (integer) scalar operands may need implicit // truncation - do this before constant folding. - if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) + if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) { + // Don't create illegally-typed nodes unless they're constants or undef + // - if we fail to constant fold we can't guarantee the (dead) nodes + // we're creating will be cleaned up before being visited for + // legalization. + if (NewNodesMustHaveLegalTypes && !ScalarOp.isUndef() && + !isa(ScalarOp) && + TLI->getTypeAction(*getContext(), InSVT) != + TargetLowering::TypeLegal) + return SDValue(); ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp); + } ScalarOps.push_back(ScalarOp); } @@ -5505,7 +5703,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // Legalize the (integer) scalar constant if necessary. if (LegalSVT != SVT) - ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); + ScalarResult = getNode(ExtendCode, DL, LegalSVT, ScalarResult); // Scalar folding only succeeded if the result is a constant or UNDEF. if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant && @@ -5629,20 +5827,34 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getNode(Opcode, DL, VT, N1, N2, Flags); } +void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1, + SDValue &N2) const { + if (!TLI->isCommutativeBinOp(Opcode)) + return; + + // Canonicalize: + // binop(const, nonconst) -> binop(nonconst, const) + bool IsN1C = isConstantIntBuildVectorOrConstantInt(N1); + bool IsN2C = isConstantIntBuildVectorOrConstantInt(N2); + bool IsN1CFP = isConstantFPBuildVectorOrConstantFP(N1); + bool IsN2CFP = isConstantFPBuildVectorOrConstantFP(N2); + if ((IsN1C && !IsN2C) || (IsN1CFP && !IsN2CFP)) + std::swap(N1, N2); + + // Canonicalize: + // binop(splat(x), step_vector) -> binop(step_vector, splat(x)) + else if (N1.getOpcode() == ISD::SPLAT_VECTOR && + N2.getOpcode() == ISD::STEP_VECTOR) + std::swap(N1, N2); +} + SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags Flags) { assert(N1.getOpcode() != ISD::DELETED_NODE && N2.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!"); - // Canonicalize constant to RHS if commutative. 
- if (TLI->isCommutativeBinOp(Opcode)) { - bool IsN1C = isConstantIntBuildVectorOrConstantInt(N1); - bool IsN2C = isConstantIntBuildVectorOrConstantInt(N2); - bool IsN1CFP = isConstantFPBuildVectorOrConstantFP(N1); - bool IsN2CFP = isConstantFPBuildVectorOrConstantFP(N2); - if ((IsN1C && !IsN2C) || (IsN1CFP && !IsN2CFP)) - std::swap(N1, N2); - } + + canonicalizeCommutativeBinop(Opcode, N1, N2); auto *N1C = dyn_cast<ConstantSDNode>(N1); auto *N2C = dyn_cast<ConstantSDNode>(N2); @@ -5946,6 +6158,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1Op2C->getZExtValue() == N2C->getZExtValue()) { if (VT == N1.getOperand(1).getValueType()) return N1.getOperand(1); + if (VT.isFloatingPoint()) { + assert(VT.getSizeInBits() > N1.getOperand(1).getValueType().getSizeInBits()); + return getFPExtendOrRound(N1.getOperand(1), DL, VT); + } return getSExtOrTrunc(N1.getOperand(1), DL, VT); } return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2); @@ -6043,9 +6259,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, std::swap(N1, N2); } else { switch (Opcode) { - case ISD::SIGN_EXTEND_INREG: case ISD::SUB: return getUNDEF(VT); // fold op(undef, arg2) -> undef + case ISD::SIGN_EXTEND_INREG: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: @@ -6534,7 +6750,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign = NewAlign / 2; + NewAlign = NewAlign.previous(); if (NewAlign > Alignment) { // Give the stack frame object a larger alignment if needed. @@ -6782,17 +6998,18 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, /// \param Size Number of bytes to write. /// \param Alignment Alignment of the destination in bytes. /// \param isVol True if destination is volatile. +/// \param AlwaysInline Makes sure no function call is generated. /// \param DstPtrInfo IR information on the memory pointer. /// \returns New head in the control flow, if lowering was successful, empty /// SDValue otherwise. /// /// The function tries to replace 'llvm.memset' intrinsic with several store /// operations and value calculation code. This is usually profitable for small -/// memory size. +/// memory size or when the semantics require inlining. static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, Align Alignment, bool isVol, - MachinePointerInfo DstPtrInfo, + bool AlwaysInline, MachinePointerInfo DstPtrInfo, const AAMDNodes &AAInfo) { // Turn a memset of undef to nop. // FIXME: We need to honor volatile even if Src is undef. @@ -6812,8 +7029,10 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, DstAlignCanChange = true; bool IsZeroVal = isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero(); + unsigned Limit = AlwaysInline ? 
~0 : TLI.getMaxStoresPerMemset(OptSize); + if (!TLI.findOptimalMemOpLowering( - MemOps, TLI.getMaxStoresPerMemset(OptSize), + MemOps, Limit, MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol), DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes())) return SDValue(); @@ -6964,10 +7183,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, } SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl, - SDValue Dst, unsigned DstAlign, - SDValue Src, unsigned SrcAlign, - SDValue Size, Type *SizeTy, - unsigned ElemSz, bool isTailCall, + SDValue Dst, SDValue Src, SDValue Size, + Type *SizeTy, unsigned ElemSz, + bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Emit a library call. @@ -7067,10 +7285,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, } SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, - SDValue Dst, unsigned DstAlign, - SDValue Src, unsigned SrcAlign, - SDValue Size, Type *SizeTy, - unsigned ElemSz, bool isTailCall, + SDValue Dst, SDValue Src, SDValue Size, + Type *SizeTy, unsigned ElemSz, + bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Emit a library call. @@ -7109,7 +7326,7 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVol, bool isTailCall, + bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, const AAMDNodes &AAInfo) { // Check to see if we should lower the memset to stores first. @@ -7122,7 +7339,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment, - isVol, DstPtrInfo, AAInfo); + isVol, false, DstPtrInfo, AAInfo); if (Result.getNode()) return Result; @@ -7132,45 +7349,75 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemset( - *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo); + *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo); if (Result.getNode()) return Result; } + // If we really need inline code and the target declined to provide it, + // use a (potentially long) sequence of loads and stores. + if (AlwaysInline) { + assert(ConstantSize && "AlwaysInline requires a constant size!"); + SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src, + ConstantSize->getZExtValue(), Alignment, + isVol, true, DstPtrInfo, AAInfo); + assert(Result && + "getMemsetStores must return a valid sequence when AlwaysInline"); + return Result; + } + checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); // Emit a library call. 
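// A standalone sketch, not part of the patch: with AlwaysInline set, the
// store-count cap passed to findOptimalMemOpLowering becomes effectively
// unlimited (~0), so a constant-size memset can always be expanded to
// stores, which the mandatory-inline path above relies on. A reference for
// the kind of code that expansion produces (simplified; the real lowering
// picks store types from the target and handles misaligned edges):
#include <cstddef>
#include <cstdint>
#include <cstring>

void inlineMemsetRef(uint8_t *Dst, uint8_t Val, size_t Size) {
  uint64_t Splat = 0x0101010101010101ULL * Val; // splat the byte 8 times
  size_t I = 0;
  for (; I + 8 <= Size; I += 8)
    std::memcpy(Dst + I, &Splat, 8); // one 64-bit store per iteration
  for (; I < Size; ++I)
    Dst[I] = Val; // byte-sized tail
}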
- TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = Type::getInt8PtrTy(*getContext()); - Args.push_back(Entry); - Entry.Node = Src; - Entry.Ty = Src.getValueType().getTypeForEVT(*getContext()); - Args.push_back(Entry); - Entry.Node = Size; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Args.push_back(Entry); + auto &Ctx = *getContext(); + const auto &DL = getDataLayout(); - // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); - CLI.setDebugLoc(dl) - .setChain(Chain) - .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) - .setDiscardResult() - .setTailCall(isTailCall); + // FIXME: pass in SDLoc + CLI.setDebugLoc(dl).setChain(Chain); + + ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src); + const bool SrcIsZero = ConstantSrc && ConstantSrc->isZero(); + const char *BzeroName = getTargetLoweringInfo().getLibcallName(RTLIB::BZERO); + + // Helper function to create an Entry from Node and Type. + const auto CreateEntry = [](SDValue Node, Type *Ty) { + TargetLowering::ArgListEntry Entry; + Entry.Node = Node; + Entry.Ty = Ty; + return Entry; + }; - std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI); + // If zeroing out and bzero is present, use it. + if (SrcIsZero && BzeroName) { + TargetLowering::ArgListTy Args; + Args.push_back(CreateEntry(Dst, Type::getInt8PtrTy(Ctx))); + Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + CLI.setLibCallee( + TLI->getLibcallCallingConv(RTLIB::BZERO), Type::getVoidTy(Ctx), + getExternalSymbol(BzeroName, TLI->getPointerTy(DL)), std::move(Args)); + } else { + TargetLowering::ArgListTy Args; + Args.push_back(CreateEntry(Dst, Type::getInt8PtrTy(Ctx))); + Args.push_back(CreateEntry(Src, Src.getValueType().getTypeForEVT(Ctx))); + Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), + Dst.getValueType().getTypeForEVT(Ctx), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), + TLI->getPointerTy(DL)), + std::move(Args)); + } + + CLI.setDiscardResult().setTailCall(isTailCall); + + std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl, - SDValue Dst, unsigned DstAlign, - SDValue Value, SDValue Size, Type *SizeTy, - unsigned ElemSz, bool isTailCall, + SDValue Dst, SDValue Value, SDValue Size, + Type *SizeTy, unsigned ElemSz, + bool isTailCall, MachinePointerInfo DstPtrInfo) { // Emit a library call. 
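// A standalone check, not part of the patch: the rewritten libcall emission
// above picks bzero over memset when the stored value is a constant zero and
// the target advertises RTLIB::BZERO. The two calls are equivalent for that
// case; bzero is POSIX and assumed available only for this demonstration:
#include <cassert>
#include <cstring>
#include <strings.h>

int main() {
  char A[16], B[16];
  memset(A, 0, sizeof(A));
  bzero(B, sizeof(B));
  assert(memcmp(A, B, sizeof(A)) == 0);
  return 0;
}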
TargetLowering::ArgListTy Args; @@ -7214,6 +7461,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, ID.AddInteger(MemVT.getRawBits()); AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7326,6 +7574,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( Opcode, dl.getIROrder(), VTList, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7498,6 +7747,7 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7599,6 +7849,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ISD::UNINDEXED, false, VT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7665,6 +7916,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7693,6 +7945,7 @@ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl, ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); + ID.AddInteger(ST->getMemOperand()->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); @@ -7750,6 +8003,7 @@ SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7842,6 +8096,7 @@ SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7912,6 +8167,7 @@ SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ISD::UNINDEXED, true, IsCompressing, SVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -7942,6 +8198,7 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue 
OrigStore, const SDLoc &dl, ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); + ID.AddInteger(ST->getMemOperand()->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); @@ -7958,6 +8215,259 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, return V; } +SDValue SelectionDAG::getStridedLoadVP( + ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, + SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, + SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment, + MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, + const MDNode *Ranges, bool IsExpanding) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + + MMOFlags |= MachineMemOperand::MOLoad; + assert((MMOFlags & MachineMemOperand::MOStore) == 0); + // If we don't have a PtrInfo, infer the trivial frame index case to simplify + // clients. + if (PtrInfo.V.isNull()) + PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset); + + uint64_t Size = MemoryLocation::UnknownSize; + MachineFunction &MF = getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, + Alignment, AAInfo, Ranges); + return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask, + EVL, MemVT, MMO, IsExpanding); +} + +SDValue SelectionDAG::getStridedLoadVP( + ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, + SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, + SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) { + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!"); + + SDValue Ops[] = {Chain, Ptr, Offset, Stride, Mask, EVL}; + SDVTList VTs = Indexed ? 
getVTList(VT, Ptr.getValueType(), MVT::Other) + : getVTList(VT, MVT::Other); + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData( + DL.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + + auto *N = + newSDNode(DL.getIROrder(), DL.getDebugLoc(), VTs, AM, + ExtType, IsExpanding, MemVT, MMO); + createOperands(N, Ops); + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + +SDValue SelectionDAG::getStridedLoadVP( + EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, + SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, + const MDNode *Ranges, bool IsExpanding) { + SDValue Undef = getUNDEF(Ptr.getValueType()); + return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr, + Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment, + MMOFlags, AAInfo, Ranges, IsExpanding); +} + +SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, + SDValue Ptr, SDValue Stride, + SDValue Mask, SDValue EVL, + MachineMemOperand *MMO, + bool IsExpanding) { + SDValue Undef = getUNDEF(Ptr.getValueType()); + return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr, + Undef, Stride, Mask, EVL, VT, MMO, IsExpanding); +} + +SDValue SelectionDAG::getExtStridedLoadVP( + ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, + SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, + MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, + MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, + bool IsExpanding) { + SDValue Undef = getUNDEF(Ptr.getValueType()); + return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef, + Stride, Mask, EVL, PtrInfo, MemVT, Alignment, + MMOFlags, AAInfo, nullptr, IsExpanding); +} + +SDValue SelectionDAG::getExtStridedLoadVP( + ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, + SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, + MachineMemOperand *MMO, bool IsExpanding) { + SDValue Undef = getUNDEF(Ptr.getValueType()); + return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef, + Stride, Mask, EVL, MemVT, MMO, IsExpanding); +} + +SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM) { + auto *SLD = cast(OrigLoad); + assert(SLD->getOffset().isUndef() && + "Strided load is already a indexed load!"); + // Don't propagate the invariant or dereferenceable flags. 
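// A standalone sketch, not part of the patch: scalar reference semantics for
// the strided VP load nodes built above (experimental.vp.strided.load). Lane
// I reads sizeof(element) bytes from Base + I * Stride, where Stride is a
// byte stride, but only when I < EVL and the mask lane is set. Hypothetical
// helper with i32 elements:
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<int32_t> stridedLoadRef(const uint8_t *Base, int64_t Stride,
                                    const std::vector<bool> &Mask,
                                    unsigned EVL) {
  std::vector<int32_t> Out(Mask.size(), 0); // disabled lanes are undefined in
                                            // the real node; zeroed here
  for (unsigned I = 0; I < EVL && I < Mask.size(); ++I)
    if (Mask[I])
      std::memcpy(&Out[I], Base + int64_t(I) * Stride, sizeof(int32_t));
  return Out;
}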
+ auto MMOFlags = + SLD->getMemOperand()->getFlags() & + ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); + return getStridedLoadVP( + AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(), + Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(), + SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags, + SLD->getAAInfo(), nullptr, SLD->isExpandingLoad()); +} + +SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL, + SDValue Val, SDValue Ptr, + SDValue Offset, SDValue Stride, + SDValue Mask, SDValue EVL, EVT MemVT, + MachineMemOperand *MMO, + ISD::MemIndexedMode AM, + bool IsTruncating, bool IsCompressing) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!"); + SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other) + : getVTList(MVT::Other); + SDValue Ops[] = {Chain, Val, Ptr, Offset, Stride, Mask, EVL}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops); + ID.AddInteger(MemVT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData( + DL.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + auto *N = newSDNode(DL.getIROrder(), DL.getDebugLoc(), + VTs, AM, IsTruncating, + IsCompressing, MemVT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + +SDValue SelectionDAG::getTruncStridedStoreVP( + SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, + SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT, + Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, + bool IsCompressing) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + + MMOFlags |= MachineMemOperand::MOStore; + assert((MMOFlags & MachineMemOperand::MOLoad) == 0); + + if (PtrInfo.V.isNull()) + PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); + + MachineFunction &MF = getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo); + return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT, + MMO, IsCompressing); +} + +SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, + SDValue Val, SDValue Ptr, + SDValue Stride, SDValue Mask, + SDValue EVL, EVT SVT, + MachineMemOperand *MMO, + bool IsCompressing) { + EVT VT = Val.getValueType(); + + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + if (VT == SVT) + return getStridedStoreVP(Chain, DL, Val, Ptr, getUNDEF(Ptr.getValueType()), + Stride, Mask, EVL, VT, MMO, ISD::UNINDEXED, + /*IsTruncating*/ false, IsCompressing); + + assert(SVT.getScalarType().bitsLT(VT.getScalarType()) && + "Should only be a truncating store, not extending!"); + assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!"); + assert(VT.isVector() == SVT.isVector() && + "Cannot use trunc store to convert to or from a vector!"); + assert((!VT.isVector() || + VT.getVectorElementCount() == SVT.getVectorElementCount()) && + "Cannot use trunc store to change the number of vector elements!"); + + 
SDVTList VTs = getVTList(MVT::Other); + SDValue Undef = getUNDEF(Ptr.getValueType()); + SDValue Ops[] = {Chain, Val, Ptr, Undef, Stride, Mask, EVL}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops); + ID.AddInteger(SVT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData( + DL.getIROrder(), VTs, ISD::UNINDEXED, true, IsCompressing, SVT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + auto *N = newSDNode(DL.getIROrder(), DL.getDebugLoc(), + VTs, ISD::UNINDEXED, true, + IsCompressing, SVT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + +SDValue SelectionDAG::getIndexedStridedStoreVP(SDValue OrigStore, + const SDLoc &DL, SDValue Base, + SDValue Offset, + ISD::MemIndexedMode AM) { + auto *SST = cast(OrigStore); + assert(SST->getOffset().isUndef() && + "Strided store is already an indexed store!"); + SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); + SDValue Ops[] = { + SST->getChain(), SST->getValue(), Base, Offset, SST->getStride(), + SST->getMask(), SST->getVectorLength()}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops); + ID.AddInteger(SST->getMemoryVT().getRawBits()); + ID.AddInteger(SST->getRawSubclassData()); + ID.AddInteger(SST->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) + return SDValue(E, 0); + + auto *N = newSDNode( + DL.getIROrder(), DL.getDebugLoc(), VTs, AM, SST->isTruncatingStore(), + SST->isCompressingStore(), SST->getMemoryVT(), SST->getMemOperand()); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + SDValue SelectionDAG::getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType) { @@ -7969,6 +8479,7 @@ SDValue SelectionDAG::getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, VT, MMO, IndexType)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -8012,6 +8523,7 @@ SDValue SelectionDAG::getScatterVP(SDVTList VTs, EVT VT, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, VT, MMO, IndexType)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -8061,6 +8573,7 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, ExtTy, isExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -8108,6 +8621,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); 
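// A standalone sketch, not part of the patch: the ID.AddInteger(MMO->getFlags())
// lines threaded through the hunks above make the memory-operand flags part of
// each node's CSE profile, so two operations that differ only in flags
// (volatile vs. non-volatile, say) can no longer be unified by FoldingSet
// lookup. A toy stand-in for that profile (not the LLVM FoldingSetNodeID API;
// flag values illustrative):
#include <cassert>
#include <cstdint>

struct NodeProfile {
  uint32_t AddrSpace;
  uint32_t MMOFlags; // the newly added discriminator
  bool operator==(const NodeProfile &O) const {
    return AddrSpace == O.AddrSpace && MMOFlags == O.MMOFlags;
  }
};

int main() {
  NodeProfile Plain{0, /*load*/ 1};
  NodeProfile Volatile{0, /*load|volatile*/ 1 | 4};
  assert(!(Plain == Volatile)); // must not be CSE-equivalent
  return 0;
}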
void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); @@ -8149,13 +8663,13 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, MemVT, MMO, IndexType, ExtTy)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } - IndexType = TLI->getCanonicalIndexType(IndexType, MemVT, Ops[4]); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, MMO, IndexType, ExtTy); createOperands(N, Ops); @@ -8196,13 +8710,13 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, MemVT, MMO, IndexType, IsTrunc)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } - IndexType = TLI->getCanonicalIndexType(IndexType, MemVT, Ops[4]); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, MMO, IndexType, IsTrunc); createOperands(N, Ops); @@ -8400,6 +8914,41 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; + case ISD::VP_ADD: + case ISD::VP_SUB: + // If it is VP_ADD/VP_SUB mask operation then turn it to VP_XOR + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + Opcode = ISD::VP_XOR; + break; + case ISD::VP_MUL: + // If it is VP_MUL mask operation then turn it to VP_AND + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + Opcode = ISD::VP_AND; + break; + case ISD::VP_REDUCE_MUL: + // If it is VP_REDUCE_MUL mask operation then turn it to VP_REDUCE_AND + if (VT == MVT::i1) + Opcode = ISD::VP_REDUCE_AND; + break; + case ISD::VP_REDUCE_ADD: + // If it is VP_REDUCE_ADD mask operation then turn it to VP_REDUCE_XOR + if (VT == MVT::i1) + Opcode = ISD::VP_REDUCE_XOR; + break; + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_UMIN: + // If it is VP_REDUCE_SMAX/VP_REDUCE_UMIN mask operation then turn it to + // VP_REDUCE_AND. + if (VT == MVT::i1) + Opcode = ISD::VP_REDUCE_AND; + break; + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + // If it is VP_REDUCE_SMIN/VP_REDUCE_UMAX mask operation then turn it to + // VP_REDUCE_OR. + if (VT == MVT::i1) + Opcode = ISD::VP_REDUCE_OR; + break; } // Memoize nodes. @@ -8446,7 +8995,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, ArrayRef Ops, const SDNodeFlags Flags) { if (VTList.NumVTs == 1) - return getNode(Opcode, DL, VTList.VTs[0], Ops); + return getNode(Opcode, DL, VTList.VTs[0], Ops, Flags); #ifndef NDEBUG for (auto &Op : Ops) @@ -9659,19 +10208,36 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ namespace { - /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith - /// to record information about a use. - struct UseMemo { - SDNode *User; - unsigned Index; - SDUse *Use; - }; +/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith +/// to record information about a use. 
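// A standalone check, not part of the patch: the VP hunks above exploit i1
// arithmetic identities, since addition and subtraction mod 2 are XOR and
// multiplication is AND; with booleans encoded as {0, -1}, the signed-max /
// unsigned-min reductions behave like AND while signed-min / unsigned-max
// behave like OR. Verifying the scalar identities:
#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      assert(((A + B) & 1) == (A ^ B)); // VP_ADD on i1 == VP_XOR
      assert(((A - B) & 1) == (A ^ B)); // VP_SUB on i1 == VP_XOR
      assert((A * B) == (A & B));       // VP_MUL on i1 == VP_AND
    }
  return 0;
}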
+struct UseMemo { + SDNode *User; + unsigned Index; + SDUse *Use; +}; - /// operator< - Sort Memos by User. - bool operator<(const UseMemo &L, const UseMemo &R) { - return (intptr_t)L.User < (intptr_t)R.User; +/// operator< - Sort Memos by User. +bool operator<(const UseMemo &L, const UseMemo &R) { + return (intptr_t)L.User < (intptr_t)R.User; +} + +/// RAUOVWUpdateListener - Helper for ReplaceAllUsesOfValuesWith - When the node +/// pointed to by a UseMemo is deleted, set the User to nullptr to indicate that +/// the node already has been taken care of recursively. +class RAUOVWUpdateListener : public SelectionDAG::DAGUpdateListener { + SmallVector<UseMemo, 4> &Uses; + + void NodeDeleted(SDNode *N, SDNode *E) override { + for (UseMemo &Memo : Uses) + if (Memo.User == N) + Memo.User = nullptr; } +public: + RAUOVWUpdateListener(SelectionDAG &d, SmallVector<UseMemo, 4> &uses) + : SelectionDAG::DAGUpdateListener(d), Uses(uses) {} +}; + } // end anonymous namespace bool SelectionDAG::calculateDivergence(SDNode *N) { @@ -9763,12 +10329,19 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, // Sort the uses, so that all the uses from a given User are together. llvm::sort(Uses); + RAUOVWUpdateListener Listener(*this, Uses); for (unsigned UseIndex = 0, UseIndexEnd = Uses.size(); UseIndex != UseIndexEnd; ) { // We know that this user uses some value of From. If it is the right // value, update it. SDNode *User = Uses[UseIndex].User; + // If the node has been deleted by recursive CSE updates when updating + // another node, then just skip this entry. + if (User == nullptr) { + ++UseIndex; + continue; + } // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); @@ -9965,6 +10538,11 @@ bool llvm::isOneConstant(SDValue V) { return Const != nullptr && Const->isOne(); } +bool llvm::isMinSignedConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isMinSignedValue(); +} + SDValue llvm::peekThroughBitcasts(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); @@ -10095,10 +10673,9 @@ bool llvm::isNullOrNullSplat(SDValue N, bool AllowUndefs) { } bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { - // TODO: may want to use peekThroughBitcast() here. - unsigned BitWidth = N.getScalarValueSizeInBits(); - ConstantSDNode *C = isConstOrConstSplat(N, AllowUndefs); - return C && C->isOne() && C->getValueSizeInBits(0) == BitWidth; + ConstantSDNode *C = + isConstOrConstSplat(N, AllowUndefs, /*AllowTruncation*/ true); + return C && C->isOne(); } bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { @@ -10947,9 +11524,8 @@ bool BuildVectorSDNode::getConstantRawBits( auto *CInt = dyn_cast<ConstantSDNode>(Op); auto *CFP = dyn_cast<ConstantFPSDNode>(Op); assert((CInt || CFP) && "Unknown constant"); - SrcBitElements[I] = - CInt ? CInt->getAPIntValue().truncOrSelf(SrcEltSizeInBits) - : CFP->getValueAPF().bitcastToAPInt(); + SrcBitElements[I] = CInt ? CInt->getAPIntValue().trunc(SrcEltSizeInBits) + : CFP->getValueAPF().bitcastToAPInt(); } // Recast to dst width. 
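// A standalone sketch, not part of the patch, of the invalidation scheme
// behind RAUOVWUpdateListener above: a deletion callback nulls out pending
// work-list entries, and the main replacement loop skips null users instead
// of touching freed nodes. Hypothetical minimal types:
#include <cassert>
#include <vector>

struct Node {};
struct PendingUse { Node *User; };

void onNodeDeleted(std::vector<PendingUse> &Uses, Node *Dead) {
  for (PendingUse &U : Uses)
    if (U.User == Dead)
      U.User = nullptr; // entry is skipped later, not dereferenced
}

int main() {
  Node A;
  std::vector<PendingUse> Uses{{&A}};
  onNodeDeleted(Uses, &A); // simulate recursive CSE deleting a pending user
  assert(Uses[0].User == nullptr);
  return 0;
}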
@@ -11068,6 +11644,10 @@ SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) return N.getNode(); + if ((N.getOpcode() == ISD::SPLAT_VECTOR) && + isa(N.getOperand(0))) + return N.getNode(); + return nullptr; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 6d8252046501..d236433f6fb4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -96,7 +96,7 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0, if (!(BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode())) return false; int64_t PtrDiff; - if (NumBytes0.hasValue() && NumBytes1.hasValue() && + if (NumBytes0 && NumBytes1 && BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) { // If the size of memory access is unknown, do not use it to analysis. // One example of unknown size memory access is to load/store scalable diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 01230a36e744..37d05cdba76d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -24,25 +24,21 @@ #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineInstrBundleIterator.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" @@ -89,7 +85,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -102,10 +97,8 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Local.h" #include -#include #include #include -#include #include using namespace llvm; @@ -224,10 +217,10 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, std::swap(Lo, Hi); EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi); - Hi = - DAG.getNode(ISD::SHL, DL, TotalVT, Hi, - DAG.getConstant(Lo.getValueSizeInBits(), DL, - TLI.getPointerTy(DAG.getDataLayout()))); + Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi, + DAG.getConstant(Lo.getValueSizeInBits(), DL, + TLI.getShiftAmountTy( + TotalVT, DAG.getDataLayout()))); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo); Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi); } 
@@ -276,7 +269,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, // For a truncate, see if we have any information to // indicate whether the truncated bits will always be // zero or sign-extension. - if (AssertOp.hasValue()) + if (AssertOp) Val = DAG.getNode(*AssertOp, DL, PartEVT, Val, DAG.getValueType(ValueVT)); return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); @@ -330,7 +323,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, Optional CallConv) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); - const bool IsABIRegCopy = CallConv.hasValue(); + const bool IsABIRegCopy = CallConv.has_value(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; @@ -344,7 +337,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, if (IsABIRegCopy) { NumRegs = TLI.getVectorTypeBreakdownForCallingConv( - *DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT, + *DAG.getContext(), *CallConv, ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } else { NumRegs = @@ -566,7 +559,7 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, unsigned RoundBits = RoundParts * PartBits; unsigned OddParts = NumParts - RoundParts; SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val, - DAG.getShiftAmountConstant(RoundBits, ValueVT, DL, /*LegalTypes*/false)); + DAG.getShiftAmountConstant(RoundBits, ValueVT, DL)); getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V, CallConv); @@ -654,7 +647,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, EVT ValueVT = Val.getValueType(); assert(ValueVT.isVector() && "Not a vector"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - const bool IsABIRegCopy = CallConv.hasValue(); + const bool IsABIRegCopy = CallConv.has_value(); if (NumParts == 1) { EVT PartEVT = PartVT; @@ -733,7 +726,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, DestEltCnt = ElementCount::getFixed(NumIntermediates); EVT BuiltVectorTy = EVT::getVectorVT( - *DAG.getContext(), IntermediateVT.getScalarType(), DestEltCnt.getValue()); + *DAG.getContext(), IntermediateVT.getScalarType(), *DestEltCnt); if (ValueVT == BuiltVectorTy) { // Nothing to do. @@ -1236,7 +1229,8 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, // in the first place we should not be more successful here). Unless we // have some test case that prove this to be correct we should avoid // calling EmitFuncArgumentDbgValue here. - if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, false, Val)) { + if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, + FuncArgumentDbgValueKind::Value, Val)) { LLVM_DEBUG(dbgs() << "Resolve dangling debug info [order=" << DbgSDNodeOrder << "] for:\n " << *DI << "\n"); LLVM_DEBUG(dbgs() << " By mapping to:\n "; Val.dump()); @@ -1367,7 +1361,9 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef Values, N = UnusedArgNodeMap[V]; if (N.getNode()) { // Only emit func arg dbg value for non-variadic dbg.values for now. 
- if (!IsVariadic && EmitFuncArgumentDbgValue(V, Var, Expr, dl, false, N)) + if (!IsVariadic && + EmitFuncArgumentDbgValue(V, Var, Expr, dl, + FuncArgumentDbgValueKind::Value, N)) return true; if (auto *FISDN = dyn_cast(N.getNode())) { // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can @@ -1639,7 +1635,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Ops.push_back(getValue(CV->getOperand(i))); return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); - } else if (isa(C)) { + } + + if (isa(C)) { EVT EltVT = TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType()); @@ -1651,12 +1649,12 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa(VecTy)) return NodeMap[V] = DAG.getSplatVector(VT, getCurSDLoc(), Op); - else { - SmallVector Ops; - Ops.assign(cast(VecTy)->getNumElements(), Op); - return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); - } + + SmallVector Ops; + Ops.assign(cast(VecTy)->getNumElements(), Op); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } + llvm_unreachable("Unknown vector constant"); } @@ -1680,11 +1678,12 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } - if (const MetadataAsValue *MD = dyn_cast(V)) { + if (const MetadataAsValue *MD = dyn_cast(V)) return DAG.getMDNode(cast(MD->getMetadata())); - } + if (const auto *BB = dyn_cast(V)) return DAG.getBasicBlock(FuncInfo.MBBMap[BB]); + llvm_unreachable("Can't get register for value!"); } @@ -2748,10 +2747,10 @@ SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, None, CallOptions, getCurSDLoc()).second; - // On PS4, the "return address" must still be within the calling function, - // even if it's at the very end, so emit an explicit TRAP here. + // On PS4/PS5, the "return address" must still be within the calling + // function, even if it's at the very end, so emit an explicit TRAP here. // Passing 'true' for doesNotReturn above won't generate the trap for us. - if (TM.getTargetTriple().isPS4CPU()) + if (TM.getTargetTriple().isPS()) Chain = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, Chain); // WebAssembly needs an unreachable instruction after a non-returning call, // because the function return type can be different from __stack_chk_fail's @@ -3150,26 +3149,12 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy( Op1.getValueType(), DAG.getDataLayout()); - // Coerce the shift amount to the right type if we can. + // Coerce the shift amount to the right type if we can. This exposes the + // truncate or zext to optimization early. if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) { - unsigned ShiftSize = ShiftTy.getSizeInBits(); - unsigned Op2Size = Op2.getValueSizeInBits(); - SDLoc DL = getCurSDLoc(); - - // If the operand is smaller than the shift count type, promote it. - if (ShiftSize > Op2Size) - Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2); - - // If the operand is larger than the shift count type but the shift - // count type has enough bits to represent any shift value, truncate - // it now. This is a common case and it exposes the truncate to - // optimization early. 
- else if (ShiftSize >= Log2_32_Ceil(Op1.getValueSizeInBits())) - Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2); - // Otherwise we'll need to temporarily settle for some other convenient - // type. Type legalization will make adjustments once the shiftee is split. - else - Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32); + assert(ShiftTy.getSizeInBits() >= Log2_32_Ceil(Op1.getValueSizeInBits()) && + "Unexpected shift type"); + Op2 = DAG.getZExtOrTrunc(Op2, getCurSDLoc(), ShiftTy); } bool nuw = false; @@ -3816,13 +3801,8 @@ void SelectionDAGBuilder::visitInsertValue(const User &I) { DAG.getVTList(AggValueVTs), Values)); } -void SelectionDAGBuilder::visitExtractValue(const User &I) { - ArrayRef Indices; - if (const ExtractValueInst *EV = dyn_cast(&I)) - Indices = EV->getIndices(); - else - Indices = cast(&I)->getIndices(); - +void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { + ArrayRef Indices = I.getIndices(); const Value *Op0 = I.getOperand(0); Type *AggTy = Op0->getType(); Type *ValTy = I.getType(); @@ -4376,7 +4356,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, // In all other cases the function returns 'false'. static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, ISD::MemIndexType &IndexType, SDValue &Scale, - SelectionDAGBuilder *SDB, const BasicBlock *CurBB) { + SelectionDAGBuilder *SDB, const BasicBlock *CurBB, + uint64_t ElemSize) { SelectionDAG& DAG = SDB->DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); @@ -4416,9 +4397,16 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, Base = SDB->getValue(BasePtr); Index = SDB->getValue(IndexVal); IndexType = ISD::SIGNED_SCALED; - Scale = DAG.getTargetConstant( - DL.getTypeAllocSize(GEP->getResultElementType()), - SDB->getCurSDLoc(), TLI.getPointerTy(DL)); + + // MGATHER/MSCATTER are only required to support scaling by one or by the + // element size. Other scales may be produced using target-specific DAG + // combines. 
+ uint64_t ScaleVal = DL.getTypeAllocSize(GEP->getResultElementType()); + if (ScaleVal != ElemSize && ScaleVal != 1) + return false; + + Scale = + DAG.getTargetConstant(ScaleVal, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); return true; } @@ -4432,7 +4420,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { EVT VT = Src0.getValueType(); Align Alignment = cast(I.getArgOperand(2)) ->getMaybeAlignValue() - .getValueOr(DAG.getEVTAlign(VT.getScalarType())); + .value_or(DAG.getEVTAlign(VT.getScalarType())); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Base; @@ -4440,7 +4428,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { ISD::MemIndexType IndexType; SDValue Scale; bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, - I.getParent()); + I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( @@ -4451,7 +4439,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_UNSCALED; + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } @@ -4538,7 +4526,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); Align Alignment = cast(I.getArgOperand(1)) ->getMaybeAlignValue() - .getValueOr(DAG.getEVTAlign(VT.getScalarType())); + .value_or(DAG.getEVTAlign(VT.getScalarType())); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); @@ -4548,7 +4536,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { ISD::MemIndexType IndexType; SDValue Scale; bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, - I.getParent()); + I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(AS), MachineMemOperand::MOLoad, @@ -4559,7 +4547,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_UNSCALED; + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } @@ -4678,7 +4666,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType()); if (!TLI.supportsUnalignedAtomics() && - I.getAlignment() < MemVT.getSizeInBits() / 8) + I.getAlign().value() < MemVT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic load"); auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout()); @@ -4730,7 +4718,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType()); - if (I.getAlignment() < MemVT.getSizeInBits() / 8) + if (I.getAlign().value() < MemVT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout()); @@ -4781,7 +4769,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - // Info is set by getTgtMemInstrinsic + // Info is set by getTgtMemIntrinsic 
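// Standalone model of the natural-alignment rule enforced by the atomic
// load/store hunks above: an access of S bits needs at least S/8 bytes of
// alignment; only the load path consults target support for unaligned
// atomics, the store path applies the rule unconditionally. Illustrative
// only.
#include <cstdint>
#include <stdexcept>
static void checkAtomicAlignModel(uint64_t AlignBytes, uint64_t SizeBits,
                                  bool SupportsUnalignedAtomics) {
  if (!SupportsUnalignedAtomics && AlignBytes < SizeBits / 8)
    throw std::runtime_error("cannot generate unaligned atomic access");
}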
TargetLowering::IntrinsicInfo Info; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, @@ -4895,7 +4883,8 @@ static SDValue GetExponent(SelectionDAG &DAG, SDValue Op, DAG.getConstant(0x7f800000, dl, MVT::i32)); SDValue t1 = DAG.getNode( ISD::SRL, dl, MVT::i32, t0, - DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout()))); + DAG.getConstant(23, dl, + TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1, DAG.getConstant(127, dl, MVT::i32)); return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2); @@ -4920,10 +4909,11 @@ static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl, SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1); // IntegerPartOfX <<= 23; - IntegerPartOfX = DAG.getNode( - ISD::SHL, dl, MVT::i32, IntegerPartOfX, - DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy( - DAG.getDataLayout()))); + IntegerPartOfX = + DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, + DAG.getConstant(23, dl, + DAG.getTargetLoweringInfo().getShiftAmountTy( + MVT::i32, DAG.getDataLayout()))); SDValue TwoToFractionalPartOfX; if (LimitFloatPrecision <= 6) { @@ -5351,38 +5341,36 @@ static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS, /// ExpandPowI - Expand a llvm.powi intrinsic. static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS, SelectionDAG &DAG) { - // If RHS is a constant, we can expand this out to a multiplication tree, - // otherwise we end up lowering to a call to __powidf2 (for example). When - // optimizing for size, we only want to do this if the expansion would produce - // a small number of multiplies, otherwise we do the full expansion. + // If RHS is a constant, we can expand this out to a multiplication tree if + // it's beneficial on the target, otherwise we end up lowering to a call to + // __powidf2 (for example). if (ConstantSDNode *RHSC = dyn_cast(RHS)) { - // Get the exponent as a positive value. unsigned Val = RHSC->getSExtValue(); - if ((int)Val < 0) Val = -Val; // powi(x, 0) -> 1.0 if (Val == 0) return DAG.getConstantFP(1.0, DL, LHS.getValueType()); - bool OptForSize = DAG.shouldOptForSize(); - if (!OptForSize || - // If optimizing for size, don't insert too many multiplies. - // This inserts up to 5 multiplies. - countPopulation(Val) + Log2_32(Val) < 7) { + if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI( + Val, DAG.shouldOptForSize())) { + // Get the exponent as a positive value. + if ((int)Val < 0) + Val = -Val; // We use the simple binary decomposition method to generate the multiply // sequence. There are more optimal ways to do this (for example, // powi(x,15) generates one more multiply than it should), but this has // the benefit of being both really simple and much better than a libcall. - SDValue Res; // Logically starts equal to 1.0 + SDValue Res; // Logically starts equal to 1.0 SDValue CurSquare = LHS; // TODO: Intrinsics should have fast-math-flags that propagate to these // nodes. while (Val) { if (Val & 1) { if (Res.getNode()) - Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare); + Res = + DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare); else - Res = CurSquare; // 1.0*CurSquare. + Res = CurSquare; // 1.0*CurSquare. } CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(), @@ -5503,7 +5491,7 @@ getUnderlyingArgRegs(SmallVectorImpl> &Regs, /// appear for function arguments or in the prologue. 
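// Runnable model of the binary decomposition used by ExpandPowI above: scan
// the exponent bits LSB-first, squaring as we go and folding the set bits
// into the result. The DAG expansion is the same loop but starts Res
// "empty", so the first set bit is an assignment rather than a multiply by
// 1.0; as the original comment notes, powi(x, 15) still emits one more
// multiply (6) than an optimal addition chain (5).
static double powiModel(double X, unsigned Val) {
  double Res = 1.0;      // logically starts equal to 1.0
  double CurSquare = X;  // X, X^2, X^4, X^8, ...
  while (Val) {
    if (Val & 1)
      Res *= CurSquare;  // fold in this power of two of the exponent
    CurSquare *= CurSquare;
    Val >>= 1;
  }
  return Res;
}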
bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( const Value *V, DILocalVariable *Variable, DIExpression *Expr, - DILocation *DL, bool IsDbgDeclare, const SDValue &N) { + DILocation *DL, FuncArgumentDbgValueKind Kind, const SDValue &N) { const Argument *Arg = dyn_cast<Argument>(V); if (!Arg) return false; @@ -5537,7 +5525,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( } }; - if (!IsDbgDeclare) { + if (Kind == FuncArgumentDbgValueKind::Value) { // ArgDbgValues are hoisted to the beginning of the entry block. So we // should only emit as ArgDbgValue if the dbg.value intrinsic is found in // the entry block. @@ -5624,7 +5612,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( } if (Reg) { Op = MachineOperand::CreateReg(Reg, false); - IsIndirect = IsDbgDeclare; + IsIndirect = Kind != FuncArgumentDbgValueKind::Value; } } @@ -5672,7 +5660,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( continue; } MachineInstr *NewMI = - MakeVRegDbgValue(RegAndSize.first, *FragmentExpr, IsDbgDeclare); + MakeVRegDbgValue(RegAndSize.first, *FragmentExpr, + Kind != FuncArgumentDbgValueKind::Value); FuncInfo.ArgDbgValues.push_back(NewMI); } }; @@ -5690,7 +5679,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( } Op = MachineOperand::CreateReg(VMI->second, false); - IsIndirect = IsDbgDeclare; + IsIndirect = Kind != FuncArgumentDbgValueKind::Value; } else if (ArgRegsAndSizes.size() > 1) { // This was split due to the calling convention, and no virtual register // mapping exists for the value. @@ -5712,6 +5701,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( NewMI = BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), true, *Op, Variable, Expr); + // Otherwise, use ArgDbgValues. FuncInfo.ArgDbgValues.push_back(NewMI); return true; } @@ -5817,16 +5807,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vacopy: visitVACopy(I); return; case Intrinsic::returnaddress: setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, - TLI.getPointerTy(DAG.getDataLayout()), + TLI.getValueType(DAG.getDataLayout(), I.getType()), getValue(I.getArgOperand(0)))); return; case Intrinsic::addressofreturnaddress: - setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl, - TLI.getPointerTy(DAG.getDataLayout()))); + setValue(&I, + DAG.getNode(ISD::ADDROFRETURNADDR, sdl, + TLI.getValueType(DAG.getDataLayout(), I.getType()))); return; case Intrinsic::sponentry: - setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl, - TLI.getFrameIndexTy(DAG.getDataLayout()))); + setValue(&I, + DAG.getNode(ISD::SPONENTRY, sdl, + TLI.getValueType(DAG.getDataLayout(), I.getType()))); return; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, @@ -5864,7 +5856,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // @llvm.memcpy defines 0 and 1 to both mean no alignment. Align DstAlign = MCI.getDestAlign().valueOrOne(); Align SrcAlign = MCI.getSourceAlign().valueOrOne(); - Align Alignment = commonAlignment(DstAlign, SrcAlign); + Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MCI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG @@ -5887,7 +5879,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // @llvm.memcpy.inline defines 0 and 1 to both mean no alignment.
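// The commonAlignment() -> std::min() rewrites in the surrounding memcpy /
// memmove hunks are equivalence-preserving: Align values are powers of two,
// and for powers of two the strongest alignment implied by both operands
// (their gcd) is simply the smaller one. A runnable check of that claim:
#include <algorithm>
#include <cassert>
#include <numeric>
static void checkMinIsCommonAlign() {
  for (unsigned A = 1; A <= 4096; A <<= 1)
    for (unsigned B = 1; B <= 4096; B <<= 1)
      assert(std::gcd(A, B) == std::min(A, B)); // gcd of pow2s is the min
}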
Align DstAlign = MCI.getDestAlign().valueOrOne(); Align SrcAlign = MCI.getSourceAlign().valueOrOne(); - Align Alignment = commonAlignment(DstAlign, SrcAlign); + Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MCI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG @@ -5910,10 +5902,28 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, bool isVol = MSI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); SDValue Root = isVol ? getRoot() : getMemoryRoot(); - SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC, + SDValue MS = DAG.getMemset( + Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false, + isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata()); + updateDAGForMaybeTailCall(MS); + return; + } + case Intrinsic::memset_inline: { + const auto &MSII = cast(I); + SDValue Dst = getValue(I.getArgOperand(0)); + SDValue Value = getValue(I.getArgOperand(1)); + SDValue Size = getValue(I.getArgOperand(2)); + assert(isa(Size) && "memset_inline needs constant size"); + // @llvm.memset defines 0 and 1 to both mean no alignment. + Align DstAlign = MSII.getDestAlign().valueOrOne(); + bool isVol = MSII.isVolatile(); + bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); + SDValue Root = isVol ? getRoot() : getMemoryRoot(); + SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol, + /* AlwaysInline */ true, isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata()); - updateDAGForMaybeTailCall(MS); + updateDAGForMaybeTailCall(MC); return; } case Intrinsic::memmove: { @@ -5924,7 +5934,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // @llvm.memmove defines 0 and 1 to both mean no alignment. 
Align DstAlign = MMI.getDestAlign().valueOrOne(); Align SrcAlign = MMI.getSourceAlign().valueOrOne(); - Align Alignment = commonAlignment(DstAlign, SrcAlign); + Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MMI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memmove DAG @@ -5943,15 +5953,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Src = getValue(MI.getRawSource()); SDValue Length = getValue(MI.getLength()); - unsigned DstAlign = MI.getDestAlignment(); - unsigned SrcAlign = MI.getSourceAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); - SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src, - SrcAlign, Length, LengthTy, ElemSz, isTC, - MachinePointerInfo(MI.getRawDest()), - MachinePointerInfo(MI.getRawSource())); + SDValue MC = + DAG.getAtomicMemcpy(getRoot(), sdl, Dst, Src, Length, LengthTy, ElemSz, + isTC, MachinePointerInfo(MI.getRawDest()), + MachinePointerInfo(MI.getRawSource())); updateDAGForMaybeTailCall(MC); return; } @@ -5961,15 +5969,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Src = getValue(MI.getRawSource()); SDValue Length = getValue(MI.getLength()); - unsigned DstAlign = MI.getDestAlignment(); - unsigned SrcAlign = MI.getSourceAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); - SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src, - SrcAlign, Length, LengthTy, ElemSz, isTC, - MachinePointerInfo(MI.getRawDest()), - MachinePointerInfo(MI.getRawSource())); + SDValue MC = + DAG.getAtomicMemmove(getRoot(), sdl, Dst, Src, Length, LengthTy, ElemSz, + isTC, MachinePointerInfo(MI.getRawDest()), + MachinePointerInfo(MI.getRawSource())); updateDAGForMaybeTailCall(MC); return; } @@ -5979,13 +5985,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Val = getValue(MI.getValue()); SDValue Length = getValue(MI.getLength()); - unsigned DstAlign = MI.getDestAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); - SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length, - LengthTy, ElemSz, isTC, - MachinePointerInfo(MI.getRawDest())); + SDValue MC = + DAG.getAtomicMemset(getRoot(), sdl, Dst, Val, Length, LengthTy, ElemSz, + isTC, MachinePointerInfo(MI.getRawDest())); updateDAGForMaybeTailCall(MC); return; } @@ -6085,7 +6090,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } else if (isa(Address)) { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, N); + EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, + FuncArgumentDbgValueKind::Declare, N); return; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), @@ -6095,8 +6101,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. 
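// Minimal sketch of the three-way kind that replaces the old IsDbgDeclare
// flag in this patch (enumerator names match the declaration added to
// SelectionDAGBuilder.h further below; the helper is illustrative):
enum class FuncArgumentDbgValueKindModel { Value, Addr, Declare };
static bool wantsIndirectDbgValue(FuncArgumentDbgValueKindModel Kind) {
  // dbg.addr and dbg.declare describe the variable's *address*, so the
  // resulting DBG_VALUE is emitted as indirect; dbg.value carries the value
  // itself.
  return Kind != FuncArgumentDbgValueKindModel::Value;
}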
- if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, - N)) { + if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, + FuncArgumentDbgValueKind::Declare, N)) { LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << " (could not emit func-arg dbg_value)\n"); } @@ -6162,8 +6168,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0)); - assert(CI && "Non-constant call site value in eh.sjlj.callsite!"); + ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0)); assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); MMI.setCurrentCallSite(CI->getZExtValue()); @@ -6343,6 +6348,29 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, #include "llvm/IR/VPIntrinsics.def" visitVectorPredicationIntrinsic(cast<VPIntrinsic>(I)); return; + case Intrinsic::fptrunc_round: { + // Get the last argument, the metadata, and convert it to an integer in + // the call. + Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(1))->getMetadata(); + Optional<RoundingMode> RoundMode = + convertStrToRoundingMode(cast<MDString>(MD)->getString()); + + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Propagate fast-math-flags from IR to node(s). + SDNodeFlags Flags; + Flags.copyFMF(*cast<FPMathOperator>(&I)); + SelectionDAG::FlagInserter FlagsInserter(DAG, Flags); + + SDValue Result; + Result = DAG.getNode( + ISD::FPTRUNC_ROUND, sdl, VT, getValue(I.getArgOperand(0)), + DAG.getTargetConstant((int)*RoundMode, sdl, + TLI.getPointerTy(DAG.getDataLayout()))); + setValue(&I, Result); + + return; + } case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -6397,6 +6425,31 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, Res); DAG.setRoot(Res.getValue(0)); return; + case Intrinsic::is_fpclass: { + const DataLayout DLayout = DAG.getDataLayout(); + EVT DestVT = TLI.getValueType(DLayout, I.getType()); + EVT ArgVT = TLI.getValueType(DLayout, I.getArgOperand(0)->getType()); + unsigned Test = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); + SDValue Op = getValue(I.getArgOperand(0)); + SDNodeFlags Flags; + Flags.setNoFPExcept( + !F.getAttributes().hasFnAttr(llvm::Attribute::StrictFP)); + // If ISD::IS_FPCLASS should be expanded, do it right now, because the + // expansion can use illegal types. Expanding early allows these types + // to be legalized prior to selection. + if (!TLI.isOperationLegalOrCustom(ISD::IS_FPCLASS, ArgVT)) { + SDValue Result = TLI.expandIS_FPCLASS(DestVT, Op, Test, Flags, sdl, DAG); + setValue(&I, Result); + return; + } + + SDValue Check = DAG.getTargetConstant(Test, sdl, MVT::i32); + SDValue V = DAG.getNode(ISD::IS_FPCLASS, sdl, DestVT, {Op, Check}, Flags); + setValue(&I, V); + return; + } case Intrinsic::pcmarker: { SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp)); @@ -6843,7 +6896,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::invariant_start: // Discard region information. - setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout()))); + setValue(&I, + DAG.getUNDEF(TLI.getValueType(DAG.getDataLayout(), I.getType()))); return; case Intrinsic::invariant_end: // Discard region information.
@@ -7147,7 +7201,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, SetCC); return; } - case Intrinsic::experimental_vector_insert: { + case Intrinsic::vector_insert: { SDValue Vec = getValue(I.getOperand(0)); SDValue SubVec = getValue(I.getOperand(1)); SDValue Index = getValue(I.getOperand(2)); @@ -7164,7 +7218,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Index)); return; } - case Intrinsic::experimental_vector_extract: { + case Intrinsic::vector_extract: { SDValue Vec = getValue(I.getOperand(0)); SDValue Index = getValue(I.getOperand(1)); EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); @@ -7242,7 +7296,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( }; SDVTList VTs = DAG.getVTList(ValueVTs); - fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue(); + fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); SDNodeFlags Flags; if (EB == fp::ExceptionBehavior::ebIgnore) @@ -7307,13 +7361,14 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { Optional ResOPC; switch (VPIntrin.getIntrinsicID()) { -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) ResOPC = ISD::VPSD; -#define END_REGISTER_VP_INTRINSIC(VPID) break; +#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ + case Intrinsic::VPID: \ + ResOPC = ISD::VPSD; \ + break; #include "llvm/IR/VPIntrinsics.def" } - if (!ResOPC.hasValue()) + if (!ResOPC) llvm_unreachable( "Inconsistency: no SDNode available for this VPIntrinsic!"); @@ -7324,7 +7379,7 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { : ISD::VP_REDUCE_FMUL; } - return ResOPC.getValue(); + return *ResOPC; } void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, @@ -7362,11 +7417,12 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, SDValue Base, Index, Scale; ISD::MemIndexType IndexType; bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent()); + this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_UNSCALED; + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } @@ -7418,11 +7474,12 @@ void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin, SDValue Base, Index, Scale; ISD::MemIndexType IndexType; bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent()); + this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_UNSCALED; + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } @@ -7441,18 +7498,104 @@ void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin, setValue(&VPIntrin, ST); } +void SelectionDAGBuilder::visitVPStridedLoad( + const VPIntrinsic &VPIntrin, EVT VT, SmallVectorImpl &OpValues) { + SDLoc DL = getCurSDLoc(); + Value *PtrOperand = VPIntrin.getArgOperand(0); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT.getScalarType()); + AAMDNodes AAInfo = VPIntrin.getAAMetadata(); + 
const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); + bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); + SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); + + SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], + OpValues[2], OpValues[3], MMO, + false /*IsExpanding*/); + + if (AddToChain) + PendingLoads.push_back(LD.getValue(1)); + setValue(&VPIntrin, LD); +} + +void SelectionDAGBuilder::visitVPStridedStore( + const VPIntrinsic &VPIntrin, SmallVectorImpl &OpValues) { + SDLoc DL = getCurSDLoc(); + Value *PtrOperand = VPIntrin.getArgOperand(1); + EVT VT = OpValues[0].getValueType(); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT.getScalarType()); + AAMDNodes AAInfo = VPIntrin.getAAMetadata(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, *Alignment, AAInfo); + + SDValue ST = DAG.getStridedStoreVP( + getMemoryRoot(), DL, OpValues[0], OpValues[1], + DAG.getUNDEF(OpValues[1].getValueType()), OpValues[2], OpValues[3], + OpValues[4], VT, MMO, ISD::UNINDEXED, /*IsTruncating*/ false, + /*IsCompressing*/ false); + + DAG.setRoot(ST); + setValue(&VPIntrin, ST); +} + +void SelectionDAGBuilder::visitVPCmp(const VPCmpIntrinsic &VPIntrin) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL = getCurSDLoc(); + + ISD::CondCode Condition; + CmpInst::Predicate CondCode = VPIntrin.getPredicate(); + bool IsFP = VPIntrin.getOperand(0)->getType()->isFPOrFPVectorTy(); + if (IsFP) { + // FIXME: Regular fcmps are FPMathOperators which may have fast-math (nnan) + // flags, but calls that don't return floating-point types can't be + // FPMathOperators, like vp.fcmp. This affects constrained fcmp too. 
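// Why getFCmpCodeWithoutNaN (used just below) is sound under
// -no-nans-fp-math: ordered and unordered predicates differ only when an
// input is NaN. Runnable illustration for "<":
#include <cmath>
static bool oltModel(double A, double B) {
  return !std::isnan(A) && !std::isnan(B) && A < B; // SETOLT
}
static bool ultModel(double A, double B) {
  return std::isnan(A) || std::isnan(B) || A < B;   // SETULT
}
// For NaN-free inputs both reduce to (A < B), so either form may be chosen.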
+ Condition = getFCmpCondCode(CondCode); + if (TM.Options.NoNaNsFPMath) + Condition = getFCmpCodeWithoutNaN(Condition); + } else { + Condition = getICmpCondCode(CondCode); + } + + SDValue Op1 = getValue(VPIntrin.getOperand(0)); + SDValue Op2 = getValue(VPIntrin.getOperand(1)); + // #2 is the condition code + SDValue MaskOp = getValue(VPIntrin.getOperand(3)); + SDValue EVL = getValue(VPIntrin.getOperand(4)); + MVT EVLParamVT = TLI.getVPExplicitVectorLengthTy(); + assert(EVLParamVT.isScalarInteger() && EVLParamVT.bitsGE(MVT::i32) && + "Unexpected target EVL type"); + EVL = DAG.getNode(ISD::ZERO_EXTEND, DL, EVLParamVT, EVL); + + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), + VPIntrin.getType()); + setValue(&VPIntrin, + DAG.getSetCCVP(DL, DestVT, Op1, Op2, Condition, MaskOp, EVL)); +} + void SelectionDAGBuilder::visitVectorPredicationIntrinsic( const VPIntrinsic &VPIntrin) { SDLoc DL = getCurSDLoc(); unsigned Opcode = getISDForVPIntrinsic(VPIntrin); + auto IID = VPIntrin.getIntrinsicID(); + + if (const auto *CmpI = dyn_cast(&VPIntrin)) + return visitVPCmp(*CmpI); + SmallVector ValueVTs; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ComputeValueVTs(TLI, DAG.getDataLayout(), VPIntrin.getType(), ValueVTs); SDVTList VTs = DAG.getVTList(ValueVTs); - auto EVLParamPos = - VPIntrinsic::getVectorLengthParamPos(VPIntrin.getIntrinsicID()); + auto EVLParamPos = VPIntrinsic::getVectorLengthParamPos(IID); MVT EVLParamVT = TLI.getVPExplicitVectorLengthTy(); assert(EVLParamVT.isScalarInteger() && EVLParamVT.bitsGE(MVT::i32) && @@ -7469,7 +7612,10 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( switch (Opcode) { default: { - SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues); + SDNodeFlags SDFlags; + if (auto *FPMO = dyn_cast(&VPIntrin)) + SDFlags.copyFMF(*FPMO); + SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues, SDFlags); setValue(&VPIntrin, Result); break; } @@ -7478,10 +7624,16 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( visitVPLoadGather(VPIntrin, ValueVTs[0], OpValues, Opcode == ISD::VP_GATHER); break; + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + visitVPStridedLoad(VPIntrin, ValueVTs[0], OpValues); + break; case ISD::VP_STORE: case ISD::VP_SCATTER: visitVPStoreScatter(VPIntrin, OpValues, Opcode == ISD::VP_SCATTER); break; + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + visitVPStridedStore(VPIntrin, OpValues); + break; } } @@ -7756,7 +7908,7 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, bool SelectionDAGBuilder::visitMemCmpBCmpCall(const CallInst &I) { const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); const Value *Size = I.getArgOperand(2); - const ConstantInt *CSize = dyn_cast(Size); + const ConstantSDNode *CSize = dyn_cast(getValue(Size)); if (CSize && CSize->getZExtValue() == 0) { EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType(), true); @@ -8277,7 +8429,7 @@ public: // accessed type. if (isIndirect) { OpTy = ParamElemType; - assert(OpTy && "Indirect opernad must have elementtype attribute"); + assert(OpTy && "Indirect operand must have elementtype attribute"); } // Look for vector wrapped in a struct. e.g. { <16 x i8> }. @@ -8398,8 +8550,9 @@ getRegistersForValue(SelectionDAG &DAG, const SDLoc &DL, SmallVector Regs; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - // No work to do for memory operations. - if (OpInfo.ConstraintType == TargetLowering::C_Memory) + // No work to do for memory/address operands. 
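// Sketch of how the new C_Address constraint kind threads through the
// register-assignment logic below: like C_Memory, an address operand never
// consumes registers. Kind names mirror TargetLowering::ConstraintType; the
// helper itself is illustrative.
enum class ConstraintKindModel { Register, RegisterClass, Memory, Address, Other };
static bool consumesRegisters(ConstraintKindModel K) {
  return K != ConstraintKindModel::Memory && K != ConstraintKindModel::Address;
}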
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory || + OpInfo.ConstraintType == TargetLowering::C_Address) return None; // If this is a constraint for a single physreg, or a constraint for a @@ -8579,7 +8732,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, if (OpInfo.hasArg()) { OpInfo.CallOperandVal = Call.getArgOperand(ArgNo); OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); - Type *ParamElemTy = Call.getAttributes().getParamElementType(ArgNo); + Type *ParamElemTy = Call.getParamElementType(ArgNo); EVT VT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, DAG.getDataLayout(), ParamElemTy); OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other; @@ -8657,8 +8810,9 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); - if (OpInfo.ConstraintType == TargetLowering::C_Memory && - OpInfo.Type == InlineAsm::isClobber) + if ((OpInfo.ConstraintType == TargetLowering::C_Memory && + OpInfo.Type == InlineAsm::isClobber) || + OpInfo.ConstraintType == TargetLowering::C_Address) continue; // If this is a memory input, and if the operand is not indirect, do what we @@ -8708,7 +8862,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, : OpInfo; const auto RegError = getRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo); - if (RegError.hasValue()) { + if (RegError) { const MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const char *RegName = TRI.getName(RegError.getValue()); @@ -8733,6 +8887,10 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, } return false; }; + assert((OpInfo.ConstraintType != TargetLowering::C_Address || + (OpInfo.Type == InlineAsm::isInput && + !OpInfo.isMatchingInputConstraint())) && + "Only address as input operand is allowed."); switch (OpInfo.Type) { case InlineAsm::isOutput: @@ -8865,8 +9023,11 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, break; } - if (OpInfo.ConstraintType == TargetLowering::C_Memory) { - assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!"); + if (OpInfo.ConstraintType == TargetLowering::C_Memory || + OpInfo.ConstraintType == TargetLowering::C_Address) { + assert((OpInfo.isIndirect || + OpInfo.ConstraintType != TargetLowering::C_Memory) && + "Operand must be indirect to be a mem!"); assert(InOperandVal.getValueType() == TLI.getPointerTy(DAG.getDataLayout()) && "Memory operands expect pointer values"); @@ -9004,6 +9165,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, break; case TargetLowering::C_Memory: break; // Already handled. + case TargetLowering::C_Address: + break; // Silence warning. case TargetLowering::C_Unknown: assert(false && "Unexpected unknown constraint"); } @@ -9950,8 +10113,9 @@ SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("LowerOperation not implemented for this target!"); } -void -SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { +void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, + unsigned Reg, + ISD::NodeType ExtendType) { SDValue Op = getNonRegisterValue(V); assert((Op.getOpcode() != ISD::CopyFromReg || cast(Op.getOperand(1))->getReg() != Reg) && @@ -9966,10 +10130,11 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { None); // This is not an ABI copy. 
SDValue Chain = DAG.getEntryNode(); - ISD::NodeType ExtendType = ISD::ANY_EXTEND; - auto PreferredExtendIt = FuncInfo.PreferredExtendType.find(V); - if (PreferredExtendIt != FuncInfo.PreferredExtendType.end()) - ExtendType = PreferredExtendIt->second; + if (ExtendType == ISD::ANY_EXTEND) { + auto PreferredExtendIt = FuncInfo.PreferredExtendType.find(V); + if (PreferredExtendIt != FuncInfo.PreferredExtendType.end()) + ExtendType = PreferredExtendIt->second; + } RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType); PendingExports.push_back(Chain); } @@ -10542,6 +10707,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { /// the end. void SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Instruction *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; @@ -10579,7 +10745,13 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { unsigned &RegOut = ConstantsOut[C]; if (RegOut == 0) { RegOut = FuncInfo.CreateRegs(C); - CopyValueToVirtualRegister(C, RegOut); + // We need to zero/sign extend ConstantInt phi operands to match + // assumptions in FunctionLoweringInfo::ComputePHILiveOutRegInfo. + ISD::NodeType ExtendType = ISD::ANY_EXTEND; + if (auto *CI = dyn_cast(C)) + ExtendType = TLI.signExtendConstant(CI) ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + CopyValueToVirtualRegister(C, RegOut, ExtendType); } Reg = RegOut; } else { @@ -10599,7 +10771,6 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // Remember that this register needs to added to the machine PHI node as // the input for this MBB. SmallVector ValueVTs; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs); for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) { EVT VT = ValueVTs[vti]; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index ea48042a5dcf..72cca3d9b001 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -284,7 +284,8 @@ public: return CurInst ? CurInst->getDebugLoc() : DebugLoc(); } - void CopyValueToVirtualRegister(const Value *V, unsigned Reg); + void CopyValueToVirtualRegister(const Value *V, unsigned Reg, + ISD::NodeType ExtendType = ISD::ANY_EXTEND); void visit(const Instruction &I); @@ -527,7 +528,7 @@ private: void visitInsertElement(const User &I); void visitShuffleVector(const User &I); - void visitExtractValue(const User &I); + void visitExtractValue(const ExtractValueInst &I); void visitInsertValue(const User &I); void visitLandingPad(const LandingPadInst &LP); @@ -570,6 +571,11 @@ private: SmallVector &OpValues, bool IsGather); void visitVPStoreScatter(const VPIntrinsic &VPIntrin, SmallVector &OpValues, bool IsScatter); + void visitVPStridedLoad(const VPIntrinsic &VPIntrin, EVT VT, + SmallVectorImpl &OpValues); + void visitVPStridedStore(const VPIntrinsic &VPIntrin, + SmallVectorImpl &OpValues); + void visitVPCmp(const VPCmpIntrinsic &VPIntrin); void visitVectorPredicationIntrinsic(const VPIntrinsic &VPIntrin); void visitVAStart(const CallInst &I); @@ -602,12 +608,22 @@ private: void emitInlineAsmError(const CallBase &Call, const Twine &Message); + /// An enum that states to emit func argument dbg value the kind of intrinsic + /// it originally had. 
This controls the internal behavior of + /// EmitFuncArgumentDbgValue. + enum class FuncArgumentDbgValueKind { + Value, // This was originally a llvm.dbg.value. + Addr, // This was originally a llvm.dbg.addr. + Declare, // This was originally a llvm.dbg.declare. + }; + /// If V is a function argument then create corresponding DBG_VALUE machine /// instruction for it now. At the end of instruction selection, they will be /// inserted into the entry BB. bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable, DIExpression *Expr, DILocation *DL, - bool IsDbgDeclare, const SDValue &N); + FuncArgumentDbgValueKind Kind, + const SDValue &N); /// Return the next block after MBB, or nullptr if there is none. MachineBasicBlock *NextBlock(MachineBasicBlock *MBB); @@ -673,9 +689,7 @@ struct RegsForValue { const DataLayout &DL, unsigned Reg, Type *Ty, Optional<CallingConv::ID> CC); - bool isABIMangled() const { - return CallConv.hasValue(); - } + bool isABIMangled() const { return CallConv.has_value(); } /// Add the specified values to this one. void append(const RegsForValue &RHS) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 77e9e53668f9..bbfc6e5ef64f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -10,9 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "SDNodeDbgValue.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -45,7 +45,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" -#include "SDNodeDbgValue.h" #include #include @@ -231,6 +230,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::MUL: return "mul"; case ISD::MULHU: return "mulhu"; case ISD::MULHS: return "mulhs"; + case ISD::AVGFLOORU: return "avgflooru"; + case ISD::AVGFLOORS: return "avgfloors"; + case ISD::AVGCEILU: return "avgceilu"; + case ISD::AVGCEILS: return "avgceils"; case ISD::ABDS: return "abds"; case ISD::ABDU: return "abdu"; case ISD::SDIV: return "sdiv"; @@ -267,6 +270,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FCOPYSIGN: return "fcopysign"; case ISD::FGETSIGN: return "fgetsign"; case ISD::FCANONICALIZE: return "fcanonicalize"; + case ISD::IS_FPCLASS: return "is_fpclass"; case ISD::FPOW: return "fpow"; case ISD::STRICT_FPOW: return "strict_fpow"; case ISD::SMIN: return "smin"; @@ -361,6 +365,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STRICT_FP16_TO_FP: return "strict_fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; case ISD::STRICT_FP_TO_FP16: return "strict_fp_to_fp16"; + case ISD::BF16_TO_FP: return "bf16_to_fp"; + case ISD::FP_TO_BF16: return "fp_to_bf16"; case ISD::LROUND: return "lround"; case ISD::STRICT_LROUND: return "strict_lround"; case ISD::LLROUND: return "llround"; @@ -814,6 +820,8 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) { if (LN->hasOffset()) OS << "<" << LN->getOffset() << " to " << LN->getOffset() + LN->getSize() << ">"; + } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) { + OS << '<' << AA->getAlign().value() << '>'; } if (VerboseDAGDumping) { diff --git
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3c786904620a..2b63359c2b1b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -15,11 +15,9 @@ #include "SelectionDAGBuilder.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -29,6 +27,7 @@ #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -69,7 +68,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -82,7 +80,6 @@ #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" @@ -370,8 +367,8 @@ static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT, // PHI. for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast(I)); ++I) for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - ConstantExpr *CE = dyn_cast(PN->getIncomingValue(i)); - if (!CE || !CE->canTrap()) continue; + Constant *C = dyn_cast(PN->getIncomingValue(i)); + if (!C || !C->canTrap()) continue; // The only case we have to worry about is when the edge is critical. // Since this block has a PHI Node, we assume it has multiple input @@ -425,6 +422,11 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { const Function &Fn = mf.getFunction(); MF = &mf; + // Decide what flavour of variable location debug-info will be used, before + // we change the optimisation level. + UseInstrRefDebugInfo = mf.useDebugInstrRef(); + CurDAG->useInstrRefDebugInfo(UseInstrRefDebugInfo); + // Reset the target options before resetting the optimization // level below. // FIXME: This is a horrible hack and should be processed via @@ -654,7 +656,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // For debug-info, in instruction referencing mode, we need to perform some // post-isel maintenence. - MF->finalizeDebugInstrRefs(); + if (UseInstrRefDebugInfo) + MF->finalizeDebugInstrRefs(); // Determine if there are any calls in this machine function. 
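// Shape of the instruction-referencing handshake added in this file: the
// flavour of variable-location debug info is decided once per function,
// before optimisation levels are adjusted, and the later phases (the DAG,
// FastISel, finalization) all consult the same cached flag. Illustrative
// sketch only; member names are assumptions.
struct ISelDebugFlavourModel {
  bool UseInstrRef = false;
  void decideOnce(bool FunctionUsesInstrRef) {
    UseInstrRef = FunctionUsesInstrRef; // decided exactly once, up front
  }
  void finish() {
    if (UseInstrRef) {
      // finalizeDebugInstrRefs() runs only when the mode is actually in use.
    }
  }
};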
MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -703,6 +706,7 @@ static void reportFastISelFailure(MachineFunction &MF, report_fatal_error(Twine(R.getMsg())); ORE.emit(R); + LLVM_DEBUG(dbgs() << R.getMsg() << "\n"); } void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, @@ -1380,6 +1384,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { if (TM.Options.EnableFastISel) { LLVM_DEBUG(dbgs() << "Enabling fast-isel\n"); FastIS = TLI->createFastISel(*FuncInfo, LibInfo); + if (FastIS) + FastIS->useInstrRefDebugInfo(UseInstrRefDebugInfo); } ReversePostOrderTraversal RPOT(&Fn); @@ -1519,6 +1525,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { BeforeInst->hasOneUse() && FastIS->tryToFoldLoad(cast(BeforeInst), Inst)) { // If we succeeded, don't re-select the load. + LLVM_DEBUG(dbgs() + << "FastISel folded load: " << *BeforeInst << "\n"); BI = std::next(BasicBlock::const_iterator(BeforeInst)); --NumFastIselRemaining; ++NumFastIselSuccess; @@ -3264,6 +3272,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains"); ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode()); + // If the chained node is not the root, we can't fold it if it has + // multiple uses. // FIXME: What if other value results of the node have uses not matched // by this pattern? if (ChainNodesMatched.back() != NodeToMatch && @@ -3301,6 +3311,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains"); ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode()); + // If the chained node is not the root, we can't fold it if it has + // multiple uses. // FIXME: What if other value results of the node have uses not matched // by this pattern? if (ChainNodesMatched.back() != NodeToMatch && @@ -3439,12 +3451,10 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // such nodes must have a chain, it suffices to check ChainNodesMatched. // We need to perform this check before potentially modifying one of the // nodes via MorphNode. - bool MayRaiseFPException = false; - for (auto *N : ChainNodesMatched) - if (mayRaiseFPException(N) && !N->getFlags().hasNoFPExcept()) { - MayRaiseFPException = true; - break; - } + bool MayRaiseFPException = + llvm::any_of(ChainNodesMatched, [this](SDNode *N) { + return mayRaiseFPException(N) && !N->getFlags().hasNoFPExcept(); + }); // Create the node. MachineSDNode *Res = nullptr; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index d022e2a23ea0..b66eeb6d2bb1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -13,15 +13,11 @@ #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/IR/Constants.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; #define DEBUG_TYPE "dag-printer" @@ -181,11 +177,11 @@ LLVM_DUMP_METHOD void SelectionDAG::dumpDotGraph(const Twine &FileName, /// clearGraphAttrs - Clear all previously defined node graph attributes. 
/// Intended to be used from a debugging tool (e.g. gdb). void SelectionDAG::clearGraphAttrs() { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS NodeGraphAttrs.clear(); #else - errs() << "SelectionDAG::clearGraphAttrs is only available in debug builds" - << " on systems with Graphviz or gv!\n"; + errs() << "SelectionDAG::clearGraphAttrs is only available in builds with " + << "ABI breaking checks enabled on systems with Graphviz or gv!\n"; #endif } @@ -193,11 +189,11 @@ /// setGraphAttrs - Set graph attributes for a node. (e.g. "color=red".) /// void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS NodeGraphAttrs[N] = Attrs; #else - errs() << "SelectionDAG::setGraphAttrs is only available in debug builds" - << " on systems with Graphviz or gv!\n"; + errs() << "SelectionDAG::setGraphAttrs is only available in builds with " + << "ABI breaking checks enabled on systems with Graphviz or gv!\n"; #endif } @@ -205,7 +201,7 @@ /// getGraphAttrs - Get graph attributes for a node. (e.g. "color=red".) /// Used from getNodeAttributes. std::string SelectionDAG::getGraphAttrs(const SDNode *N) const { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS std::map<const SDNode *, std::string>::const_iterator I = NodeGraphAttrs.find(N); @@ -214,8 +210,8 @@ if (I != NodeGraphAttrs.end()) return I->second; else return ""; #else - errs() << "SelectionDAG::getGraphAttrs is only available in debug builds" - << " on systems with Graphviz or gv!\n"; + errs() << "SelectionDAG::getGraphAttrs is only available in builds with " + << "ABI breaking checks enabled on systems with Graphviz or gv!\n"; return std::string(); #endif } @@ -223,11 +219,11 @@ /// setGraphColor - Convenience for setting node color attribute.
/// void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) { -#ifndef NDEBUG +#if LLVM_ENABLE_ABI_BREAKING_CHECKS NodeGraphAttrs[N] = std::string("color=") + Color; #else - errs() << "SelectionDAG::setGraphColor is only available in debug builds" - << " on systems with Graphviz or gv!\n"; + errs() << "SelectionDAG::setGraphColor is only available in builds with " + << "ABI breaking checks enabled on systems with Graphviz or gv!\n"; #endif } diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index dfda7d8b9f81..19a52fde44c1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -17,7 +17,10 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -27,6 +30,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -168,7 +172,7 @@ static Optional findPreviousSpillSlot(const Value *Val, const auto &RelocationMap = Builder.FuncInfo.StatepointRelocationMaps[Relocate->getStatepoint()]; - auto It = RelocationMap.find(Relocate->getDerivedPtr()); + auto It = RelocationMap.find(Relocate); if (It == RelocationMap.end()) return None; @@ -192,10 +196,10 @@ static Optional findPreviousSpillSlot(const Value *Val, for (auto &IncomingValue : Phi->incoming_values()) { Optional SpillSlot = findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1); - if (!SpillSlot.hasValue()) + if (!SpillSlot) return None; - if (MergedResult.hasValue() && *MergedResult != *SpillSlot) + if (MergedResult && *MergedResult != *SpillSlot) return None; MergedResult = SpillSlot; @@ -276,7 +280,7 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue, const int LookUpDepth = 6; Optional Index = findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth); - if (!Index.hasValue()) + if (!Index) return; const auto &StatepointSlots = Builder.FuncInfo.StatepointStackSlots; @@ -526,14 +530,14 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, GCStrategy &S = GFI->getStrategy(); for (const Value *V : SI.Bases) { auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); - if (Opt.hasValue()) { + if (Opt) { assert(Opt.getValue() && "non gc managed base pointer found in statepoint"); } } for (const Value *V : SI.Ptrs) { auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); - if (Opt.hasValue()) { + if (Opt) { assert(Opt.getValue() && "non gc managed derived pointer found in statepoint"); } @@ -880,8 +884,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops); DAG.setNodeMemRefs(StatepointMCNode, MemRefs); - // For values lowered to tied-defs, create the virtual registers. Note that - // for simplicity, we *always* create a vreg even within a single block. + // For values lowered to tied-defs, create the virtual registers if used + // in other blocks. For local gc.relocate record appropriate statepoint + // result in StatepointLoweringState. 
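// Decision model for the relocate-lowering split described above (sketch;
// record kinds mirror the patch, the helper is illustrative): a gc.relocate
// in the statepoint's own block reuses the statepoint's SDValue result
// directly, while cross-block relocates still round-trip through a virtual
// register.
enum class RelocRecordModel { SDValueNode, VReg, SpillOrOther };
static RelocRecordModel classifyRelocate(bool LoweredAsVReg, bool SameBlock) {
  if (!LoweredAsVReg)
    return RelocRecordModel::SpillOrOther; // spill/no-relocate paths unchanged
  return SameBlock ? RelocRecordModel::SDValueNode : RelocRecordModel::VReg;
}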
DenseMap<SDValue, Register> VirtRegs; for (const auto *Relocate : SI.GCRelocates) { Value *Derived = Relocate->getDerivedPtr(); SDValue SD = getValue(Derived); if (!LowerAsVReg.count(SD)) continue; + SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); + + // Handle local relocate. Note that different relocates might + // map to the same SDValue. + if (SI.StatepointInstr->getParent() == Relocate->getParent()) { + SDValue Res = StatepointLowering.getLocation(SD); + if (Res) + assert(Res == Relocated); + else + StatepointLowering.setLocation(SD, Relocated); + continue; + } + // Handle multiple gc.relocates of the same input efficiently. if (VirtRegs.count(SD)) continue; - SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); - auto *RetTy = Relocate->getType(); Register Reg = FuncInfo.CreateRegs(RetTy); RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), @@ -915,8 +931,13 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( SDValue SDV = getValue(V); SDValue Loc = StatepointLowering.getLocation(SDV); + bool IsLocal = (Relocate->getParent() == StatepointInstr->getParent()); + RecordType Record; - if (LowerAsVReg.count(SDV)) { + if (IsLocal && LowerAsVReg.count(SDV)) { + // Result is already stored in StatepointLowering. + Record.type = RecordType::SDValueNode; + } else if (LowerAsVReg.count(SDV)) { Record.type = RecordType::VReg; assert(VirtRegs.count(SDV)); Record.payload.Reg = VirtRegs[SDV]; @@ -932,7 +953,7 @@ if (Relocate->getParent() != StatepointInstr->getParent()) ExportFromCurrentBlock(V); } - RelocationMap[V] = Record; + RelocationMap[Relocate] = Record; } @@ -1148,8 +1169,8 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl( unsigned DefaultID = StatepointDirectives::DeoptBundleStatepointID; auto SD = parseStatepointDirectivesFromAttrs(Call->getAttributes()); - SI.ID = SD.StatepointID.getValueOr(DefaultID); - SI.NumPatchBytes = SD.NumPatchBytes.getValueOr(0); + SI.ID = SD.StatepointID.value_or(DefaultID); + SI.NumPatchBytes = SD.NumPatchBytes.value_or(0); SI.DeoptState = ArrayRef<const Use>(DeoptBundle.Inputs.begin(), DeoptBundle.Inputs.end()); @@ -1210,11 +1231,19 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { const Value *DerivedPtr = Relocate.getDerivedPtr(); auto &RelocationMap = FuncInfo.StatepointRelocationMaps[Relocate.getStatepoint()]; - auto SlotIt = RelocationMap.find(DerivedPtr); + auto SlotIt = RelocationMap.find(&Relocate); assert(SlotIt != RelocationMap.end() && "Relocating not lowered gc value"); const RecordType &Record = SlotIt->second; // If relocation was done via virtual register...
+ if (Record.type == RecordType::SDValueNode) { + assert(Relocate.getStatepoint()->getParent() == Relocate.getParent() && + "Nonlocal gc.relocate mapped via SDValue"); + SDValue SDV = StatepointLowering.getLocation(getValue(DerivedPtr)); + assert(SDV.getNode() && "empty SDValue"); + setValue(&Relocate, SDV); + return; + } if (Record.type == RecordType::VReg) { Register InReg = Record.payload.Reg; RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f6d1fa87676f..a6b471ea22b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -13,13 +13,13 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" @@ -30,7 +30,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include using namespace llvm; @@ -94,6 +93,8 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, // (We look for a CopyFromReg reading a virtual register that is used // for the function live-in value of register Reg) SDValue Value = OutVals[I]; + if (Value->getOpcode() == ISD::AssertZext) + Value = Value.getOperand(0); if (Value->getOpcode() != ISD::CopyFromReg) return false; Register ArgReg = cast(Value->getOperand(1))->getReg(); @@ -121,7 +122,7 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError); Alignment = Call->getParamStackAlign(ArgIdx); IndirectType = nullptr; - assert(IsByVal + IsPreallocated + IsInAlloca <= 1 && + assert(IsByVal + IsPreallocated + IsInAlloca + IsSRet <= 1 && "multiple ABI attributes?"); if (IsByVal) { IndirectType = Call->getParamByValType(ArgIdx); @@ -132,6 +133,8 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, IndirectType = Call->getParamPreallocatedType(ArgIdx); if (IsInAlloca) IndirectType = Call->getParamInAllocaType(ArgIdx); + if (IsSRet) + IndirectType = Call->getParamStructRetType(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a @@ -193,7 +196,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, bool TargetLowering::findOptimalMemOpLowering( std::vector &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, unsigned SrcAS, const AttributeList &FuncAttributes) const { - if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign()) + if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() && + Op.getSrcAlign() < Op.getDstAlign()) return false; EVT VT = getOptimalMemOpType(Op, FuncAttributes); @@ -905,6 +909,132 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedVectorElts( Depth); } +// Attempt to form ext(avgfloor(A, B)) from shr(add(ext(A), ext(B)), 1). 
+// or to form ext(avgceil(A, B)) from shr(add(ext(A), ext(B), 1), 1). +static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI, + const APInt &DemandedBits, + const APInt &DemandedElts, + unsigned Depth) { + assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) && + "SRL or SRA node is required here!"); + // Is the right shift using an immediate value of 1? + ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1), DemandedElts); + if (!N1C || !N1C->isOne()) + return SDValue(); + + // We are looking for an avgfloor + // add(ext, ext) + // or one of these as an avgceil + // add(add(ext, ext), 1) + // add(add(ext, 1), ext) + // add(ext, add(ext, 1)) + SDValue Add = Op.getOperand(0); + if (Add.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue ExtOpA = Add.getOperand(0); + SDValue ExtOpB = Add.getOperand(1); + auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) { + ConstantSDNode *ConstOp; + if ((ConstOp = isConstOrConstSplat(Op1, DemandedElts)) && + ConstOp->isOne()) { + ExtOpA = Op2; + ExtOpB = Op3; + return true; + } + if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) && + ConstOp->isOne()) { + ExtOpA = Op1; + ExtOpB = Op3; + return true; + } + if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) && + ConstOp->isOne()) { + ExtOpA = Op1; + ExtOpB = Op2; + return true; + } + return false; + }; + bool IsCeil = + (ExtOpA.getOpcode() == ISD::ADD && + MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) || + (ExtOpB.getOpcode() == ISD::ADD && + MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA)); + + // If the shift is signed (sra): + // - Needs >= 2 sign bits for both operands. + // - Needs >= 2 zero bits. + // If the shift is unsigned (srl): + // - Needs >= 1 zero bit for both operands. + // - Needs 1 demanded bit zero and >= 2 sign bits. + unsigned ShiftOpc = Op.getOpcode(); + bool IsSigned = false; + unsigned KnownBits; + unsigned NumSignedA = DAG.ComputeNumSignBits(ExtOpA, DemandedElts, Depth); + unsigned NumSignedB = DAG.ComputeNumSignBits(ExtOpB, DemandedElts, Depth); + unsigned NumSigned = std::min(NumSignedA, NumSignedB) - 1; + unsigned NumZeroA = + DAG.computeKnownBits(ExtOpA, DemandedElts, Depth).countMinLeadingZeros(); + unsigned NumZeroB = + DAG.computeKnownBits(ExtOpB, DemandedElts, Depth).countMinLeadingZeros(); + unsigned NumZero = std::min(NumZeroA, NumZeroB); + + switch (ShiftOpc) { + default: + llvm_unreachable("Unexpected ShiftOpc in combineShiftToAVG"); + case ISD::SRA: { + if (NumZero >= 2 && NumSigned < NumZero) { + IsSigned = false; + KnownBits = NumZero; + break; + } + if (NumSigned >= 1) { + IsSigned = true; + KnownBits = NumSigned; + break; + } + return SDValue(); + } + case ISD::SRL: { + if (NumZero >= 1 && NumSigned < NumZero) { + IsSigned = false; + KnownBits = NumZero; + break; + } + if (NumSigned >= 1 && DemandedBits.isSignBitClear()) { + IsSigned = true; + KnownBits = NumSigned; + break; + } + return SDValue(); + } + } + + unsigned AVGOpc = IsCeil ? (IsSigned ? ISD::AVGCEILS : ISD::AVGCEILU) + : (IsSigned ? ISD::AVGFLOORS : ISD::AVGFLOORU); + + // Find the smallest power-of-2 type that is legal for this vector size and + // operation, given the original type size and the number of known sign/zero + // bits.
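As an aside, the identity this matcher relies on is easy to check in standalone C++ for unsigned 8-bit lanes: truncating the widened add-plus-shift gives exactly the narrow fixed-point average (a minimal sketch, assuming AVGFLOORU/AVGCEILU semantics of (A+B)/2 and (A+B+1)/2):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      uint16_t Wide = uint16_t(A + B);          // add(ext(A), ext(B))
      uint8_t Floor = uint8_t(Wide >> 1);       // shr(..., 1), then truncate
      uint8_t Ceil = uint8_t((Wide + 1u) >> 1); // shr(add(..., 1), 1)
      assert(Floor == uint8_t((A + B) / 2));    // AVGFLOORU semantics
      assert(Ceil == uint8_t((A + B + 1) / 2)); // AVGCEILU semantics
    }
  }
  return 0;
}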
+ EVT VT = Op.getValueType(); + unsigned MinWidth = + std::max(VT.getScalarSizeInBits() - KnownBits, 8); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), PowerOf2Ceil(MinWidth)); + if (VT.isVector()) + NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount()); + if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) + return SDValue(); + + SDLoc DL(Op); + SDValue ResultAVG = + DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA), + DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB)); + return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, + ResultAVG); +} + /// Look at Op. At this point, we know that only the OriginalDemandedBits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the @@ -989,7 +1119,7 @@ bool TargetLowering::SimplifyDemandedBits( KnownBits SrcKnown; SDValue Src = Op.getOperand(0); unsigned SrcBitWidth = Src.getScalarValueSizeInBits(); - APInt SrcDemandedBits = DemandedBits.zextOrSelf(SrcBitWidth); + APInt SrcDemandedBits = DemandedBits.zext(SrcBitWidth); if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1)) return true; @@ -1105,7 +1235,7 @@ bool TargetLowering::SimplifyDemandedBits( break; uint64_t Idx = Op.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); if (SimplifyDemandedBits(Src, DemandedBits, DemandedSrcElts, Known, TLO, Depth + 1)) @@ -1406,6 +1536,19 @@ bool TargetLowering::SimplifyDemandedBits( if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO)) return true; + // Only known if known in both the LHS and RHS. + Known = KnownBits::commonBits(Known, Known2); + break; + case ISD::VSELECT: + if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, DemandedElts, + Known, TLO, Depth + 1)) + return true; + if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedElts, + Known2, TLO, Depth + 1)) + return true; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // Only known if known in both the LHS and RHS. Known = KnownBits::commonBits(Known, Known2); break; @@ -1542,6 +1685,16 @@ bool TargetLowering::SimplifyDemandedBits( // low bits known zero. Known.Zero.setLowBits(ShAmt); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SHL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // Try shrinking the operation as long as the shift amount will still be // in range. if ((ShAmt < DemandedBits.getActiveBits()) && @@ -1567,6 +1720,11 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); + // Try to match AVG patterns. 
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits, + DemandedElts, Depth + 1)) + return TLO.CombineTo(Op, AVG); + if (const APInt *SA = TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) { unsigned ShAmt = SA->getZExtValue(); @@ -1633,6 +1791,11 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.isOne()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); + // Try to match AVG patterns. + if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits, + DemandedElts, Depth + 1)) + return TLO.CombineTo(Op, AVG); + if (const APInt *SA = TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) { unsigned ShAmt = SA->getZExtValue(); @@ -1727,6 +1890,22 @@ bool TargetLowering::SimplifyDemandedBits( Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); Known.One |= Known2.One; Known.Zero |= Known2.Zero; + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!Demanded0.isAllOnes() || !Demanded1.isAllOnes() || + !DemandedElts.isAllOnes()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, Demanded0, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, Demanded1, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + DemandedOp0 = DemandedOp0 ? DemandedOp0 : Op0; + DemandedOp1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, DemandedOp0, + DemandedOp1, Op2); + return TLO.CombineTo(Op, NewOp); + } + } } // For pow-2 bitwidths we only demand the bottom modulo amt bits. @@ -1899,7 +2078,8 @@ bool TargetLowering::SimplifyDemandedBits( // bit is demanded. InputDemandedBits.setBit(ExVTBits - 1); - if (SimplifyDemandedBits(Op0, InputDemandedBits, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, InputDemandedBits, DemandedElts, Known, TLO, + Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); @@ -1965,7 +2145,7 @@ bool TargetLowering::SimplifyDemandedBits( } APInt InDemandedBits = DemandedBits.trunc(InBits); - APInt InDemandedElts = DemandedElts.zextOrSelf(InElts); + APInt InDemandedElts = DemandedElts.zext(InElts); if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO, Depth + 1)) return true; @@ -2002,7 +2182,7 @@ bool TargetLowering::SimplifyDemandedBits( } APInt InDemandedBits = DemandedBits.trunc(InBits); - APInt InDemandedElts = DemandedElts.zextOrSelf(InElts); + APInt InDemandedElts = DemandedElts.zext(InElts); // Since some of the sign extended bits are demanded, we know that the sign // bit is demanded. @@ -2046,7 +2226,7 @@ bool TargetLowering::SimplifyDemandedBits( return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); APInt InDemandedBits = DemandedBits.trunc(InBits); - APInt InDemandedElts = DemandedElts.zextOrSelf(InElts); + APInt InDemandedElts = DemandedElts.zext(InElts); if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO, Depth + 1)) return true; @@ -2265,9 +2445,27 @@ bool TargetLowering::SimplifyDemandedBits( break; } case ISD::MUL: - // 'Quadratic Reciprocity': mul(x,x) -> 0 if we're only demanding bit[1] - if (DemandedBits == 2 && Op.getOperand(0) == Op.getOperand(1)) - return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT)); + if (DemandedBits.isPowerOf2()) { + // The LSB of X*Y is set only if (X & 1) == 1 and (Y & 1) == 1. + // If we demand exactly one bit N and we have "X * (C' << N)" where C' is + // odd (has LSB set), then the left-shifted low bit of X is the answer. 
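A brute-force check of that claim, with hypothetical constants C' = 5 and N = 3:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned N = 3;
  const uint32_t MulC = 5u << N;        // C' = 5 is odd, shifted left by N
  const uint32_t DemandedBit = 1u << N; // only bit N is demanded
  for (uint32_t X = 0; X < (1u << 16); ++X)
    // Bit N of X * MulC equals bit N of X << N, so the multiply can be
    // replaced by the cheaper shift when only that bit matters.
    assert(((X * MulC) & DemandedBit) == ((X << N) & DemandedBit));
  return 0;
}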
+ unsigned CTZ = DemandedBits.countTrailingZeros(); + ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1), DemandedElts); + if (C && C->getAPIntValue().countTrailingZeros() == CTZ) { + EVT ShiftAmtTy = getShiftAmountTy(VT, TLO.DAG.getDataLayout()); + SDValue AmtC = TLO.DAG.getConstant(CTZ, dl, ShiftAmtTy); + SDValue Shl = TLO.DAG.getNode(ISD::SHL, dl, VT, Op.getOperand(0), AmtC); + return TLO.CombineTo(Op, Shl); + } + } + // For a squared value "X * X", the bottom 2 bits are 0 and X[0] because: + // X * X is odd iff X is odd. + // 'Quadratic Reciprocity': X * X -> 0 for bit[1] + if (Op.getOperand(0) == Op.getOperand(1) && DemandedBits.ult(4)) { + SDValue One = TLO.DAG.getConstant(1, dl, VT); + SDValue And1 = TLO.DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), One); + return TLO.CombineTo(Op, And1); + } LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { @@ -2330,6 +2528,49 @@ bool TargetLowering::SimplifyDemandedBits( return TLO.CombineTo(Op, NewOp); } + // Match a multiply with a disguised negated-power-of-2 and convert to + // an equivalent shift-left amount. + // Example: (X * MulC) + Op1 --> Op1 - (X << log2(-MulC)) + auto getShiftLeftAmt = [&HighMask](SDValue Mul) -> unsigned { + if (Mul.getOpcode() != ISD::MUL || !Mul.hasOneUse()) + return 0; + + // Don't touch opaque constants. Also, ignore zero and power-of-2 + // multiplies. Those will get folded later. + ConstantSDNode *MulC = isConstOrConstSplat(Mul.getOperand(1)); + if (MulC && !MulC->isOpaque() && !MulC->isZero() && + !MulC->getAPIntValue().isPowerOf2()) { + APInt UnmaskedC = MulC->getAPIntValue() | HighMask; + if (UnmaskedC.isNegatedPowerOf2()) + return (-UnmaskedC).logBase2(); + } + return 0; + }; + + auto foldMul = [&](ISD::NodeType NT, SDValue X, SDValue Y, unsigned ShlAmt) { + EVT ShiftAmtTy = getShiftAmountTy(VT, TLO.DAG.getDataLayout()); + SDValue ShlAmtC = TLO.DAG.getConstant(ShlAmt, dl, ShiftAmtTy); + SDValue Shl = TLO.DAG.getNode(ISD::SHL, dl, VT, X, ShlAmtC); + SDValue Res = TLO.DAG.getNode(NT, dl, VT, Y, Shl); + return TLO.CombineTo(Op, Res); + }; + + if (isOperationLegalOrCustom(ISD::SHL, VT)) { + if (Op.getOpcode() == ISD::ADD) { + // (X * MulC) + Op1 --> Op1 - (X << log2(-MulC)) + if (unsigned ShAmt = getShiftLeftAmt(Op0)) + return foldMul(ISD::SUB, Op0.getOperand(0), Op1, ShAmt); + // Op0 + (X * MulC) --> Op0 - (X << log2(-MulC)) + if (unsigned ShAmt = getShiftLeftAmt(Op1)) + return foldMul(ISD::SUB, Op1.getOperand(0), Op0, ShAmt); + } + if (Op.getOpcode() == ISD::SUB) { + // Op0 - (X * MulC) --> Op0 + (X << log2(-MulC)) + if (unsigned ShAmt = getShiftLeftAmt(Op1)) + return foldMul(ISD::ADD, Op1.getOperand(0), Op0, ShAmt); + } + } + LLVM_FALLTHROUGH; } default: @@ -2347,7 +2588,8 @@ bool TargetLowering::SimplifyDemandedBits( // If we know the value of all of the demanded bits, return this as a // constant. - if (DemandedBits.isSubsetOf(Known.Zero | Known.One)) { + if (!isTargetCanonicalConstantNode(Op) && + DemandedBits.isSubsetOf(Known.Zero | Known.One)) { // Avoid folding to a constant if any OpaqueConstant is involved.
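Returning to the multiply rewrite above: it is a plain two's-complement identity, checkable in isolation (a sketch with an assumed shift amount K = 4):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned K = 4;
  const uint32_t MulC = uint32_t(0) - (1u << K); // disguised negated power of 2
  for (uint32_t X = 0; X < (1u << 12); ++X)
    for (uint32_t Y = 0; Y < (1u << 12); Y += 131) {
      assert(X * MulC + Y == Y - (X << K)); // (X * MulC) + Op1 --> Op1 - (X << K)
      assert(Y - X * MulC == Y + (X << K)); // Op0 - (X * MulC) --> Op0 + (X << K)
    }
  return 0;
}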
const SDNode *N = Op.getNode(); for (SDNode *Op : @@ -2370,13 +2612,12 @@ bool TargetLowering::SimplifyDemandedBits( bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, - APInt &KnownUndef, - APInt &KnownZero, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); + APInt KnownUndef, KnownZero; bool Simplified = SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO); if (Simplified) { @@ -2447,6 +2688,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef = KnownZero = APInt::getZero(NumElts); + const TargetLowering &TLI = TLO.DAG.getTargetLoweringInfo(); + if (!TLI.shouldSimplifyDemandedVectorElts(Op, TLO)) + return false; + // TODO: For now we assume we know nothing about scalable vectors. if (VT.isScalableVector()) return false; @@ -2565,6 +2810,21 @@ bool TargetLowering::SimplifyDemandedVectorElts( if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcDemandedElts, Known, TLO, Depth + 1)) return true; + + // The bitcast has split each wide element into a number of + // narrow subelements. We have just computed the Known bits + // for wide elements. See if element splitting results in + // some subelements being zero. Only for demanded elements! + for (unsigned SubElt = 0; SubElt != Scale; ++SubElt) { + if (!Known.Zero.extractBits(EltSizeInBits, SubElt * EltSizeInBits) + .isAllOnes()) + continue; + for (unsigned SrcElt = 0; SrcElt != NumSrcElts; ++SrcElt) { + unsigned Elt = Scale * SrcElt + SubElt; + if (DemandedElts[Elt]) + KnownZero.setBit(Elt); + } + } } // If the src element is zero/undef then all the output elements will be - @@ -2646,6 +2906,25 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef.insertBits(SubUndef, i * NumSubElts); KnownZero.insertBits(SubZero, i * NumSubElts); } + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedElts.isAllOnes()) { + bool FoundNewSub = false; + SmallVector DemandedSubOps; + for (unsigned i = 0; i != NumSubVecs; ++i) { + SDValue SubOp = Op.getOperand(i); + APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts); + SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts( + SubOp, SubElts, TLO.DAG, Depth + 1); + DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp); + FoundNewSub = NewSubOp ? 
true : FoundNewSub; + } + if (FoundNewSub) { + SDValue NewOp = + TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedSubOps); + return TLO.CombineTo(Op, NewOp); + } + } break; } case ISD::INSERT_SUBVECTOR: { @@ -2699,7 +2978,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; uint64_t Idx = Op.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO, @@ -2858,7 +3137,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt SrcUndef, SrcZero; SDValue Src = Op.getOperand(0); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts); if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; @@ -3618,6 +3897,115 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT, return SDValue(); } +static SDValue foldSetCCWithRotate(EVT VT, SDValue N0, SDValue N1, + ISD::CondCode Cond, const SDLoc &dl, + SelectionDAG &DAG) { + if (Cond != ISD::SETEQ && Cond != ISD::SETNE) + return SDValue(); + + auto *C1 = isConstOrConstSplat(N1, /* AllowUndefs */ true); + if (!C1 || !(C1->isZero() || C1->isAllOnes())) + return SDValue(); + + auto getRotateSource = [](SDValue X) { + if (X.getOpcode() == ISD::ROTL || X.getOpcode() == ISD::ROTR) + return X.getOperand(0); + return SDValue(); + }; + + // Peek through a rotated value compared against 0 or -1: + // (rot X, Y) == 0/-1 --> X == 0/-1 + // (rot X, Y) != 0/-1 --> X != 0/-1 + if (SDValue R = getRotateSource(N0)) + return DAG.getSetCC(dl, VT, R, N1, Cond); + + // Peek through an 'or' of a rotated value compared against 0: + // or (rot X, Y), Z ==/!= 0 --> (or X, Z) ==/!= 0 + // or Z, (rot X, Y) ==/!= 0 --> (or X, Z) ==/!= 0 + // + // TODO: Add the 'and' with -1 sibling. + // TODO: Recurse through a series of 'or' ops to find the rotate. + EVT OpVT = N0.getValueType(); + if (N0.hasOneUse() && N0.getOpcode() == ISD::OR && C1->isZero()) { + if (SDValue R = getRotateSource(N0.getOperand(0))) { + SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, R, N0.getOperand(1)); + return DAG.getSetCC(dl, VT, NewOr, N1, Cond); + } + if (SDValue R = getRotateSource(N0.getOperand(1))) { + SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, R, N0.getOperand(0)); + return DAG.getSetCC(dl, VT, NewOr, N1, Cond); + } + } + + return SDValue(); +} + +static SDValue foldSetCCWithFunnelShift(EVT VT, SDValue N0, SDValue N1, + ISD::CondCode Cond, const SDLoc &dl, + SelectionDAG &DAG) { + // If we are testing for all-bits-clear, we might be able to do that with + // less shifting since bit-order does not matter. + if (Cond != ISD::SETEQ && Cond != ISD::SETNE) + return SDValue(); + + auto *C1 = isConstOrConstSplat(N1, /* AllowUndefs */ true); + if (!C1 || !C1->isZero()) + return SDValue(); + + if (!N0.hasOneUse() || + (N0.getOpcode() != ISD::FSHL && N0.getOpcode() != ISD::FSHR)) + return SDValue(); + + unsigned BitWidth = N0.getScalarValueSizeInBits(); + auto *ShAmtC = isConstOrConstSplat(N0.getOperand(2)); + if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth)) + return SDValue(); + + // Canonicalize fshr as fshl to reduce pattern-matching. 
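The rotate peek-through in foldSetCCWithRotate above holds because a rotate merely permutes bits, so it preserves all-zeros and all-ones inputs exactly; a small standalone check for 32-bit values:

#include <cassert>
#include <cstdint>

static uint32_t Rotl(uint32_t X, unsigned S) {
  S &= 31;
  return S == 0 ? X : (X << S) | (X >> (32 - S));
}

int main() {
  for (unsigned S = 0; S < 32; ++S) {
    assert(Rotl(0u, S) == 0u);          // (rot 0, S) == 0
    assert(Rotl(~0u, S) == ~0u);        // (rot -1, S) == -1
    assert(Rotl(0x12345678u, S) != 0u); // a nonzero input stays nonzero
  }
  return 0;
}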
+ unsigned ShAmt = ShAmtC->getZExtValue(); + if (N0.getOpcode() == ISD::FSHR) + ShAmt = BitWidth - ShAmt; + + // Match an 'or' with a specific operand 'Other' in either commuted variant. + SDValue X, Y; + auto matchOr = [&X, &Y](SDValue Or, SDValue Other) { + if (Or.getOpcode() != ISD::OR || !Or.hasOneUse()) + return false; + if (Or.getOperand(0) == Other) { + X = Or.getOperand(0); + Y = Or.getOperand(1); + return true; + } + if (Or.getOperand(1) == Other) { + X = Or.getOperand(1); + Y = Or.getOperand(0); + return true; + } + return false; + }; + + EVT OpVT = N0.getValueType(); + EVT ShAmtVT = N0.getOperand(2).getValueType(); + SDValue F0 = N0.getOperand(0); + SDValue F1 = N0.getOperand(1); + if (matchOr(F0, F1)) { + // fshl (or X, Y), X, C ==/!= 0 --> or (shl Y, C), X ==/!= 0 + SDValue NewShAmt = DAG.getConstant(ShAmt, dl, ShAmtVT); + SDValue Shift = DAG.getNode(ISD::SHL, dl, OpVT, Y, NewShAmt); + SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X); + return DAG.getSetCC(dl, VT, NewOr, N1, Cond); + } + if (matchOr(F1, F0)) { + // fshl X, (or X, Y), C ==/!= 0 --> or (srl Y, BW-C), X ==/!= 0 + SDValue NewShAmt = DAG.getConstant(BitWidth - ShAmt, dl, ShAmtVT); + SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, Y, NewShAmt); + SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X); + return DAG.getSetCC(dl, VT, NewOr, N1, Cond); + } + + return SDValue(); +} + /// Try to simplify a setcc built with the specified operands and cc. If it is /// unable to simplify it, return a null SDValue. SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, @@ -3632,13 +4020,17 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl)) return Fold; + bool N0ConstOrSplat = + isConstOrConstSplat(N0, /*AllowUndefs*/ false, /*AllowTruncate*/ true); + bool N1ConstOrSplat = + isConstOrConstSplat(N1, /*AllowUndefs*/ false, /*AllowTruncate*/ true); + // Ensure that the constant occurs on the RHS and fold constant comparisons. // TODO: Handle non-splat vector constants. All undef causes trouble. // FIXME: We can't yet fold constant scalable vector splats, so avoid an // infinite loop here when we encounter one. ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond); - if (isConstOrConstSplat(N0) && - (!OpVT.isScalableVector() || !isConstOrConstSplat(N1)) && + if (N0ConstOrSplat && (!OpVT.isScalableVector() || !N1ConstOrSplat) && (DCI.isBeforeLegalizeOps() || isCondCodeLegal(SwappedCC, N0.getSimpleValueType()))) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); @@ -3647,13 +4039,19 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // -- but in reverse order -- then try to commute the operands of this setcc // to match. A matching pair of setcc (cmp) and sub may be combined into 1 // instruction on some targets. 
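The constant-RHS folds a little further down rest on modular-arithmetic bijections: adding, xoring, or subtracting a constant is invertible, so the comparison can be moved onto X. A quick sanity check with hypothetical constants:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xDEADBEEFu, C2 = 17u;
  for (uint32_t X = 0; X < (1u << 16); ++X) {
    assert(((X + C1) == C2) == (X == C2 - C1));   // (X+C1) == C2 --> X == C2-C1
    assert(((X ^ C1) == C2) == (X == (C1 ^ C2))); // (X^C1) == C2 --> X == C1^C2
    assert(((C1 - X) == C2) == (X == C1 - C2));   // (C1-X) == C2 --> X == C1-C2
  }
  return 0;
}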
- if (!isConstOrConstSplat(N0) && !isConstOrConstSplat(N1) && + if (!N0ConstOrSplat && !N1ConstOrSplat && (DCI.isBeforeLegalizeOps() || isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) && DAG.doesNodeExist(ISD::SUB, DAG.getVTList(OpVT), {N1, N0}) && !DAG.doesNodeExist(ISD::SUB, DAG.getVTList(OpVT), {N0, N1})) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); + if (SDValue V = foldSetCCWithRotate(VT, N0, N1, Cond, dl, DAG)) + return V; + + if (SDValue V = foldSetCCWithFunnelShift(VT, N0, N1, Cond, dl, DAG)) + return V; + if (auto *N1C = isConstOrConstSplat(N1)) { const APInt &C1 = N1C->getAPIntValue(); @@ -4399,37 +4797,30 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (auto *RHSC = dyn_cast(N1)) { if (auto *LHSR = dyn_cast(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 - if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) { - return DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(RHSC->getAPIntValue()- - LHSR->getAPIntValue(), - dl, N0.getValueType()), Cond); - } - - // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0. - if (N0.getOpcode() == ISD::XOR) - // If we know that all of the inverted bits are zero, don't bother - // performing the inversion. - if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue())) - return - DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(LHSR->getAPIntValue() ^ - RHSC->getAPIntValue(), - dl, N0.getValueType()), - Cond); + if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) + return DAG.getSetCC( + dl, VT, N0.getOperand(0), + DAG.getConstant(RHSC->getAPIntValue() - LHSR->getAPIntValue(), + dl, N0.getValueType()), + Cond); + + // Turn (X^C1) == C2 --> X == C1^C2 + if (N0.getOpcode() == ISD::XOR && N0.getNode()->hasOneUse()) + return DAG.getSetCC( + dl, VT, N0.getOperand(0), + DAG.getConstant(LHSR->getAPIntValue() ^ RHSC->getAPIntValue(), + dl, N0.getValueType()), + Cond); } // Turn (C1-X) == C2 --> X == C1-C2 - if (auto *SUBC = dyn_cast(N0.getOperand(0))) { - if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) { - return - DAG.getSetCC(dl, VT, N0.getOperand(1), - DAG.getConstant(SUBC->getAPIntValue() - - RHSC->getAPIntValue(), - dl, N0.getValueType()), - Cond); - } - } + if (auto *SUBC = dyn_cast(N0.getOperand(0))) + if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) + return DAG.getSetCC( + dl, VT, N0.getOperand(1), + DAG.getConstant(SUBC->getAPIntValue() - RHSC->getAPIntValue(), + dl, N0.getValueType()), + Cond); // Could RHSC fold directly into a compare? if (RHSC->getValueType(0).getSizeInBits() <= 64) @@ -4582,13 +4973,14 @@ TargetLowering::getConstraintType(StringRef Constraint) const { case 'o': // offsetable case 'V': // not offsetable return C_Memory; + case 'p': // Address. + return C_Address; case 'n': // Simple Integer case 'E': // Floating Point Constant case 'F': // Floating Point Constant return C_Immediate; case 'i': // Simple Integer or Relocatable Constant case 's': // Relocatable Constant - case 'p': // Address. case 'X': // Allow ANY value. case 'I': // Target registers. case 'J': @@ -4826,8 +5218,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL, if (OpInfo.CallOperandVal) { llvm::Type *OpTy = OpInfo.CallOperandVal->getType(); if (OpInfo.isIndirect) { - OpTy = Call.getAttributes().getParamElementType(ArgNo); - assert(OpTy && "Indirect opernad must have elementtype attribute"); + OpTy = Call.getParamElementType(ArgNo); + assert(OpTy && "Indirect operand must have elementtype attribute"); } // Look for vector wrapped in a struct. 
e.g. { <16 x i8> }. @@ -4962,6 +5354,7 @@ static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { case TargetLowering::C_RegisterClass: return 2; case TargetLowering::C_Memory: + case TargetLowering::C_Address: return 3; } llvm_unreachable("Invalid constraint type"); @@ -5232,6 +5625,17 @@ SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return SDValue(); } +SDValue +TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N, 0); // Lower SREM as SREM + return SDValue(); +} + /// Given an ISD::SDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. @@ -7016,6 +7420,30 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, return true; } +SDValue +TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, + SelectionDAG &DAG) const { + unsigned Opcode = Node->getOpcode(); + assert((Opcode == ISD::FMINNUM || Opcode == ISD::FMAXNUM || + Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) && + "Wrong opcode"); + + if (Node->getFlags().hasNoNaNs()) { + ISD::CondCode Pred = Opcode == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT; + SDValue Op1 = Node->getOperand(0); + SDValue Op2 = Node->getOperand(1); + SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred); + // Copy FMF flags, but always set the no-signed-zeros flag + // as this is implied by the FMINNUM/FMAXNUM semantics. + SDNodeFlags Flags = Node->getFlags(); + Flags.setNoSignedZeros(true); + SelCC->setFlags(Flags); + return SelCC; + } + + return SDValue(); +} + SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); @@ -7058,29 +7486,234 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, } } - // If none of the above worked, but there are no NaNs, then expand to - // a compare/select sequence. This is required for correctness since - // InstCombine might have canonicalized a fcmp+select sequence to a - // FMINNUM/FMAXNUM node. If we were to fall through to the default - // expansion to libcall, we might introduce a link-time dependency - // on libm into a file that originally did not have one. - if (Node->getFlags().hasNoNaNs()) { - ISD::CondCode Pred = - Node->getOpcode() == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT; - SDValue Op1 = Node->getOperand(0); - SDValue Op2 = Node->getOperand(1); - SDValue SelCC = DAG.getSelectCC(dl, Op1, Op2, Op1, Op2, Pred); - // Copy FMF flags, but always set the no-signed-zeros flag - // as this is implied by the FMINNUM/FMAXNUM semantics. - SDNodeFlags Flags = Node->getFlags(); - Flags.setNoSignedZeros(true); - SelCC->setFlags(Flags); + if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG)) return SelCC; - } return SDValue(); } +SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, + unsigned Test, SDNodeFlags Flags, + const SDLoc &DL, + SelectionDAG &DAG) const { + EVT OperandVT = Op.getValueType(); + assert(OperandVT.isFloatingPoint()); + + // Degenerated cases. 
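Stepping back to createSelectForFMINNUM_FMAXNUM above: with no NaNs the node reduces to a compare plus select, which can be checked against libm (signed zeros set aside, as the no-signed-zeros flag implies):

#include <cassert>
#include <cmath>

int main() {
  const float Cases[] = {-3.5f, -1.0f, 0.0f, 2.0f, 8.25f};
  for (float A : Cases)
    for (float B : Cases) {
      assert((A < B ? A : B) == std::fmin(A, B)); // FMINNUM with nnan
      assert((A > B ? A : B) == std::fmax(A, B)); // FMAXNUM with nnan
    }
  return 0;
}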
+ if (Test == 0) + return DAG.getBoolConstant(false, DL, ResultVT, OperandVT); + if ((Test & fcAllFlags) == fcAllFlags) + return DAG.getBoolConstant(true, DL, ResultVT, OperandVT); + + // PPC double double is a pair of doubles, of which the higher part determines + // the value class. + if (OperandVT == MVT::ppcf128) { + Op = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::f64, Op, + DAG.getConstant(1, DL, MVT::i32)); + OperandVT = MVT::f64; + } + + // Some checks may be represented as inversion of simpler check, for example + // "inf|normal|subnormal|zero" => !"nan". + bool IsInverted = false; + if (unsigned InvertedCheck = getInvertedFPClassTest(Test)) { + IsInverted = true; + Test = InvertedCheck; + } + + // Floating-point type properties. + EVT ScalarFloatVT = OperandVT.getScalarType(); + const Type *FloatTy = ScalarFloatVT.getTypeForEVT(*DAG.getContext()); + const llvm::fltSemantics &Semantics = FloatTy->getFltSemantics(); + bool IsF80 = (ScalarFloatVT == MVT::f80); + + // Some checks can be implemented using float comparisons, if floating point + // exceptions are ignored. + if (Flags.hasNoFPExcept() && + isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) { + if (Test == fcZero) + return DAG.getSetCC(DL, ResultVT, Op, + DAG.getConstantFP(0.0, DL, OperandVT), + IsInverted ? ISD::SETUNE : ISD::SETOEQ); + if (Test == fcNan) + return DAG.getSetCC(DL, ResultVT, Op, Op, + IsInverted ? ISD::SETO : ISD::SETUO); + } + + // In the general case use integer operations. + unsigned BitSize = OperandVT.getScalarSizeInBits(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize); + if (OperandVT.isVector()) + IntVT = EVT::getVectorVT(*DAG.getContext(), IntVT, + OperandVT.getVectorElementCount()); + SDValue OpAsInt = DAG.getBitcast(IntVT, Op); + + // Various masks. + APInt SignBit = APInt::getSignMask(BitSize); + APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. + APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. + const unsigned ExplicitIntBitInF80 = 63; + APInt ExpMask = Inf; + if (IsF80) + ExpMask.clearBit(ExplicitIntBitInF80); + APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; + APInt QNaNBitMask = + APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); + APInt InvertionMask = APInt::getAllOnesValue(ResultVT.getScalarSizeInBits()); + + SDValue ValueMaskV = DAG.getConstant(ValueMask, DL, IntVT); + SDValue SignBitV = DAG.getConstant(SignBit, DL, IntVT); + SDValue ExpMaskV = DAG.getConstant(ExpMask, DL, IntVT); + SDValue ZeroV = DAG.getConstant(0, DL, IntVT); + SDValue InfV = DAG.getConstant(Inf, DL, IntVT); + SDValue ResultInvertionMask = DAG.getConstant(InvertionMask, DL, ResultVT); + + SDValue Res; + const auto appendResult = [&](SDValue PartialRes) { + if (PartialRes) { + if (Res) + Res = DAG.getNode(ISD::OR, DL, ResultVT, Res, PartialRes); + else + Res = PartialRes; + } + }; + + SDValue IntBitIsSetV; // Explicit integer bit in f80 mantissa is set. + const auto getIntBitIsSet = [&]() -> SDValue { + if (!IntBitIsSetV) { + APInt IntBitMask(BitSize, 0); + IntBitMask.setBit(ExplicitIntBitInF80); + SDValue IntBitMaskV = DAG.getConstant(IntBitMask, DL, IntVT); + SDValue IntBitV = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, IntBitMaskV); + IntBitIsSetV = DAG.getSetCC(DL, ResultVT, IntBitV, ZeroV, ISD::SETNE); + } + return IntBitIsSetV; + }; + + // Split the value into sign bit and absolute value. 
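The integer recipes used below can be verified in plain C++ on f32 bit patterns (a sketch, assuming IEEE-754 single precision):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t ValueMask = 0x7fffffffu; // all bits but sign
  const uint32_t Inf = 0x7f800000u;       // exponent all ones, mantissa zero
  const float Cases[] = {0.0f,     -0.0f,     1.5f, -2.0f, 1e-40f,
                         INFINITY, -INFINITY, NAN};
  for (float F : Cases) {
    uint32_t U;
    std::memcpy(&U, &F, sizeof(U));
    uint32_t Abs = U & ValueMask;
    assert((Abs > Inf) == (std::isnan(F) != 0));  // isnan(V) <=> abs(V) > int(inf)
    assert((Abs == Inf) == (std::isinf(F) != 0)); // isinf(V) <=> abs(V) == int(inf)
    assert((Abs == 0) == (F == 0.0f));            // iszero, either sign
  }
  return 0;
}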
+ SDValue AbsV = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, ValueMaskV); + SDValue SignV = DAG.getSetCC(DL, ResultVT, OpAsInt, + DAG.getConstant(0.0, DL, IntVT), ISD::SETLT); + + // Tests that involve more than one class should be processed first. + SDValue PartialRes; + + if (IsF80) + ; // Detect finite numbers of f80 by checking individual classes because + // they have different settings of the explicit integer bit. + else if ((Test & fcFinite) == fcFinite) { + // finite(V) ==> abs(V) < exp_mask + PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ExpMaskV, ISD::SETLT); + Test &= ~fcFinite; + } else if ((Test & fcFinite) == fcPosFinite) { + // finite(V) && V > 0 ==> V < exp_mask + PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, ExpMaskV, ISD::SETULT); + Test &= ~fcPosFinite; + } else if ((Test & fcFinite) == fcNegFinite) { + // finite(V) && V < 0 ==> abs(V) < exp_mask && signbit == 1 + PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ExpMaskV, ISD::SETLT); + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); + Test &= ~fcNegFinite; + } + appendResult(PartialRes); + + // Check for individual classes. + + if (unsigned PartialCheck = Test & fcZero) { + if (PartialCheck == fcPosZero) + PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, ZeroV, ISD::SETEQ); + else if (PartialCheck == fcZero) + PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ZeroV, ISD::SETEQ); + else // ISD::fcNegZero + PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, SignBitV, ISD::SETEQ); + appendResult(PartialRes); + } + + if (unsigned PartialCheck = Test & fcInf) { + if (PartialCheck == fcPosInf) + PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, InfV, ISD::SETEQ); + else if (PartialCheck == fcInf) + PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETEQ); + else { // ISD::fcNegInf + APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); + SDValue NegInfV = DAG.getConstant(NegInf, DL, IntVT); + PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, NegInfV, ISD::SETEQ); + } + appendResult(PartialRes); + } + + if (unsigned PartialCheck = Test & fcNan) { + APInt InfWithQnanBit = Inf | QNaNBitMask; + SDValue InfWithQnanBitV = DAG.getConstant(InfWithQnanBit, DL, IntVT); + if (PartialCheck == fcNan) { + // isnan(V) ==> abs(V) > int(inf) + PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETGT); + if (IsF80) { + // Recognize unsupported values as NaNs for compatibility with glibc. + // In them (exp(V)==0) == int_bit. 
+ SDValue ExpBits = DAG.getNode(ISD::AND, DL, IntVT, AbsV, ExpMaskV); + SDValue ExpIsZero = + DAG.getSetCC(DL, ResultVT, ExpBits, ZeroV, ISD::SETEQ); + SDValue IsPseudo = + DAG.getSetCC(DL, ResultVT, getIntBitIsSet(), ExpIsZero, ISD::SETEQ); + PartialRes = DAG.getNode(ISD::OR, DL, ResultVT, PartialRes, IsPseudo); + } + } else if (PartialCheck == fcQNan) { + // isquiet(V) ==> abs(V) >= (unsigned(Inf) | quiet_bit) + PartialRes = + DAG.getSetCC(DL, ResultVT, AbsV, InfWithQnanBitV, ISD::SETGE); + } else { // ISD::fcSNan + // issignaling(V) ==> abs(V) > unsigned(Inf) && + // abs(V) < (unsigned(Inf) | quiet_bit) + SDValue IsNan = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETGT); + SDValue IsNotQnan = + DAG.getSetCC(DL, ResultVT, AbsV, InfWithQnanBitV, ISD::SETLT); + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, IsNan, IsNotQnan); + } + appendResult(PartialRes); + } + + if (unsigned PartialCheck = Test & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) < (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) < (all mantissa bits set) + SDValue V = (PartialCheck == fcPosSubnormal) ? OpAsInt : AbsV; + SDValue MantissaV = DAG.getConstant(AllOneMantissa, DL, IntVT); + SDValue VMinusOneV = + DAG.getNode(ISD::SUB, DL, IntVT, V, DAG.getConstant(1, DL, IntVT)); + PartialRes = DAG.getSetCC(DL, ResultVT, VMinusOneV, MantissaV, ISD::SETULT); + if (PartialCheck == fcNegSubnormal) + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); + appendResult(PartialRes); + } + + if (unsigned PartialCheck = Test & fcNormal) { + // isnormal(V) ==> (0 < exp < max_exp) ==> (unsigned(exp-1) < (max_exp-1)) + APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); + SDValue ExpLSBV = DAG.getConstant(ExpLSB, DL, IntVT); + SDValue ExpMinus1 = DAG.getNode(ISD::SUB, DL, IntVT, AbsV, ExpLSBV); + APInt ExpLimit = ExpMask - ExpLSB; + SDValue ExpLimitV = DAG.getConstant(ExpLimit, DL, IntVT); + PartialRes = DAG.getSetCC(DL, ResultVT, ExpMinus1, ExpLimitV, ISD::SETULT); + if (PartialCheck == fcNegNormal) + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); + else if (PartialCheck == fcPosNormal) { + SDValue PosSignV = + DAG.getNode(ISD::XOR, DL, ResultVT, SignV, ResultInvertionMask); + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, PosSignV); + } + if (IsF80) + PartialRes = + DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, getIntBitIsSet()); + appendResult(PartialRes); + } + + if (!Res) + return DAG.getConstant(IsInverted, DL, ResultVT); + if (IsInverted) + Res = DAG.getNode(ISD::XOR, DL, ResultVT, Res, ResultInvertionMask); + return Res; +} + // Only expand vector types if we have the appropriate vector bit operations. static bool canExpandVectorCTPOP(const TargetLowering &TLI, EVT VT) { assert(VT.isVector() && "Expected vector type"); @@ -7116,8 +7749,6 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const { DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT); SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT); - SDValue Mask01 = - DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); // v = v - ((v >> 1) & 0x55555555...) Op = DAG.getNode(ISD::SUB, dl, VT, Op, @@ -7137,13 +7768,28 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const { DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(4, dl, ShVT))), Mask0F); - // v = (v * 0x01010101...) 
>> (Len - 8) - if (Len > 8) - Op = - DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), - DAG.getConstant(Len - 8, dl, ShVT)); - return Op; + if (Len <= 8) + return Op; + + // Avoid the multiply if we only have 2 bytes to add. + // TODO: Only doing this for scalars because vectors weren't as obviously + // improved. + if (Len == 16 && !VT.isVector()) { + // v = (v + (v >> 8)) & 0x00FF; + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::ADD, dl, VT, Op, + DAG.getNode(ISD::SRL, dl, VT, Op, + DAG.getConstant(8, dl, ShVT))), + DAG.getConstant(0xFF, dl, VT)); + } + + // v = (v * 0x01010101...) >> (Len - 8) + SDValue Mask01 = + DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); + return DAG.getNode(ISD::SRL, dl, VT, + DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), + DAG.getConstant(Len - 8, dl, ShVT)); } SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const { @@ -7265,6 +7911,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, if (!IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::UMIN, VT)) { SDValue Zero = DAG.getConstant(0, dl, VT); + Op = DAG.getFreeze(Op); return DAG.getNode(ISD::UMIN, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); } @@ -7272,6 +7919,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, // 0 - abs(x) -> smin(x, sub(0,x)) if (IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::SMIN, VT)) { + Op = DAG.getFreeze(Op); SDValue Zero = DAG.getConstant(0, dl, VT); return DAG.getNode(ISD::SMIN, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); @@ -7285,16 +7933,17 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, !isOperationLegalOrCustomOrPromote(ISD::XOR, VT))) return SDValue(); + Op = DAG.getFreeze(Op); SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, Op, DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, ShVT)); - if (!IsNegative) { - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, Op, Shift); - return DAG.getNode(ISD::XOR, dl, VT, Add, Shift); - } + SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift); + + // abs(x) -> Y = sra (X, size(X)-1); sub (xor (X, Y), Y) + if (!IsNegative) + return DAG.getNode(ISD::SUB, dl, VT, Xor, Shift); // 0 - abs(x) -> Y = sra (X, size(X)-1); sub (Y, xor (X, Y)) - SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift); return DAG.getNode(ISD::SUB, dl, VT, Shift, Xor); } @@ -8041,23 +8690,6 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op, return SDValue(); } -// Convert redundant addressing modes (e.g. scaling is redundant -// when accessing bytes). -ISD::MemIndexType -TargetLowering::getCanonicalIndexType(ISD::MemIndexType IndexType, EVT MemVT, - SDValue Offsets) const { - bool IsScaledIndex = - (IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::UNSIGNED_SCALED); - bool IsSignedIndex = - (IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::SIGNED_UNSCALED); - - // Scaling is unimportant for bytes, canonicalize to unscaled. - if (IsScaledIndex && MemVT.getScalarType() == MVT::i8) - return IsSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED; - - return IndexType; -} - SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const { SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); @@ -8473,8 +9105,20 @@ void TargetLowering::expandUADDSUBO( EVT ResultType = Node->getValueType(1); EVT SetCCType = getSetCCResultType( DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0)); - ISD::CondCode CC = IsAdd ? 
ISD::SETULT : ISD::SETUGT; - SDValue SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC); + SDValue SetCC; + if (IsAdd && isOneConstant(RHS)) { + // Special case: uaddo X, 1 overflowed if X+1 is 0. This potentially reduces + // the live range of X. We assume comparing with 0 is cheap. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C. + SetCC = + DAG.getSetCC(dl, SetCCType, Result, + DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ); + } else { + ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT; + SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC); + } Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType); } @@ -8773,11 +9417,11 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node, // floating-point values. APInt MinInt, MaxInt; if (IsSigned) { - MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth); - MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth); + MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth); + MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth); } else { - MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth); - MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth); + MinInt = APInt::getMinValue(SatWidth).zext(DstWidth); + MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth); } // We cannot risk emitting FP_TO_XINT nodes with a source VT of f16, as @@ -8931,13 +9575,16 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, - SDValue &CC, bool &NeedInvert, + SDValue &CC, SDValue Mask, + SDValue EVL, bool &NeedInvert, const SDLoc &dl, SDValue &Chain, bool IsSignaling) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT OpVT = LHS.getSimpleValueType(); ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get(); NeedInvert = false; + assert(!EVL == !Mask && "VP Mask and EVL must either both be set or unset"); + bool IsNonVP = !EVL; switch (TLI.getCondCodeAction(CCCode, OpVT)) { default: llvm_unreachable("Unknown condition code action!"); @@ -9044,17 +9691,34 @@ bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, if (CCCode != ISD::SETO && CCCode != ISD::SETUO) { // If we aren't the ordered or unordered operation, // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
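For instance, an ordered not-equal can be realized as two simpler ordered compares joined by OR; a float sanity check of one such decomposition (not necessarily the exact predicate pair chosen here):

#include <cassert>
#include <cmath>

int main() {
  const float Cases[] = {-1.0f, 0.0f, 2.5f, INFINITY, NAN};
  for (float X : Cases)
    for (float Y : Cases) {
      // SETONE(X, Y) behaves like SETOLT(X, Y) OR SETOGT(X, Y):
      // both compares are false whenever either input is NaN.
      bool One = !std::isnan(X) && !std::isnan(Y) && X != Y;
      assert(One == ((X < Y) || (X > Y)));
    }
  return 0;
}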
- SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain, IsSignaling); - SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain, IsSignaling); + if (IsNonVP) { + SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain, IsSignaling); + SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain, IsSignaling); + } else { + SetCC1 = DAG.getSetCCVP(dl, VT, LHS, RHS, CC1, Mask, EVL); + SetCC2 = DAG.getSetCCVP(dl, VT, LHS, RHS, CC2, Mask, EVL); + } } else { // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS) - SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain, IsSignaling); - SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain, IsSignaling); + if (IsNonVP) { + SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain, IsSignaling); + SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain, IsSignaling); + } else { + SetCC1 = DAG.getSetCCVP(dl, VT, LHS, LHS, CC1, Mask, EVL); + SetCC2 = DAG.getSetCCVP(dl, VT, RHS, RHS, CC2, Mask, EVL); + } } if (Chain) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SetCC1.getValue(1), SetCC2.getValue(1)); - LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2); + if (IsNonVP) + LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2); + else { + // Transform the binary opcode to the VP equivalent. + assert((Opc == ISD::OR || Opc == ISD::AND) && "Unexpected opcode"); + Opc = Opc == ISD::OR ? ISD::VP_OR : ISD::VP_AND; + LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2, Mask, EVL); + } RHS = SDValue(); CC = SDValue(); return true; diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 43a54ce33bf0..5f9ade18f15c 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" #include -#include #include #include #include @@ -362,7 +361,7 @@ bool ShadowStackGCLowering::runOnFunction(Function &F) { // For each instruction that escapes... EscapeEnumerator EE(F, "gc_cleanup", /*HandleExceptions=*/true, - DTU.hasValue() ? DTU.getPointer() : nullptr); + DTU ? DTU.getPointer() : nullptr); while (IRBuilder<> *AtExit = EE.Next()) { // Pop the entry from the shadow stack. Don't reuse CurrentHead from // AtEntry, since that would make the value live for the entire function. diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 8211e3d6a9dd..1fcee02184a9 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -413,7 +413,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { Val = Builder.CreateCall(StackAddrFn, {}, "sp"); Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true); - // Call the setup_dispatch instrinsic. It fills in the rest of the jmpbuf. + // Call the setup_dispatch intrinsic. It fills in the rest of the jmpbuf. Builder.CreateCall(BuiltinSetupDispatchFn, {}); // Store a pointer to the function context so that the back-end will know diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 7f9518e4c075..140a91ae342b 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -389,17 +389,34 @@ LLVM_DUMP_METHOD void SplitEditor::dump() const { } #endif -LiveInterval::SubRange &SplitEditor::getSubRangeForMaskExact(LaneBitmask LM, - LiveInterval &LI) { - for (LiveInterval::SubRange &S : LI.subranges()) +/// Find a subrange corresponding to the exact lane mask @p LM in the live +/// interval @p LI. The interval @p LI is assumed to contain such a subrange. 
+/// This function is used to find corresponding subranges between the +/// original interval and the new intervals. +template auto &getSubrangeImpl(LaneBitmask LM, T &LI) { + for (auto &S : LI.subranges()) if (S.LaneMask == LM) return S; llvm_unreachable("SubRange for this mask not found"); } -LiveInterval::SubRange &SplitEditor::getSubRangeForMask(LaneBitmask LM, - LiveInterval &LI) { - for (LiveInterval::SubRange &S : LI.subranges()) +LiveInterval::SubRange &getSubRangeForMaskExact(LaneBitmask LM, + LiveInterval &LI) { + return getSubrangeImpl(LM, LI); +} + +const LiveInterval::SubRange &getSubRangeForMaskExact(LaneBitmask LM, + const LiveInterval &LI) { + return getSubrangeImpl(LM, LI); +} + +/// Find a subrange corresponding to the lane mask @p LM, or a superset of it, +/// in the live interval @p LI. The interval @p LI is assumed to contain such +/// a subrange. This function is used to find corresponding subranges between +/// the original interval and the new intervals. +const LiveInterval::SubRange &getSubRangeForMask(LaneBitmask LM, + const LiveInterval &LI) { + for (const LiveInterval::SubRange &S : LI.subranges()) if ((S.LaneMask & LM) == LM) return S; llvm_unreachable("SubRange for this mask not found"); @@ -566,10 +583,8 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, return Def; } -VNInfo *SplitEditor::defFromParent(unsigned RegIdx, - VNInfo *ParentVNI, - SlotIndex UseIdx, - MachineBasicBlock &MBB, +VNInfo *SplitEditor::defFromParent(unsigned RegIdx, const VNInfo *ParentVNI, + SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { SlotIndex Def; LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -937,7 +952,7 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB, void SplitEditor::computeRedundantBackCopies( DenseSet &NotToHoistSet, SmallVectorImpl &BackCopies) { LiveInterval *LI = &LIS.getInterval(Edit->get(0)); - LiveInterval *Parent = &Edit->getParent(); + const LiveInterval *Parent = &Edit->getParent(); SmallVector, 8> EqualVNs(Parent->getNumValNums()); SmallPtrSet DominatedVNIs; @@ -952,7 +967,7 @@ void SplitEditor::computeRedundantBackCopies( // For VNI aggregation of each ParentVNI, collect dominated, i.e., // redundant VNIs to BackCopies. for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) { - VNInfo *ParentVNI = Parent->getValNumInfo(i); + const VNInfo *ParentVNI = Parent->getValNumInfo(i); if (!NotToHoistSet.count(ParentVNI->id)) continue; SmallPtrSetIterator It1 = EqualVNs[ParentVNI->id].begin(); @@ -990,7 +1005,7 @@ void SplitEditor::computeRedundantBackCopies( void SplitEditor::hoistCopies() { // Get the complement interval, always RegIdx 0. LiveInterval *LI = &LIS.getInterval(Edit->get(0)); - LiveInterval *Parent = &Edit->getParent(); + const LiveInterval *Parent = &Edit->getParent(); // Track the nearest common dominator for all back-copies for each ParentVNI, // indexed by ParentVNI->id. @@ -1067,7 +1082,7 @@ void SplitEditor::hoistCopies() { if (!Dom.first || Dom.second.isValid()) continue; // This value needs a hoisted copy inserted at the end of Dom.first. - VNInfo *ParentVNI = Parent->getValNumInfo(i); + const VNInfo *ParentVNI = Parent->getValNumInfo(i); MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def); // Get a less loopy dominator than Dom.first. 
Dom.first = findShallowDominator(Dom.first, DefMBB); @@ -1237,11 +1252,11 @@ void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveIntervalCalc &LIC, SlotIndex LastUse = End.getPrevSlot(); // The predecessor may not have a live-out value. That is OK, like an // undef PHI operand. - LiveInterval &PLI = Edit->getParent(); + const LiveInterval &PLI = Edit->getParent(); // Need the cast because the inputs to ?: would otherwise be deemed // "incompatible": SubRange vs LiveInterval. - LiveRange &PSR = !LM.all() ? getSubRangeForMaskExact(LM, PLI) - : static_cast(PLI); + const LiveRange &PSR = !LM.all() ? getSubRangeForMaskExact(LM, PLI) + : static_cast(PLI); if (PSR.liveAt(LastUse)) LIC.extend(LR, End, /*PhysReg=*/0, Undefs); } @@ -1254,7 +1269,7 @@ void SplitEditor::extendPHIKillRanges() { // remove it. Otherwise, extend the live interval to reach the end indexes // of all predecessor blocks. - LiveInterval &ParentLI = Edit->getParent(); + const LiveInterval &ParentLI = Edit->getParent(); for (const VNInfo *V : ParentLI.valnos) { if (V->isUnused() || !V->isPHIDef()) continue; @@ -1270,7 +1285,7 @@ void SplitEditor::extendPHIKillRanges() { SmallVector Undefs; LiveIntervalCalc SubLIC; - for (LiveInterval::SubRange &PS : ParentLI.subranges()) { + for (const LiveInterval::SubRange &PS : ParentLI.subranges()) { for (const VNInfo *V : PS.valnos) { if (V->isUnused() || !V->isPHIDef()) continue; @@ -1337,13 +1352,34 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { continue; // We may want to extend a live range for a partial redef, or for a use // tied to an early clobber. - Idx = Idx.getPrevSlot(); - if (!Edit->getParent().liveAt(Idx)) + if (!Edit->getParent().liveAt(Idx.getPrevSlot())) continue; - } else - Idx = Idx.getRegSlot(true); + } else { + assert(MO.isUse()); + bool IsEarlyClobber = false; + if (MO.isTied()) { + // We want to extend a live range into `e` slot rather than `r` slot if + // tied-def is early clobber, because the `e` slot already contained + // in the live range of early-clobber tied-def operand, give an example + // here: + // 0 %0 = ... + // 16 early-clobber %0 = Op %0 (tied-def 0), ... + // 32 ... = Op %0 + // Before extend: + // %0 = [0r, 0d) [16e, 32d) + // The point we want to extend is 0d to 16e not 16r in this case, but if + // we use 16r here we will extend nothing because that already contained + // in [16e, 32d). + unsigned OpIdx = MI->getOperandNo(&MO); + unsigned DefOpIdx = MI->findTiedOperandIdx(OpIdx); + const MachineOperand &DefOp = MI->getOperand(DefOpIdx); + IsEarlyClobber = DefOp.isEarlyClobber(); + } + + Idx = Idx.getRegSlot(IsEarlyClobber); + } - SlotIndex Next = Idx.getNextSlot(); + SlotIndex Next = Idx; if (LI.hasSubRanges()) { // We have to delay extending subranges until we have seen all operands // defining the register. This is because a operand @@ -1510,9 +1546,8 @@ void SplitEditor::finish(SmallVectorImpl *LRMap) { // Provide a reverse mapping from original indices to Edit ranges. if (LRMap) { - LRMap->clear(); - for (unsigned i = 0, e = Edit->size(); i != e; ++i) - LRMap->push_back(i); + auto Seq = llvm::seq(0, Edit->size()); + LRMap->assign(Seq.begin(), Seq.end()); } // Now check if any registers were separated into multiple components. 
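The reverse-mapping change in finish() above swaps a manual push_back loop for a bulk assign from an increasing sequence; the same shape in standard C++, with Size standing in for Edit->size():

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const unsigned Size = 8; // stands in for Edit->size()
  std::vector<unsigned> LRMap(Size);
  std::iota(LRMap.begin(), LRMap.end(), 0u); // same effect as assigning llvm::seq(0, Size)
  for (unsigned I = 0; I != Size; ++I)
    assert(LRMap[I] == I); // identity mapping: original index -> Edit range
  return 0;
}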
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index 902546fe16d8..4400a797d38e 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -22,19 +22,19 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalCalc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Compiler.h" #include namespace llvm { class AAResults; +class LiveInterval; +class LiveRange; class LiveIntervals; class LiveRangeEdit; class MachineBlockFrequencyInfo; @@ -346,19 +346,6 @@ private: return LICalc[SpillMode != SM_Partition && RegIdx != 0]; } - /// Find a subrange corresponding to the exact lane mask @p LM in the live - /// interval @p LI. The interval @p LI is assumed to contain such a subrange. - /// This function is used to find corresponding subranges between the - /// original interval and the new intervals. - LiveInterval::SubRange &getSubRangeForMaskExact(LaneBitmask LM, - LiveInterval &LI); - - /// Find a subrange corresponding to the lane mask @p LM, or a superset of it, - /// in the live interval @p LI. The interval @p LI is assumed to contain such - /// a subrange. This function is used to find corresponding subranges between - /// the original interval and the new intervals. - LiveInterval::SubRange &getSubRangeForMask(LaneBitmask LM, LiveInterval &LI); - /// Add a segment to the interval LI for the value number VNI. If LI has /// subranges, corresponding segments will be added to them as well, but /// with newly created value numbers. If Original is true, dead def will @@ -390,10 +377,8 @@ private: /// defFromParent - Define Reg from ParentVNI at UseIdx using either /// rematerialization or a COPY from parent. Return the new value. 
- VNInfo *defFromParent(unsigned RegIdx, - VNInfo *ParentVNI, - SlotIndex UseIdx, - MachineBasicBlock &MBB, + VNInfo *defFromParent(unsigned RegIdx, const VNInfo *ParentVNI, + SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I); /// removeBackCopies - Remove the copy instructions that defines the values diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index 623d5da9831e..11c6bdc69956 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -36,14 +36,12 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Use.h" @@ -1145,6 +1143,9 @@ void StackColoring::remapInstructions(DenseMap &SlotRemap) { LLVM_DEBUG(dbgs() << "Fixed " << FixedMemOp << " machine memory operands.\n"); LLVM_DEBUG(dbgs() << "Fixed " << FixedDbg << " debug locations.\n"); LLVM_DEBUG(dbgs() << "Fixed " << FixedInstr << " machine instructions.\n"); + (void) FixedMemOp; + (void) FixedDbg; + (void) FixedInstr; } void StackColoring::removeInvalidSlotRanges() { @@ -1319,6 +1320,11 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) { int FirstSlot = SortedSlots[I]; int SecondSlot = SortedSlots[J]; + + // Objects with different stack IDs cannot be merged. + if (MFI->getStackID(FirstSlot) != MFI->getStackID(SecondSlot)) + continue; + LiveInterval *First = &*Intervals[FirstSlot]; LiveInterval *Second = &*Intervals[SecondSlot]; auto &FirstS = LiveStarts[FirstSlot]; diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp index 3640296adbca..b83c56903133 100644 --- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp +++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp @@ -17,9 +17,9 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index 36e8f129ea15..6757d6ca4f88 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -721,7 +721,7 @@ void StackMaps::serializeToStackMapSection() { // Create the section. MCSection *StackMapSection = OutContext.getObjectFileInfo()->getStackMapSection(); - OS.SwitchSection(StackMapSection); + OS.switchSection(StackMapSection); // Emit a dummy symbol to force section inclusion. OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps"))); @@ -732,7 +732,7 @@ void StackMaps::serializeToStackMapSection() { emitFunctionFrameRecords(OS); emitConstantPoolEntries(OS); emitCallsiteEntries(OS); - OS.AddBlankLine(); + OS.addBlankLine(); // Clean up. 
CSInfos.clear(); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 6765fd274686..510a8e3e4ba2 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -28,8 +28,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -169,7 +167,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // If this instruction accesses memory make sure it doesn't access beyond // the bounds of the allocated object. Optional MemLoc = MemoryLocation::getOrNone(I); - if (MemLoc.hasValue() && MemLoc->Size.hasValue() && + if (MemLoc && MemLoc->Size.hasValue() && !TypeSize::isKnownGE(AllocSize, TypeSize::getFixed(MemLoc->Size.getValue()))) return true; diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index 17e6f51d0899..b8c750688914 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp index 20892a79d35f..bf3d2088e196 100644 --- a/llvm/lib/CodeGen/TailDuplication.cpp +++ b/llvm/lib/CodeGen/TailDuplication.cpp @@ -14,14 +14,14 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 68a7b80d6146..ba533a491b9c 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -19,17 +19,15 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -370,8 +368,8 @@ void TailDuplicator::processPHI( return; // Remove PredBB from the PHI node. 
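The StackProtector hunk above replaces MemLoc.hasValue() with Optional's contextual bool conversion; the inner Size.hasValue() is untouched because LocationSize is its own class, not an Optional. The pattern in isolation (accessesKnownBytes is an illustrative wrapper, not from the patch):

  #include "llvm/Analysis/MemoryLocation.h"
  using namespace llvm;

  bool accessesKnownBytes(const Instruction *I) {
    Optional<MemoryLocation> MemLoc = MemoryLocation::getOrNone(I);
    // The Optional itself converts to bool; LocationSize keeps hasValue().
    return MemLoc && MemLoc->Size.hasValue();
  }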
- MI->RemoveOperand(SrcOpIdx + 1); - MI->RemoveOperand(SrcOpIdx); + MI->removeOperand(SrcOpIdx + 1); + MI->removeOperand(SrcOpIdx); if (MI->getNumOperands() == 1) MI->eraseFromParent(); } @@ -385,8 +383,9 @@ void TailDuplicator::duplicateInstruction( // Allow duplication of CFI instructions. if (MI->isCFIInstruction()) { BuildMI(*PredBB, PredBB->end(), PredBB->findDebugLoc(PredBB->begin()), - TII->get(TargetOpcode::CFI_INSTRUCTION)).addCFIIndex( - MI->getOperand(0).getCFIIndex()); + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(MI->getOperand(0).getCFIIndex()) + .setMIFlags(MI->getFlags()); return; } MachineInstr &NewMI = TII->duplicate(*PredBB, PredBB->end(), *MI); @@ -496,15 +495,15 @@ void TailDuplicator::updateSuccessorsPHIs( for (unsigned i = MI.getNumOperands() - 2; i != Idx; i -= 2) { MachineOperand &MO = MI.getOperand(i + 1); if (MO.getMBB() == FromBB) { - MI.RemoveOperand(i + 1); - MI.RemoveOperand(i); + MI.removeOperand(i + 1); + MI.removeOperand(i); } } } else Idx = 0; // If Idx is set, the operands at Idx and Idx+1 must be removed. - // We reuse the location to avoid expensive RemoveOperand calls. + // We reuse the location to avoid expensive removeOperand calls. DenseMap::iterator LI = SSAUpdateVals.find(Reg); @@ -541,8 +540,8 @@ void TailDuplicator::updateSuccessorsPHIs( } } if (Idx != 0) { - MI.RemoveOperand(Idx + 1); - MI.RemoveOperand(Idx); + MI.removeOperand(Idx + 1); + MI.removeOperand(Idx); } } } diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index fbf190a52585..9430e86fe44d 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -10,17 +10,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Compiler.h" #include "llvm/Target/TargetMachine.h" @@ -37,6 +37,11 @@ bool TargetFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const return false; } +bool TargetFrameLowering::enableCFIFixup(MachineFunction &MF) const { + return MF.needsFrameMoves() && + !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); +} + /// Returns the displacement from the frame register to the stack /// frame of the specified index, along with the frame register used /// (in output arg FrameReg). 
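The new TargetFrameLowering::enableCFIFixup default above enables the fixup only when the function needs frame moves and the target does not use Windows CFI. Targets can refine that policy by overriding the hook; MyTargetFrameLowering and its naked-function exclusion are hypothetical:

  bool MyTargetFrameLowering::enableCFIFixup(MachineFunction &MF) const {
    // Hypothetical refinement: nothing to fix up without a prologue.
    if (MF.getFunction().hasFnAttribute(Attribute::Naked))
      return false;
    return TargetFrameLowering::enableCFIFixup(MF); // keep the base heuristic
  }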
This is the default implementation which diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3f22cc4289f2..2a987ee3eedf 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -31,8 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include using namespace llvm; @@ -40,8 +39,7 @@ static cl::opt DisableHazardRecognizer( "disable-sched-hazard", cl::Hidden, cl::init(false), cl::desc("Disable hazard detection during preRA scheduling")); -TargetInstrInfo::~TargetInstrInfo() { -} +TargetInstrInfo::~TargetInstrInfo() = default; const TargetRegisterClass* TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, @@ -873,11 +871,13 @@ void TargetInstrInfo::reassociateOps( MachineInstrBuilder MIB1 = BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); + .addReg(RegY, getKillRegState(KillY)) + .setMIFlags(Prev.getFlags()); MachineInstrBuilder MIB2 = BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); + .addReg(NewVR, getKillRegState(true)) + .setMIFlags(Root.getFlags()); setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2); @@ -1399,7 +1399,7 @@ std::string TargetInstrInfo::createMIROperandComment( return OS.str(); } -TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {} +TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() = default; void TargetInstrInfo::mergeOutliningCandidateAttributes( Function &F, std::vector &Candidates) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index ab574232e367..6a595a4c748b 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -56,7 +56,6 @@ #include "llvm/Transforms/Utils/SizeOpts.h" #include #include -#include #include #include #include @@ -202,7 +201,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl"); } - if (TT.isPS4CPU()) { + if (TT.isPS()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); } @@ -275,6 +274,11 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { return FPROUND_F128_F16; if (OpVT == MVT::ppcf128) return FPROUND_PPCF128_F16; + } else if (RetVT == MVT::bf16) { + if (OpVT == MVT::f32) + return FPROUND_F32_BF16; + if (OpVT == MVT::f64) + return FPROUND_F64_BF16; } else if (RetVT == MVT::f32) { if (OpVT == MVT::f64) return FPROUND_F64_F32; @@ -740,6 +744,30 @@ void TargetLoweringBase::initActions() { std::fill(std::begin(TargetDAGCombineArray), std::end(TargetDAGCombineArray), 0); + // We're somewhat special casing MVT::i2 and MVT::i4. Ideally we want to + // remove this and targets should individually set these types if not legal. 
+ for (ISD::NodeType NT : enum_seq(ISD::DELETED_NODE, ISD::BUILTIN_OP_END, + force_iteration_on_noniterable_enum)) { + for (MVT VT : {MVT::i2, MVT::i4}) + OpActions[(unsigned)VT.SimpleTy][NT] = Expand; + } + for (MVT AVT : MVT::all_valuetypes()) { + for (MVT VT : {MVT::i2, MVT::i4, MVT::v128i2, MVT::v64i4}) { + setTruncStoreAction(AVT, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, AVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, AVT, VT, Expand); + } + } + for (unsigned IM = (unsigned)ISD::PRE_INC; + IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { + for (MVT VT : {MVT::i2, MVT::i4}) { + setIndexedLoadAction(IM, VT, Expand); + setIndexedStoreAction(IM, VT, Expand); + setIndexedMaskedLoadAction(IM, VT, Expand); + setIndexedMaskedStoreAction(IM, VT, Expand); + } + } + for (MVT VT : MVT::fp_valuetypes()) { MVT IntVT = MVT::getIntegerVT(VT.getFixedSizeInBits()); if (IntVT.isValid()) { @@ -763,85 +791,63 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand); // These operations default to expand. - setOperationAction(ISD::FGETSIGN, VT, Expand); - setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FMINNUM_IEEE, VT, Expand); - setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand); - setOperationAction(ISD::FMINIMUM, VT, Expand); - setOperationAction(ISD::FMAXIMUM, VT, Expand); - setOperationAction(ISD::FMAD, VT, Expand); - setOperationAction(ISD::SMIN, VT, Expand); - setOperationAction(ISD::SMAX, VT, Expand); - setOperationAction(ISD::UMIN, VT, Expand); - setOperationAction(ISD::UMAX, VT, Expand); - setOperationAction(ISD::ABS, VT, Expand); - setOperationAction(ISD::FSHL, VT, Expand); - setOperationAction(ISD::FSHR, VT, Expand); - setOperationAction(ISD::SADDSAT, VT, Expand); - setOperationAction(ISD::UADDSAT, VT, Expand); - setOperationAction(ISD::SSUBSAT, VT, Expand); - setOperationAction(ISD::USUBSAT, VT, Expand); - setOperationAction(ISD::SSHLSAT, VT, Expand); - setOperationAction(ISD::USHLSAT, VT, Expand); - setOperationAction(ISD::SMULFIX, VT, Expand); - setOperationAction(ISD::SMULFIXSAT, VT, Expand); - setOperationAction(ISD::UMULFIX, VT, Expand); - setOperationAction(ISD::UMULFIXSAT, VT, Expand); - setOperationAction(ISD::SDIVFIX, VT, Expand); - setOperationAction(ISD::SDIVFIXSAT, VT, Expand); - setOperationAction(ISD::UDIVFIX, VT, Expand); - setOperationAction(ISD::UDIVFIXSAT, VT, Expand); - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Expand); + setOperationAction({ISD::FGETSIGN, ISD::CONCAT_VECTORS, + ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, + ISD::FMINIMUM, ISD::FMAXIMUM, + ISD::FMAD, ISD::SMIN, + ISD::SMAX, ISD::UMIN, + ISD::UMAX, ISD::ABS, + ISD::FSHL, ISD::FSHR, + ISD::SADDSAT, ISD::UADDSAT, + ISD::SSUBSAT, ISD::USUBSAT, + ISD::SSHLSAT, ISD::USHLSAT, + ISD::SMULFIX, ISD::SMULFIXSAT, + ISD::UMULFIX, ISD::UMULFIXSAT, + ISD::SDIVFIX, ISD::SDIVFIXSAT, + ISD::UDIVFIX, ISD::UDIVFIXSAT, + ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, + ISD::IS_FPCLASS}, + VT, Expand); // Overflow operations default to expand - setOperationAction(ISD::SADDO, VT, Expand); - setOperationAction(ISD::SSUBO, VT, Expand); - setOperationAction(ISD::UADDO, VT, Expand); - setOperationAction(ISD::USUBO, VT, Expand); - setOperationAction(ISD::SMULO, VT, Expand); - setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction({ISD::SADDO, ISD::SSUBO, ISD::UADDO, 
ISD::USUBO, + ISD::SMULO, ISD::UMULO}, + VT, Expand); // ADDCARRY operations default to expand - setOperationAction(ISD::ADDCARRY, VT, Expand); - setOperationAction(ISD::SUBCARRY, VT, Expand); - setOperationAction(ISD::SETCCCARRY, VT, Expand); - setOperationAction(ISD::SADDO_CARRY, VT, Expand); - setOperationAction(ISD::SSUBO_CARRY, VT, Expand); + setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY, ISD::SETCCCARRY, + ISD::SADDO_CARRY, ISD::SSUBO_CARRY}, + VT, Expand); // ADDC/ADDE/SUBC/SUBE default to expand. - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction({ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}, VT, + Expand); + + // Halving adds + setOperationAction( + {ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::AVGCEILS, ISD::AVGCEILU}, VT, + Expand); // Absolute difference - setOperationAction(ISD::ABDS, VT, Expand); - setOperationAction(ISD::ABDU, VT, Expand); + setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand); // These default to Expand so they will be expanded to CTLZ/CTTZ by default. - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + Expand); - setOperationAction(ISD::BITREVERSE, VT, Expand); - setOperationAction(ISD::PARITY, VT, Expand); + setOperationAction({ISD::BITREVERSE, ISD::PARITY}, VT, Expand); // These library functions default to expand. - setOperationAction(ISD::FROUND, VT, Expand); - setOperationAction(ISD::FROUNDEVEN, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction({ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI}, VT, Expand); // These operations default to expand for vector types. - if (VT.isVector()) { - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); - setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); - setOperationAction(ISD::SPLAT_VECTOR, VT, Expand); - } + if (VT.isVector()) + setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND_VECTOR_INREG, + ISD::SIGN_EXTEND_VECTOR_INREG, + ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR}, + VT, Expand); // Constrained floating-point operations default to expand. #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ @@ -852,21 +858,13 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); // Vector reduction default to expand. 
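The rewrite above leans on setOperationAction overloads that take an ArrayRef of opcodes and/or an ArrayRef of MVTs, so a long run of per-opcode calls collapses into one (the vector-reduction batch continues right after this sketch). How a backend would use the same overloads; MyTargetLowering is a hypothetical target:

  MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
      : TargetLowering(TM) {
    // One call covers several opcodes for a single type...
    setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
                       MVT::i32, Legal);
    // ...and one call can also cover several value types.
    setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
  }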
- setOperationAction(ISD::VECREDUCE_FADD, VT, Expand); - setOperationAction(ISD::VECREDUCE_FMUL, VT, Expand); - setOperationAction(ISD::VECREDUCE_ADD, VT, Expand); - setOperationAction(ISD::VECREDUCE_MUL, VT, Expand); - setOperationAction(ISD::VECREDUCE_AND, VT, Expand); - setOperationAction(ISD::VECREDUCE_OR, VT, Expand); - setOperationAction(ISD::VECREDUCE_XOR, VT, Expand); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Expand); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Expand); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Expand); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Expand); - setOperationAction(ISD::VECREDUCE_FMAX, VT, Expand); - setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand); - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Expand); - setOperationAction(ISD::VECREDUCE_SEQ_FMUL, VT, Expand); + setOperationAction( + {ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL, ISD::VECREDUCE_ADD, + ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, + ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, + ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_FMAX, + ISD::VECREDUCE_FMIN, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL}, + VT, Expand); // Named vector shuffles default to expand. setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); @@ -881,30 +879,16 @@ void TargetLoweringBase::initActions() { // ConstantFP nodes default to expand. Targets can either change this to // Legal, in which case all fp constants are legal, or use isFPImmLegal() // to optimize expansions for certain constants. - setOperationAction(ISD::ConstantFP, MVT::f16, Expand); - setOperationAction(ISD::ConstantFP, MVT::f32, Expand); - setOperationAction(ISD::ConstantFP, MVT::f64, Expand); - setOperationAction(ISD::ConstantFP, MVT::f80, Expand); - setOperationAction(ISD::ConstantFP, MVT::f128, Expand); + setOperationAction(ISD::ConstantFP, + {MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128}, + Expand); // These library functions default to expand. - for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) { - setOperationAction(ISD::FCBRT, VT, Expand); - setOperationAction(ISD::FLOG , VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); - setOperationAction(ISD::FEXP , VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::LROUND, VT, Expand); - setOperationAction(ISD::LLROUND, VT, Expand); - setOperationAction(ISD::LRINT, VT, Expand); - setOperationAction(ISD::LLRINT, VT, Expand); - } + setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, + ISD::FEXP2, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, + ISD::FRINT, ISD::FTRUNC, ISD::LROUND, ISD::LLROUND, + ISD::LRINT, ISD::LLRINT}, + {MVT::f32, MVT::f64, MVT::f128}, Expand); // Default ISD::TRAP to expand (which turns it into abort). setOperationAction(ISD::TRAP, MVT::Other, Expand); @@ -1394,6 +1378,16 @@ void TargetLoweringBase::computeRegisterProperties( } } + // Decide how to handle bf16. If the target does not have native bf16 support, + // promote it to f32, because there are no bf16 library calls (except for + // converting from f32 to bf16). 
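The comment above motivates the fallback that the next few lines implement: with no native bf16 support, bf16 values are carried in f32 registers. The representation makes this cheap, since bf16 is exactly the top half of an IEEE f32. A standalone illustration (not code from the patch):

  #include <cstdint>
  #include <cstring>

  // Widen a raw bf16 bit pattern to float: shift it into the high
  // 16 bits of the f32 encoding and bit-cast.
  float bf16ToFloat(uint16_t Bits) {
    uint32_t Wide = uint32_t(Bits) << 16;
    float F;
    std::memcpy(&F, &Wide, sizeof(F)); // bit copy, no value conversion
    return F;
  }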
+ if (!isTypeLegal(MVT::bf16)) { + NumRegistersForVT[MVT::bf16] = NumRegistersForVT[MVT::f32]; + RegisterTypeForVT[MVT::bf16] = RegisterTypeForVT[MVT::f32]; + TransformToType[MVT::bf16] = MVT::f32; + ValueTypeActions.setTypeAction(MVT::bf16, TypePromoteFloat); + } + // Loop over all of the vector value types to see which need transformations. for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1647,6 +1641,11 @@ bool TargetLoweringBase::isSuitableForJumpTable(const SwitchInst *SI, (NumCases * 100 >= Range * MinDensity); } +MVT TargetLoweringBase::getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const { + return getRegisterType(Context, ConditionVT); +} + /// Get the EVTs and ArgFlags collections that represent the legalized return /// type of the given function. This does not require a DAG or a return value, /// and is suitable for use before any DAGs for the function are constructed. @@ -2066,9 +2065,11 @@ static std::string getReciprocalOpName(bool IsSqrt, EVT VT) { Name += IsSqrt ? "sqrt" : "div"; - // TODO: Handle "half" or other float types? + // TODO: Handle other float types? if (VT.getScalarType() == MVT::f64) { Name += "d"; + } else if (VT.getScalarType() == MVT::f16) { + Name += "h"; } else { assert(VT.getScalarType() == MVT::f32 && "Unexpected FP type for reciprocal estimate"); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index ce350034d073..f3d68bd9c92d 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -310,7 +310,7 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer, auto *S = C.getELFSection(".linker-options", ELF::SHT_LLVM_LINKER_OPTIONS, ELF::SHF_EXCLUDE); - Streamer.SwitchSection(S); + Streamer.switchSection(S); for (const auto *Operand : LinkerOptions->operands()) { if (cast(Operand)->getNumOperands() != 2) @@ -326,7 +326,7 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer, auto *S = C.getELFSection(".deplibs", ELF::SHT_LLVM_DEPENDENT_LIBRARIES, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1); - Streamer.SwitchSection(S); + Streamer.switchSection(S); for (const auto *Operand : DependentLibraries->operands()) { Streamer.emitBytes( @@ -350,7 +350,7 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer, auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection( TM->getFunctionSections() ? 
Name->getString() : StringRef()); - Streamer.SwitchSection(S); + Streamer.switchSection(S); Streamer.emitInt64(GUID->getZExtValue()); Streamer.emitInt64(Hash->getZExtValue()); Streamer.emitULEB128IntValue(Name->getString().size()); @@ -365,11 +365,11 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer, GetObjCImageInfo(M, Version, Flags, Section); if (!Section.empty()) { auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); - Streamer.SwitchSection(S); + Streamer.switchSection(S); Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO"))); Streamer.emitInt32(Version); Streamer.emitInt32(Flags); - Streamer.AddBlankLine(); + Streamer.addBlankLine(); } emitCGProfileMetadata(Streamer, M); @@ -399,7 +399,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue( MCSection *Sec = getContext().getELFNamedSection(".data", Label->getName(), ELF::SHT_PROGBITS, Flags, 0); unsigned Size = DL.getPointerSize(); - Streamer.SwitchSection(Sec); + Streamer.switchSection(Sec); Streamer.emitValueToAlignment(DL.getPointerABIAlignment(0).value()); Streamer.emitSymbolAttribute(Label, MCSA_ELF_TypeObject); const MCExpr *E = MCConstantExpr::create(Size, getContext()); @@ -449,6 +449,9 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { Name == ".llvmbc" || Name == ".llvmcmd") return SectionKind::getMetadata(); + if (Name == ".llvm.offloading") + return SectionKind::getExclude(); + if (Name.empty() || Name[0] != '.') return K; // Default implementation based on some magic section names. @@ -507,9 +510,12 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) { static unsigned getELFSectionFlags(SectionKind K) { unsigned Flags = 0; - if (!K.isMetadata()) + if (!K.isMetadata() && !K.isExclude()) Flags |= ELF::SHF_ALLOC; + if (K.isExclude()) + Flags |= ELF::SHF_EXCLUDE; + if (K.isText()) Flags |= ELF::SHF_EXECINSTR; @@ -681,9 +687,10 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, } if (Retain) { - if ((Ctx.getAsmInfo()->useIntegratedAssembler() || - Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) && - !TM.getTargetTriple().isOSSolaris()) + if (TM.getTargetTriple().isOSSolaris()) + Flags |= ELF::SHF_SUNW_NODISCARD; + else if (Ctx.getAsmInfo()->useIntegratedAssembler() || + Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) Flags |= ELF::SHF_GNU_RETAIN; return NextUniqueID++; } @@ -860,12 +867,15 @@ static MCSection *selectELFSectionForGlobal( EmitUniqueSection = true; Flags |= ELF::SHF_LINK_ORDER; } - if (Retain && - (Ctx.getAsmInfo()->useIntegratedAssembler() || - Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) && - !TM.getTargetTriple().isOSSolaris()) { - EmitUniqueSection = true; - Flags |= ELF::SHF_GNU_RETAIN; + if (Retain) { + if (TM.getTargetTriple().isOSSolaris()) { + EmitUniqueSection = true; + Flags |= ELF::SHF_SUNW_NODISCARD; + } else if (Ctx.getAsmInfo()->useIntegratedAssembler() || + Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) { + EmitUniqueSection = true; + Flags |= ELF::SHF_GNU_RETAIN; + } } MCSectionELF *Section = selectELFSectionForGlobal( @@ -1171,6 +1181,15 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx, dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; } +MCSection *TargetLoweringObjectFileMachO::getStaticDtorSection( + unsigned Priority, const MCSymbol *KeySym) const { + // TODO(yln): Remove -lower-global-dtors-via-cxa-atexit fallback flag + // (LowerGlobalDtorsViaCxaAtExit) and always issue a fatal error here. 
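The ELF hunks above add SHF_SUNW_NODISCARD as the Solaris counterpart of SHF_GNU_RETAIN and route exclude-kind sections such as .llvm.offloading to SHF_EXCLUDE. The retain decision, condensed into a standalone helper (the helper is illustrative; the patch inlines this logic at two call sites):

  static unsigned retainFlagFor(const Triple &TT, const MCAsmInfo &MAI) {
    if (TT.isOSSolaris())
      return ELF::SHF_SUNW_NODISCARD;  // Solaris ld honors this flag
    if (MAI.useIntegratedAssembler() || MAI.binutilsIsAtLeast(2, 36))
      return ELF::SHF_GNU_RETAIN;      // needs GAS from binutils >= 2.36
    return 0;                          // older GAS: no retain flag available
  }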
+ if (TM->Options.LowerGlobalDtorsViaCxaAtExit) + report_fatal_error("@llvm.global_dtors should have been lowered already"); + return StaticDtorSection; +} + void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer, Module &M) const { // Emit the linker options if present. @@ -1207,12 +1226,12 @@ void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer, // Get the section. MCSectionMachO *S = getContext().getMachOSection( Segment, Section, TAA, StubSize, SectionKind::getData()); - Streamer.SwitchSection(S); + Streamer.switchSection(S); Streamer.emitLabel(getContext(). getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO"))); Streamer.emitInt32(VersionVal); Streamer.emitInt32(ImageInfoFlags); - Streamer.AddBlankLine(); + Streamer.addBlankLine(); } static void checkMachOComdat(const GlobalValue *GV) { @@ -1520,6 +1539,9 @@ getCOFFSectionFlags(SectionKind K, const TargetMachine &TM) { if (K.isMetadata()) Flags |= COFF::IMAGE_SCN_MEM_DISCARDABLE; + else if (K.isExclude()) + Flags |= + COFF::IMAGE_SCN_LNK_REMOVE | COFF::IMAGE_SCN_MEM_DISCARDABLE; else if (K.isText()) Flags |= COFF::IMAGE_SCN_MEM_EXECUTE | @@ -1755,11 +1777,11 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); - Streamer.SwitchSection(S); + Streamer.switchSection(S); Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO"))); Streamer.emitInt32(Version); Streamer.emitInt32(Flags); - Streamer.AddBlankLine(); + Streamer.addBlankLine(); } emitCGProfileMetadata(Streamer, M); @@ -1772,7 +1794,7 @@ void TargetLoweringObjectFileCOFF::emitLinkerDirectives( // spec, this section is a space-separated string containing flags for // linker. MCSection *Sec = getDrectveSection(); - Streamer.SwitchSection(Sec); + Streamer.switchSection(Sec); for (const auto *Option : LinkerOptions->operands()) { for (const auto &Piece : cast(Option)->operands()) { // Lead with a space for consistency with our dllexport implementation. @@ -1791,7 +1813,7 @@ void TargetLoweringObjectFileCOFF::emitLinkerDirectives( getMangler()); OS.flush(); if (!Flags.empty()) { - Streamer.SwitchSection(getDrectveSection()); + Streamer.switchSection(getDrectveSection()); Streamer.emitBytes(Flags); } Flags.clear(); @@ -1817,7 +1839,7 @@ void TargetLoweringObjectFileCOFF::emitLinkerDirectives( OS.flush(); if (!Flags.empty()) { - Streamer.SwitchSection(getDrectveSection()); + Streamer.switchSection(getDrectveSection()); Streamer.emitBytes(Flags); } Flags.clear(); @@ -2170,8 +2192,7 @@ MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection( MCSection *TargetLoweringObjectFileWasm::getStaticDtorSection( unsigned Priority, const MCSymbol *KeySym) const { - llvm_unreachable("@llvm.global_dtors should have been lowered already"); - return nullptr; + report_fatal_error("@llvm.global_dtors should have been lowered already"); } //===----------------------------------------------------------------------===// @@ -2544,10 +2565,24 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry( XCOFF::XTY_SD)); } +MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA( + const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const { + auto *LSDA = cast(LSDASection); + if (TM.getFunctionSections()) { + // If option -ffunction-sections is on, append the function name to the + // name of the LSDA csect so that each function has its own LSDA csect. 
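Both the MachO hook above and the Wasm hook in this same hunk now fail fast if @llvm.global_dtors survives to section selection: the new lowering pass is expected to have turned every destructor entry into a constructor-time __cxa_atexit registration, avoiding the deprecated __mod_term_func section. A freestanding C++ analogy of that rewrite (all names illustrative):

  extern "C" int __cxa_atexit(void (*Fn)(void *), void *Arg, void *Dso);
  extern void *__dso_handle;

  static void moduleDtor(void *) { /* former global_dtors body */ }

  // Runs as a static constructor and registers the destructor instead.
  __attribute__((constructor)) static void registerModuleDtor() {
    __cxa_atexit(moduleDtor, nullptr, &__dso_handle);
  }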
+ // This helps the linker to garbage-collect EH info of unused functions. + SmallString<128> NameStr = LSDA->getName(); + raw_svector_ostream(NameStr) << '.' << F.getName(); + LSDA = getContext().getXCOFFSection(NameStr, LSDA->getKind(), + LSDA->getCsectProp()); + } + return LSDA; +} //===----------------------------------------------------------------------===// // GOFF //===----------------------------------------------------------------------===// -TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF() {} +TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF() = default; MCSection *TargetLoweringObjectFileGOFF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { @@ -2558,8 +2593,8 @@ MCSection *TargetLoweringObjectFileGOFF::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { auto *Symbol = TM.getSymbol(GO); if (Kind.isBSS()) - return getContext().getGOFFSection(Symbol->getName(), - SectionKind::getBSS()); + return getContext().getGOFFSection(Symbol->getName(), SectionKind::getBSS(), + nullptr, nullptr); return getContext().getObjectFileInfo()->getTextSection(); } diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 0731cf9b28f4..af5d10103f78 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -15,7 +15,6 @@ #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 05004fb935df..0bd229f4fc68 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/CSEConfigBase.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachinePassRegistry.h" @@ -47,7 +48,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/SymbolRewriter.h" #include #include @@ -115,20 +115,18 @@ static cl::opt PrintGCInfo("print-gc", cl::Hidden, cl::desc("Dump garbage collector data")); static cl::opt VerifyMachineCode("verify-machineinstrs", cl::Hidden, - cl::desc("Verify generated machine code"), - cl::ZeroOrMore); -static cl::opt DebugifyAndStripAll( - "debugify-and-strip-all-safe", cl::Hidden, - cl::desc( - "Debugify MIR before and Strip debug after " - "each pass except those known to be unsafe when debug info is present"), - cl::ZeroOrMore); + cl::desc("Verify generated machine code")); +static cl::opt + DebugifyAndStripAll("debugify-and-strip-all-safe", cl::Hidden, + cl::desc("Debugify MIR before and Strip debug after " + "each pass except those known to be unsafe " + "when debug info is present")); static cl::opt DebugifyCheckAndStripAll( "debugify-check-and-strip-all-safe", cl::Hidden, cl::desc( "Debugify MIR before, by checking and stripping the debug info after, " - "each pass except those known to be unsafe when debug info is present"), - cl::ZeroOrMore); + "each pass except those known to be unsafe when debug info is " + "present")); // Enable or disable the MachineOutliner. 
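The option-declaration churn above mostly drops cl::ZeroOrMore: repeated occurrences of a cl::opt are now accepted by default, so the flag no longer carries information. An equivalent declaration after the cleanup (MyFlag is illustrative):

  #include "llvm/Support/CommandLine.h"
  using namespace llvm;

  // No cl::ZeroOrMore needed: passing -my-flag twice is not an error.
  static cl::opt<bool> MyFlag("my-flag", cl::Hidden, cl::init(false),
                              cl::desc("Illustrative boolean flag"));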
static cl::opt EnableMachineOutliner( "enable-machine-outliner", cl::desc("Enable the machine outliner"), @@ -139,6 +137,11 @@ static cl::opt EnableMachineOutliner( "Disable all outlining"), // Sentinel value for unspecified option. clEnumValN(RunOutliner::AlwaysOutline, "", ""))); +// Disable the pass to fix unwind information. Whether the pass is included in +// the pipeline is controlled via the target options; this option serves as a +// manual override. +static cl::opt DisableCFIFixup("disable-cfi-fixup", cl::Hidden, + cl::desc("Disable the CFI fixup pass")); // Enable or disable FastISel. Both options are needed, because // FastISel is enabled by default with -fast, and we wish to be // able to enable or disable fast-isel independently from -O0. @@ -175,12 +178,12 @@ static cl::opt // Disable MIRProfileLoader before RegAlloc. This is for debugging and // tuning purposes. static cl::opt DisableRAFSProfileLoader( - "disable-ra-fsprofile-loader", cl::init(true), cl::Hidden, + "disable-ra-fsprofile-loader", cl::init(false), cl::Hidden, cl::desc("Disable MIRProfileLoader before RegAlloc")); // Disable MIRProfileLoader before BlockPlacement. This is for debugging // and tuning purposes. static cl::opt DisableLayoutFSProfileLoader( - "disable-layout-fsprofile-loader", cl::init(true), cl::Hidden, + "disable-layout-fsprofile-loader", cl::init(false), cl::Hidden, cl::desc("Disable MIRProfileLoader before BlockPlacement")); // Specify FSProfile file name. static cl::opt @@ -256,6 +259,11 @@ static cl::opt DisableExpandReductions( "disable-expand-reductions", cl::init(false), cl::Hidden, cl::desc("Disable the expand reduction intrinsics pass from running")); +/// Disable the select optimization pass. +static cl::opt DisableSelectOptimize( + "disable-select-optimize", cl::init(true), cl::Hidden, + cl::desc("Disable the select-optimization pass from running")); + /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. @@ -490,6 +498,7 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() { SET_BOOLEAN_OPTION(DisableConstantHoisting) SET_BOOLEAN_OPTION(DisableCGP) SET_BOOLEAN_OPTION(DisablePartialLibcallInlining) + SET_BOOLEAN_OPTION(DisableSelectOptimize) SET_BOOLEAN_OPTION(PrintLSR) SET_BOOLEAN_OPTION(PrintISelInput) SET_BOOLEAN_OPTION(PrintGCInfo) @@ -736,21 +745,21 @@ void TargetPassConfig::addPass(Pass *P) { if (StopBefore == PassID && StopBeforeCount++ == StopBeforeInstanceNum) Stopped = true; if (Started && !Stopped) { - if (AddingMachinePasses) + if (AddingMachinePasses) { + // Construct banner message before PM->add() as that may delete the pass. + std::string Banner = + std::string("After ") + std::string(P->getPassName()); addMachinePrePasses(); - std::string Banner; - // Construct banner message before PM->add() as that may delete the pass. - if (AddingMachinePasses) - Banner = std::string("After ") + std::string(P->getPassName()); - PM->add(P); - if (AddingMachinePasses) + PM->add(P); addMachinePostPasses(Banner); + } else { + PM->add(P); + } // Add the passes after the pass P if there are any.
- for (const auto &IP : Impl->InsertedPasses) { + for (const auto &IP : Impl->InsertedPasses) if (IP.TargetPassID == PassID) addPass(IP.getInsertedPass()); - } } else { delete P; } @@ -895,6 +904,12 @@ void TargetPassConfig::addIRPasses() { addPass(&ShadowStackGCLoweringID); addPass(createLowerConstantIntrinsicsPass()); + // For MachO, lower @llvm.global_dtors into @llvm_global_ctors with + // __cxa_atexit() calls to avoid emitting the deprecated __mod_term_func. + if (TM->getTargetTriple().isOSBinFormatMachO() && + TM->Options.LowerGlobalDtorsViaCxaAtExit) + addPass(createLowerGlobalDtorsLegacyPass()); + // Make sure that no unreachable blocks are instruction selected. addPass(createUnreachableBlockEliminationPass()); @@ -922,6 +937,13 @@ void TargetPassConfig::addIRPasses() { // Allow disabling it for testing purposes. if (!DisableExpandReductions) addPass(createExpandReductionsPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(createTLSVariableHoistPass()); + + // Convert conditional moves to conditional jumps when profitable. + if (getOptLevel() != CodeGenOpt::None && !DisableSelectOptimize) + addPass(createSelectOptimizePass()); } /// Turn exception handling constructs into something the code generators can @@ -1261,12 +1283,19 @@ void TargetPassConfig::addMachinePasses() { // FIXME: In principle, BasicBlockSection::Labels and splitting can used // together. Update this check once we have addressed any issues. if (TM->getBBSectionsType() != llvm::BasicBlockSection::None) { - addPass(llvm::createBasicBlockSectionsPass(TM->getBBSectionsFuncListBuf())); + if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) { + addPass(llvm::createBasicBlockSectionsProfileReaderPass( + TM->getBBSectionsFuncListBuf())); + } + addPass(llvm::createBasicBlockSectionsPass()); } else if (TM->Options.EnableMachineFunctionSplitter || EnableMachineFunctionSplitter) { addPass(createMachineFunctionSplitterPass()); } + if (!DisableCFIFixup && TM->Options.EnableCFIFixup) + addPass(createCFIFixup()); + // Add passes that directly emit MI after all other MI passes. 
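The addPass restructuring above fixes the lifetime hazard its new comment names: the legacy pass manager may take ownership of and delete P inside add(), so the banner must be captured first. Reduced to its essentials:

  // P may be freed inside PM->add() (ownership transfer), so read the
  // pass name before handing P over; the saved string stays valid.
  std::string Banner = std::string("After ") + std::string(P->getPassName());
  addMachinePrePasses();
  PM->add(P);                   // may delete P
  addMachinePostPasses(Banner); // uses the copy, never touches P again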
addPreEmitPass2(); @@ -1376,6 +1405,11 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) { return createTargetRegisterAllocator(Optimized); } +bool TargetPassConfig::isCustomizedRegAlloc() { + return RegAlloc != + (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator; +} + bool TargetPassConfig::addRegAssignAndRewriteFast() { if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator && RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator) diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 6bcf79547056..ac346585b0f8 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -16,10 +16,11 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index ce59452fd1b8..ac07c86cab85 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -16,7 +16,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp index e4520d8ccb1e..ba2c8dda7de5 100644 --- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -45,10 +45,6 @@ bool TargetSubtargetInfo::enableRALocalReassignment( return true; } -bool TargetSubtargetInfo::enableAdvancedRASplitCost() const { - return false; -} - bool TargetSubtargetInfo::enablePostRAScheduler() const { return getSchedModel().PostRAScheduler; } diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index dfd962be2882..c44fd9f97383 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -28,7 +28,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -50,7 +49,6 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -163,6 +161,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass { bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&); void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist); void eliminateRegSequence(MachineBasicBlock::iterator&); + bool processStatepoint(MachineInstr *MI, TiedOperandMap &TiedOperands); public: static char ID; // Pass identification, replacement for typeid @@ -1629,6 +1628,61 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } } +// For every tied operand 
pair this function transforms statepoint from +// RegA = STATEPOINT ... RegB(tied-def N) +// to +// RegB = STATEPOINT ... RegB(tied-def N) +// and replaces all uses of RegA with RegB. +// No extra COPY instruction is necessary because tied use is killed at +// STATEPOINT. +bool TwoAddressInstructionPass::processStatepoint( + MachineInstr *MI, TiedOperandMap &TiedOperands) { + + bool NeedCopy = false; + for (auto &TO : TiedOperands) { + Register RegB = TO.first; + if (TO.second.size() != 1) { + NeedCopy = true; + continue; + } + + unsigned SrcIdx = TO.second[0].first; + unsigned DstIdx = TO.second[0].second; + + MachineOperand &DstMO = MI->getOperand(DstIdx); + Register RegA = DstMO.getReg(); + + assert(RegB == MI->getOperand(SrcIdx).getReg()); + + if (RegA == RegB) + continue; + + MRI->replaceRegWith(RegA, RegB); + + if (LIS) { + VNInfo::Allocator &A = LIS->getVNInfoAllocator(); + LiveInterval &LI = LIS->getInterval(RegB); + for (auto &S : LIS->getInterval(RegA)) { + VNInfo *VNI = LI.getNextValue(S.start, A); + LiveRange::Segment NewSeg(S.start, S.end, VNI); + LI.addSegment(NewSeg); + } + LIS->removeInterval(RegA); + } + + if (LV) { + if (MI->getOperand(SrcIdx).isKill()) + LV->removeVirtualRegisterKilled(RegB, *MI); + LiveVariables::VarInfo &SrcInfo = LV->getVarInfo(RegB); + LiveVariables::VarInfo &DstInfo = LV->getVarInfo(RegA); + SrcInfo.AliveBlocks |= DstInfo.AliveBlocks; + for (auto *KillMI : DstInfo.Kills) + LV->addVirtualRegisterKilled(RegB, *KillMI, false); + } + } + return !NeedCopy; +} + /// Reduce two-address instructions to two operands. bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; @@ -1722,6 +1776,14 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { } } + if (mi->getOpcode() == TargetOpcode::STATEPOINT && + processStatepoint(&*mi, TiedOperands)) { + TiedOperands.clear(); + LLVM_DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); + mi = nmi; + continue; + } + // Now iterate over the information collected above. 
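The new processStatepoint above sidesteps the COPY that processTiedPairs would otherwise insert for tied statepoint operands. The core rewrite in isolation, with the MIR before/after as comments:

  // Before: %a = STATEPOINT ..., %b(tied-def k)
  // After:  %b = STATEPOINT ..., %b(tied-def k)   and every use of %a -> %b
  // No COPY is needed because the tied use of %b is killed at the STATEPOINT.
  MRI->replaceRegWith(RegA, RegB);
  // LiveIntervals (or LiveVariables) bookkeeping is then migrated from
  // %a to %b, as the hunk above does segment by segment.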
for (auto &TO : TiedOperands) { processTiedPairs(&*mi, TO.second, Dist); @@ -1733,11 +1795,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { // From %reg = INSERT_SUBREG %reg, %subreg, subidx // To %reg:subidx = COPY %subreg unsigned SubIdx = mi->getOperand(3).getImm(); - mi->RemoveOperand(3); + mi->removeOperand(3); assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); mi->getOperand(0).setSubReg(SubIdx); mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef()); - mi->RemoveOperand(1); + mi->removeOperand(1); mi->setDesc(TII->get(TargetOpcode::COPY)); LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); @@ -1858,7 +1920,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { LLVM_DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF"); MI.setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j) - MI.RemoveOperand(j); + MI.removeOperand(j); } else { if (LIS) LIS->RemoveMachineInstrFromMaps(MI); diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp index 01ea171e5ea2..166a3c413f6a 100644 --- a/llvm/lib/CodeGen/TypePromotion.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -24,15 +24,13 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -44,9 +42,9 @@ using namespace llvm; -static cl::opt -DisablePromotion("disable-type-promotion", cl::Hidden, cl::init(false), - cl::desc("Disable type promotion pass")); +static cl::opt DisablePromotion("disable-type-promotion", cl::Hidden, + cl::init(false), + cl::desc("Disable type promotion pass")); // The goal of this pass is to enable more efficient code generation for // operations on narrow types (i.e. 
types with < 32-bits) and this is a @@ -103,17 +101,16 @@ DisablePromotion("disable-type-promotion", cl::Hidden, cl::init(false), namespace { class IRPromoter { LLVMContext &Ctx; - IntegerType *OrigTy = nullptr; unsigned PromotedWidth = 0; - SetVector &Visited; - SetVector &Sources; - SetVector &Sinks; + SetVector &Visited; + SetVector &Sources; + SetVector &Sinks; SmallPtrSetImpl &SafeWrap; IntegerType *ExtTy = nullptr; - SmallPtrSet NewInsts; - SmallPtrSet InstsToRemove; - DenseMap> TruncTysMap; - SmallPtrSet Promoted; + SmallPtrSet NewInsts; + SmallPtrSet InstsToRemove; + DenseMap> TruncTysMap; + SmallPtrSet Promoted; void ReplaceAllUsersOfWith(Value *From, Value *To); void ExtendSources(); @@ -123,16 +120,13 @@ class IRPromoter { void Cleanup(); public: - IRPromoter(LLVMContext &C, IntegerType *Ty, unsigned Width, + IRPromoter(LLVMContext &C, unsigned Width, SetVector &visited, SetVector &sources, SetVector &sinks, SmallPtrSetImpl &wrap) - : Ctx(C), OrigTy(Ty), PromotedWidth(Width), Visited(visited), + : Ctx(C), PromotedWidth(Width), Visited(visited), Sources(sources), Sinks(sinks), SafeWrap(wrap) { ExtTy = IntegerType::get(Ctx, PromotedWidth); - assert(OrigTy->getPrimitiveSizeInBits().getFixedSize() < - ExtTy->getPrimitiveSizeInBits().getFixedSize() && - "Original type not smaller than extended type"); } void Mutate(); @@ -142,8 +136,8 @@ class TypePromotion : public FunctionPass { unsigned TypeSize = 0; LLVMContext *Ctx = nullptr; unsigned RegisterBitWidth = 0; - SmallPtrSet AllVisited; - SmallPtrSet SafeToPromote; + SmallPtrSet AllVisited; + SmallPtrSet SafeToPromote; SmallPtrSet SafeWrap; // Does V have the same size result type as TypeSize. @@ -190,7 +184,7 @@ public: bool runOnFunction(Function &F) override; }; -} +} // namespace static bool GenerateSignBits(Instruction *I) { unsigned Opc = I->getOpcode(); @@ -245,7 +239,7 @@ bool TypePromotion::isSource(Value *V) { bool TypePromotion::isSink(Value *V) { // TODO The truncate also isn't actually necessary because we would already // proved that the data value is kept within the range of the original data - // type. + // type. We currently remove any truncs inserted for handling zext sinks. // Sinks are: // - points where the value in the register is being observed, such as an @@ -269,7 +263,7 @@ bool TypePromotion::isSink(Value *V) { /// Return whether this instruction can safely wrap. bool TypePromotion::isSafeWrap(Instruction *I) { - // We can support a, potentially, wrapping instruction (I) if: + // We can support a potentially wrapping instruction (I) if: // - It is only used by an unsigned icmp. // - The icmp uses a constant. // - The wrapping value (I) is decreasing, i.e would underflow - wrapping @@ -356,7 +350,7 @@ bool TypePromotion::isSafeWrap(Instruction *I) { if (!OverflowConst.isNonPositive()) return false; - // Using C1 = OverflowConst and C2 = ICmpConst, we can use either prove that: + // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that: // zext(x) + sext(C1) s C2 // zext(x) + sext(C1) Users; + SmallVector Users; Instruction *InstTo = dyn_cast(To); bool ReplacedAll = true; @@ -485,12 +479,18 @@ void IRPromoter::PromoteTree() { continue; if (auto *Const = dyn_cast(Op)) { - Constant *NewConst = SafeWrap.contains(I) + // For subtract, we don't need to sext the constant. We only put it in + // SafeWrap because SafeWrap.size() is used elsewhere. + // For cmp, we need to sign extend a constant appearing in either + // operand. For add, we should only sign extend the RHS. 
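The extension rule spelled out above (sign-extend a wrapping constant for an icmp or for the right-hand side of an add, zero-extend otherwise, and never sign-extend for a sub) is what keeps wrapped arithmetic correct after widening; the NewConst selection just below applies it. A concrete instance of the underflow case as plain C++ (widths illustrative):

  #include <cstdint>

  bool narrow(uint8_t X) { return uint8_t(X - 1) < 20; }

  bool widened(uint8_t X) {
    // zext(X) + sext(-1): X == 0 yields 0xFFFFFFFF instead of 255, but
    // both values fail the unsigned < 20 test, so every X agrees with
    // narrow(). That is exactly the property isSafeWrap checks for.
    uint32_t Wide = uint32_t(X) + uint32_t(int32_t(-1));
    return Wide < 20u;
  }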
+ Constant *NewConst = (SafeWrap.contains(I) && + (I->getOpcode() == Instruction::ICmp || i == 1) && + I->getOpcode() != Instruction::Sub) ? ConstantExpr::getSExt(Const, ExtTy) : ConstantExpr::getZExt(Const, ExtTy); I->setOperand(i, NewConst); } else if (isa(Op)) - I->setOperand(i, UndefValue::get(ExtTy)); + I->setOperand(i, ConstantInt::get(ExtTy, 0)); } // Mutate the result type, unless this is an icmp or switch. @@ -506,7 +506,7 @@ void IRPromoter::TruncateSinks() { IRBuilder<> Builder{Ctx}; - auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* { + auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction * { if (!isa(V) || !isa(V->getType())) return nullptr; @@ -514,7 +514,7 @@ void IRPromoter::TruncateSinks() { return nullptr; LLVM_DEBUG(dbgs() << "IR Promotion: Creating " << *TruncTy << " Trunc for " - << *V << "\n"); + << *V << "\n"); Builder.SetInsertPoint(cast(V)); auto *Trunc = dyn_cast(Builder.CreateTrunc(V, TruncTy)); if (Trunc) @@ -550,6 +550,11 @@ void IRPromoter::TruncateSinks() { continue; } + // Don't insert a trunc for a zext which can still legally promote. + if (auto ZExt = dyn_cast(I)) + if (ZExt->getType()->getScalarSizeInBits() > PromotedWidth) + continue; + // Now handle the others. for (unsigned i = 0; i < I->getNumOperands(); ++i) { Type *Ty = TruncTysMap[I][i]; @@ -576,16 +581,14 @@ void IRPromoter::Cleanup() { Value *Src = ZExt->getOperand(0); if (ZExt->getSrcTy() == ZExt->getDestTy()) { LLVM_DEBUG(dbgs() << "IR Promotion: Removing unnecessary cast: " << *ZExt - << "\n"); + << "\n"); ReplaceAllUsersOfWith(ZExt, Src); continue; } - // Unless they produce a value that is narrower than ExtTy, we can - // replace the result of the zext with the input of a newly inserted - // trunc. - if (NewInsts.count(Src) && isa(Src) && - Src->getType() == OrigTy) { + // We've inserted a trunc for a zext sink, but we already know that the + // input is in range, negating the need for the trunc. + if (NewInsts.count(Src) && isa(Src)) { auto *Trunc = cast(Src); assert(Trunc->getOperand(0)->getType() == ExtTy && "expected inserted trunc to be operating on i32"); @@ -615,7 +618,7 @@ void IRPromoter::ConvertTruncs() { unsigned NumBits = DestTy->getScalarSizeInBits(); ConstantInt *Mask = - ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue()); + ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue()); Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); if (auto *I = dyn_cast(Masked)) @@ -626,8 +629,8 @@ void IRPromoter::ConvertTruncs() { } void IRPromoter::Mutate() { - LLVM_DEBUG(dbgs() << "IR Promotion: Promoting use-def chains from " - << OrigTy->getBitWidth() << " to " << PromotedWidth << "-bits\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Promoting use-def chains to " + << PromotedWidth << "-bits\n"); // Cache original types of the values that will likely need truncating for (auto *I : Sinks) { @@ -677,8 +680,7 @@ bool TypePromotion::isSupportedType(Value *V) { if (Ty->isVoidTy() || Ty->isPointerTy()) return true; - if (!isa(Ty) || - cast(Ty)->getBitWidth() == 1 || + if (!isa(Ty) || cast(Ty)->getBitWidth() == 1 || cast(Ty)->getBitWidth() > RegisterBitWidth) return false; @@ -738,13 +740,12 @@ bool TypePromotion::isSupportedValue(Value *V) { /// smaller than the targeted promoted type. Check that we're not trying to /// promote something larger than our base 'TypeSize' type. 
bool TypePromotion::isLegalToPromote(Value *V) { - auto *I = dyn_cast(V); if (!I) return true; if (SafeToPromote.count(I)) - return true; + return true; if (isPromotedResultSafe(I) || isSafeWrap(I)) { SafeToPromote.insert(I); @@ -765,10 +766,10 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) { LLVM_DEBUG(dbgs() << "IR Promotion: TryToPromote: " << *V << ", from " << TypeSize << " bits to " << PromotedWidth << "\n"); - SetVector WorkList; - SetVector Sources; - SetVector Sinks; - SetVector CurrentVisited; + SetVector WorkList; + SetVector Sources; + SetVector Sinks; + SetVector CurrentVisited; WorkList.insert(V); // Return true if V was added to the worklist as a supported instruction, @@ -839,14 +840,15 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) { } } - LLVM_DEBUG(dbgs() << "IR Promotion: Visited nodes:\n"; - for (auto *I : CurrentVisited) - I->dump(); - ); + LLVM_DEBUG({ + dbgs() << "IR Promotion: Visited nodes:\n"; + for (auto *I : CurrentVisited) + I->dump(); + }); unsigned ToPromote = 0; unsigned NonFreeArgs = 0; - SmallPtrSet Blocks; + SmallPtrSet Blocks; for (auto *V : CurrentVisited) { if (auto *I = dyn_cast(V)) Blocks.insert(I->getParent()); @@ -860,16 +862,16 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) { if (Sinks.count(cast(V))) continue; - ++ToPromote; - } + ++ToPromote; + } // DAG optimizations should be able to handle these cases better, especially // for function arguments. if (ToPromote < 2 || (Blocks.size() == 1 && (NonFreeArgs > SafeWrap.size()))) return false; - IRPromoter Promoter(*Ctx, cast(OrigTy), PromotedWidth, - CurrentVisited, Sources, Sinks, SafeWrap); + IRPromoter Promoter(*Ctx, PromotedWidth, CurrentVisited, Sources, Sinks, + SafeWrap); Promoter.Mutate(); return true; } @@ -893,14 +895,14 @@ bool TypePromotion::runOnFunction(Function &F) { const TargetSubtargetInfo *SubtargetInfo = TM.getSubtargetImpl(F); const TargetLowering *TLI = SubtargetInfo->getTargetLowering(); const TargetTransformInfo &TII = - getAnalysis().getTTI(F); + getAnalysis().getTTI(F); RegisterBitWidth = TII.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedSize(); Ctx = &F.getParent()->getContext(); // Search up from icmps to try to promote their operands. 
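TryToPromote above only commits when at least two instructions would benefit, then hands the visited set to IRPromoter; the search starts from unsigned icmps over illegally narrow types, per the comment above. A source-level analogy of the net effect (widths illustrative; the pass itself rewrites IR):

  #include <cstdint>

  // Narrow form: the backend must renarrow A + B before comparing.
  bool before(uint8_t A, uint8_t B) { return uint8_t(A + B) > 42; }

  // Promoted form: the chain is 32-bit; one mask at the observation
  // point preserves the 8-bit wrap-around semantics.
  bool after(uint8_t A, uint8_t B) {
    return ((uint32_t(A) + uint32_t(B)) & 0xFFu) > 42u;
  }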
for (BasicBlock &BB : F) { - for (auto &I : BB) { + for (Instruction &I : BB) { if (AllVisited.count(&I)) continue; @@ -909,8 +911,7 @@ bool TypePromotion::runOnFunction(Function &F) { auto *ICmp = cast(&I); // Skip signed or pointer compares - if (ICmp->isSigned() || - !isa(ICmp->getOperand(0)->getType())) + if (ICmp->isSigned() || !isa(ICmp->getOperand(0)->getType())) continue; LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << *ICmp << "\n"); @@ -921,13 +922,13 @@ bool TypePromotion::runOnFunction(Function &F) { if (SrcVT.isSimple() && TLI->isTypeLegal(SrcVT.getSimpleVT())) break; - if (TLI->getTypeAction(ICmp->getContext(), SrcVT) != + if (TLI->getTypeAction(*Ctx, SrcVT) != TargetLowering::TypePromoteInteger) break; - EVT PromotedVT = TLI->getTypeToTransformTo(ICmp->getContext(), SrcVT); + EVT PromotedVT = TLI->getTypeToTransformTo(*Ctx, SrcVT); if (RegisterBitWidth < PromotedVT.getFixedSizeInBits()) { LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target register " - << "for promoted type\n"); + << "for promoted type\n"); break; } @@ -936,13 +937,7 @@ bool TypePromotion::runOnFunction(Function &F) { } } } - LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { - dbgs() << F; - report_fatal_error("Broken function after type promotion"); - }); } - if (MadeChange) - LLVM_DEBUG(dbgs() << "After TypePromotion: " << F << "\n"); AllVisited.clear(); SafeToPromote.clear(); @@ -956,6 +951,4 @@ INITIALIZE_PASS_END(TypePromotion, DEBUG_TYPE, PASS_NAME, false, false) char TypePromotion::ID = 0; -FunctionPass *llvm::createTypePromotionPass() { - return new TypePromotion(); -} +FunctionPass *llvm::createTypePromotionPass() { return new TypePromotion(); } diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index 3426a03b6083..5e8514f525e9 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -26,16 +26,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -131,8 +125,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2) if (start->getOperand(i).isMBB() && start->getOperand(i).getMBB() == &BB) { - start->RemoveOperand(i); - start->RemoveOperand(i-1); + start->removeOperand(i); + start->removeOperand(i-1); } start++; @@ -162,8 +156,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { while (phi != BB.end() && phi->isPHI()) { for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2) if (!preds.count(phi->getOperand(i).getMBB())) { - phi->RemoveOperand(i); - phi->RemoveOperand(i-1); + phi->removeOperand(i); + phi->removeOperand(i-1); ModifiedPHI = true; } diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp index 5f59cb4643f2..8b5b585090f5 100644 --- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" 
#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -43,19 +42,18 @@ using namespace llvm; #define DEBUG_TYPE "machine-scheduler" static cl::opt IgnoreBBRegPressure("ignore-bb-reg-pressure", cl::Hidden, - cl::ZeroOrMore, cl::init(false)); + cl::init(false)); static cl::opt UseNewerCandidate("use-newer-candidate", cl::Hidden, - cl::ZeroOrMore, cl::init(true)); + cl::init(true)); static cl::opt SchedDebugVerboseLevel("misched-verbose-level", - cl::Hidden, cl::ZeroOrMore, - cl::init(1)); + cl::Hidden, cl::init(1)); // Check if the scheduler should penalize instructions that are available to // early due to a zero-latency dependence. static cl::opt CheckEarlyAvail("check-early-avail", cl::Hidden, - cl::ZeroOrMore, cl::init(true)); + cl::init(true)); // This value is used to determine if a register class is a high pressure set. // We compute the maximum number of registers needed and divided by the total diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 0c42bef82005..f577aff39ea7 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TypeSize.h" +#include "llvm/Support/WithColor.h" using namespace llvm; EVT EVT::changeExtendedTypeToInteger() const { @@ -179,19 +180,22 @@ std::string EVT::getEVTString() const { /// specified EVT. For integer types, this returns an unsigned type. Note /// that this will abort for types that cannot be represented. Type *EVT::getTypeForEVT(LLVMContext &Context) const { + // clang-format off switch (V.SimpleTy) { default: assert(isExtended() && "Type is not extended!"); return LLVMTy; case MVT::isVoid: return Type::getVoidTy(Context); case MVT::i1: return Type::getInt1Ty(Context); + case MVT::i2: return Type::getIntNTy(Context, 2); + case MVT::i4: return Type::getIntNTy(Context, 4); case MVT::i8: return Type::getInt8Ty(Context); case MVT::i16: return Type::getInt16Ty(Context); case MVT::i32: return Type::getInt32Ty(Context); case MVT::i64: return Type::getInt64Ty(Context); case MVT::i128: return IntegerType::get(Context, 128); case MVT::f16: return Type::getHalfTy(Context); - case MVT::bf16: return Type::getBFloatTy(Context); + case MVT::bf16: return Type::getBFloatTy(Context); case MVT::f32: return Type::getFloatTy(Context); case MVT::f64: return Type::getDoubleTy(Context); case MVT::f80: return Type::getX86_FP80Ty(Context); @@ -228,6 +232,10 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return FixedVectorType::get(Type::getInt1Ty(Context), 512); case MVT::v1024i1: return FixedVectorType::get(Type::getInt1Ty(Context), 1024); + case MVT::v128i2: + return FixedVectorType::get(Type::getIntNTy(Context, 2), 128); + case MVT::v64i4: + return FixedVectorType::get(Type::getIntNTy(Context, 4), 64); case MVT::v1i8: return FixedVectorType::get(Type::getInt8Ty(Context), 1); case MVT::v2i8: @@ -500,6 +508,10 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return ScalableVectorType::get(Type::getBFloatTy(Context), 4); case MVT::nxv8bf16: return ScalableVectorType::get(Type::getBFloatTy(Context), 8); + case MVT::nxv16bf16: + return ScalableVectorType::get(Type::getBFloatTy(Context), 16); + case MVT::nxv32bf16: + return ScalableVectorType::get(Type::getBFloatTy(Context), 32); case MVT::nxv1f32: return 
ScalableVectorType::get(Type::getFloatTy(Context), 1); case MVT::nxv2f32: @@ -520,6 +532,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return ScalableVectorType::get(Type::getDoubleTy(Context), 8); case MVT::Metadata: return Type::getMetadataTy(Context); } + // clang-format on } /// Return the value type corresponding to the specified type. This returns all diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index c04a7b28eff9..aa6645227edb 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -77,8 +77,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsWebAssembly.h" @@ -212,9 +212,15 @@ bool WasmEHPrepare::prepareEHPads(Function &F) { assert(F.hasPersonalityFn() && "Personality function not found"); - // __wasm_lpad_context global variable + // __wasm_lpad_context global variable. + // This variable should be thread local. If the target does not support TLS, + // we depend on CoalesceFeaturesAndStripAtomics to downgrade it to + // non-thread-local ones, in which case we don't allow this object to be + // linked with other objects using shared memory. LPadContextGV = cast( M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy)); + LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); + LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0, "lpad_index_gep"); LSDAField = diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index d31183e46d65..b835503ee9ed 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -19,14 +19,14 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -1256,4 +1256,4 @@ void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II, LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd); } -WinEHFuncInfo::WinEHFuncInfo() {} +WinEHFuncInfo::WinEHFuncInfo() = default; diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index b56095ca9a96..50c52190c1f6 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -10,7 +10,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/NonRelocatableStringpool.h" #include "llvm/DWARFLinker/DWARFLinkerDeclContext.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" @@ -19,9 +18,11 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include 
"llvm/DebugInfo/DWARF/DWARFSection.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -132,9 +133,9 @@ static bool isTypeTag(uint16_t Tag) { return false; } -AddressesMap::~AddressesMap() {} +AddressesMap::~AddressesMap() = default; -DwarfEmitter::~DwarfEmitter() {} +DwarfEmitter::~DwarfEmitter() = default; static Optional StripTemplateParameters(StringRef Name) { // We are looking for template parameters to strip from Name. e.g. @@ -360,16 +361,16 @@ static bool analyzeContextInfo( } Info.ParentIdx = Current.ParentIdx; - bool InClangModule = CU.isClangModule() || Current.InImportedModule; - if (CU.hasODR() || InClangModule) { + Info.InModuleScope = CU.isClangModule() || Current.InImportedModule; + if (CU.hasODR() || Info.InModuleScope) { if (Current.Context) { auto PtrInvalidPair = Contexts.getChildDeclContext( - *Current.Context, Current.Die, CU, InClangModule); + *Current.Context, Current.Die, CU, Info.InModuleScope); Current.Context = PtrInvalidPair.getPointer(); Info.Ctxt = PtrInvalidPair.getInt() ? nullptr : PtrInvalidPair.getPointer(); if (Info.Ctxt) - Info.Ctxt->setDefinedInClangModule(InClangModule); + Info.Ctxt->setDefinedInClangModule(Info.InModuleScope); } else Info.Ctxt = Current.Context = nullptr; } @@ -439,8 +440,7 @@ unsigned DWARFLinker::shouldKeepVariableDIE(AddressesMap &RelocMgr, // if the variable has a valid relocation, so that the DIEInfo is filled. // However, we don't want a static variable in a function to force us to keep // the enclosing function, unless requested explicitly. - const bool HasLiveMemoryLocation = - RelocMgr.hasLiveMemoryLocation(DIE, MyInfo); + const bool HasLiveMemoryLocation = RelocMgr.isLiveVariable(DIE, MyInfo); if (!HasLiveMemoryLocation || ((Flags & TF_InFunctionScope) && !LLVM_UNLIKELY(Options.KeepFunctionForStatic))) return Flags; @@ -468,8 +468,8 @@ unsigned DWARFLinker::shouldKeepSubprogramDIE( if (!LowPc) return Flags; - assert(LowPc.hasValue() && "low_pc attribute is not an address."); - if (!RelocMgr.hasLiveAddressRange(DIE, MyInfo)) + assert(LowPc && "low_pc attribute is not an address."); + if (!RelocMgr.isLiveSubprogram(DIE, MyInfo)) return Flags; if (Options.Verbose) { @@ -490,7 +490,7 @@ unsigned DWARFLinker::shouldKeepSubprogramDIE( // generation bugs aside, this is really wrong in the case of labels, where // a label marking the end of a function will have a PC == CU's high_pc. if (dwarf::toAddress(OrigUnit.getUnitDIE().find(dwarf::DW_AT_high_pc)) - .getValueOr(UINT64_MAX) <= LowPc) + .value_or(UINT64_MAX) <= LowPc) return Flags; Unit.addLabelLowPc(*LowPc, MyInfo.AddrAdjust); return Flags | TF_Keep; @@ -616,6 +616,27 @@ void DWARFLinker::lookForChildDIEsToKeep( } } +static bool isODRCanonicalCandidate(const DWARFDie &Die, CompileUnit &CU) { + CompileUnit::DIEInfo &Info = CU.getInfo(Die); + + if (!Info.Ctxt || (Die.getTag() == dwarf::DW_TAG_namespace)) + return false; + + if (!CU.hasODR() && !Info.InModuleScope) + return false; + + return !Info.Incomplete && Info.Ctxt != CU.getInfo(Info.ParentIdx).Ctxt; +} + +void DWARFLinker::markODRCanonicalDie(const DWARFDie &Die, CompileUnit &CU) { + CompileUnit::DIEInfo &Info = CU.getInfo(Die); + + Info.ODRMarkingDone = true; + if (Info.Keep && isODRCanonicalCandidate(Die, CU) && + !Info.Ctxt->hasCanonicalDIE()) + Info.Ctxt->setHasCanonicalDIE(); +} + /// Look at DIEs referenced by the given DIE and decide whether they should be /// kept. 
All DIEs referenced though attributes should be kept. void DWARFLinker::lookForRefDIEsToKeep( @@ -645,8 +666,6 @@ void DWARFLinker::lookForRefDIEsToKeep( if (auto RefDie = resolveDIEReference(File, Units, Val, Die, ReferencedCU)) { CompileUnit::DIEInfo &Info = ReferencedCU->getInfo(RefDie); - bool IsModuleRef = Info.Ctxt && Info.Ctxt->getCanonicalDIEOffset() && - Info.Ctxt->isDefinedInClangModule(); // If the referenced DIE has a DeclContext that has already been // emitted, then do not keep the one in this CU. We'll link to // the canonical DIE in cloneDieReferenceAttribute. @@ -657,15 +676,14 @@ void DWARFLinker::lookForRefDIEsToKeep( // // FIXME: compatibility with dsymutil-classic. There is no // reason not to unique ref_addr references. - if (AttrSpec.Form != dwarf::DW_FORM_ref_addr && (UseOdr || IsModuleRef) && - Info.Ctxt && - Info.Ctxt != ReferencedCU->getInfo(Info.ParentIdx).Ctxt && - Info.Ctxt->getCanonicalDIEOffset() && isODRAttribute(AttrSpec.Attr)) + if (AttrSpec.Form != dwarf::DW_FORM_ref_addr && + isODRAttribute(AttrSpec.Attr) && Info.Ctxt && + Info.Ctxt->hasCanonicalDIE()) continue; // Keep a module forward declaration if there is no definition. if (!(isODRAttribute(AttrSpec.Attr) && Info.Ctxt && - Info.Ctxt->getCanonicalDIEOffset())) + Info.Ctxt->hasCanonicalDIE())) Info.Prune = false; ReferencedDIEs.emplace_back(RefDie, *ReferencedCU); } @@ -756,6 +774,9 @@ void DWARFLinker::lookForDIEsToKeep(AddressesMap &AddressesMap, lookForParentDIEsToKeep(Current.AncestorIdx, Current.CU, Current.Flags, Worklist); continue; + case WorklistItemType::MarkODRCanonicalDie: + markODRCanonicalDie(Current.Die, Current.CU); + continue; case WorklistItemType::LookForDIEsToKeep: break; } @@ -778,6 +799,16 @@ void DWARFLinker::lookForDIEsToKeep(AddressesMap &AddressesMap, Current.Flags = shouldKeepDIE(AddressesMap, Ranges, Current.Die, File, Current.CU, MyInfo, Current.Flags); + // We need to mark context for the canonical die in the end of normal + // traversing(not TF_DependencyWalk) or after normal traversing if die + // was not marked as kept. + if (!(Current.Flags & TF_DependencyWalk) || + (MyInfo.ODRMarkingDone && !MyInfo.Keep)) { + if (Current.CU.hasODR() || MyInfo.InModuleScope) + Worklist.emplace_back(Current.Die, Current.CU, + WorklistItemType::MarkODRCanonicalDie); + } + // Finish by looking for child DIEs. Because of the LIFO worklist we need // to schedule that work before any subsequent items are added to the // worklist. @@ -845,7 +876,7 @@ void DWARFLinker::assignAbbrev(DIEAbbrev &Abbrev) { unsigned DWARFLinker::DIECloner::cloneStringAttribute( DIE &Die, AttributeSpec AttrSpec, const DWARFFormValue &Val, - const DWARFUnit &U, OffsetsStringPool &StringPool, AttributesInfo &Info) { + const DWARFUnit &, OffsetsStringPool &StringPool, AttributesInfo &Info) { Optional String = dwarf::toString(Val); if (!String) return 0; @@ -875,7 +906,6 @@ unsigned DWARFLinker::DIECloner::cloneDieReferenceAttribute( DIE *NewRefDie = nullptr; CompileUnit *RefUnit = nullptr; - DeclContext *Ctxt = nullptr; DWARFDie RefDie = Linker.resolveDIEReference(File, CompileUnits, Val, InputDIE, RefUnit); @@ -888,14 +918,14 @@ unsigned DWARFLinker::DIECloner::cloneDieReferenceAttribute( // If we already have emitted an equivalent DeclContext, just point // at it. 
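// [editorial note, not part of the vendored patch] lookForDIEsToKeep above is
// driven by an explicit LIFO worklist rather than recursion, and the new
// MarkODRCanonicalDie item depends on that ordering: it is pushed before a
// DIE's children, so it only runs once all of them have been processed. A
// simplified sketch of the scheme; the item kinds and the childrenOf() helper
// are invented for illustration.

#include "llvm/ADT/SmallVector.h"
#include <vector>

enum class ItemKind { Visit, MarkDone };
struct Item { ItemKind Kind; int Node; };
std::vector<int> childrenOf(int Node); // assumed helper for the sketch

void drain(llvm::SmallVector<Item> &Worklist) {
  while (!Worklist.empty()) {
    Item Cur = Worklist.pop_back_val();
    if (Cur.Kind == ItemKind::MarkDone) {
      // Post-order step: every child pushed after this item has already been
      // popped and handled by the time it comes off the stack.
      continue;
    }
    // Schedule the post-order step first...
    Worklist.push_back({ItemKind::MarkDone, Cur.Node});
    // ...then the children; LIFO order pops them before the MarkDone item.
    for (int Child : childrenOf(Cur.Node))
      Worklist.push_back({ItemKind::Visit, Child});
  }
}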
- if (isODRAttribute(AttrSpec.Attr)) { - Ctxt = RefInfo.Ctxt; - if (Ctxt && Ctxt->getCanonicalDIEOffset()) { - DIEInteger Attr(Ctxt->getCanonicalDIEOffset()); - Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr), - dwarf::DW_FORM_ref_addr, Attr); - return U.getRefAddrByteSize(); - } + if (isODRAttribute(AttrSpec.Attr) && RefInfo.Ctxt && + RefInfo.Ctxt->getCanonicalDIEOffset()) { + assert(RefInfo.Ctxt->hasCanonicalDIE() && + "Offset to canonical die is set, but context is not marked"); + DIEInteger Attr(RefInfo.Ctxt->getCanonicalDIEOffset()); + Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr), + dwarf::DW_FORM_ref_addr, Attr); + return U.getRefAddrByteSize(); } if (!RefInfo.Clone) { @@ -925,7 +955,7 @@ unsigned DWARFLinker::DIECloner::cloneDieReferenceAttribute( // A forward reference. Note and fixup later. Attr = 0xBADDEF; Unit.noteForwardReference( - NewRefDie, RefUnit, Ctxt, + NewRefDie, RefUnit, RefInfo.Ctxt, Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr), dwarf::DW_FORM_ref_addr, DIEInteger(Attr))); } @@ -1356,10 +1386,10 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, assert(Die->getTag() == InputDIE.getTag()); Die->setOffset(OutOffset); - if ((Unit.hasODR() || Unit.isClangModule()) && !Info.Incomplete && - Die->getTag() != dwarf::DW_TAG_namespace && Info.Ctxt && - Info.Ctxt != Unit.getInfo(Info.ParentIdx).Ctxt && - !Info.Ctxt->getCanonicalDIEOffset()) { + if (isODRCanonicalCandidate(InputDIE, Unit) && Info.Ctxt && + (Info.Ctxt->getCanonicalDIEOffset() == 0)) { + if (!Info.Ctxt->hasCanonicalDIE()) + Info.Ctxt->setHasCanonicalDIE(); // We are about to emit a DIE that is the root of its own valid // DeclContext tree. Make the current offset the canonical offset // for this context. @@ -1384,8 +1414,7 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, DWARFDataExtractor(DIECopy, Data.isLittleEndian(), Data.getAddressSize()); // Modify the copy with relocated addresses. - if (ObjFile.Addresses->areRelocationsResolved() && - ObjFile.Addresses->applyValidRelocs(DIECopy, Offset, + if (ObjFile.Addresses->applyValidRelocs(DIECopy, Offset, Data.isLittleEndian())) { // If we applied relocations, we store the value of high_pc that was // potentially stored in the input DIE. If high_pc is an address @@ -1481,12 +1510,12 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, uint32_t Hash = hashFullyQualifiedName(InputDIE, Unit, File); uint64_t RuntimeLang = dwarf::toUnsigned(InputDIE.find(dwarf::DW_AT_APPLE_runtime_class)) - .getValueOr(0); + .value_or(0); bool ObjCClassIsImplementation = (RuntimeLang == dwarf::DW_LANG_ObjC || RuntimeLang == dwarf::DW_LANG_ObjC_plus_plus) && dwarf::toUnsigned(InputDIE.find(dwarf::DW_AT_APPLE_objc_complete_type)) - .getValueOr(0); + .value_or(0); Unit.addTypeAccelerator(Die, AttrInfo.Name, ObjCClassIsImplementation, Hash); } @@ -1788,16 +1817,19 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit, void DWARFLinker::emitAcceleratorEntriesForUnit(CompileUnit &Unit) { switch (Options.TheAccelTableKind) { - case AccelTableKind::Apple: + case DwarfLinkerAccelTableKind::None: + // Nothing to do. 
+ break; + case DwarfLinkerAccelTableKind::Apple: emitAppleAcceleratorEntriesForUnit(Unit); break; - case AccelTableKind::Dwarf: + case DwarfLinkerAccelTableKind::Dwarf: emitDwarfAcceleratorEntriesForUnit(Unit); break; - case AccelTableKind::Pub: + case DwarfLinkerAccelTableKind::Pub: emitPubAcceleratorEntriesForUnit(Unit); break; - case AccelTableKind::Default: + case DwarfLinkerAccelTableKind::Default: llvm_unreachable("The default must be updated to a concrete value."); break; } @@ -2216,7 +2248,7 @@ uint64_t DWARFLinker::DIECloner::cloneAllCompileUnits( } void DWARFLinker::updateAccelKind(DWARFContext &Dwarf) { - if (Options.TheAccelTableKind != AccelTableKind::Default) + if (Options.TheAccelTableKind != DwarfLinkerAccelTableKind::Default) return; auto &DwarfObj = Dwarf.getDWARFObj(); @@ -2342,11 +2374,11 @@ bool DWARFLinker::link() { // would affect the decision. However, as they're built with the same // compiler and flags, it is safe to assume that they will follow the // decision made here. - if (Options.TheAccelTableKind == AccelTableKind::Default) { + if (Options.TheAccelTableKind == DwarfLinkerAccelTableKind::Default) { if (AtLeastOneDwarfAccelTable && !AtLeastOneAppleAccelTable) - Options.TheAccelTableKind = AccelTableKind::Dwarf; + Options.TheAccelTableKind = DwarfLinkerAccelTableKind::Dwarf; else - Options.TheAccelTableKind = AccelTableKind::Apple; + Options.TheAccelTableKind = DwarfLinkerAccelTableKind::Apple; } for (LinkContext &OptContext : ObjectContexts) { @@ -2362,6 +2394,10 @@ bool DWARFLinker::link() { if (!OptContext.File.Dwarf) continue; + + if (Options.VerifyInputDWARF) + verify(OptContext.File); + // Look for relocations that correspond to address map entries. // there was findvalidrelocations previously ... probably we need to gather @@ -2521,19 +2557,22 @@ bool DWARFLinker::link() { TheDwarfEmitter->emitAbbrevs(Abbreviations, MaxDwarfVersion); TheDwarfEmitter->emitStrings(OffsetsStringPool); switch (Options.TheAccelTableKind) { - case AccelTableKind::Apple: + case DwarfLinkerAccelTableKind::None: + // Nothing to do. + break; + case DwarfLinkerAccelTableKind::Apple: TheDwarfEmitter->emitAppleNames(AppleNames); TheDwarfEmitter->emitAppleNamespaces(AppleNamespaces); TheDwarfEmitter->emitAppleTypes(AppleTypes); TheDwarfEmitter->emitAppleObjc(AppleObjc); break; - case AccelTableKind::Dwarf: + case DwarfLinkerAccelTableKind::Dwarf: TheDwarfEmitter->emitDebugNames(DebugNames); break; - case AccelTableKind::Pub: + case DwarfLinkerAccelTableKind::Pub: // Already emitted by emitPubAcceleratorEntriesForUnit. 
break; - case AccelTableKind::Default: + case DwarfLinkerAccelTableKind::Default: llvm_unreachable("Default should have already been resolved."); break; } @@ -2631,4 +2670,15 @@ bool DWARFLinker::link() { return true; } +bool DWARFLinker::verify(const DWARFFile &File) { + assert(File.Dwarf); + + DIDumpOptions DumpOpts; + if (!File.Dwarf->verify(llvm::outs(), DumpOpts.noImplicitRecursion())) { + reportWarning("input verification failed", File); + return false; + } + return true; +} + } // namespace llvm diff --git a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp index acecb1788d10..e9e8be7fd008 100644 --- a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp @@ -90,9 +90,11 @@ void CompileUnit::fixupForwardReferences() { PatchLocation Attr; DeclContext *Ctxt; std::tie(RefDie, RefUnit, Ctxt, Attr) = Ref; - if (Ctxt && Ctxt->getCanonicalDIEOffset()) + if (Ctxt && Ctxt->hasCanonicalDIE()) { + assert(Ctxt->getCanonicalDIEOffset() && + "Canonical die offset is not set"); Attr.set(Ctxt->getCanonicalDIEOffset()); - else + } else Attr.set(RefDie->getOffset() + RefUnit->getStartOffset()); } } diff --git a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp index 5ab2ad0780a2..dfdfc5857569 100644 --- a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DWARFLinker/DWARFLinkerDeclContext.h" +#include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index 99e12fce6513..55ff6b14f945 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -18,7 +18,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCTargetOptionsCommandFlags.h" #include "llvm/MC/TargetRegistry.h" @@ -68,7 +67,7 @@ bool DwarfStreamer::init(Triple TheTriple, if (!MII) return error("no instr info info for target " + TripleName, Context), false; - MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *MC); + MCE = TheTarget->createMCCodeEmitter(*MII, *MC); if (!MCE) return error("no code emitter for target " + TripleName, Context), false; @@ -114,10 +113,10 @@ bool DwarfStreamer::init(Triple TheTriple, return true; } -void DwarfStreamer::finish() { MS->Finish(); } +void DwarfStreamer::finish() { MS->finish(); } void DwarfStreamer::switchToDebugInfoSection(unsigned DwarfVersion) { - MS->SwitchSection(MOFI->getDwarfInfoSection()); + MS->switchSection(MOFI->getDwarfInfoSection()); MC->setDwarfVersion(DwarfVersion); } @@ -175,14 +174,14 @@ void DwarfStreamer::emitCompileUnitHeader(CompileUnit &Unit, void DwarfStreamer::emitAbbrevs( const std::vector> &Abbrevs, unsigned DwarfVersion) { - MS->SwitchSection(MOFI->getDwarfAbbrevSection()); + MS->switchSection(MOFI->getDwarfAbbrevSection()); MC->setDwarfVersion(DwarfVersion); Asm->emitDwarfAbbrevs(Abbrevs); } /// Recursively emit the DIE tree rooted at \p Die. 
void DwarfStreamer::emitDIE(DIE &Die) { - MS->SwitchSection(MOFI->getDwarfInfoSection()); + MS->switchSection(MOFI->getDwarfInfoSection()); Asm->emitDwarfDIE(Die); DebugInfoSectionSize += Die.getSize(); } @@ -201,7 +200,7 @@ void DwarfStreamer::emitSectionContents(StringRef SecData, StringRef SecName) { .Default(nullptr); if (Section) { - MS->SwitchSection(Section); + MS->switchSection(Section); MS->emitBytes(SecData); } @@ -221,7 +220,7 @@ void DwarfStreamer::emitPaperTrailWarningsDie(DIE &Die) { /// Emit the debug_str section stored in \p Pool. void DwarfStreamer::emitStrings(const NonRelocatableStringpool &Pool) { - Asm->OutStreamer->SwitchSection(MOFI->getDwarfStrSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfStrSection()); std::vector Entries = Pool.getEntriesForEmission(); for (auto Entry : Entries) { // Emit the string itself. @@ -233,7 +232,7 @@ void DwarfStreamer::emitStrings(const NonRelocatableStringpool &Pool) { #if 0 if (DwarfVersion >= 5) { // Emit an empty string offset section. - Asm->OutStreamer->SwitchSection(MOFI->getDwarfStrOffSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfStrOffSection()); Asm->emitDwarfUnitLength(4, "Length of String Offsets Set"); Asm->emitInt16(DwarfVersion); Asm->emitInt16(0); @@ -256,7 +255,7 @@ void DwarfStreamer::emitDebugNames( UniqueIdToCuMap[CU.ID] = Id++; } - Asm->OutStreamer->SwitchSection(MOFI->getDwarfDebugNamesSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfDebugNamesSection()); emitDWARF5AccelTable( Asm.get(), Table, CompUnits, [&UniqueIdToCuMap](const DWARF5AccelTableStaticData &Entry) { @@ -266,7 +265,7 @@ void DwarfStreamer::emitDebugNames( void DwarfStreamer::emitAppleNamespaces( AccelTable &Table) { - Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelNamespaceSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfAccelNamespaceSection()); auto *SectionBegin = Asm->createTempSymbol("namespac_begin"); Asm->OutStreamer->emitLabel(SectionBegin); emitAppleAccelTable(Asm.get(), Table, "namespac", SectionBegin); @@ -274,7 +273,7 @@ void DwarfStreamer::emitAppleNamespaces( void DwarfStreamer::emitAppleNames( AccelTable &Table) { - Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelNamesSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfAccelNamesSection()); auto *SectionBegin = Asm->createTempSymbol("names_begin"); Asm->OutStreamer->emitLabel(SectionBegin); emitAppleAccelTable(Asm.get(), Table, "names", SectionBegin); @@ -282,7 +281,7 @@ void DwarfStreamer::emitAppleNames( void DwarfStreamer::emitAppleObjc( AccelTable &Table) { - Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelObjCSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfAccelObjCSection()); auto *SectionBegin = Asm->createTempSymbol("objc_begin"); Asm->OutStreamer->emitLabel(SectionBegin); emitAppleAccelTable(Asm.get(), Table, "objc", SectionBegin); @@ -290,7 +289,7 @@ void DwarfStreamer::emitAppleObjc( void DwarfStreamer::emitAppleTypes( AccelTable &Table) { - Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelTypesSection()); + Asm->OutStreamer->switchSection(MOFI->getDwarfAccelTypesSection()); auto *SectionBegin = Asm->createTempSymbol("types_begin"); Asm->OutStreamer->emitLabel(SectionBegin); emitAppleAccelTable(Asm.get(), Table, "types", SectionBegin); @@ -300,7 +299,7 @@ void DwarfStreamer::emitAppleTypes( void DwarfStreamer::emitSwiftAST(StringRef Buffer) { MCSection *SwiftASTSection = MOFI->getDwarfSwiftASTSection(); SwiftASTSection->setAlignment(Align(32)); - MS->SwitchSection(SwiftASTSection); + 
MS->switchSection(SwiftASTSection); MS->emitBytes(Buffer); } @@ -312,7 +311,7 @@ void DwarfStreamer::emitSwiftReflectionSection( if (ReflectionSection == nullptr) return; ReflectionSection->setAlignment(Align(Alignment)); - MS->SwitchSection(ReflectionSection); + MS->switchSection(ReflectionSection); MS->emitBytes(Buffer); } @@ -325,7 +324,7 @@ void DwarfStreamer::emitRangesEntries( const FunctionIntervals::const_iterator &FuncRange, const std::vector &Entries, unsigned AddressSize) { - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfRangesSection()); // Offset each range by the right amount. int64_t PcOffset = Entries.empty() ? 0 : FuncRange.value() + UnitPcOffset; @@ -377,7 +376,7 @@ void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit, llvm::sort(Ranges); if (!Ranges.empty()) { - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfARangesSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfARangesSection()); MCSymbol *BeginLabel = Asm->createTempSymbol("Barange"); MCSymbol *EndLabel = Asm->createTempSymbol("Earange"); @@ -419,7 +418,7 @@ void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit, if (!DoDebugRanges) return; - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfRangesSection()); // Offset each range by the right amount. int64_t PcOffset = -Unit.getLowPc(); // Emit coalesced ranges. @@ -447,7 +446,7 @@ void DwarfStreamer::emitLocationsForUnit( if (Attributes.empty()) return; - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLocSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfLocSection()); unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize(); uint64_t BaseAddressMarker = (AddressSize == 8) @@ -509,7 +508,7 @@ void DwarfStreamer::emitLineTableForUnit(MCDwarfLineTableParams Params, std::vector &Rows, unsigned PointerSize) { // Switch to the section where the table will be emitted into. - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLineSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfLineSection()); MCSymbol *LineStartSym = MC->createTempSymbol(); MCSymbol *LineEndSym = MC->createTempSymbol(); @@ -650,7 +649,7 @@ void DwarfStreamer::emitLineTableForUnit(MCDwarfLineTableParams Params, /// Copy the debug_line over to the updated binary while unobfuscating the file /// names and directories. void DwarfStreamer::translateLineTable(DataExtractor Data, uint64_t Offset) { - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLineSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfLineSection()); StringRef Contents = Data.getData(); // We have to deconstruct the line table header, because it contains to @@ -738,7 +737,7 @@ void DwarfStreamer::emitPubSectionForUnit( return; // Start the dwarf pubnames section. - Asm->OutStreamer->SwitchSection(Sec); + Asm->OutStreamer->switchSection(Sec); MCSymbol *BeginLabel = Asm->createTempSymbol("pub" + SecName + "_begin"); MCSymbol *EndLabel = Asm->createTempSymbol("pub" + SecName + "_end"); @@ -785,7 +784,7 @@ void DwarfStreamer::emitPubTypesForUnit(const CompileUnit &Unit) { /// Emit a CIE into the debug_frame section. 
void DwarfStreamer::emitCIE(StringRef CIEBytes) { - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfFrameSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfFrameSection()); MS->emitBytes(CIEBytes); FrameSectionSize += CIEBytes.size(); @@ -796,7 +795,7 @@ void DwarfStreamer::emitCIE(StringRef CIEBytes) { /// which will be replaced with the parameter values. void DwarfStreamer::emitFDE(uint32_t CIEOffset, uint32_t AddrSize, uint32_t Address, StringRef FDEBytes) { - MS->SwitchSection(MC->getObjectFileInfo()->getDwarfFrameSection()); + MS->switchSection(MC->getObjectFileInfo()->getDwarfFrameSection()); MS->emitIntValue(FDEBytes.size() + 4 + AddrSize, 4); MS->emitIntValue(CIEOffset, 4); diff --git a/llvm/lib/DWP/DWP.cpp b/llvm/lib/DWP/DWP.cpp index f6538c0549d0..34615a73e328 100644 --- a/llvm/lib/DWP/DWP.cpp +++ b/llvm/lib/DWP/DWP.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCTargetOptionsCommandFlags.h" #include "llvm/Object/Decompressor.h" +#include "llvm/Support/MemoryBuffer.h" using namespace llvm; using namespace llvm::object; @@ -181,7 +182,7 @@ addAllTypesFromDWP(MCStreamer &Out, const DWARFUnitIndex &TUIndex, MCSection *OutputTypes, StringRef Types, const UnitIndexEntry &TUEntry, uint32_t &TypesOffset, unsigned TypesContributionIndex) { - Out.SwitchSection(OutputTypes); + Out.switchSection(OutputTypes); for (const DWARFUnitIndex::Entry &E : TUIndex.getRows()) { auto *I = E.getContributions(); if (!I) @@ -215,7 +216,7 @@ static void addAllTypesFromTypesSection( MCSection *OutputTypes, const std::vector &TypesSections, const UnitIndexEntry &CUEntry, uint32_t &TypesOffset) { for (StringRef Types : TypesSections) { - Out.SwitchSection(OutputTypes); + Out.switchSection(OutputTypes); uint64_t Offset = 0; DataExtractor Data(Types, true, 0); while (Data.isValidOffset(Offset)) { @@ -373,7 +374,7 @@ void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings, Data = DataExtractor(CurStrOffsetSection, true, 0); - Out.SwitchSection(StrOffsetSection); + Out.switchSection(StrOffsetSection); uint64_t HeaderSize = debugStrOffsetsHeaderSize(Data, Version); uint64_t Offset = 0; @@ -427,7 +428,7 @@ void writeIndex(MCStreamer &Out, MCSection *Section, ++I; } - Out.SwitchSection(Section); + Out.switchSection(Section); Out.emitIntValue(IndexVersion, 4); // Version Out.emitIntValue(Columns, 4); // Columns Out.emitIntValue(IndexEntries.size(), 4); // Num Units @@ -526,7 +527,7 @@ Error handleSection( else if (OutSection == InfoSection) CurInfoSection.push_back(Contents); else { - Out.SwitchSection(OutSection); + Out.switchSection(OutSection); Out.emitBytes(Contents); } return Error::success(); @@ -633,7 +634,7 @@ Error write(MCStreamer &Out, ArrayRef Inputs) { ContributionOffsets[getContributionIndex(DW_SECT_INFO, IndexVersion)]; if (CurCUIndexSection.empty()) { bool FoundCUUnit = false; - Out.SwitchSection(InfoSection); + Out.switchSection(InfoSection); for (StringRef Info : CurInfoSection) { uint64_t UnitOffset = 0; while (Info.size() > UnitOffset) { @@ -668,7 +669,7 @@ Error write(MCStreamer &Out, ArrayRef Inputs) { FoundCUUnit = true; } else if (Header.UnitType == dwarf::DW_UT_split_type) { auto P = TypeIndexEntries.insert( - std::make_pair(Header.Signature.getValue(), Entry)); + std::make_pair(*Header.Signature, Entry)); if (!P.second) continue; } @@ -703,7 +704,7 @@ Error write(MCStreamer &Out, ArrayRef Inputs) { utostr(CUIndex.getVersion()) + " and expecting " + utostr(IndexVersion)); - Out.SwitchSection(InfoSection); + 
Out.switchSection(InfoSection); for (const DWARFUnitIndex::Entry &E : CUIndex.getRows()) { auto *I = E.getContributions(); if (!I) diff --git a/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp index 4d8b15530b9e..3ab7f722eaee 100644 --- a/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp +++ b/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp @@ -8,18 +8,11 @@ #include "llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h" -#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" -#include +#include "llvm/Support/ErrorHandling.h" #include #include #include diff --git a/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp index 48b9b0496ffe..2154aa2b8d00 100644 --- a/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp +++ b/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp @@ -8,8 +8,12 @@ #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" -#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h" +#include "llvm/Support/BinaryStreamArray.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; using namespace llvm::codeview; @@ -80,3 +84,72 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols, } return Error::success(); } + +Error CVSymbolVisitor::visitSymbolStreamFiltered(const CVSymbolArray &Symbols, + const FilterOptions &Filter) { + if (!Filter.SymbolOffset) + return visitSymbolStream(Symbols); + uint32_t SymbolOffset = *Filter.SymbolOffset; + uint32_t ParentRecurseDepth = Filter.ParentRecursiveDepth.value_or(0); + uint32_t ChildrenRecurseDepth = Filter.ChildRecursiveDepth.value_or(0); + if (!Symbols.isOffsetValid(SymbolOffset)) + return createStringError(inconvertibleErrorCode(), "Invalid symbol offset"); + CVSymbol Sym = *Symbols.at(SymbolOffset); + uint32_t SymEndOffset = + symbolOpensScope(Sym.kind()) ? getScopeEndOffset(Sym) : 0; + + std::vector ParentOffsets; + std::vector ParentEndOffsets; + uint32_t ChildrenDepth = 0; + for (auto Begin = Symbols.begin(), End = Symbols.end(); Begin != End; + ++Begin) { + uint32_t BeginOffset = Begin.offset(); + CVSymbol BeginSym = *Begin; + if (BeginOffset < SymbolOffset) { + if (symbolOpensScope(Begin->kind())) { + uint32_t EndOffset = getScopeEndOffset(BeginSym); + if (SymbolOffset < EndOffset) { + ParentOffsets.push_back(BeginOffset); + ParentEndOffsets.push_back(EndOffset); + } + } + } else if (BeginOffset == SymbolOffset) { + // Found symbol at offset. Visit its parent up to ParentRecurseDepth. 
+ if (ParentRecurseDepth >= ParentOffsets.size()) + ParentRecurseDepth = ParentOffsets.size(); + uint32_t StartIndex = ParentOffsets.size() - ParentRecurseDepth; + while (StartIndex < ParentOffsets.size()) { + if (!Symbols.isOffsetValid(ParentOffsets[StartIndex])) + break; + CVSymbol Parent = *Symbols.at(ParentOffsets[StartIndex]); + if (auto EC = visitSymbolRecord(Parent, ParentOffsets[StartIndex])) + return EC; + ++StartIndex; + } + if (auto EC = visitSymbolRecord(Sym, SymbolOffset)) + return EC; + } else if (BeginOffset <= SymEndOffset) { + if (ChildrenRecurseDepth) { + // Visit children. + if (symbolEndsScope(Begin->kind())) + --ChildrenDepth; + if (ChildrenDepth < ChildrenRecurseDepth || + BeginOffset == SymEndOffset) { + if (auto EC = visitSymbolRecord(BeginSym, BeginOffset)) + return EC; + } + if (symbolOpensScope(Begin->kind())) + ++ChildrenDepth; + } + } else { + // Visit parents' ends. + if (ParentRecurseDepth && BeginOffset == ParentEndOffsets.back()) { + if (auto EC = visitSymbolRecord(BeginSym, BeginOffset)) + return EC; + ParentEndOffsets.pop_back(); + --ParentRecurseDepth; + } + } + } + return Error::success(); +} diff --git a/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index dd6f75f97a4a..5da300f710d5 100644 --- a/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -8,11 +8,12 @@ #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" -#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" diff --git a/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 1af59ff679dd..a66f9af98835 100644 --- a/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -8,7 +8,9 @@ #include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" @@ -68,10 +70,10 @@ uint32_t CodeViewRecordIO::maxFieldLength() const { Optional Min = Limits.front().bytesRemaining(Offset); for (auto X : makeArrayRef(Limits).drop_front()) { Optional ThisMin = X.bytesRemaining(Offset); - if (ThisMin.hasValue()) - Min = (Min.hasValue()) ? std::min(*Min, *ThisMin) : *ThisMin; + if (ThisMin) + Min = Min ? std::min(*Min, *ThisMin) : *ThisMin; } - assert(Min.hasValue() && "Every field must have a maximum length!"); + assert(Min && "Every field must have a maximum length!"); return *Min; } @@ -279,17 +281,24 @@ void CodeViewRecordIO::emitEncodedSignedInteger(const int64_t &Value, // FIXME: There are no test cases covering this function. // This may be because we always consider enumerators to be unsigned. // See FIXME at CodeViewDebug.cpp : CodeViewDebug::lowerTypeEnum. 
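// [editorial note, not part of the vendored patch] The CodeViewRecordIO hunks
// that follow extend the CodeView "numeric leaf" encoders: a signed value in
// [0, LF_NUMERIC) is stored directly in the two-byte leaf field, anything else
// gets an LF_CHAR/LF_SHORT/LF_LONG/LF_QUADWORD marker followed by a payload of
// the matching width, and each branch now also checks the upper bound of its
// range. A rough standalone sketch of the wire format, assuming the usual
// cvinfo.h constant values; this is not the patch's own code.

#include "llvm/ADT/SmallVector.h"
#include <cstdint>

void encodeSignedLeaf(int64_t V, llvm::SmallVectorImpl<uint8_t> &Out) {
  auto put = [&Out](uint64_t X, unsigned Bytes) {
    for (unsigned I = 0; I < Bytes; ++I) // little-endian payload
      Out.push_back(uint8_t(X >> (8 * I)));
  };
  if (V >= 0 && V < 0x8000 /* LF_NUMERIC */) {
    put(uint64_t(V), 2); // small non-negative values are stored inline
  } else if (V >= INT8_MIN && V <= INT8_MAX) {
    put(0x8000 /* LF_CHAR */, 2);
    put(uint64_t(V), 1);
  } else if (V >= INT16_MIN && V <= INT16_MAX) {
    put(0x8001 /* LF_SHORT */, 2);
    put(uint64_t(V), 2);
  } else if (V >= INT32_MIN && V <= INT32_MAX) {
    put(0x8003 /* LF_LONG */, 2);
    put(uint64_t(V), 4);
  } else {
    put(0x8009 /* LF_QUADWORD */, 2);
    put(uint64_t(V), 8);
  }
}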
- if (Value >= std::numeric_limits::min()) { + if (Value < LF_NUMERIC && Value >= 0) { + emitComment(Comment); + Streamer->emitIntValue(Value, 2); + incrStreamedLen(2); + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { Streamer->emitIntValue(LF_CHAR, 2); emitComment(Comment); Streamer->emitIntValue(Value, 1); incrStreamedLen(3); - } else if (Value >= std::numeric_limits::min()) { + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { Streamer->emitIntValue(LF_SHORT, 2); emitComment(Comment); Streamer->emitIntValue(Value, 2); incrStreamedLen(4); - } else if (Value >= std::numeric_limits::min()) { + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { Streamer->emitIntValue(LF_LONG, 2); emitComment(Comment); Streamer->emitIntValue(Value, 4); @@ -328,17 +337,23 @@ void CodeViewRecordIO::emitEncodedUnsignedInteger(const uint64_t &Value, } Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) { - if (Value >= std::numeric_limits::min()) { + if (Value < LF_NUMERIC && Value >= 0) { + if (auto EC = Writer->writeInteger(Value)) + return EC; + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { if (auto EC = Writer->writeInteger(LF_CHAR)) return EC; if (auto EC = Writer->writeInteger(Value)) return EC; - } else if (Value >= std::numeric_limits::min()) { + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { if (auto EC = Writer->writeInteger(LF_SHORT)) return EC; if (auto EC = Writer->writeInteger(Value)) return EC; - } else if (Value >= std::numeric_limits::min()) { + } else if (Value >= std::numeric_limits::min() && + Value <= std::numeric_limits::max()) { if (auto EC = Writer->writeInteger(LF_LONG)) return EC; if (auto EC = Writer->writeInteger(Value)) diff --git a/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp b/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp index c7b1c65f2f9a..a3dbb3954d5c 100644 --- a/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp +++ b/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp @@ -46,10 +46,10 @@ static inline TypeLeafKind getTypeLeafKind(ContinuationRecordKind CK) { ContinuationRecordBuilder::ContinuationRecordBuilder() : SegmentWriter(Buffer), Mapping(SegmentWriter) {} -ContinuationRecordBuilder::~ContinuationRecordBuilder() {} +ContinuationRecordBuilder::~ContinuationRecordBuilder() = default; void ContinuationRecordBuilder::begin(ContinuationRecordKind RecordKind) { - assert(!Kind.hasValue()); + assert(!Kind); Kind = RecordKind; Buffer.clear(); SegmentWriter.setOffset(0); @@ -76,7 +76,7 @@ void ContinuationRecordBuilder::begin(ContinuationRecordKind RecordKind) { template void ContinuationRecordBuilder::writeMemberType(RecordType &Record) { - assert(Kind.hasValue()); + assert(Kind); uint32_t OriginalOffset = SegmentWriter.getOffset(); CVMemberRecord CVMR; @@ -158,7 +158,7 @@ CVType ContinuationRecordBuilder::createSegmentRecord( RecordPrefix *Prefix = reinterpret_cast(Data.data()); Prefix->RecordLen = Data.size() - sizeof(RecordPrefix::RecordLen); - if (RefersTo.hasValue()) { + if (RefersTo) { auto Continuation = Data.take_back(ContinuationLength); ContinuationRecord *CR = reinterpret_cast(Continuation.data()); diff --git a/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp index b23410409f88..b48f57955db1 100644 --- 
a/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp @@ -8,6 +8,7 @@ #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Error.h" #include diff --git a/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp index 9bc69abea102..c083c61d1595 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp @@ -8,6 +8,8 @@ #include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp index 48ec7e4ecdd6..665511c592f9 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp index 3f93463fe6d6..01581181dfe0 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp @@ -10,6 +10,6 @@ using namespace llvm::codeview; -DebugSubsectionRef::~DebugSubsectionRef() {} +DebugSubsectionRef::~DebugSubsectionRef() = default; -DebugSubsection::~DebugSubsection() {} +DebugSubsection::~DebugSubsection() = default; diff --git a/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp index 3c8a30101450..adc6cabd7da1 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp @@ -13,7 +13,6 @@ #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Error.h" #include "llvm/Support/MathExtras.h" -#include #include #include diff --git a/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp b/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp index 7968b6a2d757..50f6fb93dec1 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp @@ -8,6 +8,7 @@ #include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" #include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h" @@ -20,7 +21,7 @@ #include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugUnknownSubsection.h" #include "llvm/Support/BinaryStreamReader.h" -#include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/SwapByteOrder.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp 
b/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp index c833103663e4..2b20b3e95db6 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h" +#include "llvm/Support/BinaryStreamWriter.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/CodeView/Formatters.cpp b/llvm/lib/DebugInfo/CodeView/Formatters.cpp index f1f51bcb39cc..73a589212227 100644 --- a/llvm/lib/DebugInfo/CodeView/Formatters.cpp +++ b/llvm/lib/DebugInfo/CodeView/Formatters.cpp @@ -9,8 +9,10 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/GUID.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace llvm; diff --git a/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp index 7cd9ca7498f5..142af382efba 100644 --- a/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp +++ b/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp @@ -8,18 +8,12 @@ #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/None.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h" -#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" -#include +#include "llvm/Support/ErrorHandling.h" #include #include #include diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index c0fc3e0ef65a..1d49a1ed4712 100644 --- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -9,11 +9,12 @@ #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/RecordName.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" diff --git a/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp index 13ce3ae82c26..62d228599eae 100644 --- a/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp +++ b/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp @@ -8,18 +8,13 @@ #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/None.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h" -#include 
"llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeHashing.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" -#include +#include "llvm/Support/ErrorHandling.h" #include #include #include diff --git a/llvm/lib/DebugInfo/CodeView/RecordName.cpp b/llvm/lib/DebugInfo/CodeView/RecordName.cpp index 1ca899789bef..5fbbc4a5d497 100644 --- a/llvm/lib/DebugInfo/CodeView/RecordName.cpp +++ b/llvm/lib/DebugInfo/CodeView/RecordName.cpp @@ -10,9 +10,13 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolRecordMapping.h" +#include "llvm/DebugInfo/CodeView/TypeCollection.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/FormatVariadic.h" diff --git a/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp b/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp index 63ce302a4e09..d76905df8681 100644 --- a/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp +++ b/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp @@ -13,9 +13,9 @@ #include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/Support/BinaryByteStream.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp index d963e34628db..cf0c877fdbf8 100644 --- a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp +++ b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" #include "llvm/Support/BinaryStreamWriter.h" @@ -29,7 +30,7 @@ static void addPadding(BinaryStreamWriter &Writer) { SimpleTypeSerializer::SimpleTypeSerializer() : ScratchBuffer(MaxRecordLength) {} -SimpleTypeSerializer::~SimpleTypeSerializer() {} +SimpleTypeSerializer::~SimpleTypeSerializer() = default; template ArrayRef SimpleTypeSerializer::serialize(T &Record) { diff --git a/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp b/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp index 9e204eec8604..81aa44fb2086 100644 --- a/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp +++ b/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include 
"llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" diff --git a/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp b/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp index 45b63983beb4..cfb12dbae845 100644 --- a/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/SymbolDumper.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" @@ -20,8 +20,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" -#include - using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp b/llvm/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp index 2562c633bb99..d8b350bf26ba 100644 --- a/llvm/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp +++ b/llvm/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp @@ -8,7 +8,7 @@ #include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp index de9bb42b1798..5fb8d497b957 100644 --- a/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp +++ b/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp @@ -8,9 +8,9 @@ #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include #include #include @@ -24,7 +24,7 @@ SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator, Mapping(Writer, Container) {} Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) { - assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!"); + assert(!CurrentSymbol && "Already in a symbol mapping!"); Writer.setOffset(0); @@ -39,7 +39,7 @@ Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) { } Error SymbolSerializer::visitSymbolEnd(CVSymbol &Record) { - assert(CurrentSymbol.hasValue() && "Not in a symbol mapping!"); + assert(CurrentSymbol && "Not in a symbol mapping!"); if (auto EC = Mapping.visitSymbolEnd(Record)) return EC; diff --git a/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index d5fea5ee5e29..5d27c9f29984 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -8,14 +8,15 @@ #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp b/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp index 2dbc11a84f0b..fc85d8186eaa 100644 --- 
a/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp @@ -76,5 +76,6 @@ GloballyHashedType::hashType(ArrayRef RecordData, auto TrailingBytes = RecordData.drop_front(Off); S.update(TrailingBytes); - return {S.final().take_back(8)}; + std::array Hash = S.final(); + return {ArrayRef(Hash).take_back(8)}; } diff --git a/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp b/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp index 604d342448d3..3aead9d50041 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp @@ -33,6 +33,7 @@ static const SimpleTypeEntry SimpleTypeNames[] = { {"wchar_t*", SimpleTypeKind::WideCharacter}, {"char16_t*", SimpleTypeKind::Character16}, {"char32_t*", SimpleTypeKind::Character32}, + {"char8_t*", SimpleTypeKind::Character8}, {"__int8*", SimpleTypeKind::SByte}, {"unsigned __int8*", SimpleTypeKind::Byte}, {"short*", SimpleTypeKind::Int16Short}, diff --git a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp index d272999bdab8..27f63b9edcd0 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp @@ -7,10 +7,28 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/Twine.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MD5.h" +#include "llvm/Support/ScopedPrinter.h" + +#include +#include +#include +#include +#include +#include using namespace llvm; using namespace llvm::codeview; @@ -210,8 +228,8 @@ static Error mapNameAndUniqueName(CodeViewRecordIO &IO, StringRef &Name, } Error TypeRecordMapping::visitTypeBegin(CVType &CVR) { - assert(!TypeKind.hasValue() && "Already in a type mapping!"); - assert(!MemberKind.hasValue() && "Already in a member mapping!"); + assert(!TypeKind && "Already in a type mapping!"); + assert(!MemberKind && "Already in a member mapping!"); // FieldList and MethodList records can be any length because they can be // split with continuation records. 
All other record types cannot be @@ -242,8 +260,8 @@ Error TypeRecordMapping::visitTypeBegin(CVType &CVR, TypeIndex Index) { } Error TypeRecordMapping::visitTypeEnd(CVType &Record) { - assert(TypeKind.hasValue() && "Not in a type mapping!"); - assert(!MemberKind.hasValue() && "Still in a member mapping!"); + assert(TypeKind && "Not in a type mapping!"); + assert(!MemberKind && "Still in a member mapping!"); error(IO.endRecord()); @@ -252,8 +270,8 @@ Error TypeRecordMapping::visitTypeEnd(CVType &Record) { } Error TypeRecordMapping::visitMemberBegin(CVMemberRecord &Record) { - assert(TypeKind.hasValue() && "Not in a type mapping!"); - assert(!MemberKind.hasValue() && "Already in a member mapping!"); + assert(TypeKind && "Not in a type mapping!"); + assert(!MemberKind && "Already in a member mapping!"); // The largest possible subrecord is one in which there is a record prefix, // followed by the subrecord, followed by a continuation, and that entire @@ -278,8 +296,8 @@ Error TypeRecordMapping::visitMemberBegin(CVMemberRecord &Record) { } Error TypeRecordMapping::visitMemberEnd(CVMemberRecord &Record) { - assert(TypeKind.hasValue() && "Not in a type mapping!"); - assert(MemberKind.hasValue() && "Not in a member mapping!"); + assert(TypeKind && "Not in a type mapping!"); + assert(MemberKind && "Not in a member mapping!"); if (IO.isReading()) { if (auto EC = IO.skipPadding()) diff --git a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 587a68142a4a..7ddfb7ab2f8d 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" @@ -487,7 +487,7 @@ Expected TypeStreamMerger::shouldRemapType(const CVType &Type) { if (auto EC = TypeDeserializer::deserializeAs(const_cast(Type), EP)) return joinErrors(std::move(EC), errorCorruptRecord()); - if (PCHSignature.hasValue()) + if (PCHSignature) return errorCorruptRecord(); PCHSignature.emplace(EP.getSignature()); return false; diff --git a/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp index e517e8846d69..910a32730e39 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp @@ -8,9 +8,10 @@ #include "llvm/DebugInfo/CodeView/TypeTableCollection.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/RecordName.h" -#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp index 1be5a752453a..e2ea5910932d 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -11,10 +11,10 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include 
"llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Support/DataExtractor.h" -#include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index c77d4d4d989c..5727b3bdb05c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -10,7 +10,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DJB.h" #include "llvm/Support/Errc.h" diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp index 25d2e852a7fe..2d6c145f9237 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp index d68ecd4f8a42..6461f2ac031d 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" + #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index ef50ad53650a..c785026f8461 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -15,6 +16,7 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" #include "llvm/DebugInfo/DWARF/DWARFDebugAddr.h" #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" @@ -29,7 +31,11 @@ #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h" +#include "llvm/DebugInfo/DWARF/DWARFListTable.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" +#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/DebugInfo/DWARF/DWARFSection.h" +#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" #include "llvm/MC/MCRegisterInfo.h" @@ -115,7 +121,7 @@ collectContributionData(DWARFContext::unit_iterator_range Units) { const 
Optional &R) { if (L && R) return L->Base < R->Base; - return R.hasValue(); + return R.has_value(); }); // Uniquify contributions, as it is possible that units (specifically @@ -383,7 +389,7 @@ void DWARFContext::dump( OS << '\n' << Name << " contents:\n"; if (auto DumpOffset = DumpOffsets[DIDT_ID_DebugInfo]) for (const auto &U : Units) - U->getDIEForOffset(DumpOffset.getValue()) + U->getDIEForOffset(*DumpOffset) .dump(OS, 0, DumpOpts.noImplicitRecursion()); else for (const auto &U : Units) @@ -763,6 +769,10 @@ bool DWARFContext::verify(raw_ostream &OS, DIDumpOptions DumpOpts) { DWARFVerifier verifier(OS, *this, DumpOpts); Success &= verifier.handleDebugAbbrev(); + if (DumpOpts.DumpType & DIDT_DebugCUIndex) + Success &= verifier.handleDebugCUIndex(); + if (DumpOpts.DumpType & DIDT_DebugTUIndex) + Success &= verifier.handleDebugTUIndex(); if (DumpOpts.DumpType & DIDT_DebugInfo) Success &= verifier.handleDebugInfo(); if (DumpOpts.DumpType & DIDT_DebugLine) @@ -993,6 +1003,22 @@ Expected DWARFContext::getLineTableForUnit( RecoverableErrorHandler); } +void DWARFContext::clearLineTableForUnit(DWARFUnit *U) { + if (!Line) + return; + + auto UnitDIE = U->getUnitDIE(); + if (!UnitDIE) + return; + + auto Offset = toSectionOffset(UnitDIE.find(DW_AT_stmt_list)); + if (!Offset) + return; + + uint64_t stmtOffset = *Offset + U->getLineTableOffset(); + Line->clearLineTable(stmtOffset); +} + void DWARFContext::parseNormalUnits() { if (!NormalUnits.empty()) return; @@ -1027,7 +1053,25 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) { // First, get the offset of the compile unit. uint64_t CUOffset = getDebugAranges()->findAddress(Address); // Retrieve the compile unit. - return getCompileUnitForOffset(CUOffset); + if (DWARFCompileUnit *OffsetCU = getCompileUnitForOffset(CUOffset)) + return OffsetCU; + + // Global variables are often not found by the above search, for one of two + // reasons: + // 1. .debug_aranges may not include global variables. On clang, it seems we + // put the globals in the aranges, but this isn't true for gcc. + // 2. Even if the global variable is in a .debug_arange, global variables + // may not be captured in the [start, end) addresses described by the + // parent compile unit. + // + // So, we walk the CU's and their child DI's manually, looking for the + // specific global variable. 
+ for (std::unique_ptr &CU : compile_units()) { + if (DWARFDie Die = CU->getVariableForAddress(Address)) { + return static_cast(CU.get()); + } + } + return nullptr; } DWARFContext::DIEsForAddress DWARFContext::getDIEsForAddress(uint64_t Address) { @@ -1097,64 +1141,6 @@ static bool getFunctionNameAndStartLineForAddress( return FoundResult; } -static Optional getTypeSize(DWARFDie Type, uint64_t PointerSize) { - if (auto SizeAttr = Type.find(DW_AT_byte_size)) - if (Optional Size = SizeAttr->getAsUnsignedConstant()) - return Size; - - switch (Type.getTag()) { - case DW_TAG_pointer_type: - case DW_TAG_reference_type: - case DW_TAG_rvalue_reference_type: - return PointerSize; - case DW_TAG_ptr_to_member_type: { - if (DWARFDie BaseType = Type.getAttributeValueAsReferencedDie(DW_AT_type)) - if (BaseType.getTag() == DW_TAG_subroutine_type) - return 2 * PointerSize; - return PointerSize; - } - case DW_TAG_const_type: - case DW_TAG_immutable_type: - case DW_TAG_volatile_type: - case DW_TAG_restrict_type: - case DW_TAG_typedef: { - if (DWARFDie BaseType = Type.getAttributeValueAsReferencedDie(DW_AT_type)) - return getTypeSize(BaseType, PointerSize); - break; - } - case DW_TAG_array_type: { - DWARFDie BaseType = Type.getAttributeValueAsReferencedDie(DW_AT_type); - if (!BaseType) - return Optional(); - Optional BaseSize = getTypeSize(BaseType, PointerSize); - if (!BaseSize) - return Optional(); - uint64_t Size = *BaseSize; - for (DWARFDie Child : Type) { - if (Child.getTag() != DW_TAG_subrange_type) - continue; - - if (auto ElemCountAttr = Child.find(DW_AT_count)) - if (Optional ElemCount = - ElemCountAttr->getAsUnsignedConstant()) - Size *= *ElemCount; - if (auto UpperBoundAttr = Child.find(DW_AT_upper_bound)) - if (Optional UpperBound = - UpperBoundAttr->getAsSignedConstant()) { - int64_t LowerBound = 0; - if (auto LowerBoundAttr = Child.find(DW_AT_lower_bound)) - LowerBound = LowerBoundAttr->getAsSignedConstant().getValueOr(0); - Size *= *UpperBound - LowerBound + 1; - } - } - return Size; - } - default: - break; - } - return Optional(); -} - static Optional getExpressionFrameOffset(ArrayRef Expr, Optional FrameBaseReg) { @@ -1215,7 +1201,7 @@ void DWARFContext::addLocalsForDie(DWARFCompileUnit *CU, DWARFDie Subprogram, if (Optional Name = dwarf::toString(*NameAttr)) Local.Name = *Name; if (auto Type = Die.getAttributeValueAsReferencedDie(DW_AT_type)) - Local.Size = getTypeSize(Type, getCUAddrSize()); + Local.Size = Type.getTypeSize(getCUAddrSize()); if (auto DeclFileAttr = Die.find(DW_AT_decl_file)) { if (const auto *LT = CU->getContext().getLineTableForUnit(CU)) LT->getFileNameByIndex( @@ -1256,7 +1242,6 @@ DWARFContext::getLocalsForAddress(object::SectionedAddress Address) { DILineInfo DWARFContext::getLineInfoForAddress(object::SectionedAddress Address, DILineInfoSpecifier Spec) { DILineInfo Result; - DWARFCompileUnit *CU = getCompileUnitForAddress(Address.Address); if (!CU) return Result; @@ -1271,6 +1256,22 @@ DILineInfo DWARFContext::getLineInfoForAddress(object::SectionedAddress Address, Spec.FLIKind, Result); } } + + return Result; +} + +DILineInfo +DWARFContext::getLineInfoForDataAddress(object::SectionedAddress Address) { + DILineInfo Result; + DWARFCompileUnit *CU = getCompileUnitForAddress(Address.Address); + if (!CU) + return Result; + + if (DWARFDie Die = CU->getVariableForAddress(Address.Address)) { + Result.FileName = Die.getDeclFile(FileLineInfoKind::AbsoluteFilePath); + Result.Line = Die.getDeclLine(); + } + return Result; } diff --git 
a/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp index da6f6ad903f4..b18b64382b41 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" +#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" +#include "llvm/Support/Errc.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp index 5b1c62e6a259..81fac4763ec1 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugAddr.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/Support/Errc.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp index 1a1b8ea0976f..49ee27db6d54 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp @@ -7,10 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h" -#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" -#include "llvm/Support/DataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include #include #include @@ -20,15 +22,15 @@ using namespace llvm; void DWARFDebugAranges::extract( DWARFDataExtractor DebugArangesData, - function_ref RecoverableErrorHandler) { + function_ref RecoverableErrorHandler, + function_ref WarningHandler) { if (!DebugArangesData.isValidOffset(0)) return; uint64_t Offset = 0; DWARFDebugArangeSet Set; while (DebugArangesData.isValidOffset(Offset)) { - if (Error E = - Set.extract(DebugArangesData, &Offset, RecoverableErrorHandler)) { + if (Error E = Set.extract(DebugArangesData, &Offset, WarningHandler)) { RecoverableErrorHandler(std::move(E)); return; } @@ -50,7 +52,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { // Extract aranges from .debug_aranges section. 
DWARFDataExtractor ArangesData(CTX->getDWARFObj().getArangesSection(), CTX->isLittleEndian(), 0); - extract(ArangesData, CTX->getRecoverableErrorHandler()); + extract(ArangesData, CTX->getRecoverableErrorHandler(), + CTX->getWarningHandler()); // Generate aranges from DIEs: even if .debug_aranges section is present, // it may describe only a small subset of compilation units, so we need to diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index 92a461dbd941..cf9057c99dbd 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -12,8 +12,9 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" @@ -1100,8 +1101,8 @@ Error DWARFDebugFrame::parse(DWARFDataExtractor Data) { default: return createStringError( errc::invalid_argument, - "unknown augmentation character in entry at 0x%" PRIx64, - StartOffset); + "unknown augmentation character %c in entry at 0x%" PRIx64, + AugmentationString[i], StartOffset); case 'L': LSDAPointerEncoding = Data.getU8(&Offset); break; @@ -1137,10 +1138,14 @@ Error DWARFDebugFrame::parse(DWARFDataExtractor Data) { // B-Key is used for signing functions associated with this // augmentation string break; + // This stack frame contains MTE tagged data, so needs to be + // untagged on unwind. + case 'G': + break; } } - if (AugmentationLength.hasValue()) { + if (AugmentationLength) { if (Offset != EndAugmentationOffset) return createStringError(errc::invalid_argument, "parsing augmentation data at 0x%" PRIx64 diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 385bde51e2e7..7dbeebc2770f 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -9,10 +9,11 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Errc.h" #include #include diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index f36d3f87257a..2e0780e249aa 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -12,12 +12,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -29,6 +29,10 @@ using namespace llvm; using namespace dwarf; +namespace llvm { +class DwarfContext; +} + using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind; namespace { @@ -337,7 +341,7 @@ 
parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, errc::invalid_argument, "failed to parse file entry because the MD5 hash is invalid"); std::uninitialized_copy_n(Value.getAsBlock().getValue().begin(), 16, - FileEntry.Checksum.Bytes.begin()); + FileEntry.Checksum.begin()); break; default: break; @@ -597,6 +601,10 @@ Expected DWARFDebugLine::getOrParseLineTable( return LT; } +void DWARFDebugLine::clearLineTable(uint64_t Offset) { + LineTableMap.erase(Offset); +} + static StringRef getOpcodeName(uint8_t Opcode, uint8_t OpcodeBase) { assert(Opcode != 0); if (Opcode < OpcodeBase) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index f39c7871d603..b68af4cfafef 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -9,13 +9,13 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Format.h" -#include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -24,6 +24,10 @@ using namespace llvm; using object::SectionedAddress; +namespace llvm { +class DWARFObject; +} + namespace { class DWARFLocationInterpreter { Optional Base; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp index 7a81d7ff064b..80daea64814a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -112,7 +115,7 @@ Error DWARFDebugMacro::parseImpl( if (IsMacro && Data.isValidOffset(Offset)) { // Keep a mapping from Macro contribution to CUs, this will // be needed while retrieving macro from DW_MACRO_define_strx form. - for (const auto &U : Units.getValue()) + for (const auto &U : *Units) if (auto CUDIE = U->getUnitDIE()) // Skip units which does not contibutes to macro section. 
if (auto MacroOffset = toSectionOffset(CUDIE.find(DW_AT_macros))) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index ec7889a3728a..96c546250974 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -14,19 +14,20 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" +#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Format.h" -#include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include @@ -106,586 +107,10 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, .print(OS, DumpOpts, MRI, U); } -static DWARFDie resolveReferencedType(DWARFDie D, - dwarf::Attribute Attr = DW_AT_type) { - return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference(); -} static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); } -namespace { - -// FIXME: We should have pretty printers per language. Currently we print -// everything as if it was C++ and fall back to the TAG type name. -struct DWARFTypePrinter { - raw_ostream &OS; - bool Word = true; - bool EndedWithTemplate = false; - - DWARFTypePrinter(raw_ostream &OS) : OS(OS) {} - - /// Dump the name encoded in the type tag. - void appendTypeTagName(dwarf::Tag T) { - StringRef TagStr = TagString(T); - static constexpr StringRef Prefix = "DW_TAG_"; - static constexpr StringRef Suffix = "_type"; - if (!TagStr.startswith(Prefix) || !TagStr.endswith(Suffix)) - return; - OS << TagStr.substr(Prefix.size(), - TagStr.size() - (Prefix.size() + Suffix.size())) - << " "; - } - - void appendArrayType(const DWARFDie &D) { - for (const DWARFDie &C : D.children()) { - if (C.getTag() != DW_TAG_subrange_type) - continue; - Optional LB; - Optional Count; - Optional UB; - Optional DefaultLB; - if (Optional L = C.find(DW_AT_lower_bound)) - LB = L->getAsUnsignedConstant(); - if (Optional CountV = C.find(DW_AT_count)) - Count = CountV->getAsUnsignedConstant(); - if (Optional UpperV = C.find(DW_AT_upper_bound)) - UB = UpperV->getAsUnsignedConstant(); - if (Optional LV = - D.getDwarfUnit()->getUnitDIE().find(DW_AT_language)) - if (Optional LC = LV->getAsUnsignedConstant()) - if ((DefaultLB = - LanguageLowerBound(static_cast(*LC)))) - if (LB && *LB == *DefaultLB) - LB = None; - if (!LB && !Count && !UB) - OS << "[]"; - else if (!LB && (Count || UB) && DefaultLB) - OS << '[' << (Count ? *Count : *UB - *DefaultLB + 1) << ']'; - else { - OS << "[["; - if (LB) - OS << *LB; - else - OS << '?'; - OS << ", "; - if (Count) - if (LB) - OS << *LB + *Count; - else - OS << "? 
+ " << *Count; - else if (UB) - OS << *UB + 1; - else - OS << '?'; - OS << ")]"; - } - } - EndedWithTemplate = false; - } - - DWARFDie skipQualifiers(DWARFDie D) { - while (D && (D.getTag() == DW_TAG_const_type || - D.getTag() == DW_TAG_volatile_type)) - D = resolveReferencedType(D); - return D; - } - - bool needsParens(DWARFDie D) { - D = skipQualifiers(D); - return D && (D.getTag() == DW_TAG_subroutine_type || D.getTag() == DW_TAG_array_type); - } - - void appendPointerLikeTypeBefore(DWARFDie D, DWARFDie Inner, StringRef Ptr) { - appendQualifiedNameBefore(Inner); - if (Word) - OS << ' '; - if (needsParens(Inner)) - OS << '('; - OS << Ptr; - Word = false; - EndedWithTemplate = false; - } - - DWARFDie - appendUnqualifiedNameBefore(DWARFDie D, - std::string *OriginalFullName = nullptr) { - Word = true; - if (!D) { - OS << "void"; - return DWARFDie(); - } - DWARFDie InnerDIE; - auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; - const dwarf::Tag T = D.getTag(); - switch (T) { - case DW_TAG_pointer_type: { - appendPointerLikeTypeBefore(D, Inner(), "*"); - break; - } - case DW_TAG_subroutine_type: { - appendQualifiedNameBefore(Inner()); - if (Word) { - OS << ' '; - } - Word = false; - break; - } - case DW_TAG_array_type: { - appendQualifiedNameBefore(Inner()); - break; - } - case DW_TAG_reference_type: - appendPointerLikeTypeBefore(D, Inner(), "&"); - break; - case DW_TAG_rvalue_reference_type: - appendPointerLikeTypeBefore(D, Inner(), "&&"); - break; - case DW_TAG_ptr_to_member_type: { - appendQualifiedNameBefore(Inner()); - if (needsParens(InnerDIE)) - OS << '('; - else if (Word) - OS << ' '; - if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) { - appendQualifiedName(Cont); - OS << "::"; - } - OS << "*"; - Word = false; - break; - } - case DW_TAG_const_type: - case DW_TAG_volatile_type: - appendConstVolatileQualifierBefore(D); - break; - case DW_TAG_namespace: { - if (const char *Name = dwarf::toString(D.find(DW_AT_name), nullptr)) - OS << Name; - else - OS << "(anonymous namespace)"; - break; - } - case DW_TAG_unspecified_type: { - StringRef TypeName = D.getShortName(); - if (TypeName == "decltype(nullptr)") - TypeName = "std::nullptr_t"; - Word = true; - OS << TypeName; - EndedWithTemplate = false; - break; - } - /* - case DW_TAG_structure_type: - case DW_TAG_class_type: - case DW_TAG_enumeration_type: - case DW_TAG_base_type: - */ - default: { - const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); - if (!NamePtr) { - appendTypeTagName(D.getTag()); - return DWARFDie(); - } - Word = true; - StringRef Name = NamePtr; - static constexpr StringRef MangledPrefix = "_STN"; - if (Name.startswith(MangledPrefix)) { - Name = Name.drop_front(MangledPrefix.size()); - auto Separator = Name.find('|'); - assert(Separator != StringRef::npos); - StringRef BaseName = Name.substr(0, Separator); - StringRef TemplateArgs = Name.substr(Separator + 1); - if (OriginalFullName) - *OriginalFullName = (BaseName + TemplateArgs).str(); - Name = BaseName; - } else - EndedWithTemplate = Name.endswith(">"); - OS << Name; - // This check would be insufficient for operator overloads like - // "operator>>" - but for now Clang doesn't try to simplify them, so this - // is OK. Add more nuanced operator overload handling here if/when needed. 
- if (Name.endswith(">")) - break; - if (!appendTemplateParameters(D)) - break; - - if (EndedWithTemplate) - OS << ' '; - OS << '>'; - EndedWithTemplate = true; - Word = true; - break; - } - } - return InnerDIE; - } - - void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner, - bool SkipFirstParamIfArtificial = false) { - if (!D) - return; - switch (D.getTag()) { - case DW_TAG_subroutine_type: { - appendSubroutineNameAfter(D, Inner, SkipFirstParamIfArtificial, false, - false); - break; - } - case DW_TAG_array_type: { - appendArrayType(D); - break; - } - case DW_TAG_const_type: - case DW_TAG_volatile_type: - appendConstVolatileQualifierAfter(D); - break; - case DW_TAG_ptr_to_member_type: - case DW_TAG_reference_type: - case DW_TAG_rvalue_reference_type: - case DW_TAG_pointer_type: { - if (needsParens(Inner)) - OS << ')'; - appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner), - /*SkipFirstParamIfArtificial=*/D.getTag() == - DW_TAG_ptr_to_member_type); - break; - } - /* - case DW_TAG_structure_type: - case DW_TAG_class_type: - case DW_TAG_enumeration_type: - case DW_TAG_base_type: - case DW_TAG_namespace: - */ - default: - break; - } - } - - void appendQualifiedName(DWARFDie D) { - if (D) - appendScopes(D.getParent()); - appendUnqualifiedName(D); - } - DWARFDie appendQualifiedNameBefore(DWARFDie D) { - if (D) - appendScopes(D.getParent()); - return appendUnqualifiedNameBefore(D); - } - bool appendTemplateParameters(DWARFDie D, bool *FirstParameter = nullptr) { - bool FirstParameterValue = true; - bool IsTemplate = false; - if (!FirstParameter) - FirstParameter = &FirstParameterValue; - for (const DWARFDie &C : D) { - auto Sep = [&] { - if (*FirstParameter) - OS << '<'; - else - OS << ", "; - IsTemplate = true; - EndedWithTemplate = false; - *FirstParameter = false; - }; - if (C.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) { - IsTemplate = true; - appendTemplateParameters(C, FirstParameter); - } - if (C.getTag() == dwarf::DW_TAG_template_value_parameter) { - DWARFDie T = resolveReferencedType(C); - Sep(); - if (T.getTag() == DW_TAG_enumeration_type) { - auto V = C.find(DW_AT_const_value); - bool FoundEnumerator = false; - for (const DWARFDie &Enumerator : T) { - auto EV = Enumerator.find(DW_AT_const_value); - if (V && EV && - V->getAsSignedConstant() == EV->getAsSignedConstant()) { - if (T.find(DW_AT_enum_class)) { - appendQualifiedName(T); - OS << "::"; - } else - appendScopes(T.getParent()); - OS << Enumerator.getShortName(); - FoundEnumerator = true; - break; - } - } - if (FoundEnumerator) - continue; - OS << '('; - appendQualifiedName(T); - OS << ')'; - OS << to_string(*V->getAsSignedConstant()); - continue; - } - // /Maybe/ we could do pointer type parameters, looking for the - // symbol in the ELF symbol table to get back to the variable... - // but probably not worth it. - if (T.getTag() == DW_TAG_pointer_type) - continue; - const char *RawName = dwarf::toString(T.find(DW_AT_name), nullptr); - assert(RawName); - StringRef Name = RawName; - auto V = C.find(DW_AT_const_value); - bool IsQualifiedChar = false; - if (Name == "bool") { - OS << (*V->getAsUnsignedConstant() ? 
"true" : "false"); - } else if (Name == "short") { - OS << "(short)"; - OS << to_string(*V->getAsSignedConstant()); - } else if (Name == "unsigned short") { - OS << "(unsigned short)"; - OS << to_string(*V->getAsSignedConstant()); - } else if (Name == "int") - OS << to_string(*V->getAsSignedConstant()); - else if (Name == "long") { - OS << to_string(*V->getAsSignedConstant()); - OS << "L"; - } else if (Name == "long long") { - OS << to_string(*V->getAsSignedConstant()); - OS << "LL"; - } else if (Name == "unsigned int") { - OS << to_string(*V->getAsUnsignedConstant()); - OS << "U"; - } else if (Name == "unsigned long") { - OS << to_string(*V->getAsUnsignedConstant()); - OS << "UL"; - } else if (Name == "unsigned long long") { - OS << to_string(*V->getAsUnsignedConstant()); - OS << "ULL"; - } else if (Name == "char" || - (IsQualifiedChar = - (Name == "unsigned char" || Name == "signed char"))) { - // FIXME: check T's DW_AT_type to see if it's signed or not (since - // char signedness is implementation defined). - auto Val = *V->getAsSignedConstant(); - // Copied/hacked up from Clang's CharacterLiteral::print - incomplete - // (doesn't actually support different character types/widths, sign - // handling's not done, and doesn't correctly test if a character is - // printable or needs to use a numeric escape sequence instead) - if (IsQualifiedChar) { - OS << '('; - OS << Name; - OS << ')'; - } - switch (Val) { - case '\\': - OS << "'\\\\'"; - break; - case '\'': - OS << "'\\''"; - break; - case '\a': - // TODO: K&R: the meaning of '\\a' is different in traditional C - OS << "'\\a'"; - break; - case '\b': - OS << "'\\b'"; - break; - case '\f': - OS << "'\\f'"; - break; - case '\n': - OS << "'\\n'"; - break; - case '\r': - OS << "'\\r'"; - break; - case '\t': - OS << "'\\t'"; - break; - case '\v': - OS << "'\\v'"; - break; - default: - if ((Val & ~0xFFu) == ~0xFFu) - Val &= 0xFFu; - if (Val < 127 && Val >= 32) { - OS << "'"; - OS << (char)Val; - OS << "'"; - } else if (Val < 256) - OS << to_string(llvm::format("'\\x%02x'", Val)); - else if (Val <= 0xFFFF) - OS << to_string(llvm::format("'\\u%04x'", Val)); - else - OS << to_string(llvm::format("'\\U%08x'", Val)); - } - } - continue; - } - if (C.getTag() == dwarf::DW_TAG_GNU_template_template_param) { - const char *RawName = - dwarf::toString(C.find(DW_AT_GNU_template_name), nullptr); - assert(RawName); - StringRef Name = RawName; - Sep(); - OS << Name; - continue; - } - if (C.getTag() != dwarf::DW_TAG_template_type_parameter) - continue; - auto TypeAttr = C.find(DW_AT_type); - Sep(); - appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr) - : DWARFDie()); - } - if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue) - OS << '<'; - return IsTemplate; - } - void decomposeConstVolatile(DWARFDie &N, DWARFDie &T, DWARFDie &C, - DWARFDie &V) { - (N.getTag() == DW_TAG_const_type ? 
C : V) = N; - T = resolveReferencedType(N); - if (T) { - auto Tag = T.getTag(); - if (Tag == DW_TAG_const_type) { - C = T; - T = resolveReferencedType(T); - } else if (Tag == DW_TAG_volatile_type) { - V = T; - T = resolveReferencedType(T); - } - } - } - void appendConstVolatileQualifierAfter(DWARFDie N) { - DWARFDie C; - DWARFDie V; - DWARFDie T; - decomposeConstVolatile(N, T, C, V); - if (T && T.getTag() == DW_TAG_subroutine_type) - appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(), - V.isValid()); - else - appendUnqualifiedNameAfter(T, resolveReferencedType(T)); - } - void appendConstVolatileQualifierBefore(DWARFDie N) { - DWARFDie C; - DWARFDie V; - DWARFDie T; - decomposeConstVolatile(N, T, C, V); - bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type; - DWARFDie A = T; - while (A && A.getTag() == DW_TAG_array_type) - A = resolveReferencedType(A); - bool Leading = - (!A || (A.getTag() != DW_TAG_pointer_type && - A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) && - !Subroutine; - if (Leading) { - if (C) - OS << "const "; - if (V) - OS << "volatile "; - } - appendQualifiedNameBefore(T); - if (!Leading && !Subroutine) { - Word = true; - if (C) - OS << "const"; - if (V) { - if (C) - OS << ' '; - OS << "volatile"; - } - } - } - - /// Recursively append the DIE type name when applicable. - void appendUnqualifiedName(DWARFDie D, - std::string *OriginalFullName = nullptr) { - // FIXME: We should have pretty printers per language. Currently we print - // everything as if it was C++ and fall back to the TAG type name. - DWARFDie Inner = appendUnqualifiedNameBefore(D, OriginalFullName); - appendUnqualifiedNameAfter(D, Inner); - } - - void appendSubroutineNameAfter(DWARFDie D, DWARFDie Inner, - bool SkipFirstParamIfArtificial, bool Const, - bool Volatile) { - DWARFDie FirstParamIfArtificial; - OS << '('; - EndedWithTemplate = false; - bool First = true; - bool RealFirst = true; - for (DWARFDie P : D) { - if (P.getTag() != DW_TAG_formal_parameter && - P.getTag() != DW_TAG_unspecified_parameters) - return; - DWARFDie T = resolveReferencedType(P); - if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) { - FirstParamIfArtificial = T; - RealFirst = false; - continue; - } - if (!First) { - OS << ", "; - } - First = false; - if (P.getTag() == DW_TAG_unspecified_parameters) - OS << "..."; - else - appendQualifiedName(T); - } - EndedWithTemplate = false; - OS << ')'; - if (FirstParamIfArtificial) { - if (DWARFDie P = FirstParamIfArtificial) { - if (P.getTag() == DW_TAG_pointer_type) { - DWARFDie C; - DWARFDie V; - auto CVStep = [&](DWARFDie CV) { - if (DWARFDie U = resolveReferencedType(CV)) { - if (U.getTag() == DW_TAG_const_type) - return C = U; - if (U.getTag() == DW_TAG_volatile_type) - return V = U; - } - return DWARFDie(); - }; - if (DWARFDie CV = CVStep(P)) { - CVStep(CV); - } - if (C) - OS << " const"; - if (V) - OS << " volatile"; - } - } - } else { - if (Const) - OS << " const"; - if (Volatile) - OS << " volatile"; - } - if (D.find(DW_AT_reference)) - OS << " &"; - if (D.find(DW_AT_rvalue_reference)) - OS << " &&"; - appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner)); - } - void appendScopes(DWARFDie D) { - if (D.getTag() == DW_TAG_compile_unit) - return; - if (D.getTag() == DW_TAG_type_unit) - return; - if (D.getTag() == DW_TAG_skeleton_unit) - return; - if (D.getTag() == DW_TAG_subprogram) - return; - if (D.getTag() == DW_TAG_lexical_block) - return; - D = D.resolveTypeUnitReference(); - if (DWARFDie P = D.getParent()) - 
appendScopes(P); - appendUnqualifiedName(D); - OS << "::"; - } -}; -} // anonymous namespace - static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -713,8 +138,7 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Color = HighlightColor::String; if (const auto *LT = U->getContext().getLineTableForUnit(U)) if (LT->getFileNameByIndex( - FormValue.getAsUnsignedConstant().getValue(), - U->getCompilationDir(), + *FormValue.getAsUnsignedConstant(), U->getCompilationDir(), DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File)) { File = '"' + File + '"'; Name = File; @@ -768,7 +192,7 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Die.getAttributeValueAsReferencedDie(FormValue).getName( DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; - } else if (Attr == DW_AT_type) { + } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { DWARFDie D = resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { OS << Space << "\""; @@ -1061,6 +485,66 @@ void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine, CallDiscriminator = toUnsigned(find(DW_AT_GNU_discriminator), 0); } +Optional DWARFDie::getTypeSize(uint64_t PointerSize) { + if (auto SizeAttr = find(DW_AT_byte_size)) + if (Optional Size = SizeAttr->getAsUnsignedConstant()) + return Size; + + switch (getTag()) { + case DW_TAG_pointer_type: + case DW_TAG_reference_type: + case DW_TAG_rvalue_reference_type: + return PointerSize; + case DW_TAG_ptr_to_member_type: { + if (DWARFDie BaseType = getAttributeValueAsReferencedDie(DW_AT_type)) + if (BaseType.getTag() == DW_TAG_subroutine_type) + return 2 * PointerSize; + return PointerSize; + } + case DW_TAG_const_type: + case DW_TAG_immutable_type: + case DW_TAG_volatile_type: + case DW_TAG_restrict_type: + case DW_TAG_typedef: { + if (DWARFDie BaseType = getAttributeValueAsReferencedDie(DW_AT_type)) + return BaseType.getTypeSize(PointerSize); + break; + } + case DW_TAG_array_type: { + DWARFDie BaseType = getAttributeValueAsReferencedDie(DW_AT_type); + if (!BaseType) + return None; + Optional BaseSize = BaseType.getTypeSize(PointerSize); + if (!BaseSize) + return None; + uint64_t Size = *BaseSize; + for (DWARFDie Child : *this) { + if (Child.getTag() != DW_TAG_subrange_type) + continue; + + if (auto ElemCountAttr = Child.find(DW_AT_count)) + if (Optional ElemCount = + ElemCountAttr->getAsUnsignedConstant()) + Size *= *ElemCount; + if (auto UpperBoundAttr = Child.find(DW_AT_upper_bound)) + if (Optional UpperBound = + UpperBoundAttr->getAsSignedConstant()) { + int64_t LowerBound = 0; + if (auto LowerBoundAttr = Child.find(DW_AT_lower_bound)) + LowerBound = LowerBoundAttr->getAsSignedConstant().value_or(0); + Size *= *UpperBound - LowerBound + 1; + } + } + return Size; + } + default: + if (DWARFDie BaseType = getAttributeValueAsReferencedDie(DW_AT_type)) + return BaseType.getTypeSize(PointerSize); + break; + } + return None; +} + /// Helper to dump a DIE with all of its parents, but no siblings. 
static unsigned dumpParentChain(DWARFDie Die, raw_ostream &OS, unsigned Indent, DIDumpOptions DumpOpts, unsigned Depth = 0) { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 86991a3949dd..1fecd5ee6902 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -13,7 +13,10 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" +#include "llvm/DebugInfo/DWARF/DWARFSection.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index ace7000f07b2..3f140d21c53c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -9,10 +9,10 @@ #include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/DataExtractor.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp new file mode 100644 index 000000000000..86cc07b0d0f2 --- /dev/null +++ b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp @@ -0,0 +1,608 @@ +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/Support/ScopedPrinter.h" +namespace llvm { +using namespace dwarf; +void DWARFTypePrinter::appendTypeTagName(dwarf::Tag T) { + StringRef TagStr = TagString(T); + static constexpr StringRef Prefix = "DW_TAG_"; + static constexpr StringRef Suffix = "_type"; + if (!TagStr.startswith(Prefix) || !TagStr.endswith(Suffix)) + return; + OS << TagStr.substr(Prefix.size(), + TagStr.size() - (Prefix.size() + Suffix.size())) + << " "; +} + +void DWARFTypePrinter::appendArrayType(const DWARFDie &D) { + for (const DWARFDie &C : D.children()) { + if (C.getTag() != DW_TAG_subrange_type) + continue; + Optional LB; + Optional Count; + Optional UB; + Optional DefaultLB; + if (Optional L = C.find(DW_AT_lower_bound)) + LB = L->getAsUnsignedConstant(); + if (Optional CountV = C.find(DW_AT_count)) + Count = CountV->getAsUnsignedConstant(); + if (Optional UpperV = C.find(DW_AT_upper_bound)) + UB = UpperV->getAsUnsignedConstant(); + if (Optional LV = + D.getDwarfUnit()->getUnitDIE().find(DW_AT_language)) + if (Optional LC = LV->getAsUnsignedConstant()) + if ((DefaultLB = + LanguageLowerBound(static_cast(*LC)))) + if (LB && *LB == *DefaultLB) + LB = None; + if (!LB && !Count && !UB) + OS << "[]"; + else if (!LB && (Count || UB) && DefaultLB) + OS << '[' << (Count ? *Count : *UB - *DefaultLB + 1) << ']'; + else { + OS << "[["; + if (LB) + OS << *LB; + else + OS << '?'; + OS << ", "; + if (Count) + if (LB) + OS << *LB + *Count; + else + OS << "? 
+ " << *Count; + else if (UB) + OS << *UB + 1; + else + OS << '?'; + OS << ")]"; + } + } + EndedWithTemplate = false; +} + +static DWARFDie resolveReferencedType(DWARFDie D, + dwarf::Attribute Attr = DW_AT_type) { + return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference(); +} +static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { + return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); +} +DWARFDie DWARFTypePrinter::skipQualifiers(DWARFDie D) { + while (D && (D.getTag() == DW_TAG_const_type || + D.getTag() == DW_TAG_volatile_type)) + D = resolveReferencedType(D); + return D; +} + +bool DWARFTypePrinter::needsParens(DWARFDie D) { + D = skipQualifiers(D); + return D && (D.getTag() == DW_TAG_subroutine_type || + D.getTag() == DW_TAG_array_type); +} + +void DWARFTypePrinter::appendPointerLikeTypeBefore(DWARFDie D, DWARFDie Inner, + StringRef Ptr) { + appendQualifiedNameBefore(Inner); + if (Word) + OS << ' '; + if (needsParens(Inner)) + OS << '('; + OS << Ptr; + Word = false; + EndedWithTemplate = false; +} + +DWARFDie +DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, + std::string *OriginalFullName) { + Word = true; + if (!D) { + OS << "void"; + return DWARFDie(); + } + DWARFDie InnerDIE; + auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; + const dwarf::Tag T = D.getTag(); + switch (T) { + case DW_TAG_pointer_type: { + appendPointerLikeTypeBefore(D, Inner(), "*"); + break; + } + case DW_TAG_subroutine_type: { + appendQualifiedNameBefore(Inner()); + if (Word) { + OS << ' '; + } + Word = false; + break; + } + case DW_TAG_array_type: { + appendQualifiedNameBefore(Inner()); + break; + } + case DW_TAG_reference_type: + appendPointerLikeTypeBefore(D, Inner(), "&"); + break; + case DW_TAG_rvalue_reference_type: + appendPointerLikeTypeBefore(D, Inner(), "&&"); + break; + case DW_TAG_ptr_to_member_type: { + appendQualifiedNameBefore(Inner()); + if (needsParens(InnerDIE)) + OS << '('; + else if (Word) + OS << ' '; + if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) { + appendQualifiedName(Cont); + EndedWithTemplate = false; + OS << "::"; + } + OS << "*"; + Word = false; + break; + } + case DW_TAG_const_type: + case DW_TAG_volatile_type: + appendConstVolatileQualifierBefore(D); + break; + case DW_TAG_namespace: { + if (const char *Name = dwarf::toString(D.find(DW_AT_name), nullptr)) + OS << Name; + else + OS << "(anonymous namespace)"; + break; + } + case DW_TAG_unspecified_type: { + StringRef TypeName = D.getShortName(); + if (TypeName == "decltype(nullptr)") + TypeName = "std::nullptr_t"; + Word = true; + OS << TypeName; + EndedWithTemplate = false; + break; + } + /* + case DW_TAG_structure_type: + case DW_TAG_class_type: + case DW_TAG_enumeration_type: + case DW_TAG_base_type: + */ + default: { + const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); + if (!NamePtr) { + appendTypeTagName(D.getTag()); + return DWARFDie(); + } + Word = true; + StringRef Name = NamePtr; + static constexpr StringRef MangledPrefix = "_STN|"; + if (Name.startswith(MangledPrefix)) { + Name = Name.drop_front(MangledPrefix.size()); + auto Separator = Name.find('|'); + assert(Separator != StringRef::npos); + StringRef BaseName = Name.substr(0, Separator); + StringRef TemplateArgs = Name.substr(Separator + 1); + if (OriginalFullName) + *OriginalFullName = (BaseName + TemplateArgs).str(); + Name = BaseName; + } else + EndedWithTemplate = Name.endswith(">"); + OS << Name; + // This check would be insufficient for operator 
overloads like + // "operator>>" - but for now Clang doesn't try to simplify them, so this + // is OK. Add more nuanced operator overload handling here if/when needed. + if (Name.endswith(">")) + break; + if (!appendTemplateParameters(D)) + break; + + if (EndedWithTemplate) + OS << ' '; + OS << '>'; + EndedWithTemplate = true; + Word = true; + break; + } + } + return InnerDIE; +} + +void DWARFTypePrinter::appendUnqualifiedNameAfter( + DWARFDie D, DWARFDie Inner, bool SkipFirstParamIfArtificial) { + if (!D) + return; + switch (D.getTag()) { + case DW_TAG_subroutine_type: { + appendSubroutineNameAfter(D, Inner, SkipFirstParamIfArtificial, false, + false); + break; + } + case DW_TAG_array_type: { + appendArrayType(D); + break; + } + case DW_TAG_const_type: + case DW_TAG_volatile_type: + appendConstVolatileQualifierAfter(D); + break; + case DW_TAG_ptr_to_member_type: + case DW_TAG_reference_type: + case DW_TAG_rvalue_reference_type: + case DW_TAG_pointer_type: { + if (needsParens(Inner)) + OS << ')'; + appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner), + /*SkipFirstParamIfArtificial=*/D.getTag() == + DW_TAG_ptr_to_member_type); + break; + } + /* + case DW_TAG_structure_type: + case DW_TAG_class_type: + case DW_TAG_enumeration_type: + case DW_TAG_base_type: + case DW_TAG_namespace: + */ + default: + break; + } +} + +void DWARFTypePrinter::appendQualifiedName(DWARFDie D) { + if (D) + appendScopes(D.getParent()); + appendUnqualifiedName(D); +} +DWARFDie DWARFTypePrinter::appendQualifiedNameBefore(DWARFDie D) { + if (D) + appendScopes(D.getParent()); + return appendUnqualifiedNameBefore(D); +} +bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, + bool *FirstParameter) { + bool FirstParameterValue = true; + bool IsTemplate = false; + if (!FirstParameter) + FirstParameter = &FirstParameterValue; + for (const DWARFDie &C : D) { + auto Sep = [&] { + if (*FirstParameter) + OS << '<'; + else + OS << ", "; + IsTemplate = true; + EndedWithTemplate = false; + *FirstParameter = false; + }; + if (C.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) { + IsTemplate = true; + appendTemplateParameters(C, FirstParameter); + } + if (C.getTag() == dwarf::DW_TAG_template_value_parameter) { + DWARFDie T = resolveReferencedType(C); + Sep(); + if (T.getTag() == DW_TAG_enumeration_type) { + OS << '('; + appendQualifiedName(T); + OS << ')'; + auto V = C.find(DW_AT_const_value); + OS << std::to_string(*V->getAsSignedConstant()); + continue; + } + // /Maybe/ we could do pointer type parameters, looking for the + // symbol in the ELF symbol table to get back to the variable... + // but probably not worth it. + if (T.getTag() == DW_TAG_pointer_type) + continue; + const char *RawName = dwarf::toString(T.find(DW_AT_name), nullptr); + assert(RawName); + StringRef Name = RawName; + auto V = C.find(DW_AT_const_value); + bool IsQualifiedChar = false; + if (Name == "bool") { + OS << (*V->getAsUnsignedConstant() ? 
"true" : "false"); + } else if (Name == "short") { + OS << "(short)"; + OS << std::to_string(*V->getAsSignedConstant()); + } else if (Name == "unsigned short") { + OS << "(unsigned short)"; + OS << std::to_string(*V->getAsSignedConstant()); + } else if (Name == "int") + OS << std::to_string(*V->getAsSignedConstant()); + else if (Name == "long") { + OS << std::to_string(*V->getAsSignedConstant()); + OS << "L"; + } else if (Name == "long long") { + OS << std::to_string(*V->getAsSignedConstant()); + OS << "LL"; + } else if (Name == "unsigned int") { + OS << std::to_string(*V->getAsUnsignedConstant()); + OS << "U"; + } else if (Name == "unsigned long") { + OS << std::to_string(*V->getAsUnsignedConstant()); + OS << "UL"; + } else if (Name == "unsigned long long") { + OS << std::to_string(*V->getAsUnsignedConstant()); + OS << "ULL"; + } else if (Name == "char" || + (IsQualifiedChar = + (Name == "unsigned char" || Name == "signed char"))) { + // FIXME: check T's DW_AT_type to see if it's signed or not (since + // char signedness is implementation defined). + auto Val = *V->getAsSignedConstant(); + // Copied/hacked up from Clang's CharacterLiteral::print - incomplete + // (doesn't actually support different character types/widths, sign + // handling's not done, and doesn't correctly test if a character is + // printable or needs to use a numeric escape sequence instead) + if (IsQualifiedChar) { + OS << '('; + OS << Name; + OS << ')'; + } + switch (Val) { + case '\\': + OS << "'\\\\'"; + break; + case '\'': + OS << "'\\''"; + break; + case '\a': + // TODO: K&R: the meaning of '\\a' is different in traditional C + OS << "'\\a'"; + break; + case '\b': + OS << "'\\b'"; + break; + case '\f': + OS << "'\\f'"; + break; + case '\n': + OS << "'\\n'"; + break; + case '\r': + OS << "'\\r'"; + break; + case '\t': + OS << "'\\t'"; + break; + case '\v': + OS << "'\\v'"; + break; + default: + if ((Val & ~0xFFu) == ~0xFFu) + Val &= 0xFFu; + if (Val < 127 && Val >= 32) { + OS << "'"; + OS << (char)Val; + OS << "'"; + } else if (Val < 256) + OS << to_string(llvm::format("'\\x%02x'", Val)); + else if (Val <= 0xFFFF) + OS << to_string(llvm::format("'\\u%04x'", Val)); + else + OS << to_string(llvm::format("'\\U%08x'", Val)); + } + } + continue; + } + if (C.getTag() == dwarf::DW_TAG_GNU_template_template_param) { + const char *RawName = + dwarf::toString(C.find(DW_AT_GNU_template_name), nullptr); + assert(RawName); + StringRef Name = RawName; + Sep(); + OS << Name; + continue; + } + if (C.getTag() != dwarf::DW_TAG_template_type_parameter) + continue; + auto TypeAttr = C.find(DW_AT_type); + Sep(); + appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr) + : DWARFDie()); + } + if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue) { + OS << '<'; + EndedWithTemplate = false; + } + return IsTemplate; +} +void DWARFTypePrinter::decomposeConstVolatile(DWARFDie &N, DWARFDie &T, + DWARFDie &C, DWARFDie &V) { + (N.getTag() == DW_TAG_const_type ? 
C : V) = N; + T = resolveReferencedType(N); + if (T) { + auto Tag = T.getTag(); + if (Tag == DW_TAG_const_type) { + C = T; + T = resolveReferencedType(T); + } else if (Tag == DW_TAG_volatile_type) { + V = T; + T = resolveReferencedType(T); + } + } +} +void DWARFTypePrinter::appendConstVolatileQualifierAfter(DWARFDie N) { + DWARFDie C; + DWARFDie V; + DWARFDie T; + decomposeConstVolatile(N, T, C, V); + if (T && T.getTag() == DW_TAG_subroutine_type) + appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(), + V.isValid()); + else + appendUnqualifiedNameAfter(T, resolveReferencedType(T)); +} +void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) { + DWARFDie C; + DWARFDie V; + DWARFDie T; + decomposeConstVolatile(N, T, C, V); + bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type; + DWARFDie A = T; + while (A && A.getTag() == DW_TAG_array_type) + A = resolveReferencedType(A); + bool Leading = + (!A || (A.getTag() != DW_TAG_pointer_type && + A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) && + !Subroutine; + if (Leading) { + if (C) + OS << "const "; + if (V) + OS << "volatile "; + } + appendQualifiedNameBefore(T); + if (!Leading && !Subroutine) { + Word = true; + if (C) + OS << "const"; + if (V) { + if (C) + OS << ' '; + OS << "volatile"; + } + } +} +void DWARFTypePrinter::appendUnqualifiedName(DWARFDie D, + std::string *OriginalFullName) { + // FIXME: We should have pretty printers per language. Currently we print + // everything as if it was C++ and fall back to the TAG type name. + DWARFDie Inner = appendUnqualifiedNameBefore(D, OriginalFullName); + appendUnqualifiedNameAfter(D, Inner); +} +void DWARFTypePrinter::appendSubroutineNameAfter( + DWARFDie D, DWARFDie Inner, bool SkipFirstParamIfArtificial, bool Const, + bool Volatile) { + DWARFDie FirstParamIfArtificial; + OS << '('; + EndedWithTemplate = false; + bool First = true; + bool RealFirst = true; + for (DWARFDie P : D) { + if (P.getTag() != DW_TAG_formal_parameter && + P.getTag() != DW_TAG_unspecified_parameters) + return; + DWARFDie T = resolveReferencedType(P); + if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) { + FirstParamIfArtificial = T; + RealFirst = false; + continue; + } + if (!First) { + OS << ", "; + } + First = false; + if (P.getTag() == DW_TAG_unspecified_parameters) + OS << "..."; + else + appendQualifiedName(T); + } + EndedWithTemplate = false; + OS << ')'; + if (FirstParamIfArtificial) { + if (DWARFDie P = FirstParamIfArtificial) { + if (P.getTag() == DW_TAG_pointer_type) { + auto CVStep = [&](DWARFDie CV) { + if (DWARFDie U = resolveReferencedType(CV)) { + Const |= U.getTag() == DW_TAG_const_type; + Volatile |= U.getTag() == DW_TAG_volatile_type; + return U; + } + return DWARFDie(); + }; + if (DWARFDie CV = CVStep(P)) { + CVStep(CV); + } + } + } + } + + if (auto CC = D.find(DW_AT_calling_convention)) { + switch (*CC->getAsUnsignedConstant()) { + case CallingConvention::DW_CC_BORLAND_stdcall: + OS << " __attribute__((stdcall))"; + break; + case CallingConvention::DW_CC_BORLAND_msfastcall: + OS << " __attribute__((fastcall))"; + break; + case CallingConvention::DW_CC_BORLAND_thiscall: + OS << " __attribute__((thiscall))"; + break; + case CallingConvention::DW_CC_LLVM_vectorcall: + OS << " __attribute__((vectorcall))"; + break; + case CallingConvention::DW_CC_BORLAND_pascal: + OS << " __attribute__((pascal))"; + break; + case CallingConvention::DW_CC_LLVM_Win64: + OS << " __attribute__((ms_abi))"; + break; + case 
CallingConvention::DW_CC_LLVM_X86_64SysV: + OS << " __attribute__((sysv_abi))"; + break; + case CallingConvention::DW_CC_LLVM_AAPCS: + // AArch64VectorCall missing? + OS << " __attribute__((pcs(\"aapcs\")))"; + break; + case CallingConvention::DW_CC_LLVM_AAPCS_VFP: + OS << " __attribute__((pcs(\"aapcs-vfp\")))"; + break; + case CallingConvention::DW_CC_LLVM_IntelOclBicc: + OS << " __attribute__((intel_ocl_bicc))"; + break; + case CallingConvention::DW_CC_LLVM_SpirFunction: + case CallingConvention::DW_CC_LLVM_OpenCLKernel: + // These aren't available as attributes, but maybe we should still + // render them somehow? (Clang doesn't render them, but that's an issue + // for template names too - since then the DWARF names of templates + // instantiated with function types with these calling conventions won't + // have distinct names - so we'd need to fix that too) + break; + case CallingConvention::DW_CC_LLVM_Swift: + // SwiftAsync missing + OS << " __attribute__((swiftcall))"; + break; + case CallingConvention::DW_CC_LLVM_PreserveMost: + OS << " __attribute__((preserve_most))"; + break; + case CallingConvention::DW_CC_LLVM_PreserveAll: + OS << " __attribute__((preserve_all))"; + break; + case CallingConvention::DW_CC_LLVM_X86RegCall: + OS << " __attribute__((regcall))"; + break; + } + } + + if (Const) + OS << " const"; + if (Volatile) + OS << " volatile"; + if (D.find(DW_AT_reference)) + OS << " &"; + if (D.find(DW_AT_rvalue_reference)) + OS << " &&"; + + appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner)); +} +void DWARFTypePrinter::appendScopes(DWARFDie D) { + if (D.getTag() == DW_TAG_compile_unit) + return; + if (D.getTag() == DW_TAG_type_unit) + return; + if (D.getTag() == DW_TAG_skeleton_unit) + return; + if (D.getTag() == DW_TAG_subprogram) + return; + if (D.getTag() == DW_TAG_lexical_block) + return; + D = D.resolveTypeUnitReference(); + if (DWARFDie P = D.getParent()) + appendScopes(P); + appendUnqualifiedName(D); + OS << "::"; +} +} // namespace llvm diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp index a301b65dd444..fe16ca06132b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp @@ -8,9 +8,7 @@ #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" -#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index eed0a60ec75e..74667fcb92bc 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -9,15 +9,23 @@ #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include 
"llvm/DebugInfo/DWARF/DWARFListTable.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" +#include "llvm/DebugInfo/DWARF/DWARFSection.h" #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Path.h" @@ -25,7 +33,6 @@ #include #include #include -#include #include #include @@ -79,7 +86,14 @@ void DWARFUnitVector::addUnitsImpl( if (!IndexEntry && IsDWO) { const DWARFUnitIndex &Index = getDWARFUnitIndex( Context, Header.isTypeUnit() ? DW_SECT_EXT_TYPES : DW_SECT_INFO); - IndexEntry = Index.getFromOffset(Header.getOffset()); + if (Index) { + if (Header.isTypeUnit()) + IndexEntry = Index.getFromHash(Header.getTypeHash()); + else if (auto DWOId = Header.getDWOId()) + IndexEntry = Index.getFromHash(*DWOId); + } + if (!IndexEntry) + IndexEntry = Index.getFromOffset(Header.getOffset()); } if (IndexEntry && !Header.applyIndexEntry(IndexEntry)) return nullptr; @@ -366,6 +380,9 @@ void DWARFUnit::clear() { AddrOffsetSectionBase = None; SU = nullptr; clearDIEs(false); + AddrDieMap.clear(); + if (DWO) + DWO->clear(); DWO.reset(); } @@ -407,7 +424,7 @@ void DWARFUnit::extractDIEsToVector( assert((Parents.back() == UINT32_MAX || Parents.back() <= Dies.size()) && "Wrong parent index"); - // Extract die. Stop if any error occured. + // Extract die. Stop if any error occurred. if (!DIE.extractFast(*this, &DIEOffset, DebugInfoData, NextCUOffset, Parents.back())) break; @@ -607,7 +624,7 @@ bool DWARFUnit::parseDWO() { DWO->setAddrOffsetSection(AddrOffsetSection, *AddrOffsetSectionBase); if (getVersion() == 4) { auto DWORangesBase = UnitDie.getRangesBaseAttribute(); - DWO->setRangesSection(RangeSection, DWORangesBase.getValueOr(0)); + DWO->setRangesSection(RangeSection, DWORangesBase.value_or(0)); } return true; @@ -735,6 +752,100 @@ DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) { return R->second.second; } +void DWARFUnit::updateVariableDieMap(DWARFDie Die) { + for (DWARFDie Child : Die) { + if (isType(Child.getTag())) + continue; + updateVariableDieMap(Child); + } + + if (Die.getTag() != DW_TAG_variable) + return; + + Expected Locations = + Die.getLocations(DW_AT_location); + if (!Locations) { + // Missing DW_AT_location is fine here. + consumeError(Locations.takeError()); + return; + } + + uint64_t Address = UINT64_MAX; + + for (const DWARFLocationExpression &Location : *Locations) { + uint8_t AddressSize = getAddressByteSize(); + DataExtractor Data(Location.Expr, /*IsLittleEndian=*/true, AddressSize); + DWARFExpression Expr(Data, AddressSize); + auto It = Expr.begin(); + if (It == Expr.end()) + continue; + + // Match exactly the main sequence used to describe global variables: + // `DW_OP_addr[x] [+ DW_OP_plus_uconst]`. Currently, this is the sequence + // that LLVM produces for DILocalVariables and DIGlobalVariables. If, in + // future, the DWARF producer (`DwarfCompileUnit::addLocationAttribute()` is + // a good starting point) is extended to use further expressions, this code + // needs to be updated. + uint64_t LocationAddr; + if (It->getCode() == dwarf::DW_OP_addr) { + LocationAddr = It->getRawOperand(0); + } else if (It->getCode() == dwarf::DW_OP_addrx) { + uint64_t DebugAddrOffset = It->getRawOperand(0); + if (auto Pointer = getAddrOffsetSectionItem(DebugAddrOffset)) { + LocationAddr = Pointer->Address; + } + } else { + continue; + } + + // Read the optional 2nd operand, a DW_OP_plus_uconst. 
+ if (++It != Expr.end()) { + if (It->getCode() != dwarf::DW_OP_plus_uconst) + continue; + + LocationAddr += It->getRawOperand(0); + + // Probe for a 3rd operand; if one exists, bail. + if (++It != Expr.end()) + continue; + } + + Address = LocationAddr; + break; + } + + // Get the size of the global variable. If all else fails (i.e. the global has + // no type), then we use a size of one to still allow symbolization of the + // exact address. + uint64_t GVSize = 1; + if (DWARFDie BaseType = Die.getAttributeValueAsReferencedDie(DW_AT_type)) + if (Optional<uint64_t> Size = Die.getTypeSize(getAddressByteSize())) + GVSize = *Size; + + if (Address != UINT64_MAX) + VariableDieMap[Address] = {Address + GVSize, Die}; +} + +DWARFDie DWARFUnit::getVariableForAddress(uint64_t Address) { + extractDIEsIfNeeded(false); + + auto RootDie = getUnitDIE(); + + auto RootLookup = RootsParsedForVariables.insert(RootDie.getOffset()); + if (RootLookup.second) + updateVariableDieMap(RootDie); + + auto R = VariableDieMap.upper_bound(Address); + if (R == VariableDieMap.begin()) + return DWARFDie(); + + // upper_bound's previous item contains Address. + --R; + if (Address >= R->second.first) + return DWARFDie(); + return R->second.second; +} + void DWARFUnit::getInlinedChainForAddress(uint64_t Address, SmallVectorImpl<DWARFDie> &InlinedChain) { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index d27fd08db14e..d161beef2202 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/DataExtractor.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index ca7ac785b550..c704f8f583af 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -6,17 +6,28 @@ // //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" +#include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" +#include "llvm/DebugInfo/DWARF/DWARFAttribute.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" #include "llvm/DebugInfo/DWARF/DWARFSection.h" -#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/Object/Error.h" #include "llvm/Support/DJB.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +39,10 @@ using namespace llvm; using namespace dwarf; using namespace object; +namespace llvm { +class DWARFDebugInfoEntry; +} + Optional<DWARFAddressRange> 
DWARFVerifier::DieRangeInfo::insert(const DWARFAddressRange &R) { auto Begin = Ranges.begin(); @@ -381,6 +396,59 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S) { return NumDebugInfoErrors; } +unsigned DWARFVerifier::verifyIndex(StringRef Name, + DWARFSectionKind InfoColumnKind, + StringRef IndexStr) { + if (IndexStr.empty()) + return 0; + OS << "Verifying " << Name << "...\n"; + DWARFUnitIndex Index(InfoColumnKind); + DataExtractor D(IndexStr, DCtx.isLittleEndian(), 0); + if (!Index.parse(D)) + return 1; + using MapType = IntervalMap; + MapType::Allocator Alloc; + std::vector> Sections(Index.getColumnKinds().size()); + for (const DWARFUnitIndex::Entry &E : Index.getRows()) { + uint64_t Sig = E.getSignature(); + if (!E.getContributions()) + continue; + for (auto E : enumerate(InfoColumnKind == DW_SECT_INFO + ? makeArrayRef(E.getContributions(), + Index.getColumnKinds().size()) + : makeArrayRef(E.getContribution(), 1))) { + const DWARFUnitIndex::Entry::SectionContribution &SC = E.value(); + int Col = E.index(); + if (SC.Length == 0) + continue; + if (!Sections[Col]) + Sections[Col] = std::make_unique(Alloc); + auto &M = *Sections[Col]; + auto I = M.find(SC.Offset); + if (I != M.end() && I.start() < (SC.Offset + SC.Length)) { + error() << llvm::formatv( + "overlapping index entries for entries {0:x16} " + "and {1:x16} for column {2}\n", + *I, Sig, toString(Index.getColumnKinds()[Col])); + return 1; + } + M.insert(SC.Offset, SC.Offset + SC.Length - 1, Sig); + } + } + + return 0; +} + +bool DWARFVerifier::handleDebugCUIndex() { + return verifyIndex(".debug_cu_index", DWARFSectionKind::DW_SECT_INFO, + DCtx.getDWARFObj().getCUIndexSection()) == 0; +} + +bool DWARFVerifier::handleDebugTUIndex() { + return verifyIndex(".debug_tu_index", DWARFSectionKind::DW_SECT_EXT_TYPES, + DCtx.getDWARFObj().getTUIndexSection()) == 0; +} + bool DWARFVerifier::handleDebugInfo() { const DWARFObject &DObj = DCtx.getDWARFObj(); unsigned NumErrors = 0; diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 6eef6f84ab40..473a69b34ac3 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -10,6 +10,7 @@ #include #include "llvm/DebugInfo/DIContext.h" +#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Support/Error.h" #include "llvm/Support/ThreadPool.h" @@ -287,12 +288,12 @@ static void convertFunctionLineTable(raw_ostream &Log, CUInfo &CUI, // linker problems or LTO or other DWARF re-linking so it is worth emitting // an error, but not worth stopping the creation of the GSYM. 
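// For example (hypothetical addresses): with FI.Range = [0x1000, 0x1100), a
// row at 0x0ff0 is clamped up to 0x1000 after the error above is logged,
// while a row at 0x1200 is silently dropped by the `continue` below.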
if (!FI.Range.contains(RowAddress)) { - if (RowAddress < FI.Range.Start) { + if (RowAddress < FI.Range.start()) { Log << "error: DIE has a start address whose LowPC is between the " "line table Row[" << RowIndex << "] with address " << HEX64(RowAddress) << " and the next one.\n"; Die.dump(Log, 0, DIDumpOptions::getForSingleDIE()); - RowAddress = FI.Range.Start; + RowAddress = FI.Range.start(); } else { continue; } @@ -403,8 +404,7 @@ void DwarfTransformer::handleDie(raw_ostream &OS, CUInfo &CUI, DWARFDie Die) { } FunctionInfo FI; - FI.setStartAddress(Range.LowPC); - FI.setEndAddress(Range.HighPC); + FI.Range = {Range.LowPC, Range.HighPC}; FI.Name = *NameIndex; if (CUI.LineTable) { convertFunctionLineTable(OS, CUI, Die, Gsym, FI); @@ -427,11 +427,28 @@ void DwarfTransformer::handleDie(raw_ostream &OS, CUInfo &CUI, DWARFDie Die) { Error DwarfTransformer::convert(uint32_t NumThreads) { size_t NumBefore = Gsym.getNumFunctionInfos(); + auto getDie = [&](DWARFUnit &DwarfUnit) -> DWARFDie { + DWARFDie ReturnDie = DwarfUnit.getUnitDIE(false); + if (llvm::Optional DWOId = DwarfUnit.getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit.getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit.getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + Log << "warning: Unable to retrieve DWO .debug_info section for " + << DWOName << "\n"; + } else { + ReturnDie = DWOCU->getUnitDIE(false); + } + } + return ReturnDie; + }; if (NumThreads == 1) { // Parse all DWARF data from this thread, use the same string/file table // for everything for (const auto &CU : DICtx.compile_units()) { - DWARFDie Die = CU->getUnitDIE(false); + DWARFDie Die = getDie(*CU); CUInfo CUI(DICtx, dyn_cast(CU.get())); handleDie(Log, CUI, Die); } @@ -456,7 +473,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads) { // Now convert all DWARF to GSYM in a thread pool. std::mutex LogMutex; for (const auto &CU : DICtx.compile_units()) { - DWARFDie Die = CU->getUnitDIE(false /*CUDieOnly*/); + DWARFDie Die = getDie(*CU); if (Die) { CUInfo CUI(DICtx, dyn_cast(CU.get())); pool.async([this, CUI, &LogMutex, Die]() mutable { diff --git a/llvm/lib/DebugInfo/GSYM/ExtractRanges.cpp b/llvm/lib/DebugInfo/GSYM/ExtractRanges.cpp new file mode 100644 index 000000000000..4a42100c86da --- /dev/null +++ b/llvm/lib/DebugInfo/GSYM/ExtractRanges.cpp @@ -0,0 +1,79 @@ +//===- ExtractRanges.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/Support/DataExtractor.h" +#include +#include + +namespace llvm { +namespace gsym { + +void encodeRange(const AddressRange &Range, FileWriter &O, uint64_t BaseAddr) { + assert(Range.start() >= BaseAddr); + O.writeULEB(Range.start() - BaseAddr); + O.writeULEB(Range.size()); +} + +AddressRange decodeRange(DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset) { + const uint64_t AddrOffset = Data.getULEB128(&Offset); + const uint64_t Size = Data.getULEB128(&Offset); + const uint64_t StartAddr = BaseAddr + AddrOffset; + + return {StartAddr, StartAddr + Size}; +} + +void encodeRanges(const AddressRanges &Ranges, FileWriter &O, + uint64_t BaseAddr) { + O.writeULEB(Ranges.size()); + if (Ranges.empty()) + return; + for (auto Range : Ranges) + encodeRange(Range, O, BaseAddr); +} + +void decodeRanges(AddressRanges &Ranges, DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset) { + Ranges.clear(); + uint64_t NumRanges = Data.getULEB128(&Offset); + Ranges.reserve(NumRanges); + for (uint64_t RangeIdx = 0; RangeIdx < NumRanges; RangeIdx++) + Ranges.insert(decodeRange(Data, BaseAddr, Offset)); +} + +void skipRange(DataExtractor &Data, uint64_t &Offset) { + Data.getULEB128(&Offset); + Data.getULEB128(&Offset); +} + +uint64_t skipRanges(DataExtractor &Data, uint64_t &Offset) { + uint64_t NumRanges = Data.getULEB128(&Offset); + for (uint64_t I = 0; I < NumRanges; ++I) + skipRange(Data, Offset); + return NumRanges; +} + +} // namespace gsym + +raw_ostream &operator<<(raw_ostream &OS, const AddressRange &R) { + return OS << '[' << HEX64(R.start()) << " - " << HEX64(R.end()) << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const AddressRanges &AR) { + size_t Size = AR.size(); + for (size_t I = 0; I < Size; ++I) { + if (I) + OS << ' '; + OS << AR[I]; + } + return OS; +} + +} // namespace llvm diff --git a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp index cef1b9498c5c..4f5d240cdf72 100644 --- a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp +++ b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp @@ -36,12 +36,11 @@ raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const FunctionInfo &FI) { llvm::Expected FunctionInfo::decode(DataExtractor &Data, uint64_t BaseAddr) { FunctionInfo FI; - FI.Range.Start = BaseAddr; uint64_t Offset = 0; if (!Data.isValidOffsetForDataOfSize(Offset, 4)) return createStringError(std::errc::io_error, "0x%8.8" PRIx64 ": missing FunctionInfo Size", Offset); - FI.Range.End = FI.Range.Start + Data.getU32(&Offset); + FI.Range = {BaseAddr, BaseAddr + Data.getU32(&Offset)}; if (!Data.isValidOffsetForDataOfSize(Offset, 4)) return createStringError(std::errc::io_error, "0x%8.8" PRIx64 ": missing FunctionInfo Name", Offset); @@ -109,13 +108,13 @@ llvm::Expected FunctionInfo::encode(FileWriter &O) const { // Write the name of this function as a uint32_t string table offset. O.writeU32(Name); - if (OptLineTable.hasValue()) { + if (OptLineTable) { O.writeU32(InfoType::LineTableInfo); // Write a uint32_t length as zero for now, we will fix this up after // writing the LineTable out with the number of bytes that were written. 
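// A minimal sketch of this write-zero-then-fix-up pattern, assuming a
// FileWriter fixup32(Value, Offset) helper alongside the tell()/writeU32()
// calls used here:
//   const uint64_t SizeFieldOffset = O.tell();
//   O.writeU32(0);                  // placeholder length
//   const uint64_t Start = O.tell();
//   /* ...emit the chunk payload... */
//   O.fixup32(uint32_t(O.tell() - Start), SizeFieldOffset);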
O.writeU32(0); const auto StartOffset = O.tell(); - llvm::Error err = OptLineTable->encode(O, Range.Start); + llvm::Error err = OptLineTable->encode(O, Range.start()); if (err) return std::move(err); const auto Length = O.tell() - StartOffset; @@ -127,13 +126,13 @@ llvm::Expected<uint64_t> FunctionInfo::encode(FileWriter &O) const { } // Write out the inline function info if we have any and if it is valid. - if (Inline.hasValue()) { + if (Inline) { O.writeU32(InfoType::InlineInfo); // Write a uint32_t length as zero for now, we will fix this up after // writing the LineTable out with the number of bytes that were written. O.writeU32(0); const auto StartOffset = O.tell(); - llvm::Error err = Inline->encode(O, Range.Start); + llvm::Error err = Inline->encode(O, Range.start()); if (err) return std::move(err); const auto Length = O.tell() - StartOffset; @@ -157,9 +156,8 @@ llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data, uint64_t Addr) { LookupResult LR; LR.LookupAddr = Addr; - LR.FuncRange.Start = FuncAddr; uint64_t Offset = 0; - LR.FuncRange.End = FuncAddr + Data.getU32(&Offset); + LR.FuncRange = {FuncAddr, FuncAddr + Data.getU32(&Offset)}; uint32_t NameOffset = Data.getU32(&Offset); // The "lookup" function doesn't report errors as accurately as the "decode" // function as it is meant to be fast. For more accurate errors we could call diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp index 1c20a59469dc..8281938770cf 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -271,7 +271,7 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) { } } } else if (Prev.Range.size() == 0 && - Curr.Range.contains(Prev.Range.Start)) { + Curr.Range.contains(Prev.Range.start())) { if (!Quiet) { OS << "warning: removing symbol:\n" << Prev << "\nKeeping:\n" @@ -291,8 +291,8 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) { // has no size when doing lookups. 
if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) { if (auto Range = - ValidTextRanges->getRangeThatContains(Funcs.back().Range.Start)) { - Funcs.back().Range.End = Range->End; + ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) { + Funcs.back().Range = {Funcs.back().Range.start(), Range->end()}; } } OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with " diff --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp index 2ad18bf63d5d..0c585cc8d306 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp @@ -48,7 +48,7 @@ llvm::Expected GsymReader::copyBuffer(StringRef Bytes) { llvm::Expected GsymReader::create(std::unique_ptr &MemBuffer) { - if (!MemBuffer.get()) + if (!MemBuffer) return createStringError(std::errc::invalid_argument, "invalid memory buffer"); GsymReader GR(std::move(MemBuffer)); diff --git a/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp b/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp index 21679b1b78aa..f7c4637a8a5b 100644 --- a/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp +++ b/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp @@ -75,7 +75,7 @@ llvm::Optional InlineInfo::getInlineStack(uint64_t Addr static bool skip(DataExtractor &Data, uint64_t &Offset, bool SkippedRanges) { if (!SkippedRanges) { - if (AddressRanges::skip(Data, Offset) == 0) + if (skipRanges(Data, Offset) == 0) return false; } bool HasChildren = Data.getU8(&Offset) != 0; @@ -109,7 +109,7 @@ static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err) { InlineInfo Inline; - Inline.Ranges.decode(Data, BaseAddr, Offset); + decodeRanges(Inline.Ranges, Data, BaseAddr, Offset); if (Inline.Ranges.empty()) return true; // Check if the address is contained within the inline information, and if @@ -128,7 +128,7 @@ static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, if (HasChildren) { // Child address ranges are encoded relative to the first address in the // parent InlineInfo object. - const auto ChildBaseAddr = Inline.Ranges[0].Start; + const auto ChildBaseAddr = Inline.Ranges[0].start(); bool Done = false; while (!Done) Done = lookup(GR, Data, Offset, ChildBaseAddr, Addr, SrcLocs, Err); @@ -150,7 +150,7 @@ static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, SrcLoc.Base = GR.getString(CallFile->Base); SrcLoc.Line = Inline.CallLine; SrcLocs.back().Name = GR.getString(Inline.Name); - SrcLocs.back().Offset = Addr - Inline.Ranges[0].Start; + SrcLocs.back().Offset = Addr - Inline.Ranges[0].start(); SrcLocs.push_back(SrcLoc); } return true; @@ -182,7 +182,7 @@ static llvm::Expected decode(DataExtractor &Data, uint64_t &Offset, if (!Data.isValidOffset(Offset)) return createStringError(std::errc::io_error, "0x%8.8" PRIx64 ": missing InlineInfo address ranges data", Offset); - Inline.Ranges.decode(Data, BaseAddr, Offset); + decodeRanges(Inline.Ranges, Data, BaseAddr, Offset); if (Inline.Ranges.empty()) return Inline; if (!Data.isValidOffsetForDataOfSize(Offset, 1)) @@ -205,7 +205,7 @@ static llvm::Expected decode(DataExtractor &Data, uint64_t &Offset, if (HasChildren) { // Child address ranges are encoded relative to the first address in the // parent InlineInfo object. 
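// (Sketch with hypothetical numbers: if the parent's first range starts at
// 0x4000, a child range [0x4010, 0x4020) is encoded as ULEB offset 0x10
// from that base followed by ULEB size 0x10.)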
- const auto ChildBaseAddr = Inline.Ranges[0].Start; + const auto ChildBaseAddr = Inline.Ranges[0].start(); while (true) { llvm::Expected Child = decode(Data, Offset, ChildBaseAddr); if (!Child) @@ -232,7 +232,7 @@ llvm::Error InlineInfo::encode(FileWriter &O, uint64_t BaseAddr) const { if (!isValid()) return createStringError(std::errc::invalid_argument, "attempted to encode invalid InlineInfo object"); - Ranges.encode(O, BaseAddr); + encodeRanges(Ranges, O, BaseAddr); bool HasChildren = !Children.empty(); O.writeU8(HasChildren); O.writeU32(Name); @@ -242,7 +242,7 @@ llvm::Error InlineInfo::encode(FileWriter &O, uint64_t BaseAddr) const { // Child address ranges are encoded as relative to the first // address in the Ranges for this object. This keeps the offsets // small and allows for efficient encoding using ULEB offsets. - const uint64_t ChildBaseAddr = Ranges[0].Start; + const uint64_t ChildBaseAddr = Ranges[0].start(); for (const auto &Child : Children) { // Make sure all child address ranges are contained in the parent address // ranges. diff --git a/llvm/lib/DebugInfo/GSYM/LookupResult.cpp b/llvm/lib/DebugInfo/GSYM/LookupResult.cpp index 8a624226b1d3..00a5b1bbfaa5 100644 --- a/llvm/lib/DebugInfo/GSYM/LookupResult.cpp +++ b/llvm/lib/DebugInfo/GSYM/LookupResult.cpp @@ -8,6 +8,7 @@ #include "llvm/DebugInfo/GSYM/LookupResult.h" #include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include "llvm/Support/Format.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" @@ -42,7 +43,7 @@ raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const SourceLocation &SL) { OS << " @ "; if (!SL.Dir.empty()) { OS << SL.Dir; - if (SL.Dir.contains('\\') and not SL.Dir.contains('/')) + if (SL.Dir.contains('\\') && !SL.Dir.contains('/')) OS << '\\'; else OS << '/'; diff --git a/llvm/lib/DebugInfo/GSYM/Range.cpp b/llvm/lib/DebugInfo/GSYM/Range.cpp deleted file mode 100644 index c1e8eccd0daa..000000000000 --- a/llvm/lib/DebugInfo/GSYM/Range.cpp +++ /dev/null @@ -1,123 +0,0 @@ -//===- Range.cpp ------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/GSYM/Range.h" -#include "llvm/DebugInfo/GSYM/FileWriter.h" -#include "llvm/Support/DataExtractor.h" -#include -#include - -using namespace llvm; -using namespace gsym; - - -void AddressRanges::insert(AddressRange Range) { - if (Range.size() == 0) - return; - - auto It = llvm::upper_bound(Ranges, Range); - auto It2 = It; - while (It2 != Ranges.end() && It2->Start < Range.End) - ++It2; - if (It != It2) { - Range.End = std::max(Range.End, It2[-1].End); - It = Ranges.erase(It, It2); - } - if (It != Ranges.begin() && Range.Start < It[-1].End) - It[-1].End = std::max(It[-1].End, Range.End); - else - Ranges.insert(It, Range); -} - -bool AddressRanges::contains(uint64_t Addr) const { - auto It = std::partition_point( - Ranges.begin(), Ranges.end(), - [=](const AddressRange &R) { return R.Start <= Addr; }); - return It != Ranges.begin() && Addr < It[-1].End; -} - -bool AddressRanges::contains(AddressRange Range) const { - if (Range.size() == 0) - return false; - auto It = std::partition_point( - Ranges.begin(), Ranges.end(), - [=](const AddressRange &R) { return R.Start <= Range.Start; }); - if (It == Ranges.begin()) - return false; - return Range.End <= It[-1].End; -} - -Optional<AddressRange> -AddressRanges::getRangeThatContains(uint64_t Addr) const { - auto It = std::partition_point( - Ranges.begin(), Ranges.end(), - [=](const AddressRange &R) { return R.Start <= Addr; }); - if (It != Ranges.begin() && Addr < It[-1].End) - return It[-1]; - return llvm::None; -} - -raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const AddressRange &R) { - return OS << '[' << HEX64(R.Start) << " - " << HEX64(R.End) << ")"; -} - -raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const AddressRanges &AR) { - size_t Size = AR.size(); - for (size_t I = 0; I < Size; ++I) { - if (I) - OS << ' '; - OS << AR[I]; - } - return OS; -} - -void AddressRange::encode(FileWriter &O, uint64_t BaseAddr) const { - assert(Start >= BaseAddr); - O.writeULEB(Start - BaseAddr); - O.writeULEB(size()); -} - -void AddressRange::decode(DataExtractor &Data, uint64_t BaseAddr, - uint64_t &Offset) { - const uint64_t AddrOffset = Data.getULEB128(&Offset); - const uint64_t Size = Data.getULEB128(&Offset); - const uint64_t StartAddr = BaseAddr + AddrOffset; - Start = StartAddr; - End = StartAddr + Size; -} - -void AddressRanges::encode(FileWriter &O, uint64_t BaseAddr) const { - O.writeULEB(Ranges.size()); - if (Ranges.empty()) - return; - for (auto Range : Ranges) - Range.encode(O, BaseAddr); -} - -void AddressRanges::decode(DataExtractor &Data, uint64_t BaseAddr, - uint64_t &Offset) { - clear(); - uint64_t NumRanges = Data.getULEB128(&Offset); - if (NumRanges == 0) - return; - Ranges.resize(NumRanges); - for (auto &Range : Ranges) - Range.decode(Data, BaseAddr, Offset); -} - -void AddressRange::skip(DataExtractor &Data, uint64_t &Offset) { - Data.getULEB128(&Offset); - Data.getULEB128(&Offset); -} - -uint64_t AddressRanges::skip(DataExtractor &Data, uint64_t &Offset) { - uint64_t NumRanges = Data.getULEB128(&Offset); - for (uint64_t I=0; I<NumRanges; ++I) - AddressRange::skip(Data, Offset); - return NumRanges; -} diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp index 4eb16804171d..1a2267334049 100644 --- a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp @@ -9,7 +9,6 @@ #include "llvm/DebugInfo/PDB/Native/DbiStream.h" #include "llvm/ADT/StringRef.h" 
#include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" #include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" @@ -20,7 +19,6 @@ #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Error.h" -#include #include #include diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index 0584966a98c5..3a719bd07c8a 100644 --- a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/MSF/MSFBuilder.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h" -#include "llvm/DebugInfo/PDB/Native/DbiStream.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Object/COFF.h" #include "llvm/Support/BinaryStreamWriter.h" @@ -30,7 +29,7 @@ DbiStreamBuilder::DbiStreamBuilder(msf::MSFBuilder &Msf) PdbDllVersion(0), PdbDllRbld(0), Flags(0), MachineType(PDB_Machine::x86), Header(nullptr) {} -DbiStreamBuilder::~DbiStreamBuilder() {} +DbiStreamBuilder::~DbiStreamBuilder() = default; void DbiStreamBuilder::setVersionHeader(PdbRaw_DbiVer V) { VerHeader = V; } @@ -72,7 +71,7 @@ void DbiStreamBuilder::setPublicsStreamIndex(uint32_t Index) { } void DbiStreamBuilder::addNewFpoData(const codeview::FrameData &FD) { - if (!NewFpoData.hasValue()) + if (!NewFpoData) NewFpoData.emplace(false); NewFpoData->addFrameData(FD); @@ -286,7 +285,7 @@ Error DbiStreamBuilder::finalize() { } Error DbiStreamBuilder::finalizeMsfLayout() { - if (NewFpoData.hasValue()) { + if (NewFpoData) { DbgStreams[(int)DbgHeaderType::NewFPO].emplace(); DbgStreams[(int)DbgHeaderType::NewFPO]->Size = NewFpoData->calculateSerializedSize(); @@ -307,7 +306,7 @@ Error DbiStreamBuilder::finalizeMsfLayout() { } for (auto &S : DbgStreams) { - if (!S.hasValue()) + if (!S) continue; auto ExpectedIndex = Msf.addStream(S->Size); if (!ExpectedIndex) @@ -428,14 +427,14 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout, for (auto &Stream : DbgStreams) { uint16_t StreamNumber = kInvalidStreamIndex; - if (Stream.hasValue()) + if (Stream) StreamNumber = Stream->StreamNumber; if (auto EC = Writer.writeInteger(StreamNumber)) return EC; } for (auto &Stream : DbgStreams) { - if (!Stream.hasValue()) + if (!Stream) continue; assert(Stream->StreamNumber != kInvalidStreamIndex); diff --git a/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp b/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp index 37192ba36a04..32bad9cea7ce 100644 --- a/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp @@ -8,6 +8,7 @@ #include "llvm/DebugInfo/PDB/Native/EnumTables.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" +#include "llvm/Support/ScopedPrinter.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/Native/FormatUtil.cpp b/llvm/lib/DebugInfo/PDB/Native/FormatUtil.cpp new file mode 100644 index 000000000000..a167d45982a9 --- /dev/null +++ b/llvm/lib/DebugInfo/PDB/Native/FormatUtil.cpp @@ -0,0 +1,207 @@ +//===- FormatUtil.cpp ----------------------------------------- *- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/Support/FormatAdapters.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::pdb; + +std::string llvm::pdb::typesetItemList(ArrayRef Opts, + uint32_t IndentLevel, uint32_t GroupSize, + StringRef Sep) { + std::string Result; + while (!Opts.empty()) { + ArrayRef ThisGroup; + ThisGroup = Opts.take_front(GroupSize); + Opts = Opts.drop_front(ThisGroup.size()); + Result += join(ThisGroup, Sep); + if (!Opts.empty()) { + Result += Sep; + Result += "\n"; + Result += std::string(formatv("{0}", fmt_repeat(' ', IndentLevel))); + } + } + return Result; +} + +std::string llvm::pdb::typesetStringList(uint32_t IndentLevel, + ArrayRef Strings) { + std::string Result = "["; + for (const auto &S : Strings) { + Result += std::string(formatv("\n{0}{1}", fmt_repeat(' ', IndentLevel), S)); + } + Result += "]"; + return Result; +} + +std::string llvm::pdb::formatChunkKind(DebugSubsectionKind Kind, + bool Friendly) { + if (Friendly) { + switch (Kind) { + RETURN_CASE(DebugSubsectionKind, None, "none"); + RETURN_CASE(DebugSubsectionKind, Symbols, "symbols"); + RETURN_CASE(DebugSubsectionKind, Lines, "lines"); + RETURN_CASE(DebugSubsectionKind, StringTable, "strings"); + RETURN_CASE(DebugSubsectionKind, FileChecksums, "checksums"); + RETURN_CASE(DebugSubsectionKind, FrameData, "frames"); + RETURN_CASE(DebugSubsectionKind, InlineeLines, "inlinee lines"); + RETURN_CASE(DebugSubsectionKind, CrossScopeImports, "xmi"); + RETURN_CASE(DebugSubsectionKind, CrossScopeExports, "xme"); + RETURN_CASE(DebugSubsectionKind, ILLines, "il lines"); + RETURN_CASE(DebugSubsectionKind, FuncMDTokenMap, "func md token map"); + RETURN_CASE(DebugSubsectionKind, TypeMDTokenMap, "type md token map"); + RETURN_CASE(DebugSubsectionKind, MergedAssemblyInput, + "merged assembly input"); + RETURN_CASE(DebugSubsectionKind, CoffSymbolRVA, "coff symbol rva"); + } + } else { + switch (Kind) { + RETURN_CASE(DebugSubsectionKind, None, "none"); + RETURN_CASE(DebugSubsectionKind, Symbols, "DEBUG_S_SYMBOLS"); + RETURN_CASE(DebugSubsectionKind, Lines, "DEBUG_S_LINES"); + RETURN_CASE(DebugSubsectionKind, StringTable, "DEBUG_S_STRINGTABLE"); + RETURN_CASE(DebugSubsectionKind, FileChecksums, "DEBUG_S_FILECHKSMS"); + RETURN_CASE(DebugSubsectionKind, FrameData, "DEBUG_S_FRAMEDATA"); + RETURN_CASE(DebugSubsectionKind, InlineeLines, "DEBUG_S_INLINEELINES"); + RETURN_CASE(DebugSubsectionKind, CrossScopeImports, + "DEBUG_S_CROSSSCOPEIMPORTS"); + RETURN_CASE(DebugSubsectionKind, CrossScopeExports, + "DEBUG_S_CROSSSCOPEEXPORTS"); + RETURN_CASE(DebugSubsectionKind, ILLines, "DEBUG_S_IL_LINES"); + RETURN_CASE(DebugSubsectionKind, FuncMDTokenMap, + "DEBUG_S_FUNC_MDTOKEN_MAP"); + RETURN_CASE(DebugSubsectionKind, TypeMDTokenMap, + "DEBUG_S_TYPE_MDTOKEN_MAP"); + RETURN_CASE(DebugSubsectionKind, MergedAssemblyInput, + "DEBUG_S_MERGED_ASSEMBLYINPUT"); + RETURN_CASE(DebugSubsectionKind, CoffSymbolRVA, + "DEBUG_S_COFF_SYMBOL_RVA"); + } + } + return formatUnknownEnum(Kind); +} + +std::string llvm::pdb::formatSymbolKind(SymbolKind K) { + switch (uint32_t(K)) { +#define SYMBOL_RECORD(EnumName, value, name) \ + case EnumName: \ + return #EnumName; 
+#define CV_SYMBOL(EnumName, value) SYMBOL_RECORD(EnumName, value, EnumName) +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" + } + return formatUnknownEnum(K); +} + +std::string llvm::pdb::formatTypeLeafKind(TypeLeafKind K) { + switch (K) { +#define TYPE_RECORD(EnumName, value, name) \ + case EnumName: \ + return #EnumName; +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" + default: + return formatv("UNKNOWN RECORD ({0:X})", + static_cast>(K)) + .str(); + } +} + +std::string llvm::pdb::formatSegmentOffset(uint16_t Segment, uint32_t Offset) { + return std::string(formatv("{0:4}:{1:4}", Segment, Offset)); +} + +#define PUSH_CHARACTERISTIC_FLAG(Enum, TheOpt, Value, Style, Descriptive) \ + PUSH_FLAG(Enum, TheOpt, Value, \ + ((Style == CharacteristicStyle::HeaderDefinition) ? #TheOpt \ + : Descriptive)) + +#define PUSH_MASKED_CHARACTERISTIC_FLAG(Enum, Mask, TheOpt, Value, Style, \ + Descriptive) \ + PUSH_MASKED_FLAG(Enum, Mask, TheOpt, Value, \ + ((Style == CharacteristicStyle::HeaderDefinition) \ + ? #TheOpt \ + : Descriptive)) + +std::string llvm::pdb::formatSectionCharacteristics(uint32_t IndentLevel, + uint32_t C, + uint32_t FlagsPerLine, + StringRef Separator, + CharacteristicStyle Style) { + using SC = COFF::SectionCharacteristics; + std::vector Opts; + if (C == COFF::SC_Invalid) + return "invalid"; + if (C == 0) + return "none"; + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_TYPE_NOLOAD, C, Style, "noload"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_TYPE_NO_PAD, C, Style, "no padding"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_CODE, C, Style, "code"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_INITIALIZED_DATA, C, Style, + "initialized data"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_UNINITIALIZED_DATA, C, Style, + "uninitialized data"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_OTHER, C, Style, "other"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_INFO, C, Style, "info"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_REMOVE, C, Style, "remove"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_COMDAT, C, Style, "comdat"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_GPREL, C, Style, "gp rel"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_PURGEABLE, C, Style, "purgeable"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_16BIT, C, Style, "16-bit"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_LOCKED, C, Style, "locked"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_PRELOAD, C, Style, "preload"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_1BYTES, C, + Style, "1 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_2BYTES, C, + Style, "2 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_4BYTES, C, + Style, "4 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_8BYTES, C, + Style, "8 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_16BYTES, C, + Style, "16 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_32BYTES, C, + Style, "32 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_64BYTES, C, + Style, "64 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_128BYTES, C, + Style, "128 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_256BYTES, C, + Style, "256 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_512BYTES, C, + Style, "512 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_1024BYTES, C, 
+ Style, "1024 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_2048BYTES, C, + Style, "2048 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_4096BYTES, C, + Style, "4096 byte align"); + PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_8192BYTES, C, + Style, "8192 byte align"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_NRELOC_OVFL, C, Style, + "noreloc overflow"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_DISCARDABLE, C, Style, + "discardable"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_NOT_CACHED, C, Style, + "not cached"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_NOT_PAGED, C, Style, "not paged"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_SHARED, C, Style, "shared"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_EXECUTE, C, Style, + "execute permissions"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_READ, C, Style, + "read permissions"); + PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_WRITE, C, Style, + "write permissions"); + return typesetItemList(Opts, IndentLevel, FlagsPerLine, Separator); +} diff --git a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp index 9084e689d165..262873c6e6ab 100644 --- a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp @@ -14,7 +14,7 @@ #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" #include "llvm/DebugInfo/CodeView/RecordName.h" -#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" #include "llvm/DebugInfo/MSF/MSFBuilder.h" @@ -22,6 +22,7 @@ #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/Support/BinaryItemStream.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Parallel.h" @@ -196,7 +197,7 @@ void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) { void GSIHashStreamBuilder::finalizeBuckets( uint32_t RecordZeroOffset, MutableArrayRef Records) { // Hash every name in parallel. - parallelForEachN(0, Records.size(), [&](size_t I) { + parallelFor(0, Records.size(), [&](size_t I) { Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH); }); @@ -231,7 +232,7 @@ void GSIHashStreamBuilder::finalizeBuckets( // bucket can properly early-out when it detects the record won't be found. // The algorithm used here corresponds to the function // caseInsensitiveComparePchPchCchCch in the reference implementation. 
- parallelForEachN(0, IPHR_HASH, [&](size_t I) { + parallelFor(0, IPHR_HASH, [&](size_t I) { auto B = HashRecords.begin() + BucketStarts[I]; auto E = HashRecords.begin() + BucketCursors[I]; if (B == E) @@ -286,7 +287,7 @@ GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) : Msf(Msf), PSH(std::make_unique()), GSH(std::make_unique()) {} -GSIStreamBuilder::~GSIStreamBuilder() {} +GSIStreamBuilder::~GSIStreamBuilder() = default; uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { uint32_t Size = 0; diff --git a/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp b/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp index f27d60f46815..7217fe38be55 100644 --- a/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp @@ -21,6 +21,7 @@ #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/CodeView/RecordName.h" +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/SymbolStream.h" @@ -141,14 +142,12 @@ readGSIHashBuckets(FixedStreamArray &HashBuckets, return joinErrors(std::move(EC), make_error(raw_error_code::corrupt_file, "Could not read a bitmap.")); - uint32_t NumBuckets1 = 0; uint32_t CompressedBucketIdx = 0; for (uint32_t I = 0; I <= IPHR_HASH; ++I) { uint8_t WordIdx = I / 32; uint8_t BitIdx = I % 32; bool IsSet = HashBitmap[WordIdx] & (1U << BitIdx); if (IsSet) { - ++NumBuckets1; BucketMap[I] = CompressedBucketIdx++; } else { BucketMap[I] = -1; diff --git a/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp b/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp index dfdcdf1f4eaf..030a59821914 100644 --- a/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp @@ -7,14 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/HashTable.h" -#include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Error.h" #include "llvm/Support/MathExtras.h" -#include -#include #include #include diff --git a/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp b/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp index f41bb32d69af..927a0ffee28c 100644 --- a/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/InfoStream.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" @@ -16,7 +14,7 @@ using namespace llvm; using namespace llvm::codeview; -using namespace llvm::msf; +// using namespace llvm::msf; using namespace llvm::pdb; InfoStream::InfoStream(std::unique_ptr Stream) diff --git a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp index 42daa7cae799..e8f5a451b08e 100644 --- a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp @@ -10,11 +10,9 @@ #include "llvm/DebugInfo/MSF/MSFBuilder.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" 
-#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" using namespace llvm; diff --git a/llvm/lib/DebugInfo/PDB/Native/InjectedSourceStream.cpp b/llvm/lib/DebugInfo/PDB/Native/InjectedSourceStream.cpp index 3f4101db7b93..f1e8adeb1b21 100644 --- a/llvm/lib/DebugInfo/PDB/Native/InjectedSourceStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/InjectedSourceStream.cpp @@ -9,7 +9,7 @@ #include "llvm/DebugInfo/PDB/Native/InjectedSourceStream.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/Hash.h" +#include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp new file mode 100644 index 000000000000..495b25077737 --- /dev/null +++ b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp @@ -0,0 +1,587 @@ +//===- InputFile.cpp ------------------------------------------ *- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/PDB/Native/InputFile.h" + +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" +#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" +#include "llvm/DebugInfo/PDB/Native/DbiStream.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" +#include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/DebugInfo/PDB/Native/TpiStream.h" +#include "llvm/DebugInfo/PDB/PDB.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::object; +using namespace llvm::pdb; + +InputFile::InputFile() = default; +InputFile::~InputFile() = default; + +Expected +llvm::pdb::getModuleDebugStream(PDBFile &File, StringRef &ModuleName, + uint32_t Index) { + Expected DbiOrErr = File.getPDBDbiStream(); + if (!DbiOrErr) + return DbiOrErr.takeError(); + DbiStream &Dbi = *DbiOrErr; + const auto &Modules = Dbi.modules(); + if (Index >= Modules.getModuleCount()) + return make_error(raw_error_code::index_out_of_bounds, + "Invalid module index"); + + auto Modi = Modules.getModuleDescriptor(Index); + + ModuleName = Modi.getModuleName(); + + uint16_t ModiStream = Modi.getModuleStreamIndex(); + if (ModiStream == kInvalidStreamIndex) + return make_error(raw_error_code::no_stream, + "Module stream not present"); + + auto ModStreamData = File.createIndexedStream(ModiStream); + + ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData)); + if (auto EC = ModS.reload()) + return make_error(raw_error_code::corrupt_file, + "Invalid module stream"); + + return std::move(ModS); +} + +Expected 
llvm::pdb::getModuleDebugStream(PDBFile &File, + uint32_t Index) { + Expected DbiOrErr = File.getPDBDbiStream(); + if (!DbiOrErr) + return DbiOrErr.takeError(); + DbiStream &Dbi = *DbiOrErr; + const auto &Modules = Dbi.modules(); + auto Modi = Modules.getModuleDescriptor(Index); + + uint16_t ModiStream = Modi.getModuleStreamIndex(); + if (ModiStream == kInvalidStreamIndex) + return make_error(raw_error_code::no_stream, + "Module stream not present"); + + auto ModStreamData = File.createIndexedStream(ModiStream); + + ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData)); + if (Error Err = ModS.reload()) + return make_error(raw_error_code::corrupt_file, + "Invalid module stream"); + + return std::move(ModS); +} + +static inline bool isCodeViewDebugSubsection(object::SectionRef Section, + StringRef Name, + BinaryStreamReader &Reader) { + if (Expected NameOrErr = Section.getName()) { + if (*NameOrErr != Name) + return false; + } else { + consumeError(NameOrErr.takeError()); + return false; + } + + Expected ContentsOrErr = Section.getContents(); + if (!ContentsOrErr) { + consumeError(ContentsOrErr.takeError()); + return false; + } + + Reader = BinaryStreamReader(*ContentsOrErr, support::little); + uint32_t Magic; + if (Reader.bytesRemaining() < sizeof(uint32_t)) + return false; + cantFail(Reader.readInteger(Magic)); + if (Magic != COFF::DEBUG_SECTION_MAGIC) + return false; + return true; +} + +static inline bool isDebugSSection(object::SectionRef Section, + DebugSubsectionArray &Subsections) { + BinaryStreamReader Reader; + if (!isCodeViewDebugSubsection(Section, ".debug$S", Reader)) + return false; + + cantFail(Reader.readArray(Subsections, Reader.bytesRemaining())); + return true; +} + +static bool isDebugTSection(SectionRef Section, CVTypeArray &Types) { + BinaryStreamReader Reader; + if (!isCodeViewDebugSubsection(Section, ".debug$T", Reader) && + !isCodeViewDebugSubsection(Section, ".debug$P", Reader)) + return false; + cantFail(Reader.readArray(Types, Reader.bytesRemaining())); + return true; +} + +static std::string formatChecksumKind(FileChecksumKind Kind) { + switch (Kind) { + RETURN_CASE(FileChecksumKind, None, "None"); + RETURN_CASE(FileChecksumKind, MD5, "MD5"); + RETURN_CASE(FileChecksumKind, SHA1, "SHA-1"); + RETURN_CASE(FileChecksumKind, SHA256, "SHA-256"); + } + return formatUnknownEnum(Kind); +} + +template +static void formatInternal(LinePrinter &Printer, bool Append, Args &&...args) { + if (Append) + Printer.format(std::forward(args)...); + else + Printer.formatLine(std::forward(args)...); +} + +SymbolGroup::SymbolGroup(InputFile *File, uint32_t GroupIndex) : File(File) { + if (!File) + return; + + if (File->isPdb()) + initializeForPdb(GroupIndex); + else { + Name = ".debug$S"; + uint32_t I = 0; + for (const auto &S : File->obj().sections()) { + DebugSubsectionArray SS; + if (!isDebugSSection(S, SS)) + continue; + + if (!SC.hasChecksums() || !SC.hasStrings()) + SC.initialize(SS); + + if (I == GroupIndex) + Subsections = SS; + + if (SC.hasChecksums() && SC.hasStrings()) + break; + } + rebuildChecksumMap(); + } +} + +StringRef SymbolGroup::name() const { return Name; } + +void SymbolGroup::updateDebugS(const codeview::DebugSubsectionArray &SS) { + Subsections = SS; +} + +void SymbolGroup::updatePdbModi(uint32_t Modi) { initializeForPdb(Modi); } + +void SymbolGroup::initializeForPdb(uint32_t Modi) { + assert(File && File->isPdb()); + + // PDB always uses the same string table, but each module has its own + // checksums. So we only set the strings if they're not already set. 
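+ // A minimal usage sketch of getModuleDebugStream() as defined above,
+ // assuming `File` is a loaded PDBFile whose module 0 has a debug stream:
+ //   StringRef ModuleName;
+ //   if (Expected<ModuleDebugStreamRef> ModS =
+ //           getModuleDebugStream(File, ModuleName, /*Index=*/0))
+ //     /* use ModS->getSubsectionsArray() */;
+ //   else
+ //     consumeError(ModS.takeError());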
+ if (!SC.hasStrings()) { + auto StringTable = File->pdb().getStringTable(); + if (StringTable) + SC.setStrings(StringTable->getStringTable()); + else + consumeError(StringTable.takeError()); + } + + SC.resetChecksums(); + auto MDS = getModuleDebugStream(File->pdb(), Name, Modi); + if (!MDS) { + consumeError(MDS.takeError()); + return; + } + + DebugStream = std::make_shared(std::move(*MDS)); + Subsections = DebugStream->getSubsectionsArray(); + SC.initialize(Subsections); + rebuildChecksumMap(); +} + +void SymbolGroup::rebuildChecksumMap() { + if (!SC.hasChecksums()) + return; + + for (const auto &Entry : SC.checksums()) { + auto S = SC.strings().getString(Entry.FileNameOffset); + if (!S) + continue; + ChecksumsByFile[*S] = Entry; + } +} + +const ModuleDebugStreamRef &SymbolGroup::getPdbModuleStream() const { + assert(File && File->isPdb() && DebugStream); + return *DebugStream; +} + +Expected SymbolGroup::getNameFromStringTable(uint32_t Offset) const { + return SC.strings().getString(Offset); +} + +Expected SymbolGroup::getNameFromChecksums(uint32_t Offset) const { + StringRef Name; + if (!SC.hasChecksums()) { + return std::move(Name); + } + + auto Iter = SC.checksums().getArray().at(Offset); + if (Iter == SC.checksums().getArray().end()) { + return std::move(Name); + } + + uint32_t FO = Iter->FileNameOffset; + auto ExpectedFile = getNameFromStringTable(FO); + if (!ExpectedFile) { + return std::move(Name); + } + + return *ExpectedFile; +} + +void SymbolGroup::formatFromFileName(LinePrinter &Printer, StringRef File, + bool Append) const { + auto FC = ChecksumsByFile.find(File); + if (FC == ChecksumsByFile.end()) { + formatInternal(Printer, Append, "- (no checksum) {0}", File); + return; + } + + formatInternal(Printer, Append, "- ({0}: {1}) {2}", + formatChecksumKind(FC->getValue().Kind), + toHex(FC->getValue().Checksum), File); +} + +void SymbolGroup::formatFromChecksumsOffset(LinePrinter &Printer, + uint32_t Offset, + bool Append) const { + if (!SC.hasChecksums()) { + formatInternal(Printer, Append, "(unknown file name offset {0})", Offset); + return; + } + + auto Iter = SC.checksums().getArray().at(Offset); + if (Iter == SC.checksums().getArray().end()) { + formatInternal(Printer, Append, "(unknown file name offset {0})", Offset); + return; + } + + uint32_t FO = Iter->FileNameOffset; + auto ExpectedFile = getNameFromStringTable(FO); + if (!ExpectedFile) { + formatInternal(Printer, Append, "(unknown file name offset {0})", Offset); + consumeError(ExpectedFile.takeError()); + return; + } + if (Iter->Kind == FileChecksumKind::None) { + formatInternal(Printer, Append, "{0} (no checksum)", *ExpectedFile); + } else { + formatInternal(Printer, Append, "{0} ({1}: {2})", *ExpectedFile, + formatChecksumKind(Iter->Kind), toHex(Iter->Checksum)); + } +} + +Expected InputFile::open(StringRef Path, bool AllowUnknownFile) { + InputFile IF; + if (!llvm::sys::fs::exists(Path)) + return make_error(formatv("File {0} not found", Path), + inconvertibleErrorCode()); + + file_magic Magic; + if (auto EC = identify_magic(Path, Magic)) + return make_error( + formatv("Unable to identify file type for file {0}", Path), EC); + + if (Magic == file_magic::coff_object) { + Expected> BinaryOrErr = createBinary(Path); + if (!BinaryOrErr) + return BinaryOrErr.takeError(); + + IF.CoffObject = std::move(*BinaryOrErr); + IF.PdbOrObj = llvm::cast(IF.CoffObject.getBinary()); + return std::move(IF); + } + + if (Magic == file_magic::pdb) { + std::unique_ptr Session; + if (auto Err = loadDataForPDB(PDB_ReaderType::Native, Path, 
Session))
+      return std::move(Err);
+
+    IF.PdbSession.reset(static_cast<NativeSession *>(Session.release()));
+    IF.PdbOrObj = &IF.PdbSession->getPDBFile();
+
+    return std::move(IF);
+  }
+
+  if (!AllowUnknownFile)
+    return make_error<StringError>(
+        formatv("File {0} is not a supported file type", Path),
+        inconvertibleErrorCode());
+
+  auto Result = MemoryBuffer::getFile(Path, /*IsText=*/false,
+                                      /*RequiresNullTerminator=*/false);
+  if (!Result)
+    return make_error<StringError>(
+        formatv("File {0} could not be opened", Path), Result.getError());
+
+  IF.UnknownFile = std::move(*Result);
+  IF.PdbOrObj = IF.UnknownFile.get();
+  return std::move(IF);
+}
+
+PDBFile &InputFile::pdb() {
+  assert(isPdb());
+  return *PdbOrObj.get<PDBFile *>();
+}
+
+const PDBFile &InputFile::pdb() const {
+  assert(isPdb());
+  return *PdbOrObj.get<PDBFile *>();
+}
+
+object::COFFObjectFile &InputFile::obj() {
+  assert(isObj());
+  return *PdbOrObj.get<object::COFFObjectFile *>();
+}
+
+const object::COFFObjectFile &InputFile::obj() const {
+  assert(isObj());
+  return *PdbOrObj.get<object::COFFObjectFile *>();
+}
+
+MemoryBuffer &InputFile::unknown() {
+  assert(isUnknown());
+  return *PdbOrObj.get<MemoryBuffer *>();
+}
+
+const MemoryBuffer &InputFile::unknown() const {
+  assert(isUnknown());
+  return *PdbOrObj.get<MemoryBuffer *>();
+}
+
+StringRef InputFile::getFilePath() const {
+  if (isPdb())
+    return pdb().getFilePath();
+  if (isObj())
+    return obj().getFileName();
+  assert(isUnknown());
+  return unknown().getBufferIdentifier();
+}
+
+bool InputFile::hasTypes() const {
+  if (isPdb())
+    return pdb().hasPDBTpiStream();
+
+  for (const auto &Section : obj().sections()) {
+    CVTypeArray Types;
+    if (isDebugTSection(Section, Types))
+      return true;
+  }
+  return false;
+}
+
+bool InputFile::hasIds() const {
+  if (isObj())
+    return false;
+  return pdb().hasPDBIpiStream();
+}
+
+bool InputFile::isPdb() const { return PdbOrObj.is<PDBFile *>(); }
+
+bool InputFile::isObj() const {
+  return PdbOrObj.is<object::COFFObjectFile *>();
+}
+
+bool InputFile::isUnknown() const { return PdbOrObj.is<MemoryBuffer *>(); }
+
+codeview::LazyRandomTypeCollection &
+InputFile::getOrCreateTypeCollection(TypeCollectionKind Kind) {
+  if (Types && Kind == kTypes)
+    return *Types;
+  if (Ids && Kind == kIds)
+    return *Ids;
+
+  if (Kind == kIds) {
+    assert(isPdb() && pdb().hasPDBIpiStream());
+  }
+
+  // If the collection was already initialized, we should have just returned it
+  // in step 1.
+  if (isPdb()) {
+    TypeCollectionPtr &Collection = (Kind == kIds) ? Ids : Types;
+    auto &Stream = cantFail((Kind == kIds) ? pdb().getPDBIpiStream()
+                                           : pdb().getPDBTpiStream());
+
+    auto &Array = Stream.typeArray();
+    uint32_t Count = Stream.getNumTypeRecords();
+    auto Offsets = Stream.getTypeIndexOffsets();
+    Collection =
+        std::make_unique<LazyRandomTypeCollection>(Array, Count, Offsets);
+    return *Collection;
+  }
+
+  assert(isObj());
+  assert(Kind == kTypes);
+  assert(!Types);
+
+  for (const auto &Section : obj().sections()) {
+    CVTypeArray Records;
+    if (!isDebugTSection(Section, Records))
+      continue;
+
+    Types = std::make_unique<LazyRandomTypeCollection>(Records, 100);
+    return *Types;
+  }
+
+  Types = std::make_unique<LazyRandomTypeCollection>(100);
+  return *Types;
+}
+
+codeview::LazyRandomTypeCollection &InputFile::types() {
+  return getOrCreateTypeCollection(kTypes);
+}
+
+codeview::LazyRandomTypeCollection &InputFile::ids() {
+  // Object files have only one type stream that contains both types and ids.
+  // Similarly, some PDBs don't contain an IPI stream, and for those both types
+  // and IDs are in the same stream.
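+  // [Editor's illustrative aside, not part of the original patch] A caller
+  // can therefore request both collections without special-casing the two
+  // layouts; a minimal hypothetical usage sketch:
+  //   codeview::LazyRandomTypeCollection &TpiTypes = File.types();
+  //   codeview::LazyRandomTypeCollection &IpiIds = File.ids();
+  // For an object file, or for a PDB with no IPI stream, IpiIds simply
+  // aliases TpiTypes.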
+  if (isObj() || !pdb().hasPDBIpiStream())
+    return types();
+
+  return getOrCreateTypeCollection(kIds);
+}
+
+iterator_range<SymbolGroupIterator> InputFile::symbol_groups() {
+  return make_range<SymbolGroupIterator>(symbol_groups_begin(),
+                                         symbol_groups_end());
+}
+
+SymbolGroupIterator InputFile::symbol_groups_begin() {
+  return SymbolGroupIterator(*this);
+}
+
+SymbolGroupIterator InputFile::symbol_groups_end() {
+  return SymbolGroupIterator();
+}
+
+SymbolGroupIterator::SymbolGroupIterator() : Value(nullptr) {}
+
+SymbolGroupIterator::SymbolGroupIterator(InputFile &File) : Value(&File) {
+  if (File.isObj()) {
+    SectionIter = File.obj().section_begin();
+    scanToNextDebugS();
+  }
+}
+
+bool SymbolGroupIterator::operator==(const SymbolGroupIterator &R) const {
+  bool E = isEnd();
+  bool RE = R.isEnd();
+  if (E || RE)
+    return E == RE;
+
+  if (Value.File != R.Value.File)
+    return false;
+  return Index == R.Index;
+}
+
+const SymbolGroup &SymbolGroupIterator::operator*() const {
+  assert(!isEnd());
+  return Value;
+}
+SymbolGroup &SymbolGroupIterator::operator*() {
+  assert(!isEnd());
+  return Value;
+}
+
+SymbolGroupIterator &SymbolGroupIterator::operator++() {
+  assert(Value.File && !isEnd());
+  ++Index;
+  if (isEnd())
+    return *this;
+
+  if (Value.File->isPdb()) {
+    Value.updatePdbModi(Index);
+    return *this;
+  }
+
+  scanToNextDebugS();
+  return *this;
+}
+
+void SymbolGroupIterator::scanToNextDebugS() {
+  assert(SectionIter);
+  auto End = Value.File->obj().section_end();
+  auto &Iter = *SectionIter;
+  assert(!isEnd());
+
+  while (++Iter != End) {
+    DebugSubsectionArray SS;
+    SectionRef SR = *Iter;
+    if (!isDebugSSection(SR, SS))
+      continue;
+
+    Value.updateDebugS(SS);
+    return;
+  }
+}
+
+bool SymbolGroupIterator::isEnd() const {
+  if (!Value.File)
+    return true;
+  if (Value.File->isPdb()) {
+    DbiStream &Dbi = cantFail(Value.File->pdb().getPDBDbiStream());
+    uint32_t Count = Dbi.modules().getModuleCount();
+    assert(Index <= Count);
+    return Index == Count;
+  }
+
+  assert(SectionIter);
+  return *SectionIter == Value.File->obj().section_end();
+}
+
+static bool isMyCode(const SymbolGroup &Group) {
+  if (Group.getFile().isObj())
+    return true;
+
+  StringRef Name = Group.name();
+  if (Name.startswith("Import:"))
+    return false;
+  if (Name.endswith_insensitive(".dll"))
+    return false;
+  if (Name.equals_insensitive("* linker *"))
+    return false;
+  if (Name.startswith_insensitive("f:\\binaries\\Intermediate\\vctools"))
+    return false;
+  if (Name.startswith_insensitive("f:\\dd\\vctools\\crt"))
+    return false;
+  return true;
+}
+
+bool llvm::pdb::shouldDumpSymbolGroup(uint32_t Idx, const SymbolGroup &Group,
+                                      const FilterOptions &Filters) {
+  if (Filters.JustMyCode && !isMyCode(Group))
+    return false;
+
+  // If the arg was not specified on the command line, always dump all modules.
+  if (!Filters.DumpModi)
+    return true;
+
+  // Otherwise, only dump if this is the same module specified.
+  return (Filters.DumpModi == Idx);
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/LinePrinter.cpp b/llvm/lib/DebugInfo/PDB/Native/LinePrinter.cpp
new file mode 100644
index 000000000000..c12fedc23833
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/LinePrinter.cpp
@@ -0,0 +1,340 @@
+//===- LinePrinter.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/LinePrinter.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
+#include "llvm/DebugInfo/PDB/Native/InputFile.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/UDTLayout.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatAdapters.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Regex.h"
+
+#include <algorithm>
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+namespace {
+bool IsItemExcluded(llvm::StringRef Item,
+                    std::list<llvm::Regex> &IncludeFilters,
+                    std::list<llvm::Regex> &ExcludeFilters) {
+  if (Item.empty())
+    return false;
+
+  auto match_pred = [Item](llvm::Regex &R) { return R.match(Item); };
+
+  // Include takes priority over exclude. If the user specified include
+  // filters, and none of them include this item, the item is gone.
+  if (!IncludeFilters.empty() && !any_of(IncludeFilters, match_pred))
+    return true;
+
+  if (any_of(ExcludeFilters, match_pred))
+    return true;
+
+  return false;
+}
+} // namespace
+
+using namespace llvm;
+
+LinePrinter::LinePrinter(int Indent, bool UseColor, llvm::raw_ostream &Stream,
+                         const FilterOptions &Filters)
+    : OS(Stream), IndentSpaces(Indent), CurrentIndent(0), UseColor(UseColor),
+      Filters(Filters) {
+  SetFilters(ExcludeTypeFilters, Filters.ExcludeTypes.begin(),
+             Filters.ExcludeTypes.end());
+  SetFilters(ExcludeSymbolFilters, Filters.ExcludeSymbols.begin(),
+             Filters.ExcludeSymbols.end());
+  SetFilters(ExcludeCompilandFilters, Filters.ExcludeCompilands.begin(),
+             Filters.ExcludeCompilands.end());
+
+  SetFilters(IncludeTypeFilters, Filters.IncludeTypes.begin(),
+             Filters.IncludeTypes.end());
+  SetFilters(IncludeSymbolFilters, Filters.IncludeSymbols.begin(),
+             Filters.IncludeSymbols.end());
+  SetFilters(IncludeCompilandFilters, Filters.IncludeCompilands.begin(),
+             Filters.IncludeCompilands.end());
+}
+
+void LinePrinter::Indent(uint32_t Amount) {
+  if (Amount == 0)
+    Amount = IndentSpaces;
+  CurrentIndent += Amount;
+}
+
+void LinePrinter::Unindent(uint32_t Amount) {
+  if (Amount == 0)
+    Amount = IndentSpaces;
+  CurrentIndent = std::max<int>(0, CurrentIndent - Amount);
+}
+
+void LinePrinter::NewLine() {
+  OS << "\n";
+  OS.indent(CurrentIndent);
+}
+
+void LinePrinter::print(const Twine &T) { OS << T; }
+
+void LinePrinter::printLine(const Twine &T) {
+  NewLine();
+  OS << T;
+}
+
+bool LinePrinter::IsClassExcluded(const ClassLayout &Class) {
+  if (IsTypeExcluded(Class.getName(), Class.getSize()))
+    return true;
+  if (Class.deepPaddingSize() < Filters.PaddingThreshold)
+    return true;
+  return false;
+}
+
+void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
+                               uint64_t StartOffset) {
+  NewLine();
+  OS << Label << " (";
+  if (!Data.empty()) {
+    OS << "\n";
+    OS << format_bytes_with_ascii(Data, StartOffset, 32, 4,
+                                  CurrentIndent + IndentSpaces, true);
+    NewLine();
+  }
+  OS << ")";
+}
+
+void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
+                               uint64_t Base, uint64_t StartOffset) {
+  NewLine();
+  OS << Label << " (";
+  if (!Data.empty()) {
+    OS << "\n";
+    Base +=
StartOffset;
+    OS << format_bytes_with_ascii(Data, Base, 32, 4,
+                                  CurrentIndent + IndentSpaces, true);
+    NewLine();
+  }
+  OS << ")";
+}
+
+namespace {
+struct Run {
+  Run() = default;
+  explicit Run(uint32_t Block) : Block(Block) {}
+  uint32_t Block = 0;
+  uint64_t ByteLen = 0;
+};
+} // namespace
+
+static std::vector<Run> computeBlockRuns(uint32_t BlockSize,
+                                         const msf::MSFStreamLayout &Layout) {
+  std::vector<Run> Runs;
+  if (Layout.Length == 0)
+    return Runs;
+
+  ArrayRef<support::ulittle32_t> Blocks = Layout.Blocks;
+  assert(!Blocks.empty());
+  uint64_t StreamBytesRemaining = Layout.Length;
+  uint32_t CurrentBlock = Blocks[0];
+  Runs.emplace_back(CurrentBlock);
+  while (!Blocks.empty()) {
+    Run *CurrentRun = &Runs.back();
+    uint32_t NextBlock = Blocks.front();
+    if (NextBlock < CurrentBlock || (NextBlock - CurrentBlock > 1)) {
+      Runs.emplace_back(NextBlock);
+      CurrentRun = &Runs.back();
+    }
+    uint64_t Used =
+        std::min(static_cast<uint64_t>(BlockSize), StreamBytesRemaining);
+    CurrentRun->ByteLen += Used;
+    StreamBytesRemaining -= Used;
+    CurrentBlock = NextBlock;
+    Blocks = Blocks.drop_front();
+  }
+  return Runs;
+}
+
+static std::pair<Run, uint64_t> findRun(uint64_t Offset, ArrayRef<Run> Runs) {
+  for (const auto &R : Runs) {
+    if (Offset < R.ByteLen)
+      return std::make_pair(R, Offset);
+    Offset -= R.ByteLen;
+  }
+  llvm_unreachable("Invalid offset!");
+}
+
+void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
+                                      uint32_t StreamIdx,
+                                      StringRef StreamPurpose, uint64_t Offset,
+                                      uint64_t Size) {
+  if (StreamIdx >= File.getNumStreams()) {
+    formatLine("Stream {0}: Not present", StreamIdx);
+    return;
+  }
+  if (Size + Offset > File.getStreamByteSize(StreamIdx)) {
+    formatLine(
+        "Stream {0}: Invalid offset and size, range out of stream bounds",
+        StreamIdx);
+    return;
+  }
+
+  auto S = File.createIndexedStream(StreamIdx);
+  if (!S) {
+    NewLine();
+    formatLine("Stream {0}: Not present", StreamIdx);
+    return;
+  }
+
+  uint64_t End =
+      (Size == 0) ?
S->getLength() : std::min(Offset + Size, S->getLength());
+  Size = End - Offset;
+
+  formatLine("Stream {0}: {1} (dumping {2:N} / {3:N} bytes)", StreamIdx,
+             StreamPurpose, Size, S->getLength());
+  AutoIndent Indent(*this);
+  BinaryStreamRef Slice(*S);
+  BinarySubstreamRef Substream;
+  Substream.Offset = Offset;
+  Substream.StreamData = Slice.drop_front(Offset).keep_front(Size);
+
+  auto Layout = File.getStreamLayout(StreamIdx);
+  formatMsfStreamData(Label, File, Layout, Substream);
+}
+
+void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
+                                      const msf::MSFStreamLayout &Stream,
+                                      BinarySubstreamRef Substream) {
+  BinaryStreamReader Reader(Substream.StreamData);
+
+  auto Runs = computeBlockRuns(File.getBlockSize(), Stream);
+
+  NewLine();
+  OS << Label << " (";
+  while (Reader.bytesRemaining() > 0) {
+    OS << "\n";
+
+    Run FoundRun;
+    uint64_t RunOffset;
+    std::tie(FoundRun, RunOffset) = findRun(Substream.Offset, Runs);
+    assert(FoundRun.ByteLen >= RunOffset);
+    uint64_t Len = FoundRun.ByteLen - RunOffset;
+    Len = std::min(Len, Reader.bytesRemaining());
+    uint64_t Base = FoundRun.Block * File.getBlockSize() + RunOffset;
+    ArrayRef<uint8_t> Data;
+    consumeError(Reader.readBytes(Data, Len));
+    OS << format_bytes_with_ascii(Data, Base, 32, 4,
+                                  CurrentIndent + IndentSpaces, true);
+    if (Reader.bytesRemaining() > 0) {
+      NewLine();
+      OS << formatv(" {0}",
+                    fmt_align("<discontinuity>", AlignStyle::Center, 114, '-'));
+    }
+    Substream.Offset += Len;
+  }
+  NewLine();
+  OS << ")";
+}
+
+void LinePrinter::formatMsfStreamBlocks(
+    PDBFile &File, const msf::MSFStreamLayout &StreamLayout) {
+  auto Blocks = makeArrayRef(StreamLayout.Blocks);
+  uint64_t L = StreamLayout.Length;
+
+  while (L > 0) {
+    NewLine();
+    assert(!Blocks.empty());
+    OS << formatv("Block {0} (\n", uint32_t(Blocks.front()));
+    uint64_t UsedBytes =
+        std::min(L, static_cast<uint64_t>(File.getBlockSize()));
+    ArrayRef<uint8_t> BlockData =
+        cantFail(File.getBlockData(Blocks.front(), File.getBlockSize()));
+    uint64_t BaseOffset = Blocks.front();
+    BaseOffset *= File.getBlockSize();
+    OS << format_bytes_with_ascii(BlockData, BaseOffset, 32, 4,
+                                  CurrentIndent + IndentSpaces, true);
+    NewLine();
+    OS << ")";
+    NewLine();
+    L -= UsedBytes;
+    Blocks = Blocks.drop_front();
+  }
+}
+
+bool LinePrinter::IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size) {
+  if (IsItemExcluded(TypeName, IncludeTypeFilters, ExcludeTypeFilters))
+    return true;
+  if (Size < Filters.SizeThreshold)
+    return true;
+  return false;
+}
+
+bool LinePrinter::IsSymbolExcluded(llvm::StringRef SymbolName) {
+  return IsItemExcluded(SymbolName, IncludeSymbolFilters, ExcludeSymbolFilters);
+}
+
+bool LinePrinter::IsCompilandExcluded(llvm::StringRef CompilandName) {
+  return IsItemExcluded(CompilandName, IncludeCompilandFilters,
+                        ExcludeCompilandFilters);
+}
+
+WithColor::WithColor(LinePrinter &P, PDB_ColorItem C)
+    : OS(P.OS), UseColor(P.hasColor()) {
+  if (UseColor)
+    applyColor(C);
+}
+
+WithColor::~WithColor() {
+  if (UseColor)
+    OS.resetColor();
+}
+
+void WithColor::applyColor(PDB_ColorItem C) {
+  switch (C) {
+  case PDB_ColorItem::None:
+    OS.resetColor();
+    return;
+  case PDB_ColorItem::Comment:
+    OS.changeColor(raw_ostream::GREEN, false);
+    return;
+  case PDB_ColorItem::Address:
+    OS.changeColor(raw_ostream::YELLOW, /*bold=*/true);
+    return;
+  case PDB_ColorItem::Keyword:
+    OS.changeColor(raw_ostream::MAGENTA, true);
+    return;
+  case PDB_ColorItem::Register:
+  case PDB_ColorItem::Offset:
+    OS.changeColor(raw_ostream::YELLOW, false);
+    return;
+  case PDB_ColorItem::Type:
OS.changeColor(raw_ostream::CYAN, true);
+    return;
+  case PDB_ColorItem::Identifier:
+    OS.changeColor(raw_ostream::CYAN, false);
+    return;
+  case PDB_ColorItem::Path:
+    OS.changeColor(raw_ostream::CYAN, false);
+    return;
+  case PDB_ColorItem::Padding:
+  case PDB_ColorItem::SectionHeader:
+    OS.changeColor(raw_ostream::RED, true);
+    return;
+  case PDB_ColorItem::LiteralValue:
+    OS.changeColor(raw_ostream::GREEN, true);
+    return;
+  }
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
index 1445f0bd9e1b..f0e96a7cd659 100644
--- a/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
@@ -10,16 +10,17 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
-#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
-#include <algorithm>
 #include <cstdint>
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
index 1d873b87b347..500923e57fbb 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -7,21 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/Support/BinaryStreamReader.h"
-#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
-#include <tuple>
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
index 7717f062eac1..d24364312b31 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -9,8 +9,6 @@
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
-#include "llvm/ADT/STLExtras.h"
-
 namespace llvm {
 namespace pdb {
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
index 54646867bc5f..b861fc2435b8 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
@@ -8,13 +8,15 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h"
 
-#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include 
"llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" #include "llvm/DebugInfo/PDB/Native/SymbolStream.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp index 5e6412275063..65e253ed115f 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp @@ -8,9 +8,11 @@ #include "llvm/DebugInfo/PDB/Native/NativeEnumInjectedSources.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" +#include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" namespace llvm { namespace pdb { diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp index 1e4b07646335..b912bf77e579 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp @@ -8,13 +8,11 @@ #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h" + +#include using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp index c6621924b516..7108b8efff83 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp @@ -8,13 +8,10 @@ #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h" -#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" -#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h" -#include "llvm/DebugInfo/PDB/PDBSymbolExe.h" namespace llvm { namespace pdb { diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp index feede1dbc958..24fe2244cfc5 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp @@ -8,11 +8,11 @@ #include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" using namespace llvm; using namespace llvm::codeview; diff --git 
a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp index 2524e10cb6c5..6912b8dc838e 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp @@ -8,13 +8,16 @@ #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h" -#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp index 895f8943157a..ae0f66c31fde 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp @@ -8,14 +8,14 @@ #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" #include "llvm/DebugInfo/PDB/Native/InfoStream.h" -#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/SymbolCache.h" -#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp index 7f3b35c297b4..b1caa5add5b3 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp @@ -8,11 +8,15 @@ #include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" #include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" +#include "llvm/DebugInfo/PDB/PDBExtras.h" using namespace llvm; using namespace llvm::codeview; @@ -25,7 +29,7 @@ NativeFunctionSymbol::NativeFunctionSymbol(NativeSession &Session, : NativeRawSymbol(Session, PDB_SymType::Function, Id), Sym(Sym), RecordOffset(Offset) {} -NativeFunctionSymbol::~NativeFunctionSymbol() {} +NativeFunctionSymbol::~NativeFunctionSymbol() = default; void NativeFunctionSymbol::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp index 8314353c3890..99ec627fcd26 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp 
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp
@@ -12,8 +12,14 @@
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
+#include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -25,7 +31,7 @@ NativeInlineSiteSymbol::NativeInlineSiteSymbol(
     : NativeRawSymbol(Session, PDB_SymType::InlineSite, Id), Sym(Sym),
       ParentAddr(ParentAddr) {}
 
-NativeInlineSiteSymbol::~NativeInlineSiteSymbol() {}
+NativeInlineSiteSymbol::~NativeInlineSiteSymbol() = default;
 
 void NativeInlineSiteSymbol::dump(raw_ostream &OS, int Indent,
                                   PdbSymbolIdField ShowIdFields,
@@ -98,29 +104,81 @@ void NativeInlineSiteSymbol::getLineOffset(uint32_t OffsetInFunc,
   LineOffset = 0;
   FileOffset = 0;
   uint32_t CodeOffset = 0;
+  Optional<uint32_t> CodeOffsetBase;
+  Optional<uint32_t> CodeOffsetEnd;
+  Optional<int32_t> CurLineOffset;
+  Optional<int32_t> NextLineOffset;
+  Optional<uint32_t> NextFileOffset;
+  auto UpdateCodeOffset = [&](uint32_t Delta) {
+    if (!CodeOffsetBase)
+      CodeOffsetBase = CodeOffset;
+    else if (!CodeOffsetEnd)
+      CodeOffsetEnd = *CodeOffsetBase + Delta;
+  };
+  auto UpdateLineOffset = [&](int32_t Delta) {
+    LineOffset += Delta;
+    if (!CodeOffsetBase || !CurLineOffset)
+      CurLineOffset = LineOffset;
+    else
+      NextLineOffset = LineOffset;
+  };
+  auto UpdateFileOffset = [&](uint32_t Offset) {
+    if (!CodeOffsetBase)
+      FileOffset = Offset;
+    else
+      NextFileOffset = Offset;
+  };
+  auto ValidateAndReset = [&]() {
+    // Current range is finished. Check if OffsetInFunc is in the range.
+    if (CodeOffsetBase && CodeOffsetEnd && CurLineOffset) {
+      if (CodeOffsetBase <= OffsetInFunc && OffsetInFunc < CodeOffsetEnd) {
+        LineOffset = *CurLineOffset;
+        return true;
+      }
+      // Set base, end, file offset and line offset for next range.
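+      //
+      // [Editor's illustrative aside, not part of the original patch] For
+      // example, given the annotations ChangeLineOffset(+1),
+      // ChangeCodeOffset(0x10), ChangeCodeLength(0x8), the first range is
+      // [0x10, 0x18) with line offset 1, so an OffsetInFunc of 0x14 resolves
+      // to LineOffset = 1. Otherwise the bookkeeping below rolls the "next"
+      // values over, and the old CodeOffsetEnd becomes the new range's base.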
+      if (NextFileOffset)
+        FileOffset = *NextFileOffset;
+      if (NextLineOffset) {
+        CurLineOffset = NextLineOffset;
+        NextLineOffset = None;
+      }
+      CodeOffsetBase = CodeOffsetEnd;
+      CodeOffsetEnd = NextFileOffset = None;
+    }
+    return false;
+  };
   for (const auto &Annot : Sym.annotations()) {
     switch (Annot.OpCode) {
     case BinaryAnnotationsOpCode::CodeOffset:
     case BinaryAnnotationsOpCode::ChangeCodeOffset:
-    case BinaryAnnotationsOpCode::ChangeCodeLength:
+    case BinaryAnnotationsOpCode::ChangeCodeOffsetBase:
       CodeOffset += Annot.U1;
+      UpdateCodeOffset(Annot.U1);
+      break;
+    case BinaryAnnotationsOpCode::ChangeCodeLength:
+      UpdateCodeOffset(Annot.U1);
       break;
     case BinaryAnnotationsOpCode::ChangeCodeLengthAndCodeOffset:
       CodeOffset += Annot.U2;
+      UpdateCodeOffset(Annot.U2);
+      UpdateCodeOffset(Annot.U1);
       break;
     case BinaryAnnotationsOpCode::ChangeLineOffset:
+      UpdateLineOffset(Annot.S1);
+      break;
    case BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset:
       CodeOffset += Annot.U1;
-      LineOffset += Annot.S1;
+      UpdateCodeOffset(Annot.U1);
+      UpdateLineOffset(Annot.S1);
       break;
     case BinaryAnnotationsOpCode::ChangeFile:
-      FileOffset = Annot.U1;
+      UpdateFileOffset(Annot.U1);
       break;
     default:
       break;
     }
-    if (CodeOffset >= OffsetInFunc)
+    if (ValidateAndReset())
       return;
   }
 }
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
index 155ed0cdb828..aa7d6ac6f29d 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
index 1265e688b867..339af6108009 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
@@ -9,8 +9,7 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -20,7 +19,7 @@ NativePublicSymbol::NativePublicSymbol(NativeSession &Session, SymIndexId Id,
                                        const codeview::PublicSym32 &Sym)
     : NativeRawSymbol(Session, PDB_SymType::PublicSymbol, Id), Sym(Sym) {}
 
-NativePublicSymbol::~NativePublicSymbol() {}
+NativePublicSymbol::~NativePublicSymbol() = default;
 
 void NativePublicSymbol::dump(raw_ostream &OS, int Indent,
                               PdbSymbolIdField ShowIdFields,
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
index 2ad552470b61..89f9f9836fec 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -10,7 +10,6 @@
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
-#include "llvm/Support/FormatVariadic.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
index 7212a0e65035..cf314c3bede3 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ 
b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -8,31 +8,33 @@ #include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/DebugInfo/MSF/MSFCommon.h" +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/IPDBSourceFile.h" +#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" +#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" #include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h" -#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h" +#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" #include "llvm/DebugInfo/PDB/Native/NativeEnumInjectedSources.h" -#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h" #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/DebugInfo/PDB/Native/SymbolCache.h" -#include "llvm/DebugInfo/PDB/Native/TpiStream.h" +#include "llvm/DebugInfo/PDB/PDBSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h" #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" +#include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/BinaryByteStream.h" +#include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -45,6 +47,12 @@ using namespace llvm; using namespace llvm::msf; using namespace llvm::pdb; +namespace llvm { +namespace codeview { +union DebugInfo; +} +} // namespace llvm + static DbiStream *getDbiStreamPtr(PDBFile &File) { Expected DbiS = File.getPDBDbiStream(); if (DbiS) diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp index fd813dee6b9f..8d6f8ebebf4c 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp @@ -8,6 +8,8 @@ #include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp index e5f1dcaf801e..a6e8cbf71548 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp @@ -8,7 +8,7 @@ #include "llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" #include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" @@ -22,7 +22,7 @@ NativeSymbolEnumerator::NativeSymbolEnumerator( : NativeRawSymbol(Session, PDB_SymType::Data, Id), Parent(Parent), Record(std::move(Record)) {} -NativeSymbolEnumerator::~NativeSymbolEnumerator() {} 
+NativeSymbolEnumerator::~NativeSymbolEnumerator() = default; void NativeSymbolEnumerator::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp index 63ac9fae0e87..e98f357ac485 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp @@ -8,9 +8,10 @@ #include "llvm/DebugInfo/PDB/Native/NativeTypeArray.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" -#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" +#include "llvm/DebugInfo/PDB/PDBExtras.h" using namespace llvm; using namespace llvm::codeview; @@ -21,7 +22,7 @@ NativeTypeArray::NativeTypeArray(NativeSession &Session, SymIndexId Id, codeview::ArrayRecord Record) : NativeRawSymbol(Session, PDB_SymType::ArrayType, Id), Record(Record), Index(TI) {} -NativeTypeArray::~NativeTypeArray() {} +NativeTypeArray::~NativeTypeArray() = default; void NativeTypeArray::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeBuiltin.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeBuiltin.cpp index a08663aa91ba..80f892c7b118 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeBuiltin.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeBuiltin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" -#include "llvm/Support/FormatVariadic.h" using namespace llvm; using namespace llvm::codeview; @@ -19,7 +18,7 @@ NativeTypeBuiltin::NativeTypeBuiltin(NativeSession &PDBSession, SymIndexId Id, : NativeRawSymbol(PDBSession, PDB_SymType::BuiltinType, Id), Session(PDBSession), Mods(Mods), Type(T), Length(L) {} -NativeTypeBuiltin::~NativeTypeBuiltin() {} +NativeTypeBuiltin::~NativeTypeBuiltin() = default; void NativeTypeBuiltin::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp index aaec3a5e7c60..ec37d276e66b 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp @@ -9,8 +9,9 @@ #include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h" #include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" @@ -18,8 +19,6 @@ #include "llvm/DebugInfo/PDB/Native/TpiStream.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" -#include "llvm/Support/FormatVariadic.h" - #include using namespace llvm; @@ -68,10 +67,13 @@ NativeEnumEnumEnumerators::NativeEnumEnumEnumerators( ContinuationIndex = ClassParent.getEnumRecord().FieldList; while (ContinuationIndex) { - CVType FieldList = Types.getType(*ContinuationIndex); - assert(FieldList.kind() == LF_FIELDLIST); + CVType FieldListCVT = Types.getType(*ContinuationIndex); + assert(FieldListCVT.kind() == LF_FIELDLIST); 
   ContinuationIndex.reset();
-    cantFail(visitMemberRecordStream(FieldList.data(), *this));
+    FieldListRecord FieldList;
+    cantFail(TypeDeserializer::deserializeAs<FieldListRecord>(FieldListCVT,
+                                                              FieldList));
+    cantFail(visitMemberRecordStream(FieldList.Data, *this));
   }
 }
 
@@ -123,7 +125,7 @@ NativeTypeEnum::NativeTypeEnum(NativeSession &Session, SymIndexId Id,
     : NativeRawSymbol(Session, PDB_SymType::Enum, Id),
       UnmodifiedType(&UnmodifiedType), Modifiers(std::move(Modifier)) {}
 
-NativeTypeEnum::~NativeTypeEnum() {}
+NativeTypeEnum::~NativeTypeEnum() = default;
 
 void NativeTypeEnum::dump(raw_ostream &OS, int Indent,
                           PdbSymbolIdField ShowIdFields,
@@ -138,7 +140,7 @@ void NativeTypeEnum::dump(raw_ostream &OS, int Indent,
   dumpSymbolField(OS, "name", getName(), Indent);
   dumpSymbolIdField(OS, "typeId", getTypeId(), Indent, Session,
                     PdbSymbolIdField::Type, ShowIdFields, RecurseIdFields);
-  if (Modifiers.hasValue())
+  if (Modifiers)
     dumpSymbolIdField(OS, "unmodifiedTypeId", getUnmodifiedTypeId(), Indent,
                       Session, PdbSymbolIdField::UnmodifiedType, ShowIdFields,
                       RecurseIdFields);
@@ -206,6 +208,8 @@ PDB_BuiltinType NativeTypeEnum::getBuiltinType() const {
     return PDB_BuiltinType::Char16;
   case SimpleTypeKind::Character32:
     return PDB_BuiltinType::Char32;
+  case SimpleTypeKind::Character8:
+    return PDB_BuiltinType::Char8;
   case SimpleTypeKind::Int128:
   case SimpleTypeKind::Int128Oct:
   case SimpleTypeKind::Int16:
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
index f98a4c3043eb..7db3f1c63128 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
@@ -10,9 +10,10 @@
 
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
-#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -96,7 +97,7 @@ void NativeTypeFunctionSig::initialize() {
   }
 }
 
-NativeTypeFunctionSig::~NativeTypeFunctionSig() {}
+NativeTypeFunctionSig::~NativeTypeFunctionSig() = default;
 
 void NativeTypeFunctionSig::initializeArgList(codeview::TypeIndex ArgListTI) {
   TpiStream &Tpi = cantFail(Session.getPDBFile().getPDBTpiStream());
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
index 32dcfc235954..14b903ccef5a 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
@@ -7,8 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeTypePointer.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 
 #include 
 
@@ -29,7 +30,7 @@ NativeTypePointer::NativeTypePointer(NativeSession &Session, SymIndexId Id,
     : NativeRawSymbol(Session, PDB_SymType::PointerType, Id), TI(TI),
       Record(std::move(Record)) {}
 
-NativeTypePointer::~NativeTypePointer() {}
+NativeTypePointer::~NativeTypePointer() = default;
 
 void NativeTypePointer::dump(raw_ostream &OS, int Indent,
                              PdbSymbolIdField ShowIdFields,
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp
index
72964a9e0d4d..11cd349b72ca 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp @@ -1,4 +1,6 @@ #include "llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/PDBExtras.h" using namespace llvm; using namespace llvm::codeview; @@ -9,7 +11,7 @@ NativeTypeTypedef::NativeTypeTypedef(NativeSession &Session, SymIndexId Id, : NativeRawSymbol(Session, PDB_SymType::Typedef, Id), Record(std::move(Typedef)) {} -NativeTypeTypedef::~NativeTypeTypedef() {} +NativeTypeTypedef::~NativeTypeTypedef() = default; void NativeTypeTypedef::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp index 917ec14e58d6..b708fb644e7a 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp @@ -7,10 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NativeTypeUDT.h" - -#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" - -#include +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/SymbolCache.h" +#include "llvm/DebugInfo/PDB/PDBExtras.h" using namespace llvm; using namespace llvm::codeview; @@ -32,7 +33,7 @@ NativeTypeUDT::NativeTypeUDT(NativeSession &Session, SymIndexId Id, : NativeRawSymbol(Session, PDB_SymType::UDT, Id), UnmodifiedType(&UnmodifiedType), Modifiers(std::move(Modifier)) {} -NativeTypeUDT::~NativeTypeUDT() {} +NativeTypeUDT::~NativeTypeUDT() = default; void NativeTypeUDT::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, @@ -44,7 +45,7 @@ void NativeTypeUDT::dump(raw_ostream &OS, int Indent, dumpSymbolIdField(OS, "lexicalParentId", 0, Indent, Session, PdbSymbolIdField::LexicalParent, ShowIdFields, RecurseIdFields); - if (Modifiers.hasValue()) + if (Modifiers) dumpSymbolIdField(OS, "unmodifiedTypeId", getUnmodifiedTypeId(), Indent, Session, PdbSymbolIdField::UnmodifiedType, ShowIdFields, RecurseIdFields); diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp index 837fe19ec88c..63bb3f046e23 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp @@ -1,4 +1,7 @@ #include "llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/PDBExtras.h" using namespace llvm; using namespace llvm::pdb; @@ -10,7 +13,7 @@ NativeTypeVTShape::NativeTypeVTShape(NativeSession &Session, SymIndexId Id, : NativeRawSymbol(Session, PDB_SymType::VTableShape, Id), TI(TI), Record(std::move(SR)) {} -NativeTypeVTShape::~NativeTypeVTShape() {} +NativeTypeVTShape::~NativeTypeVTShape() = default; void NativeTypeVTShape::dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp index 5c61530c470d..471d183a5f53 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp @@ -8,7 +8,6 @@ #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" 
#include "llvm/DebugInfo/MSF/MSFCommon.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp index f33125474e3a..641043a8e186 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp @@ -7,34 +7,41 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/MSF/MSFBuilder.h" -#include "llvm/DebugInfo/PDB/Native/DbiStream.h" +#include "llvm/DebugInfo/MSF/MSFCommon.h" +#include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h" #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" #include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h" +#include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/DebugInfo/PDB/Native/TpiStream.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h" -#include "llvm/Support/BinaryStream.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/CRC.h" -#include "llvm/Support/Chrono.h" #include "llvm/Support/Path.h" #include "llvm/Support/xxhash.h" +#include + using namespace llvm; using namespace llvm::codeview; using namespace llvm::msf; using namespace llvm::pdb; using namespace llvm::support; +namespace llvm { +class WritableBinaryStream; +} + PDBFileBuilder::PDBFileBuilder(BumpPtrAllocator &Allocator) : Allocator(Allocator), InjectedSourceHashTraits(Strings), InjectedSourceTable(2) {} -PDBFileBuilder::~PDBFileBuilder() {} +PDBFileBuilder::~PDBFileBuilder() = default; Error PDBFileBuilder::initialize(uint32_t BlockSize) { auto ExpectedMsf = MSFBuilder::create(Allocator, BlockSize); @@ -348,7 +355,7 @@ Error PDBFileBuilder::commit(StringRef Filename, codeview::GUID *Guid) { H->Age = Info->getAge(); H->Guid = Info->getGuid(); Optional Sig = Info->getSignature(); - H->Signature = Sig.hasValue() ? *Sig : time(nullptr); + H->Signature = Sig ? *Sig : time(nullptr); } return Buffer.commit(); diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp index 2be1656e06bb..5bd12f50f1d7 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp @@ -8,7 +8,6 @@ #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp index f7f36901e4d4..45a5bdb48f01 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp @@ -71,7 +71,7 @@ static uint32_t computeBucketCount(uint32_t NumStrings) { // This list contains all StringCount, BucketCount pairs where BucketCount was // just incremented. It ends before the first BucketCount entry where // BucketCount * 3 would overflow a 32-bit unsigned int. 
-  static std::map<uint32_t, uint32_t> StringsToBuckets = {
+  static const std::pair<uint32_t, uint32_t> StringsToBuckets[] = {
       {0, 1},
       {1, 2},
       {2, 4},
@@ -124,8 +124,9 @@ static uint32_t computeBucketCount(uint32_t NumStrings) {
       {517197275, 1034394550},
       {775795913, 1551591826},
       {1163693870, 2327387740}};
-  auto Entry = StringsToBuckets.lower_bound(NumStrings);
-  assert(Entry != StringsToBuckets.end());
+  const auto *Entry = llvm::lower_bound(
+      StringsToBuckets, std::make_pair(NumStrings, 0U), llvm::less_first());
+  assert(Entry != std::end(StringsToBuckets));
   return Entry->second;
 }
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index a33bf03bf8fb..c7b9f443da5e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -22,14 +22,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
-#include 
 #include 
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
index f9e67014477e..f89f09aa3399 100644
--- a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
@@ -1,20 +1,25 @@
 #include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 
-#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
 #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
-#include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
-#include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
 #include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
@@ -32,7 +37,6 @@
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -60,6 +64,7 @@ static const struct BuiltinTypeEntry {
{codeview::SimpleTypeKind::WideCharacter, PDB_BuiltinType::WCharT, 2}, {codeview::SimpleTypeKind::Character16, PDB_BuiltinType::Char16, 2}, {codeview::SimpleTypeKind::Character32, PDB_BuiltinType::Char32, 4}, + {codeview::SimpleTypeKind::Character8, PDB_BuiltinType::Char8, 1}, {codeview::SimpleTypeKind::SignedCharacter, PDB_BuiltinType::Char, 1}, {codeview::SimpleTypeKind::UnsignedCharacter, PDB_BuiltinType::UInt, 1}, {codeview::SimpleTypeKind::Float32, PDB_BuiltinType::Float, 4}, diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp index 003840b6e67e..5802d1c77527 100644 --- a/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp @@ -8,10 +8,7 @@ #include "llvm/DebugInfo/PDB/Native/SymbolStream.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Endian.h" using namespace llvm; @@ -22,7 +19,7 @@ using namespace llvm::pdb; SymbolStream::SymbolStream(std::unique_ptr Stream) : Stream(std::move(Stream)) {} -SymbolStream::~SymbolStream() {} +SymbolStream::~SymbolStream() = default; Error SymbolStream::reload() { BinaryStreamReader Reader(*Stream); diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp index 5f4f497690b6..986e45e050c7 100644 --- a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -9,17 +9,13 @@ #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MSFBuilder.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/BinaryByteStream.h" -#include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" diff --git a/llvm/lib/DebugInfo/PDB/PDB.cpp b/llvm/lib/DebugInfo/PDB/PDB.cpp index e5b7731f6f4a..d106ba8fefc1 100644 --- a/llvm/lib/DebugInfo/PDB/PDB.cpp +++ b/llvm/lib/DebugInfo/PDB/PDB.cpp @@ -15,7 +15,6 @@ #endif #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBContext.cpp b/llvm/lib/DebugInfo/PDB/PDBContext.cpp index 0ebb70e010d5..e600fb7385f1 100644 --- a/llvm/lib/DebugInfo/PDB/PDBContext.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBContext.cpp @@ -14,6 +14,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolData.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" +#include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Object/COFF.h" using namespace llvm; @@ -62,6 +64,13 @@ DILineInfo PDBContext::getLineInfoForAddress(object::SectionedAddress Address, return Result; } +DILineInfo +PDBContext::getLineInfoForDataAddress(object::SectionedAddress Address) { + // 
Unimplemented. S_GDATA and S_LDATA in CodeView (used to describe global + // variables) aren't capable of carrying line information. + return DILineInfo(); +} + DILineInfoTable PDBContext::getLineInfoForAddressRange(object::SectionedAddress Address, uint64_t Size, diff --git a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp index a6d7ca0da7a9..571510e6bad9 100644 --- a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDBExtras.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -64,6 +63,7 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, CASE_OUTPUT_ENUM_CLASS_NAME(PDB_BuiltinType, HResult, OS) CASE_OUTPUT_ENUM_CLASS_NAME(PDB_BuiltinType, Char16, OS) CASE_OUTPUT_ENUM_CLASS_NAME(PDB_BuiltinType, Char32, OS) + CASE_OUTPUT_ENUM_CLASS_NAME(PDB_BuiltinType, Char8, OS) } return OS; } diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp index d6bc7ee9c951..4eb5af9bd292 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp @@ -8,6 +8,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbol.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" @@ -43,7 +44,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h" #include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" -#include #include using namespace llvm; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp index 0fa83efb7ae0..089f4de0f422 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp index 9452282a8817..49ee4937521b 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolBlock.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp index 529100b23ba5..bd60489b6bed 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp @@ -9,10 +9,11 @@ #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/IPDBSourceFile.h" +#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h" +#include "llvm/DebugInfo/PDB/PDBSymDumper.h" #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h" #include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h" #include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h" -#include "llvm/DebugInfo/PDB/PDBSymDumper.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Path.h" diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp index 0d86dfe1e632..f775ac949cd8 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp +++ 
b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp index 61f119405fd9..2c2ed59c1726 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp @@ -10,9 +10,7 @@ #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include +#include "llvm/DebugInfo/PDB/PDBTypes.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp index 6c9a4aa76c3d..405b07c2b689 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp @@ -10,9 +10,6 @@ #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp index d2b82111ccd5..c604b5cd3a6a 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp @@ -7,12 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDBSymbolData.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBSectionContrib.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp index c85756c43e47..3887c23b18ef 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp @@ -8,10 +8,10 @@ #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" +#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" - -#include +#include "llvm/DebugInfo/PDB/PDBTypes.h" using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp index cb0329bc0ed7..59d57e83fc10 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp @@ -10,7 +10,9 @@ #include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" +#include "llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp index 66433dc17b49..5c72e3f62121 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace 
llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp index fe32c93c0121..fd537a9eeea4 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp @@ -8,10 +8,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp index 1fffe69a0c83..896719a6a8e2 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp index 08697683f641..a00b1be40e18 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp @@ -8,10 +8,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp index 6483858183e5..42502a55ef76 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp index a0d521abe43f..bb4eb43f22e5 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp index 08467059b5e1..539c3547a4b0 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp @@ -8,10 +8,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp index a0dd9ef601c0..eca2a09c1f77 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp index 6723894c90ea..a616b4e26cb1 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using 
namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp index 4a25a391f278..2828ce4df3f8 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp @@ -10,9 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp index b9fdf6aec811..db8ca327da1e 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp @@ -8,11 +8,10 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp index 4ffea42cbb0a..d4bd9996d786 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp index 683e93548fb1..acda57f44e33 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp index e80e6c716572..fa6e630e3c45 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp index 462fc315359b..9e238c7caa37 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp @@ -8,11 +8,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" -#include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp index 70749d9bf5f5..c2ce21c6ca69 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp index d302c29a3bec..122111d32027 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp @@ -8,16 +8,8 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" 
-#include "llvm/DebugInfo/PDB/IPDBSession.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" -#include "llvm/DebugInfo/PDB/PDBSymbolData.h" -#include "llvm/DebugInfo/PDB/PDBSymbolExe.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp index 4e2a45116d51..a4d81888e457 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp @@ -10,8 +10,6 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp index 78957620e083..835a86e165af 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp index 650d01183171..85294a4cded2 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp index 74afbdb18086..98aaaa9b10b9 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp @@ -9,9 +9,6 @@ #include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbol.h" - -#include using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/DebugInfo/PDB/UDTLayout.cpp b/llvm/lib/DebugInfo/PDB/UDTLayout.cpp index 55854bb49888..6e388834f199 100644 --- a/llvm/lib/DebugInfo/PDB/UDTLayout.cpp +++ b/llvm/lib/DebugInfo/PDB/UDTLayout.cpp @@ -10,6 +10,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" @@ -17,6 +19,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" diff --git a/llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp b/llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp new file mode 100644 index 000000000000..119830de595a --- /dev/null +++ b/llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp @@ -0,0 +1,57 @@ +//===-- lib/DebugInfo/Symbolize/DIFetcher.cpp 
-----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the implementation of the local debug info fetcher, which +/// searches cache directories. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/Symbolize/DIFetcher.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +namespace llvm { +namespace symbolize { + +Optional<std::string> +LocalDIFetcher::fetchBuildID(ArrayRef<uint8_t> BuildID) const { + auto GetDebugPath = [&](StringRef Directory) { + SmallString<128> Path{Directory}; + sys::path::append(Path, ".build-id", + llvm::toHex(BuildID[0], /*LowerCase=*/true), + llvm::toHex(BuildID.slice(1), /*LowerCase=*/true)); + Path += ".debug"; + return Path; + }; + if (DebugFileDirectory.empty()) { + SmallString<128> Path = GetDebugPath( +#if defined(__NetBSD__) + // Try /usr/libdata/debug/.build-id/../... + "/usr/libdata/debug" +#else + // Try /usr/lib/debug/.build-id/../... + "/usr/lib/debug" +#endif + ); + if (llvm::sys::fs::exists(Path)) + return std::string(Path); + } else { + for (const auto &Directory : DebugFileDirectory) { + // Try /.build-id/../... + SmallString<128> Path = GetDebugPath(Directory); + if (llvm::sys::fs::exists(Path)) + return std::string(Path); + } + } + return None; +} + +} // namespace symbolize +} // namespace llvm diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp index e29968d113bd..877380213f21 100644 --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -16,9 +16,7 @@ #include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/Format.h" -#include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -208,6 +206,10 @@ void PlainPrinterBase::print(const Request &Request, const DIGlobal &Global) { Name = DILineInfo::Addr2LineBadString; OS << Name << "\n"; OS << Global.Start << " " << Global.Size << "\n"; + if (Global.DeclFile.empty()) + OS << "??:?\n"; + else + OS << Global.DeclFile << ":" << Global.DeclLine << "\n"; printFooter(); } diff --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp new file mode 100644 index 000000000000..9bc65e763287 --- /dev/null +++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp @@ -0,0 +1,202 @@ +//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the log symbolizer markup data model and parser.
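LocalDIFetcher above probes the conventional .build-id directory layout: the first byte of the build ID names a two-hex-digit subdirectory, the remaining bytes name the file, and a .debug suffix is appended. A sketch of just the path construction (the helper name is mine, not part of the patch):

// Build the candidate path <Dir>/.build-id/xx/yyyy....debug for a build ID.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Path.h"

llvm::SmallString<128> buildIDPath(llvm::StringRef Dir,
                                   llvm::ArrayRef<uint8_t> ID) {
  llvm::SmallString<128> Path{Dir};
  // The first byte becomes the subdirectory and the rest the file name;
  // toHex accepts ID[0] via ArrayRef's implicit single-element constructor.
  llvm::sys::path::append(Path, ".build-id",
                          llvm::toHex(ID[0], /*LowerCase=*/true),
                          llvm::toHex(ID.slice(1), /*LowerCase=*/true));
  Path += ".debug";
  return Path; // e.g. /usr/lib/debug/.build-id/ab/cdef01....debug
}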
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/Symbolize/Markup.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" + +namespace llvm { +namespace symbolize { + +// Matches the following: +// "\033[0m" +// "\033[1m" +// "\033[30m" -- "\033[37m" +static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m"; + +MarkupParser::MarkupParser(StringSet<> MultilineTags) + : MultilineTags(std::move(MultilineTags)), SGRSyntax(SGRSyntaxStr) {} + +static StringRef takeTo(StringRef Str, StringRef::iterator Pos) { + return Str.take_front(Pos - Str.begin()); +} +static void advanceTo(StringRef &Str, StringRef::iterator Pos) { + Str = Str.drop_front(Pos - Str.begin()); +} + +void MarkupParser::parseLine(StringRef Line) { + Buffer.clear(); + NextIdx = 0; + FinishedMultiline.clear(); + this->Line = Line; +} + +Optional<MarkupNode> MarkupParser::nextNode() { + // Pull something out of the buffer if possible. + if (!Buffer.empty()) { + if (NextIdx < Buffer.size()) + return std::move(Buffer[NextIdx++]); + NextIdx = 0; + Buffer.clear(); + } + + // The buffer is empty, so parse the next bit of the line. + + if (Line.empty()) + return None; + + if (!InProgressMultiline.empty()) { + if (Optional<StringRef> MultilineEnd = parseMultiLineEnd(Line)) { + llvm::append_range(InProgressMultiline, *MultilineEnd); + assert(FinishedMultiline.empty() && + "At most one multi-line element can be finished at a time."); + FinishedMultiline.swap(InProgressMultiline); + // Parse the multi-line element as if it were contiguous. + advanceTo(Line, MultilineEnd->end()); + return *parseElement(FinishedMultiline); + } + + // The whole line is part of the multi-line element. + llvm::append_range(InProgressMultiline, Line); + Line = Line.drop_front(Line.size()); + return None; + } + + // Find the first valid markup element, if any. + if (Optional<MarkupNode> Element = parseElement(Line)) { + parseTextOutsideMarkup(takeTo(Line, Element->Text.begin())); + Buffer.push_back(std::move(*Element)); + advanceTo(Line, Element->Text.end()); + return nextNode(); + } + + // Since there were no valid elements remaining, see if the line opens a + // multi-line element. + if (Optional<StringRef> MultilineBegin = parseMultiLineBegin(Line)) { + // Emit any text before the element. + parseTextOutsideMarkup(takeTo(Line, MultilineBegin->begin())); + + // Begin recording the multi-line element. + llvm::append_range(InProgressMultiline, *MultilineBegin); + Line = Line.drop_front(Line.size()); + return nextNode(); + } + + // The line doesn't contain any more markup elements, so emit it as text. + parseTextOutsideMarkup(Line); + Line = Line.drop_front(Line.size()); + return nextNode(); +} + +void MarkupParser::flush() { + if (InProgressMultiline.empty()) + return; + FinishedMultiline.swap(InProgressMultiline); + parseTextOutsideMarkup(FinishedMultiline); +} + +// Finds and returns the next valid markup element in the given line. Returns +// None if the line contains no valid elements. +Optional<MarkupNode> MarkupParser::parseElement(StringRef Line) { + while (true) { + // Find next element using begin and end markers. + size_t BeginPos = Line.find("{{{"); + if (BeginPos == StringRef::npos) + return None; + size_t EndPos = Line.find("}}}", BeginPos + 3); + if (EndPos == StringRef::npos) + return None; + EndPos += 3; + MarkupNode Element; + Element.Text = Line.slice(BeginPos, EndPos); + Line = Line.substr(EndPos); + + // Parse tag.
+ StringRef Content = Element.Text.drop_front(3).drop_back(3); + StringRef FieldsContent; + std::tie(Element.Tag, FieldsContent) = Content.split(':'); + if (Element.Tag.empty()) + continue; + + // Parse fields. + if (!FieldsContent.empty()) + FieldsContent.split(Element.Fields, ":"); + else if (Content.back() == ':') + Element.Fields.push_back(FieldsContent); + + return Element; + } +} + +static MarkupNode textNode(StringRef Text) { + MarkupNode Node; + Node.Text = Text; + return Node; +} + +// Parses a region of text known to be outside any markup elements. Such text +// may still contain SGR control codes, so the region is further subdivided into +// control codes and true text regions. +void MarkupParser::parseTextOutsideMarkup(StringRef Text) { + if (Text.empty()) + return; + SmallVector<StringRef> Matches; + while (SGRSyntax.match(Text, &Matches)) { + // Emit any text before the SGR element. + if (Matches.begin()->begin() != Text.begin()) + Buffer.push_back(textNode(takeTo(Text, Matches.begin()->begin()))); + + Buffer.push_back(textNode(*Matches.begin())); + advanceTo(Text, Matches.begin()->end()); + } + if (!Text.empty()) + Buffer.push_back(textNode(Text)); +} + +// Given that a line doesn't contain any valid markup, see if it ends with the +// start of a multi-line element. If so, returns the beginning. +Optional<StringRef> MarkupParser::parseMultiLineBegin(StringRef Line) { + // A multi-line begin marker must be the last one on the line. + size_t BeginPos = Line.rfind("{{{"); + if (BeginPos == StringRef::npos) + return None; + size_t BeginTagPos = BeginPos + 3; + + // If there are any end markers afterwards, the begin marker cannot belong to + // a multi-line element. + size_t EndPos = Line.find("}}}", BeginTagPos); + if (EndPos != StringRef::npos) + return None; + + // Check whether the tag is registered multi-line. + size_t EndTagPos = Line.find(':', BeginTagPos); + if (EndTagPos == StringRef::npos) + return None; + StringRef Tag = Line.slice(BeginTagPos, EndTagPos); + if (!MultilineTags.contains(Tag)) + return None; + return Line.substr(BeginPos); +} + +// See if the line begins with the ending of an in-progress multi-line element. +// If so, return the ending. +Optional<StringRef> MarkupParser::parseMultiLineEnd(StringRef Line) { + size_t EndPos = Line.find("}}}"); + if (EndPos == StringRef::npos) + return None; + return Line.take_front(EndPos + 3); +} + +} // end namespace symbolize +} // end namespace llvm diff --git a/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp new file mode 100644 index 000000000000..3363fe5e531f --- /dev/null +++ b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp @@ -0,0 +1,143 @@ +//===-- lib/DebugInfo/Symbolize/MarkupFilter.cpp -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the implementation of a filter that replaces symbolizer +/// markup with human-readable expressions.
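With the parser complete, the element syntax is {{{tag}}} or {{{tag:field1:field2:...}}}, interleaved with SGR color codes and plain text. A usage sketch against the Markup.h interface shown above (an empty StringSet means no multi-line tags are registered):

// Drain one line of symbolizer markup into (tag, text) pairs.
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/DebugInfo/Symbolize/Markup.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::symbolize;

void dumpMarkup(StringRef Line) {
  MarkupParser Parser{StringSet<>()}; // no multi-line tags
  Parser.parseLine(Line);
  // nextNode() interleaves markup elements with the text between them;
  // plain-text and SGR nodes come back with an empty Tag.
  while (Optional<MarkupNode> Node = Parser.nextNode())
    outs() << "tag=\"" << Node->Tag << "\" text=\"" << Node->Text << "\"\n";
}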
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/Symbolize/MarkupFilter.h" + +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::symbolize; + +MarkupFilter::MarkupFilter(raw_ostream &OS, Optional<bool> ColorsEnabled) + : OS(OS), ColorsEnabled(ColorsEnabled.value_or( WithColor::defaultAutoDetectFunction()(OS))) {} + +void MarkupFilter::beginLine(StringRef Line) { + this->Line = Line; + resetColor(); +} + +void MarkupFilter::filter(const MarkupNode &Node) { + if (!checkTag(Node)) + return; + + if (trySGR(Node)) + return; + + if (Node.Tag == "symbol") { + if (!checkNumFields(Node, 1)) + return; + highlight(); + OS << llvm::demangle(Node.Fields.front().str()); + restoreColor(); + return; + } + + OS << Node.Text; +} + +bool MarkupFilter::trySGR(const MarkupNode &Node) { + if (Node.Text == "\033[0m") { + resetColor(); + return true; + } + if (Node.Text == "\033[1m") { + Bold = true; + if (ColorsEnabled) + OS.changeColor(raw_ostream::Colors::SAVEDCOLOR, Bold); + return true; + } + auto SGRColor = StringSwitch<Optional<raw_ostream::Colors>>(Node.Text) + .Case("\033[30m", raw_ostream::Colors::BLACK) + .Case("\033[31m", raw_ostream::Colors::RED) + .Case("\033[32m", raw_ostream::Colors::GREEN) + .Case("\033[33m", raw_ostream::Colors::YELLOW) + .Case("\033[34m", raw_ostream::Colors::BLUE) + .Case("\033[35m", raw_ostream::Colors::MAGENTA) + .Case("\033[36m", raw_ostream::Colors::CYAN) + .Case("\033[37m", raw_ostream::Colors::WHITE) + .Default(llvm::None); + if (SGRColor) { + Color = *SGRColor; + if (ColorsEnabled) + OS.changeColor(*Color); + return true; + } + + return false; +} + +// Begin highlighting text by picking a different color than the current color +// state. +void MarkupFilter::highlight() { + if (!ColorsEnabled) + return; + OS.changeColor(Color == raw_ostream::Colors::BLUE ? raw_ostream::Colors::CYAN + : raw_ostream::Colors::BLUE, + Bold); +} + +// Set the output stream's color to the current color and bold state of the SGR +// abstract machine. +void MarkupFilter::restoreColor() { + if (!ColorsEnabled) + return; + if (Color) { + OS.changeColor(*Color, Bold); + } else { + OS.resetColor(); + if (Bold) + OS.changeColor(raw_ostream::Colors::SAVEDCOLOR, Bold); + } +} + +// Set the SGR and output stream's color and bold states back to the default.
+void MarkupFilter::resetColor() { + if (!Color && !Bold) + return; + Color.reset(); + Bold = false; + if (ColorsEnabled) + OS.resetColor(); +} + +bool MarkupFilter::checkTag(const MarkupNode &Node) const { + if (any_of(Node.Tag, [](char C) { return C < 'a' || C > 'z'; })) { + WithColor::error(errs()) << "tags must be all lowercase characters\n"; + reportLocation(Node.Tag.begin()); + return false; + } + return true; +} + +bool MarkupFilter::checkNumFields(const MarkupNode &Node, size_t Size) const { + if (Node.Fields.size() != Size) { + WithColor::error(errs()) << "expected " << Size << " fields; found " + << Node.Fields.size() << "\n"; + reportLocation(Node.Tag.end()); + return false; + } + return true; +} + +void MarkupFilter::reportLocation(StringRef::iterator Loc) const { + errs() << Line; + WithColor(errs().indent(Loc - Line.begin()), HighlightColor::String) << '^'; + errs() << '\n'; +} diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp index a9c78830fa59..d8ee9264b64f 100644 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp +++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "SymbolizableObjectFile.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/COFF.h" @@ -327,6 +327,14 @@ DIGlobal SymbolizableObjectFile::symbolizeData( std::string FileName; getNameFromSymbolTable(ModuleOffset.Address, Res.Name, Res.Start, Res.Size, FileName); + Res.DeclFile = FileName; + + // Try and get a better filename:lineno pair from the debuginfo, if present. + DILineInfo DL = DebugInfoContext->getLineInfoForDataAddress(ModuleOffset); + if (DL.Line != 0) { + Res.DeclFile = DL.FileName; + Res.DeclLine = DL.Line; + } return Res; } diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h deleted file mode 100644 index 8fb003fff0ae..000000000000 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h +++ /dev/null @@ -1,103 +0,0 @@ -//===- SymbolizableObjectFile.h ---------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file declares the SymbolizableObjectFile class. 
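A usage sketch tying MarkupFilter to the parser (assuming the MarkupFilter.h declarations from this patch; passing llvm::None lets the filter auto-detect color support, matching the value_or() default above):

// Render one line of markup: demangle {{{symbol:...}}}, pass SGR through.
#include "llvm/ADT/None.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/DebugInfo/Symbolize/Markup.h"
#include "llvm/DebugInfo/Symbolize/MarkupFilter.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::symbolize;

void renderMarkupLine(StringRef Line) {
  MarkupParser Parser{StringSet<>()};
  MarkupFilter Filter(outs(), /*ColorsEnabled=*/None);
  Filter.beginLine(Line); // also resets the SGR color state
  Parser.parseLine(Line);
  while (Optional<MarkupNode> Node = Parser.nextNode())
    Filter.filter(*Node); // unhandled nodes are echoed verbatim
}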
-// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H -#define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" -#include "llvm/Support/Error.h" -#include -#include -#include -#include -#include - -namespace llvm { - -class DataExtractor; - -namespace symbolize { - -class SymbolizableObjectFile : public SymbolizableModule { -public: - static Expected<std::unique_ptr<SymbolizableObjectFile>> - create(const object::ObjectFile *Obj, std::unique_ptr<DIContext> DICtx, - bool UntagAddresses); - - DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset, - DILineInfoSpecifier LineInfoSpecifier, - bool UseSymbolTable) const override; - DIInliningInfo symbolizeInlinedCode(object::SectionedAddress ModuleOffset, - DILineInfoSpecifier LineInfoSpecifier, - bool UseSymbolTable) const override; - DIGlobal symbolizeData(object::SectionedAddress ModuleOffset) const override; - std::vector<DILocal> - symbolizeFrame(object::SectionedAddress ModuleOffset) const override; - - // Return true if this is a 32-bit x86 PE COFF module. - bool isWin32Module() const override; - - // Returns the preferred base of the module, i.e. where the loader would place - // it in memory assuming there were no conflicts. - uint64_t getModulePreferredBase() const override; - -private: - bool shouldOverrideWithSymbolTable(FunctionNameKind FNKind, - bool UseSymbolTable) const; - - bool getNameFromSymbolTable(uint64_t Address, std::string &Name, - uint64_t &Addr, uint64_t &Size, - std::string &FileName) const; - // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd - // (function descriptor) section and OpdExtractor refers to its contents. - Error addSymbol(const object::SymbolRef &Symbol, uint64_t SymbolSize, - DataExtractor *OpdExtractor = nullptr, - uint64_t OpdAddress = 0); - Error addCoffExportSymbols(const object::COFFObjectFile *CoffObj); - - /// Search for the first occurence of specified Address in ObjectFile. - uint64_t getModuleSectionIndexForAddress(uint64_t Address) const; - - const object::ObjectFile *Module; - std::unique_ptr<DIContext> DebugInfoContext; - bool UntagAddresses; - - struct SymbolDesc { - uint64_t Addr; - // If size is 0, assume that symbol occupies the whole memory range up to - // the following symbol. - uint64_t Size; - - StringRef Name; - // Non-zero if this is an ELF local symbol. See the comment in - // getNameFromSymbolTable. - uint32_t ELFLocalSymIdx; - - bool operator<(const SymbolDesc &RHS) const { - return Addr != RHS.Addr ? Addr < RHS.Addr : Size < RHS.Size; - } - }; - std::vector<SymbolDesc> Symbols; - // (index, filename) pairs of ELF STT_FILE symbols.
- std::vector<std::pair<uint32_t, StringRef>> FileSymbols; - - SymbolizableObjectFile(const object::ObjectFile *Obj, - std::unique_ptr<DIContext> DICtx, - bool UntagAddresses); -}; - -} // end namespace symbolize - -} // end namespace llvm - -#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 5ec79df17fed..d2ff8aa7c995 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -12,22 +12,19 @@ #include "llvm/DebugInfo/Symbolize/Symbolize.h" -#include "SymbolizableObjectFile.h" - #include "llvm/ADT/STLExtras.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Config/config.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/PDB/PDB.h" #include "llvm/DebugInfo/PDB/PDBContext.h" -#include "llvm/Debuginfod/Debuginfod.h" +#include "llvm/DebugInfo/Symbolize/DIFetcher.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h" #include "llvm/Demangle/Demangle.h" #include "llvm/Object/COFF.h" +#include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Support/CRC.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Compression.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" @@ -38,8 +35,20 @@ #include namespace llvm { +namespace codeview { +union DebugInfo; +} +namespace object { +template <typename T> class ELFFile; +} namespace symbolize { +LLVMSymbolizer::LLVMSymbolizer() = default; + +LLVMSymbolizer::LLVMSymbolizer(const Options &Opts) : Opts(Opts) {} + +LLVMSymbolizer::~LLVMSymbolizer() = default; + template <typename T> Expected<DILineInfo> LLVMSymbolizer::symbolizeCodeCommon(const T &ModuleSpecifier, @@ -81,6 +90,12 @@ LLVMSymbolizer::symbolizeCode(const std::string &ModuleName, return symbolizeCodeCommon(ModuleName, ModuleOffset); } +Expected<DILineInfo> +LLVMSymbolizer::symbolizeCode(ArrayRef<uint8_t> BuildID, + object::SectionedAddress ModuleOffset) { + return symbolizeCodeCommon(BuildID, ModuleOffset); +} + template <typename T> Expected<DIInliningInfo> LLVMSymbolizer::symbolizeInlinedCodeCommon( const T &ModuleSpecifier, object::SectionedAddress ModuleOffset) { @@ -124,6 +139,12 @@ LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName, return symbolizeInlinedCodeCommon(ModuleName, ModuleOffset); } +Expected<DIInliningInfo> +LLVMSymbolizer::symbolizeInlinedCode(ArrayRef<uint8_t> BuildID, + object::SectionedAddress ModuleOffset) { + return symbolizeInlinedCodeCommon(BuildID, ModuleOffset); +} + template <typename T> Expected<DIGlobal> LLVMSymbolizer::symbolizeDataCommon(const T &ModuleSpecifier, @@ -163,6 +184,12 @@ LLVMSymbolizer::symbolizeData(const std::string &ModuleName, return symbolizeDataCommon(ModuleName, ModuleOffset); } +Expected<DIGlobal> +LLVMSymbolizer::symbolizeData(ArrayRef<uint8_t> BuildID, + object::SectionedAddress ModuleOffset) { + return symbolizeDataCommon(BuildID, ModuleOffset); +} + template <typename T> Expected<std::vector<DILocal>> LLVMSymbolizer::symbolizeFrameCommon(const T &ModuleSpecifier, @@ -198,11 +225,20 @@ LLVMSymbolizer::symbolizeFrame(const std::string &ModuleName, return symbolizeFrameCommon(ModuleName, ModuleOffset); } +Expected<std::vector<DILocal>> +LLVMSymbolizer::symbolizeFrame(ArrayRef<uint8_t> BuildID, + object::SectionedAddress ModuleOffset) { + return symbolizeFrameCommon(BuildID, ModuleOffset); +} + void LLVMSymbolizer::flush() { ObjectForUBPathAndArch.clear(); + LRUBinaries.clear(); + CacheSize = 0; BinaryForPath.clear(); ObjectPairForPathArch.clear(); Modules.clear(); + BuildIDPaths.clear(); } namespace { @@ -230,51 +266,6 @@ bool
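The new ArrayRef<uint8_t> overloads let clients symbolize through a GNU build ID instead of a module path, resolved via getOrFindDebugBinary() further below. A hedged usage sketch (the build-ID bytes would come from the binary's .note.gnu.build-id section):

// Symbolize an address in a module identified only by its build ID.
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::symbolize;

void printLineInfo(LLVMSymbolizer &Symbolizer, ArrayRef<uint8_t> BuildID,
                   uint64_t Addr) {
  object::SectionedAddress Offset;
  Offset.Address = Addr; // SectionIndex keeps its UndefSection default
  Expected<DILineInfo> InfoOrErr = Symbolizer.symbolizeCode(BuildID, Offset);
  if (!InfoOrErr) {
    logAllUnhandledErrors(InfoOrErr.takeError(), errs());
    return;
  }
  outs() << InfoOrErr->FileName << ':' << InfoOrErr->Line << '\n';
}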
checkFileCRC(StringRef Path, uint32_t CRCHash) { return CRCHash == llvm::crc32(arrayRefFromStringRef(MB.get()->getBuffer())); } -bool findDebugBinary(const std::string &OrigPath, - const std::string &DebuglinkName, uint32_t CRCHash, - const std::string &FallbackDebugPath, - std::string &Result) { - SmallString<16> OrigDir(OrigPath); - llvm::sys::path::remove_filename(OrigDir); - SmallString<16> DebugPath = OrigDir; - // Try relative/path/to/original_binary/debuglink_name - llvm::sys::path::append(DebugPath, DebuglinkName); - if (checkFileCRC(DebugPath, CRCHash)) { - Result = std::string(DebugPath.str()); - return true; - } - // Try relative/path/to/original_binary/.debug/debuglink_name - DebugPath = OrigDir; - llvm::sys::path::append(DebugPath, ".debug", DebuglinkName); - if (checkFileCRC(DebugPath, CRCHash)) { - Result = std::string(DebugPath.str()); - return true; - } - // Make the path absolute so that lookups will go to - // "/usr/lib/debug/full/path/to/debug", not - // "/usr/lib/debug/to/debug" - llvm::sys::fs::make_absolute(OrigDir); - if (!FallbackDebugPath.empty()) { - // Try /absolute/path/to/original_binary/debuglink_name - DebugPath = FallbackDebugPath; - } else { -#if defined(__NetBSD__) - // Try /usr/libdata/debug/absolute/path/to/original_binary/debuglink_name - DebugPath = "/usr/libdata/debug"; -#else - // Try /usr/lib/debug/absolute/path/to/original_binary/debuglink_name - DebugPath = "/usr/lib/debug"; -#endif - } - llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir), - DebuglinkName); - if (checkFileCRC(DebugPath, CRCHash)) { - Result = std::string(DebugPath.str()); - return true; - } - return false; -} - bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName, uint32_t &CRCHash) { if (!Obj) @@ -351,50 +342,6 @@ Optional<ArrayRef<uint8_t>> getBuildID(const ELFObjectFileBase *Obj) { return BuildID; } -bool findDebugBinary(const std::vector<std::string> &DebugFileDirectory, - const ArrayRef<uint8_t> BuildID, std::string &Result) { - auto getDebugPath = [&](StringRef Directory) { - SmallString<128> Path{Directory}; - sys::path::append(Path, ".build-id", - llvm::toHex(BuildID[0], /*LowerCase=*/true), - llvm::toHex(BuildID.slice(1), /*LowerCase=*/true)); - Path += ".debug"; - return Path; - }; - if (DebugFileDirectory.empty()) { - SmallString<128> Path = getDebugPath( -#if defined(__NetBSD__) - // Try /usr/libdata/debug/.build-id/../... - "/usr/libdata/debug" -#else - // Try /usr/lib/debug/.build-id/../... - "/usr/lib/debug" -#endif - ); - if (llvm::sys::fs::exists(Path)) { - Result = std::string(Path.str()); - return true; - } - } else { - for (const auto &Directory : DebugFileDirectory) { - // Try /.build-id/../... - SmallString<128> Path = getDebugPath(Directory); - if (llvm::sys::fs::exists(Path)) { - Result = std::string(Path.str()); - return true; - } - } - } - // Try debuginfod client cache and known servers.
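Both the deleted free function and the member version that replaces it (below) validate every candidate debug file by checksum, since .gnu_debuglink stores a CRC32 of the entire separate debug file. The check itself, restated as a standalone sketch:

// Accept a debuglink candidate only if its whole-file CRC32 matches.
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CRC.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
#include <memory>

static bool fileMatchesCRC(llvm::StringRef Path, uint32_t ExpectedCRC) {
  using namespace llvm;
  ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
      MemoryBuffer::getFileOrSTDIN(Path);
  if (!MB)
    return false; // unreadable candidates are simply skipped
  return ExpectedCRC == crc32(arrayRefFromStringRef((*MB)->getBuffer()));
}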
- Expected<std::string> PathOrErr = getCachedOrDownloadDebuginfo(BuildID); - if (!PathOrErr) { - consumeError(PathOrErr.takeError()); - return false; - } - Result = *PathOrErr; - return true; -} - } // end anonymous namespace ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath, @@ -437,8 +384,7 @@ ObjectFile *LLVMSymbolizer::lookUpDebuglinkObject(const std::string &Path, std::string DebugBinaryPath; if (!getGNUDebuglinkContents(Obj, DebuglinkName, CRCHash)) return nullptr; - if (!findDebugBinary(Path, DebuglinkName, CRCHash, Opts.FallbackDebugPath, - DebugBinaryPath)) + if (!findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath)) return nullptr; auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName); if (!DbgObjOrErr) { @@ -458,7 +404,7 @@ ObjectFile *LLVMSymbolizer::lookUpBuildIDObject(const std::string &Path, if (BuildID->size() < 2) return nullptr; std::string DebugBinaryPath; - if (!findDebugBinary(Opts.DebugFileDirectory, *BuildID, DebugBinaryPath)) + if (!getOrFindDebugBinary(*BuildID, DebugBinaryPath)) return nullptr; auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName); if (!DbgObjOrErr) { @@ -468,12 +414,97 @@ ObjectFile *LLVMSymbolizer::lookUpBuildIDObject(const std::string &Path, return DbgObjOrErr.get(); } +bool LLVMSymbolizer::findDebugBinary(const std::string &OrigPath, + const std::string &DebuglinkName, + uint32_t CRCHash, std::string &Result) { + SmallString<16> OrigDir(OrigPath); + llvm::sys::path::remove_filename(OrigDir); + SmallString<16> DebugPath = OrigDir; + // Try relative/path/to/original_binary/debuglink_name + llvm::sys::path::append(DebugPath, DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = std::string(DebugPath.str()); + return true; + } + // Try relative/path/to/original_binary/.debug/debuglink_name + DebugPath = OrigDir; + llvm::sys::path::append(DebugPath, ".debug", DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = std::string(DebugPath.str()); + return true; + } + // Make the path absolute so that lookups will go to + // "/usr/lib/debug/full/path/to/debug", not + // "/usr/lib/debug/to/debug" + llvm::sys::fs::make_absolute(OrigDir); + if (!Opts.FallbackDebugPath.empty()) { + // Try /absolute/path/to/original_binary/debuglink_name + DebugPath = Opts.FallbackDebugPath; + } else { +#if defined(__NetBSD__) + // Try /usr/libdata/debug/absolute/path/to/original_binary/debuglink_name + DebugPath = "/usr/libdata/debug"; +#else + // Try /usr/lib/debug/absolute/path/to/original_binary/debuglink_name + DebugPath = "/usr/lib/debug"; +#endif + } + llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir), + DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = std::string(DebugPath.str()); + return true; + } + return false; +} + +static StringRef getBuildIDStr(ArrayRef<uint8_t> BuildID) { + return StringRef(reinterpret_cast<const char *>(BuildID.data()), + BuildID.size()); +} + +bool LLVMSymbolizer::getOrFindDebugBinary(const ArrayRef<uint8_t> BuildID, + std::string &Result) { + StringRef BuildIDStr = getBuildIDStr(BuildID); + auto I = BuildIDPaths.find(BuildIDStr); + if (I != BuildIDPaths.end()) { + Result = I->second; + return true; + } + auto recordPath = [&](StringRef Path) { + Result = Path.str(); + auto InsertResult = BuildIDPaths.insert({BuildIDStr, Result}); + assert(InsertResult.second); + (void)InsertResult; + }; + + Optional<std::string> Path; + Path = LocalDIFetcher(Opts.DebugFileDirectory).fetchBuildID(BuildID); + if (Path) { + recordPath(*Path); + return true; + } + + // Try caller-provided debug
info fetchers. + for (const std::unique_ptr<DIFetcher> &Fetcher : DIFetchers) { + Path = Fetcher->fetchBuildID(BuildID); + if (Path) { + recordPath(*Path); + return true; + } + } + + return false; +} + Expected<LLVMSymbolizer::ObjectPair> LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, const std::string &ArchName) { auto I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName)); - if (I != ObjectPairForPathArch.end()) + if (I != ObjectPairForPathArch.end()) { + recordAccess(BinaryForPath.find(Path)->second); return I->second; + } auto ObjOrErr = getOrCreateObject(Path, ArchName); if (!ObjOrErr) { @@ -495,7 +526,12 @@ LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, if (!DbgObj) DbgObj = Obj; ObjectPair Res = std::make_pair(Obj, DbgObj); - ObjectPairForPathArch.emplace(std::make_pair(Path, ArchName), Res); + std::string DbgObjPath = DbgObj->getFileName().str(); + auto Pair = + ObjectPairForPathArch.emplace(std::make_pair(Path, ArchName), Res); + BinaryForPath.find(DbgObjPath)->second.pushEvictor([this, I = Pair.first]() { + ObjectPairForPathArch.erase(I); + }); return Res; } @@ -505,13 +541,19 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path, Binary *Bin; auto Pair = BinaryForPath.emplace(Path, OwningBinary<Binary>()); if (!Pair.second) { - Bin = Pair.first->second.getBinary(); + Bin = Pair.first->second->getBinary(); + recordAccess(Pair.first->second); } else { Expected<OwningBinary<Binary>> BinOrErr = createBinary(Path); if (!BinOrErr) return BinOrErr.takeError(); - Pair.first->second = std::move(BinOrErr.get()); - Bin = Pair.first->second.getBinary(); + + CachedBinary &CachedBin = Pair.first->second; + CachedBin = std::move(BinOrErr.get()); + CachedBin.pushEvictor([this, I = Pair.first]() { BinaryForPath.erase(I); }); + LRUBinaries.push_back(CachedBin); + CacheSize += CachedBin.size(); + Bin = CachedBin->getBinary(); } if (!Bin) @@ -530,8 +572,10 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path, return ObjOrErr.takeError(); } ObjectFile *Res = ObjOrErr->get(); - ObjectForUBPathAndArch.emplace(std::make_pair(Path, ArchName), - std::move(ObjOrErr.get())); + auto Pair = ObjectForUBPathAndArch.emplace(std::make_pair(Path, ArchName), + std::move(ObjOrErr.get())); + BinaryForPath.find(Path)->second.pushEvictor( + [this, Iter = Pair.first]() { ObjectForUBPathAndArch.erase(Iter); }); return Res; } if (Bin->isObject()) { @@ -559,10 +603,6 @@ LLVMSymbolizer::createModuleInfo(const ObjectFile *Obj, Expected<SymbolizableModule *> LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { - auto I = Modules.find(ModuleName); - if (I != Modules.end()) - return I->second.get(); - std::string BinaryName = ModuleName; std::string ArchName = Opts.DefaultArch; size_t ColonPos = ModuleName.find_last_of(':'); @@ -574,6 +614,13 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { ArchName = ArchStr; } } + + auto I = Modules.find(ModuleName); + if (I != Modules.end()) { + recordAccess(BinaryForPath.find(BinaryName)->second); + return I->second.get(); + } + auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName); if (!ObjectsOrErr) { // Failed to find valid object file.
@@ -608,7 +655,15 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { Context = DWARFContext::create( *Objects.second, DWARFContext::ProcessDebugRelocations::Process, nullptr, Opts.DWPName); - return createModuleInfo(Objects.first, std::move(Context), ModuleName); + auto ModuleOrErr = + createModuleInfo(Objects.first, std::move(Context), ModuleName); + if (ModuleOrErr) { + auto I = Modules.find(ModuleName); + BinaryForPath.find(BinaryName)->second.pushEvictor([this, I]() { + Modules.erase(I); + }); + } + return ModuleOrErr; } Expected<SymbolizableModule *> @@ -623,6 +678,17 @@ LLVMSymbolizer::getOrCreateModuleInfo(const ObjectFile &Obj) { return createModuleInfo(&Obj, std::move(Context), ObjName); } +Expected<SymbolizableModule *> +LLVMSymbolizer::getOrCreateModuleInfo(ArrayRef<uint8_t> BuildID) { + std::string Path; + if (!getOrFindDebugBinary(BuildID, Path)) { + return createStringError(errc::no_such_file_or_directory, + Twine("could not find build ID '") + + toHex(BuildID) + "'"); + } + return getOrCreateModuleInfo(Path); +} + namespace { // Undo these various manglings for Win32 extern "C" functions: @@ -680,5 +746,35 @@ LLVMSymbolizer::DemangleName(const std::string &Name, return Name; } +void LLVMSymbolizer::recordAccess(CachedBinary &Bin) { + if (Bin->getBinary()) + LRUBinaries.splice(LRUBinaries.end(), LRUBinaries, Bin.getIterator()); +} + +void LLVMSymbolizer::pruneCache() { + // Evict the LRU binary until the max cache size is reached or there's <= 1 + // item in the cache. The MRU binary is always kept to avoid thrashing if it's + // larger than the cache size. + while (CacheSize > Opts.MaxCacheSize && !LRUBinaries.empty() && + std::next(LRUBinaries.begin()) != LRUBinaries.end()) { + CachedBinary &Bin = LRUBinaries.front(); + CacheSize -= Bin.size(); + LRUBinaries.pop_front(); + Bin.evict(); + } +} + +void CachedBinary::pushEvictor(std::function<void()> NewEvictor) { + if (Evictor) { + this->Evictor = [OldEvictor = std::move(this->Evictor), + NewEvictor = std::move(NewEvictor)]() { + NewEvictor(); + OldEvictor(); + }; + } else { + this->Evictor = std::move(NewEvictor); + } +} + } // namespace symbolize } // namespace llvm diff --git a/llvm/lib/Debuginfod/DIFetcher.cpp b/llvm/lib/Debuginfod/DIFetcher.cpp new file mode 100644 index 000000000000..f0c134654534 --- /dev/null +++ b/llvm/lib/Debuginfod/DIFetcher.cpp @@ -0,0 +1,28 @@ +//===- llvm/DebugInfod/DIFetcher.cpp - Debug info fetcher -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines a DIFetcher implementation for obtaining debug info +/// from debuginfod.
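pushEvictor() composes cleanup callbacks so that evicting a cached binary also erases every cache entry derived from it, with the most recently pushed evictor running first. The composition pattern in isolation (a sketch, not the patch's actual class):

// Composing eviction callbacks, newest-first, as CachedBinary does.
#include <functional>
#include <iostream>
#include <utility>

class Cached {
  std::function<void()> Evictor;

public:
  void pushEvictor(std::function<void()> New) {
    if (Evictor)
      Evictor = [Old = std::move(Evictor), New = std::move(New)] {
        New(); // drop entries derived from this binary first
        Old(); // then the binary's own cache slot
      };
    else
      Evictor = std::move(New);
  }
  void evict() {
    if (Evictor)
      Evictor();
  }
};

int main() {
  Cached C;
  C.pushEvictor([] { std::cout << "erase BinaryForPath entry\n"; });
  C.pushEvictor([] { std::cout << "erase ObjectPairForPathArch entry\n"; });
  C.evict(); // prints the object-pair eviction first, then the binary's
}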
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/DIFetcher.h" + +#include "llvm/Debuginfod/Debuginfod.h" + +using namespace llvm; + +Optional<std::string> +DebuginfodDIFetcher::fetchBuildID(ArrayRef<uint8_t> BuildID) const { + Expected<std::string> PathOrErr = getCachedOrDownloadDebuginfo(BuildID); + if (PathOrErr) + return *PathOrErr; + consumeError(PathOrErr.takeError()); + return None; +} diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 27614572766d..7b1c36fdbe09 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -115,6 +115,41 @@ Expected<std::string> getCachedOrDownloadArtifact(StringRef UniqueKey, getDefaultDebuginfodTimeout()); } +namespace { + +/// A simple handler which streams the returned data to a cache file. The cache +/// file is only created if a 200 OK status is observed. +class StreamedHTTPResponseHandler : public HTTPResponseHandler { + using CreateStreamFn = + std::function<Expected<std::unique_ptr<CachedFileStream>>()>; + CreateStreamFn CreateStream; + HTTPClient &Client; + std::unique_ptr<CachedFileStream> FileStream; + +public: + StreamedHTTPResponseHandler(CreateStreamFn CreateStream, HTTPClient &Client) + : CreateStream(CreateStream), Client(Client) {} + virtual ~StreamedHTTPResponseHandler() = default; + + Error handleBodyChunk(StringRef BodyChunk) override; +}; + +} // namespace + +Error StreamedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { + if (!FileStream) { + if (Client.responseCode() != 200) + return Error::success(); + Expected<std::unique_ptr<CachedFileStream>> FileStreamOrError = + CreateStream(); + if (!FileStreamOrError) + return FileStreamOrError.takeError(); + FileStream = std::move(*FileStreamOrError); + } + *FileStream->OS << BodyChunk; + return Error::success(); +} + Expected<std::string> getCachedOrDownloadArtifact( StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath, ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout) { @@ -155,28 +190,18 @@ Expected<std::string> getCachedOrDownloadArtifact( SmallString<64> ArtifactUrl; sys::path::append(ArtifactUrl, sys::path::Style::posix, ServerUrl, UrlPath); - Expected<HTTPResponseBuffer> ResponseOrErr = Client.get(ArtifactUrl); - if (!ResponseOrErr) - return ResponseOrErr.takeError(); + // Perform the HTTP request and if successful, write the response body to + // the cache. + StreamedHTTPResponseHandler Handler([&]() { return CacheAddStream(Task); }, + Client); + HTTPRequest Request(ArtifactUrl); + Error Err = Client.perform(Request, Handler); + if (Err) + return std::move(Err); - HTTPResponseBuffer &Response = *ResponseOrErr; - if (Response.Code != 200) + if (Client.responseCode() != 200) continue; - // We have retrieved the artifact from this server, and now add it to the - // file cache. - Expected<std::unique_ptr<CachedFileStream>> FileStreamOrErr = - CacheAddStream(Task); - if (!FileStreamOrErr) - return FileStreamOrErr.takeError(); - std::unique_ptr<CachedFileStream> &FileStream = *FileStreamOrErr; - if (!Response.Body) - return createStringError( - errc::io_error, "Unallocated MemoryBuffer in HTTPResponseBuffer."); - - *FileStream->OS << StringRef(Response.Body->getBufferStart(), - Response.Body->getBufferSize()); - // Return the path to the artifact on disk.
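StreamedHTTPResponseHandler writes body chunks straight to the cache file and defers creating that file until a 200 status is seen, replacing the old buffer-the-whole-body approach. A sketch of another handler under the same (assumed) interface, accumulating into a string instead of a file:

// An HTTPResponseHandler that ignores non-200 bodies and buffers 200 bodies.
#include "llvm/Debuginfod/HTTPClient.h"
#include "llvm/Support/Error.h"
#include <string>

using namespace llvm;

class StringResponseHandler : public HTTPResponseHandler {
  HTTPClient &Client;
  std::string Body;

public:
  explicit StringResponseHandler(HTTPClient &Client) : Client(Client) {}

  Error handleBodyChunk(StringRef Chunk) override {
    // Mirrors StreamedHTTPResponseHandler: drop bodies of error responses.
    if (Client.responseCode() != 200)
      return Error::success();
    Body.append(Chunk.begin(), Chunk.end());
    return Error::success();
  }

  StringRef body() const { return Body; }
};

Client.perform(Request, Handler) drives the handler; once it returns, Client.responseCode() is available for the final status check, as the loop above does before caching.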
return std::string(AbsCachedArtifactPath); } diff --git a/llvm/lib/Debuginfod/HTTPClient.cpp b/llvm/lib/Debuginfod/HTTPClient.cpp index 65f457933b92..3376eaa7cd0d 100644 --- a/llvm/lib/Debuginfod/HTTPClient.cpp +++ b/llvm/lib/Debuginfod/HTTPClient.cpp @@ -7,9 +7,8 @@ //===----------------------------------------------------------------------===// /// /// \file -/// -/// This file defines the methods of the HTTPRequest, HTTPClient, and -/// BufferedHTTPResponseHandler classes. +/// This file defines the implementation of the HTTPClient library for issuing +/// HTTP requests and handling the responses. /// //===----------------------------------------------------------------------===// @@ -34,44 +33,6 @@ bool operator==(const HTTPRequest &A, const HTTPRequest &B) { HTTPResponseHandler::~HTTPResponseHandler() = default; -static inline bool parseContentLengthHeader(StringRef LineRef, - size_t &ContentLength) { - // Content-Length is a mandatory header, and the only one we handle. - return LineRef.consume_front("Content-Length: ") && - to_integer(LineRef.trim(), ContentLength, 10); -} - -Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { - if (ResponseBuffer.Body) - return Error::success(); - - size_t ContentLength; - if (parseContentLengthHeader(HeaderLine, ContentLength)) - ResponseBuffer.Body = - WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); - - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { - if (!ResponseBuffer.Body) - return createStringError(errc::io_error, - "Unallocated response buffer. HTTP Body data " - "received before Content-Length header."); - if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) - return createStringError(errc::io_error, - "Content size exceeds buffer size."); - memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), - BodyChunk.size()); - Offset += BodyChunk.size(); - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { - ResponseBuffer.Code = Code; - return Error::success(); -} - bool HTTPClient::IsInitialized = false; class HTTPClientCleanup { @@ -80,18 +41,6 @@ public: }; static const HTTPClientCleanup Cleanup; -Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { - BufferedHTTPResponseHandler Handler; - if (Error Err = perform(Request, Handler)) - return std::move(Err); - return std::move(Handler.ResponseBuffer); -} - -Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { - HTTPRequest Request(Url); - return perform(Request); -} - #ifdef LLVM_ENABLE_CURL bool HTTPClient::isAvailable() { return true; } @@ -128,18 +77,6 @@ struct CurlHTTPRequest { llvm::Error ErrorState = Error::success(); }; -static size_t curlHeaderFunction(char *Contents, size_t Size, size_t NMemb, - CurlHTTPRequest *CurlRequest) { - assert(Size == 1 && "The Size passed by libCURL to CURLOPT_HEADERFUNCTION " - "should always be 1."); - if (Error Err = - CurlRequest->Handler.handleHeaderLine(StringRef(Contents, NMemb))) { - CurlRequest->storeError(std::move(Err)); - return 0; - } - return NMemb; -} - static size_t curlWriteFunction(char *Contents, size_t Size, size_t NMemb, CurlHTTPRequest *CurlRequest) { Size *= NMemb; @@ -156,10 +93,10 @@ HTTPClient::HTTPClient() { "Must call HTTPClient::initialize() at the beginning of main()."); if (Curl) return; - assert((Curl = curl_easy_init()) && "Curl could not be initialized."); + Curl = curl_easy_init(); + assert(Curl && "Curl could not be initialized"); // Set the callback
hooks. curl_easy_setopt(Curl, CURLOPT_WRITEFUNCTION, curlWriteFunction); - curl_easy_setopt(Curl, CURLOPT_HEADERFUNCTION, curlHeaderFunction); } HTTPClient::~HTTPClient() { curl_easy_cleanup(Curl); } @@ -176,22 +113,19 @@ Error HTTPClient::perform(const HTTPRequest &Request, CurlHTTPRequest CurlRequest(Handler); curl_easy_setopt(Curl, CURLOPT_WRITEDATA, &CurlRequest); - curl_easy_setopt(Curl, CURLOPT_HEADERDATA, &CurlRequest); CURLcode CurlRes = curl_easy_perform(Curl); if (CurlRes != CURLE_OK) return joinErrors(std::move(CurlRequest.ErrorState), createStringError(errc::io_error, "curl_easy_perform() failed: %s\n", curl_easy_strerror(CurlRes))); - if (CurlRequest.ErrorState) - return std::move(CurlRequest.ErrorState); + return std::move(CurlRequest.ErrorState); +} - unsigned Code; +unsigned HTTPClient::responseCode() { + long Code = 0; curl_easy_getinfo(Curl, CURLINFO_RESPONSE_CODE, &Code); - if (Error Err = Handler.handleStatusCode(Code)) - return joinErrors(std::move(CurlRequest.ErrorState), std::move(Err)); - - return std::move(CurlRequest.ErrorState); + return Code; } #else @@ -213,4 +147,8 @@ Error HTTPClient::perform(const HTTPRequest &Request, llvm_unreachable("No HTTP Client implementation available."); } +unsigned HTTPClient::responseCode() { + llvm_unreachable("No HTTP Client implementation available."); +} + #endif diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp index 13aa2864c183..9d128424cabf 100644 --- a/llvm/lib/Demangle/Demangle.cpp +++ b/llvm/lib/Demangle/Demangle.cpp @@ -51,7 +51,7 @@ bool llvm::nonMicrosoftDemangle(const char *MangledName, std::string &Result) { if (isItaniumEncoding(MangledName)) Demangled = itaniumDemangle(MangledName, nullptr, nullptr, nullptr); else if (isRustEncoding(MangledName)) - Demangled = rustDemangle(MangledName, nullptr, nullptr, nullptr); + Demangled = rustDemangle(MangledName); else if (isDLangEncoding(MangledName)) Demangled = dlangDemangle(MangledName); diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 1a5db755e37b..1c9209d8f369 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -172,6 +172,50 @@ struct DumpVisitor { return printStr("TemplateParamKind::Template"); } } + void print(Node::Prec P) { + switch (P) { + case Node::Prec::Primary: + return printStr("Node::Prec::Primary"); + case Node::Prec::Postfix: + return printStr("Node::Prec::Postfix"); + case Node::Prec::Unary: + return printStr("Node::Prec::Unary"); + case Node::Prec::Cast: + return printStr("Node::Prec::Cast"); + case Node::Prec::PtrMem: + return printStr("Node::Prec::PtrMem"); + case Node::Prec::Multiplicative: + return printStr("Node::Prec::Multiplicative"); + case Node::Prec::Additive: + return printStr("Node::Prec::Additive"); + case Node::Prec::Shift: + return printStr("Node::Prec::Shift"); + case Node::Prec::Spaceship: + return printStr("Node::Prec::Spaceship"); + case Node::Prec::Relational: + return printStr("Node::Prec::Relational"); + case Node::Prec::Equality: + return printStr("Node::Prec::Equality"); + case Node::Prec::And: + return printStr("Node::Prec::And"); + case Node::Prec::Xor: + return printStr("Node::Prec::Xor"); + case Node::Prec::Ior: + return printStr("Node::Prec::Ior"); + case Node::Prec::AndIf: + return printStr("Node::Prec::AndIf"); + case Node::Prec::OrIf: + return printStr("Node::Prec::OrIf"); + case Node::Prec::Conditional: + return printStr("Node::Prec::Conditional"); + case Node::Prec::Assign: + return 
printStr("Node::Prec::Assign"); + case Node::Prec::Comma: + return printStr("Node::Prec::Comma"); + case Node::Prec::Default: + return printStr("Node::Prec::Default"); + } + } void newLine() { printStr("\n"); @@ -404,8 +448,8 @@ char *ItaniumPartialDemangler::getFunctionBaseName(char *Buf, size_t *N) const { case Node::KAbiTagAttr: Name = static_cast(Name)->Base; continue; - case Node::KStdQualifiedName: - Name = static_cast(Name)->Child; + case Node::KModuleEntity: + Name = static_cast(Name)->Name; continue; case Node::KNestedName: Name = static_cast(Name)->Name; @@ -445,10 +489,10 @@ char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf, break; } + if (Name->getKind() == Node::KModuleEntity) + Name = static_cast(Name)->Name; + switch (Name->getKind()) { - case Node::KStdQualifiedName: - OB += "std"; - break; case Node::KNestedName: static_cast(Name)->Qual->print(OB); break; @@ -550,8 +594,8 @@ bool ItaniumPartialDemangler::isCtorOrDtor() const { case Node::KNestedName: N = static_cast(N)->Name; break; - case Node::KStdQualifiedName: - N = static_cast(N)->Child; + case Node::KModuleEntity: + N = static_cast(N)->Name; break; } } diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index d8da3b48e25b..b4e98a20f389 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -245,8 +245,8 @@ demanglePointerCVQualifiers(StringView &MangledName) { } StringView Demangler::copyString(StringView Borrowed) { - char *Stable = Arena.allocUnalignedBuffer(Borrowed.size() + 1); - std::strcpy(Stable, Borrowed.begin()); + char *Stable = Arena.allocUnalignedBuffer(Borrowed.size()); + std::memcpy(Stable, Borrowed.begin(), Borrowed.size()); return {Stable, Borrowed.size()}; } @@ -823,11 +823,15 @@ SymbolNode *Demangler::parse(StringView &MangledName) { } TagTypeNode *Demangler::parseTagUniqueName(StringView &MangledName) { - if (!MangledName.consumeFront(".?A")) + if (!MangledName.consumeFront(".?A")) { + Error = true; return nullptr; + } MangledName.consumeFront(".?A"); - if (MangledName.empty()) + if (MangledName.empty()) { + Error = true; return nullptr; + } return demangleClassType(MangledName); } @@ -970,12 +974,9 @@ void Demangler::memorizeIdentifier(IdentifierNode *Identifier) { // FIXME: Propagate out-of-memory as an error? 
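
The Microsoft demangler changes here replace the '\0'-terminate-then-strcpy pattern with explicit (pointer, size) copies: copyString now memcpys exactly size() bytes and never relies on a terminator. A standalone illustration of why sized copies are preferable, using std::string_view as a stand-in for the demangler's StringView (illustration only, not the patch's code):

#include <cassert>
#include <cstring>
#include <string_view>

// Copy exactly Borrowed.size() bytes into arena storage. Embedded NULs
// survive, and there is no over-read hunting for a terminator.
static std::string_view copyString(char *Arena, std::string_view Borrowed) {
  std::memcpy(Arena, Borrowed.data(), Borrowed.size());
  return {Arena, Borrowed.size()};
}

int main() {
  char Arena[8];
  std::string_view In("a\0b", 3); // strcpy would stop at the embedded NUL.
  assert(copyString(Arena, In) == In);
}
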
std::terminate(); Identifier->output(OB, OF_Default); - OB << '\0'; - char *Name = OB.getBuffer(); - - StringView Owned = copyString(Name); + StringView Owned = copyString(OB); memorizeString(Owned); - std::free(Name); + std::free(OB.getBuffer()); } IdentifierNode * @@ -1279,7 +1280,6 @@ Demangler::demangleStringLiteral(StringView &MangledName) { bool IsWcharT = false; bool IsNegative = false; size_t CrcEndPos = 0; - char *ResultBuffer = nullptr; EncodedStringLiteralNode *Result = Arena.alloc(); @@ -1375,10 +1375,8 @@ Demangler::demangleStringLiteral(StringView &MangledName) { } } - OB << '\0'; - ResultBuffer = OB.getBuffer(); - Result->DecodedString = copyString(ResultBuffer); - std::free(ResultBuffer); + Result->DecodedString = copyString(OB); + std::free(OB.getBuffer()); return Result; StringLiteralError: @@ -1455,10 +1453,9 @@ Demangler::demangleLocallyScopedNamePiece(StringView &MangledName) { Scope->output(OB, OF_Default); OB << '\''; OB << "::`" << Number << "'"; - OB << '\0'; - char *Result = OB.getBuffer(); - Identifier->Name = copyString(Result); - std::free(Result); + + Identifier->Name = copyString(OB); + std::free(OB.getBuffer()); return Identifier; } @@ -2322,8 +2319,8 @@ void Demangler::dumpBackReferences() { TypeNode *T = Backrefs.FunctionParams[I]; T->output(OB, OF_Default); - std::printf(" [%d] - %.*s\n", (int)I, (int)OB.getCurrentPosition(), - OB.getBuffer()); + StringView B = OB; + std::printf(" [%d] - %.*s\n", (int)I, (int)B.size(), B.begin()); } std::free(OB.getBuffer()); diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index d07d05a08c55..494cdabad41f 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -121,8 +121,8 @@ std::string Node::toString(OutputFlags Flags) const { OutputBuffer OB; initializeOutputBuffer(nullptr, nullptr, OB, 1024); this->output(OB, Flags); - OB << '\0'; - std::string Owned(OB.getBuffer()); + StringView SV = OB; + std::string Owned(SV.begin(), SV.end()); std::free(OB.getBuffer()); return Owned; } diff --git a/llvm/lib/Demangle/RustDemangle.cpp b/llvm/lib/Demangle/RustDemangle.cpp index dcac0bd63859..32b10db2a968 100644 --- a/llvm/lib/Demangle/RustDemangle.cpp +++ b/llvm/lib/Demangle/RustDemangle.cpp @@ -24,8 +24,8 @@ using namespace llvm; using llvm::itanium_demangle::OutputBuffer; +using llvm::itanium_demangle::ScopedOverride; using llvm::itanium_demangle::StringView; -using llvm::itanium_demangle::SwapAndRestore; namespace { @@ -119,7 +119,7 @@ private: if (!Print) return; - SwapAndRestore SavePosition(Position, Position); + ScopedOverride SavePosition(Position, Position); Position = Backref; Demangler(); } @@ -147,57 +147,27 @@ private: } // namespace -char *llvm::rustDemangle(const char *MangledName, char *Buf, size_t *N, - int *Status) { - if (MangledName == nullptr || (Buf != nullptr && N == nullptr)) { - if (Status != nullptr) - *Status = demangle_invalid_args; +char *llvm::rustDemangle(const char *MangledName) { + if (MangledName == nullptr) return nullptr; - } // Return early if mangled name doesn't look like a Rust symbol. 
StringView Mangled(MangledName); - if (!Mangled.startsWith("_R")) { - if (Status != nullptr) - *Status = demangle_invalid_mangled_name; + if (!Mangled.startsWith("_R")) return nullptr; - } Demangler D; - if (!initializeOutputBuffer(nullptr, nullptr, D.Output, 1024)) { - if (Status != nullptr) - *Status = demangle_memory_alloc_failure; + if (!initializeOutputBuffer(nullptr, nullptr, D.Output, 1024)) return nullptr; - } if (!D.demangle(Mangled)) { - if (Status != nullptr) - *Status = demangle_invalid_mangled_name; std::free(D.Output.getBuffer()); return nullptr; } D.Output += '\0'; - char *Demangled = D.Output.getBuffer(); - size_t DemangledLen = D.Output.getCurrentPosition(); - - if (Buf != nullptr) { - if (DemangledLen <= *N) { - std::memcpy(Buf, Demangled, DemangledLen); - std::free(Demangled); - Demangled = Buf; - } else { - std::free(Buf); - } - } - - if (N != nullptr) - *N = DemangledLen; - - if (Status != nullptr) - *Status = demangle_success; - return Demangled; + return D.Output.getBuffer(); } Demangler::Demangler(size_t MaxRecursionLevel) @@ -241,7 +211,7 @@ bool Demangler::demangle(StringView Mangled) { demanglePath(IsInType::No); if (Position != Input.size()) { - SwapAndRestore SavePrint(Print, false); + ScopedOverride SavePrint(Print, false); demanglePath(IsInType::No); } @@ -279,7 +249,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) { Error = true; return false; } - SwapAndRestore SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); + ScopedOverride SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); switch (consume()) { case 'C': { @@ -380,7 +350,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) { // = [] // = "s" void Demangler::demangleImplPath(IsInType InType) { - SwapAndRestore SavePrint(Print, false); + ScopedOverride SavePrint(Print, false); parseOptionalBase62Number('s'); demanglePath(InType); } @@ -574,7 +544,7 @@ void Demangler::demangleType() { Error = true; return; } - SwapAndRestore SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); + ScopedOverride SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); size_t Start = Position; char C = consume(); @@ -657,7 +627,7 @@ void Demangler::demangleType() { // = "C" // | void Demangler::demangleFnSig() { - SwapAndRestore SaveBoundLifetimes(BoundLifetimes, BoundLifetimes); + ScopedOverride SaveBoundLifetimes(BoundLifetimes, BoundLifetimes); demangleOptionalBinder(); if (consumeIf('U')) @@ -699,7 +669,7 @@ void Demangler::demangleFnSig() { // = [] {} "E" void Demangler::demangleDynBounds() { - SwapAndRestore SaveBoundLifetimes(BoundLifetimes, BoundLifetimes); + ScopedOverride SaveBoundLifetimes(BoundLifetimes, BoundLifetimes); print("dyn "); demangleOptionalBinder(); for (size_t I = 0; !Error && !consumeIf('E'); ++I) { @@ -763,7 +733,7 @@ void Demangler::demangleConst() { Error = true; return; } - SwapAndRestore SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); + ScopedOverride SaveRecursionLevel(RecursionLevel, RecursionLevel + 1); char C = consume(); BasicType Type; diff --git a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp index 1fb37ce7c57c..29a623ebe449 100644 --- a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp +++ b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp @@ -13,6 +13,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" 
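
With the (Buf, N, Status) out-parameters gone, rustDemangle callers now simply receive a malloc'd string or nullptr. A minimal usage sketch; the mangled name is hand-assembled from the v0 grammar (crate root "mycrate", value item "example") and assumed valid:

#include "llvm/Demangle/Demangle.h"
#include <cstdio>
#include <cstdlib>

int main() {
  // v0 mangling: _R prefix, Nv (value-namespace path), C (crate root),
  // then <len><identifier> components.
  if (char *Demangled = llvm::rustDemangle("_RNvC7mycrate7example")) {
    std::puts(Demangled); // "mycrate::example"
    std::free(Demangled); // The caller owns the returned buffer.
  }
  return 0;
}
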
 #include <mutex>
 
@@ -70,7 +71,7 @@ LLVM_ATTRIBUTE_USED void requiredSymbolDefinitionsFromOrcTargetProcess() {
 }
 
 struct RegisteredObjectInfo {
-  RegisteredObjectInfo() {}
+  RegisteredObjectInfo() = default;
 
   RegisteredObjectInfo(std::size_t Size, jit_code_entry *Entry,
                        OwningBinary<ObjectFile> Obj)
@@ -96,7 +97,7 @@ class GDBJITRegistrationListener : public JITEventListener {
 public:
   /// Instantiates the JIT service.
-  GDBJITRegistrationListener() {}
+  GDBJITRegistrationListener() = default;
 
   /// Unregisters each object that was previously registered and releases all
   /// internal resources.
diff --git a/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h b/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
index fd7fa21df196..3dfe736dc5be 100644
--- a/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -37,7 +37,7 @@ class AllocaHolder {
   std::vector<void *> Allocations;
 
 public:
-  AllocaHolder() {}
+  AllocaHolder() = default;
 
   // Make this type move-only.
   AllocaHolder(AllocaHolder &&) = default;
diff --git a/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp b/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
new file mode 100644
index 000000000000..0fc366bf505f
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
@@ -0,0 +1,117 @@
+//===-------- JITLink_DWARFRecordSectionSplitter.cpp - JITLink-------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h"
+#include "llvm/Support/BinaryStreamReader.h"
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+
+DWARFRecordSectionSplitter::DWARFRecordSectionSplitter(StringRef SectionName)
+    : SectionName(SectionName) {}
+
+Error DWARFRecordSectionSplitter::operator()(LinkGraph &G) {
+  auto *Section = G.findSectionByName(SectionName);
+
+  if (!Section) {
+    LLVM_DEBUG({
+      dbgs() << "DWARFRecordSectionSplitter: No " << SectionName
+             << " section. Nothing to do\n";
+    });
+    return Error::success();
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "DWARFRecordSectionSplitter: Processing " << SectionName
+           << "...\n";
+  });
+
+  DenseMap<Block *, LinkGraph::SplitBlockCache> Caches;
+
+  {
+    // Pre-build the split caches.
+    for (auto *B : Section->blocks())
+      Caches[B] = LinkGraph::SplitBlockCache::value_type();
+    for (auto *Sym : Section->symbols())
+      Caches[&Sym->getBlock()]->push_back(Sym);
+    for (auto *B : Section->blocks())
+      llvm::sort(*Caches[B], [](const Symbol *LHS, const Symbol *RHS) {
+        return LHS->getOffset() > RHS->getOffset();
+      });
+  }
+
+  // Iterate over blocks (we do this by iterating over Caches entries rather
+  // than Section->blocks() as we will be inserting new blocks along the way,
+  // which would invalidate iterators in the latter sequence).
+  for (auto &KV : Caches) {
+    auto &B = *KV.first;
+    auto &BCache = KV.second;
+    if (auto Err = processBlock(G, B, BCache))
+      return Err;
+  }
+
+  return Error::success();
+}
+
+Error DWARFRecordSectionSplitter::processBlock(
+    LinkGraph &G, Block &B, LinkGraph::SplitBlockCache &Cache) {
+  LLVM_DEBUG(dbgs() << "  Processing block at " << B.getAddress() << "\n");
+
+  // Section should not contain zero-fill blocks.
+ if (B.isZeroFill()) + return make_error("Unexpected zero-fill block in " + + SectionName + " section"); + + if (B.getSize() == 0) { + LLVM_DEBUG(dbgs() << " Block is empty. Skipping.\n"); + return Error::success(); + } + + BinaryStreamReader BlockReader( + StringRef(B.getContent().data(), B.getContent().size()), + G.getEndianness()); + + while (true) { + uint64_t RecordStartOffset = BlockReader.getOffset(); + + LLVM_DEBUG({ + dbgs() << " Processing CFI record at " + << formatv("{0:x16}", B.getAddress()) << "\n"; + }); + + uint32_t Length; + if (auto Err = BlockReader.readInteger(Length)) + return Err; + if (Length != 0xffffffff) { + if (auto Err = BlockReader.skip(Length)) + return Err; + } else { + uint64_t ExtendedLength; + if (auto Err = BlockReader.readInteger(ExtendedLength)) + return Err; + if (auto Err = BlockReader.skip(ExtendedLength)) + return Err; + } + + // If this was the last block then there's nothing to split + if (BlockReader.empty()) { + LLVM_DEBUG(dbgs() << " Extracted " << B << "\n"); + return Error::success(); + } + + uint64_t BlockSize = BlockReader.getOffset() - RecordStartOffset; + auto &NewBlock = G.splitBlock(B, BlockSize); + (void)NewBlock; + LLVM_DEBUG(dbgs() << " Extracted " << NewBlock << "\n"); + } +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 2ae193595fc0..b1492cd74508 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -10,6 +10,7 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Config/config.h" +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h" #include "llvm/Support/DynamicLibrary.h" @@ -18,109 +19,13 @@ namespace llvm { namespace jitlink { -EHFrameSplitter::EHFrameSplitter(StringRef EHFrameSectionName) - : EHFrameSectionName(EHFrameSectionName) {} - -Error EHFrameSplitter::operator()(LinkGraph &G) { - auto *EHFrame = G.findSectionByName(EHFrameSectionName); - - if (!EHFrame) { - LLVM_DEBUG({ - dbgs() << "EHFrameSplitter: No " << EHFrameSectionName - << " section. Nothing to do\n"; - }); - return Error::success(); - } - - LLVM_DEBUG({ - dbgs() << "EHFrameSplitter: Processing " << EHFrameSectionName << "...\n"; - }); - - DenseMap Caches; - - { - // Pre-build the split caches. - for (auto *B : EHFrame->blocks()) - Caches[B] = LinkGraph::SplitBlockCache::value_type(); - for (auto *Sym : EHFrame->symbols()) - Caches[&Sym->getBlock()]->push_back(Sym); - for (auto *B : EHFrame->blocks()) - llvm::sort(*Caches[B], [](const Symbol *LHS, const Symbol *RHS) { - return LHS->getOffset() > RHS->getOffset(); - }); - } - - // Iterate over blocks (we do this by iterating over Caches entries rather - // than EHFrame->blocks() as we will be inserting new blocks along the way, - // which would invalidate iterators in the latter sequence. - for (auto &KV : Caches) { - auto &B = *KV.first; - auto &BCache = KV.second; - if (auto Err = processBlock(G, B, BCache)) - return Err; - } - - return Error::success(); -} - -Error EHFrameSplitter::processBlock(LinkGraph &G, Block &B, - LinkGraph::SplitBlockCache &Cache) { - LLVM_DEBUG(dbgs() << " Processing block at " << B.getAddress() << "\n"); - - // eh-frame should not contain zero-fill blocks. 
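
The generalized splitter above (extracted from the old EHFrameSplitter, which is deleted below) walks length-prefixed CFI records: a 4-byte initial length, with 0xffffffff escaping to an 8-byte extended length. A standalone sketch of that framing rule, assuming the record bytes are little-endian (illustration only):

#include <cstdint>
#include <cstring>

// Returns the total size in bytes of the CFI record starting at Data,
// or 0 if fewer than Avail bytes remain for the length fields.
static uint64_t cfiRecordSize(const uint8_t *Data, uint64_t Avail) {
  if (Avail < 4)
    return 0;
  uint32_t Length;
  std::memcpy(&Length, Data, 4);
  if (Length != 0xffffffff)
    return 4 + Length; // Initial length field plus payload.
  if (Avail < 12)
    return 0;
  uint64_t Extended;
  std::memcpy(&Extended, Data + 4, 8);
  return 12 + Extended; // Escape marker, extended length, payload.
}
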
- if (B.isZeroFill()) - return make_error("Unexpected zero-fill block in " + - EHFrameSectionName + " section"); - - if (B.getSize() == 0) { - LLVM_DEBUG(dbgs() << " Block is empty. Skipping.\n"); - return Error::success(); - } - - BinaryStreamReader BlockReader( - StringRef(B.getContent().data(), B.getContent().size()), - G.getEndianness()); - - while (true) { - uint64_t RecordStartOffset = BlockReader.getOffset(); - - LLVM_DEBUG({ - dbgs() << " Processing CFI record at " - << formatv("{0:x16}", B.getAddress()) << "\n"; - }); - - uint32_t Length; - if (auto Err = BlockReader.readInteger(Length)) - return Err; - if (Length != 0xffffffff) { - if (auto Err = BlockReader.skip(Length)) - return Err; - } else { - uint64_t ExtendedLength; - if (auto Err = BlockReader.readInteger(ExtendedLength)) - return Err; - if (auto Err = BlockReader.skip(ExtendedLength)) - return Err; - } - - // If this was the last block then there's nothing to split - if (BlockReader.empty()) { - LLVM_DEBUG(dbgs() << " Extracted " << B << "\n"); - return Error::success(); - } - - uint64_t BlockSize = BlockReader.getOffset() - RecordStartOffset; - auto &NewBlock = G.splitBlock(B, BlockSize); - (void)NewBlock; - LLVM_DEBUG(dbgs() << " Extracted " << NewBlock << "\n"); - } -} - EHFrameEdgeFixer::EHFrameEdgeFixer(StringRef EHFrameSectionName, - unsigned PointerSize, Edge::Kind Delta64, - Edge::Kind Delta32, Edge::Kind NegDelta32) + unsigned PointerSize, Edge::Kind Pointer32, + Edge::Kind Pointer64, Edge::Kind Delta32, + Edge::Kind Delta64, Edge::Kind NegDelta32) : EHFrameSectionName(EHFrameSectionName), PointerSize(PointerSize), - Delta64(Delta64), Delta32(Delta32), NegDelta32(NegDelta32) {} + Pointer32(Pointer32), Pointer64(Pointer64), Delta32(Delta32), + Delta64(Delta64), NegDelta32(NegDelta32) {} Error EHFrameEdgeFixer::operator()(LinkGraph &G) { auto *EHFrame = G.findSectionByName(EHFrameSectionName); @@ -147,7 +52,16 @@ Error EHFrameEdgeFixer::operator()(LinkGraph &G) { // Build a map of all blocks and symbols in the text sections. We will use // these for finding / building edge targets when processing FDEs. for (auto &Sec : G.sections()) { - PC.AddrToSyms.addSymbols(Sec.symbols()); + // Just record the most-canonical symbol (for eh-frame purposes) at each + // address. + for (auto *Sym : Sec.symbols()) { + auto &CurSym = PC.AddrToSym[Sym->getAddress()]; + if (!CurSym || (std::make_tuple(Sym->getLinkage(), Sym->getScope(), + !Sym->hasName(), Sym->getName()) < + std::make_tuple(CurSym->getLinkage(), CurSym->getScope(), + !CurSym->hasName(), CurSym->getName()))) + CurSym = Sym; + } if (auto Err = PC.AddrToBlock.addBlocks(Sec.blocks(), BlockAddressMap::includeNonNull)) return Err; @@ -172,10 +86,7 @@ Error EHFrameEdgeFixer::operator()(LinkGraph &G) { Error EHFrameEdgeFixer::processBlock(ParseContext &PC, Block &B) { - LLVM_DEBUG({ - dbgs() << " Processing block at " << formatv("{0:x16}", B.getAddress()) - << "\n"; - }); + LLVM_DEBUG(dbgs() << " Processing block at " << B.getAddress() << "\n"); // eh-frame should not contain zero-fill blocks. if (B.isZeroFill()) @@ -209,7 +120,7 @@ Error EHFrameEdgeFixer::processBlock(ParseContext &PC, Block &B) { LLVM_DEBUG({ dbgs() << " Processing CFI record at " - << formatv("{0:x16}", B.getAddress() + RecordStartOffset) << "\n"; + << (B.getAddress() + RecordStartOffset) << "\n"; }); // Get the record length. 
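
EHFrameEdgeFixer now pre-computes a single canonical symbol per address, ordering candidates by a (linkage, scope, unnamed?, name) tuple instead of scanning all symbols per lookup. A standalone sketch of that selection rule, with a simplified stand-in for jitlink::Symbol (illustration only):

#include <string>
#include <tuple>
#include <vector>

struct Sym { // Stand-in for jitlink::Symbol.
  int Linkage;      // Lower value = stronger linkage.
  int Scope;        // Lower value = more visible scope.
  std::string Name; // Empty for anonymous symbols.
};

// Smallest tuple wins: strong linkage first, then most-visible scope,
// then named-before-unnamed (false < true), then lexically least name.
static const Sym *pickCanonical(const std::vector<Sym> &AtAddr) {
  auto Key = [](const Sym &X) {
    return std::make_tuple(X.Linkage, X.Scope, X.Name.empty(), X.Name);
  };
  const Sym *Best = nullptr;
  for (const Sym &S : AtAddr)
    if (!Best || Key(S) < Key(*Best))
      Best = &S;
  return Best;
}
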
@@ -244,7 +155,7 @@ Error EHFrameEdgeFixer::processBlock(ParseContext &PC, Block &B) { if (CIEDelta == 0) { if (auto Err = processCIE(PC, B, RecordStartOffset, CIEDeltaFieldOffset + RecordRemaining, - CIEDeltaFieldOffset)) + CIEDeltaFieldOffset, BlockEdges)) return Err; } else { if (auto Err = processFDE(PC, B, RecordStartOffset, @@ -263,7 +174,8 @@ Error EHFrameEdgeFixer::processBlock(ParseContext &PC, Block &B) { Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B, size_t RecordOffset, size_t RecordLength, - size_t CIEDeltaFieldOffset) { + size_t CIEDeltaFieldOffset, + const BlockEdgeMap &BlockEdges) { LLVM_DEBUG(dbgs() << " Record is CIE\n"); @@ -301,10 +213,6 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B, uint64_t CodeAlignmentFactor = 0; if (auto Err = RecordReader.readULEB128(CodeAlignmentFactor)) return Err; - if (CodeAlignmentFactor != 1) - return make_error("Unsupported CIE code alignment factor " + - Twine(CodeAlignmentFactor) + - " (expected 1)"); } // Read and validate the data alignment factor. @@ -312,76 +220,65 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B, int64_t DataAlignmentFactor = 0; if (auto Err = RecordReader.readSLEB128(DataAlignmentFactor)) return Err; - if (DataAlignmentFactor != -8) - return make_error("Unsupported CIE data alignment factor " + - Twine(DataAlignmentFactor) + - " (expected -8)"); } // Skip the return address register field. if (auto Err = RecordReader.skip(1)) return Err; - uint64_t AugmentationDataLength = 0; - if (auto Err = RecordReader.readULEB128(AugmentationDataLength)) - return Err; + if (AugInfo->AugmentationDataPresent) { - uint32_t AugmentationDataStartOffset = RecordReader.getOffset(); + CIEInfo.AugmentationDataPresent = true; - uint8_t *NextField = &AugInfo->Fields[0]; - while (uint8_t Field = *NextField++) { - switch (Field) { - case 'L': { - CIEInfo.FDEsHaveLSDAField = true; - uint8_t LSDAPointerEncoding; - if (auto Err = RecordReader.readInteger(LSDAPointerEncoding)) - return Err; - if (!isSupportedPointerEncoding(LSDAPointerEncoding)) - return make_error( - "Unsupported LSDA pointer encoding " + - formatv("{0:x2}", LSDAPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CIESymbol.getAddress())); - CIEInfo.LSDAPointerEncoding = LSDAPointerEncoding; - break; - } - case 'P': { - uint8_t PersonalityPointerEncoding = 0; - if (auto Err = RecordReader.readInteger(PersonalityPointerEncoding)) - return Err; - if (PersonalityPointerEncoding != - (dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | - dwarf::DW_EH_PE_sdata4)) - return make_error( - "Unspported personality pointer " - "encoding " + - formatv("{0:x2}", PersonalityPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CIESymbol.getAddress())); - uint32_t PersonalityPointerAddress; - if (auto Err = RecordReader.readInteger(PersonalityPointerAddress)) - return Err; - break; - } - case 'R': { - uint8_t FDEPointerEncoding; - if (auto Err = RecordReader.readInteger(FDEPointerEncoding)) - return Err; - if (!isSupportedPointerEncoding(FDEPointerEncoding)) - return make_error( - "Unsupported FDE pointer encoding " + - formatv("{0:x2}", FDEPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CIESymbol.getAddress())); - CIEInfo.FDEPointerEncoding = FDEPointerEncoding; - break; - } - default: - llvm_unreachable("Invalid augmentation string field"); + uint64_t AugmentationDataLength = 0; + if (auto Err = RecordReader.readULEB128(AugmentationDataLength)) + return Err; + + uint32_t AugmentationDataStartOffset = RecordReader.getOffset(); + 
+ uint8_t *NextField = &AugInfo->Fields[0]; + while (uint8_t Field = *NextField++) { + switch (Field) { + case 'L': + CIEInfo.LSDAPresent = true; + if (auto PE = readPointerEncoding(RecordReader, B, "LSDA")) + CIEInfo.LSDAEncoding = *PE; + else + return PE.takeError(); + break; + case 'P': { + auto PersonalityPointerEncoding = + readPointerEncoding(RecordReader, B, "personality"); + if (!PersonalityPointerEncoding) + return PersonalityPointerEncoding.takeError(); + if (auto Err = + getOrCreateEncodedPointerEdge( + PC, BlockEdges, *PersonalityPointerEncoding, RecordReader, + B, RecordOffset + RecordReader.getOffset(), "personality") + .takeError()) + return Err; + break; + } + case 'R': + if (auto PE = readPointerEncoding(RecordReader, B, "address")) { + CIEInfo.AddressEncoding = *PE; + if (CIEInfo.AddressEncoding == dwarf::DW_EH_PE_omit) + return make_error( + "Invalid address encoding DW_EH_PE_omit in CIE at " + + formatv("{0:x}", (B.getAddress() + RecordOffset).getValue())); + } else + return PE.takeError(); + break; + default: + llvm_unreachable("Invalid augmentation string field"); + } } - } - if (RecordReader.getOffset() - AugmentationDataStartOffset > - AugmentationDataLength) - return make_error("Read past the end of the augmentation " - "data while parsing fields"); + if (RecordReader.getOffset() - AugmentationDataStartOffset > + AugmentationDataLength) + return make_error("Read past the end of the augmentation " + "data while parsing fields"); + } assert(!PC.CIEInfos.count(CIESymbol.getAddress()) && "Multiple CIEs recorded at the same address?"); @@ -394,7 +291,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, size_t RecordOffset, size_t RecordLength, size_t CIEDeltaFieldOffset, uint32_t CIEDelta, - BlockEdgeMap &BlockEdges) { + const BlockEdgeMap &BlockEdges) { LLVM_DEBUG(dbgs() << " Record is FDE\n"); orc::ExecutorAddr RecordAddress = B.getAddress() + RecordOffset; @@ -422,8 +319,8 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, LLVM_DEBUG({ dbgs() << " Adding edge at " - << formatv("{0:x16}", RecordAddress + CIEDeltaFieldOffset) - << " to CIE at: " << formatv("{0:x16}", CIEAddress) << "\n"; + << (RecordAddress + CIEDeltaFieldOffset) + << " to CIE at: " << CIEAddress << "\n"; }); if (auto CIEInfoOrErr = PC.findCIEInfo(CIEAddress)) CIEInfo = *CIEInfoOrErr; @@ -435,8 +332,8 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, } else { LLVM_DEBUG({ dbgs() << " Already has edge at " - << formatv("{0:x16}", RecordAddress + CIEDeltaFieldOffset) - << " to CIE at " << formatv("{0:x16}", CIEAddress) << "\n"; + << (RecordAddress + CIEDeltaFieldOffset) << " to CIE at " + << CIEAddress << "\n"; }); auto &EI = CIEEdgeItr->second; if (EI.Addend) @@ -451,107 +348,41 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, } } - { - // Process the PC-Begin field. 
- Block *PCBeginBlock = nullptr; - orc::ExecutorAddrDiff PCBeginFieldOffset = RecordReader.getOffset(); - auto PCEdgeItr = BlockEdges.find(RecordOffset + PCBeginFieldOffset); - if (PCEdgeItr == BlockEdges.end()) { - auto PCBeginPtrInfo = - readEncodedPointer(CIEInfo->FDEPointerEncoding, - RecordAddress + PCBeginFieldOffset, RecordReader); - if (!PCBeginPtrInfo) - return PCBeginPtrInfo.takeError(); - orc::ExecutorAddr PCBegin = PCBeginPtrInfo->first; - Edge::Kind PCBeginEdgeKind = PCBeginPtrInfo->second; - LLVM_DEBUG({ - dbgs() << " Adding edge at " - << (RecordAddress + PCBeginFieldOffset) << " to PC at " - << formatv("{0:x16}", PCBegin) << "\n"; - }); - auto PCBeginSym = getOrCreateSymbol(PC, PCBegin); - if (!PCBeginSym) - return PCBeginSym.takeError(); - B.addEdge(PCBeginEdgeKind, RecordOffset + PCBeginFieldOffset, *PCBeginSym, - 0); - PCBeginBlock = &PCBeginSym->getBlock(); - } else { - auto &EI = PCEdgeItr->second; - LLVM_DEBUG({ - dbgs() << " Already has edge at " - << formatv("{0:x16}", RecordAddress + PCBeginFieldOffset) - << " to PC at " << formatv("{0:x16}", EI.Target->getAddress()); - if (EI.Addend) - dbgs() << " + " << formatv("{0:x16}", EI.Addend); - dbgs() << "\n"; - }); - - // Make sure the existing edge points at a defined block. - if (!EI.Target->isDefined()) { - auto EdgeAddr = RecordAddress + PCBeginFieldOffset; - return make_error("FDE edge at " + - formatv("{0:x16}", EdgeAddr) + - " points at external block"); - } - PCBeginBlock = &EI.Target->getBlock(); - if (auto Err = RecordReader.skip( - getPointerEncodingDataSize(CIEInfo->FDEPointerEncoding))) - return Err; - } - + // Process the PC-Begin field. + LLVM_DEBUG({ + dbgs() << " Processing PC-begin at " + << (RecordAddress + RecordReader.getOffset()) << "\n"; + }); + if (auto PCBegin = getOrCreateEncodedPointerEdge( + PC, BlockEdges, CIEInfo->AddressEncoding, RecordReader, B, + RecordReader.getOffset(), "PC begin")) { + assert(*PCBegin && "PC-begin symbol not set"); // Add a keep-alive edge from the FDE target to the FDE to ensure that the // FDE is kept alive if its target is. - assert(PCBeginBlock && "PC-begin block not recorded"); LLVM_DEBUG({ dbgs() << " Adding keep-alive edge from target at " - << formatv("{0:x16}", PCBeginBlock->getAddress()) << " to FDE at " - << formatv("{0:x16}", RecordAddress) << "\n"; + << (*PCBegin)->getBlock().getAddress() << " to FDE at " + << RecordAddress << "\n"; }); - PCBeginBlock->addEdge(Edge::KeepAlive, 0, FDESymbol, 0); - } + (*PCBegin)->getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0); + } else + return PCBegin.takeError(); // Skip over the PC range size field. 
- if (auto Err = RecordReader.skip( - getPointerEncodingDataSize(CIEInfo->FDEPointerEncoding))) + if (auto Err = skipEncodedPointer(CIEInfo->AddressEncoding, RecordReader)) return Err; - if (CIEInfo->FDEsHaveLSDAField) { + if (CIEInfo->AugmentationDataPresent) { uint64_t AugmentationDataSize; if (auto Err = RecordReader.readULEB128(AugmentationDataSize)) return Err; - orc::ExecutorAddrDiff LSDAFieldOffset = RecordReader.getOffset(); - auto LSDAEdgeItr = BlockEdges.find(RecordOffset + LSDAFieldOffset); - if (LSDAEdgeItr == BlockEdges.end()) { - auto LSDAPointerInfo = - readEncodedPointer(CIEInfo->LSDAPointerEncoding, - RecordAddress + LSDAFieldOffset, RecordReader); - if (!LSDAPointerInfo) - return LSDAPointerInfo.takeError(); - orc::ExecutorAddr LSDA = LSDAPointerInfo->first; - Edge::Kind LSDAEdgeKind = LSDAPointerInfo->second; - auto LSDASym = getOrCreateSymbol(PC, LSDA); - if (!LSDASym) - return LSDASym.takeError(); - LLVM_DEBUG({ - dbgs() << " Adding edge at " - << formatv("{0:x16}", RecordAddress + LSDAFieldOffset) - << " to LSDA at " << formatv("{0:x16}", LSDA) << "\n"; - }); - B.addEdge(LSDAEdgeKind, RecordOffset + LSDAFieldOffset, *LSDASym, 0); - } else { - LLVM_DEBUG({ - auto &EI = LSDAEdgeItr->second; - dbgs() << " Already has edge at " - << formatv("{0:x16}", RecordAddress + LSDAFieldOffset) - << " to LSDA at " << formatv("{0:x16}", EI.Target->getAddress()); - if (EI.Addend) - dbgs() << " + " << formatv("{0:x16}", EI.Addend); - dbgs() << "\n"; - }); - if (auto Err = RecordReader.skip(AugmentationDataSize)) + if (CIEInfo->LSDAPresent) + if (auto Err = getOrCreateEncodedPointerEdge( + PC, BlockEdges, CIEInfo->LSDAEncoding, RecordReader, B, + RecordReader.getOffset(), "LSDA") + .takeError()) return Err; - } } else { LLVM_DEBUG(dbgs() << " Record does not have LSDA field.\n"); } @@ -600,129 +431,163 @@ EHFrameEdgeFixer::parseAugmentationString(BinaryStreamReader &RecordReader) { return std::move(AugInfo); } -bool EHFrameEdgeFixer::isSupportedPointerEncoding(uint8_t PointerEncoding) { +Expected EHFrameEdgeFixer::readPointerEncoding(BinaryStreamReader &R, + Block &InBlock, + const char *FieldName) { using namespace dwarf; - // We only support PC-rel for now. - if ((PointerEncoding & 0x70) != DW_EH_PE_pcrel) - return false; - - // readEncodedPointer does not handle indirect. - if (PointerEncoding & DW_EH_PE_indirect) - return false; + uint8_t PointerEncoding; + if (auto Err = R.readInteger(PointerEncoding)) + return std::move(Err); - // Supported datatypes. 
+ bool Supported = true; switch (PointerEncoding & 0xf) { - case DW_EH_PE_absptr: - case DW_EH_PE_udata4: - case DW_EH_PE_udata8: - case DW_EH_PE_sdata4: - case DW_EH_PE_sdata8: - return true; + case DW_EH_PE_uleb128: + case DW_EH_PE_udata2: + case DW_EH_PE_sleb128: + case DW_EH_PE_sdata2: + Supported = false; + break; + } + if (Supported) { + switch (PointerEncoding & 0x70) { + case DW_EH_PE_textrel: + case DW_EH_PE_datarel: + case DW_EH_PE_funcrel: + case DW_EH_PE_aligned: + Supported = false; + break; + } } - return false; + if (Supported) + return PointerEncoding; + + return make_error("Unsupported pointer encoding " + + formatv("{0:x2}", PointerEncoding) + " for " + + FieldName + "in CFI record at " + + formatv("{0:x16}", InBlock.getAddress())); } -unsigned EHFrameEdgeFixer::getPointerEncodingDataSize(uint8_t PointerEncoding) { +Error EHFrameEdgeFixer::skipEncodedPointer(uint8_t PointerEncoding, + BinaryStreamReader &RecordReader) { using namespace dwarf; - assert(isSupportedPointerEncoding(PointerEncoding) && - "Unsupported pointer encoding"); + // Switch absptr to corresponding udata encoding. + if ((PointerEncoding & 0xf) == DW_EH_PE_absptr) + PointerEncoding |= (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4; + switch (PointerEncoding & 0xf) { - case DW_EH_PE_absptr: - return PointerSize; case DW_EH_PE_udata4: case DW_EH_PE_sdata4: - return 4; + if (auto Err = RecordReader.skip(4)) + return Err; + break; case DW_EH_PE_udata8: case DW_EH_PE_sdata8: - return 8; + if (auto Err = RecordReader.skip(8)) + return Err; + break; default: - llvm_unreachable("Unsupported encoding"); + llvm_unreachable("Unrecognized encoding"); } + return Error::success(); } -Expected> -EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding, - orc::ExecutorAddr PointerFieldAddress, - BinaryStreamReader &RecordReader) { - assert(isSupportedPointerEncoding(PointerEncoding) && - "Unsupported pointer encoding"); - +Expected EHFrameEdgeFixer::getOrCreateEncodedPointerEdge( + ParseContext &PC, const BlockEdgeMap &BlockEdges, uint8_t PointerEncoding, + BinaryStreamReader &RecordReader, Block &BlockToFix, + size_t PointerFieldOffset, const char *FieldName) { using namespace dwarf; - // Isolate data type, remap absptr to udata4 or udata8. This relies on us - // having verified that the graph uses 32-bit or 64-bit pointers only at the - // start of this pass. - uint8_t EffectiveType = PointerEncoding & 0xf; - if (EffectiveType == DW_EH_PE_absptr) - EffectiveType = (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4; + if (PointerEncoding == DW_EH_PE_omit) + return nullptr; + + // If there's already an edge here then just skip the encoded pointer and + // return the edge's target. + { + auto EdgeI = BlockEdges.find(PointerFieldOffset); + if (EdgeI != BlockEdges.end()) { + LLVM_DEBUG({ + dbgs() << " Existing edge at " + << (BlockToFix.getAddress() + PointerFieldOffset) << " to " + << FieldName << " at " << EdgeI->second.Target->getAddress(); + if (EdgeI->second.Target->hasName()) + dbgs() << " (" << EdgeI->second.Target->getName() << ")"; + dbgs() << "\n"; + }); + if (auto Err = skipEncodedPointer(PointerEncoding, RecordReader)) + return std::move(Err); + return EdgeI->second.Target; + } + } + + // Switch absptr to corresponding udata encoding. + if ((PointerEncoding & 0xf) == DW_EH_PE_absptr) + PointerEncoding |= (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4; - orc::ExecutorAddr Addr; - Edge::Kind PointerEdgeKind = Edge::Invalid; - switch (EffectiveType) { + // We need to create an edge. 
Start by reading the field value. + uint64_t FieldValue; + bool Is64Bit = false; + switch (PointerEncoding & 0xf) { case DW_EH_PE_udata4: { uint32_t Val; if (auto Err = RecordReader.readInteger(Val)) return std::move(Err); - Addr = PointerFieldAddress + Val; - PointerEdgeKind = Delta32; - break; - } - case DW_EH_PE_udata8: { - uint64_t Val; - if (auto Err = RecordReader.readInteger(Val)) - return std::move(Err); - Addr = PointerFieldAddress + Val; - PointerEdgeKind = Delta64; + FieldValue = Val; break; } case DW_EH_PE_sdata4: { - int32_t Val; + uint32_t Val; if (auto Err = RecordReader.readInteger(Val)) return std::move(Err); - Addr = PointerFieldAddress + Val; - PointerEdgeKind = Delta32; + FieldValue = Val; break; } - case DW_EH_PE_sdata8: { - int64_t Val; - if (auto Err = RecordReader.readInteger(Val)) + case DW_EH_PE_udata8: + case DW_EH_PE_sdata8: + Is64Bit = true; + if (auto Err = RecordReader.readInteger(FieldValue)) return std::move(Err); - Addr = PointerFieldAddress + Val; - PointerEdgeKind = Delta64; break; - } + default: + llvm_unreachable("Unsupported encoding"); } - if (PointerEdgeKind == Edge::Invalid) - return make_error( - "Unspported edge kind for encoded pointer at " + - formatv("{0:x}", PointerFieldAddress)); + // Find the edge target and edge kind to use. + orc::ExecutorAddr Target; + Edge::Kind PtrEdgeKind = Edge::Invalid; + if ((PointerEncoding & 0x70) == DW_EH_PE_pcrel) { + Target = BlockToFix.getAddress() + PointerFieldOffset; + PtrEdgeKind = Is64Bit ? Delta64 : Delta32; + } else + PtrEdgeKind = Is64Bit ? Pointer64 : Pointer32; + Target += FieldValue; + + // Find or create a symbol to point the edge at. + auto TargetSym = getOrCreateSymbol(PC, Target); + if (!TargetSym) + return TargetSym.takeError(); + BlockToFix.addEdge(PtrEdgeKind, PointerFieldOffset, *TargetSym, 0); - return std::make_pair(Addr, Delta64); + LLVM_DEBUG({ + dbgs() << " Adding edge at " + << (BlockToFix.getAddress() + PointerFieldOffset) << " to " + << FieldName << " at " << TargetSym->getAddress(); + if (TargetSym->hasName()) + dbgs() << " (" << TargetSym->getName() << ")"; + dbgs() << "\n"; + }); + + return &*TargetSym; } Expected EHFrameEdgeFixer::getOrCreateSymbol(ParseContext &PC, orc::ExecutorAddr Addr) { - Symbol *CanonicalSym = nullptr; - - auto UpdateCanonicalSym = [&](Symbol *Sym) { - if (!CanonicalSym || Sym->getLinkage() < CanonicalSym->getLinkage() || - Sym->getScope() < CanonicalSym->getScope() || - (Sym->hasName() && !CanonicalSym->hasName()) || - Sym->getName() < CanonicalSym->getName()) - CanonicalSym = Sym; - }; - - if (auto *SymbolsAtAddr = PC.AddrToSyms.getSymbolsAt(Addr)) - for (auto *Sym : *SymbolsAtAddr) - UpdateCanonicalSym(Sym); - - // If we found an existing symbol at the given address then use it. - if (CanonicalSym) - return *CanonicalSym; + // See whether we have a canonical symbol for the given address already. + auto CanonicalSymI = PC.AddrToSym.find(Addr); + if (CanonicalSymI != PC.AddrToSym.end()) + return *CanonicalSymI->second; // Otherwise search for a block covering the address and create a new symbol. 
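
The getOrCreateEncodedPointerEdge path above accepts any DW_EH_PE combination it can express as an edge, replacing the old pcrel-only whitelist. A sketch of how the encoding byte decomposes under the forms this patch supports (the low nibble selects the value form, bits 0x70 the application, 0x80 the indirect flag):

#include "llvm/BinaryFormat/Dwarf.h"
#include <cstdint>

using namespace llvm::dwarf;

// Bytes occupied by the encoded value, after remapping absptr to the
// pointer-sized udata form exactly as the fixer does.
static unsigned encodedSize(uint8_t PE, unsigned PointerSize) {
  if ((PE & 0xf) == DW_EH_PE_absptr)
    PE |= (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4;
  switch (PE & 0xf) {
  case DW_EH_PE_udata4:
  case DW_EH_PE_sdata4:
    return 4;
  case DW_EH_PE_udata8:
  case DW_EH_PE_sdata8:
    return 8;
  default:
    return 0; // Form rejected by readPointerEncoding.
  }
}

// pcrel values become Delta32/Delta64 edges anchored at the field's own
// address; everything else becomes an absolute Pointer32/Pointer64 edge.
static bool isPCRel(uint8_t PE) { return (PE & 0x70) == DW_EH_PE_pcrel; }
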
auto *B = PC.AddrToBlock.getBlockCovering(Addr); @@ -730,7 +595,10 @@ Expected EHFrameEdgeFixer::getOrCreateSymbol(ParseContext &PC, return make_error("No symbol or block covering address " + formatv("{0:x16}", Addr)); - return PC.G.addAnonymousSymbol(*B, Addr - B->getAddress(), 0, false, false); + auto &S = + PC.G.addAnonymousSymbol(*B, Addr - B->getAddress(), 0, false, false); + PC.AddrToSym[S.getAddress()] = &S; + return S; } char EHFrameNullTerminator::NullTerminatorBlockContent[4] = {0, 0, 0, 0}; @@ -756,7 +624,7 @@ Error EHFrameNullTerminator::operator()(LinkGraph &G) { return Error::success(); } -EHFrameRegistrar::~EHFrameRegistrar() {} +EHFrameRegistrar::~EHFrameRegistrar() = default; Error InProcessEHFrameRegistrar::registerEHFrames( orc::ExecutorAddrRange EHFrameSection) { diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index ef4b47b9aa28..55cf7fc63ee7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -21,27 +21,16 @@ namespace llvm { namespace jitlink { -/// A LinkGraph pass that splits blocks in an eh-frame section into sub-blocks -/// representing individual eh-frames. -/// EHFrameSplitter should not be run without EHFrameEdgeFixer, which is -/// responsible for adding FDE-to-CIE edges. -class EHFrameSplitter { -public: - EHFrameSplitter(StringRef EHFrameSectionName); - Error operator()(LinkGraph &G); - -private: - Error processBlock(LinkGraph &G, Block &B, LinkGraph::SplitBlockCache &Cache); - - StringRef EHFrameSectionName; -}; - /// A LinkGraph pass that adds missing FDE-to-CIE, FDE-to-PC and FDE-to-LSDA /// edges. class EHFrameEdgeFixer { public: + /// Create an eh-frame edge fixer. + /// If a given edge-kind is not supported on the target architecture then + /// Edge::Invalid should be used. 
EHFrameEdgeFixer(StringRef EHFrameSectionName, unsigned PointerSize, - Edge::Kind Delta64, Edge::Kind Delta32, + Edge::Kind Pointer32, Edge::Kind Pointer64, + Edge::Kind Delta32, Edge::Kind Delta64, Edge::Kind NegDelta32); Error operator()(LinkGraph &G); @@ -57,9 +46,10 @@ private: CIEInformation() = default; CIEInformation(Symbol &CIESymbol) : CIESymbol(&CIESymbol) {} Symbol *CIESymbol = nullptr; - bool FDEsHaveLSDAField = false; - uint8_t FDEPointerEncoding = 0; - uint8_t LSDAPointerEncoding = 0; + bool AugmentationDataPresent = false; + bool LSDAPresent = false; + uint8_t LSDAEncoding = 0; + uint8_t AddressEncoding = 0; }; struct EdgeTarget { @@ -87,33 +77,38 @@ private: LinkGraph &G; CIEInfosMap CIEInfos; BlockAddressMap AddrToBlock; - SymbolAddressMap AddrToSyms; + DenseMap AddrToSym; }; Error processBlock(ParseContext &PC, Block &B); Error processCIE(ParseContext &PC, Block &B, size_t RecordOffset, - size_t RecordLength, size_t CIEDeltaFieldOffset); + size_t RecordLength, size_t CIEDeltaFieldOffset, + const BlockEdgeMap &BlockEdges); Error processFDE(ParseContext &PC, Block &B, size_t RecordOffset, size_t RecordLength, size_t CIEDeltaFieldOffset, - uint32_t CIEDelta, BlockEdgeMap &BlockEdges); + uint32_t CIEDelta, const BlockEdgeMap &BlockEdges); Expected parseAugmentationString(BinaryStreamReader &RecordReader); - static bool isSupportedPointerEncoding(uint8_t PointerEncoding); - unsigned getPointerEncodingDataSize(uint8_t PointerEncoding); - Expected> - readEncodedPointer(uint8_t PointerEncoding, - orc::ExecutorAddr PointerFieldAddress, - BinaryStreamReader &RecordReader); + Expected readPointerEncoding(BinaryStreamReader &RecordReader, + Block &InBlock, const char *FieldName); + Error skipEncodedPointer(uint8_t PointerEncoding, + BinaryStreamReader &RecordReader); + Expected getOrCreateEncodedPointerEdge( + ParseContext &PC, const BlockEdgeMap &BlockEdges, uint8_t PointerEncoding, + BinaryStreamReader &RecordReader, Block &BlockToFix, + size_t PointerFieldOffset, const char *FieldName); Expected getOrCreateSymbol(ParseContext &PC, orc::ExecutorAddr Addr); StringRef EHFrameSectionName; unsigned PointerSize; - Edge::Kind Delta64; + Edge::Kind Pointer32; + Edge::Kind Pointer64; Edge::Kind Delta32; + Edge::Kind Delta64; Edge::Kind NegDelta32; }; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.cpp index 2194a4fbf1f4..5a983c219627 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.cpp @@ -27,7 +27,7 @@ namespace jitlink { StringRef ELFLinkGraphBuilderBase::CommonSectionName(".common"); ArrayRef ELFLinkGraphBuilderBase::DwarfSectionNames = DWSecNames; -ELFLinkGraphBuilderBase::~ELFLinkGraphBuilderBase() {} +ELFLinkGraphBuilderBase::~ELFLinkGraphBuilderBase() = default; } // end namespace jitlink } // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp index dd3eb97c21a0..98da3f155c3e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp @@ -11,20 +11,21 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h" +#include "EHFrameSupportImpl.h" #include "ELFLinkGraphBuilder.h" #include "JITLinkGeneric.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include 
"llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/Object/ELFObjectFile.h" -#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Endian.h" #define DEBUG_TYPE "jitlink" using namespace llvm; using namespace llvm::jitlink; -namespace llvm { -namespace jitlink { +namespace { class ELFJITLinker_aarch64 : public JITLinker { friend class JITLinker; @@ -37,50 +38,77 @@ public: private: Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { - using namespace aarch64; - using namespace llvm::support; - - char *BlockWorkingMem = B.getAlreadyMutableContent().data(); - char *FixupPtr = BlockWorkingMem + E.getOffset(); - auto FixupAddress = B.getAddress() + E.getOffset(); - switch (E.getKind()) { - case aarch64::R_AARCH64_CALL26: { - assert((FixupAddress.getValue() & 0x3) == 0 && - "Call-inst is not 32-bit aligned"); - int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); - - if (static_cast(Value) & 0x3) - return make_error("Call target is not 32-bit aligned"); - - if (!isInt<28>(Value)) - return makeTargetOutOfRangeError(G, B, E); - - uint32_t RawInstr = *(little32_t *)FixupPtr; - assert((RawInstr & 0x7fffffff) == 0x14000000 && - "RawInstr isn't a B or BR immediate instruction"); - uint32_t Imm = (static_cast(Value) & ((1 << 28) - 1)) >> 2; - uint32_t FixedInstr = RawInstr | Imm; - *(little32_t *)FixupPtr = FixedInstr; - break; - } - } - return Error::success(); + return aarch64::applyFixup(G, B, E); } }; template class ELFLinkGraphBuilder_aarch64 : public ELFLinkGraphBuilder { private: - static Expected + enum ELFAArch64RelocationKind : Edge::Kind { + ELFCall26 = Edge::FirstRelocation, + ELFAdrPage21, + ELFAddAbs12, + ELFLdSt8Abs12, + ELFLdSt16Abs12, + ELFLdSt32Abs12, + ELFLdSt64Abs12, + ELFLdSt128Abs12, + ELFMovwAbsG0, + ELFMovwAbsG1, + ELFMovwAbsG2, + ELFMovwAbsG3, + ELFAbs64, + ELFPrel32, + ELFPrel64, + ELFAdrGOTPage21, + ELFLd64GOTLo12, + }; + + static Expected getRelocationKind(const uint32_t Type) { using namespace aarch64; switch (Type) { case ELF::R_AARCH64_CALL26: - return EdgeKind_aarch64::R_AARCH64_CALL26; + case ELF::R_AARCH64_JUMP26: + return ELFCall26; + case ELF::R_AARCH64_ADR_PREL_PG_HI21: + return ELFAdrPage21; + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + return ELFAddAbs12; + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + return ELFLdSt8Abs12; + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: + return ELFLdSt16Abs12; + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + return ELFLdSt32Abs12; + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + return ELFLdSt64Abs12; + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: + return ELFLdSt128Abs12; + case ELF::R_AARCH64_MOVW_UABS_G0_NC: + return ELFMovwAbsG0; + case ELF::R_AARCH64_MOVW_UABS_G1_NC: + return ELFMovwAbsG1; + case ELF::R_AARCH64_MOVW_UABS_G2_NC: + return ELFMovwAbsG2; + case ELF::R_AARCH64_MOVW_UABS_G3: + return ELFMovwAbsG3; + case ELF::R_AARCH64_ABS64: + return ELFAbs64; + case ELF::R_AARCH64_PREL32: + return ELFPrel32; + case ELF::R_AARCH64_PREL64: + return ELFPrel64; + case ELF::R_AARCH64_ADR_GOT_PAGE: + return ELFAdrGOTPage21; + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + return ELFLd64GOTLo12; } - return make_error("Unsupported aarch64 relocation:" + - formatv("{0:d}", Type)); + return make_error( + "Unsupported aarch64 relocation:" + formatv("{0:d}: ", Type) + + object::getELFRelocationTypeName(ELF::EM_AARCH64, Type)); } Error addRelocations() override { @@ -99,6 +127,7 @@ private: Error addSingleRelocation(const typename ELFT::Rela &Rel, const typename ELFT::Shdr &FixupSect, Block &BlockToFix) { + using 
support::ulittle32_t; using Base = ELFLinkGraphBuilder; uint32_t SymbolIndex = Rel.getSymbol(false); @@ -116,18 +145,159 @@ private: inconvertibleErrorCode()); uint32_t Type = Rel.getType(false); - Expected Kind = getRelocationKind(Type); - if (!Kind) - return Kind.takeError(); + Expected RelocKind = getRelocationKind(Type); + if (!RelocKind) + return RelocKind.takeError(); int64_t Addend = Rel.r_addend; orc::ExecutorAddr FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); - Edge GE(*Kind, Offset, *GraphSymbol, Addend); + + // Get a pointer to the fixup content. + const void *FixupContent = BlockToFix.getContent().data() + + (FixupAddress - BlockToFix.getAddress()); + + Edge::Kind Kind = Edge::Invalid; + + switch (*RelocKind) { + case ELFCall26: { + Kind = aarch64::Branch26; + break; + } + case ELFAdrPage21: { + Kind = aarch64::Page21; + break; + } + case ELFAddAbs12: { + Kind = aarch64::PageOffset12; + break; + } + case ELFLdSt8Abs12: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isLoadStoreImm12(Instr) || + aarch64::getPageOffset12Shift(Instr) != 0) + return make_error( + "R_AARCH64_LDST8_ABS_LO12_NC target is not a " + "LDRB/STRB (imm12) instruction"); + + Kind = aarch64::PageOffset12; + break; + } + case ELFLdSt16Abs12: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isLoadStoreImm12(Instr) || + aarch64::getPageOffset12Shift(Instr) != 1) + return make_error( + "R_AARCH64_LDST16_ABS_LO12_NC target is not a " + "LDRH/STRH (imm12) instruction"); + + Kind = aarch64::PageOffset12; + break; + } + case ELFLdSt32Abs12: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isLoadStoreImm12(Instr) || + aarch64::getPageOffset12Shift(Instr) != 2) + return make_error( + "R_AARCH64_LDST32_ABS_LO12_NC target is not a " + "LDR/STR (imm12, 32 bit) instruction"); + + Kind = aarch64::PageOffset12; + break; + } + case ELFLdSt64Abs12: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isLoadStoreImm12(Instr) || + aarch64::getPageOffset12Shift(Instr) != 3) + return make_error( + "R_AARCH64_LDST64_ABS_LO12_NC target is not a " + "LDR/STR (imm12, 64 bit) instruction"); + + Kind = aarch64::PageOffset12; + break; + } + case ELFLdSt128Abs12: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isLoadStoreImm12(Instr) || + aarch64::getPageOffset12Shift(Instr) != 4) + return make_error( + "R_AARCH64_LDST128_ABS_LO12_NC target is not a " + "LDR/STR (imm12, 128 bit) instruction"); + + Kind = aarch64::PageOffset12; + break; + } + case ELFMovwAbsG0: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isMoveWideImm16(Instr) || + aarch64::getMoveWide16Shift(Instr) != 0) + return make_error( + "R_AARCH64_MOVW_UABS_G0_NC target is not a " + "MOVK/MOVZ (imm16, LSL #0) instruction"); + + Kind = aarch64::MoveWide16; + break; + } + case ELFMovwAbsG1: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isMoveWideImm16(Instr) || + aarch64::getMoveWide16Shift(Instr) != 16) + return make_error( + "R_AARCH64_MOVW_UABS_G1_NC target is not a " + "MOVK/MOVZ (imm16, LSL #16) instruction"); + + Kind = aarch64::MoveWide16; + break; + } + case ELFMovwAbsG2: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isMoveWideImm16(Instr) || + aarch64::getMoveWide16Shift(Instr) != 32) + return make_error( + "R_AARCH64_MOVW_UABS_G2_NC target is not a " + "MOVK/MOVZ (imm16, LSL #32) instruction"); 
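
Each R_AARCH64_MOVW_UABS_G<n> case in this switch checks that the target MOVZ/MOVK instruction's shift matches the relocation's 16-bit chunk. A sketch of the hw-field extraction involved (hw sits at bits 21-22 per the Arm ARM; this patch's aarch64::getMoveWide16Shift is assumed to compute the equivalent):

#include <cassert>
#include <cstdint>

// MOVZ/MOVK (wide immediate): hw selects which 16-bit chunk imm16
// lands in, i.e. a left shift of hw * 16.
static unsigned moveWide16Shift(uint32_t Instr) {
  return ((Instr >> 21) & 0x3) * 16;
}

int main() {
  // "movz x0, #0x1234, lsl #16" encodes as 0xd2a24680.
  assert(moveWide16Shift(0xd2a24680) == 16);
}
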
+ + Kind = aarch64::MoveWide16; + break; + } + case ELFMovwAbsG3: { + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if (!aarch64::isMoveWideImm16(Instr) || + aarch64::getMoveWide16Shift(Instr) != 48) + return make_error( + "R_AARCH64_MOVW_UABS_G3 target is not a " + "MOVK/MOVZ (imm16, LSL #48) instruction"); + + Kind = aarch64::MoveWide16; + break; + } + case ELFAbs64: { + Kind = aarch64::Pointer64; + break; + } + case ELFPrel32: { + Kind = aarch64::Delta32; + break; + } + case ELFPrel64: { + Kind = aarch64::Delta64; + break; + } + case ELFAdrGOTPage21: { + Kind = aarch64::GOTPage21; + break; + } + case ELFLd64GOTLo12: { + Kind = aarch64::GOTPageOffset12; + break; + } + }; + + Edge GE(Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), BlockToFix, GE, aarch64::getEdgeKindName(*Kind)); + printEdge(dbgs(), BlockToFix, GE, aarch64::getEdgeKindName(Kind)); dbgs() << "\n"; }); @@ -135,6 +305,48 @@ private: return Error::success(); } + /// Return the string name of the given ELF aarch64 edge kind. + const char *getELFAArch64RelocationKindName(Edge::Kind R) { + switch (R) { + case ELFCall26: + return "ELFCall26"; + case ELFAdrPage21: + return "ELFAdrPage21"; + case ELFAddAbs12: + return "ELFAddAbs12"; + case ELFLdSt8Abs12: + return "ELFLdSt8Abs12"; + case ELFLdSt16Abs12: + return "ELFLdSt16Abs12"; + case ELFLdSt32Abs12: + return "ELFLdSt32Abs12"; + case ELFLdSt64Abs12: + return "ELFLdSt64Abs12"; + case ELFLdSt128Abs12: + return "ELFLdSt128Abs12"; + case ELFMovwAbsG0: + return "ELFMovwAbsG0"; + case ELFMovwAbsG1: + return "ELFMovwAbsG1"; + case ELFMovwAbsG2: + return "ELFMovwAbsG2"; + case ELFMovwAbsG3: + return "ELFMovwAbsG3"; + case ELFAbs64: + return "ELFAbs64"; + case ELFPrel32: + return "ELFPrel32"; + case ELFPrel64: + return "ELFPrel64"; + case ELFAdrGOTPage21: + return "ELFAdrGOTPage21"; + case ELFLd64GOTLo12: + return "ELFLd64GOTLo12"; + default: + return getGenericEdgeKindName(static_cast(R)); + } + } + public: ELFLinkGraphBuilder_aarch64(StringRef FileName, const object::ELFFile &Obj, const Triple T) @@ -142,6 +354,20 @@ public: aarch64::getEdgeKindName) {} }; +Error buildTables_ELF_aarch64(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + + aarch64::GOTTableManager GOT; + aarch64::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} + +} // namespace + +namespace llvm { +namespace jitlink { + Expected> createLinkGraphFromELFObject_aarch64(MemoryBufferRef ObjectBuffer) { LLVM_DEBUG({ @@ -168,11 +394,22 @@ void link_ELF_aarch64(std::unique_ptr G, PassConfiguration Config; const Triple &TT = G->getTargetTriple(); if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add eh-frame passses. + Config.PrePrunePasses.push_back(DWARFRecordSectionSplitter(".eh_frame")); + Config.PrePrunePasses.push_back(EHFrameEdgeFixer( + ".eh_frame", 8, aarch64::Pointer32, aarch64::Pointer64, + aarch64::Delta32, aarch64::Delta64, aarch64::NegDelta32)); + + // Add a mark-live pass. if (auto MarkLive = Ctx->getMarkLivePass(TT)) Config.PrePrunePasses.push_back(std::move(MarkLive)); else Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs build pass. 
+ Config.PostPrunePasses.push_back(buildTables_ELF_aarch64); } + if (auto Err = Ctx->modifyPassConfig(*G, Config)) return Ctx->notifyFailed(std::move(Err)); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index f83001417e94..197ab71f5274 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -160,23 +160,16 @@ static Expected getRISCVPCRelHi20(const Edge &E) { } static uint32_t extractBits(uint32_t Num, unsigned Low, unsigned Size) { - return (Num & (((1ULL << (Size + 1)) - 1) << Low)) >> Low; + return (Num & (((1ULL << Size) - 1) << Low)) >> Low; } -inline Error checkAlignment(llvm::orc::ExecutorAddr loc, uint64_t v, int n, - const Edge &E) { - if (v & (n - 1)) - return make_error("0x" + llvm::utohexstr(loc.getValue()) + - " improper alignment for relocation " + - formatv("{0:d}", E.getKind()) + ": 0x" + - llvm::utohexstr(v) + " is not aligned to " + - Twine(n) + " bytes"); - return Error::success(); +static inline bool isAlignmentCorrect(uint64_t Value, int N) { + return (Value & (N - 1)) ? false : true; } -static inline bool isInRangeForImmS32(int64_t Value) { - return (Value >= std::numeric_limits::min() && - Value <= std::numeric_limits::max()); +// Requires 0 < N <= 64. +static inline bool isInRangeForImm(int64_t Value, int N) { + return Value == llvm::SignExtend64(Value, N); } class ELFJITLinker_riscv : public JITLinker { @@ -208,23 +201,36 @@ private: } case R_RISCV_BRANCH: { int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; - Error AlignmentIssue = checkAlignment(FixupAddress, Value, 2, E); - if (AlignmentIssue) { - return AlignmentIssue; - } - int64_t Lo = Value & 0xFFF; - uint32_t Imm31_25 = extractBits(Lo, 5, 6) << 25 | extractBits(Lo, 12, 1) - << 31; - uint32_t Imm11_7 = extractBits(Lo, 1, 4) << 8 | extractBits(Lo, 11, 1) - << 7; + if (LLVM_UNLIKELY(!isInRangeForImm(Value >> 1, 12))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + uint32_t Imm31_25 = + extractBits(Value, 5, 6) << 25 | extractBits(Value, 12, 1) << 31; + uint32_t Imm11_7 = + extractBits(Value, 1, 4) << 8 | extractBits(Value, 11, 1) << 7; uint32_t RawInstr = *(little32_t *)FixupPtr; *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; break; } + case R_RISCV_JAL: { + int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; + if (LLVM_UNLIKELY(!isInRangeForImm(Value >> 1, 20))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + uint32_t Imm20 = extractBits(Value, 20, 1) << 31; + uint32_t Imm10_1 = extractBits(Value, 1, 10) << 21; + uint32_t Imm11 = extractBits(Value, 11, 1) << 20; + uint32_t Imm19_12 = extractBits(Value, 12, 8) << 12; + uint32_t RawInstr = *(little32_t *)FixupPtr; + *(little32_t *)FixupPtr = RawInstr | Imm20 | Imm10_1 | Imm11 | Imm19_12; + break; + } case R_RISCV_HI20: { int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); int64_t Hi = Value + 0x800; - if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + if (LLVM_UNLIKELY(!isInRangeForImm(Hi, 32))) return makeTargetOutOfRangeError(G, B, E); uint32_t RawInstr = *(little32_t *)FixupPtr; *(little32_t *)FixupPtr = @@ -244,7 +250,7 @@ private: case R_RISCV_CALL: { int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; int64_t Hi = 
Value + 0x800; - if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + if (LLVM_UNLIKELY(!isInRangeForImm(Hi, 32))) return makeTargetOutOfRangeError(G, B, E); int32_t Lo = Value & 0xFFF; uint32_t RawInstrAuipc = *(little32_t *)FixupPtr; @@ -258,7 +264,7 @@ private: case R_RISCV_PCREL_HI20: { int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; int64_t Hi = Value + 0x800; - if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + if (LLVM_UNLIKELY(!isInRangeForImm(Hi, 32))) return makeTargetOutOfRangeError(G, B, E); uint32_t RawInstr = *(little32_t *)FixupPtr; *(little32_t *)FixupPtr = @@ -359,6 +365,13 @@ private: *FixupPtr = static_cast(Value); break; } + case R_RISCV_SUB6: { + int64_t Value = + *(reinterpret_cast(FixupAddress.getValue())) & 0x3f; + Value -= E.getTarget().getAddress().getValue() - E.getAddend(); + *FixupPtr = (*FixupPtr & 0xc0) | (static_cast(Value) & 0x3f); + break; + } case R_RISCV_SET6: { int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); uint32_t RawData = *(little32_t *)FixupPtr; @@ -410,6 +423,8 @@ private: return EdgeKind_riscv::R_RISCV_64; case ELF::R_RISCV_BRANCH: return EdgeKind_riscv::R_RISCV_BRANCH; + case ELF::R_RISCV_JAL: + return EdgeKind_riscv::R_RISCV_JAL; case ELF::R_RISCV_HI20: return EdgeKind_riscv::R_RISCV_HI20; case ELF::R_RISCV_LO12_I: @@ -442,6 +457,8 @@ private: return EdgeKind_riscv::R_RISCV_SUB16; case ELF::R_RISCV_SUB8: return EdgeKind_riscv::R_RISCV_SUB8; + case ELF::R_RISCV_SUB6: + return EdgeKind_riscv::R_RISCV_SUB6; case ELF::R_RISCV_SET6: return EdgeKind_riscv::R_RISCV_SET6; case ELF::R_RISCV_SET8: @@ -454,8 +471,9 @@ private: return EdgeKind_riscv::R_RISCV_32_PCREL; } - return make_error("Unsupported riscv relocation:" + - formatv("{0:d}", Type)); + return make_error( + "Unsupported riscv relocation:" + formatv("{0:d}: ", Type) + + object::getELFRelocationTypeName(ELF::EM_RISCV, Type)); } Error addRelocations() override { diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 79d2cdbb30f1..8f21274bd1a3 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/ExecutionEngine/JITLink/TableManager.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" @@ -96,17 +97,6 @@ Error buildTables_ELF_x86_64(LinkGraph &G) { } } // namespace -static const char *getELFX86_64RelocName(uint32_t Type) { - switch (Type) { -#define ELF_RELOC(Name, Number) \ - case Number: \ - return #Name; -#include "llvm/BinaryFormat/ELFRelocs/x86_64.def" -#undef ELF_RELOC - } - return "Unrecognized ELF/x86-64 relocation type"; -} - namespace llvm { namespace jitlink { @@ -145,9 +135,9 @@ private: case ELF::R_X86_64_TLSGD: return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32TLV; } - return make_error("Unsupported x86-64 relocation type " + - formatv("{0:d}: ", Type) + - getELFX86_64RelocName(Type)); + return make_error( + "Unsupported x86-64 relocation type " + formatv("{0:d}: ", Type) + + object::getELFRelocationTypeName(ELF::EM_X86_64, Type)); } Error addRelocations() override { @@ -379,10 +369,10 @@ void link_ELF_x86_64(std::unique_ptr G, if (Ctx->shouldAddDefaultTargetPasses(G->getTargetTriple())) { - 
Config.PrePrunePasses.push_back(EHFrameSplitter(".eh_frame")); - Config.PrePrunePasses.push_back( - EHFrameEdgeFixer(".eh_frame", x86_64::PointerSize, x86_64::Delta64, - x86_64::Delta32, x86_64::NegDelta32)); + Config.PrePrunePasses.push_back(DWARFRecordSectionSplitter(".eh_frame")); + Config.PrePrunePasses.push_back(EHFrameEdgeFixer( + ".eh_frame", x86_64::PointerSize, x86_64::Pointer32, x86_64::Pointer64, + x86_64::Delta32, x86_64::Delta64, x86_64::NegDelta32)); Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame")); // Construct a JITLinker and run the link function. diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 78a603cfed17..43efe0725cfe 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -336,7 +336,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LF) { void JITLinkAsyncLookupContinuation::anchor() {} -JITLinkContext::~JITLinkContext() {} +JITLinkContext::~JITLinkContext() = default; bool JITLinkContext::shouldAddDefaultTargetPasses(const Triple &TT) const { return true; @@ -393,6 +393,15 @@ Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B, return make_error(std::move(ErrMsg)); } +Error makeAlignmentError(llvm::orc::ExecutorAddr Loc, uint64_t Value, int N, + const Edge &E) { + return make_error("0x" + llvm::utohexstr(Loc.getValue()) + + " improper alignment for relocation " + + formatv("{0:d}", E.getKind()) + ": 0x" + + llvm::utohexstr(Value) + + " is not aligned to " + Twine(N) + " bytes"); +} + Expected> createLinkGraphFromObject(MemoryBufferRef ObjectBuffer) { auto Magic = identify_magic(ObjectBuffer.getBuffer()); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 35ee050c8566..6d321a080829 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -20,7 +20,7 @@ namespace llvm { namespace jitlink { -JITLinkerBase::~JITLinkerBase() {} +JITLinkerBase::~JITLinkerBase() = default; void JITLinkerBase::linkPhase1(std::unique_ptr Self) { diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 9315ac4f6120..acb759d6ce79 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -211,7 +211,7 @@ SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, const JITLinkDylib *JD, SimpleSegmentAlloc::SimpleSegmentAlloc(SimpleSegmentAlloc &&) = default; SimpleSegmentAlloc & SimpleSegmentAlloc::operator=(SimpleSegmentAlloc &&) = default; -SimpleSegmentAlloc::~SimpleSegmentAlloc() {} +SimpleSegmentAlloc::~SimpleSegmentAlloc() = default; SimpleSegmentAlloc::SegmentInfo SimpleSegmentAlloc::getSegInfo(AllocGroup AG) { auto I = ContentBlocks.find(AG); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index 62574604458c..1bf12f438be0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -19,7 +19,7 @@ static const char *CommonSectionName = "__common"; namespace llvm { namespace jitlink { -MachOLinkGraphBuilder::~MachOLinkGraphBuilder() {} +MachOLinkGraphBuilder::~MachOLinkGraphBuilder() = default; Expected> MachOLinkGraphBuilder::buildGraph() { @@ -368,7 +368,7 @@ 
Error MachOLinkGraphBuilder::graphifyRegularSymbols() { Twine(KV.first)); NSym.GraphSymbol = &G->addAbsoluteSymbol( *NSym.Name, orc::ExecutorAddr(NSym.Value), 0, Linkage::Strong, - Scope::Default, NSym.Desc & MachO::N_NO_DEAD_STRIP); + getScope(*NSym.Name, NSym.Type), NSym.Desc & MachO::N_NO_DEAD_STRIP); break; case MachO::N_SECT: SecIndexToSymbols[NSym.Sect - 1].push_back(&NSym); @@ -644,17 +644,27 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( // Scan section for null characters. for (size_t I = 0; I != NSec.Size; ++I) if (NSec.Data[I] == '\0') { - orc::ExecutorAddrDiff BlockEnd = I + 1; - size_t BlockSize = BlockEnd - BlockStart; + size_t BlockSize = I + 1 - BlockStart; // Create a block for this null terminated string. auto &B = G->createContentBlock(*NSec.GraphSection, {NSec.Data + BlockStart, BlockSize}, - NSec.Address + BlockStart, 1, 0); + NSec.Address + BlockStart, NSec.Alignment, + BlockStart % NSec.Alignment); LLVM_DEBUG({ - dbgs() << " Created block " << formatv("{0:x}", B.getAddress()) - << " -- " << formatv("{0:x}", B.getAddress() + B.getSize()) - << " for \"" << StringRef(B.getContent().data()) << "\"\n"; + dbgs() << " Created block " << B.getRange() + << ", align = " << B.getAlignment() + << ", align-ofs = " << B.getAlignmentOffset() << " for \""; + for (size_t J = 0; J != std::min(B.getSize(), size_t(16)); ++J) + switch (B.getContent()[J]) { + case '\0': break; + case '\n': dbgs() << "\\n"; break; + case '\t': dbgs() << "\\t"; break; + default: dbgs() << B.getContent()[J]; break; + } + if (B.getSize() > 16) + dbgs() << "..."; + dbgs() << "\"\n"; }); // If there's no symbol at the start of this block then create one. @@ -663,15 +673,13 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( auto &S = G->addAnonymousSymbol(B, 0, BlockSize, false, false); setCanonicalSymbol(NSec, S); LLVM_DEBUG({ - dbgs() << " Adding anonymous symbol for c-string block " - << formatv("{0:x16} -- {1:x16}", S.getAddress(), - S.getAddress() + BlockSize) - << "\n"; + dbgs() << " Adding symbol for c-string block " << B.getRange() + << ": at offset 0\n"; }); } // Process any remaining symbols that point into this block. - auto LastCanonicalAddr = B.getAddress() + BlockEnd; + auto LastCanonicalAddr = B.getAddress() + BlockSize; while (!NSyms.empty() && orc::ExecutorAddr(NSyms.back()->Value) < B.getAddress() + BlockSize) { auto &NSym = *NSyms.back(); @@ -686,8 +694,15 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( LastCanonicalAddr = orc::ExecutorAddr(NSym.Value); } - createStandardGraphSymbol(NSym, B, SymSize, SectionIsText, SymLive, - IsCanonical); + auto &Sym = createStandardGraphSymbol(NSym, B, SymSize, SectionIsText, + SymLive, IsCanonical); + (void)Sym; + LLVM_DEBUG({ + dbgs() << " Adding symbol for c-string block " << B.getRange() + << ": " + << (Sym.hasName() ? 
Sym.getName() : "<anonymous symbol>") + << " at offset " << formatv("{0:x}", Sym.getOffset()) << "\n"; + }); NSyms.pop_back(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 3ca2e40c7263..dd50314d3ed7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -11,15 +11,15 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/JITLink/MachO_arm64.h" +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" +#include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "MachOLinkGraphBuilder.h" -#include "PerGraphGOTAndPLTStubsBuilder.h" #define DEBUG_TYPE "jitlink" using namespace llvm; using namespace llvm::jitlink; -using namespace llvm::jitlink::MachO_arm64_Edges; namespace { class MachOLinkGraphBuilder_arm64 : public MachOLinkGraphBuilder { public: MachOLinkGraphBuilder_arm64(const object::MachOObjectFile &Obj) : MachOLinkGraphBuilder(Obj, Triple("arm64-apple-darwin"), - getMachOARM64RelocationKindName), + aarch64::getEdgeKindName), NumSymbols(Obj.getSymtabLoadCommand().nsyms) {} private: + enum MachOARM64RelocationKind : Edge::Kind { + MachOBranch26 = Edge::FirstRelocation, + MachOPointer32, + MachOPointer64, + MachOPointer64Anon, + MachOPage21, + MachOPageOffset12, + MachOGOTPage21, + MachOGOTPageOffset12, + MachOTLVPage21, + MachOTLVPageOffset12, + MachOPointerToGOT, + MachOPairedAddend, + MachOLDRLiteral19, + MachODelta32, + MachODelta64, + MachONegDelta32, + MachONegDelta64, + }; + static Expected<MachOARM64RelocationKind> getRelocationKind(const MachO::relocation_info &RI) { switch (RI.r_type) { case MachO::ARM64_RELOC_UNSIGNED: if (!RI.r_pcrel) { if (RI.r_length == 3) - return RI.r_extern ? Pointer64 : Pointer64Anon; + return RI.r_extern ? MachOPointer64 : MachOPointer64Anon; else if (RI.r_length == 2) - return Pointer32; + return MachOPointer32; } break; case MachO::ARM64_RELOC_SUBTRACTOR: @@ -48,46 +68,46 @@ private: // They may be turned into NegDelta by parsePairRelocation.
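// A SUBTRACTOR/UNSIGNED pair encodes the difference between two symbols.
// Whether the edge ends up as a Delta or a NegDelta depends on which
// symbol's block contains the fixup; a sketch of the selection that
// parsePairRelocation performs below (FixupInFromBlock is a hypothetical
// name for the &BlockToFix == &FromSymbol->getAddressable() test):
//
//   Edge::Kind pickDeltaKind(bool FixupInFromBlock, unsigned RLength) {
//     if (FixupInFromBlock) // fixup lives in the minuend's block
//       return RLength == 3 ? aarch64::Delta64 : aarch64::Delta32;
//     return RLength == 3 ? aarch64::NegDelta64 : aarch64::NegDelta32;
//   }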
if (!RI.r_pcrel && RI.r_extern) { if (RI.r_length == 2) - return Delta32; + return MachODelta32; else if (RI.r_length == 3) - return Delta64; + return MachODelta64; } break; case MachO::ARM64_RELOC_BRANCH26: if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return Branch26; + return MachOBranch26; break; case MachO::ARM64_RELOC_PAGE21: if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return Page21; + return MachOPage21; break; case MachO::ARM64_RELOC_PAGEOFF12: if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return PageOffset12; + return MachOPageOffset12; break; case MachO::ARM64_RELOC_GOT_LOAD_PAGE21: if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return GOTPage21; + return MachOGOTPage21; break; case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return GOTPageOffset12; + return MachOGOTPageOffset12; break; case MachO::ARM64_RELOC_POINTER_TO_GOT: if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return PointerToGOT; + return MachOPointerToGOT; break; case MachO::ARM64_RELOC_ADDEND: if (!RI.r_pcrel && !RI.r_extern && RI.r_length == 2) - return PairedAddend; + return MachOPairedAddend; break; case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21: if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return TLVPage21; + return MachOTLVPage21; break; case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12: if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2) - return TLVPageOffset12; + return MachOTLVPageOffset12; break; } @@ -101,8 +121,7 @@ private: ", length=" + formatv("{0:d}", RI.r_length)); } - using PairRelocInfo = - std::tuple; + using PairRelocInfo = std::tuple; // Parses paired SUBTRACTOR/UNSIGNED relocations and, on success, // returns the edge kind and addend to be used. @@ -114,8 +133,8 @@ private: object::relocation_iterator &RelEnd) { using namespace support; - assert(((SubtractorKind == Delta32 && SubRI.r_length == 2) || - (SubtractorKind == Delta64 && SubRI.r_length == 3)) && + assert(((SubtractorKind == MachODelta32 && SubRI.r_length == 2) || + (SubtractorKind == MachODelta64 && SubRI.r_length == 3)) && "Subtractor kind should match length"); assert(SubRI.r_extern && "SUBTRACTOR reloc symbol should be extern"); assert(!SubRI.r_pcrel && "SUBTRACTOR reloc should not be PCRel"); @@ -165,17 +184,18 @@ private: FixupValue -= ToSymbol->getAddress().getValue(); } - MachOARM64RelocationKind DeltaKind; + Edge::Kind DeltaKind; Symbol *TargetSymbol; uint64_t Addend; if (&BlockToFix == &FromSymbol->getAddressable()) { TargetSymbol = ToSymbol; - DeltaKind = (SubRI.r_length == 3) ? Delta64 : Delta32; + DeltaKind = (SubRI.r_length == 3) ? aarch64::Delta64 : aarch64::Delta32; Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); // FIXME: handle extern 'from'. } else if (&BlockToFix == &ToSymbol->getAddressable()) { TargetSymbol = &*FromSymbol; - DeltaKind = (SubRI.r_length == 3) ? NegDelta64 : NegDelta32; + DeltaKind = + (SubRI.r_length == 3) ? aarch64::NegDelta64 : aarch64::NegDelta32; Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); } else { // BlockToFix was neither FromSymbol nor ToSymbol. @@ -229,9 +249,9 @@ private: MachO::relocation_info RI = getRelocationInfo(RelItr); // Validate the relocation kind. - auto Kind = getRelocationKind(RI); - if (!Kind) - return Kind.takeError(); + auto MachORelocKind = getRelocationKind(RI); + if (!MachORelocKind) + return MachORelocKind.takeError(); // Find the address of the value to fix up. 
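// Note: r_length encodes the fixup width as log2(bytes), so the
// r_length == 2 / == 3 checks above select 32- and 64-bit fixups. A small
// helper sketch (hypothetical, not part of this patch):
//
//   static unsigned fixupSizeInBytes(const MachO::relocation_info &RI) {
//     return 1u << RI.r_length; // 0, 1, 2, 3 -> 1, 2, 4, 8 bytes
//   }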
orc::ExecutorAddr FixupAddress = @@ -255,6 +275,8 @@ private: return make_error( "Relocation content extends past end of fixup block"); + Edge::Kind Kind = Edge::Invalid; + // Get a pointer to the fixup content. const char *FixupContent = BlockToFix->getContent().data() + (FixupAddress - BlockToFix->getAddress()); @@ -263,7 +285,7 @@ private: Symbol *TargetSymbol = nullptr; uint64_t Addend = 0; - if (*Kind == PairedAddend) { + if (*MachORelocKind == MachOPairedAddend) { // If this is an Addend relocation then process it and move to the // paired reloc. @@ -275,19 +297,21 @@ private: ++RelItr; RI = getRelocationInfo(RelItr); - Kind = getRelocationKind(RI); - if (!Kind) - return Kind.takeError(); + MachORelocKind = getRelocationKind(RI); + if (!MachORelocKind) + return MachORelocKind.takeError(); - if (*Kind != Branch26 && *Kind != Page21 && *Kind != PageOffset12) + if (*MachORelocKind != MachOBranch26 && + *MachORelocKind != MachOPage21 && + *MachORelocKind != MachOPageOffset12) return make_error( "Invalid relocation pair: Addend + " + - StringRef(getMachOARM64RelocationKindName(*Kind))); + StringRef(getMachOARM64RelocationKindName(*MachORelocKind))); LLVM_DEBUG({ dbgs() << " Addend: value = " << formatv("{0:x6}", Addend) - << ", pair is " << getMachOARM64RelocationKindName(*Kind) - << "\n"; + << ", pair is " + << getMachOARM64RelocationKindName(*MachORelocKind) << "\n"; }); // Find the address of the value to fix up. @@ -298,8 +322,8 @@ private: "different target"); } - switch (*Kind) { - case Branch26: { + switch (*MachORelocKind) { + case MachOBranch26: { if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else @@ -308,23 +332,26 @@ private: if ((Instr & 0x7fffffff) != 0x14000000) return make_error("BRANCH26 target is not a B or BL " "instruction with a zero addend"); + Kind = aarch64::Branch26; break; } - case Pointer32: + case MachOPointer32: if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent; + Kind = aarch64::Pointer32; break; - case Pointer64: + case MachOPointer64: if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else return TargetSymbolOrErr.takeError(); Addend = *(const ulittle64_t *)FixupContent; + Kind = aarch64::Pointer64; break; - case Pointer64Anon: { + case MachOPointer64Anon: { orc::ExecutorAddr TargetAddress(*(const ulittle64_t *)FixupContent); auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1); if (!TargetNSec) @@ -335,11 +362,12 @@ private: else return TargetSymbolOrErr.takeError(); Addend = TargetAddress - TargetSymbol->getAddress(); + Kind = aarch64::Pointer64Anon; break; } - case Page21: - case TLVPage21: - case GOTPage21: { + case MachOPage21: + case MachOTLVPage21: + case MachOGOTPage21: { if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else @@ -349,9 +377,17 @@ private: return make_error("PAGE21/GOTPAGE21 target is not an " "ADRP instruction with a zero " "addend"); + + if (*MachORelocKind == MachOPage21) { + Kind = aarch64::Page21; + } else if (*MachORelocKind == MachOTLVPage21) { + Kind = aarch64::TLVPage21; + } else if (*MachORelocKind == MachOGOTPage21) { + Kind = aarch64::GOTPage21; + } break; } - case PageOffset12: { + case MachOPageOffset12: { if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = 
TargetSymbolOrErr->GraphSymbol; else @@ -361,10 +397,11 @@ private: if (EncodedAddend != 0) return make_error("GOTPAGEOFF12 target has non-zero " "encoded addend"); + Kind = aarch64::PageOffset12; break; } - case TLVPageOffset12: - case GOTPageOffset12: { + case MachOTLVPageOffset12: + case MachOGOTPageOffset12: { if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else @@ -374,27 +411,35 @@ private: return make_error("GOTPAGEOFF12 target is not an LDR " "immediate instruction with a zero " "addend"); + + if (*MachORelocKind == MachOTLVPageOffset12) { + Kind = aarch64::TLVPageOffset12; + } else if (*MachORelocKind == MachOGOTPageOffset12) { + Kind = aarch64::GOTPageOffset12; + } break; } - case PointerToGOT: + case MachOPointerToGOT: if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) TargetSymbol = TargetSymbolOrErr->GraphSymbol; else return TargetSymbolOrErr.takeError(); + + Kind = aarch64::PointerToGOT; break; - case Delta32: - case Delta64: { + case MachODelta32: + case MachODelta64: { // We use Delta32/Delta64 to represent SUBTRACTOR relocations. // parsePairRelocation handles the paired reloc, and returns the // edge kind to be used (either Delta32/Delta64, or // NegDelta32/NegDelta64, depending on the direction of the // subtraction) along with the addend. auto PairInfo = - parsePairRelocation(*BlockToFix, *Kind, RI, FixupAddress, - FixupContent, ++RelItr, RelEnd); + parsePairRelocation(*BlockToFix, *MachORelocKind, RI, + FixupAddress, FixupContent, ++RelItr, RelEnd); if (!PairInfo) return PairInfo.takeError(); - std::tie(*Kind, TargetSymbol, Addend) = *PairInfo; + std::tie(Kind, TargetSymbol, Addend) = *PairInfo; assert(TargetSymbol && "No target symbol from parsePairRelocation?"); break; } @@ -405,108 +450,59 @@ private: LLVM_DEBUG({ dbgs() << " "; - Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, + Edge GE(Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, Addend); - printEdge(dbgs(), *BlockToFix, GE, - getMachOARM64RelocationKindName(*Kind)); + printEdge(dbgs(), *BlockToFix, GE, aarch64::getEdgeKindName(Kind)); dbgs() << "\n"; }); - BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(), + BlockToFix->addEdge(Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, Addend); } } return Error::success(); } - unsigned NumSymbols = 0; -}; - -class PerGraphGOTAndPLTStubsBuilder_MachO_arm64 - : public PerGraphGOTAndPLTStubsBuilder< - PerGraphGOTAndPLTStubsBuilder_MachO_arm64> { -public: - using PerGraphGOTAndPLTStubsBuilder< - PerGraphGOTAndPLTStubsBuilder_MachO_arm64>::PerGraphGOTAndPLTStubsBuilder; - - bool isGOTEdgeToFix(Edge &E) const { - return E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12 || - E.getKind() == TLVPage21 || E.getKind() == TLVPageOffset12 || - E.getKind() == PointerToGOT; - } - - Symbol &createGOTEntry(Symbol &Target) { - auto &GOTEntryBlock = G.createContentBlock( - getGOTSection(), getGOTEntryBlockContent(), orc::ExecutorAddr(), 8, 0); - GOTEntryBlock.addEdge(Pointer64, 0, Target, 0); - return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); - } - - void fixGOTEdge(Edge &E, Symbol &GOTEntry) { - if (E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12 || - E.getKind() == TLVPage21 || E.getKind() == TLVPageOffset12) { - // Update the target, but leave the edge addend as-is. 
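// (In the removed code below, PointerToGOT is the one GOT edge that changes
// kind as well as target: it is rewritten as a 32-bit delta from the fixup
// location to the GOT entry, so the value written becomes
// GOTEntry - FixupAddress; the Page21/PageOffset12-style edges only
// retarget and keep their kind.)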
- E.setTarget(GOTEntry); - } else if (E.getKind() == PointerToGOT) { - E.setTarget(GOTEntry); - E.setKind(Delta32); - } else - llvm_unreachable("Not a GOT edge?"); - } - - bool isExternalBranchEdge(Edge &E) { - return E.getKind() == Branch26 && !E.getTarget().isDefined(); - } - - Symbol &createPLTStub(Symbol &Target) { - auto &StubContentBlock = G.createContentBlock( - getStubsSection(), getStubBlockContent(), orc::ExecutorAddr(), 1, 0); - // Re-use GOT entries for stub targets. - auto &GOTEntrySymbol = getGOTEntry(Target); - StubContentBlock.addEdge(LDRLiteral19, 0, GOTEntrySymbol, 0); - return G.addAnonymousSymbol(StubContentBlock, 0, 8, true, false); - } - - void fixPLTEdge(Edge &E, Symbol &Stub) { - assert(E.getKind() == Branch26 && "Not a Branch32 edge?"); - assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?"); - E.setTarget(Stub); - } - -private: - Section &getGOTSection() { - if (!GOTSection) - GOTSection = &G.createSection("$__GOT", MemProt::Read | MemProt::Exec); - return *GOTSection; - } - - Section &getStubsSection() { - if (!StubsSection) - StubsSection = - &G.createSection("$__STUBS", MemProt::Read | MemProt::Exec); - return *StubsSection; - } - - ArrayRef<char> getGOTEntryBlockContent() { - return {reinterpret_cast<const char *>(NullGOTEntryContent), - sizeof(NullGOTEntryContent)}; - } - - ArrayRef<char> getStubBlockContent() { - return {reinterpret_cast<const char *>(StubContent), sizeof(StubContent)}; + /// Return the string name of the given MachO arm64 edge kind. + const char *getMachOARM64RelocationKindName(Edge::Kind R) { + switch (R) { + case MachOBranch26: + return "MachOBranch26"; + case MachOPointer64: + return "MachOPointer64"; + case MachOPointer64Anon: + return "MachOPointer64Anon"; + case MachOPage21: + return "MachOPage21"; + case MachOPageOffset12: + return "MachOPageOffset12"; + case MachOGOTPage21: + return "MachOGOTPage21"; + case MachOGOTPageOffset12: + return "MachOGOTPageOffset12"; + case MachOTLVPage21: + return "MachOTLVPage21"; + case MachOTLVPageOffset12: + return "MachOTLVPageOffset12"; + case MachOPointerToGOT: + return "MachOPointerToGOT"; + case MachOPairedAddend: + return "MachOPairedAddend"; + case MachOLDRLiteral19: + return "MachOLDRLiteral19"; + case MachODelta32: + return "MachODelta32"; + case MachODelta64: + return "MachODelta64"; + case MachONegDelta32: + return "MachONegDelta32"; + case MachONegDelta64: + return "MachONegDelta64"; + default: + return getGenericEdgeKindName(static_cast<Edge::Kind>(R)); + } } - static const uint8_t NullGOTEntryContent[8]; - static const uint8_t StubContent[8]; - Section *GOTSection = nullptr; - Section *StubsSection = nullptr; -}; - -const uint8_t - PerGraphGOTAndPLTStubsBuilder_MachO_arm64::NullGOTEntryContent[8] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; -const uint8_t PerGraphGOTAndPLTStubsBuilder_MachO_arm64::StubContent[8] = { - 0x10, 0x00, 0x00, 0x58, // LDR x16, <literal> - 0x00, 0x02, 0x1f, 0xd6 // BR x16 + unsigned NumSymbols = 0; }; } // namespace @@ -514,6 +510,15 @@ const uint8_t PerGraphGOTAndPLTStubsBuilder_MachO_arm64::StubContent[8] = { namespace llvm { namespace jitlink { +Error buildTables_MachO_arm64(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + + aarch64::GOTTableManager GOT; + aarch64::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} + class MachOJITLinker_arm64 : public JITLinker<MachOJITLinker_arm64> { friend class JITLinker<MachOJITLinker_arm64>; @@ -524,162 +529,8 @@ public: : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {} private: - - static unsigned
getPageOffset12Shift(uint32_t Instr) { - constexpr uint32_t LoadStoreImm12Mask = 0x3b000000; - constexpr uint32_t Vec128Mask = 0x04800000; - - if ((Instr & LoadStoreImm12Mask) == 0x39000000) { - uint32_t ImplicitShift = Instr >> 30; - if (ImplicitShift == 0) - if ((Instr & Vec128Mask) == Vec128Mask) - ImplicitShift = 4; - - return ImplicitShift; - } - - return 0; - } - Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { - using namespace support; - - char *BlockWorkingMem = B.getAlreadyMutableContent().data(); - char *FixupPtr = BlockWorkingMem + E.getOffset(); - orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); - - switch (E.getKind()) { - case Branch26: { - assert((FixupAddress.getValue() & 0x3) == 0 && - "Branch-inst is not 32-bit aligned"); - - int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); - - if (static_cast(Value) & 0x3) - return make_error("Branch26 target is not 32-bit " - "aligned"); - - if (Value < -(1 << 27) || Value > ((1 << 27) - 1)) - return makeTargetOutOfRangeError(G, B, E); - - uint32_t RawInstr = *(little32_t *)FixupPtr; - assert((RawInstr & 0x7fffffff) == 0x14000000 && - "RawInstr isn't a B or BR immediate instruction"); - uint32_t Imm = (static_cast(Value) & ((1 << 28) - 1)) >> 2; - uint32_t FixedInstr = RawInstr | Imm; - *(little32_t *)FixupPtr = FixedInstr; - break; - } - case Pointer32: { - uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); - if (Value > std::numeric_limits::max()) - return makeTargetOutOfRangeError(G, B, E); - *(ulittle32_t *)FixupPtr = Value; - break; - } - case Pointer64: - case Pointer64Anon: { - uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); - *(ulittle64_t *)FixupPtr = Value; - break; - } - case Page21: - case TLVPage21: - case GOTPage21: { - assert((E.getKind() != GOTPage21 || E.getAddend() == 0) && - "GOTPAGE21 with non-zero addend"); - uint64_t TargetPage = - (E.getTarget().getAddress().getValue() + E.getAddend()) & - ~static_cast(4096 - 1); - uint64_t PCPage = - FixupAddress.getValue() & ~static_cast(4096 - 1); - - int64_t PageDelta = TargetPage - PCPage; - if (PageDelta < -(1 << 30) || PageDelta > ((1 << 30) - 1)) - return makeTargetOutOfRangeError(G, B, E); - - uint32_t RawInstr = *(ulittle32_t *)FixupPtr; - assert((RawInstr & 0xffffffe0) == 0x90000000 && - "RawInstr isn't an ADRP instruction"); - uint32_t ImmLo = (static_cast(PageDelta) >> 12) & 0x3; - uint32_t ImmHi = (static_cast(PageDelta) >> 14) & 0x7ffff; - uint32_t FixedInstr = RawInstr | (ImmLo << 29) | (ImmHi << 5); - *(ulittle32_t *)FixupPtr = FixedInstr; - break; - } - case PageOffset12: { - uint64_t TargetOffset = - (E.getTarget().getAddress() + E.getAddend()).getValue() & 0xfff; - - uint32_t RawInstr = *(ulittle32_t *)FixupPtr; - unsigned ImmShift = getPageOffset12Shift(RawInstr); - - if (TargetOffset & ((1 << ImmShift) - 1)) - return make_error("PAGEOFF12 target is not aligned"); - - uint32_t EncodedImm = (TargetOffset >> ImmShift) << 10; - uint32_t FixedInstr = RawInstr | EncodedImm; - *(ulittle32_t *)FixupPtr = FixedInstr; - break; - } - case TLVPageOffset12: - case GOTPageOffset12: { - assert(E.getAddend() == 0 && "GOTPAGEOF12 with non-zero addend"); - - uint32_t RawInstr = *(ulittle32_t *)FixupPtr; - assert((RawInstr & 0xfffffc00) == 0xf9400000 && - "RawInstr isn't a 64-bit LDR immediate"); - - uint32_t TargetOffset = E.getTarget().getAddress().getValue() & 0xfff; - assert((TargetOffset & 0x7) == 0 && "GOT entry is not 8-byte aligned"); - uint32_t EncodedImm = 
(TargetOffset >> 3) << 10; - uint32_t FixedInstr = RawInstr | EncodedImm; - *(ulittle32_t *)FixupPtr = FixedInstr; - break; - } - case LDRLiteral19: { - assert((FixupAddress.getValue() & 0x3) == 0 && - "LDR is not 32-bit aligned"); - assert(E.getAddend() == 0 && "LDRLiteral19 with non-zero addend"); - uint32_t RawInstr = *(ulittle32_t *)FixupPtr; - assert(RawInstr == 0x58000010 && "RawInstr isn't a 64-bit LDR literal"); - int64_t Delta = E.getTarget().getAddress() - FixupAddress; - if (Delta & 0x3) - return make_error("LDR literal target is not 32-bit " - "aligned"); - if (Delta < -(1 << 20) || Delta > ((1 << 20) - 1)) - return makeTargetOutOfRangeError(G, B, E); - - uint32_t EncodedImm = - ((static_cast(Delta) >> 2) & 0x7ffff) << 5; - uint32_t FixedInstr = RawInstr | EncodedImm; - *(ulittle32_t *)FixupPtr = FixedInstr; - break; - } - case Delta32: - case Delta64: - case NegDelta32: - case NegDelta64: { - int64_t Value; - if (E.getKind() == Delta32 || E.getKind() == Delta64) - Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); - else - Value = FixupAddress - E.getTarget().getAddress() + E.getAddend(); - - if (E.getKind() == Delta32 || E.getKind() == NegDelta32) { - if (Value < std::numeric_limits::min() || - Value > std::numeric_limits::max()) - return makeTargetOutOfRangeError(G, B, E); - *(little32_t *)FixupPtr = Value; - } else - *(little64_t *)FixupPtr = Value; - break; - } - default: - llvm_unreachable("Unrecognized edge kind"); - } - - return Error::success(); + return aarch64::applyFixup(G, B, E); } uint64_t NullValue = 0; @@ -712,13 +563,14 @@ void link_MachO_arm64(std::unique_ptr G, // Add eh-frame passses. // FIXME: Prune eh-frames for which compact-unwind is available once // we support compact-unwind registration with libunwind. - Config.PrePrunePasses.push_back(EHFrameSplitter("__TEXT,__eh_frame")); Config.PrePrunePasses.push_back( - EHFrameEdgeFixer("__TEXT,__eh_frame", 8, Delta64, Delta32, NegDelta32)); + DWARFRecordSectionSplitter("__TEXT,__eh_frame")); + Config.PrePrunePasses.push_back(EHFrameEdgeFixer( + "__TEXT,__eh_frame", 8, aarch64::Pointer32, aarch64::Pointer64, + aarch64::Delta32, aarch64::Delta64, aarch64::NegDelta32)); // Add an in-place GOT/Stubs pass. 
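// (Building GOT entries and stubs post-prune means table entries are only
// created for edges that survive dead-stripping; a pre-prune pass could
// materialize entries for blocks that are about to be removed.)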
- Config.PostPrunePasses.push_back( - PerGraphGOTAndPLTStubsBuilder_MachO_arm64::asPass); + Config.PostPrunePasses.push_back(buildTables_MachO_arm64); } if (auto Err = Ctx->modifyPassConfig(*G, Config)) @@ -728,44 +580,5 @@ void link_MachO_arm64(std::unique_ptr G, MachOJITLinker_arm64::link(std::move(Ctx), std::move(G), std::move(Config)); } -const char *getMachOARM64RelocationKindName(Edge::Kind R) { - switch (R) { - case Branch26: - return "Branch26"; - case Pointer64: - return "Pointer64"; - case Pointer64Anon: - return "Pointer64Anon"; - case Page21: - return "Page21"; - case PageOffset12: - return "PageOffset12"; - case GOTPage21: - return "GOTPage21"; - case GOTPageOffset12: - return "GOTPageOffset12"; - case TLVPage21: - return "TLVPage21"; - case TLVPageOffset12: - return "TLVPageOffset12"; - case PointerToGOT: - return "PointerToGOT"; - case PairedAddend: - return "PairedAddend"; - case LDRLiteral19: - return "LDRLiteral19"; - case Delta32: - return "Delta32"; - case Delta64: - return "Delta64"; - case NegDelta32: - return "NegDelta32"; - case NegDelta64: - return "NegDelta64"; - default: - return getGenericEdgeKindName(static_cast(R)); - } -} - } // end namespace jitlink } // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 82afaa3aa3c5..6dfd5548fcfd 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -11,10 +11,10 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h" +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "MachOLinkGraphBuilder.h" -#include "PerGraphGOTAndPLTStubsBuilder.h" #define DEBUG_TYPE "jitlink" @@ -504,12 +504,13 @@ void link_MachO_x86_64(std::unique_ptr G, } LinkGraphPassFunction createEHFrameSplitterPass_MachO_x86_64() { - return EHFrameSplitter("__TEXT,__eh_frame"); + return DWARFRecordSectionSplitter("__TEXT,__eh_frame"); } LinkGraphPassFunction createEHFrameEdgeFixerPass_MachO_x86_64() { return EHFrameEdgeFixer("__TEXT,__eh_frame", x86_64::PointerSize, - x86_64::Delta64, x86_64::Delta32, x86_64::NegDelta32); + x86_64::Pointer32, x86_64::Pointer64, x86_64::Delta32, + x86_64::Delta64, x86_64::NegDelta32); } } // end namespace jitlink diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp index 6dccc4811885..28a6f9ce90d9 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp @@ -18,13 +18,55 @@ namespace llvm { namespace jitlink { namespace aarch64 { -const char *getEdgeKindName(Edge::Kind K) { - switch (K) { - case R_AARCH64_CALL26: - return "R_AARCH64_CALL26"; +const uint8_t NullGOTEntryContent[8] = {0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + +const uint8_t StubContent[8] = { + 0x10, 0x00, 0x00, 0x58, // LDR x16, + 0x00, 0x02, 0x1f, 0xd6 // BR x16 +}; + +const char *getEdgeKindName(Edge::Kind R) { + switch (R) { + case Branch26: + return "Branch26"; + case Pointer64: + return "Pointer64"; + case Pointer64Anon: + return "Pointer64Anon"; + case Page21: + return "Page21"; + case PageOffset12: + return "PageOffset12"; + case MoveWide16: + return "MoveWide16"; + case GOTPage21: + return "GOTPage21"; + case GOTPageOffset12: + return "GOTPageOffset12"; + case TLVPage21: + return "TLVPage21"; + case TLVPageOffset12: + return 
"TLVPageOffset12"; + case PointerToGOT: + return "PointerToGOT"; + case PairedAddend: + return "PairedAddend"; + case LDRLiteral19: + return "LDRLiteral19"; + case Delta32: + return "Delta32"; + case Delta64: + return "Delta64"; + case NegDelta32: + return "NegDelta32"; + case NegDelta64: + return "NegDelta64"; + default: + return getGenericEdgeKindName(static_cast(R)); } - return getGenericEdgeKindName(K); } + } // namespace aarch64 } // namespace jitlink } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index 3ce2cf10a24c..3848cc6b5f01 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -26,6 +26,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_64"; case R_RISCV_BRANCH: return "R_RISCV_BRANCH"; + case R_RISCV_JAL: + return "R_RISCV_JAL"; case R_RISCV_HI20: return "R_RISCV_HI20"; case R_RISCV_LO12_I: @@ -56,6 +58,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_SUB16"; case R_RISCV_SUB8: return "R_RISCV_SUB8"; + case R_RISCV_SUB6: + return "R_RISCV_SUB6"; case R_RISCV_SET6: return "R_RISCV_SET6"; case R_RISCV_SET8: diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index ed912280ac82..4ac901daa5c8 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCContext.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DynamicLibrary.h" diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h index a5dd420c9132..f6c4cdbb8c91 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -72,8 +72,7 @@ class MCJIT : public ExecutionEngine { class OwningModuleContainer { public: - OwningModuleContainer() { - } + OwningModuleContainer() = default; ~OwningModuleContainer() { freeModulePtrSet(AddedModules); freeModulePtrSet(LoadedModules); diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp index f34247005258..fad7428e1f90 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp @@ -12,6 +12,7 @@ #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCContext.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index e5cb8103919a..dd80630a33c1 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -62,7 +62,7 @@ void ResourceTracker::makeDefunct() { JDAndFlag.store(Val); } -ResourceManager::~ResourceManager() {} +ResourceManager::~ResourceManager() = default; ResourceTrackerDefunct::ResourceTrackerDefunct(ResourceTrackerSP RT) : RT(std::move(RT)) {} @@ -76,9 +76,21 @@ void ResourceTrackerDefunct::log(raw_ostream &OS) const { } FailedToMaterialize::FailedToMaterialize( + std::shared_ptr SSP, std::shared_ptr Symbols) - : Symbols(std::move(Symbols)) { + : SSP(std::move(SSP)), Symbols(std::move(Symbols)) { + assert(this->SSP && "String pool cannot be null"); assert(!this->Symbols->empty() && "Can not fail to resolve an empty set"); + + // 
FIXME: Use a new dep-map type for FailedToMaterialize errors so that we + // don't have to manually retain/release. + for (auto &KV : *this->Symbols) + KV.first->Retain(); +} + +FailedToMaterialize::~FailedToMaterialize() { + for (auto &KV : *Symbols) + KV.first->Release(); } std::error_code FailedToMaterialize::convertToErrorCode() const { @@ -251,9 +263,21 @@ StringRef AbsoluteSymbolsMaterializationUnit::getName() const { void AbsoluteSymbolsMaterializationUnit::materialize( std::unique_ptr<MaterializationResponsibility> R) { - // No dependencies, so these calls can't fail. - cantFail(R->notifyResolved(Symbols)); - cantFail(R->notifyEmitted()); + // Even though these are just absolute symbols we need to check for failure + // to resolve/emit: the tracker for these symbols may have been removed while + // the materialization was in flight (e.g. due to a failure in some action + // triggered by the queries attached to the resolution/emission of these + // symbols). + if (auto Err = R->notifyResolved(Symbols)) { + R->getExecutionSession().reportError(std::move(Err)); + R->failMaterialization(); + return; + } + if (auto Err = R->notifyEmitted()) { + R->getExecutionSession().reportError(std::move(Err)); + R->failMaterialization(); + return; + } } void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, @@ -485,13 +509,16 @@ Expected<SymbolAliasMap> buildSimpleReexportsAliasMap(JITDylib &SourceJD, class InProgressLookupState { public: + // FIXME: Reduce the number of SymbolStringPtrs here. See + // https://github.com/llvm/llvm-project/issues/55576. + InProgressLookupState(LookupKind K, JITDylibSearchOrder SearchOrder, SymbolLookupSet LookupSet, SymbolState RequiredState) : K(K), SearchOrder(std::move(SearchOrder)), LookupSet(std::move(LookupSet)), RequiredState(RequiredState) { DefGeneratorCandidates = this->LookupSet; } - virtual ~InProgressLookupState() {} + virtual ~InProgressLookupState() = default; virtual void complete(std::unique_ptr<InProgressLookupState> IPLS) = 0; virtual void fail(Error Err) = 0; @@ -609,7 +636,7 @@ void LookupState::continueLookup(Error Err) { ES.OL_applyQueryPhase1(std::move(IPLS), std::move(Err)); } -DefinitionGenerator::~DefinitionGenerator() {} +DefinitionGenerator::~DefinitionGenerator() = default; JITDylib::~JITDylib() { LLVM_DEBUG(dbgs() << "Destroying JITDylib " << getName() << "\n"); @@ -959,6 +986,7 @@ Error JITDylib::resolve(MaterializationResponsibility &MR, auto FailedSymbolsDepMap = std::make_shared<SymbolDependenceMap>(); (*FailedSymbolsDepMap)[this] = std::move(SymbolsInErrorState); return make_error<FailedToMaterialize>( + getExecutionSession().getSymbolStringPool(), std::move(FailedSymbolsDepMap)); } @@ -1036,6 +1064,7 @@ Error JITDylib::emit(MaterializationResponsibility &MR, auto FailedSymbolsDepMap = std::make_shared<SymbolDependenceMap>(); (*FailedSymbolsDepMap)[this] = std::move(SymbolsInErrorState); return make_error<FailedToMaterialize>( + getExecutionSession().getSymbolStringPool(), std::move(FailedSymbolsDepMap)); } @@ -1411,12 +1440,11 @@ void JITDylib::dump(raw_ostream &OS) { for (auto &KV : Symbols) { OS << " \"" << *KV.first << "\": "; if (auto Addr = KV.second.getAddress()) - OS << format("0x%016" PRIx64, Addr) << ", " << KV.second.getFlags() - << " "; + OS << format("0x%016" PRIx64, Addr); else OS << "<not resolved> "; - OS << KV.second.getFlags() << " " << KV.second.getState(); + OS << " " << KV.second.getFlags() << " " << KV.second.getState(); if (KV.second.hasMaterializerAttached()) { OS << " (Materializer "; @@ -1751,7 +1779,7 @@ void JITDylib::transferEmittedNodeDependencies( } } -Platform::~Platform() {} +Platform::~Platform() = default; Expected<DenseMap<JITDylib *, SymbolMap>> Platform::lookupInitSymbols(
ExecutionSession &ES, @@ -1858,6 +1886,12 @@ ExecutionSession::ExecutionSession(std::unique_ptr EPC) this->EPC->ES = this; } +ExecutionSession::~ExecutionSession() { + // You must call endSession prior to destroying the session. + assert(!SessionOpen && + "Session still open. Did you forget to call endSession?"); +} + Error ExecutionSession::endSession() { LLVM_DEBUG(dbgs() << "Ending ExecutionSession " << this << "\n"); @@ -1869,7 +1903,7 @@ Error ExecutionSession::endSession() { // TODO: notifiy platform? run static deinits? Error Err = Error::success(); - for (auto &JD : JITDylibsToClose) + for (auto &JD : reverse(JITDylibsToClose)) Err = joinErrors(std::move(Err), JD->clear()); Err = joinErrors(std::move(Err), EPC->disconnect()); @@ -1987,9 +2021,8 @@ JITDylib::getDFSLinkOrder(ArrayRef JDs) { for (auto &KV : llvm::reverse(Result.back()->LinkOrder)) { auto &JD = *KV.first; - if (Visited.count(&JD)) + if (!Visited.insert(&JD).second) continue; - Visited.insert(&JD); WorkStack.push_back(&JD); } } @@ -2071,7 +2104,7 @@ void ExecutionSession::lookup( Expected ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, - const SymbolLookupSet &Symbols, LookupKind K, + SymbolLookupSet Symbols, LookupKind K, SymbolState RequiredState, RegisterDependenciesFunction RegisterDependencies) { #if LLVM_ENABLE_THREADS @@ -2103,7 +2136,7 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, #endif // Perform the asynchronous lookup. - lookup(K, SearchOrder, Symbols, RequiredState, NotifyComplete, + lookup(K, SearchOrder, std::move(Symbols), RequiredState, NotifyComplete, RegisterDependencies); #if LLVM_ENABLE_THREADS @@ -2257,7 +2290,8 @@ Error ExecutionSession::removeResourceTracker(ResourceTracker &RT) { joinErrors(std::move(Err), L->handleRemoveResources(RT.getKeyUnsafe())); for (auto &Q : QueriesToFail) - Q->handleFailed(make_error(FailedSymbols)); + Q->handleFailed( + make_error(getSymbolStringPool(), FailedSymbols)); return Err; } @@ -2337,7 +2371,8 @@ Error ExecutionSession::IL_updateCandidatesFor( if (SymI->second.getFlags().hasError()) { auto FailedSymbolsMap = std::make_shared(); (*FailedSymbolsMap)[&JD] = {Name}; - return make_error(std::move(FailedSymbolsMap)); + return make_error(getSymbolStringPool(), + std::move(FailedSymbolsMap)); } // Otherwise this is a match. Remove it from the candidate set. @@ -2611,7 +2646,7 @@ void ExecutionSession::OL_completeLookup( auto FailedSymbolsMap = std::make_shared(); (*FailedSymbolsMap)[&JD] = {Name}; return make_error( - std::move(FailedSymbolsMap)); + getSymbolStringPool(), std::move(FailedSymbolsMap)); } // Otherwise this is a match. 
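// (The SymbolStringPool argument threaded through the
// make_error<FailedToMaterialize> call sites in this file keeps the pool
// alive for the lifetime of the error: the new constructor retains the
// failed symbol names and the destructor releases them, so the names stay
// valid even if the ExecutionSession is torn down first.)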
@@ -2947,7 +2982,8 @@ void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { }); for (auto &Q : FailedQueries) - Q->handleFailed(make_error(FailedSymbols)); + Q->handleFailed( + make_error(getSymbolStringPool(), FailedSymbols)); } Error ExecutionSession::OL_replace(MaterializationResponsibility &MR, diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp index 4ff6b7fd54df..1e68ea1225e6 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp @@ -42,7 +42,7 @@ class DebugObjectSection { public: virtual void setTargetMemoryRange(SectionRange Range) = 0; virtual void dump(raw_ostream &OS, StringRef Name) {} - virtual ~DebugObjectSection() {} + virtual ~DebugObjectSection() = default; }; template diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp index 5b386a458f1f..028bd245fb55 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp @@ -297,6 +297,13 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { llvm_unreachable("Invalid state"); } +raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPool &SSP) { + std::lock_guard Lock(SSP.PoolMutex); + for (auto &KV : SSP.Pool) + OS << KV.first() << ": " << KV.second << "\n"; + return OS; +} + DumpObjects::DumpObjects(std::string DumpDir, std::string IdentifierOverride) : DumpDir(std::move(DumpDir)), IdentifierOverride(std::move(IdentifierOverride)) { diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp index 6916ee4a827f..3c44fe81b4a9 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp @@ -48,7 +48,7 @@ public: MachODebugObjectSynthesizerBase(LinkGraph &G, ExecutorAddr RegisterActionAddr) : G(G), RegisterActionAddr(RegisterActionAddr) {} - virtual ~MachODebugObjectSynthesizerBase() {} + virtual ~MachODebugObjectSynthesizerBase() = default; Error preserveDebugSections() { if (G.findSectionByName(SynthDebugSectionName)) { @@ -349,10 +349,11 @@ public: } SectionRange R(MachOContainerBlock->getSection()); - G.allocActions().push_back({cantFail(shared::WrapperFunctionCall::Create< - SPSArgList>( - RegisterActionAddr, R.getRange())), - {}}); + G.allocActions().push_back( + {cantFail(shared::WrapperFunctionCall::Create< + shared::SPSArgList>( + RegisterActionAddr, R.getRange())), + {}}); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index d02760703f06..e476c549412a 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -10,6 +10,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" +#include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" @@ -47,6 +48,11 @@ public: Endianness = support::endianness::little; EdgeKind = jitlink::x86_64::Pointer64; break; + case Triple::aarch64: + PointerSize = 8; + Endianness = support::endianness::little; + EdgeKind = jitlink::aarch64::Pointer64; + break; default: llvm_unreachable("Unrecognized architecture"); } @@ -95,8 +101,6 @@ StringRef 
InitArrayFuncSectionName = ".init_array"; StringRef ThreadBSSSectionName = ".tbss"; StringRef ThreadDataSectionName = ".tdata"; -StringRef InitSectionNames[] = {InitArrayFuncSectionName}; - } // end anonymous namespace namespace llvm { @@ -117,8 +121,12 @@ ELFNixPlatform::Create(ExecutionSession &ES, inconvertibleErrorCode()); // Create default aliases if the caller didn't supply any. - if (!RuntimeAliases) - RuntimeAliases = standardPlatformAliases(ES); + if (!RuntimeAliases) { + auto StandardRuntimeAliases = standardPlatformAliases(ES, PlatformJD); + if (!StandardRuntimeAliases) + return StandardRuntimeAliases.takeError(); + RuntimeAliases = std::move(*StandardRuntimeAliases); + } // Define the aliases. if (auto Err = PlatformJD.define(symbolAliases(std::move(*RuntimeAliases)))) @@ -189,10 +197,53 @@ static void addAliases(ExecutionSession &ES, SymbolAliasMap &Aliases, } } -SymbolAliasMap ELFNixPlatform::standardPlatformAliases(ExecutionSession &ES) { +Expected +ELFNixPlatform::standardPlatformAliases(ExecutionSession &ES, + JITDylib &PlatformJD) { SymbolAliasMap Aliases; addAliases(ES, Aliases, requiredCXXAliases()); addAliases(ES, Aliases, standardRuntimeUtilityAliases()); + + // Determine whether or not the libunwind extended-API function for + // dynamically registering an entire .eh_frame section is available. + // If it is not, we assume that libgcc_s is being used, and alias to + // its __register_frame with the same functionality. + auto RTRegisterFrame = ES.intern("__orc_rt_register_eh_frame_section"); + auto LibUnwindRegisterFrame = ES.intern("__unw_add_dynamic_eh_frame_section"); + auto RTDeregisterFrame = ES.intern("__orc_rt_deregister_eh_frame_section"); + auto LibUnwindDeregisterFrame = + ES.intern("__unw_remove_dynamic_eh_frame_section"); + auto SM = ES.lookup(makeJITDylibSearchOrder(&PlatformJD), + SymbolLookupSet() + .add(LibUnwindRegisterFrame, + SymbolLookupFlags::WeaklyReferencedSymbol) + .add(LibUnwindDeregisterFrame, + SymbolLookupFlags::WeaklyReferencedSymbol)); + if (!SM) { // Weak-ref means no "missing symbol" errors, so this must be + // something more serious that we should report. 
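// (Looking the __unw_* symbols up with WeaklyReferencedSymbol means a
// missing pair produces an empty result instead of a missing-symbol error,
// so the SM->size() == 2 check below cleanly distinguishes "LLVM libunwind
// present" from "fall back to libgcc's __register_frame".)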
+ return SM.takeError(); + } else if (SM->size() == 2) { + LLVM_DEBUG({ + dbgs() << "Using libunwind " << LibUnwindRegisterFrame + << " for unwind info registration\n"; + }); + Aliases[std::move(RTRegisterFrame)] = {LibUnwindRegisterFrame, + JITSymbolFlags::Exported}; + Aliases[std::move(RTDeregisterFrame)] = {LibUnwindDeregisterFrame, + JITSymbolFlags::Exported}; + } else { + // Since LLVM libunwind is not present, we assume that unwinding + // is provided by libgcc + LLVM_DEBUG({ + dbgs() << "Using libgcc __register_frame" + << " for unwind info registration\n"; + }); + Aliases[std::move(RTRegisterFrame)] = {ES.intern("__register_frame"), + JITSymbolFlags::Exported}; + Aliases[std::move(RTDeregisterFrame)] = {ES.intern("__deregister_frame"), + JITSymbolFlags::Exported}; + } + return Aliases; } @@ -210,6 +261,10 @@ ELFNixPlatform::standardRuntimeUtilityAliases() { static const std::pair StandardRuntimeUtilityAliases[] = { {"__orc_rt_run_program", "__orc_rt_elfnix_run_program"}, + {"__orc_rt_jit_dlerror", "__orc_rt_elfnix_jit_dlerror"}, + {"__orc_rt_jit_dlopen", "__orc_rt_elfnix_jit_dlopen"}, + {"__orc_rt_jit_dlclose", "__orc_rt_elfnix_jit_dlclose"}, + {"__orc_rt_jit_dlsym", "__orc_rt_elfnix_jit_dlsym"}, {"__orc_rt_log_error", "__orc_rt_log_error_to_stderr"}}; return ArrayRef>( @@ -217,16 +272,16 @@ ELFNixPlatform::standardRuntimeUtilityAliases() { } bool ELFNixPlatform::isInitializerSection(StringRef SecName) { - for (auto &Name : InitSectionNames) { - if (Name.equals(SecName)) - return true; - } + if (SecName.consume_front(InitArrayFuncSectionName) && + (SecName.empty() || SecName[0] == '.')) + return true; return false; } bool ELFNixPlatform::supportedTarget(const Triple &TT) { switch (TT.getArch()) { case Triple::x86_64: + case Triple::aarch64: return true; default: return false; @@ -723,16 +778,15 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::preserveInitSections( jitlink::LinkGraph &G, MaterializationResponsibility &MR) { JITLinkSymbolSet InitSectionSymbols; - for (auto &InitSectionName : InitSectionNames) { + for (auto &InitSection : G.sections()) { // Skip non-init sections. - auto *InitSection = G.findSectionByName(InitSectionName); - if (!InitSection) + if (!isInitializerSection(InitSection.getName())) continue; // Make a pass over live symbols in the section: those blocks are already // preserved. DenseSet AlreadyLiveBlocks; - for (auto &Sym : InitSection->symbols()) { + for (auto &Sym : InitSection.symbols()) { auto &B = Sym->getBlock(); if (Sym->isLive() && Sym->getOffset() == 0 && Sym->getSize() == B.getSize() && !AlreadyLiveBlocks.count(&B)) { @@ -742,7 +796,7 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::preserveInitSections( } // Add anonymous symbols to preserve any not-already-preserved blocks. 
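// The anonymous symbol added below is created live purely to anchor blocks
// that have no live symbol of their own, so the pruner keeps them. Call
// shape (arguments as used in this function):
//
//   G.addAnonymousSymbol(*B, /*Offset=*/0, B->getSize(),
//                        /*IsCallable=*/false, /*IsLive=*/true);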
- for (auto *B : InitSection->blocks()) + for (auto *B : InitSection.blocks()) if (!AlreadyLiveBlocks.count(B)) InitSectionSymbols.insert( &G.addAnonymousSymbol(*B, 0, B->getSize(), false, true)); @@ -763,9 +817,9 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::registerInitSections( LLVM_DEBUG({ dbgs() << "ELFNixPlatform::registerInitSections\n"; }); - for (auto InitSectionName : InitSectionNames) { - if (auto *Sec = G.findSectionByName(InitSectionName)) { - InitSections.push_back(Sec); + for (auto &Sec : G.sections()) { + if (isInitializerSection(Sec.getName())) { + InitSections.push_back(&Sec); } } diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp index f3fe0555fa75..c591acdd646b 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp @@ -45,7 +45,8 @@ createJITLoaderGDBRegistrar(ExecutionSession &ES) { Error EPCDebugObjectRegistrar::registerDebugObject( ExecutorAddrRange TargetMem) { - return ES.callSPSWrapper(RegisterFn, TargetMem); + return ES.callSPSWrapper(RegisterFn, + TargetMem); } } // namespace orc diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp index b901a2d2da23..48aaab96e71f 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp @@ -88,7 +88,6 @@ EPCTrampolinePool::EPCTrampolinePool(EPCIndirectionUtils &EPCIU) } Error EPCTrampolinePool::deallocatePool() { - Error Err = Error::success(); std::promise DeallocResultP; auto DeallocResultF = DeallocResultP.get_future(); @@ -234,7 +233,7 @@ Error EPCIndirectStubsManager::updatePointer(StringRef Name, namespace llvm { namespace orc { -EPCIndirectionUtils::ABISupport::~ABISupport() {} +EPCIndirectionUtils::ABISupport::~ABISupport() = default; Expected> EPCIndirectionUtils::Create(ExecutorProcessControl &EPC) { @@ -261,6 +260,9 @@ EPCIndirectionUtils::Create(ExecutorProcessControl &EPC) { case Triple::mips64el: return CreateWithABI(EPC); + case Triple::riscv64: + return CreateWithABI(EPC); + case Triple::x86_64: if (TT.getOS() == Triple::OSType::Win32) return CreateWithABI(EPC); @@ -302,7 +304,8 @@ EPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr, return Alloc.takeError(); auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec); - ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr.getValue(), + ResolverBlockAddr = SegInfo.Addr.getValue(); + ABI->writeResolverCode(SegInfo.WorkingMem.data(), ResolverBlockAddr, ReentryFnAddr, ReentryCtxAddr); auto FA = Alloc->finalize(); @@ -310,7 +313,7 @@ EPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr, return FA.takeError(); ResolverBlock = std::move(*FA); - return SegInfo.Addr.getValue(); + return ResolverBlockAddr; } std::unique_ptr diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index ae2d47fb8c5e..95cf89ec3f8b 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -62,7 +62,7 @@ CtorDtorIterator::Element CtorDtorIterator::operator*() const { break; } else if (ConstantExpr *CE = dyn_cast_or_null(FuncC)) { if (CE->isCast()) - FuncC = dyn_cast_or_null(CE->getOperand(0)); + FuncC = CE->getOperand(0); else break; } else { @@ -273,10 +273,10 @@ Expected> StaticLibraryDefinitionGenerator::Load( ObjectLayer &L, const char 
*FileName, GetObjectFileInterface GetObjFileInterface) { - auto ArchiveBuffer = errorOrToExpected(MemoryBuffer::getFile(FileName)); + auto ArchiveBuffer = MemoryBuffer::getFile(FileName); if (!ArchiveBuffer) - return ArchiveBuffer.takeError(); + return createFileError(FileName, ArchiveBuffer.getError()); return Create(L, std::move(*ArchiveBuffer), std::move(GetObjFileInterface)); } @@ -288,7 +288,7 @@ StaticLibraryDefinitionGenerator::Load( auto B = object::createBinary(FileName); if (!B) - return B.takeError(); + return createFileError(FileName, B.takeError()); // If this is a regular archive then create an instance from it. if (isa(B->getBinary())) diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp index 2eb835551adb..412b9f95ea62 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp @@ -19,9 +19,9 @@ namespace llvm { namespace orc { -ExecutorProcessControl::MemoryAccess::~MemoryAccess() {} +ExecutorProcessControl::MemoryAccess::~MemoryAccess() = default; -ExecutorProcessControl::~ExecutorProcessControl() {} +ExecutorProcessControl::~ExecutorProcessControl() = default; SelfExecutorProcessControl::SelfExecutorProcessControl( std::shared_ptr SSP, std::unique_ptr D, diff --git a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp index aadc437c80c4..69aba1fff59a 100644 --- a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp @@ -11,7 +11,7 @@ namespace llvm { namespace orc { -IRCompileLayer::IRCompiler::~IRCompiler() {} +IRCompileLayer::IRCompiler::~IRCompiler() = default; IRCompileLayer::IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, std::unique_ptr Compile) diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index 7a71d2f781d7..38cab526704f 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -59,7 +59,7 @@ private: namespace llvm { namespace orc { -TrampolinePool::~TrampolinePool() {} +TrampolinePool::~TrampolinePool() = default; void IndirectStubsManager::anchor() {} Expected @@ -152,6 +152,11 @@ createLocalCompileCallbackManager(const Triple &T, ExecutionSession &ES, return CCMgrT::Create(ES, ErrorHandlerAddress); } + case Triple::riscv64: { + typedef orc::LocalJITCompileCallbackManager CCMgrT; + return CCMgrT::Create(ES, ErrorHandlerAddress); + } + case Triple::x86_64: { if (T.getOS() == Triple::OSType::Win32) { typedef orc::LocalJITCompileCallbackManager CCMgrT; @@ -206,6 +211,12 @@ createLocalIndirectStubsManagerBuilder(const Triple &T) { orc::LocalIndirectStubsManager>(); }; + case Triple::riscv64: + return []() { + return std::make_unique< + orc::LocalIndirectStubsManager>(); + }; + case Triple::x86_64: if (T.getOS() == Triple::OSType::Win32) { return [](){ @@ -431,8 +442,7 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym, auto RelocOffInInstr = MIA.getMemoryOperandRelocationOffset(Instr, InstrSize); - if (!RelocOffInInstr.hasValue() || - InstrSize - RelocOffInInstr.getValue() != 4) { + if (!RelocOffInInstr || InstrSize - *RelocOffInInstr != 4) { LLVM_DEBUG(dbgs() << "Skipping unknown self-relocation at " << InstrStart); continue; diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp index 
0fbf79b8a56d..c60f4b3b263c 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp @@ -19,6 +19,7 @@ JITTargetMachineBuilder::JITTargetMachineBuilder(Triple TT) : TT(std::move(TT)) { Options.EmulatedTLS = true; Options.ExplicitEmulatedTLS = true; + Options.UseInitArray = true; } Expected JITTargetMachineBuilder::detectHost() { diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 91949c9d7eeb..6d67e6d87b56 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -143,7 +143,7 @@ public: JITEvaluatedSymbol(pointerToJITTargetAddress(this), JITSymbolFlags::Exported); StdInterposes[J.mangleAndIntern("__lljit.cxa_atexit_helper")] = - JITEvaluatedSymbol(pointerToJITTargetAddress(registerAtExitHelper), + JITEvaluatedSymbol(pointerToJITTargetAddress(registerCxaAtExitHelper), JITSymbolFlags()); cantFail( @@ -162,6 +162,9 @@ public: PerJDInterposes[J.mangleAndIntern("__lljit.run_atexits_helper")] = JITEvaluatedSymbol(pointerToJITTargetAddress(runAtExitsHelper), JITSymbolFlags()); + PerJDInterposes[J.mangleAndIntern("__lljit.atexit_helper")] = + JITEvaluatedSymbol(pointerToJITTargetAddress(registerAtExitHelper), + JITSymbolFlags()); cantFail(JD.define(absoluteSymbols(std::move(PerJDInterposes)))); auto Ctx = std::make_unique(); @@ -190,6 +193,14 @@ public: GlobalValue::HiddenVisibility, "__lljit.run_atexits_helper", {PlatformInstanceDecl, DSOHandle}); + auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); + auto *AtExitCallbackTy = FunctionType::get(VoidTy, {}, false); + auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy); + addHelperAndWrapper(*M, "atexit", + FunctionType::get(IntTy, {AtExitCallbackPtrTy}, false), + GlobalValue::HiddenVisibility, "__lljit.atexit_helper", + {PlatformInstanceDecl, DSOHandle}); + return J.addIRModule(JD, ThreadSafeModule(std::move(M), std::move(Ctx))); } @@ -413,16 +424,25 @@ private: .takeError(); } - static void registerAtExitHelper(void *Self, void (*F)(void *), void *Ctx, - void *DSOHandle) { + static void registerCxaAtExitHelper(void *Self, void (*F)(void *), void *Ctx, + void *DSOHandle) { LLVM_DEBUG({ - dbgs() << "Registering atexit function " << (void *)F << " for JD " + dbgs() << "Registering cxa atexit function " << (void *)F << " for JD " << (*static_cast(DSOHandle))->getName() << "\n"; }); static_cast(Self)->AtExitMgr.registerAtExit( F, Ctx, DSOHandle); } + static void registerAtExitHelper(void *Self, void *DSOHandle, void (*F)()) { + LLVM_DEBUG({ + dbgs() << "Registering atexit function " << (void *)F << " for JD " + << (*static_cast(DSOHandle))->getName() << "\n"; + }); + static_cast(Self)->AtExitMgr.registerAtExit( + reinterpret_cast(F), nullptr, DSOHandle); + } + static void runAtExitsHelper(void *Self, void *DSOHandle) { LLVM_DEBUG({ dbgs() << "Running atexit functions for JD " @@ -450,12 +470,12 @@ private: auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); auto *VoidTy = Type::getVoidTy(*Ctx); auto *BytePtrTy = PointerType::getUnqual(Int8Ty); - auto *AtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false); - auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy); + auto *CxaAtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false); + auto *CxaAtExitCallbackPtrTy = PointerType::getUnqual(CxaAtExitCallbackTy); addHelperAndWrapper( *M, "__cxa_atexit", - FunctionType::get(IntTy, {AtExitCallbackPtrTy, BytePtrTy, BytePtrTy}, + 
FunctionType::get(IntTy, {CxaAtExitCallbackPtrTy, BytePtrTy, BytePtrTy}, false), GlobalValue::DefaultVisibility, "__lljit.cxa_atexit_helper", {PlatformInstanceDecl}); @@ -521,11 +541,7 @@ GlobalCtorDtorScraper::operator()(ThreadSafeModule TSM, for (auto E : COrDtors) InitsOrDeInits.push_back(std::make_pair(E.Func, E.Priority)); - llvm::sort(InitsOrDeInits, - [](const std::pair &LHS, - const std::pair &RHS) { - return LHS.first < RHS.first; - }); + llvm::sort(InitsOrDeInits, llvm::less_second()); auto *InitOrDeInitFuncEntryBlock = BasicBlock::Create(Ctx, "entry", InitOrDeInitFunc); @@ -589,7 +605,7 @@ void LLJIT::PlatformSupport::setInitTransform( J.InitHelperTransformLayer->setTransform(std::move(T)); } -LLJIT::PlatformSupport::~PlatformSupport() {} +LLJIT::PlatformSupport::~PlatformSupport() = default; Error LLJITBuilderState::prepareForConstruction() { @@ -701,10 +717,14 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr Obj) { return addObjectFile(JD.getDefaultResourceTracker(), std::move(Obj)); } -Expected LLJIT::lookupLinkerMangled(JITDylib &JD, - SymbolStringPtr Name) { - return ES->lookup( - makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols), Name); +Expected LLJIT::lookupLinkerMangled(JITDylib &JD, + SymbolStringPtr Name) { + if (auto Sym = ES->lookup( + makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols), + Name)) + return ExecutorAddr(Sym->getAddress()); + else + return Sym.takeError(); } Expected> @@ -897,7 +917,7 @@ LLLazyJIT::LLLazyJIT(LLLazyJITBuilderState &S, Error &Err) : LLJIT(S, Err) { LCTMgr = std::move(S.LCTMgr); else { if (auto LCTMgrOrErr = createLocalLazyCallThroughManager( - S.TT, *ES, S.LazyCompileFailureAddr)) + S.TT, *ES, S.LazyCompileFailureAddr.getValue())) LCTMgr = std::move(*LCTMgrOrErr); else { Err = LCTMgrOrErr.takeError(); diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp index adb8861793b1..4a50f2d7a153 100644 --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -19,7 +19,7 @@ namespace llvm { namespace orc { -IRLayer::~IRLayer() {} +IRLayer::~IRLayer() = default; Error IRLayer::add(ResourceTrackerSP RT, ThreadSafeModule TSM) { assert(RT && "RT can not be null"); @@ -158,7 +158,7 @@ char ObjectLayer::ID; ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {} -ObjectLayer::~ObjectLayer() {} +ObjectLayer::~ObjectLayer() = default; Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr O, MaterializationUnit::Interface I) { diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 66453e6a632f..20b655bdf4b1 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -131,6 +131,10 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES, case Triple::mips64el: return LocalLazyCallThroughManager::Create(ES, ErrorHandlerAddr); + case Triple::riscv64: + return LocalLazyCallThroughManager::Create(ES, + ErrorHandlerAddr); + case Triple::x86_64: if (T.getOS() == Triple::OSType::Win32) return LocalLazyCallThroughManager::Create( diff --git a/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp index 44cb78c773c9..3452267e4df4 100644 --- a/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp @@ -24,7 +24,7 @@ void lookupAndRecordAddrs( Symbols.add(KV.first, LookupFlags); ES.lookup( - K, 
SearchOrder, Symbols, SymbolState::Ready, + K, SearchOrder, std::move(Symbols), SymbolState::Ready, [Pairs = std::move(Pairs), OnRec = std::move(OnRecorded)](Expected Result) mutable { if (!Result) @@ -47,7 +47,7 @@ Error lookupAndRecordAddrs( std::promise ResultP; auto ResultF = ResultP.get_future(); lookupAndRecordAddrs([&](Error Err) { ResultP.set_value(std::move(Err)); }, - ES, K, SearchOrder, Pairs, LookupFlags); + ES, K, SearchOrder, std::move(Pairs), LookupFlags); return ResultF.get(); } diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index a364719855b4..d5274b06a76f 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -22,6 +22,39 @@ using namespace llvm; using namespace llvm::orc; using namespace llvm::orc::shared; +namespace llvm { +namespace orc { +namespace shared { + +using SPSMachOJITDylibDepInfo = SPSTuple>; +using SPSMachOJITDylibDepInfoMap = + SPSSequence>; + +template <> +class SPSSerializationTraits { +public: + static size_t size(const MachOPlatform::MachOJITDylibDepInfo &DDI) { + return SPSMachOJITDylibDepInfo::AsArgList::size(DDI.Sealed, DDI.DepHeaders); + } + + static bool serialize(SPSOutputBuffer &OB, + const MachOPlatform::MachOJITDylibDepInfo &DDI) { + return SPSMachOJITDylibDepInfo::AsArgList::serialize(OB, DDI.Sealed, + DDI.DepHeaders); + } + + static bool deserialize(SPSInputBuffer &IB, + MachOPlatform::MachOJITDylibDepInfo &DDI) { + return SPSMachOJITDylibDepInfo::AsArgList::deserialize(IB, DDI.Sealed, + DDI.DepHeaders); + } +}; + +} // namespace shared +} // namespace orc +} // namespace llvm + namespace { class MachOHeaderMaterializationUnit : public MaterializationUnit { @@ -199,11 +232,25 @@ MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, } Error MachOPlatform::setupJITDylib(JITDylib &JD) { - return JD.define(std::make_unique( - *this, MachOHeaderStartSymbol)); + if (auto Err = JD.define(std::make_unique( + *this, MachOHeaderStartSymbol))) + return Err; + + return ES.lookup({&JD}, MachOHeaderStartSymbol).takeError(); } -Error MachOPlatform::teardownJITDylib(JITDylib &JD) { return Error::success(); } +Error MachOPlatform::teardownJITDylib(JITDylib &JD) { + std::lock_guard Lock(PlatformMutex); + auto I = JITDylibToHeaderAddr.find(&JD); + if (I != JITDylibToHeaderAddr.end()) { + assert(HeaderAddrToJITDylib.count(I->second) && + "HeaderAddrToJITDylib missing entry"); + HeaderAddrToJITDylib.erase(I->second); + JITDylibToHeaderAddr.erase(I); + } + JITDylibToPThreadKey.erase(&JD); + return Error::success(); +} Error MachOPlatform::notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) { @@ -255,6 +302,10 @@ MachOPlatform::standardRuntimeUtilityAliases() { static const std::pair StandardRuntimeUtilityAliases[] = { {"___orc_rt_run_program", "___orc_rt_macho_run_program"}, + {"___orc_rt_jit_dlerror", "___orc_rt_macho_jit_dlerror"}, + {"___orc_rt_jit_dlopen", "___orc_rt_macho_jit_dlopen"}, + {"___orc_rt_jit_dlclose", "___orc_rt_macho_jit_dlclose"}, + {"___orc_rt_jit_dlsym", "___orc_rt_macho_jit_dlsym"}, {"___orc_rt_log_error", "___orc_rt_log_error_to_stderr"}}; return ArrayRef>( @@ -305,16 +356,6 @@ MachOPlatform::MachOPlatform( State = BootstrapPhase2; - // PlatformJD hasn't been 'set-up' by the platform yet (since we're creating - // the platform now), so set it up. 
- if (auto E2 = setupJITDylib(PlatformJD)) { - Err = std::move(E2); - return; - } - - RegisteredInitSymbols[&PlatformJD].add( - MachOHeaderStartSymbol, SymbolLookupFlags::WeaklyReferencedSymbol); - // Associate wrapper function tags with JIT-side function implementations. if (auto E2 = associateRuntimeSupportFunctions(PlatformJD)) { Err = std::move(E2); @@ -329,23 +370,24 @@ MachOPlatform::MachOPlatform( return; } + // PlatformJD hasn't been set up by the platform yet (since we're creating + // the platform now), so set it up. + if (auto E2 = setupJITDylib(PlatformJD)) { + Err = std::move(E2); + return; + } + State = Initialized; } Error MachOPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) { ExecutionSession::JITDispatchHandlerAssociationMap WFs; - using GetInitializersSPSSig = - SPSExpected(SPSString); - WFs[ES.intern("___orc_rt_macho_get_initializers_tag")] = - ES.wrapAsyncWithSPS( - this, &MachOPlatform::rt_getInitializers); - - using GetDeinitializersSPSSig = - SPSExpected(SPSExecutorAddr); - WFs[ES.intern("___orc_rt_macho_get_deinitializers_tag")] = - ES.wrapAsyncWithSPS( - this, &MachOPlatform::rt_getDeinitializers); + using PushInitializersSPSSig = + SPSExpected(SPSExecutorAddr); + WFs[ES.intern("___orc_rt_macho_push_initializers_tag")] = + ES.wrapAsyncWithSPS( + this, &MachOPlatform::rt_pushInitializers); using LookupSymbolSPSSig = SPSExpected(SPSExecutorAddr, SPSString); @@ -356,53 +398,83 @@ Error MachOPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) { return ES.registerJITDispatchHandlers(PlatformJD, std::move(WFs)); } -void MachOPlatform::getInitializersBuildSequencePhase( - SendInitializerSequenceFn SendResult, JITDylib &JD, - std::vector DFSLinkOrder) { - MachOJITDylibInitializerSequence FullInitSeq; - { - std::lock_guard Lock(PlatformMutex); - for (auto &InitJD : reverse(DFSLinkOrder)) { - LLVM_DEBUG({ - dbgs() << "MachOPlatform: Appending inits for \"" << InitJD->getName() - << "\" to sequence\n"; - }); - auto ISItr = InitSeqs.find(InitJD.get()); - if (ISItr != InitSeqs.end()) { - FullInitSeq.emplace_back(std::move(ISItr->second)); - InitSeqs.erase(ISItr); - } - } - } - - SendResult(std::move(FullInitSeq)); -} - -void MachOPlatform::getInitializersLookupPhase( - SendInitializerSequenceFn SendResult, JITDylib &JD) { - - auto DFSLinkOrder = JD.getDFSLinkOrder(); - if (!DFSLinkOrder) { - SendResult(DFSLinkOrder.takeError()); - return; - } - +void MachOPlatform::pushInitializersLoop( + PushInitializersSendResultFn SendResult, JITDylibSP JD) { DenseMap NewInitSymbols; + DenseMap> JDDepMap; + SmallVector Worklist({JD.get()}); + ES.runSessionLocked([&]() { - for (auto &InitJD : *DFSLinkOrder) { - auto RISItr = RegisteredInitSymbols.find(InitJD.get()); + while (!Worklist.empty()) { + // FIXME: Check for defunct dylibs. + + auto DepJD = Worklist.back(); + Worklist.pop_back(); + + // If we've already visited this JITDylib on this iteration then continue. + if (JDDepMap.count(DepJD)) + continue; + + // Add dep info. + auto &DM = JDDepMap[DepJD]; + DepJD->withLinkOrderDo([&](const JITDylibSearchOrder &O) { + for (auto &KV : O) { + if (KV.first == DepJD) + continue; + DM.push_back(KV.first); + Worklist.push_back(KV.first); + } + }); + + // Add any registered init symbols. 
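+ // (The stored set is moved out and its entry erased, so each JITDylib's
+ // init symbols are claimed by at most one pushInitializers round.)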
+ auto RISItr = RegisteredInitSymbols.find(DepJD); if (RISItr != RegisteredInitSymbols.end()) { - NewInitSymbols[InitJD.get()] = std::move(RISItr->second); + NewInitSymbols[DepJD] = std::move(RISItr->second); RegisteredInitSymbols.erase(RISItr); } } }); - // If there are no further init symbols to look up then move on to the next - // phase. + // If there are no further init symbols to look up then send the link order + // (as a list of header addresses) to the caller. if (NewInitSymbols.empty()) { - getInitializersBuildSequencePhase(std::move(SendResult), JD, - std::move(*DFSLinkOrder)); + + // To make the list intelligible to the runtime we need to convert all + // JITDylib pointers to their header addresses. + DenseMap<JITDylib *, ExecutorAddr> HeaderAddrs; + HeaderAddrs.reserve(JDDepMap.size()); + { + std::lock_guard<std::mutex> Lock(PlatformMutex); + for (auto &KV : JDDepMap) { + auto I = JITDylibToHeaderAddr.find(KV.first); + if (I == JITDylibToHeaderAddr.end()) { + // The header address should have been materialized by the previous + // round, but we need to handle the pathological case where someone + // removes the symbol on another thread while we're running. + SendResult( + make_error<StringError>("JITDylib " + KV.first->getName() + + " has no registered header address", + inconvertibleErrorCode())); + return; + } + HeaderAddrs[KV.first] = I->second; + } + } + + // Build the dep info map to return. + MachOJITDylibDepInfoMap DIM; + DIM.reserve(JDDepMap.size()); + for (auto &KV : JDDepMap) { + assert(HeaderAddrs.count(KV.first) && "Missing header addr"); + auto H = HeaderAddrs[KV.first]; + MachOJITDylibDepInfo DepInfo; + for (auto &Dep : KV.second) { + assert(HeaderAddrs.count(Dep) && "Missing header addr"); + DepInfo.DepHeaders.push_back(HeaderAddrs[Dep]); + } + DIM.push_back(std::make_pair(H, std::move(DepInfo))); + } + SendResult(DIM); return; } @@ -412,58 +484,38 @@ void MachOPlatform::getInitializersLookupPhase( if (Err) SendResult(std::move(Err)); else - getInitializersLookupPhase(std::move(SendResult), JD); + pushInitializersLoop(std::move(SendResult), JD); }, ES, std::move(NewInitSymbols)); } -void MachOPlatform::rt_getInitializers(SendInitializerSequenceFn SendResult, - StringRef JDName) { - LLVM_DEBUG({ - dbgs() << "MachOPlatform::rt_getInitializers(\"" << JDName << "\")\n"; - }); - - JITDylib *JD = ES.getJITDylibByName(JDName); - if (!JD) { - LLVM_DEBUG({ - dbgs() << " No such JITDylib \"" << JDName << "\". 
Sending error.\n"; - }); - SendResult(make_error("No JITDylib named " + JDName, - inconvertibleErrorCode())); - return; - } - - getInitializersLookupPhase(std::move(SendResult), *JD); -} - -void MachOPlatform::rt_getDeinitializers(SendDeinitializerSequenceFn SendResult, - ExecutorAddr Handle) { - LLVM_DEBUG({ - dbgs() << "MachOPlatform::rt_getDeinitializers(\"" - << formatv("{0:x}", Handle.getValue()) << "\")\n"; - }); - - JITDylib *JD = nullptr; - +void MachOPlatform::rt_pushInitializers(PushInitializersSendResultFn SendResult, + ExecutorAddr JDHeaderAddr) { + JITDylibSP JD; { std::lock_guard Lock(PlatformMutex); - auto I = HeaderAddrToJITDylib.find(Handle); + auto I = HeaderAddrToJITDylib.find(JDHeaderAddr); if (I != HeaderAddrToJITDylib.end()) JD = I->second; } + LLVM_DEBUG({ + dbgs() << "MachOPlatform::rt_pushInitializers(" << JDHeaderAddr << ") "; + if (JD) + dbgs() << "pushing initializers for " << JD->getName() << "\n"; + else + dbgs() << "No JITDylib for header address.\n"; + }); + if (!JD) { - LLVM_DEBUG({ - dbgs() << " No JITDylib for handle " - << formatv("{0:x}", Handle.getValue()) << "\n"; - }); - SendResult(make_error("No JITDylib associated with handle " + - formatv("{0:x}", Handle.getValue()), - inconvertibleErrorCode())); + SendResult( + make_error("No JITDylib with header addr " + + formatv("{0:x}", JDHeaderAddr.getValue()), + inconvertibleErrorCode())); return; } - SendResult(MachOJITDylibDeinitializerSequence()); + pushInitializersLoop(std::move(SendResult), JD); } void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult, @@ -526,10 +578,14 @@ Error MachOPlatform::bootstrapMachORuntime(JITDylib &PlatformJD) { &orc_rt_macho_platform_bootstrap}, {ES.intern("___orc_rt_macho_platform_shutdown"), &orc_rt_macho_platform_shutdown}, - {ES.intern("___orc_rt_macho_register_thread_data_section"), - &orc_rt_macho_register_thread_data_section}, - {ES.intern("___orc_rt_macho_deregister_thread_data_section"), - &orc_rt_macho_deregister_thread_data_section}, + {ES.intern("___orc_rt_macho_register_jitdylib"), + &orc_rt_macho_register_jitdylib}, + {ES.intern("___orc_rt_macho_deregister_jitdylib"), + &orc_rt_macho_deregister_jitdylib}, + {ES.intern("___orc_rt_macho_register_object_platform_sections"), + &orc_rt_macho_register_object_platform_sections}, + {ES.intern("___orc_rt_macho_deregister_object_platform_sections"), + &orc_rt_macho_deregister_object_platform_sections}, {ES.intern("___orc_rt_macho_create_pthread_key"), &orc_rt_macho_create_pthread_key}})) return Err; @@ -537,45 +593,6 @@ Error MachOPlatform::bootstrapMachORuntime(JITDylib &PlatformJD) { return ES.callSPSWrapper(orc_rt_macho_platform_bootstrap); } -Error MachOPlatform::registerInitInfo( - JITDylib &JD, ExecutorAddr ObjCImageInfoAddr, - ArrayRef InitSections) { - - std::unique_lock Lock(PlatformMutex); - - MachOJITDylibInitializers *InitSeq = nullptr; - { - auto I = InitSeqs.find(&JD); - if (I == InitSeqs.end()) { - // If there's no init sequence entry yet then we need to look up the - // header symbol to force creation of one. - Lock.unlock(); - - auto SearchOrder = - JD.withLinkOrderDo([](const JITDylibSearchOrder &SO) { return SO; }); - if (auto Err = ES.lookup(SearchOrder, MachOHeaderStartSymbol).takeError()) - return Err; - - Lock.lock(); - I = InitSeqs.find(&JD); - assert(I != InitSeqs.end() && - "Entry missing after header symbol lookup?"); - } - InitSeq = &I->second; - } - - InitSeq->ObjCImageInfoAddress = ObjCImageInfoAddr; - - for (auto *Sec : InitSections) { - // FIXME: Avoid copy here. 
- jitlink::SectionRange R(*Sec); - InitSeq->InitSections[Sec->getName()].push_back( - {ExecutorAddr(R.getStart()), ExecutorAddr(R.getEnd())}); - } - - return Error::success(); -} - Expected MachOPlatform::createPThreadKey() { if (!orc_rt_macho_create_pthread_key) return make_error( @@ -617,11 +634,6 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig( return Err; return processObjCImageInfo(G, MR); }); - - Config.PostFixupPasses.push_back( - [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) { - return registerInitSections(G, JD); - }); } // --- Add passes for eh-frame and TLV support --- @@ -639,10 +651,12 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig( return fixTLVSectionsAndEdges(G, JD); }); - // Add a pass to register the final addresses of the eh-frame and TLV sections - // with the runtime. - Config.PostFixupPasses.push_back( - [this](jitlink::LinkGraph &G) { return registerEHAndTLVSections(G); }); + // Add a pass to register the final addresses of any special sections in the + // object with the runtime. + Config.PostAllocationPasses.push_back( + [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) { + return registerObjectPlatformSections(G, JD); + }); } ObjectLinkingLayer::Plugin::SyntheticSymbolDependenciesMap @@ -661,7 +675,6 @@ MachOPlatform::MachOPlatformPlugin::getSyntheticSymbolDependencies( Error MachOPlatform::MachOPlatformPlugin::associateJITDylibHeaderSymbol( jitlink::LinkGraph &G, MaterializationResponsibility &MR) { - auto I = llvm::find_if(G.defined_symbols(), [this](jitlink::Symbol *Sym) { return Sym->getName() == *MP.MachOHeaderStartSymbol; }); @@ -670,10 +683,14 @@ Error MachOPlatform::MachOPlatformPlugin::associateJITDylibHeaderSymbol( auto &JD = MR.getTargetJITDylib(); std::lock_guard Lock(MP.PlatformMutex); auto HeaderAddr = (*I)->getAddress(); + MP.JITDylibToHeaderAddr[&JD] = HeaderAddr; MP.HeaderAddrToJITDylib[HeaderAddr] = &JD; - assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists"); - MP.InitSeqs.insert( - std::make_pair(&JD, MachOJITDylibInitializers(JD.getName(), HeaderAddr))); + G.allocActions().push_back( + {cantFail( + WrapperFunctionCall::Create>( + MP.orc_rt_macho_register_jitdylib, JD.getName(), HeaderAddr)), + cantFail(WrapperFunctionCall::Create>( + MP.orc_rt_macho_deregister_jitdylib, HeaderAddr))}); return Error::success(); } @@ -792,37 +809,6 @@ Error MachOPlatform::MachOPlatformPlugin::processObjCImageInfo( return Error::success(); } -Error MachOPlatform::MachOPlatformPlugin::registerInitSections( - jitlink::LinkGraph &G, JITDylib &JD) { - - ExecutorAddr ObjCImageInfoAddr; - SmallVector InitSections; - - if (auto *ObjCImageInfoSec = G.findSectionByName(ObjCImageInfoSectionName)) { - if (auto Addr = jitlink::SectionRange(*ObjCImageInfoSec).getStart()) - ObjCImageInfoAddr = Addr; - } - - for (auto InitSectionName : InitSectionNames) - if (auto *Sec = G.findSectionByName(InitSectionName)) - InitSections.push_back(Sec); - - // Dump the scraped inits. 
- LLVM_DEBUG({ - dbgs() << "MachOPlatform: Scraped " << G.getName() << " init sections:\n"; - if (ObjCImageInfoAddr) - dbgs() << " " << ObjCImageInfoSectionName << ": " - << formatv("{0:x}", ObjCImageInfoAddr.getValue()) << "\n"; - for (auto *Sec : InitSections) { - jitlink::SectionRange R(*Sec); - dbgs() << " " << Sec->getName() << ": " - << formatv("[ {0:x} -- {1:x} ]", R.getStart(), R.getEnd()) << "\n"; - } - }); - - return MP.registerInitInfo(JD, ObjCImageInfoAddr, InitSections); -} - Error MachOPlatform::MachOPlatformPlugin::fixTLVSectionsAndEdges( jitlink::LinkGraph &G, JITDylib &JD) { @@ -879,11 +865,10 @@ Error MachOPlatform::MachOPlatformPlugin::fixTLVSectionsAndEdges( return Error::success(); } -Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections( - jitlink::LinkGraph &G) { +Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( + jitlink::LinkGraph &G, JITDylib &JD) { - // Add a pass to register the final addresses of the eh-frame and TLV sections - // with the runtime. + // Add an action to register the eh-frame. if (auto *EHFrameSection = G.findSectionByName(EHFrameSectionName)) { jitlink::SectionRange R(*EHFrameSection); if (!R.empty()) @@ -912,6 +897,8 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections( ThreadDataSection = ThreadBSSSection; } + SmallVector, 8> MachOPlatformSecs; + // Having merged thread BSS (if present) and thread data (if present), // record the resulting section range. if (ThreadDataSection) { @@ -922,16 +909,64 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections( "MachOPlatform has not finished booting", inconvertibleErrorCode()); - G.allocActions().push_back( - {cantFail( - WrapperFunctionCall::Create>( - MP.orc_rt_macho_register_thread_data_section, R.getRange())), - cantFail( - WrapperFunctionCall::Create>( - MP.orc_rt_macho_deregister_thread_data_section, - R.getRange()))}); + MachOPlatformSecs.push_back({ThreadDataSectionName, R.getRange()}); + } + } + + // If any platform sections were found then add an allocation action to call + // the registration function. + StringRef PlatformSections[] = { + ModInitFuncSectionName, ObjCClassListSectionName, + ObjCImageInfoSectionName, ObjCSelRefsSectionName, + Swift5ProtoSectionName, Swift5ProtosSectionName, + Swift5TypesSectionName, + }; + + for (auto &SecName : PlatformSections) { + auto *Sec = G.findSectionByName(SecName); + if (!Sec) + continue; + jitlink::SectionRange R(*Sec); + if (R.empty()) + continue; + + MachOPlatformSecs.push_back({SecName, R.getRange()}); + } + + if (!MachOPlatformSecs.empty()) { + Optional HeaderAddr; + { + std::lock_guard Lock(MP.PlatformMutex); + auto I = MP.JITDylibToHeaderAddr.find(&JD); + if (I != MP.JITDylibToHeaderAddr.end()) + HeaderAddr = I->second; } + + if (!HeaderAddr) + return make_error("Missing header for " + JD.getName(), + inconvertibleErrorCode()); + + // Dump the scraped inits. 
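+ // (One line per platform section: its name and the final address range
+ // allocated for it in the executor.)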
+ LLVM_DEBUG({ + dbgs() << "MachOPlatform: Scraped " << G.getName() << " init sections:\n"; + for (auto &KV : MachOPlatformSecs) + dbgs() << " " << KV.first << ": " << KV.second << "\n"; + }); + + using SPSRegisterObjectPlatformSectionsArgs = + SPSArgList>>; + G.allocActions().push_back( + {cantFail( + WrapperFunctionCall::Create( + MP.orc_rt_macho_register_object_platform_sections, *HeaderAddr, + MachOPlatformSecs)), + cantFail( + WrapperFunctionCall::Create( + MP.orc_rt_macho_deregister_object_platform_sections, + *HeaderAddr, MachOPlatformSecs))}); } + return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp new file mode 100644 index 000000000000..8b3fbd7117e2 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -0,0 +1,152 @@ +//===- MemoryMapper.cpp - Cross-process memory mapper ------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/MemoryMapper.h" + +namespace llvm { +namespace orc { + +MemoryMapper::~MemoryMapper() {} + +void InProcessMemoryMapper::reserve(size_t NumBytes, + OnReservedFunction OnReserved) { + std::error_code EC; + auto MB = sys::Memory::allocateMappedMemory( + NumBytes, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); + + if (EC) + return OnReserved(errorCodeToError(EC)); + + { + std::lock_guard Lock(Mutex); + Reservations[MB.base()].Size = MB.allocatedSize(); + } + + OnReserved( + ExecutorAddrRange(ExecutorAddr::fromPtr(MB.base()), MB.allocatedSize())); +} + +char *InProcessMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) { + return Addr.toPtr(); +} + +void InProcessMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, + OnInitializedFunction OnInitialized) { + ExecutorAddr MinAddr(~0ULL); + + for (auto &Segment : AI.Segments) { + auto Base = AI.MappingBase + Segment.Offset; + auto Size = Segment.ContentSize + Segment.ZeroFillSize; + + if (Base < MinAddr) + MinAddr = Base; + + std::memset((Base + Segment.ContentSize).toPtr(), 0, + Segment.ZeroFillSize); + + if (auto EC = sys::Memory::protectMappedMemory({Base.toPtr(), Size}, + Segment.Prot)) { + return OnInitialized(errorCodeToError(EC)); + } + if (Segment.Prot & sys::Memory::MF_EXEC) + sys::Memory::InvalidateInstructionCache(Base.toPtr(), Size); + } + + auto DeinitializeActions = shared::runFinalizeActions(AI.Actions); + if (!DeinitializeActions) + return OnInitialized(DeinitializeActions.takeError()); + + { + std::lock_guard Lock(Mutex); + Allocations[MinAddr].DeinitializationActions = + std::move(*DeinitializeActions); + Reservations[AI.MappingBase.toPtr()].Allocations.push_back(MinAddr); + } + + OnInitialized(MinAddr); +} + +void InProcessMemoryMapper::deinitialize( + ArrayRef Bases, + MemoryMapper::OnDeinitializedFunction OnDeinitialized) { + Error AllErr = Error::success(); + + { + std::lock_guard Lock(Mutex); + + for (auto Base : Bases) { + + if (Error Err = shared::runDeallocActions( + Allocations[Base].DeinitializationActions)) { + AllErr = joinErrors(std::move(AllErr), std::move(Err)); + } + + Allocations.erase(Base); + } + } + + OnDeinitialized(std::move(AllErr)); +} + +void InProcessMemoryMapper::release(ArrayRef Bases, + OnReleasedFunction OnReleased) { + Error Err = Error::success(); + + 
for (auto Base : Bases) { + std::vector AllocAddrs; + size_t Size; + { + std::lock_guard Lock(Mutex); + auto &R = Reservations[Base.toPtr()]; + Size = R.Size; + AllocAddrs.swap(R.Allocations); + } + + // deinitialize sub allocations + std::promise P; + auto F = P.get_future(); + deinitialize(AllocAddrs, [&](Error Err) { P.set_value(std::move(Err)); }); + if (Error E = F.get()) { + Err = joinErrors(std::move(Err), std::move(E)); + } + + // free the memory + auto MB = sys::MemoryBlock(Base.toPtr(), Size); + + auto EC = sys::Memory::releaseMappedMemory(MB); + if (EC) { + Err = joinErrors(std::move(Err), errorCodeToError(EC)); + } + + std::lock_guard Lock(Mutex); + Reservations.erase(Base.toPtr()); + } + + OnReleased(std::move(Err)); +} + +InProcessMemoryMapper::~InProcessMemoryMapper() { + std::vector ReservationAddrs; + { + std::lock_guard Lock(Mutex); + + ReservationAddrs.reserve(Reservations.size()); + for (const auto &R : Reservations) { + ReservationAddrs.push_back(ExecutorAddr::fromPtr(R.getFirst())); + } + } + + std::promise P; + auto F = P.get_future(); + release(ReservationAddrs, [&](Error Err) { P.set_value(std::move(Err)); }); + cantFail(F.get()); +} + +} // namespace orc + +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp index c1ad569dd65d..394a555e453b 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp @@ -63,7 +63,6 @@ getMachOObjectFileSymbolInfo(ExecutionSession &ES, auto Name = Sym.getName(); if (!Name) return Name.takeError(); - auto InternedName = ES.intern(*Name); auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); if (!SymFlags) return SymFlags.takeError(); @@ -72,7 +71,7 @@ getMachOObjectFileSymbolInfo(ExecutionSession &ES, if (Name->startswith("l")) *SymFlags &= ~JITSymbolFlags::Exported; - I.SymbolFlags[InternedName] = std::move(*SymFlags); + I.SymbolFlags[ES.intern(*Name)] = std::move(*SymFlags); } for (auto &Sec : Obj.sections()) { @@ -121,7 +120,7 @@ getELFObjectFileSymbolInfo(ExecutionSession &ES, auto Name = Sym.getName(); if (!Name) return Name.takeError(); - auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); if (!SymFlags) return SymFlags.takeError(); @@ -130,7 +129,7 @@ getELFObjectFileSymbolInfo(ExecutionSession &ES, if (Sym.getBinding() == ELF::STB_GNU_UNIQUE) *SymFlags |= JITSymbolFlags::Weak; - I.SymbolFlags[InternedName] = std::move(*SymFlags); + I.SymbolFlags[ES.intern(*Name)] = std::move(*SymFlags); } SymbolStringPtr InitSymbol; @@ -175,12 +174,12 @@ getGenericObjectFileSymbolInfo(ExecutionSession &ES, auto Name = Sym.getName(); if (!Name) return Name.takeError(); - auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); if (!SymFlags) return SymFlags.takeError(); - I.SymbolFlags[InternedName] = std::move(*SymFlags); + I.SymbolFlags[ES.intern(*Name)] = std::move(*SymFlags); } return I; diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 32c5998a789b..5ddb35cbafd5 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -78,9 +78,12 @@ private: } static bool hasELFInitSection(LinkGraph &G) { - for (auto &Sec : G.sections()) - if (Sec.getName() == ".init_array") + for (auto &Sec : G.sections()) { + auto SecName = Sec.getName(); + if 
(SecName.consume_front(".init_array") && + (SecName.empty() || SecName[0] == '.')) return true; + } return false; } @@ -226,12 +229,13 @@ public: } for (auto *Sym : G.absolute_symbols()) - if (Sym->hasName()) { + if (Sym->hasName() && Sym->getScope() != Scope::Local) { auto InternedName = ES.intern(Sym->getName()); JITSymbolFlags Flags; - Flags |= JITSymbolFlags::Absolute; if (Sym->isCallable()) Flags |= JITSymbolFlags::Callable; + if (Sym->getScope() == Scope::Default) + Flags |= JITSymbolFlags::Exported; if (Sym->getLinkage() == Linkage::Weak) Flags |= JITSymbolFlags::Weak; InternedResult[InternedName] = @@ -607,7 +611,7 @@ private: DenseMap InternalNamedSymbolDeps; }; -ObjectLinkingLayer::Plugin::~Plugin() {} +ObjectLinkingLayer::Plugin::~Plugin() = default; char ObjectLinkingLayer::ID; diff --git a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp index 18b3c5e12b1c..ef764a3f0d7f 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp @@ -906,5 +906,176 @@ void OrcMips64::writeIndirectStubsBlock( Stub[8 * I + 7] = 0x00000000; // nop } } + +void OrcRiscv64::writeResolverCode(char *ResolverWorkingMem, + JITTargetAddress ResolverTargetAddress, + JITTargetAddress ReentryFnAddr, + JITTargetAddress ReentryCtxAddr) { + + const uint32_t ResolverCode[] = { + 0xef810113, // 0x00: addi sp,sp,-264 + 0x00813023, // 0x04: sd s0,0(sp) + 0x00913423, // 0x08: sd s1,8(sp) + 0x01213823, // 0x0c: sd s2,16(sp) + 0x01313c23, // 0x10: sd s3,24(sp) + 0x03413023, // 0x14: sd s4,32(sp) + 0x03513423, // 0x18: sd s5,40(sp) + 0x03613823, // 0x1c: sd s6,48(sp) + 0x03713c23, // 0x20: sd s7,56(sp) + 0x05813023, // 0x24: sd s8,64(sp) + 0x05913423, // 0x28: sd s9,72(sp) + 0x05a13823, // 0x2c: sd s10,80(sp) + 0x05b13c23, // 0x30: sd s11,88(sp) + 0x06113023, // 0x34: sd ra,96(sp) + 0x06a13423, // 0x38: sd a0,104(sp) + 0x06b13823, // 0x3c: sd a1,112(sp) + 0x06c13c23, // 0x40: sd a2,120(sp) + 0x08d13023, // 0x44: sd a3,128(sp) + 0x08e13423, // 0x48: sd a4,136(sp) + 0x08f13823, // 0x4c: sd a5,144(sp) + 0x09013c23, // 0x50: sd a6,152(sp) + 0x0b113023, // 0x54: sd a7,160(sp) + 0x0a813427, // 0x58: fsd fs0,168(sp) + 0x0a913827, // 0x5c: fsd fs1,176(sp) + 0x0b213c27, // 0x60: fsd fs2,184(sp) + 0x0d313027, // 0x64: fsd fs3,192(sp) + 0x0d413427, // 0x68: fsd fs4,200(sp) + 0x0d513827, // 0x6c: fsd fs5,208(sp) + 0x0d613c27, // 0x70: fsd fs6,216(sp) + 0x0f713027, // 0x74: fsd fs7,224(sp) + 0x0f813427, // 0x78: fsd fs8,232(sp) + 0x0f913827, // 0x7c: fsd fs9,240(sp) + 0x0fa13c27, // 0x80: fsd fs10,248(sp) + 0x11b13027, // 0x84: fsd fs11,256(sp) + 0x00000517, // 0x88: auipc a0,0x0 + 0x0b053503, // 0x8c: ld a0,176(a0) # 0x138 + 0x00030593, // 0x90: mv a1,t1 + 0xff458593, // 0x94: addi a1,a1,-12 + 0x00000617, // 0x98: auipc a2,0x0 + 0x0a863603, // 0x9c: ld a2,168(a2) # 0x140 + 0x000600e7, // 0xa0: jalr a2 + 0x00050293, // 0xa4: mv t0,a0 + 0x00013403, // 0xa8: ld s0,0(sp) + 0x00813483, // 0xac: ld s1,8(sp) + 0x01013903, // 0xb0: ld s2,16(sp) + 0x01813983, // 0xb4: ld s3,24(sp) + 0x02013a03, // 0xb8: ld s4,32(sp) + 0x02813a83, // 0xbc: ld s5,40(sp) + 0x03013b03, // 0xc0: ld s6,48(sp) + 0x03813b83, // 0xc4: ld s7,56(sp) + 0x04013c03, // 0xc8: ld s8,64(sp) + 0x04813c83, // 0xcc: ld s9,72(sp) + 0x05013d03, // 0xd0: ld s10,80(sp) + 0x05813d83, // 0xd4: ld s11,88(sp) + 0x06013083, // 0xd8: ld ra,96(sp) + 0x06813503, // 0xdc: ld a0,104(sp) + 0x07013583, // 0xe0: ld a1,112(sp) + 0x07813603, // 0xe4: ld a2,120(sp) + 0x08013683, // 0xe8: ld 
a3,128(sp) + 0x08813703, // 0xec: ld a4,136(sp) + 0x09013783, // 0xf0: ld a5,144(sp) + 0x09813803, // 0xf4: ld a6,152(sp) + 0x0a013883, // 0xf8: ld a7,160(sp) + 0x0a813407, // 0xfc: fld fs0,168(sp) + 0x0b013487, // 0x100: fld fs1,176(sp) + 0x0b813907, // 0x104: fld fs2,184(sp) + 0x0c013987, // 0x108: fld fs3,192(sp) + 0x0c813a07, // 0x10c: fld fs4,200(sp) + 0x0d013a87, // 0x110: fld fs5,208(sp) + 0x0d813b07, // 0x114: fld fs6,216(sp) + 0x0e013b87, // 0x118: fld fs7,224(sp) + 0x0e813c07, // 0x11c: fld fs8,232(sp) + 0x0f013c87, // 0x120: fld fs9,240(sp) + 0x0f813d07, // 0x124: fld fs10,248(sp) + 0x10013d87, // 0x128: fld fs11,256(sp) + 0x10810113, // 0x12c: addi sp,sp,264 + 0x00028067, // 0x130: jr t0 + 0x12345678, // 0x134: padding to align at 8 byte + 0x12345678, // 0x138: Lreentry_ctx_ptr: + 0xdeadbeef, // 0x13c: .quad 0 + 0x98765432, // 0x140: Lreentry_fn_ptr: + 0xcafef00d // 0x144: .quad 0 + }; + + const unsigned ReentryCtxAddrOffset = 0x138; + const unsigned ReentryFnAddrOffset = 0x140; + + memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode)); + memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr, + sizeof(uint64_t)); + memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr, + sizeof(uint64_t)); +} + +void OrcRiscv64::writeTrampolines(char *TrampolineBlockWorkingMem, + JITTargetAddress TrampolineBlockTargetAddress, + JITTargetAddress ResolverAddr, + unsigned NumTrampolines) { + + unsigned OffsetToPtr = alignTo(NumTrampolines * TrampolineSize, 8); + + memcpy(TrampolineBlockWorkingMem + OffsetToPtr, &ResolverAddr, + sizeof(uint64_t)); + + uint32_t *Trampolines = + reinterpret_cast(TrampolineBlockWorkingMem); + for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) { + uint32_t Hi20 = (OffsetToPtr + 0x800) & 0xFFFFF000; + uint32_t Lo12 = OffsetToPtr - Hi20; + Trampolines[4 * I + 0] = 0x00000297 | Hi20; // auipc t0, %hi(Lptr) + Trampolines[4 * I + 1] = + 0x0002b283 | ((Lo12 & 0xFFF) << 20); // ld t0, %lo(Lptr) + Trampolines[4 * I + 2] = 0x00028367; // jalr t1, t0 + Trampolines[4 * I + 3] = 0xdeadface; // padding + } +} + +void OrcRiscv64::writeIndirectStubsBlock( + char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress, + JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) { + // Stub format is: + // + // .section __orc_stubs + // stub1: + // auipc t0, %hi(ptr1) ; PC-rel load of ptr1 + // ld t0, %lo(t0) + // jr t0 ; Jump to resolver + // .quad 0 ; Pad to 16 bytes + // stub2: + // auipc t0, %hi(ptr1) ; PC-rel load of ptr1 + // ld t0, %lo(t0) + // jr t0 ; Jump to resolver + // .quad 0 + // + // ... + // + // .section __orc_ptrs + // ptr1: + // .quad 0x0 + // ptr2: + // .quad 0x0 + // + // ... + + assert(stubAndPointerRangesOk( + StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) && + "PointersBlock is out of range"); + + uint32_t *Stub = reinterpret_cast(StubsBlockWorkingMem); + + for (unsigned I = 0; I < NumStubs; ++I) { + uint64_t PtrDisplacement = + PointersBlockTargetAddress - StubsBlockTargetAddress; + uint32_t Hi20 = (PtrDisplacement + 0x800) & 0xFFFFF000; + uint32_t Lo12 = PtrDisplacement - Hi20; + Stub[4 * I + 0] = 0x00000297 | Hi20; // auipc t0, %hi(Lptr) + Stub[4 * I + 1] = 0x0002b283 | ((Lo12 & 0xFFF) << 20); // ld t0, %lo(Lptr) + Stub[4 * I + 2] = 0x00028067; // jr t0 + Stub[4 * I + 3] = 0xfeedbeef; // padding + PointersBlockTargetAddress += PointerSize; + StubsBlockTargetAddress += StubSize; + } +} + } // End namespace orc. } // End namespace llvm. 
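Aside (illustrative sketch, not part of the patch): the RISC-V resolver, trampoline, and stub writers above all split a pc-relative displacement with Hi20 = (Disp + 0x800) & 0xFFFFF000 and Lo12 = Disp - Hi20. auipc consumes the pre-positioned upper bits and the following ld adds a sign-extended 12-bit offset, so the pair reaches exactly Disp; adding 0x800 before masking rounds to the nearest 4 KiB page so that Lo12 always fits in a signed 12-bit field. A self-contained check of that invariant, assuming nothing beyond the C++ standard library:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Split a pc-relative displacement the way the RISC-V writers above do:
// Hi20 carries bits 31:12 (already positioned for OR-ing into auipc),
// Lo12 is the signed 12-bit remainder consumed by the following ld.
static void splitHi20Lo12(int32_t Disp, int32_t &Hi20, int32_t &Lo12) {
  Hi20 = static_cast<int32_t>((Disp + 0x800) & 0xFFFFF000);
  Lo12 = Disp - Hi20; // always in [-2048, 2047]
}

int main() {
  for (int32_t Disp : {0, 0x7FF, 0x800, 0x1234, -0x1234}) {
    int32_t Hi20 = 0, Lo12 = 0;
    splitHi20Lo12(Disp, Hi20, Lo12);
    assert(Lo12 >= -2048 && Lo12 <= 2047);
    assert(Hi20 + Lo12 == Disp); // auipc + ld lands exactly on Disp
  }
  return 0;
}

Because Lo12 is signed, the pair covers any displacement within roughly +/-2 GiB, which is why a single auipc/ld sequence suffices for both the trampoline-to-resolver and stub-to-pointer references.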
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 71be8dfdc004..b7eab6b85ecf 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -106,82 +106,6 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(LLJITBuilder, LLVMOrcLLJITBuilderRef) DEFINE_SIMPLE_CONVERSION_FUNCTIONS(LLJIT, LLVMOrcLLJITRef) DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef) -namespace llvm { -namespace orc { - -class CAPIDefinitionGenerator final : public DefinitionGenerator { -public: - CAPIDefinitionGenerator( - void *Ctx, - LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate) - : Ctx(Ctx), TryToGenerate(TryToGenerate) {} - - Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, - JITDylibLookupFlags JDLookupFlags, - const SymbolLookupSet &LookupSet) override { - - // Take the lookup state. - LLVMOrcLookupStateRef LSR = ::wrap(OrcV2CAPIHelper::extractLookupState(LS)); - - // Translate the lookup kind. - LLVMOrcLookupKind CLookupKind; - switch (K) { - case LookupKind::Static: - CLookupKind = LLVMOrcLookupKindStatic; - break; - case LookupKind::DLSym: - CLookupKind = LLVMOrcLookupKindDLSym; - break; - } - - // Translate the JITDylibSearchFlags. - LLVMOrcJITDylibLookupFlags CJDLookupFlags; - switch (JDLookupFlags) { - case JITDylibLookupFlags::MatchExportedSymbolsOnly: - CJDLookupFlags = LLVMOrcJITDylibLookupFlagsMatchExportedSymbolsOnly; - break; - case JITDylibLookupFlags::MatchAllSymbols: - CJDLookupFlags = LLVMOrcJITDylibLookupFlagsMatchAllSymbols; - break; - } - - // Translate the lookup set. - std::vector CLookupSet; - CLookupSet.reserve(LookupSet.size()); - for (auto &KV : LookupSet) { - LLVMOrcSymbolLookupFlags SLF; - LLVMOrcSymbolStringPoolEntryRef Name = - ::wrap(OrcV2CAPIHelper::getRawPoolEntryPtr(KV.first)); - switch (KV.second) { - case SymbolLookupFlags::RequiredSymbol: - SLF = LLVMOrcSymbolLookupFlagsRequiredSymbol; - break; - case SymbolLookupFlags::WeaklyReferencedSymbol: - SLF = LLVMOrcSymbolLookupFlagsWeaklyReferencedSymbol; - break; - } - CLookupSet.push_back({Name, SLF}); - } - - // Run the C TryToGenerate function. - auto Err = unwrap(TryToGenerate(::wrap(this), Ctx, &LSR, CLookupKind, - ::wrap(&JD), CJDLookupFlags, - CLookupSet.data(), CLookupSet.size())); - - // Restore the lookup state. 
- OrcV2CAPIHelper::resetLookupState(LS, ::unwrap(LSR)); - - return Err; - } - -private: - void *Ctx; - LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate; -}; - -} // end namespace orc -} // end namespace llvm - namespace { class OrcCAPIMaterializationUnit : public llvm::orc::MaterializationUnit { @@ -282,8 +206,134 @@ toSymbolDependenceMap(LLVMOrcCDependenceMapPairs Pairs, size_t NumPairs) { return SDM; } +static LookupKind toLookupKind(LLVMOrcLookupKind K) { + switch (K) { + case LLVMOrcLookupKindStatic: + return LookupKind::Static; + case LLVMOrcLookupKindDLSym: + return LookupKind::DLSym; + } + llvm_unreachable("unrecognized LLVMOrcLookupKind value"); +} + +static LLVMOrcLookupKind fromLookupKind(LookupKind K) { + switch (K) { + case LookupKind::Static: + return LLVMOrcLookupKindStatic; + case LookupKind::DLSym: + return LLVMOrcLookupKindDLSym; + } + llvm_unreachable("unrecognized LookupKind value"); +} + +static JITDylibLookupFlags +toJITDylibLookupFlags(LLVMOrcJITDylibLookupFlags LF) { + switch (LF) { + case LLVMOrcJITDylibLookupFlagsMatchExportedSymbolsOnly: + return JITDylibLookupFlags::MatchExportedSymbolsOnly; + case LLVMOrcJITDylibLookupFlagsMatchAllSymbols: + return JITDylibLookupFlags::MatchAllSymbols; + } + llvm_unreachable("unrecognized LLVMOrcJITDylibLookupFlags value"); +} + +static LLVMOrcJITDylibLookupFlags +fromJITDylibLookupFlags(JITDylibLookupFlags LF) { + switch (LF) { + case JITDylibLookupFlags::MatchExportedSymbolsOnly: + return LLVMOrcJITDylibLookupFlagsMatchExportedSymbolsOnly; + case JITDylibLookupFlags::MatchAllSymbols: + return LLVMOrcJITDylibLookupFlagsMatchAllSymbols; + } + llvm_unreachable("unrecognized JITDylibLookupFlags value"); +} + +static SymbolLookupFlags toSymbolLookupFlags(LLVMOrcSymbolLookupFlags SLF) { + switch (SLF) { + case LLVMOrcSymbolLookupFlagsRequiredSymbol: + return SymbolLookupFlags::RequiredSymbol; + case LLVMOrcSymbolLookupFlagsWeaklyReferencedSymbol: + return SymbolLookupFlags::WeaklyReferencedSymbol; + } + llvm_unreachable("unrecognized LLVMOrcSymbolLookupFlags value"); +} + +static LLVMOrcSymbolLookupFlags fromSymbolLookupFlags(SymbolLookupFlags SLF) { + switch (SLF) { + case SymbolLookupFlags::RequiredSymbol: + return LLVMOrcSymbolLookupFlagsRequiredSymbol; + case SymbolLookupFlags::WeaklyReferencedSymbol: + return LLVMOrcSymbolLookupFlagsWeaklyReferencedSymbol; + } + llvm_unreachable("unrecognized SymbolLookupFlags value"); +} + +static LLVMJITEvaluatedSymbol +fromJITEvaluatedSymbol(const JITEvaluatedSymbol &S) { + return {S.getAddress(), fromJITSymbolFlags(S.getFlags())}; +} + } // end anonymous namespace +namespace llvm { +namespace orc { + +class CAPIDefinitionGenerator final : public DefinitionGenerator { +public: + CAPIDefinitionGenerator( + LLVMOrcDisposeCAPIDefinitionGeneratorFunction Dispose, void *Ctx, + LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate) + : Dispose(Dispose), Ctx(Ctx), TryToGenerate(TryToGenerate) {} + + ~CAPIDefinitionGenerator() { + if (Dispose) + Dispose(Ctx); + } + + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &LookupSet) override { + + // Take the lookup state. + LLVMOrcLookupStateRef LSR = ::wrap(OrcV2CAPIHelper::extractLookupState(LS)); + + // Translate the lookup kind. + LLVMOrcLookupKind CLookupKind = fromLookupKind(K); + + // Translate the JITDylibLookupFlags. + LLVMOrcJITDylibLookupFlags CJDLookupFlags = + fromJITDylibLookupFlags(JDLookupFlags); + + // Translate the lookup set. 
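+ // (The set is rebuilt as a contiguous vector so it can be handed to the
+ // C callback below as a data pointer plus length.)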
+ std::vector CLookupSet; + CLookupSet.reserve(LookupSet.size()); + for (auto &KV : LookupSet) { + LLVMOrcSymbolStringPoolEntryRef Name = + ::wrap(OrcV2CAPIHelper::getRawPoolEntryPtr(KV.first)); + LLVMOrcSymbolLookupFlags SLF = fromSymbolLookupFlags(KV.second); + CLookupSet.push_back({Name, SLF}); + } + + // Run the C TryToGenerate function. + auto Err = unwrap(TryToGenerate(::wrap(this), Ctx, &LSR, CLookupKind, + ::wrap(&JD), CJDLookupFlags, + CLookupSet.data(), CLookupSet.size())); + + // Restore the lookup state. + OrcV2CAPIHelper::resetLookupState(LS, ::unwrap(LSR)); + + return Err; + } + +private: + LLVMOrcDisposeCAPIDefinitionGeneratorFunction Dispose; + void *Ctx; + LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate; +}; + +} // end namespace orc +} // end namespace llvm + void LLVMOrcExecutionSessionSetErrorReporter( LLVMOrcExecutionSessionRef ES, LLVMOrcErrorReporterFunction ReportError, void *Ctx) { @@ -307,6 +357,42 @@ LLVMOrcExecutionSessionIntern(LLVMOrcExecutionSessionRef ES, const char *Name) { OrcV2CAPIHelper::moveFromSymbolStringPtr(unwrap(ES)->intern(Name))); } +void LLVMOrcExecutionSessionLookup( + LLVMOrcExecutionSessionRef ES, LLVMOrcLookupKind K, + LLVMOrcCJITDylibSearchOrder SearchOrder, size_t SearchOrderSize, + LLVMOrcCLookupSet Symbols, size_t SymbolsSize, + LLVMOrcExecutionSessionLookupHandleResultFunction HandleResult, void *Ctx) { + assert(ES && "ES cannot be null"); + assert(SearchOrder && "SearchOrder cannot be null"); + assert(Symbols && "Symbols cannot be null"); + assert(HandleResult && "HandleResult cannot be null"); + + JITDylibSearchOrder SO; + for (size_t I = 0; I != SearchOrderSize; ++I) + SO.push_back({unwrap(SearchOrder[I].JD), + toJITDylibLookupFlags(SearchOrder[I].JDLookupFlags)}); + + SymbolLookupSet SLS; + for (size_t I = 0; I != SymbolsSize; ++I) + SLS.add(OrcV2CAPIHelper::moveToSymbolStringPtr(unwrap(Symbols[I].Name)), + toSymbolLookupFlags(Symbols[I].LookupFlags)); + + unwrap(ES)->lookup( + toLookupKind(K), SO, std::move(SLS), SymbolState::Ready, + [HandleResult, Ctx](Expected Result) { + if (Result) { + SmallVector CResult; + for (auto &KV : *Result) + CResult.push_back(LLVMOrcCSymbolMapPair{ + wrap(OrcV2CAPIHelper::getRawPoolEntryPtr(KV.first)), + fromJITEvaluatedSymbol(KV.second)}); + HandleResult(LLVMErrorSuccess, CResult.data(), CResult.size(), Ctx); + } else + HandleResult(wrap(Result.takeError()), nullptr, 0, Ctx); + }, + NoDependenciesToRegister); +} + void LLVMOrcRetainSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S) { OrcV2CAPIHelper::retainPoolEntry(unwrap(S)); } @@ -589,11 +675,19 @@ void LLVMOrcJITDylibAddGenerator(LLVMOrcJITDylibRef JD, } LLVMOrcDefinitionGeneratorRef LLVMOrcCreateCustomCAPIDefinitionGenerator( - LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction F, void *Ctx) { - auto DG = std::make_unique(Ctx, F); + LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction F, void *Ctx, + LLVMOrcDisposeCAPIDefinitionGeneratorFunction Dispose) { + auto DG = std::make_unique(Dispose, Ctx, F); return wrap(DG.release()); } +void LLVMOrcLookupStateContinueLookup(LLVMOrcLookupStateRef S, + LLVMErrorRef Err) { + LookupState LS; + OrcV2CAPIHelper::resetLookupState(LS, ::unwrap(S)); + LS.continueLookup(unwrap(Err)); +} + LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess( LLVMOrcDefinitionGeneratorRef *Result, char GlobalPrefix, LLVMOrcSymbolPredicate Filter, void *FilterCtx) { @@ -951,7 +1045,7 @@ LLVMErrorRef LLVMOrcLLJITLookup(LLVMOrcLLJITRef J, return wrap(Sym.takeError()); } - *Result = 
Sym->getAddress(); + *Result = Sym->getValue(); return LLVMErrorSuccess; } diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp index 64fc717b7b56..2bb204e688fc 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp @@ -43,8 +43,8 @@ const char *DispatchFnName = "__llvm_orc_SimpleRemoteEPC_dispatch_fn"; } // end namespace SimpleRemoteEPCDefaultBootstrapSymbolNames -SimpleRemoteEPCTransportClient::~SimpleRemoteEPCTransportClient() {} -SimpleRemoteEPCTransport::~SimpleRemoteEPCTransport() {} +SimpleRemoteEPCTransportClient::~SimpleRemoteEPCTransportClient() = default; +SimpleRemoteEPCTransport::~SimpleRemoteEPCTransport() = default; Expected> FDSimpleRemoteEPCTransport::Create(SimpleRemoteEPCTransportClient &C, int InFD, diff --git a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp index 0b4755fe23cf..b52d01318c0d 100644 --- a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp @@ -85,7 +85,7 @@ void IRSpeculationLayer::emit(std::unique_ptr R, auto IRNames = QueryAnalysis(Fn); // Instrument and register if Query has result - if (IRNames.hasValue()) { + if (IRNames) { // Emit globals for each function. auto LoadValueTy = Type::getInt8Ty(MContext); @@ -126,7 +126,7 @@ void IRSpeculationLayer::emit(std::unique_ptr R, assert(Mutator.GetInsertBlock()->getParent() == &Fn && "IR builder association mismatch?"); - S.registerSymbols(internToJITSymbols(IRNames.getValue()), + S.registerSymbols(internToJITSymbols(*IRNames), &R->getTargetJITDylib()); } } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp index b6b21bde1182..8ab0af3eab6e 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp @@ -22,9 +22,9 @@ using namespace llvm::orc::shared; namespace llvm { namespace orc { -ExecutorBootstrapService::~ExecutorBootstrapService() {} +ExecutorBootstrapService::~ExecutorBootstrapService() = default; -SimpleRemoteEPCServer::Dispatcher::~Dispatcher() {} +SimpleRemoteEPCServer::Dispatcher::~Dispatcher() = default; #if LLVM_ENABLE_THREADS void SimpleRemoteEPCServer::ThreadDispatcher::dispatch( diff --git a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp index 111c84ec87ed..11a99986f2ee 100644 --- a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp @@ -16,7 +16,7 @@ char GenericNamedTask::ID = 0; const char *GenericNamedTask::DefaultDescription = "Generic Task"; void Task::anchor() {} -TaskDispatcher::~TaskDispatcher() {} +TaskDispatcher::~TaskDispatcher() = default; void InPlaceTaskDispatcher::dispatch(std::unique_ptr T) { T->run(); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index 9c8d402364c6..bc42eebf3fec 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -29,7 +29,7 @@ namespace llvm { -RTDyldMemoryManager::~RTDyldMemoryManager() {} +RTDyldMemoryManager::~RTDyldMemoryManager() = default; #if defined(HAVE_REGISTER_FRAME) && defined(HAVE_DEREGISTER_FRAME) && \ !defined(__SEH__) && 
!defined(__USING_SJLJ_EXCEPTIONS__) @@ -95,18 +95,16 @@ void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, // and projects/libunwind/src/UnwindLevel1-gcc-ext.c. const char *P = (const char *)Addr; const char *End = P + Size; - do { + while (P != End) P = processFDE(P, false); - } while(P != End); } void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, size_t Size) { const char *P = (const char *)Addr; const char *End = P + Size; - do { + while (P != End) P = processFDE(P, true); - } while(P != End); } #else diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 3f38d26869d4..2e0cba849165 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -66,7 +66,7 @@ std::error_code RuntimeDyldError::convertToErrorCode() const { } // Empty out-of-line virtual destructor as the key function. -RuntimeDyldImpl::~RuntimeDyldImpl() {} +RuntimeDyldImpl::~RuntimeDyldImpl() = default; // Pin LoadedObjectInfo's vtables to this file. void RuntimeDyld::LoadedObjectInfo::anchor() {} @@ -1311,7 +1311,7 @@ RuntimeDyld::RuntimeDyld(RuntimeDyld::MemoryManager &MemMgr, ProcessAllSections = false; } -RuntimeDyld::~RuntimeDyld() {} +RuntimeDyld::~RuntimeDyld() = default; static std::unique_ptr createRuntimeDyldCOFF( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index 33db23408cf2..ae1bb5a1da4b 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MSVCErrorWorkarounds.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include #include @@ -892,7 +893,7 @@ RuntimeDyldChecker::RuntimeDyldChecker( std::move(GetGOTInfo), Endianness, Disassembler, InstPrinter, ErrStream)) {} -RuntimeDyldChecker::~RuntimeDyldChecker() {} +RuntimeDyldChecker::~RuntimeDyldChecker() = default; bool RuntimeDyldChecker::check(StringRef CheckExpr) const { return Impl->check(CheckExpr); diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index f92618afdff6..da1102fc9f07 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -216,7 +216,7 @@ namespace llvm { RuntimeDyldELF::RuntimeDyldELF(RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver) : RuntimeDyldImpl(MemMgr, Resolver), GOTSectionID(0), CurrentGOTIndex(0) {} -RuntimeDyldELF::~RuntimeDyldELF() {} +RuntimeDyldELF::~RuntimeDyldELF() = default; void RuntimeDyldELF::registerEHFrames() { for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) { @@ -446,6 +446,13 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, write(isBE, TargetPtr, static_cast(Result)); break; } + case ELF::R_AARCH64_PREL16: { + uint64_t Result = Value + Addend - FinalAddress; + assert(static_cast(Result) >= INT16_MIN && + static_cast(Result) <= UINT16_MAX); + write(isBE, TargetPtr, static_cast(Result & 0xffffU)); + break; + } case ELF::R_AARCH64_PREL32: { uint64_t Result = Value + Addend - FinalAddress; assert(static_cast(Result) >= INT32_MIN && diff --git a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp 
index 56b232b9dbcd..b23e33039c35 100644 --- a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp @@ -238,7 +238,7 @@ SectionMemoryManager::~SectionMemoryManager() { } } -SectionMemoryManager::MemoryMapper::~MemoryMapper() {} +SectionMemoryManager::MemoryMapper::~MemoryMapper() = default; void SectionMemoryManager::anchor() {} diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index 6186af444e73..bf13b6c325ec 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1651,6 +1651,8 @@ std::string Check::FileCheckType::getDescription(StringRef Prefix) const { switch (Kind) { case Check::CheckNone: return "invalid"; + case Check::CheckMisspelled: + return "misspelled"; case Check::CheckPlain: if (Count > 1) return WithModifiers("-COUNT"); @@ -1680,7 +1682,8 @@ std::string Check::FileCheckType::getDescription(StringRef Prefix) const { } static std::pair<Check::FileCheckType, StringRef> -FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) { +FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix, + bool &Misspelled) { if (Buffer.size() <= Prefix.size()) return {Check::CheckNone, StringRef()}; @@ -1722,7 +1725,9 @@ FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) { if (Rest.front() == '{') return ConsumeModifiers(Check::CheckPlain); - if (!Rest.consume_front("-")) + if (Rest.consume_front("_")) + Misspelled = true; + else if (!Rest.consume_front("-")) return {Check::CheckNone, StringRef()}; if (Rest.consume_front("COUNT-")) { @@ -1766,6 +1771,15 @@ FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) { return {Check::CheckNone, Rest}; } +static std::pair<Check::FileCheckType, StringRef> +FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) { + bool Misspelled = false; + auto Res = FindCheckType(Req, Buffer, Prefix, Misspelled); + if (Res.first != Check::CheckNone && Misspelled) + return {Check::CheckMisspelled, Res.second}; + return Res; +} + // From the given position, find the next character after the word. static size_t SkipWord(StringRef Str, size_t Loc) { while (Loc < Str.size() && IsPartOfWord(Str[Loc])) @@ -1939,6 +1953,16 @@ bool FileCheck::readCheckFile( Buffer = AfterSuffix.empty() ? Buffer.drop_front(UsedPrefix.size()) : AfterSuffix; + // Complain about misspelled directives. + if (CheckTy == Check::CheckMisspelled) { + StringRef UsedDirective(UsedPrefix.data(), + AfterSuffix.data() - UsedPrefix.data()); + SM.PrintMessage(SMLoc::getFromPointer(UsedDirective.data()), + SourceMgr::DK_Error, + "misspelled directive '" + UsedDirective + "'"); + return true; + } + // Complain about useful-looking but unsupported suffixes.
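The misspelling support is split in two: the inner FindCheckType records whether the prefix was followed by '_' rather than '-', and the new wrapper turns any otherwise-valid directive into CheckMisspelled so readCheckFile can reject, say, CHECK_NEXT: with a real error instead of silently ignoring it. A reduced sketch of the separator handling, with std::string_view standing in for StringRef:

#include <string_view>

// Consume the prefix/directive separator. '-' is valid; '_' is accepted
// but flagged so "CHECK_NEXT:" becomes an error instead of a no-op line.
bool consumeSeparator(std::string_view &Rest, bool &Misspelled) {
  if (Rest.empty())
    return false;
  if (Rest.front() == '_')
    Misspelled = true;
  else if (Rest.front() != '-')
    return false; // not a directive at all
  Rest.remove_prefix(1);
  return true;
}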
if (CheckTy == Check::CheckBadNot) { SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Error, diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 11d8da097c6c..6e8856f481af 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Frontend/OpenMP/OMPContext.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" @@ -214,7 +213,7 @@ static int isVariantApplicableInContextHelper( }); Optional Result = HandleTrait(Property, IsActiveTrait); - if (Result.hasValue()) + if (Result) return Result.getValue(); } @@ -235,7 +234,7 @@ static int isVariantApplicableInContextHelper( ConstructMatches->push_back(ConstructIdx - 1); Optional Result = HandleTrait(Property, FoundInOrder); - if (Result.hasValue()) + if (Result) return Result.getValue(); if (!FoundInOrder) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 99001269e1f8..9b08a24e14d4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -15,15 +15,15 @@ #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" @@ -31,17 +31,14 @@ #include "llvm/IR/Value.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Error.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/Transforms/Utils/LoopPeel.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include -#include #define DEBUG_TYPE "openmp-ir-builder" @@ -72,8 +69,263 @@ static bool isConflictIP(IRBuilder<>::InsertPoint IP1, return false; return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint(); } + +static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) { + // Valid ordered/unordered and base algorithm combinations. 
+ switch (SchedType & ~OMPScheduleType::MonotonicityMask) { + case OMPScheduleType::UnorderedStaticChunked: + case OMPScheduleType::UnorderedStatic: + case OMPScheduleType::UnorderedDynamicChunked: + case OMPScheduleType::UnorderedGuidedChunked: + case OMPScheduleType::UnorderedRuntime: + case OMPScheduleType::UnorderedAuto: + case OMPScheduleType::UnorderedTrapezoidal: + case OMPScheduleType::UnorderedGreedy: + case OMPScheduleType::UnorderedBalanced: + case OMPScheduleType::UnorderedGuidedIterativeChunked: + case OMPScheduleType::UnorderedGuidedAnalyticalChunked: + case OMPScheduleType::UnorderedSteal: + case OMPScheduleType::UnorderedStaticBalancedChunked: + case OMPScheduleType::UnorderedGuidedSimd: + case OMPScheduleType::UnorderedRuntimeSimd: + case OMPScheduleType::OrderedStaticChunked: + case OMPScheduleType::OrderedStatic: + case OMPScheduleType::OrderedDynamicChunked: + case OMPScheduleType::OrderedGuidedChunked: + case OMPScheduleType::OrderedRuntime: + case OMPScheduleType::OrderedAuto: + case OMPScheduleType::OrderdTrapezoidal: + case OMPScheduleType::NomergeUnorderedStaticChunked: + case OMPScheduleType::NomergeUnorderedStatic: + case OMPScheduleType::NomergeUnorderedDynamicChunked: + case OMPScheduleType::NomergeUnorderedGuidedChunked: + case OMPScheduleType::NomergeUnorderedRuntime: + case OMPScheduleType::NomergeUnorderedAuto: + case OMPScheduleType::NomergeUnorderedTrapezoidal: + case OMPScheduleType::NomergeUnorderedGreedy: + case OMPScheduleType::NomergeUnorderedBalanced: + case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked: + case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked: + case OMPScheduleType::NomergeUnorderedSteal: + case OMPScheduleType::NomergeOrderedStaticChunked: + case OMPScheduleType::NomergeOrderedStatic: + case OMPScheduleType::NomergeOrderedDynamicChunked: + case OMPScheduleType::NomergeOrderedGuidedChunked: + case OMPScheduleType::NomergeOrderedRuntime: + case OMPScheduleType::NomergeOrderedAuto: + case OMPScheduleType::NomergeOrderedTrapezoidal: + break; + default: + return false; + } + + // Must not set both monotonicity modifiers at the same time. + OMPScheduleType MonotonicityFlags = + SchedType & OMPScheduleType::MonotonicityMask; + if (MonotonicityFlags == OMPScheduleType::MonotonicityMask) + return false; + + return true; +} #endif +/// Determine which scheduling algorithm to use, based on the schedule clause +/// arguments. +static OMPScheduleType +getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, + bool HasSimdModifier) { + // Currently, the default schedule is static. + switch (ClauseKind) { + case OMP_SCHEDULE_Default: + case OMP_SCHEDULE_Static: + return HasChunks ? OMPScheduleType::BaseStaticChunked + : OMPScheduleType::BaseStatic; + case OMP_SCHEDULE_Dynamic: + return OMPScheduleType::BaseDynamicChunked; + case OMP_SCHEDULE_Guided: + return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd + : OMPScheduleType::BaseGuidedChunked; + case OMP_SCHEDULE_Auto: + return llvm::omp::OMPScheduleType::BaseAuto; + case OMP_SCHEDULE_Runtime: + return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd + : OMPScheduleType::BaseRuntime; + } + llvm_unreachable("unhandled schedule clause argument"); +} + +/// Adds ordering modifier flags to schedule type.
+static OMPScheduleType +getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, + bool HasOrderedClause) { + assert((BaseScheduleType & OMPScheduleType::ModifierMask) == + OMPScheduleType::None && + "Must not have ordering nor monotonicity flags already set"); + + OMPScheduleType OrderingModifier = HasOrderedClause + ? OMPScheduleType::ModifierOrdered + : OMPScheduleType::ModifierUnordered; + OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier; + + // Unsupported combinations + if (OrderingScheduleType == + (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered)) + return OMPScheduleType::OrderedGuidedChunked; + else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd | + OMPScheduleType::ModifierOrdered)) + return OMPScheduleType::OrderedRuntime; + + return OrderingScheduleType; +} + +/// Adds monotonicity modifier flags to schedule type. +static OMPScheduleType +getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, + bool HasSimdModifier, bool HasMonotonic, + bool HasNonmonotonic, bool HasOrderedClause) { + assert((ScheduleType & OMPScheduleType::MonotonicityMask) == + OMPScheduleType::None && + "Must not have monotonicity flags already set"); + assert((!HasMonotonic || !HasNonmonotonic) && + "Monotonic and Nonmonotonic are contradicting each other"); + + if (HasMonotonic) { + return ScheduleType | OMPScheduleType::ModifierMonotonic; + } else if (HasNonmonotonic) { + return ScheduleType | OMPScheduleType::ModifierNonmonotonic; + } else { + // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description. + // If the static schedule kind is specified or if the ordered clause is + // specified, and if the nonmonotonic modifier is not specified, the + // effect is as if the monotonic modifier is specified. Otherwise, unless + // the monotonic modifier is specified, the effect is as if the + // nonmonotonic modifier is specified. + OMPScheduleType BaseScheduleType = + ScheduleType & ~OMPScheduleType::ModifierMask; + if ((BaseScheduleType == OMPScheduleType::BaseStatic) || + (BaseScheduleType == OMPScheduleType::BaseStaticChunked) || + HasOrderedClause) { + // The monotonic is used by default in openmp runtime library, so no need + // to set it. + return ScheduleType; + } else { + return ScheduleType | OMPScheduleType::ModifierNonmonotonic; + } + } +} + +/// Determine the schedule type using schedule and ordering clause arguments. +static OMPScheduleType +computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, + bool HasSimdModifier, bool HasMonotonicModifier, + bool HasNonmonotonicModifier, bool HasOrderedClause) { + OMPScheduleType BaseSchedule = + getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier); + OMPScheduleType OrderedSchedule = + getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause); + OMPScheduleType Result = getOpenMPMonotonicityScheduleType( + OrderedSchedule, HasSimdModifier, HasMonotonicModifier, + HasNonmonotonicModifier, HasOrderedClause); + + assert(isValidWorkshareLoopScheduleType(Result)); + return Result; +} + +/// Make \p Source branch to \p Target. +/// +/// Handles two situations: +/// * \p Source already has an unconditional branch. +/// * \p Source is a degenerate block (no terminator because the BB is +/// the current head of the IR construction). 
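computeOpenMPScheduleType layers three independent decisions: a base algorithm from the schedule clause, the ordered/unordered modifier, and the OpenMP 5.1 monotonicity defaults (static or ordered schedules count as monotonic, which the runtime assumes without a flag; everything else gets the nonmonotonic modifier). A self-contained model of that layering; the flag values here are invented for illustration and do not match the runtime's actual encoding:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Toy flag encoding; the real OMPScheduleType values follow the OpenMP
// runtime ABI and are different.
enum Sched : uint32_t {
  BaseStatic = 1,
  BaseDynamicChunked = 2,
  ModifierUnordered = 1u << 5,
  ModifierOrdered = 1u << 6,
  ModifierMonotonic = 1u << 7,
  ModifierNonmonotonic = 1u << 8,
  MonotonicityMask = ModifierMonotonic | ModifierNonmonotonic,
};

uint32_t computeSchedule(bool IsStatic, bool HasOrderedClause) {
  uint32_t S = IsStatic ? BaseStatic : BaseDynamicChunked;
  S |= HasOrderedClause ? ModifierOrdered : ModifierUnordered;
  // OpenMP 5.1, 2.11.4: static or ordered implies monotonic, which the
  // runtime assumes implicitly; everything else gets nonmonotonic.
  if (!IsStatic && !HasOrderedClause)
    S |= ModifierNonmonotonic;
  assert((S & MonotonicityMask) != MonotonicityMask && "contradictory flags");
  return S;
}

int main() {
  std::printf("schedule(dynamic): %#x\n", computeSchedule(false, false));
  std::printf("schedule(static) ordered: %#x\n", computeSchedule(true, true));
}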
+static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) { + if (Instruction *Term = Source->getTerminator()) { + auto *Br = cast(Term); + assert(!Br->isConditional() && + "BB's terminator must be an unconditional branch (or degenerate)"); + BasicBlock *Succ = Br->getSuccessor(0); + Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true); + Br->setSuccessor(0, Target); + return; + } + + auto *NewBr = BranchInst::Create(Target, Source); + NewBr->setDebugLoc(DL); +} + +void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, + bool CreateBranch) { + assert(New->getFirstInsertionPt() == New->begin() && + "Target BB must not have PHI nodes"); + + // Move instructions to new block. + BasicBlock *Old = IP.getBlock(); + New->getInstList().splice(New->begin(), Old->getInstList(), IP.getPoint(), + Old->end()); + + if (CreateBranch) + BranchInst::Create(New, Old); +} + +void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) { + DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); + BasicBlock *Old = Builder.GetInsertBlock(); + + spliceBB(Builder.saveIP(), New, CreateBranch); + if (CreateBranch) + Builder.SetInsertPoint(Old->getTerminator()); + else + Builder.SetInsertPoint(Old); + + // SetInsertPoint also updates the Builder's debug location, but we want to + // keep the one the Builder was configured to use. + Builder.SetCurrentDebugLocation(DebugLoc); +} + +BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, + llvm::Twine Name) { + BasicBlock *Old = IP.getBlock(); + BasicBlock *New = BasicBlock::Create( + Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name, + Old->getParent(), Old->getNextNode()); + spliceBB(IP, New, CreateBranch); + New->replaceSuccessorsPhiUsesWith(Old, New); + return New; +} + +BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch, + llvm::Twine Name) { + DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); + BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); + if (CreateBranch) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + else + Builder.SetInsertPoint(Builder.GetInsertBlock()); + // SetInsertPoint also updates the Builder's debug location, but we want to + // keep the one the Builder was configured to use. + Builder.SetCurrentDebugLocation(DebugLoc); + return New; +} + +BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch, + llvm::Twine Name) { + DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); + BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); + if (CreateBranch) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + else + Builder.SetInsertPoint(Builder.GetInsertBlock()); + // SetInsertPoint also updates the Builder's debug location, but we want to + // keep the one the Builder was configured to use. 
+ Builder.SetCurrentDebugLocation(DebugLoc); + return New; +} + +BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, + llvm::Twine Suffix) { + BasicBlock *Old = Builder.GetInsertBlock(); + return splitBB(Builder, CreateBranch, Old->getName() + Suffix); +} + void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { LLVMContext &Ctx = Fn.getContext(); @@ -199,6 +451,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) { /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, + /* AllocaBlock*/ OI.OuterAllocaBB, /* Suffix */ ".omp_par"); LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n"); @@ -500,6 +753,44 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, return Builder.saveIP(); } +void OpenMPIRBuilder::emitOffloadingEntry(Constant *Addr, StringRef Name, + uint64_t Size, int32_t Flags, + StringRef SectionName) { + Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + Type *SizeTy = M.getDataLayout().getIntPtrType(M.getContext()); + + Constant *AddrName = ConstantDataArray::getString(M.getContext(), Name); + + // Create the constant string used to look up the symbol in the device. + auto *Str = + new llvm::GlobalVariable(M, AddrName->getType(), /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, AddrName, + ".omp_offloading.entry_name"); + Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + + // Construct the offloading entry. + Constant *EntryData[] = { + ConstantExpr::getPointerBitCastOrAddrSpaceCast(Addr, Int8PtrTy), + ConstantExpr::getPointerBitCastOrAddrSpaceCast(Str, Int8PtrTy), + ConstantInt::get(SizeTy, Size), + ConstantInt::get(Int32Ty, Flags), + ConstantInt::get(Int32Ty, 0), + }; + Constant *EntryInitializer = + ConstantStruct::get(OpenMPIRBuilder::OffloadEntry, EntryData); + + auto *Entry = new GlobalVariable( + M, OpenMPIRBuilder::OffloadEntry, + /* isConstant = */ true, GlobalValue::WeakAnyLinkage, EntryInitializer, + ".omp_offloading.entry." + Name, nullptr, GlobalValue::NotThreadLocal, + M.getDataLayout().getDefaultGlobalsAddressSpace()); + + // The entry has to be created in the section the linker expects it to be. + Entry->setSection(SectionName); + Entry->setAlignment(Align(1)); +} + void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB) { @@ -670,7 +961,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( // Let the caller create the body. assert(BodyGenCB && "Expected body generation callback!"); InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); - BodyGenCB(InnerAllocaIP, CodeGenIP, *PRegPreFiniBB); + BodyGenCB(InnerAllocaIP, CodeGenIP); LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); @@ -777,6 +1068,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); FiniCB(PreFiniIP); + OI.OuterAllocaBB = OuterAllocaBlock; OI.EntryBB = PRegEntryBB; OI.ExitBB = PRegExitBB; @@ -800,6 +1092,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, + /* AllocationBlock */ OuterAllocaBlock, /* Suffix */ ".omp_par"); // Find inputs to, outputs from the code region. 
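emitOffloadingEntry materializes one record of the offload entry table: the entry's address, a pointer to the generated name string, a size, a flags word, and a reserved field, emitted with alignment 1 into a named section. Since every translation unit appends to that same section, the linker concatenates the records into one contiguous array the runtime can walk. An illustrative mirror of the record; the field names are mine and the authoritative layout belongs to the offloading runtime:

#include <cstdint>

struct OffloadEntry {
  void *Addr;       // address of the offloaded function or global
  char *Name;       // name used to look the symbol up on the device
  uint64_t Size;    // size in bytes; 0 for functions
  int32_t Flags;    // entry kind and properties
  int32_t Reserved; // currently always 0
};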
@@ -960,10 +1253,185 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) { emitTaskyieldImpl(Loc); } +OpenMPIRBuilder::InsertPointTy +OpenMPIRBuilder::createTask(const LocationDescription &Loc, + InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, + bool Tied, Value *Final) { + if (!updateToLocation(Loc)) + return InsertPointTy(); + + // The current basic block is split into four basic blocks. After outlining, + // they will be mapped as follows: + // ``` + // def current_fn() { + // current_basic_block: + // br label %task.exit + // task.exit: + // ; instructions after task + // } + // def outlined_fn() { + // task.alloca: + // br label %task.body + // task.body: + // ret void + // } + // ``` + BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit"); + BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body"); + BasicBlock *TaskAllocaBB = + splitBB(Builder, /*CreateBranch=*/true, "task.alloca"); + + OutlineInfo OI; + OI.EntryBB = TaskAllocaBB; + OI.OuterAllocaBB = AllocaIP.getBlock(); + OI.ExitBB = TaskExitBB; + OI.PostOutlineCB = [this, &Loc, Tied, Final](Function &OutlinedFn) { + // The input IR here looks like the following- + // ``` + // func @current_fn() { + // outlined_fn(%args) + // } + // func @outlined_fn(%args) { ... } + // ``` + // + // This is changed to the following- + // + // ``` + // func @current_fn() { + // runtime_call(..., wrapper_fn, ...) + // } + // func @wrapper_fn(..., %args) { + // outlined_fn(%args) + // } + // func @outlined_fn(%args) { ... } + // ``` + + // The stale call instruction will be replaced with a new call instruction + // for runtime call with a wrapper function. + assert(OutlinedFn.getNumUses() == 1 && + "there must be a single user for the outlined function"); + CallInst *StaleCI = cast(OutlinedFn.user_back()); + + // HasTaskData is true if any variables are captured in the outlined region, + // false otherwise. + bool HasTaskData = StaleCI->arg_size() > 0; + Builder.SetInsertPoint(StaleCI); + + // Gather the arguments for emitting the runtime call for + // @__kmpc_omp_task_alloc + Function *TaskAllocFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc); + + // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) + // call. + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = getOrCreateThreadID(Ident); + + // Argument - `flags` + // Task is tied iff (Flags & 1) == 1. + // Task is untied iff (Flags & 1) == 0. + // Task is final iff (Flags & 2) == 2. + // Task is not final iff (Flags & 2) == 0. + // TODO: Handle the other flags. + Value *Flags = Builder.getInt32(Tied); + if (Final) { + Value *FinalFlag = + Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0)); + Flags = Builder.CreateOr(FinalFlag, Flags); + } + + // Argument - `sizeof_kmp_task_t` (TaskSize) + // Tasksize refers to the size in bytes of kmp_task_t data structure + // including private vars accessed in task. 
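The flags argument of __kmpc_omp_task_alloc packs task properties into bits, bit 0 for tied and bit 1 for final; because the final clause can be a runtime expression, the builder emits a select and an or instead of folding a constant. The scalar equivalent of that encoding:

#include <cstdint>

// Bit 0: tied, bit 1: final (other flag bits are still TODO above).
uint32_t encodeTaskFlags(bool Tied, bool Final) {
  return (Tied ? 1u : 0u) | (Final ? 2u : 0u);
}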
+ Value *TaskSize = Builder.getInt64(0); + if (HasTaskData) { + AllocaInst *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(0)); + assert(ArgStructAlloca && + "Unable to find the alloca instruction corresponding to arguments " + "for extracted function"); + StructType *ArgStructType = + dyn_cast(ArgStructAlloca->getAllocatedType()); + assert(ArgStructType && "Unable to find struct type corresponding to " + "arguments for extracted function"); + TaskSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); + } + + // TODO: Argument - sizeof_shareds + + // Argument - task_entry (the wrapper function) + // If the outlined function has some captured variables (i.e. HasTaskData is + // true), then the wrapper function will have an additional argument (the + // struct containing captured variables). Otherwise, no such argument will + // be present. + SmallVector WrapperArgTys{Builder.getInt32Ty()}; + if (HasTaskData) + WrapperArgTys.push_back(OutlinedFn.getArg(0)->getType()); + FunctionCallee WrapperFuncVal = M.getOrInsertFunction( + (Twine(OutlinedFn.getName()) + ".wrapper").str(), + FunctionType::get(Builder.getInt32Ty(), WrapperArgTys, false)); + Function *WrapperFunc = dyn_cast(WrapperFuncVal.getCallee()); + PointerType *WrapperFuncBitcastType = + FunctionType::get(Builder.getInt32Ty(), + {Builder.getInt32Ty(), Builder.getInt8PtrTy()}, false) + ->getPointerTo(); + Value *WrapperFuncBitcast = + ConstantExpr::getBitCast(WrapperFunc, WrapperFuncBitcastType); + + // Emit the @__kmpc_omp_task_alloc runtime call + // The runtime call returns a pointer to an area where the task captured + // variables must be copied before the task is run (NewTaskData) + CallInst *NewTaskData = Builder.CreateCall( + TaskAllocFn, + {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, + /*sizeof_task=*/TaskSize, /*sizeof_shared=*/Builder.getInt64(0), + /*task_func=*/WrapperFuncBitcast}); + + // Copy the arguments for outlined function + if (HasTaskData) { + Value *TaskData = StaleCI->getArgOperand(0); + Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); + Builder.CreateMemCpy(NewTaskData, Alignment, TaskData, Alignment, + TaskSize); + } + + // Emit the @__kmpc_omp_task runtime call to spawn the task + Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); + Builder.CreateCall(TaskFn, {Ident, ThreadID, NewTaskData}); + + StaleCI->eraseFromParent(); + + // Emit the body for wrapper function + BasicBlock *WrapperEntryBB = + BasicBlock::Create(M.getContext(), "", WrapperFunc); + Builder.SetInsertPoint(WrapperEntryBB); + if (HasTaskData) + Builder.CreateCall(&OutlinedFn, {WrapperFunc->getArg(1)}); + else + Builder.CreateCall(&OutlinedFn); + Builder.CreateRet(Builder.getInt32(0)); + }; + + addOutlineInfo(std::move(OI)); + + InsertPointTy TaskAllocaIP = + InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin()); + InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin()); + BodyGenCB(TaskAllocaIP, TaskBodyIP); + Builder.SetInsertPoint(TaskExitBB); + + return Builder.saveIP(); +} + OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) { + assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required"); + if (!updateToLocation(Loc)) return Loc.IP; @@ -1006,26 +1474,25 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( // section_loop.after: // ; auto 
LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) { - auto *CurFn = CodeGenIP.getBlock()->getParent(); - auto *ForIncBB = CodeGenIP.getBlock()->getSingleSuccessor(); - auto *ForExitBB = CodeGenIP.getBlock() - ->getSinglePredecessor() - ->getTerminator() - ->getSuccessor(1); - SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, ForIncBB); Builder.restoreIP(CodeGenIP); + BasicBlock *Continue = + splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after"); + Function *CurFn = Continue->getParent(); + SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue); + unsigned CaseNumber = 0; for (auto SectionCB : SectionCBs) { - auto *CaseBB = BasicBlock::Create(M.getContext(), - "omp_section_loop.body.case", CurFn); + BasicBlock *CaseBB = BasicBlock::Create( + M.getContext(), "omp_section_loop.body.case", CurFn, Continue); SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB); Builder.SetInsertPoint(CaseBB); - SectionCB(InsertPointTy(), Builder.saveIP(), *ForExitBB); + BranchInst *CaseEndBr = Builder.CreateBr(Continue); + SectionCB(InsertPointTy(), + {CaseEndBr->getParent(), CaseEndBr->getIterator()}); CaseNumber++; } // remove the existing terminator from body BB since there can be no // terminators after switch/case - CodeGenIP.getBlock()->getTerminator()->eraseFromParent(); }; // Loop body ends here // LowerBound, UpperBound, and Stride for createCanonicalLoop @@ -1035,29 +1502,22 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( Value *ST = ConstantInt::get(I32Ty, 1); llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop( Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop"); - Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator()); - AllocaIP = Builder.saveIP(); InsertPointTy AfterIP = applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait); - BasicBlock *LoopAfterBB = AfterIP.getBlock(); - Instruction *SplitPos = LoopAfterBB->getTerminator(); - if (!isa_and_nonnull<BranchInst>(SplitPos)) - SplitPos = new UnreachableInst(Builder.getContext(), LoopAfterBB); - // ExitBB after LoopAfterBB because LoopAfterBB is used for FinalizationCB, - // which requires a BB with branch - BasicBlock *ExitBB = - LoopAfterBB->splitBasicBlock(SplitPos, "omp_sections.end"); - SplitPos->eraseFromParent(); // Apply the finalization callback in LoopAfterBB auto FiniInfo = FinalizationStack.pop_back_val(); assert(FiniInfo.DK == OMPD_sections && "Unexpected finalization stack state!"); - Builder.SetInsertPoint(LoopAfterBB->getTerminator()); - FiniInfo.FiniCB(Builder.saveIP()); - Builder.SetInsertPoint(ExitBB); + if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) { + Builder.restoreIP(AfterIP); + BasicBlock *FiniBB = + splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini"); + CB(Builder.saveIP()); + AfterIP = {FiniBB, FiniBB->begin()}; + } - return Builder.saveIP(); + return AfterIP; } OpenMPIRBuilder::InsertPointTy @@ -1402,10 +1862,8 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, // Split the loop at the insertion point: Branch to the preheader and move // every following instruction to after the loop (the After BB). Also, the // new successor is the loop's after block. + spliceBB(Builder, After, /*CreateBranch=*/false); Builder.CreateBr(CL->getPreheader()); - After->getInstList().splice(After->begin(), BB->getInstList(), - Builder.GetInsertPoint(), BB->end()); - After->replaceSuccessorsPhiUsesWith(BB, After); } // Emit the body content.
We do it after connecting the loop to the CFG to @@ -1506,20 +1964,10 @@ static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, llvm_unreachable("unknown OpenMP loop iterator bitwidth"); } -// Sets the number of loop iterations to the given value. This value must be -// valid in the condition block (i.e., defined in the preheader) and is -// interpreted as an unsigned integer. -void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI, Value *TripCount) { - Instruction *CmpI = &CLI->getCond()->front(); - assert(isa(CmpI) && "First inst must compare IV with TripCount"); - CmpI->setOperand(1, TripCount); - CLI->assertOK(); -} - OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, - bool NeedsBarrier, Value *Chunk) { + bool NeedsBarrier) { assert(CLI->isValid() && "Requires a valid canonical loop"); assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && "Require dedicated allocate IP"); @@ -1559,38 +2007,31 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, Builder.CreateStore(UpperBound, PUpperBound); Builder.CreateStore(One, PStride); - // FIXME: schedule(static) is NOT the same as schedule(static,1) - if (!Chunk) - Chunk = One; - Value *ThreadNum = getOrCreateThreadID(SrcLoc); - Constant *SchedulingType = - ConstantInt::get(I32Type, static_cast(OMPScheduleType::Static)); + Constant *SchedulingType = ConstantInt::get( + I32Type, static_cast(OMPScheduleType::UnorderedStatic)); // Call the "init" function and update the trip count of the loop with the // value it produced. Builder.CreateCall(StaticInit, {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, - PUpperBound, PStride, One, Chunk}); + PUpperBound, PStride, One, Zero}); Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound); Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound); Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound); Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One); - setCanonicalLoopTripCount(CLI, TripCount); + CLI->setTripCount(TripCount); // Update all uses of the induction variable except the one in the condition // block that compares it with the actual upper bound, and the increment in // the latch block. - // TODO: this can eventually move to CanonicalLoopInfo or to a new - // CanonicalLoopInfoUpdater interface. - Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt()); - Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound); - IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) { - auto *Instr = dyn_cast(U.getUser()); - return !Instr || - (Instr->getParent() != CLI->getCond() && - Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV); + + CLI->mapIndVar([&](Instruction *OldIV) -> Value * { + Builder.SetInsertPoint(CLI->getBody(), + CLI->getBody()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(DL); + return Builder.CreateAdd(OldIV, LowerBound); }); // In the "exit" block, call the "fini" function. @@ -1610,11 +2051,198 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, return AfterIP; } -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, - InsertPointTy AllocaIP, bool NeedsBarrier) { - // Currently only supports static schedules. 
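applyStaticWorkshareLoop reshapes the canonical loop so each thread runs only its slice: the __kmpc_for_static_init_* call rewrites the stored bounds to the thread's [LB, UB], the trip count becomes UB - LB + 1, and mapIndVar rebases every use of the induction variable by LB. A scalar model of the per-thread effect, with illustrative names:

#include <cstdint>

// What one thread executes after the runtime's "init" call has narrowed
// [LB, InclusiveUB] to that thread's slice of the iteration space.
void runThreadSlice(uint64_t LB, uint64_t InclusiveUB,
                    void (*Body)(uint64_t)) {
  uint64_t TripCount = InclusiveUB - LB + 1;
  for (uint64_t IV = 0; IV < TripCount; ++IV)
    Body(IV + LB); // the mapIndVar rebase
}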
- return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier); +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop( + DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, + bool NeedsBarrier, Value *ChunkSize) { + assert(CLI->isValid() && "Requires a valid canonical loop"); + assert(ChunkSize && "Chunk size is required"); + + LLVMContext &Ctx = CLI->getFunction()->getContext(); + Value *IV = CLI->getIndVar(); + Value *OrigTripCount = CLI->getTripCount(); + Type *IVTy = IV->getType(); + assert(IVTy->getIntegerBitWidth() <= 64 && + "Max supported tripcount bitwidth is 64 bits"); + Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx) + : Type::getInt64Ty(Ctx); + Type *I32Type = Type::getInt32Ty(M.getContext()); + Constant *Zero = ConstantInt::get(InternalIVTy, 0); + Constant *One = ConstantInt::get(InternalIVTy, 1); + + // Declare useful OpenMP runtime functions. + FunctionCallee StaticInit = + getKmpcForStaticInitForType(InternalIVTy, M, *this); + FunctionCallee StaticFini = + getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); + + // Allocate space for computed loop bounds as expected by the "init" function. + Builder.restoreIP(AllocaIP); + Builder.SetCurrentDebugLocation(DL); + Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); + Value *PLowerBound = + Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound"); + Value *PUpperBound = + Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound"); + Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride"); + + // Set up the source location value for the OpenMP runtime. + Builder.restoreIP(CLI->getPreheaderIP()); + Builder.SetCurrentDebugLocation(DL); + + // TODO: Detect overflow in ubsan or max-out with current tripcount. + Value *CastedChunkSize = + Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize"); + Value *CastedTripCount = + Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount"); + + Constant *SchedulingType = ConstantInt::get( + I32Type, static_cast(OMPScheduleType::UnorderedStaticChunked)); + Builder.CreateStore(Zero, PLowerBound); + Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One); + Builder.CreateStore(OrigUpperBound, PUpperBound); + Builder.CreateStore(One, PStride); + + // Call the "init" function and update the trip count of the loop with the + // value it produced. + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadNum = getOrCreateThreadID(SrcLoc); + Builder.CreateCall(StaticInit, + {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum, + /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter, + /*plower=*/PLowerBound, /*pupper=*/PUpperBound, + /*pstride=*/PStride, /*incr=*/One, + /*chunk=*/CastedChunkSize}); + + // Load values written by the "init" function. + Value *FirstChunkStart = + Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb"); + Value *FirstChunkStop = + Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub"); + Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One); + Value *ChunkRange = + Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range"); + Value *NextChunkStride = + Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride"); + + // Create outer "dispatch" loop for enumerating the chunks. 
+ BasicBlock *DispatchEnter = splitBB(Builder, true); + Value *DispatchCounter; + CanonicalLoopInfo *DispatchCLI = createCanonicalLoop( + {Builder.saveIP(), DL}, + [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; }, + FirstChunkStart, CastedTripCount, NextChunkStride, + /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{}, + "dispatch"); + + // Remember the BasicBlocks of the dispatch loop we need, then invalidate to + // not have to preserve the canonical invariant. + BasicBlock *DispatchBody = DispatchCLI->getBody(); + BasicBlock *DispatchLatch = DispatchCLI->getLatch(); + BasicBlock *DispatchExit = DispatchCLI->getExit(); + BasicBlock *DispatchAfter = DispatchCLI->getAfter(); + DispatchCLI->invalidate(); + + // Rewire the original loop to become the chunk loop inside the dispatch loop. + redirectTo(DispatchAfter, CLI->getAfter(), DL); + redirectTo(CLI->getExit(), DispatchLatch, DL); + redirectTo(DispatchBody, DispatchEnter, DL); + + // Prepare the prolog of the chunk loop. + Builder.restoreIP(CLI->getPreheaderIP()); + Builder.SetCurrentDebugLocation(DL); + + // Compute the number of iterations of the chunk loop. + Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); + Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange); + Value *IsLastChunk = + Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last"); + Value *CountUntilOrigTripCount = + Builder.CreateSub(CastedTripCount, DispatchCounter); + Value *ChunkTripCount = Builder.CreateSelect( + IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount"); + Value *BackcastedChunkTC = + Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc"); + CLI->setTripCount(BackcastedChunkTC); + + // Update all uses of the induction variable except the one in the condition + // block that compares it with the actual upper bound, and the increment in + // the latch block. + Value *BackcastedDispatchCounter = + Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc"); + CLI->mapIndVar([&](Instruction *) -> Value * { + Builder.restoreIP(CLI->getBodyIP()); + return Builder.CreateAdd(IV, BackcastedDispatchCounter); + }); + + // In the "exit" block, call the "fini" function. + Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt()); + Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); + + // Add the barrier if requested. + if (NeedsBarrier) + createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for, + /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false); + +#ifndef NDEBUG + // Even though we currently do not support applying additional methods to it, + // the chunk loop should remain a canonical loop. 
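The chunked lowering nests the original loop inside a dispatch loop whose counter advances by the stride the runtime returned; the final chunk's trip count is clamped to the overall trip count, which is exactly the select built above. A single-threaded scalar model, with illustrative names:

#include <cstdint>

// Dispatch/chunk nest: the dispatch counter enumerates chunk start points,
// and the inner loop runs one chunk rebased by that counter.
void chunkedLoop(uint64_t TripCount, uint64_t ChunkRange, uint64_t Stride,
                 void (*Body)(uint64_t)) {
  for (uint64_t Dispatch = 0; Dispatch < TripCount; Dispatch += Stride) {
    uint64_t ChunkEnd = Dispatch + ChunkRange;
    bool IsLastChunk = ChunkEnd >= TripCount;
    uint64_t ChunkTripCount = IsLastChunk ? TripCount - Dispatch : ChunkRange;
    for (uint64_t IV = 0; IV < ChunkTripCount; ++IV)
      Body(IV + Dispatch); // chunk IV rebased by the dispatch counter
  }
}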
+ CLI->assertOK(); +#endif + + return {DispatchAfter, DispatchAfter->getFirstInsertionPt()}; +} + +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop( + DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, + bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind, + llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier, + bool HasNonmonotonicModifier, bool HasOrderedClause) { + OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType( + SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier, + HasNonmonotonicModifier, HasOrderedClause); + + bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) == + OMPScheduleType::ModifierOrdered; + switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) { + case OMPScheduleType::BaseStatic: + assert(!ChunkSize && "No chunk size with static-chunked schedule"); + if (IsOrdered) + return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType, + NeedsBarrier, ChunkSize); + // FIXME: Monotonicity ignored? + return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier); + + case OMPScheduleType::BaseStaticChunked: + if (IsOrdered) + return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType, + NeedsBarrier, ChunkSize); + // FIXME: Monotonicity ignored? + return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier, + ChunkSize); + + case OMPScheduleType::BaseRuntime: + case OMPScheduleType::BaseAuto: + case OMPScheduleType::BaseGreedy: + case OMPScheduleType::BaseBalanced: + case OMPScheduleType::BaseSteal: + case OMPScheduleType::BaseGuidedSimd: + case OMPScheduleType::BaseRuntimeSimd: + assert(!ChunkSize && + "schedule type does not support user-defined chunk sizes"); + LLVM_FALLTHROUGH; + case OMPScheduleType::BaseDynamicChunked: + case OMPScheduleType::BaseGuidedChunked: + case OMPScheduleType::BaseGuidedIterativeChunked: + case OMPScheduleType::BaseGuidedAnalyticalChunked: + case OMPScheduleType::BaseStaticBalancedChunked: + return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType, + NeedsBarrier, ChunkSize); + + default: + llvm_unreachable("Unknown/unimplemented schedule kind"); + } } /// Returns an LLVM function to call for initializing loop bounds using OpenMP @@ -1649,12 +2277,32 @@ getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { llvm_unreachable("unknown OpenMP loop iterator bitwidth"); } +/// Returns an LLVM function to call for finalizing the dynamic loop, +/// depending on `type`. Only i32 and i64 are supported by the runtime. Always +/// interpret integers as unsigned similarly to CanonicalLoopInfo.
+static FunctionCallee +getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { + unsigned Bitwidth = Ty->getIntegerBitWidth(); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u); + llvm_unreachable("unknown OpenMP loop iterator bitwidth"); +} + OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) { assert(CLI->isValid() && "Requires a valid canonical loop"); assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && "Require dedicated allocate IP"); + assert(isValidWorkshareLoopScheduleType(SchedType) && + "Require valid schedule type"); + + bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) == + OMPScheduleType::ModifierOrdered; // Set up the source location value for OpenMP runtime. Builder.SetCurrentDebugLocation(DL); @@ -1692,6 +2340,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( BasicBlock *Header = CLI->getHeader(); BasicBlock *Exit = CLI->getExit(); BasicBlock *Cond = CLI->getCond(); + BasicBlock *Latch = CLI->getLatch(); InsertPointTy AfterIP = CLI->getAfterIP(); // The CLI will be "broken" in the code below, as the loop is no longer @@ -1751,6 +2400,13 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( assert(BI->getSuccessor(1) == Exit); BI->setSuccessor(1, OuterCond); + // Call the "fini" function if "ordered" is present in wsloop directive. + if (Ordered) { + Builder.SetInsertPoint(&Latch->back()); + FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this); + Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum}); + } + // Add the barrier if requested. if (NeedsBarrier) { Builder.SetInsertPoint(&Exit->back()); @@ -1763,27 +2419,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( return AfterIP; } -/// Make \p Source branch to \p Target. -/// -/// Handles two situations: -/// * \p Source already has an unconditional branch. -/// * \p Source is a degenerate block (no terminator because the BB is -/// the current head of the IR construction). -static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) { - if (Instruction *Term = Source->getTerminator()) { - auto *Br = cast(Term); - assert(!Br->isConditional() && - "BB's terminator must be an unconditional branch (or degenerate)"); - BasicBlock *Succ = Br->getSuccessor(0); - Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true); - Br->setSuccessor(0, Target); - return; - } - - auto *NewBr = BranchInst::Create(Target, Source); - NewBr->setDebugLoc(DL); -} - /// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is, /// after this \p OldTarget will be orphaned. 
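applyDynamicWorkshareLoop wraps the body in a loop over chunks handed out by __kmpc_dispatch_next_*, and with the ordered clause it now also calls the matching __kmpc_dispatch_fini_* once per chunk from the latch so the runtime can release the next chunk in order. A runnable single-threaded model with a trivial stand-in dispatcher (the real protocol also reports strides and a last-iteration flag):

#include <algorithm>
#include <atomic>
#include <cstdint>

// Toy stand-in for __kmpc_dispatch_next_8u: hand out the next chunk until
// the iteration space is exhausted. Bounds are inclusive, as in the runtime.
std::atomic<uint64_t> NextLB{0};

bool dispatchNext(uint64_t TripCount, uint64_t Chunk, uint64_t &LB,
                  uint64_t &UB) {
  uint64_t Start = NextLB.fetch_add(Chunk);
  if (Start >= TripCount)
    return false;
  LB = Start;
  UB = std::min(Start + Chunk, TripCount) - 1;
  return true;
}

void dynamicWorkshareLoop(uint64_t TripCount, uint64_t Chunk,
                          void (*Body)(uint64_t)) {
  uint64_t LB, UB;
  while (dispatchNext(TripCount, Chunk, LB, UB)) {
    for (uint64_t IV = LB; IV <= UB; ++IV)
      Body(IV);
    // With the ordered clause, __kmpc_dispatch_fini_* is called here (the
    // latch) so the runtime can release the next chunk in source order.
  }
}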
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, @@ -2385,16 +3020,17 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { unsigned NumInlineCandidates; bool NotDuplicatable; bool Convergent; - unsigned LoopSize = + InstructionCost LoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSize << "\n"); + LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n"); // Loop is not unrollable if the loop contains certain instructions. - if (NotDuplicatable || Convergent) { + if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) { LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); return 1; } + unsigned LoopSize = *LoopSizeIC.getValue(); // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might // be able to use it. @@ -2506,10 +3142,9 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, return Builder.saveIP(); } -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::createSingle(const LocationDescription &Loc, - BodyGenCallbackTy BodyGenCB, - FinalizeCallbackTy FiniCB, llvm::Value *DidIt) { +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle( + const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, + FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) { if (!updateToLocation(Loc)) return Loc.IP; @@ -2537,9 +3172,16 @@ OpenMPIRBuilder::createSingle(const LocationDescription &Loc, // .... single region ... // __kmpc_end_single // } - - return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, - /*Conditional*/ true, /*hasFinalize*/ true); + // __kmpc_barrier + + EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, + /*Conditional*/ true, + /*hasFinalize*/ true); + if (!IsNowait) + createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + return Builder.saveIP(); } OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical( @@ -2674,48 +3316,28 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion( // generate body BodyGenCB(/* AllocaIP */ InsertPointTy(), - /* CodeGenIP */ Builder.saveIP(), *FiniBB); - - // If we didn't emit a branch to FiniBB during body generation, it means - // FiniBB is unreachable (e.g. while(1);). stop generating all the - // unreachable blocks, and remove anything we are not going to use. - auto SkipEmittingRegion = FiniBB->hasNPredecessors(0); - if (SkipEmittingRegion) { - FiniBB->eraseFromParent(); - ExitCall->eraseFromParent(); - // Discard finalization if we have it. - if (HasFinalize) { - assert(!FinalizationStack.empty() && - "Unexpected finalization stack state!"); - FinalizationStack.pop_back(); - } - } else { - // emit exit call and do any needed finalization. - auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt()); - assert(FiniBB->getTerminator()->getNumSuccessors() == 1 && - FiniBB->getTerminator()->getSuccessor(0) == ExitBB && - "Unexpected control flow graph state!!"); - emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize); - assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB && - "Unexpected Control Flow State!"); - MergeBlockIntoPredecessor(FiniBB); - } + /* CodeGenIP */ Builder.saveIP()); + + // emit exit call and do any needed finalization. 
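The createSingle change above pairs the inlined region with the implicit barrier that OpenMP attaches to single without nowait. The emitted call shape, sketched with arguments elided (a model of the sequence, not literal builder output):

// Emitted when IsNowait is false:
//   if (__kmpc_single(loc, tid)) {   // only one thread wins
//     ...single region...
//     __kmpc_end_single(loc, tid);
//   }
//   __kmpc_barrier(loc, tid);        // every thread rendezvouses here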
+ auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt()); + assert(FiniBB->getTerminator()->getNumSuccessors() == 1 && + FiniBB->getTerminator()->getSuccessor(0) == ExitBB && + "Unexpected control flow graph state!!"); + emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize); + assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB && + "Unexpected Control Flow State!"); + MergeBlockIntoPredecessor(FiniBB); // If we are skipping the region of a non conditional, remove the exit // block, and clear the builder's insertion point. assert(SplitPos->getParent() == ExitBB && "Unexpected Insertion point location!"); - if (!Conditional && SkipEmittingRegion) { - ExitBB->eraseFromParent(); - Builder.ClearInsertionPoint(); - } else { - auto merged = MergeBlockIntoPredecessor(ExitBB); - BasicBlock *ExitPredBB = SplitPos->getParent(); - auto InsertBB = merged ? ExitPredBB : ExitBB; - if (!isa_and_nonnull(SplitPos)) - SplitPos->eraseFromParent(); - Builder.SetInsertPoint(InsertBB); - } + auto merged = MergeBlockIntoPredecessor(ExitBB); + BasicBlock *ExitPredBB = SplitPos->getParent(); + auto InsertBB = merged ? ExitPredBB : ExitBB; + if (!isa_and_nonnull(SplitPos)) + SplitPos->eraseFromParent(); + Builder.SetInsertPoint(InsertBB); return Builder.saveIP(); } @@ -3171,6 +3793,7 @@ bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic( } break; case Write: + case Compare: case Update: if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease || AO == AtomicOrdering::SequentiallyConsistent) { @@ -3290,9 +3913,10 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, } OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( - const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, + const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { + assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous"); if (!updateToLocation(Loc)) return Loc.IP; @@ -3309,7 +3933,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( "OpenMP atomic does not support LT or GT operations"); }); - emitAtomicUpdate(AllocIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, + emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile, IsXBinopExpr); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); return Builder.saveIP(); @@ -3344,20 +3968,39 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2, } std::pair OpenMPIRBuilder::emitAtomicUpdate( - Instruction *AllocIP, Value *X, Type *XElemTy, Value *Expr, + InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) { - bool DoCmpExch = - ((RMWOp == AtomicRMWInst::BAD_BINOP) || (RMWOp == AtomicRMWInst::FAdd)) || - (RMWOp == AtomicRMWInst::FSub) || - (RMWOp == AtomicRMWInst::Sub && !IsXBinopExpr); + // TODO: handle the case where XElemTy is not byte-sized or not a power of 2 + // or a complex datatype. 
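emitAtomicUpdate now whitelists the integer operations that map directly onto atomicrmw and routes everything else, such as floating-point add/sub, reversed subtraction, and arbitrary update expressions, through a compare-exchange retry loop. The same shape in portable C++ for a float add (std::atomic<float>::fetch_add only arrives in C++20, which is why a CAS loop is the fallback):

#include <atomic>

// CAS retry loop used when no direct atomicrmw form applies. On each failed
// exchange, Old is refreshed with the current value and the update recomputed.
float atomicFAdd(std::atomic<float> &X, float Expr) {
  float Old = X.load();
  float Desired = Old + Expr; // the UpdateOp callback computes this in IR
  while (!X.compare_exchange_weak(Old, Desired))
    Desired = Old + Expr;
  return Desired; // the "new" value; Old holds the captured previous value
}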
+ bool emitRMWOp = false; + switch (RMWOp) { + case AtomicRMWInst::Add: + case AtomicRMWInst::And: + case AtomicRMWInst::Nand: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + case AtomicRMWInst::Xchg: + emitRMWOp = XElemTy; + break; + case AtomicRMWInst::Sub: + emitRMWOp = (IsXBinopExpr && XElemTy); + break; + default: + emitRMWOp = false; + } + emitRMWOp &= XElemTy->isIntegerTy(); std::pair<Value *, Value *> Res; - if (XElemTy->isIntegerTy() && !DoCmpExch) { + if (emitRMWOp) { Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO); // not needed except in case of postfix captures. Generate anyway for // consistency with the else part. Will be removed with any DCE pass. - Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp); + // AtomicRMWInst::Xchg does not have a corresponding instruction. + if (RMWOp == AtomicRMWInst::Xchg) + Res.second = Res.first; + else + Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp); } else { unsigned Addrspace = cast<PointerType>(X->getType())->getAddressSpace(); IntegerType *IntCastTy = @@ -3380,12 +4023,12 @@ std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate( BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(), X->getName() + ".atomic.cont"); ContBB->getTerminator()->eraseFromParent(); + Builder.restoreIP(AllocaIP); + AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy); + NewAtomicAddr->setName(X->getName() + "x.new.val"); Builder.SetInsertPoint(ContBB); llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2); PHI->addIncoming(OldVal, CurBB); - AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy); - NewAtomicAddr->setName(X->getName() + "x.new.val"); - NewAtomicAddr->moveBefore(AllocIP); IntegerType *NewAtomicCastTy = IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits()); bool IsIntTy = XElemTy->isIntegerTy(); @@ -3407,7 +4050,7 @@ std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate( Value *Upd = UpdateOp(OldExprVal, Builder); Builder.CreateStore(Upd, NewAtomicAddr); - LoadInst *DesiredVal = Builder.CreateLoad(XElemTy, NewAtomicIntAddr); + LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicIntAddr); Value *XAddr = (IsIntTy) ?
X @@ -3415,7 +4058,7 @@ std::pair OpenMPIRBuilder::emitAtomicUpdate( AtomicOrdering Failure = llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO); AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg( - XAddr, OldExprVal, DesiredVal, llvm::MaybeAlign(), AO, Failure); + XAddr, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure); Result->setVolatile(VolatileX); Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0); Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1); @@ -3439,7 +4082,7 @@ std::pair OpenMPIRBuilder::emitAtomicUpdate( } OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( - const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, + const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { @@ -3450,7 +4093,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( Type *XTy = X.Var->getType(); assert(XTy->isPointerTy() && "OMP Atomic expects a pointer to target memory"); - Type *XElemTy = XTy->getPointerElementType(); + Type *XElemTy = X.ElemTy; assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() || XElemTy->isPointerTy()) && "OMP atomic capture expected a scalar type"); @@ -3462,7 +4105,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( // 'x' is simply atomically rewritten with 'expr'. AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); std::pair Result = - emitAtomicUpdate(AllocIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, + emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile, IsXBinopExpr); Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second); @@ -3472,6 +4115,155 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( return Builder.saveIP(); } +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( + const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, + AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, + omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, + bool IsFailOnly) { + + if (!updateToLocation(Loc)) + return Loc.IP; + + assert(X.Var->getType()->isPointerTy() && + "OMP atomic expects a pointer to target memory"); + assert((X.ElemTy->isIntegerTy() || X.ElemTy->isPointerTy()) && + "OMP atomic compare expected a integer scalar type"); + // compare capture + if (V.Var) { + assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type"); + assert(V.ElemTy == X.ElemTy && "x and v must be of same type"); + } + + if (Op == OMPAtomicCompareOp::EQ) { + AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO); + AtomicCmpXchgInst *Result = + Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure); + if (V.Var) { + Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0); + assert(OldValue->getType() == V.ElemTy && + "OldValue and V must be of same type"); + if (IsPostfixUpdate) { + Builder.CreateStore(OldValue, V.Var, V.IsVolatile); + } else { + Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1); + if (IsFailOnly) { + // CurBB---- + // | | + // v | + // ContBB | + // | | + // v | + // ExitBB <- + // + // where ContBB only contains the store of old value to 'v'. 
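In the EQ case the whole construct is a single cmpxchg; the subtle part is what v observes: the prior value for postfix capture, otherwise the select emitted above (and under IsFailOnly, a store only on the failure path through the ContBB diagrammed in the comment). A std::atomic model of the two simple capture modes, illustrative only:

#include <atomic>

// Models the capture variants of "#pragma omp atomic compare capture"
// lowered above, for { if (x == e) x = d; } plus a capture of x into v.
int atomicCompareCapture(std::atomic<int> &X, int E, int D,
                         bool IsPostfixUpdate) {
  int Old = E; // 'expected' slot; refreshed with x's value if the CAS fails
  bool Success = X.compare_exchange_strong(Old, D);
  if (IsPostfixUpdate)
    return Old;             // v = value x held before the operation
  return Success ? E : Old; // the CreateSelect in the non-postfix path
}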
+ BasicBlock *CurBB = Builder.GetInsertBlock(); + Instruction *CurBBTI = CurBB->getTerminator(); + CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable(); + BasicBlock *ExitBB = CurBB->splitBasicBlock( + CurBBTI, X.Var->getName() + ".atomic.exit"); + BasicBlock *ContBB = CurBB->splitBasicBlock( + CurBB->getTerminator(), X.Var->getName() + ".atomic.cont"); + ContBB->getTerminator()->eraseFromParent(); + CurBB->getTerminator()->eraseFromParent(); + + Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB); + + Builder.SetInsertPoint(ContBB); + Builder.CreateStore(OldValue, V.Var); + Builder.CreateBr(ExitBB); + + if (UnreachableInst *ExitTI = + dyn_cast(ExitBB->getTerminator())) { + CurBBTI->eraseFromParent(); + Builder.SetInsertPoint(ExitBB); + } else { + Builder.SetInsertPoint(ExitTI); + } + } else { + Value *CapturedValue = + Builder.CreateSelect(SuccessOrFail, E, OldValue); + Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile); + } + } + } + // The comparison result has to be stored. + if (R.Var) { + assert(R.Var->getType()->isPointerTy() && + "r.var must be of pointer type"); + assert(R.ElemTy->isIntegerTy() && "r must be of integral type"); + + Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1); + Value *ResultCast = R.IsSigned + ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy) + : Builder.CreateZExt(SuccessFailureVal, R.ElemTy); + Builder.CreateStore(ResultCast, R.Var, R.IsVolatile); + } + } else { + assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) && + "Op should be either max or min at this point"); + assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is =="); + + // Reverse the ordop as the OpenMP forms are different from LLVM forms. + // Let's take max as example. + // OpenMP form: + // x = x > expr ? expr : x; + // LLVM form: + // *ptr = *ptr > val ? *ptr : val; + // We need to transform to LLVM form. + // x = x <= expr ? x : expr; + AtomicRMWInst::BinOp NewOp; + if (IsXBinopExpr) { + if (X.IsSigned) + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min + : AtomicRMWInst::Max; + else + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin + : AtomicRMWInst::UMax; + } else { + if (X.IsSigned) + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max + : AtomicRMWInst::Min; + else + NewOp = Op == OMPAtomicCompareOp::MAX ? 
AtomicRMWInst::UMax + : AtomicRMWInst::UMin; + } + + AtomicRMWInst *OldValue = + Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO); + if (V.Var) { + Value *CapturedValue = nullptr; + if (IsPostfixUpdate) { + CapturedValue = OldValue; + } else { + CmpInst::Predicate Pred; + switch (NewOp) { + case AtomicRMWInst::Max: + Pred = CmpInst::ICMP_SGT; + break; + case AtomicRMWInst::UMax: + Pred = CmpInst::ICMP_UGT; + break; + case AtomicRMWInst::Min: + Pred = CmpInst::ICMP_SLT; + break; + case AtomicRMWInst::UMin: + Pred = CmpInst::ICMP_ULT; + break; + default: + llvm_unreachable("unexpected comparison op"); + } + Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E); + CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue); + } + Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile); + } + } + + checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare); + + return Builder.saveIP(); +} + GlobalVariable * OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl &Names, std::string VarName) { @@ -3543,6 +4335,51 @@ BasicBlock *CanonicalLoopInfo::getPreheader() const { llvm_unreachable("Missing preheader"); } +void CanonicalLoopInfo::setTripCount(Value *TripCount) { + assert(isValid() && "Requires a valid canonical loop"); + + Instruction *CmpI = &getCond()->front(); + assert(isa(CmpI) && "First inst must compare IV with TripCount"); + CmpI->setOperand(1, TripCount); + +#ifndef NDEBUG + assertOK(); +#endif +} + +void CanonicalLoopInfo::mapIndVar( + llvm::function_ref Updater) { + assert(isValid() && "Requires a valid canonical loop"); + + Instruction *OldIV = getIndVar(); + + // Record all uses excluding those introduced by the updater. Uses by the + // CanonicalLoopInfo itself to keep track of the number of iterations are + // excluded. + SmallVector ReplacableUses; + for (Use &U : OldIV->uses()) { + auto *User = dyn_cast(U.getUser()); + if (!User) + continue; + if (User->getParent() == getCond()) + continue; + if (User->getParent() == getLatch()) + continue; + ReplacableUses.push_back(&U); + } + + // Run the updater that may introduce new uses + Value *NewIV = Updater(OldIV); + + // Replace the old uses with the value returned by the updater. + for (Use *U : ReplacableUses) + U->set(NewIV); + +#ifndef NDEBUG + assertOK(); +#endif +} + void CanonicalLoopInfo::assertOK() const { #ifndef NDEBUG // No constraints if this object currently does not describe a loop. diff --git a/llvm/lib/FuzzMutate/FuzzerCLI.cpp b/llvm/lib/FuzzMutate/FuzzerCLI.cpp index 879d5a10b37b..90a1a35e2e3e 100644 --- a/llvm/lib/FuzzMutate/FuzzerCLI.cpp +++ b/llvm/lib/FuzzMutate/FuzzerCLI.cpp @@ -9,16 +9,9 @@ #include "llvm/FuzzMutate/FuzzerCLI.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/IR/Verifier.h" using namespace llvm; @@ -166,44 +159,3 @@ int llvm::runFuzzerOnInputs(int ArgC, char *ArgV[], FuzzerTestFun TestOne, } return 0; } - -std::unique_ptr llvm::parseModule( - const uint8_t *Data, size_t Size, LLVMContext &Context) { - - if (Size <= 1) - // We get bogus data given an empty corpus - just create a new module. 
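A note for readers tracking the createAtomicCompare addition above: the == form lowers to a cmpxchg instruction whose result is a two-element aggregate, { previous value, i1 success }, while the min/max forms lower to a single atomicrmw. The following is a minimal sketch of the cmpxchg unpacking pattern, not part of the patch; the builder B and the Ptr/Expected/Desired values are hypothetical, and monotonic ordering stands in for whatever the directive actually requests:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Atomically: if (*Ptr == Expected) *Ptr = Desired; then unpack the
    // { old value, success flag } aggregate the same way the patch does,
    // and widen the i1 flag as the 'r' result store does with zext.
    Value *emitCompareFlag(IRBuilder<> &B, Value *Ptr, Value *Expected,
                           Value *Desired) {
      AtomicCmpXchgInst *CmpXchg = B.CreateAtomicCmpXchg(
          Ptr, Expected, Desired, MaybeAlign(), AtomicOrdering::Monotonic,
          AtomicOrdering::Monotonic);
      Value *Old = B.CreateExtractValue(CmpXchg, /*Idxs=*/0);
      Value *Success = B.CreateExtractValue(CmpXchg, /*Idxs=*/1);
      (void)Old; // a capture clause would store this to 'v'
      return B.CreateZExt(Success, B.getInt32Ty());
    }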
- return std::make_unique("M", Context); - - auto Buffer = MemoryBuffer::getMemBuffer( - StringRef(reinterpret_cast(Data), Size), "Fuzzer input", - /*RequiresNullTerminator=*/false); - - SMDiagnostic Err; - auto M = parseBitcodeFile(Buffer->getMemBufferRef(), Context); - if (Error E = M.takeError()) { - errs() << toString(std::move(E)) << "\n"; - return nullptr; - } - return std::move(M.get()); -} - -size_t llvm::writeModule(const Module &M, uint8_t *Dest, size_t MaxSize) { - std::string Buf; - { - raw_string_ostream OS(Buf); - WriteBitcodeToFile(M, OS); - } - if (Buf.size() > MaxSize) - return 0; - memcpy(Dest, Buf.data(), Buf.size()); - return Buf.size(); -} - -std::unique_ptr llvm::parseAndVerify(const uint8_t *Data, size_t Size, - LLVMContext &Context) { - auto M = parseModule(Data, Size, Context); - if (!M || verifyModule(*M, &errs())) - return nullptr; - - return M; -} diff --git a/llvm/lib/FuzzMutate/IRMutator.cpp b/llvm/lib/FuzzMutate/IRMutator.cpp index 0cd0f538fdbc..b62a326a40cc 100644 --- a/llvm/lib/FuzzMutate/IRMutator.cpp +++ b/llvm/lib/FuzzMutate/IRMutator.cpp @@ -9,6 +9,8 @@ #include "llvm/FuzzMutate/IRMutator.h" #include "llvm/ADT/Optional.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/FuzzMutate/Operations.h" #include "llvm/FuzzMutate/Random.h" #include "llvm/FuzzMutate/RandomIRBuilder.h" @@ -17,7 +19,9 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Transforms/Scalar/DCE.h" using namespace llvm; @@ -33,14 +37,15 @@ static void createEmptyFunction(Module &M) { } void IRMutationStrategy::mutate(Module &M, RandomIRBuilder &IB) { - if (M.empty()) - createEmptyFunction(M); - auto RS = makeSampler(IB.Rand); for (Function &F : M) if (!F.isDeclaration()) RS.sample(&F, /*Weight=*/1); - mutate(*RS.getSelection(), IB); + + if (RS.isEmpty()) + createEmptyFunction(M); + else + mutate(*RS.getSelection(), IB); } void IRMutationStrategy::mutate(Function &F, RandomIRBuilder &IB) { @@ -243,3 +248,44 @@ void InstModificationIRStrategy::mutate(Instruction &Inst, if (RS) RS.getSelection()(); } + +std::unique_ptr llvm::parseModule(const uint8_t *Data, size_t Size, + LLVMContext &Context) { + + if (Size <= 1) + // We get bogus data given an empty corpus - just create a new module. 
+ return std::make_unique("M", Context); + + auto Buffer = MemoryBuffer::getMemBuffer( + StringRef(reinterpret_cast(Data), Size), "Fuzzer input", + /*RequiresNullTerminator=*/false); + + SMDiagnostic Err; + auto M = parseBitcodeFile(Buffer->getMemBufferRef(), Context); + if (Error E = M.takeError()) { + errs() << toString(std::move(E)) << "\n"; + return nullptr; + } + return std::move(M.get()); +} + +size_t llvm::writeModule(const Module &M, uint8_t *Dest, size_t MaxSize) { + std::string Buf; + { + raw_string_ostream OS(Buf); + WriteBitcodeToFile(M, OS); + } + if (Buf.size() > MaxSize) + return 0; + memcpy(Dest, Buf.data(), Buf.size()); + return Buf.size(); +} + +std::unique_ptr llvm::parseAndVerify(const uint8_t *Data, size_t Size, + LLVMContext &Context) { + auto M = parseModule(Data, Size, Context); + if (!M || verifyModule(*M, &errs())) + return nullptr; + + return M; +} diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp index 221a3a84b49b..7443d49967c5 100644 --- a/llvm/lib/FuzzMutate/Operations.cpp +++ b/llvm/lib/FuzzMutate/Operations.cpp @@ -169,14 +169,21 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { OpDescriptor llvm::fuzzerop::gepDescriptor(unsigned Weight) { auto buildGEP = [](ArrayRef Srcs, Instruction *Inst) { - Type *Ty = Srcs[0]->getType()->getPointerElementType(); - auto Indices = makeArrayRef(Srcs).drop_front(1); + // TODO: It would be better to generate a random type here, rather than + // generating a random value and picking its type. + Type *Ty = Srcs[0]->getType()->isOpaquePointerTy() + ? Srcs[1]->getType() + : Srcs[0]->getType()->getNonOpaquePointerElementType(); + auto Indices = makeArrayRef(Srcs).drop_front(2); return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", Inst); }; // TODO: Handle aggregates and vectors // TODO: Support multiple indices. // TODO: Try to avoid meaningless accesses. - return {Weight, {sizedPtrType(), anyIntType()}, buildGEP}; + SourcePred sizedType( + [](ArrayRef, const Value *V) { return V->getType()->isSized(); }, + None); + return {Weight, {sizedPtrType(), sizedType, anyIntType()}, buildGEP}; } static uint64_t getAggregateNumElements(Type *T) { @@ -302,12 +309,12 @@ static SourcePred validShuffleVectorIndex() { return ShuffleVectorInst::isValidOperands(Cur[0], Cur[1], V); }; auto Make = [](ArrayRef Cur, ArrayRef Ts) { - auto *FirstTy = cast(Cur[0]->getType()); + auto *FirstTy = cast(Cur[0]->getType()); auto *Int32Ty = Type::getInt32Ty(Cur[0]->getContext()); // TODO: It's straighforward to make up reasonable values, but listing them // exhaustively would be insane. Come up with a couple of sensible ones. 
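The parseModule/writeModule/parseAndVerify helpers deleted from FuzzerCLI.cpp above reappear here in IRMutator.cpp. A rough sketch of how a fuzzer driver uses the round trip, assuming the declarations moved into IRMutator.h along with the definitions (this harness is illustrative only, not part of LLVM):

    #include "llvm/FuzzMutate/IRMutator.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include <cstddef>
    #include <cstdint>

    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      llvm::LLVMContext Context;
      // Parse the input as bitcode, rejecting anything the verifier refuses.
      std::unique_ptr<llvm::Module> M =
          llvm::parseAndVerify(Data, Size, Context);
      if (!M)
        return 0;
      // ... mutate or exercise M here ...
      return 0;
    }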
return std::vector{UndefValue::get( - FixedVectorType::get(Int32Ty, FirstTy->getNumElements()))}; + VectorType::get(Int32Ty, FirstTy->getElementCount()))}; }; return {Pred, Make}; } diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp index 27c3bdfb22a8..9ac31ebccb99 100644 --- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp +++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp @@ -8,10 +8,10 @@ #include "llvm/FuzzMutate/RandomIRBuilder.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/FuzzMutate/OpDescriptor.h" #include "llvm/FuzzMutate/Random.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -53,8 +53,11 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef Insts, IP = ++I->getIterator(); assert(IP != BB.end() && "guaranteed by the findPointer"); } - auto *NewLoad = - new LoadInst(Ptr->getType()->getPointerElementType(), Ptr, "L", &*IP); + // For opaque pointers, pick the type independently. + Type *AccessTy = Ptr->getType()->isOpaquePointerTy() + ? RS.getSelection()->getType() + : Ptr->getType()->getNonOpaquePointerElementType(); + auto *NewLoad = new LoadInst(AccessTy, Ptr, "L", &*IP); // Only sample this load if it really matches the descriptor if (Pred.matches(Srcs, NewLoad)) @@ -139,9 +142,12 @@ Value *RandomIRBuilder::findPointer(BasicBlock &BB, if (Inst->isTerminator()) return false; - if (auto PtrTy = dyn_cast(Inst->getType())) { + if (auto *PtrTy = dyn_cast(Inst->getType())) { + if (PtrTy->isOpaque()) + return true; + // We can never generate loads from non first class or non sized types - Type *ElemTy = PtrTy->getPointerElementType(); + Type *ElemTy = PtrTy->getNonOpaquePointerElementType(); if (!ElemTy->isSized() || !ElemTy->isFirstClassType()) return false; diff --git a/llvm/lib/IR/AbstractCallSite.cpp b/llvm/lib/IR/AbstractCallSite.cpp index 2e41799e13e9..b7a10846a0d3 100644 --- a/llvm/lib/IR/AbstractCallSite.cpp +++ b/llvm/lib/IR/AbstractCallSite.cpp @@ -16,7 +16,6 @@ #include "llvm/IR/AbstractCallSite.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 179754e275b0..596348ddb462 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -223,9 +223,7 @@ predictValueUseListOrder(const Value *V, unsigned ID, const OrderMap &OM) { return LU->getOperandNo() > RU->getOperandNo(); }); - if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) { - return L.second < R.second; - })) + if (llvm::is_sorted(List, llvm::less_second())) // Order is already correct. return {}; @@ -612,6 +610,11 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { OS << '>'; return; } + case Type::DXILPointerTyID: + // DXIL pointer types are only handled by the DirectX backend. To avoid + // extra dependencies we just print the pointer's address here. + OS << "dxil-ptr (" << Ty << ")"; + return; } llvm_unreachable("Invalid TypeID"); } @@ -641,7 +644,7 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) { OS << '>'; } -AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() {} +AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default; namespace llvm { @@ -1290,7 +1293,7 @@ struct AsmWriterContext { /// prints a Metadata as operand. 
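The predictValueUseListOrder hunk above swaps a hand-written pair comparator for llvm::less_second from STLExtras; its sibling less_first shows up in the Attributes.cpp asserts later in this patch. A tiny self-contained illustration with made-up data:

    #include "llvm/ADT/STLExtras.h"
    #include <utility>
    #include <vector>

    bool secondColumnSorted() {
      std::vector<std::pair<int, int>> List = {{0, 3}, {1, 2}, {2, 1}};
      // less_second compares only the second members; they descend here,
      // so this returns false.
      return llvm::is_sorted(List, llvm::less_second());
    }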
virtual void onWriteMetadataAsOperand(const Metadata *) {} - virtual ~AsmWriterContext() {} + virtual ~AsmWriterContext() = default; }; } // end anonymous namespace @@ -2072,7 +2075,7 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, AsmWriterContext &) { // Print all values for checksum together, or not at all. if (N->getChecksum()) Printer.printChecksum(*N->getChecksum()); - Printer.printString("source", N->getSource().getValueOr(StringRef()), + Printer.printString("source", N->getSource().value_or(StringRef()), /* ShouldSkipEmpty */ true); Out << ")"; } @@ -2131,6 +2134,7 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N, Printer.printMetadata("retainedNodes", N->getRawRetainedNodes()); Printer.printMetadata("thrownTypes", N->getRawThrownTypes()); Printer.printMetadata("annotations", N->getRawAnnotations()); + Printer.printString("targetFuncName", N->getTargetFuncName()); Out << ")"; } @@ -3531,6 +3535,19 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { Out << '"'; } + using SanitizerMetadata = llvm::GlobalValue::SanitizerMetadata; + if (GV->hasSanitizerMetadata()) { + SanitizerMetadata MD = GV->getSanitizerMetadata(); + if (MD.NoAddress) + Out << ", no_sanitize_address"; + if (MD.NoHWAddress) + Out << ", no_sanitize_hwaddress"; + if (MD.NoMemtag) + Out << ", no_sanitize_memtag"; + if (MD.IsDynInit) + Out << ", sanitize_address_dyninit"; + } + maybePrintComdat(Out, *GV); if (MaybeAlign A = GV->getAlign()) Out << ", align " << A->value(); @@ -4708,9 +4725,8 @@ struct MDTreeAsmWriterContext : public AsmWriterContext { : AsmWriterContext(TP, ST, M), Level(0U), Visited({InitMD}), MainOS(OS) {} void onWriteMetadataAsOperand(const Metadata *MD) override { - if (Visited.count(MD)) + if (!Visited.insert(MD).second) return; - Visited.insert(MD); std::string Str; raw_string_ostream SS(Str); diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp index 3d24ae062841..27977d5d56b0 100644 --- a/llvm/lib/IR/Assumptions.cpp +++ b/llvm/lib/IR/Assumptions.cpp @@ -107,4 +107,5 @@ StringSet<> llvm::KnownAssumptionStrings({ "omp_no_openmp_routines", // OpenMP 5.1 "omp_no_parallelism", // OpenMP 5.1 "ompx_spmd_amenable", // OpenMPOpt extension + "ompx_no_call_asm", // OpenMPOpt extension }); diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index 1153fb827b56..5eb958f5786a 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -255,6 +255,8 @@ public: std::pair> getAllocSizeArgs() const; unsigned getVScaleRangeMin() const; Optional getVScaleRangeMax() const; + UWTableKind getUWTableKind() const; + AllocFnKind getAllocKind() const; std::string getAsString(bool InAttrGrp) const; Type *getAttributeType(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 43fde64c3734..6d9f94b5eefd 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -56,12 +55,11 @@ static const unsigned AllocSizeNumElemsNotPresent = -1; static uint64_t packAllocSizeArgs(unsigned ElemSizeArg, const Optional &NumElemsArg) { - assert((!NumElemsArg.hasValue() || - *NumElemsArg != AllocSizeNumElemsNotPresent) && + assert((!NumElemsArg || *NumElemsArg != AllocSizeNumElemsNotPresent) && "Attempting to pack a reserved value"); return 
uint64_t(ElemSizeArg) << 32 | - NumElemsArg.getValueOr(AllocSizeNumElemsNotPresent); + NumElemsArg.value_or(AllocSizeNumElemsNotPresent); } static std::pair> @@ -77,7 +75,7 @@ unpackAllocSizeArgs(uint64_t Num) { static uint64_t packVScaleRangeArgs(unsigned MinValue, Optional MaxValue) { - return uint64_t(MinValue) << 32 | MaxValue.getValueOr(0); + return uint64_t(MinValue) << 32 | MaxValue.value_or(0); } static std::pair> @@ -205,6 +203,11 @@ Attribute Attribute::getWithInAllocaType(LLVMContext &Context, Type *Ty) { return get(Context, InAlloca, Ty); } +Attribute Attribute::getWithUWTableKind(LLVMContext &Context, + UWTableKind Kind) { + return get(Context, UWTable, uint64_t(Kind)); +} + Attribute Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg, const Optional &NumElemsArg) { @@ -366,6 +369,18 @@ Optional Attribute::getVScaleRangeMax() const { return unpackVScaleRangeArgs(pImpl->getValueAsInt()).second; } +UWTableKind Attribute::getUWTableKind() const { + assert(hasAttribute(Attribute::UWTable) && + "Trying to get unwind table kind from non-uwtable attribute"); + return UWTableKind(pImpl->getValueAsInt()); +} + +AllocFnKind Attribute::getAllocKind() const { + assert(hasAttribute(Attribute::AllocKind) && + "Trying to get allockind value from non-allockind attribute"); + return AllocFnKind(pImpl->getValueAsInt()); +} + std::string Attribute::getAsString(bool InAttrGrp) const { if (!pImpl) return {}; @@ -422,7 +437,38 @@ std::string Attribute::getAsString(bool InAttrGrp) const { unsigned MinValue = getVScaleRangeMin(); Optional MaxValue = getVScaleRangeMax(); return ("vscale_range(" + Twine(MinValue) + "," + - Twine(MaxValue.getValueOr(0)) + ")") + Twine(MaxValue.value_or(0)) + ")") + .str(); + } + + if (hasAttribute(Attribute::UWTable)) { + UWTableKind Kind = getUWTableKind(); + if (Kind != UWTableKind::None) { + return Kind == UWTableKind::Default + ? "uwtable" + : ("uwtable(" + + Twine(Kind == UWTableKind::Sync ? "sync" : "async") + ")") + .str(); + } + } + + if (hasAttribute(Attribute::AllocKind)) { + AllocFnKind Kind = getAllocKind(); + SmallVector parts; + if ((Kind & AllocFnKind::Alloc) != AllocFnKind::Unknown) + parts.push_back("alloc"); + if ((Kind & AllocFnKind::Realloc) != AllocFnKind::Unknown) + parts.push_back("realloc"); + if ((Kind & AllocFnKind::Free) != AllocFnKind::Unknown) + parts.push_back("free"); + if ((Kind & AllocFnKind::Uninitialized) != AllocFnKind::Unknown) + parts.push_back("uninitialized"); + if ((Kind & AllocFnKind::Zeroed) != AllocFnKind::Unknown) + parts.push_back("zeroed"); + if ((Kind & AllocFnKind::Aligned) != AllocFnKind::Unknown) + parts.push_back("aligned"); + return ("allockind(\"" + + Twine(llvm::join(parts.begin(), parts.end(), ",")) + "\")") .str(); } @@ -710,6 +756,14 @@ Optional AttributeSet::getVScaleRangeMax() const { return SetNode ? SetNode->getVScaleRangeMax() : None; } +UWTableKind AttributeSet::getUWTableKind() const { + return SetNode ? SetNode->getUWTableKind() : UWTableKind::None; +} + +AllocFnKind AttributeSet::getAllocKind() const { + return SetNode ? SetNode->getAllocKind() : AllocFnKind::Unknown; +} + std::string AttributeSet::getAsString(bool InAttrGrp) const { return SetNode ? 
SetNode->getAsString(InAttrGrp) : ""; } @@ -876,6 +930,18 @@ Optional AttributeSetNode::getVScaleRangeMax() const { return None; } +UWTableKind AttributeSetNode::getUWTableKind() const { + if (auto A = findEnumAttribute(Attribute::UWTable)) + return A->getUWTableKind(); + return UWTableKind::None; +} + +AllocFnKind AttributeSetNode::getAllocKind() const { + if (auto A = findEnumAttribute(Attribute::AllocKind)) + return A->getAllocKind(); + return AllocFnKind::Unknown; +} + std::string AttributeSetNode::getAsString(bool InAttrGrp) const { std::string Str; for (iterator I = begin(), E = end(); I != E; ++I) { @@ -987,11 +1053,7 @@ AttributeList::get(LLVMContext &C, if (Attrs.empty()) return {}; - assert(llvm::is_sorted(Attrs, - [](const std::pair &LHS, - const std::pair &RHS) { - return LHS.first < RHS.first; - }) && + assert(llvm::is_sorted(Attrs, llvm::less_first()) && "Misordered Attributes list!"); assert(llvm::all_of(Attrs, [](const std::pair &Pair) { @@ -1024,11 +1086,7 @@ AttributeList::get(LLVMContext &C, if (Attrs.empty()) return {}; - assert(llvm::is_sorted(Attrs, - [](const std::pair &LHS, - const std::pair &RHS) { - return LHS.first < RHS.first; - }) && + assert(llvm::is_sorted(Attrs, llvm::less_first()) && "Misordered Attributes list!"); assert(llvm::none_of(Attrs, [](const std::pair &Pair) { @@ -1428,6 +1486,14 @@ AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const { return getParamAttrs(Index).getDereferenceableOrNullBytes(); } +UWTableKind AttributeList::getUWTableKind() const { + return getFnAttrs().getUWTableKind(); +} + +AllocFnKind AttributeList::getAllocKind() const { + return getFnAttrs().getAllocKind(); +} + std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const { return getAttributes(Index).getAsString(InAttrGrp); } @@ -1649,6 +1715,16 @@ AttrBuilder &AttrBuilder::addVScaleRangeAttrFromRawRepr(uint64_t RawArgs) { return addRawIntAttr(Attribute::VScaleRange, RawArgs); } +AttrBuilder &AttrBuilder::addUWTableAttr(UWTableKind Kind) { + if (Kind == UWTableKind::None) + return *this; + return addRawIntAttr(Attribute::UWTable, uint64_t(Kind)); +} + +AttrBuilder &AttrBuilder::addAllocKindAttr(AllocFnKind Kind) { + return addRawIntAttr(Attribute::AllocKind, static_cast(Kind)); +} + Type *AttrBuilder::getTypeAttr(Attribute::AttrKind Kind) const { assert(Attribute::isTypeAttrKind(Kind) && "Not a type attribute"); Attribute A = getAttribute(Kind); @@ -1732,39 +1808,51 @@ bool AttrBuilder::operator==(const AttrBuilder &B) const { //===----------------------------------------------------------------------===// /// Which attributes cannot be applied to a type. -AttributeMask AttributeFuncs::typeIncompatible(Type *Ty) { +AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, + AttributeSafetyKind ASK) { AttributeMask Incompatible; - if (!Ty->isIntegerTy()) + if (!Ty->isIntegerTy()) { // Attributes that only apply to integers. - Incompatible.addAttribute(Attribute::SExt) - .addAttribute(Attribute::ZExt); + if (ASK & ASK_SAFE_TO_DROP) + Incompatible.addAttribute(Attribute::AllocAlign); + if (ASK & ASK_UNSAFE_TO_DROP) + Incompatible.addAttribute(Attribute::SExt).addAttribute(Attribute::ZExt); + } - if (!Ty->isPointerTy()) + if (!Ty->isPointerTy()) { // Attributes that only apply to pointers. 
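On the typeIncompatible rework that begins here: the new AttributeSafetyKind parameter splits the type-incompatible set into attributes that are safe to drop silently and attributes whose loss would be unsound. A sketch of how a caller might request only the safe subset; the qualified enum spelling is assumed from the ASK_SAFE_TO_DROP uses visible in this hunk:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Type.h"

    // Attributes like noalias or nonnull are meaningless on an i32 but
    // harmless to remove; sext/zext, by contrast, affect the ABI.
    llvm::AttributeMask safeToDropOnInt(llvm::LLVMContext &Ctx) {
      return llvm::AttributeFuncs::typeIncompatible(
          llvm::Type::getInt32Ty(Ctx), llvm::AttributeFuncs::ASK_SAFE_TO_DROP);
    }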
- Incompatible.addAttribute(Attribute::Nest) - .addAttribute(Attribute::NoAlias) - .addAttribute(Attribute::NoCapture) - .addAttribute(Attribute::NonNull) - .addAttribute(Attribute::ReadNone) - .addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::SwiftError) - .addAttribute(Attribute::Dereferenceable) - .addAttribute(Attribute::DereferenceableOrNull) - .addAttribute(Attribute::Preallocated) - .addAttribute(Attribute::InAlloca) - .addAttribute(Attribute::ByVal) - .addAttribute(Attribute::StructRet) - .addAttribute(Attribute::ByRef) - .addAttribute(Attribute::ElementType); - - if (!Ty->isPtrOrPtrVectorTy()) + if (ASK & ASK_SAFE_TO_DROP) + Incompatible.addAttribute(Attribute::NoAlias) + .addAttribute(Attribute::NoCapture) + .addAttribute(Attribute::NonNull) + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::Dereferenceable) + .addAttribute(Attribute::DereferenceableOrNull); + if (ASK & ASK_UNSAFE_TO_DROP) + Incompatible.addAttribute(Attribute::Nest) + .addAttribute(Attribute::SwiftError) + .addAttribute(Attribute::Preallocated) + .addAttribute(Attribute::InAlloca) + .addAttribute(Attribute::ByVal) + .addAttribute(Attribute::StructRet) + .addAttribute(Attribute::ByRef) + .addAttribute(Attribute::ElementType) + .addAttribute(Attribute::AllocatedPointer); + } + // Attributes that only apply to pointers or vectors of pointers. - Incompatible.addAttribute(Attribute::Alignment); + if (!Ty->isPtrOrPtrVectorTy()) { + if (ASK & ASK_SAFE_TO_DROP) + Incompatible.addAttribute(Attribute::Alignment); + } // Some attributes can apply to all "values" but there are no `void` values. - if (Ty->isVoidTy()) - Incompatible.addAttribute(Attribute::NoUndef); + if (Ty->isVoidTy()) { + if (ASK & ASK_SAFE_TO_DROP) + Incompatible.addAttribute(Attribute::NoUndef); + } return Incompatible; } @@ -1976,3 +2064,14 @@ void AttributeFuncs::mergeAttributesForOutlining(Function &Base, // that aspect in the merged function. mergeFnAttrs(Base, ToMerge); } + +void AttributeFuncs::updateMinLegalVectorWidthAttr(Function &Fn, + uint64_t Width) { + Attribute Attr = Fn.getFnAttribute("min-legal-vector-width"); + if (Attr.isValid()) { + uint64_t OldWidth; + Attr.getValueAsString().getAsInteger(0, OldWidth); + if (Width > OldWidth) + Fn.addFnAttr("min-legal-vector-width", llvm::utostr(Width)); + } +} diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 11839c7572e3..75594f90c926 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/AutoUpgrade.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -575,19 +576,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { F->arg_begin()->getType()); return true; } - static const Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$"); - if (vldRegex.match(Name)) { - auto fArgs = F->getFunctionType()->params(); - SmallVector Tys(fArgs.begin(), fArgs.end()); - // Can't use Intrinsic::getDeclaration here as the return types might - // then only be structurally equal. - FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false); - StringRef Suffix = - F->getContext().supportsTypedPointers() ? "p0i8" : "p0"; - NewFn = Function::Create(fType, F->getLinkage(), F->getAddressSpace(), - "llvm." + Name + "." 
+ Suffix, F->getParent()); - return true; - } static const Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$"); if (vstRegex.match(Name)) { static const Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1, @@ -760,6 +748,23 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; } case 'e': { + if (Name.startswith("experimental.vector.extract.")) { + rename(F); + Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()}; + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::vector_extract, Tys); + return true; + } + + if (Name.startswith("experimental.vector.insert.")) { + rename(F); + auto Args = F->getFunctionType()->params(); + Type *Tys[] = {Args[0], Args[1]}; + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::vector_insert, Tys); + return true; + } + SmallVector Groups; static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[a-z][0-9]+"); if (R.match(Name, &Groups)) { @@ -1016,10 +1021,35 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { if (UpgradeX86IntrinsicFunction(F, Name, NewFn)) return true; } + + auto *ST = dyn_cast(F->getReturnType()); + if (ST && (!ST->isLiteral() || ST->isPacked())) { + // Replace return type with literal non-packed struct. Only do this for + // intrinsics declared to return a struct, not for intrinsics with + // overloaded return type, in which case the exact struct type will be + // mangled into the name. + SmallVector Desc; + Intrinsic::getIntrinsicInfoTableEntries(F->getIntrinsicID(), Desc); + if (Desc.front().Kind == Intrinsic::IITDescriptor::Struct) { + auto *FT = F->getFunctionType(); + auto *NewST = StructType::get(ST->getContext(), ST->elements()); + auto *NewFT = FunctionType::get(NewST, FT->params(), FT->isVarArg()); + std::string Name = F->getName().str(); + rename(F); + NewFn = Function::Create(NewFT, F->getLinkage(), F->getAddressSpace(), + Name, F->getParent()); + + // The new function may also need remangling. 
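The two startswith cases above retire the experimental.vector.extract/insert names in favor of the stable vector_extract/vector_insert intrinsic IDs. A sketch of requesting the upgraded declaration directly, mirroring the Tys arrays built in the hunk (the function and parameter names here are invented):

    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    // The extract intrinsic is overloaded on the result type and the source
    // vector type, in that order, just like the Tys array above.
    llvm::Function *getVectorExtractDecl(llvm::Module &M, llvm::Type *ResTy,
                                         llvm::Type *SrcVecTy) {
      return llvm::Intrinsic::getDeclaration(
          &M, llvm::Intrinsic::vector_extract, {ResTy, SrcVecTy});
    }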
+ if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F)) + NewFn = *Result; + return true; + } + } + // Remangle our intrinsic since we upgrade the mangling auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F); if (Result != None) { - NewFn = Result.getValue(); + NewFn = *Result; return true; } @@ -1237,7 +1267,7 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, return EmitX86Select(Builder, Mask, Align, Passthru); } -static Value *UpgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallInst &CI, +static Value *UpgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, bool ZeroMask, bool IndexForm) { Type *Ty = CI.getType(); unsigned VecWidth = Ty->getPrimitiveSizeInBits(); @@ -1298,7 +1328,7 @@ static Value *UpgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallInst &CI, return EmitX86Select(Builder, CI.getArgOperand(3), V, PassThru); } -static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallInst &CI, +static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); @@ -1314,7 +1344,7 @@ static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallInst &CI, return Res; } -static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallInst &CI, +static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, bool IsRotateRight) { Type *Ty = CI.getType(); Value *Src = CI.getArgOperand(0); @@ -1341,7 +1371,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallInst &CI, return Res; } -static Value *upgradeX86vpcom(IRBuilder<> &Builder, CallInst &CI, unsigned Imm, +static Value *upgradeX86vpcom(IRBuilder<> &Builder, CallBase &CI, unsigned Imm, bool IsSigned) { Type *Ty = CI.getType(); Value *LHS = CI.getArgOperand(0); @@ -1380,7 +1410,7 @@ static Value *upgradeX86vpcom(IRBuilder<> &Builder, CallInst &CI, unsigned Imm, return Ext; } -static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallInst &CI, +static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, bool IsShiftRight, bool ZeroMask) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); @@ -1459,7 +1489,7 @@ static Value *UpgradeMaskedLoad(IRBuilder<> &Builder, return Builder.CreateMaskedLoad(ValTy, Ptr, Alignment, Mask, Passthru); } -static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) { +static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty); @@ -1469,7 +1499,7 @@ static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) { return Res; } -static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) { +static Value *upgradePMULDQ(IRBuilder<> &Builder, CallBase &CI, bool IsSigned) { Type *Ty = CI.getType(); // Arguments have a vXi32 type so cast to vXi64. @@ -1521,7 +1551,7 @@ static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec, return Builder.CreateBitCast(Vec, Builder.getIntNTy(std::max(NumElts, 8U))); } -static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI, +static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, unsigned CC, bool Signed) { Value *Op0 = CI.getArgOperand(0); unsigned NumElts = cast(Op0->getType())->getNumElements(); @@ -1553,7 +1583,7 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI, } // Replace a masked intrinsic with an older unmasked intrinsic. 
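The long run of CallInst-to-CallBase signature changes that follows is mechanical but has a point: CallBase is the common base of CallInst and InvokeInst, so the upgrade helpers now also apply to invokes of old intrinsics. The pattern, in miniature (a hypothetical helper, not from the patch):

    #include "llvm/IR/InstrTypes.h"

    // Accepting CallBase& means one implementation serves both plain calls
    // and invokes; no dyn_cast<CallInst> filtering is needed.
    unsigned upgradeableArgCount(llvm::CallBase &CB) {
      return CB.arg_size();
    }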
-static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallInst &CI, +static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID); Value *Rep = Builder.CreateCall(Intrin, @@ -1561,7 +1591,7 @@ static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallInst &CI, return EmitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); } -static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) { +static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallBase &CI) { Value* A = CI.getArgOperand(0); Value* B = CI.getArgOperand(1); Value* Src = CI.getArgOperand(2); @@ -1576,7 +1606,7 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) { } -static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) { +static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallBase &CI) { Value* Op = CI.getArgOperand(0); Type* ReturnOp = CI.getType(); unsigned NumElts = cast(CI.getType())->getNumElements(); @@ -1586,7 +1616,7 @@ static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) { // Replace intrinsic with unmasked version and a select. static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, - CallInst &CI, Value *&Rep) { + CallBase &CI, Value *&Rep) { Name = Name.substr(12); // Remove avx512.mask. unsigned VecWidth = CI.getType()->getPrimitiveSizeInBits(); @@ -1834,7 +1864,7 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } -static Value *UpgradeARMIntrinsicCall(StringRef Name, CallInst *CI, Function *F, +static Value *UpgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the @@ -1921,12 +1951,12 @@ static Value *UpgradeARMIntrinsicCall(StringRef Name, CallInst *CI, Function *F, Function *Fn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); return Builder.CreateCall(Fn, Ops, CI->getName()); } - llvm_unreachable("Unknown function for ARM CallInst upgrade."); + llvm_unreachable("Unknown function for ARM CallBase upgrade."); } /// Upgrade a call to an old intrinsic. All argument and return casting must be /// provided to seamlessly integrate with existing context. -void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { +void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); LLVMContext &C = CI->getContext(); IRBuilder<> Builder(C); @@ -3774,7 +3804,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { } else if (IsARM) { Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder); } else { - llvm_unreachable("Unknown function for CallInst upgrade."); + llvm_unreachable("Unknown function for CallBase upgrade."); } if (Rep) @@ -3783,12 +3813,33 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } - const auto &DefaultCase = [&NewFn, &CI]() -> void { - // Handle generic mangling change, but nothing else - assert( - (CI->getCalledFunction()->getName() != NewFn->getName()) && - "Unknown function for CallInst upgrade and isn't just a name change"); - CI->setCalledFunction(NewFn); + const auto &DefaultCase = [&]() -> void { + if (CI->getFunctionType() == NewFn->getFunctionType()) { + // Handle generic mangling change. 
+ assert( + (CI->getCalledFunction()->getName() != NewFn->getName()) && + "Unknown function for CallBase upgrade and isn't just a name change"); + CI->setCalledFunction(NewFn); + return; + } + + // This must be an upgrade from a named to a literal struct. + auto *OldST = cast(CI->getType()); + assert(OldST != NewFn->getReturnType() && "Return type must have changed"); + assert(OldST->getNumElements() == + cast(NewFn->getReturnType())->getNumElements() && + "Must have same number of elements"); + + SmallVector Args(CI->args()); + Value *NewCI = Builder.CreateCall(NewFn, Args); + Value *Res = PoisonValue::get(OldST); + for (unsigned Idx = 0; Idx < OldST->getNumElements(); ++Idx) { + Value *Elem = Builder.CreateExtractValue(NewCI, Idx); + Res = Builder.CreateInsertValue(Res, Elem, Idx); + } + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + return; }; CallInst *NewCall = nullptr; switch (NewFn->getIntrinsicID()) { @@ -3796,13 +3847,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { DefaultCase(); return; } - case Intrinsic::arm_neon_vld1: - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: @@ -3885,8 +3929,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { case Intrinsic::ptr_annotation: // Upgrade from versions that lacked the annotation attribute argument. - assert(CI->arg_size() == 4 && - "Before LLVM 12.0 this intrinsic took four arguments"); + if (CI->arg_size() != 4) { + DefaultCase(); + return; + } + // Create a new call with an added null annotation attribute argument. NewCall = Builder.CreateCall( NewFn, @@ -4047,6 +4094,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Args[4] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(4)}; NewCall = Builder.CreateCall(NewFn, Args); + AttributeList OldAttrs = CI->getAttributes(); + AttributeList NewAttrs = AttributeList::get( + C, OldAttrs.getFnAttrs(), OldAttrs.getRetAttrs(), + {OldAttrs.getParamAttrs(0), OldAttrs.getParamAttrs(1), + OldAttrs.getParamAttrs(2), OldAttrs.getParamAttrs(4)}); + NewCall->setAttributes(NewAttrs); auto *MemCI = cast(NewCall); // All mem intrinsics support dest alignment. const ConstantInt *Align = cast(CI->getArgOperand(3)); @@ -4074,8 +4127,8 @@ void llvm::UpgradeCallsToIntrinsic(Function *F) { // Replace all users of the old function with the new function or new // instructions. This is not a range loop because the call is deleted. for (User *U : make_early_inc_range(F->users())) - if (CallInst *CI = dyn_cast(U)) - UpgradeIntrinsicCall(CI, NewFn); + if (CallBase *CB = dyn_cast(U)) + UpgradeIntrinsicCall(CB, NewFn); // Remove old function, no longer used, from the module. F->eraseFromParent(); @@ -4126,7 +4179,7 @@ Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy, return nullptr; } -Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) { +Constant *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) { if (Opc != Instruction::BitCast) return nullptr; @@ -4358,6 +4411,24 @@ bool llvm::UpgradeModuleFlags(Module &M) { } } } + + // Upgrade branch protection and return address signing module flags. The + // module flag behavior for these fields were Error and now they are Min. 
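The module-flag hunk starting here rewrites branch-target-enforcement and sign-return-address* flags from Error to Min behavior, so that modules built with and without the hardening can still be linked. Newly generated IR would attach the flag with Min directly; a minimal sketch:

    #include "llvm/IR/Module.h"

    void enableBTI(llvm::Module &M) {
      // Min behavior: linking takes the minimum of the two values instead
      // of erroring out on a mismatch as the old Error behavior did.
      M.addModuleFlag(llvm::Module::Min, "branch-target-enforcement", 1);
    }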
+ if (ID->getString() == "branch-target-enforcement" || + ID->getString().startswith("sign-return-address")) { + if (auto *Behavior = + mdconst::dyn_extract_or_null(Op->getOperand(0))) { + if (Behavior->getLimitedValue() == Module::Error) { + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + Metadata *Ops[3] = { + ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Module::Min)), + Op->getOperand(1), Op->getOperand(2)}; + ModFlags->setOperand(I, MDNode::get(M.getContext(), Ops)); + Changed = true; + } + } + } + // Upgrade Objective-C Image Info Section. Removed the whitespce in the // section name so that llvm-lto will not complain about mismatching // module flags that is functionally the same. @@ -4469,7 +4540,7 @@ namespace { // folding and other libcall simplification. The nobuiltin attribute on the // callsite has the same effect. struct StrictFPUpgradeVisitor : public InstVisitor { - StrictFPUpgradeVisitor() {} + StrictFPUpgradeVisitor() = default; void visitCallBase(CallBase &Call) { if (!Call.isStrictFP()) @@ -4492,13 +4563,6 @@ void llvm::UpgradeFunctionAttributes(Function &F) { SFPV.visit(F); } - if (F.getCallingConv() == CallingConv::X86_INTR && - !F.arg_empty() && !F.hasParamAttribute(0, Attribute::ByVal)) { - Type *ByValTy = F.getArg(0)->getType()->getPointerElementType(); - Attribute NewAttr = Attribute::getWithByValType(F.getContext(), ByValTy); - F.addParamAttr(0, NewAttr); - } - // Remove all incompatibile attributes from function. F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); for (auto &Arg : F.args()) @@ -4628,3 +4692,15 @@ void llvm::UpgradeAttributes(AttrBuilder &B) { B.addAttribute(Attribute::NullPointerIsValid); } } + +void llvm::UpgradeOperandBundles(std::vector &Bundles) { + + // clang.arc.attachedcall bundles are now required to have an operand. + // If they don't, it's okay to drop them entirely: when there is an operand, + // the "attachedcall" is meaningful and required, but without an operand, + // it's just a marker NOP. Dropping it merely prevents an optimization. + erase_if(Bundles, [&](OperandBundleDef &OBD) { + return OBD.getTag() == "clang.arc.attachedcall" && + OBD.inputs().empty(); + }); +} diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 99e3afaa8ba8..f064ff503eba 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -148,12 +148,6 @@ const Module *BasicBlock::getModule() const { return getParent()->getParent(); } -const Instruction *BasicBlock::getTerminator() const { - if (InstList.empty() || !InstList.back().isTerminator()) - return nullptr; - return &InstList.back(); -} - const CallInst *BasicBlock::getTerminatingMustTailCall() const { if (InstList.empty()) return nullptr; diff --git a/llvm/lib/IR/BuiltinGCs.cpp b/llvm/lib/IR/BuiltinGCs.cpp index 31ee86383e78..e9ef034c488f 100644 --- a/llvm/lib/IR/BuiltinGCs.cpp +++ b/llvm/lib/IR/BuiltinGCs.cpp @@ -53,7 +53,7 @@ public: /// while introducing only minor runtime overhead. 
class ShadowStackGC : public GCStrategy { public: - ShadowStackGC() {} + ShadowStackGC() = default; }; /// A GCStrategy which serves as an example for the usage of a statepoint based diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 936b1fc2ff6f..41b4f2919221 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -16,7 +16,7 @@ // //===----------------------------------------------------------------------===// -#include "ConstantFold.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Constants.h" @@ -379,7 +379,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, opc != Instruction::AddrSpaceCast && // Do not fold bitcast (gep) with inrange index, as this loses // information. - !cast(CE)->getInRangeIndex().hasValue() && + !cast(CE)->getInRangeIndex() && // Do not fold if the gep type is a vector, as bitcasting // operand 0 of a vector gep will result in a bitcast between // different sizes. @@ -435,14 +435,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, if (ConstantFP *FPC = dyn_cast(V)) { bool ignored; APFloat Val = FPC->getValueAPF(); - Val.convert(DestTy->isHalfTy() ? APFloat::IEEEhalf() : - DestTy->isFloatTy() ? APFloat::IEEEsingle() : - DestTy->isDoubleTy() ? APFloat::IEEEdouble() : - DestTy->isX86_FP80Ty() ? APFloat::x87DoubleExtended() : - DestTy->isFP128Ty() ? APFloat::IEEEquad() : - DestTy->isPPC_FP128Ty() ? APFloat::PPCDoubleDouble() : - APFloat::Bogus(), - APFloat::rmNearestTiesToEven, &ignored); + Val.convert(DestTy->getFltSemantics(), APFloat::rmNearestTiesToEven, + &ignored); return ConstantFP::get(V->getContext(), Val); } return nullptr; // Can't fold. @@ -683,6 +677,11 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val, if (isa(Idx)) return PoisonValue::get(Val->getType()); + // Inserting null into all zeros is still all zeros. + // TODO: This is true for undef and poison splats too. + if (isa(Val) && Elt->isNullValue()) + return Val; + ConstantInt *CIdx = dyn_cast(Idx); if (!CIdx) return nullptr; @@ -724,7 +723,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, // Undefined shuffle mask -> undefined value. if (all_of(Mask, [](int Elt) { return Elt == UndefMaskElem; })) { - return UndefValue::get(FixedVectorType::get(EltTy, MaskNumElts)); + return UndefValue::get(VectorType::get(EltTy, MaskEltCount)); } // If the mask is all zeros this is a splat, no need to go through all @@ -2036,8 +2035,18 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, // If inbounds, we can choose an out-of-bounds pointer as a base pointer. return InBounds ? PoisonValue::get(GEPTy) : UndefValue::get(GEPTy); - Constant *Idx0 = cast(Idxs[0]); - if (Idxs.size() == 1 && (Idx0->isNullValue() || isa(Idx0))) + auto IsNoOp = [&]() { + // For non-opaque pointers having multiple indices will change the result + // type of the GEP. + if (!C->getType()->getScalarType()->isOpaquePointerTy() && Idxs.size() != 1) + return false; + + return all_of(Idxs, [](Value *Idx) { + Constant *IdxC = cast(Idx); + return IdxC->isNullValue() || isa(IdxC); + }); + }; + if (IsNoOp()) return GEPTy->isVectorTy() && !C->getType()->isVectorTy() ? 
ConstantVector::getSplat( cast(GEPTy)->getElementCount(), C) @@ -2090,6 +2099,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, // i32* getelementptr ([3 x i32]* %X, i64 0, i64 0) // // Don't fold if the cast is changing address spaces. + Constant *Idx0 = cast(Idxs[0]); if (CE->isCast() && Idxs.size() > 1 && Idx0->isNullValue()) { PointerType *SrcPtrTy = dyn_cast(CE->getOperand(0)->getType()); diff --git a/llvm/lib/IR/ConstantFold.h b/llvm/lib/IR/ConstantFold.h deleted file mode 100644 index 1aa44f4d21e5..000000000000 --- a/llvm/lib/IR/ConstantFold.h +++ /dev/null @@ -1,57 +0,0 @@ -//===-- ConstantFolding.h - Internal Constant Folding Interface -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the (internal) constant folding interfaces for LLVM. These -// interfaces are used by the ConstantExpr::get* methods to automatically fold -// constants when possible. -// -// These operators may return a null object if they don't know how to perform -// the specified operation on the specified constant types. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_IR_CONSTANTFOLD_H -#define LLVM_LIB_IR_CONSTANTFOLD_H - -#include "llvm/ADT/Optional.h" -#include "llvm/IR/InstrTypes.h" - -namespace llvm { -template class ArrayRef; - class Value; - class Constant; - class Type; - - // Constant fold various types of instruction... - Constant *ConstantFoldCastInstruction( - unsigned opcode, ///< The opcode of the cast - Constant *V, ///< The source constant - Type *DestTy ///< The destination type - ); - Constant *ConstantFoldSelectInstruction(Constant *Cond, - Constant *V1, Constant *V2); - Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx); - Constant *ConstantFoldInsertElementInstruction(Constant *Val, Constant *Elt, - Constant *Idx); - Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, - ArrayRef Mask); - Constant *ConstantFoldExtractValueInstruction(Constant *Agg, - ArrayRef Idxs); - Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, - ArrayRef Idxs); - Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); - Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1, - Constant *V2); - Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, - Constant *C1, Constant *C2); - Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool InBounds, - Optional InRangeIndex, - ArrayRef Idxs); -} // End llvm namespace - -#endif diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index a0f2179bddb4..9d239101d8fd 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -75,6 +75,24 @@ ConstantRange ConstantRange::fromKnownBits(const KnownBits &Known, return ConstantRange(Lower, Upper + 1); } +KnownBits ConstantRange::toKnownBits() const { + // TODO: We could return conflicting known bits here, but consumers are + // likely not prepared for that. + if (isEmptySet()) + return KnownBits(getBitWidth()); + + // We can only retain the top bits that are the same between min and max. 
+ APInt Min = getUnsignedMin(); + APInt Max = getUnsignedMax(); + KnownBits Known = KnownBits::makeConstant(Min); + if (Optional DifferentBit = + APIntOps::GetMostSignificantDifferentBit(Min, Max)) { + Known.Zero.clearLowBits(*DifferentBit + 1); + Known.One.clearLowBits(*DifferentBit + 1); + } + return Known; +} + ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred, const ConstantRange &CR) { if (CR.isEmptySet()) @@ -721,15 +739,23 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, case Instruction::UIToFP: { // TODO: use input range if available auto BW = getBitWidth(); - APInt Min = APInt::getMinValue(BW).zextOrSelf(ResultBitWidth); - APInt Max = APInt::getMaxValue(BW).zextOrSelf(ResultBitWidth); + APInt Min = APInt::getMinValue(BW); + APInt Max = APInt::getMaxValue(BW); + if (ResultBitWidth > BW) { + Min = Min.zext(ResultBitWidth); + Max = Max.zext(ResultBitWidth); + } return ConstantRange(std::move(Min), std::move(Max)); } case Instruction::SIToFP: { // TODO: use input range if available auto BW = getBitWidth(); - APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(ResultBitWidth); - APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(ResultBitWidth); + APInt SMin = APInt::getSignedMinValue(BW); + APInt SMax = APInt::getSignedMaxValue(BW); + if (ResultBitWidth > BW) { + SMin = SMin.sext(ResultBitWidth); + SMax = SMax.sext(ResultBitWidth); + } return ConstantRange(std::move(SMin), std::move(SMax)); } case Instruction::FPTrunc: @@ -1212,7 +1238,10 @@ ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const { // separately by combining division results with the appropriate signs. APInt Zero = APInt::getZero(getBitWidth()); APInt SignedMin = APInt::getSignedMinValue(getBitWidth()); - ConstantRange PosFilter(APInt(getBitWidth(), 1), SignedMin); + // There are no positive 1-bit values. The 1 would get interpreted as -1. + ConstantRange PosFilter = + getBitWidth() == 1 ? getEmpty() + : ConstantRange(APInt(getBitWidth(), 1), SignedMin); ConstantRange NegFilter(SignedMin, Zero); ConstantRange PosL = intersectWith(PosFilter); ConstantRange NegL = intersectWith(NegFilter); @@ -1368,34 +1397,29 @@ ConstantRange ConstantRange::binaryNot() const { return ConstantRange(APInt::getAllOnes(getBitWidth())).sub(*this); } -ConstantRange -ConstantRange::binaryAnd(const ConstantRange &Other) const { +ConstantRange ConstantRange::binaryAnd(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return getEmpty(); - // Use APInt's implementation of AND for single element ranges. - if (isSingleElement() && Other.isSingleElement()) - return {*getSingleElement() & *Other.getSingleElement()}; - - // TODO: replace this with something less conservative - - APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()); - return getNonEmpty(APInt::getZero(getBitWidth()), std::move(umin) + 1); + ConstantRange KnownBitsRange = + fromKnownBits(toKnownBits() & Other.toKnownBits(), false); + ConstantRange UMinUMaxRange = + getNonEmpty(APInt::getZero(getBitWidth()), + APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()) + 1); + return KnownBitsRange.intersectWith(UMinUMaxRange); } -ConstantRange -ConstantRange::binaryOr(const ConstantRange &Other) const { +ConstantRange ConstantRange::binaryOr(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return getEmpty(); - // Use APInt's implementation of OR for single element ranges. 
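The ConstantRange changes in this stretch add toKnownBits and use it to tighten binaryAnd/binaryOr (and binaryXor below) beyond the old umin/umax-only bounds. A worked example of what toKnownBits yields; the numbers are checked by hand, not taken from the patch:

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/Support/KnownBits.h"

    llvm::KnownBits rangeBits() {
      // [32, 36): min is 32 = 0b00100000, max is 35 = 0b00100011. They
      // agree above bit 1, so only the low two bits remain unknown:
      // Known.One == 0b00100000 and Known.Zero == 0b11011100.
      llvm::ConstantRange CR(llvm::APInt(8, 32), llvm::APInt(8, 36));
      return CR.toKnownBits();
    }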
- if (isSingleElement() && Other.isSingleElement()) - return {*getSingleElement() | *Other.getSingleElement()}; - - // TODO: replace this with something less conservative - - APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()); - return getNonEmpty(std::move(umax), APInt::getZero(getBitWidth())); + ConstantRange KnownBitsRange = + fromKnownBits(toKnownBits() | Other.toKnownBits(), false); + // Upper wrapped range. + ConstantRange UMaxUMinRange = + getNonEmpty(APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()), + APInt::getZero(getBitWidth())); + return KnownBitsRange.intersectWith(UMaxUMinRange); } ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const { @@ -1412,8 +1436,7 @@ ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const { if (isSingleElement() && getSingleElement()->isAllOnes()) return Other.binaryNot(); - // TODO: replace this with something less conservative - return getFull(); + return fromKnownBits(toKnownBits() ^ Other.toKnownBits(), /*IsSigned*/false); } ConstantRange diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index b862a159127f..0bf5e09d6647 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -11,12 +11,12 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/Constants.h" -#include "ConstantFold.h" #include "LLVMContextImpl.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" @@ -27,7 +27,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -353,26 +352,14 @@ Constant *Constant::getNullValue(Type *Ty) { case Type::IntegerTyID: return ConstantInt::get(Ty, 0); case Type::HalfTyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::IEEEhalf())); case Type::BFloatTyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::BFloat())); case Type::FloatTyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::IEEEsingle())); case Type::DoubleTyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::IEEEdouble())); case Type::X86_FP80TyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::x87DoubleExtended())); case Type::FP128TyID: - return ConstantFP::get(Ty->getContext(), - APFloat::getZero(APFloat::IEEEquad())); case Type::PPC_FP128TyID: - return ConstantFP::get(Ty->getContext(), APFloat(APFloat::PPCDoubleDouble(), - APInt::getZero(128))); + return ConstantFP::get(Ty->getContext(), + APFloat::getZero(Ty->getFltSemantics())); case Type::PointerTyID: return ConstantPointerNull::get(cast(Ty)); case Type::StructTyID: @@ -560,8 +547,6 @@ void llvm::deleteConstant(Constant *C) { delete static_cast(C); else if (isa(C)) delete static_cast(C); - else if (isa(C)) - delete static_cast(C); else if (isa(C)) delete static_cast(C); else if (isa(C)) @@ -577,38 +562,47 @@ void llvm::deleteConstant(Constant *C) { } static bool canTrapImpl(const Constant *C, - SmallPtrSetImpl &NonTrappingOps) { - assert(C->getType()->isFirstClassType() && "Cannot evaluate aggregate vals!"); - // The only thing that could possibly trap are constant 
exprs. + SmallPtrSetImpl &NonTrappingOps) { + assert(C->getType()->isFirstClassType() && + "Cannot evaluate non-first-class types!"); + // ConstantExpr or ConstantAggregate trap if any operands can trap. + if (isa(C) || isa(C)) { + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { + const Constant *Op = cast(C->getOperand(i)); + if (isa(Op) || isa(Op)) { + if (NonTrappingOps.insert(Op).second && canTrapImpl(Op, NonTrappingOps)) + return true; + } + } + } + + // The only leafs that can trap are constant expressions. const ConstantExpr *CE = dyn_cast(C); if (!CE) return false; - // ConstantExpr traps if any operands can trap. - for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { - if (ConstantExpr *Op = dyn_cast(CE->getOperand(i))) { - if (NonTrappingOps.insert(Op).second && canTrapImpl(Op, NonTrappingOps)) - return true; - } - } - // Otherwise, only specific operations can trap. switch (CE->getOpcode()) { default: return false; - case Instruction::UDiv: case Instruction::SDiv: - case Instruction::URem: case Instruction::SRem: - // Div and rem can trap if the RHS is not known to be non-zero. - if (!isa(CE->getOperand(1)) ||CE->getOperand(1)->isNullValue()) + // Signed div/rem can trap for SignedMin / -1. + if (!CE->getOperand(0)->isNotMinSignedValue() && + (!isa(CE->getOperand(1)) || + CE->getOperand(1)->isAllOnesValue())) return true; - return false; + LLVM_FALLTHROUGH; + case Instruction::UDiv: + case Instruction::URem: + // Div and rem can trap if the RHS is not known to be non-zero. + return !isa(CE->getOperand(1)) || + CE->getOperand(1)->isNullValue(); } } bool Constant::canTrap() const { - SmallPtrSet NonTrappingOps; + SmallPtrSet NonTrappingOps; return canTrapImpl(this, NonTrappingOps); } @@ -742,9 +736,13 @@ static bool constantIsDead(const Constant *C, bool RemoveDeadUsers) { ++I; } - if (RemoveDeadUsers) + if (RemoveDeadUsers) { + // If C is only used by metadata, it should not be preserved but should + // have its uses replaced. 
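The reworked canTrapImpl above adds a case the old code missed: signed division traps not only on a zero divisor but also on overflow. The underlying arithmetic fact, as a plain C++ predicate (a sketch for illustration, not LLVM code):

    #include <cstdint>
    #include <limits>

    // INT32_MIN / -1 would be +2147483648, which is unrepresentable in
    // int32_t, so an sdiv/srem whose LHS may be the minimum signed value
    // and whose RHS may be -1 must be treated as potentially trapping.
    bool wouldSDivTrap(int32_t Num, int32_t Den) {
      return Den == 0 ||
             (Num == std::numeric_limits<int32_t>::min() && Den == -1);
    }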
+ ReplaceableMetadataImpl::SalvageDebugInfo(*C); const_cast(C)->destroyConstant(); - + } + return true; } @@ -1046,9 +1044,9 @@ Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) { return C; } -Constant *ConstantFP::getNegativeZero(Type *Ty) { +Constant *ConstantFP::getZero(Type *Ty, bool Negative) { const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics(); - APFloat NegZero = APFloat::getZero(Semantics, /*Negative=*/true); + APFloat NegZero = APFloat::getZero(Semantics, Negative); Constant *C = get(Ty->getContext(), NegZero); if (VectorType *VTy = dyn_cast(Ty)) @@ -1057,7 +1055,6 @@ Constant *ConstantFP::getNegativeZero(Type *Ty) { return C; } - Constant *ConstantFP::getZeroValueForNegation(Type *Ty) { if (Ty->isFPOrFPVectorTy()) return getNegativeZero(Ty); @@ -1492,15 +1489,10 @@ bool ConstantExpr::isCompare() const { } bool ConstantExpr::hasIndices() const { - return getOpcode() == Instruction::ExtractValue || - getOpcode() == Instruction::InsertValue; + return getOpcode() == Instruction::InsertValue; } ArrayRef ConstantExpr::getIndices() const { - if (const ExtractValueConstantExpr *EVCE = - dyn_cast(this)) - return EVCE->Indices; - return cast(this)->Indices; } @@ -1550,8 +1542,6 @@ Constant *ConstantExpr::getWithOperands(ArrayRef Ops, Type *Ty, case Instruction::InsertValue: return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices(), OnlyIfReducedTy); - case Instruction::ExtractValue: - return ConstantExpr::getExtractValue(Ops[0], getIndices(), OnlyIfReducedTy); case Instruction::FNeg: return ConstantExpr::getFNeg(Ops[0]); case Instruction::ShuffleVector: @@ -2065,6 +2055,17 @@ Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) { return getTrunc(C, Ty); } +Constant *ConstantExpr::getSExtOrTrunc(Constant *C, Type *Ty) { + assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && + "Can only sign extend/truncate integers!"); + Type *CTy = C->getType(); + if (CTy->getScalarSizeInBits() < Ty->getScalarSizeInBits()) + return getSExt(C, Ty); + if (CTy->getScalarSizeInBits() > Ty->getScalarSizeInBits()) + return getTrunc(C, Ty); + return C; +} + Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && @@ -2233,8 +2234,8 @@ Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, "PtrToInt destination must be integer or integer vector"); assert(isa(C->getType()) == isa(DstTy)); if (isa(C->getType())) - assert(cast(C->getType())->getNumElements() == - cast(DstTy)->getNumElements() && + assert(cast(C->getType())->getElementCount() == + cast(DstTy)->getElementCount() && "Invalid cast between a different number of vector elements"); return getFoldedCast(Instruction::PtrToInt, C, DstTy, OnlyIfReduced); } @@ -2667,30 +2668,6 @@ Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val, return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } -Constant *ConstantExpr::getExtractValue(Constant *Agg, ArrayRef Idxs, - Type *OnlyIfReducedTy) { - assert(Agg->getType()->isFirstClassType() && - "Tried to create extractelement operation on non-first-class type!"); - - Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs); - (void)ReqTy; - assert(ReqTy && "extractvalue indices invalid!"); - - assert(Agg->getType()->isFirstClassType() && - "Non-first-class type for constant extractvalue expression"); - if (Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs)) - return FC; - 
-  if (OnlyIfReducedTy == ReqTy)
-    return nullptr;
-
-  Constant *ArgVec[] = { Agg };
-  const ConstantExprKeyType Key(Instruction::ExtractValue, ArgVec, 0, 0, Idxs);
-
-  LLVMContextImpl *pImpl = Agg->getContext().pImpl;
-  return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
-}
-
 Constant *ConstantExpr::getNeg(Constant *C, bool HasNUW, bool HasNSW) {
   assert(C->getType()->isIntOrIntVectorTy() &&
          "Cannot NEG a nonintegral value!");
@@ -2833,7 +2810,7 @@ Constant *ConstantExpr::getExactLogBase2(Constant *C) {
 }
 
 Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty,
-                                         bool AllowRHSConstant) {
+                                         bool AllowRHSConstant, bool NSZ) {
   assert(Instruction::isBinaryOp(Opcode) && "Only binops allowed");
 
   // Commutative opcodes: it does not matter if AllowRHSConstant is set.
@@ -2848,8 +2825,7 @@ Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty,
   case Instruction::And: // X & -1 = X
     return Constant::getAllOnesValue(Ty);
   case Instruction::FAdd: // X + -0.0 = X
-    // TODO: If the fadd has 'nsz', should we return +0.0?
-    return ConstantFP::getNegativeZero(Ty);
+    return ConstantFP::getZero(Ty, !NSZ);
   case Instruction::FMul: // X * 1.0 = X
     return ConstantFP::get(Ty, 1.0);
   default:
@@ -3544,8 +3520,6 @@ Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
   case Instruction::InsertValue:
     return InsertValueInst::Create(Ops[0], Ops[1], getIndices(), "",
                                    InsertBefore);
-  case Instruction::ExtractValue:
-    return ExtractValueInst::Create(Ops[0], getIndices(), "", InsertBefore);
   case Instruction::ShuffleVector:
     return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "",
                                  InsertBefore);
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index 4056c5748081..21ef1c0d9f64 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -209,36 +209,6 @@ public:
   }
 };
 
-/// ExtractValueConstantExpr - This class is private to
-/// Constants.cpp, and is used behind the scenes to implement
-/// extractvalue constant exprs.
-class ExtractValueConstantExpr final : public ConstantExpr {
-public:
-  ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
-                           Type *DestTy)
-      : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
-        Indices(IdxList.begin(), IdxList.end()) {
-    Op<0>() = Agg;
-  }
-
-  // allocate space for exactly one operand
-  void *operator new(size_t S) { return User::operator new(S, 1); }
-  void operator delete(void *Ptr) { User::operator delete(Ptr); }
-
-  /// Indices - These identify which value to extract.
-  const SmallVector<unsigned, 4> Indices;
-
-  /// Transparently provide more efficient getOperand methods.
-  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
-
-  static bool classof(const ConstantExpr *CE) {
-    return CE->getOpcode() == Instruction::ExtractValue;
-  }
-  static bool classof(const Value *V) {
-    return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
-  }
-};
-
 /// InsertValueConstantExpr - This class is private to
 /// Constants.cpp, and is used behind the scenes to implement
 /// insertvalue constant exprs.
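// ---------------------------------------------------------------------------
// Usage sketch (illustrative aside, not part of the vendored diff): with
// ExtractValueConstantExpr removed above, ConstantExpr::getExtractValue no
// longer exists. Clients go through IRBuilder instead, whose constant folder
// still yields a plain Constant for constant aggregates. The helper names
// below are assumptions, not upstream API.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *extractField0(IRBuilder<> &B, Value *Agg) {
  // Folds to a Constant when Agg is a constant aggregate; otherwise emits a
  // real extractvalue instruction at the current insertion point.
  return B.CreateExtractValue(Agg, /*Idxs=*/{0});
}

static Constant *signResizeToI32(Constant *C) {
  // The getSExtOrTrunc helper added in this patch: sign-extends narrower
  // constants, truncates wider ones, and returns C unchanged on a width match.
  return ConstantExpr::getSExtOrTrunc(C, Type::getInt32Ty(C->getContext()));
}
// ---------------------------------------------------------------------------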
@@ -362,11 +332,6 @@ struct OperandTraits : public FixedNumOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value) -template <> -struct OperandTraits - : public FixedNumOperandTraits {}; -DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractValueConstantExpr, Value) - template <> struct OperandTraits : public FixedNumOperandTraits {}; @@ -620,8 +585,6 @@ public: return new ShuffleVectorConstantExpr(Ops[0], Ops[1], ShuffleMask); case Instruction::InsertValue: return new InsertValueConstantExpr(Ops[0], Ops[1], Indexes, Ty); - case Instruction::ExtractValue: - return new ExtractValueConstantExpr(Ops[0], Indexes, Ty); case Instruction::GetElementPtr: return GetElementPtrConstantExpr::Create(ExplicitTy, Ops[0], Ops.slice(1), Ty, SubclassOptionalData); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 7ed156d552b1..4b9189ca5baa 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -115,6 +115,10 @@ void LLVMContextSetDiscardValueNames(LLVMContextRef C, LLVMBool Discard) { unwrap(C)->setDiscardValueNames(Discard); } +void LLVMContextSetOpaquePointers(LLVMContextRef C, LLVMBool OpaquePointers) { + unwrap(C)->setOpaquePointers(OpaquePointers); +} + void LLVMContextDispose(LLVMContextRef C) { delete unwrap(C); } @@ -534,6 +538,8 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMTokenTypeKind; case Type::ScalableVectorTyID: return LLVMScalableVectorTypeKind; + case Type::DXILPointerTyID: + llvm_unreachable("DXIL pointers are unsupported via the C API"); } llvm_unreachable("Unhandled TypeID."); } @@ -786,6 +792,10 @@ LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) { return wrap(PointerType::get(unwrap(ElementType), AddressSpace)); } +LLVMBool LLVMPointerTypeIsOpaque(LLVMTypeRef Ty) { + return unwrap(Ty)->isOpaquePointerTy(); +} + LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) { return wrap(FixedVectorType::get(unwrap(ElementType), ElementCount)); } @@ -798,7 +808,7 @@ LLVMTypeRef LLVMScalableVectorType(LLVMTypeRef ElementType, LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) { auto *Ty = unwrap(WrappedTy); if (auto *PTy = dyn_cast(Ty)) - return wrap(PTy->getPointerElementType()); + return wrap(PTy->getNonOpaquePointerElementType()); if (auto *ATy = dyn_cast(Ty)) return wrap(ATy->getElementType()); return wrap(cast(Ty)->getElementType()); @@ -822,6 +832,10 @@ unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) { /*--.. 
Operations on other types ...........................................--*/ +LLVMTypeRef LLVMPointerTypeInContext(LLVMContextRef C, unsigned AddressSpace) { + return wrap(PointerType::get(*unwrap(C), AddressSpace)); +} + LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C) { return wrap(Type::getVoidTy(*unwrap(C))); } @@ -1431,6 +1445,10 @@ LLVMValueRef LLVMConstString(const char *Str, unsigned Length, DontNullTerminate); } +LLVMValueRef LLVMGetAggregateElement(LLVMValueRef C, unsigned Idx) { + return wrap(unwrap(C)->getAggregateElement(Idx)); +} + LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx) { return wrap(unwrap(C)->getElementAsConstant(idx)); } @@ -1857,12 +1875,6 @@ LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant, IntMask)); } -LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList, - unsigned NumIdx) { - return wrap(ConstantExpr::getExtractValue(unwrap(AggConstant), - makeArrayRef(IdxList, NumIdx))); -} - LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant, LLVMValueRef ElementValueConstant, unsigned *IdxList, unsigned NumIdx) { @@ -2061,13 +2073,13 @@ LLVMTypeRef LLVMGlobalGetValueType(LLVMValueRef Global) { unsigned LLVMGetAlignment(LLVMValueRef V) { Value *P = unwrap(V); if (GlobalObject *GV = dyn_cast(P)) - return GV->getAlignment(); + return GV->getAlign() ? GV->getAlign()->value() : 0; if (AllocaInst *AI = dyn_cast(P)) - return AI->getAlignment(); + return AI->getAlign().value(); if (LoadInst *LI = dyn_cast(P)) - return LI->getAlignment(); + return LI->getAlign().value(); if (StoreInst *SI = dyn_cast(P)) - return SI->getAlignment(); + return SI->getAlign().value(); if (AtomicRMWInst *RMWI = dyn_cast(P)) return RMWI->getAlign().value(); if (AtomicCmpXchgInst *CXI = dyn_cast(P)) @@ -3919,6 +3931,12 @@ LLVMValueRef LLVMBuildFPCast(LLVMBuilderRef B, LLVMValueRef Val, return wrap(unwrap(B)->CreateFPCast(unwrap(Val), unwrap(DestTy), Name)); } +LLVMOpcode LLVMGetCastOpcode(LLVMValueRef Src, LLVMBool SrcIsSigned, + LLVMTypeRef DestTy, LLVMBool DestIsSigned) { + return map_to_llvmopcode(CastInst::getCastOpcode( + unwrap(Src), SrcIsSigned, unwrap(DestTy), DestIsSigned)); +} + /*--.. 
Comparisons .........................................................--*/ LLVMValueRef LLVMBuildICmp(LLVMBuilderRef B, LLVMIntPredicate Op, diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index dc5768dd4f26..34ffc9425281 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -19,7 +19,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; using namespace llvm::dwarf; @@ -293,6 +292,22 @@ DIStringType *DIBuilder::createStringType(StringRef Name, uint64_t SizeInBits) { SizeInBits, 0); } +DIStringType *DIBuilder::createStringType(StringRef Name, + DIVariable *StringLength, + DIExpression *StrLocationExp) { + assert(!Name.empty() && "Unable to create type without name"); + return DIStringType::get(VMContext, dwarf::DW_TAG_string_type, Name, + StringLength, nullptr, StrLocationExp, 0, 0, 0); +} + +DIStringType *DIBuilder::createStringType(StringRef Name, + DIExpression *StringLengthExp, + DIExpression *StrLocationExp) { + assert(!Name.empty() && "Unable to create type without name"); + return DIStringType::get(VMContext, dwarf::DW_TAG_string_type, Name, nullptr, + StringLengthExp, StrLocationExp, 0, 0, 0); +} + DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) { return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0, 0, 0, None, DINode::FlagZero); @@ -831,14 +846,15 @@ DISubprogram *DIBuilder::createFunction( unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine, DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags, DITemplateParameterArray TParams, DISubprogram *Decl, - DITypeArray ThrownTypes, DINodeArray Annotations) { + DITypeArray ThrownTypes, DINodeArray Annotations, + StringRef TargetFuncName) { bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition; auto *Node = getSubprogram( /*IsDistinct=*/IsDefinition, VMContext, getNonCompileUnitScope(Context), Name, LinkageName, File, LineNo, Ty, ScopeLine, nullptr, 0, 0, Flags, SPFlags, IsDefinition ? CUNode : nullptr, TParams, Decl, MDTuple::getTemporary(VMContext, None).release(), ThrownTypes, - Annotations); + Annotations, TargetFuncName); if (IsDefinition) AllSubprograms.push_back(Node); diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index b9fc5261fefe..50799327c78a 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -15,6 +15,7 @@ #include "MetadataImpl.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -26,7 +27,7 @@ using namespace llvm; namespace llvm { // Use FS-AFDO discriminator. 
cl::opt EnableFSDiscriminator( - "enable-fs-discriminator", cl::Hidden, cl::init(false), + "enable-fs-discriminator", cl::Hidden, cl::desc("Enable adding flow sensitive discriminators")); } // namespace llvm @@ -77,8 +78,8 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line, Ops.push_back(Scope); if (InlinedAt) Ops.push_back(InlinedAt); - return storeImpl(new (Ops.size()) DILocation(Context, Storage, Line, Column, - Ops, ImplicitCode), + return storeImpl(new (Ops.size(), Storage) DILocation( + Context, Storage, Line, Column, Ops, ImplicitCode), Storage, Context.pImpl->DILocations); } @@ -180,6 +181,7 @@ void DILocation::decodeDiscriminator(unsigned D, unsigned &BD, unsigned &DF, CI = getUnsignedFromPrefixEncoding( getNextComponentInDiscriminator(getNextComponentInDiscriminator(D))); } +dwarf::Tag DINode::getTag() const { return (dwarf::Tag)SubclassData16; } DINode::DIFlags DINode::getFlag(StringRef Flag) { return StringSwitch(Flag) @@ -282,6 +284,7 @@ static bool isCanonical(const MDString *S) { } #endif +dwarf::Tag GenericDINode::getTag() const { return (dwarf::Tag)SubclassData16; } GenericDINode *GenericDINode::getImpl(LLVMContext &Context, unsigned Tag, MDString *Header, ArrayRef DwarfOps, @@ -301,7 +304,7 @@ GenericDINode *GenericDINode::getImpl(LLVMContext &Context, unsigned Tag, // Use a nullptr for empty headers. assert(isCanonical(Header) && "Expected canonical MDString"); Metadata *PreOps[] = {Header}; - return storeImpl(new (DwarfOps.size() + 1) GenericDINode( + return storeImpl(new (DwarfOps.size() + 1, Storage) GenericDINode( Context, Storage, Hash, Tag, PreOps, DwarfOps), Storage, Context.pImpl->GenericDINodes); } @@ -326,20 +329,25 @@ void GenericDINode::recalculateHash() { } \ } while (false) #define DEFINE_GETIMPL_STORE(CLASS, ARGS, OPS) \ - return storeImpl(new (array_lengthof(OPS)) \ + return storeImpl(new (array_lengthof(OPS), Storage) \ CLASS(Context, Storage, UNWRAP_ARGS(ARGS), OPS), \ Storage, Context.pImpl->CLASS##s) #define DEFINE_GETIMPL_STORE_NO_OPS(CLASS, ARGS) \ - return storeImpl(new (0u) CLASS(Context, Storage, UNWRAP_ARGS(ARGS)), \ + return storeImpl(new (0u, Storage) \ + CLASS(Context, Storage, UNWRAP_ARGS(ARGS)), \ Storage, Context.pImpl->CLASS##s) #define DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(CLASS, OPS) \ - return storeImpl(new (array_lengthof(OPS)) CLASS(Context, Storage, OPS), \ + return storeImpl(new (array_lengthof(OPS), Storage) \ + CLASS(Context, Storage, OPS), \ Storage, Context.pImpl->CLASS##s) #define DEFINE_GETIMPL_STORE_N(CLASS, ARGS, OPS, NUM_OPS) \ - return storeImpl(new (NUM_OPS) \ + return storeImpl(new (NUM_OPS, Storage) \ CLASS(Context, Storage, UNWRAP_ARGS(ARGS), OPS), \ Storage, Context.pImpl->CLASS##s) +DISubrange::DISubrange(LLVMContext &C, StorageType Storage, + ArrayRef Ops) + : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, Ops) {} DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo, StorageType Storage, bool ShouldCreate) { auto *CountNode = ConstantAsMetadata::get( @@ -450,6 +458,10 @@ DISubrange::BoundType DISubrange::getStride() const { return BoundType(); } +DIGenericSubrange::DIGenericSubrange(LLVMContext &C, StorageType Storage, + ArrayRef Ops) + : DINode(C, DIGenericSubrangeKind, Storage, dwarf::DW_TAG_generic_subrange, + Ops) {} DIGenericSubrange *DIGenericSubrange::getImpl(LLVMContext &Context, Metadata *CountNode, Metadata *LB, @@ -529,6 +541,13 @@ DIGenericSubrange::BoundType DIGenericSubrange::getStride() const { return BoundType(); } 
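// ---------------------------------------------------------------------------
// Usage sketch (illustrative aside, not part of the vendored diff): the two
// DIBuilder::createStringType overloads added in the DIBuilder.cpp hunks
// above let a Fortran-style frontend describe strings whose length is only
// known at run time. `DIB`, `LenVar`, and the DWARF expression are assumed
// caller-provided; the expression shown is only one plausible encoding.
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/DIBuilder.h"
using namespace llvm;

static void emitDynamicStringTypes(DIBuilder &DIB, DIVariable *LenVar) {
  // Length carried by a debug-info variable (deferred-length string).
  DIStringType *ByVar =
      DIB.createStringType("character(*)", LenVar, /*StrLocationExp=*/nullptr);
  (void)ByVar;

  // Length computed by a DWARF expression against the string descriptor.
  DIExpression *LenExpr = DIB.createExpression(ArrayRef<uint64_t>{
      dwarf::DW_OP_push_object_address, dwarf::DW_OP_deref});
  DIStringType *ByExpr =
      DIB.createStringType("character(:)", LenExpr, /*StrLocationExp=*/nullptr);
  (void)ByExpr;
}
// ---------------------------------------------------------------------------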
+DIEnumerator::DIEnumerator(LLVMContext &C, StorageType Storage, + const APInt &Value, bool IsUnsigned, + ArrayRef Ops) + : DINode(C, DIEnumeratorKind, Storage, dwarf::DW_TAG_enumerator, Ops), + Value(Value) { + SubclassData32 = IsUnsigned; +} DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value, bool IsUnsigned, MDString *Name, StorageType Storage, bool ShouldCreate) { @@ -580,6 +599,36 @@ DIStringType *DIStringType::getImpl(LLVMContext &Context, unsigned Tag, DEFINE_GETIMPL_STORE(DIStringType, (Tag, SizeInBits, AlignInBits, Encoding), Ops); } +DIType *DIDerivedType::getClassType() const { + assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); + return cast_or_null(getExtraData()); +} +uint32_t DIDerivedType::getVBPtrOffset() const { + assert(getTag() == dwarf::DW_TAG_inheritance); + if (auto *CM = cast_or_null(getExtraData())) + if (auto *CI = dyn_cast_or_null(CM->getValue())) + return static_cast(CI->getZExtValue()); + return 0; +} +Constant *DIDerivedType::getStorageOffsetInBits() const { + assert(getTag() == dwarf::DW_TAG_member && isBitField()); + if (auto *C = cast_or_null(getExtraData())) + return C->getValue(); + return nullptr; +} + +Constant *DIDerivedType::getConstant() const { + assert(getTag() == dwarf::DW_TAG_member && isStaticMember()); + if (auto *C = cast_or_null(getExtraData())) + return C->getValue(); + return nullptr; +} +Constant *DIDerivedType::getDiscriminantValue() const { + assert(getTag() == dwarf::DW_TAG_member && !isStaticMember()); + if (auto *C = cast_or_null(getExtraData())) + return C->getValue(); + return nullptr; +} DIDerivedType *DIDerivedType::getImpl( LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, @@ -701,6 +750,12 @@ DICompositeType *DICompositeType::getODRTypeIfExists(LLVMContext &Context, return nullptr; return Context.pImpl->DITypeMap->lookup(&Identifier); } +DISubroutineType::DISubroutineType(LLVMContext &C, StorageType Storage, + DIFlags Flags, uint8_t CC, + ArrayRef Ops) + : DIType(C, DISubroutineTypeKind, Storage, dwarf::DW_TAG_subroutine_type, 0, + 0, 0, 0, Flags, Ops), + CC(CC) {} DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags, uint8_t CC, Metadata *TypeArray, @@ -711,6 +766,12 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags, DEFINE_GETIMPL_STORE(DISubroutineType, (Flags, CC), Ops); } +DIFile::DIFile(LLVMContext &C, StorageType Storage, + Optional> CS, Optional Src, + ArrayRef Ops) + : DIScope(C, DIFileKind, Storage, dwarf::DW_TAG_file_type, Ops), + Checksum(CS), Source(Src) {} + // FIXME: Implement this string-enum correspondence with a .def file and macros, // so that the association is explicit rather than implied. static const char *ChecksumKindName[DIFile::CSK_Last] = { @@ -746,9 +807,23 @@ DIFile *DIFile::getImpl(LLVMContext &Context, MDString *Filename, assert((!Source || isCanonical(*Source)) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DIFile, (Filename, Directory, CS, Source)); Metadata *Ops[] = {Filename, Directory, CS ? 
CS->Value : nullptr, - Source.getValueOr(nullptr)}; + Source.value_or(nullptr)}; DEFINE_GETIMPL_STORE(DIFile, (CS, Source), Ops); } +DICompileUnit::DICompileUnit(LLVMContext &C, StorageType Storage, + unsigned SourceLanguage, bool IsOptimized, + unsigned RuntimeVersion, unsigned EmissionKind, + uint64_t DWOId, bool SplitDebugInlining, + bool DebugInfoForProfiling, unsigned NameTableKind, + bool RangesBaseAddress, ArrayRef Ops) + : DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops), + SourceLanguage(SourceLanguage), IsOptimized(IsOptimized), + RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind), DWOId(DWOId), + SplitDebugInlining(SplitDebugInlining), + DebugInfoForProfiling(DebugInfoForProfiling), + NameTableKind(NameTableKind), RangesBaseAddress(RangesBaseAddress) { + assert(Storage != Uniqued); +} DICompileUnit *DICompileUnit::getImpl( LLVMContext &Context, unsigned SourceLanguage, Metadata *File, @@ -775,7 +850,7 @@ DICompileUnit *DICompileUnit::getImpl( Macros, SysRoot, SDK}; - return storeImpl(new (array_lengthof(Ops)) DICompileUnit( + return storeImpl(new (array_lengthof(Ops), Storage) DICompileUnit( Context, Storage, SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind, DWOId, SplitDebugInlining, DebugInfoForProfiling, NameTableKind, RangesBaseAddress, @@ -827,6 +902,30 @@ const char *DICompileUnit::nameTableKindString(DebugNameTableKind NTK) { } return nullptr; } +DISubprogram::DISubprogram(LLVMContext &C, StorageType Storage, unsigned Line, + unsigned ScopeLine, unsigned VirtualIndex, + int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, + ArrayRef Ops) + : DILocalScope(C, DISubprogramKind, Storage, dwarf::DW_TAG_subprogram, Ops), + Line(Line), ScopeLine(ScopeLine), VirtualIndex(VirtualIndex), + ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags) { + static_assert(dwarf::DW_VIRTUALITY_max < 4, "Virtuality out of range"); +} +DISubprogram::DISPFlags +DISubprogram::toSPFlags(bool IsLocalToUnit, bool IsDefinition, bool IsOptimized, + unsigned Virtuality, bool IsMainSubprogram) { + // We're assuming virtuality is the low-order field. + static_assert(int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) && + int(SPFlagPureVirtual) == + int(dwarf::DW_VIRTUALITY_pure_virtual), + "Virtuality constant mismatch"); + return static_cast( + (Virtuality & SPFlagVirtuality) | + (IsLocalToUnit ? SPFlagLocalToUnit : SPFlagZero) | + (IsDefinition ? SPFlagDefinition : SPFlagZero) | + (IsOptimized ? SPFlagOptimized : SPFlagZero) | + (IsMainSubprogram ? 
SPFlagMainSubprogram : SPFlagZero)); +} DISubprogram *DILocalScope::getSubprogram() const { if (auto *Block = dyn_cast(this)) @@ -881,27 +980,33 @@ DISubprogram *DISubprogram::getImpl( unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, - Metadata *ThrownTypes, Metadata *Annotations, StorageType Storage, - bool ShouldCreate) { + Metadata *ThrownTypes, Metadata *Annotations, MDString *TargetFuncName, + StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); assert(isCanonical(LinkageName) && "Expected canonical MDString"); + assert(isCanonical(TargetFuncName) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DISubprogram, (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType, VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams, Declaration, - RetainedNodes, ThrownTypes, Annotations)); - SmallVector Ops = { + RetainedNodes, ThrownTypes, Annotations, + TargetFuncName)); + SmallVector Ops = { File, Scope, Name, LinkageName, Type, Unit, Declaration, RetainedNodes, - ContainingType, TemplateParams, ThrownTypes, Annotations}; - if (!Annotations) { + ContainingType, TemplateParams, ThrownTypes, Annotations, + TargetFuncName}; + if (!TargetFuncName) { Ops.pop_back(); - if (!ThrownTypes) { + if (!Annotations) { Ops.pop_back(); - if (!TemplateParams) { + if (!ThrownTypes) { Ops.pop_back(); - if (!ContainingType) + if (!TemplateParams) { Ops.pop_back(); + if (!ContainingType) + Ops.pop_back(); + } } } } @@ -915,6 +1020,10 @@ bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); return F->getSubprogram() == this; } +DILexicalBlockBase::DILexicalBlockBase(LLVMContext &C, unsigned ID, + StorageType Storage, + ArrayRef Ops) + : DILocalScope(C, ID, Storage, dwarf::DW_TAG_lexical_block, Ops) {} DILexicalBlock *DILexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope, Metadata *File, unsigned Line, @@ -940,6 +1049,10 @@ DILexicalBlockFile *DILexicalBlockFile::getImpl(LLVMContext &Context, DEFINE_GETIMPL_STORE(DILexicalBlockFile, (Discriminator), Ops); } +DINamespace::DINamespace(LLVMContext &Context, StorageType Storage, + bool ExportSymbols, ArrayRef Ops) + : DIScope(Context, DINamespaceKind, Storage, dwarf::DW_TAG_namespace, Ops), + ExportSymbols(ExportSymbols) {} DINamespace *DINamespace::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, bool ExportSymbols, StorageType Storage, bool ShouldCreate) { @@ -950,6 +1063,11 @@ DINamespace *DINamespace::getImpl(LLVMContext &Context, Metadata *Scope, DEFINE_GETIMPL_STORE(DINamespace, (ExportSymbols), Ops); } +DICommonBlock::DICommonBlock(LLVMContext &Context, StorageType Storage, + unsigned LineNo, ArrayRef Ops) + : DIScope(Context, DICommonBlockKind, Storage, dwarf::DW_TAG_common_block, + Ops), + LineNo(LineNo) {} DICommonBlock *DICommonBlock::getImpl(LLVMContext &Context, Metadata *Scope, Metadata *Decl, MDString *Name, Metadata *File, unsigned LineNo, @@ -961,6 +1079,10 @@ DICommonBlock *DICommonBlock::getImpl(LLVMContext &Context, Metadata *Scope, DEFINE_GETIMPL_STORE(DICommonBlock, (LineNo), Ops); } +DIModule::DIModule(LLVMContext &Context, StorageType Storage, unsigned LineNo, + bool IsDecl, ArrayRef Ops) + : DIScope(Context, DIModuleKind, Storage, dwarf::DW_TAG_module, Ops), + LineNo(LineNo), IsDecl(IsDecl) {} DIModule *DIModule::getImpl(LLVMContext &Context, 
Metadata *File, Metadata *Scope, MDString *Name, MDString *ConfigurationMacros, @@ -974,6 +1096,13 @@ DIModule *DIModule::getImpl(LLVMContext &Context, Metadata *File, IncludePath, APINotesFile}; DEFINE_GETIMPL_STORE(DIModule, (LineNo, IsDecl), Ops); } +DITemplateTypeParameter::DITemplateTypeParameter(LLVMContext &Context, + StorageType Storage, + bool IsDefault, + ArrayRef Ops) + : DITemplateParameter(Context, DITemplateTypeParameterKind, Storage, + dwarf::DW_TAG_template_type_parameter, IsDefault, + Ops) {} DITemplateTypeParameter * DITemplateTypeParameter::getImpl(LLVMContext &Context, MDString *Name, @@ -1039,6 +1168,11 @@ DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, DEFINE_GETIMPL_STORE(DILocalVariable, (Line, Arg, Flags, AlignInBits), Ops); } +DIVariable::DIVariable(LLVMContext &C, unsigned ID, StorageType Storage, + signed Line, ArrayRef Ops, + uint32_t AlignInBits) + : DINode(C, ID, Storage, dwarf::DW_TAG_variable, Ops), Line(Line), + AlignInBits(AlignInBits) {} Optional DIVariable::getSizeInBits() const { // This is used by the Verifier so be mindful of broken types. const Metadata *RawType = getRawType(); @@ -1062,6 +1196,9 @@ Optional DIVariable::getSizeInBits() const { return None; } +DILabel::DILabel(LLVMContext &C, StorageType Storage, unsigned Line, + ArrayRef Ops) + : DINode(C, DILabelKind, Storage, dwarf::DW_TAG_label, Ops), Line(Line) {} DILabel *DILabel::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, Metadata *File, unsigned Line, StorageType Storage, bool ShouldCreate) { @@ -1078,6 +1215,12 @@ DIExpression *DIExpression::getImpl(LLVMContext &Context, DEFINE_GETIMPL_LOOKUP(DIExpression, (Elements)); DEFINE_GETIMPL_STORE_NO_OPS(DIExpression, (Elements)); } +bool DIExpression::isEntryValue() const { + return getNumElements() > 0 && getElement(0) == dwarf::DW_OP_LLVM_entry_value; +} +bool DIExpression::startsWithDeref() const { + return getNumElements() > 0 && getElement(0) == dwarf::DW_OP_deref; +} unsigned DIExpression::ExprOperand::getSize() const { uint64_t Op = getOp(); @@ -1439,7 +1582,7 @@ DIExpression *DIExpression::appendToStack(const DIExpression *Expr, // // Match .* DW_OP_stack_value (DW_OP_LLVM_fragment A B)?. Optional FI = Expr->getFragmentInfo(); - unsigned DropUntilStackValue = FI.hasValue() ? 3 : 0; + unsigned DropUntilStackValue = FI ? 
3 : 0; ArrayRef ExprOpsBeforeFragment = Expr->getElements().drop_back(DropUntilStackValue); bool NeedsDeref = (Expr->getNumElements() > DropUntilStackValue) && @@ -1597,6 +1740,11 @@ DIGlobalVariableExpression::getImpl(LLVMContext &Context, Metadata *Variable, Metadata *Ops[] = {Variable, Expression}; DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DIGlobalVariableExpression, Ops); } +DIObjCProperty::DIObjCProperty(LLVMContext &C, StorageType Storage, + unsigned Line, unsigned Attributes, + ArrayRef Ops) + : DINode(C, DIObjCPropertyKind, Storage, dwarf::DW_TAG_APPLE_property, Ops), + Line(Line), Attributes(Attributes) {} DIObjCProperty *DIObjCProperty::getImpl( LLVMContext &Context, MDString *Name, Metadata *File, unsigned Line, diff --git a/llvm/lib/IR/DiagnosticHandler.cpp b/llvm/lib/IR/DiagnosticHandler.cpp index 7b40728a34e8..683eade50291 100644 --- a/llvm/lib/IR/DiagnosticHandler.cpp +++ b/llvm/lib/IR/DiagnosticHandler.cpp @@ -47,8 +47,7 @@ static cl::opt> PassRemarks( "pass-remarks", cl::value_desc("pattern"), cl::desc("Enable optimization remarks from passes whose name match " "the given regular expression"), - cl::Hidden, cl::location(PassRemarksPassedOptLoc), cl::ValueRequired, - cl::ZeroOrMore); + cl::Hidden, cl::location(PassRemarksPassedOptLoc), cl::ValueRequired); // -pass-remarks-missed // Command line flag to enable emitOptimizationRemarkMissed() @@ -56,8 +55,7 @@ static cl::opt> PassRemarksMissed( "pass-remarks-missed", cl::value_desc("pattern"), cl::desc("Enable missed optimization remarks from passes whose name match " "the given regular expression"), - cl::Hidden, cl::location(PassRemarksMissedOptLoc), cl::ValueRequired, - cl::ZeroOrMore); + cl::Hidden, cl::location(PassRemarksMissedOptLoc), cl::ValueRequired); // -pass-remarks-analysis // Command line flag to enable emitOptimizationRemarkAnalysis() @@ -67,8 +65,7 @@ static cl::opt> cl::desc( "Enable optimization analysis remarks from passes whose name match " "the given regular expression"), - cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired, - cl::ZeroOrMore); + cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired); } bool DiagnosticHandler::isAnalysisRemarkEnabled(StringRef PassName) const { diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index f46f0fdd947d..50fe6829ad86 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -393,6 +393,17 @@ std::string DiagnosticInfoOptimizationBase::getMsg() const { return OS.str(); } +DiagnosticInfoMisExpect::DiagnosticInfoMisExpect(const Instruction *Inst, + Twine &Msg) + : DiagnosticInfoWithLocationBase(DK_MisExpect, DS_Warning, + *Inst->getParent()->getParent(), + Inst->getDebugLoc()), + Msg(Msg) {} + +void DiagnosticInfoMisExpect::print(DiagnosticPrinter &DP) const { + DP << getLocationStr() << ": " << getMsg(); +} + void OptimizationRemarkAnalysisFPCommute::anchor() {} void OptimizationRemarkAnalysisAliasing::anchor() {} diff --git a/llvm/lib/IR/Dominators.cpp b/llvm/lib/IR/Dominators.cpp index aac8936c7bd6..09be2a8ef605 100644 --- a/llvm/lib/IR/Dominators.cpp +++ b/llvm/lib/IR/Dominators.cpp @@ -25,7 +25,6 @@ #include "llvm/PassRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp index c6e0938e71a6..48ee84080e98 100644 --- a/llvm/lib/IR/FPEnv.cpp +++ b/llvm/lib/IR/FPEnv.cpp @@ -14,6 +14,9 @@ #include "llvm/IR/FPEnv.h" 
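// ---------------------------------------------------------------------------
// Usage sketch (illustrative aside, not part of the vendored diff): the
// getConstrainedIntrinsicID helper defined in the FPEnv.cpp hunks just below
// maps an ordinary FP operation (or a supported intrinsic call) to its
// strictfp counterpart, e.g. fadd -> experimental_constrained_fadd, and
// returns Intrinsic::not_intrinsic when no constrained form exists. This
// assumes the matching declaration lives in llvm/IR/FPEnv.h.
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

static bool hasConstrainedEquivalent(const Instruction &I) {
  // True for operations a strictfp rewrite could replace one-for-one.
  return getConstrainedIntrinsicID(I) != Intrinsic::not_intrinsic;
}
// ---------------------------------------------------------------------------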
#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" namespace llvm { @@ -82,4 +85,46 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { } return ExceptStr; } + +Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { + Intrinsic::ID IID = Intrinsic::not_intrinsic; + switch (Instr.getOpcode()) { + case Instruction::FCmp: + // Unlike other instructions FCmp can be mapped to one of two intrinsic + // functions. We choose the non-signaling variant. + IID = Intrinsic::experimental_constrained_fcmp; + break; + + // Instructions +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Instruction::NAME: \ + IID = Intrinsic::INTRINSIC; \ + break; +#define FUNCTION(NAME, NARG, ROUND_MODE, INTRINSIC) +#define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) +#include "llvm/IR/ConstrainedOps.def" + + // Intrinsic calls. + case Instruction::Call: + if (auto *IntrinCall = dyn_cast(&Instr)) { + switch (IntrinCall->getIntrinsicID()) { +#define FUNCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::NAME: \ + IID = Intrinsic::INTRINSIC; \ + break; +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) +#define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) +#include "llvm/IR/ConstrainedOps.def" + default: + break; + } + } + break; + default: + break; + } + + return IID; +} + } // namespace llvm diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 726ba80da41b..53df94366760 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsBPF.h" +#include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/IR/IntrinsicsMips.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -339,8 +340,9 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, Module *M) { auto *F = new Function(Ty, Linkage, AddrSpace, N, M); AttrBuilder B(F->getContext()); - if (M->getUwtable()) - B.addAttribute(Attribute::UWTable); + UWTableKind UWTable = M->getUwtable(); + if (UWTable != UWTableKind::None) + B.addUWTableAttr(UWTable); switch (M->getFramePointer()) { case FramePointerKind::None: // 0 ("none") is the default. @@ -926,25 +928,25 @@ std::string Intrinsic::getNameNoUnnamedTypes(ID Id, ArrayRef Tys) { enum IIT_Info { // Common values should be encoded with 0-15. IIT_Done = 0, - IIT_I1 = 1, - IIT_I8 = 2, - IIT_I16 = 3, - IIT_I32 = 4, - IIT_I64 = 5, - IIT_F16 = 6, - IIT_F32 = 7, - IIT_F64 = 8, - IIT_V2 = 9, - IIT_V4 = 10, - IIT_V8 = 11, - IIT_V16 = 12, - IIT_V32 = 13, - IIT_PTR = 14, - IIT_ARG = 15, + IIT_I1 = 1, + IIT_I8 = 2, + IIT_I16 = 3, + IIT_I32 = 4, + IIT_I64 = 5, + IIT_F16 = 6, + IIT_F32 = 7, + IIT_F64 = 8, + IIT_V2 = 9, + IIT_V4 = 10, + IIT_V8 = 11, + IIT_V16 = 12, + IIT_V32 = 13, + IIT_PTR = 14, + IIT_ARG = 15, // Values from 16+ are only encodable with the inefficient encoding. 
- IIT_V64 = 16, - IIT_MMX = 17, + IIT_V64 = 16, + IIT_MMX = 17, IIT_TOKEN = 18, IIT_METADATA = 19, IIT_EMPTYSTRUCT = 20, @@ -955,7 +957,7 @@ enum IIT_Info { IIT_EXTEND_ARG = 25, IIT_TRUNC_ARG = 26, IIT_ANYPTR = 27, - IIT_V1 = 28, + IIT_V1 = 28, IIT_VARARG = 29, IIT_HALF_VEC_ARG = 30, IIT_SAME_VEC_WIDTH_ARG = 31, @@ -978,11 +980,14 @@ enum IIT_Info { IIT_BF16 = 48, IIT_STRUCT9 = 49, IIT_V256 = 50, - IIT_AMX = 51, + IIT_AMX = 51, IIT_PPCF128 = 52, IIT_V3 = 53, IIT_EXTERNREF = 54, - IIT_FUNCREF = 55 + IIT_FUNCREF = 55, + IIT_ANYPTR_TO_ELT = 56, + IIT_I2 = 57, + IIT_I4 = 58, }; static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, @@ -1035,6 +1040,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, case IIT_I1: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1)); return; + case IIT_I2: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 2)); + return; + case IIT_I4: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 4)); + return; case IIT_I8: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); return; @@ -1156,6 +1167,13 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo)); return; } + case IIT_ANYPTR_TO_ELT: { + unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::AnyPtrToElt, ArgNo, RefNo)); + return; + } case IIT_VEC_OF_ANYPTRS_TO_ELT: { unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); @@ -1347,6 +1365,9 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::VecOfAnyPtrsToElt: // Return the overloaded type (which determines the pointers address space) return Tys[D.getOverloadArgNumber()]; + case IITDescriptor::AnyPtrToElt: + // Return the overloaded type (which determines the pointers address space) + return Tys[D.getOverloadArgNumber()]; } llvm_unreachable("unhandled"); } @@ -1406,10 +1427,10 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { .getCallee()); } -// This defines the "Intrinsic::getIntrinsicForGCCBuiltin()" method. -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +// This defines the "Intrinsic::getIntrinsicForClangBuiltin()" method. +#define GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN #include "llvm/IR/IntrinsicImpl.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#undef GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN // This defines the "Intrinsic::getIntrinsicForMSBuiltin()" method. #define GET_LLVM_INTRINSIC_FOR_MS_BUILTIN @@ -1463,19 +1484,37 @@ static bool matchIntrinsicType( PointerType *PT = dyn_cast(Ty); if (!PT || PT->getAddressSpace() != D.Pointer_AddressSpace) return true; - if (!PT->isOpaque()) + if (!PT->isOpaque()) { + /* Manually consume a pointer to empty struct descriptor, which is + * used for externref. We don't want to enforce that the struct is + * anonymous in this case. (This renders externref intrinsics + * non-unique, but this will go away with opaque pointers anyway.) */ + if (Infos.front().Kind == IITDescriptor::Struct && + Infos.front().Struct_NumElements == 0) { + Infos = Infos.slice(1); + return false; + } return matchIntrinsicType(PT->getNonOpaquePointerElementType(), Infos, ArgTys, DeferredChecks, IsDeferredCheck); + } // Consume IIT descriptors relating to the pointer element type. 
- while (Infos.front().Kind == IITDescriptor::Pointer) + // FIXME: Intrinsic type matching of nested single value types or even + // aggregates doesn't work properly with opaque pointers but hopefully + // doesn't happen in practice. + while (Infos.front().Kind == IITDescriptor::Pointer || + Infos.front().Kind == IITDescriptor::Vector) Infos = Infos.slice(1); + assert((Infos.front().Kind != IITDescriptor::Argument || + Infos.front().getArgumentKind() == IITDescriptor::AK_MatchType) && + "Unsupported polymorphic pointer type with opaque pointer"); Infos = Infos.slice(1); return false; } case IITDescriptor::Struct: { StructType *ST = dyn_cast(Ty); - if (!ST || ST->getNumElements() != D.Struct_NumElements) + if (!ST || !ST->isLiteral() || ST->isPacked() || + ST->getNumElements() != D.Struct_NumElements) return true; for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) @@ -1587,6 +1626,30 @@ static bool matchIntrinsicType( return !ThisArgType->isOpaqueOrPointeeTypeMatches( ReferenceType->getElementType()); } + case IITDescriptor::AnyPtrToElt: { + unsigned RefArgNumber = D.getRefArgNumber(); + if (RefArgNumber >= ArgTys.size()) { + if (IsDeferredCheck) + return true; + // If forward referencing, already add the pointer type and + // defer the checks for later. + ArgTys.push_back(Ty); + return DeferCheck(Ty); + } + + if (!IsDeferredCheck) { + assert(D.getOverloadArgNumber() == ArgTys.size() && + "Table consistency error"); + ArgTys.push_back(Ty); + } + + auto *ReferenceType = dyn_cast(ArgTys[RefArgNumber]); + auto *ThisArgType = dyn_cast(Ty); + if (!ThisArgType || !ReferenceType) + return true; + return !ThisArgType->isOpaqueOrPointeeTypeMatches( + ReferenceType->getElementType()); + } case IITDescriptor::VecOfAnyPtrsToElt: { unsigned RefArgNumber = D.getRefArgNumber(); if (RefArgNumber >= ArgTys.size()) { @@ -1802,7 +1865,7 @@ bool Function::hasAddressTaken(const User **PutOffender, *PutOffender = FU; return true; } - if (!Call->isCallee(&U)) { + if (!Call->isCallee(&U) || Call->getFunctionType() != getFunctionType()) { if (IgnoreARCAttachedCall && Call->isOperandBundleOfType(LLVMContext::OB_clang_arc_attachedcall, U.getOperandNo())) @@ -1909,7 +1972,7 @@ void Function::setEntryCount(ProfileCount Count, const DenseSet *S) { #if !defined(NDEBUG) auto PrevCount = getEntryCount(); - assert(!PrevCount.hasValue() || PrevCount->getType() == Count.getType()); + assert(!PrevCount || PrevCount->getType() == Count.getType()); #endif auto ImportGUIDs = getImportGUIDs(); diff --git a/llvm/lib/IR/GVMaterializer.cpp b/llvm/lib/IR/GVMaterializer.cpp index 35397309a103..dc3b0e0fc236 100644 --- a/llvm/lib/IR/GVMaterializer.cpp +++ b/llvm/lib/IR/GVMaterializer.cpp @@ -14,4 +14,4 @@ #include "llvm/IR/GVMaterializer.h" using namespace llvm; -GVMaterializer::~GVMaterializer() {} +GVMaterializer::~GVMaterializer() = default; diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 47e8bc0a916d..3265050261c8 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -67,6 +67,10 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setDLLStorageClass(Src->getDLLStorageClass()); setDSOLocal(Src->isDSOLocal()); setPartition(Src->getPartition()); + if (Src->hasSanitizerMetadata()) + setSanitizerMetadata(Src->getSanitizerMetadata()); + else + removeSanitizerMetadata(); } void GlobalValue::removeFromParent() { @@ -217,6 +221,25 @@ void GlobalValue::setPartition(StringRef S) { HasPartition = !S.empty(); } +using SanitizerMetadata = GlobalValue::SanitizerMetadata; +const 
SanitizerMetadata &GlobalValue::getSanitizerMetadata() const { + assert(hasSanitizerMetadata()); + assert(getContext().pImpl->GlobalValueSanitizerMetadata.count(this)); + return getContext().pImpl->GlobalValueSanitizerMetadata[this]; +} + +void GlobalValue::setSanitizerMetadata(SanitizerMetadata Meta) { + getContext().pImpl->GlobalValueSanitizerMetadata[this] = Meta; + HasSanitizerMetadata = true; +} + +void GlobalValue::removeSanitizerMetadata() { + DenseMap &MetadataMap = + getContext().pImpl->GlobalValueSanitizerMetadata; + MetadataMap.erase(this); + HasSanitizerMetadata = false; +} + StringRef GlobalObject::getSectionImpl() const { assert(hasSection()); return getContext().pImpl->GlobalObjectSections[this]; @@ -262,7 +285,7 @@ bool GlobalObject::canIncreaseAlignment() const { // alignment specified. (If it is assigned a section, the global // could be densely packed with other objects in the section, and // increasing the alignment could cause padding issues.) - if (hasSection() && getAlign().hasValue()) + if (hasSection() && getAlign()) return false; // On ELF platforms, we're further restricted in that we can't diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 4e8f1b506811..d0c622fe2389 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/None.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -68,6 +69,21 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) { return CreateBitCast(Ptr, getInt8PtrTy(PT->getAddressSpace())); } +DebugLoc IRBuilderBase::getCurrentDebugLocation() const { + for (auto &KV : MetadataToCopy) + if (KV.first == LLVMContext::MD_dbg) + return {cast(KV.second)}; + + return {}; +} +void IRBuilderBase::SetInstDebugLocation(Instruction *I) const { + for (const auto &KV : MetadataToCopy) + if (KV.first == LLVMContext::MD_dbg) { + I->setDebugLoc(DebugLoc(KV.second)); + return; + } +} + static CallInst *createCallHelper(Function *Callee, ArrayRef Ops, IRBuilderBase *Builder, const Twine &Name = "", @@ -133,7 +149,36 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, CallInst *CI = createCallHelper(TheFn, Ops, this); if (Align) - cast(CI)->setDestAlignment(Align->value()); + cast(CI)->setDestAlignment(*Align); + + // Set the TBAA info if present. + if (TBAATag) + CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); + + if (ScopeTag) + CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); + + if (NoAliasTag) + CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); + + return CI; +} + +CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, + Value *Val, Value *Size, + bool IsVolatile, MDNode *TBAATag, + MDNode *ScopeTag, + MDNode *NoAliasTag) { + Dst = getCastedInt8PtrValue(Dst); + Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; + Type *Tys[] = {Dst->getType(), Size->getType()}; + Module *M = BB->getParent()->getParent(); + Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys); + + CallInst *CI = createCallHelper(TheFn, Ops, this); + + if (DstAlign) + cast(CI)->setDestAlignment(*DstAlign); // Set the TBAA info if present. 
if (TBAATag) @@ -672,34 +717,29 @@ getStatepointBundles(Optional> TransitionArgs, template static CallInst *CreateGCStatepointCallCommon( IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, uint32_t Flags, ArrayRef CallArgs, - Optional> TransitionArgs, - Optional> DeoptArgs, ArrayRef GCArgs, - const Twine &Name) { - // Extract out the type of the callee. - auto *FuncPtrType = cast(ActualCallee->getType()); - assert(isa(FuncPtrType->getPointerElementType()) && - "actual callee must be a callable value"); - + FunctionCallee ActualCallee, uint32_t Flags, ArrayRef CallArgs, + Optional> TransitionArgs, Optional> DeoptArgs, + ArrayRef GCArgs, const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Type *ArgTypes[] = { FuncPtrType }; Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - ArgTypes); - - std::vector Args = - getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags, - CallArgs); - - return Builder->CreateCall(FnStatepoint, Args, - getStatepointBundles(TransitionArgs, DeoptArgs, - GCArgs), - Name); + Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, + {ActualCallee.getCallee()->getType()}); + + std::vector Args = getStatepointArgs( + *Builder, ID, NumPatchBytes, ActualCallee.getCallee(), Flags, CallArgs); + + CallInst *CI = Builder->CreateCall( + FnStatepoint, Args, + getStatepointBundles(TransitionArgs, DeoptArgs, GCArgs), Name); + CI->addParamAttr(2, + Attribute::get(Builder->getContext(), Attribute::ElementType, + ActualCallee.getFunctionType())); + return CI; } CallInst *IRBuilderBase::CreateGCStatepointCall( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualCallee, ArrayRef CallArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { return CreateGCStatepointCallCommon( @@ -708,17 +748,17 @@ CallInst *IRBuilderBase::CreateGCStatepointCall( } CallInst *IRBuilderBase::CreateGCStatepointCall( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags, - ArrayRef CallArgs, Optional> TransitionArgs, - Optional> DeoptArgs, ArrayRef GCArgs, - const Twine &Name) { + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualCallee, + uint32_t Flags, ArrayRef CallArgs, + Optional> TransitionArgs, Optional> DeoptArgs, + ArrayRef GCArgs, const Twine &Name) { return CreateGCStatepointCallCommon( this, ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, Name); } CallInst *IRBuilderBase::CreateGCStatepointCall( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualCallee, ArrayRef CallArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { return CreateGCStatepointCallCommon( @@ -729,32 +769,31 @@ CallInst *IRBuilderBase::CreateGCStatepointCall( template static InvokeInst *CreateGCStatepointInvokeCommon( IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, - Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, - uint32_t Flags, ArrayRef InvokeArgs, + FunctionCallee ActualInvokee, BasicBlock *NormalDest, + BasicBlock *UnwindDest, uint32_t Flags, ArrayRef InvokeArgs, Optional> TransitionArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { - // Extract out the type of the callee. 
- auto *FuncPtrType = cast(ActualInvokee->getType()); - assert(isa(FuncPtrType->getPointerElementType()) && - "actual callee must be a callable value"); - Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = Intrinsic::getDeclaration( - M, Intrinsic::experimental_gc_statepoint, {FuncPtrType}); + Function *FnStatepoint = + Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, + {ActualInvokee.getCallee()->getType()}); std::vector Args = - getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags, - InvokeArgs); + getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee.getCallee(), + Flags, InvokeArgs); - return Builder->CreateInvoke(FnStatepoint, NormalDest, UnwindDest, Args, - getStatepointBundles(TransitionArgs, DeoptArgs, - GCArgs), - Name); + InvokeInst *II = Builder->CreateInvoke( + FnStatepoint, NormalDest, UnwindDest, Args, + getStatepointBundles(TransitionArgs, DeoptArgs, GCArgs), Name); + II->addParamAttr(2, + Attribute::get(Builder->getContext(), Attribute::ElementType, + ActualInvokee.getFunctionType())); + return II; } InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { @@ -765,19 +804,21 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( } InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, ArrayRef InvokeArgs, Optional> TransitionArgs, - Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { + Optional> DeoptArgs, ArrayRef GCArgs, + const Twine &Name) { return CreateGCStatepointInvokeCommon( this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags, InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name); } InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + uint64_t ID, uint32_t NumPatchBytes, FunctionCallee ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, - Optional> DeoptArgs, ArrayRef GCArgs, const Twine &Name) { + Optional> DeoptArgs, ArrayRef GCArgs, + const Twine &Name) { return CreateGCStatepointInvokeCommon( this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, uint32_t(StatepointFlags::None), InvokeArgs, None, DeoptArgs, GCArgs, @@ -785,31 +826,26 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( } CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, - Type *ResultType, - const Twine &Name) { - Intrinsic::ID ID = Intrinsic::experimental_gc_result; - Module *M = BB->getParent()->getParent(); - Type *Types[] = {ResultType}; - Function *FnGCResult = Intrinsic::getDeclaration(M, ID, Types); + Type *ResultType, const Twine &Name) { + Intrinsic::ID ID = Intrinsic::experimental_gc_result; + Module *M = BB->getParent()->getParent(); + Type *Types[] = {ResultType}; + Function *FnGCResult = Intrinsic::getDeclaration(M, ID, Types); - Value *Args[] = {Statepoint}; - return createCallHelper(FnGCResult, Args, this, Name); + Value *Args[] = {Statepoint}; + return createCallHelper(FnGCResult, Args, this, Name); } CallInst 
*IRBuilderBase::CreateGCRelocate(Instruction *Statepoint, - int BaseOffset, - int DerivedOffset, - Type *ResultType, - const Twine &Name) { - Module *M = BB->getParent()->getParent(); - Type *Types[] = {ResultType}; - Function *FnGCRelocate = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); + int BaseOffset, int DerivedOffset, + Type *ResultType, const Twine &Name) { + Module *M = BB->getParent()->getParent(); + Type *Types[] = {ResultType}; + Function *FnGCRelocate = + Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); - Value *Args[] = {Statepoint, - getInt32(BaseOffset), - getInt32(DerivedOffset)}; - return createCallHelper(FnGCRelocate, Args, this, Name); + Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)}; + return createCallHelper(FnGCRelocate, Args, this, Name); } CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr, @@ -1262,8 +1298,8 @@ CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, return CreateAlignmentAssumptionHelper(DL, PtrValue, Alignment, OffsetValue); } -IRBuilderDefaultInserter::~IRBuilderDefaultInserter() {} -IRBuilderCallbackInserter::~IRBuilderCallbackInserter() {} -IRBuilderFolder::~IRBuilderFolder() {} +IRBuilderDefaultInserter::~IRBuilderDefaultInserter() = default; +IRBuilderCallbackInserter::~IRBuilderCallbackInserter() = default; +IRBuilderFolder::~IRBuilderFolder() = default; void ConstantFolder::anchor() {} void NoFolder::anchor() {} diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 36a20679863b..bf76c89f26ca 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -492,6 +492,9 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, if (const ShuffleVectorInst *SVI = dyn_cast(I1)) return SVI->getShuffleMask() == cast(I2)->getShuffleMask(); + if (const GetElementPtrInst *GEP = dyn_cast(I1)) + return GEP->getSourceElementType() == + cast(I2)->getSourceElementType(); return true; } @@ -695,7 +698,7 @@ bool Instruction::mayHaveSideEffects() const { bool Instruction::isSafeToRemove() const { return (!isa(this) || !this->mayHaveSideEffects()) && - !this->isTerminator(); + !this->isTerminator() && !this->isEHPad(); } bool Instruction::willReturn() const { diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 7798af3b19b9..6a91edb75dd2 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -128,7 +128,7 @@ Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) { // If the PHI node is dead, because it has zero entries, nuke it now. if (getNumOperands() == 0 && DeletePHIIfEmpty) { // If anyone is using this PHI, make them use a dummy value instead... 
- replaceAllUsesWith(UndefValue::get(getType())); + replaceAllUsesWith(PoisonValue::get(getType())); eraseFromParent(); } return Removed; @@ -325,13 +325,13 @@ bool CallBase::isReturnNonNull() const { return false; } -Value *CallBase::getReturnedArgOperand() const { +Value *CallBase::getArgOperandWithAttribute(Attribute::AttrKind Kind) const { unsigned Index; - if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index)) + if (Attrs.hasAttrSomewhere(Kind, &Index)) return getArgOperand(Index - AttributeList::FirstArgIndex); if (const Function *F = getCalledFunction()) - if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index)) + if (F->getAttributes().hasAttrSomewhere(Kind, &Index)) return getArgOperand(Index - AttributeList::FirstArgIndex); return nullptr; @@ -372,6 +372,27 @@ bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const { return false; } +template +Attribute CallBase::getFnAttrOnCalledFunction(AK Kind) const { + // Operand bundles override attributes on the called function, but don't + // override attributes directly present on the call instruction. + if (isFnAttrDisallowedByOpBundle(Kind)) + return Attribute(); + Value *V = getCalledOperand(); + if (auto *CE = dyn_cast(V)) + if (CE->getOpcode() == BitCast) + V = CE->getOperand(0); + + if (auto *F = dyn_cast(V)) + return F->getAttributes().getFnAttr(Kind); + + return Attribute(); +} + +template Attribute +CallBase::getFnAttrOnCalledFunction(Attribute::AttrKind Kind) const; +template Attribute CallBase::getFnAttrOnCalledFunction(StringRef Kind) const; + void CallBase::getOperandBundlesAsDefs( SmallVectorImpl &Defs) const { for (unsigned i = 0, e = getNumOperandBundles(); i != e; ++i) @@ -482,9 +503,10 @@ CallBase *CallBase::removeOperandBundle(CallBase *CB, uint32_t ID, bool CallBase::hasReadingOperandBundles() const { // Implementation note: this is a conservative implementation of operand - // bundle semantics, where *any* non-assume operand bundle forces a callsite - // to be at least readonly. - return hasOperandBundles() && getIntrinsicID() != Intrinsic::assume; + // bundle semantics, where *any* non-assume operand bundle (other than + // ptrauth) forces a callsite to be at least readonly. + return hasOperandBundlesOtherThan(LLVMContext::OB_ptrauth) && + getIntrinsicID() != Intrinsic::assume; } //===----------------------------------------------------------------------===// @@ -2194,7 +2216,13 @@ bool ShuffleVectorInst::isIdentityMask(ArrayRef Mask) { bool ShuffleVectorInst::isReverseMask(ArrayRef Mask) { if (!isSingleSourceMask(Mask)) return false; - for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) { + + // The number of elements in the mask must be at least 2. 
+ int NumElts = Mask.size(); + if (NumElts < 2) + return false; + + for (int i = 0; i < NumElts; ++i) { if (Mask[i] == -1) continue; if (Mask[i] != (NumElts - 1 - i) && Mask[i] != (NumElts + NumElts - 1 - i)) @@ -3060,16 +3088,18 @@ unsigned CastInst::isEliminableCastPair( return 0; } case 8: { - // ext, trunc -> bitcast, if the SrcTy and DstTy are same size + // ext, trunc -> bitcast, if the SrcTy and DstTy are the same // ext, trunc -> ext, if sizeof(SrcTy) < sizeof(DstTy) // ext, trunc -> trunc, if sizeof(SrcTy) > sizeof(DstTy) unsigned SrcSize = SrcTy->getScalarSizeInBits(); unsigned DstSize = DstTy->getScalarSizeInBits(); - if (SrcSize == DstSize) + if (SrcTy == DstTy) return Instruction::BitCast; - else if (SrcSize < DstSize) + if (SrcSize < DstSize) return firstOp; - return secondOp; + if (SrcSize > DstSize) + return secondOp; + return 0; } case 9: // zext, sext -> zext, because sext can't sign extend after zext @@ -4447,7 +4477,7 @@ void SwitchInstProfUpdateWrapper::addCase( Weights.getValue()[SI.getNumSuccessors() - 1] = *W; } else if (Weights) { Changed = true; - Weights.getValue().push_back(W.getValueOr(0)); + Weights.getValue().push_back(W.value_or(0)); } if (Weights) assert(SI.getNumSuccessors() == Weights->size() && @@ -4467,7 +4497,7 @@ SwitchInstProfUpdateWrapper::CaseWeightOpt SwitchInstProfUpdateWrapper::getSuccessorWeight(unsigned idx) { if (!Weights) return None; - return Weights.getValue()[idx]; + return (*Weights)[idx]; } void SwitchInstProfUpdateWrapper::setSuccessorWeight( @@ -4479,7 +4509,7 @@ void SwitchInstProfUpdateWrapper::setSuccessorWeight( Weights = SmallVector(SI.getNumSuccessors(), 0); if (Weights) { - auto &OldW = Weights.getValue()[idx]; + auto &OldW = (*Weights)[idx]; if (*W != OldW) { Changed = true; OldW = *W; diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index e27758c5de02..b132a9dcb812 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -236,8 +236,8 @@ bool ConstrainedFPIntrinsic::isDefaultFPEnvironment() const { return true; } -FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const { - Metadata *MD = cast(getArgOperand(2))->getMetadata(); +static FCmpInst::Predicate getFPPredicateFromMD(const Value *Op) { + Metadata *MD = cast(Op)->getMetadata(); if (!MD || !isa(MD)) return FCmpInst::BAD_FCMP_PREDICATE; return StringSwitch(cast(MD)->getString()) @@ -258,6 +258,10 @@ FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const { .Default(FCmpInst::BAD_FCMP_PREDICATE); } +FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const { + return getFPPredicateFromMD(getArgOperand(2)); +} + bool ConstrainedFPIntrinsic::isUnaryOp() const { switch (getIntrinsicID()) { default: @@ -299,13 +303,18 @@ ElementCount VPIntrinsic::getStaticVectorLength() const { }; Value *VPMask = getMaskParam(); - assert(VPMask && "No mask param?"); + if (!VPMask) { + assert((getIntrinsicID() == Intrinsic::vp_merge || + getIntrinsicID() == Intrinsic::vp_select) && + "Unexpected VP intrinsic without mask operand"); + return GetVectorLengthOfType(getType()); + } return GetVectorLengthOfType(VPMask->getType()); } Value *VPIntrinsic::getMaskParam() const { if (auto MaskPos = getMaskParamPos(getIntrinsicID())) - return getArgOperand(MaskPos.getValue()); + return getArgOperand(*MaskPos); return nullptr; } @@ -316,7 +325,7 @@ void VPIntrinsic::setMaskParam(Value *NewMask) { Value *VPIntrinsic::getVectorLengthParam() const { if (auto EVLPos = getVectorLengthParamPos(getIntrinsicID())) - return 
getArgOperand(EVLPos.getValue()); + return getArgOperand(*EVLPos); return nullptr; } @@ -354,7 +363,7 @@ VPIntrinsic::getVectorLengthParamPos(Intrinsic::ID IntrinsicID) { /// scatter. MaybeAlign VPIntrinsic::getPointerAlignment() const { Optional PtrParamOpt = getMemoryPointerParamPos(getIntrinsicID()); - assert(PtrParamOpt.hasValue() && "no pointer argument!"); + assert(PtrParamOpt && "no pointer argument!"); return getParamAlign(PtrParamOpt.getValue()); } @@ -380,7 +389,7 @@ Optional VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { /// \return The data (payload) operand of this store or scatter. Value *VPIntrinsic::getMemoryDataParam() const { auto DataParamOpt = getMemoryDataParamPos(getIntrinsicID()); - if (!DataParamOpt.hasValue()) + if (!DataParamOpt) return nullptr; return getArgOperand(DataParamOpt.getValue()); } @@ -492,6 +501,20 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); break; } + case Intrinsic::vp_trunc: + case Intrinsic::vp_sext: + case Intrinsic::vp_zext: + case Intrinsic::vp_fptoui: + case Intrinsic::vp_fptosi: + case Intrinsic::vp_uitofp: + case Intrinsic::vp_sitofp: + case Intrinsic::vp_fptrunc: + case Intrinsic::vp_fpext: + case Intrinsic::vp_ptrtoint: + case Intrinsic::vp_inttoptr: + VPFunc = + Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()}); + break; case Intrinsic::vp_merge: case Intrinsic::vp_select: VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); @@ -500,6 +523,10 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, VPFunc = Intrinsic::getDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; + case Intrinsic::experimental_vp_strided_load: + VPFunc = Intrinsic::getDeclaration( + M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); + break; case Intrinsic::vp_gather: VPFunc = Intrinsic::getDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); @@ -508,6 +535,11 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, VPFunc = Intrinsic::getDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; + case Intrinsic::experimental_vp_strided_store: + VPFunc = Intrinsic::getDeclaration( + M, VPID, + {Params[0]->getType(), Params[1]->getType(), Params[2]->getType()}); + break; case Intrinsic::vp_scatter: VPFunc = Intrinsic::getDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); @@ -529,6 +561,67 @@ bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { return false; } +bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { + switch (ID) { + default: + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_CASTOP return true; +#define END_REGISTER_VP_INTRINSIC(VPID) break; +#include "llvm/IR/VPIntrinsics.def" + } + return false; +} + +bool VPCmpIntrinsic::isVPCmp(Intrinsic::ID ID) { + switch (ID) { + default: + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_CMP(CCPOS, ...) 
return true; +#define END_REGISTER_VP_INTRINSIC(VPID) break; +#include "llvm/IR/VPIntrinsics.def" + } + return false; +} + +static ICmpInst::Predicate getIntPredicateFromMD(const Value *Op) { + Metadata *MD = cast(Op)->getMetadata(); + if (!MD || !isa(MD)) + return ICmpInst::BAD_ICMP_PREDICATE; + return StringSwitch(cast(MD)->getString()) + .Case("eq", ICmpInst::ICMP_EQ) + .Case("ne", ICmpInst::ICMP_NE) + .Case("ugt", ICmpInst::ICMP_UGT) + .Case("uge", ICmpInst::ICMP_UGE) + .Case("ult", ICmpInst::ICMP_ULT) + .Case("ule", ICmpInst::ICMP_ULE) + .Case("sgt", ICmpInst::ICMP_SGT) + .Case("sge", ICmpInst::ICMP_SGE) + .Case("slt", ICmpInst::ICMP_SLT) + .Case("sle", ICmpInst::ICMP_SLE) + .Default(ICmpInst::BAD_ICMP_PREDICATE); +} + +CmpInst::Predicate VPCmpIntrinsic::getPredicate() const { + bool IsFP = true; + Optional CCArgIdx; + switch (getIntrinsicID()) { + default: + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_CMP(CCPOS, ISFP) \ + CCArgIdx = CCPOS; \ + IsFP = ISFP; \ + break; +#define END_REGISTER_VP_INTRINSIC(VPID) break; +#include "llvm/IR/VPIntrinsics.def" + } + assert(CCArgIdx && "Unexpected vector-predicated comparison"); + return IsFP ? getFPPredicateFromMD(getArgOperand(*CCArgIdx)) + : getIntPredicateFromMD(getArgOperand(*CCArgIdx)); +} + unsigned VPReductionIntrinsic::getVectorParamPos() const { return *VPReductionIntrinsic::getVectorParamPos(getIntrinsicID()); } diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index e19ead98a616..4a1d5d3dcdf6 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -82,6 +82,11 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { "clang.arc.attachedcall operand bundle id drifted!"); (void)ClangAttachedCall; + auto *PtrauthEntry = pImpl->getOrInsertBundleTag("ptrauth"); + assert(PtrauthEntry->second == LLVMContext::OB_ptrauth && + "ptrauth operand bundle id drifted!"); + (void)PtrauthEntry; + SyncScope::ID SingleThreadSSID = pImpl->getOrInsertSyncScopeID("singlethread"); assert(SingleThreadSSID == SyncScope::SingleThread && @@ -133,13 +138,25 @@ bool LLVMContext::getDiagnosticsHotnessRequested() const { void LLVMContext::setDiagnosticsHotnessThreshold(Optional Threshold) { pImpl->DiagnosticsHotnessThreshold = Threshold; } - +void LLVMContext::setMisExpectWarningRequested(bool Requested) { + pImpl->MisExpectWarningRequested = Requested; +} +bool LLVMContext::getMisExpectWarningRequested() const { + return pImpl->MisExpectWarningRequested; +} uint64_t LLVMContext::getDiagnosticsHotnessThreshold() const { - return pImpl->DiagnosticsHotnessThreshold.getValueOr(UINT64_MAX); + return pImpl->DiagnosticsHotnessThreshold.value_or(UINT64_MAX); +} +void LLVMContext::setDiagnosticsMisExpectTolerance( + Optional Tolerance) { + pImpl->DiagnosticsMisExpectTolerance = Tolerance; +} +uint64_t LLVMContext::getDiagnosticsMisExpectTolerance() const { + return pImpl->DiagnosticsMisExpectTolerance.value_or(0); } bool LLVMContext::isDiagnosticsHotnessThresholdSetFromPSI() const { - return !pImpl->DiagnosticsHotnessThreshold.hasValue(); + return !pImpl->DiagnosticsHotnessThreshold.has_value(); } remarks::RemarkStreamer *LLVMContext::getMainRemarkStreamer() { @@ -346,12 +363,18 @@ std::unique_ptr LLVMContext::getDiagnosticHandler() { return std::move(pImpl->DiagHandler); } -void LLVMContext::enableOpaquePointers() const { - assert(pImpl->PointerTypes.empty() && pImpl->ASPointerTypes.empty() && - "Must be called before creating any pointer types"); - 
pImpl->setOpaquePointers(true); +bool LLVMContext::hasSetOpaquePointersValue() const { + return pImpl->hasOpaquePointersValue(); +} + +void LLVMContext::setOpaquePointers(bool Enable) const { + pImpl->setOpaquePointers(Enable); } bool LLVMContext::supportsTypedPointers() const { return !pImpl->getOpaquePointers(); } + +Any &LLVMContext::getTargetData() const { + return pImpl->TargetDataStorage; +} diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 8f9530290459..06b3a3afef9d 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -36,7 +36,7 @@ using namespace llvm; static cl::opt OpaquePointersCL("opaque-pointers", cl::desc("Use opaque pointers"), - cl::init(false)); + cl::init(true)); LLVMContextImpl::LLVMContextImpl(LLVMContext &C) : DiagHandler(std::make_unique()), @@ -47,7 +47,11 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID), X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8), - Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {} + Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) { + if (OpaquePointersCL.getNumOccurrences()) { + OpaquePointers = OpaquePointersCL; + } +} LLVMContextImpl::~LLVMContextImpl() { // NOTE: We need to delete the contents of OwnedModules, but Module's dtor @@ -245,10 +249,18 @@ void LLVMContextImpl::setOptPassGate(OptPassGate& OPG) { this->OPG = &OPG; } +bool LLVMContextImpl::hasOpaquePointersValue() { + return OpaquePointers.has_value(); +} + bool LLVMContextImpl::getOpaquePointers() { - if (LLVM_UNLIKELY(!(OpaquePointers.hasValue()))) + if (LLVM_UNLIKELY(!OpaquePointers)) OpaquePointers = OpaquePointersCL; return *OpaquePointers; } -void LLVMContextImpl::setOpaquePointers(bool OP) { OpaquePointers = OP; } +void LLVMContextImpl::setOpaquePointers(bool OP) { + assert((!OpaquePointers || OpaquePointers.getValue() == OP) && + "Cannot change opaque pointers mode once set"); + OpaquePointers = OP; +} diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 70242f4d8f20..47add940f603 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -17,6 +17,7 @@ #include "ConstantsContext.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/Any.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" @@ -686,7 +687,7 @@ template <> struct MDNodeKeyImpl { unsigned getHashValue() const { return hash_combine(Filename, Directory, Checksum ? Checksum->Kind : 0, Checksum ? 
Checksum->Value : nullptr, - Source.getValueOr(nullptr)); + Source.value_or(nullptr)); } }; @@ -709,6 +710,7 @@ template <> struct MDNodeKeyImpl { Metadata *RetainedNodes; Metadata *ThrownTypes; Metadata *Annotations; + MDString *TargetFuncName; MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, @@ -716,14 +718,15 @@ unsigned VirtualIndex, int ThisAdjustment, unsigned Flags, unsigned SPFlags, Metadata *Unit, Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, - Metadata *ThrownTypes, Metadata *Annotations) + Metadata *ThrownTypes, Metadata *Annotations, + MDString *TargetFuncName) : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), Line(Line), Type(Type), ScopeLine(ScopeLine), ContainingType(ContainingType), VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags), Unit(Unit), TemplateParams(TemplateParams), Declaration(Declaration), RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes), - Annotations(Annotations) {} + Annotations(Annotations), TargetFuncName(TargetFuncName) {} MDNodeKeyImpl(const DISubprogram *N) : Scope(N->getRawScope()), Name(N->getRawName()), LinkageName(N->getRawLinkageName()), File(N->getRawFile()), @@ -736,7 +739,8 @@ Declaration(N->getRawDeclaration()), RetainedNodes(N->getRawRetainedNodes()), ThrownTypes(N->getRawThrownTypes()), - Annotations(N->getRawAnnotations()) {} + Annotations(N->getRawAnnotations()), + TargetFuncName(N->getRawTargetFuncName()) {} bool isKeyOf(const DISubprogram *RHS) const { return Scope == RHS->getRawScope() && Name == RHS->getRawName() && @@ -752,7 +756,8 @@ Declaration == RHS->getRawDeclaration() && RetainedNodes == RHS->getRawRetainedNodes() && ThrownTypes == RHS->getRawThrownTypes() && - Annotations == RHS->getRawAnnotations(); + Annotations == RHS->getRawAnnotations() && + TargetFuncName == RHS->getRawTargetFuncName(); } bool isDefinition() const { return SPFlags & DISubprogram::SPFlagDefinition; } @@ -1380,12 +1385,19 @@ public: /// If threshold option is not specified, it is disabled (0) by default. Optional DiagnosticsHotnessThreshold = 0; + /// The percentage of difference between profiling branch weights and + /// llvm.expect branch weights to tolerate when emitting MisExpect diagnostics + Optional DiagnosticsMisExpectTolerance = 0; + bool MisExpectWarningRequested = false; + /// The specialized remark streamer used by LLVM's OptimizationRemarkEmitter. std::unique_ptr LLVMRS; LLVMContext::YieldCallbackTy YieldCallback = nullptr; void *YieldOpaqueHandle = nullptr; + DenseMap ValueNames; + using IntMapTy = DenseMap, DenseMapAPIntKeyInfo>; IntMapTy IntConstants; @@ -1402,8 +1414,6 @@ public: DenseMap ValuesAsMetadata; DenseMap MetadataAsValues; - DenseMap ValueNames; - #define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ DenseSet CLASS##s; #include "llvm/IR/Metadata.def" @@ -1450,14 +1460,14 @@ public: ConstantInt *TheTrueVal = nullptr; ConstantInt *TheFalseVal = nullptr; - std::unique_ptr TheNoneToken; - // Basic type instances.
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy, TokenTy; Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; + std::unique_ptr TheNoneToken; + BumpPtrAllocator Alloc; UniqueStringSaver Saver{Alloc}; @@ -1493,6 +1503,9 @@ public: /// Collection of per-GlobalValue partitions used in this context. DenseMap GlobalValuePartitions; + DenseMap + GlobalValueSanitizerMetadata; + /// DiscriminatorTable - This table maps file:line locations to an /// integer representing the next DWARF path discriminator to assign to /// instructions in different blocks at the same location. @@ -1555,8 +1568,11 @@ public: // TODO: clean up the following after we no longer support non-opaque pointer // types. bool getOpaquePointers(); + bool hasOpaquePointersValue(); void setOpaquePointers(bool OP); + llvm::Any TargetDataStorage; + private: Optional OpaquePointers; }; diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 08cf909a83f9..ef3465177647 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -29,10 +29,6 @@ #include "llvm/Support/raw_ostream.h" #include -#ifdef EXPENSIVE_CHECKS -#include "llvm/IR/StructuralHash.h" -#endif - using namespace llvm; // See PassManagers.h for Pass Manager infrastructure overview. @@ -1429,12 +1425,12 @@ bool FPPassManager::runOnFunction(Function &F) { PassManagerPrettyStackEntry X(FP, F); TimeRegion PassTimer(getPassTimer(FP)); #ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(F); + uint64_t RefHash = FP->structuralHash(F); #endif LocalChanged |= FP->runOnFunction(F); #if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG) - if (!LocalChanged && (RefHash != StructuralHash(F))) { + if (!LocalChanged && (RefHash != FP->structuralHash(F))) { llvm::errs() << "Pass modifies its input and doesn't report it: " << FP->getPassName() << "\n"; llvm_unreachable("Pass modifies its input and doesn't report it"); @@ -1543,13 +1539,13 @@ MPPassManager::runOnModule(Module &M) { TimeRegion PassTimer(getPassTimer(MP)); #ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(M); + uint64_t RefHash = MP->structuralHash(M); #endif LocalChanged |= MP->runOnModule(M); #ifdef EXPENSIVE_CHECKS - assert((LocalChanged || (RefHash == StructuralHash(M))) && + assert((LocalChanged || (RefHash == MP->structuralHash(M))) && "Pass modifies its input and doesn't report it."); #endif @@ -1767,4 +1763,4 @@ void FunctionPass::assignPassManager(PMStack &PMS, PM->add(this); } -legacy::PassManagerBase::~PassManagerBase() {} +legacy::PassManagerBase::~PassManagerBase() = default; diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp index 35af8490287b..fc59fda9fe22 100644 --- a/llvm/lib/IR/MDBuilder.cpp +++ b/llvm/lib/IR/MDBuilder.cpp @@ -150,6 +150,14 @@ MDNode *MDBuilder::mergeCallbackEncodings(MDNode *ExistingCallbacks, return MDNode::get(Context, Ops); } +MDNode *MDBuilder::createRTTIPointerPrologue(Constant *PrologueSig, + Constant *RTTI) { + SmallVector Ops; + Ops.push_back(createConstant(PrologueSig)); + Ops.push_back(createConstant(RTTI)); + return MDNode::get(Context, Ops); +} + MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) { SmallVector Args(1, nullptr); if (Extra) diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp index 2399ea27ee9d..b8e3e40e4c1d 100644 --- a/llvm/lib/IR/Mangler.cpp +++ b/llvm/lib/IR/Mangler.cpp @@ -144,7 +144,7 @@ void Mangler::getNameWithPrefix(raw_ostream &OS, const 
GlobalValue *GV, // Mangle functions with Microsoft calling conventions specially. Only do // this mangling for x86_64 vectorcall and 32-bit x86. - const Function *MSFunc = dyn_cast(GV); + const Function *MSFunc = dyn_cast_or_null(GV->getAliaseeObject()); // Don't add byte count suffixes when '\01' or '?' are in the first // character. diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 226718ecac28..ae2401026ebf 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -245,6 +245,36 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New, "Reference without owner must be direct"); } +void ReplaceableMetadataImpl::SalvageDebugInfo(const Constant &C) { + if (!C.isUsedByMetadata()) { + return; + } + + LLVMContext &Context = C.getType()->getContext(); + auto &Store = Context.pImpl->ValuesAsMetadata; + auto I = Store.find(&C); + ValueAsMetadata *MD = I->second; + using UseTy = + std::pair>; + // Copy out uses and update value of Constant used by debug info metadata with undef below + SmallVector Uses(MD->UseMap.begin(), MD->UseMap.end()); + + for (const auto &Pair : Uses) { + MetadataTracking::OwnerTy Owner = Pair.second.first; + if (!Owner) + continue; + if (!Owner.is()) + continue; + auto *OwnerMD = dyn_cast(Owner.get()); + if (!OwnerMD) + continue; + if (isa(OwnerMD)) { + OwnerMD->handleChangedOperand( + Pair.first, ValueAsMetadata::get(UndefValue::get(C.getType()))); + } + } +} + void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { if (UseMap.empty()) return; @@ -252,9 +282,7 @@ void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { // Copy out uses since UseMap will get touched below. using UseTy = std::pair>; SmallVector Uses(UseMap.begin(), UseMap.end()); - llvm::sort(Uses, [](const UseTy &L, const UseTy &R) { - return L.second.second < R.second.second; - }); + llvm::sort(Uses, llvm::less_second()); for (const auto &Pair : Uses) { // Check that this Ref hasn't disappeared after RAUW (when updating a // previous Ref). @@ -493,35 +521,26 @@ StringRef MDString::getString() const { "Alignment is insufficient after objects prepended to " #CLASS); #include "llvm/IR/Metadata.def" -void *MDNode::operator new(size_t Size, unsigned NumOps) { - size_t OpSize = NumOps * sizeof(MDOperand); +void *MDNode::operator new(size_t Size, size_t NumOps, StorageType Storage) { // uint64_t is the most aligned type we need support (ensured by static_assert // above) - OpSize = alignTo(OpSize, alignof(uint64_t)); - void *Ptr = reinterpret_cast(::operator new(OpSize + Size)) + OpSize; - MDOperand *O = static_cast(Ptr); - for (MDOperand *E = O - NumOps; O != E; --O) - (void)new (O - 1) MDOperand; - return Ptr; + size_t AllocSize = + alignTo(Header::getAllocSize(Storage, NumOps), alignof(uint64_t)); + char *Mem = reinterpret_cast(::operator new(AllocSize + Size)); + Header *H = new (Mem + AllocSize - sizeof(Header)) Header(NumOps, Storage); + return reinterpret_cast(H + 1); } -// Repress memory sanitization, due to use-after-destroy by operator -// delete. Bug report 24578 identifies this issue. -LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void MDNode::operator delete(void *Mem) { - MDNode *N = static_cast(Mem); - size_t OpSize = N->NumOperands * sizeof(MDOperand); - OpSize = alignTo(OpSize, alignof(uint64_t)); - - MDOperand *O = static_cast(Mem); - for (MDOperand *E = O - N->NumOperands; O != E; --O) - (O - 1)->~MDOperand(); - ::operator delete(reinterpret_cast(Mem) - OpSize); +void MDNode::operator delete(void *N) { + Header *H = reinterpret_cast
(N) - 1; + void *Mem = H->getAllocation(); + H->~Header(); + ::operator delete(Mem); } MDNode::MDNode(LLVMContext &Context, unsigned ID, StorageType Storage, ArrayRef Ops1, ArrayRef Ops2) - : Metadata(ID, Storage), NumOperands(Ops1.size() + Ops2.size()), - NumUnresolved(0), Context(Context) { + : Metadata(ID, Storage), Context(Context) { unsigned Op = 0; for (Metadata *MD : Ops1) setOperand(Op++, MD); @@ -547,6 +566,87 @@ TempMDNode MDNode::clone() const { } } +MDNode::Header::Header(size_t NumOps, StorageType Storage) { + IsLarge = isLarge(NumOps); + IsResizable = isResizable(Storage); + SmallSize = getSmallSize(NumOps, IsResizable, IsLarge); + if (IsLarge) { + SmallNumOps = 0; + new (getLargePtr()) LargeStorageVector(); + getLarge().resize(NumOps); + return; + } + SmallNumOps = NumOps; + MDOperand *O = reinterpret_cast(this) - SmallSize; + for (MDOperand *E = O + SmallSize; O != E;) + (void)new (O++) MDOperand(); +} + +MDNode::Header::~Header() { + if (IsLarge) { + getLarge().~LargeStorageVector(); + return; + } + MDOperand *O = reinterpret_cast(this); + for (MDOperand *E = O - SmallSize; O != E; --O) + (void)(O - 1)->~MDOperand(); +} + +void *MDNode::Header::getLargePtr() const { + static_assert(alignof(LargeStorageVector) <= alignof(Header), + "LargeStorageVector too strongly aligned"); + return reinterpret_cast(const_cast
(this)) - + sizeof(LargeStorageVector); +} + +void *MDNode::Header::getSmallPtr() { + static_assert(alignof(MDOperand) <= alignof(Header), + "MDOperand too strongly aligned"); + return reinterpret_cast(const_cast
(this)) - + sizeof(MDOperand) * SmallSize; +} + +void MDNode::Header::resize(size_t NumOps) { + assert(IsResizable && "Node is not resizable"); + if (operands().size() == NumOps) + return; + + if (IsLarge) + getLarge().resize(NumOps); + else if (NumOps <= SmallSize) + resizeSmall(NumOps); + else + resizeSmallToLarge(NumOps); +} + +void MDNode::Header::resizeSmall(size_t NumOps) { + assert(!IsLarge && "Expected a small MDNode"); + assert(NumOps <= SmallSize && "NumOps too large for small resize"); + + MutableArrayRef ExistingOps = operands(); + assert(NumOps != ExistingOps.size() && "Expected a different size"); + + int NumNew = (int)NumOps - (int)ExistingOps.size(); + MDOperand *O = ExistingOps.end(); + for (int I = 0, E = NumNew; I < E; ++I) + (O++)->reset(); + for (int I = 0, E = NumNew; I > E; --I) + (--O)->reset(); + SmallNumOps = NumOps; + assert(O == operands().end() && "Operands not (un)initialized until the end"); +} + +void MDNode::Header::resizeSmallToLarge(size_t NumOps) { + assert(!IsLarge && "Expected a small MDNode"); + assert(NumOps > SmallSize && "Expected NumOps to be larger than allocation"); + LargeStorageVector NewOps; + NewOps.resize(NumOps); + llvm::move(operands(), NewOps.begin()); + resizeSmall(0); + new (getLargePtr()) LargeStorageVector(std::move(NewOps)); + IsLarge = true; +} + static bool isOperandUnresolved(Metadata *Op) { if (auto *N = dyn_cast_or_null(Op)) return !N->isResolved(); @@ -554,9 +654,9 @@ static bool isOperandUnresolved(Metadata *Op) { } void MDNode::countUnresolvedOperands() { - assert(NumUnresolved == 0 && "Expected unresolved ops to be uncounted"); + assert(getNumUnresolved() == 0 && "Expected unresolved ops to be uncounted"); assert(isUniqued() && "Expected this to be uniqued"); - NumUnresolved = count_if(operands(), isOperandUnresolved); + setNumUnresolved(count_if(operands(), isOperandUnresolved)); } void MDNode::makeUniqued() { @@ -570,7 +670,7 @@ void MDNode::makeUniqued() { // Make this 'uniqued'. Storage = Uniqued; countUnresolvedOperands(); - if (!NumUnresolved) { + if (!getNumUnresolved()) { dropReplaceableUses(); assert(isResolved() && "Expected this to be resolved"); } @@ -594,14 +694,14 @@ void MDNode::resolve() { assert(isUniqued() && "Expected this to be uniqued"); assert(!isResolved() && "Expected this to be unresolved"); - NumUnresolved = 0; + setNumUnresolved(0); dropReplaceableUses(); assert(isResolved() && "Expected this to be resolved"); } void MDNode::dropReplaceableUses() { - assert(!NumUnresolved && "Unexpected unresolved operand"); + assert(!getNumUnresolved() && "Unexpected unresolved operand"); // Drop any RAUW support. if (Context.hasReplaceableUses()) @@ -610,13 +710,13 @@ void MDNode::dropReplaceableUses() { void MDNode::resolveAfterOperandChange(Metadata *Old, Metadata *New) { assert(isUniqued() && "Expected this to be uniqued"); - assert(NumUnresolved != 0 && "Expected unresolved operands"); + assert(getNumUnresolved() != 0 && "Expected unresolved operands"); // Check if an operand was resolved. if (!isOperandUnresolved(Old)) { if (isOperandUnresolved(New)) // An operand was un-resolved! - ++NumUnresolved; + setNumUnresolved(getNumUnresolved() + 1); } else if (!isOperandUnresolved(New)) decrementUnresolvedOperandCount(); } @@ -627,7 +727,8 @@ void MDNode::decrementUnresolvedOperandCount() { return; assert(isUniqued() && "Expected this to be uniqued"); - if (--NumUnresolved) + setNumUnresolved(getNumUnresolved() - 1); + if (getNumUnresolved()) return; // Last unresolved operand has just been resolved. 
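The operator new/delete rewrite above moves to a co-allocation scheme: the operand list is laid out in front of a per-node Header, which itself sits immediately before the MDNode. Below is a minimal standalone sketch of that prefix-allocation pattern (an editorial addition, not code from this patch: DemoHeader and DemoNode are invented names, alignment handling is omitted, and the real Header additionally supports a resizable "large" vector mode):

#include <cstddef>
#include <new>

// Layout: [ Op[0] ... Op[NumOps-1] ][ DemoHeader ][ DemoNode ]
struct DemoHeader {
  unsigned NumOps;
};

struct DemoNode {
  // Allocate operand slots plus a header in front of the node itself.
  static void *operator new(std::size_t Size, unsigned NumOps) {
    std::size_t Prefix = NumOps * sizeof(int) + sizeof(DemoHeader);
    char *Mem = static_cast<char *>(::operator new(Prefix + Size));
    auto *H = new (Mem + NumOps * sizeof(int)) DemoHeader{NumOps};
    return H + 1; // The node lives immediately after its header.
  }
  // Recover the start of the allocation from the node address.
  static void operator delete(void *Ptr) {
    auto *H = static_cast<DemoHeader *>(Ptr) - 1;
    ::operator delete(reinterpret_cast<char *>(H) - H->NumOps * sizeof(int));
  }
  DemoHeader &header() { return *(reinterpret_cast<DemoHeader *>(this) - 1); }
  int *op_begin() { return reinterpret_cast<int *>(&header()) - header().NumOps; }
};

// Usage: DemoNode *N = new (4u) DemoNode; N->op_begin()[0] = 42; delete N;

The payoff, as in the patch, is a single heap allocation per node, while the header can later migrate the operands to separate storage without relocating the node itself.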
@@ -702,7 +803,7 @@ void MDTuple::recalculateHash() { } void MDNode::dropAllReferences() { - for (unsigned I = 0, E = NumOperands; I != E; ++I) + for (unsigned I = 0, E = getNumOperands(); I != E; ++I) setOperand(I, nullptr); if (Context.hasReplaceableUses()) { Context.getReplaceableUses()->resolveAllUses(/* ResolveUsers */ false); @@ -838,7 +939,8 @@ MDTuple *MDTuple::getImpl(LLVMContext &Context, ArrayRef MDs, assert(ShouldCreate && "Expected non-uniqued nodes to always be created"); } - return storeImpl(new (MDs.size()) MDTuple(Context, Storage, Hash, MDs), + return storeImpl(new (MDs.size(), Storage) + MDTuple(Context, Storage, Hash, MDs), Storage, Context.pImpl->MDTuples); } @@ -850,7 +952,7 @@ void MDNode::deleteTemporary(MDNode *N) { void MDNode::storeDistinctInContext() { assert(!Context.hasReplaceableUses() && "Unexpected replaceable uses"); - assert(!NumUnresolved && "Unexpected unresolved nodes"); + assert(!getNumUnresolved() && "Unexpected unresolved nodes"); Storage = Distinct; assert(isResolved() && "Expected this to be resolved"); @@ -883,7 +985,7 @@ void MDNode::replaceOperandWith(unsigned I, Metadata *New) { } void MDNode::setOperand(unsigned I, Metadata *New) { - assert(I < NumOperands); + assert(I < getNumOperands()); mutable_begin()[I].reset(New, isUniqued() ? this : nullptr); } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 4974b372db2a..5cd74d53da75 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -71,8 +71,7 @@ template class llvm::SymbolTableListTraits; Module::Module(StringRef MID, LLVMContext &C) : Context(C), ValSymTab(std::make_unique(-1)), - Materializer(), ModuleID(std::string(MID)), - SourceFileName(std::string(MID)), DL("") { + ModuleID(std::string(MID)), SourceFileName(std::string(MID)), DL("") { Context.addModule(this); } @@ -671,12 +670,15 @@ void Module::setRtLibUseGOT() { addModuleFlag(ModFlagBehavior::Max, "RtLibUseGOT", 1); } -bool Module::getUwtable() const { - auto *Val = cast_or_null(getModuleFlag("uwtable")); - return Val && (cast(Val->getValue())->getZExtValue() > 0); +UWTableKind Module::getUwtable() const { + if (auto *Val = cast_or_null(getModuleFlag("uwtable"))) + return UWTableKind(cast(Val->getValue())->getZExtValue()); + return UWTableKind::None; } -void Module::setUwtable() { addModuleFlag(ModFlagBehavior::Max, "uwtable", 1); } +void Module::setUwtable(UWTableKind Kind) { + addModuleFlag(ModFlagBehavior::Max, "uwtable", uint32_t(Kind)); +} FramePointerKind Module::getFramePointer() const { auto *Val = cast_or_null(getModuleFlag("frame-pointer")); @@ -734,7 +736,7 @@ void Module::setOverrideStackAlignment(unsigned Align) { addModuleFlag(ModFlagBehavior::Error, "override-stack-alignment", Align); } -void Module::setSDKVersion(const VersionTuple &V) { +static void addSDKVersionMD(const VersionTuple &V, Module &M, StringRef Name) { SmallVector Entries; Entries.push_back(V.getMajor()); if (auto Minor = V.getMinor()) { @@ -744,8 +746,12 @@ void Module::setSDKVersion(const VersionTuple &V) { // Ignore the 'build' component as it can't be represented in the object // file. 
} - addModuleFlag(ModFlagBehavior::Warning, "SDK Version", - ConstantDataArray::get(Context, Entries)); + M.addModuleFlag(Module::ModFlagBehavior::Warning, Name, + ConstantDataArray::get(M.getContext(), Entries)); +} + +void Module::setSDKVersion(const VersionTuple &V) { + addSDKVersionMD(V, *this, "SDK Version"); } static VersionTuple getSDKVersionMD(Metadata *MD) { @@ -818,6 +824,15 @@ StringRef Module::getDarwinTargetVariantTriple() const { return ""; } +void Module::setDarwinTargetVariantTriple(StringRef T) { + addModuleFlag(ModFlagBehavior::Override, "darwin.target_variant.triple", + MDString::get(getContext(), T)); +} + VersionTuple Module::getDarwinTargetVariantSDKVersion() const { return getSDKVersionMD(getModuleFlag("darwin.target_variant.SDK Version")); } + +void Module::setDarwinTargetVariantSDKVersion(VersionTuple Version) { + addSDKVersionMD(Version, *this, "darwin.target_variant.SDK Version"); +} diff --git a/llvm/lib/IR/Pass.cpp b/llvm/lib/IR/Pass.cpp index 755ea57c63fd..fe0bfd81a81e 100644 --- a/llvm/lib/IR/Pass.cpp +++ b/llvm/lib/IR/Pass.cpp @@ -27,6 +27,10 @@ #include "llvm/Support/raw_ostream.h" #include +#ifdef EXPENSIVE_CHECKS +#include "llvm/IR/StructuralHash.h" +#endif + using namespace llvm; #define DEBUG_TYPE "ir" @@ -133,6 +137,12 @@ LLVM_DUMP_METHOD void Pass::dump() const { } #endif +#ifdef EXPENSIVE_CHECKS +uint64_t Pass::structuralHash(Module &M) const { return StructuralHash(M); } + +uint64_t Pass::structuralHash(Function &F) const { return StructuralHash(F); } +#endif + //===----------------------------------------------------------------------===// // ImmutablePass Implementation // diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp index d2f676192e7f..069da26e63b1 100644 --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueMap.h" diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp index d8634e0ac7dd..5d3fa28f7d0a 100644 --- a/llvm/lib/IR/SafepointIRVerifier.cpp +++ b/llvm/lib/IR/SafepointIRVerifier.cpp @@ -357,6 +357,17 @@ static enum BaseType getBaseType(const Value *Val) { Worklist.push_back(SI->getFalseValue()); continue; } + if (const auto *GCRelocate = dyn_cast(V)) { + // GCRelocates do not change null-ness or constant-ness of the value. + // So we can continue with derived pointer this instruction relocates. + Worklist.push_back(GCRelocate->getDerivedPtr()); + continue; + } + if (const auto *FI = dyn_cast(V)) { + // Freeze does not change null-ness or constant-ness of the value. + Worklist.push_back(FI->getOperand(0)); + continue; + } if (isa(V)) { // We found at least one base pointer which is non-null, so this derived // pointer is not exclusively derived from null. 
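One behavioral note on the Module changes above: the "uwtable" module flag is upgraded from a plain boolean to a UWTableKind carried through getUwtable()/setUwtable(). A hedged usage sketch follows (an editorial addition, not code from this patch; it assumes the UWTableKind enum with None/Sync/Async from llvm/Support/CodeGen.h in this release):

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"

using namespace llvm;

void demoUwtable() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  // Request asynchronous (instruction-precise) unwind tables module-wide.
  M.setUwtable(UWTableKind::Async);
  // The flag is registered with Max merge semantics, so a later request
  // for the weaker UWTableKind::Sync cannot lower what is recorded here.
  if (M.getUwtable() != UWTableKind::None) {
    // For example, propagate the kind onto newly created functions.
  }
}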
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp index 601a9df5279e..99a89386d75f 100644 --- a/llvm/lib/IR/Use.cpp +++ b/llvm/lib/IR/Use.cpp @@ -11,10 +11,6 @@ namespace llvm { -class User; -template struct simplify_type; -class Value; - void Use::swap(Use &RHS) { if (Val == RHS.Val) return; diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 68489075cd88..637af7aaa245 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -18,8 +18,9 @@ class BasicBlock; // User Class //===----------------------------------------------------------------------===// -void User::replaceUsesOfWith(Value *From, Value *To) { - if (From == To) return; // Duh what? +bool User::replaceUsesOfWith(Value *From, Value *To) { + bool Changed = false; + if (From == To) return Changed; // Duh what? assert((!isa(this) || isa(this)) && "Cannot call User::replaceUsesOfWith on a constant!"); @@ -30,11 +31,16 @@ void User::replaceUsesOfWith(Value *From, Value *To) { // "To", adding "this" to the uses list of To, and // most importantly, removing "this" from the use list of "From". setOperand(i, To); + Changed = true; } if (auto DVI = dyn_cast_or_null(this)) { - if (is_contained(DVI->location_ops(), From)) + if (is_contained(DVI->location_ops(), From)) { DVI->replaceVariableLocationOp(From, To); + Changed = true; + } } + + return Changed; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 18aef37e2023..3990536f3da5 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -377,6 +376,7 @@ void Value::setName(const Twine &NewName) { } void Value::takeName(Value *V) { + assert(V != this && "Illegal call to this->takeName(this)!"); ValueSymbolTable *ST = nullptr; // If this value has a name, drop it. if (hasName()) { @@ -408,7 +408,7 @@ void Value::takeName(Value *V) { } } - // Get V's ST, this should always succed, because V has a name. + // Get V's ST, this should always succeed, because V has a name. ValueSymbolTable *VST; bool Failure = getSymTab(V, VST); assert(!Failure && "V has a name, so it should have a ST!"); (void)Failure; @@ -963,6 +963,9 @@ Align Value::getPointerAlignment(const DataLayout &DL) const { return Align(CI->getLimitedValue()); } } else if (auto *CstPtr = dyn_cast(this)) { + // Strip pointer casts to avoid creating unnecessary ptrtoint expression + // if the only "reduction" is combining a bitcast + ptrtoint. + CstPtr = CstPtr->stripPointerCasts(); if (auto *CstInt = dyn_cast_or_null(ConstantExpr::getPtrToInt( const_cast(CstPtr), DL.getIntPtrType(getType()), /*OnlyIfReduced=*/true))) { @@ -1017,20 +1020,16 @@ bool Value::isSwiftError() const { } bool Value::isTransitiveUsedByMetadataOnly() const { - if (use_empty()) - return false; - llvm::SmallVector WorkList; - llvm::SmallPtrSet Visited; - WorkList.insert(WorkList.begin(), user_begin(), user_end()); + SmallVector WorkList(user_begin(), user_end()); + SmallPtrSet Visited(user_begin(), user_end()); while (!WorkList.empty()) { const User *U = WorkList.pop_back_val(); - Visited.insert(U); // If it is transitively used by a global value or a non-constant value, // it's obviously not only used by metadata. 
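// (Editor's note, not in the original patch: Visited is now seeded with
// the direct users and each child is inserted before being pushed, so a
// user is enqueued at most once; previously a node reachable from several
// parents could sit on the worklist multiple times before its first visit.)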
if (!isa(U) || isa(U)) return false; for (const User *UU : U->users()) - if (!Visited.count(UU)) + if (Visited.insert(UU).second) WorkList.push_back(UU); } return true; diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp new file mode 100644 index 000000000000..e7be7a98a593 --- /dev/null +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -0,0 +1,103 @@ +//===- VectorBuilder.cpp - Builder for VP Intrinsics ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the VectorBuilder class, which is used as a convenient +// way to create VP intrinsics as if they were LLVM instructions with a +// consistent and simplified interface. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +namespace llvm { + +void VectorBuilder::handleError(const char *ErrorMsg) const { + if (ErrorHandling == Behavior::SilentlyReturnNone) + return; + report_fatal_error(ErrorMsg); +} + +Module &VectorBuilder::getModule() const { + return *Builder.GetInsertBlock()->getModule(); +} + +Value *VectorBuilder::getAllTrueMask() { + auto *BoolTy = Builder.getInt1Ty(); + auto *MaskTy = VectorType::get(BoolTy, StaticVectorLength); + return ConstantInt::getAllOnesValue(MaskTy); +} + +Value &VectorBuilder::requestMask() { + if (Mask) + return *Mask; + + return *getAllTrueMask(); +} + +Value &VectorBuilder::requestEVL() { + if (ExplicitVectorLength) + return *ExplicitVectorLength; + + assert(!StaticVectorLength.isScalable() && "TODO vscale lowering"); + auto *IntTy = Builder.getInt32Ty(); + return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue()); +} + +Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, + ArrayRef InstOpArray, + const Twine &Name) { + auto VPID = VPIntrinsic::getForOpcode(Opcode); + if (VPID == Intrinsic::not_intrinsic) + return returnWithError("No VPIntrinsic for this opcode"); + + auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID); + auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID); + size_t NumInstParams = InstOpArray.size(); + size_t NumVPParams = + NumInstParams + MaskPosOpt.has_value() + VLenPosOpt.has_value(); + + SmallVector IntrinParams; + + // Whether the mask and vlen parameter are at the end of the parameter list. + bool TrailingMaskAndVLen = + std::min(MaskPosOpt.value_or(NumInstParams), + VLenPosOpt.value_or(NumInstParams)) >= NumInstParams; + + if (TrailingMaskAndVLen) { + // Fast path for trailing mask, vector length. + IntrinParams.append(InstOpArray.begin(), InstOpArray.end()); + IntrinParams.resize(NumVPParams); + } else { + IntrinParams.resize(NumVPParams); + // Insert mask and evl operands in between the instruction operands. 
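+ // (Editorial example: with three instruction operands, a mask at VP
+ // position 1 and an EVL at position 4, NumVPParams is 5 and the loop
+ // below fills slots {0, 2, 3} from InstOpArray, leaving slots 1 and 4
+ // to be patched with the mask and EVL afterwards.)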
+ for (size_t VPParamIdx = 0, ParamIdx = 0; VPParamIdx < NumVPParams; + ++VPParamIdx) { + if ((MaskPosOpt && MaskPosOpt.value_or(NumVPParams) == VPParamIdx) || + (VLenPosOpt && VLenPosOpt.value_or(NumVPParams) == VPParamIdx)) + continue; + assert(ParamIdx < NumInstParams); + IntrinParams[VPParamIdx] = InstOpArray[ParamIdx++]; + } + } + + if (MaskPosOpt) + IntrinParams[*MaskPosOpt] = &requestMask(); + if (VLenPosOpt) + IntrinParams[*VLenPosOpt] = &requestEVL(); + + auto *VPDecl = VPIntrinsic::getDeclarationForParams(&getModule(), VPID, + ReturnTy, IntrinParams); + return Builder.CreateCall(VPDecl, IntrinParams, Name); +} + +} // namespace llvm diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 989d01e2e395..75d02f4c8c82 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -84,6 +84,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -100,7 +102,6 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -278,6 +279,12 @@ namespace { class Verifier : public InstVisitor, VerifierSupport { friend class InstVisitor; + // ISD::ArgFlagsTy::MemAlign only have 4 bits for alignment, so + // the alignment size should not exceed 2^15. Since encode(Align) + // would plus the shift value by 1, the alignment size should + // not exceed 2^14, otherwise it can NOT be properly lowered + // in backend. + static constexpr unsigned ParamMaxAlignment = 1 << 14; DominatorTree DT; /// When verifying a basic block, keep track of all of the @@ -465,6 +472,7 @@ private: void visitAnnotationMetadata(MDNode *Annotation); void visitAliasScopeMetadata(const MDNode *MD); void visitAliasScopeListMetadata(const MDNode *MD); + void visitAccessGroupMetadata(const MDNode *MD); template bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -521,6 +529,7 @@ private: void visitUserOp2(Instruction &I) { visitUserOp1(I); } void visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call); void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI); + void visitVPIntrinsic(VPIntrinsic &VPI); void visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII); void visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI); void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI); @@ -587,17 +596,27 @@ private: } // end anonymous namespace /// We know that cond should be true, if not print an error message. -#define Assert(C, ...) \ - do { if (!(C)) { CheckFailed(__VA_ARGS__); return; } } while (false) +#define Check(C, ...) \ + do { \ + if (!(C)) { \ + CheckFailed(__VA_ARGS__); \ + return; \ + } \ + } while (false) /// We know that a debug info condition should be true, if not print /// an error message. -#define AssertDI(C, ...) \ - do { if (!(C)) { DebugInfoCheckFailed(__VA_ARGS__); return; } } while (false) +#define CheckDI(C, ...) 
\ + do { \ + if (!(C)) { \ + DebugInfoCheckFailed(__VA_ARGS__); \ + return; \ + } \ + } while (false) void Verifier::visit(Instruction &I) { for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) - Assert(I.getOperand(i) != nullptr, "Operand is null", &I); + Check(I.getOperand(i) != nullptr, "Operand is null", &I); InstVisitor::visit(I); } @@ -620,43 +639,43 @@ static void forEachUser(const Value *User, } void Verifier::visitGlobalValue(const GlobalValue &GV) { - Assert(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(), - "Global is external, but doesn't have external or weak linkage!", &GV); + Check(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(), + "Global is external, but doesn't have external or weak linkage!", &GV); if (const GlobalObject *GO = dyn_cast(&GV)) { if (MaybeAlign A = GO->getAlign()) { - Assert(A->value() <= Value::MaximumAlignment, - "huge alignment values are unsupported", GO); + Check(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", GO); } } - Assert(!GV.hasAppendingLinkage() || isa(GV), - "Only global variables can have appending linkage!", &GV); + Check(!GV.hasAppendingLinkage() || isa(GV), + "Only global variables can have appending linkage!", &GV); if (GV.hasAppendingLinkage()) { const GlobalVariable *GVar = dyn_cast(&GV); - Assert(GVar && GVar->getValueType()->isArrayTy(), - "Only global arrays can have appending linkage!", GVar); + Check(GVar && GVar->getValueType()->isArrayTy(), + "Only global arrays can have appending linkage!", GVar); } if (GV.isDeclarationForLinker()) - Assert(!GV.hasComdat(), "Declaration may not be in a Comdat!", &GV); + Check(!GV.hasComdat(), "Declaration may not be in a Comdat!", &GV); if (GV.hasDLLImportStorageClass()) { - Assert(!GV.isDSOLocal(), - "GlobalValue with DLLImport Storage is dso_local!", &GV); + Check(!GV.isDSOLocal(), "GlobalValue with DLLImport Storage is dso_local!", + &GV); - Assert((GV.isDeclaration() && - (GV.hasExternalLinkage() || GV.hasExternalWeakLinkage())) || - GV.hasAvailableExternallyLinkage(), - "Global is marked as dllimport, but not external", &GV); + Check((GV.isDeclaration() && + (GV.hasExternalLinkage() || GV.hasExternalWeakLinkage())) || + GV.hasAvailableExternallyLinkage(), + "Global is marked as dllimport, but not external", &GV); } if (GV.isImplicitDSOLocal()) - Assert(GV.isDSOLocal(), - "GlobalValue with local linkage or non-default " - "visibility must be dso_local!", - &GV); + Check(GV.isDSOLocal(), + "GlobalValue with local linkage or non-default " + "visibility must be dso_local!", + &GV); forEachUser(&GV, GlobalValueVisited, [&](const Value *V) -> bool { if (const Instruction *I = dyn_cast(V)) { @@ -680,25 +699,25 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) { void Verifier::visitGlobalVariable(const GlobalVariable &GV) { if (GV.hasInitializer()) { - Assert(GV.getInitializer()->getType() == GV.getValueType(), - "Global variable initializer type does not match global " - "variable type!", - &GV); + Check(GV.getInitializer()->getType() == GV.getValueType(), + "Global variable initializer type does not match global " + "variable type!", + &GV); // If the global has common linkage, it must have a zero initializer and // cannot be constant. 
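// (Editorial example: "@g = common global i32 0" is well-formed, while
// "@g = common constant i32 7" trips both of the first two checks below.)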
if (GV.hasCommonLinkage()) { - Assert(GV.getInitializer()->isNullValue(), - "'common' global must have a zero initializer!", &GV); - Assert(!GV.isConstant(), "'common' global may not be marked constant!", - &GV); - Assert(!GV.hasComdat(), "'common' global may not be in a Comdat!", &GV); + Check(GV.getInitializer()->isNullValue(), + "'common' global must have a zero initializer!", &GV); + Check(!GV.isConstant(), "'common' global may not be marked constant!", + &GV); + Check(!GV.hasComdat(), "'common' global may not be in a Comdat!", &GV); } } if (GV.hasName() && (GV.getName() == "llvm.global_ctors" || GV.getName() == "llvm.global_dtors")) { - Assert(!GV.hasInitializer() || GV.hasAppendingLinkage(), - "invalid linkage for intrinsic global variable", &GV); + Check(!GV.hasInitializer() || GV.hasAppendingLinkage(), + "invalid linkage for intrinsic global variable", &GV); // Don't worry about emitting an error for it not being an array, // visitGlobalValue will complain on appending non-array. if (ArrayType *ATy = dyn_cast(GV.getValueType())) { @@ -706,42 +725,41 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { PointerType *FuncPtrTy = FunctionType::get(Type::getVoidTy(Context), false)-> getPointerTo(DL.getProgramAddressSpace()); - Assert(STy && - (STy->getNumElements() == 2 || STy->getNumElements() == 3) && - STy->getTypeAtIndex(0u)->isIntegerTy(32) && - STy->getTypeAtIndex(1) == FuncPtrTy, - "wrong type for intrinsic global variable", &GV); - Assert(STy->getNumElements() == 3, - "the third field of the element type is mandatory, " - "specify i8* null to migrate from the obsoleted 2-field form"); + Check(STy && (STy->getNumElements() == 2 || STy->getNumElements() == 3) && + STy->getTypeAtIndex(0u)->isIntegerTy(32) && + STy->getTypeAtIndex(1) == FuncPtrTy, + "wrong type for intrinsic global variable", &GV); + Check(STy->getNumElements() == 3, + "the third field of the element type is mandatory, " + "specify i8* null to migrate from the obsoleted 2-field form"); Type *ETy = STy->getTypeAtIndex(2); Type *Int8Ty = Type::getInt8Ty(ETy->getContext()); - Assert(ETy->isPointerTy() && - cast(ETy)->isOpaqueOrPointeeTypeMatches(Int8Ty), - "wrong type for intrinsic global variable", &GV); + Check(ETy->isPointerTy() && + cast(ETy)->isOpaqueOrPointeeTypeMatches(Int8Ty), + "wrong type for intrinsic global variable", &GV); } } if (GV.hasName() && (GV.getName() == "llvm.used" || GV.getName() == "llvm.compiler.used")) { - Assert(!GV.hasInitializer() || GV.hasAppendingLinkage(), - "invalid linkage for intrinsic global variable", &GV); + Check(!GV.hasInitializer() || GV.hasAppendingLinkage(), + "invalid linkage for intrinsic global variable", &GV); Type *GVType = GV.getValueType(); if (ArrayType *ATy = dyn_cast(GVType)) { PointerType *PTy = dyn_cast(ATy->getElementType()); - Assert(PTy, "wrong type for intrinsic global variable", &GV); + Check(PTy, "wrong type for intrinsic global variable", &GV); if (GV.hasInitializer()) { const Constant *Init = GV.getInitializer(); const ConstantArray *InitArray = dyn_cast(Init); - Assert(InitArray, "wrong initalizer for intrinsic global variable", - Init); + Check(InitArray, "wrong initializer for intrinsic global variable", + Init); for (Value *Op : InitArray->operands()) { Value *V = Op->stripPointerCasts(); - Assert(isa(V) || isa(V) || - isa(V), - Twine("invalid ") + GV.getName() + " member", V); - Assert(V->hasName(), - Twine("members of ") + GV.getName() + " must be named", V); + Check(isa(V) || isa(V) || + isa(V), + Twine("invalid ") + GV.getName() + "
member", V); + Check(V->hasName(), + Twine("members of ") + GV.getName() + " must be named", V); } } } @@ -754,20 +772,20 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { if (auto *GVE = dyn_cast(MD)) visitDIGlobalVariableExpression(*GVE); else - AssertDI(false, "!dbg attachment of global variable must be a " - "DIGlobalVariableExpression"); + CheckDI(false, "!dbg attachment of global variable must be a " + "DIGlobalVariableExpression"); } // Scalable vectors cannot be global variables, since we don't know // the runtime size. If the global is an array containing scalable vectors, // that will be caught by the isValidElementType methods in StructType or // ArrayType instead. - Assert(!isa(GV.getValueType()), - "Globals cannot contain scalable vectors", &GV); + Check(!isa(GV.getValueType()), + "Globals cannot contain scalable vectors", &GV); if (auto *STy = dyn_cast(GV.getValueType())) - Assert(!STy->containsScalableVectorType(), - "Globals cannot contain scalable vectors", &GV); + Check(!STy->containsScalableVectorType(), + "Globals cannot contain scalable vectors", &GV); if (!GV.hasInitializer()) { visitGlobalValue(GV); @@ -789,14 +807,14 @@ void Verifier::visitAliaseeSubExpr(const GlobalAlias &GA, const Constant &C) { void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl &Visited, const GlobalAlias &GA, const Constant &C) { if (const auto *GV = dyn_cast(&C)) { - Assert(!GV->isDeclarationForLinker(), "Alias must point to a definition", - &GA); + Check(!GV->isDeclarationForLinker(), "Alias must point to a definition", + &GA); if (const auto *GA2 = dyn_cast(GV)) { - Assert(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA); + Check(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA); - Assert(!GA2->isInterposable(), "Alias cannot point to an interposable alias", - &GA); + Check(!GA2->isInterposable(), + "Alias cannot point to an interposable alias", &GA); } else { // Only continue verifying subexpressions of GlobalAliases. // Do not recurse into global initializers. @@ -817,17 +835,17 @@ void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl &Visited, } void Verifier::visitGlobalAlias(const GlobalAlias &GA) { - Assert(GlobalAlias::isValidLinkage(GA.getLinkage()), - "Alias should have private, internal, linkonce, weak, linkonce_odr, " - "weak_odr, or external linkage!", - &GA); + Check(GlobalAlias::isValidLinkage(GA.getLinkage()), + "Alias should have private, internal, linkonce, weak, linkonce_odr, " + "weak_odr, or external linkage!", + &GA); const Constant *Aliasee = GA.getAliasee(); - Assert(Aliasee, "Aliasee cannot be NULL!", &GA); - Assert(GA.getType() == Aliasee->getType(), - "Alias and aliasee types should match!", &GA); + Check(Aliasee, "Aliasee cannot be NULL!", &GA); + Check(GA.getType() == Aliasee->getType(), + "Alias and aliasee types should match!", &GA); - Assert(isa(Aliasee) || isa(Aliasee), - "Aliasee should be either GlobalValue or ConstantExpr", &GA); + Check(isa(Aliasee) || isa(Aliasee), + "Aliasee should be either GlobalValue or ConstantExpr", &GA); visitAliaseeSubExpr(GA, *Aliasee); @@ -835,30 +853,35 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) { } void Verifier::visitGlobalIFunc(const GlobalIFunc &GI) { + Check(GlobalIFunc::isValidLinkage(GI.getLinkage()), + "IFunc should have private, internal, linkonce, weak, linkonce_odr, " + "weak_odr, or external linkage!", + &GI); // Pierce through ConstantExprs and GlobalAliases and check that the resolver - // has a Function + // is a Function definition. 
const Function *Resolver = GI.getResolverFunction(); - Assert(Resolver, "IFunc must have a Function resolver", &GI); + Check(Resolver, "IFunc must have a Function resolver", &GI); + Check(!Resolver->isDeclarationForLinker(), + "IFunc resolver must be a definition", &GI); // Check that the immediate resolver operand (prior to any bitcasts) has the - // correct type + // correct type. const Type *ResolverTy = GI.getResolver()->getType(); const Type *ResolverFuncTy = GlobalIFunc::getResolverFunctionType(GI.getValueType()); - Assert(ResolverTy == ResolverFuncTy->getPointerTo(), - "IFunc resolver has incorrect type", &GI); + Check(ResolverTy == ResolverFuncTy->getPointerTo(), + "IFunc resolver has incorrect type", &GI); } void Verifier::visitNamedMDNode(const NamedMDNode &NMD) { // There used to be various other llvm.dbg.* nodes, but we don't support // upgrading them and we want to reserve the namespace for future uses. if (NMD.getName().startswith("llvm.dbg.")) - AssertDI(NMD.getName() == "llvm.dbg.cu", - "unrecognized named metadata node in the llvm.dbg namespace", - &NMD); + CheckDI(NMD.getName() == "llvm.dbg.cu", + "unrecognized named metadata node in the llvm.dbg namespace", &NMD); for (const MDNode *MD : NMD.operands()) { if (NMD.getName() == "llvm.dbg.cu") - AssertDI(MD && isa(MD), "invalid compile unit", &NMD, MD); + CheckDI(MD && isa(MD), "invalid compile unit", &NMD, MD); if (!MD) continue; @@ -873,8 +896,8 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { if (!MDNodes.insert(&MD).second) return; - Assert(&MD.getContext() == &Context, - "MDNode context does not match Module context!", &MD); + Check(&MD.getContext() == &Context, + "MDNode context does not match Module context!", &MD); switch (MD.getMetadataID()) { default: @@ -891,10 +914,10 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { for (const Metadata *Op : MD.operands()) { if (!Op) continue; - Assert(!isa(Op), "Invalid operand for global metadata!", - &MD, Op); - AssertDI(!isa(Op) || AllowLocs == AreDebugLocsAllowed::Yes, - "DILocation not allowed within this metadata node", &MD, Op); + Check(!isa(Op), "Invalid operand for global metadata!", + &MD, Op); + CheckDI(!isa(Op) || AllowLocs == AreDebugLocsAllowed::Yes, + "DILocation not allowed within this metadata node", &MD, Op); if (auto *N = dyn_cast(Op)) { visitMDNode(*N, AllowLocs); continue; @@ -906,26 +929,26 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { } // Check these last, so we diagnose problems in operands first. 
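// (Editor's note: a temporary MDNode is a placeholder for a forward
// reference created while IR is being built; by the time the verifier
// runs, all of them should have been replaced and every node resolved.)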
- Assert(!MD.isTemporary(), "Expected no forward declarations!", &MD); - Assert(MD.isResolved(), "All nodes should be resolved!", &MD); + Check(!MD.isTemporary(), "Expected no forward declarations!", &MD); + Check(MD.isResolved(), "All nodes should be resolved!", &MD); } void Verifier::visitValueAsMetadata(const ValueAsMetadata &MD, Function *F) { - Assert(MD.getValue(), "Expected valid value", &MD); - Assert(!MD.getValue()->getType()->isMetadataTy(), - "Unexpected metadata round-trip through values", &MD, MD.getValue()); + Check(MD.getValue(), "Expected valid value", &MD); + Check(!MD.getValue()->getType()->isMetadataTy(), + "Unexpected metadata round-trip through values", &MD, MD.getValue()); auto *L = dyn_cast(&MD); if (!L) return; - Assert(F, "function-local metadata used outside a function", L); + Check(F, "function-local metadata used outside a function", L); // If this was an instruction, bb, or argument, verify that it is in the // function that we expect. Function *ActualF = nullptr; if (Instruction *I = dyn_cast(L->getValue())) { - Assert(I->getParent(), "function-local metadata not in basic block", L, I); + Check(I->getParent(), "function-local metadata not in basic block", L, I); ActualF = I->getParent()->getParent(); } else if (BasicBlock *BB = dyn_cast(L->getValue())) ActualF = BB->getParent(); @@ -933,7 +956,7 @@ void Verifier::visitValueAsMetadata(const ValueAsMetadata &MD, Function *F) { ActualF = A->getParent(); assert(ActualF && "Unimplemented function local metadata case!"); - Assert(ActualF == F, "function-local metadata used in wrong function", L); + Check(ActualF == F, "function-local metadata used in wrong function", L); } void Verifier::visitMetadataAsValue(const MetadataAsValue &MDV, Function *F) { @@ -957,125 +980,125 @@ static bool isScope(const Metadata *MD) { return !MD || isa(MD); } static bool isDINode(const Metadata *MD) { return !MD || isa(MD); } void Verifier::visitDILocation(const DILocation &N) { - AssertDI(N.getRawScope() && isa(N.getRawScope()), - "location requires a valid scope", &N, N.getRawScope()); + CheckDI(N.getRawScope() && isa(N.getRawScope()), + "location requires a valid scope", &N, N.getRawScope()); if (auto *IA = N.getRawInlinedAt()) - AssertDI(isa(IA), "inlined-at should be a location", &N, IA); + CheckDI(isa(IA), "inlined-at should be a location", &N, IA); if (auto *SP = dyn_cast(N.getRawScope())) - AssertDI(SP->isDefinition(), "scope points into the type hierarchy", &N); + CheckDI(SP->isDefinition(), "scope points into the type hierarchy", &N); } void Verifier::visitGenericDINode(const GenericDINode &N) { - AssertDI(N.getTag(), "invalid tag", &N); + CheckDI(N.getTag(), "invalid tag", &N); } void Verifier::visitDIScope(const DIScope &N) { if (auto *F = N.getRawFile()) - AssertDI(isa(F), "invalid file", &N, F); + CheckDI(isa(F), "invalid file", &N, F); } void Verifier::visitDISubrange(const DISubrange &N) { - AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N); + CheckDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N); bool HasAssumedSizedArraySupport = dwarf::isFortran(CurrentSourceLang); - AssertDI(HasAssumedSizedArraySupport || N.getRawCountNode() || - N.getRawUpperBound(), - "Subrange must contain count or upperBound", &N); - AssertDI(!N.getRawCountNode() || !N.getRawUpperBound(), - "Subrange can have any one of count or upperBound", &N); + CheckDI(HasAssumedSizedArraySupport || N.getRawCountNode() || + N.getRawUpperBound(), + "Subrange must contain count or upperBound", &N); + 
   auto *CBound = N.getRawCountNode();
-  AssertDI(!CBound || isa<ConstantAsMetadata>(CBound) ||
-               isa<DIVariable>(CBound) || isa<DIExpression>(CBound),
-           "Count must be signed constant or DIVariable or DIExpression", &N);
+  CheckDI(!CBound || isa<ConstantAsMetadata>(CBound) ||
+              isa<DIVariable>(CBound) || isa<DIExpression>(CBound),
+          "Count must be signed constant or DIVariable or DIExpression", &N);
   auto Count = N.getCount();
-  AssertDI(!Count || !Count.is<ConstantInt *>() ||
-               Count.get<ConstantInt *>()->getSExtValue() >= -1,
-           "invalid subrange count", &N);
+  CheckDI(!Count || !Count.is<ConstantInt *>() ||
+              Count.get<ConstantInt *>()->getSExtValue() >= -1,
+          "invalid subrange count", &N);
   auto *LBound = N.getRawLowerBound();
-  AssertDI(!LBound || isa<ConstantAsMetadata>(LBound) ||
-               isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
-           "LowerBound must be signed constant or DIVariable or DIExpression",
-           &N);
+  CheckDI(!LBound || isa<ConstantAsMetadata>(LBound) ||
+              isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
+          "LowerBound must be signed constant or DIVariable or DIExpression",
+          &N);
   auto *UBound = N.getRawUpperBound();
-  AssertDI(!UBound || isa<ConstantAsMetadata>(UBound) ||
-               isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
-           "UpperBound must be signed constant or DIVariable or DIExpression",
-           &N);
+  CheckDI(!UBound || isa<ConstantAsMetadata>(UBound) ||
+              isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
+          "UpperBound must be signed constant or DIVariable or DIExpression",
+          &N);
   auto *Stride = N.getRawStride();
-  AssertDI(!Stride || isa<ConstantAsMetadata>(Stride) ||
-               isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
-           "Stride must be signed constant or DIVariable or DIExpression", &N);
+  CheckDI(!Stride || isa<ConstantAsMetadata>(Stride) ||
+              isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
+          "Stride must be signed constant or DIVariable or DIExpression", &N);
 }
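An illustrative sketch of the count/upperBound rule above (metadata numbering
hypothetical): a subrange may carry one of the two, never both.

    !5 = !DISubrange(count: 16, lowerBound: 0)      ; accepted
    !6 = !DISubrange(lowerBound: 1, upperBound: 8)  ; accepted
    ; !DISubrange(count: 16, upperBound: 8) fails "Subrange can have any
    ; one of count or upperBound".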
 
 void Verifier::visitDIGenericSubrange(const DIGenericSubrange &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_generic_subrange, "invalid tag", &N);
-  AssertDI(N.getRawCountNode() || N.getRawUpperBound(),
-           "GenericSubrange must contain count or upperBound", &N);
-  AssertDI(!N.getRawCountNode() || !N.getRawUpperBound(),
-           "GenericSubrange can have any one of count or upperBound", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_generic_subrange, "invalid tag", &N);
+  CheckDI(N.getRawCountNode() || N.getRawUpperBound(),
+          "GenericSubrange must contain count or upperBound", &N);
+  CheckDI(!N.getRawCountNode() || !N.getRawUpperBound(),
+          "GenericSubrange can have any one of count or upperBound", &N);
   auto *CBound = N.getRawCountNode();
-  AssertDI(!CBound || isa<DIVariable>(CBound) || isa<DIExpression>(CBound),
-           "Count must be signed constant or DIVariable or DIExpression", &N);
+  CheckDI(!CBound || isa<DIVariable>(CBound) || isa<DIExpression>(CBound),
+          "Count must be signed constant or DIVariable or DIExpression", &N);
   auto *LBound = N.getRawLowerBound();
-  AssertDI(LBound, "GenericSubrange must contain lowerBound", &N);
-  AssertDI(isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
-           "LowerBound must be signed constant or DIVariable or DIExpression",
-           &N);
+  CheckDI(LBound, "GenericSubrange must contain lowerBound", &N);
+  CheckDI(isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
+          "LowerBound must be signed constant or DIVariable or DIExpression",
+          &N);
   auto *UBound = N.getRawUpperBound();
-  AssertDI(!UBound || isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
-           "UpperBound must be signed constant or DIVariable or DIExpression",
-           &N);
+  CheckDI(!UBound || isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
+          "UpperBound must be signed constant or DIVariable or DIExpression",
+          &N);
   auto *Stride = N.getRawStride();
-  AssertDI(Stride, "GenericSubrange must contain stride", &N);
-  AssertDI(isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
-           "Stride must be signed constant or DIVariable or DIExpression", &N);
+  CheckDI(Stride, "GenericSubrange must contain stride", &N);
+  CheckDI(isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
+          "Stride must be signed constant or DIVariable or DIExpression", &N);
 }
 
 void Verifier::visitDIEnumerator(const DIEnumerator &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_enumerator, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_enumerator, "invalid tag", &N);
 }
 
 void Verifier::visitDIBasicType(const DIBasicType &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_base_type ||
-               N.getTag() == dwarf::DW_TAG_unspecified_type ||
-               N.getTag() == dwarf::DW_TAG_string_type,
-           "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_base_type ||
+              N.getTag() == dwarf::DW_TAG_unspecified_type ||
+              N.getTag() == dwarf::DW_TAG_string_type,
+          "invalid tag", &N);
 }
 
 void Verifier::visitDIStringType(const DIStringType &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_string_type, "invalid tag", &N);
-  AssertDI(!(N.isBigEndian() && N.isLittleEndian()) ,
-           "has conflicting flags", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_string_type, "invalid tag", &N);
+  CheckDI(!(N.isBigEndian() && N.isLittleEndian()), "has conflicting flags",
+          &N);
 }
 
 void Verifier::visitDIDerivedType(const DIDerivedType &N) {
   // Common scope checks.
   visitDIScope(N);
 
-  AssertDI(N.getTag() == dwarf::DW_TAG_typedef ||
-               N.getTag() == dwarf::DW_TAG_pointer_type ||
-               N.getTag() == dwarf::DW_TAG_ptr_to_member_type ||
-               N.getTag() == dwarf::DW_TAG_reference_type ||
-               N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
-               N.getTag() == dwarf::DW_TAG_const_type ||
-               N.getTag() == dwarf::DW_TAG_immutable_type ||
-               N.getTag() == dwarf::DW_TAG_volatile_type ||
-               N.getTag() == dwarf::DW_TAG_restrict_type ||
-               N.getTag() == dwarf::DW_TAG_atomic_type ||
-               N.getTag() == dwarf::DW_TAG_member ||
-               N.getTag() == dwarf::DW_TAG_inheritance ||
-               N.getTag() == dwarf::DW_TAG_friend ||
-               N.getTag() == dwarf::DW_TAG_set_type,
-           "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_typedef ||
+              N.getTag() == dwarf::DW_TAG_pointer_type ||
+              N.getTag() == dwarf::DW_TAG_ptr_to_member_type ||
+              N.getTag() == dwarf::DW_TAG_reference_type ||
+              N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
+              N.getTag() == dwarf::DW_TAG_const_type ||
+              N.getTag() == dwarf::DW_TAG_immutable_type ||
+              N.getTag() == dwarf::DW_TAG_volatile_type ||
+              N.getTag() == dwarf::DW_TAG_restrict_type ||
+              N.getTag() == dwarf::DW_TAG_atomic_type ||
+              N.getTag() == dwarf::DW_TAG_member ||
+              N.getTag() == dwarf::DW_TAG_inheritance ||
+              N.getTag() == dwarf::DW_TAG_friend ||
+              N.getTag() == dwarf::DW_TAG_set_type,
+          "invalid tag", &N);
   if (N.getTag() == dwarf::DW_TAG_ptr_to_member_type) {
-    AssertDI(isType(N.getRawExtraData()), "invalid pointer to member type", &N,
-             N.getRawExtraData());
+    CheckDI(isType(N.getRawExtraData()), "invalid pointer to member type", &N,
+            N.getRawExtraData());
   }
 
   if (N.getTag() == dwarf::DW_TAG_set_type) {
     if (auto *T = N.getRawBaseType()) {
       auto *Enum = dyn_cast_or_null<DICompositeType>(T);
       auto *Basic = dyn_cast_or_null<DIBasicType>(T);
-      AssertDI(
+      CheckDI(
           (Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type) ||
               (Basic && (Basic->getEncoding() == dwarf::DW_ATE_unsigned ||
                          Basic->getEncoding() == dwarf::DW_ATE_signed ||
@@ -1086,16 +1109,16 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
     }
   }
 
-  AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
-  AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
-           N.getRawBaseType());
+  CheckDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+  CheckDI(isType(N.getRawBaseType()), "invalid base type", &N,
+          N.getRawBaseType());
 
   if (N.getDWARFAddressSpace()) {
-    AssertDI(N.getTag() == dwarf::DW_TAG_pointer_type ||
-                 N.getTag() == dwarf::DW_TAG_reference_type ||
-                 N.getTag() == dwarf::DW_TAG_rvalue_reference_type,
-             "DWARF address space only applies to pointer or reference types",
-             &N);
+    CheckDI(N.getTag() == dwarf::DW_TAG_pointer_type ||
+                N.getTag() == dwarf::DW_TAG_reference_type ||
+                N.getTag() == dwarf::DW_TAG_rvalue_reference_type,
+            "DWARF address space only applies to pointer or reference types",
+            &N);
   }
 }
 
@@ -1109,10 +1132,10 @@ static bool hasConflictingReferenceFlags(unsigned Flags) {
 void Verifier::visitTemplateParams(const MDNode &N, const Metadata &RawParams) {
   auto *Params = dyn_cast<MDTuple>(&RawParams);
-  AssertDI(Params, "invalid template params", &N, &RawParams);
+  CheckDI(Params, "invalid template params", &N, &RawParams);
   for (Metadata *Op : Params->operands()) {
-    AssertDI(Op && isa<DITemplateParameter>(Op), "invalid template parameter",
-             &N, Params, Op);
+    CheckDI(Op && isa<DITemplateParameter>(Op), "invalid template parameter",
+            &N, Params, Op);
   }
 }
 
@@ -1120,83 +1143,83 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
   // Common scope checks.
   visitDIScope(N);
 
-  AssertDI(N.getTag() == dwarf::DW_TAG_array_type ||
-               N.getTag() == dwarf::DW_TAG_structure_type ||
-               N.getTag() == dwarf::DW_TAG_union_type ||
-               N.getTag() == dwarf::DW_TAG_enumeration_type ||
-               N.getTag() == dwarf::DW_TAG_class_type ||
-               N.getTag() == dwarf::DW_TAG_variant_part ||
-               N.getTag() == dwarf::DW_TAG_namelist,
-           "invalid tag", &N);
-
-  AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
-  AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
-           N.getRawBaseType());
-
-  AssertDI(!N.getRawElements() || isa<MDTuple>(N.getRawElements()),
-           "invalid composite elements", &N, N.getRawElements());
-  AssertDI(isType(N.getRawVTableHolder()), "invalid vtable holder", &N,
-           N.getRawVTableHolder());
-  AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
-           "invalid reference flags", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_array_type ||
+              N.getTag() == dwarf::DW_TAG_structure_type ||
+              N.getTag() == dwarf::DW_TAG_union_type ||
+              N.getTag() == dwarf::DW_TAG_enumeration_type ||
+              N.getTag() == dwarf::DW_TAG_class_type ||
+              N.getTag() == dwarf::DW_TAG_variant_part ||
+              N.getTag() == dwarf::DW_TAG_namelist,
+          "invalid tag", &N);
+
+  CheckDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+  CheckDI(isType(N.getRawBaseType()), "invalid base type", &N,
+          N.getRawBaseType());
+
+  CheckDI(!N.getRawElements() || isa<MDTuple>(N.getRawElements()),
+          "invalid composite elements", &N, N.getRawElements());
+  CheckDI(isType(N.getRawVTableHolder()), "invalid vtable holder", &N,
+          N.getRawVTableHolder());
+  CheckDI(!hasConflictingReferenceFlags(N.getFlags()),
+          "invalid reference flags", &N);
   unsigned DIBlockByRefStruct = 1 << 4;
-  AssertDI((N.getFlags() & DIBlockByRefStruct) == 0,
-           "DIBlockByRefStruct on DICompositeType is no longer supported", &N);
+  CheckDI((N.getFlags() & DIBlockByRefStruct) == 0,
+          "DIBlockByRefStruct on DICompositeType is no longer supported", &N);
 
   if (N.isVector()) {
     const DINodeArray Elements = N.getElements();
-    AssertDI(Elements.size() == 1 &&
-                 Elements[0]->getTag() == dwarf::DW_TAG_subrange_type,
-             "invalid vector, expected one element of type subrange", &N);
+    CheckDI(Elements.size() == 1 &&
+                Elements[0]->getTag() == dwarf::DW_TAG_subrange_type,
+            "invalid vector, expected one element of type subrange", &N);
   }
 
   if (auto *Params = N.getRawTemplateParams())
     visitTemplateParams(N, *Params);
 
   if (auto *D = N.getRawDiscriminator()) {
-    AssertDI(isa<DIDerivedType>(D) && N.getTag() == dwarf::DW_TAG_variant_part,
-             "discriminator can only appear on variant part");
+    CheckDI(isa<DIDerivedType>(D) && N.getTag() == dwarf::DW_TAG_variant_part,
+            "discriminator can only appear on variant part");
   }
 
   if (N.getRawDataLocation()) {
-    AssertDI(N.getTag() == dwarf::DW_TAG_array_type,
-             "dataLocation can only appear in array type");
+    CheckDI(N.getTag() == dwarf::DW_TAG_array_type,
+            "dataLocation can only appear in array type");
   }
 
   if (N.getRawAssociated()) {
-    AssertDI(N.getTag() == dwarf::DW_TAG_array_type,
-             "associated can only appear in array type");
+    CheckDI(N.getTag() == dwarf::DW_TAG_array_type,
+            "associated can only appear in array type");
   }
 
   if (N.getRawAllocated()) {
-    AssertDI(N.getTag() == dwarf::DW_TAG_array_type,
-             "allocated can only appear in array type");
+    CheckDI(N.getTag() == dwarf::DW_TAG_array_type,
+            "allocated can only appear in array type");
   }
 
   if (N.getRawRank()) {
-    AssertDI(N.getTag() == dwarf::DW_TAG_array_type,
-             "rank can only appear in array type");
+    CheckDI(N.getTag() == dwarf::DW_TAG_array_type,
+            "rank can only appear in array type");
   }
 }
 
 void Verifier::visitDISubroutineType(const DISubroutineType &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_subroutine_type, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_subroutine_type, "invalid tag", &N);
   if (auto *Types = N.getRawTypeArray()) {
-    AssertDI(isa<MDTuple>(Types), "invalid composite elements", &N, Types);
+    CheckDI(isa<MDTuple>(Types), "invalid composite elements", &N, Types);
     for (Metadata *Ty : N.getTypeArray()->operands()) {
-      AssertDI(isType(Ty), "invalid subroutine type ref", &N, Types, Ty);
+      CheckDI(isType(Ty), "invalid subroutine type ref", &N, Types, Ty);
     }
   }
-  AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
-           "invalid reference flags", &N);
+  CheckDI(!hasConflictingReferenceFlags(N.getFlags()),
+          "invalid reference flags", &N);
 }
 
 void Verifier::visitDIFile(const DIFile &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
   Optional<DIFile::ChecksumInfo<StringRef>> Checksum = N.getChecksum();
   if (Checksum) {
-    AssertDI(Checksum->Kind <= DIFile::ChecksumKind::CSK_Last,
-             "invalid checksum kind", &N);
+    CheckDI(Checksum->Kind <= DIFile::ChecksumKind::CSK_Last,
+            "invalid checksum kind", &N);
     size_t Size;
     switch (Checksum->Kind) {
     case DIFile::CSK_MD5:
@@ -1209,137 +1232,137 @@ void Verifier::visitDIFile(const DIFile &N) {
       Size = 64;
       break;
     }
-    AssertDI(Checksum->Value.size() == Size, "invalid checksum length", &N);
-    AssertDI(Checksum->Value.find_if_not(llvm::isHexDigit) == StringRef::npos,
-             "invalid checksum", &N);
+    CheckDI(Checksum->Value.size() == Size, "invalid checksum length", &N);
+    CheckDI(Checksum->Value.find_if_not(llvm::isHexDigit) == StringRef::npos,
+            "invalid checksum", &N);
   }
 }
 
 void Verifier::visitDICompileUnit(const DICompileUnit &N) {
-  AssertDI(N.isDistinct(), "compile units must be distinct", &N);
-  AssertDI(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N);
+  CheckDI(N.isDistinct(), "compile units must be distinct", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N);
 
   // Don't bother verifying the compilation directory or producer string
   // as those could be empty.
-  AssertDI(N.getRawFile() && isa<DIFile>(N.getRawFile()), "invalid file", &N,
-           N.getRawFile());
-  AssertDI(!N.getFile()->getFilename().empty(), "invalid filename", &N,
-           N.getFile());
+  CheckDI(N.getRawFile() && isa<DIFile>(N.getRawFile()), "invalid file", &N,
+          N.getRawFile());
+  CheckDI(!N.getFile()->getFilename().empty(), "invalid filename", &N,
+          N.getFile());
   CurrentSourceLang = (dwarf::SourceLanguage)N.getSourceLanguage();
 
   verifySourceDebugInfo(N, *N.getFile());
 
-  AssertDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind),
-           "invalid emission kind", &N);
+  CheckDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind),
+          "invalid emission kind", &N);
 
   if (auto *Array = N.getRawEnumTypes()) {
-    AssertDI(isa<MDTuple>(Array), "invalid enum list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid enum list", &N, Array);
     for (Metadata *Op : N.getEnumTypes()->operands()) {
       auto *Enum = dyn_cast_or_null<DICompositeType>(Op);
-      AssertDI(Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type,
-               "invalid enum type", &N, N.getEnumTypes(), Op);
+      CheckDI(Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type,
+              "invalid enum type", &N, N.getEnumTypes(), Op);
     }
   }
   if (auto *Array = N.getRawRetainedTypes()) {
-    AssertDI(isa<MDTuple>(Array), "invalid retained type list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid retained type list", &N, Array);
     for (Metadata *Op : N.getRetainedTypes()->operands()) {
-      AssertDI(Op && (isa<DIType>(Op) ||
-                      (isa<DISubprogram>(Op) &&
-                       !cast<DISubprogram>(Op)->isDefinition())),
-               "invalid retained type", &N, Op);
+      CheckDI(
+          Op && (isa<DIType>(Op) || (isa<DISubprogram>(Op) &&
+                                     !cast<DISubprogram>(Op)->isDefinition())),
+          "invalid retained type", &N, Op);
     }
   }
   if (auto *Array = N.getRawGlobalVariables()) {
-    AssertDI(isa<MDTuple>(Array), "invalid global variable list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid global variable list", &N, Array);
     for (Metadata *Op : N.getGlobalVariables()->operands()) {
-      AssertDI(Op && (isa<DIGlobalVariableExpression>(Op)),
-               "invalid global variable ref", &N, Op);
+      CheckDI(Op && (isa<DIGlobalVariableExpression>(Op)),
+              "invalid global variable ref", &N, Op);
    }
   }
   if (auto *Array = N.getRawImportedEntities()) {
-    AssertDI(isa<MDTuple>(Array), "invalid imported entity list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid imported entity list", &N, Array);
     for (Metadata *Op : N.getImportedEntities()->operands()) {
-      AssertDI(Op && isa<DIImportedEntity>(Op), "invalid imported entity ref",
-               &N, Op);
+      CheckDI(Op && isa<DIImportedEntity>(Op), "invalid imported entity ref",
+              &N, Op);
    }
   }
   if (auto *Array = N.getRawMacros()) {
-    AssertDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
     for (Metadata *Op : N.getMacros()->operands()) {
-      AssertDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
+      CheckDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
     }
   }
   CUVisited.insert(&N);
 }
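A minimal sketch of compile-unit metadata these checks accept (node numbering
hypothetical); note the distinct node and the non-empty filename.

    !llvm.dbg.cu = !{!0}
    !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1,
                                 emissionKind: FullDebug)
    !1 = !DIFile(filename: "t.c", directory: "/tmp")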
 
 void Verifier::visitDISubprogram(const DISubprogram &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_subprogram, "invalid tag", &N);
-  AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+  CheckDI(N.getTag() == dwarf::DW_TAG_subprogram, "invalid tag", &N);
+  CheckDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
   if (auto *F = N.getRawFile())
-    AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+    CheckDI(isa<DIFile>(F), "invalid file", &N, F);
   else
-    AssertDI(N.getLine() == 0, "line specified with no file", &N, N.getLine());
+    CheckDI(N.getLine() == 0, "line specified with no file", &N, N.getLine());
   if (auto *T = N.getRawType())
-    AssertDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
-  AssertDI(isType(N.getRawContainingType()), "invalid containing type", &N,
-           N.getRawContainingType());
+    CheckDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
+  CheckDI(isType(N.getRawContainingType()), "invalid containing type", &N,
+          N.getRawContainingType());
   if (auto *Params = N.getRawTemplateParams())
     visitTemplateParams(N, *Params);
   if (auto *S = N.getRawDeclaration())
-    AssertDI(isa<DISubprogram>(S) && !cast<DISubprogram>(S)->isDefinition(),
-             "invalid subprogram declaration", &N, S);
+    CheckDI(isa<DISubprogram>(S) && !cast<DISubprogram>(S)->isDefinition(),
+            "invalid subprogram declaration", &N, S);
   if (auto *RawNode = N.getRawRetainedNodes()) {
     auto *Node = dyn_cast<MDTuple>(RawNode);
-    AssertDI(Node, "invalid retained nodes list", &N, RawNode);
+    CheckDI(Node, "invalid retained nodes list", &N, RawNode);
     for (Metadata *Op : Node->operands()) {
-      AssertDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op)),
-               "invalid retained nodes, expected DILocalVariable or DILabel",
-               &N, Node, Op);
+      CheckDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op)),
+              "invalid retained nodes, expected DILocalVariable or DILabel", &N,
+              Node, Op);
    }
   }
-  AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
-           "invalid reference flags", &N);
+  CheckDI(!hasConflictingReferenceFlags(N.getFlags()),
+          "invalid reference flags", &N);
 
   auto *Unit = N.getRawUnit();
   if (N.isDefinition()) {
     // Subprogram definitions (not part of the type hierarchy).
-    AssertDI(N.isDistinct(), "subprogram definitions must be distinct", &N);
-    AssertDI(Unit, "subprogram definitions must have a compile unit", &N);
-    AssertDI(isa<DICompileUnit>(Unit), "invalid unit type", &N, Unit);
+    CheckDI(N.isDistinct(), "subprogram definitions must be distinct", &N);
+    CheckDI(Unit, "subprogram definitions must have a compile unit", &N);
+    CheckDI(isa<DICompileUnit>(Unit), "invalid unit type", &N, Unit);
     if (N.getFile())
       verifySourceDebugInfo(*N.getUnit(), *N.getFile());
   } else {
     // Subprogram declarations (part of the type hierarchy).
-    AssertDI(!Unit, "subprogram declarations must not have a compile unit", &N);
+    CheckDI(!Unit, "subprogram declarations must not have a compile unit", &N);
   }
 
   if (auto *RawThrownTypes = N.getRawThrownTypes()) {
     auto *ThrownTypes = dyn_cast<MDTuple>(RawThrownTypes);
-    AssertDI(ThrownTypes, "invalid thrown types list", &N, RawThrownTypes);
+    CheckDI(ThrownTypes, "invalid thrown types list", &N, RawThrownTypes);
     for (Metadata *Op : ThrownTypes->operands())
-      AssertDI(Op && isa<DIType>(Op), "invalid thrown type", &N, ThrownTypes,
-               Op);
+      CheckDI(Op && isa<DIType>(Op), "invalid thrown type", &N, ThrownTypes,
+              Op);
   }
 
   if (N.areAllCallsDescribed())
-    AssertDI(N.isDefinition(),
-             "DIFlagAllCallsDescribed must be attached to a definition");
+    CheckDI(N.isDefinition(),
+            "DIFlagAllCallsDescribed must be attached to a definition");
 }
 
 void Verifier::visitDILexicalBlockBase(const DILexicalBlockBase &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
-  AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
-           "invalid local scope", &N, N.getRawScope());
+  CheckDI(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
+  CheckDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+          "invalid local scope", &N, N.getRawScope());
   if (auto *SP = dyn_cast<DISubprogram>(N.getRawScope()))
-    AssertDI(SP->isDefinition(), "scope points into the type hierarchy", &N);
+    CheckDI(SP->isDefinition(), "scope points into the type hierarchy", &N);
 }
 
 void Verifier::visitDILexicalBlock(const DILexicalBlock &N) {
   visitDILexicalBlockBase(N);
 
-  AssertDI(N.getLine() || !N.getColumn(),
-           "cannot have column info without line info", &N);
+  CheckDI(N.getLine() || !N.getColumn(),
+          "cannot have column info without line info", &N);
 }
 
 void Verifier::visitDILexicalBlockFile(const DILexicalBlockFile &N) {
@@ -1347,95 +1370,95 @@ void Verifier::visitDILexicalBlockFile(const DILexicalBlockFile &N) {
 }
 
 void Verifier::visitDICommonBlock(const DICommonBlock &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_common_block, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_common_block, "invalid tag", &N);
   if (auto *S = N.getRawScope())
-    AssertDI(isa<DIScope>(S), "invalid scope ref", &N, S);
+    CheckDI(isa<DIScope>(S), "invalid scope ref", &N, S);
   if (auto *S = N.getRawDecl())
-    AssertDI(isa<DIGlobalVariable>(S), "invalid declaration", &N, S);
+    CheckDI(isa<DIGlobalVariable>(S), "invalid declaration", &N, S);
 }
 
 void Verifier::visitDINamespace(const DINamespace &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_namespace, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_namespace, "invalid tag", &N);
   if (auto *S = N.getRawScope())
-    AssertDI(isa<DIScope>(S), "invalid scope ref", &N, S);
+    CheckDI(isa<DIScope>(S), "invalid scope ref", &N, S);
 }
 
 void Verifier::visitDIMacro(const DIMacro &N) {
-  AssertDI(N.getMacinfoType() == dwarf::DW_MACINFO_define ||
-               N.getMacinfoType() == dwarf::DW_MACINFO_undef,
-           "invalid macinfo type", &N);
-  AssertDI(!N.getName().empty(), "anonymous macro", &N);
+  CheckDI(N.getMacinfoType() == dwarf::DW_MACINFO_define ||
+              N.getMacinfoType() == dwarf::DW_MACINFO_undef,
+          "invalid macinfo type", &N);
+  CheckDI(!N.getName().empty(), "anonymous macro", &N);
   if (!N.getValue().empty()) {
     assert(N.getValue().data()[0] != ' ' && "Macro value has a space prefix");
   }
 }
 
 void Verifier::visitDIMacroFile(const DIMacroFile &N) {
-  AssertDI(N.getMacinfoType() == dwarf::DW_MACINFO_start_file,
-           "invalid macinfo type", &N);
+  CheckDI(N.getMacinfoType() == dwarf::DW_MACINFO_start_file,
+          "invalid macinfo type", &N);
   if (auto *F = N.getRawFile())
-    AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+    CheckDI(isa<DIFile>(F), "invalid file", &N, F);
 
   if (auto *Array = N.getRawElements()) {
-    AssertDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
+    CheckDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
     for (Metadata *Op : N.getElements()->operands()) {
-      AssertDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
+      CheckDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
    }
   }
 }
 
 void Verifier::visitDIArgList(const DIArgList &N) {
-  AssertDI(!N.getNumOperands(),
-           "DIArgList should have no operands other than a list of "
-           "ValueAsMetadata",
-           &N);
+  CheckDI(!N.getNumOperands(),
+          "DIArgList should have no operands other than a list of "
+          "ValueAsMetadata",
+          &N);
 }
 
 void Verifier::visitDIModule(const DIModule &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N);
-  AssertDI(!N.getName().empty(), "anonymous module", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N);
+  CheckDI(!N.getName().empty(), "anonymous module", &N);
 }
 
 void Verifier::visitDITemplateParameter(const DITemplateParameter &N) {
-  AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
+  CheckDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
 }
 
 void Verifier::visitDITemplateTypeParameter(const DITemplateTypeParameter &N) {
   visitDITemplateParameter(N);
-  AssertDI(N.getTag() == dwarf::DW_TAG_template_type_parameter, "invalid tag",
-           &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_template_type_parameter, "invalid tag",
+          &N);
 }
 
 void Verifier::visitDITemplateValueParameter(
     const DITemplateValueParameter &N) {
   visitDITemplateParameter(N);
-  AssertDI(N.getTag() == dwarf::DW_TAG_template_value_parameter ||
-               N.getTag() == dwarf::DW_TAG_GNU_template_template_param ||
-               N.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack,
-           "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_template_value_parameter ||
+              N.getTag() == dwarf::DW_TAG_GNU_template_template_param ||
+              N.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack,
+          "invalid tag", &N);
 }
 
 void Verifier::visitDIVariable(const DIVariable &N) {
   if (auto *S = N.getRawScope())
-    AssertDI(isa<DIScope>(S), "invalid scope", &N, S);
+    CheckDI(isa<DIScope>(S), "invalid scope", &N, S);
   if (auto *F = N.getRawFile())
-    AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+    CheckDI(isa<DIFile>(F), "invalid file", &N, F);
 }
 
 void Verifier::visitDIGlobalVariable(const DIGlobalVariable &N) {
   // Checks common to all variables.
   visitDIVariable(N);
 
-  AssertDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
-  AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
-  // Assert only if the global variable is not an extern
+  CheckDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
+  CheckDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
+  // Check only if the global variable is not an extern
   if (N.isDefinition())
-    AssertDI(N.getType(), "missing global variable type", &N);
+    CheckDI(N.getType(), "missing global variable type", &N);
   if (auto *Member = N.getRawStaticDataMemberDeclaration()) {
-    AssertDI(isa<DIDerivedType>(Member),
-             "invalid static data member declaration", &N, Member);
+    CheckDI(isa<DIDerivedType>(Member),
+            "invalid static data member declaration", &N, Member);
   }
 }
 
@@ -1443,32 +1466,32 @@ void Verifier::visitDILocalVariable(const DILocalVariable &N) {
   // Checks common to all variables.
   visitDIVariable(N);
 
-  AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
-  AssertDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
-  AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
-           "local variable requires a valid scope", &N, N.getRawScope());
+  CheckDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
+  CheckDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
+  CheckDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+          "local variable requires a valid scope", &N, N.getRawScope());
   if (auto Ty = N.getType())
-    AssertDI(!isa<DISubroutineType>(Ty), "invalid type", &N, N.getType());
+    CheckDI(!isa<DISubroutineType>(Ty), "invalid type", &N, N.getType());
 }
 
 void Verifier::visitDILabel(const DILabel &N) {
   if (auto *S = N.getRawScope())
-    AssertDI(isa<DIScope>(S), "invalid scope", &N, S);
+    CheckDI(isa<DIScope>(S), "invalid scope", &N, S);
   if (auto *F = N.getRawFile())
-    AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+    CheckDI(isa<DIFile>(F), "invalid file", &N, F);
 
-  AssertDI(N.getTag() == dwarf::DW_TAG_label, "invalid tag", &N);
-  AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
-           "label requires a valid scope", &N, N.getRawScope());
+  CheckDI(N.getTag() == dwarf::DW_TAG_label, "invalid tag", &N);
+  CheckDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+          "label requires a valid scope", &N, N.getRawScope());
 }
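Illustrative metadata for the scope rules above (hypothetical numbering):
local variables and labels must live in a DILocalScope such as a subprogram.

    !7 = distinct !DISubprogram(name: "f", file: !1, line: 3, unit: !0)
    !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
    !9 = !DILocalVariable(name: "x", scope: !7, file: !1, line: 4, type: !8)
    !10 = !DILabel(scope: !7, name: "out", file: !1, line: 9)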
 
 void Verifier::visitDIExpression(const DIExpression &N) {
-  AssertDI(N.isValid(), "invalid expression", &N);
+  CheckDI(N.isValid(), "invalid expression", &N);
 }
 
 void Verifier::visitDIGlobalVariableExpression(
     const DIGlobalVariableExpression &GVE) {
-  AssertDI(GVE.getVariable(), "missing variable");
+  CheckDI(GVE.getVariable(), "missing variable");
   if (auto *Var = GVE.getVariable())
     visitDIGlobalVariable(*Var);
   if (auto *Expr = GVE.getExpression()) {
@@ -1479,21 +1502,21 @@ void Verifier::visitDIGlobalVariableExpression(
 }
 
 void Verifier::visitDIObjCProperty(const DIObjCProperty &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_APPLE_property, "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_APPLE_property, "invalid tag", &N);
   if (auto *T = N.getRawType())
-    AssertDI(isType(T), "invalid type ref", &N, T);
+    CheckDI(isType(T), "invalid type ref", &N, T);
   if (auto *F = N.getRawFile())
-    AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+    CheckDI(isa<DIFile>(F), "invalid file", &N, F);
 }
 
 void Verifier::visitDIImportedEntity(const DIImportedEntity &N) {
-  AssertDI(N.getTag() == dwarf::DW_TAG_imported_module ||
-               N.getTag() == dwarf::DW_TAG_imported_declaration,
-           "invalid tag", &N);
+  CheckDI(N.getTag() == dwarf::DW_TAG_imported_module ||
+              N.getTag() == dwarf::DW_TAG_imported_declaration,
+          "invalid tag", &N);
   if (auto *S = N.getRawScope())
-    AssertDI(isa<DIScope>(S), "invalid scope for imported entity", &N, S);
-  AssertDI(isDINode(N.getRawEntity()), "invalid imported entity", &N,
-           N.getRawEntity());
+    CheckDI(isa<DIScope>(S), "invalid scope for imported entity", &N, S);
+  CheckDI(isDINode(N.getRawEntity()), "invalid imported entity", &N,
+          N.getRawEntity());
 }
 
 void Verifier::visitComdat(const Comdat &C) {
@@ -1501,8 +1524,8 @@ void Verifier::visitComdat(const Comdat &C) {
   // Entities with private linkage don't have entries in the symbol table.
   if (TT.isOSBinFormatCOFF())
     if (const GlobalValue *GV = M.getNamedValue(C.getName()))
-      Assert(!GV->hasPrivateLinkage(),
-             "comdat global value has private linkage", GV);
+      Check(!GV->hasPrivateLinkage(), "comdat global value has private linkage",
+            GV);
 }
 
 void Verifier::visitModuleIdents() {
@@ -1513,12 +1536,12 @@ void Verifier::visitModuleIdents() {
   // llvm.ident takes a list of metadata entry. Each entry has only one string.
   // Scan each llvm.ident entry and make sure that this requirement is met.
   for (const MDNode *N : Idents->operands()) {
-    Assert(N->getNumOperands() == 1,
-           "incorrect number of operands in llvm.ident metadata", N);
-    Assert(dyn_cast_or_null<MDString>(N->getOperand(0)),
-           ("invalid value for llvm.ident metadata entry operand"
-            "(the operand should be a string)"),
-           N->getOperand(0));
+    Check(N->getNumOperands() == 1,
+          "incorrect number of operands in llvm.ident metadata", N);
+    Check(dyn_cast_or_null<MDString>(N->getOperand(0)),
+          ("invalid value for llvm.ident metadata entry operand"
+           "(the operand should be a string)"),
+          N->getOperand(0));
   }
 }
 
@@ -1531,12 +1554,12 @@ void Verifier::visitModuleCommandLines() {
   // string. Scan each llvm.commandline entry and make sure that this
   // requirement is met.
   for (const MDNode *N : CommandLines->operands()) {
-    Assert(N->getNumOperands() == 1,
-           "incorrect number of operands in llvm.commandline metadata", N);
-    Assert(dyn_cast_or_null<MDString>(N->getOperand(0)),
-           ("invalid value for llvm.commandline metadata entry operand"
-            "(the operand should be a string)"),
-           N->getOperand(0));
+    Check(N->getNumOperands() == 1,
+          "incorrect number of operands in llvm.commandline metadata", N);
+    Check(dyn_cast_or_null<MDString>(N->getOperand(0)),
+          ("invalid value for llvm.commandline metadata entry operand"
+           "(the operand should be a string)"),
+          N->getOperand(0));
   }
 }
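A sketch of well-formed entries for the two walks above; each list element is
a node holding exactly one string (contents hypothetical).

    !llvm.ident = !{!20}
    !llvm.commandline = !{!21}
    !20 = !{!"clang version 15.0.0 (hypothetical)"}
    !21 = !{!"-cc1 -O2 t.c"}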
 
@@ -1577,21 +1600,20 @@ Verifier::visitModuleFlag(const MDNode *Op,
                           SmallVectorImpl<const MDNode *> &Requirements) {
   // Each module flag should have three arguments, the merge behavior (a
   // constant int), the flag ID (an MDString), and the value.
-  Assert(Op->getNumOperands() == 3,
-         "incorrect number of operands in module flag", Op);
+  Check(Op->getNumOperands() == 3,
+        "incorrect number of operands in module flag", Op);
   Module::ModFlagBehavior MFB;
   if (!Module::isValidModFlagBehavior(Op->getOperand(0), MFB)) {
-    Assert(
-        mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(0)),
-        "invalid behavior operand in module flag (expected constant integer)",
-        Op->getOperand(0));
-    Assert(false,
-           "invalid behavior operand in module flag (unexpected constant)",
-           Op->getOperand(0));
+    Check(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(0)),
+          "invalid behavior operand in module flag (expected constant integer)",
+          Op->getOperand(0));
+    Check(false,
+          "invalid behavior operand in module flag (unexpected constant)",
+          Op->getOperand(0));
   }
   MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
-  Assert(ID, "invalid ID operand in module flag (expected metadata string)",
-         Op->getOperand(1));
+  Check(ID, "invalid ID operand in module flag (expected metadata string)",
+        Op->getOperand(1));
 
   // Check the values for behaviors with additional requirements.
   switch (MFB) {
@@ -1601,10 +1623,17 @@ Verifier::visitModuleFlag(const MDNode *Op,
     // These behavior types accept any value.
     break;
 
+  case Module::Min: {
+    Check(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2)),
+          "invalid value for 'min' module flag (expected constant integer)",
+          Op->getOperand(2));
+    break;
+  }
+
   case Module::Max: {
-    Assert(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2)),
-           "invalid value for 'max' module flag (expected constant integer)",
-           Op->getOperand(2));
+    Check(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2)),
+          "invalid value for 'max' module flag (expected constant integer)",
+          Op->getOperand(2));
     break;
   }
 
@@ -1612,13 +1641,13 @@ Verifier::visitModuleFlag(const MDNode *Op,
     // The value should itself be an MDNode with two operands, a flag ID (an
     // MDString), and a value.
     MDNode *Value = dyn_cast<MDNode>(Op->getOperand(2));
-    Assert(Value && Value->getNumOperands() == 2,
-           "invalid value for 'require' module flag (expected metadata pair)",
-           Op->getOperand(2));
-    Assert(isa<MDString>(Value->getOperand(0)),
-           ("invalid value for 'require' module flag "
-            "(first value operand should be a string)"),
-           Value->getOperand(0));
+    Check(Value && Value->getNumOperands() == 2,
+          "invalid value for 'require' module flag (expected metadata pair)",
+          Op->getOperand(2));
+    Check(isa<MDString>(Value->getOperand(0)),
+          ("invalid value for 'require' module flag "
+           "(first value operand should be a string)"),
+          Value->getOperand(0));
 
     // Append it to the list of requirements, to check once all module flags are
     // scanned.
@@ -1629,10 +1658,10 @@ Verifier::visitModuleFlag(const MDNode *Op,
   case Module::Append:
   case Module::AppendUnique: {
     // These behavior types require the operand be an MDNode.
-    Assert(isa<MDNode>(Op->getOperand(2)),
-           "invalid value for 'append'-type module flag "
-           "(expected a metadata node)",
-           Op->getOperand(2));
+    Check(isa<MDNode>(Op->getOperand(2)),
+          "invalid value for 'append'-type module flag "
+          "(expected a metadata node)",
+          Op->getOperand(2));
     break;
   }
   }
 
@@ -1640,29 +1669,29 @@ Verifier::visitModuleFlag(const MDNode *Op,
   // Unless this is a "requires" flag, check the ID is unique.
   if (MFB != Module::Require) {
     bool Inserted = SeenIDs.insert(std::make_pair(ID, Op)).second;
-    Assert(Inserted,
-           "module flag identifiers must be unique (or of 'require' type)", ID);
+    Check(Inserted,
+          "module flag identifiers must be unique (or of 'require' type)", ID);
   }
 
   if (ID->getString() == "wchar_size") {
     ConstantInt *Value =
        mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
-    Assert(Value, "wchar_size metadata requires constant integer argument");
+    Check(Value, "wchar_size metadata requires constant integer argument");
  }
 
   if (ID->getString() == "Linker Options") {
     // If the llvm.linker.options named metadata exists, we assume that the
     // bitcode reader has upgraded the module flag. Otherwise the flag might
     // have been created by a client directly.
-    Assert(M.getNamedMetadata("llvm.linker.options"),
-           "'Linker Options' named metadata no longer supported");
+    Check(M.getNamedMetadata("llvm.linker.options"),
+          "'Linker Options' named metadata no longer supported");
   }
 
   if (ID->getString() == "SemanticInterposition") {
     ConstantInt *Value =
         mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
-    Assert(Value,
-           "SemanticInterposition metadata requires constant integer argument");
+    Check(Value,
+          "SemanticInterposition metadata requires constant integer argument");
  }
 
   if (ID->getString() == "CG Profile") {
@@ -1676,16 +1705,16 @@ void Verifier::visitModuleFlagCGProfileEntry(const MDOperand &MDO) {
     if (!FuncMDO)
       return;
     auto F = dyn_cast<ValueAsMetadata>(FuncMDO);
-    Assert(F && isa<Function>(F->getValue()->stripPointerCasts()),
-           "expected a Function or null", FuncMDO);
+    Check(F && isa<Function>(F->getValue()->stripPointerCasts()),
+          "expected a Function or null", FuncMDO);
   };
   auto Node = dyn_cast_or_null<MDNode>(MDO);
-  Assert(Node && Node->getNumOperands() == 3, "expected a MDNode triple", MDO);
+  Check(Node && Node->getNumOperands() == 3, "expected a MDNode triple", MDO);
   CheckFunction(Node->getOperand(0));
   CheckFunction(Node->getOperand(1));
   auto Count = dyn_cast_or_null<ConstantAsMetadata>(Node->getOperand(2));
-  Assert(Count && Count->getType()->isIntegerTy(),
-         "expected an integer constant", Node->getOperand(2));
+  Check(Count && Count->getType()->isIntegerTy(),
+        "expected an integer constant", Node->getOperand(2));
 }
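For illustration, module flags exercising the newly verified 'min' behavior
(8) next to the existing 'max' (7); the flag names here are hypothetical.

    !llvm.module.flags = !{!30, !31}
    !30 = !{i32 8, !"small-data-limit", i32 128}  ; behavior 8 = Min
    !31 = !{i32 7, !"PIC Level", i32 2}           ; behavior 7 = Max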
 
 void Verifier::verifyAttributeTypes(AttributeSet Attrs, const Value *V) {
@@ -1724,15 +1753,14 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
   verifyAttributeTypes(Attrs, V);
 
   for (Attribute Attr : Attrs)
-    Assert(Attr.isStringAttribute() ||
-           Attribute::canUseAsParamAttr(Attr.getKindAsEnum()),
-           "Attribute '" + Attr.getAsString() +
-               "' does not apply to parameters",
-           V);
+    Check(Attr.isStringAttribute() ||
+              Attribute::canUseAsParamAttr(Attr.getKindAsEnum()),
+          "Attribute '" + Attr.getAsString() + "' does not apply to parameters",
+          V);
 
   if (Attrs.hasAttribute(Attribute::ImmArg)) {
-    Assert(Attrs.getNumAttributes() == 1,
-           "Attribute 'immarg' is incompatible with other attributes", V);
+    Check(Attrs.getNumAttributes() == 1,
+          "Attribute 'immarg' is incompatible with other attributes", V);
   }
 
   // Check for mutually incompatible attributes.  Only inreg is compatible with
@@ -1745,52 +1773,52 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
                    Attrs.hasAttribute(Attribute::InReg);
   AttrCount += Attrs.hasAttribute(Attribute::Nest);
   AttrCount += Attrs.hasAttribute(Attribute::ByRef);
-  Assert(AttrCount <= 1,
-         "Attributes 'byval', 'inalloca', 'preallocated', 'inreg', 'nest', "
-         "'byref', and 'sret' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::InAlloca) &&
-           Attrs.hasAttribute(Attribute::ReadOnly)),
-         "Attributes "
-         "'inalloca and readonly' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::StructRet) &&
-           Attrs.hasAttribute(Attribute::Returned)),
-         "Attributes "
-         "'sret and returned' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::ZExt) &&
-           Attrs.hasAttribute(Attribute::SExt)),
-         "Attributes "
-         "'zeroext and signext' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
-           Attrs.hasAttribute(Attribute::ReadOnly)),
-         "Attributes "
-         "'readnone and readonly' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
-           Attrs.hasAttribute(Attribute::WriteOnly)),
-         "Attributes "
-         "'readnone and writeonly' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::ReadOnly) &&
-           Attrs.hasAttribute(Attribute::WriteOnly)),
-         "Attributes "
-         "'readonly and writeonly' are incompatible!",
-         V);
-
-  Assert(!(Attrs.hasAttribute(Attribute::NoInline) &&
-           Attrs.hasAttribute(Attribute::AlwaysInline)),
-         "Attributes "
-         "'noinline and alwaysinline' are incompatible!",
-         V);
+  Check(AttrCount <= 1,
+        "Attributes 'byval', 'inalloca', 'preallocated', 'inreg', 'nest', "
+        "'byref', and 'sret' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::InAlloca) &&
+          Attrs.hasAttribute(Attribute::ReadOnly)),
+        "Attributes "
+        "'inalloca and readonly' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::StructRet) &&
+          Attrs.hasAttribute(Attribute::Returned)),
+        "Attributes "
+        "'sret and returned' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::ZExt) &&
+          Attrs.hasAttribute(Attribute::SExt)),
+        "Attributes "
+        "'zeroext and signext' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+          Attrs.hasAttribute(Attribute::ReadOnly)),
+        "Attributes "
+        "'readnone and readonly' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+          Attrs.hasAttribute(Attribute::WriteOnly)),
+        "Attributes "
+        "'readnone and writeonly' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::ReadOnly) &&
+          Attrs.hasAttribute(Attribute::WriteOnly)),
+        "Attributes "
+        "'readonly and writeonly' are incompatible!",
+        V);
+
+  Check(!(Attrs.hasAttribute(Attribute::NoInline) &&
+          Attrs.hasAttribute(Attribute::AlwaysInline)),
+        "Attributes "
+        "'noinline and alwaysinline' are incompatible!",
+        V);
 
   AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
   for (Attribute Attr : Attrs) {
@@ -1804,55 +1832,61 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
 
   if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
     if (Attrs.hasAttribute(Attribute::ByVal)) {
+      if (Attrs.hasAttribute(Attribute::Alignment)) {
+        Align AttrAlign = Attrs.getAlignment().valueOrOne();
+        Align MaxAlign(ParamMaxAlignment);
+        Check(AttrAlign <= MaxAlign,
+              "Attribute 'align' exceed the max size 2^14", V);
+      }
       SmallPtrSet<Type *, 4> Visited;
-      Assert(Attrs.getByValType()->isSized(&Visited),
-             "Attribute 'byval' does not support unsized types!", V);
+      Check(Attrs.getByValType()->isSized(&Visited),
+            "Attribute 'byval' does not support unsized types!", V);
     }
     if (Attrs.hasAttribute(Attribute::ByRef)) {
      SmallPtrSet<Type *, 4> Visited;
-      Assert(Attrs.getByRefType()->isSized(&Visited),
-             "Attribute 'byref' does not support unsized types!", V);
+      Check(Attrs.getByRefType()->isSized(&Visited),
+            "Attribute 'byref' does not support unsized types!", V);
    }
     if (Attrs.hasAttribute(Attribute::InAlloca)) {
       SmallPtrSet<Type *, 4> Visited;
-      Assert(Attrs.getInAllocaType()->isSized(&Visited),
-             "Attribute 'inalloca' does not support unsized types!", V);
+      Check(Attrs.getInAllocaType()->isSized(&Visited),
+            "Attribute 'inalloca' does not support unsized types!", V);
    }
     if (Attrs.hasAttribute(Attribute::Preallocated)) {
       SmallPtrSet<Type *, 4> Visited;
-      Assert(Attrs.getPreallocatedType()->isSized(&Visited),
-             "Attribute 'preallocated' does not support unsized types!", V);
+      Check(Attrs.getPreallocatedType()->isSized(&Visited),
+            "Attribute 'preallocated' does not support unsized types!", V);
    }
     if (!PTy->isOpaque()) {
       if (!isa<PointerType>(PTy->getNonOpaquePointerElementType()))
-        Assert(!Attrs.hasAttribute(Attribute::SwiftError),
-               "Attribute 'swifterror' only applies to parameters "
-               "with pointer to pointer type!",
-               V);
+        Check(!Attrs.hasAttribute(Attribute::SwiftError),
+              "Attribute 'swifterror' only applies to parameters "
+              "with pointer to pointer type!",
+              V);
      if (Attrs.hasAttribute(Attribute::ByRef)) {
-        Assert(Attrs.getByRefType() == PTy->getNonOpaquePointerElementType(),
-               "Attribute 'byref' type does not match parameter!", V);
+        Check(Attrs.getByRefType() == PTy->getNonOpaquePointerElementType(),
+              "Attribute 'byref' type does not match parameter!", V);
       }
 
       if (Attrs.hasAttribute(Attribute::ByVal) && Attrs.getByValType()) {
-        Assert(Attrs.getByValType() == PTy->getNonOpaquePointerElementType(),
-               "Attribute 'byval' type does not match parameter!", V);
+        Check(Attrs.getByValType() == PTy->getNonOpaquePointerElementType(),
+              "Attribute 'byval' type does not match parameter!", V);
       }
 
       if (Attrs.hasAttribute(Attribute::Preallocated)) {
-        Assert(Attrs.getPreallocatedType() ==
-                   PTy->getNonOpaquePointerElementType(),
-               "Attribute 'preallocated' type does not match parameter!", V);
+        Check(Attrs.getPreallocatedType() ==
+                  PTy->getNonOpaquePointerElementType(),
+              "Attribute 'preallocated' type does not match parameter!", V);
      }
 
       if (Attrs.hasAttribute(Attribute::InAlloca)) {
-        Assert(Attrs.getInAllocaType() == PTy->getNonOpaquePointerElementType(),
-               "Attribute 'inalloca' type does not match parameter!", V);
+        Check(Attrs.getInAllocaType() == PTy->getNonOpaquePointerElementType(),
+              "Attribute 'inalloca' type does not match parameter!", V);
       }
 
       if (Attrs.hasAttribute(Attribute::ElementType)) {
-        Assert(Attrs.getElementType() == PTy->getNonOpaquePointerElementType(),
-               "Attribute 'elementtype' type does not match parameter!", V);
+        Check(Attrs.getElementType() == PTy->getNonOpaquePointerElementType(),
+              "Attribute 'elementtype' type does not match parameter!", V);
       }
     }
   }
 }
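An illustrative declaration (hypothetical signature) for the new
ParamMaxAlignment cap: align 16384 (2^14) on a byval parameter is the largest
value the verifier accepts, while align 32768 would now be rejected.

    declare void @consume(ptr byval(i64) align 16384 %p)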
 
@@ -1877,14 +1911,14 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
     return;
 
   if (AttributeListsVisited.insert(Attrs.getRawPointer()).second) {
-    Assert(Attrs.hasParentContext(Context),
-           "Attribute list does not match Module context!", &Attrs, V);
+    Check(Attrs.hasParentContext(Context),
+          "Attribute list does not match Module context!", &Attrs, V);
     for (const auto &AttrSet : Attrs) {
-      Assert(!AttrSet.hasAttributes() || AttrSet.hasParentContext(Context),
-             "Attribute set does not match Module context!", &AttrSet, V);
+      Check(!AttrSet.hasAttributes() || AttrSet.hasParentContext(Context),
+            "Attribute set does not match Module context!", &AttrSet, V);
       for (const auto &A : AttrSet) {
-        Assert(A.hasParentContext(Context),
-               "Attribute does not match Module context!", &A, V);
+        Check(A.hasParentContext(Context),
+              "Attribute does not match Module context!", &A, V);
       }
     }
   }
@@ -1899,11 +1933,11 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
   // Verify return value attributes.
   AttributeSet RetAttrs = Attrs.getRetAttrs();
   for (Attribute RetAttr : RetAttrs)
-    Assert(RetAttr.isStringAttribute() ||
-           Attribute::canUseAsRetAttr(RetAttr.getKindAsEnum()),
-           "Attribute '" + RetAttr.getAsString() +
-               "' does not apply to function return values",
-           V);
+    Check(RetAttr.isStringAttribute() ||
+              Attribute::canUseAsRetAttr(RetAttr.getKindAsEnum()),
+          "Attribute '" + RetAttr.getAsString() +
+              "' does not apply to function return values",
+          V);
 
   verifyParameterAttrs(RetAttrs, FT->getReturnType(), V);
 
@@ -1913,56 +1947,55 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
     AttributeSet ArgAttrs = Attrs.getParamAttrs(i);
 
     if (!IsIntrinsic) {
-      Assert(!ArgAttrs.hasAttribute(Attribute::ImmArg),
-             "immarg attribute only applies to intrinsics",V);
+      Check(!ArgAttrs.hasAttribute(Attribute::ImmArg),
+            "immarg attribute only applies to intrinsics", V);
       if (!IsInlineAsm)
-        Assert(!ArgAttrs.hasAttribute(Attribute::ElementType),
-               "Attribute 'elementtype' can only be applied to intrinsics"
-               " and inline asm.", V);
+        Check(!ArgAttrs.hasAttribute(Attribute::ElementType),
+              "Attribute 'elementtype' can only be applied to intrinsics"
+              " and inline asm.",
+              V);
     }
 
     verifyParameterAttrs(ArgAttrs, Ty, V);
 
     if (ArgAttrs.hasAttribute(Attribute::Nest)) {
-      Assert(!SawNest, "More than one parameter has attribute nest!", V);
+      Check(!SawNest, "More than one parameter has attribute nest!", V);
      SawNest = true;
    }
 
     if (ArgAttrs.hasAttribute(Attribute::Returned)) {
-      Assert(!SawReturned, "More than one parameter has attribute returned!",
-             V);
-      Assert(Ty->canLosslesslyBitCastTo(FT->getReturnType()),
-             "Incompatible argument and return types for 'returned' attribute",
-             V);
+      Check(!SawReturned, "More than one parameter has attribute returned!", V);
+      Check(Ty->canLosslesslyBitCastTo(FT->getReturnType()),
+            "Incompatible argument and return types for 'returned' attribute",
+            V);
       SawReturned = true;
     }
 
     if (ArgAttrs.hasAttribute(Attribute::StructRet)) {
-      Assert(!SawSRet, "Cannot have multiple 'sret' parameters!", V);
-      Assert(i == 0 || i == 1,
-             "Attribute 'sret' is not on first or second parameter!", V);
+      Check(!SawSRet, "Cannot have multiple 'sret' parameters!", V);
+      Check(i == 0 || i == 1,
+            "Attribute 'sret' is not on first or second parameter!", V);
       SawSRet = true;
     }
 
     if (ArgAttrs.hasAttribute(Attribute::SwiftSelf)) {
-      Assert(!SawSwiftSelf, "Cannot have multiple 'swiftself' parameters!", V);
+      Check(!SawSwiftSelf, "Cannot have multiple 'swiftself' parameters!", V);
       SawSwiftSelf = true;
     }
 
     if (ArgAttrs.hasAttribute(Attribute::SwiftAsync)) {
-      Assert(!SawSwiftAsync, "Cannot have multiple 'swiftasync' parameters!", V);
+      Check(!SawSwiftAsync, "Cannot have multiple 'swiftasync' parameters!", V);
       SawSwiftAsync = true;
     }
 
     if (ArgAttrs.hasAttribute(Attribute::SwiftError)) {
-      Assert(!SawSwiftError, "Cannot have multiple 'swifterror' parameters!",
-             V);
+      Check(!SawSwiftError, "Cannot have multiple 'swifterror' parameters!", V);
       SawSwiftError = true;
     }
 
     if (ArgAttrs.hasAttribute(Attribute::InAlloca)) {
-      Assert(i == FT->getNumParams() - 1,
-             "inalloca isn't on the last parameter!", V);
+      Check(i == FT->getNumParams() - 1,
+            "inalloca isn't on the last parameter!", V);
     }
   }
 
@@ -1971,53 +2004,53 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
   verifyAttributeTypes(Attrs.getFnAttrs(), V);
   for (Attribute FnAttr : Attrs.getFnAttrs())
-    Assert(FnAttr.isStringAttribute() ||
-           Attribute::canUseAsFnAttr(FnAttr.getKindAsEnum()),
-           "Attribute '" + FnAttr.getAsString() +
-               "' does not apply to functions!",
-           V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
-           Attrs.hasFnAttr(Attribute::ReadOnly)),
-         "Attributes 'readnone and readonly' are incompatible!", V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
-           Attrs.hasFnAttr(Attribute::WriteOnly)),
-         "Attributes 'readnone and writeonly' are incompatible!", V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::ReadOnly) &&
-           Attrs.hasFnAttr(Attribute::WriteOnly)),
-         "Attributes 'readonly and writeonly' are incompatible!", V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
-           Attrs.hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly)),
-         "Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
-         "incompatible!",
-         V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
-           Attrs.hasFnAttr(Attribute::InaccessibleMemOnly)),
-         "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
-
-  Assert(!(Attrs.hasFnAttr(Attribute::NoInline) &&
-           Attrs.hasFnAttr(Attribute::AlwaysInline)),
-         "Attributes 'noinline and alwaysinline' are incompatible!", V);
+  Check(FnAttr.isStringAttribute() ||
+            Attribute::canUseAsFnAttr(FnAttr.getKindAsEnum()),
+        "Attribute '" + FnAttr.getAsString() +
+            "' does not apply to functions!",
+        V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+          Attrs.hasFnAttr(Attribute::ReadOnly)),
+        "Attributes 'readnone and readonly' are incompatible!", V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+          Attrs.hasFnAttr(Attribute::WriteOnly)),
+        "Attributes 'readnone and writeonly' are incompatible!", V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::ReadOnly) &&
+          Attrs.hasFnAttr(Attribute::WriteOnly)),
+        "Attributes 'readonly and writeonly' are incompatible!", V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+          Attrs.hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly)),
+        "Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
+        "incompatible!",
+        V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+          Attrs.hasFnAttr(Attribute::InaccessibleMemOnly)),
+        "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
+
+  Check(!(Attrs.hasFnAttr(Attribute::NoInline) &&
+          Attrs.hasFnAttr(Attribute::AlwaysInline)),
+        "Attributes 'noinline and alwaysinline' are incompatible!", V);
 
   if (Attrs.hasFnAttr(Attribute::OptimizeNone)) {
-    Assert(Attrs.hasFnAttr(Attribute::NoInline),
-           "Attribute 'optnone' requires 'noinline'!", V);
+    Check(Attrs.hasFnAttr(Attribute::NoInline),
+          "Attribute 'optnone' requires 'noinline'!", V);
 
-    Assert(!Attrs.hasFnAttr(Attribute::OptimizeForSize),
-           "Attributes 'optsize and optnone' are incompatible!", V);
+    Check(!Attrs.hasFnAttr(Attribute::OptimizeForSize),
+          "Attributes 'optsize and optnone' are incompatible!", V);
 
-    Assert(!Attrs.hasFnAttr(Attribute::MinSize),
-           "Attributes 'minsize and optnone' are incompatible!", V);
+    Check(!Attrs.hasFnAttr(Attribute::MinSize),
+          "Attributes 'minsize and optnone' are incompatible!", V);
   }
 
   if (Attrs.hasFnAttr(Attribute::JumpTable)) {
     const GlobalValue *GV = cast<GlobalValue>(V);
-    Assert(GV->hasGlobalUnnamedAddr(),
-           "Attribute 'jumptable' requires 'unnamed_addr'", V);
+    Check(GV->hasGlobalUnnamedAddr(),
+          "Attribute 'jumptable' requires 'unnamed_addr'", V);
   }
 
   if (Attrs.hasFnAttr(Attribute::AllocSize)) {
@@ -2047,6 +2080,25 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
     return;
   }
 
+  if (Attrs.hasFnAttr(Attribute::AllocKind)) {
+    AllocFnKind K = Attrs.getAllocKind();
+    AllocFnKind Type =
+        K & (AllocFnKind::Alloc | AllocFnKind::Realloc | AllocFnKind::Free);
+    if (!is_contained(
+            {AllocFnKind::Alloc, AllocFnKind::Realloc, AllocFnKind::Free},
+            Type))
+      CheckFailed(
+          "'allockind()' requires exactly one of alloc, realloc, and free");
+    if ((Type == AllocFnKind::Free) &&
+        ((K & (AllocFnKind::Uninitialized | AllocFnKind::Zeroed |
+               AllocFnKind::Aligned)) != AllocFnKind::Unknown))
+      CheckFailed("'allockind(\"free\")' doesn't allow uninitialized, zeroed, "
+                  "or aligned modifiers.");
+    AllocFnKind ZeroedUninit = AllocFnKind::Uninitialized | AllocFnKind::Zeroed;
+    if ((K & ZeroedUninit) == ZeroedUninit)
+      CheckFailed("'allockind()' can't be both zeroed and uninitialized");
+  }
+
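A sketch of declarations the new allockind verification accepts (function
names hypothetical): exactly one of alloc, realloc, free; no modifiers on
free; zeroed and uninitialized never combined.

    declare ptr @my_malloc(i64) allockind("alloc,uninitialized")
    declare ptr @my_calloc(i64, i64) allockind("alloc,zeroed")
    declare ptr @my_realloc(ptr, i64) allockind("realloc")
    declare void @my_free(ptr) allockind("free")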
   if (Attrs.hasFnAttr(Attribute::VScaleRange)) {
     unsigned VScaleMin = Attrs.getFnAttrs().getVScaleRangeMin();
     if (VScaleMin == 0)
@@ -2073,27 +2125,27 @@ void Verifier::verifyFunctionMetadata(
   for (const auto &Pair : MDs) {
     if (Pair.first == LLVMContext::MD_prof) {
       MDNode *MD = Pair.second;
-      Assert(MD->getNumOperands() >= 2,
-             "!prof annotations should have no less than 2 operands", MD);
+      Check(MD->getNumOperands() >= 2,
+            "!prof annotations should have no less than 2 operands", MD);
 
       // Check first operand.
-      Assert(MD->getOperand(0) != nullptr, "first operand should not be null",
-             MD);
-      Assert(isa<MDString>(MD->getOperand(0)),
-             "expected string with name of the !prof annotation", MD);
+      Check(MD->getOperand(0) != nullptr, "first operand should not be null",
+            MD);
+      Check(isa<MDString>(MD->getOperand(0)),
+            "expected string with name of the !prof annotation", MD);
       MDString *MDS = cast<MDString>(MD->getOperand(0));
       StringRef ProfName = MDS->getString();
-      Assert(ProfName.equals("function_entry_count") ||
-                 ProfName.equals("synthetic_function_entry_count"),
-             "first operand should be 'function_entry_count'"
-             " or 'synthetic_function_entry_count'",
-             MD);
+      Check(ProfName.equals("function_entry_count") ||
+                ProfName.equals("synthetic_function_entry_count"),
+            "first operand should be 'function_entry_count'"
+            " or 'synthetic_function_entry_count'",
+            MD);
 
       // Check second operand.
-      Assert(MD->getOperand(1) != nullptr, "second operand should not be null",
-             MD);
-      Assert(isa<ConstantAsMetadata>(MD->getOperand(1)),
-             "expected integer argument to function_entry_count", MD);
+      Check(MD->getOperand(1) != nullptr, "second operand should not be null",
+            MD);
+      Check(isa<ConstantAsMetadata>(MD->getOperand(1)),
+            "expected integer argument to function_entry_count", MD);
     }
   }
 }
@@ -2115,8 +2167,8 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
     if (const auto *GV = dyn_cast<GlobalValue>(C)) {
       // Global Values get visited separately, but we do need to make sure
       // that the global value is in the correct module
-      Assert(GV->getParent() == &M, "Referencing global in another module!",
-             EntryC, &M, GV, GV->getParent());
+      Check(GV->getParent() == &M, "Referencing global in another module!",
+            EntryC, &M, GV, GV->getParent());
       continue;
     }
 
@@ -2134,9 +2186,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
 
 void Verifier::visitConstantExpr(const ConstantExpr *CE) {
   if (CE->getOpcode() == Instruction::BitCast)
-    Assert(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
-                                 CE->getType()),
-           "Invalid bitcast", CE);
+    Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
+                                CE->getType()),
+          "Invalid bitcast", CE);
 }
 
 bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
@@ -2155,17 +2207,17 @@ void Verifier::verifyInlineAsmCall(const CallBase &Call) {
     if (CI.isIndirect) {
       const Value *Arg = Call.getArgOperand(ArgNo);
-      Assert(Arg->getType()->isPointerTy(),
-             "Operand for indirect constraint must have pointer type",
-             &Call);
+      Check(Arg->getType()->isPointerTy(),
+            "Operand for indirect constraint must have pointer type", &Call);
 
-      Assert(Call.getAttributes().getParamElementType(ArgNo),
-             "Operand for indirect constraint must have elementtype attribute",
-             &Call);
+      Check(Call.getParamElementType(ArgNo),
+            "Operand for indirect constraint must have elementtype attribute",
+            &Call);
     } else {
-      Assert(!Call.paramHasAttr(ArgNo, Attribute::ElementType),
-             "Elementtype attribute can only be applied for indirect "
-             "constraints", &Call);
+      Check(!Call.paramHasAttr(ArgNo, Attribute::ElementType),
+            "Elementtype attribute can only be applied for indirect "
+            "constraints",
+            &Call);
     }
 
     ArgNo++;
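An illustrative call showing the elementtype rule for inline asm (assembly
string hypothetical): indirect "*m" operands must carry the attribute, while
direct operands must not.

    call void asm "movl $$1, $0", "=*m"(ptr elementtype(i32) %slot)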
+  Type *TargetElemType = Call.getParamElementType(2);
+  Check(TargetElemType,
+        "gc.statepoint callee argument must have elementtype attribute", Call);
+  FunctionType *TargetFuncType = dyn_cast<FunctionType>(TargetElemType);
+  Check(TargetFuncType,
+        "gc.statepoint callee elementtype must be function type", Call);
 
   const int NumCallArgs = cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue();
-  Assert(NumCallArgs >= 0,
-         "gc.statepoint number of arguments to underlying call "
-         "must be positive",
-         Call);
+  Check(NumCallArgs >= 0,
+        "gc.statepoint number of arguments to underlying call "
+        "must be positive",
+        Call);
   const int NumParams = (int)TargetFuncType->getNumParams();
   if (TargetFuncType->isVarArg()) {
-    Assert(NumCallArgs >= NumParams,
-           "gc.statepoint mismatch in number of vararg call args", Call);
+    Check(NumCallArgs >= NumParams,
+          "gc.statepoint mismatch in number of vararg call args", Call);
 
     // TODO: Remove this limitation
-    Assert(TargetFuncType->getReturnType()->isVoidTy(),
-           "gc.statepoint doesn't support wrapping non-void "
-           "vararg functions yet",
-           Call);
+    Check(TargetFuncType->getReturnType()->isVoidTy(),
+          "gc.statepoint doesn't support wrapping non-void "
+          "vararg functions yet",
+          Call);
   } else
-    Assert(NumCallArgs == NumParams,
-           "gc.statepoint mismatch in number of call args", Call);
+    Check(NumCallArgs == NumParams,
+          "gc.statepoint mismatch in number of call args", Call);
 
   const uint64_t Flags =
       cast<ConstantInt>(Call.getArgOperand(4))->getZExtValue();
-  Assert((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0,
-         "unknown flag used in gc.statepoint flags argument", Call);
+  Check((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0,
+        "unknown flag used in gc.statepoint flags argument", Call);
 
   // Verify that the types of the call parameter arguments match
   // the type of the wrapped callee.
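With opaque pointers the verifier can no longer derive the wrapped callee's signature from the pointer's pointee type, so the hunk above reads it from the callsite's elementtype attribute instead. A hedged sketch of the new acceptance rule, using a toy type model in place of llvm::Type:

    // Sketch: the statepoint callee signature now comes from elementtype.
    #include <cassert>

    enum class TyKind { Function, Integer, Pointer };

    struct CallSiteModel {
      bool HasElementType;    // elementtype(...) present on the callee arg?
      TyKind ElementTypeKind; // what kind of type it names
    };

    bool verifyStatepointCallee(const CallSiteModel &CS) {
      if (!CS.HasElementType)
        return false; // "callee argument must have elementtype attribute"
      // "callee elementtype must be function type"
      return CS.ElementTypeKind == TyKind::Function;
    }

    int main() {
      assert(verifyStatepointCallee({true, TyKind::Function}));
      assert(!verifyStatepointCallee({false, TyKind::Function}));
      assert(!verifyStatepointCallee({true, TyKind::Integer}));
    }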
@@ -2229,63 +2281,62 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
   for (int i = 0; i < NumParams; i++) {
     Type *ParamType = TargetFuncType->getParamType(i);
     Type *ArgType = Call.getArgOperand(5 + i)->getType();
-    Assert(ArgType == ParamType,
-           "gc.statepoint call argument does not match wrapped "
-           "function type",
-           Call);
+    Check(ArgType == ParamType,
+          "gc.statepoint call argument does not match wrapped "
+          "function type",
+          Call);
 
     if (TargetFuncType->isVarArg()) {
       AttributeSet ArgAttrs = Attrs.getParamAttrs(5 + i);
-      Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
-             "Attribute 'sret' cannot be used for vararg call arguments!",
-             Call);
+      Check(!ArgAttrs.hasAttribute(Attribute::StructRet),
+            "Attribute 'sret' cannot be used for vararg call arguments!", Call);
     }
   }
 
   const int EndCallArgsInx = 4 + NumCallArgs;
 
   const Value *NumTransitionArgsV = Call.getArgOperand(EndCallArgsInx + 1);
-  Assert(isa<ConstantInt>(NumTransitionArgsV),
-         "gc.statepoint number of transition arguments "
-         "must be constant integer",
-         Call);
+  Check(isa<ConstantInt>(NumTransitionArgsV),
+        "gc.statepoint number of transition arguments "
+        "must be constant integer",
+        Call);
   const int NumTransitionArgs =
       cast<ConstantInt>(NumTransitionArgsV)->getZExtValue();
-  Assert(NumTransitionArgs == 0,
-         "gc.statepoint w/inline transition bundle is deprecated", Call);
+  Check(NumTransitionArgs == 0,
+        "gc.statepoint w/inline transition bundle is deprecated", Call);
   const int EndTransitionArgsInx = EndCallArgsInx + 1 + NumTransitionArgs;
 
   const Value *NumDeoptArgsV = Call.getArgOperand(EndTransitionArgsInx + 1);
-  Assert(isa<ConstantInt>(NumDeoptArgsV),
-         "gc.statepoint number of deoptimization arguments "
-         "must be constant integer",
-         Call);
+  Check(isa<ConstantInt>(NumDeoptArgsV),
+        "gc.statepoint number of deoptimization arguments "
+        "must be constant integer",
+        Call);
   const int NumDeoptArgs = cast<ConstantInt>(NumDeoptArgsV)->getZExtValue();
-  Assert(NumDeoptArgs == 0,
-         "gc.statepoint w/inline deopt operands is deprecated", Call);
+  Check(NumDeoptArgs == 0,
+        "gc.statepoint w/inline deopt operands is deprecated", Call);
 
   const int ExpectedNumArgs = 7 + NumCallArgs;
-  Assert(ExpectedNumArgs == (int)Call.arg_size(),
-         "gc.statepoint too many arguments", Call);
+  Check(ExpectedNumArgs == (int)Call.arg_size(),
+        "gc.statepoint too many arguments", Call);
 
   // Check that the only uses of this gc.statepoint are gc.result or
   // gc.relocate calls which are tied to this statepoint and thus part
   // of the same statepoint sequence
   for (const User *U : Call.users()) {
     const CallInst *UserCall = dyn_cast<const CallInst>(U);
-    Assert(UserCall, "illegal use of statepoint token", Call, U);
+    Check(UserCall, "illegal use of statepoint token", Call, U);
     if (!UserCall)
       continue;
-    Assert(isa<GCResultInst>(UserCall) || isa<GCRelocateInst>(UserCall),
-           "gc.result or gc.relocate are the only value uses "
-           "of a gc.statepoint",
-           Call, U);
+    Check(isa<GCResultInst>(UserCall) || isa<GCRelocateInst>(UserCall),
+          "gc.result or gc.relocate are the only value uses "
+          "of a gc.statepoint",
+          Call, U);
     if (isa<GCResultInst>(UserCall)) {
-      Assert(UserCall->getArgOperand(0) == &Call,
-             "gc.result connected to wrong gc.statepoint", Call, UserCall);
+      Check(UserCall->getArgOperand(0) == &Call,
+            "gc.result connected to wrong gc.statepoint", Call, UserCall);
     } else if (isa<GCRelocateInst>(Call)) {
-      Assert(UserCall->getArgOperand(0) == &Call,
-             "gc.relocate connected to wrong gc.statepoint", Call, UserCall);
+      Check(UserCall->getArgOperand(0) == &Call,
+            "gc.relocate connected to wrong gc.statepoint", Call, UserCall);
     }
   }
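The flags operand test in the statepoint hunks above is a standard unknown-bit check: any bit outside StatepointFlags::MaskAll fails verification. A minimal sketch, with illustrative flag values rather than LLVM's real StatepointFlags:

    // Sketch of the "unknown flag used in gc.statepoint" test.
    #include <cassert>
    #include <cstdint>

    constexpr uint64_t GCTransition = 1 << 0; // illustrative flag bit
    constexpr uint64_t MaskAll = GCTransition; // union of all known flags

    bool knownFlagsOnly(uint64_t Flags) { return (Flags & ~MaskAll) == 0; }

    int main() {
      assert(knownFlagsOnly(0));
      assert(knownFlagsOnly(GCTransition));
      assert(!knownFlagsOnly(1 << 5)); // undefined bit -> verifier failure
    }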
@@ -2304,11 +2355,11 @@ void Verifier::verifyFrameRecoverIndices() {
     Function *F = Counts.first;
     unsigned EscapedObjectCount = Counts.second.first;
     unsigned MaxRecoveredIndex = Counts.second.second;
-    Assert(MaxRecoveredIndex <= EscapedObjectCount,
-           "all indices passed to llvm.localrecover must be less than the "
-           "number of arguments passed to llvm.localescape in the parent "
-           "function",
-           F);
+    Check(MaxRecoveredIndex <= EscapedObjectCount,
+          "all indices passed to llvm.localrecover must be less than the "
+          "number of arguments passed to llvm.localescape in the parent "
+          "function",
+          F);
   }
 }
 
@@ -2345,8 +2396,8 @@ void Verifier::verifySiblingFuncletUnwinds() {
         CycleNodes.push_back(CycleTerminator);
         CyclePad = getSuccPad(CycleTerminator);
       } while (CyclePad != SuccPad);
-      Assert(false, "EH pads can't handle each other's exceptions",
-             ArrayRef<Instruction *>(CycleNodes));
+      Check(false, "EH pads can't handle each other's exceptions",
+            ArrayRef<Instruction *>(CycleNodes));
     }
     // Don't re-walk a node we've already checked
     if (!Visited.insert(SuccPad).second)
@@ -2374,24 +2425,24 @@ void Verifier::visitFunction(const Function &F) {
   FunctionType *FT = F.getFunctionType();
   unsigned NumArgs = F.arg_size();
 
-  Assert(&Context == &F.getContext(),
-         "Function context does not match Module context!", &F);
+  Check(&Context == &F.getContext(),
+        "Function context does not match Module context!", &F);
 
-  Assert(!F.hasCommonLinkage(), "Functions may not have common linkage", &F);
-  Assert(FT->getNumParams() == NumArgs,
-         "# formal arguments must match # of arguments for function type!", &F,
-         FT);
-  Assert(F.getReturnType()->isFirstClassType() ||
-             F.getReturnType()->isVoidTy() || F.getReturnType()->isStructTy(),
-         "Functions cannot return aggregate values!", &F);
+  Check(!F.hasCommonLinkage(), "Functions may not have common linkage", &F);
+  Check(FT->getNumParams() == NumArgs,
+        "# formal arguments must match # of arguments for function type!", &F,
+        FT);
+  Check(F.getReturnType()->isFirstClassType() ||
+            F.getReturnType()->isVoidTy() || F.getReturnType()->isStructTy(),
+        "Functions cannot return aggregate values!", &F);
 
-  Assert(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
-         "Invalid struct return type!", &F);
+  Check(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
+        "Invalid struct return type!", &F);
 
   AttributeList Attrs = F.getAttributes();
 
-  Assert(verifyAttributeCount(Attrs, FT->getNumParams()),
-         "Attribute after last parameter!", &F);
+  Check(verifyAttributeCount(Attrs, FT->getNumParams()),
+        "Attribute after last parameter!", &F);
 
   bool IsIntrinsic = F.isIntrinsic();
 
@@ -2401,11 +2452,11 @@ void Verifier::visitFunction(const Function &F) {
   // On function declarations/definitions, we do not support the builtin
   // attribute. We do not check this in VerifyFunctionAttrs since that is
   // checking for Attributes that can/can not ever be on functions.
-  Assert(!Attrs.hasFnAttr(Attribute::Builtin),
-         "Attribute 'builtin' can only be applied to a callsite.", &F);
+  Check(!Attrs.hasFnAttr(Attribute::Builtin),
+        "Attribute 'builtin' can only be applied to a callsite.", &F);
 
-  Assert(!Attrs.hasAttrSomewhere(Attribute::ElementType),
-         "Attribute 'elementtype' can only be applied to a callsite.", &F);
+  Check(!Attrs.hasAttrSomewhere(Attribute::ElementType),
+        "Attribute 'elementtype' can only be applied to a callsite.", &F);
 
   // Check that this function meets the restrictions on this calling convention.
  // Sometimes varargs is used for perfectly forwarding thunks, so some of these
@@ -2415,38 +2466,37 @@ void Verifier::visitFunction(const Function &F) {
   case CallingConv::C:
     break;
   case CallingConv::X86_INTR: {
-    Assert(F.arg_empty() || Attrs.hasParamAttr(0, Attribute::ByVal),
-           "Calling convention parameter requires byval", &F);
+    Check(F.arg_empty() || Attrs.hasParamAttr(0, Attribute::ByVal),
+          "Calling convention parameter requires byval", &F);
     break;
   }
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    Assert(F.getReturnType()->isVoidTy(),
-           "Calling convention requires void return type", &F);
+    Check(F.getReturnType()->isVoidTy(),
+          "Calling convention requires void return type", &F);
     LLVM_FALLTHROUGH;
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_HS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
   case CallingConv::AMDGPU_CS:
-    Assert(!F.hasStructRetAttr(),
-           "Calling convention does not allow sret", &F);
+    Check(!F.hasStructRetAttr(), "Calling convention does not allow sret", &F);
     if (F.getCallingConv() != CallingConv::SPIR_KERNEL) {
       const unsigned StackAS = DL.getAllocaAddrSpace();
       unsigned i = 0;
       for (const Argument &Arg : F.args()) {
-        Assert(!Attrs.hasParamAttr(i, Attribute::ByVal),
-               "Calling convention disallows byval", &F);
-        Assert(!Attrs.hasParamAttr(i, Attribute::Preallocated),
-               "Calling convention disallows preallocated", &F);
-        Assert(!Attrs.hasParamAttr(i, Attribute::InAlloca),
-               "Calling convention disallows inalloca", &F);
+        Check(!Attrs.hasParamAttr(i, Attribute::ByVal),
+              "Calling convention disallows byval", &F);
+        Check(!Attrs.hasParamAttr(i, Attribute::Preallocated),
+              "Calling convention disallows preallocated", &F);
+        Check(!Attrs.hasParamAttr(i, Attribute::InAlloca),
+              "Calling convention disallows inalloca", &F);
 
         if (Attrs.hasParamAttr(i, Attribute::ByRef)) {
           // FIXME: Should also disallow LDS and GDS, but we don't have the enum
           // value here.
-          Assert(Arg.getType()->getPointerAddressSpace() != StackAS,
-                 "Calling convention disallows stack byref", &F);
+          Check(Arg.getType()->getPointerAddressSpace() != StackAS,
+                "Calling convention disallows stack byref", &F);
         }
 
         ++i;
@@ -2459,27 +2509,28 @@ void Verifier::visitFunction(const Function &F) {
   case CallingConv::Intel_OCL_BI:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
-    Assert(!F.isVarArg(), "Calling convention does not support varargs or "
-                          "perfect forwarding!",
-           &F);
+    Check(!F.isVarArg(),
+          "Calling convention does not support varargs or "
+          "perfect forwarding!",
+          &F);
     break;
   }
 
   // Check that the argument values match the function type for this function...
   unsigned i = 0;
   for (const Argument &Arg : F.args()) {
-    Assert(Arg.getType() == FT->getParamType(i),
-           "Argument value does not match function argument type!", &Arg,
-           FT->getParamType(i));
-    Assert(Arg.getType()->isFirstClassType(),
-           "Function arguments must have first-class types!", &Arg);
+    Check(Arg.getType() == FT->getParamType(i),
+          "Argument value does not match function argument type!", &Arg,
+          FT->getParamType(i));
+    Check(Arg.getType()->isFirstClassType(),
+          "Function arguments must have first-class types!", &Arg);
     if (!IsIntrinsic) {
-      Assert(!Arg.getType()->isMetadataTy(),
-             "Function takes metadata but isn't an intrinsic", &Arg, &F);
-      Assert(!Arg.getType()->isTokenTy(),
-             "Function takes token but isn't an intrinsic", &Arg, &F);
-      Assert(!Arg.getType()->isX86_AMXTy(),
-             "Function takes x86_amx but isn't an intrinsic", &Arg, &F);
+      Check(!Arg.getType()->isMetadataTy(),
+            "Function takes metadata but isn't an intrinsic", &Arg, &F);
+      Check(!Arg.getType()->isTokenTy(),
+            "Function takes token but isn't an intrinsic", &Arg, &F);
+      Check(!Arg.getType()->isX86_AMXTy(),
+            "Function takes x86_amx but isn't an intrinsic", &Arg, &F);
     }
 
     // Check that swifterror argument is only used by loads and stores.
@@ -2490,10 +2541,10 @@ void Verifier::visitFunction(const Function &F) {
   }
 
   if (!IsIntrinsic) {
-    Assert(!F.getReturnType()->isTokenTy(),
-           "Function returns a token but isn't an intrinsic", &F);
-    Assert(!F.getReturnType()->isX86_AMXTy(),
-           "Function returns a x86_amx but isn't an intrinsic", &F);
+    Check(!F.getReturnType()->isTokenTy(),
+          "Function returns a token but isn't an intrinsic", &F);
+    Check(!F.getReturnType()->isX86_AMXTy(),
+          "Function returns a x86_amx but isn't an intrinsic", &F);
   }
 
   // Get the function metadata attachments.
@@ -2506,44 +2557,44 @@ void Verifier::visitFunction(const Function &F) {
   if (F.hasPersonalityFn()) {
     auto *Per = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
     if (Per)
-      Assert(Per->getParent() == F.getParent(),
-             "Referencing personality function in another module!",
-             &F, F.getParent(), Per, Per->getParent());
+      Check(Per->getParent() == F.getParent(),
+            "Referencing personality function in another module!", &F,
+            F.getParent(), Per, Per->getParent());
   }
 
   if (F.isMaterializable()) {
     // Function has a body somewhere we can't see.
-    Assert(MDs.empty(), "unmaterialized function cannot have metadata", &F,
-           MDs.empty() ? nullptr : MDs.front().second);
+    Check(MDs.empty(), "unmaterialized function cannot have metadata", &F,
+          MDs.empty() ? nullptr : MDs.front().second);
   } else if (F.isDeclaration()) {
     for (const auto &I : MDs) {
       // This is used for call site debug information.
-      AssertDI(I.first != LLVMContext::MD_dbg ||
-                   !cast<DISubprogram>(I.second)->isDistinct(),
-               "function declaration may only have a unique !dbg attachment",
-               &F);
-      Assert(I.first != LLVMContext::MD_prof,
-             "function declaration may not have a !prof attachment", &F);
+      CheckDI(I.first != LLVMContext::MD_dbg ||
+                  !cast<DISubprogram>(I.second)->isDistinct(),
+              "function declaration may only have a unique !dbg attachment",
+              &F);
+      Check(I.first != LLVMContext::MD_prof,
+            "function declaration may not have a !prof attachment", &F);
 
       // Verify the metadata itself.
       visitMDNode(*I.second, AreDebugLocsAllowed::Yes);
     }
-    Assert(!F.hasPersonalityFn(),
-           "Function declaration shouldn't have a personality routine", &F);
+    Check(!F.hasPersonalityFn(),
+          "Function declaration shouldn't have a personality routine", &F);
   } else {
     // Verify that this function (which has a body) is not named "llvm.*".  It
     // is not legal to define intrinsics.
-    Assert(!IsIntrinsic, "llvm intrinsics cannot be defined!", &F);
+    Check(!IsIntrinsic, "llvm intrinsics cannot be defined!", &F);
 
     // Check the entry node
     const BasicBlock *Entry = &F.getEntryBlock();
-    Assert(pred_empty(Entry),
-           "Entry block to function must not have predecessors!", Entry);
+    Check(pred_empty(Entry),
+          "Entry block to function must not have predecessors!", Entry);
 
     // The address of the entry block cannot be taken, unless it is dead.
     if (Entry->hasAddressTaken()) {
-      Assert(!BlockAddress::lookup(Entry)->isConstantUsed(),
-             "blockaddress may not be used with the entry block!", Entry);
+      Check(!BlockAddress::lookup(Entry)->isConstantUsed(),
+            "blockaddress may not be used with the entry block!", Entry);
     }
 
     unsigned NumDebugAttachments = 0, NumProfAttachments = 0;
@@ -2556,26 +2607,26 @@ void Verifier::visitFunction(const Function &F) {
         break;
       case LLVMContext::MD_dbg: {
         ++NumDebugAttachments;
-        AssertDI(NumDebugAttachments == 1,
-                 "function must have a single !dbg attachment", &F, I.second);
-        AssertDI(isa<DISubprogram>(I.second),
-                 "function !dbg attachment must be a subprogram", &F, I.second);
-        AssertDI(cast<DISubprogram>(I.second)->isDistinct(),
-                 "function definition may only have a distinct !dbg attachment",
-                 &F);
+        CheckDI(NumDebugAttachments == 1,
+                "function must have a single !dbg attachment", &F, I.second);
+        CheckDI(isa<DISubprogram>(I.second),
+                "function !dbg attachment must be a subprogram", &F, I.second);
+        CheckDI(cast<DISubprogram>(I.second)->isDistinct(),
+                "function definition may only have a distinct !dbg attachment",
+                &F);
 
         auto *SP = cast<DISubprogram>(I.second);
         const Function *&AttachedTo = DISubprogramAttachments[SP];
-        AssertDI(!AttachedTo || AttachedTo == &F,
-                 "DISubprogram attached to more than one function", SP, &F);
+        CheckDI(!AttachedTo || AttachedTo == &F,
+                "DISubprogram attached to more than one function", SP, &F);
         AttachedTo = &F;
         AllowLocs = AreDebugLocsAllowed::Yes;
         break;
       }
       case LLVMContext::MD_prof:
         ++NumProfAttachments;
-        Assert(NumProfAttachments == 1,
-               "function must have a single !prof attachment", &F, I.second);
+        Check(NumProfAttachments == 1,
+              "function must have a single !prof attachment", &F, I.second);
         break;
       }
 
@@ -2592,28 +2643,27 @@ void Verifier::visitFunction(const Function &F) {
     const User *U;
     if (F.hasAddressTaken(&U, false, true, false,
                           /*IgnoreARCAttachedCall=*/true))
-      Assert(false, "Invalid user of intrinsic instruction!", U);
+      Check(false, "Invalid user of intrinsic instruction!", U);
   }
 
   // Check intrinsics' signatures.
   switch (F.getIntrinsicID()) {
   case Intrinsic::experimental_gc_get_pointer_base: {
     FunctionType *FT = F.getFunctionType();
-    Assert(FT->getNumParams() == 1, "wrong number of parameters", F);
-    Assert(isa<PointerType>(F.getReturnType()),
-           "gc.get.pointer.base must return a pointer", F);
-    Assert(FT->getParamType(0) == F.getReturnType(),
-           "gc.get.pointer.base operand and result must be of the same type",
-           F);
+    Check(FT->getNumParams() == 1, "wrong number of parameters", F);
+    Check(isa<PointerType>(F.getReturnType()),
+          "gc.get.pointer.base must return a pointer", F);
+    Check(FT->getParamType(0) == F.getReturnType(),
+          "gc.get.pointer.base operand and result must be of the same type", F);
     break;
   }
   case Intrinsic::experimental_gc_get_pointer_offset: {
     FunctionType *FT = F.getFunctionType();
-    Assert(FT->getNumParams() == 1, "wrong number of parameters", F);
-    Assert(isa<PointerType>(FT->getParamType(0)),
-           "gc.get.pointer.offset operand must be a pointer", F);
-    Assert(F.getReturnType()->isIntegerTy(),
-           "gc.get.pointer.offset must return integer", F);
+    Check(FT->getNumParams() == 1, "wrong number of parameters", F);
+    Check(isa<PointerType>(FT->getParamType(0)),
+          "gc.get.pointer.offset operand must be a pointer", F);
+    Check(F.getReturnType()->isIntegerTy(),
+          "gc.get.pointer.offset must return integer", F);
     break;
   }
   }
@@ -2638,12 +2688,11 @@ void Verifier::visitFunction(const Function &F) {
       return;
 
     Metadata *Parent = DL->getRawScope();
-    AssertDI(Parent && isa<DILocalScope>(Parent),
-             "DILocation's scope must be a DILocalScope", N, &F, &I, DL,
-             Parent);
+    CheckDI(Parent && isa<DILocalScope>(Parent),
+            "DILocation's scope must be a DILocalScope", N, &F, &I, DL, Parent);
 
     DILocalScope *Scope = DL->getInlinedAtScope();
-    Assert(Scope, "Failed to find DILocalScope", DL);
+    Check(Scope, "Failed to find DILocalScope", DL);
 
     if (!Seen.insert(Scope).second)
       return;
@@ -2655,9 +2704,9 @@ void Verifier::visitFunction(const Function &F) {
     if (SP && ((Scope != SP) && !Seen.insert(SP).second))
       return;
 
-    AssertDI(SP->describes(&F),
-             "!dbg attachment points at wrong subprogram for function", N, &F,
-             &I, DL, Scope, SP);
+    CheckDI(SP->describes(&F),
+            "!dbg attachment points at wrong subprogram for function", N, &F,
+            &I, DL, Scope, SP);
   };
   for (auto &BB : F)
     for (auto &I : BB) {
@@ -2677,7 +2726,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
   InstsInThisBlock.clear();
 
   // Ensure that basic blocks have terminators!
-  Assert(BB.getTerminator(), "Basic Block does not have terminator!", &BB);
+  Check(BB.getTerminator(), "Basic Block does not have terminator!", &BB);
 
   // Check constraints that this basic block imposes on all of the PHI nodes in
   // it.
@@ -2686,10 +2735,10 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
     SmallVector<std::pair<BasicBlock *, Value *>, 8> Values;
     llvm::sort(Preds);
     for (const PHINode &PN : BB.phis()) {
-      Assert(PN.getNumIncomingValues() == Preds.size(),
-             "PHINode should have one entry for each predecessor of its "
-             "parent basic block!",
-             &PN);
+      Check(PN.getNumIncomingValues() == Preds.size(),
+            "PHINode should have one entry for each predecessor of its "
+            "parent basic block!",
+            &PN);
 
       // Get and sort all incoming values in the PHI node...
       Values.clear();
@@ -2704,17 +2753,17 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
       // particular basic block in this PHI node, that the incoming values are
      // all identical.
       //
-      Assert(i == 0 || Values[i].first != Values[i - 1].first ||
-                 Values[i].second == Values[i - 1].second,
-             "PHI node has multiple entries for the same basic block with "
-             "different incoming values!",
-             &PN, Values[i].first, Values[i].second, Values[i - 1].second);
+      Check(i == 0 || Values[i].first != Values[i - 1].first ||
+                Values[i].second == Values[i - 1].second,
+            "PHI node has multiple entries for the same basic block with "
+            "different incoming values!",
+            &PN, Values[i].first, Values[i].second, Values[i - 1].second);
 
       // Check to make sure that the predecessors and PHI node entries are
       // matched up.
-      Assert(Values[i].first == Preds[i],
-             "PHI node entries do not match predecessors!", &PN,
-             Values[i].first, Preds[i]);
+      Check(Values[i].first == Preds[i],
+            "PHI node entries do not match predecessors!", &PN,
+            Values[i].first, Preds[i]);
     }
   }
 }
@@ -2722,21 +2771,21 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
 
   // Check that all instructions have their parent pointers set up correctly.
   for (auto &I : BB) {
-    Assert(I.getParent() == &BB, "Instruction has bogus parent pointer!");
+    Check(I.getParent() == &BB, "Instruction has bogus parent pointer!");
   }
 }
 
 void Verifier::visitTerminator(Instruction &I) {
   // Ensure that terminators only exist at the end of the basic block.
-  Assert(&I == I.getParent()->getTerminator(),
-         "Terminator found in the middle of a basic block!", I.getParent());
+  Check(&I == I.getParent()->getTerminator(),
+        "Terminator found in the middle of a basic block!", I.getParent());
   visitInstruction(I);
 }
 
 void Verifier::visitBranchInst(BranchInst &BI) {
   if (BI.isConditional()) {
-    Assert(BI.getCondition()->getType()->isIntegerTy(1),
-           "Branch condition is not 'i1' type!", &BI, BI.getCondition());
+    Check(BI.getCondition()->getType()->isIntegerTy(1),
+          "Branch condition is not 'i1' type!", &BI, BI.getCondition());
   }
   visitTerminator(BI);
 }
 
@@ -2745,15 +2794,15 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
   Function *F = RI.getParent()->getParent();
   unsigned N = RI.getNumOperands();
   if (F->getReturnType()->isVoidTy())
-    Assert(N == 0,
-           "Found return instr that returns non-void in Function of void "
-           "return type!",
-           &RI, F->getReturnType());
+    Check(N == 0,
+          "Found return instr that returns non-void in Function of void "
+          "return type!",
+          &RI, F->getReturnType());
   else
-    Assert(N == 1 && F->getReturnType() == RI.getOperand(0)->getType(),
-           "Function return type does not match operand "
-           "type of return inst!",
-           &RI, F->getReturnType());
+    Check(N == 1 && F->getReturnType() == RI.getOperand(0)->getType(),
+          "Function return type does not match operand "
+          "type of return inst!",
+          &RI, F->getReturnType());
 
   // Check to make sure that the return value has necessary properties for
   // terminators...
@@ -2761,46 +2810,45 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
 }
 
 void Verifier::visitSwitchInst(SwitchInst &SI) {
-  Assert(SI.getType()->isVoidTy(), "Switch must have void result type!", &SI);
+  Check(SI.getType()->isVoidTy(), "Switch must have void result type!", &SI);
   // Check to make sure that all of the constants in the switch instruction
   // have the same type as the switched-on value.
   Type *SwitchTy = SI.getCondition()->getType();
   SmallPtrSet<ConstantInt *, 32> Constants;
   for (auto &Case : SI.cases()) {
-    Assert(Case.getCaseValue()->getType() == SwitchTy,
-           "Switch constants must all be same type as switch value!", &SI);
-    Assert(Constants.insert(Case.getCaseValue()).second,
-           "Duplicate integer as switch case", &SI, Case.getCaseValue());
+    Check(Case.getCaseValue()->getType() == SwitchTy,
+          "Switch constants must all be same type as switch value!", &SI);
+    Check(Constants.insert(Case.getCaseValue()).second,
+          "Duplicate integer as switch case", &SI, Case.getCaseValue());
   }
 
   visitTerminator(SI);
 }
 
 void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
-  Assert(BI.getAddress()->getType()->isPointerTy(),
-         "Indirectbr operand must have pointer type!", &BI);
+  Check(BI.getAddress()->getType()->isPointerTy(),
+        "Indirectbr operand must have pointer type!", &BI);
   for (unsigned i = 0, e = BI.getNumDestinations(); i != e; ++i)
-    Assert(BI.getDestination(i)->getType()->isLabelTy(),
-           "Indirectbr destinations must all have pointer type!", &BI);
+    Check(BI.getDestination(i)->getType()->isLabelTy(),
+          "Indirectbr destinations must all have pointer type!", &BI);
 
   visitTerminator(BI);
 }
 
 void Verifier::visitCallBrInst(CallBrInst &CBI) {
-  Assert(CBI.isInlineAsm(), "Callbr is currently only used for asm-goto!",
-         &CBI);
+  Check(CBI.isInlineAsm(), "Callbr is currently only used for asm-goto!", &CBI);
   const InlineAsm *IA = cast<InlineAsm>(CBI.getCalledOperand());
-  Assert(!IA->canThrow(), "Unwinding from Callbr is not allowed");
+  Check(!IA->canThrow(), "Unwinding from Callbr is not allowed");
   for (unsigned i = 0, e = CBI.getNumSuccessors(); i != e; ++i)
-    Assert(CBI.getSuccessor(i)->getType()->isLabelTy(),
-           "Callbr successors must all have pointer type!", &CBI);
+    Check(CBI.getSuccessor(i)->getType()->isLabelTy(),
+          "Callbr successors must all have pointer type!", &CBI);
   for (unsigned i = 0, e = CBI.getNumOperands(); i != e; ++i) {
-    Assert(i >= CBI.arg_size() || !isa<BasicBlock>(CBI.getOperand(i)),
-           "Using an unescaped label as a callbr argument!", &CBI);
+    Check(i >= CBI.arg_size() || !isa<BasicBlock>(CBI.getOperand(i)),
+          "Using an unescaped label as a callbr argument!", &CBI);
     if (isa<BasicBlock>(CBI.getOperand(i)))
       for (unsigned j = i + 1; j != e; ++j)
-        Assert(CBI.getOperand(i) != CBI.getOperand(j),
-               "Duplicate callbr destination!", &CBI);
+        Check(CBI.getOperand(i) != CBI.getOperand(j),
+              "Duplicate callbr destination!", &CBI);
   }
   {
     SmallPtrSet<BasicBlock *, 4> ArgBBs;
@@ -2808,7 +2856,7 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
       if (auto *BA = dyn_cast<BlockAddress>(V))
        ArgBBs.insert(BA->getBasicBlock());
    for (BasicBlock *BB : CBI.getIndirectDests())
-      Assert(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI);
+      Check(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI);
  }

  verifyInlineAsmCall(CBI);
@@ -2816,12 +2864,12 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
 }
 
 void Verifier::visitSelectInst(SelectInst &SI) {
-  Assert(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1),
-                                         SI.getOperand(2)),
-         "Invalid operands for select instruction!", &SI);
+  Check(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1),
+                                        SI.getOperand(2)),
+        "Invalid operands for select instruction!", &SI);
 
-  Assert(SI.getTrueValue()->getType() == SI.getType(),
-         "Select values must have same type as select instruction!", &SI);
+  Check(SI.getTrueValue()->getType() == SI.getType(),
+        "Select values must have same type as select instruction!", &SI);
   visitInstruction(SI);
 }
 
@@ -2829,7 +2877,7 @@ void Verifier::visitSelectInst(SelectInst &SI) {
 /// a pass, if any exist, it's an error.
 ///
 void Verifier::visitUserOp1(Instruction &I) {
-  Assert(false, "User-defined operators should not live outside of a pass!", &I);
+  Check(false, "User-defined operators should not live outside of a pass!", &I);
 }
 
 void Verifier::visitTruncInst(TruncInst &I) {
@@ -2841,11 +2889,11 @@ void Verifier::visitTruncInst(TruncInst &I) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  Assert(SrcTy->isIntOrIntVectorTy(), "Trunc only operates on integer", &I);
-  Assert(DestTy->isIntOrIntVectorTy(), "Trunc only produces integer", &I);
-  Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(),
-         "trunc source and destination must both be a vector or neither", &I);
-  Assert(SrcBitSize > DestBitSize, "DestTy too big for Trunc", &I);
+  Check(SrcTy->isIntOrIntVectorTy(), "Trunc only operates on integer", &I);
+  Check(DestTy->isIntOrIntVectorTy(), "Trunc only produces integer", &I);
+  Check(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+        "trunc source and destination must both be a vector or neither", &I);
+  Check(SrcBitSize > DestBitSize, "DestTy too big for Trunc", &I);
 
   visitInstruction(I);
 }
@@ -2856,14 +2904,14 @@ void Verifier::visitZExtInst(ZExtInst &I) {
   Type *DestTy = I.getType();
 
   // Get the size of the types in bits, we'll need this later
-  Assert(SrcTy->isIntOrIntVectorTy(), "ZExt only operates on integer", &I);
-  Assert(DestTy->isIntOrIntVectorTy(), "ZExt only produces an integer", &I);
-  Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(),
-         "zext source and destination must both be a vector or neither", &I);
+  Check(SrcTy->isIntOrIntVectorTy(), "ZExt only operates on integer", &I);
+  Check(DestTy->isIntOrIntVectorTy(), "ZExt only produces an integer", &I);
+  Check(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+        "zext source and destination must both be a vector or neither", &I);
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  Assert(SrcBitSize < DestBitSize, "Type too small for ZExt", &I);
+  Check(SrcBitSize < DestBitSize, "Type too small for ZExt", &I);
 
   visitInstruction(I);
 }
@@ -2877,11 +2925,11 @@ void Verifier::visitSExtInst(SExtInst &I) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  Assert(SrcTy->isIntOrIntVectorTy(), "SExt only operates on integer", &I);
-  Assert(DestTy->isIntOrIntVectorTy(), "SExt only produces an integer", &I);
-  Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(),
-         "sext source and destination must both be a vector or neither", &I);
-  Assert(SrcBitSize < DestBitSize, "Type too small for SExt", &I);
+  Check(SrcTy->isIntOrIntVectorTy(), "SExt only operates on integer", &I);
+  Check(DestTy->isIntOrIntVectorTy(), "SExt only produces an integer", &I);
+  Check(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+        "sext source and destination must both be a vector or neither", &I);
+  Check(SrcBitSize < DestBitSize, "Type too small for SExt", &I);
 
   visitInstruction(I);
 }
@@ -2894,11 +2942,11 @@ void Verifier::visitFPTruncInst(FPTruncInst &I) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  Assert(SrcTy->isFPOrFPVectorTy(), "FPTrunc only operates on FP", &I);
-  Assert(DestTy->isFPOrFPVectorTy(), "FPTrunc only produces an FP", &I);
-  Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(),
-         "fptrunc source and destination must both be a vector or neither", &I);
-  Assert(SrcBitSize > DestBitSize, "DestTy too big for FPTrunc", &I);
+  Check(SrcTy->isFPOrFPVectorTy(), "FPTrunc only operates on FP", &I);
+  Check(DestTy->isFPOrFPVectorTy(), "FPTrunc only produces an FP", &I);
+  Check(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+        "fptrunc source and destination must both be a vector or neither", &I);
+  Check(SrcBitSize > DestBitSize, "DestTy too big for FPTrunc", &I);
 
   visitInstruction(I);
 }
@@ -2912,11 +2960,11 @@ void Verifier::visitFPExtInst(FPExtInst &I) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  Assert(SrcTy->isFPOrFPVectorTy(), "FPExt only operates on FP", &I);
-  Assert(DestTy->isFPOrFPVectorTy(), "FPExt only produces an FP", &I);
-  Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(),
-         "fpext source and destination must both be a vector or neither", &I);
-  Assert(SrcBitSize < DestBitSize, "DestTy too small for FPExt", &I);
+  Check(SrcTy->isFPOrFPVectorTy(), "FPExt only operates on FP", &I);
+  Check(DestTy->isFPOrFPVectorTy(), "FPExt only produces an FP", &I);
+  Check(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+        "fpext source and destination must both be a vector or neither", &I);
+  Check(SrcBitSize < DestBitSize, "DestTy too small for FPExt", &I);
 
   visitInstruction(I);
 }
@@ -2929,17 +2977,17 @@ void Verifier::visitUIToFPInst(UIToFPInst &I) {
   bool SrcVec = SrcTy->isVectorTy();
   bool DstVec = DestTy->isVectorTy();
 
-  Assert(SrcVec == DstVec,
-         "UIToFP source and dest must both be vector or scalar", &I);
-  Assert(SrcTy->isIntOrIntVectorTy(),
-         "UIToFP source must be integer or integer vector", &I);
-  Assert(DestTy->isFPOrFPVectorTy(), "UIToFP result must be FP or FP vector",
-         &I);
+  Check(SrcVec == DstVec,
+        "UIToFP source and dest must both be vector or scalar", &I);
+  Check(SrcTy->isIntOrIntVectorTy(),
+        "UIToFP source must be integer or integer vector", &I);
+  Check(DestTy->isFPOrFPVectorTy(), "UIToFP result must be FP or FP vector",
+        &I);
 
   if (SrcVec && DstVec)
-    Assert(cast<VectorType>(SrcTy)->getElementCount() ==
-               cast<VectorType>(DestTy)->getElementCount(),
-           "UIToFP source and dest vector length mismatch", &I);
+    Check(cast<VectorType>(SrcTy)->getElementCount() ==
+              cast<VectorType>(DestTy)->getElementCount(),
+          "UIToFP source and dest vector length mismatch", &I);
 
   visitInstruction(I);
 }
@@ -2952,17 +3000,17 @@ void Verifier::visitSIToFPInst(SIToFPInst &I) {
   bool SrcVec = SrcTy->isVectorTy();
   bool DstVec = DestTy->isVectorTy();
 
-  Assert(SrcVec == DstVec,
-         "SIToFP source and dest must both be vector or scalar", &I);
-  Assert(SrcTy->isIntOrIntVectorTy(),
-         "SIToFP source must be integer or integer vector", &I);
-  Assert(DestTy->isFPOrFPVectorTy(), "SIToFP result must be FP or FP vector",
-         &I);
+  Check(SrcVec == DstVec,
+        "SIToFP source and dest must both be vector or scalar", &I);
+  Check(SrcTy->isIntOrIntVectorTy(),
+        "SIToFP source must be integer or integer vector", &I);
+  Check(DestTy->isFPOrFPVectorTy(), "SIToFP result must be FP or FP vector",
+        &I);
 
   if (SrcVec && DstVec)
-    Assert(cast<VectorType>(SrcTy)->getElementCount() ==
-               cast<VectorType>(DestTy)->getElementCount(),
-           "SIToFP source and dest vector length mismatch", &I);
+    Check(cast<VectorType>(SrcTy)->getElementCount() ==
+              cast<VectorType>(DestTy)->getElementCount(),
+          "SIToFP source and dest vector length mismatch", &I);
 
   visitInstruction(I);
 }
@@ -2975,17 +3023,16 @@ void Verifier::visitFPToUIInst(FPToUIInst &I) {
   bool SrcVec = SrcTy->isVectorTy();
   bool DstVec = DestTy->isVectorTy();
 
-  Assert(SrcVec == DstVec,
-         "FPToUI source and dest must both be vector or scalar", &I);
Assert(SrcTy->isFPOrFPVectorTy(), "FPToUI source must be FP or FP vector", - &I); - Assert(DestTy->isIntOrIntVectorTy(), - "FPToUI result must be integer or integer vector", &I); + Check(SrcVec == DstVec, + "FPToUI source and dest must both be vector or scalar", &I); + Check(SrcTy->isFPOrFPVectorTy(), "FPToUI source must be FP or FP vector", &I); + Check(DestTy->isIntOrIntVectorTy(), + "FPToUI result must be integer or integer vector", &I); if (SrcVec && DstVec) - Assert(cast(SrcTy)->getElementCount() == - cast(DestTy)->getElementCount(), - "FPToUI source and dest vector length mismatch", &I); + Check(cast(SrcTy)->getElementCount() == + cast(DestTy)->getElementCount(), + "FPToUI source and dest vector length mismatch", &I); visitInstruction(I); } @@ -2998,17 +3045,16 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) { bool SrcVec = SrcTy->isVectorTy(); bool DstVec = DestTy->isVectorTy(); - Assert(SrcVec == DstVec, - "FPToSI source and dest must both be vector or scalar", &I); - Assert(SrcTy->isFPOrFPVectorTy(), "FPToSI source must be FP or FP vector", - &I); - Assert(DestTy->isIntOrIntVectorTy(), - "FPToSI result must be integer or integer vector", &I); + Check(SrcVec == DstVec, + "FPToSI source and dest must both be vector or scalar", &I); + Check(SrcTy->isFPOrFPVectorTy(), "FPToSI source must be FP or FP vector", &I); + Check(DestTy->isIntOrIntVectorTy(), + "FPToSI result must be integer or integer vector", &I); if (SrcVec && DstVec) - Assert(cast(SrcTy)->getElementCount() == - cast(DestTy)->getElementCount(), - "FPToSI source and dest vector length mismatch", &I); + Check(cast(SrcTy)->getElementCount() == + cast(DestTy)->getElementCount(), + "FPToSI source and dest vector length mismatch", &I); visitInstruction(I); } @@ -3018,17 +3064,17 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { Type *SrcTy = I.getOperand(0)->getType(); Type *DestTy = I.getType(); - Assert(SrcTy->isPtrOrPtrVectorTy(), "PtrToInt source must be pointer", &I); + Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToInt source must be pointer", &I); - Assert(DestTy->isIntOrIntVectorTy(), "PtrToInt result must be integral", &I); - Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToInt type mismatch", - &I); + Check(DestTy->isIntOrIntVectorTy(), "PtrToInt result must be integral", &I); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToInt type mismatch", + &I); if (SrcTy->isVectorTy()) { auto *VSrc = cast(SrcTy); auto *VDest = cast(DestTy); - Assert(VSrc->getElementCount() == VDest->getElementCount(), - "PtrToInt Vector width mismatch", &I); + Check(VSrc->getElementCount() == VDest->getElementCount(), + "PtrToInt Vector width mismatch", &I); } visitInstruction(I); @@ -3039,23 +3085,22 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { Type *SrcTy = I.getOperand(0)->getType(); Type *DestTy = I.getType(); - Assert(SrcTy->isIntOrIntVectorTy(), - "IntToPtr source must be an integral", &I); - Assert(DestTy->isPtrOrPtrVectorTy(), "IntToPtr result must be a pointer", &I); + Check(SrcTy->isIntOrIntVectorTy(), "IntToPtr source must be an integral", &I); + Check(DestTy->isPtrOrPtrVectorTy(), "IntToPtr result must be a pointer", &I); - Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(), "IntToPtr type mismatch", - &I); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "IntToPtr type mismatch", + &I); if (SrcTy->isVectorTy()) { auto *VSrc = cast(SrcTy); auto *VDest = cast(DestTy); - Assert(VSrc->getElementCount() == VDest->getElementCount(), - "IntToPtr Vector width mismatch", &I); + Check(VSrc->getElementCount() == 
+          "IntToPtr Vector width mismatch", &I);
   }
 
   visitInstruction(I);
 }
 
 void Verifier::visitBitCastInst(BitCastInst &I) {
-  Assert(
+  Check(
       CastInst::castIsValid(Instruction::BitCast, I.getOperand(0), I.getType()),
       "Invalid bitcast", &I);
   visitInstruction(I);
@@ -3065,16 +3110,16 @@ void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   Type *SrcTy = I.getOperand(0)->getType();
   Type *DestTy = I.getType();
 
-  Assert(SrcTy->isPtrOrPtrVectorTy(), "AddrSpaceCast source must be a pointer",
-         &I);
-  Assert(DestTy->isPtrOrPtrVectorTy(), "AddrSpaceCast result must be a pointer",
-         &I);
-  Assert(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
-         "AddrSpaceCast must be between different address spaces", &I);
+  Check(SrcTy->isPtrOrPtrVectorTy(), "AddrSpaceCast source must be a pointer",
+        &I);
+  Check(DestTy->isPtrOrPtrVectorTy(), "AddrSpaceCast result must be a pointer",
+        &I);
+  Check(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
+        "AddrSpaceCast must be between different address spaces", &I);
   if (auto *SrcVTy = dyn_cast<VectorType>(SrcTy))
-    Assert(SrcVTy->getElementCount() ==
-               cast<VectorType>(DestTy)->getElementCount(),
-           "AddrSpaceCast vector pointer number of elements mismatch", &I);
+    Check(SrcVTy->getElementCount() ==
+              cast<VectorType>(DestTy)->getElementCount(),
+          "AddrSpaceCast vector pointer number of elements mismatch", &I);
   visitInstruction(I);
 }
 
@@ -3085,18 +3130,18 @@ void Verifier::visitPHINode(PHINode &PN) {
   // This can be tested by checking whether the instruction before this is
   // either nonexistent (because this is begin()) or is a PHI node. If not,
   // then there is some other instruction before a PHI.
-  Assert(&PN == &PN.getParent()->front() ||
-             isa<PHINode>(--BasicBlock::iterator(&PN)),
-         "PHI nodes not grouped at top of basic block!", &PN, PN.getParent());
+  Check(&PN == &PN.getParent()->front() ||
+            isa<PHINode>(--BasicBlock::iterator(&PN)),
+        "PHI nodes not grouped at top of basic block!", &PN, PN.getParent());
 
   // Check that a PHI doesn't yield a Token.
-  Assert(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!");
+  Check(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!");
 
   // Check that all of the values of the PHI node have the same type as the
   // result, and that the incoming blocks are really basic blocks.
   for (Value *IncValue : PN.incoming_values()) {
-    Assert(PN.getType() == IncValue->getType(),
-           "PHI node operands are not the same type as the result!", &PN);
+    Check(PN.getType() == IncValue->getType(),
+          "PHI node operands are not the same type as the result!", &PN);
   }
 
   // All other PHI node constraints are checked in the visitBasicBlock method.
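The PHI rules enforced in the earlier visitBasicBlock hunks reduce to a sorted-pair comparison: one entry per predecessor, and a repeated predecessor may only carry an identical incoming value. A standalone sketch under that reading, with block names modeling BasicBlock* identity:

    // Sketch of the PHI/predecessor consistency rules.
    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    using Entry = std::pair<std::string, int>; // (pred block, incoming value)

    bool verifyPhiEntries(std::vector<Entry> Values,
                          std::vector<std::string> Preds) {
      if (Values.size() != Preds.size())
        return false; // one entry per predecessor
      std::sort(Values.begin(), Values.end());
      std::sort(Preds.begin(), Preds.end());
      for (size_t i = 0; i != Values.size(); ++i) {
        if (i != 0 && Values[i].first == Values[i - 1].first &&
            Values[i].second != Values[i - 1].second)
          return false; // same block, different incoming values
        if (Values[i].first != Preds[i])
          return false; // entries do not match predecessors
      }
      return true;
    }

    int main() {
      assert(verifyPhiEntries({{"a", 1}, {"b", 2}}, {"b", "a"}));
      assert(verifyPhiEntries({{"a", 1}, {"a", 1}}, {"a", "a"}));
      assert(!verifyPhiEntries({{"a", 1}, {"a", 2}}, {"a", "a"}));
    }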
@@ -3105,54 +3150,68 @@ void Verifier::visitPHINode(PHINode &PN) {
 }
 
 void Verifier::visitCallBase(CallBase &Call) {
-  Assert(Call.getCalledOperand()->getType()->isPointerTy(),
-         "Called function must be a pointer!", Call);
+  Check(Call.getCalledOperand()->getType()->isPointerTy(),
+        "Called function must be a pointer!", Call);
   PointerType *FPTy = cast<PointerType>(Call.getCalledOperand()->getType());
 
-  Assert(FPTy->isOpaqueOrPointeeTypeMatches(Call.getFunctionType()),
-         "Called function is not the same type as the call!", Call);
+  Check(FPTy->isOpaqueOrPointeeTypeMatches(Call.getFunctionType()),
+        "Called function is not the same type as the call!", Call);
 
   FunctionType *FTy = Call.getFunctionType();
 
   // Verify that the correct number of arguments are being passed
   if (FTy->isVarArg())
-    Assert(Call.arg_size() >= FTy->getNumParams(),
-           "Called function requires more parameters than were provided!",
-           Call);
+    Check(Call.arg_size() >= FTy->getNumParams(),
+          "Called function requires more parameters than were provided!", Call);
   else
-    Assert(Call.arg_size() == FTy->getNumParams(),
-           "Incorrect number of arguments passed to called function!", Call);
+    Check(Call.arg_size() == FTy->getNumParams(),
+          "Incorrect number of arguments passed to called function!", Call);
 
   // Verify that all arguments to the call match the function type.
   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
-    Assert(Call.getArgOperand(i)->getType() == FTy->getParamType(i),
-           "Call parameter type does not match function signature!",
-           Call.getArgOperand(i), FTy->getParamType(i), Call);
+    Check(Call.getArgOperand(i)->getType() == FTy->getParamType(i),
+          "Call parameter type does not match function signature!",
+          Call.getArgOperand(i), FTy->getParamType(i), Call);
 
   AttributeList Attrs = Call.getAttributes();
-  Assert(verifyAttributeCount(Attrs, Call.arg_size()),
-         "Attribute after last parameter!", Call);
+  Check(verifyAttributeCount(Attrs, Call.arg_size()),
+        "Attribute after last parameter!", Call);
+
+  auto VerifyTypeAlign = [&](Type *Ty, const Twine &Message) {
+    if (!Ty->isSized())
+      return;
+    Align ABIAlign = DL.getABITypeAlign(Ty);
+    Align MaxAlign(ParamMaxAlignment);
+    Check(ABIAlign <= MaxAlign,
+          "Incorrect alignment of " + Message + " to called function!", Call);
+  };
+
+  VerifyTypeAlign(FTy->getReturnType(), "return type");
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+    Type *Ty = FTy->getParamType(i);
+    VerifyTypeAlign(Ty, "argument passed");
+  }
 
   Function *Callee =
       dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
   bool IsIntrinsic = Callee && Callee->isIntrinsic();
   if (IsIntrinsic)
-    Assert(Callee->getValueType() == FTy,
-           "Intrinsic called with incompatible signature", Call);
+    Check(Callee->getValueType() == FTy,
+          "Intrinsic called with incompatible signature", Call);
 
   if (Attrs.hasFnAttr(Attribute::Speculatable)) {
     // Don't allow speculatable on call sites, unless the underlying function
     // declaration is also speculatable.
-    Assert(Callee && Callee->isSpeculatable(),
-           "speculatable attribute may not apply to call sites", Call);
+    Check(Callee && Callee->isSpeculatable(),
+          "speculatable attribute may not apply to call sites", Call);
   }
 
   if (Attrs.hasFnAttr(Attribute::Preallocated)) {
-    Assert(Call.getCalledFunction()->getIntrinsicID() ==
-               Intrinsic::call_preallocated_arg,
-           "preallocated as a call site attribute can only be on "
-           "llvm.call.preallocated.arg");
+    Check(Call.getCalledFunction()->getIntrinsicID() ==
+              Intrinsic::call_preallocated_arg,
+          "preallocated as a call site attribute can only be on "
+          "llvm.call.preallocated.arg");
   }
 
   // Verify call attributes.
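VerifyTypeAlign above is new in this hunk: it rejects calls whose sized argument or return types demand more ABI alignment than the verifier's ParamMaxAlignment cap. A hedged sketch of the rule; the cap value and alignment inputs here are illustrative stand-ins, not the constant and DataLayout query used in the real file:

    // Sketch of the per-call ABI alignment cap.
    #include <cassert>
    #include <cstdint>

    constexpr uint64_t ParamMaxAlignment = 1 << 14; // illustrative cap only

    bool checkTypeAlign(bool IsSized, uint64_t ABIAlign) {
      if (!IsSized)
        return true; // unsized types are skipped
      return ABIAlign <= ParamMaxAlignment;
    }

    int main() {
      assert(checkTypeAlign(true, 16));
      assert(checkTypeAlign(false, 0));
      assert(!checkTypeAlign(true, uint64_t(1) << 20)); // over-aligned
    }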
@@ -3164,8 +3223,8 @@ void Verifier::visitCallBase(CallBase &Call) {
   if (Call.hasInAllocaArgument()) {
     Value *InAllocaArg = Call.getArgOperand(FTy->getNumParams() - 1);
     if (auto AI = dyn_cast<AllocaInst>(InAllocaArg->stripInBoundsOffsets()))
-      Assert(AI->isUsedWithInAlloca(),
-             "inalloca argument for call has mismatched alloca", AI, Call);
+      Check(AI->isUsedWithInAlloca(),
+            "inalloca argument for call has mismatched alloca", AI, Call);
   }
 
   // For each argument of the callsite, if it has the swifterror argument,
@@ -3175,31 +3234,30 @@ void Verifier::visitCallBase(CallBase &Call) {
     if (Call.paramHasAttr(i, Attribute::SwiftError)) {
       Value *SwiftErrorArg = Call.getArgOperand(i);
       if (auto AI = dyn_cast<AllocaInst>(SwiftErrorArg->stripInBoundsOffsets())) {
-        Assert(AI->isSwiftError(),
-               "swifterror argument for call has mismatched alloca", AI, Call);
+        Check(AI->isSwiftError(),
+              "swifterror argument for call has mismatched alloca", AI, Call);
         continue;
       }
       auto ArgI = dyn_cast<Argument>(SwiftErrorArg);
-      Assert(ArgI,
-             "swifterror argument should come from an alloca or parameter",
-             SwiftErrorArg, Call);
-      Assert(ArgI->hasSwiftErrorAttr(),
-             "swifterror argument for call has mismatched parameter", ArgI,
-             Call);
+      Check(ArgI, "swifterror argument should come from an alloca or parameter",
+            SwiftErrorArg, Call);
+      Check(ArgI->hasSwiftErrorAttr(),
+            "swifterror argument for call has mismatched parameter", ArgI,
+            Call);
     }
 
     if (Attrs.hasParamAttr(i, Attribute::ImmArg)) {
       // Don't allow immarg on call sites, unless the underlying declaration
       // also has the matching immarg.
-      Assert(Callee && Callee->hasParamAttribute(i, Attribute::ImmArg),
-             "immarg may not apply only to call sites",
-             Call.getArgOperand(i), Call);
+      Check(Callee && Callee->hasParamAttribute(i, Attribute::ImmArg),
+            "immarg may not apply only to call sites", Call.getArgOperand(i),
+            Call);
     }
 
     if (Call.paramHasAttr(i, Attribute::ImmArg)) {
       Value *ArgVal = Call.getArgOperand(i);
-      Assert(isa<ConstantInt>(ArgVal) || isa<ConstantFP>(ArgVal),
-             "immarg operand has non-immediate parameter", ArgVal, Call);
+      Check(isa<ConstantInt>(ArgVal) || isa<ConstantFP>(ArgVal),
+            "immarg operand has non-immediate parameter", ArgVal, Call);
     }
 
     if (Call.paramHasAttr(i, Attribute::Preallocated)) {
@@ -3207,10 +3265,10 @@ void Verifier::visitCallBase(CallBase &Call) {
       bool hasOB =
           Call.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0;
       bool isMustTail = Call.isMustTailCall();
-      Assert(hasOB != isMustTail,
-             "preallocated operand either requires a preallocated bundle or "
-             "the call to be musttail (but not both)",
-             ArgVal, Call);
+      Check(hasOB != isMustTail,
+            "preallocated operand either requires a preallocated bundle or "
+            "the call to be musttail (but not both)",
+            ArgVal, Call);
     }
   }
 
@@ -3233,17 +3291,17 @@ void Verifier::visitCallBase(CallBase &Call) {
       verifyParameterAttrs(ArgAttrs, Ty, &Call);
 
       if (ArgAttrs.hasAttribute(Attribute::Nest)) {
-        Assert(!SawNest, "More than one parameter has attribute nest!", Call);
+        Check(!SawNest, "More than one parameter has attribute nest!", Call);
         SawNest = true;
       }
 
       if (ArgAttrs.hasAttribute(Attribute::Returned)) {
-        Assert(!SawReturned, "More than one parameter has attribute returned!",
-               Call);
-        Assert(Ty->canLosslesslyBitCastTo(FTy->getReturnType()),
-               "Incompatible argument and return types for 'returned' "
-               "attribute",
-               Call);
+        Check(!SawReturned, "More than one parameter has attribute returned!",
+              Call);
+        Check(Ty->canLosslesslyBitCastTo(FTy->getReturnType()),
+              "Incompatible argument and return types for 'returned' "
+              "attribute",
+              Call);
         SawReturned = true;
       }
 
@@ -3252,32 +3310,32 @@ void Verifier::visitCallBase(CallBase &Call) {
       if (!Call.getCalledFunction() ||
           Call.getCalledFunction()->getIntrinsicID() !=
              Intrinsic::experimental_gc_statepoint)
-        Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
-               "Attribute 'sret' cannot be used for vararg call arguments!",
-               Call);
+        Check(!ArgAttrs.hasAttribute(Attribute::StructRet),
+              "Attribute 'sret' cannot be used for vararg call arguments!",
+              Call);
 
       if (ArgAttrs.hasAttribute(Attribute::InAlloca))
-        Assert(Idx == Call.arg_size() - 1,
-               "inalloca isn't on the last argument!", Call);
+        Check(Idx == Call.arg_size() - 1,
+              "inalloca isn't on the last argument!", Call);
     }
   }
 
   // Verify that there's no metadata unless it's a direct call to an intrinsic.
   if (!IsIntrinsic) {
     for (Type *ParamTy : FTy->params()) {
-      Assert(!ParamTy->isMetadataTy(),
-             "Function has metadata parameter but isn't an intrinsic", Call);
-      Assert(!ParamTy->isTokenTy(),
-             "Function has token parameter but isn't an intrinsic", Call);
+      Check(!ParamTy->isMetadataTy(),
+            "Function has metadata parameter but isn't an intrinsic", Call);
+      Check(!ParamTy->isTokenTy(),
+            "Function has token parameter but isn't an intrinsic", Call);
     }
   }
 
   // Verify that indirect calls don't return tokens.
   if (!Call.getCalledFunction()) {
-    Assert(!FTy->getReturnType()->isTokenTy(),
-           "Return type cannot be token for indirect call!");
-    Assert(!FTy->getReturnType()->isX86_AMXTy(),
-           "Return type cannot be x86_amx for indirect call!");
+    Check(!FTy->getReturnType()->isTokenTy(),
+          "Return type cannot be token for indirect call!");
+    Check(!FTy->getReturnType()->isX86_AMXTy(),
+          "Return type cannot be x86_amx for indirect call!");
   }
 
   if (Function *F = Call.getCalledFunction())
@@ -3285,69 +3343,83 @@ void Verifier::visitCallBase(CallBase &Call) {
       visitIntrinsicCall(ID, Call);
 
   // Verify that a callsite has at most one "deopt", at most one "funclet", at
-  // most one "gc-transition", at most one "cfguardtarget",
-  // and at most one "preallocated" operand bundle.
+  // most one "gc-transition", at most one "cfguardtarget", at most one
+  // "preallocated" operand bundle, and at most one "ptrauth" operand bundle.
   bool FoundDeoptBundle = false, FoundFuncletBundle = false,
        FoundGCTransitionBundle = false, FoundCFGuardTargetBundle = false,
       FoundPreallocatedBundle = false, FoundGCLiveBundle = false,
+       FoundPtrauthBundle = false,
       FoundAttachedCallBundle = false;
   for (unsigned i = 0, e = Call.getNumOperandBundles(); i < e; ++i) {
     OperandBundleUse BU = Call.getOperandBundleAt(i);
     uint32_t Tag = BU.getTagID();
     if (Tag == LLVMContext::OB_deopt) {
-      Assert(!FoundDeoptBundle, "Multiple deopt operand bundles", Call);
+      Check(!FoundDeoptBundle, "Multiple deopt operand bundles", Call);
       FoundDeoptBundle = true;
     } else if (Tag == LLVMContext::OB_gc_transition) {
-      Assert(!FoundGCTransitionBundle, "Multiple gc-transition operand bundles",
-             Call);
+      Check(!FoundGCTransitionBundle, "Multiple gc-transition operand bundles",
+            Call);
      FoundGCTransitionBundle = true;
    } else if (Tag == LLVMContext::OB_funclet) {
-      Assert(!FoundFuncletBundle, "Multiple funclet operand bundles", Call);
+      Check(!FoundFuncletBundle, "Multiple funclet operand bundles", Call);
      FoundFuncletBundle = true;
-      Assert(BU.Inputs.size() == 1,
-             "Expected exactly one funclet bundle operand", Call);
-      Assert(isa<FuncletPadInst>(BU.Inputs.front()),
-             "Funclet bundle operands should correspond to a FuncletPadInst",
-             Call);
+      Check(BU.Inputs.size() == 1,
+            "Expected exactly one funclet bundle operand", Call);
+      Check(isa<FuncletPadInst>(BU.Inputs.front()),
+            "Funclet bundle operands should correspond to a FuncletPadInst",
+            Call);
    } else if (Tag == LLVMContext::OB_cfguardtarget) {
-      Assert(!FoundCFGuardTargetBundle,
-             "Multiple CFGuardTarget operand bundles", Call);
+      Check(!FoundCFGuardTargetBundle, "Multiple CFGuardTarget operand bundles",
+            Call);
      FoundCFGuardTargetBundle = true;
-      Assert(BU.Inputs.size() == 1,
-             "Expected exactly one cfguardtarget bundle operand", Call);
+      Check(BU.Inputs.size() == 1,
+            "Expected exactly one cfguardtarget bundle operand", Call);
+    } else if (Tag == LLVMContext::OB_ptrauth) {
+      Check(!FoundPtrauthBundle, "Multiple ptrauth operand bundles", Call);
+      FoundPtrauthBundle = true;
+      Check(BU.Inputs.size() == 2,
+            "Expected exactly two ptrauth bundle operands", Call);
+      Check(isa<ConstantInt>(BU.Inputs[0]) &&
+                BU.Inputs[0]->getType()->isIntegerTy(32),
+            "Ptrauth bundle key operand must be an i32 constant", Call);
+      Check(BU.Inputs[1]->getType()->isIntegerTy(64),
+            "Ptrauth bundle discriminator operand must be an i64", Call);
    } else if (Tag == LLVMContext::OB_preallocated) {
-      Assert(!FoundPreallocatedBundle, "Multiple preallocated operand bundles",
-             Call);
+      Check(!FoundPreallocatedBundle, "Multiple preallocated operand bundles",
+            Call);
      FoundPreallocatedBundle = true;
-      Assert(BU.Inputs.size() == 1,
-             "Expected exactly one preallocated bundle operand", Call);
+      Check(BU.Inputs.size() == 1,
+            "Expected exactly one preallocated bundle operand", Call);
      auto Input = dyn_cast<IntrinsicInst>(BU.Inputs.front());
-      Assert(Input &&
-                 Input->getIntrinsicID() == Intrinsic::call_preallocated_setup,
-             "\"preallocated\" argument must be a token from "
-             "llvm.call.preallocated.setup",
-             Call);
+      Check(Input &&
+                Input->getIntrinsicID() == Intrinsic::call_preallocated_setup,
+            "\"preallocated\" argument must be a token from "
+            "llvm.call.preallocated.setup",
+            Call);
    } else if (Tag == LLVMContext::OB_gc_live) {
-      Assert(!FoundGCLiveBundle, "Multiple gc-live operand bundles",
-             Call);
+      Check(!FoundGCLiveBundle, "Multiple gc-live operand bundles", Call);
      FoundGCLiveBundle = true;
    } else if (Tag == LLVMContext::OB_clang_arc_attachedcall) {
-      Assert(!FoundAttachedCallBundle,
-             "Multiple \"clang.arc.attachedcall\" operand bundles", Call);
+      Check(!FoundAttachedCallBundle,
+            "Multiple \"clang.arc.attachedcall\" operand bundles", Call);
      FoundAttachedCallBundle = true;
      verifyAttachedCallBundle(Call, BU);
    }
  }
 
+  // Verify that callee and callsite agree on whether to use pointer auth.
+  Check(!(Call.getCalledFunction() && FoundPtrauthBundle),
+        "Direct call cannot have a ptrauth bundle", Call);
+
   // Verify that each inlinable callsite of a debug-info-bearing function in a
   // debug-info-bearing function has a debug location attached to it. Failure to
   // do so causes assertion failures when the inliner sets up inline scope info.
   if (Call.getFunction()->getSubprogram() && Call.getCalledFunction() &&
       Call.getCalledFunction()->getSubprogram())
-    AssertDI(Call.getDebugLoc(),
-             "inlinable function call in a function with "
-             "debug info must have a !dbg location",
-             Call);
+    CheckDI(Call.getDebugLoc(),
+            "inlinable function call in a function with "
+            "debug info must have a !dbg location",
+            Call);
 
   if (Call.isInlineAsm())
     verifyInlineAsmCall(Call);
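The new "ptrauth" bundle verification above pins down the bundle's shape: exactly two inputs, an i32 constant key and an i64 discriminator, and never on a direct call. A sketch with a simplified operand model standing in for llvm::Value:

    // Sketch of the ptrauth operand-bundle shape checks.
    #include <cassert>
    #include <vector>

    struct OperandModel {
      unsigned BitWidth; // integer width of the operand
      bool IsConstant;   // is it a constant integer?
    };

    bool verifyPtrauthBundle(const std::vector<OperandModel> &Inputs,
                             bool IsDirectCall) {
      if (IsDirectCall)
        return false; // direct calls cannot carry a ptrauth bundle
      if (Inputs.size() != 2)
        return false; // exactly two operands
      if (!Inputs[0].IsConstant || Inputs[0].BitWidth != 32)
        return false; // key: i32 constant
      return Inputs[1].BitWidth == 64; // discriminator: any i64 value
    }

    int main() {
      assert(verifyPtrauthBundle({{32, true}, {64, false}}, false));
      assert(!verifyPtrauthBundle({{32, false}, {64, false}}, false));
      assert(!verifyPtrauthBundle({{32, true}}, false));
      assert(!verifyPtrauthBundle({{32, true}, {64, false}}, true));
    }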
@@ -3357,16 +3429,16 @@ void Verifier::visitCallBase(CallBase &Call) {
 
 void Verifier::verifyTailCCMustTailAttrs(const AttrBuilder &Attrs,
                                          StringRef Context) {
-  Assert(!Attrs.contains(Attribute::InAlloca),
-         Twine("inalloca attribute not allowed in ") + Context);
-  Assert(!Attrs.contains(Attribute::InReg),
-         Twine("inreg attribute not allowed in ") + Context);
-  Assert(!Attrs.contains(Attribute::SwiftError),
-         Twine("swifterror attribute not allowed in ") + Context);
-  Assert(!Attrs.contains(Attribute::Preallocated),
-         Twine("preallocated attribute not allowed in ") + Context);
-  Assert(!Attrs.contains(Attribute::ByRef),
-         Twine("byref attribute not allowed in ") + Context);
+  Check(!Attrs.contains(Attribute::InAlloca),
+        Twine("inalloca attribute not allowed in ") + Context);
+  Check(!Attrs.contains(Attribute::InReg),
+        Twine("inreg attribute not allowed in ") + Context);
+  Check(!Attrs.contains(Attribute::SwiftError),
+        Twine("swifterror attribute not allowed in ") + Context);
+  Check(!Attrs.contains(Attribute::Preallocated),
+        Twine("preallocated attribute not allowed in ") + Context);
+  Check(!Attrs.contains(Attribute::ByRef),
+        Twine("byref attribute not allowed in ") + Context);
 }
 
 /// Two types are "congruent" if they are identical, or if they are both pointer
@@ -3403,19 +3475,19 @@ static AttrBuilder getParameterABIAttributes(LLVMContext& C, unsigned I, Attribu
 }
 
 void Verifier::verifyMustTailCall(CallInst &CI) {
-  Assert(!CI.isInlineAsm(), "cannot use musttail call with inline asm", &CI);
call with inline asm", &CI); Function *F = CI.getParent()->getParent(); FunctionType *CallerTy = F->getFunctionType(); FunctionType *CalleeTy = CI.getFunctionType(); - Assert(CallerTy->isVarArg() == CalleeTy->isVarArg(), - "cannot guarantee tail call due to mismatched varargs", &CI); - Assert(isTypeCongruent(CallerTy->getReturnType(), CalleeTy->getReturnType()), - "cannot guarantee tail call due to mismatched return types", &CI); + Check(CallerTy->isVarArg() == CalleeTy->isVarArg(), + "cannot guarantee tail call due to mismatched varargs", &CI); + Check(isTypeCongruent(CallerTy->getReturnType(), CalleeTy->getReturnType()), + "cannot guarantee tail call due to mismatched return types", &CI); // - The calling conventions of the caller and callee must match. - Assert(F->getCallingConv() == CI.getCallingConv(), - "cannot guarantee tail call due to mismatched calling conv", &CI); + Check(F->getCallingConv() == CI.getCallingConv(), + "cannot guarantee tail call due to mismatched calling conv", &CI); // - The call must immediately precede a :ref:`ret ` instruction, // or a pointer bitcast followed by a ret instruction. @@ -3426,19 +3498,18 @@ void Verifier::verifyMustTailCall(CallInst &CI) { // Handle the optional bitcast. if (BitCastInst *BI = dyn_cast_or_null(Next)) { - Assert(BI->getOperand(0) == RetVal, - "bitcast following musttail call must use the call", BI); + Check(BI->getOperand(0) == RetVal, + "bitcast following musttail call must use the call", BI); RetVal = BI; Next = BI->getNextNode(); } // Check the return. ReturnInst *Ret = dyn_cast_or_null(Next); - Assert(Ret, "musttail call must precede a ret with an optional bitcast", - &CI); - Assert(!Ret->getReturnValue() || Ret->getReturnValue() == RetVal || - isa(Ret->getReturnValue()), - "musttail call result must be returned", Ret); + Check(Ret, "musttail call must precede a ret with an optional bitcast", &CI); + Check(!Ret->getReturnValue() || Ret->getReturnValue() == RetVal || + isa(Ret->getReturnValue()), + "musttail call result must be returned", Ret); AttributeList CallerAttrs = F->getAttributes(); AttributeList CalleeAttrs = CI.getAttributes(); @@ -3460,8 +3531,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) { verifyTailCCMustTailAttrs(ABIAttrs, Context); } // - Varargs functions are not allowed - Assert(!CallerTy->isVarArg(), Twine("cannot guarantee ") + CCName + - " tail call for varargs function"); + Check(!CallerTy->isVarArg(), Twine("cannot guarantee ") + CCName + + " tail call for varargs function"); return; } @@ -3469,11 +3540,10 @@ void Verifier::verifyMustTailCall(CallInst &CI) { // parameters or return types may differ in pointee type, but not // address space. 
if (!CI.getCalledFunction() || !CI.getCalledFunction()->isIntrinsic()) { - Assert(CallerTy->getNumParams() == CalleeTy->getNumParams(), - "cannot guarantee tail call due to mismatched parameter counts", - &CI); + Check(CallerTy->getNumParams() == CalleeTy->getNumParams(), + "cannot guarantee tail call due to mismatched parameter counts", &CI); for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) { - Assert( + Check( isTypeCongruent(CallerTy->getParamType(I), CalleeTy->getParamType(I)), "cannot guarantee tail call due to mismatched parameter types", &CI); } @@ -3484,10 +3554,10 @@ void Verifier::verifyMustTailCall(CallInst &CI) { for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) { AttrBuilder CallerABIAttrs = getParameterABIAttributes(F->getContext(), I, CallerAttrs); AttrBuilder CalleeABIAttrs = getParameterABIAttributes(F->getContext(), I, CalleeAttrs); - Assert(CallerABIAttrs == CalleeABIAttrs, - "cannot guarantee tail call due to mismatched ABI impacting " - "function attributes", - &CI, CI.getOperand(I)); + Check(CallerABIAttrs == CalleeABIAttrs, + "cannot guarantee tail call due to mismatched ABI impacting " + "function attributes", + &CI, CI.getOperand(I)); } } @@ -3503,7 +3573,7 @@ void Verifier::visitInvokeInst(InvokeInst &II) { // Verify that the first non-PHI instruction of the unwind destination is an // exception handling instruction. - Assert( + Check( II.getUnwindDest()->isEHPad(), "The unwind destination does not have an exception handling instruction!", &II); @@ -3514,17 +3584,17 @@ void Verifier::visitInvokeInst(InvokeInst &II) { /// visitUnaryOperator - Check the argument to the unary operator. /// void Verifier::visitUnaryOperator(UnaryOperator &U) { - Assert(U.getType() == U.getOperand(0)->getType(), - "Unary operators must have same type for" - "operands and result!", - &U); + Check(U.getType() == U.getOperand(0)->getType(), + "Unary operators must have same type for" + "operands and result!", + &U); switch (U.getOpcode()) { // Check that floating-point arithmetic operators are only used with // floating-point operands. case Instruction::FNeg: - Assert(U.getType()->isFPOrFPVectorTy(), - "FNeg operator only works with float types!", &U); + Check(U.getType()->isFPOrFPVectorTy(), + "FNeg operator only works with float types!", &U); break; default: llvm_unreachable("Unknown UnaryOperator opcode!"); @@ -3537,8 +3607,8 @@ void Verifier::visitUnaryOperator(UnaryOperator &U) { /// of the same type! 
/// void Verifier::visitBinaryOperator(BinaryOperator &B) { - Assert(B.getOperand(0)->getType() == B.getOperand(1)->getType(), - "Both operands to a binary operator are not of the same type!", &B); + Check(B.getOperand(0)->getType() == B.getOperand(1)->getType(), + "Both operands to a binary operator are not of the same type!", &B); switch (B.getOpcode()) { // Check that integer arithmetic operators are only used with @@ -3550,12 +3620,12 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) { case Instruction::UDiv: case Instruction::SRem: case Instruction::URem: - Assert(B.getType()->isIntOrIntVectorTy(), - "Integer arithmetic operators only work with integral types!", &B); - Assert(B.getType() == B.getOperand(0)->getType(), - "Integer arithmetic operators must have same type " - "for operands and result!", - &B); + Check(B.getType()->isIntOrIntVectorTy(), + "Integer arithmetic operators only work with integral types!", &B); + Check(B.getType() == B.getOperand(0)->getType(), + "Integer arithmetic operators must have same type " + "for operands and result!", + &B); break; // Check that floating-point arithmetic operators are only used with // floating-point operands. @@ -3564,32 +3634,31 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) { case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: - Assert(B.getType()->isFPOrFPVectorTy(), - "Floating-point arithmetic operators only work with " - "floating-point types!", - &B); - Assert(B.getType() == B.getOperand(0)->getType(), - "Floating-point arithmetic operators must have same type " - "for operands and result!", - &B); + Check(B.getType()->isFPOrFPVectorTy(), + "Floating-point arithmetic operators only work with " + "floating-point types!", + &B); + Check(B.getType() == B.getOperand(0)->getType(), + "Floating-point arithmetic operators must have same type " + "for operands and result!", + &B); break; // Check that logical operators are only used with integral operands. 
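// A minimal standalone sketch, not the LLVM API, of the operand-type rules
// visitBinaryOperator enforces around here: both operands must match the
// result type, floating-point opcodes require FP types, and the integer,
// logical, and shift opcodes require integral types. The Opcode enum and
// the Ty flag pair are hypothetical stand-ins for LLVM's type queries.
#include <cassert>

enum class Opcode { Add, FAdd, And, Shl };
struct Ty { bool IsInt; bool IsFP; };

static bool sameTy(Ty A, Ty B) { return A.IsInt == B.IsInt && A.IsFP == B.IsFP; }

static bool binOpTypesOk(Opcode Op, Ty ResultTy, Ty Op0Ty, Ty Op1Ty) {
  // Operands must match each other and the result type.
  if (!sameTy(Op0Ty, Op1Ty) || !sameTy(Op0Ty, ResultTy))
    return false;
  // FP arithmetic needs FP types; everything else here needs integral types.
  return Op == Opcode::FAdd ? ResultTy.IsFP : ResultTy.IsInt;
}

int main() {
  Ty I32{true, false}, F32{false, true};
  assert(binOpTypesOk(Opcode::Add, I32, I32, I32));
  assert(binOpTypesOk(Opcode::FAdd, F32, F32, F32));
  assert(!binOpTypesOk(Opcode::FAdd, I32, I32, I32)); // FP op on integers
}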
case Instruction::And: case Instruction::Or: case Instruction::Xor: - Assert(B.getType()->isIntOrIntVectorTy(), - "Logical operators only work with integral types!", &B); - Assert(B.getType() == B.getOperand(0)->getType(), - "Logical operators must have same type for operands and result!", - &B); + Check(B.getType()->isIntOrIntVectorTy(), + "Logical operators only work with integral types!", &B); + Check(B.getType() == B.getOperand(0)->getType(), + "Logical operators must have same type for operands and result!", &B); break; case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - Assert(B.getType()->isIntOrIntVectorTy(), - "Shifts only work with integral types!", &B); - Assert(B.getType() == B.getOperand(0)->getType(), - "Shift return type must be same as operands!", &B); + Check(B.getType()->isIntOrIntVectorTy(), + "Shifts only work with integral types!", &B); + Check(B.getType() == B.getOperand(0)->getType(), + "Shift return type must be same as operands!", &B); break; default: llvm_unreachable("Unknown BinaryOperator opcode!"); @@ -3602,14 +3671,13 @@ void Verifier::visitICmpInst(ICmpInst &IC) { // Check that the operands are the same type Type *Op0Ty = IC.getOperand(0)->getType(); Type *Op1Ty = IC.getOperand(1)->getType(); - Assert(Op0Ty == Op1Ty, - "Both operands to ICmp instruction are not of the same type!", &IC); + Check(Op0Ty == Op1Ty, + "Both operands to ICmp instruction are not of the same type!", &IC); // Check that the operands are the right type - Assert(Op0Ty->isIntOrIntVectorTy() || Op0Ty->isPtrOrPtrVectorTy(), - "Invalid operand types for ICmp instruction", &IC); + Check(Op0Ty->isIntOrIntVectorTy() || Op0Ty->isPtrOrPtrVectorTy(), + "Invalid operand types for ICmp instruction", &IC); // Check that the predicate is valid. - Assert(IC.isIntPredicate(), - "Invalid predicate in ICmp instruction!", &IC); + Check(IC.isIntPredicate(), "Invalid predicate in ICmp instruction!", &IC); visitInstruction(IC); } @@ -3618,63 +3686,61 @@ void Verifier::visitFCmpInst(FCmpInst &FC) { // Check that the operands are the same type Type *Op0Ty = FC.getOperand(0)->getType(); Type *Op1Ty = FC.getOperand(1)->getType(); - Assert(Op0Ty == Op1Ty, - "Both operands to FCmp instruction are not of the same type!", &FC); + Check(Op0Ty == Op1Ty, + "Both operands to FCmp instruction are not of the same type!", &FC); // Check that the operands are the right type - Assert(Op0Ty->isFPOrFPVectorTy(), - "Invalid operand types for FCmp instruction", &FC); + Check(Op0Ty->isFPOrFPVectorTy(), "Invalid operand types for FCmp instruction", + &FC); // Check that the predicate is valid. 
- Assert(FC.isFPPredicate(), - "Invalid predicate in FCmp instruction!", &FC); + Check(FC.isFPPredicate(), "Invalid predicate in FCmp instruction!", &FC); visitInstruction(FC); } void Verifier::visitExtractElementInst(ExtractElementInst &EI) { - Assert( - ExtractElementInst::isValidOperands(EI.getOperand(0), EI.getOperand(1)), - "Invalid extractelement operands!", &EI); + Check(ExtractElementInst::isValidOperands(EI.getOperand(0), EI.getOperand(1)), + "Invalid extractelement operands!", &EI); visitInstruction(EI); } void Verifier::visitInsertElementInst(InsertElementInst &IE) { - Assert(InsertElementInst::isValidOperands(IE.getOperand(0), IE.getOperand(1), - IE.getOperand(2)), - "Invalid insertelement operands!", &IE); + Check(InsertElementInst::isValidOperands(IE.getOperand(0), IE.getOperand(1), + IE.getOperand(2)), + "Invalid insertelement operands!", &IE); visitInstruction(IE); } void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) { - Assert(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1), - SV.getShuffleMask()), - "Invalid shufflevector operands!", &SV); + Check(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1), + SV.getShuffleMask()), + "Invalid shufflevector operands!", &SV); visitInstruction(SV); } void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) { Type *TargetTy = GEP.getPointerOperandType()->getScalarType(); - Assert(isa(TargetTy), - "GEP base pointer is not a vector or a vector of pointers", &GEP); - Assert(GEP.getSourceElementType()->isSized(), "GEP into unsized type!", &GEP); + Check(isa(TargetTy), + "GEP base pointer is not a vector or a vector of pointers", &GEP); + Check(GEP.getSourceElementType()->isSized(), "GEP into unsized type!", &GEP); SmallVector Idxs(GEP.indices()); - Assert(all_of( - Idxs, [](Value* V) { return V->getType()->isIntOrIntVectorTy(); }), + Check( + all_of(Idxs, [](Value *V) { return V->getType()->isIntOrIntVectorTy(); }), "GEP indexes must be integers", &GEP); Type *ElTy = GetElementPtrInst::getIndexedType(GEP.getSourceElementType(), Idxs); - Assert(ElTy, "Invalid indices for GEP pointer type!", &GEP); + Check(ElTy, "Invalid indices for GEP pointer type!", &GEP); - Assert(GEP.getType()->isPtrOrPtrVectorTy() && - GEP.getResultElementType() == ElTy, - "GEP is not of right type for indices!", &GEP, ElTy); + Check(GEP.getType()->isPtrOrPtrVectorTy() && + GEP.getResultElementType() == ElTy, + "GEP is not of right type for indices!", &GEP, ElTy); if (auto *GEPVTy = dyn_cast(GEP.getType())) { // Additional checks for vector GEPs. 
ElementCount GEPWidth = GEPVTy->getElementCount(); if (GEP.getPointerOperandType()->isVectorTy()) - Assert( + Check( GEPWidth == cast(GEP.getPointerOperandType())->getElementCount(), "Vector GEP result width doesn't match operand's", &GEP); @@ -3682,16 +3748,16 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) { Type *IndexTy = Idx->getType(); if (auto *IndexVTy = dyn_cast(IndexTy)) { ElementCount IndexWidth = IndexVTy->getElementCount(); - Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP); + Check(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP); } - Assert(IndexTy->isIntOrIntVectorTy(), - "All GEP indices should be of integer type"); + Check(IndexTy->isIntOrIntVectorTy(), + "All GEP indices should be of integer type"); } } if (auto *PTy = dyn_cast(GEP.getType())) { - Assert(GEP.getAddressSpace() == PTy->getAddressSpace(), - "GEP address space doesn't match type", &GEP); + Check(GEP.getAddressSpace() == PTy->getAddressSpace(), + "GEP address space doesn't match type", &GEP); } visitInstruction(GEP); @@ -3706,33 +3772,33 @@ void Verifier::visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty) { "precondition violation"); unsigned NumOperands = Range->getNumOperands(); - Assert(NumOperands % 2 == 0, "Unfinished range!", Range); + Check(NumOperands % 2 == 0, "Unfinished range!", Range); unsigned NumRanges = NumOperands / 2; - Assert(NumRanges >= 1, "It should have at least one range!", Range); + Check(NumRanges >= 1, "It should have at least one range!", Range); ConstantRange LastRange(1, true); // Dummy initial value for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Low = mdconst::dyn_extract(Range->getOperand(2 * i)); - Assert(Low, "The lower limit must be an integer!", Low); + Check(Low, "The lower limit must be an integer!", Low); ConstantInt *High = mdconst::dyn_extract(Range->getOperand(2 * i + 1)); - Assert(High, "The upper limit must be an integer!", High); - Assert(High->getType() == Low->getType() && High->getType() == Ty, - "Range types must match instruction type!", &I); + Check(High, "The upper limit must be an integer!", High); + Check(High->getType() == Low->getType() && High->getType() == Ty, + "Range types must match instruction type!", &I); APInt HighV = High->getValue(); APInt LowV = Low->getValue(); ConstantRange CurRange(LowV, HighV); - Assert(!CurRange.isEmptySet() && !CurRange.isFullSet(), - "Range must not be empty!", Range); + Check(!CurRange.isEmptySet() && !CurRange.isFullSet(), + "Range must not be empty!", Range); if (i != 0) { - Assert(CurRange.intersectWith(LastRange).isEmptySet(), - "Intervals are overlapping", Range); - Assert(LowV.sgt(LastRange.getLower()), "Intervals are not in order", - Range); - Assert(!isContiguous(CurRange, LastRange), "Intervals are contiguous", - Range); + Check(CurRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Check(LowV.sgt(LastRange.getLower()), "Intervals are not in order", + Range); + Check(!isContiguous(CurRange, LastRange), "Intervals are contiguous", + Range); } LastRange = ConstantRange(LowV, HighV); } @@ -3742,41 +3808,41 @@ void Verifier::visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty) { APInt FirstHigh = mdconst::dyn_extract(Range->getOperand(1))->getValue(); ConstantRange FirstRange(FirstLow, FirstHigh); - Assert(FirstRange.intersectWith(LastRange).isEmptySet(), - "Intervals are overlapping", Range); - Assert(!isContiguous(FirstRange, LastRange), "Intervals are contiguous", - Range); + 
Check(FirstRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Check(!isContiguous(FirstRange, LastRange), "Intervals are contiguous", + Range); } } void Verifier::checkAtomicMemAccessSize(Type *Ty, const Instruction *I) { unsigned Size = DL.getTypeSizeInBits(Ty); - Assert(Size >= 8, "atomic memory access' size must be byte-sized", Ty, I); - Assert(!(Size & (Size - 1)), - "atomic memory access' operand must have a power-of-two size", Ty, I); + Check(Size >= 8, "atomic memory access' size must be byte-sized", Ty, I); + Check(!(Size & (Size - 1)), + "atomic memory access' operand must have a power-of-two size", Ty, I); } void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast(LI.getOperand(0)->getType()); - Assert(PTy, "Load operand must be a pointer.", &LI); + Check(PTy, "Load operand must be a pointer.", &LI); Type *ElTy = LI.getType(); if (MaybeAlign A = LI.getAlign()) { - Assert(A->value() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &LI); + Check(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &LI); } - Assert(ElTy->isSized(), "loading unsized types is not allowed", &LI); + Check(ElTy->isSized(), "loading unsized types is not allowed", &LI); if (LI.isAtomic()) { - Assert(LI.getOrdering() != AtomicOrdering::Release && - LI.getOrdering() != AtomicOrdering::AcquireRelease, - "Load cannot have Release ordering", &LI); - Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), - "atomic load operand must have integer, pointer, or floating point " - "type!", - ElTy, &LI); + Check(LI.getOrdering() != AtomicOrdering::Release && + LI.getOrdering() != AtomicOrdering::AcquireRelease, + "Load cannot have Release ordering", &LI); + Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), + "atomic load operand must have integer, pointer, or floating point " + "type!", + ElTy, &LI); checkAtomicMemAccessSize(ElTy, &LI); } else { - Assert(LI.getSyncScopeID() == SyncScope::System, - "Non-atomic load cannot have SynchronizationScope specified", &LI); + Check(LI.getSyncScopeID() == SyncScope::System, + "Non-atomic load cannot have SynchronizationScope specified", &LI); } visitInstruction(LI); @@ -3784,27 +3850,27 @@ void Verifier::visitLoadInst(LoadInst &LI) { void Verifier::visitStoreInst(StoreInst &SI) { PointerType *PTy = dyn_cast(SI.getOperand(1)->getType()); - Assert(PTy, "Store operand must be a pointer.", &SI); + Check(PTy, "Store operand must be a pointer.", &SI); Type *ElTy = SI.getOperand(0)->getType(); - Assert(PTy->isOpaqueOrPointeeTypeMatches(ElTy), - "Stored value type does not match pointer operand type!", &SI, ElTy); + Check(PTy->isOpaqueOrPointeeTypeMatches(ElTy), + "Stored value type does not match pointer operand type!", &SI, ElTy); if (MaybeAlign A = SI.getAlign()) { - Assert(A->value() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &SI); + Check(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &SI); } - Assert(ElTy->isSized(), "storing unsized types is not allowed", &SI); + Check(ElTy->isSized(), "storing unsized types is not allowed", &SI); if (SI.isAtomic()) { - Assert(SI.getOrdering() != AtomicOrdering::Acquire && - SI.getOrdering() != AtomicOrdering::AcquireRelease, - "Store cannot have Acquire ordering", &SI); - Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), - "atomic store operand must have integer, pointer, or floating point " - "type!", - ElTy, &SI); + Check(SI.getOrdering() != 
AtomicOrdering::Acquire && + SI.getOrdering() != AtomicOrdering::AcquireRelease, + "Store cannot have Acquire ordering", &SI); + Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), + "atomic store operand must have integer, pointer, or floating point " + "type!", + ElTy, &SI); checkAtomicMemAccessSize(ElTy, &SI); } else { - Assert(SI.getSyncScopeID() == SyncScope::System, - "Non-atomic store cannot have SynchronizationScope specified", &SI); + Check(SI.getSyncScopeID() == SyncScope::System, + "Non-atomic store cannot have SynchronizationScope specified", &SI); } visitInstruction(SI); } @@ -3814,10 +3880,10 @@ void Verifier::verifySwiftErrorCall(CallBase &Call, const Value *SwiftErrorVal) { for (const auto &I : llvm::enumerate(Call.args())) { if (I.value() == SwiftErrorVal) { - Assert(Call.paramHasAttr(I.index(), Attribute::SwiftError), - "swifterror value when used in a callsite should be marked " - "with swifterror attribute", - SwiftErrorVal, Call); + Check(Call.paramHasAttr(I.index(), Attribute::SwiftError), + "swifterror value when used in a callsite should be marked " + "with swifterror attribute", + SwiftErrorVal, Call); } } } @@ -3826,16 +3892,17 @@ void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) { // Check that swifterror value is only used by loads, stores, or as // a swifterror argument. for (const User *U : SwiftErrorVal->users()) { - Assert(isa(U) || isa(U) || isa(U) || - isa(U), - "swifterror value can only be loaded and stored from, or " - "as a swifterror argument!", - SwiftErrorVal, U); + Check(isa(U) || isa(U) || isa(U) || + isa(U), + "swifterror value can only be loaded and stored from, or " + "as a swifterror argument!", + SwiftErrorVal, U); // If it is used by a store, check it is the second operand. if (auto StoreI = dyn_cast(U)) - Assert(StoreI->getOperand(1) == SwiftErrorVal, - "swifterror value should be the second operand when used " - "by stores", SwiftErrorVal, U); + Check(StoreI->getOperand(1) == SwiftErrorVal, + "swifterror value should be the second operand when used " + "by stores", + SwiftErrorVal, U); if (auto *Call = dyn_cast(U)) verifySwiftErrorCall(*const_cast(Call), SwiftErrorVal); } @@ -3843,16 +3910,20 @@ void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) { void Verifier::visitAllocaInst(AllocaInst &AI) { SmallPtrSet Visited; - Assert(AI.getAllocatedType()->isSized(&Visited), - "Cannot allocate unsized type", &AI); - Assert(AI.getArraySize()->getType()->isIntegerTy(), - "Alloca array size must have integer type", &AI); + Check(AI.getAllocatedType()->isSized(&Visited), + "Cannot allocate unsized type", &AI); + Check(AI.getArraySize()->getType()->isIntegerTy(), + "Alloca array size must have integer type", &AI); if (MaybeAlign A = AI.getAlign()) { - Assert(A->value() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &AI); + Check(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &AI); } if (AI.isSwiftError()) { + Check(AI.getAllocatedType()->isPointerTy(), + "swifterror alloca must have pointer type", &AI); + Check(!AI.isArrayAllocation(), + "swifterror alloca must not be array allocation", &AI); verifySwiftErrorValue(&AI); } @@ -3861,64 +3932,65 @@ void Verifier::visitAllocaInst(AllocaInst &AI) { void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) { Type *ElTy = CXI.getOperand(1)->getType(); - Assert(ElTy->isIntOrPtrTy(), - "cmpxchg operand must have integer or pointer type", ElTy, &CXI); + Check(ElTy->isIntOrPtrTy(), + "cmpxchg operand must have integer 
or pointer type", ElTy, &CXI); checkAtomicMemAccessSize(ElTy, &CXI); visitInstruction(CXI); } void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) { - Assert(RMWI.getOrdering() != AtomicOrdering::Unordered, - "atomicrmw instructions cannot be unordered.", &RMWI); + Check(RMWI.getOrdering() != AtomicOrdering::Unordered, + "atomicrmw instructions cannot be unordered.", &RMWI); auto Op = RMWI.getOperation(); Type *ElTy = RMWI.getOperand(1)->getType(); if (Op == AtomicRMWInst::Xchg) { - Assert(ElTy->isIntegerTy() || ElTy->isFloatingPointTy(), "atomicrmw " + - AtomicRMWInst::getOperationName(Op) + - " operand must have integer or floating point type!", - &RMWI, ElTy); + Check(ElTy->isIntegerTy() || ElTy->isFloatingPointTy() || + ElTy->isPointerTy(), + "atomicrmw " + AtomicRMWInst::getOperationName(Op) + + " operand must have integer or floating point type!", + &RMWI, ElTy); } else if (AtomicRMWInst::isFPOperation(Op)) { - Assert(ElTy->isFloatingPointTy(), "atomicrmw " + - AtomicRMWInst::getOperationName(Op) + - " operand must have floating point type!", - &RMWI, ElTy); + Check(ElTy->isFloatingPointTy(), + "atomicrmw " + AtomicRMWInst::getOperationName(Op) + + " operand must have floating point type!", + &RMWI, ElTy); } else { - Assert(ElTy->isIntegerTy(), "atomicrmw " + - AtomicRMWInst::getOperationName(Op) + - " operand must have integer type!", - &RMWI, ElTy); + Check(ElTy->isIntegerTy(), + "atomicrmw " + AtomicRMWInst::getOperationName(Op) + + " operand must have integer type!", + &RMWI, ElTy); } checkAtomicMemAccessSize(ElTy, &RMWI); - Assert(AtomicRMWInst::FIRST_BINOP <= Op && Op <= AtomicRMWInst::LAST_BINOP, - "Invalid binary operation!", &RMWI); + Check(AtomicRMWInst::FIRST_BINOP <= Op && Op <= AtomicRMWInst::LAST_BINOP, + "Invalid binary operation!", &RMWI); visitInstruction(RMWI); } void Verifier::visitFenceInst(FenceInst &FI) { const AtomicOrdering Ordering = FI.getOrdering(); - Assert(Ordering == AtomicOrdering::Acquire || - Ordering == AtomicOrdering::Release || - Ordering == AtomicOrdering::AcquireRelease || - Ordering == AtomicOrdering::SequentiallyConsistent, - "fence instructions may only have acquire, release, acq_rel, or " - "seq_cst ordering.", - &FI); + Check(Ordering == AtomicOrdering::Acquire || + Ordering == AtomicOrdering::Release || + Ordering == AtomicOrdering::AcquireRelease || + Ordering == AtomicOrdering::SequentiallyConsistent, + "fence instructions may only have acquire, release, acq_rel, or " + "seq_cst ordering.", + &FI); visitInstruction(FI); } void Verifier::visitExtractValueInst(ExtractValueInst &EVI) { - Assert(ExtractValueInst::getIndexedType(EVI.getAggregateOperand()->getType(), - EVI.getIndices()) == EVI.getType(), - "Invalid ExtractValueInst operands!", &EVI); + Check(ExtractValueInst::getIndexedType(EVI.getAggregateOperand()->getType(), + EVI.getIndices()) == EVI.getType(), + "Invalid ExtractValueInst operands!", &EVI); visitInstruction(EVI); } void Verifier::visitInsertValueInst(InsertValueInst &IVI) { - Assert(ExtractValueInst::getIndexedType(IVI.getAggregateOperand()->getType(), - IVI.getIndices()) == - IVI.getOperand(1)->getType(), - "Invalid InsertValueInst operands!", &IVI); + Check(ExtractValueInst::getIndexedType(IVI.getAggregateOperand()->getType(), + IVI.getIndices()) == + IVI.getOperand(1)->getType(), + "Invalid InsertValueInst operands!", &IVI); visitInstruction(IVI); } @@ -3936,7 +4008,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { BasicBlock *BB = I.getParent(); Function *F = BB->getParent(); - Assert(BB != 
&F->getEntryBlock(), "EH pad cannot be in entry block.", &I); + Check(BB != &F->getEntryBlock(), "EH pad cannot be in entry block.", &I); if (auto *LPI = dyn_cast(&I)) { // The landingpad instruction defines its parent as a landing pad block. The @@ -3944,22 +4016,22 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { // invoke. for (BasicBlock *PredBB : predecessors(BB)) { const auto *II = dyn_cast(PredBB->getTerminator()); - Assert(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, - "Block containing LandingPadInst must be jumped to " - "only by the unwind edge of an invoke.", - LPI); + Check(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, + "Block containing LandingPadInst must be jumped to " + "only by the unwind edge of an invoke.", + LPI); } return; } if (auto *CPI = dyn_cast(&I)) { if (!pred_empty(BB)) - Assert(BB->getUniquePredecessor() == CPI->getCatchSwitch()->getParent(), - "Block containg CatchPadInst must be jumped to " - "only by its catchswitch.", - CPI); - Assert(BB != CPI->getCatchSwitch()->getUnwindDest(), - "Catchswitch cannot unwind to one of its catchpads", - CPI->getCatchSwitch(), CPI); + Check(BB->getUniquePredecessor() == CPI->getCatchSwitch()->getParent(), + "Block containg CatchPadInst must be jumped to " + "only by its catchswitch.", + CPI); + Check(BB != CPI->getCatchSwitch()->getUnwindDest(), + "Catchswitch cannot unwind to one of its catchpads", + CPI->getCatchSwitch(), CPI); return; } @@ -3971,39 +4043,39 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { Instruction *TI = PredBB->getTerminator(); Value *FromPad; if (auto *II = dyn_cast(TI)) { - Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB, - "EH pad must be jumped to via an unwind edge", ToPad, II); + Check(II->getUnwindDest() == BB && II->getNormalDest() != BB, + "EH pad must be jumped to via an unwind edge", ToPad, II); if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet)) FromPad = Bundle->Inputs[0]; else FromPad = ConstantTokenNone::get(II->getContext()); } else if (auto *CRI = dyn_cast(TI)) { FromPad = CRI->getOperand(0); - Assert(FromPad != ToPadParent, "A cleanupret must exit its cleanup", CRI); + Check(FromPad != ToPadParent, "A cleanupret must exit its cleanup", CRI); } else if (auto *CSI = dyn_cast(TI)) { FromPad = CSI; } else { - Assert(false, "EH pad must be jumped to via an unwind edge", ToPad, TI); + Check(false, "EH pad must be jumped to via an unwind edge", ToPad, TI); } // The edge may exit from zero or more nested pads. SmallSet Seen; for (;; FromPad = getParentPad(FromPad)) { - Assert(FromPad != ToPad, - "EH pad cannot handle exceptions raised within it", FromPad, TI); + Check(FromPad != ToPad, + "EH pad cannot handle exceptions raised within it", FromPad, TI); if (FromPad == ToPadParent) { // This is a legal unwind edge. break; } - Assert(!isa(FromPad), - "A single unwind edge may only enter one EH pad", TI); - Assert(Seen.insert(FromPad).second, - "EH pad jumps through a cycle of pads", FromPad); + Check(!isa(FromPad), + "A single unwind edge may only enter one EH pad", TI); + Check(Seen.insert(FromPad).second, "EH pad jumps through a cycle of pads", + FromPad); // This will be diagnosed on the corresponding instruction already. We // need the extra check here to make sure getParentPad() works. 
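// A minimal standalone sketch, not the LLVM API, of the parent-pad walk just
// shown: starting from the pad an unwind edge leaves, follow parent links
// until the target pad's parent is reached, rejecting repeats (a cycle of
// pads) along the way. Pad, its Parent pointer, and walkToParent are
// hypothetical simplifications of getParentPad() and the Seen set.
#include <cassert>
#include <set>

struct Pad { const Pad *Parent; };

static bool walkToParent(const Pad *From, const Pad *ToParent) {
  std::set<const Pad *> Seen;
  for (;; From = From->Parent) {
    if (From == ToParent)
      return true; // a legal unwind edge
    if (!From || !Seen.insert(From).second)
      return false; // ran off the chain, or jumped through a cycle of pads
  }
}

int main() {
  Pad Root{nullptr}, Mid{&Root}, Leaf{&Mid};
  assert(walkToParent(&Leaf, &Root));
  Pad A{nullptr}, B{&A};
  A.Parent = &B; // a cycle of pads
  assert(!walkToParent(&A, nullptr));
}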
- Assert(isa(FromPad) || isa(FromPad), - "Parent pad must be catchpad/cleanuppad/catchswitch", TI); + Check(isa(FromPad) || isa(FromPad), + "Parent pad must be catchpad/cleanuppad/catchswitch", TI); } } } @@ -4011,38 +4083,37 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { void Verifier::visitLandingPadInst(LandingPadInst &LPI) { // The landingpad instruction is ill-formed if it doesn't have any clauses and // isn't a cleanup. - Assert(LPI.getNumClauses() > 0 || LPI.isCleanup(), - "LandingPadInst needs at least one clause or to be a cleanup.", &LPI); + Check(LPI.getNumClauses() > 0 || LPI.isCleanup(), + "LandingPadInst needs at least one clause or to be a cleanup.", &LPI); visitEHPadPredecessors(LPI); if (!LandingPadResultTy) LandingPadResultTy = LPI.getType(); else - Assert(LandingPadResultTy == LPI.getType(), - "The landingpad instruction should have a consistent result type " - "inside a function.", - &LPI); + Check(LandingPadResultTy == LPI.getType(), + "The landingpad instruction should have a consistent result type " + "inside a function.", + &LPI); Function *F = LPI.getParent()->getParent(); - Assert(F->hasPersonalityFn(), - "LandingPadInst needs to be in a function with a personality.", &LPI); + Check(F->hasPersonalityFn(), + "LandingPadInst needs to be in a function with a personality.", &LPI); // The landingpad instruction must be the first non-PHI instruction in the // block. - Assert(LPI.getParent()->getLandingPadInst() == &LPI, - "LandingPadInst not the first non-PHI instruction in the block.", - &LPI); + Check(LPI.getParent()->getLandingPadInst() == &LPI, + "LandingPadInst not the first non-PHI instruction in the block.", &LPI); for (unsigned i = 0, e = LPI.getNumClauses(); i < e; ++i) { Constant *Clause = LPI.getClause(i); if (LPI.isCatch(i)) { - Assert(isa(Clause->getType()), - "Catch operand does not have pointer type!", &LPI); + Check(isa(Clause->getType()), + "Catch operand does not have pointer type!", &LPI); } else { - Assert(LPI.isFilter(i), "Clause is neither catch nor filter!", &LPI); - Assert(isa(Clause) || isa(Clause), - "Filter operand is not an array of constants!", &LPI); + Check(LPI.isFilter(i), "Clause is neither catch nor filter!", &LPI); + Check(isa(Clause) || isa(Clause), + "Filter operand is not an array of constants!", &LPI); } } @@ -4050,16 +4121,16 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { } void Verifier::visitResumeInst(ResumeInst &RI) { - Assert(RI.getFunction()->hasPersonalityFn(), - "ResumeInst needs to be in a function with a personality.", &RI); + Check(RI.getFunction()->hasPersonalityFn(), + "ResumeInst needs to be in a function with a personality.", &RI); if (!LandingPadResultTy) LandingPadResultTy = RI.getValue()->getType(); else - Assert(LandingPadResultTy == RI.getValue()->getType(), - "The resume instruction should have a consistent result type " - "inside a function.", - &RI); + Check(LandingPadResultTy == RI.getValue()->getType(), + "The resume instruction should have a consistent result type " + "inside a function.", + &RI); visitTerminator(RI); } @@ -4068,26 +4139,26 @@ void Verifier::visitCatchPadInst(CatchPadInst &CPI) { BasicBlock *BB = CPI.getParent(); Function *F = BB->getParent(); - Assert(F->hasPersonalityFn(), - "CatchPadInst needs to be in a function with a personality.", &CPI); + Check(F->hasPersonalityFn(), + "CatchPadInst needs to be in a function with a personality.", &CPI); - Assert(isa(CPI.getParentPad()), - "CatchPadInst needs to be directly nested in a CatchSwitchInst.", - 
CPI.getParentPad()); + Check(isa(CPI.getParentPad()), + "CatchPadInst needs to be directly nested in a CatchSwitchInst.", + CPI.getParentPad()); // The catchpad instruction must be the first non-PHI instruction in the // block. - Assert(BB->getFirstNonPHI() == &CPI, - "CatchPadInst not the first non-PHI instruction in the block.", &CPI); + Check(BB->getFirstNonPHI() == &CPI, + "CatchPadInst not the first non-PHI instruction in the block.", &CPI); visitEHPadPredecessors(CPI); visitFuncletPadInst(CPI); } void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) { - Assert(isa(CatchReturn.getOperand(0)), - "CatchReturnInst needs to be provided a CatchPad", &CatchReturn, - CatchReturn.getOperand(0)); + Check(isa(CatchReturn.getOperand(0)), + "CatchReturnInst needs to be provided a CatchPad", &CatchReturn, + CatchReturn.getOperand(0)); visitTerminator(CatchReturn); } @@ -4096,18 +4167,17 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) { BasicBlock *BB = CPI.getParent(); Function *F = BB->getParent(); - Assert(F->hasPersonalityFn(), - "CleanupPadInst needs to be in a function with a personality.", &CPI); + Check(F->hasPersonalityFn(), + "CleanupPadInst needs to be in a function with a personality.", &CPI); // The cleanuppad instruction must be the first non-PHI instruction in the // block. - Assert(BB->getFirstNonPHI() == &CPI, - "CleanupPadInst not the first non-PHI instruction in the block.", - &CPI); + Check(BB->getFirstNonPHI() == &CPI, + "CleanupPadInst not the first non-PHI instruction in the block.", &CPI); auto *ParentPad = CPI.getParentPad(); - Assert(isa(ParentPad) || isa(ParentPad), - "CleanupPadInst has an invalid parent.", &CPI); + Check(isa(ParentPad) || isa(ParentPad), + "CleanupPadInst has an invalid parent.", &CPI); visitEHPadPredecessors(CPI); visitFuncletPadInst(CPI); @@ -4121,8 +4191,8 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { while (!Worklist.empty()) { FuncletPadInst *CurrentPad = Worklist.pop_back_val(); - Assert(Seen.insert(CurrentPad).second, - "FuncletPadInst must not be nested within itself", CurrentPad); + Check(Seen.insert(CurrentPad).second, + "FuncletPadInst must not be nested within itself", CurrentPad); Value *UnresolvedAncestorPad = nullptr; for (User *U : CurrentPad->users()) { BasicBlock *UnwindDest; @@ -4150,7 +4220,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { Worklist.push_back(CPI); continue; } else { - Assert(isa(U), "Bogus funclet pad use", U); + Check(isa(U), "Bogus funclet pad use", U); continue; } @@ -4200,10 +4270,11 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { // This unwind edge exits FPI. Make sure it agrees with other // such edges. 
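// A minimal standalone sketch, not the LLVM API, of the consistency rule the
// comment above states: the first unwind edge that exits the funclet pad
// fixes the expected unwind destination, and every later exiting edge must
// match it. The vector of integer destination ids is a hypothetical stand-in
// for the pads' unwind destinations.
#include <cassert>
#include <vector>

static bool unwindDestsAgree(const std::vector<int> &ExitDests) {
  for (size_t I = 1; I < ExitDests.size(); ++I)
    if (ExitDests[I] != ExitDests[0]) // must match the first edge's dest
      return false;
  return true;
}

int main() {
  assert(unwindDestsAgree({7, 7, 7}));
  assert(!unwindDestsAgree({7, 3})); // disagreeing unwind dests are invalid
}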
if (FirstUser) { - Assert(UnwindPad == FirstUnwindPad, "Unwind edges out of a funclet " - "pad must have the same unwind " - "dest", - &FPI, U, FirstUser); + Check(UnwindPad == FirstUnwindPad, + "Unwind edges out of a funclet " + "pad must have the same unwind " + "dest", + &FPI, U, FirstUser); } else { FirstUser = U; FirstUnwindPad = UnwindPad; @@ -4262,10 +4333,10 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { SwitchUnwindPad = SwitchUnwindDest->getFirstNonPHI(); else SwitchUnwindPad = ConstantTokenNone::get(FPI.getContext()); - Assert(SwitchUnwindPad == FirstUnwindPad, - "Unwind edges out of a catch must have the same unwind dest as " - "the parent catchswitch", - &FPI, FirstUser, CatchSwitch); + Check(SwitchUnwindPad == FirstUnwindPad, + "Unwind edges out of a catch must have the same unwind dest as " + "the parent catchswitch", + &FPI, FirstUser, CatchSwitch); } } @@ -4276,38 +4347,38 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { BasicBlock *BB = CatchSwitch.getParent(); Function *F = BB->getParent(); - Assert(F->hasPersonalityFn(), - "CatchSwitchInst needs to be in a function with a personality.", - &CatchSwitch); + Check(F->hasPersonalityFn(), + "CatchSwitchInst needs to be in a function with a personality.", + &CatchSwitch); // The catchswitch instruction must be the first non-PHI instruction in the // block. - Assert(BB->getFirstNonPHI() == &CatchSwitch, - "CatchSwitchInst not the first non-PHI instruction in the block.", - &CatchSwitch); + Check(BB->getFirstNonPHI() == &CatchSwitch, + "CatchSwitchInst not the first non-PHI instruction in the block.", + &CatchSwitch); auto *ParentPad = CatchSwitch.getParentPad(); - Assert(isa(ParentPad) || isa(ParentPad), - "CatchSwitchInst has an invalid parent.", ParentPad); + Check(isa(ParentPad) || isa(ParentPad), + "CatchSwitchInst has an invalid parent.", ParentPad); if (BasicBlock *UnwindDest = CatchSwitch.getUnwindDest()) { Instruction *I = UnwindDest->getFirstNonPHI(); - Assert(I->isEHPad() && !isa(I), - "CatchSwitchInst must unwind to an EH block which is not a " - "landingpad.", - &CatchSwitch); + Check(I->isEHPad() && !isa(I), + "CatchSwitchInst must unwind to an EH block which is not a " + "landingpad.", + &CatchSwitch); // Record catchswitch sibling unwinds for verifySiblingFuncletUnwinds if (getParentPad(I) == ParentPad) SiblingFuncletInfo[&CatchSwitch] = &CatchSwitch; } - Assert(CatchSwitch.getNumHandlers() != 0, - "CatchSwitchInst cannot have empty handler list", &CatchSwitch); + Check(CatchSwitch.getNumHandlers() != 0, + "CatchSwitchInst cannot have empty handler list", &CatchSwitch); for (BasicBlock *Handler : CatchSwitch.handlers()) { - Assert(isa(Handler->getFirstNonPHI()), - "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); + Check(isa(Handler->getFirstNonPHI()), + "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); } visitEHPadPredecessors(CatchSwitch); @@ -4315,16 +4386,16 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { } void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { - Assert(isa(CRI.getOperand(0)), - "CleanupReturnInst needs to be provided a CleanupPad", &CRI, - CRI.getOperand(0)); + Check(isa(CRI.getOperand(0)), + "CleanupReturnInst needs to be provided a CleanupPad", &CRI, + CRI.getOperand(0)); if (BasicBlock *UnwindDest = CRI.getUnwindDest()) { Instruction *I = UnwindDest->getFirstNonPHI(); - Assert(I->isEHPad() && !isa(I), - "CleanupReturnInst must unwind to an EH block which is not a " - "landingpad.", 
- &CRI); + Check(I->isEHPad() && !isa(I), + "CleanupReturnInst must unwind to an EH block which is not a " + "landingpad.", + &CRI); } visitTerminator(CRI); @@ -4351,39 +4422,45 @@ void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { return; const Use &U = I.getOperandUse(i); - Assert(DT.dominates(Op, U), - "Instruction does not dominate all uses!", Op, &I); + Check(DT.dominates(Op, U), "Instruction does not dominate all uses!", Op, &I); } void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { - Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null " - "apply only to pointer types", &I); - Assert((isa(I) || isa(I)), - "dereferenceable, dereferenceable_or_null apply only to load" - " and inttoptr instructions, use attributes for calls or invokes", &I); - Assert(MD->getNumOperands() == 1, "dereferenceable, dereferenceable_or_null " - "take one operand!", &I); + Check(I.getType()->isPointerTy(), + "dereferenceable, dereferenceable_or_null " + "apply only to pointer types", + &I); + Check((isa(I) || isa(I)), + "dereferenceable, dereferenceable_or_null apply only to load" + " and inttoptr instructions, use attributes for calls or invokes", + &I); + Check(MD->getNumOperands() == 1, + "dereferenceable, dereferenceable_or_null " + "take one operand!", + &I); ConstantInt *CI = mdconst::dyn_extract(MD->getOperand(0)); - Assert(CI && CI->getType()->isIntegerTy(64), "dereferenceable, " - "dereferenceable_or_null metadata value must be an i64!", &I); + Check(CI && CI->getType()->isIntegerTy(64), + "dereferenceable, " + "dereferenceable_or_null metadata value must be an i64!", + &I); } void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { - Assert(MD->getNumOperands() >= 2, - "!prof annotations should have no less than 2 operands", MD); + Check(MD->getNumOperands() >= 2, + "!prof annotations should have no less than 2 operands", MD); // Check first operand. - Assert(MD->getOperand(0) != nullptr, "first operand should not be null", MD); - Assert(isa(MD->getOperand(0)), - "expected string with name of the !prof annotation", MD); + Check(MD->getOperand(0) != nullptr, "first operand should not be null", MD); + Check(isa(MD->getOperand(0)), + "expected string with name of the !prof annotation", MD); MDString *MDS = cast(MD->getOperand(0)); StringRef ProfName = MDS->getString(); // Check consistency of !prof branch_weights metadata. 
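// A minimal standalone sketch, not the LLVM API, of the branch_weights shape
// checked below: operand 0 names the annotation and each successor
// contributes one weight, so a well-formed node carries 1 + NumSuccessors
// operands. The vector-of-strings metadata model is a hypothetical
// simplification of MDString/ConstantInt operands.
#include <cassert>
#include <string>
#include <vector>

static bool branchWeightsOk(const std::vector<std::string> &Ops,
                            unsigned NumSuccessors) {
  return !Ops.empty() && Ops[0] == "branch_weights" &&
         Ops.size() == 1 + NumSuccessors; // one weight per successor
}

int main() {
  assert(branchWeightsOk({"branch_weights", "90", "10"}, 2)); // conditional br
  assert(!branchWeightsOk({"branch_weights", "90"}, 2));      // missing weight
}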
if (ProfName.equals("branch_weights")) { if (isa(&I)) { - Assert(MD->getNumOperands() == 2 || MD->getNumOperands() == 3, - "Wrong number of InvokeInst branch_weights operands", MD); + Check(MD->getNumOperands() == 2 || MD->getNumOperands() == 3, + "Wrong number of InvokeInst branch_weights operands", MD); } else { unsigned ExpectedNumOperands = 0; if (BranchInst *BI = dyn_cast(&I)) @@ -4400,94 +4477,112 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { CheckFailed("!prof branch_weights are not allowed for this instruction", MD); - Assert(MD->getNumOperands() == 1 + ExpectedNumOperands, - "Wrong number of operands", MD); + Check(MD->getNumOperands() == 1 + ExpectedNumOperands, + "Wrong number of operands", MD); } for (unsigned i = 1; i < MD->getNumOperands(); ++i) { auto &MDO = MD->getOperand(i); - Assert(MDO, "second operand should not be null", MD); - Assert(mdconst::dyn_extract(MDO), - "!prof brunch_weights operand is not a const int"); + Check(MDO, "second operand should not be null", MD); + Check(mdconst::dyn_extract(MDO), + "!prof brunch_weights operand is not a const int"); } } } void Verifier::visitAnnotationMetadata(MDNode *Annotation) { - Assert(isa(Annotation), "annotation must be a tuple"); - Assert(Annotation->getNumOperands() >= 1, - "annotation must have at least one operand"); + Check(isa(Annotation), "annotation must be a tuple"); + Check(Annotation->getNumOperands() >= 1, + "annotation must have at least one operand"); for (const MDOperand &Op : Annotation->operands()) - Assert(isa(Op.get()), "operands must be strings"); + Check(isa(Op.get()), "operands must be strings"); } void Verifier::visitAliasScopeMetadata(const MDNode *MD) { unsigned NumOps = MD->getNumOperands(); - Assert(NumOps >= 2 && NumOps <= 3, "scope must have two or three operands", - MD); - Assert(MD->getOperand(0).get() == MD || isa(MD->getOperand(0)), - "first scope operand must be self-referential or string", MD); + Check(NumOps >= 2 && NumOps <= 3, "scope must have two or three operands", + MD); + Check(MD->getOperand(0).get() == MD || isa(MD->getOperand(0)), + "first scope operand must be self-referential or string", MD); if (NumOps == 3) - Assert(isa(MD->getOperand(2)), - "third scope operand must be string (if used)", MD); + Check(isa(MD->getOperand(2)), + "third scope operand must be string (if used)", MD); MDNode *Domain = dyn_cast(MD->getOperand(1)); - Assert(Domain != nullptr, "second scope operand must be MDNode", MD); + Check(Domain != nullptr, "second scope operand must be MDNode", MD); unsigned NumDomainOps = Domain->getNumOperands(); - Assert(NumDomainOps >= 1 && NumDomainOps <= 2, - "domain must have one or two operands", Domain); - Assert(Domain->getOperand(0).get() == Domain || - isa(Domain->getOperand(0)), - "first domain operand must be self-referential or string", Domain); + Check(NumDomainOps >= 1 && NumDomainOps <= 2, + "domain must have one or two operands", Domain); + Check(Domain->getOperand(0).get() == Domain || + isa(Domain->getOperand(0)), + "first domain operand must be self-referential or string", Domain); if (NumDomainOps == 2) - Assert(isa(Domain->getOperand(1)), - "second domain operand must be string (if used)", Domain); + Check(isa(Domain->getOperand(1)), + "second domain operand must be string (if used)", Domain); } void Verifier::visitAliasScopeListMetadata(const MDNode *MD) { for (const MDOperand &Op : MD->operands()) { const MDNode *OpMD = dyn_cast(Op); - Assert(OpMD != nullptr, "scope list must consist of MDNodes", MD); + Check(OpMD != nullptr, 
"scope list must consist of MDNodes", MD); visitAliasScopeMetadata(OpMD); } } +void Verifier::visitAccessGroupMetadata(const MDNode *MD) { + auto IsValidAccessScope = [](const MDNode *MD) { + return MD->getNumOperands() == 0 && MD->isDistinct(); + }; + + // It must be either an access scope itself... + if (IsValidAccessScope(MD)) + return; + + // ...or a list of access scopes. + for (const MDOperand &Op : MD->operands()) { + const MDNode *OpMD = dyn_cast(Op); + Check(OpMD != nullptr, "Access scope list must consist of MDNodes", MD); + Check(IsValidAccessScope(OpMD), + "Access scope list contains invalid access scope", MD); + } +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { BasicBlock *BB = I.getParent(); - Assert(BB, "Instruction not embedded in basic block!", &I); + Check(BB, "Instruction not embedded in basic block!", &I); if (!isa(I)) { // Check that non-phi nodes are not self referential for (User *U : I.users()) { - Assert(U != (User *)&I || !DT.isReachableFromEntry(BB), - "Only PHI nodes may reference their own value!", &I); + Check(U != (User *)&I || !DT.isReachableFromEntry(BB), + "Only PHI nodes may reference their own value!", &I); } } // Check that void typed values don't have names - Assert(!I.getType()->isVoidTy() || !I.hasName(), - "Instruction has a name, but provides a void value!", &I); + Check(!I.getType()->isVoidTy() || !I.hasName(), + "Instruction has a name, but provides a void value!", &I); // Check that the return value of the instruction is either void or a legal // value type. - Assert(I.getType()->isVoidTy() || I.getType()->isFirstClassType(), - "Instruction returns a non-scalar type!", &I); + Check(I.getType()->isVoidTy() || I.getType()->isFirstClassType(), + "Instruction returns a non-scalar type!", &I); // Check that the instruction doesn't produce metadata. Calls are already // checked against the callee type. - Assert(!I.getType()->isMetadataTy() || isa(I) || isa(I), - "Invalid use of metadata!", &I); + Check(!I.getType()->isMetadataTy() || isa(I) || isa(I), + "Invalid use of metadata!", &I); // Check that all uses of the instruction, if they are instructions // themselves, actually have parent basic blocks. If the use is not an // instruction, it is an error! for (Use &U : I.uses()) { if (Instruction *Used = dyn_cast(U.getUser())) - Assert(Used->getParent() != nullptr, - "Instruction referencing" - " instruction not embedded in a basic block!", - &I, Used); + Check(Used->getParent() != nullptr, + "Instruction referencing" + " instruction not embedded in a basic block!", + &I, Used); else { CheckFailed("Use of instruction is not an instruction!", U); return; @@ -4499,12 +4594,12 @@ void Verifier::visitInstruction(Instruction &I) { const CallBase *CBI = dyn_cast(&I); for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - Assert(I.getOperand(i) != nullptr, "Instruction has null operand!", &I); + Check(I.getOperand(i) != nullptr, "Instruction has null operand!", &I); // Check to make sure that only first-class-values are operands to // instructions. if (!I.getOperand(i)->getType()->isFirstClassType()) { - Assert(false, "Instruction operands must be first-class values!", &I); + Check(false, "Instruction operands must be first-class values!", &I); } if (Function *F = dyn_cast(I.getOperand(i))) { @@ -4520,43 +4615,43 @@ void Verifier::visitInstruction(Instruction &I) { // taken. 
Ignore cases where the address of the intrinsic function is used // as the argument of operand bundle "clang.arc.attachedcall" as those // cases are handled in verifyAttachedCallBundle. - Assert((!F->isIntrinsic() || - (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)) || - IsAttachedCallOperand(F, CBI, i)), - "Cannot take the address of an intrinsic!", &I); - Assert( - !F->isIntrinsic() || isa(I) || - F->getIntrinsicID() == Intrinsic::donothing || - F->getIntrinsicID() == Intrinsic::seh_try_begin || - F->getIntrinsicID() == Intrinsic::seh_try_end || - F->getIntrinsicID() == Intrinsic::seh_scope_begin || - F->getIntrinsicID() == Intrinsic::seh_scope_end || - F->getIntrinsicID() == Intrinsic::coro_resume || - F->getIntrinsicID() == Intrinsic::coro_destroy || - F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void || - F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64 || - F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint || - F->getIntrinsicID() == Intrinsic::wasm_rethrow || - IsAttachedCallOperand(F, CBI, i), - "Cannot invoke an intrinsic other than donothing, patchpoint, " - "statepoint, coro_resume, coro_destroy or clang.arc.attachedcall", - &I); - Assert(F->getParent() == &M, "Referencing function in another module!", - &I, &M, F, F->getParent()); + Check((!F->isIntrinsic() || + (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)) || + IsAttachedCallOperand(F, CBI, i)), + "Cannot take the address of an intrinsic!", &I); + Check(!F->isIntrinsic() || isa(I) || + F->getIntrinsicID() == Intrinsic::donothing || + F->getIntrinsicID() == Intrinsic::seh_try_begin || + F->getIntrinsicID() == Intrinsic::seh_try_end || + F->getIntrinsicID() == Intrinsic::seh_scope_begin || + F->getIntrinsicID() == Intrinsic::seh_scope_end || + F->getIntrinsicID() == Intrinsic::coro_resume || + F->getIntrinsicID() == Intrinsic::coro_destroy || + F->getIntrinsicID() == + Intrinsic::experimental_patchpoint_void || + F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64 || + F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint || + F->getIntrinsicID() == Intrinsic::wasm_rethrow || + IsAttachedCallOperand(F, CBI, i), + "Cannot invoke an intrinsic other than donothing, patchpoint, " + "statepoint, coro_resume, coro_destroy or clang.arc.attachedcall", + &I); + Check(F->getParent() == &M, "Referencing function in another module!", &I, + &M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast(I.getOperand(i))) { - Assert(OpBB->getParent() == BB->getParent(), - "Referring to a basic block in another function!", &I); + Check(OpBB->getParent() == BB->getParent(), + "Referring to a basic block in another function!", &I); } else if (Argument *OpArg = dyn_cast(I.getOperand(i))) { - Assert(OpArg->getParent() == BB->getParent(), - "Referring to an argument in another function!", &I); + Check(OpArg->getParent() == BB->getParent(), + "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast(I.getOperand(i))) { - Assert(GV->getParent() == &M, "Referencing global in another module!", &I, - &M, GV, GV->getParent()); + Check(GV->getParent() == &M, "Referencing global in another module!", &I, + &M, GV, GV->getParent()); } else if (isa(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa(I.getOperand(i))) { - Assert(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), - "Cannot take the address of an inline asm!", &I); + Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), + "Cannot take the address of an 
inline asm!", &I); } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an @@ -4567,39 +4662,39 @@ void Verifier::visitInstruction(Instruction &I) { } if (MDNode *MD = I.getMetadata(LLVMContext::MD_fpmath)) { - Assert(I.getType()->isFPOrFPVectorTy(), - "fpmath requires a floating point result!", &I); - Assert(MD->getNumOperands() == 1, "fpmath takes one operand!", &I); + Check(I.getType()->isFPOrFPVectorTy(), + "fpmath requires a floating point result!", &I); + Check(MD->getNumOperands() == 1, "fpmath takes one operand!", &I); if (ConstantFP *CFP0 = mdconst::dyn_extract_or_null(MD->getOperand(0))) { const APFloat &Accuracy = CFP0->getValueAPF(); - Assert(&Accuracy.getSemantics() == &APFloat::IEEEsingle(), - "fpmath accuracy must have float type", &I); - Assert(Accuracy.isFiniteNonZero() && !Accuracy.isNegative(), - "fpmath accuracy not a positive number!", &I); + Check(&Accuracy.getSemantics() == &APFloat::IEEEsingle(), + "fpmath accuracy must have float type", &I); + Check(Accuracy.isFiniteNonZero() && !Accuracy.isNegative(), + "fpmath accuracy not a positive number!", &I); } else { - Assert(false, "invalid fpmath accuracy!", &I); + Check(false, "invalid fpmath accuracy!", &I); } } if (MDNode *Range = I.getMetadata(LLVMContext::MD_range)) { - Assert(isa(I) || isa(I) || isa(I), - "Ranges are only for loads, calls and invokes!", &I); + Check(isa(I) || isa(I) || isa(I), + "Ranges are only for loads, calls and invokes!", &I); visitRangeMetadata(I, Range, I.getType()); } if (I.hasMetadata(LLVMContext::MD_invariant_group)) { - Assert(isa(I) || isa(I), - "invariant.group metadata is only for loads and stores", &I); + Check(isa(I) || isa(I), + "invariant.group metadata is only for loads and stores", &I); } if (I.getMetadata(LLVMContext::MD_nonnull)) { - Assert(I.getType()->isPointerTy(), "nonnull applies only to pointer types", - &I); - Assert(isa(I), - "nonnull applies only to load instructions, use attributes" - " for calls or invokes", - &I); + Check(I.getType()->isPointerTy(), "nonnull applies only to pointer types", + &I); + Check(isa(I), + "nonnull applies only to load instructions, use attributes" + " for calls or invokes", + &I); } if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable)) @@ -4616,20 +4711,25 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *MD = I.getMetadata(LLVMContext::MD_alias_scope)) visitAliasScopeListMetadata(MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_access_group)) + visitAccessGroupMetadata(MD); + if (MDNode *AlignMD = I.getMetadata(LLVMContext::MD_align)) { - Assert(I.getType()->isPointerTy(), "align applies only to pointer types", - &I); - Assert(isa(I), "align applies only to load instructions, " - "use attributes for calls or invokes", &I); - Assert(AlignMD->getNumOperands() == 1, "align takes one operand!", &I); + Check(I.getType()->isPointerTy(), "align applies only to pointer types", + &I); + Check(isa(I), + "align applies only to load instructions, " + "use attributes for calls or invokes", + &I); + Check(AlignMD->getNumOperands() == 1, "align takes one operand!", &I); ConstantInt *CI = mdconst::dyn_extract(AlignMD->getOperand(0)); - Assert(CI && CI->getType()->isIntegerTy(64), - "align metadata value must be an i64!", &I); + Check(CI && CI->getType()->isIntegerTy(64), + "align metadata value must be an i64!", &I); uint64_t Align = CI->getZExtValue(); - Assert(isPowerOf2_64(Align), - "align metadata value 
must be a power of 2!", &I); - Assert(Align <= Value::MaximumAlignment, - "alignment is larger that implementation defined limit", &I); + Check(isPowerOf2_64(Align), "align metadata value must be a power of 2!", + &I); + Check(Align <= Value::MaximumAlignment, + "alignment is larger that implementation defined limit", &I); } if (MDNode *MD = I.getMetadata(LLVMContext::MD_prof)) @@ -4639,7 +4739,7 @@ void Verifier::visitInstruction(Instruction &I) { visitAnnotationMetadata(Annotation); if (MDNode *N = I.getDebugLoc().getAsMDNode()) { - AssertDI(isa(N), "invalid !dbg metadata attachment", &I, N); + CheckDI(isa(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); } @@ -4665,8 +4765,8 @@ void Verifier::visitInstruction(Instruction &I) { /// Allow intrinsics to be verified in different ways. void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Function *IF = Call.getCalledFunction(); - Assert(IF->isDeclaration(), "Intrinsic functions should never be defined!", - IF); + Check(IF->isDeclaration(), "Intrinsic functions should never be defined!", + IF); // Verify that the intrinsic prototype lines up with what the .td files // describe. @@ -4681,21 +4781,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { SmallVector ArgTys; Intrinsic::MatchIntrinsicTypesResult Res = Intrinsic::matchIntrinsicSignature(IFTy, TableRef, ArgTys); - Assert(Res != Intrinsic::MatchIntrinsicTypes_NoMatchRet, - "Intrinsic has incorrect return type!", IF); - Assert(Res != Intrinsic::MatchIntrinsicTypes_NoMatchArg, - "Intrinsic has incorrect argument type!", IF); + Check(Res != Intrinsic::MatchIntrinsicTypes_NoMatchRet, + "Intrinsic has incorrect return type!", IF); + Check(Res != Intrinsic::MatchIntrinsicTypes_NoMatchArg, + "Intrinsic has incorrect argument type!", IF); // Verify if the intrinsic call matches the vararg property. if (IsVarArg) - Assert(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef), - "Intrinsic was not defined with variable arguments!", IF); + Check(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef), + "Intrinsic was not defined with variable arguments!", IF); else - Assert(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef), - "Callsite was not defined with variable arguments!", IF); + Check(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef), + "Callsite was not defined with variable arguments!", IF); // All descriptors should be absorbed by now. - Assert(TableRef.empty(), "Intrinsic has too few arguments!", IF); + Check(TableRef.empty(), "Intrinsic has too few arguments!", IF); // Now that we have the intrinsic ID and the actual argument types (and we // know they are legal for the intrinsic!) get the intrinsic name through the @@ -4703,11 +4803,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // the name. const std::string ExpectedName = Intrinsic::getName(ID, ArgTys, IF->getParent(), IFTy); - Assert(ExpectedName == IF->getName(), - "Intrinsic name not mangled correctly for type arguments! " - "Should be: " + - ExpectedName, - IF); + Check(ExpectedName == IF->getName(), + "Intrinsic name not mangled correctly for type arguments! " + "Should be: " + + ExpectedName, + IF); // If the intrinsic takes MDNode arguments, verify that they are either global // or are local to *this* function. 
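// A minimal standalone sketch, not the LLVM API, of the name-mangling rule
// just checked: an overloaded intrinsic's name must encode its concrete type
// arguments, so the verifier rebuilds the expected name and compares it with
// the declared one. The "llvm.umax" base name and the '.'-joined suffix
// scheme here are illustrative, not LLVM's exact mangling.
#include <cassert>
#include <string>
#include <vector>

static std::string mangle(std::string Base,
                          const std::vector<std::string> &ArgTys) {
  for (const std::string &Ty : ArgTys)
    Base += "." + Ty; // append one suffix per overloaded type argument
  return Base;
}

int main() {
  assert(mangle("llvm.umax", {"i32"}) == "llvm.umax.i32");
  // A declaration whose name does not match the rebuilt name is rejected.
  assert(mangle("llvm.umax", {"i64"}) != "llvm.umax.i32");
}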
@@ -4715,8 +4815,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { if (auto *MD = dyn_cast(V)) visitMetadataAsValue(*MD, Call.getCaller()); if (auto *Const = dyn_cast(V)) - Assert(!Const->getType()->isX86_AMXTy(), - "const x86_amx is not allowed in argument!"); + Check(!Const->getType()->isX86_AMXTy(), + "const x86_amx is not allowed in argument!"); } switch (ID) { @@ -4724,36 +4824,35 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; case Intrinsic::assume: { for (auto &Elem : Call.bundle_op_infos()) { - Assert(Elem.Tag->getKey() == "ignore" || - Attribute::isExistingAttribute(Elem.Tag->getKey()), - "tags must be valid attribute names", Call); + Check(Elem.Tag->getKey() == "ignore" || + Attribute::isExistingAttribute(Elem.Tag->getKey()), + "tags must be valid attribute names", Call); Attribute::AttrKind Kind = Attribute::getAttrKindFromName(Elem.Tag->getKey()); unsigned ArgCount = Elem.End - Elem.Begin; if (Kind == Attribute::Alignment) { - Assert(ArgCount <= 3 && ArgCount >= 2, - "alignment assumptions should have 2 or 3 arguments", Call); - Assert(Call.getOperand(Elem.Begin)->getType()->isPointerTy(), - "first argument should be a pointer", Call); - Assert(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(), - "second argument should be an integer", Call); + Check(ArgCount <= 3 && ArgCount >= 2, + "alignment assumptions should have 2 or 3 arguments", Call); + Check(Call.getOperand(Elem.Begin)->getType()->isPointerTy(), + "first argument should be a pointer", Call); + Check(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(), + "second argument should be an integer", Call); if (ArgCount == 3) - Assert(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(), - "third argument should be an integer if present", Call); + Check(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(), + "third argument should be an integer if present", Call); return; } - Assert(ArgCount <= 2, "too many arguments", Call); + Check(ArgCount <= 2, "too many arguments", Call); if (Kind == Attribute::None) break; if (Attribute::isIntAttrKind(Kind)) { - Assert(ArgCount == 2, "this attribute should have 2 arguments", Call); - Assert(isa(Call.getOperand(Elem.Begin + 1)), - "the second argument should be a constant integral value", Call); + Check(ArgCount == 2, "this attribute should have 2 arguments", Call); + Check(isa(Call.getOperand(Elem.Begin + 1)), + "the second argument should be a constant integral value", Call); } else if (Attribute::canUseAsParamAttr(Kind)) { - Assert((ArgCount) == 1, "this attribute should have one argument", - Call); + Check((ArgCount) == 1, "this attribute should have one argument", Call); } else if (Attribute::canUseAsFnAttr(Kind)) { - Assert((ArgCount) == 0, "this attribute has no argument", Call); + Check((ArgCount) == 0, "this attribute has no argument", Call); } } break; @@ -4763,23 +4862,47 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { if (isa(InfoArg)) break; auto *GV = dyn_cast(InfoArg); - Assert(GV && GV->isConstant() && GV->hasDefinitiveInitializer(), - "info argument of llvm.coro.id must refer to an initialized " - "constant"); + Check(GV && GV->isConstant() && GV->hasDefinitiveInitializer(), + "info argument of llvm.coro.id must refer to an initialized " + "constant"); Constant *Init = GV->getInitializer(); - Assert(isa(Init) || isa(Init), - "info argument of llvm.coro.id must refer to either a struct or " - "an array"); + Check(isa(Init) || isa(Init), + "info argument of 
llvm.coro.id must refer to either a struct or "
+          "an array");
     break;
   }
+  case Intrinsic::fptrunc_round: {
+    // Check the rounding mode
+    Metadata *MD = nullptr;
+    auto *MAV = dyn_cast<MetadataAsValue>(Call.getOperand(1));
+    if (MAV)
+      MD = MAV->getMetadata();
+
+    Check(MD != nullptr, "missing rounding mode argument", Call);
+
+    Check(isa<MDString>(MD),
+          ("invalid value for llvm.fptrunc.round metadata operand"
+           " (the operand should be a string)"),
+          MD);
+
+    Optional<RoundingMode> RoundMode =
+        convertStrToRoundingMode(cast<MDString>(MD)->getString());
+    Check(RoundMode && *RoundMode != RoundingMode::Dynamic,
+          "unsupported rounding mode argument", Call);
+    break;
+  }
+#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID:
+#include "llvm/IR/VPIntrinsics.def"
+    visitVPIntrinsic(cast<VPIntrinsic>(Call));
+    break;
 #define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC)                        \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
     break;
   case Intrinsic::dbg_declare: // llvm.dbg.declare
-    Assert(isa<MetadataAsValue>(Call.getArgOperand(0)),
-           "invalid llvm.dbg.declare intrinsic call 1", Call);
+    Check(isa<MetadataAsValue>(Call.getArgOperand(0)),
+          "invalid llvm.dbg.declare intrinsic call 1", Call);
     visitDbgIntrinsic("declare", cast<DbgVariableIntrinsic>(Call));
     break;
   case Intrinsic::dbg_addr: // llvm.dbg.addr
@@ -4794,18 +4917,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy:
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
-  case Intrinsic::memset: {
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline: {
     const auto *MI = cast<MemIntrinsic>(&Call);
     auto IsValidAlignment = [&](unsigned Alignment) -> bool {
       return Alignment == 0 || isPowerOf2_32(Alignment);
     };
-    Assert(IsValidAlignment(MI->getDestAlignment()),
-           "alignment of arg 0 of memory intrinsic must be 0 or a power of 2",
-           Call);
+    Check(IsValidAlignment(MI->getDestAlignment()),
+          "alignment of arg 0 of memory intrinsic must be 0 or a power of 2",
+          Call);
     if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) {
-      Assert(IsValidAlignment(MTI->getSourceAlignment()),
-             "alignment of arg 1 of memory intrinsic must be 0 or a power of 2",
-             Call);
+      Check(IsValidAlignment(MTI->getSourceAlignment()),
+            "alignment of arg 1 of memory intrinsic must be 0 or a power of 2",
+            Call);
     }
 
     break;
@@ -4818,50 +4942,50 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     ConstantInt *ElementSizeCI =
         cast<ConstantInt>(AMI->getRawElementSizeInBytes());
     const APInt &ElementSizeVal = ElementSizeCI->getValue();
-    Assert(ElementSizeVal.isPowerOf2(),
-           "element size of the element-wise atomic memory intrinsic "
-           "must be a power of 2",
-           Call);
+    Check(ElementSizeVal.isPowerOf2(),
+          "element size of the element-wise atomic memory intrinsic "
+          "must be a power of 2",
+          Call);
 
     auto IsValidAlignment = [&](uint64_t Alignment) {
       return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
     };
     uint64_t DstAlignment = AMI->getDestAlignment();
-    Assert(IsValidAlignment(DstAlignment),
-           "incorrect alignment of the destination argument", Call);
+    Check(IsValidAlignment(DstAlignment),
+          "incorrect alignment of the destination argument", Call);
     if (const auto *AMT = dyn_cast<AtomicMemTransferInst>(AMI)) {
       uint64_t SrcAlignment = AMT->getSourceAlignment();
-      Assert(IsValidAlignment(SrcAlignment),
-             "incorrect alignment of the source argument", Call);
+      Check(IsValidAlignment(SrcAlignment),
+            "incorrect alignment of the source argument", Call);
     }
     break;
   }
   case Intrinsic::call_preallocated_setup: {
     auto *NumArgs = dyn_cast<ConstantInt>(Call.getArgOperand(0));
-    Assert(NumArgs != nullptr,
-           "llvm.call.preallocated.setup argument must be a constant");
+    Check(NumArgs != nullptr,
+          "llvm.call.preallocated.setup argument must be a constant");
     bool FoundCall = false;
     for (User *U : Call.users()) {
       auto *UseCall = dyn_cast<CallBase>(U);
-      Assert(UseCall != nullptr,
-             "Uses of llvm.call.preallocated.setup must be calls");
+      Check(UseCall != nullptr,
+            "Uses of llvm.call.preallocated.setup must be calls");
       const Function *Fn = UseCall->getCalledFunction();
       if (Fn && Fn->getIntrinsicID() == Intrinsic::call_preallocated_arg) {
         auto *AllocArgIndex = dyn_cast<ConstantInt>(UseCall->getArgOperand(1));
-        Assert(AllocArgIndex != nullptr,
-               "llvm.call.preallocated.alloc arg index must be a constant");
+        Check(AllocArgIndex != nullptr,
+              "llvm.call.preallocated.alloc arg index must be a constant");
         auto AllocArgIndexInt = AllocArgIndex->getValue();
-        Assert(AllocArgIndexInt.sge(0) &&
-                   AllocArgIndexInt.slt(NumArgs->getValue()),
-               "llvm.call.preallocated.alloc arg index must be between 0 and "
-               "corresponding "
-               "llvm.call.preallocated.setup's argument count");
+        Check(AllocArgIndexInt.sge(0) &&
+                  AllocArgIndexInt.slt(NumArgs->getValue()),
+              "llvm.call.preallocated.alloc arg index must be between 0 and "
+              "corresponding "
+              "llvm.call.preallocated.setup's argument count");
       } else if (Fn && Fn->getIntrinsicID() ==
                            Intrinsic::call_preallocated_teardown) {
         // nothing to do
       } else {
-        Assert(!FoundCall, "Can have at most one call corresponding to a "
-                           "llvm.call.preallocated.setup");
+        Check(!FoundCall, "Can have at most one call corresponding to a "
+                          "llvm.call.preallocated.setup");
         FoundCall = true;
         size_t NumPreallocatedArgs = 0;
         for (unsigned i = 0; i < UseCall->arg_size(); i++) {
@@ -4869,14 +4993,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
             ++NumPreallocatedArgs;
           }
         }
-        Assert(NumPreallocatedArgs != 0,
-               "cannot use preallocated intrinsics on a call without "
-               "preallocated arguments");
-        Assert(NumArgs->equalsInt(NumPreallocatedArgs),
-               "llvm.call.preallocated.setup arg size must be equal to number "
-               "of preallocated arguments "
-               "at call site",
-               Call, *UseCall);
+        Check(NumPreallocatedArgs != 0,
+              "cannot use preallocated intrinsics on a call without "
+              "preallocated arguments");
+        Check(NumArgs->equalsInt(NumPreallocatedArgs),
+              "llvm.call.preallocated.setup arg size must be equal to number "
+              "of preallocated arguments "
+              "at call site",
+              Call, *UseCall);
         // getOperandBundle() cannot be called if more than one of the operand
         // bundle exists. There is already a check elsewhere for this, so skip
        // here if we see more than one.
@@ -4886,33 +5010,33 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } auto PreallocatedBundle = UseCall->getOperandBundle(LLVMContext::OB_preallocated); - Assert(PreallocatedBundle, - "Use of llvm.call.preallocated.setup outside intrinsics " - "must be in \"preallocated\" operand bundle"); - Assert(PreallocatedBundle->Inputs.front().get() == &Call, - "preallocated bundle must have token from corresponding " - "llvm.call.preallocated.setup"); + Check(PreallocatedBundle, + "Use of llvm.call.preallocated.setup outside intrinsics " + "must be in \"preallocated\" operand bundle"); + Check(PreallocatedBundle->Inputs.front().get() == &Call, + "preallocated bundle must have token from corresponding " + "llvm.call.preallocated.setup"); } } break; } case Intrinsic::call_preallocated_arg: { auto *Token = dyn_cast(Call.getArgOperand(0)); - Assert(Token && Token->getCalledFunction()->getIntrinsicID() == - Intrinsic::call_preallocated_setup, - "llvm.call.preallocated.arg token argument must be a " - "llvm.call.preallocated.setup"); - Assert(Call.hasFnAttr(Attribute::Preallocated), - "llvm.call.preallocated.arg must be called with a \"preallocated\" " - "call site attribute"); + Check(Token && Token->getCalledFunction()->getIntrinsicID() == + Intrinsic::call_preallocated_setup, + "llvm.call.preallocated.arg token argument must be a " + "llvm.call.preallocated.setup"); + Check(Call.hasFnAttr(Attribute::Preallocated), + "llvm.call.preallocated.arg must be called with a \"preallocated\" " + "call site attribute"); break; } case Intrinsic::call_preallocated_teardown: { auto *Token = dyn_cast(Call.getArgOperand(0)); - Assert(Token && Token->getCalledFunction()->getIntrinsicID() == - Intrinsic::call_preallocated_setup, - "llvm.call.preallocated.teardown token argument must be a " - "llvm.call.preallocated.setup"); + Check(Token && Token->getCalledFunction()->getIntrinsicID() == + Intrinsic::call_preallocated_setup, + "llvm.call.preallocated.teardown token argument must be a " + "llvm.call.preallocated.setup"); break; } case Intrinsic::gcroot: @@ -4921,46 +5045,46 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { if (ID == Intrinsic::gcroot) { AllocaInst *AI = dyn_cast(Call.getArgOperand(0)->stripPointerCasts()); - Assert(AI, "llvm.gcroot parameter #1 must be an alloca.", Call); - Assert(isa(Call.getArgOperand(1)), - "llvm.gcroot parameter #2 must be a constant.", Call); + Check(AI, "llvm.gcroot parameter #1 must be an alloca.", Call); + Check(isa(Call.getArgOperand(1)), + "llvm.gcroot parameter #2 must be a constant.", Call); if (!AI->getAllocatedType()->isPointerTy()) { - Assert(!isa(Call.getArgOperand(1)), - "llvm.gcroot parameter #1 must either be a pointer alloca, " - "or argument #2 must be a non-null constant.", - Call); + Check(!isa(Call.getArgOperand(1)), + "llvm.gcroot parameter #1 must either be a pointer alloca, " + "or argument #2 must be a non-null constant.", + Call); } } - Assert(Call.getParent()->getParent()->hasGC(), - "Enclosing function does not use GC.", Call); + Check(Call.getParent()->getParent()->hasGC(), + "Enclosing function does not use GC.", Call); break; case Intrinsic::init_trampoline: - Assert(isa(Call.getArgOperand(1)->stripPointerCasts()), - "llvm.init_trampoline parameter #2 must resolve to a function.", - Call); + Check(isa(Call.getArgOperand(1)->stripPointerCasts()), + "llvm.init_trampoline parameter #2 must resolve to a function.", + Call); break; case Intrinsic::prefetch: - Assert(cast(Call.getArgOperand(1))->getZExtValue() < 
2 && - cast(Call.getArgOperand(2))->getZExtValue() < 4, - "invalid arguments to llvm.prefetch", Call); + Check(cast(Call.getArgOperand(1))->getZExtValue() < 2 && + cast(Call.getArgOperand(2))->getZExtValue() < 4, + "invalid arguments to llvm.prefetch", Call); break; case Intrinsic::stackprotector: - Assert(isa(Call.getArgOperand(1)->stripPointerCasts()), - "llvm.stackprotector parameter #2 must resolve to an alloca.", Call); + Check(isa(Call.getArgOperand(1)->stripPointerCasts()), + "llvm.stackprotector parameter #2 must resolve to an alloca.", Call); break; case Intrinsic::localescape: { BasicBlock *BB = Call.getParent(); - Assert(BB == &BB->getParent()->front(), - "llvm.localescape used outside of entry block", Call); - Assert(!SawFrameEscape, - "multiple calls to llvm.localescape in one function", Call); + Check(BB == &BB->getParent()->front(), + "llvm.localescape used outside of entry block", Call); + Check(!SawFrameEscape, "multiple calls to llvm.localescape in one function", + Call); for (Value *Arg : Call.args()) { if (isa(Arg)) continue; // Null values are allowed as placeholders. auto *AI = dyn_cast(Arg->stripPointerCasts()); - Assert(AI && AI->isStaticAlloca(), - "llvm.localescape only accepts static allocas", Call); + Check(AI && AI->isStaticAlloca(), + "llvm.localescape only accepts static allocas", Call); } FrameEscapeInfo[BB->getParent()].first = Call.arg_size(); SawFrameEscape = true; @@ -4969,10 +5093,10 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::localrecover: { Value *FnArg = Call.getArgOperand(0)->stripPointerCasts(); Function *Fn = dyn_cast(FnArg); - Assert(Fn && !Fn->isDeclaration(), - "llvm.localrecover first " - "argument must be function defined in this module", - Call); + Check(Fn && !Fn->isDeclaration(), + "llvm.localrecover first " + "argument must be function defined in this module", + Call); auto *IdxArg = cast(Call.getArgOperand(2)); auto &Entry = FrameEscapeInfo[Fn]; Entry.second = unsigned( @@ -4982,39 +5106,38 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::experimental_gc_statepoint: if (auto *CI = dyn_cast(&Call)) - Assert(!CI->isInlineAsm(), - "gc.statepoint support for inline assembly unimplemented", CI); - Assert(Call.getParent()->getParent()->hasGC(), - "Enclosing function does not use GC.", Call); + Check(!CI->isInlineAsm(), + "gc.statepoint support for inline assembly unimplemented", CI); + Check(Call.getParent()->getParent()->hasGC(), + "Enclosing function does not use GC.", Call); verifyStatepoint(Call); break; case Intrinsic::experimental_gc_result: { - Assert(Call.getParent()->getParent()->hasGC(), - "Enclosing function does not use GC.", Call); + Check(Call.getParent()->getParent()->hasGC(), + "Enclosing function does not use GC.", Call); // Are we tied to a statepoint properly? const auto *StatepointCall = dyn_cast(Call.getArgOperand(0)); const Function *StatepointFn = StatepointCall ? StatepointCall->getCalledFunction() : nullptr; - Assert(StatepointFn && StatepointFn->isDeclaration() && - StatepointFn->getIntrinsicID() == - Intrinsic::experimental_gc_statepoint, - "gc.result operand #1 must be from a statepoint", Call, - Call.getArgOperand(0)); - - // Assert that result type matches wrapped callee. 
- const Value *Target = StatepointCall->getArgOperand(2); - auto *PT = cast(Target->getType()); - auto *TargetFuncType = cast(PT->getPointerElementType()); - Assert(Call.getType() == TargetFuncType->getReturnType(), - "gc.result result type does not match wrapped callee", Call); + Check(StatepointFn && StatepointFn->isDeclaration() && + StatepointFn->getIntrinsicID() == + Intrinsic::experimental_gc_statepoint, + "gc.result operand #1 must be from a statepoint", Call, + Call.getArgOperand(0)); + + // Check that result type matches wrapped callee. + auto *TargetFuncType = + cast(StatepointCall->getParamElementType(2)); + Check(Call.getType() == TargetFuncType->getReturnType(), + "gc.result result type does not match wrapped callee", Call); break; } case Intrinsic::experimental_gc_relocate: { - Assert(Call.arg_size() == 3, "wrong number of arguments", Call); + Check(Call.arg_size() == 3, "wrong number of arguments", Call); - Assert(isa(Call.getType()->getScalarType()), - "gc.relocate must return a pointer or a vector of pointers", Call); + Check(isa(Call.getType()->getScalarType()), + "gc.relocate must return a pointer or a vector of pointers", Call); // Check that this relocate is correctly tied to the statepoint @@ -5027,19 +5150,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // Landingpad relocates should have only one predecessor with invoke // statepoint terminator - Assert(InvokeBB, "safepoints should have unique landingpads", - LandingPad->getParent()); - Assert(InvokeBB->getTerminator(), "safepoint block should be well formed", - InvokeBB); - Assert(isa(InvokeBB->getTerminator()), - "gc relocate should be linked to a statepoint", InvokeBB); + Check(InvokeBB, "safepoints should have unique landingpads", + LandingPad->getParent()); + Check(InvokeBB->getTerminator(), "safepoint block should be well formed", + InvokeBB); + Check(isa(InvokeBB->getTerminator()), + "gc relocate should be linked to a statepoint", InvokeBB); } else { // In all other cases relocate should be tied to the statepoint directly. // This covers relocates on a normal return path of invoke statepoint and // relocates of a call statepoint. auto Token = Call.getArgOperand(0); - Assert(isa(Token), - "gc relocate is incorrectly tied to the statepoint", Call, Token); + Check(isa(Token), + "gc relocate is incorrectly tied to the statepoint", Call, Token); } // Verify rest of the relocate arguments. @@ -5048,22 +5171,22 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // Both the base and derived must be piped through the safepoint. 
Value *Base = Call.getArgOperand(1); - Assert(isa(Base), - "gc.relocate operand #2 must be integer offset", Call); + Check(isa(Base), + "gc.relocate operand #2 must be integer offset", Call); Value *Derived = Call.getArgOperand(2); - Assert(isa(Derived), - "gc.relocate operand #3 must be integer offset", Call); + Check(isa(Derived), + "gc.relocate operand #3 must be integer offset", Call); const uint64_t BaseIndex = cast(Base)->getZExtValue(); const uint64_t DerivedIndex = cast(Derived)->getZExtValue(); // Check the bounds if (auto Opt = StatepointCall.getOperandBundle(LLVMContext::OB_gc_live)) { - Assert(BaseIndex < Opt->Inputs.size(), - "gc.relocate: statepoint base index out of bounds", Call); - Assert(DerivedIndex < Opt->Inputs.size(), - "gc.relocate: statepoint derived index out of bounds", Call); + Check(BaseIndex < Opt->Inputs.size(), + "gc.relocate: statepoint base index out of bounds", Call); + Check(DerivedIndex < Opt->Inputs.size(), + "gc.relocate: statepoint derived index out of bounds", Call); } // Relocated value must be either a pointer type or vector-of-pointer type, @@ -5071,15 +5194,15 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // relocated pointer. It can be casted to the correct type later if it's // desired. However, they must have the same address space and 'vectorness' GCRelocateInst &Relocate = cast(Call); - Assert(Relocate.getDerivedPtr()->getType()->isPtrOrPtrVectorTy(), - "gc.relocate: relocated value must be a gc pointer", Call); + Check(Relocate.getDerivedPtr()->getType()->isPtrOrPtrVectorTy(), + "gc.relocate: relocated value must be a gc pointer", Call); auto ResultType = Call.getType(); auto DerivedType = Relocate.getDerivedPtr()->getType(); - Assert(ResultType->isVectorTy() == DerivedType->isVectorTy(), - "gc.relocate: vector relocates to vector and pointer to pointer", - Call); - Assert( + Check(ResultType->isVectorTy() == DerivedType->isVectorTy(), + "gc.relocate: vector relocates to vector and pointer to pointer", + Call); + Check( ResultType->getPointerAddressSpace() == DerivedType->getPointerAddressSpace(), "gc.relocate: relocating a pointer shouldn't change its address space", @@ -5088,39 +5211,43 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::eh_exceptioncode: case Intrinsic::eh_exceptionpointer: { - Assert(isa(Call.getArgOperand(0)), - "eh.exceptionpointer argument must be a catchpad", Call); + Check(isa(Call.getArgOperand(0)), + "eh.exceptionpointer argument must be a catchpad", Call); break; } case Intrinsic::get_active_lane_mask: { - Assert(Call.getType()->isVectorTy(), "get_active_lane_mask: must return a " - "vector", Call); + Check(Call.getType()->isVectorTy(), + "get_active_lane_mask: must return a " + "vector", + Call); auto *ElemTy = Call.getType()->getScalarType(); - Assert(ElemTy->isIntegerTy(1), "get_active_lane_mask: element type is not " - "i1", Call); + Check(ElemTy->isIntegerTy(1), + "get_active_lane_mask: element type is not " + "i1", + Call); break; } case Intrinsic::masked_load: { - Assert(Call.getType()->isVectorTy(), "masked_load: must return a vector", - Call); + Check(Call.getType()->isVectorTy(), "masked_load: must return a vector", + Call); Value *Ptr = Call.getArgOperand(0); ConstantInt *Alignment = cast(Call.getArgOperand(1)); Value *Mask = Call.getArgOperand(2); Value *PassThru = Call.getArgOperand(3); - Assert(Mask->getType()->isVectorTy(), "masked_load: mask must be vector", - Call); - Assert(Alignment->getValue().isPowerOf2(), - "masked_load: 
alignment must be a power of 2", Call); + Check(Mask->getType()->isVectorTy(), "masked_load: mask must be vector", + Call); + Check(Alignment->getValue().isPowerOf2(), + "masked_load: alignment must be a power of 2", Call); PointerType *PtrTy = cast(Ptr->getType()); - Assert(PtrTy->isOpaqueOrPointeeTypeMatches(Call.getType()), - "masked_load: return must match pointer type", Call); - Assert(PassThru->getType() == Call.getType(), - "masked_load: pass through and return type must match", Call); - Assert(cast(Mask->getType())->getElementCount() == - cast(Call.getType())->getElementCount(), - "masked_load: vector mask must be same length as return", Call); + Check(PtrTy->isOpaqueOrPointeeTypeMatches(Call.getType()), + "masked_load: return must match pointer type", Call); + Check(PassThru->getType() == Call.getType(), + "masked_load: pass through and return type must match", Call); + Check(cast(Mask->getType())->getElementCount() == + cast(Call.getType())->getElementCount(), + "masked_load: vector mask must be same length as return", Call); break; } case Intrinsic::masked_store: { @@ -5128,61 +5255,61 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Value *Ptr = Call.getArgOperand(1); ConstantInt *Alignment = cast(Call.getArgOperand(2)); Value *Mask = Call.getArgOperand(3); - Assert(Mask->getType()->isVectorTy(), "masked_store: mask must be vector", - Call); - Assert(Alignment->getValue().isPowerOf2(), - "masked_store: alignment must be a power of 2", Call); + Check(Mask->getType()->isVectorTy(), "masked_store: mask must be vector", + Call); + Check(Alignment->getValue().isPowerOf2(), + "masked_store: alignment must be a power of 2", Call); PointerType *PtrTy = cast(Ptr->getType()); - Assert(PtrTy->isOpaqueOrPointeeTypeMatches(Val->getType()), - "masked_store: storee must match pointer type", Call); - Assert(cast(Mask->getType())->getElementCount() == - cast(Val->getType())->getElementCount(), - "masked_store: vector mask must be same length as value", Call); + Check(PtrTy->isOpaqueOrPointeeTypeMatches(Val->getType()), + "masked_store: storee must match pointer type", Call); + Check(cast(Mask->getType())->getElementCount() == + cast(Val->getType())->getElementCount(), + "masked_store: vector mask must be same length as value", Call); break; } case Intrinsic::masked_gather: { const APInt &Alignment = cast(Call.getArgOperand(1))->getValue(); - Assert(Alignment.isZero() || Alignment.isPowerOf2(), - "masked_gather: alignment must be 0 or a power of 2", Call); + Check(Alignment.isZero() || Alignment.isPowerOf2(), + "masked_gather: alignment must be 0 or a power of 2", Call); break; } case Intrinsic::masked_scatter: { const APInt &Alignment = cast(Call.getArgOperand(2))->getValue(); - Assert(Alignment.isZero() || Alignment.isPowerOf2(), - "masked_scatter: alignment must be 0 or a power of 2", Call); + Check(Alignment.isZero() || Alignment.isPowerOf2(), + "masked_scatter: alignment must be 0 or a power of 2", Call); break; } case Intrinsic::experimental_guard: { - Assert(isa(Call), "experimental_guard cannot be invoked", Call); - Assert(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1, - "experimental_guard must have exactly one " - "\"deopt\" operand bundle"); + Check(isa(Call), "experimental_guard cannot be invoked", Call); + Check(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1, + "experimental_guard must have exactly one " + "\"deopt\" operand bundle"); break; } case Intrinsic::experimental_deoptimize: { - Assert(isa(Call), "experimental_deoptimize 
cannot be invoked", - Call); - Assert(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1, - "experimental_deoptimize must have exactly one " - "\"deopt\" operand bundle"); - Assert(Call.getType() == Call.getFunction()->getReturnType(), - "experimental_deoptimize return type must match caller return type"); + Check(isa(Call), "experimental_deoptimize cannot be invoked", + Call); + Check(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1, + "experimental_deoptimize must have exactly one " + "\"deopt\" operand bundle"); + Check(Call.getType() == Call.getFunction()->getReturnType(), + "experimental_deoptimize return type must match caller return type"); if (isa(Call)) { auto *RI = dyn_cast(Call.getNextNode()); - Assert(RI, - "calls to experimental_deoptimize must be followed by a return"); + Check(RI, + "calls to experimental_deoptimize must be followed by a return"); if (!Call.getType()->isVoidTy() && RI) - Assert(RI->getReturnValue() == &Call, - "calls to experimental_deoptimize must be followed by a return " - "of the value computed by experimental_deoptimize"); + Check(RI->getReturnValue() == &Call, + "calls to experimental_deoptimize must be followed by a return " + "of the value computed by experimental_deoptimize"); } break; @@ -5197,15 +5324,15 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: { Type *ArgTy = Call.getArgOperand(0)->getType(); - Assert(ArgTy->isIntOrIntVectorTy() && ArgTy->isVectorTy(), - "Intrinsic has incorrect argument type!"); + Check(ArgTy->isIntOrIntVectorTy() && ArgTy->isVectorTy(), + "Intrinsic has incorrect argument type!"); break; } case Intrinsic::vector_reduce_fmax: case Intrinsic::vector_reduce_fmin: { Type *ArgTy = Call.getArgOperand(0)->getType(); - Assert(ArgTy->isFPOrFPVectorTy() && ArgTy->isVectorTy(), - "Intrinsic has incorrect argument type!"); + Check(ArgTy->isFPOrFPVectorTy() && ArgTy->isVectorTy(), + "Intrinsic has incorrect argument type!"); break; } case Intrinsic::vector_reduce_fadd: @@ -5213,8 +5340,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // Unlike the other reductions, the first argument is a start value. The // second argument is the vector to be reduced. 
Type *ArgTy = Call.getArgOperand(1)->getType(); - Assert(ArgTy->isFPOrFPVectorTy() && ArgTy->isVectorTy(), - "Intrinsic has incorrect argument type!"); + Check(ArgTy->isFPOrFPVectorTy() && ArgTy->isVectorTy(), + "Intrinsic has incorrect argument type!"); break; } case Intrinsic::smul_fix: @@ -5227,27 +5354,26 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::udiv_fix_sat: { Value *Op1 = Call.getArgOperand(0); Value *Op2 = Call.getArgOperand(1); - Assert(Op1->getType()->isIntOrIntVectorTy(), - "first operand of [us][mul|div]_fix[_sat] must be an int type or " - "vector of ints"); - Assert(Op2->getType()->isIntOrIntVectorTy(), - "second operand of [us][mul|div]_fix[_sat] must be an int type or " - "vector of ints"); + Check(Op1->getType()->isIntOrIntVectorTy(), + "first operand of [us][mul|div]_fix[_sat] must be an int type or " + "vector of ints"); + Check(Op2->getType()->isIntOrIntVectorTy(), + "second operand of [us][mul|div]_fix[_sat] must be an int type or " + "vector of ints"); auto *Op3 = cast(Call.getArgOperand(2)); - Assert(Op3->getType()->getBitWidth() <= 32, - "third argument of [us][mul|div]_fix[_sat] must fit within 32 bits"); + Check(Op3->getType()->getBitWidth() <= 32, + "third argument of [us][mul|div]_fix[_sat] must fit within 32 bits"); if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat || ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) { - Assert( - Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), - "the scale of s[mul|div]_fix[_sat] must be less than the width of " - "the operands"); + Check(Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), + "the scale of s[mul|div]_fix[_sat] must be less than the width of " + "the operands"); } else { - Assert(Op3->getZExtValue() <= Op1->getType()->getScalarSizeInBits(), - "the scale of u[mul|div]_fix[_sat] must be less than or equal " - "to the width of the operands"); + Check(Op3->getZExtValue() <= Op1->getType()->getScalarSizeInBits(), + "the scale of u[mul|div]_fix[_sat] must be less than or equal " + "to the width of the operands"); } break; } @@ -5257,22 +5383,22 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::llrint: { Type *ValTy = Call.getArgOperand(0)->getType(); Type *ResultTy = Call.getType(); - Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), - "Intrinsic does not support vectors", &Call); + Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &Call); break; } case Intrinsic::bswap: { Type *Ty = Call.getType(); unsigned Size = Ty->getScalarSizeInBits(); - Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call); + Check(Size % 16 == 0, "bswap must be an even number of bytes", &Call); break; } case Intrinsic::invariant_start: { ConstantInt *InvariantSize = dyn_cast(Call.getArgOperand(0)); - Assert(InvariantSize && - (!InvariantSize->isNegative() || InvariantSize->isMinusOne()), - "invariant_start parameter must be -1, 0 or a positive number", - &Call); + Check(InvariantSize && + (!InvariantSize->isNegative() || InvariantSize->isMinusOne()), + "invariant_start parameter must be -1, 0 or a positive number", + &Call); break; } case Intrinsic::matrix_multiply: @@ -5333,27 +5459,29 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { llvm_unreachable("unexpected intrinsic"); } - Assert(ResultTy->getElementType()->isIntegerTy() || - ResultTy->getElementType()->isFloatingPointTy(), - "Result type must be an integer or 
floating-point type!", IF); + Check(ResultTy->getElementType()->isIntegerTy() || + ResultTy->getElementType()->isFloatingPointTy(), + "Result type must be an integer or floating-point type!", IF); if (Op0ElemTy) - Assert(ResultTy->getElementType() == Op0ElemTy, - "Vector element type mismatch of the result and first operand " - "vector!", IF); + Check(ResultTy->getElementType() == Op0ElemTy, + "Vector element type mismatch of the result and first operand " + "vector!", + IF); if (Op1ElemTy) - Assert(ResultTy->getElementType() == Op1ElemTy, - "Vector element type mismatch of the result and second operand " - "vector!", IF); + Check(ResultTy->getElementType() == Op1ElemTy, + "Vector element type mismatch of the result and second operand " + "vector!", + IF); - Assert(cast(ResultTy)->getNumElements() == - NumRows->getZExtValue() * NumColumns->getZExtValue(), - "Result of a matrix operation does not fit in the returned vector!"); + Check(cast(ResultTy)->getNumElements() == + NumRows->getZExtValue() * NumColumns->getZExtValue(), + "Result of a matrix operation does not fit in the returned vector!"); if (Stride) - Assert(Stride->getZExtValue() >= NumRows->getZExtValue(), - "Stride must be greater or equal than the number of rows!", IF); + Check(Stride->getZExtValue() >= NumRows->getZExtValue(), + "Stride must be greater or equal than the number of rows!", IF); break; } @@ -5366,25 +5494,25 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { if (Attrs.hasFnAttr(Attribute::VScaleRange)) KnownMinNumElements *= Attrs.getFnAttrs().getVScaleRangeMin(); } - Assert((Idx < 0 && std::abs(Idx) <= KnownMinNumElements) || - (Idx >= 0 && Idx < KnownMinNumElements), - "The splice index exceeds the range [-VL, VL-1] where VL is the " - "known minimum number of elements in the vector. For scalable " - "vectors the minimum number of elements is determined from " - "vscale_range.", - &Call); + Check((Idx < 0 && std::abs(Idx) <= KnownMinNumElements) || + (Idx >= 0 && Idx < KnownMinNumElements), + "The splice index exceeds the range [-VL, VL-1] where VL is the " + "known minimum number of elements in the vector. 
For scalable " + "vectors the minimum number of elements is determined from " + "vscale_range.", + &Call); break; } case Intrinsic::experimental_stepvector: { VectorType *VecTy = dyn_cast(Call.getType()); - Assert(VecTy && VecTy->getScalarType()->isIntegerTy() && - VecTy->getScalarSizeInBits() >= 8, - "experimental_stepvector only supported for vectors of integers " - "with a bitwidth of at least 8.", - &Call); + Check(VecTy && VecTy->getScalarType()->isIntegerTy() && + VecTy->getScalarSizeInBits() >= 8, + "experimental_stepvector only supported for vectors of integers " + "with a bitwidth of at least 8.", + &Call); break; } - case Intrinsic::experimental_vector_insert: { + case Intrinsic::vector_insert: { Value *Vec = Call.getArgOperand(0); Value *SubVec = Call.getArgOperand(1); Value *Idx = Call.getArgOperand(2); @@ -5395,27 +5523,26 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { ElementCount VecEC = VecTy->getElementCount(); ElementCount SubVecEC = SubVecTy->getElementCount(); - Assert(VecTy->getElementType() == SubVecTy->getElementType(), - "experimental_vector_insert parameters must have the same element " - "type.", - &Call); - Assert(IdxN % SubVecEC.getKnownMinValue() == 0, - "experimental_vector_insert index must be a constant multiple of " - "the subvector's known minimum vector length."); + Check(VecTy->getElementType() == SubVecTy->getElementType(), + "vector_insert parameters must have the same element " + "type.", + &Call); + Check(IdxN % SubVecEC.getKnownMinValue() == 0, + "vector_insert index must be a constant multiple of " + "the subvector's known minimum vector length."); // If this insertion is not the 'mixed' case where a fixed vector is // inserted into a scalable vector, ensure that the insertion of the // subvector does not overrun the parent vector. if (VecEC.isScalable() == SubVecEC.isScalable()) { - Assert( - IdxN < VecEC.getKnownMinValue() && - IdxN + SubVecEC.getKnownMinValue() <= VecEC.getKnownMinValue(), - "subvector operand of experimental_vector_insert would overrun the " - "vector being inserted into."); + Check(IdxN < VecEC.getKnownMinValue() && + IdxN + SubVecEC.getKnownMinValue() <= VecEC.getKnownMinValue(), + "subvector operand of vector_insert would overrun the " + "vector being inserted into."); } break; } - case Intrinsic::experimental_vector_extract: { + case Intrinsic::vector_extract: { Value *Vec = Call.getArgOperand(0); Value *Idx = Call.getArgOperand(1); unsigned IdxN = cast(Idx)->getZExtValue(); @@ -5426,21 +5553,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { ElementCount VecEC = VecTy->getElementCount(); ElementCount ResultEC = ResultTy->getElementCount(); - Assert(ResultTy->getElementType() == VecTy->getElementType(), - "experimental_vector_extract result must have the same element " - "type as the input vector.", - &Call); - Assert(IdxN % ResultEC.getKnownMinValue() == 0, - "experimental_vector_extract index must be a constant multiple of " - "the result type's known minimum vector length."); + Check(ResultTy->getElementType() == VecTy->getElementType(), + "vector_extract result must have the same element " + "type as the input vector.", + &Call); + Check(IdxN % ResultEC.getKnownMinValue() == 0, + "vector_extract index must be a constant multiple of " + "the result type's known minimum vector length."); // If this extraction is not the 'mixed' case where a fixed vector is is // extracted from a scalable vector, ensure that the extraction does not // overrun the parent vector. 
     if (VecEC.isScalable() == ResultEC.isScalable()) {
-      Assert(IdxN < VecEC.getKnownMinValue() &&
-                 IdxN + ResultEC.getKnownMinValue() <= VecEC.getKnownMinValue(),
-             "experimental_vector_extract would overrun.");
+      Check(IdxN < VecEC.getKnownMinValue() &&
+                IdxN + ResultEC.getKnownMinValue() <= VecEC.getKnownMinValue(),
+            "vector_extract would overrun.");
     }
     break;
   }
@@ -5449,11 +5576,24 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     break;
   }
   case Intrinsic::preserve_array_access_index:
-  case Intrinsic::preserve_struct_access_index: {
-    Type *ElemTy = Call.getAttributes().getParamElementType(0);
-    Assert(ElemTy,
-           "Intrinsic requires elementtype attribute on first argument.",
-           &Call);
+  case Intrinsic::preserve_struct_access_index:
+  case Intrinsic::aarch64_ldaxr:
+  case Intrinsic::aarch64_ldxr:
+  case Intrinsic::arm_ldaex:
+  case Intrinsic::arm_ldrex: {
+    Type *ElemTy = Call.getParamElementType(0);
+    Check(ElemTy, "Intrinsic requires elementtype attribute on first argument.",
+          &Call);
+    break;
+  }
+  case Intrinsic::aarch64_stlxr:
+  case Intrinsic::aarch64_stxr:
+  case Intrinsic::arm_stlex:
+  case Intrinsic::arm_strex: {
+    Type *ElemTy = Call.getAttributes().getParamElementType(1);
+    Check(ElemTy,
+          "Intrinsic requires elementtype attribute on second argument.",
+          &Call);
     break;
   }
   };
@@ -5478,6 +5618,101 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
   return nullptr;
 }
 
+void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
+  if (auto *VPCast = dyn_cast<VPCastIntrinsic>(&VPI)) {
+    auto *RetTy = cast<VectorType>(VPCast->getType());
+    auto *ValTy = cast<VectorType>(VPCast->getOperand(0)->getType());
+    Check(RetTy->getElementCount() == ValTy->getElementCount(),
+          "VP cast intrinsic first argument and result vector lengths must be "
+          "equal",
+          *VPCast);
+
+    switch (VPCast->getIntrinsicID()) {
+    default:
+      llvm_unreachable("Unknown VP cast intrinsic");
+    case Intrinsic::vp_trunc:
+      Check(RetTy->isIntOrIntVectorTy() && ValTy->isIntOrIntVectorTy(),
+            "llvm.vp.trunc intrinsic first argument and result element type "
+            "must be integer",
+            *VPCast);
+      Check(RetTy->getScalarSizeInBits() < ValTy->getScalarSizeInBits(),
+            "llvm.vp.trunc intrinsic the bit size of first argument must be "
+            "larger than the bit size of the return type",
+            *VPCast);
+      break;
+    case Intrinsic::vp_zext:
+    case Intrinsic::vp_sext:
+      Check(RetTy->isIntOrIntVectorTy() && ValTy->isIntOrIntVectorTy(),
+            "llvm.vp.zext or llvm.vp.sext intrinsic first argument and result "
+            "element type must be integer",
+            *VPCast);
+      Check(RetTy->getScalarSizeInBits() > ValTy->getScalarSizeInBits(),
+            "llvm.vp.zext or llvm.vp.sext intrinsic the bit size of first "
+            "argument must be smaller than the bit size of the return type",
+            *VPCast);
+      break;
+    case Intrinsic::vp_fptoui:
+    case Intrinsic::vp_fptosi:
+      Check(
+          RetTy->isIntOrIntVectorTy() && ValTy->isFPOrFPVectorTy(),
+          "llvm.vp.fptoui or llvm.vp.fptosi intrinsic first argument element "
+          "type must be floating-point and result element type must be integer",
+          *VPCast);
+      break;
+    case Intrinsic::vp_uitofp:
+    case Intrinsic::vp_sitofp:
+      Check(
+          RetTy->isFPOrFPVectorTy() && ValTy->isIntOrIntVectorTy(),
+          "llvm.vp.uitofp or llvm.vp.sitofp intrinsic first argument element "
+          "type must be integer and result element type must be floating-point",
+          *VPCast);
+      break;
+    case Intrinsic::vp_fptrunc:
+      Check(RetTy->isFPOrFPVectorTy() && ValTy->isFPOrFPVectorTy(),
+            "llvm.vp.fptrunc intrinsic first argument and result element type "
+            "must be floating-point",
+            *VPCast);
+      Check(RetTy->getScalarSizeInBits() < ValTy->getScalarSizeInBits(),
+            "llvm.vp.fptrunc intrinsic the bit size of first argument must be "
+            "larger than the bit size of the return type",
+            *VPCast);
+      break;
+    case Intrinsic::vp_fpext:
+      Check(RetTy->isFPOrFPVectorTy() && ValTy->isFPOrFPVectorTy(),
+            "llvm.vp.fpext intrinsic first argument and result element type "
+            "must be floating-point",
+            *VPCast);
+      Check(RetTy->getScalarSizeInBits() > ValTy->getScalarSizeInBits(),
+            "llvm.vp.fpext intrinsic the bit size of first argument must be "
+            "smaller than the bit size of the return type",
+            *VPCast);
+      break;
+    case Intrinsic::vp_ptrtoint:
+      Check(RetTy->isIntOrIntVectorTy() && ValTy->isPtrOrPtrVectorTy(),
+            "llvm.vp.ptrtoint intrinsic first argument element type must be "
+            "pointer and result element type must be integer",
+            *VPCast);
+      break;
+    case Intrinsic::vp_inttoptr:
+      Check(RetTy->isPtrOrPtrVectorTy() && ValTy->isIntOrIntVectorTy(),
+            "llvm.vp.inttoptr intrinsic first argument element type must be "
+            "integer and result element type must be pointer",
+            *VPCast);
+      break;
+    }
+  }
+  if (VPI.getIntrinsicID() == Intrinsic::vp_fcmp) {
+    auto Pred = cast<VPCmpIntrinsic>(&VPI)->getPredicate();
+    Check(CmpInst::isFPPredicate(Pred),
+          "invalid predicate for VP FP comparison intrinsic", &VPI);
+  }
+  if (VPI.getIntrinsicID() == Intrinsic::vp_icmp) {
+    auto Pred = cast<VPCmpIntrinsic>(&VPI)->getPredicate();
+    Check(CmpInst::isIntPredicate(Pred),
+          "invalid predicate for VP integer comparison intrinsic", &VPI);
+  }
+}
+
 void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
   unsigned NumOperands;
   bool HasRoundingMD;
@@ -5495,16 +5730,16 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
   // Compare intrinsics carry an extra predicate metadata operand.
if (isa(FPI)) NumOperands += 1; - Assert((FPI.arg_size() == NumOperands), - "invalid arguments for constrained FP intrinsic", &FPI); + Check((FPI.arg_size() == NumOperands), + "invalid arguments for constrained FP intrinsic", &FPI); switch (FPI.getIntrinsicID()) { case Intrinsic::experimental_constrained_lrint: case Intrinsic::experimental_constrained_llrint: { Type *ValTy = FPI.getArgOperand(0)->getType(); Type *ResultTy = FPI.getType(); - Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), - "Intrinsic does not support vectors", &FPI); + Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); } break; @@ -5512,16 +5747,16 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { case Intrinsic::experimental_constrained_llround: { Type *ValTy = FPI.getArgOperand(0)->getType(); Type *ResultTy = FPI.getType(); - Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), - "Intrinsic does not support vectors", &FPI); + Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); break; } case Intrinsic::experimental_constrained_fcmp: case Intrinsic::experimental_constrained_fcmps: { auto Pred = cast(&FPI)->getPredicate(); - Assert(CmpInst::isFPPredicate(Pred), - "invalid predicate for constrained FP comparison intrinsic", &FPI); + Check(CmpInst::isFPPredicate(Pred), + "invalid predicate for constrained FP comparison intrinsic", &FPI); break; } @@ -5529,21 +5764,21 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { case Intrinsic::experimental_constrained_fptoui: { Value *Operand = FPI.getArgOperand(0); uint64_t NumSrcElem = 0; - Assert(Operand->getType()->isFPOrFPVectorTy(), - "Intrinsic first argument must be floating point", &FPI); + Check(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic first argument must be floating point", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { NumSrcElem = cast(OperandT)->getNumElements(); } Operand = &FPI; - Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), - "Intrinsic first argument and result disagree on vector use", &FPI); - Assert(Operand->getType()->isIntOrIntVectorTy(), - "Intrinsic result must be an integer", &FPI); + Check((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Check(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic result must be an integer", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - Assert(NumSrcElem == cast(OperandT)->getNumElements(), - "Intrinsic first argument and result vector lengths must be equal", - &FPI); + Check(NumSrcElem == cast(OperandT)->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); } } break; @@ -5552,21 +5787,21 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { case Intrinsic::experimental_constrained_uitofp: { Value *Operand = FPI.getArgOperand(0); uint64_t NumSrcElem = 0; - Assert(Operand->getType()->isIntOrIntVectorTy(), - "Intrinsic first argument must be integer", &FPI); + Check(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic first argument must be integer", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { NumSrcElem = cast(OperandT)->getNumElements(); } Operand = &FPI; - Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), - "Intrinsic first argument and result disagree on vector use", &FPI); - Assert(Operand->getType()->isFPOrFPVectorTy(), - "Intrinsic 
result must be a floating point", &FPI); + Check((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Check(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic result must be a floating point", &FPI); if (auto *OperandT = dyn_cast(Operand->getType())) { - Assert(NumSrcElem == cast(OperandT)->getNumElements(), - "Intrinsic first argument and result vector lengths must be equal", - &FPI); + Check(NumSrcElem == cast(OperandT)->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); } } break; @@ -5576,26 +5811,26 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { Type *OperandTy = Operand->getType(); Value *Result = &FPI; Type *ResultTy = Result->getType(); - Assert(OperandTy->isFPOrFPVectorTy(), - "Intrinsic first argument must be FP or FP vector", &FPI); - Assert(ResultTy->isFPOrFPVectorTy(), - "Intrinsic result must be FP or FP vector", &FPI); - Assert(OperandTy->isVectorTy() == ResultTy->isVectorTy(), - "Intrinsic first argument and result disagree on vector use", &FPI); + Check(OperandTy->isFPOrFPVectorTy(), + "Intrinsic first argument must be FP or FP vector", &FPI); + Check(ResultTy->isFPOrFPVectorTy(), + "Intrinsic result must be FP or FP vector", &FPI); + Check(OperandTy->isVectorTy() == ResultTy->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); if (OperandTy->isVectorTy()) { - Assert(cast(OperandTy)->getNumElements() == - cast(ResultTy)->getNumElements(), - "Intrinsic first argument and result vector lengths must be equal", - &FPI); + Check(cast(OperandTy)->getNumElements() == + cast(ResultTy)->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); } if (FPI.getIntrinsicID() == Intrinsic::experimental_constrained_fptrunc) { - Assert(OperandTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits(), - "Intrinsic first argument's type must be larger than result type", - &FPI); + Check(OperandTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits(), + "Intrinsic first argument's type must be larger than result type", + &FPI); } else { - Assert(OperandTy->getScalarSizeInBits() < ResultTy->getScalarSizeInBits(), - "Intrinsic first argument's type must be smaller than result type", - &FPI); + Check(OperandTy->getScalarSizeInBits() < ResultTy->getScalarSizeInBits(), + "Intrinsic first argument's type must be smaller than result type", + &FPI); } } break; @@ -5609,25 +5844,25 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { // match the specification in the intrinsic call table. Thus, no // argument type check is needed here. - Assert(FPI.getExceptionBehavior().hasValue(), - "invalid exception behavior argument", &FPI); + Check(FPI.getExceptionBehavior().has_value(), + "invalid exception behavior argument", &FPI); if (HasRoundingMD) { - Assert(FPI.getRoundingMode().hasValue(), - "invalid rounding mode argument", &FPI); + Check(FPI.getRoundingMode().has_value(), "invalid rounding mode argument", + &FPI); } } void Verifier::visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII) { auto *MD = DII.getRawLocation(); - AssertDI(isa(MD) || isa(MD) || - (isa(MD) && !cast(MD)->getNumOperands()), - "invalid llvm.dbg." + Kind + " intrinsic address/value", &DII, MD); - AssertDI(isa(DII.getRawVariable()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DII, - DII.getRawVariable()); - AssertDI(isa(DII.getRawExpression()), - "invalid llvm.dbg." 
+ Kind + " intrinsic expression", &DII, - DII.getRawExpression()); + CheckDI(isa(MD) || isa(MD) || + (isa(MD) && !cast(MD)->getNumOperands()), + "invalid llvm.dbg." + Kind + " intrinsic address/value", &DII, MD); + CheckDI(isa(DII.getRawVariable()), + "invalid llvm.dbg." + Kind + " intrinsic variable", &DII, + DII.getRawVariable()); + CheckDI(isa(DII.getRawExpression()), + "invalid llvm.dbg." + Kind + " intrinsic expression", &DII, + DII.getRawExpression()); // Ignore broken !dbg attachments; they're checked elsewhere. if (MDNode *N = DII.getDebugLoc().getAsMDNode()) @@ -5640,29 +5875,30 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII) { // The scopes for variables and !dbg attachments must agree. DILocalVariable *Var = DII.getVariable(); DILocation *Loc = DII.getDebugLoc(); - AssertDI(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", - &DII, BB, F); + CheckDI(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", + &DII, BB, F); DISubprogram *VarSP = getSubprogram(Var->getRawScope()); DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); if (!VarSP || !LocSP) return; // Broken scope chains are checked elsewhere. - AssertDI(VarSP == LocSP, "mismatched subprogram between llvm.dbg." + Kind + - " variable and !dbg attachment", - &DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); + CheckDI(VarSP == LocSP, + "mismatched subprogram between llvm.dbg." + Kind + + " variable and !dbg attachment", + &DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc, + Loc->getScope()->getSubprogram()); // This check is redundant with one in visitLocalVariable(). - AssertDI(isType(Var->getRawType()), "invalid type ref", Var, - Var->getRawType()); + CheckDI(isType(Var->getRawType()), "invalid type ref", Var, + Var->getRawType()); verifyFnArgs(DII); } void Verifier::visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI) { - AssertDI(isa(DLI.getRawLabel()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DLI, - DLI.getRawLabel()); + CheckDI(isa(DLI.getRawLabel()), + "invalid llvm.dbg." + Kind + " intrinsic variable", &DLI, + DLI.getRawLabel()); // Ignore broken !dbg attachments; they're checked elsewhere. if (MDNode *N = DLI.getDebugLoc().getAsMDNode()) @@ -5675,18 +5911,19 @@ void Verifier::visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI) { // The scopes for variables and !dbg attachments must agree. DILabel *Label = DLI.getLabel(); DILocation *Loc = DLI.getDebugLoc(); - Assert(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", - &DLI, BB, F); + Check(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", &DLI, + BB, F); DISubprogram *LabelSP = getSubprogram(Label->getRawScope()); DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); if (!LabelSP || !LocSP) return; - AssertDI(LabelSP == LocSP, "mismatched subprogram between llvm.dbg." + Kind + - " label and !dbg attachment", - &DLI, BB, F, Label, Label->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); + CheckDI(LabelSP == LocSP, + "mismatched subprogram between llvm.dbg." 
+ Kind + + " label and !dbg attachment", + &DLI, BB, F, Label, Label->getScope()->getSubprogram(), Loc, + Loc->getScope()->getSubprogram()); } void Verifier::verifyFragmentExpression(const DbgVariableIntrinsic &I) { @@ -5726,9 +5963,9 @@ void Verifier::verifyFragmentExpression(const DIVariable &V, unsigned FragSize = Fragment.SizeInBits; unsigned FragOffset = Fragment.OffsetInBits; - AssertDI(FragSize + FragOffset <= *VarSize, - "fragment is larger than or outside of variable", Desc, &V); - AssertDI(FragSize != *VarSize, "fragment covers entire variable", Desc, &V); + CheckDI(FragSize + FragOffset <= *VarSize, + "fragment is larger than or outside of variable", Desc, &V); + CheckDI(FragSize != *VarSize, "fragment covers entire variable", Desc, &V); } void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { @@ -5743,7 +5980,7 @@ void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { return; DILocalVariable *Var = I.getVariable(); - AssertDI(Var, "dbg intrinsic without variable"); + CheckDI(Var, "dbg intrinsic without variable"); unsigned ArgNo = Var->getArg(); if (!ArgNo) @@ -5756,8 +5993,8 @@ void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { auto *Prev = DebugFnArgs[ArgNo - 1]; DebugFnArgs[ArgNo - 1] = Var; - AssertDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I, - Prev, Var); + CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I, + Prev, Var); } void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) { @@ -5767,7 +6004,7 @@ void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) { if (!E || !E->isValid()) return; - AssertDI(!E->isEntryValue(), "Entry values are only allowed in MIR", &I); + CheckDI(!E->isEntryValue(), "Entry values are only allowed in MIR", &I); } void Verifier::verifyCompileUnits() { @@ -5781,7 +6018,7 @@ void Verifier::verifyCompileUnits() { if (CUs) Listed.insert(CUs->op_begin(), CUs->op_end()); for (auto *CU : CUVisited) - AssertDI(Listed.count(CU), "DICompileUnit not listed in llvm.dbg.cu", CU); + CheckDI(Listed.count(CU), "DICompileUnit not listed in llvm.dbg.cu", CU); CUVisited.clear(); } @@ -5791,10 +6028,10 @@ void Verifier::verifyDeoptimizeCallingConvs() { const Function *First = DeoptimizeDeclarations[0]; for (auto *F : makeArrayRef(DeoptimizeDeclarations).slice(1)) { - Assert(First->getCallingConv() == F->getCallingConv(), - "All llvm.experimental.deoptimize declarations must have the same " - "calling convention", - First, F); + Check(First->getCallingConv() == F->getCallingConv(), + "All llvm.experimental.deoptimize declarations must have the same " + "calling convention", + First, F); } } @@ -5802,39 +6039,39 @@ void Verifier::verifyAttachedCallBundle(const CallBase &Call, const OperandBundleUse &BU) { FunctionType *FTy = Call.getFunctionType(); - Assert((FTy->getReturnType()->isPointerTy() || - (Call.doesNotReturn() && FTy->getReturnType()->isVoidTy())), - "a call with operand bundle \"clang.arc.attachedcall\" must call a " - "function returning a pointer or a non-returning function that has a " - "void return type", - Call); + Check((FTy->getReturnType()->isPointerTy() || + (Call.doesNotReturn() && FTy->getReturnType()->isVoidTy())), + "a call with operand bundle \"clang.arc.attachedcall\" must call a " + "function returning a pointer or a non-returning function that has a " + "void return type", + Call); - Assert(BU.Inputs.size() == 1 && isa(BU.Inputs.front()), - "operand bundle \"clang.arc.attachedcall\" requires one function as " - "an argument", - Call); + 
Check(BU.Inputs.size() == 1 && isa(BU.Inputs.front()), + "operand bundle \"clang.arc.attachedcall\" requires one function as " + "an argument", + Call); auto *Fn = cast(BU.Inputs.front()); Intrinsic::ID IID = Fn->getIntrinsicID(); if (IID) { - Assert((IID == Intrinsic::objc_retainAutoreleasedReturnValue || - IID == Intrinsic::objc_unsafeClaimAutoreleasedReturnValue), - "invalid function argument", Call); + Check((IID == Intrinsic::objc_retainAutoreleasedReturnValue || + IID == Intrinsic::objc_unsafeClaimAutoreleasedReturnValue), + "invalid function argument", Call); } else { StringRef FnName = Fn->getName(); - Assert((FnName == "objc_retainAutoreleasedReturnValue" || - FnName == "objc_unsafeClaimAutoreleasedReturnValue"), - "invalid function argument", Call); + Check((FnName == "objc_retainAutoreleasedReturnValue" || + FnName == "objc_unsafeClaimAutoreleasedReturnValue"), + "invalid function argument", Call); } } void Verifier::verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F) { - bool HasSource = F.getSource().hasValue(); + bool HasSource = F.getSource().has_value(); if (!HasSourceDebugInfo.count(&U)) HasSourceDebugInfo[&U] = HasSource; - AssertDI(HasSource == HasSourceDebugInfo[&U], - "inconsistent use of embedded source"); + CheckDI(HasSource == HasSourceDebugInfo[&U], + "inconsistent use of embedded source"); } void Verifier::verifyNoAliasScopeDecl() { @@ -5847,16 +6084,15 @@ void Verifier::verifyNoAliasScopeDecl() { "Not a llvm.experimental.noalias.scope.decl ?"); const auto *ScopeListMV = dyn_cast( II->getOperand(Intrinsic::NoAliasScopeDeclScopeArg)); - Assert(ScopeListMV != nullptr, - "llvm.experimental.noalias.scope.decl must have a MetadataAsValue " - "argument", - II); + Check(ScopeListMV != nullptr, + "llvm.experimental.noalias.scope.decl must have a MetadataAsValue " + "argument", + II); const auto *ScopeListMD = dyn_cast(ScopeListMV->getMetadata()); - Assert(ScopeListMD != nullptr, "!id.scope.list must point to an MDNode", - II); - Assert(ScopeListMD->getNumOperands() == 1, - "!id.scope.list must point to a list with a single scope", II); + Check(ScopeListMD != nullptr, "!id.scope.list must point to an MDNode", II); + Check(ScopeListMD->getNumOperands() == 1, + "!id.scope.list must point to a list with a single scope", II); visitAliasScopeListMetadata(ScopeListMD); } @@ -5899,10 +6135,10 @@ void Verifier::verifyNoAliasScopeDecl() { for (auto *I : llvm::make_range(ItCurrent, ItNext)) for (auto *J : llvm::make_range(ItCurrent, ItNext)) if (I != J) - Assert(!DT.dominates(I, J), - "llvm.experimental.noalias.scope.decl dominates another one " - "with the same scope", - I); + Check(!DT.dominates(I, J), + "llvm.experimental.noalias.scope.decl dominates another one " + "with the same scope", + I); ItCurrent = ItNext; } } @@ -5995,7 +6231,7 @@ template void TBAAVerifier::CheckFailed(Tys &&... Args) { return Diagnostic->CheckFailed(Args...); } -#define AssertTBAA(C, ...) \ +#define CheckTBAA(C, ...) \ do { \ if (!(C)) { \ CheckFailed(__VA_ARGS__); \ @@ -6185,7 +6421,7 @@ MDNode *TBAAVerifier::getFieldNodeFromTBAABaseNode(Instruction &I, // Scalar nodes have only one possible "field" -- their parent in the access // hierarchy. Offset must be zero at this point, but our caller is supposed - // to Assert that. + // to check that. 
if (BaseNode->getNumOperands() == 2) return cast(BaseNode->getOperand(1)); @@ -6227,17 +6463,17 @@ static bool isNewFormatTBAATypeNode(llvm::MDNode *Type) { } bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { - AssertTBAA(isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || - isa(I), - "This instruction shall not have a TBAA access tag!", &I); + CheckTBAA(isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I), + "This instruction shall not have a TBAA access tag!", &I); bool IsStructPathTBAA = isa(MD->getOperand(0)) && MD->getNumOperands() >= 3; - AssertTBAA( - IsStructPathTBAA, - "Old-style TBAA is no longer allowed, use struct-path TBAA instead", &I); + CheckTBAA(IsStructPathTBAA, + "Old-style TBAA is no longer allowed, use struct-path TBAA instead", + &I); MDNode *BaseNode = dyn_cast_or_null(MD->getOperand(0)); MDNode *AccessType = dyn_cast_or_null(MD->getOperand(1)); @@ -6245,18 +6481,18 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { bool IsNewFormat = isNewFormatTBAATypeNode(AccessType); if (IsNewFormat) { - AssertTBAA(MD->getNumOperands() == 4 || MD->getNumOperands() == 5, - "Access tag metadata must have either 4 or 5 operands", &I, MD); + CheckTBAA(MD->getNumOperands() == 4 || MD->getNumOperands() == 5, + "Access tag metadata must have either 4 or 5 operands", &I, MD); } else { - AssertTBAA(MD->getNumOperands() < 5, - "Struct tag metadata must have either 3 or 4 operands", &I, MD); + CheckTBAA(MD->getNumOperands() < 5, + "Struct tag metadata must have either 3 or 4 operands", &I, MD); } // Check the access size field. if (IsNewFormat) { auto *AccessSizeNode = mdconst::dyn_extract_or_null( MD->getOperand(3)); - AssertTBAA(AccessSizeNode, "Access size field must be a constant", &I, MD); + CheckTBAA(AccessSizeNode, "Access size field must be a constant", &I, MD); } // Check the immutability flag. 
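The operand-count rules in the hunks above describe the struct-path access-tag layout: !{base, access-type, offset} in the old format, plus a size operand and an optional immutability flag in the new format. A hedged sketch of a decoder for the fields common to both formats, using the same Metadata accessors the verifier uses (TBAAAccessTag and decodeAccessTag are hypothetical names, not patch code):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Metadata.h"
    #include <cstdint>

    using namespace llvm;

    // Decodes the common prefix of a struct-path TBAA access tag:
    //   old format: !{base, access-type, offset[, immutable]}
    //   new format: !{base, access-type, offset, size[, immutable]}
    struct TBAAAccessTag {
      MDNode *BaseType;
      MDNode *AccessType;
      uint64_t Offset;
    };

    static bool decodeAccessTag(const MDNode *MD, TBAAAccessTag &Out) {
      if (MD->getNumOperands() < 3)
        return false;
      Out.BaseType = dyn_cast_or_null<MDNode>(MD->getOperand(0));
      Out.AccessType = dyn_cast_or_null<MDNode>(MD->getOperand(1));
      // mdconst::dyn_extract_or_null is the same helper the verifier
      // applies to the offset operand.
      auto *OffsetCI =
          mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(2));
      if (!Out.BaseType || !Out.AccessType || !OffsetCI)
        return false;
      Out.Offset = OffsetCI->getZExtValue();
      return true;
    }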
@@ -6264,28 +6500,28 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { if (MD->getNumOperands() == ImmutabilityFlagOpNo + 1) { auto *IsImmutableCI = mdconst::dyn_extract_or_null<ConstantInt>( MD->getOperand(ImmutabilityFlagOpNo)); - AssertTBAA(IsImmutableCI, - "Immutability tag on struct tag metadata must be a constant", - &I, MD); - AssertTBAA( + CheckTBAA(IsImmutableCI, + "Immutability tag on struct tag metadata must be a constant", &I, + MD); + CheckTBAA( IsImmutableCI->isZero() || IsImmutableCI->isOne(), "Immutability part of the struct tag metadata must be either 0 or 1", &I, MD); } - AssertTBAA(BaseNode && AccessType, - "Malformed struct tag metadata: base and access-type " - "should be non-null and point to Metadata nodes", - &I, MD, BaseNode, AccessType); + CheckTBAA(BaseNode && AccessType, + "Malformed struct tag metadata: base and access-type " + "should be non-null and point to Metadata nodes", + &I, MD, BaseNode, AccessType); if (!IsNewFormat) { - AssertTBAA(isValidScalarTBAANode(AccessType), - "Access type node must be a valid scalar type", &I, MD, - AccessType); + CheckTBAA(isValidScalarTBAANode(AccessType), + "Access type node must be a valid scalar type", &I, MD, + AccessType); } auto *OffsetCI = mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(2)); - AssertTBAA(OffsetCI, "Offset must be constant integer", &I, MD); + CheckTBAA(OffsetCI, "Offset must be constant integer", &I, MD); APInt Offset = OffsetCI->getValue(); bool SeenAccessTypeInPath = false; @@ -6313,21 +6549,21 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { SeenAccessTypeInPath |= BaseNode == AccessType; if (isValidScalarTBAANode(BaseNode) || BaseNode == AccessType) - AssertTBAA(Offset == 0, "Offset not zero at the point of scalar access", - &I, MD, &Offset); + CheckTBAA(Offset == 0, "Offset not zero at the point of scalar access", + &I, MD, &Offset); - AssertTBAA(BaseNodeBitWidth == Offset.getBitWidth() || - (BaseNodeBitWidth == 0 && Offset == 0) || - (IsNewFormat && BaseNodeBitWidth == ~0u), - "Access bit-width not the same as description bit-width", &I, MD, - BaseNodeBitWidth, Offset.getBitWidth()); + CheckTBAA(BaseNodeBitWidth == Offset.getBitWidth() || + (BaseNodeBitWidth == 0 && Offset == 0) || + (IsNewFormat && BaseNodeBitWidth == ~0u), + "Access bit-width not the same as description bit-width", &I, MD, + BaseNodeBitWidth, Offset.getBitWidth()); if (IsNewFormat && SeenAccessTypeInPath) break; } - AssertTBAA(SeenAccessTypeInPath, "Did not see access type in access path!", - &I, MD); + CheckTBAA(SeenAccessTypeInPath, "Did not see access type in access path!", &I, + MD); return true; } diff --git a/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp index cb72f57f7bde..13801cd2cbc0 100644 --- a/llvm/lib/InterfaceStub/ELFObjHandler.cpp +++ b/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ -17,7 +17,6 @@ #include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Process.h" using llvm::object::ELFObjectFile; @@ -195,7 +194,7 @@ public: for (const std::string &Lib : Stub.NeededLibs) DynStr.Content.add(Lib); if (Stub.SoName) - DynStr.Content.add(Stub.SoName.getValue()); + DynStr.Content.add(*Stub.SoName); std::vector<OutputSection<ELFT> *> Sections = {&DynSym, &DynStr, &DynTab, &ShStrTab}; @@ -218,7 +217,8 @@ public: // time as long as it is not SHN_UNDEF. Set shndx to 1, which // points to ".dynsym". uint16_t Shndx = Sym.Undefined ?
SHN_UNDEF : 1; - DynSym.Content.add(DynStr.Content.getOffset(Sym.Name), Sym.Size, Bind, + uint64_t Size = Sym.Size.value_or(0); + DynSym.Content.add(DynStr.Content.getOffset(Sym.Name), Size, Bind, convertIFSSymbolTypeToELF(Sym.Type), 0, Shndx); } DynSym.Size = DynSym.Content.getSize(); @@ -226,11 +226,12 @@ public: // Poplulate dynamic table. size_t DynSymIndex = DynTab.Content.addAddr(DT_SYMTAB, 0); size_t DynStrIndex = DynTab.Content.addAddr(DT_STRTAB, 0); + DynTab.Content.addValue(DT_STRSZ, DynSym.Size); for (const std::string &Lib : Stub.NeededLibs) DynTab.Content.addValue(DT_NEEDED, DynStr.Content.getOffset(Lib)); if (Stub.SoName) DynTab.Content.addValue(DT_SONAME, - DynStr.Content.getOffset(Stub.SoName.getValue())); + DynStr.Content.getOffset(*Stub.SoName)); DynTab.Size = DynTab.Content.getSize(); // Calculate sections' addresses and offsets. uint64_t CurrentOffset = sizeof(Elf_Ehdr); @@ -249,8 +250,7 @@ public: fillStrTabShdr(ShStrTab); // Finish initializing the ELF header. - initELFHeader<ELFT>(ElfHeader, - static_cast<uint16_t>(Stub.Target.Arch.getValue())); + initELFHeader<ELFT>(ElfHeader, static_cast<uint16_t>(*Stub.Target.Arch)); ElfHeader.e_shstrndx = ShStrTab.Index; ElfHeader.e_shnum = LastSection->Index + 1; ElfHeader.e_shoff = @@ -334,6 +334,89 @@ private: write(Data + shdrOffset(Sec), Sec.Shdr); } }; + +/// This function takes an error, and appends a string of text to the end of +/// that error. Since "appending" to an Error isn't supported behavior of an +/// Error, this function technically creates a new error with the combined +/// message and consumes the old error. +/// +/// @param Err Source error. +/// @param After Text to append at the end of Err's error message. +Error appendToError(Error Err, StringRef After) { + std::string Message; + raw_string_ostream Stream(Message); + Stream << Err; + Stream << " " << After; + consumeError(std::move(Err)); + return createError(Stream.str()); +} + +template <class ELFT> class DynSym { + using Elf_Shdr_Range = typename ELFT::ShdrRange; + using Elf_Shdr = typename ELFT::Shdr; + +public: + static Expected<DynSym> create(const ELFFile<ELFT> &ElfFile, + const DynamicEntries &DynEnt) { + Expected<Elf_Shdr_Range> Shdrs = ElfFile.sections(); + if (!Shdrs) + return Shdrs.takeError(); + return DynSym(ElfFile, DynEnt, *Shdrs); + } + + Expected<const uint8_t *> getDynSym() { + if (DynSymHdr) + return ElfFile.base() + DynSymHdr->sh_offset; + return getDynamicData(DynEnt.DynSymAddr, "dynamic symbol table"); + } + + Expected<StringRef> getDynStr() { + if (DynSymHdr) + return ElfFile.getStringTableForSymtab(*DynSymHdr, Shdrs); + Expected<const uint8_t *> DataOrErr = getDynamicData( + DynEnt.StrTabAddr, "dynamic string table", DynEnt.StrSize); + if (!DataOrErr) + return DataOrErr.takeError(); + return StringRef(reinterpret_cast<const char *>(*DataOrErr), + DynEnt.StrSize); + } + +private: + DynSym(const ELFFile<ELFT> &ElfFile, const DynamicEntries &DynEnt, + Elf_Shdr_Range Shdrs) + : ElfFile(ElfFile), DynEnt(DynEnt), Shdrs(Shdrs), + DynSymHdr(findDynSymHdr()) {} + + const Elf_Shdr *findDynSymHdr() { + for (const Elf_Shdr &Sec : Shdrs) + if (Sec.sh_type == SHT_DYNSYM) { + // If multiple .dynsym are present, use the first one.
+ // This behavior aligns with llvm::object::ELFFile<ELFT>::getDynSymtabSize() + return &Sec; } + return nullptr; + } + + Expected<const uint8_t *> getDynamicData(uint64_t EntAddr, StringRef Name, + uint64_t Size = 0) { + Expected<const uint8_t *> SecPtr = ElfFile.toMappedAddr(EntAddr); + if (!SecPtr) + return appendToError( + SecPtr.takeError(), + ("when locating " + Name + " section contents").str()); + Expected<const uint8_t *> SecEndPtr = ElfFile.toMappedAddr(EntAddr + Size); + if (!SecEndPtr) + return appendToError( + SecEndPtr.takeError(), + ("when locating " + Name + " section contents").str()); + return *SecPtr; + } + + const ELFFile<ELFT> &ElfFile; + const DynamicEntries &DynEnt; + Elf_Shdr_Range Shdrs; + const Elf_Shdr *DynSymHdr; +}; } // end anonymous namespace /// This function behaves similarly to StringRef::substr(), but attempts to @@ -353,22 +436,6 @@ static Expected<StringRef> terminatedSubstr(StringRef Str, size_t Offset) { return Str.substr(Offset, StrLen); } -/// This function takes an error, and appends a string of text to the end of -/// that error. Since "appending" to an Error isn't supported behavior of an -/// Error, this function technically creates a new error with the combined -/// message and consumes the old error. -/// -/// @param Err Source error. -/// @param After Text to append at the end of Err's error message. -Error appendToError(Error Err, StringRef After) { - std::string Message; - raw_string_ostream Stream(Message); - Stream << Err; - Stream << " " << After; - consumeError(std::move(Err)); - return createError(Stream.str()); -} - /// This function populates a DynamicEntries struct using an ELFT::DynRange. /// After populating the struct, the members are validated with /// some basic correctness checks. @@ -425,7 +492,7 @@ static Error populateDynamic(DynamicEntries &Dyn, return createError( "Couldn't locate dynamic symbol table (no DT_SYMTAB entry)"); } - if (Dyn.SONameOffset.hasValue() && *Dyn.SONameOffset >= Dyn.StrSize) { + if (Dyn.SONameOffset && *Dyn.SONameOffset >= Dyn.StrSize) { return createStringError(object_error::parse_failed, "DT_SONAME string offset (0x%016" PRIx64 ") outside of dynamic string table", @@ -507,7 +574,6 @@ template <class ELFT> static Expected<std::unique_ptr<IFSStub>> buildStub(const ELFObjectFile<ELFT> &ElfObj) { using Elf_Dyn_Range = typename ELFT::DynRange; - using Elf_Phdr_Range = typename ELFT::PhdrRange; using Elf_Sym_Range = typename ELFT::SymRange; using Elf_Sym = typename ELFT::Sym; std::unique_ptr<IFSStub> DestStub = std::make_unique<IFSStub>(); @@ -518,25 +584,19 @@ buildStub(const ELFObjectFile<ELFT> &ElfObj) { return DynTable.takeError(); } - // Fetch program headers. - Expected<Elf_Phdr_Range> PHdrs = ElfFile.program_headers(); - if (!PHdrs) { - return PHdrs.takeError(); - } - // Collect relevant .dynamic entries. DynamicEntries DynEnt; if (Error Err = populateDynamic(DynEnt, *DynTable)) return std::move(Err); + Expected<DynSym<ELFT>> EDynSym = DynSym<ELFT>::create(ElfFile, DynEnt); + if (!EDynSym) + return EDynSym.takeError(); - // Get pointer to in-memory location of .dynstr section. - Expected<const uint8_t *> DynStrPtr = ElfFile.toMappedAddr(DynEnt.StrTabAddr); - if (!DynStrPtr) - return appendToError(DynStrPtr.takeError(), - "when locating .dynstr section contents"); + Expected<StringRef> EDynStr = EDynSym->getDynStr(); + if (!EDynStr) + return EDynStr.takeError(); - StringRef DynStr(reinterpret_cast<const char *>(DynStrPtr.get()), - DynEnt.StrSize); + StringRef DynStr = *EDynStr; // Populate Arch from ELF header.
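Aside: the appendToError helper moved above exists because llvm::Error is move-only and consume-once, so context cannot be appended in place; the helper renders the old message, marks the original error handled, and wraps the combined text in a fresh one. A minimal standalone sketch of the same pattern, with createStringError standing in for the file-local createError helper (not shown in this hunk):

```cpp
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Consume Err and return a new Error carrying Err's message plus After.
static Error appendToError(Error Err, StringRef After) {
  std::string Message;
  raw_string_ostream Stream(Message);
  Stream << Err << " " << After; // render the original message, then the suffix
  consumeError(std::move(Err));  // the old error must still be marked handled
  return createStringError(inconvertibleErrorCode(), "%s",
                           Stream.str().c_str());
}

int main() {
  Error E = appendToError(
      createStringError(inconvertibleErrorCode(), "address not mapped"),
      "when locating dynamic symbol table section contents");
  logAllUnhandledErrors(std::move(E), errs()); // prints the combined message
  return 1;
}
```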
DestStub->Target.Arch = static_cast<IFSArch>(ElfFile.getHeader().e_machine); @@ -547,7 +607,7 @@ buildStub(const ELFObjectFile<ELFT> &ElfObj) { DestStub->Target.ObjectFormat = "ELF"; // Populate SoName from .dynamic entries and dynamic string table. - if (DynEnt.SONameOffset.hasValue()) { + if (DynEnt.SONameOffset) { Expected<StringRef> NameOrErr = terminatedSubstr(DynStr, *DynEnt.SONameOffset); if (!NameOrErr) { @@ -572,8 +632,7 @@ buildStub(const ELFObjectFile<ELFT> &ElfObj) { return SymCount.takeError(); if (*SymCount > 0) { // Get pointer to in-memory location of .dynsym section. - Expected<const uint8_t *> DynSymPtr = - ElfFile.toMappedAddr(DynEnt.DynSymAddr); + Expected<const uint8_t *> DynSymPtr = EDynSym->getDynSym(); if (!DynSymPtr) return appendToError(DynSymPtr.takeError(), "when locating .dynsym section contents"); diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp index 4ccbb18ca04a..71189e79360e 100644 --- a/llvm/lib/InterfaceStub/IFSHandler.cpp +++ b/llvm/lib/InterfaceStub/IFSHandler.cpp @@ -7,14 +7,17 @@ //===-----------------------------------------------------------------------===/ #include "llvm/InterfaceStub/IFSHandler.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/InterfaceStub/IFSStub.h" #include "llvm/Support/Error.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/YAMLTraits.h" +#include <functional> using namespace llvm; using namespace llvm::ifs; @@ -115,11 +118,12 @@ template <> struct MappingTraits<IFSSymbol> { IO.mapRequired("Type", Symbol.Type); // The need for symbol size depends on the symbol type. if (Symbol.Type == IFSSymbolType::NoType) { - IO.mapOptional("Size", Symbol.Size, (uint64_t)0); - } else if (Symbol.Type == IFSSymbolType::Func) { - Symbol.Size = 0; - } else { - IO.mapRequired("Size", Symbol.Size); + // Size is None, so we are reading it in, or it is non 0 so we + // should emit it.
+ if (!Symbol.Size || *Symbol.Size) + IO.mapOptional("Size", Symbol.Size); + } else if (Symbol.Type != IFSSymbolType::Func) { + IO.mapOptional("Size", Symbol.Size); } IO.mapOptional("Undefined", Symbol.Undefined, false); IO.mapOptional("Weak", Symbol.Weak, false); @@ -189,7 +193,7 @@ Expected<std::unique_ptr<IFSStub>> ifs::readIFSFromBuffer(StringRef Buf) { std::make_error_code(std::errc::invalid_argument)); if (Stub->Target.ArchString) { Stub->Target.Arch = - ELF::convertArchNameToEMachine(Stub->Target.ArchString.getValue()); + ELF::convertArchNameToEMachine(*Stub->Target.ArchString); } return std::move(Stub); } @@ -262,7 +266,7 @@ Error ifs::validateIFSTarget(IFSStub &Stub, bool ParseTriple) { ValidationEC); } if (ParseTriple) { - IFSTarget TargetFromTriple = parseTriple(Stub.Target.Triple.getValue()); + IFSTarget TargetFromTriple = parseTriple(*Stub.Target.Triple); Stub.Target.Arch = TargetFromTriple.Arch; Stub.Target.BitWidth = TargetFromTriple.BitWidth; Stub.Target.Endianness = TargetFromTriple.Endianness; @@ -328,12 +332,28 @@ void ifs::stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch, } } -void ifs::stripIFSUndefinedSymbols(IFSStub &Stub) { - for (auto Iter = Stub.Symbols.begin(); Iter != Stub.Symbols.end();) { - if (Iter->Undefined) { - Iter = Stub.Symbols.erase(Iter); - } else { - Iter++; - } +Error ifs::filterIFSSyms(IFSStub &Stub, bool StripUndefined, + const std::vector<std::string> &Exclude) { + std::function<bool(const IFSSymbol &)> Filter = [](const IFSSymbol &) { + return false; + }; + + if (StripUndefined) { + Filter = [Filter](const IFSSymbol &Sym) { + return Sym.Undefined || Filter(Sym); + }; + } + + for (StringRef Glob : Exclude) { + Expected<llvm::GlobPattern> PatternOrErr = llvm::GlobPattern::create(Glob); + if (!PatternOrErr) + return PatternOrErr.takeError(); + Filter = [Pattern = *PatternOrErr, Filter](const IFSSymbol &Sym) { + return Pattern.match(Sym.Name) || Filter(Sym); + }; } + + llvm::erase_if(Stub.Symbols, Filter); + + return Error::success(); } diff --git a/llvm/lib/InterfaceStub/IFSStub.cpp b/llvm/lib/InterfaceStub/IFSStub.cpp index 1ce7a66869b8..f043f7e9e383 100644 --- a/llvm/lib/InterfaceStub/IFSStub.cpp +++ b/llvm/lib/InterfaceStub/IFSStub.cpp @@ -8,7 +8,7 @@ #include "llvm/InterfaceStub/IFSStub.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; using namespace llvm::ifs; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 418aad26fdd6..a9e04ba760ca 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -134,7 +134,6 @@ void llvm::computeLTOCacheKey( AddUnsigned(Conf.CGOptLevel); AddUnsigned(Conf.CGFileType); AddUnsigned(Conf.OptLevel); - AddUnsigned(Conf.UseNewPM); AddUnsigned(Conf.Freestanding); AddString(Conf.OptPipeline); AddString(Conf.AAPipeline); @@ -640,11 +639,11 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, if (!LTOInfo) return LTOInfo.takeError(); - if (EnableSplitLTOUnit.hasValue()) { + if (EnableSplitLTOUnit) { // If only some modules were split, flag this in the index so that // we can skip or error on optimizations that need consistently split // modules (whole program devirt and lower type tests). - if (EnableSplitLTOUnit.getValue() != LTOInfo->EnableSplitLTOUnit) + if (*EnableSplitLTOUnit != LTOInfo->EnableSplitLTOUnit) ThinLTO.CombinedIndex.setPartiallySplitLTOUnits(); } else EnableSplitLTOUnit = LTOInfo->EnableSplitLTOUnit; @@ -820,9 +819,10 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // For now they aren't reported correctly by ModuleSymbolTable.
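The filterIFSSyms rewrite above builds one composite predicate by repeatedly wrapping the previous std::function in a lambda that captures it by value, then erases all matches in a single llvm::erase_if pass. A standalone sketch of that chaining pattern, using exact-name matches where the patch uses GlobPattern (the symbol names here are made up):

```cpp
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

int main() {
  // Start from a predicate that excludes nothing.
  std::function<bool(const std::string &)> Filter =
      [](const std::string &) { return false; };

  // Wrap it once per exclusion rule; each layer captures the previous
  // layer by value, so the final Filter runs the whole chain of checks.
  for (std::string Banned : {"free", "malloc"})
    Filter = [Banned, Prev = Filter](const std::string &Name) {
      return Name == Banned || Prev(Name);
    };

  std::vector<std::string> Syms = {"malloc", "printf", "free"};
  Syms.erase(std::remove_if(Syms.begin(), Syms.end(), Filter), Syms.end());
  // Syms now holds only "printf", just as llvm::erase_if(Stub.Symbols, Filter)
  // keeps only the symbols no rule matched.
  return 0;
}
```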
auto &CommonRes = RegularLTO.Commons[std::string(Sym.getIRName())]; CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize()); - MaybeAlign SymAlign(Sym.getCommonAlignment()); - if (SymAlign) - CommonRes.Align = max(*SymAlign, CommonRes.Align); + if (uint32_t SymAlignValue = Sym.getCommonAlignment()) { + const Align SymAlign(SymAlignValue); + CommonRes.Align = std::max(SymAlign, CommonRes.Align.valueOrOne()); + } CommonRes.Prevailing |= Res.Prevailing; } } @@ -885,8 +885,7 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, Keep.push_back(GV); } - return RegularLTO.Mover->move(std::move(Mod.M), Keep, - [](GlobalValue &, IRMover::ValueAdder) {}, + return RegularLTO.Mover->move(std::move(Mod.M), Keep, nullptr, /* IsPerformingImport */ false); } @@ -1162,14 +1161,18 @@ protected: const Config &Conf; ModuleSummaryIndex &CombinedIndex; const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries; + lto::IndexWriteCallback OnWrite; + bool ShouldEmitImportsFiles; public: ThinBackendProc(const Config &Conf, ModuleSummaryIndex &CombinedIndex, - const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries) + const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, + lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles) : Conf(Conf), CombinedIndex(CombinedIndex), - ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries) {} + ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries), + OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {} - virtual ~ThinBackendProc() {} + virtual ~ThinBackendProc() = default; virtual Error start( unsigned Task, BitcodeModule BM, const FunctionImporter::ImportMapTy &ImportList, @@ -1178,6 +1181,30 @@ public: MapVector<StringRef, BitcodeModule> &ModuleMap) = 0; virtual Error wait() = 0; virtual unsigned getThreadCount() = 0; + + // Write sharded indices and (optionally) imports to disk + Error emitFiles(const FunctionImporter::ImportMapTy &ImportList, + llvm::StringRef ModulePath, + const std::string &NewModulePath) { + std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex; + std::error_code EC; + gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, + ImportList, ModuleToSummariesForIndex); + + raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC, + sys::fs::OpenFlags::OF_None); + if (EC) + return errorCodeToError(EC); + writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex); + + if (ShouldEmitImportsFiles) { + EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports", + ModuleToSummariesForIndex); + if (EC) + return errorCodeToError(EC); + } + return Error::success(); + } }; namespace { @@ -1191,15 +1218,19 @@ class InProcessThinBackend : public ThinBackendProc { Optional<Error> Err; std::mutex ErrMu; + bool ShouldEmitIndexFiles; + public: InProcessThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, - AddStreamFn AddStream, FileCache Cache) - : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), + AddStreamFn AddStream, FileCache Cache, lto::IndexWriteCallback OnWrite, + bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles) + : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries, + OnWrite, ShouldEmitImportsFiles), BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)), - Cache(std::move(Cache)) { + Cache(std::move(Cache)), ShouldEmitIndexFiles(ShouldEmitIndexFiles) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); @@ -1228,6 +1259,11 @@ public: auto ModuleID =
BM.getModuleIdentifier(); + if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + if (!Cache || !CombinedIndex.modulePaths().count(ModuleID) || all_of(CombinedIndex.getModuleHash(ModuleID), [](uint32_t V) { return V == 0; })) @@ -1286,6 +1322,9 @@ public: }, BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList), std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap)); + + if (OnWrite) + OnWrite(std::string(ModulePath)); return Error::success(); } @@ -1303,13 +1342,16 @@ public: }; } // end anonymous namespace -ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) { +ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, + lto::IndexWriteCallback OnWrite, + bool ShouldEmitIndexFiles, + bool ShouldEmitImportsFiles) { return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream, FileCache Cache) { return std::make_unique<InProcessThinBackend>( Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream, - Cache); + Cache, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles); }; } @@ -1336,9 +1378,7 @@ std::string lto::getThinLTOOutputFile(const std::string &Path, namespace { class WriteIndexesThinBackend : public ThinBackendProc { std::string OldPrefix, NewPrefix; - bool ShouldEmitImportsFiles; raw_fd_ostream *LinkedObjectsFile; - lto::IndexWriteCallback OnWrite; public: WriteIndexesThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles, raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite) - : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), + : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries, + OnWrite, ShouldEmitImportsFiles), OldPrefix(OldPrefix), NewPrefix(NewPrefix), - ShouldEmitImportsFiles(ShouldEmitImportsFiles), - LinkedObjectsFile(LinkedObjectsFile), OnWrite(OnWrite) {} + LinkedObjectsFile(LinkedObjectsFile) {} Error start( unsigned Task, BitcodeModule BM, @@ -1364,23 +1404,8 @@ public: if (LinkedObjectsFile) *LinkedObjectsFile << NewModulePath << '\n'; - std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex; - gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, - ImportList, ModuleToSummariesForIndex); - - std::error_code EC; - raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC, - sys::fs::OpenFlags::OF_None); - if (EC) - return errorCodeToError(EC); - writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex); - - if (ShouldEmitImportsFiles) { - EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports", - ModuleToSummariesForIndex); - if (EC) - return errorCodeToError(EC); - } + if (auto E = emitFiles(ImportList, ModulePath, NewModulePath)) + return E; if (OnWrite) OnWrite(std::string(ModulePath)); @@ -1621,9 +1646,8 @@ lto::setupStatsFile(StringRef StatsFilename) { // is to sort them per size so that the largest module get schedule as soon as // possible. This is purely a compile-time optimization.
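The comment above carries the whole scheduling argument: with a fixed-size thread pool, dispatching the largest bitcode buffers first shrinks the idle tail at the end of the run. A toy illustration of the ordering that the next hunk implements with llvm::seq and llvm::sort (the sizes are invented; the real comparator uses getBuffer().size()):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::size_t> Sizes = {120, 4096, 30, 2048}; // pretend modules

  std::vector<int> Order(Sizes.size());
  std::iota(Order.begin(), Order.end(), 0); // 0, 1, 2, 3
  std::sort(Order.begin(), Order.end(),
            [&](int L, int R) { return Sizes[L] > Sizes[R]; });

  for (int I : Order) // prints 1, 3, 0, 2: biggest jobs dispatched first
    std::printf("module %d (%zu bytes)\n", I, Sizes[I]);
  return 0;
}
```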
std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) { - std::vector<int> ModulesOrdering; - ModulesOrdering.resize(R.size()); - std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0); + auto Seq = llvm::seq<int>(0, R.size()); + std::vector<int> ModulesOrdering(Seq.begin(), Seq.end()); llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) { auto LSize = R[LeftIndex]->getBuffer().size(); auto RSize = R[RightIndex]->getBuffer().size(); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 3877def53c3f..5d50e92ae377 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -18,7 +18,6 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/LLVMRemarkStreamer.h" @@ -41,8 +40,6 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -298,6 +295,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, report_fatal_error(Twine("unable to parse pass pipeline description '") + Conf.OptPipeline + "': " + toString(std::move(Err))); } + } else if (Conf.UseDefaultPipeline) { + MPM.addPass(PB.buildPerModuleDefaultPipeline(OL)); } else if (IsThinLTO) { MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary)); } else { @@ -310,39 +309,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, MPM.run(Mod, MAM); } -static void runOldPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, - bool IsThinLTO, ModuleSummaryIndex *ExportSummary, - const ModuleSummaryIndex *ImportSummary) { - legacy::PassManager passes; - passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); - - PassManagerBuilder PMB; - PMB.LibraryInfo = new TargetLibraryInfoImpl(Triple(TM->getTargetTriple())); - if (Conf.Freestanding) - PMB.LibraryInfo->disableAllFunctions(); - PMB.Inliner = createFunctionInliningPass(); - PMB.ExportSummary = ExportSummary; - PMB.ImportSummary = ImportSummary; - // Unconditionally verify input since it is not verified before this - // point and has unknown origin. - PMB.VerifyInput = true; - PMB.VerifyOutput = !Conf.DisableVerify; - PMB.LoopVectorize = true; - PMB.SLPVectorize = true; - PMB.OptLevel = Conf.OptLevel; - PMB.PGOSampleUse = Conf.SampleProfile; - PMB.EnablePGOCSInstrGen = Conf.RunCSIRInstr; - if (!Conf.RunCSIRInstr && !Conf.CSIRProfile.empty()) { - PMB.EnablePGOCSInstrUse = true; - PMB.PGOInstrUse = Conf.CSIRProfile; - } - if (IsThinLTO) - PMB.populateThinLTOPassManager(passes); - else - PMB.populateLTOPassManager(passes); - passes.run(Mod); -} - bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary, @@ -365,12 +331,8 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, /*Cmdline*/ CmdArgs); } // FIXME: Plumb the combined index into the new pass manager.
- if (Conf.UseNewPM || !Conf.OptPipeline.empty()) { - runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary, - ImportSummary); - } else { - runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary); - } + runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary, + ImportSummary); return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod); } diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index fdc9896aca78..2abf249cbd62 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -66,11 +66,7 @@ using namespace llvm; const char* LTOCodeGenerator::getVersionString() { -#ifdef LLVM_VERSION_INFO - return PACKAGE_NAME " version " PACKAGE_VERSION ", " LLVM_VERSION_INFO; -#else return PACKAGE_NAME " version " PACKAGE_VERSION; -#endif } namespace llvm { @@ -132,7 +128,7 @@ LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) }; } -LTOCodeGenerator::~LTOCodeGenerator() {} +LTOCodeGenerator::~LTOCodeGenerator() = default; void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) { for (const StringRef &Undef : Mod->getAsmUndefinedRefs()) diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp index 4cc1b307c553..5ad5e857296d 100644 --- a/llvm/lib/LTO/LTOModule.cpp +++ b/llvm/lib/LTO/LTOModule.cpp @@ -50,7 +50,7 @@ LTOModule::LTOModule(std::unique_ptr<Module> M, MemoryBufferRef MBRef, SymTab.addModule(Mod.get()); } -LTOModule::~LTOModule() {} +LTOModule::~LTOModule() = default; /// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM /// bitcode. diff --git a/llvm/lib/LTO/SummaryBasedOptimizations.cpp b/llvm/lib/LTO/SummaryBasedOptimizations.cpp index 9e9d5c84d50d..bd3565771c29 100644 --- a/llvm/lib/LTO/SummaryBasedOptimizations.cpp +++ b/llvm/lib/LTO/SummaryBasedOptimizations.cpp @@ -55,7 +55,7 @@ void llvm::computeSyntheticCounts(ModuleSummaryIndex &Index) { }; auto GetEntryCount = [](ValueInfo V) { if (V.getSummaryList().size()) { - auto S = V.getSummaryList().front().get()->getBaseObject(); + auto S = V.getSummaryList().front()->getBaseObject(); auto *F = cast<FunctionSummary>(S); return F->entryCount(); } else { diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 37e85b6af6ba..a1041b3c85f5 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -21,16 +21,15 @@ #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LLVMRemarkStreamer.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/Verifier.h" @@ -54,11 +53,9 @@ #include "llvm/Support/Threading.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" #include "llvm/Transforms/IPO/Internalize.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/ObjCARC.h" #include
"llvm/Transforms/Utils/FunctionImportUtils.h" @@ -239,38 +236,7 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index, static void optimizeModule(Module &TheModule, TargetMachine &TM, unsigned OptLevel, bool Freestanding, - ModuleSummaryIndex *Index) { - // Populate the PassManager - PassManagerBuilder PMB; - PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple()); - if (Freestanding) - PMB.LibraryInfo->disableAllFunctions(); - PMB.Inliner = createFunctionInliningPass(); - // FIXME: should get it from the bitcode? - PMB.OptLevel = OptLevel; - PMB.LoopVectorize = true; - PMB.SLPVectorize = true; - // Already did this in verifyLoadedModule(). - PMB.VerifyInput = false; - PMB.VerifyOutput = false; - PMB.ImportSummary = Index; - - legacy::PassManager PM; - - // Add the TTI (required to inform the vectorizer about register size for - // instance) - PM.add(createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis())); - - // Add optimizations - PMB.populateThinLTOPassManager(PM); - - PM.run(TheModule); -} - -static void optimizeModuleNewPM(Module &TheModule, TargetMachine &TM, - unsigned OptLevel, bool Freestanding, - bool DebugPassManager, - ModuleSummaryIndex *Index) { + bool DebugPassManager, ModuleSummaryIndex *Index) { Optional PGOOpt; LoopAnalysisManager LAM; FunctionAnalysisManager FAM; @@ -485,7 +451,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, const ThinLTOCodeGenerator::CachingOptions &CacheOptions, bool DisableCodeGen, StringRef SaveTempsDir, bool Freestanding, unsigned OptLevel, unsigned count, - bool UseNewPM, bool DebugPassManager) { + bool DebugPassManager) { // "Benchmark"-like optimization: single-source case bool SingleModule = (ModuleMap.size() == 1); @@ -525,11 +491,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc"); } - if (UseNewPM) - optimizeModuleNewPM(TheModule, TM, OptLevel, Freestanding, DebugPassManager, - &Index); - else - optimizeModule(TheModule, TM, OptLevel, Freestanding, &Index); + optimizeModule(TheModule, TM, OptLevel, Freestanding, DebugPassManager, + &Index); saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc"); @@ -953,7 +916,7 @@ void ThinLTOCodeGenerator::optimize(Module &TheModule) { // Optimize now optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding, - nullptr); + DebugPassManager, nullptr); } /// Write out the generated object file, either from CacheEntryPath or from @@ -1216,7 +1179,7 @@ void ThinLTOCodeGenerator::run() { ExportList, GUIDPreservedSymbols, ModuleToDefinedGVSummaries[ModuleIdentifier], CacheOptions, DisableCodeGen, SaveTempsDir, Freestanding, OptLevel, count, - UseNewPM, DebugPassManager); + DebugPassManager); // Commit to the cache (if enabled) CacheEntry.write(*OutputBuffer); diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp index 37c4b79f8e29..09ec65a1d9c9 100644 --- a/llvm/lib/LineEditor/LineEditor.cpp +++ b/llvm/lib/LineEditor/LineEditor.cpp @@ -29,8 +29,8 @@ std::string LineEditor::getDefaultHistoryPath(StringRef ProgName) { return std::string(); } -LineEditor::CompleterConcept::~CompleterConcept() {} -LineEditor::ListCompleterConcept::~ListCompleterConcept() {} +LineEditor::CompleterConcept::~CompleterConcept() = default; +LineEditor::ListCompleterConcept::~ListCompleterConcept() = default; std::string LineEditor::ListCompleterConcept::getCommonPrefix( const std::vector &Comps) { diff --git a/llvm/lib/Linker/IRMover.cpp 
b/llvm/lib/Linker/IRMover.cpp index b475ea81d107..5a819e2d736c 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -9,19 +9,24 @@ #include "llvm/Linker/IRMover.h" #include "LinkDiagnosticInfo.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GVMaterializer.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/IR/TypeFinder.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Support/Error.h" #include "llvm/Support/Path.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <utility> using namespace llvm; @@ -381,7 +386,7 @@ class IRLinker { std::unique_ptr<Module> SrcM; /// See IRMover::move(). - std::function<void(GlobalValue &GV, IRMover::ValueAdder Add)> AddLazyFor; + IRMover::LazyCallback AddLazyFor; TypeMapTy TypeMap; GlobalValueMaterializer GValMaterializer; @@ -524,8 +529,7 @@ public: IRLinker(Module &DstM, MDMapT &SharedMDs, IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM, ArrayRef<GlobalValue *> ValuesToLink, - std::function<void(GlobalValue &GV, IRMover::ValueAdder Add)> AddLazyFor, - bool IsPerformingImport) + IRMover::LazyCallback AddLazyFor, bool IsPerformingImport) : DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)), TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this), SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport), @@ -987,10 +991,11 @@ bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) { // Callback to the client to give a chance to lazily add the Global to the // list of value to link. bool LazilyAdded = false; - AddLazyFor(SGV, [this, &LazilyAdded](GlobalValue &GV) { - maybeAdd(&GV); - LazilyAdded = true; - }); + if (AddLazyFor) + AddLazyFor(SGV, [this, &LazilyAdded](GlobalValue &GV) { + maybeAdd(&GV); + LazilyAdded = true; + }); return LazilyAdded; } @@ -1041,7 +1046,7 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV, if (Function *F = dyn_cast<Function>(NewGV)) if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) { NewGV->eraseFromParent(); - NewGV = Remangled.getValue(); + NewGV = *Remangled; NeedsRenaming = false; } @@ -1229,8 +1234,15 @@ void IRLinker::linkNamedMDNodes() { continue; // Don't import pseudo probe descriptors here for thinLTO. They will be // emitted by the originating module. - if (IsPerformingImport && NMD.getName() == PseudoProbeDescMetadataName) + if (IsPerformingImport && NMD.getName() == PseudoProbeDescMetadataName) { + if (!DstM.getNamedMetadata(NMD.getName())) + emitWarning("Pseudo-probe ignored: source module '" + + SrcM->getModuleIdentifier() + + "' is compiled with -fpseudo-probe-for-profiling while " + "destination module '" + + DstM.getModuleIdentifier() + "' is not\n"); continue; + } NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); // Add Src elements into Dest node. for (const MDNode *Op : NMD.operands()) @@ -1245,6 +1257,9 @@ Error IRLinker::linkModuleFlagsMetadata() { if (!SrcModFlags) return Error::success(); + // Check for module flag for updates before do anything. + UpgradeModuleFlags(*SrcM); + // If the destination module doesn't have module flags yet, then just copy // over the source module's flags.
NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); @@ -1327,11 +1342,15 @@ Error IRLinker::linkModuleFlagsMetadata() { // Diagnose inconsistent merge behavior types. if (SrcBehaviorValue != DstBehaviorValue) { + bool MinAndWarn = (SrcBehaviorValue == Module::Min && + DstBehaviorValue == Module::Warning) || + (DstBehaviorValue == Module::Min && + SrcBehaviorValue == Module::Warning); bool MaxAndWarn = (SrcBehaviorValue == Module::Max && DstBehaviorValue == Module::Warning) || (DstBehaviorValue == Module::Max && SrcBehaviorValue == Module::Warning); - if (!MaxAndWarn) + if (!(MaxAndWarn || MinAndWarn)) return stringErr("linking module flags '" + ID->getString() + "': IDs have conflicting behaviors in '" + SrcM->getModuleIdentifier() + "' and '" + @@ -1360,6 +1379,25 @@ Error IRLinker::linkModuleFlagsMetadata() { emitWarning(Str); } + // Choose the minimum if either source or destination request Min behavior. + if (DstBehaviorValue == Module::Min || SrcBehaviorValue == Module::Min) { + ConstantInt *DstValue = + mdconst::extract<ConstantInt>(DstOp->getOperand(2)); + ConstantInt *SrcValue = + mdconst::extract<ConstantInt>(SrcOp->getOperand(2)); + + // The resulting flag should have a Min behavior, and contain the minimum + // value from between the source and destination values. + Metadata *FlagOps[] = { + (DstBehaviorValue != Module::Min ? SrcOp : DstOp)->getOperand(0), ID, + (SrcValue->getZExtValue() < DstValue->getZExtValue() ? SrcOp : DstOp) + ->getOperand(2)}; + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); + DstModFlags->setOperand(DstIndex, Flag); + Flags[ID].first = Flag; + continue; + } + // Choose the maximum if either source or destination request Max behavior. if (DstBehaviorValue == Module::Max || SrcBehaviorValue == Module::Max) { ConstantInt *DstValue = @@ -1673,10 +1711,9 @@ IRMover::IRMover(Module &M) : Composite(M) { } } -Error IRMover::move( - std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink, - std::function<void(GlobalValue &GV, IRMover::ValueAdder Add)> AddLazyFor, - bool IsPerformingImport) { +Error IRMover::move(std::unique_ptr<Module> Src, + ArrayRef<GlobalValue *> ValuesToLink, + LazyCallback AddLazyFor, bool IsPerformingImport) { IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes, std::move(Src), ValuesToLink, std::move(AddLazyFor), IsPerformingImport); diff --git a/llvm/lib/Linker/LinkModules.cpp b/llvm/lib/Linker/LinkModules.cpp index f9f51bf17d95..17c3f09a23b7 100644 --- a/llvm/lib/Linker/LinkModules.cpp +++ b/llvm/lib/Linker/LinkModules.cpp @@ -14,7 +14,6 @@ #include "llvm-c/Linker.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/Comdat.h" -#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -573,11 +572,13 @@ bool ModuleLinker::run() { // FIXME: Propagate Errors through to the caller instead of emitting // diagnostics.
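One idea threads through the IRMover changes above: AddLazyFor is now a nullable IRMover::LazyCallback, so regular LTO passes nullptr instead of an empty lambda and IRLinker::shouldLink invokes the callback only after a null check. A minimal sketch of that nullable-callback convention (toy names, not the LLVM API):

```cpp
#include <cstdio>
#include <functional>

using LazyCallback = std::function<void(int)>; // empty == nothing to add

static void runLink(LazyCallback AddLazyFor) {
  if (AddLazyFor) // guard mirrors the new check in IRLinker::shouldLink
    AddLazyFor(42);
  else
    std::puts("no lazy value additions requested");
}

int main() {
  runLink(nullptr);                                     // regular-LTO style
  runLink([](int GV) { std::printf("add %d\n", GV); }); // ModuleLinker style
  return 0;
}
```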
bool HasErrors = false; - if (Error E = Mover.move(std::move(SrcM), ValuesToLink.getArrayRef(), - [this](GlobalValue &GV, IRMover::ValueAdder Add) { - addLazyFor(GV, Add); - }, - /* IsPerformingImport */ false)) { + if (Error E = + Mover.move(std::move(SrcM), ValuesToLink.getArrayRef(), + IRMover::LazyCallback( + [this](GlobalValue &GV, IRMover::ValueAdder Add) { + addLazyFor(GV, Add); + }), + /* IsPerformingImport */ false)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { DstM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, EIB.message())); HasErrors = true; diff --git a/llvm/lib/MC/ConstantPools.cpp b/llvm/lib/MC/ConstantPools.cpp index d8a08a4bd439..c3ab88b94476 100644 --- a/llvm/lib/MC/ConstantPools.cpp +++ b/llvm/lib/MC/ConstantPools.cpp @@ -39,25 +39,38 @@ void ConstantPool::emitEntries(MCStreamer &Streamer) { const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context, unsigned Size, SMLoc Loc) { const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Value); + const MCSymbolRefExpr *S = dyn_cast<MCSymbolRefExpr>(Value); // Check if there is existing entry for the same constant. If so, reuse it. - auto Itr = C ? CachedEntries.find(C->getValue()) : CachedEntries.end(); - if (Itr != CachedEntries.end()) - return Itr->second; + if (C) { + auto CItr = CachedConstantEntries.find(C->getValue()); + if (CItr != CachedConstantEntries.end()) + return CItr->second; + } + + // Check if there is existing entry for the same symbol. If so, reuse it. + if (S) { + auto SItr = CachedSymbolEntries.find(&(S->getSymbol())); + if (SItr != CachedSymbolEntries.end()) + return SItr->second; + } MCSymbol *CPEntryLabel = Context.createTempSymbol(); Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size, Loc)); const auto SymRef = MCSymbolRefExpr::create(CPEntryLabel, Context); if (C) - CachedEntries[C->getValue()] = SymRef; + CachedConstantEntries[C->getValue()] = SymRef; + if (S) + CachedSymbolEntries[&(S->getSymbol())] = SymRef; return SymRef; } bool ConstantPool::empty() { return Entries.empty(); } void ConstantPool::clearCache() { - CachedEntries.clear(); + CachedConstantEntries.clear(); + CachedSymbolEntries.clear(); } // @@ -79,7 +92,7 @@ AssemblerConstantPools::getOrCreateConstantPool(MCSection *Section) { static void emitConstantPool(MCStreamer &Streamer, MCSection *Section, ConstantPool &CP) { if (!CP.empty()) { - Streamer.SwitchSection(Section); + Streamer.switchSection(Section); CP.emitEntries(Streamer); } } diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 883735fcc293..eda495693595 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -13,10 +13,10 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" @@ -28,18 +28,18 @@ #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCFragment.h" -#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Alignment.h" -#include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -47,8 +47,6 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" -#include "llvm/Support/StringSaver.h" -#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -223,8 +221,6 @@ class ELFObjectWriter : public MCObjectWriter { DenseMap Renames; bool SeenGnuAbi = false; - bool EmitAddrsigSection = false; - std::vector AddrsigSyms; bool hasRelocationAddend() const; @@ -264,10 +260,6 @@ public: void markGnuAbi() override { SeenGnuAbi = true; } bool seenGnuAbi() const { return SeenGnuAbi; } - void emitAddrsigSection() override { EmitAddrsigSection = true; } - void addAddrsigSymbol(const MCSymbol *Sym) override { - AddrsigSyms.push_back(Sym); - } friend struct ELFWriter; }; @@ -549,9 +541,27 @@ void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, uint64_t Size = 0; const MCExpr *ESize = MSD.Symbol->getSize(); - if (!ESize && Base) + if (!ESize && Base) { + // For expressions like .set y, x+1, if y's size is unset, inherit from x. ESize = Base->getSize(); + // For `.size x, 2; y = x; .size y, 1; z = y; z1 = z; .symver y, y@v1`, z, + // z1, and y@v1's st_size equals y's. However, `Base` is `x` which will give + // us 2. Follow the MCSymbolRefExpr assignment chain, which covers most + // needs. MCBinaryExpr is not handled. + const MCSymbolELF *Sym = &Symbol; + while (Sym->isVariable()) { + if (auto *Expr = + dyn_cast(Sym->getVariableValue(false))) { + Sym = cast(&Expr->getSymbol()); + if (!Sym->getSize()) + continue; + ESize = Sym->getSize(); + } + break; + } + } + if (ESize) { int64_t Res; if (!ESize->evaluateKnownAbsolute(Res, Layout)) @@ -850,13 +860,9 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, auto &MC = Asm.getContext(); const auto &MAI = MC.getAsmInfo(); - // Compressing debug_frame requires handling alignment fragments which is - // more work (possibly generalizing MCAssembler.cpp:writeFragment to allow - // for writing to arbitrary buffers) for little benefit. bool CompressionEnabled = MAI->compressDebugSections() != DebugCompressionType::None; - if (!CompressionEnabled || !SectionName.startswith(".debug_") || - SectionName == ".debug_frame") { + if (!CompressionEnabled || !SectionName.startswith(".debug_")) { Asm.writeSectionData(W.OS, &Section, Layout); return; } @@ -870,13 +876,8 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, Asm.writeSectionData(VecOS, &Section, Layout); SmallVector CompressedContents; - if (Error E = zlib::compress( - StringRef(UncompressedData.data(), UncompressedData.size()), - CompressedContents)) { - consumeError(std::move(E)); - W.OS << UncompressedData; - return; - } + zlib::compress(StringRef(UncompressedData.data(), UncompressedData.size()), + CompressedContents); bool ZlibStyle = MAI->compressDebugSections() == DebugCompressionType::Z; if (!maybeWriteCompression(UncompressedData.size(), CompressedContents, @@ -1336,6 +1337,7 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm, // can update it. return true; case ELF::STB_GLOBAL: + case ELF::STB_GNU_UNIQUE: // Global ELF symbols can be preempted by the dynamic linker. The relocation // has to point to the symbol for a reason analogous to the STB_WEAK case. 
return true; diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp index 7989dd57907c..4ed9d8593336 100644 --- a/llvm/lib/MC/MCAsmBackend.cpp +++ b/llvm/lib/MC/MCAsmBackend.cpp @@ -8,11 +8,13 @@ #include "llvm/MC/MCAsmBackend.h" #include "llvm/ADT/None.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLArrayExtras.h" +#include "llvm/MC/MCDXContainerWriter.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCWasmObjectWriter.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/MC/MCXCOFFObjectWriter.h" @@ -39,12 +41,18 @@ MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { case Triple::COFF: return createWinCOFFObjectWriter( cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS); + case Triple::SPIRV: + return createSPIRVObjectWriter( + cast<MCSPIRVObjectTargetWriter>(std::move(TW)), OS); case Triple::Wasm: return createWasmObjectWriter(cast<MCWasmObjectTargetWriter>(std::move(TW)), OS); case Triple::XCOFF: return createXCOFFObjectWriter( cast<MCXCOFFObjectTargetWriter>(std::move(TW)), OS); + case Triple::DXContainer: + return createDXContainerObjectWriter( + cast<MCDXContainerTargetWriter>(std::move(TW)), OS); default: llvm_unreachable("unexpected object format"); } diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp index f52503d7b160..b8d0021ed432 100644 --- a/llvm/lib/MC/MCAsmInfo.cpp +++ b/llvm/lib/MC/MCAsmInfo.cpp @@ -114,7 +114,10 @@ MCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym, } bool MCAsmInfo::isAcceptableChar(char C) const { - return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '@'; + if (C == '@') + return doesAllowAtInName(); + + return isAlnum(C) || C == '_' || C == '$' || C == '.'; } bool MCAsmInfo::isValidUnquotedName(StringRef Name) const { diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 61ec941f50b8..6f8934d66ef4 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -31,13 +30,13 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" -#include <cassert> using namespace llvm; @@ -127,7 +126,7 @@ public: /// Return a raw_ostream that comments can be written to. /// Unlike AddComment, you are required to terminate comments with \n if you /// use this method. - raw_ostream &GetCommentOS() override { + raw_ostream &getCommentOS() override { if (!IsVerboseAsm) return nulls(); // Discard comments unless in verbose asm mode. return CommentStream; @@ -139,9 +138,7 @@ public: void emitExplicitComments() override; /// Emit a blank line to a .s file to pretty it up.
- void AddBlankLine() override { - EmitEOL(); - } + void addBlankLine() override { EmitEOL(); } /// @name MCStreamer Interface /// @{ @@ -180,15 +177,15 @@ public: bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; - void BeginCOFFSymbolDef(const MCSymbol *Symbol) override; - void EmitCOFFSymbolStorageClass(int StorageClass) override; - void EmitCOFFSymbolType(int Type) override; - void EndCOFFSymbolDef() override; - void EmitCOFFSafeSEH(MCSymbol const *Symbol) override; - void EmitCOFFSymbolIndex(MCSymbol const *Symbol) override; - void EmitCOFFSectionIndex(MCSymbol const *Symbol) override; - void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; - void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; + void beginCOFFSymbolDef(const MCSymbol *Symbol) override; + void emitCOFFSymbolStorageClass(int StorageClass) override; + void emitCOFFSymbolType(int Type) override; + void endCOFFSymbolDef() override; + void emitCOFFSafeSEH(MCSymbol const *Symbol) override; + void emitCOFFSymbolIndex(MCSymbol const *Symbol) override; + void emitCOFFSectionIndex(MCSymbol const *Symbol) override; + void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; + void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size, MCSymbol *CsectSym, unsigned ByteAlign) override; @@ -198,6 +195,8 @@ public: void emitXCOFFRenameDirective(const MCSymbol *Name, StringRef Rename) override; + void emitXCOFFRefDirective(StringRef Name) override; + void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override; @@ -276,11 +275,11 @@ public: StringRef FileName) override; MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override; - bool EmitCVFileDirective(unsigned FileNo, StringRef Filename, + bool emitCVFileDirective(unsigned FileNo, StringRef Filename, ArrayRef<uint8_t> Checksum, unsigned ChecksumKind) override; - bool EmitCVFuncIdDirective(unsigned FuncId) override; - bool EmitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc, + bool emitCVFuncIdDirective(unsigned FuncId) override; + bool emitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc, unsigned IAFile, unsigned IALine, unsigned IACol, SMLoc Loc) override; void emitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, @@ -316,10 +315,11 @@ public: void emitCVStringTableDirective() override; void emitCVFileChecksumsDirective() override; void emitCVFileChecksumOffsetDirective(unsigned FileNo) override; - void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc L) override; + void emitCVFPOData(const MCSymbol *ProcSym, SMLoc L) override; void emitIdent(StringRef IdentString) override; void emitCFIBKeyFrame() override; + void emitCFIMTETaggedFrame() override; void emitCFISections(bool EH, bool Debug) override; void emitCFIDefCfa(int64_t Register, int64_t Offset) override; void emitCFIDefCfaOffset(int64_t Offset) override; @@ -344,25 +344,25 @@ public: void emitCFINegateRAState() override; void emitCFIReturnColumn(int64_t Register) override; - void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override; - void EmitWinCFIEndProc(SMLoc Loc) override; - void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) override; - void EmitWinCFIStartChained(SMLoc Loc) override; - void EmitWinCFIEndChained(SMLoc Loc) override; - void EmitWinCFIPushReg(MCRegister
Register, SMLoc Loc) override; - void EmitWinCFISetFrame(MCRegister Register, unsigned Offset, + void emitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override; + void emitWinCFIEndProc(SMLoc Loc) override; + void emitWinCFIFuncletOrFuncEnd(SMLoc Loc) override; + void emitWinCFIStartChained(SMLoc Loc) override; + void emitWinCFIEndChained(SMLoc Loc) override; + void emitWinCFIPushReg(MCRegister Register, SMLoc Loc) override; + void emitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) override; - void EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) override; - void EmitWinCFISaveReg(MCRegister Register, unsigned Offset, + void emitWinCFIAllocStack(unsigned Size, SMLoc Loc) override; + void emitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc) override; - void EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, + void emitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) override; - void EmitWinCFIPushFrame(bool Code, SMLoc Loc) override; - void EmitWinCFIEndProlog(SMLoc Loc) override; + void emitWinCFIPushFrame(bool Code, SMLoc Loc) override; + void emitWinCFIEndProlog(SMLoc Loc) override; - void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, + void emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc) override; - void EmitWinEHHandlerData(SMLoc Loc) override; + void emitWinEHHandlerData(SMLoc Loc) override; void emitCGProfileEntry(const MCSymbolRefExpr *From, const MCSymbolRefExpr *To, uint64_t Count) override; @@ -502,7 +502,7 @@ void MCAsmStreamer::changeSection(MCSection *Section, if (MCTargetStreamer *TS = getTargetStreamer()) { TS->changeSection(getCurrentSectionOnly(), Section, Subsection, OS); } else { - Section->PrintSwitchToSection(*MAI, getContext().getTargetTriple(), OS, + Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, Subsection); } } @@ -761,6 +761,8 @@ bool MCAsmStreamer::emitSymbolAttribute(MCSymbol *Symbol, case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break; case MCSA_Cold: // Assemblers currently do not support a .cold directive. + case MCSA_Exported: + // Non-AIX assemblers currently do not support exported visibility. return false; } @@ -787,47 +789,47 @@ void MCAsmStreamer::emitSyntaxDirective() { // with may have a value of prefix or noprefix. 
} -void MCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) { +void MCAsmStreamer::beginCOFFSymbolDef(const MCSymbol *Symbol) { OS << "\t.def\t"; Symbol->print(OS, MAI); OS << ';'; EmitEOL(); } -void MCAsmStreamer::EmitCOFFSymbolStorageClass (int StorageClass) { +void MCAsmStreamer::emitCOFFSymbolStorageClass(int StorageClass) { OS << "\t.scl\t" << StorageClass << ';'; EmitEOL(); } -void MCAsmStreamer::EmitCOFFSymbolType (int Type) { +void MCAsmStreamer::emitCOFFSymbolType(int Type) { OS << "\t.type\t" << Type << ';'; EmitEOL(); } -void MCAsmStreamer::EndCOFFSymbolDef() { +void MCAsmStreamer::endCOFFSymbolDef() { OS << "\t.endef"; EmitEOL(); } -void MCAsmStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) { +void MCAsmStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) { OS << "\t.safeseh\t"; Symbol->print(OS, MAI); EmitEOL(); } -void MCAsmStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) { +void MCAsmStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { OS << "\t.symidx\t"; Symbol->print(OS, MAI); EmitEOL(); } -void MCAsmStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { +void MCAsmStreamer::emitCOFFSectionIndex(MCSymbol const *Symbol) { OS << "\t.secidx\t"; Symbol->print(OS, MAI); EmitEOL(); } -void MCAsmStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) { +void MCAsmStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) { OS << "\t.secrel32\t"; Symbol->print(OS, MAI); if (Offset != 0) @@ -835,7 +837,7 @@ void MCAsmStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) { EmitEOL(); } -void MCAsmStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { +void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { OS << "\t.rva\t"; Symbol->print(OS, MAI); if (Offset > 0) @@ -903,6 +905,9 @@ void MCAsmStreamer::emitXCOFFSymbolLinkageWithVisibility( case MCSA_Protected: OS << ",protected"; break; + case MCSA_Exported: + OS << ",exported"; + break; default: report_fatal_error("unexpected value for Visibility type"); } @@ -931,6 +936,11 @@ void MCAsmStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, EmitEOL(); } +void MCAsmStreamer::emitXCOFFRefDirective(StringRef Name) { + OS << "\t.ref " << Name; + EmitEOL(); +} + void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) { assert(MAI->hasDotTypeDotSizeDirective()); OS << "\t.size\t"; @@ -988,7 +998,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, SMLoc Loc) { if (Symbol) - AssignFragment(Symbol, &Section->getDummyFragment()); + assignFragment(Symbol, &Section->getDummyFragment()); // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; @@ -1015,7 +1025,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, // e.g. _a. void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - AssignFragment(Symbol, &Section->getDummyFragment()); + assignFragment(Symbol, &Section->getDummyFragment()); assert(Symbol && "Symbol shouldn't be NULL!"); // Instead of using the Section we'll just use the shortcut. 
@@ -1643,7 +1653,7 @@ MCSymbol *MCAsmStreamer::getDwarfLineTableSymbol(unsigned CUID) { return MCStreamer::getDwarfLineTableSymbol(0); } -bool MCAsmStreamer::EmitCVFileDirective(unsigned FileNo, StringRef Filename, +bool MCAsmStreamer::emitCVFileDirective(unsigned FileNo, StringRef Filename, ArrayRef<uint8_t> Checksum, unsigned ChecksumKind) { if (!getContext().getCVContext().addFile(*this, FileNo, Filename, Checksum, @@ -1666,19 +1676,19 @@ bool MCAsmStreamer::EmitCVFileDirective(unsigned FileNo, StringRef Filename, return true; } -bool MCAsmStreamer::EmitCVFuncIdDirective(unsigned FuncId) { +bool MCAsmStreamer::emitCVFuncIdDirective(unsigned FuncId) { OS << "\t.cv_func_id " << FuncId << '\n'; - return MCStreamer::EmitCVFuncIdDirective(FuncId); + return MCStreamer::emitCVFuncIdDirective(FuncId); } -bool MCAsmStreamer::EmitCVInlineSiteIdDirective(unsigned FunctionId, +bool MCAsmStreamer::emitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc, unsigned IAFile, unsigned IALine, unsigned IACol, SMLoc Loc) { OS << "\t.cv_inline_site_id " << FunctionId << " within " << IAFunc << " inlined_at " << IAFile << ' ' << IALine << ' ' << IACol << '\n'; - return MCStreamer::EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile, + return MCStreamer::emitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile, IALine, IACol, Loc); } @@ -1795,7 +1805,7 @@ void MCAsmStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { EmitEOL(); } -void MCAsmStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc L) { +void MCAsmStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc L) { OS << "\t.cv_fpo_data\t"; ProcSym->print(OS, MAI); EmitEOL(); @@ -2016,59 +2026,69 @@ void MCAsmStreamer::emitCFIBKeyFrame() { EmitEOL(); } -void MCAsmStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { - MCStreamer::EmitWinCFIStartProc(Symbol, Loc); +void MCAsmStreamer::emitCFIMTETaggedFrame() { + MCStreamer::emitCFIMTETaggedFrame(); + OS << "\t.cfi_mte_tagged_frame"; + EmitEOL(); +} + +void MCAsmStreamer::emitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { + MCStreamer::emitWinCFIStartProc(Symbol, Loc); OS << ".seh_proc "; Symbol->print(OS, MAI); EmitEOL(); } -void MCAsmStreamer::EmitWinCFIEndProc(SMLoc Loc) { - MCStreamer::EmitWinCFIEndProc(Loc); +void MCAsmStreamer::emitWinCFIEndProc(SMLoc Loc) { + MCStreamer::emitWinCFIEndProc(Loc); OS << "\t.seh_endproc"; EmitEOL(); } -void MCAsmStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) { - MCStreamer::EmitWinCFIFuncletOrFuncEnd(Loc); +void MCAsmStreamer::emitWinCFIFuncletOrFuncEnd(SMLoc Loc) { + MCStreamer::emitWinCFIFuncletOrFuncEnd(Loc); OS << "\t.seh_endfunclet"; EmitEOL(); } -void MCAsmStreamer::EmitWinCFIStartChained(SMLoc Loc) { - MCStreamer::EmitWinCFIStartChained(Loc); +void MCAsmStreamer::emitWinCFIStartChained(SMLoc Loc) { + MCStreamer::emitWinCFIStartChained(Loc); OS << "\t.seh_startchained"; EmitEOL(); } -void MCAsmStreamer::EmitWinCFIEndChained(SMLoc Loc) { - MCStreamer::EmitWinCFIEndChained(Loc); +void MCAsmStreamer::emitWinCFIEndChained(SMLoc Loc) { + MCStreamer::emitWinCFIEndChained(Loc); OS << "\t.seh_endchained"; EmitEOL(); } -void MCAsmStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, +void MCAsmStreamer::emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc) { - MCStreamer::EmitWinEHHandler(Sym, Unwind, Except, Loc); + MCStreamer::emitWinEHHandler(Sym, Unwind, Except, Loc); OS << "\t.seh_handler "; Sym->print(OS, MAI); + char Marker = '@'; + const Triple &T = getContext().getTargetTriple(); + if (T.getArch() ==
Triple::arm || T.getArch() == Triple::thumb) + Marker = '%'; if (Unwind) - OS << ", @unwind"; + OS << ", " << Marker << "unwind"; if (Except) - OS << ", @except"; + OS << ", " << Marker << "except"; EmitEOL(); } -void MCAsmStreamer::EmitWinEHHandlerData(SMLoc Loc) { - MCStreamer::EmitWinEHHandlerData(Loc); +void MCAsmStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); - // Switch sections. Don't call SwitchSection directly, because that will + // Switch sections. Don't call switchSection directly, because that will // cause the section switch to be visible in the emitted assembly. // We only do this so the section switch that terminates the handler // data block is visible. @@ -2081,23 +2101,23 @@ void MCAsmStreamer::EmitWinEHHandlerData(SMLoc Loc) { MCSection *TextSec = &CurFrame->Function->getSection(); MCSection *XData = getAssociatedXDataSection(TextSec); - SwitchSectionNoChange(XData); + switchSectionNoChange(XData); OS << "\t.seh_handlerdata"; EmitEOL(); } -void MCAsmStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) { - MCStreamer::EmitWinCFIPushReg(Register, Loc); +void MCAsmStreamer::emitWinCFIPushReg(MCRegister Register, SMLoc Loc) { + MCStreamer::emitWinCFIPushReg(Register, Loc); OS << "\t.seh_pushreg "; InstPrinter->printRegName(OS, Register); EmitEOL(); } -void MCAsmStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, +void MCAsmStreamer::emitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) { - MCStreamer::EmitWinCFISetFrame(Register, Offset, Loc); + MCStreamer::emitWinCFISetFrame(Register, Offset, Loc); OS << "\t.seh_setframe "; InstPrinter->printRegName(OS, Register); @@ -2105,16 +2125,16 @@ void MCAsmStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, EmitEOL(); } -void MCAsmStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) { - MCStreamer::EmitWinCFIAllocStack(Size, Loc); +void MCAsmStreamer::emitWinCFIAllocStack(unsigned Size, SMLoc Loc) { + MCStreamer::emitWinCFIAllocStack(Size, Loc); OS << "\t.seh_stackalloc " << Size; EmitEOL(); } -void MCAsmStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, +void MCAsmStreamer::emitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc) { - MCStreamer::EmitWinCFISaveReg(Register, Offset, Loc); + MCStreamer::emitWinCFISaveReg(Register, Offset, Loc); OS << "\t.seh_savereg "; InstPrinter->printRegName(OS, Register); @@ -2122,9 +2142,9 @@ void MCAsmStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, EmitEOL(); } -void MCAsmStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, +void MCAsmStreamer::emitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) { - MCStreamer::EmitWinCFISaveXMM(Register, Offset, Loc); + MCStreamer::emitWinCFISaveXMM(Register, Offset, Loc); OS << "\t.seh_savexmm "; InstPrinter->printRegName(OS, Register); @@ -2132,8 +2152,8 @@ void MCAsmStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, EmitEOL(); } -void MCAsmStreamer::EmitWinCFIPushFrame(bool Code, SMLoc Loc) { - MCStreamer::EmitWinCFIPushFrame(Code, Loc); +void MCAsmStreamer::emitWinCFIPushFrame(bool Code, SMLoc Loc) { + MCStreamer::emitWinCFIPushFrame(Code, Loc); OS << "\t.seh_pushframe"; if (Code) @@ -2141,8 +2161,8 @@ void MCAsmStreamer::EmitWinCFIPushFrame(bool Code, SMLoc Loc) { EmitEOL(); } -void MCAsmStreamer::EmitWinCFIEndProlog(SMLoc Loc) { - MCStreamer::EmitWinCFIEndProlog(Loc); +void MCAsmStreamer::emitWinCFIEndProlog(SMLoc Loc) { + MCStreamer::emitWinCFIEndProlog(Loc); OS << 
"\t.seh_endprologue"; EmitEOL(); @@ -2161,7 +2181,7 @@ void MCAsmStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From, void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &STI) { - raw_ostream &OS = GetCommentOS(); + raw_ostream &OS = getCommentOS(); SmallString<256> Code; SmallVector Fixups; raw_svector_ostream VecOS(Code); @@ -2245,8 +2265,10 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, MCFixup &F = Fixups[i]; const MCFixupKindInfo &Info = getAssembler().getBackend().getFixupKindInfo(F.getKind()); - OS << " fixup " << char('A' + i) << " - " << "offset: " << F.getOffset() - << ", value: " << *F.getValue() << ", kind: " << Info.Name << "\n"; + OS << " fixup " << char('A' + i) << " - " + << "offset: " << F.getOffset() << ", value: "; + F.getValue()->print(OS, MAI); + OS << ", kind: " << Info.Name << "\n"; } } @@ -2265,8 +2287,8 @@ void MCAsmStreamer::emitInstruction(const MCInst &Inst, // Show the MCInst if enabled. if (ShowInst) { - Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n "); - GetCommentOS() << "\n"; + Inst.dump_pretty(getCommentOS(), InstPrinter.get(), "\n "); + getCommentOS() << "\n"; } if(getTargetStreamer()) @@ -2276,7 +2298,7 @@ void MCAsmStreamer::emitInstruction(const MCInst &Inst, StringRef Comments = CommentToEmit; if (Comments.size() && Comments.back() != '\n') - GetCommentOS() << "\n"; + getCommentOS() << "\n"; EmitEOL(); } @@ -2365,7 +2387,7 @@ void MCAsmStreamer::finishImpl() { if (!Tables.empty()) { assert(Tables.size() == 1 && "asm output only supports one line table"); if (auto *Label = Tables.begin()->second.getLabel()) { - SwitchSection(getContext().getObjectFileInfo()->getDwarfLineSection()); + switchSection(getContext().getObjectFileInfo()->getDwarfLineSection()); emitLabel(Label); } } @@ -2492,7 +2514,7 @@ void MCAsmStreamer::doFinalizationAtSectionEnd(MCSection *Section) { if (MAI->usesDwarfFileAndLocDirectives()) return; - SwitchSectionNoChange(Section); + switchSectionNoChange(Section); MCSymbol *Sym = getCurrentSectionOnly()->getEndSymbol(getContext()); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index a8837bbf57c7..a33d7ea9ebfe 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -27,7 +27,6 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Alignment.h" @@ -36,16 +35,18 @@ #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include using namespace llvm; +namespace llvm { +class MCSubtargetInfo; +} + #define DEBUG_TYPE "assembler" namespace { @@ -330,11 +331,11 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_Align: { const MCAlignFragment &AF = cast(F); unsigned Offset = Layout.getFragmentOffset(&AF); - unsigned Size = offsetToAlignment(Offset, Align(AF.getAlignment())); + unsigned Size = offsetToAlignment(Offset, AF.getAlignment()); // Insert extra Nops for code alignment if the target define // shouldInsertExtraNopBytesForCodeAlign target hook. 
- if (AF.getParent()->UseCodeAlign() && AF.hasEmitNops() && + if (AF.getParent()->useCodeAlign() && AF.hasEmitNops() && getBackend().shouldInsertExtraNopBytesForCodeAlign(AF, Size)) return Size; @@ -342,7 +343,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, // minimum nop size. if (Size > 0 && AF.hasEmitNops()) { while (Size % getBackend().getMinimumNopSize()) - Size += AF.getAlignment(); + Size += AF.getAlignment().value(); } if (Size > AF.getMaxBytesToEmit()) return 0; @@ -873,7 +874,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) { MCAlignFragment &AF = cast<MCAlignFragment>(Frag); // Insert fixup type for code alignment if the target define // shouldInsertFixupForCodeAlign target hook. - if (Sec.UseCodeAlign() && AF.hasEmitNops()) + if (Sec.useCodeAlign() && AF.hasEmitNops()) getBackend().shouldInsertFixupForCodeAlign(*this, Layout, AF); continue; } diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp index 3da1a9c3e331..375d54696cb2 100644 --- a/llvm/lib/MC/MCCodeView.cpp +++ b/llvm/lib/MC/MCCodeView.cpp @@ -17,6 +17,7 @@ #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCValue.h" @@ -25,7 +26,7 @@ using namespace llvm; using namespace llvm::codeview; -CodeViewContext::CodeViewContext() {} +CodeViewContext::CodeViewContext() = default; CodeViewContext::~CodeViewContext() { // If someone inserted strings into the string table but never actually @@ -334,8 +335,8 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS, OS.emitInt32(uint32_t(DebugSubsectionKind::Lines)); OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4); OS.emitLabel(LineBegin); - OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0); - OS.EmitCOFFSectionIndex(FuncBegin); + OS.emitCOFFSecRel32(FuncBegin, /*Offset=*/0); + OS.emitCOFFSectionIndex(FuncBegin); // Actual line info.
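// Note (editorial, not from the upstream commit): the emitCOFFSecRel32 +
// emitCOFFSectionIndex pair just above is how a CodeView lines subsection
// names its function: a section-relative offset plus a section index form
// the SECREL/SECTION relocation pair that debuggers use to relocate the
// line table against the final image.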
std::vector<MCCVLoc> Locs = getFunctionLineEntries(FuncId); diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index eafcee1e0607..4be84ca7feb5 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCContext.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -15,21 +16,25 @@ #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFragment.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCLabel.h" -#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionDXContainer.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionGOFF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionSPIRV.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCSymbolELF.h" @@ -37,13 +42,14 @@ #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/MCSymbolXCOFF.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" -#include "llvm/Support/Signals.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include <cassert> @@ -103,6 +109,12 @@ MCContext::MCContext(const Triple &TheTriple, const MCAsmInfo *mai, case Triple::GOFF: Env = IsGOFF; break; + case Triple::DXContainer: + Env = IsDXContainer; + break; + case Triple::SPIRV: + Env = IsSPIRV; + break; case Triple::UnknownObjectFormat: report_fatal_error("Cannot initialize MC for unknown object file format."); break; @@ -134,11 +146,14 @@ void MCContext::reset() { // Call the destructors so the fragments are freed COFFAllocator.DestroyAll(); + DXCAllocator.DestroyAll(); ELFAllocator.DestroyAll(); GOFFAllocator.DestroyAll(); MachOAllocator.DestroyAll(); + WasmAllocator.DestroyAll(); XCOFFAllocator.DestroyAll(); MCInstAllocator.DestroyAll(); + SPIRVAllocator.DestroyAll(); MCSubtargetAllocator.DestroyAll(); InlineAsmUsedLabelNames.clear(); @@ -163,6 +178,7 @@ void MCContext::reset() { COFFUniquingMap.clear(); WasmUniquingMap.clear(); XCOFFUniquingMap.clear(); + DXCUniquingMap.clear(); ELFEntrySizeMap.clear(); ELFSeenGenericMergeableSections.clear(); @@ -243,6 +259,11 @@ MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name, return new (Name, *this) MCSymbolWasm(Name, IsTemporary); case MCContext::IsXCOFF: return createXCOFFSymbolImpl(Name, IsTemporary); + case MCContext::IsDXContainer: + break; + case MCContext::IsSPIRV: + return new (Name, *this) + MCSymbol(MCSymbol::SymbolKindUnset, Name, IsTemporary); } return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name, IsTemporary); @@ -616,11 +637,14 @@ Optional<unsigned> MCContext::getELFUniqueIDForEntsize(StringRef SectionName, return (I != ELFEntrySizeMap.end()) ?
Optional<unsigned>(I->second) : None; } -MCSectionGOFF *MCContext::getGOFFSection(StringRef Section, SectionKind Kind) { +MCSectionGOFF *MCContext::getGOFFSection(StringRef Section, SectionKind Kind, + MCSection *Parent, + const MCExpr *SubsectionId) { // Do the lookup. If we don't have a hit, return a new section. auto &GOFFSection = GOFFUniquingMap[Section.str()]; if (!GOFFSection) - GOFFSection = new (GOFFAllocator.Allocate()) MCSectionGOFF(Section, Kind); + GOFFSection = new (GOFFAllocator.Allocate()) + MCSectionGOFF(Section, Kind, Parent, SubsectionId); return GOFFSection; } @@ -732,13 +756,19 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind, return Result; } +bool MCContext::hasXCOFFSection(StringRef Section, + XCOFF::CsectProperties CsectProp) const { + return XCOFFUniquingMap.count( + XCOFFSectionKey(Section.str(), CsectProp.MappingClass)) != 0; +} + MCSectionXCOFF *MCContext::getXCOFFSection( StringRef Section, SectionKind Kind, Optional<XCOFF::CsectProperties> CsectProp, bool MultiSymbolsAllowed, const char *BeginSymName, Optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSectionSubtypeFlags) { - bool IsDwarfSec = DwarfSectionSubtypeFlags.hasValue(); - assert((IsDwarfSec != CsectProp.hasValue()) && "Invalid XCOFF section!"); + bool IsDwarfSec = DwarfSectionSubtypeFlags.has_value(); + assert((IsDwarfSec != CsectProp.has_value()) && "Invalid XCOFF section!"); // Do the lookup. If we have a hit, return it. auto IterBool = XCOFFUniquingMap.insert(std::make_pair( @@ -796,6 +826,44 @@ MCSectionXCOFF *MCContext::getXCOFFSection( return Result; } +MCSectionSPIRV *MCContext::getSPIRVSection() { + MCSymbol *Begin = nullptr; + MCSectionSPIRV *Result = new (SPIRVAllocator.Allocate()) + MCSectionSPIRV(SectionKind::getText(), Begin); + + auto *F = new MCDataFragment(); + Result->getFragmentList().insert(Result->begin(), F); + F->setParent(Result); + + if (Begin) + Begin->setFragment(F); + + return Result; +} + +MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section, + SectionKind K) { + // Do the lookup, if we have a hit, return it. + auto ItInsertedPair = DXCUniquingMap.try_emplace(Section); + if (!ItInsertedPair.second) + return ItInsertedPair.first->second; + + auto MapIt = ItInsertedPair.first; + // Grab the name from the StringMap. Since the Section is going to keep a + // copy of this StringRef we need to make sure the underlying string stays + // alive as long as we need it. + StringRef Name = MapIt->first(); + MapIt->second = + new (DXCAllocator.Allocate()) MCSectionDXContainer(Name, K, nullptr); + + // The first fragment will store the header + auto *F = new MCDataFragment(); + MapIt->second->getFragmentList().insert(MapIt->second->begin(), F); + F->setParent(MapIt->second); + + return MapIt->second; +} + MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) { return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI); } @@ -835,6 +903,12 @@ void MCContext::RemapDebugPaths() { // Dwarf Management //===----------------------------------------------------------------------===// +EmitDwarfUnwindType MCContext::emitDwarfUnwindInfo() const { + if (!TargetOptions) + return EmitDwarfUnwindType::Default; + return TargetOptions->EmitDwarfUnwind; +} + void MCContext::setGenDwarfRootFile(StringRef InputFileName, StringRef Buffer) { // MCDwarf needs the root file as well as the compilation directory. // If we find a '.file 0' directive that will supersede these values.
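// Note (editorial, not from the upstream commit): in getDXContainerSection
// above, the section name is deliberately re-read from the map entry
// (MapIt->first()) rather than taken from the caller's Section argument. The
// StringMap entry owns the string, so a StringRef into it stays valid for
// the lifetime of the context, while the caller's buffer may not outlive
// the new MCSectionDXContainer.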
@@ -906,9 +980,9 @@ void MCContext::finalizeDwarfSections(MCStreamer &MCOS) { } CodeViewContext &MCContext::getCVContext() { - if (!CVContext.get()) + if (!CVContext) CVContext.reset(new CodeViewContext); - return *CVContext.get(); + return *CVContext; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/MC/MCDXContainerStreamer.cpp b/llvm/lib/MC/MCDXContainerStreamer.cpp new file mode 100644 index 000000000000..3cb452f3dfa5 --- /dev/null +++ b/llvm/lib/MC/MCDXContainerStreamer.cpp @@ -0,0 +1,31 @@ +//===- lib/MC/MCDXContainerStreamer.cpp - DXContainer Impl ----*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the object streamer for DXContainer files. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCDXContainerStreamer.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/TargetRegistry.h" + +using namespace llvm; + +void MCDXContainerStreamer::emitInstToData(const MCInst &, + const MCSubtargetInfo &) {} + +MCStreamer *llvm::createDXContainerStreamer( + MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB, + std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&CE, + bool RelaxAll) { + auto *S = new MCDXContainerStreamer(Context, std::move(MAB), std::move(OW), + std::move(CE)); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + return S; +} diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp new file mode 100644 index 000000000000..f5dad702d6f6 --- /dev/null +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -0,0 +1,143 @@ +//===- llvm/MC/MCDXContainerWriter.cpp - DXContainer Writer -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCDXContainerWriter.h" +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; + +MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {} + +namespace { +class DXContainerObjectWriter : public MCObjectWriter { + ::support::endian::Writer W; + + /// The target specific DXContainer writer instance.
+ std::unique_ptr<MCDXContainerTargetWriter> TargetObjectWriter; + +public: + DXContainerObjectWriter(std::unique_ptr<MCDXContainerTargetWriter> MOTW, + raw_pwrite_stream &OS) + : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {} + + ~DXContainerObjectWriter() override {} + +private: + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) override {} + + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override {} + + uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; +}; +} // namespace + +uint64_t DXContainerObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + // Start the file size as the header plus the size of the part offsets. + // Presently DXContainer files usually contain 7-10 parts. Reserving space for + // 16 part offsets gives us a little room for growth. + llvm::SmallVector<uint64_t, 16> PartOffsets; + uint64_t PartOffset = 0; + for (const MCSection &Sec : Asm) { + uint64_t SectionSize = Layout.getSectionAddressSize(&Sec); + // Skip empty sections. + if (SectionSize == 0) + continue; + + assert(SectionSize < std::numeric_limits<uint32_t>::max() && + "Section size too large for DXContainer"); + + PartOffsets.push_back(PartOffset); + PartOffset += sizeof(dxbc::PartHeader) + SectionSize; + PartOffset = alignTo(PartOffset, Align(4ul)); + } + assert(PartOffset < std::numeric_limits<uint32_t>::max() && + "Part data too large for DXContainer"); + + uint64_t PartStart = + sizeof(dxbc::Header) + (PartOffsets.size() * sizeof(uint32_t)); + uint64_t FileSize = PartStart + PartOffset; + assert(FileSize < std::numeric_limits<uint32_t>::max() && + "File size too large for DXContainer"); + + // Write the header. + W.write<char>({'D', 'X', 'B', 'C'}); + // Write 16-bytes of 0's for the hash. + W.OS.write_zeros(16); + // Write 1.0 for file format version. + W.write<uint16_t>(1u); + W.write<uint16_t>(0u); + // Write the file size. + W.write(static_cast<uint32_t>(FileSize)); + // Write the number of parts. + W.write(static_cast<uint32_t>(PartOffsets.size())); + // Write the offsets for the part headers for each part. + for (uint64_t Offset : PartOffsets) + W.write(static_cast<uint32_t>(PartStart + Offset)); + + for (const MCSection &Sec : Asm) { + uint64_t SectionSize = Layout.getSectionAddressSize(&Sec); + // Skip empty sections. + if (SectionSize == 0) + continue; + + unsigned Start = W.OS.tell(); + // Write section header. + W.write(ArrayRef<char>(Sec.getName().data(), 4)); + + uint64_t PartSize = SectionSize + sizeof(dxbc::PartHeader); + + if (Sec.getName() == "DXIL") + PartSize += sizeof(dxbc::ProgramHeader); + // DXContainer parts should be 4-byte aligned. + PartSize = alignTo(PartSize, Align(4)); + W.write(static_cast<uint32_t>(PartSize)); + if (Sec.getName() == "DXIL") { + dxbc::ProgramHeader Header; + memset(reinterpret_cast<void *>(&Header), 0, sizeof(dxbc::ProgramHeader)); + + const Triple &TT = Asm.getContext().getTargetTriple(); + VersionTuple Version = TT.getOSVersion(); + Header.MajorVersion = static_cast<uint8_t>(Version.getMajor()); + if (Version.getMinor()) + Header.MinorVersion = static_cast<uint8_t>(*Version.getMinor()); + if (TT.hasEnvironment()) + Header.ShaderKind = + static_cast<uint16_t>(TT.getEnvironment() - Triple::Pixel); + + // The program header's size field is in 32-bit words.
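// Note (editorial, not from the upstream commit): "(Bytes + 3) / 4" below is
// the usual round-up-to-words idiom. For example, a 10-byte payload occupies
// (10 + 3) / 4 == 3 32-bit words. The recorded size covers the program
// header itself plus the DXIL bitcode that follows it.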
+ Header.Size = (SectionSize + sizeof(dxbc::ProgramHeader) + 3) / 4; + memcpy(Header.Bitcode.Magic, "DXIL", 4); + Header.Bitcode.Offset = sizeof(dxbc::BitcodeHeader); + Header.Bitcode.Size = SectionSize; + if (sys::IsBigEndianHost) + Header.swapBytes(); + W.write(ArrayRef<char>(reinterpret_cast<const char *>(&Header), + sizeof(dxbc::ProgramHeader))); + } + Asm.writeSectionData(W.OS, &Sec, Layout); + unsigned Size = W.OS.tell() - Start; + W.OS.write_zeros(offsetToAlignment(Size, Align(4))); + } + return 0; +} + +std::unique_ptr<MCObjectWriter> llvm::createDXContainerObjectWriter( + std::unique_ptr<MCDXContainerTargetWriter> MOTW, raw_pwrite_stream &OS) { + return std::make_unique<DXContainerObjectWriter>(std::move(MOTW), OS); +} diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp index aaa3b747682c..f0c61840e413 100644 --- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp +++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp @@ -30,7 +30,6 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <cstddef> #include <cstring> using namespace llvm; diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.h b/llvm/lib/MC/MCDisassembler/Disassembler.h index e5aab53a7613..3cb2479d388f 100644 --- a/llvm/lib/MC/MCDisassembler/Disassembler.h +++ b/llvm/lib/MC/MCDisassembler/Disassembler.h @@ -16,7 +16,7 @@ #ifndef LLVM_LIB_MC_MCDISASSEMBLER_DISASSEMBLER_H #define LLVM_LIB_MC_MCDISASSEMBLER_DISASSEMBLER_H -#include "llvm-c/Disassembler.h" +#include "llvm-c/DisassemblerTypes.h" #include "llvm/ADT/SmallString.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp index a58e8f6d9bcc..0c041186936d 100644 --- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp +++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp @@ -8,9 +8,6 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/raw_ostream.h" -#include using namespace llvm; @@ -25,11 +22,12 @@ MCDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address, bool IsBranch, - uint64_t Offset, + uint64_t Offset, uint64_t OpSize, uint64_t InstSize) const { if (Symbolizer) - return Symbolizer->tryAddingSymbolicOperand( - Inst, *CommentStream, Value, Address, IsBranch, Offset, InstSize); + return Symbolizer->tryAddingSymbolicOperand(Inst, *CommentStream, Value, + Address, IsBranch, Offset, + OpSize, InstSize); return false; } @@ -85,10 +83,11 @@ bool XCOFFSymbolInfo::operator<(const XCOFFSymbolInfo &SymInfo) const { return SymInfo.IsLabel; // Symbols with a StorageMappingClass have higher priority than those without.
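// Note (editorial, not from the upstream commit): this operator< builds the
// priority order used when choosing which XCOFF symbol to report for an
// address: labels outrank non-labels, then symbols carrying a storage
// mapping class outrank those without, and finally getSMCPriority breaks
// ties between two mapped symbols.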
- if (StorageMappingClass.hasValue() != SymInfo.StorageMappingClass.hasValue()) - return SymInfo.StorageMappingClass.hasValue(); + if (StorageMappingClass.has_value() != + SymInfo.StorageMappingClass.has_value()) + return SymInfo.StorageMappingClass.has_value(); - if (StorageMappingClass.hasValue()) { + if (StorageMappingClass) { return getSMCPriority(StorageMappingClass.getValue()) < getSMCPriority(SymInfo.StorageMappingClass.getValue()); } diff --git a/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp index 7befef86303c..e3f4cdd21557 100644 --- a/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp +++ b/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp @@ -31,19 +31,15 @@ class Triple; // is found an MCExpr is created with that, else an MCExpr with Value is // created. This function returns true if it adds an operand to the MCInst and // false otherwise. -bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, - raw_ostream &cStream, - int64_t Value, - uint64_t Address, - bool IsBranch, - uint64_t Offset, - uint64_t InstSize) { +bool MCExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &cStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t OpSize, uint64_t InstSize) { struct LLVMOpInfo1 SymbolicOp; std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); SymbolicOp.Value = Value; if (!GetOpInfo || - !GetOpInfo(DisInfo, Address, Offset, InstSize, 1, &SymbolicOp)) { + !GetOpInfo(DisInfo, Address, Offset, OpSize, InstSize, 1, &SymbolicOp)) { // Clear SymbolicOp.Value from above and also all other fields. std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); @@ -53,10 +49,10 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, // that always makes sense to guess. But in the case of an immediate it is // a bit more questionable if it is an address of a symbol or some other // reference. So if the immediate Value comes from a width of 1 byte, - // InstSize, we will not guess it is an address of a symbol. Because in + // OpSize, we will not guess it is an address of a symbol. Because in // object files assembled starting at address 0 this usually leads to // incorrect symbolication. - if (!SymbolLookUp || (InstSize == 1 && !IsBranch)) + if (!SymbolLookUp || (OpSize == 1 && !IsBranch)) return false; uint64_t ReferenceType; diff --git a/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp index 735be23206e4..137c44680080 100644 --- a/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp +++ b/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" -#include "llvm-c/Disassembler.h" +#include "llvm-c/DisassemblerTypes.h" #include "llvm/MC/TargetRegistry.h" using namespace llvm; diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp index 2cb5a000f88a..4cbb9981fde2 100644 --- a/llvm/lib/MC/MCDwarf.cpp +++ b/llvm/lib/MC/MCDwarf.cpp @@ -269,7 +269,7 @@ void MCDwarfLineTable::emit(MCStreamer *MCOS, MCDwarfLineTableParams Params) { LineStr = MCDwarfLineStr(context); // Switch to the section where the table will be emitted into. - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfLineSection()); // Handle the rest of the Compile Units. 
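// Note (editorial, not from the upstream commit): regarding the symbolizer
// change above, the new OpSize parameter separates the width of the operand
// field from the width of the whole instruction. On x86, "call rel32"
// (E8 xx xx xx xx) has InstSize == 5 but OpSize == 4, and a 1-byte immediate
// (OpSize == 1) is no longer guessed to be a symbol address, which the old
// code could only approximate with InstSize.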
for (const auto &CUIDTablePair : LineTables) { @@ -285,7 +285,7 @@ void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params, if (!HasSplitLineTable) return; Optional<MCDwarfLineStr> NoLineStr(None); - MCOS.SwitchSection(Section); + MCOS.switchSection(Section); MCOS.emitLabel(Header.Emit(&MCOS, Params, None, NoLineStr).second); } @@ -332,14 +332,20 @@ static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) { void MCDwarfLineStr::emitSection(MCStreamer *MCOS) { // Switch to the .debug_line_str section. - MCOS->SwitchSection( + MCOS->switchSection( MCOS->getContext().getObjectFileInfo()->getDwarfLineStrSection()); + SmallString<0> Data = getFinalizedData(); + MCOS->emitBinaryData(Data.str()); +} + +SmallString<0> MCDwarfLineStr::getFinalizedData() { // Emit the strings without perturbing the offsets we used. - LineStrings.finalizeInOrder(); + if (!LineStrings.isFinalized()) + LineStrings.finalizeInOrder(); SmallString<0> Data; Data.resize(LineStrings.getSize()); LineStrings.write((uint8_t *)Data.data()); - MCOS->emitBinaryData(Data.str()); + return Data; } void MCDwarfLineStr::emitRef(MCStreamer *MCOS, StringRef Path) { @@ -387,16 +393,14 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile, if (EmitMD5) { const MD5::MD5Result &Cksum = *DwarfFile.Checksum; MCOS->emitBinaryData( - StringRef(reinterpret_cast<const char *>(Cksum.Bytes.data()), - Cksum.Bytes.size())); + StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size())); } if (HasSource) { if (LineStr) - LineStr->emitRef(MCOS, DwarfFile.Source.getValueOr(StringRef())); + LineStr->emitRef(MCOS, DwarfFile.Source.value_or(StringRef())); else { - MCOS->emitBytes( - DwarfFile.Source.getValueOr(StringRef())); // Source and... - MCOS->emitBytes(StringRef("\0", 1)); // its null terminator. + MCOS->emitBytes(DwarfFile.Source.value_or(StringRef())); // Source and... + MCOS->emitBytes(StringRef("\0", 1)); // its null terminator. } } } @@ -583,7 +587,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory, // Keep track of whether any or all files have an MD5 checksum. // If any files have embedded source, they all must. if (MCDwarfFiles.empty()) { - trackMD5Usage(Checksum.hasValue()); + trackMD5Usage(Checksum.has_value()); HasSource = (Source != None); } if (DwarfVersion >= 5 && isRootFile(RootFile, Directory, FileName, Checksum)) @@ -646,7 +650,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory, File.Name = std::string(FileName); File.DirIndex = DirIndex; File.Checksum = Checksum; - trackMD5Usage(Checksum.hasValue()); + trackMD5Usage(Checksum.has_value()); File.Source = Source; if (Source) HasSource = true; @@ -764,7 +768,7 @@ static void EmitAbbrev(MCStreamer *MCOS, uint64_t Name, uint64_t Form) { // the data for .debug_abbrev section which contains three DIEs. static void EmitGenDwarfAbbrev(MCStreamer *MCOS) { MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); // DW_TAG_compile_unit DIE abbrev (1).
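// Note (editorial example, not from the upstream commit): the MCDwarfLineStr
// refactor above splits the finalize-and-copy step out of emitSection() so
// other emitters can obtain the raw .debug_line_str bytes, and the
// isFinalized() guard makes the call idempotent. A sketch, assuming an
// MCDwarfLineStr named LineStr:
//
//   SmallString<0> Bytes = LineStr.getFinalizedData(); // finalizes once
//   SmallString<0> Same = LineStr.getFinalizedData();  // safe second call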
MCOS->emitULEB128IntValue(1); @@ -817,7 +821,7 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS, auto &Sections = context.getGenDwarfSectionSyms(); - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfARangesSection()); unsigned UnitLengthBytes = dwarf::getUnitLengthFieldByteSize(context.getDwarfFormat()); @@ -896,7 +900,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const MCSymbol *RangesSymbol) { MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfInfoSection()); // Create a symbol at the start and end of this section used in here for the // expression to calculate the length in the header. @@ -1073,7 +1077,7 @@ static MCSymbol *emitGenDwarfRanges(MCStreamer *MCOS) { MCSymbol *RangesSymbol; if (MCOS->getContext().getDwarfVersion() >= 5) { - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRnglistsSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfRnglistsSection()); MCSymbol *EndSymbol = mcdwarf::emitListsTableHeaderStart(*MCOS); MCOS->AddComment("Offset entry count"); MCOS->emitInt32(0); @@ -1093,7 +1097,7 @@ static MCSymbol *emitGenDwarfRanges(MCStreamer *MCOS) { MCOS->emitInt8(dwarf::DW_RLE_end_of_list); MCOS->emitLabel(EndSymbol); } else { - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfRangesSection()); RangesSymbol = context.createTempSymbol("debug_ranges_start"); MCOS->emitLabel(RangesSymbol); for (MCSection *Sec : Sections) { @@ -1154,18 +1158,18 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { MCOS->getContext().getDwarfVersion() >= 3; CreateDwarfSectionSymbols |= UseRangesSection; - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfInfoSection()); if (CreateDwarfSectionSymbols) { InfoSectionSymbol = context.createTempSymbol(); MCOS->emitLabel(InfoSectionSymbol); } - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); if (CreateDwarfSectionSymbols) { AbbrevSectionSymbol = context.createTempSymbol(); MCOS->emitLabel(AbbrevSectionSymbol); } - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection()); + MCOS->switchSection(context.getObjectFileInfo()->getDwarfARangesSection()); // Output the data for .debug_aranges section. EmitGenDwarfAranges(MCOS, InfoSectionSymbol); @@ -1599,6 +1603,8 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) { Augmentation += "S"; if (Frame.IsBKeyFrame) Augmentation += "B"; + if (Frame.IsMTETaggedFrame) + Augmentation += "G"; Streamer.emitBytes(Augmentation); } Streamer.emitInt8(0); @@ -1835,8 +1841,6 @@ template <> struct DenseMapInfo { void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, bool IsEH) { - Streamer.generateCompactUnwindEncodings(MAB); - MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); const MCAsmInfo *AsmInfo = Context.getAsmInfo(); @@ -1846,11 +1850,12 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, // Emit the compact unwind info if available. 
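// Note (editorial, not from the upstream commit): generateCompactUnwindEncodings
// used to run unconditionally at the top of MCDwarfFrameEmitter::Emit; as the
// next hunk shows, it is now invoked only when the object format actually has
// a compact-unwind section, so formats without one skip that pass entirely.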
bool NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); if (IsEH && MOFI->getCompactUnwindSection()) { + Streamer.generateCompactUnwindEncodings(MAB); bool SectionEmitted = false; for (const MCDwarfFrameInfo &Frame : FrameArray) { if (Frame.CompactUnwindEncoding == 0) continue; if (!SectionEmitted) { - Streamer.SwitchSection(MOFI->getCompactUnwindSection()); + Streamer.switchSection(MOFI->getCompactUnwindSection()); Streamer.emitValueToAlignment(AsmInfo->getCodePointerSize()); SectionEmitted = true; } @@ -1867,7 +1872,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, IsEH ? *const_cast(MOFI)->getEHFrameSection() : *MOFI->getDwarfFrameSection(); - Streamer.SwitchSection(&Section); + Streamer.switchSection(&Section); MCSymbol *SectionStart = Context.createTempSymbol(); Streamer.emitLabel(SectionStart); diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index fbf3c860368a..ca7f28e1386e 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -90,11 +90,11 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF, void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) { MCContext &Ctx = getContext(); - SwitchSection(Ctx.getObjectFileInfo()->getTextSection()); + switchSection(Ctx.getObjectFileInfo()->getTextSection()); emitCodeAlignment(Ctx.getObjectFileInfo()->getTextSectionAlignment(), &STI); if (NoExecStack) - SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); + switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); } void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) { @@ -215,6 +215,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) { case MCSA_WeakDefAutoPrivate: case MCSA_Invalid: case MCSA_IndirectSymbol: + case MCSA_Exported: return false; case MCSA_NoDeadStrip: @@ -317,13 +318,13 @@ void MCELFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, MCSection &Section = *getAssembler().getContext().getELFSection( ".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); MCSectionSubPair P = getCurrentSection(); - SwitchSection(&Section); + switchSection(&Section); emitValueToAlignment(ByteAlignment, 0, 1, 0); emitLabel(Symbol); emitZeros(Size); - SwitchSection(P.first, P.second); + switchSection(P.first, P.second); } else { if(Symbol->declareCommon(Size, ByteAlignment)) report_fatal_error(Twine("Symbol: ") + Symbol->getName() + @@ -381,15 +382,15 @@ void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From, void MCELFStreamer::emitIdent(StringRef IdentString) { MCSection *Comment = getAssembler().getContext().getELFSection( ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1); - PushSection(); - SwitchSection(Comment); + pushSection(); + switchSection(Comment); if (!SeenIdent) { emitInt8(0); SeenIdent = true; } emitBytes(IdentString); emitInt8(0); - PopSection(); + popSection(); } void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) { @@ -511,8 +512,8 @@ void MCELFStreamer::finalizeCGProfile() { MCSection *CGProfile = getAssembler().getContext().getELFSection( ".llvm.call-graph-profile", ELF::SHT_LLVM_CALL_GRAPH_PROFILE, ELF::SHF_EXCLUDE, /*sizeof(Elf_CGProfile_Impl<>)=*/8); - PushSection(); - SwitchSection(CGProfile); + pushSection(); + switchSection(CGProfile); uint64_t Offset = 0; for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) { finalizeCGProfileEntry(E.From, Offset); @@ -520,7 +521,7 @@ void MCELFStreamer::finalizeCGProfile() { emitIntValue(E.Count, 
sizeof(uint64_t)); Offset += sizeof(uint64_t); } - PopSection(); + popSection(); } void MCELFStreamer::emitInstToFragment(const MCInst &Inst, @@ -832,10 +833,10 @@ void MCELFStreamer::createAttributesSection( // Switch section to AttributeSection or get/create the section. if (AttributeSection) { - SwitchSection(AttributeSection); + switchSection(AttributeSection); } else { AttributeSection = getContext().getELFSection(Section, Type, 0); - SwitchSection(AttributeSection); + switchSection(AttributeSection); // Format version emitInt8(0x41); diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 10d494b5ac61..45a3d938257a 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -8,7 +8,6 @@ #include "llvm/MC/MCExpr.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Config/llvm-config.h" #include "llvm/MC/MCAsmBackend.h" @@ -76,8 +75,9 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const { const MCSymbol &Sym = SRE.getSymbol(); // Parenthesize names that start with $ so that they don't look like // absolute names. - bool UseParens = - !InParens && !Sym.getName().empty() && Sym.getName()[0] == '$'; + bool UseParens = MAI && MAI->useParensForDollarSignNames() && !InParens && + !Sym.getName().empty() && Sym.getName()[0] == '$'; + if (UseParens) { OS << '('; Sym.print(OS, MAI); diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 4634de863b2f..4e6459c5d6e4 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -376,7 +376,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { if (AF->hasEmitNops()) OS << " (emit nops)"; OS << "\n "; - OS << " Alignment:" << AF->getAlignment() + OS << " Alignment:" << AF->getAlignment().value() << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize() << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; break; diff --git a/llvm/lib/MC/MCInstPrinter.cpp b/llvm/lib/MC/MCInstPrinter.cpp index 7ce92b968f47..843afe359529 100644 --- a/llvm/lib/MC/MCInstPrinter.cpp +++ b/llvm/lib/MC/MCInstPrinter.cpp @@ -12,6 +12,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" diff --git a/llvm/lib/MC/MCInstrAnalysis.cpp b/llvm/lib/MC/MCInstrAnalysis.cpp index 4ed1c6286a72..85434b15bb5e 100644 --- a/llvm/lib/MC/MCInstrAnalysis.cpp +++ b/llvm/lib/MC/MCInstrAnalysis.cpp @@ -9,11 +9,12 @@ #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/ADT/APInt.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrInfo.h" #include <cstdint> +namespace llvm { +class MCSubtargetInfo; +} + using namespace llvm; bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, diff --git a/llvm/lib/MC/MCInstrDesc.cpp b/llvm/lib/MC/MCInstrDesc.cpp index b5c43f5edc0d..49a4a2cb546a 100644 --- a/llvm/lib/MC/MCInstrDesc.cpp +++ b/llvm/lib/MC/MCInstrDesc.cpp @@ -14,7 +14,6 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 88aeeb980738..9f22b9b0a866 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -10,7 +10,6 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" @@ -19,17 +18,16 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCFragment.h" -#include "llvm/MC/MCInst.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -37,6 +35,13 @@ #include <cassert> #include <vector> +namespace llvm { +class MCInst; +class MCStreamer; +class MCSubtargetInfo; +class Triple; +} // namespace llvm + using namespace llvm; namespace { @@ -126,6 +131,7 @@ public: void finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE); void finalizeCGProfile(); + void createAddrSigSection(); }; } // end anonymous namespace. @@ -353,6 +359,7 @@ bool MCMachOStreamer::emitSymbolAttribute(MCSymbol *Sym, case MCSA_Weak: case MCSA_Local: case MCSA_LGlobal: + case MCSA_Exported: return false; case MCSA_Global: @@ -455,8 +462,8 @@ void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, // section. } - PushSection(); - SwitchSection(Section); + pushSection(); + switchSection(Section); // The symbol may not be present, which only creates the section. if (Symbol) { @@ -464,7 +471,7 @@ void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, emitLabel(Symbol); emitZeros(Size); } - PopSection(); + popSection(); } // This should always be called with the thread local bss section. Like the @@ -524,6 +531,7 @@ void MCMachOStreamer::finishImpl() { finalizeCGProfile(); + createAddrSigSection(); this->MCObjectStreamer::finishImpl(); } @@ -574,3 +582,16 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, S->getAssembler().setRelaxAll(true); return S; } + +// Create the AddrSig section and first data fragment here as its layout needs +// to be computed immediately after in order for it to be exported correctly.
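// Note (editorial, not from the upstream commit): the new
// __DATA,__llvm_addrsig section is registered with exactly one empty data
// fragment; per the comment above, it must already exist before
// MCObjectStreamer::finishImpl() runs so that layout can account for it and
// the object writer can fill in the address-significance table.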
+void MCMachOStreamer::createAddrSigSection() { + MCAssembler &Asm = getAssembler(); + MCObjectWriter &writer = Asm.getWriter(); + if (!writer.getEmitAddrsigSection()) + return; + MCSection *AddrSigSection = + Asm.getContext().getObjectFileInfo()->getAddrSigSection(); + Asm.registerSection(*AddrSigSection); + new MCDataFragment(AddrSigSection); +} diff --git a/llvm/lib/MC/MCNullStreamer.cpp b/llvm/lib/MC/MCNullStreamer.cpp index 40b7eba58b03..83e8962451d5 100644 --- a/llvm/lib/MC/MCNullStreamer.cpp +++ b/llvm/lib/MC/MCNullStreamer.cpp @@ -7,9 +7,15 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/SMLoc.h" +namespace llvm { +class MCContext; +class MCExpr; +class MCSection; +class MCSymbol; +} // namespace llvm using namespace llvm; @@ -36,10 +42,10 @@ namespace { uint64_t Size = 0, unsigned ByteAlignment = 0, SMLoc Loc = SMLoc()) override {} void emitGPRel32Value(const MCExpr *Value) override {} - void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} - void EmitCOFFSymbolStorageClass(int StorageClass) override {} - void EmitCOFFSymbolType(int Type) override {} - void EndCOFFSymbolDef() override {} + void beginCOFFSymbolDef(const MCSymbol *Symbol) override {} + void emitCOFFSymbolStorageClass(int StorageClass) override {} + void emitCOFFSymbolType(int Type) override {} + void endCOFFSymbolDef() override {} void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, MCSymbolAttr Linkage, MCSymbolAttr Visibility) override {} diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index b7890e7f0937..d6fe952c0c1d 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -16,11 +16,14 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionDXContainer.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionGOFF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionSPIRV.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSectionXCOFF.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -62,8 +65,18 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) { (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32)) SupportsCompactUnwindWithoutEHFrame = true; - if (T.isWatchABI()) + switch (Ctx->emitDwarfUnwindInfo()) { + case EmitDwarfUnwindType::Always: + OmitDwarfIfHaveCompactUnwind = false; + break; + case EmitDwarfUnwindType::NoCompactUnwind: OmitDwarfIfHaveCompactUnwind = true; + break; + case EmitDwarfUnwindType::Default: + OmitDwarfIfHaveCompactUnwind = + T.isWatchABI() || SupportsCompactUnwindWithoutEHFrame; + break; + } FDECFIEncoding = dwarf::DW_EH_PE_pcrel; @@ -180,6 +193,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) { MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, SectionKind::getMetadata()); + AddrSigSection = Ctx->getMachOSection("__DATA", "__llvm_addrsig", 0, + SectionKind::getData()); + // Exception Handling. 
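// Note (editorial, not from the upstream commit): the emitDwarfUnwindInfo()
// switch shown above gives Mach-O three policies. Always keeps DWARF CFI even
// when a compact-unwind encoding exists; NoCompactUnwind emits DWARF only for
// frames that compact unwind cannot describe; Default preserves the old
// heuristic (omit DWARF on watchOS and on targets that support compact unwind
// without an __eh_frame section). The value comes from
// MCContext::emitDwarfUnwindInfo(), i.e. from MCTargetOptions::EmitDwarfUnwind;
// how a driver surfaces that option is an assumption outside this patch.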
LSDASection = Ctx->getMachOSection("__TEXT", "__gcc_except_tab", 0, SectionKind::getReadOnlyWithRel()); @@ -518,8 +534,13 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { } void MCObjectFileInfo::initGOFFMCObjectFileInfo(const Triple &T) { - TextSection = Ctx->getGOFFSection(".text", SectionKind::getText()); - BSSSection = Ctx->getGOFFSection(".bss", SectionKind::getBSS()); + TextSection = + Ctx->getGOFFSection(".text", SectionKind::getText(), nullptr, nullptr); + BSSSection = + Ctx->getGOFFSection(".bss", SectionKind::getBSS(), nullptr, nullptr); + PPA1Section = + Ctx->getGOFFSection(".ppa1", SectionKind::getMetadata(), TextSection, + MCConstantExpr::create(GOFF::SK_PPA1, *Ctx)); } void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { @@ -554,8 +575,9 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { ".rdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); - if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::aarch64) { - // On Windows 64 with SEH, the LSDA is emitted into the .xdata section + if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::aarch64 || + T.getArch() == Triple::arm || T.getArch() == Triple::thumb) { + // On Windows with SEH, the LSDA is emitted into the .xdata section LSDASection = nullptr; } else { LSDASection = Ctx->getCOFFSection(".gcc_except_table", @@ -803,6 +825,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { SectionKind::getReadOnly()); } +void MCObjectFileInfo::initSPIRVMCObjectFileInfo(const Triple &T) { + // Put everything in a single binary section. + TextSection = Ctx->getSPIRVSection(); +} + void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) { TextSection = Ctx->getWasmSection(".text", SectionKind::getText()); DataSection = Ctx->getWasmSection(".data", SectionKind::getData()); @@ -993,7 +1020,12 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) { /* MultiSymbolsAllowed */ true, ".dwmac", XCOFF::SSUBTYP_DWMAC); } -MCObjectFileInfo::~MCObjectFileInfo() {} +void MCObjectFileInfo::initDXContainerObjectFileInfo(const Triple &T) { + // At the moment the DXBC section should end up empty. 
+ TextSection = Ctx->getDXContainerSection("DXBC", SectionKind::getText()); +} + +MCObjectFileInfo::~MCObjectFileInfo() = default; void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC, bool LargeCodeModel) { @@ -1031,12 +1063,18 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC, case MCContext::IsGOFF: initGOFFMCObjectFileInfo(TheTriple); break; + case MCContext::IsSPIRV: + initSPIRVMCObjectFileInfo(TheTriple); + break; case MCContext::IsWasm: initWasmMCObjectFileInfo(TheTriple); break; case MCContext::IsXCOFF: initXCOFFMCObjectFileInfo(TheTriple); break; + case MCContext::IsDXContainer: + initDXContainerObjectFileInfo(TheTriple); + break; } } @@ -1052,7 +1090,9 @@ MCSection *MCObjectFileInfo::getDwarfComdatSection(const char *Name, case Triple::MachO: case Triple::COFF: case Triple::GOFF: + case Triple::SPIRV: case Triple::XCOFF: + case Triple::DXContainer: case Triple::UnknownObjectFormat: report_fatal_error("Cannot get DWARF comdat section for this object file " "format: not implemented."); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index ebbbd6ad4e16..0c4ed201a0c5 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCObjectStreamer.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" @@ -37,7 +36,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, setAllowAutoPadding(Assembler->getBackend().allowAutoPadding()); } -MCObjectStreamer::~MCObjectStreamer() {} +MCObjectStreamer::~MCObjectStreamer() = default; // AssemblerPtr is used for evaluation of expressions and causes // difference between asm and object outputs. Return nullptr to in @@ -561,7 +560,7 @@ void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section, // Switch back the dwarf line section, in case endSection had to switch the // section. MCContext &Ctx = getContext(); - SwitchSection(Ctx.getObjectFileInfo()->getDwarfLineSection()); + switchSection(Ctx.getObjectFileInfo()->getDwarfLineSection()); const MCAsmInfo *AsmInfo = Ctx.getAsmInfo(); emitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd, @@ -648,7 +647,8 @@ void MCObjectStreamer::emitValueToAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit) { if (MaxBytesToEmit == 0) MaxBytesToEmit = ByteAlignment; - insert(new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit)); + insert(new MCAlignFragment(Align(ByteAlignment), Value, ValueSize, + MaxBytesToEmit)); // Update the maximum alignment on the current section if necessary. 
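// Note (editorial example, not from the upstream commit): MCAlignFragment now
// stores an llvm::Align instead of a raw unsigned. Align's constructor
// asserts the value is a nonzero power of two, so a bogus byte alignment is
// caught when the fragment is built rather than surfacing later as a bad
// layout:
//
//   insert(new MCAlignFragment(Align(16), 0, 1, 0)); // OK
//   // Align(12) would assert: 12 is not a power of two.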
MCSection *CurSec = getCurrentSectionOnly(); @@ -796,7 +796,7 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, SMLoc Loc, const MCSubtargetInfo &STI) { Optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name); - if (!MaybeKind.hasValue()) + if (!MaybeKind) return std::make_pair(true, std::string("unknown relocation name")); MCFixupKind Kind = *MaybeKind; diff --git a/llvm/lib/MC/MCObjectWriter.cpp b/llvm/lib/MC/MCObjectWriter.cpp index a058bbe0ba0b..89ff5800da5b 100644 --- a/llvm/lib/MC/MCObjectWriter.cpp +++ b/llvm/lib/MC/MCObjectWriter.cpp @@ -7,10 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFragment.h" #include "llvm/MC/MCSymbol.h" +namespace llvm { +class MCSection; +} using namespace llvm; diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index bf9b9e916d6f..c3bc3bff6fa2 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -251,12 +251,12 @@ AsmToken AsmLexer::LexLineComment() { } static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { - // Skip ULL, UL, U, L and LL suffices. - if (CurPtr[0] == 'U') + // Skip case-insensitive ULL, UL, U, L and LL suffixes. + if (CurPtr[0] == 'U' || CurPtr[0] == 'u') ++CurPtr; - if (CurPtr[0] == 'L') + if (CurPtr[0] == 'L' || CurPtr[0] == 'l') ++CurPtr; - if (CurPtr[0] == 'L') + if (CurPtr[0] == 'L' || CurPtr[0] == 'l') ++CurPtr; } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 0cea491f227d..ccc8e80e76ff 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -33,7 +33,6 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/AsmCond.h" #include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/MC/MCParser/MCAsmLexer.h" @@ -541,6 +540,7 @@ private: DK_PSEUDO_PROBE, DK_LTO_DISCARD, DK_LTO_SET_CONDITIONAL, + DK_CFI_MTE_TAGGED_FRAME, DK_END }; @@ -793,12 +793,19 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out, case MCContext::IsGOFF: PlatformParser.reset(createGOFFAsmParser()); break; + case MCContext::IsSPIRV: + report_fatal_error( + "Need to implement createSPIRVAsmParser for SPIRV format."); + break; case MCContext::IsWasm: PlatformParser.reset(createWasmAsmParser()); break; case MCContext::IsXCOFF: PlatformParser.reset(createXCOFFAsmParser()); break; + case MCContext::IsDXContainer: + llvm_unreachable("DXContainer is not supported yet"); + break; } PlatformParser->Initialize(*this); @@ -1067,7 +1074,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { if (auto *TS = Out.getTargetStreamer()) TS->emitConstantPools(); - Out.Finish(Lexer.getLoc()); + Out.finish(Lexer.getLoc()); } return HadError || getContext().hadError(); @@ -1780,7 +1787,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // if this is a line comment we can drop it safely if (getTok().getString().empty() || getTok().getString().front() == '\r' || getTok().getString().front() == '\n') - Out.AddBlankLine(); + Out.addBlankLine(); Lex(); return false; } @@ -1937,7 +1944,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, } // Consume any end of statement token, if present, to avoid spurious // addBlankLine calls().
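// Note (editorial, not from the upstream commit): with the case-insensitive
// suffix skipping shown earlier in AsmLexer.cpp, integer literals such as
// "123ull", "123ULL", or "0x1fUl" all lex like plain "123"/"0x1f": at most
// one u/U and then up to two l/L characters are consumed and ignored.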
   if (getTok().is(AsmToken::EndOfStatement)) {
     Lex();
   }
@@ -3445,10 +3452,14 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
     // up to one.
     if (Alignment == 0)
       Alignment = 1;
-    if (!isPowerOf2_64(Alignment))
+    else if (!isPowerOf2_64(Alignment)) {
       ReturnVal |= Error(AlignmentLoc, "alignment must be a power of 2");
-    if (!isUInt<32>(Alignment))
+      Alignment = PowerOf2Floor(Alignment);
+    }
+    if (!isUInt<32>(Alignment)) {
       ReturnVal |= Error(AlignmentLoc, "alignment must be smaller than 2**32");
+      Alignment = 1u << 31;
+    }
   }
 
   // Diagnose non-sensical max bytes to align.
@@ -3471,9 +3482,9 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   // directive.
   const MCSection *Section = getStreamer().getCurrentSectionOnly();
   assert(Section && "must have section to emit alignment");
-  bool UseCodeAlign = Section->UseCodeAlign();
+  bool useCodeAlign = Section->useCodeAlign();
   if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
-      ValueSize == 1 && UseCodeAlign) {
+      ValueSize == 1 && useCodeAlign) {
     getStreamer().emitCodeAlignment(Alignment, &getTargetParser().getSTI(),
                                     MaxBytesToFill);
   } else {
@@ -3571,8 +3582,8 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
   if (HasMD5) {
     MD5::MD5Result Sum;
     for (unsigned i = 0; i != 8; ++i) {
-      Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
-      Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+      Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+      Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
     }
     CKMem = Sum;
   }
@@ -3743,8 +3754,7 @@ bool AsmParser::parseDirectiveCVFile() {
         parseEscapedString(Checksum) ||
         parseIntToken(ChecksumKind,
                       "expected checksum kind in '.cv_file' directive") ||
-        parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in '.cv_file' directive"))
+        parseEOL())
       return true;
   }
 
@@ -3754,7 +3764,7 @@ bool AsmParser::parseDirectiveCVFile() {
   ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
                                     Checksum.size());
 
-  if (!getStreamer().EmitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
+  if (!getStreamer().emitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
                                          static_cast<uint8_t>(ChecksumKind)))
     return Error(FileNumberLoc, "file number already allocated");
 
@@ -3790,12 +3800,10 @@ bool AsmParser::parseDirectiveCVFuncId() {
   SMLoc FunctionIdLoc = getTok().getLoc();
   int64_t FunctionId;
 
-  if (parseCVFunctionId(FunctionId, ".cv_func_id") ||
-      parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.cv_func_id' directive"))
+  if (parseCVFunctionId(FunctionId, ".cv_func_id") || parseEOL())
     return true;
 
-  if (!getStreamer().EmitCVFuncIdDirective(FunctionId))
+  if (!getStreamer().emitCVFuncIdDirective(FunctionId))
     return Error(FunctionIdLoc, "function id already allocated");
 
   return false;
@@ -3851,11 +3859,10 @@ bool AsmParser::parseDirectiveCVInlineSiteId() {
     Lex();
   }
 
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.cv_inline_site_id' directive"))
+  if (parseEOL())
     return true;
 
-  if (!getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
+  if (!getStreamer().emitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
                                                  IALine, IACol, FunctionIdLoc))
     return Error(FunctionIdLoc, "function id already allocated");
 
@@ -3976,7 +3983,7 @@ bool AsmParser::parseDirectiveCVInlineLinetable() {
             "expected identifier in directive"))
     return true;
 
-  if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+  if (parseEOL())
     return true;
 
   MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
@@ -4137,7 +4144,7 @@ bool AsmParser::parseDirectiveCVFileChecksumOffset() {
   int64_t FileNo;
   if (parseIntToken(FileNo, "expected identifier in directive"))
     return true;
-  if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+  if (parseEOL())
     return true;
   getStreamer().emitCVFileChecksumOffsetDirective(FileNo);
   return false;
@@ -4153,7 +4160,7 @@ bool AsmParser::parseDirectiveCVFPOData() {
   if (parseEOL())
     return true;
   MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
-  getStreamer().EmitCVFPOData(ProcSym, DirLoc);
+  getStreamer().emitCVFPOData(ProcSym, DirLoc);
   return false;
 }
 
@@ -5550,6 +5557,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
   DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
   DirectiveKindMap[".cfi_b_key_frame"] = DK_CFI_B_KEY_FRAME;
+  DirectiveKindMap[".cfi_mte_tagged_frame"] = DK_CFI_MTE_TAGGED_FRAME;
   DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
   DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
   DirectiveKindMap[".macro"] = DK_MACRO;
@@ -6022,22 +6030,25 @@ bool AsmParser::parseMSInlineAsm(
     }
 
     bool isOutput = (i == 1) && Desc.mayStore();
+    bool Restricted = Operand.isMemUseUpRegs();
     SMLoc Start = SMLoc::getFromPointer(SymName.data());
-    int64_t Size = Operand.isMemPlaceholder(Desc) ? 0 : SymName.size();
     if (isOutput) {
       ++InputIdx;
       OutputDecls.push_back(OpDecl);
       OutputDeclsAddressOf.push_back(Operand.needAddressOf());
       OutputConstraints.push_back(("=" + Constraint).str());
-      AsmStrRewrites.emplace_back(AOK_Output, Start, Size);
+      AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size(), 0,
+                                  Restricted);
     } else {
       InputDecls.push_back(OpDecl);
       InputDeclsAddressOf.push_back(Operand.needAddressOf());
       InputConstraints.push_back(Constraint.str());
       if (Desc.OpInfo[i - 1].isBranchTarget())
-        AsmStrRewrites.emplace_back(AOK_CallInput, Start, SymName.size());
+        AsmStrRewrites.emplace_back(AOK_CallInput, Start, SymName.size(), 0,
+                                    Restricted);
       else
-        AsmStrRewrites.emplace_back(AOK_Input, Start, Size);
+        AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size(), 0,
+                                    Restricted);
     }
   }
 
@@ -6152,17 +6163,19 @@ bool AsmParser::parseMSInlineAsm(
       OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
       break;
     case AOK_Input:
-      if (AR.Len)
-        OS << '$' << InputIdx;
-      ++InputIdx;
+      if (AR.IntelExpRestricted)
+        OS << "${" << InputIdx++ << ":P}";
+      else
+        OS << '$' << InputIdx++;
       break;
     case AOK_CallInput:
      OS << "${" << InputIdx++ << ":P}";
       break;
     case AOK_Output:
-      if (AR.Len)
-        OS << '$' << OutputIdx;
-      ++OutputIdx;
+      if (AR.IntelExpRestricted)
+        OS << "${" << OutputIdx++ << ":P}";
+      else
+        OS << '$' << OutputIdx++;
       break;
     case AOK_SizeDirective:
       switch (AR.Val) {
@@ -6299,7 +6312,7 @@ bool HLASMAsmParser::parseStatement(ParseStatementInfo &Info,
     // if this is a line comment we can drop it safely
     if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
         getTok().getString().front() == '\n')
-      Out.AddBlankLine();
+      Out.addBlankLine();
     Lex();
     return false;
   }
@@ -6315,7 +6328,7 @@ bool HLASMAsmParser::parseStatement(ParseStatementInfo &Info,
   if (Lexer.is(AsmToken::EndOfStatement)) {
     if (getTok().getString().front() == '\n' ||
         getTok().getString().front() == '\r') {
-      Out.AddBlankLine();
+      Out.addBlankLine();
       Lex();
       return false;
     }
diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index 0077c91cfdbd..b78595f5bab4 100644
--- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -13,11 +13,8 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/SectionKind.h"
@@ -322,7 +319,7 @@ bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
     return TokError("unexpected token in section switching directive");
   Lex();
 
-  getStreamer().SwitchSection(getContext().getCOFFSection(
+  getStreamer().switchSection(getContext().getCOFFSection(
       Section, Characteristics, Kind, COMDATSymName, Type));
 
   return false;
@@ -419,7 +416,7 @@ bool COFFAsmParser::ParseDirectiveDef(StringRef, SMLoc) {
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
 
-  getStreamer().BeginCOFFSymbolDef(Sym);
+  getStreamer().beginCOFFSymbolDef(Sym);
 
   Lex();
   return false;
@@ -434,7 +431,7 @@ bool COFFAsmParser::ParseDirectiveScl(StringRef, SMLoc) {
     return TokError("unexpected token in directive");
 
   Lex();
-  getStreamer().EmitCOFFSymbolStorageClass(SymbolStorageClass);
+  getStreamer().emitCOFFSymbolStorageClass(SymbolStorageClass);
   return false;
 }
 
@@ -447,13 +444,13 @@ bool COFFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
     return TokError("unexpected token in directive");
 
   Lex();
-  getStreamer().EmitCOFFSymbolType(Type);
+  getStreamer().emitCOFFSymbolType(Type);
   return false;
 }
 
 bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) {
   Lex();
-  getStreamer().EndCOFFSymbolDef();
+  getStreamer().endCOFFSymbolDef();
   return false;
 }
 
@@ -482,7 +479,7 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitCOFFSecRel32(Symbol, Offset);
+  getStreamer().emitCOFFSecRel32(Symbol, Offset);
   return false;
 }
 
@@ -508,7 +505,7 @@ bool COFFAsmParser::ParseDirectiveRVA(StringRef, SMLoc) {
 
     MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
-    getStreamer().EmitCOFFImgRel32(Symbol, Offset);
+    getStreamer().emitCOFFImgRel32(Symbol, Offset);
     return false;
   };
 
@@ -528,7 +525,7 @@ bool COFFAsmParser::ParseDirectiveSafeSEH(StringRef, SMLoc) {
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitCOFFSafeSEH(Symbol);
+  getStreamer().emitCOFFSafeSEH(Symbol);
   return false;
 }
 
@@ -543,7 +540,7 @@ bool COFFAsmParser::ParseDirectiveSecIdx(StringRef, SMLoc) {
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitCOFFSectionIndex(Symbol);
+  getStreamer().emitCOFFSectionIndex(Symbol);
   return false;
 }
 
@@ -558,7 +555,7 @@ bool COFFAsmParser::ParseDirectiveSymIdx(StringRef, SMLoc) {
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitCOFFSymbolIndex(Symbol);
+  getStreamer().emitCOFFSymbolIndex(Symbol);
   return false;
 }
 
@@ -621,31 +618,31 @@ bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc Loc) {
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitWinCFIStartProc(Symbol, Loc);
+  getStreamer().emitWinCFIStartProc(Symbol, Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinCFIEndProc(Loc);
+  getStreamer().emitWinCFIEndProc(Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveEndFuncletOrFunc(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinCFIFuncletOrFuncEnd(Loc);
+  getStreamer().emitWinCFIFuncletOrFuncEnd(Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinCFIStartChained(Loc);
+  getStreamer().emitWinCFIStartChained(Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinCFIEndChained(Loc);
+  getStreamer().emitWinCFIEndChained(Loc);
   return false;
 }
 
@@ -671,13 +668,13 @@ bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc Loc) {
   MCSymbol *handler = getContext().getOrCreateSymbol(SymbolID);
 
   Lex();
-  getStreamer().EmitWinEHHandler(handler, unwind, except, Loc);
+  getStreamer().emitWinEHHandler(handler, unwind, except, Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinEHHandlerData();
+  getStreamer().emitWinEHHandlerData();
   return false;
 }
 
@@ -690,20 +687,20 @@ bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc Loc) {
     return TokError("unexpected token in directive");
 
   Lex();
-  getStreamer().EmitWinCFIAllocStack(Size, Loc);
+  getStreamer().emitWinCFIAllocStack(Size, Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc Loc) {
   Lex();
-  getStreamer().EmitWinCFIEndProlog(Loc);
+  getStreamer().emitWinCFIEndProlog(Loc);
   return false;
 }
 
 bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
   StringRef identifier;
-  if (getLexer().isNot(AsmToken::At))
-    return TokError("a handler attribute must begin with '@'");
+  if (getLexer().isNot(AsmToken::At) && getLexer().isNot(AsmToken::Percent))
+    return TokError("a handler attribute must begin with '@' or '%'");
   SMLoc startLoc = getLexer().getLoc();
   Lex();
   if (getParser().parseIdentifier(identifier))
diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
index 9da880f3b2ea..c5fedef40782 100644
--- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
@@ -7,25 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCParser/MCAsmParserUtils.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/SMLoc.h"
-#include
 #include
-#include
 #include
 
 using namespace llvm;
 
@@ -245,7 +238,7 @@ bool COFFMasmParser::ParseSectionSwitch(StringRef Section,
     return TokError("unexpected token in section switching directive");
   Lex();
 
-  getStreamer().SwitchSection(getContext().getCOFFSection(
+  getStreamer().switchSection(getContext().getCOFFSection(
      Section, Characteristics, Kind, COMDATSymName, Type));
 
   return false;
@@ -273,7 +266,7 @@ bool COFFMasmParser::ParseDirectiveSegment(StringRef Directive, SMLoc Loc) {
                COFF::IMAGE_SCN_MEM_READ;
   }
   SectionKind Kind = computeSectionKind(Flags);
-  getStreamer().SwitchSection(getContext().getCOFFSection(
+  getStreamer().switchSection(getContext().getCOFFSection(
       SectionName, Flags, Kind, "", (COFF::COMDATType)(0)));
   return false;
 }
@@ -300,13 +293,13 @@ bool COFFMasmParser::ParseDirectiveIncludelib(StringRef Directive, SMLoc Loc) {
   unsigned Flags = COFF::IMAGE_SCN_MEM_PRELOAD | COFF::IMAGE_SCN_MEM_16BIT;
   SectionKind Kind = computeSectionKind(Flags);
-  getStreamer().PushSection();
-  getStreamer().SwitchSection(getContext().getCOFFSection(
+  getStreamer().pushSection();
+  getStreamer().switchSection(getContext().getCOFFSection(
       ".drectve", Flags, Kind, "", (COFF::COMDATType)(0)));
   getStreamer().emitBytes("/DEFAULTLIB:");
   getStreamer().emitBytes(Lib);
   getStreamer().emitBytes(" ");
-  getStreamer().PopSection();
+  getStreamer().popSection();
   return false;
 }
 
@@ -343,7 +336,7 @@ bool COFFMasmParser::ParseDirectiveProc(StringRef Directive, SMLoc Loc) {
       getTok().getString().equals_insensitive("frame")) {
     Lex();
     Framed = true;
-    getStreamer().EmitWinCFIStartProc(Sym, Loc);
+    getStreamer().emitWinCFIStartProc(Sym, Loc);
   }
   getStreamer().emitLabel(Sym, Loc);
 
@@ -364,7 +357,7 @@ bool COFFMasmParser::ParseDirectiveEndProc(StringRef Directive, SMLoc Loc) {
                  CurrentProcedure + "'");
 
   if (CurrentProcedureFramed) {
-    getStreamer().EmitWinCFIEndProc(Loc);
+    getStreamer().emitWinCFIEndProc(Loc);
   }
   CurrentProcedure = "";
   CurrentProcedureFramed = false;
@@ -398,13 +391,13 @@ bool COFFMasmParser::ParseSEHDirectiveAllocStack(StringRef Directive,
     return Error(SizeLoc, "expected integer size");
   if (Size % 8 != 0)
     return Error(SizeLoc, "stack size must be a multiple of 8");
-  getStreamer().EmitWinCFIAllocStack(static_cast<unsigned>(Size), Loc);
+  getStreamer().emitWinCFIAllocStack(static_cast<unsigned>(Size), Loc);
   return false;
 }
 
 bool COFFMasmParser::ParseSEHDirectiveEndProlog(StringRef Directive,
                                                 SMLoc Loc) {
-  getStreamer().EmitWinCFIEndProlog(Loc);
+  getStreamer().emitWinCFIEndProlog(Loc);
   return false;
 }
 
diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index 308b3842c61e..bc59531eecb8 100644
--- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -15,7 +14,6 @@
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
@@ -29,7 +27,6 @@
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
-#include
 #include
 #include
 #include
@@ -483,7 +480,7 @@ bool DarwinAsmParser::parseSectionSwitch(StringRef Segment, StringRef Section,
   // FIXME: Arch specific.
   bool isText = TAA & MachO::S_ATTR_PURE_INSTRUCTIONS;
-  getStreamer().SwitchSection(getContext().getMachOSection(
+  getStreamer().switchSection(getContext().getMachOSection(
       Segment, Section, TAA, StubSize,
       isText ? SectionKind::getText() : SectionKind::getData()));
 
@@ -722,7 +719,7 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
   // FIXME: Arch specific.
   bool isText = Segment == "__TEXT"; // FIXME: Hack.
-  getStreamer().SwitchSection(getContext().getMachOSection(
+  getStreamer().switchSection(getContext().getMachOSection(
       Segment, Section, TAA, StubSize,
       isText ? SectionKind::getText() : SectionKind::getData()));
 
   return false;
@@ -731,10 +728,10 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
 
 /// ParseDirectivePushSection:
 ///   ::= .pushsection identifier (',' identifier)*
 bool DarwinAsmParser::parseDirectivePushSection(StringRef S, SMLoc Loc) {
-  getStreamer().PushSection();
+  getStreamer().pushSection();
 
   if (parseDirectiveSection(S, Loc)) {
-    getStreamer().PopSection();
+    getStreamer().popSection();
     return true;
   }
 
@@ -744,7 +741,7 @@ bool DarwinAsmParser::parseDirectivePushSection(StringRef S, SMLoc Loc) {
 
 /// ParseDirectivePopSection:
 ///   ::= .popsection
 bool DarwinAsmParser::parseDirectivePopSection(StringRef, SMLoc) {
-  if (!getStreamer().PopSection())
+  if (!getStreamer().popSection())
     return TokError(".popsection without corresponding .pushsection");
   return false;
 }
@@ -755,7 +752,7 @@ bool DarwinAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) {
   MCSectionSubPair PreviousSection = getStreamer().getPreviousSection();
   if (!PreviousSection.first)
     return TokError(".previous without corresponding .section");
-  getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second);
+  getStreamer().switchSection(PreviousSection.first, PreviousSection.second);
   return false;
 }
 
@@ -1152,11 +1149,12 @@ static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) {
   case MachO::PLATFORM_TVOS: return Triple::TvOS;
   case MachO::PLATFORM_WATCHOS: return Triple::WatchOS;
   case MachO::PLATFORM_BRIDGEOS: /* silence warning */ break;
+  case MachO::PLATFORM_DRIVERKIT:
+    return Triple::DriverKit;
   case MachO::PLATFORM_MACCATALYST: return Triple::IOS;
   case MachO::PLATFORM_IOSSIMULATOR: /* silence warning */ break;
   case MachO::PLATFORM_TVOSSIMULATOR: /* silence warning */ break;
   case MachO::PLATFORM_WATCHOSSIMULATOR: /* silence warning */ break;
-  case MachO::PLATFORM_DRIVERKIT: /* silence warning */ break;
   }
   llvm_unreachable("Invalid mach-o platform type");
 }
@@ -1175,6 +1173,7 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) {
     .Case("tvos", MachO::PLATFORM_TVOS)
     .Case("watchos", MachO::PLATFORM_WATCHOS)
     .Case("macCatalyst", MachO::PLATFORM_MACCATALYST)
+    .Case("driverkit", MachO::PLATFORM_DRIVERKIT)
     .Default(0);
   if (Platform == 0)
     return Error(PlatformLoc, "unknown platform name");
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index e814cf003656..04a234be3b47 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -12,11 +12,9 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -214,7 +212,7 @@ bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
   }
   Lex();
 
-  getStreamer().SwitchSection(getContext().getELFSection(Section, Type, Flags),
+  getStreamer().switchSection(getContext().getELFSection(Section, Type, Flags),
                               Subsection);
 
   return false;
@@ -284,7 +282,8 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
   return false;
 }
 
-static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
+static unsigned parseSectionFlags(const Triple &TT, StringRef flagsStr,
+                                  bool *UseLastGroup) {
   unsigned flags = 0;
 
   // If a valid numerical value is set for the section flag, use it verbatim
@@ -333,7 +332,10 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
       flags |= ELF::SHF_GROUP;
       break;
     case 'R':
-      flags |= ELF::SHF_GNU_RETAIN;
+      if (TT.isOSSolaris())
+        flags |= ELF::SHF_SUNW_NODISCARD;
+      else
+        flags |= ELF::SHF_GNU_RETAIN;
       break;
     case '?':
       *UseLastGroup = true;
      break;
@@ -377,10 +379,10 @@ unsigned ELFAsmParser::parseSunStyleSectionFlags() {
 
 bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) {
-  getStreamer().PushSection();
+  getStreamer().pushSection();
 
   if (ParseSectionArguments(/*IsPush=*/true, loc)) {
-    getStreamer().PopSection();
+    getStreamer().popSection();
     return true;
   }
 
@@ -388,7 +390,7 @@ bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) {
 }
 
 bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
-  if (!getStreamer().PopSection())
+  if (!getStreamer().popSection())
     return TokError(".popsection without corresponding .pushsection");
   return false;
 }
@@ -571,7 +573,8 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
     } else {
       StringRef FlagsStr = getTok().getStringContents();
       Lex();
-      extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup);
+      extraFlags = parseSectionFlags(getContext().getTargetTriple(), FlagsStr,
+                                     &UseLastGroup);
     }
 
     if (extraFlags == -1U)
@@ -675,7 +678,7 @@ EndStmt:
   MCSectionELF *Section =
       getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
                                  IsComdat, UniqueID, LinkedToSym);
-  getStreamer().SwitchSection(Section, Subsection);
+  getStreamer().switchSection(Section, Subsection);
   // Check that flags are used consistently. However, the GNU assembler permits
   // to leave out in subsequent uses of the same sections; for compatibility,
   // do likewise.
@@ -715,7 +718,7 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
   MCSectionSubPair PreviousSection = getStreamer().getPreviousSection();
   if (PreviousSection.first == nullptr)
     return TokError(".previous without corresponding .section");
-  getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second);
+  getStreamer().switchSection(PreviousSection.first, PreviousSection.second);
 
   return false;
 }
@@ -857,15 +860,15 @@ bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) {
   MCSection *Note = getContext().getELFSection(".note", ELF::SHT_NOTE, 0);
 
-  getStreamer().PushSection();
-  getStreamer().SwitchSection(Note);
+  getStreamer().pushSection();
+  getStreamer().switchSection(Note);
   getStreamer().emitInt32(Data.size() + 1); // namesz
   getStreamer().emitInt32(0);               // descsz = 0 (no description).
   getStreamer().emitInt32(1);               // type = NT_VERSION
   getStreamer().emitBytes(Data);            // name
   getStreamer().emitInt8(0);                // NUL
   getStreamer().emitValueToAlignment(4);
-  getStreamer().PopSection();
+  getStreamer().popSection();
   return false;
 }
 
@@ -907,7 +910,7 @@ bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) {
 
   Lex();
 
-  getStreamer().SubSection(Subsection);
+  getStreamer().subSection(Subsection);
   return false;
 }
 
diff --git a/llvm/lib/MC/MCParser/GOFFAsmParser.cpp b/llvm/lib/MC/MCParser/GOFFAsmParser.cpp
index c2a7eaee8029..c3fc04607273 100644
--- a/llvm/lib/MC/MCParser/GOFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/GOFFAsmParser.cpp
@@ -6,16 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCSectionGOFF.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolGOFF.h"
 
 using namespace llvm;
 
@@ -31,7 +22,7 @@ class GOFFAsmParser : public MCAsmParserExtension {
   }
 
 public:
-  GOFFAsmParser() {}
+  GOFFAsmParser() = default;
 
   void Initialize(MCAsmParser &Parser) override {
     // Call the base implementation.
diff --git a/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/llvm/lib/MC/MCParser/MCAsmLexer.cpp
index 497055bc1760..632c52479d70 100644
--- a/llvm/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/MCAsmLexer.cpp
@@ -9,7 +9,6 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/SMLoc.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
diff --git a/llvm/lib/MC/MCParser/MCAsmParser.cpp b/llvm/lib/MC/MCParser/MCAsmParser.cpp
index d797c2d3f288..7fc1dbf56f98 100644
--- a/llvm/lib/MC/MCParser/MCAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/MCAsmParser.cpp
@@ -25,7 +25,7 @@ cl::opt<unsigned> AsmMacroMaxNestingDepth(
     "asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
     cl::desc("The maximum nesting depth allowed for assembly macros."));
 
-MCAsmParser::MCAsmParser() {}
+MCAsmParser::MCAsmParser() = default;
 
 MCAsmParser::~MCAsmParser() = default;
 
diff --git a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
index 0b5046cd8fad..f5a10ce9805b 100644
--- a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
+++ b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -8,6 +8,8 @@
 
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCStreamer.h"
 
 using namespace llvm;
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index f9433240743d..8c582d225e30 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
@@ -36,21 +35,19 @@
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/AsmCond.h"
 #include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCParser/MCAsmParserUtils.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -64,7 +61,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include
 #include
-#include
 #include
 #include
 #include
@@ -980,6 +976,8 @@ private:
   bool parseDirectiveEnds(StringRef Name, SMLoc NameLoc);
   bool parseDirectiveNestedEnds();
 
+  bool parseDirectiveExtern();
+
   /// Parse a directive like ".globl" which accepts a single symbol (which
   /// should be a label or an external).
   bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
@@ -1192,7 +1190,7 @@ bool MasmParser::expandMacros() {
     }
   }
 
-  if (!ExpandedValue.hasValue())
+  if (!ExpandedValue)
    return true;
  std::unique_ptr<MemoryBuffer> Instantiation =
      MemoryBuffer::getMemBufferCopy(*ExpandedValue, "<instantiation>");
@@ -1431,7 +1429,7 @@ bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   // Finalize the output stream if there are no errors and if the client wants
   // us to.
   if (!HadError && !NoFinalize)
-    Out.Finish(Lexer.getLoc());
+    Out.finish(Lexer.getLoc());
 
   return HadError || getContext().hadError();
 }
 
@@ -2094,7 +2092,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info,
     // If this is a line comment we can drop it safely.
     if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
         getTok().getString().front() == '\n')
-      Out.AddBlankLine();
+      Out.addBlankLine();
     Lex();
     return false;
   }
@@ -2283,7 +2281,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info,
   }
 
   // Consume any end of statement token, if present, to avoid spurious
-  // AddBlankLine calls().
+  // addBlankLine calls().
   if (getTok().is(AsmToken::EndOfStatement)) {
     Lex();
   }
@@ -2409,8 +2407,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info,
     case DK_ORG:
       return parseDirectiveOrg();
     case DK_EXTERN:
-      eatToEndOfStatement(); // .extern is the default, ignore it.
-      return false;
+      return parseDirectiveExtern();
     case DK_PUBLIC:
       return parseDirectiveSymbolAttribute(MCSA_Global);
     case DK_COMM:
@@ -2905,7 +2902,7 @@ bool MasmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       if (Body[Pos] == '&')
         break;
       if (isMacroParameterChar(Body[Pos])) {
-        if (!CurrentQuote.hasValue())
+        if (!CurrentQuote)
           break;
         if (IdentifierPos == End)
           IdentifierPos = Pos;
@@ -2914,7 +2911,7 @@ bool MasmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       }
 
       // Track quotation status
-      if (!CurrentQuote.hasValue()) {
+      if (!CurrentQuote) {
        if (Body[Pos] == '\'' || Body[Pos] == '"')
          CurrentQuote = Body[Pos];
      } else if (Body[Pos] == CurrentQuote) {
@@ -3333,7 +3330,7 @@ bool MasmParser::handleMacroInvocation(const MCAsmMacro *M, SMLoc NameLoc) {
     ParseStatementInfo Info(&AsmStrRewrites);
     bool Parsed = parseStatement(Info, nullptr);
 
-    if (!Parsed && Info.ExitValue.hasValue()) {
+    if (!Parsed && Info.ExitValue) {
       ExitValue = std::move(*Info.ExitValue);
       break;
     }
@@ -3628,7 +3625,7 @@ bool MasmParser::parseTextItem(std::string &Data) {
       if (BuiltinIt != BuiltinSymbolMap.end()) {
         llvm::Optional<std::string> BuiltinText =
             evaluateBuiltinTextMacro(BuiltinIt->getValue(), StartLoc);
-        if (!BuiltinText.hasValue()) {
+        if (!BuiltinText) {
           // Not a text macro; break without substituting
           break;
         }
@@ -4242,7 +4239,7 @@ bool MasmParser::parseStructInitializer(const StructInfo &Structure,
   auto &FieldInitializers = Initializer.FieldInitializers;
   size_t FieldIndex = 0;
-  if (EndToken.hasValue()) {
+  if (EndToken) {
     // Initialize all fields with given initializers.
     while (getTok().isNot(EndToken.getValue()) &&
            FieldIndex < Structure.Fields.size()) {
@@ -4275,7 +4272,7 @@ bool MasmParser::parseStructInitializer(const StructInfo &Structure,
       FieldInitializers.push_back(Field.Contents);
   }
 
-  if (EndToken.hasValue()) {
+  if (EndToken) {
     if (EndToken.getValue() == AsmToken::Greater)
       return parseAngleBracketClose();
 
@@ -4763,7 +4760,7 @@ bool MasmParser::emitAlignTo(int64_t Alignment) {
     // directive.
     const MCSection *Section = getStreamer().getCurrentSectionOnly();
     assert(Section && "must have section to emit alignment");
-    if (Section->UseCodeAlign()) {
+    if (Section->useCodeAlign()) {
       getStreamer().emitCodeAlignment(Alignment, &getTargetParser().getSTI(),
                                       /*MaxBytesToEmit=*/0);
     } else {
@@ -4911,8 +4908,8 @@ bool MasmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
   if (HasMD5) {
     MD5::MD5Result Sum;
     for (unsigned i = 0; i != 8; ++i) {
-      Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
-      Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+      Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+      Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
     }
     CKMem = Sum;
   }
@@ -4952,8 +4949,7 @@ bool MasmParser::parseDirectiveLine() {
     (void)LineNumber;
     // FIXME: Do something with the .line.
   }
 
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.line' directive"))
+  if (parseEOL())
     return true;
 
   return false;
@@ -5086,8 +5082,7 @@ bool MasmParser::parseDirectiveCVFile() {
         parseEscapedString(Checksum) ||
         parseIntToken(ChecksumKind,
                       "expected checksum kind in '.cv_file' directive") ||
-        parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in '.cv_file' directive"))
+        parseEOL())
       return true;
   }
 
@@ -5097,7 +5092,7 @@ bool MasmParser::parseDirectiveCVFile() {
   ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
                                     Checksum.size());
 
-  if (!getStreamer().EmitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
+  if (!getStreamer().emitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
                                          static_cast<uint8_t>(ChecksumKind)))
     return Error(FileNumberLoc, "file number already allocated");
 
@@ -5133,12 +5128,10 @@ bool MasmParser::parseDirectiveCVFuncId() {
   SMLoc FunctionIdLoc = getTok().getLoc();
   int64_t FunctionId;
 
-  if (parseCVFunctionId(FunctionId, ".cv_func_id") ||
-      parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.cv_func_id' directive"))
+  if (parseCVFunctionId(FunctionId, ".cv_func_id") || parseEOL())
     return true;
 
-  if (!getStreamer().EmitCVFuncIdDirective(FunctionId))
+  if (!getStreamer().emitCVFuncIdDirective(FunctionId))
     return Error(FunctionIdLoc, "function id already allocated");
 
   return false;
@@ -5194,11 +5187,10 @@ bool MasmParser::parseDirectiveCVInlineSiteId() {
     Lex();
   }
 
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.cv_inline_site_id' directive"))
+  if (parseEOL())
     return true;
 
-  if (!getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
+  if (!getStreamer().emitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
                                                  IALine, IACol, FunctionIdLoc))
     return Error(FunctionIdLoc, "function id already allocated");
 
@@ -5321,7 +5313,7 @@ bool MasmParser::parseDirectiveCVInlineLinetable() {
             "expected identifier in directive"))
     return true;
 
-  if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+  if (parseEOL())
     return true;
 
   MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
@@ -5482,7 +5474,7 @@ bool MasmParser::parseDirectiveCVFileChecksumOffset() {
   int64_t FileNo;
   if (parseIntToken(FileNo, "expected identifier in directive"))
     return true;
-  if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+  if (parseEOL())
     return true;
   getStreamer().emitCVFileChecksumOffsetDirective(FileNo);
   return false;
@@ -5498,7 +5490,7 @@ bool MasmParser::parseDirectiveCVFPOData() {
   if (parseEOL("unexpected tokens"))
     return addErrorSuffix(" in '.cv_fpo_data' directive");
   MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
-  getStreamer().EmitCVFPOData(ProcSym, DirLoc);
+  getStreamer().emitCVFPOData(ProcSym, DirLoc);
   return false;
 }
 
@@ -5791,8 +5783,7 @@ bool MasmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
 /// parseDirectiveCFISignalFrame
 /// ::= .cfi_signal_frame
 bool MasmParser::parseDirectiveCFISignalFrame() {
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.cfi_signal_frame'"))
+  if (parseEOL())
     return true;
 
   getStreamer().emitCFISignalFrame();
@@ -6023,6 +6014,39 @@ bool MasmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
   return false;
 }
 
+bool MasmParser::parseDirectiveExtern() {
+  // .extern is the default - but we still need to take any provided type info.
+  auto parseOp = [&]() -> bool {
+    StringRef Name;
+    SMLoc NameLoc = getTok().getLoc();
+    if (parseIdentifier(Name))
+      return Error(NameLoc, "expected name");
+    if (parseToken(AsmToken::Colon))
+      return true;
+
+    StringRef TypeName;
+    SMLoc TypeLoc = getTok().getLoc();
+    if (parseIdentifier(TypeName))
+      return Error(TypeLoc, "expected type");
+    if (!TypeName.equals_insensitive("proc")) {
+      AsmTypeInfo Type;
+      if (lookUpType(TypeName, Type))
+        return Error(TypeLoc, "unrecognized type");
+      KnownType[Name.lower()] = Type;
+    }
+
+    MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+    Sym->setExternal(true);
+    getStreamer().emitSymbolAttribute(Sym, MCSA_Extern);
+
+    return false;
+  };
+
+  if (parseMany(parseOp))
+    return addErrorSuffix(" in directive 'extern'");
+  return false;
+}
+
 /// parseDirectiveSymbolAttribute
 ///  ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
 bool MasmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
@@ -6091,8 +6115,7 @@ bool MasmParser::parseDirectiveComm(bool IsLocal) {
     }
   }
 
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.comm' or '.lcomm' directive"))
+  if (parseEOL())
     return true;
 
   // NOTE: a size of zero for a .comm should create a undefined symbol
@@ -6138,8 +6161,7 @@ bool MasmParser::parseDirectiveComment(SMLoc DirectiveLoc) {
     Lex(); // eat end of statement
   } while (
       !StringRef(parseStringTo(AsmToken::EndOfStatement)).contains(Delimiter));
-  return parseToken(AsmToken::EndOfStatement,
-                    "unexpected token in 'comment' directive");
+  return parseEOL();
 }
 
 /// parseDirectiveInclude
@@ -6173,9 +6195,7 @@ bool MasmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
     eatToEndOfStatement();
   } else {
     int64_t ExprValue;
-    if (parseAbsoluteExpression(ExprValue) ||
-        parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in '.if' directive"))
+    if (parseAbsoluteExpression(ExprValue) || parseEOL())
      return true;
 
     switch (DirKind) {
@@ -6208,8 +6228,7 @@ bool MasmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
     if (parseTextItem(Str))
       return TokError("expected text item parameter for 'ifb' directive");
 
-    if (parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in 'ifb' directive"))
+    if (parseEOL())
      return true;
 
     TheCondState.CondMet = ExpectBlank == Str.empty();
@@ -6275,7 +6294,7 @@ bool MasmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
   if (!is_defined) {
     StringRef Name;
     if (check(parseIdentifier(Name), "expected identifier after 'ifdef'") ||
-        parseToken(AsmToken::EndOfStatement, "unexpected token in 'ifdef'"))
+        parseEOL())
       return true;
 
     if (BuiltinSymbolMap.find(Name.lower()) != BuiltinSymbolMap.end()) {
@@ -6316,8 +6335,7 @@ bool MasmParser::parseDirectiveElseIf(SMLoc DirectiveLoc,
     if (parseAbsoluteExpression(ExprValue))
       return true;
 
-    if (parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in '.elseif' directive"))
+    if (parseEOL())
      return true;
 
     switch (DirKind) {
@@ -6360,8 +6378,7 @@ bool MasmParser::parseDirectiveElseIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
       return TokError("expected text item parameter for 'elseifnb' directive");
   }
 
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in 'elseifb' directive"))
+  if (parseEOL())
    return true;
 
   TheCondState.CondMet = ExpectBlank == Str.empty();
@@ -6398,8 +6415,7 @@ bool MasmParser::parseDirectiveElseIfdef(SMLoc DirectiveLoc,
     StringRef Name;
     if (check(parseIdentifier(Name),
               "expected identifier after 'elseifdef'") ||
-        parseToken(AsmToken::EndOfStatement,
-                   "unexpected token in 'elseifdef'"))
+        parseEOL())
       return true;
 
     if (BuiltinSymbolMap.find(Name.lower()) != BuiltinSymbolMap.end()) {
@@ -6475,8 +6491,7 @@ bool MasmParser::parseDirectiveElseIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
 
 /// parseDirectiveElse
 /// ::= else
 bool MasmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in 'else' directive"))
+  if (parseEOL())
     return true;
 
   if (TheCondState.TheCond != AsmCond::IfCond &&
@@ -6498,8 +6513,7 @@ bool MasmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
 /// parseDirectiveEnd
 /// ::= end
 bool MasmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in 'end' directive"))
+  if (parseEOL())
     return true;
 
   while (Lexer.isNot(AsmToken::Eof))
@@ -6687,8 +6701,7 @@ bool MasmParser::parseDirectiveErrorIfe(SMLoc DirectiveLoc, bool ExpectZero) {
 /// parseDirectiveEndIf
 /// ::= .endif
 bool MasmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
-  if (parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '.endif' directive"))
+  if (parseEOL())
     return true;
 
   if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
@@ -6982,9 +6995,7 @@ bool MasmParser::parseDirectiveRepeat(SMLoc DirectiveLoc, StringRef Dir) {
       return Error(CountLoc, "unexpected token in '" + Dir + "' directive");
   }
 
-  if (check(Count < 0, CountLoc, "Count is negative") ||
-      parseToken(AsmToken::EndOfStatement,
-                 "unexpected token in '" + Dir + "' directive"))
+  if (check(Count < 0, CountLoc, "Count is negative") || parseEOL())
     return true;
 
   // Lex the repeat definition.
@@ -7099,7 +7110,7 @@ bool MasmParser::parseDirectiveFor(SMLoc DirectiveLoc, StringRef Dir) {
   if (parseToken(AsmToken::Greater,
                  "values in '" + Dir +
                      "' directive must be enclosed in angle brackets") ||
-      parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
+      parseEOL())
     return true;
 
   // Lex the for definition.
@@ -7149,7 +7160,7 @@ bool MasmParser::parseDirectiveForc(SMLoc DirectiveLoc, StringRef Directive) {
     }
     Argument.resize(End);
   }
-  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
+  if (parseEOL())
     return true;
 
   // Lex the irpc definition.
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index 833530bef3bf..a84d00d82b76 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -21,11 +21,11 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCSectionWasm.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/Casting.h"
 
 using namespace llvm;
 
@@ -53,6 +53,7 @@ public:
     this->MCAsmParserExtension::Initialize(*Parser);
 
     addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveText>(".text");
+    addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveData>(".data");
     addDirectiveHandler<&WasmAsmParser::parseSectionDirective>(".section");
     addDirectiveHandler<&WasmAsmParser::parseDirectiveSize>(".size");
     addDirectiveHandler<&WasmAsmParser::parseDirectiveType>(".type");
@@ -90,6 +91,12 @@ public:
     return false;
   }
 
+  bool parseSectionDirectiveData(StringRef, SMLoc) {
+    auto *S = getContext().getObjectFileInfo()->getDataSection();
+    getStreamer().switchSection(S);
+    return false;
+  }
+
   uint32_t parseSectionFlags(StringRef FlagStr, bool &Passive, bool &Group) {
     uint32_t flags = 0;
     for (char C : FlagStr) {
@@ -181,7 +188,7 @@ public:
     // TODO: Parse UniqueID
     MCSectionWasm *WS = getContext().getWasmSection(
-        Name, Kind.getValue(), Flags, GroupName, MCContext::GenericSectionID);
+        Name, *Kind, Flags, GroupName, MCContext::GenericSectionID);
 
     if (WS->getSegmentFlags() != Flags)
       Parser->Error(loc, "changed section flags for " + Name +
@@ -194,7 +201,7 @@ public:
       WS->setPassive();
     }
 
-    getStreamer().SwitchSection(WS);
+    getStreamer().switchSection(WS);
     return false;
   }
 
diff --git a/llvm/lib/MC/MCParser/XCOFFAsmParser.cpp b/llvm/lib/MC/MCParser/XCOFFAsmParser.cpp
index 7494fe07734c..d20a65f6a476 100644
--- a/llvm/lib/MC/MCParser/XCOFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/XCOFFAsmParser.cpp
@@ -8,15 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/BinaryFormat/XCOFF.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
-#include "llvm/MC/MCSectionXCOFF.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCSymbolXCOFF.h"
-#include "llvm/Support/MachineValueType.h"
 
 using namespace llvm;
 
@@ -35,7 +28,7 @@ class XCOFFAsmParser : public MCAsmParserExtension {
   }
 
 public:
-  XCOFFAsmParser() {}
+  XCOFFAsmParser() = default;
 
   void Initialize(MCAsmParser &P) override {
     Parser = &P;
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index ebf38327f4dc..5277ce87bee0 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -9,9 +9,10 @@
 #include "llvm/MC/MCPseudoProbe.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
@@ -182,7 +183,7 @@ void MCPseudoProbeSection::emit(MCObjectStreamer *MCOS) {
     if (auto *S =
            Ctx.getObjectFileInfo()->getPseudoProbeSection(ProbeSec.first)) {
       // Switch to the .pseudoprobe section or a comdat group.
-      MCOS->SwitchSection(S);
+      MCOS->switchSection(S);
       // Emit probes grouped by GUID.
       ProbeSec.second.emit(MCOS, LastProbe);
     }
@@ -229,8 +230,7 @@ void MCDecodedPseudoProbe::getInlineContext(
   // It will add the string of each node's inline site during iteration.
   // Note that it won't include the probe's belonging function(leaf location)
   while (Cur->hasInlineSite()) {
-    StringRef FuncName =
-        getProbeFNameForGUID(GUID2FuncMAP, std::get<0>(Cur->ISite));
+    StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Cur->Parent->Guid);
     ContextStack.emplace_back(
         MCPseduoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite)));
     Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
@@ -357,8 +357,9 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
   return true;
 }
 
-bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
-                                                 std::size_t Size) {
+bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
+    MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr,
+    std::unordered_set<uint64_t> &GuildFilter) {
   // The pseudo_probe section encodes an inline forest and each tree has a
   // format like:
   //  FUNCTION BODY (one for each uninlined function present in the text
   //  FUNCTION BODY
   //      A FUNCTION BODY entry describing the inlined function.
 
-  Data = Start;
-  End = Data + Size;
-
-  MCDecodedPseudoProbeInlineTree *Root = &DummyInlineRoot;
-  MCDecodedPseudoProbeInlineTree *Cur = &DummyInlineRoot;
-  uint64_t LastAddr = 0;
   uint32_t Index = 0;
-  // A DFS-based decoding
-  while (Data < End) {
-    if (Root == Cur) {
-      // Use a sequential id for top level inliner.
-      Index = Root->getChildren().size();
-    } else {
-      // Read inline site for inlinees
-      auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
-      if (!ErrorOrIndex)
-        return false;
-      Index = std::move(*ErrorOrIndex);
-    }
-    // Switch/add to a new tree node(inlinee)
-    Cur = Cur->getOrAddNode(std::make_tuple(Cur->Guid, Index));
-    // Read guid
-    auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
-    if (!ErrorOrCurGuid)
+  if (Cur == &DummyInlineRoot) {
+    // Use a sequential id for top level inliner.
+    Index = Cur->getChildren().size();
+  } else {
+    // Read inline site for inlinees
+    auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
+    if (!ErrorOrIndex)
       return false;
-    Cur->Guid = std::move(*ErrorOrCurGuid);
-    // Read number of probes in the current node.
-    auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
-    if (!ErrorOrNodeCount)
+    Index = std::move(*ErrorOrIndex);
+  }
+
+  // Read guid
+  auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
+  if (!ErrorOrCurGuid)
+    return false;
+  uint64_t Guid = std::move(*ErrorOrCurGuid);
+
+  // Decide if top-level node should be discarded.
+  if (Cur == &DummyInlineRoot && !GuildFilter.empty() &&
+      !GuildFilter.count(Guid))
+    Cur = nullptr;
+
+  // If the incoming node is null, all its children nodes should be discarded.
+  if (Cur) {
+    // Switch/add to a new tree node(inlinee)
+    Cur = Cur->getOrAddNode(std::make_tuple(Guid, Index));
+    Cur->Guid = Guid;
+  }
+
+  // Read number of probes in the current node.
+  auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
+  if (!ErrorOrNodeCount)
+    return false;
+  uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+  // Read number of direct inlinees
+  auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
+  if (!ErrorOrCurChildrenToProcess)
+    return false;
+  // Read all probes in this node
+  for (std::size_t I = 0; I < NodeCount; I++) {
+    // Read index
+    auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
+    if (!ErrorOrIndex)
       return false;
-    uint32_t NodeCount = std::move(*ErrorOrNodeCount);
-    // Read number of direct inlinees
-    auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
-    if (!ErrorOrCurChildrenToProcess)
+    uint32_t Index = std::move(*ErrorOrIndex);
+    // Read type | flag.
+    auto ErrorOrValue = readUnencodedNumber<uint8_t>();
+    if (!ErrorOrValue)
       return false;
-    Cur->ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
-    // Read all probes in this node
-    for (std::size_t I = 0; I < NodeCount; I++) {
-      // Read index
-      auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
-      if (!ErrorOrIndex)
+    uint8_t Value = std::move(*ErrorOrValue);
+    uint8_t Kind = Value & 0xf;
+    uint8_t Attr = (Value & 0x70) >> 4;
+    // Read address
+    uint64_t Addr = 0;
+    if (Value & 0x80) {
+      auto ErrorOrOffset = readSignedNumber<int64_t>();
+      if (!ErrorOrOffset)
         return false;
-      uint32_t Index = std::move(*ErrorOrIndex);
-      // Read type | flag.
-      auto ErrorOrValue = readUnencodedNumber<uint8_t>();
-      if (!ErrorOrValue)
+      int64_t Offset = std::move(*ErrorOrOffset);
+      Addr = LastAddr + Offset;
+    } else {
+      auto ErrorOrAddr = readUnencodedNumber<uint64_t>();
+      if (!ErrorOrAddr)
         return false;
-      uint8_t Value = std::move(*ErrorOrValue);
-      uint8_t Kind = Value & 0xf;
-      uint8_t Attr = (Value & 0x70) >> 4;
-      // Read address
-      uint64_t Addr = 0;
-      if (Value & 0x80) {
-        auto ErrorOrOffset = readSignedNumber<int64_t>();
-        if (!ErrorOrOffset)
-          return false;
-        int64_t Offset = std::move(*ErrorOrOffset);
-        Addr = LastAddr + Offset;
-      } else {
-        auto ErrorOrAddr = readUnencodedNumber<uint64_t>();
-        if (!ErrorOrAddr)
-          return false;
-        Addr = std::move(*ErrorOrAddr);
-      }
+      Addr = std::move(*ErrorOrAddr);
+    }
+
+    if (Cur) {
       // Populate Address2ProbesMap
       auto &Probes = Address2ProbesMap[Addr];
       Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr,
                           Cur);
       Cur->addProbes(&Probes.back());
-      LastAddr = Addr;
     }
+    LastAddr = Addr;
+  }
 
-    // Look for the parent for the next node by subtracting the current
-    // node count from tree counts along the parent chain. The first node
-    // in the chain that has a non-zero tree count is the target.
-    while (Cur != Root) {
-      if (Cur->ChildrenToProcess == 0) {
-        Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
-        if (Cur != Root) {
-          assert(Cur->ChildrenToProcess > 0 &&
-                 "Should have some unprocessed nodes");
-          Cur->ChildrenToProcess -= 1;
-        }
-      } else {
-        break;
-      }
-    }
+  uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
+  for (uint32_t I = 0; I < ChildrenToProcess; I++) {
+    buildAddress2ProbeMap(Cur, LastAddr, GuildFilter);
   }
+  return true;
+}
+
+bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
+    const uint8_t *Start, std::size_t Size,
+    std::unordered_set<uint64_t> &GuildFilter) {
+  Data = Start;
+  End = Data + Size;
+  uint64_t LastAddr = 0;
+  while (Data < End)
+    buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuildFilter);
   assert(Data == End && "Have unprocessed data in pseudo_probe section");
-  assert(Cur == Root &&
-         " Cur should point to root when the forest is fully built up");
   return true;
 }
 
+bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
+                                                 std::size_t Size) {
+  std::unordered_set<uint64_t> GuildFilter;
+  return buildAddress2ProbeMap(Start, Size, GuildFilter);
+}
+
 void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
   OS << "Pseudo Probe Desc:\n";
   // Make the output deterministic
@@ -563,5 +573,5 @@ const MCPseudoProbeFuncDesc *MCPseudoProbeDecoder::getInlinerDescForProbe(
   MCDecodedPseudoProbeInlineTree *InlinerNode = Probe->getInlineTreeNode();
   if (!InlinerNode->hasInlineSite())
     return nullptr;
-  return getFuncDescForGUID(std::get<0>(InlinerNode->ISite));
+  return getFuncDescForGUID(InlinerNode->Parent->Guid);
 }
diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp
index d491c0eb7e06..d6c4fe10fc98 100644
--- a/llvm/lib/MC/MCRegisterInfo.cpp
+++ b/llvm/lib/MC/MCRegisterInfo.cpp
@@ -122,3 +122,14 @@ int MCRegisterInfo::getCodeViewRegNum(MCRegister RegNum) const {
                        : Twine(RegNum)));
   return I->second;
 }
+
+bool MCRegisterInfo::regsOverlap(MCRegister RegA, MCRegister RegB) const {
+  // Regunits are numerically ordered. Find a common unit.
+  MCRegUnitIterator RUA(RegA, this);
+  MCRegUnitIterator RUB(RegB, this);
+  do {
+    if (*RUA == *RUB)
+      return true;
+  } while (*RUA < *RUB ? (++RUA).isValid() : (++RUB).isValid());
+  return false;
+}
diff --git a/llvm/lib/MC/MCSPIRVStreamer.cpp b/llvm/lib/MC/MCSPIRVStreamer.cpp
new file mode 100644
index 000000000000..863db7f36f29
--- /dev/null
+++ b/llvm/lib/MC/MCSPIRVStreamer.cpp
@@ -0,0 +1,45 @@
+//===- lib/MC/MCSPIRVStreamer.cpp - SPIR-V Object Output ------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits SPIR-V .o object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSPIRVStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+void MCSPIRVStreamer::emitInstToData(const MCInst &Inst,
+                                     const MCSubtargetInfo &STI) {
+  MCAssembler &Assembler = getAssembler();
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
+
+  // Append the encoded instruction to the current data fragment (or create a
+  // new such fragment if the current fragment is not a data fragment).
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  DF->setHasInstructions(STI);
+  DF->getContents().append(Code.begin(), Code.end());
+}
+
+MCStreamer *llvm::createSPIRVStreamer(MCContext &Context,
+                                      std::unique_ptr<MCAsmBackend> &&MAB,
+                                      std::unique_ptr<MCObjectWriter> &&OW,
+                                      std::unique_ptr<MCCodeEmitter> &&CE,
+                                      bool RelaxAll) {
+  MCSPIRVStreamer *S = new MCSPIRVStreamer(Context, std::move(MAB),
+                                           std::move(OW), std::move(CE));
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  return S;
+}
diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
index db08e2044113..98eb7eada064 100644
--- a/llvm/lib/MC/MCSchedule.cpp
+++ b/llvm/lib/MC/MCSchedule.cpp
@@ -98,7 +98,7 @@ MCSchedModel::getReciprocalThroughput(const MCSubtargetInfo &STI,
     double Temp = NumUnits * 1.0 / I->Cycles;
     Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp;
   }
-  if (Throughput.hasValue())
+  if (Throughput)
     return 1.0 / Throughput.getValue();
 
   // If no throughput value was calculated, assume that we can execute at the
@@ -142,7 +142,7 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass,
     double Temp = countPopulation(I->getUnits()) * 1.0 / I->getCycles();
     Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp;
   }
-  if (Throughput.hasValue())
+  if (Throughput)
     return 1.0 / Throughput.getValue();
 
   // If there are no execution resources specified for this class, then assume
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 8342abacec09..7547558fe6e2 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCSection.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/MC/MCContext.h"
@@ -15,7 +16,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include
 #include
 
 using namespace llvm;
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
index 387bf2c884e5..f7ca0375544a 100644
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/llvm/lib/MC/MCSectionCOFF.cpp
@@ -14,9 +14,9 @@
 
 using namespace llvm;
 
-// ShouldOmitSectionDirective - Decides whether a '.section' directive
+// shouldOmitSectionDirective - Decides whether a '.section' directive
 // should be printed before the section name
-bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
+bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name,
                                                const MCAsmInfo &MAI) const {
   if (COMDATSymbol)
     return false;
@@ -34,11 +34,11 @@ void MCSectionCOFF::setSelection(int Selection) const {
   Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
 }
 
-void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
                                          raw_ostream &OS,
                                          const MCExpr *Subsection) const {
   // standard sections don't require the '.section'
-  if (ShouldOmitSectionDirective(getName(), MAI)) {
+  if (shouldOmitSectionDirective(getName(), MAI)) {
     OS << '\t' << getName() << '\n';
     return;
   }
@@ -104,9 +104,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
   OS << '\n';
 }
 
-bool MCSectionCOFF::UseCodeAlign() const {
-  return getKind().isText();
-}
+bool MCSectionCOFF::useCodeAlign() const { return getKind().isText(); }
 
 bool MCSectionCOFF::isVirtualSection() const {
   return getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
diff --git a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp
a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp new file mode 100644 index 000000000000..065b506c21ce --- /dev/null +++ b/llvm/lib/MC/MCSectionDXContainer.cpp @@ -0,0 +1,15 @@ +//===- lib/MC/MCSectionDXContainer.cpp - DXContainer Section --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCSectionDXContainer.h" + +using namespace llvm; + +void MCSectionDXContainer::printSwitchToSection(const MCAsmInfo &, + const Triple &, raw_ostream &, + const MCExpr *) const {} diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp index d18876507cd7..27dc1826819b 100644 --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -19,7 +19,7 @@ using namespace llvm; // Decides whether a '.section' directive // should be printed before the section name. -bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, +bool MCSectionELF::shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const { if (isUnique()) return false; @@ -50,10 +50,10 @@ static void printName(raw_ostream &OS, StringRef Name) { OS << '"'; } -void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, +void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const { - if (ShouldOmitSectionDirective(getName(), MAI)) { + if (shouldOmitSectionDirective(getName(), MAI)) { OS << '\t' << getName(); if (Subsection) { OS << '\t'; @@ -105,6 +105,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, if (Flags & ELF::SHF_GNU_RETAIN) OS << 'R'; + // If there are os-specific flags, print them. + if (T.isOSSolaris()) + if (Flags & ELF::SHF_SUNW_NODISCARD) + OS << 'R'; + // If there are target-specific flags, print them. 
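// [Illustration] printSwitchToSection above appends one letter per section
// flag ('a', 'w', 'x', ..., and now 'R' both for SHF_GNU_RETAIN and, on
// Solaris, SHF_SUNW_NODISCARD). A simplified sketch of that flag-to-letter
// mapping using the standard ELF bit values; this is a stand-in, not the
// LLVM implementation:
#include <string>
static std::string sectionFlagLetters(unsigned Flags) {
  const unsigned SHF_WRITE = 0x1, SHF_ALLOC = 0x2, SHF_EXECINSTR = 0x4;
  std::string S;
  if (Flags & SHF_ALLOC)
    S += 'a'; // Occupies memory at run time.
  if (Flags & SHF_WRITE)
    S += 'w'; // Writable data.
  if (Flags & SHF_EXECINSTR)
    S += 'x'; // Contains executable code.
  return S;
}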
Triple::ArchType Arch = T.getArch(); if (Arch == Triple::xcore) { @@ -160,6 +165,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, OS << "llvm_sympart"; else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP) OS << "llvm_bb_addr_map"; + else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP_V0) + OS << "llvm_bb_addr_map_v0"; else report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) + " for section " + getName()); @@ -196,7 +203,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } } -bool MCSectionELF::UseCodeAlign() const { +bool MCSectionELF::useCodeAlign() const { return getFlags() & ELF::SHF_EXECINSTR; } diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp index d914e64ca23a..1c210fb0f4c8 100644 --- a/llvm/lib/MC/MCSectionMachO.cpp +++ b/llvm/lib/MC/MCSectionMachO.cpp @@ -7,9 +7,16 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCContext.h" +#include "llvm/MC/SectionKind.h" #include "llvm/Support/raw_ostream.h" -#include + +namespace llvm { +class MCAsmInfo; +class MCExpr; +class MCSymbol; +class Triple; +} // namespace llvm + using namespace llvm; /// SectionTypeDescriptors - These are strings that describe the various section @@ -19,7 +26,7 @@ static constexpr struct { StringLiteral AssemblerName, EnumName; } SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE + 1] = { {StringLiteral("regular"), StringLiteral("S_REGULAR")}, // 0x00 - {StringLiteral(""), StringLiteral("S_ZEROFILL")}, // 0x01 + {StringLiteral("zerofill"), StringLiteral("S_ZEROFILL")}, // 0x01 {StringLiteral("cstring_literals"), StringLiteral("S_CSTRING_LITERALS")}, // 0x02 {StringLiteral("4byte_literals"), @@ -95,7 +102,7 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, } } -void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, +void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const { OS << "\t.section\t" << getSegmentName() << ',' << getName(); @@ -159,7 +166,7 @@ void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, OS << '\n'; } -bool MCSectionMachO::UseCodeAlign() const { +bool MCSectionMachO::useCodeAlign() const { return hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS); } diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp index 459913263268..e90f401b1efa 100644 --- a/llvm/lib/MC/MCSectionWasm.cpp +++ b/llvm/lib/MC/MCSectionWasm.cpp @@ -9,7 +9,6 @@ #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/Support/raw_ostream.h" @@ -45,7 +44,7 @@ static void printName(raw_ostream &OS, StringRef Name) { OS << '"'; } -void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, +void MCSectionWasm::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const { @@ -102,6 +101,6 @@ void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } } -bool MCSectionWasm::UseCodeAlign() const { return false; } +bool MCSectionWasm::useCodeAlign() const { return false; } bool MCSectionWasm::isVirtualSection() const { return false; } diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp index 2ff4839d3706..ee8fa04c421f 100644 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ 
b/llvm/lib/MC/MCSectionXCOFF.cpp @@ -8,10 +8,12 @@ #include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +namespace llvm { +class MCExpr; +class Triple; +} // namespace llvm using namespace llvm; @@ -22,7 +24,7 @@ void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const { << '\n'; } -void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, +void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const { if (getKind().isText()) { @@ -117,7 +119,7 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, report_fatal_error("Printing for this SectionKind is unimplemented."); } -bool MCSectionXCOFF::UseCodeAlign() const { return getKind().isText(); } +bool MCSectionXCOFF::useCodeAlign() const { return getKind().isText(); } bool MCSectionXCOFF::isVirtualSection() const { // DWARF sections are always not virtual. diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index a14f0de65a9d..a229d282dabe 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" @@ -59,7 +60,7 @@ void MCTargetStreamer::changeSection(const MCSection *CurSection, MCSection *Section, const MCExpr *Subsection, raw_ostream &OS) { - Section->PrintSwitchToSection(*Streamer.getContext().getAsmInfo(), + Section->printSwitchToSection(*Streamer.getContext().getAsmInfo(), Streamer.getContext().getTargetTriple(), OS, Subsection); } @@ -96,7 +97,7 @@ MCStreamer::MCStreamer(MCContext &Ctx) SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>()); } -MCStreamer::~MCStreamer() {} +MCStreamer::~MCStreamer() = default; void MCStreamer::reset() { DwarfFrameInfos.clear(); @@ -107,7 +108,7 @@ void MCStreamer::reset() { SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>()); } -raw_ostream &MCStreamer::GetCommentOS() { +raw_ostream &MCStreamer::getCommentOS() {
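// [Illustration] getCommentOS() falls back to a stream that swallows all
// output (LLVM's nulls() returns a raw_null_ostream). A minimal iostreams
// sketch of the same "null sink" idea, independent of the LLVM API:
#include <ostream>
namespace {
struct NullBuf : std::streambuf {
  int overflow(int C) override { return C; } // Claim success, write nothing.
};
inline std::ostream &nullStream() {
  static NullBuf Buf;
  static std::ostream OS(&Buf);
  return OS;
}
} // namespace
// By default, discard comments.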
return nulls(); } @@ -186,7 +187,7 @@ void MCStreamer::emitSymbolValue(const MCSymbol *Sym, unsigned Size, if (!IsSectionRelative) emitValueImpl(MCSymbolRefExpr::create(Sym, getContext()), Size); else - EmitCOFFSecRel32(Sym, /*Offset=*/0); + emitCOFFSecRel32(Sym, /*Offset=*/0); } void MCStreamer::emitDTPRel64Value(const MCExpr *Value) { @@ -251,6 +252,13 @@ void MCStreamer::emitCFIBKeyFrame() { CurFrame->IsBKeyFrame = true; } +void MCStreamer::emitCFIMTETaggedFrame() { + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); + if (!CurFrame) + return; + CurFrame->IsMTETaggedFrame = true; +} + void MCStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, unsigned Isa, unsigned Discriminator, @@ -283,18 +291,18 @@ MCDwarfFrameInfo *MCStreamer::getCurrentDwarfFrameInfo() { return &DwarfFrameInfos.back(); } -bool MCStreamer::EmitCVFileDirective(unsigned FileNo, StringRef Filename, +bool MCStreamer::emitCVFileDirective(unsigned FileNo, StringRef Filename, ArrayRef Checksum, unsigned ChecksumKind) { return getContext().getCVContext().addFile(*this, FileNo, Filename, Checksum, ChecksumKind); } -bool MCStreamer::EmitCVFuncIdDirective(unsigned FunctionId) { +bool MCStreamer::emitCVFuncIdDirective(unsigned FunctionId) { return getContext().getCVContext().recordFunctionId(FunctionId); } -bool MCStreamer::EmitCVInlineSiteIdDirective(unsigned FunctionId, +bool MCStreamer::emitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc, unsigned IAFile, unsigned IALine, unsigned IACol, SMLoc Loc) { @@ -400,10 +408,10 @@ void MCStreamer::emitEHSymAttributes(const MCSymbol *Symbol, } void MCStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) { - SwitchSection(getContext().getObjectFileInfo()->getTextSection()); + switchSection(getContext().getObjectFileInfo()->getTextSection()); } -void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) { +void MCStreamer::assignFragment(MCSymbol *Symbol, MCFragment *Fragment) { assert(Fragment); Symbol->setFragment(Fragment); @@ -698,7 +706,7 @@ WinEH::FrameInfo *MCStreamer::EnsureValidWinFrameInfo(SMLoc Loc) { return CurrentWinFrameInfo; } -void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { +void MCStreamer::emitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { const MCAsmInfo *MAI = Context.getAsmInfo(); if (!MAI->usesWindowsCFI()) return getContext().reportError( @@ -716,7 +724,7 @@ void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { CurrentWinFrameInfo->TextSection = getCurrentSectionOnly(); } -void MCStreamer::EmitWinCFIEndProc(SMLoc Loc) { +void MCStreamer::emitWinCFIEndProc(SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -730,11 +738,11 @@ void MCStreamer::EmitWinCFIEndProc(SMLoc Loc) { for (size_t I = CurrentProcWinFrameInfoStartIndex, E = WinFrameInfos.size(); I != E; ++I) - EmitWindowsUnwindTables(WinFrameInfos[I].get()); - SwitchSection(CurFrame->TextSection); + emitWindowsUnwindTables(WinFrameInfos[I].get()); + switchSection(CurFrame->TextSection); } -void MCStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) { +void MCStreamer::emitWinCFIFuncletOrFuncEnd(SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -745,7 +753,7 @@ void MCStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) { CurFrame->FuncletOrFuncEnd = Label; } -void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) { +void MCStreamer::emitWinCFIStartChained(SMLoc Loc) { WinEH::FrameInfo 
*CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -758,7 +766,7 @@ void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) { CurrentWinFrameInfo->TextSection = getCurrentSectionOnly(); } -void MCStreamer::EmitWinCFIEndChained(SMLoc Loc) { +void MCStreamer::emitWinCFIEndChained(SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -772,7 +780,7 @@ void MCStreamer::EmitWinCFIEndChained(SMLoc Loc) { CurrentWinFrameInfo = const_cast(CurFrame->ChainedParent); } -void MCStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, +void MCStreamer::emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -789,7 +797,7 @@ void MCStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, CurFrame->HandlesExceptions = true; } -void MCStreamer::EmitWinEHHandlerData(SMLoc Loc) { +void MCStreamer::emitWinEHHandlerData(SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -853,7 +861,7 @@ static unsigned encodeSEHRegNum(MCContext &Ctx, MCRegister Reg) { return Ctx.getRegisterInfo()->getSEHRegNum(Reg); } -void MCStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) { +void MCStreamer::emitWinCFIPushReg(MCRegister Register, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -865,7 +873,7 @@ void MCStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) { CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, +void MCStreamer::emitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -887,7 +895,7 @@ void MCStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) { +void MCStreamer::emitWinCFIAllocStack(unsigned Size, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -904,7 +912,7 @@ void MCStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) { CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, +void MCStreamer::emitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -921,7 +929,7 @@ void MCStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, +void MCStreamer::emitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -936,7 +944,7 @@ void MCStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFIPushFrame(bool Code, SMLoc Loc) { +void MCStreamer::emitWinCFIPushFrame(bool Code, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; @@ -950,7 +958,7 @@ void MCStreamer::EmitWinCFIPushFrame(bool Code, SMLoc Loc) { CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFIEndProlog(SMLoc Loc) { +void MCStreamer::emitWinCFIEndProlog(SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; 
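// [Illustration] Every emitWinCFI* handler above follows the same shape:
// bail out unless a frame is open, drop a label at the current position, and
// append a WinEH::Instruction record that is only lowered to unwind codes
// when the frame is emitted. A reduced sketch of that record-now/emit-later
// design with hypothetical types (not the LLVM API):
#include <cstdint>
#include <vector>
struct UnwindRecord {
  unsigned Operation; // e.g. a UOP_* opcode.
  unsigned Register;
  int64_t Offset;
};
struct OpenFrame {
  std::vector<UnwindRecord> Instructions;
};
static void recordPushReg(OpenFrame *Frame, unsigned Reg) {
  if (!Frame)
    return; // Mirrors the "if (!CurFrame) return;" validation above.
  Frame->Instructions.push_back({/*Operation=*/0, Reg, /*Offset=*/0});
}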
@@ -960,15 +968,15 @@ void MCStreamer::EmitWinCFIEndProlog(SMLoc Loc) { CurFrame->PrologEnd = Label; } -void MCStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {} +void MCStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {} -void MCStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) {} +void MCStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {} -void MCStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {} +void MCStreamer::emitCOFFSectionIndex(MCSymbol const *Symbol) {} -void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {} +void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {} -void MCStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {} +void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {} /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is @@ -987,13 +995,11 @@ void MCStreamer::emitRawText(const Twine &T) { emitRawTextImpl(T.toStringRef(Str)); } -void MCStreamer::EmitWindowsUnwindTables() { -} +void MCStreamer::emitWindowsUnwindTables() {} -void MCStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { -} +void MCStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) {} -void MCStreamer::Finish(SMLoc EndLoc) { +void MCStreamer::finish(SMLoc EndLoc) { if ((!DwarfFrameInfos.empty() && !DwarfFrameInfos.back().End) || (!WinFrameInfos.empty() && !WinFrameInfos.back()->End)) { getContext().reportError(EndLoc, "Unfinished frame!"); @@ -1145,20 +1151,20 @@ void MCStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi, void MCStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {} void MCStreamer::emitThumbFunc(MCSymbol *Func) {} void MCStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {} -void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) { +void MCStreamer::beginCOFFSymbolDef(const MCSymbol *Symbol) { llvm_unreachable("this directive only supported on COFF targets"); } -void MCStreamer::EndCOFFSymbolDef() { +void MCStreamer::endCOFFSymbolDef() { llvm_unreachable("this directive only supported on COFF targets"); } void MCStreamer::emitFileDirective(StringRef Filename) {} void MCStreamer::emitFileDirective(StringRef Filename, StringRef CompilerVerion, StringRef TimeStamp, StringRef Description) { } -void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { +void MCStreamer::emitCOFFSymbolStorageClass(int StorageClass) { llvm_unreachable("this directive only supported on COFF targets"); } -void MCStreamer::EmitCOFFSymbolType(int Type) { +void MCStreamer::emitCOFFSymbolType(int Type) { llvm_unreachable("this directive only supported on COFF targets"); } void MCStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size, @@ -1180,6 +1186,10 @@ void MCStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, "XCOFF targets"); } +void MCStreamer::emitXCOFFRefDirective(StringRef Name) { + llvm_unreachable("emitXCOFFRefDirective is only supported on XCOFF targets"); +} + void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {} void MCStreamer::emitELFSymverDirective(const MCSymbol *OriginalSym, StringRef Name, bool KeepOriginalSym) {} @@ -1212,7 +1222,7 @@ void MCStreamer::emitBundleLock(bool AlignToEnd) {} void MCStreamer::finishImpl() {} void MCStreamer::emitBundleUnlock() {} -void MCStreamer::SwitchSection(MCSection *Section, const MCExpr *Subsection) { +void MCStreamer::switchSection(MCSection *Section, const MCExpr *Subsection) { assert(Section 
&& "Cannot switch to a null section!"); MCSectionSubPair curSection = SectionStack.back().first; SectionStack.back().second = curSection; @@ -1233,7 +1243,7 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) { if (Sym->isInSection()) return Sym; - SwitchSection(Section); + switchSection(Section); emitLabel(Sym); return Sym; } @@ -1281,6 +1291,9 @@ static VersionTuple getMachoBuildVersionSupportedOS(const Triple &Target) { return VersionTuple(12); case Triple::WatchOS: return VersionTuple(5); + case Triple::DriverKit: + // DriverKit always uses the build version load command. + return VersionTuple(); default: break; } @@ -1305,6 +1318,8 @@ getMachoBuildVersionPlatformType(const Triple &Target) { case Triple::WatchOS: return Target.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOSSIMULATOR : MachO::PLATFORM_WATCHOS; + case Triple::DriverKit: + return MachO::PLATFORM_DRIVERKIT; default: break; } @@ -1334,6 +1349,9 @@ void MCStreamer::emitVersionForTarget( case Triple::WatchOS: Version = Target.getWatchOSVersion(); break; + case Triple::DriverKit: + Version = Target.getDriverKitVersion(); + break; default: llvm_unreachable("unexpected OS type"); } @@ -1353,15 +1371,14 @@ void MCStreamer::emitVersionForTarget( emitDarwinTargetVariantBuildVersion( getMachoBuildVersionPlatformType(Target), LinkedTargetVersion.getMajor(), - LinkedTargetVersion.getMinor().getValueOr(0), - LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); + LinkedTargetVersion.getMinor().value_or(0), + LinkedTargetVersion.getSubminor().value_or(0), SDKVersion); return; } emitBuildVersion(getMachoBuildVersionPlatformType(Target), LinkedTargetVersion.getMajor(), - LinkedTargetVersion.getMinor().getValueOr(0), - LinkedTargetVersion.getSubminor().getValueOr(0), - SDKVersion); + LinkedTargetVersion.getMinor().value_or(0), + LinkedTargetVersion.getSubminor().value_or(0), SDKVersion); ShouldEmitBuildVersion = true; } @@ -1372,8 +1389,8 @@ void MCStreamer::emitVersionForTarget( emitDarwinTargetVariantBuildVersion( getMachoBuildVersionPlatformType(*TVT), TVLinkedTargetVersion.getMajor(), - TVLinkedTargetVersion.getMinor().getValueOr(0), - TVLinkedTargetVersion.getSubminor().getValueOr(0), + TVLinkedTargetVersion.getMinor().value_or(0), + TVLinkedTargetVersion.getSubminor().value_or(0), DarwinTargetVariantSDKVersion); } } @@ -1383,6 +1400,6 @@ void MCStreamer::emitVersionForTarget( emitVersionMin(getMachoVersionMinLoadCommandType(Target), LinkedTargetVersion.getMajor(), - LinkedTargetVersion.getMinor().getValueOr(0), - LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); + LinkedTargetVersion.getMinor().value_or(0), + LinkedTargetVersion.getSubminor().value_or(0), SDKVersion); } diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp index 67cab9a92722..4017225a81c4 100644 --- a/llvm/lib/MC/MCSymbol.cpp +++ b/llvm/lib/MC/MCSymbol.cpp @@ -11,7 +11,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFragment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/MC/MCSymbolELF.cpp b/llvm/lib/MC/MCSymbolELF.cpp index 1830b87fd856..820a91f57c17 100644 --- a/llvm/lib/MC/MCSymbolELF.cpp +++ b/llvm/lib/MC/MCSymbolELF.cpp @@ -8,7 +8,6 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCFixupKindInfo.h" namespace llvm { diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp index eb57917ee8fd..c2946da3ee66 100644 --- 
a/llvm/lib/MC/MCTargetOptions.cpp +++ b/llvm/lib/MC/MCTargetOptions.cpp @@ -13,11 +13,12 @@ using namespace llvm; MCTargetOptions::MCTargetOptions() : MCRelaxAll(false), MCNoExecStack(false), MCFatalWarnings(false), - MCNoWarn(false), MCNoDeprecatedWarn(false), - MCNoTypeCheck(false), MCSaveTempLabels(false), - MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false), + MCNoWarn(false), MCNoDeprecatedWarn(false), MCNoTypeCheck(false), + MCSaveTempLabels(false), MCIncrementalLinkerCompatible(false), ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), - PreserveAsmComments(true), Dwarf64(false) {} + PreserveAsmComments(true), Dwarf64(false), + EmitDwarfUnwind(EmitDwarfUnwindType::Default), + MCUseDwarfDirectory(DefaultDwarfDirectory) {} StringRef MCTargetOptions::getABIName() const { return ABIName; diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 762c8d43063c..a310dc894021 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -1,5 +1,4 @@ -//===-- MCTargetOptionsCommandFlags.cpp --------------------------*- C++ -//-*-===// +//===-- MCTargetOptionsCommandFlags.cpp -----------------------*- C++ //-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -39,6 +38,7 @@ MCOPT_EXP(bool, RelaxAll) MCOPT(bool, IncrementalLinkerCompatible) MCOPT(int, DwarfVersion) MCOPT(bool, Dwarf64) +MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind) MCOPT(bool, ShowMCInst) MCOPT(bool, FatalWarnings) MCOPT(bool, NoWarn) @@ -73,6 +73,19 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { cl::desc("Generate debugging info in the 64-bit DWARF format")); MCBINDOPT(Dwarf64); + static cl::opt EmitDwarfUnwind( + "emit-dwarf-unwind", cl::desc("Whether to emit DWARF EH frame entries."), + cl::init(EmitDwarfUnwindType::Default), + cl::values(clEnumValN(EmitDwarfUnwindType::Always, "always", + "Always emit EH frame entries"), + clEnumValN(EmitDwarfUnwindType::NoCompactUnwind, + "no-compact-unwind", + "Only emit EH frame entries when compact unwind is " + "not available"), + clEnumValN(EmitDwarfUnwindType::Default, "default", + "Use target platform default"))); + MCBINDOPT(EmitDwarfUnwind); + static cl::opt ShowMCInst( "asm-show-inst", cl::desc("Emit internal instruction representation to assembly file")); @@ -116,5 +129,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { Options.MCNoWarn = getNoWarn(); Options.MCNoDeprecatedWarn = getNoDeprecatedWarn(); Options.MCNoTypeCheck = getNoTypeCheck(); + Options.EmitDwarfUnwind = getEmitDwarfUnwind(); + return Options; } diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 90249fb7380a..ce948c7435f5 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -11,27 +11,30 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCWasmStreamer.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFragment.h" #include "llvm/MC/MCObjectStreamer.h" #include 
"llvm/MC/MCSection.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" -#include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +namespace llvm { +class MCContext; +class MCStreamer; +class MCSubtargetInfo; +} // namespace llvm + using namespace llvm; MCWasmStreamer::~MCWasmStreamer() = default; // anchor. @@ -118,6 +121,7 @@ bool MCWasmStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) { case MCSA_Invalid: case MCSA_IndirectSymbol: case MCSA_Protected: + case MCSA_Exported: return false; case MCSA_Hidden: diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 2a93c352c68a..ffabe0fe8978 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -7,15 +7,17 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCWin64EH.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Win64EH.h" +namespace llvm { +class MCSection; +} using namespace llvm; @@ -226,14 +228,14 @@ void llvm::Win64EH::UnwindEmitter::Emit(MCStreamer &Streamer) const { // Emit the unwind info structs first. for (const auto &CFI : Streamer.getWinFrameInfos()) { MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection); - Streamer.SwitchSection(XData); + Streamer.switchSection(XData); ::EmitUnwindInfo(Streamer, CFI.get()); } // Now emit RUNTIME_FUNCTION entries. for (const auto &CFI : Streamer.getWinFrameInfos()) { MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection); - Streamer.SwitchSection(PData); + Streamer.switchSection(PData); EmitRuntimeFunction(Streamer, CFI.get()); } } @@ -244,13 +246,26 @@ void llvm::Win64EH::UnwindEmitter::EmitUnwindInfo(MCStreamer &Streamer, // Switch sections (the static function above is meant to be called from // here and from Emit(). MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection); - Streamer.SwitchSection(XData); + Streamer.switchSection(XData); ::EmitUnwindInfo(Streamer, info); } -static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS, - const MCSymbol *RHS) { +static const MCExpr *GetSubDivExpr(MCStreamer &Streamer, const MCSymbol *LHS, + const MCSymbol *RHS, int Div) { + MCContext &Context = Streamer.getContext(); + const MCExpr *Expr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(LHS, Context), + MCSymbolRefExpr::create(RHS, Context), Context); + if (Div != 1) + Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(Div, Context), + Context); + return Expr; +} + +static Optional GetOptionalAbsDifference(MCStreamer &Streamer, + const MCSymbol *LHS, + const MCSymbol *RHS) { MCContext &Context = Streamer.getContext(); const MCExpr *Diff = MCBinaryExpr::createSub(MCSymbolRefExpr::create(LHS, Context), @@ -261,10 +276,18 @@ static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS, // unusual constructs, like an inline asm with an alignment directive. 
int64_t value; if (!Diff->evaluateAsAbsolute(value, OS->getAssembler())) - report_fatal_error("Failed to evaluate function length in SEH unwind info"); + return None; return value; } +static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS, + const MCSymbol *RHS) { + Optional<int64_t> MaybeDiff = GetOptionalAbsDifference(Streamer, LHS, RHS); + if (!MaybeDiff) + report_fatal_error("Failed to evaluate function length in SEH unwind info"); + return *MaybeDiff; +} + static uint32_t ARM64CountOfUnwindCodes(ArrayRef<WinEH::Instruction> Insns) { uint32_t Count = 0; for (const auto &I : Insns) { @@ -350,7 +373,7 @@ static uint32_t ARM64CountOfUnwindCodes(ArrayRef<WinEH::Instruction> Insns) { // Unwind opcode encodings and restrictions are documented at // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling -static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin, +static void ARM64EmitUnwindCode(MCStreamer &streamer, const WinEH::Instruction &inst) { uint8_t b, reg; switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) { @@ -513,7 +536,7 @@ static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin, } // Returns the epilog symbol of an epilog with the exact same unwind code -// sequence, if it exists. Otherwise, returns nulltpr. +// sequence, if it exists. Otherwise, returns nullptr. // EpilogInstrs - Unwind codes for the current epilog. // Epilogs - Epilogs that potentially match the current epilog. static MCSymbol* @@ -524,18 +547,16 @@ FindMatchingEpilog(const std::vector<WinEH::Instruction>& EpilogInstrs, auto InstrsIter = info->EpilogMap.find(EpilogStart); assert(InstrsIter != info->EpilogMap.end() && "Epilog not found in EpilogMap"); - const auto &Instrs = InstrsIter->second; + const auto &Instrs = InstrsIter->second.Instructions; if (Instrs.size() != EpilogInstrs.size()) continue; bool Match = true; for (unsigned i = 0; i < Instrs.size(); ++i) - if (Instrs[i].Operation != EpilogInstrs[i].Operation || - Instrs[i].Offset != EpilogInstrs[i].Offset || - Instrs[i].Register != EpilogInstrs[i].Register) { - Match = false; - break; + if (Instrs[i] != EpilogInstrs[i]) { + Match = false; + break; } if (Match) @@ -544,8 +565,8 @@ FindMatchingEpilog(const std::vector<WinEH::Instruction>& EpilogInstrs, return nullptr; } -static void simplifyOpcodes(std::vector<WinEH::Instruction> &Instructions, - bool Reverse) { +static void simplifyARM64Opcodes(std::vector<WinEH::Instruction> &Instructions, + bool Reverse) { unsigned PrevOffset = -1; unsigned PrevRegister = -1; @@ -606,26 +627,37 @@ } } -static int checkPackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info, - int PrologCodeBytes) { - // Can only pack if there's one single epilog - if (info->EpilogMap.size() != 1) - return -1; - - const std::vector<WinEH::Instruction> &Epilog = - info->EpilogMap.begin()->second; - - // Can pack if the epilog is a subset of the prolog but not vice versa - if (Epilog.size() > info->Instructions.size()) +// Check if an epilog exists as a subset of the end of a prolog (backwards). +static int +getARM64OffsetInProlog(const std::vector<WinEH::Instruction> &Prolog, + const std::vector<WinEH::Instruction> &Epilog) { + // Can't find an epilog as a subset if it is longer than the prolog. + if (Epilog.size() > Prolog.size()) return -1; // Check that the epilog actually is a perfect match for the end (backwards) // of the prolog. for (int I = Epilog.size() - 1; I >= 0; I--) { - if (info->Instructions[I] != Epilog[Epilog.size() - 1 - I]) + if (Prolog[I] != Epilog[Epilog.size() - 1 - I]) return -1; }
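// [Illustration] Since prolog unwind codes are stored in reverse order,
// "the epilog replays the tail of the prolog" reduces to "the reversed
// epilog is a prefix of the prolog". A generic sketch of the test performed
// by the loop above; note the real code then converts the leftover prolog
// entries to encoded-byte counts via ARM64CountOfUnwindCodes, while this
// sketch simply counts opcodes:
#include <algorithm>
#include <vector>
template <typename T>
static int offsetInProlog(const std::vector<T> &Prolog,
                          const std::vector<T> &Epilog) {
  if (Epilog.size() > Prolog.size())
    return -1; // A longer epilog cannot be a subset.
  if (!std::equal(Epilog.rbegin(), Epilog.rend(), Prolog.begin()))
    return -1; // Not a mirrored match.
  return int(Prolog.size() - Epilog.size()); // Unshared prolog entries.
}
+ // If the epilog was a subset of the prolog, find its offset.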
+ if (Epilog.size() == Prolog.size()) + return 0; + return ARM64CountOfUnwindCodes(ArrayRef( + &Prolog[Epilog.size()], Prolog.size() - Epilog.size())); +} + +static int checkARM64PackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info, + int PrologCodeBytes) { + // Can only pack if there's one single epilog + if (info->EpilogMap.size() != 1) + return -1; + + const std::vector &Epilog = + info->EpilogMap.begin()->second.Instructions; + // Check that the epilog actually is at the very end of the function, // otherwise it can't be packed. uint32_t DistanceFromEnd = (uint32_t)GetAbsDifference( @@ -633,24 +665,33 @@ static int checkPackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info, if (DistanceFromEnd / 4 != Epilog.size()) return -1; - int Offset = Epilog.size() == info->Instructions.size() - ? 0 - : ARM64CountOfUnwindCodes(ArrayRef( - &info->Instructions[Epilog.size()], - info->Instructions.size() - Epilog.size())); + int RetVal = -1; + // Even if we don't end up sharing opcodes with the prolog, we can still + // write the offset as a packed offset, if the single epilog is located at + // the end of the function and the offset (pointing after the prolog) fits + // as a packed offset. + if (PrologCodeBytes <= 31 && + PrologCodeBytes + ARM64CountOfUnwindCodes(Epilog) <= 124) + RetVal = PrologCodeBytes; + + int Offset = getARM64OffsetInProlog(info->Instructions, Epilog); + if (Offset < 0) + return RetVal; // Check that the offset and prolog size fits in the first word; it's // unclear whether the epilog count in the extension word can be taken // as packed epilog offset. if (Offset > 31 || PrologCodeBytes > 124) - return -1; + return RetVal; + // As we choose to express the epilog as part of the prolog, remove the + // epilog from the map, so we don't try to emit its opcodes. info->EpilogMap.clear(); return Offset; } -static bool tryPackedUnwind(WinEH::FrameInfo *info, uint32_t FuncLength, - int PackedEpilogOffset) { +static bool tryARM64PackedUnwind(WinEH::FrameInfo *info, uint32_t FuncLength, + int PackedEpilogOffset) { if (PackedEpilogOffset == 0) { // Fully symmetric prolog and epilog, should be ok for packed format. // For CR=3, the corresponding synthesized epilog actually lacks the @@ -842,6 +883,16 @@ static bool tryPackedUnwind(WinEH::FrameInfo *info, uint32_t FuncLength, if (Nops != 0 && Nops != 4) return false; int H = Nops == 4; + // There's an inconsistency regarding packed unwind info with homed + // parameters; according to the documentation, the epilog shouldn't have + // the same corresponding nops (and thus, to set the H bit, we should + // require an epilog which isn't exactly symmetrical - we shouldn't accept + // an exact mirrored epilog for those cases), but in practice, + // RtlVirtualUnwind behaves as if it does expect the epilogue to contain + // the same nops. See https://github.com/llvm/llvm-project/issues/54879. + // To play it safe, don't produce packed unwind info with homed parameters. 
+ if (H) + return false; int IntSZ = 8 * RegI; if (StandaloneLR) IntSZ += 8; @@ -901,9 +952,9 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, return; } - simplifyOpcodes(info->Instructions, false); + simplifyARM64Opcodes(info->Instructions, false); for (auto &I : info->EpilogMap) - simplifyOpcodes(I.second, true); + simplifyARM64Opcodes(I.second.Instructions, true); MCContext &context = streamer.getContext(); MCSymbol *Label = context.createTempSymbol(); @@ -951,10 +1002,12 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions); uint32_t TotalCodeBytes = PrologCodeBytes; - int PackedEpilogOffset = checkPackedEpilog(streamer, info, PrologCodeBytes); + int PackedEpilogOffset = + checkARM64PackedEpilog(streamer, info, PrologCodeBytes); - if (PackedEpilogOffset >= 0 && !info->HandlesExceptions && - FuncLength <= 0x7ff && TryPacked) { + if (PackedEpilogOffset >= 0 && + uint32_t(PackedEpilogOffset) < PrologCodeBytes && + !info->HandlesExceptions && FuncLength <= 0x7ff && TryPacked) { // Matching prolog/epilog and no exception handlers; check if the // prolog matches the patterns that can be described by the packed // format. @@ -963,7 +1016,7 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, // unwind info there. Keep using that as indicator that this unwind // info has been generated already. - if (tryPackedUnwind(info, FuncLength, PackedEpilogOffset)) + if (tryARM64PackedUnwind(info, FuncLength, PackedEpilogOffset)) return; } @@ -974,11 +1027,12 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, for (auto &I : info->EpilogMap) { MCSymbol *EpilogStart = I.first; - auto &EpilogInstrs = I.second; + auto &EpilogInstrs = I.second.Instructions; uint32_t CodeBytes = ARM64CountOfUnwindCodes(EpilogInstrs); MCSymbol* MatchingEpilog = FindMatchingEpilog(EpilogInstrs, AddedEpilogs, info); + int PrologOffset; if (MatchingEpilog) { assert(EpilogInfo.find(MatchingEpilog) != EpilogInfo.end() && "Duplicate epilog not found"); @@ -986,6 +1040,12 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, // Clear the unwind codes in the EpilogMap, so that they don't get output // in the logic below. EpilogInstrs.clear(); + } else if ((PrologOffset = getARM64OffsetInProlog(info->Instructions, + EpilogInstrs)) >= 0) { + EpilogInfo[EpilogStart] = PrologOffset; + // Clear the unwind codes in the EpilogMap, so that they don't get output + // in the logic below. + EpilogInstrs.clear(); } else { EpilogInfo[EpilogStart] = TotalCodeBytes; TotalCodeBytes += CodeBytes; @@ -1016,8 +1076,6 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, // Extended Code Words, Extended Epilog Count if (ExtensionWord) { // FIXME: We should be able to split unwind info into multiple sections. - // FIXME: We should share epilog codes across epilogs, where possible, - // which would make this issue show up less frequently. 
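// [Illustration] The epilog loop above assigns each epilog a start index
// into the unwind-code array, reusing an existing index when the codes are
// already present (an identical earlier epilog, or a tail of the prolog).
// A compact sketch of that deduplication, with std::map/std::string standing
// in for the MCSymbol-keyed maps used by the real code:
#include <map>
#include <string>
#include <vector>
using Codes = std::vector<int>;
static std::map<std::string, unsigned>
layoutEpilogs(const std::map<std::string, Codes> &Epilogs,
              unsigned PrologBytes) {
  std::map<std::string, unsigned> StartIndex;
  std::map<Codes, unsigned> Seen; // Code sequence -> offset already emitted.
  unsigned Total = PrologBytes;
  for (const auto &[Name, C] : Epilogs) {
    auto [It, Inserted] = Seen.try_emplace(C, Total);
    if (Inserted)
      Total += unsigned(C.size()); // New sequence: append after existing codes.
    StartIndex[Name] = It->second; // Duplicates reuse the earlier offset.
  }
  return StartIndex;
}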
if (CodeWords > 0xFF || EpilogCount > 0xFFFF) report_fatal_error("SEH unwind data splitting not yet implemented"); uint32_t row2 = 0x0; @@ -1026,17 +1084,19 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, streamer.emitInt32(row2); } - // Epilog Start Index, Epilog Start Offset - for (auto &I : EpilogInfo) { - MCSymbol *EpilogStart = I.first; - uint32_t EpilogIndex = I.second; - uint32_t EpilogOffset = - (uint32_t)GetAbsDifference(streamer, EpilogStart, info->Begin); - if (EpilogOffset) - EpilogOffset /= 4; - uint32_t row3 = EpilogOffset; - row3 |= (EpilogIndex & 0x3FF) << 22; - streamer.emitInt32(row3); + if (PackedEpilogOffset < 0) { + // Epilog Start Index, Epilog Start Offset + for (auto &I : EpilogInfo) { + MCSymbol *EpilogStart = I.first; + uint32_t EpilogIndex = I.second; + uint32_t EpilogOffset = + (uint32_t)GetAbsDifference(streamer, EpilogStart, info->Begin); + if (EpilogOffset) + EpilogOffset /= 4; + uint32_t row3 = EpilogOffset; + row3 |= (EpilogIndex & 0x3FF) << 22; + streamer.emitInt32(row3); + } } // Emit prolog unwind instructions (in reverse order). @@ -1044,14 +1104,14 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, for (uint8_t c = 0; c < numInst; ++c) { WinEH::Instruction inst = info->Instructions.back(); info->Instructions.pop_back(); - ARM64EmitUnwindCode(streamer, info->Begin, inst); + ARM64EmitUnwindCode(streamer, inst); } // Emit epilog unwind instructions for (auto &I : info->EpilogMap) { - auto &EpilogInstrs = I.second; + auto &EpilogInstrs = I.second.Instructions; for (const WinEH::Instruction &inst : EpilogInstrs) - ARM64EmitUnwindCode(streamer, info->Begin, inst); + ARM64EmitUnwindCode(streamer, inst); } int32_t BytesMod = CodeWords * 4 - TotalCodeBytes; @@ -1066,8 +1126,1087 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, 4); } -static void ARM64EmitRuntimeFunction(MCStreamer &streamer, - const WinEH::FrameInfo *info) { +static uint32_t ARMCountOfUnwindCodes(ArrayRef Insns) { + uint32_t Count = 0; + for (const auto &I : Insns) { + switch (static_cast(I.Operation)) { + default: + llvm_unreachable("Unsupported ARM unwind code"); + case Win64EH::UOP_AllocSmall: + Count += 1; + break; + case Win64EH::UOP_AllocLarge: + Count += 3; + break; + case Win64EH::UOP_AllocHuge: + Count += 4; + break; + case Win64EH::UOP_WideAllocMedium: + Count += 2; + break; + case Win64EH::UOP_WideAllocLarge: + Count += 3; + break; + case Win64EH::UOP_WideAllocHuge: + Count += 4; + break; + case Win64EH::UOP_WideSaveRegMask: + Count += 2; + break; + case Win64EH::UOP_SaveSP: + Count += 1; + break; + case Win64EH::UOP_SaveRegsR4R7LR: + Count += 1; + break; + case Win64EH::UOP_WideSaveRegsR4R11LR: + Count += 1; + break; + case Win64EH::UOP_SaveFRegD8D15: + Count += 1; + break; + case Win64EH::UOP_SaveRegMask: + Count += 2; + break; + case Win64EH::UOP_SaveLR: + Count += 2; + break; + case Win64EH::UOP_SaveFRegD0D15: + Count += 2; + break; + case Win64EH::UOP_SaveFRegD16D31: + Count += 2; + break; + case Win64EH::UOP_Nop: + case Win64EH::UOP_WideNop: + case Win64EH::UOP_End: + case Win64EH::UOP_EndNop: + case Win64EH::UOP_WideEndNop: + Count += 1; + break; + case Win64EH::UOP_Custom: { + int J; + for (J = 3; J > 0; J--) + if (I.Offset & (0xffu << (8 * J))) + break; + Count += J + 1; + break; + } + } + } + return Count; +} + +static uint32_t ARMCountOfInstructionBytes(ArrayRef Insns, + bool *HasCustom = nullptr) { + uint32_t Count = 0; + for (const auto &I : Insns) { + switch 
(static_cast(I.Operation)) { + default: + llvm_unreachable("Unsupported ARM unwind code"); + case Win64EH::UOP_AllocSmall: + case Win64EH::UOP_AllocLarge: + case Win64EH::UOP_AllocHuge: + Count += 2; + break; + case Win64EH::UOP_WideAllocMedium: + case Win64EH::UOP_WideAllocLarge: + case Win64EH::UOP_WideAllocHuge: + Count += 4; + break; + case Win64EH::UOP_WideSaveRegMask: + case Win64EH::UOP_WideSaveRegsR4R11LR: + Count += 4; + break; + case Win64EH::UOP_SaveSP: + Count += 2; + break; + case Win64EH::UOP_SaveRegMask: + case Win64EH::UOP_SaveRegsR4R7LR: + Count += 2; + break; + case Win64EH::UOP_SaveFRegD8D15: + case Win64EH::UOP_SaveFRegD0D15: + case Win64EH::UOP_SaveFRegD16D31: + Count += 4; + break; + case Win64EH::UOP_SaveLR: + Count += 4; + break; + case Win64EH::UOP_Nop: + case Win64EH::UOP_EndNop: + Count += 2; + break; + case Win64EH::UOP_WideNop: + case Win64EH::UOP_WideEndNop: + Count += 4; + break; + case Win64EH::UOP_End: + // This doesn't map to any instruction + break; + case Win64EH::UOP_Custom: + // We can't reason about what instructions this maps to; return a + // phony number to make sure we don't accidentally do epilog packing. + Count += 1000; + if (HasCustom) + *HasCustom = true; + break; + } + } + return Count; +} + +static void checkARMInstructions(MCStreamer &Streamer, + ArrayRef Insns, + const MCSymbol *Begin, const MCSymbol *End, + StringRef Name, StringRef Type) { + if (!End) + return; + Optional MaybeDistance = + GetOptionalAbsDifference(Streamer, End, Begin); + if (!MaybeDistance) + return; + uint32_t Distance = (uint32_t)*MaybeDistance; + bool HasCustom = false; + uint32_t InstructionBytes = ARMCountOfInstructionBytes(Insns, &HasCustom); + if (HasCustom) + return; + if (Distance != InstructionBytes) { + Streamer.getContext().reportError( + SMLoc(), "Incorrect size for " + Name + " " + Type + ": " + + Twine(Distance) + + " bytes of instructions in range, but .seh directives " + "corresponding to " + + Twine(InstructionBytes) + " bytes\n"); + } +} + +static bool isARMTerminator(const WinEH::Instruction &inst) { + switch (static_cast(inst.Operation)) { + case Win64EH::UOP_End: + case Win64EH::UOP_EndNop: + case Win64EH::UOP_WideEndNop: + return true; + default: + return false; + } +} + +// Unwind opcode encodings and restrictions are documented at +// https://docs.microsoft.com/en-us/cpp/build/arm-exception-handling +static void ARMEmitUnwindCode(MCStreamer &streamer, + const WinEH::Instruction &inst) { + uint32_t w, lr; + int i; + switch (static_cast(inst.Operation)) { + default: + llvm_unreachable("Unsupported ARM unwind code"); + case Win64EH::UOP_AllocSmall: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0x7f); + streamer.emitInt8(inst.Offset / 4); + break; + case Win64EH::UOP_WideSaveRegMask: + assert((inst.Register & ~0x5fff) == 0); + lr = (inst.Register >> 14) & 1; + w = 0x8000 | (inst.Register & 0x1fff) | (lr << 13); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_SaveSP: + assert(inst.Register <= 0x0f); + streamer.emitInt8(0xc0 | inst.Register); + break; + case Win64EH::UOP_SaveRegsR4R7LR: + assert(inst.Register >= 4 && inst.Register <= 7); + assert(inst.Offset <= 1); + streamer.emitInt8(0xd0 | (inst.Register - 4) | (inst.Offset << 2)); + break; + case Win64EH::UOP_WideSaveRegsR4R11LR: + assert(inst.Register >= 8 && inst.Register <= 11); + assert(inst.Offset <= 1); + streamer.emitInt8(0xd8 | (inst.Register - 8) | (inst.Offset << 2)); + break; + case Win64EH::UOP_SaveFRegD8D15: + 
assert(inst.Register >= 8 && inst.Register <= 15); + streamer.emitInt8(0xe0 | (inst.Register - 8)); + break; + case Win64EH::UOP_WideAllocMedium: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0x3ff); + w = 0xe800 | (inst.Offset / 4); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_SaveRegMask: + assert((inst.Register & ~0x40ff) == 0); + lr = (inst.Register >> 14) & 1; + w = 0xec00 | (inst.Register & 0x0ff) | (lr << 8); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_SaveLR: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0x0f); + streamer.emitInt8(0xef); + streamer.emitInt8(inst.Offset / 4); + break; + case Win64EH::UOP_SaveFRegD0D15: + assert(inst.Register <= 15); + assert(inst.Offset <= 15); + assert(inst.Register <= inst.Offset); + streamer.emitInt8(0xf5); + streamer.emitInt8((inst.Register << 4) | inst.Offset); + break; + case Win64EH::UOP_SaveFRegD16D31: + assert(inst.Register >= 16 && inst.Register <= 31); + assert(inst.Offset >= 16 && inst.Offset <= 31); + assert(inst.Register <= inst.Offset); + streamer.emitInt8(0xf6); + streamer.emitInt8(((inst.Register - 16) << 4) | (inst.Offset - 16)); + break; + case Win64EH::UOP_AllocLarge: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0xffff); + w = inst.Offset / 4; + streamer.emitInt8(0xf7); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_AllocHuge: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0xffffff); + w = inst.Offset / 4; + streamer.emitInt8(0xf8); + streamer.emitInt8((w >> 16) & 0xff); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_WideAllocLarge: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0xffff); + w = inst.Offset / 4; + streamer.emitInt8(0xf9); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_WideAllocHuge: + assert((inst.Offset & 3) == 0); + assert(inst.Offset / 4 <= 0xffffff); + w = inst.Offset / 4; + streamer.emitInt8(0xfa); + streamer.emitInt8((w >> 16) & 0xff); + streamer.emitInt8((w >> 8) & 0xff); + streamer.emitInt8((w >> 0) & 0xff); + break; + case Win64EH::UOP_Nop: + streamer.emitInt8(0xfb); + break; + case Win64EH::UOP_WideNop: + streamer.emitInt8(0xfc); + break; + case Win64EH::UOP_EndNop: + streamer.emitInt8(0xfd); + break; + case Win64EH::UOP_WideEndNop: + streamer.emitInt8(0xfe); + break; + case Win64EH::UOP_End: + streamer.emitInt8(0xff); + break; + case Win64EH::UOP_Custom: + for (i = 3; i > 0; i--) + if (inst.Offset & (0xffu << (8 * i))) + break; + for (; i >= 0; i--) + streamer.emitInt8((inst.Offset >> (8 * i)) & 0xff); + break; + } +} + +// Check if an epilog exists as a subset of the end of a prolog (backwards). +// An epilog may end with one out of three different end opcodes; if this +// is the first epilog that shares opcodes with the prolog, we can tolerate +// that this opcode differs (and the caller will update the prolog to use +// the same end opcode as the epilog). If another epilog already shares +// opcodes with the prolog, the ending opcode must be a strict match. +static int getARMOffsetInProlog(const std::vector &Prolog, + const std::vector &Epilog, + bool CanTweakProlog) { + // Can't find an epilog as a subset if it is longer than the prolog. 
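// [Illustration] Multi-byte opcodes above are written most significant byte
// first, and UOP_Custom trims leading zero bytes so only the significant
// bytes of the raw 32-bit value are emitted (at least one). A standalone
// sketch of that trim, matching the index loop in ARMEmitUnwindCode:
#include <cstdint>
#include <vector>
static void emitCustomOpcode(std::vector<uint8_t> &Out, uint32_t Value) {
  int I = 3;
  while (I > 0 && ((Value >> (8 * I)) & 0xffu) == 0)
    --I; // Find the highest non-zero byte; emit at least one byte.
  for (; I >= 0; --I)
    Out.push_back(uint8_t((Value >> (8 * I)) & 0xffu)); // MSB first.
}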
+ if (Epilog.size() > Prolog.size()) + return -1; + + // Check that the epilog actually is a perfect match for the end (backwards) + // of the prolog. + // If we can adjust the prolog afterwards, don't check that the end opcodes + // match. + int EndIdx = CanTweakProlog ? 1 : 0; + for (int I = Epilog.size() - 1; I >= EndIdx; I--) { + // TODO: Could also allow minor mismatches, e.g. "add sp, #16" vs + // "push {r0-r3}". + if (Prolog[I] != Epilog[Epilog.size() - 1 - I]) + return -1; + } + + if (CanTweakProlog) { + // Check that both prolog and epilog end with an expected end opcode. + if (Prolog.front().Operation != Win64EH::UOP_End) + return -1; + if (Epilog.back().Operation != Win64EH::UOP_End && + Epilog.back().Operation != Win64EH::UOP_EndNop && + Epilog.back().Operation != Win64EH::UOP_WideEndNop) + return -1; + } + + // If the epilog was a subset of the prolog, find its offset. + if (Epilog.size() == Prolog.size()) + return 0; + return ARMCountOfUnwindCodes(ArrayRef<WinEH::Instruction>( + &Prolog[Epilog.size()], Prolog.size() - Epilog.size())); +} + +static int checkARMPackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info, + int PrologCodeBytes) { + // Can only pack if there's one single epilog + if (info->EpilogMap.size() != 1) + return -1; + + const WinEH::FrameInfo::Epilog &EpilogInfo = info->EpilogMap.begin()->second; + // Can only pack if the epilog is unconditional + if (EpilogInfo.Condition != 0xe) // ARMCC::AL + return -1; + + const std::vector<WinEH::Instruction> &Epilog = EpilogInfo.Instructions; + // Make sure we have at least the trailing end opcode + if (info->Instructions.empty() || Epilog.empty()) + return -1; + + // Check that the epilog actually is at the very end of the function, + // otherwise it can't be packed. + Optional<int64_t> MaybeDistance = GetOptionalAbsDifference( + streamer, info->FuncletOrFuncEnd, info->EpilogMap.begin()->first); + if (!MaybeDistance) + return -1; + uint32_t DistanceFromEnd = (uint32_t)*MaybeDistance; + uint32_t InstructionBytes = ARMCountOfInstructionBytes(Epilog); + if (DistanceFromEnd != InstructionBytes) + return -1; + + int RetVal = -1; + // Even if we don't end up sharing opcodes with the prolog, we can still + // write the offset as a packed offset, if the single epilog is located at + // the end of the function and the offset (pointing after the prolog) fits + // as a packed offset. + if (PrologCodeBytes <= 31 && + PrologCodeBytes + ARMCountOfUnwindCodes(Epilog) <= 63) + RetVal = PrologCodeBytes; + + int Offset = + getARMOffsetInProlog(info->Instructions, Epilog, /*CanTweakProlog=*/true); + if (Offset < 0) + return RetVal; + + // Check that the offset and prolog size fit in the first word; it's + // unclear whether the epilog count in the extension word can be taken + // as packed epilog offset. + if (Offset > 31 || PrologCodeBytes > 63) + return RetVal; + + // Replace the regular end opcode of the prolog with the one from the + // epilog. + info->Instructions.front() = Epilog.back(); + + // As we choose to express the epilog as part of the prolog, remove the + // epilog from the map, so we don't try to emit its opcodes.
+ info->EpilogMap.clear(); + return Offset; +} + +static bool parseRegMask(unsigned Mask, bool &HasLR, bool &HasR11, + unsigned &Folded, int &IntRegs) { + if (Mask & (1 << 14)) { + HasLR = true; + Mask &= ~(1 << 14); + } + if (Mask & (1 << 11)) { + HasR11 = true; + Mask &= ~(1 << 11); + } + Folded = 0; + IntRegs = -1; + if (!Mask) + return true; + int First = 0; + // Shift right until we have the bits at the bottom + while ((Mask & 1) == 0) { + First++; + Mask >>= 1; + } + if ((Mask & (Mask + 1)) != 0) + return false; // Not a consecutive series of bits? Can't be packed. + // Count the bits + int N = 0; + while (Mask & (1 << N)) + N++; + if (First < 4) { + if (First + N < 4) + return false; + Folded = 4 - First; + N -= Folded; + First = 4; + } + if (First > 4) + return false; // Can't be packed + if (N >= 1) + IntRegs = N - 1; + return true; +} + +static bool tryARMPackedUnwind(MCStreamer &streamer, WinEH::FrameInfo *info, + uint32_t FuncLength) { + int Step = 0; + bool Homing = false; + bool HasR11 = false; + bool HasChain = false; + bool HasLR = false; + int IntRegs = -1; // r4 - r(4+N) + int FloatRegs = -1; // d8 - d(8+N) + unsigned PF = 0; // Number of extra pushed registers + unsigned StackAdjust = 0; + // Iterate over the prolog and check that all opcodes exactly match + // the canonical order and form. + for (const WinEH::Instruction &Inst : info->Instructions) { + switch (Inst.Operation) { + default: + llvm_unreachable("Unsupported ARM unwind code"); + case Win64EH::UOP_Custom: + case Win64EH::UOP_AllocLarge: + case Win64EH::UOP_AllocHuge: + case Win64EH::UOP_WideAllocLarge: + case Win64EH::UOP_WideAllocHuge: + case Win64EH::UOP_SaveFRegD0D15: + case Win64EH::UOP_SaveFRegD16D31: + // Can't be packed + return false; + case Win64EH::UOP_SaveSP: + // Can't be packed; we can't rely on restoring sp from r11 when + // unwinding a packed prologue. 
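+ return false;
// [Illustration] parseRegMask above accepts only a contiguous run of
// registers. Once trailing zeros are shifted out, a value is a run of
// consecutive one-bits exactly when adding 1 carries through all of them,
// i.e. (Mask & (Mask + 1)) == 0. A standalone sketch of that contiguity
// test:
static bool isConsecutiveRun(unsigned Mask) {
  if (Mask == 0)
    return false;
  while ((Mask & 1) == 0)
    Mask >>= 1;                    // Drop trailing zeros, as the parser does.
  return (Mask & (Mask + 1)) == 0; // 0b0111... + 1 clears every set bit.
}
// e.g. 0b0111000 -> true (an r3-r5-style run), 0b0101000 -> false.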
+ case Win64EH::UOP_SaveLR: + // Can't be present in a packed prologue + return false; + + case Win64EH::UOP_End: + case Win64EH::UOP_EndNop: + case Win64EH::UOP_WideEndNop: + if (Step != 0) + return false; + Step = 1; + break; + + case Win64EH::UOP_SaveRegsR4R7LR: + case Win64EH::UOP_WideSaveRegsR4R11LR: + // push {r4-r11,lr} + if (Step != 1 && Step != 2) + return false; + assert(Inst.Register >= 4 && Inst.Register <= 11); // r4-rX + assert(Inst.Offset <= 1); // Lr + IntRegs = Inst.Register - 4; + if (Inst.Register == 11) { + HasR11 = true; + IntRegs--; + } + if (Inst.Offset) + HasLR = true; + Step = 3; + break; + + case Win64EH::UOP_SaveRegMask: + if (Step == 1 && Inst.Register == 0x0f) { + // push {r0-r3} + Homing = true; + Step = 2; + break; + } + LLVM_FALLTHROUGH; + case Win64EH::UOP_WideSaveRegMask: + if (Step != 1 && Step != 2) + return false; + // push {r4-r9,r11,lr} + // push {r11,lr} + // push {r1-r5} + if (!parseRegMask(Inst.Register, HasLR, HasR11, PF, IntRegs)) + return false; + Step = 3; + break; + + case Win64EH::UOP_Nop: + // mov r11, sp + if (Step != 3 || !HasR11 || IntRegs >= 0 || PF > 0) + return false; + HasChain = true; + Step = 4; + break; + case Win64EH::UOP_WideNop: + // add.w r11, sp, #xx + if (Step != 3 || !HasR11 || (IntRegs < 0 && PF == 0)) + return false; + HasChain = true; + Step = 4; + break; + + case Win64EH::UOP_SaveFRegD8D15: + if (Step != 1 && Step != 2 && Step != 3 && Step != 4) + return false; + assert(Inst.Register >= 8 && Inst.Register <= 15); + if (Inst.Register == 15) + return false; // Can't pack this case, R==7 means no IntRegs + if (IntRegs >= 0) + return false; + FloatRegs = Inst.Register - 8; + Step = 5; + break; + + case Win64EH::UOP_AllocSmall: + case Win64EH::UOP_WideAllocMedium: + if (Step != 1 && Step != 2 && Step != 3 && Step != 4 && Step != 5) + return false; + if (PF > 0) // Can't have both folded and explicit stack allocation + return false; + if (Inst.Offset / 4 >= 0x3f4) + return false; + StackAdjust = Inst.Offset / 4; + Step = 6; + break; + } + } + if (HasR11 && !HasChain) { + if (IntRegs + 4 == 10) { + // r11 stored, but not chaining; can be packed if already saving r4-r10 + // and we can fit r11 into this range. + IntRegs++; + HasR11 = false; + } else + return false; + } + if (HasChain && !HasLR) + return false; + + // Packed unwind info can't express multiple epilogues. + if (info->EpilogMap.size() > 1) + return false; + + unsigned EF = 0; + int Ret = 0; + if (info->EpilogMap.size() == 0) { + Ret = 3; // No epilogue + } else { + // As the prologue and epilogue aren't exact mirrors of each other, + // we have to check the epilogue too and see if it matches what we've + // concluded from the prologue.
+ const WinEH::FrameInfo::Epilog &EpilogInfo = + info->EpilogMap.begin()->second; + if (EpilogInfo.Condition != 0xe) // ARMCC::AL + return false; + const std::vector &Epilog = EpilogInfo.Instructions; + Optional MaybeDistance = GetOptionalAbsDifference( + streamer, info->FuncletOrFuncEnd, info->EpilogMap.begin()->first); + if (!MaybeDistance) + return false; + uint32_t DistanceFromEnd = (uint32_t)*MaybeDistance; + uint32_t InstructionBytes = ARMCountOfInstructionBytes(Epilog); + if (DistanceFromEnd != InstructionBytes) + return false; + + bool GotStackAdjust = false; + bool GotFloatRegs = false; + bool GotIntRegs = false; + bool GotHomingRestore = false; + bool GotLRRestore = false; + bool NeedsReturn = false; + bool GotReturn = false; + + Step = 6; + for (const WinEH::Instruction &Inst : Epilog) { + switch (Inst.Operation) { + default: + llvm_unreachable("Unsupported ARM unwind code"); + case Win64EH::UOP_Custom: + case Win64EH::UOP_AllocLarge: + case Win64EH::UOP_AllocHuge: + case Win64EH::UOP_WideAllocLarge: + case Win64EH::UOP_WideAllocHuge: + case Win64EH::UOP_SaveFRegD0D15: + case Win64EH::UOP_SaveFRegD16D31: + case Win64EH::UOP_SaveSP: + case Win64EH::UOP_Nop: + case Win64EH::UOP_WideNop: + // Can't be packed in an epilogue + return false; + + case Win64EH::UOP_AllocSmall: + case Win64EH::UOP_WideAllocMedium: + if (Inst.Offset / 4 >= 0x3f4) + return false; + if (Step == 6) { + if (Homing && FloatRegs < 0 && IntRegs < 0 && StackAdjust == 0 && + PF == 0 && Inst.Offset == 16) { + GotHomingRestore = true; + Step = 10; + } else { + if (StackAdjust > 0) { + // Got stack adjust in prologue too; must match. + if (StackAdjust != Inst.Offset / 4) + return false; + GotStackAdjust = true; + } else if (PF == Inst.Offset / 4) { + // Folded prologue, non-folded epilogue + StackAdjust = Inst.Offset / 4; + GotStackAdjust = true; + } else { + // StackAdjust == 0 in prologue, mismatch + return false; + } + Step = 7; + } + } else if (Step == 7 || Step == 8 || Step == 9) { + if (!Homing || Inst.Offset != 16) + return false; + GotHomingRestore = true; + Step = 10; + } else + return false; + break; + + case Win64EH::UOP_SaveFRegD8D15: + if (Step != 6 && Step != 7) + return false; + assert(Inst.Register >= 8 && Inst.Register <= 15); + if (FloatRegs != (int)(Inst.Register - 8)) + return false; + GotFloatRegs = true; + Step = 8; + break; + + case Win64EH::UOP_SaveRegsR4R7LR: + case Win64EH::UOP_WideSaveRegsR4R11LR: { + // push {r4-r11,lr} + if (Step != 6 && Step != 7 && Step != 8) + return false; + assert(Inst.Register >= 4 && Inst.Register <= 11); // r4-rX + assert(Inst.Offset <= 1); // Lr + if (Homing && HasLR) { + // If homing and LR is backed up, we can either restore LR here + // and return with Ret == 1 or 2, or return with SaveLR below + if (Inst.Offset) { + GotLRRestore = true; + NeedsReturn = true; + } else { + // Expecting a separate SaveLR below + } + } else { + if (HasLR != (Inst.Offset == 1)) + return false; + } + GotLRRestore = Inst.Offset == 1; + if (IntRegs < 0) // This opcode must include r4 + return false; + int Expected = IntRegs; + if (HasChain) { + // Can't express r11 here unless IntRegs describe r4-r10 + if (IntRegs != 6) + return false; + Expected++; + } + if (Expected != (int)(Inst.Register - 4)) + return false; + GotIntRegs = true; + Step = 9; + break; + } + + case Win64EH::UOP_SaveRegMask: + case Win64EH::UOP_WideSaveRegMask: { + if (Step != 6 && Step != 7 && Step != 8) + return false; + // push {r4-r9,r11,lr} + // push {r11,lr} + // push {r1-r5} + bool CurHasLR = false, CurHasR11 
= false; + int Regs; + if (!parseRegMask(Inst.Register, CurHasLR, CurHasR11, EF, Regs)) + return false; + if (EF > 0) { + if (EF != PF && EF != StackAdjust) + return false; + } + if (Homing && HasLR) { + // If homing and LR is backed up, we can either restore LR here + // and return with Ret == 1 or 2, or return with SaveLR below + if (CurHasLR) { + GotLRRestore = true; + NeedsReturn = true; + } else { + // Expecting a separate SaveLR below + } + } else { + if (CurHasLR != HasLR) + return false; + GotLRRestore = CurHasLR; + } + int Expected = IntRegs; + if (HasChain) { + // If we have chaining, the mask must have included r11. + if (!CurHasR11) + return false; + } else if (Expected == 7) { + // If we don't have chaining, the mask could still include r11, + // expressed as part of IntRegs Instead. + Expected--; + if (!CurHasR11) + return false; + } else { + // Neither HasChain nor r11 included in IntRegs, must not have r11 + // here either. + if (CurHasR11) + return false; + } + if (Expected != Regs) + return false; + GotIntRegs = true; + Step = 9; + break; + } + + case Win64EH::UOP_SaveLR: + if (Step != 6 && Step != 7 && Step != 8 && Step != 9) + return false; + if (!Homing || Inst.Offset != 20 || GotLRRestore) + return false; + GotLRRestore = true; + GotHomingRestore = true; + Step = 10; + break; + + case Win64EH::UOP_EndNop: + case Win64EH::UOP_WideEndNop: + GotReturn = true; + Ret = (Inst.Operation == Win64EH::UOP_EndNop) ? 1 : 2; + LLVM_FALLTHROUGH; + case Win64EH::UOP_End: + if (Step != 6 && Step != 7 && Step != 8 && Step != 9 && Step != 10) + return false; + Step = 11; + break; + } + } + + if (Step != 11) + return false; + if (StackAdjust > 0 && !GotStackAdjust && EF == 0) + return false; + if (FloatRegs >= 0 && !GotFloatRegs) + return false; + if (IntRegs >= 0 && !GotIntRegs) + return false; + if (Homing && !GotHomingRestore) + return false; + if (HasLR && !GotLRRestore) + return false; + if (NeedsReturn && !GotReturn) + return false; + } + + assert(PF == 0 || EF == 0 || + StackAdjust == 0); // Can't have adjust in all three + if (PF > 0 || EF > 0) { + StackAdjust = PF > 0 ? (PF - 1) : (EF - 1); + assert(StackAdjust <= 3); + StackAdjust |= 0x3f0; + if (PF > 0) + StackAdjust |= 1 << 2; + if (EF > 0) + StackAdjust |= 1 << 3; + } + + assert(FuncLength <= 0x7FF && "FuncLength should have been checked earlier"); + int Flag = info->Fragment ? 0x02 : 0x01; + int H = Homing ? 1 : 0; + int L = HasLR ? 1 : 0; + int C = HasChain ? 1 : 0; + assert(IntRegs < 0 || FloatRegs < 0); + unsigned Reg, R; + if (IntRegs >= 0) { + Reg = IntRegs; + assert(Reg <= 7); + R = 0; + } else if (FloatRegs >= 0) { + Reg = FloatRegs; + assert(Reg < 7); + R = 1; + } else { + // No int or float regs stored (except possibly R11,LR) + Reg = 7; + R = 1; + } + info->PackedInfo |= Flag << 0; + info->PackedInfo |= (FuncLength & 0x7FF) << 2; + info->PackedInfo |= (Ret & 0x3) << 13; + info->PackedInfo |= H << 15; + info->PackedInfo |= Reg << 16; + info->PackedInfo |= R << 19; + info->PackedInfo |= L << 20; + info->PackedInfo |= C << 21; + assert(StackAdjust <= 0x3ff); + info->PackedInfo |= StackAdjust << 22; + return true; +} + +// Populate the .xdata section. The format of .xdata on ARM is documented at +// https://docs.microsoft.com/en-us/cpp/build/arm-exception-handling +static void ARMEmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, + bool TryPacked = true) { + // If this UNWIND_INFO already has a symbol, it's already been emitted. 
+  if (info->Symbol)
+    return;
+  // If there's no unwind info here (not even a terminating UOP_End), the
+  // unwind info is considered bogus and skipped. If this was done in
+  // response to an explicit .seh_handlerdata, the associated trailing
+  // handler data is left orphaned in the xdata section.
+  if (info->empty()) {
+    info->EmitAttempted = true;
+    return;
+  }
+  if (info->EmitAttempted) {
+    // If we tried to emit unwind info before (due to an explicit
+    // .seh_handlerdata directive), but skipped it (because there was no
+    // valid information to emit at the time), and it later got valid unwind
+    // opcodes, we can't emit it here, because the trailing handler data
+    // was already emitted elsewhere in the xdata section.
+    streamer.getContext().reportError(
+        SMLoc(), "Earlier .seh_handlerdata for " + info->Function->getName() +
+                     " skipped due to no unwind info at the time "
+                     "(.seh_handlerdata too early?), but the function later "
+                     "did get unwind info that can't be emitted");
+    return;
+  }
+
+  MCContext &context = streamer.getContext();
+  MCSymbol *Label = context.createTempSymbol();
+
+  streamer.emitValueToAlignment(4);
+  streamer.emitLabel(Label);
+  info->Symbol = Label;
+
+  if (!info->PrologEnd)
+    streamer.getContext().reportError(SMLoc(), "Prologue in " +
+                                                   info->Function->getName() +
+                                                   " not correctly terminated");
+
+  if (info->PrologEnd && !info->Fragment)
+    checkARMInstructions(streamer, info->Instructions, info->Begin,
+                         info->PrologEnd, info->Function->getName(),
+                         "prologue");
+  for (auto &I : info->EpilogMap) {
+    MCSymbol *EpilogStart = I.first;
+    auto &Epilog = I.second;
+    checkARMInstructions(streamer, Epilog.Instructions, EpilogStart, Epilog.End,
+                         info->Function->getName(), "epilogue");
+    if (Epilog.Instructions.empty() ||
+        !isARMTerminator(Epilog.Instructions.back()))
+      streamer.getContext().reportError(
+          SMLoc(), "Epilogue in " + info->Function->getName() +
+                       " not correctly terminated");
+  }
+
+  Optional<int64_t> RawFuncLength;
+  const MCExpr *FuncLengthExpr = nullptr;
+  if (!info->FuncletOrFuncEnd) {
+    report_fatal_error("FuncletOrFuncEnd not set");
+  } else {
+    // As the size of many thumb2 instructions isn't known until later,
+    // we can't always rely on being able to calculate the absolute
+    // length of the function here. If we can't calculate it, defer it
+    // to a relocation.
+    //
+    // In such a case, we won't know if the function is too long so that
+    // the unwind info would need to be split (but this isn't implemented
+    // anyway).
+    RawFuncLength =
+        GetOptionalAbsDifference(streamer, info->FuncletOrFuncEnd, info->Begin);
+    if (!RawFuncLength)
+      FuncLengthExpr =
+          GetSubDivExpr(streamer, info->FuncletOrFuncEnd, info->Begin, 2);
+  }
+  uint32_t FuncLength = 0;
+  if (RawFuncLength)
+    FuncLength = (uint32_t)*RawFuncLength / 2;
+  if (FuncLength > 0x3FFFF)
+    report_fatal_error("SEH unwind data splitting not yet implemented");
+  uint32_t PrologCodeBytes = ARMCountOfUnwindCodes(info->Instructions);
+  uint32_t TotalCodeBytes = PrologCodeBytes;
+
+  if (!info->HandlesExceptions && RawFuncLength && FuncLength <= 0x7ff &&
+      TryPacked) {
+    // No exception handlers; check if the prolog and epilog matches the
+    // patterns that can be described by the packed format. If we don't
+    // know the exact function length yet, we can't do this.
+
+    // info->Symbol was already set even if we didn't actually write any
+    // unwind info there. Keep using that as indicator that this unwind
+    // info has been generated already.
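For orientation, the 32-bit packed .pdata word that tryARMPackedUnwind (defined above, invoked just below) assembles can be sketched on its own. This is a reading aid, not part of the patch; the helper name is ours, and the field meanings simply mirror the shift/mask sequence shown earlier:

#include <cstdint>

// Mirrors the info->PackedInfo |= ... sequence in tryARMPackedUnwind above.
static uint32_t packARMPdataWord(uint32_t Flag, uint32_t FuncLength,
                                 uint32_t Ret, bool Homing, uint32_t Reg,
                                 uint32_t R, bool HasLR, bool HasChain,
                                 uint32_t StackAdjust) {
  uint32_t Word = 0;
  Word |= Flag << 0;                  // 1 = packed unwind data, 2 = fragment
  Word |= (FuncLength & 0x7FF) << 2;  // function length in halfwords
  Word |= (Ret & 0x3) << 13;          // return style
  Word |= (Homing ? 1u : 0u) << 15;   // H: parameter homing
  Word |= Reg << 16;                  // last saved register number
  Word |= R << 19;                    // R: 0 = int regs, 1 = float regs
  Word |= (HasLR ? 1u : 0u) << 20;    // L: LR saved
  Word |= (HasChain ? 1u : 0u) << 21; // C: r11 frame chain
  Word |= StackAdjust << 22;          // 10-bit stack adjustment field
  return Word;
}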
+
+    if (tryARMPackedUnwind(streamer, info, FuncLength))
+      return;
+  }
+
+  int PackedEpilogOffset =
+      checkARMPackedEpilog(streamer, info, PrologCodeBytes);
+
+  // Process epilogs.
+  MapVector<MCSymbol *, uint32_t> EpilogInfo;
+  // Epilogs processed so far.
+  std::vector<MCSymbol *> AddedEpilogs;
+
+  bool CanTweakProlog = true;
+  for (auto &I : info->EpilogMap) {
+    MCSymbol *EpilogStart = I.first;
+    auto &EpilogInstrs = I.second.Instructions;
+    uint32_t CodeBytes = ARMCountOfUnwindCodes(EpilogInstrs);
+
+    MCSymbol *MatchingEpilog =
+        FindMatchingEpilog(EpilogInstrs, AddedEpilogs, info);
+    int PrologOffset;
+    if (MatchingEpilog) {
+      assert(EpilogInfo.find(MatchingEpilog) != EpilogInfo.end() &&
+             "Duplicate epilog not found");
+      EpilogInfo[EpilogStart] = EpilogInfo.lookup(MatchingEpilog);
+      // Clear the unwind codes in the EpilogMap, so that they don't get output
+      // in the logic below.
+      EpilogInstrs.clear();
+    } else if ((PrologOffset = getARMOffsetInProlog(
+                    info->Instructions, EpilogInstrs, CanTweakProlog)) >= 0) {
+      if (CanTweakProlog) {
+        // Replace the regular end opcode of the prolog with the one from the
+        // epilog.
+        info->Instructions.front() = EpilogInstrs.back();
+        // Later epilogs need a strict match for the end opcode.
+        CanTweakProlog = false;
+      }
+      EpilogInfo[EpilogStart] = PrologOffset;
+      // Clear the unwind codes in the EpilogMap, so that they don't get output
+      // in the logic below.
+      EpilogInstrs.clear();
+    } else {
+      EpilogInfo[EpilogStart] = TotalCodeBytes;
+      TotalCodeBytes += CodeBytes;
+      AddedEpilogs.push_back(EpilogStart);
+    }
+  }
+
+  // Code Words, Epilog count, F, E, X, Vers, Function Length
+  uint32_t row1 = 0x0;
+  uint32_t CodeWords = TotalCodeBytes / 4;
+  uint32_t CodeWordsMod = TotalCodeBytes % 4;
+  if (CodeWordsMod)
+    CodeWords++;
+  uint32_t EpilogCount =
+      PackedEpilogOffset >= 0 ? PackedEpilogOffset : info->EpilogMap.size();
+  bool ExtensionWord = EpilogCount > 31 || CodeWords > 15;
+  if (!ExtensionWord) {
+    row1 |= (EpilogCount & 0x1F) << 23;
+    row1 |= (CodeWords & 0x0F) << 28;
+  }
+  if (info->HandlesExceptions) // X
+    row1 |= 1 << 20;
+  if (PackedEpilogOffset >= 0) // E
+    row1 |= 1 << 21;
+  if (info->Fragment) // F
+    row1 |= 1 << 22;
+  row1 |= FuncLength & 0x3FFFF;
+  if (RawFuncLength)
+    streamer.emitInt32(row1);
+  else
+    streamer.emitValue(
+        MCBinaryExpr::createOr(FuncLengthExpr,
+                               MCConstantExpr::create(row1, context), context),
+        4);
+
+  // Extended Code Words, Extended Epilog Count
+  if (ExtensionWord) {
+    // FIXME: We should be able to split unwind info into multiple sections.
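// (Editorial note: when EpilogCount or CodeWords does not fit in the first
// header word -- bits 23-27 and 28-31 of row1, which are left zero above in
// that case -- both counts move into this extension word instead, with
// EpilogCount in bits 0-15 and CodeWords in bits 16-23, as the row2 packing
// just below shows.)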
+ if (CodeWords > 0xFF || EpilogCount > 0xFFFF) + report_fatal_error("SEH unwind data splitting not yet implemented"); + uint32_t row2 = 0x0; + row2 |= (CodeWords & 0xFF) << 16; + row2 |= (EpilogCount & 0xFFFF); + streamer.emitInt32(row2); + } + + if (PackedEpilogOffset < 0) { + // Epilog Start Index, Epilog Start Offset + for (auto &I : EpilogInfo) { + MCSymbol *EpilogStart = I.first; + uint32_t EpilogIndex = I.second; + + Optional MaybeEpilogOffset = + GetOptionalAbsDifference(streamer, EpilogStart, info->Begin); + const MCExpr *OffsetExpr = nullptr; + uint32_t EpilogOffset = 0; + if (MaybeEpilogOffset) + EpilogOffset = *MaybeEpilogOffset / 2; + else + OffsetExpr = GetSubDivExpr(streamer, EpilogStart, info->Begin, 2); + + assert(info->EpilogMap.find(EpilogStart) != info->EpilogMap.end()); + unsigned Condition = info->EpilogMap[EpilogStart].Condition; + assert(Condition <= 0xf); + + uint32_t row3 = EpilogOffset; + row3 |= Condition << 20; + row3 |= (EpilogIndex & 0x3FF) << 24; + if (MaybeEpilogOffset) + streamer.emitInt32(row3); + else + streamer.emitValue( + MCBinaryExpr::createOr( + OffsetExpr, MCConstantExpr::create(row3, context), context), + 4); + } + } + + // Emit prolog unwind instructions (in reverse order). + uint8_t numInst = info->Instructions.size(); + for (uint8_t c = 0; c < numInst; ++c) { + WinEH::Instruction inst = info->Instructions.back(); + info->Instructions.pop_back(); + ARMEmitUnwindCode(streamer, inst); + } + + // Emit epilog unwind instructions + for (auto &I : info->EpilogMap) { + auto &EpilogInstrs = I.second.Instructions; + for (uint32_t i = 0; i < EpilogInstrs.size(); i++) { + WinEH::Instruction inst = EpilogInstrs[i]; + ARMEmitUnwindCode(streamer, inst); + } + } + + int32_t BytesMod = CodeWords * 4 - TotalCodeBytes; + assert(BytesMod >= 0); + for (int i = 0; i < BytesMod; i++) + streamer.emitInt8(0xFB); + + if (info->HandlesExceptions) + streamer.emitValue( + MCSymbolRefExpr::create(info->ExceptionHandler, + MCSymbolRefExpr::VK_COFF_IMGREL32, context), + 4); +} + +static void ARMEmitRuntimeFunction(MCStreamer &streamer, + const WinEH::FrameInfo *info) { MCContext &context = streamer.getContext(); streamer.emitValueToAlignment(4); @@ -1088,7 +2227,7 @@ void llvm::Win64EH::ARM64UnwindEmitter::Emit(MCStreamer &Streamer) const { if (Info->empty()) continue; MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection); - Streamer.SwitchSection(XData); + Streamer.switchSection(XData); ARM64EmitUnwindInfo(Streamer, Info); } @@ -1101,8 +2240,8 @@ void llvm::Win64EH::ARM64UnwindEmitter::Emit(MCStreamer &Streamer) const { if (!Info->Symbol) continue; MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection); - Streamer.SwitchSection(PData); - ARM64EmitRuntimeFunction(Streamer, Info); + Streamer.switchSection(PData); + ARMEmitRuntimeFunction(Streamer, Info); } } @@ -1116,12 +2255,57 @@ void llvm::Win64EH::ARM64UnwindEmitter::EmitUnwindInfo(MCStreamer &Streamer, // end hasn't been marked yet, the xdata function length won't cover the // whole function, only up to this point. if (!info->FuncletOrFuncEnd) { - Streamer.SwitchSection(info->TextSection); + Streamer.switchSection(info->TextSection); info->FuncletOrFuncEnd = Streamer.emitCFILabel(); } // Switch sections (the static function above is meant to be called from // here and from Emit(). 
MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection); - Streamer.SwitchSection(XData); + Streamer.switchSection(XData); ARM64EmitUnwindInfo(Streamer, info, /* TryPacked = */ !HandlerData); } + +void llvm::Win64EH::ARMUnwindEmitter::Emit(MCStreamer &Streamer) const { + // Emit the unwind info structs first. + for (const auto &CFI : Streamer.getWinFrameInfos()) { + WinEH::FrameInfo *Info = CFI.get(); + if (Info->empty()) + continue; + MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection); + Streamer.switchSection(XData); + ARMEmitUnwindInfo(Streamer, Info); + } + + // Now emit RUNTIME_FUNCTION entries. + for (const auto &CFI : Streamer.getWinFrameInfos()) { + WinEH::FrameInfo *Info = CFI.get(); + // ARMEmitUnwindInfo above clears the info struct, so we can't check + // empty here. But if a Symbol is set, we should create the corresponding + // pdata entry. + if (!Info->Symbol) + continue; + MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection); + Streamer.switchSection(PData); + ARMEmitRuntimeFunction(Streamer, Info); + } +} + +void llvm::Win64EH::ARMUnwindEmitter::EmitUnwindInfo(MCStreamer &Streamer, + WinEH::FrameInfo *info, + bool HandlerData) const { + // Called if there's an .seh_handlerdata directive before the end of the + // function. This forces writing the xdata record already here - and + // in this case, the function isn't actually ended already, but the xdata + // record needs to know the function length. In these cases, if the funclet + // end hasn't been marked yet, the xdata function length won't cover the + // whole function, only up to this point. + if (!info->FuncletOrFuncEnd) { + Streamer.switchSection(info->TextSection); + info->FuncletOrFuncEnd = Streamer.emitCFILabel(); + } + // Switch sections (the static function above is meant to be called from + // here and from Emit(). + MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection); + Streamer.switchSection(XData); + ARMEmitUnwindInfo(Streamer, info, /* TryPacked = */ !HandlerData); +} diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 0dfe5a5c2bdb..ad883131eae1 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" @@ -27,14 +28,12 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbolCOFF.h" -#include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/raw_ostream.h" #include -#include #include using namespace llvm; @@ -71,16 +70,16 @@ void MCWinCOFFStreamer::initSections(bool NoExecStack, // FIXME: this is identical to the ELF one. // This emulates the same behavior of GNU as. This makes it easier // to compare the output as the major sections are in the same order. 
- SwitchSection(getContext().getObjectFileInfo()->getTextSection()); + switchSection(getContext().getObjectFileInfo()->getTextSection()); emitCodeAlignment(4, &STI); - SwitchSection(getContext().getObjectFileInfo()->getDataSection()); + switchSection(getContext().getObjectFileInfo()->getDataSection()); emitCodeAlignment(4, &STI); - SwitchSection(getContext().getObjectFileInfo()->getBSSSection()); + switchSection(getContext().getObjectFileInfo()->getBSSSection()); emitCodeAlignment(4, &STI); - SwitchSection(getContext().getObjectFileInfo()->getTextSection()); + switchSection(getContext().getObjectFileInfo()->getTextSection()); } void MCWinCOFFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) { @@ -134,7 +133,7 @@ void MCWinCOFFStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { llvm_unreachable("not implemented"); } -void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *S) { +void MCWinCOFFStreamer::beginCOFFSymbolDef(MCSymbol const *S) { auto *Symbol = cast(S); if (CurSymbol) Error("starting a new symbol definition without completing the " @@ -142,7 +141,7 @@ void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *S) { CurSymbol = Symbol; } -void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { +void MCWinCOFFStreamer::emitCOFFSymbolStorageClass(int StorageClass) { if (!CurSymbol) { Error("storage class specified outside of symbol definition"); return; @@ -158,7 +157,7 @@ void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { cast(CurSymbol)->setClass((uint16_t)StorageClass); } -void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { +void MCWinCOFFStreamer::emitCOFFSymbolType(int Type) { if (!CurSymbol) { Error("symbol type specified outside of a symbol definition"); return; @@ -173,13 +172,13 @@ void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { cast(CurSymbol)->setType((uint16_t)Type); } -void MCWinCOFFStreamer::EndCOFFSymbolDef() { +void MCWinCOFFStreamer::endCOFFSymbolDef() { if (!CurSymbol) Error("ending symbol definition without starting one"); CurSymbol = nullptr; } -void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) { +void MCWinCOFFStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) { // SafeSEH is a feature specific to 32-bit x86. It does not exist (and is // unnecessary) on all platforms which use table-based exception dispatch. 
if (getContext().getTargetTriple().getArch() != Triple::x86) @@ -205,7 +204,7 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) { << COFF::SCT_COMPLEX_TYPE_SHIFT); } -void MCWinCOFFStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) { +void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { MCSection *Sec = getCurrentSectionOnly(); getAssembler().registerSection(*Sec); if (Sec->getAlignment() < 4) @@ -216,7 +215,7 @@ void MCWinCOFFStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) { getAssembler().registerSymbol(*Symbol); } -void MCWinCOFFStreamer::EmitCOFFSectionIndex(const MCSymbol *Symbol) { +void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { visitUsedSymbol(*Symbol); MCDataFragment *DF = getOrCreateDataFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); @@ -225,7 +224,7 @@ void MCWinCOFFStreamer::EmitCOFFSectionIndex(const MCSymbol *Symbol) { DF->getContents().resize(DF->getContents().size() + 2, 0); } -void MCWinCOFFStreamer::EmitCOFFSecRel32(const MCSymbol *Symbol, +void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, uint64_t Offset) { visitUsedSymbol(*Symbol); MCDataFragment *DF = getOrCreateDataFragment(); @@ -243,7 +242,7 @@ void MCWinCOFFStreamer::EmitCOFFSecRel32(const MCSymbol *Symbol, DF->getContents().resize(DF->getContents().size() + 4, 0); } -void MCWinCOFFStreamer::EmitCOFFImgRel32(const MCSymbol *Symbol, +void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, int64_t Offset) { visitUsedSymbol(*Symbol); MCDataFragment *DF = getOrCreateDataFragment(); @@ -287,10 +286,10 @@ void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, OS << " -aligncomm:\"" << Symbol->getName() << "\"," << Log2_32_Ceil(ByteAlignment); - PushSection(); - SwitchSection(MFI->getDrectveSection()); + pushSection(); + switchSection(MFI->getDrectveSection()); emitBytes(Directive); - PopSection(); + popSection(); } } @@ -299,13 +298,13 @@ void MCWinCOFFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size, auto *Symbol = cast(S); MCSection *Section = getContext().getObjectFileInfo()->getBSSSection(); - PushSection(); - SwitchSection(Section); + pushSection(); + switchSection(Section); emitValueToAlignment(ByteAlignment, 0, 1, 0); emitLabel(Symbol); Symbol->setExternal(false); emitZeros(Size); - PopSection(); + popSection(); } void MCWinCOFFStreamer::emitWeakReference(MCSymbol *AliasS, @@ -334,7 +333,7 @@ void MCWinCOFFStreamer::emitIdent(StringRef IdentString) { llvm_unreachable("not implemented"); } -void MCWinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { +void MCWinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { llvm_unreachable("not implemented"); } diff --git a/llvm/lib/MC/MCWinEH.cpp b/llvm/lib/MC/MCWinEH.cpp index e58a0b2cf654..1a6d5a3b562e 100644 --- a/llvm/lib/MC/MCWinEH.cpp +++ b/llvm/lib/MC/MCWinEH.cpp @@ -7,18 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCWinEH.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCSectionCOFF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" namespace llvm { namespace WinEH { -UnwindEmitter::~UnwindEmitter() {} +UnwindEmitter::~UnwindEmitter() = default; } } diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 90604782de13..a4a42279d6e2 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ 
b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -13,12 +13,14 @@ #include "llvm/MC/MCXCOFFStreamer.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -54,6 +56,9 @@ bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym, case llvm::MCSA_Protected: Symbol->setVisibilityType(XCOFF::SYM_V_PROTECTED); break; + case llvm::MCSA_Exported: + Symbol->setVisibilityType(XCOFF::SYM_V_EXPORTED); + break; default: report_fatal_error("Not implemented yet."); } diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 56bb03ad8d42..78d0d9cec556 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCFragment.h" #include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionMachO.h" @@ -29,6 +30,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include @@ -751,6 +753,24 @@ static MachO::LoadCommandType getLCFromMCVM(MCVersionMinType Type) { llvm_unreachable("Invalid mc version min type"); } +// Encode addrsig data as symbol indexes in variable length encoding. +void MachObjectWriter::writeAddrsigSection(MCAssembler &Asm) { + MCSection *AddrSigSection = + Asm.getContext().getObjectFileInfo()->getAddrSigSection(); + MCSection::FragmentListType &fragmentList = AddrSigSection->getFragmentList(); + if (!fragmentList.size()) + return; + + assert(fragmentList.size() == 1); + MCFragment *pFragment = &*fragmentList.begin(); + MCDataFragment *pDataFragment = dyn_cast_or_null(pFragment); + assert(pDataFragment); + + raw_svector_ostream OS(pDataFragment->getContents()); + for (const MCSymbol *sym : this->getAddrsigSyms()) + encodeULEB128(sym->getIndex(), OS); +} + uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { uint64_t StartOffset = W.OS.tell(); @@ -758,6 +778,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, // Compute symbol table information and bind symbol indices. 
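// Editorial aside, not part of the patch: the payload written by the new
// writeAddrsigSection above is nothing more than the ULEB128-encoded symbol
// table index of each address-significant symbol, which is why it must run
// only after computeSymbolTable has bound the indices (as the next lines
// show). A minimal standalone sketch, helper name ours:
//
//   static void writeAddrsigPayload(llvm::raw_ostream &OS,
//                                   llvm::ArrayRef<uint32_t> SymIndices) {
//     for (uint32_t Index : SymIndices)
//       llvm::encodeULEB128(Index, OS); // indices below 128 take one byte
//   }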
computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData, UndefinedSymbolData); + writeAddrsigSection(Asm); if (!Asm.CGProfile.empty()) { MCSection *CGProfileSection = Asm.getContext().getMachOSection( @@ -894,8 +915,8 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, [&](const MCAssembler::VersionInfoType &VersionInfo) { auto EncodeVersion = [](VersionTuple V) -> uint32_t { assert(!V.empty() && "empty version"); - unsigned Update = V.getSubminor().getValueOr(0); - unsigned Minor = V.getMinor().getValueOr(0); + unsigned Update = V.getSubminor().value_or(0); + unsigned Minor = V.getMinor().value_or(0); assert(Update < 256 && "unencodable update target version"); assert(Minor < 256 && "unencodable minor target version"); assert(V.getMajor() < 65536 && "unencodable major target version"); diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp new file mode 100644 index 000000000000..4a07740e8d14 --- /dev/null +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -0,0 +1,76 @@ +//===- llvm/MC/MCSPIRVObjectWriter.cpp - SPIR-V Object Writer ----*- C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCSPIRVObjectWriter.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; + +class SPIRVObjectWriter : public MCObjectWriter { + ::support::endian::Writer W; + + /// The target specific SPIR-V writer instance. + std::unique_ptr TargetObjectWriter; + +public: + SPIRVObjectWriter(std::unique_ptr MOTW, + raw_pwrite_stream &OS) + : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {} + + ~SPIRVObjectWriter() override {} + +private: + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) override {} + + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override {} + + uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + void writeHeader(const MCAssembler &Asm); +}; + +void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { + constexpr uint32_t MagicNumber = 0x07230203; + + // TODO: set the version on a min-necessary basis (just like the translator + // does) requires some refactoring of MCAssembler::VersionInfoType. + constexpr uint32_t Major = 1; + constexpr uint32_t Minor = 0; + constexpr uint32_t VersionNumber = 0 | (Major << 16) | (Minor << 8); + // TODO: check if we could use anything other than 0 (spec allows). + constexpr uint32_t GeneratorMagicNumber = 0; + // TODO: do not hardcode this as well. 
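// Editorial note: the header emitted here is five 32-bit words in order --
// Magic, Version, Generator, Bound, Schema -- so with Major = 1 and
// Minor = 0 the version word above evaluates to
// 0 | (1 << 16) | (0 << 8) == 0x00010000.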
+ constexpr uint32_t Bound = 900; + constexpr uint32_t Schema = 0; + + W.write(MagicNumber); + W.write(VersionNumber); + W.write(GeneratorMagicNumber); + W.write(Bound); + W.write(Schema); +} + +uint64_t SPIRVObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + uint64_t StartOffset = W.OS.tell(); + writeHeader(Asm); + for (const MCSection &S : Asm) + Asm.writeSectionData(W.OS, &S, Layout); + return W.OS.tell() - StartOffset; +} + +std::unique_ptr +llvm::createSPIRVObjectWriter(std::unique_ptr MOTW, + raw_pwrite_stream &OS) { + return std::make_unique(std::move(MOTW), OS); +} diff --git a/llvm/lib/MC/SubtargetFeature.cpp b/llvm/lib/MC/SubtargetFeature.cpp index 3155adcf2674..d53cc2f7e37b 100644 --- a/llvm/lib/MC/SubtargetFeature.cpp +++ b/llvm/lib/MC/SubtargetFeature.cpp @@ -20,10 +20,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include -#include -#include -#include -#include #include #include diff --git a/llvm/lib/MC/TargetRegistry.cpp b/llvm/lib/MC/TargetRegistry.cpp index 09684b1e5ad2..57444fd23784 100644 --- a/llvm/lib/MC/TargetRegistry.cpp +++ b/llvm/lib/MC/TargetRegistry.cpp @@ -33,7 +33,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &ArchName, [&](const Target &T) { return ArchName == T.getName(); }); if (I == targets().end()) { - Error = "error: invalid target '" + ArchName + "'.\n"; + Error = "invalid target '" + ArchName + "'.\n"; return nullptr; } @@ -49,7 +49,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &ArchName, std::string TempError; TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError); if (!TheTarget) { - Error = ": error: unable to get target for '" + Error = "unable to get target for '" + TheTriple.getTriple() + "', see --version and --triple.\n"; return nullptr; diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 636c1d238932..7cc11d24f286 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/BinaryFormat/WasmTraits.h" #include "llvm/Config/llvm-config.h" @@ -31,7 +30,6 @@ #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/StringSaver.h" #include using namespace llvm; @@ -125,12 +123,11 @@ struct WasmCustomSection { StringRef Name; MCSectionWasm *Section; - uint32_t OutputContentsOffset; - uint32_t OutputIndex; + uint32_t OutputContentsOffset = 0; + uint32_t OutputIndex = InvalidIndex; WasmCustomSection(StringRef Name, MCSectionWasm *Section) - : Name(Name), Section(Section), OutputContentsOffset(0), - OutputIndex(InvalidIndex) {} + : Name(Name), Section(Section) {} }; #if !defined(NDEBUG) @@ -140,36 +137,58 @@ raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) { } #endif -// Write X as an (unsigned) LEB value at offset Offset in Stream, padded +// Write Value as an (unsigned) LEB value at offset Offset in Stream, padded // to allow patching. 
-template <int W>
-void writePatchableLEB(raw_pwrite_stream &Stream, uint64_t X, uint64_t Offset) {
+template <typename T, int W>
+void writePatchableULEB(raw_pwrite_stream &Stream, T Value, uint64_t Offset) {
   uint8_t Buffer[W];
-  unsigned SizeLen = encodeULEB128(X, Buffer, W);
+  unsigned SizeLen = encodeULEB128(Value, Buffer, W);
   assert(SizeLen == W);
   Stream.pwrite((char *)Buffer, SizeLen, Offset);
 }
 
-// Write X as an signed LEB value at offset Offset in Stream, padded
+// Write Value as a signed LEB value at offset Offset in Stream, padded
 // to allow patching.
-template <int W>
-void writePatchableSLEB(raw_pwrite_stream &Stream, int64_t X, uint64_t Offset) {
+template <typename T, int W>
+void writePatchableSLEB(raw_pwrite_stream &Stream, T Value, uint64_t Offset) {
   uint8_t Buffer[W];
-  unsigned SizeLen = encodeSLEB128(X, Buffer, W);
+  unsigned SizeLen = encodeSLEB128(Value, Buffer, W);
   assert(SizeLen == W);
   Stream.pwrite((char *)Buffer, SizeLen, Offset);
 }
 
-// Write X as a plain integer value at offset Offset in Stream.
-static void patchI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+static void writePatchableU32(raw_pwrite_stream &Stream, uint32_t Value,
+                              uint64_t Offset) {
+  writePatchableULEB<uint32_t, 5>(Stream, Value, Offset);
+}
+
+static void writePatchableS32(raw_pwrite_stream &Stream, int32_t Value,
+                              uint64_t Offset) {
+  writePatchableSLEB<int32_t, 5>(Stream, Value, Offset);
+}
+
+static void writePatchableU64(raw_pwrite_stream &Stream, uint64_t Value,
+                              uint64_t Offset) {
+  writePatchableSLEB<uint64_t, 10>(Stream, Value, Offset);
+}
+
+static void writePatchableS64(raw_pwrite_stream &Stream, int64_t Value,
+                              uint64_t Offset) {
+  writePatchableSLEB<int64_t, 10>(Stream, Value, Offset);
+}
+
+// Write Value as a plain integer value at offset Offset in Stream.
+static void patchI32(raw_pwrite_stream &Stream, uint32_t Value,
+                     uint64_t Offset) {
   uint8_t Buffer[4];
-  support::endian::write32le(Buffer, X);
+  support::endian::write32le(Buffer, Value);
   Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
 }
 
-static void patchI64(raw_pwrite_stream &Stream, uint64_t X, uint64_t Offset) {
+static void patchI64(raw_pwrite_stream &Stream, uint64_t Value,
+                     uint64_t Offset) {
   uint8_t Buffer[8];
-  support::endian::write64le(Buffer, X);
+  support::endian::write64le(Buffer, Value);
   Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
 }
 
@@ -423,8 +442,8 @@ void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
 
   // Write the final section size to the payload_len field, which follows
   // the section id byte.
-  writePatchableLEB<5>(static_cast<raw_pwrite_stream &>(W->OS), Size,
-                       Section.SizeOffset);
+  writePatchableU32(static_cast<raw_pwrite_stream &>(W->OS), Size,
+                    Section.SizeOffset);
 }
 
 // Emit the Wasm header.
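These fixed-width helpers exist because a relocation target must keep a stable byte size: the value is LEB128-encoded and padded to exactly 5 bytes (32-bit) or 10 bytes (64-bit) so it can later be patched in place without shifting the bytes that follow. A minimal standalone sketch, assuming only LLVM's LEB128 helpers (function name ours):

#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>

// Encode Value into exactly five bytes -- the padded width used for 32-bit
// patchable ULEB fields -- so a later pwrite() can overwrite it in place.
static void encodePaddedULEB32(uint32_t Value, uint8_t (&Buffer)[5]) {
  unsigned SizeLen = llvm::encodeULEB128(Value, Buffer, /*PadTo=*/5);
  assert(SizeLen == 5 && "padded encoding always fills the buffer");
  (void)SizeLen;
}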
@@ -755,7 +774,7 @@ void WasmObjectWriter::applyRelocations(
                           RelEntry.Offset;
 
     LLVM_DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n");
-    auto Value = getProvisionalValue(RelEntry, Layout);
+    uint64_t Value = getProvisionalValue(RelEntry, Layout);
 
     switch (RelEntry.Type) {
     case wasm::R_WASM_FUNCTION_INDEX_LEB:
@@ -764,10 +783,10 @@ void WasmObjectWriter::applyRelocations(
     case wasm::R_WASM_MEMORY_ADDR_LEB:
     case wasm::R_WASM_TAG_INDEX_LEB:
     case wasm::R_WASM_TABLE_NUMBER_LEB:
-      writePatchableLEB<5>(Stream, Value, Offset);
+      writePatchableU32(Stream, Value, Offset);
       break;
     case wasm::R_WASM_MEMORY_ADDR_LEB64:
-      writePatchableLEB<10>(Stream, Value, Offset);
+      writePatchableU64(Stream, Value, Offset);
       break;
     case wasm::R_WASM_TABLE_INDEX_I32:
     case wasm::R_WASM_MEMORY_ADDR_I32:
@@ -787,14 +806,14 @@ void WasmObjectWriter::applyRelocations(
     case wasm::R_WASM_MEMORY_ADDR_SLEB:
     case wasm::R_WASM_MEMORY_ADDR_REL_SLEB:
     case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB:
-      writePatchableSLEB<5>(Stream, Value, Offset);
+      writePatchableS32(Stream, Value, Offset);
       break;
     case wasm::R_WASM_TABLE_INDEX_SLEB64:
     case wasm::R_WASM_TABLE_INDEX_REL_SLEB64:
     case wasm::R_WASM_MEMORY_ADDR_SLEB64:
     case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64:
    case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64:
-      writePatchableSLEB<10>(Stream, Value, Offset);
+      writePatchableS64(Stream, Value, Offset);
       break;
     default:
      llvm_unreachable("invalid relocation type");
@@ -912,25 +931,29 @@ void WasmObjectWriter::writeGlobalSection(ArrayRef<wasm::WasmGlobal> Globals) {
   for (const wasm::WasmGlobal &Global : Globals) {
     encodeULEB128(Global.Type.Type, W->OS);
     W->OS << char(Global.Type.Mutable);
-    W->OS << char(Global.InitExpr.Opcode);
-    switch (Global.Type.Type) {
-    case wasm::WASM_TYPE_I32:
-      encodeSLEB128(0, W->OS);
-      break;
-    case wasm::WASM_TYPE_I64:
-      encodeSLEB128(0, W->OS);
-      break;
-    case wasm::WASM_TYPE_F32:
-      writeI32(0);
-      break;
-    case wasm::WASM_TYPE_F64:
-      writeI64(0);
-      break;
-    case wasm::WASM_TYPE_EXTERNREF:
-      writeValueType(wasm::ValType::EXTERNREF);
-      break;
-    default:
-      llvm_unreachable("unexpected type");
+    if (Global.InitExpr.Extended) {
+      llvm_unreachable("extended init expressions not supported");
+    } else {
+      W->OS << char(Global.InitExpr.Inst.Opcode);
+      switch (Global.Type.Type) {
+      case wasm::WASM_TYPE_I32:
+        encodeSLEB128(0, W->OS);
+        break;
+      case wasm::WASM_TYPE_I64:
+        encodeSLEB128(0, W->OS);
+        break;
+      case wasm::WASM_TYPE_F32:
+        writeI32(0);
+        break;
+      case wasm::WASM_TYPE_F64:
+        writeI64(0);
+        break;
+      case wasm::WASM_TYPE_EXTERNREF:
+        writeValueType(wasm::ValType::EXTERNREF);
+        break;
+      default:
+        llvm_unreachable("unexpected type");
+      }
     }
     W->OS << char(wasm::WASM_OPCODE_END);
   }
@@ -1547,9 +1570,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
       continue;
 
     const auto &WS = static_cast<const MCSymbolWasm &>(S);
-    LLVM_DEBUG(dbgs()
-               << "MCSymbol: "
-               << toString(WS.getType().getValueOr(wasm::WASM_SYMBOL_TYPE_DATA))
+    LLVM_DEBUG(
+        dbgs() << "MCSymbol: "
               << toString(WS.getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA))
               << " '" << S << "'"
               << " isDefined=" << S.isDefined() << " isExternal="
               << S.isExternal() << " isTemporary=" << S.isTemporary()
@@ -1639,21 +1662,22 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
       wasm::WasmGlobal Global;
       Global.Type = WS.getGlobalType();
       Global.Index = NumGlobalImports + Globals.size();
+      Global.InitExpr.Extended = false;
       switch (Global.Type.Type) {
       case wasm::WASM_TYPE_I32:
-        Global.InitExpr.Opcode = wasm::WASM_OPCODE_I32_CONST;
+        Global.InitExpr.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST;
        break;
      case
wasm::WASM_TYPE_I64: - Global.InitExpr.Opcode = wasm::WASM_OPCODE_I64_CONST; + Global.InitExpr.Inst.Opcode = wasm::WASM_OPCODE_I64_CONST; break; case wasm::WASM_TYPE_F32: - Global.InitExpr.Opcode = wasm::WASM_OPCODE_F32_CONST; + Global.InitExpr.Inst.Opcode = wasm::WASM_OPCODE_F32_CONST; break; case wasm::WASM_TYPE_F64: - Global.InitExpr.Opcode = wasm::WASM_OPCODE_F64_CONST; + Global.InitExpr.Inst.Opcode = wasm::WASM_OPCODE_F64_CONST; break; case wasm::WASM_TYPE_EXTERNREF: - Global.InitExpr.Opcode = wasm::WASM_OPCODE_REF_NULL; + Global.InitExpr.Inst.Opcode = wasm::WASM_OPCODE_REF_NULL; break; default: llvm_unreachable("unexpected type"); @@ -1785,7 +1809,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, wasm::WasmSymbolInfo Info; Info.Name = WS.getName(); - Info.Kind = WS.getType().getValueOr(wasm::WASM_SYMBOL_TYPE_DATA); + Info.Kind = WS.getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA); Info.Flags = Flags; if (!WS.isData()) { assert(WasmIndices.count(&WS) > 0); @@ -1852,7 +1876,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, const MCFragment &AlignFrag = *IT; if (AlignFrag.getKind() != MCFragment::FT_Align) report_fatal_error(".init_array section should be aligned"); - if (cast(AlignFrag).getAlignment() != (is64Bit() ? 8 : 4)) + if (cast(AlignFrag).getAlignment() != + Align(is64Bit() ? 8 : 4)) report_fatal_error(".init_array section should be aligned for pointers"); const MCFragment &Frag = *std::next(IT); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 73c687331d30..33e496b7a864 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -41,7 +41,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include #include @@ -155,9 +154,7 @@ public: bool UseBigObj; bool UseOffsetLabels = false; - bool EmitAddrsigSection = false; MCSectionCOFF *AddrsigSection; - std::vector AddrsigSyms; MCSectionCOFF *CGProfileSection = nullptr; @@ -221,11 +218,6 @@ public: void assignSectionNumbers(); void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout); - void emitAddrsigSection() override { EmitAddrsigSection = true; } - void addAddrsigSymbol(const MCSymbol *Sym) override { - AddrsigSyms.push_back(Sym); - } - uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; }; @@ -452,32 +444,6 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, Sym->MC = &MCSym; } -// Maximum offsets for different string table entry encodings. -enum : unsigned { Max7DecimalOffset = 9999999U }; -enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0 - -// Encode a string table entry offset in base 64, padded to 6 chars, and -// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ... -// Buffer must be at least 8 bytes large. No terminating null appended. 
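// Editorial note: the helper deleted below, together with the offset
// encoding removed from SetSectionName further down, moves behind the new
// COFF::encodeSectionName call. The scheme itself is unchanged: names of up
// to COFF::NameSize (8) bytes are stored inline, longer names become a
// "/<decimal string-table offset>" up to 9,999,999, and beyond that the
// "//AAAAAA"-style base64 form encodes offsets up to 64^6 - 1.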
-static void encodeBase64StringEntry(char *Buffer, uint64_t Value) { - assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset && - "Illegal section name encoding for value"); - - static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - - Buffer[0] = '/'; - Buffer[1] = '/'; - - char *Ptr = Buffer + 7; - for (unsigned i = 0; i < 6; ++i) { - unsigned Rem = Value % 64; - Value /= 64; - *(Ptr--) = Alphabet[Rem]; - } -} - void WinCOFFObjectWriter::SetSectionName(COFFSection &S) { if (S.Name.size() <= COFF::NameSize) { std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size()); @@ -485,19 +451,8 @@ void WinCOFFObjectWriter::SetSectionName(COFFSection &S) { } uint64_t StringTableEntry = Strings.getOffset(S.Name); - if (StringTableEntry <= Max7DecimalOffset) { - SmallVector Buffer; - Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer); - assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2); - std::memcpy(S.Header.Name, Buffer.data(), Buffer.size()); - return; - } - if (StringTableEntry <= MaxBase64Offset) { - // Starting with 10,000,000, offsets are encoded as base64. - encodeBase64StringEntry(S.Header.Name, StringTableEntry); - return; - } - report_fatal_error("COFF string table is greater than 64 GB."); + if (!COFF::encodeSectionName(S.Header.Name, StringTableEntry)) + report_fatal_error("COFF string table is greater than 64 GB."); } void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) { @@ -1003,7 +958,7 @@ void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm, for (const auto &Section : Asm) { COFFSection *Sec = SectionMap[&Section]; - if (Sec->Number == -1) + if (!Sec || Sec->Number == -1) continue; Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section); diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 177253d7a9d7..977e77bf67fd 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -22,8 +22,9 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include @@ -65,6 +66,10 @@ struct Symbol { const MCSymbolXCOFF *const MCSym; uint32_t SymbolTableIndex; + XCOFF::VisibilityType getVisibilityType() const { + return MCSym->getVisibilityType(); + } + XCOFF::StorageClass getStorageClass() const { return MCSym->getStorageClass(); } @@ -77,12 +82,15 @@ struct Symbol { struct XCOFFSection { const MCSectionXCOFF *const MCSec; uint32_t SymbolTableIndex; - uint32_t Address; - uint32_t Size; + uint64_t Address; + uint64_t Size; SmallVector Syms; SmallVector Relocations; StringRef getSymbolTableName() const { return MCSec->getSymbolTableName(); } + XCOFF::VisibilityType getVisibilityType() const { + return MCSec->getVisibilityType(); + } XCOFFSection(const MCSectionXCOFF *MCSec) : MCSec(MCSec), SymbolTableIndex(-1), Address(-1), Size(0) {} }; @@ -100,10 +108,10 @@ struct SectionEntry { char Name[XCOFF::NameSize]; // The physical/virtual address of the section. For an object file // these values are equivalent. 
-  uint32_t Address;
-  uint32_t Size;
-  uint32_t FileOffsetToData;
-  uint32_t FileOffsetToRelocations;
+  uint64_t Address;
+  uint64_t Size;
+  uint64_t FileOffsetToData;
+  uint64_t FileOffsetToRelocations;
   uint32_t RelocationCount;
   int32_t Flags;
@@ -136,7 +144,7 @@ struct SectionEntry {
     Index = UninitializedIndex;
   }
 
-  virtual ~SectionEntry() {}
+  virtual ~SectionEntry() = default;
 };
 
 // Represents the data related to a section excluding the csects that make up
@@ -165,16 +173,21 @@ struct CsectSectionEntry : public SectionEntry {
     Group->clear();
   }
 
-  virtual ~CsectSectionEntry() {}
+  virtual ~CsectSectionEntry() = default;
 };
 
 struct DwarfSectionEntry : public SectionEntry {
   // For DWARF section entry.
   std::unique_ptr<XCOFFSection> DwarfSect;
 
+  // For DWARF section, we must use real size in the section header. MemorySize
+  // is for the size the DWARF section occupies including paddings.
+  uint32_t MemorySize;
+
   DwarfSectionEntry(StringRef N, int32_t Flags,
                     std::unique_ptr<XCOFFSection> Sect)
-      : SectionEntry(N, Flags | XCOFF::STYP_DWARF), DwarfSect(std::move(Sect)) {
+      : SectionEntry(N, Flags | XCOFF::STYP_DWARF), DwarfSect(std::move(Sect)),
+        MemorySize(0) {
     assert(DwarfSect->MCSec->isDwarfSect() &&
           "This should be a DWARF section!");
    assert(N.size() <= XCOFF::NameSize && "section name too long");
@@ -183,20 +196,24 @@ struct DwarfSectionEntry : public SectionEntry {
 
   DwarfSectionEntry(DwarfSectionEntry &&s) = default;
 
-  virtual ~DwarfSectionEntry() {}
+  virtual ~DwarfSectionEntry() = default;
 };
 
 class XCOFFObjectWriter : public MCObjectWriter {
 
   uint32_t SymbolTableEntryCount = 0;
-  uint32_t SymbolTableOffset = 0;
+  uint64_t SymbolTableOffset = 0;
   uint16_t SectionCount = 0;
-  uint32_t RelocationEntryOffset = 0;
+  uint64_t RelocationEntryOffset = 0;
+  std::vector<std::pair<std::string, size_t>> FileNames;
 
   support::endian::Writer W;
   std::unique_ptr<MCXCOFFObjectTargetWriter> TargetObjectWriter;
   StringTableBuilder Strings;
 
+  const uint64_t MaxRawDataSize =
+      TargetObjectWriter->is64Bit() ? UINT64_MAX : UINT32_MAX;
+
   // Maps the MCSection representation to its corresponding XCOFFSection
   // wrapper. Needed for finding the XCOFFSection to insert an MCSymbol into
   // from its containing MCSectionXCOFF.
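A pattern worth noting in this file's 64-bit support: file offsets are now carried in uint64_t and validated against the MaxRawDataSize ceiling above rather than assumed to fit in 32 bits. A minimal sketch of that guard (helper name ours):

#include <cstdint>
#include <limits>

// XCOFF32 caps raw-data offsets at UINT32_MAX; XCOFF64 effectively removes
// the limit, so the check degenerates to always-true there.
static bool rawPointerFits(bool Is64Bit, uint64_t RawPointer) {
  const uint64_t MaxRawDataSize =
      Is64Bit ? std::numeric_limits<uint64_t>::max()
              : std::numeric_limits<uint32_t>::max();
  return RawPointer <= MaxRawDataSize;
}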
@@ -244,26 +261,39 @@ class XCOFFObjectWriter : public MCObjectWriter { uint64_t writeObject(MCAssembler &, const MCAsmLayout &) override; - static bool nameShouldBeInStringTable(const StringRef &); + bool is64Bit() const { return TargetObjectWriter->is64Bit(); } + bool nameShouldBeInStringTable(const StringRef &); void writeSymbolName(const StringRef &); - void writeSymbolTableEntryForCsectMemberLabel(const Symbol &, - const XCOFFSection &, int16_t, - uint64_t); - void writeSymbolTableEntryForControlSection(const XCOFFSection &, int16_t, - XCOFF::StorageClass); - void writeSymbolTableEntryForDwarfSection(const XCOFFSection &, int16_t); + + void writeSymbolEntryForCsectMemberLabel(const Symbol &SymbolRef, + const XCOFFSection &CSectionRef, + int16_t SectionIndex, + uint64_t SymbolOffset); + void writeSymbolEntryForControlSection(const XCOFFSection &CSectionRef, + int16_t SectionIndex, + XCOFF::StorageClass StorageClass); + void writeSymbolEntryForDwarfSection(const XCOFFSection &DwarfSectionRef, + int16_t SectionIndex); void writeFileHeader(); void writeSectionHeaderTable(); void writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout); void writeSectionForControlSectionEntry(const MCAssembler &Asm, const MCAsmLayout &Layout, const CsectSectionEntry &CsectEntry, - uint32_t &CurrentAddressLocation); + uint64_t &CurrentAddressLocation); void writeSectionForDwarfSectionEntry(const MCAssembler &Asm, const MCAsmLayout &Layout, const DwarfSectionEntry &DwarfEntry, - uint32_t &CurrentAddressLocation); + uint64_t &CurrentAddressLocation); void writeSymbolTable(const MCAsmLayout &Layout); + void writeSymbolAuxDwarfEntry(uint64_t LengthOfSectionPortion, + uint64_t NumberOfRelocEnt = 0); + void writeSymbolAuxCsectEntry(uint64_t SectionOrLength, + uint8_t SymbolAlignmentAndType, + uint8_t StorageMappingClass); + void writeSymbolEntry(StringRef SymbolName, uint64_t Value, + int16_t SectionNumber, uint16_t SymbolType, + uint8_t StorageClass, uint8_t NumberOfAuxEntries = 1); void writeRelocations(); void writeRelocation(XCOFFRelocation Reloc, const XCOFFSection &Section); @@ -278,10 +308,8 @@ class XCOFFObjectWriter : public MCObjectWriter { void assignAddressesAndIndices(const MCAsmLayout &); void finalizeSectionInfo(); - bool - needsAuxiliaryHeader() const { /* TODO aux header support not implemented. */ - return false; - } + // TODO aux header support not implemented. + bool needsAuxiliaryHeader() const { return false; } // Returns the size of the auxiliary header to be written to the object file. size_t auxiliaryHeaderSize() const { @@ -293,6 +321,10 @@ class XCOFFObjectWriter : public MCObjectWriter { public: XCOFFObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS); + + void writeWord(uint64_t Word) { + is64Bit() ? 
W.write(Word) : W.write(Word); + } }; XCOFFObjectWriter::XCOFFObjectWriter( @@ -396,9 +428,6 @@ static MCSectionXCOFF *getContainingCsect(const MCSymbolXCOFF *XSym) { void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout) { - if (TargetObjectWriter->is64Bit()) - report_fatal_error("64-bit XCOFF object files are not supported yet."); - for (const auto &S : Asm) { const auto *MCSec = cast(&S); assert(SectionMap.find(MCSec) == SectionMap.end() && @@ -424,7 +453,7 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, SectionMap[MCSec] = DwarfSec.get(); DwarfSectionEntry SecEntry(MCSec->getName(), - MCSec->getDwarfSubtypeFlags().getValue(), + *MCSec->getDwarfSubtypeFlags(), std::move(DwarfSec)); DwarfSections.push_back(std::move(SecEntry)); } else @@ -470,6 +499,15 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, Strings.add(XSym->getSymbolTableName()); } + FileNames = Asm.getFileNames(); + // Emit ".file" as the source file name when there is no file name. + if (FileNames.empty()) + FileNames.emplace_back(".file", 0); + for (const std::pair &F : FileNames) { + if (nameShouldBeInStringTable(F.first)) + Strings.add(F.first); + } + Strings.finalize(); assignAddressesAndIndices(Layout); } @@ -547,10 +585,9 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm, FixedValue = TOCEntryOffset; } - assert( - (TargetObjectWriter->is64Bit() || - Fixup.getOffset() <= UINT32_MAX - Layout.getFragmentOffset(Fragment)) && - "Fragment offset + fixup offset is overflowed in 32-bit mode."); + assert((Fixup.getOffset() <= + MaxRawDataSize - Layout.getFragmentOffset(Fragment)) && + "Fragment offset + fixup offset is overflowed."); uint32_t FixupOffsetInCsect = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); @@ -590,7 +627,7 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm, void XCOFFObjectWriter::writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout) { - uint32_t CurrentAddressLocation = 0; + uint64_t CurrentAddressLocation = 0; for (const auto *Section : Sections) writeSectionForControlSectionEntry(Asm, Layout, *Section, CurrentAddressLocation); @@ -607,9 +644,6 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, if (Asm.isIncrementalLinkerCompatible()) report_fatal_error("Incremental linking not supported for XCOFF."); - if (TargetObjectWriter->is64Bit()) - report_fatal_error("64-bit XCOFF object files are not supported yet."); - finalizeSectionInfo(); uint64_t StartOffset = W.OS.tell(); @@ -617,7 +651,6 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, writeSectionHeaderTable(); writeSections(Asm, Layout); writeRelocations(); - writeSymbolTable(Layout); // Write the string table. Strings.write(W.OS); @@ -626,142 +659,130 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, } bool XCOFFObjectWriter::nameShouldBeInStringTable(const StringRef &SymbolName) { - return SymbolName.size() > XCOFF::NameSize; + return SymbolName.size() > XCOFF::NameSize || is64Bit(); } void XCOFFObjectWriter::writeSymbolName(const StringRef &SymbolName) { + // Magic, Offset or SymbolName. 
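// Editorial note: in XCOFF32 a symbol name of eight bytes or fewer is
// stored inline in the entry, while a longer name stores four zero bytes
// followed by a 32-bit string table offset. XCOFF64 drops the inline form
// entirely and always stores the offset, which is why
// nameShouldBeInStringTable above now also returns true when is64Bit().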
if (nameShouldBeInStringTable(SymbolName)) { W.write(0); W.write(Strings.getOffset(SymbolName)); } else { - char Name[XCOFF::NameSize+1]; + char Name[XCOFF::NameSize + 1]; std::strncpy(Name, SymbolName.data(), XCOFF::NameSize); ArrayRef NameRef(Name, XCOFF::NameSize); W.write(NameRef); } } -void XCOFFObjectWriter::writeSymbolTableEntryForCsectMemberLabel( - const Symbol &SymbolRef, const XCOFFSection &CSectionRef, - int16_t SectionIndex, uint64_t SymbolOffset) { - // Name or Zeros and string table offset - writeSymbolName(SymbolRef.getSymbolTableName()); - assert(SymbolOffset <= UINT32_MAX - CSectionRef.Address && - "Symbol address overflows."); - W.write(CSectionRef.Address + SymbolOffset); - W.write(SectionIndex); +void XCOFFObjectWriter::writeSymbolEntry(StringRef SymbolName, uint64_t Value, + int16_t SectionNumber, + uint16_t SymbolType, + uint8_t StorageClass, + uint8_t NumberOfAuxEntries) { + if (is64Bit()) { + W.write(Value); + W.write(Strings.getOffset(SymbolName)); + } else { + writeSymbolName(SymbolName); + W.write(Value); + } + W.write(SectionNumber); // Basic/Derived type. See the description of the n_type field for symbol // table entries for a detailed description. Since we don't yet support // visibility, and all other bits are either optionally set or reserved, this // is always zero. - // TODO FIXME How to assert a symbol's visibilty is default? + if (SymbolType != 0) + report_fatal_error("Emitting non-zero visibilities is not supported yet."); // TODO Set the function indicator (bit 10, 0x0020) for functions // when debugging is enabled. - W.write(0); - W.write(SymbolRef.getStorageClass()); - // Always 1 aux entry for now. - W.write(1); - - // Now output the auxiliary entry. - W.write(CSectionRef.SymbolTableIndex); - // Parameter typecheck hash. Not supported. - W.write(0); - // Typecheck section number. Not supported. - W.write(0); - // Symbol type: Label - W.write(XCOFF::XTY_LD); - // Storage mapping class. - W.write(CSectionRef.MCSec->getMappingClass()); - // Reserved (x_stab). - W.write(0); - // Reserved (x_snstab). - W.write(0); + W.write(SymbolType); + W.write(StorageClass); + W.write(NumberOfAuxEntries); } -void XCOFFObjectWriter::writeSymbolTableEntryForDwarfSection( +void XCOFFObjectWriter::writeSymbolAuxCsectEntry(uint64_t SectionOrLength, + uint8_t SymbolAlignmentAndType, + uint8_t StorageMappingClass) { + W.write(is64Bit() ? 
Lo_32(SectionOrLength) : SectionOrLength); + W.write(0); // ParameterHashIndex + W.write(0); // TypeChkSectNum + W.write(SymbolAlignmentAndType); + W.write(StorageMappingClass); + if (is64Bit()) { + W.write(Hi_32(SectionOrLength)); + W.OS.write_zeros(1); // Reserved + W.write(XCOFF::AUX_CSECT); + } else { + W.write(0); // StabInfoIndex + W.write(0); // StabSectNum + } +} + +void XCOFFObjectWriter::writeSymbolAuxDwarfEntry( + uint64_t LengthOfSectionPortion, uint64_t NumberOfRelocEnt) { + writeWord(LengthOfSectionPortion); + if (!is64Bit()) + W.OS.write_zeros(4); // Reserved + writeWord(NumberOfRelocEnt); + if (is64Bit()) { + W.OS.write_zeros(1); // Reserved + W.write(XCOFF::AUX_SECT); + } else { + W.OS.write_zeros(6); // Reserved + } +} + +void XCOFFObjectWriter::writeSymbolEntryForCsectMemberLabel( + const Symbol &SymbolRef, const XCOFFSection &CSectionRef, + int16_t SectionIndex, uint64_t SymbolOffset) { + assert(SymbolOffset <= MaxRawDataSize - CSectionRef.Address && + "Symbol address overflowed."); + + writeSymbolEntry(SymbolRef.getSymbolTableName(), + CSectionRef.Address + SymbolOffset, SectionIndex, + SymbolRef.getVisibilityType(), SymbolRef.getStorageClass()); + + writeSymbolAuxCsectEntry(CSectionRef.SymbolTableIndex, XCOFF::XTY_LD, + CSectionRef.MCSec->getMappingClass()); +} + +void XCOFFObjectWriter::writeSymbolEntryForDwarfSection( const XCOFFSection &DwarfSectionRef, int16_t SectionIndex) { assert(DwarfSectionRef.MCSec->isDwarfSect() && "Not a DWARF section!"); - // n_name, n_zeros, n_offset - writeSymbolName(DwarfSectionRef.getSymbolTableName()); - // n_value - W.write(0); - // n_scnum - W.write(SectionIndex); - // n_type - W.write(0); - // n_sclass - W.write(XCOFF::C_DWARF); - // Always 1 aux entry for now. - W.write(1); - - // Now output the auxiliary entry. - // x_scnlen - W.write(DwarfSectionRef.Size); - // Reserved - W.write(0); - // x_nreloc. Set to 0 for now. - W.write(0); - // Reserved - W.write(0); - // Reserved - W.write(0); + writeSymbolEntry(DwarfSectionRef.getSymbolTableName(), /*Value=*/0, + SectionIndex, /*SymbolType=*/0, XCOFF::C_DWARF); + + writeSymbolAuxDwarfEntry(DwarfSectionRef.Size); } -void XCOFFObjectWriter::writeSymbolTableEntryForControlSection( +void XCOFFObjectWriter::writeSymbolEntryForControlSection( const XCOFFSection &CSectionRef, int16_t SectionIndex, XCOFF::StorageClass StorageClass) { - // n_name, n_zeros, n_offset - writeSymbolName(CSectionRef.getSymbolTableName()); - // n_value - W.write(CSectionRef.Address); - // n_scnum - W.write(SectionIndex); - // Basic/Derived type. See the description of the n_type field for symbol - // table entries for a detailed description. Since we don't yet support - // visibility, and all other bits are either optionally set or reserved, this - // is always zero. - // TODO FIXME How to assert a symbol's visibilty is default? - // TODO Set the function indicator (bit 10, 0x0020) for functions - // when debugging is enabled. - W.write(0); - // n_sclass - W.write(StorageClass); - // Always 1 aux entry for now. - W.write(1); - - // Now output the auxiliary entry. - W.write(CSectionRef.Size); - // Parameter typecheck hash. Not supported. - W.write(0); - // Typecheck section number. Not supported. - W.write(0); - // Symbol type. - W.write(getEncodedType(CSectionRef.MCSec)); - // Storage mapping class. - W.write(CSectionRef.MCSec->getMappingClass()); - // Reserved (x_stab). - W.write(0); - // Reserved (x_snstab). 
- W.write(0); + writeSymbolEntry(CSectionRef.getSymbolTableName(), CSectionRef.Address, + SectionIndex, CSectionRef.getVisibilityType(), StorageClass); + + writeSymbolAuxCsectEntry(CSectionRef.Size, getEncodedType(CSectionRef.MCSec), + CSectionRef.MCSec->getMappingClass()); } void XCOFFObjectWriter::writeFileHeader() { - // Magic. - W.write(0x01df); - // Number of sections. + W.write(is64Bit() ? XCOFF::XCOFF64 : XCOFF::XCOFF32); W.write(SectionCount); - // Timestamp field. For reproducible output we write a 0, which represents no - // timestamp. - W.write(0); - // Byte Offset to the start of the symbol table. - W.write(SymbolTableOffset); - // Number of entries in the symbol table. - W.write(SymbolTableEntryCount); - // Size of the optional header. - W.write(0); - // Flags. - W.write(0); + W.write(0); // TimeStamp + writeWord(SymbolTableOffset); + if (is64Bit()) { + W.write(0); // AuxHeaderSize. No optional header for an object + // file that is not to be loaded. + W.write(0); // Flags + W.write(SymbolTableEntryCount); + } else { + W.write(SymbolTableEntryCount); + W.write(0); // AuxHeaderSize. No optional header for an object + // file that is not to be loaded. + W.write(0); // Flags + } } void XCOFFObjectWriter::writeSectionHeaderTable() { @@ -777,28 +798,25 @@ void XCOFFObjectWriter::writeSectionHeaderTable() { // Write the Physical Address and Virtual Address. In an object file these // are the same. // We use 0 for DWARF sections' Physical and Virtual Addresses. - if (!IsDwarf) { - W.write(Sec->Address); - W.write(Sec->Address); + writeWord(IsDwarf ? 0 : Sec->Address); + writeWord(IsDwarf ? 0 : Sec->Address); + + writeWord(Sec->Size); + writeWord(Sec->FileOffsetToData); + writeWord(Sec->FileOffsetToRelocations); + writeWord(0); // FileOffsetToLineNumberInfo. Not supported yet. + + if (is64Bit()) { + W.write(Sec->RelocationCount); + W.write(0); // NumberOfLineNumbers. Not supported yet. + W.write(Sec->Flags); + W.OS.write_zeros(4); } else { - W.write(0); - W.write(0); + W.write(Sec->RelocationCount); + W.write(0); // NumberOfLineNumbers. Not supported yet. + W.write(Sec->Flags); } - W.write(Sec->Size); - W.write(Sec->FileOffsetToData); - W.write(Sec->FileOffsetToRelocations); - - // Line number pointer. Not supported yet. - W.write(0); - - W.write(Sec->RelocationCount); - - // Line number counts. Not supported yet. - W.write(0); - - W.write(Sec->Flags); - return true; }; @@ -811,11 +829,11 @@ void XCOFFObjectWriter::writeSectionHeaderTable() { void XCOFFObjectWriter::writeRelocation(XCOFFRelocation Reloc, const XCOFFSection &Section) { if (Section.MCSec->isCsect()) - W.write(Section.Address + Reloc.FixupOffsetInCsect); + writeWord(Section.Address + Reloc.FixupOffsetInCsect); else { // DWARF sections' address is set to 0. assert(Section.MCSec->isDwarfSect() && "unsupport section type!"); - W.write(Reloc.FixupOffsetInCsect); + writeWord(Reloc.FixupOffsetInCsect); } W.write(Reloc.SymbolTableIndex); W.write(Reloc.SignAndSize); @@ -845,34 +863,18 @@ void XCOFFObjectWriter::writeRelocations() { } void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) { - // Write symbol 0 as C_FILE. - // FIXME: support 64-bit C_FILE symbol. - // - // n_name. The n_name of a C_FILE symbol is the source filename when no - // auxiliary entries are present. The source filename is alternatively - // provided by an auxiliary entry, in which case the n_name of the C_FILE - // symbol is `.file`. - // FIXME: add the real source filename. - writeSymbolName(".file"); - // n_value. 
The n_value of a C_FILE symbol is its symbol table index. - W.write(0); - // n_scnum. N_DEBUG is a reserved section number for indicating a special - // symbolic debugging symbol. - W.write(XCOFF::ReservedSectionNum::N_DEBUG); - // n_type. The n_type field of a C_FILE symbol encodes the source language and - // CPU version info; zero indicates no info. - W.write(0); - // n_sclass. The C_FILE symbol provides source file-name information, - // source-language ID and CPU-version ID information and some other optional - // infos. - W.write(XCOFF::C_FILE); - // n_numaux. No aux entry for now. - W.write(0); + // Write C_FILE symbols. + // The n_name of a C_FILE symbol is the source file's name when no auxiliary + // entries are present. + for (const std::pair &F : FileNames) { + writeSymbolEntry(F.first, /*Value=*/0, XCOFF::ReservedSectionNum::N_DEBUG, + /*SymbolType=*/0, XCOFF::C_FILE, + /*NumberOfAuxEntries=*/0); + } for (const auto &Csect : UndefinedCsects) { - writeSymbolTableEntryForControlSection(Csect, - XCOFF::ReservedSectionNum::N_UNDEF, - Csect.MCSec->getStorageClass()); + writeSymbolEntryForControlSection(Csect, XCOFF::ReservedSectionNum::N_UNDEF, + Csect.MCSec->getStorageClass()); } for (const auto *Section : Sections) { @@ -887,19 +889,19 @@ void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) { const int16_t SectionIndex = Section->Index; for (const auto &Csect : *Group) { // Write out the control section first and then each symbol in it. - writeSymbolTableEntryForControlSection(Csect, SectionIndex, - Csect.MCSec->getStorageClass()); + writeSymbolEntryForControlSection(Csect, SectionIndex, + Csect.MCSec->getStorageClass()); for (const auto &Sym : Csect.Syms) - writeSymbolTableEntryForCsectMemberLabel( + writeSymbolEntryForCsectMemberLabel( Sym, Csect, SectionIndex, Layout.getSymbolOffset(*(Sym.MCSym))); } } } for (const auto &DwarfSection : DwarfSections) - writeSymbolTableEntryForDwarfSection(*DwarfSection.DwarfSect, - DwarfSection.Index); + writeSymbolEntryForDwarfSection(*DwarfSection.DwarfSect, + DwarfSection.Index); } void XCOFFObjectWriter::finalizeSectionInfo() { @@ -914,8 +916,10 @@ void XCOFFObjectWriter::finalizeSectionInfo() { for (auto &Csect : *Group) { const size_t CsectRelocCount = Csect.Relocations.size(); - if (CsectRelocCount >= XCOFF::RelocOverflow || - Section->RelocationCount >= XCOFF::RelocOverflow - CsectRelocCount) + // An XCOFF64 file may not contain an overflow section header. + if (!is64Bit() && (CsectRelocCount >= XCOFF::RelocOverflow || + Section->RelocationCount >= + XCOFF::RelocOverflow - CsectRelocCount)) report_fatal_error( "relocation entries overflowed; overflow section is " "not implemented yet"); @@ -938,10 +942,12 @@ void XCOFFObjectWriter::finalizeSectionInfo() { return false; Sec->FileOffsetToRelocations = RawPointer; - const uint32_t RelocationSizeInSec = - Sec->RelocationCount * XCOFF::RelocationSerializationSize32; + const uint64_t RelocationSizeInSec = + Sec->RelocationCount * (is64Bit() + ? XCOFF::RelocationSerializationSize64 + : XCOFF::RelocationSerializationSize32); RawPointer += RelocationSizeInSec; - if (RawPointer > UINT32_MAX) + if (RawPointer > MaxRawDataSize) report_fatal_error("Relocation data overflowed this object file."); return true; @@ -960,8 +966,8 @@ void XCOFFObjectWriter::finalizeSectionInfo() { } void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { - // The first symbol table entry (at index 0) is for the file name. 
- uint32_t SymbolTableIndex = 1; + // The symbol table starts with all the C_FILE symbols. + uint32_t SymbolTableIndex = FileNames.size(); // Calculate indices for undefined symbols. for (auto &Csect : UndefinedCsects) { @@ -976,10 +982,11 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { // The address corresponds to the address of sections and symbols in the // object file. We place the shared address 0 immediately after the // section header table. - uint32_t Address = 0; + uint64_t Address = 0; // Section indices are 1-based in XCOFF. int32_t SectionIndex = 1; bool HasTDataSection = false; + uint32_t PaddingsBeforeDwarf = 0; for (auto *Section : Sections) { const bool IsEmpty = @@ -1039,6 +1046,19 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { Section->Size = Address - Section->Address; } + // Start generating DWARF sections. Sections other than DWARF sections use + // DefaultSectionAlign as the default alignment, while DWARF sections have + // their own alignments. If these two alignments are not the same, we need + // some padding here and record the padding bytes for the FileOffsetToData + // calculation. + if (!DwarfSections.empty()) + PaddingsBeforeDwarf = + alignTo(Address, + (*DwarfSections.begin()).DwarfSect->MCSec->getAlignment()) - + Address; + + DwarfSectionEntry *LastDwarfSection = nullptr; + for (auto &DwarfSection : DwarfSections) { assert((SectionIndex <= MaxSectionIndex) && "Section index overflow!"); @@ -1066,40 +1086,52 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { // For DWARF sections, we must use the real size, which may not be aligned. DwarfSection.Size = DwarfSect.Size = Layout.getSectionAddressSize(MCSec); - // Make the Address align to default alignment for follow section. - Address = alignTo(DwarfSect.Address + DwarfSect.Size, DefaultSectionAlign); + Address = DwarfSection.Address + DwarfSection.Size; + + if (LastDwarfSection) + LastDwarfSection->MemorySize = + DwarfSection.Address - LastDwarfSection->Address; + LastDwarfSection = &DwarfSection; + } + if (LastDwarfSection) { + // Make the final DWARF section address align to the default section + // alignment for the contents that follow. + Address = alignTo(LastDwarfSection->Address + LastDwarfSection->Size, + DefaultSectionAlign); + LastDwarfSection->MemorySize = Address - LastDwarfSection->Address; } SymbolTableEntryCount = SymbolTableIndex; // Calculate the RawPointer value for each section. - uint64_t RawPointer = XCOFF::FileHeaderSize32 + auxiliaryHeaderSize() + - SectionCount * XCOFF::SectionHeaderSize32; + uint64_t RawPointer = + (is64Bit() ? (XCOFF::FileHeaderSize64 + + SectionCount * XCOFF::SectionHeaderSize64) + : (XCOFF::FileHeaderSize32 + + SectionCount * XCOFF::SectionHeaderSize32)) + + auxiliaryHeaderSize(); + for (auto *Sec : Sections) { if (Sec->Index == SectionEntry::UninitializedIndex || Sec->IsVirtual) continue; Sec->FileOffsetToData = RawPointer; RawPointer += Sec->Size; - if (RawPointer > UINT32_MAX) + if (RawPointer > MaxRawDataSize) report_fatal_error("Section raw data overflowed this object file."); } - for (auto &DwarfSection : DwarfSections) { - // Address of csect sections are always aligned to DefaultSectionAlign, but - // address of DWARF section are aligned to Section alignment which may be - // bigger than DefaultSectionAlign, need to execlude the padding bits.
- RawPointer = - alignTo(RawPointer, DwarfSection.DwarfSect->MCSec->getAlignment()); + // Increase the raw pointer for the padding bytes between csect sections and + // DWARF sections. + if (!DwarfSections.empty()) + RawPointer += PaddingsBeforeDwarf; + for (auto &DwarfSection : DwarfSections) { DwarfSection.FileOffsetToData = RawPointer; - // Some section entries, like DWARF section size is not aligned, so - // RawPointer may be not aligned. - RawPointer += DwarfSection.Size; - // Make sure RawPointer is aligned. - RawPointer = alignTo(RawPointer, DefaultSectionAlign); - assert(RawPointer <= UINT32_MAX && + RawPointer += DwarfSection.MemorySize; + + assert(RawPointer <= MaxRawDataSize && "Section raw data overflowed this object file."); } @@ -1108,7 +1140,7 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { void XCOFFObjectWriter::writeSectionForControlSectionEntry( const MCAssembler &Asm, const MCAsmLayout &Layout, - const CsectSectionEntry &CsectEntry, uint32_t &CurrentAddressLocation) { + const CsectSectionEntry &CsectEntry, uint64_t &CurrentAddressLocation) { // Nothing to write for this Section. if (CsectEntry.Index == SectionEntry::UninitializedIndex) return; @@ -1146,7 +1178,7 @@ void XCOFFObjectWriter::writeSectionForControlSectionEntry( // The size of the tail padding in a section is the end virtual address of // the current section minus the end virtual address of the last csect // in that section. - if (uint32_t PaddingSize = + if (uint64_t PaddingSize = CsectEntry.Address + CsectEntry.Size - CurrentAddressLocation) { W.OS.write_zeros(PaddingSize); CurrentAddressLocation += PaddingSize; @@ -1155,7 +1187,7 @@ void XCOFFObjectWriter::writeSectionForDwarfSectionEntry( const MCAssembler &Asm, const MCAsmLayout &Layout, - const DwarfSectionEntry &DwarfEntry, uint32_t &CurrentAddressLocation) { + const DwarfSectionEntry &DwarfEntry, uint64_t &CurrentAddressLocation) { // There could be a gap (without corresponding zero padding) between // sections. For example, DWARF section alignment is bigger than // DefaultSectionAlign.
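// Illustrative sketch, not from the vendored patch: the padding logic above
// hinges on llvm::alignTo (llvm/Support/MathExtras.h), which rounds a value up
// to the next multiple of an alignment. A self-contained model of the
// PaddingsBeforeDwarf computation, with hypothetical numbers:
#include <cassert>
#include <cstdint>

// Same contract as llvm::alignTo for power-of-two and other alignments.
static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Hypothetical end address of the last csect section, and the alignment of
  // the first DWARF section.
  const uint64_t Address = 0x1234;
  const uint64_t DwarfAlign = 8;
  // Padding inserted before the first DWARF section; the same byte count is
  // later added to RawPointer so that FileOffsetToData matches the layout.
  const uint64_t PaddingsBeforeDwarf =
      alignToSketch(Address, DwarfAlign) - Address;
  assert(PaddingsBeforeDwarf == 4); // 0x1234 rounds up to 0x1238
  return 0;
}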
@@ -1163,7 +1195,7 @@ void XCOFFObjectWriter::writeSectionForDwarfSectionEntry( "CurrentAddressLocation should be less than or equal to section " "address."); - if (uint32_t PaddingSize = DwarfEntry.Address - CurrentAddressLocation) + if (uint64_t PaddingSize = DwarfEntry.Address - CurrentAddressLocation) W.OS.write_zeros(PaddingSize); if (DwarfEntry.Size) diff --git a/llvm/lib/MCA/CustomBehaviour.cpp b/llvm/lib/MCA/CustomBehaviour.cpp index a9ea8edff059..a10a2f5c56f0 100644 --- a/llvm/lib/MCA/CustomBehaviour.cpp +++ b/llvm/lib/MCA/CustomBehaviour.cpp @@ -16,7 +16,7 @@ namespace llvm { namespace mca { -CustomBehaviour::~CustomBehaviour() {} +CustomBehaviour::~CustomBehaviour() = default; unsigned CustomBehaviour::checkCustomHazard(ArrayRef IssuedInst, const InstRef &IR) { diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp index 121d320f10e6..bdc8b3d0e390 100644 --- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp @@ -39,7 +39,7 @@ LSUnitBase::LSUnitBase(const MCSchedModel &SM, unsigned LQ, unsigned SQ, } } -LSUnitBase::~LSUnitBase() {} +LSUnitBase::~LSUnitBase() = default; void LSUnitBase::cycleEvent() { for (const std::pair> &G : Groups) @@ -67,17 +67,17 @@ void LSUnitBase::dump() const { #endif unsigned LSUnit::dispatch(const InstRef &IR) { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - bool IsStoreBarrier = IR.getInstruction()->isAStoreBarrier(); - bool IsLoadBarrier = IR.getInstruction()->isALoadBarrier(); - assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!"); + const Instruction &IS = *IR.getInstruction(); + bool IsStoreBarrier = IS.isAStoreBarrier(); + bool IsLoadBarrier = IS.isALoadBarrier(); + assert((IS.getMayLoad() || IS.getMayStore()) && "Not a memory operation!"); - if (Desc.MayLoad) + if (IS.getMayLoad()) acquireLQSlot(); - if (Desc.MayStore) + if (IS.getMayStore()) acquireSQSlot(); - if (Desc.MayStore) { + if (IS.getMayStore()) { unsigned NewGID = createMemoryGroup(); MemoryGroup &NewGroup = getGroup(NewGID); NewGroup.addInstruction(); @@ -115,7 +115,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { if (IsStoreBarrier) CurrentStoreBarrierGroupID = NewGID; - if (Desc.MayLoad) { + if (IS.getMayLoad()) { CurrentLoadGroupID = NewGID; if (IsLoadBarrier) CurrentLoadBarrierGroupID = NewGID; @@ -124,7 +124,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { return NewGID; } - assert(Desc.MayLoad && "Expected a load!"); + assert(IS.getMayLoad() && "Expected a load!"); unsigned ImmediateLoadDominator = std::max(CurrentLoadGroupID, CurrentLoadBarrierGroupID); @@ -194,10 +194,10 @@ unsigned LSUnit::dispatch(const InstRef &IR) { } LSUnit::Status LSUnit::isAvailable(const InstRef &IR) const { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - if (Desc.MayLoad && isLQFull()) + const Instruction &IS = *IR.getInstruction(); + if (IS.getMayLoad() && isLQFull()) return LSUnit::LSU_LQUEUE_FULL; - if (Desc.MayStore && isSQFull()) + if (IS.getMayStore() && isSQFull()) return LSUnit::LSU_SQUEUE_FULL; return LSUnit::LSU_AVAILABLE; } @@ -212,9 +212,9 @@ void LSUnitBase::onInstructionExecuted(const InstRef &IR) { } void LSUnitBase::onInstructionRetired(const InstRef &IR) { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - bool IsALoad = Desc.MayLoad; - bool IsAStore = Desc.MayStore; + const Instruction &IS = *IR.getInstruction(); + bool IsALoad = IS.getMayLoad(); + bool IsAStore = IS.getMayStore(); assert((IsALoad || IsAStore) && "Expected a memory operation!"); if (IsALoad) { 
diff --git a/llvm/lib/MCA/IncrementalSourceMgr.cpp b/llvm/lib/MCA/IncrementalSourceMgr.cpp new file mode 100644 index 000000000000..10b86b501a2e --- /dev/null +++ b/llvm/lib/MCA/IncrementalSourceMgr.cpp @@ -0,0 +1,51 @@ +//===-------------------- IncrementalSourceMgr.cpp ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines some implementations for IncrementalSourceMgr. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/MCA/IncrementalSourceMgr.h" +#ifndef NDEBUG +#include "llvm/Support/Format.h" +#endif + +using namespace llvm; +using namespace llvm::mca; + +void IncrementalSourceMgr::clear() { + Staging.clear(); + InstStorage.clear(); + TotalCounter = 0U; + EOS = false; +} + +void IncrementalSourceMgr::updateNext() { + ++TotalCounter; + Instruction *I = Staging.front(); + Staging.pop_front(); + I->reset(); + + if (InstFreedCB) + InstFreedCB(I); +} + +#ifndef NDEBUG +void IncrementalSourceMgr::printStatistic(raw_ostream &OS) { + unsigned MaxInstStorageSize = InstStorage.size(); + if (MaxInstStorageSize <= TotalCounter) { + auto Ratio = double(MaxInstStorageSize) / double(TotalCounter); + OS << "Cache ratio = " << MaxInstStorageSize << " / " << TotalCounter + << llvm::format(" (%.2f%%)", (1.0 - Ratio) * 100.0) << "\n"; + } else { + OS << "Error: Number of created instructions " + << "is larger than the number of issued instructions\n"; + } +} +#endif diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp index d8283f8d2682..45acea253587 100644 --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -14,16 +14,19 @@ #include "llvm/MCA/InstrBuilder.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llvm-mca" +#define DEBUG_TYPE "llvm-mca-instrbuilder" namespace llvm { namespace mca { +char RecycledInstErr::ID = 0; + InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii, const llvm::MCRegisterInfo &mri, @@ -572,6 +575,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { LLVM_DEBUG(dbgs() << "\n\t\tOpcode Name= " << MCII.getName(Opcode) << '\n'); LLVM_DEBUG(dbgs() << "\t\tSchedClassID=" << SchedClassID << '\n'); + LLVM_DEBUG(dbgs() << "\t\tOpcode=" << Opcode << '\n'); // Create a new empty descriptor. std::unique_ptr<InstrDesc> ID = std::make_unique<InstrDesc>(); @@ -593,13 +597,6 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { FirstReturnInst = false; } - ID->MayLoad = MCDesc.mayLoad(); - ID->MayStore = MCDesc.mayStore(); - ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects(); - ID->BeginGroup = SCDesc.BeginGroup; - ID->EndGroup = SCDesc.EndGroup; - ID->RetireOOO = SCDesc.RetireOOO; - initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks); computeMaxLatency(*ID, MCDesc, SCDesc, STI); @@ -618,7 +615,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Now add the new descriptor.
bool IsVariadic = MCDesc.isVariadic(); - if (!IsVariadic && !IsVariant) { + if ((ID->IsRecyclable = !IsVariadic && !IsVariant)) { Descriptors[MCI.getOpcode()] = std::move(ID); return *Descriptors[MCI.getOpcode()]; } @@ -638,14 +635,43 @@ InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) { return createInstrDescImpl(MCI); } +STATISTIC(NumVariantInst, "Number of MCInsts that doesn't have static Desc"); + Expected> InstrBuilder::createInstruction(const MCInst &MCI) { Expected DescOrErr = getOrCreateInstrDesc(MCI); if (!DescOrErr) return DescOrErr.takeError(); const InstrDesc &D = *DescOrErr; - std::unique_ptr NewIS = - std::make_unique(D, MCI.getOpcode()); + Instruction *NewIS = nullptr; + std::unique_ptr CreatedIS; + bool IsInstRecycled = false; + + if (!D.IsRecyclable) + ++NumVariantInst; + + if (D.IsRecyclable && InstRecycleCB) { + if (auto *I = InstRecycleCB(D)) { + NewIS = I; + NewIS->reset(); + IsInstRecycled = true; + } + } + if (!IsInstRecycled) { + CreatedIS = std::make_unique(D, MCI.getOpcode()); + NewIS = CreatedIS.get(); + } + + const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode()); + const MCSchedClassDesc &SCDesc = + *STI.getSchedModel().getSchedClassDesc(D.SchedClassID); + + NewIS->setMayLoad(MCDesc.mayLoad()); + NewIS->setMayStore(MCDesc.mayStore()); + NewIS->setHasSideEffects(MCDesc.hasUnmodeledSideEffects()); + NewIS->setBeginGroup(SCDesc.BeginGroup); + NewIS->setEndGroup(SCDesc.EndGroup); + NewIS->setRetireOOO(SCDesc.RetireOOO); // Check if this is a dependency breaking instruction. APInt Mask; @@ -663,6 +689,7 @@ InstrBuilder::createInstruction(const MCInst &MCI) { // Initialize Reads first. MCPhysReg RegID = 0; + size_t Idx = 0U; for (const ReadDescriptor &RD : D.Reads) { if (!RD.isImplicitRead()) { // explicit read. @@ -681,15 +708,22 @@ InstrBuilder::createInstruction(const MCInst &MCI) { continue; // Okay, this is a register operand. Create a ReadState for it. - NewIS->getUses().emplace_back(RD, RegID); - ReadState &RS = NewIS->getUses().back(); + ReadState *RS = nullptr; + if (IsInstRecycled && Idx < NewIS->getUses().size()) { + NewIS->getUses()[Idx] = ReadState(RD, RegID); + RS = &NewIS->getUses()[Idx++]; + } else { + NewIS->getUses().emplace_back(RD, RegID); + RS = &NewIS->getUses().back(); + ++Idx; + } if (IsDepBreaking) { // A mask of all zeroes means: explicit input operands are not // independent. if (Mask.isZero()) { if (!RD.isImplicitRead()) - RS.setIndependentFromDef(); + RS->setIndependentFromDef(); } else { // Check if this register operand is independent according to `Mask`. // Note that Mask may not have enough bits to describe all explicit and @@ -699,15 +733,21 @@ InstrBuilder::createInstruction(const MCInst &MCI) { if (Mask.getBitWidth() > RD.UseIndex) { // Okay. This map describe register use `RD.UseIndex`. if (Mask[RD.UseIndex]) - RS.setIndependentFromDef(); + RS->setIndependentFromDef(); } } } } + if (IsInstRecycled && Idx < NewIS->getUses().size()) + NewIS->getUses().pop_back_n(NewIS->getUses().size() - Idx); // Early exit if there are no writes. - if (D.Writes.empty()) - return std::move(NewIS); + if (D.Writes.empty()) { + if (IsInstRecycled) + return llvm::make_error(NewIS); + else + return std::move(CreatedIS); + } // Track register writes that implicitly clear the upper portion of the // underlying super-registers using an APInt. @@ -720,6 +760,7 @@ InstrBuilder::createInstruction(const MCInst &MCI) { // Initialize writes. unsigned WriteIndex = 0; + Idx = 0U; for (const WriteDescriptor &WD : D.Writes) { RegID = WD.isImplicitWrite() ? 
WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg(); @@ -730,13 +771,26 @@ InstrBuilder::createInstruction(const MCInst &MCI) { } assert(RegID && "Expected a valid register ID!"); - NewIS->getDefs().emplace_back(WD, RegID, - /* ClearsSuperRegs */ WriteMask[WriteIndex], - /* WritesZero */ IsZeroIdiom); + if (IsInstRecycled && Idx < NewIS->getDefs().size()) { + NewIS->getDefs()[Idx++] = + WriteState(WD, RegID, + /* ClearsSuperRegs */ WriteMask[WriteIndex], + /* WritesZero */ IsZeroIdiom); + } else { + NewIS->getDefs().emplace_back(WD, RegID, + /* ClearsSuperRegs */ WriteMask[WriteIndex], + /* WritesZero */ IsZeroIdiom); + ++Idx; + } ++WriteIndex; } + if (IsInstRecycled && Idx < NewIS->getDefs().size()) + NewIS->getDefs().pop_back_n(NewIS->getDefs().size() - Idx); - return std::move(NewIS); + if (IsInstRecycled) + return llvm::make_error(NewIS); + else + return std::move(CreatedIS); } } // namespace mca } // namespace llvm diff --git a/llvm/lib/MCA/Instruction.cpp b/llvm/lib/MCA/Instruction.cpp index e658b869a67e..d4adfce59713 100644 --- a/llvm/lib/MCA/Instruction.cpp +++ b/llvm/lib/MCA/Instruction.cpp @@ -148,6 +148,18 @@ const CriticalDependency &Instruction::computeCriticalRegDep() { return CriticalRegDep; } +void Instruction::reset() { + // Note that this won't clear read/write descriptors + // or other non-trivial fields + Stage = IS_INVALID; + CyclesLeft = UNKNOWN_CYCLES; + clearOptimizableMove(); + RCUTokenID = 0; + LSUTokenID = 0; + CriticalResourceMask = 0; + IsEliminated = false; +} + void Instruction::dispatch(unsigned RCUToken) { assert(Stage == IS_INVALID); Stage = IS_DISPATCHED; diff --git a/llvm/lib/MCA/Pipeline.cpp b/llvm/lib/MCA/Pipeline.cpp index 22b9d0799f77..c94fe1422a69 100644 --- a/llvm/lib/MCA/Pipeline.cpp +++ b/llvm/lib/MCA/Pipeline.cpp @@ -38,7 +38,8 @@ Expected Pipeline::run() { assert(!Stages.empty() && "Unexpected empty pipeline found!"); do { - notifyCycleBegin(); + if (!isPaused()) + notifyCycleBegin(); if (Error Err = runCycle()) return std::move(Err); notifyCycleEnd(); @@ -53,15 +54,25 @@ Error Pipeline::runCycle() { // Update stages before we start processing new instructions. for (auto I = Stages.rbegin(), E = Stages.rend(); I != E && !Err; ++I) { const std::unique_ptr &S = *I; - Err = S->cycleStart(); + if (isPaused()) + Err = S->cycleResume(); + else + Err = S->cycleStart(); } + CurrentState = State::Started; + // Now fetch and execute new instructions. InstRef IR; Stage &FirstStage = *Stages[0]; while (!Err && FirstStage.isAvailable(IR)) Err = FirstStage.execute(IR); + if (Err.isA()) { + CurrentState = State::Paused; + return Err; + } + // Update stages in preparation for a new cycle. for (const std::unique_ptr &S : Stages) { Err = S->cycleEnd(); diff --git a/llvm/lib/MCA/Stages/DispatchStage.cpp b/llvm/lib/MCA/Stages/DispatchStage.cpp index 66228bd5a862..10e433bf1689 100644 --- a/llvm/lib/MCA/Stages/DispatchStage.cpp +++ b/llvm/lib/MCA/Stages/DispatchStage.cpp @@ -78,7 +78,6 @@ bool DispatchStage::canDispatch(const InstRef &IR) const { Error DispatchStage::dispatch(InstRef IR) { assert(!CarryOver && "Cannot dispatch another instruction!"); Instruction &IS = *IR.getInstruction(); - const InstrDesc &Desc = IS.getDesc(); const unsigned NumMicroOps = IS.getNumMicroOps(); if (NumMicroOps > DispatchWidth) { assert(AvailableEntries == DispatchWidth); @@ -91,7 +90,7 @@ Error DispatchStage::dispatch(InstRef IR) { } // Check if this instructions ends the dispatch group. 
- if (Desc.EndGroup) + if (IS.getEndGroup()) AvailableEntries = 0; // Check if this is an optimizable reg-reg move or an XCHG-like instruction. @@ -159,12 +158,11 @@ bool DispatchStage::isAvailable(const InstRef &IR) const { const Instruction &Inst = *IR.getInstruction(); unsigned NumMicroOps = Inst.getNumMicroOps(); - const InstrDesc &Desc = Inst.getDesc(); unsigned Required = std::min(NumMicroOps, DispatchWidth); if (Required > AvailableEntries) return false; - if (Desc.BeginGroup && AvailableEntries != DispatchWidth) + if (Inst.getBeginGroup() && AvailableEntries != DispatchWidth) return false; // The dispatch logic doesn't internally buffer instructions. It only accepts diff --git a/llvm/lib/MCA/Stages/EntryStage.cpp b/llvm/lib/MCA/Stages/EntryStage.cpp index 66135790a4cd..6b3fbb8c6236 100644 --- a/llvm/lib/MCA/Stages/EntryStage.cpp +++ b/llvm/lib/MCA/Stages/EntryStage.cpp @@ -19,7 +19,7 @@ namespace llvm { namespace mca { bool EntryStage::hasWorkToComplete() const { - return static_cast(CurrentInstruction); + return static_cast(CurrentInstruction) || !SM.isEnd(); } bool EntryStage::isAvailable(const InstRef & /* unused */) const { @@ -28,15 +28,20 @@ bool EntryStage::isAvailable(const InstRef & /* unused */) const { return false; } -void EntryStage::getNextInstruction() { +Error EntryStage::getNextInstruction() { assert(!CurrentInstruction && "There is already an instruction to process!"); - if (!SM.hasNext()) - return; + if (!SM.hasNext()) { + if (!SM.isEnd()) + return llvm::make_error(); + else + return llvm::ErrorSuccess(); + } SourceRef SR = SM.peekNext(); std::unique_ptr Inst = std::make_unique(SR.second); CurrentInstruction = InstRef(SR.first, Inst.get()); Instructions.emplace_back(std::move(Inst)); SM.updateNext(); + return llvm::ErrorSuccess(); } llvm::Error EntryStage::execute(InstRef & /*unused */) { @@ -46,16 +51,20 @@ llvm::Error EntryStage::execute(InstRef & /*unused */) { // Move the program counter. CurrentInstruction.invalidate(); - getNextInstruction(); - return llvm::ErrorSuccess(); + return getNextInstruction(); } llvm::Error EntryStage::cycleStart() { if (!CurrentInstruction) - getNextInstruction(); + return getNextInstruction(); return llvm::ErrorSuccess(); } +llvm::Error EntryStage::cycleResume() { + assert(!CurrentInstruction); + return getNextInstruction(); +} + llvm::Error EntryStage::cycleEnd() { // Find the first instruction which hasn't been retired. auto Range = make_range(&Instructions[NumRetired], Instructions.end()); diff --git a/llvm/lib/MCA/Stages/ExecuteStage.cpp b/llvm/lib/MCA/Stages/ExecuteStage.cpp index 2b11f73b19df..369e2f5a4ef1 100644 --- a/llvm/lib/MCA/Stages/ExecuteStage.cpp +++ b/llvm/lib/MCA/Stages/ExecuteStage.cpp @@ -165,8 +165,8 @@ static void verifyInstructionEliminated(const InstRef &IR) { // Ensure that instructions eliminated at register renaming stage are in a // consistent state. 
- const InstrDesc &Desc = Inst.getDesc(); - assert(!Desc.MayLoad && !Desc.MayStore && "Cannot eliminate a memory op!"); + assert(!Inst.getMayLoad() && !Inst.getMayStore() && + "Cannot eliminate a memory op!"); } #endif diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp index abfbc80f17c9..0f1737dc3cbc 100644 --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -63,7 +63,6 @@ bool InOrderIssueStage::isAvailable(const InstRef &IR) const { const Instruction &Inst = *IR.getInstruction(); unsigned NumMicroOps = Inst.getNumMicroOps(); - const InstrDesc &Desc = Inst.getDesc(); bool ShouldCarryOver = NumMicroOps > getIssueWidth(); if (Bandwidth < NumMicroOps && !ShouldCarryOver) @@ -71,7 +70,7 @@ bool InOrderIssueStage::isAvailable(const InstRef &IR) const { // Instruction with BeginGroup must be the first instruction to be issued in a // cycle. - if (Desc.BeginGroup && NumIssued != 0) + if (Inst.getBeginGroup() && NumIssued != 0) return false; return true; @@ -140,7 +139,7 @@ bool InOrderIssueStage::canExecute(const InstRef &IR) { } if (LastWriteBackCycle) { - if (!IR.getInstruction()->getDesc().RetireOOO) { + if (!IR.getInstruction()->getRetireOOO()) { unsigned NextWriteBackCycle = findFirstWriteBackCycle(IR); // Delay the instruction to ensure that writes happen in program order. if (NextWriteBackCycle < LastWriteBackCycle) { @@ -254,7 +253,7 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR) { LLVM_DEBUG(dbgs() << "[N] Carry over #" << IR << " \n"); } else { NumIssued += NumMicroOps; - Bandwidth = Desc.EndGroup ? 0 : Bandwidth - NumMicroOps; + Bandwidth = IS.getEndGroup() ? 0 : Bandwidth - NumMicroOps; } // If the instruction has a latency of 0, we need to handle @@ -272,7 +271,7 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR) { IssuedInst.push_back(IR); - if (!IR.getInstruction()->getDesc().RetireOOO) + if (!IR.getInstruction()->getRetireOOO()) LastWriteBackCycle = IS.getCyclesLeft(); return llvm::ErrorSuccess(); @@ -325,7 +324,7 @@ void InOrderIssueStage::updateCarriedOver() { LLVM_DEBUG(dbgs() << "[N] Carry over (complete) #" << CarriedOver << " \n"); - if (CarriedOver.getInstruction()->getDesc().EndGroup) + if (CarriedOver.getInstruction()->getEndGroup()) Bandwidth = 0; else Bandwidth -= CarryOver; diff --git a/llvm/lib/MCA/Stages/Stage.cpp b/llvm/lib/MCA/Stages/Stage.cpp index ed512ac9711c..5613d4d6bd07 100644 --- a/llvm/lib/MCA/Stages/Stage.cpp +++ b/llvm/lib/MCA/Stages/Stage.cpp @@ -24,5 +24,6 @@ void Stage::addListener(HWEventListener *Listener) { Listeners.insert(Listener); } +char InstStreamPause::ID = 0; } // namespace mca } // namespace llvm diff --git a/llvm/lib/ObjCopy/Archive.cpp b/llvm/lib/ObjCopy/Archive.cpp new file mode 100644 index 000000000000..742ca0b890cf --- /dev/null +++ b/llvm/lib/ObjCopy/Archive.cpp @@ -0,0 +1,110 @@ +//===- Archive.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Archive.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/MultiFormatConfig.h" +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/MachO.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" + +namespace llvm { +namespace objcopy { + +using namespace llvm::object; + +Expected<std::vector<NewArchiveMember>> +createNewArchiveMembers(const MultiFormatConfig &Config, const Archive &Ar) { + std::vector<NewArchiveMember> NewArchiveMembers; + Error Err = Error::success(); + for (const Archive::Child &Child : Ar.children(Err)) { + Expected<StringRef> ChildNameOrErr = Child.getName(); + if (!ChildNameOrErr) + return createFileError(Ar.getFileName(), ChildNameOrErr.takeError()); + + Expected<std::unique_ptr<Binary>> ChildOrErr = Child.getAsBinary(); + if (!ChildOrErr) + return createFileError(Ar.getFileName() + "(" + *ChildNameOrErr + ")", + ChildOrErr.takeError()); + + SmallVector<char, 0> Buffer; + raw_svector_ostream MemStream(Buffer); + + if (Error E = executeObjcopyOnBinary(Config, *ChildOrErr->get(), MemStream)) + return std::move(E); + + Expected<NewArchiveMember> Member = NewArchiveMember::getOldMember( + Child, Config.getCommonConfig().DeterministicArchives); + if (!Member) + return createFileError(Ar.getFileName(), Member.takeError()); + + Member->Buf = std::make_unique<SmallVectorMemoryBuffer>( + std::move(Buffer), ChildNameOrErr.get()); + Member->MemberName = Member->Buf->getBufferIdentifier(); + NewArchiveMembers.push_back(std::move(*Member)); + } + if (Err) + return createFileError(Config.getCommonConfig().InputFilename, + std::move(Err)); + return std::move(NewArchiveMembers); +} + +// For regular archives this function simply calls llvm::writeArchive; +// for thin archives it writes the archive file itself as well as its members. +static Error deepWriteArchive(StringRef ArcName, + ArrayRef<NewArchiveMember> NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin) { + if (Kind == object::Archive::K_BSD && !NewMembers.empty() && + NewMembers.front().detectKindFromObject() == object::Archive::K_DARWIN) + Kind = object::Archive::K_DARWIN; + + if (Error E = writeArchive(ArcName, NewMembers, WriteSymtab, Kind, + Deterministic, Thin)) + return createFileError(ArcName, std::move(E)); + + if (!Thin) + return Error::success(); + + for (const NewArchiveMember &Member : NewMembers) { + // For regular files (as is the case for deepWriteArchive), + // FileOutputBuffer::create will return OnDiskBuffer. + // OnDiskBuffer uses a temporary file and then renames it. So in reality + // there is no inefficiency or duplicated in-memory buffers in this case. For + // now, in-memory buffers cannot be completely avoided since + // NewArchiveMember still requires them even though writeArchive does not + // write them on disk.
+ Expected> FB = + FileOutputBuffer::create(Member.MemberName, Member.Buf->getBufferSize(), + FileOutputBuffer::F_executable); + if (!FB) + return FB.takeError(); + std::copy(Member.Buf->getBufferStart(), Member.Buf->getBufferEnd(), + (*FB)->getBufferStart()); + if (Error E = (*FB)->commit()) + return E; + } + return Error::success(); +} + +Error executeObjcopyOnArchive(const MultiFormatConfig &Config, + const object::Archive &Ar) { + Expected> NewArchiveMembersOrErr = + createNewArchiveMembers(Config, Ar); + if (!NewArchiveMembersOrErr) + return NewArchiveMembersOrErr.takeError(); + const CommonConfig &CommonConfig = Config.getCommonConfig(); + return deepWriteArchive(CommonConfig.OutputFilename, *NewArchiveMembersOrErr, + Ar.hasSymbolTable(), Ar.kind(), + CommonConfig.DeterministicArchives, Ar.isThin()); +} + +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/Archive.h b/llvm/lib/ObjCopy/Archive.h new file mode 100644 index 000000000000..08aae563505c --- /dev/null +++ b/llvm/lib/ObjCopy/Archive.h @@ -0,0 +1,31 @@ +//===- Archive.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_ARCHIVE_H +#define LLVM_LIB_OBJCOPY_ARCHIVE_H + +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { +namespace objcopy { + +class MultiFormatConfig; + +/// Applies the transformations described by \p Config to +/// each member in archive \p Ar. +/// \returns Vector of transformed archive members. +Expected> +createNewArchiveMembers(const MultiFormatConfig &Config, + const object::Archive &Ar); + +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_ARCHIVE_H diff --git a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp new file mode 100644 index 000000000000..cda93ce0fb3c --- /dev/null +++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp @@ -0,0 +1,311 @@ +//===- COFFObjcopy.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjCopy/COFF/COFFObjcopy.h" +#include "COFFObject.h" +#include "COFFReader.h" +#include "COFFWriter.h" +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/CommonConfig.h" + +#include "llvm/Object/Binary.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/CRC.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Path.h" +#include + +namespace llvm { +namespace objcopy { +namespace coff { + +using namespace object; +using namespace COFF; + +static bool isDebugSection(const Section &Sec) { + return Sec.Name.startswith(".debug"); +} + +static uint64_t getNextRVA(const Object &Obj) { + if (Obj.getSections().empty()) + return 0; + const Section &Last = Obj.getSections().back(); + return alignTo(Last.Header.VirtualAddress + Last.Header.VirtualSize, + Obj.IsPE ? 
Obj.PeHeader.SectionAlignment : 1); +} + +static Expected> +createGnuDebugLinkSectionContents(StringRef File) { + ErrorOr> LinkTargetOrErr = + MemoryBuffer::getFile(File); + if (!LinkTargetOrErr) + return createFileError(File, LinkTargetOrErr.getError()); + auto LinkTarget = std::move(*LinkTargetOrErr); + uint32_t CRC32 = llvm::crc32(arrayRefFromStringRef(LinkTarget->getBuffer())); + + StringRef FileName = sys::path::filename(File); + size_t CRCPos = alignTo(FileName.size() + 1, 4); + std::vector Data(CRCPos + 4); + memcpy(Data.data(), FileName.data(), FileName.size()); + support::endian::write32le(Data.data() + CRCPos, CRC32); + return Data; +} + +// Adds named section with given contents to the object. +static void addSection(Object &Obj, StringRef Name, ArrayRef Contents, + uint32_t Characteristics) { + bool NeedVA = Characteristics & (IMAGE_SCN_MEM_EXECUTE | IMAGE_SCN_MEM_READ | + IMAGE_SCN_MEM_WRITE); + + Section Sec; + Sec.setOwnedContents(Contents); + Sec.Name = Name; + Sec.Header.VirtualSize = NeedVA ? Sec.getContents().size() : 0u; + Sec.Header.VirtualAddress = NeedVA ? getNextRVA(Obj) : 0u; + Sec.Header.SizeOfRawData = + NeedVA ? alignTo(Sec.Header.VirtualSize, + Obj.IsPE ? Obj.PeHeader.FileAlignment : 1) + : Sec.getContents().size(); + // Sec.Header.PointerToRawData is filled in by the writer. + Sec.Header.PointerToRelocations = 0; + Sec.Header.PointerToLinenumbers = 0; + // Sec.Header.NumberOfRelocations is filled in by the writer. + Sec.Header.NumberOfLinenumbers = 0; + Sec.Header.Characteristics = Characteristics; + + Obj.addSections(Sec); +} + +static Error addGnuDebugLink(Object &Obj, StringRef DebugLinkFile) { + Expected> Contents = + createGnuDebugLinkSectionContents(DebugLinkFile); + if (!Contents) + return Contents.takeError(); + + addSection(Obj, ".gnu_debuglink", *Contents, + IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | + IMAGE_SCN_MEM_DISCARDABLE); + + return Error::success(); +} + +static uint32_t flagsToCharacteristics(SectionFlag AllFlags, uint32_t OldChar) { + // Need to preserve alignment flags. + const uint32_t PreserveMask = + IMAGE_SCN_ALIGN_1BYTES | IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_ALIGN_4BYTES | + IMAGE_SCN_ALIGN_8BYTES | IMAGE_SCN_ALIGN_16BYTES | + IMAGE_SCN_ALIGN_32BYTES | IMAGE_SCN_ALIGN_64BYTES | + IMAGE_SCN_ALIGN_128BYTES | IMAGE_SCN_ALIGN_256BYTES | + IMAGE_SCN_ALIGN_512BYTES | IMAGE_SCN_ALIGN_1024BYTES | + IMAGE_SCN_ALIGN_2048BYTES | IMAGE_SCN_ALIGN_4096BYTES | + IMAGE_SCN_ALIGN_8192BYTES; + + // Setup new section characteristics based on the flags provided in command + // line. 
+ uint32_t NewCharacteristics = (OldChar & PreserveMask) | IMAGE_SCN_MEM_READ; + + if ((AllFlags & SectionFlag::SecAlloc) && !(AllFlags & SectionFlag::SecLoad)) + NewCharacteristics |= IMAGE_SCN_CNT_UNINITIALIZED_DATA; + if (AllFlags & SectionFlag::SecNoload) + NewCharacteristics |= IMAGE_SCN_LNK_REMOVE; + if (!(AllFlags & SectionFlag::SecReadonly)) + NewCharacteristics |= IMAGE_SCN_MEM_WRITE; + if (AllFlags & SectionFlag::SecDebug) + NewCharacteristics |= + IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE; + if (AllFlags & SectionFlag::SecCode) + NewCharacteristics |= IMAGE_SCN_CNT_CODE | IMAGE_SCN_MEM_EXECUTE; + if (AllFlags & SectionFlag::SecData) + NewCharacteristics |= IMAGE_SCN_CNT_INITIALIZED_DATA; + if (AllFlags & SectionFlag::SecShare) + NewCharacteristics |= IMAGE_SCN_MEM_SHARED; + if (AllFlags & SectionFlag::SecExclude) + NewCharacteristics |= IMAGE_SCN_LNK_REMOVE; + + return NewCharacteristics; +} + +static Error handleArgs(const CommonConfig &Config, + const COFFConfig &COFFConfig, Object &Obj) { + // Perform the actual section removals. + Obj.removeSections([&Config](const Section &Sec) { + // Contrary to --only-keep-debug, --only-section fully removes sections that + // aren't mentioned. + if (!Config.OnlySection.empty() && !Config.OnlySection.matches(Sec.Name)) + return true; + + if (Config.StripDebug || Config.StripAll || Config.StripAllGNU || + Config.DiscardMode == DiscardType::All || Config.StripUnneeded) { + if (isDebugSection(Sec) && + (Sec.Header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) != 0) + return true; + } + + if (Config.ToRemove.matches(Sec.Name)) + return true; + + return false; + }); + + if (Config.OnlyKeepDebug) { + // For --only-keep-debug, we keep all other sections, but remove their + // content. The VirtualSize field in the section header is kept intact. + Obj.truncateSections([](const Section &Sec) { + return !isDebugSection(Sec) && Sec.Name != ".buildid" && + ((Sec.Header.Characteristics & + (IMAGE_SCN_CNT_CODE | IMAGE_SCN_CNT_INITIALIZED_DATA)) != 0); + }); + } + + // StripAll removes all symbols and thus also removes all relocations. + if (Config.StripAll || Config.StripAllGNU) + for (Section &Sec : Obj.getMutableSections()) + Sec.Relocs.clear(); + + // If we need to do per-symbol removals, initialize the Referenced field. + if (Config.StripUnneeded || Config.DiscardMode == DiscardType::All || + !Config.SymbolsToRemove.empty()) + if (Error E = Obj.markSymbols()) + return E; + + for (Symbol &Sym : Obj.getMutableSymbols()) { + auto I = Config.SymbolsToRename.find(Sym.Name); + if (I != Config.SymbolsToRename.end()) + Sym.Name = I->getValue(); + } + + auto ToRemove = [&](const Symbol &Sym) -> Expected { + // For StripAll, all relocations have been stripped and we remove all + // symbols. + if (Config.StripAll || Config.StripAllGNU) + return true; + + if (Config.SymbolsToRemove.matches(Sym.Name)) { + // Explicitly removing a referenced symbol is an error. + if (Sym.Referenced) + return createStringError( + llvm::errc::invalid_argument, + "'" + Config.OutputFilename + "': not stripping symbol '" + + Sym.Name.str() + "' because it is named in a relocation"); + return true; + } + + if (!Sym.Referenced) { + // With --strip-unneeded, GNU objcopy removes all unreferenced local + // symbols, and any unreferenced undefined external. + // With --strip-unneeded-symbol we strip only specific unreferenced + // local symbol instead of removing all of such. 
+ if (Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC || + Sym.Sym.SectionNumber == 0) + if (Config.StripUnneeded || + Config.UnneededSymbolsToRemove.matches(Sym.Name)) + return true; + + // GNU objcopy keeps referenced local symbols and external symbols + // if --discard-all is set, similar to what --strip-unneeded does, + // but undefined local symbols are kept when --discard-all is set. + if (Config.DiscardMode == DiscardType::All && + Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC && + Sym.Sym.SectionNumber != 0) + return true; + } + + return false; + }; + + // Actually do removals of symbols. + if (Error Err = Obj.removeSymbols(ToRemove)) + return Err; + + if (!Config.SetSectionFlags.empty()) + for (Section &Sec : Obj.getMutableSections()) { + const auto It = Config.SetSectionFlags.find(Sec.Name); + if (It != Config.SetSectionFlags.end()) + Sec.Header.Characteristics = flagsToCharacteristics( + It->second.NewFlags, Sec.Header.Characteristics); + } + + for (const NewSectionInfo &NewSection : Config.AddSection) { + uint32_t Characteristics; + const auto It = Config.SetSectionFlags.find(NewSection.SectionName); + if (It != Config.SetSectionFlags.end()) + Characteristics = flagsToCharacteristics(It->second.NewFlags, 0); + else + Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES; + + addSection(Obj, NewSection.SectionName, + makeArrayRef(reinterpret_cast( + NewSection.SectionData->getBufferStart()), + NewSection.SectionData->getBufferSize()), + Characteristics); + } + + for (const NewSectionInfo &NewSection : Config.UpdateSection) { + auto It = llvm::find_if(Obj.getMutableSections(), [&](auto &Sec) { + return Sec.Name == NewSection.SectionName; + }); + if (It == Obj.getMutableSections().end()) + return createStringError(errc::invalid_argument, + "could not find section with name '%s'", + NewSection.SectionName.str().c_str()); + size_t ContentSize = It->getContents().size(); + if (!ContentSize) + return createStringError( + errc::invalid_argument, + "section '%s' cannot be updated because it does not have contents", + NewSection.SectionName.str().c_str()); + if (ContentSize < NewSection.SectionData->getBufferSize()) + return createStringError( + errc::invalid_argument, + "new section cannot be larger than previous section"); + It->setOwnedContents({NewSection.SectionData->getBufferStart(), + NewSection.SectionData->getBufferEnd()}); + } + + if (!Config.AddGnuDebugLink.empty()) + if (Error E = addGnuDebugLink(Obj, Config.AddGnuDebugLink)) + return E; + + if (COFFConfig.Subsystem || COFFConfig.MajorSubsystemVersion || + COFFConfig.MinorSubsystemVersion) { + if (!Obj.IsPE) + return createStringError( + errc::invalid_argument, + "'" + Config.OutputFilename + + "': unable to set subsystem on a relocatable object file"); + if (COFFConfig.Subsystem) + Obj.PeHeader.Subsystem = *COFFConfig.Subsystem; + if (COFFConfig.MajorSubsystemVersion) + Obj.PeHeader.MajorSubsystemVersion = *COFFConfig.MajorSubsystemVersion; + if (COFFConfig.MinorSubsystemVersion) + Obj.PeHeader.MinorSubsystemVersion = *COFFConfig.MinorSubsystemVersion; + } + + return Error::success(); +} + +Error executeObjcopyOnBinary(const CommonConfig &Config, + const COFFConfig &COFFConfig, COFFObjectFile &In, + raw_ostream &Out) { + COFFReader Reader(In); + Expected> ObjOrErr = Reader.create(); + if (!ObjOrErr) + return createFileError(Config.InputFilename, ObjOrErr.takeError()); + Object *Obj = ObjOrErr->get(); + assert(Obj && "Unable to deserialize COFF object"); + if (Error E = handleArgs(Config, COFFConfig, 
*Obj)) + return createFileError(Config.InputFilename, std::move(E)); + COFFWriter Writer(*Obj, Out); + if (Error E = Writer.write()) + return createFileError(Config.OutputFilename, std::move(E)); + return Error::success(); +} + +} // end namespace coff +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.cpp b/llvm/lib/ObjCopy/COFF/COFFObject.cpp new file mode 100644 index 000000000000..1d27b7eaa891 --- /dev/null +++ b/llvm/lib/ObjCopy/COFF/COFFObject.cpp @@ -0,0 +1,132 @@ +//===- COFFObject.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "COFFObject.h" +#include "llvm/ADT/DenseSet.h" +#include + +namespace llvm { +namespace objcopy { +namespace coff { + +using namespace object; + +void Object::addSymbols(ArrayRef NewSymbols) { + for (Symbol S : NewSymbols) { + S.UniqueId = NextSymbolUniqueId++; + Symbols.emplace_back(S); + } + updateSymbols(); +} + +void Object::updateSymbols() { + SymbolMap = DenseMap(Symbols.size()); + for (Symbol &Sym : Symbols) + SymbolMap[Sym.UniqueId] = &Sym; +} + +const Symbol *Object::findSymbol(size_t UniqueId) const { + return SymbolMap.lookup(UniqueId); +} + +Error Object::removeSymbols( + function_ref(const Symbol &)> ToRemove) { + Error Errs = Error::success(); + llvm::erase_if(Symbols, [ToRemove, &Errs](const Symbol &Sym) { + Expected ShouldRemove = ToRemove(Sym); + if (!ShouldRemove) { + Errs = joinErrors(std::move(Errs), ShouldRemove.takeError()); + return false; + } + return *ShouldRemove; + }); + + updateSymbols(); + return Errs; +} + +Error Object::markSymbols() { + for (Symbol &Sym : Symbols) + Sym.Referenced = false; + for (const Section &Sec : Sections) { + for (const Relocation &R : Sec.Relocs) { + auto It = SymbolMap.find(R.Target); + if (It == SymbolMap.end()) + return createStringError(object_error::invalid_symbol_index, + "relocation target %zu not found", R.Target); + It->second->Referenced = true; + } + } + return Error::success(); +} + +void Object::addSections(ArrayRef
NewSections) { + for (Section S : NewSections) { + S.UniqueId = NextSectionUniqueId++; + Sections.emplace_back(S); + } + updateSections(); +} + +void Object::updateSections() { + SectionMap = DenseMap(Sections.size()); + size_t Index = 1; + for (Section &S : Sections) { + SectionMap[S.UniqueId] = &S; + S.Index = Index++; + } +} + +const Section *Object::findSection(ssize_t UniqueId) const { + return SectionMap.lookup(UniqueId); +} + +void Object::removeSections(function_ref ToRemove) { + DenseSet AssociatedSections; + auto RemoveAssociated = [&AssociatedSections](const Section &Sec) { + return AssociatedSections.contains(Sec.UniqueId); + }; + do { + DenseSet RemovedSections; + llvm::erase_if(Sections, [ToRemove, &RemovedSections](const Section &Sec) { + bool Remove = ToRemove(Sec); + if (Remove) + RemovedSections.insert(Sec.UniqueId); + return Remove; + }); + // Remove all symbols referring to the removed sections. + AssociatedSections.clear(); + llvm::erase_if( + Symbols, [&RemovedSections, &AssociatedSections](const Symbol &Sym) { + // If there are sections that are associative to a removed + // section, + // remove those as well as nothing will include them (and we can't + // leave them dangling). + if (RemovedSections.contains(Sym.AssociativeComdatTargetSectionId)) + AssociatedSections.insert(Sym.TargetSectionId); + return RemovedSections.contains(Sym.TargetSectionId); + }); + ToRemove = RemoveAssociated; + } while (!AssociatedSections.empty()); + updateSections(); + updateSymbols(); +} + +void Object::truncateSections(function_ref ToTruncate) { + for (Section &Sec : Sections) { + if (ToTruncate(Sec)) { + Sec.clearContents(); + Sec.Relocs.clear(); + Sec.Header.SizeOfRawData = 0; + } + } +} + +} // end namespace coff +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.h b/llvm/lib/ObjCopy/COFF/COFFObject.h new file mode 100644 index 000000000000..66c0a19429ce --- /dev/null +++ b/llvm/lib/ObjCopy/COFF/COFFObject.h @@ -0,0 +1,212 @@ +//===- COFFObject.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H +#define LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/Object/COFF.h" +#include +#include +#include + +namespace llvm { +namespace objcopy { +namespace coff { + +struct Relocation { + Relocation() = default; + Relocation(const object::coff_relocation &R) : Reloc(R) {} + + object::coff_relocation Reloc; + size_t Target = 0; + StringRef TargetName; // Used for diagnostics only +}; + +struct Section { + object::coff_section Header; + std::vector Relocs; + StringRef Name; + ssize_t UniqueId; + size_t Index; + + ArrayRef getContents() const { + if (!OwnedContents.empty()) + return OwnedContents; + return ContentsRef; + } + + void setContentsRef(ArrayRef Data) { + OwnedContents.clear(); + ContentsRef = Data; + } + + void setOwnedContents(std::vector &&Data) { + ContentsRef = ArrayRef(); + OwnedContents = std::move(Data); + Header.SizeOfRawData = OwnedContents.size(); + } + + void clearContents() { + ContentsRef = ArrayRef(); + OwnedContents.clear(); + } + +private: + ArrayRef ContentsRef; + std::vector OwnedContents; +}; + +struct AuxSymbol { + AuxSymbol(ArrayRef In) { + assert(In.size() == sizeof(Opaque)); + std::copy(In.begin(), In.end(), Opaque); + } + + ArrayRef getRef() const { + return ArrayRef(Opaque, sizeof(Opaque)); + } + + uint8_t Opaque[sizeof(object::coff_symbol16)]; +}; + +struct Symbol { + object::coff_symbol32 Sym; + StringRef Name; + std::vector AuxData; + StringRef AuxFile; + ssize_t TargetSectionId; + ssize_t AssociativeComdatTargetSectionId = 0; + Optional WeakTargetSymbolId; + size_t UniqueId; + size_t RawIndex; + bool Referenced; +}; + +struct Object { + bool IsPE = false; + + object::dos_header DosHeader; + ArrayRef DosStub; + + object::coff_file_header CoffFileHeader; + + bool Is64 = false; + object::pe32plus_header PeHeader; + uint32_t BaseOfData = 0; // pe32plus_header lacks this field. + + std::vector DataDirectories; + + ArrayRef getSymbols() const { return Symbols; } + // This allows mutating individual Symbols, but not mutating the list + // of symbols itself. + iterator_range::iterator> getMutableSymbols() { + return make_range(Symbols.begin(), Symbols.end()); + } + + const Symbol *findSymbol(size_t UniqueId) const; + + void addSymbols(ArrayRef NewSymbols); + Error removeSymbols(function_ref(const Symbol &)> ToRemove); + + // Set the Referenced field on all Symbols, based on relocations in + // all sections. + Error markSymbols(); + + ArrayRef
getSections() const { return Sections; } + // This allows mutating individual Sections, but not mutating the list + // of sections itself. + iterator_range::iterator> getMutableSections() { + return make_range(Sections.begin(), Sections.end()); + } + + const Section *findSection(ssize_t UniqueId) const; + + void addSections(ArrayRef
NewSections); + void removeSections(function_ref ToRemove); + void truncateSections(function_ref ToTruncate); + +private: + std::vector Symbols; + DenseMap SymbolMap; + + size_t NextSymbolUniqueId = 0; + + std::vector
Sections; + DenseMap SectionMap; + + ssize_t NextSectionUniqueId = 1; // Allow a UniqueId 0 to mean undefined. + + // Update SymbolMap. + void updateSymbols(); + + // Update SectionMap and Index in each Section. + void updateSections(); +}; + +// Copy between coff_symbol16 and coff_symbol32. +// The source and destination files can use either coff_symbol16 or +// coff_symbol32, while we always store them as coff_symbol32 in the +// intermediate data structure. +template +void copySymbol(Symbol1Ty &Dest, const Symbol2Ty &Src) { + static_assert(sizeof(Dest.Name.ShortName) == sizeof(Src.Name.ShortName), + "Mismatched name sizes"); + memcpy(Dest.Name.ShortName, Src.Name.ShortName, sizeof(Dest.Name.ShortName)); + Dest.Value = Src.Value; + Dest.SectionNumber = Src.SectionNumber; + Dest.Type = Src.Type; + Dest.StorageClass = Src.StorageClass; + Dest.NumberOfAuxSymbols = Src.NumberOfAuxSymbols; +} + +// Copy between pe32_header and pe32plus_header. +// We store the intermediate state in a pe32plus_header. +template +void copyPeHeader(PeHeader1Ty &Dest, const PeHeader2Ty &Src) { + Dest.Magic = Src.Magic; + Dest.MajorLinkerVersion = Src.MajorLinkerVersion; + Dest.MinorLinkerVersion = Src.MinorLinkerVersion; + Dest.SizeOfCode = Src.SizeOfCode; + Dest.SizeOfInitializedData = Src.SizeOfInitializedData; + Dest.SizeOfUninitializedData = Src.SizeOfUninitializedData; + Dest.AddressOfEntryPoint = Src.AddressOfEntryPoint; + Dest.BaseOfCode = Src.BaseOfCode; + Dest.ImageBase = Src.ImageBase; + Dest.SectionAlignment = Src.SectionAlignment; + Dest.FileAlignment = Src.FileAlignment; + Dest.MajorOperatingSystemVersion = Src.MajorOperatingSystemVersion; + Dest.MinorOperatingSystemVersion = Src.MinorOperatingSystemVersion; + Dest.MajorImageVersion = Src.MajorImageVersion; + Dest.MinorImageVersion = Src.MinorImageVersion; + Dest.MajorSubsystemVersion = Src.MajorSubsystemVersion; + Dest.MinorSubsystemVersion = Src.MinorSubsystemVersion; + Dest.Win32VersionValue = Src.Win32VersionValue; + Dest.SizeOfImage = Src.SizeOfImage; + Dest.SizeOfHeaders = Src.SizeOfHeaders; + Dest.CheckSum = Src.CheckSum; + Dest.Subsystem = Src.Subsystem; + Dest.DLLCharacteristics = Src.DLLCharacteristics; + Dest.SizeOfStackReserve = Src.SizeOfStackReserve; + Dest.SizeOfStackCommit = Src.SizeOfStackCommit; + Dest.SizeOfHeapReserve = Src.SizeOfHeapReserve; + Dest.SizeOfHeapCommit = Src.SizeOfHeapCommit; + Dest.LoaderFlags = Src.LoaderFlags; + Dest.NumberOfRvaAndSize = Src.NumberOfRvaAndSize; +} + +} // end namespace coff +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp new file mode 100644 index 000000000000..44bf303078dd --- /dev/null +++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp @@ -0,0 +1,226 @@ +//===- COFFReader.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "COFFReader.h" +#include "COFFObject.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include + +namespace llvm { +namespace objcopy { +namespace coff { + +using namespace object; +using namespace COFF; + +Error COFFReader::readExecutableHeaders(Object &Obj) const { + const dos_header *DH = COFFObj.getDOSHeader(); + Obj.Is64 = COFFObj.is64(); + if (!DH) + return Error::success(); + + Obj.IsPE = true; + Obj.DosHeader = *DH; + if (DH->AddressOfNewExeHeader > sizeof(*DH)) + Obj.DosStub = ArrayRef(reinterpret_cast(&DH[1]), + DH->AddressOfNewExeHeader - sizeof(*DH)); + + if (COFFObj.is64()) { + Obj.PeHeader = *COFFObj.getPE32PlusHeader(); + } else { + const pe32_header *PE32 = COFFObj.getPE32Header(); + copyPeHeader(Obj.PeHeader, *PE32); + // The pe32plus_header (stored in Object) lacks the BaseOfData field. + Obj.BaseOfData = PE32->BaseOfData; + } + + for (size_t I = 0; I < Obj.PeHeader.NumberOfRvaAndSize; I++) { + const data_directory *Dir = COFFObj.getDataDirectory(I); + if (!Dir) + return errorCodeToError(object_error::parse_failed); + Obj.DataDirectories.emplace_back(*Dir); + } + return Error::success(); +} + +Error COFFReader::readSections(Object &Obj) const { + std::vector
+
+Error COFFReader::readSections(Object &Obj) const {
+  std::vector<Section> Sections;
+  // Section indexing starts from 1.
+  for (size_t I = 1, E = COFFObj.getNumberOfSections(); I <= E; I++) {
+    Expected<const coff_section *> SecOrErr = COFFObj.getSection(I);
+    if (!SecOrErr)
+      return SecOrErr.takeError();
+    const coff_section *Sec = *SecOrErr;
+    Sections.push_back(Section());
+    Section &S = Sections.back();
+    S.Header = *Sec;
+    S.Header.Characteristics &= ~COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+    ArrayRef<uint8_t> Contents;
+    if (Error E = COFFObj.getSectionContents(Sec, Contents))
+      return E;
+    S.setContentsRef(Contents);
+    ArrayRef<coff_relocation> Relocs = COFFObj.getRelocations(Sec);
+    for (const coff_relocation &R : Relocs)
+      S.Relocs.push_back(R);
+    if (Expected<StringRef> NameOrErr = COFFObj.getSectionName(Sec))
+      S.Name = *NameOrErr;
+    else
+      return NameOrErr.takeError();
+  }
+  Obj.addSections(Sections);
+  return Error::success();
+}
+
+Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
+  std::vector<Symbol> Symbols;
+  Symbols.reserve(COFFObj.getRawNumberOfSymbols());
+  ArrayRef<Section> Sections = Obj.getSections();
+  for (uint32_t I = 0, E = COFFObj.getRawNumberOfSymbols(); I < E;) {
+    Expected<COFFSymbolRef> SymOrErr = COFFObj.getSymbol(I);
+    if (!SymOrErr)
+      return SymOrErr.takeError();
+    COFFSymbolRef SymRef = *SymOrErr;
+
+    Symbols.push_back(Symbol());
+    Symbol &Sym = Symbols.back();
+    // Copy symbols from the original form into an intermediate coff_symbol32.
+    if (IsBigObj)
+      copySymbol(Sym.Sym,
+                 *reinterpret_cast<const coff_symbol32 *>(SymRef.getRawPtr()));
+    else
+      copySymbol(Sym.Sym,
+                 *reinterpret_cast<const coff_symbol16 *>(SymRef.getRawPtr()));
+    auto NameOrErr = COFFObj.getSymbolName(SymRef);
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+    Sym.Name = *NameOrErr;
+
+    ArrayRef<uint8_t> AuxData = COFFObj.getSymbolAuxData(SymRef);
+    size_t SymSize = IsBigObj ? sizeof(coff_symbol32) : sizeof(coff_symbol16);
+    assert(AuxData.size() == SymSize * SymRef.getNumberOfAuxSymbols());
+    // The auxiliary symbols are structs of sizeof(coff_symbol16) each.
+    // In the big object format (where symbols are coff_symbol32), each
+    // auxiliary symbol is padded with 2 bytes at the end. Copy each
+    // auxiliary symbol to the Sym.AuxData vector. For file symbols,
+    // the whole range of aux symbols is interpreted as one null-padded
+    // string instead.
+    if (SymRef.isFileRecord())
+      Sym.AuxFile = StringRef(reinterpret_cast<const char *>(AuxData.data()),
+                              AuxData.size())
+                        .rtrim('\0');
+    else
+      for (size_t I = 0; I < SymRef.getNumberOfAuxSymbols(); I++)
+        Sym.AuxData.push_back(AuxData.slice(I * SymSize, sizeof(AuxSymbol)));
+
+    // Find the unique id of the section
+    if (SymRef.getSectionNumber() <=
+        0) // Special symbol (undefined/absolute/debug)
+      Sym.TargetSectionId = SymRef.getSectionNumber();
+    else if (static_cast<uint32_t>(SymRef.getSectionNumber() - 1) <
+             Sections.size())
+      Sym.TargetSectionId = Sections[SymRef.getSectionNumber() - 1].UniqueId;
+    else
+      return createStringError(object_error::parse_failed,
+                               "section number out of range");
+    // For section definitions, check if it is comdat associative, and if
+    // it is, find the target section unique id.
+    const coff_aux_section_definition *SD = SymRef.getSectionDefinition();
+    const coff_aux_weak_external *WE = SymRef.getWeakExternal();
+    if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+      int32_t Index = SD->getNumber(IsBigObj);
+      if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size())
+        return createStringError(object_error::parse_failed,
+                                 "unexpected associative section index");
+      Sym.AssociativeComdatTargetSectionId = Sections[Index - 1].UniqueId;
+    } else if (WE) {
+      // This is a raw symbol index for now, but store it in the Symbol
+      // until we've added them to the Object, which assigns the final
+      // unique ids.
+      Sym.WeakTargetSymbolId = WE->TagIndex;
+    }
+    I += 1 + SymRef.getNumberOfAuxSymbols();
+  }
+  Obj.addSymbols(Symbols);
+  return Error::success();
+}
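The slicing in the loop above relies on the fixed record sizes from the COFF spec; a standalone sketch of that arithmetic, with the sizes written out as plain numbers (18 for coff_symbol16, 20 for coff_symbol32):

#include <cstddef>

// In a big object each symbol-table slot is 20 bytes, but an auxiliary record
// only carries 18 bytes of payload, so the slice above drops 2 trailing
// padding bytes per record. For a symbol with two aux records:
//   record 0 payload: bytes [ 0, 18) of AuxData
//   record 1 payload: bytes [20, 38) of AuxData
constexpr size_t BigObjSlot = 20, AuxPayload = 18;
static_assert(BigObjSlot - AuxPayload == 2, "two padding bytes per record");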
+
+Error COFFReader::setSymbolTargets(Object &Obj) const {
+  std::vector<const Symbol *> RawSymbolTable;
+  for (const Symbol &Sym : Obj.getSymbols()) {
+    RawSymbolTable.push_back(&Sym);
+    for (size_t I = 0; I < Sym.Sym.NumberOfAuxSymbols; I++)
+      RawSymbolTable.push_back(nullptr);
+  }
+  for (Symbol &Sym : Obj.getMutableSymbols()) {
+    // Convert WeakTargetSymbolId from the original raw symbol index to
+    // a proper unique id.
+    if (Sym.WeakTargetSymbolId) {
+      if (*Sym.WeakTargetSymbolId >= RawSymbolTable.size())
+        return createStringError(object_error::parse_failed,
+                                 "weak external reference out of range");
+      const Symbol *Target = RawSymbolTable[*Sym.WeakTargetSymbolId];
+      if (Target == nullptr)
+        return createStringError(object_error::parse_failed,
+                                 "invalid SymbolTableIndex");
+      Sym.WeakTargetSymbolId = Target->UniqueId;
+    }
+  }
+  for (Section &Sec : Obj.getMutableSections()) {
+    for (Relocation &R : Sec.Relocs) {
+      if (R.Reloc.SymbolTableIndex >= RawSymbolTable.size())
+        return createStringError(object_error::parse_failed,
+                                 "SymbolTableIndex out of range");
+      const Symbol *Sym = RawSymbolTable[R.Reloc.SymbolTableIndex];
+      if (Sym == nullptr)
+        return createStringError(object_error::parse_failed,
+                                 "invalid SymbolTableIndex");
+      R.Target = Sym->UniqueId;
+      R.TargetName = Sym->Name;
+    }
+  }
+  return Error::success();
+}
+
+Expected<std::unique_ptr<Object>> COFFReader::create() const {
+  auto Obj = std::make_unique<Object>();
+
+  bool IsBigObj = false;
+  if (const coff_file_header *CFH = COFFObj.getCOFFHeader()) {
+    Obj->CoffFileHeader = *CFH;
+  } else {
+    const coff_bigobj_file_header *CBFH = COFFObj.getCOFFBigObjHeader();
+    if (!CBFH)
+      return createStringError(object_error::parse_failed,
+                               "no COFF file header returned");
+    // Only copy the few fields from the bigobj header that we need
+    // and won't recreate in the end.
+    Obj->CoffFileHeader.Machine = CBFH->Machine;
+    Obj->CoffFileHeader.TimeDateStamp = CBFH->TimeDateStamp;
+    IsBigObj = true;
+  }
+
+  if (Error E = readExecutableHeaders(*Obj))
+    return std::move(E);
+  if (Error E = readSections(*Obj))
+    return std::move(E);
+  if (Error E = readSymbols(*Obj, IsBigObj))
+    return std::move(E);
+  if (Error E = setSymbolTargets(*Obj))
+    return std::move(E);
+
+  return std::move(Obj);
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
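A sketch of how a caller could drive this reader: COFFObjectFile::create is the generic parser from llvm/Object/COFF.h, and everything else below is visible in this patch (the function name parse is hypothetical).

#include "COFFReader.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/MemoryBuffer.h"

using namespace llvm;
using namespace llvm::object;
using namespace llvm::objcopy::coff;

// Hypothetical driver: parse an in-memory COFF file into the intermediate
// Object representation used by the reader/writer pair.
Expected<std::unique_ptr<Object>> parse(MemoryBufferRef Buf) {
  Expected<std::unique_ptr<COFFObjectFile>> COFFOrErr =
      COFFObjectFile::create(Buf);
  if (!COFFOrErr)
    return COFFOrErr.takeError();
  COFFReader Reader(**COFFOrErr);
  return Reader.create();
}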
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.h b/llvm/lib/ObjCopy/COFF/COFFReader.h
new file mode 100644
index 000000000000..b4957f844392
--- /dev/null
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.h
@@ -0,0 +1,41 @@
+//===- COFFReader.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
+#define LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
+
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+struct Object;
+
+using object::COFFObjectFile;
+
+class COFFReader {
+  const COFFObjectFile &COFFObj;
+
+  Error readExecutableHeaders(Object &Obj) const;
+  Error readSections(Object &Obj) const;
+  Error readSymbols(Object &Obj, bool IsBigObj) const;
+  Error setSymbolTargets(Object &Obj) const;
+
+public:
+  explicit COFFReader(const COFFObjectFile &O) : COFFObj(O) {}
+  Expected<std::unique_ptr<Object>> create() const;
+};
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.cpp b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
new file mode 100644
index 000000000000..88eb4d14ba25
--- /dev/null
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
@@ -0,0 +1,466 @@
+//===- COFFWriter.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "COFFWriter.h"
+#include "COFFObject.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+#include <cstring>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+using namespace object;
+using namespace COFF;
+
+Error COFFWriter::finalizeRelocTargets() {
+  for (Section &Sec : Obj.getMutableSections()) {
+    for (Relocation &R : Sec.Relocs) {
+      const Symbol *Sym = Obj.findSymbol(R.Target);
+      if (Sym == nullptr)
+        return createStringError(object_error::invalid_symbol_index,
+                                 "relocation target '%s' (%zu) not found",
+                                 R.TargetName.str().c_str(), R.Target);
+      R.Reloc.SymbolTableIndex = Sym->RawIndex;
+    }
+  }
+  return Error::success();
+}
+
+Error COFFWriter::finalizeSymbolContents() {
+  for (Symbol &Sym : Obj.getMutableSymbols()) {
+    if (Sym.TargetSectionId <= 0) {
+      // Undefined, or a special kind of symbol. These negative values
+      // are stored in the SectionNumber field which is unsigned.
+      Sym.Sym.SectionNumber = static_cast<uint32_t>(Sym.TargetSectionId);
+    } else {
+      const Section *Sec = Obj.findSection(Sym.TargetSectionId);
+      if (Sec == nullptr)
+        return createStringError(object_error::invalid_symbol_index,
+                                 "symbol '%s' points to a removed section",
+                                 Sym.Name.str().c_str());
+      Sym.Sym.SectionNumber = Sec->Index;
+
+      if (Sym.Sym.NumberOfAuxSymbols == 1 &&
+          Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC) {
+        coff_aux_section_definition *SD =
+            reinterpret_cast<coff_aux_section_definition *>(
+                Sym.AuxData[0].Opaque);
+        uint32_t SDSectionNumber;
+        if (Sym.AssociativeComdatTargetSectionId == 0) {
+          // Not a comdat associative section; just set the Number field to
+          // the number of the section itself.
+          SDSectionNumber = Sec->Index;
+        } else {
+          Sec = Obj.findSection(Sym.AssociativeComdatTargetSectionId);
+          if (Sec == nullptr)
+            return createStringError(
+                object_error::invalid_symbol_index,
+                "symbol '%s' is associative to a removed section",
+                Sym.Name.str().c_str());
+          SDSectionNumber = Sec->Index;
+        }
+        // Update the section definition with the new section number.
+        SD->NumberLowPart = static_cast<uint16_t>(SDSectionNumber);
+        SD->NumberHighPart = static_cast<uint16_t>(SDSectionNumber >> 16);
+      }
+    }
+    // Check that we actually have AuxData to match the weak symbol target
+    // we want to set. Only >= 1 would be required, but only == 1 makes sense.
+    if (Sym.WeakTargetSymbolId && Sym.Sym.NumberOfAuxSymbols == 1) {
+      coff_aux_weak_external *WE =
+          reinterpret_cast<coff_aux_weak_external *>(Sym.AuxData[0].Opaque);
+      const Symbol *Target = Obj.findSymbol(*Sym.WeakTargetSymbolId);
+      if (Target == nullptr)
+        return createStringError(object_error::invalid_symbol_index,
+                                 "symbol '%s' is missing its weak target",
+                                 Sym.Name.str().c_str());
+      WE->TagIndex = Target->RawIndex;
+    }
+  }
+  return Error::success();
+}
+
+void COFFWriter::layoutSections() {
+  for (auto &S : Obj.getMutableSections()) {
+    if (S.Header.SizeOfRawData > 0)
+      S.Header.PointerToRawData = FileSize;
+    FileSize += S.Header.SizeOfRawData; // For executables, this is already
+                                        // aligned to FileAlignment.
+    if (S.Relocs.size() >= 0xffff) {
+      S.Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+      S.Header.NumberOfRelocations = 0xffff;
+      S.Header.PointerToRelocations = FileSize;
+      FileSize += sizeof(coff_relocation);
+    } else {
+      S.Header.NumberOfRelocations = S.Relocs.size();
+      S.Header.PointerToRelocations = S.Relocs.size() ? FileSize : 0;
+    }
+
+    FileSize += S.Relocs.size() * sizeof(coff_relocation);
+    FileSize = alignTo(FileSize, FileAlignment);
+
+    if (S.Header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA)
+      SizeOfInitializedData += S.Header.SizeOfRawData;
+  }
+}
+
+Expected<size_t> COFFWriter::finalizeStringTable() {
+  for (const auto &S : Obj.getSections())
+    if (S.Name.size() > COFF::NameSize)
+      StrTabBuilder.add(S.Name);
+
+  for (const auto &S : Obj.getSymbols())
+    if (S.Name.size() > COFF::NameSize)
+      StrTabBuilder.add(S.Name);
+
+  StrTabBuilder.finalize();
+
+  for (auto &S : Obj.getMutableSections()) {
+    memset(S.Header.Name, 0, sizeof(S.Header.Name));
+    if (S.Name.size() <= COFF::NameSize) {
+      // Short names can go in the field directly.
+      memcpy(S.Header.Name, S.Name.data(), S.Name.size());
+    } else {
+      // Offset of the section name in the string table.
+      size_t Offset = StrTabBuilder.getOffset(S.Name);
+      if (!COFF::encodeSectionName(S.Header.Name, Offset))
+        return createStringError(object_error::invalid_section_index,
+                                 "COFF string table is greater than 64GB, "
+                                 "unable to encode section name offset");
+    }
+  }
+  for (auto &S : Obj.getMutableSymbols()) {
+    if (S.Name.size() > COFF::NameSize) {
+      S.Sym.Name.Offset.Zeroes = 0;
+      S.Sym.Name.Offset.Offset = StrTabBuilder.getOffset(S.Name);
+    } else {
+      strncpy(S.Sym.Name.ShortName, S.Name.data(), COFF::NameSize);
+    }
+  }
+  return StrTabBuilder.getSize();
+}
+
+template <class SymbolTy>
+std::pair<size_t, size_t> COFFWriter::finalizeSymbolTable() {
+  size_t RawSymIndex = 0;
+  for (auto &S : Obj.getMutableSymbols()) {
+    // Symbols normally have NumberOfAuxSymbols set correctly all the time.
+    // For file symbols, we need to know the output file's symbol size to be
+    // able to calculate the number of slots it occupies.
+    if (!S.AuxFile.empty())
+      S.Sym.NumberOfAuxSymbols =
+          alignTo(S.AuxFile.size(), sizeof(SymbolTy)) / sizeof(SymbolTy);
+    S.RawIndex = RawSymIndex;
+    RawSymIndex += 1 + S.Sym.NumberOfAuxSymbols;
+  }
+  return std::make_pair(RawSymIndex * sizeof(SymbolTy), sizeof(SymbolTy));
+}
+
+Error COFFWriter::finalize(bool IsBigObj) {
+  size_t SymTabSize, SymbolSize;
+  std::tie(SymTabSize, SymbolSize) = IsBigObj
+                                         ? finalizeSymbolTable<coff_symbol32>()
+                                         : finalizeSymbolTable<coff_symbol16>();
+
+  if (Error E = finalizeRelocTargets())
+    return E;
+  if (Error E = finalizeSymbolContents())
+    return E;
+
+  size_t SizeOfHeaders = 0;
+  FileAlignment = 1;
+  size_t PeHeaderSize = 0;
+  if (Obj.IsPE) {
+    Obj.DosHeader.AddressOfNewExeHeader =
+        sizeof(Obj.DosHeader) + Obj.DosStub.size();
+    SizeOfHeaders += Obj.DosHeader.AddressOfNewExeHeader + sizeof(PEMagic);
+
+    FileAlignment = Obj.PeHeader.FileAlignment;
+    Obj.PeHeader.NumberOfRvaAndSize = Obj.DataDirectories.size();
+
+    PeHeaderSize = Obj.Is64 ? sizeof(pe32plus_header) : sizeof(pe32_header);
+    SizeOfHeaders +=
+        PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size();
+  }
+  Obj.CoffFileHeader.NumberOfSections = Obj.getSections().size();
+  SizeOfHeaders +=
+      IsBigObj ? sizeof(coff_bigobj_file_header) : sizeof(coff_file_header);
+  SizeOfHeaders += sizeof(coff_section) * Obj.getSections().size();
+  SizeOfHeaders = alignTo(SizeOfHeaders, FileAlignment);
+
+  Obj.CoffFileHeader.SizeOfOptionalHeader =
+      PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size();
+
+  FileSize = SizeOfHeaders;
+  SizeOfInitializedData = 0;
+
+  layoutSections();
+
+  if (Obj.IsPE) {
+    Obj.PeHeader.SizeOfHeaders = SizeOfHeaders;
+    Obj.PeHeader.SizeOfInitializedData = SizeOfInitializedData;
+
+    if (!Obj.getSections().empty()) {
+      const Section &S = Obj.getSections().back();
+      Obj.PeHeader.SizeOfImage =
+          alignTo(S.Header.VirtualAddress + S.Header.VirtualSize,
+                  Obj.PeHeader.SectionAlignment);
+    }
+
+    // If the PE header had a checksum, clear it, since it isn't valid
+    // any longer. (We don't calculate a new one.)
+    Obj.PeHeader.CheckSum = 0;
+  }
+
+  Expected<size_t> StrTabSizeOrErr = finalizeStringTable();
+  if (!StrTabSizeOrErr)
+    return StrTabSizeOrErr.takeError();
+
+  size_t StrTabSize = *StrTabSizeOrErr;
+
+  size_t PointerToSymbolTable = FileSize;
+  // StrTabSize <= 4 is the size of an empty string table, only consisting
+  // of the length field.
+  if (SymTabSize == 0 && StrTabSize <= 4 && Obj.IsPE) {
+    // For executables, don't point to the symbol table and skip writing
+    // the length field, if both the symbol and string tables are empty.
+    PointerToSymbolTable = 0;
+    StrTabSize = 0;
+  }
+
+  size_t NumRawSymbols = SymTabSize / SymbolSize;
+  Obj.CoffFileHeader.PointerToSymbolTable = PointerToSymbolTable;
+  Obj.CoffFileHeader.NumberOfSymbols = NumRawSymbols;
+  FileSize += SymTabSize + StrTabSize;
+  FileSize = alignTo(FileSize, FileAlignment);
+
+  return Error::success();
+}
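The alignTo quotient in finalizeSymbolTable is plain ceiling division; a standalone sketch of the slot count for a long file-symbol name (18 and 20 are sizeof(coff_symbol16) and sizeof(coff_symbol32); the helper name is made up):

#include <cstddef>

// Hypothetical mirror of the computation above: how many whole symbol-table
// slots a NameLen-byte aux string occupies when each slot is SlotSize bytes.
static size_t auxFileSlots(size_t NameLen, size_t SlotSize) {
  return (NameLen + SlotSize - 1) / SlotSize; // alignTo(NameLen, Slot) / Slot
}
// e.g. a 25-byte name: auxFileSlots(25, 18) == 2 and auxFileSlots(25, 20) == 2,
// so the same file symbol occupies two aux slots in either symbol format.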
+
+void COFFWriter::writeHeaders(bool IsBigObj) {
+  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+  if (Obj.IsPE) {
+    memcpy(Ptr, &Obj.DosHeader, sizeof(Obj.DosHeader));
+    Ptr += sizeof(Obj.DosHeader);
+    memcpy(Ptr, Obj.DosStub.data(), Obj.DosStub.size());
+    Ptr += Obj.DosStub.size();
+    memcpy(Ptr, PEMagic, sizeof(PEMagic));
+    Ptr += sizeof(PEMagic);
+  }
+  if (!IsBigObj) {
+    memcpy(Ptr, &Obj.CoffFileHeader, sizeof(Obj.CoffFileHeader));
+    Ptr += sizeof(Obj.CoffFileHeader);
+  } else {
+    // Generate a coff_bigobj_file_header, filling it in with the values
+    // from Obj.CoffFileHeader. All extra fields that don't exist in
+    // coff_file_header can be set to hardcoded values.
+    coff_bigobj_file_header BigObjHeader;
+    BigObjHeader.Sig1 = IMAGE_FILE_MACHINE_UNKNOWN;
+    BigObjHeader.Sig2 = 0xffff;
+    BigObjHeader.Version = BigObjHeader::MinBigObjectVersion;
+    BigObjHeader.Machine = Obj.CoffFileHeader.Machine;
+    BigObjHeader.TimeDateStamp = Obj.CoffFileHeader.TimeDateStamp;
+    memcpy(BigObjHeader.UUID, BigObjMagic, sizeof(BigObjMagic));
+    BigObjHeader.unused1 = 0;
+    BigObjHeader.unused2 = 0;
+    BigObjHeader.unused3 = 0;
+    BigObjHeader.unused4 = 0;
+    // The value in Obj.CoffFileHeader.NumberOfSections is truncated, thus
+    // get the original one instead.
+    BigObjHeader.NumberOfSections = Obj.getSections().size();
+    BigObjHeader.PointerToSymbolTable = Obj.CoffFileHeader.PointerToSymbolTable;
+    BigObjHeader.NumberOfSymbols = Obj.CoffFileHeader.NumberOfSymbols;
+
+    memcpy(Ptr, &BigObjHeader, sizeof(BigObjHeader));
+    Ptr += sizeof(BigObjHeader);
+  }
+  if (Obj.IsPE) {
+    if (Obj.Is64) {
+      memcpy(Ptr, &Obj.PeHeader, sizeof(Obj.PeHeader));
+      Ptr += sizeof(Obj.PeHeader);
+    } else {
+      pe32_header PeHeader;
+      copyPeHeader(PeHeader, Obj.PeHeader);
+      // The pe32plus_header (stored in Object) lacks the BaseOfData field.
+      PeHeader.BaseOfData = Obj.BaseOfData;
+
+      memcpy(Ptr, &PeHeader, sizeof(PeHeader));
+      Ptr += sizeof(PeHeader);
+    }
+    for (const auto &DD : Obj.DataDirectories) {
+      memcpy(Ptr, &DD, sizeof(DD));
+      Ptr += sizeof(DD);
+    }
+  }
+  for (const auto &S : Obj.getSections()) {
+    memcpy(Ptr, &S.Header, sizeof(S.Header));
+    Ptr += sizeof(S.Header);
+  }
+}
+
+void COFFWriter::writeSections() {
+  for (const auto &S : Obj.getSections()) {
+    uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
+                   S.Header.PointerToRawData;
+    ArrayRef<uint8_t> Contents = S.getContents();
+    std::copy(Contents.begin(), Contents.end(), Ptr);
+
+    // For executable sections, pad the remainder of the raw data size with
+    // 0xcc, which is int3 on x86.
+    if ((S.Header.Characteristics & IMAGE_SCN_CNT_CODE) &&
+        S.Header.SizeOfRawData > Contents.size())
+      memset(Ptr + Contents.size(), 0xcc,
+             S.Header.SizeOfRawData - Contents.size());
+
+    Ptr += S.Header.SizeOfRawData;
+
+    if (S.Relocs.size() >= 0xffff) {
+      object::coff_relocation R;
+      R.VirtualAddress = S.Relocs.size() + 1;
+      R.SymbolTableIndex = 0;
+      R.Type = 0;
+      memcpy(Ptr, &R, sizeof(R));
+      Ptr += sizeof(R);
+    }
+    for (const auto &R : S.Relocs) {
+      memcpy(Ptr, &R.Reloc, sizeof(R.Reloc));
+      Ptr += sizeof(R.Reloc);
+    }
+  }
+}
+
+template <class SymbolTy> void COFFWriter::writeSymbolStringTables() {
+  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
+                 Obj.CoffFileHeader.PointerToSymbolTable;
+  for (const auto &S : Obj.getSymbols()) {
+    // Convert symbols back to the right size, from coff_symbol32.
+    copySymbol<SymbolTy, coff_symbol32>(*reinterpret_cast<SymbolTy *>(Ptr),
+                                        S.Sym);
+    Ptr += sizeof(SymbolTy);
+    if (!S.AuxFile.empty()) {
+      // For file symbols, just write the string into the aux symbol slots,
+      // assuming that the unwritten parts are initialized to zero in the
+      // memory mapped file.
+      std::copy(S.AuxFile.begin(), S.AuxFile.end(), Ptr);
+      Ptr += S.Sym.NumberOfAuxSymbols * sizeof(SymbolTy);
+    } else {
+      // For other auxiliary symbols, write their opaque payload into one
+      // symbol table slot each. For big object files, the symbols are larger
+      // than the opaque auxiliary symbol struct and we leave padding at the
+      // end of each entry.
+      for (const AuxSymbol &AuxSym : S.AuxData) {
+        ArrayRef<uint8_t> Ref = AuxSym.getRef();
+        std::copy(Ref.begin(), Ref.end(), Ptr);
+        Ptr += sizeof(SymbolTy);
+      }
+    }
+  }
+  if (StrTabBuilder.getSize() > 4 || !Obj.IsPE) {
+    // Always write a string table in object files, even an empty one.
+    StrTabBuilder.write(Ptr);
+    Ptr += StrTabBuilder.getSize();
+  }
+}
+
+Error COFFWriter::write(bool IsBigObj) {
+  if (Error E = finalize(IsBigObj))
+    return E;
+
+  Buf = WritableMemoryBuffer::getNewMemBuffer(FileSize);
+  if (!Buf)
+    return createStringError(llvm::errc::not_enough_memory,
+                             "failed to allocate memory buffer of " +
+                                 Twine::utohexstr(FileSize) + " bytes.");
+
+  writeHeaders(IsBigObj);
+  writeSections();
+  if (IsBigObj)
+    writeSymbolStringTables<coff_symbol32>();
+  else
+    writeSymbolStringTables<coff_symbol16>();
+
+  if (Obj.IsPE)
+    if (Error E = patchDebugDirectory())
+      return E;
+
+  // TODO: Implement direct writing to the output stream (without intermediate
+  // memory buffer Buf).
+  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
+  return Error::success();
+}
+
+Expected<uint32_t> COFFWriter::virtualAddressToFileAddress(uint32_t RVA) {
+  for (const auto &S : Obj.getSections()) {
+    if (RVA >= S.Header.VirtualAddress &&
+        RVA < S.Header.VirtualAddress + S.Header.SizeOfRawData)
+      return S.Header.PointerToRawData + RVA - S.Header.VirtualAddress;
+  }
+  return createStringError(object_error::parse_failed,
+                           "debug directory payload not found");
+}
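The mapping above is a single subtraction once the containing section is found; a self-contained sketch with the section fields flattened into parameters (all values hypothetical):

#include <cstdint>
#include <optional>

// Mirror of virtualAddressToFileAddress() for one section.
std::optional<uint32_t> rvaToFileOffset(uint32_t RVA, uint32_t VirtualAddress,
                                        uint32_t SizeOfRawData,
                                        uint32_t PointerToRawData) {
  if (RVA < VirtualAddress || RVA >= VirtualAddress + SizeOfRawData)
    return std::nullopt; // RVA not backed by this section's raw data
  return PointerToRawData + (RVA - VirtualAddress);
}
// e.g. rvaToFileOffset(0x2010, 0x2000, 0x1000, 0x800) == 0x810.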
+
+// Locate which sections contain the debug directories, iterate over all
+// the debug_directory structs in there, and set the PointerToRawData field
+// in all of them, according to their new physical location in the file.
+Error COFFWriter::patchDebugDirectory() {
+  if (Obj.DataDirectories.size() <= DEBUG_DIRECTORY)
+    return Error::success();
+  const data_directory *Dir = &Obj.DataDirectories[DEBUG_DIRECTORY];
+  if (Dir->Size <= 0)
+    return Error::success();
+  for (const auto &S : Obj.getSections()) {
+    if (Dir->RelativeVirtualAddress >= S.Header.VirtualAddress &&
+        Dir->RelativeVirtualAddress <
+            S.Header.VirtualAddress + S.Header.SizeOfRawData) {
+      if (Dir->RelativeVirtualAddress + Dir->Size >
+          S.Header.VirtualAddress + S.Header.SizeOfRawData)
+        return createStringError(object_error::parse_failed,
+                                 "debug directory extends past end of section");
+
+      size_t Offset = Dir->RelativeVirtualAddress - S.Header.VirtualAddress;
+      uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
+                     S.Header.PointerToRawData + Offset;
+      uint8_t *End = Ptr + Dir->Size;
+      while (Ptr < End) {
+        debug_directory *Debug = reinterpret_cast<debug_directory *>(Ptr);
+        if (Debug->PointerToRawData) {
+          if (Expected<uint32_t> FilePosOrErr =
+                  virtualAddressToFileAddress(Debug->AddressOfRawData))
+            Debug->PointerToRawData = *FilePosOrErr;
+          else
+            return FilePosOrErr.takeError();
+        }
+        Ptr += sizeof(debug_directory);
+        Offset += sizeof(debug_directory);
+      }
+      // Debug directory found and patched, all done.
+      return Error::success();
+    }
+  }
+  return createStringError(object_error::parse_failed,
+                           "debug directory not found");
+}
+
+Error COFFWriter::write() {
+  bool IsBigObj = Obj.getSections().size() > MaxNumberOfSections16;
+  if (IsBigObj && Obj.IsPE)
+    return createStringError(object_error::parse_failed,
+                             "too many sections for executable");
+  return write(IsBigObj);
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h
new file mode 100644
index 000000000000..b7dca69e9a81
--- /dev/null
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h
@@ -0,0 +1,63 @@
+//===- COFFWriter.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
+#define LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
+
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstddef>
+#include <utility>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+struct Object;
+
+class COFFWriter {
+  Object &Obj;
+  std::unique_ptr<WritableMemoryBuffer> Buf;
+  raw_ostream &Out;
+
+  size_t FileSize;
+  size_t FileAlignment;
+  size_t SizeOfInitializedData;
+  StringTableBuilder StrTabBuilder;
+
+  template <class SymbolTy> std::pair<size_t, size_t> finalizeSymbolTable();
+  Error finalizeRelocTargets();
+  Error finalizeSymbolContents();
+  void layoutSections();
+  Expected<size_t> finalizeStringTable();
+
+  Error finalize(bool IsBigObj);
+
+  void writeHeaders(bool IsBigObj);
+  void writeSections();
+  template <class SymbolTy> void writeSymbolStringTables();
+
+  Error write(bool IsBigObj);
+
+  Error patchDebugDirectory();
+  Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA);
+
+public:
+  virtual ~COFFWriter() {}
+  Error write();
+
+  COFFWriter(Object &Obj, raw_ostream &Out)
+      : Obj(Obj), Out(Out), StrTabBuilder(StringTableBuilder::WinCOFF) {}
+};
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
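Putting the two halves together, a hedged sketch of the full COFF round trip (this mirrors what the COFF executeObjcopyOnBinary entry point, not shown in this hunk, presumably does; the function name copyCOFF and the elided edit step are assumptions):

#include "COFFObject.h"
#include "COFFReader.h"
#include "COFFWriter.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::objcopy::coff;

// Hypothetical end-to-end use of the reader/writer pair above.
Error copyCOFF(const object::COFFObjectFile &In, raw_ostream &Out) {
  COFFReader Reader(In);
  Expected<std::unique_ptr<Object>> ObjOrErr = Reader.create();
  if (!ObjOrErr)
    return ObjOrErr.takeError();
  // ... apply section/symbol edits to **ObjOrErr here ...
  COFFWriter Writer(**ObjOrErr, Out);
  return Writer.write();
}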
diff --git a/llvm/lib/ObjCopy/CommonConfig.cpp b/llvm/lib/ObjCopy/CommonConfig.cpp
new file mode 100644
index 000000000000..e85715d0c44c
--- /dev/null
+++ b/llvm/lib/ObjCopy/CommonConfig.cpp
@@ -0,0 +1,50 @@
+//===- CommonConfig.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/CommonConfig.h"
+
+namespace llvm {
+namespace objcopy {
+
+Expected<NameOrPattern>
+NameOrPattern::create(StringRef Pattern, MatchStyle MS,
+                      function_ref<Error(Error)> ErrorCallback) {
+  switch (MS) {
+  case MatchStyle::Literal:
+    return NameOrPattern(Pattern);
+  case MatchStyle::Wildcard: {
+    SmallVector<char, 32> Data;
+    bool IsPositiveMatch = true;
+    if (Pattern[0] == '!') {
+      IsPositiveMatch = false;
+      Pattern = Pattern.drop_front();
+    }
+    Expected<GlobPattern> GlobOrErr = GlobPattern::create(Pattern);
+
+    // If we couldn't create it as a glob, report the error, but try again
+    // with a literal if the error reporting is non-fatal.
+    if (!GlobOrErr) {
+      if (Error E = ErrorCallback(GlobOrErr.takeError()))
+        return std::move(E);
+      return create(Pattern, MatchStyle::Literal, ErrorCallback);
+    }
+
+    return NameOrPattern(std::make_shared<GlobPattern>(*GlobOrErr),
+                         IsPositiveMatch);
+  }
+  case MatchStyle::Regex: {
+    SmallVector<char, 32> Data;
+    return NameOrPattern(std::make_shared<Regex>(
+        ("^" + Pattern.ltrim('^').rtrim('$') + "$").toStringRef(Data)));
+  }
+  }
+  llvm_unreachable("Unhandled llvm.objcopy.MatchStyle enum");
+}
+
+} // end namespace objcopy
+} // end namespace llvm
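A sketch of how a caller might exercise the wildcard path above: a leading '!' negates the match, and a malformed glob is retried as a literal when the callback swallows the error (the pattern string is made up):

#include "llvm/ObjCopy/CommonConfig.h"
#include "llvm/Support/Error.h"

using namespace llvm;
using namespace llvm::objcopy;

// Hypothetical caller: matches every name NOT starting with ".debug_".
Expected<NameOrPattern> Pat = NameOrPattern::create(
    "!.debug_*", MatchStyle::Wildcard,
    [](Error E) { consumeError(std::move(E)); return Error::success(); });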
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
new file mode 100644
index 000000000000..9d8883a15c0b
--- /dev/null
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -0,0 +1,97 @@
+//===- ConfigManager.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/ConfigManager.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace objcopy {
+
+Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
+  if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+      !Common.AllocSectionsPrefix.empty() || !Common.DumpSection.empty() ||
+      !Common.KeepSection.empty() || !Common.SymbolsToGlobalize.empty() ||
+      !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() ||
+      !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() ||
+      !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
+      Common.ExtractDWO || Common.PreserveDates || Common.StripDWO ||
+      Common.StripNonAlloc || Common.StripSections || Common.Weaken ||
+      Common.DecompressDebugSections ||
+      Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty())
+    return createStringError(llvm::errc::invalid_argument,
+                             "option is not supported for COFF");
+
+  return COFF;
+}
+
+Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
+  if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+      !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() ||
+      !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() ||
+      !Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() ||
+      !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() ||
+      !Common.UnneededSymbolsToRemove.empty() ||
+      !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() ||
+      Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU ||
+      Common.StripDWO || Common.StripNonAlloc || Common.StripSections ||
+      Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded ||
+      Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty())
+    return createStringError(llvm::errc::invalid_argument,
+                             "option is not supported for MachO");
+
+  return MachO;
+}
+
+Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
+  if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition ||
+      !Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+      !Common.AllocSectionsPrefix.empty() ||
+      Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() ||
+      !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() ||
+      !Common.SymbolsToKeep.empty() || !Common.SymbolsToRemove.empty() ||
+      !Common.UnneededSymbolsToRemove.empty() ||
+      !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() ||
+      !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
+      !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty())
+    return createStringError(llvm::errc::invalid_argument,
+                             "only flags for section dumping, removal, and "
+                             "addition are supported");
+
+  return Wasm;
+}
+
+Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const {
+  if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition ||
+      !Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+      !Common.AllocSectionsPrefix.empty() ||
+      Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() ||
+      !Common.DumpSection.empty() || !Common.SymbolsToAdd.empty() ||
+      !Common.KeepSection.empty() || !Common.OnlySection.empty() ||
+      !Common.ToRemove.empty() || !Common.SymbolsToGlobalize.empty() ||
+      !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() ||
+      !Common.SymbolsToRemove.empty() ||
+      !Common.UnneededSymbolsToRemove.empty() ||
+      !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() ||
+      !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
+      !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty() ||
+      Common.ExtractDWO || Common.ExtractMainPartition ||
+      Common.OnlyKeepDebug || Common.PreserveDates || Common.StripAllGNU ||
+      Common.StripDWO || Common.StripDebug || Common.StripNonAlloc ||
+      Common.StripSections || Common.Weaken || Common.StripUnneeded ||
+      Common.DecompressDebugSections) {
+    return createStringError(
+        llvm::errc::invalid_argument,
+        "no flags are supported yet, only basic copying is allowed");
+  }
+
+  return XCOFF;
+}
+
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
new file mode 100644
index 000000000000..2d388f8a867e
--- /dev/null
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -0,0 +1,821 @@
+//===- ELFObjcopy.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/ELF/ELFObjcopy.h"
+#include "ELFObject.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/ObjCopy/ELF/ELFConfig.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::ELF;
+using namespace llvm::objcopy;
+using namespace llvm::objcopy::elf;
+using namespace llvm::object;
+
+using SectionPred = std::function<bool(const SectionBase &Sec)>;
+
+static bool isDebugSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).startswith(".debug") || Sec.Name == ".gdb_index";
+}
+
+static bool isDWOSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).endswith(".dwo");
+}
+
+static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
+  // We can't remove the section header string table.
+  if (&Sec == Obj.SectionNames)
+    return false;
+  // Short of keeping the string table we want to keep everything that is a DWO
+  // section and remove everything else.
+  return !isDWOSection(Sec);
+}
+
+static uint64_t getNewShfFlags(SectionFlag AllFlags) {
+  uint64_t NewFlags = 0;
+  if (AllFlags & SectionFlag::SecAlloc)
+    NewFlags |= ELF::SHF_ALLOC;
+  if (!(AllFlags & SectionFlag::SecReadonly))
+    NewFlags |= ELF::SHF_WRITE;
+  if (AllFlags & SectionFlag::SecCode)
+    NewFlags |= ELF::SHF_EXECINSTR;
+  if (AllFlags & SectionFlag::SecMerge)
+    NewFlags |= ELF::SHF_MERGE;
+  if (AllFlags & SectionFlag::SecStrings)
+    NewFlags |= ELF::SHF_STRINGS;
+  if (AllFlags & SectionFlag::SecExclude)
+    NewFlags |= ELF::SHF_EXCLUDE;
+  return NewFlags;
+}
+
+static uint64_t getSectionFlagsPreserveMask(uint64_t OldFlags,
+                                            uint64_t NewFlags) {
+  // Preserve some flags which should not be dropped when setting flags.
+  // Also, preserve anything OS/processor dependent.
+  const uint64_t PreserveMask =
+      (ELF::SHF_COMPRESSED | ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
+       ELF::SHF_MASKOS | ELF::SHF_MASKPROC | ELF::SHF_TLS |
+       ELF::SHF_INFO_LINK) &
+      ~ELF::SHF_EXCLUDE;
+  return (OldFlags & PreserveMask) | (NewFlags & ~PreserveMask);
+}
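A worked trace of the two helpers above, written as if it lived in this same translation unit (the values follow directly from the code; assuming SectionFlag supports bitwise-or via LLVM's bitmask-enum machinery):

// Requesting "alloc,readonly,code": SHF_WRITE is omitted because SecReadonly
// is set, and SHF_TLS survives from the old flags via PreserveMask.
static uint64_t exampleSectionFlags() {
  uint64_t New = getNewShfFlags(SectionFlag::SecAlloc |
                                SectionFlag::SecReadonly |
                                SectionFlag::SecCode);
  // New == SHF_ALLOC | SHF_EXECINSTR
  return getSectionFlagsPreserveMask(SHF_ALLOC | SHF_WRITE | SHF_TLS, New);
  // == SHF_ALLOC | SHF_EXECINSTR | SHF_TLS (SHF_WRITE dropped, SHF_TLS kept)
}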
+
+static void setSectionFlagsAndType(SectionBase &Sec, SectionFlag Flags) {
+  Sec.Flags = getSectionFlagsPreserveMask(Sec.Flags, getNewShfFlags(Flags));
+
+  // In GNU objcopy, certain flags promote SHT_NOBITS to SHT_PROGBITS. This
+  // rule may promote more non-ALLOC sections than GNU objcopy, but it is fine
+  // as non-ALLOC SHT_NOBITS sections do not make much sense.
+  if (Sec.Type == SHT_NOBITS &&
+      (!(Sec.Flags & ELF::SHF_ALLOC) ||
+       Flags & (SectionFlag::SecContents | SectionFlag::SecLoad)))
+    Sec.Type = SHT_PROGBITS;
+}
+
+static ElfType getOutputElfType(const Binary &Bin) {
+  // Infer output ELF type from the input ELF object
+  if (isa<ELFObjectFile<ELF32LE>>(Bin))
+    return ELFT_ELF32LE;
+  if (isa<ELFObjectFile<ELF64LE>>(Bin))
+    return ELFT_ELF64LE;
+  if (isa<ELFObjectFile<ELF32BE>>(Bin))
+    return ELFT_ELF32BE;
+  if (isa<ELFObjectFile<ELF64BE>>(Bin))
+    return ELFT_ELF64BE;
+  llvm_unreachable("Invalid ELFType");
+}
+
+static ElfType getOutputElfType(const MachineInfo &MI) {
+  // Infer output ELF type from the binary arch specified
+  if (MI.Is64Bit)
+    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
+  else
+    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
+}
+
+static std::unique_ptr<Writer> createELFWriter(const CommonConfig &Config,
+                                               Object &Obj, raw_ostream &Out,
+                                               ElfType OutputElfType) {
+  // Depending on the initial ELFT and OutputFormat we need a different Writer.
+  switch (OutputElfType) {
+  case ELFT_ELF32LE:
+    return std::make_unique<ELFWriter<ELF32LE>>(Obj, Out,
+                                                !Config.StripSections,
+                                                Config.OnlyKeepDebug);
+  case ELFT_ELF64LE:
+    return std::make_unique<ELFWriter<ELF64LE>>(Obj, Out,
+                                                !Config.StripSections,
+                                                Config.OnlyKeepDebug);
+  case ELFT_ELF32BE:
+    return std::make_unique<ELFWriter<ELF32BE>>(Obj, Out,
+                                                !Config.StripSections,
+                                                Config.OnlyKeepDebug);
+  case ELFT_ELF64BE:
+    return std::make_unique<ELFWriter<ELF64BE>>(Obj, Out,
+                                                !Config.StripSections,
+                                                Config.OnlyKeepDebug);
+  }
+  llvm_unreachable("Invalid output format");
+}
+
+static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
+                                            Object &Obj, raw_ostream &Out,
+                                            ElfType OutputElfType) {
+  switch (Config.OutputFormat) {
+  case FileFormat::Binary:
+    return std::make_unique<BinaryWriter>(Obj, Out);
+  case FileFormat::IHex:
+    return std::make_unique<IHexWriter>(Obj, Out);
+  default:
+    return createELFWriter(Config, Obj, Out, OutputElfType);
+  }
+}
+
+template <class... Ts>
+static Error makeStringError(std::error_code EC, const Twine &Msg,
+                             Ts &&...Args) {
+  std::string FullMsg = (EC.message() + ": " + Msg).str();
+  return createStringError(EC, FullMsg.c_str(), std::forward<Ts>(Args)...);
+}
+
+static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
+                               Object &Obj) {
+  for (auto &Sec : Obj.sections()) {
+    if (Sec.Name == SecName) {
+      if (Sec.Type == SHT_NOBITS)
+        return createStringError(object_error::parse_failed,
+                                 "cannot dump section '%s': it has no contents",
+                                 SecName.str().c_str());
+      Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+          FileOutputBuffer::create(Filename, Sec.OriginalData.size());
+      if (!BufferOrErr)
+        return BufferOrErr.takeError();
+      std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
+      std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
+                Buf->getBufferStart());
+      if (Error E = Buf->commit())
+        return E;
+      return Error::success();
+    }
+  }
+  return createStringError(object_error::parse_failed,
+                           "section '%s' not found", SecName.str().c_str());
+}
+
+static bool isCompressable(const SectionBase &Sec) {
+  return !(Sec.Flags & ELF::SHF_COMPRESSED) &&
+         StringRef(Sec.Name).startswith(".debug");
+}
+
+static Error replaceDebugSections(
+    Object &Obj, function_ref<bool(const SectionBase &)> ShouldReplace,
+    function_ref<Expected<SectionBase *>(const SectionBase *)> AddSection) {
+  // Build a list of the debug sections we are going to replace.
+  // We can't call `AddSection` while iterating over sections,
+  // because it would mutate the sections array.
+  SmallVector<SectionBase *, 13> ToReplace;
+  for (auto &Sec : Obj.sections())
+    if (ShouldReplace(Sec))
+      ToReplace.push_back(&Sec);
+
+  // Build a mapping from original section to a new one.
+  DenseMap<SectionBase *, SectionBase *> FromTo;
+  for (SectionBase *S : ToReplace) {
+    Expected<SectionBase *> NewSection = AddSection(S);
+    if (!NewSection)
+      return NewSection.takeError();
+
+    FromTo[S] = *NewSection;
+  }
+
+  return Obj.replaceSections(FromTo);
+}
+
+static bool isAArch64MappingSymbol(const Symbol &Sym) {
+  if (Sym.Binding != STB_LOCAL || Sym.Type != STT_NOTYPE ||
+      Sym.getShndx() == SHN_UNDEF)
+    return false;
+  StringRef Name = Sym.Name;
+  if (!Name.consume_front("$x") && !Name.consume_front("$d"))
+    return false;
+  return Name.empty() || Name.startswith(".");
+}
+
+static bool isArmMappingSymbol(const Symbol &Sym) {
+  if (Sym.Binding != STB_LOCAL || Sym.Type != STT_NOTYPE ||
+      Sym.getShndx() == SHN_UNDEF)
+    return false;
+  StringRef Name = Sym.Name;
+  if (!Name.consume_front("$a") && !Name.consume_front("$d") &&
+      !Name.consume_front("$t"))
+    return false;
+  return Name.empty() || Name.startswith(".");
+}
+
+// Check if the symbol should be preserved because it is required by ABI.
+static bool isRequiredByABISymbol(const Object &Obj, const Symbol &Sym) {
+  switch (Obj.Machine) {
+  case EM_AARCH64:
+    // Mapping symbols should be preserved for a relocatable object file.
+    return Obj.isRelocatable() && isAArch64MappingSymbol(Sym);
+  case EM_ARM:
+    // Mapping symbols should be preserved for a relocatable object file.
+    return Obj.isRelocatable() && isArmMappingSymbol(Sym);
+  default:
+    return false;
+  }
+}
+
+static bool isUnneededSymbol(const Symbol &Sym) {
+  return !Sym.Referenced &&
+         (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
+         Sym.Type != STT_SECTION;
+}
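The name test inside the two mapping-symbol predicates above, extracted for illustration (binding/type/shndx checks omitted; behavior follows directly from the code):

#include "llvm/ADT/StringRef.h"

// AArch64 variant: accepts "$x"/"$d", bare or with a "."-suffix.
static bool isAArch64MappingSymbolName(llvm::StringRef Name) {
  if (!Name.consume_front("$x") && !Name.consume_front("$d"))
    return false;
  return Name.empty() || Name.startswith(".");
}
// isAArch64MappingSymbolName("$x") and ("$d.relro") are true; ("$xyz") is
// false because the remainder "yz" is neither empty nor starts with ".".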
+
+static Error updateAndRemoveSymbols(const CommonConfig &Config,
+                                    const ELFConfig &ELFConfig, Object &Obj) {
+  // TODO: update or remove symbols only if there is an option that affects
+  // them.
+  if (!Obj.SymbolTable)
+    return Error::success();
+
+  Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+    // Common and undefined symbols don't make sense as local symbols, and can
+    // even cause crashes if we localize those, so skip them.
+    if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF &&
+        ((ELFConfig.LocalizeHidden &&
+          (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+         Config.SymbolsToLocalize.matches(Sym.Name)))
+      Sym.Binding = STB_LOCAL;
+
+    // Note: these two globalize flags have very similar names but different
+    // meanings:
+    //
+    // --globalize-symbol: promote a symbol to global
+    // --keep-global-symbol: all symbols except for these should be made local
+    //
+    // If --globalize-symbol is specified for a given symbol, it will be
+    // global in the output file even if it is not included via
+    // --keep-global-symbol. Because of that, make sure to check
+    // --globalize-symbol second.
+    if (!Config.SymbolsToKeepGlobal.empty() &&
+        !Config.SymbolsToKeepGlobal.matches(Sym.Name) &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_LOCAL;
+
+    if (Config.SymbolsToGlobalize.matches(Sym.Name) &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_GLOBAL;
+
+    // SymbolsToWeaken applies to both STB_GLOBAL and STB_GNU_UNIQUE.
+    if (Config.SymbolsToWeaken.matches(Sym.Name) && Sym.Binding != STB_LOCAL)
+      Sym.Binding = STB_WEAK;
+
+    if (Config.Weaken && Sym.Binding != STB_LOCAL &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_WEAK;
+
+    const auto I = Config.SymbolsToRename.find(Sym.Name);
+    if (I != Config.SymbolsToRename.end())
+      Sym.Name = std::string(I->getValue());
+
+    if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
+      Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
+  });
+
+  // The purpose of this loop is to mark symbols referenced by sections
+  // (like GroupSection or RelocationSection). This way, we know which
+  // symbols are still 'needed' and which are not.
+  if (Config.StripUnneeded || !Config.UnneededSymbolsToRemove.empty() ||
+      !Config.OnlySection.empty()) {
+    for (SectionBase &Sec : Obj.sections())
+      Sec.markSymbols();
+  }
+
+  auto RemoveSymbolsPred = [&](const Symbol &Sym) {
+    if (Config.SymbolsToKeep.matches(Sym.Name) ||
+        (ELFConfig.KeepFileSymbols && Sym.Type == STT_FILE))
+      return false;
+
+    if (Config.SymbolsToRemove.matches(Sym.Name))
+      return true;
+
+    if (Config.StripAll || Config.StripAllGNU)
+      return true;
+
+    if (isRequiredByABISymbol(Obj, Sym))
+      return false;
+
+    if (Config.StripDebug && Sym.Type == STT_FILE)
+      return true;
+
+    if ((Config.DiscardMode == DiscardType::All ||
+         (Config.DiscardMode == DiscardType::Locals &&
+          StringRef(Sym.Name).startswith(".L"))) &&
+        Sym.Binding == STB_LOCAL && Sym.getShndx() != SHN_UNDEF &&
+        Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
+      return true;
+
+    if ((Config.StripUnneeded ||
+         Config.UnneededSymbolsToRemove.matches(Sym.Name)) &&
+        (!Obj.isRelocatable() || isUnneededSymbol(Sym)))
+      return true;
+
+    // We want to remove undefined symbols if all references have been
+    // stripped.
+    if (!Config.OnlySection.empty() && !Sym.Referenced &&
+        Sym.getShndx() == SHN_UNDEF)
+      return true;
+
+    return false;
+  };
+
+  return Obj.removeSymbols(RemoveSymbolsPred);
+}
+
+static Error replaceAndRemoveSections(const CommonConfig &Config,
+                                      const ELFConfig &ELFConfig,
+                                      Object &Obj) {
+  SectionPred RemovePred = [](const SectionBase &) { return false; };
+
+  // Removes:
+  if (!Config.ToRemove.empty()) {
+    RemovePred = [&Config](const SectionBase &Sec) {
+      return Config.ToRemove.matches(Sec.Name);
+    };
+  }
+
+  if (Config.StripDWO)
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return isDWOSection(Sec) || RemovePred(Sec);
+    };
+
+  if (Config.ExtractDWO)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
+    };
+
+  if (Config.StripAllGNU)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if ((Sec.Flags & SHF_ALLOC) != 0)
+        return false;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      switch (Sec.Type) {
+      case SHT_SYMTAB:
+      case SHT_REL:
+      case SHT_RELA:
+      case SHT_STRTAB:
+        return true;
+      }
+      return isDebugSection(Sec);
+    };
+
+  if (Config.StripSections) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || Sec.ParentSegment == nullptr;
+    };
+  }
+
+  if (Config.StripDebug || Config.StripUnneeded) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || isDebugSection(Sec);
+    };
+  }
+
+  if (Config.StripNonAlloc)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0 && Sec.ParentSegment == nullptr;
+    };
+
+  if (Config.StripAll)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      if (StringRef(Sec.Name).startswith(".gnu.warning"))
+        return false;
+      // We keep the .ARM.attributes section to maintain compatibility
+      // with Debian derived distributions. This is a bug in their
+      // patchset as documented here:
+      // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=943798
+      if (Sec.Type == SHT_ARM_ATTRIBUTES)
+        return false;
+      if (Sec.ParentSegment != nullptr)
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0;
+    };
+
+  if (Config.ExtractPartition || Config.ExtractMainPartition) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (Sec.Type == SHT_LLVM_PART_EHDR || Sec.Type == SHT_LLVM_PART_PHDR)
+        return true;
+      return (Sec.Flags & SHF_ALLOC) != 0 && !Sec.ParentSegment;
+    };
+  }
+
+  // Explicit copies:
+  if (!Config.OnlySection.empty()) {
+    RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (Config.OnlySection.matches(Sec.Name))
+        return false;
+
+      // Allow all implicit removes.
+      if (RemovePred(Sec))
+        return true;
+
+      // Keep special sections.
+      if (Obj.SectionNames == &Sec)
+        return false;
+      if (Obj.SymbolTable == &Sec ||
+          (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
+        return false;
+
+      // Remove everything else.
+      return true;
+    };
+  }
+
+  if (!Config.KeepSection.empty()) {
+    RemovePred = [&Config, RemovePred](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (Config.KeepSection.matches(Sec.Name))
+        return false;
+      // Otherwise defer to RemovePred.
+      return RemovePred(Sec);
+    };
+  }
+
+  // This has to be the last predicate assignment.
+  // If the option --keep-symbol has been specified
+  // and at least one of those symbols is present
+  // (equivalently, the updated symbol table is not empty)
+  // the symbol table and the string table should not be removed.
+  if ((!Config.SymbolsToKeep.empty() || ELFConfig.KeepFileSymbols) &&
+      Obj.SymbolTable && !Obj.SymbolTable->empty()) {
+    RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
+      if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
+        return false;
+      return RemovePred(Sec);
+    };
+  }
+
+  if (Error E = Obj.removeSections(ELFConfig.AllowBrokenLinks, RemovePred))
+    return E;
+
+  if (Config.CompressionType != DebugCompressionType::None) {
+    if (Error Err = replaceDebugSections(
+            Obj, isCompressable,
+            [&Config, &Obj](const SectionBase *S) -> Expected<SectionBase *> {
+              return &Obj.addSection<CompressedSection>(
+                  CompressedSection(*S, Config.CompressionType));
+            }))
+      return Err;
+  } else if (Config.DecompressDebugSections) {
+    if (Error Err = replaceDebugSections(
+            Obj,
+            [](const SectionBase &S) { return isa<CompressedSection>(&S); },
+            [&Obj](const SectionBase *S) {
+              const CompressedSection *CS = cast<CompressedSection>(S);
+              return &Obj.addSection<DecompressedSection>(*CS);
+            }))
+      return Err;
+  }
+
+  return Error::success();
+}
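The composition pattern used throughout the function above, reduced to its core: each option wraps the previous predicate by value, so earlier removal decisions are respected and later "keep" layers can veto them. A sketch as if placed in this translation unit (the ".keep_me" name is made up):

SectionPred Base = [](const SectionBase &) { return false; };
SectionPred WithDebug = [Base](const SectionBase &Sec) {
  return Base(Sec) || isDebugSection(Sec); // add one removal rule
};
SectionPred KeepSome = [WithDebug](const SectionBase &Sec) {
  return Sec.Name == ".keep_me" ? false : WithDebug(Sec); // keep overrides
};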
+
+// Add symbol to the Object symbol table with the specified properties.
+static void addSymbol(Object &Obj, const NewSymbolInfo &SymInfo,
+                      uint8_t DefaultVisibility) {
+  SectionBase *Sec = Obj.findSection(SymInfo.SectionName);
+  uint64_t Value = Sec ? Sec->Addr + SymInfo.Value : SymInfo.Value;
+
+  uint8_t Bind = ELF::STB_GLOBAL;
+  uint8_t Type = ELF::STT_NOTYPE;
+  uint8_t Visibility = DefaultVisibility;
+
+  for (SymbolFlag FlagValue : SymInfo.Flags)
+    switch (FlagValue) {
+    case SymbolFlag::Global:
+      Bind = ELF::STB_GLOBAL;
+      break;
+    case SymbolFlag::Local:
+      Bind = ELF::STB_LOCAL;
+      break;
+    case SymbolFlag::Weak:
+      Bind = ELF::STB_WEAK;
+      break;
+    case SymbolFlag::Default:
+      Visibility = ELF::STV_DEFAULT;
+      break;
+    case SymbolFlag::Hidden:
+      Visibility = ELF::STV_HIDDEN;
+      break;
+    case SymbolFlag::Protected:
+      Visibility = ELF::STV_PROTECTED;
+      break;
+    case SymbolFlag::File:
+      Type = ELF::STT_FILE;
+      break;
+    case SymbolFlag::Section:
+      Type = ELF::STT_SECTION;
+      break;
+    case SymbolFlag::Object:
+      Type = ELF::STT_OBJECT;
+      break;
+    case SymbolFlag::Function:
+      Type = ELF::STT_FUNC;
+      break;
+    case SymbolFlag::IndirectFunction:
+      Type = ELF::STT_GNU_IFUNC;
+      break;
+    default: /* Other flag values are ignored for ELF. */
+      break;
+    };
+
+  Obj.SymbolTable->addSymbol(
+      SymInfo.SymbolName, Bind, Type, Sec, Value, Visibility,
+      Sec ? (uint16_t)SYMBOL_SIMPLE_INDEX : (uint16_t)SHN_ABS, 0);
+}
+
+static Error
+handleUserSection(const NewSectionInfo &NewSection,
+                  function_ref<Error(StringRef, ArrayRef<uint8_t>)> F) {
+  ArrayRef<uint8_t> Data(reinterpret_cast<const uint8_t *>(
+                             NewSection.SectionData->getBufferStart()),
+                         NewSection.SectionData->getBufferSize());
+  return F(NewSection.SectionName, Data);
+}
+
+// This function handles the high level operations of GNU objcopy including
+// handling command line options. It's important to outline certain properties
+// we expect to hold of the command line operations. Any operation that "keeps"
+// should keep regardless of a remove. Additionally, any removal should respect
+// any previous removals. Lastly, whether or not something is removed shouldn't
+// depend a) on the order the options occur in or b) on some opaque priority
+// system. The only priority is that keeps/copies overrule removes.
+static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
+                        Object &Obj) {
+  if (Config.OutputArch) {
+    Obj.Machine = Config.OutputArch.getValue().EMachine;
+    Obj.OSABI = Config.OutputArch.getValue().OSABI;
+  }
+
+  if (!Config.SplitDWO.empty() && Config.ExtractDWO) {
+    return Obj.removeSections(
+        ELFConfig.AllowBrokenLinks,
+        [&Obj](const SectionBase &Sec) { return onlyKeepDWOPred(Obj, Sec); });
+  }
+
+  // Dump sections before add/remove for compatibility with GNU objcopy.
+  for (StringRef Flag : Config.DumpSection) {
+    StringRef SectionName;
+    StringRef FileName;
+    std::tie(SectionName, FileName) = Flag.split('=');
+    if (Error E = dumpSectionToFile(SectionName, FileName, Obj))
+      return E;
+  }
+
+  // It is important to remove the sections first. For example, we want to
+  // remove the relocation sections before removing the symbols. That allows
+  // us to avoid reporting the inappropriate errors about removing symbols
+  // named in relocations.
+  if (Error E = replaceAndRemoveSections(Config, ELFConfig, Obj))
+    return E;
+
+  if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj))
+    return E;
+
+  if (!Config.SectionsToRename.empty()) {
+    std::vector<RelocationSectionBase *> RelocSections;
+    DenseSet<SectionBase *> RenamedSections;
+    for (SectionBase &Sec : Obj.sections()) {
+      auto *RelocSec = dyn_cast<RelocationSectionBase>(&Sec);
+      const auto Iter = Config.SectionsToRename.find(Sec.Name);
+      if (Iter != Config.SectionsToRename.end()) {
+        const SectionRename &SR = Iter->second;
+        Sec.Name = std::string(SR.NewName);
+        if (SR.NewFlags)
+          setSectionFlagsAndType(Sec, SR.NewFlags.getValue());
+        RenamedSections.insert(&Sec);
+      } else if (RelocSec && !(Sec.Flags & SHF_ALLOC))
+        // Postpone processing relocation sections which are not specified in
+        // their explicit '--rename-section' commands until after their target
+        // sections are renamed.
+        // Dynamic relocation sections (i.e. ones with SHF_ALLOC) should be
+        // renamed only explicitly. Otherwise, renaming, for example,
+        // '.got.plt' would affect '.rela.plt', which is not desirable.
+        RelocSections.push_back(RelocSec);
+    }
+
+    // Rename relocation sections according to their target sections.
+    for (RelocationSectionBase *RelocSec : RelocSections) {
+      auto Iter = RenamedSections.find(RelocSec->getSection());
+      if (Iter != RenamedSections.end())
+        RelocSec->Name = (RelocSec->getNamePrefix() + (*Iter)->Name).str();
+    }
+  }
+
+  // Add a prefix to allocated sections and their relocation sections. This
+  // should be done after renaming the section by Config.SectionToRename to
+  // imitate the GNU objcopy behavior.
+  if (!Config.AllocSectionsPrefix.empty()) {
+    DenseSet<SectionBase *> PrefixedSections;
+    for (SectionBase &Sec : Obj.sections()) {
+      if (Sec.Flags & SHF_ALLOC) {
+        Sec.Name = (Config.AllocSectionsPrefix + Sec.Name).str();
+        PrefixedSections.insert(&Sec);
+      } else if (auto *RelocSec = dyn_cast<RelocationSectionBase>(&Sec)) {
+        // Rename relocation sections associated to the allocated sections.
+        // For example, if we rename .text to .prefix.text, we also rename
+        // .rel.text to .rel.prefix.text.
+        //
+        // Dynamic relocation sections (SHT_REL[A] with SHF_ALLOC) are handled
+        // above, e.g., .rela.plt is renamed to .prefix.rela.plt, not
+        // .rela.prefix.plt since GNU objcopy does so.
+        const SectionBase *TargetSec = RelocSec->getSection();
+        if (TargetSec && (TargetSec->Flags & SHF_ALLOC)) {
+          // If the relocation section comes *after* the target section, we
+          // don't add Config.AllocSectionsPrefix because we've already added
+          // the prefix to TargetSec->Name. Otherwise, if the relocation
+          // section comes *before* the target section, we add the prefix.
+          if (PrefixedSections.count(TargetSec))
+            Sec.Name = (RelocSec->getNamePrefix() + TargetSec->Name).str();
+          else
+            Sec.Name = (RelocSec->getNamePrefix() +
+                        Config.AllocSectionsPrefix + TargetSec->Name)
+                           .str();
+        }
+      }
+    }
+  }
+
+  if (!Config.SetSectionAlignment.empty()) {
+    for (SectionBase &Sec : Obj.sections()) {
+      auto I = Config.SetSectionAlignment.find(Sec.Name);
+      if (I != Config.SetSectionAlignment.end())
+        Sec.Align = I->second;
+    }
+  }
+
+  if (Config.OnlyKeepDebug)
+    for (auto &Sec : Obj.sections())
+      if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE)
+        Sec.Type = SHT_NOBITS;
+
+  for (const NewSectionInfo &AddedSection : Config.AddSection) {
+    auto AddSection = [&](StringRef Name, ArrayRef<uint8_t> Data) {
+      OwnedDataSection &NewSection =
+          Obj.addSection<OwnedDataSection>(Name, Data);
+      if (Name.startswith(".note") && Name != ".note.GNU-stack")
+        NewSection.Type = SHT_NOTE;
+      return Error::success();
+    };
+    if (Error E = handleUserSection(AddedSection, AddSection))
+      return E;
+  }
+
+  for (const NewSectionInfo &NewSection : Config.UpdateSection) {
+    auto UpdateSection = [&](StringRef Name, ArrayRef<uint8_t> Data) {
+      return Obj.updateSection(Name, Data);
+    };
+    if (Error E = handleUserSection(NewSection, UpdateSection))
+      return E;
+  }
+
+  if (!Config.AddGnuDebugLink.empty())
+    Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink,
+                                        Config.GnuDebugLinkCRC32);
+
+  // If the symbol table was previously removed, we need to create a new one
+  // before adding new symbols.
+  if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty())
+    if (Error E = Obj.addNewSymbolTable())
+      return E;
+
+  for (const NewSymbolInfo &SI : Config.SymbolsToAdd)
+    addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility);
+
+  // --set-section-flags works with sections added by --add-section.
+  if (!Config.SetSectionFlags.empty()) {
+    for (auto &Sec : Obj.sections()) {
+      const auto Iter = Config.SetSectionFlags.find(Sec.Name);
+      if (Iter != Config.SetSectionFlags.end()) {
+        const SectionFlagsUpdate &SFU = Iter->second;
+        setSectionFlagsAndType(Sec, SFU.NewFlags);
+      }
+    }
+  }
+
+  if (ELFConfig.EntryExpr)
+    Obj.Entry = ELFConfig.EntryExpr(Obj.Entry);
+  return Error::success();
+}
+
+static Error writeOutput(const CommonConfig &Config, Object &Obj,
+                         raw_ostream &Out, ElfType OutputElfType) {
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, Obj, Out, OutputElfType);
+  if (Error E = Writer->finalize())
+    return E;
+  return Writer->write();
+}
+
+Error objcopy::elf::executeObjcopyOnIHex(const CommonConfig &Config,
+                                         const ELFConfig &ELFConfig,
+                                         MemoryBuffer &In, raw_ostream &Out) {
+  IHexReader Reader(&In);
+  Expected<std::unique_ptr<Object>> Obj = Reader.create(true);
+  if (!Obj)
+    return Obj.takeError();
+
+  const ElfType OutputElfType =
+      getOutputElfType(Config.OutputArch.value_or(MachineInfo()));
+  if (Error E = handleArgs(Config, ELFConfig, **Obj))
+    return E;
+  return writeOutput(Config, **Obj, Out, OutputElfType);
+}
+  const ElfType OutputElfType =
+      getOutputElfType(Config.OutputArch.value_or(MachineInfo()));
+  if (Error E = handleArgs(Config, ELFConfig, **Obj))
+    return E;
+  return writeOutput(Config, **Obj, Out, OutputElfType);
+}
+
+Error objcopy::elf::executeObjcopyOnBinary(const CommonConfig &Config,
+                                           const ELFConfig &ELFConfig,
+                                           object::ELFObjectFileBase &In,
+                                           raw_ostream &Out) {
+  ELFReader Reader(&In, Config.ExtractPartition);
+  Expected<std::unique_ptr<Object>> Obj =
+      Reader.create(!Config.SymbolsToAdd.empty());
+  if (!Obj)
+    return Obj.takeError();
+  // Prefer OutputArch (-O) if set, otherwise infer it from the input.
+  const ElfType OutputElfType =
+      Config.OutputArch ? getOutputElfType(Config.OutputArch.getValue())
+                        : getOutputElfType(In);
+
+  if (Error E = handleArgs(Config, ELFConfig, **Obj))
+    return createFileError(Config.InputFilename, std::move(E));
+
+  if (Error E = writeOutput(Config, **Obj, Out, OutputElfType))
+    return createFileError(Config.InputFilename, std::move(E));
+
+  return Error::success();
+}
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
new file mode 100644
index 000000000000..b241bd817ff5
--- /dev/null
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -0,0 +1,2795 @@
+//===- ELFObject.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ELFObject.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/Path.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::ELF;
+using namespace llvm::objcopy::elf;
+using namespace llvm::object;
+
+template <class ELFT> void ELFWriter<ELFT>::writePhdr(const Segment &Seg) {
+  uint8_t *B = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
+               Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr);
+  Elf_Phdr &Phdr = *reinterpret_cast<Elf_Phdr *>(B);
+  Phdr.p_type = Seg.Type;
+  Phdr.p_flags = Seg.Flags;
+  Phdr.p_offset = Seg.Offset;
+  Phdr.p_vaddr = Seg.VAddr;
+  Phdr.p_paddr = Seg.PAddr;
+  Phdr.p_filesz = Seg.FileSize;
+  Phdr.p_memsz = Seg.MemSize;
+  Phdr.p_align = Seg.Align;
+}
+
+Error SectionBase::removeSectionReferences(
+    bool, function_ref<bool(const SectionBase *)>) {
+  return Error::success();
+}
+
+Error SectionBase::removeSymbols(function_ref<bool(const Symbol &)>) {
+  return Error::success();
+}
+
+Error SectionBase::initialize(SectionTableRef) { return Error::success(); }
+void SectionBase::finalize() {}
+void SectionBase::markSymbols() {}
+void SectionBase::replaceSectionReferences(
+    const DenseMap<SectionBase *, SectionBase *> &) {}
+void SectionBase::onRemove() {}
+
+template <class ELFT> void ELFWriter<ELFT>::writeShdr(const SectionBase &Sec) {
+  uint8_t *B =
+      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + Sec.HeaderOffset;
+  Elf_Shdr &Shdr = *reinterpret_cast<Elf_Shdr *>(B);
+  Shdr.sh_name = Sec.NameIndex;
+  Shdr.sh_type = Sec.Type;
+  Shdr.sh_flags = Sec.Flags;
+  Shdr.sh_addr = Sec.Addr;
+  Shdr.sh_offset = Sec.Offset;
+  Shdr.sh_size = Sec.Size;
+  Shdr.sh_link = Sec.Link;
+  Shdr.sh_info = Sec.Info;
+  Shdr.sh_addralign = Sec.Align;
+  Shdr.sh_entsize = Sec.EntrySize;
+}
+
+template <class ELFT> Error ELFSectionSizer<ELFT>::visit(Section &) {
+  return Error::success();
+}
+
+template <class ELFT> Error ELFSectionSizer<ELFT>::visit(OwnedDataSection &) {
+  return Error::success();
+}
+
+template <class ELFT> Error ELFSectionSizer<ELFT>::visit(StringTableSection &) {
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(DynamicRelocationSection &) {
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(SymbolTableSection &Sec) {
+  Sec.EntrySize = sizeof(Elf_Sym);
+  Sec.Size = Sec.Symbols.size() * Sec.EntrySize;
+  // Align to the largest field in Elf_Sym.
+  Sec.Align = ELFT::Is64Bits ? sizeof(Elf_Xword) : sizeof(Elf_Word);
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(RelocationSection &Sec) {
+  Sec.EntrySize = Sec.Type == SHT_REL ? sizeof(Elf_Rel) : sizeof(Elf_Rela);
+  Sec.Size = Sec.Relocations.size() * Sec.EntrySize;
+  // Align to the largest field in Elf_Rel(a).
+  Sec.Align = ELFT::Is64Bits ? sizeof(Elf_Xword) : sizeof(Elf_Word);
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(GnuDebugLinkSection &) {
+  return Error::success();
+}
+
+template <class ELFT> Error ELFSectionSizer<ELFT>::visit(GroupSection &Sec) {
+  Sec.Size = sizeof(Elf_Word) + Sec.GroupMembers.size() * sizeof(Elf_Word);
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(SectionIndexSection &) {
+  return Error::success();
+}
+
+template <class ELFT> Error ELFSectionSizer<ELFT>::visit(CompressedSection &) {
+  return Error::success();
+}
+
+template <class ELFT>
+Error ELFSectionSizer<ELFT>::visit(DecompressedSection &) {
+  return Error::success();
+}
+
+Error BinarySectionWriter::visit(const SectionIndexSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write symbol section index table '" +
+                               Sec.Name + "' ");
+}
+
+Error BinarySectionWriter::visit(const SymbolTableSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write symbol table '" + Sec.Name +
+                               "' out to binary");
+}
+
+Error BinarySectionWriter::visit(const RelocationSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write relocation section '" + Sec.Name +
+                               "' out to binary");
+}
+
+Error BinarySectionWriter::visit(const GnuDebugLinkSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write '" + Sec.Name + "' out to binary");
+}
+
+Error BinarySectionWriter::visit(const GroupSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write '" + Sec.Name + "' out to binary");
+}
+
+Error SectionWriter::visit(const Section &Sec) {
+  if (Sec.Type != SHT_NOBITS)
+    llvm::copy(Sec.Contents, Out.getBufferStart() + Sec.Offset);
+
+  return Error::success();
+}
+
+static bool addressOverflows32bit(uint64_t Addr) {
+  // Sign extended 32 bit addresses (e.g 0xFFFFFFFF80000000) are ok
+  return Addr > UINT32_MAX && Addr + 0x80000000 > UINT32_MAX;
+}
+
+template <class T> static T checkedGetHex(StringRef S) {
+  T Value;
+  bool Fail = S.getAsInteger(16, Value);
+  assert(!Fail);
+  (void)Fail;
+  return Value;
+}
+
+// Fills exactly Len bytes of buffer with hexadecimal characters
+// representing value 'X'
+template <class T, class Iterator>
+static Iterator toHexStr(T X, Iterator It, size_t Len) {
+  // Fill range with '0'
+  std::fill(It, It + Len, '0');
+
+  for (long I = Len - 1; I >= 0; --I) {
+    unsigned char Mod = static_cast<unsigned char>(X) & 15;
+    *(It + I) = hexdigit(Mod, false);
+    X >>= 4;
+  }
+  assert(X == 0);
+  return It + Len;
+}
+
+uint8_t IHexRecord::getChecksum(StringRef S) {
+  assert((S.size() & 1) == 0);
+  uint8_t Checksum = 0;
+  while (!S.empty()) {
+    Checksum += checkedGetHex<uint8_t>(S.take_front(2));
+    S = S.drop_front(2);
+  }
+  return -Checksum;
+}
+
+IHexLineData IHexRecord::getLine(uint8_t Type, uint16_t Addr,
+                                 ArrayRef<uint8_t> Data) {
+  IHexLineData Line(getLineLength(Data.size()));
+  assert(Line.size());
+  auto Iter = Line.begin();
+  *Iter++ = ':';
+  Iter = toHexStr(Data.size(), Iter, 2);
+  Iter = toHexStr(Addr, Iter, 4);
+  Iter = toHexStr(Type, Iter, 2);
+  for (uint8_t X : Data)
+    Iter = toHexStr(X, Iter, 2);
+  StringRef S(Line.data() + 1, std::distance(Line.begin() + 1, Iter));
+  Iter = toHexStr(getChecksum(S), Iter, 2);
+  *Iter++ = '\r';
+  *Iter++ = '\n';
+  assert(Iter == Line.end());
+  return Line;
+}
+
+static Error checkRecord(const IHexRecord &R) {
+  switch (R.Type) {
+  case IHexRecord::Data:
+    if (R.HexData.size() == 0)
+      return createStringError(
+          errc::invalid_argument,
+          "zero data length is not allowed for data records");
+    break;
+  case IHexRecord::EndOfFile:
+    break;
+  case IHexRecord::SegmentAddr:
+    // 20-bit segment address. Data length must be 2 bytes
+    // (4 bytes in hex)
+    if (R.HexData.size() != 4)
+      return createStringError(
+          errc::invalid_argument,
+          "segment address data should be 2 bytes in size");
+    break;
+  case IHexRecord::StartAddr80x86:
+  case IHexRecord::StartAddr:
+    if (R.HexData.size() != 8)
+      return createStringError(errc::invalid_argument,
+                               "start address data should be 4 bytes in size");
+    // According to Intel HEX specification '03' record
+    // only specifies the code address within the 20-bit
+    // segmented address space of the 8086/80186. This
+    // means 12 high order bits should be zeroes.
+    if (R.Type == IHexRecord::StartAddr80x86 &&
+        R.HexData.take_front(3) != "000")
+      return createStringError(errc::invalid_argument,
+                               "start address exceeds 20 bit for 80x86");
+    break;
+  case IHexRecord::ExtendedAddr:
+    // 16-31 bits of linear base address
+    if (R.HexData.size() != 4)
+      return createStringError(
+          errc::invalid_argument,
+          "extended address data should be 2 bytes in size");
+    break;
+  default:
+    // Unknown record type
+    return createStringError(errc::invalid_argument, "unknown record type: %u",
+                             static_cast<unsigned>(R.Type));
+  }
+  return Error::success();
+}
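+
+// Illustration added for clarity (not from the original source): in the
+// classic record ":0B0010006164647265737320676170A7", 0B is the byte count,
+// 0010 the address, 00 the type (Data) and A7 the checksum. Summing all
+// record bytes 0B+00+10+00+61+64+64+72+65+73+73+20+67+61+70 gives 0x459;
+// the two's complement of the low byte, 0x100 - 0x59 = 0xA7, matches, so
+// getChecksum() over everything after ':' returns 0 for a well-formed line.
+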
+// Checks that IHEX line contains valid characters.
+// This allows converting hexadecimal data to integers
+// without extra verification.
+static Error checkChars(StringRef Line) {
+  assert(!Line.empty());
+  if (Line[0] != ':')
+    return createStringError(errc::invalid_argument,
+                             "missing ':' in the beginning of line.");
+
+  for (size_t Pos = 1; Pos < Line.size(); ++Pos)
+    if (hexDigitValue(Line[Pos]) == -1U)
+      return createStringError(errc::invalid_argument,
+                               "invalid character at position %zu.", Pos + 1);
+  return Error::success();
+}
+
+Expected<IHexRecord> IHexRecord::parse(StringRef Line) {
+  assert(!Line.empty());
+
+  // ':' + Length + Address + Type + Checksum with empty data ':LLAAAATTCC'
+  if (Line.size() < 11)
+    return createStringError(errc::invalid_argument,
+                             "line is too short: %zu chars.", Line.size());
+
+  if (Error E = checkChars(Line))
+    return std::move(E);
+
+  IHexRecord Rec;
+  size_t DataLen = checkedGetHex<uint8_t>(Line.substr(1, 2));
+  if (Line.size() != getLength(DataLen))
+    return createStringError(errc::invalid_argument,
+                             "invalid line length %zu (should be %zu)",
+                             Line.size(), getLength(DataLen));
+
+  Rec.Addr = checkedGetHex<uint16_t>(Line.substr(3, 4));
+  Rec.Type = checkedGetHex<uint8_t>(Line.substr(7, 2));
+  Rec.HexData = Line.substr(9, DataLen * 2);
+
+  if (getChecksum(Line.drop_front(1)) != 0)
+    return createStringError(errc::invalid_argument, "incorrect checksum.");
+  if (Error E = checkRecord(Rec))
+    return std::move(E);
+  return Rec;
+}
+
+static uint64_t sectionPhysicalAddr(const SectionBase *Sec) {
+  Segment *Seg = Sec->ParentSegment;
+  if (Seg && Seg->Type != ELF::PT_LOAD)
+    Seg = nullptr;
+  return Seg ? Seg->PAddr + Sec->OriginalOffset - Seg->OriginalOffset
+             : Sec->Addr;
+}
+
+void IHexSectionWriterBase::writeSection(const SectionBase *Sec,
+                                         ArrayRef<uint8_t> Data) {
+  assert(Data.size() == Sec->Size);
+  const uint32_t ChunkSize = 16;
+  uint32_t Addr = sectionPhysicalAddr(Sec) & 0xFFFFFFFFU;
+  while (!Data.empty()) {
+    uint64_t DataSize = std::min<uint64_t>(Data.size(), ChunkSize);
+    if (Addr > SegmentAddr + BaseAddr + 0xFFFFU) {
+      if (Addr > 0xFFFFFU) {
+        // Write extended address record, zeroing segment address
+        // if needed.
+        if (SegmentAddr != 0)
+          SegmentAddr = writeSegmentAddr(0U);
+        BaseAddr = writeBaseAddr(Addr);
+      } else {
+        // We can still remain 16-bit
+        SegmentAddr = writeSegmentAddr(Addr);
+      }
+    }
+    uint64_t SegOffset = Addr - BaseAddr - SegmentAddr;
+    assert(SegOffset <= 0xFFFFU);
+    DataSize = std::min(DataSize, 0x10000U - SegOffset);
+    writeData(0, SegOffset, Data.take_front(DataSize));
+    Addr += DataSize;
+    Data = Data.drop_front(DataSize);
+  }
+}
+
+uint64_t IHexSectionWriterBase::writeSegmentAddr(uint64_t Addr) {
+  assert(Addr <= 0xFFFFFU);
+  uint8_t Data[] = {static_cast<uint8_t>((Addr & 0xF0000U) >> 12), 0};
+  writeData(2, 0, Data);
+  return Addr & 0xF0000U;
+}
+
+uint64_t IHexSectionWriterBase::writeBaseAddr(uint64_t Addr) {
+  assert(Addr <= 0xFFFFFFFFU);
+  uint64_t Base = Addr & 0xFFFF0000U;
+  uint8_t Data[] = {static_cast<uint8_t>(Base >> 24),
+                    static_cast<uint8_t>((Base >> 16) & 0xFF)};
+  writeData(4, 0, Data);
+  return Base;
+}
+
+void IHexSectionWriterBase::writeData(uint8_t, uint16_t,
+                                      ArrayRef<uint8_t> Data) {
+  Offset += IHexRecord::getLineLength(Data.size());
+}
+
+Error IHexSectionWriterBase::visit(const Section &Sec) {
+  writeSection(&Sec, Sec.Contents);
+  return Error::success();
+}
+
+Error IHexSectionWriterBase::visit(const OwnedDataSection &Sec) {
+  writeSection(&Sec, Sec.Data);
+  return Error::success();
+}
+
+Error IHexSectionWriterBase::visit(const StringTableSection &Sec) {
+  // Check that sizer has already done its work
+  assert(Sec.Size == Sec.StrTabBuilder.getSize());
+  // We are free to pass an invalid pointer to writeSection as long
+  // as we don't actually write any data. The real writer class has
+  // to override this method.
+  writeSection(&Sec, {nullptr, static_cast<size_t>(Sec.Size)});
+  return Error::success();
+}
+
+Error IHexSectionWriterBase::visit(const DynamicRelocationSection &Sec) {
+  writeSection(&Sec, Sec.Contents);
+  return Error::success();
+}
+
+void IHexSectionWriter::writeData(uint8_t Type, uint16_t Addr,
+                                  ArrayRef<uint8_t> Data) {
+  IHexLineData HexData = IHexRecord::getLine(Type, Addr, Data);
+  memcpy(Out.getBufferStart() + Offset, HexData.data(), HexData.size());
+  Offset += HexData.size();
+}
+
+Error IHexSectionWriter::visit(const StringTableSection &Sec) {
+  assert(Sec.Size == Sec.StrTabBuilder.getSize());
+  std::vector<uint8_t> Data(Sec.Size);
+  Sec.StrTabBuilder.write(Data.data());
+  writeSection(&Sec, Data);
+  return Error::success();
+}
+
+Error Section::accept(SectionVisitor &Visitor) const {
+  return Visitor.visit(*this);
+}
+
+Error Section::accept(MutableSectionVisitor &Visitor) {
+  return Visitor.visit(*this);
+}
+
+Error SectionWriter::visit(const OwnedDataSection &Sec) {
+  llvm::copy(Sec.Data, Out.getBufferStart() + Sec.Offset);
+  return Error::success();
+}
+
+static constexpr std::array<char, 4> ZlibGnuMagic = {{'Z', 'L', 'I', 'B'}};
+
+static bool isDataGnuCompressed(ArrayRef<uint8_t> Data) {
+  return Data.size() > ZlibGnuMagic.size() &&
+         std::equal(ZlibGnuMagic.begin(), ZlibGnuMagic.end(), Data.data());
+}
+
+template <class ELFT>
+static std::tuple<uint64_t, uint64_t>
+getDecompressedSizeAndAlignment(ArrayRef<uint8_t> Data) {
+  const bool IsGnuDebug = isDataGnuCompressed(Data);
+  const uint64_t DecompressedSize =
+      IsGnuDebug
+          ? support::endian::read64be(Data.data() + ZlibGnuMagic.size())
+          : reinterpret_cast<const Elf_Chdr_Impl<ELFT> *>(Data.data())->ch_size;
+  const uint64_t DecompressedAlign =
+      IsGnuDebug ? 1
+                 : reinterpret_cast<const Elf_Chdr_Impl<ELFT> *>(Data.data())
+                       ->ch_addralign;
+
+  return std::make_tuple(DecompressedSize, DecompressedAlign);
+}
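+
+// Illustration added for clarity (not from the original source): a zlib-gnu
+// ".zdebug_*" section starts with the 4-byte magic "ZLIB" followed by a
+// 64-bit big-endian decompressed size and the deflate stream, while an
+// SHF_COMPRESSED section starts with an Elf_Chdr (ch_type =
+// ELFCOMPRESS_ZLIB, ch_size, ch_addralign) followed by the stream; the
+// helpers above dispatch on that magic.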
+
+template <class ELFT>
+Error ELFSectionWriter<ELFT>::visit(const DecompressedSection &Sec) {
+  const size_t DataOffset = isDataGnuCompressed(Sec.OriginalData)
+                                ? (ZlibGnuMagic.size() + sizeof(Sec.Size))
+                                : sizeof(Elf_Chdr_Impl<ELFT>);
+
+  StringRef CompressedContent(
+      reinterpret_cast<const char *>(Sec.OriginalData.data()) + DataOffset,
+      Sec.OriginalData.size() - DataOffset);
+
+  SmallVector<char, 128> DecompressedContent;
+  if (Error Err = zlib::uncompress(CompressedContent, DecompressedContent,
+                                   static_cast<size_t>(Sec.Size)))
+    return createStringError(errc::invalid_argument,
+                             "'" + Sec.Name + "': " + toString(std::move(Err)));
+
+  uint8_t *Buf = reinterpret_cast<uint8_t *>(Out.getBufferStart()) + Sec.Offset;
+  std::copy(DecompressedContent.begin(), DecompressedContent.end(), Buf);
+
+  return Error::success();
+}
+
+Error BinarySectionWriter::visit(const DecompressedSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write compressed section '" + Sec.Name +
+                               "' ");
+}
+
+Error DecompressedSection::accept(SectionVisitor &Visitor) const {
+  return Visitor.visit(*this);
+}
+
+Error DecompressedSection::accept(MutableSectionVisitor &Visitor) {
+  return Visitor.visit(*this);
+}
+
+Error OwnedDataSection::accept(SectionVisitor &Visitor) const {
+  return Visitor.visit(*this);
+}
+
+Error OwnedDataSection::accept(MutableSectionVisitor &Visitor) {
+  return Visitor.visit(*this);
+}
+
+void OwnedDataSection::appendHexData(StringRef HexData) {
+  assert((HexData.size() & 1) == 0);
+  while (!HexData.empty()) {
+    Data.push_back(checkedGetHex<uint8_t>(HexData.take_front(2)));
+    HexData = HexData.drop_front(2);
+  }
+  Size = Data.size();
+}
+
+Error BinarySectionWriter::visit(const CompressedSection &Sec) {
+  return createStringError(errc::operation_not_permitted,
+                           "cannot write compressed section '" + Sec.Name +
+                               "' ");
+}
+
+template <class ELFT>
+Error ELFSectionWriter<ELFT>::visit(const CompressedSection &Sec) {
+  uint8_t *Buf = reinterpret_cast<uint8_t *>(Out.getBufferStart()) + Sec.Offset;
+  Elf_Chdr_Impl<ELFT> Chdr;
+  switch (Sec.CompressionType) {
+  case DebugCompressionType::None:
+    std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), Buf);
+    return Error::success();
+  case DebugCompressionType::GNU:
+    llvm_unreachable("unexpected zlib-gnu");
+    break;
+  case DebugCompressionType::Z:
+    Chdr.ch_type = ELF::ELFCOMPRESS_ZLIB;
+    break;
+  }
+  Chdr.ch_size = Sec.DecompressedSize;
+  Chdr.ch_addralign = Sec.DecompressedAlign;
+  memcpy(Buf, &Chdr, sizeof(Chdr));
+  Buf += sizeof(Chdr);
+
+  std::copy(Sec.CompressedData.begin(), Sec.CompressedData.end(), Buf);
+  return Error::success();
+}
+
+CompressedSection::CompressedSection(const SectionBase &Sec,
+                                     DebugCompressionType CompressionType)
+    : SectionBase(Sec), CompressionType(CompressionType),
+      DecompressedSize(Sec.OriginalData.size()), DecompressedAlign(Sec.Align) {
+  zlib::compress(StringRef(reinterpret_cast<const char *>(OriginalData.data()),
+                           OriginalData.size()),
+                 CompressedData);
+
+  assert(CompressionType != DebugCompressionType::None);
+  Flags |= ELF::SHF_COMPRESSED;
+  size_t ChdrSize =
+      std::max(std::max(sizeof(object::Elf_Chdr_Impl<object::ELF64LE>),
+                        sizeof(object::Elf_Chdr_Impl<object::ELF64BE>)),
+               std::max(sizeof(object::Elf_Chdr_Impl<object::ELF32LE>),
+                        sizeof(object::Elf_Chdr_Impl<object::ELF32BE>)));
+  Size = ChdrSize + CompressedData.size();
+  Align = 8;
+}
+
+CompressedSection::CompressedSection(ArrayRef<uint8_t> CompressedData,
+                                     uint64_t DecompressedSize,
+                                     uint64_t DecompressedAlign)
+    : CompressionType(DebugCompressionType::None),
DecompressedSize(DecompressedSize), DecompressedAlign(DecompressedAlign) { + OriginalData = CompressedData; +} + +Error CompressedSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error CompressedSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +void StringTableSection::addString(StringRef Name) { StrTabBuilder.add(Name); } + +uint32_t StringTableSection::findIndex(StringRef Name) const { + return StrTabBuilder.getOffset(Name); +} + +void StringTableSection::prepareForLayout() { + StrTabBuilder.finalize(); + Size = StrTabBuilder.getSize(); +} + +Error SectionWriter::visit(const StringTableSection &Sec) { + Sec.StrTabBuilder.write(reinterpret_cast(Out.getBufferStart()) + + Sec.Offset); + return Error::success(); +} + +Error StringTableSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error StringTableSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +template +Error ELFSectionWriter::visit(const SectionIndexSection &Sec) { + uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; + llvm::copy(Sec.Indexes, reinterpret_cast(Buf)); + return Error::success(); +} + +Error SectionIndexSection::initialize(SectionTableRef SecTable) { + Size = 0; + Expected Sec = + SecTable.getSectionOfType( + Link, + "Link field value " + Twine(Link) + " in section " + Name + + " is invalid", + "Link field value " + Twine(Link) + " in section " + Name + + " is not a symbol table"); + if (!Sec) + return Sec.takeError(); + + setSymTab(*Sec); + Symbols->setShndxTable(this); + return Error::success(); +} + +void SectionIndexSection::finalize() { Link = Symbols->Index; } + +Error SectionIndexSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error SectionIndexSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +static bool isValidReservedSectionIndex(uint16_t Index, uint16_t Machine) { + switch (Index) { + case SHN_ABS: + case SHN_COMMON: + return true; + } + + if (Machine == EM_AMDGPU) { + return Index == SHN_AMDGPU_LDS; + } + + if (Machine == EM_MIPS) { + switch (Index) { + case SHN_MIPS_ACOMMON: + case SHN_MIPS_SCOMMON: + case SHN_MIPS_SUNDEFINED: + return true; + } + } + + if (Machine == EM_HEXAGON) { + switch (Index) { + case SHN_HEXAGON_SCOMMON: + case SHN_HEXAGON_SCOMMON_1: + case SHN_HEXAGON_SCOMMON_2: + case SHN_HEXAGON_SCOMMON_4: + case SHN_HEXAGON_SCOMMON_8: + return true; + } + } + return false; +} + +// Large indexes force us to clarify exactly what this function should do. This +// function should return the value that will appear in st_shndx when written +// out. +uint16_t Symbol::getShndx() const { + if (DefinedIn != nullptr) { + if (DefinedIn->Index >= SHN_LORESERVE) + return SHN_XINDEX; + return DefinedIn->Index; + } + + if (ShndxType == SYMBOL_SIMPLE_INDEX) { + // This means that we don't have a defined section but we do need to + // output a legitimate section index. 
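+    // (Illustration added for clarity: an undefined symbol keeps SHN_UNDEF
+    // here, whereas a symbol defined in a section whose index is, say,
+    // 0xff05 >= SHN_LORESERVE takes the branch above: st_shndx is written as
+    // SHN_XINDEX and the real index goes into the SHT_SYMTAB_SHNDX table via
+    // fillShndxTable().)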
+ return SHN_UNDEF; + } + + assert(ShndxType == SYMBOL_ABS || ShndxType == SYMBOL_COMMON || + (ShndxType >= SYMBOL_LOPROC && ShndxType <= SYMBOL_HIPROC) || + (ShndxType >= SYMBOL_LOOS && ShndxType <= SYMBOL_HIOS)); + return static_cast(ShndxType); +} + +bool Symbol::isCommon() const { return getShndx() == SHN_COMMON; } + +void SymbolTableSection::assignIndices() { + uint32_t Index = 0; + for (auto &Sym : Symbols) + Sym->Index = Index++; +} + +void SymbolTableSection::addSymbol(Twine Name, uint8_t Bind, uint8_t Type, + SectionBase *DefinedIn, uint64_t Value, + uint8_t Visibility, uint16_t Shndx, + uint64_t SymbolSize) { + Symbol Sym; + Sym.Name = Name.str(); + Sym.Binding = Bind; + Sym.Type = Type; + Sym.DefinedIn = DefinedIn; + if (DefinedIn != nullptr) + DefinedIn->HasSymbol = true; + if (DefinedIn == nullptr) { + if (Shndx >= SHN_LORESERVE) + Sym.ShndxType = static_cast(Shndx); + else + Sym.ShndxType = SYMBOL_SIMPLE_INDEX; + } + Sym.Value = Value; + Sym.Visibility = Visibility; + Sym.Size = SymbolSize; + Sym.Index = Symbols.size(); + Symbols.emplace_back(std::make_unique(Sym)); + Size += this->EntrySize; +} + +Error SymbolTableSection::removeSectionReferences( + bool AllowBrokenLinks, function_ref ToRemove) { + if (ToRemove(SectionIndexTable)) + SectionIndexTable = nullptr; + if (ToRemove(SymbolNames)) { + if (!AllowBrokenLinks) + return createStringError( + llvm::errc::invalid_argument, + "string table '%s' cannot be removed because it is " + "referenced by the symbol table '%s'", + SymbolNames->Name.data(), this->Name.data()); + SymbolNames = nullptr; + } + return removeSymbols( + [ToRemove](const Symbol &Sym) { return ToRemove(Sym.DefinedIn); }); +} + +void SymbolTableSection::updateSymbols(function_ref Callable) { + for (SymPtr &Sym : llvm::drop_begin(Symbols)) + Callable(*Sym); + std::stable_partition( + std::begin(Symbols), std::end(Symbols), + [](const SymPtr &Sym) { return Sym->Binding == STB_LOCAL; }); + assignIndices(); +} + +Error SymbolTableSection::removeSymbols( + function_ref ToRemove) { + Symbols.erase( + std::remove_if(std::begin(Symbols) + 1, std::end(Symbols), + [ToRemove](const SymPtr &Sym) { return ToRemove(*Sym); }), + std::end(Symbols)); + Size = Symbols.size() * EntrySize; + assignIndices(); + return Error::success(); +} + +void SymbolTableSection::replaceSectionReferences( + const DenseMap &FromTo) { + for (std::unique_ptr &Sym : Symbols) + if (SectionBase *To = FromTo.lookup(Sym->DefinedIn)) + Sym->DefinedIn = To; +} + +Error SymbolTableSection::initialize(SectionTableRef SecTable) { + Size = 0; + Expected Sec = + SecTable.getSectionOfType( + Link, + "Symbol table has link index of " + Twine(Link) + + " which is not a valid index", + "Symbol table has link index of " + Twine(Link) + + " which is not a string table"); + if (!Sec) + return Sec.takeError(); + + setStrTab(*Sec); + return Error::success(); +} + +void SymbolTableSection::finalize() { + uint32_t MaxLocalIndex = 0; + for (std::unique_ptr &Sym : Symbols) { + Sym->NameIndex = + SymbolNames == nullptr ? 0 : SymbolNames->findIndex(Sym->Name); + if (Sym->Binding == STB_LOCAL) + MaxLocalIndex = std::max(MaxLocalIndex, Sym->Index); + } + // Now we need to set the Link and Info fields. + Link = SymbolNames == nullptr ? 0 : SymbolNames->Index; + Info = MaxLocalIndex + 1; +} + +void SymbolTableSection::prepareForLayout() { + // Reserve proper amount of space in section index table, so we can + // layout sections correctly. We will fill the table with correct + // indexes later in fillShdnxTable. 
+ if (SectionIndexTable) + SectionIndexTable->reserve(Symbols.size()); + + // Add all of our strings to SymbolNames so that SymbolNames has the right + // size before layout is decided. + // If the symbol names section has been removed, don't try to add strings to + // the table. + if (SymbolNames != nullptr) + for (std::unique_ptr &Sym : Symbols) + SymbolNames->addString(Sym->Name); +} + +void SymbolTableSection::fillShndxTable() { + if (SectionIndexTable == nullptr) + return; + // Fill section index table with real section indexes. This function must + // be called after assignOffsets. + for (const std::unique_ptr &Sym : Symbols) { + if (Sym->DefinedIn != nullptr && Sym->DefinedIn->Index >= SHN_LORESERVE) + SectionIndexTable->addIndex(Sym->DefinedIn->Index); + else + SectionIndexTable->addIndex(SHN_UNDEF); + } +} + +Expected +SymbolTableSection::getSymbolByIndex(uint32_t Index) const { + if (Symbols.size() <= Index) + return createStringError(errc::invalid_argument, + "invalid symbol index: " + Twine(Index)); + return Symbols[Index].get(); +} + +Expected SymbolTableSection::getSymbolByIndex(uint32_t Index) { + Expected Sym = + static_cast(this)->getSymbolByIndex(Index); + if (!Sym) + return Sym.takeError(); + + return const_cast(*Sym); +} + +template +Error ELFSectionWriter::visit(const SymbolTableSection &Sec) { + Elf_Sym *Sym = reinterpret_cast(Out.getBufferStart() + Sec.Offset); + // Loop though symbols setting each entry of the symbol table. + for (const std::unique_ptr &Symbol : Sec.Symbols) { + Sym->st_name = Symbol->NameIndex; + Sym->st_value = Symbol->Value; + Sym->st_size = Symbol->Size; + Sym->st_other = Symbol->Visibility; + Sym->setBinding(Symbol->Binding); + Sym->setType(Symbol->Type); + Sym->st_shndx = Symbol->getShndx(); + ++Sym; + } + return Error::success(); +} + +Error SymbolTableSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error SymbolTableSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +StringRef RelocationSectionBase::getNamePrefix() const { + switch (Type) { + case SHT_REL: + return ".rel"; + case SHT_RELA: + return ".rela"; + default: + llvm_unreachable("not a relocation section"); + } +} + +Error RelocationSection::removeSectionReferences( + bool AllowBrokenLinks, function_ref ToRemove) { + if (ToRemove(Symbols)) { + if (!AllowBrokenLinks) + return createStringError( + llvm::errc::invalid_argument, + "symbol table '%s' cannot be removed because it is " + "referenced by the relocation section '%s'", + Symbols->Name.data(), this->Name.data()); + Symbols = nullptr; + } + + for (const Relocation &R : Relocations) { + if (!R.RelocSymbol || !R.RelocSymbol->DefinedIn || + !ToRemove(R.RelocSymbol->DefinedIn)) + continue; + return createStringError(llvm::errc::invalid_argument, + "section '%s' cannot be removed: (%s+0x%" PRIx64 + ") has relocation against symbol '%s'", + R.RelocSymbol->DefinedIn->Name.data(), + SecToApplyRel->Name.data(), R.Offset, + R.RelocSymbol->Name.c_str()); + } + + return Error::success(); +} + +template +Error RelocSectionWithSymtabBase::initialize( + SectionTableRef SecTable) { + if (Link != SHN_UNDEF) { + Expected Sec = SecTable.getSectionOfType( + Link, + "Link field value " + Twine(Link) + " in section " + Name + + " is invalid", + "Link field value " + Twine(Link) + " in section " + Name + + " is not a symbol table"); + if (!Sec) + return Sec.takeError(); + + setSymTab(*Sec); + } + + if (Info != SHN_UNDEF) { + Expected Sec = + SecTable.getSection(Info, "Info field 
value " + Twine(Info) + + " in section " + Name + " is invalid"); + if (!Sec) + return Sec.takeError(); + + setSection(*Sec); + } else + setSection(nullptr); + + return Error::success(); +} + +template +void RelocSectionWithSymtabBase::finalize() { + this->Link = Symbols ? Symbols->Index : 0; + + if (SecToApplyRel != nullptr) + this->Info = SecToApplyRel->Index; +} + +template +static void setAddend(Elf_Rel_Impl &, uint64_t) {} + +template +static void setAddend(Elf_Rel_Impl &Rela, uint64_t Addend) { + Rela.r_addend = Addend; +} + +template +static void writeRel(const RelRange &Relocations, T *Buf, bool IsMips64EL) { + for (const auto &Reloc : Relocations) { + Buf->r_offset = Reloc.Offset; + setAddend(*Buf, Reloc.Addend); + Buf->setSymbolAndType(Reloc.RelocSymbol ? Reloc.RelocSymbol->Index : 0, + Reloc.Type, IsMips64EL); + ++Buf; + } +} + +template +Error ELFSectionWriter::visit(const RelocationSection &Sec) { + uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; + if (Sec.Type == SHT_REL) + writeRel(Sec.Relocations, reinterpret_cast(Buf), + Sec.getObject().IsMips64EL); + else + writeRel(Sec.Relocations, reinterpret_cast(Buf), + Sec.getObject().IsMips64EL); + return Error::success(); +} + +Error RelocationSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error RelocationSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +Error RelocationSection::removeSymbols( + function_ref ToRemove) { + for (const Relocation &Reloc : Relocations) + if (Reloc.RelocSymbol && ToRemove(*Reloc.RelocSymbol)) + return createStringError( + llvm::errc::invalid_argument, + "not stripping symbol '%s' because it is named in a relocation", + Reloc.RelocSymbol->Name.data()); + return Error::success(); +} + +void RelocationSection::markSymbols() { + for (const Relocation &Reloc : Relocations) + if (Reloc.RelocSymbol) + Reloc.RelocSymbol->Referenced = true; +} + +void RelocationSection::replaceSectionReferences( + const DenseMap &FromTo) { + // Update the target section if it was replaced. + if (SectionBase *To = FromTo.lookup(SecToApplyRel)) + SecToApplyRel = To; +} + +Error SectionWriter::visit(const DynamicRelocationSection &Sec) { + llvm::copy(Sec.Contents, Out.getBufferStart() + Sec.Offset); + return Error::success(); +} + +Error DynamicRelocationSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error DynamicRelocationSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +Error DynamicRelocationSection::removeSectionReferences( + bool AllowBrokenLinks, function_ref ToRemove) { + if (ToRemove(Symbols)) { + if (!AllowBrokenLinks) + return createStringError( + llvm::errc::invalid_argument, + "symbol table '%s' cannot be removed because it is " + "referenced by the relocation section '%s'", + Symbols->Name.data(), this->Name.data()); + Symbols = nullptr; + } + + // SecToApplyRel contains a section referenced by sh_info field. It keeps + // a section to which the relocation section applies. When we remove any + // sections we also remove their relocation sections. Since we do that much + // earlier, this assert should never be triggered. 
+ assert(!SecToApplyRel || !ToRemove(SecToApplyRel)); + return Error::success(); +} + +Error Section::removeSectionReferences( + bool AllowBrokenDependency, + function_ref ToRemove) { + if (ToRemove(LinkSection)) { + if (!AllowBrokenDependency) + return createStringError(llvm::errc::invalid_argument, + "section '%s' cannot be removed because it is " + "referenced by the section '%s'", + LinkSection->Name.data(), this->Name.data()); + LinkSection = nullptr; + } + return Error::success(); +} + +void GroupSection::finalize() { + this->Info = Sym ? Sym->Index : 0; + this->Link = SymTab ? SymTab->Index : 0; + // Linker deduplication for GRP_COMDAT is based on Sym->Name. The local/global + // status is not part of the equation. If Sym is localized, the intention is + // likely to make the group fully localized. Drop GRP_COMDAT to suppress + // deduplication. See https://groups.google.com/g/generic-abi/c/2X6mR-s2zoc + if ((FlagWord & GRP_COMDAT) && Sym && Sym->Binding == STB_LOCAL) + this->FlagWord &= ~GRP_COMDAT; +} + +Error GroupSection::removeSectionReferences( + bool AllowBrokenLinks, function_ref ToRemove) { + if (ToRemove(SymTab)) { + if (!AllowBrokenLinks) + return createStringError( + llvm::errc::invalid_argument, + "section '.symtab' cannot be removed because it is " + "referenced by the group section '%s'", + this->Name.data()); + SymTab = nullptr; + Sym = nullptr; + } + llvm::erase_if(GroupMembers, ToRemove); + return Error::success(); +} + +Error GroupSection::removeSymbols(function_ref ToRemove) { + if (ToRemove(*Sym)) + return createStringError(llvm::errc::invalid_argument, + "symbol '%s' cannot be removed because it is " + "referenced by the section '%s[%d]'", + Sym->Name.data(), this->Name.data(), this->Index); + return Error::success(); +} + +void GroupSection::markSymbols() { + if (Sym) + Sym->Referenced = true; +} + +void GroupSection::replaceSectionReferences( + const DenseMap &FromTo) { + for (SectionBase *&Sec : GroupMembers) + if (SectionBase *To = FromTo.lookup(Sec)) + Sec = To; +} + +void GroupSection::onRemove() { + // As the header section of the group is removed, drop the Group flag in its + // former members. + for (SectionBase *Sec : GroupMembers) + Sec->Flags &= ~SHF_GROUP; +} + +Error Section::initialize(SectionTableRef SecTable) { + if (Link == ELF::SHN_UNDEF) + return Error::success(); + + Expected Sec = + SecTable.getSection(Link, "Link field value " + Twine(Link) + + " in section " + Name + " is invalid"); + if (!Sec) + return Sec.takeError(); + + LinkSection = *Sec; + + if (LinkSection->Type == ELF::SHT_SYMTAB) + LinkSection = nullptr; + + return Error::success(); +} + +void Section::finalize() { this->Link = LinkSection ? LinkSection->Index : 0; } + +void GnuDebugLinkSection::init(StringRef File) { + FileName = sys::path::filename(File); + // The format for the .gnu_debuglink starts with the file name and is + // followed by a null terminator and then the CRC32 of the file. The CRC32 + // should be 4 byte aligned. So we add the FileName size, a 1 for the null + // byte, and then finally push the size to alignment and add 4. + Size = alignTo(FileName.size() + 1, 4) + 4; + // The CRC32 will only be aligned if we align the whole section. + Align = 4; + Type = OriginalType = ELF::SHT_PROGBITS; + Name = ".gnu_debuglink"; + // For sections not found in segments, OriginalOffset is only used to + // establish the order that sections should go in. By using the maximum + // possible offset we cause this section to wind up at the end. 
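+  // (Illustration added for clarity: the assignment below uses
+  // std::numeric_limits<uint64_t>::max(), i.e. UINT64_MAX, so when sections
+  // are later ordered by OriginalOffset the .gnu_debuglink section sorts
+  // after every section that came from the input file.)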
+ OriginalOffset = std::numeric_limits::max(); +} + +GnuDebugLinkSection::GnuDebugLinkSection(StringRef File, + uint32_t PrecomputedCRC) + : FileName(File), CRC32(PrecomputedCRC) { + init(File); +} + +template +Error ELFSectionWriter::visit(const GnuDebugLinkSection &Sec) { + unsigned char *Buf = + reinterpret_cast(Out.getBufferStart()) + Sec.Offset; + Elf_Word *CRC = + reinterpret_cast(Buf + Sec.Size - sizeof(Elf_Word)); + *CRC = Sec.CRC32; + llvm::copy(Sec.FileName, Buf); + return Error::success(); +} + +Error GnuDebugLinkSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error GnuDebugLinkSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +template +Error ELFSectionWriter::visit(const GroupSection &Sec) { + ELF::Elf32_Word *Buf = + reinterpret_cast(Out.getBufferStart() + Sec.Offset); + support::endian::write32(Buf++, Sec.FlagWord); + for (SectionBase *S : Sec.GroupMembers) + support::endian::write32(Buf++, S->Index); + return Error::success(); +} + +Error GroupSection::accept(SectionVisitor &Visitor) const { + return Visitor.visit(*this); +} + +Error GroupSection::accept(MutableSectionVisitor &Visitor) { + return Visitor.visit(*this); +} + +// Returns true IFF a section is wholly inside the range of a segment +static bool sectionWithinSegment(const SectionBase &Sec, const Segment &Seg) { + // If a section is empty it should be treated like it has a size of 1. This is + // to clarify the case when an empty section lies on a boundary between two + // segments and ensures that the section "belongs" to the second segment and + // not the first. + uint64_t SecSize = Sec.Size ? Sec.Size : 1; + + // Ignore just added sections. + if (Sec.OriginalOffset == std::numeric_limits::max()) + return false; + + if (Sec.Type == SHT_NOBITS) { + if (!(Sec.Flags & SHF_ALLOC)) + return false; + + bool SectionIsTLS = Sec.Flags & SHF_TLS; + bool SegmentIsTLS = Seg.Type == PT_TLS; + if (SectionIsTLS != SegmentIsTLS) + return false; + + return Seg.VAddr <= Sec.Addr && + Seg.VAddr + Seg.MemSize >= Sec.Addr + SecSize; + } + + return Seg.Offset <= Sec.OriginalOffset && + Seg.Offset + Seg.FileSize >= Sec.OriginalOffset + SecSize; +} + +// Returns true IFF a segment's original offset is inside of another segment's +// range. +static bool segmentOverlapsSegment(const Segment &Child, + const Segment &Parent) { + + return Parent.OriginalOffset <= Child.OriginalOffset && + Parent.OriginalOffset + Parent.FileSize > Child.OriginalOffset; +} + +static bool compareSegmentsByOffset(const Segment *A, const Segment *B) { + // Any segment without a parent segment should come before a segment + // that has a parent segment. 
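+  // (Illustration added for clarity: two PT_LOAD segments at file offsets
+  // 0x0 and 0x1000 keep that order; if two segments share an offset, the
+  // comparison below falls back to their original program header index so
+  // the ordering stays deterministic.)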
+  if (A->OriginalOffset < B->OriginalOffset)
+    return true;
+  if (A->OriginalOffset > B->OriginalOffset)
+    return false;
+  return A->Index < B->Index;
+}
+
+void BasicELFBuilder::initFileHeader() {
+  Obj->Flags = 0x0;
+  Obj->Type = ET_REL;
+  Obj->OSABI = ELFOSABI_NONE;
+  Obj->ABIVersion = 0;
+  Obj->Entry = 0x0;
+  Obj->Machine = EM_NONE;
+  Obj->Version = 1;
+}
+
+void BasicELFBuilder::initHeaderSegment() { Obj->ElfHdrSegment.Index = 0; }
+
+StringTableSection *BasicELFBuilder::addStrTab() {
+  auto &StrTab = Obj->addSection<StringTableSection>();
+  StrTab.Name = ".strtab";
+
+  Obj->SectionNames = &StrTab;
+  return &StrTab;
+}
+
+SymbolTableSection *BasicELFBuilder::addSymTab(StringTableSection *StrTab) {
+  auto &SymTab = Obj->addSection<SymbolTableSection>();
+
+  SymTab.Name = ".symtab";
+  SymTab.Link = StrTab->Index;
+
+  // The symbol table always needs a null symbol
+  SymTab.addSymbol("", 0, 0, nullptr, 0, 0, 0, 0);
+
+  Obj->SymbolTable = &SymTab;
+  return &SymTab;
+}
+
+Error BasicELFBuilder::initSections() {
+  for (SectionBase &Sec : Obj->sections())
+    if (Error Err = Sec.initialize(Obj->sections()))
+      return Err;
+
+  return Error::success();
+}
+
+void BinaryELFBuilder::addData(SymbolTableSection *SymTab) {
+  auto Data = ArrayRef<uint8_t>(
+      reinterpret_cast<const uint8_t *>(MemBuf->getBufferStart()),
+      MemBuf->getBufferSize());
+  auto &DataSection = Obj->addSection<Section>(Data);
+  DataSection.Name = ".data";
+  DataSection.Type = ELF::SHT_PROGBITS;
+  DataSection.Size = Data.size();
+  DataSection.Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
+
+  std::string SanitizedFilename = MemBuf->getBufferIdentifier().str();
+  std::replace_if(
+      std::begin(SanitizedFilename), std::end(SanitizedFilename),
+      [](char C) { return !isAlnum(C); }, '_');
+  Twine Prefix = Twine("_binary_") + SanitizedFilename;
+
+  SymTab->addSymbol(Prefix + "_start", STB_GLOBAL, STT_NOTYPE, &DataSection,
+                    /*Value=*/0, NewSymbolVisibility, 0, 0);
+  SymTab->addSymbol(Prefix + "_end", STB_GLOBAL, STT_NOTYPE, &DataSection,
+                    /*Value=*/DataSection.Size, NewSymbolVisibility, 0, 0);
+  SymTab->addSymbol(Prefix + "_size", STB_GLOBAL, STT_NOTYPE, nullptr,
+                    /*Value=*/DataSection.Size, NewSymbolVisibility, SHN_ABS,
+                    0);
+}
+
+Expected<std::unique_ptr<Object>> BinaryELFBuilder::build() {
+  initFileHeader();
+  initHeaderSegment();
+
+  SymbolTableSection *SymTab = addSymTab(addStrTab());
+  if (Error Err = initSections())
+    return std::move(Err);
+  addData(SymTab);
+
+  return std::move(Obj);
+}
+
+// Adds sections from IHEX data file. Data should have been
+// fully validated by this time.
+void IHexELFBuilder::addDataSections() {
+  OwnedDataSection *Section = nullptr;
+  uint64_t SegmentAddr = 0, BaseAddr = 0;
+  uint32_t SecNo = 1;
+
+  for (const IHexRecord &R : Records) {
+    uint64_t RecAddr;
+    switch (R.Type) {
+    case IHexRecord::Data:
+      // Ignore empty data records
+      if (R.HexData.empty())
+        continue;
+      RecAddr = R.Addr + SegmentAddr + BaseAddr;
+      if (!Section || Section->Addr + Section->Size != RecAddr) {
+        // OriginalOffset field is only used to sort sections before layout, so
+        // instead of keeping track of real offsets in IHEX file, and as
+        // layoutSections() and layoutSectionsForOnlyKeepDebug() use
+        // llvm::stable_sort(), we can just set it to a constant (zero).
+        Section = &Obj->addSection<OwnedDataSection>(
+            ".sec" + std::to_string(SecNo), RecAddr,
+            ELF::SHF_ALLOC | ELF::SHF_WRITE, 0);
+        SecNo++;
+      }
+      Section->appendHexData(R.HexData);
+      break;
+    case IHexRecord::EndOfFile:
+      break;
+    case IHexRecord::SegmentAddr:
+      // 20-bit segment address.
+      SegmentAddr = checkedGetHex<uint16_t>(R.HexData) << 4;
+      break;
+    case IHexRecord::StartAddr80x86:
+    case IHexRecord::StartAddr:
+      Obj->Entry = checkedGetHex<uint32_t>(R.HexData);
+      assert(Obj->Entry <= 0xFFFFFU);
+      break;
+    case IHexRecord::ExtendedAddr:
+      // 16-31 bits of linear base address
+      BaseAddr = checkedGetHex<uint16_t>(R.HexData) << 16;
+      break;
+    default:
+      llvm_unreachable("unknown record type");
+    }
+  }
+}
+
+Expected<std::unique_ptr<Object>> IHexELFBuilder::build() {
+  initFileHeader();
+  initHeaderSegment();
+  StringTableSection *StrTab = addStrTab();
+  addSymTab(StrTab);
+  if (Error Err = initSections())
+    return std::move(Err);
+  addDataSections();
+
+  return std::move(Obj);
+}
+
+template <class ELFT>
+ELFBuilder<ELFT>::ELFBuilder(const ELFObjectFile<ELFT> &ElfObj, Object &Obj,
+                             Optional<StringRef> ExtractPartition)
+    : ElfFile(ElfObj.getELFFile()), Obj(Obj),
+      ExtractPartition(ExtractPartition) {
+  Obj.IsMips64EL = ElfFile.isMips64EL();
+}
+
+template <class ELFT> void ELFBuilder<ELFT>::setParentSegment(Segment &Child) {
+  for (Segment &Parent : Obj.segments()) {
+    // Every segment will overlap with itself but we don't want a segment to
+    // be its own parent so we avoid that situation.
+    if (&Child != &Parent && segmentOverlapsSegment(Child, Parent)) {
+      // We want a canonical "most parental" segment but this requires
+      // inspecting the ParentSegment.
+ if (compareSegmentsByOffset(&Parent, &Child)) + if (Child.ParentSegment == nullptr || + compareSegmentsByOffset(&Parent, Child.ParentSegment)) { + Child.ParentSegment = &Parent; + } + } + } +} + +template Error ELFBuilder::findEhdrOffset() { + if (!ExtractPartition) + return Error::success(); + + for (const SectionBase &Sec : Obj.sections()) { + if (Sec.Type == SHT_LLVM_PART_EHDR && Sec.Name == *ExtractPartition) { + EhdrOffset = Sec.Offset; + return Error::success(); + } + } + return createStringError(errc::invalid_argument, + "could not find partition named '" + + *ExtractPartition + "'"); +} + +template +Error ELFBuilder::readProgramHeaders(const ELFFile &HeadersFile) { + uint32_t Index = 0; + + Expected::Elf_Phdr_Range> Headers = + HeadersFile.program_headers(); + if (!Headers) + return Headers.takeError(); + + for (const typename ELFFile::Elf_Phdr &Phdr : *Headers) { + if (Phdr.p_offset + Phdr.p_filesz > HeadersFile.getBufSize()) + return createStringError( + errc::invalid_argument, + "program header with offset 0x" + Twine::utohexstr(Phdr.p_offset) + + " and file size 0x" + Twine::utohexstr(Phdr.p_filesz) + + " goes past the end of the file"); + + ArrayRef Data{HeadersFile.base() + Phdr.p_offset, + (size_t)Phdr.p_filesz}; + Segment &Seg = Obj.addSegment(Data); + Seg.Type = Phdr.p_type; + Seg.Flags = Phdr.p_flags; + Seg.OriginalOffset = Phdr.p_offset + EhdrOffset; + Seg.Offset = Phdr.p_offset + EhdrOffset; + Seg.VAddr = Phdr.p_vaddr; + Seg.PAddr = Phdr.p_paddr; + Seg.FileSize = Phdr.p_filesz; + Seg.MemSize = Phdr.p_memsz; + Seg.Align = Phdr.p_align; + Seg.Index = Index++; + for (SectionBase &Sec : Obj.sections()) + if (sectionWithinSegment(Sec, Seg)) { + Seg.addSection(&Sec); + if (!Sec.ParentSegment || Sec.ParentSegment->Offset > Seg.Offset) + Sec.ParentSegment = &Seg; + } + } + + auto &ElfHdr = Obj.ElfHdrSegment; + ElfHdr.Index = Index++; + ElfHdr.OriginalOffset = ElfHdr.Offset = EhdrOffset; + + const typename ELFT::Ehdr &Ehdr = HeadersFile.getHeader(); + auto &PrHdr = Obj.ProgramHdrSegment; + PrHdr.Type = PT_PHDR; + PrHdr.Flags = 0; + // The spec requires us to have p_vaddr % p_align == p_offset % p_align. + // Whereas this works automatically for ElfHdr, here OriginalOffset is + // always non-zero and to ensure the equation we assign the same value to + // VAddr as well. + PrHdr.OriginalOffset = PrHdr.Offset = PrHdr.VAddr = EhdrOffset + Ehdr.e_phoff; + PrHdr.PAddr = 0; + PrHdr.FileSize = PrHdr.MemSize = Ehdr.e_phentsize * Ehdr.e_phnum; + // The spec requires us to naturally align all the fields. + PrHdr.Align = sizeof(Elf_Addr); + PrHdr.Index = Index++; + + // Now we do an O(n^2) loop through the segments in order to match up + // segments. 
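+  // (Illustration added for clarity: a PT_DYNAMIC segment whose file range
+  // lies inside a PT_LOAD ends up with that PT_LOAD as its ParentSegment;
+  // the pseudo-segments for the ELF header and the program header table are
+  // matched by the same calls below.)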
+ for (Segment &Child : Obj.segments()) + setParentSegment(Child); + setParentSegment(ElfHdr); + setParentSegment(PrHdr); + + return Error::success(); +} + +template +Error ELFBuilder::initGroupSection(GroupSection *GroupSec) { + if (GroupSec->Align % sizeof(ELF::Elf32_Word) != 0) + return createStringError(errc::invalid_argument, + "invalid alignment " + Twine(GroupSec->Align) + + " of group section '" + GroupSec->Name + "'"); + SectionTableRef SecTable = Obj.sections(); + if (GroupSec->Link != SHN_UNDEF) { + auto SymTab = SecTable.template getSectionOfType( + GroupSec->Link, + "link field value '" + Twine(GroupSec->Link) + "' in section '" + + GroupSec->Name + "' is invalid", + "link field value '" + Twine(GroupSec->Link) + "' in section '" + + GroupSec->Name + "' is not a symbol table"); + if (!SymTab) + return SymTab.takeError(); + + Expected Sym = (*SymTab)->getSymbolByIndex(GroupSec->Info); + if (!Sym) + return createStringError(errc::invalid_argument, + "info field value '" + Twine(GroupSec->Info) + + "' in section '" + GroupSec->Name + + "' is not a valid symbol index"); + GroupSec->setSymTab(*SymTab); + GroupSec->setSymbol(*Sym); + } + if (GroupSec->Contents.size() % sizeof(ELF::Elf32_Word) || + GroupSec->Contents.empty()) + return createStringError(errc::invalid_argument, + "the content of the section " + GroupSec->Name + + " is malformed"); + const ELF::Elf32_Word *Word = + reinterpret_cast(GroupSec->Contents.data()); + const ELF::Elf32_Word *End = + Word + GroupSec->Contents.size() / sizeof(ELF::Elf32_Word); + GroupSec->setFlagWord( + support::endian::read32(Word++)); + for (; Word != End; ++Word) { + uint32_t Index = support::endian::read32(Word); + Expected Sec = SecTable.getSection( + Index, "group member index " + Twine(Index) + " in section '" + + GroupSec->Name + "' is invalid"); + if (!Sec) + return Sec.takeError(); + + GroupSec->addMember(*Sec); + } + + return Error::success(); +} + +template +Error ELFBuilder::initSymbolTable(SymbolTableSection *SymTab) { + Expected Shdr = ElfFile.getSection(SymTab->Index); + if (!Shdr) + return Shdr.takeError(); + + Expected StrTabData = ElfFile.getStringTableForSymtab(**Shdr); + if (!StrTabData) + return StrTabData.takeError(); + + ArrayRef ShndxData; + + Expected::Elf_Sym_Range> Symbols = + ElfFile.symbols(*Shdr); + if (!Symbols) + return Symbols.takeError(); + + for (const typename ELFFile::Elf_Sym &Sym : *Symbols) { + SectionBase *DefSection = nullptr; + + Expected Name = Sym.getName(*StrTabData); + if (!Name) + return Name.takeError(); + + if (Sym.st_shndx == SHN_XINDEX) { + if (SymTab->getShndxTable() == nullptr) + return createStringError(errc::invalid_argument, + "symbol '" + *Name + + "' has index SHN_XINDEX but no " + "SHT_SYMTAB_SHNDX section exists"); + if (ShndxData.data() == nullptr) { + Expected ShndxSec = + ElfFile.getSection(SymTab->getShndxTable()->Index); + if (!ShndxSec) + return ShndxSec.takeError(); + + Expected> Data = + ElfFile.template getSectionContentsAsArray(**ShndxSec); + if (!Data) + return Data.takeError(); + + ShndxData = *Data; + if (ShndxData.size() != Symbols->size()) + return createStringError( + errc::invalid_argument, + "symbol section index table does not have the same number of " + "entries as the symbol table"); + } + Elf_Word Index = ShndxData[&Sym - Symbols->begin()]; + Expected Sec = Obj.sections().getSection( + Index, + "symbol '" + *Name + "' has invalid section index " + Twine(Index)); + if (!Sec) + return Sec.takeError(); + + DefSection = *Sec; + } else if (Sym.st_shndx >= SHN_LORESERVE) 
+{
+      if (!isValidReservedSectionIndex(Sym.st_shndx, Obj.Machine)) {
+        return createStringError(
+            errc::invalid_argument,
+            "symbol '" + *Name +
+                "' has unsupported value greater than or equal "
+                "to SHN_LORESERVE: " +
+                Twine(Sym.st_shndx));
+      }
+    } else if (Sym.st_shndx != SHN_UNDEF) {
+      Expected<SectionBase *> Sec = Obj.sections().getSection(
+          Sym.st_shndx, "symbol '" + *Name +
+                            "' is defined has invalid section index " +
+                            Twine(Sym.st_shndx));
+      if (!Sec)
+        return Sec.takeError();
+
+      DefSection = *Sec;
+    }
+
+    SymTab->addSymbol(*Name, Sym.getBinding(), Sym.getType(), DefSection,
+                      Sym.getValue(), Sym.st_other, Sym.st_shndx, Sym.st_size);
+  }
+
+  return Error::success();
+}
+
+template <class T>
+static void getAddend(uint64_t &, const Elf_Rel_Impl<T, false> &) {}
+
+template <class T>
+static void getAddend(uint64_t &ToSet, const Elf_Rel_Impl<T, true> &Rela) {
+  ToSet = Rela.r_addend;
+}
+
+template <class T>
+static Error initRelocations(RelocationSection *Relocs, T RelRange) {
+  for (const auto &Rel : RelRange) {
+    Relocation ToAdd;
+    ToAdd.Offset = Rel.r_offset;
+    getAddend(ToAdd.Addend, Rel);
+    ToAdd.Type = Rel.getType(Relocs->getObject().IsMips64EL);
+
+    if (uint32_t Sym = Rel.getSymbol(Relocs->getObject().IsMips64EL)) {
+      if (!Relocs->getObject().SymbolTable)
+        return createStringError(
+            errc::invalid_argument,
+            "'" + Relocs->Name + "': relocation references symbol with index " +
+                Twine(Sym) + ", but there is no symbol table");
+      Expected<Symbol *> SymByIndex =
+          Relocs->getObject().SymbolTable->getSymbolByIndex(Sym);
+      if (!SymByIndex)
+        return SymByIndex.takeError();
+
+      ToAdd.RelocSymbol = *SymByIndex;
+    }
+
+    Relocs->addRelocation(ToAdd);
+  }
+
+  return Error::success();
+}
+
+Expected<SectionBase *> SectionTableRef::getSection(uint32_t Index,
+                                                    Twine ErrMsg) {
+  if (Index == SHN_UNDEF || Index > Sections.size())
+    return createStringError(errc::invalid_argument, ErrMsg);
+  return Sections[Index - 1].get();
+}
+
+template <class T>
+Expected<T *> SectionTableRef::getSectionOfType(uint32_t Index,
+                                                Twine IndexErrMsg,
+                                                Twine TypeErrMsg) {
+  Expected<SectionBase *> BaseSec = getSection(Index, IndexErrMsg);
+  if (!BaseSec)
+    return BaseSec.takeError();
+
+  if (T *Sec = dyn_cast<T>(*BaseSec))
+    return Sec;
+
+  return createStringError(errc::invalid_argument, TypeErrMsg);
+}
+
+template <class ELFT>
+Expected<SectionBase &> ELFBuilder<ELFT>::makeSection(const Elf_Shdr &Shdr) {
+  switch (Shdr.sh_type) {
+  case SHT_REL:
+  case SHT_RELA:
+    if (Shdr.sh_flags & SHF_ALLOC) {
+      if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+        return Obj.addSection<DynamicRelocationSection>(*Data);
+      else
+        return Data.takeError();
+    }
+    return Obj.addSection<RelocationSection>(Obj);
+  case SHT_STRTAB:
+    // If a string table is allocated we don't want to mess with it. That would
+    // mean altering the memory image. There are no special link types or
+    // anything so we can just use a Section.
+    if (Shdr.sh_flags & SHF_ALLOC) {
+      if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+        return Obj.addSection<Section>(*Data);
+      else
+        return Data.takeError();
+    }
+    return Obj.addSection<StringTableSection>();
+  case SHT_HASH:
+  case SHT_GNU_HASH:
+    // Hash tables should refer to SHT_DYNSYM which we're not going to change.
+    // Because of this we don't need to mess with the hash tables either.
+    if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+      return Obj.addSection<Section>(*Data);
+    else
+      return Data.takeError();
+  case SHT_GROUP:
+    if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+      return Obj.addSection<GroupSection>(*Data);
+    else
+      return Data.takeError();
+  case SHT_DYNSYM:
+    if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+      return Obj.addSection<DynamicSymbolTableSection>(*Data);
+    else
+      return Data.takeError();
+  case SHT_DYNAMIC:
+    if (Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr))
+      return Obj.addSection<DynamicSection>(*Data);
+    else
+      return Data.takeError();
+  case SHT_SYMTAB: {
+    auto &SymTab = Obj.addSection<SymbolTableSection>();
+    Obj.SymbolTable = &SymTab;
+    return SymTab;
+  }
+  case SHT_SYMTAB_SHNDX: {
+    auto &ShndxSection = Obj.addSection<SectionIndexSection>();
+    Obj.SectionIndexTable = &ShndxSection;
+    return ShndxSection;
+  }
+  case SHT_NOBITS:
+    return Obj.addSection<Section>(ArrayRef<uint8_t>());
+  default: {
+    Expected<ArrayRef<uint8_t>> Data = ElfFile.getSectionContents(Shdr);
+    if (!Data)
+      return Data.takeError();
+
+    Expected<StringRef> Name = ElfFile.getSectionName(Shdr);
+    if (!Name)
+      return Name.takeError();
+
+    if (Name->startswith(".zdebug") || (Shdr.sh_flags & ELF::SHF_COMPRESSED)) {
+      uint64_t DecompressedSize, DecompressedAlign;
+      std::tie(DecompressedSize, DecompressedAlign) =
+          getDecompressedSizeAndAlignment<ELFT>(*Data);
+      return Obj.addSection<CompressedSection>(
+          CompressedSection(*Data, DecompressedSize, DecompressedAlign));
+    }
+
+    return Obj.addSection<Section>(*Data);
+  }
+  }
+}
+
+template <class ELFT> Error ELFBuilder<ELFT>::readSectionHeaders() {
+  uint32_t Index = 0;
+  Expected<typename ELFFile<ELFT>::Elf_Shdr_Range> Sections =
+      ElfFile.sections();
+  if (!Sections)
+    return Sections.takeError();
+
+  for (const typename ELFFile<ELFT>::Elf_Shdr &Shdr : *Sections) {
+    if (Index == 0) {
+      ++Index;
+      continue;
+    }
+    Expected<SectionBase &> Sec = makeSection(Shdr);
+    if (!Sec)
+      return Sec.takeError();
+
+    Expected<StringRef> SecName = ElfFile.getSectionName(Shdr);
+    if (!SecName)
+      return SecName.takeError();
+    Sec->Name = SecName->str();
+    Sec->Type = Sec->OriginalType = Shdr.sh_type;
+    Sec->Flags = Sec->OriginalFlags = Shdr.sh_flags;
+    Sec->Addr = Shdr.sh_addr;
+    Sec->Offset = Shdr.sh_offset;
+    Sec->OriginalOffset = Shdr.sh_offset;
+    Sec->Size = Shdr.sh_size;
+    Sec->Link = Shdr.sh_link;
+    Sec->Info = Shdr.sh_info;
+    Sec->Align = Shdr.sh_addralign;
+    Sec->EntrySize = Shdr.sh_entsize;
+    Sec->Index = Index++;
+    Sec->OriginalIndex = Sec->Index;
+    Sec->OriginalData = ArrayRef<uint8_t>(
+        ElfFile.base() + Shdr.sh_offset,
+        (Shdr.sh_type == SHT_NOBITS) ? (size_t)0 : Shdr.sh_size);
+  }
+
+  return Error::success();
+}
+
+template <class ELFT> Error ELFBuilder<ELFT>::readSections(bool EnsureSymtab) {
+  uint32_t ShstrIndex = ElfFile.getHeader().e_shstrndx;
+  if (ShstrIndex == SHN_XINDEX) {
+    Expected<const Elf_Shdr *> Sec = ElfFile.getSection(0);
+    if (!Sec)
+      return Sec.takeError();
+
+    ShstrIndex = (*Sec)->sh_link;
+  }
+
+  if (ShstrIndex == SHN_UNDEF)
+    Obj.HadShdrs = false;
+  else {
+    Expected<StringTableSection *> Sec =
+        Obj.sections().template getSectionOfType<StringTableSection>(
+            ShstrIndex,
+            "e_shstrndx field value " + Twine(ShstrIndex) + " in elf header " +
+                " is invalid",
+            "e_shstrndx field value " + Twine(ShstrIndex) + " in elf header " +
+                " does not reference a string table");
+    if (!Sec)
+      return Sec.takeError();
+
+    Obj.SectionNames = *Sec;
+  }
+
+  // If a section index table exists we'll need to initialize it before we
+  // initialize the symbol table because the symbol table might need to
+  // reference it.
+  if (Obj.SectionIndexTable)
+    if (Error Err = Obj.SectionIndexTable->initialize(Obj.sections()))
+      return Err;
+
+  // Now that all of the sections have been added we can fill out some extra
+  // details about symbol tables. We need the symbol table filled out before
+  // any relocations.
+  if (Obj.SymbolTable) {
+    if (Error Err = Obj.SymbolTable->initialize(Obj.sections()))
+      return Err;
+    if (Error Err = initSymbolTable(Obj.SymbolTable))
+      return Err;
+  } else if (EnsureSymtab) {
+    if (Error Err = Obj.addNewSymbolTable())
+      return Err;
+  }
+
+  // Now that all sections and symbols have been added we can add
+  // relocations that reference symbols and set the link and info fields for
+  // relocation sections.
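+  // (Illustration added for clarity: for a ".rela.text" section, sh_link is
+  // set to the index of the symbol table it uses and sh_info to the index of
+  // ".text"; both fields are recomputed from the live section indexes during
+  // finalize().)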
+ for (SectionBase &Sec : Obj.sections()) { + if (&Sec == Obj.SymbolTable) + continue; + if (Error Err = Sec.initialize(Obj.sections())) + return Err; + if (auto RelSec = dyn_cast(&Sec)) { + Expected::Elf_Shdr_Range> Sections = + ElfFile.sections(); + if (!Sections) + return Sections.takeError(); + + const typename ELFFile::Elf_Shdr *Shdr = + Sections->begin() + RelSec->Index; + if (RelSec->Type == SHT_REL) { + Expected::Elf_Rel_Range> Rels = + ElfFile.rels(*Shdr); + if (!Rels) + return Rels.takeError(); + + if (Error Err = initRelocations(RelSec, *Rels)) + return Err; + } else { + Expected::Elf_Rela_Range> Relas = + ElfFile.relas(*Shdr); + if (!Relas) + return Relas.takeError(); + + if (Error Err = initRelocations(RelSec, *Relas)) + return Err; + } + } else if (auto GroupSec = dyn_cast(&Sec)) { + if (Error Err = initGroupSection(GroupSec)) + return Err; + } + } + + return Error::success(); +} + +template Error ELFBuilder::build(bool EnsureSymtab) { + if (Error E = readSectionHeaders()) + return E; + if (Error E = findEhdrOffset()) + return E; + + // The ELFFile whose ELF headers and program headers are copied into the + // output file. Normally the same as ElfFile, but if we're extracting a + // loadable partition it will point to the partition's headers. + Expected> HeadersFile = ELFFile::create(toStringRef( + {ElfFile.base() + EhdrOffset, ElfFile.getBufSize() - EhdrOffset})); + if (!HeadersFile) + return HeadersFile.takeError(); + + const typename ELFFile::Elf_Ehdr &Ehdr = HeadersFile->getHeader(); + Obj.OSABI = Ehdr.e_ident[EI_OSABI]; + Obj.ABIVersion = Ehdr.e_ident[EI_ABIVERSION]; + Obj.Type = Ehdr.e_type; + Obj.Machine = Ehdr.e_machine; + Obj.Version = Ehdr.e_version; + Obj.Entry = Ehdr.e_entry; + Obj.Flags = Ehdr.e_flags; + + if (Error E = readSections(EnsureSymtab)) + return E; + return readProgramHeaders(*HeadersFile); +} + +Writer::~Writer() = default; + +Reader::~Reader() = default; + +Expected> +BinaryReader::create(bool /*EnsureSymtab*/) const { + return BinaryELFBuilder(MemBuf, NewSymbolVisibility).build(); +} + +Expected> IHexReader::parse() const { + SmallVector Lines; + std::vector Records; + bool HasSections = false; + + MemBuf->getBuffer().split(Lines, '\n'); + Records.reserve(Lines.size()); + for (size_t LineNo = 1; LineNo <= Lines.size(); ++LineNo) { + StringRef Line = Lines[LineNo - 1].trim(); + if (Line.empty()) + continue; + + Expected R = IHexRecord::parse(Line); + if (!R) + return parseError(LineNo, R.takeError()); + if (R->Type == IHexRecord::EndOfFile) + break; + HasSections |= (R->Type == IHexRecord::Data); + Records.push_back(*R); + } + if (!HasSections) + return parseError(-1U, "no sections"); + + return std::move(Records); +} + +Expected> +IHexReader::create(bool /*EnsureSymtab*/) const { + Expected> Records = parse(); + if (!Records) + return Records.takeError(); + + return IHexELFBuilder(*Records).build(); +} + +Expected> ELFReader::create(bool EnsureSymtab) const { + auto Obj = std::make_unique(); + if (auto *O = dyn_cast>(Bin)) { + ELFBuilder Builder(*O, *Obj, ExtractPartition); + if (Error Err = Builder.build(EnsureSymtab)) + return std::move(Err); + return std::move(Obj); + } else if (auto *O = dyn_cast>(Bin)) { + ELFBuilder Builder(*O, *Obj, ExtractPartition); + if (Error Err = Builder.build(EnsureSymtab)) + return std::move(Err); + return std::move(Obj); + } else if (auto *O = dyn_cast>(Bin)) { + ELFBuilder Builder(*O, *Obj, ExtractPartition); + if (Error Err = Builder.build(EnsureSymtab)) + return std::move(Err); + return std::move(Obj); + } else if 
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
+    ELFBuilder<ELF64BE> Builder(*O, *Obj, ExtractPartition);
+    if (Error Err = Builder.build(EnsureSymtab))
+      return std::move(Err);
+    return std::move(Obj);
+  }
+  return createStringError(errc::invalid_argument, "invalid file type");
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writeEhdr() {
+  Elf_Ehdr &Ehdr = *reinterpret_cast<Elf_Ehdr *>(Buf->getBufferStart());
+  std::fill(Ehdr.e_ident, Ehdr.e_ident + 16, 0);
+  Ehdr.e_ident[EI_MAG0] = 0x7f;
+  Ehdr.e_ident[EI_MAG1] = 'E';
+  Ehdr.e_ident[EI_MAG2] = 'L';
+  Ehdr.e_ident[EI_MAG3] = 'F';
+  Ehdr.e_ident[EI_CLASS] = ELFT::Is64Bits ? ELFCLASS64 : ELFCLASS32;
+  Ehdr.e_ident[EI_DATA] =
+      ELFT::TargetEndianness == support::big ? ELFDATA2MSB : ELFDATA2LSB;
+  Ehdr.e_ident[EI_VERSION] = EV_CURRENT;
+  Ehdr.e_ident[EI_OSABI] = Obj.OSABI;
+  Ehdr.e_ident[EI_ABIVERSION] = Obj.ABIVersion;
+
+  Ehdr.e_type = Obj.Type;
+  Ehdr.e_machine = Obj.Machine;
+  Ehdr.e_version = Obj.Version;
+  Ehdr.e_entry = Obj.Entry;
+  // We have to use the fully-qualified name llvm::size
+  // since some compilers complain on ambiguous resolution.
+  Ehdr.e_phnum = llvm::size(Obj.segments());
+  Ehdr.e_phoff = (Ehdr.e_phnum != 0) ? Obj.ProgramHdrSegment.Offset : 0;
+  Ehdr.e_phentsize = (Ehdr.e_phnum != 0) ? sizeof(Elf_Phdr) : 0;
+  Ehdr.e_flags = Obj.Flags;
+  Ehdr.e_ehsize = sizeof(Elf_Ehdr);
+  if (WriteSectionHeaders && Obj.sections().size() != 0) {
+    Ehdr.e_shentsize = sizeof(Elf_Shdr);
+    Ehdr.e_shoff = Obj.SHOff;
+    // """
+    // If the number of sections is greater than or equal to
+    // SHN_LORESERVE (0xff00), this member has the value zero and the actual
+    // number of section header table entries is contained in the sh_size field
+    // of the section header at index 0.
+    // """
+    auto Shnum = Obj.sections().size() + 1;
+    if (Shnum >= SHN_LORESERVE)
+      Ehdr.e_shnum = 0;
+    else
+      Ehdr.e_shnum = Shnum;
+    // """
+    // If the section name string table section index is greater than or equal
+    // to SHN_LORESERVE (0xff00), this member has the value SHN_XINDEX (0xffff)
+    // and the actual index of the section name string table section is
+    // contained in the sh_link field of the section header at index 0.
+    // """
+    if (Obj.SectionNames->Index >= SHN_LORESERVE)
+      Ehdr.e_shstrndx = SHN_XINDEX;
+    else
+      Ehdr.e_shstrndx = Obj.SectionNames->Index;
+  } else {
+    Ehdr.e_shentsize = 0;
+    Ehdr.e_shoff = 0;
+    Ehdr.e_shnum = 0;
+    Ehdr.e_shstrndx = 0;
+  }
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writePhdrs() {
+  for (auto &Seg : Obj.segments())
+    writePhdr(Seg);
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writeShdrs() {
+  // This reference serves to write the dummy section header at the beginning
+  // of the file. It is not used for anything else.
+  Elf_Shdr &Shdr =
+      *reinterpret_cast<Elf_Shdr *>(Buf->getBufferStart() + Obj.SHOff);
+  Shdr.sh_name = 0;
+  Shdr.sh_type = SHT_NULL;
+  Shdr.sh_flags = 0;
+  Shdr.sh_addr = 0;
+  Shdr.sh_offset = 0;
+  // See writeEhdr for why we do this.
+  uint64_t Shnum = Obj.sections().size() + 1;
+  if (Shnum >= SHN_LORESERVE)
+    Shdr.sh_size = Shnum;
+  else
+    Shdr.sh_size = 0;
+  // See writeEhdr for why we do this.
+  if (Obj.SectionNames != nullptr && Obj.SectionNames->Index >= SHN_LORESERVE)
+    Shdr.sh_link = Obj.SectionNames->Index;
+  else
+    Shdr.sh_link = 0;
+  Shdr.sh_info = 0;
+  Shdr.sh_addralign = 0;
+  Shdr.sh_entsize = 0;
+
+  for (SectionBase &Sec : Obj.sections())
+    writeShdr(Sec);
+}
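+// A worked example of the SHN_LORESERVE escape hatch used by writeEhdr and
+// writeShdrs above (illustrative numbers, not taken from the change itself):
+// for an object with 70000 sections, Shnum = 70001 >= SHN_LORESERVE (0xff00 =
+// 65280), so the count no longer fits in the 16-bit e_shnum field:
+//
+//   Ehdr.e_shnum    = 0;      // sentinel: consult the null section header
+//   Shdr0.sh_size   = 70001;  // real count, including the null header
+//   Ehdr.e_shstrndx = SHN_XINDEX (0xffff) if .shstrtab's index >= 0xff00,
+//   Shdr0.sh_link   = the real .shstrtab index in that case.
+//
+// Readers (e.g. llvm-readelf) are expected to fall back to the section
+// header at index 0 whenever they see these sentinel values.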
+template <class ELFT> Error ELFWriter<ELFT>::writeSectionData() {
+  for (SectionBase &Sec : Obj.sections())
+    // Segments are responsible for writing their contents, so only write the
+    // section data if the section is not in a segment. Note that this renders
+    // sections in segments effectively immutable.
+    if (Sec.ParentSegment == nullptr)
+      if (Error Err = Sec.accept(*SecWriter))
+        return Err;
+
+  return Error::success();
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writeSegmentData() {
+  for (Segment &Seg : Obj.segments()) {
+    size_t Size = std::min<size_t>(Seg.FileSize, Seg.getContents().size());
+    std::memcpy(Buf->getBufferStart() + Seg.Offset, Seg.getContents().data(),
+                Size);
+  }
+
+  for (auto it : Obj.getUpdatedSections()) {
+    SectionBase *Sec = it.first;
+    ArrayRef<uint8_t> Data = it.second;
+
+    auto *Parent = Sec->ParentSegment;
+    assert(Parent && "This section should've been part of a segment.");
+    uint64_t Offset =
+        Sec->OriginalOffset - Parent->OriginalOffset + Parent->Offset;
+    llvm::copy(Data, Buf->getBufferStart() + Offset);
+  }
+
+  // Iterate over removed sections and overwrite their old data with zeroes.
+  for (auto &Sec : Obj.removedSections()) {
+    Segment *Parent = Sec.ParentSegment;
+    if (Parent == nullptr || Sec.Type == SHT_NOBITS || Sec.Size == 0)
+      continue;
+    uint64_t Offset =
+        Sec.OriginalOffset - Parent->OriginalOffset + Parent->Offset;
+    std::memset(Buf->getBufferStart() + Offset, 0, Sec.Size);
+  }
+}
+
+template <class ELFT>
+ELFWriter<ELFT>::ELFWriter(Object &Obj, raw_ostream &Buf, bool WSH,
+                           bool OnlyKeepDebug)
+    : Writer(Obj, Buf), WriteSectionHeaders(WSH && Obj.HadShdrs),
+      OnlyKeepDebug(OnlyKeepDebug) {}
+
+Error Object::updateSection(StringRef Name, ArrayRef<uint8_t> Data) {
+  auto It = llvm::find_if(Sections,
+                          [&](const SecPtr &Sec) { return Sec->Name == Name; });
+  if (It == Sections.end())
+    return createStringError(errc::invalid_argument, "section '%s' not found",
+                             Name.str().c_str());
+
+  auto *OldSec = It->get();
+  if (!OldSec->hasContents())
+    return createStringError(
+        errc::invalid_argument,
+        "section '%s' cannot be updated because it does not have contents",
+        Name.str().c_str());
+
+  if (Data.size() > OldSec->Size && OldSec->ParentSegment)
+    return createStringError(errc::invalid_argument,
+                             "cannot fit data of size %zu into section '%s' "
+                             "with size %zu that is part of a segment",
+                             Data.size(), Name.str().c_str(), OldSec->Size);
+
+  if (!OldSec->ParentSegment) {
+    *It = std::make_unique<OwnedDataSection>(*OldSec, Data);
+  } else {
+    // The segment writer will be in charge of updating these contents.
+    OldSec->Size = Data.size();
+    UpdatedSections[OldSec] = Data;
+  }
+
+  return Error::success();
+}
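+// A minimal usage sketch of Object::updateSection (hypothetical caller and
+// section name; the error handling follows the llvm::Error conventions used
+// throughout this file):
+//
+//   ArrayRef<uint8_t> NewData = ...; // replacement contents
+//   if (Error E = Obj.updateSection(".text.patch", NewData))
+//     return E; // section missing, has no contents, or won't fit in segment
+//
+// For a section inside a segment the new bytes are later written by
+// writeSegmentData() at OriginalOffset - Parent->OriginalOffset +
+// Parent->Offset, i.e. at the section's original position relative to its
+// containing segment.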
+Error Object::removeSections(
+    bool AllowBrokenLinks, std::function<bool(const SectionBase &)> ToRemove) {
+
+  auto Iter = std::stable_partition(
+      std::begin(Sections), std::end(Sections), [=](const SecPtr &Sec) {
+        if (ToRemove(*Sec))
+          return false;
+        if (auto RelSec = dyn_cast<RelocationSectionBase>(Sec.get())) {
+          if (auto ToRelSec = RelSec->getSection())
+            return !ToRemove(*ToRelSec);
+        }
+        return true;
+      });
+  if (SymbolTable != nullptr && ToRemove(*SymbolTable))
+    SymbolTable = nullptr;
+  if (SectionNames != nullptr && ToRemove(*SectionNames))
+    SectionNames = nullptr;
+  if (SectionIndexTable != nullptr && ToRemove(*SectionIndexTable))
+    SectionIndexTable = nullptr;
+  // Now make sure there are no remaining references to the sections that will
+  // be removed. Sometimes it is impossible to remove a reference, so we emit
+  // an error here instead.
+  std::unordered_set<const SectionBase *> RemoveSections;
+  RemoveSections.reserve(std::distance(Iter, std::end(Sections)));
+  for (auto &RemoveSec : make_range(Iter, std::end(Sections))) {
+    for (auto &Segment : Segments)
+      Segment->removeSection(RemoveSec.get());
+    RemoveSec->onRemove();
+    RemoveSections.insert(RemoveSec.get());
+  }
+
+  // For each section that remains alive, we want to remove the dead
+  // references. This either might update the content of the section (e.g.
+  // remove symbols from a symbol table that belong to a removed section) or
+  // trigger an error if a live section critically depends on a section being
+  // removed somehow (e.g. the removed section is referenced by a relocation).
+  for (auto &KeepSec : make_range(std::begin(Sections), Iter)) {
+    if (Error E = KeepSec->removeSectionReferences(
+            AllowBrokenLinks, [&RemoveSections](const SectionBase *Sec) {
+              return RemoveSections.find(Sec) != RemoveSections.end();
+            }))
+      return E;
+  }
+
+  // Transfer removed sections into the Object RemovedSections container for
+  // use later.
+  std::move(Iter, Sections.end(), std::back_inserter(RemovedSections));
+  // Now finally get rid of them all together.
+  Sections.erase(Iter, std::end(Sections));
+  return Error::success();
+}
+
+Error Object::replaceSections(
+    const DenseMap<SectionBase *, SectionBase *> &FromTo) {
+  auto SectionIndexLess = [](const SecPtr &Lhs, const SecPtr &Rhs) {
+    return Lhs->Index < Rhs->Index;
+  };
+  assert(llvm::is_sorted(Sections, SectionIndexLess) &&
+         "Sections are expected to be sorted by Index");
+  // Set indices of new sections so that they can be later sorted into
+  // positions of removed ones.
+  for (auto &I : FromTo)
+    I.second->Index = I.first->Index;
+
+  // Notify all sections about the replacement.
+  for (auto &Sec : Sections)
+    Sec->replaceSectionReferences(FromTo);
+
+  if (Error E = removeSections(
+          /*AllowBrokenLinks=*/false,
+          [=](const SectionBase &Sec) { return FromTo.count(&Sec) > 0; }))
+    return E;
+  llvm::sort(Sections, SectionIndexLess);
+  return Error::success();
+}
+
+Error Object::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
+  if (SymbolTable)
+    for (const SecPtr &Sec : Sections)
+      if (Error E = Sec->removeSymbols(ToRemove))
+        return E;
+  return Error::success();
+}
+
+Error Object::addNewSymbolTable() {
+  assert(!SymbolTable && "Object must not have a SymbolTable.");
+
+  // Reuse an existing SHT_STRTAB section if it exists.
+  StringTableSection *StrTab = nullptr;
+  for (SectionBase &Sec : sections()) {
+    if (Sec.Type == ELF::SHT_STRTAB && !(Sec.Flags & SHF_ALLOC)) {
+      StrTab = static_cast<StringTableSection *>(&Sec);
+
+      // Prefer a string table that is not the section header string table, if
+      // such a table exists.
+      if (SectionNames != &Sec)
+        break;
+    }
+  }
+  if (!StrTab)
+    StrTab = &addSection<StringTableSection>();
+
+  SymbolTableSection &SymTab = addSection<SymbolTableSection>();
+  SymTab.Name = ".symtab";
+  SymTab.Link = StrTab->Index;
+  if (Error Err = SymTab.initialize(sections()))
+    return Err;
+  SymTab.addSymbol("", 0, 0, nullptr, 0, 0, 0, 0);
+
+  SymbolTable = &SymTab;
+
+  return Error::success();
+}
+
+// Orders segments such that if x = y->ParentSegment then y comes before x.
+static void orderSegments(std::vector<Segment *> &Segments) {
+  llvm::stable_sort(Segments, compareSegmentsByOffset);
+}
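+// A minimal usage sketch of Object::removeSections (hypothetical predicate;
+// the real callers live in ELFObjcopy.cpp): drop every .note* section while
+// refusing to leave dangling references behind:
+//
+//   if (Error E = Obj.removeSections(
+//           /*AllowBrokenLinks=*/false, [](const SectionBase &Sec) {
+//             return StringRef(Sec.Name).startswith(".note");
+//           }))
+//     return E;
+//
+// The stable_partition above keeps the surviving sections in their original
+// relative order at the front of Sections, so the iterator Iter remains a
+// valid split point for the transfer into RemovedSections.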
+// This function finds a consistent layout for a list of segments starting
+// from an Offset. It assumes that Segments have been sorted by orderSegments
+// and returns an Offset one past the end of the last segment.
+static uint64_t layoutSegments(std::vector<Segment *> &Segments,
+                               uint64_t Offset) {
+  assert(llvm::is_sorted(Segments, compareSegmentsByOffset));
+  // The only way a segment should move is if a section was between two
+  // segments and that section was removed. If that section isn't in a segment
+  // then it's acceptable, but not ideal, to simply move it to after the
+  // segments. So we can simply layout segments one after the other accounting
+  // for alignment.
+  for (Segment *Seg : Segments) {
+    // We assume that segments have been ordered by OriginalOffset and Index
+    // such that a parent segment will always come before a child segment in
+    // OrderedSegments. This means that the Offset of the ParentSegment should
+    // already be set and we can set our offset relative to it.
+    if (Seg->ParentSegment != nullptr) {
+      Segment *Parent = Seg->ParentSegment;
+      Seg->Offset =
+          Parent->Offset + Seg->OriginalOffset - Parent->OriginalOffset;
+    } else {
+      Seg->Offset =
+          alignTo(Offset, std::max<uint64_t>(Seg->Align, 1), Seg->VAddr);
+    }
+    Offset = std::max(Offset, Seg->Offset + Seg->FileSize);
+  }
+  return Offset;
+}
+
+// This function finds a consistent layout for a list of sections. It assumes
+// that the ->ParentSegment of each section has already been laid out. The
+// supplied starting Offset is used for the starting offset of any section
+// that does not have a ParentSegment. It returns either the offset given if
+// all sections had a ParentSegment or an offset one past the last section if
+// there was a section that didn't have a ParentSegment.
+template <class Range>
+static uint64_t layoutSections(Range Sections, uint64_t Offset) {
+  // Now that the offset of every segment has been set we can assign the
+  // offsets of each section. For sections that are covered by a segment we
+  // should use the segment's original offset and the section's original
+  // offset to compute the offset from the start of the segment. Using the
+  // offset from the start of the segment we can assign a new offset to the
+  // section. For sections not covered by segments we can just bump Offset to
+  // the next valid location. While it is not necessary, lay out the sections
+  // in the order based on their original offsets to resemble the input file
+  // as closely as possible.
+  std::vector<SectionBase *> OutOfSegmentSections;
+  uint32_t Index = 1;
+  for (auto &Sec : Sections) {
+    Sec.Index = Index++;
+    if (Sec.ParentSegment != nullptr) {
+      auto Segment = *Sec.ParentSegment;
+      Sec.Offset =
+          Segment.Offset + (Sec.OriginalOffset - Segment.OriginalOffset);
+    } else
+      OutOfSegmentSections.push_back(&Sec);
+  }
+
+  llvm::stable_sort(OutOfSegmentSections,
+                    [](const SectionBase *Lhs, const SectionBase *Rhs) {
+                      return Lhs->OriginalOffset < Rhs->OriginalOffset;
+                    });
+  for (auto *Sec : OutOfSegmentSections) {
+    Offset = alignTo(Offset, Sec->Align == 0 ? 1 : Sec->Align);
+    Sec->Offset = Offset;
+    if (Sec->Type != SHT_NOBITS)
+      Offset += Sec->Size;
+  }
+  return Offset;
+}
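+// A worked example of the three-argument alignTo used above (illustrative
+// numbers only): alignTo(Offset, Align, Addr) returns the smallest value
+// >= Offset that is congruent to Addr modulo Align. With Offset = 0x1234,
+// Seg->Align = 0x1000 and Seg->VAddr = 0x10400:
+//
+//   Addr % Align = 0x400
+//   result       = 0x1400  // smallest value >= 0x1234 whose low bits are 0x400
+//
+// This keeps p_offset and p_vaddr congruent modulo the page size, which is
+// what allows the loader to mmap the segment directly from the file.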
+// Rewrite sh_offset after some sections are changed to SHT_NOBITS and thus
+// occupy no space in the file.
+static uint64_t layoutSectionsForOnlyKeepDebug(Object &Obj, uint64_t Off) {
+  // The layout algorithm requires the sections to be handled in the order of
+  // their offsets in the input file, at least inside segments.
+  std::vector<SectionBase *> Sections;
+  Sections.reserve(Obj.sections().size());
+  uint32_t Index = 1;
+  for (auto &Sec : Obj.sections()) {
+    Sec.Index = Index++;
+    Sections.push_back(&Sec);
+  }
+  llvm::stable_sort(Sections,
+                    [](const SectionBase *Lhs, const SectionBase *Rhs) {
+                      return Lhs->OriginalOffset < Rhs->OriginalOffset;
+                    });
+
+  for (auto *Sec : Sections) {
+    auto *FirstSec = Sec->ParentSegment && Sec->ParentSegment->Type == PT_LOAD
+                         ? Sec->ParentSegment->firstSection()
+                         : nullptr;
+
+    // The first section in a PT_LOAD has to have congruent offset and address
+    // modulo the alignment, which usually equals the maximum page size.
+    if (FirstSec && FirstSec == Sec)
+      Off = alignTo(Off, Sec->ParentSegment->Align, Sec->Addr);
+
+    // sh_offset is not significant for SHT_NOBITS sections, but the
+    // congruence rule must be followed if it is the first section in a
+    // PT_LOAD. Do not advance Off.
+    if (Sec->Type == SHT_NOBITS) {
+      Sec->Offset = Off;
+      continue;
+    }
+
+    if (!FirstSec) {
+      // FirstSec being nullptr generally means that Sec does not have the
+      // SHF_ALLOC flag.
+      Off = Sec->Align ? alignTo(Off, Sec->Align) : Off;
+    } else if (FirstSec != Sec) {
+      // The offset is relative to the first section in the PT_LOAD segment.
+      // Use sh_offset for non-SHF_ALLOC sections.
+      Off = Sec->OriginalOffset - FirstSec->OriginalOffset + FirstSec->Offset;
+    }
+    Sec->Offset = Off;
+    Off += Sec->Size;
+  }
+  return Off;
+}
+
+// Rewrite p_offset and p_filesz of non-PT_PHDR segments after sh_offset
+// values have been updated.
+static uint64_t layoutSegmentsForOnlyKeepDebug(std::vector<Segment *> &Segments,
+                                               uint64_t HdrEnd) {
+  uint64_t MaxOffset = 0;
+  for (Segment *Seg : Segments) {
+    if (Seg->Type == PT_PHDR)
+      continue;
+
+    // The segment offset is generally the offset of the first section.
+    //
+    // For a segment containing no section (see sectionWithinSegment), if it
+    // has a parent segment, copy the parent segment's offset field. This
+    // works for an empty PT_TLS. If there is no parent segment, use 0: the
+    // segment is not useful for debugging anyway.
+    const SectionBase *FirstSec = Seg->firstSection();
+    uint64_t Offset =
+        FirstSec ? FirstSec->Offset
+                 : (Seg->ParentSegment ? Seg->ParentSegment->Offset : 0);
+    uint64_t FileSize = 0;
+    for (const SectionBase *Sec : Seg->Sections) {
+      uint64_t Size = Sec->Type == SHT_NOBITS ? 0 : Sec->Size;
+      if (Sec->Offset + Size > Offset)
+        FileSize = std::max(FileSize, Sec->Offset + Size - Offset);
+    }
+
+    // If the segment includes EHDR and program headers, don't make it smaller
+    // than the headers.
+    if (Seg->Offset < HdrEnd && HdrEnd <= Seg->Offset + Seg->FileSize) {
+      FileSize += Offset - Seg->Offset;
+      Offset = Seg->Offset;
+      FileSize = std::max(FileSize, HdrEnd - Offset);
+    }
+
+    Seg->Offset = Offset;
+    Seg->FileSize = FileSize;
+    MaxOffset = std::max(MaxOffset, Offset + FileSize);
+  }
+  return MaxOffset;
+}
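+// A worked sketch of what the two --only-keep-debug passes above accomplish
+// (illustrative layout, not taken from the change itself): suppose a PT_LOAD
+// with Align = 0x1000 originally holds .text at sh_offset 0x1000 and sh_addr
+// 0x401000. After .text is converted to SHT_NOBITS, the section pass assigns
+// it an sh_offset without advancing Off, and the segment pass then shrinks
+// p_filesz to cover only sections that still occupy file space, while
+// preserving p_offset % 0x1000 == p_vaddr % 0x1000 so the stripped file
+// remains a valid, loadable-looking ELF for debuggers.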
+template <class ELFT> void ELFWriter<ELFT>::initEhdrSegment() {
+  Segment &ElfHdr = Obj.ElfHdrSegment;
+  ElfHdr.Type = PT_PHDR;
+  ElfHdr.Flags = 0;
+  ElfHdr.VAddr = 0;
+  ElfHdr.PAddr = 0;
+  ElfHdr.FileSize = ElfHdr.MemSize = sizeof(Elf_Ehdr);
+  ElfHdr.Align = 0;
+}
+
+template <class ELFT> void ELFWriter<ELFT>::assignOffsets() {
+  // We need a temporary list of segments that has a special order to it
+  // so that we know that anytime ->ParentSegment is set that segment has
+  // already had its offset properly set.
+  std::vector<Segment *> OrderedSegments;
+  for (Segment &Segment : Obj.segments())
+    OrderedSegments.push_back(&Segment);
+  OrderedSegments.push_back(&Obj.ElfHdrSegment);
+  OrderedSegments.push_back(&Obj.ProgramHdrSegment);
+  orderSegments(OrderedSegments);
+
+  uint64_t Offset;
+  if (OnlyKeepDebug) {
+    // For --only-keep-debug, the sections that did not preserve contents were
+    // changed to SHT_NOBITS. We now rewrite sh_offset fields of sections, and
+    // then rewrite p_offset/p_filesz of program headers.
+    uint64_t HdrEnd =
+        sizeof(Elf_Ehdr) + llvm::size(Obj.segments()) * sizeof(Elf_Phdr);
+    Offset = layoutSectionsForOnlyKeepDebug(Obj, HdrEnd);
+    Offset = std::max(Offset,
+                      layoutSegmentsForOnlyKeepDebug(OrderedSegments, HdrEnd));
+  } else {
+    // Offset is used as the start offset of the first segment to be laid out.
+    // Since the ELF Header (ElfHdrSegment) must be at the start of the file,
+    // we start at offset 0.
+    Offset = layoutSegments(OrderedSegments, 0);
+    Offset = layoutSections(Obj.sections(), Offset);
+  }
+  // If we need to write the section header table out then we need to align
+  // the Offset so that SHOffset is valid.
+  if (WriteSectionHeaders)
+    Offset = alignTo(Offset, sizeof(Elf_Addr));
+  Obj.SHOff = Offset;
+}
+
+template <class ELFT> size_t ELFWriter<ELFT>::totalSize() const {
+  // We already have the section header offset so we can calculate the total
+  // size by just adding up the size of each section header.
+  if (!WriteSectionHeaders)
+    return Obj.SHOff;
+  size_t ShdrCount = Obj.sections().size() + 1; // Includes null shdr.
+  return Obj.SHOff + ShdrCount * sizeof(Elf_Shdr);
+}
+
+template <class ELFT> Error ELFWriter<ELFT>::write() {
+  // Segment data must be written first, so that the ELF header and program
+  // header tables can overwrite it, if covered by a segment.
+  writeSegmentData();
+  writeEhdr();
+  writePhdrs();
+  if (Error E = writeSectionData())
+    return E;
+  if (WriteSectionHeaders)
+    writeShdrs();
+
+  // TODO: Implement direct writing to the output stream (without intermediate
+  // memory buffer Buf).
+  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
+  return Error::success();
+}
+
+static Error removeUnneededSections(Object &Obj) {
+  // We can remove an empty symbol table from non-relocatable objects.
+  // Relocatable objects typically have relocation sections whose
+  // sh_link field points to .symtab, so we can't remove .symtab
+  // even if it is empty.
+  if (Obj.isRelocatable() || Obj.SymbolTable == nullptr ||
+      !Obj.SymbolTable->empty())
+    return Error::success();
+
+  // .strtab can be used for section names. In such a case we shouldn't
+  // remove it.
+  auto *StrTab = Obj.SymbolTable->getStrTab() == Obj.SectionNames
+                     ? nullptr
+                     : Obj.SymbolTable->getStrTab();
+  return Obj.removeSections(false, [&](const SectionBase &Sec) {
+    return &Sec == Obj.SymbolTable || &Sec == StrTab;
+  });
+}
+template <class ELFT> Error ELFWriter<ELFT>::finalize() {
+  // It could happen that SectionNames has been removed and yet the user wants
+  // a section header table output. We need to throw an error if a user tries
+  // to do that.
+  if (Obj.SectionNames == nullptr && WriteSectionHeaders)
+    return createStringError(llvm::errc::invalid_argument,
+                             "cannot write section header table because "
+                             "section header string table was removed");
+
+  if (Error E = removeUnneededSections(Obj))
+    return E;
+
+  // We need to assign indexes before we perform layout because we need to
+  // know if we need large indexes or not. We can assign indexes first and
+  // check as we go to see if we will actually need large indexes.
+  bool NeedsLargeIndexes = false;
+  if (Obj.sections().size() >= SHN_LORESERVE) {
+    SectionTableRef Sections = Obj.sections();
+    // Sections doesn't include the null section header, so account for this
+    // when skipping the first N sections.
+    NeedsLargeIndexes =
+        any_of(drop_begin(Sections, SHN_LORESERVE - 1),
+               [](const SectionBase &Sec) { return Sec.HasSymbol; });
+    // TODO: handle case where only one section needs the large index table
+    // but only needs it because the large index table hasn't been removed
+    // yet.
+  }
+
+  if (NeedsLargeIndexes) {
+    // This means we definitely need to have a section index table, but if we
+    // already have one then we should use it instead of making a new one.
+    if (Obj.SymbolTable != nullptr && Obj.SectionIndexTable == nullptr) {
+      // Addition of a section to the end does not invalidate the indexes of
+      // other sections and assigns the correct index to the new section.
+      auto &Shndx = Obj.addSection<SectionIndexSection>();
+      Obj.SymbolTable->setShndxTable(&Shndx);
+      Shndx.setSymTab(Obj.SymbolTable);
+    }
+  } else {
+    // Since we don't need SectionIndexTable we should remove it and all
+    // references to it.
+    if (Obj.SectionIndexTable != nullptr) {
+      // We do not support sections referring to the section index table.
+      if (Error E = Obj.removeSections(false /*AllowBrokenLinks*/,
+                                       [this](const SectionBase &Sec) {
+                                         return &Sec == Obj.SectionIndexTable;
+                                       }))
+        return E;
+    }
+  }
+
+  // Make sure we add the names of all the sections. Importantly, this must be
+  // done after we decide to add or remove SectionIndexes.
+  if (Obj.SectionNames != nullptr)
+    for (const SectionBase &Sec : Obj.sections())
+      Obj.SectionNames->addString(Sec.Name);
+
+  initEhdrSegment();
+
+  // Before we can prepare for layout the indexes need to be finalized.
+  // Also, the output arch may not be the same as the input arch, so fix up
+  // size-related fields before doing layout calculations.
+  uint64_t Index = 0;
+  auto SecSizer = std::make_unique<ELFSectionSizer<ELFT>>();
+  for (SectionBase &Sec : Obj.sections()) {
+    Sec.Index = Index++;
+    if (Error Err = Sec.accept(*SecSizer))
+      return Err;
+  }
+
+  // The symbol table does not update all other sections on update. For
+  // instance, symbol names are not added as new symbols are added. This means
+  // that some sections, like .strtab, don't yet have their final size.
+  if (Obj.SymbolTable != nullptr)
+    Obj.SymbolTable->prepareForLayout();
+
+  // Now that all strings are added we want to finalize string table builders,
+  // because that affects section sizes which in turn affects section offsets.
+  for (SectionBase &Sec : Obj.sections())
+    if (auto StrTab = dyn_cast<StringTableSection>(&Sec))
+      StrTab->prepareForLayout();
+
+  assignOffsets();
+
+  // layoutSections could have modified section indexes, so we need
+  // to fill the index table after assignOffsets.
+  if (Obj.SymbolTable != nullptr)
+    Obj.SymbolTable->fillShndxTable();
+
+  // Finally, now that all offsets and indexes have been set, we can finalize
+  // any remaining issues.
+  uint64_t Offset = Obj.SHOff + sizeof(Elf_Shdr);
+  for (SectionBase &Sec : Obj.sections()) {
+    Sec.HeaderOffset = Offset;
+    Offset += sizeof(Elf_Shdr);
+    if (WriteSectionHeaders)
+      Sec.NameIndex = Obj.SectionNames->findIndex(Sec.Name);
+    Sec.finalize();
+  }
+
+  size_t TotalSize = totalSize();
+  Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
+  if (!Buf)
+    return createStringError(errc::not_enough_memory,
+                             "failed to allocate memory buffer of " +
+                                 Twine::utohexstr(TotalSize) + " bytes");
+
+  SecWriter = std::make_unique<ELFSectionWriter<ELFT>>(*Buf);
+  return Error::success();
+}
+
+Error BinaryWriter::write() {
+  for (const SectionBase &Sec : Obj.allocSections())
+    if (Error Err = Sec.accept(*SecWriter))
+      return Err;
+
+  // TODO: Implement direct writing to the output stream (without intermediate
+  // memory buffer Buf).
+  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
+  return Error::success();
+}
+
+Error BinaryWriter::finalize() {
+  // Compute the section LMA based on its sh_offset and the containing
+  // segment's p_offset and p_paddr. Also compute the minimum LMA of all
+  // non-empty sections as MinAddr. In the output, the contents between
+  // address 0 and MinAddr will be skipped.
+  uint64_t MinAddr = UINT64_MAX;
+  for (SectionBase &Sec : Obj.allocSections()) {
+    if (Sec.ParentSegment != nullptr)
+      Sec.Addr =
+          Sec.Offset - Sec.ParentSegment->Offset + Sec.ParentSegment->PAddr;
+    if (Sec.Type != SHT_NOBITS && Sec.Size > 0)
+      MinAddr = std::min(MinAddr, Sec.Addr);
+  }
+
+  // Now that every section has been laid out we just need to compute the
+  // total file size. This might not be the same as the offset returned by
+  // layoutSections, because we want to truncate the last segment to the end
+  // of its last non-empty section, to match GNU objcopy's behaviour.
+  TotalSize = 0;
+  for (SectionBase &Sec : Obj.allocSections())
+    if (Sec.Type != SHT_NOBITS && Sec.Size > 0) {
+      Sec.Offset = Sec.Addr - MinAddr;
+      TotalSize = std::max(TotalSize, Sec.Offset + Sec.Size);
+    }
+
+  Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
+  if (!Buf)
+    return createStringError(errc::not_enough_memory,
+                             "failed to allocate memory buffer of " +
+                                 Twine::utohexstr(TotalSize) + " bytes");
+  SecWriter = std::make_unique<BinarySectionWriter>(*Buf);
+  return Error::success();
+}
+
+bool IHexWriter::SectionCompare::operator()(const SectionBase *Lhs,
+                                            const SectionBase *Rhs) const {
+  return (sectionPhysicalAddr(Lhs) & 0xFFFFFFFFU) <
+         (sectionPhysicalAddr(Rhs) & 0xFFFFFFFFU);
+}
+
+uint64_t IHexWriter::writeEntryPointRecord(uint8_t *Buf) {
+  IHexLineData HexData;
+  uint8_t Data[4] = {};
+  // We don't write an entry point record if the entry is zero.
+  if (Obj.Entry == 0)
+    return 0;
+
+  if (Obj.Entry <= 0xFFFFFU) {
+    Data[0] = ((Obj.Entry & 0xF0000U) >> 12) & 0xFF;
+    support::endian::write(&Data[2], static_cast<uint16_t>(Obj.Entry),
+                           support::big);
+    HexData = IHexRecord::getLine(IHexRecord::StartAddr80x86, 0, Data);
+  } else {
+    support::endian::write(Data, static_cast<uint32_t>(Obj.Entry),
+                           support::big);
+    HexData = IHexRecord::getLine(IHexRecord::StartAddr, 0, Data);
+  }
+  memcpy(Buf, HexData.data(), HexData.size());
+  return HexData.size();
+}
+
+uint64_t IHexWriter::writeEndOfFileRecord(uint8_t *Buf) {
+  IHexLineData HexData = IHexRecord::getLine(IHexRecord::EndOfFile, 0, {});
+  memcpy(Buf, HexData.data(), HexData.size());
+  return HexData.size();
+}
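+// A worked example of the records emitted above (standard Intel HEX format,
+// illustrative numbers, not specific to this change): the end-of-file record
+// is always the 11-character line ":00000001FF"; its checksum is the two's
+// complement of the summed record bytes, 0x00 + 0x00 + 0x00 + 0x01 = 0x01,
+// and 0x100 - 0x01 = 0xFF. Likewise, a hypothetical entry point 0x00401000
+// (> 0xFFFFF) produces the type 05 record ":0400000500401000A7", since
+// 0x04 + 0x00 + 0x00 + 0x05 + 0x00 + 0x40 + 0x10 + 0x00 = 0x59 and
+// 0x100 - 0x59 = 0xA7. Both line lengths match IHexRecord::getLength:
+// 11 for zero data bytes, 11 + 2*4 = 19 for four data bytes.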
+Error IHexWriter::write() {
+  IHexSectionWriter Writer(*Buf);
+  // Write sections.
+  for (const SectionBase *Sec : Sections)
+    if (Error Err = Sec->accept(Writer))
+      return Err;
+
+  uint64_t Offset = Writer.getBufferOffset();
+  // Write entry point address.
+  Offset += writeEntryPointRecord(
+      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + Offset);
+  // Write EOF.
+  Offset += writeEndOfFileRecord(
+      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + Offset);
+  assert(Offset == TotalSize);
+
+  // TODO: Implement direct writing to the output stream (without intermediate
+  // memory buffer Buf).
+  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
+  return Error::success();
+}
+
+Error IHexWriter::checkSection(const SectionBase &Sec) {
+  uint64_t Addr = sectionPhysicalAddr(&Sec);
+  if (addressOverflows32bit(Addr) ||
+      addressOverflows32bit(Addr + Sec.Size - 1))
+    return createStringError(
+        errc::invalid_argument,
+        "Section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
+        Sec.Name.c_str(), Addr, Addr + Sec.Size - 1);
+  return Error::success();
+}
+
+Error IHexWriter::finalize() {
+  // We can't write 64-bit addresses.
+  if (addressOverflows32bit(Obj.Entry))
+    return createStringError(errc::invalid_argument,
+                             "Entry point address 0x%llx overflows 32 bits",
+                             Obj.Entry);
+
+  for (const SectionBase &Sec : Obj.sections())
+    if ((Sec.Flags & ELF::SHF_ALLOC) && Sec.Type != ELF::SHT_NOBITS &&
+        Sec.Size > 0) {
+      if (Error E = checkSection(Sec))
+        return E;
+      Sections.insert(&Sec);
+    }
+
+  std::unique_ptr<WritableMemoryBuffer> EmptyBuffer =
+      WritableMemoryBuffer::getNewMemBuffer(0);
+  if (!EmptyBuffer)
+    return createStringError(errc::not_enough_memory,
+                             "failed to allocate memory buffer of 0 bytes");
+
+  IHexSectionWriterBase LengthCalc(*EmptyBuffer);
+  for (const SectionBase *Sec : Sections)
+    if (Error Err = Sec->accept(LengthCalc))
+      return Err;
+
+  // We need space to write section records + StartAddress record
+  // (if the start address is not zero) + EndOfFile record.
+  TotalSize = LengthCalc.getBufferOffset() +
+              (Obj.Entry ? IHexRecord::getLineLength(4) : 0) +
+              IHexRecord::getLineLength(0);
+
+  Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
+  if (!Buf)
+    return createStringError(errc::not_enough_memory,
+                             "failed to allocate memory buffer of " +
+                                 Twine::utohexstr(TotalSize) + " bytes");
+
+  return Error::success();
+}
+
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
+template class ELFBuilder<ELF64LE>;
+template class ELFBuilder<ELF64BE>;
+template class ELFBuilder<ELF32LE>;
+template class ELFBuilder<ELF32BE>;
+
+template class ELFWriter<ELF64LE>;
+template class ELFWriter<ELF64BE>;
+template class ELFWriter<ELF32LE>;
+template class ELFWriter<ELF32BE>;
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
new file mode 100644
index 000000000000..f33bbb029c9b
--- /dev/null
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -0,0 +1,1108 @@
+//===- ELFObject.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H
+#define LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace llvm {
+enum class DebugCompressionType;
+namespace objcopy {
+namespace elf {
+
+class SectionBase;
+class Section;
+class OwnedDataSection;
+class StringTableSection;
+class SymbolTableSection;
+class RelocationSection;
+class DynamicRelocationSection;
+class GnuDebugLinkSection;
+class GroupSection;
+class SectionIndexSection;
+class CompressedSection;
+class DecompressedSection;
+class Segment;
+class Object;
+struct Symbol;
+
+class SectionTableRef {
+  ArrayRef<std::unique_ptr<SectionBase>> Sections;
+
+public:
+  using iterator = pointee_iterator<std::unique_ptr<SectionBase> *>;
+
+  explicit SectionTableRef(ArrayRef<std::unique_ptr<SectionBase>> Secs)
+      : Sections(Secs) {}
+  SectionTableRef(const SectionTableRef &) = default;
+
+  iterator begin() const { return iterator(Sections.data()); }
+  iterator end() const { return iterator(Sections.data() + Sections.size()); }
+  size_t size() const { return Sections.size(); }
+
+  Expected<SectionBase *> getSection(uint32_t Index, Twine ErrMsg);
+
+  template <class T>
+  Expected<T *> getSectionOfType(uint32_t Index, Twine IndexErrMsg,
+                                 Twine TypeErrMsg);
+};
+
+enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE };
+
+class SectionVisitor {
+public:
+  virtual ~SectionVisitor() = default;
+
+  virtual Error visit(const Section &Sec) = 0;
+  virtual Error visit(const OwnedDataSection &Sec) = 0;
+  virtual Error visit(const StringTableSection &Sec) = 0;
+  virtual Error visit(const SymbolTableSection &Sec) = 0;
+  virtual Error visit(const RelocationSection &Sec) = 0;
+  virtual Error visit(const DynamicRelocationSection &Sec) = 0;
+  virtual Error visit(const GnuDebugLinkSection &Sec) = 0;
+  virtual Error visit(const GroupSection &Sec) = 0;
+  virtual Error visit(const SectionIndexSection &Sec) = 0;
+  virtual Error visit(const CompressedSection &Sec) = 0;
+  virtual Error visit(const DecompressedSection &Sec) = 0;
+};
+
+class MutableSectionVisitor {
+public:
+  virtual ~MutableSectionVisitor() = default;
+
+  virtual Error visit(Section &Sec) = 0;
+  virtual Error visit(OwnedDataSection &Sec) = 0;
+  virtual Error visit(StringTableSection &Sec) = 0;
+  virtual Error visit(SymbolTableSection &Sec) = 0;
+  virtual Error visit(RelocationSection &Sec) = 0;
+  virtual Error visit(DynamicRelocationSection &Sec) = 0;
+  virtual Error visit(GnuDebugLinkSection &Sec) = 0;
+  virtual Error visit(GroupSection &Sec) = 0;
+  virtual Error visit(SectionIndexSection &Sec) = 0;
+  virtual Error visit(CompressedSection &Sec) = 0;
+  virtual Error visit(DecompressedSection &Sec) = 0;
+};
+
+class SectionWriter : public SectionVisitor {
+protected:
+  WritableMemoryBuffer &Out;
+
+public:
+  virtual ~SectionWriter() = default;
+
+  Error visit(const Section &Sec) override;
+  Error visit(const OwnedDataSection &Sec) override;
+  Error visit(const StringTableSection &Sec) override;
+  Error visit(const DynamicRelocationSection &Sec) override;
+  virtual Error visit(const SymbolTableSection &Sec) override = 0;
+  virtual Error visit(const RelocationSection &Sec) override = 0;
+  virtual Error visit(const GnuDebugLinkSection &Sec) override = 0;
+  virtual Error visit(const GroupSection &Sec) override = 0;
+  virtual Error visit(const SectionIndexSection &Sec) override = 0;
+  virtual Error visit(const CompressedSection &Sec) override = 0;
+  virtual Error visit(const DecompressedSection &Sec) override = 0;
+
+  explicit SectionWriter(WritableMemoryBuffer &Buf) : Out(Buf) {}
+};
+
+template <class ELFT> class ELFSectionWriter : public SectionWriter {
+private:
+  using Elf_Word = typename ELFT::Word;
+  using Elf_Rel = typename ELFT::Rel;
+  using Elf_Rela = typename ELFT::Rela;
+  using Elf_Sym = typename ELFT::Sym;
+
+public:
+  virtual ~ELFSectionWriter() {}
+  Error visit(const SymbolTableSection &Sec) override;
+  Error visit(const RelocationSection &Sec) override;
+  Error visit(const GnuDebugLinkSection &Sec) override;
+  Error visit(const GroupSection &Sec) override;
+  Error visit(const SectionIndexSection &Sec) override;
+  Error visit(const CompressedSection &Sec) override;
+  Error visit(const DecompressedSection &Sec) override;
+
+  explicit ELFSectionWriter(WritableMemoryBuffer &Buf) : SectionWriter(Buf) {}
+};
+
+template <class ELFT> class ELFSectionSizer : public MutableSectionVisitor {
+private:
+  using Elf_Rel = typename ELFT::Rel;
+  using Elf_Rela = typename ELFT::Rela;
+  using Elf_Sym = typename ELFT::Sym;
+  using Elf_Word = typename ELFT::Word;
+  using Elf_Xword = typename ELFT::Xword;
+
+public:
+  Error visit(Section &Sec) override;
+  Error visit(OwnedDataSection &Sec) override;
+  Error visit(StringTableSection &Sec) override;
+  Error visit(DynamicRelocationSection &Sec) override;
+  Error visit(SymbolTableSection &Sec) override;
+  Error visit(RelocationSection &Sec) override;
+  Error visit(GnuDebugLinkSection &Sec) override;
+  Error visit(GroupSection &Sec) override;
+  Error visit(SectionIndexSection &Sec) override;
+  Error visit(CompressedSection &Sec) override;
+  Error visit(DecompressedSection &Sec) override;
+};
+
+#define MAKE_SEC_WRITER_FRIEND                                                 \
+  friend class SectionWriter;                                                  \
+  friend class IHexSectionWriterBase;                                          \
+  friend class IHexSectionWriter;                                              \
+  template <class ELFT> friend class ELFSectionWriter;                         \
+  template <class ELFT> friend class ELFSectionSizer;
+
+class BinarySectionWriter : public SectionWriter {
+public:
+  virtual ~BinarySectionWriter() {}
+
+  Error visit(const SymbolTableSection &Sec) override;
+  Error visit(const RelocationSection &Sec) override;
+  Error visit(const GnuDebugLinkSection &Sec) override;
+  Error visit(const GroupSection &Sec) override;
+  Error visit(const SectionIndexSection &Sec) override;
+  Error visit(const CompressedSection &Sec) override;
+  Error visit(const DecompressedSection &Sec) override;
+
+  explicit BinarySectionWriter(WritableMemoryBuffer &Buf)
+      : SectionWriter(Buf) {}
+};
+
+using IHexLineData = SmallVector<char, 64>;
+
+struct IHexRecord {
+  // Memory address of the record.
+  uint16_t Addr;
+  // Record type (see below).
+  uint16_t Type;
+  // Record data in hexadecimal form.
+  StringRef HexData;
+
+  // Helper method to get the file length of the record, excluding the line
+  // terminator (getLineLength adds the CRLF):
+  // ':' + LL + AAAA + TT + 2*DataSize + CC = 11 + 2*DataSize characters.
+  static size_t getLength(size_t DataSize) {
+    // :LLAAAATT[DD...DD]CC
+    return DataSize * 2 + 11;
+  }
+
+  // Gets length of line in a file (getLength + CRLF).
+  static size_t getLineLength(size_t DataSize) {
+    return getLength(DataSize) + 2;
+  }
+
+  // Given type, address and data, returns a line which can
+  // be written to the output file.
+  static IHexLineData getLine(uint8_t Type, uint16_t Addr,
+                              ArrayRef<uint8_t> Data);
+
+  // Parses the line and returns a record if possible.
+  // Line should be trimmed of whitespace characters.
+  static Expected<IHexRecord> parse(StringRef Line);
+
+  // Calculates the checksum of the stringified record representation.
+  // S must NOT contain the leading ':' or trailing whitespace
+  // characters.
+  static uint8_t getChecksum(StringRef S);
+
+  enum Type {
+    // Contains data and a 16-bit starting address for the data.
+    // The byte count specifies the number of data bytes in the record.
+    Data = 0,
+    // Must occur exactly once per file in the last line of the file.
+    // The data field is empty (thus the byte count is 00) and the address
+    // field is typically 0000.
+    EndOfFile = 1,
+    // The data field contains a 16-bit segment base address (thus the byte
+    // count is always 02) compatible with 80x86 real mode addressing.
+    // The address field (typically 0000) is ignored. The segment address
+    // from the most recent 02 record is multiplied by 16 and added to each
+    // subsequent data record address to form the physical starting address
+    // for the data. This allows addressing up to one megabyte of address
+    // space.
+    SegmentAddr = 2,
+    // For 80x86 processors, specifies the initial content of the CS:IP
+    // registers. The address field is 0000, the byte count is always 04,
+    // the first two data bytes are the CS value, the latter two are the
+    // IP value.
+    StartAddr80x86 = 3,
+    // Allows for 32 bit addressing (up to 4GiB). The record's address field
+    // is ignored (typically 0000) and its byte count is always 02. The two
+    // data bytes (big endian) specify the upper 16 bits of the 32 bit
+    // absolute address for all subsequent type 00 records.
+    ExtendedAddr = 4,
+    // The address field is 0000 (not used) and the byte count is always 04.
+    // The four data bytes represent a 32-bit address value. In the case of
+    // 80386 and higher CPUs, this address is loaded into the EIP register.
+    StartAddr = 5,
+    // We have no other valid types.
+    InvalidType = 6
+  };
+};
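+// A worked checksum example for the record format above (standard Intel HEX,
+// illustrative only): for the data record ":0300300002337A1E", getChecksum
+// runs over "0300300002337A" and computes the two's complement of the byte
+// sum: 0x03 + 0x00 + 0x30 + 0x00 + 0x02 + 0x33 + 0x7A = 0xE2, and
+// 0x100 - 0xE2 = 0x1E, matching the trailing CC field. A free-standing
+// equivalent (hypothetical helper, not part of this header):
+//
+//   uint8_t checksum(ArrayRef<uint8_t> Bytes) {
+//     uint8_t Sum = 0;
+//     for (uint8_t B : Bytes)
+//       Sum += B;    // mod-256 sum of all record bytes
+//     return -Sum;   // two's complement
+//   }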
+// Base class for IHexSectionWriter. This class implements the writing
+// algorithm, but doesn't actually write records. It is used for output buffer
+// size calculation in IHexWriter::finalize.
+class IHexSectionWriterBase : public BinarySectionWriter {
+  // 20-bit segment address
+  uint32_t SegmentAddr = 0;
+  // Extended linear address
+  uint32_t BaseAddr = 0;
+
+  // Write segment address corresponding to 'Addr'
+  uint64_t writeSegmentAddr(uint64_t Addr);
+  // Write extended linear (base) address corresponding to 'Addr'
+  uint64_t writeBaseAddr(uint64_t Addr);
+
+protected:
+  // Offset in the output buffer
+  uint64_t Offset = 0;
+
+  void writeSection(const SectionBase *Sec, ArrayRef<uint8_t> Data);
+  virtual void writeData(uint8_t Type, uint16_t Addr, ArrayRef<uint8_t> Data);
+
+public:
+  explicit IHexSectionWriterBase(WritableMemoryBuffer &Buf)
+      : BinarySectionWriter(Buf) {}
+
+  uint64_t getBufferOffset() const { return Offset; }
+  Error visit(const Section &Sec) final;
+  Error visit(const OwnedDataSection &Sec) final;
+  Error visit(const StringTableSection &Sec) override;
+  Error visit(const DynamicRelocationSection &Sec) final;
+  using BinarySectionWriter::visit;
+};
+
+// Real IHEX section writer
+class IHexSectionWriter : public IHexSectionWriterBase {
+public:
+  IHexSectionWriter(WritableMemoryBuffer &Buf) : IHexSectionWriterBase(Buf) {}
+
+  void writeData(uint8_t Type, uint16_t Addr, ArrayRef<uint8_t> Data) override;
+  Error visit(const StringTableSection &Sec) override;
+};
+
+class Writer {
+protected:
+  Object &Obj;
+  std::unique_ptr<WritableMemoryBuffer> Buf;
+  raw_ostream &Out;
+
+public:
+  virtual ~Writer();
+  virtual Error finalize() = 0;
+  virtual Error write() = 0;
+
+  Writer(Object &O, raw_ostream &Out) : Obj(O), Out(Out) {}
+};
+
+template <class ELFT> class ELFWriter : public Writer {
+private:
+  using Elf_Addr = typename ELFT::Addr;
+  using Elf_Shdr = typename ELFT::Shdr;
+  using Elf_Phdr = typename ELFT::Phdr;
+  using Elf_Ehdr = typename ELFT::Ehdr;
+
+  void initEhdrSegment();
+
+  void writeEhdr();
+  void writePhdr(const Segment &Seg);
+  void writeShdr(const SectionBase &Sec);
+
+  void writePhdrs();
+  void writeShdrs();
+  Error writeSectionData();
+  void writeSegmentData();
+
+  void assignOffsets();
+
+  std::unique_ptr<ELFSectionWriter<ELFT>> SecWriter;
+
+  size_t totalSize() const;
+
+public:
+  virtual ~ELFWriter() {}
+  bool WriteSectionHeaders;
+
+  // For --only-keep-debug, select an alternative section/segment layout
+  // algorithm.
+  bool OnlyKeepDebug;
+
+  Error finalize() override;
+  Error write() override;
+  ELFWriter(Object &Obj, raw_ostream &Out, bool WSH, bool OnlyKeepDebug);
+};
+
+class BinaryWriter : public Writer {
+private:
+  std::unique_ptr<BinarySectionWriter> SecWriter;
+
+  uint64_t TotalSize = 0;
+
+public:
+  ~BinaryWriter() {}
+  Error finalize() override;
+  Error write() override;
+  BinaryWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {}
+};
+
+class IHexWriter : public Writer {
+  struct SectionCompare {
+    bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const;
+  };
+
+  std::set<const SectionBase *, SectionCompare> Sections;
+  size_t TotalSize = 0;
+
+  Error checkSection(const SectionBase &Sec);
+  uint64_t writeEntryPointRecord(uint8_t *Buf);
+  uint64_t writeEndOfFileRecord(uint8_t *Buf);
+
+public:
+  ~IHexWriter() {}
+  Error finalize() override;
+  Error write() override;
+  IHexWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {}
+};
+
+class SectionBase {
+public:
+  std::string Name;
+  Segment *ParentSegment = nullptr;
+  uint64_t HeaderOffset = 0;
+  uint32_t Index = 0;
+
+  uint32_t OriginalIndex = 0;
+  uint64_t OriginalFlags = 0;
+  uint64_t OriginalType = ELF::SHT_NULL;
+  uint64_t OriginalOffset = std::numeric_limits<uint64_t>::max();
+
+  uint64_t Addr = 0;
+  uint64_t Align = 1;
+  uint32_t EntrySize = 0;
+  uint64_t Flags = 0;
+  uint64_t Info = 0;
+  uint64_t Link = ELF::SHN_UNDEF;
+  uint64_t NameIndex = 0;
+  uint64_t Offset = 0;
+  uint64_t Size = 0;
+  uint64_t Type = ELF::SHT_NULL;
+  ArrayRef<uint8_t> OriginalData;
+  bool HasSymbol = false;
+
+  SectionBase() = default;
+  SectionBase(const SectionBase &) = default;
+
+  virtual ~SectionBase() = default;
+
+  virtual Error initialize(SectionTableRef SecTable);
+  virtual void finalize();
+  // Remove references to these sections. The list of sections must be sorted.
+  virtual Error
+  removeSectionReferences(bool AllowBrokenLinks,
+                          function_ref<bool(const SectionBase *)> ToRemove);
+  virtual Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove);
+  virtual Error accept(SectionVisitor &Visitor) const = 0;
+  virtual Error accept(MutableSectionVisitor &Visitor) = 0;
+  virtual void markSymbols();
+  virtual void
+  replaceSectionReferences(const DenseMap<SectionBase *, SectionBase *> &);
+  virtual bool hasContents() const { return false; }
+  // Notify the section that it is subject to removal.
+  virtual void onRemove();
+};
+
+class Segment {
+private:
+  struct SectionCompare {
+    bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const {
+      // Some sections might have the same address if one of them is empty. To
+      // fix this we can use the lexicographic ordering on ->Addr and the
+      // original index.
+      if (Lhs->OriginalOffset == Rhs->OriginalOffset)
+        return Lhs->OriginalIndex < Rhs->OriginalIndex;
+      return Lhs->OriginalOffset < Rhs->OriginalOffset;
+    }
+  };
+
+public:
+  uint32_t Type = 0;
+  uint32_t Flags = 0;
+  uint64_t Offset = 0;
+  uint64_t VAddr = 0;
+  uint64_t PAddr = 0;
+  uint64_t FileSize = 0;
+  uint64_t MemSize = 0;
+  uint64_t Align = 0;
+
+  uint32_t Index = 0;
+  uint64_t OriginalOffset = 0;
+  Segment *ParentSegment = nullptr;
+  ArrayRef<uint8_t> Contents;
+  std::set<const SectionBase *, SectionCompare> Sections;
+
+  explicit Segment(ArrayRef<uint8_t> Data) : Contents(Data) {}
+  Segment() = default;
+
+  const SectionBase *firstSection() const {
+    if (!Sections.empty())
+      return *Sections.begin();
+    return nullptr;
+  }
+
+  void removeSection(const SectionBase *Sec) { Sections.erase(Sec); }
+  void addSection(const SectionBase *Sec) { Sections.insert(Sec); }
+
+  ArrayRef<uint8_t> getContents() const { return Contents; }
+};
+
+class Section : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+  ArrayRef<uint8_t> Contents;
+  SectionBase *LinkSection = nullptr;
+
+public:
+  explicit Section(ArrayRef<uint8_t> Data) : Contents(Data) {}
+
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
+  Error initialize(SectionTableRef SecTable) override;
+  void finalize() override;
+  bool hasContents() const override {
+    return Type != ELF::SHT_NOBITS && Type != ELF::SHT_NULL;
+  }
+};
+
+class OwnedDataSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+  std::vector<uint8_t> Data;
+
+public:
+  OwnedDataSection(StringRef SecName, ArrayRef<uint8_t> Data)
+      : Data(std::begin(Data), std::end(Data)) {
+    Name = SecName.str();
+    Type = OriginalType = ELF::SHT_PROGBITS;
+    Size = Data.size();
+    OriginalOffset = std::numeric_limits<uint64_t>::max();
+  }
+
+  OwnedDataSection(const Twine &SecName, uint64_t SecAddr, uint64_t SecFlags,
+                   uint64_t SecOff) {
+    Name = SecName.str();
+    Type = OriginalType = ELF::SHT_PROGBITS;
+    Addr = SecAddr;
+    Flags = OriginalFlags = SecFlags;
+    OriginalOffset = SecOff;
+  }
+
+  OwnedDataSection(SectionBase &S, ArrayRef<uint8_t> Data)
+      : SectionBase(S), Data(std::begin(Data), std::end(Data)) {
+    Size = Data.size();
+  }
+
+  void appendHexData(StringRef HexData);
+  Error accept(SectionVisitor &Sec) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  bool hasContents() const override { return true; }
+};
+
+class CompressedSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+  DebugCompressionType CompressionType;
+  uint64_t DecompressedSize;
+  uint64_t DecompressedAlign;
+  SmallVector<char, 128> CompressedData;
+
+public:
+  CompressedSection(const SectionBase &Sec,
+                    DebugCompressionType CompressionType);
+  CompressedSection(ArrayRef<uint8_t> CompressedData,
+                    uint64_t DecompressedSize, uint64_t DecompressedAlign);
+
+  uint64_t getDecompressedSize() const { return DecompressedSize; }
+  uint64_t getDecompressedAlign() const { return DecompressedAlign; }
+
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalFlags & ELF::SHF_COMPRESSED;
+  }
+};
+
+class DecompressedSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+public:
+  explicit DecompressedSection(const CompressedSection &Sec)
+      : SectionBase(Sec) {
+    Size = Sec.getDecompressedSize();
+    Align = Sec.getDecompressedAlign();
+    Flags = OriginalFlags = (Flags & ~ELF::SHF_COMPRESSED);
+  }
+
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+};
+
+// There are two types of string tables that can exist, dynamic and not
+// dynamic. In the dynamic case the string table is allocated. Changing a
+// dynamic string table would mean altering virtual addresses and thus the
+// memory image. So dynamic string tables should not have an interface to
+// modify them or reconstruct them. This type lets us reconstruct a string
+// table. To avoid this class being used for dynamic string tables (which has
+// happened) the classof method checks that the particular instance is not
+// allocated. This then agrees with the makeSection method used to construct
+// most sections.
+class StringTableSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+  StringTableBuilder StrTabBuilder;
+
+public:
+  StringTableSection() : StrTabBuilder(StringTableBuilder::ELF) {
+    Type = OriginalType = ELF::SHT_STRTAB;
+  }
+
+  void addString(StringRef Name);
+  uint32_t findIndex(StringRef Name) const;
+  void prepareForLayout();
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+
+  static bool classof(const SectionBase *S) {
+    if (S->OriginalFlags & ELF::SHF_ALLOC)
+      return false;
+    return S->OriginalType == ELF::SHT_STRTAB;
+  }
+};
+
+// Symbols have a st_shndx field that normally stores an index but
+// occasionally stores a different special value. This enum keeps track of
+// what the st_shndx field means. Most of the values are just copies of the
+// special SHN_* values. SYMBOL_SIMPLE_INDEX means that the st_shndx is just
+// an index of a section.
+enum SymbolShndxType {
+  SYMBOL_SIMPLE_INDEX = 0,
+  SYMBOL_ABS = ELF::SHN_ABS,
+  SYMBOL_COMMON = ELF::SHN_COMMON,
+  SYMBOL_LOPROC = ELF::SHN_LOPROC,
+  SYMBOL_AMDGPU_LDS = ELF::SHN_AMDGPU_LDS,
+  SYMBOL_HEXAGON_SCOMMON = ELF::SHN_HEXAGON_SCOMMON,
+  SYMBOL_HEXAGON_SCOMMON_2 = ELF::SHN_HEXAGON_SCOMMON_2,
+  SYMBOL_HEXAGON_SCOMMON_4 = ELF::SHN_HEXAGON_SCOMMON_4,
+  SYMBOL_HEXAGON_SCOMMON_8 = ELF::SHN_HEXAGON_SCOMMON_8,
+  SYMBOL_MIPS_ACOMMON = ELF::SHN_MIPS_ACOMMON,
+  SYMBOL_MIPS_TEXT = ELF::SHN_MIPS_TEXT,
+  SYMBOL_MIPS_DATA = ELF::SHN_MIPS_DATA,
+  SYMBOL_MIPS_SCOMMON = ELF::SHN_MIPS_SCOMMON,
+  SYMBOL_MIPS_SUNDEFINED = ELF::SHN_MIPS_SUNDEFINED,
+  SYMBOL_HIPROC = ELF::SHN_HIPROC,
+  SYMBOL_LOOS = ELF::SHN_LOOS,
+  SYMBOL_HIOS = ELF::SHN_HIOS,
+  SYMBOL_XINDEX = ELF::SHN_XINDEX,
+};
+
+struct Symbol {
+  uint8_t Binding;
+  SectionBase *DefinedIn = nullptr;
+  SymbolShndxType ShndxType;
+  uint32_t Index;
+  std::string Name;
+  uint32_t NameIndex;
+  uint64_t Size;
+  uint8_t Type;
+  uint64_t Value;
+  uint8_t Visibility;
+  bool Referenced = false;
+
+  uint16_t getShndx() const;
+  bool isCommon() const;
+};
+
+class SectionIndexSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+private:
+  std::vector<uint32_t> Indexes;
+  SymbolTableSection *Symbols = nullptr;
+
+public:
+  virtual ~SectionIndexSection() {}
+  void addIndex(uint32_t Index) {
+    assert(Size > 0);
+    Indexes.push_back(Index);
+  }
+
+  void reserve(size_t NumSymbols) {
+    Indexes.reserve(NumSymbols);
+    Size = NumSymbols * 4;
+  }
+  void setSymTab(SymbolTableSection *SymTab) { Symbols = SymTab; }
+  Error initialize(SectionTableRef SecTable) override;
+  void finalize() override;
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+
+  SectionIndexSection() {
+    Name = ".symtab_shndx";
+    Align = 4;
+    EntrySize = 4;
+    Type = OriginalType = ELF::SHT_SYMTAB_SHNDX;
+  }
+};
+class SymbolTableSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+
+  void setStrTab(StringTableSection *StrTab) { SymbolNames = StrTab; }
+  void assignIndices();
+
+protected:
+  std::vector<std::unique_ptr<Symbol>> Symbols;
+  StringTableSection *SymbolNames = nullptr;
+  SectionIndexSection *SectionIndexTable = nullptr;
+
+  using SymPtr = std::unique_ptr<Symbol>;
+
+public:
+  SymbolTableSection() { Type = OriginalType = ELF::SHT_SYMTAB; }
+
+  void addSymbol(Twine Name, uint8_t Bind, uint8_t Type,
+                 SectionBase *DefinedIn, uint64_t Value, uint8_t Visibility,
+                 uint16_t Shndx, uint64_t SymbolSize);
+  void prepareForLayout();
+  // An 'empty' symbol table still contains a null symbol.
+  bool empty() const { return Symbols.size() == 1; }
+  void setShndxTable(SectionIndexSection *ShndxTable) {
+    SectionIndexTable = ShndxTable;
+  }
+  const SectionIndexSection *getShndxTable() const {
+    return SectionIndexTable;
+  }
+  void fillShndxTable();
+  const SectionBase *getStrTab() const { return SymbolNames; }
+  Expected<const Symbol *> getSymbolByIndex(uint32_t Index) const;
+  Expected<Symbol *> getSymbolByIndex(uint32_t Index);
+  void updateSymbols(function_ref<void(Symbol &)> Callable);
+
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
+  Error initialize(SectionTableRef SecTable) override;
+  void finalize() override;
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
+  void replaceSectionReferences(
+      const DenseMap<SectionBase *, SectionBase *> &FromTo) override;
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalType == ELF::SHT_SYMTAB;
+  }
+};
+
+struct Relocation {
+  Symbol *RelocSymbol = nullptr;
+  uint64_t Offset;
+  uint64_t Addend;
+  uint32_t Type;
+};
+
+// All relocation sections denote relocations to apply to another section.
+// However, some relocation sections use a dynamic symbol table and others use
+// a regular symbol table. Because the types of the two symbol tables differ
+// in our system (because they should behave differently) we can't uniformly
+// represent all relocations with the same base class if we expose an
+// interface that mentions the symbol table type. So we split the two base
+// types into two different classes, one which handles the section the
+// relocation is applied to and another which handles the symbol table type.
+// The symbol table type is taken as a type parameter to the class (see
+// RelocSectionWithSymtabBase).
+class RelocationSectionBase : public SectionBase {
+protected:
+  SectionBase *SecToApplyRel = nullptr;
+
+public:
+  const SectionBase *getSection() const { return SecToApplyRel; }
+  void setSection(SectionBase *Sec) { SecToApplyRel = Sec; }
+
+  StringRef getNamePrefix() const;
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA;
+  }
+};
+
+// Takes the symbol table type to use as a parameter so that we can
+// deduplicate that code between the two symbol table types.
+template <class SymTabType>
+class RelocSectionWithSymtabBase : public RelocationSectionBase {
+  void setSymTab(SymTabType *SymTab) { Symbols = SymTab; }
+
+protected:
+  RelocSectionWithSymtabBase() = default;
+
+  SymTabType *Symbols = nullptr;
+
+public:
+  Error initialize(SectionTableRef SecTable) override;
+  void finalize() override;
+};
+
+class RelocationSection
+    : public RelocSectionWithSymtabBase<SymbolTableSection> {
+  MAKE_SEC_WRITER_FRIEND
+
+  std::vector<Relocation> Relocations;
+  const Object &Obj;
+
+public:
+  RelocationSection(const Object &O) : Obj(O) {}
+  void addRelocation(Relocation Rel) { Relocations.push_back(Rel); }
+  Error accept(SectionVisitor &Visitor) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
+  Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
+  void markSymbols() override;
+  void replaceSectionReferences(
+      const DenseMap<SectionBase *, SectionBase *> &FromTo) override;
+  const Object &getObject() const { return Obj; }
+
+  static bool classof(const SectionBase *S) {
+    if (S->OriginalFlags & ELF::SHF_ALLOC)
+      return false;
+    return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA;
+  }
+};
+
+// TODO: The way stripping and groups interact is complicated
+// and still needs to be worked on.
+
+class GroupSection : public SectionBase {
+  MAKE_SEC_WRITER_FRIEND
+  const SymbolTableSection *SymTab = nullptr;
+  Symbol *Sym = nullptr;
+  ELF::Elf32_Word FlagWord;
+  SmallVector<SectionBase *, 3> GroupMembers;
+
+public:
+  // TODO: Contents is present in several classes of the hierarchy.
+  // This needs to be refactored to avoid duplication.
+  ArrayRef<uint8_t> Contents;
+
+  explicit GroupSection(ArrayRef<uint8_t> Data) : Contents(Data) {}
+
+  void setSymTab(const SymbolTableSection *SymTabSec) { SymTab = SymTabSec; }
+  void setSymbol(Symbol *S) { Sym = S; }
+  void setFlagWord(ELF::Elf32_Word W) { FlagWord = W; }
+  void addMember(SectionBase *Sec) { GroupMembers.push_back(Sec); }
+
+  Error accept(SectionVisitor &) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  void finalize() override;
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
+  Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
+  void markSymbols() override;
+  void replaceSectionReferences(
+      const DenseMap<SectionBase *, SectionBase *> &FromTo) override;
+  void onRemove() override;
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalType == ELF::SHT_GROUP;
+  }
+};
+
+class DynamicSymbolTableSection : public Section {
+public:
+  explicit DynamicSymbolTableSection(ArrayRef<uint8_t> Data) : Section(Data) {}
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalType == ELF::SHT_DYNSYM;
+  }
+};
+
+class DynamicSection : public Section {
+public:
+  explicit DynamicSection(ArrayRef<uint8_t> Data) : Section(Data) {}
+
+  static bool classof(const SectionBase *S) {
+    return S->OriginalType == ELF::SHT_DYNAMIC;
+  }
+};
+
+class DynamicRelocationSection
+    : public RelocSectionWithSymtabBase<DynamicSymbolTableSection> {
+  MAKE_SEC_WRITER_FRIEND
+
+private:
+  ArrayRef<uint8_t> Contents;
+
+public:
+  explicit DynamicRelocationSection(ArrayRef<uint8_t> Data) : Contents(Data) {}
+
+  Error accept(SectionVisitor &) const override;
+  Error accept(MutableSectionVisitor &Visitor) override;
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
+
+  static bool classof(const SectionBase *S) {
+    if (!(S->OriginalFlags & ELF::SHF_ALLOC))
+      return false;
+    return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA;
+  }
+};
GnuDebugLinkSection : public SectionBase { + MAKE_SEC_WRITER_FRIEND + +private: + StringRef FileName; + uint32_t CRC32; + + void init(StringRef File); + +public: + // If we add this section from an external source we can use this ctor. + explicit GnuDebugLinkSection(StringRef File, uint32_t PrecomputedCRC); + Error accept(SectionVisitor &Visitor) const override; + Error accept(MutableSectionVisitor &Visitor) override; +}; + +class Reader { +public: + virtual ~Reader(); + virtual Expected> create(bool EnsureSymtab) const = 0; +}; + +using object::Binary; +using object::ELFFile; +using object::ELFObjectFile; +using object::OwningBinary; + +class BasicELFBuilder { +protected: + std::unique_ptr Obj; + + void initFileHeader(); + void initHeaderSegment(); + StringTableSection *addStrTab(); + SymbolTableSection *addSymTab(StringTableSection *StrTab); + Error initSections(); + +public: + BasicELFBuilder() : Obj(std::make_unique()) {} +}; + +class BinaryELFBuilder : public BasicELFBuilder { + MemoryBuffer *MemBuf; + uint8_t NewSymbolVisibility; + void addData(SymbolTableSection *SymTab); + +public: + BinaryELFBuilder(MemoryBuffer *MB, uint8_t NewSymbolVisibility) + : MemBuf(MB), NewSymbolVisibility(NewSymbolVisibility) {} + + Expected> build(); +}; + +class IHexELFBuilder : public BasicELFBuilder { + const std::vector &Records; + + void addDataSections(); + +public: + IHexELFBuilder(const std::vector &Records) : Records(Records) {} + + Expected> build(); +}; + +template class ELFBuilder { +private: + using Elf_Addr = typename ELFT::Addr; + using Elf_Shdr = typename ELFT::Shdr; + using Elf_Word = typename ELFT::Word; + + const ELFFile &ElfFile; + Object &Obj; + size_t EhdrOffset = 0; + Optional ExtractPartition; + + void setParentSegment(Segment &Child); + Error readProgramHeaders(const ELFFile &HeadersFile); + Error initGroupSection(GroupSection *GroupSec); + Error initSymbolTable(SymbolTableSection *SymTab); + Error readSectionHeaders(); + Error readSections(bool EnsureSymtab); + Error findEhdrOffset(); + Expected makeSection(const Elf_Shdr &Shdr); + +public: + ELFBuilder(const ELFObjectFile &ElfObj, Object &Obj, + Optional ExtractPartition); + + Error build(bool EnsureSymtab); +}; + +class BinaryReader : public Reader { + MemoryBuffer *MemBuf; + uint8_t NewSymbolVisibility; + +public: + BinaryReader(MemoryBuffer *MB, const uint8_t NewSymbolVisibility) + : MemBuf(MB), NewSymbolVisibility(NewSymbolVisibility) {} + Expected> create(bool EnsureSymtab) const override; +}; + +class IHexReader : public Reader { + MemoryBuffer *MemBuf; + + Expected> parse() const; + Error parseError(size_t LineNo, Error E) const { + return LineNo == -1U + ? 
createFileError(MemBuf->getBufferIdentifier(), std::move(E)) + : createFileError(MemBuf->getBufferIdentifier(), LineNo, + std::move(E)); + } + template + Error parseError(size_t LineNo, char const *Fmt, const Ts &...Vals) const { + Error E = createStringError(errc::invalid_argument, Fmt, Vals...); + return parseError(LineNo, std::move(E)); + } + +public: + IHexReader(MemoryBuffer *MB) : MemBuf(MB) {} + + Expected> create(bool EnsureSymtab) const override; +}; + +class ELFReader : public Reader { + Binary *Bin; + Optional ExtractPartition; + +public: + Expected> create(bool EnsureSymtab) const override; + explicit ELFReader(Binary *B, Optional ExtractPartition) + : Bin(B), ExtractPartition(ExtractPartition) {} +}; + +class Object { +private: + using SecPtr = std::unique_ptr; + using SegPtr = std::unique_ptr; + + std::vector Sections; + std::vector Segments; + std::vector RemovedSections; + DenseMap> UpdatedSections; + + static bool sectionIsAlloc(const SectionBase &Sec) { + return Sec.Flags & ELF::SHF_ALLOC; + }; + +public: + template + using ConstRange = iterator_range>::const_iterator>>; + + // It is often the case that the ELF header and the program header table are + // not present in any segment. This could be a problem during file layout, + // because other segments may get assigned an offset where either of the + // two should reside, which will effectively corrupt the resulting binary. + // Other than that we use these segments to track program header offsets + // when they may not follow the ELF header. + Segment ElfHdrSegment; + Segment ProgramHdrSegment; + + uint8_t OSABI; + uint8_t ABIVersion; + uint64_t Entry; + uint64_t SHOff; + uint32_t Type; + uint32_t Machine; + uint32_t Version; + uint32_t Flags; + + bool HadShdrs = true; + bool MustBeRelocatable = false; + StringTableSection *SectionNames = nullptr; + SymbolTableSection *SymbolTable = nullptr; + SectionIndexSection *SectionIndexTable = nullptr; + + bool IsMips64EL = false; + + SectionTableRef sections() const { return SectionTableRef(Sections); } + iterator_range< + filter_iterator::const_iterator>, + decltype(§ionIsAlloc)>> + allocSections() const { + return make_filter_range(make_pointee_range(Sections), sectionIsAlloc); + } + + const auto &getUpdatedSections() const { return UpdatedSections; } + Error updateSection(StringRef Name, ArrayRef Data); + + SectionBase *findSection(StringRef Name) { + auto SecIt = + find_if(Sections, [&](const SecPtr &Sec) { return Sec->Name == Name; }); + return SecIt == Sections.end() ? 
nullptr : SecIt->get(); + } + SectionTableRef removedSections() { return SectionTableRef(RemovedSections); } + + ConstRange segments() const { return make_pointee_range(Segments); } + + Error removeSections(bool AllowBrokenLinks, + std::function ToRemove); + Error replaceSections(const DenseMap &FromTo); + Error removeSymbols(function_ref ToRemove); + template T &addSection(Ts &&...Args) { + auto Sec = std::make_unique(std::forward(Args)...); + auto Ptr = Sec.get(); + MustBeRelocatable |= isa(*Ptr); + Sections.emplace_back(std::move(Sec)); + Ptr->Index = Sections.size(); + return *Ptr; + } + Error addNewSymbolTable(); + Segment &addSegment(ArrayRef Data) { + Segments.emplace_back(std::make_unique(Data)); + return *Segments.back(); + } + bool isRelocatable() const { + return (Type != ELF::ET_DYN && Type != ELF::ET_EXEC) || MustBeRelocatable; + } +}; + +} // end namespace elf +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp new file mode 100644 index 000000000000..6b731abd9ed9 --- /dev/null +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp @@ -0,0 +1,441 @@ +//===- MachOLayoutBuilder.cpp -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MachOLayoutBuilder.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; +using namespace llvm::objcopy::macho; + +StringTableBuilder::Kind +MachOLayoutBuilder::getStringTableBuilderKind(const Object &O, bool Is64Bit) { + if (O.Header.FileType == MachO::HeaderFileType::MH_OBJECT) + return Is64Bit ? StringTableBuilder::MachO64 : StringTableBuilder::MachO; + return Is64Bit ? StringTableBuilder::MachO64Linked + : StringTableBuilder::MachOLinked; +} + +uint32_t MachOLayoutBuilder::computeSizeOfCmds() const { + uint32_t Size = 0; + for (const LoadCommand &LC : O.LoadCommands) { + const MachO::macho_load_command &MLC = LC.MachOLoadCommand; + auto cmd = MLC.load_command_data.cmd; + switch (cmd) { + case MachO::LC_SEGMENT: + Size += sizeof(MachO::segment_command) + + sizeof(MachO::section) * LC.Sections.size(); + continue; + case MachO::LC_SEGMENT_64: + Size += sizeof(MachO::segment_command_64) + + sizeof(MachO::section_64) * LC.Sections.size(); + continue; + } + + switch (cmd) { +#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ + case MachO::LCName: \ + Size += sizeof(MachO::LCStruct) + LC.Payload.size(); \ + break; +#include "llvm/BinaryFormat/MachO.def" +#undef HANDLE_LOAD_COMMAND + } + } + + return Size; +} + +void MachOLayoutBuilder::constructStringTable() { + for (std::unique_ptr &Sym : O.SymTable.Symbols) + StrTableBuilder.add(Sym->Name); + StrTableBuilder.finalize(); +} + +void MachOLayoutBuilder::updateSymbolIndexes() { + uint32_t Index = 0; + for (auto &Symbol : O.SymTable.Symbols) + Symbol->Index = Index++; +} + +// Updates the index and the number of local/external/undefined symbols. +void MachOLayoutBuilder::updateDySymTab(MachO::macho_load_command &MLC) { + assert(MLC.load_command_data.cmd == MachO::LC_DYSYMTAB); + // Make sure that nlist entries in the symbol table are sorted by the those + // types. 
The order is: local < defined external < undefined external. + assert(llvm::is_sorted(O.SymTable.Symbols, + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + bool AL = A->isLocalSymbol(), + BL = B->isLocalSymbol(); + if (AL != BL) + return AL; + return !AL && !A->isUndefinedSymbol() && + B->isUndefinedSymbol(); + }) && + "Symbols are not sorted by their types."); + + uint32_t NumLocalSymbols = 0; + auto Iter = O.SymTable.Symbols.begin(); + auto End = O.SymTable.Symbols.end(); + for (; Iter != End; ++Iter) { + if ((*Iter)->isExternalSymbol()) + break; + + ++NumLocalSymbols; + } + + uint32_t NumExtDefSymbols = 0; + for (; Iter != End; ++Iter) { + if ((*Iter)->isUndefinedSymbol()) + break; + + ++NumExtDefSymbols; + } + + MLC.dysymtab_command_data.ilocalsym = 0; + MLC.dysymtab_command_data.nlocalsym = NumLocalSymbols; + MLC.dysymtab_command_data.iextdefsym = NumLocalSymbols; + MLC.dysymtab_command_data.nextdefsym = NumExtDefSymbols; + MLC.dysymtab_command_data.iundefsym = NumLocalSymbols + NumExtDefSymbols; + MLC.dysymtab_command_data.nundefsym = + O.SymTable.Symbols.size() - (NumLocalSymbols + NumExtDefSymbols); +} + +// Recomputes and updates offset and size fields in load commands and sections +// since they could be modified. +uint64_t MachOLayoutBuilder::layoutSegments() { + auto HeaderSize = + Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); + const bool IsObjectFile = + O.Header.FileType == MachO::HeaderFileType::MH_OBJECT; + uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0; + for (LoadCommand &LC : O.LoadCommands) { + auto &MLC = LC.MachOLoadCommand; + StringRef Segname; + uint64_t SegmentVmAddr; + uint64_t SegmentVmSize; + switch (MLC.load_command_data.cmd) { + case MachO::LC_SEGMENT: + SegmentVmAddr = MLC.segment_command_data.vmaddr; + SegmentVmSize = MLC.segment_command_data.vmsize; + Segname = StringRef(MLC.segment_command_data.segname, + strnlen(MLC.segment_command_data.segname, + sizeof(MLC.segment_command_data.segname))); + break; + case MachO::LC_SEGMENT_64: + SegmentVmAddr = MLC.segment_command_64_data.vmaddr; + SegmentVmSize = MLC.segment_command_64_data.vmsize; + Segname = StringRef(MLC.segment_command_64_data.segname, + strnlen(MLC.segment_command_64_data.segname, + sizeof(MLC.segment_command_64_data.segname))); + break; + default: + continue; + } + + if (Segname == "__LINKEDIT") { + // We update the __LINKEDIT segment later (in layoutTail). + assert(LC.Sections.empty() && "__LINKEDIT segment has sections"); + LinkEditLoadCommand = &MLC; + continue; + } + + // Update file offsets and sizes of sections. + uint64_t SegOffset = Offset; + uint64_t SegFileSize = 0; + uint64_t VMSize = 0; + for (std::unique_ptr
&Sec : LC.Sections) { + assert(SegmentVmAddr <= Sec->Addr && + "Section's address cannot be smaller than Segment's one"); + uint32_t SectOffset = Sec->Addr - SegmentVmAddr; + if (IsObjectFile) { + if (!Sec->hasValidOffset()) { + Sec->Offset = 0; + } else { + uint64_t PaddingSize = + offsetToAlignment(SegFileSize, Align(1ull << Sec->Align)); + Sec->Offset = SegOffset + SegFileSize + PaddingSize; + Sec->Size = Sec->Content.size(); + SegFileSize += PaddingSize + Sec->Size; + } + } else { + if (!Sec->hasValidOffset()) { + Sec->Offset = 0; + } else { + Sec->Offset = SegOffset + SectOffset; + Sec->Size = Sec->Content.size(); + SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size); + } + } + VMSize = std::max(VMSize, SectOffset + Sec->Size); + } + + if (IsObjectFile) { + Offset += SegFileSize; + } else { + Offset = alignTo(Offset + SegFileSize, PageSize); + SegFileSize = alignTo(SegFileSize, PageSize); + // Use the original vmsize if the segment is __PAGEZERO. + VMSize = + Segname == "__PAGEZERO" ? SegmentVmSize : alignTo(VMSize, PageSize); + } + + switch (MLC.load_command_data.cmd) { + case MachO::LC_SEGMENT: + MLC.segment_command_data.cmdsize = + sizeof(MachO::segment_command) + + sizeof(MachO::section) * LC.Sections.size(); + MLC.segment_command_data.nsects = LC.Sections.size(); + MLC.segment_command_data.fileoff = SegOffset; + MLC.segment_command_data.vmsize = VMSize; + MLC.segment_command_data.filesize = SegFileSize; + break; + case MachO::LC_SEGMENT_64: + MLC.segment_command_64_data.cmdsize = + sizeof(MachO::segment_command_64) + + sizeof(MachO::section_64) * LC.Sections.size(); + MLC.segment_command_64_data.nsects = LC.Sections.size(); + MLC.segment_command_64_data.fileoff = SegOffset; + MLC.segment_command_64_data.vmsize = VMSize; + MLC.segment_command_64_data.filesize = SegFileSize; + break; + } + } + + return Offset; +} + +uint64_t MachOLayoutBuilder::layoutRelocations(uint64_t Offset) { + for (LoadCommand &LC : O.LoadCommands) + for (std::unique_ptr
&Sec : LC.Sections) { + Sec->RelOff = Sec->Relocations.empty() ? 0 : Offset; + Sec->NReloc = Sec->Relocations.size(); + Offset += sizeof(MachO::any_relocation_info) * Sec->NReloc; + } + + return Offset; +} + +Error MachOLayoutBuilder::layoutTail(uint64_t Offset) { + // If we are building the layout of an executable or dynamic library + // which does not have any segments other than __LINKEDIT, + // the Offset can be equal to zero by this time. It happens because of the + // convention that in such cases the file offsets specified by LC_SEGMENT + // start with zero (unlike the case of a relocatable object file). + const uint64_t HeaderSize = + Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); + assert((!(O.Header.FileType == MachO::HeaderFileType::MH_OBJECT) || + Offset >= HeaderSize + O.Header.SizeOfCmds) && + "Incorrect tail offset"); + Offset = std::max(Offset, HeaderSize + O.Header.SizeOfCmds); + + // The order of LINKEDIT elements is as follows: + // rebase info, binding info, weak binding info, lazy binding info, export + // trie, data-in-code, symbol table, indirect symbol table, symbol table + // strings, code signature. + uint64_t NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist); + uint64_t StartOfLinkEdit = Offset; + uint64_t StartOfRebaseInfo = StartOfLinkEdit; + uint64_t StartOfBindingInfo = StartOfRebaseInfo + O.Rebases.Opcodes.size(); + uint64_t StartOfWeakBindingInfo = StartOfBindingInfo + O.Binds.Opcodes.size(); + uint64_t StartOfLazyBindingInfo = + StartOfWeakBindingInfo + O.WeakBinds.Opcodes.size(); + uint64_t StartOfExportTrie = + StartOfLazyBindingInfo + O.LazyBinds.Opcodes.size(); + uint64_t StartOfFunctionStarts = StartOfExportTrie + O.Exports.Trie.size(); + uint64_t StartOfDyldExportsTrie = + StartOfFunctionStarts + O.FunctionStarts.Data.size(); + uint64_t StartOfChainedFixups = + StartOfDyldExportsTrie + O.ExportsTrie.Data.size(); + uint64_t StartOfDataInCode = + StartOfChainedFixups + O.ChainedFixups.Data.size(); + uint64_t StartOfLinkerOptimizationHint = + StartOfDataInCode + O.DataInCode.Data.size(); + uint64_t StartOfSymbols = + StartOfLinkerOptimizationHint + O.LinkerOptimizationHint.Data.size(); + uint64_t StartOfIndirectSymbols = + StartOfSymbols + NListSize * O.SymTable.Symbols.size(); + uint64_t StartOfSymbolStrings = + StartOfIndirectSymbols + + sizeof(uint32_t) * O.IndirectSymTable.Symbols.size(); + uint64_t StartOfCodeSignature = + StartOfSymbolStrings + StrTableBuilder.getSize(); + uint32_t CodeSignatureSize = 0; + if (O.CodeSignatureCommandIndex) { + StartOfCodeSignature = alignTo(StartOfCodeSignature, 16); + + // Note: These calculations are to be kept in sync with the same + // calculations performed in LLD's CodeSignatureSection. + const uint32_t AllHeadersSize = + alignTo(CodeSignature.FixedHeadersSize + OutputFileName.size() + 1, + CodeSignature.Align); + const uint32_t BlockCount = + (StartOfCodeSignature + CodeSignature.BlockSize - 1) / + CodeSignature.BlockSize; + const uint32_t Size = + alignTo(AllHeadersSize + BlockCount * CodeSignature.HashSize, + CodeSignature.Align); + + CodeSignature.StartOffset = StartOfCodeSignature; + CodeSignature.AllHeadersSize = AllHeadersSize; + CodeSignature.BlockCount = BlockCount; + CodeSignature.OutputFileName = OutputFileName; + CodeSignature.Size = Size; + CodeSignatureSize = Size; + } + uint64_t LinkEditSize = + StartOfCodeSignature + CodeSignatureSize - StartOfLinkEdit; + + // Now we have determined the layout of the contents of the __LINKEDIT + // segment. 
Update its load command. + if (LinkEditLoadCommand) { + MachO::macho_load_command *MLC = LinkEditLoadCommand; + switch (LinkEditLoadCommand->load_command_data.cmd) { + case MachO::LC_SEGMENT: + MLC->segment_command_data.cmdsize = sizeof(MachO::segment_command); + MLC->segment_command_data.fileoff = StartOfLinkEdit; + MLC->segment_command_data.vmsize = alignTo(LinkEditSize, PageSize); + MLC->segment_command_data.filesize = LinkEditSize; + break; + case MachO::LC_SEGMENT_64: + MLC->segment_command_64_data.cmdsize = sizeof(MachO::segment_command_64); + MLC->segment_command_64_data.fileoff = StartOfLinkEdit; + MLC->segment_command_64_data.vmsize = alignTo(LinkEditSize, PageSize); + MLC->segment_command_64_data.filesize = LinkEditSize; + break; + } + } + + for (LoadCommand &LC : O.LoadCommands) { + auto &MLC = LC.MachOLoadCommand; + auto cmd = MLC.load_command_data.cmd; + switch (cmd) { + case MachO::LC_CODE_SIGNATURE: + MLC.linkedit_data_command_data.dataoff = StartOfCodeSignature; + MLC.linkedit_data_command_data.datasize = CodeSignatureSize; + break; + case MachO::LC_SYMTAB: + MLC.symtab_command_data.symoff = StartOfSymbols; + MLC.symtab_command_data.nsyms = O.SymTable.Symbols.size(); + MLC.symtab_command_data.stroff = StartOfSymbolStrings; + MLC.symtab_command_data.strsize = StrTableBuilder.getSize(); + break; + case MachO::LC_DYSYMTAB: { + if (MLC.dysymtab_command_data.ntoc != 0 || + MLC.dysymtab_command_data.nmodtab != 0 || + MLC.dysymtab_command_data.nextrefsyms != 0 || + MLC.dysymtab_command_data.nlocrel != 0 || + MLC.dysymtab_command_data.nextrel != 0) + return createStringError(llvm::errc::not_supported, + "shared library is not yet supported"); + + if (!O.IndirectSymTable.Symbols.empty()) { + MLC.dysymtab_command_data.indirectsymoff = StartOfIndirectSymbols; + MLC.dysymtab_command_data.nindirectsyms = + O.IndirectSymTable.Symbols.size(); + } + + updateDySymTab(MLC); + break; + } + case MachO::LC_DATA_IN_CODE: + MLC.linkedit_data_command_data.dataoff = StartOfDataInCode; + MLC.linkedit_data_command_data.datasize = O.DataInCode.Data.size(); + break; + case MachO::LC_LINKER_OPTIMIZATION_HINT: + MLC.linkedit_data_command_data.dataoff = StartOfLinkerOptimizationHint; + MLC.linkedit_data_command_data.datasize = + O.LinkerOptimizationHint.Data.size(); + break; + case MachO::LC_FUNCTION_STARTS: + MLC.linkedit_data_command_data.dataoff = StartOfFunctionStarts; + MLC.linkedit_data_command_data.datasize = O.FunctionStarts.Data.size(); + break; + case MachO::LC_DYLD_CHAINED_FIXUPS: + MLC.linkedit_data_command_data.dataoff = StartOfChainedFixups; + MLC.linkedit_data_command_data.datasize = O.ChainedFixups.Data.size(); + break; + case MachO::LC_DYLD_EXPORTS_TRIE: + MLC.linkedit_data_command_data.dataoff = StartOfDyldExportsTrie; + MLC.linkedit_data_command_data.datasize = O.ExportsTrie.Data.size(); + break; + case MachO::LC_DYLD_INFO: + case MachO::LC_DYLD_INFO_ONLY: + MLC.dyld_info_command_data.rebase_off = + O.Rebases.Opcodes.empty() ? 0 : StartOfRebaseInfo; + MLC.dyld_info_command_data.rebase_size = O.Rebases.Opcodes.size(); + MLC.dyld_info_command_data.bind_off = + O.Binds.Opcodes.empty() ? 0 : StartOfBindingInfo; + MLC.dyld_info_command_data.bind_size = O.Binds.Opcodes.size(); + MLC.dyld_info_command_data.weak_bind_off = + O.WeakBinds.Opcodes.empty() ? 0 : StartOfWeakBindingInfo; + MLC.dyld_info_command_data.weak_bind_size = O.WeakBinds.Opcodes.size(); + MLC.dyld_info_command_data.lazy_bind_off = + O.LazyBinds.Opcodes.empty() ? 
0 : StartOfLazyBindingInfo; + MLC.dyld_info_command_data.lazy_bind_size = O.LazyBinds.Opcodes.size(); + MLC.dyld_info_command_data.export_off = + O.Exports.Trie.empty() ? 0 : StartOfExportTrie; + MLC.dyld_info_command_data.export_size = O.Exports.Trie.size(); + break; + // Note that LC_ENCRYPTION_INFO.cryptoff despite its name and the comment in + // is not an offset in the binary file, instead, it is a + // relative virtual address. At the moment modification of the __TEXT + // segment of executables isn't supported anyway (e.g. data in code entries + // are not recalculated). Moreover, in general + // LC_ENCRYPT_INFO/LC_ENCRYPTION_INFO_64 are nontrivial to update because + // without making additional assumptions (e.g. that the entire __TEXT + // segment should be encrypted) we do not know how to recalculate the + // boundaries of the encrypted part. For now just copy over these load + // commands until we encounter a real world usecase where + // LC_ENCRYPT_INFO/LC_ENCRYPTION_INFO_64 need to be adjusted. + case MachO::LC_ENCRYPTION_INFO: + case MachO::LC_ENCRYPTION_INFO_64: + case MachO::LC_LOAD_DYLINKER: + case MachO::LC_MAIN: + case MachO::LC_RPATH: + case MachO::LC_SEGMENT: + case MachO::LC_SEGMENT_64: + case MachO::LC_VERSION_MIN_MACOSX: + case MachO::LC_VERSION_MIN_IPHONEOS: + case MachO::LC_VERSION_MIN_TVOS: + case MachO::LC_VERSION_MIN_WATCHOS: + case MachO::LC_BUILD_VERSION: + case MachO::LC_ID_DYLIB: + case MachO::LC_LOAD_DYLIB: + case MachO::LC_LOAD_WEAK_DYLIB: + case MachO::LC_UUID: + case MachO::LC_SOURCE_VERSION: + case MachO::LC_THREAD: + case MachO::LC_UNIXTHREAD: + case MachO::LC_SUB_FRAMEWORK: + case MachO::LC_SUB_UMBRELLA: + case MachO::LC_SUB_CLIENT: + case MachO::LC_SUB_LIBRARY: + case MachO::LC_LINKER_OPTION: + // Nothing to update. + break; + default: + // Abort if it's unsupported in order to prevent corrupting the object. + return createStringError(llvm::errc::not_supported, + "unsupported load command (cmd=0x%x)", cmd); + } + } + + return Error::success(); +} + +Error MachOLayoutBuilder::layout() { + O.Header.NCmds = O.LoadCommands.size(); + O.Header.SizeOfCmds = computeSizeOfCmds(); + constructStringTable(); + updateSymbolIndexes(); + uint64_t Offset = layoutSegments(); + Offset = layoutRelocations(Offset); + return layoutTail(Offset); +} diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h new file mode 100644 index 000000000000..8d8716df22bb --- /dev/null +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h @@ -0,0 +1,97 @@ +//===- MachOLayoutBuilder.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H +#define LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H + +#include "MachOObject.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" + +namespace llvm { +namespace objcopy { +namespace macho { + +/// When MachO binaries include a LC_CODE_SIGNATURE load command, +/// the __LINKEDIT data segment will include a section corresponding +/// to the LC_CODE_SIGNATURE load command. This section serves as a signature +/// for the binary. Included in the CodeSignature section is a header followed +/// by a hash of the binary. 
If present, the CodeSignature section is the +/// last component of the binary. +struct CodeSignatureInfo { + // NOTE: These values are to be kept in sync with those in + // LLD's CodeSignatureSection class. + + static constexpr uint32_t Align = 16; + static constexpr uint8_t BlockSizeShift = 12; + // The binary is read in blocks of the following size. + static constexpr size_t BlockSize = (1 << BlockSizeShift); // 4 KiB + // For each block, a SHA256 hash (256 bits, 32 bytes) is written to + // the CodeSignature section. + static constexpr size_t HashSize = 256 / 8; + static constexpr size_t BlobHeadersSize = llvm::alignTo<8>( + sizeof(llvm::MachO::CS_SuperBlob) + sizeof(llvm::MachO::CS_BlobIndex)); + // The size of the entire header depends upon the filename the binary is being + // written to, but the rest of the header is fixed in size. + static constexpr uint32_t FixedHeadersSize = + BlobHeadersSize + sizeof(llvm::MachO::CS_CodeDirectory); + + // The offset relative to the start of the binary where + // the CodeSignature section should begin. + uint32_t StartOffset; + // The size of the entire header, output file name size included. + uint32_t AllHeadersSize; + // The number of blocks required to hash the binary. + uint32_t BlockCount; + StringRef OutputFileName; + // The size of the entire CodeSignature section, including both the header and + // hashes. + uint32_t Size; +}; + +class MachOLayoutBuilder { + Object &O; + bool Is64Bit; + StringRef OutputFileName; + uint64_t PageSize; + CodeSignatureInfo CodeSignature; + + // Points to the __LINKEDIT segment if it exists. + MachO::macho_load_command *LinkEditLoadCommand = nullptr; + StringTableBuilder StrTableBuilder; + + uint32_t computeSizeOfCmds() const; + void constructStringTable(); + void updateSymbolIndexes(); + void updateDySymTab(MachO::macho_load_command &MLC); + uint64_t layoutSegments(); + uint64_t layoutRelocations(uint64_t Offset); + Error layoutTail(uint64_t Offset); + + static StringTableBuilder::Kind getStringTableBuilderKind(const Object &O, + bool Is64Bit); + +public: + MachOLayoutBuilder(Object &O, bool Is64Bit, StringRef OutputFileName, + uint64_t PageSize) + : O(O), Is64Bit(Is64Bit), OutputFileName(OutputFileName), + PageSize(PageSize), + StrTableBuilder(getStringTableBuilderKind(O, Is64Bit)) {} + + // Recomputes and updates fields in the given object such as file offsets. + Error layout(); + + StringTableBuilder &getStringTableBuilder() { return StrTableBuilder; } + + const CodeSignatureInfo &getCodeSignature() const { return CodeSignature; } +}; + +} // end namespace macho +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp new file mode 100644 index 000000000000..5db03a4e268e --- /dev/null +++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp @@ -0,0 +1,550 @@ +//===- MachOObjcopy.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/MachO/MachOObjcopy.h"
+#include "Archive.h"
+#include "MachOReader.h"
+#include "MachOWriter.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/ObjCopy/MachO/MachOConfig.h"
+#include "llvm/ObjCopy/MultiFormatConfig.h"
+#include "llvm/ObjCopy/ObjCopy.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/MachOUniversalWriter.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::objcopy;
+using namespace llvm::objcopy::macho;
+using namespace llvm::object;
+
+using SectionPred = std::function<bool(const std::unique_ptr<Section> &Sec)>;
+using LoadCommandPred = std::function<bool(const LoadCommand &LC)>;
+
+#ifndef NDEBUG
+static bool isLoadCommandWithPayloadString(const LoadCommand &LC) {
+  // TODO: Add support for LC_REEXPORT_DYLIB, LC_LOAD_UPWARD_DYLIB and
+  // LC_LAZY_LOAD_DYLIB
+  return LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH ||
+         LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_ID_DYLIB ||
+         LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_LOAD_DYLIB ||
+         LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_LOAD_WEAK_DYLIB;
+}
+#endif
+
+static StringRef getPayloadString(const LoadCommand &LC) {
+  assert(isLoadCommandWithPayloadString(LC) &&
+         "unsupported load command encountered");
+
+  return StringRef(reinterpret_cast<const char *>(LC.Payload.data()),
+                   LC.Payload.size())
+      .rtrim('\0');
+}
+
+static Error removeSections(const CommonConfig &Config, Object &Obj) {
+  SectionPred RemovePred = [](const std::unique_ptr<Section> &) {
+    return false;
+  };
+
+  if (!Config.ToRemove.empty()) {
+    RemovePred = [&Config, RemovePred](const std::unique_ptr<Section> &Sec) {
+      return Config.ToRemove.matches(Sec->CanonicalName);
+    };
+  }
+
+  if (Config.StripAll || Config.StripDebug) {
+    // Remove all debug sections.
+    RemovePred = [RemovePred](const std::unique_ptr<Section> &Sec) {
+      if (Sec->Segname == "__DWARF")
+        return true;
+
+      return RemovePred(Sec);
+    };
+  }
+
+  if (!Config.OnlySection.empty()) {
+    // Overwrite RemovePred because --only-section takes priority.
+    RemovePred = [&Config](const std::unique_ptr<Section>
&Sec) { + return !Config.OnlySection.matches(Sec->CanonicalName); + }; + } + + return Obj.removeSections(RemovePred); +} + +static void markSymbols(const CommonConfig &, Object &Obj) { + // Symbols referenced from the indirect symbol table must not be removed. + for (IndirectSymbolEntry &ISE : Obj.IndirectSymTable.Symbols) + if (ISE.Symbol) + (*ISE.Symbol)->Referenced = true; +} + +static void updateAndRemoveSymbols(const CommonConfig &Config, + const MachOConfig &MachOConfig, + Object &Obj) { + for (SymbolEntry &Sym : Obj.SymTable) { + auto I = Config.SymbolsToRename.find(Sym.Name); + if (I != Config.SymbolsToRename.end()) + Sym.Name = std::string(I->getValue()); + } + + auto RemovePred = [&Config, &MachOConfig, + &Obj](const std::unique_ptr &N) { + if (N->Referenced) + return false; + if (MachOConfig.KeepUndefined && N->isUndefinedSymbol()) + return false; + if (N->n_desc & MachO::REFERENCED_DYNAMICALLY) + return false; + if (Config.StripAll) + return true; + if (Config.DiscardMode == DiscardType::All && !(N->n_type & MachO::N_EXT)) + return true; + // This behavior is consistent with cctools' strip. + if (MachOConfig.StripSwiftSymbols && + (Obj.Header.Flags & MachO::MH_DYLDLINK) && Obj.SwiftVersion && + *Obj.SwiftVersion && N->isSwiftSymbol()) + return true; + return false; + }; + + Obj.SymTable.removeSymbols(RemovePred); +} + +template +static void updateLoadCommandPayloadString(LoadCommand &LC, StringRef S) { + assert(isLoadCommandWithPayloadString(LC) && + "unsupported load command encountered"); + + uint32_t NewCmdsize = alignTo(sizeof(LCType) + S.size() + 1, 8); + + LC.MachOLoadCommand.load_command_data.cmdsize = NewCmdsize; + LC.Payload.assign(NewCmdsize - sizeof(LCType), 0); + std::copy(S.begin(), S.end(), LC.Payload.begin()); +} + +static LoadCommand buildRPathLoadCommand(StringRef Path) { + LoadCommand LC; + MachO::rpath_command RPathLC; + RPathLC.cmd = MachO::LC_RPATH; + RPathLC.path = sizeof(MachO::rpath_command); + RPathLC.cmdsize = alignTo(sizeof(MachO::rpath_command) + Path.size() + 1, 8); + LC.MachOLoadCommand.rpath_command_data = RPathLC; + LC.Payload.assign(RPathLC.cmdsize - sizeof(MachO::rpath_command), 0); + std::copy(Path.begin(), Path.end(), LC.Payload.begin()); + return LC; +} + +static Error processLoadCommands(const MachOConfig &MachOConfig, Object &Obj) { + // Remove RPaths. + DenseSet RPathsToRemove(MachOConfig.RPathsToRemove.begin(), + MachOConfig.RPathsToRemove.end()); + + LoadCommandPred RemovePred = [&RPathsToRemove, + &MachOConfig](const LoadCommand &LC) { + if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH) { + // When removing all RPaths we don't need to care + // about what it contains + if (MachOConfig.RemoveAllRpaths) + return true; + + StringRef RPath = getPayloadString(LC); + if (RPathsToRemove.count(RPath)) { + RPathsToRemove.erase(RPath); + return true; + } + } + return false; + }; + + if (Error E = Obj.removeLoadCommands(RemovePred)) + return E; + + // Emit an error if the Mach-O binary does not contain an rpath path name + // specified in -delete_rpath. + for (StringRef RPath : MachOConfig.RPathsToRemove) { + if (RPathsToRemove.count(RPath)) + return createStringError(errc::invalid_argument, + "no LC_RPATH load command with path: %s", + RPath.str().c_str()); + } + + DenseSet RPaths; + + // Get all existing RPaths. + for (LoadCommand &LC : Obj.LoadCommands) { + if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH) + RPaths.insert(getPayloadString(LC)); + } + + // Throw errors for invalid RPaths. 
+ for (const auto &OldNew : MachOConfig.RPathsToUpdate) { + StringRef Old = OldNew.getFirst(); + StringRef New = OldNew.getSecond(); + if (!RPaths.contains(Old)) + return createStringError(errc::invalid_argument, + "no LC_RPATH load command with path: " + Old); + if (RPaths.contains(New)) + return createStringError(errc::invalid_argument, + "rpath '" + New + + "' would create a duplicate load command"); + } + + // Update load commands. + for (LoadCommand &LC : Obj.LoadCommands) { + switch (LC.MachOLoadCommand.load_command_data.cmd) { + case MachO::LC_ID_DYLIB: + if (MachOConfig.SharedLibId) + updateLoadCommandPayloadString( + LC, *MachOConfig.SharedLibId); + break; + + case MachO::LC_RPATH: { + StringRef RPath = getPayloadString(LC); + StringRef NewRPath = MachOConfig.RPathsToUpdate.lookup(RPath); + if (!NewRPath.empty()) + updateLoadCommandPayloadString(LC, NewRPath); + break; + } + + // TODO: Add LC_REEXPORT_DYLIB, LC_LAZY_LOAD_DYLIB, and LC_LOAD_UPWARD_DYLIB + // here once llvm-objcopy supports them. + case MachO::LC_LOAD_DYLIB: + case MachO::LC_LOAD_WEAK_DYLIB: + StringRef InstallName = getPayloadString(LC); + StringRef NewInstallName = + MachOConfig.InstallNamesToUpdate.lookup(InstallName); + if (!NewInstallName.empty()) + updateLoadCommandPayloadString(LC, + NewInstallName); + break; + } + } + + // Add new RPaths. + for (StringRef RPath : MachOConfig.RPathToAdd) { + if (RPaths.contains(RPath)) + return createStringError(errc::invalid_argument, + "rpath '" + RPath + + "' would create a duplicate load command"); + RPaths.insert(RPath); + Obj.LoadCommands.push_back(buildRPathLoadCommand(RPath)); + } + + for (StringRef RPath : MachOConfig.RPathToPrepend) { + if (RPaths.contains(RPath)) + return createStringError(errc::invalid_argument, + "rpath '" + RPath + + "' would create a duplicate load command"); + + RPaths.insert(RPath); + Obj.LoadCommands.insert(Obj.LoadCommands.begin(), + buildRPathLoadCommand(RPath)); + } + + // Unlike appending rpaths, the indexes of subsequent load commands must + // be recalculated after prepending one. + if (!MachOConfig.RPathToPrepend.empty()) + Obj.updateLoadCommandIndexes(); + + // Remove any empty segments if required. + if (!MachOConfig.EmptySegmentsToRemove.empty()) { + auto RemovePred = [&MachOConfig](const LoadCommand &LC) { + if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_SEGMENT_64 || + LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_SEGMENT) { + return LC.Sections.empty() && + MachOConfig.EmptySegmentsToRemove.contains(*LC.getSegmentName()); + } + return false; + }; + if (Error E = Obj.removeLoadCommands(RemovePred)) + return E; + } + + return Error::success(); +} + +static Error dumpSectionToFile(StringRef SecName, StringRef Filename, + Object &Obj) { + for (LoadCommand &LC : Obj.LoadCommands) + for (const std::unique_ptr
&Sec : LC.Sections) {
+      if (Sec->CanonicalName == SecName) {
+        Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+            FileOutputBuffer::create(Filename, Sec->Content.size());
+        if (!BufferOrErr)
+          return BufferOrErr.takeError();
+        std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
+        llvm::copy(Sec->Content, Buf->getBufferStart());
+
+        if (Error E = Buf->commit())
+          return E;
+        return Error::success();
+      }
+    }
+
+  return createStringError(object_error::parse_failed, "section '%s' not found",
+                           SecName.str().c_str());
+}
+
+static Error addSection(const NewSectionInfo &NewSection, Object &Obj) {
+  std::pair<StringRef, StringRef> Pair = NewSection.SectionName.split(',');
+  StringRef TargetSegName = Pair.first;
+  Section Sec(TargetSegName, Pair.second);
+  Sec.Content =
+      Obj.NewSectionsContents.save(NewSection.SectionData->getBuffer());
+  Sec.Size = Sec.Content.size();
+
+  // Add the section into an existing segment.
+  for (LoadCommand &LC : Obj.LoadCommands) {
+    Optional<StringRef> SegName = LC.getSegmentName();
+    if (SegName && SegName == TargetSegName) {
+      uint64_t Addr = *LC.getSegmentVMAddr();
+      for (const std::unique_ptr<Section> &S : LC.Sections)
+        Addr = std::max(Addr, S->Addr + S->Size);
+      LC.Sections.push_back(std::make_unique<Section>(Sec));
+      LC.Sections.back()->Addr = Addr;
+      return Error::success();
+    }
+  }
+
+  // There's no segment named TargetSegName. Create a new load command and
+  // insert a new section into it.
+  LoadCommand &NewSegment =
+      Obj.addSegment(TargetSegName, alignTo(Sec.Size, 16384));
+  NewSegment.Sections.push_back(std::make_unique<Section>(Sec));
+  NewSegment.Sections.back()->Addr = *NewSegment.getSegmentVMAddr();
+  return Error::success();
+}
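addSection expects the "<segment>,<section>" spelling validated further below and either appends to an existing segment or synthesizes a new one sized to a 16 KiB multiple. A hedged usage sketch (the NewSectionInfo field names are assumed from the surrounding code, not defined in this hunk):

    // Hypothetical driver-side call adding __CUSTOM,__data from a buffer.
    NewSectionInfo Info;
    Info.SectionName = "__CUSTOM,__data"; // "<segment>,<section>"
    Info.SectionData = MemoryBuffer::getMemBuffer("payload");
    if (Error E = addSection(Info, Obj))  // static helper defined above
      return E;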
+
+static Expected<Section &> findSection(StringRef SecName, Object &O) {
+  StringRef SegName;
+  std::tie(SegName, SecName) = SecName.split(",");
+  auto FoundSeg =
+      llvm::find_if(O.LoadCommands, [SegName](const LoadCommand &LC) {
+        return LC.getSegmentName() == SegName;
+      });
+  if (FoundSeg == O.LoadCommands.end())
+    return createStringError(errc::invalid_argument,
+                             "could not find segment with name '%s'",
+                             SegName.str().c_str());
+  auto FoundSec = llvm::find_if(FoundSeg->Sections,
+                                [SecName](const std::unique_ptr<Section> &Sec) {
+                                  return Sec->Sectname == SecName;
+                                });
+  if (FoundSec == FoundSeg->Sections.end())
+    return createStringError(errc::invalid_argument,
+                             "could not find section with name '%s'",
+                             SecName.str().c_str());
+
+  assert(FoundSec->get()->CanonicalName == (SegName + "," + SecName).str());
+  return *FoundSec->get();
+}
+
+static Error updateSection(const NewSectionInfo &NewSection, Object &O) {
+  Expected<Section &> SecToUpdateOrErr = findSection(NewSection.SectionName, O);
+
+  if (!SecToUpdateOrErr)
+    return SecToUpdateOrErr.takeError();
+  Section &Sec = *SecToUpdateOrErr;
+
+  if (NewSection.SectionData->getBufferSize() > Sec.Size)
+    return createStringError(
+        errc::invalid_argument,
+        "new section cannot be larger than previous section");
+  Sec.Content = O.NewSectionsContents.save(NewSection.SectionData->getBuffer());
+  Sec.Size = Sec.Content.size();
+  return Error::success();
+}
+
+// isValidMachOCannonicalName returns success if Name is a MachO canonical name
+// ("<segment>,<section>") and lengths of both segment and section names are
+// valid.
+static Error isValidMachOCannonicalName(StringRef Name) {
+  if (Name.count(',') != 1)
+    return createStringError(errc::invalid_argument,
+                             "invalid section name '%s' (should be formatted "
+                             "as '<segment>,<section>')",
+                             Name.str().c_str());
+
+  std::pair<StringRef, StringRef> Pair = Name.split(',');
+  if (Pair.first.size() > 16)
+    return createStringError(errc::invalid_argument,
+                             "too long segment name: '%s'",
+                             Pair.first.str().c_str());
+  if (Pair.second.size() > 16)
+    return createStringError(errc::invalid_argument,
+                             "too long section name: '%s'",
+                             Pair.second.str().c_str());
+  return Error::success();
+}
+
+static Error handleArgs(const CommonConfig &Config,
+                        const MachOConfig &MachOConfig, Object &Obj) {
+  // Dump sections before add/remove for compatibility with GNU objcopy.
+  for (StringRef Flag : Config.DumpSection) {
+    StringRef SectionName;
+    StringRef FileName;
+    std::tie(SectionName, FileName) = Flag.split('=');
+    if (Error E = dumpSectionToFile(SectionName, FileName, Obj))
+      return E;
+  }
+
+  if (Error E = removeSections(Config, Obj))
+    return E;
+
+  // Mark symbols to determine which symbols are still needed.
+  if (Config.StripAll)
+    markSymbols(Config, Obj);
+
+  updateAndRemoveSymbols(Config, MachOConfig, Obj);
+
+  if (Config.StripAll)
+    for (LoadCommand &LC : Obj.LoadCommands)
+      for (std::unique_ptr<Section>
&Sec : LC.Sections) + Sec->Relocations.clear(); + + for (const NewSectionInfo &NewSection : Config.AddSection) { + if (Error E = isValidMachOCannonicalName(NewSection.SectionName)) + return E; + if (Error E = addSection(NewSection, Obj)) + return E; + } + + for (const NewSectionInfo &NewSection : Config.UpdateSection) { + if (Error E = isValidMachOCannonicalName(NewSection.SectionName)) + return E; + if (Error E = updateSection(NewSection, Obj)) + return E; + } + + if (Error E = processLoadCommands(MachOConfig, Obj)) + return E; + + return Error::success(); +} + +Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config, + const MachOConfig &MachOConfig, + object::MachOObjectFile &In, + raw_ostream &Out) { + MachOReader Reader(In); + Expected> O = Reader.create(); + if (!O) + return createFileError(Config.InputFilename, O.takeError()); + + if (O->get()->Header.FileType == MachO::HeaderFileType::MH_PRELOAD) + return createStringError(std::errc::not_supported, + "%s: MH_PRELOAD files are not supported", + Config.InputFilename.str().c_str()); + + if (Error E = handleArgs(Config, MachOConfig, **O)) + return createFileError(Config.InputFilename, std::move(E)); + + // Page size used for alignment of segment sizes in Mach-O executables and + // dynamic libraries. + uint64_t PageSize; + switch (In.getArch()) { + case Triple::ArchType::arm: + case Triple::ArchType::aarch64: + case Triple::ArchType::aarch64_32: + PageSize = 16384; + break; + default: + PageSize = 4096; + } + + MachOWriter Writer(**O, In.is64Bit(), In.isLittleEndian(), + sys::path::filename(Config.OutputFilename), PageSize, Out); + if (auto E = Writer.finalize()) + return E; + return Writer.write(); +} + +Error objcopy::macho::executeObjcopyOnMachOUniversalBinary( + const MultiFormatConfig &Config, const MachOUniversalBinary &In, + raw_ostream &Out) { + SmallVector, 2> Binaries; + SmallVector Slices; + for (const auto &O : In.objects()) { + Expected> ArOrErr = O.getAsArchive(); + if (ArOrErr) { + Expected> NewArchiveMembersOrErr = + createNewArchiveMembers(Config, **ArOrErr); + if (!NewArchiveMembersOrErr) + return NewArchiveMembersOrErr.takeError(); + auto Kind = (*ArOrErr)->kind(); + if (Kind == object::Archive::K_BSD) + Kind = object::Archive::K_DARWIN; + Expected> OutputBufferOrErr = + writeArchiveToBuffer(*NewArchiveMembersOrErr, + (*ArOrErr)->hasSymbolTable(), Kind, + Config.getCommonConfig().DeterministicArchives, + (*ArOrErr)->isThin()); + if (!OutputBufferOrErr) + return OutputBufferOrErr.takeError(); + Expected> BinaryOrErr = + object::createBinary(**OutputBufferOrErr); + if (!BinaryOrErr) + return BinaryOrErr.takeError(); + Binaries.emplace_back(std::move(*BinaryOrErr), + std::move(*OutputBufferOrErr)); + Slices.emplace_back(*cast(Binaries.back().getBinary()), + O.getCPUType(), O.getCPUSubType(), + O.getArchFlagName(), O.getAlign()); + continue; + } + // The methods getAsArchive, getAsObjectFile, getAsIRObject of the class + // ObjectForArch return an Error in case of the type mismatch. We need to + // check each in turn to see what kind of slice this is, so ignore errors + // produced along the way. 
+ consumeError(ArOrErr.takeError()); + + Expected> ObjOrErr = O.getAsObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return createStringError( + std::errc::invalid_argument, + "slice for '%s' of the universal Mach-O binary " + "'%s' is not a Mach-O object or an archive", + O.getArchFlagName().c_str(), + Config.getCommonConfig().InputFilename.str().c_str()); + } + std::string ArchFlagName = O.getArchFlagName(); + + SmallVector Buffer; + raw_svector_ostream MemStream(Buffer); + + Expected MachO = Config.getMachOConfig(); + if (!MachO) + return MachO.takeError(); + + if (Error E = executeObjcopyOnBinary(Config.getCommonConfig(), *MachO, + **ObjOrErr, MemStream)) + return E; + + auto MB = std::make_unique( + std::move(Buffer), ArchFlagName, /*RequiresNullTerminator=*/false); + Expected> BinaryOrErr = object::createBinary(*MB); + if (!BinaryOrErr) + return BinaryOrErr.takeError(); + Binaries.emplace_back(std::move(*BinaryOrErr), std::move(MB)); + Slices.emplace_back(*cast(Binaries.back().getBinary()), + O.getAlign()); + } + + if (Error Err = writeUniversalBinaryToStream(Slices, Out)) + return Err; + + return Error::success(); +} diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp new file mode 100644 index 000000000000..56f31e456198 --- /dev/null +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -0,0 +1,214 @@ +//===- MachOObject.cpp - Mach-O object file model ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MachOObject.h" +#include "llvm/ADT/SmallPtrSet.h" +#include + +using namespace llvm; +using namespace llvm::objcopy::macho; + +const SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) const { + assert(Index < Symbols.size() && "invalid symbol index"); + return Symbols[Index].get(); +} + +SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) { + return const_cast( + static_cast(this)->getSymbolByIndex(Index)); +} + +void SymbolTable::removeSymbols( + function_ref &)> ToRemove) { + llvm::erase_if(Symbols, ToRemove); +} + +void Object::updateLoadCommandIndexes() { + static constexpr char TextSegmentName[] = "__TEXT"; + // Update indices of special load commands + for (size_t Index = 0, Size = LoadCommands.size(); Index < Size; ++Index) { + LoadCommand &LC = LoadCommands[Index]; + switch (LC.MachOLoadCommand.load_command_data.cmd) { + case MachO::LC_CODE_SIGNATURE: + CodeSignatureCommandIndex = Index; + break; + case MachO::LC_SEGMENT: + if (StringRef(LC.MachOLoadCommand.segment_command_data.segname) == + TextSegmentName) + TextSegmentCommandIndex = Index; + break; + case MachO::LC_SEGMENT_64: + if (StringRef(LC.MachOLoadCommand.segment_command_64_data.segname) == + TextSegmentName) + TextSegmentCommandIndex = Index; + break; + case MachO::LC_SYMTAB: + SymTabCommandIndex = Index; + break; + case MachO::LC_DYSYMTAB: + DySymTabCommandIndex = Index; + break; + case MachO::LC_DYLD_INFO: + case MachO::LC_DYLD_INFO_ONLY: + DyLdInfoCommandIndex = Index; + break; + case MachO::LC_DATA_IN_CODE: + DataInCodeCommandIndex = Index; + break; + case MachO::LC_LINKER_OPTIMIZATION_HINT: + LinkerOptimizationHintCommandIndex = Index; + break; + case MachO::LC_FUNCTION_STARTS: + FunctionStartsCommandIndex = Index; + break; + case 
MachO::LC_DYLD_CHAINED_FIXUPS:
+      ChainedFixupsCommandIndex = Index;
+      break;
+    case MachO::LC_DYLD_EXPORTS_TRIE:
+      ExportsTrieCommandIndex = Index;
+      break;
+    }
+  }
+}
+
+Error Object::removeLoadCommands(
+    function_ref<bool(const LoadCommand &)> ToRemove) {
+  auto It = std::stable_partition(
+      LoadCommands.begin(), LoadCommands.end(),
+      [&](const LoadCommand &LC) { return !ToRemove(LC); });
+  LoadCommands.erase(It, LoadCommands.end());
+
+  updateLoadCommandIndexes();
+  return Error::success();
+}
+
+Error Object::removeSections(
+    function_ref<bool(const std::unique_ptr<Section> &)> ToRemove) {
+  DenseMap<uint32_t, const Section *> OldIndexToSection;
+  uint32_t NextSectionIndex = 1;
+  for (LoadCommand &LC : LoadCommands) {
+    auto It = std::stable_partition(
+        std::begin(LC.Sections), std::end(LC.Sections),
+        [&](const std::unique_ptr<Section> &Sec) { return !ToRemove(Sec); });
+    for (auto I = LC.Sections.begin(), End = It; I != End; ++I) {
+      OldIndexToSection[(*I)->Index] = I->get();
+      (*I)->Index = NextSectionIndex++;
+    }
+    LC.Sections.erase(It, LC.Sections.end());
+  }
+
+  auto IsDead = [&](const std::unique_ptr<SymbolEntry> &S) -> bool {
+    Optional<uint32_t> Section = S->section();
+    return (Section && !OldIndexToSection.count(*Section));
+  };
+
+  SmallPtrSet<const SymbolEntry *, 2> DeadSymbols;
+  for (const std::unique_ptr<SymbolEntry> &Sym : SymTable.Symbols)
+    if (IsDead(Sym))
+      DeadSymbols.insert(Sym.get());
+
+  for (const LoadCommand &LC : LoadCommands)
+    for (const std::unique_ptr<Section> &Sec : LC.Sections)
+      for (const RelocationInfo &R : Sec->Relocations)
+        if (R.Symbol && *R.Symbol && DeadSymbols.count(*R.Symbol))
+          return createStringError(std::errc::invalid_argument,
+                                   "symbol '%s' defined in section with index "
+                                   "'%u' cannot be removed because it is "
+                                   "referenced by a relocation in section '%s'",
+                                   (*R.Symbol)->Name.c_str(),
+                                   *((*R.Symbol)->section()),
+                                   Sec->CanonicalName.c_str());
+  SymTable.removeSymbols(IsDead);
+  for (std::unique_ptr<SymbolEntry> &S : SymTable.Symbols)
+    if (S->section())
+      S->n_sect = OldIndexToSection[S->n_sect]->Index;
+  return Error::success();
+}
+
+uint64_t Object::nextAvailableSegmentAddress() const {
+  uint64_t HeaderSize =
+      is64Bit() ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header);
+  uint64_t Addr = HeaderSize + Header.SizeOfCmds;
+  for (const LoadCommand &LC : LoadCommands) {
+    const MachO::macho_load_command &MLC = LC.MachOLoadCommand;
+    switch (MLC.load_command_data.cmd) {
+    case MachO::LC_SEGMENT:
+      Addr = std::max(Addr,
+                      static_cast<uint64_t>(MLC.segment_command_data.vmaddr) +
+                          MLC.segment_command_data.vmsize);
+      break;
+    case MachO::LC_SEGMENT_64:
+      Addr = std::max(Addr, MLC.segment_command_64_data.vmaddr +
+                                MLC.segment_command_64_data.vmsize);
+      break;
+    default:
+      continue;
+    }
+  }
+  return Addr;
+}
+
+template <typename SegmentType>
+static void
+constructSegment(SegmentType &Seg, llvm::MachO::LoadCommandType CmdType,
+                 StringRef SegName, uint64_t SegVMAddr, uint64_t SegVMSize) {
+  assert(SegName.size() <= sizeof(Seg.segname) && "too long segment name");
+  memset(&Seg, 0, sizeof(SegmentType));
+  Seg.cmd = CmdType;
+  strncpy(Seg.segname, SegName.data(), SegName.size());
+  Seg.maxprot |=
+      (MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE);
+  Seg.initprot |=
+      (MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE);
+  Seg.vmaddr = SegVMAddr;
+  Seg.vmsize = SegVMSize;
+}
+
+LoadCommand &Object::addSegment(StringRef SegName, uint64_t SegVMSize) {
+  LoadCommand LC;
+  const uint64_t SegVMAddr = nextAvailableSegmentAddress();
+  if (is64Bit())
+    constructSegment(LC.MachOLoadCommand.segment_command_64_data,
+                     MachO::LC_SEGMENT_64, SegName, SegVMAddr, SegVMSize);
+  else
+    constructSegment(LC.MachOLoadCommand.segment_command_data,
+                     MachO::LC_SEGMENT, SegName, SegVMAddr, SegVMSize);
+
+  LoadCommands.push_back(std::move(LC));
+  return LoadCommands.back();
+}
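Because addSegment derives the new segment's vmaddr from nextAvailableSegmentAddress, repeated calls lay segments out one after another past every existing vmaddr+vmsize range. A small usage sketch using only the APIs defined above (the segment name and size are invented for illustration):

    // Create a 16 KiB __CUSTOM segment placed past all existing segments.
    LoadCommand &NewSeg = Obj.addSegment("__CUSTOM", 16384);
    assert(NewSeg.getSegmentName() && *NewSeg.getSegmentName() == "__CUSTOM");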
+/// Extracts a segment name from a string which is possibly non-null-terminated.
+static StringRef extractSegmentName(const char *SegName) {
+  return StringRef(SegName,
+                   strnlen(SegName, sizeof(MachO::segment_command::segname)));
+}
+
+Optional<StringRef> LoadCommand::getSegmentName() const {
+  const MachO::macho_load_command &MLC = MachOLoadCommand;
+  switch (MLC.load_command_data.cmd) {
+  case MachO::LC_SEGMENT:
+    return extractSegmentName(MLC.segment_command_data.segname);
+  case MachO::LC_SEGMENT_64:
+    return extractSegmentName(MLC.segment_command_64_data.segname);
+  default:
+    return None;
+  }
+}
+
+Optional<uint64_t> LoadCommand::getSegmentVMAddr() const {
+  const MachO::macho_load_command &MLC = MachOLoadCommand;
+  switch (MLC.load_command_data.cmd) {
+  case MachO::LC_SEGMENT:
+    return MLC.segment_command_data.vmaddr;
+  case MachO::LC_SEGMENT_64:
+    return MLC.segment_command_64_data.vmaddr;
+  default:
+    return None;
+  }
+}
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
new file mode 100644
index 000000000000..df9261b76e4d
--- /dev/null
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -0,0 +1,374 @@
+//===- MachOObject.h - Mach-O object file model -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
+#define LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+namespace macho {
+
+struct MachHeader {
+  uint32_t Magic;
+  uint32_t CPUType;
+  uint32_t CPUSubType;
+  uint32_t FileType;
+  uint32_t NCmds;
+  uint32_t SizeOfCmds;
+  uint32_t Flags;
+  uint32_t Reserved = 0;
+};
+
+struct RelocationInfo;
+struct Section {
+  uint32_t Index;
+  std::string Segname;
+  std::string Sectname;
+  // CanonicalName is a string formatted as "<Segname>,<Sectname>".
+  std::string CanonicalName;
+  uint64_t Addr = 0;
+  uint64_t Size = 0;
+  // Offset in the input file.
+  Optional<uint32_t> OriginalOffset;
+  uint32_t Offset = 0;
+  uint32_t Align = 0;
+  uint32_t RelOff = 0;
+  uint32_t NReloc = 0;
+  uint32_t Flags = 0;
+  uint32_t Reserved1 = 0;
+  uint32_t Reserved2 = 0;
+  uint32_t Reserved3 = 0;
+  StringRef Content;
+  std::vector<RelocationInfo> Relocations;
+
+  Section(StringRef SegName, StringRef SectName)
+      : Segname(std::string(SegName)), Sectname(std::string(SectName)),
+        CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {}
+
+  Section(StringRef SegName, StringRef SectName, StringRef Content)
+      : Segname(std::string(SegName)), Sectname(std::string(SectName)),
+        CanonicalName((Twine(SegName) + Twine(',') + SectName).str()),
+        Content(Content) {}
+
+  MachO::SectionType getType() const {
+    return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
+  }
+
+  bool isVirtualSection() const {
+    return (getType() == MachO::S_ZEROFILL ||
+            getType() == MachO::S_GB_ZEROFILL ||
+            getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
+  }
+
+  bool hasValidOffset() const {
+    return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
+  }
+};
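The constructors above derive CanonicalName by joining the two names with a comma, matching the "<segment>,<section>" spelling used throughout the objcopy options. A tiny sketch (names invented for illustration):

    Section Text("__TEXT", "__text");
    assert(Text.CanonicalName == "__TEXT,__text"); // joined in the ctor
    assert(!Text.isVirtualSection());              // Flags default to S_REGULAR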
+
+struct LoadCommand {
+  // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
+  // and it is a union of all the structs corresponding to various load
+  // commands.
+  MachO::macho_load_command MachOLoadCommand;
+
+  // The raw content of the payload of the load command (located right after
+  // the corresponding struct). In some cases it is either empty or can be
+  // copied-over without digging into its structure.
+  std::vector<uint8_t> Payload;
+
+  // Some load commands can contain (inside the payload) an array of sections,
+  // though the contents of the sections are stored separately. The struct
+  // Section describes only sections' metadata and where to find the
+  // corresponding content inside the binary.
+  std::vector<std::unique_ptr<Section>> Sections;
+
+  // Returns the segment name if the load command is a segment command.
+  Optional<StringRef> getSegmentName() const;
+
+  // Returns the segment vm address if the load command is a segment command.
+  Optional<uint64_t> getSegmentVMAddr() const;
+};
+
+// Symbol information. Fields that start with "n_" mirror the nlist fields of
+// the same name.
+struct SymbolEntry {
+  std::string Name;
+  bool Referenced = false;
+  uint32_t Index;
+  uint8_t n_type;
+  uint8_t n_sect;
+  uint16_t n_desc;
+  uint64_t n_value;
+
+  bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
+
+  bool isLocalSymbol() const { return !isExternalSymbol(); }
+
+  bool isUndefinedSymbol() const {
+    return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
+  }
+
+  bool isSwiftSymbol() const {
+    return StringRef(Name).startswith("_$s") ||
+           StringRef(Name).startswith("_$S");
+  }
+
+  Optional<uint32_t> section() const {
+    return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
+  }
+};
+
+/// The location of the symbol table inside the binary is described by the
+/// LC_SYMTAB load command.
+struct SymbolTable {
+  std::vector<std::unique_ptr<SymbolEntry>> Symbols;
+
+  using iterator = pointee_iterator<
+      std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
+
+  iterator begin() const { return iterator(Symbols.begin()); }
+  iterator end() const { return iterator(Symbols.end()); }
+
+  const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
+  SymbolEntry *getSymbolByIndex(uint32_t Index);
+  void removeSymbols(
+      function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
+};
+
+struct IndirectSymbolEntry {
+  // The original value in an indirect symbol table. Higher bits encode extra
+  // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
+  uint32_t OriginalIndex;
+  /// The Symbol referenced by this entry.
It's None if the index is + /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. + Optional Symbol; + + IndirectSymbolEntry(uint32_t OriginalIndex, Optional Symbol) + : OriginalIndex(OriginalIndex), Symbol(Symbol) {} +}; + +struct IndirectSymbolTable { + std::vector Symbols; +}; + +/// The location of the string table inside the binary is described by LC_SYMTAB +/// load command. +struct StringTable { + std::vector Strings; +}; + +struct RelocationInfo { + // The referenced symbol entry. Set if !Scattered && Extern. + Optional Symbol; + // The referenced section. Set if !Scattered && !Extern. + Optional Sec; + // True if Info is a scattered_relocation_info. + bool Scattered; + // True if the type is an ADDEND. r_symbolnum holds the addend instead of a + // symbol index. + bool IsAddend; + // True if the r_symbolnum points to a section number (i.e. r_extern=0). + bool Extern; + MachO::any_relocation_info Info; + + unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) { + if (IsLittleEndian) + return Info.r_word1 & 0xffffff; + return Info.r_word1 >> 8; + } + + void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) { + assert(SymbolNum < (1 << 24) && "SymbolNum out of range"); + if (IsLittleEndian) + Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum; + else + Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8); + } +}; + +/// The location of the rebase info inside the binary is described by +/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at +/// an address different from its preferred address. The rebase information is +/// a stream of byte sized opcodes whose symbolic names start with +/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: +/// +/// The opcodes are a compressed way to encode the table by only +/// encoding when a column changes. In addition simple patterns +/// like "every n'th offset for m times" can be encoded in a few +/// bytes. +struct RebaseInfo { + // At the moment we do not parse this info (and it is simply copied over), + // but the proper support will be added later. + ArrayRef Opcodes; +}; + +/// The location of the bind info inside the binary is described by +/// LC_DYLD_INFO load command. Dyld binds an image during the loading process, +/// if the image requires any pointers to be initialized to symbols in other +/// images. The bind information is a stream of byte sized opcodes whose +/// symbolic names start with BIND_OPCODE_. Conceptually the bind information is +/// a table of tuples: The opcodes are a compressed way to encode the table by +/// only encoding when a column changes. In addition simple patterns like for +/// runs of pointers initialized to the same value can be encoded in a few +/// bytes. +struct BindInfo { + // At the moment we do not parse this info (and it is simply copied over), + // but the proper support will be added later. + ArrayRef Opcodes; +}; + +/// The location of the weak bind info inside the binary is described by +/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols +/// so that all images in the process use the same copy of some code/data. This +/// step is done after binding. The content of the weak_bind info is an opcode +/// stream like the bind_info. But it is sorted alphabetically by symbol name. +/// This enable dyld to walk all images with weak binding information in order +/// and look for collisions. If there are no collisions, dyld does no updating. 
+/// The location of the rebase info inside the binary is described by the
+/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
+/// an address different from its preferred address. The rebase information is
+/// a stream of byte sized opcodes whose symbolic names start with
+/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
+///    <seg-index, seg-offset, type>
+/// The opcodes are a compressed way to encode the table by only
+/// encoding when a column changes. In addition simple patterns
+/// like "every n'th offset for m times" can be encoded in a few
+/// bytes.
+struct RebaseInfo {
+  // At the moment we do not parse this info (and it is simply copied over),
+  // but the proper support will be added later.
+  ArrayRef<uint8_t> Opcodes;
+};
+
+/// The location of the bind info inside the binary is described by the
+/// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
+/// if the image requires any pointers to be initialized to symbols in other
+/// images. The bind information is a stream of byte sized opcodes whose
+/// symbolic names start with BIND_OPCODE_. Conceptually the bind information
+/// is a table of tuples:
+///    <seg-index, seg-offset, type, symbol-library-ordinal,
+///     symbol-name, addend>
+/// The opcodes are a compressed way to encode the table by only encoding when
+/// a column changes. In addition simple patterns like runs of pointers
+/// initialized to the same value can be encoded in a few bytes.
+struct BindInfo {
+  // At the moment we do not parse this info (and it is simply copied over),
+  // but the proper support will be added later.
+  ArrayRef<uint8_t> Opcodes;
+};
+
+/// The location of the weak bind info inside the binary is described by the
+/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
+/// so that all images in the process use the same copy of some code/data. This
+/// step is done after binding. The content of the weak_bind info is an opcode
+/// stream like the bind_info. But it is sorted alphabetically by symbol name.
+/// This enables dyld to walk all images with weak binding information in order
+/// and look for collisions. If there are no collisions, dyld does no updating.
+/// That means that some fixups are also encoded in the bind_info. For
+/// instance, all calls to "operator new" are first bound to libstdc++.dylib
+/// using the information in bind_info. Then if some image overrides operator
+/// new, that is detected when the weak_bind information is processed and the
+/// call to operator new is then rebound.
+struct WeakBindInfo {
+  // At the moment we do not parse this info (and it is simply copied over),
+  // but the proper support will be added later.
+  ArrayRef<uint8_t> Opcodes;
+};
+
+/// The location of the lazy bind info inside the binary is described by the
+/// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
+/// bound immediately. Instead they can be lazily bound on first use. The
+/// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
+/// use is that dyld ignores the lazy_bind section when loading an image.
+/// Instead the static linker arranged for the lazy pointer to initially point
+/// to a helper function which pushes the offset into the lazy_bind area for the
+/// symbol needing to be bound, then jumps to dyld which simply adds the offset
+/// to lazy_bind_off to get the information on what to bind.
+struct LazyBindInfo {
+  ArrayRef<uint8_t> Opcodes;
+};
+
+/// The location of the export info inside the binary is described by the
+/// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
+/// trie. This is a compact representation that factors out common prefixes. It
+/// also reduces LINKEDIT pages in RAM because it encodes all information (name,
+/// address, flags) in one small, contiguous range. The export area is a stream
+/// of nodes. The first node sequentially is the start node for the trie. Nodes
+/// for a symbol start with a uleb128 that is the length of the exported symbol
+/// information for the string so far. If there is no exported symbol, the node
+/// starts with a zero byte. If there is exported info, it follows the length.
+/// First is a uleb128 containing flags. Normally, it is followed by
+/// a uleb128 encoded offset which is the location of the content named
+/// by the symbol from the mach_header for the image. If the flags
+/// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
+/// a uleb128 encoded library ordinal, then a zero terminated
+/// UTF8 string. If the string is zero length, then the symbol
+/// is re-exported from the specified dylib with the same name.
+/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
+/// the flags are two uleb128s: the stub offset and the resolver offset.
+/// The stub is used by non-lazy pointers. The resolver is used
+/// by lazy pointers and must be called to get the actual address to use.
+/// After the optional exported symbol information is a byte giving
+/// the number of edges (0-255) leaving this node, followed by each edge.
+/// Each edge is a zero terminated UTF8 string of the additional chars
+/// in the symbol, followed by a uleb128 offset for the node that
+/// edge points to.
+struct ExportInfo {
+  ArrayRef<uint8_t> Trie;
+};
+struct LinkData {
+  ArrayRef<uint8_t> Data;
+};
+
+struct Object {
+  MachHeader Header;
+  std::vector<LoadCommand> LoadCommands;
+
+  SymbolTable SymTable;
+  StringTable StrTable;
+
+  RebaseInfo Rebases;
+  BindInfo Binds;
+  WeakBindInfo WeakBinds;
+  LazyBindInfo LazyBinds;
+  ExportInfo Exports;
+  IndirectSymbolTable IndirectSymTable;
+  LinkData DataInCode;
+  LinkData LinkerOptimizationHint;
+  LinkData FunctionStarts;
+  LinkData ExportsTrie;
+  LinkData ChainedFixups;
+
+  Optional<uint32_t> SwiftVersion;
+
+  /// The index of the LC_CODE_SIGNATURE load command if present.
+  Optional<size_t> CodeSignatureCommandIndex;
+  /// The index of the LC_SYMTAB load command if present.
+  Optional<size_t> SymTabCommandIndex;
+  /// The index of the LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if
+  /// present.
+  Optional<size_t> DyLdInfoCommandIndex;
+  /// The index of the LC_DYSYMTAB load command if present.
+  Optional<size_t> DySymTabCommandIndex;
+  /// The index of the LC_DATA_IN_CODE load command if present.
+  Optional<size_t> DataInCodeCommandIndex;
+  /// The index of the LC_LINKER_OPTIMIZATION_HINT load command if present.
+  Optional<size_t> LinkerOptimizationHintCommandIndex;
+  /// The index of the LC_FUNCTION_STARTS load command if present.
+  Optional<size_t> FunctionStartsCommandIndex;
+  /// The index of the LC_DYLD_CHAINED_FIXUPS load command if present.
+  Optional<size_t> ChainedFixupsCommandIndex;
+  /// The index of the LC_DYLD_EXPORTS_TRIE load command if present.
+  Optional<size_t> ExportsTrieCommandIndex;
+  /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
+  /// corresponding to the __TEXT segment.
+  Optional<size_t> TextSegmentCommandIndex;
+
+  BumpPtrAllocator Alloc;
+  StringSaver NewSectionsContents;
+
+  Object() : NewSectionsContents(Alloc) {}
+
+  Error removeSections(
+      function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
+
+  Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
+
+  void updateLoadCommandIndexes();
+
+  /// Creates a new segment load command in the object and returns a reference
+  /// to the newly created load command. The caller should verify that SegName
+  /// is not too long (SegName.size() should be less than or equal to 16).
+  LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize);
+
+  bool is64Bit() const {
+    return Header.Magic == MachO::MH_MAGIC_64 ||
+           Header.Magic == MachO::MH_CIGAM_64;
+  }
+
+  uint64_t nextAvailableSegmentAddress() const;
+};
+
+} // end namespace macho
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
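To illustrate how the mutation API of Object is meant to be combined, here is a hedged sketch assuming the declarations above are in scope (inside namespace llvm::objcopy::macho); the section and segment names are invented for the example:

    // Drop one DWARF section by its canonical "<segment>,<section>" name,
    // then append a fresh segment sized to one 16 KiB page.
    static Error exampleTransform(Object &O) {
      if (Error E = O.removeSections([](const std::unique_ptr<Section> &Sec) {
            return Sec->CanonicalName == "__DWARF,__debug_info";
          }))
        return E;
      // addSegment requires SegName.size() <= 16.
      LoadCommand &NewSeg = O.addSegment("__EXAMPLE", /*SegVMSize=*/0x4000);
      (void)NewSeg;
      return Error::success();
    }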
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
new file mode 100644
index 000000000000..94459a436094
--- /dev/null
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -0,0 +1,374 @@
+//===- MachOReader.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MachOReader.h"
+#include "MachOObject.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Support/Errc.h"
+#include <memory>
+
+using namespace llvm;
+using namespace llvm::objcopy;
+using namespace llvm::objcopy::macho;
+
+void MachOReader::readHeader(Object &O) const {
+  O.Header.Magic = MachOObj.getHeader().magic;
+  O.Header.CPUType = MachOObj.getHeader().cputype;
+  O.Header.CPUSubType = MachOObj.getHeader().cpusubtype;
+  O.Header.FileType = MachOObj.getHeader().filetype;
+  O.Header.NCmds = MachOObj.getHeader().ncmds;
+  O.Header.SizeOfCmds = MachOObj.getHeader().sizeofcmds;
+  O.Header.Flags = MachOObj.getHeader().flags;
+}
+
+template <typename SectionType>
+static Section constructSectionCommon(const SectionType &Sec, uint32_t Index) {
+  StringRef SegName(Sec.segname, strnlen(Sec.segname, sizeof(Sec.segname)));
+  StringRef SectName(Sec.sectname, strnlen(Sec.sectname, sizeof(Sec.sectname)));
+  Section S(SegName, SectName);
+  S.Index = Index;
+  S.Addr = Sec.addr;
+  S.Size = Sec.size;
+  S.OriginalOffset = Sec.offset;
+  S.Align = Sec.align;
+  S.RelOff = Sec.reloff;
+  S.NReloc = Sec.nreloc;
+  S.Flags = Sec.flags;
+  S.Reserved1 = Sec.reserved1;
+  S.Reserved2 = Sec.reserved2;
+  S.Reserved3 = 0;
+  return S;
+}
+
+Section constructSection(const MachO::section &Sec, uint32_t Index) {
+  return constructSectionCommon(Sec, Index);
+}
+
+Section constructSection(const MachO::section_64 &Sec, uint32_t Index) {
+  Section S = constructSectionCommon(Sec, Index);
+  S.Reserved3 = Sec.reserved3;
+  return S;
+}
+
+template <typename SectionType, typename SegmentType>
+Expected<std::vector<std::unique_ptr<Section>>> static extractSections(
+    const object::MachOObjectFile::LoadCommandInfo &LoadCmd,
+    const object::MachOObjectFile &MachOObj, uint32_t &NextSectionIndex) {
+  std::vector<std::unique_ptr<Section>> Sections;
+  for (auto Curr = reinterpret_cast<const SectionType *>(LoadCmd.Ptr +
+                                                         sizeof(SegmentType)),
+            End = reinterpret_cast<const SectionType *>(LoadCmd.Ptr +
+                                                        LoadCmd.C.cmdsize);
+       Curr < End; ++Curr) {
+    SectionType Sec;
+    memcpy((void *)&Sec, Curr, sizeof(SectionType));
+
+    if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost)
+      MachO::swapStruct(Sec);
+
+    Sections.push_back(
+        std::make_unique<Section>(constructSection(Sec, NextSectionIndex)));
+
+    Section &S = *Sections.back();
+
+    Expected<object::SectionRef> SecRef =
+        MachOObj.getSection(NextSectionIndex++);
+    if (!SecRef)
+      return SecRef.takeError();
+
+    Expected<ArrayRef<uint8_t>> Data =
+        MachOObj.getSectionContents(SecRef->getRawDataRefImpl());
+    if (!Data)
+      return Data.takeError();
+
+    S.Content =
+        StringRef(reinterpret_cast<const char *>(Data->data()), Data->size());
+
+    const uint32_t CPUType = MachOObj.getHeader().cputype;
+    S.Relocations.reserve(S.NReloc);
+    for (auto RI = MachOObj.section_rel_begin(SecRef->getRawDataRefImpl()),
+              RE = MachOObj.section_rel_end(SecRef->getRawDataRefImpl());
+         RI != RE; ++RI) {
+      RelocationInfo R;
+      R.Symbol = nullptr; // We'll fill this field later.
+      R.Info = MachOObj.getRelocation(RI->getRawDataRefImpl());
+      R.Scattered = MachOObj.isRelocationScattered(R.Info);
+      unsigned Type = MachOObj.getAnyRelocationType(R.Info);
+      // TODO: Support CPU_TYPE_ARM.
+      R.IsAddend = !R.Scattered && (CPUType == MachO::CPU_TYPE_ARM64 &&
+                                    Type == MachO::ARM64_RELOC_ADDEND);
+      R.Extern = !R.Scattered && MachOObj.getPlainRelocationExternal(R.Info);
+      S.Relocations.push_back(R);
+    }
+
+    assert(S.NReloc == S.Relocations.size() &&
+           "Incorrect number of relocations");
+  }
+  return std::move(Sections);
+}
+Error MachOReader::readLoadCommands(Object &O) const {
+  // For MachO, section indices start from 1.
+  uint32_t NextSectionIndex = 1;
+  static constexpr char TextSegmentName[] = "__TEXT";
+  for (auto LoadCmd : MachOObj.load_commands()) {
+    LoadCommand LC;
+    switch (LoadCmd.C.cmd) {
+    case MachO::LC_CODE_SIGNATURE:
+      O.CodeSignatureCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_SEGMENT:
+      // LoadCmd.Ptr might not be aligned temporarily as
+      // MachO::segment_command requires, but the segname char pointer does
+      // not have alignment restrictions.
+      if (StringRef(reinterpret_cast<const char *>(
+              LoadCmd.Ptr + offsetof(MachO::segment_command, segname))) ==
+          TextSegmentName)
+        O.TextSegmentCommandIndex = O.LoadCommands.size();
+
+      if (Expected<std::vector<std::unique_ptr<Section>>> Sections =
+              extractSections<MachO::section, MachO::segment_command>(
+                  LoadCmd, MachOObj, NextSectionIndex))
+        LC.Sections = std::move(*Sections);
+      else
+        return Sections.takeError();
+      break;
+    case MachO::LC_SEGMENT_64:
+      // LoadCmd.Ptr might not be aligned temporarily as
+      // MachO::segment_command_64 requires, but the segname char pointer does
+      // not have alignment restrictions.
+      if (StringRef(reinterpret_cast<const char *>(
+              LoadCmd.Ptr + offsetof(MachO::segment_command_64, segname))) ==
+          TextSegmentName)
+        O.TextSegmentCommandIndex = O.LoadCommands.size();
+
+      if (Expected<std::vector<std::unique_ptr<Section>>> Sections =
+              extractSections<MachO::section_64, MachO::segment_command_64>(
+                  LoadCmd, MachOObj, NextSectionIndex))
+        LC.Sections = std::move(*Sections);
+      else
+        return Sections.takeError();
+      break;
+    case MachO::LC_SYMTAB:
+      O.SymTabCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_DYSYMTAB:
+      O.DySymTabCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_DYLD_INFO:
+    case MachO::LC_DYLD_INFO_ONLY:
+      O.DyLdInfoCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_DATA_IN_CODE:
+      O.DataInCodeCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_LINKER_OPTIMIZATION_HINT:
+      O.LinkerOptimizationHintCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_FUNCTION_STARTS:
+      O.FunctionStartsCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_DYLD_EXPORTS_TRIE:
+      O.ExportsTrieCommandIndex = O.LoadCommands.size();
+      break;
+    case MachO::LC_DYLD_CHAINED_FIXUPS:
+      O.ChainedFixupsCommandIndex = O.LoadCommands.size();
+      break;
+    }
+#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
+  case MachO::LCName:                                                          \
+    memcpy((void *)&(LC.MachOLoadCommand.LCStruct##_data), LoadCmd.Ptr,        \
+           sizeof(MachO::LCStruct));                                           \
+    if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost)                  \
+      MachO::swapStruct(LC.MachOLoadCommand.LCStruct##_data);                  \
+    if (LoadCmd.C.cmdsize > sizeof(MachO::LCStruct))                           \
+      LC.Payload = ArrayRef<uint8_t>(                                          \
+          reinterpret_cast<uint8_t *>(const_cast<char *>(LoadCmd.Ptr)) +       \
+              sizeof(MachO::LCStruct),                                         \
+          LoadCmd.C.cmdsize - sizeof(MachO::LCStruct));                        \
+    break;
+
+    switch (LoadCmd.C.cmd) {
+    default:
+      memcpy((void *)&(LC.MachOLoadCommand.load_command_data), LoadCmd.Ptr,
+             sizeof(MachO::load_command));
+      if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost)
+        MachO::swapStruct(LC.MachOLoadCommand.load_command_data);
+      if (LoadCmd.C.cmdsize > sizeof(MachO::load_command))
+        LC.Payload = ArrayRef<uint8_t>(
+            reinterpret_cast<uint8_t *>(const_cast<char *>(LoadCmd.Ptr)) +
+                sizeof(MachO::load_command),
+            LoadCmd.C.cmdsize - sizeof(MachO::load_command));
+      break;
+#include "llvm/BinaryFormat/MachO.def"
+    }
+    O.LoadCommands.push_back(std::move(LC));
+  }
+  return Error::success();
+}
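For orientation, the HANDLE_LOAD_COMMAND stamp above, combined with the #include of MachO.def, generates one case per known load command. For LC_UUID, which MachO.def maps to MachO::uuid_command, the expansion is roughly the following (an illustrative reconstruction, not literal patch content):

    case MachO::LC_UUID:
      memcpy((void *)&(LC.MachOLoadCommand.uuid_command_data), LoadCmd.Ptr,
             sizeof(MachO::uuid_command));
      if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost)
        MachO::swapStruct(LC.MachOLoadCommand.uuid_command_data);
      if (LoadCmd.C.cmdsize > sizeof(MachO::uuid_command))
        LC.Payload = ArrayRef<uint8_t>(
            reinterpret_cast<uint8_t *>(const_cast<char *>(LoadCmd.Ptr)) +
                sizeof(MachO::uuid_command),
            LoadCmd.C.cmdsize - sizeof(MachO::uuid_command));
      break;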
+template <typename nlist_t>
+SymbolEntry constructSymbolEntry(StringRef StrTable, const nlist_t &nlist) {
+  assert(nlist.n_strx < StrTable.size() &&
+         "n_strx exceeds the size of the string table");
+  SymbolEntry SE;
+  SE.Name = StringRef(StrTable.data() + nlist.n_strx).str();
+  SE.n_type = nlist.n_type;
+  SE.n_sect = nlist.n_sect;
+  SE.n_desc = nlist.n_desc;
+  SE.n_value = nlist.n_value;
+  return SE;
+}
+
+void MachOReader::readSymbolTable(Object &O) const {
+  StringRef StrTable = MachOObj.getStringTableData();
+  for (auto Symbol : MachOObj.symbols()) {
+    SymbolEntry SE =
+        (MachOObj.is64Bit()
+             ? constructSymbolEntry(StrTable, MachOObj.getSymbol64TableEntry(
+                                                  Symbol.getRawDataRefImpl()))
+             : constructSymbolEntry(StrTable, MachOObj.getSymbolTableEntry(
+                                                  Symbol.getRawDataRefImpl())));
+
+    O.SymTable.Symbols.push_back(std::make_unique<SymbolEntry>(SE));
+  }
+}
+
+void MachOReader::setSymbolInRelocationInfo(Object &O) const {
+  std::vector<const Section *> Sections;
+  for (auto &LC : O.LoadCommands)
+    for (std::unique_ptr<Section> &Sec : LC.Sections)
+      Sections.push_back(Sec.get());
+
+  for (LoadCommand &LC : O.LoadCommands)
+    for (std::unique_ptr<Section> &Sec : LC.Sections)
+      for (auto &Reloc : Sec->Relocations)
+        if (!Reloc.Scattered && !Reloc.IsAddend) {
+          const uint32_t SymbolNum =
+              Reloc.getPlainRelocationSymbolNum(MachOObj.isLittleEndian());
+          if (Reloc.Extern) {
+            Reloc.Symbol = O.SymTable.getSymbolByIndex(SymbolNum);
+          } else {
+            // FIXME: Refactor error handling in MachOReader and report an
+            // error if we encounter an invalid relocation.
+            assert(SymbolNum >= 1 && SymbolNum <= Sections.size() &&
+                   "Invalid section index.");
+            Reloc.Sec = Sections[SymbolNum - 1];
+          }
+        }
+}
+
+void MachOReader::readRebaseInfo(Object &O) const {
+  O.Rebases.Opcodes = MachOObj.getDyldInfoRebaseOpcodes();
+}
+
+void MachOReader::readBindInfo(Object &O) const {
+  O.Binds.Opcodes = MachOObj.getDyldInfoBindOpcodes();
+}
+
+void MachOReader::readWeakBindInfo(Object &O) const {
+  O.WeakBinds.Opcodes = MachOObj.getDyldInfoWeakBindOpcodes();
+}
+
+void MachOReader::readLazyBindInfo(Object &O) const {
+  O.LazyBinds.Opcodes = MachOObj.getDyldInfoLazyBindOpcodes();
+}
+
+void MachOReader::readExportInfo(Object &O) const {
+  O.Exports.Trie = MachOObj.getDyldInfoExportsTrie();
+}
+
+void MachOReader::readLinkData(Object &O, Optional<size_t> LCIndex,
+                               LinkData &LD) const {
+  if (!LCIndex)
+    return;
+  const MachO::linkedit_data_command &LC =
+      O.LoadCommands[*LCIndex].MachOLoadCommand.linkedit_data_command_data;
+  LD.Data =
+      arrayRefFromStringRef(MachOObj.getData().substr(LC.dataoff, LC.datasize));
+}
+
+void MachOReader::readDataInCodeData(Object &O) const {
+  return readLinkData(O, O.DataInCodeCommandIndex, O.DataInCode);
+}
+
+void MachOReader::readLinkerOptimizationHint(Object &O) const {
+  return readLinkData(O, O.LinkerOptimizationHintCommandIndex,
+                      O.LinkerOptimizationHint);
+}
+
+void MachOReader::readFunctionStartsData(Object &O) const {
+  return readLinkData(O, O.FunctionStartsCommandIndex, O.FunctionStarts);
+}
+
+void MachOReader::readExportsTrie(Object &O) const {
+  return readLinkData(O, O.ExportsTrieCommandIndex, O.ExportsTrie);
+}
+
+void MachOReader::readChainedFixups(Object &O) const {
+  return readLinkData(O, O.ChainedFixupsCommandIndex, O.ChainedFixups);
+}
+
+void MachOReader::readIndirectSymbolTable(Object &O) const {
+  MachO::dysymtab_command DySymTab = MachOObj.getDysymtabLoadCommand();
+  constexpr uint32_t AbsOrLocalMask =
+      MachO::INDIRECT_SYMBOL_LOCAL | MachO::INDIRECT_SYMBOL_ABS;
+  for (uint32_t i = 0; i < DySymTab.nindirectsyms; ++i) {
+    uint32_t Index = MachOObj.getIndirectSymbolTableEntry(DySymTab, i);
+    if ((Index & AbsOrLocalMask) != 0)
+      O.IndirectSymTable.Symbols.emplace_back(Index, None);
+    else
+      O.IndirectSymTable.Symbols.emplace_back(
+          Index, O.SymTable.getSymbolByIndex(Index));
+  }
+}
+
+void MachOReader::readSwiftVersion(Object &O) const {
+  struct ObjCImageInfo {
+    uint32_t Version;
+    uint32_t Flags;
+  } ImageInfo;
+
+  for (const LoadCommand &LC : O.LoadCommands)
+    for (const std::unique_ptr<Section> &Sec : LC.Sections)
+      if (Sec->Sectname == "__objc_imageinfo" &&
+          (Sec->Segname == "__DATA" || Sec->Segname == "__DATA_CONST" ||
+           Sec->Segname == "__DATA_DIRTY") &&
+          Sec->Content.size() >= sizeof(ObjCImageInfo)) {
+        memcpy(&ImageInfo, Sec->Content.data(), sizeof(ObjCImageInfo));
+        if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) {
+          sys::swapByteOrder(ImageInfo.Version);
+          sys::swapByteOrder(ImageInfo.Flags);
+        }
+        O.SwiftVersion = (ImageInfo.Flags >> 8) & 0xff;
+        return;
+      }
+}
+
+Expected<std::unique_ptr<Object>> MachOReader::create() const {
+  auto Obj = std::make_unique<Object>();
+  readHeader(*Obj);
+  if (Error E = readLoadCommands(*Obj))
+    return std::move(E);
+  readSymbolTable(*Obj);
+  setSymbolInRelocationInfo(*Obj);
+  readRebaseInfo(*Obj);
+  readBindInfo(*Obj);
+  readWeakBindInfo(*Obj);
+  readLazyBindInfo(*Obj);
+  readExportInfo(*Obj);
+  readDataInCodeData(*Obj);
+  readLinkerOptimizationHint(*Obj);
+  readFunctionStartsData(*Obj);
+  readExportsTrie(*Obj);
+  readChainedFixups(*Obj);
+  readIndirectSymbolTable(*Obj);
+  readSwiftVersion(*Obj);
+  return std::move(Obj);
+}
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h
new file mode 100644
index 000000000000..ef374aa9efae
--- /dev/null
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.h
@@ -0,0 +1,62 @@
+//===- MachOReader.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H
+#define LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H
+
+#include "MachOObject.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ObjCopy/MachO/MachOObjcopy.h"
+#include "llvm/Object/MachO.h"
+#include <memory>
+
+namespace llvm {
+namespace objcopy {
+namespace macho {
+
+// The hierarchy of readers is responsible for parsing different inputs:
+// raw binaries and regular MachO object files.
+class Reader {
+public:
+  virtual ~Reader() {}
+  virtual Expected<std::unique_ptr<Object>> create() const = 0;
+};
+
+class MachOReader : public Reader {
+  const object::MachOObjectFile &MachOObj;
+
+  void readHeader(Object &O) const;
+  Error readLoadCommands(Object &O) const;
+  void readSymbolTable(Object &O) const;
+  void setSymbolInRelocationInfo(Object &O) const;
+  void readRebaseInfo(Object &O) const;
+  void readBindInfo(Object &O) const;
+  void readWeakBindInfo(Object &O) const;
+  void readLazyBindInfo(Object &O) const;
+  void readExportInfo(Object &O) const;
+  void readLinkData(Object &O, Optional<size_t> LCIndex, LinkData &LD) const;
+  void readCodeSignature(Object &O) const;
+  void readDataInCodeData(Object &O) const;
+  void readLinkerOptimizationHint(Object &O) const;
+  void readFunctionStartsData(Object &O) const;
+  void readExportsTrie(Object &O) const;
+  void readChainedFixups(Object &O) const;
+  void readIndirectSymbolTable(Object &O) const;
+  void readSwiftVersion(Object &O) const;
+
+public:
+  explicit MachOReader(const object::MachOObjectFile &Obj) : MachOObj(Obj) {}
+
+  Expected<std::unique_ptr<Object>> create() const override;
+};
+
+} // end namespace macho
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
new file mode 100644
index 000000000000..bc633285e03c
--- /dev/null
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
@@ -0,0 +1,662 @@
+//===- MachOWriter.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MachOWriter.h"
+#include "MachOLayoutBuilder.h"
+#include "MachOObject.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SHA256.h"
+#include <memory>
+
+#if defined(__APPLE__)
+#include <sys/mman.h>
+#endif
+
+using namespace llvm;
+using namespace llvm::objcopy::macho;
+using namespace llvm::support::endian;
+
+size_t MachOWriter::headerSize() const {
+  return Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header);
+}
+
+size_t MachOWriter::loadCommandsSize() const { return O.Header.SizeOfCmds; }
+
+size_t MachOWriter::symTableSize() const {
+  return O.SymTable.Symbols.size() *
+         (Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist));
+}
+size_t MachOWriter::totalSize() const {
+  // Going from tail to head and looking for an appropriate "anchor" to
+  // calculate the total size, assuming that all the offsets are either valid
+  // ("true") or 0 (0 indicates that the corresponding part is missing).
+
+  SmallVector<uint64_t, 7> Ends;
+  if (O.SymTabCommandIndex) {
+    const MachO::symtab_command &SymTabCommand =
+        O.LoadCommands[*O.SymTabCommandIndex]
+            .MachOLoadCommand.symtab_command_data;
+    if (SymTabCommand.symoff)
+      Ends.push_back(SymTabCommand.symoff + symTableSize());
+    if (SymTabCommand.stroff)
+      Ends.push_back(SymTabCommand.stroff + SymTabCommand.strsize);
+  }
+  if (O.DyLdInfoCommandIndex) {
+    const MachO::dyld_info_command &DyLdInfoCommand =
+        O.LoadCommands[*O.DyLdInfoCommandIndex]
+            .MachOLoadCommand.dyld_info_command_data;
+    if (DyLdInfoCommand.rebase_off) {
+      assert((DyLdInfoCommand.rebase_size == O.Rebases.Opcodes.size()) &&
+             "Incorrect rebase opcodes size");
+      Ends.push_back(DyLdInfoCommand.rebase_off + DyLdInfoCommand.rebase_size);
+    }
+    if (DyLdInfoCommand.bind_off) {
+      assert((DyLdInfoCommand.bind_size == O.Binds.Opcodes.size()) &&
+             "Incorrect bind opcodes size");
+      Ends.push_back(DyLdInfoCommand.bind_off + DyLdInfoCommand.bind_size);
+    }
+    if (DyLdInfoCommand.weak_bind_off) {
+      assert((DyLdInfoCommand.weak_bind_size == O.WeakBinds.Opcodes.size()) &&
+             "Incorrect weak bind opcodes size");
+      Ends.push_back(DyLdInfoCommand.weak_bind_off +
+                     DyLdInfoCommand.weak_bind_size);
+    }
+    if (DyLdInfoCommand.lazy_bind_off) {
+      assert((DyLdInfoCommand.lazy_bind_size == O.LazyBinds.Opcodes.size()) &&
+             "Incorrect lazy bind opcodes size");
+      Ends.push_back(DyLdInfoCommand.lazy_bind_off +
+                     DyLdInfoCommand.lazy_bind_size);
+    }
+    if (DyLdInfoCommand.export_off) {
+      assert((DyLdInfoCommand.export_size == O.Exports.Trie.size()) &&
+             "Incorrect trie size");
+      Ends.push_back(DyLdInfoCommand.export_off + DyLdInfoCommand.export_size);
+    }
+  }
+
+  if (O.DySymTabCommandIndex) {
+    const MachO::dysymtab_command &DySymTabCommand =
+        O.LoadCommands[*O.DySymTabCommandIndex]
+            .MachOLoadCommand.dysymtab_command_data;
+
+    if (DySymTabCommand.indirectsymoff)
+      Ends.push_back(DySymTabCommand.indirectsymoff +
+                     sizeof(uint32_t) * O.IndirectSymTable.Symbols.size());
+  }
+
+  for (Optional<size_t> LinkEditDataCommandIndex :
+       {O.CodeSignatureCommandIndex, O.DataInCodeCommandIndex,
+        O.LinkerOptimizationHintCommandIndex, O.FunctionStartsCommandIndex,
+        O.ChainedFixupsCommandIndex, O.ExportsTrieCommandIndex})
+    if (LinkEditDataCommandIndex) {
+      const MachO::linkedit_data_command &LinkEditDataCommand =
+          O.LoadCommands[*LinkEditDataCommandIndex]
+              .MachOLoadCommand.linkedit_data_command_data;
+      if (LinkEditDataCommand.dataoff)
+        Ends.push_back(LinkEditDataCommand.dataoff +
+                       LinkEditDataCommand.datasize);
+    }
+
+  // Otherwise, use the last section / relocation.
+  for (const LoadCommand &LC : O.LoadCommands)
+    for (const std::unique_ptr<Section> &S : LC.Sections) {
+      if (!S->hasValidOffset()) {
+        assert((S->Offset == 0) && "Skipped section's offset must be zero");
+        assert((S->isVirtualSection() || S->Size == 0) &&
+               "Non-zero-fill sections with zero offset must have zero size");
+        continue;
+      }
+      assert((S->Offset != 0) &&
+             "Non-zero-fill section's offset cannot be zero");
+      Ends.push_back(S->Offset + S->Size);
+      if (S->RelOff)
+        Ends.push_back(S->RelOff +
+                       S->NReloc * sizeof(MachO::any_relocation_info));
+    }
+
+  if (!Ends.empty())
+    return *std::max_element(Ends.begin(), Ends.end());
+
+  // Otherwise, we have only the Mach header and load commands.
+  return headerSize() + loadCommandsSize();
+}
+
+void MachOWriter::writeHeader() {
+  MachO::mach_header_64 Header;
+
+  Header.magic = O.Header.Magic;
+  Header.cputype = O.Header.CPUType;
+  Header.cpusubtype = O.Header.CPUSubType;
+  Header.filetype = O.Header.FileType;
+  Header.ncmds = O.Header.NCmds;
+  Header.sizeofcmds = O.Header.SizeOfCmds;
+  Header.flags = O.Header.Flags;
+  Header.reserved = O.Header.Reserved;
+
+  if (IsLittleEndian != sys::IsLittleEndianHost)
+    MachO::swapStruct(Header);
+
+  auto HeaderSize =
+      Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header);
+  memcpy(Buf->getBufferStart(), &Header, HeaderSize);
+}
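writeHeader above always fills in a mach_header_64 but copies only headerSize() bytes of it, which is sound because the 32-bit header is a strict prefix of the 64-bit one (only the trailing reserved field differs). A standalone restatement of that assumption (illustrative, not part of the patch):

    #include "llvm/BinaryFormat/MachO.h"
    #include <cstddef>

    static_assert(sizeof(llvm::MachO::mach_header) ==
                      offsetof(llvm::MachO::mach_header_64, reserved),
                  "mach_header must be a prefix of mach_header_64");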
+void MachOWriter::writeLoadCommands() {
+  uint8_t *Begin =
+      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + headerSize();
+  for (const LoadCommand &LC : O.LoadCommands) {
+    // Construct a load command.
+    MachO::macho_load_command MLC = LC.MachOLoadCommand;
+    switch (MLC.load_command_data.cmd) {
+    case MachO::LC_SEGMENT:
+      if (IsLittleEndian != sys::IsLittleEndianHost)
+        MachO::swapStruct(MLC.segment_command_data);
+      memcpy(Begin, &MLC.segment_command_data, sizeof(MachO::segment_command));
+      Begin += sizeof(MachO::segment_command);
+
+      for (const std::unique_ptr<Section> &Sec : LC.Sections)
+        writeSectionInLoadCommand<MachO::section>(*Sec, Begin);
+      continue;
+    case MachO::LC_SEGMENT_64:
+      if (IsLittleEndian != sys::IsLittleEndianHost)
+        MachO::swapStruct(MLC.segment_command_64_data);
+      memcpy(Begin, &MLC.segment_command_64_data,
+             sizeof(MachO::segment_command_64));
+      Begin += sizeof(MachO::segment_command_64);
+
+      for (const std::unique_ptr<Section> &Sec : LC.Sections)
+        writeSectionInLoadCommand<MachO::section_64>(*Sec, Begin);
+      continue;
+    }
+
+#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
+  case MachO::LCName:                                                          \
+    assert(sizeof(MachO::LCStruct) + LC.Payload.size() ==                      \
+           MLC.load_command_data.cmdsize);                                     \
+    if (IsLittleEndian != sys::IsLittleEndianHost)                             \
+      MachO::swapStruct(MLC.LCStruct##_data);                                  \
+    memcpy(Begin, &MLC.LCStruct##_data, sizeof(MachO::LCStruct));              \
+    Begin += sizeof(MachO::LCStruct);                                          \
+    if (!LC.Payload.empty())                                                   \
+      memcpy(Begin, LC.Payload.data(), LC.Payload.size());                     \
+    Begin += LC.Payload.size();                                                \
+    break;
+
+    // Copy the load command as it is.
+    switch (MLC.load_command_data.cmd) {
+    default:
+      assert(sizeof(MachO::load_command) + LC.Payload.size() ==
+             MLC.load_command_data.cmdsize);
+      if (IsLittleEndian != sys::IsLittleEndianHost)
+        MachO::swapStruct(MLC.load_command_data);
+      memcpy(Begin, &MLC.load_command_data, sizeof(MachO::load_command));
+      Begin += sizeof(MachO::load_command);
+      if (!LC.Payload.empty())
+        memcpy(Begin, LC.Payload.data(), LC.Payload.size());
+      Begin += LC.Payload.size();
+      break;
+#include "llvm/BinaryFormat/MachO.def"
+    }
+  }
+}
+
+template <typename StructType>
+void MachOWriter::writeSectionInLoadCommand(const Section &Sec, uint8_t *&Out) {
+  StructType Temp;
+  assert(Sec.Segname.size() <= sizeof(Temp.segname) && "too long segment name");
+  assert(Sec.Sectname.size() <= sizeof(Temp.sectname) &&
+         "too long section name");
+  memset(&Temp, 0, sizeof(StructType));
+  memcpy(Temp.segname, Sec.Segname.data(), Sec.Segname.size());
+  memcpy(Temp.sectname, Sec.Sectname.data(), Sec.Sectname.size());
+  Temp.addr = Sec.Addr;
+  Temp.size = Sec.Size;
+  Temp.offset = Sec.Offset;
+  Temp.align = Sec.Align;
+  Temp.reloff = Sec.RelOff;
+  Temp.nreloc = Sec.NReloc;
+  Temp.flags = Sec.Flags;
+  Temp.reserved1 = Sec.Reserved1;
+  Temp.reserved2 = Sec.Reserved2;
+
+  if (IsLittleEndian != sys::IsLittleEndianHost)
+    MachO::swapStruct(Temp);
+  memcpy(Out, &Temp, sizeof(StructType));
+  Out += sizeof(StructType);
+}
+
+void MachOWriter::writeSections() {
+  for (const LoadCommand &LC : O.LoadCommands)
+    for (const std::unique_ptr<Section> &Sec : LC.Sections) {
+      if (!Sec->hasValidOffset()) {
+        assert((Sec->Offset == 0) && "Skipped section's offset must be zero");
+        assert((Sec->isVirtualSection() || Sec->Size == 0) &&
+               "Non-zero-fill sections with zero offset must have zero size");
+        continue;
+      }
+
+      assert(Sec->Offset && "Section offset can not be zero");
+      assert((Sec->Size == Sec->Content.size()) && "Incorrect section size");
+      memcpy(Buf->getBufferStart() + Sec->Offset, Sec->Content.data(),
+             Sec->Content.size());
+      for (size_t Index = 0; Index < Sec->Relocations.size(); ++Index) {
+        RelocationInfo RelocInfo = Sec->Relocations[Index];
+        if (!RelocInfo.Scattered && !RelocInfo.IsAddend) {
+          const uint32_t SymbolNum = RelocInfo.Extern
+                                         ? (*RelocInfo.Symbol)->Index
+                                         : (*RelocInfo.Sec)->Index;
+          RelocInfo.setPlainRelocationSymbolNum(SymbolNum, IsLittleEndian);
+        }
+        if (IsLittleEndian != sys::IsLittleEndianHost)
+          MachO::swapStruct(
+              reinterpret_cast<MachO::any_relocation_info &>(RelocInfo.Info));
+        memcpy(Buf->getBufferStart() + Sec->RelOff +
+                   Index * sizeof(MachO::any_relocation_info),
+               &RelocInfo.Info, sizeof(RelocInfo.Info));
+      }
+    }
+}
+
+template <typename NListType>
+void writeNListEntry(const SymbolEntry &SE, bool IsLittleEndian, char *&Out,
+                     uint32_t Nstrx) {
+  NListType ListEntry;
+  ListEntry.n_strx = Nstrx;
+  ListEntry.n_type = SE.n_type;
+  ListEntry.n_sect = SE.n_sect;
+  ListEntry.n_desc = SE.n_desc;
+  ListEntry.n_value = SE.n_value;
+
+  if (IsLittleEndian != sys::IsLittleEndianHost)
+    MachO::swapStruct(ListEntry);
+  memcpy(Out, reinterpret_cast<const char *>(&ListEntry), sizeof(NListType));
+  Out += sizeof(NListType);
+}
+
+void MachOWriter::writeStringTable() {
+  if (!O.SymTabCommandIndex)
+    return;
+  const MachO::symtab_command &SymTabCommand =
+      O.LoadCommands[*O.SymTabCommandIndex]
+          .MachOLoadCommand.symtab_command_data;
+
+  uint8_t *StrTable = (uint8_t *)Buf->getBufferStart() + SymTabCommand.stroff;
+  LayoutBuilder.getStringTableBuilder().write(StrTable);
+}
+
+void MachOWriter::writeSymbolTable() {
+  if (!O.SymTabCommandIndex)
+    return;
+  const MachO::symtab_command &SymTabCommand =
+      O.LoadCommands[*O.SymTabCommandIndex]
+          .MachOLoadCommand.symtab_command_data;
+
+  char *SymTable = (char *)Buf->getBufferStart() + SymTabCommand.symoff;
+  for (auto Iter = O.SymTable.Symbols.begin(), End = O.SymTable.Symbols.end();
+       Iter != End; Iter++) {
+    SymbolEntry *Sym = Iter->get();
+    uint32_t Nstrx = LayoutBuilder.getStringTableBuilder().getOffset(Sym->Name);
+
+    if (Is64Bit)
+      writeNListEntry<MachO::nlist_64>(*Sym, IsLittleEndian, SymTable, Nstrx);
+    else
+      writeNListEntry<MachO::nlist>(*Sym, IsLittleEndian, SymTable, Nstrx);
+  }
+}
+
+void MachOWriter::writeRebaseInfo() {
+  if (!O.DyLdInfoCommandIndex)
+    return;
+  const MachO::dyld_info_command &DyLdInfoCommand =
+      O.LoadCommands[*O.DyLdInfoCommandIndex]
+          .MachOLoadCommand.dyld_info_command_data;
+  char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.rebase_off;
+  assert((DyLdInfoCommand.rebase_size == O.Rebases.Opcodes.size()) &&
+         "Incorrect rebase opcodes size");
+  memcpy(Out, O.Rebases.Opcodes.data(), O.Rebases.Opcodes.size());
+}
+
+void MachOWriter::writeBindInfo() {
+  if (!O.DyLdInfoCommandIndex)
+    return;
+  const MachO::dyld_info_command &DyLdInfoCommand =
+      O.LoadCommands[*O.DyLdInfoCommandIndex]
+          .MachOLoadCommand.dyld_info_command_data;
+  char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.bind_off;
+  assert((DyLdInfoCommand.bind_size == O.Binds.Opcodes.size()) &&
+         "Incorrect bind opcodes size");
+  memcpy(Out, O.Binds.Opcodes.data(), O.Binds.Opcodes.size());
+}
+void MachOWriter::writeWeakBindInfo() {
+  if (!O.DyLdInfoCommandIndex)
+    return;
+  const MachO::dyld_info_command &DyLdInfoCommand =
+      O.LoadCommands[*O.DyLdInfoCommandIndex]
+          .MachOLoadCommand.dyld_info_command_data;
+  char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.weak_bind_off;
+  assert((DyLdInfoCommand.weak_bind_size == O.WeakBinds.Opcodes.size()) &&
+         "Incorrect weak bind opcodes size");
+  memcpy(Out, O.WeakBinds.Opcodes.data(), O.WeakBinds.Opcodes.size());
+}
+
+void MachOWriter::writeLazyBindInfo() {
+  if (!O.DyLdInfoCommandIndex)
+    return;
+  const MachO::dyld_info_command &DyLdInfoCommand =
+      O.LoadCommands[*O.DyLdInfoCommandIndex]
+          .MachOLoadCommand.dyld_info_command_data;
+  char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.lazy_bind_off;
+  assert((DyLdInfoCommand.lazy_bind_size == O.LazyBinds.Opcodes.size()) &&
+         "Incorrect lazy bind opcodes size");
+  memcpy(Out, O.LazyBinds.Opcodes.data(), O.LazyBinds.Opcodes.size());
+}
+
+void MachOWriter::writeExportInfo() {
+  if (!O.DyLdInfoCommandIndex)
+    return;
+  const MachO::dyld_info_command &DyLdInfoCommand =
+      O.LoadCommands[*O.DyLdInfoCommandIndex]
+          .MachOLoadCommand.dyld_info_command_data;
+  char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.export_off;
+  assert((DyLdInfoCommand.export_size == O.Exports.Trie.size()) &&
+         "Incorrect export trie size");
+  memcpy(Out, O.Exports.Trie.data(), O.Exports.Trie.size());
+}
+
+void MachOWriter::writeIndirectSymbolTable() {
+  if (!O.DySymTabCommandIndex)
+    return;
+
+  const MachO::dysymtab_command &DySymTabCommand =
+      O.LoadCommands[*O.DySymTabCommandIndex]
+          .MachOLoadCommand.dysymtab_command_data;
+
+  uint32_t *Out =
+      (uint32_t *)(Buf->getBufferStart() + DySymTabCommand.indirectsymoff);
+  for (const IndirectSymbolEntry &Sym : O.IndirectSymTable.Symbols) {
+    uint32_t Entry = (Sym.Symbol) ? (*Sym.Symbol)->Index : Sym.OriginalIndex;
+    if (IsLittleEndian != sys::IsLittleEndianHost)
+      sys::swapByteOrder(Entry);
+    *Out++ = Entry;
+  }
+}
+
+void MachOWriter::writeLinkData(Optional<size_t> LCIndex, const LinkData &LD) {
+  if (!LCIndex)
+    return;
+  const MachO::linkedit_data_command &LinkEditDataCommand =
+      O.LoadCommands[*LCIndex].MachOLoadCommand.linkedit_data_command_data;
+  char *Out = (char *)Buf->getBufferStart() + LinkEditDataCommand.dataoff;
+  assert((LinkEditDataCommand.datasize == LD.Data.size()) &&
+         "Incorrect data size");
+  memcpy(Out, LD.Data.data(), LD.Data.size());
+}
+
+static uint64_t
+getSegmentFileOffset(const LoadCommand &TextSegmentLoadCommand) {
+  const MachO::macho_load_command &MLC =
+      TextSegmentLoadCommand.MachOLoadCommand;
+  switch (MLC.load_command_data.cmd) {
+  case MachO::LC_SEGMENT:
+    return MLC.segment_command_data.fileoff;
+  case MachO::LC_SEGMENT_64:
+    return MLC.segment_command_64_data.fileoff;
+  default:
+    return 0;
+  }
+}
+
+static uint64_t getSegmentFileSize(const LoadCommand &TextSegmentLoadCommand) {
+  const MachO::macho_load_command &MLC =
+      TextSegmentLoadCommand.MachOLoadCommand;
+  switch (MLC.load_command_data.cmd) {
+  case MachO::LC_SEGMENT:
+    return MLC.segment_command_data.filesize;
+  case MachO::LC_SEGMENT_64:
+    return MLC.segment_command_64_data.filesize;
+  default:
+    return 0;
+  }
+}
+
+void MachOWriter::writeCodeSignatureData() {
+  // NOTE: This CodeSignature section behaviour must be kept in sync with that
+  // performed in LLD's CodeSignatureSection::write /
+  // CodeSignatureSection::writeHashes. Furthermore, this call must occur only
+  // after the rest of the binary has already been written to the buffer. This
+  // is because the buffer is read from to perform the necessary hashing.
+
+  // The CodeSignature section is the last section in the MachO binary and
+  // contains a hash of all content in the binary before it. Since llvm-objcopy
+  // has likely modified the target binary, the hash must be regenerated
+  // entirely. To generate this hash, we must read from the start of the binary
+  // (HashReadStart) to just before the start of the CodeSignature section
+  // (HashReadEnd).
+
+  const CodeSignatureInfo &CodeSignature = LayoutBuilder.getCodeSignature();
+
+  uint8_t *BufferStart = reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+  uint8_t *HashReadStart = BufferStart;
+  uint8_t *HashReadEnd = BufferStart + CodeSignature.StartOffset;
+
+  // The CodeSignature section begins with a header, after which the hashes
+  // of each page of the binary are written.
+  uint8_t *HashWriteStart = HashReadEnd + CodeSignature.AllHeadersSize;
+
+  uint32_t TextSegmentFileOff = 0;
+  uint32_t TextSegmentFileSize = 0;
+  if (O.TextSegmentCommandIndex) {
+    const LoadCommand &TextSegmentLoadCommand =
+        O.LoadCommands[*O.TextSegmentCommandIndex];
+    assert(TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd ==
+               MachO::LC_SEGMENT ||
+           TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd ==
+               MachO::LC_SEGMENT_64);
+    assert(StringRef(TextSegmentLoadCommand.MachOLoadCommand
+                         .segment_command_data.segname) == "__TEXT");
+    TextSegmentFileOff = getSegmentFileOffset(TextSegmentLoadCommand);
+    TextSegmentFileSize = getSegmentFileSize(TextSegmentLoadCommand);
+  }
+
+  const uint32_t FileNamePad = CodeSignature.AllHeadersSize -
+                               CodeSignature.FixedHeadersSize -
+                               CodeSignature.OutputFileName.size();
+
+  // Write code section header.
+  auto *SuperBlob = reinterpret_cast<MachO::CS_SuperBlob *>(HashReadEnd);
+  write32be(&SuperBlob->magic, MachO::CSMAGIC_EMBEDDED_SIGNATURE);
+  write32be(&SuperBlob->length, CodeSignature.Size);
+  write32be(&SuperBlob->count, 1);
+  auto *BlobIndex = reinterpret_cast<MachO::CS_BlobIndex *>(&SuperBlob[1]);
+  write32be(&BlobIndex->type, MachO::CSSLOT_CODEDIRECTORY);
+  write32be(&BlobIndex->offset, CodeSignature.BlobHeadersSize);
+  auto *CodeDirectory = reinterpret_cast<MachO::CS_CodeDirectory *>(
+      HashReadEnd + CodeSignature.BlobHeadersSize);
+  write32be(&CodeDirectory->magic, MachO::CSMAGIC_CODEDIRECTORY);
+  write32be(&CodeDirectory->length,
+            CodeSignature.Size - CodeSignature.BlobHeadersSize);
+  write32be(&CodeDirectory->version, MachO::CS_SUPPORTSEXECSEG);
+  write32be(&CodeDirectory->flags, MachO::CS_ADHOC | MachO::CS_LINKER_SIGNED);
+  write32be(&CodeDirectory->hashOffset,
+            sizeof(MachO::CS_CodeDirectory) +
+                CodeSignature.OutputFileName.size() + FileNamePad);
+  write32be(&CodeDirectory->identOffset, sizeof(MachO::CS_CodeDirectory));
+  CodeDirectory->nSpecialSlots = 0;
+  write32be(&CodeDirectory->nCodeSlots, CodeSignature.BlockCount);
+  write32be(&CodeDirectory->codeLimit, CodeSignature.StartOffset);
+  CodeDirectory->hashSize = static_cast<uint8_t>(CodeSignature.HashSize);
+  CodeDirectory->hashType = MachO::kSecCodeSignatureHashSHA256;
+  CodeDirectory->platform = 0;
+  CodeDirectory->pageSize = CodeSignature.BlockSizeShift;
+  CodeDirectory->spare2 = 0;
+  CodeDirectory->scatterOffset = 0;
+  CodeDirectory->teamOffset = 0;
+  CodeDirectory->spare3 = 0;
+  CodeDirectory->codeLimit64 = 0;
+  write64be(&CodeDirectory->execSegBase, TextSegmentFileOff);
+  write64be(&CodeDirectory->execSegLimit, TextSegmentFileSize);
+  write64be(&CodeDirectory->execSegFlags, O.Header.FileType == MachO::MH_EXECUTE
+                                              ? MachO::CS_EXECSEG_MAIN_BINARY
+                                              : 0);
+
+  auto *Id = reinterpret_cast<char *>(&CodeDirectory[1]);
+  memcpy(Id, CodeSignature.OutputFileName.begin(),
+         CodeSignature.OutputFileName.size());
+  memset(Id + CodeSignature.OutputFileName.size(), 0, FileNamePad);
+
+  // Write the hashes.
+  uint8_t *CurrHashReadPosition = HashReadStart;
+  uint8_t *CurrHashWritePosition = HashWriteStart;
+  while (CurrHashReadPosition < HashReadEnd) {
+    StringRef Block(reinterpret_cast<char *>(CurrHashReadPosition),
+                    std::min(static_cast<size_t>(HashReadEnd -
+                                                 CurrHashReadPosition),
+                             static_cast<size_t>(CodeSignature.BlockSize)));
+    SHA256 Hasher;
+    Hasher.update(Block);
+    std::array<uint8_t, 32> Hash = Hasher.final();
+    assert(Hash.size() == CodeSignature.HashSize);
+    memcpy(CurrHashWritePosition, Hash.data(), CodeSignature.HashSize);
+    CurrHashReadPosition += CodeSignature.BlockSize;
+    CurrHashWritePosition += CodeSignature.HashSize;
+  }
+#if defined(__APPLE__)
+  // This is a macOS-specific work-around and makes no sense for any
+  // other host OS. See https://openradar.appspot.com/FB8914231
+  //
+  // The macOS kernel maintains a signature-verification cache to
+  // quickly validate applications at time of execve(2). The trouble
+  // is that the kernel creates the cache entry at the time of the
+  // mmap(2) call, before we have a chance to write either the code to
+  // sign or the signature header+hashes. The fix is to invalidate
+  // all cached data associated with the output file, thus discarding
+  // the bogus prematurely-cached signature.
+  msync(BufferStart, CodeSignature.StartOffset + CodeSignature.Size,
+        MS_INVALIDATE);
+#endif
+}
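The hashing loop above emits one 32-byte SHA-256 digest per BlockSize-sized chunk of the binary, with a short final chunk. The page arithmetic it relies on, sketched with hypothetical numbers (a 4 KiB page, i.e. a BlockSizeShift of 12; not part of the patch):

    #include <cstdint>

    constexpr uint64_t CodeLimit = 0x21d0;            // bytes before the signature
    constexpr uint64_t BlockSize = uint64_t(1) << 12; // 4096
    constexpr uint64_t BlockCount =
        (CodeLimit + BlockSize - 1) / BlockSize;      // round up
    static_assert(BlockCount == 3, "three digests, 96 bytes of hashes");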
+void MachOWriter::writeDataInCodeData() {
+  return writeLinkData(O.DataInCodeCommandIndex, O.DataInCode);
+}
+
+void MachOWriter::writeLinkerOptimizationHint() {
+  return writeLinkData(O.LinkerOptimizationHintCommandIndex,
+                       O.LinkerOptimizationHint);
+}
+
+void MachOWriter::writeFunctionStartsData() {
+  return writeLinkData(O.FunctionStartsCommandIndex, O.FunctionStarts);
+}
+
+void MachOWriter::writeChainedFixupsData() {
+  return writeLinkData(O.ChainedFixupsCommandIndex, O.ChainedFixups);
+}
+
+void MachOWriter::writeExportsTrieData() {
+  return writeLinkData(O.ExportsTrieCommandIndex, O.ExportsTrie);
+}
+
+void MachOWriter::writeTail() {
+  typedef void (MachOWriter::*WriteHandlerType)();
+  typedef std::pair<uint64_t, WriteHandlerType> WriteOperation;
+  SmallVector<WriteOperation, 7> Queue;
+
+  if (O.SymTabCommandIndex) {
+    const MachO::symtab_command &SymTabCommand =
+        O.LoadCommands[*O.SymTabCommandIndex]
+            .MachOLoadCommand.symtab_command_data;
+    if (SymTabCommand.symoff)
+      Queue.push_back({SymTabCommand.symoff, &MachOWriter::writeSymbolTable});
+    if (SymTabCommand.stroff)
+      Queue.push_back({SymTabCommand.stroff, &MachOWriter::writeStringTable});
+  }
+
+  if (O.DyLdInfoCommandIndex) {
+    const MachO::dyld_info_command &DyLdInfoCommand =
+        O.LoadCommands[*O.DyLdInfoCommandIndex]
+            .MachOLoadCommand.dyld_info_command_data;
+    if (DyLdInfoCommand.rebase_off)
+      Queue.push_back(
+          {DyLdInfoCommand.rebase_off, &MachOWriter::writeRebaseInfo});
+    if (DyLdInfoCommand.bind_off)
+      Queue.push_back({DyLdInfoCommand.bind_off, &MachOWriter::writeBindInfo});
+    if (DyLdInfoCommand.weak_bind_off)
+      Queue.push_back(
+          {DyLdInfoCommand.weak_bind_off, &MachOWriter::writeWeakBindInfo});
+    if (DyLdInfoCommand.lazy_bind_off)
+      Queue.push_back(
+          {DyLdInfoCommand.lazy_bind_off, &MachOWriter::writeLazyBindInfo});
+    if (DyLdInfoCommand.export_off)
+      Queue.push_back(
+          {DyLdInfoCommand.export_off, &MachOWriter::writeExportInfo});
+  }
+
+  if (O.DySymTabCommandIndex) {
+    const MachO::dysymtab_command &DySymTabCommand =
+        O.LoadCommands[*O.DySymTabCommandIndex]
+            .MachOLoadCommand.dysymtab_command_data;
+
+    if (DySymTabCommand.indirectsymoff)
+      Queue.emplace_back(DySymTabCommand.indirectsymoff,
+                         &MachOWriter::writeIndirectSymbolTable);
+  }
+
+  std::initializer_list<std::pair<Optional<size_t>, WriteHandlerType>>
+      LinkEditDataCommandWriters = {
+          {O.CodeSignatureCommandIndex, &MachOWriter::writeCodeSignatureData},
+          {O.DataInCodeCommandIndex, &MachOWriter::writeDataInCodeData},
+          {O.LinkerOptimizationHintCommandIndex,
+           &MachOWriter::writeLinkerOptimizationHint},
+          {O.FunctionStartsCommandIndex, &MachOWriter::writeFunctionStartsData},
+          {O.ChainedFixupsCommandIndex, &MachOWriter::writeChainedFixupsData},
+          {O.ExportsTrieCommandIndex, &MachOWriter::writeExportsTrieData}};
+  for (const auto &W : LinkEditDataCommandWriters) {
+    Optional<size_t> LinkEditDataCommandIndex;
+    WriteHandlerType WriteHandler;
+    std::tie(LinkEditDataCommandIndex, WriteHandler) = W;
+    if (LinkEditDataCommandIndex) {
+      const MachO::linkedit_data_command &LinkEditDataCommand =
+          O.LoadCommands[*LinkEditDataCommandIndex]
+              .MachOLoadCommand.linkedit_data_command_data;
+      if (LinkEditDataCommand.dataoff)
+        Queue.emplace_back(LinkEditDataCommand.dataoff, WriteHandler);
+    }
+  }
+
+  llvm::sort(Queue, llvm::less_first());
+
+  for (auto WriteOp : Queue)
+    (this->*WriteOp.second)();
+}
+
+Error MachOWriter::finalize() { return LayoutBuilder.layout(); }
+
+Error MachOWriter::write() {
+  size_t TotalSize = totalSize();
+  Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
+  if (!Buf)
+    return createStringError(errc::not_enough_memory,
+                             "failed to allocate memory buffer of " +
+                                 Twine::utohexstr(TotalSize) + " bytes");
+  writeHeader();
+  writeLoadCommands();
+  writeSections();
+  writeTail();
+
+  // TODO: Implement direct writing to the output stream (without intermediate
+  // memory buffer Buf).
+  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
+  return Error::success();
+}
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.h b/llvm/lib/ObjCopy/MachO/MachOWriter.h
new file mode 100644
index 000000000000..a54c10294246
--- /dev/null
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.h
@@ -0,0 +1,76 @@
+//===- MachOWriter.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H
+#define LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H
+
+#include "MachOLayoutBuilder.h"
+#include "MachOObject.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ObjCopy/MachO/MachOObjcopy.h"
+#include "llvm/Object/MachO.h"
+
+namespace llvm {
+class Error;
+
+namespace objcopy {
+namespace macho {
+
+class MachOWriter {
+  Object &O;
+  bool Is64Bit;
+  bool IsLittleEndian;
+  uint64_t PageSize;
+  std::unique_ptr<WritableMemoryBuffer> Buf;
+  raw_ostream &Out;
+  MachOLayoutBuilder LayoutBuilder;
+
+  size_t headerSize() const;
+  size_t loadCommandsSize() const;
+  size_t symTableSize() const;
+  size_t strTableSize() const;
+
+  void writeHeader();
+  void writeLoadCommands();
+  template <typename StructType>
+  void writeSectionInLoadCommand(const Section &Sec, uint8_t *&Out);
+  void writeSections();
+  void writeSymbolTable();
+  void writeStringTable();
+  void writeRebaseInfo();
+  void writeBindInfo();
+  void writeWeakBindInfo();
+  void writeLazyBindInfo();
+  void writeExportInfo();
+  void writeIndirectSymbolTable();
+  void writeLinkData(Optional<size_t> LCIndex, const LinkData &LD);
+  void writeCodeSignatureData();
+  void writeDataInCodeData();
+  void writeLinkerOptimizationHint();
+  void writeFunctionStartsData();
+  void writeChainedFixupsData();
+  void writeExportsTrieData();
+  void writeTail();
+
+public:
+  MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian,
+              StringRef OutputFileName, uint64_t PageSize, raw_ostream &Out)
+      : O(O), Is64Bit(Is64Bit), IsLittleEndian(IsLittleEndian),
+        PageSize(PageSize), Out(Out),
+        LayoutBuilder(O, Is64Bit, OutputFileName, PageSize) {}
+
+  size_t totalSize() const;
+  Error finalize();
+  Error write();
+};
+
+} // end namespace macho
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H
diff --git a/llvm/lib/ObjCopy/ObjCopy.cpp b/llvm/lib/ObjCopy/ObjCopy.cpp
new file mode 100644
index 000000000000..16968d202265
--- /dev/null
+++ b/llvm/lib/ObjCopy/ObjCopy.cpp
@@ -0,0 +1,90 @@
+//===- Objcopy.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/ObjCopy.h"
+#include "llvm/ObjCopy/COFF/COFFConfig.h"
+#include "llvm/ObjCopy/COFF/COFFObjcopy.h"
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/ObjCopy/ELF/ELFConfig.h"
+#include "llvm/ObjCopy/ELF/ELFObjcopy.h"
+#include "llvm/ObjCopy/MachO/MachOConfig.h"
+#include "llvm/ObjCopy/MachO/MachOObjcopy.h"
+#include "llvm/ObjCopy/MultiFormatConfig.h"
+#include "llvm/ObjCopy/wasm/WasmConfig.h"
+#include "llvm/ObjCopy/wasm/WasmObjcopy.h"
+#include "llvm/ObjCopy/XCOFF/XCOFFConfig.h"
+#include "llvm/ObjCopy/XCOFF/XCOFFObjcopy.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/Wasm.h"
+#include "llvm/Object/XCOFFObjectFile.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
+
+namespace llvm {
+namespace objcopy {
+
+using namespace llvm::object;
+
+/// The function executeObjcopyOnBinary does the dispatch based on the format
+/// of the input binary (ELF, MachO, COFF, wasm, or XCOFF).
+Error executeObjcopyOnBinary(const MultiFormatConfig &Config,
+                             object::Binary &In, raw_ostream &Out) {
+  if (auto *ELFBinary = dyn_cast<ELFObjectFileBase>(&In)) {
+    Expected<const ELFConfig &> ELFConfig = Config.getELFConfig();
+    if (!ELFConfig)
+      return ELFConfig.takeError();
+
+    return elf::executeObjcopyOnBinary(Config.getCommonConfig(), *ELFConfig,
+                                       *ELFBinary, Out);
+  }
+  if (auto *COFFBinary = dyn_cast<COFFObjectFile>(&In)) {
+    Expected<const COFFConfig &> COFFConfig = Config.getCOFFConfig();
+    if (!COFFConfig)
+      return COFFConfig.takeError();
+
+    return coff::executeObjcopyOnBinary(Config.getCommonConfig(), *COFFConfig,
+                                        *COFFBinary, Out);
+  }
+  if (auto *MachOBinary = dyn_cast<MachOObjectFile>(&In)) {
+    Expected<const MachOConfig &> MachOConfig = Config.getMachOConfig();
+    if (!MachOConfig)
+      return MachOConfig.takeError();
+
+    return macho::executeObjcopyOnBinary(Config.getCommonConfig(), *MachOConfig,
+                                         *MachOBinary, Out);
+  }
+  if (auto *MachOUniversalBinary =
+          dyn_cast<MachOUniversalBinary>(&In)) {
+    return macho::executeObjcopyOnMachOUniversalBinary(
+        Config, *MachOUniversalBinary, Out);
+  }
+  if (auto *WasmBinary = dyn_cast<WasmObjectFile>(&In)) {
+    Expected<const WasmConfig &> WasmConfig = Config.getWasmConfig();
+    if (!WasmConfig)
+      return WasmConfig.takeError();
+
+    return objcopy::wasm::executeObjcopyOnBinary(Config.getCommonConfig(),
+                                                 *WasmConfig, *WasmBinary, Out);
+  }
+  if (auto *XCOFFBinary = dyn_cast<XCOFFObjectFile>(&In)) {
+    Expected<const XCOFFConfig &> XCOFFConfig = Config.getXCOFFConfig();
+    if (!XCOFFConfig)
+      return XCOFFConfig.takeError();
+
+    return xcoff::executeObjcopyOnBinary(Config.getCommonConfig(), *XCOFFConfig,
+                                         *XCOFFBinary, Out);
+  }
+  return createStringError(object_error::invalid_file_type,
+                           "unsupported object file format");
+}
+
+} // end namespace objcopy
+} // end namespace llvm
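A hedged sketch of how a client typically drives this dispatcher: open the input with object::createBinary, then hand the binary and an output stream to executeObjcopyOnBinary. The MultiFormatConfig implementation is assumed to exist on the caller's side:

    #include "llvm/ObjCopy/ObjCopy.h"
    #include "llvm/Object/Binary.h"
    #include "llvm/Support/raw_ostream.h"

    static llvm::Error runObjcopy(const llvm::objcopy::MultiFormatConfig &Config,
                                  llvm::StringRef InputPath,
                                  llvm::raw_ostream &OS) {
      llvm::Expected<llvm::object::OwningBinary<llvm::object::Binary>> BinOrErr =
          llvm::object::createBinary(InputPath);
      if (!BinOrErr)
        return BinOrErr.takeError();
      return llvm::objcopy::executeObjcopyOnBinary(Config,
                                                   *BinOrErr->getBinary(), OS);
    }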
diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFObjcopy.cpp b/llvm/lib/ObjCopy/XCOFF/XCOFFObjcopy.cpp
new file mode 100644
index 000000000000..f6e29bd315cb
--- /dev/null
+++ b/llvm/lib/ObjCopy/XCOFF/XCOFFObjcopy.cpp
@@ -0,0 +1,45 @@
+//===- XCOFFObjcopy.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjCopy/CommonConfig.h"
+#include "llvm/ObjCopy/XCOFF/XCOFFConfig.h"
+#include "llvm/ObjCopy/XCOFF/XCOFFObjcopy.h"
+#include "llvm/Support/Errc.h"
+#include "XCOFFObject.h"
+#include "XCOFFReader.h"
+#include "XCOFFWriter.h"
+
+namespace llvm {
+namespace objcopy {
+namespace xcoff {
+
+using namespace object;
+
+static Error handleArgs(const CommonConfig &Config, Object &Obj) {
+  return Error::success();
+}
+
+Error executeObjcopyOnBinary(const CommonConfig &Config, const XCOFFConfig &,
+                             XCOFFObjectFile &In, raw_ostream &Out) {
+  XCOFFReader Reader(In);
+  Expected<std::unique_ptr<Object>> ObjOrErr = Reader.create();
+  if (!ObjOrErr)
+    return createFileError(Config.InputFilename, ObjOrErr.takeError());
+  Object *Obj = ObjOrErr->get();
+  assert(Obj && "Unable to deserialize XCOFF object");
+  if (Error E = handleArgs(Config, *Obj))
+    return createFileError(Config.InputFilename, std::move(E));
+  XCOFFWriter Writer(*Obj, Out);
+  if (Error E = Writer.write())
+    return createFileError(Config.OutputFilename, std::move(E));
+  return Error::success();
+}
+
+} // end namespace xcoff
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFObject.h b/llvm/lib/ObjCopy/XCOFF/XCOFFObject.h
new file mode 100644
index 000000000000..3c68b6d3878f
--- /dev/null
+++ b/llvm/lib/ObjCopy/XCOFF/XCOFFObject.h
@@ -0,0 +1,48 @@
+//===- XCOFFObject.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_OBJCOPY_XCOFF_XCOFFOBJECT_H
+#define LLVM_LIB_OBJCOPY_XCOFF_XCOFFOBJECT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/XCOFFObjectFile.h"
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+namespace xcoff {
+
+using namespace object;
+
+struct Section {
+  XCOFFSectionHeader32 SectionHeader;
+  ArrayRef<uint8_t> Contents;
+  std::vector<XCOFFRelocation32> Relocations;
+};
+
+struct Symbol {
+  XCOFFSymbolEntry32 Sym;
+  // For now, each auxiliary symbol is only an opaque binary blob with no
+  // further interpretation.
+  StringRef AuxSymbolEntries;
+};
+
+struct Object {
+  XCOFFFileHeader32 FileHeader;
+  XCOFFAuxiliaryHeader32 OptionalFileHeader;
+  std::vector<Section> Sections;
+  std::vector<Symbol> Symbols;
+  StringRef StringTable;
+};
+
+} // end namespace xcoff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJCOPY_XCOFF_XCOFFOBJECT_H
diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
new file mode 100644
index 000000000000..8ad3021a0342
--- /dev/null
+++ b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
@@ -0,0 +1,101 @@
+//===- XCOFFReader.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCOFFReader.h"
+
+namespace llvm {
+namespace objcopy {
+namespace xcoff {
+
+using namespace object;
+
+Error XCOFFReader::readSections(Object &Obj) const {
+  ArrayRef<XCOFFSectionHeader32> Sections = XCOFFObj.sections32();
+  for (const XCOFFSectionHeader32 &Sec : Sections) {
+    Section ReadSec;
+    // Section header.
+    ReadSec.SectionHeader = Sec;
+    DataRefImpl SectionDRI;
+    SectionDRI.p = reinterpret_cast<uintptr_t>(&Sec);
+
+    // Section data.
+    if (Sec.SectionSize) {
+      Expected<ArrayRef<uint8_t>> ContentsRef =
+          XCOFFObj.getSectionContents(SectionDRI);
+      if (!ContentsRef)
+        return ContentsRef.takeError();
+      ReadSec.Contents = ContentsRef.get();
+    }
+
+    // Relocations.
+    if (Sec.NumberOfRelocations) {
+      auto Relocations =
+          XCOFFObj.relocations<XCOFFSectionHeader32, XCOFFRelocation32>(Sec);
+      if (!Relocations)
+        return Relocations.takeError();
+      for (const XCOFFRelocation32 &Rel : Relocations.get())
+        ReadSec.Relocations.push_back(Rel);
+    }
+
+    Obj.Sections.push_back(std::move(ReadSec));
+  }
+  return Error::success();
+}
+
+Error XCOFFReader::readSymbols(Object &Obj) const {
+  std::vector<Symbol> Symbols;
+  Symbols.reserve(XCOFFObj.getNumberOfSymbolTableEntries());
+  for (SymbolRef Sym : XCOFFObj.symbols()) {
+    Symbol ReadSym;
+    DataRefImpl SymbolDRI = Sym.getRawDataRefImpl();
+    XCOFFSymbolRef SymbolEntRef = XCOFFObj.toSymbolRef(SymbolDRI);
+    ReadSym.Sym = *SymbolEntRef.getSymbol32();
+    // Auxiliary entries.
+    if (SymbolEntRef.getNumberOfAuxEntries()) {
+      const char *Start = reinterpret_cast<const char *>(
+          SymbolDRI.p + XCOFF::SymbolTableEntrySize);
+      Expected<StringRef> RawAuxEntriesOrError = XCOFFObj.getRawData(
+          Start,
+          XCOFF::SymbolTableEntrySize * SymbolEntRef.getNumberOfAuxEntries(),
+          StringRef("symbol"));
+      if (!RawAuxEntriesOrError)
+        return RawAuxEntriesOrError.takeError();
+      ReadSym.AuxSymbolEntries = RawAuxEntriesOrError.get();
+    }
+    Obj.Symbols.push_back(std::move(ReadSym));
+  }
+  return Error::success();
+}
+
+Expected<std::unique_ptr<Object>> XCOFFReader::create() const {
+  auto Obj = std::make_unique<Object>();
+  // Only 32-bit is supported now.
+  if (XCOFFObj.is64Bit())
+    return createStringError(object_error::invalid_file_type,
+                             "64-bit XCOFF is not supported yet");
+  // Read the file header.
+  Obj->FileHeader = *XCOFFObj.fileHeader32();
+  // Read the optional header.
+  if (XCOFFObj.getOptionalHeaderSize())
+    Obj->OptionalFileHeader = *XCOFFObj.auxiliaryHeader32();
+  // Read each section.
+  Obj->Sections.reserve(XCOFFObj.getNumberOfSections());
+  if (Error E = readSections(*Obj))
+    return std::move(E);
+  // Read each symbol.
+  Obj->Symbols.reserve(XCOFFObj.getRawNumberOfSymbolTableEntries32());
+  if (Error E = readSymbols(*Obj))
+    return std::move(E);
+  // String table.
+  Obj->StringTable = XCOFFObj.getStringTable();
+  return std::move(Obj);
+}
+
+} // end namespace xcoff
+} // end namespace objcopy
+} // end namespace llvm
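readSymbols above sizes each auxiliary blob as NumberOfAuxEntries times XCOFF::SymbolTableEntrySize, relying on XCOFF's fixed-size symbol-table entries (main and auxiliary entries are both 18 bytes). A standalone restatement of that invariant (illustrative, not part of the patch):

    #include "llvm/BinaryFormat/XCOFF.h"

    static_assert(llvm::XCOFF::SymbolTableEntrySize == 18,
                  "main and auxiliary XCOFF symbol entries share one size");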
+ Obj->StringTable = XCOFFObj.getStringTable(); + return std::move(Obj); +} + +} // end namespace xcoff +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFReader.h b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.h new file mode 100644 index 000000000000..63a8d8579d37 --- /dev/null +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.h @@ -0,0 +1,35 @@ +//===- XCOFFReader.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_XCOFF_XCOFFREADER_H +#define LLVM_LIB_OBJCOPY_XCOFF_XCOFFREADER_H + +#include "XCOFFObject.h" + +namespace llvm { +namespace objcopy { +namespace xcoff { + +using namespace object; + +class XCOFFReader { +public: + explicit XCOFFReader(const XCOFFObjectFile &O) : XCOFFObj(O) {} + Expected<std::unique_ptr<Object>> create() const; + +private: + const XCOFFObjectFile &XCOFFObj; + Error readSections(Object &Obj) const; + Error readSymbols(Object &Obj) const; +}; + +} // end namespace xcoff +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_XCOFF_XCOFFREADER_H diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.cpp b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.cpp new file mode 100644 index 000000000000..bae3128822e2 --- /dev/null +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.cpp @@ -0,0 +1,125 @@ +//===- XCOFFWriter.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Errc.h" +#include "XCOFFWriter.h" + +namespace llvm { +namespace objcopy { +namespace xcoff { + +using namespace object; + +void XCOFFWriter::finalizeHeaders() { + // File header. + FileSize += sizeof(XCOFFFileHeader32); + // Optional file header. + FileSize += Obj.FileHeader.AuxHeaderSize; + // Section headers. + FileSize += sizeof(XCOFFSectionHeader32) * Obj.Sections.size(); +} + +void XCOFFWriter::finalizeSections() { + for (const Section &Sec : Obj.Sections) { + // Section data. + FileSize += Sec.Contents.size(); + // Relocations. + FileSize += + Sec.SectionHeader.NumberOfRelocations * sizeof(XCOFFRelocation32); + } +} + +void XCOFFWriter::finalizeSymbolStringTable() { + assert(Obj.FileHeader.SymbolTableOffset >= FileSize); + FileSize = Obj.FileHeader.SymbolTableOffset; + // Symbols and auxiliary entries. + FileSize += + Obj.FileHeader.NumberOfSymTableEntries * XCOFF::SymbolTableEntrySize; + // String table. + FileSize += Obj.StringTable.size(); +} + +void XCOFFWriter::finalize() { + FileSize = 0; + finalizeHeaders(); + finalizeSections(); + finalizeSymbolStringTable(); +} + +void XCOFFWriter::writeHeaders() { + // Write the file header. + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()); + memcpy(Ptr, &Obj.FileHeader, sizeof(XCOFFFileHeader32)); + Ptr += sizeof(XCOFFFileHeader32); + + // Write the optional header. + if (Obj.FileHeader.AuxHeaderSize) { + memcpy(Ptr, &Obj.OptionalFileHeader, Obj.FileHeader.AuxHeaderSize); + Ptr += Obj.FileHeader.AuxHeaderSize; + } + + // Write section headers.
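+  // The section headers emitted here immediately follow the optional header; + // the raw data, relocations, and symbol/string tables written by the other + // write* methods land at the file offsets already recorded in these headers.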
+ for (const Section &Sec : Obj.Sections) { + memcpy(Ptr, &Sec.SectionHeader, sizeof(XCOFFSectionHeader32)); + Ptr += sizeof(XCOFFSectionHeader32); + } +} + +void XCOFFWriter::writeSections() { + // Write section data. + for (const Section &Sec : Obj.Sections) { + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + + Sec.SectionHeader.FileOffsetToRawData; + Ptr = std::copy(Sec.Contents.begin(), Sec.Contents.end(), Ptr); + } + + // Write relocations. + for (const Section &Sec : Obj.Sections) { + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + + Sec.SectionHeader.FileOffsetToRelocationInfo; + for (const XCOFFRelocation32 &Rel : Sec.Relocations) { + memcpy(Ptr, &Rel, sizeof(XCOFFRelocation32)); + Ptr += sizeof(XCOFFRelocation32); + } + } +} + +void XCOFFWriter::writeSymbolStringTable() { + // Write symbols. + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + + Obj.FileHeader.SymbolTableOffset; + for (const Symbol &Sym : Obj.Symbols) { + memcpy(Ptr, &Sym.Sym, XCOFF::SymbolTableEntrySize); + Ptr += XCOFF::SymbolTableEntrySize; + // Auxiliary symbols. + memcpy(Ptr, Sym.AuxSymbolEntries.data(), Sym.AuxSymbolEntries.size()); + Ptr += Sym.AuxSymbolEntries.size(); + } + // Write the string table. + memcpy(Ptr, Obj.StringTable.data(), Obj.StringTable.size()); + Ptr += Obj.StringTable.size(); +} + +Error XCOFFWriter::write() { + finalize(); + Buf = WritableMemoryBuffer::getNewMemBuffer(FileSize); + if (!Buf) + return createStringError(errc::not_enough_memory, + "failed to allocate memory buffer of " + + Twine::utohexstr(FileSize) + " bytes"); + + writeHeaders(); + writeSections(); + writeSymbolStringTable(); + Out.write(Buf->getBufferStart(), Buf->getBufferSize()); + return Error::success(); +} + +} // end namespace xcoff +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h new file mode 100644 index 000000000000..54c7b5f3ccbe --- /dev/null +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h @@ -0,0 +1,48 @@ +//===- XCOFFWriter.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_XCOFF_XCOFFWRITER_H +#define LLVM_LIB_OBJCOPY_XCOFF_XCOFFWRITER_H + +#include "llvm/Support/MemoryBuffer.h" +#include "XCOFFObject.h" + +#include <cstdint> +#include <vector> + +namespace llvm { +namespace objcopy { +namespace xcoff { + +class XCOFFWriter { +public: + virtual ~XCOFFWriter() {} + XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} + Error write(); + +private: + Object &Obj; + raw_ostream &Out; + std::unique_ptr<WritableMemoryBuffer> Buf; + size_t FileSize; + + void finalizeHeaders(); + void finalizeSections(); + void finalizeSymbolStringTable(); + void finalize(); + + void writeHeaders(); + void writeSections(); + void writeSymbolStringTable(); +}; + +} // end namespace xcoff +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_XCOFF_XCOFFWRITER_H diff --git a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp new file mode 100644 index 000000000000..6877cd68bee4 --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp @@ -0,0 +1,160 @@ +//===- WasmObjcopy.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjCopy/wasm/WasmObjcopy.h" +#include "WasmObject.h" +#include "WasmReader.h" +#include "WasmWriter.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileOutputBuffer.h" + +namespace llvm { +namespace objcopy { +namespace wasm { + +using namespace object; +using SectionPred = std::function<bool(const Section &Sec)>; + +static bool isDebugSection(const Section &Sec) { + return Sec.Name.startswith(".debug"); +} + +static bool isLinkerSection(const Section &Sec) { + return Sec.Name.startswith("reloc.") || Sec.Name == "linking"; +} + +static bool isNameSection(const Section &Sec) { return Sec.Name == "name"; } + +// Sections which are known to be "comments" or informational and do not affect +// program semantics. +static bool isCommentSection(const Section &Sec) { + return Sec.Name == "producers"; +} + +static Error dumpSectionToFile(StringRef SecName, StringRef Filename, + Object &Obj) { + for (const Section &Sec : Obj.Sections) { + if (Sec.Name == SecName) { + ArrayRef<uint8_t> Contents = Sec.Contents; + Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr = + FileOutputBuffer::create(Filename, Contents.size()); + if (!BufferOrErr) + return BufferOrErr.takeError(); + std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr); + std::copy(Contents.begin(), Contents.end(), Buf->getBufferStart()); + if (Error E = Buf->commit()) + return E; + return Error::success(); + } + } + return createStringError(errc::invalid_argument, "section '%s' not found", + SecName.str().c_str()); +} + +static void removeSections(const CommonConfig &Config, Object &Obj) { + SectionPred RemovePred = [](const Section &) { return false; }; + + // Explicitly-requested sections.
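+  // Ordering note: the strip options below wrap the previous predicate, the + // "only" options replace it outright, and KeepSection is consulted last, so + // an explicit keep always wins over any removal rule.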
+ if (!Config.ToRemove.empty()) { + RemovePred = [&Config](const Section &Sec) { + return Config.ToRemove.matches(Sec.Name); + }; + } + + if (Config.StripDebug) { + RemovePred = [RemovePred](const Section &Sec) { + return RemovePred(Sec) || isDebugSection(Sec); + }; + } + + if (Config.StripAll) { + RemovePred = [RemovePred](const Section &Sec) { + return RemovePred(Sec) || isDebugSection(Sec) || isLinkerSection(Sec) || + isNameSection(Sec) || isCommentSection(Sec); + }; + } + + if (Config.OnlyKeepDebug) { + RemovePred = [&Config](const Section &Sec) { + // Keep debug sections, unless explicitly requested to remove. + // Remove everything else, including known sections. + return Config.ToRemove.matches(Sec.Name) || !isDebugSection(Sec); + }; + } + + if (!Config.OnlySection.empty()) { + RemovePred = [&Config](const Section &Sec) { + // Explicitly keep these sections regardless of previous removes. + // Remove everything else, including known sections. + return !Config.OnlySection.matches(Sec.Name); + }; + } + + if (!Config.KeepSection.empty()) { + RemovePred = [&Config, RemovePred](const Section &Sec) { + // Explicitly keep these sections regardless of previous removes. + if (Config.KeepSection.matches(Sec.Name)) + return false; + // Otherwise defer to RemovePred. + return RemovePred(Sec); + }; + } + + Obj.removeSections(RemovePred); +} + +static Error handleArgs(const CommonConfig &Config, Object &Obj) { + // Only support AddSection, DumpSection, RemoveSection for now. + for (StringRef Flag : Config.DumpSection) { + StringRef SecName; + StringRef FileName; + std::tie(SecName, FileName) = Flag.split("="); + if (Error E = dumpSectionToFile(SecName, FileName, Obj)) + return createFileError(FileName, std::move(E)); + } + + removeSections(Config, Obj); + + for (const NewSectionInfo &NewSection : Config.AddSection) { + Section Sec; + Sec.SectionType = llvm::wasm::WASM_SEC_CUSTOM; + Sec.Name = NewSection.SectionName; + + std::unique_ptr<MemoryBuffer> BufferCopy = MemoryBuffer::getMemBufferCopy( + NewSection.SectionData->getBufferStart(), + NewSection.SectionData->getBufferIdentifier()); + Sec.Contents = makeArrayRef( + reinterpret_cast<const uint8_t *>(BufferCopy->getBufferStart()), + BufferCopy->getBufferSize()); + + Obj.addSectionWithOwnedContents(Sec, std::move(BufferCopy)); + } + + return Error::success(); +} + +Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, + object::WasmObjectFile &In, raw_ostream &Out) { + Reader TheReader(In); + Expected<std::unique_ptr<Object>> ObjOrErr = TheReader.create(); + if (!ObjOrErr) + return createFileError(Config.InputFilename, ObjOrErr.takeError()); + Object *Obj = ObjOrErr->get(); + assert(Obj && "Unable to deserialize Wasm object"); + if (Error E = handleArgs(Config, *Obj)) + return E; + Writer TheWriter(*Obj, Out); + if (Error E = TheWriter.write()) + return createFileError(Config.OutputFilename, std::move(E)); + return Error::success(); +} + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/wasm/WasmObject.cpp b/llvm/lib/ObjCopy/wasm/WasmObject.cpp new file mode 100644 index 000000000000..28a2de6e6e4f --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmObject.cpp @@ -0,0 +1,34 @@ +//===- WasmObject.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "WasmObject.h" + +#include "llvm/Support/LEB128.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace objcopy { +namespace wasm { + +using namespace object; +using namespace llvm::wasm; + +void Object::addSectionWithOwnedContents( + Section NewSection, std::unique_ptr<MemoryBuffer> &&Content) { + Sections.push_back(NewSection); + OwnedContents.emplace_back(std::move(Content)); +} + +void Object::removeSections(function_ref<bool(const Section &)> ToRemove) { + // TODO: remove reloc sections for the removed section, handle symbols, etc. + llvm::erase_if(Sections, ToRemove); +} + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/wasm/WasmObject.h b/llvm/lib/ObjCopy/wasm/WasmObject.h new file mode 100644 index 000000000000..9bc5831926c6 --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmObject.h @@ -0,0 +1,47 @@ +//===- WasmObject.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H +#define LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/Wasm.h" +#include "llvm/Support/MemoryBuffer.h" +#include <vector> + +namespace llvm { +namespace objcopy { +namespace wasm { + +struct Section { + // For now, each section is only an opaque binary blob with no distinction + // between custom and known sections. + uint8_t SectionType; + StringRef Name; + ArrayRef<uint8_t> Contents; +}; + +struct Object { + llvm::wasm::WasmObjectHeader Header; + // For now don't discriminate between kinds of sections. + std::vector<Section>
Sections; + + void addSectionWithOwnedContents(Section NewSection, + std::unique_ptr<MemoryBuffer> &&Content); + void removeSections(function_ref<bool(const Section &)> ToRemove); + +private: + std::vector<std::unique_ptr<MemoryBuffer>> OwnedContents; +}; + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H diff --git a/llvm/lib/ObjCopy/wasm/WasmReader.cpp b/llvm/lib/ObjCopy/wasm/WasmReader.cpp new file mode 100644 index 000000000000..6e7d8b5591c9 --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmReader.cpp @@ -0,0 +1,39 @@ +//===- WasmReader.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "WasmReader.h" + +namespace llvm { +namespace objcopy { +namespace wasm { + +using namespace object; +using namespace llvm::wasm; + +Expected<std::unique_ptr<Object>> Reader::create() const { + auto Obj = std::make_unique<Object>(); + Obj->Header = WasmObj.getHeader(); + std::vector<Section>
Sections; + Obj->Sections.reserve(WasmObj.getNumSections()); + for (const SectionRef &Sec : WasmObj.sections()) { + const WasmSection &WS = WasmObj.getWasmSection(Sec); + Obj->Sections.push_back( + {static_cast<uint8_t>(WS.Type), WS.Name, WS.Content}); + // Give known sections standard names to allow them to be selected. (Custom + // sections already have their names filled in by the parser). + Section &ReaderSec = Obj->Sections.back(); + if (ReaderSec.SectionType > WASM_SEC_CUSTOM && + ReaderSec.SectionType <= WASM_SEC_LAST_KNOWN) + ReaderSec.Name = sectionTypeToString(ReaderSec.SectionType); + } + return std::move(Obj); +} + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/wasm/WasmReader.h b/llvm/lib/ObjCopy/wasm/WasmReader.h new file mode 100644 index 000000000000..d71660fa2b65 --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmReader.h @@ -0,0 +1,31 @@ +//===- WasmReader.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_WASM_WASMREADER_H +#define LLVM_LIB_OBJCOPY_WASM_WASMREADER_H + +#include "WasmObject.h" + +namespace llvm { +namespace objcopy { +namespace wasm { + +class Reader { +public: + explicit Reader(const object::WasmObjectFile &O) : WasmObj(O) {} + Expected<std::unique_ptr<Object>> create() const; + +private: + const object::WasmObjectFile &WasmObj; +}; + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_WASM_WASMREADER_H diff --git a/llvm/lib/ObjCopy/wasm/WasmWriter.cpp b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp new file mode 100644 index 000000000000..fdcd441cc798 --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp @@ -0,0 +1,79 @@ +//===- WasmWriter.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "WasmWriter.h" +#include "llvm/BinaryFormat/Wasm.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace objcopy { +namespace wasm { + +using namespace object; +using namespace llvm::wasm; + +Writer::SectionHeader Writer::createSectionHeader(const Section &S, + size_t &SectionSize) { + SectionHeader Header; + raw_svector_ostream OS(Header); + OS << S.SectionType; + bool HasName = S.SectionType == WASM_SEC_CUSTOM; + SectionSize = S.Contents.size(); + if (HasName) + SectionSize += getULEB128Size(S.Name.size()) + S.Name.size(); + // Pad the LEB value out to 5 bytes to make it a predictable size, and + // match the behavior of clang. + encodeULEB128(SectionSize, OS, 5); + if (HasName) { + encodeULEB128(S.Name.size(), OS); + OS << S.Name; + } + // Total section size is the content size plus 1 for the section type and + // 5 for the LEB-encoded size.
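+  // Worked example: a custom section named "name" carrying 10 bytes of + // payload encodes SectionSize = 10 + 1 + 4 = 15 into the padded LEB and + // occupies 1 (id) + 5 (LEB) + 1 (name length) + 4 ("name") + 10 (payload) + // = 21 bytes on disk, matching the "+ 1 + 5" adjustment below.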
+ SectionSize = SectionSize + 1 + 5; + return Header; +} + +size_t Writer::finalize() { + size_t ObjectSize = sizeof(WasmMagic) + sizeof(WasmVersion); + SectionHeaders.reserve(Obj.Sections.size()); + // Finalize the headers of each section so we know the total size. + for (const Section &S : Obj.Sections) { + size_t SectionSize; + SectionHeaders.push_back(createSectionHeader(S, SectionSize)); + ObjectSize += SectionSize; + } + return ObjectSize; +} + +Error Writer::write() { + size_t TotalSize = finalize(); + Out.reserveExtraSpace(TotalSize); + + // Write the header. + Out.write(Obj.Header.Magic.data(), Obj.Header.Magic.size()); + uint32_t Version; + support::endian::write32le(&Version, Obj.Header.Version); + Out.write(reinterpret_cast<const char *>(&Version), sizeof(Version)); + + // Write each section. + for (size_t I = 0, S = SectionHeaders.size(); I < S; ++I) { + Out.write(SectionHeaders[I].data(), SectionHeaders[I].size()); + Out.write(reinterpret_cast<const char *>(Obj.Sections[I].Contents.data()), + Obj.Sections[I].Contents.size()); + } + + return Error::success(); +} + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/lib/ObjCopy/wasm/WasmWriter.h b/llvm/lib/ObjCopy/wasm/WasmWriter.h new file mode 100644 index 000000000000..14bbcf88875e --- /dev/null +++ b/llvm/lib/ObjCopy/wasm/WasmWriter.h @@ -0,0 +1,49 @@ +//===- WasmWriter.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H +#define LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H + +#include "WasmObject.h" +#include <cstdint> +#include <vector> + +namespace llvm { +namespace objcopy { +namespace wasm { + +class Writer { +public: + Writer(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} + Error write(); + +private: + using SectionHeader = SmallVector<char, 8>; + Object &Obj; + raw_ostream &Out; + std::vector<SectionHeader> SectionHeaders; + + /// Generate a wasm section header for S. + /// The header consists of + /// * A one-byte section ID (aka the section type). + /// * The size of the section contents, encoded as ULEB128. + /// * If the section is a custom section (type 0) it also has a name, which is + /// encoded as a length-prefixed string. The encoded section size *includes* + /// this string. + /// See https://webassembly.github.io/spec/core/binary/modules.html#sections + /// Return the header and store the total size in SectionSize.
+ static SectionHeader createSectionHeader(const Section &S, + size_t &SectionSize); + size_t finalize(); +}; + +} // end namespace wasm +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index 9a4ef055faa4..ad03f9cae9f8 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -30,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -257,6 +257,14 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const { return Name; if (Name.size() == 2 && Name[1] == '/') // String table. return Name; + // System libraries from the Windows SDK for Windows 11 contain this symbol. + // It looks like a CFG guard: we just skip it for now. + if (Name.equals("/<XFGHASHMAP>/")) + return Name; + // Some libraries (e.g., arm64rt.lib) from the Windows WDK + // (version 10.0.22000.0) contain this undocumented special member. + if (Name.equals("/<ECSYMBOLS>/")) + return Name; // It's a long name. // Get the string table offset. std::size_t StringOffset; @@ -922,6 +930,14 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) Err = Error::success(); } +object::Archive::Kind Archive::getDefaultKindForHost() { + Triple HostTriple(sys::getProcessTriple()); + return HostTriple.isOSDarwin() + ? object::Archive::K_DARWIN + : (HostTriple.isOSAIX() ? object::Archive::K_AIXBIG + : object::Archive::K_GNU); +} + Archive::child_iterator Archive::child_begin(Error &Err, bool SkipInternal) const { if (isEmpty()) diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 053b3dafed95..dbf5052cdac0 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -18,16 +18,19 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Archive.h" #include "llvm/Object/Error.h" +#include "llvm/Object/IRObjectFile.h" +#include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" +#include "llvm/Object/XCOFFObjectFile.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/SmallVectorMemoryBuffer.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include @@ -44,6 +47,40 @@ NewArchiveMember::NewArchiveMember(MemoryBufferRef BufRef) : Buf(MemoryBuffer::getMemBuffer(BufRef, false)), MemberName(BufRef.getBufferIdentifier()) {} +object::Archive::Kind NewArchiveMember::detectKindFromObject() const { + auto MemBufferRef = this->Buf->getMemBufferRef(); + Expected<std::unique_ptr<object::ObjectFile>> OptionalObject = + object::ObjectFile::createObjectFile(MemBufferRef); + + if (OptionalObject) + return isa<object::MachOObjectFile>(**OptionalObject) + ? object::Archive::K_DARWIN + : (isa<object::XCOFFObjectFile>(**OptionalObject) + ? object::Archive::K_AIXBIG + : object::Archive::K_GNU); + + // Squelch the error in case we had a non-object file. + consumeError(OptionalObject.takeError()); + + // If we're adding a bitcode file to the archive, detect the Archive kind + // based on the target triple.
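+  // A bitcode member has no object-file container to inspect, so the kind is + // inferred from the module's target triple instead; note that non-Darwin + // bitcode falls back to a GNU archive rather than the AIX big format.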
+ LLVMContext Context; + if (identify_magic(MemBufferRef.getBuffer()) == file_magic::bitcode) { + if (auto ObjOrErr = object::SymbolicFile::createSymbolicFile( + MemBufferRef, file_magic::bitcode, &Context)) { + auto &IRObject = cast<object::IRObjectFile>(**ObjOrErr); + return Triple(IRObject.getTargetTriple()).isOSDarwin() + ? object::Archive::K_DARWIN + : object::Archive::K_GNU; + } else { + // Squelch the error in case this was not a SymbolicFile. + consumeError(ObjOrErr.takeError()); + } + } + + return object::Archive::getDefaultKindForHost(); +} + Expected<NewArchiveMember> NewArchiveMember::getOldMember(const object::Archive::Child &OldMember, bool Deterministic) { @@ -128,16 +165,20 @@ static bool isDarwin(object::Archive::Kind Kind) { Kind == object::Archive::K_DARWIN64; } +static bool isAIXBigArchive(object::Archive::Kind Kind) { + return Kind == object::Archive::K_AIXBIG; +} + static bool isBSDLike(object::Archive::Kind Kind) { switch (Kind) { case object::Archive::K_GNU: case object::Archive::K_GNU64: + case object::Archive::K_AIXBIG: return false; case object::Archive::K_BSD: case object::Archive::K_DARWIN: case object::Archive::K_DARWIN64: return true; - case object::Archive::K_AIXBIG: case object::Archive::K_COFF: break; } @@ -190,6 +231,31 @@ printBSDMemberHeader(raw_ostream &Out, uint64_t Pos, StringRef Name, Out.write(uint8_t(0)); } +static void +printBigArchiveMemberHeader(raw_ostream &Out, StringRef Name, + const sys::TimePoint<std::chrono::seconds> &ModTime, + unsigned UID, unsigned GID, unsigned Perms, + uint64_t Size, unsigned PrevOffset, + unsigned NextOffset) { + unsigned NameLen = Name.size(); + + printWithSpacePadding(Out, Size, 20); // File member size + printWithSpacePadding(Out, NextOffset, 20); // Next member header offset + printWithSpacePadding(Out, PrevOffset, 20); // Previous member header offset + printWithSpacePadding(Out, sys::toTimeT(ModTime), 12); // File member date + // The big archive format has 12 chars for uid and gid. + printWithSpacePadding(Out, UID % 1000000000000, 12); // UID + printWithSpacePadding(Out, GID % 1000000000000, 12); // GID + printWithSpacePadding(Out, format("%o", Perms), 12); // Permission + printWithSpacePadding(Out, NameLen, 4); // Name length + if (NameLen) { + printWithSpacePadding(Out, Name, NameLen); // Name + if (NameLen % 2) + Out.write(uint8_t(0)); // Null byte padding + } + Out << "`\n"; // Terminator +} + static bool useStringTable(bool Thin, StringRef Name) { return Thin || Name.size() >= 16 || Name.contains('/'); } @@ -200,8 +266,8 @@ static bool is64BitKind(object::Archive::Kind Kind) { case object::Archive::K_BSD: case object::Archive::K_DARWIN: case object::Archive::K_COFF: - case object::Archive::K_AIXBIG: return false; + case object::Archive::K_AIXBIG: case object::Archive::K_DARWIN64: case object::Archive::K_GNU64: return true; @@ -305,7 +371,11 @@ static uint64_t computeSymbolTableSize(object::Archive::Kind Kind, // least 4-byte aligned for 32-bit content. Opt for the larger encoding // uniformly. // We do this for all bsd formats because it simplifies aligning members. - uint32_t Pad = offsetToAlignment(Size, Align(isBSDLike(Kind) ? 8 : 2)); + // For the big archive format, the symbol table is the last member, so there + // is no need to align. + uint32_t Pad = isAIXBigArchive(Kind) + ? 0 + : offsetToAlignment(Size, Align(isBSDLike(Kind) ?
8 : 2)); Size += Pad; if (Padding) *Padding = Pad; @@ -313,11 +383,15 @@ static void writeSymbolTableHeader(raw_ostream &Out, object::Archive::Kind Kind, - bool Deterministic, uint64_t Size) { + bool Deterministic, uint64_t Size, + uint64_t PrevMemberOffset = 0) { if (isBSDLike(Kind)) { const char *Name = is64BitKind(Kind) ? "__.SYMDEF_64" : "__.SYMDEF"; printBSDMemberHeader(Out, Out.tell(), Name, now(Deterministic), 0, 0, 0, Size); + } else if (isAIXBigArchive(Kind)) { + printBigArchiveMemberHeader(Out, "", now(Deterministic), 0, 0, + 0, Size, PrevMemberOffset, 0); } else { const char *Name = is64BitKind(Kind) ? "/SYM64" : ""; printGNUSmallMemberHeader(Out, Name, now(Deterministic), 0, 0, 0, Size); @@ -326,7 +400,8 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, bool Deterministic, ArrayRef<MemberData> Members, - StringRef StringTable) { + StringRef StringTable, + uint64_t PrevMemberOffset = 0) { // We don't write a symbol table on an archive with no members -- except on // Darwin, where the linker will abort unless the archive has a symbol table. if (StringTable.empty() && !isDarwin(Kind)) @@ -339,9 +414,10 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, uint64_t OffsetSize = is64BitKind(Kind) ? 8 : 4; uint32_t Pad; uint64_t Size = computeSymbolTableSize(Kind, NumSyms, OffsetSize, StringTable, &Pad); - writeSymbolTableHeader(Out, Kind, Deterministic, Size); + writeSymbolTableHeader(Out, Kind, Deterministic, Size, PrevMemberOffset); - uint64_t Pos = Out.tell() + Size; + uint64_t Pos = isAIXBigArchive(Kind) ? sizeof(object::BigArchive::FixLenHdr) + : Out.tell() + Size; if (isBSDLike(Kind)) printNBits(Out, Kind, NumSyms * 2 * OffsetSize); @@ -410,9 +486,8 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, bool NeedSymbols, ArrayRef<NewArchiveMember> NewMembers) { static char PaddingData[8] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'}; - // This ignores the symbol table, but we only need the value mod 8 and the - // symbol table is aligned to be a multiple of 8 bytes - uint64_t Pos = 0; + uint64_t Pos = + isAIXBigArchive(Kind) ? sizeof(object::BigArchive::FixLenHdr) : 0; std::vector<MemberData> Ret; bool HasObject = false; @@ -472,6 +547,9 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, Entry.second = Entry.second > 1 ? 1 : 0; } + // The big archive format needs to know the offset of the previous member + // header.
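+  // Big archive member headers form a doubly linked list: each header stores + // both its predecessor's and successor's offsets, so PrevOffset carries the + // back link while NextOffset is derived from the running position.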
+ unsigned PrevOffset = 0; for (const NewArchiveMember &M : NewMembers) { std::string Header; raw_string_ostream Out(Header); @@ -504,8 +582,16 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, std::move(StringMsg), object::object_error::parse_failed); } - printMemberHeader(Out, Pos, StringTable, MemberNames, Kind, Thin, M, - ModTime, Size); + if (isAIXBigArchive(Kind)) { + unsigned NextOffset = Pos + sizeof(object::BigArMemHdrType) + + alignTo(M.MemberName.size(), 2) + alignTo(Size, 2); + printBigArchiveMemberHeader(Out, M.MemberName, ModTime, M.UID, M.GID, + M.Perms, Size, PrevOffset, NextOffset); + PrevOffset = Pos; + } else { + printMemberHeader(Out, Pos, StringTable, MemberNames, Kind, Thin, M, + ModTime, Size); + } Out.flush(); std::vector<unsigned> Symbols; @@ -589,22 +675,25 @@ static Error writeArchiveToStream(raw_ostream &Out, return E; std::vector<MemberData> &Data = *DataOrErr; - if (!StringTableBuf.empty()) + if (!StringTableBuf.empty() && !isAIXBigArchive(Kind)) Data.insert(Data.begin(), computeStringTable(StringTableBuf)); // We would like to detect if we need to switch to a 64-bit symbol table. - if (WriteSymtab) { - uint64_t MaxOffset = 8; // For the file signature. - uint64_t LastOffset = MaxOffset; - uint64_t NumSyms = 0; - for (const auto &M : Data) { - // Record the start of the member's offset - LastOffset = MaxOffset; - // Account for the size of each part associated with the member. - MaxOffset += M.Header.size() + M.Data.size() + M.Padding.size(); - NumSyms += M.Symbols.size(); - } + uint64_t LastMemberEndOffset = + isAIXBigArchive(Kind) ? sizeof(object::BigArchive::FixLenHdr) : 8; + uint64_t LastMemberHeaderOffset = LastMemberEndOffset; + uint64_t NumSyms = 0; + for (const auto &M : Data) { + // Record the start of the member's offset + LastMemberHeaderOffset = LastMemberEndOffset; + // Account for the size of each part associated with the member. + LastMemberEndOffset += M.Header.size() + M.Data.size() + M.Padding.size(); + NumSyms += M.Symbols.size(); + } + // The symbol table is put at the end of the big archive file. The symbol + // table is at the start of the archive file for other archive formats. + if (WriteSymtab && !isAIXBigArchive(Kind)) { // We assume 32-bit offsets to see if 32-bit symbols are possible or not. uint64_t SymtabSize = computeSymbolTableSize(Kind, NumSyms, 4, SymNamesBuf); auto computeSymbolTableHeaderSize = @@ -614,7 +703,7 @@ static Error writeArchiveToStream(raw_ostream &Out, writeSymbolTableHeader(Tmp, Kind, Deterministic, SymtabSize); return TmpBuf.size(); }; - LastOffset += computeSymbolTableHeaderSize() + SymtabSize; + LastMemberHeaderOffset += computeSymbolTableHeaderSize() + SymtabSize; // The SYM64 format is used when an archive's member offsets are larger than // 32-bits can hold. The need for this shift in format is detected by @@ -628,10 +717,10 @@ static Error writeArchiveToStream(raw_ostream &Out, if (Sym64Env) StringRef(Sym64Env).getAsInteger(10, Sym64Threshold); - // If LastOffset isn't going to fit in a 32-bit varible we need to switch - // to 64-bit. Note that the file can be larger than 4GB as long as the last - // member starts before the 4GB offset. - if (LastOffset >= Sym64Threshold) { + // If LastMemberHeaderOffset isn't going to fit in a 32-bit variable we need + // to switch to 64-bit. Note that the file can be larger than 4GB as long as + // the last member starts before the 4GB offset.
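+  // For instance, an archive whose last member header would begin at or past + // the threshold (presumably 2^32 bytes unless overridden above) switches + // GNU archives to the /SYM64 layout and Darwin archives to K_DARWIN64.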
+ if (LastMemberHeaderOffset >= Sym64Threshold) { if (Kind == object::Archive::K_DARWIN) Kind = object::Archive::K_DARWIN64; else @@ -641,15 +730,92 @@ static Error writeArchiveToStream(raw_ostream &Out, if (Thin) Out << "!<thin>\n"; + else if (isAIXBigArchive(Kind)) + Out << "<bigaf>\n"; else Out << "!<arch>\n"; - if (WriteSymtab) - writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf); + if (!isAIXBigArchive(Kind)) { + if (WriteSymtab) + writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf); + for (const MemberData &M : Data) + Out << M.Header << M.Data << M.Padding; + } else { + // For the big archive (AIX) format, compute a table of member names and + // offsets, used in the member table. + uint64_t MemberTableNameStrTblSize = 0; + std::vector<uint64_t> MemberOffsets; + std::vector<StringRef> MemberNames; + // Loop across object to find offset and names. + uint64_t MemberEndOffset = sizeof(object::BigArchive::FixLenHdr); + for (size_t I = 0, Size = NewMembers.size(); I != Size; ++I) { + const NewArchiveMember &Member = NewMembers[I]; + MemberTableNameStrTblSize += Member.MemberName.size() + 1; + MemberOffsets.push_back(MemberEndOffset); + MemberNames.push_back(Member.MemberName); + // File member name ended with "`\n". The length is included in + // BigArMemHdrType. + MemberEndOffset += sizeof(object::BigArMemHdrType) + + alignTo(Data[I].Data.size(), 2) + + alignTo(Member.MemberName.size(), 2); + } - for (const MemberData &M : Data) - Out << M.Header << M.Data << M.Padding; + // AIX member table size. + unsigned MemberTableSize = 20 + // Number of members field + 20 * MemberOffsets.size() + + MemberTableNameStrTblSize; + + unsigned GlobalSymbolOffset = + (WriteSymtab && NumSyms > 0) + ? LastMemberEndOffset + + alignTo(sizeof(object::BigArMemHdrType) + MemberTableSize, 2) + : 0; + + // Fixed Sized Header. + printWithSpacePadding(Out, NewMembers.size() ? LastMemberEndOffset : 0, + 20); // Offset to member table + // If there are no file members in the archive, there will be no global + // symbol table. + printWithSpacePadding(Out, NewMembers.size() ? GlobalSymbolOffset : 0, 20); + printWithSpacePadding( + Out, 0, + 20); // Offset to 64 bits global symbol table - Not supported yet + printWithSpacePadding( + Out, NewMembers.size() ? sizeof(object::BigArchive::FixLenHdr) : 0, + 20); // Offset to first archive member + printWithSpacePadding(Out, NewMembers.size() ? LastMemberHeaderOffset : 0, + 20); // Offset to last archive member + printWithSpacePadding( + Out, 0, + 20); // Offset to first member of free list - Not supported yet + + for (const MemberData &M : Data) { + Out << M.Header << M.Data; + if (M.Data.size() % 2) + Out << '\0'; + } + if (NewMembers.size()) { + // Member table. + printBigArchiveMemberHeader(Out, "", sys::toTimePoint(0), 0, 0, 0, + MemberTableSize, LastMemberHeaderOffset, + GlobalSymbolOffset); + printWithSpacePadding(Out, MemberOffsets.size(), 20); // Number of members + for (uint64_t MemberOffset : MemberOffsets) + printWithSpacePadding(Out, MemberOffset, + 20); // Offset to member file header. + for (StringRef MemberName : MemberNames) + Out << MemberName << '\0'; // Member file name, null byte padding. + + if (MemberTableNameStrTblSize % 2) + Out << '\0'; // Name table must be tail padded to an even number of + // bytes.
+ + if (WriteSymtab && NumSyms > 0) + writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf, + LastMemberEndOffset); + } + } Out.flush(); return Error::success(); } diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp index 143554344256..8065e3eb1d85 100644 --- a/llvm/lib/Object/Binary.cpp +++ b/llvm/lib/Object/Binary.cpp @@ -18,14 +18,13 @@ #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/Minidump.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" #include "llvm/Object/TapiUniversal.h" #include "llvm/Object/WindowsResource.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include #include #include @@ -84,9 +83,13 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer, // PDB does not support the Binary interface. return errorCodeToError(object_error::invalid_file_type); case file_magic::unknown: + case file_magic::cuda_fatbinary: case file_magic::coff_cl_gl_object: + case file_magic::dxcontainer_object: // Unrecognized object file format. return errorCodeToError(object_error::invalid_file_type); + case file_magic::offload_binary: + return OffloadBinary::create(Buffer); case file_magic::minidump: return MinidumpFile::create(Buffer); case file_magic::tapi_file: diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index 69bbf70b43a1..91ecea11511d 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -12,10 +12,14 @@ #include "llvm/Object/COFFImportFile.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/COFF.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Path.h" #include diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp index 55ddd3baca2b..0666970d5c60 100644 --- a/llvm/lib/Object/COFFModuleDefinition.cpp +++ b/llvm/lib/Object/COFFModuleDefinition.cpp @@ -17,12 +17,10 @@ #include "llvm/Object/COFFModuleDefinition.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Object/COFF.h" #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/Error.h" #include "llvm/Support/Error.h" #include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm::COFF; using namespace llvm; diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp index 354b3c0d5577..1a4bb329201a 100644 --- a/llvm/lib/Object/COFFObjectFile.cpp +++ b/llvm/lib/Object/COFFObjectFile.cpp @@ -25,7 +25,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include #include #include @@ -447,7 +448,8 @@ Error COFFObjectFile::initSymbolTablePtr() { // Check that the string table is null terminated if it has any content in it.
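// The first four bytes of a COFF string table encode the table's own total // size, so a table that actually contains strings is always larger than four // bytes; that is what the size check below relies on.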
if (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0) - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "string table missing null terminator"); return Error::success(); } @@ -469,23 +470,43 @@ Error COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { } // Returns the file offset for the given RVA. -Error COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const { +Error COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res, + const char *ErrorContext) const { for (const SectionRef &S : sections()) { const coff_section *Section = getCOFFSection(S); uint32_t SectionStart = Section->VirtualAddress; uint32_t SectionEnd = Section->VirtualAddress + Section->VirtualSize; if (SectionStart <= Addr && Addr < SectionEnd) { + // A table/directory entry can be pointing to somewhere in a stripped + // section, in an object that went through `objcopy --only-keep-debug`. + // In this case we don't want to cause the parsing of the object file to + // fail, otherwise it will be impossible to use this object as debug info + // in LLDB. Return SectionStrippedError here so that + // COFFObjectFile::initialize can ignore the error. + // Somewhat common binaries may have RVAs pointing outside of the + // provided raw data. Instead of rejecting the binaries, just + // treat the section as stripped for these purposes. + if (Section->SizeOfRawData < Section->VirtualSize && + Addr >= SectionStart + Section->SizeOfRawData) { + return make_error<SectionStrippedError>(); + } uint32_t Offset = Addr - SectionStart; Res = reinterpret_cast<uintptr_t>(base()) + Section->PointerToRawData + Offset; return Error::success(); } } - return errorCodeToError(object_error::parse_failed); + if (ErrorContext) + return createStringError(object_error::parse_failed, + "RVA 0x%" PRIx32 " for %s not found", Addr, + ErrorContext); + return createStringError(object_error::parse_failed, + "RVA 0x%" PRIx32 " not found", Addr); } Error COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size, - ArrayRef<uint8_t> &Contents) const { + ArrayRef<uint8_t> &Contents, + const char *ErrorContext) const { for (const SectionRef &S : sections()) { const coff_section *Section = getCOFFSection(S); uint32_t SectionStart = Section->VirtualAddress; @@ -501,7 +522,12 @@ Error COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size, return Error::success(); } } - return errorCodeToError(object_error::parse_failed); + if (ErrorContext) + return createStringError(object_error::parse_failed, + "RVA 0x%" PRIx32 " for %s not found", RVA, + ErrorContext); + return createStringError(object_error::parse_failed, + "RVA 0x%" PRIx32 " not found", RVA); } // Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name @@ -521,11 +547,12 @@ Error COFFObjectFile::getDebugPDBInfo(const debug_directory *DebugDir, const codeview::DebugInfo *&PDBInfo, StringRef &PDBFileName) const { ArrayRef<uint8_t> InfoBytes; - if (Error E = getRvaAndSizeAsBytes( - DebugDir->AddressOfRawData, DebugDir->SizeOfData, InfoBytes)) + if (Error E = + getRvaAndSizeAsBytes(DebugDir->AddressOfRawData, DebugDir->SizeOfData, + InfoBytes, "PDB info")) return E; if (InfoBytes.size() < sizeof(*PDBInfo) + 1) - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, "PDB info too small"); PDBInfo = reinterpret_cast<const codeview::DebugInfo *>(InfoBytes.data()); InfoBytes = InfoBytes.drop_front(sizeof(*PDBInfo)); PDBFileName = StringRef(reinterpret_cast<const char *>(InfoBytes.data()), @@ -563,7 +590,7 @@ Error
COFFObjectFile::initImportTablePtr() { // Find the section that contains the RVA. This is needed because the RVA is // the import table's memory address which is different from its file offset. uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(ImportTableRva, IntPtr)) + if (Error E = getRvaPtr(ImportTableRva, IntPtr, "import table")) return E; if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) return E; @@ -586,8 +613,11 @@ Error COFFObjectFile::initDelayImportTablePtr() { sizeof(delay_import_directory_table_entry) - 1; uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(RVA, IntPtr)) + if (Error E = getRvaPtr(RVA, IntPtr, "delay import table")) return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) + return E; + DelayImportDirectory = reinterpret_cast< const delay_import_directory_table_entry *>(IntPtr); return Error::success(); @@ -607,8 +637,11 @@ Error COFFObjectFile::initExportTablePtr() { uint32_t ExportTableRva = DataEntry->RelativeVirtualAddress; uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(ExportTableRva, IntPtr)) + if (Error E = getRvaPtr(ExportTableRva, IntPtr, "export table")) return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) + return E; + ExportDirectory = reinterpret_cast<const export_directory_table_entry *>(IntPtr); return Error::success(); @@ -623,8 +656,12 @@ Error COFFObjectFile::initBaseRelocPtr() { return Error::success(); uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr, + "base reloc table")) + return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) return E; + BaseRelocHeader = reinterpret_cast<const coff_base_reloc_block_header *>( IntPtr); BaseRelocEnd = reinterpret_cast<const coff_base_reloc_block_header *>( @@ -646,11 +683,16 @@ Error COFFObjectFile::initDebugDirectoryPtr() { // Check that the size is a multiple of the entry size.
if (DataEntry->Size % sizeof(debug_directory) != 0) - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "debug directory has uneven size"); uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr, + "debug directory")) + return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) return E; + DebugDirectoryBegin = reinterpret_cast<const debug_directory *>(IntPtr); DebugDirectoryEnd = reinterpret_cast<const debug_directory *>( IntPtr + DataEntry->Size); @@ -680,7 +722,10 @@ Error COFFObjectFile::initTLSDirectoryPtr() { static_cast<uint32_t>(DataEntry->Size), DirSize); uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + if (Error E = + getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr, "TLS directory")) + return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) return E; if (is64()) @@ -701,7 +746,10 @@ Error COFFObjectFile::initLoadConfigPtr() { if (DataEntry->RelativeVirtualAddress == 0) return Error::success(); uintptr_t IntPtr = 0; - if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr, + "load config table")) + return E; + if (Error E = checkOffset(Data, IntPtr, DataEntry->Size)) return E; LoadConfig = (const void *)IntPtr; @@ -727,6 +775,14 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object) DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr), TLSDirectory32(nullptr), TLSDirectory64(nullptr) {} +static Error ignoreStrippedErrors(Error E) { + if (E.isA<SectionStrippedError>()) { + consumeError(std::move(E)); + return Error::success(); + } + return E; +} + Error COFFObjectFile::initialize() { // Check that we at least have enough room for a header. std::error_code EC; @@ -749,7 +805,8 @@ Error COFFObjectFile::initialize() { CurPtr = DH->AddressOfNewExeHeader; // Check the PE magic bytes. ("PE\0\0") if (memcmp(base() + CurPtr, COFF::PEMagic, sizeof(COFF::PEMagic)) != 0) { - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "incorrect PE magic"); } CurPtr += sizeof(COFF::PEMagic); // Skip the PE magic bytes. HasPEHeader = true; @@ -805,7 +862,8 @@ Error COFFObjectFile::initialize() { DataDirSize = sizeof(data_directory) * PE32PlusHeader->NumberOfRvaAndSize; } else { // It's neither PE32 nor PE32+. - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "incorrect PE magic"); } if (Error E = getObject(DataDirectory, Data, DataDirAddr, DataDirSize)) return E; @@ -834,33 +892,34 @@ Error COFFObjectFile::initialize() { } else { // We had better not have any symbols if we don't have a symbol table. if (getNumberOfSymbols() != 0) { - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "symbol table missing"); } } // Initialize the pointer to the beginning of the import table. - if (Error E = initImportTablePtr()) + if (Error E = ignoreStrippedErrors(initImportTablePtr())) return E; - if (Error E = initDelayImportTablePtr()) + if (Error E = ignoreStrippedErrors(initDelayImportTablePtr())) return E; // Initialize the pointer to the export table. - if (Error E = initExportTablePtr()) + if (Error E = ignoreStrippedErrors(initExportTablePtr())) return E; // Initialize the pointer to the base relocation table.
- if (Error E = initBaseRelocPtr()) + if (Error E = ignoreStrippedErrors(initBaseRelocPtr())) return E; // Initialize the pointer to the debug directory. - if (Error E = initDebugDirectoryPtr()) + if (Error E = ignoreStrippedErrors(initDebugDirectoryPtr())) return E; // Initialize the pointer to the TLS directory. - if (Error E = initTLSDirectoryPtr()) + if (Error E = ignoreStrippedErrors(initTLSDirectoryPtr())) return E; - if (Error E = initLoadConfigPtr()) + if (Error E = ignoreStrippedErrors(initLoadConfigPtr())) return E; return Error::success(); @@ -1021,13 +1080,14 @@ Expected<const coff_section *> COFFObjectFile::getSection(int32_t Index) const { // We already verified the section table data, so no need to check again. return SectionTable + (Index - 1); } - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "section index out of bounds"); } Expected<StringRef> COFFObjectFile::getString(uint32_t Offset) const { if (StringTableSize <= 4) // Tried to get a string from an empty string table. - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, "string table empty"); if (Offset >= StringTableSize) return errorCodeToError(object_error::unexpected_eof); return StringRef(StringTable + Offset); @@ -1086,13 +1146,7 @@ uint32_t COFFObjectFile::getSymbolIndex(COFFSymbolRef Symbol) const { Expected<StringRef> COFFObjectFile::getSectionName(const coff_section *Sec) const { - StringRef Name; - if (Sec->Name[COFF::NameSize - 1] == 0) - // Null terminated, let ::strlen figure out the length. - Name = Sec->Name; - else - // Not null terminated, use all 8 bytes. - Name = StringRef(Sec->Name, COFF::NameSize); + StringRef Name = StringRef(Sec->Name, COFF::NameSize).split('\0').first; // Check for string table entry. First byte is '/'. if (Name.startswith("/")) { @@ -1414,7 +1468,8 @@ ImportDirectoryEntryRef::lookup_table_symbols() const { Error ImportDirectoryEntryRef::getName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (Error E = OwningObject->getRvaPtr(ImportTable[Index].NameRVA, IntPtr)) + if (Error E = OwningObject->getRvaPtr(ImportTable[Index].NameRVA, IntPtr, + "import directory name")) return E; Result = StringRef(reinterpret_cast<const char *>(IntPtr)); return Error::success(); @@ -1460,7 +1515,8 @@ DelayImportDirectoryEntryRef::imported_symbols() const { Error DelayImportDirectoryEntryRef::getName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (Error E = OwningObject->getRvaPtr(Table[Index].Name, IntPtr)) + if (Error E = OwningObject->getRvaPtr(Table[Index].Name, IntPtr, + "delay import directory name")) return E; Result = StringRef(reinterpret_cast<const char *>(IntPtr)); return Error::success(); @@ -1477,7 +1533,7 @@ Error DelayImportDirectoryEntryRef::getImportAddress(int AddrIndex, uint32_t RVA = Table[Index].DelayImportAddressTable + AddrIndex * (OwningObject->is64() ? 8 : 4); uintptr_t IntPtr = 0; - if (Error E = OwningObject->getRvaPtr(RVA, IntPtr)) + if (Error E = OwningObject->getRvaPtr(RVA, IntPtr, "import address")) return E; if (OwningObject->is64()) Result = *reinterpret_cast<const ulittle64_t *>(IntPtr); @@ -1499,7 +1555,8 @@ void ExportDirectoryEntryRef::moveNext() { // by ordinal, the empty string is set as a result.
Error ExportDirectoryEntryRef::getDllName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (Error E = OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr)) + if (Error E = + OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr, "dll name")) return E; Result = StringRef(reinterpret_cast<const char *>(IntPtr)); return Error::success(); @@ -1520,8 +1577,8 @@ Error ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const { // Returns the address of the current export symbol. Error ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const { uintptr_t IntPtr = 0; - if (Error EC = - OwningObject->getRvaPtr(ExportTable->ExportAddressTableRVA, IntPtr)) + if (Error EC = OwningObject->getRvaPtr(ExportTable->ExportAddressTableRVA, + IntPtr, "export address")) return EC; const export_address_table_entry *entry = reinterpret_cast<const export_address_table_entry *>(IntPtr); @@ -1534,8 +1591,8 @@ Error ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const { Error ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (Error EC = - OwningObject->getRvaPtr(ExportTable->OrdinalTableRVA, IntPtr)) + if (Error EC = OwningObject->getRvaPtr(ExportTable->OrdinalTableRVA, IntPtr, + "export ordinal table")) return EC; const ulittle16_t *Start = reinterpret_cast<const ulittle16_t *>(IntPtr); @@ -1545,11 +1602,12 @@ ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { I < E; ++I, ++Offset) { if (*I != Index) continue; - if (Error EC = - OwningObject->getRvaPtr(ExportTable->NamePointerRVA, IntPtr)) + if (Error EC = OwningObject->getRvaPtr(ExportTable->NamePointerRVA, IntPtr, + "export table entry")) return EC; const ulittle32_t *NamePtr = reinterpret_cast<const ulittle32_t *>(IntPtr); - if (Error EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr)) + if (Error EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr, + "export symbol name")) return EC; Result = StringRef(reinterpret_cast<const char *>(IntPtr)); return Error::success(); @@ -1562,7 +1620,8 @@ Error ExportDirectoryEntryRef::isForwarder(bool &Result) const { const data_directory *DataEntry = OwningObject->getDataDirectory(COFF::EXPORT_TABLE); if (!DataEntry) - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "export table missing"); uint32_t RVA; if (auto EC = getExportRVA(RVA)) return EC; @@ -1577,7 +1636,7 @@ Error ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const { if (auto EC = getExportRVA(RVA)) return EC; uintptr_t IntPtr = 0; - if (auto EC = OwningObject->getRvaPtr(RVA, IntPtr)) + if (auto EC = OwningObject->getRvaPtr(RVA, IntPtr, "export forward target")) return EC; Result = StringRef(reinterpret_cast<const char *>(IntPtr)); return Error::success(); @@ -1606,7 +1665,7 @@ Error ImportedSymbolRef::getSymbolName(StringRef &Result) const { RVA = Entry64[Index].getHintNameRVA(); } uintptr_t IntPtr = 0; - if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr)) + if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr, "import symbol name")) return EC; // +2 because the first two bytes are the hint.
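// A Hint/Name table entry is a two-byte ordinal hint followed immediately by // a NUL-terminated import name, which is why the name is read at IntPtr + 2.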
Result = StringRef(reinterpret_cast<const char *>(IntPtr + 2)); @@ -1645,7 +1704,7 @@ Error ImportedSymbolRef::getOrdinal(uint16_t &Result) const { RVA = Entry64[Index].getHintNameRVA(); } uintptr_t IntPtr = 0; - if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr)) + if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr, "import symbol ordinal")) return EC; Result = *reinterpret_cast<const ulittle16_t *>(IntPtr); return Error::success(); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp new file mode 100644 index 000000000000..ca859c1f69ae --- /dev/null +++ b/llvm/lib/Object/DXContainer.cpp @@ -0,0 +1,111 @@ +//===- DXContainer.cpp - DXContainer object file implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/DXContainer.h" +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/Error.h" + +using namespace llvm; +using namespace llvm::object; + +static Error parseFailed(const Twine &Msg) { + return make_error<GenericBinaryError>(Msg.str(), object_error::parse_failed); +} + +template <typename T> +static Error readStruct(StringRef Buffer, const char *Src, T &Struct) { + // Don't read before the beginning or past the end of the file + if (Src < Buffer.begin() || Src + sizeof(T) > Buffer.end()) + return parseFailed("Reading structure out of file bounds"); + + memcpy(&Struct, Src, sizeof(T)); + // DXContainer is always little endian + if (sys::IsBigEndianHost) + Struct.swapBytes(); + return Error::success(); +} + +template <typename T> +static Error readInteger(StringRef Buffer, const char *Src, T &Val) { + static_assert(std::is_integral<T>::value, + "Cannot call readInteger on non-integral type."); + assert(reinterpret_cast<uintptr_t>(Src) % alignof(T) == 0 && + "Unaligned read of value from buffer!"); + // Don't read before the beginning or past the end of the file + if (Src < Buffer.begin() || Src + sizeof(T) > Buffer.end()) + return parseFailed("Reading structure out of file bounds"); + + Val = *reinterpret_cast<const T *>(Src); + // DXContainer is always little endian + if (sys::IsBigEndianHost) + sys::swapByteOrder(Val); + return Error::success(); +} + +DXContainer::DXContainer(MemoryBufferRef O) : Data(O) {} + +Error DXContainer::parseHeader() { + return readStruct(Data.getBuffer(), Data.getBuffer().data(), Header); +} + +Error DXContainer::parseDXILHeader(uint32_t Offset) { + if (DXIL) + return parseFailed("More than one DXIL part is present in the file"); + const char *Current = Data.getBuffer().data() + Offset; + dxbc::ProgramHeader Header; + if (Error Err = readStruct(Data.getBuffer(), Current, Header)) + return Err; + Current += offsetof(dxbc::ProgramHeader, Bitcode) + Header.Bitcode.Offset; + DXIL.emplace(std::make_pair(Header, Current)); + return Error::success(); +} + +Error DXContainer::parsePartOffsets() { + const char *Current = Data.getBuffer().data() + sizeof(dxbc::Header); + for (uint32_t Part = 0; Part < Header.PartCount; ++Part) { + uint32_t PartOffset; + if (Error Err = readInteger(Data.getBuffer(), Current, PartOffset)) + return Err; + Current += sizeof(uint32_t); + // We need to ensure that each part offset leaves enough space for a part + // header. To prevent overflow, we subtract the part header size from the + // buffer size, rather than adding to the offset.
Since the file header is + // larger than the part header we can't reach this code unless the buffer + // is larger than the part header, so this can't underflow. + if (PartOffset > Data.getBufferSize() - sizeof(dxbc::PartHeader)) + return parseFailed("Part offset points beyond boundary of the file"); + PartOffsets.push_back(PartOffset); + + // If this isn't a dxil part stop here... + if (Data.getBuffer().substr(PartOffset, 4) != "DXIL") + continue; + if (Error Err = parseDXILHeader(PartOffset + sizeof(dxbc::PartHeader))) + return Err; + } + return Error::success(); +} + +Expected DXContainer::create(MemoryBufferRef Object) { + DXContainer Container(Object); + if (Error Err = Container.parseHeader()) + return std::move(Err); + if (Error Err = Container.parsePartOffsets()) + return std::move(Err); + return Container; +} + +void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { + StringRef Buffer = Container.Data.getBuffer(); + const char *Current = Buffer.data() + Offset; + // Offsets are validated during parsing, so all offsets in the container are + // valid and contain enough readable data to read a header. + cantFail(readStruct(Buffer, Current, IteratorState.Part)); + IteratorState.Data = + StringRef(Current + sizeof(dxbc::PartHeader), IteratorState.Part.Size); + IteratorState.Offset = Offset; +} diff --git a/llvm/lib/Object/Decompressor.cpp b/llvm/lib/Object/Decompressor.cpp index 11efd857d1a1..de067ed59ac5 100644 --- a/llvm/lib/Object/Decompressor.cpp +++ b/llvm/lib/Object/Decompressor.cpp @@ -8,7 +8,7 @@ #include "llvm/Object/Decompressor.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Compression.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6e56da1a31f3..6acf4543be5a 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -166,6 +166,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine, break; } break; + case ELF::EM_LOONGARCH: + switch (Type) { +#include "llvm/BinaryFormat/ELFRelocs/LoongArch.def" + default: + break; + } + break; default: break; } @@ -288,6 +295,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_SYMPART); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_EHDR); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_PHDR); + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_BB_ADDR_MAP_V0); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_BB_ADDR_MAP); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); @@ -561,11 +569,9 @@ Expected ELFFile::dynamicEntries() const { } if (Dyn.empty()) - // TODO: this error is untested. return createError("invalid empty dynamic section"); if (Dyn.back().d_tag != ELF::DT_NULL) - // TODO: this error is untested. return createError("dynamic sections must be DT_NULL terminated"); return Dyn; @@ -635,7 +641,6 @@ ELFFile::decodeBBAddrMap(const Elf_Shdr &Sec) const { DataExtractor::Cursor Cur(0); Error ULEBSizeErr = Error::success(); - // Helper to extract and decode the next ULEB128 value as uint32_t. // Returns zero and sets ULEBSizeErr if the ULEB128 value exceeds the uint32_t // limit. 
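The helper this comment describes appears in the next hunk. For reference, ULEB128 encodes an integer as little-endian base-128 digits, with the top bit of each byte flagging continuation; a freestanding decoder with the same uint32_t cap might look like the following (hypothetical helper, not the DataExtractor API):

#include <cstdint>
#include <optional>

// Decode one unsigned LEB128 value, rejecting results that do not fit in
// 32 bits. Buf is advanced past the consumed bytes; returns std::nullopt on
// truncated or unterminated input, or on values over UINT32_MAX.
static std::optional<uint32_t> decodeULEB128AsU32(const uint8_t *&Buf,
                                                  const uint8_t *End) {
  uint64_t Value = 0;
  for (unsigned Shift = 0; Buf != End && Shift < 64; Shift += 7) {
    uint8_t Byte = *Buf++;
    Value |= uint64_t(Byte & 0x7f) << Shift;
    if (!(Byte & 0x80))
      return Value <= UINT32_MAX ? std::optional<uint32_t>(uint32_t(Value))
                                 : std::nullopt;
  }
  return std::nullopt;
}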
@@ -655,18 +660,34 @@ ELFFile::decodeBBAddrMap(const Elf_Shdr &Sec) const { return static_cast(Value); }; + uint8_t Version = 0; while (!ULEBSizeErr && Cur && Cur.tell() < Content.size()) { + if (Sec.sh_type == ELF::SHT_LLVM_BB_ADDR_MAP) { + Version = Data.getU8(Cur); + if (!Cur) + break; + if (Version > 1) + return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + + Twine(static_cast(Version))); + Data.getU8(Cur); // Feature byte + } uintX_t Address = static_cast(Data.getAddress(Cur)); uint32_t NumBlocks = ReadULEB128AsUInt32(); std::vector BBEntries; + uint32_t PrevBBEndOffset = 0; for (uint32_t BlockID = 0; !ULEBSizeErr && Cur && (BlockID < NumBlocks); ++BlockID) { uint32_t Offset = ReadULEB128AsUInt32(); uint32_t Size = ReadULEB128AsUInt32(); uint32_t Metadata = ReadULEB128AsUInt32(); + if (Version >= 1) { + // Offset is calculated relative to the end of the previous BB. + Offset += PrevBBEndOffset; + PrevBBEndOffset = Offset + Size; + } BBEntries.push_back({Offset, Size, Metadata}); } - FunctionEntries.push_back({Address, BBEntries}); + FunctionEntries.push_back({Address, std::move(BBEntries)}); } // Either Cur is in the error state, or ULEBSizeError is set (not both), but // we join the two errors here to be safe. diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index cf1f12d9a9a7..38de669f1d3d 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -21,7 +21,6 @@ #include "llvm/Object/Error.h" #include "llvm/Support/ARMAttributeParser.h" #include "llvm/Support/ARMBuildAttributes.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/RISCVAttributeParser.h" @@ -31,7 +30,6 @@ #include #include #include -#include #include using namespace llvm; @@ -169,11 +167,11 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { bool isV7 = false; Optional Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch); - if (Attr.hasValue()) + if (Attr) isV7 = Attr.getValue() == ARMBuildAttrs::v7; Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { case ARMBuildAttrs::ApplicationProfile: Features.AddFeature("aclass"); @@ -192,7 +190,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { } Attr = Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { default: break; @@ -207,7 +205,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { } Attr = Attributes.getAttributeValue(ARMBuildAttrs::FP_arch); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { default: break; @@ -231,7 +229,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { } Attr = Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { default: break; @@ -250,7 +248,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { } Attr = Attributes.getAttributeValue(ARMBuildAttrs::MVE_arch); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { default: break; @@ -269,7 +267,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { } Attr = Attributes.getAttributeValue(ARMBuildAttrs::DIV_use); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { default: break; @@ -305,11 +303,11 @@ SubtargetFeatures ELFObjectFileBase::getRISCVFeatures() const { } Optional Attr = 
Attributes.getAttributeString(RISCVAttrs::ARCH); - if (Attr.hasValue()) { + if (Attr) { // The Arch pattern is [rv32|rv64][i|e]version(_[m|a|f|d|c]version)* // Version string pattern is (major)p(minor). Major and minor are optional. // For example, a version number could be 2p0, 2, or p92. - StringRef Arch = Attr.getValue(); + StringRef Arch = *Attr; if (Arch.consume_front("rv32")) Features.AddFeature("64bit", false); else if (Arch.consume_front("rv64")) @@ -360,6 +358,8 @@ Optional ELFObjectFileBase::tryGetCPUName() const { switch (getEMachine()) { case ELF::EM_AMDGPU: return getAMDGPUCPUName(); + case ELF::EM_PPC64: + return StringRef("future"); default: return None; } @@ -461,6 +461,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx90a"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: return "gfx90c"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: + return "gfx940"; // AMDGCN GFX10. case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: @@ -483,6 +485,18 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx1034"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: return "gfx1035"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036: + return "gfx1036"; + + // AMDGCN GFX11. + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100: + return "gfx1100"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: + return "gfx1101"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: + return "gfx1102"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: + return "gfx1103"; default: llvm_unreachable("Unknown EF_AMDGPU_MACH value"); } @@ -509,7 +523,7 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { Optional Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch); - if (Attr.hasValue()) { + if (Attr) { switch (Attr.getValue()) { case ARMBuildAttrs::v4: Triple += "v4"; @@ -541,7 +555,7 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { case ARMBuildAttrs::v7: { Optional ArchProfileAttr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile); - if (ArchProfileAttr.hasValue() && + if (ArchProfileAttr && ArchProfileAttr.getValue() == ARMBuildAttrs::MicroControllerProfile) Triple += "v7m"; else @@ -572,6 +586,9 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { case ARMBuildAttrs::v8_1_M_Main: Triple += "v8.1m.main"; break; + case ARMBuildAttrs::v9_A: + Triple += "v9a"; + break; } } if (!isLittleEndian()) @@ -655,6 +672,36 @@ ELFObjectFileBase::getPltAddresses() const { return Result; } +template +Expected> +readBBAddrMapImpl(const ELFFile &EF, + Optional TextSectionIndex) { + using Elf_Shdr = typename ELFT::Shdr; + std::vector BBAddrMaps; + const auto &Sections = cantFail(EF.sections()); + for (const Elf_Shdr &Sec : Sections) { + if (Sec.sh_type != ELF::SHT_LLVM_BB_ADDR_MAP && + Sec.sh_type != ELF::SHT_LLVM_BB_ADDR_MAP_V0) + continue; + if (TextSectionIndex) { + Expected TextSecOrErr = EF.getSection(Sec.sh_link); + if (!TextSecOrErr) + return createError("unable to get the linked-to section for " + + describe(EF, Sec) + ": " + + toString(TextSecOrErr.takeError())); + if (*TextSectionIndex != std::distance(Sections.begin(), *TextSecOrErr)) + continue; + } + Expected> BBAddrMapOrErr = EF.decodeBBAddrMap(Sec); + if (!BBAddrMapOrErr) + return createError("unable to read " + describe(EF, Sec) + ": " + + toString(BBAddrMapOrErr.takeError())); + std::move(BBAddrMapOrErr->begin(), BBAddrMapOrErr->end(), + std::back_inserter(BBAddrMaps)); + } + return BBAddrMaps; +} + template static Expected> readDynsymVersionsImpl(const ELFFile &EF, @@ -723,3 +770,17 @@ ELFObjectFileBase::readDynsymVersions() 
const {
   return readDynsymVersionsImpl(cast<ELF64BEObjectFile>(this)->getELFFile(),
                                 Symbols);
 }
+
+Expected<std::vector<BBAddrMap>>
+ELFObjectFileBase::readBBAddrMap(Optional<unsigned> TextSectionIndex) const {
+  if (const auto *Obj = dyn_cast<ELF32LEObjectFile>(this))
+    return readBBAddrMapImpl(Obj->getELFFile(), TextSectionIndex);
+  if (const auto *Obj = dyn_cast<ELF64LEObjectFile>(this))
+    return readBBAddrMapImpl(Obj->getELFFile(), TextSectionIndex);
+  if (const auto *Obj = dyn_cast<ELF32BEObjectFile>(this))
+    return readBBAddrMapImpl(Obj->getELFFile(), TextSectionIndex);
+  if (const auto *Obj = cast<ELF64BEObjectFile>(this))
+    return readBBAddrMapImpl(Obj->getELFFile(), TextSectionIndex);
+  else
+    llvm_unreachable("Unsupported binary format");
+}
diff --git a/llvm/lib/Object/Error.cpp b/llvm/lib/Object/Error.cpp
index bc75bc6c0445..6d1e3f2a59d0 100644
--- a/llvm/lib/Object/Error.cpp
+++ b/llvm/lib/Object/Error.cpp
@@ -52,6 +52,8 @@ std::string _object_error_category::message(int EV) const {
     return "Bitcode section not found in object file";
   case object_error::invalid_symbol_index:
     return "Invalid symbol index";
+  case object_error::section_stripped:
+    return "Section has been stripped from the object file";
   }
   llvm_unreachable("An enumerator of object_error does not have a message "
                    "defined.");
diff --git a/llvm/lib/Object/IRObjectFile.cpp b/llvm/lib/Object/IRObjectFile.cpp
index c653262791cc..091930988bd0 100644
--- a/llvm/lib/Object/IRObjectFile.cpp
+++ b/llvm/lib/Object/IRObjectFile.cpp
@@ -11,20 +11,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/IRObjectFile.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
-#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 using namespace object;
 
+namespace llvm {
+class LLVMContext;
+class raw_ostream;
+} // namespace llvm
+
 IRObjectFile::IRObjectFile(MemoryBufferRef Object,
                            std::vector<std::unique_ptr<Module>> Mods)
     : SymbolicFile(Binary::ID_IR, Object), Mods(std::move(Mods)) {
@@ -32,7 +32,7 @@ IRObjectFile::IRObjectFile(MemoryBufferRef Object,
     SymTab.addModule(M.get());
 }
 
-IRObjectFile::~IRObjectFile() {}
+IRObjectFile::~IRObjectFile() = default;
 
 static ModuleSymbolTable::Symbol getSym(DataRefImpl &Symb) {
   return *reinterpret_cast<ModuleSymbolTable::Symbol *>(Symb.p);
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index dea3d90d3560..5a7ecdb1fc25 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -24,7 +24,6 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/StringTableBuilder.h"
-#include "llvm/Object/IRObjectFile.h"
 #include "llvm/Object/ModuleSymbolTable.h"
 #include "llvm/Object/SymbolicFile.h"
 #include "llvm/Support/Allocator.h"
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 3d95b18f4672..2f463a1bd458 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -34,7 +34,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/raw_ostream.h"
@@ -1303,7 +1306,6 @@
MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, } const char *DyldIdLoadCmd = nullptr; - const char *FuncStartsLoadCmd = nullptr; const char *SplitInfoLoadCmd = nullptr; const char *CodeSignDrsLoadCmd = nullptr; const char *CodeSignLoadCmd = nullptr; @@ -1381,6 +1380,11 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, if ((Err = checkDyldInfoCommand(*this, Load, I, &DyldInfoLoadCmd, "LC_DYLD_INFO_ONLY", Elements))) return; + } else if (Load.C.cmd == MachO::LC_DYLD_CHAINED_FIXUPS) { + if ((Err = checkLinkeditDataCommand( + *this, Load, I, &DyldChainedFixupsLoadCmd, + "LC_DYLD_CHAINED_FIXUPS", Elements, "chained fixups"))) + return; } else if (Load.C.cmd == MachO::LC_UUID) { if (Load.C.cmdsize != sizeof(MachO::uuid_command)) { Err = malformedError("LC_UUID command " + Twine(I) + " has incorrect " @@ -1596,9 +1600,9 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, return; // Note: LC_TWOLEVEL_HINTS is really obsolete and is not supported. } else if (Load.C.cmd == MachO::LC_TWOLEVEL_HINTS) { - if ((Err = checkTwoLevelHintsCommand(*this, Load, I, - &TwoLevelHintsLoadCmd, Elements))) - return; + if ((Err = checkTwoLevelHintsCommand(*this, Load, I, + &TwoLevelHintsLoadCmd, Elements))) + return; } else if (Load.C.cmd == MachO::LC_IDENT) { // Note: LC_IDENT is ignored. continue; @@ -2993,7 +2997,9 @@ void ExportEntry::pushNode(uint64_t offset) { return; } if (O != nullptr) { - if (State.Other > O->getLibraryCount()) { + // Only positive numbers represent library ordinals. Zero and negative + // numbers have special meaning (see BindSpecialDylib). + if ((int64_t)State.Other > 0 && State.Other > O->getLibraryCount()) { *E = malformedError( "bad library ordinal: " + Twine((int)State.Other) + " (max " + Twine((int)O->getLibraryCount()) + @@ -3186,6 +3192,106 @@ iterator_range MachOObjectFile::exports(Error &Err) const { return exports(Err, getDyldInfoExportsTrie(), this); } +MachOAbstractFixupEntry::MachOAbstractFixupEntry(Error *E, + const MachOObjectFile *O) + : E(E), O(O) { + // Cache the vmaddress of __TEXT + for (const auto &Command : O->load_commands()) { + if (Command.C.cmd == MachO::LC_SEGMENT) { + MachO::segment_command SLC = O->getSegmentLoadCommand(Command); + if (StringRef(SLC.segname) == StringRef("__TEXT")) { + TextAddress = SLC.vmaddr; + break; + } + } else if (Command.C.cmd == MachO::LC_SEGMENT_64) { + MachO::segment_command_64 SLC_64 = O->getSegment64LoadCommand(Command); + if (StringRef(SLC_64.segname) == StringRef("__TEXT")) { + TextAddress = SLC_64.vmaddr; + break; + } + } + } +} + +int32_t MachOAbstractFixupEntry::segmentIndex() const { return SegmentIndex; } + +uint64_t MachOAbstractFixupEntry::segmentOffset() const { + return SegmentOffset; +} + +uint64_t MachOAbstractFixupEntry::segmentAddress() const { + return O->BindRebaseAddress(SegmentIndex, 0); +} + +StringRef MachOAbstractFixupEntry::segmentName() const { + return O->BindRebaseSegmentName(SegmentIndex); +} + +StringRef MachOAbstractFixupEntry::sectionName() const { + return O->BindRebaseSectionName(SegmentIndex, SegmentOffset); +} + +uint64_t MachOAbstractFixupEntry::address() const { + return O->BindRebaseAddress(SegmentIndex, SegmentOffset); +} + +StringRef MachOAbstractFixupEntry::symbolName() const { return SymbolName; } + +int64_t MachOAbstractFixupEntry::addend() const { return Addend; } + +uint32_t MachOAbstractFixupEntry::flags() const { return Flags; } + +int MachOAbstractFixupEntry::ordinal() const { return Ordinal; } + 
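Taken together with the iterator plumbing added below (fixupTable and MachOChainedFixupEntry), the intended consumer shape is roughly the following. A hedged sketch only: at this revision moveNext() immediately marks the entry Done, so the loop yields nothing yet.

#include "llvm/ADT/Twine.h"
#include "llvm/Object/MachO.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

// Walk the chained-fixups table of an already-parsed Mach-O file and print
// where each fixup lands; illustrative of the interface, not of output.
static void dumpChainedFixups(MachOObjectFile &Obj) {
  Error Err = Error::success();
  for (const MachOChainedFixupEntry &Fixup : Obj.fixupTable(Err))
    outs() << Fixup.segmentName() << "+0x"
           << Twine::utohexstr(Fixup.segmentOffset()) << "\n";
  if (Err)
    report_fatal_error(std::move(Err));
}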
+StringRef MachOAbstractFixupEntry::typeName() const { return "unknown"; } + +void MachOAbstractFixupEntry::moveToFirst() { + SegmentOffset = 0; + SegmentIndex = -1; + Ordinal = 0; + Flags = 0; + Addend = 0; + Done = false; +} + +void MachOAbstractFixupEntry::moveToEnd() { Done = true; } + +MachOChainedFixupEntry::MachOChainedFixupEntry(Error *E, + const MachOObjectFile *O, + bool Parse) + : MachOAbstractFixupEntry(E, O) { + ErrorAsOutParameter e(E); + if (!Parse) + return; + if (auto FixupTargetsOrErr = O->getDyldChainedFixupTargets()) + FixupTargets = *FixupTargetsOrErr; + else { + *E = FixupTargetsOrErr.takeError(); + return; + } +} + +void MachOChainedFixupEntry::moveToFirst() { + MachOAbstractFixupEntry::moveToFirst(); + FixupIndex = 0; + moveNext(); +} + +void MachOChainedFixupEntry::moveToEnd() { + MachOAbstractFixupEntry::moveToEnd(); +} + +void MachOChainedFixupEntry::moveNext() { Done = true; } + +bool MachOChainedFixupEntry::operator==( + const MachOChainedFixupEntry &Other) const { + if (Done == Other.Done) + return true; + if ((FixupIndex == Other.FixupIndex)) + return true; + return false; +} + MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O, ArrayRef Bytes, bool is64Bit) : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), @@ -4194,6 +4300,16 @@ iterator_range MachOObjectFile::weakBindTable(Error &Err) { MachOBindEntry::Kind::Weak); } +iterator_range MachOObjectFile::fixupTable(Error &Err) { + MachOChainedFixupEntry Start(&Err, this, true); + Start.moveToFirst(); + + MachOChainedFixupEntry Finish(&Err, this, false); + Finish.moveToEnd(); + + return make_range(fixup_iterator(Start), fixup_iterator(Finish)); +} + MachOObjectFile::load_command_iterator MachOObjectFile::begin_load_commands() const { return LoadCommands.begin(); @@ -4649,6 +4765,72 @@ ArrayRef MachOObjectFile::getDyldInfoLazyBindOpcodes() const { return makeArrayRef(Ptr, DyldInfo.lazy_bind_size); } +Expected> +MachOObjectFile::getChainedFixupsHeader() const { + // Load the dyld chained fixups load command. + if (!DyldChainedFixupsLoadCmd) + return llvm::None; + auto DyldChainedFixupsOrErr = getStructOrErr( + *this, DyldChainedFixupsLoadCmd); + if (!DyldChainedFixupsOrErr) + return DyldChainedFixupsOrErr.takeError(); + MachO::linkedit_data_command DyldChainedFixups = DyldChainedFixupsOrErr.get(); + + // If the load command is present but the data offset has been zeroed out, + // as is the case for dylib stubs, return None (no error). + uint64_t CFHeaderOffset = DyldChainedFixups.dataoff; + if (CFHeaderOffset == 0) + return DyldChainedFixupsOrErr.takeError(); + + // Load the dyld chained fixups header. + const char *CFHeaderPtr = getPtr(*this, CFHeaderOffset); + auto CFHeaderOrErr = + getStructOrErr(*this, CFHeaderPtr); + if (!CFHeaderOrErr) + return CFHeaderOrErr.takeError(); + MachO::dyld_chained_fixups_header CFHeader = CFHeaderOrErr.get(); + + // Reject unknown chained fixup formats. + if (CFHeader.fixups_version != 0) + return malformedError(Twine("bad chained fixups: unknown version: ") + + Twine(CFHeader.fixups_version)); + if (CFHeader.imports_format < 1 || CFHeader.imports_format > 3) + return malformedError( + Twine("bad chained fixups: unknown imports format: ") + + Twine(CFHeader.imports_format)); + + // Validate the image format. + // + // Load the image starts. 
+ uint64_t CFImageStartsOffset = (CFHeaderOffset + CFHeader.starts_offset); + if (CFHeader.starts_offset < sizeof(MachO::dyld_chained_fixups_header)) { + return malformedError(Twine("bad chained fixups: image starts offset ") + + Twine(CFHeader.starts_offset) + + " overlaps with chained fixups header"); + } + uint32_t EndOffset = DyldChainedFixups.dataoff + DyldChainedFixups.datasize; + if (CFImageStartsOffset + sizeof(MachO::dyld_chained_starts_in_image) > + EndOffset) { + return malformedError(Twine("bad chained fixups: image starts end ") + + Twine(CFImageStartsOffset + + sizeof(MachO::dyld_chained_starts_in_image)) + + " extends past end " + Twine(EndOffset)); + } + + return CFHeader; +} + +Expected> +MachOObjectFile::getDyldChainedFixupTargets() const { + auto CFHeaderOrErr = getChainedFixupsHeader(); + if (!CFHeaderOrErr) + return CFHeaderOrErr.takeError(); + std::vector Targets; + if (!(*CFHeaderOrErr)) + return Targets; + return Targets; +} + ArrayRef MachOObjectFile::getDyldInfoExportsTrie() const { if (!DyldInfoLoadCmd) return None; @@ -4663,6 +4845,21 @@ ArrayRef MachOObjectFile::getDyldInfoExportsTrie() const { return makeArrayRef(Ptr, DyldInfo.export_size); } +SmallVector MachOObjectFile::getFunctionStarts() const { + if (!FuncStartsLoadCmd) + return {}; + + auto InfoOrErr = + getStructOrErr(*this, FuncStartsLoadCmd); + if (!InfoOrErr) + return {}; + + MachO::linkedit_data_command Info = InfoOrErr.get(); + SmallVector FunctionStarts; + this->ReadULEB128s(Info.dataoff, FunctionStarts); + return std::move(FunctionStarts); +} + ArrayRef MachOObjectFile::getUuid() const { if (!UuidLoadCmd) return None; @@ -4778,3 +4975,23 @@ MachOObjectFile::mapReflectionSectionNameToEnumValue( .Default(llvm::binaryformat::Swift5ReflectionSectionKind::unknown); #undef HANDLE_SWIFT_SECTION } + +bool MachOObjectFile::isMachOPairedReloc(uint64_t RelocType, uint64_t Arch) { + switch (Arch) { + case Triple::x86: + return RelocType == MachO::GENERIC_RELOC_SECTDIFF || + RelocType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF; + case Triple::x86_64: + return RelocType == MachO::X86_64_RELOC_SUBTRACTOR; + case Triple::arm: + case Triple::thumb: + return RelocType == MachO::ARM_RELOC_SECTDIFF || + RelocType == MachO::ARM_RELOC_LOCAL_SECTDIFF || + RelocType == MachO::ARM_RELOC_HALF || + RelocType == MachO::ARM_RELOC_HALF_SECTDIFF; + case Triple::aarch64: + return RelocType == MachO::ARM64_RELOC_SUBTRACTOR; + default: + return false; + } +} diff --git a/llvm/lib/Object/MachOUniversal.cpp b/llvm/lib/Object/MachOUniversal.cpp index f3ce005e6ef9..c2c2b67814dc 100644 --- a/llvm/lib/Object/MachOUniversal.cpp +++ b/llvm/lib/Object/MachOUniversal.cpp @@ -15,9 +15,9 @@ #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Host.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/type_traits.h" using namespace llvm; using namespace object; diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp index ae1ff09a4f8f..333706baf8c1 100644 --- a/llvm/lib/Object/MachOUniversalWriter.cpp +++ b/llvm/lib/Object/MachOUniversalWriter.cpp @@ -12,13 +12,21 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/MachOUniversalWriter.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include 
"llvm/Object/Archive.h" #include "llvm/Object/Binary.h" -#include "llvm/Object/Error.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace object; @@ -205,7 +213,7 @@ Expected Slice::create(const Archive &A, LLVMContext *LLVMCtx) { .c_str()); if (MFO) { - Slice ArchiveSlice(*(MFO.get()), MFO->is64Bit() ? 3 : 2); + Slice ArchiveSlice(*(MFO), MFO->is64Bit() ? 3 : 2); ArchiveSlice.B = &A; return ArchiveSlice; } diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp index 954d1f09f4e9..11274a7fcc16 100644 --- a/llvm/lib/Object/ModuleSymbolTable.cpp +++ b/llvm/lib/Object/ModuleSymbolTable.cpp @@ -15,7 +15,6 @@ #include "llvm/Object/ModuleSymbolTable.h" #include "RecordStreamer.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -27,7 +26,6 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmParser.h" @@ -39,7 +37,6 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" diff --git a/llvm/lib/Object/Object.cpp b/llvm/lib/Object/Object.cpp index 576eb8d069d6..d5e67160dfa3 100644 --- a/llvm/lib/Object/Object.cpp +++ b/llvm/lib/Object/Object.cpp @@ -120,6 +120,8 @@ LLVMBinaryType LLVMBinaryGetType(LLVMBinaryRef BR) { return LLVMBinaryTypeMachO64L; case ID_MachO64B: return LLVMBinaryTypeMachO64B; + case ID_Offload: + return LLVMBinaryTypeOffload; case ID_Wasm: return LLVMBinaryTypeWasm; case ID_StartObjects: diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp index 6fd02f3b9592..1be8f11751be 100644 --- a/llvm/lib/Object/ObjectFile.cpp +++ b/llvm/lib/Object/ObjectFile.cpp @@ -21,10 +21,9 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include @@ -147,6 +146,9 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type, case file_magic::pdb: case file_magic::minidump: case file_magic::goff_object: + case file_magic::cuda_fatbinary: + case file_magic::offload_binary: + case file_magic::dxcontainer_object: return errorCodeToError(object_error::invalid_file_type); case file_magic::tapi_file: return errorCodeToError(object_error::invalid_file_type); @@ -198,3 +200,12 @@ ObjectFile::createObjectFile(StringRef ObjectPath) { return OwningBinary(std::move(Obj), std::move(Buffer)); } + +bool ObjectFile::isReflectionSectionStrippable( + llvm::binaryformat::Swift5ReflectionSectionKind ReflectionSectionKind) + const { + using llvm::binaryformat::Swift5ReflectionSectionKind; + return ReflectionSectionKind == Swift5ReflectionSectionKind::fieldmd || + 
ReflectionSectionKind == Swift5ReflectionSectionKind::reflstr || + ReflectionSectionKind == Swift5ReflectionSectionKind::assocty; +} diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp new file mode 100644 index 000000000000..21946ec2d6fb --- /dev/null +++ b/llvm/lib/Object/OffloadBinary.cpp @@ -0,0 +1,164 @@ +//===- Offloading.cpp - Utilities for handling offloading code -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/OffloadBinary.h" + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/FileOutputBuffer.h" + +using namespace llvm; +using namespace llvm::object; + +Expected> +OffloadBinary::create(MemoryBufferRef Buf) { + if (Buf.getBufferSize() < sizeof(Header) + sizeof(Entry)) + return errorCodeToError(object_error::parse_failed); + + // Check for 0x10FF1OAD magic bytes. + if (identify_magic(Buf.getBuffer()) != file_magic::offload_binary) + return errorCodeToError(object_error::parse_failed); + + // Make sure that the data has sufficient alignment. + if (!isAddrAligned(Align(getAlignment()), Buf.getBufferStart())) + return errorCodeToError(object_error::parse_failed); + + const char *Start = Buf.getBufferStart(); + const Header *TheHeader = reinterpret_cast(Start); + if (TheHeader->Version != OffloadBinary::Version) + return errorCodeToError(object_error::parse_failed); + + if (TheHeader->Size > Buf.getBufferSize() || + TheHeader->EntryOffset > TheHeader->Size - sizeof(Entry) || + TheHeader->EntrySize > TheHeader->Size - sizeof(Header)) + return errorCodeToError(object_error::unexpected_eof); + + const Entry *TheEntry = + reinterpret_cast(&Start[TheHeader->EntryOffset]); + + if (TheEntry->ImageOffset > Buf.getBufferSize() || + TheEntry->StringOffset > Buf.getBufferSize()) + return errorCodeToError(object_error::unexpected_eof); + + return std::unique_ptr( + new OffloadBinary(Buf, TheHeader, TheEntry)); +} + +std::unique_ptr +OffloadBinary::write(const OffloadingImage &OffloadingData) { + // Create a null-terminated string table with all the used strings. + StringTableBuilder StrTab(StringTableBuilder::ELF); + for (auto &KeyAndValue : OffloadingData.StringData) { + StrTab.add(KeyAndValue.getKey()); + StrTab.add(KeyAndValue.getValue()); + } + StrTab.finalize(); + + uint64_t StringEntrySize = + sizeof(StringEntry) * OffloadingData.StringData.size(); + + // Make sure the image we're wrapping around is aligned as well. + uint64_t BinaryDataSize = alignTo(sizeof(Header) + sizeof(Entry) + + StringEntrySize + StrTab.getSize(), + getAlignment()); + + // Create the header and fill in the offsets. The entry will be directly + // placed after the header in memory. Align the size to the alignment of the + // header so this can be placed contiguously in a single section. + Header TheHeader; + TheHeader.Size = alignTo( + BinaryDataSize + OffloadingData.Image->getBufferSize(), getAlignment()); + TheHeader.EntryOffset = sizeof(Header); + TheHeader.EntrySize = sizeof(Entry); + + // Create the entry using the string table offsets. The string table will be + // placed directly after the entry in memory, and the image after that. 
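To make the layout described above concrete: header, then entry, then the string-entry array and string blob, then padding up to the image alignment, then the image itself. A toy offset computation with assumed sizes; the real values come from sizeof(Header), sizeof(Entry), and the finalized string table:

#include <cstdint>

// Same arithmetic as the writer, with made-up sizes. alignUp mirrors
// llvm::alignTo for power-of-two alignments.
constexpr uint64_t alignUp(uint64_t V, uint64_t A) {
  return (V + A - 1) & ~(A - 1);
}

constexpr uint64_t HeaderSize = 32, EntrySize = 40;     // assumed
constexpr uint64_t NumStrings = 2, StringBlobSize = 25; // assumed
constexpr uint64_t StringEntrySize = 16 * NumStrings;   // assumed 2 x uint64_t
constexpr uint64_t ImageOffset =
    alignUp(HeaderSize + EntrySize + StringEntrySize + StringBlobSize, 8);
static_assert(ImageOffset == 136, "image lands on the next 8-byte boundary");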
+ Entry TheEntry; + TheEntry.TheImageKind = OffloadingData.TheImageKind; + TheEntry.TheOffloadKind = OffloadingData.TheOffloadKind; + TheEntry.Flags = OffloadingData.Flags; + TheEntry.StringOffset = sizeof(Header) + sizeof(Entry); + TheEntry.NumStrings = OffloadingData.StringData.size(); + + TheEntry.ImageOffset = BinaryDataSize; + TheEntry.ImageSize = OffloadingData.Image->getBufferSize(); + + SmallVector Data; + Data.reserve(TheHeader.Size); + raw_svector_ostream OS(Data); + OS << StringRef(reinterpret_cast(&TheHeader), sizeof(Header)); + OS << StringRef(reinterpret_cast(&TheEntry), sizeof(Entry)); + for (auto &KeyAndValue : OffloadingData.StringData) { + uint64_t Offset = sizeof(Header) + sizeof(Entry) + StringEntrySize; + StringEntry Map{Offset + StrTab.getOffset(KeyAndValue.getKey()), + Offset + StrTab.getOffset(KeyAndValue.getValue())}; + OS << StringRef(reinterpret_cast(&Map), sizeof(StringEntry)); + } + StrTab.write(OS); + // Add padding to required image alignment. + OS.write_zeros(TheEntry.ImageOffset - OS.tell()); + OS << OffloadingData.Image->getBuffer(); + + // Add final padding to required alignment. + assert(TheHeader.Size >= OS.tell() && "Too much data written?"); + OS.write_zeros(TheHeader.Size - OS.tell()); + assert(TheHeader.Size == OS.tell() && "Size mismatch"); + + return MemoryBuffer::getMemBufferCopy(OS.str()); +} + +OffloadKind object::getOffloadKind(StringRef Name) { + return llvm::StringSwitch(Name) + .Case("openmp", OFK_OpenMP) + .Case("cuda", OFK_Cuda) + .Case("hip", OFK_HIP) + .Default(OFK_None); +} + +StringRef object::getOffloadKindName(OffloadKind Kind) { + switch (Kind) { + case OFK_OpenMP: + return "openmp"; + case OFK_Cuda: + return "cuda"; + case OFK_HIP: + return "hip"; + default: + return "none"; + } +} + +ImageKind object::getImageKind(StringRef Name) { + return llvm::StringSwitch(Name) + .Case("o", IMG_Object) + .Case("bc", IMG_Bitcode) + .Case("cubin", IMG_Cubin) + .Case("fatbin", IMG_Fatbinary) + .Case("s", IMG_PTX) + .Default(IMG_None); +} + +StringRef object::getImageKindName(ImageKind Kind) { + switch (Kind) { + case IMG_Object: + return "o"; + case IMG_Bitcode: + return "bc"; + case IMG_Cubin: + return "cubin"; + case IMG_Fatbinary: + return "fatbin"; + case IMG_PTX: + return "s"; + default: + return ""; + } +} diff --git a/llvm/lib/Object/RecordStreamer.h b/llvm/lib/Object/RecordStreamer.h index 957d80f33bf4..5c6541e5052d 100644 --- a/llvm/lib/Object/RecordStreamer.h +++ b/llvm/lib/Object/RecordStreamer.h @@ -57,10 +57,10 @@ public: // Ignore COFF-specific directives; we do not need any information from them, // but the default implementation of these methods crashes, so we override // them with versions that do nothing. - void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} - void EmitCOFFSymbolStorageClass(int StorageClass) override {} - void EmitCOFFSymbolType(int Type) override {} - void EndCOFFSymbolDef() override {} + void beginCOFFSymbolDef(const MCSymbol *Symbol) override {} + void emitCOFFSymbolStorageClass(int StorageClass) override {} + void emitCOFFSymbolType(int Type) override {} + void endCOFFSymbolDef() override {} /// Record .symver aliases for later processing. 
void emitELFSymverDirective(const MCSymbol *OriginalSym, StringRef Name, diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index 00a45e2c5d4e..e14301663df3 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -11,6 +11,21 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/RelocationResolver.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/BinaryFormat/Wasm.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include namespace llvm { namespace object { @@ -63,6 +78,7 @@ static bool supportsAArch64(uint64_t Type) { switch (Type) { case ELF::R_AARCH64_ABS32: case ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_PREL16: case ELF::R_AARCH64_PREL32: case ELF::R_AARCH64_PREL64: return true; @@ -78,6 +94,8 @@ static uint64_t resolveAArch64(uint64_t Type, uint64_t Offset, uint64_t S, return (S + Addend) & 0xFFFFFFFF; case ELF::R_AARCH64_ABS64: return S + Addend; + case ELF::R_AARCH64_PREL16: + return (S + Addend - Offset) & 0xFFFF; case ELF::R_AARCH64_PREL32: return (S + Addend - Offset) & 0xFFFFFFFF; case ELF::R_AARCH64_PREL64: @@ -468,6 +486,31 @@ static uint64_t resolveRISCV(uint64_t Type, uint64_t Offset, uint64_t S, } } +static bool supportsCSKY(uint64_t Type) { + switch (Type) { + case ELF::R_CKCORE_NONE: + case ELF::R_CKCORE_ADDR32: + case ELF::R_CKCORE_PCREL32: + return true; + default: + return false; + } +} + +static uint64_t resolveCSKY(uint64_t Type, uint64_t Offset, uint64_t S, + uint64_t LocData, int64_t Addend) { + switch (Type) { + case ELF::R_CKCORE_NONE: + return LocData; + case ELF::R_CKCORE_ADDR32: + return (S + Addend) & 0xFFFFFFFF; + case ELF::R_CKCORE_PCREL32: + return (S + Addend - Offset) & 0xFFFFFFFF; + default: + llvm_unreachable("Invalid relocation type"); + } +} + static bool supportsCOFFX86(uint64_t Type) { switch (Type) { case COFF::IMAGE_REL_I386_SECREL: @@ -715,6 +758,8 @@ getRelocationResolver(const ObjectFile &Obj) { return {supportsHexagon, resolveHexagon}; case Triple::riscv32: return {supportsRISCV, resolveRISCV}; + case Triple::csky: + return {supportsCSKY, resolveCSKY}; default: return {nullptr, nullptr}; } diff --git a/llvm/lib/Object/SymbolicFile.cpp b/llvm/lib/Object/SymbolicFile.cpp index 58db5b672914..05f47cfbf2ff 100644 --- a/llvm/lib/Object/SymbolicFile.cpp +++ b/llvm/lib/Object/SymbolicFile.cpp @@ -17,18 +17,17 @@ #include "llvm/Object/Error.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/MemoryBuffer.h" -#include #include using namespace llvm; using namespace object; +namespace llvm { +class LLVMContext; +} + SymbolicFile::SymbolicFile(unsigned int Type, MemoryBufferRef Source) : Binary(Type, Source) {} diff --git a/llvm/lib/Object/TapiFile.cpp b/llvm/lib/Object/TapiFile.cpp index 83568e8d823a..596445a09e85 100644 --- a/llvm/lib/Object/TapiFile.cpp +++ b/llvm/lib/Object/TapiFile.cpp @@ -12,8 +12,12 @@ #include 
"llvm/Object/TapiFile.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/Object/Error.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/TextAPI/ArchitectureSet.h" +#include "llvm/TextAPI/InterfaceFile.h" +#include "llvm/TextAPI/Platform.h" #include "llvm/TextAPI/Symbol.h" using namespace llvm; diff --git a/llvm/lib/Object/TapiUniversal.cpp b/llvm/lib/Object/TapiUniversal.cpp index d73d93f6bd53..bf96b57f0321 100644 --- a/llvm/lib/Object/TapiUniversal.cpp +++ b/llvm/lib/Object/TapiUniversal.cpp @@ -13,7 +13,8 @@ #include "llvm/Object/TapiUniversal.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/Error.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Object/TapiFile.h" +#include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/TextAPIReader.h" using namespace llvm; @@ -47,7 +48,7 @@ TapiUniversal::~TapiUniversal() = default; Expected> TapiUniversal::ObjectForArch::getAsObjectFile() const { return std::unique_ptr(new TapiFile(Parent->getMemoryBufferRef(), - *Parent->ParsedFile.get(), + *Parent->ParsedFile, Parent->Libraries[Index].Arch)); } diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 6a19b159f3d5..ce816b097691 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -8,7 +8,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -30,7 +29,6 @@ #include #include #include -#include #define DEBUG_TYPE "wasm-object" @@ -166,23 +164,25 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) { static Error readInitExpr(wasm::WasmInitExpr &Expr, WasmObjectFile::ReadContext &Ctx) { - Expr.Opcode = readOpcode(Ctx); + auto Start = Ctx.Ptr; - switch (Expr.Opcode) { + Expr.Extended = false; + Expr.Inst.Opcode = readOpcode(Ctx); + switch (Expr.Inst.Opcode) { case wasm::WASM_OPCODE_I32_CONST: - Expr.Value.Int32 = readVarint32(Ctx); + Expr.Inst.Value.Int32 = readVarint32(Ctx); break; case wasm::WASM_OPCODE_I64_CONST: - Expr.Value.Int64 = readVarint64(Ctx); + Expr.Inst.Value.Int64 = readVarint64(Ctx); break; case wasm::WASM_OPCODE_F32_CONST: - Expr.Value.Float32 = readFloat32(Ctx); + Expr.Inst.Value.Float32 = readFloat32(Ctx); break; case wasm::WASM_OPCODE_F64_CONST: - Expr.Value.Float64 = readFloat64(Ctx); + Expr.Inst.Value.Float64 = readFloat64(Ctx); break; case wasm::WASM_OPCODE_GLOBAL_GET: - Expr.Value.Global = readULEB128(Ctx); + Expr.Inst.Value.Global = readULEB128(Ctx); break; case wasm::WASM_OPCODE_REF_NULL: { wasm::ValType Ty = static_cast(readULEB128(Ctx)); @@ -193,15 +193,46 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr, break; } default: - return make_error("invalid opcode in init_expr", - object_error::parse_failed); + Expr.Extended = true; } - uint8_t EndOpcode = readOpcode(Ctx); - if (EndOpcode != wasm::WASM_OPCODE_END) { - return make_error("invalid init_expr", - object_error::parse_failed); + if (!Expr.Extended) { + uint8_t EndOpcode = readOpcode(Ctx); + if (EndOpcode != wasm::WASM_OPCODE_END) + Expr.Extended = true; + } + + if (Expr.Extended) { + Ctx.Ptr = Start; + while (1) { + uint8_t Opcode = readOpcode(Ctx); + switch (Opcode) { + case wasm::WASM_OPCODE_I32_CONST: + case wasm::WASM_OPCODE_GLOBAL_GET: + case wasm::WASM_OPCODE_REF_NULL: + case wasm::WASM_OPCODE_I64_CONST: + case wasm::WASM_OPCODE_F32_CONST: + case wasm::WASM_OPCODE_F64_CONST: + 
readULEB128(Ctx); + break; + case wasm::WASM_OPCODE_I32_ADD: + case wasm::WASM_OPCODE_I32_SUB: + case wasm::WASM_OPCODE_I32_MUL: + case wasm::WASM_OPCODE_I64_ADD: + case wasm::WASM_OPCODE_I64_SUB: + case wasm::WASM_OPCODE_I64_MUL: + break; + case wasm::WASM_OPCODE_END: + Expr.Body = ArrayRef(Start, Ctx.Ptr - Start); + return Error::success(); + default: + return make_error( + Twine("invalid opcode in init_expr: ") + Twine(unsigned(Opcode)), + object_error::parse_failed); + } + } } + return Error::success(); } @@ -420,10 +451,6 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { llvm::DenseSet SeenFunctions; llvm::DenseSet SeenGlobals; llvm::DenseSet SeenSegments; - if (Functions.size() && !SeenCodeSection) { - return make_error("names must come after code section", - object_error::parse_failed); - } while (Ctx.Ptr < Ctx.End) { uint8_t Type = readUint8(Ctx); @@ -443,7 +470,7 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { return make_error( "function named more than once", object_error::parse_failed); if (!isValidFunctionIndex(Index) || Name.empty()) - return make_error("invalid name entry", + return make_error("invalid function name entry", object_error::parse_failed); if (isDefinedFunctionIndex(Index)) @@ -454,7 +481,7 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { return make_error("global named more than once", object_error::parse_failed); if (!isValidGlobalIndex(Index) || Name.empty()) - return make_error("invalid name entry", + return make_error("invalid global name entry", object_error::parse_failed); } else { nameType = wasm::NameType::DATA_SEGMENT; @@ -462,7 +489,7 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { return make_error( "segment named more than once", object_error::parse_failed); if (Index > DataSegments.size()) - return make_error("invalid named data segment", + return make_error("invalid data segment name entry", object_error::parse_failed); } DebugNames.push_back(wasm::WasmDebugName{nameType, Index, Name}); @@ -488,11 +515,6 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { Error WasmObjectFile::parseLinkingSection(ReadContext &Ctx) { HasLinkingSection = true; - if (Functions.size() && !SeenCodeSection) { - return make_error( - "linking data must come after code section", - object_error::parse_failed); - } LinkingData.Version = readVaruint32(Ctx); if (LinkingData.Version != wasm::WasmMetadataVersion) { @@ -1379,7 +1401,6 @@ Error WasmObjectFile::parseStartSection(ReadContext &Ctx) { } Error WasmObjectFile::parseCodeSection(ReadContext &Ctx) { - SeenCodeSection = true; CodeSection = Sections.size(); uint32_t FunctionCount = readVaruint32(Ctx); if (FunctionCount != Functions.size()) { @@ -1443,8 +1464,9 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { object_error::parse_failed); if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_PASSIVE) { - Segment.Offset.Opcode = wasm::WASM_OPCODE_I32_CONST; - Segment.Offset.Value.Int32 = 0; + Segment.Offset.Extended = false; + Segment.Offset.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST; + Segment.Offset.Inst.Value.Int32 = 0; } else { if (Error Err = readInitExpr(Segment.Offset, Ctx)) return Err; @@ -1488,7 +1510,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { Error WasmObjectFile::parseDataSection(ReadContext &Ctx) { DataSection = Sections.size(); uint32_t Count = readVaruint32(Ctx); - if (DataCount && Count != DataCount.getValue()) + if (DataCount && Count != *DataCount) return make_error( "number of data segments does not match DataCount 
section"); DataSegments.reserve(Count); @@ -1503,8 +1525,9 @@ Error WasmObjectFile::parseDataSection(ReadContext &Ctx) { if (Error Err = readInitExpr(Segment.Data.Offset, Ctx)) return Err; } else { - Segment.Data.Offset.Opcode = wasm::WASM_OPCODE_I32_CONST; - Segment.Data.Offset.Value.Int32 = 0; + Segment.Data.Offset.Extended = false; + Segment.Data.Offset.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST; + Segment.Data.Offset.Inst.Value.Int32 = 0; } uint32_t Size = readVaruint32(Ctx); if (Size > (size_t)(Ctx.End - Ctx.Ptr)) @@ -1602,10 +1625,12 @@ uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const { // offset within the segment. uint32_t SegmentIndex = Sym.Info.DataRef.Segment; const wasm::WasmDataSegment &Segment = DataSegments[SegmentIndex].Data; - if (Segment.Offset.Opcode == wasm::WASM_OPCODE_I32_CONST) { - return Segment.Offset.Value.Int32 + Sym.Info.DataRef.Offset; - } else if (Segment.Offset.Opcode == wasm::WASM_OPCODE_I64_CONST) { - return Segment.Offset.Value.Int64 + Sym.Info.DataRef.Offset; + if (Segment.Offset.Extended) { + llvm_unreachable("extended init exprs not supported"); + } else if (Segment.Offset.Inst.Opcode == wasm::WASM_OPCODE_I32_CONST) { + return Segment.Offset.Inst.Value.Int32 + Sym.Info.DataRef.Offset; + } else if (Segment.Offset.Inst.Opcode == wasm::WASM_OPCODE_I64_CONST) { + return Segment.Offset.Inst.Value.Int64 + Sym.Info.DataRef.Offset; } else { llvm_unreachable("unknown init expr opcode"); } @@ -1692,29 +1717,11 @@ void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; } Expected WasmObjectFile::getSectionName(DataRefImpl Sec) const { const WasmSection &S = Sections[Sec.d.a]; -#define ECase(X) \ - case wasm::WASM_SEC_##X: \ - return #X; - switch (S.Type) { - ECase(TYPE); - ECase(IMPORT); - ECase(FUNCTION); - ECase(TABLE); - ECase(MEMORY); - ECase(GLOBAL); - ECase(TAG); - ECase(EXPORT); - ECase(START); - ECase(ELEM); - ECase(CODE); - ECase(DATA); - ECase(DATACOUNT); - case wasm::WASM_SEC_CUSTOM: + if (S.Type == wasm::WASM_SEC_CUSTOM) return S.Name; - default: + if (S.Type > wasm::WASM_SEC_LAST_KNOWN) return createStringError(object_error::invalid_section_index, ""); - } -#undef ECase + return wasm::sectionTypeToString(S.Type); } uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; } diff --git a/llvm/lib/Object/WindowsResource.cpp b/llvm/lib/Object/WindowsResource.cpp index 2a69c6c46b59..d50f149629c3 100644 --- a/llvm/lib/Object/WindowsResource.cpp +++ b/llvm/lib/Object/WindowsResource.cpp @@ -12,13 +12,11 @@ #include "llvm/Object/WindowsResource.h" #include "llvm/Object/COFF.h" -#include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ScopedPrinter.h" #include #include -#include using namespace llvm; using namespace object; diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index f2f6d700ddd8..ff39fe1794c0 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -615,6 +615,16 @@ Expected XCOFFObjectFile::getSymbolFlags(DataRefImpl Symb) const { if (XCOFFSym.getSectionNumber() == XCOFF::N_UNDEF) Result |= SymbolRef::SF_Undefined; + // There is no visibility in old 32 bit XCOFF object file interpret. 
+ if (is64Bit() || (auxiliaryHeader32() && (auxiliaryHeader32()->getVersion() == + NEW_XCOFF_INTERPRET))) { + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & VISIBILITY_MASK) == SYM_V_HIDDEN) + Result |= SymbolRef::SF_Hidden; + + if ((SymType & VISIBILITY_MASK) == SYM_V_EXPORTED) + Result |= SymbolRef::SF_Exported; + } return Result; } @@ -699,6 +709,19 @@ bool XCOFFObjectFile::is64Bit() const { return Binary::ID_XCOFF64 == getType(); } +Expected XCOFFObjectFile::getRawData(const char *Start, + uint64_t Size, + StringRef Name) const { + uintptr_t StartPtr = reinterpret_cast(Start); + // TODO: this path is untested. + if (Error E = Binary::checkOffset(Data, StartPtr, Size)) + return createError(toString(std::move(E)) + ": " + Name.data() + + " data with offset 0x" + Twine::utohexstr(StartPtr) + + " and size 0x" + Twine::utohexstr(Size) + + " goes past the end of the file"); + return StringRef(Start, Size); +} + uint16_t XCOFFObjectFile::getMagic() const { return is64Bit() ? fileHeader64()->Magic : fileHeader32()->Magic; } @@ -1319,7 +1342,7 @@ XCOFFTracebackTable::XCOFFTracebackTable(const uint8_t *Ptr, uint64_t &Size, NumOfCtlAnchors = DE.getU32(Cur); if (Cur && NumOfCtlAnchors) { SmallVector Disp; - Disp.reserve(NumOfCtlAnchors.getValue()); + Disp.reserve(*NumOfCtlAnchors); for (uint32_t I = 0; I < NumOfCtlAnchors && Cur; ++I) Disp.push_back(DE.getU32(Cur)); if (Cur) @@ -1346,7 +1369,7 @@ XCOFFTracebackTable::XCOFFTracebackTable(const uint8_t *Ptr, uint64_t &Size, return; } VecExt = TBVecExtOrErr.get(); - VectorParmsNum = VecExt.getValue().getNumberOfVectorParms(); + VectorParmsNum = VecExt->getNumberOfVectorParms(); } } diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp index d884e2fd55cd..72d7db665d0e 100644 --- a/llvm/lib/ObjectYAML/COFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/COFFEmitter.cpp @@ -19,6 +19,7 @@ #include "llvm/Object/COFF.h" #include "llvm/ObjectYAML/ObjectYAML.h" #include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" @@ -45,7 +46,7 @@ struct COFFParser { COFF::MaxNumberOfSections16; } - bool isPE() const { return Obj.OptionalHeader.hasValue(); } + bool isPE() const { return Obj.OptionalHeader.has_value(); } bool is64Bit() const { return Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 || Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_ARM64; @@ -236,7 +237,7 @@ static bool layoutCOFF(COFFParser &CP) { if (S.SectionData.binary_size() == 0) S.SectionData = CodeViewYAML::toDebugT(S.DebugP, CP.Allocator, S.Name); } else if (S.Name == ".debug$H") { - if (S.DebugH.hasValue() && S.SectionData.binary_size() == 0) + if (S.DebugH && S.SectionData.binary_size() == 0) S.SectionData = CodeViewYAML::toDebugH(*S.DebugH, CP.Allocator); } @@ -456,7 +457,7 @@ static bool writeCOFF(COFFParser &CP, raw_ostream &OS) { CP.Obj.OptionalHeader->DataDirectories; uint32_t NumDataDir = sizeof(CP.Obj.OptionalHeader->DataDirectories) / sizeof(Optional); - if (I >= NumDataDir || !DataDirectories[I].hasValue()) { + if (I >= NumDataDir || !DataDirectories[I]) { OS << zeros(uint32_t(0)); OS << zeros(uint32_t(0)); } else { diff --git a/llvm/lib/ObjectYAML/COFFYAML.cpp b/llvm/lib/ObjectYAML/COFFYAML.cpp index 6e5cdce89060..099ddb2b9665 100644 --- a/llvm/lib/ObjectYAML/COFFYAML.cpp +++ b/llvm/lib/ObjectYAML/COFFYAML.cpp @@ -75,6 +75,9 @@ void ScalarEnumerationTraits::enumeration( ECase(IMAGE_FILE_MACHINE_POWERPC); 
ECase(IMAGE_FILE_MACHINE_POWERPCFP); ECase(IMAGE_FILE_MACHINE_R4000); + ECase(IMAGE_FILE_MACHINE_RISCV32); + ECase(IMAGE_FILE_MACHINE_RISCV64); + ECase(IMAGE_FILE_MACHINE_RISCV128); ECase(IMAGE_FILE_MACHINE_SH3); ECase(IMAGE_FILE_MACHINE_SH3DSP); ECase(IMAGE_FILE_MACHINE_SH4); diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 6b6a1176628b..b1ad10d425cc 100644 --- a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -25,6 +25,7 @@ #include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/YAMLTraits.h" #include #include diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 49b24e21cf60..e4e2b2a6d21a 100644 --- a/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -490,7 +490,10 @@ private: Error LeafRecordImpl::fromCodeViewRecord(CVType Type) { MemberRecordConversionVisitor V(Members); - return visitMemberRecordStream(Type.content(), V); + FieldListRecord FieldList; + cantFail(TypeDeserializer::deserializeAs(Type, + FieldList)); + return visitMemberRecordStream(FieldList.Data, V); } CVType LeafRecordImpl::toCodeViewRecord( diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index eec733c7d7f9..c0e2cdd54f07 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -423,7 +423,7 @@ Error DWARFYAML::emitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) { std::string EntryBuffer; raw_string_ostream EntryBufferOS(EntryBuffer); - uint64_t AbbrevTableID = Unit.AbbrevTableID.getValueOr(I); + uint64_t AbbrevTableID = Unit.AbbrevTableID.value_or(I); for (const DWARFYAML::Entry &Entry : Unit.Entries) { if (Expected EntryLength = writeDIE(DI, I, AbbrevTableID, Params, Entry, EntryBufferOS, @@ -507,7 +507,7 @@ static void writeExtendedOpcode(const DWARFYAML::LineTableOpcode &Op, for (auto OpByte : Op.UnknownOpcodeData) writeInteger((uint8_t)OpByte, OpBufferOS, IsLittleEndian); } - uint64_t ExtLen = Op.ExtLen.getValueOr(OpBuffer.size()); + uint64_t ExtLen = Op.ExtLen.value_or(OpBuffer.size()); encodeULEB128(ExtLen, OS); OS.write(OpBuffer.data(), OpBuffer.size()); } @@ -582,7 +582,7 @@ Error DWARFYAML::emitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) { writeInteger(LineTable.LineRange, BufferOS, DI.IsLittleEndian); std::vector StandardOpcodeLengths = - LineTable.StandardOpcodeLengths.getValueOr( + LineTable.StandardOpcodeLengths.value_or( getStandardOpcodeLengths(LineTable.Version, LineTable.OpcodeBase)); uint8_t OpcodeBase = LineTable.OpcodeBase ? *LineTable.OpcodeBase diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index 2591bf4d5af4..37116ada9901 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -62,7 +62,7 @@ DWARFYAML::Data::getAbbrevTableInfoByID(uint64_t ID) const { for (auto &AbbrevTable : enumerate(DebugAbbrev)) { // If the abbrev table's ID isn't specified, we use the index as its ID. 
uint64_t AbbrevTableID = - AbbrevTable.value().ID.getValueOr(AbbrevTable.index()); + AbbrevTable.value().ID.value_or(AbbrevTable.index()); auto It = AbbrevTableInfoMap.insert( {AbbrevTableID, AbbrevTableInfo{/*Index=*/AbbrevTable.index(), /*Offset=*/AbbrevTableOffset}}); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp new file mode 100644 index 000000000000..9834b036de90 --- /dev/null +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -0,0 +1,190 @@ +//===- DXContainerEmitter.cpp - Convert YAML to a DXContainer -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Binary emitter for yaml to DXContainer binary +/// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/ObjectYAML/ObjectYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class DXContainerWriter { +public: + DXContainerWriter(DXContainerYAML::Object &ObjectFile) + : ObjectFile(ObjectFile) {} + + Error write(raw_ostream &OS); + +private: + DXContainerYAML::Object &ObjectFile; + + Error computePartOffsets(); + Error validatePartOffsets(); + Error validateSize(uint32_t Computed); + + void writeHeader(raw_ostream &OS); + void writeParts(raw_ostream &OS); +}; +} // namespace + +Error DXContainerWriter::validateSize(uint32_t Computed) { + if (!ObjectFile.Header.FileSize) + ObjectFile.Header.FileSize = Computed; + else if (*ObjectFile.Header.FileSize < Computed) + return createStringError(errc::result_out_of_range, + "File size specified is too small."); + return Error::success(); +} + +Error DXContainerWriter::validatePartOffsets() { + if (ObjectFile.Parts.size() != ObjectFile.Header.PartOffsets->size()) + return createStringError( + errc::invalid_argument, + "Mismatch between number of parts and part offsets."); + uint32_t RollingOffset = + sizeof(dxbc::Header) + (ObjectFile.Header.PartCount * sizeof(uint32_t)); + for (auto I : llvm::zip(ObjectFile.Parts, *ObjectFile.Header.PartOffsets)) { + if (RollingOffset > std::get<1>(I)) + return createStringError(errc::invalid_argument, + "Offset mismatch, not enough space for data."); + RollingOffset = + std::get<1>(I) + sizeof(dxbc::PartHeader) + std::get<0>(I).Size; + } + if (Error Err = validateSize(RollingOffset)) + return Err; + + return Error::success(); +} + +Error DXContainerWriter::computePartOffsets() { + if (ObjectFile.Header.PartOffsets) + return validatePartOffsets(); + uint32_t RollingOffset = + sizeof(dxbc::Header) + (ObjectFile.Header.PartCount * sizeof(uint32_t)); + ObjectFile.Header.PartOffsets = std::vector(); + for (const auto &Part : ObjectFile.Parts) { + ObjectFile.Header.PartOffsets->push_back(RollingOffset); + RollingOffset += sizeof(dxbc::PartHeader) + Part.Size; + } + if (Error Err = validateSize(RollingOffset)) + return Err; + + return Error::success(); +} + +void DXContainerWriter::writeHeader(raw_ostream &OS) { + dxbc::Header Header; + memcpy(Header.Magic, "DXBC", 4); + memcpy(Header.FileHash.Digest, ObjectFile.Header.Hash.data(), 16); + Header.Version.Major = ObjectFile.Header.Version.Major; + 
Header.Version.Minor = ObjectFile.Header.Version.Minor; + Header.FileSize = *ObjectFile.Header.FileSize; + Header.PartCount = ObjectFile.Parts.size(); + if (sys::IsBigEndianHost) + Header.swapBytes(); + OS.write(reinterpret_cast<char *>(&Header), sizeof(Header)); + SmallVector<uint32_t> Offsets(ObjectFile.Header.PartOffsets->begin(), + ObjectFile.Header.PartOffsets->end()); + if (sys::IsBigEndianHost) + for (auto &O : Offsets) + sys::swapByteOrder(O); + OS.write(reinterpret_cast<char *>(Offsets.data()), + Offsets.size() * sizeof(uint32_t)); +} + +void DXContainerWriter::writeParts(raw_ostream &OS) { + uint32_t RollingOffset = + sizeof(dxbc::Header) + (ObjectFile.Header.PartCount * sizeof(uint32_t)); + for (auto I : llvm::zip(ObjectFile.Parts, *ObjectFile.Header.PartOffsets)) { + if (RollingOffset < std::get<1>(I)) { + uint32_t PadBytes = std::get<1>(I) - RollingOffset; + OS.write_zeros(PadBytes); + } + DXContainerYAML::Part P = std::get<0>(I); + OS.write(P.Name.c_str(), 4); + if (sys::IsBigEndianHost) + sys::swapByteOrder(P.Size); + OS.write(reinterpret_cast<const char *>(&P.Size), sizeof(uint32_t)); + RollingOffset = std::get<1>(I) + sizeof(dxbc::PartHeader); + + if (P.Name == "DXIL" && P.Program) { + dxbc::ProgramHeader Header; + Header.MajorVersion = P.Program->MajorVersion; + Header.MinorVersion = P.Program->MinorVersion; + Header.Unused = 0; + Header.ShaderKind = P.Program->ShaderKind; + memcpy(Header.Bitcode.Magic, "DXIL", 4); + Header.Bitcode.MajorVersion = P.Program->DXILMajorVersion; + Header.Bitcode.MinorVersion = P.Program->DXILMinorVersion; + Header.Bitcode.Unused = 0; + + // Compute the optional fields if needed... + if (P.Program->DXILOffset) + Header.Bitcode.Offset = P.Program->DXILOffset.getValue(); + else + Header.Bitcode.Offset = sizeof(dxbc::BitcodeHeader); + + if (P.Program->DXILSize) + Header.Bitcode.Size = P.Program->DXILSize.getValue(); + else + Header.Bitcode.Size = P.Program->DXIL ? P.Program->DXIL->size() : 0; + + if (P.Program->Size) + Header.Size = P.Program->Size.getValue(); + else + Header.Size = sizeof(dxbc::ProgramHeader) + Header.Bitcode.Size; + + uint32_t BitcodeOffset = Header.Bitcode.Offset; + if (sys::IsBigEndianHost) + Header.swapBytes(); + OS.write(reinterpret_cast<const char *>(&Header), + sizeof(dxbc::ProgramHeader)); + if (P.Program->DXIL) { + if (BitcodeOffset > sizeof(dxbc::BitcodeHeader)) { + uint32_t PadBytes = BitcodeOffset - sizeof(dxbc::BitcodeHeader); + OS.write_zeros(PadBytes); + } + OS.write(reinterpret_cast<char *>(P.Program->DXIL->data()), + P.Program->DXIL->size()); + } + } + } +} + +Error DXContainerWriter::write(raw_ostream &OS) { + if (Error Err = computePartOffsets()) + return Err; + writeHeader(OS); + writeParts(OS); + return Error::success(); +} + +namespace llvm { +namespace yaml { + +bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH) { + DXContainerWriter Writer(Doc); + if (Error Err = Writer.write(Out)) { + handleAllErrors(std::move(Err), + [&](const ErrorInfoBase &Err) { EH(Err.message()); }); + return false; + } + return true; +} + +} // namespace yaml +} // namespace llvm diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp new file mode 100644 index 000000000000..7952fa4bf0e8 --- /dev/null +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -0,0 +1,61 @@ +//===- DXContainerYAML.cpp - DXContainer YAMLIO implementation ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
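computePartOffsets above lays the parts out back to back: the first part begins immediately after the fixed container header plus the 32-bit offset table, and each subsequent part begins after the previous part's 8-byte part header and payload. A sketch of the same rolling-offset arithmetic with plain types; the 32-byte header size is an assumption derived from the fields written in writeHeader (4-byte magic, 16-byte hash, two 16-bit version numbers, file size, part count):

    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> computeOffsets(const std::vector<uint32_t> &PartSizes) {
      const uint32_t HeaderSize = 32;  // assumed sizeof(dxbc::Header)
      const uint32_t PartHdrSize = 8;  // assumed sizeof(dxbc::PartHeader): name + size
      uint32_t Rolling = HeaderSize + PartSizes.size() * sizeof(uint32_t);
      std::vector<uint32_t> Offsets;
      for (uint32_t Size : PartSizes) {
        Offsets.push_back(Rolling); // each part header starts here
        Rolling += PartHdrSize + Size;
      }
      return Offsets;
    }

For example, a container with two parts of 16 and 8 bytes gets offsets 40 and 64: 32 + 2*4 = 40 for the first part, then 40 + 8 + 16 = 64 for the second.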
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of +// DXContainerYAML. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/DXContainerYAML.h" + +namespace llvm { +namespace yaml { + +void MappingTraits<DXContainerYAML::VersionTuple>::mapping( + IO &IO, DXContainerYAML::VersionTuple &Version) { + IO.mapRequired("Major", Version.Major); + IO.mapRequired("Minor", Version.Minor); +} + +void MappingTraits<DXContainerYAML::FileHeader>::mapping( + IO &IO, DXContainerYAML::FileHeader &Header) { + IO.mapRequired("Hash", Header.Hash); + IO.mapRequired("Version", Header.Version); + IO.mapOptional("FileSize", Header.FileSize); + IO.mapRequired("PartCount", Header.PartCount); + IO.mapOptional("PartOffsets", Header.PartOffsets); +} + +void MappingTraits<DXContainerYAML::DXILProgram>::mapping( + IO &IO, DXContainerYAML::DXILProgram &Program) { + IO.mapRequired("MajorVersion", Program.MajorVersion); + IO.mapRequired("MinorVersion", Program.MinorVersion); + IO.mapRequired("ShaderKind", Program.ShaderKind); + IO.mapOptional("Size", Program.Size); + IO.mapRequired("DXILMajorVersion", Program.DXILMajorVersion); + IO.mapRequired("DXILMinorVersion", Program.DXILMinorVersion); + IO.mapOptional("DXILSize", Program.DXILSize); + IO.mapOptional("DXIL", Program.DXIL); +} + +void MappingTraits<DXContainerYAML::Part>::mapping(IO &IO, + DXContainerYAML::Part &P) { + IO.mapRequired("Name", P.Name); + IO.mapRequired("Size", P.Size); + IO.mapOptional("Program", P.Program); +} + +void MappingTraits<DXContainerYAML::Object>::mapping( + IO &IO, DXContainerYAML::Object &Obj) { + IO.mapTag("!dxcontainer", true); + IO.mapRequired("Header", Obj.Header); + IO.mapRequired("Parts", Obj.Parts); +} + +} // namespace yaml +} // namespace llvm diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index e378be3892fe..f5611ed1197b 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -412,7 +412,7 @@ ELFState<ELFT>::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH) } // TODO: Only create the .strtab here if any symbols have been requested. ImplicitSections.insert(".strtab"); - if (!SecHdrTable || !SecHdrTable->NoHeaders.getValueOr(false)) + if (!SecHdrTable || !SecHdrTable->NoHeaders.value_or(false)) ImplicitSections.insert(SectionHeaderStringTableName); // Insert placeholders for implicit sections that are not @@ -596,12 +596,11 @@ unsigned ELFState<ELFT>::toSectionIndex(StringRef S, StringRef LocSec, const ELFYAML::SectionHeaderTable &SectionHeaders = Doc.getSectionHeaderTable(); if (SectionHeaders.IsImplicit || - (SectionHeaders.NoHeaders && !SectionHeaders.NoHeaders.getValue()) || + (SectionHeaders.NoHeaders && !*SectionHeaders.NoHeaders) || SectionHeaders.isDefault()) return Index; - assert(!SectionHeaders.NoHeaders.getValueOr(false) || - !SectionHeaders.Sections); + assert(!SectionHeaders.NoHeaders.value_or(false) || !SectionHeaders.Sections); size_t FirstExcluded = SectionHeaders.Sections ?
SectionHeaders.Sections->size() : 0; if (Index > FirstExcluded) { @@ -771,7 +770,7 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders, if (ELFYAML::SectionHeaderTable *S = dyn_cast<ELFYAML::SectionHeaderTable>(D.get())) { - if (S->NoHeaders.getValueOr(false)) + if (S->NoHeaders.value_or(false)) continue; if (!S->Offset) @@ -808,7 +807,7 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders, SHeader.sh_entsize = *Sec->EntSize; else SHeader.sh_entsize = ELFYAML::getDefaultShEntSize<ELFT>( - Doc.Header.Machine.getValueOr(ELF::EM_NONE), Sec->Type, Sec->Name); + Doc.Header.Machine.value_or(ELF::EM_NONE), Sec->Type, Sec->Name); // We have a few sections like string or symbol tables that are usually // added implicitly to the end. However, if they are explicitly specified @@ -958,9 +957,9 @@ ELFState<ELFT>::toELFSymbols(ArrayRef<ELFYAML::Symbol> Symbols, else if (Sym.Index) Symbol.st_shndx = *Sym.Index; - Symbol.st_value = Sym.Value.getValueOr(yaml::Hex64(0)); + Symbol.st_value = Sym.Value.value_or(yaml::Hex64(0)); Symbol.st_other = Sym.Other ? *Sym.Other : 0; - Symbol.st_size = Sym.Size.getValueOr(yaml::Hex64(0)); + Symbol.st_size = Sym.Size.value_or(yaml::Hex64(0)); } return Ret; @@ -1394,12 +1393,22 @@ void ELFState<ELFT>::writeSectionContent( return; for (const ELFYAML::BBAddrMapEntry &E : *Section.Entries) { + // Write version and feature values. + if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { + if (E.Version > 1) + WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " + << static_cast<int>(E.Version) + << "; encoding using the most recent version"; + CBA.write(E.Version); + CBA.write(E.Feature); + SHeader.sh_size += 2; + } // Write the address of the function. CBA.write<uintX_t>(E.Address, ELFT::TargetEndianness); // Write number of BBEntries (number of basic blocks in the function). This // is overridden by the 'NumBlocks' YAML field when specified. uint64_t NumBlocks = - E.NumBlocks.getValueOr(E.BBEntries ? E.BBEntries->size() : 0); + E.NumBlocks.value_or(E.BBEntries ? E.BBEntries->size() : 0); SHeader.sh_size += sizeof(uintX_t) + CBA.writeULEB128(NumBlocks); // Write all BBEntries.
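Both the NumBlocks count written above with writeULEB128 and the LC_FUNCTION_STARTS deltas emitted by the Mach-O writer further down use LEB128, the variable-length encoding these section formats share with DWARF. A self-contained sketch of the unsigned variant (the patch itself uses the encodeULEB128 from llvm/Support/LEB128.h):

    #include <cstdint>
    #include <vector>

    // ULEB128: emit the value 7 bits at a time, least-significant group first;
    // the high bit of each byte flags that another byte follows. Counts below
    // 128 therefore cost a single byte.
    void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0)
          Byte |= 0x80; // continuation bit
        Out.push_back(Byte);
      } while (Value != 0);
    }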
if (!E.BBEntries) @@ -1486,10 +1495,10 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader, return; CBA.write<uint32_t>( - Section.NBucket.getValueOr(llvm::yaml::Hex64(Section.Bucket->size())), + Section.NBucket.value_or(llvm::yaml::Hex64(Section.Bucket->size())), ELFT::TargetEndianness); CBA.write<uint32_t>( - Section.NChain.getValueOr(llvm::yaml::Hex64(Section.Chain->size())), + Section.NChain.value_or(llvm::yaml::Hex64(Section.Chain->size())), ELFT::TargetEndianness); for (uint32_t Val : *Section.Bucket) @@ -1518,10 +1527,10 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::VerdefEntry &E = (*Section.Entries)[I]; Elf_Verdef VerDef; - VerDef.vd_version = E.Version.getValueOr(1); - VerDef.vd_flags = E.Flags.getValueOr(0); - VerDef.vd_ndx = E.VersionNdx.getValueOr(0); - VerDef.vd_hash = E.Hash.getValueOr(0); + VerDef.vd_version = E.Version.value_or(1); + VerDef.vd_flags = E.Flags.value_or(0); + VerDef.vd_ndx = E.VersionNdx.value_or(0); + VerDef.vd_hash = E.Hash.value_or(0); VerDef.vd_aux = sizeof(Elf_Verdef); VerDef.vd_cnt = E.VerNames.size(); if (I == Section.Entries->size() - 1) @@ -1830,7 +1839,7 @@ template <class ELFT> void ELFState<ELFT>::buildSectionIndex() { if (!ExcludedSectionHeaders.insert(Hdr.Name).second) llvm_unreachable("buildSectionIndex() failed"); - if (SectionHeaders.NoHeaders.getValueOr(false)) + if (SectionHeaders.NoHeaders.value_or(false)) for (const ELFYAML::Section *S : Sections) if (!ExcludedSectionHeaders.insert(S->Name).second) llvm_unreachable("buildSectionIndex() failed"); @@ -1960,7 +1969,7 @@ bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc, writeArrayData(OS, makeArrayRef(PHeaders)); const ELFYAML::SectionHeaderTable &SHT = Doc.getSectionHeaderTable(); - if (!SHT.NoHeaders.getValueOr(false)) + if (!SHT.NoHeaders.value_or(false)) CBA.updateDataAt(*SHT.Offset, SHeaders.data(), SHT.getNumHeaders(SHeaders.size()) * sizeof(Elf_Shdr)); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index d597148b98ab..cdd180cdc15d 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -29,6 +29,8 @@ namespace llvm { ELFYAML::Chunk::~Chunk() = default; namespace ELFYAML { +ELF_ELFOSABI Object::getOSAbi() const { return Header.OSABI; } + unsigned Object::getMachine() const { if (Header.Machine) return *Header.Machine; @@ -175,6 +177,10 @@ void ScalarEnumerationTraits<ELFYAML::ELF_NT>::enumeration( ECase(NT_AMD_PAL_METADATA); // AMDGPU specific notes. (Code Object V3) ECase(NT_AMDGPU_METADATA); + // Android specific notes.
+ ECase(NT_ANDROID_TYPE_IDENT); + ECase(NT_ANDROID_TYPE_KUSER); + ECase(NT_ANDROID_TYPE_MEMTAG); #undef ECase IO.enumFallback<Hex32>(Value); } @@ -344,6 +350,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration( ECase(EM_BPF); ECase(EM_VE); ECase(EM_CSKY); + ECase(EM_LOONGARCH); #undef ECase IO.enumFallback<Hex16>(Value); } @@ -560,6 +567,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH); @@ -570,6 +578,11 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1033, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1034, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1035, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1036, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1100, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1101, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1102, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1103, EF_AMDGPU_MACH); switch (Object->Header.ABIVersion) { default: // ELFOSABI_AMDGPU_PAL, ELFOSABI_AMDGPU_MESA3D support *_V3 flags. @@ -641,6 +654,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( ECase(SHT_LLVM_SYMPART); ECase(SHT_LLVM_PART_EHDR); ECase(SHT_LLVM_PART_PHDR); + ECase(SHT_LLVM_BB_ADDR_MAP_V0); ECase(SHT_LLVM_BB_ADDR_MAP); ECase(SHT_GNU_ATTRIBUTES); ECase(SHT_GNU_HASH); @@ -705,7 +719,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, BCase(SHF_GROUP); BCase(SHF_TLS); BCase(SHF_COMPRESSED); - BCase(SHF_GNU_RETAIN); + switch (Object->getOSAbi()) { + case ELF::ELFOSABI_SOLARIS: + BCase(SHF_SUNW_NODISCARD); + break; + default: + BCase(SHF_GNU_RETAIN); + break; + } switch (Object->getMachine()) { case ELF::EM_ARM: BCase(SHF_ARM_PURECODE); @@ -735,6 +756,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, void ScalarEnumerationTraits<ELFYAML::ELF_SHN>::enumeration( IO &IO, ELFYAML::ELF_SHN &Value) { + const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext()); + assert(Object && "The IO context is not initialized"); #define ECase(X) IO.enumCase(Value, #X, ELF::X) ECase(SHN_UNDEF); ECase(SHN_LORESERVE); @@ -747,6 +770,15 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHN>::enumeration( ECase(SHN_XINDEX); ECase(SHN_HIRESERVE); ECase(SHN_AMDGPU_LDS); + + if (!IO.outputting() || Object->getMachine() == ELF::EM_MIPS) { + ECase(SHN_MIPS_ACOMMON); + ECase(SHN_MIPS_TEXT); + ECase(SHN_MIPS_DATA); + ECase(SHN_MIPS_SCOMMON); + ECase(SHN_MIPS_SUNDEFINED); + } + ECase(SHN_HEXAGON_SCOMMON); ECase(SHN_HEXAGON_SCOMMON_1); ECase(SHN_HEXAGON_SCOMMON_2); @@ -839,12 +871,18 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration( case ELF::EM_CSKY: #include "llvm/BinaryFormat/ELFRelocs/CSKY.def" break; + case ELF::EM_PPC: +#include "llvm/BinaryFormat/ELFRelocs/PowerPC.def" + break; case ELF::EM_PPC64: #include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def" break; case ELF::EM_68K: #include "llvm/BinaryFormat/ELFRelocs/M68k.def" break; + case ELF::EM_LOONGARCH: +#include "llvm/BinaryFormat/ELFRelocs/LoongArch.def" + break; default: // Nothing to do. break; @@ -1298,7 +1336,7 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) { // We also support reading a content as array of bytes using the ContentArray // key. obj2yaml never prints this field.
- assert(!IO.outputting() || !Section.ContentBuf.hasValue()); + assert(!IO.outputting() || !Section.ContentBuf); IO.mapOptional("ContentArray", Section.ContentBuf); if (Section.ContentBuf) { if (Section.Content) @@ -1327,8 +1365,7 @@ static void sectionMapping(IO &IO, ELFYAML::HashSection &Section) { // obj2yaml does not dump these fields. They can be used to override nchain // and nbucket values for creating broken sections. - assert(!IO.outputting() || - (!Section.NBucket.hasValue() && !Section.NChain.hasValue())); + assert(!IO.outputting() || (!Section.NBucket && !Section.NChain)); IO.mapOptional("NChain", Section.NChain); IO.mapOptional("NBucket", Section.NBucket); } @@ -1603,6 +1640,7 @@ Section.reset(new ELFYAML::CallGraphProfileSection()); sectionMapping(IO, *cast<ELFYAML::CallGraphProfileSection>(Section.get())); break; + case ELF::SHT_LLVM_BB_ADDR_MAP_V0: case ELF::SHT_LLVM_BB_ADDR_MAP: if (!IO.outputting()) Section.reset(new ELFYAML::BBAddrMapSection()); @@ -1732,6 +1770,8 @@ void MappingTraits::mapping( void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping( IO &IO, ELFYAML::BBAddrMapEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); + IO.mapRequired("Version", E.Version); + IO.mapOptional("Feature", E.Feature, Hex8(0)); IO.mapOptional("Address", E.Address, Hex64(0)); IO.mapOptional("NumBlocks", E.NumBlocks); IO.mapOptional("BBEntries", E.BBEntries); diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp index b9fad2982828..3d06f3d0bf86 100644 --- a/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -55,6 +55,7 @@ private: void writeStringTable(raw_ostream &OS); void writeExportTrie(raw_ostream &OS); void writeDynamicSymbolTable(raw_ostream &OS); + void writeFunctionStarts(raw_ostream &OS); void dumpExportEntry(raw_ostream &OS, MachOYAML::ExportEntry &Entry); void ZeroToOffset(raw_ostream &OS, size_t offset); @@ -484,6 +485,7 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { MachO::dyld_info_command *DyldInfoOnlyCmd = nullptr; MachO::symtab_command *SymtabCmd = nullptr; MachO::dysymtab_command *DSymtabCmd = nullptr; + MachO::linkedit_data_command *FunctionStartsCmd = nullptr; for (auto &LC : Obj.LoadCommands) { switch (LC.Data.load_command_data.cmd) { case MachO::LC_SYMTAB: @@ -511,12 +513,15 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { WriteQueue.push_back(std::make_pair( DSymtabCmd->indirectsymoff, &MachOWriter::writeDynamicSymbolTable)); break; + case MachO::LC_FUNCTION_STARTS: + FunctionStartsCmd = &LC.Data.linkedit_data_command_data; + WriteQueue.push_back(std::make_pair(FunctionStartsCmd->dataoff, + &MachOWriter::writeFunctionStarts)); + break; } } - llvm::sort(WriteQueue, [](const writeOperation &a, const writeOperation &b) { - return a.first < b.first; - }); + llvm::sort(WriteQueue, llvm::less_first()); for (auto writeOp : WriteQueue) { ZeroToOffset(OS, writeOp.first); @@ -569,6 +574,17 @@ void MachOWriter::writeDynamicSymbolTable(raw_ostream &OS) { sizeof(yaml::Hex32::BaseType)); } +void MachOWriter::writeFunctionStarts(raw_ostream &OS) { + uint64_t Addr = 0; + for (uint64_t NextAddr : Obj.LinkEdit.FunctionStarts) { + uint64_t Delta = NextAddr - Addr; + encodeULEB128(Delta, OS); + Addr = NextAddr; + } + + OS.write('\0'); +} + class UniversalWriter { public: UniversalWriter(yaml::YamlObjectFile &ObjectFile) diff --git a/llvm/lib/ObjectYAML/MachOYAML.cpp b/llvm/lib/ObjectYAML/MachOYAML.cpp index f32009458110..b6f3b53a42b3 100644 ---
b/llvm/lib/ObjectYAML/MachOYAML.cpp @@ -26,10 +26,10 @@ namespace llvm { MachOYAML::LoadCommand::~LoadCommand() = default; bool MachOYAML::LinkEditData::isEmpty() const { - return 0 == - RebaseOpcodes.size() + BindOpcodes.size() + WeakBindOpcodes.size() + - LazyBindOpcodes.size() + ExportTrie.Children.size() + - NameList.size() + StringTable.size(); + return 0 == RebaseOpcodes.size() + BindOpcodes.size() + + WeakBindOpcodes.size() + LazyBindOpcodes.size() + + ExportTrie.Children.size() + NameList.size() + + StringTable.size() + FunctionStarts.size(); } namespace yaml { @@ -165,6 +165,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping( IO.mapOptional("NameList", LinkEditData.NameList); IO.mapOptional("StringTable", LinkEditData.StringTable); IO.mapOptional("IndirectSymbols", LinkEditData.IndirectSymbols); + IO.mapOptional("FunctionStarts", LinkEditData.FunctionStarts); } void MappingTraits::mapping( diff --git a/llvm/lib/ObjectYAML/MinidumpEmitter.cpp b/llvm/lib/ObjectYAML/MinidumpEmitter.cpp index bbfd2cd8cbab..9505473a2415 100644 --- a/llvm/lib/ObjectYAML/MinidumpEmitter.cpp +++ b/llvm/lib/ObjectYAML/MinidumpEmitter.cpp @@ -219,7 +219,7 @@ static Directory layout(BlobAllocator &File, Stream &S) { // If DataEnd is not set, we assume everything we generated is a part of the // stream. Result.Location.DataSize = - DataEnd.getValueOr(File.tell()) - Result.Location.RVA; + DataEnd.value_or(File.tell()) - Result.Location.RVA; return Result; } diff --git a/llvm/lib/ObjectYAML/ObjectYAML.cpp b/llvm/lib/ObjectYAML/ObjectYAML.cpp index 63769d2eba0e..d57e5583016b 100644 --- a/llvm/lib/ObjectYAML/ObjectYAML.cpp +++ b/llvm/lib/ObjectYAML/ObjectYAML.cpp @@ -56,12 +56,19 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO, } else if (IO.mapTag("!minidump")) { ObjectFile.Minidump.reset(new MinidumpYAML::Object()); MappingTraits<MinidumpYAML::Object>::mapping(IO, *ObjectFile.Minidump); + } else if (IO.mapTag("!Offload")) { + ObjectFile.Offload.reset(new OffloadYAML::Binary()); + MappingTraits<OffloadYAML::Binary>::mapping(IO, *ObjectFile.Offload); } else if (IO.mapTag("!WASM")) { ObjectFile.Wasm.reset(new WasmYAML::Object()); MappingTraits<WasmYAML::Object>::mapping(IO, *ObjectFile.Wasm); } else if (IO.mapTag("!XCOFF")) { ObjectFile.Xcoff.reset(new XCOFFYAML::Object()); MappingTraits<XCOFFYAML::Object>::mapping(IO, *ObjectFile.Xcoff); + } else if (IO.mapTag("!dxcontainer")) { + ObjectFile.DXContainer.reset(new DXContainerYAML::Object()); + MappingTraits<DXContainerYAML::Object>::mapping(IO, + *ObjectFile.DXContainer); } else if (const Node *N = In.getCurrentNode()) { if (N->getRawTag().empty()) IO.setError("YAML Object File missing document type tag!"); diff --git a/llvm/lib/ObjectYAML/OffloadEmitter.cpp b/llvm/lib/ObjectYAML/OffloadEmitter.cpp new file mode 100644 index 000000000000..3ffbc4ff0e11 --- /dev/null +++ b/llvm/lib/ObjectYAML/OffloadEmitter.cpp @@ -0,0 +1,68 @@ +//===- OffloadEmitter.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/OffloadBinary.h" +#include "llvm/ObjectYAML/OffloadYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace OffloadYAML; + +namespace llvm { +namespace yaml { + +bool yaml2offload(Binary &Doc, raw_ostream &Out, ErrorHandler EH) { + for (const auto &Member : Doc.Members) { + object::OffloadBinary::OffloadingImage Image{}; + if (Member.ImageKind) + Image.TheImageKind = *Member.ImageKind; + if (Member.OffloadKind) + Image.TheOffloadKind = *Member.OffloadKind; + if (Member.Flags) + Image.Flags = *Member.Flags; + + StringMap<StringRef> &StringData = Image.StringData; + if (Member.StringEntries) { + for (const auto &Entry : *Member.StringEntries) { + StringData[Entry.Key] = Entry.Value; + } + } + + SmallVector<char> Data; + raw_svector_ostream OS(Data); + if (Member.Content) + Member.Content->writeAsBinary(OS); + Image.Image = MemoryBuffer::getMemBufferCopy(OS.str()); + + std::unique_ptr<MemoryBuffer> Binary = object::OffloadBinary::write(Image); + + // Copy the data to a new buffer so we can modify the bytes directly. + SmallVector<char> NewBuffer; + std::copy(Binary->getBufferStart(), Binary->getBufferEnd(), + std::back_inserter(NewBuffer)); + auto *TheHeader = + reinterpret_cast<object::OffloadBinary::Header *>(&NewBuffer[0]); + if (Doc.Version) + TheHeader->Version = *Doc.Version; + if (Doc.Size) + TheHeader->Size = *Doc.Size; + if (Doc.EntryOffset) + TheHeader->EntryOffset = *Doc.EntryOffset; + if (Doc.EntrySize) + TheHeader->EntrySize = *Doc.EntrySize; + + Out.write(NewBuffer.begin(), NewBuffer.size()); + } + + return true; +} + +} // namespace yaml +} // namespace llvm diff --git a/llvm/lib/ObjectYAML/OffloadYAML.cpp b/llvm/lib/ObjectYAML/OffloadYAML.cpp new file mode 100644 index 000000000000..d5a0edde2179 --- /dev/null +++ b/llvm/lib/ObjectYAML/OffloadYAML.cpp @@ -0,0 +1,78 @@ +//===- OffloadYAML.cpp - Offload Binary YAMLIO implementation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of offload +// binaries.
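The new OffloadYAML and DXContainerYAML files both follow the standard YAMLTraits pattern: a single MappingTraits<T> specialization drives parsing and emission symmetrically, with mapRequired rejecting documents that omit a key and mapOptional substituting a default (and suppressing default-valued keys on output). A generic sketch of the pattern; the struct and keys here are hypothetical, not part of this patch:

    #include "llvm/Support/YAMLTraits.h"
    #include <string>

    struct PartInfo {
      std::string Name;
      uint32_t Size = 0;
    };

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<PartInfo> {
      static void mapping(IO &IO, PartInfo &P) {
        IO.mapRequired("Name", P.Name);            // parse error if absent
        IO.mapOptional("Size", P.Size, uint32_t(0)); // defaulted; omitted on output when 0
      }
    };
    } // namespace yaml
    } // namespace llvm

Reading then becomes yaml::Input In(Buffer); PartInfo P; In >> P; and writing yaml::Output Out(OS); Out << P; with the same traits serving both directions.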
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/OffloadYAML.h" + +namespace llvm { + +namespace yaml { + +void ScalarEnumerationTraits<object::ImageKind>::enumeration( + IO &IO, object::ImageKind &Value) { +#define ECase(X) IO.enumCase(Value, #X, object::X) + ECase(IMG_None); + ECase(IMG_Object); + ECase(IMG_Bitcode); + ECase(IMG_Cubin); + ECase(IMG_Fatbinary); + ECase(IMG_PTX); + ECase(IMG_LAST); +#undef ECase + IO.enumFallback<Hex16>(Value); +} + +void ScalarEnumerationTraits<object::OffloadKind>::enumeration( + IO &IO, object::OffloadKind &Value) { +#define ECase(X) IO.enumCase(Value, #X, object::X) + ECase(OFK_None); + ECase(OFK_OpenMP); + ECase(OFK_Cuda); + ECase(OFK_HIP); + ECase(OFK_LAST); +#undef ECase + IO.enumFallback<Hex16>(Value); +} + +void MappingTraits<OffloadYAML::Binary>::mapping(IO &IO, + OffloadYAML::Binary &O) { + assert(!IO.getContext() && "The IO context is initialized already"); + IO.setContext(&O); + IO.mapTag("!Offload", true); + IO.mapOptional("Version", O.Version); + IO.mapOptional("Size", O.Size); + IO.mapOptional("EntryOffset", O.EntryOffset); + IO.mapOptional("EntrySize", O.EntrySize); + IO.mapRequired("Members", O.Members); + IO.setContext(nullptr); +} + +void MappingTraits<OffloadYAML::Binary::StringEntry>::mapping( + IO &IO, OffloadYAML::Binary::StringEntry &SE) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapRequired("Key", SE.Key); + IO.mapRequired("Value", SE.Value); +} + +void MappingTraits<OffloadYAML::Binary::Member>::mapping( + IO &IO, OffloadYAML::Binary::Member &M) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("ImageKind", M.ImageKind); + IO.mapOptional("OffloadKind", M.OffloadKind); + IO.mapOptional("Flags", M.Flags); + IO.mapOptional("String", M.StringEntries); + IO.mapOptional("Content", M.Content); +} + +} // namespace yaml + +} // namespace llvm diff --git a/llvm/lib/ObjectYAML/WasmEmitter.cpp b/llvm/lib/ObjectYAML/WasmEmitter.cpp index 2aa2ef3e5541..6230312eff7b 100644 --- a/llvm/lib/ObjectYAML/WasmEmitter.cpp +++ b/llvm/lib/ObjectYAML/WasmEmitter.cpp @@ -33,7 +33,7 @@ private: void writeRelocSection(raw_ostream &OS, WasmYAML::Section &Sec, uint32_t SectionIndex); - void writeInitExpr(raw_ostream &OS, const wasm::WasmInitExpr &InitExpr); + void writeInitExpr(raw_ostream &OS, const WasmYAML::InitExpr &InitExpr); void writeSectionContent(raw_ostream &OS, WasmYAML::CustomSection &Section); void writeSectionContent(raw_ostream &OS, WasmYAML::TypeSection &Section); @@ -129,29 +129,34 @@ void WasmWriter::reportError(const Twine &Msg) { } void WasmWriter::writeInitExpr(raw_ostream &OS, - const wasm::WasmInitExpr &InitExpr) { - writeUint8(OS, InitExpr.Opcode); - switch (InitExpr.Opcode) { - case wasm::WASM_OPCODE_I32_CONST: - encodeSLEB128(InitExpr.Value.Int32, OS); - break; - case wasm::WASM_OPCODE_I64_CONST: - encodeSLEB128(InitExpr.Value.Int64, OS); - break; - case wasm::WASM_OPCODE_F32_CONST: - writeUint32(OS, InitExpr.Value.Float32); - break; - case wasm::WASM_OPCODE_F64_CONST: - writeUint64(OS, InitExpr.Value.Float64); - break; - case wasm::WASM_OPCODE_GLOBAL_GET: - encodeULEB128(InitExpr.Value.Global, OS); - break; - default: - reportError("unknown opcode in init_expr: " + Twine(InitExpr.Opcode)); - return; + const WasmYAML::InitExpr &InitExpr) { + if (InitExpr.Extended) { + InitExpr.Body.writeAsBinary(OS); + } else { + writeUint8(OS, InitExpr.Inst.Opcode); + switch (InitExpr.Inst.Opcode) { + case wasm::WASM_OPCODE_I32_CONST: + encodeSLEB128(InitExpr.Inst.Value.Int32, OS); + break; + case wasm::WASM_OPCODE_I64_CONST: + encodeSLEB128(InitExpr.Inst.Value.Int64, OS); + break; +
case wasm::WASM_OPCODE_F32_CONST: + writeUint32(OS, InitExpr.Inst.Value.Float32); + break; + case wasm::WASM_OPCODE_F64_CONST: + writeUint64(OS, InitExpr.Inst.Value.Float64); + break; + case wasm::WASM_OPCODE_GLOBAL_GET: + encodeULEB128(InitExpr.Inst.Value.Global, OS); + break; + default: + reportError("unknown opcode in init_expr: " + + Twine(InitExpr.Inst.Opcode)); + return; + } + writeUint8(OS, wasm::WASM_OPCODE_END); } - writeUint8(OS, wasm::WASM_OPCODE_END); } void WasmWriter::writeSectionContent(raw_ostream &OS, @@ -187,13 +192,10 @@ void WasmWriter::writeSectionContent(raw_ostream &OS, // SYMBOL_TABLE subsection if (Section.SymbolTable.size()) { writeUint8(OS, wasm::WASM_SYMBOL_TABLE); - encodeULEB128(Section.SymbolTable.size(), SubSection.getStream()); -#ifndef NDEBUG - uint32_t SymbolIndex = 0; -#endif - for (const WasmYAML::SymbolInfo &Info : Section.SymbolTable) { - assert(Info.Index == SymbolIndex++); + for (auto Sym : llvm::enumerate(Section.SymbolTable)) { + const WasmYAML::SymbolInfo &Info = Sym.value(); + assert(Info.Index == Sym.index()); writeUint8(SubSection.getStream(), Info.Kind); encodeULEB128(Info.Flags, SubSection.getStream()); switch (Info.Kind) { @@ -481,7 +483,7 @@ void WasmWriter::writeSectionContent(raw_ostream &OS, ++ExpectedIndex; writeUint8(OS, Global.Type); writeUint8(OS, Global.Mutable); - writeInitExpr(OS, Global.InitExpr); + writeInitExpr(OS, Global.Init); } } diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp index 3f0172ebf361..7ca422487df2 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -367,8 +367,7 @@ void MappingTraits::mapping( void MappingTraits<WasmYAML::Limits>::mapping(IO &IO, WasmYAML::Limits &Limits) { - if (!IO.outputting() || Limits.Flags) - IO.mapOptional("Flags", Limits.Flags); + IO.mapOptional("Flags", Limits.Flags, 0); IO.mapRequired("Minimum", Limits.Minimum); if (!IO.outputting() || Limits.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX) IO.mapOptional("Maximum", Limits.Maximum); @@ -376,8 +375,7 @@ void MappingTraits<WasmYAML::ElemSegment>::mapping( IO &IO, WasmYAML::ElemSegment &Segment) { - if (!IO.outputting() || Segment.Flags) - IO.mapOptional("Flags", Segment.Flags); + IO.mapOptional("Flags", Segment.Flags, 0); if (!IO.outputting() || Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER) IO.mapOptional("TableNumber", Segment.TableNumber); @@ -420,35 +418,40 @@ void MappingTraits<WasmYAML::Global>::mapping(IO &IO, IO.mapRequired("Index", Global.Index); IO.mapRequired("Type", Global.Type); IO.mapRequired("Mutable", Global.Mutable); - IO.mapRequired("InitExpr", Global.InitExpr); + IO.mapRequired("InitExpr", Global.Init); } -void MappingTraits<wasm::WasmInitExpr>::mapping(IO &IO, - wasm::WasmInitExpr &Expr) { - WasmYAML::Opcode Op = Expr.Opcode; - IO.mapRequired("Opcode", Op); - Expr.Opcode = Op; - switch (Expr.Opcode) { - case wasm::WASM_OPCODE_I32_CONST: - IO.mapRequired("Value", Expr.Value.Int32); - break; - case wasm::WASM_OPCODE_I64_CONST: - IO.mapRequired("Value", Expr.Value.Int64); - break; - case wasm::WASM_OPCODE_F32_CONST: - IO.mapRequired("Value", Expr.Value.Float32); - break; - case wasm::WASM_OPCODE_F64_CONST: - IO.mapRequired("Value", Expr.Value.Float64); - break; - case wasm::WASM_OPCODE_GLOBAL_GET: - IO.mapRequired("Index", Expr.Value.Global); - break; - case wasm::WASM_OPCODE_REF_NULL: { - WasmYAML::ValueType Ty = wasm::WASM_TYPE_EXTERNREF; - IO.mapRequired("Type", Ty); - break; - } +void MappingTraits<WasmYAML::InitExpr>::mapping(IO &IO, + WasmYAML::InitExpr &Expr) { + IO.mapOptional("Extended", Expr.Extended,
false); + if (Expr.Extended) { + IO.mapRequired("Body", Expr.Body); + } else { + WasmYAML::Opcode Op = Expr.Inst.Opcode; + IO.mapRequired("Opcode", Op); + Expr.Inst.Opcode = Op; + switch (Expr.Inst.Opcode) { + case wasm::WASM_OPCODE_I32_CONST: + IO.mapRequired("Value", Expr.Inst.Value.Int32); + break; + case wasm::WASM_OPCODE_I64_CONST: + IO.mapRequired("Value", Expr.Inst.Value.Int64); + break; + case wasm::WASM_OPCODE_F32_CONST: + IO.mapRequired("Value", Expr.Inst.Value.Float32); + break; + case wasm::WASM_OPCODE_F64_CONST: + IO.mapRequired("Value", Expr.Inst.Value.Float64); + break; + case wasm::WASM_OPCODE_GLOBAL_GET: + IO.mapRequired("Index", Expr.Inst.Value.Global); + break; + case wasm::WASM_OPCODE_REF_NULL: { + WasmYAML::ValueType Ty = wasm::WASM_TYPE_EXTERNREF; + IO.mapRequired("Type", Ty); + break; + } + } } } @@ -464,8 +467,8 @@ void MappingTraits<WasmYAML::DataSegment>::mapping( if ((Segment.InitFlags & wasm::WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { IO.mapRequired("Offset", Segment.Offset); } else { - Segment.Offset.Opcode = wasm::WASM_OPCODE_I32_CONST; - Segment.Offset.Value.Int32 = 0; + Segment.Offset.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST; + Segment.Offset.Inst.Value.Int32 = 0; } IO.mapRequired("Content", Segment.Content); } diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index 2a7204d3f773..1ceac6c05893 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -212,8 +212,8 @@ bool XCOFFWriter::initStringTable() { for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym : YamlSym.AuxEntries) { if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get())) - if (nameShouldBeInStringTable(AS->FileNameOrString.getValueOr(""))) - StrTblBuilder.add(AS->FileNameOrString.getValueOr("")); + if (nameShouldBeInStringTable(AS->FileNameOrString.value_or(""))) + StrTblBuilder.add(AS->FileNameOrString.value_or("")); } } @@ -247,8 +247,7 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { Twine(AuxCount)); return false; } - YamlSym.NumberOfAuxEntries = - YamlSym.NumberOfAuxEntries.getValueOr(AuxCount); + YamlSym.NumberOfAuxEntries = YamlSym.NumberOfAuxEntries.value_or(AuxCount); // Add the number of auxiliary symbols to the total number. InitFileHdr.NumberOfSymTableEntries += *YamlSym.NumberOfAuxEntries; } @@ -378,59 +377,60 @@ void XCOFFWriter::writeFileHeader() { } void XCOFFWriter::writeAuxFileHeader() { - W.write(InitAuxFileHdr.Magic.getValueOr(yaml::Hex16(1))); - W.write(InitAuxFileHdr.Version.getValueOr(yaml::Hex16(1))); + W.write(InitAuxFileHdr.Magic.value_or(yaml::Hex16(1))); + W.write(InitAuxFileHdr.Version.value_or(yaml::Hex16(1))); if (Is64Bit) { W.OS.write_zeros(4); // Reserved for debugger.
- W.write(InitAuxFileHdr.TextStartAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.DataStartAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.TOCAnchorAddr.getValueOr(yaml::Hex64(0))); + W.write(InitAuxFileHdr.TextStartAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.DataStartAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.TOCAnchorAddr.value_or(yaml::Hex64(0))); } else { - W.write(InitAuxFileHdr.TextSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.InitDataSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.BssDataSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.EntryPointAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.TextStartAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.DataStartAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.TOCAnchorAddr.getValueOr(yaml::Hex64(0))); + W.write(InitAuxFileHdr.TextSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.InitDataSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.BssDataSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.EntryPointAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.TextStartAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.DataStartAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.TOCAnchorAddr.value_or(yaml::Hex64(0))); } - W.write(InitAuxFileHdr.SecNumOfEntryPoint.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfText.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfData.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfTOC.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfLoader.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfBSS.getValueOr(0)); - W.write(InitAuxFileHdr.MaxAlignOfText.getValueOr(yaml::Hex16(0))); - W.write(InitAuxFileHdr.MaxAlignOfData.getValueOr(yaml::Hex16(0))); - W.write(InitAuxFileHdr.ModuleType.getValueOr(yaml::Hex16(0))); - W.write(InitAuxFileHdr.CpuFlag.getValueOr(yaml::Hex8(0))); + W.write(InitAuxFileHdr.SecNumOfEntryPoint.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfText.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfData.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfTOC.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfLoader.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfBSS.value_or(0)); + W.write(InitAuxFileHdr.MaxAlignOfText.value_or(yaml::Hex16(0))); + W.write(InitAuxFileHdr.MaxAlignOfData.value_or(yaml::Hex16(0))); + W.write(InitAuxFileHdr.ModuleType.value_or(yaml::Hex16(0))); + W.write(InitAuxFileHdr.CpuFlag.value_or(yaml::Hex8(0))); W.write(0); // Reserved for CPU type. 
if (Is64Bit) { - W.write(InitAuxFileHdr.TextPageSize.getValueOr(yaml::Hex8(0))); - W.write(InitAuxFileHdr.DataPageSize.getValueOr(yaml::Hex8(0))); - W.write(InitAuxFileHdr.StackPageSize.getValueOr(yaml::Hex8(0))); + W.write(InitAuxFileHdr.TextPageSize.value_or(yaml::Hex8(0))); + W.write(InitAuxFileHdr.DataPageSize.value_or(yaml::Hex8(0))); + W.write(InitAuxFileHdr.StackPageSize.value_or(yaml::Hex8(0))); W.write( - InitAuxFileHdr.FlagAndTDataAlignment.getValueOr(yaml::Hex8(0x80))); - W.write(InitAuxFileHdr.TextSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.InitDataSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.BssDataSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.EntryPointAddr.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.MaxStackSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.MaxDataSize.getValueOr(yaml::Hex64(0))); + InitAuxFileHdr.FlagAndTDataAlignment.value_or(yaml::Hex8(0x80))); + W.write(InitAuxFileHdr.TextSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.InitDataSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.BssDataSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.EntryPointAddr.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.MaxStackSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.MaxDataSize.value_or(yaml::Hex64(0))); } else { - W.write(InitAuxFileHdr.MaxStackSize.getValueOr(yaml::Hex64(0))); - W.write(InitAuxFileHdr.MaxDataSize.getValueOr(yaml::Hex64(0))); + W.write(InitAuxFileHdr.MaxStackSize.value_or(yaml::Hex64(0))); + W.write(InitAuxFileHdr.MaxDataSize.value_or(yaml::Hex64(0))); W.OS.write_zeros(4); // Reserved for debugger. - W.write(InitAuxFileHdr.TextPageSize.getValueOr(yaml::Hex8(0))); - W.write(InitAuxFileHdr.DataPageSize.getValueOr(yaml::Hex8(0))); - W.write(InitAuxFileHdr.StackPageSize.getValueOr(yaml::Hex8(0))); + W.write(InitAuxFileHdr.TextPageSize.value_or(yaml::Hex8(0))); + W.write(InitAuxFileHdr.DataPageSize.value_or(yaml::Hex8(0))); + W.write(InitAuxFileHdr.StackPageSize.value_or(yaml::Hex8(0))); W.write( - InitAuxFileHdr.FlagAndTDataAlignment.getValueOr(yaml::Hex8(0))); + InitAuxFileHdr.FlagAndTDataAlignment.value_or(yaml::Hex8(0))); } - W.write(InitAuxFileHdr.SecNumOfTData.getValueOr(0)); - W.write(InitAuxFileHdr.SecNumOfTBSS.getValueOr(0)); + W.write(InitAuxFileHdr.SecNumOfTData.value_or(0)); + W.write(InitAuxFileHdr.SecNumOfTBSS.value_or(0)); if (Is64Bit) { - W.write(InitAuxFileHdr.Flag.getValueOr(yaml::Hex16(XCOFF::SHR_SYMTAB))); + W.write( + InitAuxFileHdr.Flag.value_or(yaml::Hex16(XCOFF::SHR_SYMTAB))); if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize64) W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize64); } else if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) { @@ -526,52 +526,52 @@ bool XCOFFWriter::writeRelocations() { void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { if (Is64Bit) { - W.write(AuxSym.SectionOrLengthLo.getValueOr(0)); - W.write(AuxSym.ParameterHashIndex.getValueOr(0)); - W.write(AuxSym.TypeChkSectNum.getValueOr(0)); - W.write(AuxSym.SymbolAlignmentAndType.getValueOr(0)); - W.write(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR)); - W.write(AuxSym.SectionOrLengthHi.getValueOr(0)); + W.write(AuxSym.SectionOrLengthLo.value_or(0)); + W.write(AuxSym.ParameterHashIndex.value_or(0)); + W.write(AuxSym.TypeChkSectNum.value_or(0)); + W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); + 
W.write(AuxSym.SectionOrLengthHi.value_or(0)); W.write(0); W.write(XCOFF::AUX_CSECT); } else { - W.write(AuxSym.SectionOrLength.getValueOr(0)); - W.write(AuxSym.ParameterHashIndex.getValueOr(0)); - W.write(AuxSym.TypeChkSectNum.getValueOr(0)); - W.write(AuxSym.SymbolAlignmentAndType.getValueOr(0)); - W.write(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR)); - W.write(AuxSym.StabInfoIndex.getValueOr(0)); - W.write(AuxSym.StabSectNum.getValueOr(0)); + W.write(AuxSym.SectionOrLength.value_or(0)); + W.write(AuxSym.ParameterHashIndex.value_or(0)); + W.write(AuxSym.TypeChkSectNum.value_or(0)); + W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); + W.write(AuxSym.StabInfoIndex.value_or(0)); + W.write(AuxSym.StabSectNum.value_or(0)); } } void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32"); - W.write(AuxSym.OffsetToExceptionTbl.getValueOr(0)); - W.write(AuxSym.SizeOfFunction.getValueOr(0)); - W.write(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.write(AuxSym.OffsetToExceptionTbl.value_or(0)); + W.write(AuxSym.SizeOfFunction.value_or(0)); + W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.write(0); W.write(XCOFF::AUX_EXCEPT); } void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { if (Is64Bit) { - W.write(AuxSym.PtrToLineNum.getValueOr(0)); - W.write(AuxSym.SizeOfFunction.getValueOr(0)); - W.write(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.write(AuxSym.PtrToLineNum.value_or(0)); + W.write(AuxSym.SizeOfFunction.value_or(0)); + W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.write(0); W.write(XCOFF::AUX_FCN); } else { - W.write(AuxSym.OffsetToExceptionTbl.getValueOr(0)); - W.write(AuxSym.SizeOfFunction.getValueOr(0)); - W.write(AuxSym.PtrToLineNum.getValueOr(0)); - W.write(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.write(AuxSym.OffsetToExceptionTbl.value_or(0)); + W.write(AuxSym.SizeOfFunction.value_or(0)); + W.write(AuxSym.PtrToLineNum.value_or(0)); + W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.OS.write_zeros(2); } } void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { - StringRef FileName = AuxSym.FileNameOrString.getValueOr(""); + StringRef FileName = AuxSym.FileNameOrString.value_or(""); if (nameShouldBeInStringTable(FileName)) { W.write(0); W.write(StrTblBuilder.getOffset(FileName)); @@ -579,7 +579,7 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { writeName(FileName, W); } W.OS.write_zeros(XCOFF::FileNamePadSize); - W.write(AuxSym.FileStringType.getValueOr(XCOFF::XFT_FN)); + W.write(AuxSym.FileStringType.value_or(XCOFF::XFT_FN)); if (Is64Bit) { W.OS.write_zeros(2); W.write(XCOFF::AUX_FILE); @@ -590,36 +590,36 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { if (Is64Bit) { - W.write(AuxSym.LineNum.getValueOr(0)); + W.write(AuxSym.LineNum.value_or(0)); W.OS.write_zeros(13); W.write(XCOFF::AUX_SYM); } else { W.OS.write_zeros(2); - W.write(AuxSym.LineNumHi.getValueOr(0)); - W.write(AuxSym.LineNumLo.getValueOr(0)); + W.write(AuxSym.LineNumHi.value_or(0)); + W.write(AuxSym.LineNumLo.value_or(0)); W.OS.write_zeros(12); } } void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { if (Is64Bit) { - W.write(AuxSym.LengthOfSectionPortion.getValueOr(0)); - W.write(AuxSym.NumberOfRelocEnt.getValueOr(0)); + 
W.write(AuxSym.LengthOfSectionPortion.value_or(0)); + W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.write(0); W.write(XCOFF::AUX_SECT); } else { - W.write(AuxSym.LengthOfSectionPortion.getValueOr(0)); + W.write(AuxSym.LengthOfSectionPortion.value_or(0)); W.OS.write_zeros(4); - W.write(AuxSym.NumberOfRelocEnt.getValueOr(0)); + W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.OS.write_zeros(6); } } void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64"); - W.write(AuxSym.SectionLength.getValueOr(0)); - W.write(AuxSym.NumberOfRelocEnt.getValueOr(0)); - W.write(AuxSym.NumberOfLineNum.getValueOr(0)); + W.write(AuxSym.SectionLength.value_or(0)); + W.write(AuxSym.NumberOfRelocEnt.value_or(0)); + W.write(AuxSym.NumberOfLineNum.value_or(0)); W.OS.write_zeros(10); } @@ -686,7 +686,7 @@ bool XCOFFWriter::writeSymbols() { W.write(YamlSym.Type); W.write(YamlSym.StorageClass); - uint8_t NumOfAuxSym = YamlSym.NumberOfAuxEntries.getValueOr(0); + uint8_t NumOfAuxSym = YamlSym.NumberOfAuxEntries.value_or(0); W.write(NumOfAuxSym); if (!NumOfAuxSym && !YamlSym.AuxEntries.size()) diff --git a/llvm/lib/ObjectYAML/yaml2obj.cpp b/llvm/lib/ObjectYAML/yaml2obj.cpp index d19fa0a52530..06050e246fbf 100644 --- a/llvm/lib/ObjectYAML/yaml2obj.cpp +++ b/llvm/lib/ObjectYAML/yaml2obj.cpp @@ -42,10 +42,14 @@ bool convertYAML(yaml::Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, return yaml2macho(Doc, Out, ErrHandler); if (Doc.Minidump) return yaml2minidump(*Doc.Minidump, Out, ErrHandler); + if (Doc.Offload) + return yaml2offload(*Doc.Offload, Out, ErrHandler); if (Doc.Wasm) return yaml2wasm(*Doc.Wasm, Out, ErrHandler); if (Doc.Xcoff) return yaml2xcoff(*Doc.Xcoff, Out, ErrHandler); + if (Doc.DXContainer) + return yaml2dxcontainer(*Doc.DXContainer, Out, ErrHandler); ErrHandler("unknown document type"); return false; diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index ad7be5fbec19..fab0fb07cbc8 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -95,6 +95,13 @@ std::vector ArgList::getAllArgValues(OptSpecifier Id) const { return std::vector(Values.begin(), Values.end()); } +void ArgList::addOptInFlag(ArgStringList &Output, OptSpecifier Pos, + OptSpecifier Neg) const { + if (Arg *A = getLastArg(Pos, Neg)) + if (A->getOption().matches(Pos)) + A->render(*this, Output); +} + void ArgList::AddAllArgsExcept(ArgStringList &Output, ArrayRef Ids, ArrayRef ExcludeIds) const { diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 015ca1eec4df..42fde3752724 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/CFLSteensAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallPrinter.h" #include "llvm/Analysis/CostModel.h" #include "llvm/Analysis/CycleAnalysis.h" #include "llvm/Analysis/DDG.h" @@ -185,7 +186,7 @@ #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" #include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/Transforms/Scalar/LoopVersioningLICM.h" -#include "llvm/Transforms/Scalar/LowerAtomic.h" +#include "llvm/Transforms/Scalar/LowerAtomicPass.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" @@ -212,6 +213,7 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/Transforms/Scalar/StructurizeCFG.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" @@ -229,11 +231,13 @@ #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/LowerGlobalDtors.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include "llvm/Transforms/Utils/LowerSwitch.h" #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/Transforms/Utils/MetaRenamer.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" +#include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/RelLookupTableConverter.h" #include "llvm/Transforms/Utils/StripGCRelocates.h" #include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h" @@ -371,6 +375,17 @@ bool shouldPopulateClassToPassNames() { !printAfterPasses().empty(); } +// A pass for testing -print-on-crash. +// DO NOT USE THIS EXCEPT FOR TESTING! +class TriggerCrashPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &, ModuleAnalysisManager &) { + abort(); + return PreservedAnalyses::all(); + } + static StringRef name() { return "TriggerCrashPass"; } +}; + } // namespace PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO, @@ -585,6 +600,10 @@ Expected parseInlinerPassOptions(StringRef Params) { return parseSinglePassOption(Params, "only-mandatory", "InlinerPass"); } +Expected parseCoroSplitPassOptions(StringRef Params) { + return parseSinglePassOption(Params, "reuse-storage", "CoroSplitPass"); +} + Expected parseEarlyCSEPassOptions(StringRef Params) { return parseSinglePassOption(Params, "memssa", "EarlyCSE"); } @@ -679,6 +698,8 @@ Expected parseSimplifyCFGOptions(StringRef Params) { bool Enable = !ParamName.consume_front("no-"); if (ParamName == "forward-switch-cond") { Result.forwardSwitchCondToPhi(Enable); + } else if (ParamName == "switch-range-to-icmp") { + Result.convertSwitchRangeToICmp(Enable); } else if (ParamName == "switch-to-lookup") { Result.convertSwitchToLookupTable(Enable); } else if (ParamName == "keep-loops") { @@ -747,6 +768,24 @@ Expected> parseLoopUnswitchOptions(StringRef Params) { return Result; } +Expected parseLICMOptions(StringRef Params) { + LICMOptions Result; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + bool Enable = !ParamName.consume_front("no-"); + if (ParamName == "allowspeculation") { + Result.AllowSpeculation = Enable; + } else { + return make_error( + formatv("invalid LICM pass parameter '{0}' ", ParamName).str(), + inconvertibleErrorCode()); + } + } + return Result; +} + Expected parseMergedLoadStoreMotionOptions(StringRef Params) { bool Result = false; while (!Params.empty()) { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 93637c890c4f..a5345172aae1 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" #include "llvm/Transforms/Coroutines/CoroCleanup.h" +#include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" #include "llvm/Transforms/Coroutines/CoroEarly.h" #include 
"llvm/Transforms/Coroutines/CoroElide.h" #include "llvm/Transforms/Coroutines/CoroSplit.h" @@ -140,7 +141,7 @@ static cl::opt UseInlineAdvisor( "Use release mode (AOT-compiled model)."))); static cl::opt EnableSyntheticCounts( - "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore, + "enable-npm-synthetic-counts", cl::Hidden, cl::desc("Run synthetic function entry count generation " "pass")); @@ -150,8 +151,7 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); -static cl::opt EnableMemProfiler("enable-mem-prof", cl::init(false), - cl::Hidden, cl::ZeroOrMore, +static cl::opt EnableMemProfiler("enable-mem-prof", cl::Hidden, cl::desc("Enable memory profiler")); static cl::opt EnableModuleInliner("enable-module-inliner", @@ -159,13 +159,13 @@ static cl::opt EnableModuleInliner("enable-module-inliner", cl::desc("Enable module inliner")); static cl::opt PerformMandatoryInliningsFirst( - "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore, + "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::desc("Perform mandatory inlinings module-wide, before performing " "inlining.")); static cl::opt EnableO3NonTrivialUnswitching( "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, - cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3")); + cl::desc("Enable non-trivial loop unswitching for -O3")); static cl::opt EnableEagerlyInvalidateAnalyses( "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, @@ -233,9 +233,7 @@ void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, // Helper to add AnnotationRemarksPass. static void addAnnotationRemarksPass(ModulePassManager &MPM) { - FunctionPassManager FPM; - FPM.addPass(AnnotationRemarksPass()); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); } // Helper to check if the current compilation phase is preparing for LTO @@ -259,14 +257,16 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); // Hoisting of scalars and load expressions. - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); FPM.addPass(LibCallsShrinkWrapPass()); invokePeepholeEPCallbacks(FPM, Level); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. For example, this will form (nearly) @@ -291,14 +291,19 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. 
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass(SimpleLoopUnswitchPass()); if (EnableLoopFlatten) LPM1.addPass(LoopFlattenPass()); @@ -335,7 +340,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. @@ -373,7 +379,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // the simplifications and basic cleanup after all the simplifications. // TODO: Investigate if this is too expensive. FPM.addPass(ADCEPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -408,7 +415,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Global value numbering based sinking. if (EnableGVNSink) { FPM.addPass(GVNSinkPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); } if (EnableConstraintElimination) @@ -421,7 +429,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(JumpThreadingPass()); FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); @@ -438,7 +447,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(PGOMemOPSizeOpt()); FPM.addPass(TailCallElimPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. For example, this will form (nearly) @@ -463,15 +473,20 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Disable header duplication in loop rotation at -Oz. LPM1.addPass( LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. 
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass( SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && EnableO3NonTrivialUnswitching)); @@ -510,7 +525,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. @@ -567,7 +583,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(CoroElidePass()); @@ -575,8 +592,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, for (auto &C : ScalarOptimizerLateEPCallbacks) C(FPM, Level); - FPM.addPass(SimplifyCFGPass( - SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -596,7 +615,8 @@ void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level, bool RunProfileGen, bool IsCS, std::string ProfileFile, - std::string ProfileRemappingFile) { + std::string ProfileRemappingFile, + ThinOrFullLTOPhase LTOPhase) { assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); if (!IsCS && !DisablePreInliner) { InlineParams IP; @@ -608,13 +628,16 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, // performance testing. // FIXME: this comment is cargo culted from the old pass manager, revisit). IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; - ModuleInlinerWrapperPass MIWP(IP); + ModuleInlinerWrapperPass MIWP( + IP, /* MandatoryFirst */ true, + InlineContext{LTOPhase, InlinePass::EarlyInliner}); CGSCCPassManager &CGPipeline = MIWP.getPM(); FunctionPassManager FPM; FPM.addPass(SROAPass()); FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. - FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks. + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove basic blocks. FPM.addPass(InstCombinePass()); // Combine silly sequences. invokePeepholeEPCallbacks(FPM, Level); @@ -641,13 +664,13 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, // Perform PGO instrumentation. MPM.addPass(PGOInstrumentationGen(IsCS)); - FunctionPassManager FPM; // Disable header duplication in loop rotation at -Oz. 
- FPM.addPass(createFunctionToLoopPassAdaptor( - LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false, - /*UseBlockFrequencyInfo=*/false)); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), - PTO.EagerlyInvalidateAnalyses)); + MPM.addPass(createModuleToFunctionPassAdaptor( + createFunctionToLoopPassAdaptor( + LoopRotatePass(Level != OptimizationLevel::Oz), + /*UseMemorySSA=*/false, + /*UseBlockFrequencyInfo=*/false), + PTO.EagerlyInvalidateAnalyses)); // Add the profile lowering pass. InstrProfOptions Options; @@ -692,6 +715,12 @@ ModuleInlinerWrapperPass PassBuilder::buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { InlineParams IP = getInlineParamsFromOptLevel(Level); + // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to + // disable hot callsite inline (as much as possible [1]) because it makes + // profile annotation in the backend inaccurate. + // + // [1] Note the cost of a function could be below zero due to erased + // prologue / epilogue. if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; @@ -699,8 +728,10 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, if (PGOOpt) IP.EnableDeferral = EnablePGOInlineDeferral; - ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, - UseInlineAdvisor, MaxDevirtIterations); + ModuleInlinerWrapperPass MIWP( + IP, PerformMandatoryInliningsFirst, + InlineContext{Phase, InlinePass::CGSCCInliner}, + UseInlineAdvisor, MaxDevirtIterations); // Require the GlobalsAA analysis for the module so we can query it within // the CGSCC pipeline. @@ -765,6 +796,12 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, ModulePassManager MPM; InlineParams IP = getInlineParamsFromOptLevel(Level); + // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to + // disable hot callsite inline (as much as possible [1]) because it makes + // profile annotation in the backend inaccurate. + // + // [1] Note the cost of a function could be below zero due to erased + // prologue / epilogue. if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; @@ -780,7 +817,7 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, // inline deferral logic in module inliner. IP.EnableDeferral = false; - MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); + MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); MPM.addPass(createModuleToFunctionPassAdaptor( buildFunctionSimplificationPipeline(Level, Phase), @@ -832,6 +869,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Do basic inference of function attributes from known properties of system // libraries and other oracles. MPM.addPass(InferFunctionAttrsPass()); + MPM.addPass(CoroEarlyPass()); // Create an early function pass manager to cleanup the output of the // frontend. 
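The new comments above explain why buildInlinerPipeline and buildModuleInlinerPipeline zero the hot-callsite threshold for ThinLTO pre-link with SamplePGO. A standalone sketch of that gating, with simplified stand-in types (the real InlineParams and PGOOptions carry many more fields):

  enum class LTOPhase { None, ThinLTOPreLink, ThinLTOPostLink };
  enum class PGOAction { None, IRInstr, IRUse, SampleUse };
  struct InlineParamsSketch { int HotCallSiteThreshold = 3000; };

  void tuneForPhase(InlineParamsSketch &IP, LTOPhase Phase, PGOAction Action) {
    // Threshold 0 disables hot-callsite inlining (as much as possible) so the
    // backend's profile annotation stays accurate after the pre-link step.
    if (Phase == LTOPhase::ThinLTOPreLink && Action == PGOAction::SampleUse)
      IP.HotCallSiteThreshold = 0;
  }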
@@ -842,7 +880,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SimplifyCFGPass()); EarlyFPM.addPass(SROAPass()); EarlyFPM.addPass(EarlyCSEPass()); - EarlyFPM.addPass(CoroEarlyPass()); if (Level == OptimizationLevel::O3) EarlyFPM.addPass(CallSiteSplittingPass()); @@ -928,7 +965,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, GlobalCleanupPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(GlobalCleanupPM, Level); - GlobalCleanupPM.addPass(SimplifyCFGPass()); + GlobalCleanupPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), PTO.EagerlyInvalidateAnalyses)); @@ -939,7 +977,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, addPGOInstrPasses(MPM, Level, /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr, /* IsCS */ false, PGOOpt->ProfileFile, - PGOOpt->ProfileRemappingFile); + PGOOpt->ProfileRemappingFile, Phase); MPM.addPass(PGOIndirectCallPromotion(false, false)); } if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && @@ -955,6 +993,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, else MPM.addPass(buildInlinerPipeline(Level, Phase)); + MPM.addPass(CoroCleanupPass()); + if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); MPM.addPass(ModuleMemProfilerPass()); @@ -1007,7 +1047,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, ExtraPasses.addPass(CorrelatedValuePropagationPass()); ExtraPasses.addPass(InstCombinePass()); LoopPassManager LPM; - LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); ExtraPasses.addPass( @@ -1015,7 +1056,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, ExtraPasses.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - ExtraPasses.addPass(SimplifyCFGPass()); + ExtraPasses.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); ExtraPasses.addPass(InstCombinePass()); FPM.addPass(std::move(ExtraPasses)); } @@ -1031,6 +1073,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, // before SLP vectorization. 
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -1073,7 +1116,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } @@ -1087,7 +1131,9 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, ModulePassManager PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, - bool LTOPreLink) { + ThinOrFullLTOPhase LTOPhase) { + const bool LTOPreLink = (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink || + LTOPhase == ThinOrFullLTOPhase::FullLTOPreLink); ModulePassManager MPM; // Optimize globals now that the module is fully simplified. @@ -1127,21 +1173,24 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, if (PGOOpt->CSAction == PGOOptions::CSIRInstr) addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, /* IsCS */ true, PGOOpt->CSProfileGenFile, - PGOOpt->ProfileRemappingFile); + PGOOpt->ProfileRemappingFile, LTOPhase); else if (PGOOpt->CSAction == PGOOptions::CSIRUse) addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, /* IsCS */ true, PGOOpt->ProfileFile, - PGOOpt->ProfileRemappingFile); + PGOOpt->ProfileRemappingFile, LTOPhase); } - // Re-require GloblasAA here prior to function passes. This is particularly + // Re-compute GlobalsAA here prior to function passes. This is particularly // useful as the above will have inlined, DCE'ed, and function-attr // propagated everything. We should at this point have a reasonably minimal // and richly annotated call graph. By computing aliasing and mod/ref // information for all local globals here, the late loop passes and notably // the vectorizer will be able to use them to help recognize vectorizable // memory operations. - MPM.addPass(RequireAnalysisPass()); + MPM.addPass(RecomputeGlobalsAAPass()); + + for (auto &C : OptimizerEarlyEPCallbacks) + C(MPM, Level); FunctionPassManager OptimizePM; OptimizePM.addPass(Float2IntPass()); @@ -1202,9 +1251,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. - OptimizePM.addPass(SimplifyCFGPass()); - - OptimizePM.addPass(CoroCleanupPass()); + OptimizePM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), @@ -1230,9 +1278,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, if (PTO.MergeFunctions) MPM.addPass(MergeFunctionsPass()); - if (PTO.CallGraphProfile) - MPM.addPass(CGProfilePass()); - // Now we need to do some global optimization transforms. // FIXME: It would seem like these should come first in the optimization // pipeline and maybe be the bottom of the canonicalization pipeline? 
Weird @@ -1240,6 +1285,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, MPM.addPass(GlobalDCEPass()); MPM.addPass(ConstantMergePass()); + if (PTO.CallGraphProfile && !LTOPreLink) + MPM.addPass(CGProfilePass()); + // TODO: Relative look table converter pass caused an issue when full lto is // enabled. See https://reviews.llvm.org/D94355 for more details. // Until the issue fixed, disable this pass during pre-linking phase. @@ -1270,13 +1318,14 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, if (PGOOpt && PGOOpt->DebugInfoForProfiling) MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); + const ThinOrFullLTOPhase LTOPhase = LTOPreLink + ? ThinOrFullLTOPhase::FullLTOPreLink + : ThinOrFullLTOPhase::None; // Add the core simplification pipeline. - MPM.addPass(buildModuleSimplificationPipeline( - Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink - : ThinOrFullLTOPhase::None)); + MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase)); // Now add the optimization pipeline. - MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink)); + MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPhase)); if (PGOOpt && PGOOpt->PseudoProbeForProfiling && PGOOpt->Action == PGOOptions::SampleUse) @@ -1330,11 +1379,6 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // Reduce the size of the IR as much as possible. MPM.addPass(GlobalOptPass()); - // Module simplification splits coroutines, but does not fully clean up - // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up - // on these, we schedule the cleanup here. - MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); - if (PGOOpt && PGOOpt->PseudoProbeForProfiling && PGOOpt->Action == PGOOptions::SampleUse) MPM.addPass(PseudoProbeUpdatePass()); @@ -1400,7 +1444,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( Level, ThinOrFullLTOPhase::ThinLTOPostLink)); // Now add the optimization pipeline. - MPM.addPass(buildModuleOptimizationPipeline(Level)); + MPM.addPass(buildModuleOptimizationPipeline( + Level, ThinOrFullLTOPhase::ThinLTOPostLink)); // Emit annotation remarks. addAnnotationRemarksPass(MPM); @@ -1425,6 +1470,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // Convert @llvm.global.annotations to !annotation metadata. MPM.addPass(Annotation2MetadataPass()); + for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) + C(MPM, Level); + // Create a function that performs CFI checks for cross-DSO calls with targets // in the current module. MPM.addPass(CrossDSOCFIPass()); @@ -1438,6 +1486,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // in ICP. MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) + C(MPM, Level); + // Emit annotation remarks. addAnnotationRemarksPass(MPM); @@ -1469,10 +1520,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(InferFunctionAttrsPass()); if (Level.getSpeedupLevel() > 1) { - FunctionPassManager EarlyFPM; - EarlyFPM.addPass(CallSiteSplittingPass()); MPM.addPass(createModuleToFunctionPassAdaptor( - std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); + CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses)); // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. 
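Several hunks above thread a ThinOrFullLTOPhase value through functions that previously took a bool LTOPreLink, deriving the bool back where it is still needed. A standalone sketch of the two mappings used (enumerators as in the patch; helper names hypothetical):

  enum class ThinOrFullLTOPhase {
    None, ThinLTOPreLink, ThinLTOPostLink, FullLTOPreLink, FullLTOPostLink
  };

  ThinOrFullLTOPhase phaseForPreLink(bool LTOPreLink) {
    return LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink
                      : ThinOrFullLTOPhase::None;
  }

  bool isPreLink(ThinOrFullLTOPhase P) {
    return P == ThinOrFullLTOPhase::ThinLTOPreLink ||
           P == ThinOrFullLTOPhase::FullLTOPreLink;
  }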
@@ -1519,6 +1568,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // pipeline). MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) + C(MPM, Level); + // Emit annotation remarks. addAnnotationRemarksPass(MPM); @@ -1556,7 +1608,11 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // valuable as the inliner doesn't currently care whether it is inlining an // invoke or a call. // Run the inliner now. - MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level))); + MPM.addPass(ModuleInlinerWrapperPass( + getInlineParamsFromOptLevel(Level), + /* MandatoryFirst */ true, + InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, + InlinePass::CGSCCInliner})); // Optimize globals again after we ran the inliner. MPM.addPass(GlobalOptPass()); @@ -1573,7 +1629,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); + FPM.addPass(JumpThreadingPass()); // Do a post inline PGO instrumentation and use pass. This is a context // sensitive PGO pass. @@ -1581,11 +1637,13 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PGOOpt->CSAction == PGOOptions::CSIRInstr) addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, /* IsCS */ true, PGOOpt->CSProfileGenFile, - PGOOpt->ProfileRemappingFile); + PGOOpt->ProfileRemappingFile, + ThinOrFullLTOPhase::FullLTOPostLink); else if (PGOOpt->CSAction == PGOOptions::CSIRUse) addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, /* IsCS */ true, PGOOpt->ProfileFile, - PGOOpt->ProfileRemappingFile); + PGOOpt->ProfileRemappingFile, + ThinOrFullLTOPhase::FullLTOPostLink); } // Break up allocas @@ -1612,7 +1670,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, FunctionPassManager MainFPM; MainFPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); if (RunNewGVN) @@ -1656,7 +1715,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass())); invokePeepholeEPCallbacks(MainFPM, Level); - MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); + MainFPM.addPass(JumpThreadingPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), PTO.EagerlyInvalidateAnalyses)); @@ -1676,8 +1735,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // Add late LTO optimization passes. // Delete basic blocks, which optimization passes may have killed. - MPM.addPass(createModuleToFunctionPassAdaptor( - SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)))); + MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( + true)))); // Drop bodies of available eternally objects to improve GlobalDCE. MPM.addPass(EliminateAvailableExternallyPass()); @@ -1688,6 +1748,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PTO.MergeFunctions) MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) + MPM.addPass(CGProfilePass()); + + for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) + C(MPM, Level); + // Emit annotation remarks. 
addAnnotationRemarksPass(MPM); @@ -1770,6 +1836,10 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, if (!FPM.isEmpty()) MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } + + for (auto &C : OptimizerEarlyEPCallbacks) + C(MPM, Level); + if (!VectorizerStartEPCallbacks.empty()) { FunctionPassManager FPM; for (auto &C : VectorizerStartEPCallbacks) @@ -1778,11 +1848,14 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } - MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass())); + ModulePassManager CoroPM; + CoroPM.addPass(CoroEarlyPass()); CGSCCPassManager CGPM; CGPM.addPass(CoroSplitPass()); - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); - MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); + CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + CoroPM.addPass(CoroCleanupPass()); + CoroPM.addPass(GlobalDCEPass()); + MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 8e0af11b854d..7c29bffbc327 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -26,7 +26,6 @@ MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) -MODULE_ANALYSIS("asan-globals-md", ASanGlobalsMetadataAnalysis()) MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis()) MODULE_ANALYSIS("ir-similarity", IRSimilarityAnalysis()) @@ -50,9 +49,12 @@ MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass()) MODULE_PASS("cg-profile", CGProfilePass()) MODULE_PASS("check-debugify", NewPMCheckDebugifyPass()) MODULE_PASS("constmerge", ConstantMergePass()) +MODULE_PASS("coro-early", CoroEarlyPass()) +MODULE_PASS("coro-cleanup", CoroCleanupPass()) MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass()) MODULE_PASS("deadargelim", DeadArgumentEliminationPass()) MODULE_PASS("debugify", NewPMDebugifyPass()) +MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass()) MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass()) MODULE_PASS("extract-blocks", BlockExtractorPass()) MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) @@ -64,6 +66,7 @@ MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) MODULE_PASS("inferattrs", InferFunctionAttrsPass()) MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass()) +MODULE_PASS("inliner-ml-advisor-release", ModuleInlinerWrapperPass(getInlineParams(), true, {}, InliningAdvisorMode::Release, 0)) MODULE_PASS("print", InlineAdvisorAnalysisPrinterPass(dbgs())) MODULE_PASS("inliner-wrapper-no-mandatory-first", ModuleInlinerWrapperPass( getInlineParams(), @@ -76,6 +79,7 @@ MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("ipsccp", IPSCCPPass()) MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(dbgs())) +MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) MODULE_PASS("metarenamer", MetaRenamerPass()) MODULE_PASS("mergefunc", MergeFunctionsPass()) @@ -94,6 +98,7 @@ MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs())) 
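In the O0 hunk above, CoroEarly, CoroSplit, CoroCleanup and a GlobalDCE run are bundled into one sub-pipeline behind CoroConditionalWrapper, so the whole bundle can be skipped for modules without coroutines. A hedged standalone sketch of that wrapper shape (simplified types; the predicate is assumed, not LLVM's actual check):

  #include <functional>
  #include <vector>

  struct ModuleSketch { bool HasCoroIntrinsics = false; };

  struct ConditionalPipelineSketch {
    std::vector<std::function<void(ModuleSketch &)>> Passes;
    void run(ModuleSketch &M) {
      if (!M.HasCoroIntrinsics)
        return; // skip the entire coroutine lowering bundle
      for (auto &P : Passes)
        P(M);
    }
  };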
MODULE_PASS("print-must-be-executed-contexts", MustBeExecutedContextPrinterPass(dbgs())) MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) MODULE_PASS("print", ModuleDebugInfoPrinterPass(dbgs())) +MODULE_PASS("recompute-globalsaa", RecomputeGlobalsAAPass()) MODULE_PASS("rel-lookup-table-converter", RelLookupTableConverterPass()) MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC()) MODULE_PASS("rewrite-symbols", RewriteSymbolPass()) @@ -109,7 +114,9 @@ MODULE_PASS("strip-debug-declare", StripDebugDeclarePass()) MODULE_PASS("strip-nondebug", StripNonDebugSymbolsPass()) MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass()) MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) +MODULE_PASS("trigger-crash", TriggerCrashPass()) MODULE_PASS("verify", VerifierPass()) +MODULE_PASS("view-callgraph", CallGraphViewerPass()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) MODULE_PASS("dfsan", DataFlowSanitizerPass()) MODULE_PASS("msan-module", ModuleMemorySanitizerPass({})) @@ -165,7 +172,6 @@ CGSCC_PASS("invalidate", InvalidateAllAnalysesPass()) CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass()) CGSCC_PASS("attributor-cgscc", AttributorCGSCCPass()) CGSCC_PASS("openmp-opt-cgscc", OpenMPOptCGSCCPass()) -CGSCC_PASS("coro-split", CoroSplitPass()) CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass()) #undef CGSCC_PASS @@ -179,6 +185,13 @@ CGSCC_PASS_WITH_PARAMS("inline", }, parseInlinerPassOptions, "only-mandatory") +CGSCC_PASS_WITH_PARAMS("coro-split", + "CoroSplitPass", + [](bool OptimizeFrame) { + return CoroSplitPass(OptimizeFrame); + }, + parseCoroSplitPassOptions, + "reuse-storage") #undef CGSCC_PASS_WITH_PARAMS #ifndef FUNCTION_ANALYSIS @@ -247,9 +260,7 @@ FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass()) FUNCTION_PASS("chr", ControlHeightReductionPass()) -FUNCTION_PASS("coro-early", CoroEarlyPass()) FUNCTION_PASS("coro-elide", CoroElidePass()) -FUNCTION_PASS("coro-cleanup", CoroCleanupPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) FUNCTION_PASS("dfa-jump-threading", DFAJumpThreadingPass()) @@ -257,8 +268,14 @@ FUNCTION_PASS("div-rem-pairs", DivRemPairsPass()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dot-cfg", CFGPrinterPass()) FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass()) -FUNCTION_PASS("dot-dom", DomTreePrinterPass()) -FUNCTION_PASS("dot-dom-only", DomTreeOnlyPrinterPass()) +FUNCTION_PASS("dot-dom", DomPrinter()) +FUNCTION_PASS("dot-dom-only", DomOnlyPrinter()) +FUNCTION_PASS("dot-post-dom", PostDomPrinter()) +FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) +FUNCTION_PASS("view-dom", DomViewer()) +FUNCTION_PASS("view-dom-only", DomOnlyViewer()) +FUNCTION_PASS("view-post-dom", PostDomViewer()) +FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flattencfg", FlattenCFGPass()) FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) @@ -361,6 +378,7 @@ FUNCTION_PASS("verify", SafepointIRVerifierPass()) FUNCTION_PASS("verify", ScalarEvolutionVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) +FUNCTION_PASS("tlshoist", TLSVariableHoistPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) 
FUNCTION_PASS("memprof", MemProfilerPass()) @@ -402,13 +420,6 @@ FUNCTION_PASS_WITH_PARAMS("loop-unroll", "no-profile-peeling;profile-peeling;" "no-runtime;runtime;" "no-upperbound;upperbound") -FUNCTION_PASS_WITH_PARAMS("asan", - "AddressSanitizerPass", - [](AddressSanitizerOptions Opts) { - return AddressSanitizerPass(Opts); - }, - parseASanPassOptions, - "kernel") FUNCTION_PASS_WITH_PARAMS("msan", "MemorySanitizerPass", [](MemorySanitizerOptions Opts) { @@ -423,6 +434,7 @@ FUNCTION_PASS_WITH_PARAMS("simplifycfg", }, parseSimplifyCFGOptions, "no-forward-switch-cond;forward-switch-cond;" + "no-switch-range-to-icmp;switch-range-to-icmp;" "no-switch-to-lookup;switch-to-lookup;" "no-keep-loops;keep-loops;" "no-hoist-common-insts;hoist-common-insts;" @@ -466,7 +478,6 @@ FUNCTION_PASS_WITH_PARAMS("print", #ifndef LOOPNEST_PASS #define LOOPNEST_PASS(NAME, CREATE_PASS) #endif -LOOPNEST_PASS("lnicm", LNICMPass()) LOOPNEST_PASS("loop-flatten", LoopFlattenPass()) LOOPNEST_PASS("loop-interchange", LoopInterchangePass()) LOOPNEST_PASS("loop-unroll-and-jam", LoopUnrollAndJamPass()) @@ -489,7 +500,6 @@ LOOP_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) -LOOP_PASS("licm", LICMPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-rotate", LoopRotatePass()) @@ -522,4 +532,18 @@ LOOP_PASS_WITH_PARAMS("simple-loop-unswitch", }, parseLoopUnswitchOptions, "nontrivial;no-nontrivial;trivial;no-trivial") + +LOOP_PASS_WITH_PARAMS("licm", "LICMPass", + [](LICMOptions Params) { + return LICMPass(Params); + }, + parseLICMOptions, + "allowspeculation"); + +LOOP_PASS_WITH_PARAMS("lnicm", "LNICMPass", + [](LICMOptions Params) { + return LNICMPass(Params); + }, + parseLICMOptions, + "allowspeculation"); #undef LOOP_PASS_WITH_PARAMS diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index c42b1cb26f13..ab9f8bf9c957 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" @@ -27,12 +28,14 @@ #include "llvm/IR/PrintPasses.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Program.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -164,6 +167,12 @@ static cl::opt DotCfgDir( cl::desc("Generate dot files into specified directory for changed IRs"), cl::Hidden, cl::init("./")); +// An option to print the IR that was being processed when a pass crashes. 
+static cl::opt + PrintCrashIR("print-on-crash", + cl::desc("Print the last form of the IR before crash"), + cl::init(false), cl::Hidden); + namespace { // Perform a system based diff between \p Before and \p After, using @@ -439,19 +448,11 @@ const Module *getModuleForComparison(Any IR) { return nullptr; } -} // namespace - -template ChangeReporter::~ChangeReporter() { - assert(BeforeStack.empty() && "Problem with Change Printer stack."); -} - -template -bool ChangeReporter::isInterestingFunction(const Function &F) { +bool isInterestingFunction(const Function &F) { return isFunctionInPrintList(F.getName()); } -template -bool ChangeReporter::isInterestingPass(StringRef PassID) { +bool isInterestingPass(StringRef PassID) { if (isIgnored(PassID)) return false; @@ -462,8 +463,7 @@ bool ChangeReporter::isInterestingPass(StringRef PassID) { // Return true when this is a pass on IR for which printing // of changes is desired. -template -bool ChangeReporter::isInteresting(Any IR, StringRef PassID) { +bool isInteresting(Any IR, StringRef PassID) { if (!isInterestingPass(PassID)) return false; if (any_isa(IR)) @@ -471,6 +471,12 @@ bool ChangeReporter::isInteresting(Any IR, StringRef PassID) { return true; } +} // namespace + +template ChangeReporter::~ChangeReporter() { + assert(BeforeStack.empty() && "Problem with Change Printer stack."); +} + template void ChangeReporter::saveIRBeforePass(Any IR, StringRef PassID) { // Always need to place something on the stack because invalidated passes @@ -587,7 +593,7 @@ void TextChangeReporter::handleIgnored(StringRef PassID, std::string &Name) { Out << formatv("*** IR Pass {0} on {1} ignored ***\n", PassID, Name); } -IRChangedPrinter::~IRChangedPrinter() {} +IRChangedPrinter::~IRChangedPrinter() = default; void IRChangedPrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (PrintChanged == ChangePrinter::PrintChangedVerbose || @@ -1186,7 +1192,7 @@ void VerifyInstrumentation::registerCallbacks( if (DebugLogging) dbgs() << "Verifying function " << F->getName() << "\n"; - if (verifyFunction(*F)) + if (verifyFunction(*F, &errs())) report_fatal_error("Broken function found, compilation aborted!"); } else if (any_isa(IR) || any_isa(IR)) { @@ -1201,13 +1207,13 @@ void VerifyInstrumentation::registerCallbacks( if (DebugLogging) dbgs() << "Verifying module " << M->getName() << "\n"; - if (verifyModule(*M)) + if (verifyModule(*M, &errs())) report_fatal_error("Broken module found, compilation aborted!"); } }); } -InLineChangePrinter::~InLineChangePrinter() {} +InLineChangePrinter::~InLineChangePrinter() = default; void InLineChangePrinter::generateIRRepresentation(Any IR, StringRef PassID, IRDataT &D) { @@ -2117,6 +2123,51 @@ StandardInstrumentations::StandardInstrumentations( ChangePrinter::PrintChangedDotCfgVerbose), Verify(DebugLogging), VerifyEach(VerifyEach) {} +PrintCrashIRInstrumentation *PrintCrashIRInstrumentation::CrashReporter = + nullptr; + +void PrintCrashIRInstrumentation::reportCrashIR() { dbgs() << SavedIR; } + +void PrintCrashIRInstrumentation::SignalHandler(void *) { + // Called by signal handlers so do not lock here + // Is the PrintCrashIRInstrumentation still alive? 
+ if (!CrashReporter) + return; + + assert(PrintCrashIR && "Did not expect to get here without option set."); + CrashReporter->reportCrashIR(); +} + +PrintCrashIRInstrumentation::~PrintCrashIRInstrumentation() { + if (!CrashReporter) + return; + + assert(PrintCrashIR && "Did not expect to get here without option set."); + CrashReporter = nullptr; +} + +void PrintCrashIRInstrumentation::registerCallbacks( + PassInstrumentationCallbacks &PIC) { + if (!PrintCrashIR || CrashReporter) + return; + + sys::AddSignalHandler(SignalHandler, nullptr); + CrashReporter = this; + + PIC.registerBeforeNonSkippedPassCallback([this](StringRef PassID, Any IR) { + SavedIR.clear(); + raw_string_ostream OS(SavedIR); + OS << formatv("*** Dump of {0}IR Before Last Pass {1}", + llvm::forcePrintModuleIR() ? "Module " : "", PassID); + if (!isInteresting(IR, PassID)) { + OS << " Filtered Out ***\n"; + return; + } + OS << " Started ***\n"; + unwrapAndPrint(OS, IR); + }); +} + void StandardInstrumentations::registerCallbacks( PassInstrumentationCallbacks &PIC, FunctionAnalysisManager *FAM) { PrintIR.registerCallbacks(PIC); @@ -2132,6 +2183,7 @@ void StandardInstrumentations::registerCallbacks( Verify.registerCallbacks(PIC); PrintChangedDiff.registerCallbacks(PIC); WebsiteChangeReporter.registerCallbacks(PIC); + PrintCrashIR.registerCallbacks(PIC); } template class ChangeReporter; diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 94c2bee3590c..f9e58fd6afa5 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -123,13 +123,15 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { return C; } -Counter CounterExpressionBuilder::add(Counter LHS, Counter RHS) { - return simplify(get(CounterExpression(CounterExpression::Add, LHS, RHS))); +Counter CounterExpressionBuilder::add(Counter LHS, Counter RHS, bool Simplify) { + auto Cnt = get(CounterExpression(CounterExpression::Add, LHS, RHS)); + return Simplify ? simplify(Cnt) : Cnt; } -Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS) { - return simplify( - get(CounterExpression(CounterExpression::Subtract, LHS, RHS))); +Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS, + bool Simplify) { + auto Cnt = get(CounterExpression(CounterExpression::Subtract, LHS, RHS)); + return Simplify ? 
simplify(Cnt) : Cnt; } void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const { diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index c6691e321b3c..1a187795a8a0 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Object/Archive.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" #include "llvm/Object/Error.h" @@ -174,7 +175,8 @@ Error RawCoverageFilenamesReader::readUncompressed(CovMapVersion Version, else P.assign(CWD); llvm::sys::path::append(P, Filename); - Filenames.push_back(static_cast(P)); + sys::path::remove_dots(P, /*remove_dot_dot=*/true); + Filenames.push_back(static_cast(P.str())); } } } diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index ceb2d7dcb5b9..781a2901dbb9 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -49,12 +49,8 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS, bool Compress) { SmallString<128> CompressedStr; bool doCompression = Compress && zlib::isAvailable() && DoInstrProfNameCompression; - if (doCompression) { - auto E = - zlib::compress(FilenamesStr, CompressedStr, zlib::BestSizeCompression); - if (E) - report_bad_alloc_error("Failed to zlib compress coverage data"); - } + if (doCompression) + zlib::compress(FilenamesStr, CompressedStr, zlib::BestSizeCompression); // ::= // diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp index 72d1addab01e..feacf40b8d0a 100644 --- a/llvm/lib/ProfileData/GCOV.cpp +++ b/llvm/lib/ProfileData/GCOV.cpp @@ -13,6 +13,7 @@ #include "llvm/ProfileData/GCOV.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Config/llvm-config.h" #include "llvm/Demangle/Demangle.h" #include "llvm/Support/Debug.h" @@ -23,7 +24,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include using namespace llvm; @@ -663,6 +663,8 @@ void Context::collectFunction(GCOVFunction &f, Summary &summary) { if (f.startLine >= si.startLineToFunctions.size()) si.startLineToFunctions.resize(f.startLine + 1); si.startLineToFunctions[f.startLine].push_back(&f); + SmallSet lines; + SmallSet linesExec; for (const GCOVBlock &b : f.blocksRange()) { if (b.lines.empty()) continue; @@ -671,9 +673,9 @@ void Context::collectFunction(GCOVFunction &f, Summary &summary) { si.lines.resize(maxLineNum + 1); for (uint32_t lineNum : b.lines) { LineInfo &line = si.lines[lineNum]; - if (!line.exists) + if (lines.insert(lineNum).second) ++summary.lines; - if (line.count == 0 && b.count) + if (b.count && linesExec.insert(lineNum).second) ++summary.linesExec; line.exists = true; line.count += b.count; diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 07d467305ae5..48ac5ce0d607 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -466,12 +467,8 @@ Error collectPGOFuncNameStrings(ArrayRef NameStrs, } SmallString<128> CompressedNameStrings; - Error E = zlib::compress(StringRef(UncompressedNameStrings), - CompressedNameStrings, zlib::BestSizeCompression); - if (E) { - consumeError(std::move(E)); 
-    return make_error<InstrProfError>(instrprof_error::compress_failed);
-  }
+  zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
+                 zlib::BestSizeCompression);

   return WriteStringToResult(CompressedNameStrings.size(),
                              CompressedNameStrings);
@@ -1311,4 +1308,76 @@ void OverlapStats::dump(raw_fd_ostream &OS) const {
   }
 }

+namespace IndexedInstrProf {
+// A C++14 compatible version of the offsetof macro.
+template <class T1, class T2>
+inline size_t constexpr offsetOf(T1 T2::*Member) {
+  constexpr T2 Object{};
+  return size_t(&(Object.*Member)) - size_t(&Object);
+}
+
+static inline uint64_t read(const unsigned char *Buffer, size_t Offset) {
+  return *reinterpret_cast<const uint64_t *>(Buffer + Offset);
+}
+
+uint64_t Header::formatVersion() const {
+  using namespace support;
+  return endian::byte_swap<uint64_t, little>(Version);
+}
+
+Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
+  using namespace support;
+  static_assert(std::is_standard_layout<Header>
::value, + "The header should be standard layout type since we use offset " + "of fields to read."); + Header H; + + H.Magic = read(Buffer, offsetOf(&Header::Magic)); + // Check the magic number. + uint64_t Magic = endian::byte_swap(H.Magic); + if (Magic != IndexedInstrProf::Magic) + return make_error(instrprof_error::bad_magic); + + // Read the version. + H.Version = read(Buffer, offsetOf(&Header::Version)); + if (GET_VERSION(H.formatVersion()) > + IndexedInstrProf::ProfVersion::CurrentVersion) + return make_error(instrprof_error::unsupported_version); + + switch (GET_VERSION(H.formatVersion())) { + // When a new field is added in the header add a case statement here to + // populate it. + static_assert( + IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the reading code below if a new field has been added, " + "if not add a case statement to fall through to the latest version."); + case 8ull: + H.MemProfOffset = read(Buffer, offsetOf(&Header::MemProfOffset)); + LLVM_FALLTHROUGH; + default: // Version7 (when the backwards compatible header was introduced). + H.HashType = read(Buffer, offsetOf(&Header::HashType)); + H.HashOffset = read(Buffer, offsetOf(&Header::HashOffset)); + } + + return H; +} + +size_t Header::size() const { + switch (GET_VERSION(formatVersion())) { + // When a new field is added to the header add a case statement here to + // compute the size as offset of the new field + size of the new field. This + // relies on the field being added to the end of the list. + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the size computation below if a new field has " + "been added to the header, if not add a case statement to " + "fall through to the latest version."); + case 8ull: + return offsetOf(&Header::MemProfOffset) + sizeof(Header::MemProfOffset); + default: // Version7 (when the backwards compatible header was introduced). 
+ return offsetOf(&Header::HashOffset) + sizeof(Header::HashOffset); + } +} + +} // namespace IndexedInstrProf + } // end namespace llvm diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp index 8e38a6869d07..4b8212c546f7 100644 --- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -7,10 +7,15 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDie.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Object/MachO.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" #define DEBUG_TYPE "correlator" @@ -279,7 +284,7 @@ void DwarfInstrProfCorrelator::correlateProfileDataImpl() { LLVM_DEBUG(Die.dump(dbgs())); } this->addProbe(*FunctionName, *CFGHash, *CounterPtr - CountersStart, - FunctionPtr.getValueOr(0), *NumCounters); + FunctionPtr.value_or(0), *NumCounters); }; for (auto &CU : DICtx->normal_units()) for (const auto &Entry : CU->dies()) diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 138b1532d778..ee8989979a26 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -14,11 +14,11 @@ #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" @@ -27,7 +27,6 @@ #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/SymbolRemappingReader.h" #include -#include #include #include #include @@ -43,13 +42,13 @@ using namespace llvm; static InstrProfKind getProfileKindFromVersion(uint64_t Version) { InstrProfKind ProfileKind = InstrProfKind::Unknown; if (Version & VARIANT_MASK_IR_PROF) { - ProfileKind |= InstrProfKind::IR; + ProfileKind |= InstrProfKind::IRInstrumentation; } if (Version & VARIANT_MASK_CSIR_PROF) { - ProfileKind |= InstrProfKind::CS; + ProfileKind |= InstrProfKind::ContextSensitive; } if (Version & VARIANT_MASK_INSTR_ENTRY) { - ProfileKind |= InstrProfKind::BB; + ProfileKind |= InstrProfKind::FunctionEntryInstrumentation; } if (Version & VARIANT_MASK_BYTE_COVERAGE) { ProfileKind |= InstrProfKind::SingleByteCoverage; @@ -57,6 +56,9 @@ static InstrProfKind getProfileKindFromVersion(uint64_t Version) { if (Version & VARIANT_MASK_FUNCTION_ENTRY_ONLY) { ProfileKind |= InstrProfKind::FunctionEntryOnly; } + if (Version & VARIANT_MASK_MEMPROF) { + ProfileKind |= InstrProfKind::MemProf; + } return ProfileKind; } @@ -153,14 +155,6 @@ IndexedInstrProfReader::create(std::unique_ptr Buffer, return std::move(Result); } -void InstrProfIterator::Increment() { - if (auto E = Reader->readNextRecord(Record)) { - // Handle errors in the reader. 
- InstrProfError::take(std::move(E)); - *this = InstrProfIterator(); - } -} - bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) { // Verify that this really looks like plain ASCII text by checking a // 'reasonable' number of characters (up to profile magic size). @@ -180,16 +174,16 @@ Error TextInstrProfReader::readHeader() { while (Line->startswith(":")) { StringRef Str = Line->substr(1); if (Str.equals_insensitive("ir")) - ProfileKind |= InstrProfKind::IR; + ProfileKind |= InstrProfKind::IRInstrumentation; else if (Str.equals_insensitive("fe")) - ProfileKind |= InstrProfKind::FE; + ProfileKind |= InstrProfKind::FrontendInstrumentation; else if (Str.equals_insensitive("csir")) { - ProfileKind |= InstrProfKind::IR; - ProfileKind |= InstrProfKind::CS; + ProfileKind |= InstrProfKind::IRInstrumentation; + ProfileKind |= InstrProfKind::ContextSensitive; } else if (Str.equals_insensitive("entry_first")) - ProfileKind |= InstrProfKind::BB; + ProfileKind |= InstrProfKind::FunctionEntryInstrumentation; else if (Str.equals_insensitive("not_entry_first")) - ProfileKind &= ~InstrProfKind::BB; + ProfileKind &= ~InstrProfKind::FunctionEntryInstrumentation; else return error(instrprof_error::bad_header); ++Line; @@ -454,7 +448,7 @@ Error RawInstrProfReader::readHeader( return error(instrprof_error::bad_header); std::unique_ptr NewSymtab = std::make_unique(); - if (Error E = createSymtab(*NewSymtab.get())) + if (Error E = createSymtab(*NewSymtab)) return E; Symtab = std::move(NewSymtab); @@ -942,24 +936,17 @@ Error IndexedInstrProfReader::readHeader() { if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24) return error(instrprof_error::truncated); - auto *Header = reinterpret_cast(Cur); - Cur += sizeof(IndexedInstrProf::Header); + auto HeaderOr = IndexedInstrProf::Header::readFromBuffer(Start); + if (!HeaderOr) + return HeaderOr.takeError(); - // Check the magic number. - uint64_t Magic = endian::byte_swap(Header->Magic); - if (Magic != IndexedInstrProf::Magic) - return error(instrprof_error::bad_magic); - - // Read the version. - uint64_t FormatVersion = endian::byte_swap(Header->Version); - if (GET_VERSION(FormatVersion) > - IndexedInstrProf::ProfVersion::CurrentVersion) - return error(instrprof_error::unsupported_version); + const IndexedInstrProf::Header *Header = &HeaderOr.get(); + Cur += Header->size(); - Cur = readSummary((IndexedInstrProf::ProfVersion)FormatVersion, Cur, + Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur, /* UseCS */ false); - if (FormatVersion & VARIANT_MASK_CSIR_PROF) - Cur = readSummary((IndexedInstrProf::ProfVersion)FormatVersion, Cur, + if (Header->formatVersion() & VARIANT_MASK_CSIR_PROF) + Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur, /* UseCS */ true); // Read the hash type and start offset. @@ -970,10 +957,46 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = endian::byte_swap(Header->HashOffset); - // The rest of the file is an on disk hash table. - auto IndexPtr = - std::make_unique>( - Start + HashOffset, Cur, Start, HashType, FormatVersion); + // The hash table with profile counts comes next. + auto IndexPtr = std::make_unique>( + Start + HashOffset, Cur, Start, HashType, Header->formatVersion()); + + // The MemProfOffset field in the header is only valid when the format version + // is higher than 8 (when it was introduced). 
+ if (GET_VERSION(Header->formatVersion()) >= 8 && + Header->formatVersion() & VARIANT_MASK_MEMPROF) { + uint64_t MemProfOffset = + endian::byte_swap(Header->MemProfOffset); + + const unsigned char *Ptr = Start + MemProfOffset; + // The value returned from RecordTableGenerator.Emit. + const uint64_t RecordTableOffset = + support::endian::readNext(Ptr); + // The offset in the stream right before invoking FrameTableGenerator.Emit. + const uint64_t FramePayloadOffset = + support::endian::readNext(Ptr); + // The value returned from FrameTableGenerator.Emit. + const uint64_t FrameTableOffset = + support::endian::readNext(Ptr); + + // Read the schema. + auto SchemaOr = memprof::readMemProfSchema(Ptr); + if (!SchemaOr) + return SchemaOr.takeError(); + Schema = SchemaOr.get(); + + // Now initialize the table reader with a pointer into data buffer. + MemProfRecordTable.reset(MemProfRecordHashTable::Create( + /*Buckets=*/Start + RecordTableOffset, + /*Payload=*/Ptr, + /*Base=*/Start, memprof::RecordLookupTrait(Schema))); + + // Initialize the frame table reader with the payload and bucket offsets. + MemProfFrameTable.reset(MemProfFrameHashTable::Create( + /*Buckets=*/Start + FrameTableOffset, + /*Payload=*/Start + FramePayloadOffset, + /*Base=*/Start, memprof::FrameLookupTrait())); + } // Load the remapping table now if requested. if (RemappingBuffer) { @@ -991,16 +1014,16 @@ Error IndexedInstrProfReader::readHeader() { } InstrProfSymtab &IndexedInstrProfReader::getSymtab() { - if (Symtab.get()) - return *Symtab.get(); + if (Symtab) + return *Symtab; std::unique_ptr NewSymtab = std::make_unique(); - if (Error E = Index->populateSymtab(*NewSymtab.get())) { + if (Error E = Index->populateSymtab(*NewSymtab)) { consumeError(error(InstrProfError::take(std::move(E)))); } Symtab = std::move(NewSymtab); - return *Symtab.get(); + return *Symtab; } Expected @@ -1019,6 +1042,43 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, return error(instrprof_error::hash_mismatch); } +Expected +IndexedInstrProfReader::getMemProfRecord(const uint64_t FuncNameHash) { + // TODO: Add memprof specific errors. + if (MemProfRecordTable == nullptr) + return make_error(instrprof_error::invalid_prof, + "no memprof data available in profile"); + auto Iter = MemProfRecordTable->find(FuncNameHash); + if (Iter == MemProfRecordTable->end()) + return make_error( + instrprof_error::unknown_function, + "memprof record not found for function hash " + Twine(FuncNameHash)); + + // Setup a callback to convert from frame ids to frame using the on-disk + // FrameData hash table. + memprof::FrameId LastUnmappedFrameId = 0; + bool HasFrameMappingError = false; + auto IdToFrameCallback = [&](const memprof::FrameId Id) { + auto FrIter = MemProfFrameTable->find(Id); + if (FrIter == MemProfFrameTable->end()) { + LastUnmappedFrameId = Id; + HasFrameMappingError = true; + return memprof::Frame(0, 0, 0, false); + } + return *FrIter; + }; + + memprof::MemProfRecord Record(*Iter, IdToFrameCallback); + + // Check that all frame ids were successfully converted to frames. 
+ if (HasFrameMappingError) { + return make_error(instrprof_error::hash_mismatch, + "memprof frame not found for frame id " + + Twine(LastUnmappedFrameId)); + } + return Record; +} + Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 8ded1c0426e5..cd4e8900c963 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" @@ -23,7 +24,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/OnDiskHashTable.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include @@ -32,7 +32,6 @@ #include using namespace llvm; -extern cl::opt DebugInfoCorrelate; // A struct to define how the data stream should be patched. For Indexed // profiling, only uint64_t data type is needed. @@ -64,11 +63,16 @@ public: if (IsFDOStream) { raw_fd_ostream &FDOStream = static_cast(OS); + const uint64_t LastPos = FDOStream.tell(); for (int K = 0; K < NItems; K++) { FDOStream.seek(P[K].Pos); for (int I = 0; I < P[K].N; I++) write(P[K].D[I]); } + // Reset the stream to the last position after patching so that users + // don't accidentally overwrite data. This makes it consistent with + // the string stream below which replaces the data directly. + FDOStream.seek(LastPos); } else { raw_string_ostream &SOStream = static_cast(OS); std::string &Data = SOStream.str(); // with flush @@ -249,11 +253,51 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, Dest.sortValueData(); } +void InstrProfWriter::addMemProfRecord( + const Function::GUID Id, const memprof::IndexedMemProfRecord &Record) { + auto Result = MemProfRecordData.insert({Id, Record}); + // If we inserted a new record then we are done. + if (Result.second) { + return; + } + memprof::IndexedMemProfRecord &Existing = Result.first->second; + Existing.merge(Record); +} + +bool InstrProfWriter::addMemProfFrame(const memprof::FrameId Id, + const memprof::Frame &Frame, + function_ref Warn) { + auto Result = MemProfFrameData.insert({Id, Frame}); + // If a mapping already exists for the current frame id and it does not + // match the new mapping provided then reset the existing contents and bail + // out. We don't support the merging of memprof data whose Frame -> Id + // mapping across profiles is inconsistent. + if (!Result.second && Result.first->second != Frame) { + Warn(make_error(instrprof_error::malformed, + "frame to id mapping mismatch")); + return false; + } + return true; +} + void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn) { for (auto &I : IPW.FunctionData) for (auto &Func : I.getValue()) addRecord(I.getKey(), Func.first, std::move(Func.second), 1, Warn); + + MemProfFrameData.reserve(IPW.MemProfFrameData.size()); + for (auto &I : IPW.MemProfFrameData) { + // If we weren't able to add the frame mappings then it doesn't make sense + // to try to merge the records from this profile. 
+ if (!addMemProfFrame(I.first, I.second, Warn)) + return; + } + + MemProfRecordData.reserve(IPW.MemProfRecordData.size()); + for (auto &I : IPW.MemProfRecordData) { + addMemProfRecord(I.first, I.second); + } } bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) { @@ -298,30 +342,34 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) Generator.insert(I.getKey(), &I.getValue()); + // Write the header. IndexedInstrProf::Header Header; Header.Magic = IndexedInstrProf::Magic; Header.Version = IndexedInstrProf::ProfVersion::CurrentVersion; - if (static_cast(ProfileKind & InstrProfKind::IR)) + if (static_cast(ProfileKind & InstrProfKind::IRInstrumentation)) Header.Version |= VARIANT_MASK_IR_PROF; - if (static_cast(ProfileKind & InstrProfKind::CS)) + if (static_cast(ProfileKind & InstrProfKind::ContextSensitive)) Header.Version |= VARIANT_MASK_CSIR_PROF; - if (static_cast(ProfileKind & InstrProfKind::BB)) + if (static_cast(ProfileKind & + InstrProfKind::FunctionEntryInstrumentation)) Header.Version |= VARIANT_MASK_INSTR_ENTRY; if (static_cast(ProfileKind & InstrProfKind::SingleByteCoverage)) Header.Version |= VARIANT_MASK_BYTE_COVERAGE; if (static_cast(ProfileKind & InstrProfKind::FunctionEntryOnly)) Header.Version |= VARIANT_MASK_FUNCTION_ENTRY_ONLY; + if (static_cast(ProfileKind & InstrProfKind::MemProf)) + Header.Version |= VARIANT_MASK_MEMPROF; Header.Unused = 0; Header.HashType = static_cast(IndexedInstrProf::HashType); Header.HashOffset = 0; + Header.MemProfOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset'. We need - // to remember the offset of that field to allow back patching - // later. - for (int I = 0; I < N - 1; I++) + // Only write out all the fields except 'HashOffset' and 'MemProfOffset'. We + // need to remember the offset of these fields to allow back patching later. + for (int I = 0; I < N - 2; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -329,6 +377,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Reserve the space for HashOffset field. OS.write(0); + // Save the location of MemProf profile data. This is stored in two parts as + // the schema and as a separate on-disk chained hashtable. + uint64_t MemProfSectionOffset = OS.tell(); + // Reserve space for the MemProf table field to be patched later if this + // profile contains memory profile information. + OS.write(0); + // Reserve space to write profile summary data. uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -338,7 +393,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.write(0); uint64_t CSSummaryOffset = 0; uint64_t CSSummarySize = 0; - if (static_cast(ProfileKind & InstrProfKind::CS)) { + if (static_cast(ProfileKind & InstrProfKind::ContextSensitive)) { CSSummaryOffset = OS.tell(); CSSummarySize = SummarySize / sizeof(uint64_t); for (unsigned I = 0; I < CSSummarySize; I++) @@ -348,6 +403,63 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj); + // Write the MemProf profile data if we have it. 
+  // with the format described below followed by the hashtable:
+  // uint64_t RecordTableOffset = RecordTableGenerator.Emit
+  // uint64_t FramePayloadOffset = Stream offset before emitting the frame table
+  // uint64_t FrameTableOffset = FrameTableGenerator.Emit
+  // uint64_t Num schema entries
+  // uint64_t Schema entry 0
+  // uint64_t Schema entry 1
+  // ....
+  // uint64_t Schema entry N - 1
+  // OnDiskChainedHashTable MemProfRecordData
+  // OnDiskChainedHashTable MemProfFrameData
+  uint64_t MemProfSectionStart = 0;
+  if (static_cast(ProfileKind & InstrProfKind::MemProf)) {
+    MemProfSectionStart = OS.tell();
+    OS.write(0ULL); // Reserve space for the memprof record table offset.
+    OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+    OS.write(0ULL); // Reserve space for the memprof frame table offset.
+
+    auto Schema = memprof::PortableMemInfoBlock::getSchema();
+    OS.write(static_cast(Schema.size()));
+    for (const auto Id : Schema) {
+      OS.write(static_cast(Id));
+    }
+
+    auto RecordWriter = std::make_unique();
+    RecordWriter->Schema = &Schema;
+    OnDiskChainedHashTableGenerator
+        RecordTableGenerator;
+    for (auto &I : MemProfRecordData) {
+      // Insert the key (func hash) and value (memprof record).
+      RecordTableGenerator.insert(I.first, I.second);
+    }
+
+    uint64_t RecordTableOffset =
+        RecordTableGenerator.Emit(OS.OS, *RecordWriter);
+
+    uint64_t FramePayloadOffset = OS.tell();
+
+    auto FrameWriter = std::make_unique();
+    OnDiskChainedHashTableGenerator
+        FrameTableGenerator;
+    for (auto &I : MemProfFrameData) {
+      // Insert the key (frame id) and value (frame contents).
+      FrameTableGenerator.insert(I.first, I.second);
+    }
+
+    uint64_t FrameTableOffset = FrameTableGenerator.Emit(OS.OS, *FrameWriter);
+
+    PatchItem PatchItems[] = {
+        {MemProfSectionStart, &RecordTableOffset, 1},
+        {MemProfSectionStart + sizeof(uint64_t), &FramePayloadOffset, 1},
+        {MemProfSectionStart + 2 * sizeof(uint64_t), &FrameTableOffset, 1},
+    };
+    OS.patch(PatchItems, 3);
+  }
+
   // Allocate space for data to be serialized out.
   std::unique_ptr TheSummary =
       IndexedInstrProf::allocSummary(SummarySize);
@@ -359,7 +471,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   // For Context Sensitive summary.
   std::unique_ptr TheCSSummary = nullptr;
-  if (static_cast(ProfileKind & InstrProfKind::CS)) {
+  if (static_cast(ProfileKind & InstrProfKind::ContextSensitive)) {
     TheCSSummary = IndexedInstrProf::allocSummary(SummarySize);
     std::unique_ptr CSPS = CSISB.getSummary();
     setSummary(TheCSSummary.get(), *CSPS);
@@ -370,6 +482,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   PatchItem PatchItems[] = {
       // Patch the Header.HashOffset field.
       {HashTableStartFieldOffset, &HashTableStart, 1},
+      // Patch the Header.MemProfOffset (=0 for profiles without MemProf data).
+      {MemProfSectionOffset, &MemProfSectionStart, 1},
       // Patch the summary data.
       {SummaryOffset, reinterpret_cast(TheSummary.get()),
        (int)(SummarySize / sizeof(uint64_t))},
@@ -472,12 +586,13 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
 
 Error InstrProfWriter::writeText(raw_fd_ostream &OS) {
   // Check CS first since it implies an IR level profile.
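For background, the section layout above uses a common reserve-then-back-patch pattern: offsets that are only known after later payloads have been emitted get zero-filled slots up front, and those slots are patched in place afterwards. A minimal standalone sketch of that pattern over a plain byte buffer (illustrative names only, not LLVM's ProfOStream API):

#include <cstdint>
#include <vector>

static void writeU64(std::vector<uint8_t> &Buf, uint64_t V) {
  for (int I = 0; I < 8; I++)
    Buf.push_back(uint8_t(V >> (8 * I))); // little-endian
}

static void patchU64(std::vector<uint8_t> &Buf, size_t Pos, uint64_t V) {
  for (int I = 0; I < 8; I++)
    Buf[Pos + I] = uint8_t(V >> (8 * I));
}

std::vector<uint8_t> emitSection(const std::vector<uint8_t> &Records,
                                 const std::vector<uint8_t> &Frames) {
  std::vector<uint8_t> Buf;
  const size_t RecordOffSlot = Buf.size();
  writeU64(Buf, 0); // reserve: record table offset
  const size_t FrameOffSlot = Buf.size();
  writeU64(Buf, 0); // reserve: frame table offset
  Buf.insert(Buf.end(), Records.begin(), Records.end());
  const uint64_t FrameStart = Buf.size();
  Buf.insert(Buf.end(), Frames.begin(), Frames.end());
  patchU64(Buf, RecordOffSlot, 16);        // records begin right after header
  patchU64(Buf, FrameOffSlot, FrameStart); // frames begin here
  return Buf;
}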
-  if (static_cast(ProfileKind & InstrProfKind::CS))
+  if (static_cast(ProfileKind & InstrProfKind::ContextSensitive))
     OS << "# CSIR level Instrumentation Flag\n:csir\n";
-  else if (static_cast(ProfileKind & InstrProfKind::IR))
+  else if (static_cast(ProfileKind & InstrProfKind::IRInstrumentation))
     OS << "# IR level Instrumentation Flag\n:ir\n";
 
-  if (static_cast(ProfileKind & InstrProfKind::BB))
+  if (static_cast(ProfileKind &
+                        InstrProfKind::FunctionEntryInstrumentation))
     OS << "# Always instrument the function entry block\n:entry_first\n";
 
   InstrProfSymtab Symtab;
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
new file mode 100644
index 000000000000..3d44cf0b4c37
--- /dev/null
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -0,0 +1,110 @@
+#include "llvm/ProfileData/MemProf.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+namespace llvm {
+namespace memprof {
+
+void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
+                                     raw_ostream &OS) {
+  using namespace support;
+
+  endian::Writer LE(OS, little);
+
+  LE.write(AllocSites.size());
+  for (const IndexedAllocationInfo &N : AllocSites) {
+    LE.write(N.CallStack.size());
+    for (const FrameId &Id : N.CallStack)
+      LE.write(Id);
+    N.Info.serialize(Schema, OS);
+  }
+
+  // Related contexts.
+  LE.write(CallSites.size());
+  for (const auto &Frames : CallSites) {
+    LE.write(Frames.size());
+    for (const FrameId &Id : Frames)
+      LE.write(Id);
+  }
+}
+
+IndexedMemProfRecord
+IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
+                                  const unsigned char *Ptr) {
+  using namespace support;
+
+  IndexedMemProfRecord Record;
+
+  // Read the meminfo nodes.
+  const uint64_t NumNodes = endian::readNext(Ptr);
+  for (uint64_t I = 0; I < NumNodes; I++) {
+    IndexedAllocationInfo Node;
+    const uint64_t NumFrames =
+        endian::readNext(Ptr);
+    for (uint64_t J = 0; J < NumFrames; J++) {
+      const FrameId Id = endian::readNext(Ptr);
+      Node.CallStack.push_back(Id);
+    }
+    Node.Info.deserialize(Schema, Ptr);
+    Ptr += PortableMemInfoBlock::serializedSize();
+    Record.AllocSites.push_back(Node);
+  }
+
+  // Read the callsite information.
+  const uint64_t NumCtxs = endian::readNext(Ptr);
+  for (uint64_t J = 0; J < NumCtxs; J++) {
+    const uint64_t NumFrames =
+        endian::readNext(Ptr);
+    llvm::SmallVector Frames;
+    Frames.reserve(NumFrames);
+    for (uint64_t K = 0; K < NumFrames; K++) {
+      const FrameId Id = endian::readNext(Ptr);
+      Frames.push_back(Id);
+    }
+    Record.CallSites.push_back(Frames);
+  }
+
+  return Record;
+}
+
+GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
+  const auto Pos = FunctionName.find(".llvm.");
+
+  // We use the function guid which we expect to be a uint64_t. At
+  // this time, it is the lower 64 bits of the md5 of the function
+  // name. Any suffix with .llvm. is trimmed since these are added by
+  // thinLTO global promotion. At the time the profile is consumed,
+  // these suffixes will not be present.
+  return Function::getGUID(FunctionName.take_front(Pos));
+}
+
+Expected readMemProfSchema(const unsigned char *&Buffer) {
+  using namespace support;
+
+  const unsigned char *Ptr = Buffer;
+  const uint64_t NumSchemaIds =
+      endian::readNext(Ptr);
+  if (NumSchemaIds > static_cast(Meta::Size)) {
+    return make_error(instrprof_error::malformed,
+                      "memprof schema invalid");
+  }
+
+  MemProfSchema Result;
+  for (size_t I = 0; I < NumSchemaIds; I++) {
+    const uint64_t Tag = endian::readNext(Ptr);
+    if (Tag >= static_cast(Meta::Size)) {
+      return make_error(instrprof_error::malformed,
+                        "memprof schema invalid");
+    }
+    Result.push_back(static_cast(Tag));
+  }
+  // Advance the buffer to one past the schema if we succeeded.
+  Buffer = Ptr;
+  return Result;
+}
+
+} // namespace memprof
+} // namespace llvm
diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
index bbb640cfaee8..755e25b355a8 100644
--- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
+++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -10,20 +10,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ProfileData/ProfileCommon.h"
 #include "llvm/ProfileData/SampleProf.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
 
 cl::opt UseContextLessSummary(
-    "profile-summary-contextless", cl::Hidden, cl::init(false), cl::ZeroOrMore,
+    "profile-summary-contextless", cl::Hidden,
     cl::desc("Merge context profiles before calculating thresholds."));
 
 // The following two parameters determine the threshold for a count to be
@@ -34,38 +30,38 @@ cl::opt UseContextLessSummary(
 // threshold for determining cold count (everything <= this threshold is
 // considered cold).
 cl::opt ProfileSummaryCutoffHot(
-    "profile-summary-cutoff-hot", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
+    "profile-summary-cutoff-hot", cl::Hidden, cl::init(990000),
     cl::desc("A count is hot if it exceeds the minimum count to"
              " reach this percentile of total counts."));
 
 cl::opt ProfileSummaryCutoffCold(
-    "profile-summary-cutoff-cold", cl::Hidden, cl::init(999999), cl::ZeroOrMore,
+    "profile-summary-cutoff-cold", cl::Hidden, cl::init(999999),
     cl::desc("A count is cold if it is below the minimum count"
              " to reach this percentile of total counts."));
 
 cl::opt ProfileSummaryHugeWorkingSetSizeThreshold(
     "profile-summary-huge-working-set-size-threshold", cl::Hidden,
-    cl::init(15000), cl::ZeroOrMore,
+    cl::init(15000),
     cl::desc("The code working set size is considered huge if the number of"
              " blocks required to reach the -profile-summary-cutoff-hot"
             " percentile exceeds this count."));
 
 cl::opt ProfileSummaryLargeWorkingSetSizeThreshold(
     "profile-summary-large-working-set-size-threshold", cl::Hidden,
-    cl::init(12500), cl::ZeroOrMore,
+    cl::init(12500),
     cl::desc("The code working set size is considered large if the number of"
              " blocks required to reach the -profile-summary-cutoff-hot"
             " percentile exceeds this count."));
 
 // The next two options override the counts derived from summary computation and
 // are useful for debugging purposes.
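The schema reader above follows a read-count, bounds-check, read-tags, commit-cursor-only-on-success shape. A standalone sketch of the same idea, with invented tag names and std::optional standing in for llvm::Expected (bounds checks against the end of the buffer are omitted for brevity, as in the original):

#include <cstdint>
#include <cstring>
#include <optional>
#include <vector>

enum class Meta : uint64_t { AllocCount, TotalSize, Size /* sentinel */ };

// Returns the decoded schema; advances *Buffer past it only on success.
std::optional<std::vector<Meta>> readSchema(const unsigned char **Buffer) {
  const unsigned char *Ptr = *Buffer;
  uint64_t Count;
  std::memcpy(&Count, Ptr, sizeof(Count)); // assumes little-endian host
  Ptr += sizeof(Count);
  if (Count > static_cast<uint64_t>(Meta::Size))
    return std::nullopt; // malformed: more entries than known tags
  std::vector<Meta> Schema;
  for (uint64_t I = 0; I < Count; I++) {
    uint64_t Tag;
    std::memcpy(&Tag, Ptr, sizeof(Tag));
    Ptr += sizeof(Tag);
    if (Tag >= static_cast<uint64_t>(Meta::Size))
      return std::nullopt; // malformed: unknown tag
    Schema.push_back(static_cast<Meta>(Tag));
  }
  *Buffer = Ptr; // commit the cursor only after full validation
  return Schema;
}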
-cl::opt ProfileSummaryHotCount(
-    "profile-summary-hot-count", cl::ReallyHidden, cl::ZeroOrMore,
+cl::opt ProfileSummaryHotCount(
+    "profile-summary-hot-count", cl::ReallyHidden,
     cl::desc("A fixed hot count that overrides the count derived from"
              " profile-summary-cutoff-hot"));
 
-cl::opt ProfileSummaryColdCount(
-    "profile-summary-cold-count", cl::ReallyHidden, cl::ZeroOrMore,
+cl::opt ProfileSummaryColdCount(
+    "profile-summary-cold-count", cl::ReallyHidden,
     cl::desc("A fixed cold count that overrides the count derived from"
              " profile-summary-cutoff-cold"));
 
@@ -110,7 +106,13 @@ void SampleProfileSummaryBuilder::addRecord(
     NumFunctions++;
     if (FS.getHeadSamples() > MaxFunctionCount)
       MaxFunctionCount = FS.getHeadSamples();
+  } else if (FS.getContext().hasAttribute(
+                 sampleprof::ContextDuplicatedIntoBase)) {
+    // Do not recount callee samples if they are already merged into their base
+    // profiles. This can happen for CS nested profiles.
+    return;
   }
+
   for (const auto &I : FS.getBodySamples()) {
     uint64_t Count = I.second.getSamples();
     addCount(Count);
@@ -194,7 +196,7 @@ SampleProfileSummaryBuilder::computeSummaryForProfiles(
   // more function profiles each with lower counts, which in turn leads to lower
   // hot thresholds. To compensate for that, by default we merge context
   // profiles before computing profile summary.
-  if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCSFlat &&
+  if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS &&
                                 !UseContextLessSummary.getNumOccurrences())) {
     for (const auto &I : Profiles) {
       ContextLessProfiles[I.second.getName()].merge(I.second);
diff --git a/llvm/lib/ProfileData/RawMemProfReader.cpp b/llvm/lib/ProfileData/RawMemProfReader.cpp
index f8d13c74fac3..2423fd38e9a2 100644
--- a/llvm/lib/ProfileData/RawMemProfReader.cpp
+++ b/llvm/lib/ProfileData/RawMemProfReader.cpp
@@ -10,69 +10,55 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include
 #include
+#include
 #include
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/MemProfData.inc"
 #include "llvm/ProfileData/RawMemProfReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Path.h"
+
+#define DEBUG_TYPE "memprof"
 
 namespace llvm {
 namespace memprof {
 namespace {
-
-struct Summary {
-  uint64_t Version;
-  uint64_t TotalSizeBytes;
-  uint64_t NumSegments;
-  uint64_t NumMIBInfo;
-  uint64_t NumStackOffsets;
-};
-
 template inline T alignedRead(const char *Ptr) {
   static_assert(std::is_pod::value, "Not a pod type.");
   assert(reinterpret_cast(Ptr) % sizeof(T) == 0 && "Unaligned Read");
   return *reinterpret_cast(Ptr);
 }
 
-Summary computeSummary(const char *Start) {
-  auto *H = reinterpret_cast(Start);
-
-  // Check alignment while reading the number of items in each section.
-  return Summary{
-      H->Version,
-      H->TotalSize,
-      alignedRead(Start + H->SegmentOffset),
-      alignedRead(Start + H->MIBOffset),
-      alignedRead(Start + H->StackOffset),
-  };
-}
-
-} // namespace
-
-Expected>
-RawMemProfReader::create(const Twine &Path) {
-  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true);
-  if (std::error_code EC = BufferOr.getError())
-    return errorCodeToError(EC);
-
-  std::unique_ptr Buffer(BufferOr.get().release());
+Error checkBuffer(const MemoryBuffer &Buffer) {
+  if (!RawMemProfReader::hasFormat(Buffer))
+    return make_error(instrprof_error::bad_magic);
 
-  if (Buffer->getBufferSize() == 0)
+  if (Buffer.getBufferSize() == 0)
     return make_error(instrprof_error::empty_raw_profile);
 
-  if (!RawMemProfReader::hasFormat(*Buffer))
-    return make_error(instrprof_error::bad_magic);
-
-  if (Buffer->getBufferSize() < sizeof(Header)) {
+  if (Buffer.getBufferSize() < sizeof(Header)) {
     return make_error(instrprof_error::truncated);
   }
 
   // The size of the buffer can be > header total size since we allow repeated
   // serialization of memprof profiles to the same file.
   uint64_t TotalSize = 0;
-  const char *Next = Buffer->getBufferStart();
-  while (Next < Buffer->getBufferEnd()) {
+  const char *Next = Buffer.getBufferStart();
+  while (Next < Buffer.getBufferEnd()) {
     auto *H = reinterpret_cast(Next);
     if (H->Version != MEMPROF_RAW_VERSION) {
       return make_error(instrprof_error::unsupported_version);
@@ -82,11 +68,143 @@ RawMemProfReader::create(const Twine &Path) {
     Next += H->TotalSize;
   }
 
-  if (Buffer->getBufferSize() != TotalSize) {
+  if (Buffer.getBufferSize() != TotalSize) {
     return make_error(instrprof_error::malformed);
   }
+  return Error::success();
+}
+
+llvm::SmallVector readSegmentEntries(const char *Ptr) {
+  using namespace support;
+
+  const uint64_t NumItemsToRead =
+      endian::readNext(Ptr);
+  llvm::SmallVector Items;
+  for (uint64_t I = 0; I < NumItemsToRead; I++) {
+    Items.push_back(*reinterpret_cast(
+        Ptr + I * sizeof(SegmentEntry)));
+  }
+  return Items;
+}
+
+llvm::SmallVector>
+readMemInfoBlocks(const char *Ptr) {
+  using namespace support;
+
+  const uint64_t NumItemsToRead =
+      endian::readNext(Ptr);
+  llvm::SmallVector> Items;
+  for (uint64_t I = 0; I < NumItemsToRead; I++) {
+    const uint64_t Id = endian::readNext(Ptr);
+    const MemInfoBlock MIB = *reinterpret_cast(Ptr);
+    Items.push_back({Id, MIB});
+    // Only increment by size of MIB since readNext implicitly increments.
+    Ptr += sizeof(MemInfoBlock);
+  }
+  return Items;
+}
+
+CallStackMap readStackInfo(const char *Ptr) {
+  using namespace support;
+
+  const uint64_t NumItemsToRead =
+      endian::readNext(Ptr);
+  CallStackMap Items;
+
+  for (uint64_t I = 0; I < NumItemsToRead; I++) {
+    const uint64_t StackId = endian::readNext(Ptr);
+    const uint64_t NumPCs = endian::readNext(Ptr);
+
+    SmallVector CallStack;
+    for (uint64_t J = 0; J < NumPCs; J++) {
+      CallStack.push_back(endian::readNext(Ptr));
+    }
+
+    Items[StackId] = CallStack;
+  }
+  return Items;
+}
+
+// Merges the contents of stack information in \p From to \p To. Returns true if
+// any stack ids observed previously map to a different set of program counter
+// addresses.
+bool mergeStackMap(const CallStackMap &From, CallStackMap &To) {
+  for (const auto &IdStack : From) {
+    auto I = To.find(IdStack.first);
+    if (I == To.end()) {
+      To[IdStack.first] = IdStack.second;
+    } else {
+      // Check that the PCs are the same (in order).
+      if (IdStack.second != I->second)
+        return true;
+    }
+  }
+  return false;
+}
 
-  return std::make_unique(std::move(Buffer));
+Error report(Error E, const StringRef Context) {
+  return joinErrors(createStringError(inconvertibleErrorCode(), Context),
+                    std::move(E));
+}
+
+bool isRuntimePath(const StringRef Path) {
+  return StringRef(llvm::sys::path::convert_to_slash(Path))
+      .contains("memprof/memprof_");
+}
+
+std::string getBuildIdString(const SegmentEntry &Entry) {
+  constexpr size_t Size = sizeof(Entry.BuildId) / sizeof(uint8_t);
+  constexpr uint8_t Zeros[Size] = {0};
+  // If the build id is unset, print a helpful string instead of all zeros.
+  if (memcmp(Entry.BuildId, Zeros, Size) == 0)
+    return "";
+
+  std::string Str;
+  raw_string_ostream OS(Str);
+  for (size_t I = 0; I < Size; I++) {
+    OS << format_hex_no_prefix(Entry.BuildId[I], 2);
+  }
+  return OS.str();
+}
+} // namespace
+
+Expected>
+RawMemProfReader::create(const Twine &Path, const StringRef ProfiledBinary,
+                         bool KeepName) {
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  if (std::error_code EC = BufferOr.getError())
+    return report(errorCodeToError(EC), Path.getSingleStringRef());
+
+  std::unique_ptr Buffer(BufferOr.get().release());
+  if (Error E = checkBuffer(*Buffer))
+    return report(std::move(E), Path.getSingleStringRef());
+
+  if (ProfiledBinary.empty())
+    return report(
+        errorCodeToError(make_error_code(std::errc::invalid_argument)),
+        "Path to profiled binary is empty!");
+
+  auto BinaryOr = llvm::object::createBinary(ProfiledBinary);
+  if (!BinaryOr) {
+    return report(BinaryOr.takeError(), ProfiledBinary);
+  }
+
+  // Use new here since constructor is private.
+  std::unique_ptr Reader(
+      new RawMemProfReader(std::move(BinaryOr.get()), KeepName));
+  if (Error E = Reader->initialize(std::move(Buffer))) {
+    return std::move(E);
+  }
+  return std::move(Reader);
+}
+
+bool RawMemProfReader::hasFormat(const StringRef Path) {
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  if (!BufferOr)
+    return false;
+
+  std::unique_ptr Buffer(BufferOr.get().release());
+  return hasFormat(*Buffer);
 }
 
 bool RawMemProfReader::hasFormat(const MemoryBuffer &Buffer) {
@@ -98,24 +216,343 @@ bool RawMemProfReader::hasFormat(const MemoryBuffer &Buffer) {
   return Magic == MEMPROF_RAW_MAGIC_64;
 }
 
-void RawMemProfReader::printSummaries(raw_ostream &OS) const {
-  int Count = 0;
+void RawMemProfReader::printYAML(raw_ostream &OS) {
+  uint64_t NumAllocFunctions = 0, NumMibInfo = 0;
+  for (const auto &KV : FunctionProfileData) {
+    const size_t NumAllocSites = KV.second.AllocSites.size();
+    if (NumAllocSites > 0) {
+      NumAllocFunctions++;
+      NumMibInfo += NumAllocSites;
+    }
+  }
+
+  OS << "MemprofProfile:\n";
+  OS << "  Summary:\n";
+  OS << "    Version: " << MEMPROF_RAW_VERSION << "\n";
+  OS << "    NumSegments: " << SegmentInfo.size() << "\n";
+  OS << "    NumMibInfo: " << NumMibInfo << "\n";
+  OS << "    NumAllocFunctions: " << NumAllocFunctions << "\n";
+  OS << "    NumStackOffsets: " << StackMap.size() << "\n";
+  // Print out the segment information.
+  OS << "  Segments:\n";
+  for (const auto &Entry : SegmentInfo) {
+    OS << "  -\n";
+    OS << "    BuildId: " << getBuildIdString(Entry) << "\n";
+    OS << "    Start: 0x" << llvm::utohexstr(Entry.Start) << "\n";
+    OS << "    End: 0x" << llvm::utohexstr(Entry.End) << "\n";
+    OS << "    Offset: 0x" << llvm::utohexstr(Entry.Offset) << "\n";
+  }
+  // Print out the merged contents of the profiles.
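The mergeStackMap helper above only accepts profiles whose stack ids agree on their PC lists. A simplified standalone version of that consistency check, using std::map in place of LLVM's containers:

#include <cstdint>
#include <map>
#include <vector>

using CallStackMap = std::map<uint64_t, std::vector<uint64_t>>;

// Returns true (a conflict) if a stack id seen before maps to a different
// ordered list of PCs; otherwise copies the new entries into To.
bool mergeStackMap(const CallStackMap &From, CallStackMap &To) {
  for (const auto &KV : From) {
    auto It = To.find(KV.first);
    if (It == To.end())
      To[KV.first] = KV.second;
    else if (It->second != KV.second)
      return true;
  }
  return false;
}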
+  OS << "  Records:\n";
+  for (const auto &Entry : *this) {
+    OS << "  -\n";
+    OS << "    FunctionGUID: " << Entry.first << "\n";
+    Entry.second.print(OS);
+  }
+}
+
+Error RawMemProfReader::initialize(std::unique_ptr DataBuffer) {
+  const StringRef FileName = Binary.getBinary()->getFileName();
+
+  auto *ElfObject = dyn_cast(Binary.getBinary());
+  if (!ElfObject) {
+    return report(make_error(Twine("Not an ELF file: "),
+                             inconvertibleErrorCode()),
+                  FileName);
+  }
+
+  // Check whether the profiled binary was built with position independent
+  // code (PIC). For now we emit an error message until symbolization support
+  // is added for PIC.
+  auto *Elf64LEObject = llvm::cast(ElfObject);
+  const llvm::object::ELF64LEFile &ElfFile = Elf64LEObject->getELFFile();
+  auto PHdrsOr = ElfFile.program_headers();
+  if (!PHdrsOr)
+    return report(make_error(Twine("Could not read program headers: "),
+                             inconvertibleErrorCode()),
+                  FileName);
+  auto FirstLoadHeader = PHdrsOr->begin();
+  while (FirstLoadHeader->p_type != llvm::ELF::PT_LOAD)
+    ++FirstLoadHeader;
+  if (FirstLoadHeader->p_vaddr == 0)
+    return report(make_error(Twine("Unsupported position independent code"),
+                             inconvertibleErrorCode()),
+                  FileName);
+
+  auto Triple = ElfObject->makeTriple();
+  if (!Triple.isX86())
+    return report(make_error(Twine("Unsupported target: ") +
+                                 Triple.getArchName(),
+                             inconvertibleErrorCode()),
+                  FileName);
+
+  auto *Object = cast(Binary.getBinary());
+  std::unique_ptr Context = DWARFContext::create(
+      *Object, DWARFContext::ProcessDebugRelocations::Process);
+
+  auto SOFOr = symbolize::SymbolizableObjectFile::create(
+      Object, std::move(Context), /*UntagAddresses=*/false);
+  if (!SOFOr)
+    return report(SOFOr.takeError(), FileName);
+  Symbolizer = std::move(SOFOr.get());
+
+  if (Error E = readRawProfile(std::move(DataBuffer)))
+    return E;
+
+  if (Error E = symbolizeAndFilterStackFrames())
+    return E;
+
+  return mapRawProfileToRecords();
+}
+
+Error RawMemProfReader::mapRawProfileToRecords() {
+  // Hold a mapping from function to each callsite location we encounter within
+  // it that is part of some dynamic allocation context. The location is stored
+  // as a pointer to a symbolized list of inline frames.
+  using LocationPtr = const llvm::SmallVector *;
+  llvm::DenseMap>
+      PerFunctionCallSites;
+
+  // Convert the raw profile callstack data into memprof records. While doing
+  // so, keep track of related contexts so that we can fill these in later.
+  for (const auto &Entry : CallstackProfileData) {
+    const uint64_t StackId = Entry.first;
+
+    auto It = StackMap.find(StackId);
+    if (It == StackMap.end())
+      return make_error(
+          instrprof_error::malformed,
+          "memprof callstack record does not contain id: " + Twine(StackId));
+
+    // Construct the symbolized callstack.
+    llvm::SmallVector Callstack;
+    Callstack.reserve(It->getSecond().size());
+
+    llvm::ArrayRef Addresses = It->getSecond();
+    for (size_t I = 0; I < Addresses.size(); I++) {
+      const uint64_t Address = Addresses[I];
+      assert(SymbolizedFrame.count(Address) > 0 &&
+             "Address not found in SymbolizedFrame map");
+      const SmallVector &Frames = SymbolizedFrame[Address];
+
+      assert(!idToFrame(Frames.back()).IsInlineFrame &&
+             "The last frame should not be inlined");
+
+      // Record the callsites for each function. Skip the first frame of the
+      // first address since it is the allocation site itself that is recorded
+      // as an alloc site.
+      for (size_t J = 0; J < Frames.size(); J++) {
+        if (I == 0 && J == 0)
+          continue;
+        // We attach the entire bottom-up frame here for the callsite even
+        // though we only need the frames up to and including the frame for
+        // Frames[J].Function. This will enable better deduplication for
+        // compression in the future.
+        const GlobalValue::GUID Guid = idToFrame(Frames[J]).Function;
+        PerFunctionCallSites[Guid].insert(&Frames);
+      }
+
+      // Add all the frames to the current allocation callstack.
+      Callstack.append(Frames.begin(), Frames.end());
+    }
+
+    // We attach the memprof record to each function bottom-up including the
+    // first non-inline frame.
+    for (size_t I = 0; /*Break out using the condition below*/; I++) {
+      const Frame &F = idToFrame(Callstack[I]);
+      auto Result =
+          FunctionProfileData.insert({F.Function, IndexedMemProfRecord()});
+      IndexedMemProfRecord &Record = Result.first->second;
+      Record.AllocSites.emplace_back(Callstack, Entry.second);
+
+      if (!F.IsInlineFrame)
+        break;
+    }
+  }
+
+  // Fill in the related callsites per function.
+  for (auto I = PerFunctionCallSites.begin(), E = PerFunctionCallSites.end();
+       I != E; I++) {
+    const GlobalValue::GUID Id = I->first;
+    // Some functions may have only callsite data and no allocation data. Here
+    // we insert a new entry for callsite data if we need to.
+    auto Result = FunctionProfileData.insert({Id, IndexedMemProfRecord()});
+    IndexedMemProfRecord &Record = Result.first->second;
+    for (LocationPtr Loc : I->getSecond()) {
+      Record.CallSites.push_back(*Loc);
+    }
+  }
+
+  return Error::success();
+}
+
+Error RawMemProfReader::symbolizeAndFilterStackFrames() {
+  // The specifier to use when symbolization is requested.
+  const DILineInfoSpecifier Specifier(
+      DILineInfoSpecifier::FileLineInfoKind::RawValue,
+      DILineInfoSpecifier::FunctionNameKind::LinkageName);
+
+  // For entries where all PCs in the callstack are discarded, we erase the
+  // entry from the stack map.
+  llvm::SmallVector EntriesToErase;
+  // We keep track of all prior discarded entries so that we can avoid invoking
+  // the symbolizer for such entries.
+  llvm::DenseSet AllVAddrsToDiscard;
+  for (auto &Entry : StackMap) {
+    for (const uint64_t VAddr : Entry.getSecond()) {
+      // Check if we have already symbolized and cached the result or if we
+      // don't want to attempt symbolization since we know this address is bad.
+      // In this case the address is also removed from the current callstack.
+      if (SymbolizedFrame.count(VAddr) > 0 ||
+          AllVAddrsToDiscard.contains(VAddr))
+        continue;
+
+      Expected DIOr = Symbolizer->symbolizeInlinedCode(
+          getModuleOffset(VAddr), Specifier, /*UseSymbolTable=*/false);
+      if (!DIOr)
+        return DIOr.takeError();
+      DIInliningInfo DI = DIOr.get();
+
+      // Drop frames which we can't symbolize or if they belong to the runtime.
+      if (DI.getFrame(0).FunctionName == DILineInfo::BadString ||
+          isRuntimePath(DI.getFrame(0).FileName)) {
+        AllVAddrsToDiscard.insert(VAddr);
+        continue;
+      }
+
+      for (size_t I = 0, NumFrames = DI.getNumberOfFrames(); I < NumFrames;
+           I++) {
+        const auto &DIFrame = DI.getFrame(I);
+        const uint64_t Guid =
+            IndexedMemProfRecord::getGUID(DIFrame.FunctionName);
+        const Frame F(Guid, DIFrame.Line - DIFrame.StartLine, DIFrame.Column,
+                      // Only the last entry is not an inlined location.
+                      I != NumFrames - 1);
+        // Here we retain a mapping from the GUID to symbol name instead of
+        // adding it to the frame object directly to reduce memory overhead.
+        // This is because there can be many unique frames, particularly for
+        // callsite frames.
+        if (KeepSymbolName)
+          GuidToSymbolName.insert({Guid, DIFrame.FunctionName});
+
+        const FrameId Hash = F.hash();
+        IdToFrame.insert({Hash, F});
+        SymbolizedFrame[VAddr].push_back(Hash);
+      }
+    }
+
+    auto &CallStack = Entry.getSecond();
+    llvm::erase_if(CallStack, [&AllVAddrsToDiscard](const uint64_t A) {
+      return AllVAddrsToDiscard.contains(A);
+    });
+    if (CallStack.empty())
+      EntriesToErase.push_back(Entry.getFirst());
+  }
+
+  // Drop the entries where the callstack is empty.
+  for (const uint64_t Id : EntriesToErase) {
+    StackMap.erase(Id);
+    CallstackProfileData.erase(Id);
+  }
+
+  if (StackMap.empty())
+    return make_error(
+        instrprof_error::malformed,
+        "no entries in callstack map after symbolization");
+
+  return Error::success();
+}
+
+Error RawMemProfReader::readRawProfile(
+    std::unique_ptr DataBuffer) {
   const char *Next = DataBuffer->getBufferStart();
+
+  while (Next < DataBuffer->getBufferEnd()) {
-    auto Summary = computeSummary(Next);
-    OS << "MemProf Profile " << ++Count << "\n";
-    OS << "  Version: " << Summary.Version << "\n";
-    OS << "  TotalSizeBytes: " << Summary.TotalSizeBytes << "\n";
-    OS << "  NumSegments: " << Summary.NumSegments << "\n";
-    OS << "  NumMIBInfo: " << Summary.NumMIBInfo << "\n";
-    OS << "  NumStackOffsets: " << Summary.NumStackOffsets << "\n";
-    // TODO: Print the build ids once we can record them using the
-    // sanitizer_procmaps library for linux.
+    auto *Header = reinterpret_cast(Next);
 
-    auto *H = reinterpret_cast(Next);
-    Next += H->TotalSize;
+    // Read in the segment information, check whether it's the same across all
+    // profiles in this binary file.
+    const llvm::SmallVector Entries =
+        readSegmentEntries(Next + Header->SegmentOffset);
+    if (!SegmentInfo.empty() && SegmentInfo != Entries) {
+      // We do not expect segment information to change when deserializing from
+      // the same binary profile file. This can happen if dynamic libraries are
+      // loaded/unloaded between profile dumping.
+      return make_error(
+          instrprof_error::malformed,
+          "memprof raw profile has different segment information");
+    }
+    SegmentInfo.assign(Entries.begin(), Entries.end());
+
+    // Read in the MemInfoBlocks. Merge them based on stack id - we assume that
+    // raw profiles in the same binary file are from the same process so the
+    // stackdepot ids are the same.
+    for (const auto &Value : readMemInfoBlocks(Next + Header->MIBOffset)) {
+      if (CallstackProfileData.count(Value.first)) {
+        CallstackProfileData[Value.first].Merge(Value.second);
+      } else {
+        CallstackProfileData[Value.first] = Value.second;
+      }
+    }
+
+    // Read in the callstack for each id. For multiple raw profiles in the same
+    // file, we expect that the callstack is the same for a unique id.
+    const CallStackMap CSM = readStackInfo(Next + Header->StackOffset);
+    if (StackMap.empty()) {
+      StackMap = CSM;
+    } else {
+      if (mergeStackMap(CSM, StackMap))
+        return make_error(
+            instrprof_error::malformed,
+            "memprof raw profile got different call stack for same id");
+    }
+
+    Next += Header->TotalSize;
+  }
+
+  return Error::success();
+}
+
+object::SectionedAddress
+RawMemProfReader::getModuleOffset(const uint64_t VirtualAddress) {
+  LLVM_DEBUG({
+    SegmentEntry *ContainingSegment = nullptr;
+    for (auto &SE : SegmentInfo) {
+      if (VirtualAddress > SE.Start && VirtualAddress <= SE.End) {
+        ContainingSegment = &SE;
+      }
+    }
+
+    // Ensure that the virtual address is valid.
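readRawProfile above walks a file that may hold several concatenated raw profiles, advancing by each header's TotalSize; checkBuffer earlier applies the same walk to verify that the sizes exactly tile the buffer. A standalone sketch of that walk (the header struct is a simplification of the real layout in MemProfData.inc, and version checks are omitted):

#include <cstdint>
#include <cstring>

struct RawHeader {
  uint64_t Magic, Version, TotalSize, SegmentOffset, MIBOffset, StackOffset;
};

// Walk headers chained by TotalSize and check they exactly cover the buffer.
bool coversWholeBuffer(const char *Begin, const char *End) {
  uint64_t Total = 0;
  const char *Next = Begin;
  while (Next < End) {
    RawHeader H;
    std::memcpy(&H, Next, sizeof(H));
    if (H.TotalSize == 0)
      return false; // guard against looping forever on a corrupt header
    Total += H.TotalSize;
    Next += H.TotalSize;
  }
  return Total == static_cast<uint64_t>(End - Begin);
}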
+    assert(ContainingSegment && "Could not find a segment entry");
+  });
+
+  // TODO: Compute the file offset based on the maps and program headers. For
+  // now this only works for non PIE binaries.
+  return object::SectionedAddress{VirtualAddress};
 }
 
+Error RawMemProfReader::readNextRecord(GuidMemProfRecordPair &GuidRecord) {
+  if (FunctionProfileData.empty())
+    return make_error(instrprof_error::empty_raw_profile);
+
+  if (Iter == FunctionProfileData.end())
+    return make_error(instrprof_error::eof);
+
+  auto IdToFrameCallback = [this](const FrameId Id) {
+    Frame F = this->idToFrame(Id);
+    if (!this->KeepSymbolName)
+      return F;
+    auto Iter = this->GuidToSymbolName.find(F.Function);
+    assert(Iter != this->GuidToSymbolName.end());
+    F.SymbolName = Iter->getSecond();
+    return F;
+  };
+
+  const IndexedMemProfRecord &IndexedRecord = Iter->second;
+  GuidRecord = {Iter->first, MemProfRecord(IndexedRecord, IdToFrameCallback)};
+  Iter++;
+  return Error::success();
+}
 } // namespace memprof
 } // namespace llvm
diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp
index 9b01a386a360..f794e64a13e7 100644
--- a/llvm/lib/ProfileData/SampleProf.cpp
+++ b/llvm/lib/ProfileData/SampleProf.cpp
@@ -19,9 +19,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/raw_ostream.h"
 #include
@@ -31,22 +29,21 @@ using namespace llvm;
 using namespace sampleprof;
 
 static cl::opt ProfileSymbolListCutOff(
-    "profile-symbol-list-cutoff", cl::Hidden, cl::init(-1), cl::ZeroOrMore,
+    "profile-symbol-list-cutoff", cl::Hidden, cl::init(-1),
     cl::desc("Cutoff value about how many symbols in profile symbol list "
             "will be used. This is very useful for performance debugging"));
 
 cl::opt GenerateMergedBaseProfiles(
-    "generate-merged-base-profiles", cl::init(true), cl::ZeroOrMore,
+    "generate-merged-base-profiles",
     cl::desc("When generating nested context-sensitive profiles, always "
              "generate extra base profile for function with all its context "
             "profiles merged into it."));
 
 namespace llvm {
 namespace sampleprof {
-SampleProfileFormat FunctionSamples::Format;
 bool FunctionSamples::ProfileIsProbeBased = false;
-bool FunctionSamples::ProfileIsCSFlat = false;
-bool FunctionSamples::ProfileIsCSNested = false;
+bool FunctionSamples::ProfileIsCS = false;
+bool FunctionSamples::ProfileIsPreInlined = false;
 bool FunctionSamples::UseMD5 = false;
 bool FunctionSamples::HasUniqSuffix = true;
 bool FunctionSamples::ProfileIsFS = false;
@@ -88,8 +85,6 @@ class SampleProfErrorCategoryType : public std::error_category {
       return "Counter overflow";
     case sampleprof_error::ostream_seek_unsupported:
       return "Ostream does not support seek";
-    case sampleprof_error::compress_failed:
-      return "Compress failure";
     case sampleprof_error::uncompress_failed:
       return "Uncompress failure";
     case sampleprof_error::zlib_unavailable:
@@ -523,6 +518,12 @@ void CSProfileConverter::convertProfiles(CSProfileConverter::FrameNode &Node) {
     auto &SamplesMap = NodeProfile->functionSamplesAt(ChildNode.CallSiteLoc);
     SamplesMap.emplace(OrigChildContext.getName().str(), *ChildProfile);
     NodeProfile->addTotalSamples(ChildProfile->getTotalSamples());
+    // Remove the corresponding body sample for the callsite and update the
+    // total weight.
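readNextRecord above materializes full records lazily: stored records hold frame ids, and a callback translates each id to a full frame (optionally attaching the symbol name). A standalone sketch of that translation shape, with simplified types rather than the reader's real interface:

#include <cstdint>
#include <functional>
#include <vector>

struct Frame {
  uint64_t Function; // GUID of the enclosing function
  bool IsInline;     // whether this frame was inlined into its caller
};

// Translate a stored callstack of frame ids into full frames via a callback,
// the same shape as the reader's IdToFrameCallback.
std::vector<Frame>
materialize(const std::vector<uint64_t> &Ids,
            const std::function<Frame(uint64_t)> &IdToFrame) {
  std::vector<Frame> Out;
  Out.reserve(Ids.size());
  for (uint64_t Id : Ids)
    Out.push_back(IdToFrame(Id));
  return Out;
}

// Usage with a map IdToFrameMap from frame id to Frame:
//   materialize(CallStack, [&](uint64_t Id) { return IdToFrameMap.at(Id); });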
+    auto Count = NodeProfile->removeCalledTargetAndBodySample(
+        ChildNode.CallSiteLoc.LineOffset, ChildNode.CallSiteLoc.Discriminator,
+        OrigChildContext.getName());
+    NodeProfile->removeTotalSamples(Count);
   }
 
   // Separate child profile to be a standalone profile, if the current parent
   // profile doesn't exist. This is a preparation for removing the original
   // child profile, thus done optionally. It is seen that duplicating context
   // profiles into base profiles improves the code quality for thinlto build by
   // allowing a profile in the prelink phase for to-be-fully-inlined functions.
-  if (!NodeProfile || GenerateMergedBaseProfiles)
+  if (!NodeProfile) {
     ProfileMap[ChildProfile->getContext()].merge(*ChildProfile);
-
-  // Contexts coming with a `ContextShouldBeInlined` attribute indicate this
-  // is a preinliner-computed profile.
-  if (OrigChildContext.hasAttribute(ContextShouldBeInlined))
-    FunctionSamples::ProfileIsCSNested = true;
+  } else if (GenerateMergedBaseProfiles) {
+    ProfileMap[ChildProfile->getContext()].merge(*ChildProfile);
+    auto &SamplesMap = NodeProfile->functionSamplesAt(ChildNode.CallSiteLoc);
+    SamplesMap[ChildProfile->getName().str()].getContext().setAttribute(
+        ContextDuplicatedIntoBase);
+  }
 
   // Remove the original child profile.
   ProfileMap.erase(OrigChildContext);
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 80c02faaba04..280e3c6cb8d1 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/ProfileCommon.h"
 #include "llvm/ProfileData/SampleProf.h"
@@ -39,7 +40,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -348,7 +348,7 @@ std::error_code SampleProfileReaderText::readImpl() {
         }
         FProfile.getContext().setAllAttributes(Attributes);
         if (Attributes & (uint32_t)ContextShouldBeInlined)
-          ProfileIsCSNested = true;
+          ProfileIsPreInlined = true;
         DepthMetadata = Depth;
         break;
       }
@@ -358,14 +358,14 @@ std::error_code SampleProfileReaderText::readImpl() {
 
   assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
          "Cannot have both context-sensitive and regular profile");
-  ProfileIsCSFlat = (CSProfileCount > 0);
+  ProfileIsCS = (CSProfileCount > 0);
   assert((TopLevelProbeProfileCount == 0 ||
           TopLevelProbeProfileCount == Profiles.size()) &&
         "Cannot have both probe-based profiles and regular profiles");
   ProfileIsProbeBased = (TopLevelProbeProfileCount > 0);
   FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased;
-  FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat;
-  FunctionSamples::ProfileIsCSNested = ProfileIsCSNested;
+  FunctionSamples::ProfileIsCS = ProfileIsCS;
+  FunctionSamples::ProfileIsPreInlined = ProfileIsPreInlined;
 
   if (Result == sampleprof_error::success)
     computeSummary();
@@ -630,7 +630,7 @@ SampleProfileReaderExtBinaryBase::readContextFromTable() {
 
 ErrorOr
 SampleProfileReaderExtBinaryBase::readSampleContextFromTable() {
-  if (ProfileIsCSFlat) {
+  if (ProfileIsCS) {
     auto FContext(readContextFromTable());
     if (std::error_code EC = FContext.getError())
       return EC;
@@ -654,9 +654,9 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
       if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial))
         Summary->setPartialProfile(true);
       if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
-        FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat = true;
-      if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested))
-        FunctionSamples::ProfileIsCSNested = ProfileIsCSNested;
+        FunctionSamples::ProfileIsCS = ProfileIsCS = true;
+      if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsPreInlined))
+        FunctionSamples::ProfileIsPreInlined = ProfileIsPreInlined = true;
       if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
         FunctionSamples::ProfileIsFS = ProfileIsFS = true;
       break;
@@ -777,7 +777,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
     }
   }
 
-  if (ProfileIsCSFlat) {
+  if (ProfileIsCS) {
     DenseSet FuncGuidsToUse;
     if (useMD5()) {
       for (auto Name : FuncsToUse)
@@ -847,7 +847,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
   }
   assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
          "Cannot have both context-sensitive and regular profile");
-  assert((!CSProfileCount || ProfileIsCSFlat) &&
+  assert((!CSProfileCount || ProfileIsCS) &&
          "Section flag should be consistent with actual profile");
   return sampleprof_error::success;
 }
@@ -1105,7 +1105,7 @@ SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute,
     FProfile->getContext().setAllAttributes(*Attributes);
   }
 
-  if (!ProfileIsCSFlat) {
+  if (!ProfileIsCS) {
     // Read all the attributes for inlined function calls.
     auto NumCallsites = readNumber();
     if (std::error_code EC = NumCallsites.getError())
@@ -1275,8 +1275,8 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
       Flags.append("partial,");
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
       Flags.append("context,");
-    if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested))
-      Flags.append("context-nested,");
+    if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsPreInlined))
+      Flags.append("preInlined,");
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
       Flags.append("fs-discriminator,");
     break;
@@ -1828,7 +1828,7 @@ SampleProfileReaderItaniumRemapper::create(std::unique_ptr &B,
                                            SampleProfileReader &Reader,
                                            LLVMContext &C) {
   auto Remappings = std::make_unique();
-  if (Error E = Remappings->read(*B.get())) {
+  if (Error E = Remappings->read(*B)) {
     handleAllErrors(
         std::move(E), [&](const SymbolRemappingParseError &ParseError) {
           C.diagnose(DiagnosticInfoSampleProfile(B->getBufferIdentifier(),
@@ -1882,7 +1882,6 @@ SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C,
     Reader->Remapper = std::move(ReaderOrErr.get());
   }
 
-  FunctionSamples::Format = Reader->getFormat();
   if (std::error_code EC = Reader->readHeader()) {
     return EC;
   }
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index b575425d4e94..8ec6b7ebc29e 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ProfileData/SampleProfWriter.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/ProfileData/ProfileCommon.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Compression.h"
@@ -87,10 +86,8 @@ std::error_code SampleProfileWriterExtBinaryBase::compressAndOutput() {
     return sampleprof_error::success;
   auto &OS = *OutputStream;
   SmallString<128> CompressedStrings;
-  llvm::Error E = zlib::compress(UncompressedStrings, CompressedStrings,
-                                 zlib::BestSizeCompression);
-  if (E)
-    return sampleprof_error::compress_failed;
+  zlib::compress(UncompressedStrings, CompressedStrings,
+                 zlib::BestSizeCompression);
   encodeULEB128(UncompressedStrings.size(), OS);
   encodeULEB128(CompressedStrings.size(), OS);
   OS << CompressedStrings.str();
@@ -172,7 +169,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() {
     return (std::error_code)sampleprof_error::success;
   };
 
-  if (FunctionSamples::ProfileIsCSFlat) {
+  if (FunctionSamples::ProfileIsCS) {
     // Sort the contexts before writing them out. This is to help fast load all
     // context profiles for a function as well as their callee contexts which
     // can help profile-guided importing for ThinLTO.
@@ -202,11 +199,11 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata(
   if (FunctionSamples::ProfileIsProbeBased)
     encodeULEB128(FunctionProfile.getFunctionHash(), OS);
-  if (FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested) {
+  if (FunctionSamples::ProfileIsCS || FunctionSamples::ProfileIsPreInlined) {
     encodeULEB128(FunctionProfile.getContext().getAllAttributes(), OS);
   }
 
-  if (!FunctionSamples::ProfileIsCSFlat) {
+  if (!FunctionSamples::ProfileIsCS) {
     // Recursively emit attributes for all callee samples.
     uint64_t NumCallsites = 0;
     for (const auto &J : FunctionProfile.getCallsiteSamples())
@@ -228,8 +225,8 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata(
 std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata(
     const SampleProfileMap &Profiles) {
-  if (!FunctionSamples::ProfileIsProbeBased &&
-      !FunctionSamples::ProfileIsCSFlat && !FunctionSamples::ProfileIsCSNested)
+  if (!FunctionSamples::ProfileIsProbeBased && !FunctionSamples::ProfileIsCS &&
+      !FunctionSamples::ProfileIsPreInlined)
     return sampleprof_error::success;
   for (const auto &Entry : Profiles) {
     if (std::error_code EC = writeFuncMetadata(Entry.second))
@@ -324,12 +321,12 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
   if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased)
     addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased);
   if (Type == SecFuncMetadata &&
-      (FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested))
+      (FunctionSamples::ProfileIsCS || FunctionSamples::ProfileIsPreInlined))
     addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute);
-  if (Type == SecProfSummary && FunctionSamples::ProfileIsCSFlat)
+  if (Type == SecProfSummary && FunctionSamples::ProfileIsCS)
     addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext);
-  if (Type == SecProfSummary && FunctionSamples::ProfileIsCSNested)
-    addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsCSNested);
+  if (Type == SecProfSummary && FunctionSamples::ProfileIsPreInlined)
+    addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsPreInlined);
   if (Type == SecProfSummary && FunctionSamples::ProfileIsFS)
     addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator);
 
@@ -471,7 +468,7 @@ SampleProfileWriterCompactBinary::write(const SampleProfileMap &ProfileMap) {
 /// it needs to be parsed by the SampleProfileReaderText class.
 std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
   auto &OS = *OutputStream;
-  if (FunctionSamples::ProfileIsCSFlat)
+  if (FunctionSamples::ProfileIsCS)
     OS << "[" << S.getContext().toString() << "]:" << S.getTotalSamples();
   else
     OS << S.getName() << ":" << S.getTotalSamples();
@@ -871,8 +868,7 @@ SampleProfileWriter::create(std::unique_ptr &OS,
   std::unique_ptr Writer;
 
   // Currently only Text and Extended Binary format are supported for CSSPGO.
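compressAndOutput above frames the compressed string table as two ULEB128-encoded sizes (uncompressed, then compressed) followed by the raw bytes. A standalone sketch of that framing with a self-contained ULEB128 encoder (LLVM's own encoder lives in llvm/Support/LEB128.h; names here are illustrative):

#include <cstdint>
#include <vector>

// Append V as ULEB128: 7 payload bits per byte, high bit set on all but the
// last byte.
void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
  } while (V != 0);
}

// Frame a compressed blob:
// [uleb(UncompressedSize)][uleb(Compressed.size())][bytes].
std::vector<uint8_t> frameBlob(uint64_t UncompressedSize,
                               const std::vector<uint8_t> &Compressed) {
  std::vector<uint8_t> Out;
  encodeULEB128(UncompressedSize, Out);
  encodeULEB128(Compressed.size(), Out);
  Out.insert(Out.end(), Compressed.begin(), Compressed.end());
  return Out;
}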
-  if ((FunctionSamples::ProfileIsCSFlat ||
-       FunctionSamples::ProfileIsProbeBased) &&
+  if ((FunctionSamples::ProfileIsCS || FunctionSamples::ProfileIsProbeBased) &&
       (Format == SPF_Binary || Format == SPF_Compact_Binary))
     return sampleprof_error::unsupported_writing_format;
 
diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
index 0810bf531db8..5a77a25b1569 100644
--- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
+++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Remarks/BitstreamRemarkSerializer.h"
+#include "llvm/Remarks/Remark.h"
 
 using namespace llvm;
 using namespace llvm::remarks;
diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp
index 62f80918ea1d..cbe966794c49 100644
--- a/llvm/lib/Remarks/RemarkLinker.cpp
+++ b/llvm/lib/Remarks/RemarkLinker.cpp
@@ -17,11 +17,14 @@
 #include "llvm/Remarks/RemarkParser.h"
 #include "llvm/Remarks/RemarkSerializer.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 using namespace llvm::remarks;
 
+namespace llvm {
+class raw_ostream;
+}
+
 static Expected
 getRemarksSectionName(const object::ObjectFile &Obj) {
   if (Obj.isMachO())
@@ -63,7 +66,7 @@ void RemarkLinker::setExternalFilePrependPath(StringRef PrependPathIn) {
 }
 
 // Discard remarks with no source location.
-static bool shouldKeepRemark(const Remark &R) { return R.Loc.hasValue(); }
+static bool shouldKeepRemark(const Remark &R) { return R.Loc.has_value(); }
 
 Error RemarkLinker::link(StringRef Buffer, Optional RemarkFormat) {
   if (!RemarkFormat) {
diff --git a/llvm/lib/Remarks/RemarkParser.cpp b/llvm/lib/Remarks/RemarkParser.cpp
index f36767efcbf4..fc0612fb76e2 100644
--- a/llvm/lib/Remarks/RemarkParser.cpp
+++ b/llvm/lib/Remarks/RemarkParser.cpp
@@ -118,7 +118,7 @@ struct CParser {
             : createRemarkParser(ParserFormat, Buf))) {}
 
   void handleError(Error E) { Err.emplace(toString(std::move(E))); }
-  bool hasError() const { return Err.hasValue(); }
+  bool hasError() const { return Err.has_value(); }
   const char *getMessage() const { return Err ? Err->c_str() : nullptr; };
 };
 } // namespace
diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
index 9e965aa4f6c4..fff2b655e821 100644
--- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
+++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Remarks/YAMLRemarkSerializer.h"
+#include "llvm/Remarks/Remark.h"
 #include "llvm/Support/FileSystem.h"
 
 using namespace llvm;
@@ -58,8 +59,7 @@ template <> struct MappingTraits {
     if (auto *Serializer = dyn_cast(
             reinterpret_cast(io.getContext()))) {
-      assert(Serializer->StrTab.hasValue() &&
-             "YAMLStrTabSerializer with no StrTab.");
+      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
       StringTable &StrTab = *Serializer->StrTab;
       unsigned PassID = StrTab.add(Remark->PassName).first;
       unsigned NameID = StrTab.add(Remark->RemarkName).first;
@@ -83,8 +83,7 @@ template <> struct MappingTraits {
     if (auto *Serializer = dyn_cast(
             reinterpret_cast(io.getContext()))) {
-      assert(Serializer->StrTab.hasValue() &&
-             "YAMLStrTabSerializer with no StrTab.");
+      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
       StringTable &StrTab = *Serializer->StrTab;
       unsigned FileID = StrTab.add(File).first;
       io.mapRequired("File", FileID);
@@ -138,8 +137,7 @@ template <> struct MappingTraits {
     if (auto *Serializer = dyn_cast(
             reinterpret_cast(io.getContext()))) {
-      assert(Serializer->StrTab.hasValue() &&
-             "YAMLStrTabSerializer with no StrTab.");
+      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
       StringTable &StrTab = *Serializer->StrTab;
       auto ValueID = StrTab.add(A.Val).first;
       io.mapRequired(A.Key.data(), ValueID);
diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp
index cdf7c8ade9aa..e2579bf53260 100644
--- a/llvm/lib/Support/AArch64TargetParser.cpp
+++ b/llvm/lib/Support/AArch64TargetParser.cpp
@@ -64,62 +64,14 @@ bool AArch64::getExtensionFeatures(uint64_t Extensions,
   if (Extensions == AArch64::AEK_INVALID)
     return false;
 
-  if (Extensions & AEK_FP)
-    Features.push_back("+fp-armv8");
-  if (Extensions & AEK_SIMD)
-    Features.push_back("+neon");
-  if (Extensions & AEK_CRC)
-    Features.push_back("+crc");
-  if (Extensions & AEK_CRYPTO)
-    Features.push_back("+crypto");
-  if (Extensions & AEK_DOTPROD)
-    Features.push_back("+dotprod");
-  if (Extensions & AEK_FP16FML)
-    Features.push_back("+fp16fml");
-  if (Extensions & AEK_FP16)
-    Features.push_back("+fullfp16");
-  if (Extensions & AEK_PROFILE)
-    Features.push_back("+spe");
-  if (Extensions & AEK_RAS)
-    Features.push_back("+ras");
-  if (Extensions & AEK_LSE)
-    Features.push_back("+lse");
-  if (Extensions & AEK_RDM)
-    Features.push_back("+rdm");
-  if (Extensions & AEK_SVE)
-    Features.push_back("+sve");
-  if (Extensions & AEK_SVE2)
-    Features.push_back("+sve2");
-  if (Extensions & AEK_SVE2AES)
-    Features.push_back("+sve2-aes");
-  if (Extensions & AEK_SVE2SM4)
-    Features.push_back("+sve2-sm4");
-  if (Extensions & AEK_SVE2SHA3)
-    Features.push_back("+sve2-sha3");
-  if (Extensions & AEK_SVE2BITPERM)
-    Features.push_back("+sve2-bitperm");
-  if (Extensions & AArch64::AEK_TME)
-    Features.push_back("+tme");
-  if (Extensions & AEK_RCPC)
-    Features.push_back("+rcpc");
-  if (Extensions & AEK_BRBE)
-    Features.push_back("+brbe");
-  if (Extensions & AEK_PAUTH)
-    Features.push_back("+pauth");
-  if (Extensions & AEK_FLAGM)
-    Features.push_back("+flagm");
-  if (Extensions & AArch64::AEK_SME)
-    Features.push_back("+sme");
-  if (Extensions & AArch64::AEK_SMEF64)
-    Features.push_back("+sme-f64");
-  if (Extensions & AArch64::AEK_SMEI64)
-    Features.push_back("+sme-i64");
-  if (Extensions & AArch64::AEK_HBC)
-    Features.push_back("+hbc");
-  if (Extensions & AArch64::AEK_MOPS)
-    Features.push_back("+mops");
-  if (Extensions & AArch64::AEK_PERFMON)
-    Features.push_back("+perfmon");
+#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)                   \
+  if (Extensions & ID) {                                                       \
+    const char *feature = FEATURE;                                             \
+    /* INVALID and NONE have no feature name. */                               \
+    if (feature)                                                               \
+      Features.push_back(feature);                                             \
+  }
+#include "../../include/llvm/Support/AArch64TargetParser.def"
 
   return true;
 }
diff --git a/llvm/lib/Support/APFixedPoint.cpp b/llvm/lib/Support/APFixedPoint.cpp
index 61b30b5c5c60..f1d07184793c 100644
--- a/llvm/lib/Support/APFixedPoint.cpp
+++ b/llvm/lib/Support/APFixedPoint.cpp
@@ -233,11 +233,11 @@ APFixedPoint APFixedPoint::mul(const APFixedPoint &Other,
   // Widen the LHS and RHS so we can perform a full multiplication.
   unsigned Wide = CommonFXSema.getWidth() * 2;
   if (CommonFXSema.isSigned()) {
-    ThisVal = ThisVal.sextOrSelf(Wide);
-    OtherVal = OtherVal.sextOrSelf(Wide);
+    ThisVal = ThisVal.sext(Wide);
+    OtherVal = OtherVal.sext(Wide);
   } else {
-    ThisVal = ThisVal.zextOrSelf(Wide);
-    OtherVal = OtherVal.zextOrSelf(Wide);
+    ThisVal = ThisVal.zext(Wide);
+    OtherVal = OtherVal.zext(Wide);
   }
 
   // Perform the full multiplication and downscale to get the same scale.
@@ -290,11 +290,11 @@ APFixedPoint APFixedPoint::div(const APFixedPoint &Other,
   // Widen the LHS and RHS so we can perform a full division.
   unsigned Wide = CommonFXSema.getWidth() * 2;
   if (CommonFXSema.isSigned()) {
-    ThisVal = ThisVal.sextOrSelf(Wide);
-    OtherVal = OtherVal.sextOrSelf(Wide);
+    ThisVal = ThisVal.sext(Wide);
+    OtherVal = OtherVal.sext(Wide);
   } else {
-    ThisVal = ThisVal.zextOrSelf(Wide);
-    OtherVal = OtherVal.zextOrSelf(Wide);
+    ThisVal = ThisVal.zext(Wide);
+    OtherVal = OtherVal.zext(Wide);
   }
 
   // Upscale to compensate for the loss of precision from division, and
@@ -340,9 +340,9 @@ APFixedPoint APFixedPoint::shl(unsigned Amt, bool *Overflow) const {
   // Widen the LHS.
   unsigned Wide = Sema.getWidth() * 2;
   if (Sema.isSigned())
-    ThisVal = ThisVal.sextOrSelf(Wide);
+    ThisVal = ThisVal.sext(Wide);
   else
-    ThisVal = ThisVal.zextOrSelf(Wide);
+    ThisVal = ThisVal.zext(Wide);
 
   // Clamp the shift amount at the original width, and perform the shift.
   Amt = std::min(Amt, ThisVal.getBitWidth());
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 4b75c9db8526..2ae28fe066cd 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -2213,8 +2213,11 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
   // when truncating from PowerPC double-double to double format), the
   // right shift could lose result mantissa bits. Adjust exponent instead
   // of performing excessive shift.
+  // Also do a similar trick in case shifting denormal would produce zero
+  // significand as this case isn't handled correctly by normalize.
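The getExtensionFeatures rewrite above replaces a hand-written if-chain with an X-macro expansion over the .def table, so the extension list lives in one place. The same idea in miniature, with an invented three-entry table rather than the real AArch64 extension list:

#include <cstdint>
#include <vector>

// The table lives in one macro; each use site defines the per-entry macro to
// extract what it needs (entries here are illustrative only).
#define FOR_EACH_EXT(X)                                                        \
  X("fp", 1ULL << 0, "+fp-armv8")                                              \
  X("simd", 1ULL << 1, "+neon")                                                \
  X("crc", 1ULL << 2, "+crc")

std::vector<const char *> getFeatures(uint64_t Extensions) {
  std::vector<const char *> Features;
#define ADD_FEATURE(NAME, ID, FEATURE)                                         \
  if (Extensions & (ID))                                                       \
    Features.push_back(FEATURE);
  FOR_EACH_EXT(ADD_FEATURE)
#undef ADD_FEATURE
  return Features;
}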
   if (shift < 0 && isFiniteNonZero()) {
-    int exponentChange = significandMSB() + 1 - fromSemantics.precision;
+    int omsb = significandMSB() + 1;
+    int exponentChange = omsb - fromSemantics.precision;
     if (exponent + exponentChange < toSemantics.minExponent)
       exponentChange = toSemantics.minExponent - exponent;
     if (exponentChange < shift)
@@ -2222,6 +2225,10 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
     if (exponentChange < 0) {
       shift -= exponentChange;
       exponent += exponentChange;
+    } else if (omsb <= -shift) {
+      exponentChange = omsb + shift - 1; // leave at least one bit set
+      shift -= exponentChange;
+      exponent += exponentChange;
     }
   }
 
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index b536e9a9a6d0..f74178b1ba4e 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -343,7 +343,7 @@ void APInt::flipAllBitsSlowCase() {
 /// In the slow case, we know the result is large.
 APInt APInt::concatSlowCase(const APInt &NewLSB) const {
   unsigned NewWidth = getBitWidth() + NewLSB.getBitWidth();
-  APInt Result = NewLSB.zextOrSelf(NewWidth);
+  APInt Result = NewLSB.zext(NewWidth);
   Result.insertBits(*this, NewLSB.getBitWidth());
   return Result;
 }
@@ -502,12 +502,51 @@ uint64_t APInt::extractBitsAsZExtValue(unsigned numBits,
   return retBits;
 }
 
+unsigned APInt::getSufficientBitsNeeded(StringRef Str, uint8_t Radix) {
+  assert(!Str.empty() && "Invalid string length");
+  size_t StrLen = Str.size();
+
+  // Each computation below needs to know if it's negative.
+  unsigned IsNegative = false;
+  if (Str[0] == '-' || Str[0] == '+') {
+    IsNegative = Str[0] == '-';
+    StrLen--;
+    assert(StrLen && "String is only a sign, needs a value.");
+  }
+
+  // For radixes of power-of-two values, the bits required is accurately and
+  // easily computed.
+  if (Radix == 2)
+    return StrLen + IsNegative;
+  if (Radix == 8)
+    return StrLen * 3 + IsNegative;
+  if (Radix == 16)
+    return StrLen * 4 + IsNegative;
+
+  // Compute a sufficient number of bits that is always large enough but might
+  // be too large. This avoids the assertion in the constructor. This
+  // calculation doesn't work appropriately for the numbers 0-9, so just use 4
+  // bits in that case.
+  if (Radix == 10)
+    return (StrLen == 1 ? 4 : StrLen * 64 / 18) + IsNegative;
+
+  assert(Radix == 36);
+  return (StrLen == 1 ? 7 : StrLen * 16 / 3) + IsNegative;
+}
+
 unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
-  assert(!str.empty() && "Invalid string length");
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
-          radix == 36) &&
-         "Radix should be 2, 8, 10, 16, or 36!");
+  // Compute a sufficient number of bits that is always large enough but might
+  // be too large.
+  unsigned sufficient = getSufficientBitsNeeded(str, radix);
+
+  // For bases 2, 8, and 16, the sufficient number of bits is exact and we can
+  // return the value directly. For bases 10 and 36, we need to do extra work.
+  if (radix == 2 || radix == 8 || radix == 16)
+    return sufficient;
+  // This is grossly inefficient but accurate. We could probably do something
+  // with a computation of roughly slen*64/20 and then adjust by the value of
+  // the first few digits. But, I'm not sure how accurate that could be.
   size_t slen = str.size();
 
   // Each computation below needs to know if it's negative.
@@ -519,28 +558,6 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
     assert(slen && "String is only a sign, needs a value.");
   }
 
-  // For radixes of power-of-two values, the bits required is accurately and
-  // easily computed
-  if (radix == 2)
-    return slen + isNegative;
-  if (radix == 8)
-    return slen * 3 + isNegative;
-  if (radix == 16)
-    return slen * 4 + isNegative;
-
-  // FIXME: base 36
-
-  // This is grossly inefficient but accurate. We could probably do something
-  // with a computation of roughly slen*64/20 and then adjust by the value of
-  // the first few digits. But, I'm not sure how accurate that could be.
-
-  // Compute a sufficient number of bits that is always large enough but might
-  // be too large. This avoids the assertion in the constructor. This
-  // calculation doesn't work appropriately for the numbers 0-9, so just use 4
-  // bits in that case.
-  unsigned sufficient
-    = radix == 10? (slen == 1 ? 4 : slen * 64/18)
-                 : (slen == 1 ? 7 : slen * 16/3);
 
   // Convert to the actual binary value.
   APInt tmp(sufficient, StringRef(p, slen), radix);
@@ -595,7 +612,7 @@ APInt APInt::getLoBits(unsigned numBits) const {
 APInt APInt::getSplat(unsigned NewLen, const APInt &V) {
   assert(NewLen >= V.getBitWidth() && "Can't splat to smaller bit width!");
 
-  APInt Val = V.zextOrSelf(NewLen);
+  APInt Val = V.zext(NewLen);
   for (unsigned I = V.getBitWidth(); I < NewLen; I <<= 1)
     Val |= Val << I;
 
@@ -879,11 +896,14 @@ double APInt::roundToDouble(bool isSigned) const {
 
 // Truncate to new width.
 APInt APInt::trunc(unsigned width) const {
-  assert(width < BitWidth && "Invalid APInt Truncate request");
+  assert(width <= BitWidth && "Invalid APInt Truncate request");
 
   if (width <= APINT_BITS_PER_WORD)
     return APInt(width, getRawData()[0]);
 
+  if (width == BitWidth)
+    return *this;
+
   APInt Result(getMemory(getNumWords(width)), width);
 
   // Copy full words.
@@ -901,7 +921,7 @@ APInt APInt::trunc(unsigned width) const {
 
 // Truncate to new width with unsigned saturation.
 APInt APInt::truncUSat(unsigned width) const {
-  assert(width < BitWidth && "Invalid APInt Truncate request");
+  assert(width <= BitWidth && "Invalid APInt Truncate request");
 
   // Can we just losslessly truncate it?
   if (isIntN(width))
@@ -912,7 +932,7 @@ APInt APInt::truncUSat(unsigned width) const {
 
 // Truncate to new width with signed saturation.
 APInt APInt::truncSSat(unsigned width) const {
-  assert(width < BitWidth && "Invalid APInt Truncate request");
+  assert(width <= BitWidth && "Invalid APInt Truncate request");
 
   // Can we just losslessly truncate it?
   if (isSignedIntN(width))
@@ -924,11 +944,14 @@ APInt APInt::truncSSat(unsigned width) const {
 
 // Sign extend to a new width.
 APInt APInt::sext(unsigned Width) const {
-  assert(Width > BitWidth && "Invalid APInt SignExtend request");
+  assert(Width >= BitWidth && "Invalid APInt SignExtend request");
 
   if (Width <= APINT_BITS_PER_WORD)
     return APInt(Width, SignExtend64(U.VAL, BitWidth));
 
+  if (Width == BitWidth)
+    return *this;
+
   APInt Result(getMemory(getNumWords(Width)), Width);
 
   // Copy words.
@@ -948,11 +971,14 @@ APInt APInt::sext(unsigned Width) const {
 
 // Zero extend to a new width.
 APInt APInt::zext(unsigned width) const {
-  assert(width > BitWidth && "Invalid APInt ZeroExtend request");
+  assert(width >= BitWidth && "Invalid APInt ZeroExtend request");
 
   if (width <= APINT_BITS_PER_WORD)
     return APInt(width, U.VAL);
 
+  if (width == BitWidth)
+    return *this;
+
   APInt Result(getMemory(getNumWords(width)), width);
 
   // Copy words.
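getSufficientBitsNeeded above over-approximates the width needed to parse a numeral: exact for radixes 2, 8, and 16, while for 10 and 36 it scales the digit count by a ratio safely above log2 of the radix (64/18 ≈ 3.56 > log2 10 ≈ 3.32; 16/3 ≈ 5.33 > log2 36 ≈ 5.17). A standalone version of the same estimate for C strings:

#include <cassert>
#include <cstring>

// Over-approximate the bit width needed to hold Str parsed in base Radix.
unsigned sufficientBits(const char *Str, unsigned Radix) {
  size_t Len = std::strlen(Str);
  unsigned Neg = (Str[0] == '-' || Str[0] == '+');
  Len -= Neg;
  assert(Len && "string is only a sign");
  if (Radix == 2)
    return Len + Neg;
  if (Radix == 8)
    return Len * 3 + Neg;
  if (Radix == 16)
    return Len * 4 + Neg;
  if (Radix == 10)
    return (Len == 1 ? 4 : Len * 64 / 18) + Neg;
  assert(Radix == 36);
  return (Len == 1 ? 7 : Len * 16 / 3) + Neg;
}
// e.g. sufficientBits("999", 10) == 10, which suffices since 999 < 2^10.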
@@ -981,24 +1007,6 @@ APInt APInt::sextOrTrunc(unsigned width) const { return *this; } -APInt APInt::truncOrSelf(unsigned width) const { - if (BitWidth > width) - return trunc(width); - return *this; -} - -APInt APInt::zextOrSelf(unsigned width) const { - if (BitWidth < width) - return zext(width); - return *this; -} - -APInt APInt::sextOrSelf(unsigned width) const { - if (BitWidth < width) - return sext(width); - return *this; -} - /// Arithmetic right-shift this APInt by shiftAmt. /// Arithmetic right-shift function. void APInt::ashrInPlace(const APInt &shiftAmt) { @@ -2960,7 +2968,8 @@ llvm::APIntOps::GetMostSignificantDifferentBit(const APInt &A, const APInt &B) { return A.getBitWidth() - ((A ^ B).countLeadingZeros() + 1); } -APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth) { +APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth, + bool MatchAllBits) { unsigned OldBitWidth = A.getBitWidth(); assert((((OldBitWidth % NewBitWidth) == 0) || ((NewBitWidth % OldBitWidth) == 0)) && @@ -2984,11 +2993,16 @@ APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth) { if (A[i]) NewA.setBits(i * Scale, (i + 1) * Scale); } else { - // Merge bits - if any old bit is set, then set scale equivalent new bit. unsigned Scale = OldBitWidth / NewBitWidth; - for (unsigned i = 0; i != NewBitWidth; ++i) - if (!A.extractBits(Scale, i * Scale).isZero()) - NewA.setBit(i); + for (unsigned i = 0; i != NewBitWidth; ++i) { + if (MatchAllBits) { + if (A.extractBits(Scale, i * Scale).isAllOnes()) + NewA.setBit(i); + } else { + if (!A.extractBits(Scale, i * Scale).isZero()) + NewA.setBit(i); + } + } } return NewA; diff --git a/llvm/lib/Support/ARMAttributeParser.cpp b/llvm/lib/Support/ARMAttributeParser.cpp index 9ba224cee0ca..adb5d3f0964d 100644 --- a/llvm/lib/Support/ARMAttributeParser.cpp +++ b/llvm/lib/Support/ARMAttributeParser.cpp @@ -87,7 +87,7 @@ Error ARMAttributeParser::CPU_arch(AttrType tag) { "ARM v6KZ", "ARM v6T2", "ARM v6K", "ARM v7", "ARM v6-M", "ARM v6S-M", "ARM v7E-M", "ARM v8", nullptr, "ARM v8-M Baseline", "ARM v8-M Mainline", nullptr, nullptr, nullptr, - "ARM v8.1-M Mainline" + "ARM v8.1-M Mainline", "ARM v9-A" }; return parseStringAttribute("CPU_arch", tag, makeArrayRef(strings)); } diff --git a/llvm/lib/Support/ARMWinEH.cpp b/llvm/lib/Support/ARMWinEH.cpp index 8e7fa1149082..29c7a28541f2 100644 --- a/llvm/lib/Support/ARMWinEH.cpp +++ b/llvm/lib/Support/ARMWinEH.cpp @@ -11,22 +11,35 @@ namespace llvm { namespace ARM { namespace WinEH { -std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF) { +std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF, + bool Prologue) { uint8_t NumRegisters = RF.Reg(); uint8_t RegistersVFP = RF.R(); uint8_t LinkRegister = RF.L(); uint8_t ChainedFrame = RF.C(); - uint16_t GPRMask = (ChainedFrame << 11) | (LinkRegister << 14); + uint16_t GPRMask = (ChainedFrame << 11); uint32_t VFPMask = 0; + if (Prologue) { + GPRMask |= (LinkRegister << 14); + } else { + // If Ret != 0, we pop into Lr and return later + if (RF.Ret() != ReturnType::RT_POP) + GPRMask |= (LinkRegister << 14); + else if (!RF.H()) // If H == 0, we pop directly into Pc + GPRMask |= (LinkRegister << 15); + // else, Ret == 0 && H == 1, we pop into Pc separately afterwards + } + if (RegistersVFP) VFPMask |= (((1 << ((NumRegisters + 1) % 8)) - 1) << 8); else GPRMask |= (((1 << (NumRegisters + 1)) - 1) << 4); - if (PrologueFolding(RF)) - GPRMask |= (((1 << (NumRegisters + 1)) - 1) << (~RF.StackAdjust() & 0x3)); + if ((PrologueFolding(RF) && Prologue) ||
(EpilogueFolding(RF) && !Prologue)) + GPRMask |= (((1 << ((RF.StackAdjust() & 0x3) + 1)) - 1) + << (~RF.StackAdjust() & 0x3)); return std::make_pair(GPRMask, VFPMask); } diff --git a/llvm/lib/Support/AddressRanges.cpp b/llvm/lib/Support/AddressRanges.cpp new file mode 100644 index 000000000000..5ba011bac4e9 --- /dev/null +++ b/llvm/lib/Support/AddressRanges.cpp @@ -0,0 +1,59 @@ +//===- AddressRanges.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/AddressRanges.h" +#include "llvm/ADT/STLExtras.h" +#include + +using namespace llvm; + +void AddressRanges::insert(AddressRange Range) { + if (Range.size() == 0) + return; + + auto It = llvm::upper_bound(Ranges, Range); + auto It2 = It; + while (It2 != Ranges.end() && It2->start() < Range.end()) + ++It2; + if (It != It2) { + Range = {Range.start(), std::max(Range.end(), It2[-1].end())}; + It = Ranges.erase(It, It2); + } + if (It != Ranges.begin() && Range.start() < It[-1].end()) + It[-1] = {It[-1].start(), std::max(It[-1].end(), Range.end())}; + else + Ranges.insert(It, Range); +} + +bool AddressRanges::contains(uint64_t Addr) const { + auto It = std::partition_point( + Ranges.begin(), Ranges.end(), + [=](const AddressRange &R) { return R.start() <= Addr; }); + return It != Ranges.begin() && Addr < It[-1].end(); +} + +bool AddressRanges::contains(AddressRange Range) const { + if (Range.size() == 0) + return false; + auto It = std::partition_point( + Ranges.begin(), Ranges.end(), + [=](const AddressRange &R) { return R.start() <= Range.start(); }); + if (It == Ranges.begin()) + return false; + return Range.end() <= It[-1].end(); +} + +Optional<AddressRange> +AddressRanges::getRangeThatContains(uint64_t Addr) const { + auto It = std::partition_point( + Ranges.begin(), Ranges.end(), + [=](const AddressRange &R) { return R.start() <= Addr; }); + if (It != Ranges.begin() && Addr < It[-1].end()) + return It[-1]; + return llvm::None; +} diff --git a/llvm/lib/Support/BLAKE3/LICENSE b/llvm/lib/Support/BLAKE3/LICENSE new file mode 100644 index 000000000000..f5892efc3b9b --- /dev/null +++ b/llvm/lib/Support/BLAKE3/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work").
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/llvm/lib/Support/BLAKE3/README.md b/llvm/lib/Support/BLAKE3/README.md new file mode 100644 index 000000000000..319a7514e8b5 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/README.md @@ -0,0 +1,296 @@ +Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c + +# Example + +An example program that hashes bytes from standard input and prints the +result: + +Using the C++ API: + +```c++ +#include "llvm/Support/BLAKE3.h" +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +int main() { + // Initialize the hasher. + llvm::BLAKE3 hasher; + + // Read input bytes from stdin. + char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + hasher.update(llvm::StringRef(buf, n)); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + exit(1); + } + } + + // Finalize the hash. Default output length is 32 bytes. + auto output = hasher.final(); + + // Print the hash as hexadecimal. + for (uint8_t byte : output) { + printf("%02x", byte); + } + printf("\n"); + return 0; +} +``` + +Using the C API: + +```c +#include "llvm-c/blake3.h" +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +int main() { + // Initialize the hasher. + llvm_blake3_hasher hasher; + llvm_blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + llvm_blake3_hasher_update(&hasher, buf, n); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + exit(1); + } + } + + // Finalize the hash. LLVM_BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[LLVM_BLAKE3_OUT_LEN]; + llvm_blake3_hasher_finalize(&hasher, output, LLVM_BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < LLVM_BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} +``` + +# API + +## The Class/Struct + +```c++ +class BLAKE3 { + // API +private: + llvm_blake3_hasher Hasher; +}; +``` +```c +typedef struct { + // private fields +} llvm_blake3_hasher; +``` + +An incremental BLAKE3 hashing state, which can accept any number of +updates. This implementation doesn't allocate any heap memory, but +`sizeof(llvm_blake3_hasher)` itself is relatively large, currently 1912 bytes +on x86-64. This size can be reduced by restricting the maximum input +length, as described in Section 5.4 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), +but this implementation doesn't currently support that strategy. + +## Common API Functions + +```c++ +BLAKE3::BLAKE3(); + +void BLAKE3::init(); +``` +```c +void llvm_blake3_hasher_init( + llvm_blake3_hasher *self); +``` + +Initialize a `llvm_blake3_hasher` in the default hashing mode. + +--- + +```c++ +void BLAKE3::update(ArrayRef<uint8_t> Data); + +void BLAKE3::update(StringRef Str); +``` +```c +void llvm_blake3_hasher_update( + llvm_blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher. This can be called any number of times. + +--- + +```c++ +template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN> +using BLAKE3Result = std::array<uint8_t, NumBytes>; + +template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN> +void BLAKE3::final(BLAKE3Result<NumBytes> &Result); + +template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN> +BLAKE3Result<NumBytes> BLAKE3::final(); +``` +```c +void llvm_blake3_hasher_finalize( + const llvm_blake3_hasher *self, + uint8_t *out, + size_t out_len); +``` + +Finalize the hasher and return an output of any length, given in bytes.
+This doesn't modify the hasher itself, and it's possible to finalize +again after adding more input. The constant `LLVM_BLAKE3_OUT_LEN` provides +the default output length, 32 bytes, which is recommended for most +callers. + +Outputs shorter than the default length of 32 bytes (256 bits) provide +less security. An N-bit BLAKE3 output is intended to provide N bits of +first and second preimage resistance and N/2 bits of collision +resistance, for any N up to 256. Longer outputs don't provide any +additional security. + +Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly +requesting a short output is equivalent to truncating the default-length +output. (Note that this is different between BLAKE2 and BLAKE3.) + +## Less Common API Functions + +```c +void llvm_blake3_hasher_init_keyed( + llvm_blake3_hasher *self, + const uint8_t key[LLVM_BLAKE3_KEY_LEN]); +``` + +Initialize a `llvm_blake3_hasher` in the keyed hashing mode. The key must be +exactly 32 bytes. + +--- + +```c +void llvm_blake3_hasher_init_derive_key( + llvm_blake3_hasher *self, + const char *context); +``` + +Initialize a `llvm_blake3_hasher` in the key derivation mode. The context +string is given as an initialization parameter, and afterwards input key +material should be given with `llvm_blake3_hasher_update`. The context string +is a null-terminated C string which should be **hardcoded, globally +unique, and application-specific**. The context string should not +include any dynamic input like salts, nonces, or identifiers read from a +database at runtime. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com +2019-12-25 16:18:03 session tokens v1"`. + +This function is intended for application code written in C. For +language bindings, see `llvm_blake3_hasher_init_derive_key_raw` below. + +--- + +```c +void llvm_blake3_hasher_init_derive_key_raw( + llvm_blake3_hasher *self, + const void *context, + size_t context_len); +``` + +As `llvm_blake3_hasher_init_derive_key` above, except that the context string +is given as a pointer to an array of arbitrary bytes with a provided +length. This is intended for writing language bindings, where C string +conversion would add unnecessary overhead and new error cases. Unicode +strings should be encoded as UTF-8. + +Application code in C should prefer `llvm_blake3_hasher_init_derive_key`, +which takes the context as a C string. If you need to use arbitrary +bytes as a context string in application code, consider whether you're +violating the requirement that context strings should be hardcoded. + +--- + +```c +void llvm_blake3_hasher_finalize_seek( + const llvm_blake3_hasher *self, + uint64_t seek, + uint8_t *out, + size_t out_len); +``` + +The same as `llvm_blake3_hasher_finalize`, but with an additional `seek` +parameter for the starting byte position in the output stream. To +efficiently stream a large output without allocating memory, call this +function in a loop, incrementing `seek` by the output length each time. + +--- + +```c +void llvm_blake3_hasher_reset( + llvm_blake3_hasher *self); +``` + +Reset the hasher to its initial state, prior to any calls to +`llvm_blake3_hasher_update`. Currently this is no different from calling +`llvm_blake3_hasher_init` or similar again. However, if this implementation gains +multithreading support in the future, and if `llvm_blake3_hasher` holds (optional) +threading resources, this function will reuse those resources. 
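For a sense of how the less common functions compose, here is a small usage sketch (not from the vendored README; the input string and output lengths are illustrative only) that streams 128 bytes of extended output in 32-byte pieces with `llvm_blake3_hasher_finalize_seek`:

```c
#include "llvm-c/blake3.h"
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main() {
  llvm_blake3_hasher hasher;
  llvm_blake3_hasher_init(&hasher);

  const char *msg = "example input"; // illustrative input
  llvm_blake3_hasher_update(&hasher, msg, strlen(msg));

  // Stream 128 bytes of output 32 bytes at a time. Finalizing doesn't
  // modify the hasher, and `seek` selects the starting byte of the output
  // stream, so the concatenated pieces equal one 128-byte finalize call.
  uint8_t piece[32];
  for (uint64_t seek = 0; seek < 128; seek += sizeof(piece)) {
    llvm_blake3_hasher_finalize_seek(&hasher, seek, piece, sizeof(piece));
    for (size_t i = 0; i < sizeof(piece); i++)
      printf("%02x", piece[i]);
  }
  printf("\n");
  return 0;
}
```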
+ + +# Building + +This implementation is just C and assembly files. + +## x86 + +Dynamic dispatch is enabled by default on x86. The implementation will +query the CPU at runtime to detect SIMD support, and it will use the +widest instruction set available. By default, `blake3_dispatch.c` +expects to be linked with code for five different instruction sets: +portable C, SSE2, SSE4.1, AVX2, and AVX-512. + +For each of the x86 SIMD instruction sets, four versions are available: +three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one +version using C intrinsics. The assembly versions are generally +preferred. They perform better, they perform more consistently across +different compilers, and they build more quickly. On the other hand, the +assembly versions are x86\_64-only, and you need to select the right +flavor for your target platform. + +## ARM NEON + +The NEON implementation is enabled by default on AArch64, but not on +other ARM targets, since not all of them support it. To enable it, set +`BLAKE3_USE_NEON=1`. + +To explicitly disable using NEON instructions on AArch64, set +`BLAKE3_USE_NEON=0`. + +## Other Platforms + +The portable implementation should work on most other architectures. + +# Multithreading + +The implementation doesn't currently support multithreading. diff --git a/llvm/lib/Support/BLAKE3/blake3.c b/llvm/lib/Support/BLAKE3/blake3.c new file mode 100644 index 000000000000..a369452a3e75 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3.c @@ -0,0 +1,627 @@ +/*===-- blake3.c - BLAKE3 C Implementation ------------------------*- C -*-===*\ +|* *| +|* Released into the public domain with CC0 1.0 *| +|* See 'llvm/lib/Support/BLAKE3/LICENSE' for info. *| +|* SPDX-License-Identifier: CC0-1.0 *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3_impl.h" + +const char *llvm_blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +
output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. 
Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible.
+ +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however.
Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version. + while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void llvm_blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void llvm_blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void llvm_blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + llvm_blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + llvm_blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void llvm_blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + llvm_blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. 
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void llvm_blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first.
+ if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + llvm_blake3_hasher_finalize_seek(self, 0, out, out_len); +#if LLVM_MEMORY_SANITIZER_BUILD + // Avoid false positives due to uninstrumented assembly code. + __msan_unpoison(out, out_len); +#endif +} + +void llvm_blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case.
+    cvs_remaining = self->cv_stack_len - 2;
+    output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
+                           self->chunk.flags);
+  }
+  while (cvs_remaining > 0) {
+    cvs_remaining -= 1;
+    uint8_t parent_block[BLAKE3_BLOCK_LEN];
+    memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
+    output_chaining_value(&output, &parent_block[32]);
+    output = parent_output(parent_block, self->key, self->chunk.flags);
+  }
+  output_root_bytes(&output, seek, out, out_len);
+}
+
+void llvm_blake3_hasher_reset(blake3_hasher *self) {
+  chunk_state_reset(&self->chunk, self->key, 0);
+  self->cv_stack_len = 0;
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2.c b/llvm/lib/Support/BLAKE3/blake3_avx2.c
new file mode 100644
index 000000000000..e76aa1a3aeb3
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2.c
@@ -0,0 +1,326 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 8
+
+INLINE __m256i loadu(const uint8_t src[32]) {
+  return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE void storeu(__m256i src, uint8_t dest[16]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
+
+INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
+
+INLINE __m256i rot16(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
+                         13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m256i rot12(__m256i x) {
+  return _mm256_or_si256(_mm256_srli_epi32(x, 12),
+                         _mm256_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m256i rot8(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
+                         12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m256i rot7(__m256i x) {
+  return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+}
+
+INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes.
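+  // (Editorial note, not from the upstream sources: the three interleave
+  // stages together form an 8x8 transpose of 32-bit words. Viewing the inputs
+  // as the rows of a matrix, the permutes below write out its columns, so on
+  // return vecs[j] holds word j of every original vector. hash8 relies on
+  // this to turn the eight transposed state vectors back into eight
+  // contiguous 32-byte output CVs.)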
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, 
block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(h_vecs);
+  storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
+  storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
+  storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
+  storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
+  storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
+  storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
+  storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+#if !defined(BLAKE3_NO_SSE41)
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+#else
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+#endif
+
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+#if !defined(BLAKE3_NO_SSE41)
+  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                         increment_counter, flags, flags_start, flags_end, out);
+#else
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+#endif
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
new file mode 100644
index 000000000000..449e07492832
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
@@ -0,0 +1,1826 @@
+#if defined(__x86_64__)
+
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+#ifdef __APPLE__
+#define HIDDEN .private_extern
+#else
+#define HIDDEN .hidden
+#endif
+
+.intel_syntax noprefix
+HIDDEN _blake3_hash_many_avx2
+HIDDEN blake3_hash_many_avx2
+.global _blake3_hash_many_avx2
+.global blake3_hash_many_avx2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+        .p2align 6
+_blake3_hash_many_avx2:
+blake3_hash_many_avx2:
+        _CET_ENDBR
+        push r15
+        push r14
+        push r13
+        push r12
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 680
+        and rsp, 0xFFFFFFFFFFFFFFC0
+        neg r9d
+        vmovd xmm0, r9d
+        vpbroadcastd ymm0, xmm0
+        vmovdqa ymmword ptr [rsp+0x280], ymm0
+        vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+        vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+        vmovdqa ymmword ptr [rsp+0x220], ymm2
+        vmovd xmm2, r8d
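+        # Editorial note, not in the upstream source: rdi..r9 carry the SysV
+        # arguments (inputs, num_inputs, blocks, key, counter,
+        # increment_counter). "neg r9d" above turns the bool into an all-ones
+        # lane mask, and the CMP_MSB_MASK xor/compare below derives the
+        # unsigned carry out of the per-lane low counter words so the high
+        # words can be adjusted, mirroring load_counters() in the intrinsics
+        # implementations.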
+ vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + 
vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 
+ vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword 
ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor 
ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd 
ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, 
ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld 
ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, 
ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + 
vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd 
ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 
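+        # Editorial note, not in the upstream source: this loop hashes a
+        # remainder of four inputs, keeping two hash states side by side in
+        # each ymm register (one per 128-bit lane) and consuming one 64-byte
+        # block from each input per iteration.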
+2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, 
ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, 
ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, 
xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + +#endif diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S new file mode 100644 index 000000000000..bb58d2ae64b1 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S @@ -0,0 +1,1817 @@ +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +.section .text + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x2D0], 
xmm6 + vmovdqa xmmword ptr [rsp+0x2E0], xmm7 + vmovdqa xmmword ptr [rsp+0x2F0], xmm8 + vmovdqa xmmword ptr [rsp+0x300], xmm9 + vmovdqa xmmword ptr [rsp+0x310], xmm10 + vmovdqa xmmword ptr [rsp+0x320], xmm11 + vmovdqa xmmword ptr [rsp+0x330], xmm12 + vmovdqa xmmword ptr [rsp+0x340], xmm13 + vmovdqa xmmword ptr [rsp+0x350], xmm14 + vmovdqa xmmword ptr [rsp+0x360], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x260], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x2A0], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2C0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2C0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], 
ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + 
vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb 
ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, 
ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 
20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd 
ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, 
ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + 
vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor 
ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 
+ vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x2A0] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] + vmovdqa ymmword ptr [rsp+0x220], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x240] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x2D0] + vmovdqa xmm7, xmmword ptr [rsp+0x2E0] + vmovdqa xmm8, xmmword ptr [rsp+0x2F0] + vmovdqa xmm9, xmmword ptr [rsp+0x300] + vmovdqa xmm10, xmmword ptr [rsp+0x310] + vmovdqa xmm11, xmmword ptr [rsp+0x320] + vmovdqa xmm12, xmmword ptr 
[rsp+0x330] + vmovdqa xmm13, xmmword ptr [rsp+0x340] + vmovdqa xmm14, xmmword ptr [rsp+0x350] + vmovdqa xmm15, xmmword ptr [rsp+0x360] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x2C0] + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, 
ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x260] + vmovaps xmm0, xmmword ptr [rsp+0x220] + vmovaps xmm1, xmmword ptr [rsp+0x230] + vmovaps xmm2, xmmword ptr [rsp+0x240] + vmovaps xmm3, xmmword ptr [rsp+0x250] + vblendvps xmm0, xmm0, xmm1, xmm8 + 
vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x220], xmm0 + vmovaps xmmword ptr [rsp+0x240], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x220] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x224] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x260] + vmovaps ymm0, ymmword ptr [rsp+0x220] + vmovups ymm1, ymmword ptr [rsp+0x228] + vmovaps ymm2, ymmword 
ptr [rsp+0x240] + vmovups ymm3, ymmword ptr [rsp+0x248] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x220], ymm0 + vmovaps ymmword ptr [rsp+0x240], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x220] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.section .rodata +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 
0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm new file mode 100644 index 000000000000..352298edd2e8 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], 
ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 
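+; AVX2 has no 32-bit lane-rotate instruction, so the byte-aligned rotations
+; in the BLAKE3 G function (>>> 16 and >>> 8) are done with vpshufb through
+; the ROT16/ROT8 tables in the data section, while the rotations by 12 and 7
+; are emulated with vpsrld/vpslld/vpor pairs below.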
+ vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, 
ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + 
vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, 
ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 
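+; The ymmword ptr [rsp+...] operands in the vpaddd steps throughout this loop
+; read the sixteen transposed message vectors stored at [rsp] through
+; [rsp+1E0H] by the transpose code above; the seven unrolled rounds consume
+; them in BLAKE3 message-schedule order instead of permuting registers in
+; place.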
+ vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, 
ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], 
ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld 
ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, 
ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword 
ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr 
[rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, 
ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 
0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp 
unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512.c b/llvm/lib/Support/BLAKE3/blake3_avx512.c new file mode 100644 index 000000000000..9c35b08c439a --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_avx512.c @@ -0,0 +1,1207 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu_128(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE __m512i loadu_512(const uint8_t src[64]) { + return _mm512_loadu_si512((const __m512i *)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[32]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void g1(__m128i
*row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
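+  // In scalar terms that fixed permutation is the standard BLAKE3 message
+  // permutation; as a sketch for reference (`prev` and `next` are
+  // illustrative names, not part of this file):
+  //
+  //   static const uint8_t PERM[16] = {2, 6,  3, 10, 7, 0,  4, 13,
+  //                                    1, 11, 12, 5,  9, 14, 15, 8};
+  //   for (size_t i = 0; i < 16; i++) next[i] = prev[PERM[i]];
+  //
+  // The shuffle/blend sequences in rounds 2-7 realize the same reordering on
+  // the packed vectors m0..m3, in the adjusted lane order described above
+  // diagonalize().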
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, 
_MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} + +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = 
rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? 
~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +static +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
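+  // That is, 32-byte output i is h_vecs[i] followed by h_vecs[i + 4]; the
+  // stores below interleave the halves so each output is contiguous in out.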
+ storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], 
m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
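+  // (In _mm256_permute2x128_si256, immediate 0x20 selects the low 128-bit
+  // lane of each source and 0x31 selects the high lane of each, so vecs[0..3]
+  // end up holding rows 0-3 and vecs[4..7] rows 4-7.)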
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? 
~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +static +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], v[13]); + h_vecs[6] = xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = 
xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
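+  // (The 512-bit unpack instructions operate independently within each
+  // 128-bit lane, which is why the index patterns in these comments repeat
+  // every four words; the 128-bit shuffles at the end undo that lane-local
+  // grouping.)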
+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lates. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. + __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. 
+ vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i add1 = _mm512_and_si512(mask, add0); + __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1); + __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); + __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1)); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), 
set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} 
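+
+// Usage sketch (illustrative, not part of the vendored source): callers hash
+// batches of equal-length inputs through blake3_hash_many_avx512. Hashing
+// four full 1 KiB chunks starting at chunk counter 0 looks roughly like:
+//
+//   const uint8_t *inputs[4] = {c0, c1, c2, c3}; // hypothetical chunk bufs
+//   uint8_t out[4 * BLAKE3_OUT_LEN];
+//   blake3_hash_many_avx512(inputs, 4, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
+//                           key, /*counter=*/0, /*increment_counter=*/true,
+//                           0, CHUNK_START, CHUNK_END, out);
+//
+// increment_counter is true when hashing chunks (input i uses counter + i)
+// and false when hashing parent nodes, which all use counter 0.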
+
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 16) {
+    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                         flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 16;
+    }
+    inputs += 16;
+    num_inputs -= 16;
+    out = &out[16 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 8) {
+    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 8;
+    }
+    inputs += 8;
+    num_inputs -= 8;
+    out = &out[8 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 4) {
+    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
+                    flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
new file mode 100644
index 000000000000..3afc0e2250e2
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
@@ -0,0 +1,2601 @@
+#if defined(__x86_64__)
+
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+#ifdef __APPLE__
+#define HIDDEN .private_extern
+#else
+#define HIDDEN .hidden
+#endif
+
+.intel_syntax noprefix
+HIDDEN _blake3_hash_many_avx512
+HIDDEN blake3_hash_many_avx512
+HIDDEN blake3_compress_in_place_avx512
+HIDDEN _blake3_compress_in_place_avx512
+HIDDEN blake3_compress_xof_avx512
+HIDDEN _blake3_compress_xof_avx512
+.global _blake3_hash_many_avx512
+.global blake3_hash_many_avx512
+.global blake3_compress_in_place_avx512
+.global _blake3_compress_in_place_avx512
+.global blake3_compress_xof_avx512
+.global _blake3_compress_xof_avx512
+
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+.p2align 6
+_blake3_hash_many_avx512:
+blake3_hash_many_avx512:
+        _CET_ENDBR
+        push r15
+        push r14
+        push r13
+        push r12
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 144
+        and rsp, 0xFFFFFFFFFFFFFFC0
+        neg r9
+        kmovw k1, r9d
+        vmovd xmm0, r8d
+        vpbroadcastd ymm0, xmm0
+        shr r8, 32
+        vmovd xmm1, r8d
+        vpbroadcastd ymm1, xmm1
+        vmovdqa ymm4, ymm1
+        vmovdqa ymm5, ymm1
+        vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+        vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+        vpcmpltud k2, ymm2, ymm0
+        vpcmpltud k3, ymm3, ymm0
+        vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+        vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+        knotw k2, k1
+        vmovdqa32 ymm2 {k2}, ymm0
+        vmovdqa32 ymm3 {k2}, ymm0
+        vmovdqa32 ymm4 {k2}, ymm1
+        vmovdqa32 ymm5 {k2}, ymm1
+        vmovdqa ymmword ptr [rsp], ymm2
+        vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+        vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+        vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+        shl rdx, 6
+        mov qword ptr [rsp+0x80], rdx
+        cmp rsi, 16
+        jc 3f
+2:
+        vpbroadcastd zmm0, dword ptr [rcx]
+        vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+        vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+        vpbroadcastd
zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword 
ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord 
zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord 
zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + 
vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + 
vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, 
zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, 
zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr 
[rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + 
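+ # Note: this is the 8-blocks-wide fallback taken when fewer than 16
+ # inputs remain. Each vmovups/vinsertf128 pair pulls one 16-byte strip
+ # of a message block from two of the eight inputs, and the vunpck*pd +
+ # vshufps groups transpose those strips so that ymm16-ymm31 end up
+ # holding message word i across all eight lanes.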
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd 
ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + 
vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, 
ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, 
ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + 
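+ # Note: in the diagonal half-rounds of this transposed layout the
+ # register groups simply rotate (ymm5,6,7,4 against ymm10,11,8,9
+ # here): with one state word per register there is nothing to shuffle
+ # inside the vectors, so diagonalization is expressed by renaming which
+ # register plays which state word.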
vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 
+ vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] 
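+ # Note: this 4-blocks-wide tail works in-lane instead of transposing:
+ # vbroadcasti32x4 replicates the key words across all four 128-bit
+ # lanes, and each vmovups plus three vinserti32x4 packs one 16-byte
+ # quarter of a block from each of the four inputs into one zmm, so the
+ # lane-local vpshufd/vprord steps below run four single-block
+ # compressions side by side.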
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, 
r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + 
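+ # Note: each vpaddd/vpxord/vprord cluster here is one half of the
+ # BLAKE3 quarter-round (the "G" function). A minimal scalar sketch of
+ # one full G, assuming uint32_t words and a rotr32() rotate-right
+ # helper (both names illustrative only):
+ #
+ #   static void g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+ #                 uint32_t mx, uint32_t my) {
+ #     *a += *b + mx;  *d = rotr32(*d ^ *a, 16);
+ #     *c += *d;       *b = rotr32(*b ^ *c, 12);
+ #     *a += *b + my;  *d = rotr32(*d ^ *a, 8);
+ #     *c += *d;       *b = rotr32(*b ^ *c, 7);
+ #   }
+ #
+ # xmm0-xmm3 hold the four state rows, so each instruction advances four
+ # G functions at once; the vpshufd 0x93/0x4E/0x39 shuffles diagonalize
+ # the state between the column and diagonal half-rounds, "mov al, 7"
+ # sets up the seven BLAKE3 rounds, and the vshufps/vpblendd block after
+ # "jz 9f" applies the message-word permutation in registers.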
vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu 
xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A + +#endif diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S new file mode 100644 index 000000000000..e10b9f36cbcc --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S @@ -0,0 +1,2615 @@ +.intel_syntax noprefix + +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +.section .text +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x90], xmm6 + vmovdqa xmmword ptr [rsp+0xA0], xmm7 + vmovdqa xmmword ptr [rsp+0xB0], xmm8 + 
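+ # Note: this Windows variant differs from the unix one above mainly in
+ # its prologue/epilogue: the Windows x64 calling convention makes
+ # xmm6-xmm15 callee-saved, so they are spilled to the aligned scratch
+ # area here, and the first four arguments arrive in rcx/rdx/r8/r9 and
+ # are moved below into rdi/rsi/rdx/rcx so the body can match the
+ # System V version.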
vmovdqa xmmword ptr [rsp+0xC0], xmm9 + vmovdqa xmmword ptr [rsp+0xD0], xmm10 + vmovdqa xmmword ptr [rsp+0xE0], xmm11 + vmovdqa xmmword ptr [rsp+0xF0], xmm12 + vmovdqa xmmword ptr [rsp+0x100], xmm13 + vmovdqa xmmword ptr [rsp+0x110], xmm14 + vmovdqa xmmword ptr [rsp+0x120], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + vmovdqa ymmword ptr [rsp+0x60], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr 
[INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr 
[BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 
16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, 
zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, 
zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + 
vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, 
zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + 
vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x90], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jne 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x90] + vmovdqa xmm7, xmmword ptr [rsp+0xA0] + vmovdqa xmm8, xmmword ptr [rsp+0xB0] + vmovdqa xmm9, xmmword ptr [rsp+0xC0] + vmovdqa xmm10, xmmword ptr [rsp+0xD0] + vmovdqa xmm11, xmmword ptr [rsp+0xE0] + vmovdqa xmm12, xmmword ptr [rsp+0xF0] + vmovdqa xmm13, xmmword ptr [rsp+0x100] + vmovdqa xmm14, xmmword ptr [rsp+0x110] + vmovdqa xmm15, xmmword ptr [rsp+0x120] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 
ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + 
vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord 
ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + 
vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + 
vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 
+ vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 2b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa 
ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x40] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x40], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x78] + movzx r12, byte ptr [rbp+0x88] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x40] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, 
zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq 
ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, 
xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+0x10], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + mov r10, qword ptr [rsp+0x78] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, 
xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+0x10], xmm1 + vmovdqu xmmword ptr [r10+0x20], xmm2 + vmovdqu xmmword ptr [r10+0x30], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + +.section .rodata +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm new file mode 100644 index 000000000000..b19efbaaeb36 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm @@ -0,0 +1,2634 @@ +public _blake3_hash_many_avx512 +public blake3_hash_many_avx512 +public blake3_compress_in_place_avx512 +public _blake3_compress_in_place_avx512 +public blake3_compress_xof_avx512 +public _blake3_compress_xof_avx512 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx512 PROC +_blake3_hash_many_avx512 PROC + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+90H], xmm6 + vmovdqa xmmword ptr [rsp+0A0H], xmm7 + vmovdqa xmmword ptr [rsp+0B0H], xmm8 + vmovdqa xmmword ptr [rsp+0C0H], xmm9 + vmovdqa xmmword ptr [rsp+0D0H], xmm10 + vmovdqa xmmword ptr [rsp+0E0H], xmm11 + vmovdqa xmmword ptr [rsp+0F0H], xmm12 + vmovdqa xmmword ptr [rsp+100H], xmm13 + vmovdqa xmmword ptr [rsp+110H], xmm14 + vmovdqa xmmword ptr [rsp+120H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] + vpcmpud k2, ymm2, ymm0, 1 + vpcmpud k3, ymm3, ymm0, 1 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
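+ ; (The commented-out forms below use the EVEX embedded-broadcast operand,
+ ; e.g. "dword ptr [ADD1] {1to8}", which ml64.exe rejects. Broadcasting ADD1
+ ; into a register first and then doing the masked vpaddd computes the same
+ ; result.)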
+ vpbroadcastd ymm6, dword ptr [ADD1] + vpaddd ymm4 {k2}, ymm4, ymm6 + vpaddd ymm5 {k3}, ymm5, ymm6 + ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} + ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+20H], ymm3 + vmovdqa ymmword ptr [rsp+40H], ymm4 + vmovdqa ymmword ptr [rsp+60H], ymm5 + shl rdx, 6 + mov qword ptr [rsp+80H], rdx + cmp rsi, 16 + jc final15blocks +outerloop16: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+1H*4H] + vpbroadcastd zmm2, dword ptr [rcx+2H*4H] + vpbroadcastd zmm3, dword ptr [rcx+3H*4H] + vpbroadcastd zmm4, dword ptr [rcx+4H*4H] + vpbroadcastd zmm5, dword ptr [rcx+5H*4H] + vpbroadcastd zmm6, dword ptr [rcx+6H*4H] + vpbroadcastd zmm7, dword ptr [rcx+7H*4H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop16: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0] + vmovdqa32 zmm31, zmmword ptr [INDEX1] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr 
[rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd zmm15, dword ptr [rsp+22H*4H] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + 
vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, 
zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 
+ vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, 
zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + 
vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + 
vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop16 + mov rbx, qword ptr [rbp+90H] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 88H + vshufi32x4 zmm17, zmm1, zmm5, 88H + vshufi32x4 zmm18, zmm2, zmm6, 88H + vshufi32x4 zmm19, zmm3, zmm7, 88H + vshufi32x4 zmm20, zmm0, zmm4, 0DDH + vshufi32x4 zmm21, zmm1, zmm5, 0DDH + vshufi32x4 zmm22, zmm2, zmm6, 0DDH + vshufi32x4 zmm23, 
zmm3, zmm7, 0DDH + vshufi32x4 zmm0, zmm16, zmm17, 88H + vshufi32x4 zmm1, zmm18, zmm19, 88H + vshufi32x4 zmm2, zmm20, zmm21, 88H + vshufi32x4 zmm3, zmm22, zmm23, 88H + vshufi32x4 zmm4, zmm16, zmm17, 0DDH + vshufi32x4 zmm5, zmm18, zmm19, 0DDH + vshufi32x4 zmm6, zmm20, zmm21, 0DDH + vshufi32x4 zmm7, zmm22, zmm23, 0DDH + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 + vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 + vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 + vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 + vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 + vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 + vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] + vmovdqa32 zmm2, zmm0 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. + vpbroadcastd zmm4, dword ptr [ADD16] + vpbroadcastd zmm5, dword ptr [ADD1] + vpaddd zmm2{k1}, zmm0, zmm4 + ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} + vpcmpud k2, zmm2, zmm0, 1 + vpaddd zmm1 {k2}, zmm1, zmm5 + ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+90H], rbx + sub rsi, 16 + cmp rsi, 16 + jnc outerloop16 + test rsi, rsi + jne final15blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+90H] + vmovdqa xmm7, xmmword ptr [rsp+0A0H] + vmovdqa xmm8, xmmword ptr [rsp+0B0H] + vmovdqa xmm9, xmmword ptr [rsp+0C0H] + vmovdqa xmm10, xmmword ptr [rsp+0D0H] + vmovdqa xmm11, xmmword ptr [rsp+0E0H] + vmovdqa xmm12, xmmword ptr [rsp+0F0H] + vmovdqa xmm13, xmmword ptr [rsp+100H] + vmovdqa xmm14, xmmword ptr [rsp+110H] + vmovdqa xmm15, xmmword ptr [rsp+120H] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final15blocks: + test esi, 8H + je final7blocks + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, 
xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+40H] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd ymm15, dword ptr [rsp+88H] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, 
ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, 
ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord 
ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord 
ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 
+ vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, 
ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+40H] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+40H], ymm2 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + add rdi, 64 + sub rsi, 8 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+80H] + movzx r13, byte ptr [rbp+78H] + movzx r12, byte ptr [rbp+88H] + test esi, 4H + je final3blocks 
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+40H] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0DCH + vpermq ymm15, ymm15, 0DCH + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] + vinserti64x4 zmm13, zmm14, ymm15, 01H + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+22H*4H] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-30H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-20H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-10H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 93H + vpshufd zmm7, zmm7, 93H + mov al, 7 +roundloop4: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 93H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 39H + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 39H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 93H + dec al + jz endroundloop4 + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0FH + vpshufd zmm4, zmm8, 39H + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 78H + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 1EH + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp roundloop4 +endroundloop4: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + 
vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H + vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H + vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H + vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test esi, 2H + je final1block + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+4H] + vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+88H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + 
vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx512 ENDP +blake3_hash_many_avx512 ENDP + +ALIGN 16 +blake3_compress_in_place_avx512 PROC +_blake3_compress_in_place_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 
+@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+10H], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_in_place_avx512 ENDP +blake3_compress_in_place_avx512 ENDP + +ALIGN 16 +blake3_compress_xof_avx512 PROC +_blake3_compress_xof_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + mov r10, qword ptr [rsp+78H] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq 
xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 88H
+        vpshufd xmm8, xmm8, 78H
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 1EH
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp     @B
+@@:
+        vpxor   xmm0, xmm0, xmm2
+        vpxor   xmm1, xmm1, xmm3
+        vpxor   xmm2, xmm2, xmmword ptr [rcx]
+        vpxor   xmm3, xmm3, xmmword ptr [rcx+10H]
+        vmovdqu xmmword ptr [r10], xmm0
+        vmovdqu xmmword ptr [r10+10H], xmm1
+        vmovdqu xmmword ptr [r10+20H], xmm2
+        vmovdqu xmmword ptr [r10+30H], xmm3
+        vmovdqa xmm6, xmmword ptr [rsp]
+        vmovdqa xmm7, xmmword ptr [rsp+10H]
+        vmovdqa xmm8, xmmword ptr [rsp+20H]
+        vmovdqa xmm9, xmmword ptr [rsp+30H]
+        add     rsp, 72
+        ret
+_blake3_compress_xof_avx512 ENDP
+blake3_compress_xof_avx512 ENDP
+
+_TEXT ENDS
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
+ALIGN   64
+INDEX0:
+        dd 0, 1, 2, 3, 16, 17, 18, 19
+        dd 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1:
+        dd 4, 5, 6, 7, 20, 21, 22, 23
+        dd 12, 13, 14, 15, 28, 29, 30, 31
+ADD0:
+        dd 0, 1, 2, 3, 4, 5, 6, 7
+        dd 8, 9, 10, 11, 12, 13, 14, 15
+ADD1:
+        dd 1
+ADD16:
+        dd 16
+BLAKE3_BLOCK_LEN:
+        dd 64
+ALIGN   64
+BLAKE3_IV:
+BLAKE3_IV_0:
+        dd 06A09E667H
+BLAKE3_IV_1:
+        dd 0BB67AE85H
+BLAKE3_IV_2:
+        dd 03C6EF372H
+BLAKE3_IV_3:
+        dd 0A54FF53AH
+
+_RDATA ENDS
+END
diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
new file mode 100644
index 000000000000..e96e714225f4
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@@ -0,0 +1,277 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "blake3_impl.h"
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <immintrin.h>
+#else
+#error "Unimplemented!"
+#endif
+#endif
+
+#define MAYBE_UNUSED(x) (void)((x))
+
+#if defined(IS_X86)
+static uint64_t xgetbv(void) {
+#if defined(_MSC_VER)
+  return _xgetbv(0);
+#else
+  uint32_t eax = 0, edx = 0;
+  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
+  return ((uint64_t)edx << 32) | eax;
+#endif
+}
+
+static void cpuid(uint32_t out[4], uint32_t id) {
+#if defined(_MSC_VER)
+  __cpuid((int *)out, id);
+#elif defined(__i386__) || defined(_M_IX86)
+  __asm__ __volatile__("movl %%ebx, %1\n"
+                       "cpuid\n"
+                       "xchgl %1, %%ebx\n"
+                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id));
+#else
+  __asm__ __volatile__("cpuid\n"
+                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id));
+#endif
+}
+
+static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
+#if defined(_MSC_VER)
+  __cpuidex((int *)out, id, sid);
+#elif defined(__i386__) || defined(_M_IX86)
+  __asm__ __volatile__("movl %%ebx, %1\n"
+                       "cpuid\n"
+                       "xchgl %1, %%ebx\n"
+                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id), "c"(sid));
+#else
+  __asm__ __volatile__("cpuid\n"
+                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id), "c"(sid));
+#endif
+}
+
+#endif
+
+enum cpu_feature {
+  SSE2 = 1 << 0,
+  SSSE3 = 1 << 1,
+  SSE41 = 1 << 2,
+  AVX = 1 << 3,
+  AVX2 = 1 << 4,
+  AVX512F = 1 << 5,
+  AVX512VL = 1 << 6,
+  /* ... */
+  UNDEFINED = 1 << 30
+};
+
+#if !defined(BLAKE3_TESTING)
+static /* Allow the variable to be controlled manually for testing */
+#endif
+    enum cpu_feature g_cpu_features = UNDEFINED;
+
+LLVM_ATTRIBUTE_USED
+#if !defined(BLAKE3_TESTING)
+static
+#endif
+    enum cpu_feature
+    get_cpu_features(void) {
+
+  if (g_cpu_features != UNDEFINED) {
+    return g_cpu_features;
+  } else {
+#if defined(IS_X86)
+    uint32_t regs[4] = {0};
+    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
+    (void)edx;
+    enum cpu_feature features = 0;
+    cpuid(regs, 0);
+    const int max_id = *eax;
+    cpuid(regs, 1);
+#if defined(__amd64__) || defined(_M_X64)
+    features |= SSE2;
+#else
+    if (*edx & (1UL << 26))
+      features |= SSE2;
+#endif
+    if (*ecx & (1UL << 0))
+      features |= SSSE3;
+    if (*ecx & (1UL << 19))
+      features |= SSE41;
+
+    if (*ecx & (1UL << 27)) { // OSXSAVE
+      const uint64_t mask = xgetbv();
+      if ((mask & 6) == 6) { // SSE and AVX states
+        if (*ecx & (1UL << 28))
+          features |= AVX;
+        if (max_id >= 7) {
+          cpuidex(regs, 7, 0);
+          if (*ebx & (1UL << 5))
+            features |= AVX2;
+          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
+            if (*ebx & (1UL << 31))
+              features |= AVX512VL;
+            if (*ebx & (1UL << 16))
+              features |= AVX512F;
+          }
+        }
+      }
+    }
+    g_cpu_features = features;
+    return features;
+#else
+    /* How to detect NEON? */
+    return 0;
+#endif
+  }
+}
+
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if (features & AVX512VL) {
+    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#endif
+  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+}
+
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if (features & AVX512VL) {
+    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#endif
+  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
+}
+
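Everything below funnels through these two wrappers, so the calling convention is worth pinning down. The following is a minimal sketch of a caller, an illustration only: IV, store_cv_words, and the CHUNK_START/CHUNK_END/ROOT flags are internal names from blake3_impl.h, which this patch adds further down, and real users should go through the public llvm-c/blake3.h API instead.

#include <stdint.h>
#include <string.h>
#include "blake3_impl.h"

// Hash one 64-byte block as a complete root chunk: start from the IV,
// mark the block as both the first and last block of its chunk, and
// finalize with ROOT. The dispatcher picks the best available compressor.
void compress_one_block_example(const uint8_t block[64], uint8_t out[32]) {
  uint32_t cv[8];
  memcpy(cv, IV, sizeof(cv));
  blake3_compress_in_place(cv, block, BLAKE3_BLOCK_LEN, /*counter=*/0,
                           CHUNK_START | CHUNK_END | ROOT);
  store_cv_words(out, cv); // serialize the chaining value little-endian
}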
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+  if (features & AVX2) {
+    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+                          increment_counter, flags, flags_start, flags_end,
+                          out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                           increment_counter, flags, flags_start, flags_end,
+                           out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+                          increment_counter, flags, flags_start, flags_end,
+                          out);
+    return;
+  }
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
+                        increment_counter, flags, flags_start, flags_end, out);
+  return;
+#endif
+
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+}
+
+// The dynamically detected SIMD degree of the current platform.
+size_t blake3_simd_degree(void) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+    return 16;
+  }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+  if (features & AVX2) {
+    return 8;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    return 4;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    return 4;
+  }
+#endif
+#endif
+#if BLAKE3_USE_NEON == 1
+  return 4;
+#endif
+  return 1;
+}
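blake3_simd_degree() is the batching hint for callers of blake3_hash_many: feeding batches of exactly this many inputs keeps the widest kernel busy. A hedged sketch follows; hash_chunks_example is hypothetical, and blake3_hash_many also accepts any smaller num_inputs and re-dispatches internally, so this is an optimization pattern rather than a requirement.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"

// Hash n single-block chunks, blake3_simd_degree() at a time. Each chunk
// gets a distinct counter (increment_counter=true), and CHUNK_START /
// CHUNK_END are applied to the first/last block of every input.
void hash_chunks_example(const uint8_t *const *chunks, size_t n,
                         const uint32_t key[8], uint8_t *out) {
  const size_t degree = blake3_simd_degree();
  uint64_t counter = 0;
  while (n > 0) {
    size_t batch = n < degree ? n : degree;
    blake3_hash_many(chunks, batch, /*blocks=*/1, key, counter,
                     /*increment_counter=*/true, /*flags=*/0,
                     CHUNK_START, CHUNK_END, out);
    chunks += batch;
    n -= batch;
    counter += batch;
    out += batch * BLAKE3_OUT_LEN;
  }
}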
diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h
new file mode 100644
index 000000000000..180d0a6eeda8
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@@ -0,0 +1,312 @@
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "llvm-c/blake3.h"
+// For \p LLVM_LIBRARY_VISIBILITY
+#include "llvm/Support/Compiler.h"
+
+// Remove the 'llvm_' prefix for the rest of the internal implementation.
+#define BLAKE3_VERSION_STRING LLVM_BLAKE3_VERSION_STRING
+#define BLAKE3_KEY_LEN LLVM_BLAKE3_KEY_LEN
+#define BLAKE3_OUT_LEN LLVM_BLAKE3_OUT_LEN
+#define BLAKE3_BLOCK_LEN LLVM_BLAKE3_BLOCK_LEN
+#define BLAKE3_CHUNK_LEN LLVM_BLAKE3_CHUNK_LEN
+#define BLAKE3_MAX_DEPTH LLVM_BLAKE3_MAX_DEPTH
+#define blake3_hasher llvm_blake3_hasher
+#define blake3_chunk_state llvm_blake3_chunk_state
+
+// internal flags
+enum blake3_flags {
+  CHUNK_START = 1 << 0,
+  CHUNK_END = 1 << 1,
+  PARENT = 1 << 2,
+  ROOT = 1 << 3,
+  KEYED_HASH = 1 << 4,
+  DERIVE_KEY_CONTEXT = 1 << 5,
+  DERIVE_KEY_MATERIAL = 1 << 6,
+};
+
+// This C implementation tries to support recent versions of GCC, Clang, and
+// MSVC.
+#if defined(_MSC_VER)
+#define INLINE static __forceinline
+#else
+#define INLINE static inline __attribute__((always_inline))
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define IS_X86
+#define IS_X86_64
+#endif
+
+#if defined(__i386__) || defined(_M_IX86)
+#define IS_X86
+#define IS_X86_32
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define IS_AARCH64
+#endif
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+#endif
+
+#if !defined(BLAKE3_USE_NEON)
+  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
+  #if defined(IS_AARCH64)
+    #define BLAKE3_USE_NEON 1
+  #else
+    #define BLAKE3_USE_NEON 0
+  #endif
+#endif
+
+#if defined(IS_X86)
+#define MAX_SIMD_DEGREE 16
+#elif BLAKE3_USE_NEON == 1
+#define MAX_SIMD_DEGREE 4
+#else
+#define MAX_SIMD_DEGREE 1
+#endif
+
+// There are some places where we want a static size that's equal to the
+// MAX_SIMD_DEGREE, but also at least 2.
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+
+static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
+                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
+                               0x1F83D9ABUL, 0x5BE0CD19UL};
+
+static const uint8_t MSG_SCHEDULE[7][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+/* Find index of the highest set bit */
+/* x is assumed to be nonzero.       */
+static unsigned int highest_one(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return 63 ^ __builtin_clzll(x);
+#elif defined(_MSC_VER) && defined(IS_X86_64)
+  unsigned long index;
+  _BitScanReverse64(&index, x);
+  return index;
+#elif defined(_MSC_VER) && defined(IS_X86_32)
+  if(x >> 32) {
+    unsigned long index;
+    _BitScanReverse(&index, (unsigned long)(x >> 32));
+    return 32 + index;
+  } else {
+    unsigned long index;
+    _BitScanReverse(&index, (unsigned long)x);
+    return index;
+  }
+#else
+  unsigned int c = 0;
+  if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
+  if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
+  if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
+  if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
+  if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
+  if(x & 0x0000000000000002ULL) {           c +=  1; }
+  return c;
+#endif
+}
+
+// Count the number of 1 bits.
+INLINE unsigned int popcnt(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return __builtin_popcountll(x);
+#else
+  unsigned int count = 0;
+  while (x != 0) {
+    count += 1;
+    x &= x - 1;
+  }
+  return count;
+#endif
+}
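highest_one and popcnt exist for the tree bookkeeping in the hasher (subtree sizes and the depth of the chaining-value stack). A small self-check against naive references, assuming it is compiled in the same translation unit as the helpers above:

#include <assert.h>
#include <stdint.h>

static unsigned int highest_one_ref(uint64_t x) {
  unsigned int n = 0;
  while (x >>= 1)
    n++;
  return n; // floor(log2(x)) for nonzero x
}

static unsigned int popcnt_ref(uint64_t x) {
  unsigned int n = 0;
  for (; x != 0; x >>= 1)
    n += (unsigned int)(x & 1);
  return n;
}

static void check_bit_helpers(void) {
  for (uint64_t x = 1; x < (1u << 20); x++) {
    assert(highest_one(x) == highest_one_ref(x));
    assert(popcnt(x) == popcnt_ref(x));
  }
}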
+
+// Largest power of two less than or equal to x. As a special case, returns 1
+// when x is 0.
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
+  return 1ULL << highest_one(x | 1);
+}
+
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
+
+INLINE uint32_t counter_high(uint64_t counter) {
+  return (uint32_t)(counter >> 32);
+}
+
+INLINE uint32_t load32(const void *src) {
+  const uint8_t *p = (const uint8_t *)src;
+  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
+         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
+}
+
+INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+                           uint32_t key_words[8]) {
+  key_words[0] = load32(&key[0 * 4]);
+  key_words[1] = load32(&key[1 * 4]);
+  key_words[2] = load32(&key[2 * 4]);
+  key_words[3] = load32(&key[3 * 4]);
+  key_words[4] = load32(&key[4 * 4]);
+  key_words[5] = load32(&key[5 * 4]);
+  key_words[6] = load32(&key[6 * 4]);
+  key_words[7] = load32(&key[7 * 4]);
+}
+
+INLINE void store32(void *dst, uint32_t w) {
+  uint8_t *p = (uint8_t *)dst;
+  p[0] = (uint8_t)(w >> 0);
+  p[1] = (uint8_t)(w >> 8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+}
+
+INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
+  store32(&bytes_out[0 * 4], cv_words[0]);
+  store32(&bytes_out[1 * 4], cv_words[1]);
+  store32(&bytes_out[2 * 4], cv_words[2]);
+  store32(&bytes_out[3 * 4], cv_words[3]);
+  store32(&bytes_out[4 * 4], cv_words[4]);
+  store32(&bytes_out[5 * 4], cv_words[5]);
+  store32(&bytes_out[6 * 4], cv_words[6]);
+  store32(&bytes_out[7 * 4], cv_words[7]);
+}
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+LLVM_LIBRARY_VISIBILITY
+size_t blake3_simd_degree(void);
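The two compress entry points declared above are related: for identical inputs, the first 32 bytes blake3_compress_xof writes are exactly the post-compression chaining value that blake3_compress_in_place leaves in cv, since both compute state[i] ^ state[i+8] for i in 0..7 (compare the two portable implementations later in this patch). A sketch of that invariant as a check:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void check_xof_prefix(const uint32_t cv_in[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter,
                             uint8_t flags) {
  uint32_t cv[8];
  uint8_t xof_out[64], cv_bytes[32];
  memcpy(cv, cv_in, sizeof(cv));
  blake3_compress_xof(cv_in, block, block_len, counter, flags, xof_out);
  blake3_compress_in_place(cv, block, block_len, counter, flags);
  store_cv_words(cv_bytes, cv);
  // The XOF's first half is the new chaining value; the second half
  // additionally feeds the input cv back in.
  assert(memcmp(xof_out, cv_bytes, 32) == 0);
}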
+
+
+// Declarations for implementation-specific functions.
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+
+#if defined(IS_X86)
+#if !defined(BLAKE3_NO_SSE2)
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                   const uint8_t block[BLAKE3_BLOCK_LEN],
+                                   uint8_t block_len, uint64_t counter,
+                                   uint8_t flags);
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_xof_sse2(const uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags, uint8_t out[64]);
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]);
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+#if !defined(BLAKE3_NO_AVX512)
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+                                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                                     uint8_t block_len, uint64_t counter,
+                                     uint8_t flags);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+                                const uint8_t block[BLAKE3_BLOCK_LEN],
+                                uint8_t block_len, uint64_t counter,
+                                uint8_t flags, uint8_t out[64]);
+
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out);
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+LLVM_LIBRARY_VISIBILITY
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+
+
+#endif /* BLAKE3_IMPL_H */
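One property of MSG_SCHEDULE above is load-bearing for the SIMD kernels in this patch: row r+1 is row r composed with row 1, so the seven schedules are iterates of a single permutation. That is why the SSE2 compress loop below can permute its message registers with the same fixed shuffles every round instead of indexing a table. A self-check of the property:

#include <assert.h>
#include <stddef.h>

static void check_msg_schedule_is_iterated_permutation(void) {
  // MSG_SCHEDULE[0] is the identity; every later row applies row 1 once
  // more to the row before it.
  for (size_t r = 0; r + 1 < 7; r++)
    for (size_t i = 0; i < 16; i++)
      assert(MSG_SCHEDULE[r + 1][i] == MSG_SCHEDULE[r][MSG_SCHEDULE[1][i]]);
}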
"blake3_impl.h" + +#if BLAKE3_USE_NEON + +#include + +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + uint32x4_t x; + memcpy(&x, src, 16); + return x; +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it. + memcpy(dest, &src, 16); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); +} + +// TODO: compress_neon + +// TODO: hash2_neon + +/* + * ---------------------------------------------------------------------------- + * hash4_neon + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = 
+
+// TODO: compress_neon
+
+// TODO: hash2_neon
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_neon
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
+  // Individually transpose the four 2x2 sub-matrices in each corner.
+  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
+  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
+
+  // Swap the top-right and bottom-left 2x2s (which just got transposed).
+  vecs[0] =
+      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
+  vecs[1] =
+      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
+  vecs[2] =
+      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
+  vecs[3] =
+      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, uint32x4_t out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+}
+
+static
+void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  uint32x4_t h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    uint32x4_t block_flags_vec = set1_128(block_flags);
+    uint32x4_t msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    uint32x4_t v[16] = {
+        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
+}
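load_counters4 encodes the per-lane chunk counters for rows 12 and 13 of the four parallel states. The mask trick lets the same helper serve both chunks (lane i hashes counter + i) and parents (all lanes see the same counter; in practice parent nodes are hashed with counter 0). Spelled out scalar-wise, as a sketch:

#include <stdbool.h>
#include <stdint.h>

// What load_counters4 computes, one lane at a time.
static void lane_counters(uint64_t counter, bool increment_counter,
                          uint32_t low[4], uint32_t high[4]) {
  uint64_t mask = (increment_counter ? ~(uint64_t)0 : 0);
  for (uint64_t i = 0; i < 4; i++) {
    low[i] = counter_low(counter + (mask & i));   // lane i, bits 0..31
    high[i] = counter_high(counter + (mask & i)); // lane i, bits 32..63
  }
}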
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_neon
+ * ----------------------------------------------------------------------------
+ */
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    // TODO: Implement compress_neon. However note that according to
+    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
+    // compress_neon might not be any faster than compress_portable.
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 4) {
+    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
+
+#endif // BLAKE3_USE_NEON
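blake3_hash_many_neon decomposes its input set into 4-wide batches plus a scalar tail, bumping the counter by the batch size when increment_counter is set. A hedged usage sketch for six full chunks follows; hash_six_chunks_example is hypothetical, and 16 blocks of 64 bytes make one 1024-byte chunk, with CHUNK_START/CHUNK_END applied to each input's first/last block.

#include <stdint.h>
#include "blake3_impl.h"

void hash_six_chunks_example(const uint8_t *const inputs[6],
                             const uint32_t key[8], uint64_t counter,
                             uint8_t out[6 * BLAKE3_OUT_LEN]) {
  // Internally: one blake3_hash4_neon call (counters counter..counter+3)
  // followed by two hash_one_neon calls (counter+4, counter+5).
  blake3_hash_many_neon(inputs, 6, /*blocks=*/16, key, counter,
                        /*increment_counter=*/true, /*flags=*/0,
                        CHUNK_START, CHUNK_END, out);
}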
diff --git a/llvm/lib/Support/BLAKE3/blake3_portable.c b/llvm/lib/Support/BLAKE3/blake3_portable.c
new file mode 100644
index 000000000000..062dd1b47fb6
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_portable.c
@@ -0,0 +1,160 @@
+#include "blake3_impl.h"
+#include <string.h>
+
+INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
+  return (w >> c) | (w << (32 - c));
+}
+
+INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+              uint32_t x, uint32_t y) {
+  state[a] = state[a] + state[b] + x;
+  state[d] = rotr32(state[d] ^ state[a], 16);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 12);
+  state[a] = state[a] + state[b] + y;
+  state[d] = rotr32(state[d] ^ state[a], 8);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
+  // Select the message schedule based on the round.
+  const uint8_t *schedule = MSG_SCHEDULE[round];
+
+  // Mix the columns.
+  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+  // Mix the rows.
+  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  uint32_t block_words[16];
+  block_words[0] = load32(block + 4 * 0);
+  block_words[1] = load32(block + 4 * 1);
+  block_words[2] = load32(block + 4 * 2);
+  block_words[3] = load32(block + 4 * 3);
+  block_words[4] = load32(block + 4 * 4);
+  block_words[5] = load32(block + 4 * 5);
+  block_words[6] = load32(block + 4 * 6);
+  block_words[7] = load32(block + 4 * 7);
+  block_words[8] = load32(block + 4 * 8);
+  block_words[9] = load32(block + 4 * 9);
+  block_words[10] = load32(block + 4 * 10);
+  block_words[11] = load32(block + 4 * 11);
+  block_words[12] = load32(block + 4 * 12);
+  block_words[13] = load32(block + 4 * 13);
+  block_words[14] = load32(block + 4 * 14);
+  block_words[15] = load32(block + 4 * 15);
+
+  state[0] = cv[0];
+  state[1] = cv[1];
+  state[2] = cv[2];
+  state[3] = cv[3];
+  state[4] = cv[4];
+  state[5] = cv[5];
+  state[6] = cv[6];
+  state[7] = cv[7];
+  state[8] = IV[0];
+  state[9] = IV[1];
+  state[10] = IV[2];
+  state[11] = IV[3];
+  state[12] = counter_low(counter);
+  state[13] = counter_high(counter);
+  state[14] = (uint32_t)block_len;
+  state[15] = (uint32_t)flags;
+
+  round_fn(state, &block_words[0], 0);
+  round_fn(state, &block_words[0], 1);
+  round_fn(state, &block_words[0], 2);
+  round_fn(state, &block_words[0], 3);
+  round_fn(state, &block_words[0], 4);
+  round_fn(state, &block_words[0], 5);
+  round_fn(state, &block_words[0], 6);
+}
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  cv[0] = state[0] ^ state[8];
+  cv[1] = state[1] ^ state[9];
+  cv[2] = state[2] ^ state[10];
+  cv[3] = state[3] ^ state[11];
+  cv[4] = state[4] ^ state[12];
+  cv[5] = state[5] ^ state[13];
+  cv[6] = state[6] ^ state[14];
+  cv[7] = state[7] ^ state[15];
+}
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  store32(&out[0 * 4], state[0] ^ state[8]);
+  store32(&out[1 * 4], state[1] ^ state[9]);
+  store32(&out[2 * 4], state[2] ^ state[10]);
+  store32(&out[3 * 4], state[3] ^ state[11]);
+  store32(&out[4 * 4], state[4] ^ state[12]);
+  store32(&out[5 * 4], state[5] ^ state[13]);
+  store32(&out[6 * 4], state[6] ^ state[14]);
+  store32(&out[7 * 4], state[7] ^ state[15]);
+  store32(&out[8 * 4], state[8] ^ cv[0]);
+  store32(&out[9 * 4], state[9] ^ cv[1]);
+  store32(&out[10 * 4], state[10] ^ cv[2]);
+  store32(&out[11 * 4], state[11] ^ cv[3]);
+  store32(&out[12 * 4], state[12] ^ cv[4]);
+  store32(&out[13 * 4], state[13] ^ cv[5]);
+  store32(&out[14 * 4], state[14] ^ cv[6]);
+  store32(&out[15 * 4], state[15] ^ cv[7]);
+}
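blake3_compress_xof_portable is also how extended output works: at the root of the tree, output block t of the XOF stream is produced by re-running the final compression with counter t (the loop that drives this lives in the hasher code, not in this file). A sketch under that assumption:

#include <stdint.h>

// Produce the first 128 bytes of root output from the final chaining
// value and final block. ROOT must be set on these calls only.
static void root_output_example(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint8_t flags,
                                uint8_t out[128]) {
  blake3_compress_xof_portable(cv, block, block_len, /*counter=*/0,
                               flags | ROOT, &out[0]);
  blake3_compress_xof_portable(cv, block, block_len, /*counter=*/1,
                               flags | ROOT, &out[64]);
}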
+
+INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              uint8_t flags, uint8_t flags_start,
+                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  store_cv_words(out, cv);
+}
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out) {
+  while (num_inputs > 0) {
+    hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
+                      flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2.c b/llvm/lib/Support/BLAKE3/blake3_sse2.c
new file mode 100644
index 000000000000..f4449ac0b3cd
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2.c
@@ -0,0 +1,566 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const uint8_t src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, uint8_t dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
+}
+
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
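rot16 above leans on a shuffle identity instead of shifts: rotating a 32-bit lane by 16 is the same as swapping its 16-bit halves, and 0xB1 encodes the (1,0,3,2) element order for _mm_shufflelo/hi_epi16. A self-contained check of the equivalence:

#include <assert.h>
#include <emmintrin.h> // SSE2
#include <stdint.h>

static void check_rot16_shuffle_trick(uint32_t x) {
  __m128i v = _mm_set1_epi32((int32_t)x);
  __m128i by_shuffle = _mm_shufflehi_epi16(_mm_shufflelo_epi16(v, 0xB1), 0xB1);
  __m128i by_shift =
      _mm_xor_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 32 - 16));
  // All 16 byte lanes must compare equal.
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(by_shuffle, by_shift)) == 0xFFFF);
}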
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
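+  // For reference (an annotation, not extra logic in this routine): the
+  // fixed permutation is
+  //   PERM = {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+  // i.e. the word in slot i of one round is the word that was in slot
+  // PERM[i] of the round before. Applying PERM repeatedly to the identity
+  // yields the MSG_SCHEDULE table indexed by the scalar round functions.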
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + 
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = 
addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
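+  // For example, with vecs[0] = {a0,a1,a2,a3} and vecs[1] = {b0,b1,b2,b3},
+  // ab_01 = {a0,b0,a1,b1} and ab_23 = {a2,b2,a3,b3}. The 64-bit unpacks
+  // below then produce abcd_i = {ai,bi,ci,di}, completing the 4x4 transpose
+  // of 32-bit words.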
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, 
msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start,
+                          uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                  block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
new file mode 100644
index 000000000000..0106b13ba851
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
@@ -0,0 +1,2307 @@
+#if defined(__x86_64__)
+
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+#ifdef __APPLE__
+#define HIDDEN .private_extern
+#else
+#define HIDDEN .hidden
+#endif
+
+.intel_syntax noprefix
+HIDDEN blake3_hash_many_sse2
+HIDDEN _blake3_hash_many_sse2
+HIDDEN blake3_compress_in_place_sse2
+HIDDEN _blake3_compress_in_place_sse2
+HIDDEN blake3_compress_xof_sse2
+HIDDEN _blake3_compress_xof_sse2
+.global blake3_hash_many_sse2
+.global _blake3_hash_many_sse2
+.global blake3_compress_in_place_sse2
+.global _blake3_compress_in_place_sse2
+.global blake3_compress_xof_sse2
+.global _blake3_compress_xof_sse2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+ .p2align 6
+_blake3_hash_many_sse2:
+blake3_hash_many_sse2:
+        _CET_ENDBR
+        push r15
+        push r14
+        push r13
+        push r12
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 360
+        and rsp,
0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr 
[rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 
+ movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + 
paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld 
xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + 
pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 
+ psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 
+ movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, 
xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + 
pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, 
xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, 
xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 
+ shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + 
movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef 
__APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF + +#endif diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S new file mode 100644 index 000000000000..8852ba5976e1 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S @@ -0,0 +1,2332 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +.section .text + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr 
[r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + 
psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, 
xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor 
xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor 
xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + 
psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + 
movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 
+ paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + 
pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd 
xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword 
ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, 
xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld 
xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + 
punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse2: +blake3_compress_xof_sse2: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa 
xmm8, xmmword ptr [rsp+0x20]
+        movdqa xmm9, xmmword ptr [rsp+0x30]
+        movdqa xmm11, xmmword ptr [rsp+0x40]
+        movdqa xmm14, xmmword ptr [rsp+0x50]
+        movdqa xmm15, xmmword ptr [rsp+0x60]
+        add rsp, 120
+        ret
+
+
+.section .rodata
+.p2align 6
+BLAKE3_IV:
+        .long 0x6A09E667, 0xBB67AE85
+        .long 0x3C6EF372, 0xA54FF53A
+ADD0:
+        .long 0, 1, 2, 3
+ADD1:
+        .long 4, 4, 4, 4
+BLAKE3_IV_0:
+        .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+        .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+        .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+        .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+        .long 64, 64, 64, 64
+CMP_MSB_MASK:
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+# SSE2 has no pblendw; pand/por with the masks below emulate the SSE4.1 blends.
+PBLENDW_0x33_MASK:
+        .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK:
+        .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+        .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK:
+        .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_msvc.asm b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_msvc.asm
new file mode 100644
index 000000000000..507502f11a80
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_msvc.asm
@@ -0,0 +1,2350 @@
+public _blake3_hash_many_sse2
+public blake3_hash_many_sse2
+public blake3_compress_in_place_sse2
+public _blake3_compress_in_place_sse2
+public blake3_compress_xof_sse2
+public _blake3_compress_xof_sse2
+
+_TEXT SEGMENT ALIGN(16) 'CODE'
+
+ALIGN 16
+blake3_hash_many_sse2 PROC
+_blake3_hash_many_sse2 PROC
+        push r15
+        push r14
+        push r13
+        push r12
+        push rsi
+        push rdi
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 528
+        and rsp, 0FFFFFFFFFFFFFFC0H
+        movdqa xmmword ptr [rsp+170H], xmm6
+        movdqa xmmword ptr [rsp+180H], xmm7
+        movdqa xmmword ptr [rsp+190H], xmm8
+        movdqa xmmword ptr [rsp+1A0H], xmm9
+        movdqa xmmword ptr [rsp+1B0H], xmm10
+        movdqa xmmword ptr [rsp+1C0H], xmm11
+        movdqa xmmword ptr [rsp+1D0H], xmm12
+        movdqa xmmword ptr [rsp+1E0H], xmm13
+        movdqa xmmword ptr [rsp+1F0H], xmm14
+        movdqa xmmword ptr [rsp+200H], xmm15
+        mov rdi, rcx
+        mov rsi, rdx
+        mov rdx, r8
+        mov rcx, r9
+        mov r8, qword ptr [rbp+68H]
+        movzx r9, byte ptr [rbp+70H]
+        neg r9d
+        movd xmm0, r9d
+        pshufd xmm0, xmm0, 00H
+        movdqa xmmword ptr [rsp+130H], xmm0
+        movdqa xmm1, xmm0
+        pand xmm1, xmmword ptr [ADD0]
+        pand xmm0, xmmword ptr [ADD1]
+        movdqa xmmword ptr [rsp+150H], xmm0
+        movd xmm0, r8d
+        pshufd xmm0, xmm0, 00H
+        paddd xmm0, xmm1
+        movdqa xmmword ptr [rsp+110H], xmm0
+        pxor xmm0, xmmword ptr [CMP_MSB_MASK]
+        pxor xmm1, xmmword ptr [CMP_MSB_MASK]
+        pcmpgtd xmm1, xmm0
+        shr r8, 32
+        movd xmm2, r8d
+        pshufd xmm2, xmm2, 00H
+        psubd xmm2, xmm1
+        movdqa xmmword ptr [rsp+120H], xmm2
+        mov rbx, qword ptr [rbp+90H]
+        mov r15, rdx
+        shl r15, 6
+        movzx r13d, byte ptr [rbp+78H]
+        movzx r12d, byte ptr [rbp+88H]
+        cmp rsi, 4
+        jc final3blocks
+outerloop4:
+        movdqu xmm3, xmmword ptr [rcx]
+        pshufd xmm0, xmm3, 00H
+        pshufd xmm1, xmm3, 55H
+        pshufd xmm2, xmm3, 0AAH
+        pshufd xmm3, xmm3, 0FFH
+        movdqu xmm7, xmmword ptr [rcx+10H]
+        pshufd xmm4, xmm7, 00H
+        pshufd xmm5, xmm7, 55H
+        pshufd xmm6, xmm7, 0AAH
+        pshufd xmm7, xmm7, 0FFH
+        mov r8, qword ptr [rdi]
+        mov r9, qword ptr [rdi+8H]
+        mov r10, qword ptr [rdi+10H]
+        mov r11, qword ptr [rdi+18H]
+        movzx eax, byte ptr [rbp+80H]
+        or eax, r13d
+        xor edx, edx
+innerloop4:
+        mov r14d, eax
+        or eax, r12d
+        add rdx, 64
+        cmp rdx, r15
+        cmovne eax, r14d
+        movdqu xmm8,
xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword 
ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por 
xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 
24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, 
xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + 
pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 
+ pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr 
[rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + 
psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd 
xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups 
xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 
250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm12, xmm13 + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + mov eax, dword ptr [rsp+130H] + neg eax + mov r10d, dword ptr [rsp+110H+8*rax] + mov r11d, dword ptr [rsp+120H+8*rax] + mov dword ptr [rsp+110H], r10d + mov dword ptr [rsp+120H], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor 
xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm10 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse2 ENDP +blake3_hash_many_sse2 ENDP + +blake3_compress_in_place_sse2 PROC +_blake3_compress_in_place_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, 
xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse2 ENDP +blake3_compress_in_place_sse2 ENDP + +ALIGN 16 +blake3_compress_xof_sse2 PROC +_blake3_compress_xof_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, 
xmmword ptr [rsp+10H]
+        movdqa xmm8, xmmword ptr [rsp+20H]
+        movdqa xmm9, xmmword ptr [rsp+30H]
+        movdqa xmm11, xmmword ptr [rsp+40H]
+        movdqa xmm14, xmmword ptr [rsp+50H]
+        movdqa xmm15, xmmword ptr [rsp+60H]
+        add rsp, 120
+        ret
+_blake3_compress_xof_sse2 ENDP
+blake3_compress_xof_sse2 ENDP
+
+_TEXT ENDS
+
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
+ALIGN 64
+BLAKE3_IV:
+        dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
+
+ADD0:
+        dd 0, 1, 2, 3
+
+ADD1:
+        dd 4 dup (4)
+
+BLAKE3_IV_0:
+        dd 4 dup (6A09E667H)
+
+BLAKE3_IV_1:
+        dd 4 dup (0BB67AE85H)
+
+BLAKE3_IV_2:
+        dd 4 dup (3C6EF372H)
+
+BLAKE3_IV_3:
+        dd 4 dup (0A54FF53AH)
+
+BLAKE3_BLOCK_LEN:
+        dd 4 dup (64)
+
+CMP_MSB_MASK:
+        dd 8 dup(80000000H)
+
+; SSE2 has no pblendw; pand/por with the masks below emulate the SSE4.1 blends.
+PBLENDW_0x33_MASK:
+        dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
+PBLENDW_0xCC_MASK:
+        dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
+PBLENDW_0x3F_MASK:
+        dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
+PBLENDW_0xC0_MASK:
+        dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
+
+_RDATA ENDS
+END
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41.c b/llvm/lib/Support/BLAKE3/blake3_sse41.c
new file mode 100644
index 000000000000..87a8dae15ce9
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41.c
@@ -0,0 +1,560 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const uint8_t src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, uint8_t dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot16(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot12(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot8(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot7(*row1);
+}
+
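+// For orientation: g1 and g2 together form one BLAKE3 G function, applied to
+// four columns at once (each lane of an __m128i holds one column's word). An
+// equivalent scalar sketch -- illustrative only, with hypothetical helper
+// names rotr32/g_scalar that do not appear in this file:
+//
+//   static inline uint32_t rotr32(uint32_t w, uint32_t c) {
+//     return (w >> c) | (w << (32 - c));
+//   }
+//   static inline void g_scalar(uint32_t *a, uint32_t *b, uint32_t *c,
+//                               uint32_t *d, uint32_t mx, uint32_t my) {
+//     *a += *b + mx; *d = rotr32(*d ^ *a, 16); // g1
+//     *c += *d;      *b = rotr32(*b ^ *c, 12); // g1
+//     *a += *b + my; *d = rotr32(*d ^ *a, 8);  // g2
+//     *c += *d;      *b = rotr32(*b ^ *c, 7);  // g2
+//   }
+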
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
+
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  rows[0] = loadu((uint8_t *)&cv[0]);
+  rows[1] = loadu((uint8_t *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
+
+  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
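+  // (For reference: that fixed permutation is the BLAKE3 message schedule
+  // 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, realized below
+  // with shuffles and blends on m0..m3 instead of a lookup table.)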
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
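+// compress_pre leaves the full 16-word state in rows[0..3]. The two entry
+// points below differ only in the final feed-forward: the in-place variant
+// folds rows 2 and 3 back into the chaining value, while the xof variant
+// also xors the original cv into rows 2 and 3 to emit 64 output bytes.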
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
+  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
+}
+
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), &out[0]);
+  storeu(xorv(rows[1], rows[3]), &out[16]);
+  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
+  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
+}
+
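+// A minimal usage sketch (an added example, not from the upstream file; the
+// flag and counter arguments here are placeholders, real values come from
+// the callers elsewhere in the BLAKE3 sources):
+//
+//   uint32_t cv[8];          // chaining value, already initialized
+//   uint8_t block[64] = {0}; // one padded 64-byte input block
+//   uint8_t out[64];
+//   blake3_compress_xof_sse41(cv, block, 64, 0, 0, out);
+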
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
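+// round_fn is the 4-way transposed form of the same round function: each of
+// v[0..15] holds one state word from four independent hashes, so every
+// operation above advances all four inputs at once.
+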
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
+
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  for (size_t i = 0; i < 4; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[4]);
+  transpose_vecs(&out[8]);
+  transpose_vecs(&out[12]);
+}
+
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m128i *out_lo, __m128i *out_hi) {
+  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
+  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+  const __m128i add1 = _mm_and_si128(mask, add0);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
+  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
+                                  _mm_xor_si128(l, _mm_set1_epi32(0x80000000)));
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
+  *out_lo = l;
+  *out_hi = h;
+}
+
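+// Per-lane scalar equivalent (an added sketch for reference, not from the
+// upstream file): for lane i in 0..3, with increment_counter enabled,
+//
+//   uint32_t lo = (uint32_t)counter + i;
+//   uint32_t hi = (uint32_t)(counter >> 32) + (lo < i); // carry on wrap
+//
+// The xor with 0x80000000 above turns that unsigned carry test into a
+// signed _mm_cmpgt_epi32.
+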
+static
+void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
+                        const uint32_t key[8], uint64_t counter,
+                        bool increment_counter, uint8_t flags,
+                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  __m128i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
+                           const uint32_t key[8], uint64_t counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                   block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
+                       flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
+                   flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
new file mode 100644
index 000000000000..4e918c5bb2cc
--- /dev/null
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
@@ -0,0 +1,2044 @@
+#if defined(__x86_64__)
+
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+#ifdef __APPLE__
+#define HIDDEN .private_extern
+#else
+#define HIDDEN .hidden
+#endif
+
+.intel_syntax noprefix
+HIDDEN blake3_hash_many_sse41
+HIDDEN _blake3_hash_many_sse41
+HIDDEN blake3_compress_in_place_sse41
+HIDDEN _blake3_compress_in_place_sse41
+HIDDEN blake3_compress_xof_sse41
+HIDDEN _blake3_compress_xof_sse41
+.global blake3_hash_many_sse41
+.global _blake3_hash_many_sse41
+.global blake3_compress_in_place_sse41
+.global _blake3_compress_in_place_sse41
+.global blake3_compress_xof_sse41
+.global _blake3_compress_xof_sse41
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+ .p2align 6
+_blake3_hash_many_sse41:
+blake3_hash_many_sse41:
+        _CET_ENDBR
+        push r15
+        push r14
+        push r13
+        push r12
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 360
+        and
rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr 
[rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd 
xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 
20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por 
xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, 
xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr 
[ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword 
ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa 
xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps 
xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 
0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd 
xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse41: +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd 
xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + +#endif diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S new file mode 100644 index 000000000000..60d0a4042e71 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S @@ -0,0 +1,2069 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +.section .text + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr 
[rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + 
movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd 
xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld 
xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + 
pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr 
[rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + 
movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr 
[rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld 
xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa 
xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd 
xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd 
xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + 
movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse41: +blake3_compress_xof_sse41: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.section .rodata +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 
0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_msvc.asm b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_msvc.asm new file mode 100644 index 000000000000..8966c7b84406 --- /dev/null +++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_msvc.asm @@ -0,0 +1,2089 @@ +public _blake3_hash_many_sse41 +public blake3_hash_many_sse41 +public blake3_compress_in_place_sse41 +public _blake3_compress_in_place_sse41 +public blake3_compress_xof_sse41 +public _blake3_compress_xof_sse41 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse41 PROC +_blake3_hash_many_sse41 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, 
xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd 
xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, 
xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] 
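+ ; Note on the quarter-round pattern repeated throughout this loop: each
+ ; step adds a message word and the paired row, XORs the opposite row,
+ ; then rotates. SSE4.1 has no 32-bit vector rotate, so rotr 12 and
+ ; rotr 7 are synthesized from a shift pair plus OR, e.g.
+ ;   movdqa xmm8, xmm4
+ ;   psrld  xmm8, 12
+ ;   pslld  xmm4, 20
+ ;   por    xmm4, xmm8        ; xmm4 = rotr32(xmm4, 12)
+ ; The byte-aligned rotr 16 and rotr 8 instead use a single pshufb each,
+ ; with the ROT16/ROT8 shuffle masks defined in the read-only data
+ ; section at the end of this file.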
+ paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 
+ por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 
12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + 
pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, 
xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq 
xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + pinsrd xmm14, dword ptr [rsp+124H], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16] + pshufb xmm3, xmm12 + 
pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0CCH + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0C0H + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0CCH + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0C0H + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + movdqa xmm0, xmmword ptr [rsp+130H] + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm2, xmmword ptr [rsp+120H] + movdqu xmm3, xmmword ptr [rsp+118H] + movdqu xmm4, xmmword ptr [rsp+128H] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+110H], xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups 
xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse41 ENDP +blake3_hash_many_sse41 ENDP + +blake3_compress_in_place_sse41 PROC +_blake3_compress_in_place_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + 
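+ ; The pshufd shuffles with 93H, 4EH and 39H above rotate the state rows
+ ; by one, two and three lanes to line up the diagonals for the second
+ ; half of the round; the inverse shuffles (39H/4EH/93H) below restore
+ ; the column layout, so the 4x4 state never has to leave the registers.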
paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse41 ENDP +blake3_compress_in_place_sse41 ENDP + +ALIGN 16 +blake3_compress_xof_sse41 PROC +_blake3_compress_xof_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld 
xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse41 ENDP +blake3_compress_xof_sse41 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +_RDATA ENDS +END + diff --git a/llvm/lib/Support/BinaryStreamWriter.cpp b/llvm/lib/Support/BinaryStreamWriter.cpp index 8c9efa0ed9a9..dc4ea200c7be 100644 --- a/llvm/lib/Support/BinaryStreamWriter.cpp +++ b/llvm/lib/Support/BinaryStreamWriter.cpp @@ -8,7 +8,6 @@ #include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/BinaryStreamError.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/LEB128.h" @@ -94,10 +93,11 @@ BinaryStreamWriter::split(uint64_t Off) const { Error BinaryStreamWriter::padToAlignment(uint32_t Align) { uint64_t NewOffset = alignTo(Offset, Align); - if (NewOffset > getLength()) - return make_error(stream_error_code::stream_too_short); + const uint64_t ZerosSize = 64; + static constexpr char Zeros[ZerosSize] = {}; while (Offset < NewOffset) - if (auto EC = writeInteger('\0')) - return EC; + if (auto E = writeArray( + ArrayRef(Zeros, std::min(ZerosSize, NewOffset - Offset)))) + return E; return Error::success(); } diff --git a/llvm/lib/Support/CSKYAttributeParser.cpp b/llvm/lib/Support/CSKYAttributeParser.cpp new file mode 100644 index 000000000000..ea1ac9232315 --- /dev/null +++ b/llvm/lib/Support/CSKYAttributeParser.cpp @@ -0,0 +1,155 @@ +//===-- CSKYAttributeParser.cpp - CSKY Attribute Parser -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CSKYAttributeParser.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Errc.h" + +using namespace llvm; + +const CSKYAttributeParser::DisplayHandler + CSKYAttributeParser::displayRoutines[] = { + { + CSKYAttrs::CSKY_ARCH_NAME, + &ELFAttributeParser::stringAttribute, + }, + { + CSKYAttrs::CSKY_CPU_NAME, + &ELFAttributeParser::stringAttribute, + }, + { + CSKYAttrs::CSKY_ISA_FLAGS, + &ELFAttributeParser::integerAttribute, + }, + { + CSKYAttrs::CSKY_ISA_EXT_FLAGS, + &ELFAttributeParser::integerAttribute, + }, + { + CSKYAttrs::CSKY_DSP_VERSION, + &CSKYAttributeParser::dspVersion, + }, + { + CSKYAttrs::CSKY_VDSP_VERSION, + &CSKYAttributeParser::vdspVersion, + }, + { + CSKYAttrs::CSKY_FPU_VERSION, + &CSKYAttributeParser::fpuVersion, + }, + { + CSKYAttrs::CSKY_FPU_ABI, + &CSKYAttributeParser::fpuABI, + }, + { + CSKYAttrs::CSKY_FPU_ROUNDING, + &CSKYAttributeParser::fpuRounding, + }, + { + CSKYAttrs::CSKY_FPU_DENORMAL, + &CSKYAttributeParser::fpuDenormal, + }, + { + CSKYAttrs::CSKY_FPU_EXCEPTION, + &CSKYAttributeParser::fpuException, + }, + { + CSKYAttrs::CSKY_FPU_NUMBER_MODULE, + &ELFAttributeParser::stringAttribute, + }, + { + CSKYAttrs::CSKY_FPU_HARDFP, + &CSKYAttributeParser::fpuHardFP, + }}; + +Error CSKYAttributeParser::handler(uint64_t tag, bool &handled) { + handled = false; + for (unsigned AHI = 0, AHE = array_lengthof(displayRoutines); AHI != AHE; + ++AHI) { + if (uint64_t(displayRoutines[AHI].attribute) == tag) { + if (Error e = (this->*displayRoutines[AHI].routine)(tag)) + return e; + handled = true; + break; + } + } + + return Error::success(); +} + +Error CSKYAttributeParser::dspVersion(unsigned tag) { + static const char *strings[] = {"Error", "DSP Extension", "DSP 2.0"}; + return parseStringAttribute("Tag_CSKY_DSP_VERSION", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::vdspVersion(unsigned tag) { + static const char *strings[] = {"Error", "VDSP Version 1", "VDSP Version 2"}; + return parseStringAttribute("Tag_CSKY_VDSP_VERSION", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuVersion(unsigned tag) { + static const char *strings[] = {"Error", "FPU Version 1", "FPU Version 2", + "FPU Version 3"}; + return parseStringAttribute("Tag_CSKY_FPU_VERSION", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuABI(unsigned tag) { + static const char *strings[] = {"Error", "Soft", "SoftFP", "Hard"}; + return parseStringAttribute("Tag_CSKY_FPU_ABI", tag, makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuRounding(unsigned tag) { + static const char *strings[] = {"None", "Needed"}; + return parseStringAttribute("Tag_CSKY_FPU_ROUNDING", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuDenormal(unsigned tag) { + static const char *strings[] = {"None", "Needed"}; + return parseStringAttribute("Tag_CSKY_FPU_DENORMAL", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuException(unsigned tag) { + static const char *strings[] = {"None", "Needed"}; + return parseStringAttribute("Tag_CSKY_FPU_EXCEPTION", tag, + makeArrayRef(strings)); +} + +Error CSKYAttributeParser::fpuHardFP(unsigned tag) { + uint64_t value = de.getULEB128(cursor); + ListSeparator LS(" "); + + std::string description; + + if (value & 0x1) { + description += LS; + description += "Half"; + } + if ((value >> 1) & 0x1) { + description += LS; + description += "Single"; 
+ } + if ((value >> 2) & 0x1) { + description += LS; + description += "Double"; + } + + if (description.empty()) { + printAttribute(tag, value, ""); + return createStringError(errc::invalid_argument, + "unknown Tag_CSKY_FPU_HARDFP value: " + + Twine(value)); + } + + printAttribute(tag, value, description); + return Error::success(); +} diff --git a/llvm/lib/Support/CSKYAttributes.cpp b/llvm/lib/Support/CSKYAttributes.cpp new file mode 100644 index 000000000000..6130517e44e3 --- /dev/null +++ b/llvm/lib/Support/CSKYAttributes.cpp @@ -0,0 +1,32 @@ +//===-- CSKYAttributes.cpp - CSKY Attributes ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CSKYAttributes.h" + +using namespace llvm; +using namespace llvm::CSKYAttrs; + +static const TagNameItem tagData[] = { + {CSKY_ARCH_NAME, "Tag_CSKY_ARCH_NAME"}, + {CSKY_CPU_NAME, "Tag_CSKY_CPU_NAME"}, + {CSKY_ISA_FLAGS, "Tag_CSKY_ISA_FLAGS"}, + {CSKY_ISA_EXT_FLAGS, "Tag_CSKY_ISA_EXT_FLAGS"}, + {CSKY_DSP_VERSION, "Tag_CSKY_DSP_VERSION"}, + {CSKY_VDSP_VERSION, "Tag_CSKY_VDSP_VERSION"}, + {CSKY_FPU_VERSION, "Tag_CSKY_FPU_VERSION"}, + {CSKY_FPU_ABI, "Tag_CSKY_FPU_ABI"}, + {CSKY_FPU_ROUNDING, "Tag_CSKY_FPU_ROUNDING"}, + {CSKY_FPU_DENORMAL, "Tag_CSKY_FPU_DENORMAL"}, + {CSKY_FPU_EXCEPTION, "Tag_CSKY_FPU_EXCEPTION"}, + {CSKY_FPU_NUMBER_MODULE, "Tag_CSKY_FPU_NUMBER_MODULE"}, + {CSKY_FPU_HARDFP, "Tag_CSKY_FPU_HARDFP"}}; + +constexpr TagNameMap CSKYAttributeTags{tagData}; +const TagNameMap &llvm::CSKYAttrs::getCSKYAttributeTags() { + return CSKYAttributeTags; +} diff --git a/llvm/lib/Support/CSKYTargetParser.cpp b/llvm/lib/Support/CSKYTargetParser.cpp new file mode 100644 index 000000000000..7e9d2ca0428d --- /dev/null +++ b/llvm/lib/Support/CSKYTargetParser.cpp @@ -0,0 +1,181 @@ +//===-- TargetParser - Parser for target features ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise CSKY hardware features +// such as CPU/ARCH names.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CSKYTargetParser.h" +#include "llvm/ADT/StringSwitch.h" + +using namespace llvm; + +bool CSKY::getFPUFeatures(CSKYFPUKind CSKYFPUKind, + std::vector &Features) { + + if (CSKYFPUKind >= FK_LAST || CSKYFPUKind == FK_INVALID) + return false; + + switch (CSKYFPUKind) { + case FK_AUTO: + Features.push_back("+fpuv2_sf"); + Features.push_back("+fpuv2_df"); + Features.push_back("+fdivdu"); + break; + case FK_FPV2: + Features.push_back("+fpuv2_sf"); + Features.push_back("+fpuv2_df"); + break; + case FK_FPV2_DIVD: + Features.push_back("+fpuv2_sf"); + Features.push_back("+fpuv2_df"); + Features.push_back("+fdivdu"); + break; + case FK_FPV2_SF: + Features.push_back("+fpuv2_sf"); + break; + case FK_FPV3: + Features.push_back("+fpuv3_hf"); + Features.push_back("+fpuv3_hi"); + Features.push_back("+fpuv3_sf"); + Features.push_back("+fpuv3_df"); + break; + case FK_FPV3_HF: + Features.push_back("+fpuv3_hf"); + Features.push_back("+fpuv3_hi"); + break; + case FK_FPV3_HSF: + Features.push_back("+fpuv3_hf"); + Features.push_back("+fpuv3_hi"); + Features.push_back("+fpuv3_sf"); + break; + case FK_FPV3_SDF: + Features.push_back("+fpuv3_sf"); + Features.push_back("+fpuv3_df"); + break; + default: + llvm_unreachable("Unknown FPU Kind"); + return false; + } + + return true; +} + +// ======================================================= // +// Information by ID +// ======================================================= // + +StringRef CSKY::getArchName(ArchKind AK) { + return ARCHNames[static_cast(AK)].getName(); +} + +// The default cpu's name is same as arch name. +StringRef CSKY::getDefaultCPU(StringRef Arch) { + ArchKind AK = parseArch(Arch); + if (AK == CSKY::ArchKind::INVALID) + return StringRef(); + + return Arch; +} + +// ======================================================= // +// Parsers +// ======================================================= // +CSKY::ArchKind CSKY::parseArch(StringRef Arch) { + for (const auto A : ARCHNames) { + if (A.getName() == Arch) + return A.ID; + } + + return CSKY::ArchKind::INVALID; +} + +CSKY::ArchKind CSKY::parseCPUArch(StringRef CPU) { + for (const auto C : CPUNames) { + if (CPU == C.getName()) + return C.ArchID; + } + + return CSKY::ArchKind::INVALID; +} + +uint64_t CSKY::parseArchExt(StringRef ArchExt) { + for (const auto &A : CSKYARCHExtNames) { + if (ArchExt == A.getName()) + return A.ID; + } + return AEK_INVALID; +} + +void CSKY::fillValidCPUArchList(SmallVectorImpl &Values) { + for (const CpuNames &Arch : CPUNames) { + if (Arch.ArchID != CSKY::ArchKind::INVALID) + Values.push_back(Arch.getName()); + } +} + +StringRef CSKY::getFPUName(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return StringRef(); + return FPUNames[FPUKind].getName(); +} + +CSKY::FPUVersion CSKY::getFPUVersion(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return FPUVersion::NONE; + return FPUNames[FPUKind].FPUVer; +} + +uint64_t CSKY::getDefaultExtensions(StringRef CPU) { + return StringSwitch(CPU) +#define CSKY_CPU_NAME(NAME, ID, DEFAULT_EXT) \ + .Case(NAME, ARCHNames[static_cast(ArchKind::ID)].archBaseExt | \ + DEFAULT_EXT) +#include "llvm/Support/CSKYTargetParser.def" + .Default(CSKY::AEK_INVALID); +} + +StringRef CSKY::getArchExtName(uint64_t ArchExtKind) { + for (const auto &AE : CSKYARCHExtNames) + if (ArchExtKind == AE.ID) + return AE.getName(); + return StringRef(); +} + +static bool stripNegationPrefix(StringRef &Name) { + if (Name.startswith("no")) { + Name = 
Name.substr(2); + return true; + } + return false; +} + +StringRef CSKY::getArchExtFeature(StringRef ArchExt) { + bool Negated = stripNegationPrefix(ArchExt); + for (const auto &AE : CSKYARCHExtNames) { + if (AE.Feature && ArchExt == AE.getName()) + return StringRef(Negated ? AE.NegFeature : AE.Feature); + } + + return StringRef(); +} + +bool CSKY::getExtensionFeatures(uint64_t Extensions, + std::vector &Features) { + if (Extensions == CSKY::AEK_INVALID) + return false; + + for (const auto &AE : CSKYARCHExtNames) { + if ((Extensions & AE.ID) == AE.ID && AE.Feature) + Features.push_back(AE.Feature); + } + + return true; +} diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp index 73e0fb3edce8..d5ab77b9c66f 100644 --- a/llvm/lib/Support/CodeGenCoverage.cpp +++ b/llvm/lib/Support/CodeGenCoverage.cpp @@ -23,7 +23,7 @@ using namespace llvm; static sys::SmartMutex OutputMutex; -CodeGenCoverage::CodeGenCoverage() {} +CodeGenCoverage::CodeGenCoverage() = default; void CodeGenCoverage::setCovered(uint64_t RuleID) { if (RuleCoverage.size() <= RuleID) diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 71a6ebf2a72e..eb6c04d987b3 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -166,7 +166,7 @@ public: // This collects the different subcommands that have been registered. SmallPtrSet RegisteredSubCommands; - CommandLineParser() : ActiveSubCommand(nullptr) { + CommandLineParser() { registerSubCommand(&*TopLevelSubCommand); registerSubCommand(&*AllSubCommands); } @@ -418,7 +418,7 @@ public: } private: - SubCommand *ActiveSubCommand; + SubCommand *ActiveSubCommand = nullptr; Option *LookupOption(SubCommand &Sub, StringRef &Arg, StringRef &Value); Option *LookupLongOption(SubCommand &Sub, StringRef &Arg, StringRef &Value, @@ -918,21 +918,34 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) { return I - 1; } -// Windows treats whitespace, double quotes, and backslashes specially. +// Windows treats whitespace, double quotes, and backslashes specially, except +// when parsing the first token of a full command line, in which case +// backslashes are not special. static bool isWindowsSpecialChar(char C) { return isWhitespaceOrNull(C) || C == '\\' || C == '\"'; } +static bool isWindowsSpecialCharInCommandName(char C) { + return isWhitespaceOrNull(C) || C == '\"'; +} // Windows tokenization implementation. The implementation is designed to be // inlined and specialized for the two user entry points. -static inline void -tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, - function_ref AddToken, - bool AlwaysCopy, function_ref MarkEOL) { +static inline void tokenizeWindowsCommandLineImpl( + StringRef Src, StringSaver &Saver, function_ref AddToken, + bool AlwaysCopy, function_ref MarkEOL, bool InitialCommandName) { SmallString<128> Token; + // Sometimes, this function will be handling a full command line including an + // executable pathname at the start. In that situation, the initial pathname + // needs different handling from the following arguments, because when + // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as + // escaping the quote character, whereas when libc scans the rest of the + // command line, it does. + bool CommandName = InitialCommandName; + // Try to do as much work inside the state machine as possible. 
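+ // Worked example: in the full command line
+ //   C:\dir\"sub dir"\app.exe arg\"1
+ // the command-name scan treats backslashes as ordinary characters and
+ // quotes only toggle quoting, so the first token is
+ //   C:\dir\sub dir\app.exe
+ // while the remaining arguments follow the libc rules, where \" escapes
+ // the quote, making the second token arg"1.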
enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I < E; ++I) { switch (State) { case INIT: { @@ -947,19 +960,29 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, if (I >= E) break; size_t Start = I; - while (I < E && !isWindowsSpecialChar(Src[I])) - ++I; + if (CommandName) { + while (I < E && !isWindowsSpecialCharInCommandName(Src[I])) + ++I; + } else { + while (I < E && !isWindowsSpecialChar(Src[I])) + ++I; + } StringRef NormalChars = Src.slice(Start, I); if (I >= E || isWhitespaceOrNull(Src[I])) { // No special characters: slice out the substring and start the next // token. Copy the string if the caller asks us to. AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars); - if (I < E && Src[I] == '\n') + if (I < E && Src[I] == '\n') { MarkEOL(); + CommandName = InitialCommandName; + } else { + CommandName = false; + } } else if (Src[I] == '\"') { Token += NormalChars; State = QUOTED; } else if (Src[I] == '\\') { + assert(!CommandName && "or else we'd have treated it as a normal char"); Token += NormalChars; I = parseBackslash(Src, I, Token); State = UNQUOTED; @@ -976,12 +999,16 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // token. AddToken(Saver.save(Token.str())); Token.clear(); - if (Src[I] == '\n') + if (Src[I] == '\n') { + CommandName = InitialCommandName; MarkEOL(); + } else { + CommandName = false; + } State = INIT; } else if (Src[I] == '\"') { State = QUOTED; - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -999,7 +1026,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // Otherwise, end the quoted portion and return to the unquoted state. 
State = UNQUOTED; } - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -1008,7 +1035,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, } } - if (State == UNQUOTED) + if (State != INIT) AddToken(Saver.save(Token.str())); } @@ -1021,7 +1048,7 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(nullptr); }; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, - /*AlwaysCopy=*/true, OnEOL); + /*AlwaysCopy=*/true, OnEOL, false); } void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, @@ -1029,7 +1056,19 @@ void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); }; auto OnEOL = []() {}; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false, - OnEOL); + OnEOL, false); +} + +void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv, + bool MarkEOLs) { + auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); }; + auto OnEOL = [&]() { + if (MarkEOLs) + NewArgv.push_back(nullptr); + }; + tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, + /*AlwaysCopy=*/true, OnEOL, true); } void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver, @@ -1737,21 +1776,6 @@ bool Option::addOccurrence(unsigned pos, StringRef ArgName, StringRef Value, if (!MultiArg) NumOccurrences++; // Increment the number of times we have been seen - switch (getNumOccurrencesFlag()) { - case Optional: - if (NumOccurrences > 1) - return error("may only occur zero or one times!", ArgName); - break; - case Required: - if (NumOccurrences > 1) - return error("must occur exactly one time!", ArgName); - LLVM_FALLTHROUGH; - case OneOrMore: - case ZeroOrMore: - case ConsumeAfter: - break; - } - return handleOccurrence(pos, ArgName, Value); } @@ -2236,7 +2260,7 @@ protected: public: explicit HelpPrinter(bool showHidden) : ShowHidden(showHidden) {} - virtual ~HelpPrinter() {} + virtual ~HelpPrinter() = default; // Invoke the printer. void operator=(bool Value) { @@ -2444,11 +2468,7 @@ public: #else OS << "LLVM (http://llvm.org/):\n "; #endif - OS << PACKAGE_NAME << " version " << PACKAGE_VERSION; -#ifdef LLVM_VERSION_INFO - OS << " " << LLVM_VERSION_INFO; -#endif - OS << "\n "; + OS << PACKAGE_NAME << " version " << PACKAGE_VERSION << "\n "; #if LLVM_IS_DEBUG_BUILD OS << "DEBUG build"; #else diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp index ccf6ef4bb662..983a6348bbe4 100644 --- a/llvm/lib/Support/Compression.cpp +++ b/llvm/lib/Support/Compression.cpp @@ -46,18 +46,20 @@ static StringRef convertZlibCodeToString(int Code) { bool zlib::isAvailable() { return true; } -Error zlib::compress(StringRef InputBuffer, - SmallVectorImpl<char> &CompressedBuffer, int Level) { +void zlib::compress(StringRef InputBuffer, + SmallVectorImpl<char> &CompressedBuffer, int Level) { unsigned long CompressedSize = ::compressBound(InputBuffer.size()); CompressedBuffer.resize_for_overwrite(CompressedSize); int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize, (const Bytef *)InputBuffer.data(), InputBuffer.size(), Level); + if (Res == Z_MEM_ERROR) + report_bad_alloc_error("Allocation failed"); + assert(Res == Z_OK); // Tell MemorySanitizer that zlib output buffer is fully initialized. // This avoids a false report when running LLVM with uninstrumented ZLib.
__msan_unpoison(CompressedBuffer.data(), CompressedSize); CompressedBuffer.truncate(CompressedSize); - return Res ? createError(convertZlibCodeToString(Res)) : Error::success(); } Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, @@ -87,8 +89,8 @@ uint32_t zlib::crc32(StringRef Buffer) { #else bool zlib::isAvailable() { return false; } -Error zlib::compress(StringRef InputBuffer, - SmallVectorImpl<char> &CompressedBuffer, int Level) { +void zlib::compress(StringRef InputBuffer, + SmallVectorImpl<char> &CompressedBuffer, int Level) { llvm_unreachable("zlib::compress is unavailable"); } Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index 392c4c4890e1..9bf3f8f8b897 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -34,31 +34,31 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, const UTF8 *sourceStart = (const UTF8*)Source.data(); // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); + UTF16 *targetStart = reinterpret_cast<UTF16 *>(ResultPtr); ConversionFlags flags = strictConversion; - result = ConvertUTF8toUTF16( - &sourceStart, sourceStart + Source.size(), - &targetStart, targetStart + Source.size(), flags); + result = + ConvertUTF8toUTF16(&sourceStart, sourceStart + Source.size(), + &targetStart, targetStart + Source.size(), flags); if (result == conversionOK) - ResultPtr = reinterpret_cast<char*>(targetStart); + ResultPtr = reinterpret_cast<char *>(targetStart); else ErrorPtr = sourceStart; } else if (WideCharWidth == 4) { - const UTF8 *sourceStart = (const UTF8*)Source.data(); + const UTF8 *sourceStart = (const UTF8 *)Source.data(); // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast.
- UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); + UTF32 *targetStart = reinterpret_cast<UTF32 *>(ResultPtr); ConversionFlags flags = strictConversion; - result = ConvertUTF8toUTF32( - &sourceStart, sourceStart + Source.size(), - &targetStart, targetStart + Source.size(), flags); + result = + ConvertUTF8toUTF32(&sourceStart, sourceStart + Source.size(), + &targetStart, targetStart + Source.size(), flags); if (result == conversionOK) - ResultPtr = reinterpret_cast<char*>(targetStart); + ResultPtr = reinterpret_cast<char *>(targetStart); else ErrorPtr = sourceStart; } - assert((result != targetExhausted) - && "ConvertUTF8toUTFXX exhausted target buffer"); + assert((result != targetExhausted) && + "ConvertUTF8toUTFXX exhausted target buffer"); return result == conversionOK; } @@ -67,20 +67,18 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { const UTF32 *SourceEnd = SourceStart + 1; UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); UTF8 *TargetEnd = TargetStart + 4; - ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, - &TargetStart, TargetEnd, - strictConversion); + ConversionResult CR = ConvertUTF32toUTF8( + &SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion); if (CR != conversionOK) return false; - ResultPtr = reinterpret_cast<char*>(TargetStart); + ResultPtr = reinterpret_cast<char *>(TargetStart); return true; } bool hasUTF16ByteOrderMark(ArrayRef<char> S) { - return (S.size() >= 2 && - ((S[0] == '\xff' && S[1] == '\xfe') || - (S[0] == '\xfe' && S[1] == '\xff'))); + return (S.size() >= 2 && ((S[0] == '\xff' && S[1] == '\xfe') || + (S[0] == '\xfe' && S[1] == '\xff'))); } bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { @@ -134,11 +132,69 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { return true; } -bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) -{ +bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) { return convertUTF16ToUTF8String( llvm::ArrayRef(reinterpret_cast<const char *>(Src.data()), - Src.size() * sizeof(UTF16)), Out); + Src.size() * sizeof(UTF16)), + Out); +} + +bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { + assert(Out.empty()); + + // Error out on an uneven byte count. + if (SrcBytes.size() % 4) + return false; + + // Avoid OOB by returning early on empty input. + if (SrcBytes.empty()) + return true; + + const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin()); + const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end()); + + assert((uintptr_t)Src % sizeof(UTF32) == 0); + + // Byteswap if necessary. + std::vector<UTF32> ByteSwapped; + if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) { + ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); + for (UTF32 &I : ByteSwapped) + I = llvm::ByteSwap_32(I); + Src = &ByteSwapped[0]; + SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; + } + + // Skip the BOM for conversion. + if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE) + Src++; + + // Just allocate enough space up front. We'll shrink it later. Allocate + // enough that we can fit a null terminator without reallocating.
+ Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1); + UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); + UTF8 *DstEnd = Dst + Out.size(); + + ConversionResult CR = + ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); + assert(CR != targetExhausted); + + if (CR != conversionOK) { + Out.clear(); + return false; + } + + Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); + Out.push_back(0); + Out.pop_back(); + return true; +} + +bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) { + return convertUTF32ToUTF8String( + llvm::ArrayRef(reinterpret_cast<const char *>(Src.data()), + Src.size() * sizeof(UTF32)), + Out); } bool convertUTF8ToUTF16String(StringRef SrcUTF8, diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp index 2ee3074b840e..292ba63d14aa 100644 --- a/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/llvm/lib/Support/CrashRecoveryContext.cpp @@ -9,6 +9,7 @@ #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ExitCodes.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/ThreadLocal.h" @@ -16,10 +17,6 @@ #include <mutex> #include <setjmp.h> -#if !defined(_MSC_VER) && !defined(_WIN32) -#include "llvm/Support/ExitCodes.h" -#endif - using namespace llvm; namespace { @@ -97,7 +94,7 @@ static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextImpl>> static void installExceptionOrSignalHandlers(); static void uninstallExceptionOrSignalHandlers(); -CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} +CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() = default; CrashRecoveryContext::CrashRecoveryContext() { // On Windows, if abort() was previously triggered (and caught by a previous @@ -445,7 +442,7 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) { llvm_unreachable("Most likely setjmp wasn't called!"); } -bool CrashRecoveryContext::throwIfCrash(int RetCode) { +bool CrashRecoveryContext::isCrash(int RetCode) { #if defined(_WIN32) // On Windows, the high bits are reserved for kernel return codes. Values // starting with 0x80000000 are reserved for "warnings"; values of 0xC0000000 @@ -454,12 +451,21 @@ bool CrashRecoveryContext::throwIfCrash(int RetCode) { unsigned Code = ((unsigned)RetCode & 0xF0000000) >> 28; if (Code != 0xC && Code != 8) return false; - ::RaiseException(RetCode, 0, 0, NULL); #else // On Unix, signals are represented by return codes of 128 or higher. // Exit code 128 is a reserved value and should not be raised as a signal.
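// ---- Editorial example (not part of the patch) --------------------------
// A hedged usage sketch for the convertUTF32ToUTF8String overloads added
// earlier in this hunk; assumes their declarations land in
// llvm/Support/ConvertUTF.h alongside the UTF16 variants.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/ConvertUTF.h"
#include <cassert>
#include <string>
static void demoUTF32() {
  const llvm::UTF32 Src[] = {0x48, 0x69, 0x1F600}; // "Hi" + U+1F600
  std::string Out;
  bool OK = llvm::convertUTF32ToUTF8String(llvm::makeArrayRef(Src), Out);
  assert(OK && Out == "Hi\xF0\x9F\x98\x80"); // emoji encodes to 4 UTF-8 bytes
  (void)OK;
}
// ---- End editorial example -----------------------------------------------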
if (RetCode <= 128) return false; +#endif + return true; +} + +bool CrashRecoveryContext::throwIfCrash(int RetCode) { + if (!isCrash(RetCode)) + return false; +#if defined(_WIN32) + ::RaiseException(RetCode, 0, 0, NULL); +#else llvm::sys::unregisterHandlers(); raise(RetCode - 128); #endif diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp index 5470d931b00b..98a9ac4722b5 100644 --- a/llvm/lib/Support/Debug.cpp +++ b/llvm/lib/Support/Debug.cpp @@ -132,7 +132,7 @@ struct CreateDebugOnly { "debug-only", cl::desc("Enable a specific type of debug output (comma separated list " "of types)"), - cl::Hidden, cl::ZeroOrMore, cl::value_desc("debug string"), + cl::Hidden, cl::value_desc("debug string"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); } }; diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp index f553463be8df..bc2df37e773d 100644 --- a/llvm/lib/Support/DebugCounter.cpp +++ b/llvm/lib/Support/DebugCounter.cpp @@ -49,8 +49,7 @@ struct CreateDebugCounterOption { return new DebugCounterList( "debug-counter", cl::Hidden, cl::desc("Comma separated list of debug counter skip and count"), - cl::CommaSeparated, cl::ZeroOrMore, - cl::location(DebugCounter::instance())); + cl::CommaSeparated, cl::location(DebugCounter::instance())); } }; } // namespace diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp index a2017a10ab3f..341de244547c 100644 --- a/llvm/lib/Support/DeltaAlgorithm.cpp +++ b/llvm/lib/Support/DeltaAlgorithm.cpp @@ -11,8 +11,7 @@ #include using namespace llvm; -DeltaAlgorithm::~DeltaAlgorithm() { -} +DeltaAlgorithm::~DeltaAlgorithm() = default; bool DeltaAlgorithm::GetTestResult(const changeset_ty &Changes) { if (FailedTestsCache.count(Changes)) diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index 2bcdbdcdb9b0..7b9d7abe7545 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -12,14 +12,11 @@ #include "llvm/Support/DynamicLibrary.h" #include "llvm-c/Support.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Config/config.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" -#include -#include #include using namespace llvm; @@ -29,14 +26,14 @@ using namespace llvm::sys; class DynamicLibrary::HandleSet { typedef std::vector HandleList; HandleList Handles; - void *Process; + void *Process = nullptr; public: static void *DLOpen(const char *Filename, std::string *Err); static void DLClose(void *Handle); static void *DLSym(void *Handle, const char *Symbol); - HandleSet() : Process(nullptr) {} + HandleSet() = default; ~HandleSet(); HandleList::iterator Find(void *Handle) { return find(Handles, Handle); } diff --git a/llvm/lib/Support/Errno.cpp b/llvm/lib/Support/Errno.cpp index d18231c6ebf5..7f665be8db6c 100644 --- a/llvm/lib/Support/Errno.cpp +++ b/llvm/lib/Support/Errno.cpp @@ -12,8 +12,7 @@ #include "llvm/Support/Errno.h" #include "llvm/Config/config.h" -#include "llvm/Support/raw_ostream.h" -#include +#include #if HAVE_ERRNO_H #include diff --git a/llvm/lib/Support/ErrorHandling.cpp b/llvm/lib/Support/ErrorHandling.cpp index 80c0e00439a5..b8b3b7424ac6 100644 --- a/llvm/lib/Support/ErrorHandling.cpp +++ b/llvm/lib/Support/ErrorHandling.cpp @@ -119,7 +119,10 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { // files registered with RemoveFileOnSignal. 
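// ---- Editorial aside (not part of the patch) -----------------------------
// The change just below makes GenCrashDiag meaningful: with
// /*GenCrashDiag=*/false, report_fatal_error now exits with status 1 instead
// of calling abort() and triggering crash diagnostics. A hedged sketch:
#include "llvm/Support/ErrorHandling.h"
[[noreturn]] static void userFacingError() {
  // No crash report wanted for an ordinary user-input error.
  llvm::report_fatal_error("invalid input file", /*GenCrashDiag=*/false);
}
// ---- End editorial aside --------------------------------------------------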
sys::RunInterruptHandlers(); - abort(); + if (GenCrashDiag) + abort(); + else + exit(1); } void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler, diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp index 489b8d119e6f..eda3eb044901 100644 --- a/llvm/lib/Support/FileUtilities.cpp +++ b/llvm/lib/Support/FileUtilities.cpp @@ -17,6 +17,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Process.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -323,4 +324,69 @@ llvm::Error llvm::writeFileAtomically( return Error::success(); } +Expected +FilePermissionsApplier::create(StringRef InputFilename) { + sys::fs::file_status Status; + + if (InputFilename != "-") { + if (auto EC = sys::fs::status(InputFilename, Status)) + return createFileError(InputFilename, EC); + } else { + Status.permissions(static_cast(0777)); + } + + return FilePermissionsApplier(InputFilename, Status); +} + +Error FilePermissionsApplier::apply( + StringRef OutputFilename, bool CopyDates, + Optional OverwritePermissions) { + sys::fs::file_status Status = InputStatus; + + if (OverwritePermissions) + Status.permissions(*OverwritePermissions); + + int FD = 0; + + // Writing to stdout should not be treated as an error here, just + // do not set access/modification times or permissions. + if (OutputFilename == "-") + return Error::success(); + + if (std::error_code EC = sys::fs::openFileForWrite(OutputFilename, FD, + sys::fs::CD_OpenExisting)) + return createFileError(OutputFilename, EC); + + if (CopyDates) + if (std::error_code EC = sys::fs::setLastAccessAndModificationTime( + FD, Status.getLastAccessedTime(), Status.getLastModificationTime())) + return createFileError(OutputFilename, EC); + + sys::fs::file_status OStat; + if (std::error_code EC = sys::fs::status(FD, OStat)) + return createFileError(OutputFilename, EC); + if (OStat.type() == sys::fs::file_type::regular_file) { +#ifndef _WIN32 + // Keep ownership if llvm-objcopy is called under root. + if (OutputFilename == InputFilename && OStat.getUser() == 0) + sys::fs::changeFileOwnership(FD, Status.getUser(), Status.getGroup()); +#endif + + sys::fs::perms Perm = Status.permissions(); + if (OutputFilename != InputFilename) + Perm = static_cast(Perm & ~sys::fs::getUmask() & ~06000); +#ifdef _WIN32 + if (std::error_code EC = sys::fs::setPermissions(OutputFilename, Perm)) +#else + if (std::error_code EC = sys::fs::setPermissions(FD, Perm)) +#endif + return createFileError(OutputFilename, EC); + } + + if (std::error_code EC = sys::Process::SafelyCloseFileDescriptor(FD)) + return createFileError(OutputFilename, EC); + + return Error::success(); +} + char llvm::AtomicFileWriteError::ID; diff --git a/llvm/lib/Support/FoldingSet.cpp b/llvm/lib/Support/FoldingSet.cpp index e3d7168305af..178855289fe8 100644 --- a/llvm/lib/Support/FoldingSet.cpp +++ b/llvm/lib/Support/FoldingSet.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ErrorHandling.h" @@ -25,12 +24,6 @@ using namespace llvm; //===----------------------------------------------------------------------===// // FoldingSetNodeIDRef Implementation -/// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef, -/// used to lookup the node in the FoldingSetBase. 
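// ---- Editorial example (not part of the patch) --------------------------
// The FoldingSet deletions around this point remove out-of-line definitions
// from this file only; the public FoldingSetNodeID API is unchanged, so
// client code like this sketch still works as before.
#include "llvm/ADT/FoldingSet.h"
static unsigned demoNodeHash(const void *Ptr, unsigned Kind) {
  llvm::FoldingSetNodeID ID;
  ID.AddPointer(Ptr);   // host-endian pointer bits; hash is not stable
  ID.AddInteger(Kind);
  return ID.ComputeHash();
}
// ---- End editorial example -----------------------------------------------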
-unsigned FoldingSetNodeIDRef::ComputeHash() const { - return static_cast(hash_combine_range(Data, Data+Size)); -} - bool FoldingSetNodeIDRef::operator==(FoldingSetNodeIDRef RHS) const { if (Size != RHS.Size) return false; return memcmp(Data, RHS.Data, Size*sizeof(*Data)) == 0; @@ -49,41 +42,6 @@ bool FoldingSetNodeIDRef::operator<(FoldingSetNodeIDRef RHS) const { /// Add* - Add various data types to Bit data. /// -void FoldingSetNodeID::AddPointer(const void *Ptr) { - // Note: this adds pointers to the hash using sizes and endianness that - // depend on the host. It doesn't matter, however, because hashing on - // pointer values is inherently unstable. Nothing should depend on the - // ordering of nodes in the folding set. - static_assert(sizeof(uintptr_t) <= sizeof(unsigned long long), - "unexpected pointer size"); - AddInteger(reinterpret_cast(Ptr)); -} -void FoldingSetNodeID::AddInteger(signed I) { - Bits.push_back(I); -} -void FoldingSetNodeID::AddInteger(unsigned I) { - Bits.push_back(I); -} -void FoldingSetNodeID::AddInteger(long I) { - AddInteger((unsigned long)I); -} -void FoldingSetNodeID::AddInteger(unsigned long I) { - if (sizeof(long) == sizeof(int)) - AddInteger(unsigned(I)); - else if (sizeof(long) == sizeof(long long)) { - AddInteger((unsigned long long)I); - } else { - llvm_unreachable("unexpected sizeof(long)"); - } -} -void FoldingSetNodeID::AddInteger(long long I) { - AddInteger((unsigned long long)I); -} -void FoldingSetNodeID::AddInteger(unsigned long long I) { - AddInteger(unsigned(I)); - AddInteger(unsigned(I >> 32)); -} - void FoldingSetNodeID::AddString(StringRef String) { unsigned Size = String.size(); @@ -145,12 +103,6 @@ void FoldingSetNodeID::AddNodeID(const FoldingSetNodeID &ID) { Bits.append(ID.Bits.begin(), ID.Bits.end()); } -/// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to -/// lookup the node in the FoldingSetBase. -unsigned FoldingSetNodeID::ComputeHash() const { - return FoldingSetNodeIDRef(Bits.data(), Bits.size()).ComputeHash(); -} - /// operator== - Used to compare two nodes to each other. /// bool FoldingSetNodeID::operator==(const FoldingSetNodeID &RHS) const { diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index f6d48bcd50e8..0709d65e81e0 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -130,7 +130,7 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { StringRef Right = Fmt.substr(BC + 1); auto RI = parseReplacementItem(Spec); - if (RI.hasValue()) + if (RI) return std::make_pair(*RI, Right); // If there was an error parsing the replacement item, treat it as an diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index f6003b783245..08e3a27e0173 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -11,20 +11,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Host.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/Config/llvm-config.h" -#include "llvm/Support/BCD.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/X86TargetParser.h" #include "llvm/Support/raw_ostream.h" -#include #include // Include the platform-specific parts of this class. 
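// ---- Editorial aside (not part of the patch) -----------------------------
// Context for the FormatVariadic change above: llvm::Optional converts to
// bool contextually, so `if (RI)` replaces the deprecated `RI.hasValue()`.
// Minimal illustration:
#include "llvm/ADT/Optional.h"
static int demoOptional(llvm::Optional<int> O) { return O ? *O : -1; }
// ---- End editorial aside --------------------------------------------------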
@@ -38,11 +33,16 @@ #ifdef _MSC_VER #include #endif -#if defined(__APPLE__) && (!defined(__x86_64__)) +#ifdef __MVS__ +#include "llvm/Support/BCD.h" +#endif +#if defined(__APPLE__) #include #include #include #include +#include +#include #endif #ifdef _AIX #include @@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { } } + if (Implementer == "0xc0") { // Ampere Computing + return StringSwitch(Part) + .Case("0xac3", "ampere1") + .Default("generic"); + } + return "generic"; } @@ -330,7 +336,7 @@ StringRef getCPUNameFromS390Model(unsigned int Id, bool HaveVectorSupport) { case 3931: case 3932: default: - return HaveVectorSupport? "arch14" : "zEC12"; + return HaveVectorSupport? "z16" : "zEC12"; } } } // end anonymous namespace @@ -380,6 +386,26 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { return "generic"; } +StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { + // There are 24 lines in /proc/cpuinfo + SmallVector Lines; + ProcCpuinfoContent.split(Lines, "\n"); + + // Look for uarch line to determine cpu name + StringRef UArch; + for (unsigned I = 0, E = Lines.size(); I != E; ++I) { + if (Lines[I].startswith("uarch")) { + UArch = Lines[I].substr(5).ltrim("\t :"); + break; + } + } + + return StringSwitch(UArch) + .Case("sifive,u74-mc", "sifive-u74") + .Case("sifive,bullet0", "sifive-u74") + .Default("generic"); +} + StringRef sys::detail::getHostCPUNameForBPF() { #if !defined(__linux__) || !defined(__x86_64__) return "generic"; @@ -1034,9 +1060,9 @@ getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 25: CPU = "znver3"; *Type = X86::AMDFAM19H; - if (Model <= 0x0f) { + if (Model <= 0x0f || Model == 0x21) { *Subtype = X86::AMDFAM19H_ZNVER3; - break; // 00h-0Fh: Zen3 + break; // 00h-0Fh, 21h: Zen3 } break; default: @@ -1299,32 +1325,45 @@ StringRef sys::getHostCPUName() { bool HaveVectorSupport = CVT[244] & 0x80; return getCPUNameFromS390Model(Id, HaveVectorSupport); } -#elif defined(__APPLE__) && defined(__aarch64__) -StringRef sys::getHostCPUName() { - return "cyclone"; -} -#elif defined(__APPLE__) && defined(__arm__) -StringRef sys::getHostCPUName() { - host_basic_info_data_t hostInfo; - mach_msg_type_number_t infoCount; +#elif defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) +#define CPUFAMILY_ARM_SWIFT 0x1e2d6381 +#define CPUFAMILY_ARM_CYCLONE 0x37a09642 +#define CPUFAMILY_ARM_TYPHOON 0x2c91a47e +#define CPUFAMILY_ARM_TWISTER 0x92fb37c8 +#define CPUFAMILY_ARM_HURRICANE 0x67ceee93 +#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6 +#define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f +#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2 +#define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3 - infoCount = HOST_BASIC_INFO_COUNT; - mach_port_t hostPort = mach_host_self(); - host_info(hostPort, HOST_BASIC_INFO, (host_info_t)&hostInfo, - &infoCount); - mach_port_deallocate(mach_task_self(), hostPort); +StringRef sys::getHostCPUName() { + uint32_t Family; + size_t Length = sizeof(Family); + sysctlbyname("hw.cpufamily", &Family, &Length, NULL, 0); - if (hostInfo.cpu_type != CPU_TYPE_ARM) { - assert(false && "CPUType not equal to ARM should not be possible on ARM"); - return "generic"; + switch (Family) { + case CPUFAMILY_ARM_SWIFT: + return "swift"; + case CPUFAMILY_ARM_CYCLONE: + return "apple-a7"; + case CPUFAMILY_ARM_TYPHOON: + return "apple-a8"; + case CPUFAMILY_ARM_TWISTER: + return "apple-a9"; + case CPUFAMILY_ARM_HURRICANE: + return "apple-a10"; + case 
CPUFAMILY_ARM_MONSOON_MISTRAL: + return "apple-a11"; + case CPUFAMILY_ARM_VORTEX_TEMPEST: + return "apple-a12"; + case CPUFAMILY_ARM_LIGHTNING_THUNDER: + return "apple-a13"; + case CPUFAMILY_ARM_FIRESTORM_ICESTORM: + return "apple-m1"; + default: + // Default to the newest CPU we know about. + return "apple-m1"; } - switch (hostInfo.cpu_subtype) { - case CPU_SUBTYPE_ARM_V7S: - return "swift"; - default:; - } - - return "generic"; } #elif defined(_AIX) StringRef sys::getHostCPUName() { @@ -1360,6 +1399,11 @@ StringRef sys::getHostCPUName() { } #elif defined(__riscv) StringRef sys::getHostCPUName() { +#if defined(__linux__) + std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent(); + StringRef Content = P ? P->getBuffer() : ""; + return detail::getHostCPUNameForRISCV(Content); +#else #if __riscv_xlen == 64 return "generic-rv64"; #elif __riscv_xlen == 32 return "generic-rv32"; #else #error "Unhandled value of __riscv_xlen" #endif +#endif } #else StringRef sys::getHostCPUName() { return "generic"; } @@ -1455,9 +1500,6 @@ int computeHostNumPhysicalCores() { #elif defined(__linux__) && defined(__s390x__) int computeHostNumPhysicalCores() { return sysconf(_SC_NPROCESSORS_ONLN); } #elif defined(__APPLE__) -#include <sys/param.h> -#include <sys/sysctl.h> - // Gets the number of *physical cores* on the machine. int computeHostNumPhysicalCores() { uint32_t count; @@ -1706,6 +1748,9 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { .Case("asimd", "neon") .Case("fp", "fp-armv8") .Case("crc32", "crc") + .Case("atomics", "lse") + .Case("sve", "sve") + .Case("sve2", "sve2") #else .Case("half", "fp16") .Case("neon", "neon") diff --git a/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp b/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp index e6cba26cfcf3..52d5de93ff7d 100644 --- a/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp +++ b/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp @@ -189,20 +189,6 @@ public: bool trackedNodeIsUsed() const { return TrackedNodeIsUsed; } }; -/// Convert St3foo to NSt3fooE so that equivalences naming one also affect the -/// other. -template<> -struct CanonicalizerAllocator::MakeNodeImpl< - itanium_demangle::StdQualifiedName> { - CanonicalizerAllocator &Self; - Node *make(Node *Child) { - Node *StdNamespace = Self.makeNode<itanium_demangle::NameType>("std"); - if (!StdNamespace) - return nullptr; - return Self.makeNode<itanium_demangle::NestedName>(StdNamespace, Child); - } -}; - // FIXME: Also expand built-in substitutions? using CanonicalizingDemangler = diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp index 20babbe56d86..b87e39f0a963 100644 --- a/llvm/lib/Support/JSON.cpp +++ b/llvm/lib/Support/JSON.cpp @@ -509,13 +509,25 @@ bool Parser::parseNumber(char First, Value &Out) { S.push_back(next()); char *End; // Try first to parse as integer, and if so preserve full 64 bits. - // strtoll returns long long >= 64 bits, so check it's in range too. - auto I = std::strtoll(S.c_str(), &End, 10); - if (End == S.end() && I >= std::numeric_limits<int64_t>::min() && - I <= std::numeric_limits<int64_t>::max()) { + // We check errno for out-of-range errors and End == S.end() + // to make sure that the numeric string is not malformed. + errno = 0; + int64_t I = std::strtoll(S.c_str(), &End, 10); + if (End == S.end() && errno != ERANGE) { Out = int64_t(I); return true; } + // strtoull has special handling for negative numbers, but in this + // case we don't want that because negative numbers were already + // handled in the previous block.
+ if (First != '-') { + errno = 0; + uint64_t UI = std::strtoull(S.c_str(), &End, 10); + if (End == S.end() && errno != ERANGE) { + Out = UI; + return true; + } + } // If it's not an integer Out = std::strtod(S.c_str(), &End); return End == S.end() || parseError("Invalid JSON value (number?)"); diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 8e154067abc0..9f34405e54fc 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -340,7 +340,7 @@ Optional<bool> KnownBits::eq(const KnownBits &LHS, const KnownBits &RHS) { Optional<bool> KnownBits::ne(const KnownBits &LHS, const KnownBits &RHS) { if (Optional<bool> KnownEQ = eq(LHS, RHS)) - return Optional<bool>(!KnownEQ.getValue()); + return Optional<bool>(!*KnownEQ); return None; } @@ -356,7 +356,7 @@ Optional<bool> KnownBits::ugt(const KnownBits &LHS, const KnownBits &RHS) { Optional<bool> KnownBits::uge(const KnownBits &LHS, const KnownBits &RHS) { if (Optional<bool> IsUGT = ugt(RHS, LHS)) - return Optional<bool>(!IsUGT.getValue()); + return Optional<bool>(!*IsUGT); return None; } @@ -380,7 +380,7 @@ Optional<bool> KnownBits::sgt(const KnownBits &LHS, const KnownBits &RHS) { Optional<bool> KnownBits::sge(const KnownBits &LHS, const KnownBits &RHS) { if (Optional<bool> KnownSGT = sgt(RHS, LHS)) - return Optional<bool>(!KnownSGT.getValue()); + return Optional<bool>(!*KnownSGT); return None; } @@ -413,11 +413,11 @@ KnownBits KnownBits::abs(bool IntMinIsPoison) const { } KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, - bool SelfMultiply) { + bool NoUndefSelfMultiply) { unsigned BitWidth = LHS.getBitWidth(); assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() && !RHS.hasConflict() && "Operand mismatch"); - assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) && + assert((!NoUndefSelfMultiply || LHS == RHS) && "Self multiplication knownbits mismatch"); // Compute the high known-0 bits by multiplying the unsigned max of each side. @@ -501,7 +501,7 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, Res.One = BottomKnown.getLoBits(ResultBitsKnown); // If we're self-multiplying then bit[1] is guaranteed to be zero. - if (SelfMultiply && BitWidth > 1) { + if (NoUndefSelfMultiply && BitWidth > 1) { assert(Res.One[1] == 0 && "Self-multiplication failed Quadratic Reciprocity!"); Res.Zero.setBit(1); diff --git a/llvm/lib/Support/LineIterator.cpp b/llvm/lib/Support/LineIterator.cpp index 7bdf1271ac25..9874d16d19e1 100644 --- a/llvm/lib/Support/LineIterator.cpp +++ b/llvm/lib/Support/LineIterator.cpp @@ -38,7 +38,7 @@ line_iterator::line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks, line_iterator::line_iterator(const MemoryBufferRef &Buffer, bool SkipBlanks, char CommentMarker) : Buffer(Buffer.getBufferSize() ? Optional<MemoryBufferRef>(Buffer) : None), - CommentMarker(CommentMarker), SkipBlanks(SkipBlanks), LineNumber(1), + CommentMarker(CommentMarker), SkipBlanks(SkipBlanks), CurrentLine(Buffer.getBufferSize() ?
Buffer.getBufferStart() : nullptr, 0) { // Ensure that if we are constructed on a non-empty memory buffer that it is diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp index caadde389504..fdcf34d70ad9 100644 --- a/llvm/lib/Support/MD5.cpp +++ b/llvm/lib/Support/MD5.cpp @@ -261,13 +261,13 @@ void MD5::final(MD5Result &Result) { support::endian::write32le(&Result[12], InternalState.d); } -StringRef MD5::final() { +MD5::MD5Result MD5::final() { + MD5Result Result; final(Result); - return StringRef(reinterpret_cast(Result.Bytes.data()), - Result.Bytes.size()); + return Result; } -StringRef MD5::result() { +MD5::MD5Result MD5::result() { auto StateToRestore = InternalState; auto Hash = final(); @@ -280,15 +280,15 @@ StringRef MD5::result() { SmallString<32> MD5::MD5Result::digest() const { SmallString<32> Str; - toHex(Bytes, /*LowerCase*/ true, Str); + toHex(*this, /*LowerCase*/ true, Str); return Str; } void MD5::stringifyResult(MD5Result &Result, SmallVectorImpl &Str) { - toHex(Result.Bytes, /*LowerCase*/ true, Str); + toHex(Result, /*LowerCase*/ true, Str); } -std::array MD5::hash(ArrayRef Data) { +MD5::MD5Result MD5::hash(ArrayRef Data) { MD5 Hash; Hash.update(Data); MD5::MD5Result Res; diff --git a/llvm/lib/Support/MathExtras.cpp b/llvm/lib/Support/MathExtras.cpp index 7efffaa7f8b8..ad44b1a21676 100644 --- a/llvm/lib/Support/MathExtras.cpp +++ b/llvm/lib/Support/MathExtras.cpp @@ -15,7 +15,7 @@ #ifdef _MSC_VER #include #else -#include +#include #endif namespace llvm { diff --git a/llvm/lib/Support/Memory.cpp b/llvm/lib/Support/Memory.cpp index 581484268cd8..f1ba2d0cfe3a 100644 --- a/llvm/lib/Support/Memory.cpp +++ b/llvm/lib/Support/Memory.cpp @@ -13,7 +13,6 @@ #include "llvm/Support/Memory.h" #include "llvm/Config/llvm-config.h" -#include "llvm/Support/Valgrind.h" #ifndef NDEBUG #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 7816779cca1d..9872dfa78b26 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -13,10 +13,9 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" -#include "llvm/Support/AutoConvert.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Process.h" @@ -32,13 +31,17 @@ #else #include #endif + +#ifdef __MVS__ +#include "llvm/Support/AutoConvert.h" +#endif using namespace llvm; //===----------------------------------------------------------------------===// // MemoryBuffer implementation itself. //===----------------------------------------------------------------------===// -MemoryBuffer::~MemoryBuffer() { } +MemoryBuffer::~MemoryBuffer() = default; /// init - Initialize this MemoryBuffer as a reference to externally allocated /// memory, memory that we know is already null terminated. @@ -286,6 +289,8 @@ WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName StringRef NameRef = BufferName.toStringRef(NameBuf); size_t AlignedStringLen = alignTo(sizeof(MemBuffer) + NameRef.size() + 1, 16); size_t RealLen = AlignedStringLen + Size + 1; + if (RealLen <= Size) // Check for rollover. 
+ return nullptr; char *Mem = static_cast(operator new(RealLen, std::nothrow)); if (!Mem) return nullptr; @@ -533,4 +538,4 @@ MemoryBufferRef MemoryBuffer::getMemBufferRef() const { return MemoryBufferRef(Data, Identifier); } -SmallVectorMemoryBuffer::~SmallVectorMemoryBuffer() {} +SmallVectorMemoryBuffer::~SmallVectorMemoryBuffer() = default; diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp index 0a797046bb68..8a69f7513255 100644 --- a/llvm/lib/Support/NativeFormatting.cpp +++ b/llvm/lib/Support/NativeFormatting.cpp @@ -14,6 +14,10 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#if defined(_WIN32) && !defined(__MINGW32__) +#include // For _fpclass in llvm::write_double. +#endif + using namespace llvm; template @@ -133,7 +137,7 @@ void llvm::write_hex(raw_ostream &S, uint64_t N, HexPrintStyle Style, Optional Width) { const size_t kMaxWidth = 128u; - size_t W = std::min(kMaxWidth, Width.getValueOr(0u)); + size_t W = std::min(kMaxWidth, Width.value_or(0u)); unsigned Nibbles = (64 - countLeadingZeros(N) + 3) / 4; bool Prefix = (Style == HexPrintStyle::PrefixLower || @@ -161,7 +165,7 @@ void llvm::write_hex(raw_ostream &S, uint64_t N, HexPrintStyle Style, void llvm::write_double(raw_ostream &S, double N, FloatStyle Style, Optional Precision) { - size_t Prec = Precision.getValueOr(getDefaultPrecision(Style)); + size_t Prec = Precision.value_or(getDefaultPrecision(Style)); if (std::isnan(N)) { S << "nan"; @@ -258,5 +262,5 @@ size_t llvm::getDefaultPrecision(FloatStyle Style) { case FloatStyle::Percent: return 2; // Number of decimal places. } - LLVM_BUILTIN_UNREACHABLE; + llvm_unreachable("Unknown FloatStyle enum"); } diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 4977c188f934..798d7124e7e9 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -89,7 +89,7 @@ public: void add(std::function F) override { { std::lock_guard Lock(Mutex); - WorkStack.push(F); + WorkStack.push(std::move(F)); } Cond.notify_one(); } @@ -102,7 +102,7 @@ private: Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); }); if (Stop) break; - auto Task = WorkStack.top(); + auto Task = std::move(WorkStack.top()); WorkStack.pop(); Lock.unlock(); Task(); @@ -161,7 +161,7 @@ TaskGroup::~TaskGroup() { void TaskGroup::spawn(std::function F) { if (Parallel) { L.inc(); - Executor::getDefaultExecutor()->add([&, F] { + Executor::getDefaultExecutor()->add([&, F = std::move(F)] { F(); L.dec(); }); @@ -175,8 +175,8 @@ void TaskGroup::spawn(std::function F) { } // namespace llvm #endif // LLVM_ENABLE_THREADS -void llvm::parallelForEachN(size_t Begin, size_t End, - llvm::function_ref Fn) { +void llvm::parallelFor(size_t Begin, size_t End, + llvm::function_ref Fn) { // If we have zero or one items, then do not incur the overhead of spinning up // a task group. They are surprisingly expensive, and because they do not // support nested parallelism, a single entry task group can block parallel diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 63d8d4ee4648..283dc70f2bc9 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -22,7 +22,6 @@ #include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" #include -#include #if !defined(_MSC_VER) && !defined(__MINGW32__) #include @@ -761,11 +760,15 @@ bool remove_dots(SmallVectorImpl &the_path, bool remove_dot_dot, } } + SmallString<256> buffer = root; + // "root" could be "/", which may need to be translated into "\". 
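// ---- Editorial example (not part of the patch) --------------------------
// What the added root normalization in remove_dots does, with hypothetical
// values: under the Windows path style, the "/" root itself is rewritten.
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
static void demoRemoveDots() {
  llvm::SmallString<64> P("/a/../b");
  llvm::sys::path::remove_dots(P, /*remove_dot_dot=*/true,
                               llvm::sys::path::Style::windows);
  // P is now "\\b": the leading "/" became the preferred separator "\".
}
// ---- End editorial example -----------------------------------------------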
+ make_preferred(buffer, style); + needs_change |= root != buffer; + // Avoid rewriting the path unless we have to. if (!needs_change) return false; - SmallString<256> buffer = root; if (!components.empty()) { buffer += components[0]; for (StringRef C : makeArrayRef(components).drop_front()) { @@ -1199,9 +1202,18 @@ Error readNativeFileToEOF(file_t FileHandle, SmallVectorImpl &Buffer, #include "Windows/Path.inc" #endif +bool IsLLVMDriver = false; + namespace llvm { namespace sys { namespace fs { + +std::string getMainExecutable(const char *Argv0, void *MainAddr) { + if (IsLLVMDriver) + return sys::path::stem(Argv0).str(); + return getMainExecutableImpl(Argv0, MainAddr); +} + TempFile::TempFile(StringRef Name, int FD) : TmpName(std::string(Name)), FD(FD) {} TempFile::TempFile(TempFile &&Other) { *this = std::move(Other); } diff --git a/llvm/lib/Support/Process.cpp b/llvm/lib/Support/Process.cpp index 547b3b73eec2..cf3962ae927b 100644 --- a/llvm/lib/Support/Process.cpp +++ b/llvm/lib/Support/Process.cpp @@ -42,7 +42,7 @@ Optional Process::FindInEnvPath(StringRef EnvName, assert(!path::is_absolute(FileName)); Optional FoundPath; Optional OptPath = Process::GetEnv(EnvName); - if (!OptPath.hasValue()) + if (!OptPath) return FoundPath; const char EnvPathSeparatorStr[] = {Separator, '\0'}; diff --git a/llvm/lib/Support/Program.cpp b/llvm/lib/Support/Program.cpp index c7a59642b27e..0560714a6acd 100644 --- a/llvm/lib/Support/Program.cpp +++ b/llvm/lib/Support/Program.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; using namespace sys; diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 2b3395b669b8..7fe04af4696b 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -37,7 +37,7 @@ struct RISCVSupportedExtension { } // end anonymous namespace -static constexpr StringLiteral AllStdExts = "mafdqlcbjtpvn"; +static constexpr StringLiteral AllStdExts = "mafdqlcbkjtpvn"; static const RISCVSupportedExtension SupportedExtensions[] = { {"i", RISCVExtensionVersion{2, 0}}, @@ -48,9 +48,16 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"d", RISCVExtensionVersion{2, 0}}, {"c", RISCVExtensionVersion{2, 0}}, + {"zihintpause", RISCVExtensionVersion{2, 0}}, + {"zfhmin", RISCVExtensionVersion{1, 0}}, {"zfh", RISCVExtensionVersion{1, 0}}, + {"zfinx", RISCVExtensionVersion{1, 0}}, + {"zdinx", RISCVExtensionVersion{1, 0}}, + {"zhinxmin", RISCVExtensionVersion{1, 0}}, + {"zhinx", RISCVExtensionVersion{1, 0}}, + {"zba", RISCVExtensionVersion{1, 0}}, {"zbb", RISCVExtensionVersion{1, 0}}, {"zbc", RISCVExtensionVersion{1, 0}}, @@ -88,6 +95,10 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"zve64x", RISCVExtensionVersion{1, 0}}, {"zve64f", RISCVExtensionVersion{1, 0}}, {"zve64d", RISCVExtensionVersion{1, 0}}, + + {"zicbom", RISCVExtensionVersion{1, 0}}, + {"zicboz", RISCVExtensionVersion{1, 0}}, + {"zicbop", RISCVExtensionVersion{1, 0}}, }; static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { @@ -97,6 +108,7 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zbp", RISCVExtensionVersion{0, 93}}, {"zbr", RISCVExtensionVersion{0, 93}}, {"zbt", RISCVExtensionVersion{0, 93}}, + {"zvfh", RISCVExtensionVersion{0, 1}}, }; static bool stripExperimentalPrefix(StringRef &Ext) { @@ -340,7 +352,7 @@ static Error getExtensionVersion(StringRef Ext, StringRef In, 
unsigned &Major, if (!MajorStr.empty() && In.consume_front("p")) { MinorStr = In.take_while(isDigit); - In = In.substr(MajorStr.size() + 1); + In = In.substr(MajorStr.size() + MinorStr.size() - 1); // Expected 'p' to be followed by minor version number. if (MinorStr.empty()) { @@ -398,8 +410,8 @@ static Error getExtensionVersion(StringRef Ext, StringRef In, unsigned &Major, if (!MinorStr.empty()) Error += "." + MinorStr.str(); Error += " for experimental extension '" + Ext.str() + - "'(this compiler supports " + utostr(SupportedVers.Major) + "." + - utostr(SupportedVers.Minor) + ")"; + "' (this compiler supports " + utostr(SupportedVers.Major) + + "." + utostr(SupportedVers.Minor) + ")"; return createStringError(errc::invalid_argument, Error); } return Error::success(); @@ -686,11 +698,11 @@ Error RISCVISAInfo::checkDependency() { bool HasE = Exts.count("e") != 0; bool HasD = Exts.count("d") != 0; bool HasF = Exts.count("f") != 0; - bool HasZve32x = Exts.count("zve32x") != 0; + bool HasZfinx = Exts.count("zfinx") != 0; + bool HasZdinx = Exts.count("zdinx") != 0; + bool HasVector = Exts.count("zve32x") != 0; bool HasZve32f = Exts.count("zve32f") != 0; bool HasZve64d = Exts.count("zve64d") != 0; - bool HasV = Exts.count("v") != 0; - bool HasVector = HasZve32x || HasV; bool HasZvl = MinVLen != 0; if (HasE && !IsRv32) @@ -706,17 +718,22 @@ Error RISCVISAInfo::checkDependency() { return createStringError(errc::invalid_argument, "d requires f extension to also be specified"); - // FIXME: Consider Zfinx in the future - if (HasZve32f && !HasF) + if (HasZve32f && !HasF && !HasZfinx) + return createStringError( + errc::invalid_argument, + "zve32f requires f or zfinx extension to also be specified"); + + if (HasZve64d && !HasD && !HasZdinx) return createStringError( errc::invalid_argument, - "zve32f requires f extension to also be specified"); + "zve64d requires d or zdinx extension to also be specified"); - // FIXME: Consider Zdinx in the future - if (HasZve64d && !HasD) + if (Exts.count("zvfh") && !Exts.count("zfh") && !Exts.count("zfhmin") && + !Exts.count("zhinx") && !Exts.count("zhinxmin")) return createStringError( errc::invalid_argument, - "zve64d requires d extension to also be specified"); + "zvfh requires zfh, zfhmin, zhinx or zhinxmin extension to also be " + "specified"); if (HasZvl && !HasVector) return createStringError( @@ -730,9 +747,12 @@ Error RISCVISAInfo::checkDependency() { return Error::success(); } -static const char *ImpliedExtsV[] = {"zvl128b", "f", "d"}; +static const char *ImpliedExtsV[] = {"zvl128b", "zve64d", "f", "d"}; static const char *ImpliedExtsZfhmin[] = {"f"}; static const char *ImpliedExtsZfh[] = {"f"}; +static const char *ImpliedExtsZdinx[] = {"zfinx"}; +static const char *ImpliedExtsZhinxmin[] = {"zfinx"}; +static const char *ImpliedExtsZhinx[] = {"zfinx"}; static const char *ImpliedExtsZve64d[] = {"zve64f"}; static const char *ImpliedExtsZve64f[] = {"zve64x", "zve32f"}; static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"}; @@ -752,6 +772,7 @@ static const char *ImpliedExtsZvl64b[] = {"zvl32b"}; static const char *ImpliedExtsZk[] = {"zkn", "zkt", "zkr"}; static const char *ImpliedExtsZkn[] = {"zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"}; static const char *ImpliedExtsZks[] = {"zbkb", "zbkc", "zbkx", "zksed", "zksh"}; +static const char *ImpliedExtsZvfh[] = {"zve32f"}; struct ImpliedExtsEntry { StringLiteral Name; @@ -767,8 +788,11 @@ struct ImpliedExtsEntry { // Note: The table needs to be sorted by name. 
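// ---- Editorial example (not part of the patch) --------------------------
// How the relaxed dependency checks above surface through the public parser
// (llvm/Support/RISCVISAInfo.h); the arch strings are illustrative only.
#include "llvm/Support/RISCVISAInfo.h"
static void demoISAStrings() {
  // Accepted after this patch: zfinx satisfies zve32f's FP requirement.
  auto OK = llvm::RISCVISAInfo::parseArchString(
      "rv32i_zfinx_zve32f", /*EnableExperimentalExtension=*/false);
  if (!OK)
    llvm::consumeError(OK.takeError()); // not expected after this patch
  // Still rejected: zve32f with neither f nor zfinx.
  auto Bad = llvm::RISCVISAInfo::parseArchString(
      "rv32i_zve32f", /*EnableExperimentalExtension=*/false);
  if (!Bad)
    llvm::consumeError(Bad.takeError()); // expected: missing f/zfinx
}
// ---- End editorial example -----------------------------------------------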
static constexpr ImpliedExtsEntry ImpliedExts[] = { {{"v"}, {ImpliedExtsV}}, + {{"zdinx"}, {ImpliedExtsZdinx}}, {{"zfh"}, {ImpliedExtsZfh}}, {{"zfhmin"}, {ImpliedExtsZfhmin}}, + {{"zhinx"}, {ImpliedExtsZhinx}}, + {{"zhinxmin"}, {ImpliedExtsZhinxmin}}, {{"zk"}, {ImpliedExtsZk}}, {{"zkn"}, {ImpliedExtsZkn}}, {{"zks"}, {ImpliedExtsZks}}, @@ -777,6 +801,7 @@ static constexpr ImpliedExtsEntry ImpliedExts[] = { {{"zve64d"}, {ImpliedExtsZve64d}}, {{"zve64f"}, {ImpliedExtsZve64f}}, {{"zve64x"}, {ImpliedExtsZve64x}}, + {{"zvfh"}, {ImpliedExtsZvfh}}, {{"zvl1024b"}, {ImpliedExtsZvl1024b}}, {{"zvl128b"}, {ImpliedExtsZvl128b}}, {{"zvl16384b"}, {ImpliedExtsZvl16384b}}, @@ -826,6 +851,38 @@ void RISCVISAInfo::updateImplication() { } } +struct CombinedExtsEntry { + StringLiteral CombineExt; + ArrayRef RequiredExts; +}; + +static constexpr CombinedExtsEntry CombineIntoExts[] = { + {{"zk"}, {ImpliedExtsZk}}, + {{"zkn"}, {ImpliedExtsZkn}}, + {{"zks"}, {ImpliedExtsZks}}, +}; + +void RISCVISAInfo::updateCombination() { + bool IsNewCombine = false; + do { + IsNewCombine = false; + for (CombinedExtsEntry CombineIntoExt : CombineIntoExts) { + auto CombineExt = CombineIntoExt.CombineExt; + auto RequiredExts = CombineIntoExt.RequiredExts; + if (hasExtension(CombineExt)) + continue; + bool IsAllRequiredFeatureExist = true; + for (const char *Ext : RequiredExts) + IsAllRequiredFeatureExist &= hasExtension(Ext); + if (IsAllRequiredFeatureExist) { + auto Version = findDefaultVersion(CombineExt); + addExtension(CombineExt, Version->Major, Version->Minor); + IsNewCombine = true; + } + } + } while (IsNewCombine); +} + void RISCVISAInfo::updateFLen() { FLen = 0; // TODO: Handle q extension. @@ -862,11 +919,6 @@ void RISCVISAInfo::updateMaxELen() { ExtName.getAsInteger(10, ZveELen); MaxELen = std::max(MaxELen, ZveELen); } - if (ExtName == "v") { - MaxELenFp = 64; - MaxELen = 64; - return; - } } } @@ -904,6 +956,7 @@ std::vector RISCVISAInfo::toFeatureVector() const { llvm::Expected> RISCVISAInfo::postProcessAndChecking(std::unique_ptr &&ISAInfo) { ISAInfo->updateImplication(); + ISAInfo->updateCombination(); ISAInfo->updateFLen(); ISAInfo->updateMinVLen(); ISAInfo->updateMaxELen(); @@ -912,3 +965,18 @@ RISCVISAInfo::postProcessAndChecking(std::unique_ptr &&ISAInfo) { return std::move(Result); return std::move(ISAInfo); } + +StringRef RISCVISAInfo::computeDefaultABI() const { + if (XLen == 32) { + if (hasExtension("d")) + return "ilp32d"; + if (hasExtension("e")) + return "ilp32e"; + return "ilp32"; + } else if (XLen == 64) { + if (hasExtension("d")) + return "lp64d"; + return "lp64"; + } + llvm_unreachable("Invalid XLEN"); +} diff --git a/llvm/lib/Support/SHA1.cpp b/llvm/lib/Support/SHA1.cpp index 5dce44af9ecd..52bae700350d 100644 --- a/llvm/lib/Support/SHA1.cpp +++ b/llvm/lib/Support/SHA1.cpp @@ -263,7 +263,7 @@ void SHA1::pad() { addUncounted(InternalState.ByteCount << 3); } -StringRef SHA1::final() { +void SHA1::final(std::array &HashResult) { // Pad to complete the last block pad(); @@ -281,12 +281,19 @@ StringRef SHA1::final() { (((InternalState.State[i]) >> 24) & 0x000000ff); } #endif +} - // Return pointer to hash (20 characters) - return StringRef((char *)HashResult, HASH_LENGTH); +std::array SHA1::final() { + union { + std::array HashResult; + std::array ReturnResult; + }; + static_assert(sizeof(HashResult) == sizeof(ReturnResult), ""); + final(HashResult); + return ReturnResult; } -StringRef SHA1::result() { +std::array SHA1::result() { auto StateToRestore = InternalState; auto Hash = final(); @@ -301,9 +308,5 @@ 
StringRef SHA1::result() { std::array<uint8_t, 20> SHA1::hash(ArrayRef<uint8_t> Data) { SHA1 Hash; Hash.update(Data); - StringRef S = Hash.final(); - - std::array<uint8_t, 20> Arr; - memcpy(Arr.data(), S.data(), S.size()); - return Arr; + return Hash.final(); } diff --git a/llvm/lib/Support/SHA256.cpp b/llvm/lib/Support/SHA256.cpp index 3b81506847ec..81d897fb4187 100644 --- a/llvm/lib/Support/SHA256.cpp +++ b/llvm/lib/Support/SHA256.cpp @@ -243,7 +243,7 @@ void SHA256::pad() { addUncounted(len); } -StringRef SHA256::final() { +void SHA256::final(std::array<uint32_t, 8> &HashResult) { // Pad to complete the last block pad(); @@ -261,12 +261,19 @@ StringRef SHA256::final() { (((InternalState.State[i]) >> 24) & 0x000000ff); } #endif +} - // Return pointer to hash (32 characters) - return StringRef((char *)HashResult, HASH_LENGTH); +std::array<uint8_t, 32> SHA256::final() { + union { + std::array<uint32_t, 8> HashResult; + std::array<uint8_t, 32> ReturnResult; + }; + static_assert(sizeof(HashResult) == sizeof(ReturnResult), ""); + final(HashResult); + return ReturnResult; } -StringRef SHA256::result() { +std::array<uint8_t, 32> SHA256::result() { auto StateToRestore = InternalState; auto Hash = final(); @@ -281,11 +288,7 @@ StringRef SHA256::result() { std::array<uint8_t, 32> SHA256::hash(ArrayRef<uint8_t> Data) { SHA256 Hash; Hash.update(Data); - StringRef S = Hash.final(); - - std::array<uint8_t, 32> Arr; - memcpy(Arr.data(), S.data(), S.size()); - return Arr; + return Hash.final(); } } // namespace llvm diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp index a434e50e8c1f..ef6dd5fdf1d6 100644 --- a/llvm/lib/Support/ScopedPrinter.cpp +++ b/llvm/lib/Support/ScopedPrinter.cpp @@ -7,17 +7,10 @@ using namespace llvm::support; namespace llvm { raw_ostream &operator<<(raw_ostream &OS, const HexNumber &Value) { - OS << "0x" << to_hexString(Value.Value); + OS << "0x" << utohexstr(Value.Value); return OS; } -std::string to_hexString(uint64_t Value, bool UpperCase) { - std::string number; - llvm::raw_string_ostream stream(number); - stream << format_hex_no_prefix(Value, 1, UpperCase); - return stream.str(); -} - void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str, ArrayRef<uint8_t> Data, bool Block, uint32_t StartOffset) { diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp index 1d61f2bf7525..a6fd845da869 100644 --- a/llvm/lib/Support/Signals.cpp +++ b/llvm/lib/Support/Signals.cpp @@ -15,7 +15,6 @@ #include "DebugOptions.h" -#include "llvm/ADT/STLArrayExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/CommandLine.h" @@ -23,15 +22,14 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Format.h" -#include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Mutex.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/StringSaver.h" #include "llvm/Support/raw_ostream.h" +#include <array> #include <cstdlib> //===----------------------------------------------------------------------===// @@ -83,12 +81,20 @@ struct CallbackAndCookie { enum class Status { Empty, Initializing, Initialized, Executing }; std::atomic<Status> Flag; }; + static constexpr size_t MaxSignalHandlerCallbacks = 8; -static CallbackAndCookie CallBacksToRun[MaxSignalHandlerCallbacks]; + +// A global array of CallbackAndCookie may not compile with +// -Werror=global-constructors in C++20 and above. +static std::array<CallbackAndCookie, MaxSignalHandlerCallbacks> & +CallBacksToRun() { + static std::array<CallbackAndCookie, MaxSignalHandlerCallbacks> callbacks; + return
callbacks; +} // Signal-safe. void sys::RunSignalHandlers() { - for (CallbackAndCookie &RunMe : CallBacksToRun) { + for (CallbackAndCookie &RunMe : CallBacksToRun()) { auto Expected = CallbackAndCookie::Status::Initialized; auto Desired = CallbackAndCookie::Status::Executing; if (!RunMe.Flag.compare_exchange_strong(Expected, Desired)) @@ -103,7 +109,7 @@ void sys::RunSignalHandlers() { // Signal-safe. static void insertSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie) { - for (CallbackAndCookie &SetMe : CallBacksToRun) { + for (CallbackAndCookie &SetMe : CallBacksToRun()) { auto Expected = CallbackAndCookie::Status::Empty; auto Desired = CallbackAndCookie::Status::Initializing; if (!SetMe.Flag.compare_exchange_strong(Expected, Desired)) diff --git a/llvm/lib/Support/Signposts.cpp b/llvm/lib/Support/Signposts.cpp index 074dddc81c80..232b84e965a0 100644 --- a/llvm/lib/Support/Signposts.cpp +++ b/llvm/lib/Support/Signposts.cpp @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Signposts.h" - +#include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" + #if LLVM_SUPPORT_XCODE_SIGNPOSTS #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Mutex.h" @@ -24,7 +25,7 @@ using namespace llvm; namespace { os_log_t *LogCreator() { os_log_t *X = new os_log_t; - *X = os_log_create("org.llvm.signposts", OS_LOG_CATEGORY_POINTS_OF_INTEREST); + *X = os_log_create("org.llvm.signposts", "toolchain"); return X; } struct LogDeleter { diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index 2eb2989b200b..42982b4c8e6c 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -40,6 +40,17 @@ static const size_t TabStop = 8; unsigned SourceMgr::AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc, std::string &IncludedFile) { + ErrorOr> NewBufOrErr = + OpenIncludeFile(Filename, IncludedFile); + if (!NewBufOrErr) + return 0; + + return AddNewSourceBuffer(std::move(*NewBufOrErr), IncludeLoc); +} + +ErrorOr> +SourceMgr::OpenIncludeFile(const std::string &Filename, + std::string &IncludedFile) { IncludedFile = Filename; ErrorOr> NewBufOrErr = MemoryBuffer::getFile(IncludedFile); @@ -52,10 +63,7 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename, NewBufOrErr = MemoryBuffer::getFile(IncludedFile); } - if (!NewBufOrErr) - return 0; - - return AddNewSourceBuffer(std::move(*NewBufOrErr), IncludeLoc); + return NewBufOrErr; } unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 137b37f2b1c3..0fb65accbf1d 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -198,7 +198,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, return true; } -SpecialCaseList::~SpecialCaseList() {} +SpecialCaseList::~SpecialCaseList() = default; bool SpecialCaseList::inSection(StringRef Section, StringRef Prefix, StringRef Query, StringRef Category) const { diff --git a/llvm/lib/Support/Statistic.cpp b/llvm/lib/Support/Statistic.cpp index 95ee885d2f8f..ec12118650c1 100644 --- a/llvm/lib/Support/Statistic.cpp +++ b/llvm/lib/Support/Statistic.cpp @@ -192,7 +192,7 @@ void llvm::PrintStatistics(raw_ostream &OS) { // Print all of the statistics. 
for (TrackingStatistic *Stat : Stats.Stats) - OS << format("%*u %-*s - %s\n", MaxValLen, Stat->getValue(), + OS << format("%*" PRIu64 " %-*s - %s\n", MaxValLen, Stat->getValue(), MaxDebugTypeLen, Stat->getDebugType(), Stat->getDesc()); OS << '\n'; // Flush the output stream. @@ -253,9 +253,9 @@ void llvm::PrintStatistics() { #endif } -const std::vector> llvm::GetStatistics() { +const std::vector> llvm::GetStatistics() { sys::SmartScopedLock Reader(*StatLock); - std::vector> ReturnStats; + std::vector> ReturnStats; for (const auto &Stat : StatInfo->statistics()) ReturnStats.emplace_back(Stat->getName(), Stat->getValue()); diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp index 012c785b4351..9b2f96fca2cd 100644 --- a/llvm/lib/Support/StringMap.cpp +++ b/llvm/lib/Support/StringMap.cpp @@ -18,7 +18,7 @@ using namespace llvm; /// Returns the number of buckets to allocate to ensure that the DenseMap can /// accommodate \p NumEntries without need to grow(). -static unsigned getMinBucketToReserveForEntries(unsigned NumEntries) { +static inline unsigned getMinBucketToReserveForEntries(unsigned NumEntries) { // Ensure that "NumEntries * 4 < NumBuckets * 3" if (NumEntries == 0) return 0; @@ -27,6 +27,21 @@ static unsigned getMinBucketToReserveForEntries(unsigned NumEntries) { return NextPowerOf2(NumEntries * 4 / 3 + 1); } +static inline StringMapEntryBase **createTable(unsigned NewNumBuckets) { + auto **Table = static_cast(safe_calloc( + NewNumBuckets + 1, sizeof(StringMapEntryBase **) + sizeof(unsigned))); + + // Allocate one extra bucket, set it to look filled so the iterators stop at + // end. + Table[NewNumBuckets] = (StringMapEntryBase *)2; + return Table; +} + +static inline unsigned *getHashTable(StringMapEntryBase **TheTable, + unsigned NumBuckets) { + return reinterpret_cast(TheTable + NumBuckets + 1); +} + StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) { ItemSize = itemSize; @@ -54,15 +69,10 @@ void StringMapImpl::init(unsigned InitSize) { NumItems = 0; NumTombstones = 0; - TheTable = static_cast(safe_calloc( - NewNumBuckets + 1, sizeof(StringMapEntryBase **) + sizeof(unsigned))); + TheTable = createTable(NewNumBuckets); // Set the member only if TheTable was successfully allocated NumBuckets = NewNumBuckets; - - // Allocate one extra bucket, set it to look filled so the iterators stop at - // end. - TheTable[NumBuckets] = (StringMapEntryBase *)2; } /// LookupBucketFor - Look up the bucket that the specified string should end @@ -71,14 +81,12 @@ void StringMapImpl::init(unsigned InitSize) { /// case, the FullHashValue field of the bucket will be set to the hash value /// of the string. unsigned StringMapImpl::LookupBucketFor(StringRef Name) { - unsigned HTSize = NumBuckets; - if (HTSize == 0) { // Hash table unallocated so far? + // Hash table unallocated so far? + if (NumBuckets == 0) init(16); - HTSize = NumBuckets; - } unsigned FullHashValue = djbHash(Name, 0); - unsigned BucketNo = FullHashValue & (HTSize - 1); - unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1); + unsigned BucketNo = FullHashValue & (NumBuckets - 1); + unsigned *HashTable = getHashTable(TheTable, NumBuckets); unsigned ProbeAmt = 1; int FirstTombstone = -1; @@ -117,7 +125,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) { } // Okay, we didn't find the item. Probe to the next bucket. 
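// ---- Editorial sketch (not part of the patch) -----------------------------
// The probe sequence used in LookupBucketFor/FindKey above: step sizes
// 1, 2, 3, ... visit triangular-number offsets, which for power-of-two
// table sizes reach every bucket exactly once.
static unsigned nextProbe(unsigned BucketNo, unsigned &ProbeAmt,
                          unsigned NumBuckets) {
  // NumBuckets must be a power of two for the mask to work.
  return (BucketNo + ProbeAmt++) & (NumBuckets - 1);
}
// ---- End editorial sketch --------------------------------------------------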
- BucketNo = (BucketNo + ProbeAmt) & (HTSize - 1); + BucketNo = (BucketNo + ProbeAmt) & (NumBuckets - 1); // Use quadratic probing, it has fewer clumping artifacts than linear // probing and has good cache behavior in the common case. @@ -129,12 +137,11 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) { /// in the map, return the bucket number of the key. Otherwise return -1. /// This does not modify the map. int StringMapImpl::FindKey(StringRef Key) const { - unsigned HTSize = NumBuckets; - if (HTSize == 0) + if (NumBuckets == 0) return -1; // Really empty table? unsigned FullHashValue = djbHash(Key, 0); - unsigned BucketNo = FullHashValue & (HTSize - 1); - unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1); + unsigned BucketNo = FullHashValue & (NumBuckets - 1); + unsigned *HashTable = getHashTable(TheTable, NumBuckets); unsigned ProbeAmt = 1; while (true) { @@ -161,7 +168,7 @@ int StringMapImpl::FindKey(StringRef Key) const { } // Okay, we didn't find the item. Probe to the next bucket. - BucketNo = (BucketNo + ProbeAmt) & (HTSize - 1); + BucketNo = (BucketNo + ProbeAmt) & (NumBuckets - 1); // Use quadratic probing, it has fewer clumping artifacts than linear // probing and has good cache behavior in the common case. @@ -198,8 +205,6 @@ StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) { /// the appropriate mod-of-hashtable-size. unsigned StringMapImpl::RehashTable(unsigned BucketNo) { unsigned NewSize; - unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1); - // If the hash table is now more than 3/4 full, or if fewer than 1/8 of // the buckets are empty (meaning that many are filled with tombstones), // grow/rehash the table. @@ -213,36 +218,25 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) { } unsigned NewBucketNo = BucketNo; - // Allocate one extra bucket which will always be non-empty. This allows the - // iterators to stop at end. - auto NewTableArray = static_cast(safe_calloc( - NewSize + 1, sizeof(StringMapEntryBase *) + sizeof(unsigned))); - - unsigned *NewHashArray = (unsigned *)(NewTableArray + NewSize + 1); - NewTableArray[NewSize] = (StringMapEntryBase *)2; + auto **NewTableArray = createTable(NewSize); + unsigned *NewHashArray = getHashTable(NewTableArray, NewSize); + unsigned *HashTable = getHashTable(TheTable, NumBuckets); // Rehash all the items into their new buckets. Luckily :) we already have // the hash values available, so we don't have to rehash any strings. for (unsigned I = 0, E = NumBuckets; I != E; ++I) { StringMapEntryBase *Bucket = TheTable[I]; if (Bucket && Bucket != getTombstoneVal()) { - // Fast case, bucket available. + // If the bucket is not available, probe for a spot. unsigned FullHash = HashTable[I]; unsigned NewBucket = FullHash & (NewSize - 1); - if (!NewTableArray[NewBucket]) { - NewTableArray[FullHash & (NewSize - 1)] = Bucket; - NewHashArray[FullHash & (NewSize - 1)] = FullHash; - if (I == BucketNo) - NewBucketNo = NewBucket; - continue; + if (NewTableArray[NewBucket]) { + unsigned ProbeSize = 1; + do { + NewBucket = (NewBucket + ProbeSize++) & (NewSize - 1); + } while (NewTableArray[NewBucket]); } - // Otherwise probe for a spot. - unsigned ProbeSize = 1; - do { - NewBucket = (NewBucket + ProbeSize++) & (NewSize - 1); - } while (NewTableArray[NewBucket]); - // Finally found a slot. Fill it in. 
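// [Editor's note, not part of the patch] The rehash rewrite above folds the
// old fast path and probe path together: the new code probes only while the
// target bucket is occupied and then falls through to a single fill site, so
// the two stores below are now shared by both the direct-hit and the
// probed case.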
NewTableArray[NewBucket] = Bucket; NewHashArray[NewBucket] = FullHash; diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index 3ed08ed38661..096b2d2d8c07 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -98,6 +98,13 @@ unsigned StringRef::edit_distance(llvm::StringRef Other, AllowReplacements, MaxEditDistance); } +unsigned llvm::StringRef::edit_distance_insensitive( + StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { + return llvm::ComputeMappedEditDistance( + makeArrayRef(data(), size()), makeArrayRef(Other.data(), Other.size()), + llvm::toLower, AllowReplacements, MaxEditDistance); +} + //===----------------------------------------------------------------------===// // String Operations //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index 0105cd2e8153..e5590d458fed 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -104,6 +104,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, + {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK}, {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK}, {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK}, @@ -114,6 +115,11 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx1033"}, {"gfx1033"}, GK_GFX1033, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, {{"gfx1034"}, {"gfx1034"}, GK_GFX1034, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, {{"gfx1035"}, {"gfx1035"}, GK_GFX1035, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1036"}, {"gfx1036"}, GK_GFX1036, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1100"}, {"gfx1100"}, GK_GFX1100, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1101"}, {"gfx1101"}, GK_GFX1101, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1102"}, {"gfx1102"}, GK_GFX1102, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1103"}, {"gfx1103"}, GK_GFX1103, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, }; const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef Table) { @@ -217,6 +223,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX909: return {9, 0, 9}; case GK_GFX90A: return {9, 0, 10}; case GK_GFX90C: return {9, 0, 12}; + case GK_GFX940: return {9, 4, 0}; case GK_GFX1010: return {10, 1, 0}; case GK_GFX1011: return {10, 1, 1}; case GK_GFX1012: return {10, 1, 2}; @@ -227,6 +234,11 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX1033: return {10, 3, 3}; case GK_GFX1034: return {10, 3, 4}; case GK_GFX1035: return {10, 3, 5}; + case GK_GFX1036: return {10, 3, 6}; + case GK_GFX1100: return {11, 0, 0}; + case GK_GFX1101: return {11, 0, 1}; + case GK_GFX1102: return {11, 0, 2}; + case GK_GFX1103: return 
{11, 0, 3}; default: return {0, 0, 0}; } } @@ -329,21 +341,6 @@ bool getCPUFeaturesExceptStdExt(CPUKind Kind, return true; } -StringRef computeDefaultABIFromArch(const llvm::RISCVISAInfo &ISAInfo) { - if (ISAInfo.getXLen() == 32) { - if (ISAInfo.hasExtension("d")) - return "ilp32d"; - if (ISAInfo.hasExtension("e")) - return "ilp32e"; - return "ilp32"; - } else if (ISAInfo.getXLen() == 64) { - if (ISAInfo.hasExtension("d")) - return "lp64d"; - return "lp64"; - } - llvm_unreachable("Invalid XLEN"); -} - } // namespace RISCV } // namespace llvm diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 9f92ae1c7a7c..31461e31c65c 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -24,11 +24,19 @@ using namespace llvm; #if LLVM_ENABLE_THREADS +// A note on thread groups: Tasks are by default in no group (represented +// by nullptr ThreadPoolTaskGroup pointer in the Tasks queue) and functionality +// here normally works on all tasks regardless of their group (functions +// in that case receive nullptr ThreadPoolTaskGroup pointer as argument). +// A task in a group has a pointer to that ThreadPoolTaskGroup in the Tasks +// queue, and functions called to work only on tasks from one group take that +// pointer. + ThreadPool::ThreadPool(ThreadPoolStrategy S) : Strategy(S), MaxThreadCount(S.compute_thread_count()) {} void ThreadPool::grow(int requested) { - std::unique_lock<std::mutex> LockGuard(ThreadsLock); + llvm::sys::ScopedWriter LockGuard(ThreadsLock); if (Threads.size() >= MaxThreadCount) return; // Already hit the max thread pool size. int newThreadCount = std::min<int>(requested, MaxThreadCount); @@ -36,52 +44,129 @@ void ThreadPool::grow(int requested) { int ThreadID = Threads.size(); Threads.emplace_back([this, ThreadID] { Strategy.apply_thread_strategy(ThreadID); - while (true) { - std::function<void()> Task; - { - std::unique_lock<std::mutex> LockGuard(QueueLock); - // Wait for tasks to be pushed in the queue - QueueCondition.wait(LockGuard, - [&] { return !EnableFlag || !Tasks.empty(); }); - // Exit condition - if (!EnableFlag && Tasks.empty()) - return; - // Yeah, we have a task, grab it and release the lock on the queue - - // We first need to signal that we are active before popping the queue - // in order for wait() to properly detect that even if the queue is - // empty, there is still a task in flight. - ++ActiveThreads; - Task = std::move(Tasks.front()); - Tasks.pop(); - } - // Run the task we just grabbed - Task(); - - bool Notify; - { - // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait() - std::lock_guard<std::mutex> LockGuard(QueueLock); - --ActiveThreads; - Notify = workCompletedUnlocked(); - } - // Notify task completion if this is the last active thread, in case - // someone waits on ThreadPool::wait(). - if (Notify) - CompletionCondition.notify_all(); - } + processTasks(nullptr); }); } } +#ifndef NDEBUG +// The group of the tasks run by the current thread. +static LLVM_THREAD_LOCAL std::vector<ThreadPoolTaskGroup *> + *CurrentThreadTaskGroups = nullptr; +#endif + +// WaitingForGroup == nullptr means all tasks regardless of their group.
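// [Editor's illustration, not part of the patch] How the task-group
// machinery below is meant to be driven. This assumes the ThreadPoolTaskGroup
// interface added to llvm/include/llvm/Support/ThreadPool.h in this same
// import (a constructor taking the pool, plus async() and wait()); that
// header is outside this hunk, so treat the sketch as indicative rather than
// definitive:
//
//   ThreadPool Pool;
//   ThreadPoolTaskGroup Group(Pool);
//   for (int I = 0; I < 8; ++I)
//     Group.async([I] { /* work item I */ }); // queued tagged with &Group
//   Group.wait(); // waits only for this group's tasks; a worker thread
//                 // calling it runs queued tasks instead of blocking (see
//                 // ThreadPool::wait(ThreadPoolTaskGroup &) further below)
//
// processTasks() below implements that scheduling; WaitingForGroup selects
// which group's completion allows the call to return.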
+void ThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) { + while (true) { + std::function<void()> Task; + ThreadPoolTaskGroup *GroupOfTask; + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + bool workCompletedForGroup = false; // Result of workCompletedUnlocked() + // Wait for tasks to be pushed in the queue + QueueCondition.wait(LockGuard, [&] { + return !EnableFlag || !Tasks.empty() || + (WaitingForGroup != nullptr && + (workCompletedForGroup = + workCompletedUnlocked(WaitingForGroup))); + }); + // Exit condition + if (!EnableFlag && Tasks.empty()) + return; + if (WaitingForGroup != nullptr && workCompletedForGroup) + return; + // Yeah, we have a task, grab it and release the lock on the queue + + // We first need to signal that we are active before popping the queue + // in order for wait() to properly detect that even if the queue is + // empty, there is still a task in flight. + ++ActiveThreads; + Task = std::move(Tasks.front().first); + GroupOfTask = Tasks.front().second; + // Need to count active threads in each group separately, ActiveThreads + // would never be 0 if waiting for another group inside a wait. + if (GroupOfTask != nullptr) + ++ActiveGroups[GroupOfTask]; // Increment or set to 1 if new item + Tasks.pop_front(); + } +#ifndef NDEBUG + if (CurrentThreadTaskGroups == nullptr) + CurrentThreadTaskGroups = new std::vector<ThreadPoolTaskGroup *>; + CurrentThreadTaskGroups->push_back(GroupOfTask); +#endif + + // Run the task we just grabbed + Task(); + +#ifndef NDEBUG + CurrentThreadTaskGroups->pop_back(); + if (CurrentThreadTaskGroups->empty()) { + delete CurrentThreadTaskGroups; + CurrentThreadTaskGroups = nullptr; + } +#endif + + bool Notify; + bool NotifyGroup; + { + // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait() + std::lock_guard<std::mutex> LockGuard(QueueLock); + --ActiveThreads; + if (GroupOfTask != nullptr) { + auto A = ActiveGroups.find(GroupOfTask); + if (--(A->second) == 0) + ActiveGroups.erase(A); + } + Notify = workCompletedUnlocked(GroupOfTask); + NotifyGroup = GroupOfTask != nullptr && Notify; + } + // Notify task completion if this is the last active thread, in case + // someone waits on ThreadPool::wait(). + if (Notify) + CompletionCondition.notify_all(); + // If this was a task in a group, notify also threads waiting for tasks + // in this function on QueueCondition, to make a recursive wait() return + // after the group it's been waiting for has finished. + if (NotifyGroup) + QueueCondition.notify_all(); + } +} + +bool ThreadPool::workCompletedUnlocked(ThreadPoolTaskGroup *Group) const { + if (Group == nullptr) + return !ActiveThreads && Tasks.empty(); + return ActiveGroups.count(Group) == 0 && + !llvm::any_of(Tasks, + [Group](const auto &T) { return T.second == Group; }); +} + void ThreadPool::wait() { + assert(!isWorkerThread()); // Would deadlock waiting for itself. // Wait for all threads to complete and the queue to be empty std::unique_lock<std::mutex> LockGuard(QueueLock); - CompletionCondition.wait(LockGuard, [&] { return workCompletedUnlocked(); }); + CompletionCondition.wait(LockGuard, + [&] { return workCompletedUnlocked(nullptr); }); +} + +void ThreadPool::wait(ThreadPoolTaskGroup &Group) { + // Wait for all threads in the group to complete. + if (!isWorkerThread()) { + std::unique_lock<std::mutex> LockGuard(QueueLock); + CompletionCondition.wait(LockGuard, + [&] { return workCompletedUnlocked(&Group); }); + return; + } + // Make sure to not deadlock waiting for oneself.
+ assert(CurrentThreadTaskGroups == nullptr || + !llvm::is_contained(*CurrentThreadTaskGroups, &Group)); + // Handle the case of a recursive call from another task in a different group, + // in which case process tasks while waiting to keep the thread busy and avoid + // possible deadlock. + processTasks(&Group); } bool ThreadPool::isWorkerThread() const { - std::unique_lock<std::mutex> LockGuard(ThreadsLock); + llvm::sys::ScopedReader LockGuard(ThreadsLock); llvm::thread::id CurrentThreadId = llvm::this_thread::get_id(); for (const llvm::thread &Thread : Threads) if (CurrentThreadId == Thread.get_id()) @@ -96,7 +181,7 @@ ThreadPool::~ThreadPool() { EnableFlag = false; } QueueCondition.notify_all(); - std::unique_lock<std::mutex> LockGuard(ThreadsLock); + llvm::sys::ScopedReader LockGuard(ThreadsLock); for (auto &Worker : Threads) Worker.join(); } @@ -115,12 +200,18 @@ ThreadPool::ThreadPool(ThreadPoolStrategy S) : MaxThreadCount(1) { void ThreadPool::wait() { // Sequential implementation running the tasks while (!Tasks.empty()) { - auto Task = std::move(Tasks.front()); - Tasks.pop(); + auto Task = std::move(Tasks.front().first); + Tasks.pop_front(); Task(); } } +void ThreadPool::wait(ThreadPoolTaskGroup &) { + // Simply wait for all, this works even if recursive (the running task + // is already removed from the queue). + wait(); +} + bool ThreadPool::isWorkerThread() const { report_fatal_error("LLVM compiled without multithreading"); } diff --git a/llvm/lib/Support/TrigramIndex.cpp b/llvm/lib/Support/TrigramIndex.cpp index 4370adc9c3e0..40a20ccc6583 100644 --- a/llvm/lib/Support/TrigramIndex.cpp +++ b/llvm/lib/Support/TrigramIndex.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/TrigramIndex.h" +#include "llvm/ADT/StringRef.h" #include using namespace llvm; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index a9afcc9db96a..6696d158b2c1 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -37,6 +37,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case bpfeb: return "bpfeb"; case bpfel: return "bpfel"; case csky: return "csky"; + case dxil: return "dxil"; case hexagon: return "hexagon"; case hsail64: return "hsail64"; case hsail: return "hsail"; @@ -44,6 +45,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case lanai: return "lanai"; case le32: return "le32"; case le64: return "le64"; + case loongarch32: return "loongarch32"; + case loongarch64: return "loongarch64"; case m68k: return "m68k"; case mips64: return "mips64"; case mips64el: return "mips64el"; @@ -164,6 +167,11 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case ve: return "ve"; case csky: return "csky"; + + case loongarch32: + case loongarch64: return "loongarch"; + + case dxil: return "dx"; } } @@ -203,6 +211,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case Contiki: return "contiki"; case Darwin: return "darwin"; case DragonFly: return "dragonfly"; + case DriverKit: return "driverkit"; case ELFIAMCU: return "elfiamcu"; case Emscripten: return "emscripten"; case FreeBSD: return "freebsd"; @@ -222,6 +231,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case NetBSD: return "netbsd"; case OpenBSD: return "openbsd"; case PS4: return "ps4"; + case PS5: return "ps5"; case RTEMS: return "rtems"; case Solaris: return "solaris"; case TvOS: return "tvos"; case WatchOS: return "watchos"; case Win32: return "windows"; case ZOS: return "zos";
+ case ShaderModel: return "shadermodel"; } llvm_unreachable("Invalid OSType"); @@ -258,6 +269,21 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case MuslEABIHF: return "musleabihf"; case MuslX32: return "muslx32"; case Simulator: return "simulator"; + case Pixel: return "pixel"; + case Vertex: return "vertex"; + case Geometry: return "geometry"; + case Hull: return "hull"; + case Domain: return "domain"; + case Compute: return "compute"; + case Library: return "library"; + case RayGeneration: return "raygeneration"; + case Intersection: return "intersection"; + case AnyHit: return "anyhit"; + case ClosestHit: return "closesthit"; + case Miss: return "miss"; + case Callable: return "callable"; + case Mesh: return "mesh"; + case Amplification: return "amplification"; } llvm_unreachable("Invalid EnvironmentType!"); @@ -311,12 +337,14 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("sparc", sparc) .Case("sparcel", sparcel) .Case("sparcv9", sparcv9) + .Case("s390x", systemz) .Case("systemz", systemz) .Case("tce", tce) .Case("tcele", tcele) .Case("thumb", thumb) .Case("thumbeb", thumbeb) .Case("x86", x86) + .Case("i386", x86) .Case("x86-64", x86_64) .Case("xcore", xcore) .Case("nvptx", nvptx) @@ -340,6 +368,9 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("renderscript64", renderscript64) .Case("ve", ve) .Case("csky", csky) + .Case("loongarch32", loongarch32) + .Case("loongarch64", loongarch64) + .Case("dxil", dxil) .Default(UnknownArch); } @@ -464,8 +495,10 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("hsail64", Triple::hsail64) .Case("spir", Triple::spir) .Case("spir64", Triple::spir64) - .Case("spirv32", Triple::spirv32) - .Case("spirv64", Triple::spirv64) + .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", + "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", Triple::spirv32) + .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", + "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", Triple::spirv64) .StartsWith("kalimba", Triple::kalimba) .Case("lanai", Triple::lanai) .Case("renderscript32", Triple::renderscript32) @@ -475,6 +508,9 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("wasm32", Triple::wasm32) .Case("wasm64", Triple::wasm64) .Case("csky", Triple::csky) + .Case("loongarch32", Triple::loongarch32) + .Case("loongarch64", Triple::loongarch64) + .Case("dxil", Triple::dxil) .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the @@ -538,9 +574,11 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("nvcl", Triple::NVCL) .StartsWith("amdhsa", Triple::AMDHSA) .StartsWith("ps4", Triple::PS4) + .StartsWith("ps5", Triple::PS5) .StartsWith("elfiamcu", Triple::ELFIAMCU) .StartsWith("tvos", Triple::TvOS) .StartsWith("watchos", Triple::WatchOS) + .StartsWith("driverkit", Triple::DriverKit) .StartsWith("mesa3d", Triple::Mesa3D) .StartsWith("contiki", Triple::Contiki) .StartsWith("amdpal", Triple::AMDPAL) @@ -548,6 +586,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("hurd", Triple::Hurd) .StartsWith("wasi", Triple::WASI) .StartsWith("emscripten", Triple::Emscripten) + .StartsWith("shadermodel", Triple::ShaderModel) .Default(Triple::UnknownOS); } @@ -574,20 +613,36 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("coreclr", Triple::CoreCLR) .StartsWith("simulator", Triple::Simulator) .StartsWith("macabi", Triple::MacABI) + .StartsWith("pixel", Triple::Pixel) 
+ .StartsWith("vertex", Triple::Vertex) + .StartsWith("geometry", Triple::Geometry) + .StartsWith("hull", Triple::Hull) + .StartsWith("domain", Triple::Domain) + .StartsWith("compute", Triple::Compute) + .StartsWith("library", Triple::Library) + .StartsWith("raygeneration", Triple::RayGeneration) + .StartsWith("intersection", Triple::Intersection) + .StartsWith("anyhit", Triple::AnyHit) + .StartsWith("closesthit", Triple::ClosestHit) + .StartsWith("miss", Triple::Miss) + .StartsWith("callable", Triple::Callable) + .StartsWith("mesh", Triple::Mesh) + .StartsWith("amplification", Triple::Amplification) .Default(Triple::UnknownEnvironment); } static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) { return StringSwitch(EnvironmentName) - // "xcoff" must come before "coff" because of the order-dependendent - // pattern matching. - .EndsWith("xcoff", Triple::XCOFF) - .EndsWith("coff", Triple::COFF) - .EndsWith("elf", Triple::ELF) - .EndsWith("goff", Triple::GOFF) - .EndsWith("macho", Triple::MachO) - .EndsWith("wasm", Triple::Wasm) - .Default(Triple::UnknownObjectFormat); + // "xcoff" must come before "coff" because of the order-dependendent + // pattern matching. + .EndsWith("xcoff", Triple::XCOFF) + .EndsWith("coff", Triple::COFF) + .EndsWith("elf", Triple::ELF) + .EndsWith("goff", Triple::GOFF) + .EndsWith("macho", Triple::MachO) + .EndsWith("wasm", Triple::Wasm) + .EndsWith("spirv", Triple::SPIRV) + .Default(Triple::UnknownObjectFormat); } static Triple::SubArchType parseSubArch(StringRef SubArchName) { @@ -601,6 +656,16 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { if (SubArchName == "arm64e") return Triple::AArch64SubArch_arm64e; + if (SubArchName.startswith("spirv")) + return StringSwitch(SubArchName) + .EndsWith("v1.0", Triple::SPIRVSubArch_v10) + .EndsWith("v1.1", Triple::SPIRVSubArch_v11) + .EndsWith("v1.2", Triple::SPIRVSubArch_v12) + .EndsWith("v1.3", Triple::SPIRVSubArch_v13) + .EndsWith("v1.4", Triple::SPIRVSubArch_v14) + .EndsWith("v1.5", Triple::SPIRVSubArch_v15) + .Default(Triple::NoSubArch); + StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); // For now, this is the small part. Early return. @@ -688,13 +753,24 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { static StringRef getObjectFormatTypeName(Triple::ObjectFormatType Kind) { switch (Kind) { - case Triple::UnknownObjectFormat: return ""; - case Triple::COFF: return "coff"; - case Triple::ELF: return "elf"; - case Triple::GOFF: return "goff"; - case Triple::MachO: return "macho"; - case Triple::Wasm: return "wasm"; - case Triple::XCOFF: return "xcoff"; + case Triple::UnknownObjectFormat: + return ""; + case Triple::COFF: + return "coff"; + case Triple::ELF: + return "elf"; + case Triple::GOFF: + return "goff"; + case Triple::MachO: + return "macho"; + case Triple::Wasm: + return "wasm"; + case Triple::XCOFF: + return "xcoff"; + case Triple::DXContainer: + return "dxcontainer"; + case Triple::SPIRV: + return "spirv"; } llvm_unreachable("unknown object format type"); } @@ -731,6 +807,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::lanai: case Triple::le32: case Triple::le64: + case Triple::loongarch32: + case Triple::loongarch64: case Triple::m68k: case Triple::mips64: case Triple::mips64el: @@ -776,8 +854,10 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::spirv32: case Triple::spirv64: - // TODO: In future this will be Triple::SPIRV. 
- return Triple::UnknownObjectFormat; + return Triple::SPIRV; + + case Triple::dxil: + return Triple::DXContainer; } llvm_unreachable("unknown architecture"); } @@ -1158,6 +1238,8 @@ bool Triple::getMacOSXVersion(VersionTuple &Version) const { // IOS. Version = VersionTuple(10, 4); break; + case DriverKit: + llvm_unreachable("OSX version isn't relevant for DriverKit"); } return true; } @@ -1182,6 +1264,8 @@ VersionTuple Triple::getiOSVersion() const { } case WatchOS: llvm_unreachable("conflicting triple info"); + case DriverKit: + llvm_unreachable("DriverKit doesn't have an iOS version"); } } @@ -1203,6 +1287,20 @@ VersionTuple Triple::getWatchOSVersion() const { } case IOS: llvm_unreachable("conflicting triple info"); + case DriverKit: + llvm_unreachable("DriverKit doesn't have a WatchOS version"); + } +} + +VersionTuple Triple::getDriverKitVersion() const { + switch (getOS()) { + default: + llvm_unreachable("unexpected OS for Darwin triple"); + case DriverKit: + VersionTuple Version = getOSVersion(); + if (Version.getMajor() == 0) + return Version.withMajorReplaced(19); + return Version; } } @@ -1285,11 +1383,13 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::csky: + case llvm::Triple::dxil: case llvm::Triple::hexagon: case llvm::Triple::hsail: case llvm::Triple::kalimba: case llvm::Triple::lanai: case llvm::Triple::le32: + case llvm::Triple::loongarch32: case llvm::Triple::m68k: case llvm::Triple::mips: case llvm::Triple::mipsel: @@ -1321,6 +1421,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::bpfel: case llvm::Triple::hsail64: case llvm::Triple::le64: + case llvm::Triple::loongarch64: case llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::nvptx64: @@ -1372,11 +1473,13 @@ Triple Triple::get32BitArchVariant() const { case Triple::arm: case Triple::armeb: case Triple::csky: + case Triple::dxil: case Triple::hexagon: case Triple::hsail: case Triple::kalimba: case Triple::lanai: case Triple::le32: + case Triple::loongarch32: case Triple::m68k: case Triple::mips: case Triple::mipsel: @@ -1406,6 +1509,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::amdil64: T.setArch(Triple::amdil); break; case Triple::hsail64: T.setArch(Triple::hsail); break; case Triple::le64: T.setArch(Triple::le32); break; + case Triple::loongarch64: T.setArch(Triple::loongarch32); break; case Triple::mips64: T.setArch(Triple::mips, getSubArch()); break; @@ -1419,7 +1523,9 @@ Triple Triple::get32BitArchVariant() const { case Triple::riscv64: T.setArch(Triple::riscv32); break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; - case Triple::spirv64: T.setArch(Triple::spirv32); break; + case Triple::spirv64: + T.setArch(Triple::spirv32, getSubArch()); + break; case Triple::wasm64: T.setArch(Triple::wasm32); break; case Triple::x86_64: T.setArch(Triple::x86); break; } @@ -1433,6 +1539,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::arc: case Triple::avr: case Triple::csky: + case Triple::dxil: case Triple::hexagon: case Triple::kalimba: case Triple::lanai: @@ -1455,6 +1562,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::bpfel: case Triple::hsail64: case Triple::le64: + case Triple::loongarch64: case Triple::mips64: case Triple::mips64el: case Triple::nvptx64: @@ -1478,6 +1586,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::armeb: 
T.setArch(Triple::aarch64_be); break; case Triple::hsail: T.setArch(Triple::hsail64); break; case Triple::le32: T.setArch(Triple::le64); break; + case Triple::loongarch32: T.setArch(Triple::loongarch64); break; case Triple::mips: T.setArch(Triple::mips64, getSubArch()); break; @@ -1491,7 +1600,9 @@ Triple Triple::get64BitArchVariant() const { case Triple::riscv32: T.setArch(Triple::riscv64); break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; - case Triple::spirv32: T.setArch(Triple::spirv64); break; + case Triple::spirv32: + T.setArch(Triple::spirv64, getSubArch()); + break; case Triple::thumb: T.setArch(Triple::aarch64); break; case Triple::thumbeb: T.setArch(Triple::aarch64_be); break; case Triple::wasm32: T.setArch(Triple::wasm64); break; @@ -1511,12 +1622,15 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::amdil64: case Triple::amdil: case Triple::avr: + case Triple::dxil: case Triple::hexagon: case Triple::hsail64: case Triple::hsail: case Triple::kalimba: case Triple::le32: case Triple::le64: + case Triple::loongarch32: + case Triple::loongarch64: case Triple::msp430: case Triple::nvptx64: case Triple::nvptx: @@ -1611,12 +1725,15 @@ bool Triple::isLittleEndian() const { case Triple::avr: case Triple::bpfel: case Triple::csky: + case Triple::dxil: case Triple::hexagon: case Triple::hsail64: case Triple::hsail: case Triple::kalimba: case Triple::le32: case Triple::le64: + case Triple::loongarch32: + case Triple::loongarch64: case Triple::mips64el: case Triple::mipsel: case Triple::msp430: @@ -1725,6 +1842,8 @@ VersionTuple Triple::getMinimumSupportedOSVersion() const { if (isSimulatorEnvironment()) return VersionTuple(7, 0, 0); break; + case Triple::DriverKit: + return VersionTuple(20, 0, 0); default: break; } @@ -1755,6 +1874,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::MacOSX: case llvm::Triple::TvOS: case llvm::Triple::WatchOS: + case llvm::Triple::DriverKit: if (MArch == "v7k") return "cortex-a7"; break; @@ -1811,3 +1931,33 @@ VersionTuple Triple::getCanonicalVersionForOS(OSType OSKind, return Version; } } + +// HLSL triple environment orders are relied on in the front end +static_assert(Triple::Vertex - Triple::Pixel == 1, + "incorrect HLSL stage order"); +static_assert(Triple::Geometry - Triple::Pixel == 2, + "incorrect HLSL stage order"); +static_assert(Triple::Hull - Triple::Pixel == 3, + "incorrect HLSL stage order"); +static_assert(Triple::Domain - Triple::Pixel == 4, + "incorrect HLSL stage order"); +static_assert(Triple::Compute - Triple::Pixel == 5, + "incorrect HLSL stage order"); +static_assert(Triple::Library - Triple::Pixel == 6, + "incorrect HLSL stage order"); +static_assert(Triple::RayGeneration - Triple::Pixel == 7, + "incorrect HLSL stage order"); +static_assert(Triple::Intersection - Triple::Pixel == 8, + "incorrect HLSL stage order"); +static_assert(Triple::AnyHit - Triple::Pixel == 9, + "incorrect HLSL stage order"); +static_assert(Triple::ClosestHit - Triple::Pixel == 10, + "incorrect HLSL stage order"); +static_assert(Triple::Miss - Triple::Pixel == 11, + "incorrect HLSL stage order"); +static_assert(Triple::Callable - Triple::Pixel == 12, + "incorrect HLSL stage order"); +static_assert(Triple::Mesh - Triple::Pixel == 13, + "incorrect HLSL stage order"); +static_assert(Triple::Amplification - Triple::Pixel == 14, + "incorrect HLSL stage order"); diff --git a/llvm/lib/Support/TypeSize.cpp b/llvm/lib/Support/TypeSize.cpp index 
a80fde83e3bc..8bed9b29cba5 100644 --- a/llvm/lib/Support/TypeSize.cpp +++ b/llvm/lib/Support/TypeSize.cpp @@ -21,11 +21,10 @@ struct CreateScalableErrorAsWarning { /// using the wrong interface on a scalable vector. static void *call() { return new cl::opt( - "treat-scalable-fixed-error-as-warning", cl::Hidden, cl::init(false), + "treat-scalable-fixed-error-as-warning", cl::Hidden, cl::desc( "Treat issues where a fixed-width property is requested from a " - "scalable type as a warning, instead of an error."), - cl::ZeroOrMore); + "scalable type as a warning, instead of an error")); } }; } // namespace diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp index bb6e75555b4c..103710303094 100644 --- a/llvm/lib/Support/Unicode.cpp +++ b/llvm/lib/Support/Unicode.cpp @@ -19,197 +19,271 @@ namespace llvm { namespace sys { namespace unicode { +/// Unicode code points of the categories L, M, N, P, S and Zs are considered +/// printable. +/// In addition, U+00AD SOFT HYPHEN is also considered printable, as +/// it's actually displayed on most terminals. \return true if the character is +/// considered printable. bool isPrintable(int UCS) { - // Sorted list of non-overlapping intervals of code points that are not - // supposed to be printable. - static const UnicodeCharRange NonPrintableRanges[] = { - { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F }, - { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B }, - { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 }, - { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 }, - { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF }, - { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D }, - { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C }, - { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F }, - { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F }, - { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF }, - { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 }, - { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 }, - { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB }, - { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 }, - { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 }, - { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E }, - { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 }, - { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B }, - { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A }, - { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D }, - { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 }, - { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 }, - { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB }, - { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF }, - { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 }, - { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 }, - { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 }, - { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A }, - { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E }, - { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 }, - { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 }, - { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 }, - { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD }, - { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF }, - { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 }, - { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 
0x0C11 }, - { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C }, - { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 }, - { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 }, - { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 }, - { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 }, - { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 }, - { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD }, - { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 }, - { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D }, - { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 }, - { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F }, - { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 }, - { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 }, - { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 }, - { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 }, - { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E }, - { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 }, - { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 }, - { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 }, - { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC }, - { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 }, - { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB }, - { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 }, - { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD }, - { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC }, - { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 }, - { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 }, - { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F }, - { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF }, - { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 }, - { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C }, - { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF }, - { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D }, - { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F }, - { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F }, - { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF }, - { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F }, - { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF }, - { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F }, - { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F }, - { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF }, - { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F }, - { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F }, - { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F }, - { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C }, - { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF }, - { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F }, - { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 }, - { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E }, - { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 }, - { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 }, - { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F }, - { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 }, - { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF }, - { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF }, - { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 }, - { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F }, - { 
0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 }, - { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E }, - { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 }, - { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF }, - { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 }, - { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A }, - { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF }, - { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 }, - { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F }, - { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F }, - { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF }, - { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F }, - { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F }, - { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F }, - { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD }, - { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E }, - { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD }, - { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F }, - { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA }, - { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 }, - { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF }, - { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF }, - { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F }, - { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C }, - { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F }, - { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 }, - { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF }, - { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F }, - { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F }, - { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 }, - { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 }, - { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF }, - { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF }, - { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B }, - { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F }, - { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 }, - { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F }, - { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F }, - { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E }, - { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F }, - { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 }, - { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E }, - { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E }, - { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD }, - { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B }, - { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 }, - { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F }, - { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 }, - { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F }, - { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F }, - { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF }, - { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F }, - { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF }, - { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F }, - { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF }, - { 0x16F45, 
0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF }, - { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 }, - { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF }, - { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 }, - { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 }, - { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA }, - { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 }, - { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D }, - { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 }, - { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 }, - { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 }, - { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 }, - { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 }, - { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 }, - { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C }, - { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 }, - { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C }, - { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 }, - { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 }, - { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F }, - { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 }, - { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF }, - { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 }, - { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF }, - { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F }, - { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F }, - { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F }, - { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F }, - { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF }, - { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 }, - { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F }, - { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F }, - { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF }, - { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 }, - { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF } - }; - static const UnicodeCharSet NonPrintables(NonPrintableRanges); + // https://unicode.org/Public/14.0.0/ucdxml/ + static const UnicodeCharRange PrintableRanges[] = { + {0x0020, 0x007E}, {0x00A0, 0x00AC}, {0x00AE, 0x0377}, + {0x037A, 0x037F}, {0x0384, 0x038A}, {0x038C, 0x038C}, + {0x038E, 0x03A1}, {0x03A3, 0x052F}, {0x0531, 0x0556}, + {0x0559, 0x058A}, {0x058D, 0x058F}, {0x0591, 0x05C7}, + {0x05D0, 0x05EA}, {0x05EF, 0x05F4}, {0x0606, 0x061B}, + {0x061D, 0x06DC}, {0x06DE, 0x070D}, {0x0710, 0x074A}, + {0x074D, 0x07B1}, {0x07C0, 0x07FA}, {0x07FD, 0x082D}, + {0x0830, 0x083E}, {0x0840, 0x085B}, {0x085E, 0x085E}, + {0x0860, 0x086A}, {0x0870, 0x088E}, {0x0898, 0x08E1}, + {0x08E3, 0x0983}, {0x0985, 0x098C}, {0x098F, 0x0990}, + {0x0993, 0x09A8}, {0x09AA, 0x09B0}, {0x09B2, 0x09B2}, + {0x09B6, 0x09B9}, {0x09BC, 0x09C4}, {0x09C7, 0x09C8}, + {0x09CB, 0x09CE}, {0x09D7, 0x09D7}, {0x09DC, 0x09DD}, + {0x09DF, 0x09E3}, {0x09E6, 0x09FE}, {0x0A01, 0x0A03}, + {0x0A05, 0x0A0A}, {0x0A0F, 0x0A10}, {0x0A13, 0x0A28}, + {0x0A2A, 0x0A30}, {0x0A32, 0x0A33}, {0x0A35, 0x0A36}, + {0x0A38, 0x0A39}, {0x0A3C, 0x0A3C}, {0x0A3E, 0x0A42}, + {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A51, 0x0A51}, + {0x0A59, 0x0A5C}, {0x0A5E, 
0x0A5E}, {0x0A66, 0x0A76}, + {0x0A81, 0x0A83}, {0x0A85, 0x0A8D}, {0x0A8F, 0x0A91}, + {0x0A93, 0x0AA8}, {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3}, + {0x0AB5, 0x0AB9}, {0x0ABC, 0x0AC5}, {0x0AC7, 0x0AC9}, + {0x0ACB, 0x0ACD}, {0x0AD0, 0x0AD0}, {0x0AE0, 0x0AE3}, + {0x0AE6, 0x0AF1}, {0x0AF9, 0x0AFF}, {0x0B01, 0x0B03}, + {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, {0x0B13, 0x0B28}, + {0x0B2A, 0x0B30}, {0x0B32, 0x0B33}, {0x0B35, 0x0B39}, + {0x0B3C, 0x0B44}, {0x0B47, 0x0B48}, {0x0B4B, 0x0B4D}, + {0x0B55, 0x0B57}, {0x0B5C, 0x0B5D}, {0x0B5F, 0x0B63}, + {0x0B66, 0x0B77}, {0x0B82, 0x0B83}, {0x0B85, 0x0B8A}, + {0x0B8E, 0x0B90}, {0x0B92, 0x0B95}, {0x0B99, 0x0B9A}, + {0x0B9C, 0x0B9C}, {0x0B9E, 0x0B9F}, {0x0BA3, 0x0BA4}, + {0x0BA8, 0x0BAA}, {0x0BAE, 0x0BB9}, {0x0BBE, 0x0BC2}, + {0x0BC6, 0x0BC8}, {0x0BCA, 0x0BCD}, {0x0BD0, 0x0BD0}, + {0x0BD7, 0x0BD7}, {0x0BE6, 0x0BFA}, {0x0C00, 0x0C0C}, + {0x0C0E, 0x0C10}, {0x0C12, 0x0C28}, {0x0C2A, 0x0C39}, + {0x0C3C, 0x0C44}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, + {0x0C55, 0x0C56}, {0x0C58, 0x0C5A}, {0x0C5D, 0x0C5D}, + {0x0C60, 0x0C63}, {0x0C66, 0x0C6F}, {0x0C77, 0x0C8C}, + {0x0C8E, 0x0C90}, {0x0C92, 0x0CA8}, {0x0CAA, 0x0CB3}, + {0x0CB5, 0x0CB9}, {0x0CBC, 0x0CC4}, {0x0CC6, 0x0CC8}, + {0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CDD, 0x0CDE}, + {0x0CE0, 0x0CE3}, {0x0CE6, 0x0CEF}, {0x0CF1, 0x0CF2}, + {0x0D00, 0x0D0C}, {0x0D0E, 0x0D10}, {0x0D12, 0x0D44}, + {0x0D46, 0x0D48}, {0x0D4A, 0x0D4F}, {0x0D54, 0x0D63}, + {0x0D66, 0x0D7F}, {0x0D81, 0x0D83}, {0x0D85, 0x0D96}, + {0x0D9A, 0x0DB1}, {0x0DB3, 0x0DBB}, {0x0DBD, 0x0DBD}, + {0x0DC0, 0x0DC6}, {0x0DCA, 0x0DCA}, {0x0DCF, 0x0DD4}, + {0x0DD6, 0x0DD6}, {0x0DD8, 0x0DDF}, {0x0DE6, 0x0DEF}, + {0x0DF2, 0x0DF4}, {0x0E01, 0x0E3A}, {0x0E3F, 0x0E5B}, + {0x0E81, 0x0E82}, {0x0E84, 0x0E84}, {0x0E86, 0x0E8A}, + {0x0E8C, 0x0EA3}, {0x0EA5, 0x0EA5}, {0x0EA7, 0x0EBD}, + {0x0EC0, 0x0EC4}, {0x0EC6, 0x0EC6}, {0x0EC8, 0x0ECD}, + {0x0ED0, 0x0ED9}, {0x0EDC, 0x0EDF}, {0x0F00, 0x0F47}, + {0x0F49, 0x0F6C}, {0x0F71, 0x0F97}, {0x0F99, 0x0FBC}, + {0x0FBE, 0x0FCC}, {0x0FCE, 0x0FDA}, {0x1000, 0x10C5}, + {0x10C7, 0x10C7}, {0x10CD, 0x10CD}, {0x10D0, 0x1248}, + {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, + {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, + {0x1290, 0x12B0}, {0x12B2, 0x12B5}, {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, + {0x135D, 0x137C}, {0x1380, 0x1399}, {0x13A0, 0x13F5}, + {0x13F8, 0x13FD}, {0x1400, 0x169C}, {0x16A0, 0x16F8}, + {0x1700, 0x1715}, {0x171F, 0x1736}, {0x1740, 0x1753}, + {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1772, 0x1773}, + {0x1780, 0x17DD}, {0x17E0, 0x17E9}, {0x17F0, 0x17F9}, + {0x1800, 0x180D}, {0x180F, 0x1819}, {0x1820, 0x1878}, + {0x1880, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, + {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1940, 0x1940}, + {0x1944, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, + {0x19B0, 0x19C9}, {0x19D0, 0x19DA}, {0x19DE, 0x1A1B}, + {0x1A1E, 0x1A5E}, {0x1A60, 0x1A7C}, {0x1A7F, 0x1A89}, + {0x1A90, 0x1A99}, {0x1AA0, 0x1AAD}, {0x1AB0, 0x1ACE}, + {0x1B00, 0x1B4C}, {0x1B50, 0x1B7E}, {0x1B80, 0x1BF3}, + {0x1BFC, 0x1C37}, {0x1C3B, 0x1C49}, {0x1C4D, 0x1C88}, + {0x1C90, 0x1CBA}, {0x1CBD, 0x1CC7}, {0x1CD0, 0x1CFA}, + {0x1D00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, {0x1FB6, 0x1FC4}, {0x1FC6, 0x1FD3}, + {0x1FD6, 0x1FDB}, {0x1FDD, 0x1FEF}, {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFE}, {0x2000, 0x200A}, {0x2010, 0x2027}, 
+ {0x202F, 0x205F}, {0x2070, 0x2071}, {0x2074, 0x208E}, + {0x2090, 0x209C}, {0x20A0, 0x20C0}, {0x20D0, 0x20F0}, + {0x2100, 0x218B}, {0x2190, 0x2426}, {0x2440, 0x244A}, + {0x2460, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2CF3}, + {0x2CF9, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, + {0x2D30, 0x2D67}, {0x2D6F, 0x2D70}, {0x2D7F, 0x2D96}, + {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, + {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, + {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2DE0, 0x2E5D}, + {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5}, + {0x2FF0, 0x2FFB}, {0x3000, 0x303F}, {0x3041, 0x3096}, + {0x3099, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, + {0x3190, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0xA48C}, + {0xA490, 0xA4C6}, {0xA4D0, 0xA62B}, {0xA640, 0xA6F7}, + {0xA700, 0xA7CA}, {0xA7D0, 0xA7D1}, {0xA7D3, 0xA7D3}, + {0xA7D5, 0xA7D9}, {0xA7F2, 0xA82C}, {0xA830, 0xA839}, + {0xA840, 0xA877}, {0xA880, 0xA8C5}, {0xA8CE, 0xA8D9}, + {0xA8E0, 0xA953}, {0xA95F, 0xA97C}, {0xA980, 0xA9CD}, + {0xA9CF, 0xA9D9}, {0xA9DE, 0xA9FE}, {0xAA00, 0xAA36}, + {0xAA40, 0xAA4D}, {0xAA50, 0xAA59}, {0xAA5C, 0xAAC2}, + {0xAADB, 0xAAF6}, {0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, + {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, + {0xAB30, 0xAB6B}, {0xAB70, 0xABED}, {0xABF0, 0xABF9}, + {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, + {0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, + {0xFB13, 0xFB17}, {0xFB1D, 0xFB36}, {0xFB38, 0xFB3C}, + {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, + {0xFB46, 0xFBC2}, {0xFBD3, 0xFD8F}, {0xFD92, 0xFDC7}, + {0xFDCF, 0xFDCF}, {0xFDF0, 0xFE19}, {0xFE20, 0xFE52}, + {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFE70, 0xFE74}, + {0xFE76, 0xFEFC}, {0xFF01, 0xFFBE}, {0xFFC2, 0xFFC7}, + {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, + {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, + {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, + {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, + {0x10080, 0x100FA}, {0x10100, 0x10102}, {0x10107, 0x10133}, + {0x10137, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, + {0x101D0, 0x101FD}, {0x10280, 0x1029C}, {0x102A0, 0x102D0}, + {0x102E0, 0x102FB}, {0x10300, 0x10323}, {0x1032D, 0x1034A}, + {0x10350, 0x1037A}, {0x10380, 0x1039D}, {0x1039F, 0x103C3}, + {0x103C8, 0x103D5}, {0x10400, 0x1049D}, {0x104A0, 0x104A9}, + {0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, + {0x10530, 0x10563}, {0x1056F, 0x1057A}, {0x1057C, 0x1058A}, + {0x1058C, 0x10592}, {0x10594, 0x10595}, {0x10597, 0x105A1}, + {0x105A3, 0x105B1}, {0x105B3, 0x105B9}, {0x105BB, 0x105BC}, + {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, + {0x10780, 0x10785}, {0x10787, 0x107B0}, {0x107B2, 0x107BA}, + {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, + {0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, + {0x10857, 0x1089E}, {0x108A7, 0x108AF}, {0x108E0, 0x108F2}, + {0x108F4, 0x108F5}, {0x108FB, 0x1091B}, {0x1091F, 0x10939}, + {0x1093F, 0x1093F}, {0x10980, 0x109B7}, {0x109BC, 0x109CF}, + {0x109D2, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A13}, + {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A38, 0x10A3A}, + {0x10A3F, 0x10A48}, {0x10A50, 0x10A58}, {0x10A60, 0x10A9F}, + {0x10AC0, 0x10AE6}, {0x10AEB, 0x10AF6}, {0x10B00, 0x10B35}, + {0x10B39, 0x10B55}, {0x10B58, 0x10B72}, {0x10B78, 0x10B91}, + {0x10B99, 0x10B9C}, {0x10BA9, 0x10BAF}, {0x10C00, 0x10C48}, + {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10CFA, 0x10D27}, + {0x10D30, 0x10D39}, {0x10E60, 0x10E7E}, {0x10E80, 0x10EA9}, + {0x10EAB, 0x10EAD}, {0x10EB0, 
0x10EB1}, {0x10F00, 0x10F27}, + {0x10F30, 0x10F59}, {0x10F70, 0x10F89}, {0x10FB0, 0x10FCB}, + {0x10FE0, 0x10FF6}, {0x11000, 0x1104D}, {0x11052, 0x11075}, + {0x1107F, 0x110BC}, {0x110BE, 0x110C2}, {0x110D0, 0x110E8}, + {0x110F0, 0x110F9}, {0x11100, 0x11134}, {0x11136, 0x11147}, + {0x11150, 0x11176}, {0x11180, 0x111DF}, {0x111E1, 0x111F4}, + {0x11200, 0x11211}, {0x11213, 0x1123E}, {0x11280, 0x11286}, + {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D}, + {0x1129F, 0x112A9}, {0x112B0, 0x112EA}, {0x112F0, 0x112F9}, + {0x11300, 0x11303}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, + {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, + {0x11335, 0x11339}, {0x1133B, 0x11344}, {0x11347, 0x11348}, + {0x1134B, 0x1134D}, {0x11350, 0x11350}, {0x11357, 0x11357}, + {0x1135D, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, + {0x11400, 0x1145B}, {0x1145D, 0x11461}, {0x11480, 0x114C7}, + {0x114D0, 0x114D9}, {0x11580, 0x115B5}, {0x115B8, 0x115DD}, + {0x11600, 0x11644}, {0x11650, 0x11659}, {0x11660, 0x1166C}, + {0x11680, 0x116B9}, {0x116C0, 0x116C9}, {0x11700, 0x1171A}, + {0x1171D, 0x1172B}, {0x11730, 0x11746}, {0x11800, 0x1183B}, + {0x118A0, 0x118F2}, {0x118FF, 0x11906}, {0x11909, 0x11909}, + {0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x11935}, + {0x11937, 0x11938}, {0x1193B, 0x11946}, {0x11950, 0x11959}, + {0x119A0, 0x119A7}, {0x119AA, 0x119D7}, {0x119DA, 0x119E4}, + {0x11A00, 0x11A47}, {0x11A50, 0x11AA2}, {0x11AB0, 0x11AF8}, + {0x11C00, 0x11C08}, {0x11C0A, 0x11C36}, {0x11C38, 0x11C45}, + {0x11C50, 0x11C6C}, {0x11C70, 0x11C8F}, {0x11C92, 0x11CA7}, + {0x11CA9, 0x11CB6}, {0x11D00, 0x11D06}, {0x11D08, 0x11D09}, + {0x11D0B, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, + {0x11D3F, 0x11D47}, {0x11D50, 0x11D59}, {0x11D60, 0x11D65}, + {0x11D67, 0x11D68}, {0x11D6A, 0x11D8E}, {0x11D90, 0x11D91}, + {0x11D93, 0x11D98}, {0x11DA0, 0x11DA9}, {0x11EE0, 0x11EF8}, + {0x11FB0, 0x11FB0}, {0x11FC0, 0x11FF1}, {0x11FFF, 0x12399}, + {0x12400, 0x1246E}, {0x12470, 0x12474}, {0x12480, 0x12543}, + {0x12F90, 0x12FF2}, {0x13000, 0x1342E}, {0x14400, 0x14646}, + {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16A60, 0x16A69}, + {0x16A6E, 0x16ABE}, {0x16AC0, 0x16AC9}, {0x16AD0, 0x16AED}, + {0x16AF0, 0x16AF5}, {0x16B00, 0x16B45}, {0x16B50, 0x16B59}, + {0x16B5B, 0x16B61}, {0x16B63, 0x16B77}, {0x16B7D, 0x16B8F}, + {0x16E40, 0x16E9A}, {0x16F00, 0x16F4A}, {0x16F4F, 0x16F87}, + {0x16F8F, 0x16F9F}, {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1}, + {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, + {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, + {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, + {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1BC9C, 0x1BC9F}, + {0x1CF00, 0x1CF2D}, {0x1CF30, 0x1CF46}, {0x1CF50, 0x1CFC3}, + {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D172}, + {0x1D17B, 0x1D1EA}, {0x1D200, 0x1D245}, {0x1D2E0, 0x1D2F3}, + {0x1D300, 0x1D356}, {0x1D360, 0x1D378}, {0x1D400, 0x1D454}, + {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, + {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, + {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, + {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, {0x1D516, 0x1D51C}, + {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, + {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, + {0x1D6A8, 0x1D7CB}, {0x1D7CE, 0x1DA8B}, {0x1DA9B, 0x1DA9F}, + {0x1DAA1, 0x1DAAF}, {0x1DF00, 0x1DF1E}, {0x1E000, 0x1E006}, + {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 
0x1E024}, + {0x1E026, 0x1E02A}, {0x1E100, 0x1E12C}, {0x1E130, 0x1E13D}, + {0x1E140, 0x1E149}, {0x1E14E, 0x1E14F}, {0x1E290, 0x1E2AE}, + {0x1E2C0, 0x1E2F9}, {0x1E2FF, 0x1E2FF}, {0x1E7E0, 0x1E7E6}, + {0x1E7E8, 0x1E7EB}, {0x1E7ED, 0x1E7EE}, {0x1E7F0, 0x1E7FE}, + {0x1E800, 0x1E8C4}, {0x1E8C7, 0x1E8D6}, {0x1E900, 0x1E94B}, + {0x1E950, 0x1E959}, {0x1E95E, 0x1E95F}, {0x1EC71, 0x1ECB4}, + {0x1ED01, 0x1ED3D}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, + {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, + {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F100, 0x1F1AD}, + {0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, + {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, + {0x1F6DD, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, + {0x1F780, 0x1F7D8}, {0x1F7E0, 0x1F7EB}, {0x1F7F0, 0x1F7F0}, + {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, + {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, + {0x1F900, 0x1FA53}, {0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, + {0x1FA78, 0x1FA7C}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAAC}, + {0x1FAB0, 0x1FABA}, {0x1FAC0, 0x1FAC5}, {0x1FAD0, 0x1FAD9}, + {0x1FAE0, 0x1FAE7}, {0x1FAF0, 0x1FAF6}, {0x1FB00, 0x1FB92}, + {0x1FB94, 0x1FBCA}, {0x1FBF0, 0x1FBF9}, {0x20000, 0x2A6DF}, + {0x2A700, 0x2B738}, {0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, + {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, + {0xE0100, 0xE01EF}}; + + static const UnicodeCharSet Printables(PrintableRanges); + // Clang special cases 0x00AD (SOFT HYPHEN) which is rendered as an actual + // hyphen in most terminals. + return UCS == 0x00AD || Printables.contains(UCS); +} + +/// Unicode code points of the Cf category are considered +/// formatting characters.
+bool isFormatting(int UCS) { + + // https://unicode.org/Public/14.0.0/ucdxml/ + static const UnicodeCharRange Cf[] = { + {0x00AD, 0x00AD}, {0x0600, 0x0605}, {0x061C, 0x061C}, + {0x06DD, 0x06DD}, {0x070F, 0x070F}, {0x0890, 0x0891}, + {0x08E2, 0x08E2}, {0x180E, 0x180E}, {0x200B, 0x200F}, + {0x202A, 0x202E}, {0x2060, 0x2064}, {0x2066, 0x206F}, + {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB}, {0x110BD, 0x110BD}, + {0x110CD, 0x110CD}, {0x13430, 0x13438}, {0x1BCA0, 0x1BCA3}, + {0x1D173, 0x1D17A}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F}}; - return UCS >= 0 && UCS <= 0x10FFFF && !NonPrintables.contains(UCS); + static const UnicodeCharSet Format(Cf); + return Format.contains(UCS); } /// Gets the number of positions a character is likely to occupy when output diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp new file mode 100644 index 000000000000..1e8aebf1b8eb --- /dev/null +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -0,0 +1,551 @@ +//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties +//-*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements functions to map the name or alias of a unicode +// character to its codepoint. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Unicode.h" + +namespace llvm { +namespace sys { +namespace unicode { + +extern const char *UnicodeNameToCodepointDict; +extern const uint8_t *UnicodeNameToCodepointIndex; +extern const std::size_t UnicodeNameToCodepointIndexSize; +extern const std::size_t UnicodeNameToCodepointLargestNameSize; + +using BufferType = SmallString<64>; + +struct Node { + bool IsRoot = false; + char32_t Value = 0xFFFFFFFF; + uint32_t ChildrenOffset = 0; + bool HasSibling = false; + uint32_t Size = 0; + StringRef Name; + const Node *Parent = nullptr; + + constexpr bool isValid() const { + return !Name.empty() || Value == 0xFFFFFFFF; + } + constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } + + std::string fullName() const { + std::string S; + // Reserve enough space for most unicode code points. + // The chosen value represents the 99th percentile of name size as of + // Unicode 14.
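+ // The trie stores each name split into fragments along the path from the + // root, so walk towards the root appending every fragment reversed, then + // reverse the whole buffer once to obtain the name in reading order.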
+ S.reserve(46); + const Node *N = this; + while (N) { + std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); + N = N->Parent; + } + std::reverse(S.begin(), S.end()); + return S; + } +}; + +static Node createRoot() { + Node N; + N.IsRoot = true; + N.ChildrenOffset = 1; + N.Size = 1; + return N; +} + +static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { + if (Offset == 0) + return createRoot(); + + uint32_t Origin = Offset; + Node N; + N.Parent = Parent; + uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; + if (Offset + 6 >= UnicodeNameToCodepointIndexSize) + return N; + + bool LongName = NameInfo & 0x40; + bool HasValue = NameInfo & 0x80; + std::size_t Size = NameInfo & ~0xC0; + if (LongName) { + uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); + NameOffset |= UnicodeNameToCodepointIndex[Offset++]; + N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); + } else { + N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); + } + if (HasValue) { + uint8_t H = UnicodeNameToCodepointIndex[Offset++]; + uint8_t M = UnicodeNameToCodepointIndex[Offset++]; + uint8_t L = UnicodeNameToCodepointIndex[Offset++]; + N.Value = ((H << 16) | (M << 8) | L) >> 3; + + bool HasChildren = L & 0x02; + N.HasSibling = L & 0x01; + + if (HasChildren) { + N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; + } + } else { + uint8_t H = UnicodeNameToCodepointIndex[Offset++]; + N.HasSibling = H & 0x80; + bool HasChildren = H & 0x40; + H &= ~0xC0; + if (HasChildren) { + N.ChildrenOffset = (H << 16); + N.ChildrenOffset |= + (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; + } + } + N.Size = Offset - Origin; + return N; +} + +static bool startsWith(StringRef Name, StringRef Needle, bool Strict, + std::size_t &Consummed, char &PreviousCharInName, + char &PreviousCharInNeedle, bool IsPrefix = false) { + + Consummed = 0; + if (Strict) { + if (!Name.startswith(Needle)) + return false; + Consummed = Needle.size(); + return true; + } + if (Needle.empty()) + return true; + + auto NamePos = Name.begin(); + auto NeedlePos = Needle.begin(); + + char PreviousCharInNameOrigin = PreviousCharInName; + char PreviousCharInNeedleOrigin = PreviousCharInNeedle; + + auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, + bool IgnoreEnd = false) { + while (It != End) { + const auto Next = std::next(It); + // Ignore spaces, underscore, medial hyphens + // https://unicode.org/reports/tr44/#UAX44-LM2. 
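+ // For example "ZERO WIDTH SPACE", "zero-width space" and + // "ZERO_WIDTH_SPACE" compare equal under loose matching; a hyphen is only + // skipped when it is medial, i.e. surrounded by alphanumeric characters.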
+ bool Ignore = + *It == ' ' || *It == '_' || + (*It == '-' && isAlnum(PreviousChar) && + ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); + PreviousChar = *It; + if (!Ignore) + break; + ++It; + } + return It; + }; + + while (true) { + NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); + NeedlePos = + IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); + if (NeedlePos == Needle.end()) + break; + if (NamePos == Name.end()) + break; + if (toUpper(*NeedlePos) != toUpper(*NamePos)) + break; + NeedlePos++; + NamePos++; + } + Consummed = std::distance(Name.begin(), NamePos); + if (NeedlePos != Needle.end()) { + PreviousCharInName = PreviousCharInNameOrigin; + PreviousCharInNeedle = PreviousCharInNeedleOrigin; + } + return NeedlePos == Needle.end(); +} + +static std::tuple<Node, bool, uint32_t> +compareNode(uint32_t Offset, StringRef Name, bool Strict, + char PreviousCharInName, char PreviousCharInNeedle, + BufferType &Buffer, const Node *Parent = nullptr) { + Node N = readNode(Offset, Parent); + std::size_t Consummed = 0; + bool DoesStartWith = + N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, + PreviousCharInName, PreviousCharInNeedle); + if (!DoesStartWith) + return std::make_tuple(N, false, 0); + + if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) + return std::make_tuple(N, true, N.Value); + + if (N.hasChildren()) { + uint32_t ChildOffset = N.ChildrenOffset; + for (;;) { + Node C; + bool Matches; + uint32_t Value; + std::tie(C, Matches, Value) = + compareNode(ChildOffset, Name.substr(Consummed), Strict, + PreviousCharInName, PreviousCharInNeedle, Buffer, &N); + if (Matches) { + std::reverse_copy(C.Name.begin(), C.Name.end(), + std::back_inserter(Buffer)); + return std::make_tuple(N, true, Value); + } + ChildOffset += C.Size; + if (!C.HasSibling) + break; + } + } + return std::make_tuple(N, false, 0); +} + +static std::tuple<Node, bool, uint32_t> +compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { + return compareNode(Offset, Name, Strict, 0, 0, Buffer); +} + +// clang-format off +constexpr const char *const HangulSyllables[][3] = { + { "G", "A", "" }, + { "GG", "AE", "G" }, + { "N", "YA", "GG" }, + { "D", "YAE", "GS" }, + { "DD", "EO", "N", }, + { "R", "E", "NJ" }, + { "M", "YEO", "NH" }, + { "B", "YE", "D" }, + { "BB", "O", "L" }, + { "S", "WA", "LG" }, + { "SS", "WAE", "LM" }, + { "", "OE", "LB" }, + { "J", "YO", "LS" }, + { "JJ", "U", "LT" }, + { "C", "WEO", "LP" }, + { "K", "WE", "LH" }, + { "T", "WI", "M" }, + { "P", "YU", "B" }, + { "H", "EU", "BS" }, + { 0, "YI", "S" }, + { 0, "I", "SS" }, + { 0, 0, "NG" }, + { 0, 0, "J" }, + { 0, 0, "C" }, + { 0, 0, "K" }, + { 0, 0, "T" }, + { 0, 0, "P" }, + { 0, 0, "H" } + }; +// clang-format on + +// Unicode 14.0 +// 3.12 Conjoining Jamo Behavior Common constants +constexpr const char32_t SBase = 0xAC00; +constexpr const uint32_t LCount = 19; +constexpr const uint32_t VCount = 21; +constexpr const uint32_t TCount = 28; + +static std::size_t findSyllable(StringRef Name, bool Strict, + char &PreviousInName, int &Pos, int Column) { + assert(Column == 0 || Column == 1 || Column == 2); + static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; + char NeedleStart = 0; + int Len = -1; + int Prev = PreviousInName; + for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { + StringRef Syllable(HangulSyllables[I][Column]); + if (int(Syllable.size()) <= Len) + continue; + std::size_t Consummed = 0; + char PreviousInNameCopy = PreviousInName; + bool DoesStartWith = startsWith(Name, Syllable, Strict,
Consummed, + PreviousInNameCopy, NeedleStart); + if (!DoesStartWith) + continue; + Len = Consummed; + Pos = I; + Prev = PreviousInNameCopy; + } + if (Len == -1) + return 0; + PreviousInName = Prev; + return size_t(Len); +} + +static llvm::Optional<char32_t> +nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { + Buffer.clear(); + // Hangul Syllable Decomposition + std::size_t Consummed = 0; + char NameStart = 0, NeedleStart = 0; + bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, + NameStart, NeedleStart); + if (!DoesStartWith) + return None; + Name = Name.substr(Consummed); + int L = -1, V = -1, T = -1; + Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); + Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); + Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); + if (L != -1 && V != -1 && T != -1 && Name.empty()) { + if (!Strict) { + Buffer.append("HANGUL SYLLABLE "); + if (L != -1) + Buffer.append(HangulSyllables[L][0]); + if (V != -1) + Buffer.append(HangulSyllables[V][1]); + if (T != -1) + Buffer.append(HangulSyllables[T][2]); + } + return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + + std::uint32_t(T); + } + // Otherwise, it's an illegal syllable name. + return None; +} + +struct GeneratedNamesData { + StringRef Prefix; + uint32_t Start; + uint32_t End; +}; + +// Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings +// This needs to be kept in sync with +// llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp +static const GeneratedNamesData GeneratedNamesDataTable[] = { + {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, + {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC}, + {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD}, + {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734}, + {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, + {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, + {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, + {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, + {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, + {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, + {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, + {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, +}; + +static llvm::Optional<char32_t> +nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { + for (auto &&Item : GeneratedNamesDataTable) { + Buffer.clear(); + std::size_t Consummed = 0; + char NameStart = 0, NeedleStart = 0; + bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, + NameStart, NeedleStart, /*isPrefix*/ true); + if (!DoesStartWith) + continue; + auto Number = Name.substr(Consummed); + unsigned long long V = 0; + // Be consistent about mandating upper casing.
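+ // For example, strict matching rejects "CJK UNIFIED IDEOGRAPH-4e00" but + // accepts "CJK UNIFIED IDEOGRAPH-4E00" (U+4E00); loose matching accepts + // both spellings.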
+ if (Strict && + llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) + return {}; + if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) + continue; + if (!Strict) { + Buffer.append(Item.Prefix); + Buffer.append(utohexstr(V, true)); + } + return V; + } + return None; +} + +static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, + BufferType &Buffer) { + if (Name.empty()) + return None; + + llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); + if (!Res) + Res = nameToGeneratedCodePoint(Name, Strict, Buffer); + if (Res) + return *Res; + + Buffer.clear(); + Node Node; + bool Matches; + uint32_t Value; + std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); + if (Matches) { + std::reverse(Buffer.begin(), Buffer.end()); + // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial + // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. + if (!Strict && Value == 0x116c && + Name.find_insensitive("O-E") != StringRef::npos) { + Buffer = "HANGUL JUNGSEONG O-E"; + Value = 0x1180; + } + return Value; + } + return None; +} + +llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) { + + BufferType Buffer; + auto Opt = nameToCodepoint(Name, true, Buffer); + return Opt; +} + +llvm::Optional<LooseMatchingResult> +nameToCodepointLooseMatching(StringRef Name) { + BufferType Buffer; + auto Opt = nameToCodepoint(Name, false, Buffer); + if (!Opt) + return None; + return LooseMatchingResult{*Opt, Buffer}; +} + +// Find the unicode character whose edit distance to Pattern +// is shortest, using the Wagner–Fischer algorithm. +llvm::SmallVector<MatchForCodepointName> +nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { + // We maintain a fixed-size vector of matches, + // sorted by distance. + // The worst match (with the biggest distance) is discarded when new elements + // are added. + std::size_t LargestEditDistance = 0; + llvm::SmallVector<MatchForCodepointName> Matches; + Matches.reserve(MaxMatchesCount + 1); + + auto Insert = [&](const Node &Node, uint32_t Distance, + char32_t Value) -> bool { + if (Distance > LargestEditDistance) { + if (Matches.size() == MaxMatchesCount) + return false; + LargestEditDistance = Distance; + } + // To avoid allocations, the creation of the name is delayed + // as much as possible. + std::string Name; + auto GetName = [&] { + if (Name.empty()) + Name = Node.fullName(); + return Name; + }; + + auto It = std::lower_bound( + Matches.begin(), Matches.end(), Distance, + [&](const MatchForCodepointName &a, std::size_t Distance) { + if (Distance == a.Distance) + return a.Name < GetName(); + return a.Distance < Distance; + }); + if (It == Matches.end() && Matches.size() == MaxMatchesCount) + return false; + + MatchForCodepointName M{GetName(), Distance, Value}; + Matches.insert(It, std::move(M)); + if (Matches.size() > MaxMatchesCount) + Matches.pop_back(); + return true; + }; + + // We ignore case, space, hyphens, etc., + // in both the search pattern and the prospective names. + auto Normalize = [](StringRef Name) { + std::string Out; + Out.reserve(Name.size()); + for (char C : Name) { + if (isAlnum(C)) + Out.push_back(toUpper(C)); + } + return Out; + }; + std::string NormalizedName = Normalize(Pattern); + + // Allocate a matrix big enough for longest names.
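+ // Wagner-Fischer matrix: columns are indexed by the normalized pattern + // (plus one for the empty prefix), rows by the characters of the candidate + // name gathered along the current path in the trie.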
+ const std::size_t Columns = + std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + + 1; + + LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = + UnicodeNameToCodepointLargestNameSize + 1; + + std::vector<char> Distances( + Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); + + auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { + assert(Column < Columns); + assert(Row < Rows); + return Distances[Row * Columns + Column]; + }; + + for (std::size_t I = 0; I < Columns; I++) + Get(I, 0) = I; + + // Visit the children, + // filling (and overriding) the matrix for the name fragment of each node + // iteratively. CompleteName is used to collect the actual name of a potential + // match, respecting case and spacing. + auto VisitNode = [&](const Node &N, std::size_t Row, + auto &VisitNode) -> void { + std::size_t J = 0; + for (; J < N.Name.size(); J++) { + if (!isAlnum(N.Name[J])) + continue; + + Get(0, Row) = Row; + + for (std::size_t I = 1; I < Columns; I++) { + const int Delete = Get(I - 1, Row) + 1; + const int Insert = Get(I, Row - 1) + 1; + + const int Replace = + Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); + + Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); + } + + Row++; + } + + unsigned Cost = Get(Columns - 1, Row - 1); + if (N.Value != 0xFFFFFFFF) { + Insert(N, Cost, N.Value); + } + + if (N.hasChildren()) { + auto ChildOffset = N.ChildrenOffset; + for (;;) { + Node C = readNode(ChildOffset, &N); + ChildOffset += C.Size; + if (!C.isValid()) + break; + VisitNode(C, Row, VisitNode); + if (!C.HasSibling) + break; + } + } + }; + + Node Root = createRoot(); + VisitNode(Root, 1, VisitNode); + return Matches; +} + +} // namespace unicode + +} // namespace sys +} // namespace llvm diff --git a/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp b/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp new file mode 100644 index 000000000000..86e8378eceb1 --- /dev/null +++ b/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp @@ -0,0 +1,20911 @@ + +//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements mapping the name of a unicode code point to its value. +// +// This file was generated using ./bin/UnicodeNameMappingGenerator. +// Do not edit manually. +// +//===----------------------------------------------------------------------===// + +/* +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use <https://www.unicode.org/copyright.html> +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. +*/ + +#include "llvm/Support/Compiler.h" +#include <cstddef> +#include <cstdint> +namespace llvm { +namespace sys { +namespace unicode { +extern const char *UnicodeNameToCodepointDict; +extern const uint8_t *UnicodeNameToCodepointIndex; +extern const std::size_t UnicodeNameToCodepointIndexSize; +extern const std::size_t UnicodeNameToCodepointLargestNameSize; +const char *UnicodeNameToCodepointDict = + " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789OWER RIGHT CURLY BRACKET SECTIONM " + "LEFT MEMBER OF DOUBLE VERTICALPER BODY TILTING FROM HIP JOINTSFACE WITH " + "SYMBOLS COVERING MOUTHVED STEM PARAGRAPH SIGN ORNAMENTVE LESS-THAN ABOVE " + "SLANTED EQUAL KORANIC STOP SIGN ISOLATED FORMROFLEX CLICK WITH RETROFLEX " + "HOOKSWIRL BIRGA WITH DOUBLE ORNAMENTOWNWARDS HARPOON WITH BARB RIGHT " + "HORIZONTAL STROKES TO THE RIGHT LEFTWARDS TRIANGLE-HEADED " + "ARROWFT-POINTING ANGLE QUOTATION MARK LOWER HALF INVERSE MEDIUM " + "SHADERONT-TILTED SHADOWED WHITE ARROWDIFIER LETTER LABIALIZATION MARKDIC " + "KASHMIRI INDEPENDENT SVARITAMARK WITH LEFT RIGHT ARROW ABOVEOUBLE-LINE " + "EQUAL ABOVE LESS-THANL ARABIC LETTER TAH AND TWO DOTSLL BUT UPPER LEFT " + "QUADRANT BLACKRIGHT SEMICIRCLE WITH THREE DOTSLAR SIGN WITH OVERLAID " + "BACKSLASH CONTAINING SMALL WHITE TRIANGLEEN ARM ENDING IN ARROW POINTING " + "LAGAB TIMES U OVER LAGAB TIMES ULOWER LEFT CURLY BRACKET " + "SECTIONRIGHTWARDS TRIANGLE-HEADED ARROWTRIANGLE-HEADED RIGHTWARDS ARROW " + "DOWNWARDS EQUILATERAL ARROWHEAD DOWNWARDS TRIANGLE-HEADED ARROWER ARROWS " + "CIRCLING ANTICLOCKWISEER IGI SHIR OVER SHIR UD OVER UDER TAB NI OVER NI " + "DISH OVER DISHESS-THAN ABOVE DOUBLE-LINE EQUALETALLED BLACK AND WHITE " + "FLORETTEATHARVAVEDIC INDEPENDENT SVARITAAND MIDDLE RIGHT TO LOWER " + "CENTREWO DOTS ABOVE AND TWO DOTS BELOWWO DOTS OVER ONE DOT PUNCTUATIONS " + "VERTICALLY BELOW AND SMALL TAHTIMES ASH2 KU OVER HI TIMES ASH2AND " + "LEFTWARDS OPEN CIRCLE ARROWSTICAL BAR DOUBLE RIGHT TURNSTILENORMAL FACTOR 
+ "SEMIDIRECT PRODUCTD ARROW WITH TRIANGLE ARROWHEADSSEMICIRCULAR " + "ANTICLOCKWISE ARROWINTING DOWNWARDS THEN NORTH EASTHT-POINTING ANGLE " + "QUOTATION MARKHUR KAZAKH KIRGHIZ ALEF MAKSURA THIRD WHITE RIGHT POINTING " + "INDEX SHADOWED WHITE RIGHTWARDS ARROWIDE AND JOINED WITH " + "INTERSECTIONUPPER AND LOWER ONE EIGHTH BLOCKIGHTWARDS HARPOON WITH BARB " + "DOWNTER-THAN ABOVE DOUBLE-LINE EQUALH SUPERSCRIPT ALEF ISOLATED " + "FORMROXIMATELY NOR ACTUALLY EQUAL TOAISING BOTH HANDS IN CELEBRATIONIRECT " + "PRODUCT WITH BOTTOM CLOSEDTOP HALF DIVIDED BY VERTICAL BARGREATER-THAN " + "ABOVE SLANTED EQUALTOM-LIGHTED RIGHTWARDS ARROWHEADH HAMZA ABOVE WITH " + "ALEF MAKSURA H HORIZONTAL MIDDLE BLACK STRIPERONG CENTRALIZATION STROKE " + "BELOW TRIANGULAR THREE QUARTERS BLOCK TORTOISE SHELL BRACKET " + "ORNAMENTWNWARDS ARROW WITH TIP LEFTWARDSDED HIGH STOP WITH FILLED " + "CENTRETION SIGN WITH CIRCUMFLEX ACCENTS AND UPWARDS OPEN CIRCLE " + "ARROWSHAND WITH MIDDLE FINGER EXTENDEDOF UPWARDS TRIANGLE-HEADED " + "ARROWLEFTWARDS HARPOON WITH BARB DOWNED ARABIC-INDIC DIGIT FOUR " + "BELOWEDIUM SHADE AND RIGHT HALF BLOCKLE-LINE EQUAL ABOVE GREATER-THANARDS " + "ARROW ABOVE LEFTWARDS ARROW BAR AT END OF HORIZONTAL STROKEEDIUM SHADE " + "AND LOWER HALF BLOCKE TO MIDDLE LEFT TO LOWER CENTREED ARABIC-INDIC DIGIT " + "FOUR ABOVEED COMMA QUOTATION MARK ORNAMENTE-POINTED BLACK RIGHTWARDS " + "ARROWE CONTAINING BLACK SMALL LOZENGEARDROP-SPOKED PROPELLER ASTERISKE " + "SQUARED LATIN CAPITAL LETTER PLE COMMA QUOTATION MARK ORNAMENTUG2 OVER " + "TUG2 TUG2 OVER TUG2 PAPARDS HARPOON WITH BARB DOWNWARDS-POINTING ANGLE " + "BRACKET ORNAMENTRIANGLE-HEADED OPEN CIRCLE ARROW BETWEEN MIDDLE AND RING " + "FINGERSED UPWARDS EQUILATERAL ARROWHEAD-SHADOWED WHITE RIGHTWARDS " + "ARROWAISED HAND WITH FINGERS SPLAYEDETALLED OUTLINED BLACK " + "FLORETTEACK-TILTED SHADOWED WHITE ARROWTNAMESE ALTERNATE READING MARK " + "RINGS OVER ONE RING PUNCTUATIONRIGHTWARDS HARPOON WITH BARB UPAND MIDDLE " + "LEFT TO LOWER CENTREONE HUNDRED THIRTY-FIVE DEGREES CROSSING ASH OVER ASH " + "OVER ASHUPWARDS HARPOON WITH BARB RIGHTRING OVER TWO RINGS " + "PUNCTUATIONLEFTWARDS EQUILATERAL ARROWHEADIN WHITE CIRCLE IN BLACK " + "SQUAREMAKSURA WITH SUPERSCRIPT ALEF -HIRAGANA PROLONGED SOUND MARKSAD " + "WITH LAM WITH ALEF MAKSURADOWNWARDS AND RIGHTWARDS ARROWEFT SEMICIRCLE " + "WITH THREE DOTSGHT FOUR POINTED PINWHEEL STARDOT BELOW AND THREE DOTS " + "ABOVEAND JOINED BY DASH WITH SUBSETGREATER-THAN ABOVE EQUALS SIGNINDEX " + "THUMB CURVE THUMB INSIDEDIVIDED BY HORIZONTAL BAR AND EART EXCLAMATION " + "MARK ORNAMENTHT CENTRALIZATION STROKE BELOWON WITH RIGHTWARDS ARROW " + "ABOVEMODIFIER LETTER LEFT HALF RINGOPEN CENTRE EIGHT POINTED STARQAF WITH " + "LAM WITH ALEF MAKSURAHIGH-REVERSED-9 QUOTATION MARKMINTON RACQUET AND " + "SHUTTLECOCKAGGRAVATED INDEPENDENT SVARITAEXTENDED ARABIC-INDIC DIGIT " + "TEVERSED LUNATE EPSILON SYMBOLWITH RIGHTWARDS ARROW AT LEFTONAL INDICATOR " + "SYMBOL LETTER OVER RIGHTWARDS ARROW TO BARSUPERSCRIPT ALEF INITIAL " + "FORMNS-SERIF INTERROBANG ORNAMENTEFTWARDS HARPOON WITH BARB " + "UPSEMICIRCULAR PATH AROUND POLEDOWN MIDDLE THUMB INDEX CROSSDOWN HEAVY " + "AND RIGHT UP LIGHTCKED FACE WITH EXPLODING HEAD WITH REVERSED NEGATION " + "SLASHLIGHT FOUR POINTED BLACK CUSP DOWN INDEX THUMB HOOK MIDDLEDOT OVER " + "TWO DOTS PUNCTUATIONPUNCTUATION CHINOOK FULL STOPUP HEAVY AND RIGHT DOWN " + "LIGHTCONTAINING BLACK SMALL CIRCLEACE DIRECTION POSITION NOSE FTING POINT " + "RIGHTWARDS ARROWT LITTER IN ITS PLACE SYMBOLOUND-TIPPED RIGHTWARDS " + 
"ARROWISMILLAH AR-RAHMAN AR-RAHEEMDOWN HEAVY AND LEFT UP LIGHTUPWARDS AND " + "RIGHTWARDS ARROWRECTANGULAR PATH AROUND POLEEFT ARC GREATER-THAN " + "BRACKETMONOGRAMMOS TESSERA DODEKATASALTIRE WITH ROUNDED CORNERSBESIDE AND " + "JOINED WITH UNIONMIDDLE RING LITTLE CONJOINEDASTERISKS ALIGNED " + "VERTICALLYUP HEAVY AND LEFT DOWN LIGHTUPPER CENTRE TO MIDDLE RIGHTHREE " + "HUNDRED FIFTEEN DEGREESLEFTWARDS OF DOWNWARDS ARROWDOUBLE ANUSVARA " + "ANTARGOMUKHAHADED WHITE RIGHTWARDS ARROWU ALAYHI WAAALIHEE WA-SALLAMIBE " + "SYLLABLE BOUNDARY MARKEREDGE-TAILED RIGHTWARDS ARROWLIQUID MEASURE FIRST " + "SUBUNIT-FEATHERED RIGHTWARDS ARROWRIANGULAR ONE QUARTER BLOCKIMPERFECTUM " + "CUM PROLATIONE OUR BALLOON-SPOKED ASTERISKEAVY WHITE RIGHTWARDS ARROWIDE " + "ARC ANTICLOCKWISE ARROWIDE-HEADED RIGHTWARDS ARROWCIRCLE WITH NORTHWEST " + "ARROWBETWEEN TWO HORIZONTAL BARSHEAD MARK WITH MOON AND SUNZERO FOR ODD " + "POWERS OF FOURWO DOTS BELOW AND DOT ABOVEHANDED INTERLACED " + "PENTAGRAMLESS-THAN ABOVE EQUALS SIGNBRDA RNYING YIG MGO MDUN MABRDA " + "RNYING YIG MGO SGAB MARIGHT ARC LESS-THAN BRACKETUPPER MIDDLE LEFT TO " + "UPPER CONTINUOUS UNDERLINE SYMBOL AND LEFT SEMICIRCLE ARROWSTALIC LATIN " + "CAPITAL LETTER ONE LARGE AND ONE SMALL EYEENTATION FORM FOR VERTICAL " + "LARGE EQUILATERAL ARROWHEADEMICIRCULAR CLOCKWISE ARROWFINGER COVERING " + "CLOSED LIPSSTRUMENTAL NOTATION SYMBOL-PHARYNGEAL VOICED FRICATIVE BARREE " + "WITH TWO DOTS BELOWKATHAKA INDEPENDENT SVARITATWO HUNDRED SEVENTY " + "DEGREESDOUBLE PRIME QUOTATION MARKDOUBLE ANGLE QUOTATION MARKRIPLE " + "VERTICAL BAR OPERATOR DIVIDED BY HORIZONTAL RULEPPY PERSON RAISING ONE " + "HANDWALLPLANE SHOULDER HIP MOVELOWER MIDDLE LEFT TO LOWER FOUR FINGERS " + "CONJOINED BENTLOWER TONAL RANGE INDICATORLIGHT CENTRALIZATION " + "STROKEYAJURVEDIC MIDLINE SVARITAINDUSTRIAL STANDARD SYMBOLMEEM WITH HAH " + "WITH TATWEELDOTTED SUBSTITUTION MARKERCRIPT LIGATURE ET ORNAMENTSSIAN " + "ASTROLOGICAL SYMBOL ONOMICAL SYMBOL FOR URANUSOORPLANE SHOULDER HIP " + "MOVEHTORA SKLIRON CHROMA VASIS OR APPROXIMATELY EQUAL TOLANTED SOUTH " + "ARROW WITH HORIGHT PARENTHESIS ORNAMENTDOTTED LUNATE SIGMA " + "SYMBOLDROP-SHADOWED WHITE SQUAREMODIFIER FITZPATRICK TYPE-AND MIDDLE " + "FINGERS CROSSEDE ONE-WAY LEFT WAY TRAFFIC GAD OVER GAD GAR OVER GARLINE " + "FEED SEPARATOR SYMBOLRIPLE DOT PUNCTUATION MARKLEFTWARDS OF UPWARDS " + "ARROWTHREE DOTS ABOVE DOWNWARDSU REVERSED OVER U REVERSEDBLE TENNIS " + "PADDLE AND BALLERSTRASS ELLIPTIC FUNCTIONOCKED FEMALE AND MALE SIGN " + "WITHIN TRIANGLE ARROWHEADUNEVEN EYES AND WAVY MOUTH LESS THAN THE " + "DENOMINATORAND RIGHT ONE EIGHTH BLOCK NEGATED WITH VERTICAL BARJECT " + "REPLACEMENT CHARACTERMARRIED PARTNERSHIP SYMBOLIDEOGRAPHIC ITERATION " + "MARKOTATED FLORAL HEART BULLETALEF MAKSURA ISOLATED FORMORTHOGONAL " + "CROSSHATCH FILLWITH LEFTWARDS ARROW ABOVECLOCKWISE ARROW WITH " + "MINUSLLALLAHOU ALAYHE WASSALLAMCAT FACE WITH SMILING EYESOUTLINED " + "RIGHTWARDS ARROWINVERTED EXCLAMATION MARKBREVE WITH INVERTED " + "BREVEFECTIVENESS OR DISTORTIONOLD ASSYRIAN WORD DIVIDERMBINING " + "CRYPTOGRAMMIC DOTLEFT PARENTHESIS ORNAMENTREE-HUNDRED-AND-TWENTIETHSTROKE " + "AND TWO DOTS ABOVETERNION INTEGRAL OPERATORRIGHT DIAGONAL HALF BLACKRIPLE " + "BIRGA WITH ORNAMENTDOUBLE CANDRABINDU VIRAMAOUBLE BIRGA WITH ORNAMENT " + "WITH DOUBLE MIDDLE TILDERANCH BANK IDENTIFICATIONELD HOCKEY STICK AND " + "BALL WITH DOUBLE GRAVE ACCENTMULTIPLICATION SIGN BELOWNIVERSAL RECYCLING " + "SYMBOLLEFTWARDS ARROW WITH HOOKONE UNDER EIGHTEEN SYMBOLLOW QUILT SQUARE " + 
"ORNAMENTFFICULTY AT THE BEGINNINGBUT NOT ACTUALLY EQUAL TOTTED " + "SUBSTITUTION BRACKETTAB OVER TAB GAR OVER GARMEDIUM TRIANGLE ARROWHEAD " + "OVER NUN LAGAR TIMES SALRIST CIRCLE HITTING WALL WITH DOUBLE VERTICAL " + "BARCROSSING NORTH EAST ARROW WITH CIRCLED ONE OVERLAYCAT FACE WITH CLOSED " + "EYESDIAERESIS AND HOOK SYMBOLDRY MEASURE FIRST SUBUNITING ON THE FLOOR " + "LAUGHINGAND MALE AND FEMALE SIGNVOICED LARYNGEAL SPIRANTTEARDROP-SPOKED " + "ASTERISKTED INTERPOLATION MARKERUPRIGHT RECTANGULAR ZERORIGHTWARDS THEN " + "CURVING BLACK LENTICULAR BRACKETIGATURE OPEN ET ORNAMENTARROW POINTING " + "DIRECTLY BLIC ADDRESS LOUDSPEAKERCULINE ORDINAL INDICATORING FACE WITH " + "OPEN MOUTHMTAVRULI CAPITAL LETTER ARM CIRCLE HITTING WALL WELVE POINTED " + "BLACK STARLARGE TRIANGLE ARROWHEADLINE HORIZONTAL ELLIPSISORIZONTAL BAR " + "WITH NOTCHWITH UPWARDS ARROW ABOVEONE-HUNDRED-AND-SIXTIETHBUSINESS SUIT " + "LEVITATINGPERSCRIPT ALEF MOKHASSASCONSECUTIVE EQUALS SIGNSDESCENDING " + "MUSICAL NOTESGLOTTAL STOP WITH STROKEEYES AND HAND OVER MOUTHLICATION " + "PROGRAM COMMANDFINGER AND THUMB CROSSEDGREATER-THAN OR EQUAL TOISOSCELES " + "RIGHT TRIANGLEWITH CANCELLATION STROKEOTTOM SHADED WHITE " + "ARROWOTTOM-SHADED WHITE ARROWDIAGONAL CROSSHATCH FILLUPWARD POINTING " + "TRIANGLESINGLE-LINE NOT EQUAL TOSYLLABLE REPETITION MARKT BLACK " + "RIGHTWARDS ARROWMALL CIRCLE TO THE RIGHTSMALL ARABIC LETTER TAH DOUBLE " + "HORIZONTAL STROKE POINTING BACKHAND INDEXEQUAL TO OR " + "GREATER-THANINTERSECTION WITH SERIFSHEAVY BLACK HEART BULLETBERKANAN " + "BEORC BJARKAN BCOMPATIBILITY IDEOGRAPH-LEFT DIAGONAL HALF BLACKWO DOTS " + "VERTICALLY ABOVEDOWNSCALING FACTOR KIIZH OVER TOP SQUARE " + "BRACKETLY-RECYCLED PAPER SYMBOLE PLUS A PLUS SU PLUS NASTROKE THROUGH " + "DESCENDERPOINTING DOWNWARDS ABOVESHAPE WITH A DOT INSIDEIVE FINGERS " + "SPREAD OPENALGAMATION OR COPRODUCTCIRCUMFLEX ACCENT ABOVEININE ORDINAL " + "INDICATORLSCHREIBER PAUSE SYMBOLUPWARDS THEN NORTH WESTLEFT-SHADED WHITE " + "ARROWCLUSTER-INITIAL LETTER ALEF MAKSURA FINAL FORMMITIAN CONJUGATE " + "MATRIXISTED RIGHTWARDS ARROWSSING DIAGONAL CROSSING YELORUSSIAN-UKRAINIAN " + "ISOLIDUS BINARY RELATION WITH HALF-CIRCLE BELOWRIGHT HORIZONTAL SECANTUP " + "SPREAD THUMB FORWARDORIGINAL OF OR EQUAL TOPUNCTUATION END OF " + "TEXTVERTICAL BISECTING LINERIGHT DIAGONAL ELLIPSISORAH WITH NINE BRANCHES " + "POINTING AT THE VIEWERREE VARIATION SELECTOR WO-WAY LEFT WAY TRAFFICWHITE " + "FOUR POINTED CUSPHANKED RIGHTWARDS ARROWWESTERN PWO KAREN TONE-ESS " + "OUTLINED WHITE STARP WITH EXCLAMATION MARK HUNDRED TWENTY-EIGHTH BARBED " + "RIGHTWARDS ARROWRTOISE SHELL BRACKETED OMBINING ANUSVARA ABOVEATTACHING " + "VERTICAL OMETDOT BELOW AND DOT ABOVEAVOURING DELICIOUS FOODRAISED " + "OMISSION BRACKETPA OVER PA GAR OVER GARGREEK SMALL LETTER IOTAASCENDING " + "MUSICAL NOTESIDE ARC CLOCKWISE ARROWAND WOMAN HOLDING HANDSRIGHT-POINTING " + "TRIANGLEOVER RIGHTWARDS HARPOON CAKE WITH SWIRL DESIGNZANTINE MUSICAL " + "SYMBOL IGHT-SHADED WHITE ARROWHT TRIFOLIATE SNOWFLAKEOVERLAPPING LOGICAL " + "ANDHREE POINTED BLACK STARARTY HORN AND PARTY HATCURRENT SYMBOL FORM TWO " + "ROTATED NINETY DEGREESUBLE VERTICAL BAR BELOWDOWNWARDS THEN CURVING " + "ARABIC LETTER TAH ABOVEANG DEPARTING TONE MARK WITH DECORATIVE COVEROVER " + "NU11 BUR OVER BUROVER LEFTWARDS HARPOONUIGHUR KIRGHIZ YEH " + "WITSYMPTOTICALLY EQUAL TOOVER SHIR BUR OVER BURCONSONANT MODIFIER " + "BARDOMAIN ANTIRESTRICTIONND RECORDING COPYRIGHTTRIPLE VERTICAL " + "STROKEUPPER RIGHT AND LOWER DOUBLE SOLIDUS OVERLAYLATIN CAPITAL LETTER 
" + "SLONG HORIZONTAL STROKERIGHT-POINTING FLEURONQUESTION MARK ORNAMENT WITH " + "THREE DOTS ABOVEUBSCRIPT SMALL LETTER LOW PARAPHRASE BRACKET WITH SINGLE " + "ZAPYATAYAPUNCTUATION KUNDDALIYAUPPER ONE EIGHTH BLOCKARMENIAN ETERNITY " + "SIGNDOUBLE VERTICAL STROKEPRECEDED BY APOSTROPHEPOINTING UPWARDS " + "BELOWKEEPING STILL MOUNTAINTWO HORIZONTAL STROKESPERSET OF NOR EQUAL " + "TODOUBLE-LINED HEAD MARKMNYAM YIG GI MGO RGYANEAST-POINTING AIRPLANEIGEL " + "LONG-BRANCH-SOL SDOWNWARDS ZIGZAG ARROWACKSLANTED SOUTH ARROWRECTILINEAR " + "BLACK STARI YFESIS TETARTIMORIONREE-CIRCLE ALTERNATE IDOWN-POINTING " + "TRIANGLEHEXIFORM LONG ANUSVARANOT INCLUDING THE POLESHORT VERTICAL " + "STROKES SYMBOL FOR LIGHTHOUSEUSTOMER ACCOUNT NUMBERIN DEPARTING TONE " + "MARKDRESSED TO THE SUBJECTSHORT RIGHTWARDS ARROWLEFT TRIANGLE " + "OPERATORALEF WITH LAM WITH YEH RIGHT ARROWHEAD ABOVEING HEAD IN " + "SILHOUETTEHORT HORIZONTAL STROKEINDIRECT QUESTION MARKSEMI-VOICED SOUND " + "MARKCURLY BRACKET ORNAMENTCJK UNIFIED IDEOGRAPH-TRIPLE RIGHT " + "TURNSTILEYIAKENG PUACHUE HMONG WITH CIRCUMFLEX ABOVEWITH HORIZONTAL " + "STROKECONSONANT SIGN MEDIAL ROUND A POINT OPERATORWITH JEEM INITIAL " + "FORMWASALLAM ISOLATED FORM-ROTATED DIVISION SIGNRROW WITH ROUNDED " + "HEADGREATER-THAN DIAERESISWITH VOICED SOUND MARKLE BESIDE VERTICAL " + "BARINVERTED SMALL V BELOWINVERTED SMALL V ABOVE OVER STAMPED ENVELOPEBAR " + "ABOVE INTERSECTIONREASE FONT SIZE SYMBOLARD SHELL FLOPPY DISKDOWNWARDS " + "ARROW ABOVEACUTE AND HOOK SYMBOLEFT-POINTING TRIANGLE-SHAPED BAG " + "DELIMITEREFT OPEN BOX OPERATORDOWN HORIZONTAL LIGHTEFT HORIZONTAL " + "SECANTDOWN HORIZONTAL HEAVYYIG MGO TSHEG SHAD MA-ROUND NOTEHEAD DOWN " + "ABOVE SHORT DOWN TACKAKIA TELOUS ICHIMATOSINVERTED GLOTTAL STOPINVERTED " + "BRIDGE BELOWDELIMITER TSHEG BSTARHALF TRIANGULAR COLONHAND INTERIOR " + "PRODUCTWO-CIRCLE ALTERNATE IWO-CIRCLE NUKTA ABOVEINTERSECTION " + "OPERATORINTERSECTING LOGICAL TILDE OPERATOR ABOVE GRUENT WITH DOT " + "ABOVEHOCKEY STICK AND PUCKHORIZONTAL TABULATIONHOUSAND MILLIONS SIGNTHICK " + "LETTER SELECTORCTOR OR CROSS PRODUCTCRUCIFORM NUMBER FOURTEEN POINTED " + "ASTERISKCROSSE STICK AND BALLXTRA SHORT VOWEL MARKFINAL CONSONANT SIGN " + "EIGHT SPOKED ASTERISKELATIONAL COMPOSITIONVOICED ITERATION MARKDOUBLE " + "LEFT TURNSTILEEQUAL TO OR LESS-THANER RIGHT CORNER ANGLEALLING DIAGONAL " + "SLASHLATTENED OPEN A ABOVEFLATTENED PARENTHESISDIGRAMMOS EX " + "DODEKATATRIANGULAR HALF BLOCKWITH INVERTED V ABOVEGHT OPEN BOX " + "OPERATORTOUCHING INSIDE MOUTHGRAMMOS OKTO DODEKATAARKENING OF THE " + "LIGHTVERY HEAVY BARB ARROW WITH VERTICAL STROKE AND SLANTED PARALLELSH " + "AMPERSAND ORNAMENT WITH SHORT RIGHT LEGAND VOWEL LENGTH MARKPAP PLUS PAP " + "PLUS LU3RATING SYSTEM COMMANDVERTICAL LINE OVERLAYBOTTOM U-SHAPED ARROWND " + "TELEPHONE RECEIVERRISING DIAGONAL SLASHMORPHOLOGICAL DIVIDERSHORT " + "LEFTWARDS ARROWMIDDLE RING LITTLE ONSIDE TO SIDE SCISSORSMALE WITH STROKE " + "SIGNBUT NOT EQUIVALENT TOARYSTIAN FIVE HUNDREDQUADRANT CIRCULAR ARCRELICT " + "HOUSE BUILDINGREVERSED FEATHER MARKLETTER SMALL CAPITAL OP SHADED WHITE " + "ARROWOCAL NOTATION SYMBOL-OPPOSING AN PLUS NAGABESIDE RIGHT " + "TRIANGLENTISTRY SYMBOL LIGHT OHAMMAD ISOLATED FORMLESS-THAN OR EQUAL " + "TOWITH SOROCHYA NOZHKAHAR2 TIMES GAL PLUS RUMAI PALAUNG TONE-5HALF CIRCLE " + "WITH DOTPLUS GISH TIMES TAK4VAL WITH OVAL INSIDEINSIDE MOUTH RELAXEDINING " + "OBLIQUE STROKEDOUBLE ANGLE BRACKETCRESCENT MOON SYMBOLGRA GCAN -CHAR " + "RTAGSENARMONIOS ANTIFONIAA- SHOG GI MGO RGYAN OVER TUR ZA OVER " + 
"ZAUBHAANAHU WA TAAALAAONE MARK SGAW KAREN INVERSE WHITE CIRCLEINVERTED " + "CANDRABINDU OVER LAGAR GUNU SHEAND NORTH EAST ARROWWET CULTIVATION " + "SIGNSIDEWAYS NOON GHUNNAONCAVE-SIDED DIAMONDBSET OF NOR EQUAL TODOUBLE " + "DOT TONE MARKPOTABLE WATER SYMBOLSINGLE DOT TONE MARKIRCLES HITTING WALL " + "HREE-DOT NUKTA ABOVEFOUR RAISED KNUCKLESBETWEEN PALM FACINGSANGE " + "ANTIRESTRICTIONCURRENCY SYMBOL RIELTRANSPOSITION MARKERSEPARATOR MIDDLE " + "DOTSEPARATOR KEY SYMBOLFORMS LIGHT VERTICALOVER LEFTWARDS ARROWTHROUGH " + "SMALL CIRCLENIS RACQUET AND BALLWITH FOUR DOTS ABOVESCRIPTION CHARACTER " + "CURVED ANGLE BRACKETHORIZONTAL BAR WITH OTLESS J WITH STROKEFINAL " + "CONSONANT MARKMULTIPLE PUNCTUATIONINDEX RING LITTLE ONUP-POINTING " + "TRIANGLEAND NORTH WEST ARROWDOTLESS HEAD OF KHAHIMAGE OF OR EQUAL " + "TOGHTWARDS ARROW BELOWEVERSED ROTATED RANAAND SOUTH EAST ARROWAND SOUTH " + "WEST ARROWFIVE SPOKED ASTERISK79 OVER LAK-079 GUNULEFT-TO-RIGHT " + "SECANTHIGH RATHA OR LOW PAWORD REPETITION MARKHIGH TONE APOSTROPHEE " + "CONSONANT MODIFIERCONSONANT SIGN HAARULEFT AND LOWER RIGHTCENTRE VERTICAL " + "LINERIGHT QUADRANT BLACKRIGHT-POINTING ANGLEJUDEO-SPANISH VARIKAKHAMTI " + "REDUPLICATIONARXIS KAI FTHORA VOUREAN STANDARD SYMBOLYRENAIC TWO " + "DRACHMASLATALIZED HOOK BELOWRIGHT U-SHAPED ARROWLE WITH POPPING " + "CORKWARE-FUNCTION SYMBOLLASHING SWEAT SYMBOL WITH HORIZONTAL BARL " + "FUNCTIONAL SYMBOL CHEMICAL SYMBOL FOR AND DIAGONAL STROKESTAR WITH " + "MIDDLE DOTCHARACTER INTRODUCERDOWN ARROWHEAD BELOWEMESTVENNY ZADERZHKA " + "BEGIN LOGOGRAM MARKREVERSED ONE HUNDREDRIGHT ANGLE WITH DOTYIG MGO PHUR " + "SHAD MA ABOVE LEFT TRIANGLEOW-9 QUOTATION MARK WITH STRIKETHROUGHGIBBOUS " + "MOON SYMBOLTHANG LONG ANUSVARALEADING MCHAN RTAGSVARIATION INDICATORSEVEN " + "EIGHTHS BLOCKNETWORKED COMPUTERSKULL AND CROSSBONESLANTED EQUAL ABOVE " + "VASTNESS OR WASTINGAHU ALAYHI WA-AALIHNE HUNDRED TWENTY PNDRED POINTS " + "SYMBOLRROW NO-BREAK SPACEIGATURE AYIN-DALETHSH PLUS HU PLUS ASHFLOORPLANE " + "TWISTINGRATUM SUPER STRATUMOTATED ARDHAVISARGAWOMEN HOLDING HANDSBETWEEN " + "MIDDLE RING WITH VERTICAL TAILDOWN POINTING INDEXTIGHTLY-CLOSED " + "EYESALTERNATE LAKH MARKD CIRCUMFLEX ACCENTVARIANT WITH SQUARENOGRAPHIC " + "FULL STOPGAPPED CIRCLE ARROWUP HORIZONTAL LIGHTLF MADDA OVER " + "MADDAREE-QUARTER CIRCLE NORTH ARROW WITH HOANSPOSITION BRACKETSEQUENCE " + "INTRODUCERARENTHESIS NOTEHEADHORT STROKE OVERLAYVERTICAL TABULATIONOVER E " + "NUN OVER NUNTRANNO MALO POVYSHEUP HORIZONTAL HEAVY AND " + "PROSGEGRAMMENIVARIANT FORM ILIMMUFT-POINTING FLEURON LOVE YOU HAND " + "SIGNHURISAZ THURS THORN AND RETROFLEX HOOKARTIAL DIFFERENTIALLEFT " + "POINTING INDEXTO LOWER RIGHT FILLQUESTION MARK ABOVECIRCLED SANS-SERIF " + "HAND COVERING MOUTHWITH YEH FINAL FORMET WITH WHITE CROSSLEFT TO LOWER " + "RIGHTATED TELLER MACHINERIGHT TO LOWER LEFTINSIDE CIRCLE BELOWCIRCLED " + "WHITE ARROWRY CULTIVATION SIGNURRENCY SYMBOL BAHTITED LIABILITY SIGNVERSE " + "FINAL BARLINEUBLE DOT WITHIN DOTVERSAL INTERSECTIONISPUTED END OF AYAHOP " + "SEMICIRCLE ARROWDENOMINATOR SIXTEENLEFT U-SHAPED ARROWQUADRUPLE " + "CRESCENTSA END LOGOGRAM MARKSYMBOL FOR BEGINNERPREFIXED NASAL SIGN " + "FLUTTERING IN WINDC DIGRAPH WITH CURLSTRAIGHT THUMB BENTRIGHT MIDDLE " + "STROKETWENTY-FIVE DEGREESSTRATIAN FIFTY MNASIN CHEN SPUNGS SHADTURNED " + "SECTION MARKTURNED PADA PISELEH KASKAL U GUNU DISHEVEN POWERS OF FOURDOWN " + "AND HORIZONTALIMIDIA SEXTULA SIGNPARAGRAPH SEPARATORARABIC FORM " + "SHAPINGILDING CONSTRUCTIONHEAD-SHAPED POINTERNAXIAN FIVE HUNDREDFIVE " + 
"FINGERS SPREAD IN A RECTANGLE BOXLUB-SPOKED ASTERISKMSHELL MOBILE " + "PHONETART OF RUB EL HIZBANS-SERIF CAPITAL LING SHIRT WITH SASHSLANTED " + "NORTH ARROWMOVES AGAINST CHEEKRAILING MCHAN RTAGSWEST POINTING LEAF OVER " + "INVERTED SHUGGLY VERTICAL LINEUM WITH DRUMSTICKSWITH STROKE SYMBOLTO " + "LOWER LEFT FILLBAARAKA WA-TAAALAATOP U-SHAPED ARROWGISH CROSSING " + "GISHASTROLOGICAL SIGN PERFIXED LETTER RAATIN SMALL LETTER RIST CIRCLE " + "FRONT EVERSED CHELYUSTKAABBREVIATION MARK EVENTEEN FULL STOPATERRESTRIAL " + "ALIENTYPE A ELECTRONICSARROW SHAFT WIDTH WHITE VERTICAL BAR FOR " + "SIMALUNGUN SAU-SHAPED ORNAMENTSQUARTER NOTE STEM ERTICAL BAR " + "VIRAMAEPIGRAPHIC LETTER DOUBLE PUNCTUATIONPUNCTUATION BINDU ENTY-TWO " + "POINT TWOENTERING TONE MARKASTED SWEET POTATOVARIANT FORM LIMMUGATIVE " + "ACKNOWLEDGEWITH JUSTIFICATIONDOWN-OUTPUT SYMBOLOTLESS DALATH RISH NOT " + "LITTER SYMBOLOU ALAYHE WASALLAMOUCHTONE TELEPHONE AND NO DOTS ABOVEORK ON " + "THE DECAYEDEAST POINTING LEAFTROFLEX HOOK BELOW AND SMASH PRODUCTOW TONE " + "APOSTROPHEFORTY-FIVE DEGREESFORKED PARAGRAPHOSVERY SMALL DIAMOND AND " + "YPOGEGRAMMENIFIVE EIGHTHS BLOCKPACING CANDRABINDU WITH KAVYKA " + "ABOVEIGATURE ZAYIN-YODHJEEM ISOLATED FORMYLLABLE LENGTHENER WITH FLOWING " + "SANDSET OVER BUILDINGSKANTAJA NAASIKYAYACUP WITHOUT HANDLEKBAR ISOLATED " + "FORMSEPTUPLE CRESCENTSHUNDREDS UNIT MARKNINETEEN FULL STOPCTLY EQUIVALENT " + "TOUPPER MIDDLE RIGHTHOUSANDS SEPARATORNISH VERSE DIVIDERNITE PART " + "INTEGRALHORIZONTALLY BELOWSMALL CIRCLE ABOVEKOREAN CHARACTER ONORMAL " + "SUBGROUP OFCANTILLATION SIGN HOLDING BACK TEARSLOWER MIDDLE RIGHTCOPPER " + "ANTIMONIATEAND LOW RIGHT RING THUMB INDEX THUMBCONTINUING " + "OVERLAPMATHEMATICAL SPACESINGLE PUNCTUATIONINDEPENDENT VOWEL IN " + "POSSESSION SIGN WITH CIRCLE ABOVEITAN SMALL SCRIPT WITH CIRCLE BELOW " + "WITH CROSSED-TAILSHAN REDUPLICATIONBOTTOM RIGHT KASRAIGSAW PUZZLE PIECEIX " + "SPOKED ASTERISKSYMMETRIC SWAPPING SPREAD THUMB SIDEUP ARROWHEAD " + "BELOWTILTING FROM WAISTYPTIAN HIEROGLYPH NYOOGA NAAKSIKYAYABASELINE ROUND " + "DOTHAIS LUS NTOG NTOGS PRESSED TOGETHERNYET THYOOM TA-ROLHILOSOPHERS " + "SULFURSMALL RED TRIANGLERYUKOVAYA SVETLAYALEFT MIDDLE STROKEUTLINED BLACK " + "STARLOSED CIRCLE ARROWLEFT-STEM TONE BARS INSIDE AND ABOVESOUL ISOLATED " + "FORMVOCALIZATION MARK WITH BULLET NOSEA PLUS HA PLUS DAPUNCTUATION SIGN " + "ALTERNATE NUMBER BUT RELIEVED FACECONSONANT SIGN PA-GAAHLAA TTUDDAAGAMBDA " + "WITH STROKEAPLI DYO DODEKATALAGOLITIC LETTER WHITE PARENTHESISDELPHIC " + "FIVE MNASINVERTED MCHU CANYEH ISOLATED FORMCONTOURED OUTLINESIGN O WITH " + "CROSSPRECEDING SOLIDUS ALTERNATION MARKASTERN PWO KAREN MEEM INITIAL " + "FORMPRESSIONLESS FACEPRIZNAK MODIFIER MEDIUM BARB ARROWCIRCLES WITH " + "DOTSCONTINUATION SIGNWHITE SHOGI PIECERIATION SELECTOR-CANDRABINDU " + "ABOVEEAR SCREEN SYMBOL WITH TILDE ABOVEABBREVIATION SIGNKE BOTTLE AND " + "CUPKHAH INITIAL FORMLAPPING LESS-THANSTRAIGHT MOVEMENT AND PALATAL " + "HOOKREATIONAL VEHICLEAMPHYLIAN DIGAMMARIGHT HALF CIRCLEVERY SMALL " + "SQUARECLOSED LITTLE YUSCOMBINING NUMBER LAH ISOLATED FORM WITH SOUND " + "WAVESULAR MEDIUM SHADESQUARED TIMES KURLHOUETTE OF JAPANMANENT PAPER " + "SIGNEMICOLON UNDERBARMALL WHITE CIRCLELIAN HIEROGLYPH ALD PERMIC LETTER " + "URNED DAMMA BELOWURNED COMMA ABOVEQUAT REVERSED ESHCAL SYMBOL BOTTOMAEUM " + "ONE PLETHRON0 WHEELED CHARIOTCANCELLATION MARKTRIPLE DASH ARROWHIRTEEN " + "FULL STOPVARIANT FORM IMINVRE TOURNOIS SIGNTHREE SOUND WAVESUP POINTING " + "INDEXVARIANT FORM USSUHORIZONTAL DOUBLEHORIZONTAL 
SINGLEGENERIC " + "MATERIALSOURTEEN FULL STOPNG STROKE OVERLAYNFORMATION SOURCEFROM SMALL " + "CIRCLEFRACTION ONE HALFBOTTOM HALF BLACKIASTRE MARK ABOVESERVER EYE " + "SYMBOLICTED LEFT ENTRY-NEGATIVE CIRCLED IDEOGRAPHIC COMMA OVER ZU PLUS " + "SARHAH ISOLATED FORMUP AND HORIZONTALRYBLION BASE SIGNVARIANT FORM " + "ASH9TONAL RANGE MARK ONE EIGHTH BLOCK-DENTAL PERCUSSIVEBE WITH " + "MERIDIANSGREATER-THAN SIGNGREATER-THAN NOR BRIGHTNESS SYMBOLBERBER " + "ACADEMY YAS REVOLVING LIGHTHEART-SHAPED EYES PLUS SHA3 PLUS AOPEN-HEADED " + "ARROWWO VERTICAL DOTS WITH NOT EQUAL TOTIAL ARTS UNIFORMING POLE AND " + "FISHFACING BABY CHICKVEE WITH UNDERBARY ON BLACK SQUAREAUKAZ LAGU LOGR " + "LATHERING TOGETHERINEAR ANNOTATION TARTING FROM SIGNNE EYEBROW " + "RAISEDPINWHEEL ASTERISKINITIAL LETTER RAMILITARY AIRPLANEVERAGE WITH " + "SLASHTAN ISOLATED FORM GRAVEYARD SYMBOL TO BLACK DIAMONDAND BLACK " + "SQUARESOWER NUMERAL SIGNIGHTEEN FULL STOP LAGAB TIMES ASH2NASALIZATION " + "MARKFINGER-POST ARROW LAGAR OVER LAGARTERSYLLABIC TSHEGNAUDIZ NYD NAUD " + "NTEN THOUSAND SIGNBRACKET EXTENSIONFLICK ALTERNATINGCTION " + "APPLICATIONCROSS PUNCTUATIONVARIANT FORM ESHCH WITH UMBRELLAARENTHESES " + "ABOVEDOUBLE TURNSTILEDITORIAL CORONISVERY HEAVY SHAFTDOUBLE DOT " + "ABOVECONSONANT JOINERVIEWING CEREMONYBOTTOM HALF RINGCORNER " + "DOWNWARDSDOUBLE CRESCENTSAFFRICATION MARKUPERSCRIPT ALAPHUP-OUTPUT " + "SYMBOLCOMPRESSED ARROWANABAZAR SQUARE UPPER OVER LOWERVOWEL LENGTHENERUP " + "MIDDLE HINGEDDOWN RIGHT BARB BOLD GREEK CROSSDEWAYS U BRACKETDOUBLE " + "ZAPYATAYAB2 TENU PLUS TABDOTTED CRESCENTSCASIAN ALBANIAN DOUBLE HEAD " + "MARKCREAMING IN FEARCORNER LEFTWARDSIFTEEN FULL STOP LIGHT MOON " + "ARTASERIFS AT BOTTOMNION WITH SERIFSHYPHENATION MARKSMALL NOON ABOVEIDED " + "GREEK CROSSORIZONTAL JOINERIGHTH NOTE STEM IMENSIONAL ANGLEINDEPENDENT " + "SHININDEX THUMB SIDEHIGH SPACING DOTMAGNIFYING GLASSRISING TONE MARK " + "SMALL ROTATIONS INSERTION POINTRIZONTAL ELLIPSEINES CONVERGING HMATULLAH " + "ALAYHESLANTED EQUAL TOSMALL CAPITAL ELHOLDING TOGETHERPEN CENTRE " + "CROSSLTERNATE HASANTALOWER OVER UPPERSTUCK-OUT TONGUESTRING " + "FRETBOARDSTRAIGHT STRETCHSTICKING OUT FARSTERISK OPERATOR PLUS KAK PLUS " + "AADIAN SYLLABICS K PERMITTED HEREO-MINOAN SIGN CMLD ASSYRIAN ONE LEFT " + "HALF CIRCLELEFT ARROW ABOVENTAIGANA LETTER SANS-SERIF ARROW OR THE IMAGE " + "OFYATHOS BASE SIGNLLOW PAN OF FOODTAKANA-HIRAGANA IPPER-MOUTH FACEIRCLE X " + "NOTEHEADLIGHT BARB ARROWLIGHT AND RIGHT ISTOS DISC SIGN OLD WHITE " + "CIRCLEIVE POINTED STAROLD TAMIL VIRAMAYIR MKPARAQ MEUNEPSILON " + "UNDERBARUDLY CRYING FACEEN MILLIONS SIGNRIGHT DOWN BARB END OF TEXT " + "MARKUBJOINED LETTER ENTRE WHITE STARENUMERATION SIGNERCURY SUBLIMATERAYS " + "AND DOTTED RIGHT HALF BELOWRIGHT HALF BLACKMIDDLE AND RIGHTMIDDLE AND " + "BELOWRAIDO RAD REID R TIMES GAN2 TENUUMBER SIGN ABOVEDVUMYA ZAPYATYMI " + "TIMES DISH TENUSHU2 PLUS KASKALRESH-AYIN-DALETHREPETITION MARK-WAVY HAMZA " + "BELOWE PLUS GAN2 TENUPLE MEASURE REST AND HEAVY RIGHTULDERED OPEN " + "BOXECIMAL SEPARATOR AND LIGHT RIGHTEFORE COMPLETIONRECORD SEPARATORWITH " + "HEARING AIDWITH CENTRED DOTSIGN RISING TONE WITH BUNNY EARSWITH LEFT " + "UPTURNPRECEDING SUBSETQUALS SIGN BELOWWITH HAMZA ABOVEQ WITH HOOK " + "TAILTRIPLE CRESCENTSSITION INDICATORPRECHGESANG STEMNAL DIGIT " + "SHAPESEVERSED VISARGA EVERY OTHER TIMEMESTVENNY KLYUCHPLACEHOLDER MARKR " + "PLUS GAN2 TENUFALLING DIAGONAL WITH DOT INSIDEPOSTPOSITION MENFFERENCE " + "BETWEEN CAPPED MOUNTAINFLOORPLANE SPACEND OF PARAGRAPHMURDA " + "MAHAPRANABINDING 
BRACKETNASALIZED TONE-N-ARY SUMMATIONUSTER NOTEHEAD " + "BLOCK DIAGONAL NOON WITH KASRANOON FINAL FORMNO GOOD GESTURENJOINING " + "MACRONNA DOUBLE HELIXRIGHT RERENGGANATINATE MYSLITEPERTHO PEORTH PPLUS " + "SIGN BELOWATA LINK ESCAPEPRISHTHAMATRA EPUT SYMBOL FOR RIGHTWARDS " + "TICKRIGHTWARDS AND QUADRUPLE ARROWQUADRUPLE DASH R WITH FISHHOOKPENSION " + "RAILWAYRIGHT HALF RINGVERTICAL SECANTREAMY EYEBROWS RECEPTIVE " + "EARTHRECITATIVE MARKREVERSE SOLIDUSREVERSED OPEN EGHT REPEAT SIGNON TOP " + "OF MODEMNVERTED UBADAMASALTER PAHLAVI BENT OVER INDEXBELOW LONG " + "DASHBELGTHOR SYMBOLODO SOFT HYPHENS IN SILHOUETTES ELEVATUS MARKOGOGRAM " + "KHAMTI BAR ABOVE UNIONOLIDUS OPERATORNOT APPROXIMATEOND PLACE " + "MEDALONJOINED HINGEDONTOUR INTEGRALORIZONTAL COLONORT EQUALS SIGNOUBLE " + "BACKSLASHOW-FALLING TONEOWER HALF BLACKRNAMENT STROKE-RMAN PENNY SIGNPEN " + "SQUARED DOTTOP RIGHT FATHADOING CARTWHEELFOUR DOTS WITH FOUR " + "ENCLOSURESFRACTION DIGIT FTER COMPLETIONDIGA AELA-PILLADIALYTIKA " + "TONOSTRIANGULAR MARKDI ALLAHOU ANHUGEMINATION MARKGGLY LINE " + "BELOWDESCENDING TONEFORWARD TILTINGGROUP SEPARATORHAKING PARALLELHALF " + "FILL SPACETIP ON THE LEFTHEH MEDIAL FORMTILDE DIAERESISTHROWING A " + "KISSDAGESH OR MAPIQHOOKED INDEX UPTHREE DISH TENUHORIZONTAL " + "DASHHORIZONTAL FILLEH INITIAL FORMDOWNWARDS TRENDUMAI PALAUNG FAE " + "ISOLATED FORME MUSICAL NOTESE OVER INFINITYDOWN SEQUENTIALULTIPLICATION " + "XUGMENTATION DOTEFT REPEAT SIGNEFTWARDS ARROWSDOUBLE TRIANGLEUBLE RING " + "BELOWERICAN FOOTBALLESIDE LESS-THANU PLUS U PLUS UESSARON CHRONONETIC " + "VERSE SIGNTWO WITH STROKEEXPONENT SYMBOLTVIMADUR SYMBOLLONG VOWEL SIGNLD " + "TAMIL SHORT LEFT DOWN BARB LEFT HALF BELOWLEFT HALF BLACKCIRCUIT-OUTPUT " + "LEFT HAND INDEXLETTER CAPITAL LEVEL TONE MARKLEVEN FULL STOPLIGHT AND " + "LEFT LMOST EQUAL TO UR POINTED STARLONG HOOK BELOWCKET CALCULATORLOOK OF " + "TRIUMPHLOSED INSULAR GCAPITAL LETTERSSIXTEENTH NOTESMALAKON CHROMA " + "MARRYING MAIDENMEEM FINAL FORMBROWS STRAIGHT BREAKING HYPHENMIDDLE " + "DIAGONALSHORT OVER LONGINVERTED STROKEHOUSAND STATERSHREE DOTS " + "BELOWIAMOND UNDERBARIDING ENCLOSUREIGN PALI VIRAMAIMISEOS " + "CHRONOUIMPERFECTA RESTING SYMBOL FOR CORNER WITH DOTINGLE HEAD MARKINUS " + "SIGN BELOWINVERTED LAZY SSHITA PLUS GISHIRCUMFLEX BELOWTAI LAING TONE-ITH " + "FINGERNAILSIZED WHEELCHAIRSTROKE NOT SIGNKISIM5 TIMES BISTERESIS SYMBOLST " + "SYRIAC CROSSST QUARTER MOONSSICAL BUILDINGCLOSED BY CURVELATION " + "FUNCTIONXTEEN FULL STOPAMARITAN SOURCE WITH DESCENDER CORNER " + "BRACKET-CARRIER LETTERZAIN FINAL FORM OVER SIG4 SHU2 NEPOSTOYANNAYA OVER " + "MOUNTAINSVOWEL SEPARATORZERO WITH SLASH TOUCHING INDEX THUMB STRAIGHT " + "CLOUD AND RAINYNCHRONOUS IDLE TIMES IGI GUNU WITH RIGHT LEGVOWEL " + "SHORTENERWITH DOWN ARROWACHES THE LIMITWITH RAIN DROPSAI LAING DIGIT " + "OPERATOR WITH ALMOST EQUAL TOWHITE DOT RIGHTWALLPLANE SPACE PLUS HI PLUS " + "A-PIECE SWIMSUIT THROUGH CIRCLE AND LOWER LEFTAMOUNT OF CHECK DEYTEROU " + "ICHOU WITH DIAERESIS ALTERNATE FORM-NO-EVIL MONKEY PARESTIGMENON ALIF " + "LENGTHENER2 CHARIOT FRAMEALAYHE ASSALLAMAND PARALLEL TOBLACK " + "TRIANGLEBLADE SCISSORSPARATED SYMBOLD-UP NEWSPAPERPARTMENT STOREFORWARD " + "INDEX INOLOGICAL DOTMOTHETIC ABOVEFINAL ANUSVARAAND COLD SWEATINVERTED " + "BIRGASEL LOCOMOTIVEUP RIGHT BARB OVER GUD LUGALINSERTION SIGNVRON " + "SNOWFLAKESEPARATOR MARKING HANDS SIGNSMALL TRIANGLEUSPENSION MARKDASIA " + "PNEUMATAINFINITY BELOWPAO KAREN TONESHESHIG TIMES IGHTWARDS VANEUNIT " + "SEPARATORTRIANGLE WITH XO EKFONITIKONTERMINAL MARK-UNION 
OPERATORDI " + "ALLAAHU ANHWITH LEFT HOOKPPED MIDDLE UPDEYTEROS ICHOSDIAGONAL " + "MOUTHTETARTOS ICHOSDIAGONAL PATH PROTECTED AREAMRACHNOTIKHAYARING " + "MEGAPHONEGERED TREMOLO-BAG MEMBERSHIP HASER FOR VAVWITH DOT BELOWPEN MARK " + "BELOWSMALL LETTER JLOTUS POSITIONSMALL LETTER DBHATTIPROLU AAANGLE " + "OPENING SHAN MEDIAL WAPLE WITH HEARTPLETE INFINITYLOWER DIAGONALPLITTING " + "APARTED SYMBOL FOR IKHAYA PUTNAYATELPIECE CLOCKWITH FATHATAN CERTAINTY " + "SIGNENDED MULTIMAPLEFTWARDS AND CRIFICIAL WINEYOUTHFUL FOLLYEND OF " + "SECTIONONE SOUND WAVELEFTWARDS TICKTWO WHITE DOTSSTRONG ISOLATEENNA WITH " + "BARSCEPTER OF JOVECENTURIAL SIGNOOTNOTE MARKERTWO ENCLOSURESLESS-THAN NOR " + "-HEADED ARROW SPEECH BUBBLESEMIVOWEL SIGN ALLAJALALOUHOUCOLON OPERATORUAL " + "WHEELCHAIRSQUIGGLE ARROWOBLIQUE HYPHENERIAL ARAMAIC ERIC " + "INDICATOREPENTHETIC YUTLETTER OVERLAPNYI ZLA NAA DAUBHAYATO MUKHAERTICAL " + "JOINEROLD RESOLUTIONALF TREE TRUNKVONIC ASTERISKLACE OF SAJDAHLITTLE " + "SECTIONOT TILDE ABOVELIGHTLY SMALL UPPED INDEX UPOTHERS CIRCLEDTURKIC " + "LETTER FATHATAN ABOVEISED ROUND DOTSECOND SUBUNITLINE EXTENSION1 OVER " + "LAK-081ROSS ON SHIELDIRCULAR VIRAMAFFED FLATBREADFFICE BUILDINGOUR OBOLS " + "SIGNSMOKING SYMBOLOUSING THUNDERLEVEN TWELFTHSSURROUND FROM OPPOSING " + "PIRIGJOINED SQUARESAMNUC PII KUUHORANGE DIAMONDORD SEPARATOR EXCLAMATION " + "OHTWO DOT LEADERINVERTED DAMMANORTH ARABIAN -CURRENCY SIGNIWAZ TIR TYR " + "TIVE OBOLS SIGNIVE KEY SYMBOLOSITION SYMBOLITA PLUS GISH ISSION " + "TICKETSVERTICAL HEAVYSIDE-DOWN FACEZAKAYA LANTERNTIMES OPERATORDIRECTION " + "FLIPREH FINAL FORMRD PLACE MEDALAU LENGTH MARKWORD SEPARATOR CROSSING " + "ESH2GYPTOLOGICAL AVERTICAL LIGHTDOUBLE-STRUCK DIO MICROPHONEVERTICAL " + "ABOVEDOES NOT EXISTGHT WITH STARSGUNU TIMES ASHAFETY SCISSORSHIRD-STAGE " + "HLIREATIVE HEAVENTHER CHRISTMASAROUND-PROFILEHREE-LEGGED TEVENIENCE " + "STOREQUINARIUS SIGNVERTICAL COLONRIGHT CROSSBARUNDER RELATIONMENSION " + "ORIGINTHOUSANDS MARKUND MARK ABOVEZAH WITH MEEM REVERSED-SCHWA WITH LONG " + "LEGREE-LINE STAFFMEDIUM DIAMONDTHOUSANDS SIGNTHAKA ANUDATTAAI LENGTH " + "MARKTOP HALF BLACK AND DIAERESISTRANSMIT STATEDUN3 GUNU GUNUTHALAN ETHEL " + "OTHREE POINTED TIMES SHU TENUMID-LEVEL TONEHESIVE BANDAGERRIAGE RETURN OF " + "THE HORNSAPPED PRESENT-ESASA DOTTEDMALO POVYSHE GTER TSHEG MADOUBLE " + "STROKEEVERSED DAMMACULATED LORRYHIEROGLYPHIC MESSENIAN TENDVOECHELNAYA " + "JES SU NGA ROGYA GRAM SHADOPPOSING NAGARPENTRY PLANETU WAS-SALAAMDOUBLE " + "CIRCLEVERLAY MIDDLEAN RUPEE SIGNVERGREEN TREEROTATED BIRGABY " + "DEFINITIONURNED W BELOWUPERIMPOSED XLISION SYMBOLUPONDIUS SIGNDOTTED " + "ZLAMA IRCLED INDEX NING MOVEMENTIOT SYLLABLE FICATION CARDNINE " + "TWELFTHSINVERTED TURNITING THROUGHHINESE TONE YSYNDESMOS NEOIVE SLOW SIGN " + "AND SKI BOOTAMUHU ALAYNAAIVE POINT ONEDOUBLE MUCAADHERICAL ANGLEDOUBLE " + "HYPHEN AND YEN SIGNMALL LETTER ZOTEHEAD BLACKISH LIRA SIGNNUMERIC SIGN " + "MEDIUM SQUARE VARIANT FORMERTION SYMBOLAR WITH QUILLHAKASSIAN CHEARLAUG " + "SYMBOLSAMYOK SANNYACIRCLE INSIDESSAGE WAITINGUPSILON WITH U WITH " + "STROKENUMERATOR ONEOLVING HEARTSOMAN NUMERAL CHRYSANTHEMUMSTABLE " + "SYMBOLL-TYPE SYMBOLOBLIQUE LINE ARCHAIC KOPPAER BOARD FILLS KRYZHEM ON S " + "KAI APOTHESHAM DIGIT ONEMASORA CIRCLELATERAL CLICKNTY FULL STOPOGOTYPE " + "SIGN S UP TOGETHER-PER-EM SPACE-OR-PLUS SIGNLEFT CROSSBARSAL PLUS " + "TUG2ARGOSYNTHETON-OFF CALENDARCITATION MARKTIRTA TUMETESEUROPE-AFRICAYOD " + "YOD PATAHCROSSING GAN2WO-LINE STAFFYMBOL TAU RHOKAPYEOUNPIEUPRTABLE " + "STEREOSILI PNEUMATACROSSING 
GABAOON NOTEHEAD CROSSING MUSHARROW " + "OVERLAYH-TYPE SYMBOLVERTICAL BARS OPPOSING KUREMPHATIC TONESIGN " + "AVAGRAHASIGN PAMUDPODVERTICAL FILLONAL COMPUTERMARKS CHAPTERMELODIC " + "QITSACRIPTION TAKESTERTIUS SIGNCRIPTIONAL PAK WORK SYMBOLLEGETOS ICHOSONG " + "RIGHT LEGCHECKER BOARDUPWARDS TRENDONG-LEGGED DEONGRATULATIONARRED " + "TRIDENTSHESH PLUS KII WITH STROKEGAR FRACTION BAT AND BALL CROSSING " + "KA2WITH INTEGRALAUDATE CHRIVIFOREMENTIONEDMODIFIER MARK WITHOUT SNOWED " + "PAPERCLIPSZHOU NUMERAL VEN POINT ONENG TERMINATORPPOSING LUGALGAW KAREN " + "SHADIAERESIZED UWITH ASTERISKBOHAIRIC KHEIPA NJI PIPAEMED DOUBLE VERBASAN " + "LETTER MINDER RIBBONSIA-AUSTRALIA WITH JEGOGANHREE TWELFTHSPAIRED " + "ARROWSUSICAL LEIMMA BZHI MIG CANRN PENTATHLONLVEOLAR CLICKTE ORDER " + "MARKGIFT ENVELOPEVE-LINE STAFFSMALL LETTERSYUUKALEAPINTURIZONTAL " + "TAILEELING PERSON WITH TEE TOPPLUS OPERATORFROWNING FACEIMAGE " + "BRACKETRIPLE SVARITAIGHT TWELFTHSRACKETS ABOVEWAVY OVERLINELVE FULL " + "STOPTHIRD SUBUNITMINUS WHITE XMINUS SIMILARILE SEPARATORBACKSLASH BARW " + "RING INSIDE DIMINUTION-1FINAL SEMKATHEHU FEOH FE FFULL SURROUND HEADED " + "ARROWSELECTED AREAUDDISA SIRRAHDIC MARK SIGNBALL AND HOOPUSHING " + "UPWARDWAW-AYIN-RESHOUT MIDDLE UP WITH INK PENOURTH SUBUNITRANKS CASKET " + "INVERTED FORKVICE CONTROL DIRECTIONAL TROFLEX CLICKRIGHT " + "HARPOONAWELLEMET YAZNAP PIZZICATOFINAL LETTER MAILBOX WITH TOP HALF " + "RINGANNED LEATHERLOCATION SIGNACCOMMODATION B BAR SYMBOLBOTTOM CORNERFT " + "ARROWHEAD TED HAND SIGNUFFLE PRODUCTMULTIOCULAR OQUARTERS SIGNEAVENLY " + "EARTHPREPONDERANCEFIXED-FORM RAIFI ROHINGYA LOCK WITH KEYILABIAL " + "CLICKINTEREST SIGNWAVY LOW LINEEDIC ANUSVARAMOBILE PHONESVOWEL SIGN " + "PABOWING DEEPLY WITH OVERBARUE OF LIBERTY TIMES KASKALLEFT-LIGHTEDVOLTAGE " + "SIGNCRESCENT BARSHORT RIKRIKNUITY SYMBOLUPPER CORNERENOS CHRONOUDIGRAPH " + "YORIALLPOINT PENDIGRAPH KOTOMPTY CENTRE LU PLUS ESH2DICTION SIGNLEADING " + "EYESMPHASIS MARKMEDARY CAMELMBELLISHMENTACE INTEGRALS SUBPUNCTISLUS " + "NOTEHEADLOWERED FLAGDOWN NEUTRALN ELEMENT OFENT ARROW POULL NOTEHEAD-MAIL " + "SYMBOLUME INTEGRALSHED BARLINESMALL DOUBLELEFT HARPOONCROSSING " + "NUNMONOGRAPH UKMUM TIMES PAMEDIUM SHAFTNGLE BARLINEDOUBLE ARROWEGIN " + "SEGMENTUBSCRIPT TWOMADDA ABOVE MALL SECTIONAFU LEERAEWAWDATA SQUARESMALL " + "TRIPLELICKING LIPSAA AS-SALAAM-DZUD RTAGS DASHED ARROWNORTHERN TSESMILING " + "FACEEIGHTH NOTESMIDDLE PIECELL MODIFIER-UN WITH RAYSACUTE ACCENTSECTION " + "SIGNLINKING MARKLINGING FIREDOT OPERATORLLE PATTERN NJALA GONDI LIMBS " + "DIGITSDOUBLE ARCH WITH INDEX NDING PERSONM NSHUT NYAMLER CONSTANTSH ZIDA " + "TENUNCK CONSTANTCROSSING LU2CROSSING KALCROSSING GI4DENTAL CLICKNATURAL " + "SIGNENARIUS SIGNNARROW SHAFTDOWN HARPOONDUG TIMES NIUGHT BALLOONMING TO " + "MEETNERSHIP SIGNNEPOSTOYANNYMETA STAVROUEMELY HEAVY WITH DAGESHEAGULL " + "BELOW SKEWED LEFTLOWER CORNERNOTCHED HOOKNOTCHED TAILEMISOFT SIGNEEPING " + "SMALLDE MARK SIGNMANNAZ MAN MUH PLUS GISHSAZ IS ISS IRNAM BCAD MARISTMAS " + "TREETEARS OF JOYTE SEPARATOR IN TRIANGLEIN MIDDLE UPBINING MARK PHEME " + "JOINERANG KHANG GYBLACK CIRCLEFOUNTAIN PENFORMING ARTSINDEX MIDDLEPOETRY " + "MARK-GAW KAREN EURION CHRONONPOUTING FACEIGATURE SHRITERNATE AYINPORT " + "CONTROLBEHIND CLOUDUTH-SLAVEY KUTH ARABIAN TRIPLE DANDATRIPLE " + "FLAMEBETWEEN LIPSFT RERENGGANINUSOID SIGNUSEATED FACEINVERTEBRATEAND " + "OPERATORBRATION MODEAND CRESCENTBRIDGE ABOVEBSCRIPT ALEFOUR TWELFTHSYAN " + "NUMERAL IRAGANA HOKAOUGHT BUBBLEFERENCE MARKOUCHES THUMBFEMININE " + "DOTBUTTON MOUSEFOLDED 
HANDSBLOWING FACEBLUE DIAMONDING ENVELOPE " + "KLYUCHEVAYAING HITTING ING OPERATORXIRON KLASMAFLAG ON POSTROLLING EYES " + "LINE SYMBOLINTEGRATION OVER KASKAL RIGHT DOUBLERED KEYBOARD AND " + "PICTUREGUARDED AREAGROUND SLIDEGREEN DRAGONRCHAIC SAMPITHREE HEARTSWITH " + "SMALL VRANCHING OUTHEAD-BANDAGEHAND FORMAT RIAL TRAMWAYRIAGE SYMBOLHASIS " + "SYMBOLARALLELOGRAMHALF BRACKETREVERSE MARKVER EQUAL TOAR DIAERESISHAH " + "WITH DALREN CROSSINGREFACE COLONHIBITED SIGNBAHIRGOMUKHAQUARTER " + "SIGNQUARED ARROW CROSSING GUBACK OF HANDQUIRREL TAILIDENTICAL TOGEBA " + "KAREN IRING OVERLAYVAKRAHASANYAPROTOS ICHOSGBY FOOTBALLRAFFIC LIGHTHREE " + "FINGERSATNAH HAFUKHVICTORY HANDTOP-LIGHTED ATTOOED HEADRAH BEN YOMO6 LONG " + "NGGOO-SHAPED SIGNTHODOX CROSSHYPHEN-MINUSRIGHT SINGLETHIC LETTER TRAGRAM " + "FOR THETA SYMBOLWIGGLY FENCEOPPOSING LU2 OVER KISIM5OQ NSHUT YUMLARGE " + "DOUBLE ON PEDESTALS ABOVE SIGN OVER MIDDLEALT PAN SIGNOPLE HUGGINGOHAZARD " + "SIGNLATALIZATIONYOD TRIANGLEOGOGRAM NYAJYOUTHFULNESSON US SYMBOLYMBOL " + "BINDU OK HAND SIGNKANA REPEAT CIRCLED PLUSLARGE TRIPLECENDING NODESS-THAN " + "SIGNEVERING FACEERPENDICULARKLYUCHEVAYA CK-O-LANTERNOPENING LEFTSUR OVER " + "SURKAPPA SYMBOLCIRCLES AND OING TO MEETOID NOTEHEADOTTOM HALF OT " + "MONGKEUAEQCHARACTER-1BCABBAGE-TREEALTERNATING FALLING DOTS OVER TWO " + "PIIRTY-SECOND BYSMAL WATERONISHED FACEETRETES SIGNLAYING CARDSCHAIR " + "SYMBOLKHAMTI TONE-KHMIMIC KHEICHARACTER-18CALENDAR PADCIAN LETTER " + "-SIMPLIFIED IVE TWELFTHS OF ANTIMONYROUNDED ZEROHREE BALUDAE WITH " + "VEILGRAMMA SIGNHORA DIGIT ULO TWO SUMLACK SULFURTRAIGHT WAWL OF THREADL " + "TIMES LAL0 FOOTSTOOL WITH JACKSWHITE JOKERI TIMES NUNI TIMES BADESH " + "DIGRAPHACKED COMMATHIRDS SIGNLACKLETTER MACING FACE-OFF SYMBOLLEFT " + "SYMBOLLEFT SINGLEXAGRAM FOR ENTHESIZED 6 LONG NGGE-MINUS SIGN WITH " + "FLASHE2 TIMES ANLEEP SYMBOLLEAF CLOVERHEELED SHOEWO TWELFTHSHAGGAR " + "YAZHLATIN CROSSERCENT SIGNHEAVEN MARKDUATION CAPHEATED FACE WITH " + "COMMAEPIDAUREAN HAWH HMONG WITH CARONHANG KHUDAMSINGLE AND 5 LONG " + "MBOOLCE TSA CANMBA BAYANNALD SCRIPT XSIMILE SIGNMBLER GLASSLD POLISH " + "OLEFT DOUBLESSANGKIYEOKGRAVE-ACUTEACUTE-GRAVEHOKHLOM ON THREE " + "TIMESEORGIAN NARSTERED SIGNHLETIC SHOEACTIVE SIGNHITE DRAGONGSUM " + "-KHYILDYO CHRONONGUISED FACETONAL MARK UMAN FIGUREWASLA ABOVETIEE " + "SHEUOQTIGHT ACUTE WITH DASIASPIRATED FAHIGH STROKELETION MARKJECT " + "SYMBOLLON SKEWED JIHVAMULIYAUG RTAGS GYSVASTI SIGNINDICESIMA TRUNCATED " + "AEEZING FACELEU SATANGAINDERGARTENJOYOUS LAKEKAARA POLLUFOURTH ROOT WITH " + "TRILLZZA WA JALL WITH TITLOUISHED FACELOSED ENTRYSPEED TRAININ EQUAL " + "TOLOSING MARKLOTI NAGRI IMULTANEOUSUETTE BREADTUNE " + "COOKIEYEORINHIEUHIRCLED TEXTIPLE TONGUEFGHANI SIGNTA EQUAL TOISIGOTHIC " + "ZWING NEEDLEFINAL SIGMA-COPPER ORE WRIST FLEXFIRE ENGINEIVERY TRUCKUBLE " + "TONGUESYURA SASAKWINKING EYEIX TWELFTHSWE PALAUNG SYMBOL VIDJ WITH " + "MAPIQIEN MONSTERKRAINIAN IETRESS SIGN LTED FLOWERGE AT " + "NIGHTKTIESELSKABLTERNATE YAXI RADICAL LINE FILLERLU PLUS IGIGENTLE WIND3 " + "LONG NGGOTETRAFONIASXESTES SIGNTH-THALATHAEAVER DENE ENG DIGRAPHSTEAMY " + "ROOMGHAIN WITH THAM DIGIT LUPOVODNAYAIBLE-CREE YTWO FINGERSEUNJOMNDEUQTY " + "THOUSANDILIQUA SIGNEDICAL MASKILCROW SIGNABOVE RIGHTIL " + "FRAGMENTXTINGUISHERTENS DIGIT WITH GARDENEN STRAIGHTTRIAN CAMELGAP " + "FILLER-SMALL CLOUDSTORIC SITEGAYANUKITTA WITH PLATELT OF CLOTHETEI MAYEK " + "TRESVETLAYASECOND MARKPHNAEK MUANRISING DOTSBETA SYMBOLZIGZAG LINEUTH " + "CORNERSCURVED BENDRITING HANDBELOW 
RIGHTPODCHASHIEMUPADHMANIYAUTING " + "WHALECROSSING URPARAKALESMABLACK ARROWCROSSING BUCROSSING ENCROSSING " + "IMCROSSING PIRIPLE PRIMENSE CHEEKS PROPORTIONCTION MARK CTION " + "MARK-PERISPOMENI I ZAPYATOYAWNING FACEDE KIKAKUI VARYS ICHOSQUERED " + "FLAGQUIQUADRATEND OF PIECEVYKA ABOVE SHOE STILEND ODD SIGNSHAAYATHIYAVE " + "OF PEACEDENT EMBLEMNBLENDED UKRIGHT-LIGHTRIGHT-HAND UNJO WYNN W S " + "ZAPYATOYNIKOLSBURG POST OFFICEVA V CHELNUBANK " + "SYMBOLDALETH-RESHVAMAGOMUKHAPUT MORTUUMNG LEFT LEGRING LIQUIDDASH SYMBOL " + "DECORATIONCAN RGYINGSRPOON ABOVECARET TILDE OF FLOWERSOLD NUBIAN ORT " + "BARLINEAMUSED FACEORCE SYMBOLVISARGA ONERYVNIA SIGNCK SEXTANT-OHINGYA " + "YEHOF MASHFAATZERO THIRDSOF ENVELOPERUNNING MANONIAN SIGN OVER BULUG " + "OVER IDIM CH AND LAMPCHING CHICKCELANDIC-YRCE OF PIZZAOMAN SIYAQ " + "CCUMULATIONOPPOSING ENOPPOSING IMOR OPERATORBOTTOM MARKNYIS -KHYILCONTAIN " + "AS BREVE BELOWOUTHERN TSEROR-BARRED RONTHISMATAOVERSTRUCK COND " + "SCREENNUSVARA ONENUN HAFUKHANUMBER ZEROROKUTASTI ANUMBER SIGNCREDIT " + "SIGNNTIMONY ORE PLUS MASH2OUBLE ACUTEBZHI -KHYIL PLUS NUNUZURRENT " + "SIGNOUBLE DANDANITIAL IZHECOMBINATIONOUNDED FACEROSS ACCENTBUMPY " + "ABOVERCHAIC JNYAMIDDLE STEMASE TO THE AND MACRONDONG " + "TSHUGSDOACHASHMEEREAKTHROUGH TIMES ESH2AILLESS PHIRIGHT GUARDMONOCULAR " + "OMOVED BELOWDIATONON DIATH PRODUCTRANSMISSIONRIGHT HEAVYRIGHT LIGHTMFON " + "PIPAEMME LONG CANMED RGYINGSARAM GONDI UPPER HALFRESPONDS " + "TOAESCULAPIUSAESHAE NYAMARM SPIRAL ARMS RAISEDDOLLAR SIGNDOUBLE " + "SHADDOUBLE RINGDOUBLE MARKARPEGGIATO AGAZ DAEG DMICAL HEARTMIDDLE " + "BENTDOUBLE AND MIDDLE HOOKAGONAL SIGNDESK PERSONSHEQEL SIGNUNIT DIGIT " + "MUUSIKATOANMUNCIA SIGNRADITIONAL N THE VERGERACHMA SIGNATION SPACE TACK " + "BELOWRA SOMPENG ATION POINTRAISED FLAGRAGGISMATAOTING STAR1 PLASTICSZH " + "DIGRAPHFAHRENHEITQUISH QUADOSTAL MARKVEL SLIDERTHMIKON N 1 LONG " + "MBEURIPIGMENTIT MBAAKETC WITH DOTROUND DOT HEAVY BEATISMUTH OREGHT " + "LIFTERWO SHORTS OUT INDEX URVED OMETBSTRUCTIONHERMOMETERION BOTTLEXED " + "BICEPSBROKEN BARHAAPRAANA WING HEARTOUTER JOIN AND BREVEFINAL HETHOUTHERN " + "TAATRICHISMAOSSED SHEIVIOUS PAGEAYER BEADS AND ARROWOUND OMEGA AND " + "ACUTEFFICULTIESTAIL GLASSATTY WITH OUR FIFTHSRSI SYMBOLTWO SHORTSOON " + "LILITHOON SELENAEUTRAL YERSTRUCTION RGE CIRCLEUR YIG MGOUR HUNDREDR2 PLUS " + "SUYMBOL AIVAOP NKAARAEKAI SYMBOLKA SATANGAK2 PLUS BUGIMEL-HETHRHO " + "SYMBOLETTA-PILLAKINDI MVOPSTRAL SIGNHAMZA MARKI ARCHAIONTYPE COLONOPEN " + "SHELFCHAD RTAGSUR CORNERSCH BALLOONRGE SQUARESTROM SIGNTWO THIRDSRESH " + "BELOW5 PLASTICS OF DHARMAHEADSTROKEORTHERN TARIGHT SIGNIXTHS DISHROUNDED " + "ERF SHE-GOATT AND BOLT3 PLASTICSHUNGARIAN TIMES SIGNTING HEARTEVERSED PE6 " + "PLASTICSJONG TILE REVERSED IITH DIGIT SYLLABLE MZU OVER ZUCAPITAL ETOROME " + "SIGNVERAGE BOXPLUS BELOWIKRON ISONUTH OR SPYPLUS ERIN2TEMPLATIONHOOK " + "ABOVEPLUS NAGA BELOW LEFTWITH SPOONHAN DIGIT FRONT WALLY AND RICEGREE " + "SLASHRCHAIC KHAWITH STRAWANGKHANKHUGAGE CLAIMFTOGGOS OUGGING FACERING " + "ABOVEILE FOLDERIDDLE MARKIGATING RA DRAWINGS TERNATIVE PRALINEAR " + "GBAKURUNENTESE CROSSPPOPOTAMUSRIGHT HOOKIED SHRIMPTRESS AND " + "TREFACTIONHREE ABOVEXHEEJ CEEVIDEOGRAPH POLICE CARANGULAR TOTOP " + "CORNERGANDA MARKHOTIC HOOKPOUND SIGNIGATURE OEGAS BZUNG TRETCHED " + "CROEZENIAN INHERENT A AND MOUSEBOLD SHAFT2 LONG MBOING-SHIFT ANDHI " + "MARKING LARGE INITIAL RAROAD OMEGAAUTOMOBILE2 PLASTICSFOR RECORDINDU " + "BELOWTAMAN SIGNUSEL HORSEGOLUBCHIK THDAY CAKERED DRAGONTHAPASCAN 2 PLUS " + "ASH AND KNIFEUSHED FACEVIE 
CAMERA LATE FORMICAL TAPERRDHACANDRAWITH " + "WINGSASTERISCUSICK FIGUREPASSIMBANG KABA TENUPEDAL MARK7 PLASTICSRKING " + "FACE4 PLASTICSRECIPITATEFORMATTINGGUA PI MAOINDEX BENTBLACK " + "FLAGASPIRATIONGGRAVATIONBA SATANGALPAPRAANA WITH RAIN WITH PLUSA TANG " + "LAIED FINGERSNTITY MARKED FIGURE-N NGGEUAETALENT SIGN WITH " + "PAGEENETRATIONNTO SHRINESHMIRI YEHLEFT-HAND -LUE KARANENS SYMBOLLEK ATTAK " + "NAKE BELOWEDESTRIANSLENDED YUS POVODNAYALOWER HOOKALEF LAMEDCROSS MARK " + "THOUSANDSCROPHONIC UBLE DASH WITH RINGSHARP SIGNLEFT GUARDLEFT " + "LIGHTMONOGRAM BLEFT HEAVYMONOFONIASDIRGA MUREEONGCHIEUMMONOSPACE AILED " + "BIRD PLUS SHU2EARTH MARKW OR MODELCOMPONENT-COMPONENT OANDAKHIATUPPER " + "HOOKNUMBER TENDIATONIKI LTERNATE UA PLUS KURLTIC CROSSSBUB " + "-CHALENTHUSIASMLEFT SERIFA PLUS IGIEBENSTIMME WITH LOW DIGIT ZEROMONTH " + "SIGNSGOR RTAGSSMALL TAH EIGHTIETHSLONG FINALLONG OVER UP HARPOONZAR " + "AMULETNDU TEMPLELONG TSHEGCY MESSAGEDA PLUS HANGUAGE TAGUP OR DOWNUP " + "NEUTRALNGLICANA WLLOW HEARTDA SATANGA SCHROEDERSELINE ESHAB2 TIMES EICH " + "STARKABATA TREED WITH DOTLOGICAL ORAKKHANGYAOSMILO SIGNNASPIRATEDUNKIA " + "SIGNLHAG RTAGSLGIZ EOLHX WITH TAILSPACE MARKCURLED WAWNANGMONTHONOTE WITH " + "LET SYMBOLSCAN LINE-ND SEGMENTLINDRICITYLIMITATIONDED PERSONNDA PA " + "NJISE-CREE SKLIGHT BULBLIGHT BEATMOTORCYCLE WITH TICKEEKING EYE RGYA " + "GRAMCURLY HAIRELT BUCKLE RESUPINUSMEL SYMBOLMALL ALEPHSSANGARAEAON MEDIAL " + "E PLUS SUMCISIVENESSADAK BINDILANE MERGE WITH EGGS TIMES SHESS OF MILKU " + "CIN HAU UM ROTUNDAKRYZHEVAYAWHOLE NOTEST PALETTEOLON EQUALLACK JOKEROLING " + "FACEDUOUS TREEWHITE HAIRRUPEE MARKLA USED ASMEEM ABOVEUMAN EARTHSIDEWAYS " + "IZEIRO SIGNU2 PLUS BACIRCLED CAST-FEEDINGOMMA BELOWDOUBLE BARSSANGPIEUPM " + "STALLIONMINO TILE OVER KAD5COLATE BARAEDA-PILLAUAM TSHOOJRUDIMENTA " + "-SHAPED HASIXTEENTHSEQUIHOPPERALLY MARK LE LETTER ME PLUS " + "ENLE-DELAYEDCHECK MARKEARLY FORMUARDEDNESSADDA WITH OF HYGIEIAWHITE " + "FLAGMILLE SIGN WITH BASE WITH BELTMADDA MARK SPARKLERHEADSCARFHARD SIGNIA " + "SYMBOLHARACTERSSEMICOLONNGER SHIPZ DIGRAPHNCLOSING NFORZANDOSHAB CEEBLOND " + "HAIRIDEWAYS UARCHAIC MRFUL FACEQUSHSHAYAXHAUSTIONNG SANDALIDEOGRAM " + "QUADCOLONLONG TIP TIMES PAPSEPTEMBERQUEEN OF IALECT-P NDAILING ICE CREAM5 " + "CYPERUS5 LONG JO AND TAILWRY SMILEWORDSPACEMRACHNAYAHINOCEROSHOT " + "SASAKMAEMGBIEEWRINKLES HIMA SIMARED JOKERMUKPHRENGRCHAIC IIHIYYAALAAREAK " + "HERE TIMES HAM HE-GOATRDEL DKARRCHAIC RALVIN SIGNREDNE ON APODEXIAHOOK " + "MARKMBROIDERYZAL SASAKMALL RINGHWAZ EH E3 PLUS ANTIMES NA2RIED FACE5 " + "BATHTUBLOWER DOTI PLUS LI STREAMERMHANCHOLLR PLUS RA " + "TROMIKONMETOBELUSMARK CIM ZAKRYTAYAHREE FOR AND CURLHI SYMBOLMARK SHADNA " + "KHONNAXCITEMENTREFORMED AND BELTSIVE FACE TIMES UDISEN-ISEN PLUS LAL " + "PLUS KU3ROTATION-OTAL SIGNOF STIMME-STACCATO PLUS GUDT ON BONE PLUS GALS " + "DIGRAPHODIASTOLET OF MEATLARGEMENTYRANISMA OKED HEADITRA SIGNZERO " + "SIGNOKED TAILLAN SIGN OF BLOODIVE-PULL-IVINATIONNVERTED ROUTH WIND PLUS " + "ZA7 PLUS TUROUT MOUTHYEAR SIGNYEH ABOVEYEH WITH OURA SIGNORTH " + "WINDTAKHALLUS PLUS SAGSPIRITUS IRST MARKTABE SIGNOCCLUSIONZENE RINGON " + "GROUNDL ME HANDKYO TOWERON TEUAEQSTEBASKETRTER MARKRUM CLEF-OO DENNENKU " + "RU KHAKSTREPTON OVER LUMONE MARK- OVER BALKEMPHRENGONE THIRDSTRELNAYARTS " + "MEDAL0 LONG LEONG GRAVEKING BOOTONGSEONG " + "RPORATIONOKOUFISMAORT-TWIG-SSANGSIOS1 CHARIOT OF PAPERJERUSALEMLACKFOOT " + "RWARI DDAOM SYMBOLK GESTUREKA- SHOG KAMEYTSA OP HALF OSTAL BALLPLE " + "HEARTLITTLE UP GARSHUNILISSANDO IGN NUKTAIGN SAFHAIGN 
TOMPILINE FACETEH " + "ABOVELIGHTNING-AMMONIACIGHTH ASHTED PLANT RICKSHAWNO TELEIAPIDERY HAILE " + "TILDE247 DIPTEILIPPINE Y BLOSSOMNIGHT OF NGUN SIGNPROJECTORZIR SASAKSMALL " + "YUSPPOSITIONLLABLE OMPPOINTED LLABLE B0NIGGAHITA RA OR RINIHSHVASASOF " + "PASUQ FROM BARLIVERANCENING SIGNIGH HAMZAP ELAMITEING LANESP DIGRAPH-LOW " + "TONEING STONENTRACTIONINISHMENTROJECTIONINNYIIYHELEFT " + "TACKNUSVARAYAPAA-PILLAOW KAVYKATANDSTILL2 GARMENTOVER MUSHLEFT RINGOVER " + "GAN2-MID TONENTERPRISEPENTASEMEPENT SIGNIN SQUAREINAL NOTENSERT AT " + "INARBORASRNEY PARAY-FOURTH Y-FOURTHSRO WIDTH NTESSENCE-KHYUD " + "PAPANYANGGAING CARD ING DOLLSPADE SUITING GLOVEED DIGIT ETRASIMOUEAVY " + "DOWNURNED AYBBITE LIPSEBIT SIGNTRESVETLOAVE ARROWETTI BALLCHOSEONG URLY " + "LOOPFROM WALLUTRA MARKFACING UPED PLANETABOVE TO UPPER DOTATHAMASATAL " + "RUNOUTCORN FACEVIGINTILEUURDHAJA UBSTITUTEANG CITI URNED GANFEH WITH " + "TUKWENTISDEPARTUREURAMAZDAABKHASIAN ANTHAKHATDENT AND VERLONG " + "AAJANYALANUR-DE-LISACE NOTE ALI GALI VRAKHIYA G IN HOLEA PLUS " + "NAVELOPMENTAOS ICHOSCAPITAL QGREATER YANTAYALANBICYCLISTCAPITAL IANSKRIT " + "SUE MAEMBAGITTARIUSBIAL SIGNCARTRIDGEDAD WITH B DIGRAPHEIGHT OF " + "CRESCENDOVISARGAYAVOCALIC RBEER MUGSVER LUGALD SALTIRETUTEYASATCANG " + "TE-UTONE MAI EEN WITH ER BUBBLEVICE MARKBING CANEGRIK SIGNENTRY SAWWITH " + "FACEATTACHED EFAIDRIN CAPITAL DANGGEUAETEFORMED TARISTERA HALF NOTEFISH " + "TAILEMPTY SETDOWN SIGNDOWN STEPCOIN SIGNADMA GDANBASE UNITWING STAREURO " + "SIGNADEG ADEGARM CLOCKAROSHTHI VOETOCHIEFINAL NUNCHANICAL CUBE ROOTCLOSED " + "PLESAME DOTALPAPRANAES AKURU EMBEDDINGAFFE FACEFLAT SIGNAF PERSONBOTH " + "BENTTREDECILEALAYALAM ERTY LINEBO GYFU GHALSHELETTTED STEMDOWN HANDBO " + "BAIMAIHALF SIGNELEGRAPH AISED DOTFINAL NGABRUL SHADFOUR BENTAS MEMBERETER " + "SIGNTO CORNERERCIAL ATE AT LEFTUNGSEONG VANAGARI URUZ UR UVINE " + "LEAFUPTSTIMMEUVUZHAKKUAINTBRUSHFINAL MEMDRAM SIGNHAIKSUKI " + "UNGLASSESCHAVIYANICOMPLETEDWASH TAILUMED HEADELLOWSHIPTRAIGHT UDUS " + "RTAGSVEUAENGAMANEROSIS KAIYARAAEVEN OF CHATTAWA OVER " + "KGKATAKANAKASRATANETRASEMEL POLISHETA SIGNCK CHARTET SHOESOHM SIGN PLUS " + "DI PLUS DUL-LAKUNAEST WINDLA LENGACLIMBING OVER ZIEUFEUAETONE FOR OVER " + "MUCHINESE ON CROSSOMMA BARCLOSED TOMANIAN OM NTEUMOLLOWINGBUNDANCEBOX " + "TRAYOVER GA2OVER BU FILE BOXBRA FACETTO MARK8 KANAKOYBEYFILIROSSED OANC " + "SIGNYENISEI IRD MARKYER YAGHTAI LUE FEBRUARYTAALUJA IS FORM BOL SIGNING " + "ROD LANTANGBOT FACETAR EYESOVERRIDEIS WHEELTTENTIONOVER TIROVER SHEOVER " + "SAGOVER GI4FINAL THCASSETTE1 BARLEYJACK OF " + "JAVIYANISWIMMINGEXCHANGECEILING RSE DUNGJUNCTIONSUPERSETCER " + "BALLEVERANCEOO TYPE SUCCEEDSCANDICUSIS-PILLAC SIYAQ OTIFIED YESIEUNG " + "NUTILLUCABLEWAYITA MFONOT MBUAETURNED MCAL DISC OTTAVA AMS HORNT NGGEET1 " + "HELMETYIDDISH ORM FEED OF YARNOREHEAD ON LEFTNAVIYANIECH YIWNLTRY " + "LEGEBEEFILILUB SUITSMA SIGNNCE SIGNM ALLAAHED BRICKULLS LEGNAMENNY " + "ZAKRYTOEAIYANNOINA METEKN-JOINERSIX DOTSACKSPACELORRAINEABAAFILIWBOY " + "HATABOAFILIDAMMATANLONG BARNG RTAGSDANTAJA LONG S TNEUTRAL E OF " + "POOUKEUTNDALOW DOUBNEIFORM LOW STOPNED FOODDDY BEARLOZHITIE " + "SLIDINGSIFISTONHAN-AKATDIM GUNUUNG DASHAEN NYAMMON TIMESHORT ERSIGN " + "LAEMEM-QOPHUNDERTIEUNDERDOTDIT CARD TTUDDAGMMATION MIONIAN DOCUMENTW " + "PRINTSDUSHENNAMALL AXEMY HOUSE TALENTSMANDARINDVISVARAMANGALAMDVANTAGE " + "SCOTS SSHKIR KAMARRATANDS-CREE SHOE JOTDIAMONDSWASH KAFDIFONIASME " + "BADGEUATRILLOERAL URNER TRUTHALLIANCESALT OF VOLUTION-PHIEUPHUAREG " + "YALEANING SQUEEZEDYRILLIC EOUT BOXVOMITINGCOUNCIL 
COUNTERSA SIGN " + "AUBJOINERENICIAN ESH LOOPODESTONE0 BRONZEOCUS OF OCK SALTOCALIC " + "MYPORROON-X BELOWOBOOFILICOMBINEDEREVODKAERDIGRISLATION XSNA LDANSE " + "WEDGEELEPHANTEK ONKARNITIAL ZD BUBBLESOFTNESSD CROSS NINE OF SCRIPT " + "GLKULIZMYUP TRUCKNI ABOVE YUQ NAEUDAWADI SATCHELEGORIAN " + "SENTAGONLOCATIVENOTE PAD POLNAYA-KHIEUKHSPERSIONSANYAKA EN NTEUMNRES " + "TOSLESS SHALESSER YNOVEMBERS OCHKOM-EM DASHLF RING LFWIDTH RASWADI-CREE " + "THCURLICUENO THUMBCURSIVE NO SLASHY BEETLERDEL NAGIMANSIS GBASINNAASTERN " + "WGLASNAYAAZHAAKKU CURRENTTO-LEFT ATAKANA XCELLENTVERGENCEATE " + "MARKATEBOARDTHOSCOPEBINOVILETICK IN PENTAGONAPITAL FRILLIONSREE " + "MARKINAGARI ARTYRIA RED HAIRBACKWARDFRAKTUR BATBEIT QAIRTHRAY " + "POPPERHESPIAN REATNESSTHIOPIC BACK YERANS SIGNFRICAN DPAVIYANI ANTENNAAST " + "WINDHOP BELLQUINTILEBEVERAGEBER POLEGORAZDO HANDLESAVY BANDTRICOLONGREAT " + "SA CEDILLATER FACEIGMOID SWRINKLEDVE SASAK3 ARMOURWRITING RAMMA GGRAUGHTS " + "BILLIONSATH MARKHREE OF RASMIAN GARITIC BIEE FONTRI DISHWON " + "SIGNAY-NIGHTRIYOOSAN AT DUSK56 TURO2FLOURISHFOR STOPPALOCHKABLE " + "SIGNICHAEAN ARCASITEPUSHPIKAZWJ THAJV OVER MAR " + "TSHESHARBAHAYZWARAKAYHARMONICBLINEAR PAKPAK ETIRRUP RTISMOS EANE " + "TREEARKLEAN BLED CARHAGALL HWO ABOVEPRECEDESHALF GURGENITIVEVESSEL " + "BPROSTAYAPUB DAWBPAIRTHRAARSI YEHRESVETLYWN HEARTI SHAKTIING BELL KEMBANG " + "FACING ING BOWLTOWARDS ARRIVINGPUN IYEKPTHAHA SOV ROGLF FACE RAMBATAY " + "SIGNGOLIAN VAYANNAVE DOT QUEEZE GHEUGHEEL PUMPUBUFILI-WELSH ERNIN " + "ANJAEMLILAMITE ZQAPHA D MADDAD MOUTHIBIFILIGRADUALPSTICKSALLOT " + "X-TIKEUTSCOOTER CHIKI LASHES CER-WAAXIMATAQUARIUS-CREE RIANGQI LIGHT " + "XCOMING 3 OMEGABAMBOOSSOLDIERTRAINERA NAME VAPOURSVANESE " + "THESEOSPUSHPINSANDHI CRACKER-MU-MO--SHIFT-3 SPICE3 SWORD-MACRONENSHUETI " + "RTAGS6 NGGOOI NTEUMSAMPHAOLE LEAFVOICINGPURPLE A -PHRUSPRINGSCOPTIC " + "THIEUTHHYAAUSHNUMBERSSA VAH BAIRKANSAYANNAVAV YODCONTACTEN " + "LEAFS-SAJDALEUT KAVOWEL K-THIRTYTHKUQI SANGAN ALESMA GLAGOLIER " + "THAN-KIYEOKLEYBALLNTAINS LAYANNALEK TOO3 WHEELLENGTH-TORNADOAS " + "SIGNHAARKAADYNAMICSHIFT TMANCHU WO WAENMUNGKAH TEDUNGMARCATOVYSOKO DU " + "NJAAWO MARKMASSAGEMRACHNYDIARGONDRIL BUMAAYYAATIKRAMAEAD OREHEXAGONUM " + "IYEKMAI SATTIVATE VEW NOWREATHY ASHTRA ACTER TDHALATHE GLASSE DRINKAD " + "NECKASH FRODIPLOUNDISIMOUMERICASUN MEUTAETMEUNHANGUL DOFONONSHORT AMINIMA " + "MINGKALSIDDHAMARDNESSAHAPAKHARRED OMBOL B0ARRED BREREKANHEADINGWO FOR " + "RESILLOHALANTASIGN UD5 NGGEEAELAENGHAYANNA WAAJIBMEETORUAU " + "MARKVEMENT-DANCINGDANESE WOLOSONG MASKRAKHANG SHAKERHIUCHUSNESTED " + "SERPINADAYANNAUKKAKHANEQUDAADA FACENIKAHITLJUDIJER2 GUNUEIGHT KUP TACKUP " + "STEPUP SIGNWORSHIPRA REPAAPEZIUMAUNTLETAULDRON BUTTONUP " + "MARKWDRIVERLYGISMAEAVY YAATAEAN ASUTORUDEAVOURRD DISKRD FACENAYANNA " + "STRIDESHAKINGNANCIALHI SIGNRDO RJE APLOUNUP HANDRANGKEPRARIETYATH OF ED " + "RICEWAZ EOHSEXTILERAYANNAECEMBER SLOWLYTAISYOU3 AREPAYMAIC " + "LBULANCESUKUUDOBUFFALOOUR OF RISIMOU9 CLOTH MENDUTRTHIAN OUT HUB2 WOMAN " + "MUQDAMJIBWAY ANGKUOQ7 NGUAN OPEN-O MUOMAEONTIEENBLACHKOWIFRUITCELSIUSOP " + "MARK KEFULAXOPHONEEULEUNGOVER ANCHEINAP0 WHEATTTHACANANGLONGKAYANNAFINAGH " + "0 SPEAROVER DUVILIK BYNAMIC FORKINGRIPPLE " + "CHEVRONKEUAERICHIEUCHTROLLEYUSSYERUTTILIK BREVIS YELLOW BERRIES3 " + "EIGHTBERGINETALL AAPHUTHAOONGONANANGLED KARO BAONG UEXPLOYAN URFACE " + "URGLASSPENGKALCAP TENISIBLE T ASHESRMUKHI ISLANDF SASAKPAYEROKIVE OF " + "IMILAR F DAVIDOT NGOMITALIC PECTIVEOT REPHPEGERMAFATIGUE " + "OCLOCKORTIETHCANDRA ILLEANNCABINET7 NGGUAITON RA1 ARROWAN 
MARKJAIN " + "OMJARATI TCHFORKJAYANNARRECTUSJECTIVEWIGNYANCAYANNAURATIONTAYANNAJERAN " + "JIL DRUMBIG YUSORKHON FAYANNA26 EYYYPAYANNATA MARKOREVMA SYNAGMAIKHAHITY " + "GREENORCULUSUT TIMEPERVISEANGOLATCK LIMEPOVODNYSTERINGGENERALFLUENCE9 " + "NGGAAKUTAARUKYLISMAESTIVALCLEAVER3 MONTHKPAK " + "WAVILLAINKOMBUVATYSCAPEOLAPUK KOQNDONKORONISINNABAR " + "FLEXUSOKRYTIEANDERERALTILLOPRENKHACLOTHESLAGIOS " + "ROGRESSTHALIYALAK-050OCTOBERIC WANDOCTAGONCOASTERP PIEETICYCLESOGDIAN " + "OWILO SL SEGNOBARREKHPPROACHOFFICERST TUBEUYGHUR BORZAYAOF SOAPCLOSE EOX " + "BACKICOPTEROX LINEROKEN L2 OLIVEYA LAMPOMERANGPALLAWAPOMOFO " + "LONSUMKKURUNIETNAHTATASHEELYAH LI TRYASKAPANSIOSPANESE YAYANNAKHA " + "YATGAYANNAFINAL YBOURINGON FACEYANMAR MAELEEIFIED " + "ETSECHKABOARDERAMAKKANOW ALEF PLOPHUAM ALEFRY " + "FACEARADDOBOWTIEPBOARDDIESISROCKET TIKHYNACLESSICKLEBLINK " + "DICINEDOKMAIANCHORRENGTHAPYRUSAJANI PECIALVILIANAPLI MURNAMABISHOPDERMA " + "PALUTAPEAKS BURGERAEMMAE AGUNG MURDAASSINGVERTKAC CLEF LONGA " + "LELETUNGAAMARBUTADGEHOGN DASHSHAYIMVIRIAMSHMAAMRICORNREMEDYZHITSAASHGABOW " + "TIE KAPALAILUREN-NISFARSEOSMPLING MELIKN YANGOTTED-BOFILINSUZ ANOWMANON " + "KEYNOZHEKSAUCERNSANAQPOKOJIPOMMEENTEVMANTIIMUCHURCHNTOGENUUMISHCREASECRAYO" + "NCHEMA ONOCLEANUARYNOKHUKCHEIKHCUPPEDNOR BUCHESS CUMBER " + "QATANBEFILICHIRETCHO CHOBELOSOFOUNDPUFFEDCLOSETS " + "TENTOGONEKODHADHANIMALBANWA EPOCHS SHOE EQUIDOCENCEOCIETYCODILE " + "DIPLIUYANNAQETANARIISAPQAMATSNKNOWNOITIC " + "PWATCHUZEIROBAFILISAADIYCKNESSRAVEL-PEPPERAPISMACARIK CASTLECATAWANEUME " + "AKEUAEBGBIEERAKLITATTERYATTIC BISCUSCALATEOSETTERKAANU SPLITAK-668NDA " + "TARBITSAPENCILDE DOGAKABATUP BOWNISTERRSHANAPIRIT OOMUUTRSIAN " + "CARETNIRUGU RULERRSENICCEVITUNIZKO " + "RISEMEUPNAYACHADINCHAMKOANGKATOPITSANGBAT NCH " + "FRPICKETRACINGDAGGERRAAKANOPEN POPEN DAUTUMNBETAN OOPED SOUNAP9 MUENKE " + "PHOKAYAH UBLE XEUNYAMELLITELIGIONLIGON 2 NGGUI MAIMI HOOKKASAR 2 MBOO6 " + "NGGEEUREUTSTROFO-HIEUHEN GHEEUAENAKEYCAP-HIDETEMPUS SPATHIGHAMALIB " + "YAMLEVEL-3 NGGAIASMA WEORTHGHETTISPADESGHEUAEEMASTIHORT " + "IGOBLINSUCKEDEVENTHLONG EUGGAGEGORGON00-102GO NGUEENTH-INSHIPSURANG4 " + "DART4 DEERWRENCH4 KPEEINGAATLISHA HUR PALITIKIUCIBLEHUMBS EIGHTYGLAZ " + "HINHALETARGETUDARKA2 KPOOLLIPOPAASHAETOPBARGNANT LAMEDHYOMBO TE USETE " + "TSEERMATASTLERSUAEQTUXO NEOSTOLI LASTON7 NGONKRISISLD " + "MAPFRAMESUANGXIKNIFE IGGLESGANGIA3 GBEE3 HEEIYRENE STANCYSTANCE7 MBEEKY " + "WAYESTAN 7 KAPOU MBITILBOAT7 MBUU7 NDOOIN YEHKUSHU2LAFRONSSLESSET " + "KUTILLAGETRIKE 9 NJEEKTIKO 7 GUANLAMADH6 TREEENIKI 0 " + "NGGOGEDOLAKILLERTRAPLIIDE ESFORMEE-IEUNGYSTICKINDHI GEADALEU " + "MBUTAUROSTHAKKUGGLING0 NYONA-KARA0 NYUNTAU ROEPACT INAGMA-PIEUPSPLIT " + "KLITONTERON SPITALINCUNXX FACEA HAAMXIMIZEIEVAN GBASAQTEUWEN0 " + "NGGIENTIMAFORTIS1 WINE8 NYENMANYA WN BOWWN BOXSYOUWA8 NYANTUXEDOF " + "CLEFDVANCEDUCEUSHERMESIX OF HEISEITIMATEF MARE1 GBOO1 GOLDIXTY " + "PHIMAHUGURAMUMADDAHMADR MEYANNAE WAVEIYANNAMALGAMITULUMAGRANTSYNAFIHIBIT " + "5 WOOLMALL FWINDOWTIKENOHEUAEP8 MBEEITABLEAFFIX TURBAN1 NDEEFATHA " + "HASHKAFAMILYISSIMOHAM AIHAMEDHISSHARHAMILOISSANTAGOGUE5 MERIWO OF ME " + "DIEFF OF MECHIK1 HORNTAIKHUTIRYAKITHER HE MGOAESURAT NJAQHALF " + "HIRINGUTAMINGEXHALE8 HOOUHO HOISKAPI 4 NGENSIXTHS4 NJOOM BOARM " + "BULLHIVETEGS-PA SURED YAKASHED ICEWBERRYED CAPGRASP 4 MUANWORKER6 HUAN6 " + "GUEIYIN-DOSWORDSEXISTS4 NYINHINGE EAHMUKXYOOJFLUTEPEAN 8 KPEFEARN8 " + "GBUFSAAQRONOSPAATOBREW INNA PASEQ2-VASZHAINPATAKIMMERINTHU1 " + "WVIIMGBAFLAGSPCHA LACA7 NEN7 MIN2 NJA2 HEN1 PEEANGELOTHALEYBUSBISAHILLU " + "2 
NJUPEITHZIDI 8 FEEILVERYAMOKPEN O JERA2 HOOPEN-P1 TWO2 POO2 PTEYECEK2 " + "MBUROGOMWISADBORZYTSADI2 SEE MOOD1 YOOTTOCK2 MBEBOOTSFORCEBSTERTTORU1 " + "TEE2 MBA1-VASIRACYTUEUMIPEHA7 TWE KAWI7 NIN8 KPOYENAP2 KPA2 " + "KPIBLANKTSEEBWINDUBLAKOOUNCEURTLEIPINGWINJA7-" + "VASFLICTTSEREHIRIQHISTIHIUTHATIYA4 NDO6 GBARAIDAHOLAM4 TOO4 WUIATAF 4 " + "WOO4 VOOWLINETON AGVANGGURE HOLARHIMELRATERGULUSRASHAAWAY " + "WU318WUAETAVROSHOTELGORGIQUIRY6 KOOHOUR 32 JE CHWV4 KPU4 MON4 MBO4 LOO4 " + "LEERACHYAUTHS4 GBIZSEKAR-RUB CAPO4 ABB AMPS5 NDUHASE-HATHIHAYINHALA AR " + "AERIEENRIEULHAINU5 KEEZYGOS5 MBI " + "ALLORICEMHANNATINNETIPPIARERUHALQAASPERTILES4-VASHETHERDIONHI " + "ROTIGMA5-VASRCHIDRELAARELA " + "REIWATKAANREGIATMAAUARTARHADDAAPPLEHAALUASEIAPMUNK3 HINPLUTOPLUTA HAA " + "UTIESBENDE GORAPLHAU3 FOOGAMALPPAGE6 WEEBASSAGEAN 6-VASPONSEPOLI " + "FUJIZILDEXING GAZE-BEITH3 HON ICONBHADHBHETHRITSITRIOLIKARAIHVUSXTRA- " + "ILUTTEGEHIKURUXW XWUTEUXPITER3 BOO7 FUA7 GBEGESH2GADOL7 HUNPI RO7 JEE3 " + "RA3HUTA TORSO DEKABAARUTHINGRILLA3 VEEQAAFUI KOIBACUSTRACKGOGI " + "TORCH3-VASHROOMI-RESVATOR3 WEI COATHUMP 6 SOO6 SIAICHON6 TA2ICRONBASA " + "PTUNEGHULU6 RA2BALAGTRAIFVIET GHNUTPEPETPSILIIARDSIAUDAVAAVU3 " + "NDIANNONNNAN -ALAFAAMAEEKEET-BEAMUBURUUBUTSEISMANIS " + "FYURIICUBEDEMBICNINTHAADHUSOLVEWBOATLOBE " + "SENTONGENTEGALILOMKAEESHIUGUSTVRIDOLOOP UDAATNIEUNEIDONNGUE SARIEGL " + "HDAIC NGMANEGIONLOAN ALGARLEASEWFISHLEERIEOPLEEO-EUENUTOUBITOO BOX9-VASO " + "RUAO PLA-SIOSLAYARO KAIO ANGSADHENZEUMSAKINALLI " + "ALLEY-RINGSALADCROWNNSYONNSUAENSIEESATA " + "ENANOSAUILEMLJACTRICNUENGENJETNTXIVENENGOTERIA UNADATUSNTHA A " + "YUESPINEUMMERMSHAEMROCKUNGBAMPIRESHOOKSILA3MPAREVZMET TELUMALONMUOY " + "SHIMAADULTMAQAFMUCH DSMANMI ROSHTINDKAR " + "MISRAMETRYDLINGWAQFAMINGOSICLEAGMA " + "MIEUMWAAVUDOTS-" + "MELONAEMAEMEEMUMEIZIAEPENDWICHAEREEMENOEMEPETMETEGMMOTHUNOO LURALEATH " + "LWAY " + "SKATEEBALLDELTANCORADENCEDEPTHUKARADBOATLOURENENOENEMKANASHINEGARDESTYNA " + "POSHARUMADYAMAI KMAIZEDHAM E GEEWATTOMAALAACHKAUNITYM " + "RAMMAAEHENDEPSHANGEAGLE TABSSHAR2SHARANADA MACUSNABLACHULASUKUNCIEUCF " + "COWRUSH CHUTECCEPT1 FANOMBIEEYYAL0 MANAMEKH8 NWAK-020RYASOTUUMUSTNUTOLD " + "XRRITOKO LA9 MUNITUALCANUSCEREKSTORMROWN F SOWYIZETF " + "EWEKNOBSUQUETCHADACAUSEEURAELATIKRUHUAKARORRUDAA9 DEE0-VASOQPEN9 KUAJANG " + "WIANGCAUDA8 RO2EVAL 1 DWEKHAPHEUAEMCHOOLCHOOIRUMP-CHIME0 " + "OILRULAIKESH2KERETCHESTCHERYKBALL9 MENU U U0 DWOERKHA8 " + "MANCCOLICAKESCLUBSJUDGECAKRAURITYLAGUSESHE3JUDULALOG " + "LABOROPLETLABATVITAEFAIHUOBYLAOCADO0 BEELAMDA8-VASESO E9 WVE9 " + "WVAJERVISURYAISTLE0 DOO0 JOO0 HEESTARTKUSMACKAGEKURONURINE9 YEEET TUOJKI " + "8 NANCLONECALYAORUTOOKARACECAK9 NDEOKEE 9 NDACAANGITHI 9 " + "PU2JUEUIOSTERALPHA0 GBOCECEKCLIFF9 NUNL-JUZ9 NONL NET0 GEE0 " + "HANCKTIEKWAENFAAFURUISROOKVEYZRRORRROISHYATUKIZETATURUWAIRWAHAYEUXUNAVWAET" + "WAAKUNAHSINKRPSEVEUXSIKIRIFYURUSVESTZATAZZY TZELZIZ2VOS " + "YUDHRUSISOKARUTUYUKUZELOZAYNUTANSA-ITAXIUTTYTFONXEYNZIETXEIAWAW " + "SUABVIDAUON SLURULU SUNGRSO-RT TRUNARUNGROA " + "YWAASELFWDERSEEVSEENWULUROARUHURRUKUVIYOVEDESEYEVUEQHEYSHEENHEEPHEROHERUHE" + "YNHEYTHHWA2 YAHID HIINHILDHAVEHAYNHWAA2 SOHUEN2 RO2 QOHSHUHWAH2 " + "PEIANOIARA2 NOHMI 2 VIHOKEHOM HOPHHOSTHSDA3 MUFFINFIRIFITA3 PA3 MI3 ME3 " + "TAEZZO3 YU3 LE3 RIFAIBFASTFEEMFETHFEUQGIBAGIDAGIEAGIR2GOALGORTGROMGRU " + "GUINFWAA3 L33 KU3 JO3 JEGAMEGAML3 EEGEDEGGWS3 A3GHOMKMA KOBAKOETKOKEKOKO1 " + "KU1 KIKWAA1 IN1 HA1 GA1 DULAAN1 DO1 RAKANGKAPHKCET1 QI1 " + "POKICKLFERLFIELIFULIUMLIWNLOLL1 DAKALIILUYIK HINORINY 2 L22 KAIFAT2 " + "BUIGERIITOJOT JEONJIIM1 
YI1 VU1 SU1 SI1 SAKAAFKAD3KAKOIPODIQAAISI " + "1358ARGIAROO7 JAATIMAPAQ7 LUAPON7 KIARA36 NA6 RU6 QABASH6 POBAYI6 LA6 L66 " + "JOATYA7 EIAULA7 DD7 DA7 BE6 WU6 SEBAGSBALD8 QE8 PI8 KO8 JIAAMU8 GU8 " + "FOAFEL8 EN9 JA9 TA9 TO9 TU9 SO9 SI9 SE9 PI9 PAA IEA-HA8 WE8 SUAACUALTA7 " + "VO7 TIAMLA7 REAN X8 DU8 BOAILM7 ZAALDAEAAE5 FE5 FADZHA5 DE5 BB5 AU5 AN5 " + "A2EANSEEEEDGER5 LIDIM2EENG5 JU5 IN5 GI4 FI4 NE4 L44 KE4 DO4 WAEENUEETA4 " + "ZEEHEH4 WIEIPTEIRTEIWS4 TU4 TE6 DIBUNGBUOYCANOCASECAYNCHAUCHEH6 " + "HIBERDBETH6 JE6 HEBOOKBORE6 FU5 VACWAA5 TODAGSDAIR5 TEDDAKDDHI5 OODEAD5 " + "NU5 MOCKEN5 WE5 WACOONCOREHUVA5 VECRETNDAPPAWN0 " + "BINCERPEEPNAM2NHAYOXIANGA2 OHMNET NEO -UM NDUEPLUMMUASPOLOMPET0 " + "NIPLUGPRIL0 PUMMU2QEF NAG NAAUPEUX0 HO0 JUPHABPHIN0 " + "KOMVATMUINOBATOFUMOENGODLEOONEOBROO-YOOOTH R SOJI ONA " + "WEBNRUAOUBTNPEANOWC-ONE-RAYNJAMORAXNWAANUTSORIINUNGNTOCNTAANSUB0 ZO C " + "DRAFE0 SA0 YEQOPAMFAALUMNMARUMESOMARYREIAMIIM028BMIINMLYARGU2LOVOLUIS0 " + "WIMEARQHAU0 RADE6DA2UOPZUP8 ID70D42WAU5 " + "UCYACWIWOQUEHUEZ8F04-" + "0UDYA7AOMSREX9819E3UMXDJAE80DZEVOKAUMAUJAWXZOOZJEB89B576-" + "0620AZUVAUAYD6D7ZORQ00PUQQIGQIF7 " + "OQARPOQQUFVNOQOTQOFCA9550557BXGCAHBUD5B68 " + "AUQAAG-CAIVOYAL2BAU72C5-0VUUBIBIMNYOT18D15514DIWRMU Y00I-IHOJHOX0 E0 " + "UL000-0LJE04A0B9LFA1 XSUUJHAK00121JAH1-21-0JEUKUEKAQSIIFOMFLYO " + "YOAYXEHTUJFAJOEH3 IFUEES-OIX4 " + "EF8CF143-0XAUEZHEYKXAN305X0031CXWVXWG25320BNII-TE3 " + "DTJE2DD2-0HHANIB40488309713938291716494B4E1D1AQWR7R0C0D0VDW099F39092G9G3"; +uint8_t UnicodeNameToCodepointIndex_[239405] = { + 0x00, 0x05, 0xc0, 0x00, 0x6b, 0x15, 0xc0, 0x00, 0x95, 0x12, 0xc0, 0x00, + 0xdd, 0x06, 0xc0, 0x01, 0x03, 0x14, 0xc0, 0x01, 0x27, 0x18, 0xc0, 0x01, + 0x41, 0x16, 0xc0, 0x01, 0x57, 0x03, 0xc0, 0x01, 0x7b, 0x04, 0xc0, 0x01, + 0xd8, 0x0e, 0xc0, 0x01, 0xfe, 0x17, 0xc0, 0x02, 0x22, 0x0a, 0xc0, 0x02, + 0x3f, 0x0b, 0xc0, 0x02, 0x5d, 0x19, 0xc0, 0x02, 0x7d, 0x08, 0xc0, 0x02, + 0x95, 0x0f, 0xc0, 0x02, 0xb1, 0x0d, 0xc0, 0x02, 0xd1, 0x10, 0xc0, 0x02, + 0xef, 0x1a, 0xc0, 0x03, 0x15, 0x07, 0xc0, 0x03, 0x2d, 0x09, 0xc0, 0x03, + 0x84, 0x11, 0xc0, 0x03, 0xa6, 0x1c, 0xc0, 0x04, 0x0a, 0x0c, 0xc0, 0x04, + 0x2c, 0x42, 0x00, 0xe3, 0xc0, 0x04, 0x44, 0x1b, 0x40, 0x04, 0x5a, 0x03, + 0xc0, 0x04, 0x6e, 0x43, 0x30, 0x23, 0xc0, 0x04, 0x9d, 0x0a, 0xc0, 0x04, + 0xaf, 0x14, 0xc0, 0x04, 0xcb, 0x11, 0xc0, 0x04, 0xea, 0x0e, 0xc0, 0x05, + 0x25, 0x0b, 0xc0, 0x05, 0x37, 0x17, 0xc0, 0x05, 0x4c, 0x07, 0xc0, 0x05, + 0x72, 0x1b, 0x40, 0x05, 0x8a, 0x07, 0xc0, 0x05, 0xa2, 0x0b, 0xc0, 0x05, + 0xe9, 0x16, 0xc0, 0x06, 0x07, 0x03, 0xc0, 0x06, 0x24, 0x0d, 0xc0, 0x06, + 0x60, 0x0e, 0xc0, 0x06, 0x6e, 0x0a, 0xc0, 0x06, 0x7e, 0x05, 0xc0, 0x06, + 0x9a, 0x10, 0xc0, 0x06, 0xaf, 0x11, 0xc0, 0x06, 0xbf, 0x42, 0x00, 0xe3, + 0xc0, 0x06, 0xf1, 0x1b, 0xc0, 0x06, 0xfb, 0x12, 0xc0, 0x07, 0x0f, 0x17, + 0xc0, 0x07, 0x2e, 0x0f, 0xc0, 0x07, 0x5a, 0x19, 0xc0, 0x07, 0x68, 0xcc, + 0x85, 0x35, 0x01, 0x4e, 0x60, 0x14, 0xc0, 0x07, 0x78, 0x0e, 0xc0, 0x07, + 0x8a, 0x0b, 0xc0, 0x07, 0x92, 0x03, 0xc0, 0x07, 0xbb, 0x11, 0xc0, 0x07, + 0xef, 0x07, 0xc0, 0x08, 0x1d, 0x17, 0xc0, 0x08, 0x3f, 0x4f, 0x62, 0x1f, + 0xc0, 0x08, 0x5b, 0x0a, 0x40, 0x08, 0x79, 0x07, 0xc0, 0x08, 0x87, 0x0b, + 0xc0, 0x08, 0xbb, 0x14, 0xc0, 0x08, 0xf9, 0x11, 0xc0, 0x09, 0x13, 0x17, + 0xc0, 0x09, 0x5f, 0x03, 0xc0, 0x09, 0x71, 0xc2, 0xe6, 0x9f, 0x0f, 0xa6, + 0x01, 0xcf, 0x60, 0xb7, 0x0f, 0xcf, 0x60, 0x07, 0xc0, 0x09, 0x96, 0x0b, + 0xc0, 0x09, 0xd2, 0x11, 0xc0, 0x0a, 0x02, 0x03, 0xc0, 0x0a, 0x44, 0x17, + 0xc0, 0x0a, 0x6c, 0xc9, 0xa9, 0x51, 0x0f, 0xcc, 0x78, 0x03, 0xc0, 0x0a, + 0x94, 0x07, 
0xc0, 0x0a, 0xa6, 0x0b, 0xc0, 0x0a, 0xbc, 0x11, 0xc0, 0x0a, + 0xe4, 0x42, 0x03, 0x66, 0x40, 0x0a, 0xee, 0x03, 0xc0, 0x0a, 0xfa, 0x02, + 0xc0, 0x0b, 0x34, 0x17, 0xc0, 0x0b, 0x40, 0x0a, 0xc0, 0x0b, 0x56, 0x11, + 0xc0, 0x0b, 0x72, 0x14, 0xc0, 0x0b, 0x9e, 0x07, 0xc0, 0x0b, 0xae, 0x0b, + 0xc0, 0x0b, 0xcc, 0x19, 0x40, 0x0c, 0x04, 0x14, 0xc0, 0x0c, 0x14, 0xc2, + 0x24, 0xe2, 0x0f, 0xd4, 0x99, 0x06, 0xc0, 0x0c, 0x36, 0x0e, 0xc0, 0x0c, + 0x58, 0x17, 0xc0, 0x0c, 0x80, 0xc7, 0x2e, 0x21, 0x01, 0x38, 0x43, 0x00, + 0x0c, 0x92, 0x10, 0xc0, 0x0c, 0x96, 0x15, 0xc0, 0x0c, 0xb9, 0x16, 0xc0, + 0x0c, 0xcd, 0xc7, 0xc0, 0xa5, 0x01, 0x32, 0x91, 0x44, 0xdf, 0xff, 0xc0, + 0x0c, 0xd9, 0x05, 0xc0, 0x0c, 0xfb, 0x12, 0xc0, 0x0d, 0x19, 0xcb, 0x91, + 0xe6, 0x01, 0x0a, 0x69, 0x18, 0xc0, 0x0d, 0x27, 0x0f, 0xc0, 0x0d, 0x33, + 0xcb, 0x90, 0xff, 0x00, 0x30, 0x59, 0x07, 0xc0, 0x0d, 0x49, 0xc5, 0xd8, + 0x44, 0x0f, 0xcf, 0x70, 0x11, 0xc0, 0x0d, 0x55, 0x0e, 0xc0, 0x0d, 0x95, + 0x03, 0xc0, 0x0d, 0xa3, 0x0b, 0xc0, 0x0d, 0xd5, 0x07, 0xc0, 0x0e, 0x01, + 0x17, 0xc0, 0x0e, 0x2a, 0x14, 0xc0, 0x0e, 0x65, 0x1b, 0xc0, 0x0e, 0x75, + 0x49, 0xb4, 0xc7, 0x40, 0x0e, 0x81, 0x11, 0xc0, 0x0e, 0xaf, 0x07, 0xc0, + 0x0e, 0xed, 0x0b, 0xc0, 0x0f, 0x22, 0x1b, 0xc0, 0x0f, 0x5b, 0x03, 0xc0, + 0x0f, 0x6d, 0xcd, 0x7f, 0x73, 0x01, 0x08, 0xa1, 0xc4, 0x0f, 0x0c, 0x0f, + 0xcc, 0xc9, 0x17, 0x40, 0x0f, 0x9a, 0x12, 0xc0, 0x0f, 0xa6, 0x10, 0xc0, + 0x0f, 0xc2, 0xc7, 0x57, 0x8b, 0x01, 0x30, 0x13, 0x00, 0x0f, 0xdc, 0xc5, + 0x19, 0xdd, 0x01, 0x32, 0x29, 0x48, 0xbe, 0x5a, 0x40, 0x0f, 0xe0, 0x07, + 0xc0, 0x0f, 0xec, 0x11, 0xc0, 0x10, 0x10, 0x03, 0xc0, 0x10, 0x3e, 0x0b, + 0xc0, 0x10, 0x68, 0x1b, 0xc0, 0x10, 0x92, 0xcb, 0x96, 0x3d, 0x01, 0x05, + 0xa1, 0x17, 0x40, 0x10, 0xa2, 0x10, 0xc0, 0x10, 0xb8, 0x42, 0x00, 0x06, + 0xc0, 0x10, 0xe4, 0x43, 0x00, 0x89, 0xc0, 0x10, 0xf0, 0x0f, 0xc0, 0x11, + 0x00, 0xce, 0x72, 0xc6, 0x0f, 0x9f, 0x71, 0xd3, 0x42, 0xc7, 0x0f, 0xc8, + 0xf8, 0x11, 0xc0, 0x11, 0x10, 0x0a, 0xc0, 0x11, 0x2a, 0x0b, 0xc0, 0x11, + 0x3f, 0x03, 0xc0, 0x11, 0x5b, 0x07, 0xc0, 0x11, 0x7d, 0x14, 0x40, 0x11, + 0x91, 0x0e, 0xc0, 0x11, 0xa1, 0x11, 0xc0, 0x11, 0xba, 0x03, 0xc0, 0x11, + 0xe4, 0x14, 0xc0, 0x12, 0x0a, 0x17, 0xc0, 0x12, 0x1c, 0x07, 0xc0, 0x12, + 0x32, 0x0b, 0x40, 0x12, 0x46, 0x0b, 0xc0, 0x12, 0x6a, 0x07, 0xc0, 0x12, + 0x8b, 0x11, 0xc0, 0x12, 0xbd, 0x03, 0xc0, 0x12, 0xec, 0x17, 0xc0, 0x13, + 0x2d, 0x43, 0x15, 0xe9, 0xc0, 0x13, 0x3d, 0x47, 0xca, 0x45, 0x40, 0x13, + 0x47, 0x10, 0xc0, 0x13, 0x6b, 0x07, 0xc0, 0x13, 0x77, 0x03, 0xc0, 0x13, + 0x84, 0x0a, 0xc0, 0x13, 0xa0, 0x0b, 0xc0, 0x13, 0xbe, 0x11, 0xc0, 0x13, + 0xdf, 0xc5, 0xd4, 0x02, 0x01, 0x5f, 0x18, 0x07, 0xc0, 0x13, 0xeb, 0x03, + 0xc0, 0x14, 0x20, 0x11, 0xc0, 0x14, 0x4f, 0x56, 0x30, 0x4e, 0xc0, 0x14, + 0x74, 0x17, 0xc0, 0x14, 0x8e, 0x45, 0x60, 0x4f, 0xc0, 0x14, 0xa4, 0x43, + 0xc2, 0x7e, 0xc0, 0x14, 0xd3, 0x0b, 0x40, 0x14, 0xf9, 0x47, 0xc0, 0xb3, + 0xc0, 0x15, 0x05, 0xd3, 0x46, 0x6a, 0x01, 0x19, 0x39, 0xc2, 0x00, 0xbf, + 0x01, 0x15, 0xd9, 0xc4, 0xe4, 0x5b, 0x0f, 0xd3, 0xd8, 0x0f, 0xc0, 0x15, + 0x11, 0x03, 0xc0, 0x15, 0x1f, 0x09, 0xc0, 0x15, 0x32, 0x1a, 0xc0, 0x15, + 0x3c, 0x48, 0xbd, 0x72, 0xc0, 0x15, 0x4a, 0x0e, 0xc0, 0x15, 0x7c, 0x44, + 0x00, 0x2d, 0xc0, 0x15, 0x90, 0x10, 0xc0, 0x15, 0x9a, 0xcb, 0x8f, 0xcb, + 0x01, 0x1e, 0x79, 0x14, 0xc0, 0x15, 0xb9, 0x42, 0x00, 0xe3, 0xc0, 0x15, + 0xcb, 0x15, 0xc0, 0x15, 0xd5, 0x17, 0xc0, 0x15, 0xe1, 0xcc, 0x81, 0xbd, + 0x0f, 0xa7, 0x39, 0xcd, 0x76, 0x5c, 0x0f, 0x99, 0x91, 0xc2, 0x0c, 0x43, + 0x0f, 0xa2, 0x0b, 0x00, 0x15, 0xed, 0xd0, 0x57, 0xb2, 0x01, 0x70, 0x70, + 0x17, 0xc0, 
0x15, 0xf7, 0x11, 0xc0, 0x16, 0x13, 0x14, 0xc0, 0x16, 0x2f, + 0x07, 0xc0, 0x16, 0x3f, 0x0b, 0xc0, 0x16, 0x62, 0xc4, 0xe0, 0x07, 0x0f, + 0xa3, 0xd9, 0x03, 0xc0, 0x16, 0x6c, 0x0e, 0x40, 0x16, 0x78, 0xc5, 0xc8, + 0x6f, 0x0f, 0xcd, 0x51, 0x14, 0xc0, 0x16, 0x86, 0x42, 0x02, 0x10, 0xc0, + 0x16, 0xa2, 0xc2, 0x09, 0x66, 0x0f, 0xcc, 0x49, 0xc7, 0xc7, 0xf9, 0x0f, + 0xb7, 0x11, 0x10, 0xc0, 0x16, 0xae, 0x12, 0xc0, 0x16, 0xc4, 0x0e, 0xc0, + 0x16, 0xda, 0x17, 0xc0, 0x16, 0xea, 0x05, 0xc0, 0x16, 0xf4, 0x04, 0xc0, + 0x16, 0xfe, 0xc7, 0xb5, 0x83, 0x01, 0x09, 0x31, 0x43, 0x00, 0x5f, 0xc0, + 0x17, 0x10, 0x09, 0xc0, 0x17, 0x1a, 0xc8, 0xad, 0x5d, 0x0f, 0xaa, 0x49, + 0xce, 0x71, 0x76, 0x0f, 0x9f, 0x11, 0xc3, 0x02, 0x3b, 0x0f, 0x9b, 0x11, + 0x9a, 0x0f, 0xa0, 0x11, 0x15, 0xc0, 0x17, 0x26, 0xcb, 0x8a, 0xd6, 0x0f, + 0xa2, 0x60, 0xd0, 0x5c, 0x12, 0x0f, 0xc8, 0x81, 0x48, 0xb8, 0x6a, 0xc0, + 0x17, 0x32, 0x50, 0x58, 0x72, 0xc0, 0x17, 0x44, 0x4a, 0x17, 0xa1, 0xc0, + 0x17, 0x6c, 0x07, 0xc0, 0x17, 0x8c, 0xc5, 0xdc, 0x1d, 0x0f, 0xce, 0xf8, + 0x03, 0xc0, 0x17, 0x9e, 0x17, 0xc0, 0x17, 0xb4, 0x11, 0xc0, 0x17, 0xc6, + 0xc4, 0xe2, 0x9b, 0x0f, 0xa2, 0xb1, 0xd2, 0x4d, 0x45, 0x0f, 0xcf, 0x48, + 0xc6, 0xd1, 0x75, 0x01, 0x35, 0xd9, 0x03, 0xc0, 0x17, 0xd2, 0x46, 0x2c, + 0xb4, 0xc0, 0x17, 0xe4, 0xcc, 0x01, 0xbb, 0x00, 0x01, 0x10, 0x0b, 0xc0, + 0x17, 0xee, 0x07, 0xc0, 0x17, 0xf8, 0xcb, 0x94, 0xa6, 0x0f, 0xcb, 0x89, + 0xc4, 0xe4, 0x2f, 0x0f, 0xd4, 0x00, 0x10, 0xc0, 0x18, 0x0a, 0xc4, 0x26, + 0xba, 0x01, 0x37, 0x59, 0x14, 0xc0, 0x18, 0x26, 0x12, 0xc0, 0x18, 0x48, + 0x06, 0xc0, 0x18, 0x54, 0x17, 0xc0, 0x18, 0x60, 0x0f, 0xc0, 0x18, 0x6c, + 0x0e, 0xc0, 0x18, 0x7b, 0xc4, 0xc7, 0xcb, 0x0f, 0x99, 0xa9, 0x96, 0x0f, + 0xa0, 0x42, 0x00, 0x18, 0x87, 0x58, 0x25, 0x43, 0xc0, 0x18, 0x90, 0x48, + 0x91, 0xff, 0xc0, 0x18, 0x9a, 0x47, 0x08, 0x5b, 0x40, 0x18, 0xe8, 0x07, + 0xc0, 0x19, 0x22, 0x03, 0xc0, 0x19, 0x3c, 0xc4, 0xcc, 0x07, 0x01, 0x37, + 0x51, 0x0b, 0xc0, 0x19, 0x50, 0x11, 0xc0, 0x19, 0x71, 0xcc, 0x85, 0x11, + 0x0f, 0x9c, 0x20, 0x17, 0xc0, 0x19, 0x83, 0xc2, 0x00, 0x03, 0x0f, 0xcc, + 0x01, 0x1b, 0xc0, 0x19, 0x8f, 0x11, 0xc0, 0x19, 0x9b, 0x07, 0xc0, 0x19, + 0xb3, 0xc5, 0x72, 0xa4, 0x0f, 0xcc, 0xba, 0x00, 0x19, 0xbf, 0x05, 0xc0, + 0x19, 0xc5, 0x0f, 0xc0, 0x19, 0xcf, 0x17, 0xc0, 0x19, 0xe3, 0xc4, 0xe0, + 0x13, 0x01, 0x35, 0x81, 0x10, 0xc0, 0x19, 0xf5, 0x14, 0xc0, 0x1a, 0x1b, + 0x0e, 0xc0, 0x1a, 0x2d, 0x42, 0x01, 0x25, 0xc0, 0x1a, 0x3c, 0x99, 0x0f, + 0xa0, 0x23, 0x00, 0x1a, 0x46, 0x12, 0xc0, 0x1a, 0x4c, 0xc2, 0x00, 0xfe, + 0x0f, 0xcf, 0x29, 0xc2, 0x00, 0x74, 0x0f, 0xd4, 0xc8, 0x0b, 0xc0, 0x1a, + 0x56, 0x11, 0xc0, 0x1a, 0x62, 0xd1, 0x50, 0xac, 0x01, 0x1c, 0xd1, 0x03, + 0x40, 0x1a, 0x7d, 0x42, 0x02, 0xd3, 0xc0, 0x1a, 0x8f, 0xc7, 0xc8, 0xc4, + 0x0f, 0x9e, 0xcb, 0x00, 0x1a, 0x99, 0xc4, 0x78, 0xfe, 0x0f, 0x9d, 0x30, + 0x42, 0x00, 0x15, 0xc0, 0x1a, 0x9f, 0x48, 0xb9, 0x0a, 0xc0, 0x1a, 0xab, + 0x14, 0xc0, 0x1a, 0xbd, 0x12, 0xc0, 0x1a, 0xcb, 0xc7, 0xb3, 0x73, 0x01, + 0x10, 0xd9, 0xc6, 0xcc, 0x53, 0x0f, 0xca, 0x91, 0xc9, 0xab, 0x52, 0x0f, + 0xcb, 0x48, 0xca, 0xa3, 0x8c, 0x0f, 0xaa, 0x41, 0xc3, 0x20, 0xac, 0x01, + 0x35, 0x99, 0x42, 0x00, 0x84, 0xc0, 0x1a, 0xdb, 0x42, 0x01, 0xdd, 0x40, + 0x1a, 0xe7, 0x42, 0x05, 0xc0, 0xc0, 0x1a, 0xf3, 0xca, 0xa5, 0x62, 0x01, + 0x19, 0x69, 0x47, 0xba, 0x9b, 0xc0, 0x1a, 0xff, 0xc5, 0xdd, 0xfd, 0x0f, + 0x98, 0x00, 0x42, 0x00, 0x30, 0xc0, 0x1b, 0x23, 0xc5, 0x65, 0x68, 0x01, + 0x18, 0x9b, 0x00, 0x1b, 0x2f, 0xcb, 0x91, 0x20, 0x0f, 0xd5, 0x09, 0x03, + 0xc0, 0x1b, 0x35, 0x15, 0xc0, 0x1b, 0x3d, 0x42, 0x02, 0x2f, 0xc0, 0x1b, + 0x49, 0xc5, 
0xc5, 0x38, 0x01, 0x35, 0xc9, 0x05, 0xc0, 0x1b, 0x59, 0x14, + 0xc0, 0x1b, 0x63, 0x07, 0xc0, 0x1b, 0x6f, 0xc3, 0x92, 0x91, 0x01, 0x5f, + 0x91, 0xce, 0x6b, 0xaa, 0x01, 0x5f, 0xd9, 0xc4, 0xe0, 0xff, 0x0f, 0xc9, + 0x98, 0x10, 0xc0, 0x1b, 0x7b, 0x42, 0x00, 0xbc, 0xc0, 0x1b, 0x8d, 0x1a, + 0xc0, 0x1b, 0x99, 0x06, 0xc0, 0x1b, 0xab, 0xd1, 0x51, 0xde, 0x0f, 0xaf, + 0xf1, 0x46, 0xc7, 0x36, 0x40, 0x1b, 0xb7, 0x07, 0xc0, 0x1b, 0xc9, 0x03, + 0xc0, 0x1b, 0xdb, 0x14, 0xc0, 0x1b, 0xfb, 0x11, 0xc0, 0x1c, 0x09, 0x17, + 0xc0, 0x1c, 0x15, 0xca, 0xa1, 0x0c, 0x0f, 0xde, 0x2a, 0x00, 0x1c, 0x27, + 0x0e, 0xc0, 0x1c, 0x2b, 0x42, 0x00, 0x33, 0xc0, 0x1c, 0x35, 0x10, 0xc0, + 0x1c, 0x41, 0xc6, 0xd0, 0x7f, 0x01, 0x37, 0xa9, 0xc9, 0xb1, 0xe5, 0x01, + 0x32, 0x81, 0x16, 0xc0, 0x1c, 0x4d, 0x48, 0x69, 0x46, 0xc0, 0x1c, 0x5c, + 0xc7, 0xc6, 0x01, 0x0f, 0x9d, 0xb9, 0xd1, 0x50, 0xdf, 0x0f, 0x9b, 0xb1, + 0xc2, 0x00, 0x2c, 0x0f, 0xcb, 0xd9, 0x45, 0x73, 0xa7, 0x40, 0x1c, 0x78, + 0x17, 0xc0, 0x1c, 0x84, 0x0b, 0xc0, 0x1c, 0x93, 0xc8, 0xbc, 0xe2, 0x0f, + 0xb7, 0xc8, 0x11, 0xc0, 0x1c, 0x9f, 0x07, 0xc0, 0x1c, 0xa7, 0x0b, 0xc0, + 0x1c, 0xb7, 0x03, 0x40, 0x1c, 0xc3, 0x14, 0xc0, 0x1c, 0xcf, 0x03, 0xc0, + 0x1c, 0xdb, 0x11, 0xc0, 0x1c, 0xf5, 0x0b, 0xc0, 0x1d, 0x19, 0xcd, 0x7f, + 0xa7, 0x01, 0x4f, 0x11, 0xc3, 0x2d, 0xa5, 0x0f, 0xa0, 0x88, 0x11, 0xc0, + 0x1d, 0x2f, 0x03, 0xc0, 0x1d, 0x3b, 0x14, 0xc0, 0x1d, 0x47, 0xc4, 0xdc, + 0xf0, 0x0f, 0x9f, 0x5a, 0x00, 0x1d, 0x5d, 0xcb, 0x90, 0x18, 0x0f, 0xc9, + 0x39, 0x42, 0x00, 0x27, 0xc0, 0x1d, 0x63, 0x03, 0x40, 0x1d, 0x7e, 0x17, + 0xc0, 0x1d, 0x8a, 0x43, 0x1c, 0x85, 0xc0, 0x1d, 0x96, 0xde, 0x0f, 0x40, + 0x0f, 0xa8, 0xe1, 0x46, 0xcf, 0xdd, 0xc0, 0x1d, 0xa8, 0x05, 0xc0, 0x1d, + 0xdf, 0x42, 0x00, 0x4b, 0xc0, 0x1d, 0xeb, 0xc6, 0x55, 0xf6, 0x01, 0x06, + 0x01, 0x4b, 0x9a, 0x47, 0xc0, 0x1d, 0xfb, 0x46, 0xc9, 0x58, 0x40, 0x1e, + 0x07, 0x03, 0xc0, 0x1e, 0x25, 0xc2, 0x02, 0xfb, 0x0f, 0xcc, 0x88, 0x0f, + 0xc0, 0x1e, 0x31, 0x10, 0xc0, 0x1e, 0x3d, 0x42, 0x00, 0x2c, 0xc0, 0x1e, + 0x49, 0x4b, 0x90, 0xb2, 0x40, 0x1e, 0x55, 0x07, 0xc0, 0x1e, 0x6d, 0x03, + 0xc0, 0x1e, 0x7d, 0xcd, 0x77, 0x7a, 0x01, 0x11, 0x13, 0x00, 0x1e, 0x8f, + 0x0b, 0xc0, 0x1e, 0x95, 0xd4, 0x3e, 0x58, 0x0f, 0xa5, 0x31, 0x11, 0x40, + 0x1e, 0xa4, 0x43, 0x00, 0x67, 0xc0, 0x1e, 0xba, 0x90, 0x01, 0x30, 0x4b, + 0x00, 0x1e, 0xca, 0x48, 0xb9, 0xa2, 0xc0, 0x1e, 0xe9, 0xc6, 0xb7, 0x74, + 0x01, 0x13, 0xdb, 0x00, 0x1e, 0xfb, 0x42, 0x0e, 0xa6, 0xc0, 0x1e, 0xff, + 0x42, 0x15, 0x13, 0xc0, 0x1f, 0x11, 0x15, 0x40, 0x1f, 0x1d, 0x0b, 0xc0, + 0x1f, 0x29, 0x03, 0xc0, 0x1f, 0x33, 0xcc, 0x71, 0x94, 0x0f, 0xb5, 0x60, + 0xc8, 0xb9, 0x52, 0x01, 0x02, 0x99, 0x03, 0xc0, 0x1f, 0x3f, 0xc5, 0xd4, + 0x2a, 0x0f, 0x9e, 0x50, 0x0b, 0xc0, 0x1f, 0x49, 0x11, 0xc0, 0x1f, 0x59, + 0x07, 0xc0, 0x1f, 0x75, 0xca, 0x9b, 0xbc, 0x0f, 0xa7, 0xf8, 0x03, 0xc0, + 0x1f, 0x94, 0x17, 0x40, 0x1f, 0xa5, 0x10, 0xc0, 0x1f, 0xb8, 0xc2, 0x00, + 0x3b, 0x01, 0x36, 0x7b, 0x00, 0x1f, 0xce, 0x15, 0xc0, 0x1f, 0xd4, 0xc7, + 0xc7, 0xba, 0x01, 0x16, 0xa3, 0x00, 0x1f, 0xe0, 0x0e, 0xc0, 0x1f, 0xe6, + 0x89, 0x0f, 0xa0, 0xb3, 0x00, 0x1f, 0xf6, 0x87, 0x0f, 0xcb, 0x38, 0x42, + 0x00, 0xcc, 0xc0, 0x1f, 0xfa, 0x09, 0xc0, 0x20, 0x0a, 0x14, 0xc0, 0x20, + 0x17, 0x4a, 0xa6, 0x5c, 0xc0, 0x20, 0x2b, 0x0e, 0xc0, 0x20, 0x50, 0x4b, + 0x8e, 0x55, 0xc0, 0x20, 0x5a, 0xc5, 0xdd, 0xda, 0x0f, 0xa7, 0x31, 0xc7, + 0x7b, 0xdd, 0x0f, 0xa6, 0x71, 0xc8, 0xb9, 0xba, 0x0f, 0xa1, 0xf1, 0x10, + 0x40, 0x20, 0x7c, 0x16, 0xc0, 0x20, 0x88, 0x17, 0xc0, 0x20, 0x98, 0x44, + 0x00, 0x28, 0xc0, 0x20, 0xb6, 0x15, 0xc0, 0x20, 0xc0, 0x12, 0xc0, 0x20, + 0xd0, 0xcf, 
0x66, 0xfc, 0x0f, 0xad, 0x49, 0xcd, 0x79, 0xf7, 0x0f, 0xa7, + 0xf1, 0x45, 0x9f, 0x92, 0xc0, 0x20, 0xdc, 0xc4, 0xe4, 0x23, 0x0f, 0xa1, + 0x48, 0x14, 0xc0, 0x20, 0xeb, 0x10, 0xc0, 0x21, 0x0e, 0x03, 0xc0, 0x21, + 0x2c, 0x15, 0xc0, 0x21, 0x3a, 0xc8, 0xa2, 0x57, 0x0f, 0xb5, 0xb1, 0xc8, + 0xbe, 0x6a, 0x0f, 0xcf, 0x59, 0xcc, 0x8a, 0x75, 0x0f, 0xd6, 0x10, 0x44, + 0x05, 0x1e, 0xc0, 0x21, 0x46, 0xd8, 0x21, 0xcb, 0x0f, 0xa7, 0x11, 0xc5, + 0xc1, 0x02, 0x0f, 0xa6, 0x61, 0x14, 0xc0, 0x21, 0x52, 0xdc, 0x12, 0x71, + 0x0f, 0xb5, 0x70, 0x47, 0x34, 0x2f, 0xc0, 0x21, 0x5e, 0x4f, 0x63, 0x87, + 0xc0, 0x21, 0x71, 0xd3, 0x45, 0x86, 0x08, 0x5c, 0xd1, 0xcc, 0x45, 0x8d, + 0x08, 0x5c, 0xc9, 0x47, 0x02, 0x0e, 0x40, 0x21, 0x7d, 0x49, 0xae, 0x34, + 0xc0, 0x21, 0xd8, 0x11, 0xc0, 0x21, 0xe4, 0x03, 0x40, 0x21, 0xf0, 0x18, + 0xc0, 0x21, 0xfc, 0xc2, 0x00, 0x29, 0x0f, 0xcc, 0x61, 0x15, 0xc0, 0x22, + 0x08, 0x05, 0xc0, 0x22, 0x1a, 0x55, 0x38, 0x15, 0xc0, 0x22, 0x24, 0x0e, + 0xc0, 0x22, 0x3c, 0x45, 0x9e, 0xa0, 0xc0, 0x22, 0x4e, 0xce, 0x6b, 0xc6, + 0x0f, 0x9f, 0x61, 0xd5, 0x37, 0x82, 0x0f, 0x9e, 0xd1, 0xc9, 0xb3, 0xb9, + 0x0f, 0xce, 0x78, 0xc7, 0xc9, 0xc7, 0x0f, 0xd4, 0xa1, 0x44, 0xde, 0xdf, + 0xc0, 0x22, 0x60, 0x09, 0xc0, 0x22, 0x6c, 0x18, 0xc0, 0x22, 0x78, 0x46, + 0xce, 0x09, 0xc0, 0x22, 0x88, 0x15, 0xc0, 0x22, 0x94, 0x07, 0xc0, 0x22, + 0xa4, 0x45, 0x05, 0xbb, 0xc0, 0x22, 0xb0, 0xce, 0x74, 0x40, 0x01, 0x19, + 0x89, 0x03, 0xc0, 0x22, 0xbc, 0xd0, 0x5f, 0xe2, 0x01, 0x12, 0x79, 0xc8, + 0xb6, 0x42, 0x01, 0x80, 0x18, 0x11, 0xc0, 0x22, 0xc6, 0x03, 0xc0, 0x22, + 0xd6, 0xcd, 0x77, 0x39, 0x01, 0x36, 0xd1, 0xc3, 0x00, 0xcb, 0x0f, 0xa2, + 0xb9, 0xd2, 0x47, 0x4b, 0x0f, 0xca, 0x08, 0xc2, 0x00, 0x58, 0x0f, 0xcd, + 0x21, 0x42, 0x01, 0x48, 0xc0, 0x22, 0xeb, 0x4a, 0xa7, 0x42, 0xc0, 0x22, + 0xfb, 0x17, 0xc0, 0x23, 0x07, 0x16, 0xc0, 0x23, 0x13, 0x89, 0x0f, 0xa0, + 0xab, 0x00, 0x23, 0x1d, 0x47, 0x73, 0x7e, 0xc0, 0x23, 0x29, 0xc7, 0xae, + 0xcf, 0x01, 0x05, 0x59, 0xc6, 0xb9, 0xb4, 0x0f, 0xae, 0x73, 0x00, 0x23, + 0x4d, 0xcb, 0x95, 0x14, 0x0f, 0xaa, 0x51, 0x0e, 0xc0, 0x23, 0x53, 0xc2, + 0x00, 0xbf, 0x0f, 0xb5, 0x51, 0xd2, 0x49, 0x8b, 0x0f, 0xb5, 0x78, 0x47, + 0xc6, 0xe1, 0xc0, 0x23, 0x5f, 0xc6, 0xcb, 0xab, 0x0f, 0xca, 0xf9, 0xc2, + 0x00, 0x3b, 0x0f, 0xcc, 0x30, 0x42, 0x01, 0xe2, 0xc0, 0x23, 0x83, 0x44, + 0x39, 0x86, 0xc0, 0x23, 0x8d, 0xca, 0xa5, 0x44, 0x01, 0x09, 0xc1, 0xc4, + 0xce, 0x23, 0x01, 0x01, 0x03, 0x00, 0x23, 0x99, 0x10, 0xc0, 0x23, 0x9d, + 0xce, 0x61, 0x03, 0x00, 0x00, 0x80, 0x18, 0xc0, 0x23, 0xa9, 0x15, 0xc0, + 0x23, 0xb5, 0x05, 0xc0, 0x23, 0xc1, 0x45, 0x75, 0x61, 0xc0, 0x23, 0xd9, + 0xcc, 0x86, 0xd9, 0x01, 0x01, 0xd9, 0xcd, 0x7c, 0x74, 0x0f, 0x9c, 0xb9, + 0x42, 0x00, 0xa9, 0xc0, 0x23, 0xeb, 0x42, 0x04, 0x2b, 0xc0, 0x23, 0xf7, + 0x45, 0xdc, 0xc7, 0xc0, 0x24, 0x03, 0xcb, 0x4f, 0x1a, 0x0f, 0xb0, 0x61, + 0xd3, 0x1c, 0x59, 0x07, 0xff, 0xe8, 0x43, 0x00, 0x2e, 0xc0, 0x24, 0x19, + 0xc2, 0x00, 0x75, 0x0f, 0xa4, 0x6b, 0x00, 0x24, 0x2d, 0xc4, 0x7c, 0x7d, + 0x0f, 0x9c, 0x03, 0x00, 0x24, 0x3d, 0x43, 0x00, 0x89, 0xc0, 0x24, 0x43, + 0x57, 0x27, 0x2f, 0xc0, 0x24, 0x4f, 0xc7, 0x44, 0xfa, 0x07, 0xef, 0xe1, + 0xc3, 0x01, 0x09, 0x0f, 0xca, 0x30, 0xc2, 0x00, 0x3b, 0x0f, 0xd5, 0x43, + 0x00, 0x24, 0x5b, 0x42, 0x02, 0xa7, 0xc0, 0x24, 0x61, 0xc8, 0xb6, 0xba, + 0x0f, 0xc8, 0xb1, 0x43, 0x0d, 0x05, 0xc0, 0x24, 0x71, 0x46, 0x1c, 0xa1, + 0xc0, 0x24, 0x7b, 0x44, 0x12, 0xb8, 0xc0, 0x24, 0x99, 0xd2, 0x49, 0x1f, + 0x0f, 0x9b, 0x01, 0xc2, 0x00, 0x40, 0x0f, 0x99, 0xcb, 0x00, 0x24, 0xbf, + 0xc5, 0xde, 0x39, 0x0f, 0xa0, 0x99, 0xc5, 0xd9, 0x2a, 0x0f, 0xb5, 0x18, + 0xc3, 0xe5, 
0x57, 0x0f, 0xd4, 0x91, 0x0b, 0xc0, 0x24, 0xc5, 0x42, 0x01, + 0xdd, 0xc0, 0x24, 0xd8, 0x96, 0x0f, 0xa0, 0x03, 0x00, 0x24, 0xe5, 0x05, + 0xc0, 0x24, 0xeb, 0xc4, 0xb0, 0x4f, 0x0f, 0xa0, 0x3b, 0x00, 0x24, 0xf7, + 0x8f, 0x0f, 0xa0, 0x78, 0xc8, 0xbe, 0xb2, 0x01, 0x05, 0xe9, 0xc8, 0x76, + 0x54, 0x01, 0x05, 0x41, 0x43, 0x5d, 0xc0, 0xc0, 0x24, 0xfd, 0x10, 0xc0, + 0x25, 0x0f, 0xcc, 0x89, 0x49, 0x0f, 0x9e, 0x49, 0xca, 0xa7, 0xba, 0x01, + 0x4f, 0xa1, 0x5a, 0x19, 0xae, 0x40, 0x25, 0x19, 0x51, 0x50, 0x8a, 0xc0, + 0x25, 0x3d, 0x42, 0x02, 0x32, 0xc0, 0x25, 0x7c, 0xc5, 0xda, 0x74, 0x0f, + 0xce, 0xd8, 0x14, 0xc0, 0x25, 0x9a, 0xc3, 0x0e, 0x6a, 0x01, 0x35, 0xb1, + 0x44, 0x02, 0x27, 0xc0, 0x25, 0xac, 0xd5, 0x34, 0x10, 0x01, 0x51, 0x78, + 0x07, 0xc0, 0x25, 0xb8, 0xca, 0x89, 0x7b, 0x01, 0x38, 0x61, 0xc3, 0x14, + 0x45, 0x01, 0x32, 0x69, 0x43, 0x1c, 0x87, 0xc0, 0x25, 0xc4, 0xcc, 0x86, + 0x79, 0x0f, 0xa7, 0x99, 0xc4, 0x87, 0x8b, 0x0f, 0x9d, 0xd9, 0x47, 0xc1, + 0xe0, 0x40, 0x25, 0xce, 0x0e, 0xc0, 0x25, 0xda, 0xd0, 0x59, 0xb2, 0x0f, + 0xdd, 0xd8, 0x4d, 0x7b, 0x70, 0xc0, 0x25, 0xec, 0xc5, 0xdc, 0x63, 0x01, + 0x5f, 0x30, 0x09, 0xc0, 0x26, 0x06, 0xc2, 0x07, 0x49, 0x0f, 0xb4, 0xa9, + 0x49, 0xa7, 0x9d, 0xc0, 0x26, 0x16, 0x10, 0xc0, 0x26, 0x22, 0x0f, 0xc0, + 0x26, 0x2c, 0x43, 0x26, 0x1e, 0xc0, 0x26, 0x38, 0xc4, 0xde, 0xd3, 0x01, + 0x32, 0x49, 0x0d, 0xc0, 0x26, 0x44, 0x42, 0x02, 0x32, 0xc0, 0x26, 0x50, + 0xda, 0x1b, 0x82, 0x0f, 0x9e, 0x99, 0xc2, 0x00, 0x99, 0x0f, 0x99, 0x70, + 0xc3, 0xe5, 0x18, 0x0f, 0xcc, 0xb1, 0xc5, 0x46, 0xcd, 0x0f, 0xa2, 0xa8, + 0x14, 0xc0, 0x26, 0x62, 0xc9, 0xb2, 0x90, 0x01, 0x05, 0x71, 0xc3, 0x17, + 0x93, 0x0f, 0x99, 0xb9, 0xcb, 0x8e, 0xb8, 0x0f, 0xca, 0x18, 0x43, 0x02, + 0xdf, 0xc0, 0x26, 0x72, 0x0b, 0xc0, 0x26, 0x7a, 0x11, 0xc0, 0x26, 0x84, + 0x17, 0xc0, 0x26, 0x90, 0x42, 0x00, 0x29, 0xc0, 0x26, 0x9c, 0x03, 0x40, + 0x26, 0xa6, 0xc4, 0xbc, 0xf7, 0x0f, 0xb5, 0xe9, 0x42, 0x00, 0x7f, 0xc0, + 0x26, 0xb2, 0x16, 0xc0, 0x26, 0xe8, 0xc9, 0xac, 0x60, 0x0f, 0xaf, 0xe1, + 0x57, 0x29, 0x12, 0xc0, 0x26, 0xf4, 0xc4, 0x32, 0xd0, 0x0f, 0x9a, 0x29, + 0xc4, 0x5a, 0xfe, 0x0f, 0xa2, 0x29, 0x11, 0x40, 0x27, 0x00, 0x03, 0xc0, + 0x27, 0x0f, 0x0b, 0xc0, 0x27, 0x2c, 0x17, 0xc0, 0x27, 0x4a, 0x11, 0x40, + 0x27, 0x57, 0x4c, 0x89, 0xf1, 0xc0, 0x27, 0x64, 0x03, 0xc0, 0x27, 0xc4, + 0x0e, 0xc0, 0x27, 0xd4, 0x10, 0xc0, 0x27, 0xde, 0xc7, 0xc9, 0x81, 0x0f, + 0xcf, 0x51, 0xc8, 0xb9, 0x22, 0x0f, 0xcf, 0xc0, 0x09, 0xc0, 0x27, 0xee, + 0x42, 0x00, 0x4e, 0xc0, 0x27, 0xfd, 0xc3, 0x18, 0xb3, 0x00, 0x03, 0xf3, + 0x00, 0x28, 0x09, 0x14, 0xc0, 0x28, 0x0d, 0xc2, 0x16, 0x59, 0x01, 0x4f, + 0xf3, 0x00, 0x28, 0x1f, 0xc4, 0x00, 0x3b, 0x0f, 0x9d, 0x59, 0xcf, 0x65, + 0x3a, 0x01, 0x4e, 0xe9, 0x46, 0xce, 0x3f, 0xc0, 0x28, 0x25, 0x47, 0xc6, + 0x39, 0x40, 0x28, 0x54, 0xd7, 0x22, 0x44, 0x01, 0x39, 0xc9, 0x11, 0xc0, + 0x28, 0x6c, 0xd7, 0x27, 0x18, 0x0f, 0xa8, 0x00, 0x43, 0x01, 0xa4, 0xc0, + 0x28, 0x76, 0xc3, 0x91, 0xe8, 0x01, 0x32, 0x41, 0x85, 0x01, 0x18, 0x91, + 0x44, 0x02, 0x8b, 0xc0, 0x28, 0x82, 0x47, 0x2d, 0x4e, 0xc0, 0x28, 0x8c, + 0x42, 0x00, 0x43, 0x40, 0x28, 0xbc, 0xce, 0x75, 0x4a, 0x0f, 0xd3, 0xc9, + 0xc8, 0xbf, 0x9a, 0x01, 0x31, 0x61, 0xd6, 0x2f, 0x46, 0x01, 0x08, 0x09, + 0x0f, 0xc0, 0x28, 0xc8, 0xc3, 0x1f, 0x19, 0x0f, 0xce, 0x89, 0x44, 0x0d, + 0xff, 0x40, 0x28, 0xd4, 0x54, 0x3e, 0x94, 0xc0, 0x29, 0x06, 0x46, 0x0c, + 0x8e, 0xc0, 0x29, 0x6a, 0x07, 0xc0, 0x29, 0x76, 0xc9, 0xb3, 0x44, 0x01, + 0x1f, 0x81, 0x42, 0x00, 0xe6, 0xc0, 0x29, 0x88, 0x4b, 0x66, 0xd0, 0xc0, + 0x29, 0x94, 0xcb, 0x91, 0xaf, 0x0f, 0xa3, 0xf0, 0x42, 0x00, 0xf1, 0xc0, + 0x29, 0xa3, 
0xca, 0x9c, 0xca, 0x01, 0x05, 0x99, 0xc7, 0xc6, 0xb0, 0x0f, + 0x9a, 0x30, 0x00, 0x40, 0x29, 0xad, 0x43, 0x10, 0x73, 0xc0, 0x29, 0xb9, + 0x96, 0x0f, 0xa0, 0xe3, 0x00, 0x29, 0xc5, 0xca, 0xa4, 0xc2, 0x01, 0x3e, + 0x89, 0xc4, 0xca, 0xcf, 0x01, 0x34, 0x99, 0xc2, 0x06, 0x46, 0x01, 0x31, + 0x29, 0x09, 0x40, 0x29, 0xd1, 0x16, 0xc0, 0x29, 0xf2, 0x05, 0xc0, 0x2a, + 0x02, 0xc7, 0x5a, 0x55, 0x01, 0x15, 0x31, 0xd5, 0x2b, 0xc1, 0x01, 0x12, + 0x18, 0xc9, 0xad, 0x5c, 0x01, 0x34, 0xd9, 0xcb, 0x8f, 0x26, 0x0f, 0xa2, + 0xf8, 0x47, 0x02, 0x0e, 0xc0, 0x2a, 0x0e, 0x15, 0xc0, 0x2a, 0x55, 0x48, + 0xa3, 0x64, 0xc0, 0x2a, 0x61, 0x46, 0x09, 0x97, 0xc0, 0x2a, 0x6d, 0x4b, + 0x6f, 0xc7, 0xc0, 0x2a, 0x91, 0x56, 0x30, 0x90, 0x40, 0x2a, 0xae, 0xc8, + 0xbc, 0xb2, 0x01, 0x1f, 0x31, 0x42, 0x00, 0x99, 0xc0, 0x2a, 0xb8, 0x47, + 0xc2, 0xd5, 0xc0, 0x2a, 0xc4, 0xc9, 0x49, 0x4c, 0x00, 0x00, 0x31, 0x45, + 0x31, 0xf0, 0x40, 0x2a, 0xd0, 0x54, 0x3e, 0x80, 0xc0, 0x2a, 0xdc, 0x12, + 0xc0, 0x2b, 0x38, 0x11, 0x40, 0x2b, 0x44, 0x46, 0xd0, 0x6d, 0xc0, 0x2b, + 0x50, 0xc5, 0xdd, 0x8f, 0x0f, 0xca, 0x88, 0xcf, 0x65, 0xb2, 0x0f, 0x9e, + 0x41, 0xd7, 0x26, 0x49, 0x01, 0x51, 0xf9, 0x12, 0xc0, 0x2b, 0x5c, 0xc7, + 0xc5, 0x67, 0x0f, 0xb4, 0x88, 0xcc, 0x88, 0x35, 0x0f, 0xb5, 0x09, 0x45, + 0xd7, 0x72, 0x40, 0x2b, 0x68, 0x1a, 0xc0, 0x2b, 0x8a, 0x43, 0x1d, 0xbb, + 0xc0, 0x2b, 0x96, 0x42, 0x02, 0x10, 0xc0, 0x2b, 0xb2, 0x19, 0xc0, 0x2b, + 0xbe, 0x9b, 0x0f, 0xa3, 0x33, 0x00, 0x2b, 0xd1, 0x11, 0xc0, 0x2b, 0xd7, + 0xc2, 0x00, 0x50, 0x0f, 0xa5, 0x19, 0xc5, 0xdc, 0x8b, 0x0f, 0xa4, 0x83, + 0x00, 0x2b, 0xe4, 0xc2, 0x00, 0xb1, 0x0f, 0xa0, 0xb9, 0xc2, 0x02, 0x6f, + 0x0f, 0xcd, 0xa1, 0x47, 0xc9, 0xdc, 0x40, 0x2b, 0xea, 0x11, 0xc0, 0x2b, + 0xf6, 0x03, 0xc0, 0x2c, 0x08, 0x42, 0x0f, 0xe1, 0x40, 0x2c, 0x14, 0x10, + 0xc0, 0x2c, 0x1e, 0x0e, 0xc0, 0x2c, 0x31, 0x15, 0xc0, 0x2c, 0x3b, 0x06, + 0xc0, 0x2c, 0x50, 0xc2, 0x07, 0xb8, 0x0f, 0xa3, 0xb3, 0x00, 0x2c, 0x5c, + 0x44, 0x82, 0x11, 0xc0, 0x2c, 0x60, 0x05, 0xc0, 0x2c, 0x84, 0x96, 0x0f, + 0xcc, 0x3b, 0x00, 0x2c, 0x94, 0x14, 0xc0, 0x2c, 0xa7, 0x09, 0x40, 0x2c, + 0xb1, 0xc3, 0x18, 0x91, 0x0f, 0xcd, 0x61, 0xcc, 0x8a, 0x81, 0x01, 0x31, + 0x19, 0x16, 0xc0, 0x2c, 0xc3, 0xc4, 0x56, 0x1d, 0x0f, 0xa2, 0xc9, 0x42, + 0x02, 0xa7, 0xc0, 0x2c, 0xcf, 0x14, 0xc0, 0x2c, 0xdb, 0x42, 0x00, 0x76, + 0xc0, 0x2c, 0xe5, 0x44, 0x1f, 0x3c, 0x40, 0x2c, 0xf1, 0x03, 0xc0, 0x2c, + 0xfb, 0x10, 0xc0, 0x2d, 0x1d, 0xc2, 0x02, 0xa7, 0x0f, 0xa8, 0xa3, 0x00, + 0x2d, 0x30, 0x16, 0xc0, 0x2d, 0x3a, 0xc5, 0xdc, 0x95, 0x01, 0x11, 0xa9, + 0x07, 0xc0, 0x2d, 0x46, 0x86, 0x0f, 0xb6, 0x79, 0xca, 0x9e, 0x1e, 0x0f, + 0xce, 0x18, 0xc4, 0x02, 0x10, 0x0f, 0xce, 0x43, 0x00, 0x2d, 0x52, 0x95, + 0x0f, 0xb4, 0x63, 0x00, 0x2d, 0x58, 0x42, 0x02, 0xa7, 0xc0, 0x2d, 0x62, + 0x89, 0x0f, 0xa0, 0xdb, 0x00, 0x2d, 0x7a, 0x44, 0xdf, 0xb3, 0xc0, 0x2d, + 0x80, 0xd3, 0x46, 0x1e, 0x0f, 0x9e, 0xb9, 0x44, 0x6f, 0xbf, 0xc0, 0x2d, + 0x8c, 0xc4, 0x00, 0x3b, 0x0f, 0xd5, 0x19, 0xc5, 0xdc, 0x4f, 0x0f, 0x99, + 0x78, 0x0b, 0xc0, 0x2d, 0x96, 0x03, 0xc0, 0x2d, 0xa6, 0x11, 0xc0, 0x2d, + 0xb0, 0x07, 0x40, 0x2d, 0xc8, 0x57, 0x2a, 0x54, 0xc0, 0x2d, 0xd2, 0xcd, + 0x7c, 0xe9, 0x07, 0xf7, 0xf8, 0xd2, 0x4b, 0x4d, 0x08, 0xe3, 0x61, 0x47, + 0x34, 0x2f, 0xc0, 0x2e, 0x26, 0x06, 0xc0, 0x2e, 0x4a, 0x4b, 0x93, 0x30, + 0xc0, 0x2e, 0x5c, 0xce, 0x73, 0x1a, 0x08, 0xe2, 0x19, 0x45, 0x00, 0xba, + 0xc0, 0x2e, 0x64, 0x4b, 0x6f, 0xc7, 0xc0, 0x2e, 0x74, 0x47, 0x02, 0x0e, + 0x40, 0x2e, 0x94, 0x19, 0xc0, 0x2e, 0xfb, 0x43, 0x00, 0x75, 0xc0, 0x2f, + 0x05, 0xc5, 0x0a, 0xe2, 0x01, 0x2e, 0x53, 0x00, 0x2f, 0x15, 0x46, 0x19, + 0xbb, 0xc0, 
0x2f, 0x1b, 0xc2, 0x00, 0x3b, 0x0f, 0xa8, 0x93, 0x00, 0x2f, + 0x2d, 0x43, 0x00, 0xc7, 0xc0, 0x2f, 0x39, 0xc6, 0xcf, 0xbf, 0x0f, 0x9b, + 0x69, 0xd0, 0x5c, 0xb2, 0x0f, 0xb1, 0x69, 0xc5, 0xd5, 0x01, 0x0f, 0xcc, + 0xf1, 0x16, 0x40, 0x2f, 0x45, 0x42, 0x00, 0x4b, 0xc0, 0x2f, 0x51, 0x42, + 0x0f, 0x9b, 0xc0, 0x2f, 0x5f, 0x91, 0x01, 0x32, 0x63, 0x00, 0x2f, 0x6b, + 0x48, 0x00, 0xcc, 0xc0, 0x2f, 0x71, 0x45, 0xd4, 0x43, 0xc0, 0x2f, 0x9a, + 0xc4, 0xe2, 0xa3, 0x0f, 0xa6, 0x91, 0xca, 0x9a, 0xae, 0x0f, 0x9c, 0xd1, + 0xc3, 0x13, 0x35, 0x0f, 0x9a, 0x59, 0x89, 0x0f, 0xcd, 0xa8, 0xc7, 0xca, + 0x3e, 0x0f, 0xcc, 0x09, 0x09, 0xc0, 0x2f, 0xbc, 0x43, 0x1b, 0x67, 0xc0, + 0x2f, 0xc8, 0xc3, 0x00, 0x38, 0x01, 0x32, 0x71, 0xd1, 0x52, 0xee, 0x01, + 0x05, 0xb1, 0xc7, 0x77, 0xc1, 0x01, 0x05, 0x21, 0x10, 0xc0, 0x2f, 0xd4, + 0x0f, 0xc0, 0x2f, 0xdc, 0xc2, 0x10, 0x3f, 0x0f, 0xaf, 0x13, 0x00, 0x2f, + 0xe8, 0xc4, 0x8a, 0x84, 0x0f, 0xcc, 0x70, 0xc8, 0x21, 0xfb, 0x0f, 0xc9, + 0x29, 0x45, 0x5b, 0x53, 0xc0, 0x2f, 0xee, 0x4c, 0x8c, 0x61, 0x40, 0x2f, + 0xfa, 0x14, 0xc0, 0x30, 0x63, 0x44, 0x0b, 0x13, 0xc0, 0x30, 0x6f, 0xca, + 0xa4, 0x54, 0x70, 0x00, 0x09, 0xcf, 0x68, 0xfa, 0x01, 0x31, 0xf3, 0x00, + 0x30, 0x83, 0x04, 0xc0, 0x30, 0x87, 0x06, 0xc0, 0x30, 0x93, 0xd5, 0x34, + 0x4f, 0x0f, 0xca, 0x69, 0x42, 0x01, 0x7c, 0x40, 0x30, 0x9f, 0xc5, 0xcf, + 0x36, 0x0f, 0xcf, 0x99, 0xc3, 0x0c, 0xa5, 0x0f, 0xd6, 0x08, 0x44, 0x00, + 0x67, 0xc0, 0x30, 0xd9, 0x46, 0x01, 0x4a, 0xc0, 0x31, 0x0d, 0x4a, 0x01, + 0xa9, 0xc0, 0x31, 0x4b, 0xce, 0x72, 0xb8, 0x0f, 0xb2, 0x19, 0x00, 0x40, + 0x31, 0x69, 0x0b, 0xc0, 0x31, 0x90, 0xda, 0x1c, 0x6c, 0x01, 0x35, 0x79, + 0x06, 0xc0, 0x31, 0xa9, 0xcb, 0x96, 0x1c, 0x0f, 0xb0, 0x91, 0xce, 0x6e, + 0xc8, 0x01, 0x5e, 0x88, 0x00, 0x40, 0x31, 0xb5, 0x47, 0x02, 0x0e, 0xc0, + 0x31, 0xc1, 0xcc, 0x1d, 0xc7, 0x08, 0x1c, 0xf8, 0x03, 0xc0, 0x32, 0x24, + 0x0e, 0xc0, 0x32, 0x32, 0x50, 0x5b, 0xb2, 0xc0, 0x32, 0x42, 0x14, 0xc0, + 0x32, 0x84, 0x45, 0xd4, 0x0c, 0xc0, 0x32, 0x8e, 0xc6, 0xcb, 0x57, 0x0f, + 0xcc, 0xa1, 0x4b, 0x8d, 0x8f, 0x40, 0x32, 0xa8, 0x14, 0xc0, 0x33, 0x00, + 0x16, 0xc0, 0x33, 0x0f, 0x17, 0xc0, 0x33, 0x19, 0xc8, 0x6b, 0xf0, 0x01, + 0x11, 0xd9, 0x0e, 0xc0, 0x33, 0x2b, 0xc3, 0x6b, 0x12, 0x0f, 0xa9, 0x51, + 0xc6, 0xd1, 0x6f, 0x0f, 0x9f, 0x29, 0x43, 0x6e, 0xfe, 0xc0, 0x33, 0x38, + 0xc2, 0x01, 0x25, 0x0f, 0xd4, 0xe8, 0x0f, 0xc0, 0x33, 0x44, 0x10, 0xc0, + 0x33, 0x57, 0x42, 0x01, 0x29, 0xc0, 0x33, 0x6b, 0xc7, 0xc4, 0xcd, 0x0f, + 0xad, 0xa1, 0x16, 0xc0, 0x33, 0x77, 0xdb, 0x18, 0x8a, 0x0f, 0xb2, 0x59, + 0xc3, 0x23, 0x1b, 0x01, 0x5f, 0x09, 0x48, 0xbc, 0x42, 0x40, 0x33, 0x83, + 0x42, 0x00, 0x09, 0xc0, 0x33, 0xbf, 0x47, 0x0d, 0xdb, 0xc0, 0x33, 0xc7, + 0xcb, 0x93, 0x46, 0x01, 0x37, 0x61, 0xc6, 0xcd, 0x5b, 0x0f, 0x99, 0xd1, + 0xca, 0xa4, 0x2c, 0x0f, 0xb6, 0xa9, 0xc9, 0xac, 0xf9, 0x0f, 0xcb, 0xf1, + 0xca, 0x9f, 0x40, 0x0f, 0xcc, 0xd8, 0xcf, 0x68, 0xdc, 0x01, 0x1c, 0x71, + 0x12, 0xc0, 0x33, 0xdf, 0xc4, 0xe0, 0x5b, 0x01, 0x5e, 0xd0, 0xd3, 0x40, + 0x67, 0x0f, 0xa5, 0x79, 0xc9, 0x8c, 0x04, 0x0f, 0xb1, 0x79, 0x96, 0x0f, + 0xb6, 0xb1, 0xca, 0x9e, 0xdc, 0x0f, 0xc8, 0xb8, 0x18, 0xc0, 0x33, 0xee, + 0x4f, 0x61, 0x20, 0xc0, 0x33, 0xfa, 0x42, 0x00, 0xac, 0xc0, 0x34, 0x0c, + 0x15, 0xc0, 0x34, 0x19, 0x08, 0xc0, 0x34, 0x25, 0x05, 0xc0, 0x34, 0x34, + 0x06, 0xc0, 0x34, 0x40, 0x46, 0xd2, 0x65, 0xc0, 0x34, 0x4d, 0xc8, 0xb6, + 0x1a, 0x0f, 0xa7, 0x28, 0x43, 0x01, 0xad, 0xc0, 0x34, 0x59, 0x49, 0x1c, + 0x89, 0x40, 0x34, 0x65, 0xc5, 0xdb, 0x41, 0x01, 0x37, 0xc1, 0xd5, 0x33, + 0xbc, 0x0f, 0x9e, 0x91, 0x05, 0x40, 0x34, 0xaf, 0xc6, 0x3c, 0x52, 0x01, + 0x15, 0xbb, 
0x00, 0x34, 0xbb, 0x92, 0x0f, 0xa3, 0xfa, 0x00, 0x34, 0xc1, + 0x14, 0xc0, 0x34, 0xc7, 0xc6, 0x08, 0xea, 0x01, 0x05, 0x49, 0x0f, 0xc0, + 0x34, 0xdd, 0xc7, 0xbf, 0xe8, 0x0f, 0xa1, 0xd1, 0xc2, 0x00, 0x6c, 0x0f, + 0xd5, 0xa8, 0x43, 0x01, 0xfe, 0xc0, 0x34, 0xec, 0xc3, 0x0e, 0x66, 0x0f, + 0xb6, 0xf3, 0x00, 0x34, 0xf6, 0xc3, 0x04, 0x85, 0x0f, 0xa0, 0x58, 0x4a, + 0x15, 0x7c, 0xc0, 0x35, 0x02, 0xcc, 0x87, 0xb1, 0x0f, 0xad, 0x71, 0x10, + 0xc0, 0x35, 0x26, 0xcb, 0x91, 0xd0, 0x0f, 0xca, 0x01, 0xd2, 0x47, 0x39, + 0x01, 0x71, 0xf0, 0x16, 0xc0, 0x35, 0x36, 0x10, 0xc0, 0x35, 0x42, 0x14, + 0xc0, 0x35, 0x4e, 0x18, 0xc0, 0x35, 0x5a, 0xc9, 0xac, 0x72, 0x0f, 0xae, + 0x89, 0x45, 0xd7, 0x90, 0xc0, 0x35, 0x6c, 0xc4, 0x7f, 0xa8, 0x0f, 0xce, + 0x38, 0x06, 0xc0, 0x35, 0x78, 0xcf, 0x68, 0xeb, 0x01, 0x33, 0x81, 0x0b, + 0xc0, 0x35, 0x84, 0x44, 0x14, 0x97, 0x40, 0x35, 0x90, 0xca, 0x93, 0xd6, + 0x01, 0x38, 0x69, 0x07, 0xc0, 0x35, 0x9c, 0xcd, 0x75, 0x72, 0x0f, 0x9c, + 0x08, 0x9b, 0x0f, 0xd5, 0x83, 0x00, 0x35, 0xae, 0x03, 0xc0, 0x35, 0xb4, + 0x11, 0xc0, 0x35, 0xc4, 0x07, 0xc0, 0x35, 0xd9, 0xca, 0xa0, 0xc6, 0x0f, + 0xb1, 0x98, 0xc6, 0xd1, 0x7b, 0x0f, 0xcc, 0x51, 0x17, 0xc0, 0x35, 0xe5, + 0x14, 0xc0, 0x35, 0xef, 0xc2, 0x01, 0xbb, 0x0f, 0xcd, 0xb3, 0x00, 0x36, + 0x0b, 0xc4, 0x18, 0xb3, 0x0f, 0xae, 0x01, 0x89, 0x0f, 0x99, 0x5b, 0x00, + 0x36, 0x11, 0xc4, 0xe3, 0xc3, 0x0f, 0xd6, 0xa8, 0x05, 0xc0, 0x36, 0x17, + 0x42, 0x01, 0x0c, 0xc0, 0x36, 0x29, 0x0e, 0xc0, 0x36, 0x35, 0xca, 0x9c, + 0x0c, 0x01, 0x31, 0x59, 0xce, 0x73, 0xd0, 0x0f, 0x9c, 0x29, 0xc3, 0xd3, + 0x0e, 0x0f, 0xce, 0xd1, 0xc4, 0xd2, 0xb5, 0x0f, 0xa3, 0x50, 0x07, 0xc0, + 0x36, 0x3f, 0x11, 0xc0, 0x36, 0x4b, 0x03, 0xc0, 0x36, 0x60, 0xca, 0x9f, + 0x54, 0x0f, 0x9b, 0x20, 0x42, 0x02, 0xa7, 0xc0, 0x36, 0x6c, 0xc7, 0xc0, + 0x20, 0x01, 0x37, 0xe9, 0x10, 0xc0, 0x36, 0x76, 0xc2, 0x00, 0x40, 0x01, + 0x1e, 0xd8, 0x42, 0x01, 0xa3, 0xc0, 0x36, 0x82, 0x0f, 0xc0, 0x36, 0x8c, + 0x03, 0xc0, 0x36, 0x98, 0xc4, 0xe3, 0x9b, 0x0f, 0xc9, 0xd0, 0x14, 0xc0, + 0x36, 0xa4, 0x15, 0xc0, 0x36, 0xb1, 0x47, 0xc0, 0x0b, 0xc0, 0x36, 0xbe, + 0x45, 0xd5, 0xd3, 0xc0, 0x36, 0xca, 0x0e, 0xc0, 0x36, 0xd6, 0xd9, 0x1e, + 0xe6, 0x0f, 0x9e, 0x89, 0xd2, 0x4b, 0xb9, 0x01, 0x50, 0x68, 0xc4, 0xde, + 0x8b, 0x0f, 0xd4, 0xf3, 0x00, 0x36, 0xe2, 0x0e, 0xc0, 0x36, 0xe8, 0x43, + 0x6c, 0xc3, 0xc0, 0x36, 0xfa, 0x42, 0x07, 0x2f, 0xc0, 0x37, 0x12, 0x06, + 0xc0, 0x37, 0x1a, 0x10, 0x40, 0x37, 0x26, 0x49, 0xb3, 0x68, 0xc0, 0x37, + 0x34, 0x06, 0xc0, 0x37, 0x40, 0x42, 0x01, 0x1b, 0xc0, 0x37, 0x4a, 0x10, + 0xc0, 0x37, 0x54, 0x14, 0xc0, 0x37, 0x66, 0x03, 0xc0, 0x37, 0x78, 0x4b, + 0x93, 0x72, 0xc0, 0x37, 0x84, 0xc2, 0x00, 0xa2, 0x0f, 0xa6, 0xe9, 0x0e, + 0xc0, 0x37, 0xa8, 0xcd, 0x78, 0x3d, 0x00, 0x04, 0xa8, 0x16, 0xc0, 0x37, + 0xb4, 0x17, 0xc0, 0x37, 0xc0, 0x10, 0xc0, 0x37, 0xd5, 0x06, 0xc0, 0x37, + 0xee, 0xc3, 0x87, 0x43, 0x0f, 0xaf, 0xf9, 0x11, 0xc0, 0x37, 0xfc, 0x43, + 0x0b, 0x09, 0xc0, 0x38, 0x08, 0xca, 0x46, 0x99, 0x0f, 0xa7, 0x8b, 0x00, + 0x38, 0x12, 0xca, 0xa0, 0xd0, 0x0f, 0x9d, 0x28, 0x16, 0xc0, 0x38, 0x16, + 0x4c, 0x86, 0xb5, 0xc0, 0x38, 0x22, 0x46, 0xce, 0x93, 0xc0, 0x38, 0x47, + 0x15, 0xc0, 0x38, 0x65, 0x14, 0xc0, 0x38, 0x7d, 0x0e, 0xc0, 0x38, 0x8f, + 0x12, 0xc0, 0x38, 0xa1, 0x90, 0x0f, 0xa3, 0x43, 0x00, 0x38, 0xad, 0x0a, + 0xc0, 0x38, 0xdb, 0xc6, 0xd1, 0x87, 0x0f, 0xae, 0xb1, 0xc4, 0x60, 0xb3, + 0x00, 0x05, 0x79, 0xc5, 0xdb, 0x28, 0x0f, 0xcd, 0x19, 0x09, 0x40, 0x38, + 0xe7, 0x15, 0xc0, 0x38, 0xf7, 0x42, 0x00, 0x72, 0xc0, 0x39, 0x03, 0x43, + 0x1c, 0xe7, 0x40, 0x39, 0x0d, 0x06, 0xc0, 0x39, 0x19, 0x47, 0x02, 0x0e, + 0x40, 0x39, 
0x2b, 0x15, 0xc0, 0x39, 0x8b, 0x0e, 0xc0, 0x39, 0x9d, 0x50, + 0x0f, 0x5e, 0xc0, 0x39, 0xa9, 0x16, 0xc0, 0x39, 0xb5, 0x4b, 0x6f, 0xc7, + 0xc0, 0x39, 0xc1, 0x4f, 0x30, 0x90, 0xc0, 0x3a, 0x02, 0x46, 0x09, 0x97, + 0x40, 0x3a, 0x0c, 0xc2, 0x01, 0xbb, 0x0f, 0xd5, 0x11, 0xcd, 0x7d, 0x37, + 0x0f, 0xce, 0x70, 0x9b, 0x0f, 0xa8, 0x8b, 0x00, 0x3a, 0x30, 0xc9, 0xa9, + 0xcf, 0x01, 0x09, 0x50, 0x46, 0x5c, 0x02, 0xc0, 0x3a, 0x3f, 0x45, 0xde, + 0x2a, 0xc0, 0x3a, 0x49, 0xc3, 0x4d, 0xd4, 0x0f, 0xaa, 0x59, 0x47, 0xc9, + 0xff, 0xc0, 0x3a, 0x72, 0x10, 0x40, 0x3a, 0x90, 0x52, 0x4c, 0xeb, 0xc0, + 0x3a, 0x9a, 0x48, 0xbb, 0xb2, 0xc0, 0x3a, 0xa6, 0x45, 0xdd, 0xf3, 0xc0, + 0x3a, 0xbe, 0x44, 0x2f, 0x1e, 0xc0, 0x3a, 0xde, 0x49, 0xb3, 0x4d, 0x40, + 0x3b, 0x00, 0xc6, 0x00, 0xf3, 0x01, 0x05, 0x69, 0xc2, 0x00, 0xcc, 0x0f, + 0xa4, 0x7b, 0x00, 0x3b, 0x28, 0xc4, 0x13, 0x35, 0x0f, 0xa2, 0xc1, 0xc7, + 0xc5, 0xe5, 0x0f, 0xca, 0xe9, 0xc2, 0x00, 0xac, 0x0f, 0xd4, 0x08, 0xc3, + 0x14, 0x6b, 0x0f, 0xa1, 0x41, 0xd4, 0x3d, 0xe0, 0x01, 0x93, 0xf8, 0x15, + 0xc0, 0x3b, 0x34, 0x42, 0x00, 0xa4, 0xc0, 0x3b, 0x3e, 0x19, 0xc0, 0x3b, + 0x4a, 0x43, 0x11, 0x7f, 0xc0, 0x3b, 0x60, 0xc5, 0xd8, 0x99, 0x01, 0x32, + 0x33, 0x00, 0x3b, 0x6c, 0x43, 0x5c, 0xeb, 0xc0, 0x3b, 0x72, 0x46, 0xd3, + 0x13, 0xc0, 0x3b, 0x7e, 0xc5, 0xde, 0x70, 0x0f, 0xa2, 0xa1, 0xc7, 0xc4, + 0xd4, 0x0f, 0xc8, 0x98, 0xcc, 0x86, 0x55, 0x0f, 0xc9, 0x11, 0xc2, 0x02, + 0x35, 0x01, 0x15, 0xe3, 0x00, 0x3b, 0x8e, 0x04, 0xc0, 0x3b, 0x94, 0x0b, + 0xc0, 0x3b, 0xa0, 0x47, 0x34, 0xa6, 0xc0, 0x3b, 0xac, 0xd3, 0x40, 0x7a, + 0x01, 0x01, 0x79, 0xc8, 0xba, 0x42, 0x0f, 0xa6, 0xd9, 0xca, 0xa4, 0x22, + 0x0f, 0xcf, 0xf8, 0x10, 0xc0, 0x3b, 0xb8, 0x94, 0x01, 0x15, 0xeb, 0x00, + 0x3b, 0xc2, 0x16, 0xc0, 0x3b, 0xd7, 0x00, 0xc0, 0x3b, 0xe8, 0x42, 0x02, + 0x2f, 0xc0, 0x3c, 0x0b, 0xc2, 0x00, 0x40, 0x0f, 0xa2, 0x19, 0xcc, 0x40, + 0x81, 0x00, 0x05, 0x00, 0xca, 0xa7, 0x06, 0x0f, 0x0a, 0x79, 0x0e, 0xc0, + 0x3c, 0x17, 0x46, 0x09, 0x97, 0xc0, 0x3c, 0x23, 0x15, 0xc0, 0x3c, 0x47, + 0x45, 0x28, 0xb1, 0x40, 0x3c, 0x53, 0x44, 0x75, 0x34, 0xc0, 0x3c, 0x6f, + 0x0f, 0xc0, 0x3c, 0x7b, 0xca, 0x9d, 0x92, 0x0f, 0xa9, 0x49, 0xc2, 0x02, + 0xa7, 0x00, 0x00, 0x00, 0xc5, 0x13, 0x84, 0x01, 0x16, 0x1b, 0x00, 0x3c, + 0x87, 0xcc, 0x06, 0xbb, 0x01, 0x16, 0x11, 0x48, 0x19, 0xb9, 0xc0, 0x3c, + 0x8d, 0x15, 0xc0, 0x3c, 0x99, 0x05, 0xc0, 0x3c, 0xa5, 0xc7, 0x05, 0xc0, + 0x01, 0x10, 0x79, 0xce, 0x72, 0xd4, 0x01, 0x50, 0x49, 0xd2, 0x48, 0x6b, + 0x01, 0x57, 0xf8, 0xca, 0xa0, 0x76, 0x00, 0x3f, 0xf9, 0x06, 0xc0, 0x3c, + 0xb1, 0x0e, 0xc0, 0x3c, 0xc3, 0xd0, 0x0f, 0x09, 0x00, 0x3f, 0xc9, 0x43, + 0x0a, 0x8a, 0xc0, 0x3c, 0xd5, 0x47, 0x10, 0x78, 0xc0, 0x3c, 0xe1, 0xd4, + 0x3d, 0x18, 0x00, 0x3f, 0xa0, 0xc3, 0x83, 0x55, 0x0f, 0xcb, 0xb9, 0xce, + 0x73, 0x8a, 0x0f, 0x98, 0x18, 0x46, 0x04, 0x8f, 0xc0, 0x3c, 0xed, 0x44, + 0x0b, 0x0d, 0x40, 0x3d, 0x0f, 0x44, 0xe4, 0x3b, 0xc0, 0x3d, 0x31, 0x12, + 0xc0, 0x3d, 0x3d, 0x00, 0x40, 0x3d, 0x49, 0xc3, 0x01, 0x97, 0x0f, 0xcc, + 0x29, 0xcf, 0x68, 0xeb, 0x01, 0x33, 0x89, 0x94, 0x0f, 0xa2, 0x12, 0x00, + 0x3d, 0x5b, 0x89, 0x0f, 0xca, 0xd1, 0x52, 0x4d, 0xb1, 0x40, 0x3d, 0x68, + 0x16, 0xc0, 0x3d, 0xee, 0x05, 0xc0, 0x3d, 0xf8, 0xd1, 0x50, 0x24, 0x0f, + 0xb0, 0x88, 0x15, 0xc0, 0x3e, 0x04, 0x42, 0x00, 0x99, 0xc0, 0x3e, 0x0e, + 0xc9, 0xa9, 0x3f, 0x00, 0x9b, 0x09, 0xc9, 0x11, 0xf6, 0x00, 0x9b, 0x11, + 0x12, 0xc0, 0x3e, 0x18, 0xcd, 0x2c, 0xb2, 0x00, 0x9b, 0x39, 0x46, 0x09, + 0x97, 0xc0, 0x3e, 0x24, 0x47, 0x34, 0x2f, 0xc0, 0x3e, 0x42, 0x4b, 0x8f, + 0x68, 0x40, 0x3e, 0x60, 0x07, 0xc0, 0x3e, 0x86, 0x47, 0xc5, 0x60, 0xc0, + 0x3e, 0xa1, 
0x88, 0x0f, 0xce, 0xe9, 0x4d, 0x7c, 0x67, 0x40, 0x3e, 0xad, + 0x00, 0xc0, 0x3f, 0x26, 0xc6, 0x59, 0xd6, 0x01, 0x33, 0x50, 0xc6, 0x31, + 0x92, 0x01, 0x38, 0x4b, 0x00, 0x3f, 0x36, 0xca, 0x3a, 0x52, 0x01, 0x1c, + 0x31, 0x42, 0x00, 0xa9, 0xc0, 0x3f, 0x3c, 0x00, 0xc0, 0x3f, 0x48, 0xc5, + 0xd6, 0x0f, 0x00, 0x00, 0x28, 0x4b, 0x98, 0x4d, 0xc0, 0x3f, 0x5a, 0x4b, + 0x97, 0x45, 0xc0, 0x3f, 0x66, 0x48, 0xb6, 0x9a, 0x40, 0x3f, 0x72, 0x42, + 0x00, 0x65, 0xc0, 0x3f, 0x7e, 0x0b, 0x40, 0x3f, 0x88, 0x46, 0xd2, 0x05, + 0xc0, 0x3f, 0x94, 0xc4, 0x61, 0x0d, 0x00, 0x00, 0xd8, 0xcc, 0x83, 0x9d, + 0x01, 0x08, 0x39, 0x42, 0x00, 0x79, 0x40, 0x3f, 0x9e, 0x95, 0x0f, 0xa2, + 0x01, 0xc7, 0xb4, 0xd2, 0x0f, 0xa2, 0x98, 0x0b, 0xc0, 0x3f, 0xb0, 0x4c, + 0x83, 0x55, 0xc0, 0x3f, 0xbc, 0x42, 0x00, 0xb1, 0xc0, 0x3f, 0xd8, 0x47, + 0xc7, 0x12, 0xc0, 0x3f, 0xe4, 0x47, 0xc7, 0xb3, 0x40, 0x40, 0x18, 0xc5, + 0xd8, 0x30, 0x0f, 0xcc, 0x69, 0xc4, 0xe0, 0xfb, 0x0f, 0x9e, 0x61, 0x03, + 0xc0, 0x40, 0x42, 0xc5, 0xd0, 0x38, 0x0f, 0xcb, 0xe9, 0x4c, 0x89, 0xe5, + 0x40, 0x40, 0x4c, 0x07, 0xc0, 0x40, 0xc0, 0x03, 0xc0, 0x40, 0xca, 0x0b, + 0xc0, 0x40, 0xe2, 0x11, 0x40, 0x40, 0xee, 0xc2, 0x00, 0xb1, 0x01, 0x34, + 0xcb, 0x00, 0x40, 0xfa, 0x0f, 0xc0, 0x41, 0x00, 0x11, 0xc0, 0x41, 0x0c, + 0xcf, 0x63, 0x4b, 0x01, 0x05, 0x81, 0xc3, 0x73, 0xfc, 0x0f, 0xce, 0xf1, + 0xc7, 0xc8, 0x23, 0x01, 0x80, 0x98, 0xca, 0xa5, 0xee, 0x01, 0x09, 0xb9, + 0x14, 0x40, 0x41, 0x18, 0xc6, 0xd2, 0xef, 0x0f, 0x9d, 0x91, 0xc4, 0xbc, + 0x5c, 0x0f, 0xce, 0x20, 0x11, 0xc0, 0x41, 0x25, 0xca, 0xa4, 0xae, 0x01, + 0x4f, 0x31, 0x03, 0x40, 0x41, 0x37, 0x43, 0x01, 0x95, 0xc0, 0x41, 0x43, + 0xd0, 0x5f, 0xc2, 0x01, 0x3e, 0x39, 0xcc, 0x89, 0xc1, 0x01, 0x31, 0x31, + 0x0b, 0xc0, 0x41, 0x4f, 0x45, 0x0c, 0x91, 0x40, 0x41, 0x5b, 0xc2, 0x00, + 0x29, 0x0f, 0xcd, 0x31, 0x4b, 0x96, 0xd7, 0x40, 0x41, 0x67, 0x47, 0xc0, + 0xc1, 0xc0, 0x41, 0x7f, 0x07, 0xc0, 0x41, 0x9d, 0x52, 0x28, 0xce, 0xc0, + 0x41, 0xa7, 0xc3, 0x00, 0x44, 0x0f, 0xce, 0x28, 0x07, 0xc0, 0x41, 0xad, + 0xc7, 0xc4, 0x10, 0x01, 0x36, 0x71, 0xc8, 0x12, 0x47, 0x01, 0x30, 0x69, + 0x42, 0x00, 0x43, 0x40, 0x41, 0xb7, 0x06, 0xc0, 0x41, 0xc6, 0x47, 0xc0, + 0x89, 0xc0, 0x41, 0xd0, 0xc3, 0x0d, 0x14, 0x0f, 0xd6, 0x90, 0x16, 0xc0, + 0x41, 0xf8, 0xc8, 0xb8, 0x4a, 0x01, 0x09, 0x28, 0x42, 0x00, 0x2a, 0xc0, + 0x42, 0x04, 0x16, 0x40, 0x42, 0x28, 0xd1, 0x53, 0xdc, 0x01, 0x1f, 0xf9, + 0x46, 0x38, 0xe8, 0xc0, 0x42, 0x34, 0xda, 0x1c, 0x52, 0x07, 0xff, 0xe0, + 0x0e, 0xc0, 0x42, 0x40, 0xcb, 0x8e, 0x34, 0x0f, 0xcb, 0xa8, 0x44, 0x78, + 0xf3, 0xc0, 0x42, 0x4f, 0xc4, 0xcc, 0x91, 0x00, 0x16, 0xd8, 0x46, 0xd1, + 0xbd, 0xc0, 0x42, 0x67, 0x44, 0x3c, 0x52, 0x40, 0x42, 0x73, 0x46, 0xcd, + 0x37, 0xc0, 0x42, 0x7f, 0x51, 0x50, 0x35, 0xc0, 0x42, 0xc2, 0x4a, 0x51, + 0x89, 0x40, 0x42, 0xda, 0x15, 0xc0, 0x42, 0xf2, 0x42, 0x01, 0x0e, 0xc0, + 0x42, 0xfe, 0x48, 0x10, 0xb4, 0xc0, 0x43, 0x0a, 0x45, 0x01, 0xc3, 0xc0, + 0x43, 0x16, 0xd4, 0x3b, 0xd8, 0x08, 0xd1, 0x99, 0x47, 0x02, 0x0e, 0xc0, + 0x43, 0x2e, 0x46, 0x34, 0x6f, 0x40, 0x43, 0x8a, 0xce, 0x6d, 0xcc, 0x01, + 0x17, 0xf9, 0x14, 0xc0, 0x43, 0x96, 0x15, 0xc0, 0x43, 0xa8, 0x45, 0x00, + 0x49, 0xc0, 0x43, 0xb4, 0xca, 0x9c, 0xe8, 0x01, 0x4c, 0x11, 0xd6, 0x2c, + 0x02, 0x01, 0x53, 0x20, 0x49, 0xaf, 0xe4, 0xc0, 0x43, 0xc0, 0xc2, 0x11, + 0xa5, 0x01, 0x5f, 0x11, 0xc8, 0xb6, 0x3a, 0x0f, 0xcc, 0x98, 0x47, 0xca, + 0x14, 0xc0, 0x43, 0xd2, 0x47, 0xc0, 0xf2, 0xc0, 0x44, 0x02, 0xcc, 0x8b, + 0x41, 0x0f, 0x9c, 0x19, 0x94, 0x0f, 0xd6, 0xc8, 0xc2, 0x00, 0x10, 0x01, + 0x35, 0xa9, 0xc5, 0xd7, 0xe5, 0x01, 0x32, 0x19, 0xc6, 0xd1, 0x2d, 0x0f, + 0xc9, 0xc8, 
0xc6, 0xd1, 0x09, 0x0f, 0xab, 0xc9, 0xc2, 0x00, 0x74, 0x01, + 0x50, 0xe8, 0xc9, 0x48, 0xa4, 0x01, 0x33, 0x49, 0x42, 0x02, 0xbc, 0xc0, + 0x44, 0x32, 0xd9, 0x1e, 0x37, 0x01, 0x50, 0xb0, 0xcb, 0x5a, 0x97, 0x01, + 0x12, 0xf9, 0x00, 0x40, 0x44, 0x3e, 0xc6, 0xcb, 0xb7, 0x01, 0x31, 0x79, + 0x00, 0x40, 0x44, 0x4a, 0x45, 0xd4, 0x89, 0xc0, 0x44, 0x56, 0xca, 0xa4, + 0x7c, 0x0f, 0xa4, 0xd9, 0xc6, 0x08, 0xea, 0x00, 0x05, 0x28, 0x42, 0x00, + 0x89, 0xc0, 0x44, 0x68, 0xc8, 0xb9, 0x1a, 0x0f, 0xcb, 0x59, 0xc2, 0x49, + 0x0c, 0x0f, 0xb7, 0xb1, 0x50, 0x5b, 0x52, 0xc0, 0x44, 0x73, 0x06, 0x40, + 0x44, 0xf5, 0xc8, 0xb9, 0x32, 0x01, 0x36, 0x81, 0x07, 0xc0, 0x44, 0xff, + 0x42, 0x00, 0xa9, 0xc0, 0x45, 0x0c, 0x11, 0xc0, 0x45, 0x1b, 0x12, 0xc0, + 0x45, 0x25, 0x14, 0xc0, 0x45, 0x31, 0x4b, 0x8c, 0x62, 0x40, 0x45, 0x3d, + 0xc6, 0xcb, 0x75, 0x01, 0x32, 0x89, 0xc6, 0xd2, 0x53, 0x01, 0x71, 0xf8, + 0xc5, 0xd1, 0xee, 0x01, 0x31, 0x21, 0xc5, 0xda, 0x47, 0x01, 0x08, 0x30, + 0xc9, 0x08, 0xe7, 0x01, 0x31, 0x09, 0x50, 0x59, 0x12, 0x40, 0x45, 0xb5, + 0xc3, 0x03, 0xd9, 0x0f, 0xa7, 0xbb, 0x00, 0x45, 0xc1, 0xc4, 0x2a, 0xa0, + 0x0f, 0x9e, 0xa8, 0xc5, 0x79, 0x8a, 0x0f, 0xa6, 0x29, 0xc9, 0xac, 0x57, + 0x0f, 0xc8, 0xc8, 0xc5, 0x11, 0x55, 0x0f, 0xa1, 0x8a, 0x00, 0x45, 0xc7, + 0x42, 0xbe, 0x99, 0xc0, 0x45, 0xcd, 0x08, 0x40, 0x45, 0xd9, 0x14, 0xc0, + 0x45, 0xe1, 0x05, 0xc0, 0x45, 0xeb, 0x15, 0xc0, 0x46, 0x05, 0x12, 0xc0, + 0x46, 0x29, 0x04, 0xc0, 0x46, 0x35, 0x16, 0xc0, 0x46, 0x4b, 0x46, 0xd0, + 0x31, 0xc0, 0x46, 0x63, 0x06, 0xc0, 0x46, 0x6f, 0x0e, 0xc0, 0x46, 0x81, + 0x0a, 0xc0, 0x46, 0x8d, 0x0f, 0xc0, 0x46, 0x9f, 0x19, 0xc0, 0x46, 0xa7, + 0x08, 0xc0, 0x46, 0xb1, 0x0c, 0xc0, 0x46, 0xbd, 0x07, 0xc0, 0x46, 0xc9, + 0x44, 0xe3, 0xb7, 0xc0, 0x46, 0xdb, 0xc3, 0x1a, 0x7c, 0x01, 0x75, 0xc9, + 0x09, 0x40, 0x46, 0xeb, 0x96, 0x01, 0x8e, 0x03, 0x00, 0x46, 0xf7, 0xc2, + 0x47, 0xa4, 0x01, 0x8e, 0x09, 0xc2, 0xe5, 0x85, 0x01, 0x8e, 0x11, 0xc3, + 0xe5, 0x84, 0x01, 0x8e, 0x19, 0x95, 0x01, 0x8e, 0x8b, 0x00, 0x46, 0xfb, + 0x8a, 0x01, 0x8e, 0x83, 0x00, 0x47, 0x15, 0x90, 0x01, 0x8e, 0x79, 0x92, + 0x01, 0x8e, 0x93, 0x00, 0x47, 0x2d, 0x86, 0x01, 0x8e, 0xa1, 0x93, 0x01, + 0x8f, 0x18, 0x42, 0x00, 0x3b, 0xc0, 0x47, 0x39, 0x07, 0xc0, 0x47, 0x48, + 0x14, 0xc0, 0x47, 0x54, 0xcb, 0x94, 0xc7, 0x0f, 0x9e, 0x09, 0xc5, 0xdc, + 0x45, 0x0f, 0x99, 0x80, 0x0b, 0xc0, 0x47, 0x5e, 0x14, 0xc0, 0x47, 0x68, + 0x44, 0xe0, 0xa7, 0xc0, 0x47, 0x74, 0x42, 0x00, 0x47, 0x40, 0x47, 0x9e, + 0xc3, 0x01, 0xe7, 0x01, 0x35, 0xb9, 0xc4, 0x79, 0xe6, 0x01, 0x31, 0x39, + 0xc5, 0xd7, 0x2c, 0x0f, 0xa1, 0xf9, 0xc4, 0xe3, 0x6f, 0x0f, 0xa0, 0xa1, + 0xc2, 0x18, 0xb3, 0x0f, 0xce, 0x92, 0x00, 0x47, 0xbc, 0x48, 0xbe, 0x52, + 0xc0, 0x47, 0xc2, 0xca, 0xa7, 0x56, 0x0f, 0x9b, 0x59, 0xc7, 0xc0, 0x6d, + 0x0f, 0xcb, 0x10, 0xc3, 0x1c, 0xe6, 0x0f, 0xd3, 0xe1, 0xca, 0xa6, 0xf2, + 0x01, 0x05, 0x10, 0x44, 0x00, 0x74, 0xc0, 0x47, 0xce, 0xc9, 0xad, 0x89, + 0x0f, 0xa9, 0x70, 0x42, 0x00, 0xcc, 0xc0, 0x47, 0xda, 0xc2, 0x01, 0x48, + 0x0f, 0xa2, 0x89, 0xc6, 0xcc, 0xc5, 0x0f, 0xa0, 0x51, 0xc6, 0xd2, 0xd7, + 0x0f, 0xca, 0x80, 0xc8, 0xb9, 0x92, 0x0f, 0xa5, 0x99, 0xca, 0x39, 0x0b, + 0x0f, 0x98, 0xc8, 0xcd, 0x7b, 0x7d, 0x0f, 0x9e, 0x78, 0xc4, 0x9e, 0x3a, + 0x0f, 0xcb, 0x29, 0x0d, 0x40, 0x47, 0xea, 0x47, 0x1d, 0xd4, 0xc0, 0x47, + 0xf6, 0xc2, 0x00, 0x3d, 0x01, 0x30, 0x21, 0x12, 0xc0, 0x48, 0x5c, 0x0f, + 0x40, 0x48, 0x74, 0x42, 0x00, 0x84, 0xc0, 0x48, 0x7e, 0xce, 0x6e, 0x58, + 0x0f, 0xa4, 0x89, 0xcb, 0x96, 0xab, 0x0f, 0xb6, 0x58, 0xc8, 0xb7, 0x4a, + 0x01, 0x30, 0x61, 0x16, 0xc0, 0x48, 0x8a, 0xca, 0xa0, 0xe4, 0x01, 0x19, + 0x91, 0x4a, 
0x9c, 0x3e, 0xc0, 0x48, 0xa2, 0xce, 0x73, 0xfa, 0x0f, 0x9f, + 0x51, 0x08, 0xc0, 0x48, 0xae, 0xd5, 0x33, 0xa7, 0x01, 0x53, 0x68, 0xcb, + 0x99, 0x29, 0x01, 0x12, 0xc1, 0xc2, 0x00, 0x65, 0x0f, 0xd5, 0xc1, 0xd2, + 0x4b, 0xa7, 0x01, 0x72, 0x78, 0xc2, 0x00, 0x45, 0x00, 0x01, 0xd3, 0x00, + 0x48, 0xc0, 0xcd, 0x76, 0x9d, 0x0f, 0xa5, 0x28, 0x0b, 0xc0, 0x48, 0xc4, + 0xc7, 0xc5, 0x28, 0x0f, 0x9a, 0xd0, 0xc5, 0x11, 0x55, 0x0f, 0xa1, 0x70, + 0x1b, 0xc0, 0x48, 0xce, 0x44, 0x1b, 0xaa, 0x40, 0x48, 0xda, 0x46, 0x83, + 0x27, 0xc0, 0x48, 0xf8, 0xc6, 0xca, 0x97, 0x0f, 0xa6, 0x58, 0xc7, 0x72, + 0xbf, 0x0f, 0xc9, 0x09, 0x42, 0x00, 0x40, 0xc0, 0x49, 0x04, 0x42, 0x00, + 0x3b, 0xc0, 0x49, 0x10, 0xc2, 0x04, 0x3d, 0x01, 0x30, 0x0a, 0x00, 0x49, + 0x1c, 0xd3, 0x46, 0x90, 0x0f, 0xac, 0x09, 0x42, 0x02, 0xaf, 0xc0, 0x49, + 0x22, 0xcf, 0x69, 0x09, 0x0f, 0x9e, 0xd8, 0x42, 0x00, 0x49, 0xc0, 0x49, + 0x2e, 0x17, 0x40, 0x49, 0x38, 0xc8, 0xbe, 0x8a, 0x0f, 0x98, 0x30, 0xc3, + 0xe5, 0x15, 0x0f, 0xb6, 0x19, 0xc3, 0x01, 0x4b, 0x0f, 0x9b, 0x70, 0x45, + 0x00, 0xba, 0xc0, 0x49, 0x4a, 0x51, 0x4e, 0xf2, 0xc0, 0x49, 0x9a, 0x4d, + 0x77, 0xc8, 0x40, 0x49, 0xac, 0x0e, 0xc0, 0x49, 0xc6, 0xe0, 0x00, 0xa7, + 0x01, 0x3b, 0x09, 0x14, 0x40, 0x49, 0xd2, 0x00, 0xc0, 0x49, 0xde, 0xc3, + 0x2e, 0xab, 0x01, 0x5f, 0x01, 0xc4, 0x2a, 0x3e, 0x0f, 0xce, 0x08, 0x42, + 0x01, 0x19, 0xc0, 0x49, 0xea, 0xc5, 0x00, 0xb9, 0x00, 0x05, 0x10, 0xc5, + 0x00, 0xb9, 0x01, 0x05, 0xa9, 0xc3, 0x12, 0xad, 0x00, 0x05, 0xc0, 0x50, + 0x5b, 0x72, 0xc0, 0x49, 0xf6, 0x4d, 0x76, 0xde, 0x40, 0x4a, 0x04, 0x47, + 0x02, 0x0e, 0xc0, 0x4a, 0x48, 0x47, 0x0a, 0xda, 0xc0, 0x4a, 0x5a, 0x49, + 0x0b, 0x17, 0xc0, 0x4a, 0x66, 0xce, 0x74, 0xb0, 0x00, 0x24, 0x11, 0xc6, + 0x4a, 0x9f, 0x05, 0x33, 0xf1, 0xc7, 0xc7, 0x27, 0x05, 0x33, 0xf8, 0xce, + 0x74, 0xf6, 0x00, 0x04, 0x99, 0xc5, 0x1d, 0x1d, 0x01, 0x10, 0xb0, 0x49, + 0xb0, 0x2c, 0x40, 0x4a, 0x72, 0x8e, 0x0f, 0xcd, 0x69, 0x96, 0x0f, 0xa5, + 0xd0, 0xcb, 0x94, 0xd2, 0x01, 0x35, 0xe1, 0xc7, 0xb3, 0x85, 0x07, 0xf2, + 0x28, 0xc7, 0xc5, 0x36, 0x01, 0x35, 0xd1, 0x06, 0xc0, 0x4a, 0x96, 0xc5, + 0x33, 0x24, 0x00, 0x01, 0xd8, 0x16, 0xc0, 0x4a, 0x9c, 0xcf, 0x62, 0xc4, + 0x0f, 0xca, 0x40, 0xc9, 0xb2, 0x7e, 0x01, 0x09, 0x01, 0x45, 0x29, 0x7c, + 0x40, 0x4a, 0xa8, 0xc5, 0xda, 0xce, 0x0f, 0x99, 0x89, 0xcf, 0x6b, 0x34, + 0x0f, 0xb2, 0x40, 0x43, 0x01, 0x97, 0xc0, 0x4a, 0xae, 0xc6, 0xd2, 0x41, + 0x01, 0x11, 0xf9, 0x45, 0xd6, 0x7d, 0x40, 0x4a, 0xb8, 0x48, 0xbe, 0x22, + 0xc0, 0x4a, 0xd4, 0xcd, 0x75, 0x65, 0x0f, 0xc8, 0xc0, 0x42, 0x00, 0xaf, + 0xc0, 0x4b, 0x26, 0xd5, 0x34, 0x3a, 0x01, 0x39, 0xd1, 0xcd, 0x79, 0x41, + 0x01, 0x00, 0x30, 0x45, 0xdb, 0xa0, 0xc0, 0x4b, 0x32, 0x46, 0x39, 0xfb, + 0x40, 0x4b, 0x52, 0xcd, 0x7d, 0xd3, 0x01, 0x53, 0x61, 0x43, 0x05, 0xb2, + 0xc0, 0x4b, 0x5e, 0x46, 0x00, 0xd4, 0x40, 0x4b, 0x6a, 0xc8, 0xbc, 0xea, + 0x0f, 0xd3, 0xd1, 0x42, 0x00, 0xc2, 0xc0, 0x4b, 0x76, 0xd3, 0x41, 0x84, + 0x01, 0x71, 0xe0, 0x16, 0xc0, 0x4b, 0x82, 0x14, 0xc0, 0x4b, 0x8e, 0x46, + 0xd2, 0xf5, 0xc0, 0x4b, 0x98, 0xcd, 0x31, 0x8b, 0x0f, 0xac, 0x19, 0xc4, + 0x01, 0xdd, 0x0f, 0x9e, 0xf9, 0xcc, 0x83, 0x85, 0x0f, 0xce, 0x68, 0xd7, + 0x28, 0xb6, 0x01, 0x39, 0x49, 0x03, 0xc0, 0x4b, 0xa4, 0x0b, 0x40, 0x4b, + 0xb0, 0xc6, 0xcc, 0xf5, 0x01, 0x1f, 0x89, 0xc8, 0xb5, 0x72, 0x0f, 0xaf, + 0x00, 0xce, 0x73, 0x60, 0x0f, 0x9c, 0xc9, 0xc2, 0x00, 0xb0, 0x0f, 0xb6, + 0x99, 0xce, 0x71, 0x68, 0x0f, 0xca, 0xc8, 0x00, 0x40, 0x4b, 0xbc, 0x16, + 0xc0, 0x4b, 0xc8, 0xca, 0x85, 0xc7, 0x0f, 0xd7, 0x08, 0xc4, 0xba, 0xe0, + 0x0f, 0xcc, 0xa9, 0x47, 0xc2, 0xea, 0x40, 0x4b, 0xd4, 0x48, 0x10, 0xc1, + 0xc0, 0x4b, 
0xf0, 0xc5, 0xdb, 0x0f, 0x0f, 0xcb, 0x50, 0xc3, 0x05, 0x9f, + 0x01, 0x32, 0x21, 0xc6, 0xce, 0x6f, 0x0f, 0xb7, 0x82, 0x00, 0x4b, 0xfc, + 0x4c, 0x11, 0xe2, 0xc0, 0x4c, 0x02, 0xd1, 0x48, 0x11, 0x00, 0x41, 0xb1, + 0x0f, 0xc0, 0x4c, 0x2c, 0x4b, 0x6f, 0xc7, 0xc0, 0x4c, 0x38, 0x47, 0x02, + 0x0e, 0x40, 0x4c, 0x5c, 0xc4, 0xde, 0xd7, 0x0f, 0xcd, 0xd1, 0xc3, 0x0e, + 0x61, 0x0f, 0xcf, 0xb8, 0xc2, 0x1e, 0xd5, 0x0f, 0xcd, 0x41, 0xc2, 0x02, + 0xa7, 0x0f, 0xa4, 0x02, 0x00, 0x4c, 0xb4, 0xc2, 0x00, 0x29, 0x01, 0x37, + 0xb9, 0xcd, 0x77, 0x46, 0x0f, 0x9d, 0xf8, 0x16, 0xc0, 0x4c, 0xba, 0x12, + 0x40, 0x4c, 0xc4, 0x86, 0x0f, 0xb7, 0xb9, 0xca, 0x9e, 0x3c, 0x0f, 0xab, + 0xa9, 0x42, 0x02, 0x37, 0x40, 0x4c, 0xce, 0x46, 0x70, 0xd0, 0xc0, 0x4c, + 0xda, 0xcb, 0x96, 0xcc, 0x0f, 0x9a, 0xa8, 0x45, 0x00, 0xdd, 0xc0, 0x4c, + 0xe6, 0xce, 0x70, 0x96, 0x05, 0x33, 0x98, 0xc3, 0x15, 0x0f, 0x0f, 0xcc, + 0x81, 0xc2, 0x0b, 0x47, 0x0f, 0xc9, 0xb8, 0x14, 0xc0, 0x4c, 0xf2, 0x4c, + 0x01, 0xf6, 0xc0, 0x4c, 0xfc, 0xc5, 0xda, 0x6a, 0x01, 0x30, 0xc1, 0x18, + 0xc0, 0x4d, 0x0e, 0xd0, 0x5b, 0xf2, 0x0f, 0xca, 0xc0, 0xc3, 0x00, 0x28, + 0x0f, 0xb5, 0xf9, 0x42, 0x00, 0x61, 0xc0, 0x4d, 0x1a, 0xd0, 0x5e, 0x42, + 0x01, 0x1b, 0xe9, 0xca, 0x9a, 0x72, 0x0f, 0x99, 0x01, 0x46, 0x2a, 0x9f, + 0xc0, 0x4d, 0x2e, 0xdd, 0x11, 0x51, 0x0f, 0xc9, 0x78, 0xca, 0xa2, 0x10, + 0x01, 0x37, 0x49, 0x43, 0x00, 0x4b, 0xc0, 0x4d, 0x3a, 0x92, 0x0f, 0xb5, + 0x11, 0xc3, 0x19, 0x78, 0x0f, 0xb7, 0x08, 0x43, 0xc4, 0x20, 0xc0, 0x4d, + 0x46, 0xc4, 0xc0, 0x85, 0x0f, 0xb7, 0xa0, 0xc3, 0x00, 0xca, 0x01, 0x34, + 0xb1, 0xc2, 0x15, 0x13, 0x0f, 0xcf, 0x18, 0x44, 0x07, 0x31, 0xc0, 0x4d, + 0x52, 0xc4, 0x44, 0xba, 0x01, 0x08, 0x41, 0x07, 0xc0, 0x4d, 0x64, 0xc3, + 0x1f, 0x48, 0x0f, 0xa6, 0xe0, 0xc8, 0xbb, 0xba, 0x0f, 0x9c, 0x90, 0xc5, + 0x2a, 0x94, 0x01, 0x3a, 0x21, 0xc3, 0x12, 0xb8, 0x01, 0x30, 0x1b, 0x00, + 0x4d, 0x70, 0xd0, 0x5f, 0xf2, 0x0f, 0x9e, 0xa1, 0xc7, 0xca, 0x61, 0x0f, + 0x9e, 0x10, 0xc2, 0x00, 0x71, 0x0f, 0xa0, 0x61, 0xc2, 0x00, 0x3c, 0x0f, + 0xa0, 0x68, 0x43, 0x00, 0x8e, 0xc0, 0x4d, 0x76, 0xd6, 0x2c, 0x18, 0x01, + 0x08, 0xb8, 0xd6, 0x1f, 0x7f, 0x0f, 0xb3, 0x53, 0x00, 0x4d, 0x82, 0xc2, + 0x11, 0xa5, 0x00, 0x01, 0x7a, 0x00, 0x4d, 0x88, 0x4e, 0x6d, 0x16, 0xc0, + 0x4d, 0x8e, 0xdb, 0x15, 0xcc, 0x08, 0xd5, 0x03, 0x00, 0x4d, 0x96, 0x45, + 0x01, 0xc3, 0xc0, 0x4d, 0x9c, 0x15, 0xc0, 0x4d, 0xb4, 0xcf, 0x63, 0xff, + 0x08, 0xd4, 0xc1, 0x55, 0x34, 0x79, 0xc0, 0x4d, 0xc0, 0x57, 0x26, 0xd3, + 0xc0, 0x4d, 0xf0, 0x47, 0x02, 0x0e, 0xc0, 0x4e, 0x00, 0x46, 0x34, 0x6f, + 0x40, 0x4e, 0x5a, 0xc8, 0xb7, 0x5a, 0x01, 0x35, 0xe9, 0xc2, 0x01, 0x26, + 0x0f, 0xcf, 0x30, 0xd4, 0x3e, 0x44, 0x01, 0x1c, 0xa1, 0x00, 0xc0, 0x4e, + 0x66, 0xc4, 0x15, 0x2e, 0x0f, 0xca, 0x70, 0x46, 0x09, 0x97, 0xc0, 0x4e, + 0x78, 0x47, 0x02, 0x0e, 0x40, 0x4e, 0x9c, 0x4c, 0x11, 0xe2, 0xc0, 0x4f, + 0x16, 0x47, 0x34, 0x2f, 0xc0, 0x4f, 0x28, 0x4a, 0x51, 0x89, 0xc0, 0x4f, + 0x35, 0xd0, 0x59, 0xf2, 0x08, 0x7a, 0x29, 0x47, 0x02, 0x0e, 0x40, 0x4f, + 0x5f, 0x42, 0x01, 0x19, 0xc0, 0x4f, 0xbc, 0xd8, 0x24, 0x6b, 0x01, 0x3d, + 0x38, 0x48, 0x19, 0xd4, 0xc0, 0x4f, 0xc6, 0xc5, 0xda, 0xc9, 0x01, 0x19, + 0x78, 0xc6, 0xd2, 0xdd, 0x0f, 0xaa, 0x69, 0xcd, 0x6a, 0x0a, 0x00, 0x00, + 0xb0, 0x43, 0x68, 0xf2, 0xc0, 0x50, 0x1a, 0xc3, 0x09, 0x3a, 0x0f, 0xa4, + 0x48, 0x47, 0x02, 0x0e, 0xc0, 0x50, 0x72, 0x45, 0x00, 0xba, 0xc0, 0x50, + 0xc8, 0x4b, 0x6f, 0xc7, 0xc0, 0x50, 0xd8, 0x4c, 0x85, 0xa1, 0x40, 0x50, + 0xee, 0x07, 0xc0, 0x50, 0xfe, 0xca, 0xa4, 0xe0, 0x01, 0x05, 0xb9, 0x42, + 0x06, 0x4e, 0x40, 0x51, 0x0a, 0x43, 0x1b, 0x32, 0xc0, 0x51, 0x1f, 0xc6, + 0xce, 0xff, 
0x0f, 0x9a, 0xe9, 0xc2, 0x00, 0x89, 0x00, 0x01, 0x00, 0x49, + 0x6e, 0x41, 0x40, 0x51, 0x2c, 0x44, 0x03, 0xda, 0xc0, 0x51, 0x38, 0xc3, + 0x01, 0xe5, 0x0f, 0xab, 0xba, 0x00, 0x51, 0x4a, 0xc9, 0xac, 0xde, 0x0f, + 0x9e, 0x29, 0xcb, 0x94, 0x01, 0x0f, 0xa1, 0x99, 0x11, 0xc0, 0x51, 0x50, + 0xc3, 0x09, 0x3a, 0x0f, 0xcf, 0xe8, 0x15, 0xc0, 0x51, 0x5a, 0xc4, 0xdf, + 0x9b, 0x0f, 0xcd, 0xc1, 0xc7, 0xc8, 0xb6, 0x0f, 0xcd, 0xc8, 0x00, 0xc0, + 0x51, 0x66, 0x47, 0xc3, 0xed, 0xc0, 0x51, 0x72, 0xc6, 0x91, 0xd5, 0x0f, + 0x99, 0xd9, 0xc4, 0xaf, 0x8f, 0x0f, 0x98, 0x2b, 0x00, 0x51, 0x9c, 0xd2, + 0x4a, 0xf3, 0x0f, 0x98, 0x38, 0xc6, 0x07, 0x9a, 0x01, 0x1d, 0x99, 0xc3, + 0x00, 0xf1, 0x01, 0x1d, 0x91, 0xcd, 0x7b, 0x97, 0x01, 0x50, 0x58, 0x00, + 0x40, 0x51, 0xa2, 0x43, 0x00, 0x3d, 0xc0, 0x51, 0xba, 0x46, 0x07, 0x2f, + 0xc0, 0x51, 0xcf, 0xc6, 0xb0, 0xf5, 0x00, 0x00, 0xd0, 0xcc, 0x81, 0x5d, + 0x01, 0x11, 0x79, 0xc2, 0x00, 0x29, 0x0f, 0x9e, 0x20, 0xc2, 0x00, 0x0a, + 0x0f, 0x9b, 0x19, 0xcf, 0x61, 0x7a, 0x0f, 0xb4, 0xf8, 0x0e, 0xc0, 0x52, + 0x09, 0xca, 0xa1, 0x3e, 0x0f, 0xb0, 0x78, 0x42, 0x02, 0xa7, 0xc0, 0x52, + 0x13, 0xca, 0x4a, 0x11, 0x01, 0x51, 0x98, 0xd5, 0x36, 0x1d, 0x0f, 0xb3, + 0xa9, 0x90, 0x0f, 0xcd, 0x10, 0x42, 0x02, 0x41, 0xc0, 0x52, 0x20, 0x10, + 0xc0, 0x52, 0x2c, 0xc2, 0x00, 0x4e, 0x01, 0x01, 0x90, 0xc9, 0xb2, 0x87, + 0x0f, 0xcd, 0x79, 0xc7, 0xc7, 0xcf, 0x01, 0x18, 0x29, 0x12, 0xc0, 0x52, + 0x39, 0xc7, 0xc4, 0x1e, 0x01, 0x5e, 0xc1, 0xcc, 0x88, 0xb9, 0x0f, 0xb6, + 0x38, 0xca, 0x9b, 0xb2, 0x01, 0x1c, 0xb9, 0xc5, 0xbf, 0x4d, 0x01, 0x13, + 0xd3, 0x00, 0x52, 0x48, 0x15, 0xc0, 0x52, 0x4c, 0x46, 0xcf, 0xd1, 0xc0, + 0x52, 0x58, 0xc4, 0xde, 0xe3, 0x0f, 0xcb, 0x40, 0x05, 0xc0, 0x52, 0x6a, + 0xcc, 0x83, 0xb5, 0x01, 0x08, 0x73, 0x00, 0x52, 0x76, 0x1b, 0x40, 0x52, + 0x7c, 0xc2, 0x00, 0xf1, 0x01, 0x32, 0x3b, 0x00, 0x52, 0x88, 0x15, 0xc0, + 0x52, 0x8e, 0xc4, 0x09, 0x3a, 0x0f, 0xd5, 0x00, 0x42, 0x11, 0xee, 0xc0, + 0x52, 0x9d, 0xca, 0x0e, 0x64, 0x01, 0x39, 0x79, 0x07, 0xc0, 0x52, 0xa9, + 0xc3, 0x13, 0x4e, 0x0f, 0xd4, 0x28, 0xc8, 0xbb, 0x9a, 0x0f, 0xb7, 0xd8, + 0xc3, 0x4c, 0xa1, 0x01, 0x32, 0x99, 0xc3, 0x1a, 0x2e, 0x0f, 0xa9, 0x58, + 0xcd, 0x7d, 0x44, 0x01, 0x56, 0xd0, 0xc8, 0xb8, 0xf2, 0x0f, 0xa5, 0x49, + 0x8e, 0x0f, 0xa4, 0x51, 0xc9, 0x92, 0xda, 0x00, 0x05, 0xb0, 0x00, 0x40, + 0x52, 0xb5, 0xcc, 0x85, 0xe9, 0x0f, 0xb6, 0x11, 0x49, 0xab, 0xa3, 0xc0, + 0x52, 0xc1, 0x07, 0x40, 0x52, 0xcd, 0x87, 0x0f, 0xae, 0x7b, 0x00, 0x52, + 0xd9, 0xc3, 0x7f, 0x6c, 0x0f, 0xb6, 0xa0, 0x16, 0xc0, 0x52, 0xe5, 0x4b, + 0x8d, 0x9a, 0xc0, 0x52, 0xfd, 0x03, 0xc0, 0x53, 0x21, 0xc3, 0x2a, 0xf6, + 0x0f, 0xcc, 0xe0, 0xcc, 0x23, 0x33, 0x08, 0xd7, 0xab, 0x00, 0x53, 0x33, + 0x0e, 0xc0, 0x53, 0x37, 0xce, 0x75, 0x3c, 0x08, 0xd7, 0x7b, 0x00, 0x53, + 0x46, 0x47, 0xc1, 0x07, 0xc0, 0x53, 0x4a, 0xcb, 0x5a, 0x32, 0x08, 0xd7, + 0x32, 0x00, 0x53, 0x5c, 0xc3, 0x03, 0x03, 0x01, 0x35, 0xa1, 0x0f, 0x40, + 0x53, 0x60, 0x05, 0xc0, 0x53, 0x70, 0x45, 0x00, 0xba, 0xc0, 0x53, 0x7c, + 0x47, 0x34, 0x2f, 0xc0, 0x53, 0xb4, 0x46, 0x09, 0x97, 0xc0, 0x53, 0xc4, + 0x49, 0xaa, 0x7a, 0xc0, 0x53, 0xe8, 0x47, 0xc1, 0xd2, 0x40, 0x53, 0xfa, + 0xc7, 0xc4, 0x3a, 0x0f, 0xa1, 0xe1, 0xc5, 0xdd, 0x44, 0x0f, 0xca, 0xf0, + 0x03, 0xc0, 0x54, 0x12, 0xc8, 0x5b, 0xfa, 0x0f, 0x9b, 0x91, 0xc9, 0xad, + 0xfe, 0x0f, 0xd5, 0xa0, 0x45, 0x00, 0x73, 0xc0, 0x54, 0x1e, 0xc8, 0xb8, + 0x2a, 0x0f, 0x9a, 0xb9, 0xc7, 0x42, 0xd3, 0x00, 0x05, 0x19, 0xcb, 0x95, + 0xb9, 0x0f, 0xd6, 0xb9, 0xc2, 0x11, 0xee, 0x0f, 0xa2, 0xe8, 0x15, 0xc0, + 0x54, 0x2a, 0x42, 0x00, 0x45, 0x40, 0x54, 0x36, 0xcf, 0x5f, 0x33, 0x01, + 0x18, 0xb1, 
0x16, 0xc0, 0x54, 0x42, 0xc5, 0xd9, 0x66, 0x01, 0x5f, 0x38, + 0x4d, 0x7e, 0xe4, 0xc0, 0x54, 0x4e, 0xc4, 0x13, 0x66, 0x0f, 0x9b, 0xf8, + 0xc3, 0x63, 0x7e, 0x0f, 0xb4, 0x9b, 0x00, 0x54, 0x5a, 0xc7, 0xc9, 0x7a, + 0x0f, 0xa3, 0x70, 0xca, 0x8b, 0x2b, 0x01, 0x3e, 0x13, 0x00, 0x54, 0x60, + 0x15, 0xc0, 0x54, 0x66, 0xd1, 0x51, 0xef, 0x01, 0x33, 0xf1, 0x00, 0xc0, + 0x54, 0x78, 0xcc, 0x85, 0x89, 0x0f, 0x9d, 0x69, 0xc9, 0x8e, 0x15, 0x00, + 0x01, 0x28, 0xc3, 0xb3, 0xd0, 0x01, 0x38, 0x79, 0xc6, 0x16, 0x32, 0x01, + 0x37, 0x21, 0xd6, 0x31, 0x82, 0x0f, 0xac, 0x31, 0xc9, 0xaa, 0xd4, 0x0f, + 0xb0, 0xa1, 0xc4, 0xe0, 0x73, 0x0f, 0xa1, 0x38, 0x05, 0xc0, 0x54, 0x8a, + 0x94, 0x0f, 0x9a, 0x81, 0xc4, 0xe4, 0x6b, 0x0f, 0xca, 0xe0, 0xc6, 0xa4, + 0xe4, 0x01, 0x05, 0x89, 0xc8, 0xb5, 0x6a, 0x01, 0x05, 0x38, 0xcb, 0x9a, + 0x31, 0x01, 0x00, 0x41, 0xcf, 0x62, 0x79, 0x01, 0x72, 0x70, 0xc9, 0xad, + 0x92, 0x0f, 0xa4, 0xe1, 0xc2, 0x00, 0x40, 0x0f, 0xa2, 0xd8, 0x16, 0xc0, + 0x54, 0x9a, 0xc3, 0x05, 0x14, 0x08, 0x5d, 0x4b, 0x00, 0x54, 0xaa, 0xc4, + 0x09, 0x9d, 0x08, 0x5d, 0x60, 0xc3, 0x02, 0xa3, 0x08, 0x5c, 0xe1, 0xc5, + 0x0d, 0x20, 0x08, 0x5c, 0xd8, 0xc3, 0xb5, 0x3e, 0x08, 0x5c, 0x89, 0x15, + 0xc0, 0x54, 0xb0, 0xc2, 0x00, 0x67, 0x08, 0x5c, 0x71, 0xc3, 0x20, 0x18, + 0x08, 0x5c, 0x61, 0xc8, 0xb9, 0x7a, 0x08, 0x5c, 0x59, 0xc6, 0xcf, 0xd7, + 0x08, 0x5c, 0x51, 0xc4, 0xe0, 0xe7, 0x08, 0x5c, 0x49, 0xc4, 0x4a, 0xb9, + 0x08, 0x5c, 0x41, 0xc2, 0x01, 0x7f, 0x08, 0x5c, 0x23, 0x00, 0x54, 0xba, + 0xc5, 0x4a, 0xb3, 0x08, 0x5c, 0x31, 0xcd, 0x7e, 0x89, 0x08, 0x5c, 0x29, + 0xc6, 0x40, 0x9a, 0x08, 0x5c, 0x19, 0xc5, 0x9c, 0xa2, 0x08, 0x5c, 0x11, + 0xc4, 0xe3, 0x27, 0x08, 0x5c, 0x09, 0xc5, 0xa5, 0xfd, 0x08, 0x5c, 0x00, + 0xd2, 0x48, 0xd7, 0x00, 0xb9, 0xb1, 0xd2, 0x4c, 0xa3, 0x00, 0xb9, 0xa8, + 0x48, 0xba, 0xd2, 0xc0, 0x54, 0xc0, 0xc3, 0x25, 0xd6, 0x01, 0x5e, 0xd8, + 0x46, 0xd3, 0x79, 0xc0, 0x54, 0xd2, 0x50, 0x5c, 0x52, 0x40, 0x54, 0xe8, + 0x4c, 0x7e, 0xd8, 0xc0, 0x55, 0x3c, 0x48, 0xb4, 0x80, 0x40, 0x55, 0x52, + 0xcc, 0x8b, 0x05, 0x01, 0x30, 0x59, 0x45, 0x74, 0xd9, 0xc0, 0x55, 0x86, + 0x42, 0x00, 0x29, 0x40, 0x55, 0x92, 0x0b, 0xc0, 0x55, 0x9f, 0xd6, 0x31, + 0xae, 0x0f, 0xae, 0xd8, 0x49, 0x07, 0xbb, 0xc0, 0x55, 0xab, 0xd1, 0x54, + 0x42, 0x01, 0x1e, 0x53, 0x00, 0x55, 0xb7, 0xd3, 0x45, 0xd2, 0x01, 0x1e, + 0x4a, 0x00, 0x55, 0xbd, 0xcb, 0x91, 0x0a, 0x01, 0x12, 0xe1, 0xc3, 0x1e, + 0x36, 0x00, 0x03, 0xf9, 0xcb, 0x91, 0x57, 0x0f, 0xb4, 0xd0, 0xca, 0x9a, + 0x90, 0x01, 0x08, 0x49, 0xc7, 0xc5, 0xec, 0x01, 0x08, 0x19, 0xc4, 0x00, + 0xba, 0x00, 0x05, 0x80, 0xc4, 0x00, 0x87, 0x0f, 0xb1, 0xa9, 0xc6, 0x00, + 0x91, 0x0f, 0xa5, 0x58, 0x48, 0x89, 0xf5, 0xc0, 0x55, 0xc3, 0x43, 0x09, + 0x9a, 0x40, 0x55, 0xdc, 0x49, 0xb3, 0x95, 0xc0, 0x56, 0x0c, 0xcb, 0x96, + 0x27, 0x01, 0x35, 0x71, 0x0b, 0x40, 0x56, 0x3e, 0x51, 0x53, 0xfe, 0xc0, + 0x56, 0x50, 0x53, 0x43, 0x4c, 0x40, 0x56, 0x62, 0x03, 0xc0, 0x56, 0x6e, + 0xdb, 0x16, 0xbf, 0x01, 0x1c, 0x11, 0xcb, 0x8f, 0x5d, 0x0f, 0xcb, 0xc0, + 0x46, 0x8d, 0x69, 0xc0, 0x56, 0x7a, 0xce, 0x6c, 0x28, 0x0f, 0xb7, 0x90, + 0xd7, 0x2a, 0xde, 0x01, 0x1c, 0x99, 0xc3, 0x01, 0xfd, 0x0f, 0x9d, 0x78, + 0x0f, 0xc0, 0x56, 0x92, 0xc6, 0x20, 0xab, 0x00, 0x05, 0x40, 0x12, 0xc0, + 0x56, 0x9e, 0xca, 0xa6, 0xa2, 0x0f, 0xc9, 0x21, 0xcc, 0x81, 0x45, 0x0f, + 0xa1, 0x50, 0xdc, 0x12, 0x55, 0x01, 0x3c, 0xd9, 0xc9, 0x9a, 0x28, 0x01, + 0x05, 0x79, 0xc3, 0x1c, 0xd9, 0x0f, 0xa0, 0x4a, 0x00, 0x56, 0xaa, 0x44, + 0x01, 0x4a, 0xc0, 0x56, 0xb0, 0x00, 0xc0, 0x56, 0xbc, 0x4a, 0x01, 0xa9, + 0x40, 0x56, 0xd7, 0x4a, 0x01, 0x68, 0xc0, 0x56, 0xe9, 0x48, 0x00, 0x5f, + 0x40, 0x56, 
0xf5, 0x43, 0x00, 0x5b, 0xc0, 0x57, 0x01, 0xc5, 0xd8, 0xb7, + 0x0f, 0x9b, 0x48, 0x44, 0x00, 0xde, 0xc0, 0x57, 0x0f, 0x00, 0x40, 0x57, + 0x35, 0x43, 0x06, 0x64, 0xc0, 0x57, 0x4d, 0xc5, 0x11, 0x55, 0x0f, 0xa1, + 0xb0, 0x4b, 0x97, 0x24, 0xc0, 0x57, 0x65, 0xc7, 0xb7, 0x72, 0x01, 0x14, + 0x0b, 0x00, 0x57, 0x74, 0x42, 0x05, 0xc0, 0xc0, 0x57, 0x7a, 0xc5, 0xd4, + 0xfc, 0x01, 0x15, 0x71, 0xc6, 0x07, 0xb0, 0x01, 0x11, 0x22, 0x00, 0x57, + 0x89, 0x46, 0x00, 0x8b, 0x40, 0x57, 0x8f, 0xc4, 0xe4, 0x07, 0x0f, 0xa1, + 0x61, 0xc8, 0x02, 0xe7, 0x00, 0x01, 0x20, 0xdd, 0x11, 0xe2, 0x0d, 0xe4, + 0xf9, 0xcb, 0x99, 0x81, 0x0d, 0xe4, 0xf1, 0xd5, 0x33, 0xfb, 0x0d, 0xe4, + 0xe9, 0xd1, 0x4f, 0xcf, 0x0d, 0xe4, 0xe1, 0x46, 0xd2, 0x95, 0xc0, 0x57, + 0x9e, 0x47, 0x02, 0x0e, 0x40, 0x57, 0xba, 0x43, 0x00, 0xa8, 0xc0, 0x58, + 0x57, 0x00, 0x40, 0x58, 0x69, 0xc4, 0x01, 0xe3, 0x01, 0x2c, 0x99, 0xc9, + 0xb4, 0xd0, 0x0f, 0xab, 0xb0, 0x00, 0x40, 0x58, 0x75, 0xc3, 0x3e, 0xe1, + 0x0f, 0xa4, 0x19, 0xc2, 0x0f, 0x7b, 0x0f, 0x9b, 0x08, 0x44, 0x01, 0xd6, + 0xc0, 0x58, 0x81, 0xcd, 0x78, 0x71, 0x0f, 0xa4, 0xf0, 0x42, 0x01, 0x1b, + 0xc0, 0x58, 0x8b, 0xc5, 0xd7, 0x7c, 0x01, 0x08, 0xf8, 0x43, 0x1f, 0x3d, + 0xc0, 0x58, 0x97, 0xcd, 0x5e, 0x85, 0x00, 0x00, 0xf1, 0xd1, 0x51, 0x34, + 0x0f, 0xb4, 0xc9, 0xc4, 0xe2, 0xeb, 0x0f, 0xcf, 0xf0, 0xc6, 0x00, 0x91, + 0x01, 0x1e, 0x71, 0xc4, 0x00, 0x49, 0x01, 0x5c, 0x81, 0xc5, 0x00, 0x2c, + 0x01, 0x5c, 0x88, 0xc5, 0xd7, 0x1d, 0x0f, 0x9a, 0x71, 0xcd, 0x7c, 0xf6, + 0x0f, 0xcf, 0x38, 0x5d, 0x10, 0x69, 0xc0, 0x58, 0xa3, 0xcb, 0x8f, 0x1b, + 0x00, 0x05, 0x70, 0xcc, 0x45, 0x8d, 0x05, 0x4a, 0xf9, 0x18, 0xc0, 0x59, + 0x0b, 0x4f, 0x30, 0x90, 0xc0, 0x59, 0x17, 0x47, 0x02, 0x0e, 0x40, 0x59, + 0x26, 0x00, 0xc0, 0x59, 0x86, 0x46, 0x01, 0x4a, 0xc0, 0x59, 0xd5, 0x02, + 0xc0, 0x5a, 0x1c, 0xd5, 0x33, 0x29, 0x01, 0x51, 0xe8, 0x00, 0xc0, 0x5a, + 0x38, 0xc8, 0xbf, 0xa2, 0x0f, 0xab, 0x69, 0xc9, 0xb0, 0xaa, 0x0f, 0xd4, + 0x80, 0x47, 0x02, 0x5b, 0x40, 0x5a, 0x5c, 0xc4, 0x15, 0x2e, 0x0f, 0x9a, + 0xc9, 0xc7, 0xc1, 0x0e, 0x0f, 0x9a, 0xc0, 0xd0, 0x5f, 0xb2, 0x01, 0x49, + 0x59, 0xd0, 0x3c, 0x90, 0x01, 0x49, 0x80, 0xc2, 0x00, 0x3d, 0x0f, 0xb4, + 0x00, 0xd9, 0x20, 0xda, 0x0f, 0xc9, 0x19, 0x07, 0xc0, 0x5a, 0x74, 0xc9, + 0xad, 0x38, 0x0f, 0xcf, 0xd8, 0x00, 0xc0, 0x5a, 0x80, 0x4e, 0x6e, 0x90, + 0x40, 0x5a, 0x8c, 0xd3, 0x1c, 0xa7, 0x01, 0x3b, 0x39, 0xd8, 0x25, 0x13, + 0x01, 0x3b, 0x29, 0xc9, 0xb1, 0xa6, 0x01, 0x09, 0xd1, 0xdd, 0x11, 0x8b, + 0x01, 0x5e, 0x69, 0xd7, 0x28, 0x71, 0x01, 0x5e, 0x78, 0x48, 0x56, 0x9a, + 0xc0, 0x5a, 0xaa, 0x15, 0xc0, 0x5a, 0xcf, 0xca, 0x9a, 0x06, 0x08, 0x0c, + 0x89, 0x06, 0xc0, 0x5a, 0xd9, 0xce, 0x74, 0x08, 0x08, 0x0c, 0xb9, 0xc7, + 0xc2, 0x3b, 0x08, 0x0c, 0xd1, 0xce, 0x6f, 0x70, 0x08, 0x0c, 0xd8, 0xc3, + 0x02, 0x10, 0x0f, 0x9f, 0xa8, 0x45, 0xdb, 0x3c, 0xc0, 0x5a, 0xeb, 0x44, + 0x0b, 0xe6, 0xc0, 0x5a, 0xf7, 0x90, 0x01, 0x36, 0x32, 0x00, 0x5b, 0x2b, + 0x91, 0x0f, 0xa7, 0xdb, 0x00, 0x5b, 0x31, 0xd1, 0x52, 0x77, 0x01, 0x1d, + 0xb8, 0xc2, 0x00, 0x44, 0x01, 0x11, 0xb0, 0x44, 0x00, 0x74, 0xc0, 0x5b, + 0x3d, 0xc4, 0xe3, 0x7b, 0x0f, 0xcc, 0xe8, 0xc5, 0x11, 0x55, 0x0f, 0xa1, + 0x80, 0x49, 0x53, 0xa9, 0xc0, 0x5b, 0x49, 0x47, 0x34, 0x2f, 0xc0, 0x5b, + 0x55, 0x46, 0x09, 0x97, 0x40, 0x5b, 0x73, 0x43, 0x00, 0xed, 0xc0, 0x5b, + 0x91, 0x10, 0x40, 0x5b, 0xbb, 0xc9, 0xb0, 0xe0, 0x01, 0x5f, 0x99, 0xc6, + 0xbc, 0xf4, 0x01, 0x5f, 0xa1, 0xc8, 0xbd, 0xb2, 0x01, 0x5f, 0xa9, 0xc8, + 0xbc, 0xf2, 0x01, 0x5f, 0xb1, 0xc8, 0xbb, 0xca, 0x01, 0x5f, 0xb9, 0xc9, + 0xb3, 0xcb, 0x01, 0x5f, 0xc0, 0x9e, 0x07, 0xf0, 0x03, 0x00, 0x5b, 0xc7, + 0x9f, 0x07, 
0xf0, 0x0b, 0x00, 0x5c, 0x0d, 0xa6, 0x07, 0xf0, 0x43, 0x00, + 0x5c, 0x47, 0xa5, 0x07, 0xf0, 0x3b, 0x00, 0x5c, 0x6f, 0xa4, 0x07, 0xf0, + 0x33, 0x00, 0x5c, 0x97, 0xa3, 0x07, 0xf0, 0x2b, 0x00, 0x5c, 0xbf, 0xa2, + 0x07, 0xf0, 0x23, 0x00, 0x5c, 0xe7, 0xa1, 0x07, 0xf0, 0x1b, 0x00, 0x5d, + 0x0f, 0xa0, 0x07, 0xf0, 0x12, 0x00, 0x5d, 0x37, 0x42, 0x00, 0x91, 0xc0, + 0x5d, 0x5f, 0xc5, 0x0a, 0x8a, 0x05, 0x30, 0x69, 0xc9, 0x11, 0xf6, 0x05, + 0x30, 0x71, 0xcd, 0x2c, 0xb2, 0x05, 0x30, 0x79, 0x46, 0x09, 0x97, 0x40, + 0x5d, 0x6b, 0x46, 0x05, 0x87, 0xc0, 0x5d, 0x8f, 0x42, 0x00, 0x36, 0xc0, + 0x5d, 0xd2, 0xc5, 0xda, 0xdd, 0x01, 0x09, 0x18, 0x45, 0x00, 0xba, 0xc0, + 0x5d, 0xe4, 0x45, 0x2b, 0x5f, 0x40, 0x5e, 0x22, 0x5f, 0x0c, 0x84, 0xc0, + 0x5e, 0x56, 0xcc, 0x82, 0x7d, 0x01, 0x18, 0xb8, 0xc8, 0xb7, 0x0a, 0x0f, + 0xa7, 0xe1, 0x00, 0x40, 0x5e, 0x62, 0x4f, 0x0b, 0x17, 0xc0, 0x5e, 0x6e, + 0x4d, 0x29, 0xb9, 0x40, 0x5e, 0xee, 0xcc, 0x81, 0xc9, 0x01, 0x11, 0x81, + 0xc7, 0xc2, 0x0a, 0x0f, 0x9e, 0x81, 0xc4, 0xe3, 0x0b, 0x0f, 0x98, 0x58, + 0xcb, 0x96, 0x69, 0x01, 0x0c, 0x49, 0xcd, 0x3f, 0xe2, 0x01, 0x0a, 0xf1, + 0x08, 0xc0, 0x5f, 0x6e, 0x16, 0xc0, 0x5f, 0x7a, 0x44, 0x05, 0x14, 0x40, + 0x5f, 0x86, 0x00, 0xc0, 0x5f, 0xac, 0x46, 0xcc, 0xa1, 0xc0, 0x5f, 0xf6, + 0x45, 0xdd, 0x6c, 0x40, 0x60, 0x02, 0xc4, 0x0d, 0x13, 0x0e, 0x9b, 0xc1, + 0xc3, 0x05, 0x14, 0x0e, 0x9b, 0xb8, 0x09, 0xc0, 0x60, 0x14, 0xca, 0xa4, + 0xb8, 0x0f, 0x9c, 0x58, 0x43, 0x5c, 0x89, 0xc0, 0x60, 0x26, 0xc3, 0x04, + 0x85, 0x0f, 0xd6, 0xa0, 0xc5, 0xc4, 0xa4, 0x01, 0x38, 0x39, 0xc9, 0xb1, + 0xf7, 0x0f, 0xad, 0x68, 0x43, 0x02, 0x31, 0xc0, 0x60, 0x7a, 0xc8, 0xba, + 0xa2, 0x0f, 0xcb, 0x08, 0x45, 0x92, 0x80, 0xc0, 0x60, 0x98, 0x4a, 0xa7, + 0xa6, 0xc0, 0x60, 0xbc, 0x45, 0xd8, 0xb2, 0x40, 0x61, 0x22, 0x0d, 0xc0, + 0x61, 0x40, 0x44, 0x06, 0xb2, 0xc0, 0x61, 0x4c, 0xc3, 0x0f, 0xed, 0x0f, + 0xa1, 0x10, 0x00, 0xc0, 0x61, 0x7a, 0x02, 0x40, 0x61, 0xa4, 0x10, 0xc0, + 0x61, 0xb6, 0xce, 0x72, 0xfe, 0x0f, 0xca, 0x48, 0xcc, 0x84, 0x2d, 0x0f, + 0xa5, 0x69, 0xc9, 0xa8, 0xc1, 0x0f, 0xd3, 0xa0, 0x44, 0x16, 0xcb, 0xc0, + 0x61, 0xc0, 0x44, 0x83, 0x63, 0x40, 0x61, 0xcc, 0x07, 0xc0, 0x61, 0xd8, + 0x42, 0x00, 0xa2, 0x40, 0x61, 0xe2, 0x44, 0x0d, 0xde, 0xc0, 0x61, 0xee, + 0x42, 0x02, 0x32, 0x40, 0x62, 0x12, 0xd8, 0x22, 0xa3, 0x0f, 0xa8, 0xe9, + 0xd6, 0x08, 0x88, 0x01, 0x1f, 0x01, 0xcd, 0x00, 0x32, 0x01, 0x1e, 0xf1, + 0xcb, 0x1a, 0x50, 0x01, 0x1e, 0xe1, 0xce, 0x25, 0xad, 0x01, 0x1d, 0xa1, + 0x42, 0x00, 0xd0, 0xc0, 0x62, 0x1c, 0x46, 0x00, 0x2c, 0xc0, 0x62, 0x26, + 0x45, 0x00, 0x49, 0xc0, 0x62, 0x30, 0x44, 0x13, 0x1d, 0x40, 0x62, 0x3a, + 0x42, 0x01, 0x7c, 0xc0, 0x62, 0x49, 0xc9, 0xb0, 0xce, 0x01, 0x19, 0x80, + 0x56, 0x30, 0x22, 0xc0, 0x62, 0x55, 0xd6, 0x2c, 0x70, 0x0f, 0x89, 0x50, + 0xc2, 0x00, 0x8e, 0x0f, 0xcd, 0xbb, 0x00, 0x62, 0x67, 0xc4, 0x7f, 0x35, + 0x0f, 0xcf, 0x80, 0x8f, 0x0f, 0xb4, 0x53, 0x00, 0x62, 0x6d, 0xc2, 0x00, + 0x74, 0x0f, 0xb4, 0x31, 0xcc, 0x84, 0xd5, 0x01, 0x09, 0x11, 0x05, 0xc0, + 0x62, 0x73, 0x42, 0x05, 0x26, 0x40, 0x62, 0x7f, 0x43, 0x01, 0x95, 0xc0, + 0x62, 0x8b, 0x49, 0x89, 0xf4, 0xc0, 0x62, 0x97, 0x44, 0x0b, 0x26, 0xc0, + 0x62, 0xbf, 0xc5, 0x33, 0x24, 0x01, 0x02, 0xe9, 0xcb, 0x95, 0x1f, 0x0f, + 0xa9, 0x88, 0x87, 0x01, 0x15, 0x43, 0x00, 0x62, 0xf3, 0xc4, 0xe3, 0xd3, + 0x0f, 0x9d, 0xd0, 0x12, 0xc0, 0x62, 0xf9, 0xc2, 0x02, 0xa7, 0x0f, 0xce, + 0x62, 0x00, 0x63, 0x05, 0x08, 0xc0, 0x63, 0x0b, 0x0e, 0xc0, 0x63, 0x21, + 0x06, 0xc0, 0x63, 0x2b, 0x11, 0xc0, 0x63, 0x45, 0x05, 0xc0, 0x63, 0x51, + 0x03, 0xc0, 0x63, 0x67, 0x0a, 0xc0, 0x63, 0x7f, 0x15, 0xc0, 0x63, 0x8b, + 0x07, 0xc0, 
0x63, 0x9b, 0x42, 0x00, 0x74, 0xc0, 0x63, 0xb7, 0x42, 0x01, + 0x4a, 0xc0, 0x63, 0xc3, 0x0f, 0xc0, 0x63, 0xcf, 0x09, 0xc0, 0x63, 0xe1, + 0xc5, 0xdb, 0xb9, 0x0e, 0x99, 0xd9, 0xd3, 0x40, 0x2e, 0x0e, 0x99, 0xb9, + 0x14, 0xc0, 0x63, 0xfc, 0x12, 0xc0, 0x64, 0x06, 0x0d, 0xc0, 0x64, 0x16, + 0x04, 0xc0, 0x64, 0x22, 0xc3, 0x85, 0x26, 0x0e, 0x98, 0xe9, 0xcc, 0x8a, + 0xb1, 0x0e, 0x98, 0x88, 0x14, 0xc0, 0x64, 0x34, 0xd2, 0x4b, 0x17, 0x0f, + 0x9b, 0xa9, 0xc3, 0x3a, 0x48, 0x0f, 0xd6, 0xb0, 0x07, 0xc0, 0x64, 0x40, + 0x44, 0xcd, 0xca, 0x40, 0x64, 0x52, 0x96, 0x01, 0x37, 0xd1, 0xc7, 0x80, + 0xa2, 0x01, 0x05, 0xc1, 0xd4, 0x3b, 0x60, 0x0f, 0x9d, 0xf0, 0xd7, 0x2a, + 0x82, 0x01, 0x3a, 0x29, 0xc2, 0x00, 0x29, 0x0f, 0xa0, 0x2a, 0x00, 0x64, + 0x76, 0xc7, 0x17, 0x6b, 0x01, 0x1f, 0x91, 0x47, 0x50, 0x5d, 0x40, 0x64, + 0x7c, 0x00, 0x40, 0x64, 0x88, 0x45, 0xd8, 0x17, 0xc0, 0x64, 0x97, 0x4b, + 0x96, 0x8a, 0xc0, 0x64, 0xbf, 0xc7, 0x11, 0x53, 0x0f, 0xb1, 0x58, 0x42, + 0x00, 0x6f, 0x40, 0x64, 0xcb, 0x15, 0xc0, 0x64, 0xd1, 0x45, 0x01, 0xc3, + 0xc0, 0x64, 0xe1, 0x0e, 0xc0, 0x65, 0x2d, 0x52, 0x47, 0xb7, 0xc0, 0x65, + 0x39, 0x46, 0x09, 0x97, 0xc0, 0x65, 0x43, 0x4b, 0x6f, 0xc7, 0xc0, 0x65, + 0x6d, 0xc9, 0xac, 0x96, 0x00, 0x7d, 0xf3, 0x00, 0x65, 0x9e, 0x52, 0x4c, + 0x13, 0x40, 0x65, 0xa4, 0x47, 0x02, 0x0e, 0xc0, 0x65, 0xbc, 0x42, 0x00, + 0xa2, 0xc0, 0x65, 0xce, 0xce, 0x6c, 0x6e, 0x01, 0x6b, 0x81, 0xd0, 0x57, + 0xe2, 0x01, 0x6b, 0xf8, 0x00, 0xc0, 0x65, 0xd4, 0xc8, 0xbc, 0x32, 0x01, + 0x71, 0xd0, 0xd3, 0x46, 0x31, 0x0f, 0xdd, 0x81, 0x4a, 0x03, 0x3d, 0x40, + 0x66, 0x16, 0x00, 0xc0, 0x66, 0x28, 0x47, 0x09, 0x90, 0x40, 0x66, 0x8f, + 0x47, 0x0a, 0xda, 0xc0, 0x66, 0xa7, 0xc9, 0xb4, 0xbe, 0x00, 0x2c, 0x79, + 0xc6, 0x59, 0x92, 0x00, 0x2c, 0x51, 0xc9, 0x11, 0xf6, 0x00, 0x2c, 0x49, + 0x03, 0xc0, 0x66, 0xb3, 0xcd, 0x2c, 0xb2, 0x00, 0x2a, 0xf1, 0x05, 0xc0, + 0x66, 0xbf, 0x07, 0xc0, 0x66, 0xcb, 0xde, 0x0f, 0x5e, 0x00, 0x2a, 0xc8, + 0xca, 0xa6, 0x84, 0x0f, 0x9d, 0x41, 0xcd, 0x75, 0xc0, 0x0f, 0xb4, 0xd8, + 0xce, 0x72, 0x9c, 0x0f, 0x9c, 0xf9, 0xc4, 0x7a, 0xfe, 0x01, 0x5f, 0x28, + 0x05, 0xc0, 0x66, 0xd7, 0x4d, 0x29, 0xb9, 0xc0, 0x66, 0xe3, 0xcf, 0x6b, + 0x52, 0x0f, 0x4a, 0x21, 0xd0, 0x58, 0x92, 0x0f, 0x4a, 0x29, 0x47, 0x63, + 0xff, 0xc0, 0x67, 0x63, 0xc5, 0x08, 0x09, 0x0f, 0x4a, 0x39, 0x10, 0xc0, + 0x67, 0x6f, 0x46, 0x09, 0x97, 0xc0, 0x67, 0x7b, 0x48, 0x10, 0xb4, 0x40, + 0x67, 0x9f, 0x04, 0xc0, 0x67, 0xab, 0x05, 0xc0, 0x67, 0xcc, 0x06, 0xc0, + 0x67, 0xe0, 0x12, 0xc0, 0x67, 0xec, 0x16, 0xc0, 0x68, 0x00, 0x14, 0xc0, + 0x68, 0x1b, 0x18, 0xc0, 0x68, 0x28, 0x15, 0xc0, 0x68, 0x32, 0x03, 0xc0, + 0x68, 0x58, 0x0e, 0xc0, 0x68, 0x86, 0x42, 0x00, 0xec, 0xc0, 0x68, 0x92, + 0x0f, 0xc0, 0x68, 0x9e, 0x42, 0x01, 0x4a, 0xc0, 0x68, 0xb3, 0xc5, 0x61, + 0xc0, 0x0f, 0xb8, 0x19, 0x43, 0x03, 0xd3, 0xc0, 0x68, 0xbd, 0xc4, 0x83, + 0x39, 0x0f, 0xb8, 0x11, 0x09, 0xc0, 0x68, 0xc9, 0x44, 0x1a, 0x05, 0xc0, + 0x68, 0xd5, 0xc3, 0xdd, 0x05, 0x0f, 0xba, 0x31, 0xc5, 0xdd, 0xe4, 0x0f, + 0xba, 0xa9, 0x0a, 0x40, 0x68, 0xe4, 0xda, 0x1a, 0xcc, 0x01, 0x36, 0xa9, + 0xce, 0x72, 0x72, 0x01, 0x1c, 0x38, 0xc4, 0xd9, 0x17, 0x01, 0x34, 0xb9, + 0xc8, 0x8d, 0x71, 0x01, 0x09, 0xa9, 0xc2, 0x00, 0x61, 0x00, 0x00, 0x38, + 0xce, 0x73, 0xde, 0x01, 0x19, 0x71, 0xc8, 0x07, 0x5f, 0x01, 0x12, 0x60, + 0xcb, 0x23, 0xa0, 0x01, 0x12, 0x51, 0xc2, 0x00, 0xf1, 0x01, 0x12, 0x42, + 0x00, 0x68, 0xee, 0xc9, 0xae, 0x07, 0x0f, 0xb7, 0xd1, 0x0f, 0x40, 0x68, + 0xf4, 0xc8, 0xbf, 0xca, 0x0f, 0xb7, 0x61, 0xc9, 0xb1, 0x1f, 0x0f, 0xb7, + 0x58, 0x51, 0x52, 0x22, 0xc0, 0x69, 0x00, 0xcb, 0x99, 0xa2, 0x0f, 0xd6, + 0x00, 0x4b, 
0x05, 0xf7, 0xc0, 0x69, 0x18, 0xce, 0x6f, 0x54, 0x0f, 0xa7, + 0xb0, 0xc2, 0x00, 0x49, 0x01, 0x11, 0x03, 0x00, 0x69, 0x38, 0xca, 0x9d, + 0x24, 0x01, 0x09, 0x59, 0xc9, 0x25, 0xca, 0x0f, 0xa5, 0x11, 0xc7, 0xca, + 0x84, 0x0f, 0xb1, 0x01, 0xcb, 0x90, 0x7b, 0x0f, 0xb1, 0x38, 0x14, 0xc0, + 0x69, 0x3e, 0x44, 0x0b, 0x02, 0xc0, 0x69, 0x4a, 0xcc, 0x8c, 0x01, 0x0f, + 0xb1, 0x90, 0xcb, 0x8b, 0x06, 0x01, 0x30, 0x51, 0xc9, 0xa8, 0x43, 0x08, + 0x0c, 0xe0, 0x0e, 0xc0, 0x69, 0x55, 0x10, 0xc0, 0x69, 0x5f, 0x06, 0xc0, + 0x69, 0x75, 0x16, 0xc0, 0x69, 0x83, 0x05, 0xc0, 0x69, 0x91, 0x83, 0x08, + 0xb8, 0x93, 0x00, 0x69, 0x9b, 0x0c, 0xc0, 0x69, 0xa1, 0x04, 0xc0, 0x69, + 0xab, 0x09, 0xc0, 0x69, 0xb5, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0x89, 0xc2, + 0x0d, 0xf6, 0x08, 0xb8, 0x79, 0xc2, 0x00, 0x39, 0x08, 0xb8, 0x69, 0xc2, + 0x01, 0xc3, 0x08, 0xb8, 0x49, 0x12, 0xc0, 0x69, 0xbf, 0x0d, 0x40, 0x69, + 0xc9, 0xc8, 0x91, 0x9a, 0x08, 0xb9, 0xf9, 0x44, 0x00, 0xbb, 0x40, 0x69, + 0xd3, 0xc5, 0x28, 0xee, 0x08, 0xb9, 0xd9, 0xc2, 0x00, 0xc4, 0x08, 0xb9, + 0xd0, 0xc4, 0x26, 0x78, 0x08, 0xb9, 0xc9, 0xc5, 0x06, 0xdb, 0x08, 0xb9, + 0xc1, 0x15, 0xc0, 0x69, 0xe3, 0x08, 0xc0, 0x69, 0xef, 0x16, 0xc0, 0x69, + 0xfb, 0xc3, 0x05, 0x14, 0x08, 0xb9, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0xb9, + 0x80, 0x83, 0x08, 0xb9, 0x03, 0x00, 0x6a, 0x07, 0x91, 0x08, 0xb9, 0x41, + 0x87, 0x08, 0xb9, 0x31, 0x97, 0x08, 0xb9, 0x23, 0x00, 0x6a, 0x17, 0x8b, + 0x08, 0xb9, 0x12, 0x00, 0x6a, 0x1b, 0x0e, 0xc0, 0x6a, 0x1f, 0xc2, 0x00, + 0x39, 0x08, 0xb8, 0xf0, 0xc6, 0x6a, 0xfb, 0x01, 0x08, 0x01, 0xc5, 0xd6, + 0xdc, 0x0f, 0xd4, 0xb8, 0xd3, 0x46, 0x0b, 0x01, 0x03, 0x69, 0xd2, 0x4d, + 0x69, 0x01, 0x03, 0x58, 0xc4, 0x01, 0x96, 0x01, 0x4c, 0xf9, 0xc5, 0x09, + 0x02, 0x00, 0x05, 0xa0, 0x42, 0x00, 0xe3, 0xc0, 0x6a, 0x29, 0xc5, 0xde, + 0x3e, 0x01, 0x1b, 0xd3, 0x00, 0x6a, 0x38, 0xc5, 0x9b, 0xd5, 0x01, 0x1b, + 0xab, 0x00, 0x6a, 0x3e, 0x0b, 0xc0, 0x6a, 0x44, 0xd0, 0x5c, 0xa2, 0x01, + 0x1b, 0xb9, 0x14, 0xc0, 0x6a, 0x53, 0x42, 0x02, 0xae, 0xc0, 0x6a, 0x5f, + 0x06, 0xc0, 0x6a, 0x69, 0x15, 0xc0, 0x6a, 0x7b, 0xc5, 0xd7, 0x8b, 0x01, + 0x1b, 0x61, 0x05, 0xc0, 0x6a, 0x91, 0xd6, 0x31, 0x14, 0x01, 0x1b, 0x49, + 0xcf, 0x64, 0x86, 0x01, 0x1b, 0x41, 0x44, 0x00, 0x49, 0xc0, 0x6a, 0x9d, + 0x44, 0xe1, 0x43, 0xc0, 0x6a, 0xa9, 0xcd, 0x7d, 0xed, 0x01, 0x1a, 0x00, + 0x42, 0x00, 0x79, 0xc0, 0x6a, 0xb5, 0xd8, 0x23, 0x63, 0x00, 0x04, 0xf8, + 0xc7, 0x2d, 0x87, 0x00, 0x01, 0x39, 0xc4, 0x66, 0x29, 0x01, 0x5f, 0x20, + 0xd1, 0x48, 0x11, 0x08, 0x59, 0xc9, 0x47, 0x02, 0x0e, 0x40, 0x6a, 0xc1, + 0xc4, 0x3d, 0xd8, 0x0f, 0x9f, 0xd1, 0xc6, 0x36, 0x23, 0x00, 0x01, 0x30, + 0xca, 0xa7, 0xc4, 0x08, 0x08, 0x11, 0x47, 0x34, 0x2f, 0xc0, 0x6b, 0x42, + 0x19, 0xc0, 0x6b, 0x69, 0xd9, 0x20, 0xc1, 0x08, 0x09, 0xe1, 0xdc, 0x14, + 0xbd, 0x08, 0x09, 0xe9, 0x48, 0x14, 0xc4, 0x40, 0x6b, 0x75, 0x4a, 0x9f, + 0x0e, 0xc0, 0x6b, 0x81, 0xc9, 0xb0, 0x23, 0x0f, 0xca, 0x50, 0xd4, 0x3c, + 0xb4, 0x0f, 0xbd, 0x89, 0xcb, 0x58, 0xc7, 0x0f, 0xbd, 0x21, 0x46, 0x01, + 0xfc, 0xc0, 0x6b, 0xa3, 0x15, 0xc0, 0x6b, 0xaf, 0xd5, 0x34, 0x8e, 0x0f, + 0xbd, 0xe8, 0x43, 0x00, 0x7a, 0xc0, 0x6b, 0xbb, 0xd4, 0x3e, 0x30, 0x0f, + 0x9b, 0xf0, 0xc3, 0x1e, 0x19, 0x01, 0x16, 0x43, 0x00, 0x6b, 0xee, 0x0e, + 0xc0, 0x6b, 0xf4, 0xca, 0x9b, 0xc6, 0x0f, 0x9f, 0xc8, 0xc8, 0x2f, 0x03, + 0x0f, 0xb6, 0x48, 0x8d, 0x0f, 0xab, 0x73, 0x00, 0x6b, 0xfe, 0xc6, 0xc9, + 0xcf, 0x0f, 0xd4, 0x18, 0xcb, 0x95, 0xfb, 0x0f, 0x9c, 0xa8, 0x47, 0x02, + 0x0e, 0xc0, 0x6c, 0x0b, 0x4d, 0x7f, 0x25, 0x40, 0x6c, 0x95, 0x4b, 0x96, + 0x48, 0xc0, 0x6c, 0xa9, 0xc4, 0xae, 0x42, 0x0f, 0x99, 0xe1, 0xc5, 0xd9, + 0x98, 0x0f, 
0xa1, 0x08, 0x42, 0x00, 0x3b, 0xc0, 0x6c, 0xd0, 0xc9, 0x95, + 0x84, 0x01, 0x21, 0x10, 0x00, 0xc0, 0x6c, 0xd8, 0xc7, 0xc6, 0xa2, 0x0f, + 0xd6, 0x80, 0xc2, 0x00, 0x81, 0x0f, 0xd4, 0xa9, 0x8d, 0x0f, 0x9f, 0x33, + 0x00, 0x6c, 0xe4, 0xc3, 0x09, 0xe5, 0x0f, 0x9a, 0x60, 0x0e, 0xc0, 0x6c, + 0xea, 0x46, 0x77, 0x20, 0x40, 0x6c, 0xfa, 0xc3, 0x00, 0x3c, 0x0f, 0xcf, + 0xd3, 0x00, 0x6d, 0x30, 0xc5, 0xdb, 0x46, 0x01, 0x35, 0xf1, 0x47, 0xc1, + 0x9a, 0x40, 0x6d, 0x36, 0xc3, 0x09, 0x3b, 0x0f, 0xcd, 0x09, 0xde, 0x0f, + 0xd6, 0x0f, 0x9f, 0xc0, 0x00, 0x40, 0x6d, 0x48, 0x47, 0x02, 0x0e, 0xc0, + 0x6d, 0x60, 0x42, 0x00, 0x99, 0xc0, 0x6d, 0xa5, 0xc7, 0xc0, 0x3c, 0x05, + 0x37, 0x91, 0xc9, 0x11, 0xf6, 0x05, 0x37, 0x99, 0xc9, 0xa8, 0x55, 0x05, + 0x37, 0xb1, 0xcd, 0x2c, 0xb2, 0x05, 0x37, 0xb8, 0x0d, 0xc0, 0x6d, 0xaf, + 0xcb, 0x93, 0x25, 0x0f, 0xa1, 0x59, 0xc2, 0x00, 0x45, 0x0f, 0xca, 0x98, + 0x43, 0x40, 0x85, 0xc0, 0x6d, 0xbd, 0xc4, 0xcd, 0x51, 0x0f, 0xa8, 0x59, + 0x8a, 0x0f, 0xb6, 0x02, 0x00, 0x6d, 0xd9, 0x00, 0xc0, 0x6d, 0xdf, 0xc8, + 0xbd, 0xc2, 0x0f, 0xa4, 0x40, 0xca, 0x9e, 0xc8, 0x0f, 0xb6, 0x21, 0xcb, + 0x90, 0xc8, 0x0f, 0xca, 0xb1, 0xc2, 0x05, 0x03, 0x0f, 0xcb, 0x78, 0xc9, + 0xb3, 0x0e, 0x01, 0x05, 0xf9, 0xc7, 0x82, 0x99, 0x0f, 0xd7, 0x30, 0xc5, + 0xd8, 0xc6, 0x0f, 0x9d, 0x89, 0xc6, 0xd3, 0x97, 0x0f, 0xcf, 0x10, 0xca, + 0xa0, 0x94, 0x0f, 0x9c, 0x11, 0x86, 0x0f, 0xa1, 0x30, 0xcf, 0x61, 0xd4, + 0x01, 0x4f, 0xc9, 0xc7, 0x27, 0x5d, 0x01, 0x4f, 0xc0, 0x87, 0x0f, 0xb5, + 0x91, 0xc3, 0x1d, 0xb1, 0x0f, 0xb5, 0xa0, 0xc3, 0x00, 0x5f, 0x0f, 0xcd, + 0x59, 0x44, 0x7c, 0x59, 0xc0, 0x6d, 0xeb, 0xca, 0x9d, 0xba, 0x0f, 0xa4, + 0x99, 0xd0, 0x57, 0x82, 0x0f, 0x9e, 0xb1, 0x14, 0xc0, 0x6e, 0x03, 0xc2, + 0x05, 0x26, 0x0f, 0xd6, 0xc0, 0xc9, 0xac, 0x45, 0x01, 0x19, 0x63, 0x00, + 0x6e, 0x0f, 0x45, 0xb1, 0x74, 0xc0, 0x6e, 0x15, 0x16, 0x40, 0x6e, 0x47, + 0x00, 0xc0, 0x6e, 0x53, 0xc8, 0xbd, 0xaa, 0x0f, 0xb6, 0x70, 0xc4, 0x0b, + 0xcb, 0x01, 0x13, 0x61, 0xc7, 0x00, 0x90, 0x01, 0x09, 0xb0, 0xc5, 0xb2, + 0x39, 0x0f, 0x9b, 0xd1, 0xc3, 0x0f, 0xed, 0x0f, 0xd5, 0x90, 0xc3, 0xe6, + 0x11, 0x0f, 0xcc, 0x58, 0xc5, 0x00, 0xef, 0x0f, 0xb4, 0x79, 0x16, 0x40, + 0x6e, 0x65, 0xc4, 0xdf, 0x87, 0x01, 0x2e, 0x71, 0xc2, 0x00, 0x3d, 0x01, + 0x01, 0x13, 0x00, 0x6e, 0x71, 0xc4, 0x2a, 0xcc, 0x0f, 0xab, 0x5a, 0x00, + 0x6e, 0x77, 0x46, 0x77, 0x20, 0x40, 0x6e, 0x7d, 0x4b, 0x6f, 0xc7, 0xc0, + 0x6e, 0x95, 0x47, 0x02, 0x0e, 0x40, 0x6e, 0x9d, 0xc4, 0x4c, 0x31, 0x0f, + 0xce, 0x59, 0x95, 0x0f, 0xd7, 0x38, 0x06, 0xc0, 0x6e, 0xfb, 0x42, 0x00, + 0x07, 0xc0, 0x6f, 0x07, 0xc2, 0x00, 0x3b, 0x0f, 0xcf, 0x88, 0x0b, 0xc0, + 0x6f, 0x11, 0x44, 0xdf, 0xf3, 0x40, 0x6f, 0x1b, 0x44, 0x9b, 0x5b, 0xc0, + 0x6f, 0x3b, 0xc8, 0xbf, 0x92, 0x0f, 0xc8, 0x71, 0xc5, 0xdd, 0x3f, 0x0f, + 0xcb, 0x31, 0xc2, 0x00, 0x7a, 0x0f, 0xcf, 0xc8, 0x03, 0xc0, 0x6f, 0x4d, + 0xc2, 0x00, 0x5f, 0x00, 0x16, 0xc0, 0x09, 0xc0, 0x6f, 0x5d, 0x0d, 0xc0, + 0x6f, 0x6f, 0x03, 0xc0, 0x6f, 0x92, 0x15, 0xc0, 0x6f, 0xa4, 0x06, 0xc0, + 0x6f, 0xc1, 0x1b, 0xc0, 0x6f, 0xd1, 0x08, 0xc0, 0x6f, 0xdb, 0x42, 0x11, + 0xee, 0xc0, 0x6f, 0xed, 0x0b, 0xc0, 0x6f, 0xff, 0x07, 0xc0, 0x70, 0x0f, + 0x0f, 0xc0, 0x70, 0x31, 0x16, 0xc0, 0x70, 0x3d, 0x0e, 0xc0, 0x70, 0x4f, + 0x11, 0xc0, 0x70, 0x59, 0x12, 0xc0, 0x70, 0x71, 0xcc, 0x87, 0x5d, 0x0e, + 0x83, 0x51, 0x42, 0x02, 0x41, 0xc0, 0x70, 0x87, 0xc4, 0xc6, 0xc9, 0x0e, + 0x82, 0x01, 0x14, 0x40, 0x70, 0x93, 0xc4, 0x26, 0x78, 0x08, 0xe3, 0x13, + 0x00, 0x70, 0x9f, 0xc5, 0x06, 0xdb, 0x08, 0xe3, 0x0b, 0x00, 0x70, 0xa5, + 0x15, 0xc0, 0x70, 0xa9, 0x08, 0xc0, 0x70, 0xbb, 0x16, 0xc0, 0x70, 0xc3, + 0xc3, 0x05, 
0x14, 0x08, 0xe2, 0xd0, 0x45, 0x09, 0x98, 0xc0, 0x70, 0xd1, + 0xcb, 0x97, 0xf5, 0x08, 0xe2, 0x11, 0xc4, 0x19, 0x53, 0x08, 0xe2, 0x08, + 0x9f, 0x08, 0xe2, 0x29, 0x9e, 0x08, 0xe2, 0x20, 0x03, 0xc0, 0x70, 0xf5, + 0x42, 0x07, 0xb2, 0xc0, 0x71, 0x01, 0xcb, 0x1e, 0x89, 0x08, 0xe1, 0xe0, + 0x03, 0xc0, 0x71, 0x0d, 0x91, 0x08, 0xe1, 0xd1, 0x87, 0x08, 0xe1, 0xc1, + 0x48, 0xb2, 0x2d, 0xc0, 0x71, 0x19, 0x97, 0x08, 0xe1, 0x93, 0x00, 0x71, + 0x24, 0x8b, 0x08, 0xe1, 0x82, 0x00, 0x71, 0x28, 0xc2, 0x00, 0xd0, 0x08, + 0xe1, 0x71, 0x15, 0xc0, 0x71, 0x2c, 0x18, 0xc0, 0x71, 0x3c, 0xc2, 0x00, + 0xdb, 0x08, 0xe1, 0x49, 0xc2, 0x00, 0x39, 0x08, 0xe1, 0x41, 0xc2, 0x19, + 0x2c, 0x08, 0xe1, 0x39, 0xc2, 0x01, 0xc3, 0x08, 0xe1, 0x31, 0x04, 0xc0, + 0x71, 0x46, 0x12, 0xc0, 0x71, 0x50, 0x10, 0xc0, 0x71, 0x5a, 0x06, 0xc0, + 0x71, 0x70, 0x16, 0xc0, 0x71, 0x7e, 0x0c, 0xc0, 0x71, 0x8c, 0x05, 0xc0, + 0x71, 0x96, 0x09, 0xc0, 0x71, 0xa0, 0x0d, 0xc0, 0x71, 0xaa, 0x83, 0x08, + 0xe0, 0x03, 0x00, 0x71, 0xb4, 0x91, 0x08, 0xe0, 0x61, 0x87, 0x08, 0xe0, + 0x51, 0x97, 0x08, 0xe0, 0x23, 0x00, 0x71, 0xc0, 0x8b, 0x08, 0xe0, 0x12, + 0x00, 0x71, 0xc4, 0x43, 0x00, 0x29, 0xc0, 0x71, 0xc8, 0x00, 0x40, 0x71, + 0xf6, 0x45, 0x00, 0x2c, 0xc0, 0x72, 0x15, 0x44, 0x00, 0x49, 0xc0, 0x72, + 0x21, 0x06, 0x40, 0x72, 0x2b, 0xdb, 0x18, 0x6f, 0x01, 0x3f, 0x00, 0xc2, + 0x00, 0xbf, 0x01, 0x11, 0x43, 0x00, 0x72, 0x3d, 0xc3, 0x02, 0x9b, 0x01, + 0x11, 0x3a, 0x00, 0x72, 0x41, 0xcd, 0x7e, 0xa3, 0x0f, 0xa8, 0x79, 0x4a, + 0xa0, 0x1c, 0x40, 0x72, 0x47, 0xc6, 0x02, 0x0e, 0x0f, 0xa4, 0x61, 0xc5, + 0xd6, 0x05, 0x0f, 0x9f, 0x48, 0xca, 0x9b, 0x44, 0x0f, 0xcf, 0xa1, 0xc2, + 0x11, 0xa5, 0x0f, 0xd5, 0xb8, 0x00, 0xc0, 0x72, 0x53, 0x46, 0x01, 0x4a, + 0xc0, 0x72, 0xa2, 0x02, 0x40, 0x72, 0xe9, 0xc7, 0xc8, 0x3f, 0x0f, 0xcb, + 0x61, 0xd3, 0x45, 0x01, 0x0f, 0x9a, 0x18, 0xc4, 0x0b, 0x66, 0x0f, 0xa0, + 0x30, 0x4b, 0x37, 0x43, 0xc0, 0x73, 0x05, 0xd8, 0x24, 0xe3, 0x01, 0x16, + 0xd1, 0x45, 0x00, 0x8c, 0xc0, 0x73, 0x11, 0x11, 0xc0, 0x73, 0x23, 0x03, + 0xc0, 0x73, 0x2f, 0xc4, 0x00, 0xba, 0x00, 0x01, 0xe1, 0xcf, 0x69, 0x18, + 0x01, 0x55, 0x32, 0x00, 0x73, 0x3b, 0x47, 0x02, 0x0e, 0xc0, 0x73, 0x41, + 0x46, 0x09, 0x97, 0xc0, 0x73, 0x99, 0x4c, 0x11, 0xe2, 0xc0, 0x73, 0xbd, + 0x15, 0xc0, 0x73, 0xcd, 0x4f, 0x30, 0x90, 0xc0, 0x73, 0xd9, 0x4b, 0x6f, + 0xc7, 0x40, 0x73, 0xfb, 0x42, 0x00, 0x2f, 0xc0, 0x74, 0x17, 0xd6, 0x21, + 0x9d, 0x0f, 0xb3, 0x90, 0x47, 0x02, 0x0e, 0xc0, 0x74, 0x24, 0x4c, 0x11, + 0xe2, 0x40, 0x74, 0x9a, 0x07, 0xc0, 0x74, 0xa6, 0x0d, 0x40, 0x74, 0xb0, + 0x43, 0xb6, 0x2f, 0xc0, 0x74, 0xbc, 0xd3, 0x44, 0x1d, 0x01, 0x96, 0x78, + 0xc4, 0x1e, 0xf2, 0x0f, 0xa4, 0x20, 0xcf, 0x63, 0xe1, 0x08, 0x49, 0xf9, + 0x47, 0x02, 0x0e, 0x40, 0x74, 0xde, 0x83, 0x08, 0x14, 0x03, 0x00, 0x75, + 0x40, 0x87, 0x08, 0x14, 0x0b, 0x00, 0x75, 0x44, 0x84, 0x08, 0x14, 0x13, + 0x00, 0x75, 0x48, 0x89, 0x08, 0x14, 0x21, 0x86, 0x08, 0x14, 0x29, 0x8b, + 0x08, 0x14, 0x31, 0x99, 0x08, 0x14, 0x39, 0x9c, 0x08, 0x14, 0x41, 0x96, + 0x08, 0x14, 0xbb, 0x00, 0x75, 0x4c, 0x8c, 0x08, 0x14, 0x51, 0x8d, 0x08, + 0x14, 0x5b, 0x00, 0x75, 0x54, 0x93, 0x08, 0x14, 0x61, 0x8e, 0x08, 0x14, + 0x69, 0x8f, 0x08, 0x14, 0x73, 0x00, 0x75, 0x58, 0x90, 0x08, 0x14, 0x7b, + 0x00, 0x75, 0x5c, 0x97, 0x08, 0x14, 0x91, 0x92, 0x08, 0x14, 0x99, 0x94, + 0x08, 0x14, 0xa9, 0x95, 0x08, 0x14, 0xb1, 0x8a, 0x08, 0x14, 0xd9, 0x9a, + 0x08, 0x14, 0xe0, 0x42, 0x09, 0x3b, 0xc0, 0x75, 0x60, 0xc6, 0x8f, 0xfc, + 0x01, 0x05, 0xf0, 0x15, 0xc0, 0x75, 0x6d, 0x47, 0x02, 0x0e, 0xc0, 0x75, + 0x79, 0x05, 0xc0, 0x75, 0xc9, 0x52, 0x48, 0xc5, 0x40, 0x75, 0xd5, 0x00, + 0x40, 0x75, 
0xeb, 0xc2, 0x05, 0x03, 0x0f, 0x9f, 0xb9, 0xc5, 0xd8, 0x71, + 0x0f, 0xcb, 0xe0, 0xc8, 0xbc, 0x7a, 0x0f, 0xa0, 0xf1, 0xc3, 0x01, 0xe5, + 0x0f, 0xd4, 0xe0, 0x47, 0x02, 0x0e, 0xc0, 0x75, 0xf7, 0xc8, 0x22, 0x83, + 0x00, 0x75, 0x79, 0x4b, 0x6f, 0xc7, 0xc0, 0x76, 0x4e, 0x15, 0xc0, 0x76, + 0x7b, 0xc5, 0xdc, 0x54, 0x00, 0x76, 0x31, 0x49, 0xb2, 0x63, 0xc0, 0x76, + 0x87, 0xd1, 0x52, 0xaa, 0x00, 0x76, 0x61, 0xc9, 0xae, 0x97, 0x00, 0x76, + 0x69, 0x46, 0x09, 0x97, 0xc0, 0x76, 0x97, 0x43, 0x60, 0xe8, 0x40, 0x76, + 0xbb, 0x46, 0x00, 0x2c, 0xc0, 0x76, 0xc7, 0x45, 0x00, 0x49, 0xc0, 0x76, + 0xef, 0x44, 0x02, 0x9b, 0xc0, 0x77, 0x0b, 0x45, 0x01, 0xce, 0xc0, 0x77, + 0x15, 0xce, 0x6b, 0x9c, 0x01, 0x38, 0x09, 0x44, 0x05, 0x14, 0xc0, 0x77, + 0x30, 0x16, 0xc0, 0x77, 0x3c, 0xd2, 0x4a, 0x75, 0x0f, 0xdc, 0x21, 0xd3, + 0x3f, 0xe2, 0x0f, 0xdc, 0x30, 0x46, 0x01, 0xfc, 0xc0, 0x77, 0x48, 0x16, + 0xc0, 0x77, 0x5a, 0x15, 0xc0, 0x77, 0x66, 0xd0, 0x58, 0x62, 0x0f, 0xc1, + 0xe9, 0xd1, 0x56, 0xd9, 0x0f, 0xc1, 0xa9, 0x03, 0xc0, 0x77, 0x72, 0xcf, + 0x61, 0x4d, 0x01, 0x3f, 0x81, 0x06, 0xc0, 0x77, 0x81, 0xcd, 0x7c, 0xa8, + 0x01, 0x0e, 0x41, 0x0a, 0xc0, 0x77, 0x8d, 0xc6, 0xca, 0xa3, 0x0f, 0xb3, + 0x69, 0x46, 0x04, 0x8f, 0x40, 0x77, 0x99, 0x46, 0x03, 0x13, 0xc0, 0x77, + 0xa5, 0x4e, 0x6c, 0xfa, 0xc0, 0x77, 0xb1, 0xcc, 0x4e, 0x35, 0x0f, 0xa9, + 0xd1, 0xd1, 0x56, 0x2f, 0x0f, 0xb7, 0x31, 0xc8, 0x2e, 0x20, 0x0f, 0xb7, + 0x38, 0xc4, 0x32, 0xbc, 0x01, 0x15, 0x2b, 0x00, 0x77, 0xbd, 0x45, 0x01, + 0xa2, 0xc0, 0x77, 0xc3, 0xd7, 0x27, 0xfe, 0x01, 0x17, 0x81, 0x45, 0x11, + 0x17, 0xc0, 0x77, 0xd2, 0xc9, 0xb2, 0xea, 0x01, 0x4b, 0xf1, 0x45, 0x01, + 0x5d, 0x40, 0x77, 0xf9, 0xc9, 0xb0, 0xd7, 0x0f, 0xcc, 0x21, 0xd7, 0x1f, + 0x33, 0x01, 0x33, 0x91, 0xc2, 0x00, 0x45, 0x01, 0x11, 0x53, 0x00, 0x78, + 0x05, 0x16, 0x40, 0x78, 0x09, 0xc8, 0x9c, 0xae, 0x01, 0x1c, 0x61, 0xc5, + 0xb9, 0x85, 0x01, 0x01, 0xf8, 0xc9, 0xac, 0x4e, 0x01, 0x37, 0x89, 0xcf, + 0x6a, 0x62, 0x01, 0x30, 0xa0, 0x03, 0xc0, 0x78, 0x15, 0xc4, 0x93, 0xa9, + 0x08, 0x1c, 0x09, 0x09, 0xc0, 0x78, 0x21, 0x0d, 0xc0, 0x78, 0x2d, 0x06, + 0xc0, 0x78, 0x39, 0xc2, 0x01, 0x23, 0x08, 0x1c, 0x2b, 0x00, 0x78, 0x45, + 0xc2, 0x02, 0xa0, 0x08, 0x1c, 0x31, 0x1c, 0xc0, 0x78, 0x4b, 0x16, 0xc0, + 0x78, 0x55, 0xc3, 0x4a, 0xb9, 0x08, 0x1c, 0x51, 0x15, 0xc0, 0x78, 0x65, + 0xc5, 0xdd, 0x99, 0x08, 0x1c, 0x69, 0xc3, 0x00, 0x4e, 0x08, 0x1c, 0x71, + 0xc3, 0x20, 0x18, 0x08, 0x1c, 0x81, 0xc2, 0x05, 0x1c, 0x08, 0x1c, 0xa1, + 0xc4, 0xe4, 0x97, 0x08, 0x1c, 0xb1, 0xc5, 0xd5, 0xec, 0x08, 0x1c, 0xb9, + 0x8b, 0x08, 0x1c, 0xd9, 0x97, 0x08, 0x1c, 0xe0, 0x43, 0x11, 0x3c, 0xc0, + 0x78, 0x75, 0x06, 0xc0, 0x78, 0xd1, 0x14, 0x40, 0x78, 0xe0, 0xc7, 0xc9, + 0xab, 0x0f, 0xb4, 0x09, 0x0f, 0xc0, 0x78, 0xec, 0xd7, 0x26, 0x8e, 0x01, + 0x5f, 0xf8, 0x14, 0xc0, 0x78, 0xf8, 0x0a, 0xc0, 0x79, 0x16, 0x10, 0xc0, + 0x79, 0x34, 0x0d, 0xc0, 0x79, 0x58, 0x42, 0x28, 0x5b, 0xc0, 0x79, 0x76, + 0x42, 0x01, 0x99, 0xc0, 0x79, 0x82, 0x42, 0x36, 0xa2, 0xc0, 0x79, 0x9a, + 0x42, 0x2f, 0xf9, 0xc0, 0x79, 0xae, 0x42, 0x14, 0x7d, 0xc0, 0x79, 0xbe, + 0x19, 0xc0, 0x79, 0xd0, 0x1b, 0xc0, 0x79, 0xe8, 0x0f, 0xc0, 0x79, 0xfa, + 0x16, 0xc0, 0x7a, 0x18, 0x15, 0x40, 0x7a, 0x36, 0xd7, 0x27, 0x01, 0x01, + 0x15, 0xc9, 0x84, 0x0f, 0x99, 0xf8, 0x0e, 0xc0, 0x7a, 0x54, 0x12, 0xc0, + 0x7a, 0x60, 0xcc, 0x8a, 0x99, 0x00, 0x2f, 0x79, 0x45, 0x01, 0xc3, 0xc0, + 0x7a, 0x6c, 0x47, 0x26, 0x6b, 0x40, 0x7a, 0x7e, 0x16, 0xc0, 0x7a, 0xc8, + 0x06, 0xc0, 0x7a, 0xd4, 0xce, 0x6f, 0x00, 0x02, 0x6e, 0x19, 0x19, 0xc0, + 0x7a, 0xe8, 0x42, 0x00, 0x99, 0xc0, 0x7a, 0xf4, 0xd0, 0x5a, 0xb2, 0x02, + 0x6e, 0x39, 
0x15, 0xc0, 0x7a, 0xfe, 0x12, 0xc0, 0x7b, 0x10, 0x08, 0xc0, + 0x7b, 0x22, 0x09, 0xc0, 0x7b, 0x2e, 0x42, 0x00, 0xa2, 0xc0, 0x7b, 0x38, + 0xca, 0xa3, 0xa0, 0x02, 0x6e, 0x79, 0x03, 0xc0, 0x7b, 0x44, 0x04, 0xc0, + 0x7b, 0x56, 0x42, 0x01, 0x19, 0xc0, 0x7b, 0x68, 0x42, 0x00, 0x74, 0xc0, + 0x7b, 0x72, 0x11, 0xc0, 0x7b, 0x82, 0xca, 0xa5, 0x6c, 0x02, 0x6f, 0xd8, + 0x48, 0x01, 0x6b, 0xc0, 0x7b, 0x8e, 0xc2, 0x00, 0x40, 0x0f, 0xa0, 0x72, + 0x00, 0x7b, 0xb4, 0x00, 0xc0, 0x7b, 0xb8, 0xc2, 0x05, 0x03, 0x0f, 0x9f, + 0x40, 0xc6, 0xc6, 0xf0, 0x01, 0x18, 0xdb, 0x00, 0x7b, 0xd0, 0xc2, 0x00, + 0x40, 0x01, 0x18, 0x12, 0x00, 0x7b, 0xd6, 0xd9, 0x1f, 0x7c, 0x0f, 0xb3, + 0x43, 0x00, 0x7b, 0xda, 0x87, 0x0f, 0xab, 0x98, 0xc4, 0x49, 0x2a, 0x0f, + 0x9b, 0x79, 0xc3, 0xb2, 0x36, 0x0f, 0xa0, 0xe8, 0x15, 0xc0, 0x7b, 0xe0, + 0xc3, 0x2f, 0x1e, 0x0f, 0xa9, 0x43, 0x00, 0x7b, 0xea, 0xc6, 0xcb, 0x15, + 0x0f, 0x9a, 0xa0, 0x06, 0xc0, 0x7b, 0xf0, 0x4d, 0x7f, 0xf5, 0xc0, 0x7c, + 0x02, 0x45, 0xdb, 0x2d, 0xc0, 0x7c, 0x20, 0x09, 0x40, 0x7c, 0x32, 0xc6, + 0x40, 0x87, 0x01, 0x00, 0x51, 0xc3, 0x23, 0x08, 0x0f, 0xa4, 0x38, 0x44, + 0xc7, 0xf4, 0xc0, 0x7c, 0x3e, 0xcb, 0x96, 0xb6, 0x0f, 0xa1, 0x18, 0x4c, + 0x1c, 0x86, 0xc0, 0x7c, 0x4a, 0x44, 0x00, 0x49, 0xc0, 0x7c, 0x56, 0x45, + 0x00, 0x2c, 0xc0, 0x7c, 0x62, 0x48, 0xb5, 0x4a, 0xc0, 0x7c, 0x6e, 0x47, + 0xc3, 0x3e, 0xc0, 0x7c, 0x78, 0xd4, 0x3b, 0x24, 0x07, 0xff, 0x41, 0xcd, + 0x1b, 0x41, 0x07, 0xff, 0x51, 0xcf, 0x14, 0x22, 0x07, 0xff, 0x61, 0xcc, + 0x0d, 0xae, 0x07, 0xff, 0x69, 0xcc, 0x0d, 0x9e, 0x07, 0xff, 0x70, 0x02, + 0xc0, 0x7c, 0x84, 0x00, 0x40, 0x7c, 0x93, 0x47, 0x02, 0x0e, 0xc0, 0x7c, + 0x9f, 0xce, 0x1c, 0x92, 0x01, 0x84, 0xe9, 0xd5, 0x34, 0xb8, 0x01, 0x84, + 0xf1, 0xcc, 0x80, 0xe5, 0x01, 0x84, 0xf8, 0xc3, 0x06, 0x19, 0x01, 0x00, + 0x83, 0x00, 0x7c, 0xf7, 0xc9, 0xab, 0x49, 0x01, 0x70, 0x90, 0x42, 0x00, + 0x29, 0xc0, 0x7d, 0x07, 0x47, 0xc7, 0x04, 0x40, 0x7d, 0x13, 0x46, 0x0b, + 0x11, 0xc0, 0x7d, 0x25, 0xc7, 0x00, 0x91, 0x0f, 0xa9, 0x19, 0xc7, 0xc1, + 0x93, 0x0f, 0xa9, 0x10, 0x14, 0xc0, 0x7d, 0x37, 0xc4, 0x1e, 0x43, 0x01, + 0x11, 0x5a, 0x00, 0x7d, 0x56, 0xcd, 0x77, 0xef, 0x01, 0x1c, 0x01, 0x4d, + 0x7a, 0xe1, 0x40, 0x7d, 0x5a, 0xc5, 0x65, 0x44, 0x01, 0x10, 0xf3, 0x00, + 0x7d, 0x66, 0x49, 0x53, 0x89, 0x40, 0x7d, 0x6c, 0x42, 0x01, 0x19, 0xc0, + 0x7d, 0x76, 0x42, 0x00, 0x7a, 0x40, 0x7d, 0x82, 0x0b, 0xc0, 0x7d, 0x8e, + 0xc2, 0x01, 0x0b, 0x00, 0x04, 0x22, 0x00, 0x7d, 0x9a, 0xd3, 0x46, 0x0b, + 0x01, 0x03, 0x61, 0xd2, 0x4d, 0x69, 0x01, 0x03, 0x50, 0xcd, 0x76, 0xeb, + 0x0f, 0xd5, 0x51, 0x44, 0x05, 0x89, 0x40, 0x7d, 0xa0, 0x16, 0xc0, 0x7d, + 0xaf, 0x42, 0x00, 0x06, 0xc0, 0x7d, 0xbb, 0xc5, 0x40, 0x88, 0x01, 0x80, + 0x01, 0x05, 0xc0, 0x7d, 0xc7, 0xc9, 0x11, 0xf6, 0x01, 0x80, 0x11, 0xce, + 0x1c, 0x92, 0x01, 0x80, 0x29, 0xcb, 0x97, 0x87, 0x01, 0x80, 0x39, 0xcf, + 0x66, 0xa2, 0x01, 0x81, 0x51, 0xd0, 0x5a, 0x32, 0x01, 0x81, 0x59, 0xd2, + 0x49, 0x0d, 0x01, 0x81, 0x69, 0xd3, 0x3f, 0xcf, 0x01, 0x81, 0xf1, 0xcf, + 0x64, 0x59, 0x01, 0x81, 0xf9, 0x4b, 0x55, 0xe0, 0x40, 0x7d, 0xd3, 0xc4, + 0x59, 0x33, 0x0f, 0x9b, 0x41, 0xc3, 0xb3, 0x72, 0x0f, 0xce, 0x50, 0xda, + 0x1a, 0x16, 0x01, 0x12, 0x98, 0x4e, 0x70, 0x18, 0x40, 0x7e, 0x09, 0x8f, + 0x0f, 0xd5, 0x89, 0x42, 0x00, 0xa9, 0xc0, 0x7e, 0x1b, 0xc6, 0xd0, 0x1f, + 0x0f, 0xaf, 0xd1, 0xc9, 0xaa, 0x29, 0x0f, 0xb0, 0xf8, 0xc2, 0x00, 0xd1, + 0x0f, 0xa3, 0x4b, 0x00, 0x7e, 0x27, 0xca, 0xa2, 0x38, 0x0f, 0xb5, 0xd0, + 0x00, 0xc0, 0x7e, 0x33, 0xdb, 0x14, 0xd9, 0x01, 0x3d, 0x98, 0xcc, 0x8c, + 0x25, 0x01, 0x33, 0xf9, 0xca, 0x9d, 0x4c, 0x01, 0x31, 0xc0, 0x46, 0x1a, + 0x37, 0xc0, 
0x7e, 0x85, 0x46, 0x06, 0x1d, 0xc0, 0x7e, 0x91, 0x4a, 0x03, + 0xc8, 0xc0, 0x7e, 0x9d, 0x4b, 0x03, 0x87, 0xc0, 0x7e, 0xbb, 0x4a, 0x01, + 0x88, 0xc0, 0x7e, 0xd9, 0x48, 0x09, 0x0d, 0x40, 0x7e, 0xf7, 0x06, 0xc0, + 0x7f, 0x15, 0xc7, 0xc2, 0xff, 0x0f, 0x9b, 0xb9, 0xc9, 0xa1, 0x3f, 0x0f, + 0xb0, 0x48, 0x42, 0x00, 0x29, 0xc0, 0x7f, 0x1f, 0xc2, 0x11, 0xee, 0x01, + 0x18, 0xd0, 0x44, 0xcc, 0x6b, 0xc0, 0x7f, 0x29, 0x44, 0x00, 0x74, 0x40, + 0x7f, 0x41, 0x49, 0xb0, 0xfb, 0xc0, 0x7f, 0x4d, 0xc9, 0xae, 0xcd, 0x01, + 0x35, 0x00, 0x42, 0x00, 0x36, 0xc0, 0x7f, 0x6b, 0x44, 0x00, 0x74, 0xc0, + 0x7f, 0x7b, 0x42, 0x00, 0x5d, 0x40, 0x7f, 0x8d, 0xd3, 0x3f, 0x96, 0x0f, + 0x98, 0xa1, 0xd4, 0x39, 0x08, 0x0f, 0x98, 0x90, 0xda, 0x14, 0xa3, 0x01, + 0x3d, 0xe1, 0xc4, 0x03, 0x30, 0x0f, 0xa4, 0x90, 0xda, 0x1b, 0x9c, 0x01, + 0x08, 0xc1, 0xca, 0x9b, 0x08, 0x0f, 0x9e, 0x58, 0xc4, 0x00, 0x87, 0x0f, + 0xb1, 0x49, 0xc8, 0x1d, 0x3c, 0x0f, 0xb2, 0x00, 0xcb, 0x98, 0xc6, 0x01, + 0x12, 0x01, 0xc3, 0x1e, 0xcf, 0x0f, 0xa9, 0x39, 0xc6, 0xcf, 0xf5, 0x0f, + 0xc9, 0xe0, 0x44, 0x00, 0x74, 0x40, 0x7f, 0x99, 0xc5, 0xda, 0xd8, 0x0f, + 0xcd, 0x49, 0x16, 0xc0, 0x7f, 0xab, 0xc9, 0xb1, 0x82, 0x01, 0x37, 0x98, + 0xc9, 0x1c, 0xaa, 0x01, 0x3b, 0x31, 0xc3, 0x00, 0x28, 0x01, 0x34, 0xc3, + 0x00, 0x7f, 0xbd, 0xc8, 0x31, 0xd1, 0x0f, 0xa5, 0xf0, 0xc9, 0xb1, 0x5e, + 0x01, 0x34, 0xe1, 0xca, 0x9b, 0x4e, 0x0f, 0xa5, 0x50, 0x14, 0xc0, 0x7f, + 0xc3, 0xc5, 0x03, 0x0a, 0x01, 0x37, 0x90, 0xc3, 0x4c, 0xa1, 0x01, 0x15, + 0x49, 0xc4, 0x63, 0xf2, 0x01, 0x10, 0x01, 0x0d, 0xc0, 0x7f, 0xd3, 0xc6, + 0xb7, 0xfc, 0x00, 0x00, 0x61, 0xcb, 0x90, 0xd3, 0x0f, 0xcb, 0x00, 0xc6, + 0xb9, 0xbc, 0x0f, 0xa3, 0x18, 0xc2, 0x2e, 0x0e, 0x0f, 0x98, 0x08, 0x42, + 0x00, 0x5d, 0xc0, 0x7f, 0xe8, 0xcb, 0x8e, 0xad, 0x01, 0x09, 0xd9, 0xc4, + 0x89, 0x7c, 0x0f, 0x9f, 0x68, 0xc7, 0x43, 0xb7, 0x0f, 0xa7, 0x01, 0xc4, + 0xd7, 0xa5, 0x0f, 0xad, 0xb8, 0x0e, 0xc0, 0x80, 0x0a, 0xc4, 0xe2, 0x0b, + 0x0f, 0xce, 0x30, 0xca, 0x90, 0x19, 0x0f, 0xcb, 0xb1, 0x46, 0xce, 0x0f, + 0x40, 0x80, 0x16, 0x43, 0x01, 0xe9, 0xc0, 0x80, 0x22, 0xc2, 0x01, 0x48, + 0x01, 0x19, 0x13, 0x00, 0x80, 0x2e, 0xc6, 0x21, 0xfd, 0x0f, 0xa1, 0xc0, + 0x46, 0x12, 0x41, 0xc0, 0x80, 0x34, 0x48, 0xa3, 0xc6, 0x40, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x52, 0x46, 0x48, 0x65, 0x40, 0x80, 0x6a, 0xc8, 0xba, + 0x52, 0x01, 0x35, 0x89, 0xd1, 0x57, 0x50, 0x01, 0x03, 0x08, 0x9b, 0x01, + 0x37, 0xa1, 0xc8, 0xb6, 0xd2, 0x0f, 0x9d, 0x08, 0xc8, 0x1b, 0xc8, 0x01, + 0x32, 0x01, 0xd7, 0x26, 0x77, 0x00, 0x05, 0x50, 0xc9, 0xa8, 0xa6, 0x0f, + 0xb1, 0x41, 0xc4, 0x14, 0xdd, 0x0f, 0xd5, 0xb0, 0x43, 0x14, 0xcf, 0xc0, + 0x80, 0xca, 0x87, 0x0f, 0xa9, 0x2a, 0x00, 0x80, 0xdf, 0x8a, 0x0f, 0xa0, + 0xfb, 0x00, 0x80, 0xf1, 0xcd, 0x7f, 0x9a, 0x0f, 0xa2, 0x50, 0xcb, 0x05, + 0x1c, 0x01, 0x02, 0xc9, 0xc4, 0x01, 0xc3, 0x01, 0x71, 0x68, 0xc4, 0x0e, + 0x9a, 0x01, 0x00, 0x91, 0xc5, 0x40, 0x88, 0x01, 0x00, 0x38, 0x42, 0x00, + 0x5d, 0xc0, 0x81, 0x03, 0x42, 0x00, 0x47, 0x40, 0x81, 0x15, 0xc5, 0x15, + 0x2d, 0x0f, 0xd5, 0x48, 0x46, 0x56, 0x32, 0xc0, 0x81, 0x21, 0xc6, 0x44, + 0xfb, 0x01, 0x05, 0x29, 0xc6, 0xd0, 0x67, 0x0f, 0x98, 0x60, 0x47, 0x02, + 0x0e, 0xc0, 0x81, 0x2d, 0x45, 0x2b, 0x5f, 0xc0, 0x81, 0x87, 0x4b, 0x6f, + 0xc7, 0xc0, 0x81, 0x9f, 0x45, 0x00, 0xba, 0x40, 0x81, 0xe6, 0x00, 0xc0, + 0x81, 0xf8, 0x11, 0x40, 0x82, 0x04, 0xd8, 0x22, 0x73, 0x01, 0x17, 0x79, + 0x44, 0x04, 0xce, 0x40, 0x82, 0x1c, 0x42, 0x11, 0xa5, 0xc0, 0x82, 0x28, + 0x0b, 0xc0, 0x82, 0x32, 0x9b, 0x01, 0x4f, 0xf8, 0xc3, 0x03, 0x2a, 0x0f, + 0xcd, 0xf1, 0xc3, 0x36, 0x44, 0x0f, 0xcd, 0xf8, 0x0b, 0xc0, 0x82, 0x44, + 0x49, 0xb2, 
+ [hunk body: several thousand added lines of machine-generated hex byte data (0x.. constants); raw binary table, not human-readable]
0x00, 0x30, 0xf9, 0x47, 0x34, 0x2f, 0x40, 0xdd, 0x75, 0x46, + 0x09, 0x97, 0xc0, 0xdd, 0x81, 0x44, 0x00, 0x67, 0xc0, 0xdd, 0xa5, 0xcb, + 0x90, 0x4f, 0x00, 0x30, 0x39, 0xc9, 0xb3, 0x71, 0x00, 0x30, 0x30, 0x48, + 0x19, 0x9b, 0xc0, 0xdd, 0xb1, 0x46, 0x02, 0x0f, 0x40, 0xdd, 0xc3, 0xd0, + 0x48, 0x12, 0x00, 0x2a, 0xf9, 0xc9, 0x2d, 0x85, 0x00, 0x2a, 0xd0, 0xc4, + 0x0a, 0x8b, 0x00, 0x2a, 0xe9, 0x4e, 0x0b, 0x18, 0x40, 0xde, 0x3c, 0xcf, + 0x0f, 0x0a, 0x00, 0x2a, 0xe1, 0xcc, 0x81, 0x39, 0x00, 0x2a, 0xd8, 0x4e, + 0x0b, 0x18, 0xc0, 0xde, 0xb5, 0xd1, 0x2b, 0xed, 0x0f, 0x4a, 0x40, 0xc4, + 0x6b, 0x52, 0x0f, 0x49, 0x11, 0x06, 0xc0, 0xdf, 0x35, 0xc4, 0x76, 0x31, + 0x0f, 0x49, 0x21, 0xc4, 0xe4, 0xb3, 0x0f, 0x49, 0x29, 0x04, 0xc0, 0xdf, + 0x41, 0x15, 0xc0, 0xdf, 0x4b, 0xc2, 0x00, 0x67, 0x0f, 0x49, 0x41, 0xc2, + 0x00, 0x39, 0x0f, 0x49, 0x51, 0x87, 0x0f, 0x49, 0x59, 0xc2, 0x00, 0x87, + 0x0f, 0x49, 0x61, 0x8b, 0x0f, 0x49, 0x69, 0x91, 0x0f, 0x49, 0x71, 0x1b, + 0xc0, 0xdf, 0x57, 0xc3, 0x7e, 0x89, 0x0f, 0x49, 0x89, 0x10, 0xc0, 0xdf, + 0x61, 0x0d, 0xc0, 0xdf, 0x73, 0x97, 0x0f, 0x49, 0xa9, 0xc4, 0xe1, 0x4b, + 0x0f, 0x49, 0xb1, 0xc3, 0x11, 0xee, 0x0f, 0x49, 0xb9, 0xc2, 0x00, 0xd0, + 0x0f, 0x49, 0xc1, 0xc4, 0xd8, 0x3a, 0x0f, 0x49, 0xc9, 0x09, 0xc0, 0xdf, + 0x85, 0xc2, 0x00, 0x16, 0x0f, 0x49, 0xe1, 0xc2, 0x02, 0x41, 0x0f, 0x49, + 0xf1, 0xc3, 0xa9, 0xfc, 0x0f, 0x4a, 0x08, 0xc8, 0x01, 0xbf, 0x0f, 0x4a, + 0x31, 0xd4, 0x3d, 0x2c, 0x0f, 0x4a, 0x48, 0xc4, 0x33, 0x5e, 0x0f, 0x4a, + 0x51, 0xd0, 0x56, 0xc9, 0x0f, 0x4a, 0x58, 0xc4, 0x15, 0xe7, 0x0f, 0x4a, + 0x81, 0xc3, 0x05, 0x14, 0x0f, 0x4a, 0x89, 0x16, 0xc0, 0xdf, 0x8f, 0x08, + 0xc0, 0xdf, 0x9b, 0x15, 0xc0, 0xdf, 0xa7, 0xc5, 0x06, 0xdb, 0x0f, 0x4a, + 0xc1, 0xc4, 0x26, 0x78, 0x0f, 0x4a, 0xc8, 0xd0, 0x0f, 0x09, 0x0f, 0x4a, + 0xf1, 0xcd, 0x2c, 0xb2, 0x0f, 0x4a, 0xf8, 0x47, 0xc5, 0x21, 0xc0, 0xdf, + 0xb3, 0xc4, 0xe4, 0x63, 0x0f, 0xba, 0x13, 0x00, 0xdf, 0xbf, 0xcb, 0x8c, + 0xd4, 0x0f, 0xb8, 0x79, 0xca, 0x9a, 0xfe, 0x0f, 0xb9, 0xf1, 0xc4, 0x1a, + 0xa8, 0x0f, 0xba, 0xc8, 0x14, 0xc0, 0xdf, 0xc3, 0xc7, 0xc8, 0xe0, 0x0f, + 0xb8, 0x99, 0x46, 0x4c, 0x4a, 0xc0, 0xdf, 0xd2, 0x03, 0x40, 0xdf, 0xde, + 0x42, 0x00, 0xfa, 0xc0, 0xdf, 0xf0, 0xc8, 0xbe, 0x7a, 0x0f, 0xbb, 0x80, + 0x11, 0xc0, 0xdf, 0xff, 0xd2, 0x4e, 0x1d, 0x0f, 0xb8, 0x71, 0xca, 0xa1, + 0x52, 0x0f, 0xba, 0xf9, 0x17, 0x40, 0xe0, 0x0e, 0xc5, 0xd7, 0x13, 0x0f, + 0xb9, 0xfb, 0x00, 0xe0, 0x1a, 0x42, 0x00, 0x74, 0xc0, 0xe0, 0x20, 0xc4, + 0xdf, 0x17, 0x0f, 0xba, 0x69, 0xc6, 0x7b, 0x50, 0x0f, 0xba, 0x88, 0x07, + 0xc0, 0xe0, 0x2c, 0xc8, 0xba, 0xfa, 0x0f, 0xb8, 0xc2, 0x00, 0xe0, 0x44, + 0x0b, 0xc0, 0xe0, 0x4a, 0xc8, 0xbb, 0x32, 0x0f, 0xb9, 0x40, 0x17, 0xc0, + 0xe0, 0x5c, 0x42, 0x00, 0x65, 0xc0, 0xe0, 0x68, 0xc5, 0xd4, 0x93, 0x0f, + 0xb8, 0xd9, 0xc5, 0xac, 0x22, 0x0f, 0xba, 0x39, 0xce, 0x6f, 0x62, 0x0f, + 0xba, 0x79, 0x16, 0xc0, 0xe0, 0x75, 0xc3, 0xc9, 0x9a, 0x0f, 0xba, 0xa0, + 0xcb, 0x97, 0xb3, 0x0f, 0xb9, 0x59, 0x43, 0x00, 0xe3, 0xc0, 0xe0, 0x84, + 0xc2, 0x01, 0x29, 0x0f, 0xb8, 0x09, 0x0e, 0xc0, 0xe0, 0x8e, 0xc6, 0xcd, + 0xd3, 0x0f, 0xb9, 0xd1, 0xca, 0x9a, 0xcc, 0x0f, 0xb9, 0xe9, 0xc4, 0x04, + 0x65, 0x0f, 0xba, 0xb9, 0xc6, 0xd2, 0x4d, 0x0f, 0xba, 0xd8, 0xc7, 0xc2, + 0xb9, 0x0f, 0xb9, 0x51, 0xc8, 0xba, 0xe2, 0x0f, 0xba, 0x98, 0xc3, 0x04, + 0xe4, 0x0f, 0xb8, 0xa9, 0xc3, 0x00, 0x2e, 0x0f, 0xbb, 0x78, 0xd0, 0x5d, + 0x22, 0x0f, 0xb8, 0x83, 0x00, 0xe0, 0xa3, 0xc8, 0xbe, 0xc2, 0x0f, 0xb9, + 0xc1, 0xc4, 0x97, 0x51, 0x0f, 0xbb, 0x88, 0xc3, 0x02, 0x11, 0x0f, 0xb8, + 0x21, 0x9a, 0x0f, 0xba, 0x50, 0xc9, 0xaf, 0xed, 0x0f, 0xb8, 0x01, 0xc7, + 0xc8, 0x62, 
0x0f, 0xba, 0x08, 0xc3, 0x1a, 0x7c, 0x0f, 0xb8, 0xd1, 0xc2, + 0x01, 0xdf, 0x0f, 0xba, 0x48, 0xc4, 0x91, 0x3d, 0x0f, 0xb8, 0xe3, 0x00, + 0xe0, 0xa7, 0xcb, 0x91, 0x36, 0x0f, 0xb9, 0x08, 0x11, 0xc0, 0xe0, 0xad, + 0x44, 0x01, 0xcf, 0x40, 0xe0, 0xb9, 0xd7, 0x08, 0xf0, 0x01, 0x53, 0x78, + 0xd3, 0x43, 0xab, 0x0f, 0x9f, 0x39, 0xc5, 0x46, 0x98, 0x0f, 0xb4, 0xb8, + 0x1d, 0xc0, 0xe0, 0xc5, 0x1e, 0xc0, 0xe0, 0xed, 0x1f, 0xc0, 0xe1, 0x15, + 0x20, 0xc0, 0xe1, 0x3d, 0x21, 0xc0, 0xe1, 0x65, 0x22, 0x40, 0xe1, 0x8d, + 0xd3, 0x41, 0x97, 0x01, 0x3f, 0x91, 0x05, 0xc0, 0xe1, 0x9f, 0xd1, 0x05, + 0x75, 0x01, 0x0d, 0xd1, 0x16, 0xc0, 0xe1, 0xab, 0x48, 0x03, 0xc8, 0xc0, + 0xe1, 0xb7, 0xcb, 0x87, 0x8d, 0x01, 0x50, 0x88, 0x46, 0x00, 0x8b, 0x40, + 0xe1, 0xbd, 0xda, 0x19, 0xc8, 0x01, 0x37, 0x11, 0xc3, 0x92, 0x53, 0x01, + 0x5e, 0xc8, 0x8d, 0x00, 0x01, 0x53, 0x00, 0xe1, 0xc9, 0x8f, 0x01, 0x02, + 0x10, 0xc2, 0x00, 0xdb, 0x08, 0xba, 0x31, 0x83, 0x08, 0xb8, 0x70, 0xc2, + 0x00, 0xc1, 0x08, 0xba, 0x29, 0xc2, 0x19, 0x2c, 0x08, 0xb8, 0x81, 0x83, + 0x08, 0xb8, 0x19, 0xc2, 0x01, 0x30, 0x08, 0xb8, 0x10, 0x06, 0xc0, 0xe1, + 0xcf, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0xa1, 0x83, 0x08, 0xb8, 0x98, 0x16, + 0xc0, 0xe1, 0xd9, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0x61, 0x83, 0x08, 0xb8, + 0x20, 0x83, 0x08, 0xba, 0x01, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0x58, 0x49, + 0x0c, 0x8d, 0x40, 0xe1, 0xe3, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0xc9, 0x83, + 0x08, 0xb8, 0x50, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0xc1, 0x83, 0x08, 0xb8, + 0x40, 0xc2, 0x00, 0xd0, 0x08, 0xb8, 0xb9, 0x83, 0x08, 0xb8, 0xa8, 0xc2, + 0x00, 0xd0, 0x08, 0xb8, 0x39, 0x83, 0x08, 0xb8, 0x30, 0xc2, 0x00, 0xd0, + 0x08, 0xb8, 0x09, 0x83, 0x08, 0xb8, 0x00, 0xc5, 0xdd, 0x08, 0x08, 0xb9, + 0xf1, 0x15, 0xc0, 0xe1, 0xf5, 0xc6, 0xd0, 0xeb, 0x08, 0xb9, 0x58, 0xc4, + 0x18, 0x10, 0x08, 0xb9, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0xb9, 0xb0, 0xc3, + 0x0d, 0x14, 0x08, 0xb9, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0xb9, 0xa0, 0xc4, + 0x02, 0xde, 0x08, 0xb9, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0xb9, 0x90, 0x8f, + 0x08, 0xb9, 0x51, 0x8b, 0x08, 0xb9, 0x49, 0x99, 0x08, 0xb9, 0x39, 0x83, + 0x08, 0xb9, 0x08, 0x97, 0x08, 0xb9, 0x28, 0x8b, 0x08, 0xb9, 0x18, 0xca, + 0x9f, 0x04, 0x08, 0xb8, 0xf9, 0x83, 0x08, 0xb8, 0xe8, 0xc2, 0x01, 0x9d, + 0x01, 0x1c, 0xab, 0x00, 0xe2, 0x01, 0x44, 0x48, 0xaa, 0x40, 0xe2, 0x05, + 0xc9, 0x52, 0x08, 0x01, 0x1b, 0xb0, 0xc9, 0x52, 0x08, 0x01, 0x1b, 0xc8, + 0xc3, 0x01, 0xbb, 0x01, 0x1b, 0x9b, 0x00, 0xe2, 0x11, 0xc5, 0xd8, 0xf3, + 0x01, 0x19, 0xb0, 0xc2, 0x01, 0x23, 0x01, 0x1b, 0xa1, 0xce, 0x6c, 0xde, + 0x01, 0x1a, 0x30, 0x00, 0xc0, 0xe2, 0x17, 0xca, 0x6c, 0xe2, 0x01, 0x1a, + 0x78, 0x43, 0x01, 0x47, 0xc0, 0xe2, 0x29, 0x42, 0x05, 0x03, 0xc0, 0xe2, + 0x33, 0xcf, 0x67, 0xdd, 0x01, 0x1a, 0xd0, 0xd1, 0x52, 0x00, 0x01, 0x1b, + 0x71, 0x16, 0xc0, 0xe2, 0x3d, 0xc8, 0x7d, 0xf2, 0x01, 0x19, 0xf9, 0xca, + 0x9a, 0x9a, 0x01, 0x19, 0xb8, 0xc8, 0xb5, 0xea, 0x01, 0x1b, 0x51, 0x46, + 0x02, 0xd2, 0x40, 0xe2, 0x49, 0xcb, 0x94, 0xf3, 0x01, 0x1b, 0x39, 0xca, + 0x6c, 0xe2, 0x01, 0x1a, 0x28, 0xc9, 0x20, 0xa8, 0x01, 0x1b, 0x21, 0xc8, + 0x52, 0x09, 0x01, 0x1a, 0xd8, 0x49, 0x07, 0x49, 0xc0, 0xe2, 0x67, 0xcf, + 0x6a, 0x53, 0x01, 0x12, 0x80, 0x0a, 0xc0, 0xe2, 0x73, 0x15, 0xc0, 0xe2, + 0x7d, 0xc2, 0x00, 0x5f, 0x08, 0x59, 0x61, 0x1b, 0xc0, 0xe2, 0x8b, 0xc2, + 0x00, 0x4e, 0x08, 0x59, 0x41, 0x10, 0xc0, 0xe2, 0x95, 0x06, 0xc0, 0xe2, + 0xa9, 0x16, 0xc0, 0xe2, 0xb3, 0xc2, 0x1c, 0x52, 0x08, 0x58, 0xc1, 0xc2, + 0x00, 0x89, 0x08, 0x58, 0xb9, 0x09, 0xc0, 0xe2, 0xc3, 0x1a, 0xc0, 0xe2, + 0xd3, 0xc2, 0x00, 0x3c, 0x08, 0x58, 0x81, 0x97, 0x08, 0x58, 0x73, 0x00, + 0xe2, 0xe3, 
0x8b, 0x08, 0x58, 0x63, 0x00, 0xe2, 0xe7, 0x91, 0x08, 0x58, + 0x53, 0x00, 0xe2, 0xeb, 0x87, 0x08, 0x58, 0x43, 0x00, 0xe2, 0xef, 0x83, + 0x08, 0x58, 0x03, 0x00, 0xe2, 0xf3, 0xc2, 0x00, 0x67, 0x08, 0x58, 0xf1, + 0xc2, 0x14, 0xda, 0x08, 0x58, 0xf9, 0x04, 0xc0, 0xe3, 0x09, 0xc2, 0x01, + 0x19, 0x08, 0x59, 0x69, 0xc2, 0x00, 0x49, 0x08, 0x59, 0x71, 0x1c, 0x40, + 0xe3, 0x13, 0xc3, 0x05, 0x14, 0x08, 0x08, 0x3b, 0x00, 0xe3, 0x1d, 0x16, + 0xc0, 0xe3, 0x21, 0x08, 0xc0, 0xe3, 0x32, 0x15, 0xc0, 0xe3, 0x3a, 0xc5, + 0x06, 0xdb, 0x08, 0x08, 0x73, 0x00, 0xe3, 0x4c, 0xc4, 0x26, 0x78, 0x08, + 0x08, 0x7a, 0x00, 0xe3, 0x57, 0x46, 0x0f, 0x88, 0xc0, 0xe3, 0x64, 0x4e, + 0x72, 0x02, 0x40, 0xe3, 0x7a, 0xce, 0x71, 0x22, 0x08, 0x09, 0xf1, 0xcd, + 0x7d, 0xb9, 0x08, 0x09, 0xf8, 0x0e, 0xc0, 0xe3, 0x86, 0x46, 0x11, 0x39, + 0xc0, 0xe3, 0x92, 0x42, 0x00, 0x58, 0xc0, 0xe3, 0xcb, 0x49, 0x07, 0xbb, + 0xc0, 0xe3, 0xd7, 0x43, 0x11, 0x49, 0xc0, 0xe3, 0xef, 0x46, 0x00, 0x2c, + 0x40, 0xe4, 0x07, 0xc6, 0x0b, 0x09, 0x0f, 0xbc, 0x81, 0xc6, 0x02, 0xd1, + 0x0f, 0xbc, 0x30, 0xc6, 0x13, 0x52, 0x0f, 0xbd, 0x59, 0xd2, 0x4d, 0x57, + 0x0f, 0xbd, 0xb8, 0xd6, 0x08, 0x88, 0x01, 0x1f, 0x09, 0xcd, 0x00, 0x32, + 0x01, 0x1e, 0xf9, 0xcb, 0x1a, 0x50, 0x01, 0x1e, 0xe9, 0xce, 0x25, 0xad, + 0x01, 0x1d, 0xab, 0x00, 0xe4, 0x1f, 0x45, 0x01, 0xce, 0xc0, 0xe4, 0x25, + 0x46, 0x00, 0x2c, 0xc0, 0xe4, 0x3d, 0x45, 0x00, 0x49, 0xc0, 0xe4, 0x47, + 0xd7, 0x15, 0x64, 0x01, 0x49, 0xd8, 0x46, 0x00, 0x8b, 0x40, 0xe4, 0x51, + 0x00, 0xc0, 0xe4, 0x5d, 0xc3, 0x00, 0x74, 0x0f, 0x9d, 0x98, 0xc4, 0x01, + 0xc3, 0x0f, 0xa8, 0xb3, 0x00, 0xe4, 0x69, 0x95, 0x0f, 0xa6, 0xd0, 0x84, + 0x01, 0x88, 0x2b, 0x00, 0xe4, 0x6f, 0x92, 0x01, 0x88, 0x31, 0x8f, 0x01, + 0x88, 0x39, 0x88, 0x01, 0x88, 0x41, 0x86, 0x01, 0x88, 0x49, 0x96, 0x01, + 0x88, 0x51, 0x90, 0x01, 0x88, 0x5b, 0x00, 0xe4, 0x73, 0x8e, 0x01, 0x88, + 0x63, 0x00, 0xe4, 0x7e, 0x89, 0x01, 0x88, 0x6b, 0x00, 0xe4, 0x82, 0x8d, + 0x01, 0x88, 0x73, 0x00, 0xe4, 0x92, 0x8a, 0x01, 0x88, 0x79, 0x8c, 0x01, + 0x88, 0x83, 0x00, 0xe4, 0x96, 0x93, 0x01, 0x88, 0x89, 0x9a, 0x01, 0x88, + 0x91, 0x9c, 0x01, 0x88, 0xbb, 0x00, 0xe4, 0x9a, 0x85, 0x01, 0x88, 0xc3, + 0x00, 0xe4, 0xa6, 0x95, 0x01, 0x88, 0xcb, 0x00, 0xe4, 0xaa, 0x94, 0x01, + 0x88, 0xb1, 0x83, 0x01, 0x88, 0xd3, 0x00, 0xe4, 0xae, 0x91, 0x01, 0x88, + 0xdb, 0x00, 0xe4, 0xcb, 0x87, 0x01, 0x88, 0xe3, 0x00, 0xe4, 0xe5, 0x8b, + 0x01, 0x89, 0x3b, 0x00, 0xe4, 0xfc, 0x97, 0x01, 0x89, 0x43, 0x00, 0xe5, + 0x15, 0x98, 0x01, 0x89, 0x50, 0x92, 0x01, 0x8d, 0xa1, 0x96, 0x01, 0x8d, + 0xa9, 0x8d, 0x01, 0x8d, 0xb1, 0x8a, 0x01, 0x8d, 0xb9, 0x89, 0x01, 0x8d, + 0xd8, 0x9e, 0x0f, 0xd8, 0x03, 0x00, 0xe5, 0x1b, 0xa0, 0x0f, 0xd8, 0x1b, + 0x00, 0xe5, 0x3b, 0x9f, 0x0f, 0xd8, 0x0b, 0x00, 0xe5, 0x4d, 0xa2, 0x0f, + 0xd8, 0x7b, 0x00, 0xe5, 0x66, 0xa1, 0x0f, 0xd8, 0x3b, 0x00, 0xe5, 0x6a, + 0xa3, 0x0f, 0xd8, 0xf0, 0x00, 0xc0, 0xe5, 0x75, 0x02, 0x40, 0xe5, 0xbf, + 0xc4, 0xe3, 0x33, 0x0f, 0xa6, 0xc1, 0xc5, 0x1c, 0xae, 0x0f, 0xa4, 0xc8, + 0x4a, 0xa5, 0x3a, 0x40, 0xe5, 0xcb, 0xc8, 0xb5, 0x7a, 0x0f, 0xd3, 0x81, + 0xc8, 0xb8, 0x02, 0x0f, 0xcf, 0xb1, 0x11, 0x40, 0xe5, 0xe3, 0x42, 0x00, + 0xb0, 0xc0, 0xe5, 0xf2, 0x4f, 0x2a, 0x5c, 0xc0, 0xe5, 0xff, 0x46, 0xcd, + 0x25, 0xc0, 0xe6, 0x15, 0xc5, 0xd5, 0x56, 0x00, 0xda, 0xe1, 0x46, 0x09, + 0x97, 0xc0, 0xe6, 0x21, 0x47, 0x02, 0x0e, 0xc0, 0xe6, 0x45, 0xc9, 0xb3, + 0x3b, 0x00, 0xda, 0x21, 0x4b, 0x6f, 0xc7, 0xc0, 0xe6, 0xe9, 0x45, 0x00, + 0xba, 0x40, 0xe7, 0x1a, 0xcd, 0x7e, 0x6f, 0x0f, 0x9e, 0x00, 0xc9, 0x11, + 0xf6, 0x0b, 0x57, 0xa9, 0x4a, 0x51, 0x89, 0xc0, 0xe7, 0x38, 0x47, 0x02, + 0x0e, 0x40, 
0xe7, 0x4a, 0xc6, 0x00, 0x91, 0x0f, 0xb5, 0xe1, 0xc5, 0xd4, + 0x66, 0x0f, 0xa3, 0xe1, 0xc6, 0x50, 0xe2, 0x0f, 0x9b, 0xe1, 0xc5, 0x55, + 0x91, 0x0f, 0xa1, 0x20, 0x12, 0xc0, 0xe7, 0xc2, 0x83, 0x05, 0x35, 0x01, + 0x0d, 0xc0, 0xe7, 0xd8, 0x97, 0x05, 0x35, 0x11, 0xc2, 0x02, 0xe0, 0x05, + 0x35, 0x21, 0x14, 0xc0, 0xe7, 0xfb, 0x16, 0xc0, 0xe8, 0x0d, 0x91, 0x05, + 0x35, 0x39, 0x10, 0xc0, 0xe8, 0x19, 0x8b, 0x05, 0x35, 0x49, 0x0e, 0xc0, + 0xe8, 0x46, 0x8f, 0x05, 0x35, 0x9b, 0x00, 0xe8, 0x5e, 0x15, 0xc0, 0xe8, + 0x76, 0x1b, 0xc0, 0xe8, 0x90, 0x19, 0xc0, 0xe8, 0xa0, 0x08, 0x40, 0xe8, + 0xaa, 0x0f, 0xc0, 0xe8, 0xc0, 0xc3, 0x0d, 0xe5, 0x05, 0x37, 0xa0, 0x47, + 0x01, 0xeb, 0xc0, 0xe8, 0xcc, 0x00, 0xc0, 0xe8, 0xd2, 0x15, 0x40, 0xe8, + 0xde, 0x15, 0xc0, 0xe8, 0xea, 0x43, 0x0c, 0xe0, 0xc0, 0xe8, 0xf6, 0x4f, + 0x30, 0x90, 0xc0, 0xe9, 0x02, 0x4b, 0x6f, 0xc7, 0xc0, 0xe9, 0x0c, 0x47, + 0x02, 0x0e, 0x40, 0xe9, 0x2e, 0xc3, 0x82, 0x4c, 0x0f, 0xb6, 0x08, 0xc5, + 0xb5, 0x75, 0x0f, 0xa6, 0x51, 0xc7, 0xc9, 0x96, 0x0f, 0xcf, 0xe0, 0xcf, + 0x67, 0x38, 0x01, 0x33, 0x61, 0xcc, 0x82, 0xdd, 0x01, 0x33, 0x59, 0xd8, + 0x23, 0x1b, 0x0f, 0x9c, 0xe9, 0xd7, 0x29, 0xca, 0x0f, 0x9c, 0xe0, 0xc5, + 0x11, 0x55, 0x0f, 0xa1, 0xd9, 0xca, 0xa5, 0x76, 0x0f, 0xce, 0xa0, 0xcc, + 0x20, 0x76, 0x01, 0x1f, 0x18, 0x47, 0x02, 0x0e, 0xc0, 0xe9, 0x91, 0x15, + 0xc0, 0xe9, 0xf4, 0x4b, 0x6f, 0xc7, 0xc0, 0xea, 0x00, 0x03, 0xc0, 0xea, + 0x20, 0x46, 0x09, 0x97, 0xc0, 0xea, 0x32, 0x46, 0x76, 0x52, 0xc0, 0xea, + 0x56, 0x49, 0x3a, 0xd4, 0xc0, 0xea, 0x62, 0xc6, 0xd2, 0xcb, 0x00, 0x4f, + 0xd1, 0xca, 0x9f, 0xae, 0x00, 0x4f, 0xd8, 0xc5, 0xd9, 0xb6, 0x0f, 0x9b, + 0x89, 0x49, 0x03, 0x37, 0x40, 0xea, 0x6e, 0xc6, 0x00, 0x91, 0x01, 0x1b, + 0xf1, 0xd8, 0x23, 0xc3, 0x0f, 0xa8, 0xa9, 0xc6, 0xcd, 0x19, 0x0f, 0xd6, + 0x88, 0xcf, 0x62, 0x6a, 0x0f, 0xa3, 0x29, 0xce, 0x2f, 0xbc, 0x0f, 0xa3, + 0x20, 0xc9, 0x18, 0x66, 0x01, 0x10, 0xc8, 0xd1, 0x51, 0xab, 0x0f, 0xab, + 0x60, 0xce, 0x6f, 0x0e, 0x00, 0xd0, 0xf9, 0xc7, 0xc9, 0xd5, 0x00, 0xd0, + 0xf1, 0x4b, 0x6f, 0xc7, 0xc0, 0xea, 0x74, 0x47, 0x02, 0x0e, 0x40, 0xea, + 0x8a, 0x97, 0x00, 0xba, 0x99, 0x8b, 0x00, 0xba, 0x90, 0xc2, 0x00, 0xd0, + 0x00, 0xba, 0x89, 0xc2, 0x0d, 0xf6, 0x00, 0xba, 0x81, 0xc2, 0x01, 0x4a, + 0x00, 0xba, 0x79, 0xc2, 0x00, 0xdb, 0x00, 0xba, 0x71, 0xc2, 0x00, 0x39, + 0x00, 0xba, 0x69, 0xc2, 0x19, 0x2c, 0x00, 0xba, 0x61, 0xc2, 0x01, 0xc3, + 0x00, 0xba, 0x59, 0xc2, 0x01, 0x5d, 0x00, 0xba, 0x51, 0xc2, 0x00, 0xb0, + 0x00, 0xba, 0x49, 0x10, 0xc0, 0xea, 0xea, 0xc2, 0x0e, 0x9a, 0x00, 0xba, + 0x39, 0xc2, 0x01, 0x6f, 0x00, 0xba, 0x31, 0xc2, 0x01, 0x30, 0x00, 0xba, + 0x21, 0xc2, 0x02, 0x2b, 0x00, 0xba, 0x19, 0x97, 0x00, 0xba, 0x11, 0x8b, + 0x00, 0xba, 0x09, 0x83, 0x00, 0xba, 0x00, 0xcb, 0x8c, 0xa8, 0x0f, 0xa3, + 0x81, 0xcb, 0x91, 0xdb, 0x0f, 0x98, 0x48, 0xc4, 0xe3, 0x0f, 0x0f, 0xa5, + 0xe1, 0x95, 0x0f, 0xd3, 0x90, 0x4c, 0x83, 0x49, 0xc0, 0xea, 0xf4, 0x90, + 0x0f, 0xcf, 0x00, 0x47, 0x34, 0x2f, 0xc0, 0xeb, 0x00, 0x47, 0x02, 0x0e, + 0xc0, 0xeb, 0x2d, 0x18, 0xc0, 0xeb, 0x95, 0x45, 0x00, 0xba, 0xc0, 0xeb, + 0xa1, 0x06, 0xc0, 0xeb, 0xc5, 0x4c, 0x11, 0xe2, 0x40, 0xeb, 0xd7, 0xdb, + 0x15, 0x96, 0x01, 0x1c, 0x59, 0xc5, 0x1c, 0xae, 0x0f, 0xa4, 0xa1, 0xc3, + 0x01, 0x5d, 0x00, 0x05, 0x30, 0x86, 0x0f, 0x9a, 0xf1, 0xd0, 0x5b, 0x62, + 0x00, 0x04, 0x11, 0xca, 0xa7, 0x10, 0x0f, 0xc9, 0x88, 0x42, 0x00, 0xbf, + 0xc0, 0xeb, 0xe7, 0x46, 0xd0, 0xd9, 0xc0, 0xeb, 0xf3, 0xcb, 0x97, 0x50, + 0x0e, 0x82, 0x28, 0xc5, 0x87, 0x64, 0x0e, 0x81, 0x23, 0x00, 0xeb, 0xff, + 0x46, 0xd1, 0xa5, 0xc0, 0xec, 0x03, 0x11, 0xc0, 0xec, 0x10, 0x14, 0xc0, + 0xec, 0x25, 
0x42, 0x00, 0xfe, 0xc0, 0xec, 0x31, 0xc6, 0xc8, 0x94, 0x0e, + 0x83, 0x08, 0x14, 0xc0, 0xec, 0x3d, 0x12, 0xc0, 0xec, 0x49, 0x45, 0xd8, + 0x4e, 0xc0, 0xec, 0x59, 0x10, 0x40, 0xec, 0x71, 0x16, 0xc0, 0xec, 0x7d, + 0x48, 0xbc, 0x8a, 0xc0, 0xec, 0x92, 0xc5, 0xd9, 0x02, 0x0e, 0x81, 0x4b, + 0x00, 0xec, 0xa4, 0x1b, 0xc0, 0xec, 0xaa, 0xc7, 0xc0, 0x9e, 0x0e, 0x80, + 0xe8, 0x0b, 0xc0, 0xec, 0xb7, 0xc2, 0x42, 0xcd, 0x0e, 0x81, 0x79, 0xc5, + 0xd7, 0x27, 0x0e, 0x80, 0x08, 0x42, 0x14, 0xda, 0xc0, 0xec, 0xd4, 0x12, + 0x40, 0xec, 0xe0, 0x46, 0x3d, 0xd7, 0xc0, 0xec, 0xea, 0xda, 0x19, 0xfc, + 0x0e, 0x86, 0x29, 0x49, 0xb5, 0x21, 0x40, 0xed, 0x15, 0x44, 0xdf, 0x57, + 0xc0, 0xed, 0x27, 0x47, 0xc8, 0x2a, 0xc0, 0xed, 0x39, 0x44, 0x56, 0x2e, + 0x40, 0xed, 0x45, 0x42, 0x02, 0x2f, 0xc0, 0xed, 0x4f, 0x15, 0xc0, 0xed, + 0x59, 0xc6, 0xcd, 0xf1, 0x0e, 0x81, 0xf8, 0x10, 0xc0, 0xed, 0x65, 0x46, + 0xd1, 0x69, 0xc0, 0xed, 0x71, 0xc7, 0xc7, 0x5f, 0x0e, 0x83, 0x41, 0xc9, + 0xac, 0x9f, 0x0e, 0x83, 0x21, 0xc6, 0xd0, 0x9d, 0x0e, 0x82, 0xa9, 0xce, + 0x6d, 0x08, 0x0e, 0x80, 0x70, 0x48, 0xbd, 0x2a, 0xc0, 0xed, 0x7d, 0xca, + 0x9e, 0x32, 0x0e, 0x82, 0xb8, 0x14, 0xc0, 0xed, 0x9d, 0x07, 0xc0, 0xed, + 0xa7, 0x0a, 0xc0, 0xed, 0xb9, 0xc6, 0xd1, 0x51, 0x0e, 0x81, 0x38, 0x07, + 0xc0, 0xed, 0xc3, 0xc6, 0xc4, 0xab, 0x0e, 0x82, 0xe8, 0x49, 0xab, 0x64, + 0xc0, 0xed, 0xcf, 0xc5, 0xda, 0x92, 0x0e, 0x82, 0xd9, 0x44, 0xdf, 0x27, + 0xc0, 0xed, 0xdb, 0x46, 0xce, 0x7b, 0x40, 0xed, 0xe5, 0x42, 0x00, 0xba, + 0xc0, 0xed, 0xf1, 0x42, 0x00, 0xb1, 0xc0, 0xed, 0xfb, 0x46, 0xce, 0xf3, + 0xc0, 0xee, 0x07, 0x07, 0x40, 0xee, 0x13, 0x44, 0xe4, 0xaf, 0xc0, 0xee, + 0x28, 0xc3, 0x4e, 0x10, 0x0e, 0x80, 0xc8, 0xc6, 0xcd, 0x1f, 0x0e, 0x81, + 0xe1, 0xc4, 0xc8, 0x2c, 0x0e, 0x81, 0x28, 0xc2, 0x0d, 0x10, 0x08, 0xe3, + 0x58, 0x9b, 0x08, 0xe3, 0x50, 0xc4, 0x18, 0x10, 0x08, 0xe3, 0x03, 0x00, + 0xee, 0x32, 0xc2, 0x22, 0xcc, 0x08, 0xe2, 0xfa, 0x00, 0xee, 0x38, 0x0b, + 0xc0, 0xee, 0x3e, 0x11, 0x40, 0xee, 0x4a, 0x0a, 0xc0, 0xee, 0x56, 0x19, + 0xc0, 0xee, 0x62, 0xc2, 0x00, 0xc4, 0x08, 0xe3, 0x18, 0xc4, 0x26, 0x78, + 0x08, 0xe2, 0xc9, 0xc5, 0x06, 0xdb, 0x08, 0xe2, 0xc1, 0x15, 0xc0, 0xee, + 0x6c, 0x08, 0xc0, 0xee, 0x78, 0x16, 0xc0, 0xee, 0x84, 0xc3, 0x05, 0x14, + 0x08, 0xe2, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0xe2, 0x80, 0xc7, 0x7a, 0x7f, + 0x08, 0xe2, 0x01, 0xc7, 0x14, 0x39, 0x08, 0xe1, 0xe8, 0xc4, 0x1e, 0x97, + 0x08, 0xe1, 0xf9, 0xc5, 0x40, 0xe7, 0x08, 0xe1, 0xf0, 0x97, 0x08, 0xe1, + 0xd9, 0x8b, 0x08, 0xe1, 0xc9, 0x83, 0x08, 0xe1, 0x78, 0x8e, 0x08, 0xe1, + 0xb1, 0x94, 0x08, 0xe1, 0xa2, 0x00, 0xee, 0x90, 0x97, 0x08, 0xe1, 0x98, + 0x8b, 0x08, 0xe1, 0x88, 0x83, 0x08, 0xe1, 0x69, 0xc2, 0x0d, 0xf6, 0x08, + 0xe1, 0x61, 0xc2, 0x00, 0xd0, 0x08, 0xe1, 0x58, 0x83, 0x08, 0xe1, 0x51, + 0x47, 0xb2, 0x2e, 0x40, 0xee, 0x94, 0xc2, 0x00, 0xd0, 0x08, 0xe1, 0x29, + 0x83, 0x08, 0xe1, 0x20, 0xc2, 0x00, 0xd0, 0x08, 0xe1, 0x19, 0x83, 0x08, + 0xe1, 0x10, 0x83, 0x08, 0xe1, 0x09, 0xc2, 0x00, 0xc1, 0x08, 0xe0, 0xe1, + 0xc2, 0x19, 0x2c, 0x08, 0xe0, 0xb9, 0xc2, 0x01, 0x30, 0x08, 0xe0, 0x90, + 0xc2, 0x00, 0xd0, 0x08, 0xe1, 0x01, 0x83, 0x08, 0xe0, 0xf9, 0x06, 0x40, + 0xee, 0x9f, 0xc2, 0x00, 0xd0, 0x08, 0xe0, 0xf1, 0x83, 0x08, 0xe0, 0xe9, + 0x16, 0x40, 0xee, 0xa9, 0xc2, 0x00, 0xd0, 0x08, 0xe0, 0xb1, 0x83, 0x08, + 0xe0, 0xa8, 0xc2, 0x00, 0xd0, 0x08, 0xe0, 0xa1, 0x83, 0x08, 0xe0, 0x98, + 0xc2, 0x00, 0xd0, 0x08, 0xe0, 0x89, 0x83, 0x08, 0xe0, 0x80, 0xc2, 0x00, + 0xd0, 0x08, 0xe0, 0x79, 0x83, 0x08, 0xe0, 0x70, 0x97, 0x08, 0xe0, 0x69, + 0x8b, 0x08, 0xe0, 0x59, 0x83, 0x08, 0xe0, 0x08, 0x97, 0x08, 0xe0, 0x28, + 0x8b, 0x08, 
0xe0, 0x18, 0x45, 0x00, 0x49, 0xc0, 0xee, 0xb3, 0x46, 0x00, + 0x2c, 0xc0, 0xee, 0xd9, 0x16, 0xc0, 0xef, 0x01, 0xce, 0x6b, 0x9c, 0x01, + 0x38, 0x19, 0x45, 0x01, 0xce, 0xc0, 0xef, 0x0d, 0xd3, 0x3f, 0xe2, 0x01, + 0x2c, 0x39, 0xd2, 0x4a, 0x75, 0x01, 0x2c, 0x29, 0x44, 0x05, 0x14, 0x40, + 0xef, 0x25, 0x04, 0xc0, 0xef, 0x31, 0xc8, 0x0a, 0xff, 0x01, 0x02, 0x71, + 0xc4, 0x02, 0x6d, 0x00, 0x02, 0xf9, 0xc6, 0x4a, 0x9f, 0x01, 0x72, 0x3b, + 0x00, 0xef, 0x3d, 0xdb, 0x18, 0x1e, 0x01, 0x80, 0xf8, 0x46, 0x01, 0x4a, + 0xc0, 0xef, 0x43, 0xc5, 0x32, 0xbb, 0x01, 0x3e, 0xe8, 0x46, 0x01, 0x4a, + 0xc0, 0xef, 0x5b, 0x00, 0x40, 0xef, 0x73, 0xc7, 0x30, 0xf2, 0x01, 0x3e, + 0x61, 0x47, 0xc3, 0x14, 0xc0, 0xef, 0x7f, 0xc3, 0x17, 0x99, 0x0f, 0xd4, + 0xc0, 0x00, 0x40, 0xef, 0x85, 0x46, 0x00, 0x8b, 0x40, 0xef, 0x91, 0xc4, + 0x15, 0xe7, 0x00, 0x00, 0x79, 0xc3, 0x05, 0x14, 0x00, 0x00, 0x70, 0x03, + 0xc0, 0xef, 0xa9, 0x42, 0x00, 0xd0, 0xc0, 0xef, 0xb1, 0x14, 0xc0, 0xef, + 0xbd, 0xc8, 0x6e, 0xdc, 0x01, 0x3e, 0xe1, 0x11, 0xc0, 0xef, 0xc9, 0x15, + 0xc0, 0xef, 0xd5, 0x05, 0xc0, 0xef, 0xf8, 0x16, 0xc0, 0xf0, 0x13, 0x08, + 0xc0, 0xf0, 0x27, 0x4a, 0x07, 0xbb, 0xc0, 0xf0, 0x31, 0xcb, 0x1a, 0x50, + 0x00, 0x01, 0x43, 0x00, 0xf0, 0x3d, 0xe0, 0x05, 0xa7, 0x01, 0x16, 0x49, + 0x42, 0x00, 0x58, 0xc0, 0xf0, 0x41, 0x19, 0xc0, 0xf0, 0x4d, 0x04, 0xc0, + 0xf0, 0x5f, 0x0e, 0x40, 0xf0, 0x6b, 0x19, 0xc0, 0xf0, 0x77, 0x16, 0xc0, + 0xf0, 0x86, 0xd0, 0x58, 0x62, 0x0f, 0xc1, 0xe1, 0xc5, 0x01, 0xa2, 0x01, + 0x0c, 0x83, 0x00, 0xf0, 0x98, 0x14, 0xc0, 0xf0, 0xa2, 0xd1, 0x55, 0x30, + 0x01, 0x0f, 0xe9, 0x06, 0xc0, 0xf0, 0xae, 0x15, 0xc0, 0xf0, 0xba, 0x0a, + 0xc0, 0xf0, 0xc6, 0xcd, 0x7c, 0xa8, 0x01, 0x0e, 0x39, 0x04, 0xc0, 0xf0, + 0xd0, 0xcf, 0x61, 0x4d, 0x01, 0x5a, 0x29, 0x08, 0xc0, 0xf0, 0xe2, 0xd7, + 0x26, 0xbc, 0x0f, 0xc5, 0x20, 0x49, 0x01, 0xaa, 0xc0, 0xf0, 0xee, 0x15, + 0xc0, 0xf1, 0x06, 0xdb, 0x16, 0x1d, 0x01, 0x37, 0x31, 0x49, 0x3c, 0xe1, + 0xc0, 0xf1, 0x12, 0x47, 0x55, 0x85, 0x40, 0xf1, 0x2a, 0xca, 0x37, 0x4e, + 0x01, 0x17, 0x31, 0xc5, 0x07, 0x62, 0x01, 0x13, 0x40, 0xc3, 0x02, 0xa3, + 0x01, 0x16, 0xb1, 0xcd, 0x78, 0x30, 0x01, 0x53, 0xc9, 0xd3, 0x43, 0x39, + 0x01, 0x53, 0xd8, 0x42, 0x00, 0x2a, 0xc0, 0xf1, 0x3f, 0xcc, 0x88, 0x7d, + 0x01, 0x13, 0x30, 0x45, 0x00, 0xd5, 0xc0, 0xf1, 0x5a, 0x43, 0x02, 0x9c, + 0x40, 0xf1, 0x70, 0xd4, 0x00, 0xd3, 0x01, 0x55, 0x40, 0x06, 0xc0, 0xf1, + 0x7c, 0x16, 0xc0, 0xf1, 0x8c, 0x83, 0x00, 0xe1, 0x19, 0xc2, 0x01, 0x4a, + 0x00, 0xe1, 0x11, 0x15, 0xc0, 0xf1, 0x9e, 0xc2, 0x02, 0x41, 0x00, 0xe0, + 0xf9, 0x0a, 0xc0, 0xf1, 0xa8, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0xe1, 0xc2, + 0x00, 0x39, 0x00, 0xe0, 0xd9, 0xc2, 0x19, 0x2c, 0x00, 0xe0, 0xd1, 0x0f, + 0xc0, 0xf1, 0xb2, 0x04, 0xc0, 0xf1, 0xbc, 0x08, 0xc0, 0xf1, 0xc6, 0x12, + 0xc0, 0xf1, 0xd0, 0x10, 0xc0, 0xf1, 0xe0, 0xc2, 0x25, 0x3b, 0x00, 0xe0, + 0x41, 0x05, 0xc0, 0xf1, 0xf0, 0x09, 0xc0, 0xf1, 0xfa, 0x0d, 0x40, 0xf2, + 0x04, 0xc4, 0x26, 0x78, 0x00, 0xe2, 0x49, 0xc5, 0x06, 0xdb, 0x00, 0xe2, + 0x41, 0x15, 0xc0, 0xf2, 0x14, 0x08, 0xc0, 0xf2, 0x20, 0x16, 0xc0, 0xf2, + 0x2c, 0xc3, 0x05, 0x14, 0x00, 0xe2, 0x09, 0xc4, 0x15, 0xe7, 0x00, 0xe2, + 0x00, 0x16, 0xc0, 0xf2, 0x38, 0xc6, 0xc0, 0x98, 0x00, 0xe1, 0xe9, 0xd2, + 0x4e, 0x0b, 0x00, 0xe1, 0xe0, 0x44, 0x00, 0xbb, 0xc0, 0xf2, 0x47, 0x50, + 0x5c, 0xf2, 0x40, 0xf2, 0x53, 0x8d, 0x00, 0xe1, 0x6b, 0x00, 0xf2, 0x5f, + 0x90, 0x00, 0xe1, 0x83, 0x00, 0xf2, 0x65, 0x96, 0x00, 0xe1, 0x99, 0x94, + 0x00, 0xe1, 0x91, 0x92, 0x00, 0xe1, 0x89, 0x8e, 0x00, 0xe1, 0x79, 0x8f, + 0x00, 0xe1, 0x70, 0x87, 0x00, 0xe1, 0x61, 0x97, 0x00, 0xe1, 0x53, 0x00, + 0xf2, 0x6b, 
0x91, 0x00, 0xe1, 0x43, 0x00, 0xf2, 0x6f, 0x8b, 0x00, 0xe1, + 0x39, 0xc2, 0x04, 0xc6, 0x00, 0xe1, 0x30, 0x00, 0xc0, 0xf2, 0x73, 0xc4, + 0x03, 0x0e, 0x01, 0x30, 0x3a, 0x00, 0xf2, 0xa7, 0x1b, 0xc0, 0xf2, 0xb0, + 0xc2, 0x01, 0x5d, 0x05, 0x26, 0x81, 0x12, 0xc0, 0xf2, 0xba, 0x06, 0xc0, + 0xf2, 0xc4, 0x16, 0xc0, 0xf2, 0xce, 0x09, 0xc0, 0xf2, 0xe2, 0x0d, 0xc0, + 0xf2, 0xec, 0xc2, 0x25, 0x3b, 0x05, 0x26, 0xc9, 0x05, 0xc0, 0xf2, 0xf6, + 0xc2, 0x01, 0xc3, 0x05, 0x26, 0xf9, 0x10, 0xc0, 0xf3, 0x00, 0xc2, 0x00, + 0xdb, 0x05, 0x27, 0x09, 0x15, 0xc0, 0xf3, 0x0a, 0x1c, 0xc0, 0xf3, 0x14, + 0x0a, 0xc0, 0xf3, 0x1e, 0xc2, 0x8d, 0x8f, 0x05, 0x27, 0x39, 0xc2, 0x00, + 0x87, 0x05, 0x27, 0x49, 0xc2, 0x01, 0x4a, 0x05, 0x27, 0x51, 0x83, 0x05, + 0x27, 0x73, 0x00, 0xf3, 0x28, 0x87, 0x05, 0x27, 0x83, 0x00, 0xf3, 0x2c, + 0x8b, 0x05, 0x27, 0x91, 0x91, 0x05, 0x27, 0x9b, 0x00, 0xf3, 0x30, 0x97, + 0x05, 0x27, 0xa2, 0x00, 0xf3, 0x34, 0xc5, 0x0a, 0x8a, 0x05, 0x27, 0xf1, + 0xc9, 0x11, 0xf6, 0x05, 0x27, 0xf8, 0x00, 0xc0, 0xf3, 0x3c, 0x43, 0x02, + 0xe8, 0x40, 0xf3, 0x57, 0xcd, 0x7b, 0xd8, 0x0f, 0xac, 0x39, 0xc7, 0x00, + 0x90, 0x0f, 0xa8, 0xb8, 0x46, 0x09, 0x97, 0xc0, 0xf3, 0x63, 0xcd, 0x2c, + 0xb2, 0x00, 0xca, 0x29, 0xd0, 0x0f, 0x09, 0x00, 0xca, 0x21, 0x15, 0xc0, + 0xf3, 0x87, 0x45, 0x34, 0x6f, 0xc0, 0xf3, 0x99, 0x47, 0x02, 0x0e, 0x40, + 0xf3, 0xa5, 0x85, 0x08, 0x49, 0xc9, 0x90, 0x08, 0x49, 0x5b, 0x00, 0xf3, + 0xf4, 0x8e, 0x08, 0x49, 0x4b, 0x00, 0xf3, 0xf8, 0x87, 0x08, 0x49, 0x23, + 0x00, 0xf3, 0xfc, 0x83, 0x08, 0x49, 0x03, 0x00, 0xf4, 0x00, 0x96, 0x08, + 0x49, 0x7b, 0x00, 0xf4, 0x04, 0x95, 0x08, 0x49, 0x9b, 0x00, 0xf4, 0x08, + 0x93, 0x08, 0x49, 0x91, 0x88, 0x08, 0x49, 0x89, 0x97, 0x08, 0x49, 0x81, + 0x94, 0x08, 0x49, 0x69, 0x91, 0x08, 0x49, 0x61, 0x8f, 0x08, 0x49, 0x51, + 0x8d, 0x08, 0x49, 0x41, 0x9b, 0x08, 0x49, 0x39, 0x8b, 0x08, 0x49, 0x31, + 0x98, 0x08, 0x49, 0x29, 0x86, 0x08, 0x49, 0x19, 0x89, 0x08, 0x49, 0x11, + 0x84, 0x08, 0x49, 0x08, 0x90, 0x08, 0x14, 0xc8, 0x90, 0x08, 0x14, 0xd0, + 0x8a, 0x08, 0x14, 0x18, 0x8a, 0x08, 0x14, 0x49, 0x96, 0x08, 0x14, 0xc0, + 0x8d, 0x08, 0x14, 0xa0, 0x8f, 0x08, 0x14, 0x80, 0x90, 0x08, 0x14, 0x88, + 0x00, 0xc0, 0xf4, 0x0c, 0xc6, 0xc1, 0xfd, 0x01, 0x55, 0x5a, 0x00, 0xf4, + 0x48, 0x45, 0x03, 0x14, 0xc0, 0xf4, 0x4e, 0x56, 0x2c, 0xde, 0x40, 0xf4, + 0x58, 0x15, 0xc0, 0xf4, 0x9f, 0xd5, 0x32, 0xd5, 0x00, 0x14, 0xb3, 0x00, + 0xf4, 0xb4, 0x42, 0x01, 0x19, 0xc0, 0xf4, 0xba, 0x03, 0xc0, 0xf4, 0xc9, + 0xd8, 0x21, 0x0b, 0x00, 0xe9, 0x21, 0xcc, 0x23, 0x33, 0x00, 0x14, 0xa3, + 0x00, 0xf4, 0xd5, 0xdb, 0x17, 0xb2, 0x00, 0x14, 0xa9, 0x42, 0x01, 0x2d, + 0xc0, 0xf4, 0xdb, 0xc2, 0x1d, 0xc1, 0x00, 0x0d, 0x31, 0xcf, 0x65, 0xfd, + 0x00, 0x0d, 0xd9, 0xc4, 0x95, 0x50, 0x00, 0x0d, 0xf9, 0xcc, 0x83, 0xe5, + 0x00, 0x0e, 0x01, 0xcd, 0x79, 0x0d, 0x00, 0x0e, 0x08, 0xc4, 0x0d, 0x21, + 0x01, 0x38, 0xe9, 0x48, 0x0b, 0x18, 0x40, 0xf4, 0xe7, 0xca, 0xa6, 0xe8, + 0x05, 0x3f, 0xb9, 0x49, 0x11, 0x74, 0xc0, 0xf4, 0xf3, 0x0b, 0xc0, 0xf4, + 0xfb, 0xc9, 0xa8, 0x9d, 0x05, 0x3f, 0xf8, 0xc9, 0xb2, 0xa2, 0x0f, 0x98, + 0xe1, 0xc6, 0x00, 0x91, 0x0f, 0x98, 0xb8, 0x0d, 0xc0, 0xf5, 0x07, 0x12, + 0xc0, 0xf5, 0x0f, 0x10, 0xc0, 0xf5, 0x1f, 0xc2, 0x00, 0x99, 0x00, 0x74, + 0x41, 0x15, 0xc0, 0xf5, 0x2f, 0xc2, 0x00, 0x58, 0x00, 0x74, 0xa1, 0x16, + 0xc0, 0xf5, 0x3b, 0xc2, 0x00, 0x6b, 0x00, 0x74, 0xd1, 0x43, 0xc9, 0xe0, + 0xc0, 0xf5, 0x45, 0xc2, 0x00, 0xa2, 0x00, 0x75, 0x09, 0xc2, 0x42, 0xcd, + 0x00, 0x75, 0x11, 0xc2, 0x00, 0x79, 0x00, 0x75, 0x19, 0xc2, 0x01, 0xc8, + 0x00, 0x75, 0x2b, 0x00, 0xf5, 0x55, 0xc2, 0x02, 0xa0, 0x00, 0x75, 0x39, + 0x43, 0x60, 
0xe8, 0xc0, 0xf5, 0x5b, 0x91, 0x00, 0x75, 0x68, 0x83, 0x00, + 0x75, 0x83, 0x00, 0xf5, 0x67, 0x45, 0xdb, 0x96, 0xc0, 0xf5, 0x77, 0x8b, + 0x00, 0x75, 0xa3, 0x00, 0xf5, 0x83, 0x9b, 0x00, 0x75, 0xb3, 0x00, 0xf5, + 0x87, 0x97, 0x00, 0x75, 0xc3, 0x00, 0xf5, 0x8b, 0x87, 0x00, 0x76, 0x03, + 0x00, 0xf5, 0x8f, 0x91, 0x00, 0x76, 0x10, 0xcf, 0x67, 0xfb, 0x00, 0x75, + 0xd1, 0x4e, 0x6f, 0xc4, 0x40, 0xf5, 0x93, 0xc2, 0x13, 0x4c, 0x00, 0x76, + 0x41, 0x16, 0xc0, 0xf5, 0x9f, 0xc6, 0xcd, 0x31, 0x00, 0x76, 0x58, 0xc4, + 0x15, 0xe7, 0x00, 0x76, 0x81, 0xc3, 0x05, 0x14, 0x00, 0x76, 0x89, 0x16, + 0xc0, 0xf5, 0xa9, 0x08, 0xc0, 0xf5, 0xb5, 0x15, 0xc0, 0xf5, 0xc1, 0xc5, + 0x06, 0xdb, 0x00, 0x76, 0xc1, 0xc4, 0x26, 0x78, 0x00, 0x76, 0xc8, 0xc2, + 0x00, 0x10, 0x00, 0x76, 0xe1, 0xc2, 0x00, 0xa2, 0x00, 0x76, 0xe8, 0x16, + 0xc0, 0xf5, 0xcd, 0x4f, 0x60, 0x6c, 0xc0, 0xf5, 0xd9, 0x4f, 0x01, 0xf3, + 0xc0, 0xf5, 0xe5, 0xda, 0x1a, 0x7e, 0x01, 0x3a, 0x81, 0xc6, 0xcd, 0x8b, + 0x01, 0x38, 0x81, 0xd5, 0x37, 0x6d, 0x01, 0x2e, 0xe9, 0x43, 0x05, 0xb2, + 0x40, 0xf5, 0xf1, 0x16, 0xc0, 0xf5, 0xf7, 0x4f, 0x60, 0x6c, 0xc0, 0xf6, + 0x03, 0xcf, 0x68, 0x37, 0x01, 0x3e, 0xa1, 0xd5, 0x37, 0x6d, 0x01, 0x2e, + 0xe1, 0x44, 0x20, 0xe8, 0x40, 0xf6, 0x0f, 0x0e, 0xc0, 0xf6, 0x15, 0x4f, + 0x2c, 0x4a, 0x40, 0xf6, 0x21, 0x48, 0x01, 0xd3, 0xc0, 0xf6, 0x27, 0xc5, + 0x06, 0xe2, 0x01, 0x2c, 0x03, 0x00, 0xf6, 0x31, 0xc6, 0x02, 0xd1, 0x01, + 0x2f, 0x01, 0xcc, 0x01, 0xdb, 0x0f, 0xdc, 0x70, 0xcc, 0x06, 0xdb, 0x01, + 0x2c, 0xa1, 0xcd, 0x15, 0x02, 0x0f, 0xdc, 0x10, 0xdb, 0x14, 0xf4, 0x0f, + 0xdb, 0x69, 0x45, 0x02, 0xde, 0x40, 0xf6, 0x37, 0xc5, 0x01, 0xa2, 0x01, + 0x0f, 0x3b, 0x00, 0xf6, 0x43, 0xcc, 0x82, 0x35, 0x01, 0x0f, 0x72, 0x00, + 0xf6, 0x47, 0x42, 0x00, 0x2c, 0xc0, 0xf6, 0x4d, 0x42, 0x02, 0xa0, 0x40, + 0xf6, 0x59, 0xcf, 0x5b, 0xc3, 0x0f, 0xc2, 0x89, 0xcc, 0x88, 0xdd, 0x0f, + 0xc1, 0xc8, 0xc4, 0x01, 0xa3, 0x01, 0x0c, 0x8b, 0x00, 0xf6, 0x65, 0xc5, + 0xdb, 0x50, 0x01, 0x70, 0xa8, 0xcb, 0x82, 0xba, 0x01, 0x0f, 0x09, 0xcb, + 0x82, 0x36, 0x01, 0x0e, 0x88, 0x51, 0x01, 0x51, 0xc0, 0xf6, 0x69, 0x45, + 0x11, 0x3a, 0x40, 0xf6, 0x75, 0xc5, 0x01, 0xa2, 0x01, 0x58, 0x31, 0xd3, + 0x43, 0xe4, 0x01, 0x5c, 0x48, 0xc8, 0x2e, 0x20, 0x0f, 0xb7, 0x41, 0xcc, + 0x4e, 0x35, 0x0f, 0xa9, 0xe0, 0xd0, 0x5d, 0x52, 0x01, 0x2f, 0x71, 0xcf, + 0x66, 0x66, 0x01, 0x2f, 0x68, 0xd2, 0x4c, 0xd9, 0x01, 0x3e, 0xf8, 0xc4, + 0x01, 0x9b, 0x01, 0x18, 0x1b, 0x00, 0xf6, 0x81, 0xcf, 0x6a, 0xda, 0x01, + 0x4d, 0xe8, 0xcb, 0x01, 0xfc, 0x01, 0x0f, 0x99, 0xcc, 0x82, 0x35, 0x01, + 0x0e, 0xa9, 0xc5, 0x01, 0xa2, 0x01, 0x0c, 0xab, 0x00, 0xf6, 0x85, 0xcb, + 0x94, 0x22, 0x01, 0x58, 0x69, 0xd5, 0x01, 0x92, 0x01, 0x5b, 0x29, 0xd0, + 0x5b, 0xc2, 0x0f, 0xc2, 0xc8, 0x4f, 0x66, 0x48, 0xc0, 0xf6, 0x8b, 0x50, + 0x5c, 0xd2, 0x40, 0xf6, 0x97, 0x00, 0x40, 0xf6, 0xa3, 0xca, 0x1b, 0x09, + 0x00, 0x00, 0xf9, 0xc9, 0x6b, 0xaf, 0x01, 0x5f, 0xd0, 0xc3, 0xa1, 0xa2, + 0x08, 0x1c, 0x01, 0xc2, 0x00, 0x74, 0x08, 0x1c, 0x98, 0xc4, 0xe2, 0x57, + 0x08, 0x1c, 0x11, 0xc4, 0x92, 0x76, 0x08, 0x1c, 0xc8, 0xc2, 0x00, 0xd0, + 0x08, 0x1c, 0x19, 0xc2, 0x0f, 0x9b, 0x08, 0x1c, 0x58, 0xc4, 0xdb, 0x4c, + 0x08, 0x1c, 0x21, 0xc3, 0x01, 0xce, 0x08, 0x1c, 0x78, 0xc2, 0x01, 0x6f, + 0x08, 0x1c, 0x40, 0xc3, 0x04, 0x87, 0x08, 0x1c, 0x39, 0x97, 0x08, 0x1c, + 0x88, 0xc2, 0x00, 0x3d, 0x08, 0x1c, 0x49, 0xc5, 0xd6, 0xaf, 0x08, 0x1c, + 0xc1, 0x91, 0x08, 0x1c, 0xd0, 0xc3, 0x11, 0xef, 0x08, 0x1c, 0x61, 0x03, + 0xc0, 0xf6, 0xb5, 0xc2, 0x06, 0x62, 0x08, 0x1c, 0xe8, 0x0a, 0xc0, 0xf6, + 0xc1, 0x07, 0xc0, 0xf6, 0xcd, 0x19, 0xc0, 0xf6, 0xdf, 0x15, 0xc0, 0xf6, + 0xf1, 0x46, 
0x06, 0x1d, 0xc0, 0xf7, 0x0b, 0x0e, 0xc0, 0xf7, 0x17, 0x16, + 0xc0, 0xf7, 0x2d, 0x04, 0xc0, 0xf7, 0x3f, 0x42, 0x02, 0xae, 0xc0, 0xf7, + 0x4b, 0x05, 0xc0, 0xf7, 0x57, 0x06, 0xc0, 0xf7, 0x6c, 0x14, 0xc0, 0xf7, + 0x7c, 0x0f, 0xc0, 0xf7, 0x88, 0xc9, 0x60, 0xf3, 0x01, 0x3c, 0xa9, 0xcc, + 0x07, 0xbb, 0x01, 0x3a, 0xd1, 0x03, 0xc0, 0xf7, 0x94, 0x11, 0xc0, 0xf7, + 0xa6, 0x08, 0xc0, 0xf7, 0xb8, 0xcb, 0x58, 0xc7, 0x01, 0x38, 0xd1, 0xd4, + 0x10, 0xc9, 0x0f, 0xb3, 0xc8, 0xc5, 0xaf, 0x07, 0x0f, 0xd5, 0x33, 0x00, + 0xf7, 0xc4, 0xc5, 0x36, 0xb7, 0x0f, 0x9d, 0x38, 0x42, 0x00, 0x30, 0xc0, + 0xf7, 0xca, 0xcf, 0x6b, 0x34, 0x0f, 0xb2, 0x48, 0xd3, 0x43, 0x85, 0x01, + 0x36, 0x89, 0xc7, 0x00, 0x90, 0x01, 0x1c, 0x40, 0x42, 0x36, 0xa2, 0xc0, + 0xf7, 0xdc, 0x42, 0x2f, 0xf9, 0xc0, 0xf7, 0xf4, 0x42, 0x14, 0x7d, 0xc0, + 0xf8, 0x10, 0x42, 0x28, 0x5b, 0xc0, 0xf8, 0x20, 0x42, 0x01, 0x99, 0x40, + 0xf8, 0x38, 0x42, 0x28, 0x5b, 0xc0, 0xf8, 0x48, 0x42, 0x01, 0x99, 0xc0, + 0xf8, 0x68, 0x42, 0x36, 0xa2, 0xc0, 0xf8, 0x84, 0x42, 0x2f, 0xf9, 0xc0, + 0xf8, 0x90, 0x42, 0x14, 0x7d, 0x40, 0xf8, 0xac, 0x42, 0x28, 0x5b, 0xc0, + 0xf8, 0xd3, 0x42, 0x01, 0x99, 0xc0, 0xf8, 0xe7, 0x42, 0x36, 0xa2, 0xc0, + 0xf9, 0x05, 0x42, 0x2f, 0xf9, 0xc0, 0xf9, 0x11, 0x42, 0x14, 0x7d, 0xc0, + 0xf9, 0x33, 0x47, 0xc1, 0x15, 0x40, 0xf9, 0x57, 0x42, 0x28, 0x5b, 0xc0, + 0xf9, 0x5f, 0x42, 0x01, 0x99, 0xc0, 0xf9, 0x71, 0x42, 0x36, 0xa2, 0xc0, + 0xf9, 0x89, 0x42, 0x2f, 0xf9, 0xc0, 0xf9, 0xa5, 0x42, 0x14, 0x7d, 0x40, + 0xf9, 0xc5, 0xa0, 0x0d, 0x80, 0xb1, 0x9f, 0x0d, 0x80, 0xa9, 0x9e, 0x0d, + 0x80, 0xa0, 0xa3, 0x0d, 0x80, 0x99, 0xa2, 0x0d, 0x80, 0x91, 0xa1, 0x0d, + 0x80, 0x89, 0xa0, 0x0d, 0x80, 0x81, 0x9f, 0x0d, 0x80, 0x79, 0x9e, 0x0d, + 0x80, 0x08, 0xa2, 0x0d, 0x80, 0x71, 0xa1, 0x0d, 0x80, 0x69, 0xa0, 0x0d, + 0x80, 0x61, 0x9f, 0x0d, 0x80, 0x59, 0x9e, 0x0d, 0x80, 0x50, 0xa1, 0x0d, + 0x80, 0x49, 0xa0, 0x0d, 0x80, 0x41, 0x9f, 0x0d, 0x80, 0x39, 0x9e, 0x0d, + 0x80, 0x30, 0xc2, 0x02, 0xa0, 0x0d, 0x80, 0x29, 0xa0, 0x0d, 0x80, 0x21, + 0x9f, 0x0d, 0x80, 0x19, 0x9e, 0x0d, 0x80, 0x10, 0x42, 0x28, 0x5b, 0xc0, + 0xf9, 0xf2, 0x42, 0x01, 0x99, 0xc0, 0xfa, 0x0e, 0x42, 0x2f, 0xf9, 0xc0, + 0xfa, 0x1e, 0x42, 0x14, 0x7d, 0x40, 0xfa, 0x32, 0x42, 0x14, 0x7d, 0xc0, + 0xfa, 0x46, 0x42, 0x36, 0xa2, 0xc0, 0xfa, 0x60, 0x42, 0x28, 0x5b, 0x40, + 0xfa, 0x70, 0x42, 0x28, 0x5b, 0xc0, 0xfa, 0x88, 0x42, 0x01, 0x99, 0xc0, + 0xfa, 0xa0, 0x42, 0x36, 0xa2, 0xc0, 0xfa, 0xae, 0x42, 0x2f, 0xf9, 0xc0, + 0xfa, 0xbe, 0x42, 0x14, 0x7d, 0x40, 0xfa, 0xda, 0x42, 0x28, 0x5b, 0xc0, + 0xfa, 0xf6, 0x42, 0x01, 0x99, 0xc0, 0xfb, 0x14, 0x42, 0x2f, 0xf9, 0xc0, + 0xfb, 0x38, 0x42, 0x14, 0x7d, 0xc0, 0xfb, 0x54, 0x42, 0x36, 0xa2, 0x40, + 0xfb, 0x64, 0x42, 0x28, 0x5b, 0xc0, 0xfb, 0x7a, 0x42, 0x01, 0x99, 0xc0, + 0xfb, 0x96, 0x42, 0x36, 0xa2, 0xc0, 0xfb, 0xaa, 0x42, 0x2f, 0xf9, 0xc0, + 0xfb, 0xca, 0x42, 0x14, 0x7d, 0x40, 0xfb, 0xe2, 0x48, 0x19, 0x9b, 0xc0, + 0xfc, 0x02, 0x46, 0x02, 0x0f, 0x40, 0xfc, 0x0e, 0x45, 0x12, 0x5c, 0xc0, + 0xfc, 0xa4, 0x4b, 0x11, 0xe3, 0x40, 0xfc, 0xd4, 0xc9, 0xaa, 0x3b, 0x00, + 0x2e, 0x29, 0xc9, 0xb0, 0xbc, 0x00, 0x2e, 0x21, 0xcd, 0x79, 0x00, 0x00, + 0x2d, 0x78, 0x1c, 0xc0, 0xfc, 0xf2, 0x06, 0xc0, 0xfc, 0xfc, 0xc4, 0xe1, + 0x1b, 0x00, 0x2d, 0x61, 0xc3, 0x11, 0x14, 0x00, 0x2d, 0x59, 0x42, 0x0c, + 0x43, 0xc0, 0xfd, 0x08, 0x16, 0xc0, 0xfd, 0x14, 0x42, 0x0f, 0x9a, 0xc0, + 0xfd, 0x1e, 0xcc, 0x89, 0x6d, 0x00, 0x2d, 0x11, 0x42, 0x00, 0xb0, 0xc0, + 0xfd, 0x2a, 0xc5, 0x48, 0x14, 0x00, 0x2c, 0xb9, 0x15, 0xc0, 0xfd, 0x36, + 0xc7, 0xc9, 0xf1, 0x00, 0x2c, 0x89, 0x43, 0x09, 0x3b, 0xc0, 0xfd, 0x42, + 0x0f, 0x40, 
0xfd, 0x51, 0x43, 0x01, 0x7f, 0xc0, 0xfd, 0x66, 0xc7, 0x0c, + 0x96, 0x02, 0x6e, 0x48, 0x0b, 0xc0, 0xfd, 0x96, 0xc7, 0xc7, 0xe4, 0x02, + 0x6e, 0xf9, 0xd5, 0x35, 0xc9, 0x02, 0x6f, 0x19, 0x07, 0x40, 0xfd, 0xa2, + 0xc6, 0x78, 0x44, 0x02, 0x6e, 0x21, 0xd2, 0x49, 0xd3, 0x02, 0x6e, 0x88, + 0x10, 0xc0, 0xfd, 0xb4, 0xcc, 0x84, 0x39, 0x02, 0x6f, 0x58, 0x45, 0x03, + 0x14, 0xc0, 0xfd, 0xc0, 0xc9, 0xaf, 0x54, 0x02, 0x6e, 0x59, 0xce, 0x6e, + 0x82, 0x02, 0x6e, 0xb0, 0xc4, 0x12, 0x38, 0x02, 0x6e, 0x51, 0xc7, 0xc9, + 0x18, 0x02, 0x6f, 0x11, 0xcd, 0x7e, 0x7c, 0x02, 0x6f, 0x68, 0xc9, 0xb4, + 0xfd, 0x02, 0x6e, 0x61, 0xc8, 0xb6, 0x0a, 0x02, 0x6e, 0x80, 0x14, 0xc0, + 0xfd, 0xcc, 0xd1, 0x55, 0xc9, 0x02, 0x6f, 0x60, 0xc5, 0xdb, 0x82, 0x02, + 0x6e, 0x71, 0xcb, 0x93, 0xbf, 0x02, 0x6e, 0xd0, 0xc7, 0xc9, 0x73, 0x02, + 0x6e, 0x91, 0xc8, 0xb6, 0x12, 0x02, 0x6f, 0xb1, 0xcf, 0x63, 0xb4, 0x02, + 0x6f, 0xf0, 0xcd, 0x77, 0x12, 0x02, 0x6e, 0xa1, 0xcb, 0x98, 0x79, 0x02, + 0x6f, 0x51, 0xd0, 0x5e, 0x72, 0x02, 0x6f, 0xf8, 0x16, 0xc0, 0xfd, 0xd8, + 0xc8, 0xba, 0x72, 0x02, 0x6f, 0x80, 0x10, 0xc0, 0xfd, 0xe4, 0xc7, 0xc8, + 0x7e, 0x02, 0x6e, 0xf1, 0xc6, 0xcc, 0x17, 0x02, 0x6f, 0x48, 0x42, 0x02, + 0xaf, 0xc0, 0xfd, 0xf0, 0xca, 0x9b, 0x30, 0x02, 0x6f, 0x30, 0x51, 0x54, + 0x86, 0xc0, 0xfd, 0xfc, 0x04, 0xc0, 0xfe, 0x1a, 0xd5, 0x37, 0x2e, 0x01, + 0x35, 0x49, 0x4a, 0xa5, 0x4e, 0xc0, 0xfe, 0x26, 0xce, 0x71, 0x30, 0x01, + 0x1d, 0x79, 0xc8, 0x22, 0x83, 0x01, 0x01, 0x31, 0x16, 0x40, 0xfe, 0x36, + 0x00, 0x40, 0xfe, 0x42, 0xc7, 0xc1, 0x77, 0x01, 0x33, 0x41, 0xc8, 0xbd, + 0xba, 0x01, 0x30, 0xa9, 0xc6, 0xcd, 0x19, 0x0f, 0x99, 0xb1, 0xc3, 0xcd, + 0x94, 0x0f, 0x99, 0x68, 0xd2, 0x4a, 0xe1, 0x01, 0x1f, 0x98, 0x00, 0x40, + 0xfe, 0x4e, 0xd0, 0x0d, 0xaa, 0x0f, 0xb3, 0x48, 0x83, 0x0f, 0xd5, 0x61, + 0xc8, 0xbd, 0xfa, 0x0f, 0xa1, 0xc8, 0x45, 0x02, 0x9a, 0x40, 0xfe, 0x5d, + 0x42, 0x01, 0x5d, 0xc0, 0xfe, 0x6f, 0xc5, 0xc4, 0x0a, 0x0f, 0xc8, 0xe9, + 0x4c, 0x83, 0x79, 0x40, 0xfe, 0x79, 0x46, 0x09, 0x97, 0xc0, 0xfe, 0x85, + 0x45, 0x00, 0xba, 0xc0, 0xfe, 0xa9, 0x45, 0x01, 0xc3, 0xc0, 0xfe, 0xb5, + 0x46, 0x34, 0x6f, 0xc0, 0xfe, 0xc1, 0x47, 0x02, 0x0e, 0x40, 0xfe, 0xd5, + 0xcd, 0x7a, 0x86, 0x00, 0xb9, 0xa1, 0x4b, 0x6f, 0xc7, 0xc0, 0xff, 0x3f, + 0x47, 0x02, 0x0e, 0x40, 0xff, 0x47, 0x43, 0x4e, 0xaf, 0xc0, 0xff, 0xa5, + 0x4d, 0x7b, 0xe5, 0x40, 0xff, 0xc7, 0x47, 0x34, 0x2f, 0xc0, 0xff, 0xe5, + 0x47, 0x02, 0x0e, 0x40, 0xff, 0xf8, 0xc9, 0x11, 0xf6, 0x07, 0xfb, 0x09, + 0xc5, 0x0a, 0x8a, 0x07, 0xfb, 0x20, 0xcf, 0x69, 0x63, 0x07, 0xfb, 0x11, + 0xcb, 0x03, 0xbc, 0x07, 0xff, 0x48, 0xcf, 0x69, 0x63, 0x07, 0xfb, 0x19, + 0xcb, 0x03, 0xbc, 0x07, 0xff, 0x58, 0x00, 0xc1, 0x00, 0x55, 0xde, 0x0d, + 0xd8, 0x07, 0xfb, 0x80, 0xc6, 0x92, 0x0c, 0x07, 0xfd, 0x01, 0x47, 0x02, + 0x0e, 0x41, 0x00, 0x6d, 0xcb, 0x90, 0x91, 0x0f, 0xb4, 0x23, 0x01, 0x00, + 0xc7, 0xcb, 0x8d, 0xe7, 0x0f, 0xa3, 0x00, 0xcc, 0x80, 0x9d, 0x01, 0x35, + 0x09, 0xd1, 0x54, 0xdb, 0x0f, 0xa8, 0x30, 0x83, 0x01, 0x82, 0x13, 0x01, + 0x00, 0xcd, 0x15, 0xc1, 0x00, 0xd3, 0x8b, 0x01, 0x82, 0x21, 0x97, 0x01, + 0x82, 0x31, 0x87, 0x01, 0x82, 0x41, 0x91, 0x01, 0x82, 0x51, 0x0d, 0xc1, + 0x00, 0xed, 0x09, 0xc1, 0x01, 0x01, 0x1c, 0xc1, 0x01, 0x15, 0x16, 0xc1, + 0x01, 0x29, 0x06, 0xc1, 0x01, 0x3d, 0x90, 0x01, 0x84, 0x9b, 0x01, 0x01, + 0x51, 0x0a, 0xc1, 0x01, 0x65, 0x04, 0xc1, 0x01, 0x79, 0x12, 0xc1, 0x01, + 0x8d, 0x0f, 0xc1, 0x01, 0xa1, 0x1b, 0xc1, 0x01, 0xb5, 0x14, 0xc1, 0x01, + 0xc1, 0x19, 0xc1, 0x01, 0xd5, 0xc2, 0x5d, 0xb3, 0x01, 0x84, 0xa0, 0x00, + 0xc1, 0x01, 0xe5, 0xcb, 0x9a, 0x52, 0x01, 0x01, 0x39, 0xc6, 0x89, 0xd3, + 0x00, 0x01, 
0x68, 0x43, 0x01, 0xd8, 0xc1, 0x01, 0xf1, 0x44, 0x00, 0xde, + 0x41, 0x02, 0x0f, 0xc4, 0x25, 0xd5, 0x01, 0x03, 0x21, 0xc9, 0x1b, 0x0a, + 0x01, 0x03, 0x19, 0xc5, 0x03, 0x4d, 0x01, 0x03, 0x10, 0xcf, 0x67, 0x29, + 0x0f, 0xa9, 0x01, 0xc7, 0x67, 0x31, 0x0f, 0xa9, 0x21, 0xcd, 0x7d, 0x10, + 0x0f, 0xa9, 0x08, 0x0e, 0xc1, 0x02, 0x37, 0xc6, 0xcd, 0x61, 0x01, 0x15, + 0xd1, 0xc7, 0x00, 0x40, 0x01, 0x11, 0x4b, 0x01, 0x02, 0x43, 0xc6, 0x10, + 0xce, 0x01, 0x01, 0xe9, 0xcb, 0x33, 0x33, 0x01, 0x51, 0xe0, 0x00, 0x41, + 0x02, 0x47, 0x46, 0x62, 0x28, 0xc1, 0x02, 0x57, 0x47, 0xc5, 0x98, 0x41, + 0x02, 0x63, 0xda, 0x1c, 0x38, 0x01, 0x4e, 0xf0, 0x15, 0xc1, 0x02, 0x6f, + 0xcb, 0x99, 0xd9, 0x0f, 0xa4, 0x08, 0xc4, 0x00, 0xc3, 0x01, 0x10, 0x31, + 0x43, 0x2c, 0xff, 0x41, 0x02, 0x7b, 0xcc, 0x87, 0x2d, 0x0f, 0xa7, 0x41, + 0xce, 0x6e, 0x66, 0x01, 0x4e, 0xe0, 0xcd, 0x76, 0x4f, 0x01, 0x05, 0xc9, + 0x48, 0xb7, 0x8a, 0x41, 0x02, 0x87, 0xd7, 0x28, 0x2c, 0x0f, 0xd7, 0xa8, + 0xc2, 0x00, 0xf1, 0x01, 0x13, 0x0b, 0x01, 0x02, 0xab, 0xce, 0x33, 0xae, + 0x01, 0x53, 0x38, 0x4a, 0xa7, 0x9c, 0xc1, 0x02, 0xb1, 0x49, 0xb4, 0x1c, + 0x41, 0x02, 0xbf, 0x54, 0x3b, 0x88, 0xc1, 0x02, 0xcb, 0xd1, 0x2b, 0x57, + 0x01, 0x81, 0x60, 0xc4, 0x0a, 0x8b, 0x01, 0x80, 0x09, 0xcb, 0x90, 0xa7, + 0x01, 0x80, 0x30, 0xcc, 0x83, 0x25, 0x01, 0x8c, 0x81, 0xcc, 0x88, 0x71, + 0x01, 0x8c, 0x89, 0xc8, 0x2b, 0x60, 0x01, 0x8c, 0x91, 0x16, 0xc1, 0x02, + 0xe9, 0x08, 0xc1, 0x02, 0xf9, 0x0f, 0xc1, 0x03, 0x05, 0xcb, 0x97, 0x0e, + 0x01, 0x8c, 0xc1, 0xcb, 0x93, 0x88, 0x01, 0x8c, 0xd1, 0xcb, 0x8e, 0x1e, + 0x01, 0x8c, 0xe9, 0xca, 0xa3, 0x28, 0x01, 0x8c, 0xf0, 0x47, 0x34, 0x2f, + 0xc1, 0x03, 0x11, 0xcc, 0x83, 0x19, 0x08, 0x42, 0xb9, 0x47, 0x02, 0x0e, + 0x41, 0x03, 0x1e, 0xc6, 0x57, 0xec, 0x01, 0x03, 0x01, 0xd4, 0x3a, 0xfc, + 0x01, 0x71, 0x88, 0x42, 0x00, 0x97, 0xc1, 0x03, 0x81, 0xd0, 0x5e, 0xc2, + 0x0f, 0xa3, 0x78, 0x05, 0xc1, 0x03, 0x99, 0x0a, 0xc1, 0x03, 0xb7, 0x52, + 0x48, 0x59, 0xc1, 0x03, 0xc5, 0x15, 0xc1, 0x03, 0xd1, 0x0e, 0xc1, 0x04, + 0x05, 0x06, 0xc1, 0x04, 0x15, 0x16, 0xc1, 0x04, 0x2a, 0xd9, 0x0f, 0x09, + 0x01, 0x3a, 0xa9, 0xd6, 0x2c, 0xb2, 0x01, 0x3a, 0xa1, 0x08, 0xc1, 0x04, + 0x40, 0xc3, 0xe6, 0x74, 0x01, 0x38, 0x89, 0x14, 0xc1, 0x04, 0x50, 0x42, + 0x02, 0xae, 0xc1, 0x04, 0x5c, 0x0f, 0xc1, 0x04, 0x68, 0xc6, 0x1c, 0xb4, + 0x01, 0x2f, 0x31, 0x12, 0xc1, 0x04, 0x74, 0x43, 0x00, 0x5f, 0x41, 0x04, + 0x80, 0x45, 0x15, 0xa7, 0xc1, 0x04, 0x8c, 0x45, 0x20, 0x6c, 0x41, 0x04, + 0xaa, 0x45, 0x20, 0x6c, 0xc1, 0x04, 0xc8, 0x45, 0x15, 0xa7, 0x41, 0x04, + 0xe6, 0xd5, 0x35, 0xde, 0x0f, 0xc4, 0x19, 0xca, 0x35, 0xe9, 0x0f, 0xc3, + 0x59, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, 0x19, 0xd1, 0x50, 0x46, 0x0f, 0xc3, + 0x99, 0xd0, 0x35, 0xe3, 0x0f, 0xc3, 0xd8, 0xd5, 0x35, 0xde, 0x0f, 0xc4, + 0x11, 0xd0, 0x35, 0xe3, 0x0f, 0xc3, 0xd1, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, + 0x11, 0xca, 0x35, 0xe9, 0x0f, 0xc3, 0x51, 0xd1, 0x50, 0x46, 0x0f, 0xc3, + 0x90, 0xd5, 0x35, 0xde, 0x0f, 0xc4, 0x01, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, + 0x01, 0xca, 0x35, 0xe9, 0x0f, 0xc3, 0x41, 0xd1, 0x50, 0x46, 0x0f, 0xc3, + 0x81, 0xd0, 0x35, 0xe3, 0x0f, 0xc3, 0xc0, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, + 0x09, 0xca, 0x35, 0xe9, 0x0f, 0xc3, 0x49, 0xd1, 0x50, 0x46, 0x0f, 0xc3, + 0x89, 0xd0, 0x35, 0xe3, 0x0f, 0xc3, 0xc9, 0xd5, 0x35, 0xde, 0x0f, 0xc4, + 0x08, 0x00, 0xc1, 0x05, 0x04, 0xc2, 0x00, 0x27, 0x0f, 0xd4, 0xf8, 0x00, + 0xc1, 0x05, 0x10, 0xc5, 0xda, 0xf6, 0x0f, 0x9a, 0x48, 0xc9, 0xae, 0x4f, + 0x0f, 0x17, 0xf9, 0x46, 0x09, 0x97, 0xc1, 0x05, 0x28, 0x45, 0x2b, 0x5f, + 0xc1, 0x05, 0x4c, 0x47, 0x02, 0x0e, 0x41, 0x05, 0x5e, 0xd4, 0x39, 0x08, + 0x0f, 0x98, 
0xc1, 0xd3, 0x3f, 0x96, 0x0f, 0x98, 0xb0, 0xc2, 0x00, 0x7a, + 0x08, 0xc7, 0xf9, 0x47, 0x34, 0x2f, 0xc1, 0x05, 0xe5, 0x46, 0x09, 0x97, + 0xc1, 0x05, 0xfd, 0x4d, 0x29, 0xb9, 0xc1, 0x06, 0x21, 0x4f, 0x0b, 0x17, + 0x41, 0x06, 0x80, 0x0e, 0xc1, 0x06, 0xdf, 0xc8, 0x7d, 0xa4, 0x07, 0xf2, + 0x59, 0xc4, 0x0e, 0x9a, 0x01, 0x81, 0x80, 0xca, 0xa7, 0xf6, 0x0f, 0x9f, + 0x99, 0xca, 0xa1, 0x7a, 0x0f, 0x9f, 0xa1, 0xc9, 0x42, 0xd1, 0x0f, 0xa2, + 0x58, 0x58, 0x21, 0xb3, 0xc1, 0x06, 0xeb, 0xc4, 0x0e, 0x9a, 0x01, 0x80, + 0xe0, 0xc8, 0x31, 0x90, 0x0f, 0xac, 0x29, 0xc6, 0xcb, 0xe1, 0x0f, 0xb7, + 0xc1, 0xc4, 0x5c, 0x58, 0x0f, 0xca, 0x78, 0xc5, 0x8d, 0xed, 0x0f, 0xcb, + 0xf9, 0xc4, 0x1d, 0xa8, 0x01, 0x1f, 0x29, 0xc5, 0x71, 0x71, 0x0f, 0xd6, + 0x98, 0x42, 0x00, 0xaf, 0x41, 0x06, 0xf7, 0x00, 0xc1, 0x07, 0x03, 0xc7, + 0x90, 0x53, 0x01, 0x10, 0xe1, 0xcd, 0x79, 0x41, 0x01, 0x00, 0x28, 0xca, + 0xa0, 0xbc, 0x0f, 0x9b, 0xa3, 0x01, 0x07, 0x25, 0xc3, 0x00, 0x74, 0x01, + 0x56, 0xe1, 0xce, 0x4a, 0x43, 0x01, 0x70, 0x80, 0x44, 0x00, 0x8c, 0xc1, + 0x07, 0x2b, 0xc4, 0x3a, 0xb4, 0x0f, 0xc9, 0x31, 0xc7, 0xc2, 0x8f, 0x0f, + 0xa4, 0x31, 0xcf, 0x64, 0x95, 0x0f, 0xb0, 0xc1, 0x15, 0xc1, 0x07, 0x35, + 0xd2, 0x4c, 0x25, 0x0f, 0xcb, 0xc8, 0x4d, 0x27, 0x30, 0xc1, 0x07, 0x41, + 0xc7, 0xc1, 0xbd, 0x0f, 0x9a, 0x10, 0xc8, 0xb6, 0x62, 0x01, 0x05, 0x19, + 0xc3, 0x91, 0xe8, 0x0f, 0x9a, 0xf8, 0x46, 0x01, 0xec, 0xc1, 0x07, 0x4d, + 0xd1, 0x55, 0x85, 0x0f, 0xa1, 0x28, 0xd8, 0x21, 0xfb, 0x0f, 0xb1, 0x30, + 0xcd, 0x78, 0x64, 0x01, 0x0a, 0xf9, 0xc5, 0x03, 0x02, 0x01, 0x02, 0x20, + 0xc4, 0xe2, 0x5f, 0x0f, 0xad, 0xf1, 0xc5, 0xd6, 0xcd, 0x0f, 0xad, 0xe9, + 0xc7, 0x87, 0xc2, 0x0f, 0xad, 0xe0, 0xca, 0x9b, 0x76, 0x01, 0x3e, 0xb9, + 0xc5, 0x06, 0xe2, 0x01, 0x2c, 0x41, 0x45, 0x15, 0xdb, 0xc1, 0x07, 0x53, + 0xc4, 0x00, 0xf0, 0x00, 0x01, 0x70, 0x10, 0xc1, 0x07, 0x5f, 0x03, 0xc1, + 0x07, 0x6b, 0x06, 0xc1, 0x07, 0x7d, 0x05, 0xc1, 0x07, 0x89, 0x15, 0xc1, + 0x07, 0x99, 0x0e, 0xc1, 0x07, 0xa5, 0x07, 0xc1, 0x07, 0xb5, 0x42, 0x00, + 0xb4, 0xc1, 0x07, 0xc1, 0x42, 0x00, 0xe3, 0xc1, 0x07, 0xcd, 0x14, 0xc1, + 0x07, 0xd9, 0xc5, 0x1e, 0xc8, 0x07, 0xfa, 0xf1, 0x12, 0xc1, 0x07, 0xe5, + 0xc6, 0x60, 0xb1, 0x07, 0xff, 0x19, 0xca, 0x9b, 0x58, 0x07, 0xff, 0x21, + 0xc8, 0x77, 0x99, 0x07, 0xff, 0x29, 0xc8, 0xbe, 0x72, 0x07, 0xff, 0x31, + 0xcc, 0x89, 0xcd, 0x07, 0xf8, 0x69, 0xc9, 0x11, 0xf6, 0x07, 0xf8, 0x71, + 0xcd, 0x36, 0x86, 0x07, 0xfa, 0xe0, 0xcc, 0x68, 0xfd, 0x01, 0x31, 0xeb, + 0x01, 0x07, 0xf7, 0xce, 0x6f, 0x46, 0x01, 0x03, 0x41, 0xcb, 0x62, 0xc8, + 0x0f, 0xca, 0x38, 0x44, 0x3f, 0xf8, 0xc1, 0x07, 0xfb, 0x42, 0x00, 0xe1, + 0xc1, 0x08, 0x05, 0xc7, 0xc1, 0x0e, 0x0f, 0xcf, 0x40, 0xc3, 0x17, 0x28, + 0x01, 0x2e, 0x49, 0xd1, 0x55, 0x74, 0x0f, 0x9d, 0x19, 0xd7, 0x2a, 0x3d, + 0x0f, 0x9b, 0x28, 0xc7, 0xc7, 0x89, 0x0f, 0xae, 0x21, 0xc6, 0x9e, 0xf4, + 0x0f, 0xa6, 0x09, 0xc9, 0x1b, 0x0a, 0x00, 0x00, 0xe0, 0xc9, 0xae, 0x58, + 0x0f, 0xa7, 0xe9, 0xc6, 0xd0, 0x25, 0x0f, 0x9c, 0xf0, 0xc6, 0xb7, 0xec, + 0x0f, 0xd4, 0xb1, 0xc5, 0x62, 0xce, 0x0f, 0x9c, 0xb0, 0x14, 0xc1, 0x08, + 0x11, 0x16, 0xc1, 0x08, 0x1d, 0x10, 0xc1, 0x08, 0x3b, 0x06, 0xc1, 0x08, + 0x54, 0x15, 0xc1, 0x08, 0x68, 0x04, 0xc1, 0x08, 0x7e, 0x0a, 0xc1, 0x08, + 0x88, 0x03, 0xc1, 0x08, 0x92, 0xc2, 0x01, 0x4a, 0x0b, 0x7a, 0x11, 0x1c, + 0xc1, 0x08, 0x9c, 0x43, 0x70, 0x51, 0xc1, 0x08, 0xae, 0x09, 0xc1, 0x08, + 0xca, 0xc2, 0x8d, 0x8f, 0x0b, 0x79, 0x39, 0x13, 0xc1, 0x08, 0xd2, 0xc2, + 0x02, 0x2b, 0x0b, 0x78, 0xf1, 0x0e, 0xc1, 0x08, 0xdc, 0x18, 0xc1, 0x08, + 0xea, 0xc2, 0x00, 0x87, 0x0b, 0x78, 0x39, 0x0f, 0xc1, 0x08, 0xf4, 0x12, + 0x41, 0x08, 
0xfe, 0xc5, 0x05, 0x02, 0x0b, 0x7c, 0x91, 0xc5, 0x00, 0xd4, + 0x0b, 0x7c, 0x89, 0xc9, 0x63, 0x69, 0x0b, 0x7c, 0x81, 0xc5, 0x00, 0x2c, + 0x0b, 0x7c, 0x78, 0x97, 0x0b, 0x7b, 0x53, 0x01, 0x09, 0x08, 0x8b, 0x0b, + 0x7b, 0x0b, 0x01, 0x09, 0x29, 0x87, 0x0b, 0x7a, 0xeb, 0x01, 0x09, 0x4d, + 0xc2, 0x00, 0x18, 0x0b, 0x7c, 0x19, 0x91, 0x0b, 0x7a, 0xcb, 0x01, 0x09, + 0x63, 0x9b, 0x0b, 0x7b, 0x8b, 0x01, 0x09, 0x73, 0x90, 0x0b, 0x7b, 0xeb, + 0x01, 0x09, 0x7d, 0x83, 0x0b, 0x7a, 0xa3, 0x01, 0x09, 0x81, 0xca, 0x9d, + 0x7e, 0x0b, 0x7b, 0xc3, 0x01, 0x09, 0xa1, 0x99, 0x0b, 0x7a, 0xe2, 0x01, + 0x09, 0xa5, 0x49, 0xaa, 0xc2, 0xc1, 0x09, 0xa9, 0xca, 0xa1, 0x84, 0x0b, + 0x7a, 0x89, 0xd6, 0x2b, 0xec, 0x0b, 0x7a, 0x78, 0xcb, 0x95, 0xc4, 0x01, + 0x22, 0x49, 0xcc, 0x8a, 0xbd, 0x01, 0x22, 0x40, 0xc5, 0xbc, 0xed, 0x0f, + 0xa9, 0x61, 0xc5, 0x36, 0xb7, 0x0f, 0x9d, 0x21, 0xc5, 0x00, 0xb9, 0x00, + 0x05, 0xa9, 0xc2, 0x00, 0x51, 0x0f, 0xcd, 0x00, 0xc3, 0x02, 0xa3, 0x00, + 0x05, 0xb9, 0xe0, 0x06, 0x67, 0x0f, 0xde, 0x10, 0x00, 0xc1, 0x09, 0xb5, + 0xcd, 0x79, 0x4e, 0x01, 0x10, 0x98, 0xc4, 0xd1, 0x89, 0x0f, 0xae, 0xa9, + 0xc4, 0x5c, 0x58, 0x0f, 0xa5, 0xe9, 0xc3, 0x22, 0xd3, 0x0f, 0xb4, 0x80, + 0x43, 0x01, 0xdf, 0xc1, 0x09, 0xc4, 0x45, 0xdc, 0xdb, 0x41, 0x0a, 0x00, + 0xce, 0x72, 0x10, 0x0b, 0x74, 0xd1, 0x15, 0xc1, 0x0a, 0x12, 0xc9, 0x11, + 0xf6, 0x0b, 0x74, 0xc1, 0x05, 0xc1, 0x0a, 0x1e, 0x46, 0x09, 0x97, 0xc1, + 0x0a, 0x2a, 0x47, 0x34, 0x2f, 0x41, 0x0a, 0x51, 0xc9, 0xaf, 0x93, 0x01, + 0x1e, 0xc9, 0x16, 0xc1, 0x0a, 0x67, 0x4a, 0xa4, 0x0e, 0xc1, 0x0a, 0x79, + 0xcf, 0x67, 0xa1, 0x01, 0x1e, 0x99, 0xc5, 0x1d, 0x88, 0x01, 0x1e, 0x88, + 0x4a, 0x9d, 0xf6, 0xc1, 0x0a, 0x85, 0x46, 0x09, 0x97, 0xc1, 0x0a, 0x8d, + 0x51, 0x51, 0x89, 0x41, 0x0a, 0xab, 0x48, 0xbc, 0x6a, 0xc1, 0x0a, 0xbb, + 0x4d, 0x75, 0xcd, 0x41, 0x0a, 0xcb, 0xc2, 0x07, 0xb8, 0x01, 0x12, 0xf1, + 0xc5, 0x01, 0x95, 0x01, 0x11, 0x0b, 0x01, 0x0a, 0xda, 0xd4, 0x3f, 0x34, + 0x01, 0x4c, 0xe8, 0xc4, 0x15, 0xe7, 0x05, 0x5f, 0x81, 0xc4, 0x26, 0x78, + 0x05, 0x5f, 0xc9, 0xc3, 0x05, 0x14, 0x05, 0x5f, 0x89, 0x16, 0xc1, 0x0a, + 0xde, 0x08, 0xc1, 0x0a, 0xea, 0x15, 0xc1, 0x0a, 0xf6, 0xc5, 0x06, 0xdb, + 0x05, 0x5f, 0xc0, 0xc8, 0xbf, 0xd2, 0x05, 0x5f, 0x69, 0xc3, 0x7c, 0x50, + 0x05, 0x57, 0x91, 0xcb, 0x8e, 0x6b, 0x05, 0x57, 0x88, 0x4a, 0x6f, 0xc8, + 0xc1, 0x0b, 0x02, 0xc5, 0x1e, 0x96, 0x05, 0x57, 0xb0, 0x46, 0x02, 0x0f, + 0xc1, 0x0b, 0x32, 0xc7, 0xc2, 0xc7, 0x05, 0x5f, 0x60, 0xc2, 0x00, 0xd1, + 0x05, 0x57, 0x81, 0xc2, 0x06, 0xdb, 0x05, 0x5f, 0x58, 0x00, 0xc1, 0x0b, + 0xa1, 0xc3, 0x1a, 0xd2, 0x0f, 0xb7, 0x19, 0xcf, 0x68, 0xaf, 0x0f, 0xcd, + 0xe0, 0xc3, 0x03, 0x0c, 0x01, 0x37, 0x83, 0x01, 0x0b, 0xad, 0xc5, 0xd7, + 0x86, 0x0f, 0xaf, 0xd8, 0x00, 0x41, 0x0b, 0xb1, 0x49, 0x89, 0xf4, 0xc1, + 0x0b, 0xbd, 0xcd, 0x78, 0x98, 0x01, 0x1c, 0x69, 0xc4, 0x47, 0x02, 0x0f, + 0xb4, 0xe8, 0x16, 0xc1, 0x0b, 0xc7, 0x15, 0xc1, 0x0b, 0xd9, 0xce, 0x6c, + 0x1a, 0x08, 0xb3, 0x3b, 0x01, 0x0b, 0xe8, 0xcd, 0x76, 0x69, 0x08, 0xb3, + 0x0b, 0x01, 0x0b, 0xee, 0xc5, 0x01, 0x2d, 0x00, 0xc0, 0x03, 0x01, 0x0b, + 0xf4, 0x06, 0xc1, 0x0b, 0xfa, 0x47, 0x02, 0x0e, 0xc1, 0x0c, 0x06, 0x08, + 0xc1, 0x0c, 0x91, 0xcf, 0x69, 0xbd, 0x00, 0xc0, 0x71, 0xc6, 0xcd, 0xc7, + 0x00, 0xc0, 0x51, 0x47, 0xc2, 0x57, 0xc1, 0x0c, 0xa3, 0x42, 0x00, 0x99, + 0xc1, 0x0c, 0xaf, 0xc8, 0x22, 0x83, 0x00, 0xc0, 0x08, 0x00, 0xc1, 0x0c, + 0xbb, 0xcb, 0x5c, 0x17, 0x0f, 0xc8, 0x88, 0xc5, 0x11, 0x55, 0x0f, 0xa1, + 0xa8, 0x00, 0xc1, 0x0c, 0xc7, 0x45, 0x02, 0x09, 0x41, 0x0c, 0xe3, 0xc2, + 0x00, 0x96, 0x01, 0x15, 0x39, 0xcd, 0x7c, 0xcf, 0x0f, 0xc9, 0xd8, 0xd0, + 0x57, 0xf2, 
0x0f, 0x9c, 0x89, 0xc4, 0x2a, 0x3e, 0x0f, 0xcb, 0x70, 0xc3, + 0x79, 0x83, 0x0f, 0xa7, 0xa1, 0xdd, 0x10, 0x4c, 0x0f, 0xa7, 0x90, 0x47, + 0xc7, 0x4a, 0xc1, 0x0c, 0xef, 0x45, 0x58, 0xc2, 0xc1, 0x0d, 0x1d, 0x4a, + 0xa3, 0x0a, 0xc1, 0x0d, 0x5b, 0x15, 0xc1, 0x0d, 0x6d, 0x4e, 0x73, 0x52, + 0xc1, 0x0d, 0x79, 0x08, 0xc1, 0x0d, 0x8b, 0x42, 0x00, 0x2c, 0xc1, 0x0d, + 0x97, 0x45, 0x00, 0x49, 0x41, 0x0d, 0xa3, 0xc4, 0x14, 0x09, 0x0e, 0x97, + 0x98, 0xc4, 0x00, 0x2d, 0x0e, 0x97, 0x43, 0x01, 0x0d, 0xbb, 0xc5, 0x66, + 0xb1, 0x0e, 0x97, 0x58, 0xc4, 0x18, 0x10, 0x0e, 0x97, 0x3b, 0x01, 0x0d, + 0xc1, 0xc2, 0x22, 0xcc, 0x0e, 0x97, 0x32, 0x01, 0x0d, 0xc7, 0x0b, 0xc1, + 0x0d, 0xcd, 0xc3, 0x09, 0x9e, 0x0e, 0x97, 0x22, 0x01, 0x0d, 0xd9, 0x0a, + 0xc1, 0x0d, 0xdf, 0x19, 0xc1, 0x0d, 0xeb, 0xc2, 0x00, 0xc4, 0x0e, 0x97, + 0x50, 0x91, 0x08, 0xf7, 0xb1, 0x87, 0x08, 0xf7, 0xa9, 0x97, 0x08, 0xf7, + 0xa1, 0x8b, 0x08, 0xf7, 0x98, 0x83, 0x08, 0xf7, 0x89, 0xc2, 0x0d, 0xf6, + 0x08, 0xf7, 0x81, 0xc2, 0x02, 0x41, 0x08, 0xf7, 0x79, 0xc2, 0x00, 0xdb, + 0x08, 0xf7, 0x71, 0xc2, 0x00, 0x39, 0x08, 0xf7, 0x69, 0xc2, 0x19, 0x2c, + 0x08, 0xf7, 0x61, 0x10, 0xc1, 0x0d, 0xf5, 0xc2, 0x25, 0x3b, 0x08, 0xf7, + 0x51, 0xc2, 0x00, 0x64, 0x08, 0xf7, 0x49, 0xc2, 0x0e, 0x9a, 0x08, 0xf7, + 0x39, 0xc2, 0x01, 0x6f, 0x08, 0xf7, 0x31, 0xc2, 0x01, 0xc3, 0x08, 0xf7, + 0x29, 0xc2, 0x01, 0x5d, 0x08, 0xf7, 0x21, 0xc2, 0x00, 0xb0, 0x08, 0xf7, + 0x19, 0xc2, 0x01, 0x30, 0x08, 0xf7, 0x09, 0xc2, 0x02, 0x2b, 0x08, 0xf7, + 0x00, 0x46, 0x09, 0x97, 0xc1, 0x0e, 0x05, 0x14, 0xc1, 0x0e, 0x29, 0x18, + 0xc1, 0x0e, 0x35, 0x45, 0x00, 0xba, 0xc1, 0x0e, 0x41, 0x47, 0x02, 0x0e, + 0x41, 0x0e, 0x5f, 0x15, 0xc1, 0x0e, 0xc6, 0x4b, 0x6f, 0xc7, 0xc1, 0x0e, + 0xd2, 0x47, 0x02, 0x0e, 0xc1, 0x0e, 0xe8, 0xc9, 0xaa, 0xa7, 0x08, 0xe3, + 0x89, 0xc9, 0x15, 0xcc, 0x08, 0xe3, 0x80, 0x4c, 0x37, 0x33, 0xc1, 0x0f, + 0x48, 0xcf, 0x20, 0xfc, 0x01, 0x35, 0x29, 0xc4, 0x00, 0xba, 0x01, 0x32, + 0x10, 0x45, 0x00, 0xba, 0xc1, 0x0f, 0x54, 0x47, 0x02, 0x0e, 0xc1, 0x0f, + 0x66, 0x4b, 0x6f, 0xc7, 0xc1, 0x0f, 0xcf, 0xce, 0x73, 0x0c, 0x00, 0x6a, + 0xb9, 0x49, 0x53, 0xa9, 0xc1, 0x0f, 0xf5, 0x06, 0xc1, 0x10, 0x01, 0x47, + 0x34, 0x2f, 0x41, 0x10, 0x0d, 0x4c, 0x11, 0xe2, 0xc1, 0x10, 0x19, 0x47, + 0x34, 0x2f, 0xc1, 0x10, 0x37, 0x52, 0x48, 0x11, 0xc1, 0x10, 0x4a, 0x47, + 0x02, 0x0e, 0xc1, 0x10, 0x56, 0xc7, 0xc3, 0xae, 0x08, 0x56, 0x40, 0xc7, + 0xc3, 0xe6, 0x0f, 0xab, 0xd1, 0x43, 0x03, 0x35, 0xc1, 0x10, 0xbb, 0x45, + 0x00, 0x8c, 0xc1, 0x10, 0xc7, 0xd7, 0x29, 0xf8, 0x0f, 0xa3, 0x58, 0xcb, + 0x05, 0x1c, 0x00, 0x42, 0xf1, 0xcf, 0x63, 0xff, 0x00, 0x42, 0xd9, 0xd1, + 0x4e, 0xbf, 0x00, 0x42, 0xd1, 0xd0, 0x58, 0x32, 0x00, 0x42, 0xc9, 0x47, + 0x02, 0x0e, 0x41, 0x10, 0xd3, 0x0e, 0xc1, 0x10, 0xf3, 0x15, 0xc1, 0x10, + 0xff, 0xd1, 0x50, 0xce, 0x08, 0x8b, 0xa0, 0xc5, 0x8d, 0x1c, 0x0f, 0x81, + 0x51, 0x19, 0xc1, 0x11, 0x0b, 0x07, 0xc1, 0x11, 0x1d, 0x15, 0xc1, 0x11, + 0x29, 0x10, 0xc1, 0x11, 0x47, 0xca, 0xa0, 0x9e, 0x0f, 0x80, 0x21, 0xcc, + 0x87, 0xe1, 0x0f, 0x80, 0x29, 0x11, 0xc1, 0x11, 0x53, 0x16, 0xc1, 0x11, + 0x5f, 0x08, 0xc1, 0x11, 0x6b, 0xc4, 0xe3, 0xc7, 0x0f, 0x81, 0x11, 0xcd, + 0x78, 0x8b, 0x0f, 0x81, 0x29, 0x42, 0x01, 0x5d, 0xc1, 0x11, 0x77, 0xc6, + 0xce, 0x39, 0x0f, 0x81, 0x40, 0x43, 0x00, 0xe5, 0xc1, 0x11, 0x83, 0x00, + 0x41, 0x11, 0x96, 0x42, 0x0b, 0x26, 0xc1, 0x11, 0xa8, 0xc3, 0x64, 0xae, + 0x01, 0x15, 0xc1, 0xc3, 0x0e, 0xa7, 0x01, 0x14, 0x62, 0x01, 0x11, 0xb4, + 0xcc, 0x45, 0x8d, 0x08, 0x95, 0x49, 0x47, 0x02, 0x0e, 0x41, 0x11, 0xb8, + 0xc4, 0x26, 0x78, 0x0b, 0x53, 0x49, 0xc5, 0x06, 0xdb, 0x0b, 0x53, 0x41, + 0x15, 0xc1, 
0x12, 0x14, 0x08, 0xc1, 0x12, 0x20, 0x16, 0xc1, 0x12, 0x2c, + 0xc3, 0x05, 0x14, 0x0b, 0x53, 0x09, 0xc4, 0x15, 0xe7, 0x0b, 0x53, 0x00, + 0xc2, 0x13, 0x4c, 0x0b, 0x52, 0xf1, 0xc3, 0x01, 0x9b, 0x0b, 0x52, 0xa9, + 0x83, 0x0b, 0x52, 0x00, 0x8b, 0x0b, 0x52, 0xe9, 0x91, 0x0b, 0x52, 0x98, + 0x8b, 0x0b, 0x52, 0xe1, 0x91, 0x0b, 0x52, 0x48, 0x90, 0x0b, 0x52, 0xd0, + 0x91, 0x0b, 0x52, 0xc9, 0xc4, 0xe2, 0x77, 0x0b, 0x52, 0x61, 0xc3, 0x4d, + 0xe7, 0x0b, 0x52, 0x40, 0x83, 0x0b, 0x52, 0xb0, 0x91, 0x0b, 0x52, 0x89, + 0x8e, 0x0b, 0x52, 0x68, 0x83, 0x0b, 0x52, 0x81, 0xc2, 0x00, 0x0a, 0x0b, + 0x52, 0x38, 0xc2, 0x00, 0x74, 0x0b, 0x52, 0x79, 0xc2, 0x04, 0x2b, 0x0b, + 0x52, 0x08, 0xc3, 0x7c, 0x57, 0x0b, 0x52, 0x71, 0xc2, 0x03, 0x4e, 0x0b, + 0x52, 0x18, 0x8b, 0x0b, 0x52, 0x50, 0x4f, 0x68, 0x91, 0xc1, 0x12, 0x38, + 0xce, 0x6c, 0xc2, 0x05, 0x53, 0xd9, 0x15, 0xc1, 0x12, 0x40, 0x03, 0xc1, + 0x12, 0x4c, 0xc9, 0x0e, 0x6e, 0x00, 0x81, 0xb9, 0x42, 0x07, 0xb2, 0xc1, + 0x12, 0x58, 0xce, 0x70, 0xb2, 0x00, 0x82, 0x51, 0x57, 0x28, 0x9f, 0xc1, + 0x12, 0x64, 0xd4, 0x38, 0x7c, 0x00, 0x84, 0x79, 0x4c, 0x8c, 0x31, 0x41, + 0x12, 0x78, 0x03, 0xc1, 0x12, 0x80, 0xc8, 0xbb, 0xd2, 0x00, 0x82, 0x61, + 0xc9, 0xb4, 0xe2, 0x00, 0x82, 0x69, 0xc8, 0xbf, 0x5a, 0x00, 0x82, 0x79, + 0x45, 0x4d, 0x21, 0x41, 0x12, 0x8c, 0xc4, 0x15, 0xe7, 0x00, 0x84, 0x81, + 0xc3, 0x05, 0x14, 0x00, 0x84, 0x89, 0x16, 0xc1, 0x12, 0x98, 0x08, 0xc1, + 0x12, 0xa4, 0x15, 0xc1, 0x12, 0xb0, 0xc5, 0x06, 0xdb, 0x00, 0x84, 0xc1, + 0xc4, 0x26, 0x78, 0x00, 0x84, 0xc8, 0x83, 0x00, 0x81, 0x0b, 0x01, 0x12, + 0xbc, 0x0d, 0xc1, 0x12, 0xc6, 0x16, 0xc1, 0x12, 0xd3, 0x15, 0xc1, 0x12, + 0xe4, 0x09, 0xc1, 0x12, 0xf8, 0x10, 0xc1, 0x13, 0x08, 0x05, 0xc1, 0x13, + 0x1c, 0x0c, 0xc1, 0x13, 0x26, 0x06, 0xc1, 0x13, 0x30, 0x12, 0xc1, 0x13, + 0x3e, 0x04, 0xc1, 0x13, 0x48, 0x0f, 0xc1, 0x13, 0x52, 0xc2, 0x19, 0x2c, + 0x00, 0x80, 0xd1, 0x14, 0xc1, 0x13, 0x5c, 0x0e, 0xc1, 0x13, 0x66, 0x19, + 0xc1, 0x13, 0x70, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0xf9, 0x8b, 0x00, 0x81, + 0x1b, 0x01, 0x13, 0x7a, 0x97, 0x00, 0x81, 0x2b, 0x01, 0x13, 0x7e, 0x87, + 0x00, 0x81, 0x3b, 0x01, 0x13, 0x82, 0x91, 0x00, 0x81, 0x49, 0x48, 0xb2, + 0x2d, 0x41, 0x13, 0x88, 0xc2, 0x02, 0x2e, 0x05, 0x53, 0xb1, 0xc2, 0xc8, + 0xd4, 0x05, 0x53, 0xa9, 0xc3, 0xe6, 0x17, 0x05, 0x53, 0xa0, 0xc4, 0x26, + 0x78, 0x05, 0x4f, 0xc9, 0xc5, 0x06, 0xdb, 0x05, 0x4f, 0xc1, 0x15, 0xc1, + 0x13, 0x96, 0x08, 0xc1, 0x13, 0xa2, 0x16, 0xc1, 0x13, 0xae, 0xc3, 0x05, + 0x14, 0x05, 0x4f, 0x89, 0xc4, 0x15, 0xe7, 0x05, 0x4f, 0x80, 0xc5, 0xd6, + 0x73, 0x00, 0x83, 0x19, 0xc6, 0xce, 0x57, 0x00, 0x83, 0x20, 0x83, 0x00, + 0x81, 0x61, 0x8b, 0x00, 0x81, 0x92, 0x01, 0x13, 0xba, 0x8b, 0x00, 0x81, + 0x70, 0x97, 0x00, 0x81, 0x80, 0xc6, 0x00, 0xd3, 0x00, 0x81, 0xa8, 0xc2, + 0x25, 0x9f, 0x00, 0x81, 0x99, 0x91, 0x00, 0x81, 0xa0, 0x94, 0x00, 0x82, + 0xb3, 0x01, 0x13, 0xc3, 0x8e, 0x00, 0x82, 0xc2, 0x01, 0x13, 0xc7, 0xcc, + 0x85, 0xad, 0x00, 0x83, 0x11, 0x44, 0x00, 0xd0, 0x41, 0x13, 0xcb, 0xc2, + 0x2c, 0x43, 0x00, 0x83, 0x39, 0xc2, 0x0f, 0xe1, 0x00, 0x83, 0x40, 0xc2, + 0x49, 0x0c, 0x00, 0x83, 0x91, 0x97, 0x00, 0x83, 0x99, 0xc2, 0x02, 0xe0, + 0x00, 0x83, 0xa0, 0x46, 0x30, 0xa0, 0xc1, 0x13, 0xde, 0x4a, 0xa6, 0x0c, + 0x41, 0x13, 0xf6, 0xc2, 0x02, 0xa0, 0x00, 0x82, 0x11, 0xc4, 0x02, 0xde, + 0x00, 0x82, 0x18, 0xc3, 0x09, 0x9e, 0x00, 0x82, 0x21, 0xc3, 0x0d, 0x14, + 0x00, 0x82, 0x28, 0xc2, 0x22, 0xcc, 0x00, 0x82, 0x31, 0xc4, 0x18, 0x10, + 0x00, 0x82, 0x38, 0xca, 0x9f, 0xfe, 0x0f, 0xad, 0x30, 0x47, 0x02, 0x0e, + 0xc1, 0x14, 0x08, 0xca, 0x3b, 0x06, 0x01, 0x87, 0xd9, 0xce, 0x1c, 0x92, + 0x01, 0x87, 
0xe9, 0xd5, 0x34, 0xb8, 0x01, 0x87, 0xf1, 0xcc, 0x80, 0xfd, + 0x01, 0x87, 0xf8, 0xd1, 0x2f, 0xfb, 0x01, 0x84, 0xd9, 0xd6, 0x2f, 0xf6, + 0x01, 0x84, 0xe1, 0xcd, 0x77, 0x87, 0x01, 0x85, 0x01, 0xd4, 0x0d, 0xe2, + 0x01, 0x87, 0xe0, 0xc6, 0x00, 0xd3, 0x08, 0x86, 0x68, 0xc9, 0xb2, 0x2d, + 0x08, 0x86, 0x11, 0x03, 0xc1, 0x14, 0x5e, 0x91, 0x08, 0x85, 0xb9, 0x87, + 0x08, 0x85, 0xa9, 0x97, 0x08, 0x85, 0x9b, 0x01, 0x14, 0x6a, 0x8b, 0x08, + 0x85, 0x8a, 0x01, 0x14, 0x6e, 0x46, 0x00, 0x59, 0xc1, 0x14, 0x72, 0xc4, + 0x19, 0x53, 0x08, 0x86, 0x00, 0xcb, 0x45, 0x8e, 0x08, 0x85, 0xf1, 0x44, + 0x00, 0xbb, 0x41, 0x14, 0x7e, 0xc2, 0x00, 0xd0, 0x08, 0x85, 0x79, 0x15, + 0xc1, 0x14, 0x96, 0xc2, 0x02, 0x41, 0x08, 0x85, 0x59, 0xc2, 0x00, 0xdb, + 0x08, 0x85, 0x51, 0x14, 0xc1, 0x14, 0xa6, 0xc2, 0x19, 0x2c, 0x08, 0x85, + 0x41, 0xc2, 0x01, 0xc3, 0x08, 0x85, 0x39, 0x04, 0xc1, 0x14, 0xb0, 0x12, + 0xc1, 0x14, 0xba, 0x10, 0xc1, 0x14, 0xc4, 0x06, 0xc1, 0x14, 0xda, 0x16, + 0xc1, 0x14, 0xe8, 0x0c, 0xc1, 0x14, 0xf6, 0x05, 0xc1, 0x15, 0x00, 0x09, + 0xc1, 0x15, 0x0a, 0x0d, 0xc1, 0x15, 0x14, 0x83, 0x08, 0x84, 0x1b, 0x01, + 0x15, 0x1e, 0x91, 0x08, 0x84, 0x59, 0x87, 0x08, 0x84, 0x49, 0x97, 0x08, + 0x84, 0x3b, 0x01, 0x15, 0x2a, 0x8b, 0x08, 0x84, 0x2a, 0x01, 0x15, 0x2e, + 0xc4, 0xde, 0x93, 0x05, 0x49, 0x79, 0xc3, 0xe4, 0xfd, 0x05, 0x49, 0x70, + 0xc5, 0xde, 0x02, 0x05, 0x49, 0x63, 0x01, 0x15, 0x32, 0xc6, 0xca, 0x77, + 0x05, 0x49, 0x58, 0x91, 0x05, 0x49, 0x51, 0x87, 0x05, 0x49, 0x3b, 0x01, + 0x15, 0x38, 0x97, 0x05, 0x49, 0x42, 0x01, 0x15, 0x3c, 0x11, 0xc1, 0x15, + 0x40, 0x8b, 0x05, 0x49, 0x21, 0x83, 0x05, 0x49, 0x11, 0xc2, 0x00, 0x64, + 0x05, 0x49, 0x09, 0xc2, 0x02, 0x41, 0x05, 0x49, 0x01, 0x0a, 0xc1, 0x15, + 0x48, 0x16, 0xc1, 0x15, 0x52, 0xc2, 0x01, 0x4a, 0x05, 0x48, 0xe9, 0xc2, + 0x00, 0xdb, 0x05, 0x48, 0xe1, 0xc2, 0x19, 0x2c, 0x05, 0x48, 0xd9, 0xc2, + 0x00, 0x39, 0x05, 0x48, 0xd1, 0xc2, 0x01, 0x5d, 0x05, 0x48, 0xc9, 0xc2, + 0x0e, 0x9a, 0x05, 0x48, 0xc1, 0xc2, 0x01, 0xc3, 0x05, 0x48, 0xb9, 0x12, + 0xc1, 0x15, 0x5c, 0x10, 0xc1, 0x15, 0x66, 0xc2, 0x02, 0x1c, 0x05, 0x48, + 0x81, 0x15, 0xc1, 0x15, 0x76, 0xc2, 0x01, 0x30, 0x05, 0x48, 0x61, 0x0d, + 0x41, 0x15, 0x80, 0xc4, 0x26, 0x78, 0x05, 0x48, 0x49, 0xc5, 0x06, 0xdb, + 0x05, 0x48, 0x41, 0x15, 0xc1, 0x15, 0x8a, 0x08, 0xc1, 0x15, 0x96, 0x16, + 0xc1, 0x15, 0xa2, 0xc3, 0x05, 0x14, 0x05, 0x48, 0x09, 0xc4, 0x15, 0xe7, + 0x05, 0x48, 0x00, 0x45, 0x00, 0xba, 0xc1, 0x15, 0xae, 0x42, 0x00, 0x49, + 0xc1, 0x15, 0xd4, 0x4b, 0x6f, 0xc7, 0xc1, 0x15, 0xe0, 0xce, 0x74, 0xcc, + 0x00, 0x66, 0xb1, 0x46, 0x09, 0x97, 0x41, 0x16, 0x06, 0xc4, 0xe1, 0x83, + 0x0f, 0xcc, 0xc1, 0x4b, 0x91, 0xfc, 0x41, 0x16, 0x2a, 0x05, 0xc1, 0x16, + 0x8e, 0x04, 0x41, 0x16, 0xc6, 0xc4, 0x26, 0x78, 0x08, 0x97, 0xc9, 0x15, + 0xc1, 0x17, 0x06, 0x08, 0xc1, 0x17, 0x12, 0x16, 0xc1, 0x17, 0x1e, 0xc3, + 0x05, 0x14, 0x08, 0x97, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0x97, 0x81, 0xc5, + 0x06, 0xdb, 0x08, 0x97, 0xc0, 0xc6, 0x1e, 0x95, 0x08, 0x97, 0x51, 0xc5, + 0x33, 0x5d, 0x08, 0x97, 0x49, 0xc8, 0x14, 0x38, 0x08, 0x96, 0xf8, 0x91, + 0x08, 0x97, 0x39, 0x03, 0xc1, 0x17, 0x2a, 0x87, 0x08, 0x97, 0x29, 0x97, + 0x08, 0x97, 0x1b, 0x01, 0x17, 0x36, 0x8b, 0x08, 0x97, 0x0a, 0x01, 0x17, + 0x3a, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0xf1, 0x15, 0xc1, 0x17, 0x3e, 0xc2, + 0x02, 0x41, 0x08, 0x96, 0xd9, 0xc2, 0x00, 0xdb, 0x08, 0x96, 0xd1, 0x14, + 0xc1, 0x17, 0x48, 0xc2, 0x19, 0x2c, 0x08, 0x96, 0xc1, 0xc2, 0x01, 0xc3, + 0x08, 0x96, 0xb9, 0x04, 0xc1, 0x17, 0x52, 0x12, 0xc1, 0x17, 0x62, 0x10, + 0xc1, 0x17, 0x6c, 0x06, 0xc1, 0x17, 0x82, 0x16, 0xc1, 0x17, 0x90, 0x0c, + 0xc1, 0x17, 
0x9e, 0x05, 0xc1, 0x17, 0xae, 0x09, 0xc1, 0x17, 0xb8, 0x0d, + 0xc1, 0x17, 0xc8, 0x83, 0x08, 0x95, 0x83, 0x01, 0x17, 0xd2, 0x91, 0x08, + 0x95, 0xc1, 0x87, 0x08, 0x95, 0xb1, 0x97, 0x08, 0x95, 0xa3, 0x01, 0x17, + 0xde, 0x8b, 0x08, 0x95, 0x92, 0x01, 0x17, 0xe2, 0x44, 0x00, 0xbb, 0xc1, + 0x17, 0xe6, 0xcb, 0x45, 0x8e, 0x08, 0x91, 0xd8, 0x46, 0x00, 0x59, 0xc1, + 0x17, 0xfc, 0xc4, 0x19, 0x53, 0x08, 0x91, 0xc0, 0x03, 0xc1, 0x18, 0x08, + 0x91, 0x08, 0x91, 0x91, 0x87, 0x08, 0x91, 0x81, 0x97, 0x08, 0x91, 0x79, + 0x8b, 0x08, 0x91, 0x6a, 0x01, 0x18, 0x14, 0x0e, 0xc1, 0x18, 0x18, 0xc2, + 0x00, 0xd0, 0x08, 0x91, 0x51, 0xc2, 0x0d, 0xf6, 0x08, 0x91, 0x49, 0xc2, + 0x02, 0x41, 0x08, 0x91, 0x41, 0xc2, 0x00, 0x39, 0x08, 0x91, 0x31, 0xc2, + 0x19, 0x2c, 0x08, 0x91, 0x29, 0xc2, 0x01, 0xc3, 0x08, 0x91, 0x21, 0x04, + 0xc1, 0x18, 0x22, 0x12, 0xc1, 0x18, 0x32, 0x10, 0xc1, 0x18, 0x3c, 0x06, + 0xc1, 0x18, 0x52, 0x16, 0xc1, 0x18, 0x60, 0x0c, 0xc1, 0x18, 0x6e, 0x05, + 0xc1, 0x18, 0x78, 0x09, 0xc1, 0x18, 0x82, 0x0d, 0xc1, 0x18, 0x92, 0x83, + 0x08, 0x90, 0x03, 0x01, 0x18, 0x9c, 0x91, 0x08, 0x90, 0x31, 0x87, 0x08, + 0x90, 0x21, 0x97, 0x08, 0x90, 0x19, 0x8b, 0x08, 0x90, 0x10, 0x46, 0x10, + 0x79, 0xc1, 0x18, 0xa8, 0x44, 0x00, 0xbb, 0x41, 0x18, 0xc8, 0xc4, 0x26, + 0x78, 0x00, 0xbf, 0x49, 0xc5, 0x06, 0xdb, 0x00, 0xbf, 0x41, 0x15, 0xc1, + 0x19, 0x0a, 0x08, 0xc1, 0x19, 0x16, 0x16, 0xc1, 0x19, 0x22, 0xc3, 0x05, + 0x14, 0x00, 0xbf, 0x09, 0xc4, 0x15, 0xe7, 0x00, 0xbf, 0x00, 0x45, 0x00, + 0xba, 0xc1, 0x19, 0x2e, 0x4a, 0x9f, 0xf4, 0x41, 0x19, 0x4f, 0x13, 0xc1, + 0x19, 0x57, 0xc2, 0x00, 0x35, 0x00, 0xbd, 0x6b, 0x01, 0x19, 0x73, 0xc2, + 0x14, 0x98, 0x00, 0xbd, 0x5a, 0x01, 0x19, 0x77, 0xc2, 0x0f, 0x9a, 0x00, + 0xbd, 0x11, 0x0e, 0xc1, 0x19, 0x7b, 0xc2, 0x00, 0xd0, 0x00, 0xbd, 0x01, + 0x15, 0xc1, 0x19, 0x83, 0xc2, 0x17, 0xbd, 0x00, 0xbc, 0xe1, 0xc2, 0x00, + 0x79, 0x00, 0xbc, 0xd1, 0xc2, 0x42, 0xcd, 0x00, 0xbc, 0xc9, 0xc2, 0x00, + 0xa2, 0x00, 0xbc, 0xc1, 0x12, 0xc1, 0x19, 0x93, 0xc2, 0x01, 0x5d, 0x00, + 0xbc, 0xa1, 0x10, 0xc1, 0x19, 0x9b, 0x16, 0xc1, 0x19, 0xb1, 0x06, 0xc1, + 0x19, 0xc3, 0x05, 0xc1, 0x19, 0xcb, 0x0d, 0x41, 0x19, 0xd7, 0x0e, 0xc1, + 0x19, 0xe3, 0x06, 0xc1, 0x19, 0xef, 0xc8, 0xb9, 0xf2, 0x08, 0x52, 0xa1, + 0x05, 0xc1, 0x19, 0xf9, 0xcc, 0x12, 0x2d, 0x08, 0x52, 0x88, 0x44, 0x05, + 0x14, 0xc1, 0x1a, 0x05, 0x16, 0x41, 0x1a, 0x11, 0xc4, 0x09, 0x9d, 0x08, + 0x52, 0x19, 0x16, 0xc1, 0x1a, 0x1d, 0xc3, 0x05, 0x14, 0x08, 0x52, 0x00, + 0xc5, 0x1e, 0x96, 0x08, 0x51, 0xf9, 0x45, 0x34, 0x70, 0x41, 0x1a, 0x29, + 0x42, 0x00, 0x58, 0xc1, 0x1a, 0x35, 0xc5, 0xdc, 0xd1, 0x08, 0x51, 0xc9, + 0xc9, 0x31, 0x98, 0x08, 0x51, 0xc1, 0xc7, 0x40, 0xe5, 0x08, 0x50, 0x79, + 0xc8, 0x14, 0x38, 0x08, 0x50, 0x70, 0x18, 0xc1, 0x1a, 0x41, 0x16, 0xc1, + 0x1a, 0x4b, 0xc2, 0x00, 0xdb, 0x08, 0x51, 0x59, 0xc2, 0x00, 0x39, 0x08, + 0x51, 0x51, 0xc2, 0x19, 0x2c, 0x08, 0x51, 0x49, 0xc2, 0x01, 0xc3, 0x08, + 0x51, 0x41, 0x04, 0xc1, 0x1a, 0x59, 0x12, 0xc1, 0x1a, 0x63, 0x10, 0xc1, + 0x1a, 0x6d, 0x06, 0xc1, 0x1a, 0x7d, 0xc2, 0x25, 0x3b, 0x08, 0x50, 0xb9, + 0x05, 0xc1, 0x1a, 0x8b, 0x09, 0xc1, 0x1a, 0x95, 0x0d, 0xc1, 0x1a, 0x9f, + 0x83, 0x08, 0x50, 0x01, 0x15, 0xc1, 0x1a, 0xaf, 0xc2, 0x02, 0x1c, 0x08, + 0x51, 0x81, 0xc2, 0x00, 0xd0, 0x08, 0x51, 0x88, 0xc4, 0x00, 0x87, 0x0f, + 0xb0, 0xbb, 0x01, 0x1a, 0xbf, 0xd9, 0x20, 0x8f, 0x0f, 0xb1, 0xe8, 0xc9, + 0xb0, 0x11, 0x0f, 0xd4, 0x31, 0xca, 0xa6, 0x3e, 0x0f, 0xd5, 0xd0, 0x46, + 0xcc, 0x4d, 0xc1, 0x1a, 0xc5, 0xc4, 0x00, 0x87, 0x0f, 0xb0, 0x80, 0x15, + 0xc1, 0x1a, 0xfc, 0x47, 0x02, 0x0e, 0xc1, 0x1b, 0x06, 0xce, 0x6c, 0x52, + 0x08, 0xa2, 
0xe9, 0xd0, 0x5f, 0x92, 0x08, 0xa2, 0xd9, 0x06, 0xc1, 0x1b, + 0x6d, 0xd1, 0x50, 0xce, 0x08, 0xa2, 0x79, 0xca, 0x93, 0x30, 0x08, 0xa2, + 0x71, 0xc5, 0x0a, 0x8a, 0x08, 0xa2, 0x69, 0xc2, 0x00, 0x7a, 0x08, 0xa2, + 0x49, 0x4b, 0x6f, 0xc7, 0x41, 0x1b, 0x7f, 0xcb, 0x99, 0xe4, 0x01, 0x05, + 0x51, 0x48, 0xb6, 0x82, 0xc1, 0x1b, 0x9f, 0x45, 0x15, 0xdb, 0xc1, 0x1b, + 0xbe, 0xc4, 0x02, 0x6d, 0x00, 0x00, 0x50, 0xc4, 0x00, 0x49, 0x01, 0x5c, + 0x91, 0xc5, 0x00, 0x2c, 0x01, 0x5c, 0x98, 0x48, 0x0b, 0x09, 0xc1, 0x1b, + 0xca, 0x48, 0x20, 0x7c, 0xc1, 0x1b, 0xfa, 0xcb, 0x49, 0x4a, 0x00, 0x00, + 0xa9, 0x49, 0x1e, 0x56, 0x41, 0x1c, 0x18, 0xe0, 0x05, 0x87, 0x01, 0x15, + 0x78, 0x43, 0x07, 0x28, 0xc1, 0x1c, 0x2a, 0x42, 0x02, 0xaf, 0x41, 0x1c, + 0x36, 0xc9, 0x00, 0xca, 0x01, 0x13, 0xc9, 0x43, 0x00, 0xe2, 0x41, 0x1c, + 0x3c, 0xcc, 0x07, 0xc7, 0x01, 0x13, 0xc1, 0x43, 0x00, 0xe2, 0x41, 0x1c, + 0x48, 0x4b, 0x6f, 0xc7, 0xc1, 0x1c, 0x54, 0xca, 0x9d, 0x56, 0x08, 0xcf, + 0x19, 0x45, 0x00, 0xba, 0xc1, 0x1c, 0x7d, 0x47, 0x02, 0x0e, 0x41, 0x1c, + 0x8d, 0x47, 0x34, 0x2f, 0xc1, 0x1c, 0xf0, 0xd5, 0x34, 0x25, 0x08, 0x45, + 0x59, 0x47, 0x02, 0x0e, 0x41, 0x1d, 0x01, 0xd4, 0x3a, 0x48, 0x0f, 0xb5, + 0x89, 0xcf, 0x67, 0x83, 0x01, 0x00, 0x88, 0x00, 0xc1, 0x1d, 0x6a, 0xd6, + 0x2e, 0x12, 0x0f, 0xb7, 0x50, 0xcc, 0x23, 0x9f, 0x01, 0x15, 0xa0, 0xe0, + 0x02, 0xc7, 0x0f, 0xaa, 0x21, 0x0e, 0xc1, 0x1d, 0x7c, 0x4b, 0x2c, 0x44, + 0x41, 0x1d, 0x88, 0xca, 0xa7, 0xc4, 0x01, 0x1b, 0xd9, 0xd2, 0x4c, 0x01, + 0x01, 0x17, 0x53, 0x01, 0x1d, 0x8e, 0x15, 0xc1, 0x1d, 0x94, 0x16, 0xc1, + 0x1d, 0xa0, 0x03, 0xc1, 0x1d, 0xac, 0xcc, 0x07, 0xc7, 0x01, 0x13, 0x79, + 0xc9, 0x00, 0xca, 0x01, 0x13, 0x71, 0x43, 0x00, 0xe2, 0xc1, 0x1d, 0xc4, + 0xcc, 0x89, 0x0d, 0x01, 0x13, 0x11, 0xcb, 0x6b, 0x83, 0x01, 0x11, 0x30, + 0x43, 0x00, 0xaf, 0xc1, 0x1d, 0xd0, 0xc4, 0xe3, 0x33, 0x0f, 0xa6, 0x9a, + 0x01, 0x1d, 0xda, 0xc5, 0x00, 0xb9, 0x0f, 0xb5, 0x58, 0xc5, 0xd5, 0x1a, + 0x0f, 0xab, 0x91, 0xca, 0xa2, 0x56, 0x0f, 0xb5, 0xb8, 0xc9, 0xa9, 0xa2, + 0x00, 0x04, 0x19, 0xc7, 0xc9, 0x50, 0x0f, 0xb5, 0x98, 0x99, 0x0f, 0x09, + 0x61, 0x87, 0x0f, 0x09, 0x53, 0x01, 0x1d, 0xe0, 0x91, 0x0f, 0x09, 0x43, + 0x01, 0x1d, 0xe4, 0x97, 0x0f, 0x09, 0x39, 0x8b, 0x0f, 0x09, 0x31, 0x83, + 0x0f, 0x09, 0x23, 0x01, 0x1d, 0xe8, 0x14, 0xc1, 0x1d, 0xec, 0xc2, 0x01, + 0x30, 0x0f, 0x09, 0x11, 0x12, 0xc1, 0x1d, 0xf6, 0x0f, 0xc1, 0x1e, 0x00, + 0xc2, 0x00, 0xd0, 0x0f, 0x08, 0x23, 0x01, 0x1e, 0x0a, 0x10, 0xc1, 0x1e, + 0x0e, 0x06, 0xc1, 0x1e, 0x38, 0x1a, 0xc1, 0x1e, 0x42, 0xc2, 0x19, 0x2c, + 0x0f, 0x08, 0xc1, 0xc2, 0x0f, 0x9a, 0x0f, 0x08, 0xb9, 0xc2, 0x00, 0x87, + 0x0f, 0x08, 0xa9, 0x16, 0xc1, 0x1e, 0x4c, 0xc2, 0x02, 0x41, 0x0f, 0x08, + 0x91, 0xc2, 0x02, 0x2b, 0x0f, 0x08, 0x71, 0xc2, 0x02, 0x1c, 0x0f, 0x08, + 0x59, 0xc2, 0x0d, 0xf6, 0x0f, 0x08, 0x51, 0xc2, 0x00, 0xdb, 0x0f, 0x08, + 0x49, 0xc2, 0x00, 0x64, 0x0f, 0x08, 0x40, 0xc4, 0x18, 0x10, 0x0f, 0x0a, + 0x39, 0xc2, 0x22, 0xcc, 0x0f, 0x0a, 0x30, 0xc3, 0x0d, 0x14, 0x0f, 0x0a, + 0x29, 0xc3, 0x09, 0x9e, 0x0f, 0x0a, 0x20, 0xc4, 0x02, 0xde, 0x0f, 0x0a, + 0x19, 0xc2, 0x02, 0xa0, 0x0f, 0x0a, 0x10, 0xc5, 0xd7, 0xdb, 0x0f, 0x09, + 0xe1, 0x44, 0x15, 0xec, 0x41, 0x1e, 0x5c, 0x1f, 0xc1, 0x1e, 0x7a, 0x1e, + 0x41, 0x1e, 0xba, 0x16, 0xc1, 0x1e, 0xde, 0xd2, 0x4b, 0x5f, 0x01, 0x24, + 0xd1, 0x07, 0xc1, 0x1e, 0xf0, 0x15, 0xc1, 0x1e, 0xfc, 0x08, 0x41, 0x1f, + 0x06, 0xc4, 0x25, 0xd5, 0x01, 0x50, 0x21, 0xc3, 0x02, 0xa3, 0x01, 0x50, + 0x18, 0xce, 0x6d, 0x24, 0x01, 0x50, 0x31, 0xd5, 0x33, 0x68, 0x01, 0x50, + 0x28, 0xce, 0x72, 0xd4, 0x01, 0x50, 0x11, 0xcd, 0x7d, 0x51, 0x01, 0x50, + 0x09, 0xcc, 
0x83, 0x3d, 0x01, 0x50, 0x00, 0xc4, 0x26, 0x78, 0x00, 0x3e, + 0x49, 0xc5, 0x06, 0xdb, 0x00, 0x3e, 0x41, 0x15, 0xc1, 0x1f, 0x12, 0x08, + 0xc1, 0x1f, 0x1e, 0x16, 0xc1, 0x1f, 0x2a, 0xc3, 0x05, 0x14, 0x00, 0x3e, + 0x09, 0xc4, 0x15, 0xe7, 0x00, 0x3e, 0x00, 0x0c, 0xc1, 0x1f, 0x36, 0x90, + 0x00, 0x3e, 0x93, 0x01, 0x1f, 0x40, 0xc2, 0x19, 0x2c, 0x00, 0x3f, 0x31, + 0xc2, 0x01, 0x4a, 0x00, 0x3f, 0x29, 0xc2, 0x00, 0xd0, 0x00, 0x3f, 0x21, + 0xc2, 0x01, 0xc3, 0x00, 0x3f, 0x09, 0xc2, 0x00, 0xdb, 0x00, 0x3e, 0xf9, + 0xc2, 0x02, 0x2b, 0x00, 0x3e, 0xf1, 0xc2, 0x00, 0x87, 0x00, 0x3e, 0xe9, + 0xc3, 0x9f, 0x2c, 0x00, 0x3e, 0xe1, 0xc2, 0x0d, 0xf6, 0x00, 0x3e, 0xd9, + 0x14, 0xc1, 0x1f, 0x50, 0xc2, 0x0e, 0x9a, 0x00, 0x3e, 0xc3, 0x01, 0x1f, + 0x5a, 0xc3, 0x1c, 0x63, 0x00, 0x3e, 0xb9, 0xc2, 0x01, 0x6f, 0x00, 0x3e, + 0xa9, 0xc2, 0x00, 0xb0, 0x00, 0x3e, 0xa1, 0xc2, 0x01, 0x5d, 0x00, 0x3e, + 0x99, 0x91, 0x00, 0x3e, 0x83, 0x01, 0x1f, 0x60, 0x97, 0x00, 0x3e, 0x71, + 0x87, 0x00, 0x3e, 0x6b, 0x01, 0x1f, 0x64, 0x8b, 0x00, 0x3e, 0x61, 0x83, + 0x00, 0x3e, 0x50, 0xd0, 0x57, 0xd2, 0x00, 0x3f, 0x99, 0xd1, 0x56, 0xc8, + 0x00, 0x3f, 0x91, 0x45, 0x2c, 0x86, 0xc1, 0x1f, 0x68, 0x46, 0x2e, 0xee, + 0x41, 0x1f, 0x80, 0xc6, 0x52, 0xa4, 0x0f, 0xd3, 0x59, 0xc5, 0xd8, 0xda, + 0x0f, 0xd3, 0x60, 0xc6, 0x52, 0xa4, 0x0f, 0xd3, 0x21, 0xc5, 0xd8, 0xda, + 0x0f, 0xd3, 0x28, 0xc8, 0xbd, 0x32, 0x0f, 0xcd, 0x81, 0xca, 0xa5, 0xd0, + 0x0f, 0xcd, 0x89, 0xc4, 0xe1, 0xeb, 0x0f, 0xcd, 0x91, 0xca, 0xa6, 0xb6, + 0x0f, 0xcd, 0x98, 0xa3, 0x0f, 0x9f, 0xf9, 0xa2, 0x0f, 0x9f, 0xf1, 0xa1, + 0x0f, 0x9f, 0xe9, 0xa0, 0x0f, 0x9f, 0xe1, 0xc3, 0xe5, 0xfc, 0x0f, 0x9f, + 0xd8, 0xc3, 0x0e, 0xa7, 0x01, 0x10, 0x2b, 0x01, 0x1f, 0x92, 0xc4, 0x9b, + 0xb8, 0x0f, 0xae, 0x63, 0x01, 0x1f, 0x98, 0xc8, 0xb9, 0xb2, 0x0f, 0xae, + 0x59, 0x10, 0x41, 0x1f, 0x9c, 0x42, 0x09, 0xda, 0x41, 0x1f, 0xab, 0x43, + 0x00, 0x55, 0xc1, 0x1f, 0xb7, 0xd0, 0x5e, 0x92, 0x0f, 0xcd, 0xd8, 0xca, + 0xa5, 0x58, 0x09, 0xa1, 0xc1, 0x1d, 0x41, 0x1f, 0xc3, 0xcc, 0x82, 0x41, + 0x09, 0xa1, 0xb9, 0x42, 0xcf, 0x41, 0x41, 0x1f, 0xd3, 0xcd, 0x76, 0x42, + 0x09, 0xa1, 0xb1, 0x1d, 0x41, 0x1f, 0xfa, 0x49, 0xaf, 0xb7, 0xc1, 0x20, + 0x12, 0x1d, 0x41, 0x20, 0x1e, 0xd0, 0x59, 0xc2, 0x09, 0xa1, 0x89, 0x42, + 0xcf, 0x41, 0x41, 0x20, 0x26, 0xce, 0x70, 0x6c, 0x09, 0xa1, 0x81, 0x1d, + 0x41, 0x20, 0x49, 0x42, 0xd1, 0x3e, 0xc1, 0x20, 0x62, 0x1d, 0x41, 0x20, + 0x72, 0x1e, 0xc1, 0x20, 0x94, 0x1d, 0x41, 0x20, 0xb6, 0xa5, 0x09, 0x9f, + 0x19, 0xa4, 0x09, 0x9f, 0x11, 0xa3, 0x09, 0x9f, 0x09, 0xa2, 0x09, 0x9f, + 0x01, 0xa1, 0x09, 0x9e, 0xf9, 0xa0, 0x09, 0x9e, 0xf1, 0x9f, 0x09, 0x9e, + 0xe9, 0x9e, 0x09, 0x9e, 0xda, 0x01, 0x20, 0xe6, 0xa5, 0x09, 0x9e, 0xcb, + 0x01, 0x20, 0xea, 0xa4, 0x09, 0x9e, 0xc1, 0xa3, 0x09, 0x9e, 0xb3, 0x01, + 0x20, 0xee, 0xa2, 0x09, 0x9e, 0xa9, 0xa1, 0x09, 0x9e, 0x93, 0x01, 0x20, + 0xf2, 0xa0, 0x09, 0x9e, 0x89, 0x9f, 0x09, 0x9e, 0x81, 0x9e, 0x09, 0x9e, + 0x78, 0x1f, 0xc1, 0x20, 0xfa, 0x1e, 0xc1, 0x21, 0x15, 0x1d, 0x41, 0x21, + 0x49, 0x21, 0xc1, 0x21, 0x73, 0x20, 0xc1, 0x21, 0x7f, 0x1f, 0xc1, 0x21, + 0xaa, 0x1e, 0xc1, 0x21, 0xd8, 0x1d, 0x41, 0x22, 0x00, 0x20, 0xc1, 0x22, + 0x27, 0x1f, 0xc1, 0x22, 0x49, 0x1e, 0xc1, 0x22, 0x71, 0x1d, 0x41, 0x22, + 0x9f, 0x21, 0xc1, 0x22, 0xcf, 0x20, 0xc1, 0x22, 0xeb, 0x1f, 0xc1, 0x23, + 0x16, 0x1e, 0xc1, 0x23, 0x41, 0x1d, 0x41, 0x23, 0x6f, 0x1f, 0xc1, 0x23, + 0x99, 0x1e, 0xc1, 0x23, 0xc1, 0x1d, 0x41, 0x23, 0xef, 0xa4, 0x09, 0x95, + 0x71, 0xa3, 0x09, 0x95, 0x69, 0xa2, 0x09, 0x95, 0x61, 0xa1, 0x09, 0x95, + 0x59, 0xa0, 0x09, 0x95, 0x51, 0x9f, 0x09, 0x95, 0x49, 0x9e, 0x09, 0x95, + 0x40, 0x1e, 
0xc1, 0x24, 0x19, 0x1d, 0x41, 0x24, 0x21, 0x42, 0xdd, 0x2f, + 0xc1, 0x24, 0x4b, 0x42, 0x8c, 0xff, 0xc1, 0x24, 0x57, 0x1d, 0x41, 0x24, + 0x65, 0x21, 0xc1, 0x24, 0x79, 0x20, 0xc1, 0x24, 0x90, 0x1f, 0xc1, 0x24, + 0xbe, 0x1e, 0xc1, 0x24, 0xef, 0x1d, 0x41, 0x25, 0x26, 0xa5, 0x09, 0x8d, + 0x61, 0xa4, 0x09, 0x8d, 0x59, 0xa3, 0x09, 0x8d, 0x4b, 0x01, 0x25, 0x50, + 0xa2, 0x09, 0x8d, 0x41, 0xa1, 0x09, 0x8d, 0x39, 0xa0, 0x09, 0x8d, 0x31, + 0x9f, 0x09, 0x8d, 0x23, 0x01, 0x25, 0x54, 0x9e, 0x09, 0x8d, 0x18, 0xa5, + 0x09, 0x8d, 0x11, 0xa4, 0x09, 0x8d, 0x09, 0xa3, 0x09, 0x8d, 0x01, 0xa2, + 0x09, 0x8c, 0xf9, 0xa1, 0x09, 0x8c, 0xf1, 0xa0, 0x09, 0x8c, 0xe9, 0x9f, + 0x09, 0x8c, 0xe1, 0x9e, 0x09, 0x8c, 0xd8, 0x22, 0xc1, 0x25, 0x58, 0x21, + 0xc1, 0x25, 0x6c, 0x20, 0xc1, 0x25, 0x9a, 0x1f, 0xc1, 0x25, 0xc8, 0x1e, + 0xc1, 0x25, 0xf6, 0x1d, 0x41, 0x26, 0x21, 0x22, 0xc1, 0x26, 0x4b, 0x21, + 0xc1, 0x26, 0x5e, 0x20, 0xc1, 0x26, 0x8f, 0x1f, 0xc1, 0x26, 0xc0, 0x1e, + 0xc1, 0x26, 0xeb, 0x1d, 0x41, 0x27, 0x16, 0x23, 0xc1, 0x27, 0x3d, 0x22, + 0xc1, 0x27, 0x60, 0x21, 0xc1, 0x27, 0x91, 0x20, 0xc1, 0x27, 0xbf, 0x1f, + 0xc1, 0x27, 0xed, 0x1e, 0xc1, 0x28, 0x18, 0x1d, 0x41, 0x28, 0x40, 0x1f, + 0xc1, 0x28, 0x67, 0x1e, 0xc1, 0x28, 0x7b, 0x1d, 0x41, 0x28, 0xa6, 0x4c, + 0x84, 0x69, 0xc1, 0x28, 0xcd, 0xd2, 0x48, 0x35, 0x0f, 0xa3, 0xe8, 0xc4, + 0x26, 0x78, 0x00, 0x37, 0xc9, 0xc5, 0x06, 0xdb, 0x00, 0x37, 0xc1, 0x15, + 0xc1, 0x28, 0xe3, 0x08, 0xc1, 0x28, 0xef, 0x16, 0xc1, 0x28, 0xfb, 0xc3, + 0x05, 0x14, 0x00, 0x37, 0x89, 0xc4, 0x15, 0xe7, 0x00, 0x37, 0x80, 0xcd, + 0x2c, 0xb2, 0x01, 0x02, 0x49, 0xc4, 0x01, 0xc3, 0x00, 0x01, 0x08, 0x09, + 0xc1, 0x29, 0x07, 0x0a, 0xc1, 0x29, 0x39, 0x04, 0xc1, 0x29, 0x5a, 0x05, + 0xc1, 0x29, 0x7f, 0x06, 0xc1, 0x29, 0xaa, 0x16, 0xc1, 0x29, 0xd5, 0x0e, + 0xc1, 0x2a, 0x0a, 0x0f, 0xc1, 0x2a, 0x2d, 0x15, 0xc1, 0x2a, 0x54, 0x14, + 0xc1, 0x2a, 0x83, 0x13, 0xc1, 0x2a, 0xac, 0x18, 0xc1, 0x2a, 0xd5, 0x1a, + 0xc1, 0x2a, 0xf5, 0x10, 0xc1, 0x2b, 0x1a, 0x0d, 0xc1, 0x2b, 0x41, 0x19, + 0xc1, 0x2b, 0x6a, 0x12, 0xc1, 0x2b, 0x87, 0x1c, 0xc1, 0x2b, 0xac, 0x1b, + 0xc1, 0x2b, 0xd7, 0x0c, 0xc1, 0x2b, 0xf4, 0x08, 0x41, 0x2c, 0x17, 0xca, + 0x45, 0x8f, 0x00, 0x9b, 0x01, 0xc7, 0x52, 0x01, 0x00, 0x9b, 0x20, 0x47, + 0x1d, 0xd4, 0xc1, 0x2c, 0x3b, 0xc2, 0x01, 0xc3, 0x00, 0x9b, 0x18, 0xc2, + 0x02, 0xa0, 0x00, 0x9b, 0x51, 0xc4, 0x02, 0xde, 0x00, 0x9b, 0x58, 0xc3, + 0x09, 0x9e, 0x00, 0x9b, 0x61, 0xc3, 0x0d, 0x14, 0x00, 0x9b, 0x68, 0xc2, + 0x22, 0xcc, 0x00, 0x9b, 0x71, 0xc4, 0x18, 0x10, 0x00, 0x9b, 0x78, 0xc2, + 0x00, 0xc4, 0x00, 0x9b, 0x93, 0x01, 0x2c, 0x47, 0xc5, 0x28, 0xee, 0x00, + 0x9b, 0x99, 0xc5, 0x0d, 0x0d, 0x00, 0x9b, 0xa0, 0xc4, 0x4a, 0x2e, 0x00, + 0x9b, 0xa9, 0xc4, 0x45, 0x6a, 0x00, 0x9b, 0xb0, 0xc4, 0xd2, 0x1d, 0x00, + 0x9b, 0xb9, 0xc6, 0x18, 0x10, 0x00, 0x9b, 0xc0, 0xc4, 0xb4, 0x50, 0x00, + 0x9c, 0x8b, 0x01, 0x2c, 0x4d, 0xc4, 0xe1, 0x33, 0x00, 0x9c, 0xa0, 0xc4, + 0x59, 0x96, 0x00, 0x9c, 0xa9, 0xc3, 0x34, 0x38, 0x00, 0x9c, 0xc8, 0x00, + 0x41, 0x2c, 0x53, 0xcf, 0x44, 0x5a, 0x01, 0x1f, 0x39, 0x00, 0x41, 0x2c, + 0x5f, 0x16, 0xc1, 0x2c, 0x77, 0x15, 0xc1, 0x2c, 0x83, 0xc4, 0x5d, 0xe2, + 0x08, 0x7f, 0x99, 0xc4, 0xb9, 0x7e, 0x08, 0x7f, 0x91, 0xc2, 0x00, 0x67, + 0x08, 0x7f, 0x81, 0xc3, 0x20, 0x18, 0x08, 0x7f, 0x69, 0xc3, 0x00, 0x4e, + 0x08, 0x7f, 0x61, 0xc6, 0xcf, 0xd7, 0x08, 0x7f, 0x59, 0xc4, 0xe0, 0xe7, + 0x08, 0x7f, 0x51, 0xc4, 0x4a, 0xb9, 0x08, 0x7f, 0x49, 0xc2, 0x01, 0x7f, + 0x08, 0x7f, 0x23, 0x01, 0x2c, 0x8d, 0xc5, 0x4a, 0xb3, 0x08, 0x7f, 0x31, + 0xc3, 0x7e, 0x89, 0x08, 0x7f, 0x29, 0xc6, 0x40, 0x9a, 0x08, 0x7f, 0x19, + 0xc5, 0x9c, 
0xa2, 0x08, 0x7f, 0x11, 0xc4, 0xe3, 0x27, 0x08, 0x7f, 0x09, + 0x03, 0x41, 0x2c, 0x93, 0x87, 0x08, 0x28, 0x11, 0xc2, 0x01, 0x7f, 0x08, + 0x28, 0x18, 0x87, 0x08, 0x28, 0x21, 0xc2, 0x01, 0x7f, 0x08, 0x28, 0x30, + 0xc2, 0x00, 0x06, 0x08, 0x28, 0x29, 0x87, 0x08, 0x28, 0x99, 0x83, 0x08, + 0x28, 0xa1, 0xc2, 0x1c, 0x52, 0x08, 0x28, 0xa8, 0x8b, 0x08, 0x28, 0x38, + 0x87, 0x08, 0x28, 0x51, 0xc2, 0x1c, 0x52, 0x08, 0x28, 0x59, 0x0a, 0x41, + 0x2c, 0x9f, 0x87, 0x08, 0x28, 0x79, 0xc2, 0x01, 0x7f, 0x08, 0x29, 0x38, + 0x87, 0x08, 0x28, 0x81, 0xc2, 0x00, 0x49, 0x08, 0x28, 0x88, 0x87, 0x08, + 0x28, 0xc9, 0xc2, 0x01, 0x19, 0x08, 0x28, 0xd0, 0x87, 0x08, 0x28, 0xd9, + 0xc2, 0x01, 0x7f, 0x08, 0x28, 0xe0, 0x87, 0x08, 0x28, 0xe9, 0xc2, 0x01, + 0x7f, 0x08, 0x28, 0xf0, 0x87, 0x08, 0x29, 0x19, 0xc2, 0x01, 0x7f, 0x08, + 0x29, 0x20, 0xe0, 0x0a, 0xe7, 0x01, 0x3a, 0x50, 0xdf, 0x0c, 0x46, 0x01, + 0x3a, 0x09, 0x47, 0x0a, 0xaa, 0x41, 0x2c, 0xa9, 0xc9, 0xad, 0xe3, 0x0f, + 0xac, 0x21, 0xd5, 0x31, 0xd9, 0x0f, 0xa7, 0x48, 0x43, 0x05, 0xc0, 0xc1, + 0x2c, 0xbb, 0xc6, 0x01, 0xdb, 0x00, 0x00, 0xc9, 0x16, 0xc1, 0x2c, 0xc7, + 0xc4, 0x02, 0x6d, 0x00, 0x00, 0x51, 0xcd, 0x7e, 0x48, 0x00, 0x04, 0x39, + 0xcc, 0x87, 0xc9, 0x00, 0x04, 0xb8, 0xc6, 0x02, 0xd1, 0x01, 0x4f, 0x99, + 0xc7, 0x3a, 0x19, 0x01, 0x4f, 0x89, 0xc6, 0x0b, 0x09, 0x01, 0x4f, 0x78, + 0xc6, 0x02, 0xd1, 0x01, 0x4f, 0x91, 0xc7, 0x3a, 0x19, 0x01, 0x4f, 0x81, + 0xc6, 0x0b, 0x09, 0x01, 0x4f, 0x70, 0x43, 0x01, 0x7b, 0xc1, 0x2c, 0xd6, + 0xcf, 0x6b, 0x7f, 0x01, 0x16, 0xa8, 0xc5, 0x33, 0x24, 0x01, 0x12, 0xa9, + 0xc4, 0x00, 0xba, 0x00, 0x01, 0xeb, 0x01, 0x2c, 0xe2, 0xcd, 0x7c, 0x33, + 0x01, 0x53, 0x70, 0xc2, 0x00, 0xf1, 0x01, 0x12, 0x69, 0xd4, 0x3b, 0x74, + 0x01, 0x53, 0xc0, 0xcb, 0x95, 0x6c, 0x0f, 0x9f, 0x21, 0xc6, 0xcd, 0x2b, + 0x0f, 0x9f, 0x80, 0xc4, 0x26, 0x78, 0x08, 0xed, 0x49, 0xc5, 0x06, 0xdb, + 0x08, 0xed, 0x41, 0x15, 0xc1, 0x2c, 0xe6, 0x08, 0xc1, 0x2c, 0xf2, 0x16, + 0xc1, 0x2c, 0xfe, 0xc3, 0x05, 0x14, 0x08, 0xed, 0x09, 0xc4, 0x15, 0xe7, + 0x08, 0xed, 0x00, 0xc5, 0x1e, 0x96, 0x08, 0xec, 0xb9, 0x4a, 0x6f, 0xc8, + 0x41, 0x2d, 0x0a, 0xc7, 0x40, 0xe5, 0x08, 0xec, 0xb1, 0xc8, 0x14, 0x38, + 0x08, 0xec, 0xa8, 0xc2, 0x0d, 0xf6, 0x08, 0xec, 0x49, 0xc2, 0x00, 0x39, + 0x08, 0xec, 0x41, 0xc2, 0x00, 0xd0, 0x08, 0xec, 0x39, 0x12, 0xc1, 0x2d, + 0x28, 0x10, 0xc1, 0x2d, 0x32, 0x06, 0xc1, 0x2d, 0x3c, 0x0c, 0xc1, 0x2d, + 0x4a, 0x0e, 0xc1, 0x2d, 0x54, 0x16, 0xc1, 0x2d, 0x5e, 0x05, 0xc1, 0x2d, + 0x6c, 0x09, 0xc1, 0x2d, 0x76, 0x0d, 0xc1, 0x2d, 0x80, 0xc2, 0x01, 0xc3, + 0x08, 0xeb, 0x81, 0x04, 0xc1, 0x2d, 0x8a, 0xc2, 0x02, 0x41, 0x08, 0xeb, + 0x69, 0xc2, 0x19, 0x2c, 0x08, 0xeb, 0x61, 0x83, 0x08, 0xeb, 0x03, 0x01, + 0x2d, 0x94, 0xc2, 0x01, 0x24, 0x08, 0xeb, 0x51, 0xc2, 0x02, 0xe0, 0x08, + 0xeb, 0x39, 0x97, 0x08, 0xeb, 0x23, 0x01, 0x2d, 0xa0, 0x8b, 0x08, 0xeb, + 0x12, 0x01, 0x2d, 0xa4, 0xca, 0xa6, 0x2a, 0x00, 0x50, 0x09, 0xc5, 0x60, + 0x30, 0x00, 0x50, 0x11, 0x42, 0x07, 0xb2, 0xc1, 0x2d, 0xa8, 0xc5, 0x33, + 0x5d, 0x00, 0x51, 0xe1, 0xc5, 0xd9, 0x5c, 0x00, 0x52, 0x89, 0xc6, 0xd3, + 0x85, 0x00, 0x53, 0xa8, 0x83, 0x00, 0x50, 0x2b, 0x01, 0x2d, 0xb4, 0x8b, + 0x00, 0x50, 0x3b, 0x01, 0x2d, 0xc0, 0x97, 0x00, 0x50, 0x4b, 0x01, 0x2d, + 0xc4, 0xc2, 0x02, 0xe0, 0x00, 0x50, 0x79, 0xc2, 0x01, 0x24, 0x00, 0x50, + 0x99, 0x0d, 0xc1, 0x2d, 0xc8, 0x09, 0xc1, 0x2d, 0xd0, 0x10, 0xc1, 0x2d, + 0xd8, 0x05, 0xc1, 0x2d, 0xee, 0x0c, 0xc1, 0x2d, 0xf8, 0x16, 0xc1, 0x2e, + 0x02, 0x06, 0xc1, 0x2e, 0x10, 0x12, 0xc1, 0x2e, 0x1e, 0x04, 0xc1, 0x2e, + 0x28, 0xc2, 0x01, 0xc3, 0x00, 0x51, 0x71, 0xc2, 0x19, 0x2c, 0x00, 0x51, + 0x79, 0x14, 
0xc1, 0x2e, 0x32, 0x0e, 0xc1, 0x2e, 0x3c, 0xc2, 0x02, 0x41, + 0x00, 0x51, 0xa9, 0x15, 0xc1, 0x2e, 0x46, 0xc2, 0x00, 0xd0, 0x00, 0x51, + 0xc9, 0xc2, 0x02, 0x1c, 0x00, 0x52, 0xd9, 0xc2, 0x00, 0x87, 0x00, 0x52, + 0xf0, 0x03, 0xc1, 0x2e, 0x50, 0x8b, 0x00, 0x51, 0xfb, 0x01, 0x2e, 0x5c, + 0x97, 0x00, 0x52, 0x0b, 0x01, 0x2e, 0x60, 0xc2, 0x02, 0xe0, 0x00, 0x52, + 0x39, 0xc2, 0x01, 0x24, 0x00, 0x52, 0x58, 0xc4, 0x15, 0xe7, 0x00, 0x53, + 0x31, 0xc3, 0x05, 0x14, 0x00, 0x53, 0x39, 0x16, 0xc1, 0x2e, 0x64, 0x08, + 0xc1, 0x2e, 0x70, 0x15, 0xc1, 0x2e, 0x7c, 0xc5, 0x06, 0xdb, 0x00, 0x53, + 0x71, 0xc4, 0x26, 0x78, 0x00, 0x53, 0x78, 0xc4, 0xe3, 0x57, 0x00, 0x53, + 0x89, 0xd0, 0x50, 0xcf, 0x00, 0x53, 0xb0, 0x05, 0xc1, 0x2e, 0x88, 0x03, + 0xc1, 0x2e, 0x94, 0x42, 0x07, 0xb2, 0xc1, 0x2e, 0xa0, 0xc5, 0x33, 0x5d, + 0x00, 0x55, 0xe1, 0x15, 0xc1, 0x2e, 0xac, 0xc6, 0xd2, 0x2f, 0x00, 0x57, + 0xe1, 0x16, 0x41, 0x2e, 0xb8, 0x83, 0x00, 0x54, 0x2b, 0x01, 0x2e, 0xc4, + 0x8b, 0x00, 0x54, 0x3b, 0x01, 0x2e, 0xd0, 0x97, 0x00, 0x54, 0x4b, 0x01, + 0x2e, 0xd4, 0x18, 0xc1, 0x2e, 0xd8, 0x87, 0x00, 0x54, 0x79, 0x91, 0x00, + 0x54, 0x99, 0x0d, 0xc1, 0x2e, 0xe2, 0x09, 0xc1, 0x2e, 0xec, 0x10, 0xc1, + 0x2e, 0xf6, 0x05, 0xc1, 0x2f, 0x0c, 0x0c, 0xc1, 0x2f, 0x16, 0x16, 0xc1, + 0x2f, 0x20, 0x06, 0xc1, 0x2f, 0x2e, 0x12, 0xc1, 0x2f, 0x3c, 0x04, 0xc1, + 0x2f, 0x46, 0xc2, 0x01, 0xc3, 0x00, 0x55, 0x71, 0xc2, 0x19, 0x2c, 0x00, + 0x55, 0x79, 0xc2, 0x00, 0x39, 0x00, 0x55, 0x81, 0x0e, 0xc1, 0x2f, 0x50, + 0x15, 0xc1, 0x2f, 0x5a, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0xc9, 0xc3, 0xb4, + 0xa6, 0x00, 0x57, 0xc8, 0x47, 0xc7, 0x7b, 0xc1, 0x2f, 0x6a, 0x45, 0x00, + 0xba, 0x41, 0x2f, 0x72, 0xc4, 0x15, 0xe7, 0x00, 0x57, 0x31, 0xc3, 0x05, + 0x14, 0x00, 0x57, 0x39, 0x16, 0xc1, 0x2f, 0x98, 0x08, 0xc1, 0x2f, 0xa4, + 0x15, 0xc1, 0x2f, 0xb0, 0xc5, 0x06, 0xdb, 0x00, 0x57, 0x71, 0xc4, 0x26, + 0x78, 0x00, 0x57, 0x78, 0xc5, 0xd7, 0xc2, 0x08, 0x19, 0xa1, 0xc3, 0x84, + 0xf8, 0x08, 0x19, 0x80, 0xc3, 0xb6, 0x96, 0x08, 0x19, 0xb1, 0xc4, 0xe0, + 0x9b, 0x08, 0x1a, 0x38, 0xc3, 0xdb, 0xd3, 0x08, 0x19, 0xb9, 0xc4, 0xde, + 0xa3, 0x08, 0x1a, 0x40, 0xc5, 0xd5, 0x8d, 0x08, 0x19, 0xc1, 0xc4, 0xe2, + 0xf3, 0x08, 0x1a, 0x20, 0xc5, 0xd6, 0x46, 0x08, 0x19, 0xe9, 0x43, 0x02, + 0x6e, 0x41, 0x2f, 0xbc, 0x42, 0x01, 0x12, 0xc1, 0x2f, 0xc8, 0x42, 0x00, + 0xbd, 0x41, 0x30, 0x32, 0x04, 0xc1, 0x30, 0x4a, 0xd5, 0x34, 0xe2, 0x01, + 0x16, 0xd9, 0x45, 0x00, 0x8c, 0xc1, 0x30, 0x56, 0x11, 0xc1, 0x30, 0x68, + 0x03, 0xc1, 0x30, 0x74, 0xc4, 0x00, 0xba, 0x00, 0x01, 0xf1, 0xcf, 0x69, + 0x18, 0x01, 0x55, 0x3a, 0x01, 0x30, 0x80, 0x4b, 0x6f, 0xc7, 0xc1, 0x30, + 0x86, 0x47, 0x02, 0x0e, 0xc1, 0x30, 0xaa, 0x45, 0x00, 0xba, 0xc1, 0x31, + 0x13, 0xce, 0x73, 0x0c, 0x08, 0x9a, 0xb9, 0xc2, 0x00, 0x7a, 0x08, 0x9a, + 0x80, 0xc4, 0x00, 0x87, 0x0f, 0xb0, 0x03, 0x01, 0x31, 0x2d, 0xda, 0x1d, + 0x3c, 0x0f, 0xb1, 0xc0, 0xc9, 0x1b, 0x0a, 0x00, 0x00, 0xe9, 0xc4, 0x01, + 0xc3, 0x01, 0x5e, 0x90, 0xc8, 0xbd, 0xd2, 0x01, 0x37, 0x71, 0xc7, 0xc5, + 0x9f, 0x01, 0x37, 0x68, 0x48, 0x07, 0x5a, 0xc1, 0x31, 0x33, 0xcb, 0x94, + 0x6f, 0x01, 0x11, 0xd0, 0x58, 0x22, 0x13, 0xc1, 0x31, 0x3f, 0x4f, 0x0b, + 0x17, 0xc1, 0x31, 0xc5, 0x47, 0x02, 0x0e, 0xc1, 0x32, 0x49, 0xd3, 0x45, + 0xf8, 0x00, 0x87, 0xd9, 0x4d, 0x29, 0xb9, 0x41, 0x32, 0xcf, 0xc8, 0x2f, + 0x03, 0x0f, 0xb6, 0x50, 0x4f, 0x0b, 0x17, 0xc1, 0x33, 0x53, 0x4d, 0x29, + 0xb9, 0x41, 0x33, 0xbc, 0xc4, 0xe3, 0x33, 0x0f, 0xa6, 0xc9, 0xc5, 0x1c, + 0xae, 0x0f, 0xcf, 0x08, 0x45, 0x00, 0xba, 0xc1, 0x34, 0x25, 0x47, 0x02, + 0x0e, 0xc1, 0x34, 0x41, 0x4b, 0x6f, 0xc7, 0xc1, 0x34, 0xa8, 0x03, 0xc1, + 0x34, 0xc8, 
0x46, 0x09, 0x97, 0xc1, 0x34, 0xd4, 0xc6, 0xd2, 0xcb, 0x00, + 0x5b, 0x81, 0x49, 0x53, 0xa9, 0x41, 0x34, 0xf8, 0xc5, 0xd3, 0x5b, 0x0f, + 0x69, 0xe9, 0xc4, 0x01, 0xce, 0x0f, 0x69, 0xe0, 0x16, 0xc1, 0x35, 0x04, + 0x08, 0xc1, 0x35, 0x15, 0xc3, 0x05, 0x14, 0x0f, 0x68, 0x0b, 0x01, 0x35, + 0x1d, 0x15, 0xc1, 0x35, 0x21, 0xc5, 0x06, 0xdb, 0x0f, 0x68, 0x43, 0x01, + 0x35, 0x33, 0xc4, 0x26, 0x78, 0x0f, 0x68, 0x4a, 0x01, 0x35, 0x3e, 0x16, + 0xc1, 0x35, 0x4b, 0x08, 0xc1, 0x35, 0x63, 0x15, 0xc1, 0x35, 0x72, 0xc5, + 0x06, 0xdb, 0x0f, 0x69, 0xa9, 0xc4, 0x26, 0x78, 0x0f, 0x69, 0xb0, 0x44, + 0x05, 0x18, 0xc1, 0x35, 0x81, 0xcc, 0x86, 0xfd, 0x0f, 0xad, 0x78, 0x00, + 0xc1, 0x35, 0x8d, 0x02, 0x41, 0x35, 0xb5, 0xc5, 0xd7, 0xa4, 0x0f, 0xad, + 0xc0, 0x48, 0xb5, 0xda, 0xc1, 0x35, 0xc1, 0x47, 0xc9, 0x88, 0xc1, 0x35, + 0xcd, 0x42, 0x00, 0xfb, 0xc1, 0x35, 0xdf, 0x4a, 0x9d, 0xa6, 0xc1, 0x35, + 0xeb, 0x4e, 0x70, 0xf8, 0xc1, 0x35, 0xfd, 0x4e, 0x72, 0x3a, 0xc1, 0x36, + 0x09, 0xc3, 0x19, 0x2a, 0x0f, 0xae, 0xe9, 0x43, 0x00, 0x67, 0xc1, 0x36, + 0x15, 0x47, 0xc7, 0x4a, 0x41, 0x36, 0x1f, 0xc5, 0x29, 0xfc, 0x0f, 0xa3, + 0xa9, 0xc3, 0x12, 0xb8, 0x0f, 0xa3, 0xa1, 0xc5, 0xda, 0xa1, 0x0f, 0xce, + 0x98, 0x4b, 0x11, 0xe3, 0xc1, 0x36, 0x2b, 0xc7, 0xc2, 0x42, 0x00, 0xe3, + 0xe0, 0xd1, 0x4f, 0x36, 0x00, 0xe3, 0xd1, 0xc8, 0xb9, 0x9a, 0x00, 0xe3, + 0xc0, 0x11, 0xc1, 0x36, 0x37, 0x0e, 0xc1, 0x36, 0x49, 0x07, 0xc1, 0x36, + 0x60, 0x17, 0xc1, 0x36, 0x74, 0x0b, 0xc1, 0x36, 0x86, 0x03, 0x41, 0x36, + 0x98, 0xc4, 0x26, 0x78, 0x00, 0xe2, 0xc9, 0xc5, 0x06, 0xdb, 0x00, 0xe2, + 0xc1, 0x15, 0xc1, 0x36, 0xae, 0x08, 0xc1, 0x36, 0xba, 0x16, 0xc1, 0x36, + 0xc6, 0xc3, 0x05, 0x14, 0x00, 0xe2, 0x89, 0xc4, 0x15, 0xe7, 0x00, 0xe2, + 0x80, 0xca, 0x22, 0x51, 0x01, 0x39, 0x69, 0xcb, 0x8e, 0x08, 0x01, 0x38, + 0xf9, 0xcb, 0x58, 0xc7, 0x01, 0x38, 0xc9, 0xca, 0x28, 0xc3, 0x01, 0x34, + 0xe8, 0xcf, 0x63, 0x0f, 0x01, 0x22, 0x51, 0xc3, 0x02, 0x2c, 0x01, 0x22, + 0x40, 0xd6, 0x2f, 0x1a, 0x01, 0x22, 0x49, 0xc4, 0x68, 0xba, 0x01, 0x22, + 0x08, 0xd9, 0x1e, 0xcd, 0x01, 0x22, 0x31, 0xc6, 0xcb, 0x8d, 0x01, 0x22, + 0x29, 0xca, 0xa5, 0xda, 0x01, 0x22, 0x20, 0xc4, 0x03, 0xc8, 0x01, 0x4d, + 0x39, 0xc2, 0x02, 0xae, 0x01, 0x4d, 0x30, 0x45, 0x2a, 0xa0, 0x41, 0x36, + 0xd2, 0xc5, 0xd4, 0x84, 0x00, 0xb4, 0xd1, 0x42, 0x01, 0x9c, 0xc1, 0x36, + 0xde, 0x0b, 0xc1, 0x36, 0xf0, 0x17, 0xc1, 0x36, 0xfc, 0x11, 0xc1, 0x37, + 0x0c, 0xc4, 0xe2, 0x6b, 0x00, 0xb4, 0x81, 0xc4, 0xde, 0x7f, 0x00, 0xb4, + 0x79, 0x15, 0xc1, 0x37, 0x16, 0x10, 0xc1, 0x37, 0x22, 0xc4, 0xe0, 0x67, + 0x00, 0xb4, 0x61, 0xc4, 0xe4, 0x13, 0x00, 0xb4, 0x59, 0x05, 0xc1, 0x37, + 0x2e, 0xc5, 0xd6, 0xb4, 0x00, 0xb4, 0x41, 0xc4, 0xe3, 0x4f, 0x00, 0xb4, + 0x39, 0xc5, 0xd3, 0xf3, 0x00, 0xb4, 0x19, 0xc4, 0xe4, 0xcb, 0x00, 0xb4, + 0x11, 0xc5, 0xd7, 0x9a, 0x00, 0xb4, 0x08, 0x83, 0x08, 0x24, 0xb3, 0x01, + 0x37, 0x3a, 0xc2, 0x01, 0x5d, 0x08, 0x24, 0x09, 0xc2, 0x01, 0x6f, 0x08, + 0x24, 0x11, 0xc2, 0x25, 0x3b, 0x08, 0x24, 0x19, 0xc2, 0x8d, 0x8f, 0x08, + 0x24, 0x21, 0x0d, 0xc1, 0x37, 0x44, 0x06, 0xc1, 0x37, 0x50, 0xc2, 0x00, + 0x39, 0x08, 0x24, 0x39, 0x15, 0xc1, 0x37, 0x5c, 0xc4, 0xe3, 0x13, 0x08, + 0x24, 0x59, 0xc2, 0x01, 0x30, 0x08, 0x24, 0x61, 0xc2, 0x00, 0x87, 0x08, + 0x24, 0x69, 0xc4, 0xd8, 0x3a, 0x08, 0x24, 0x71, 0xc4, 0xe0, 0xd7, 0x08, + 0x24, 0x81, 0xc4, 0xe4, 0xbb, 0x08, 0x24, 0x89, 0xc4, 0xb9, 0x50, 0x08, + 0x24, 0x91, 0xc3, 0x7e, 0x89, 0x08, 0x24, 0x99, 0xc2, 0x00, 0xd0, 0x08, + 0x24, 0xa1, 0xc2, 0x19, 0x2c, 0x08, 0x24, 0xa9, 0x87, 0x08, 0x24, 0xbb, + 0x01, 0x37, 0x66, 0x8b, 0x08, 0x24, 0xc1, 0x91, 0x08, 0x24, 0xcb, 0x01, + 0x37, 0x6a, 
0x97, 0x08, 0x24, 0xd0, 0xc4, 0x15, 0xe7, 0x08, 0x25, 0x01, + 0xc3, 0x05, 0x14, 0x08, 0x25, 0x09, 0x16, 0xc1, 0x37, 0x6e, 0x08, 0xc1, + 0x37, 0x7a, 0x15, 0xc1, 0x37, 0x86, 0xc5, 0x06, 0xdb, 0x08, 0x25, 0x41, + 0xc4, 0x26, 0x78, 0x08, 0x25, 0x48, 0x83, 0x08, 0x25, 0x83, 0x01, 0x37, + 0x92, 0xc3, 0x00, 0x38, 0x08, 0x25, 0xa1, 0xc3, 0x1c, 0x63, 0x08, 0x25, + 0xa9, 0x87, 0x08, 0x25, 0xbb, 0x01, 0x37, 0x9d, 0x0a, 0xc1, 0x37, 0xa7, + 0x8b, 0x08, 0x25, 0xd9, 0x0d, 0xc1, 0x37, 0xb1, 0xc2, 0x00, 0xdb, 0x08, + 0x25, 0xf9, 0xc2, 0x01, 0xc3, 0x08, 0x26, 0x01, 0xc2, 0x00, 0xc1, 0x08, + 0x26, 0x09, 0x91, 0x08, 0x26, 0x13, 0x01, 0x37, 0xc1, 0xc2, 0x00, 0xb0, + 0x08, 0x26, 0x21, 0x15, 0xc1, 0x37, 0xc7, 0x16, 0xc1, 0x37, 0xd1, 0xc3, + 0x40, 0xe2, 0x08, 0x26, 0x69, 0x97, 0x08, 0x26, 0x71, 0xc2, 0x01, 0x4a, + 0x08, 0x26, 0x79, 0xc3, 0x91, 0x00, 0x08, 0x26, 0x89, 0x1c, 0x41, 0x37, + 0xd9, 0x83, 0x08, 0x26, 0xc3, 0x01, 0x37, 0xe3, 0xc3, 0x00, 0x38, 0x08, + 0x26, 0xe1, 0xc3, 0x1c, 0x63, 0x08, 0x26, 0xe9, 0x87, 0x08, 0x26, 0xfb, + 0x01, 0x37, 0xee, 0x0a, 0xc1, 0x37, 0xf8, 0x8b, 0x08, 0x27, 0x19, 0x0d, + 0xc1, 0x38, 0x02, 0xc2, 0x00, 0xdb, 0x08, 0x27, 0x39, 0xc2, 0x01, 0xc3, + 0x08, 0x27, 0x41, 0xc2, 0x00, 0xc1, 0x08, 0x27, 0x49, 0x91, 0x08, 0x27, + 0x53, 0x01, 0x38, 0x12, 0xc2, 0x00, 0xb0, 0x08, 0x27, 0x61, 0x15, 0xc1, + 0x38, 0x18, 0x16, 0xc1, 0x38, 0x22, 0xc3, 0x40, 0xe2, 0x08, 0x27, 0xa9, + 0x97, 0x08, 0x27, 0xb1, 0xc2, 0x01, 0x4a, 0x08, 0x27, 0xb9, 0xc3, 0x91, + 0x00, 0x08, 0x27, 0xc9, 0x1c, 0x41, 0x38, 0x2a, 0x03, 0xc1, 0x38, 0x34, + 0x11, 0xc1, 0x38, 0x46, 0xc8, 0xbb, 0x2a, 0x0e, 0x7a, 0xc2, 0x01, 0x38, + 0x52, 0xc3, 0x74, 0xc6, 0x0e, 0x7e, 0x09, 0x07, 0xc1, 0x38, 0x58, 0xcf, + 0x58, 0xe3, 0x0e, 0x7b, 0x59, 0xcb, 0x95, 0x77, 0x0e, 0x7a, 0x98, 0xc5, + 0xd5, 0x5b, 0x0e, 0x7e, 0x01, 0xc4, 0xde, 0xf7, 0x0e, 0x7d, 0x7a, 0x01, + 0x38, 0x64, 0xc6, 0xad, 0x17, 0x0e, 0x7d, 0xf9, 0xc5, 0xdd, 0x8a, 0x0e, + 0x7c, 0x21, 0x42, 0x14, 0x98, 0xc1, 0x38, 0x68, 0xc6, 0xd2, 0xd1, 0x0e, + 0x7b, 0x71, 0xc5, 0x5f, 0x8d, 0x0e, 0x7a, 0xa0, 0x16, 0xc1, 0x38, 0x77, + 0xc8, 0xb9, 0x2a, 0x0e, 0x7b, 0xeb, 0x01, 0x38, 0x8f, 0x49, 0xad, 0x77, + 0x41, 0x38, 0x93, 0x00, 0x41, 0x38, 0xaf, 0xc6, 0xad, 0x79, 0x0e, 0x7c, + 0x29, 0x03, 0x41, 0x38, 0xbb, 0xc2, 0x13, 0x38, 0x0e, 0x7c, 0x11, 0xd2, + 0x47, 0xff, 0x0e, 0x7b, 0x60, 0xc5, 0xd2, 0xae, 0x0e, 0x7b, 0x79, 0xc8, + 0x48, 0x09, 0x0e, 0x7a, 0xd8, 0x4c, 0x8b, 0x35, 0xc1, 0x38, 0xc7, 0xcb, + 0x93, 0x7d, 0x0e, 0x7b, 0x31, 0xc8, 0x4e, 0x4b, 0x0e, 0x7b, 0x29, 0xc9, + 0xa9, 0x48, 0x0e, 0x7b, 0x21, 0xc8, 0xbf, 0x6a, 0x0e, 0x7b, 0x18, 0x16, + 0xc1, 0x38, 0xdf, 0xc6, 0xbf, 0x8c, 0x0e, 0x7b, 0x09, 0xc7, 0xc2, 0x96, + 0x0e, 0x7b, 0x01, 0xc5, 0xd4, 0xd4, 0x0e, 0x7a, 0xf0, 0xa0, 0x0e, 0x7a, + 0x19, 0x9f, 0x0e, 0x7a, 0x10, 0x0d, 0xc1, 0x38, 0xeb, 0x05, 0xc1, 0x39, + 0x00, 0x06, 0xc1, 0x39, 0x0f, 0x16, 0xc1, 0x39, 0x1b, 0x15, 0xc1, 0x39, + 0x2d, 0x11, 0xc1, 0x39, 0x45, 0x42, 0x01, 0x53, 0xc1, 0x39, 0x55, 0x1c, + 0xc1, 0x39, 0x5f, 0x42, 0x00, 0x39, 0xc1, 0x39, 0x69, 0xc5, 0xd9, 0x43, + 0x0e, 0x79, 0x39, 0xc6, 0xcf, 0xb3, 0x0e, 0x79, 0x29, 0xc7, 0xc9, 0x8f, + 0x0e, 0x79, 0x21, 0x48, 0xbd, 0xca, 0xc1, 0x39, 0x75, 0x4d, 0x75, 0x8c, + 0xc1, 0x39, 0x81, 0x47, 0xc2, 0x7a, 0xc1, 0x39, 0x8b, 0x46, 0xcd, 0xdf, + 0x41, 0x39, 0x97, 0xc9, 0xb0, 0x62, 0x0e, 0x79, 0x91, 0xc6, 0xb0, 0x65, + 0x0e, 0x79, 0x89, 0xc7, 0x6d, 0xa2, 0x0e, 0x79, 0x80, 0x42, 0x07, 0xb2, + 0xc1, 0x39, 0xa3, 0xc8, 0x14, 0x38, 0x08, 0xd1, 0xc1, 0x46, 0x1e, 0x89, + 0x41, 0x39, 0xaf, 0xd6, 0x2d, 0xe6, 0x08, 0xd2, 0x31, 0xc9, 0x15, 0xcc, + 0x08, 0xd2, 
0x00, 0x4d, 0x7f, 0x25, 0xc1, 0x39, 0xbe, 0xd1, 0x56, 0x1e, + 0x08, 0xd1, 0xd0, 0xc3, 0x1d, 0x35, 0x08, 0xd1, 0x91, 0xc2, 0x00, 0xd0, + 0x08, 0xd0, 0x61, 0x83, 0x08, 0xd0, 0x58, 0x83, 0x08, 0xd1, 0x81, 0xc2, + 0x0d, 0xf6, 0x08, 0xd1, 0x79, 0xc2, 0x00, 0xd0, 0x08, 0xd1, 0x70, 0x83, + 0x08, 0xd1, 0x41, 0xc2, 0x00, 0xd0, 0x08, 0xd1, 0x38, 0x1c, 0xc1, 0x39, + 0xd6, 0xc2, 0x00, 0xd0, 0x08, 0xd0, 0xe1, 0x83, 0x08, 0xd0, 0xd9, 0x06, + 0x41, 0x39, 0xe0, 0x15, 0xc1, 0x39, 0xea, 0xc2, 0x00, 0xd0, 0x08, 0xd0, + 0xd1, 0x83, 0x08, 0xd0, 0xc9, 0x16, 0x41, 0x39, 0xf4, 0xc2, 0x00, 0xd0, + 0x08, 0xd1, 0x09, 0x83, 0x08, 0xd1, 0x00, 0xc2, 0x00, 0xd0, 0x08, 0xd0, + 0xf9, 0x83, 0x08, 0xd0, 0xf0, 0x83, 0x08, 0xd0, 0xe9, 0xc2, 0x00, 0xc1, + 0x08, 0xd0, 0xc1, 0xc2, 0x19, 0x2c, 0x08, 0xd0, 0x99, 0xc2, 0x01, 0x30, + 0x08, 0xd0, 0x78, 0xc2, 0x00, 0xd0, 0x08, 0xd0, 0x89, 0x83, 0x08, 0xd0, + 0x80, 0xc2, 0x00, 0xd0, 0x08, 0xd0, 0x71, 0x83, 0x08, 0xd0, 0x68, 0xca, + 0x9d, 0xe2, 0x08, 0xd0, 0x49, 0x03, 0xc1, 0x39, 0xfe, 0x91, 0x08, 0xd0, + 0x33, 0x01, 0x3a, 0x06, 0x87, 0x08, 0xd0, 0x21, 0x97, 0x08, 0xd0, 0x1b, + 0x01, 0x3a, 0x0a, 0x8b, 0x08, 0xd0, 0x08, 0xcf, 0x60, 0x30, 0x01, 0x4c, + 0x51, 0xcd, 0x7d, 0x6b, 0x01, 0x4c, 0x40, 0x12, 0xc1, 0x3a, 0x0e, 0xcb, + 0x34, 0xad, 0x01, 0x50, 0xf8, 0xc8, 0xb8, 0x8a, 0x01, 0x00, 0x61, 0xcc, + 0x40, 0x81, 0x07, 0xf7, 0xf8, 0x43, 0x16, 0x55, 0xc1, 0x3a, 0x1a, 0x42, + 0x00, 0x75, 0x41, 0x3a, 0x3e, 0x45, 0x02, 0x10, 0xc1, 0x3a, 0x4a, 0xcc, + 0x86, 0x3d, 0x05, 0x4e, 0x08, 0x16, 0xc1, 0x3a, 0xd6, 0xc3, 0x05, 0x14, + 0x05, 0x4e, 0x89, 0xc4, 0x15, 0xe7, 0x05, 0x4e, 0x81, 0x08, 0xc1, 0x3a, + 0xe2, 0x15, 0xc1, 0x3a, 0xee, 0xc5, 0x06, 0xdb, 0x05, 0x4e, 0xc1, 0xc4, + 0x26, 0x78, 0x05, 0x4e, 0xc8, 0xc5, 0xdd, 0x53, 0x05, 0x4d, 0xf9, 0xc7, + 0xc6, 0xf6, 0x05, 0x4d, 0xf1, 0xc5, 0xdd, 0x12, 0x05, 0x4d, 0xe8, 0xc5, + 0xd8, 0xbc, 0x05, 0x4d, 0xe1, 0xca, 0xa2, 0xf6, 0x05, 0x4d, 0xd9, 0x16, + 0xc1, 0x3a, 0xfa, 0xc4, 0xc5, 0x6e, 0x05, 0x4d, 0xc3, 0x01, 0x3b, 0x04, + 0xc4, 0xdf, 0x6f, 0x05, 0x4d, 0xb2, 0x01, 0x3b, 0x0a, 0xc5, 0xde, 0x11, + 0x05, 0x4c, 0x0b, 0x01, 0x3b, 0x10, 0xc7, 0xc7, 0xd6, 0x05, 0x4c, 0x19, + 0xc5, 0xd9, 0xcf, 0x05, 0x4c, 0x11, 0xc9, 0xaf, 0xff, 0x05, 0x4c, 0x00, + 0x46, 0x02, 0xae, 0xc1, 0x3b, 0x16, 0x46, 0x01, 0xc8, 0x41, 0x3b, 0x28, + 0xc5, 0x18, 0x25, 0x01, 0x02, 0xb9, 0xd1, 0x1e, 0x3f, 0x01, 0x50, 0x60, + 0x10, 0xc1, 0x3b, 0x34, 0x0c, 0xc1, 0x3b, 0x73, 0x13, 0xc1, 0x3b, 0x93, + 0x14, 0xc1, 0x3b, 0xaf, 0x15, 0xc1, 0x3b, 0xd6, 0x05, 0xc1, 0x3c, 0x08, + 0x1c, 0xc1, 0x3c, 0x36, 0x19, 0xc1, 0x3c, 0x68, 0x0a, 0xc1, 0x3c, 0x84, + 0x1b, 0xc1, 0x3c, 0xb6, 0x1a, 0xc1, 0x3c, 0xd2, 0x0f, 0xc1, 0x3c, 0xf0, + 0x8b, 0x05, 0x00, 0x13, 0x01, 0x3d, 0x1e, 0x83, 0x05, 0x00, 0x53, 0x01, + 0x3d, 0x34, 0xc2, 0x01, 0xba, 0x05, 0x00, 0x6b, 0x01, 0x3d, 0x40, 0x91, + 0x05, 0x00, 0x8b, 0x01, 0x3d, 0x48, 0x87, 0x05, 0x00, 0xa3, 0x01, 0x3d, + 0x54, 0x04, 0xc1, 0x3d, 0x58, 0x12, 0xc1, 0x3d, 0x86, 0x08, 0xc1, 0x3d, + 0xa9, 0x18, 0xc1, 0x3d, 0xcc, 0x06, 0xc1, 0x3d, 0xf3, 0x16, 0xc1, 0x3e, + 0x1a, 0x0e, 0xc1, 0x3e, 0x3d, 0x09, 0xc1, 0x3e, 0x67, 0x0d, 0x41, 0x3e, + 0x8e, 0xc3, 0xe5, 0x75, 0x05, 0x24, 0x81, 0x0e, 0xc1, 0x3e, 0xb1, 0x0d, + 0xc1, 0x3e, 0xbe, 0x10, 0xc1, 0x3e, 0xc8, 0x05, 0xc1, 0x3e, 0xd8, 0x15, + 0xc1, 0x3e, 0xf1, 0x09, 0xc1, 0x3e, 0xfb, 0x0f, 0xc1, 0x3f, 0x0f, 0x0a, + 0xc1, 0x3f, 0x19, 0x04, 0xc1, 0x3f, 0x23, 0x1b, 0xc1, 0x3f, 0x2f, 0x12, + 0xc1, 0x3f, 0x39, 0x16, 0xc1, 0x3f, 0x45, 0x1c, 0xc1, 0x3f, 0x4f, 0x06, + 0xc1, 0x3f, 0x63, 0xc2, 0x00, 0x11, 0x05, 0x25, 0x49, 0x0c, 0xc1, 0x3f, + 0x6d, 0x18, 
0xc1, 0x3f, 0x75, 0xc2, 0x02, 0xa0, 0x05, 0x25, 0xc0, 0xc3, + 0xe5, 0xb4, 0x08, 0x75, 0x43, 0x01, 0x3f, 0x81, 0xc3, 0x0d, 0xff, 0x08, + 0x75, 0x03, 0x01, 0x3f, 0x87, 0x07, 0xc1, 0x3f, 0x8d, 0x0a, 0xc1, 0x3f, + 0xa1, 0xc2, 0x00, 0x27, 0x08, 0x75, 0x29, 0xc3, 0x7e, 0x89, 0x08, 0x75, + 0x21, 0xc2, 0x01, 0xdf, 0x08, 0x75, 0x19, 0xc3, 0x20, 0x18, 0x08, 0x75, + 0x11, 0xc3, 0x8c, 0x3f, 0x08, 0x75, 0x09, 0xc3, 0xb3, 0xa6, 0x08, 0x74, + 0xf9, 0x0d, 0xc1, 0x3f, 0xad, 0xc3, 0x0f, 0x9a, 0x08, 0x74, 0xe1, 0xc2, + 0x02, 0x41, 0x08, 0x74, 0xd3, 0x01, 0x3f, 0xb9, 0xc2, 0x00, 0x87, 0x08, + 0x74, 0xc9, 0x1a, 0xc1, 0x3f, 0xbf, 0x1c, 0xc1, 0x3f, 0xc9, 0x16, 0xc1, + 0x3f, 0xd4, 0x42, 0x0e, 0x9a, 0xc1, 0x3f, 0xde, 0x15, 0xc1, 0x3f, 0xe6, + 0xc2, 0x25, 0x3b, 0x08, 0x74, 0x81, 0x14, 0xc1, 0x3f, 0xfc, 0x05, 0xc1, + 0x40, 0x06, 0x12, 0xc1, 0x40, 0x10, 0xc2, 0x00, 0x51, 0x08, 0x74, 0x08, + 0xca, 0xa8, 0x1e, 0x08, 0x75, 0x61, 0xca, 0x9c, 0xd4, 0x08, 0x75, 0x58, + 0x00, 0xc1, 0x40, 0x1a, 0xc8, 0xbb, 0x3a, 0x0f, 0xae, 0xc8, 0x12, 0xc1, + 0x40, 0x26, 0x83, 0x00, 0xa7, 0xa3, 0x01, 0x40, 0x36, 0x8a, 0x00, 0xa9, + 0x2b, 0x01, 0x40, 0x44, 0x91, 0x00, 0xa7, 0x8b, 0x01, 0x40, 0x61, 0x99, + 0x00, 0xa8, 0x3b, 0x01, 0x40, 0x6f, 0x87, 0x00, 0xa7, 0x69, 0x8b, 0x00, + 0xa7, 0x7a, 0x01, 0x40, 0x88, 0x83, 0x00, 0xa6, 0x3b, 0x01, 0x40, 0x8c, + 0x19, 0xc1, 0x40, 0xa3, 0x91, 0x00, 0xa6, 0x23, 0x01, 0x40, 0xbc, 0xc2, + 0x00, 0x75, 0x00, 0xac, 0xb3, 0x01, 0x40, 0xc4, 0x89, 0x00, 0xac, 0xab, + 0x01, 0x40, 0xd9, 0x44, 0xde, 0xaf, 0xc1, 0x40, 0xee, 0x48, 0xbc, 0x52, + 0xc1, 0x40, 0xfd, 0x87, 0x00, 0xa6, 0x01, 0x8b, 0x00, 0xa6, 0x13, 0x01, + 0x41, 0x08, 0x8a, 0x00, 0xa6, 0x90, 0x83, 0x00, 0xa4, 0x83, 0x01, 0x41, + 0x0c, 0xc7, 0xc4, 0xf7, 0x00, 0xb3, 0x69, 0x19, 0xc1, 0x41, 0x19, 0x91, + 0x00, 0xa4, 0x6b, 0x01, 0x41, 0x32, 0x8b, 0x00, 0xa4, 0x5b, 0x01, 0x41, + 0x36, 0x87, 0x00, 0xa4, 0x48, 0x4b, 0x92, 0x54, 0xc1, 0x41, 0x3a, 0x49, + 0xad, 0x4a, 0xc1, 0x41, 0x42, 0xcb, 0x92, 0x96, 0x00, 0xa9, 0xf8, 0x42, + 0x07, 0x26, 0xc1, 0x41, 0x65, 0x16, 0xc1, 0x41, 0x7e, 0x8a, 0x00, 0xab, + 0x53, 0x01, 0x41, 0x95, 0x83, 0x00, 0xa2, 0xab, 0x01, 0x41, 0xbb, 0x1b, + 0xc1, 0x41, 0xc6, 0x19, 0xc1, 0x41, 0xd6, 0x91, 0x00, 0xa2, 0x83, 0x01, + 0x41, 0xef, 0x8b, 0x00, 0xa2, 0x73, 0x01, 0x41, 0xf3, 0x87, 0x00, 0xa2, + 0x60, 0x87, 0x00, 0xa0, 0x63, 0x01, 0x41, 0xf7, 0x83, 0x00, 0xa0, 0xbb, + 0x01, 0x41, 0xfd, 0x91, 0x00, 0xa0, 0x93, 0x01, 0x42, 0x05, 0x8b, 0x00, + 0xa0, 0x72, 0x01, 0x42, 0x0c, 0x47, 0xc0, 0xac, 0xc1, 0x42, 0x10, 0x19, + 0xc1, 0x42, 0x1a, 0x83, 0x00, 0xaa, 0x5b, 0x01, 0x42, 0x35, 0x91, 0x00, + 0xaa, 0x43, 0x01, 0x42, 0x40, 0x8b, 0x00, 0xaa, 0x33, 0x01, 0x42, 0x44, + 0x87, 0x00, 0xaa, 0x10, 0x8b, 0x00, 0xaa, 0xab, 0x01, 0x42, 0x48, 0xc8, + 0x11, 0xf7, 0x00, 0xb3, 0x71, 0xc3, 0x14, 0x72, 0x00, 0xaa, 0xd9, 0x83, + 0x00, 0xaa, 0xcb, 0x01, 0x42, 0x52, 0x91, 0x00, 0xaa, 0xbb, 0x01, 0x42, + 0x59, 0x87, 0x00, 0xaa, 0x98, 0xc8, 0xbc, 0x9a, 0x00, 0xc6, 0xe1, 0x90, + 0x00, 0xa1, 0x58, 0x47, 0xc5, 0xb4, 0xc1, 0x42, 0x5d, 0x9b, 0x00, 0xc5, + 0x81, 0x91, 0x00, 0xa0, 0x31, 0x90, 0x00, 0xa1, 0x68, 0x83, 0x00, 0xa9, + 0x6b, 0x01, 0x42, 0x7f, 0x91, 0x00, 0xa9, 0x53, 0x01, 0x42, 0x8a, 0x19, + 0xc1, 0x42, 0x92, 0x46, 0x92, 0x9a, 0xc1, 0x42, 0xab, 0x8b, 0x00, 0xa9, + 0x43, 0x01, 0x42, 0xe9, 0x87, 0x00, 0xa9, 0x30, 0x83, 0x00, 0xa6, 0xd3, + 0x01, 0x42, 0xed, 0x8a, 0x00, 0xad, 0x33, 0x01, 0x42, 0xf8, 0x87, 0x00, + 0xa6, 0x99, 0x8b, 0x00, 0xa6, 0xab, 0x01, 0x43, 0x0d, 0x91, 0x00, 0xa6, + 0xbb, 0x01, 0x43, 0x11, 0x19, 0x41, 0x43, 0x15, 0x83, 0x00, 0xa5, 0x53, + 0x01, 0x43, 
0x2e, 0x87, 0x00, 0xa5, 0x1b, 0x01, 0x43, 0x39, 0x91, 0x00, + 0xa5, 0x3b, 0x01, 0x43, 0x3f, 0x8b, 0x00, 0xa5, 0x2b, 0x01, 0x43, 0x46, + 0x19, 0xc1, 0x43, 0x4a, 0x8a, 0x00, 0xa5, 0xe8, 0x99, 0x00, 0xa4, 0x23, + 0x01, 0x43, 0x63, 0x83, 0x00, 0xa3, 0x93, 0x01, 0x43, 0x7c, 0x87, 0x00, + 0xa3, 0x59, 0x8b, 0x00, 0xa3, 0x6b, 0x01, 0x43, 0x87, 0x91, 0x00, 0xa3, + 0x7a, 0x01, 0x43, 0x8b, 0x19, 0xc1, 0x43, 0x8f, 0x83, 0x00, 0xa1, 0xc3, + 0x01, 0x43, 0xa8, 0x91, 0x00, 0xa1, 0x9b, 0x01, 0x43, 0xb3, 0x87, 0x00, + 0xa1, 0x79, 0x8b, 0x00, 0xa1, 0x8a, 0x01, 0x43, 0xbb, 0x83, 0x00, 0xa0, + 0x5b, 0x01, 0x43, 0xbf, 0x9b, 0x00, 0xc5, 0x89, 0x8b, 0x00, 0xa0, 0xe3, + 0x01, 0x43, 0xc7, 0x4a, 0xa0, 0xa8, 0xc1, 0x43, 0xcd, 0x90, 0x00, 0xa1, + 0x70, 0x83, 0x00, 0xac, 0x1b, 0x01, 0x43, 0xd5, 0x91, 0x00, 0xac, 0x0b, + 0x01, 0x43, 0xe0, 0x8b, 0x00, 0xab, 0xfa, 0x01, 0x43, 0xe4, 0x8d, 0x00, + 0xab, 0xe9, 0xc5, 0x59, 0x93, 0x00, 0xa0, 0x00, 0x8b, 0x00, 0xa0, 0x21, + 0x90, 0x00, 0xa1, 0x60, 0xd0, 0x5a, 0x52, 0x01, 0x02, 0x08, 0xc9, 0x36, + 0xe7, 0x0f, 0xae, 0x10, 0x97, 0x08, 0x15, 0xfa, 0x01, 0x43, 0xe8, 0x94, + 0x08, 0x16, 0x48, 0x86, 0x08, 0x15, 0x32, 0x01, 0x43, 0xef, 0x9f, 0x08, + 0x15, 0x38, 0x84, 0x08, 0x16, 0x52, 0x01, 0x43, 0xf3, 0x9f, 0x08, 0x15, + 0x60, 0x96, 0x08, 0x16, 0x3a, 0x01, 0x43, 0xff, 0x8a, 0x08, 0x15, 0x73, + 0x01, 0x44, 0x03, 0x95, 0x08, 0x15, 0xc1, 0x96, 0x08, 0x16, 0x12, 0x01, + 0x44, 0x07, 0xc2, 0x8c, 0x53, 0x08, 0x15, 0x89, 0xc2, 0xe6, 0x81, 0x08, + 0x16, 0x30, 0x90, 0x08, 0x15, 0x99, 0x86, 0x08, 0x15, 0xf1, 0x89, 0x08, + 0x16, 0x20, 0x9f, 0x08, 0x15, 0x08, 0x8b, 0x08, 0x16, 0x28, 0x9f, 0x08, + 0x16, 0x78, 0x9f, 0x08, 0x15, 0xe8, 0x9f, 0x08, 0x16, 0x08, 0x03, 0xc1, + 0x44, 0x0b, 0xc3, 0x0b, 0xc8, 0x08, 0x29, 0x89, 0x09, 0xc1, 0x44, 0x17, + 0x06, 0xc1, 0x44, 0x23, 0x07, 0xc1, 0x44, 0x33, 0x1c, 0xc1, 0x44, 0x3d, + 0x16, 0xc1, 0x44, 0x47, 0x05, 0xc1, 0x44, 0x59, 0x1b, 0xc1, 0x44, 0x67, + 0x0b, 0xc1, 0x44, 0x73, 0x15, 0xc1, 0x44, 0x85, 0x0e, 0xc1, 0x44, 0x8f, + 0xc4, 0xdf, 0x1f, 0x08, 0x2a, 0x01, 0x0c, 0xc1, 0x44, 0x9b, 0x0d, 0xc1, + 0x44, 0xa7, 0xc4, 0xdf, 0xa7, 0x08, 0x2a, 0x31, 0x42, 0x0f, 0x9a, 0xc1, + 0x44, 0xb3, 0xc3, 0xda, 0xa6, 0x08, 0x2a, 0x61, 0xc4, 0xe4, 0x53, 0x08, + 0x2a, 0x71, 0xc2, 0x00, 0x45, 0x08, 0x2a, 0x91, 0xc3, 0xd2, 0xb3, 0x08, + 0x2a, 0xa1, 0x12, 0xc1, 0x44, 0xbb, 0xc3, 0x07, 0x81, 0x08, 0x2a, 0xc9, + 0xc4, 0xde, 0x87, 0x08, 0x2a, 0xd8, 0xcc, 0x85, 0x1d, 0x0f, 0xb1, 0xc9, + 0xc9, 0xa9, 0x36, 0x0f, 0xb1, 0xe0, 0x07, 0xc1, 0x44, 0xc7, 0x06, 0xc1, + 0x45, 0x07, 0x03, 0xc1, 0x45, 0x47, 0x08, 0xc1, 0x45, 0x87, 0x24, 0xc1, + 0x45, 0xc7, 0x23, 0xc1, 0x46, 0x07, 0x20, 0xc1, 0x46, 0x47, 0x1f, 0xc1, + 0x46, 0x87, 0x1e, 0xc1, 0x46, 0xc7, 0x1d, 0xc1, 0x47, 0x07, 0x05, 0xc1, + 0x47, 0x47, 0x04, 0xc1, 0x47, 0x87, 0x26, 0xc1, 0x47, 0xc7, 0x25, 0xc1, + 0x48, 0x07, 0x22, 0xc1, 0x48, 0x47, 0x21, 0x41, 0x48, 0x87, 0x24, 0xc1, + 0x48, 0xc7, 0x23, 0xc1, 0x49, 0x07, 0x22, 0xc1, 0x49, 0x47, 0x21, 0xc1, + 0x49, 0x87, 0x1f, 0xc1, 0x49, 0xc7, 0x1d, 0xc1, 0x4a, 0x07, 0x08, 0xc1, + 0x4a, 0x47, 0x04, 0xc1, 0x4a, 0x87, 0x03, 0xc1, 0x4a, 0xc7, 0x26, 0xc1, + 0x4b, 0x07, 0x25, 0xc1, 0x4b, 0x47, 0x07, 0xc1, 0x4b, 0x87, 0x06, 0xc1, + 0x4b, 0xc7, 0x05, 0xc1, 0x4c, 0x07, 0x20, 0xc1, 0x4c, 0x47, 0x1e, 0x41, + 0x4c, 0x87, 0x1e, 0xc1, 0x4c, 0xc7, 0x1d, 0x41, 0x4c, 0xff, 0x06, 0xc1, + 0x4d, 0x3f, 0x05, 0xc1, 0x4d, 0x67, 0x04, 0xc1, 0x4d, 0xa7, 0x03, 0xc1, + 0x4d, 0xe7, 0x26, 0xc1, 0x4e, 0x27, 0x25, 0xc1, 0x4e, 0x67, 0x24, 0xc1, + 0x4e, 0xa7, 0x23, 0xc1, 0x4e, 0xe7, 0x22, 0xc1, 0x4f, 0x1f, 0x21, 0xc1, + 0x4f, 0x5f, 
0x20, 0xc1, 0x4f, 0x9f, 0x1f, 0xc1, 0x4f, 0xdf, 0x1e, 0xc1, + 0x50, 0x1f, 0x1d, 0x41, 0x50, 0x5f, 0x08, 0xc1, 0x50, 0x9f, 0x07, 0xc1, + 0x50, 0xdf, 0x06, 0xc1, 0x51, 0x1f, 0x05, 0xc1, 0x51, 0x5f, 0x04, 0xc1, + 0x51, 0x9f, 0x03, 0xc1, 0x51, 0xdf, 0x26, 0xc1, 0x52, 0x1f, 0x25, 0xc1, + 0x52, 0x5f, 0x24, 0xc1, 0x52, 0x9f, 0x23, 0xc1, 0x52, 0xdf, 0x22, 0xc1, + 0x53, 0x1f, 0x21, 0xc1, 0x53, 0x5f, 0x20, 0xc1, 0x53, 0x9f, 0x1f, 0xc1, + 0x53, 0xdf, 0x1e, 0xc1, 0x54, 0x1f, 0x1d, 0x41, 0x54, 0x5f, 0x92, 0x01, + 0x74, 0xc9, 0x8f, 0x01, 0x75, 0xb9, 0xc2, 0x00, 0x74, 0x01, 0x76, 0xb8, + 0xc3, 0x43, 0x08, 0x01, 0x74, 0x09, 0xc5, 0x78, 0xee, 0x01, 0x76, 0x10, + 0xc6, 0xca, 0xeb, 0x01, 0x75, 0x01, 0xc2, 0x0d, 0x10, 0x01, 0x76, 0x78, + 0x15, 0xc1, 0x54, 0x9f, 0xc4, 0x63, 0x7e, 0x01, 0x76, 0x59, 0x09, 0xc1, + 0x54, 0xbd, 0x0e, 0xc1, 0x54, 0xc9, 0x16, 0xc1, 0x54, 0xd5, 0xc4, 0x45, + 0x10, 0x01, 0x76, 0xd9, 0x08, 0xc1, 0x54, 0xe7, 0x07, 0xc1, 0x54, 0xf9, + 0xc5, 0xa0, 0x85, 0x01, 0x77, 0x11, 0xc4, 0xa3, 0x1a, 0x01, 0x77, 0x31, + 0xc6, 0x87, 0xe7, 0x01, 0x77, 0x80, 0x45, 0x71, 0x24, 0xc1, 0x55, 0x05, + 0xc2, 0x00, 0x65, 0x01, 0x74, 0x58, 0xc3, 0x05, 0x14, 0x01, 0x74, 0x61, + 0xc3, 0x02, 0x9f, 0x01, 0x74, 0x68, 0xc3, 0x21, 0xdf, 0x01, 0x74, 0x91, + 0x44, 0x4b, 0x1f, 0x41, 0x55, 0x0f, 0x49, 0x8c, 0x70, 0xc1, 0x55, 0x1b, + 0xc2, 0x8c, 0x30, 0x01, 0x75, 0x78, 0xc3, 0x05, 0x14, 0x01, 0x75, 0x61, + 0xc3, 0x02, 0x9f, 0x01, 0x75, 0x68, 0xc3, 0x05, 0x14, 0x01, 0x75, 0x21, + 0xc3, 0x02, 0x9f, 0x01, 0x75, 0x28, 0x9a, 0x01, 0x74, 0x31, 0xcb, 0x93, + 0x67, 0x01, 0x75, 0x51, 0xc2, 0x02, 0x6f, 0x01, 0x77, 0x18, 0xc3, 0x05, + 0x14, 0x01, 0x75, 0xd1, 0xc3, 0x02, 0x9f, 0x01, 0x75, 0xd8, 0xc3, 0x05, + 0x14, 0x01, 0x74, 0x71, 0x16, 0xc1, 0x55, 0x29, 0xc4, 0x09, 0x9d, 0x01, + 0x74, 0x88, 0xc3, 0x05, 0x14, 0x01, 0x76, 0x89, 0xc3, 0x02, 0x9f, 0x01, + 0x76, 0x90, 0x43, 0x0f, 0x06, 0xc1, 0x55, 0x35, 0x86, 0x01, 0x77, 0x08, + 0xc2, 0x00, 0x45, 0x01, 0x74, 0xe9, 0xc4, 0x14, 0xdd, 0x01, 0x74, 0xf9, + 0xc4, 0xd7, 0x14, 0x01, 0x75, 0xe9, 0x44, 0x0d, 0xee, 0x41, 0x55, 0x41, + 0xc2, 0x01, 0xe2, 0x01, 0x75, 0xa9, 0xc2, 0x00, 0xfe, 0x01, 0x75, 0xe0, + 0x44, 0x02, 0x11, 0xc1, 0x55, 0x4d, 0x43, 0xad, 0x64, 0x41, 0x55, 0x59, + 0xc3, 0x05, 0x14, 0x01, 0x76, 0x19, 0xc3, 0x02, 0x9f, 0x01, 0x76, 0x20, + 0xc4, 0x18, 0x10, 0x01, 0x77, 0x59, 0x16, 0xc1, 0x55, 0x65, 0xc6, 0x87, + 0xe7, 0x01, 0x77, 0x78, 0xc3, 0x05, 0x14, 0x01, 0x76, 0xe9, 0x16, 0x41, + 0x55, 0x71, 0xc2, 0x02, 0xa0, 0x01, 0x75, 0x91, 0xc4, 0x02, 0xde, 0x01, + 0x75, 0x98, 0xc3, 0x05, 0x14, 0x01, 0x75, 0xf1, 0x16, 0x41, 0x55, 0x7d, + 0x9c, 0x01, 0x8e, 0xc1, 0x89, 0x01, 0x8e, 0xf8, 0xc2, 0x47, 0xa4, 0x01, + 0x8e, 0x49, 0x9c, 0x01, 0x8e, 0xf0, 0x9c, 0x01, 0x8e, 0x2b, 0x01, 0x55, + 0x89, 0x89, 0x01, 0x8e, 0x31, 0x99, 0x01, 0x8e, 0x6b, 0x01, 0x55, 0x94, + 0x96, 0x01, 0x8e, 0x50, 0xc2, 0x47, 0xa4, 0x01, 0x8e, 0x60, 0xc5, 0x08, + 0xd9, 0x0f, 0xdc, 0xa8, 0x4d, 0x29, 0xb9, 0xc1, 0x55, 0x98, 0x47, 0x02, + 0x0e, 0x41, 0x55, 0xe7, 0xc3, 0x91, 0xe8, 0x0f, 0x9a, 0x91, 0xc9, 0xae, + 0x3d, 0x0f, 0x99, 0xc0, 0xc2, 0x02, 0x0a, 0x01, 0x02, 0x01, 0xc9, 0x33, + 0xdd, 0x00, 0x00, 0x4a, 0x01, 0x56, 0x36, 0xcf, 0x64, 0xfe, 0x0f, 0xa6, + 0x49, 0xcd, 0x7b, 0x22, 0x0f, 0xa6, 0x42, 0x01, 0x56, 0x3a, 0xc3, 0xd8, + 0xd0, 0x08, 0x8a, 0x39, 0x0e, 0xc1, 0x56, 0x40, 0xc3, 0x39, 0x6e, 0x08, + 0x89, 0x31, 0xc3, 0x82, 0xa0, 0x08, 0x89, 0x29, 0xc3, 0x14, 0x72, 0x08, + 0x89, 0x21, 0xc3, 0x47, 0xd9, 0x08, 0x89, 0x11, 0x1b, 0xc1, 0x56, 0x4c, + 0xc3, 0xc2, 0xab, 0x08, 0x88, 0xf9, 0x04, 0xc1, 0x56, 0x58, 0x12, 0xc1, + 0x56, 0x64, 
0x10, 0xc1, 0x56, 0x70, 0x06, 0xc1, 0x56, 0x88, 0x16, 0xc1, + 0x56, 0x98, 0x0c, 0xc1, 0x56, 0xa8, 0x05, 0xc1, 0x56, 0xb4, 0x09, 0xc1, + 0x56, 0xc0, 0x0d, 0xc1, 0x56, 0xcc, 0x87, 0x08, 0x88, 0x31, 0x97, 0x08, + 0x88, 0x29, 0x8b, 0x08, 0x88, 0x21, 0xc2, 0x04, 0xc6, 0x08, 0x88, 0x18, + 0x4a, 0x6f, 0xc8, 0xc1, 0x56, 0xd8, 0xc5, 0x1e, 0x96, 0x08, 0x89, 0x98, + 0xcb, 0x97, 0xf5, 0x08, 0x8a, 0x11, 0xc4, 0x19, 0x53, 0x08, 0x8a, 0x09, + 0x45, 0x09, 0x98, 0x41, 0x56, 0xfb, 0xcb, 0x45, 0x8e, 0x08, 0x8a, 0x01, + 0x44, 0x00, 0xbb, 0x41, 0x57, 0x1f, 0xc2, 0x01, 0x4a, 0x05, 0x51, 0xb1, + 0xc2, 0x00, 0xdb, 0x05, 0x51, 0xa9, 0xc2, 0x00, 0x39, 0x05, 0x51, 0xa1, + 0xc2, 0x19, 0x2c, 0x05, 0x51, 0x99, 0x46, 0x26, 0xf7, 0x41, 0x57, 0x31, + 0x97, 0x05, 0x51, 0x6b, 0x01, 0x57, 0x3f, 0x03, 0xc1, 0x57, 0x43, 0x91, + 0x05, 0x51, 0x7b, 0x01, 0x57, 0x4f, 0xc2, 0x06, 0xdb, 0x05, 0x51, 0x61, + 0x8b, 0x05, 0x51, 0x52, 0x01, 0x57, 0x53, 0xc2, 0x00, 0xd0, 0x05, 0x51, + 0x41, 0x15, 0xc1, 0x57, 0x57, 0x10, 0xc1, 0x57, 0x61, 0x09, 0xc1, 0x57, + 0x73, 0x0d, 0xc1, 0x57, 0x7d, 0x91, 0x05, 0x50, 0x29, 0x83, 0x05, 0x50, + 0x03, 0x01, 0x57, 0x87, 0x87, 0x05, 0x50, 0x19, 0x46, 0x26, 0xf7, 0xc1, + 0x57, 0x8b, 0xc2, 0x02, 0x41, 0x05, 0x51, 0x29, 0xc2, 0x00, 0xdb, 0x05, + 0x51, 0x21, 0xc2, 0x00, 0x39, 0x05, 0x51, 0x19, 0xc2, 0x19, 0x2c, 0x05, + 0x51, 0x11, 0x04, 0xc1, 0x57, 0xba, 0x0f, 0xc1, 0x57, 0xca, 0x12, 0xc1, + 0x57, 0xd4, 0x06, 0xc1, 0x57, 0xe4, 0x16, 0xc1, 0x57, 0xf4, 0x0c, 0xc1, + 0x57, 0xfe, 0x42, 0x11, 0xee, 0xc1, 0x58, 0x08, 0x97, 0x05, 0x50, 0x11, + 0x8b, 0x05, 0x50, 0x08, 0xcc, 0x86, 0x19, 0x05, 0x52, 0xf9, 0x06, 0xc1, + 0x58, 0x12, 0xc6, 0x99, 0x4e, 0x05, 0x52, 0xe0, 0xc4, 0x26, 0x78, 0x05, + 0x52, 0xc9, 0xc5, 0x06, 0xdb, 0x05, 0x52, 0xc1, 0x15, 0xc1, 0x58, 0x1e, + 0x08, 0xc1, 0x58, 0x2a, 0x16, 0xc1, 0x58, 0x36, 0xc4, 0x15, 0xe7, 0x05, + 0x52, 0x81, 0xc3, 0x05, 0x14, 0x05, 0x52, 0x88, 0xc3, 0x05, 0x14, 0x08, + 0x7e, 0x2b, 0x01, 0x58, 0x42, 0x16, 0xc1, 0x58, 0x48, 0xc4, 0x09, 0x9d, + 0x08, 0x7e, 0x40, 0xc3, 0xb5, 0x3e, 0x08, 0x7e, 0x21, 0x15, 0xc1, 0x58, + 0x58, 0xc4, 0xe0, 0xe7, 0x08, 0x7d, 0xd9, 0xc4, 0x4a, 0xb9, 0x08, 0x7d, + 0xd1, 0xc2, 0x01, 0x7f, 0x08, 0x7d, 0xab, 0x01, 0x58, 0x6a, 0xc5, 0x4a, + 0xb3, 0x08, 0x7d, 0xc1, 0xca, 0xa5, 0x26, 0x08, 0x7d, 0xb9, 0xc3, 0x7e, + 0x89, 0x08, 0x7d, 0xb1, 0xc6, 0x40, 0x9a, 0x08, 0x7d, 0xa1, 0xc5, 0x9c, + 0xa2, 0x08, 0x7d, 0x99, 0xc4, 0xe3, 0x27, 0x08, 0x7d, 0x91, 0x03, 0xc1, + 0x58, 0x70, 0xc6, 0xcf, 0xd7, 0x08, 0x7d, 0xe1, 0xc3, 0x00, 0x4e, 0x08, + 0x7d, 0xe9, 0xc3, 0x20, 0x18, 0x08, 0x7d, 0xf1, 0xc2, 0x00, 0x67, 0x08, + 0x7e, 0x09, 0xc4, 0x5d, 0xe2, 0x08, 0x7e, 0x10, 0xc4, 0x01, 0xc3, 0x01, + 0x3a, 0x61, 0x43, 0x00, 0x55, 0xc1, 0x58, 0x7c, 0x12, 0x41, 0x58, 0x88, + 0xc6, 0xd3, 0xc1, 0x01, 0x34, 0xa1, 0xc5, 0xd4, 0x3e, 0x0f, 0x9c, 0x61, + 0x47, 0x53, 0xfe, 0x41, 0x58, 0x97, 0x51, 0x4f, 0x69, 0xc1, 0x58, 0x9d, + 0x14, 0x41, 0x59, 0x0e, 0x48, 0x5b, 0x32, 0xc1, 0x59, 0x18, 0x10, 0xc1, + 0x59, 0x24, 0x4f, 0x66, 0xcf, 0xc1, 0x59, 0x30, 0x44, 0x31, 0xef, 0x41, + 0x59, 0x3c, 0x0b, 0xc1, 0x59, 0x44, 0x07, 0x41, 0x59, 0x50, 0x43, 0x00, + 0x4a, 0xc1, 0x59, 0x5c, 0x11, 0xc1, 0x59, 0x66, 0x45, 0x0b, 0x12, 0xc1, + 0x59, 0x72, 0x42, 0x00, 0x2d, 0x41, 0x59, 0x7e, 0x43, 0x06, 0xa8, 0xc1, + 0x59, 0x8a, 0xcf, 0x64, 0x0e, 0x00, 0xd5, 0xb0, 0x46, 0x18, 0x54, 0xc1, + 0x59, 0x96, 0xcf, 0x0e, 0x7d, 0x01, 0x06, 0xd9, 0xc4, 0x1e, 0xc9, 0x00, + 0x18, 0x1b, 0x01, 0x59, 0xa8, 0xd1, 0x52, 0x55, 0x00, 0x18, 0x90, 0x11, + 0xc1, 0x59, 0xac, 0x07, 0xc1, 0x59, 0xbc, 0xc8, 0x20, 0xa9, 0x00, 0x18, + 0x42, 0x01, 
0x59, 0xc8, 0x49, 0xa8, 0x70, 0xc1, 0x59, 0xd4, 0xd0, 0x5e, + 0xf2, 0x00, 0x1a, 0x38, 0xce, 0x3b, 0x7a, 0x01, 0x06, 0xe1, 0xc6, 0xcf, + 0xef, 0x00, 0x1a, 0x90, 0x49, 0x05, 0xf9, 0xc1, 0x59, 0xf3, 0x48, 0xba, + 0x9a, 0xc1, 0x59, 0xff, 0xd0, 0x08, 0xf7, 0x00, 0x18, 0x13, 0x01, 0x5a, + 0x2b, 0x03, 0xc1, 0x5a, 0x31, 0x11, 0xc1, 0x5a, 0x40, 0xc6, 0xbd, 0xf4, + 0x00, 0x19, 0x38, 0x45, 0x2e, 0xef, 0xc1, 0x5a, 0x4f, 0xce, 0x6c, 0x98, + 0x00, 0xee, 0x19, 0xca, 0xa2, 0x4c, 0x00, 0xee, 0x11, 0x47, 0x25, 0xae, + 0xc1, 0x5a, 0x59, 0x16, 0xc1, 0x5a, 0x65, 0xcc, 0x84, 0x81, 0x00, 0x19, + 0xe0, 0xca, 0xa0, 0x6c, 0x08, 0x99, 0xd9, 0x14, 0x41, 0x5a, 0x6b, 0x4b, + 0x94, 0xe8, 0xc1, 0x5a, 0x7a, 0x50, 0x5c, 0x02, 0x41, 0x5a, 0x86, 0x12, + 0xc1, 0x5a, 0x92, 0xc7, 0x04, 0xed, 0x00, 0xee, 0x91, 0xc7, 0x0a, 0x80, + 0x00, 0xee, 0x88, 0xc7, 0x05, 0x00, 0x00, 0xee, 0x81, 0x10, 0x41, 0x5a, + 0x9e, 0xc5, 0x05, 0x02, 0x00, 0xee, 0x79, 0xc5, 0x00, 0xd4, 0x00, 0x1a, + 0xd8, 0xc5, 0xcc, 0x90, 0x00, 0x19, 0x43, 0x01, 0x5a, 0xaa, 0xce, 0x6d, + 0xf6, 0x00, 0xd5, 0xb9, 0xc7, 0x7d, 0xa5, 0x00, 0x18, 0x29, 0x51, 0x52, + 0x33, 0x41, 0x5a, 0xb0, 0xc5, 0x60, 0xb2, 0x00, 0x18, 0x23, 0x01, 0x5a, + 0xce, 0xcf, 0x68, 0x55, 0x00, 0x19, 0x00, 0x49, 0x60, 0xf4, 0xc1, 0x5a, + 0xd6, 0x03, 0x41, 0x5a, 0xe2, 0xd0, 0x5d, 0xa2, 0x00, 0xd6, 0x31, 0xce, + 0x70, 0xc0, 0x00, 0x1a, 0x50, 0xc8, 0xbb, 0x12, 0x00, 0xd5, 0xa9, 0x00, + 0x41, 0x5a, 0xee, 0xc8, 0x9e, 0x5c, 0x00, 0x18, 0x49, 0xc2, 0x00, 0xc0, + 0x00, 0x18, 0xd9, 0xce, 0x6b, 0xf0, 0x00, 0x1a, 0x58, 0x45, 0x02, 0x6d, + 0xc1, 0x5a, 0xfa, 0xc5, 0x1e, 0xc8, 0x00, 0x19, 0xf0, 0xca, 0x8d, 0xb1, + 0x01, 0x02, 0x91, 0xc2, 0x00, 0xfe, 0x00, 0x02, 0x00, 0x4b, 0x93, 0x04, + 0xc1, 0x5b, 0x06, 0x4b, 0x99, 0xef, 0x41, 0x5b, 0x24, 0xc4, 0xde, 0xbf, + 0x01, 0x19, 0xa9, 0xc4, 0xe3, 0x37, 0x01, 0x19, 0xa0, 0x45, 0x00, 0x8c, + 0xc1, 0x5b, 0x42, 0x43, 0x54, 0xfc, 0x41, 0x5b, 0x54, 0xc5, 0xdc, 0x86, + 0x0f, 0x9c, 0xd9, 0xd3, 0x42, 0x09, 0x00, 0x04, 0xd8, 0xc6, 0x0e, 0xbd, + 0x01, 0x12, 0xa1, 0xc4, 0x00, 0xba, 0x01, 0x05, 0x08, 0x4c, 0x29, 0xba, + 0xc1, 0x5b, 0x63, 0x46, 0x10, 0x79, 0x41, 0x5b, 0xd0, 0x4e, 0x0b, 0x18, + 0xc1, 0x5b, 0xea, 0x49, 0x29, 0x29, 0x41, 0x5c, 0x57, 0xce, 0x74, 0x4e, + 0x08, 0x17, 0x01, 0x46, 0x09, 0x97, 0xc1, 0x5c, 0x63, 0x47, 0x34, 0x2f, + 0x41, 0x5c, 0x81, 0xc9, 0x11, 0xf6, 0x01, 0x67, 0xc9, 0xd4, 0x2f, 0xe2, + 0x01, 0x67, 0xd1, 0xd6, 0x2f, 0xe0, 0x01, 0x67, 0xd9, 0xcd, 0x4b, 0xac, + 0x01, 0x67, 0xe0, 0xd0, 0x53, 0xaa, 0x01, 0x67, 0xe9, 0xc8, 0x11, 0xf7, + 0x01, 0x67, 0xf0, 0xcd, 0x80, 0x02, 0x0f, 0xa8, 0x81, 0x4d, 0x7f, 0x32, + 0xc1, 0x5c, 0x9f, 0xc4, 0xe3, 0x33, 0x0f, 0xa6, 0xa9, 0x17, 0xc1, 0x5c, + 0xab, 0xd8, 0x24, 0xfb, 0x01, 0x52, 0x69, 0x42, 0x06, 0x62, 0x41, 0x5c, + 0xba, 0xd3, 0x41, 0x97, 0x01, 0x3f, 0x99, 0x05, 0xc1, 0x5c, 0xcc, 0xc8, + 0x1e, 0x3f, 0x01, 0x11, 0x89, 0xd1, 0x05, 0x75, 0x01, 0x0d, 0xd9, 0x16, + 0xc1, 0x5c, 0xd8, 0x45, 0x00, 0x2c, 0xc1, 0x5c, 0xe4, 0x48, 0x03, 0xc8, + 0x41, 0x5c, 0xf0, 0x16, 0xc1, 0x5c, 0xf6, 0x07, 0xc1, 0x5d, 0x06, 0x44, + 0x26, 0x78, 0xc1, 0x5d, 0x12, 0x15, 0xc1, 0x5d, 0x1e, 0x08, 0xc1, 0x5d, + 0x2a, 0x43, 0x05, 0x14, 0x41, 0x5d, 0x36, 0xc9, 0xad, 0xe3, 0x0f, 0x99, + 0x49, 0xc4, 0x2a, 0x90, 0x0f, 0x99, 0x41, 0xc4, 0x27, 0x54, 0x0f, 0x99, + 0x39, 0xc7, 0xc2, 0x34, 0x0f, 0x99, 0x50, 0x05, 0xc1, 0x5d, 0x42, 0x0a, + 0xc1, 0x5d, 0x56, 0xde, 0x0f, 0x7c, 0x01, 0x3a, 0x11, 0x19, 0xc1, 0x5d, + 0x6e, 0x06, 0xc1, 0x5d, 0x78, 0x0e, 0xc1, 0x5d, 0x86, 0x47, 0x34, 0x2f, + 0xc1, 0x5d, 0x92, 0x16, 0xc1, 0x5d, 0xa8, 0xc6, 0x0e, 0xbd, 0x01, 0x14, + 0xe1, 0x03, 
0xc1, 0x5d, 0xb7, 0x14, 0xc1, 0x5d, 0xc3, 0x0f, 0xc1, 0x5d, + 0xcf, 0x12, 0xc1, 0x5d, 0xdb, 0x0b, 0xc1, 0x5d, 0xf3, 0xcc, 0x07, 0xc7, + 0x01, 0x4e, 0x09, 0x04, 0xc1, 0x5e, 0x05, 0xcc, 0x07, 0xbb, 0x01, 0x4d, + 0xb1, 0x9a, 0x01, 0x5d, 0xf1, 0xcf, 0x69, 0xcc, 0x0f, 0x88, 0x69, 0xc6, + 0x0b, 0x09, 0x0f, 0xbe, 0xb9, 0x0d, 0x41, 0x5e, 0x11, 0x45, 0x00, 0x8c, + 0xc1, 0x5e, 0x1d, 0x5e, 0x0e, 0xe6, 0x41, 0x5e, 0x47, 0x97, 0x09, 0x1b, + 0x53, 0x01, 0x5e, 0x4d, 0x83, 0x09, 0x1a, 0xeb, 0x01, 0x5e, 0x64, 0x8b, + 0x09, 0x1b, 0x1b, 0x01, 0x5e, 0x76, 0xc2, 0x8d, 0xc6, 0x09, 0x1b, 0x10, + 0x94, 0x09, 0x19, 0x43, 0x01, 0x5e, 0x91, 0x00, 0xc1, 0x5e, 0xae, 0x8f, + 0x09, 0x18, 0xeb, 0x01, 0x5e, 0xc1, 0x1c, 0xc1, 0x5e, 0xd6, 0xc4, 0xde, + 0x97, 0x09, 0x1a, 0xc9, 0xc2, 0x01, 0xe2, 0x09, 0x1a, 0x8b, 0x01, 0x5e, + 0xe1, 0x90, 0x09, 0x19, 0x33, 0x01, 0x5e, 0xf5, 0x86, 0x09, 0x18, 0x9b, + 0x01, 0x5e, 0xfb, 0x84, 0x09, 0x18, 0x91, 0x9f, 0x09, 0x18, 0x88, 0x97, + 0x09, 0x18, 0x2b, 0x01, 0x5f, 0x05, 0x83, 0x09, 0x17, 0x5b, 0x01, 0x5f, + 0x1d, 0x8b, 0x09, 0x17, 0xf3, 0x01, 0x5f, 0x3c, 0x87, 0x09, 0x17, 0xe2, + 0x01, 0x5f, 0x51, 0x8b, 0x09, 0x16, 0xdb, 0x01, 0x5f, 0x57, 0x0a, 0xc1, + 0x5f, 0x6e, 0x83, 0x09, 0x14, 0x9b, 0x01, 0x5f, 0x87, 0x97, 0x09, 0x17, + 0x12, 0x01, 0x5f, 0x9f, 0x8b, 0x09, 0x12, 0x63, 0x01, 0x5f, 0xc0, 0x97, + 0x09, 0x13, 0x0b, 0x01, 0x5f, 0xde, 0x83, 0x09, 0x11, 0xf3, 0x01, 0x5f, + 0xee, 0x87, 0x09, 0x12, 0x42, 0x01, 0x60, 0x06, 0x97, 0x09, 0x11, 0x63, + 0x01, 0x60, 0x0a, 0x8b, 0x09, 0x11, 0x53, 0x01, 0x60, 0x2c, 0x87, 0x09, + 0x11, 0x43, 0x01, 0x60, 0x36, 0x83, 0x09, 0x11, 0x02, 0x01, 0x60, 0x3d, + 0x97, 0x09, 0x0f, 0xdb, 0x01, 0x60, 0x56, 0x83, 0x09, 0x0d, 0xbb, 0x01, + 0x60, 0x7f, 0x8b, 0x09, 0x0f, 0xba, 0x01, 0x60, 0x9f, 0x83, 0x09, 0x0a, + 0xbb, 0x01, 0x60, 0xaf, 0xc5, 0xd5, 0xf6, 0x09, 0x0d, 0xb1, 0x97, 0x09, + 0x0d, 0x53, 0x01, 0x60, 0xe5, 0x8b, 0x09, 0x0d, 0x03, 0x01, 0x61, 0x12, + 0xc4, 0x73, 0x32, 0x09, 0x0c, 0xf8, 0x8b, 0x09, 0x09, 0x6b, 0x01, 0x61, + 0x24, 0x83, 0x09, 0x09, 0x4b, 0x01, 0x61, 0x2a, 0x97, 0x09, 0x09, 0xba, + 0x01, 0x61, 0x32, 0x97, 0x09, 0x08, 0xb3, 0x01, 0x61, 0x47, 0x8b, 0x09, + 0x08, 0x03, 0x01, 0x61, 0x6d, 0x07, 0xc1, 0x61, 0x8a, 0x83, 0x09, 0x05, + 0xaa, 0x01, 0x61, 0x99, 0xc3, 0x0a, 0xe2, 0x09, 0x05, 0x0b, 0x01, 0x61, + 0xd5, 0xc3, 0x05, 0x4e, 0x09, 0x05, 0x03, 0x01, 0x61, 0xd9, 0x14, 0xc1, + 0x61, 0xdf, 0x9f, 0x09, 0x04, 0x6b, 0x01, 0x61, 0xee, 0x90, 0x09, 0x04, + 0xbb, 0x01, 0x61, 0xf4, 0x8e, 0x09, 0x04, 0xb1, 0xc3, 0xe0, 0x5f, 0x09, + 0x04, 0xa9, 0xc3, 0x03, 0x30, 0x09, 0x04, 0xa1, 0x00, 0x41, 0x61, 0xf8, + 0x97, 0x09, 0x03, 0xd3, 0x01, 0x62, 0x04, 0x8b, 0x09, 0x03, 0x93, 0x01, + 0x62, 0x27, 0x83, 0x09, 0x02, 0xaa, 0x01, 0x62, 0x42, 0x97, 0x09, 0x02, + 0x6b, 0x01, 0x62, 0x5a, 0x83, 0x09, 0x02, 0x03, 0x01, 0x62, 0x6e, 0x8b, + 0x09, 0x02, 0x4a, 0x01, 0x62, 0x92, 0x86, 0x09, 0x00, 0xe3, 0x01, 0x62, + 0x98, 0x84, 0x09, 0x00, 0x53, 0x01, 0x62, 0x9e, 0xc3, 0x01, 0xc3, 0x09, + 0x01, 0x5b, 0x01, 0x62, 0xa9, 0x15, 0xc1, 0x62, 0xaf, 0x14, 0xc1, 0x62, + 0xbc, 0xc3, 0x0e, 0x61, 0x09, 0x01, 0x99, 0x90, 0x09, 0x01, 0x6b, 0x01, + 0x62, 0xcb, 0x8e, 0x09, 0x01, 0x03, 0x01, 0x62, 0xd5, 0x8d, 0x09, 0x00, + 0xeb, 0x01, 0x62, 0xe7, 0x9f, 0x09, 0x00, 0x49, 0x47, 0x03, 0x4c, 0x41, + 0x62, 0xed, 0x8b, 0x09, 0x13, 0xfb, 0x01, 0x63, 0x1b, 0xc4, 0x73, 0x32, + 0x09, 0x13, 0xf3, 0x01, 0x63, 0x23, 0x83, 0x09, 0x13, 0xd2, 0x01, 0x63, + 0x29, 0x97, 0x09, 0x14, 0x91, 0x8b, 0x09, 0x14, 0x89, 0x83, 0x09, 0x14, + 0x7a, 0x01, 0x63, 0x35, 0xc2, 0x01, 0xe2, 0x09, 0x0a, 0xb1, 0x94, 0x09, + 0x0a, 0xa9, 
0x90, 0x09, 0x0a, 0xa1, 0x8f, 0x09, 0x0a, 0x73, 0x01, 0x63, + 0x39, 0x8e, 0x09, 0x0a, 0x5b, 0x01, 0x63, 0x43, 0x89, 0x09, 0x0a, 0x2b, + 0x01, 0x63, 0x4d, 0xc3, 0x7e, 0x08, 0x09, 0x0a, 0x13, 0x01, 0x63, 0x54, + 0x84, 0x09, 0x0a, 0x09, 0xc2, 0x00, 0xd3, 0x09, 0x0a, 0x00, 0xc9, 0xa8, + 0xd3, 0x09, 0x23, 0xa1, 0xc8, 0xbd, 0xe2, 0x09, 0x23, 0x99, 0xc5, 0x33, + 0x24, 0x09, 0x23, 0x90, 0x43, 0x02, 0x6f, 0xc1, 0x63, 0x5a, 0x44, 0xe0, + 0x57, 0x41, 0x63, 0x82, 0x45, 0x00, 0x2d, 0xc1, 0x63, 0x8e, 0x47, 0xc0, + 0x43, 0x41, 0x63, 0xb6, 0x45, 0x1b, 0xec, 0xc1, 0x63, 0xc6, 0x43, 0x4d, + 0x57, 0xc1, 0x63, 0xeb, 0x54, 0x38, 0x68, 0x41, 0x64, 0x13, 0x44, 0x0d, + 0x14, 0xc1, 0x64, 0x1f, 0x44, 0x09, 0x9e, 0x41, 0x64, 0x43, 0x43, 0x02, + 0x6f, 0xc1, 0x64, 0x72, 0x50, 0x5b, 0x82, 0x41, 0x64, 0x98, 0x43, 0x02, + 0xa0, 0xc1, 0x64, 0xa4, 0x45, 0x02, 0xde, 0x41, 0x64, 0xc9, 0x42, 0x01, + 0xc8, 0xc1, 0x64, 0xee, 0xd1, 0x57, 0x2e, 0x01, 0x1d, 0x50, 0xc8, 0xb7, + 0x32, 0x0f, 0xa5, 0x89, 0xc4, 0x00, 0xba, 0x00, 0x05, 0x20, 0xc8, 0x7d, + 0xa4, 0x07, 0xf2, 0x51, 0xc8, 0x80, 0x2e, 0x07, 0xf2, 0x70, 0x9f, 0x09, + 0x7f, 0x91, 0x9e, 0x09, 0x7f, 0x88, 0x1e, 0xc1, 0x64, 0xfa, 0x1d, 0x41, + 0x65, 0x06, 0x26, 0xc1, 0x65, 0x2a, 0x25, 0xc1, 0x65, 0x4e, 0x24, 0xc1, + 0x65, 0x76, 0x23, 0xc1, 0x65, 0x9d, 0x22, 0xc1, 0x65, 0xc1, 0x21, 0xc1, + 0x65, 0xe5, 0x20, 0xc1, 0x65, 0xfd, 0x1f, 0xc1, 0x66, 0x1d, 0x1e, 0xc1, + 0x66, 0x3d, 0x1d, 0x41, 0x66, 0x5c, 0x87, 0x08, 0x41, 0x99, 0x8b, 0x08, + 0x41, 0xa1, 0x91, 0x08, 0x41, 0xa9, 0x83, 0x08, 0x41, 0x90, 0x83, 0x08, + 0x41, 0xb9, 0x87, 0x08, 0x41, 0xc0, 0x83, 0x08, 0x41, 0xe1, 0x91, 0x08, + 0x41, 0xf8, 0x83, 0x08, 0x40, 0x29, 0x91, 0x08, 0x40, 0x40, 0x83, 0x08, + 0x40, 0x51, 0x87, 0x08, 0x40, 0x59, 0x8b, 0x08, 0x40, 0x61, 0x91, 0x08, + 0x40, 0x69, 0x97, 0x08, 0x40, 0x70, 0x83, 0x08, 0x40, 0x79, 0x87, 0x08, + 0x40, 0x81, 0x8b, 0x08, 0x40, 0x89, 0x91, 0x08, 0x40, 0x91, 0x97, 0x08, + 0x40, 0x98, 0x83, 0x08, 0x40, 0xa1, 0x87, 0x08, 0x40, 0xa9, 0x8b, 0x08, + 0x40, 0xb1, 0x91, 0x08, 0x40, 0xb9, 0x97, 0x08, 0x40, 0xc0, 0x83, 0x08, + 0x40, 0xc9, 0x87, 0x08, 0x40, 0xd1, 0x8b, 0x08, 0x40, 0xd9, 0x91, 0x08, + 0x40, 0xe1, 0x97, 0x08, 0x40, 0xe8, 0x83, 0x08, 0x40, 0xf1, 0x87, 0x08, + 0x40, 0xf9, 0x8b, 0x08, 0x41, 0x01, 0x91, 0x08, 0x41, 0x09, 0x97, 0x08, + 0x41, 0x10, 0x83, 0x08, 0x41, 0x19, 0x87, 0x08, 0x41, 0x21, 0x8b, 0x08, + 0x41, 0x29, 0x91, 0x08, 0x41, 0x31, 0x97, 0x08, 0x41, 0x38, 0x83, 0x08, + 0x41, 0x41, 0x87, 0x08, 0x41, 0x49, 0x8b, 0x08, 0x41, 0x51, 0x91, 0x08, + 0x41, 0x59, 0x97, 0x08, 0x41, 0x60, 0x83, 0x08, 0x41, 0x69, 0x87, 0x08, + 0x41, 0x71, 0x8b, 0x08, 0x41, 0x79, 0x91, 0x08, 0x41, 0x81, 0x97, 0x08, + 0x41, 0x88, 0x97, 0x00, 0x22, 0x1b, 0x01, 0x66, 0x7c, 0x16, 0xc1, 0x66, + 0x8f, 0x19, 0xc1, 0x66, 0xb2, 0x10, 0xc1, 0x66, 0xbc, 0x0e, 0xc1, 0x66, + 0xce, 0x14, 0xc1, 0x66, 0xe6, 0x87, 0x00, 0x22, 0x6b, 0x01, 0x66, 0xf8, + 0x06, 0xc1, 0x67, 0x25, 0x15, 0xc1, 0x67, 0x48, 0x12, 0xc1, 0x67, 0x6a, + 0x83, 0x00, 0x21, 0x83, 0x01, 0x67, 0x7d, 0xc2, 0x0f, 0x9a, 0x00, 0x28, + 0xd9, 0x1b, 0xc1, 0x67, 0x8f, 0x0d, 0xc1, 0x67, 0xab, 0x0a, 0xc1, 0x67, + 0xc8, 0x09, 0xc1, 0x67, 0xd5, 0x04, 0xc1, 0x67, 0xe4, 0x91, 0x00, 0x21, + 0xf3, 0x01, 0x68, 0x02, 0x8b, 0x00, 0x21, 0xc3, 0x01, 0x68, 0x15, 0x1c, + 0xc1, 0x68, 0x32, 0x05, 0xc1, 0x68, 0x3d, 0x44, 0x13, 0x35, 0xc1, 0x68, + 0x58, 0xc2, 0x00, 0x5f, 0x00, 0x21, 0x91, 0xc2, 0x1c, 0x52, 0x00, 0x22, + 0xc1, 0xc4, 0xe0, 0x1b, 0x00, 0x23, 0x98, 0xc4, 0xe2, 0x37, 0x00, 0x26, + 0xa9, 0xc6, 0xcf, 0xe9, 0x00, 0x25, 0xa9, 0xc6, 0xce, 0xb7, 0x00, 0x25, + 0x28, 0x87, 
0x00, 0x21, 0x6b, 0x01, 0x68, 0x64, 0x06, 0xc1, 0x68, 0x91, + 0x15, 0xc1, 0x68, 0xb4, 0x12, 0xc1, 0x68, 0xd6, 0x83, 0x00, 0x20, 0x83, + 0x01, 0x68, 0xe3, 0xc2, 0x00, 0x28, 0x00, 0x28, 0xe1, 0xc2, 0x0f, 0x9a, + 0x00, 0x28, 0xd1, 0x1b, 0xc1, 0x68, 0xf5, 0x14, 0xc1, 0x69, 0x11, 0x0e, + 0xc1, 0x69, 0x23, 0x0d, 0xc1, 0x69, 0x35, 0x0a, 0xc1, 0x69, 0x52, 0x09, + 0xc1, 0x69, 0x5f, 0x05, 0xc1, 0x69, 0x6e, 0x97, 0x00, 0x21, 0x1b, 0x01, + 0x69, 0x89, 0x04, 0xc1, 0x69, 0x96, 0x91, 0x00, 0x20, 0xf3, 0x01, 0x69, + 0xb4, 0x8b, 0x00, 0x20, 0xc3, 0x01, 0x69, 0xc7, 0x1c, 0xc1, 0x69, 0xe4, + 0x16, 0xc1, 0x69, 0xef, 0xc2, 0x1c, 0x52, 0x00, 0x20, 0x41, 0x10, 0xc1, + 0x6a, 0x06, 0xc2, 0x00, 0x5f, 0x00, 0x20, 0x91, 0x44, 0x13, 0x35, 0xc1, + 0x6a, 0x12, 0xc4, 0xe0, 0x1b, 0x00, 0x23, 0x90, 0xc4, 0xe2, 0x37, 0x00, + 0x26, 0xa1, 0xc6, 0xcf, 0xe9, 0x00, 0x25, 0xa1, 0xc6, 0xce, 0xb7, 0x00, + 0x25, 0x20, 0xc2, 0x02, 0xa0, 0x0f, 0xdf, 0x91, 0xc4, 0x02, 0xde, 0x0f, + 0xdf, 0x98, 0xc3, 0x09, 0x9e, 0x0f, 0xdf, 0xa1, 0xc3, 0x0d, 0x14, 0x0f, + 0xdf, 0xa8, 0xc2, 0x22, 0xcc, 0x0f, 0xdf, 0xb1, 0xc4, 0x18, 0x10, 0x0f, + 0xdf, 0xb8, 0xa0, 0x00, 0x04, 0x79, 0x9f, 0x00, 0x04, 0x70, 0x47, 0xc2, + 0x50, 0xc1, 0x6a, 0x1e, 0x43, 0x00, 0x2c, 0xc1, 0x6a, 0x2a, 0x0e, 0xc1, + 0x6a, 0x30, 0xde, 0x0f, 0xb8, 0x01, 0x00, 0xd9, 0xd4, 0x3e, 0xd0, 0x00, + 0x04, 0xd0, 0x47, 0x34, 0x2f, 0xc1, 0x6a, 0x3a, 0x46, 0x09, 0x97, 0x41, + 0x6a, 0x58, 0xcb, 0x1e, 0x89, 0x00, 0x6c, 0x09, 0x03, 0xc1, 0x6a, 0x76, + 0xc9, 0xb2, 0x24, 0x00, 0x6c, 0x18, 0x46, 0x02, 0x0f, 0xc1, 0x6a, 0x82, + 0x4a, 0x9d, 0xec, 0x41, 0x6a, 0xd0, 0xca, 0x63, 0xc8, 0x00, 0x6e, 0x79, + 0x0d, 0xc1, 0x6a, 0xf4, 0x45, 0x63, 0xc3, 0xc1, 0x6b, 0x00, 0x42, 0x01, + 0x30, 0x41, 0x6b, 0x1e, 0x47, 0x01, 0xbb, 0xc1, 0x6b, 0x2a, 0x43, 0x46, + 0xac, 0x41, 0x6b, 0x34, 0x0b, 0xc1, 0x6b, 0x46, 0xc8, 0x11, 0xf7, 0x0e, + 0xd4, 0x41, 0x0e, 0xc1, 0x6b, 0x52, 0x48, 0xb8, 0x0a, 0xc1, 0x6b, 0x5e, + 0x5c, 0x12, 0x39, 0x41, 0x6b, 0x70, 0x11, 0xc1, 0x6b, 0x7f, 0x46, 0x94, + 0x69, 0x41, 0x6b, 0x8b, 0xc8, 0x52, 0x00, 0x0e, 0xd4, 0x49, 0x48, 0x18, + 0xb0, 0xc1, 0x6b, 0x9d, 0x47, 0xc0, 0x12, 0xc1, 0x6b, 0xa9, 0x47, 0xc6, + 0xe8, 0xc1, 0x6b, 0xb9, 0x46, 0xd0, 0xb5, 0x41, 0x6b, 0xc5, 0x47, 0x7f, + 0x5a, 0xc1, 0x6b, 0xd7, 0x0b, 0x41, 0x6b, 0xdf, 0xe0, 0x00, 0x67, 0x0e, + 0xd3, 0xa8, 0x11, 0xc1, 0x6b, 0xe9, 0x07, 0xc1, 0x6b, 0xfb, 0x46, 0xcd, + 0x13, 0x41, 0x6c, 0x0a, 0xc9, 0xaa, 0xb9, 0x0e, 0xd3, 0x61, 0xc3, 0x10, + 0xa1, 0x0e, 0xd1, 0x81, 0x42, 0x0c, 0x43, 0x41, 0x6c, 0x16, 0x03, 0xc1, + 0x6c, 0x32, 0xc3, 0x01, 0x9c, 0x0e, 0xcf, 0xfa, 0x01, 0x6c, 0x3e, 0xc3, + 0x6b, 0x04, 0x0e, 0xd3, 0x51, 0x44, 0x12, 0x51, 0x41, 0x6c, 0x42, 0x47, + 0xc3, 0xdf, 0xc1, 0x6c, 0x52, 0x44, 0x1a, 0x39, 0x41, 0x6c, 0x6a, 0x45, + 0xdb, 0x37, 0xc1, 0x6c, 0x9e, 0x44, 0xdc, 0x0a, 0x41, 0x6c, 0xaa, 0x44, + 0xcf, 0x23, 0xc1, 0x6c, 0xbc, 0x44, 0x87, 0x15, 0x41, 0x6c, 0xc8, 0x4f, + 0x61, 0xa7, 0xc1, 0x6c, 0xd4, 0x47, 0xc6, 0x55, 0x41, 0x6c, 0xe6, 0xc7, + 0x0b, 0xc8, 0x0e, 0xc8, 0x51, 0xc8, 0x3b, 0xec, 0x0e, 0xc8, 0x49, 0xc6, + 0x24, 0x3b, 0x0e, 0xc8, 0x40, 0xca, 0x22, 0x51, 0x01, 0x39, 0xb1, 0xd4, + 0x3e, 0xbc, 0x0f, 0xa9, 0x79, 0xcd, 0x0e, 0x61, 0x0f, 0xbe, 0x68, 0x03, + 0xc1, 0x6d, 0x0e, 0x91, 0x08, 0xad, 0xd1, 0x87, 0x08, 0xad, 0xc1, 0xc9, + 0xb2, 0x2d, 0x08, 0xad, 0xa3, 0x01, 0x6d, 0x23, 0x97, 0x08, 0xad, 0x93, + 0x01, 0x6d, 0x27, 0x8b, 0x08, 0xad, 0x82, 0x01, 0x6d, 0x2b, 0x83, 0x08, + 0xac, 0x03, 0x01, 0x6d, 0x2f, 0x16, 0xc1, 0x6d, 0x41, 0xc2, 0x00, 0xd0, + 0x08, 0xad, 0x71, 0x15, 0xc1, 0x6d, 0x56, 0x18, 0xc1, 0x6d, 0x66, 0xc2, + 0x00, 0xdb, 
0x08, 0xad, 0x49, 0xc2, 0x00, 0x39, 0x08, 0xad, 0x41, 0xc2, + 0x19, 0x2c, 0x08, 0xad, 0x39, 0xc2, 0x01, 0xc3, 0x08, 0xad, 0x31, 0x04, + 0xc1, 0x6d, 0x70, 0x12, 0xc1, 0x6d, 0x7a, 0x10, 0xc1, 0x6d, 0x84, 0x06, + 0xc1, 0x6d, 0x9a, 0x0c, 0xc1, 0x6d, 0xa8, 0x05, 0xc1, 0x6d, 0xb2, 0x09, + 0xc1, 0x6d, 0xbc, 0x0d, 0xc1, 0x6d, 0xc6, 0x91, 0x08, 0xac, 0x61, 0x87, + 0x08, 0xac, 0x51, 0x97, 0x08, 0xac, 0x23, 0x01, 0x6d, 0xd0, 0x8b, 0x08, + 0xac, 0x12, 0x01, 0x6d, 0xd4, 0x07, 0xc1, 0x6d, 0xd8, 0x44, 0x00, 0xbb, + 0x41, 0x6d, 0xe4, 0xa0, 0x08, 0xae, 0x41, 0x9f, 0x08, 0xae, 0x39, 0x9e, + 0x08, 0xae, 0x30, 0xcb, 0x97, 0xf5, 0x08, 0xae, 0x19, 0xc4, 0x19, 0x53, + 0x08, 0xae, 0x10, 0xd3, 0x41, 0x25, 0x0f, 0xad, 0x09, 0xd1, 0x53, 0x10, + 0x0f, 0xad, 0x01, 0xd4, 0x06, 0x73, 0x0f, 0xac, 0xd9, 0xd3, 0x43, 0x13, + 0x0f, 0xac, 0xd0, 0xd3, 0x41, 0x25, 0x0f, 0xac, 0xf9, 0xd1, 0x53, 0x10, + 0x0f, 0xac, 0xf1, 0xd4, 0x06, 0x73, 0x0f, 0xac, 0xc9, 0xd3, 0x43, 0x13, + 0x0f, 0xac, 0xc0, 0x11, 0xc1, 0x6e, 0x02, 0xcc, 0x86, 0x85, 0x01, 0x31, + 0x51, 0xc6, 0x0e, 0xbd, 0x01, 0x12, 0xd9, 0x45, 0x00, 0x8c, 0x41, 0x6e, + 0x0e, 0xc4, 0x27, 0xe3, 0x00, 0x00, 0x11, 0xc7, 0xc3, 0x92, 0x00, 0x00, + 0x09, 0x15, 0xc1, 0x6e, 0x1a, 0xce, 0x6d, 0x94, 0x00, 0x04, 0xb1, 0xcc, + 0x87, 0xc9, 0x00, 0x04, 0xb0, 0xc4, 0x1d, 0xa8, 0x01, 0x1f, 0x21, 0xc6, + 0xcd, 0xcd, 0x0f, 0xa6, 0x78, 0xcb, 0x99, 0x55, 0x0f, 0xde, 0x31, 0xc5, + 0x21, 0xd2, 0x0f, 0xde, 0x48, 0xc4, 0x00, 0x49, 0x0f, 0xde, 0x39, 0xc5, + 0x00, 0x2c, 0x0f, 0xde, 0x40, 0xcb, 0x1e, 0x89, 0x05, 0x46, 0x29, 0x42, + 0x07, 0xb2, 0xc1, 0x6e, 0x26, 0xc8, 0x14, 0x38, 0x05, 0x44, 0x00, 0x03, + 0xc1, 0x6e, 0x32, 0x91, 0x05, 0x46, 0x0b, 0x01, 0x6e, 0x3e, 0x87, 0x05, + 0x45, 0xf3, 0x01, 0x6e, 0x42, 0x48, 0xb2, 0x2d, 0xc1, 0x6e, 0x46, 0x8b, + 0x05, 0x45, 0xb3, 0x01, 0x6e, 0x54, 0x97, 0x05, 0x45, 0xc2, 0x01, 0x6e, + 0x58, 0x15, 0xc1, 0x6e, 0x5c, 0xc2, 0x00, 0xd0, 0x05, 0x45, 0x91, 0x0e, + 0xc1, 0x6e, 0x6c, 0x83, 0x05, 0x44, 0x13, 0x01, 0x6e, 0x76, 0x8b, 0x05, + 0x44, 0x23, 0x01, 0x6e, 0x82, 0x97, 0x05, 0x44, 0x33, 0x01, 0x6e, 0x86, + 0x18, 0xc1, 0x6e, 0x8a, 0x87, 0x05, 0x44, 0x63, 0x01, 0x6e, 0x94, 0x91, + 0x05, 0x44, 0x7b, 0x01, 0x6e, 0x98, 0x0d, 0xc1, 0x6e, 0x9c, 0x09, 0xc1, + 0x6e, 0xa6, 0x10, 0xc1, 0x6e, 0xb0, 0x05, 0xc1, 0x6e, 0xc6, 0x0c, 0xc1, + 0x6e, 0xd0, 0x16, 0xc1, 0x6e, 0xda, 0x06, 0xc1, 0x6e, 0xe8, 0x12, 0xc1, + 0x6e, 0xf6, 0x04, 0xc1, 0x6f, 0x00, 0xc2, 0x01, 0xc3, 0x05, 0x45, 0x51, + 0xc2, 0x19, 0x2c, 0x05, 0x45, 0x59, 0xc2, 0x00, 0x39, 0x05, 0x45, 0x60, + 0xc4, 0x19, 0x53, 0x05, 0x46, 0x71, 0xcb, 0x97, 0xf5, 0x05, 0x46, 0x79, + 0x45, 0x09, 0x98, 0x41, 0x6f, 0x0a, 0x47, 0x00, 0x58, 0xc1, 0x6f, 0x2e, + 0x48, 0xb9, 0x02, 0x41, 0x6f, 0x3a, 0x10, 0xc1, 0x6f, 0x40, 0xc6, 0xcd, + 0x6d, 0x00, 0x41, 0xe1, 0xc5, 0xd7, 0x0e, 0x00, 0x41, 0xa1, 0xc5, 0xd3, + 0xfd, 0x00, 0x41, 0x88, 0xcb, 0x96, 0x5e, 0x00, 0x41, 0xe9, 0xc9, 0xa9, + 0x99, 0x00, 0x41, 0xa8, 0xc3, 0xdd, 0x83, 0x00, 0x41, 0xd1, 0xc4, 0xe1, + 0x73, 0x00, 0x41, 0xc0, 0xc7, 0xc4, 0x33, 0x00, 0x41, 0x69, 0xce, 0x70, + 0x34, 0x00, 0x40, 0xd9, 0xc6, 0x64, 0xa4, 0x00, 0x40, 0xc9, 0xc9, 0xac, + 0x3c, 0x00, 0x40, 0xc1, 0xc2, 0x00, 0x74, 0x00, 0x40, 0xb2, 0x01, 0x6f, + 0x4c, 0x8b, 0x00, 0x41, 0x41, 0xc7, 0xc3, 0x4c, 0x00, 0x41, 0x21, 0xce, + 0x70, 0x34, 0x00, 0x40, 0xd0, 0xc4, 0xdb, 0xfb, 0x00, 0x41, 0x61, 0xc6, + 0xc3, 0x4d, 0x00, 0x41, 0x28, 0xc9, 0xb1, 0x4c, 0x00, 0x41, 0x0a, 0x01, + 0x6f, 0x52, 0x8b, 0x00, 0x41, 0x49, 0x97, 0x00, 0x41, 0x31, 0x83, 0x00, + 0x41, 0x13, 0x01, 0x6f, 0x56, 0x87, 0x00, 0x40, 0xe0, 0x83, 0x00, 0x41, + 0x00, 0xc3, 
0xb8, 0xac, 0x00, 0x40, 0xa9, 0xc6, 0xcd, 0x07, 0x00, 0x40, + 0x89, 0xc2, 0x00, 0x8d, 0x00, 0x40, 0x40, 0xc3, 0x00, 0xd0, 0x00, 0x40, + 0xa1, 0xc6, 0xcf, 0x77, 0x00, 0x40, 0x70, 0x90, 0x00, 0x40, 0x79, 0x96, + 0x00, 0x40, 0x39, 0x9b, 0x00, 0x40, 0x20, 0xc2, 0x04, 0xc6, 0x00, 0x40, + 0x29, 0xc2, 0x00, 0x8d, 0x00, 0x40, 0x08, 0xc3, 0x02, 0x9b, 0x01, 0x52, + 0xc1, 0xc2, 0x00, 0xbf, 0x01, 0x52, 0xb8, 0xc6, 0x00, 0x91, 0x0f, 0xa5, + 0x21, 0xc4, 0x00, 0x87, 0x0f, 0xb1, 0xa1, 0xcd, 0x7f, 0x66, 0x0f, 0xb6, + 0x60, 0xc9, 0x00, 0xca, 0x01, 0x54, 0xab, 0x01, 0x6f, 0x5a, 0xcc, 0x07, + 0xc7, 0x01, 0x54, 0xb2, 0x01, 0x6f, 0x60, 0xc9, 0xab, 0x6d, 0x01, 0x5a, + 0xd1, 0xcd, 0x7d, 0x2a, 0x01, 0x5a, 0xe0, 0x15, 0xc1, 0x6f, 0x66, 0xd1, + 0x50, 0x68, 0x08, 0x8e, 0xe9, 0xca, 0x9d, 0x56, 0x08, 0x8e, 0xe1, 0x07, + 0xc1, 0x6f, 0x7c, 0x06, 0xc1, 0x6f, 0x88, 0x46, 0x34, 0x6f, 0xc1, 0x6f, + 0x9a, 0xd1, 0x50, 0xce, 0x08, 0x8e, 0x39, 0xc2, 0x00, 0x7a, 0x08, 0x8e, + 0x21, 0x47, 0x02, 0x0e, 0x41, 0x6f, 0xa6, 0xc4, 0xe3, 0x9f, 0x08, 0x22, + 0x81, 0x16, 0xc1, 0x70, 0x0b, 0xc4, 0xe0, 0xf7, 0x08, 0x22, 0x91, 0xc3, + 0x1b, 0x05, 0x08, 0x22, 0x99, 0x15, 0xc1, 0x70, 0x15, 0xc6, 0xcc, 0x05, + 0x08, 0x22, 0xb9, 0x42, 0x0c, 0x43, 0xc1, 0x70, 0x1f, 0x0a, 0xc1, 0x70, + 0x27, 0xc3, 0xe5, 0xae, 0x08, 0x22, 0xd1, 0xc4, 0xe3, 0x63, 0x08, 0x22, + 0xd9, 0xc3, 0x9e, 0xc8, 0x08, 0x22, 0xe1, 0xc3, 0x34, 0x6f, 0x08, 0x22, + 0xe9, 0xc3, 0xe5, 0x39, 0x08, 0x22, 0xf9, 0x0f, 0xc1, 0x70, 0x33, 0xc5, + 0xdd, 0x4e, 0x08, 0x23, 0x09, 0x42, 0x02, 0xa0, 0xc1, 0x70, 0x3f, 0xc4, + 0xe1, 0x0f, 0x08, 0x23, 0x21, 0x0b, 0xc1, 0x70, 0x49, 0x07, 0xc1, 0x70, + 0x59, 0x03, 0xc1, 0x70, 0x69, 0x11, 0xc1, 0x70, 0x8f, 0xc4, 0xdf, 0x73, + 0x08, 0x23, 0x71, 0xc3, 0x20, 0x18, 0x08, 0x23, 0x79, 0xc2, 0x02, 0xae, + 0x08, 0x23, 0x98, 0xc7, 0xc4, 0x64, 0x0d, 0xe5, 0x19, 0xc9, 0xb3, 0x05, + 0x0d, 0xe5, 0x11, 0xd2, 0x4c, 0x7f, 0x0d, 0xe5, 0x09, 0xce, 0x70, 0x42, + 0x0d, 0xe5, 0x00, 0x46, 0x03, 0x87, 0xc1, 0x70, 0xaf, 0xc9, 0xaf, 0x30, + 0x01, 0x56, 0xf1, 0xc9, 0x32, 0xb7, 0x01, 0x56, 0xfb, 0x01, 0x70, 0xb5, + 0xc7, 0xc4, 0x5d, 0x01, 0x57, 0x03, 0x01, 0x70, 0xbb, 0xd3, 0x46, 0xdc, + 0x01, 0x5a, 0x71, 0x04, 0x41, 0x70, 0xbf, 0x91, 0x01, 0x09, 0xa1, 0x87, + 0x01, 0x09, 0x79, 0x8e, 0x01, 0x08, 0x99, 0x89, 0x01, 0x08, 0x50, 0x8f, + 0x01, 0x09, 0x99, 0x88, 0x01, 0x09, 0x89, 0x87, 0x01, 0x09, 0x81, 0x84, + 0x01, 0x09, 0x61, 0x94, 0x01, 0x08, 0xd9, 0x92, 0x01, 0x08, 0xc1, 0x8e, + 0x01, 0x08, 0x91, 0x8b, 0x01, 0x08, 0x81, 0x8a, 0x01, 0x08, 0x58, 0xd0, + 0x5b, 0xc2, 0x0f, 0xc2, 0xb9, 0xcc, 0x82, 0x35, 0x01, 0x0e, 0xc9, 0xc5, + 0x01, 0xa2, 0x01, 0x0c, 0xcb, 0x01, 0x70, 0xcb, 0x49, 0x01, 0xaa, 0xc1, + 0x70, 0xcf, 0xcb, 0x01, 0xfc, 0x01, 0x58, 0x19, 0xcb, 0x94, 0x22, 0x01, + 0x58, 0x59, 0xd5, 0x01, 0x92, 0x01, 0x5b, 0x4a, 0x01, 0x70, 0xe1, 0xd0, + 0x5b, 0xc2, 0x0f, 0xc2, 0xb1, 0xc5, 0x01, 0xa2, 0x01, 0x0c, 0xc3, 0x01, + 0x70, 0xe7, 0xcc, 0x82, 0x35, 0x01, 0x0e, 0xc1, 0x49, 0x01, 0xaa, 0xc1, + 0x70, 0xeb, 0xcb, 0x01, 0xfc, 0x01, 0x58, 0x11, 0xcb, 0x94, 0x22, 0x01, + 0x58, 0x51, 0xd5, 0x01, 0x92, 0x01, 0x5b, 0x42, 0x01, 0x70, 0xfd, 0xc5, + 0x86, 0x2c, 0x08, 0xd4, 0xf9, 0xcc, 0x86, 0x25, 0x08, 0xd4, 0xf0, 0xc7, + 0x40, 0xe5, 0x08, 0xd4, 0xb9, 0xc8, 0x14, 0x38, 0x08, 0xd4, 0xb1, 0xcb, + 0x93, 0xf6, 0x08, 0xd4, 0x29, 0xcb, 0x8f, 0xe1, 0x08, 0xd4, 0x20, 0x8a, + 0x08, 0xd4, 0x98, 0x89, 0x08, 0xd4, 0x60, 0x83, 0x08, 0xd4, 0x49, 0xc2, + 0x00, 0xd0, 0x08, 0xd4, 0x40, 0xc3, 0x1d, 0x35, 0x08, 0xd4, 0x19, 0xc2, + 0x00, 0xd0, 0x08, 0xd2, 0xe9, 0x83, 0x08, 0xd2, 0xe0, 0x83, 0x08, 0xd4, + 0x09, 0xc2, 
0x0d, 0xf6, 0x08, 0xd4, 0x01, 0xc2, 0x00, 0xd0, 0x08, 0xd3, + 0xf8, 0x83, 0x08, 0xd3, 0xc9, 0xc2, 0x00, 0xd0, 0x08, 0xd3, 0xc0, 0xc2, + 0x02, 0x1c, 0x08, 0xd3, 0xb9, 0xc2, 0x00, 0xd0, 0x08, 0xd3, 0x71, 0x83, + 0x08, 0xd3, 0x69, 0x06, 0x41, 0x71, 0x03, 0x15, 0xc1, 0x71, 0x0d, 0xc2, + 0x00, 0xd0, 0x08, 0xd3, 0x61, 0x83, 0x08, 0xd3, 0x59, 0x16, 0x41, 0x71, + 0x17, 0xc2, 0x00, 0xd0, 0x08, 0xd3, 0x99, 0x83, 0x08, 0xd3, 0x90, 0xc2, + 0x00, 0xd0, 0x08, 0xd3, 0x89, 0x83, 0x08, 0xd3, 0x80, 0x83, 0x08, 0xd3, + 0x79, 0xc2, 0x00, 0xc1, 0x08, 0xd3, 0x51, 0xc2, 0x19, 0x2c, 0x08, 0xd3, + 0x29, 0xc2, 0x01, 0x30, 0x08, 0xd3, 0x00, 0xc2, 0x00, 0xd0, 0x08, 0xd3, + 0x21, 0x83, 0x08, 0xd3, 0x18, 0xc2, 0x00, 0xd0, 0x08, 0xd3, 0x11, 0x83, + 0x08, 0xd3, 0x08, 0xc2, 0x00, 0xd0, 0x08, 0xd2, 0xf9, 0x83, 0x08, 0xd2, + 0xf0, 0x48, 0xb2, 0x2d, 0xc1, 0x71, 0x21, 0x03, 0xc1, 0x71, 0x29, 0x91, + 0x08, 0xd2, 0xab, 0x01, 0x71, 0x31, 0x87, 0x08, 0xd2, 0xa1, 0x97, 0x08, + 0xd2, 0x9b, 0x01, 0x71, 0x35, 0x8b, 0x08, 0xd2, 0x88, 0xc4, 0x18, 0x10, + 0x08, 0x87, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0x87, 0xb0, 0xc3, 0x0d, 0x14, + 0x08, 0x87, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0x87, 0xa0, 0xc4, 0x02, 0xde, + 0x08, 0x87, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0x87, 0x90, 0x87, 0x08, 0x87, + 0x41, 0x8a, 0x08, 0x86, 0xb0, 0x8a, 0x08, 0x87, 0x39, 0xc2, 0x16, 0x1c, + 0x08, 0x87, 0x18, 0xc3, 0x44, 0x79, 0x08, 0x87, 0x09, 0xc2, 0x02, 0x98, + 0x08, 0x86, 0xc9, 0xc3, 0x40, 0x40, 0x08, 0x86, 0xb8, 0xd1, 0x50, 0x57, + 0x08, 0x7a, 0xc1, 0xcd, 0x7a, 0x52, 0x08, 0x7a, 0xaa, 0x01, 0x71, 0x39, + 0xc8, 0x0d, 0x03, 0x08, 0x7a, 0xa0, 0xc5, 0x28, 0xee, 0x08, 0x7a, 0x99, + 0xc2, 0x00, 0xc4, 0x08, 0x7a, 0x90, 0xc5, 0x05, 0x02, 0x08, 0x7a, 0x69, + 0xc5, 0x00, 0xd4, 0x08, 0x7a, 0x60, 0xc5, 0x05, 0x02, 0x08, 0x7a, 0x59, + 0xc5, 0x00, 0xd4, 0x08, 0x7a, 0x50, 0xc5, 0x00, 0xd4, 0x08, 0x7a, 0x49, + 0xc5, 0x05, 0x02, 0x08, 0x7a, 0x38, 0xc5, 0x00, 0xd4, 0x08, 0x7a, 0x41, + 0xc5, 0x05, 0x02, 0x08, 0x7a, 0x30, 0xc3, 0x26, 0x1a, 0x08, 0x7a, 0x21, + 0xc5, 0xcf, 0xd8, 0x08, 0x79, 0xc8, 0xc3, 0x11, 0xef, 0x08, 0x7a, 0x09, + 0x03, 0x41, 0x71, 0x3f, 0xc3, 0x16, 0x5a, 0x08, 0x79, 0xe9, 0xc4, 0x36, + 0xb5, 0x08, 0x79, 0x80, 0xc2, 0x00, 0x8e, 0x08, 0x79, 0xb0, 0x16, 0xc1, + 0x71, 0x4b, 0x08, 0xc1, 0x71, 0x5d, 0x19, 0xc1, 0x71, 0x65, 0x0e, 0xc1, + 0x71, 0x75, 0x11, 0xc1, 0x71, 0x8b, 0x0b, 0xc1, 0x71, 0xa4, 0x05, 0xc1, + 0x71, 0xb8, 0x14, 0xc1, 0x71, 0xde, 0x0a, 0xc1, 0x71, 0xf9, 0x06, 0xc1, + 0x72, 0x21, 0x12, 0xc1, 0x72, 0x47, 0x07, 0xc1, 0x72, 0x80, 0x03, 0xc1, + 0x72, 0x94, 0xc3, 0xdf, 0x37, 0x01, 0x98, 0x31, 0x0d, 0xc1, 0x72, 0xba, + 0x09, 0xc1, 0x73, 0x1b, 0x15, 0xc1, 0x73, 0x40, 0x10, 0xc1, 0x73, 0x58, + 0x04, 0xc1, 0x73, 0x79, 0x0f, 0xc1, 0x73, 0x99, 0x1b, 0xc1, 0x73, 0xec, + 0xc8, 0xbe, 0xda, 0x01, 0x9e, 0xf0, 0x0e, 0xc1, 0x73, 0xf8, 0x15, 0xc1, + 0x74, 0x02, 0x0d, 0xc1, 0x74, 0x32, 0xcc, 0x83, 0x3d, 0x01, 0x15, 0x09, + 0x16, 0xc1, 0x74, 0x3e, 0x0f, 0xc1, 0x74, 0x4e, 0x12, 0xc1, 0x74, 0x58, + 0x05, 0xc1, 0x74, 0x64, 0x18, 0xc1, 0x74, 0x74, 0x17, 0xc1, 0x74, 0x7e, + 0x0a, 0xc1, 0x74, 0x8a, 0x11, 0xc1, 0x74, 0x9e, 0x08, 0xc1, 0x74, 0xa8, + 0xc7, 0xc4, 0x56, 0x0f, 0x8c, 0xf9, 0x10, 0xc1, 0x74, 0xc0, 0xc2, 0x02, + 0xfb, 0x0f, 0x8c, 0xa1, 0xc8, 0x0a, 0xff, 0x01, 0x4e, 0x31, 0xd5, 0x36, + 0xc5, 0x01, 0x4e, 0x21, 0xc2, 0x15, 0x95, 0x0f, 0x8a, 0x78, 0xc9, 0xb0, + 0xf2, 0x01, 0x20, 0xd3, 0x01, 0x74, 0xca, 0xc4, 0x40, 0x89, 0x01, 0x21, + 0x01, 0xcf, 0x6a, 0x08, 0x01, 0x20, 0xb1, 0x45, 0xa0, 0x21, 0xc1, 0x74, + 0xd0, 0x48, 0x46, 0xa3, 0xc1, 0x74, 0xdc, 0xcf, 0x69, 0x45, 0x01, 0x0a, + 0x78, 0x07, 
0xc1, 0x74, 0xe8, 0xcf, 0x61, 0x02, 0x01, 0x20, 0x80, 0x07, + 0xc1, 0x74, 0xf7, 0xc3, 0x11, 0xf7, 0x01, 0x20, 0x00, 0xcd, 0x7d, 0xe0, + 0x01, 0x20, 0xe1, 0xc8, 0xb7, 0xfa, 0x01, 0x20, 0x60, 0xc5, 0x61, 0x0c, + 0x01, 0x20, 0xd9, 0x10, 0x41, 0x75, 0x03, 0xc4, 0x23, 0xca, 0x01, 0x20, + 0xc1, 0xcd, 0x75, 0x58, 0x01, 0x20, 0x68, 0xc8, 0xb8, 0x9a, 0x01, 0x20, + 0x41, 0xc3, 0x08, 0x93, 0x01, 0x20, 0x38, 0x0f, 0xc1, 0x75, 0x0f, 0xc2, + 0x00, 0x67, 0x00, 0x39, 0x33, 0x01, 0x75, 0x1b, 0x16, 0xc1, 0x75, 0x21, + 0x15, 0xc1, 0x75, 0x30, 0x14, 0xc1, 0x75, 0x4e, 0xc4, 0xc0, 0x4b, 0x00, + 0x39, 0x49, 0x87, 0x00, 0x39, 0x29, 0xcd, 0x7e, 0x14, 0x00, 0x39, 0x21, + 0xc3, 0x20, 0x18, 0x00, 0x39, 0x11, 0xc6, 0xd0, 0xcd, 0x00, 0x39, 0x01, + 0xc4, 0xe0, 0xe7, 0x00, 0x38, 0xf9, 0xc4, 0xde, 0xef, 0x00, 0x38, 0xeb, + 0x01, 0x75, 0x5a, 0xc2, 0x01, 0x7f, 0x00, 0x38, 0xbb, 0x01, 0x75, 0x60, + 0xc4, 0x69, 0x81, 0x00, 0x38, 0xc9, 0xc3, 0x7e, 0x89, 0x00, 0x38, 0xc1, + 0x06, 0xc1, 0x75, 0x66, 0xc5, 0xd7, 0x5e, 0x00, 0x38, 0x9b, 0x01, 0x75, + 0x72, 0xc4, 0xe3, 0x27, 0x00, 0x38, 0x91, 0xc5, 0x58, 0x4d, 0x00, 0x38, + 0x80, 0x44, 0x7c, 0x67, 0xc1, 0x75, 0x78, 0x48, 0xbf, 0x2a, 0xc1, 0x75, + 0x82, 0xcf, 0x62, 0xf1, 0x00, 0x38, 0x28, 0xc7, 0x08, 0x6b, 0x00, 0x39, + 0xc9, 0xca, 0x01, 0x68, 0x00, 0x39, 0xc0, 0x45, 0xd8, 0x94, 0xc1, 0x75, + 0x94, 0xc4, 0xde, 0xa7, 0x00, 0x39, 0xf9, 0xc7, 0xc4, 0x2c, 0x00, 0x3a, + 0x10, 0xc6, 0x19, 0x7a, 0x00, 0x39, 0xa9, 0xc5, 0x05, 0x02, 0x00, 0x39, + 0xa1, 0xc5, 0x00, 0xd4, 0x00, 0x39, 0x98, 0xc6, 0x19, 0x7a, 0x00, 0x39, + 0x91, 0xc5, 0x05, 0x02, 0x00, 0x39, 0x89, 0xc5, 0x00, 0xd4, 0x00, 0x39, + 0x80, 0xc9, 0xaf, 0x0c, 0x00, 0x38, 0x51, 0x4b, 0x8f, 0xd6, 0x41, 0x75, + 0xa0, 0x48, 0xbf, 0x02, 0xc1, 0x75, 0xac, 0x4a, 0x9f, 0x22, 0x41, 0x75, + 0xbb, 0xcf, 0x60, 0x12, 0x00, 0x38, 0x01, 0x45, 0x75, 0x81, 0x41, 0x75, + 0xca, 0x51, 0x55, 0x41, 0xc1, 0x75, 0xd6, 0x4a, 0x0e, 0x7d, 0x41, 0x75, + 0xe2, 0xc5, 0x00, 0xd4, 0x00, 0x3a, 0x39, 0xc5, 0x05, 0x02, 0x00, 0x3a, + 0x40, 0x91, 0x05, 0x40, 0x39, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x40, 0x91, + 0x05, 0x40, 0x49, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x50, 0x91, 0x05, 0x40, + 0x61, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x68, 0x16, 0xc1, 0x75, 0xee, 0x91, + 0x05, 0x40, 0xa1, 0xc2, 0x01, 0x23, 0x05, 0x40, 0xa8, 0x06, 0xc1, 0x75, + 0xf8, 0x91, 0x05, 0x40, 0xb1, 0xc2, 0x01, 0x23, 0x05, 0x40, 0xb8, 0x91, + 0x05, 0x40, 0x71, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x78, 0x91, 0x05, 0x40, + 0xc9, 0xc2, 0x01, 0x23, 0x05, 0x40, 0xd0, 0x91, 0x05, 0x40, 0xd9, 0xc2, + 0x01, 0x23, 0x05, 0x40, 0xe0, 0x91, 0x05, 0x40, 0xf1, 0xc2, 0x00, 0x79, + 0x05, 0x41, 0x00, 0xc7, 0x14, 0x39, 0x05, 0x40, 0x59, 0xd0, 0x5a, 0xd2, + 0x05, 0x41, 0x60, 0x46, 0x00, 0x8b, 0x41, 0x76, 0x02, 0x95, 0x01, 0x39, + 0x40, 0xd1, 0x4f, 0xe0, 0x01, 0x3e, 0x49, 0xc2, 0x00, 0x55, 0x01, 0x14, + 0x1b, 0x01, 0x76, 0x14, 0x46, 0x00, 0xd4, 0xc1, 0x76, 0x18, 0x45, 0x00, + 0x8c, 0xc1, 0x76, 0x24, 0x47, 0x13, 0x6d, 0x41, 0x76, 0x36, 0x0e, 0xc1, + 0x76, 0x42, 0xd1, 0x1a, 0x4a, 0x01, 0x03, 0xf1, 0x07, 0xc1, 0x76, 0x4e, + 0xc5, 0x1d, 0x1d, 0x01, 0x03, 0xd9, 0xc9, 0x60, 0xf3, 0x01, 0x03, 0xd1, + 0xc4, 0x26, 0x78, 0x01, 0x03, 0xc9, 0x15, 0xc1, 0x76, 0x5a, 0x08, 0xc1, + 0x76, 0x66, 0xc4, 0x15, 0xe7, 0x01, 0x03, 0x81, 0x16, 0xc1, 0x76, 0x72, + 0xc3, 0x05, 0x14, 0x00, 0x05, 0xc8, 0xca, 0xa1, 0x98, 0x00, 0xe6, 0x39, + 0xca, 0xa4, 0x86, 0x00, 0xe6, 0x31, 0xca, 0x9c, 0x8e, 0x00, 0xe6, 0x29, + 0xcb, 0x90, 0x23, 0x00, 0xe6, 0x21, 0xc5, 0xdd, 0x53, 0x00, 0xe6, 0x19, + 0x12, 0xc1, 0x76, 0x7e, 0xc5, 0xdd, 0xb7, 0x00, 0xe6, 0x00, 0x08, 0xc1, + 0x76, 0x8a, 
0x04, 0xc1, 0x76, 0x94, 0x0e, 0xc1, 0x76, 0x9e, 0x14, 0xc1, + 0x76, 0xa8, 0x15, 0xc1, 0x76, 0xb2, 0x0d, 0xc1, 0x76, 0xbc, 0xc2, 0x00, + 0xd0, 0x00, 0xdd, 0x01, 0xc2, 0x8d, 0x8f, 0x00, 0xdc, 0xf9, 0xc2, 0x01, + 0x4a, 0x00, 0xdc, 0xe9, 0xc2, 0x19, 0x2c, 0x00, 0xdc, 0xd1, 0xc2, 0x01, + 0xc3, 0x00, 0xdc, 0xc9, 0xc2, 0x02, 0x41, 0x00, 0xdc, 0xb9, 0xc2, 0x00, + 0xb0, 0x00, 0xdc, 0xa9, 0x10, 0xc1, 0x76, 0xc6, 0xc2, 0x0e, 0x9a, 0x00, + 0xdc, 0x99, 0xc2, 0x01, 0x6f, 0x00, 0xdc, 0x91, 0xc2, 0x02, 0x1c, 0x00, + 0xdc, 0x81, 0xc2, 0x25, 0x3b, 0x00, 0xdc, 0x79, 0xc2, 0x00, 0x64, 0x00, + 0xdc, 0x71, 0xc2, 0x01, 0x30, 0x00, 0xdc, 0x61, 0xc2, 0x0f, 0x9a, 0x00, + 0xdc, 0x59, 0x87, 0x00, 0xdc, 0x43, 0x01, 0x76, 0xd6, 0x91, 0x00, 0xdc, + 0x39, 0x83, 0x00, 0xdc, 0x1b, 0x01, 0x76, 0xda, 0x97, 0x00, 0xdc, 0x29, + 0x8b, 0x00, 0xdc, 0x20, 0xc4, 0x26, 0x78, 0x00, 0xdd, 0xc9, 0xc5, 0x06, + 0xdb, 0x00, 0xdd, 0xc1, 0x15, 0xc1, 0x76, 0xde, 0x08, 0xc1, 0x76, 0xea, + 0x16, 0xc1, 0x76, 0xf6, 0xc3, 0x05, 0x14, 0x00, 0xdd, 0x89, 0xc4, 0x15, + 0xe7, 0x00, 0xdd, 0x80, 0x47, 0xc1, 0xe7, 0xc1, 0x77, 0x02, 0x42, 0x16, + 0x59, 0xc1, 0x77, 0x0e, 0xc7, 0xc3, 0x5a, 0x00, 0xdd, 0x08, 0xc6, 0x1e, + 0x95, 0x00, 0xdd, 0x59, 0x42, 0x00, 0xb0, 0x41, 0x77, 0x1a, 0x10, 0xc1, + 0x77, 0x24, 0xc5, 0xdb, 0x1e, 0x00, 0xdd, 0x40, 0xca, 0x37, 0x4e, 0x01, + 0x13, 0xf9, 0xc5, 0x07, 0x62, 0x01, 0x13, 0xe8, 0x4c, 0x24, 0x3b, 0xc1, + 0x77, 0x42, 0xcb, 0x0e, 0xbd, 0x01, 0x55, 0xa1, 0x44, 0x1f, 0xb2, 0xc1, + 0x77, 0x4e, 0xcf, 0x6a, 0x8f, 0x01, 0x55, 0xc0, 0x00, 0x41, 0x77, 0x5a, + 0xd0, 0x03, 0xb7, 0x01, 0x4b, 0xc9, 0x42, 0x06, 0x62, 0x41, 0x77, 0x6f, + 0xc3, 0x02, 0xa3, 0x01, 0x55, 0xe9, 0xcf, 0x60, 0xf3, 0x01, 0x55, 0xf9, + 0xd9, 0x1f, 0x18, 0x01, 0x56, 0x08, 0xca, 0x0e, 0xbe, 0x01, 0x04, 0x61, + 0xc4, 0x00, 0x2d, 0x01, 0x04, 0x40, 0xc4, 0x18, 0x10, 0x01, 0x04, 0x39, + 0xc2, 0x22, 0xcc, 0x01, 0x04, 0x30, 0xc3, 0x0d, 0x14, 0x01, 0x04, 0x29, + 0xc3, 0x09, 0x9e, 0x01, 0x04, 0x20, 0xc4, 0x02, 0xde, 0x01, 0x04, 0x19, + 0xc2, 0x02, 0xa0, 0x01, 0x04, 0x10, 0x4a, 0x00, 0x87, 0xc1, 0x77, 0x7b, + 0x4e, 0x1d, 0x3c, 0x41, 0x77, 0x92, 0x42, 0x00, 0x99, 0xc1, 0x77, 0x9e, + 0x07, 0xc1, 0x77, 0xb0, 0x14, 0xc1, 0x77, 0xcb, 0x16, 0xc1, 0x77, 0xdd, + 0xcc, 0x87, 0x21, 0x0f, 0xa9, 0xc9, 0xce, 0x71, 0xf4, 0x0f, 0xa9, 0xc1, + 0xd1, 0x55, 0x96, 0x01, 0x53, 0x09, 0x03, 0xc1, 0x77, 0xe9, 0xd1, 0x54, + 0x0f, 0x07, 0xf2, 0x89, 0xc9, 0x11, 0xf6, 0x07, 0xf2, 0x91, 0xc9, 0xa8, + 0x55, 0x07, 0xf2, 0xa1, 0xcd, 0x2c, 0xb2, 0x07, 0xf2, 0xb1, 0x42, 0x00, + 0x49, 0xc1, 0x77, 0xfb, 0xcb, 0x97, 0x9d, 0x07, 0xf2, 0xf9, 0x12, 0xc1, + 0x78, 0x07, 0xcc, 0x89, 0xcd, 0x07, 0xf3, 0x19, 0xd1, 0x54, 0xb9, 0x07, + 0xf3, 0x29, 0xcb, 0x99, 0x60, 0x07, 0xf3, 0x48, 0xcc, 0x23, 0x9f, 0x01, + 0x55, 0x60, 0x02, 0xc1, 0x78, 0x13, 0x00, 0x41, 0x78, 0x1b, 0xce, 0x50, + 0xaf, 0x01, 0x1c, 0xc9, 0xc2, 0x00, 0x29, 0x0f, 0xad, 0x42, 0x01, 0x78, + 0x27, 0xc2, 0x00, 0xcc, 0x0f, 0xa3, 0xc0, 0xc5, 0x07, 0x62, 0x01, 0x10, + 0xe8, 0xd5, 0x37, 0x43, 0x01, 0x17, 0x41, 0xce, 0x74, 0x32, 0x01, 0x15, + 0x81, 0x46, 0x23, 0xa0, 0xc1, 0x78, 0x2d, 0x46, 0x00, 0xd4, 0x41, 0x78, + 0x39, 0x42, 0x00, 0x99, 0xc1, 0x78, 0x51, 0xc9, 0xa8, 0x55, 0x07, 0xf0, + 0xa1, 0x07, 0xc1, 0x78, 0x5d, 0xcd, 0x2c, 0xb2, 0x07, 0xf0, 0xb1, 0xd3, + 0x22, 0x78, 0x07, 0xf0, 0xc9, 0xce, 0x72, 0x1e, 0x07, 0xf1, 0x81, 0xcd, + 0x80, 0x29, 0x07, 0xf1, 0xa1, 0x0e, 0xc1, 0x78, 0x6f, 0x46, 0x00, 0x2c, + 0xc1, 0x78, 0x7b, 0x4c, 0x1c, 0x86, 0x41, 0x78, 0xa9, 0xcd, 0x80, 0x1c, + 0x01, 0x18, 0xc1, 0xc7, 0xc4, 0x72, 0x0f, 0xb6, 0x80, 0x04, 0xc1, 0x78, + 0xb5, 0x47, 
0x70, 0xa5, 0xc1, 0x78, 0xc1, 0x16, 0xc1, 0x78, 0xd9, 0x08, + 0xc1, 0x78, 0xf1, 0x15, 0xc1, 0x78, 0xfb, 0x49, 0xb2, 0x12, 0xc1, 0x79, + 0x07, 0x48, 0xbb, 0x82, 0xc1, 0x79, 0x1f, 0x48, 0xb7, 0x1a, 0xc1, 0x79, + 0x37, 0x0d, 0xc1, 0x79, 0x4f, 0x49, 0xa8, 0xf7, 0xc1, 0x79, 0x5b, 0xc9, + 0xa9, 0x7e, 0x0f, 0x85, 0xf9, 0xcb, 0x8d, 0x16, 0x0f, 0x86, 0xf8, 0x16, + 0xc1, 0x79, 0x73, 0x08, 0x41, 0x79, 0x7f, 0x00, 0x41, 0x79, 0x8b, 0x46, + 0x08, 0xf1, 0xc1, 0x79, 0x9d, 0xc9, 0xb0, 0xa1, 0x0f, 0xa6, 0x20, 0x00, + 0xc1, 0x79, 0xa9, 0xd8, 0x25, 0xbb, 0x01, 0x33, 0xe8, 0x4d, 0x29, 0xb9, + 0xc1, 0x79, 0xb5, 0x4f, 0x0b, 0x17, 0x41, 0x7a, 0x1d, 0x16, 0xc1, 0x7a, + 0x85, 0xc8, 0x4b, 0x5f, 0x01, 0x24, 0x31, 0x07, 0xc1, 0x7a, 0x97, 0x15, + 0xc1, 0x7a, 0xa3, 0x08, 0x41, 0x7a, 0xaf, 0xc4, 0x26, 0x78, 0x01, 0x23, + 0xe1, 0xc5, 0x06, 0xdb, 0x01, 0x23, 0xd9, 0x15, 0xc1, 0x7a, 0xbb, 0x08, + 0xc1, 0x7a, 0xc7, 0x16, 0xc1, 0x7a, 0xd3, 0xc3, 0x05, 0x14, 0x01, 0x23, + 0xa0, 0x0d, 0xc1, 0x7a, 0xdf, 0xc5, 0xd9, 0x61, 0x01, 0x90, 0x0b, 0x01, + 0x7a, 0xf1, 0x16, 0xc1, 0x7a, 0xf7, 0xc5, 0xd6, 0x8c, 0x01, 0x90, 0x1b, + 0x01, 0x7b, 0x09, 0xc5, 0xda, 0xe7, 0x01, 0x90, 0x23, 0x01, 0x7b, 0x0f, + 0x12, 0xc1, 0x7b, 0x15, 0xc4, 0xad, 0x2b, 0x01, 0x90, 0x33, 0x01, 0x7b, + 0x27, 0xc5, 0xb7, 0x9d, 0x01, 0x90, 0x3b, 0x01, 0x7b, 0x2d, 0x05, 0xc1, + 0x7b, 0x33, 0xc5, 0x90, 0xe4, 0x01, 0x90, 0x6a, 0x01, 0x7b, 0x45, 0xc4, + 0xe1, 0x47, 0x01, 0x90, 0xe9, 0xc3, 0x0d, 0x03, 0x01, 0x90, 0xf0, 0xc3, + 0x05, 0x14, 0x01, 0x91, 0x01, 0x16, 0xc1, 0x7b, 0x4b, 0x08, 0xc1, 0x7b, + 0x5d, 0x15, 0xc1, 0x7b, 0x6d, 0x07, 0xc1, 0x7b, 0x8b, 0x10, 0xc1, 0x7b, + 0x9d, 0x0f, 0xc1, 0x7b, 0xa9, 0x19, 0xc1, 0x7b, 0xb5, 0xc4, 0xdf, 0xbf, + 0x01, 0x91, 0x91, 0x05, 0xc1, 0x7b, 0xc1, 0xc5, 0xdd, 0x71, 0x01, 0x91, + 0xc1, 0x42, 0x01, 0x19, 0xc1, 0x7b, 0xcd, 0xc8, 0xba, 0x62, 0x01, 0x91, + 0xf8, 0xc2, 0x00, 0xf1, 0x01, 0x11, 0x29, 0x45, 0x00, 0x8c, 0x41, 0x7b, + 0xdd, 0xca, 0x1b, 0x09, 0x01, 0x01, 0x49, 0xc2, 0x07, 0xa3, 0x01, 0x70, + 0x79, 0xc7, 0x62, 0x81, 0x01, 0x72, 0x68, 0xc5, 0x26, 0xf7, 0x08, 0xd7, + 0xc1, 0xc7, 0x41, 0x71, 0x08, 0xd7, 0x80, 0x00, 0x41, 0x7b, 0xe9, 0x08, + 0xc1, 0x7b, 0xf8, 0x8b, 0x08, 0xd6, 0xbb, 0x01, 0x7c, 0x02, 0x97, 0x08, + 0xd6, 0xcb, 0x01, 0x7c, 0x06, 0x91, 0x08, 0xd6, 0xc1, 0x87, 0x08, 0xd6, + 0xb1, 0x83, 0x08, 0xd6, 0xa9, 0x05, 0xc1, 0x7c, 0x0a, 0xc2, 0x00, 0x39, + 0x08, 0xd6, 0x91, 0x12, 0xc1, 0x7c, 0x14, 0x10, 0xc1, 0x7c, 0x1e, 0x16, + 0xc1, 0x7c, 0x28, 0xc2, 0x01, 0x5d, 0x08, 0xd6, 0x61, 0xc2, 0x0d, 0xf6, + 0x08, 0xd6, 0x59, 0x0d, 0xc1, 0x7c, 0x32, 0xc2, 0x01, 0x30, 0x08, 0xd6, + 0x49, 0xc2, 0x00, 0xd0, 0x08, 0xd6, 0x41, 0xc2, 0x02, 0x41, 0x08, 0xd6, + 0x31, 0xc2, 0x02, 0x1c, 0x08, 0xd6, 0x29, 0xc2, 0x0e, 0x9a, 0x08, 0xd6, + 0x21, 0xc2, 0x01, 0xc3, 0x08, 0xd6, 0x19, 0xc2, 0x00, 0xdb, 0x08, 0xd6, + 0x10, 0xc5, 0x26, 0xf7, 0x08, 0xd7, 0x91, 0xca, 0xa4, 0x04, 0x08, 0xd7, + 0x88, 0x00, 0x41, 0x7c, 0x3c, 0xc6, 0x26, 0xf6, 0x08, 0xd7, 0x50, 0xc5, + 0x26, 0xf7, 0x08, 0xd7, 0x49, 0xc4, 0x0d, 0xe5, 0x08, 0xd7, 0x2a, 0x01, + 0x7c, 0x4b, 0xc4, 0x0a, 0x64, 0x0f, 0x99, 0xa1, 0xc9, 0xb4, 0x01, 0x0f, + 0xd7, 0x99, 0xc7, 0xc5, 0x0c, 0x0f, 0xd7, 0xa1, 0xc6, 0x28, 0x24, 0x01, + 0x70, 0xc8, 0x47, 0x34, 0x2f, 0xc1, 0x7c, 0x51, 0xd6, 0x2c, 0x9c, 0x08, + 0x43, 0xc1, 0x42, 0x00, 0x49, 0x41, 0x7c, 0x5f, 0x18, 0xc1, 0x7c, 0x6b, + 0x0d, 0xc1, 0x7c, 0x77, 0x16, 0xc1, 0x7c, 0x89, 0x1b, 0xc1, 0x7c, 0x93, + 0xc3, 0xe6, 0x20, 0x0b, 0x5c, 0x59, 0x42, 0x00, 0xd0, 0xc1, 0x7c, 0x9f, + 0xc4, 0xe4, 0x03, 0x0b, 0x5c, 0x39, 0xc4, 0xe3, 0xcb, 0x0b, 0x5c, 0x21, + 0xc5, 0xd3, 
0xdf, 0x0b, 0x5c, 0x09, 0x0e, 0x41, 0x7c, 0xa9, 0x05, 0xc1, + 0x7c, 0xb5, 0xc3, 0xe6, 0x3e, 0x0b, 0x59, 0x71, 0xc2, 0x20, 0xec, 0x0b, + 0x59, 0x69, 0x10, 0xc1, 0x7c, 0xc1, 0xc5, 0xd7, 0x54, 0x0b, 0x59, 0x51, + 0x0a, 0xc1, 0x7c, 0xdd, 0xc3, 0xc4, 0x86, 0x0b, 0x59, 0x31, 0xc3, 0x2d, + 0x34, 0x0b, 0x59, 0x21, 0xc4, 0xe4, 0xd7, 0x0b, 0x59, 0x19, 0xc3, 0xbe, + 0x32, 0x0b, 0x59, 0x09, 0xc3, 0x20, 0xeb, 0x0b, 0x58, 0xf1, 0xc3, 0xe5, + 0x4e, 0x0b, 0x58, 0xe0, 0xc8, 0xbc, 0x0a, 0x0b, 0x5b, 0xb9, 0xc8, 0xbf, + 0x72, 0x0b, 0x5b, 0xb1, 0x16, 0xc1, 0x7c, 0xef, 0x05, 0xc1, 0x7c, 0xfe, + 0xd2, 0x4d, 0xe7, 0x0b, 0x5b, 0x90, 0xc2, 0x11, 0xa5, 0x0b, 0x5b, 0x89, + 0x44, 0x9f, 0x7e, 0x41, 0x7d, 0x0a, 0xc2, 0x20, 0xec, 0x0b, 0x5b, 0x79, + 0xca, 0x9f, 0x7c, 0x0b, 0x5b, 0x69, 0xce, 0x73, 0xb4, 0x0b, 0x5b, 0x30, + 0xc3, 0xe6, 0x1d, 0x0b, 0x5b, 0x59, 0xc3, 0xe5, 0x60, 0x0b, 0x5b, 0x48, + 0xc3, 0x44, 0x23, 0x0b, 0x5b, 0x51, 0x1b, 0xc1, 0x7d, 0x16, 0xc3, 0x26, + 0x9a, 0x0b, 0x5a, 0x20, 0xc3, 0x95, 0x80, 0x0b, 0x5b, 0x41, 0xc2, 0x01, + 0x0f, 0x0b, 0x5b, 0x28, 0xc3, 0x46, 0x7d, 0x0b, 0x5b, 0x19, 0xc4, 0xe4, + 0x47, 0x0b, 0x5a, 0x11, 0xc4, 0xdf, 0x67, 0x0b, 0x5a, 0x01, 0xc4, 0xe0, + 0x47, 0x0b, 0x59, 0xd9, 0x16, 0x41, 0x7d, 0x22, 0xc8, 0xbd, 0x12, 0x0b, + 0x5b, 0x09, 0x42, 0x00, 0xc4, 0x41, 0x7d, 0x2c, 0xc9, 0x33, 0xed, 0x0b, + 0x5a, 0xf9, 0x95, 0x0b, 0x5a, 0xe0, 0xc4, 0x18, 0x10, 0x0b, 0x5a, 0xb9, + 0xc2, 0x22, 0xcc, 0x0b, 0x5a, 0xb0, 0xc3, 0x0d, 0x14, 0x0b, 0x5a, 0xa9, + 0xc3, 0x09, 0x9e, 0x0b, 0x5a, 0xa0, 0xc4, 0x02, 0xde, 0x0b, 0x5a, 0x99, + 0xc2, 0x02, 0xa0, 0x0b, 0x5a, 0x90, 0xc3, 0xe5, 0x30, 0x0b, 0x59, 0xb1, + 0xc2, 0x00, 0x5a, 0x0b, 0x59, 0x80, 0xc3, 0xa7, 0x6a, 0x0b, 0x59, 0xa1, + 0x91, 0x0b, 0x59, 0x88, 0xc3, 0x40, 0xe3, 0x0b, 0x59, 0x99, 0xc2, 0x00, + 0xcb, 0x0b, 0x59, 0x90, 0x03, 0xc1, 0x7d, 0x34, 0x98, 0x0b, 0x58, 0xb9, + 0x84, 0x0b, 0x58, 0xb1, 0x19, 0xc1, 0x7d, 0x3c, 0x0b, 0xc1, 0x7d, 0x44, + 0x17, 0x41, 0x7d, 0x4c, 0x98, 0x0b, 0x58, 0xc9, 0x84, 0x0b, 0x58, 0xc0, + 0x03, 0xc1, 0x7d, 0x54, 0x98, 0x0b, 0x58, 0x19, 0x84, 0x0b, 0x58, 0x10, + 0x98, 0x0b, 0x58, 0x99, 0x84, 0x0b, 0x58, 0x91, 0x11, 0x41, 0x7d, 0x5c, + 0x03, 0xc1, 0x7d, 0x64, 0x98, 0x0b, 0x58, 0x39, 0x84, 0x0b, 0x58, 0x30, + 0x98, 0x0b, 0x58, 0x49, 0x84, 0x0b, 0x58, 0x41, 0x07, 0x41, 0x7d, 0x6c, + 0xc4, 0x2a, 0xcc, 0x0f, 0xa7, 0x79, 0xc4, 0x01, 0xc3, 0x01, 0x80, 0x92, + 0x01, 0x7d, 0x74, 0x00, 0xc1, 0x7d, 0x7a, 0xcb, 0x7a, 0xa2, 0x0f, 0xa5, + 0xd8, 0x91, 0x08, 0x5d, 0x51, 0xc4, 0x18, 0x12, 0x08, 0x5d, 0x70, 0xc3, + 0x77, 0x79, 0x08, 0x5c, 0x79, 0xc4, 0xdc, 0x2d, 0x08, 0x5c, 0x68, 0x16, + 0xc1, 0x7d, 0xa2, 0xc3, 0x05, 0x14, 0x08, 0x48, 0xb2, 0x01, 0x7d, 0xb2, + 0x16, 0xc1, 0x7d, 0xb8, 0x15, 0xc1, 0x7d, 0xc4, 0xc4, 0xa9, 0x57, 0x08, + 0x48, 0x99, 0xc3, 0xe5, 0x78, 0x08, 0x48, 0x91, 0xc2, 0x00, 0x67, 0x08, + 0x48, 0x81, 0x03, 0xc1, 0x7d, 0xd6, 0xc3, 0x20, 0x18, 0x08, 0x48, 0x69, + 0xc3, 0x00, 0x4e, 0x08, 0x48, 0x61, 0xc4, 0xb9, 0xf7, 0x08, 0x48, 0x59, + 0xc3, 0xba, 0x37, 0x08, 0x48, 0x51, 0xc3, 0x4a, 0xb9, 0x08, 0x48, 0x49, + 0xc2, 0x01, 0x7f, 0x08, 0x48, 0x23, 0x01, 0x7d, 0xe2, 0xc3, 0x69, 0x81, + 0x08, 0x48, 0x31, 0xc3, 0xe4, 0xf4, 0x08, 0x48, 0x29, 0xc4, 0xdb, 0x4b, + 0x08, 0x48, 0x19, 0xc4, 0xe0, 0x8f, 0x08, 0x48, 0x11, 0xc3, 0x0b, 0xc8, + 0x08, 0x48, 0x08, 0x0d, 0xc1, 0x7d, 0xe6, 0x09, 0xc1, 0x7d, 0xf0, 0x10, + 0xc1, 0x7d, 0xfa, 0x05, 0xc1, 0x7e, 0x10, 0xc2, 0x25, 0x3b, 0x05, 0x42, + 0x31, 0x16, 0xc1, 0x7e, 0x1d, 0x06, 0xc1, 0x7e, 0x2f, 0x12, 0xc1, 0x7e, + 0x3f, 0xc2, 0x01, 0x5d, 0x05, 0x42, 0x71, 0xc2, 0x01, 0xc3, 0x05, 0x42, + 0x79, 0xc2, 
0x01, 0x4a, 0x05, 0x42, 0x99, 0x1c, 0xc1, 0x7e, 0x49, 0x15, + 0xc1, 0x7e, 0x53, 0xc2, 0x19, 0x2c, 0x05, 0x42, 0xb9, 0xc2, 0x00, 0x39, + 0x05, 0x42, 0xc1, 0xc2, 0x00, 0xdb, 0x05, 0x42, 0xc9, 0xc2, 0x00, 0xd0, + 0x05, 0x42, 0xe1, 0x83, 0x05, 0x42, 0xeb, 0x01, 0x7e, 0x63, 0x8b, 0x05, + 0x42, 0xf1, 0x97, 0x05, 0x42, 0xf9, 0x87, 0x05, 0x43, 0x03, 0x01, 0x7e, + 0x6f, 0x91, 0x05, 0x43, 0x09, 0xc2, 0x0f, 0x9a, 0x05, 0x43, 0x11, 0xc2, + 0x8d, 0x8f, 0x05, 0x43, 0x19, 0xc2, 0x00, 0x87, 0x05, 0x43, 0x21, 0x45, + 0x17, 0xbd, 0x41, 0x7e, 0x73, 0x17, 0xc1, 0x7e, 0x7f, 0xcf, 0x68, 0x46, + 0x05, 0x43, 0xa0, 0xc4, 0x01, 0xe2, 0x05, 0x43, 0xb1, 0xcb, 0x99, 0x6b, + 0x05, 0x43, 0xb8, 0xc9, 0xa2, 0x56, 0x08, 0x0e, 0x81, 0x0e, 0xc1, 0x7e, + 0x8b, 0xc6, 0xca, 0xd9, 0x08, 0x0f, 0xa0, 0xcc, 0x89, 0x91, 0x08, 0x0e, + 0x91, 0xc4, 0xdf, 0xeb, 0x08, 0x0e, 0xc1, 0xc4, 0x5e, 0xc9, 0x08, 0x0f, + 0x80, 0x03, 0xc1, 0x7e, 0x97, 0xc4, 0xdf, 0xbb, 0x08, 0x0e, 0xa1, 0xc3, + 0x46, 0x7d, 0x08, 0x0e, 0xe1, 0x11, 0x41, 0x7e, 0xa7, 0xc4, 0x29, 0xfd, + 0x08, 0x0e, 0xa9, 0xc8, 0xbd, 0xda, 0x08, 0x0f, 0xe0, 0xc5, 0xb7, 0xed, + 0x08, 0x0e, 0xb1, 0xc3, 0x00, 0xbf, 0x08, 0x0f, 0x49, 0xc3, 0x06, 0xa7, + 0x08, 0x0f, 0x50, 0x11, 0xc1, 0x7e, 0xb6, 0xc2, 0x02, 0xe0, 0x08, 0x0f, + 0x8b, 0x01, 0x7e, 0xc0, 0xc8, 0xb8, 0x62, 0x08, 0x0f, 0x58, 0x42, 0x00, + 0x0a, 0xc1, 0x7e, 0xc6, 0xc2, 0x39, 0x8b, 0x08, 0x0e, 0xf9, 0xc4, 0x04, + 0x15, 0x08, 0x0f, 0x29, 0xc8, 0xb9, 0xca, 0x08, 0x0f, 0xd9, 0xc7, 0xc0, + 0xdd, 0x08, 0x0f, 0xd0, 0xc6, 0xca, 0xaf, 0x08, 0x0e, 0xe9, 0xc5, 0xd4, + 0xed, 0x08, 0x0e, 0xf0, 0x86, 0x08, 0x0f, 0x01, 0xc2, 0x00, 0x35, 0x08, + 0x0f, 0xb0, 0xc4, 0xe1, 0x07, 0x08, 0x0f, 0x19, 0xc2, 0x00, 0x5f, 0x08, + 0x0f, 0x78, 0xc2, 0x00, 0xc2, 0x08, 0x0f, 0x69, 0xc6, 0xcd, 0x67, 0x08, + 0x0f, 0xa8, 0xc5, 0xd5, 0xe2, 0x08, 0x0f, 0xc9, 0xc7, 0xc4, 0x87, 0x08, + 0x0e, 0xb8, 0xc4, 0x02, 0xde, 0x00, 0x00, 0x99, 0xc2, 0x02, 0xa0, 0x00, + 0x00, 0x90, 0xcb, 0x83, 0x0e, 0x00, 0x4a, 0xa1, 0xd0, 0x50, 0xcf, 0x00, + 0x4b, 0x80, 0xcb, 0x1f, 0x0d, 0x00, 0x4a, 0x99, 0xc9, 0x93, 0x31, 0x05, + 0x47, 0xc8, 0x4b, 0x91, 0xc5, 0xc1, 0x7e, 0xd0, 0x44, 0x00, 0xbb, 0x41, + 0x7e, 0xdc, 0x03, 0xc1, 0x7f, 0x11, 0xcf, 0x61, 0x11, 0x00, 0x4a, 0x71, + 0x91, 0x00, 0x4a, 0x5b, 0x01, 0x7f, 0x25, 0x46, 0x2e, 0xee, 0xc1, 0x7f, + 0x2f, 0x47, 0xc7, 0x7b, 0xc1, 0x7f, 0x37, 0x87, 0x00, 0x4a, 0x39, 0x48, + 0xb2, 0x2d, 0xc1, 0x7f, 0x45, 0x97, 0x00, 0x4a, 0x0b, 0x01, 0x7f, 0x53, + 0x8b, 0x00, 0x49, 0xfa, 0x01, 0x7f, 0x5e, 0x0a, 0xc1, 0x7f, 0x62, 0x15, + 0xc1, 0x7f, 0x6c, 0x18, 0xc1, 0x7f, 0x7a, 0x0e, 0xc1, 0x7f, 0x84, 0x14, + 0xc1, 0x7f, 0x8c, 0x1b, 0xc1, 0x7f, 0x9c, 0xc2, 0x01, 0xc3, 0x00, 0x49, + 0x73, 0x01, 0x7f, 0xa6, 0x04, 0xc1, 0x7f, 0xac, 0x12, 0xc1, 0x7f, 0xbc, + 0x10, 0xc1, 0x7f, 0xc6, 0x06, 0xc1, 0x7f, 0xda, 0x16, 0xc1, 0x7f, 0xe8, + 0x0c, 0xc1, 0x7f, 0xf6, 0x05, 0xc1, 0x80, 0x06, 0x09, 0xc1, 0x80, 0x13, + 0x0d, 0xc1, 0x80, 0x27, 0x83, 0x00, 0x48, 0x2b, 0x01, 0x80, 0x2f, 0x91, + 0x00, 0x48, 0x9b, 0x01, 0x80, 0x43, 0x87, 0x00, 0x48, 0x79, 0x97, 0x00, + 0x48, 0x4b, 0x01, 0x80, 0x4d, 0x8b, 0x00, 0x48, 0x3b, 0x01, 0x80, 0x58, + 0xc2, 0x0f, 0x9a, 0x00, 0x4a, 0xc1, 0x1c, 0xc1, 0x80, 0x5c, 0xc2, 0x00, + 0x87, 0x00, 0x4a, 0xf0, 0x45, 0x09, 0x98, 0xc1, 0x80, 0x66, 0xcb, 0x97, + 0xf5, 0x00, 0x4b, 0x29, 0xc4, 0x19, 0x53, 0x00, 0x4b, 0x20, 0xc7, 0xc7, + 0x19, 0x0f, 0x9e, 0xe8, 0x4f, 0x0b, 0x17, 0xc1, 0x80, 0x8a, 0x4d, 0x29, + 0xb9, 0x41, 0x80, 0xec, 0xcf, 0x66, 0x0c, 0x01, 0x1f, 0x41, 0xd4, 0x3b, + 0x10, 0x01, 0x1c, 0xb0, 0x47, 0x07, 0x9a, 0xc1, 0x81, 0x4e, 0x44, 0x00, + 0xf1, 0xc1, 
0x81, 0x5a, 0xc4, 0x51, 0xb7, 0x01, 0x1e, 0x30, 0xc8, 0x01, + 0x92, 0x01, 0x1e, 0x19, 0xc6, 0x02, 0xd1, 0x01, 0x1e, 0x00, 0xc4, 0x51, + 0xb7, 0x01, 0x1e, 0x41, 0xc8, 0x01, 0x92, 0x01, 0x1e, 0x29, 0xc6, 0x02, + 0xd1, 0x01, 0x1e, 0x10, 0xc4, 0x51, 0xb7, 0x01, 0x1e, 0x39, 0xc8, 0x01, + 0x92, 0x01, 0x1e, 0x21, 0xc6, 0x02, 0xd1, 0x01, 0x1e, 0x08, 0x44, 0x84, + 0x6c, 0x41, 0x81, 0x66, 0xca, 0xa6, 0xde, 0x0e, 0x98, 0x11, 0xcd, 0x7f, + 0xce, 0x0e, 0x98, 0x08, 0xc2, 0x00, 0x74, 0x01, 0x34, 0x79, 0xc3, 0x01, + 0x95, 0x01, 0x34, 0x60, 0xc3, 0x01, 0x95, 0x01, 0x34, 0x71, 0xc2, 0x00, + 0x74, 0x01, 0x34, 0x68, 0x00, 0x41, 0x81, 0x72, 0x00, 0x41, 0x81, 0x7e, + 0xc4, 0x18, 0x10, 0x00, 0x01, 0xbb, 0x01, 0x81, 0x8a, 0xc2, 0x22, 0xcc, + 0x00, 0x01, 0xb2, 0x01, 0x81, 0x8e, 0xc3, 0x0d, 0x14, 0x00, 0x01, 0xab, + 0x01, 0x81, 0x92, 0xc3, 0x09, 0x9e, 0x00, 0x01, 0xa2, 0x01, 0x81, 0x96, + 0xc4, 0x02, 0xde, 0x00, 0x01, 0x9b, 0x01, 0x81, 0x9a, 0xc2, 0x02, 0xa0, + 0x00, 0x01, 0x92, 0x01, 0x81, 0x9e, 0x00, 0x41, 0x81, 0xa2, 0x00, 0x41, + 0x81, 0xae, 0x45, 0x09, 0x98, 0xc1, 0x81, 0xba, 0xcb, 0x97, 0xf5, 0x08, + 0xca, 0x20, 0xc5, 0x33, 0x5d, 0x08, 0xca, 0x19, 0xc7, 0xc3, 0xa7, 0x08, + 0xc9, 0xe9, 0xcb, 0x1e, 0x89, 0x08, 0xc9, 0xe1, 0xc8, 0x14, 0x38, 0x08, + 0xc9, 0xd8, 0xc2, 0x00, 0x39, 0x08, 0xca, 0x11, 0xc2, 0x19, 0x2c, 0x08, + 0xca, 0x00, 0xc5, 0x1e, 0x96, 0x08, 0xc9, 0xf1, 0x4a, 0x6f, 0xc8, 0x41, + 0x81, 0xde, 0xc2, 0x02, 0x1c, 0x08, 0xc9, 0x79, 0x0e, 0xc1, 0x81, 0xf8, + 0xc2, 0x00, 0xd0, 0x08, 0xc9, 0x69, 0x15, 0xc1, 0x82, 0x02, 0xc2, 0x02, + 0x41, 0x08, 0xc9, 0x49, 0xc2, 0x00, 0x39, 0x08, 0xc9, 0x39, 0x1b, 0xc1, + 0x82, 0x12, 0xc2, 0x01, 0xc3, 0x08, 0xc9, 0x21, 0x04, 0xc1, 0x82, 0x1c, + 0x12, 0xc1, 0x82, 0x26, 0x10, 0xc1, 0x82, 0x30, 0x06, 0xc1, 0x82, 0x46, + 0x16, 0xc1, 0x82, 0x54, 0xc2, 0x25, 0x3b, 0x08, 0xc8, 0x99, 0x05, 0xc1, + 0x82, 0x64, 0x09, 0xc1, 0x82, 0x6e, 0x0d, 0xc1, 0x82, 0x78, 0x91, 0x08, + 0xc8, 0x49, 0x87, 0x08, 0xc8, 0x31, 0x97, 0x08, 0xc8, 0x23, 0x01, 0x82, + 0x82, 0x8b, 0x08, 0xc8, 0x13, 0x01, 0x82, 0x86, 0x83, 0x08, 0xc8, 0x02, + 0x01, 0x82, 0x8a, 0xc5, 0x03, 0x4d, 0x01, 0x16, 0x39, 0x15, 0x41, 0x82, + 0x8e, 0xca, 0xa3, 0x64, 0x01, 0x3c, 0x99, 0x46, 0x09, 0x97, 0x41, 0x82, + 0x9a, 0xc4, 0x26, 0x78, 0x01, 0x3b, 0xf1, 0xc5, 0x06, 0xdb, 0x01, 0x3b, + 0xe9, 0x15, 0xc1, 0x82, 0xbe, 0x08, 0xc1, 0x82, 0xca, 0x16, 0xc1, 0x82, + 0xd6, 0xc3, 0x05, 0x14, 0x01, 0x3b, 0xb0, 0xc4, 0x26, 0x78, 0x01, 0x3c, + 0x41, 0xc5, 0x06, 0xdb, 0x01, 0x3c, 0x39, 0x15, 0xc1, 0x82, 0xe2, 0x08, + 0xc1, 0x82, 0xee, 0x16, 0xc1, 0x82, 0xfa, 0xc3, 0x05, 0x14, 0x01, 0x3c, + 0x01, 0xc4, 0x15, 0xe7, 0x0f, 0x88, 0x58, 0xc4, 0x00, 0x87, 0x0f, 0xb0, + 0xf1, 0xd1, 0x4f, 0x14, 0x0f, 0xb1, 0x28, 0xc8, 0x18, 0x67, 0x01, 0x16, + 0x21, 0xd7, 0x26, 0x1b, 0x0f, 0xa5, 0x01, 0x45, 0x00, 0x8c, 0xc1, 0x83, + 0x06, 0xc6, 0xcf, 0xad, 0x0f, 0xbc, 0xe0, 0xc4, 0x01, 0x23, 0x0f, 0xc8, + 0x43, 0x01, 0x83, 0x1e, 0xcc, 0x84, 0xa5, 0x0f, 0xc8, 0x4a, 0x01, 0x83, + 0x24, 0x16, 0xc1, 0x83, 0x2a, 0x15, 0xc1, 0x83, 0x36, 0x0a, 0xc1, 0x83, + 0x42, 0x03, 0xc1, 0x83, 0x4e, 0xcf, 0x61, 0x4d, 0x01, 0x3f, 0x89, 0xcb, + 0x01, 0xfc, 0x01, 0x0f, 0x4b, 0x01, 0x83, 0x5d, 0x06, 0xc1, 0x83, 0x63, + 0xcd, 0x7c, 0xa8, 0x01, 0x0e, 0x51, 0xcc, 0x2e, 0x48, 0x01, 0x0d, 0x79, + 0xc6, 0xca, 0xa3, 0x0f, 0xb3, 0x79, 0x46, 0x04, 0x8f, 0xc1, 0x83, 0x6f, + 0xd1, 0x56, 0xd9, 0x0f, 0xc1, 0xb9, 0xd0, 0x58, 0x62, 0x0f, 0xc1, 0xf8, + 0xd2, 0x4c, 0xfd, 0x01, 0x57, 0x88, 0xd0, 0x5d, 0x52, 0x01, 0x4f, 0x49, + 0xcf, 0x66, 0x66, 0x01, 0x4f, 0x40, 0x43, 0xe5, 0x0c, 0xc1, 0x83, 0x7b, + 0x43, 0xe5, 
0xff, 0xc1, 0x83, 0x97, 0x43, 0xe5, 0xdb, 0xc1, 0x83, 0xb3, + 0x43, 0xe6, 0x6e, 0xc1, 0x83, 0xcf, 0x43, 0xe6, 0x3b, 0xc1, 0x83, 0xeb, + 0x43, 0xe5, 0xa8, 0xc1, 0x84, 0x07, 0x43, 0xe5, 0x45, 0x41, 0x84, 0x23, + 0x43, 0xe5, 0xdb, 0xc1, 0x84, 0x3f, 0x43, 0xe5, 0xff, 0xc1, 0x84, 0x5b, + 0x43, 0xe6, 0x6e, 0xc1, 0x84, 0x77, 0x43, 0xe6, 0x3b, 0xc1, 0x84, 0x93, + 0x43, 0xe5, 0x0c, 0xc1, 0x84, 0xaf, 0x43, 0xe5, 0xa8, 0xc1, 0x84, 0xcb, + 0x43, 0xe5, 0x45, 0x41, 0x84, 0xe7, 0x05, 0xc1, 0x85, 0x03, 0x49, 0x07, + 0xbb, 0xc1, 0x85, 0x15, 0x17, 0xc1, 0x85, 0x24, 0x44, 0x06, 0xbb, 0xc1, + 0x85, 0x30, 0x15, 0xc1, 0x85, 0x3c, 0xcd, 0x2c, 0xb2, 0x01, 0x02, 0x39, + 0xd0, 0x0f, 0x09, 0x01, 0x01, 0xe1, 0x12, 0xc1, 0x85, 0x50, 0x06, 0xc1, + 0x85, 0x5a, 0x0a, 0xc1, 0x85, 0x66, 0x0e, 0xc1, 0x85, 0x72, 0xdb, 0x16, + 0x89, 0x01, 0x4c, 0xb1, 0x47, 0xc4, 0x17, 0xc1, 0x85, 0x7c, 0xcc, 0x83, + 0x0d, 0x00, 0x16, 0xe9, 0xcd, 0x7d, 0x9f, 0x07, 0xf2, 0x61, 0xce, 0x70, + 0x0a, 0x01, 0x70, 0xb8, 0xc9, 0x1b, 0xc7, 0x01, 0x35, 0x19, 0xcb, 0x21, + 0x00, 0x01, 0x35, 0x11, 0xc6, 0x00, 0x91, 0x01, 0x5f, 0xe0, 0x47, 0x73, + 0x59, 0xc1, 0x85, 0x8b, 0xce, 0x6e, 0xd6, 0x01, 0x4e, 0xf9, 0x45, 0x02, + 0x6d, 0x41, 0x85, 0xa3, 0xc5, 0x02, 0xd2, 0x01, 0x2e, 0x61, 0xc4, 0x0d, + 0x21, 0x01, 0x02, 0xe0, 0xc5, 0x0b, 0x0a, 0x01, 0x58, 0xd1, 0xc6, 0x27, + 0x5e, 0x01, 0x72, 0x50, 0xc5, 0x33, 0x5d, 0x08, 0xc1, 0xd1, 0x42, 0x07, + 0xb2, 0xc1, 0x85, 0xaf, 0xc8, 0x14, 0x38, 0x08, 0xc1, 0xb8, 0x03, 0xc1, + 0x85, 0xbb, 0x91, 0x08, 0xc1, 0xa9, 0x87, 0x08, 0xc1, 0x99, 0xc9, 0xb2, + 0x2d, 0x08, 0xc1, 0x8b, 0x01, 0x85, 0xc7, 0x97, 0x08, 0xc1, 0x7b, 0x01, + 0x85, 0xcb, 0x8b, 0x08, 0xc1, 0x6a, 0x01, 0x85, 0xcf, 0x14, 0xc1, 0x85, + 0xd3, 0xc2, 0x00, 0xd0, 0x08, 0xc1, 0x51, 0x15, 0xc1, 0x85, 0xdd, 0xc2, + 0x02, 0x41, 0x08, 0xc1, 0x31, 0xc2, 0x00, 0xdb, 0x08, 0xc1, 0x29, 0xc2, + 0x19, 0x2c, 0x08, 0xc1, 0x19, 0xc2, 0x01, 0xc3, 0x08, 0xc1, 0x11, 0x04, + 0xc1, 0x85, 0xed, 0x12, 0xc1, 0x85, 0xf7, 0x10, 0xc1, 0x86, 0x01, 0x06, + 0xc1, 0x86, 0x17, 0x16, 0xc1, 0x86, 0x25, 0x0c, 0xc1, 0x86, 0x33, 0x05, + 0xc1, 0x86, 0x3d, 0x09, 0xc1, 0x86, 0x47, 0x0d, 0xc1, 0x86, 0x51, 0x83, + 0x08, 0xc0, 0x03, 0x01, 0x86, 0x5b, 0x91, 0x08, 0xc0, 0x41, 0x87, 0x08, + 0xc0, 0x31, 0x97, 0x08, 0xc0, 0x23, 0x01, 0x86, 0x67, 0x8b, 0x08, 0xc0, + 0x12, 0x01, 0x86, 0x6b, 0xc9, 0x23, 0x9f, 0x01, 0x17, 0x68, 0xc9, 0x23, + 0x9f, 0x01, 0x17, 0x00, 0xcc, 0x87, 0xbd, 0x0f, 0xad, 0xd0, 0x43, 0x02, + 0x5f, 0xc1, 0x86, 0x6f, 0xd5, 0x32, 0x57, 0x0d, 0xe3, 0x80, 0xc8, 0x00, + 0x5f, 0x0d, 0xe4, 0x43, 0x01, 0x86, 0x9e, 0xc4, 0x51, 0xb7, 0x0d, 0xe4, + 0x39, 0x0e, 0xc1, 0x86, 0xa4, 0xc6, 0x02, 0xd1, 0x0d, 0xe4, 0x29, 0xc3, + 0x02, 0xa3, 0x0d, 0xe4, 0x21, 0xc5, 0x1f, 0x0c, 0x0d, 0xe4, 0x11, 0xcb, + 0x8f, 0x94, 0x0d, 0xe4, 0x09, 0xc5, 0x31, 0xee, 0x0d, 0xe4, 0x00, 0x42, + 0x01, 0x6f, 0xc1, 0x86, 0xb0, 0xc6, 0xce, 0x8d, 0x0d, 0xe3, 0xd9, 0xc6, + 0x99, 0xc8, 0x0d, 0xe3, 0xd1, 0xd4, 0x3c, 0xdc, 0x0d, 0xe3, 0xb9, 0xc6, + 0x27, 0x9c, 0x0d, 0xe3, 0xb0, 0xcf, 0x61, 0x98, 0x0d, 0xe3, 0xa1, 0xd1, + 0x27, 0x91, 0x0d, 0xe3, 0x88, 0xc2, 0x00, 0x2b, 0x0d, 0xe1, 0xd1, 0x8a, + 0x0d, 0xe1, 0xc9, 0x91, 0x0d, 0xe2, 0xeb, 0x01, 0x86, 0xbf, 0xc2, 0x06, + 0xdb, 0x0d, 0xe2, 0xf9, 0x8b, 0x0d, 0xe2, 0xf1, 0x83, 0x0d, 0xe2, 0xe0, + 0x00, 0xc1, 0x86, 0xc3, 0x8a, 0x0d, 0xe0, 0x88, 0x00, 0xc1, 0x86, 0xcd, + 0x45, 0xd9, 0x89, 0xc1, 0x86, 0xfe, 0xc6, 0xcf, 0x17, 0x0d, 0xe2, 0x48, + 0x00, 0x41, 0x87, 0x1a, 0x00, 0xc1, 0x87, 0x38, 0x45, 0x44, 0xf8, 0x41, + 0x87, 0x49, 0x00, 0x41, 0x87, 0x59, 0x8a, 0x0d, 0xe0, 0xc1, 0xc2, 0x00, + 0x3f, 0x0d, 
0xe0, 0x81, 0x48, 0xb5, 0xfa, 0x41, 0x87, 0x6a, 0x8a, 0x0d, + 0xe0, 0xb9, 0x44, 0x08, 0x48, 0x41, 0x87, 0x72, 0x8e, 0x0d, 0xe0, 0xb0, + 0x8d, 0x0d, 0xe0, 0xa1, 0x00, 0x41, 0x87, 0x7a, 0x8a, 0x0d, 0xe0, 0x99, + 0xc2, 0x00, 0x3f, 0x0d, 0xe0, 0x68, 0xc2, 0x04, 0x4d, 0x0d, 0xe0, 0x70, + 0xc2, 0x04, 0x4d, 0x0d, 0xe0, 0x61, 0x47, 0xc0, 0x35, 0x41, 0x87, 0x84, + 0xc4, 0xe4, 0x37, 0x0d, 0xe1, 0xf0, 0xc8, 0xbb, 0x02, 0x0d, 0xe3, 0x50, + 0x99, 0x0d, 0xe2, 0x98, 0x97, 0x0d, 0xe2, 0xd9, 0x99, 0x0d, 0xe2, 0xd1, + 0xc2, 0x38, 0x2a, 0x0d, 0xe2, 0xc9, 0x83, 0x0d, 0xe2, 0x18, 0x8a, 0x0d, + 0xe2, 0xb9, 0xc2, 0x04, 0x4d, 0x0d, 0xe2, 0xa1, 0x8b, 0x0d, 0xe2, 0x50, + 0x97, 0x0d, 0xe2, 0x91, 0x87, 0x0d, 0xe2, 0x58, 0x87, 0x0d, 0xe2, 0x40, + 0xc2, 0x00, 0x59, 0x0d, 0xe2, 0x28, 0xca, 0xa2, 0xc4, 0x01, 0x71, 0xb1, + 0xcb, 0x98, 0x9a, 0x01, 0x71, 0xb8, 0xc5, 0x06, 0x82, 0x00, 0x04, 0x69, + 0x42, 0x01, 0x0f, 0xc1, 0x87, 0x8c, 0xc7, 0x27, 0x5d, 0x00, 0x02, 0xe3, + 0x01, 0x87, 0x98, 0xcd, 0x7b, 0x15, 0x0f, 0xb3, 0xf9, 0x55, 0x33, 0x92, + 0x41, 0x87, 0x9c, 0x14, 0xc1, 0x87, 0xa8, 0xc8, 0x68, 0xc5, 0x01, 0x18, + 0x81, 0x16, 0xc1, 0x87, 0xba, 0x15, 0xc1, 0x87, 0xcf, 0x12, 0xc1, 0x87, + 0xdb, 0x47, 0x00, 0x58, 0xc1, 0x87, 0xe7, 0xe0, 0x09, 0x27, 0x0f, 0xac, + 0xa9, 0xcc, 0x89, 0x79, 0x0f, 0xac, 0xa1, 0xc9, 0xb2, 0xf3, 0x01, 0x4d, + 0x81, 0xc5, 0x01, 0x95, 0x01, 0x4d, 0x1b, 0x01, 0x87, 0xf6, 0xd2, 0x4a, + 0x3f, 0x01, 0x70, 0x89, 0xcd, 0x2c, 0xb2, 0x01, 0x71, 0x71, 0xc5, 0x0a, + 0x8a, 0x01, 0x72, 0x08, 0x9f, 0x01, 0x37, 0x09, 0x9e, 0x01, 0x37, 0x00, + 0xd1, 0x53, 0x54, 0x01, 0x33, 0xd1, 0x45, 0x1a, 0xad, 0x41, 0x87, 0xfc, + 0x87, 0x05, 0x4a, 0x4b, 0x01, 0x88, 0x26, 0x03, 0xc1, 0x88, 0x2e, 0x91, + 0x05, 0x4a, 0x59, 0x97, 0x05, 0x4a, 0x41, 0x8b, 0x05, 0x4a, 0x38, 0x89, + 0x05, 0x4a, 0x78, 0x1b, 0xc1, 0x88, 0x36, 0xc2, 0x0e, 0x9a, 0x05, 0x4a, + 0x21, 0x09, 0xc1, 0x88, 0x40, 0x83, 0x05, 0x49, 0xa8, 0xc2, 0x01, 0x5d, + 0x05, 0x4a, 0x11, 0x83, 0x05, 0x49, 0xc0, 0x07, 0xc1, 0x88, 0x4a, 0xd5, + 0x32, 0x18, 0x01, 0x3e, 0x31, 0xcd, 0x25, 0xae, 0x00, 0x02, 0xeb, 0x01, + 0x88, 0x56, 0x0b, 0xc1, 0x88, 0x5a, 0x42, 0x00, 0x67, 0xc1, 0x88, 0x66, + 0xd3, 0x1f, 0xcd, 0x01, 0x70, 0x18, 0x10, 0xc1, 0x88, 0x75, 0x14, 0x41, + 0x88, 0x7f, 0xc9, 0x9b, 0x77, 0x01, 0x3e, 0xb1, 0x43, 0x02, 0x6f, 0xc1, + 0x88, 0x8b, 0xcf, 0x63, 0x5a, 0x0f, 0xdd, 0xe0, 0x43, 0x01, 0xd0, 0xc1, + 0x88, 0x97, 0xd5, 0x36, 0xb0, 0x0f, 0xab, 0xe8, 0xc7, 0xc9, 0xb2, 0x01, + 0x1d, 0xc9, 0xcd, 0x77, 0xfc, 0x01, 0x71, 0x08, 0xcc, 0x00, 0x33, 0x00, + 0x03, 0xeb, 0x01, 0x88, 0xaf, 0xc6, 0xb7, 0x3b, 0x01, 0x18, 0x49, 0xcd, + 0x69, 0x65, 0x01, 0x80, 0x68, 0x00, 0x41, 0x88, 0xb3, 0xc4, 0x20, 0xe6, + 0x01, 0x18, 0x59, 0x0b, 0x41, 0x88, 0xc5, 0x14, 0xc1, 0x88, 0xd1, 0xc3, + 0x00, 0x3a, 0x01, 0x15, 0x11, 0x0a, 0xc1, 0x88, 0xdd, 0xd5, 0x08, 0x89, + 0x01, 0x80, 0xa8, 0x45, 0x00, 0x5a, 0xc1, 0x88, 0xef, 0xd9, 0x1f, 0xc7, + 0x01, 0x70, 0x28, 0xcb, 0x8a, 0x0a, 0x01, 0x4e, 0xc9, 0x45, 0x01, 0xfd, + 0x41, 0x89, 0x05, 0xd6, 0x08, 0x88, 0x01, 0x4c, 0xc1, 0xd2, 0x21, 0x89, + 0x01, 0x80, 0x88, 0xca, 0x01, 0xfd, 0x01, 0x0f, 0x43, 0x01, 0x89, 0x21, + 0xc9, 0xb0, 0x6b, 0x01, 0x0c, 0xe8, 0x42, 0x00, 0x2c, 0xc1, 0x89, 0x25, + 0x42, 0x02, 0xa0, 0xc1, 0x89, 0x31, 0xd5, 0x37, 0xc1, 0x0f, 0xc5, 0x18, + 0xcf, 0x5b, 0xc3, 0x0f, 0xc2, 0x91, 0x42, 0x00, 0xe3, 0x41, 0x89, 0x3d, + 0x45, 0x11, 0x3a, 0xc1, 0x89, 0x49, 0x03, 0x41, 0x89, 0x55, 0x00, 0xc1, + 0x89, 0x61, 0xc5, 0x14, 0xa5, 0x01, 0x48, 0xd0, 0xcb, 0x82, 0xba, 0x01, + 0x0f, 0x11, 0x46, 0x00, 0x59, 0x41, 0x89, 0x7e, 0xc5, 0xca, 0xa4, 0x0f, + 0xb3, 0x71, 
0xd7, 0x2a, 0x6b, 0x0f, 0xc5, 0x28, 0x45, 0x04, 0x90, 0xc1, + 0x89, 0x8d, 0xd8, 0x23, 0xdb, 0x0f, 0xc5, 0x09, 0xdf, 0x0c, 0x65, 0x0f, + 0xc5, 0x48, 0xd0, 0x56, 0xda, 0x0f, 0xc1, 0xb1, 0xe0, 0x01, 0xe7, 0x0f, + 0xc5, 0x58, 0xd0, 0x5a, 0x22, 0x0f, 0xa8, 0x71, 0xcd, 0x0b, 0x91, 0x01, + 0x19, 0x51, 0xd4, 0x3b, 0x9c, 0x01, 0x4f, 0xe9, 0xdb, 0x18, 0x39, 0x00, + 0x05, 0xd8, 0xdc, 0x14, 0x4d, 0x01, 0x3d, 0x49, 0xd7, 0x29, 0xe1, 0x01, + 0x49, 0xc0, 0xc7, 0x00, 0xfa, 0x01, 0x03, 0x39, 0xc8, 0xb6, 0xca, 0x01, + 0x01, 0x71, 0xc9, 0xb3, 0x9e, 0x01, 0x01, 0x59, 0xc4, 0x01, 0xc3, 0x01, + 0x00, 0x78, 0xd6, 0x2d, 0x4c, 0x00, 0x2c, 0x69, 0xc4, 0xb9, 0x3c, 0x0f, + 0xc8, 0xe1, 0xcb, 0x8f, 0xf7, 0x00, 0x7e, 0xaa, 0x01, 0x89, 0x99, 0xc4, + 0x00, 0x49, 0x01, 0x5d, 0x81, 0xc5, 0x00, 0x2c, 0x01, 0x5d, 0x88, 0xc4, + 0x00, 0x49, 0x01, 0x5d, 0x91, 0xc5, 0x00, 0x2c, 0x01, 0x5d, 0x98, 0xc2, + 0x02, 0xae, 0x01, 0x5d, 0xa1, 0xc4, 0x03, 0xc8, 0x01, 0x5d, 0xb0, 0xc2, + 0x02, 0xae, 0x01, 0x5d, 0xa9, 0xc4, 0x03, 0xc8, 0x01, 0x5d, 0xb8, 0xc7, + 0xc9, 0x42, 0x0f, 0x9d, 0x11, 0xc5, 0xdb, 0x41, 0x0f, 0xb7, 0xe0, 0xc6, + 0xd0, 0x2b, 0x0f, 0x93, 0x21, 0xc2, 0x00, 0x59, 0x0f, 0x93, 0x10, 0x00, + 0x41, 0x89, 0x9f, 0x0b, 0xc1, 0x89, 0xb1, 0xc3, 0x09, 0x9e, 0x01, 0x0b, + 0x18, 0xc2, 0x22, 0xcc, 0x01, 0x0b, 0x2b, 0x01, 0x89, 0xc3, 0xc4, 0x18, + 0x10, 0x01, 0x0b, 0x30, 0xc2, 0x00, 0xc4, 0x01, 0x0b, 0x4b, 0x01, 0x89, + 0xc9, 0x19, 0xc1, 0x89, 0xcf, 0xc4, 0x02, 0xde, 0x01, 0x0b, 0x10, 0xc5, + 0x66, 0xb1, 0x01, 0x0b, 0x51, 0xc4, 0x00, 0x2d, 0x01, 0x0b, 0x38, 0x42, + 0x09, 0x40, 0xc1, 0x89, 0xd9, 0xcb, 0x9a, 0x05, 0x08, 0x0c, 0x91, 0xcd, + 0x7a, 0xd4, 0x08, 0x0c, 0xc0, 0x46, 0x00, 0x8b, 0x41, 0x89, 0xe5, 0xc6, + 0x02, 0xe9, 0x0f, 0x8b, 0x61, 0xc6, 0x42, 0xd4, 0x0f, 0x8b, 0x59, 0xc6, + 0x5c, 0x5b, 0x0f, 0x8b, 0x50, 0xd8, 0x21, 0x3b, 0x01, 0x70, 0x38, 0xc5, + 0x06, 0x67, 0x08, 0x73, 0xe9, 0xc7, 0x08, 0x79, 0x08, 0x73, 0xe1, 0xc4, + 0x01, 0xce, 0x08, 0x73, 0xd8, 0xc8, 0x0d, 0x03, 0x08, 0x73, 0xd1, 0xc2, + 0x0d, 0x10, 0x08, 0x73, 0x88, 0xc8, 0x0d, 0x03, 0x08, 0x73, 0xc9, 0x9b, + 0x08, 0x73, 0x80, 0x44, 0x18, 0x10, 0xc1, 0x89, 0xf1, 0x42, 0x22, 0xcc, + 0x41, 0x89, 0xfd, 0x0b, 0xc1, 0x8a, 0x09, 0x11, 0x41, 0x8a, 0x15, 0x0a, + 0xc1, 0x8a, 0x21, 0x19, 0xc1, 0x8a, 0x2d, 0xc2, 0x00, 0xc4, 0x08, 0x73, + 0x48, 0xc4, 0x18, 0x10, 0x08, 0x73, 0x31, 0xc2, 0x22, 0xcc, 0x08, 0x73, + 0x28, 0xc3, 0x0d, 0x14, 0x08, 0x73, 0x21, 0xc3, 0x09, 0x9e, 0x08, 0x73, + 0x18, 0xc4, 0x02, 0xde, 0x08, 0x73, 0x11, 0xc2, 0x02, 0xa0, 0x08, 0x73, + 0x08, 0x08, 0xc1, 0x8a, 0x39, 0x91, 0x00, 0xb5, 0x73, 0x01, 0x8a, 0x45, + 0x15, 0xc1, 0x8a, 0x63, 0x8d, 0x00, 0xb7, 0x8b, 0x01, 0x8a, 0x7c, 0x9a, + 0x00, 0xb7, 0x51, 0x93, 0x00, 0xb7, 0x49, 0x0b, 0xc1, 0x8a, 0x82, 0x0e, + 0xc1, 0x8a, 0xa3, 0x85, 0x00, 0xb6, 0x6b, 0x01, 0x8a, 0xaf, 0x87, 0x00, + 0xb6, 0x13, 0x01, 0x8a, 0xbf, 0x86, 0x00, 0xb6, 0x8b, 0x01, 0x8a, 0xd7, + 0xcc, 0x84, 0xe1, 0x00, 0xb6, 0xb9, 0xd8, 0x25, 0x2b, 0x00, 0xb6, 0x91, + 0x16, 0xc1, 0x8a, 0xe3, 0x9c, 0x00, 0xb6, 0x71, 0x03, 0xc1, 0x8a, 0xef, + 0xcf, 0x60, 0xe4, 0x00, 0xb6, 0x41, 0x89, 0x00, 0xb5, 0xab, 0x01, 0x8b, + 0x07, 0xc7, 0xc7, 0xf2, 0x00, 0xb6, 0x19, 0xd1, 0x57, 0x0c, 0x00, 0xb5, + 0xf1, 0x42, 0x00, 0xd0, 0xc1, 0x8b, 0x11, 0x99, 0x00, 0xb5, 0x2b, 0x01, + 0x8b, 0x1d, 0xd0, 0x5d, 0x82, 0x00, 0xb5, 0x89, 0x9b, 0x00, 0xb5, 0x23, + 0x01, 0x8b, 0x23, 0xc9, 0xb4, 0x88, 0x00, 0xb5, 0x11, 0x98, 0x00, 0xb5, + 0x08, 0xa1, 0x70, 0x0c, 0x49, 0xa0, 0x70, 0x0c, 0x41, 0xa6, 0x70, 0x0c, + 0x71, 0xa5, 0x70, 0x0c, 0x69, 0xa4, 0x70, 0x0c, 0x61, 0xa3, 0x70, 0x0c, + 0x59, 0xa2, 
0x70, 0x0c, 0x51, 0x9f, 0x70, 0x0c, 0x39, 0x9e, 0x70, 0x0c, + 0x31, 0x9d, 0x70, 0x0c, 0x28, 0xa0, 0x70, 0x0b, 0x01, 0x9f, 0x70, 0x0a, + 0xf9, 0x9e, 0x70, 0x0a, 0xf1, 0x9d, 0x70, 0x0a, 0xe9, 0xa6, 0x70, 0x0b, + 0x31, 0xa5, 0x70, 0x0b, 0x29, 0xa4, 0x70, 0x0b, 0x21, 0xa3, 0x70, 0x0b, + 0x19, 0xa2, 0x70, 0x0b, 0x11, 0xa1, 0x70, 0x0b, 0x08, 0xa6, 0x70, 0x0a, + 0xe1, 0xa5, 0x70, 0x0a, 0xd9, 0xa4, 0x70, 0x0a, 0xd1, 0xa3, 0x70, 0x0a, + 0xc9, 0xa2, 0x70, 0x0a, 0xc1, 0xa1, 0x70, 0x0a, 0xb9, 0xa0, 0x70, 0x0a, + 0xb1, 0x9f, 0x70, 0x0a, 0xa9, 0x9e, 0x70, 0x0a, 0xa1, 0x9d, 0x70, 0x0a, + 0x98, 0xa6, 0x70, 0x0d, 0xb1, 0xa5, 0x70, 0x0d, 0xa9, 0xa4, 0x70, 0x0d, + 0xa1, 0xa3, 0x70, 0x0d, 0x99, 0xa2, 0x70, 0x0d, 0x91, 0xa1, 0x70, 0x0d, + 0x89, 0xa0, 0x70, 0x0d, 0x81, 0x9f, 0x70, 0x0d, 0x79, 0x9e, 0x70, 0x0d, + 0x71, 0x9d, 0x70, 0x0d, 0x68, 0xa6, 0x70, 0x0d, 0x61, 0xa5, 0x70, 0x0d, + 0x59, 0xa4, 0x70, 0x0d, 0x51, 0xa3, 0x70, 0x0d, 0x49, 0xa2, 0x70, 0x0d, + 0x41, 0xa1, 0x70, 0x0d, 0x39, 0xa0, 0x70, 0x0d, 0x31, 0x9f, 0x70, 0x0d, + 0x29, 0x9e, 0x70, 0x0d, 0x21, 0x9d, 0x70, 0x0d, 0x18, 0xa6, 0x70, 0x0d, + 0x11, 0xa5, 0x70, 0x0d, 0x09, 0xa4, 0x70, 0x0d, 0x01, 0xa3, 0x70, 0x0c, + 0xf9, 0xa2, 0x70, 0x0c, 0xf1, 0xa1, 0x70, 0x0c, 0xe9, 0xa0, 0x70, 0x0c, + 0xe1, 0x9f, 0x70, 0x0c, 0xd9, 0x9e, 0x70, 0x0c, 0xd1, 0x9d, 0x70, 0x0c, + 0xc8, 0xa6, 0x70, 0x0c, 0xc1, 0xa5, 0x70, 0x0c, 0xb9, 0xa4, 0x70, 0x0c, + 0xb1, 0xa3, 0x70, 0x0c, 0xa9, 0xa2, 0x70, 0x0c, 0xa1, 0xa1, 0x70, 0x0c, + 0x99, 0xa0, 0x70, 0x0c, 0x91, 0x9f, 0x70, 0x0c, 0x89, 0x9e, 0x70, 0x0c, + 0x81, 0x9d, 0x70, 0x0c, 0x78, 0xa6, 0x70, 0x0c, 0x21, 0xa5, 0x70, 0x0c, + 0x19, 0xa4, 0x70, 0x0c, 0x11, 0xa3, 0x70, 0x0c, 0x09, 0xa2, 0x70, 0x0c, + 0x01, 0xa1, 0x70, 0x0b, 0xf9, 0xa0, 0x70, 0x0b, 0xf1, 0x9f, 0x70, 0x0b, + 0xe9, 0x9e, 0x70, 0x0b, 0xe1, 0x9d, 0x70, 0x0b, 0xd8, 0xa6, 0x70, 0x0b, + 0xd1, 0xa5, 0x70, 0x0b, 0xc9, 0xa4, 0x70, 0x0b, 0xc1, 0xa3, 0x70, 0x0b, + 0xb9, 0xa2, 0x70, 0x0b, 0xb1, 0xa1, 0x70, 0x0b, 0xa9, 0xa0, 0x70, 0x0b, + 0xa1, 0x9f, 0x70, 0x0b, 0x99, 0x9e, 0x70, 0x0b, 0x91, 0x9d, 0x70, 0x0b, + 0x88, 0xa6, 0x70, 0x0b, 0x81, 0xa5, 0x70, 0x0b, 0x79, 0xa4, 0x70, 0x0b, + 0x71, 0xa3, 0x70, 0x0b, 0x69, 0xa2, 0x70, 0x0b, 0x61, 0xa1, 0x70, 0x0b, + 0x59, 0xa0, 0x70, 0x0b, 0x51, 0x9f, 0x70, 0x0b, 0x49, 0x9e, 0x70, 0x0b, + 0x41, 0x9d, 0x70, 0x0b, 0x38, 0xa3, 0x70, 0x0f, 0x79, 0xa2, 0x70, 0x0f, + 0x71, 0xa1, 0x70, 0x0f, 0x69, 0xa0, 0x70, 0x0f, 0x61, 0x9f, 0x70, 0x0f, + 0x59, 0x9e, 0x70, 0x0f, 0x51, 0x9d, 0x70, 0x0f, 0x48, 0xa6, 0x70, 0x0f, + 0x41, 0xa5, 0x70, 0x0f, 0x39, 0xa4, 0x70, 0x0f, 0x31, 0xa3, 0x70, 0x0f, + 0x29, 0xa2, 0x70, 0x0f, 0x21, 0xa1, 0x70, 0x0f, 0x19, 0xa0, 0x70, 0x0f, + 0x11, 0x9f, 0x70, 0x0f, 0x09, 0x9e, 0x70, 0x0f, 0x01, 0x9d, 0x70, 0x0e, + 0xf8, 0xa6, 0x70, 0x0e, 0xf1, 0xa5, 0x70, 0x0e, 0xe9, 0xa4, 0x70, 0x0e, + 0xe1, 0xa3, 0x70, 0x0e, 0xd9, 0xa2, 0x70, 0x0e, 0xd1, 0xa1, 0x70, 0x0e, + 0xc9, 0xa0, 0x70, 0x0e, 0xc1, 0x9f, 0x70, 0x0e, 0xb9, 0x9e, 0x70, 0x0e, + 0xb1, 0x9d, 0x70, 0x0e, 0xa8, 0xa6, 0x70, 0x0e, 0xa1, 0xa5, 0x70, 0x0e, + 0x99, 0xa4, 0x70, 0x0e, 0x91, 0xa3, 0x70, 0x0e, 0x89, 0xa2, 0x70, 0x0e, + 0x81, 0xa1, 0x70, 0x0e, 0x79, 0xa0, 0x70, 0x0e, 0x71, 0x9f, 0x70, 0x0e, + 0x69, 0x9e, 0x70, 0x0e, 0x61, 0x9d, 0x70, 0x0e, 0x58, 0xa6, 0x70, 0x0e, + 0x51, 0xa5, 0x70, 0x0e, 0x49, 0xa4, 0x70, 0x0e, 0x41, 0xa3, 0x70, 0x0e, + 0x39, 0xa2, 0x70, 0x0e, 0x31, 0xa1, 0x70, 0x0e, 0x29, 0xa0, 0x70, 0x0e, + 0x21, 0x9f, 0x70, 0x0e, 0x19, 0x9e, 0x70, 0x0e, 0x11, 0x9d, 0x70, 0x0e, + 0x08, 0xa6, 0x70, 0x0e, 0x01, 0xa5, 0x70, 0x0d, 0xf9, 0xa4, 0x70, 0x0d, + 0xf1, 0xa3, 
0x70, 0x0d, 0xe9, 0xa2, 0x70, 0x0d, 0xe1, 0xa1, 0x70, 0x0d, + 0xd9, 0xa0, 0x70, 0x0d, 0xd1, 0x9f, 0x70, 0x0d, 0xc9, 0x9e, 0x70, 0x0d, + 0xc1, 0x9d, 0x70, 0x0d, 0xb8, 0x87, 0x05, 0x2f, 0x0b, 0x01, 0x8b, 0x27, + 0x0a, 0xc1, 0x8b, 0x32, 0x19, 0xc1, 0x8b, 0x55, 0x12, 0xc1, 0x8b, 0x78, + 0x04, 0xc1, 0x8b, 0x92, 0x0f, 0xc1, 0x8b, 0xb0, 0x0d, 0xc1, 0x8b, 0xd4, + 0x09, 0xc1, 0x8b, 0xf5, 0x08, 0xc1, 0x8c, 0x13, 0x18, 0xc1, 0x8c, 0x2d, + 0x16, 0xc1, 0x8c, 0x47, 0x06, 0xc1, 0x8c, 0x65, 0x0e, 0xc1, 0x8c, 0x83, + 0x14, 0xc1, 0x8c, 0x9d, 0x10, 0xc1, 0x8c, 0xb7, 0x15, 0xc1, 0x8c, 0xe4, + 0x1c, 0xc1, 0x8d, 0x02, 0x05, 0xc1, 0x8d, 0x20, 0x0c, 0xc1, 0x8d, 0x3a, + 0x1b, 0xc1, 0x8d, 0x54, 0x8b, 0x05, 0x29, 0x23, 0x01, 0x8d, 0x6e, 0x83, + 0x05, 0x2a, 0x4b, 0x01, 0x8d, 0x72, 0x91, 0x05, 0x2d, 0xd3, 0x01, 0x8d, + 0x76, 0x97, 0x05, 0x2c, 0xaa, 0x01, 0x8d, 0x81, 0x08, 0xc1, 0x8d, 0x85, + 0x0d, 0xc1, 0x8d, 0x91, 0x16, 0xc1, 0x8d, 0x9d, 0xc3, 0xe6, 0x5f, 0x05, + 0x30, 0xb1, 0xc4, 0x10, 0xd0, 0x05, 0x30, 0xb9, 0x06, 0xc1, 0x8d, 0xaf, + 0xc4, 0x9d, 0xd8, 0x05, 0x30, 0xf8, 0xc2, 0x02, 0xa0, 0x05, 0x31, 0x11, + 0xc4, 0x02, 0xde, 0x05, 0x31, 0x18, 0xc3, 0x09, 0x9e, 0x05, 0x31, 0x21, + 0xc3, 0x0d, 0x14, 0x05, 0x31, 0x28, 0xc2, 0x22, 0xcc, 0x05, 0x31, 0x31, + 0xc4, 0x18, 0x10, 0x05, 0x31, 0x38, 0x9f, 0x0f, 0xdb, 0x81, 0xa0, 0x0f, + 0xdb, 0x89, 0xa1, 0x0f, 0xdb, 0x91, 0xa2, 0x0f, 0xdb, 0x99, 0xa3, 0x0f, + 0xdb, 0xa1, 0xa4, 0x0f, 0xdb, 0xa8, 0xd6, 0x30, 0x7a, 0x01, 0x3e, 0x51, + 0xd5, 0x38, 0x00, 0x01, 0x4e, 0x81, 0xd6, 0x30, 0x38, 0x01, 0x57, 0x11, + 0xd5, 0x34, 0xcd, 0x01, 0x57, 0x20, 0x00, 0x41, 0x8d, 0xb9, 0x42, 0x00, + 0x03, 0xc1, 0x8d, 0xc5, 0xcc, 0x89, 0x55, 0x0f, 0xb5, 0x31, 0xc4, 0x1e, + 0xc9, 0x01, 0x71, 0x78, 0xc4, 0x01, 0xc3, 0x01, 0x81, 0x8b, 0x01, 0x8d, + 0xd4, 0xd6, 0x31, 0x2a, 0x01, 0x81, 0x92, 0x01, 0x8d, 0xd8, 0x46, 0x0f, + 0x88, 0xc1, 0x8d, 0xde, 0xcb, 0x58, 0xc7, 0x0f, 0xbd, 0x31, 0x46, 0x01, + 0xfc, 0xc1, 0x8d, 0xea, 0xcf, 0x61, 0xd4, 0x0f, 0xb3, 0xe9, 0x15, 0xc1, + 0x8d, 0xf6, 0xd4, 0x3c, 0xb4, 0x0f, 0xbd, 0x98, 0xcc, 0x07, 0xc7, 0x01, + 0x16, 0xc9, 0xc9, 0x00, 0xca, 0x01, 0x16, 0xc0, 0xc7, 0xc2, 0xb2, 0x00, + 0xe7, 0xb9, 0xcb, 0x40, 0xe1, 0x00, 0xe7, 0x91, 0x48, 0x14, 0x39, 0x41, + 0x8e, 0x08, 0xd3, 0x40, 0xd9, 0x00, 0xe7, 0x99, 0xd3, 0x3f, 0xa9, 0x00, + 0xe7, 0x81, 0x50, 0x5f, 0x62, 0x41, 0x8e, 0x23, 0xc8, 0x74, 0xc4, 0x00, + 0xe7, 0x2b, 0x01, 0x8e, 0x2f, 0xc6, 0x74, 0xc6, 0x00, 0xe7, 0x1b, 0x01, + 0x8e, 0x35, 0xc7, 0x02, 0x40, 0x00, 0xe7, 0x10, 0x45, 0x00, 0x5a, 0xc1, + 0x8e, 0x3b, 0xc7, 0x0e, 0x70, 0x00, 0xe6, 0xe8, 0xc8, 0x9e, 0xe8, 0x00, + 0xe7, 0xc1, 0x43, 0x61, 0x97, 0x41, 0x8e, 0x47, 0xc5, 0x00, 0xd4, 0x00, + 0xe7, 0xa1, 0xc5, 0x05, 0x02, 0x00, 0xe6, 0xc0, 0xcf, 0x67, 0xce, 0x00, + 0xe6, 0xf9, 0xcd, 0x04, 0xfa, 0x00, 0xe6, 0xf1, 0xcd, 0x7d, 0x78, 0x00, + 0xe6, 0xd8, 0xce, 0x74, 0xbe, 0x00, 0xe6, 0xe1, 0xc6, 0xcd, 0xa9, 0x00, + 0xe6, 0x80, 0xdb, 0x17, 0xe8, 0x00, 0xe6, 0xbb, 0x01, 0x8e, 0x4d, 0xd3, + 0x02, 0x34, 0x00, 0xe6, 0xb1, 0xde, 0x0f, 0xf4, 0x00, 0xe6, 0xa8, 0xc2, + 0x00, 0x51, 0x08, 0x2b, 0x89, 0x87, 0x08, 0x2b, 0x90, 0x87, 0x08, 0x2b, + 0x99, 0xc2, 0x01, 0x7f, 0x08, 0x2b, 0xa0, 0x87, 0x08, 0x2b, 0xa9, 0xc2, + 0x01, 0x7f, 0x08, 0x2b, 0xb0, 0x8b, 0x08, 0x2b, 0xb8, 0xc2, 0x00, 0xd0, + 0x08, 0x2b, 0xe9, 0x83, 0x08, 0x2b, 0xe0, 0xc2, 0x1c, 0x52, 0x08, 0x2b, + 0xf8, 0xc2, 0x00, 0xdb, 0x08, 0x2c, 0x19, 0x83, 0x08, 0x2c, 0x10, 0x87, + 0x08, 0x2c, 0x29, 0xc2, 0x1c, 0x52, 0x08, 0x2c, 0x30, 0xc2, 0x01, 0x7f, + 0x08, 0x2c, 0x69, 0x87, 0x08, 0x2c, 0x60, 0x87, 0x08, 0x2c, 0x71, 0xc2, + 0x01, 0x7f, 
0x08, 0x2c, 0x78, 0xc2, 0x00, 0x51, 0x08, 0x2c, 0xc1, 0x87, + 0x08, 0x2c, 0xc8, 0x87, 0x08, 0x2c, 0xd1, 0xc2, 0x01, 0x7f, 0x08, 0x2c, + 0xd8, 0x87, 0x08, 0x2c, 0xe1, 0xc2, 0x01, 0x7f, 0x08, 0x2c, 0xe8, 0x8b, + 0x08, 0x2c, 0xf0, 0x83, 0x08, 0x2d, 0x19, 0xc2, 0x00, 0xd0, 0x08, 0x2d, + 0x20, 0xc2, 0x1c, 0x52, 0x08, 0x2d, 0x30, 0x83, 0x08, 0x2d, 0x49, 0xc2, + 0x00, 0xdb, 0x08, 0x2d, 0x50, 0x87, 0x08, 0x2d, 0x61, 0xc2, 0x1c, 0x52, + 0x08, 0x2d, 0x68, 0x87, 0x08, 0x2d, 0x99, 0xc2, 0x01, 0x7f, 0x08, 0x2d, + 0xa0, 0x87, 0x08, 0x2d, 0xa9, 0xc2, 0x01, 0x7f, 0x08, 0x2d, 0xb0, 0xc7, + 0x3f, 0xe8, 0x01, 0x0a, 0xe9, 0xc6, 0xd3, 0x5b, 0x01, 0x0a, 0xd0, 0xc7, + 0x3f, 0xe8, 0x01, 0x0a, 0xe1, 0xc6, 0x9c, 0x06, 0x01, 0x0a, 0xb9, 0xc8, + 0x08, 0x79, 0x00, 0x05, 0xf0, 0xc6, 0x9c, 0x06, 0x01, 0x0a, 0xb1, 0xc6, + 0x8d, 0x4d, 0x01, 0x0a, 0xa0, 0xc4, 0x9d, 0x74, 0x01, 0x0a, 0xc9, 0xc6, + 0xcf, 0x29, 0x01, 0x0a, 0x80, 0xc4, 0x06, 0x68, 0x01, 0x0a, 0x99, 0xc4, + 0x0f, 0x1f, 0x01, 0x0a, 0x90, 0xca, 0x1f, 0x0e, 0x70, 0x03, 0x01, 0xcf, + 0x54, 0xbb, 0x70, 0x01, 0xf0, 0xc7, 0x80, 0x2f, 0x70, 0x02, 0xf9, 0x07, + 0xc1, 0x8e, 0x53, 0x45, 0x0b, 0x12, 0x41, 0x8e, 0x5f, 0xd0, 0x08, 0xf7, + 0x70, 0x02, 0xf1, 0x11, 0x41, 0x8e, 0x6b, 0x45, 0x00, 0x2d, 0xc1, 0x8e, + 0x77, 0xce, 0x61, 0xd5, 0x70, 0x02, 0xe0, 0xcb, 0x2c, 0xb4, 0x70, 0x01, + 0xf9, 0xcc, 0x01, 0xbb, 0x70, 0x01, 0x10, 0xca, 0x0e, 0xbe, 0x70, 0x01, + 0xe9, 0xcf, 0x0f, 0x0a, 0x70, 0x01, 0x08, 0xc8, 0x52, 0x00, 0x70, 0x01, + 0xd9, 0xc6, 0x27, 0x5e, 0x70, 0x01, 0x79, 0xc4, 0x40, 0x89, 0x70, 0x01, + 0x00, 0x45, 0x09, 0x98, 0xc1, 0x8e, 0x89, 0xca, 0x99, 0x61, 0x70, 0x01, + 0x20, 0xc8, 0x60, 0xf4, 0x70, 0x01, 0x59, 0xcb, 0x8e, 0x13, 0x70, 0x01, + 0x28, 0xc7, 0x0b, 0x00, 0x70, 0x01, 0x51, 0xc9, 0x2d, 0x85, 0x70, 0x01, + 0x39, 0xc8, 0x36, 0x21, 0x70, 0x01, 0x30, 0x97, 0x00, 0xbb, 0x99, 0x8b, + 0x00, 0xbb, 0x90, 0xc2, 0x0d, 0xf6, 0x00, 0xbb, 0x81, 0xc2, 0x01, 0x4a, + 0x00, 0xbb, 0x79, 0xc2, 0x00, 0xdb, 0x00, 0xbb, 0x71, 0xc2, 0x19, 0x2c, + 0x00, 0xbb, 0x61, 0xc2, 0x01, 0xc3, 0x00, 0xbb, 0x59, 0xc2, 0x01, 0x5d, + 0x00, 0xbb, 0x51, 0xc2, 0x00, 0xb0, 0x00, 0xbb, 0x49, 0x10, 0xc1, 0x8e, + 0xad, 0xc2, 0x0e, 0x9a, 0x00, 0xbb, 0x39, 0xc2, 0x01, 0x6f, 0x00, 0xbb, + 0x31, 0xc2, 0x01, 0x30, 0x00, 0xbb, 0x21, 0xc2, 0x02, 0x2b, 0x00, 0xbb, + 0x19, 0x97, 0x00, 0xbb, 0x11, 0x8b, 0x00, 0xbb, 0x09, 0x83, 0x00, 0xbb, + 0x00, 0x83, 0x00, 0xb8, 0x03, 0x01, 0x8e, 0xb7, 0xc2, 0x00, 0xd0, 0x00, + 0xb8, 0x89, 0xc2, 0x0d, 0xf6, 0x00, 0xb8, 0x81, 0xc2, 0x01, 0x4a, 0x00, + 0xb8, 0x79, 0xc2, 0x00, 0xdb, 0x00, 0xb8, 0x71, 0xc2, 0x00, 0x39, 0x00, + 0xb8, 0x69, 0xc2, 0x19, 0x2c, 0x00, 0xb8, 0x61, 0xc2, 0x01, 0xc3, 0x00, + 0xb8, 0x59, 0xc2, 0x01, 0x5d, 0x00, 0xb8, 0x51, 0xc2, 0x00, 0xb0, 0x00, + 0xb8, 0x49, 0x10, 0xc1, 0x8e, 0xbd, 0xc2, 0x0e, 0x9a, 0x00, 0xb8, 0x39, + 0xc2, 0x01, 0x6f, 0x00, 0xb8, 0x31, 0xc2, 0x01, 0x30, 0x00, 0xb8, 0x21, + 0xc2, 0x02, 0x2b, 0x00, 0xb8, 0x19, 0x97, 0x00, 0xb8, 0x11, 0x8b, 0x00, + 0xb8, 0x08, 0xc8, 0x7a, 0x8b, 0x00, 0xb8, 0xa9, 0xc6, 0x1e, 0x95, 0x00, + 0xb8, 0xa0, 0x97, 0x00, 0xb8, 0x99, 0x8b, 0x00, 0xb8, 0x90, 0x4a, 0xa3, + 0x3c, 0xc1, 0x8e, 0xc7, 0xce, 0x1c, 0x92, 0x0b, 0x7f, 0x00, 0x46, 0x09, + 0x97, 0xc1, 0x8e, 0xe7, 0x47, 0x02, 0x0e, 0x41, 0x8f, 0x0b, 0x44, 0x00, + 0xbb, 0xc1, 0x8f, 0x77, 0xd1, 0x55, 0xeb, 0x08, 0xff, 0x79, 0xc9, 0xaf, + 0x9c, 0x08, 0xff, 0x61, 0xcc, 0x8a, 0x69, 0x08, 0xff, 0x38, 0xc9, 0xab, + 0x0a, 0x08, 0xff, 0x69, 0x4b, 0x9a, 0x10, 0x41, 0x8f, 0x9f, 0xcb, 0x94, + 0xfe, 0x08, 0xff, 0x59, 0xcd, 0x73, 0x0d, 0x00, 0x5e, 0xb9, 0xcc, 0x8a, + 0x51, 0x00, 
0x5f, 0xc0, 0xcb, 0x97, 0xea, 0x08, 0xff, 0x51, 0xca, 0x97, + 0xa9, 0x00, 0x5f, 0xb8, 0xc8, 0x42, 0xd2, 0x08, 0xff, 0x31, 0x46, 0x02, + 0x0f, 0x41, 0x8f, 0xab, 0xd3, 0x43, 0xf7, 0x08, 0xff, 0x29, 0x45, 0x09, + 0x98, 0xc1, 0x90, 0x12, 0xc7, 0xbf, 0xf6, 0x00, 0x5f, 0x99, 0xc9, 0xb0, + 0x59, 0x00, 0x5f, 0xb0, 0xd8, 0x25, 0x8b, 0x08, 0xfe, 0xa1, 0x46, 0x02, + 0xdd, 0xc1, 0x90, 0x36, 0x44, 0x05, 0x14, 0x41, 0x90, 0x4e, 0x03, 0xc1, + 0x90, 0x74, 0x8b, 0x00, 0x5d, 0xfb, 0x01, 0x90, 0x80, 0x97, 0x00, 0x5e, + 0x0b, 0x01, 0x90, 0x84, 0x87, 0x00, 0x5e, 0x33, 0x01, 0x90, 0x88, 0x91, + 0x00, 0x5e, 0x52, 0x01, 0x90, 0x8c, 0xc3, 0x09, 0x41, 0x00, 0x5f, 0x81, + 0x44, 0x05, 0x14, 0xc1, 0x90, 0x90, 0xc4, 0x00, 0xba, 0x00, 0x5f, 0xd0, + 0xc4, 0x26, 0x78, 0x08, 0xb6, 0x49, 0xc5, 0x06, 0xdb, 0x08, 0xb6, 0x41, + 0x15, 0xc1, 0x90, 0x9c, 0x08, 0xc1, 0x90, 0xa8, 0x16, 0xc1, 0x90, 0xb4, + 0xc3, 0x05, 0x14, 0x08, 0xb6, 0x09, 0xc4, 0x15, 0xe7, 0x08, 0xb6, 0x00, + 0x83, 0x08, 0xb4, 0x03, 0x01, 0x90, 0xc0, 0x14, 0xc1, 0x90, 0xd2, 0xc2, + 0x00, 0xd0, 0x08, 0xb5, 0x49, 0x15, 0xc1, 0x90, 0xdc, 0xc2, 0x02, 0x41, + 0x08, 0xb5, 0x31, 0xc2, 0x00, 0xdb, 0x08, 0xb5, 0x29, 0xc2, 0x19, 0x2c, + 0x08, 0xb5, 0x19, 0xc2, 0x01, 0xc3, 0x08, 0xb5, 0x11, 0x04, 0xc1, 0x90, + 0xe6, 0x12, 0xc1, 0x90, 0xf0, 0x10, 0xc1, 0x90, 0xfa, 0x06, 0xc1, 0x91, + 0x10, 0x16, 0xc1, 0x91, 0x1e, 0x0c, 0xc1, 0x91, 0x2c, 0x05, 0xc1, 0x91, + 0x36, 0x09, 0xc1, 0x91, 0x40, 0x0d, 0xc1, 0x91, 0x4a, 0x91, 0x08, 0xb4, + 0x41, 0x87, 0x08, 0xb4, 0x31, 0x97, 0x08, 0xb4, 0x23, 0x01, 0x91, 0x54, + 0x8b, 0x08, 0xb4, 0x12, 0x01, 0x91, 0x58, 0xc5, 0x33, 0x5d, 0x08, 0xb5, + 0xb9, 0x42, 0x07, 0xb2, 0xc1, 0x91, 0x5c, 0xc8, 0x14, 0x38, 0x08, 0xb5, + 0x58, 0x03, 0xc1, 0x91, 0x68, 0x91, 0x08, 0xb5, 0xa1, 0x87, 0x08, 0xb5, + 0x91, 0x97, 0x08, 0xb5, 0x83, 0x01, 0x91, 0x74, 0x8b, 0x08, 0xb5, 0x72, + 0x01, 0x91, 0x78, 0xc5, 0xde, 0x25, 0x00, 0xd5, 0x69, 0x0a, 0xc1, 0x91, + 0x7c, 0x42, 0x0d, 0xf6, 0xc1, 0x91, 0x88, 0x0d, 0xc1, 0x91, 0x9d, 0x44, + 0x38, 0x7e, 0xc1, 0x91, 0xb2, 0x14, 0xc1, 0x91, 0xc7, 0xc6, 0xca, 0xc7, + 0x00, 0xd5, 0x29, 0xc5, 0xdc, 0xcc, 0x00, 0xd5, 0x03, 0x01, 0x91, 0xd3, + 0x45, 0x28, 0xb1, 0x41, 0x91, 0xd9, 0xc4, 0x26, 0x78, 0x00, 0xd4, 0xc9, + 0xc5, 0x06, 0xdb, 0x00, 0xd4, 0xc1, 0x15, 0xc1, 0x91, 0xe1, 0x08, 0xc1, + 0x91, 0xed, 0x16, 0xc1, 0x91, 0xf9, 0xc3, 0x05, 0x14, 0x00, 0xd4, 0x89, + 0xc4, 0x15, 0xe7, 0x00, 0xd4, 0x80, 0xc4, 0x26, 0x78, 0x00, 0xd4, 0x49, + 0xc5, 0x06, 0xdb, 0x00, 0xd4, 0x41, 0x15, 0xc1, 0x92, 0x05, 0x08, 0xc1, + 0x92, 0x11, 0x16, 0xc1, 0x92, 0x1d, 0xc3, 0x05, 0x14, 0x00, 0xd4, 0x09, + 0xc4, 0x15, 0xe7, 0x00, 0xd4, 0x00, 0xd9, 0x1d, 0xd3, 0x00, 0xd3, 0xf9, + 0x4d, 0x30, 0x92, 0x41, 0x92, 0x29, 0x91, 0x00, 0xd3, 0x5b, 0x01, 0x92, + 0x49, 0x16, 0xc1, 0x92, 0x57, 0x83, 0x00, 0xd3, 0x0b, 0x01, 0x92, 0x63, + 0x87, 0x00, 0xd3, 0x71, 0x97, 0x00, 0xd3, 0x4b, 0x01, 0x92, 0x6f, 0x8b, + 0x00, 0xd3, 0x2b, 0x01, 0x92, 0x7a, 0xc7, 0xc2, 0xce, 0x00, 0xd3, 0x10, + 0xc8, 0xbd, 0xea, 0x00, 0xd2, 0xa1, 0x0e, 0xc1, 0x92, 0x7e, 0xc2, 0x01, + 0x24, 0x00, 0xd2, 0x91, 0xc2, 0x02, 0xe0, 0x00, 0xd2, 0x89, 0x97, 0x00, + 0xd2, 0x7b, 0x01, 0x92, 0x97, 0x8b, 0x00, 0xd2, 0x6b, 0x01, 0x92, 0x9b, + 0x83, 0x00, 0xd2, 0x59, 0x45, 0x08, 0xcb, 0xc1, 0x92, 0x9f, 0xc2, 0x01, + 0x4a, 0x00, 0xd2, 0x29, 0x14, 0xc1, 0x92, 0xcb, 0xc2, 0x01, 0xc3, 0x00, + 0xd1, 0xf1, 0xc2, 0x01, 0x5d, 0x00, 0xd1, 0xb9, 0x10, 0xc1, 0x92, 0xd8, + 0xc2, 0x0e, 0x9a, 0x00, 0xd1, 0x78, 0x44, 0x1a, 0xce, 0xc1, 0x92, 0xe8, + 0x15, 0xc1, 0x92, 0xfc, 0xc2, 0x00, 0xd0, 0x00, 0xca, 0xb9, 0x83, 0x00, + 0xca, 0xb0, 
0x8b, 0x00, 0xcb, 0x69, 0xc2, 0x0f, 0xe1, 0x00, 0xcb, 0x60, + 0x8a, 0x00, 0xcb, 0x31, 0x87, 0x00, 0xcb, 0x28, 0x87, 0x00, 0xcb, 0x50, + 0x91, 0x00, 0xcb, 0x40, 0x83, 0x00, 0xcb, 0x11, 0xc2, 0x01, 0x30, 0x00, + 0xca, 0x90, 0xc2, 0x00, 0xd0, 0x00, 0xcb, 0x01, 0x83, 0x00, 0xca, 0x80, + 0xc2, 0x00, 0xd0, 0x00, 0xca, 0xd1, 0x83, 0x00, 0xca, 0xc8, 0x42, 0x00, + 0xe8, 0xc1, 0x93, 0x06, 0xc6, 0xd3, 0x49, 0x05, 0x56, 0xf1, 0xc3, 0x71, + 0xe5, 0x05, 0x56, 0xe9, 0xc5, 0xda, 0x2e, 0x05, 0x56, 0xe0, 0xc4, 0x7b, + 0x07, 0x05, 0x56, 0x11, 0xc3, 0x1c, 0xd6, 0x05, 0x56, 0x09, 0xc5, 0xda, + 0x2e, 0x05, 0x56, 0x01, 0xc2, 0x13, 0x4c, 0x05, 0x55, 0xf8, 0x03, 0xc1, + 0x93, 0x10, 0x97, 0x05, 0x55, 0xa3, 0x01, 0x93, 0x26, 0x8b, 0x05, 0x55, + 0x93, 0x01, 0x93, 0x31, 0x87, 0x05, 0x55, 0xa9, 0x91, 0x05, 0x55, 0xb0, + 0xc3, 0x01, 0x95, 0x05, 0x55, 0x81, 0xc3, 0x01, 0xfd, 0x05, 0x55, 0xb8, + 0x45, 0x08, 0xcb, 0xc1, 0x93, 0x35, 0x44, 0x05, 0x36, 0x41, 0x93, 0x8f, + 0xcb, 0x50, 0x7f, 0x01, 0x36, 0x51, 0xc8, 0xbd, 0x02, 0x01, 0x5e, 0x10, + 0xc6, 0x30, 0x98, 0x01, 0x18, 0xc9, 0x44, 0x06, 0x1f, 0x41, 0x93, 0xe9, + 0x46, 0x10, 0x29, 0xc1, 0x93, 0xf5, 0xc5, 0xce, 0x22, 0x01, 0x71, 0xc0, + 0xc6, 0xd2, 0x71, 0x01, 0x0a, 0x71, 0x52, 0x46, 0xb6, 0xc1, 0x94, 0x01, + 0x45, 0x1a, 0x38, 0xc1, 0x94, 0x0d, 0xc8, 0x52, 0x00, 0x01, 0x71, 0xa8, + 0xc8, 0x36, 0x21, 0x01, 0x0a, 0x59, 0xc4, 0x01, 0x96, 0x01, 0x4d, 0x10, + 0xc8, 0xbd, 0x0a, 0x01, 0x09, 0x91, 0xc4, 0x0a, 0x8b, 0x01, 0x71, 0x90, + 0xd0, 0x59, 0xe2, 0x01, 0x3e, 0x01, 0xce, 0x05, 0x19, 0x01, 0x02, 0xb0, + 0x50, 0x5a, 0x72, 0xc1, 0x94, 0x19, 0xcf, 0x65, 0x85, 0x01, 0x59, 0x88, + 0xd0, 0x27, 0x1f, 0x01, 0x0f, 0xb1, 0x44, 0x39, 0xfd, 0x41, 0x94, 0x25, + 0x4c, 0x89, 0x85, 0xc1, 0x94, 0x3d, 0x4b, 0x95, 0x35, 0xc1, 0x94, 0x49, + 0x43, 0x07, 0x6e, 0xc1, 0x94, 0x4f, 0x4c, 0x80, 0x91, 0x41, 0x94, 0x55, + 0x15, 0xc1, 0x94, 0x5b, 0xcb, 0x58, 0xc7, 0x0f, 0xbd, 0x08, 0xce, 0x73, + 0x7c, 0x01, 0x10, 0x21, 0xc6, 0xd3, 0xc7, 0x01, 0x10, 0x18, 0xc8, 0xb8, + 0xba, 0x00, 0x3d, 0x79, 0xc6, 0xcb, 0xc3, 0x00, 0x3d, 0x71, 0xc8, 0xbb, + 0x1a, 0x00, 0x3d, 0x58, 0xc8, 0xb8, 0x32, 0x00, 0x3d, 0x49, 0xc6, 0xcc, + 0x5f, 0x00, 0x3d, 0x61, 0xc8, 0xb6, 0x52, 0x00, 0x3d, 0x68, 0xc8, 0xb8, + 0xaa, 0x00, 0x3d, 0x39, 0xc6, 0xcd, 0x01, 0x00, 0x3d, 0x30, 0xc5, 0xda, + 0xec, 0x00, 0x3d, 0x29, 0xc5, 0xd8, 0x3a, 0x00, 0x3d, 0x21, 0x09, 0xc1, + 0x94, 0x67, 0x16, 0xc1, 0x94, 0x79, 0x06, 0xc1, 0x94, 0x92, 0x15, 0xc1, + 0x94, 0x9c, 0x0a, 0xc1, 0x94, 0xac, 0xc9, 0xb4, 0xd9, 0x00, 0x3c, 0xb9, + 0xc8, 0xb7, 0x22, 0x00, 0x3c, 0xb1, 0xc8, 0xbd, 0x92, 0x00, 0x3c, 0xa9, + 0xc3, 0xa9, 0x9c, 0x00, 0x3c, 0xa1, 0x1c, 0xc1, 0x94, 0xb8, 0x0e, 0xc1, + 0x94, 0xc0, 0xc5, 0xde, 0x7a, 0x00, 0x3c, 0x51, 0xc5, 0xdb, 0x00, 0x00, + 0x3c, 0x49, 0xc5, 0xd8, 0xd0, 0x00, 0x3c, 0x41, 0x03, 0xc1, 0x94, 0xcc, + 0x0d, 0xc1, 0x94, 0xd8, 0xc3, 0x47, 0x81, 0x00, 0x3c, 0x21, 0xc3, 0x47, + 0xd9, 0x00, 0x3c, 0x19, 0x10, 0x41, 0x94, 0xe4, 0x49, 0x3b, 0x93, 0xc1, + 0x94, 0xf0, 0xd3, 0x44, 0x0a, 0x00, 0x71, 0xf8, 0xc4, 0x15, 0xe7, 0x00, + 0x72, 0x81, 0xc3, 0x05, 0x14, 0x00, 0x72, 0x89, 0x16, 0xc1, 0x95, 0x44, + 0x08, 0xc1, 0x95, 0x50, 0x15, 0xc1, 0x95, 0x5c, 0xc5, 0x06, 0xdb, 0x00, + 0x72, 0xc1, 0xc4, 0x26, 0x78, 0x00, 0x72, 0xc8, 0xc8, 0x1e, 0x3f, 0x01, + 0x19, 0x01, 0xcc, 0x85, 0x71, 0x01, 0x5e, 0x51, 0xcc, 0x83, 0x19, 0x01, + 0x71, 0xc9, 0xd0, 0x1d, 0xec, 0x01, 0x72, 0xc9, 0xd1, 0x1a, 0x4a, 0x01, + 0x72, 0xd0, 0xc5, 0x13, 0x67, 0x01, 0x18, 0xe9, 0xc3, 0x0a, 0xea, 0x01, + 0x18, 0x70, 0xc5, 0x13, 0x67, 0x01, 0x18, 0xe1, 0xc3, 0x0a, 0xea, 0x01, + 0x18, 0x78, 
0xca, 0xa1, 0xb6, 0x01, 0x49, 0xe8, 0x83, 0x0f, 0x15, 0x6b, + 0x01, 0x95, 0x68, 0x04, 0xc1, 0x95, 0x6c, 0x91, 0x0f, 0x15, 0x51, 0x87, + 0x0f, 0x15, 0x33, 0x01, 0x95, 0x76, 0x97, 0x0f, 0x15, 0x29, 0x8b, 0x0f, + 0x15, 0x0b, 0x01, 0x95, 0x7a, 0xc2, 0x00, 0xdb, 0x0f, 0x15, 0x01, 0xc2, + 0x00, 0x39, 0x0f, 0x14, 0xf9, 0xc2, 0x00, 0xd0, 0x0f, 0x14, 0xf1, 0xc2, + 0x25, 0x3b, 0x0f, 0x14, 0xe9, 0xc2, 0x01, 0x4a, 0x0f, 0x14, 0xe1, 0xc2, + 0x19, 0x2c, 0x0f, 0x14, 0xd9, 0xc3, 0x1c, 0x63, 0x0f, 0x14, 0xd1, 0xc2, + 0x0d, 0xf6, 0x0f, 0x14, 0xc9, 0x10, 0xc1, 0x95, 0x7e, 0xc2, 0x01, 0xc3, + 0x0f, 0x14, 0xb1, 0xc2, 0x01, 0x30, 0x0f, 0x14, 0xa9, 0xc2, 0x02, 0x2b, + 0x0f, 0x14, 0xa1, 0xc2, 0x0e, 0x9a, 0x0f, 0x14, 0x99, 0xc2, 0x01, 0x6f, + 0x0f, 0x14, 0x91, 0xc2, 0x00, 0xb0, 0x0f, 0x14, 0x80, 0xc2, 0xe6, 0x7d, + 0x0f, 0x92, 0x09, 0xc2, 0x8c, 0x54, 0x0f, 0x92, 0x10, 0xc3, 0xe5, 0x81, + 0x0f, 0x92, 0x41, 0xc3, 0xe6, 0x59, 0x0f, 0x92, 0x29, 0xc3, 0xe5, 0xa5, + 0x0f, 0x92, 0x00, 0xc3, 0xe6, 0x6b, 0x0f, 0x92, 0x39, 0xc3, 0xe5, 0x3f, + 0x0f, 0x92, 0x18, 0xc3, 0xe5, 0x54, 0x0f, 0x92, 0x31, 0xc3, 0xe5, 0xe4, + 0x0f, 0x92, 0x20, 0xd8, 0x03, 0xaf, 0x01, 0x3c, 0xe9, 0x46, 0x00, 0x8b, + 0x41, 0x95, 0x88, 0xc6, 0x1c, 0xb4, 0x01, 0x01, 0x19, 0xc5, 0xcd, 0xce, + 0x0f, 0xa6, 0x81, 0xcc, 0x87, 0x69, 0x0f, 0xb5, 0x48, 0xc4, 0x03, 0xd7, + 0x01, 0x31, 0xa9, 0xc3, 0x02, 0x34, 0x01, 0x31, 0xa0, 0xcf, 0x05, 0x98, + 0x01, 0x15, 0x51, 0xc9, 0x32, 0x24, 0x01, 0x4c, 0x01, 0xcf, 0x27, 0x65, + 0x01, 0x57, 0xa1, 0xd6, 0x30, 0x7a, 0x01, 0x57, 0xa8, 0xc4, 0x18, 0x26, + 0x01, 0x01, 0xa1, 0xc3, 0x25, 0xd6, 0x01, 0x4f, 0xd8, 0xd6, 0x2d, 0x62, + 0x01, 0x53, 0x41, 0xd6, 0x2c, 0x2e, 0x01, 0x53, 0x48, 0xc9, 0x00, 0xca, + 0x01, 0x57, 0xb9, 0xcc, 0x07, 0xc7, 0x01, 0x57, 0xc0, 0xc5, 0xc3, 0x08, + 0x0f, 0x9b, 0xc9, 0xc4, 0x55, 0x81, 0x0f, 0xa1, 0x00, 0xc7, 0xc8, 0x70, + 0x0e, 0x9a, 0xb1, 0xc7, 0xb6, 0x0b, 0x0e, 0x98, 0xc0, 0xc4, 0x1d, 0xa8, + 0x0e, 0x99, 0x59, 0xc7, 0x05, 0x79, 0x0e, 0x98, 0x38, 0xc7, 0xca, 0x37, + 0x0e, 0x9a, 0xa9, 0xca, 0xa3, 0x32, 0x0e, 0x99, 0x68, 0xca, 0x9b, 0xe4, + 0x0e, 0x9a, 0xa1, 0x0f, 0xc1, 0x95, 0xa0, 0xc8, 0xbc, 0xd2, 0x0e, 0x98, + 0x80, 0xc7, 0xb1, 0x21, 0x0e, 0x9a, 0x39, 0xca, 0xa6, 0x20, 0x0e, 0x99, + 0x11, 0xd9, 0x1d, 0xa1, 0x0e, 0x98, 0x78, 0x43, 0x5e, 0x7a, 0xc1, 0x95, + 0xac, 0x10, 0x41, 0x95, 0xb8, 0xc3, 0x14, 0xc8, 0x0e, 0x9a, 0x79, 0x07, + 0x41, 0x95, 0xc2, 0x11, 0xc1, 0x95, 0xce, 0xc6, 0xca, 0xd3, 0x0e, 0x99, + 0x48, 0xc9, 0xab, 0x5b, 0x0e, 0x99, 0x99, 0xc8, 0xba, 0xba, 0x0e, 0x99, + 0x81, 0xc7, 0xc4, 0xc6, 0x0e, 0x98, 0xf8, 0xc3, 0x01, 0xd2, 0x0e, 0x99, + 0xf8, 0x15, 0xc1, 0x95, 0xda, 0xc5, 0xd9, 0x93, 0x0e, 0x98, 0xd1, 0xc3, + 0x29, 0x43, 0x0e, 0x98, 0xa0, 0xc5, 0x83, 0x4f, 0x0e, 0x99, 0xa1, 0xc5, + 0x5b, 0x25, 0x0e, 0x99, 0x20, 0xd7, 0x28, 0xfb, 0x01, 0x3d, 0xd1, 0xcf, + 0x15, 0x36, 0x01, 0x39, 0xd8, 0xcd, 0x7f, 0x59, 0x01, 0x38, 0x31, 0x43, + 0x05, 0xbb, 0xc1, 0x95, 0xe4, 0xc4, 0x00, 0xba, 0x01, 0x09, 0x09, 0xcf, + 0x62, 0x01, 0x0f, 0xac, 0x00, 0x05, 0xc1, 0x95, 0xf3, 0x03, 0xc1, 0x95, + 0xff, 0x42, 0x07, 0xb2, 0xc1, 0x96, 0x0b, 0xc5, 0x33, 0x5d, 0x00, 0x61, + 0xe1, 0xc7, 0xc3, 0x61, 0x00, 0x63, 0xb9, 0xc5, 0xdc, 0x40, 0x00, 0x63, + 0xf8, 0x45, 0x02, 0x10, 0xc1, 0x96, 0x17, 0xc9, 0x36, 0x53, 0x00, 0x62, + 0xa8, 0x03, 0xc1, 0x96, 0x80, 0x8b, 0x00, 0x61, 0xfb, 0x01, 0x96, 0x8c, + 0x97, 0x00, 0x62, 0x0b, 0x01, 0x96, 0x90, 0x48, 0xb2, 0x2d, 0xc1, 0x96, + 0x94, 0x87, 0x00, 0x62, 0x33, 0x01, 0x96, 0xa2, 0x91, 0x00, 0x62, 0x52, + 0x01, 0x96, 0xa6, 0xc4, 0x15, 0xe7, 0x00, 0x63, 0x31, 0xc3, 0x05, 0x14, + 0x00, 0x63, 
0x39, 0x16, 0xc1, 0x96, 0xaa, 0x08, 0xc1, 0x96, 0xb6, 0x15, + 0xc1, 0x96, 0xc2, 0xc5, 0x06, 0xdb, 0x00, 0x63, 0x71, 0xc4, 0x26, 0x78, + 0x00, 0x63, 0x78, 0xdb, 0x15, 0xe7, 0x00, 0x63, 0xc1, 0x48, 0xb5, 0xca, + 0xc1, 0x96, 0xce, 0x16, 0x41, 0x96, 0xda, 0x00, 0x41, 0x96, 0xe6, 0xca, + 0x9e, 0xe6, 0x01, 0x70, 0xd9, 0x44, 0x05, 0x18, 0x41, 0x96, 0xf2, 0xc4, + 0x26, 0x78, 0x08, 0xa6, 0xc9, 0xc5, 0x06, 0xdb, 0x08, 0xa6, 0xc1, 0x15, + 0xc1, 0x96, 0xfe, 0x08, 0xc1, 0x97, 0x0a, 0x16, 0xc1, 0x97, 0x16, 0xc3, + 0x05, 0x14, 0x08, 0xa6, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0xa6, 0x80, 0xd0, + 0x50, 0xcf, 0x08, 0xa6, 0x31, 0xc3, 0x7c, 0x50, 0x08, 0xa4, 0x00, 0x03, + 0xc1, 0x97, 0x22, 0xc5, 0x33, 0x5d, 0x08, 0xa6, 0x19, 0xcb, 0x1e, 0x89, + 0x08, 0xa5, 0xf9, 0x42, 0x07, 0xb2, 0x41, 0x97, 0x2e, 0x03, 0xc1, 0x97, + 0x3a, 0x46, 0x2e, 0xee, 0xc1, 0x97, 0x46, 0x91, 0x08, 0xa5, 0xe1, 0x87, + 0x08, 0xa5, 0xc9, 0x48, 0xb2, 0x2d, 0xc1, 0x97, 0x4e, 0x97, 0x08, 0xa5, + 0x9b, 0x01, 0x97, 0x5c, 0x8b, 0x08, 0xa5, 0x8a, 0x01, 0x97, 0x60, 0xc2, + 0x00, 0xd0, 0x08, 0xa5, 0x79, 0x15, 0xc1, 0x97, 0x64, 0x18, 0xc1, 0x97, + 0x74, 0xc2, 0x00, 0xdb, 0x08, 0xa5, 0x51, 0xc2, 0x00, 0x39, 0x08, 0xa5, + 0x49, 0xc2, 0x19, 0x2c, 0x08, 0xa5, 0x41, 0xc2, 0x01, 0xc3, 0x08, 0xa5, + 0x39, 0x04, 0xc1, 0x97, 0x7e, 0x12, 0xc1, 0x97, 0x88, 0x10, 0xc1, 0x97, + 0x92, 0x06, 0xc1, 0x97, 0xa8, 0x16, 0xc1, 0x97, 0xb6, 0x0c, 0xc1, 0x97, + 0xc4, 0x05, 0xc1, 0x97, 0xce, 0x09, 0xc1, 0x97, 0xd8, 0x0d, 0xc1, 0x97, + 0xe2, 0x83, 0x08, 0xa4, 0x0b, 0x01, 0x97, 0xec, 0x91, 0x08, 0xa4, 0x69, + 0x87, 0x08, 0xa4, 0x59, 0x97, 0x08, 0xa4, 0x2b, 0x01, 0x97, 0xf8, 0x8b, + 0x08, 0xa4, 0x1a, 0x01, 0x97, 0xfc, 0xc9, 0xae, 0x7c, 0x00, 0x78, 0x01, + 0x45, 0x10, 0x7a, 0x41, 0x98, 0x00, 0x14, 0xc1, 0x98, 0x1c, 0x42, 0x19, + 0x2c, 0xc1, 0x98, 0x2e, 0x0f, 0xc1, 0x98, 0x3a, 0xce, 0x70, 0x50, 0x00, + 0x7c, 0x11, 0xc8, 0xbb, 0x42, 0x00, 0x7c, 0x19, 0x42, 0x58, 0x61, 0xc1, + 0x98, 0x46, 0x44, 0xe0, 0x6f, 0xc1, 0x98, 0x52, 0xd1, 0x4f, 0x9c, 0x00, + 0x7c, 0x60, 0x45, 0x00, 0xba, 0xc1, 0x98, 0x5e, 0x47, 0x02, 0x0e, 0x41, + 0x98, 0x70, 0x44, 0x02, 0x11, 0xc1, 0x98, 0xd2, 0x4b, 0x8f, 0xec, 0x41, + 0x98, 0xde, 0x46, 0x10, 0xb6, 0xc1, 0x98, 0xea, 0xd1, 0x56, 0xfb, 0x00, + 0x78, 0x58, 0x47, 0x90, 0xa7, 0xc1, 0x98, 0xf6, 0x45, 0x95, 0xf1, 0xc1, + 0x99, 0x02, 0xc6, 0xd3, 0x19, 0x00, 0x79, 0xc0, 0xc9, 0xb4, 0x37, 0x00, + 0x78, 0x41, 0xc3, 0x01, 0xe3, 0x00, 0x78, 0x68, 0x15, 0xc1, 0x99, 0x0e, + 0x49, 0xad, 0x6e, 0x41, 0x99, 0x18, 0x44, 0x97, 0x1a, 0xc1, 0x99, 0x24, + 0x4a, 0x9f, 0xd6, 0x41, 0x99, 0x33, 0x15, 0xc1, 0x99, 0x3f, 0xd3, 0x47, + 0x02, 0x00, 0x7e, 0xd0, 0xd3, 0x45, 0x73, 0x00, 0x78, 0x89, 0xcd, 0x76, + 0x01, 0x00, 0x78, 0x90, 0xc2, 0x00, 0x45, 0x00, 0x79, 0xe1, 0xc2, 0x02, + 0x2c, 0x00, 0x79, 0xe8, 0xca, 0x9c, 0xfc, 0x00, 0x78, 0xa9, 0xca, 0xa4, + 0xfe, 0x00, 0x78, 0xb0, 0x0d, 0xc1, 0x99, 0x4b, 0x09, 0xc1, 0x99, 0x61, + 0x10, 0xc1, 0x99, 0x6b, 0x05, 0xc1, 0x99, 0x81, 0xc2, 0x25, 0x3b, 0x00, + 0x7a, 0x39, 0x16, 0xc1, 0x99, 0x8b, 0x06, 0xc1, 0x99, 0x9d, 0x12, 0xc1, + 0x99, 0xaf, 0x04, 0xc1, 0x99, 0xb9, 0xc2, 0x01, 0xc3, 0x00, 0x7a, 0xc1, + 0xc2, 0x01, 0x4a, 0x00, 0x7a, 0xe9, 0x1c, 0xc1, 0x99, 0xc3, 0xc2, 0x00, + 0x02, 0x00, 0x7b, 0x01, 0xc2, 0x19, 0x2c, 0x00, 0x7b, 0x09, 0x14, 0xc1, + 0x99, 0xcd, 0xc2, 0x00, 0xdb, 0x00, 0x7b, 0x19, 0x15, 0xc1, 0x99, 0xd7, + 0xc2, 0x00, 0xd0, 0x00, 0x7b, 0x39, 0x83, 0x00, 0x7b, 0x41, 0xcd, 0x7f, + 0xe8, 0x00, 0x7b, 0x50, 0xd4, 0x39, 0x1c, 0x00, 0x78, 0xb9, 0xcb, 0x98, + 0x63, 0x00, 0x78, 0xc8, 0xc2, 0x02, 0xa0, 0x00, 0x79, 0x11, 0xc4, 0x02, + 0xde, 0x00, 
0x79, 0x18, 0xc3, 0x09, 0x9e, 0x00, 0x79, 0x21, 0xc3, 0x0d, + 0x14, 0x00, 0x79, 0x28, 0xc2, 0x22, 0xcc, 0x00, 0x79, 0x31, 0xc4, 0x18, + 0x10, 0x00, 0x79, 0x38, 0xc3, 0x05, 0x14, 0x00, 0x79, 0x51, 0x16, 0xc1, + 0x99, 0xe7, 0x08, 0xc1, 0x99, 0xf3, 0x15, 0xc1, 0x99, 0xff, 0xc5, 0x06, + 0xdb, 0x00, 0x79, 0x89, 0xc4, 0x26, 0x78, 0x00, 0x79, 0x91, 0xc4, 0x15, + 0xe7, 0x00, 0x79, 0x98, 0x8b, 0x00, 0x7b, 0x98, 0x97, 0x00, 0x7b, 0xa8, + 0x94, 0x00, 0x7b, 0xb3, 0x01, 0x9a, 0x0b, 0x8e, 0x00, 0x7b, 0xc2, 0x01, + 0x9a, 0x0f, 0x87, 0x00, 0x7b, 0xd8, 0x91, 0x00, 0x7b, 0xe8, 0x8b, 0x00, + 0x7c, 0x08, 0x83, 0x01, 0x69, 0x83, 0x01, 0x9a, 0x13, 0x87, 0x01, 0x6b, + 0x33, 0x01, 0x9a, 0x84, 0x8b, 0x01, 0x6a, 0x49, 0x97, 0x01, 0x6a, 0x99, + 0x91, 0x01, 0x6b, 0x38, 0x8c, 0x01, 0x69, 0xa9, 0x8a, 0x01, 0x6a, 0x08, + 0x48, 0xba, 0x82, 0xc1, 0x9a, 0x88, 0xcd, 0x7f, 0x0b, 0x01, 0x6b, 0x20, + 0xcb, 0x8d, 0xfd, 0x01, 0x6a, 0x59, 0xc8, 0xb6, 0x7a, 0x01, 0x6a, 0xc0, + 0x00, 0xc1, 0x9a, 0xa7, 0xda, 0x05, 0x0d, 0x01, 0x71, 0x50, 0xc2, 0x00, + 0xbf, 0x01, 0x52, 0xb1, 0xc3, 0x02, 0x9b, 0x01, 0x52, 0xa8, 0xcb, 0x97, + 0x03, 0x01, 0x50, 0x41, 0xcc, 0x86, 0x6d, 0x01, 0x50, 0x38, 0xc7, 0x09, + 0x0d, 0x01, 0x49, 0xa1, 0xc9, 0x03, 0xc8, 0x01, 0x49, 0xa9, 0xca, 0x3c, + 0xa4, 0x0f, 0xc5, 0x88, 0xc9, 0x01, 0x88, 0x01, 0x49, 0xb1, 0xca, 0x03, + 0x87, 0x01, 0x49, 0xb8, 0x48, 0x19, 0x9b, 0xc1, 0x9a, 0xb3, 0x07, 0xc1, + 0x9b, 0x11, 0x45, 0x17, 0x15, 0x41, 0x9b, 0x1d, 0x43, 0x01, 0xc5, 0xc1, + 0x9b, 0x29, 0x43, 0x2d, 0x2f, 0xc1, 0x9b, 0x35, 0x4b, 0x4c, 0x93, 0x41, + 0x9b, 0x41, 0x03, 0xc1, 0x9b, 0xad, 0x45, 0x00, 0x59, 0xc1, 0x9b, 0xbc, + 0xd3, 0x44, 0x69, 0x00, 0x47, 0x11, 0xd0, 0x5e, 0x52, 0x00, 0x33, 0x58, + 0x4f, 0x2f, 0xa0, 0xc1, 0x9b, 0xcb, 0x03, 0xc1, 0x9b, 0xda, 0x43, 0x0d, + 0xed, 0xc1, 0x9b, 0xe4, 0xcd, 0x75, 0xb3, 0x00, 0x32, 0xe8, 0x00, 0xc1, + 0x9b, 0xea, 0xc3, 0x13, 0x00, 0x00, 0x32, 0x6a, 0x01, 0x9b, 0xfc, 0xc4, + 0x04, 0xa7, 0x00, 0x32, 0x73, 0x01, 0x9c, 0x02, 0xc8, 0x11, 0xf7, 0x00, + 0x36, 0xa1, 0xd0, 0x5c, 0x72, 0x00, 0x33, 0x69, 0xce, 0x6f, 0x7e, 0x00, + 0x30, 0x10, 0x45, 0x03, 0x14, 0xc1, 0x9c, 0x0f, 0x17, 0xc1, 0x9c, 0x39, + 0x46, 0x10, 0x79, 0xc1, 0x9c, 0x4e, 0x44, 0x00, 0xbb, 0xc1, 0x9c, 0x70, + 0xd3, 0x46, 0xa3, 0x00, 0x36, 0xf1, 0xc5, 0xd7, 0x18, 0x00, 0x32, 0x8b, + 0x01, 0x9c, 0x8c, 0xc8, 0x52, 0x00, 0x00, 0x30, 0xd8, 0xc8, 0xb5, 0x52, + 0x00, 0x47, 0x91, 0xc8, 0xb8, 0xc2, 0x00, 0x47, 0x89, 0xc8, 0x6e, 0xbf, + 0x00, 0x47, 0x80, 0x44, 0x05, 0x14, 0xc1, 0x9c, 0x90, 0xd1, 0x52, 0x44, + 0x00, 0x47, 0x19, 0x03, 0xc1, 0x9c, 0xa2, 0xd2, 0x4b, 0x95, 0x00, 0x33, + 0x61, 0xda, 0x1b, 0x1a, 0x00, 0x30, 0xf0, 0x45, 0x00, 0x33, 0xc1, 0x9c, + 0xb1, 0xc4, 0x0a, 0x8b, 0x00, 0x30, 0x60, 0xd3, 0x41, 0xbd, 0x00, 0x44, + 0xf9, 0x44, 0x08, 0x0b, 0x41, 0x9c, 0xcc, 0xd1, 0x53, 0xcb, 0x00, 0x44, + 0x89, 0x11, 0xc1, 0x9c, 0xd8, 0xce, 0x70, 0xa4, 0x00, 0x37, 0x49, 0xcb, + 0x8e, 0x13, 0x00, 0x33, 0x50, 0xcc, 0x41, 0x19, 0x00, 0x44, 0x71, 0x4a, + 0x6f, 0xc8, 0x41, 0x9c, 0xe4, 0x4c, 0x81, 0x09, 0xc1, 0x9c, 0xf6, 0x46, + 0x0a, 0x10, 0x41, 0x9d, 0x02, 0xca, 0x43, 0x42, 0x00, 0x30, 0x29, 0xc4, + 0x00, 0xba, 0x00, 0x30, 0x00, 0xc4, 0x26, 0x78, 0x00, 0x33, 0x49, 0xc5, + 0x06, 0xdb, 0x00, 0x33, 0x41, 0x15, 0xc1, 0x9d, 0x0e, 0x08, 0xc1, 0x9d, + 0x1a, 0x16, 0xc1, 0x9d, 0x26, 0xc3, 0x05, 0x14, 0x00, 0x33, 0x09, 0xc4, + 0x15, 0xe7, 0x00, 0x33, 0x00, 0xd1, 0x57, 0x1d, 0x00, 0x30, 0x51, 0xca, + 0xa8, 0x00, 0x00, 0x30, 0x48, 0x44, 0x40, 0xee, 0xc1, 0x9d, 0x32, 0xc7, + 0xc2, 0xdc, 0x07, 0xd8, 0xb1, 0xc8, 0xb8, 0x22, 0x00, 0x2c, 0x38, 0xc2, + 0x16, 0x5a, 
0x00, 0x2b, 0xab, 0x01, 0x9d, 0x4a, 0xc3, 0xb1, 0x0d, 0x00, + 0x2c, 0x31, 0xc2, 0x38, 0x2a, 0x00, 0x2c, 0x29, 0x42, 0x00, 0x3c, 0xc1, + 0x9d, 0x56, 0x12, 0xc1, 0x9d, 0x5e, 0x05, 0xc1, 0x9d, 0x6a, 0x14, 0xc1, + 0x9d, 0x76, 0x16, 0xc1, 0x9d, 0x80, 0x18, 0xc1, 0x9d, 0x90, 0x15, 0xc1, + 0x9d, 0x9a, 0x0c, 0xc1, 0x9d, 0xa6, 0xc3, 0x2a, 0x91, 0x00, 0x2b, 0xb1, + 0xc3, 0x00, 0xc3, 0x00, 0x2b, 0xa1, 0x09, 0xc1, 0x9d, 0xb0, 0xc2, 0x01, + 0x23, 0x00, 0x2b, 0x81, 0xc3, 0xe6, 0x1a, 0x00, 0x2b, 0x69, 0xc4, 0xe1, + 0x0b, 0x00, 0x2b, 0x61, 0xc3, 0x03, 0x0d, 0x00, 0x2b, 0x59, 0x1c, 0xc1, + 0x9d, 0xbc, 0x07, 0xc1, 0x9d, 0xc6, 0xc2, 0x0e, 0x9a, 0x00, 0x2b, 0x21, + 0xc3, 0x18, 0xf2, 0x00, 0x2b, 0x11, 0xc3, 0x36, 0x99, 0x00, 0x2b, 0x08, + 0xc3, 0xb1, 0x0d, 0x00, 0x2a, 0xb1, 0xc2, 0x38, 0x2a, 0x00, 0x2a, 0xa9, + 0x42, 0x00, 0x3c, 0xc1, 0x9d, 0xd4, 0x12, 0xc1, 0x9d, 0xdc, 0xc2, 0x16, + 0x5a, 0x00, 0x2a, 0x2b, 0x01, 0x9d, 0xe8, 0x05, 0xc1, 0x9d, 0xee, 0x14, + 0xc1, 0x9d, 0xfa, 0x16, 0xc1, 0x9e, 0x04, 0x18, 0xc1, 0x9e, 0x0e, 0x15, + 0xc1, 0x9e, 0x18, 0x0c, 0xc1, 0x9e, 0x24, 0xc3, 0x2a, 0x91, 0x00, 0x2a, + 0x31, 0xc3, 0x00, 0xc3, 0x00, 0x2a, 0x21, 0x09, 0xc1, 0x9e, 0x2e, 0xc2, + 0x01, 0x23, 0x00, 0x2a, 0x01, 0xc3, 0xe6, 0x1a, 0x00, 0x29, 0xe9, 0xc4, + 0xe1, 0x0b, 0x00, 0x29, 0xe1, 0xc3, 0x03, 0x0d, 0x00, 0x29, 0xd9, 0x1c, + 0xc1, 0x9e, 0x3a, 0x07, 0xc1, 0x9e, 0x44, 0xc2, 0x0e, 0x9a, 0x00, 0x29, + 0xa1, 0xc3, 0x36, 0x99, 0x00, 0x29, 0x89, 0xc3, 0x18, 0xf2, 0x00, 0x29, + 0x90, 0xc4, 0x6b, 0x52, 0x0f, 0x48, 0x01, 0x06, 0xc1, 0x9e, 0x52, 0xc4, + 0x76, 0x31, 0x0f, 0x48, 0x11, 0xc4, 0xe4, 0xb3, 0x0f, 0x48, 0x19, 0x04, + 0xc1, 0x9e, 0x5e, 0x15, 0xc1, 0x9e, 0x68, 0xc2, 0x00, 0x67, 0x0f, 0x48, + 0x31, 0xc2, 0x00, 0x39, 0x0f, 0x48, 0x41, 0x87, 0x0f, 0x48, 0x49, 0xc2, + 0x00, 0x87, 0x0f, 0x48, 0x51, 0x8b, 0x0f, 0x48, 0x59, 0x91, 0x0f, 0x48, + 0x61, 0x1b, 0xc1, 0x9e, 0x74, 0xc3, 0x7e, 0x89, 0x0f, 0x48, 0x79, 0x10, + 0xc1, 0x9e, 0x7e, 0x0d, 0xc1, 0x9e, 0x90, 0x97, 0x0f, 0x48, 0x99, 0xc4, + 0xe1, 0x4b, 0x0f, 0x48, 0xa1, 0xc3, 0x11, 0xee, 0x0f, 0x48, 0xa9, 0xc2, + 0x00, 0xd0, 0x0f, 0x48, 0xb1, 0xc4, 0xd8, 0x3a, 0x0f, 0x48, 0xb9, 0x09, + 0xc1, 0x9e, 0xa2, 0xc2, 0x00, 0x16, 0x0f, 0x48, 0xd1, 0xc2, 0x02, 0x41, + 0x0f, 0x48, 0xe1, 0xc3, 0xa9, 0xfc, 0x0f, 0x48, 0xf8, 0xc4, 0x14, 0x74, + 0x0f, 0x49, 0x19, 0xc2, 0x00, 0xd0, 0x0f, 0x49, 0x78, 0x83, 0x0f, 0x49, + 0x31, 0xc2, 0x01, 0x7f, 0x0f, 0x49, 0x48, 0xc9, 0xaf, 0x27, 0x0f, 0x49, + 0x39, 0xc2, 0x00, 0xd0, 0x0f, 0x4a, 0x18, 0xc2, 0x01, 0x7f, 0x0f, 0x49, + 0x81, 0x83, 0x0f, 0x49, 0xa0, 0xc2, 0x05, 0x1d, 0x0f, 0x49, 0x91, 0xc2, + 0x19, 0x2c, 0x0f, 0x49, 0xd9, 0xc2, 0x00, 0xd0, 0x0f, 0x49, 0xe8, 0xc2, + 0x0f, 0x9b, 0x0f, 0x49, 0x99, 0xc2, 0x00, 0xd0, 0x0f, 0x49, 0xf9, 0xc2, + 0x01, 0x53, 0x0f, 0x4a, 0x10, 0x83, 0x0f, 0x49, 0xd1, 0xc2, 0x00, 0x51, + 0x0f, 0x4a, 0x00, 0xc2, 0x02, 0xa0, 0x0f, 0x4a, 0x91, 0xc4, 0x02, 0xde, + 0x0f, 0x4a, 0x98, 0xc3, 0x09, 0x9e, 0x0f, 0x4a, 0xa1, 0xc3, 0x0d, 0x14, + 0x0f, 0x4a, 0xa8, 0xc2, 0x22, 0xcc, 0x0f, 0x4a, 0xb1, 0xc4, 0x18, 0x10, + 0x0f, 0x4a, 0xb8, 0xc7, 0xc0, 0xeb, 0x0f, 0xbb, 0x61, 0xc4, 0xe4, 0xab, + 0x0f, 0xbb, 0x58, 0x02, 0x41, 0x9e, 0xac, 0xc6, 0xcf, 0x8f, 0x0f, 0xbb, + 0x2b, 0x01, 0x9e, 0xb4, 0x48, 0xba, 0xf2, 0x41, 0x9e, 0xb8, 0xc3, 0x04, + 0xa1, 0x0f, 0xb9, 0x01, 0xcb, 0x4c, 0x50, 0x0f, 0xb9, 0x28, 0xc2, 0x34, + 0x63, 0x0f, 0xba, 0x61, 0xcb, 0x95, 0xa3, 0x0f, 0xba, 0x71, 0xc6, 0xd1, + 0xed, 0x0f, 0xba, 0x80, 0xc5, 0xd9, 0x25, 0x0f, 0xbb, 0x0b, 0x01, 0x9e, + 0xc7, 0xc4, 0x2d, 0xad, 0x0f, 0xbb, 0x00, 0xc4, 0xdf, 0x63, 0x0f, 0xba, + 0x5b, 0x01, 
0x9e, 0xcd, 0xc7, 0xc7, 0x0b, 0x0f, 0xba, 0xc0, 0xc4, 0xde, + 0xcf, 0x0f, 0xbb, 0x19, 0xca, 0x9f, 0x68, 0x0f, 0xbb, 0x20, 0xc2, 0xe5, + 0xfd, 0x0f, 0xba, 0x00, 0xc4, 0x91, 0x3d, 0x0f, 0xb9, 0x49, 0xc5, 0x87, + 0xc4, 0x0f, 0xba, 0x40, 0xc5, 0xd5, 0xe7, 0x0f, 0xb9, 0x93, 0x01, 0x9e, + 0xd3, 0xc5, 0xd9, 0x8e, 0x0f, 0xb9, 0xdb, 0x01, 0x9e, 0xdd, 0xc4, 0x08, + 0x88, 0x0f, 0xbb, 0x68, 0xc2, 0xe5, 0xfd, 0x0f, 0xb8, 0xc8, 0xc5, 0xdb, + 0x7d, 0x0f, 0xb8, 0x53, 0x01, 0x9e, 0xe3, 0xc5, 0xd7, 0xb8, 0x0f, 0xb8, + 0xb2, 0x01, 0x9e, 0xed, 0x46, 0x5d, 0x2b, 0xc1, 0x9e, 0xf3, 0xc4, 0x4e, + 0x2b, 0x0f, 0xb8, 0x68, 0x96, 0x0f, 0xb8, 0xa3, 0x01, 0x9e, 0xff, 0xc9, + 0xad, 0xec, 0x0f, 0xb9, 0xc8, 0xcd, 0x7b, 0x49, 0x0f, 0xba, 0x91, 0xd3, + 0x40, 0xc6, 0x0f, 0xba, 0xe2, 0x01, 0x9f, 0x05, 0x00, 0xc1, 0x9f, 0x0b, + 0xc6, 0xd1, 0xab, 0x0f, 0xb8, 0x28, 0xc4, 0xe1, 0x17, 0x0f, 0xb9, 0xb3, + 0x01, 0x9f, 0x1d, 0xc2, 0x01, 0xdf, 0x0f, 0xba, 0x29, 0xc5, 0xd9, 0x16, + 0x0f, 0xbb, 0x50, 0x02, 0x41, 0x9f, 0x23, 0xc2, 0xe5, 0xfd, 0x0f, 0xb8, + 0xe8, 0xc8, 0xb7, 0x42, 0x0f, 0xba, 0xb1, 0xc2, 0x00, 0x33, 0x0f, 0xbb, + 0x70, 0xc4, 0xb4, 0xbe, 0x0f, 0xbb, 0x91, 0xc5, 0xd5, 0x60, 0x0f, 0xbb, + 0x98, 0x22, 0xc1, 0x9f, 0x2b, 0x21, 0xc1, 0x9f, 0x53, 0x20, 0xc1, 0x9f, + 0x84, 0x1f, 0xc1, 0x9f, 0xaf, 0x1e, 0xc1, 0x9f, 0xda, 0x1d, 0xc1, 0xa0, + 0x05, 0x23, 0xc1, 0xa0, 0x29, 0x24, 0xc1, 0xa0, 0x54, 0x25, 0xc1, 0xa0, + 0x7c, 0x26, 0x41, 0xa0, 0xa4, 0x1d, 0xc1, 0xa0, 0xd2, 0x1e, 0xc1, 0xa1, + 0x0c, 0x1f, 0xc1, 0xa1, 0x3a, 0x20, 0xc1, 0xa1, 0x65, 0x21, 0xc1, 0xa1, + 0x90, 0x22, 0xc1, 0xa1, 0xb8, 0x23, 0xc1, 0xa1, 0xe0, 0x24, 0xc1, 0xa2, + 0x08, 0x25, 0xc1, 0xa2, 0x30, 0x26, 0x41, 0xa2, 0x58, 0x1d, 0xc1, 0xa2, + 0x80, 0x1e, 0xc1, 0xa2, 0xb1, 0x1f, 0xc1, 0xa2, 0xdf, 0x20, 0xc1, 0xa3, + 0x0a, 0x21, 0xc1, 0xa3, 0x32, 0x22, 0xc1, 0xa3, 0x5a, 0x23, 0xc1, 0xa3, + 0x82, 0x24, 0xc1, 0xa3, 0xad, 0x25, 0xc1, 0xa3, 0xd5, 0x26, 0x41, 0xa4, + 0x00, 0x1d, 0xc1, 0xa4, 0x2e, 0x1e, 0xc1, 0xa4, 0x59, 0x1f, 0xc1, 0xa4, + 0x81, 0x20, 0xc1, 0xa4, 0xac, 0x21, 0xc1, 0xa4, 0xd7, 0x22, 0xc1, 0xa4, + 0xff, 0x23, 0xc1, 0xa5, 0x2a, 0x24, 0xc1, 0xa5, 0x58, 0x25, 0xc1, 0xa5, + 0x83, 0x26, 0x41, 0xa5, 0xb1, 0x1d, 0xc1, 0xa5, 0xdb, 0x1e, 0xc1, 0xa6, + 0x03, 0x1f, 0xc1, 0xa6, 0x2b, 0x20, 0xc1, 0xa6, 0x53, 0x21, 0xc1, 0xa6, + 0x7b, 0x22, 0xc1, 0xa6, 0xa3, 0x23, 0xc1, 0xa6, 0xd1, 0x24, 0xc1, 0xa6, + 0xf9, 0x25, 0xc1, 0xa7, 0x21, 0x26, 0x41, 0xa7, 0x49, 0x1d, 0xc1, 0xa7, + 0x69, 0x1e, 0xc1, 0xa7, 0x8d, 0x1f, 0xc1, 0xa7, 0xb5, 0xc2, 0xe6, 0x4a, + 0x0a, 0x32, 0x30, 0xcf, 0x62, 0xe2, 0x01, 0x11, 0x99, 0xd2, 0x4e, 0x77, + 0x01, 0x4a, 0x00, 0xd3, 0x44, 0x7c, 0x01, 0x0d, 0xb1, 0x4f, 0x01, 0x93, + 0x41, 0xa7, 0xdd, 0xe0, 0x09, 0x07, 0x0f, 0xa8, 0x20, 0xc8, 0x52, 0x09, + 0x01, 0x4d, 0x21, 0xc8, 0x4e, 0x9b, 0x01, 0x4c, 0xf0, 0xc9, 0x18, 0x66, + 0x01, 0x10, 0xb8, 0xc2, 0x00, 0xd0, 0x08, 0xba, 0x21, 0x83, 0x08, 0xba, + 0x18, 0xc2, 0x00, 0xd0, 0x08, 0xba, 0x11, 0x83, 0x08, 0xba, 0x08, 0xc2, + 0x01, 0x5d, 0x08, 0xb8, 0xd1, 0xc2, 0x01, 0x30, 0x08, 0xb8, 0xb1, 0xc2, + 0x01, 0x6f, 0x08, 0xb8, 0x28, 0xc6, 0x00, 0x41, 0x08, 0xb9, 0xe9, 0xcc, + 0x82, 0x65, 0x08, 0xb9, 0xe0, 0x00, 0x41, 0xa7, 0xfb, 0xc4, 0x02, 0xb9, + 0x01, 0x1a, 0xf1, 0xc8, 0x52, 0x09, 0x01, 0x1a, 0xc0, 0xc9, 0x52, 0x08, + 0x01, 0x1b, 0xc0, 0xcb, 0x95, 0xf0, 0x01, 0x1b, 0x91, 0x45, 0x9a, 0x3d, + 0xc1, 0xa8, 0x3f, 0xc8, 0xba, 0x22, 0x01, 0x1a, 0xe8, 0x00, 0xc1, 0xa8, + 0x51, 0xca, 0x6c, 0xe2, 0x01, 0x1a, 0xb0, 0x00, 0xc1, 0xa8, 0x63, 0x43, + 0x33, 0x60, 0x41, 0xa8, 0x75, 0xc9, 0xae, 0x22, 0x01, 0x1b, 0x69, 0xcc, + 0x88, 0x89, 
0x01, 0x1b, 0x18, 0xc9, 0x20, 0xa8, 0x01, 0x1b, 0x29, 0x42, + 0x00, 0x15, 0xc1, 0xa8, 0x81, 0xc8, 0x52, 0x09, 0x01, 0x1a, 0xe1, 0xc9, + 0x02, 0xfe, 0x01, 0x1a, 0x49, 0xc3, 0xba, 0x27, 0x01, 0x19, 0xf0, 0x46, + 0x00, 0xe2, 0xc1, 0xa8, 0x8d, 0xd9, 0x1f, 0xae, 0x01, 0x12, 0x30, 0x87, + 0x08, 0x59, 0xa9, 0xc2, 0x00, 0x4e, 0x08, 0x59, 0x48, 0xc3, 0x04, 0x65, + 0x08, 0x59, 0xa1, 0x0a, 0xc1, 0xa8, 0x9c, 0x87, 0x08, 0x59, 0x78, 0x87, + 0x08, 0x59, 0x59, 0xc2, 0x0c, 0x43, 0x08, 0x59, 0x50, 0xc2, 0x02, 0x6f, + 0x08, 0x59, 0x39, 0xc2, 0x0c, 0x43, 0x08, 0x59, 0x31, 0x87, 0x08, 0x59, + 0x29, 0x09, 0x41, 0xa8, 0xa6, 0xc2, 0x01, 0x7f, 0x08, 0x58, 0xe1, 0x87, + 0x08, 0x58, 0xd8, 0xc2, 0x01, 0x7f, 0x08, 0x58, 0xd1, 0x87, 0x08, 0x58, + 0xc9, 0xc2, 0x00, 0xac, 0x08, 0x58, 0xe8, 0xc2, 0x01, 0x7f, 0x08, 0x58, + 0xb1, 0xc2, 0x09, 0x3b, 0x08, 0x58, 0xa9, 0x87, 0x08, 0x58, 0xa0, 0xc2, + 0x00, 0x5f, 0x08, 0x58, 0x99, 0x87, 0x08, 0x58, 0x89, 0xc2, 0x0c, 0x43, + 0x08, 0x58, 0x90, 0x97, 0x08, 0x58, 0x78, 0x8b, 0x08, 0x58, 0x68, 0x91, + 0x08, 0x58, 0x58, 0x87, 0x08, 0x58, 0x48, 0x87, 0x08, 0x58, 0x33, 0x01, + 0xa8, 0xb6, 0x83, 0x08, 0x58, 0x0b, 0x01, 0xa8, 0xba, 0x90, 0x08, 0x58, + 0x21, 0x91, 0x08, 0x58, 0x10, 0x87, 0x08, 0x59, 0x01, 0xc2, 0x01, 0x7f, + 0x08, 0x59, 0x08, 0x87, 0x08, 0x59, 0x81, 0xc2, 0x01, 0x7f, 0x08, 0x59, + 0x90, 0x00, 0x41, 0xa8, 0xc2, 0x0a, 0xc1, 0xa8, 0xce, 0xc2, 0x00, 0xc4, + 0x08, 0x08, 0x83, 0x01, 0xa8, 0xe0, 0x19, 0x41, 0xa8, 0xe6, 0x0b, 0xc1, + 0xa8, 0xf6, 0x11, 0x41, 0xa9, 0x08, 0xc2, 0x22, 0xcc, 0x08, 0x08, 0x63, + 0x01, 0xa9, 0x1a, 0xc4, 0x18, 0x10, 0x08, 0x08, 0x6a, 0x01, 0xa9, 0x27, + 0x00, 0xc1, 0xa9, 0x34, 0x9b, 0x08, 0x08, 0xba, 0x01, 0xa9, 0x40, 0x00, + 0xc1, 0xa9, 0x46, 0xc2, 0x0d, 0x10, 0x08, 0x08, 0xc2, 0x01, 0xa9, 0x52, + 0xc9, 0xb3, 0x20, 0x08, 0x09, 0xb9, 0x08, 0xc1, 0xa9, 0x58, 0xce, 0x71, + 0x22, 0x08, 0x09, 0xc9, 0xcd, 0x7d, 0xb9, 0x08, 0x09, 0xd0, 0xc4, 0x02, + 0x6d, 0x08, 0x08, 0x01, 0xc3, 0x02, 0xa3, 0x08, 0x08, 0x08, 0x45, 0x00, + 0x2d, 0xc1, 0xa9, 0x64, 0x44, 0x00, 0x4a, 0x41, 0xa9, 0xa4, 0xc2, 0x02, + 0xae, 0x01, 0x2b, 0xcb, 0x01, 0xa9, 0xbc, 0xc4, 0x00, 0x49, 0x01, 0x2b, + 0xc3, 0x01, 0xa9, 0xc2, 0x42, 0x00, 0x58, 0xc1, 0xa9, 0xc8, 0xc5, 0x00, + 0x2c, 0x01, 0x2b, 0xd1, 0xc8, 0x00, 0x5f, 0x01, 0x28, 0x1b, 0x01, 0xa9, + 0xd7, 0x4f, 0x61, 0x5c, 0xc1, 0xa9, 0xdd, 0x4c, 0x52, 0xbb, 0xc1, 0xa9, + 0xe9, 0xca, 0x01, 0x68, 0x01, 0x28, 0x08, 0x45, 0x00, 0x5a, 0xc1, 0xa9, + 0xf5, 0x43, 0x11, 0x19, 0x41, 0xaa, 0x10, 0x4b, 0x99, 0xb8, 0xc1, 0xaa, + 0x28, 0x4b, 0x8e, 0x76, 0xc1, 0xaa, 0x3a, 0x4a, 0x11, 0x39, 0xc1, 0xaa, + 0x4c, 0x4a, 0x5c, 0x42, 0x41, 0xaa, 0x5e, 0x4b, 0x99, 0xb8, 0xc1, 0xaa, + 0x70, 0x4b, 0x8e, 0x76, 0xc1, 0xaa, 0x82, 0x4a, 0x5c, 0x42, 0xc1, 0xaa, + 0x94, 0x4a, 0x11, 0x39, 0x41, 0xaa, 0xac, 0x4f, 0x66, 0xc0, 0xc1, 0xaa, + 0xc4, 0xdc, 0x12, 0xc5, 0x01, 0x2a, 0x31, 0xdc, 0x13, 0xc1, 0x01, 0x2a, + 0x21, 0x4f, 0x12, 0xca, 0x41, 0xaa, 0xd6, 0xd8, 0x25, 0xa3, 0x01, 0x1d, + 0xb0, 0xc8, 0x1e, 0x3f, 0x01, 0x19, 0x09, 0xcc, 0x85, 0x71, 0x01, 0x5e, + 0x59, 0xd0, 0x1d, 0xec, 0x01, 0x72, 0xd9, 0xd1, 0x1a, 0x4a, 0x01, 0x72, + 0xe0, 0x05, 0xc1, 0xaa, 0xe8, 0xcc, 0x88, 0x65, 0x01, 0x71, 0x28, 0x05, + 0xc1, 0xaa, 0xf4, 0xcc, 0x88, 0x65, 0x01, 0x71, 0x20, 0xd0, 0x5d, 0x52, + 0x01, 0x4e, 0x91, 0xcf, 0x66, 0x66, 0x01, 0x4e, 0x88, 0xca, 0xa7, 0xec, + 0x0f, 0xaa, 0x79, 0xca, 0x9e, 0x78, 0x0f, 0xcb, 0x18, 0xc5, 0xdb, 0xd7, + 0x0f, 0xa6, 0x88, 0x97, 0x01, 0x8d, 0x00, 0x89, 0x01, 0x89, 0x5b, 0x01, + 0xab, 0x00, 0x90, 0x01, 0x89, 0x78, 0x8a, 0x01, 0x8d, 0xc8, 0x90, 0x01, + 0x89, 0x61, 
0x97, 0x01, 0x8d, 0x19, 0x8a, 0x01, 0x8d, 0xc1, 0x99, 0x01, + 0x8d, 0xe0, 0x99, 0x01, 0x8d, 0xe8, 0x8b, 0x01, 0x8d, 0x10, 0x8a, 0x01, + 0x88, 0x99, 0x8b, 0x01, 0x8d, 0x09, 0x9b, 0x01, 0x8d, 0xd0, 0x8a, 0x01, + 0x88, 0xa0, 0x8a, 0x01, 0x88, 0xa8, 0x8b, 0x01, 0x88, 0xf3, 0x01, 0xab, + 0x04, 0x97, 0x01, 0x89, 0x03, 0x01, 0xab, 0x0a, 0x90, 0x01, 0x89, 0x13, + 0x01, 0xab, 0x10, 0x8f, 0x01, 0x8d, 0x81, 0x8a, 0x01, 0x8d, 0xf8, 0x97, + 0x01, 0x89, 0x09, 0xcf, 0x33, 0xad, 0x01, 0x89, 0x71, 0x91, 0x01, 0x8d, + 0x31, 0x10, 0xc1, 0xab, 0x18, 0x8f, 0x01, 0x8d, 0x89, 0x87, 0x01, 0x8d, + 0xf0, 0x8a, 0x01, 0x88, 0xe9, 0x8b, 0x01, 0x88, 0xf9, 0x90, 0x01, 0x89, + 0x1b, 0x01, 0xab, 0x20, 0x94, 0x01, 0x89, 0x31, 0x87, 0x01, 0x8d, 0x20, + 0x97, 0x01, 0x89, 0x49, 0x8a, 0x01, 0x89, 0x69, 0x94, 0x01, 0x8d, 0x41, + 0xc2, 0x1b, 0x88, 0x01, 0x8d, 0x53, 0x01, 0xab, 0x28, 0x8f, 0x01, 0x8d, + 0x60, 0xc2, 0x1b, 0x88, 0x01, 0x8d, 0x58, 0xa1, 0x0f, 0xd8, 0x43, 0x01, + 0xab, 0x2c, 0x9f, 0x0f, 0xd8, 0x13, 0x01, 0xab, 0x37, 0xa2, 0x0f, 0xd8, + 0x83, 0x01, 0xab, 0x50, 0xa0, 0x0f, 0xd8, 0x23, 0x01, 0xab, 0x54, 0xa3, + 0x0f, 0xd8, 0xf8, 0xa2, 0x0f, 0xd8, 0x9b, 0x01, 0xab, 0x65, 0xa1, 0x0f, + 0xd8, 0x5b, 0x01, 0xab, 0x69, 0xa3, 0x0f, 0xd9, 0x10, 0xa2, 0x0f, 0xd8, + 0x8b, 0x01, 0xab, 0x74, 0xa0, 0x0f, 0xd8, 0x2b, 0x01, 0xab, 0x78, 0xa3, + 0x0f, 0xd9, 0x01, 0xa1, 0x0f, 0xd8, 0x4a, 0x01, 0xab, 0x8a, 0xa3, 0x0f, + 0xd9, 0x68, 0xa3, 0x0f, 0xd9, 0x31, 0xa2, 0x0f, 0xd8, 0xb2, 0x01, 0xab, + 0x91, 0x05, 0xc1, 0xab, 0x95, 0x15, 0xc1, 0xab, 0xbc, 0x16, 0xc1, 0xab, + 0xff, 0x06, 0xc1, 0xac, 0x1d, 0x14, 0xc1, 0xac, 0x30, 0x0e, 0xc1, 0xac, + 0x42, 0xd6, 0x2c, 0xb2, 0x01, 0x3a, 0x99, 0x08, 0xc1, 0xac, 0x52, 0xc3, + 0xe6, 0x74, 0x01, 0x38, 0x91, 0x0f, 0xc1, 0xac, 0x5a, 0x17, 0xc1, 0xac, + 0x66, 0x0a, 0xc1, 0xac, 0x70, 0x12, 0xc1, 0xac, 0x7e, 0x43, 0x00, 0x5f, + 0xc1, 0xac, 0x90, 0xc6, 0xca, 0x91, 0x01, 0x4e, 0x99, 0xc7, 0xc9, 0x3b, + 0x01, 0x5e, 0x20, 0x4a, 0x14, 0xda, 0xc1, 0xac, 0x9c, 0x4f, 0x66, 0x93, + 0x41, 0xac, 0xae, 0xca, 0x9f, 0xc2, 0x0f, 0xa5, 0xb9, 0xc9, 0xb3, 0x32, + 0x0f, 0xa5, 0xb1, 0xcb, 0x99, 0x60, 0x0f, 0xa5, 0xa9, 0xc8, 0x77, 0x99, + 0x0f, 0xa5, 0xa0, 0xc2, 0x00, 0x45, 0x0f, 0x9c, 0x43, 0x01, 0xac, 0xc2, + 0x42, 0x00, 0x30, 0x41, 0xac, 0xc8, 0x0f, 0xc1, 0xac, 0xd8, 0xc3, 0x01, + 0xad, 0x00, 0xda, 0xd2, 0x01, 0xac, 0xe7, 0x4a, 0xa2, 0x24, 0xc1, 0xac, + 0xed, 0x4b, 0x95, 0x40, 0xc1, 0xac, 0xf9, 0x4a, 0x51, 0x89, 0xc1, 0xad, + 0x05, 0x06, 0x41, 0xad, 0x29, 0x42, 0x00, 0xb0, 0xc1, 0xad, 0x43, 0xc4, + 0xde, 0xcb, 0x00, 0xda, 0xf0, 0xc4, 0x26, 0x78, 0x00, 0xda, 0xc9, 0xc5, + 0x06, 0xdb, 0x00, 0xda, 0xc1, 0x15, 0xc1, 0xad, 0x4f, 0x08, 0xc1, 0xad, + 0x5b, 0x16, 0xc1, 0xad, 0x67, 0xc3, 0x05, 0x14, 0x00, 0xda, 0x89, 0xc4, + 0x15, 0xe7, 0x00, 0xda, 0x80, 0x03, 0xc1, 0xad, 0x73, 0xc9, 0xa9, 0xfc, + 0x00, 0xda, 0x51, 0xc8, 0xbe, 0x12, 0x00, 0xda, 0x49, 0x07, 0xc1, 0xad, + 0x8e, 0x16, 0xc1, 0xad, 0x9a, 0x0d, 0xc1, 0xad, 0xa7, 0xc2, 0x00, 0xd0, + 0x00, 0xd9, 0x99, 0xc2, 0x0d, 0xf6, 0x00, 0xd9, 0x93, 0x01, 0xad, 0xb4, + 0xc2, 0x01, 0x4a, 0x00, 0xd9, 0x79, 0xc2, 0x00, 0xdb, 0x00, 0xd9, 0x73, + 0x01, 0xad, 0xba, 0xc2, 0x00, 0x39, 0x00, 0xd9, 0x6b, 0x01, 0xad, 0xc3, + 0xc2, 0x19, 0x2c, 0x00, 0xd9, 0x61, 0xc2, 0x01, 0xc3, 0x00, 0xd9, 0x59, + 0xc2, 0x01, 0x5d, 0x00, 0xd9, 0x4b, 0x01, 0xad, 0xcc, 0xc2, 0x00, 0xb0, + 0x00, 0xd9, 0x3b, 0x01, 0xad, 0xd2, 0x10, 0xc1, 0xad, 0xd8, 0xc2, 0x0e, + 0x9a, 0x00, 0xd9, 0x23, 0x01, 0xad, 0xeb, 0xc2, 0x25, 0x3b, 0x00, 0xd8, + 0xd3, 0x01, 0xad, 0xf1, 0xc2, 0x00, 0x64, 0x00, 0xd8, 0xc3, 0x01, 0xad, + 0xf7, 0xc2, 
0x01, 0x30, 0x00, 0xd8, 0xab, 0x01, 0xad, 0xfd, 0xc5, 0xde, + 0x0c, 0x00, 0xd8, 0x8b, 0x01, 0xae, 0x03, 0xc5, 0xdb, 0x5f, 0x00, 0xd8, + 0x4b, 0x01, 0xae, 0x09, 0xc5, 0xd7, 0xbd, 0x00, 0xd8, 0x3a, 0x01, 0xae, + 0x0f, 0xc5, 0xd8, 0xbc, 0x00, 0xda, 0x13, 0x01, 0xae, 0x15, 0x16, 0xc1, + 0xae, 0x1b, 0xc8, 0xb5, 0xaa, 0x00, 0xd9, 0xe3, 0x01, 0xae, 0x2a, 0xc7, + 0xc4, 0x79, 0x00, 0xd9, 0xd3, 0x01, 0xae, 0x30, 0xc4, 0xc5, 0x6e, 0x00, + 0xd9, 0xc3, 0x01, 0xae, 0x36, 0xc3, 0x96, 0x9c, 0x00, 0xd9, 0xb2, 0x01, + 0xae, 0x3c, 0xc7, 0xc3, 0x8b, 0x00, 0xd9, 0xa1, 0xc5, 0xd4, 0x75, 0x00, + 0xd8, 0x21, 0xc6, 0xcf, 0x59, 0x00, 0xd8, 0x19, 0xc5, 0xde, 0x48, 0x00, + 0xd8, 0x11, 0x44, 0xdf, 0x3f, 0x41, 0xae, 0x42, 0x44, 0x08, 0xcb, 0xc1, + 0xae, 0x4e, 0x43, 0x01, 0xc8, 0xc1, 0xae, 0x5a, 0xc8, 0xaf, 0x82, 0x0b, + 0x57, 0x90, 0x8b, 0x0b, 0x57, 0x69, 0x87, 0x0b, 0x57, 0x63, 0x01, 0xae, + 0x66, 0x97, 0x0b, 0x57, 0x53, 0x01, 0xae, 0x70, 0x91, 0x0b, 0x57, 0x43, + 0x01, 0xae, 0x76, 0x83, 0x0b, 0x57, 0x39, 0xc2, 0x01, 0x4a, 0x0b, 0x56, + 0xdb, 0x01, 0xae, 0x7a, 0xc2, 0x00, 0xb0, 0x0b, 0x57, 0x29, 0x1b, 0xc1, + 0xae, 0x80, 0xc2, 0x5d, 0xb3, 0x0b, 0x57, 0x19, 0xc2, 0x01, 0x5d, 0x0b, + 0x57, 0x11, 0xc2, 0x00, 0xf1, 0x0b, 0x57, 0x09, 0xc2, 0x00, 0x89, 0x0b, + 0x56, 0xf9, 0x06, 0xc1, 0xae, 0x8c, 0x09, 0xc1, 0xae, 0x96, 0xc2, 0x01, + 0x6c, 0x0b, 0x56, 0xe1, 0xc4, 0xdf, 0xdf, 0x0b, 0x56, 0xd1, 0xc2, 0x00, + 0x81, 0x0b, 0x56, 0xc9, 0x0d, 0xc1, 0xae, 0xa2, 0xc3, 0x00, 0x50, 0x0b, + 0x56, 0xa1, 0xc2, 0x00, 0x87, 0x0b, 0x56, 0x99, 0xc2, 0x00, 0x40, 0x0b, + 0x56, 0x90, 0x45, 0xd6, 0x6e, 0xc1, 0xae, 0xac, 0x83, 0x05, 0x35, 0x59, + 0x07, 0xc1, 0xae, 0xd0, 0x17, 0xc1, 0xae, 0xda, 0x8b, 0x05, 0x36, 0xe8, + 0x83, 0x05, 0x35, 0x09, 0x97, 0x05, 0x35, 0x19, 0xc3, 0x17, 0x29, 0x05, + 0x35, 0xd1, 0x07, 0xc1, 0xae, 0xe4, 0x91, 0x05, 0x36, 0xfb, 0x01, 0xae, + 0xf2, 0x8b, 0x05, 0x37, 0x29, 0xc2, 0x00, 0xb0, 0x05, 0x37, 0x48, 0x07, + 0xc1, 0xae, 0xfe, 0x0b, 0xc1, 0xaf, 0x0c, 0x97, 0x05, 0x36, 0x61, 0xc2, + 0x10, 0x11, 0x05, 0x36, 0x88, 0x03, 0xc1, 0xaf, 0x16, 0x8b, 0x05, 0x37, + 0x21, 0x07, 0x41, 0xaf, 0x1e, 0xc2, 0x16, 0x5a, 0x05, 0x35, 0x41, 0xc3, + 0x4f, 0x43, 0x05, 0x35, 0x89, 0x0c, 0xc1, 0xaf, 0x26, 0x97, 0x05, 0x35, + 0xeb, 0x01, 0xaf, 0x38, 0xc3, 0x01, 0xe2, 0x05, 0x36, 0x19, 0x16, 0xc1, + 0xaf, 0x3e, 0x8b, 0x05, 0x36, 0x79, 0x09, 0xc1, 0xaf, 0x4a, 0x83, 0x05, + 0x36, 0xd8, 0x83, 0x05, 0x35, 0x51, 0xc4, 0xe2, 0x9f, 0x05, 0x35, 0x71, + 0x97, 0x05, 0x36, 0x69, 0x8b, 0x05, 0x36, 0xe1, 0xc2, 0x7f, 0xc0, 0x05, + 0x36, 0xf0, 0x07, 0xc1, 0xaf, 0x5a, 0x97, 0x05, 0x35, 0xa9, 0x8b, 0x05, + 0x36, 0x71, 0x04, 0xc1, 0xaf, 0x64, 0x83, 0x05, 0x37, 0x19, 0x91, 0x05, + 0x37, 0x30, 0xc2, 0x5d, 0xa1, 0x05, 0x35, 0xa1, 0x0a, 0xc1, 0xaf, 0x70, + 0x8b, 0x05, 0x35, 0xb9, 0xc3, 0xd7, 0xe2, 0x05, 0x35, 0xc9, 0xc4, 0xbf, + 0xf1, 0x05, 0x37, 0x60, 0xc2, 0x7f, 0xc0, 0x05, 0x35, 0xf9, 0xc2, 0x92, + 0xb5, 0x05, 0x36, 0x09, 0x83, 0x05, 0x36, 0x10, 0xc2, 0x0f, 0xe1, 0x05, + 0x36, 0x49, 0x83, 0x05, 0x36, 0xd0, 0xc2, 0x02, 0xe0, 0x05, 0x36, 0x59, + 0x97, 0x05, 0x36, 0xc1, 0xc2, 0x00, 0x7a, 0x05, 0x36, 0xc9, 0xc5, 0xd8, + 0xe9, 0x05, 0x37, 0x68, 0x4c, 0x85, 0x4d, 0xc1, 0xaf, 0x84, 0xc2, 0x01, + 0xc3, 0x05, 0x37, 0xa8, 0xe0, 0x06, 0x87, 0x01, 0x3d, 0x58, 0xcb, 0x96, + 0x74, 0x0f, 0xac, 0x11, 0xda, 0x1c, 0xee, 0x0f, 0xa8, 0xc8, 0xc4, 0x40, + 0x89, 0x00, 0x00, 0x41, 0x5a, 0x1a, 0x30, 0x41, 0xaf, 0x90, 0x4c, 0x8a, + 0xc9, 0xc1, 0xaf, 0x9c, 0xc9, 0xad, 0xc8, 0x00, 0xdf, 0x30, 0xc7, 0xc6, + 0xc5, 0x00, 0xdf, 0x99, 0xc5, 0xc8, 0x5d, 0x00, 0xdf, 0x90, 0x8a, 0x00, + 0xdf, 0x89, 
0xc2, 0x00, 0x75, 0x00, 0xdf, 0x80, 0x97, 0x00, 0xdf, 0x73, + 0x01, 0xaf, 0xac, 0x45, 0xc6, 0xd3, 0xc1, 0xaf, 0xb2, 0x91, 0x00, 0xdf, + 0x61, 0x8b, 0x00, 0xdf, 0x51, 0x87, 0x00, 0xdf, 0x3b, 0x01, 0xaf, 0xba, + 0xc8, 0xbf, 0x0a, 0x00, 0xdf, 0x40, 0x97, 0x00, 0xdf, 0x29, 0x8b, 0x00, + 0xdf, 0x21, 0x0f, 0xc1, 0xaf, 0xbe, 0x10, 0xc1, 0xaf, 0xcb, 0xc2, 0x00, + 0x64, 0x00, 0xdf, 0x09, 0x15, 0xc1, 0xaf, 0xe7, 0xc2, 0x00, 0xdb, 0x00, + 0xde, 0xf1, 0xc2, 0x19, 0x2c, 0x00, 0xde, 0xd9, 0xc2, 0x00, 0x39, 0x00, + 0xde, 0x91, 0xc2, 0x0e, 0x9a, 0x00, 0xde, 0x89, 0xc2, 0x25, 0x3b, 0x00, + 0xde, 0x81, 0xc2, 0x01, 0x30, 0x00, 0xde, 0x71, 0xc2, 0x00, 0xb0, 0x00, + 0xde, 0x3b, 0x01, 0xaf, 0xf7, 0xc2, 0x01, 0x4a, 0x00, 0xde, 0x59, 0xc7, + 0xc6, 0xd3, 0x00, 0xde, 0x31, 0xc2, 0x01, 0x5d, 0x00, 0xde, 0x29, 0xc2, + 0x00, 0xd0, 0x00, 0xde, 0x11, 0x83, 0x00, 0xde, 0x00, 0x0d, 0xc1, 0xaf, + 0xfd, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0xc9, 0x15, 0xc1, 0xb0, 0x0a, 0xc2, + 0x00, 0xdb, 0x00, 0x4d, 0x91, 0x14, 0xc1, 0xb0, 0x1a, 0x1b, 0xc1, 0xb0, + 0x2d, 0xc2, 0x01, 0xc3, 0x00, 0x4d, 0x71, 0x04, 0xc1, 0xb0, 0x37, 0x12, + 0xc1, 0xb0, 0x41, 0x10, 0xc1, 0xb0, 0x4b, 0x06, 0xc1, 0xb0, 0x61, 0x16, + 0xc1, 0xb0, 0x6f, 0x0c, 0xc1, 0xb0, 0x7d, 0x05, 0xc1, 0xb0, 0x87, 0x09, + 0xc1, 0xb0, 0x91, 0x83, 0x00, 0x4c, 0x2b, 0x01, 0xb0, 0x9b, 0x91, 0x00, + 0x4c, 0x99, 0x8b, 0x00, 0x4c, 0x3b, 0x01, 0xb0, 0xa7, 0x97, 0x00, 0x4c, + 0x4b, 0x01, 0xb0, 0xab, 0x18, 0xc1, 0xb0, 0xaf, 0x87, 0x00, 0x4c, 0x78, + 0x44, 0x00, 0xbb, 0xc1, 0xb0, 0xbb, 0xca, 0xa0, 0x26, 0x00, 0x4f, 0xf0, + 0x03, 0xc1, 0xb0, 0xd1, 0x91, 0x00, 0x4e, 0x59, 0x87, 0x00, 0x4e, 0x39, + 0x48, 0xb2, 0x2d, 0xc1, 0xb0, 0xdd, 0x97, 0x00, 0x4e, 0x0b, 0x01, 0xb0, + 0xeb, 0x8b, 0x00, 0x4d, 0xfa, 0x01, 0xb0, 0xef, 0xcd, 0x73, 0x0d, 0x00, + 0x4e, 0xb9, 0xc3, 0x7c, 0x50, 0x00, 0x4c, 0x01, 0xd0, 0x50, 0xcf, 0x00, + 0x4f, 0xe8, 0xc4, 0x15, 0xe7, 0x00, 0x4f, 0x31, 0xc3, 0x05, 0x14, 0x00, + 0x4f, 0x39, 0x16, 0xc1, 0xb0, 0xf3, 0x08, 0xc1, 0xb0, 0xff, 0x15, 0xc1, + 0xb1, 0x0b, 0xc5, 0x06, 0xdb, 0x00, 0x4f, 0x71, 0xc4, 0x26, 0x78, 0x00, + 0x4f, 0x78, 0xc4, 0x01, 0xc3, 0x00, 0x4f, 0x91, 0xc4, 0x00, 0xba, 0x00, + 0x4f, 0x98, 0x4a, 0x78, 0x64, 0xc1, 0xb1, 0x17, 0xd3, 0x44, 0x8f, 0x00, + 0x4f, 0xc8, 0xe0, 0x06, 0x07, 0x01, 0x5a, 0xf0, 0xc2, 0x10, 0x11, 0x00, + 0xd0, 0xd9, 0x91, 0x00, 0xd0, 0xd1, 0x87, 0x00, 0xd0, 0xc9, 0x97, 0x00, + 0xd0, 0xc1, 0x8b, 0x00, 0xd0, 0xb8, 0xc2, 0x00, 0xd0, 0x00, 0xd0, 0xb1, + 0x83, 0x00, 0xd0, 0xa9, 0xc2, 0x0d, 0xf6, 0x00, 0xd0, 0xa1, 0xc2, 0x02, + 0x41, 0x00, 0xd0, 0x99, 0xc2, 0x00, 0xdb, 0x00, 0xd0, 0x91, 0xc2, 0x00, + 0x39, 0x00, 0xd0, 0x89, 0xc2, 0x19, 0x2c, 0x00, 0xd0, 0x81, 0x10, 0xc1, + 0xb1, 0x2a, 0xc2, 0x25, 0x3b, 0x00, 0xd0, 0x69, 0xc2, 0x00, 0x64, 0x00, + 0xd0, 0x61, 0xc2, 0x0e, 0x9a, 0x00, 0xd0, 0x49, 0xc2, 0x01, 0x6f, 0x00, + 0xd0, 0x41, 0x0f, 0xc1, 0xb1, 0x3c, 0xc2, 0x01, 0x5d, 0x00, 0xd0, 0x29, + 0xc2, 0x00, 0xb0, 0x00, 0xd0, 0x21, 0xc2, 0x01, 0x30, 0x00, 0xd0, 0x09, + 0xc2, 0x02, 0x2b, 0x00, 0xd0, 0x00, 0x83, 0x00, 0xba, 0x41, 0xc2, 0x01, + 0x30, 0x00, 0xba, 0x28, 0x45, 0xda, 0xf1, 0xc1, 0xb1, 0x46, 0xc5, 0xd5, + 0x4c, 0x01, 0x40, 0x00, 0xc6, 0x57, 0xec, 0x08, 0x83, 0xf9, 0xc3, 0x05, + 0x14, 0x08, 0x82, 0x93, 0x01, 0xb1, 0x7b, 0xc4, 0x26, 0x78, 0x08, 0x82, + 0xd3, 0x01, 0xb1, 0x7f, 0xc5, 0x06, 0xdb, 0x08, 0x82, 0xcb, 0x01, 0xb1, + 0x85, 0x15, 0xc1, 0xb1, 0x89, 0x08, 0xc1, 0xb1, 0x9b, 0x16, 0x41, 0xb1, + 0xa3, 0x91, 0x08, 0x80, 0x8b, 0x01, 0xb1, 0xb1, 0x0e, 0xc1, 0xb1, 0xb7, + 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x99, 0xc2, 0x00, 0x39, 0x08, 0x81, 0x69, + 0xc2, 0x19, 
0x2c, 0x08, 0x81, 0x61, 0xc2, 0x01, 0xc3, 0x08, 0x81, 0x59, + 0x04, 0xc1, 0xb1, 0xc1, 0x12, 0xc1, 0xb1, 0xcb, 0x10, 0xc1, 0xb1, 0xd5, + 0x06, 0xc1, 0xb1, 0xeb, 0x16, 0xc1, 0xb1, 0xf9, 0x0c, 0xc1, 0xb2, 0x07, + 0x05, 0xc1, 0xb2, 0x11, 0x09, 0xc1, 0xb2, 0x1b, 0x0d, 0xc1, 0xb2, 0x25, + 0x83, 0x08, 0x80, 0x2b, 0x01, 0xb2, 0x2f, 0x87, 0x08, 0x80, 0x79, 0x18, + 0xc1, 0xb2, 0x3b, 0x97, 0x08, 0x80, 0x4b, 0x01, 0xb2, 0x45, 0x8b, 0x08, + 0x80, 0x3b, 0x01, 0xb2, 0x49, 0x15, 0x41, 0xb2, 0x4d, 0x4a, 0x6f, 0xc8, + 0xc1, 0xb2, 0x5d, 0xc5, 0x1e, 0x96, 0x08, 0x82, 0x30, 0xd0, 0x5c, 0x82, + 0x08, 0x83, 0x81, 0xcb, 0x93, 0xf6, 0x08, 0x80, 0x21, 0xcb, 0x8f, 0xe1, + 0x08, 0x80, 0x19, 0xcb, 0x1e, 0x89, 0x08, 0x80, 0x01, 0xc8, 0x14, 0x38, + 0x08, 0x80, 0x09, 0xc7, 0x40, 0xe5, 0x08, 0x80, 0x10, 0x45, 0x09, 0x98, + 0xc1, 0xb2, 0x86, 0xcb, 0x97, 0xf5, 0x08, 0x82, 0x41, 0xc4, 0x19, 0x53, + 0x08, 0x82, 0x38, 0x0e, 0xc1, 0xb2, 0xaa, 0xcc, 0x80, 0xa9, 0x08, 0x82, + 0x61, 0x42, 0x00, 0x58, 0x41, 0xb2, 0xb6, 0x42, 0x0f, 0x7b, 0xc1, 0xb2, + 0xc0, 0x4a, 0x9a, 0xb8, 0x41, 0xb2, 0xcc, 0xc6, 0x2e, 0x82, 0x0e, 0x86, + 0xc9, 0xc6, 0xca, 0x9d, 0x0e, 0x86, 0xc0, 0x00, 0x41, 0xb2, 0xd8, 0x00, + 0xc1, 0xb2, 0xe4, 0xc2, 0x01, 0x6f, 0x0e, 0x80, 0x82, 0x01, 0xb2, 0xf0, + 0xc5, 0x57, 0xbd, 0x0e, 0x84, 0x49, 0xc6, 0xad, 0x17, 0x0e, 0x82, 0x51, + 0xc6, 0xcb, 0xf9, 0x0e, 0x81, 0xd2, 0x01, 0xb2, 0xf4, 0x44, 0xe1, 0x8b, + 0xc1, 0xb2, 0xfa, 0xc6, 0xcf, 0x11, 0x0e, 0x80, 0x60, 0x43, 0x0f, 0xf8, + 0xc1, 0xb3, 0x02, 0xc5, 0xd5, 0x88, 0x0e, 0x80, 0x38, 0x46, 0xd0, 0xc1, + 0xc1, 0xb3, 0x0e, 0x42, 0x0f, 0x7b, 0x41, 0xb3, 0x38, 0x11, 0xc1, 0xb3, + 0x42, 0xc2, 0x01, 0x0f, 0x0e, 0x84, 0x29, 0x45, 0xdd, 0xa3, 0x41, 0xb3, + 0x54, 0x45, 0xd7, 0x81, 0xc1, 0xb3, 0x60, 0x44, 0xcf, 0x3b, 0xc1, 0xb3, + 0x6c, 0x42, 0x00, 0x4e, 0xc1, 0xb3, 0x76, 0x43, 0x07, 0xc5, 0x41, 0xb3, + 0x82, 0x46, 0xd2, 0x7d, 0xc1, 0xb3, 0x8c, 0xca, 0x9b, 0x9e, 0x0e, 0x81, + 0x40, 0xc4, 0x1a, 0x73, 0x0e, 0x87, 0x41, 0xc5, 0xd6, 0x00, 0x0e, 0x83, + 0xf3, 0x01, 0xb3, 0x98, 0xca, 0x9a, 0x68, 0x0e, 0x82, 0x20, 0xc6, 0xcb, + 0xa5, 0x0e, 0x87, 0x13, 0x01, 0xb3, 0x9e, 0xc7, 0xc0, 0xf9, 0x0e, 0x86, + 0xf2, 0x01, 0xb3, 0xa2, 0xc4, 0x77, 0x35, 0x0e, 0x83, 0x48, 0xc3, 0x05, + 0xa9, 0x0e, 0x83, 0x33, 0x01, 0xb3, 0xa6, 0x10, 0x41, 0xb3, 0xac, 0xca, + 0x9e, 0xd2, 0x0e, 0x87, 0x39, 0x09, 0xc1, 0xb3, 0xb8, 0x03, 0xc1, 0xb3, + 0xc7, 0x45, 0x1a, 0x57, 0xc1, 0xb3, 0xd3, 0xc3, 0x1f, 0x1d, 0x0e, 0x84, + 0x32, 0x01, 0xb3, 0xe9, 0x44, 0x1a, 0x13, 0xc1, 0xb3, 0xef, 0x42, 0x00, + 0xbd, 0x41, 0xb4, 0x07, 0x11, 0xc1, 0xb4, 0x13, 0xc4, 0x7a, 0x04, 0x0e, + 0x82, 0x80, 0xd4, 0x39, 0x30, 0x0e, 0x86, 0x61, 0xd6, 0x2e, 0x80, 0x0e, + 0x86, 0x59, 0x10, 0xc1, 0xb4, 0x22, 0x48, 0x1a, 0x02, 0xc1, 0xb4, 0x2e, + 0x4f, 0x67, 0x47, 0xc1, 0xb4, 0x3a, 0x4a, 0xa3, 0x6e, 0xc1, 0xb4, 0x46, + 0xc8, 0x9c, 0xe0, 0x0e, 0x81, 0xa2, 0x01, 0xb4, 0x62, 0xc8, 0xba, 0x3a, + 0x0e, 0x85, 0x81, 0xca, 0xa2, 0xec, 0x0e, 0x85, 0x79, 0xcb, 0x92, 0x33, + 0x0e, 0x85, 0x70, 0xc6, 0xce, 0xd5, 0x0e, 0x86, 0x51, 0xc6, 0xd1, 0x63, + 0x0e, 0x86, 0x49, 0xc5, 0xd6, 0x9b, 0x0e, 0x86, 0x40, 0xc3, 0x63, 0x2b, + 0x0e, 0x83, 0x39, 0xc8, 0x9c, 0xe0, 0x0e, 0x81, 0xd8, 0x8b, 0x0e, 0x82, + 0xb1, 0xc2, 0x00, 0x45, 0x0e, 0x80, 0xc0, 0x08, 0xc1, 0xb4, 0x68, 0xc7, + 0xc2, 0x9d, 0x0e, 0x84, 0xc0, 0xd5, 0x32, 0xc0, 0x0e, 0x85, 0x61, 0x43, + 0x01, 0x55, 0x41, 0xb4, 0x74, 0xd4, 0x3d, 0xcc, 0x0e, 0x85, 0xb1, 0xc7, + 0xc3, 0x45, 0x0e, 0x83, 0xd8, 0xcd, 0x79, 0x75, 0x0e, 0x83, 0xa1, 0xcb, + 0x94, 0x17, 0x0e, 0x83, 0x00, 0x12, 0xc1, 0xb4, 0x80, 0xcb, 0x94, 0xbc, + 0x0e, 0x85, 
0x89, 0xcd, 0x7a, 0xfb, 0x0e, 0x85, 0x51, 0x16, 0xc1, 0xb4, + 0x8c, 0x45, 0xd9, 0xed, 0xc1, 0xb4, 0x98, 0xce, 0x6d, 0x5c, 0x0e, 0x85, + 0x20, 0x0b, 0xc1, 0xb4, 0xa4, 0x45, 0xaa, 0x6b, 0x41, 0xb4, 0xb4, 0xc6, + 0xd0, 0xf1, 0x0e, 0x84, 0x41, 0xc5, 0x13, 0x43, 0x0e, 0x81, 0x89, 0xc4, + 0xae, 0x15, 0x0e, 0x80, 0x78, 0x07, 0xc1, 0xb4, 0xca, 0xc3, 0x02, 0x44, + 0x0e, 0x80, 0xa0, 0x45, 0x7c, 0xbe, 0xc1, 0xb4, 0xd9, 0xc3, 0xbe, 0x04, + 0x0e, 0x81, 0x70, 0xc3, 0x63, 0x2b, 0x0e, 0x83, 0xa9, 0xc8, 0x9c, 0xe0, + 0x0e, 0x81, 0x60, 0x00, 0xc1, 0xb4, 0xef, 0xca, 0x9c, 0xde, 0x0e, 0x81, + 0x00, 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0x39, 0xc8, 0x9c, 0xe0, 0x0e, 0x80, + 0xa8, 0x45, 0xb9, 0x3c, 0xc1, 0xb5, 0x01, 0x0e, 0x41, 0xb5, 0x1a, 0x42, + 0x06, 0x4e, 0xc1, 0xb5, 0x24, 0xc5, 0xd8, 0x85, 0x0e, 0x80, 0xf0, 0xc3, + 0x63, 0x2b, 0x0e, 0x82, 0xc9, 0xc8, 0x9c, 0xe0, 0x0e, 0x81, 0x30, 0xc6, + 0xd0, 0x0d, 0x0e, 0x81, 0xc3, 0x01, 0xb5, 0x33, 0x43, 0x13, 0x4f, 0xc1, + 0xb5, 0x39, 0xc9, 0x94, 0x92, 0x0e, 0x80, 0x10, 0x00, 0xc1, 0xb5, 0x43, + 0xca, 0x9c, 0xde, 0x0e, 0x81, 0x08, 0xc2, 0x0d, 0x10, 0x08, 0xe3, 0x48, + 0xc2, 0x0d, 0x10, 0x08, 0xe3, 0x40, 0xc3, 0x45, 0x6b, 0x08, 0xe3, 0x39, + 0xc2, 0x00, 0x5f, 0x08, 0xe2, 0xf0, 0xc3, 0x0d, 0x0f, 0x08, 0xe3, 0x31, + 0xc2, 0x00, 0x33, 0x08, 0xe2, 0xe8, 0xc4, 0x0d, 0x0e, 0x08, 0xe3, 0x29, + 0xc3, 0x02, 0xdf, 0x08, 0xe2, 0xe0, 0xc4, 0x18, 0x12, 0x08, 0xe3, 0x21, + 0x91, 0x08, 0xe2, 0xd8, 0xc4, 0x18, 0x10, 0x08, 0xe2, 0xb9, 0xc2, 0x22, + 0xcc, 0x08, 0xe2, 0xb0, 0xc3, 0x0d, 0x14, 0x08, 0xe2, 0xa9, 0xc3, 0x09, + 0x9e, 0x08, 0xe2, 0xa0, 0xc4, 0x02, 0xde, 0x08, 0xe2, 0x99, 0xc2, 0x02, + 0xa0, 0x08, 0xe2, 0x90, 0x94, 0x08, 0xe1, 0xa8, 0x8e, 0x08, 0xe0, 0x41, + 0x94, 0x08, 0xe0, 0x32, 0x01, 0xb5, 0x55, 0xc2, 0x00, 0xd0, 0x08, 0xe0, + 0xd9, 0x83, 0x08, 0xe0, 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0xe0, 0xc9, 0x83, + 0x08, 0xe0, 0xc0, 0x46, 0x01, 0x92, 0xc1, 0xb5, 0x59, 0x04, 0xc1, 0xb5, + 0x65, 0xd5, 0x37, 0x6d, 0x01, 0x2e, 0xf9, 0xc6, 0xcc, 0x1d, 0x0f, 0xac, + 0x69, 0x12, 0xc1, 0xb5, 0x71, 0xcc, 0x85, 0x7d, 0x0f, 0xac, 0x59, 0xe0, + 0x05, 0xe7, 0x01, 0x49, 0xf8, 0x46, 0x01, 0x92, 0xc1, 0xb5, 0x7d, 0xcf, + 0x68, 0x37, 0x01, 0x3e, 0x99, 0x15, 0xc1, 0xb5, 0x89, 0xda, 0x1a, 0x7e, + 0x01, 0x3a, 0x79, 0xc6, 0xcd, 0x8b, 0x01, 0x38, 0x71, 0xd5, 0x37, 0x6d, + 0x01, 0x2e, 0xf1, 0x4f, 0x60, 0x6c, 0x41, 0xb5, 0x95, 0xdb, 0x14, 0xf4, + 0x0f, 0xdb, 0x79, 0x45, 0x02, 0xde, 0x41, 0xb5, 0xa1, 0xc6, 0x02, 0xd1, + 0x01, 0x2f, 0x09, 0xd4, 0x39, 0x94, 0x01, 0x2e, 0xd9, 0xc5, 0x06, 0xe2, + 0x01, 0x2c, 0x21, 0xcc, 0x01, 0xdb, 0x0f, 0xdc, 0x78, 0xcd, 0x15, 0x02, + 0x01, 0x2c, 0x11, 0xcc, 0x06, 0xdb, 0x01, 0x2c, 0x08, 0xc6, 0xcd, 0x4f, + 0x0f, 0xd5, 0x59, 0xd0, 0x54, 0xdc, 0x0f, 0xa8, 0x28, 0xc9, 0x33, 0xad, + 0x01, 0x72, 0x40, 0xce, 0x6f, 0xfc, 0x01, 0x3f, 0xf9, 0xcc, 0x82, 0x35, + 0x01, 0x3f, 0xcb, 0x01, 0xb5, 0xad, 0xc5, 0x01, 0xa2, 0x01, 0x3f, 0xb2, + 0x01, 0xb5, 0xb3, 0xcc, 0x82, 0x35, 0x01, 0x3f, 0xc3, 0x01, 0xb5, 0xb9, + 0xc5, 0x01, 0xa2, 0x01, 0x3f, 0xab, 0x01, 0xb5, 0xbf, 0xce, 0x6f, 0xfc, + 0x01, 0x59, 0x98, 0x46, 0x00, 0x2c, 0xc1, 0xb5, 0xc5, 0xc4, 0x32, 0xbc, + 0x01, 0x3e, 0xf0, 0xe0, 0x00, 0x47, 0x01, 0x57, 0x30, 0x45, 0x00, 0x8c, + 0xc1, 0xb5, 0xd1, 0xd7, 0x2a, 0x99, 0x01, 0x52, 0xc8, 0xcf, 0x64, 0xd1, + 0x01, 0x52, 0xe1, 0xcb, 0x98, 0x42, 0x01, 0x52, 0xd1, 0x42, 0x00, 0x58, + 0xc1, 0xb5, 0xe3, 0xc8, 0x52, 0x09, 0x01, 0x52, 0xf8, 0x10, 0xc1, 0xb5, + 0xef, 0x14, 0x41, 0xb5, 0xf9, 0x43, 0x01, 0xd0, 0xc1, 0xb6, 0x05, 0xd5, + 0x36, 0xb0, 0x0f, 0xab, 0xd8, 0x45, 0x00, 0x2d, 0xc1, 0xb6, 0x2c, 0xd6, + 0x29, 0x86, 
0x01, 0x70, 0x60, 0xc9, 0x9b, 0x77, 0x01, 0x3e, 0xa9, 0x43, + 0x02, 0x6f, 0x41, 0xb6, 0x5a, 0xd5, 0x32, 0x18, 0x01, 0x3e, 0x29, 0x07, + 0xc1, 0xb6, 0x66, 0xcd, 0x25, 0xae, 0x00, 0x02, 0xdb, 0x01, 0xb6, 0x72, + 0x0b, 0xc1, 0xb6, 0x76, 0xcc, 0x6f, 0xb7, 0x0f, 0xaf, 0x41, 0xd3, 0x1f, + 0xcd, 0x01, 0x70, 0x10, 0xcb, 0x90, 0x86, 0x01, 0x36, 0xe1, 0xcc, 0x00, + 0x33, 0x00, 0x03, 0xdb, 0x01, 0xb6, 0x82, 0xc6, 0xb7, 0x3b, 0x01, 0x18, + 0x41, 0xcd, 0x69, 0x65, 0x01, 0x80, 0x60, 0x0a, 0xc1, 0xb6, 0x86, 0xc3, + 0x00, 0x3a, 0x01, 0x15, 0x19, 0x14, 0xc1, 0xb6, 0x98, 0xd5, 0x08, 0x89, + 0x01, 0x80, 0xa0, 0x0b, 0xc1, 0xb6, 0xa4, 0xc4, 0x20, 0xe6, 0x01, 0x18, + 0x50, 0xc7, 0xc9, 0xb2, 0x01, 0x1d, 0xc1, 0xcd, 0x77, 0xfc, 0x01, 0x71, + 0x00, 0x00, 0x41, 0xb6, 0xb0, 0x45, 0x00, 0x5a, 0xc1, 0xb6, 0xc2, 0xd9, + 0x1f, 0xc7, 0x01, 0x70, 0x20, 0xcb, 0x93, 0xd5, 0x0f, 0xac, 0x71, 0xcb, + 0x8a, 0x0a, 0x01, 0x4e, 0xc1, 0x45, 0x01, 0xfd, 0x41, 0xb6, 0xda, 0x45, + 0x04, 0x90, 0xc1, 0xb6, 0xf6, 0x44, 0x01, 0x5e, 0x41, 0xb7, 0x02, 0xc6, + 0xcf, 0x35, 0x0f, 0xb6, 0x29, 0xd5, 0x2c, 0xf5, 0x01, 0x70, 0xe0, 0xca, + 0x01, 0xfd, 0x01, 0x0f, 0x33, 0x01, 0xb7, 0x0e, 0xc9, 0xb0, 0x6b, 0x01, + 0x0c, 0xe0, 0x42, 0x00, 0x2c, 0xc1, 0xb7, 0x14, 0x42, 0x02, 0xa0, 0xc1, + 0xb7, 0x20, 0xd5, 0x37, 0xc1, 0x0f, 0xc5, 0x10, 0x00, 0xc1, 0xb7, 0x2c, + 0xc5, 0x14, 0xa5, 0x01, 0x48, 0xc8, 0xc5, 0xca, 0xa4, 0x0f, 0xb3, 0x61, + 0xd7, 0x2a, 0x6b, 0x0f, 0xc5, 0x30, 0xcb, 0x82, 0xba, 0x01, 0x0f, 0x01, + 0x46, 0x00, 0x59, 0x41, 0xb7, 0x49, 0x42, 0x00, 0xe3, 0xc1, 0xb7, 0x58, + 0xcf, 0x5b, 0xc3, 0x0f, 0xc2, 0x80, 0x03, 0xc1, 0xb7, 0x64, 0x45, 0x11, + 0x3a, 0x41, 0xb7, 0x70, 0x45, 0x04, 0x90, 0xc1, 0xb7, 0x7c, 0xd8, 0x23, + 0xf3, 0x0f, 0xc5, 0x01, 0xdf, 0x0c, 0x65, 0x0f, 0xc5, 0x40, 0xd0, 0x56, + 0xda, 0x0f, 0xc1, 0xa1, 0xe0, 0x01, 0xe7, 0x0f, 0xc5, 0x50, 0xd0, 0x5a, + 0x22, 0x0f, 0xa8, 0x69, 0xcd, 0x0b, 0x91, 0x01, 0x19, 0x49, 0xd4, 0x3b, + 0x9c, 0x01, 0x4f, 0xe1, 0xdb, 0x18, 0x39, 0x00, 0x05, 0x58, 0xdc, 0x14, + 0x4d, 0x01, 0x3d, 0x51, 0xdb, 0x15, 0x60, 0x01, 0x49, 0xc8, 0xc7, 0x00, + 0xfa, 0x01, 0x03, 0x31, 0xc8, 0xb6, 0xca, 0x01, 0x01, 0x69, 0xc9, 0xb3, + 0x9e, 0x01, 0x01, 0x51, 0xc4, 0x01, 0xc3, 0x01, 0x00, 0x70, 0xd6, 0x2d, + 0x4c, 0x00, 0x2c, 0x71, 0xc4, 0xb9, 0x3c, 0x0f, 0xc8, 0xd9, 0xcb, 0x8f, + 0xf7, 0x00, 0x7e, 0xb2, 0x01, 0xb7, 0x88, 0xcc, 0x07, 0xc7, 0x01, 0x13, + 0xb1, 0x43, 0x00, 0xe2, 0xc1, 0xb7, 0x8e, 0xd0, 0x5a, 0x92, 0x01, 0x53, + 0xeb, 0x01, 0xb7, 0x9a, 0xcb, 0x1a, 0x1a, 0x01, 0x54, 0x28, 0xcf, 0x09, + 0xf8, 0x01, 0x4b, 0xb1, 0x44, 0x00, 0x58, 0xc1, 0xb7, 0xa0, 0x15, 0xc1, + 0xb7, 0xa6, 0x44, 0x07, 0xc7, 0x41, 0xb7, 0xb2, 0xd8, 0x24, 0x3b, 0x01, + 0x54, 0x39, 0xcf, 0x62, 0xb5, 0x01, 0x54, 0x48, 0xc2, 0x0e, 0x9a, 0x00, + 0xe2, 0x79, 0xc2, 0x02, 0x1c, 0x00, 0xe0, 0xc9, 0x83, 0x00, 0xe0, 0x60, + 0x16, 0xc1, 0xb7, 0xb8, 0x15, 0xc1, 0xb7, 0xc2, 0xc2, 0x00, 0xd0, 0x00, + 0xe0, 0x59, 0x83, 0x00, 0xe0, 0x50, 0xc2, 0x00, 0xd0, 0x00, 0xe1, 0x09, + 0x83, 0x00, 0xe1, 0x00, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0xf1, 0x83, 0x00, + 0xe0, 0xe8, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0xb1, 0x83, 0x00, 0xe0, 0xa8, + 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0xa1, 0x83, 0x00, 0xe0, 0x98, 0xc2, 0x00, + 0xdb, 0x00, 0xe0, 0x91, 0x83, 0x00, 0xe0, 0x88, 0xc2, 0x00, 0xd0, 0x00, + 0xe0, 0x81, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0x79, 0x83, 0x00, 0xe0, 0x70, + 0x83, 0x00, 0xe0, 0x69, 0xc2, 0x19, 0x2c, 0x00, 0xe0, 0x49, 0xc2, 0x01, + 0x30, 0x00, 0xe0, 0x28, 0xc2, 0x00, 0xd0, 0x00, 0xe0, 0x39, 0x83, 0x00, + 0xe0, 0x30, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0x21, 0x83, 0x00, 0xe0, 0x18, + 0xc2, 0x00, 
0xd0, 0x00, 0xe0, 0x11, 0xc2, 0x00, 0xdb, 0x00, 0xe0, 0x09, + 0x83, 0x00, 0xe0, 0x00, 0xc4, 0x18, 0x10, 0x00, 0xe2, 0x39, 0xc2, 0x22, + 0xcc, 0x00, 0xe2, 0x30, 0xc3, 0x0d, 0x14, 0x00, 0xe2, 0x29, 0xc3, 0x09, + 0x9e, 0x00, 0xe2, 0x20, 0xc4, 0x02, 0xde, 0x00, 0xe2, 0x19, 0xc2, 0x02, + 0xa0, 0x00, 0xe2, 0x10, 0xc5, 0xda, 0x79, 0x00, 0xe1, 0xfb, 0x01, 0xb7, + 0xcc, 0xc5, 0x4e, 0x18, 0x00, 0xe1, 0xd8, 0xc5, 0x33, 0x5d, 0x00, 0xe1, + 0xb9, 0xc3, 0x00, 0xea, 0x00, 0xe1, 0xb0, 0xc2, 0x00, 0x39, 0x00, 0xe1, + 0x29, 0xc2, 0x19, 0x2c, 0x00, 0xe1, 0x20, 0xc3, 0x01, 0x95, 0x00, 0xe1, + 0xa8, 0xc6, 0xd3, 0xbb, 0x00, 0xe1, 0xa0, 0x97, 0x00, 0xe1, 0x58, 0x91, + 0x00, 0xe1, 0x48, 0x15, 0xc1, 0xb7, 0xd2, 0xcc, 0x1a, 0x8c, 0x0f, 0xbc, + 0x71, 0x14, 0xc1, 0xb7, 0xe4, 0x44, 0x00, 0x49, 0xc1, 0xb7, 0xf0, 0xcc, + 0x07, 0xbb, 0x01, 0x3a, 0xc1, 0xca, 0xa7, 0xc4, 0x0f, 0xaf, 0xc1, 0x08, + 0xc1, 0xb7, 0xf6, 0xcb, 0x58, 0xc7, 0x0f, 0xbd, 0x11, 0xd5, 0x34, 0x8e, + 0x0f, 0xbd, 0xd9, 0x16, 0x41, 0xb8, 0x02, 0xc5, 0xd4, 0xe3, 0x0f, 0xaf, + 0x92, 0x01, 0xb8, 0x0e, 0xc2, 0x00, 0xd0, 0x08, 0xfd, 0x81, 0x83, 0x05, + 0x27, 0x60, 0x83, 0x05, 0x26, 0x89, 0xc2, 0x00, 0xd0, 0x05, 0x26, 0x90, + 0x83, 0x05, 0x26, 0x99, 0xc2, 0x02, 0x1c, 0x05, 0x26, 0xe0, 0x83, 0x05, + 0x26, 0xa1, 0xc2, 0x00, 0xd0, 0x05, 0x26, 0xa9, 0x15, 0xc1, 0xb8, 0x14, + 0x44, 0x05, 0x14, 0x41, 0xb8, 0x1e, 0x83, 0x05, 0x26, 0xb1, 0xc2, 0x00, + 0xd0, 0x05, 0x27, 0x68, 0x83, 0x05, 0x26, 0xb9, 0xc2, 0x00, 0xd0, 0x05, + 0x26, 0xc0, 0x83, 0x05, 0x26, 0xd1, 0xc2, 0x00, 0xd0, 0x05, 0x26, 0xd8, + 0x83, 0x05, 0x27, 0x01, 0xc2, 0x01, 0x30, 0x05, 0x27, 0x28, 0x83, 0x05, + 0x27, 0x11, 0xc2, 0x00, 0xd0, 0x05, 0x27, 0x58, 0xc2, 0x00, 0xd0, 0x05, + 0x27, 0x19, 0x83, 0x05, 0x27, 0x20, 0x83, 0x05, 0x27, 0x31, 0xc2, 0x00, + 0xd0, 0x05, 0x27, 0x40, 0x87, 0x05, 0x27, 0x78, 0x97, 0x05, 0x27, 0x88, + 0x87, 0x05, 0x27, 0xb8, 0x87, 0x05, 0x27, 0xa9, 0x8a, 0x05, 0x27, 0xb0, + 0xc9, 0x1b, 0x0a, 0x01, 0x01, 0x41, 0xca, 0x33, 0xdc, 0x00, 0x00, 0x5b, + 0x01, 0xb8, 0x2a, 0xc4, 0x1b, 0x05, 0x00, 0x00, 0x51, 0x4c, 0x87, 0x8d, + 0x41, 0xb8, 0x30, 0x48, 0xba, 0xc2, 0xc1, 0xb8, 0x3c, 0x42, 0x01, 0x60, + 0x41, 0xb8, 0x64, 0xc4, 0x26, 0x78, 0x00, 0xca, 0x79, 0xc5, 0x06, 0xdb, + 0x00, 0xca, 0x71, 0x15, 0xc1, 0xb8, 0x76, 0x08, 0xc1, 0xb8, 0x82, 0x16, + 0xc1, 0xb8, 0x8e, 0xc3, 0x05, 0x14, 0x00, 0xca, 0x39, 0xc4, 0x15, 0xe7, + 0x00, 0xca, 0x30, 0x44, 0x00, 0xbb, 0xc1, 0xb8, 0x9a, 0x4c, 0x29, 0xba, + 0xc1, 0xb8, 0xb2, 0x50, 0x5c, 0xf2, 0x41, 0xb8, 0xe0, 0x46, 0x00, 0xb9, + 0xc1, 0xb8, 0xf2, 0xcf, 0x69, 0x72, 0x00, 0xc8, 0x00, 0x16, 0xc1, 0xb9, + 0x0f, 0x09, 0xc1, 0xb9, 0x1f, 0xc2, 0x00, 0xd0, 0x00, 0xc8, 0xe1, 0x15, + 0xc1, 0xb9, 0x2f, 0xc2, 0x01, 0x4a, 0x00, 0xc8, 0xc1, 0xc2, 0x00, 0xdb, + 0x00, 0xc8, 0xb9, 0xc2, 0x00, 0x39, 0x00, 0xc8, 0xb1, 0xc2, 0x19, 0x2c, + 0x00, 0xc8, 0xab, 0x01, 0xb9, 0x3f, 0xc2, 0x01, 0xc3, 0x00, 0xc8, 0xa1, + 0x04, 0xc1, 0xb9, 0x43, 0x12, 0xc1, 0xb9, 0x4d, 0x10, 0xc1, 0xb9, 0x57, + 0x06, 0xc1, 0xb9, 0x61, 0x0c, 0xc1, 0xb9, 0x6b, 0x05, 0xc1, 0xb9, 0x75, + 0x0d, 0x41, 0xb9, 0x7f, 0x90, 0x08, 0x49, 0xc0, 0x9b, 0x08, 0x49, 0xb8, + 0x90, 0x08, 0x49, 0xb0, 0x90, 0x08, 0x49, 0xa8, 0x96, 0x08, 0x49, 0xa0, + 0x95, 0x08, 0x49, 0x70, 0x04, 0xc1, 0xb9, 0x89, 0x44, 0x0b, 0x0d, 0xc1, + 0xb9, 0x95, 0x46, 0x76, 0x5f, 0xc1, 0xb9, 0xa1, 0xc9, 0x32, 0xb7, 0x01, + 0x3e, 0xc9, 0xc7, 0xc4, 0x5d, 0x01, 0x3e, 0xc1, 0xc6, 0x02, 0xd1, 0x01, + 0x2f, 0x79, 0x11, 0xc1, 0xb9, 0xad, 0x16, 0xc1, 0xb9, 0xb9, 0xd6, 0x2f, + 0x72, 0x01, 0x50, 0xf1, 0x47, 0xc6, 0x9b, 0xc1, 0xb9, 0xc5, 0x47, 0xc1, + 0x69, 0x41, 
0xb9, 0xd1, 0xcc, 0x23, 0x9f, 0x01, 0x55, 0x68, 0x0e, 0xc1, + 0xb9, 0xdd, 0x4f, 0x0b, 0x17, 0x41, 0xb9, 0xe9, 0x96, 0x01, 0x04, 0xe1, + 0x95, 0x01, 0x04, 0xdb, 0x01, 0xb9, 0xf5, 0x92, 0x01, 0x04, 0xd1, 0x90, + 0x01, 0x04, 0xc9, 0x8f, 0x01, 0x04, 0xc1, 0x8e, 0x01, 0x04, 0xb9, 0x8d, + 0x01, 0x04, 0xb1, 0x8a, 0x01, 0x04, 0xa9, 0x9a, 0x01, 0x04, 0x99, 0x91, + 0x01, 0x04, 0x91, 0x87, 0x01, 0x04, 0x89, 0x83, 0x01, 0x04, 0x81, 0x98, + 0x00, 0xeb, 0x29, 0x97, 0x00, 0xeb, 0x21, 0x94, 0x00, 0xeb, 0x19, 0x8b, + 0x00, 0xeb, 0x11, 0x8c, 0x01, 0x63, 0xe0, 0x4d, 0x37, 0xb4, 0xc1, 0xb9, + 0xfb, 0xca, 0x9f, 0xe0, 0x00, 0x14, 0xbb, 0x01, 0xba, 0x7a, 0xce, 0x6b, + 0xe2, 0x05, 0x3c, 0x78, 0x46, 0x00, 0x8b, 0x41, 0xba, 0x80, 0xcd, 0x7e, + 0xf1, 0x00, 0x0e, 0x1b, 0x01, 0xba, 0x8c, 0x47, 0x10, 0x30, 0x41, 0xba, + 0x92, 0xc2, 0x00, 0x74, 0x00, 0xe9, 0x29, 0xcd, 0x7c, 0xdc, 0x00, 0x0e, + 0x10, 0xcc, 0x23, 0x3f, 0x00, 0x15, 0x08, 0x47, 0x80, 0x10, 0xc1, 0xba, + 0x9e, 0xd1, 0x54, 0x97, 0x00, 0x15, 0x68, 0x46, 0x02, 0x0f, 0xc1, 0xba, + 0xaa, 0x48, 0x19, 0x9b, 0x41, 0xbb, 0x60, 0x88, 0x05, 0x3f, 0xd9, 0x92, + 0x05, 0x3f, 0xe0, 0xc9, 0x4f, 0x9d, 0x05, 0x3f, 0xe9, 0xc6, 0xcb, 0x3f, + 0x05, 0x3f, 0xf0, 0x91, 0x00, 0x74, 0x09, 0x0a, 0x41, 0xbb, 0x6c, 0x44, + 0x68, 0x00, 0xc1, 0xbb, 0x78, 0x91, 0x00, 0x74, 0xd9, 0x43, 0x60, 0xe8, + 0x41, 0xbb, 0xa4, 0xc2, 0x0f, 0x7b, 0x00, 0x74, 0x39, 0xc2, 0x42, 0xcd, + 0x00, 0x74, 0x69, 0x91, 0x00, 0x74, 0xc8, 0x42, 0x01, 0x7c, 0xc1, 0xbb, + 0xb0, 0x49, 0xb1, 0xd3, 0x41, 0xbb, 0xbc, 0x91, 0x00, 0x74, 0xa9, 0x43, + 0x60, 0xe8, 0x41, 0xbb, 0xc8, 0x08, 0xc1, 0xbb, 0xd4, 0xc3, 0x02, 0x45, + 0x00, 0x74, 0xe9, 0xc4, 0xdf, 0x43, 0x00, 0x74, 0xf8, 0x42, 0x00, 0x48, + 0x41, 0xbb, 0xe0, 0xc4, 0xdf, 0x43, 0x00, 0x75, 0x59, 0xc3, 0x02, 0x45, + 0x00, 0x75, 0x70, 0x83, 0x00, 0x75, 0x91, 0x8f, 0x00, 0x75, 0x99, 0x9b, + 0x00, 0x76, 0x19, 0x8b, 0x00, 0x76, 0x20, 0xc2, 0x00, 0xd1, 0x00, 0x75, + 0x89, 0xc2, 0x00, 0x45, 0x00, 0x75, 0xd8, 0x8b, 0x00, 0x75, 0xa8, 0x9b, + 0x00, 0x75, 0xb8, 0x97, 0x00, 0x75, 0xc8, 0x8b, 0x00, 0x76, 0x08, 0xc2, + 0x01, 0xc8, 0x00, 0x75, 0xe1, 0xc3, 0x4d, 0xc3, 0x00, 0x75, 0xe8, 0xc2, + 0x01, 0x23, 0x00, 0x76, 0x49, 0x8b, 0x00, 0x76, 0x50, 0xc2, 0x02, 0xa0, + 0x00, 0x76, 0x91, 0xc4, 0x02, 0xde, 0x00, 0x76, 0x98, 0xc3, 0x09, 0x9e, + 0x00, 0x76, 0xa1, 0xc3, 0x0d, 0x14, 0x00, 0x76, 0xa8, 0xc2, 0x22, 0xcc, + 0x00, 0x76, 0xb1, 0xc4, 0x18, 0x10, 0x00, 0x76, 0xb8, 0x45, 0x01, 0x93, + 0xc1, 0xbb, 0xec, 0xd1, 0x47, 0x70, 0x0f, 0xdc, 0xc8, 0x46, 0x02, 0xae, + 0xc1, 0xbb, 0xf8, 0x5b, 0x18, 0xc0, 0x41, 0xbc, 0x0a, 0xc6, 0x0b, 0x09, + 0x01, 0x3a, 0x91, 0xc6, 0x02, 0xd1, 0x0f, 0xa9, 0xf8, 0xe0, 0x03, 0x67, + 0x01, 0x1d, 0x88, 0x45, 0x01, 0x93, 0xc1, 0xbc, 0x16, 0xd2, 0x43, 0x27, + 0x0f, 0xdc, 0xc0, 0x5b, 0x16, 0xa4, 0xc1, 0xbc, 0x22, 0x46, 0x01, 0xc8, + 0x41, 0xbc, 0x2e, 0xe0, 0x00, 0x27, 0x01, 0x1d, 0x80, 0x45, 0x00, 0x27, + 0xc1, 0xbc, 0x40, 0x4d, 0x3d, 0x55, 0x41, 0xbc, 0x4c, 0xe0, 0x08, 0x67, + 0x0f, 0xdb, 0x40, 0x0f, 0xc1, 0xbc, 0x52, 0xcc, 0x0d, 0x9e, 0x01, 0x2e, + 0xd0, 0x44, 0x02, 0x9a, 0x41, 0xbc, 0x58, 0xcd, 0x3f, 0xe8, 0x0f, 0xdc, + 0x19, 0xce, 0x08, 0x79, 0x0f, 0xdc, 0x28, 0x00, 0x41, 0xbc, 0x5e, 0xcc, + 0x8a, 0x45, 0x01, 0x0f, 0x78, 0x45, 0x01, 0x95, 0xc1, 0xbc, 0x76, 0xc9, + 0x61, 0x53, 0x01, 0x48, 0x50, 0xcd, 0x7e, 0x3b, 0x01, 0x0c, 0xf9, 0x4e, + 0x6f, 0xa8, 0x41, 0xbc, 0x82, 0x00, 0x41, 0xbc, 0x8e, 0x44, 0x00, 0x49, + 0xc1, 0xbc, 0xac, 0x45, 0x00, 0x2c, 0x41, 0xbc, 0xb6, 0xd0, 0x58, 0x62, + 0x0f, 0xc2, 0x09, 0xc5, 0x01, 0xa2, 0x0f, 0xc2, 0x28, 0x00, 0x41, 0xbc, + 0xc0, 0xca, 
0xa8, 0x0a, 0x01, 0x0d, 0x40, 0xcc, 0x81, 0xed, 0x01, 0x4a, + 0x89, 0xcd, 0x7e, 0xfe, 0x01, 0x4a, 0x68, 0xcd, 0x7e, 0xfe, 0x01, 0x4a, + 0x79, 0xcc, 0x81, 0xed, 0x01, 0x4a, 0x60, 0xdc, 0x13, 0x6d, 0x01, 0x52, + 0x51, 0x46, 0x00, 0xd4, 0xc1, 0xbc, 0xcc, 0x45, 0x00, 0x8c, 0x41, 0xbc, + 0xd8, 0xc3, 0x7e, 0x1c, 0x08, 0x1c, 0x91, 0xc2, 0x00, 0x06, 0x08, 0x1c, + 0xa8, 0xce, 0x64, 0xe1, 0x0f, 0xdc, 0xb9, 0xde, 0x0f, 0x04, 0x01, 0x3b, + 0x18, 0x45, 0x00, 0x2d, 0xc1, 0xbc, 0xea, 0x50, 0x0f, 0x0a, 0xc1, 0xbc, + 0xfc, 0xca, 0x0e, 0xbe, 0x0f, 0xbf, 0x80, 0x45, 0x01, 0xfd, 0xc1, 0xbd, + 0x08, 0xdc, 0x14, 0xa1, 0x01, 0x3d, 0xe9, 0xdb, 0x15, 0x7b, 0x01, 0x3c, + 0xa0, 0x03, 0xc1, 0xbd, 0x1a, 0x45, 0x1a, 0x38, 0xc1, 0xbd, 0x26, 0x0b, + 0xc1, 0xbd, 0x32, 0xc6, 0xa8, 0x2a, 0x01, 0x3a, 0x41, 0xda, 0x19, 0x94, + 0x0f, 0xb3, 0x88, 0x45, 0x20, 0x6c, 0xc1, 0xbd, 0x3e, 0x4e, 0x47, 0x15, + 0x41, 0xbd, 0x4a, 0x03, 0xc1, 0xbd, 0x56, 0x42, 0x00, 0x27, 0xc1, 0xbd, + 0x62, 0x43, 0x00, 0x4a, 0xc1, 0xbd, 0x6c, 0xd8, 0x21, 0x9b, 0x0f, 0xb3, + 0x98, 0x49, 0x0a, 0xe6, 0xc1, 0xbd, 0x78, 0xdf, 0x03, 0xa8, 0x01, 0x3c, + 0xf1, 0x4e, 0x22, 0x43, 0x41, 0xbd, 0x84, 0x44, 0x02, 0xc3, 0xc1, 0xbd, + 0x90, 0xc7, 0xc0, 0x74, 0x01, 0x38, 0xc0, 0x49, 0x2c, 0x46, 0xc1, 0xbd, + 0x9a, 0x51, 0x08, 0xa9, 0x41, 0xbd, 0xa0, 0x45, 0x3a, 0x0c, 0xc1, 0xbd, + 0xac, 0x42, 0x01, 0x7f, 0xc1, 0xbd, 0xb2, 0xc5, 0x02, 0xd2, 0x01, 0x5a, + 0xc2, 0x01, 0xbd, 0xbe, 0x46, 0x82, 0xba, 0xc1, 0xbd, 0xca, 0xcc, 0x30, + 0xf2, 0x01, 0x3c, 0xb9, 0x11, 0x41, 0xbd, 0xd0, 0xdc, 0x12, 0x8d, 0x01, + 0x3c, 0xe1, 0x44, 0x00, 0x2d, 0x41, 0xbd, 0xe2, 0xc9, 0x68, 0x55, 0x01, + 0x3c, 0xb1, 0xcf, 0x65, 0x58, 0x01, 0x38, 0xb0, 0xc7, 0x0b, 0x00, 0x01, + 0x39, 0x89, 0xd1, 0x36, 0x21, 0x0f, 0xb3, 0xa1, 0x51, 0x48, 0x5a, 0x41, + 0xbd, 0xf1, 0xd2, 0x4e, 0x65, 0x01, 0x39, 0x71, 0xd0, 0x5a, 0xc2, 0x01, + 0x38, 0xe1, 0xd4, 0x38, 0xb8, 0x01, 0x5a, 0xb0, 0xdb, 0x15, 0x2a, 0x01, + 0x39, 0x21, 0x44, 0x0d, 0x14, 0x41, 0xbe, 0x00, 0xd1, 0x56, 0x62, 0x01, + 0x37, 0xe0, 0xca, 0x95, 0xd0, 0x0f, 0xa4, 0xf9, 0x45, 0x00, 0x8c, 0xc1, + 0xbe, 0x0c, 0xc5, 0x07, 0x73, 0x0f, 0xd7, 0xb0, 0xa0, 0x0d, 0x87, 0xd1, + 0x9f, 0x0d, 0x87, 0xc9, 0x9e, 0x0d, 0x87, 0xc1, 0xa3, 0x0d, 0x87, 0xe9, + 0xa2, 0x0d, 0x87, 0xe1, 0xa1, 0x0d, 0x87, 0xd8, 0xa4, 0x0d, 0x87, 0xb9, + 0xa3, 0x0d, 0x87, 0xb1, 0xa2, 0x0d, 0x87, 0xa9, 0xa1, 0x0d, 0x87, 0xa1, + 0xa0, 0x0d, 0x87, 0x99, 0x9f, 0x0d, 0x87, 0x91, 0x9e, 0x0d, 0x87, 0x88, + 0xa1, 0x0d, 0x87, 0x81, 0xa0, 0x0d, 0x87, 0x79, 0x9f, 0x0d, 0x87, 0x71, + 0x9e, 0x0d, 0x87, 0x68, 0xa3, 0x0d, 0x88, 0x39, 0xa2, 0x0d, 0x88, 0x31, + 0xa1, 0x0d, 0x88, 0x29, 0xa0, 0x0d, 0x88, 0x21, 0x9f, 0x0d, 0x88, 0x19, + 0x9e, 0x0d, 0x88, 0x10, 0xa1, 0x0d, 0x88, 0x09, 0xa0, 0x0d, 0x88, 0x01, + 0x9f, 0x0d, 0x87, 0xf9, 0x9e, 0x0d, 0x87, 0xf0, 0x9e, 0x0d, 0x85, 0xd1, + 0xa5, 0x0d, 0x86, 0x09, 0xa4, 0x0d, 0x86, 0x01, 0xa3, 0x0d, 0x85, 0xf9, + 0xa2, 0x0d, 0x85, 0xf1, 0xa1, 0x0d, 0x85, 0xe9, 0xa0, 0x0d, 0x85, 0xe1, + 0x9f, 0x0d, 0x85, 0xd8, 0xa4, 0x0d, 0x85, 0xc9, 0xa3, 0x0d, 0x85, 0xc1, + 0xa2, 0x0d, 0x85, 0xb9, 0xa1, 0x0d, 0x85, 0xb1, 0xa0, 0x0d, 0x85, 0xa9, + 0x9f, 0x0d, 0x85, 0xa1, 0x9e, 0x0d, 0x85, 0x98, 0xa0, 0x0d, 0x85, 0x91, + 0x9f, 0x0d, 0x85, 0x89, 0x9e, 0x0d, 0x85, 0x80, 0xa4, 0x0d, 0x85, 0x79, + 0xa3, 0x0d, 0x85, 0x71, 0xa2, 0x0d, 0x85, 0x69, 0xa1, 0x0d, 0x85, 0x61, + 0xa0, 0x0d, 0x85, 0x59, 0x9f, 0x0d, 0x85, 0x51, 0x9e, 0x0d, 0x85, 0x48, + 0x9e, 0x0d, 0x84, 0xf3, 0x01, 0xbe, 0x1e, 0xa6, 0x0d, 0x85, 0x31, 0xa5, + 0x0d, 0x85, 0x29, 0xa4, 0x0d, 0x85, 0x21, 0xa3, 0x0d, 0x85, 0x19, 0xa2, + 0x0d, 0x85, 
0x11, 0xa1, 0x0d, 0x85, 0x09, 0xa0, 0x0d, 0x85, 0x01, 0x9f, + 0x0d, 0x84, 0xf8, 0xa2, 0x0d, 0x84, 0xe9, 0xa1, 0x0d, 0x84, 0xe1, 0xa0, + 0x0d, 0x84, 0xd9, 0x9f, 0x0d, 0x84, 0xd1, 0x9e, 0x0d, 0x84, 0xc8, 0xc2, + 0x00, 0xe8, 0x0d, 0x84, 0xc1, 0xa3, 0x0d, 0x84, 0xb9, 0xa2, 0x0d, 0x84, + 0xb1, 0xa1, 0x0d, 0x84, 0xa9, 0xa0, 0x0d, 0x84, 0xa1, 0x9f, 0x0d, 0x84, + 0x99, 0x9e, 0x0d, 0x84, 0x90, 0xa0, 0x0d, 0x84, 0x89, 0x9f, 0x0d, 0x84, + 0x81, 0x9e, 0x0d, 0x84, 0x78, 0xc2, 0x00, 0xac, 0x0d, 0x84, 0x71, 0xa4, + 0x0d, 0x84, 0x69, 0xa3, 0x0d, 0x84, 0x61, 0xa2, 0x0d, 0x84, 0x59, 0xa1, + 0x0d, 0x84, 0x51, 0xa0, 0x0d, 0x84, 0x49, 0x9f, 0x0d, 0x84, 0x41, 0x9e, + 0x0d, 0x84, 0x38, 0xa6, 0x0d, 0x84, 0x31, 0xa5, 0x0d, 0x84, 0x29, 0xa4, + 0x0d, 0x84, 0x21, 0xa3, 0x0d, 0x84, 0x19, 0xa2, 0x0d, 0x84, 0x11, 0xa1, + 0x0d, 0x84, 0x09, 0xa0, 0x0d, 0x84, 0x01, 0x9f, 0x0d, 0x83, 0xf9, 0x9e, + 0x0d, 0x83, 0xf0, 0x9f, 0x0d, 0x88, 0xf1, 0x9e, 0x0d, 0x88, 0xe8, 0xa0, + 0x0d, 0x81, 0xd1, 0x9f, 0x0d, 0x81, 0xc9, 0x9e, 0x0d, 0x81, 0xc1, 0xc2, + 0x06, 0x52, 0x0d, 0x81, 0xd8, 0xa3, 0x0d, 0x81, 0xb9, 0xa2, 0x0d, 0x81, + 0xb1, 0xa1, 0x0d, 0x81, 0xa9, 0xa0, 0x0d, 0x81, 0xa1, 0x9f, 0x0d, 0x81, + 0x99, 0x9e, 0x0d, 0x81, 0x90, 0xa4, 0x0d, 0x81, 0x89, 0xa3, 0x0d, 0x81, + 0x81, 0xa2, 0x0d, 0x81, 0x79, 0xa1, 0x0d, 0x81, 0x71, 0xa0, 0x0d, 0x81, + 0x69, 0x9f, 0x0d, 0x81, 0x61, 0x9e, 0x0d, 0x81, 0x58, 0xa5, 0x0d, 0x81, + 0x51, 0xa4, 0x0d, 0x81, 0x49, 0xa3, 0x0d, 0x81, 0x41, 0xa2, 0x0d, 0x81, + 0x39, 0xa1, 0x0d, 0x81, 0x31, 0xa0, 0x0d, 0x81, 0x29, 0x9f, 0x0d, 0x81, + 0x21, 0x9e, 0x0d, 0x81, 0x18, 0xc2, 0x00, 0x3c, 0x0d, 0x81, 0x11, 0x9e, + 0x0d, 0x80, 0xbb, 0x01, 0xbe, 0x26, 0xa6, 0x0d, 0x80, 0xf9, 0xa5, 0x0d, + 0x80, 0xf1, 0xa4, 0x0d, 0x80, 0xe9, 0xa3, 0x0d, 0x80, 0xe1, 0xa2, 0x0d, + 0x80, 0xd9, 0xa1, 0x0d, 0x80, 0xd1, 0xa0, 0x0d, 0x80, 0xc9, 0x9f, 0x0d, + 0x80, 0xc0, 0xa1, 0x0d, 0x88, 0xc9, 0xa0, 0x0d, 0x88, 0xc1, 0x9f, 0x0d, + 0x88, 0xb9, 0x9e, 0x0d, 0x88, 0xb1, 0xa2, 0x0d, 0x88, 0xd1, 0xa3, 0x0d, + 0x88, 0xd9, 0xa4, 0x0d, 0x88, 0xe0, 0xa1, 0x0d, 0x88, 0xa9, 0xa0, 0x0d, + 0x88, 0xa1, 0x9f, 0x0d, 0x88, 0x99, 0x9e, 0x0d, 0x88, 0x90, 0xa2, 0x0d, + 0x88, 0x89, 0xa1, 0x0d, 0x88, 0x81, 0xa0, 0x0d, 0x88, 0x79, 0x9f, 0x0d, + 0x88, 0x71, 0x9e, 0x0d, 0x88, 0x68, 0xa2, 0x0d, 0x88, 0x61, 0xa1, 0x0d, + 0x88, 0x59, 0xa0, 0x0d, 0x88, 0x51, 0x9f, 0x0d, 0x88, 0x49, 0x9e, 0x0d, + 0x88, 0x40, 0xc2, 0x42, 0xcd, 0x0d, 0x87, 0x11, 0xa2, 0x0d, 0x87, 0x09, + 0xa1, 0x0d, 0x87, 0x01, 0xa0, 0x0d, 0x86, 0xf9, 0x9f, 0x0d, 0x86, 0xf1, + 0x9e, 0x0d, 0x86, 0xe8, 0x9e, 0x0d, 0x87, 0x19, 0x9f, 0x0d, 0x87, 0x21, + 0xa0, 0x0d, 0x87, 0x29, 0xa1, 0x0d, 0x87, 0x30, 0x9e, 0x0d, 0x87, 0x39, + 0x9f, 0x0d, 0x87, 0x41, 0xa0, 0x0d, 0x87, 0x49, 0xa1, 0x0d, 0x87, 0x51, + 0xa2, 0x0d, 0x87, 0x59, 0xa3, 0x0d, 0x87, 0x60, 0xa2, 0x0d, 0x86, 0xd9, + 0xa1, 0x0d, 0x86, 0xd1, 0xa0, 0x0d, 0x86, 0xc9, 0x9f, 0x0d, 0x86, 0xc1, + 0x9e, 0x0d, 0x86, 0xb9, 0xa3, 0x0d, 0x86, 0xe0, 0xc2, 0x01, 0xc3, 0x0d, + 0x86, 0xb1, 0x9f, 0x0d, 0x86, 0xa9, 0x9e, 0x0d, 0x86, 0xa0, 0xa1, 0x0d, + 0x86, 0x99, 0xa0, 0x0d, 0x86, 0x91, 0x9f, 0x0d, 0x86, 0x89, 0x9e, 0x0d, + 0x86, 0x80, 0xa4, 0x0d, 0x86, 0x79, 0xa3, 0x0d, 0x86, 0x71, 0xa2, 0x0d, + 0x86, 0x69, 0xa1, 0x0d, 0x86, 0x61, 0xa0, 0x0d, 0x86, 0x59, 0x9f, 0x0d, + 0x86, 0x51, 0x9e, 0x0d, 0x86, 0x48, 0xa4, 0x0d, 0x86, 0x41, 0xa3, 0x0d, + 0x86, 0x39, 0xa2, 0x0d, 0x86, 0x31, 0xa1, 0x0d, 0x86, 0x29, 0xa0, 0x0d, + 0x86, 0x21, 0x9f, 0x0d, 0x86, 0x19, 0x9e, 0x0d, 0x86, 0x10, 0xc2, 0x00, + 0x39, 0x0d, 0x83, 0xe9, 0xa3, 0x0d, 0x83, 0xe1, 0xa2, 0x0d, 0x83, 0xd9, + 0xa1, 0x0d, 
0x83, 0xd1, 0xa0, 0x0d, 0x83, 0xc9, 0x9f, 0x0d, 0x83, 0xc1, + 0x9e, 0x0d, 0x83, 0xb8, 0xa6, 0x0d, 0x83, 0xb1, 0xa5, 0x0d, 0x83, 0xa9, + 0xa4, 0x0d, 0x83, 0xa1, 0xa3, 0x0d, 0x83, 0x99, 0xa2, 0x0d, 0x83, 0x91, + 0xa1, 0x0d, 0x83, 0x89, 0xa0, 0x0d, 0x83, 0x81, 0x9f, 0x0d, 0x83, 0x79, + 0x9e, 0x0d, 0x83, 0x70, 0x9f, 0x0d, 0x83, 0x19, 0x9e, 0x0d, 0x83, 0x11, + 0xa0, 0x0d, 0x83, 0x21, 0xa1, 0x0d, 0x83, 0x29, 0xa2, 0x0d, 0x83, 0x31, + 0xa3, 0x0d, 0x83, 0x39, 0xa4, 0x0d, 0x83, 0x40, 0xa1, 0x0d, 0x83, 0x09, + 0xa0, 0x0d, 0x83, 0x01, 0x9f, 0x0d, 0x82, 0xf9, 0x9e, 0x0d, 0x82, 0xf0, + 0x9e, 0x0d, 0x83, 0x49, 0x9f, 0x0d, 0x83, 0x51, 0xa0, 0x0d, 0x83, 0x59, + 0xa1, 0x0d, 0x83, 0x61, 0xc2, 0x00, 0xf1, 0x0d, 0x83, 0x68, 0xa4, 0x0d, + 0x82, 0xe9, 0xa3, 0x0d, 0x82, 0xe1, 0xa2, 0x0d, 0x82, 0xd9, 0xa1, 0x0d, + 0x82, 0xd1, 0xa0, 0x0d, 0x82, 0xc9, 0x9f, 0x0d, 0x82, 0xc1, 0x9e, 0x0d, + 0x82, 0xb8, 0xa2, 0x0d, 0x82, 0xb1, 0xa1, 0x0d, 0x82, 0xa9, 0xa0, 0x0d, + 0x82, 0xa1, 0x9f, 0x0d, 0x82, 0x99, 0x9e, 0x0d, 0x82, 0x90, 0xa5, 0x0d, + 0x82, 0x89, 0xa4, 0x0d, 0x82, 0x81, 0xa3, 0x0d, 0x82, 0x79, 0xa2, 0x0d, + 0x82, 0x71, 0xa1, 0x0d, 0x82, 0x69, 0xa0, 0x0d, 0x82, 0x61, 0x9f, 0x0d, + 0x82, 0x59, 0x9e, 0x0d, 0x82, 0x50, 0xa3, 0x0d, 0x82, 0x49, 0xa2, 0x0d, + 0x82, 0x41, 0xa1, 0x0d, 0x82, 0x39, 0xa0, 0x0d, 0x82, 0x31, 0x9f, 0x0d, + 0x82, 0x29, 0x9e, 0x0d, 0x82, 0x20, 0xa5, 0x0d, 0x82, 0x19, 0xa4, 0x0d, + 0x82, 0x11, 0xa3, 0x0d, 0x82, 0x09, 0xa2, 0x0d, 0x82, 0x01, 0xa1, 0x0d, + 0x81, 0xf9, 0xa0, 0x0d, 0x81, 0xf1, 0x9f, 0x0d, 0x81, 0xe9, 0x9e, 0x0d, + 0x81, 0xe0, 0xca, 0xa2, 0x7e, 0x07, 0xda, 0x79, 0x48, 0xb7, 0xf2, 0x41, + 0xbe, 0x2e, 0xc2, 0x00, 0x67, 0x00, 0x2f, 0x23, 0x01, 0xbe, 0x40, 0xc3, + 0xba, 0x37, 0x00, 0x2e, 0xdb, 0x01, 0xbe, 0x46, 0xc3, 0x0b, 0xc8, 0x00, + 0x2e, 0x8b, 0x01, 0xbe, 0x4c, 0xc3, 0x04, 0xac, 0x00, 0x2e, 0xab, 0x01, + 0xbe, 0x52, 0x16, 0xc1, 0xbe, 0x58, 0x15, 0xc1, 0xbe, 0x73, 0xc4, 0x5d, + 0xe2, 0x00, 0x2f, 0x43, 0x01, 0xbe, 0x85, 0xc3, 0xe5, 0x78, 0x00, 0x2f, + 0x3b, 0x01, 0xbe, 0x8b, 0x46, 0x26, 0xf7, 0xc1, 0xbe, 0x91, 0xc3, 0x20, + 0x18, 0x00, 0x2f, 0x03, 0x01, 0xbe, 0xb5, 0xc3, 0x00, 0x4e, 0x00, 0x2e, + 0xf3, 0x01, 0xbe, 0xbb, 0xc5, 0xa2, 0x83, 0x00, 0x2e, 0xe3, 0x01, 0xbe, + 0xc1, 0xc3, 0x4a, 0xb9, 0x00, 0x2e, 0xcb, 0x01, 0xbe, 0xc7, 0xc5, 0x4a, + 0xb3, 0x00, 0x2e, 0xb3, 0x01, 0xbe, 0xcd, 0xc2, 0x01, 0x7f, 0x00, 0x2e, + 0xa3, 0x01, 0xbe, 0xd3, 0xc5, 0x40, 0x9a, 0x00, 0x2e, 0x9b, 0x01, 0xbe, + 0xdd, 0xc5, 0x9c, 0xa2, 0x00, 0x2e, 0x93, 0x01, 0xbe, 0xe3, 0x03, 0xc1, + 0xbe, 0xe9, 0x45, 0x06, 0xa6, 0x41, 0xbe, 0xf3, 0xd4, 0x3d, 0xa4, 0x07, + 0xd8, 0xf1, 0x13, 0xc1, 0xbf, 0x23, 0x15, 0xc1, 0xbf, 0x32, 0xc4, 0xe4, + 0x8b, 0x00, 0x2d, 0xf9, 0xc5, 0xdb, 0x23, 0x00, 0x2d, 0xe9, 0xcf, 0x64, + 0xa4, 0x00, 0x2d, 0xe1, 0x0a, 0xc1, 0xbf, 0x42, 0xc5, 0x79, 0xbe, 0x00, + 0x2d, 0xb9, 0xc5, 0xd5, 0x7e, 0x00, 0x2d, 0xa8, 0x43, 0x09, 0x3b, 0xc1, + 0xbf, 0x57, 0xcb, 0x97, 0x7c, 0x00, 0x2e, 0x31, 0xc9, 0xae, 0xb2, 0x00, + 0x2e, 0x19, 0xc5, 0xd4, 0x16, 0x00, 0x2e, 0x01, 0xc5, 0xda, 0xa6, 0x00, + 0x2d, 0xf0, 0xc4, 0xe1, 0x23, 0x00, 0x2d, 0x71, 0x03, 0x41, 0xbf, 0x63, + 0xc3, 0x51, 0x3f, 0x00, 0x2d, 0x69, 0xc4, 0x40, 0xe8, 0x00, 0x2d, 0x38, + 0xcc, 0x89, 0x9d, 0x00, 0x2d, 0x51, 0xc3, 0x17, 0xc9, 0x00, 0x2c, 0xd0, + 0x07, 0xc1, 0xbf, 0x6f, 0xc5, 0xd5, 0x24, 0x00, 0x2c, 0xb0, 0xc3, 0x75, + 0x8b, 0x00, 0x2d, 0x41, 0xc9, 0xaf, 0xc9, 0x00, 0x2c, 0xf8, 0xc3, 0x15, + 0xe7, 0x00, 0x2d, 0x09, 0xc4, 0x56, 0x4f, 0x00, 0x2c, 0xc8, 0xc9, 0xb3, + 0xef, 0x00, 0x2c, 0x99, 0xc4, 0xa0, 0x89, 0x00, 0x2c, 0x90, 0xc3, 0x26, + 0x1a, 0x00, 
0x2c, 0xe3, 0x01, 0xbf, 0x7b, 0xc6, 0xcb, 0x63, 0x00, 0x2c, + 0xf0, 0xc4, 0xde, 0xbb, 0x00, 0x2d, 0x19, 0xc7, 0xc3, 0x6f, 0x00, 0x2d, + 0x21, 0xc5, 0xdd, 0x35, 0x00, 0x2d, 0x2a, 0x01, 0xbf, 0x81, 0x05, 0xc1, + 0xbf, 0x87, 0xcf, 0x61, 0xb6, 0x02, 0x6e, 0x09, 0x03, 0xc1, 0xbf, 0x99, + 0xc6, 0xd2, 0xb3, 0x02, 0x6f, 0x21, 0x19, 0xc1, 0xbf, 0xa3, 0xd6, 0x2d, + 0xa4, 0x02, 0x6f, 0x99, 0xcf, 0x67, 0x56, 0x02, 0x6f, 0xa9, 0xcb, 0x92, + 0x1d, 0x02, 0x6f, 0xc1, 0xcb, 0x90, 0x39, 0x02, 0x6f, 0xc8, 0xd9, 0x1f, + 0x95, 0x02, 0x6e, 0x11, 0xc8, 0xbb, 0xf2, 0x02, 0x6f, 0xd0, 0xc9, 0xae, + 0xc4, 0x02, 0x6f, 0x39, 0xc6, 0xcc, 0x17, 0x02, 0x6f, 0x41, 0xc9, 0xb1, + 0x94, 0x02, 0x6f, 0xa0, 0xc5, 0xd5, 0x79, 0x02, 0x6e, 0x29, 0xca, 0x9e, + 0x50, 0x02, 0x6e, 0x98, 0xc6, 0xd3, 0x37, 0x02, 0x6e, 0x41, 0xcd, 0x7f, + 0xdb, 0x02, 0x6f, 0xe8, 0x44, 0x3e, 0x62, 0xc1, 0xbf, 0xaf, 0xc3, 0x00, + 0x88, 0x02, 0x6e, 0xa8, 0xc3, 0x05, 0x9f, 0x02, 0x6e, 0xb9, 0xc4, 0x07, + 0xc8, 0x02, 0x6f, 0x00, 0xc6, 0xcc, 0xb9, 0x02, 0x6e, 0xc1, 0xc8, 0xba, + 0x5a, 0x02, 0x6f, 0xe0, 0xc7, 0x12, 0x48, 0x02, 0x6f, 0x29, 0xc7, 0x50, + 0x25, 0x02, 0x6f, 0x70, 0xa1, 0x0f, 0xdb, 0xc1, 0x9f, 0x0f, 0xdb, 0xb1, + 0xa0, 0x0f, 0xdb, 0xb9, 0xa2, 0x0f, 0xdb, 0xc9, 0xa3, 0x0f, 0xdb, 0xd1, + 0xa4, 0x0f, 0xdb, 0xd9, 0xc4, 0xe1, 0x7b, 0x0f, 0xdc, 0x08, 0x45, 0x04, + 0x90, 0xc1, 0xbf, 0xb9, 0xc2, 0x00, 0xb1, 0x01, 0x00, 0xa8, 0xa6, 0x01, + 0x1d, 0xe9, 0xa4, 0x01, 0x1d, 0xe1, 0xa0, 0x01, 0x1d, 0xd9, 0x9e, 0x01, + 0x1d, 0xd0, 0x42, 0x00, 0x03, 0xc1, 0xbf, 0xc5, 0xcc, 0x89, 0x55, 0x0f, + 0xb5, 0x28, 0xc6, 0xce, 0x1b, 0x0f, 0x9e, 0x39, 0xc4, 0x00, 0x87, 0x0f, + 0xa1, 0xa0, 0xcb, 0x93, 0x0f, 0x0f, 0x9f, 0x09, 0xc8, 0x37, 0x8f, 0x0f, + 0x9f, 0x02, 0x01, 0xbf, 0xd4, 0xc4, 0xce, 0x15, 0x01, 0x34, 0x91, 0xc6, + 0xca, 0xb5, 0x01, 0x31, 0x69, 0xc6, 0xcf, 0x6b, 0x0f, 0xb7, 0x00, 0xc2, + 0x02, 0xa7, 0x0f, 0xc9, 0xf1, 0x89, 0x0f, 0xa2, 0xe0, 0xda, 0x1a, 0xb2, + 0x0f, 0xc8, 0xf1, 0xd8, 0x23, 0x7b, 0x0f, 0xd7, 0x80, 0xc4, 0x26, 0x78, + 0x08, 0x69, 0xc9, 0xc5, 0x06, 0xdb, 0x08, 0x69, 0xc1, 0x15, 0xc1, 0xbf, + 0xd8, 0x08, 0xc1, 0xbf, 0xe4, 0x16, 0xc1, 0xbf, 0xf0, 0xc3, 0x05, 0x14, + 0x08, 0x69, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0x69, 0x80, 0x42, 0x01, 0x6f, + 0xc1, 0xbf, 0xfc, 0xc8, 0xbe, 0xea, 0x08, 0x69, 0x20, 0xc9, 0xaa, 0xb0, + 0x08, 0x69, 0x19, 0xc5, 0xd9, 0xe8, 0x08, 0x69, 0x10, 0x91, 0x08, 0x69, + 0x09, 0x87, 0x08, 0x69, 0x01, 0x97, 0x08, 0x68, 0xf9, 0x8b, 0x08, 0x68, + 0xf1, 0x83, 0x08, 0x68, 0xe8, 0xc2, 0x02, 0x41, 0x08, 0x68, 0xe1, 0x10, + 0xc1, 0xc0, 0x0e, 0x0d, 0xc1, 0xc0, 0x1e, 0xc2, 0x19, 0x2c, 0x08, 0x68, + 0xc1, 0xc2, 0x01, 0x4a, 0x08, 0x68, 0xb1, 0xc2, 0x01, 0xc3, 0x08, 0x68, + 0xa1, 0xc2, 0x00, 0xdb, 0x08, 0x68, 0x99, 0xc2, 0x01, 0x30, 0x08, 0x68, + 0x91, 0x14, 0xc1, 0xc0, 0x2e, 0x06, 0xc1, 0xc0, 0x38, 0xc2, 0x00, 0x87, + 0x08, 0x68, 0x49, 0xc2, 0x00, 0xd0, 0x08, 0x68, 0x39, 0xc2, 0x00, 0x64, + 0x08, 0x68, 0x31, 0xc2, 0x25, 0x3b, 0x08, 0x68, 0x29, 0x16, 0xc1, 0xc0, + 0x42, 0x83, 0x08, 0x68, 0x01, 0xc2, 0x01, 0x5d, 0x08, 0x68, 0x09, 0xc2, + 0x00, 0xb0, 0x08, 0x68, 0x11, 0xc2, 0x02, 0x1c, 0x08, 0x68, 0x71, 0x15, + 0x41, 0xc0, 0x4c, 0x97, 0x00, 0xb9, 0x99, 0x8b, 0x00, 0xb9, 0x90, 0xc2, + 0x00, 0xd0, 0x00, 0xb9, 0x89, 0xc2, 0x0d, 0xf6, 0x00, 0xb9, 0x81, 0xc2, + 0x01, 0x4a, 0x00, 0xb9, 0x79, 0xc2, 0x00, 0xdb, 0x00, 0xb9, 0x71, 0xc2, + 0x00, 0x39, 0x00, 0xb9, 0x69, 0xc2, 0x19, 0x2c, 0x00, 0xb9, 0x61, 0xc2, + 0x01, 0xc3, 0x00, 0xb9, 0x59, 0xc2, 0x01, 0x5d, 0x00, 0xb9, 0x51, 0xc2, + 0x00, 0xb0, 0x00, 0xb9, 0x49, 0x10, 0xc1, 0xc0, 0x56, 0xc2, 0x0e, 0x9a, + 0x00, 0xb9, 
0x39, 0xc2, 0x01, 0x6f, 0x00, 0xb9, 0x31, 0xc2, 0x01, 0x30, + 0x00, 0xb9, 0x21, 0xc2, 0x02, 0x2b, 0x00, 0xb9, 0x19, 0x97, 0x00, 0xb9, + 0x11, 0x8b, 0x00, 0xb9, 0x09, 0x83, 0x00, 0xb9, 0x00, 0x49, 0xb0, 0x7d, + 0xc1, 0xc0, 0x60, 0x0c, 0xc1, 0xc0, 0xad, 0xd4, 0x3a, 0x5c, 0x01, 0x81, + 0x71, 0xd4, 0x3a, 0x34, 0x01, 0x81, 0x79, 0x47, 0x02, 0x0e, 0xc1, 0xc0, + 0xb9, 0xc6, 0x92, 0x0c, 0x01, 0x8b, 0x20, 0xc3, 0x05, 0x14, 0x01, 0x81, + 0x09, 0x16, 0xc1, 0xc1, 0x16, 0x08, 0xc1, 0xc1, 0x24, 0x15, 0xc1, 0xc1, + 0x30, 0xc5, 0x06, 0xdb, 0x01, 0x81, 0x41, 0xc4, 0x26, 0x78, 0x01, 0x81, + 0x48, 0xc3, 0x05, 0x14, 0x08, 0x47, 0xdb, 0x01, 0xc1, 0x3c, 0x16, 0xc1, + 0xc1, 0x42, 0xc4, 0x0d, 0x13, 0x08, 0x47, 0xe0, 0x16, 0xc1, 0xc1, 0x4e, + 0x15, 0xc1, 0xc1, 0x5a, 0xc4, 0xb9, 0x7e, 0x08, 0x47, 0x91, 0xc2, 0x00, + 0x67, 0x08, 0x47, 0x81, 0x03, 0xc1, 0xc1, 0x64, 0xc3, 0x20, 0x18, 0x08, + 0x47, 0x69, 0xc3, 0x00, 0x4e, 0x08, 0x47, 0x61, 0xc6, 0xcf, 0xd7, 0x08, + 0x47, 0x59, 0xc4, 0xe0, 0xe7, 0x08, 0x47, 0x51, 0xc4, 0x4a, 0xb9, 0x08, + 0x47, 0x49, 0xc2, 0x01, 0x7f, 0x08, 0x47, 0x23, 0x01, 0xc1, 0x70, 0xc4, + 0xdf, 0x07, 0x08, 0x47, 0x31, 0xc3, 0x7e, 0x89, 0x08, 0x47, 0x29, 0xcb, + 0x95, 0x8d, 0x08, 0x47, 0x19, 0xc5, 0x9c, 0xa2, 0x08, 0x47, 0x11, 0xc4, + 0xe3, 0x27, 0x08, 0x47, 0x08, 0xca, 0x3b, 0x06, 0x07, 0xfb, 0x29, 0x47, + 0x02, 0x0e, 0xc1, 0xc1, 0x76, 0xd1, 0x2f, 0xfb, 0x07, 0xfc, 0xf1, 0xd6, + 0x2f, 0xf6, 0x07, 0xfc, 0xf8, 0x0d, 0xc1, 0xc1, 0xb1, 0x15, 0xc1, 0xc1, + 0xc0, 0xc5, 0xd6, 0x8c, 0x07, 0xfd, 0x4b, 0x01, 0xc1, 0xcc, 0xc5, 0xda, + 0xe7, 0x07, 0xfd, 0x89, 0x12, 0xc1, 0xc1, 0xd0, 0x8b, 0x07, 0xfe, 0xe3, + 0x01, 0xc1, 0xdf, 0x05, 0xc1, 0xc1, 0xe5, 0x16, 0xc1, 0xc1, 0xf1, 0xc5, + 0x90, 0xe4, 0x07, 0xfd, 0xf1, 0x83, 0x07, 0xfe, 0x13, 0x01, 0xc1, 0xfd, + 0x1b, 0xc1, 0xc2, 0x01, 0x87, 0x07, 0xfe, 0x3b, 0x01, 0xc2, 0x1b, 0x91, + 0x07, 0xfe, 0x63, 0x01, 0xc2, 0x23, 0x19, 0xc1, 0xc2, 0x27, 0x97, 0x07, + 0xfe, 0x99, 0xc5, 0xd9, 0x61, 0x07, 0xfd, 0x22, 0x01, 0xc2, 0x39, 0xd1, + 0x4e, 0xd0, 0x0f, 0xb4, 0x28, 0x47, 0x78, 0xc0, 0x41, 0xc2, 0x3d, 0x45, + 0x03, 0x14, 0xc1, 0xc2, 0x49, 0x83, 0x01, 0x82, 0xa9, 0x8b, 0x01, 0x82, + 0xb9, 0x97, 0x01, 0x82, 0xc9, 0x87, 0x01, 0x82, 0xd9, 0x91, 0x01, 0x82, + 0xe8, 0x83, 0x01, 0x82, 0x59, 0x8b, 0x01, 0x82, 0x69, 0x97, 0x01, 0x82, + 0x79, 0x87, 0x01, 0x82, 0x89, 0x91, 0x01, 0x82, 0x98, 0x83, 0x01, 0x82, + 0x61, 0x8b, 0x01, 0x82, 0x71, 0x97, 0x01, 0x82, 0x81, 0x87, 0x01, 0x82, + 0x91, 0x91, 0x01, 0x82, 0xa0, 0x83, 0x01, 0x82, 0xb1, 0x8b, 0x01, 0x82, + 0xc1, 0x97, 0x01, 0x82, 0xd1, 0x87, 0x01, 0x82, 0xe1, 0x91, 0x01, 0x82, + 0xf0, 0x83, 0x01, 0x82, 0xf9, 0x8b, 0x01, 0x83, 0x09, 0x97, 0x01, 0x83, + 0x21, 0x87, 0x01, 0x83, 0x31, 0x91, 0x01, 0x83, 0x40, 0x83, 0x01, 0x83, + 0x01, 0x8b, 0x01, 0x83, 0x11, 0x97, 0x01, 0x83, 0x29, 0x87, 0x01, 0x83, + 0x39, 0x91, 0x01, 0x83, 0x48, 0x83, 0x01, 0x83, 0x51, 0x8b, 0x01, 0x83, + 0x59, 0x97, 0x01, 0x83, 0x61, 0x87, 0x01, 0x83, 0x69, 0x91, 0x01, 0x83, + 0x70, 0x83, 0x01, 0x83, 0x79, 0x8b, 0x01, 0x83, 0x91, 0x97, 0x01, 0x83, + 0xa9, 0x87, 0x01, 0x83, 0xc1, 0x91, 0x01, 0x83, 0xd8, 0x83, 0x01, 0x83, + 0x81, 0x8b, 0x01, 0x83, 0x99, 0x97, 0x01, 0x83, 0xb1, 0x87, 0x01, 0x83, + 0xc9, 0x91, 0x01, 0x83, 0xe0, 0x83, 0x01, 0x83, 0x89, 0x8b, 0x01, 0x83, + 0xa1, 0x97, 0x01, 0x83, 0xb9, 0x87, 0x01, 0x83, 0xd1, 0x91, 0x01, 0x83, + 0xe8, 0x83, 0x01, 0x83, 0xf1, 0x8b, 0x01, 0x83, 0xf9, 0x97, 0x01, 0x84, + 0x01, 0x87, 0x01, 0x84, 0x09, 0x91, 0x01, 0x84, 0x10, 0x83, 0x01, 0x84, + 0x21, 0x97, 0x01, 0x84, 0x31, 0x91, 0x01, 0x84, 0x40, 0x83, 0x01, 0x84, + 0x49, 0x8b, 
0x01, 0x84, 0x51, 0x97, 0x01, 0x84, 0x59, 0x87, 0x01, 0x84, + 0x61, 0x91, 0x01, 0x84, 0x68, 0x83, 0x01, 0x84, 0x79, 0x8b, 0x01, 0x84, + 0x81, 0x87, 0x01, 0x84, 0x89, 0x91, 0x01, 0x84, 0x90, 0xc6, 0x1c, 0xb4, + 0x01, 0x02, 0x19, 0xce, 0x6b, 0x17, 0x01, 0x70, 0xd0, 0x45, 0x6b, 0x02, + 0xc1, 0xc2, 0x6f, 0xcc, 0x0d, 0x9e, 0x01, 0x2e, 0xc9, 0xc6, 0x1c, 0xb4, + 0x01, 0x2e, 0xc1, 0xcc, 0x01, 0xdb, 0x0f, 0xdc, 0x81, 0x42, 0x00, 0x58, + 0x41, 0xc2, 0x7b, 0xc9, 0x16, 0x2f, 0x01, 0x37, 0x39, 0x0e, 0xc1, 0xc2, + 0x81, 0xc8, 0xb5, 0x82, 0x01, 0x09, 0x39, 0xc8, 0xb9, 0x82, 0x01, 0x02, + 0xa1, 0xd0, 0x0f, 0x09, 0x00, 0x05, 0x09, 0xcd, 0x2c, 0xb2, 0x00, 0x05, + 0xf9, 0xcb, 0x10, 0xc9, 0x01, 0x70, 0xc0, 0xda, 0x1b, 0xb6, 0x01, 0x35, + 0x21, 0x51, 0x55, 0xda, 0x41, 0xc2, 0x90, 0x00, 0x41, 0xc2, 0xa2, 0xc9, + 0x57, 0x36, 0x01, 0x1d, 0x71, 0x45, 0x00, 0x8c, 0xc1, 0xc2, 0xb4, 0x03, + 0x41, 0xc2, 0xd8, 0x47, 0x34, 0x2f, 0xc1, 0xc2, 0xe4, 0x47, 0x02, 0x0e, + 0x41, 0xc2, 0xf7, 0x47, 0x34, 0x2f, 0xc1, 0xc3, 0x50, 0x47, 0x02, 0x0e, + 0x41, 0xc3, 0x63, 0xc5, 0x53, 0x93, 0x01, 0x09, 0xc9, 0x49, 0x1b, 0x0b, + 0x41, 0xc3, 0xc6, 0xd1, 0x31, 0xb3, 0x0f, 0xae, 0xd1, 0xc4, 0x05, 0x4b, + 0x01, 0x4f, 0x08, 0xd3, 0x41, 0x4b, 0x0f, 0x65, 0xa1, 0x47, 0x34, 0x2f, + 0xc1, 0xc3, 0xd6, 0xca, 0xa6, 0xc0, 0x0f, 0x65, 0x81, 0x49, 0x53, 0xa9, + 0xc1, 0xc4, 0x1b, 0xcb, 0x5f, 0x92, 0x0f, 0x65, 0x61, 0xc9, 0x41, 0x55, + 0x0f, 0x65, 0x00, 0xd5, 0x36, 0x08, 0x01, 0x4f, 0x28, 0x08, 0xc1, 0xc4, + 0x27, 0x16, 0xc1, 0xc4, 0x33, 0xc3, 0x05, 0x14, 0x0e, 0x9b, 0x90, 0xda, + 0x1b, 0x00, 0x01, 0x81, 0xb9, 0x4b, 0x19, 0xd1, 0x41, 0xc4, 0x3f, 0x48, + 0x0a, 0x53, 0xc1, 0xc4, 0x6f, 0x49, 0xb0, 0xb3, 0xc1, 0xc4, 0x7b, 0xcd, + 0x7e, 0x2e, 0x01, 0x7f, 0xa1, 0x4e, 0x71, 0xbc, 0xc1, 0xc4, 0x87, 0xc8, + 0x02, 0xf5, 0x01, 0x7f, 0xd8, 0xc7, 0xc2, 0x88, 0x01, 0x8c, 0x99, 0x0a, + 0xc1, 0xc4, 0x9d, 0xc7, 0xc5, 0xf3, 0x01, 0x8c, 0xb0, 0x43, 0x09, 0x9e, + 0xc1, 0xc4, 0xa9, 0xc9, 0xac, 0x2a, 0x01, 0x8c, 0xc8, 0xca, 0x9e, 0xfa, + 0x01, 0x8c, 0xb9, 0xc7, 0xc7, 0xa5, 0x01, 0x8c, 0xf8, 0x16, 0xc1, 0xc4, + 0xb5, 0xc3, 0x05, 0x14, 0x08, 0x42, 0xc2, 0x01, 0xc4, 0xc8, 0x16, 0xc1, + 0xc4, 0xcc, 0x15, 0xc1, 0xc4, 0xd8, 0x03, 0xc1, 0xc4, 0xe2, 0xc3, 0x20, + 0x18, 0x08, 0x42, 0x69, 0xc3, 0x00, 0x4e, 0x08, 0x42, 0x61, 0xc6, 0xcf, + 0xd7, 0x08, 0x42, 0x59, 0xc4, 0xe0, 0xe7, 0x08, 0x42, 0x51, 0xc4, 0x4a, + 0xb9, 0x08, 0x42, 0x49, 0xc2, 0x01, 0x7f, 0x08, 0x42, 0x23, 0x01, 0xc4, + 0xee, 0xc5, 0x4a, 0xb3, 0x08, 0x42, 0x31, 0xc3, 0x7e, 0x89, 0x08, 0x42, + 0x29, 0xc6, 0x40, 0x9a, 0x08, 0x42, 0x19, 0xc5, 0x9c, 0xa2, 0x08, 0x42, + 0x11, 0xc4, 0xe3, 0x27, 0x08, 0x42, 0x09, 0xc2, 0x00, 0x67, 0x08, 0x42, + 0x81, 0xc4, 0xb9, 0x7e, 0x08, 0x42, 0x91, 0xc4, 0x5d, 0xe2, 0x08, 0x42, + 0x98, 0xc7, 0xc9, 0x0a, 0x0f, 0xa2, 0xd1, 0xc3, 0x1c, 0xe4, 0x0f, 0xa2, + 0x91, 0xc6, 0xa8, 0xc4, 0x0f, 0xa3, 0x09, 0xc5, 0xd4, 0xf7, 0x0f, 0xa3, + 0x10, 0x45, 0xa6, 0x50, 0xc1, 0xc4, 0xf4, 0xc5, 0x02, 0xd2, 0x01, 0x2e, + 0x5b, 0x01, 0xc5, 0x2b, 0xd4, 0x3a, 0x0c, 0x01, 0x3f, 0x0b, 0x01, 0xc5, + 0x2f, 0xc8, 0xb8, 0x3a, 0x01, 0x33, 0x38, 0x07, 0xc1, 0xc5, 0x35, 0xd5, + 0x31, 0xc4, 0x0f, 0xad, 0x59, 0x11, 0x41, 0xc5, 0x3f, 0xca, 0x9d, 0x2e, + 0x0f, 0xc5, 0x69, 0xc3, 0x05, 0x14, 0x0f, 0xc5, 0x60, 0xc5, 0x0b, 0x0a, + 0x01, 0x2d, 0x0b, 0x01, 0xc5, 0x4b, 0xc7, 0x37, 0x27, 0x01, 0x38, 0x21, + 0xc9, 0xb0, 0x1a, 0x01, 0x33, 0x21, 0xc2, 0x05, 0x1d, 0x0f, 0x99, 0x1b, + 0x01, 0xc5, 0x4f, 0x0f, 0xc1, 0xc5, 0x53, 0xca, 0x50, 0x80, 0x01, 0x30, + 0xb1, 0xc3, 0x0e, 0x6b, 0x01, 0x30, 0x31, 0xc9, 0xb3, 0x83, 0x07, 0xf2, + 0x30, 0x03, 
0xc1, 0xc5, 0x5f, 0x43, 0x00, 0x4a, 0xc1, 0xc5, 0x6b, 0x45, + 0x0a, 0xe1, 0x41, 0xc5, 0x75, 0xc6, 0x3a, 0x1a, 0x01, 0x2e, 0x3b, 0x01, + 0xc5, 0x7b, 0x48, 0xbe, 0x32, 0xc1, 0xc5, 0x7f, 0x43, 0x01, 0x47, 0x41, + 0xc5, 0x8b, 0x14, 0xc1, 0xc5, 0x97, 0xd7, 0x28, 0x5a, 0x01, 0x36, 0xb9, + 0xc8, 0x36, 0xb4, 0x01, 0x30, 0x79, 0xd2, 0x49, 0xaf, 0x0f, 0xab, 0xf0, + 0x0e, 0xc1, 0xc5, 0xa3, 0x4c, 0x0e, 0x55, 0xc1, 0xc5, 0xb0, 0xcc, 0x7d, + 0x5f, 0x01, 0x31, 0xc8, 0x44, 0x00, 0x2d, 0xc1, 0xc5, 0xbc, 0xc8, 0x46, + 0x71, 0x01, 0x2d, 0x68, 0x4a, 0x03, 0x3d, 0xc1, 0xc5, 0xc8, 0x4a, 0x01, + 0xa9, 0x41, 0xc5, 0xd4, 0x46, 0x01, 0xdc, 0xc1, 0xc5, 0xe9, 0xca, 0x9c, + 0x2a, 0x01, 0x5e, 0xe8, 0xcc, 0x88, 0x59, 0x01, 0x2d, 0x89, 0x42, 0x00, + 0xc4, 0x41, 0xc5, 0xf9, 0x46, 0x05, 0x87, 0xc1, 0xc6, 0x05, 0xce, 0x51, + 0x6a, 0x01, 0x58, 0xf0, 0xd5, 0x35, 0xde, 0x0f, 0xc4, 0x39, 0xd0, 0x35, + 0xe3, 0x0f, 0xc3, 0xf9, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, 0x39, 0xca, 0x35, + 0xe9, 0x0f, 0xc3, 0x79, 0xd1, 0x50, 0x46, 0x0f, 0xc3, 0xb8, 0xd5, 0x35, + 0xde, 0x0f, 0xc4, 0x31, 0xd1, 0x50, 0x46, 0x0f, 0xc3, 0xb1, 0xca, 0x35, + 0xe9, 0x0f, 0xc3, 0x71, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, 0x31, 0xd0, 0x35, + 0xe3, 0x0f, 0xc3, 0xf0, 0xd5, 0x35, 0xde, 0x0f, 0xc4, 0x29, 0xd1, 0x50, + 0x46, 0x0f, 0xc3, 0xa9, 0xca, 0x35, 0xe9, 0x0f, 0xc3, 0x69, 0xd0, 0x5c, + 0x32, 0x0f, 0xc3, 0x29, 0xd0, 0x35, 0xe3, 0x0f, 0xc3, 0xe8, 0xd5, 0x35, + 0xde, 0x0f, 0xc4, 0x21, 0xd1, 0x50, 0x46, 0x0f, 0xc3, 0xa1, 0xca, 0x35, + 0xe9, 0x0f, 0xc3, 0x61, 0xd0, 0x5c, 0x32, 0x0f, 0xc3, 0x21, 0xd0, 0x35, + 0xe3, 0x0f, 0xc3, 0xe0, 0xc5, 0xdc, 0xfe, 0x0f, 0x9c, 0x81, 0xcc, 0x87, + 0x15, 0x0f, 0x99, 0x60, 0xc6, 0xcc, 0x83, 0x0f, 0xb5, 0xf1, 0xc4, 0x51, + 0xb7, 0x0f, 0x98, 0x51, 0xc7, 0xc5, 0x75, 0x0f, 0xa0, 0x19, 0xc4, 0xe3, + 0xcf, 0x0f, 0xc9, 0xe8, 0xc4, 0x26, 0x78, 0x0f, 0x17, 0xc9, 0xc5, 0x06, + 0xdb, 0x0f, 0x17, 0xc1, 0x15, 0xc1, 0xc6, 0x17, 0x08, 0xc1, 0xc6, 0x23, + 0x16, 0xc1, 0xc6, 0x2f, 0xc3, 0x05, 0x14, 0x0f, 0x17, 0x89, 0xc4, 0x15, + 0xe7, 0x0f, 0x17, 0x80, 0xc3, 0xd8, 0x41, 0x0f, 0x17, 0x73, 0x01, 0xc6, + 0x3b, 0xc3, 0x12, 0xe0, 0x0f, 0x17, 0x62, 0x01, 0xc6, 0x41, 0x1b, 0xc1, + 0xc6, 0x47, 0x97, 0x0f, 0x16, 0xf3, 0x01, 0xc6, 0x51, 0x10, 0xc1, 0xc6, + 0x57, 0x83, 0x0f, 0x16, 0x0b, 0x01, 0xc6, 0x67, 0x87, 0x0f, 0x16, 0xdb, + 0x01, 0xc6, 0x78, 0x91, 0x0f, 0x16, 0xab, 0x01, 0xc6, 0x7c, 0x8b, 0x0f, + 0x16, 0xe3, 0x01, 0xc6, 0x83, 0x16, 0xc1, 0xc6, 0x89, 0x0e, 0xc1, 0xc6, + 0x9f, 0xc2, 0x00, 0xd0, 0x0f, 0x16, 0xd1, 0x0d, 0xc1, 0xc6, 0xa9, 0xc2, + 0x01, 0xc3, 0x0f, 0x16, 0xc1, 0xc2, 0x00, 0x39, 0x0f, 0x16, 0xb9, 0xc2, + 0x02, 0x41, 0x0f, 0x16, 0x99, 0xc2, 0x01, 0x4a, 0x0f, 0x16, 0x91, 0xc2, + 0x02, 0x1c, 0x0f, 0x16, 0x89, 0xc2, 0x25, 0x3b, 0x0f, 0x16, 0x81, 0x15, + 0xc1, 0xc6, 0xb3, 0xc2, 0x00, 0x87, 0x0f, 0x16, 0x69, 0x12, 0xc1, 0xc6, + 0xbd, 0xc2, 0x01, 0x30, 0x0f, 0x16, 0x29, 0xc2, 0x0e, 0x9a, 0x0f, 0x16, + 0x21, 0xc2, 0x00, 0x64, 0x0f, 0x16, 0x19, 0xc2, 0x01, 0x5d, 0x0f, 0x16, + 0x10, 0xc6, 0x2a, 0xfe, 0x08, 0xc7, 0x91, 0xc6, 0xcf, 0x9b, 0x08, 0xc7, + 0x89, 0x15, 0xc1, 0xc6, 0xc7, 0x08, 0xc1, 0xc6, 0xd3, 0x16, 0x41, 0xc6, + 0xdf, 0xc4, 0x26, 0x78, 0x08, 0xc7, 0x49, 0xc5, 0x06, 0xdb, 0x08, 0xc7, + 0x41, 0x15, 0xc1, 0xc6, 0xf1, 0x08, 0xc1, 0xc6, 0xfd, 0x16, 0xc1, 0xc7, + 0x09, 0xc3, 0x05, 0x14, 0x08, 0xc7, 0x09, 0xc4, 0x15, 0xe7, 0x08, 0xc7, + 0x00, 0xc4, 0xdf, 0x7f, 0x08, 0xc6, 0xf9, 0x15, 0xc1, 0xc7, 0x15, 0x0a, + 0xc1, 0xc7, 0x21, 0xc2, 0x05, 0x1c, 0x08, 0xc6, 0xc1, 0xc2, 0x02, 0xaa, + 0x08, 0xc6, 0xb9, 0x83, 0x08, 0xc6, 0x0b, 0x01, 0xc7, 0x31, 0xc2, 0x0e, + 0x9a, 0x08, 
0xc6, 0xa1, 0x10, 0xc1, 0xc7, 0x3f, 0xc3, 0x02, 0x10, 0x08, + 0xc6, 0x91, 0x91, 0x08, 0xc6, 0x4b, 0x01, 0xc7, 0x4b, 0x87, 0x08, 0xc6, + 0x43, 0x01, 0xc7, 0x51, 0x17, 0xc1, 0xc7, 0x55, 0x1b, 0xc1, 0xc7, 0x5d, + 0xc2, 0x00, 0xe8, 0x08, 0xc6, 0x61, 0xc2, 0x01, 0x30, 0x08, 0xc6, 0x59, + 0xc2, 0x25, 0x9f, 0x08, 0xc6, 0x31, 0xc2, 0x00, 0x8c, 0x08, 0xc6, 0x10, + 0xc4, 0xdf, 0x7f, 0x08, 0xc5, 0xf9, 0x15, 0xc1, 0xc7, 0x6c, 0x0a, 0xc1, + 0xc7, 0x78, 0xc2, 0x05, 0x1c, 0x08, 0xc5, 0xc1, 0xc2, 0x02, 0xaa, 0x08, + 0xc5, 0xb9, 0x83, 0x08, 0xc5, 0x0b, 0x01, 0xc7, 0x88, 0xc2, 0x0e, 0x9a, + 0x08, 0xc5, 0xa1, 0x10, 0xc1, 0xc7, 0x96, 0xc3, 0x02, 0x10, 0x08, 0xc5, + 0x91, 0x91, 0x08, 0xc5, 0x4b, 0x01, 0xc7, 0xa2, 0x87, 0x08, 0xc5, 0x43, + 0x01, 0xc7, 0xa8, 0x17, 0xc1, 0xc7, 0xac, 0x1b, 0xc1, 0xc7, 0xb4, 0xc2, + 0x00, 0xe8, 0x08, 0xc5, 0x61, 0xc2, 0x01, 0x30, 0x08, 0xc5, 0x59, 0xc2, + 0x25, 0x9f, 0x08, 0xc5, 0x31, 0xc2, 0x00, 0x8c, 0x08, 0xc5, 0x10, 0xc3, + 0x02, 0x6e, 0x01, 0x18, 0x39, 0xc7, 0x80, 0x2f, 0x07, 0xf2, 0x78, 0xc5, + 0x00, 0x2c, 0x01, 0x49, 0x99, 0xc4, 0x00, 0x49, 0x01, 0x59, 0xf8, 0xcf, + 0x1b, 0x25, 0x01, 0x02, 0xa9, 0xcc, 0x8c, 0x19, 0x0f, 0x9d, 0xa0, 0x05, + 0xc1, 0xc7, 0xc3, 0xd7, 0x15, 0x2e, 0x01, 0x39, 0x19, 0xd8, 0x21, 0x23, + 0x01, 0x39, 0x11, 0x44, 0x05, 0x18, 0xc1, 0xc7, 0xcf, 0xcb, 0x8d, 0xdc, + 0x0f, 0x9a, 0x01, 0xd2, 0x22, 0x49, 0x0f, 0xbe, 0x30, 0xcb, 0x93, 0x5c, + 0x0f, 0x9b, 0xe8, 0x00, 0xc1, 0xc7, 0xdb, 0xc9, 0xab, 0xd9, 0x0f, 0xb1, + 0xb0, 0xd7, 0x29, 0x6e, 0x0f, 0xb0, 0x59, 0xd0, 0x59, 0x32, 0x0f, 0xb1, + 0x88, 0xdf, 0x0d, 0x9b, 0x01, 0x36, 0xf1, 0x49, 0x0d, 0x20, 0x41, 0xc8, + 0x24, 0xe0, 0x06, 0x87, 0x01, 0x3d, 0x60, 0xc9, 0xb2, 0xa2, 0x0f, 0x98, + 0xe9, 0xc6, 0x00, 0x91, 0x0f, 0x98, 0xa8, 0xca, 0x5d, 0xa2, 0x07, 0xf8, + 0x19, 0xc7, 0x68, 0xc6, 0x07, 0xff, 0x10, 0xc7, 0x0b, 0x00, 0x07, 0xf8, + 0x51, 0xc8, 0x36, 0x21, 0x07, 0xf8, 0x31, 0xc9, 0x2d, 0x85, 0x07, 0xf8, + 0x38, 0x45, 0x09, 0x98, 0xc1, 0xc8, 0x30, 0xca, 0x99, 0x61, 0x07, 0xf8, + 0x20, 0x11, 0xc1, 0xc8, 0x54, 0xd0, 0x08, 0xf7, 0x07, 0xf9, 0xf1, 0xc8, + 0x8e, 0x16, 0x07, 0xff, 0x00, 0xc8, 0x52, 0x00, 0x07, 0xf8, 0xd9, 0xc6, + 0x27, 0x5e, 0x07, 0xf8, 0x78, 0x07, 0xc1, 0xc8, 0x60, 0x45, 0x0b, 0x12, + 0xc1, 0xc8, 0x6c, 0xc7, 0x80, 0x2f, 0x07, 0xf9, 0xf8, 0xca, 0x0e, 0xbe, + 0x07, 0xf8, 0xe9, 0xcf, 0x0f, 0x0a, 0x07, 0xf8, 0x08, 0xcf, 0x54, 0xbb, + 0x07, 0xf8, 0xf1, 0xca, 0x1f, 0x0e, 0x07, 0xfa, 0x00, 0xcb, 0x2c, 0xb4, + 0x07, 0xf8, 0xf9, 0xcc, 0x01, 0xbb, 0x07, 0xf8, 0x10, 0xce, 0x61, 0xd5, + 0x07, 0xf9, 0xe1, 0x45, 0x00, 0x2d, 0x41, 0xc8, 0x78, 0xc9, 0x9f, 0xc3, + 0x07, 0xff, 0x09, 0xcb, 0x8e, 0x13, 0x07, 0xf8, 0x29, 0xc8, 0x60, 0xf4, + 0x07, 0xf8, 0x58, 0x00, 0x41, 0xc8, 0x90, 0xc9, 0xa8, 0x28, 0x0f, 0x9c, + 0x39, 0x95, 0x0f, 0x9c, 0x30, 0xc5, 0x91, 0x52, 0x0f, 0xb4, 0x91, 0xcb, + 0x92, 0xf9, 0x0f, 0xcf, 0x78, 0x49, 0xb2, 0xcf, 0xc1, 0xc8, 0x9c, 0xc2, + 0x00, 0xac, 0x0b, 0x7a, 0x50, 0x44, 0x1a, 0xce, 0xc1, 0xc8, 0xa8, 0x15, + 0xc1, 0xc8, 0xc4, 0x87, 0x0b, 0x7a, 0x41, 0x42, 0x07, 0x26, 0xc1, 0xc8, + 0xd8, 0xc2, 0x01, 0x6f, 0x0b, 0x78, 0x71, 0x83, 0x0b, 0x78, 0x50, 0x83, + 0x0b, 0x78, 0x83, 0x01, 0xc8, 0xe2, 0x1b, 0xc1, 0xc8, 0xe8, 0x09, 0xc1, + 0xc8, 0xf2, 0x10, 0xc1, 0xc8, 0xfc, 0xc2, 0x00, 0xd0, 0x0b, 0x78, 0x88, + 0x1c, 0xc1, 0xc9, 0x06, 0x42, 0x07, 0x26, 0xc1, 0xc9, 0x1c, 0xc2, 0x0e, + 0x9a, 0x0b, 0x78, 0x79, 0x83, 0x0b, 0x78, 0x58, 0xc2, 0x16, 0x5a, 0x0b, + 0x7a, 0x31, 0x83, 0x0b, 0x79, 0xd1, 0xc2, 0x0d, 0xf6, 0x0b, 0x79, 0xa1, + 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x98, 0xc2, 0x00, 0x2c, 0x0b, 0x7a, 0x29, + 0x83, 0x0b, 
0x78, 0x08, 0xc2, 0x00, 0xd0, 0x0b, 0x7a, 0x21, 0x83, 0x0b, + 0x79, 0x30, 0x8a, 0x0b, 0x7a, 0x19, 0x47, 0x78, 0xc0, 0x41, 0xc9, 0x26, + 0x1c, 0xc1, 0xc9, 0x36, 0x15, 0xc1, 0xc9, 0x44, 0x83, 0x0b, 0x79, 0xd9, + 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0xa8, 0x16, 0xc1, 0xc9, 0x4e, 0xc4, 0xe2, + 0x83, 0x0b, 0x79, 0x89, 0xc2, 0x02, 0x2b, 0x0b, 0x79, 0x01, 0xc3, 0x3a, + 0x09, 0x0b, 0x78, 0x91, 0xc2, 0x00, 0xb0, 0x0b, 0x78, 0x10, 0x0a, 0xc1, + 0xc9, 0x5c, 0x83, 0x0b, 0x78, 0xf8, 0xc2, 0x01, 0x30, 0x0b, 0x79, 0x11, + 0x83, 0x0b, 0x79, 0x08, 0x0a, 0xc1, 0xc9, 0x66, 0xc2, 0x19, 0x2c, 0x0b, + 0x78, 0xb9, 0x83, 0x0b, 0x78, 0xb0, 0xc2, 0x00, 0x87, 0x0b, 0x78, 0x49, + 0x83, 0x0b, 0x78, 0x40, 0xc2, 0x00, 0xd0, 0x0b, 0x78, 0x29, 0x83, 0x0b, + 0x78, 0x20, 0xc2, 0x00, 0xdb, 0x0b, 0x78, 0x19, 0x83, 0x0b, 0x78, 0x00, + 0x8b, 0x0b, 0x7c, 0x39, 0xc2, 0x13, 0x38, 0x0b, 0x7b, 0xf9, 0xc2, 0x00, + 0x75, 0x0b, 0x7b, 0x81, 0xc2, 0x06, 0xdb, 0x0b, 0x7b, 0x79, 0x97, 0x0b, + 0x7b, 0x71, 0x83, 0x0b, 0x7b, 0x5a, 0x01, 0xc9, 0x70, 0x91, 0x0b, 0x7b, + 0x2b, 0x01, 0xc9, 0x77, 0x89, 0x0b, 0x7c, 0x21, 0xc2, 0x00, 0x75, 0x0b, + 0x7b, 0x49, 0x97, 0x0b, 0x7b, 0x41, 0x8b, 0x0b, 0x7b, 0x39, 0x87, 0x0b, + 0x7b, 0x31, 0x83, 0x0b, 0x7b, 0x12, 0x01, 0xc9, 0x7d, 0x83, 0x0b, 0x7c, + 0x29, 0x8b, 0x0b, 0x7b, 0xd1, 0x94, 0x0b, 0x7b, 0xbb, 0x01, 0xc9, 0x84, + 0x90, 0x0b, 0x7a, 0xf2, 0x01, 0xc9, 0x88, 0x07, 0xc1, 0xc9, 0x8c, 0x89, + 0x0b, 0x7c, 0x09, 0x97, 0x0b, 0x7b, 0xe1, 0x91, 0x0b, 0x7a, 0xd0, 0xc2, + 0x03, 0xd4, 0x0b, 0x7c, 0x01, 0x8b, 0x0b, 0x7b, 0x90, 0x89, 0x0b, 0x7b, + 0xf0, 0x97, 0x0b, 0x7b, 0xd9, 0x8b, 0x0b, 0x7b, 0xc9, 0x87, 0x0b, 0x7b, + 0x9b, 0x01, 0xc9, 0x94, 0x90, 0x0b, 0x7a, 0xbb, 0x01, 0xc9, 0x98, 0xc2, + 0x61, 0x75, 0x0b, 0x7a, 0xb1, 0x83, 0x0b, 0x7a, 0xa8, 0x94, 0x0b, 0x7b, + 0xb0, 0x91, 0x0b, 0x7a, 0xd8, 0xca, 0xa1, 0x84, 0x0b, 0x7a, 0x99, 0xc7, + 0xc1, 0x62, 0x0b, 0x7a, 0x90, 0xc5, 0x1e, 0xc8, 0x01, 0x12, 0x11, 0xc4, + 0x00, 0xba, 0x01, 0x10, 0x92, 0x01, 0xc9, 0x9c, 0x4e, 0x75, 0x20, 0xc1, + 0xc9, 0xa0, 0xcb, 0x58, 0xc7, 0x0f, 0xbd, 0x19, 0x46, 0x01, 0xfc, 0xc1, + 0xc9, 0xac, 0x04, 0xc1, 0xc9, 0xb8, 0x45, 0x00, 0x2c, 0xc1, 0xc9, 0xc4, + 0x44, 0x00, 0x49, 0xc1, 0xc9, 0xce, 0x08, 0xc1, 0xc9, 0xd8, 0xcc, 0x07, + 0xbb, 0x01, 0x3a, 0xc9, 0x15, 0xc1, 0xc9, 0xea, 0xd2, 0x4c, 0x91, 0x01, + 0x02, 0xf9, 0x46, 0x0f, 0x88, 0x41, 0xca, 0x02, 0xc5, 0x0a, 0x8a, 0x01, + 0x72, 0x61, 0xd0, 0x0f, 0x09, 0x01, 0x72, 0x99, 0xcd, 0x2c, 0xb2, 0x01, + 0x72, 0xa0, 0xca, 0x9c, 0x70, 0x0b, 0x74, 0xc9, 0x4c, 0x29, 0xba, 0x41, + 0xca, 0x0e, 0xc4, 0x0a, 0x8b, 0x0b, 0x74, 0xb9, 0x4e, 0x0b, 0x18, 0x41, + 0xca, 0x88, 0x16, 0xc1, 0xcb, 0x02, 0xc3, 0x05, 0x14, 0x0b, 0x74, 0x0b, + 0x01, 0xcb, 0x14, 0xc4, 0x26, 0x78, 0x0b, 0x74, 0x49, 0xc5, 0x06, 0xdb, + 0x0b, 0x74, 0x41, 0x15, 0xc1, 0xcb, 0x1a, 0x08, 0xc1, 0xcb, 0x26, 0xc4, + 0x15, 0xe7, 0x0b, 0x74, 0x00, 0xc8, 0x4b, 0x5f, 0x0b, 0x74, 0x99, 0x07, + 0xc1, 0xcb, 0x32, 0x15, 0xc1, 0xcb, 0x3e, 0x08, 0xc1, 0xcb, 0x4a, 0x16, + 0x41, 0xcb, 0x56, 0xc8, 0xb5, 0x5a, 0x01, 0x1e, 0xc1, 0xc6, 0xcd, 0xe5, + 0x01, 0x1e, 0xb9, 0x4a, 0x9b, 0x12, 0x41, 0xcb, 0x68, 0xca, 0x9c, 0x16, + 0x01, 0x1e, 0xa1, 0xc5, 0x2e, 0xee, 0x01, 0x1e, 0x90, 0x1d, 0xc1, 0xcb, + 0x74, 0x1e, 0x41, 0xcb, 0x9c, 0xc3, 0x05, 0x14, 0x0f, 0x46, 0x39, 0x16, + 0xc1, 0xcb, 0xc4, 0x08, 0xc1, 0xcb, 0xd0, 0x15, 0xc1, 0xcb, 0xdc, 0xc5, + 0x06, 0xdb, 0x0f, 0x46, 0x71, 0xc4, 0x26, 0x78, 0x0f, 0x46, 0x78, 0x16, + 0xc1, 0xcb, 0xe8, 0x47, 0x0d, 0x04, 0xc1, 0xcb, 0xf2, 0xc8, 0x33, 0xee, + 0x0f, 0x46, 0xb0, 0x49, 0x53, 0xa9, 0xc1, 0xcb, 0xfc, 0x47, 0x34, 0x2f, + 0xc1, 0xcc, 
0x18, 0x0e, 0x41, 0xcc, 0x3f, 0xcb, 0x91, 0x99, 0x08, 0x4c, + 0xf3, 0x01, 0xcc, 0x4b, 0x47, 0x02, 0x0e, 0x41, 0xcc, 0x51, 0x00, 0x41, + 0xcc, 0xb3, 0xc2, 0x02, 0xa0, 0x05, 0x5f, 0x91, 0xc4, 0x02, 0xde, 0x05, + 0x5f, 0x98, 0xc3, 0x09, 0x9e, 0x05, 0x5f, 0xa1, 0xc3, 0x0d, 0x14, 0x05, + 0x5f, 0xa8, 0xc2, 0x22, 0xcc, 0x05, 0x5f, 0xb1, 0xc4, 0x18, 0x10, 0x05, + 0x5f, 0xb8, 0xc4, 0xe4, 0x73, 0x05, 0x5f, 0x51, 0xc7, 0xc6, 0x16, 0x05, + 0x5f, 0x49, 0xc5, 0xd5, 0x3d, 0x05, 0x5f, 0x31, 0x03, 0xc1, 0xcc, 0xbf, + 0x0b, 0xc1, 0xcc, 0xcd, 0xc4, 0xbd, 0x08, 0x05, 0x5f, 0x19, 0xc7, 0x40, + 0xe5, 0x05, 0x57, 0xa9, 0x17, 0xc1, 0xcc, 0xd7, 0xc6, 0xce, 0x4b, 0x05, + 0x5f, 0x38, 0x8b, 0x05, 0x5e, 0x7b, 0x01, 0xcc, 0xe1, 0x10, 0xc1, 0xcc, + 0xe7, 0x16, 0xc1, 0xcd, 0x03, 0x12, 0xc1, 0xcd, 0x16, 0x0d, 0xc1, 0xcd, + 0x23, 0x04, 0xc1, 0xcd, 0x32, 0x06, 0xc1, 0xcd, 0x3c, 0x09, 0xc1, 0xcd, + 0x4c, 0x15, 0xc1, 0xcd, 0x58, 0x42, 0x11, 0xee, 0xc1, 0xcd, 0x6a, 0x91, + 0x05, 0x57, 0x09, 0x87, 0x05, 0x57, 0x01, 0xc3, 0x18, 0x95, 0x05, 0x5e, + 0xa1, 0xc5, 0xd5, 0x92, 0x05, 0x5e, 0x89, 0xc2, 0x05, 0x1d, 0x05, 0x5e, + 0x71, 0xc3, 0xcc, 0x38, 0x05, 0x5e, 0x69, 0xc4, 0xb0, 0x02, 0x05, 0x5e, + 0x61, 0xc3, 0x27, 0x01, 0x05, 0x5e, 0x1b, 0x01, 0xcd, 0x74, 0xc3, 0x02, + 0xf9, 0x05, 0x5e, 0x13, 0x01, 0xcd, 0x7a, 0xc3, 0x0c, 0x26, 0x05, 0x5e, + 0x59, 0x0c, 0x41, 0xcd, 0x80, 0xc7, 0xc0, 0x82, 0x0f, 0xb7, 0xa9, 0xc4, + 0xd0, 0x81, 0x0f, 0xb7, 0x28, 0x00, 0x41, 0xcd, 0x8c, 0xc4, 0x00, 0x87, + 0x0f, 0xa1, 0x69, 0xc4, 0xd0, 0xf1, 0x0f, 0xd5, 0x20, 0xc5, 0x61, 0xc0, + 0x0e, 0x98, 0x01, 0x1b, 0x41, 0xcd, 0x9e, 0x46, 0x45, 0x87, 0xc1, 0xcd, + 0xaa, 0xd9, 0x1e, 0x69, 0x08, 0xb3, 0x19, 0xcf, 0x62, 0x5b, 0x00, 0xc0, + 0x30, 0xca, 0x01, 0x28, 0x08, 0xb3, 0x4b, 0x01, 0xcd, 0xb0, 0xdc, 0x14, + 0x85, 0x00, 0xc0, 0x38, 0xd5, 0x01, 0x32, 0x08, 0xb3, 0x40, 0x46, 0x00, + 0x8b, 0x41, 0xcd, 0xb6, 0x46, 0x00, 0x8b, 0x41, 0xcd, 0xc2, 0xd9, 0x1e, + 0x9b, 0x08, 0xb3, 0x11, 0x45, 0x09, 0x98, 0x41, 0xcd, 0xce, 0xc2, 0x01, + 0xc3, 0x00, 0xc1, 0x73, 0x01, 0xcd, 0xf2, 0x83, 0x00, 0xc1, 0x03, 0x01, + 0xcd, 0xf8, 0x16, 0xc1, 0xce, 0x04, 0x42, 0x11, 0xee, 0xc1, 0xce, 0x14, + 0x15, 0xc1, 0xce, 0x1f, 0x1c, 0xc1, 0xce, 0x2f, 0x0e, 0xc1, 0xce, 0x3f, + 0xc3, 0x39, 0x6e, 0x00, 0xc1, 0xf1, 0x0d, 0xc1, 0xce, 0x49, 0xc2, 0x00, + 0x87, 0x00, 0xc1, 0xc9, 0xc2, 0x01, 0x4a, 0x00, 0xc1, 0xc1, 0xc2, 0x00, + 0x39, 0x00, 0xc1, 0xb9, 0xc2, 0x19, 0x2c, 0x00, 0xc1, 0xb1, 0xc2, 0x25, + 0x3b, 0x00, 0xc1, 0xa9, 0xc2, 0x0e, 0x9a, 0x00, 0xc1, 0x99, 0xc2, 0x01, + 0x30, 0x00, 0xc1, 0x69, 0xc2, 0x0f, 0x9a, 0x00, 0xc1, 0x61, 0xc2, 0x00, + 0xb0, 0x00, 0xc1, 0x59, 0xc2, 0x01, 0x5d, 0x00, 0xc1, 0x51, 0xc2, 0x00, + 0xc1, 0x00, 0xc1, 0x41, 0x87, 0x00, 0xc1, 0x0b, 0x01, 0xce, 0x53, 0x97, + 0x00, 0xc1, 0x23, 0x01, 0xce, 0x57, 0x91, 0x00, 0xc1, 0x1b, 0x01, 0xce, + 0x5b, 0x8b, 0x00, 0xc1, 0x10, 0x57, 0x28, 0x43, 0xc1, 0xce, 0x5f, 0xc8, + 0x3b, 0x7a, 0x00, 0xc0, 0x29, 0xc8, 0x11, 0xf7, 0x00, 0xc0, 0x18, 0xc9, + 0x11, 0xf6, 0x00, 0xc0, 0x49, 0xc5, 0x0a, 0x8a, 0x00, 0xc0, 0x40, 0xc3, + 0x0d, 0xe5, 0x00, 0xc0, 0x21, 0xc3, 0x0a, 0x8c, 0x00, 0xc0, 0x10, 0xca, + 0xa0, 0xf8, 0x0f, 0xa5, 0xc1, 0xc3, 0x32, 0x20, 0x0f, 0xa5, 0x80, 0x06, + 0xc1, 0xce, 0x6f, 0x45, 0x00, 0xba, 0xc1, 0xce, 0x81, 0xd1, 0x50, 0xce, + 0x08, 0xb2, 0x19, 0x4b, 0x6f, 0xc7, 0xc1, 0xce, 0x91, 0x47, 0x02, 0x0e, + 0x41, 0xce, 0xb1, 0x47, 0x02, 0x0e, 0xc1, 0xcf, 0x16, 0xd9, 0x1d, 0x88, + 0x05, 0x5a, 0xd8, 0x48, 0x0b, 0x17, 0xc1, 0xcf, 0x5c, 0x12, 0xc1, 0xcf, + 0xfd, 0xca, 0x9c, 0xac, 0x0e, 0xb8, 0xd1, 0xcc, 0x8b, 0x65, 0x0e, 0xb8, + 0xc1, 0xcc, 
0x89, 0xfd, 0x0e, 0xb8, 0xb9, 0xce, 0x10, 0x3e, 0x0e, 0xb8, + 0xb1, 0x46, 0x03, 0x13, 0xc1, 0xd0, 0x0f, 0xc5, 0xdb, 0xf0, 0x0e, 0xb7, + 0xd8, 0x15, 0xc1, 0xd0, 0xaf, 0x46, 0x09, 0x97, 0xc1, 0xd0, 0xbb, 0x48, + 0x0b, 0x17, 0xc1, 0xd0, 0xdf, 0x47, 0xc7, 0x4a, 0xc1, 0xd1, 0x80, 0x12, + 0xc1, 0xd1, 0xae, 0xca, 0x9c, 0xac, 0x0e, 0xb7, 0x01, 0xcc, 0x8b, 0x65, + 0x0e, 0xb6, 0xf1, 0xcc, 0x89, 0xfd, 0x0e, 0xb6, 0xe9, 0xce, 0x10, 0x3e, + 0x0e, 0xb6, 0xe1, 0xc5, 0xdb, 0xf0, 0x0e, 0xb6, 0x09, 0x48, 0xbd, 0x42, + 0x41, 0xd1, 0xc0, 0x46, 0x09, 0x97, 0xc1, 0xd1, 0xcc, 0x46, 0x03, 0x13, + 0xc1, 0xd1, 0xf0, 0x48, 0x0b, 0x17, 0x41, 0xd2, 0x58, 0x4a, 0x43, 0x55, + 0xc1, 0xd2, 0xc0, 0x46, 0x07, 0x2f, 0x41, 0xd2, 0xde, 0x46, 0x09, 0x97, + 0xc1, 0xd2, 0xea, 0x46, 0x03, 0x13, 0xc1, 0xd3, 0x0e, 0x48, 0x0b, 0x17, + 0x41, 0xd3, 0x76, 0x47, 0xbd, 0x43, 0xc1, 0xd3, 0xc2, 0xcf, 0x35, 0x0c, + 0x01, 0x3e, 0x68, 0x44, 0x00, 0x2e, 0xc1, 0xd3, 0xce, 0xcd, 0x27, 0x2f, + 0x01, 0x3e, 0x58, 0xd5, 0x35, 0x36, 0x01, 0x3f, 0x71, 0x46, 0x01, 0xfc, + 0xc1, 0xd3, 0xe6, 0xd4, 0x38, 0xf4, 0x01, 0x3f, 0x51, 0xcd, 0x0b, 0x91, + 0x01, 0x3f, 0x40, 0xc3, 0x03, 0x26, 0x0e, 0x97, 0x90, 0xc4, 0x14, 0x09, + 0x0e, 0x97, 0x88, 0xc4, 0x14, 0x09, 0x0e, 0x97, 0x80, 0xc5, 0x14, 0x08, + 0x0e, 0x97, 0x79, 0xc2, 0x00, 0x5f, 0x0e, 0x97, 0x28, 0xc4, 0x14, 0x09, + 0x0e, 0x97, 0x70, 0xc6, 0x52, 0xcd, 0x0e, 0x97, 0x69, 0xc3, 0x02, 0xdf, + 0x0e, 0x97, 0x18, 0xc4, 0x22, 0x44, 0x0e, 0x97, 0x61, 0x91, 0x0e, 0x97, + 0x10, 0xc2, 0x19, 0x2c, 0x08, 0xf7, 0x59, 0x83, 0x08, 0xf7, 0x41, 0xc2, + 0x01, 0x30, 0x08, 0xf7, 0x10, 0xc4, 0x26, 0x78, 0x08, 0xea, 0xc9, 0xc5, + 0x06, 0xdb, 0x08, 0xea, 0xc1, 0x15, 0xc1, 0xd3, 0xf2, 0x08, 0xc1, 0xd3, + 0xfe, 0x16, 0xc1, 0xd4, 0x0a, 0xc3, 0x05, 0x14, 0x08, 0xea, 0x89, 0xc4, + 0x15, 0xe7, 0x08, 0xea, 0x80, 0xc6, 0xd1, 0x39, 0x08, 0xea, 0x39, 0xc4, + 0xbb, 0x54, 0x08, 0xea, 0x30, 0xc5, 0x1e, 0x96, 0x08, 0xea, 0x29, 0x4a, + 0x6f, 0xc8, 0x41, 0xd4, 0x16, 0xc7, 0xc3, 0xa7, 0x08, 0xea, 0x21, 0xc6, + 0x1e, 0x89, 0x08, 0xea, 0x19, 0xc5, 0x33, 0x5d, 0x08, 0xea, 0x11, 0xc7, + 0x40, 0xe5, 0x08, 0xea, 0x09, 0xc8, 0x14, 0x38, 0x08, 0xea, 0x00, 0x16, + 0xc1, 0xd4, 0x36, 0x0c, 0xc1, 0xd4, 0x4a, 0x0d, 0xc1, 0xd4, 0x5a, 0x0e, + 0xc1, 0xd4, 0x6a, 0xc2, 0x00, 0xd0, 0x08, 0xe9, 0x61, 0x15, 0xc1, 0xd4, + 0x74, 0xc2, 0x02, 0x41, 0x08, 0xe9, 0x41, 0xc2, 0x00, 0x39, 0x08, 0xe9, + 0x31, 0xc2, 0x19, 0x2c, 0x08, 0xe9, 0x29, 0xc2, 0x01, 0xc3, 0x08, 0xe9, + 0x21, 0x04, 0xc1, 0xd4, 0x84, 0x12, 0xc1, 0xd4, 0x8e, 0x10, 0xc1, 0xd4, + 0x98, 0x06, 0xc1, 0xd4, 0xae, 0x05, 0xc1, 0xd4, 0xbc, 0x09, 0xc1, 0xd4, + 0xc6, 0x83, 0x08, 0xe8, 0x03, 0x01, 0xd4, 0xd0, 0x91, 0x08, 0xe8, 0x49, + 0x87, 0x08, 0xe8, 0x31, 0x97, 0x08, 0xe8, 0x23, 0x01, 0xd4, 0xdc, 0x8b, + 0x08, 0xe8, 0x12, 0x01, 0xd4, 0xe0, 0x44, 0x00, 0xbb, 0xc1, 0xd4, 0xe4, + 0x50, 0x5c, 0xf2, 0x41, 0xd4, 0xf0, 0x91, 0x08, 0xe5, 0xa1, 0x87, 0x08, + 0xe5, 0x99, 0x97, 0x08, 0xe5, 0x91, 0x8b, 0x08, 0xe5, 0x89, 0xc2, 0x04, + 0xc6, 0x08, 0xe5, 0x80, 0x83, 0x08, 0xe4, 0x79, 0xc2, 0x00, 0xd0, 0x08, + 0xe4, 0x71, 0x15, 0xc1, 0xd5, 0x4a, 0xc2, 0x00, 0xdb, 0x08, 0xe4, 0x59, + 0xc2, 0x00, 0x39, 0x08, 0xe4, 0x51, 0xc2, 0x19, 0x2c, 0x08, 0xe4, 0x49, + 0xc2, 0x00, 0x02, 0x08, 0xe4, 0x41, 0x1c, 0xc1, 0xd5, 0x54, 0xc2, 0x01, + 0x4a, 0x08, 0xe4, 0x29, 0x06, 0xc1, 0xd5, 0x5e, 0x16, 0xc1, 0xd5, 0x68, + 0xc2, 0x01, 0xc3, 0x08, 0xe4, 0x09, 0xc2, 0x01, 0x5d, 0x08, 0xe4, 0x01, + 0x12, 0xc1, 0xd5, 0x76, 0x10, 0xc1, 0xd5, 0x80, 0xc2, 0x25, 0x3b, 0x08, + 0xe3, 0xc1, 0x05, 0xc1, 0xd5, 0x90, 0xc2, 0x01, 0x30, 0x08, 0xe3, 0xa1, + 0x0d, 0x41, 
0xd5, 0x9a, 0xd8, 0x20, 0xf3, 0x01, 0x35, 0x39, 0xc4, 0x00, + 0xba, 0x01, 0x35, 0x30, 0x05, 0xc1, 0xd5, 0xa4, 0x03, 0xc1, 0xd5, 0xb6, + 0x18, 0xc1, 0xd5, 0xc2, 0xc4, 0x00, 0xb0, 0x00, 0x6a, 0x78, 0x18, 0xc1, + 0xd5, 0xcc, 0x83, 0x00, 0x68, 0x2b, 0x01, 0xd5, 0xdc, 0x8b, 0x00, 0x68, + 0x3b, 0x01, 0xd5, 0xee, 0x97, 0x00, 0x68, 0x4b, 0x01, 0xd5, 0xf2, 0x87, + 0x00, 0x68, 0x73, 0x01, 0xd5, 0xf6, 0x91, 0x00, 0x68, 0x93, 0x01, 0xd5, + 0xfa, 0x0d, 0xc1, 0xd5, 0xfe, 0x09, 0xc1, 0xd6, 0x08, 0x10, 0xc1, 0xd6, + 0x12, 0x05, 0xc1, 0xd6, 0x26, 0x0c, 0xc1, 0xd6, 0x2e, 0x16, 0xc1, 0xd6, + 0x38, 0x06, 0xc1, 0xd6, 0x46, 0x12, 0xc1, 0xd6, 0x5a, 0x04, 0xc1, 0xd6, + 0x64, 0xc2, 0x01, 0xc3, 0x00, 0x69, 0x71, 0xc2, 0x19, 0x2c, 0x00, 0x69, + 0x79, 0x14, 0xc1, 0xd6, 0x6e, 0x0e, 0xc1, 0xd6, 0x78, 0x15, 0xc1, 0xd6, + 0x80, 0xc2, 0x00, 0xd0, 0x00, 0x69, 0xc8, 0x03, 0xc1, 0xd6, 0x90, 0x8b, + 0x00, 0x69, 0xfb, 0x01, 0xd6, 0x9c, 0x97, 0x00, 0x6a, 0x0b, 0x01, 0xd6, + 0xa0, 0x48, 0xb2, 0x2d, 0xc1, 0xd6, 0xa4, 0x87, 0x00, 0x6a, 0x33, 0x01, + 0xd6, 0xb2, 0x91, 0x00, 0x6a, 0x52, 0x01, 0xd6, 0xb6, 0x44, 0x05, 0x14, + 0xc1, 0xd6, 0xba, 0x46, 0x02, 0xdd, 0x41, 0xd6, 0xe0, 0x45, 0x09, 0x98, + 0xc1, 0xd6, 0xf8, 0xc8, 0xbc, 0xda, 0x00, 0x6b, 0xc8, 0xc3, 0x09, 0x41, + 0x00, 0x6b, 0x81, 0x44, 0x05, 0x14, 0x41, 0xd7, 0x1c, 0xcb, 0x92, 0x07, + 0x08, 0x57, 0xb1, 0xc8, 0x02, 0x9f, 0x08, 0x57, 0xa9, 0x42, 0x00, 0x58, + 0xc1, 0xd7, 0x28, 0xc7, 0x2c, 0xab, 0x08, 0x57, 0x89, 0xc4, 0x0e, 0x6a, + 0x08, 0x57, 0x80, 0xc3, 0x05, 0x14, 0x08, 0x57, 0x5b, 0x01, 0xd7, 0x35, + 0x16, 0xc1, 0xd7, 0x3b, 0xc4, 0x0d, 0x13, 0x08, 0x57, 0x60, 0xc5, 0x05, + 0x02, 0x08, 0x57, 0x31, 0xc5, 0x00, 0xd4, 0x08, 0x57, 0x28, 0x16, 0xc1, + 0xd7, 0x47, 0x15, 0xc1, 0xd7, 0x59, 0xc4, 0x5d, 0xe2, 0x08, 0x57, 0x09, + 0x13, 0xc1, 0xd7, 0x69, 0x1a, 0xc1, 0xd7, 0x75, 0xc2, 0x14, 0xda, 0x08, + 0x56, 0xe1, 0xc2, 0x00, 0x67, 0x08, 0x56, 0xd9, 0x03, 0xc1, 0xd7, 0x81, + 0xc3, 0x20, 0x18, 0x08, 0x56, 0xb9, 0xc3, 0x00, 0x4e, 0x08, 0x56, 0xb1, + 0x06, 0xc1, 0xd7, 0x93, 0xc6, 0xcf, 0xd7, 0x08, 0x56, 0x99, 0x0d, 0xc1, + 0xd7, 0x9f, 0xc4, 0x4a, 0xb9, 0x08, 0x56, 0x79, 0xc2, 0x01, 0x7f, 0x08, + 0x56, 0x33, 0x01, 0xd7, 0xab, 0x0c, 0xc1, 0xd7, 0xb1, 0x1c, 0xc1, 0xd7, + 0xbd, 0xc3, 0x7e, 0x89, 0x08, 0x56, 0x39, 0x09, 0xc1, 0xd7, 0xc9, 0x04, + 0x41, 0xd7, 0xd5, 0xd8, 0x22, 0xd3, 0x0f, 0xab, 0xa1, 0xc6, 0xd1, 0xdb, + 0x0f, 0xc9, 0xa8, 0xc6, 0xd2, 0x9b, 0x0f, 0xa3, 0x99, 0xca, 0xa1, 0x66, + 0x0f, 0xa3, 0x90, 0x03, 0xc1, 0xd7, 0xe1, 0xc3, 0xa7, 0x52, 0x00, 0x42, + 0xb9, 0xc8, 0xb9, 0xc2, 0x00, 0x42, 0xb1, 0x0b, 0xc1, 0xd8, 0x28, 0xc7, + 0xb9, 0xc3, 0x00, 0x42, 0x29, 0xc5, 0xd6, 0xc3, 0x00, 0x42, 0x00, 0xcc, + 0x85, 0xd1, 0x08, 0x8b, 0xb1, 0x46, 0x02, 0x0f, 0x41, 0xd8, 0x30, 0xcb, + 0x45, 0x8e, 0x08, 0x8b, 0xa9, 0xc9, 0xad, 0xb6, 0x08, 0x8b, 0x98, 0xc5, + 0x06, 0xbb, 0x0f, 0x81, 0x49, 0xc8, 0xb5, 0xa2, 0x0f, 0x80, 0x11, 0xcb, + 0x8f, 0x3c, 0x0f, 0x80, 0x30, 0xc8, 0xbd, 0xa2, 0x0f, 0x80, 0x01, 0x48, + 0xae, 0x47, 0x41, 0xd8, 0x8a, 0xc9, 0xab, 0xbe, 0x0f, 0x80, 0x09, 0x46, + 0xd1, 0xf9, 0xc1, 0xd8, 0x94, 0x48, 0xb5, 0x32, 0xc1, 0xd8, 0x9e, 0xc5, + 0xc1, 0x78, 0x0f, 0x81, 0x31, 0xc5, 0xda, 0x60, 0x0f, 0x81, 0x38, 0xc9, + 0xac, 0x06, 0x0f, 0x80, 0x19, 0x47, 0xbb, 0x83, 0x41, 0xd8, 0xa8, 0x46, + 0xbb, 0x84, 0xc1, 0xd8, 0xb2, 0xc5, 0xd6, 0xf0, 0x0f, 0x81, 0x18, 0x46, + 0xd2, 0xe9, 0xc1, 0xd8, 0xbc, 0x48, 0xbe, 0x4a, 0x41, 0xd8, 0xc6, 0x47, + 0xc5, 0x7c, 0xc1, 0xd8, 0xd0, 0x47, 0xc7, 0x2e, 0x41, 0xd8, 0xda, 0xc2, + 0x00, 0x3b, 0x0f, 0x81, 0x59, 0xc4, 0x8e, 0x88, 0x0f, 0x81, 0x20, 0x15, + 0xc1, 0xd8, 
0xe4, 0xc8, 0x87, 0xb5, 0x0f, 0x9d, 0xcb, 0x01, 0xd8, 0xf0, + 0xc4, 0x23, 0x2e, 0x0f, 0x9d, 0xa8, 0xca, 0xa2, 0xba, 0x01, 0x33, 0x79, + 0xcc, 0x83, 0xf1, 0x01, 0x33, 0x71, 0xc9, 0xb3, 0xb0, 0x01, 0x33, 0x68, + 0x48, 0x1f, 0x1f, 0xc1, 0xd8, 0xf6, 0xcf, 0x65, 0x2b, 0x0f, 0x9d, 0xb0, + 0x00, 0x41, 0xd9, 0x03, 0x14, 0xc1, 0xd9, 0x0f, 0xc2, 0x00, 0xd0, 0x08, + 0x95, 0x31, 0xc2, 0x0d, 0xf6, 0x08, 0x95, 0x29, 0xc2, 0x02, 0x41, 0x08, + 0x95, 0x21, 0xc2, 0x00, 0xdb, 0x08, 0x95, 0x19, 0xc2, 0x19, 0x2c, 0x08, + 0x95, 0x09, 0xc2, 0x01, 0xc3, 0x08, 0x95, 0x01, 0x04, 0xc1, 0xd9, 0x1f, + 0x12, 0xc1, 0xd9, 0x29, 0x10, 0xc1, 0xd9, 0x33, 0x06, 0xc1, 0xd9, 0x43, + 0x16, 0xc1, 0xd9, 0x51, 0x0c, 0xc1, 0xd9, 0x5f, 0x05, 0xc1, 0xd9, 0x69, + 0x09, 0xc1, 0xd9, 0x73, 0x0d, 0xc1, 0xd9, 0x7d, 0x87, 0x08, 0x94, 0x19, + 0x83, 0x08, 0x94, 0x01, 0x8b, 0x08, 0x94, 0x09, 0x97, 0x08, 0x94, 0x10, + 0xc4, 0x18, 0x10, 0x0b, 0x53, 0x39, 0xc2, 0x22, 0xcc, 0x0b, 0x53, 0x30, + 0xc3, 0x0d, 0x14, 0x0b, 0x53, 0x29, 0xc3, 0x09, 0x9e, 0x0b, 0x53, 0x20, + 0xc4, 0x02, 0xde, 0x0b, 0x53, 0x19, 0xc2, 0x02, 0xa0, 0x0b, 0x53, 0x10, + 0xa2, 0x05, 0x53, 0xe9, 0x9f, 0x05, 0x53, 0xe0, 0x44, 0x00, 0xd0, 0xc1, + 0xd9, 0x87, 0xc6, 0x00, 0x41, 0x00, 0x82, 0x58, 0xc7, 0x14, 0x39, 0x00, + 0x81, 0xb1, 0xc3, 0x89, 0x6c, 0x00, 0x81, 0xd0, 0xc5, 0x40, 0xe7, 0x00, + 0x81, 0xc1, 0xc4, 0x1e, 0x97, 0x00, 0x81, 0xc8, 0x9e, 0x00, 0x83, 0x49, + 0x9f, 0x00, 0x83, 0x51, 0xa0, 0x00, 0x83, 0x59, 0xa1, 0x00, 0x83, 0x61, + 0xa2, 0x00, 0x83, 0x68, 0x9e, 0x00, 0x84, 0xd1, 0xa0, 0x00, 0x84, 0xd8, + 0x45, 0xc7, 0x97, 0xc1, 0xd9, 0x99, 0xcd, 0x7b, 0xb1, 0x00, 0x82, 0x70, + 0xc3, 0x05, 0x14, 0x00, 0x84, 0xf1, 0xcb, 0x0f, 0x09, 0x00, 0x84, 0xf8, + 0xc2, 0x02, 0xa0, 0x00, 0x84, 0x91, 0xc4, 0x02, 0xde, 0x00, 0x84, 0x98, + 0xc3, 0x09, 0x9e, 0x00, 0x84, 0xa1, 0xc3, 0x0d, 0x14, 0x00, 0x84, 0xa8, + 0xc2, 0x22, 0xcc, 0x00, 0x84, 0xb1, 0xc4, 0x18, 0x10, 0x00, 0x84, 0xb8, + 0xc7, 0xc7, 0x97, 0x05, 0x53, 0xd1, 0x97, 0x00, 0x81, 0x50, 0xc2, 0x00, + 0xd0, 0x00, 0x80, 0x0b, 0x01, 0xd9, 0xab, 0x83, 0x00, 0x80, 0x00, 0x83, + 0x00, 0x80, 0x83, 0x01, 0xd9, 0xb1, 0x16, 0xc1, 0xd9, 0xb7, 0xc2, 0x00, + 0xd0, 0x00, 0x80, 0x88, 0x0a, 0xc1, 0xd9, 0xc1, 0x83, 0x00, 0x80, 0xf1, + 0xc2, 0x0d, 0xf6, 0x00, 0x82, 0x89, 0xcd, 0x7c, 0x19, 0x00, 0x83, 0x08, + 0x83, 0x00, 0x80, 0x11, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x19, 0xc7, 0xbd, + 0xeb, 0x00, 0x81, 0xf8, 0xc2, 0x01, 0x30, 0x00, 0x80, 0x21, 0xc2, 0x19, + 0x2c, 0x00, 0x80, 0x49, 0x10, 0xc1, 0xd9, 0xce, 0x83, 0x00, 0x80, 0xa0, + 0x83, 0x00, 0x80, 0x29, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x30, 0x83, 0x00, + 0x80, 0x39, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x40, 0x06, 0xc1, 0xd9, 0xd8, + 0x83, 0x00, 0x80, 0x91, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x98, 0x83, 0x00, + 0x80, 0xa9, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0xb0, 0x83, 0x00, 0x80, 0xb9, + 0xc2, 0x00, 0xd0, 0x00, 0x80, 0xc0, 0x83, 0x00, 0x80, 0xc9, 0x43, 0x01, + 0x55, 0x41, 0xd9, 0xe2, 0x83, 0x00, 0x80, 0xd9, 0xcf, 0x65, 0x0d, 0x00, + 0x84, 0x70, 0x83, 0x00, 0x80, 0xe1, 0xc2, 0x00, 0xdb, 0x00, 0x81, 0x00, + 0x83, 0x00, 0x80, 0xe9, 0x51, 0x28, 0xa0, 0x41, 0xd9, 0xf8, 0x8b, 0x00, + 0x81, 0x20, 0x97, 0x00, 0x81, 0x30, 0x51, 0x50, 0x02, 0x41, 0xda, 0x04, + 0x94, 0x00, 0x82, 0x93, 0x01, 0xda, 0x16, 0x8e, 0x00, 0x82, 0xa2, 0x01, + 0xda, 0x1a, 0xc4, 0x18, 0x10, 0x05, 0x4f, 0xb9, 0xc2, 0x22, 0xcc, 0x05, + 0x4f, 0xb0, 0xc3, 0x0d, 0x14, 0x05, 0x4f, 0xa9, 0xc3, 0x09, 0x9e, 0x05, + 0x4f, 0xa0, 0xc4, 0x02, 0xde, 0x05, 0x4f, 0x99, 0xc2, 0x02, 0xa0, 0x05, + 0x4f, 0x90, 0xc5, 0xd5, 0xc9, 0x00, 0x84, 0xe2, 0x01, 0xda, 0x1e, 0x94, + 0x00, 0x82, 
0xb8, 0x8e, 0x00, 0x82, 0xc8, 0xc2, 0x04, 0xc6, 0x00, 0x84, + 0x19, 0x87, 0x00, 0x84, 0x23, 0x01, 0xda, 0x22, 0xc7, 0xca, 0x30, 0x00, + 0x84, 0x30, 0xc2, 0x19, 0x2c, 0x00, 0x81, 0xd9, 0xc2, 0x00, 0x39, 0x00, + 0x81, 0xe1, 0xc2, 0x01, 0x4a, 0x00, 0x81, 0xe9, 0xc2, 0x00, 0xd0, 0x00, + 0x81, 0xf0, 0xc2, 0x00, 0xc1, 0x00, 0x82, 0xf1, 0xc2, 0x01, 0xc3, 0x00, + 0x82, 0xf9, 0xc2, 0x00, 0xdb, 0x00, 0x83, 0x00, 0x15, 0xc1, 0xda, 0x28, + 0x83, 0x01, 0x85, 0x13, 0x01, 0xda, 0x42, 0x0f, 0xc1, 0xda, 0x48, 0x8b, + 0x01, 0x85, 0x21, 0x97, 0x01, 0x85, 0x31, 0x87, 0x01, 0x85, 0x41, 0x91, + 0x01, 0x85, 0x51, 0x0d, 0xc1, 0xda, 0x5f, 0x09, 0xc1, 0xda, 0x73, 0x1c, + 0xc1, 0xda, 0x87, 0x16, 0xc1, 0xda, 0x9b, 0x06, 0xc1, 0xda, 0xaf, 0x90, + 0x01, 0x87, 0x9b, 0x01, 0xda, 0xc3, 0x0a, 0xc1, 0xda, 0xd7, 0x04, 0xc1, + 0xda, 0xeb, 0x12, 0xc1, 0xda, 0xff, 0x1b, 0xc1, 0xdb, 0x13, 0x14, 0xc1, + 0xdb, 0x1f, 0x19, 0xc1, 0xdb, 0x33, 0x18, 0x41, 0xdb, 0x43, 0x97, 0x08, + 0x85, 0xc1, 0x8b, 0x08, 0x85, 0xb1, 0x83, 0x08, 0x85, 0x80, 0x97, 0x08, + 0x85, 0xa0, 0x8b, 0x08, 0x85, 0x90, 0xc5, 0x86, 0x20, 0x08, 0x86, 0x09, + 0xcc, 0x45, 0x8d, 0x08, 0x85, 0xf8, 0xc5, 0x33, 0x5d, 0x08, 0x85, 0xd1, + 0x42, 0x07, 0xb2, 0xc1, 0xdb, 0x57, 0xc8, 0x14, 0x38, 0x08, 0x84, 0x09, + 0xcb, 0x1e, 0x89, 0x08, 0x84, 0x00, 0x83, 0x08, 0x85, 0x71, 0xc2, 0x0d, + 0xf6, 0x08, 0x85, 0x69, 0xc2, 0x00, 0xd0, 0x08, 0x85, 0x60, 0x83, 0x08, + 0x85, 0x49, 0xc2, 0x00, 0xd0, 0x08, 0x84, 0xe0, 0xc2, 0x00, 0xd0, 0x08, + 0x85, 0x31, 0x83, 0x08, 0x85, 0x28, 0xc2, 0x00, 0xd0, 0x08, 0x85, 0x21, + 0x83, 0x08, 0x85, 0x18, 0x83, 0x08, 0x85, 0x11, 0xc2, 0x00, 0xc1, 0x08, + 0x84, 0xe9, 0xc2, 0x19, 0x2c, 0x08, 0x84, 0xb1, 0xc2, 0x01, 0x30, 0x08, + 0x84, 0x88, 0xc2, 0x00, 0xd0, 0x08, 0x85, 0x09, 0x83, 0x08, 0x85, 0x01, + 0x06, 0x41, 0xdb, 0x63, 0xc2, 0x00, 0xd0, 0x08, 0x84, 0xf9, 0x83, 0x08, + 0x84, 0xf1, 0x16, 0x41, 0xdb, 0x73, 0xc2, 0x00, 0xd0, 0x08, 0x84, 0xa9, + 0x83, 0x08, 0x84, 0xa0, 0xc2, 0x00, 0xd0, 0x08, 0x84, 0x99, 0x83, 0x08, + 0x84, 0x90, 0xc2, 0x00, 0xd0, 0x08, 0x84, 0x81, 0x83, 0x08, 0x84, 0x78, + 0xc2, 0x00, 0xd0, 0x08, 0x84, 0x71, 0x83, 0x08, 0x84, 0x68, 0x97, 0x08, + 0x84, 0x61, 0x8b, 0x08, 0x84, 0x51, 0x83, 0x08, 0x84, 0x20, 0x97, 0x08, + 0x84, 0x40, 0x8b, 0x08, 0x84, 0x30, 0xc7, 0xca, 0x76, 0x05, 0x49, 0x68, + 0x87, 0x05, 0x49, 0x48, 0x87, 0x05, 0x49, 0x30, 0x91, 0x05, 0x49, 0x29, + 0x87, 0x05, 0x49, 0x18, 0x83, 0x05, 0x48, 0xf9, 0xc2, 0x01, 0x6f, 0x05, + 0x48, 0x98, 0xc2, 0x00, 0xd0, 0x05, 0x48, 0xf1, 0x83, 0x05, 0x48, 0x90, + 0xc2, 0x00, 0xd0, 0x05, 0x48, 0xb1, 0x83, 0x05, 0x48, 0xa8, 0x83, 0x05, + 0x48, 0xa1, 0xc2, 0x19, 0x2c, 0x05, 0x48, 0x89, 0xc2, 0x01, 0x30, 0x05, + 0x48, 0x68, 0xc2, 0x00, 0xd0, 0x05, 0x48, 0x79, 0x83, 0x05, 0x48, 0x70, + 0xc2, 0x00, 0xd0, 0x05, 0x48, 0x59, 0x83, 0x05, 0x48, 0x50, 0xc4, 0x18, + 0x10, 0x05, 0x48, 0x39, 0xc2, 0x22, 0xcc, 0x05, 0x48, 0x30, 0xc3, 0x0d, + 0x14, 0x05, 0x48, 0x29, 0xc3, 0x09, 0x9e, 0x05, 0x48, 0x20, 0xc4, 0x02, + 0xde, 0x05, 0x48, 0x19, 0xc2, 0x02, 0xa0, 0x05, 0x48, 0x10, 0x15, 0xc1, + 0xdb, 0x7d, 0xcb, 0x1e, 0x89, 0x00, 0x64, 0x09, 0x03, 0xc1, 0xdb, 0x89, + 0x42, 0x07, 0xb2, 0xc1, 0xdb, 0x95, 0xc5, 0x33, 0x5d, 0x00, 0x65, 0xe1, + 0xcb, 0x8f, 0xe1, 0x00, 0x67, 0x89, 0xcb, 0x93, 0xf6, 0x00, 0x67, 0x90, + 0x45, 0x02, 0x10, 0xc1, 0xdb, 0xa1, 0xc9, 0x36, 0x53, 0x00, 0x66, 0xa8, + 0x03, 0xc1, 0xdc, 0x10, 0x8b, 0x00, 0x65, 0xfb, 0x01, 0xdc, 0x1c, 0x97, + 0x00, 0x66, 0x0b, 0x01, 0xdc, 0x20, 0x48, 0xb2, 0x2d, 0xc1, 0xdc, 0x24, + 0x87, 0x00, 0x66, 0x33, 0x01, 0xdc, 0x32, 0x91, 0x00, 0x66, 0x52, 0x01, + 0xdc, 0x36, 
0xc4, 0x15, 0xe7, 0x00, 0x67, 0x31, 0xc3, 0x05, 0x14, 0x00, + 0x67, 0x39, 0x16, 0xc1, 0xdc, 0x3a, 0x08, 0xc1, 0xdc, 0x46, 0x15, 0xc1, + 0xdc, 0x52, 0xc5, 0x06, 0xdb, 0x00, 0x67, 0x71, 0xc4, 0x26, 0x78, 0x00, + 0x67, 0x78, 0x11, 0xc1, 0xdc, 0x5e, 0x0e, 0xc1, 0xdc, 0x71, 0x06, 0xc1, + 0xdc, 0x86, 0x15, 0xc1, 0xdc, 0x96, 0x0a, 0xc1, 0xdc, 0xe0, 0x16, 0xc1, + 0xdc, 0xf2, 0x0f, 0xc1, 0xdd, 0x17, 0x07, 0xc1, 0xdd, 0x29, 0x05, 0xc1, + 0xdd, 0x4c, 0x0b, 0xc1, 0xdd, 0x64, 0xc5, 0xa0, 0xc1, 0x01, 0x78, 0x89, + 0x12, 0xc1, 0xdd, 0x6e, 0x19, 0xc1, 0xdd, 0x84, 0x14, 0xc1, 0xdd, 0x9e, + 0x03, 0xc1, 0xdd, 0xb8, 0x09, 0xc1, 0xdd, 0xd0, 0x04, 0xc1, 0xdd, 0xe9, + 0x10, 0xc1, 0xde, 0x03, 0x08, 0xc1, 0xde, 0x0d, 0x42, 0x25, 0x3b, 0xc1, + 0xde, 0x2f, 0xc3, 0x26, 0x9b, 0x01, 0x7b, 0x21, 0x18, 0xc1, 0xde, 0x39, + 0xc6, 0xc6, 0x9b, 0x01, 0x7e, 0x40, 0x06, 0xc1, 0xde, 0x45, 0x05, 0xc1, + 0xde, 0x5d, 0x04, 0xc1, 0xde, 0x9d, 0x03, 0xc1, 0xde, 0xdd, 0x26, 0xc1, + 0xdf, 0x1d, 0x25, 0xc1, 0xdf, 0x5d, 0x24, 0xc1, 0xdf, 0x9d, 0x23, 0xc1, + 0xdf, 0xdd, 0x22, 0xc1, 0xe0, 0x1d, 0x21, 0xc1, 0xe0, 0x5d, 0x20, 0xc1, + 0xe0, 0x9d, 0x1f, 0xc1, 0xe0, 0xdd, 0x1e, 0xc1, 0xe1, 0x1d, 0x1d, 0x41, + 0xe1, 0x5d, 0x08, 0xc1, 0xe1, 0x9d, 0x07, 0xc1, 0xe1, 0xdd, 0x06, 0xc1, + 0xe2, 0x1d, 0x05, 0xc1, 0xe2, 0x5d, 0x04, 0xc1, 0xe2, 0x9d, 0x03, 0xc1, + 0xe2, 0xdd, 0x26, 0xc1, 0xe3, 0x1d, 0x25, 0xc1, 0xe3, 0x5d, 0x24, 0xc1, + 0xe3, 0x9d, 0x23, 0xc1, 0xe3, 0xdd, 0x22, 0xc1, 0xe4, 0x1d, 0x21, 0xc1, + 0xe4, 0x5d, 0x20, 0xc1, 0xe4, 0x9d, 0x1f, 0xc1, 0xe4, 0xdd, 0x1e, 0xc1, + 0xe5, 0x1d, 0x1d, 0x41, 0xe5, 0x5d, 0xc4, 0x18, 0x10, 0x08, 0x97, 0xb9, + 0xc2, 0x22, 0xcc, 0x08, 0x97, 0xb0, 0xc3, 0x0d, 0x14, 0x08, 0x97, 0xa9, + 0xc3, 0x09, 0x9e, 0x08, 0x97, 0xa0, 0xc4, 0x02, 0xde, 0x08, 0x97, 0x99, + 0xc2, 0x02, 0xa0, 0x08, 0x97, 0x90, 0x8b, 0x08, 0x97, 0x31, 0x83, 0x08, + 0x97, 0x01, 0x97, 0x08, 0x97, 0x40, 0x97, 0x08, 0x97, 0x20, 0x8b, 0x08, + 0x97, 0x10, 0x83, 0x08, 0x96, 0xe9, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0xe0, + 0x83, 0x08, 0x96, 0xc9, 0xc2, 0x00, 0x39, 0x08, 0x96, 0x50, 0xc2, 0x00, + 0xd0, 0x08, 0x96, 0xb1, 0xc2, 0x01, 0x5d, 0x08, 0x96, 0xa9, 0x83, 0x08, + 0x96, 0xa0, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0x99, 0x83, 0x08, 0x96, 0x90, + 0x83, 0x08, 0x96, 0x89, 0xc2, 0x00, 0xc1, 0x08, 0x96, 0x61, 0xc2, 0x19, + 0x2c, 0x08, 0x96, 0x29, 0xc2, 0x01, 0x30, 0x08, 0x95, 0xf8, 0xc2, 0x00, + 0xd0, 0x08, 0x96, 0x81, 0x83, 0x08, 0x96, 0x79, 0x06, 0x41, 0xe5, 0x9d, + 0xc2, 0x00, 0xd0, 0x08, 0x96, 0x71, 0x83, 0x08, 0x96, 0x69, 0x16, 0x41, + 0xe5, 0xad, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0x21, 0xc2, 0x25, 0x3b, 0x08, + 0x96, 0x19, 0x83, 0x08, 0x96, 0x10, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0x09, + 0x83, 0x08, 0x96, 0x00, 0xc2, 0x00, 0xd0, 0x08, 0x95, 0xf1, 0xc2, 0x01, + 0x30, 0x08, 0x95, 0xe9, 0x83, 0x08, 0x95, 0xe0, 0xc2, 0x00, 0xd0, 0x08, + 0x95, 0xd9, 0x83, 0x08, 0x95, 0xd0, 0x97, 0x08, 0x95, 0xc9, 0x8b, 0x08, + 0x95, 0xb9, 0x83, 0x08, 0x95, 0x88, 0x97, 0x08, 0x95, 0xa8, 0x8b, 0x08, + 0x95, 0x98, 0x15, 0xc1, 0xe5, 0xb7, 0xc5, 0x33, 0x5d, 0x08, 0x91, 0xb1, + 0xc6, 0x1e, 0x95, 0x08, 0x91, 0xa9, 0xc8, 0x14, 0x38, 0x08, 0x91, 0xa0, + 0xcc, 0x45, 0x8d, 0x08, 0x91, 0xe1, 0xc5, 0x86, 0x20, 0x08, 0x91, 0xc8, + 0x97, 0x08, 0x91, 0x99, 0x8b, 0x08, 0x91, 0x89, 0x83, 0x08, 0x91, 0x60, + 0x8b, 0x08, 0x91, 0x70, 0xc2, 0x00, 0xdb, 0x08, 0x91, 0x59, 0x83, 0x08, + 0x91, 0x38, 0xc2, 0x00, 0xd0, 0x08, 0x91, 0x19, 0xc2, 0x01, 0x5d, 0x08, + 0x91, 0x11, 0x83, 0x08, 0x91, 0x08, 0xc2, 0x00, 0xd0, 0x08, 0x91, 0x01, + 0x83, 0x08, 0x90, 0xf8, 0x83, 0x08, 0x90, 0xf1, 0xc2, 0x00, 0xc1, 0x08, + 0x90, 0xc1, 
0xc2, 0x19, 0x2c, 0x08, 0x90, 0x99, 0xc2, 0x01, 0x30, 0x08, + 0x90, 0x68, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0xe9, 0x06, 0xc1, 0xe5, 0xc3, + 0x83, 0x08, 0x90, 0xd8, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0xd1, 0x83, 0x08, + 0x90, 0xc9, 0x16, 0x41, 0xe5, 0xd3, 0xc2, 0x25, 0x3b, 0x08, 0x90, 0x89, + 0x83, 0x08, 0x90, 0x80, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0x79, 0x83, 0x08, + 0x90, 0x70, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0x61, 0xc2, 0x01, 0x30, 0x08, + 0x90, 0x59, 0x83, 0x08, 0x90, 0x50, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0x49, + 0x83, 0x08, 0x90, 0x40, 0x97, 0x08, 0x90, 0x39, 0x8b, 0x08, 0x90, 0x29, + 0x83, 0x08, 0x90, 0x08, 0x43, 0x4e, 0xf0, 0xc1, 0xe5, 0xdd, 0x12, 0xc1, + 0xe5, 0xe5, 0x04, 0xc1, 0xe5, 0xf7, 0x45, 0xda, 0x97, 0xc1, 0xe6, 0x03, + 0xc9, 0xb2, 0x51, 0x00, 0xcf, 0x81, 0x4a, 0xa2, 0x42, 0x41, 0xe6, 0x0f, + 0x03, 0xc1, 0xe6, 0x23, 0x0d, 0xc1, 0xe6, 0x35, 0xcb, 0x93, 0x93, 0x00, + 0xbe, 0xc9, 0x04, 0xc1, 0xe6, 0x47, 0xc7, 0xc2, 0x1f, 0x00, 0xbe, 0xb9, + 0x05, 0xc1, 0xe6, 0x51, 0xc6, 0xcb, 0x69, 0x00, 0xbe, 0x89, 0xcd, 0x78, + 0x23, 0x00, 0xbe, 0x81, 0x16, 0xc1, 0xe6, 0x5d, 0x14, 0xc1, 0xe6, 0x69, + 0xcb, 0x99, 0xfa, 0x00, 0xbe, 0x49, 0xcd, 0x7d, 0x1d, 0x00, 0xbe, 0x41, + 0xc7, 0xc4, 0x41, 0x00, 0xbe, 0x30, 0xc4, 0x18, 0x10, 0x00, 0xbf, 0x39, + 0xc2, 0x22, 0xcc, 0x00, 0xbf, 0x30, 0xc3, 0x0d, 0x14, 0x00, 0xbf, 0x29, + 0xc3, 0x09, 0x9e, 0x00, 0xbf, 0x20, 0xc4, 0x02, 0xde, 0x00, 0xbf, 0x19, + 0xc2, 0x02, 0xa0, 0x00, 0xbf, 0x10, 0x03, 0xc1, 0xe6, 0x75, 0x11, 0xc1, + 0xe6, 0x85, 0x87, 0x00, 0xbe, 0x09, 0x8b, 0x00, 0xbd, 0xbb, 0x01, 0xe6, + 0x8d, 0x9b, 0x00, 0xbd, 0xcb, 0x01, 0xe6, 0x95, 0x97, 0x00, 0xbd, 0xda, + 0x01, 0xe6, 0x9d, 0x83, 0x00, 0xbd, 0xa9, 0x93, 0x00, 0xbd, 0xa0, 0x03, + 0xc1, 0xe6, 0xa5, 0x48, 0xb7, 0x6a, 0xc1, 0xe6, 0xb5, 0x87, 0x00, 0xbd, + 0x79, 0x97, 0x00, 0xbd, 0x3b, 0x01, 0xe6, 0xc1, 0x8b, 0x00, 0xbd, 0x2a, + 0x01, 0xe6, 0xcc, 0x9b, 0x00, 0xbd, 0x70, 0x9b, 0x00, 0xbd, 0x60, 0x83, + 0x00, 0xbd, 0x09, 0x91, 0x00, 0xbc, 0xd8, 0x83, 0x00, 0xbc, 0xf9, 0xc2, + 0x00, 0xfb, 0x00, 0xbc, 0xf1, 0xc2, 0x00, 0xd0, 0x00, 0xbc, 0xe8, 0x0a, + 0xc1, 0xe6, 0xd0, 0x91, 0x00, 0xbc, 0xb0, 0x91, 0x00, 0xbc, 0x99, 0xc2, + 0x00, 0x10, 0x00, 0xbc, 0x71, 0xc2, 0x42, 0xcd, 0x00, 0xbc, 0x49, 0xc2, + 0x0f, 0x7b, 0x00, 0xbc, 0x20, 0x0a, 0xc1, 0xe6, 0xd8, 0x91, 0x00, 0xbc, + 0x89, 0x83, 0x00, 0xbc, 0x79, 0x42, 0x00, 0x8e, 0x41, 0xe6, 0xe0, 0x91, + 0x00, 0xbc, 0x61, 0x83, 0x00, 0xbc, 0x50, 0x0a, 0xc1, 0xe6, 0xe8, 0x91, + 0x00, 0xbc, 0x39, 0x83, 0x00, 0xbc, 0x28, 0x0a, 0xc1, 0xe6, 0xf0, 0x91, + 0x00, 0xbc, 0x11, 0x83, 0x00, 0xbc, 0x00, 0xc4, 0x22, 0xd6, 0x08, 0x52, + 0xc1, 0xc4, 0x6e, 0x13, 0x08, 0x52, 0xa8, 0x11, 0xc1, 0xe6, 0xf8, 0xc4, + 0x19, 0x53, 0x08, 0x52, 0xb0, 0xcb, 0x80, 0xaa, 0x08, 0x52, 0x99, 0xc5, + 0x02, 0xd2, 0x08, 0x52, 0x90, 0xc8, 0x4b, 0x94, 0x08, 0x52, 0x39, 0xc7, + 0x0d, 0x04, 0x08, 0x52, 0x30, 0xc5, 0x28, 0xee, 0x08, 0x52, 0x29, 0xc2, + 0x00, 0xc4, 0x08, 0x52, 0x20, 0xc4, 0x02, 0xde, 0x08, 0x52, 0x11, 0xc2, + 0x02, 0xa0, 0x08, 0x52, 0x08, 0xcb, 0x36, 0x51, 0x08, 0x50, 0x61, 0x45, + 0x00, 0xba, 0x41, 0xe7, 0x02, 0xc7, 0x0e, 0x70, 0x08, 0x51, 0xd1, 0xcf, + 0x65, 0xa3, 0x08, 0x50, 0x68, 0xc2, 0x00, 0xd0, 0x08, 0x51, 0xa9, 0x83, + 0x08, 0x51, 0x60, 0x16, 0xc1, 0xe7, 0x18, 0xc2, 0x00, 0xd0, 0x08, 0x51, + 0x01, 0x83, 0x08, 0x50, 0xf8, 0xc2, 0x00, 0xd0, 0x08, 0x51, 0x39, 0x83, + 0x08, 0x51, 0x30, 0xc2, 0x00, 0xd0, 0x08, 0x51, 0x29, 0x83, 0x08, 0x51, + 0x20, 0x83, 0x08, 0x51, 0x19, 0xc2, 0x00, 0xc1, 0x08, 0x50, 0xf1, 0xc2, + 0x19, 0x2c, 0x08, 0x50, 0xc8, 0xc2, 0x00, 0xd0, 0x08, 0x51, 0x11, 0x83, + 0x08, 0x51, 
0x09, 0x06, 0x41, 0xe7, 0x26, 0xc2, 0x00, 0xd0, 0x08, 0x50, + 0xb1, 0x83, 0x08, 0x50, 0xa8, 0xc2, 0x00, 0xd0, 0x08, 0x50, 0x99, 0x83, + 0x08, 0x50, 0x90, 0xc2, 0x00, 0xd0, 0x08, 0x50, 0x89, 0x83, 0x08, 0x50, + 0x81, 0xc2, 0x02, 0x2b, 0x08, 0x51, 0x90, 0xc2, 0x00, 0xd0, 0x08, 0x51, + 0x69, 0xc2, 0x0d, 0xf6, 0x08, 0x51, 0x71, 0x83, 0x08, 0x51, 0x78, 0x46, + 0x00, 0x8b, 0x41, 0xe7, 0x30, 0xca, 0xa7, 0x92, 0x0f, 0xd2, 0x53, 0x01, + 0xe7, 0x3c, 0xc5, 0xa8, 0xf7, 0x0f, 0xd0, 0x0b, 0x01, 0xe7, 0x42, 0x0d, + 0xc1, 0xe7, 0x48, 0xc6, 0xca, 0xfd, 0x0f, 0xd0, 0x1b, 0x01, 0xe7, 0x5a, + 0xc4, 0xde, 0x83, 0x0f, 0xd0, 0x13, 0x01, 0xe7, 0x60, 0xc4, 0xe3, 0x93, + 0x0f, 0xd0, 0x2b, 0x01, 0xe7, 0x66, 0x47, 0x45, 0x86, 0x41, 0xe7, 0x6c, + 0x0b, 0xc1, 0xe7, 0x88, 0xca, 0xa0, 0x26, 0x08, 0xa2, 0xf0, 0x18, 0xc1, + 0xe7, 0x94, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0xa1, 0x15, 0xc1, 0xe7, 0xa0, + 0x10, 0xc1, 0xe7, 0xb0, 0x06, 0xc1, 0xe7, 0xc8, 0x16, 0xc1, 0xe7, 0xd6, + 0x0c, 0xc1, 0xe7, 0xe4, 0x05, 0xc1, 0xe7, 0xee, 0x09, 0xc1, 0xe7, 0xf8, + 0x0d, 0xc1, 0xe8, 0x02, 0x83, 0x08, 0xa0, 0x03, 0x01, 0xe8, 0x0c, 0x91, + 0x08, 0xa0, 0x61, 0x87, 0x08, 0xa0, 0x51, 0x97, 0x08, 0xa0, 0x23, 0x01, + 0xe8, 0x18, 0x8b, 0x08, 0xa0, 0x13, 0x01, 0xe8, 0x1c, 0x12, 0xc1, 0xe8, + 0x20, 0x04, 0xc1, 0xe8, 0x2a, 0x0f, 0xc1, 0xe8, 0x34, 0xc2, 0x19, 0x2c, + 0x08, 0xa1, 0x59, 0x14, 0xc1, 0xe8, 0x3e, 0x0e, 0xc1, 0xe8, 0x48, 0xc2, + 0x01, 0x4a, 0x08, 0xa1, 0x80, 0x46, 0x00, 0x59, 0xc1, 0xe8, 0x52, 0x45, + 0x09, 0x98, 0xc1, 0xe8, 0x5e, 0xc4, 0x19, 0x53, 0x08, 0xa2, 0x58, 0x03, + 0xc1, 0xe8, 0x82, 0x91, 0x08, 0xa2, 0x01, 0x87, 0x08, 0xa1, 0xf1, 0x48, + 0xb2, 0x2d, 0xc1, 0xe8, 0x8e, 0x97, 0x08, 0xa1, 0xc3, 0x01, 0xe8, 0x9c, + 0x8b, 0x08, 0xa1, 0xb2, 0x01, 0xe8, 0xa0, 0xc8, 0xb9, 0x72, 0x00, 0xce, + 0xf3, 0x01, 0xe8, 0xa4, 0x16, 0xc1, 0xe8, 0xa8, 0x46, 0x09, 0x97, 0xc1, + 0xe8, 0xb4, 0x47, 0x02, 0x0e, 0xc1, 0xe8, 0xd8, 0x4b, 0x6f, 0xc7, 0x41, + 0xe8, 0xea, 0xc9, 0xb2, 0xa2, 0x0f, 0x98, 0xd1, 0xc6, 0x00, 0x91, 0x0f, + 0x98, 0x88, 0xca, 0xa2, 0x88, 0x01, 0x3a, 0x71, 0xc2, 0x15, 0x95, 0x0f, + 0x8c, 0x79, 0xc2, 0x00, 0x03, 0x0f, 0x8c, 0x71, 0xc2, 0x0d, 0xf6, 0x0f, + 0x8c, 0x69, 0xc2, 0x00, 0xb0, 0x0f, 0x8c, 0x61, 0xc2, 0x00, 0x63, 0x0f, + 0x8c, 0x59, 0x55, 0x0b, 0x11, 0xc1, 0xe9, 0x0a, 0xcd, 0x2c, 0xb2, 0x0f, + 0xde, 0x20, 0xca, 0xa3, 0xd2, 0x01, 0x27, 0xf9, 0x47, 0x34, 0x2f, 0xc1, + 0xe9, 0x72, 0x55, 0x0b, 0x11, 0xc1, 0xe9, 0x88, 0xc8, 0x01, 0x92, 0x0f, + 0xbe, 0xb1, 0xc6, 0x0b, 0x09, 0x0f, 0xbe, 0xc0, 0xc5, 0x0d, 0x20, 0x0f, + 0xdd, 0xe9, 0xdc, 0x04, 0xcb, 0x0f, 0xdd, 0xf1, 0xc7, 0x3a, 0x19, 0x0f, + 0xdd, 0xf8, 0xd6, 0x2d, 0xd0, 0x01, 0x14, 0x49, 0xd4, 0x3a, 0x20, 0x01, + 0x14, 0x40, 0xe0, 0x07, 0x47, 0x01, 0x12, 0x38, 0xca, 0x37, 0x4e, 0x01, + 0x13, 0xa9, 0xc5, 0x07, 0x62, 0x01, 0x13, 0x88, 0xca, 0x37, 0x4e, 0x01, + 0x13, 0xa1, 0xc5, 0x07, 0x62, 0x01, 0x13, 0x80, 0xcf, 0x61, 0x11, 0x08, + 0xcf, 0x21, 0x03, 0xc1, 0xe9, 0xf0, 0x91, 0x08, 0xce, 0xe1, 0x87, 0x08, + 0xce, 0xd1, 0xc9, 0xb2, 0x2d, 0x08, 0xce, 0xb3, 0x01, 0xe9, 0xfc, 0x97, + 0x08, 0xce, 0xa3, 0x01, 0xea, 0x00, 0x8b, 0x08, 0xce, 0x92, 0x01, 0xea, + 0x04, 0xc7, 0xc3, 0x61, 0x08, 0xcf, 0x11, 0x03, 0xc1, 0xea, 0x08, 0x42, + 0x07, 0xb2, 0x41, 0xea, 0x14, 0x14, 0xc1, 0xea, 0x20, 0x0e, 0xc1, 0xea, + 0x2a, 0xc2, 0x00, 0xd0, 0x08, 0xce, 0x71, 0x15, 0xc1, 0xea, 0x34, 0x18, + 0xc1, 0xea, 0x44, 0xc2, 0x19, 0x2c, 0x08, 0xce, 0x39, 0xc2, 0x01, 0xc3, + 0x08, 0xce, 0x31, 0x04, 0xc1, 0xea, 0x51, 0x12, 0xc1, 0xea, 0x5b, 0x10, + 0xc1, 0xea, 0x65, 0x06, 0xc1, 0xea, 0x7b, 0x16, 0xc1, 0xea, 0x89, 0x0c, + 0xc1, 0xea, 
0x97, 0x05, 0xc1, 0xea, 0xa1, 0x09, 0xc1, 0xea, 0xab, 0x0d, + 0xc1, 0xea, 0xb5, 0x83, 0x08, 0xcd, 0x03, 0x01, 0xea, 0xbf, 0x91, 0x08, + 0xcd, 0x61, 0x87, 0x08, 0xcd, 0x51, 0x97, 0x08, 0xcd, 0x23, 0x01, 0xea, + 0xcb, 0x8b, 0x08, 0xcd, 0x12, 0x01, 0xea, 0xcf, 0xc3, 0x05, 0x14, 0x08, + 0x45, 0x3b, 0x01, 0xea, 0xd3, 0x16, 0xc1, 0xea, 0xd9, 0x08, 0x41, 0xea, + 0xe9, 0x16, 0xc1, 0xea, 0xf5, 0x15, 0xc1, 0xeb, 0x01, 0x46, 0x26, 0xf7, + 0xc1, 0xeb, 0x0b, 0xc4, 0x5d, 0xe2, 0x08, 0x44, 0xd9, 0xc4, 0xb9, 0x7e, + 0x08, 0x44, 0xd1, 0xc2, 0x00, 0x67, 0x08, 0x44, 0xc1, 0x03, 0xc1, 0xeb, + 0x41, 0xc3, 0x20, 0x18, 0x08, 0x44, 0xa9, 0xc3, 0x00, 0x4e, 0x08, 0x44, + 0x99, 0xc6, 0xcf, 0xd7, 0x08, 0x44, 0x89, 0xc4, 0xe0, 0xe7, 0x08, 0x44, + 0x79, 0xc4, 0x4a, 0xb9, 0x08, 0x44, 0x69, 0xc2, 0x01, 0x7f, 0x08, 0x44, + 0x3b, 0x01, 0xeb, 0x4d, 0xc5, 0x4a, 0xb3, 0x08, 0x44, 0x49, 0xc3, 0x7e, + 0x89, 0x08, 0x44, 0x41, 0xc6, 0x40, 0x9a, 0x08, 0x44, 0x29, 0xc5, 0x9c, + 0xa2, 0x08, 0x44, 0x21, 0xc4, 0xe3, 0x27, 0x08, 0x44, 0x18, 0x45, 0x20, + 0x6c, 0xc1, 0xeb, 0x53, 0x45, 0x15, 0xa7, 0xc1, 0xeb, 0x7e, 0x46, 0x09, + 0x91, 0x41, 0xeb, 0xa9, 0xde, 0x0e, 0x32, 0x0f, 0xaa, 0x19, 0x4a, 0x00, + 0x27, 0x41, 0xeb, 0xc1, 0xe0, 0x0c, 0x07, 0x01, 0x3d, 0x88, 0xcc, 0x23, + 0x9f, 0x01, 0x17, 0x60, 0x46, 0x1f, 0x87, 0xc1, 0xeb, 0xc7, 0xc3, 0x00, + 0xbb, 0x00, 0x05, 0x60, 0xc3, 0x33, 0xa8, 0x01, 0x15, 0x69, 0xc4, 0x1e, + 0xc9, 0x01, 0x12, 0x08, 0x43, 0x07, 0x28, 0xc1, 0xeb, 0xd3, 0xce, 0x66, + 0xcf, 0x01, 0x12, 0x49, 0xd6, 0x2b, 0xc0, 0x01, 0x12, 0x21, 0xcc, 0x81, + 0x99, 0x01, 0x10, 0x48, 0xca, 0x37, 0x4e, 0x01, 0x13, 0x69, 0xc5, 0x07, + 0x62, 0x01, 0x13, 0x00, 0x86, 0x0f, 0xae, 0x51, 0xc2, 0x09, 0x3b, 0x0f, + 0xae, 0x48, 0xd6, 0x2b, 0x68, 0x0f, 0xa6, 0xa0, 0x87, 0x0f, 0x09, 0x58, + 0x91, 0x0f, 0x09, 0x48, 0x83, 0x0f, 0x09, 0x28, 0xc2, 0x00, 0x39, 0x0f, + 0x09, 0x19, 0x83, 0x0f, 0x08, 0xb0, 0xc2, 0x00, 0xdb, 0x0f, 0x09, 0x09, + 0x83, 0x0f, 0x08, 0xd0, 0xc2, 0x00, 0xdb, 0x0f, 0x09, 0x01, 0x83, 0x0f, + 0x08, 0x00, 0x8a, 0x0f, 0x08, 0xf8, 0x12, 0xc1, 0xeb, 0xdf, 0xc2, 0x0f, + 0x9a, 0x0f, 0x08, 0xc9, 0x16, 0xc1, 0xeb, 0xe9, 0xc2, 0x00, 0x39, 0x0f, + 0x08, 0x89, 0xc2, 0x19, 0x2c, 0x0f, 0x08, 0x81, 0xc2, 0x00, 0x64, 0x0f, + 0x08, 0x61, 0xc2, 0x02, 0x2b, 0x0f, 0x08, 0x39, 0x83, 0x0f, 0x08, 0x28, + 0xc2, 0x00, 0xdb, 0x0f, 0x08, 0xe9, 0x83, 0x0f, 0x08, 0x78, 0xc2, 0x19, + 0x2c, 0x0f, 0x08, 0xd9, 0x83, 0x0f, 0x08, 0x30, 0xc2, 0x8d, 0x8f, 0x0f, + 0x08, 0xa1, 0x83, 0x0f, 0x08, 0x19, 0xc2, 0x0d, 0xf6, 0x0f, 0x08, 0x08, + 0xcc, 0x86, 0x61, 0x0f, 0x09, 0xd9, 0xc6, 0xcc, 0x9b, 0x0f, 0x09, 0xd1, + 0xc8, 0x7f, 0x59, 0x0f, 0x09, 0xc9, 0xc5, 0xd8, 0x2b, 0x0f, 0x09, 0xc1, + 0xc6, 0x18, 0x8e, 0x0f, 0x09, 0xb8, 0x08, 0xc1, 0xeb, 0xf9, 0x07, 0xc1, + 0xec, 0x29, 0x04, 0xc1, 0xec, 0x69, 0x26, 0xc1, 0xec, 0xa9, 0x25, 0xc1, + 0xec, 0xe9, 0x24, 0xc1, 0xed, 0x29, 0x23, 0xc1, 0xed, 0x69, 0x22, 0xc1, + 0xed, 0xa9, 0x21, 0xc1, 0xed, 0xe9, 0x20, 0xc1, 0xee, 0x29, 0x1f, 0xc1, + 0xee, 0x69, 0x1e, 0xc1, 0xee, 0xa9, 0x1d, 0xc1, 0xee, 0xe9, 0x06, 0xc1, + 0xef, 0x29, 0x05, 0xc1, 0xef, 0x69, 0x03, 0x41, 0xef, 0xa9, 0x08, 0xc1, + 0xef, 0xe9, 0x07, 0xc1, 0xf0, 0x29, 0x06, 0xc1, 0xf0, 0x69, 0x05, 0xc1, + 0xf0, 0xa9, 0x04, 0xc1, 0xf0, 0xe9, 0x03, 0xc1, 0xf1, 0x29, 0x26, 0xc1, + 0xf1, 0x69, 0x25, 0xc1, 0xf1, 0xa9, 0x24, 0x41, 0xf1, 0xe9, 0x42, 0x00, + 0x28, 0xc1, 0xf2, 0x29, 0xd1, 0x52, 0xcc, 0x01, 0x24, 0xa1, 0xcc, 0x48, + 0x29, 0x01, 0x24, 0x88, 0xd1, 0x56, 0xa6, 0x01, 0x24, 0xc9, 0xcf, 0x66, + 0xb1, 0x01, 0x24, 0x90, 0xd2, 0x48, 0x23, 0x01, 0x24, 0xc1, 0x0b, 0x41, + 0xf2, 0x35, 
0xd0, 0x59, 0x52, 0x01, 0x24, 0xb1, 0xd1, 0x53, 0x65, 0x01, + 0x24, 0xa8, 0xc4, 0x18, 0x10, 0x00, 0x3e, 0x39, 0xc2, 0x22, 0xcc, 0x00, + 0x3e, 0x30, 0xc3, 0x0d, 0x14, 0x00, 0x3e, 0x29, 0xc3, 0x09, 0x9e, 0x00, + 0x3e, 0x20, 0xc4, 0x02, 0xde, 0x00, 0x3e, 0x19, 0xc2, 0x02, 0xa0, 0x00, + 0x3e, 0x10, 0x44, 0xe4, 0x3f, 0xc1, 0xf2, 0x41, 0x83, 0x00, 0x3e, 0xb0, + 0xc2, 0x19, 0x2c, 0x00, 0x3f, 0x13, 0x01, 0xf2, 0x53, 0x83, 0x00, 0x3f, + 0x1a, 0x01, 0xf2, 0x59, 0xc2, 0x00, 0x39, 0x00, 0x3e, 0xd1, 0x83, 0x00, + 0x3e, 0xc8, 0xc8, 0xbc, 0x92, 0x00, 0x3e, 0x88, 0x91, 0x00, 0x3e, 0x78, + 0x87, 0x00, 0x3e, 0x58, 0xcb, 0x5a, 0x32, 0x00, 0x3f, 0x89, 0xc8, 0xae, + 0xfb, 0x00, 0x3f, 0x81, 0xc9, 0x3d, 0x18, 0x00, 0x3f, 0x79, 0xcf, 0x64, + 0x1d, 0x00, 0x3f, 0x70, 0xcb, 0x5a, 0x32, 0x00, 0x3f, 0x69, 0xc8, 0xae, + 0xfb, 0x00, 0x3f, 0x61, 0xc9, 0x3d, 0x18, 0x00, 0x3f, 0x58, 0x46, 0x00, + 0x8b, 0x41, 0xf2, 0x5f, 0x95, 0x0f, 0xae, 0x68, 0xc3, 0x23, 0x2f, 0x0f, + 0xae, 0x2b, 0x01, 0xf2, 0x77, 0xc3, 0x15, 0xa8, 0x0f, 0xd5, 0xc8, 0xc5, + 0x11, 0x0d, 0x01, 0x1e, 0xd1, 0x45, 0xd9, 0x2f, 0x41, 0xf2, 0x7d, 0xc4, + 0x9e, 0x9c, 0x0f, 0x99, 0xf1, 0xc5, 0xdb, 0x9b, 0x0f, 0x99, 0xe8, 0x20, + 0xc1, 0xf2, 0x87, 0x1f, 0xc1, 0xf2, 0xaa, 0x1e, 0xc1, 0xf2, 0xd8, 0x1d, + 0x41, 0xf3, 0x06, 0xa6, 0x09, 0x82, 0xc9, 0xa5, 0x09, 0x82, 0xc1, 0xa4, + 0x09, 0x82, 0xb9, 0xa3, 0x09, 0x82, 0xb1, 0xa2, 0x09, 0x82, 0xa3, 0x01, + 0xf3, 0x30, 0xa1, 0x09, 0x82, 0x99, 0xa0, 0x09, 0x82, 0x91, 0x9f, 0x09, + 0x82, 0x89, 0x9e, 0x09, 0x82, 0x80, 0x22, 0xc1, 0xf3, 0x34, 0x21, 0xc1, + 0xf3, 0x3f, 0x20, 0xc1, 0xf3, 0x67, 0x1f, 0xc1, 0xf3, 0x98, 0x1e, 0xc1, + 0xf3, 0xcc, 0x1d, 0x41, 0xf3, 0xfa, 0x47, 0x07, 0x9a, 0xc1, 0xf4, 0x27, + 0x44, 0x00, 0xf1, 0x41, 0xf4, 0x33, 0x1e, 0xc1, 0xf4, 0x3f, 0x1d, 0x41, + 0xf4, 0x5d, 0xa5, 0x09, 0x8c, 0x39, 0xa4, 0x09, 0x8c, 0x31, 0xa3, 0x09, + 0x8c, 0x23, 0x01, 0xf4, 0x87, 0xa2, 0x09, 0x8c, 0x19, 0xa1, 0x09, 0x8c, + 0x11, 0xa0, 0x09, 0x8c, 0x09, 0x9f, 0x09, 0x8c, 0x01, 0x9e, 0x09, 0x8b, + 0xf8, 0xc2, 0xe6, 0x77, 0x09, 0x9d, 0x6b, 0x01, 0xf4, 0x8b, 0x20, 0xc1, + 0xf4, 0x8f, 0x1f, 0xc1, 0xf4, 0xc3, 0x1e, 0xc1, 0xf4, 0xf7, 0x1d, 0x41, + 0xf5, 0x25, 0x20, 0xc1, 0xf5, 0x52, 0x1f, 0xc1, 0xf5, 0x5e, 0x1e, 0xc1, + 0xf5, 0x86, 0x1d, 0x41, 0xf5, 0xae, 0xc2, 0xe4, 0xef, 0x09, 0x82, 0x79, + 0x23, 0xc1, 0xf5, 0xd5, 0x22, 0xc1, 0xf5, 0xfd, 0x21, 0xc1, 0xf6, 0x25, + 0x20, 0xc1, 0xf6, 0x59, 0x1f, 0xc1, 0xf6, 0x84, 0x1e, 0xc1, 0xf6, 0xac, + 0x1d, 0x41, 0xf6, 0xda, 0xa3, 0x09, 0xa0, 0x23, 0x01, 0xf7, 0x04, 0xa2, + 0x09, 0x9f, 0xd3, 0x01, 0xf7, 0x24, 0xa1, 0x09, 0x9f, 0xc9, 0xa0, 0x09, + 0x9f, 0xc1, 0x9f, 0x09, 0x9f, 0xb9, 0x9e, 0x09, 0x9f, 0xb1, 0x9d, 0x09, + 0x9f, 0xa8, 0xa6, 0x09, 0x9f, 0xa1, 0xa5, 0x09, 0x9f, 0x99, 0xa4, 0x09, + 0x9f, 0x91, 0xa3, 0x09, 0x9f, 0x89, 0xa2, 0x09, 0x9f, 0x7b, 0x01, 0xf7, + 0x48, 0xa1, 0x09, 0x9f, 0x6b, 0x01, 0xf7, 0x4c, 0xa0, 0x09, 0x9f, 0x53, + 0x01, 0xf7, 0x50, 0x9f, 0x09, 0x9f, 0x2b, 0x01, 0xf7, 0x58, 0x9e, 0x09, + 0x9f, 0x20, 0x83, 0x09, 0x9e, 0xe0, 0x83, 0x09, 0x9e, 0xd0, 0x83, 0x09, + 0x9e, 0xb8, 0x84, 0x09, 0x9e, 0xa1, 0x83, 0x09, 0x9e, 0x98, 0xa2, 0x09, + 0x9e, 0x71, 0xa1, 0x09, 0x9e, 0x63, 0x01, 0xf7, 0x68, 0xa0, 0x09, 0x9e, + 0x59, 0x9f, 0x09, 0x9e, 0x51, 0x9e, 0x09, 0x9e, 0x49, 0x9d, 0x09, 0x9e, + 0x40, 0xa6, 0x09, 0x9e, 0x39, 0xa5, 0x09, 0x9e, 0x2b, 0x01, 0xf7, 0x6c, + 0xa4, 0x09, 0x9e, 0x1b, 0x01, 0xf7, 0x70, 0xa3, 0x09, 0x9e, 0x11, 0xa2, + 0x09, 0x9e, 0x09, 0xa1, 0x09, 0x9d, 0xfb, 0x01, 0xf7, 0x74, 0xa0, 0x09, + 0x9d, 0xf1, 0x9f, 0x09, 0x9d, 0xe9, 0x9e, 0x09, 0x9d, 0xe1, 0x9d, 0x09, + 0x9d, 0xd2, 
0x01, 0xf7, 0x78, 0xa6, 0x09, 0x9d, 0xc3, 0x01, 0xf7, 0x7c, + 0xa5, 0x09, 0x9d, 0xb9, 0xa4, 0x09, 0x9d, 0xb1, 0xa3, 0x09, 0x9d, 0xa9, + 0xa2, 0x09, 0x9d, 0xa1, 0xa1, 0x09, 0x9d, 0x99, 0xa0, 0x09, 0x9d, 0x8b, + 0x01, 0xf7, 0x80, 0x9f, 0x09, 0x9d, 0x81, 0x9e, 0x09, 0x9d, 0x78, 0x9f, + 0x09, 0x9b, 0x09, 0x9e, 0x09, 0x9b, 0x01, 0x9d, 0x09, 0x9a, 0xf8, 0xa6, + 0x09, 0x9a, 0xf1, 0xa5, 0x09, 0x9a, 0xe9, 0xa4, 0x09, 0x9a, 0xe1, 0xa3, + 0x09, 0x9a, 0xd9, 0xa2, 0x09, 0x9a, 0xd1, 0xa1, 0x09, 0x9a, 0xc9, 0xa0, + 0x09, 0x9a, 0xc1, 0x9f, 0x09, 0x9a, 0xb3, 0x01, 0xf7, 0x84, 0x9e, 0x09, + 0x9a, 0xa9, 0x9d, 0x09, 0x9a, 0xa0, 0xa6, 0x09, 0x9a, 0x93, 0x01, 0xf7, + 0x88, 0xa5, 0x09, 0x9a, 0x89, 0xa4, 0x09, 0x9a, 0x81, 0xa3, 0x09, 0x9a, + 0x79, 0xa2, 0x09, 0x9a, 0x71, 0xa1, 0x09, 0x9a, 0x69, 0xa0, 0x09, 0x9a, + 0x5b, 0x01, 0xf7, 0x8c, 0x9f, 0x09, 0x9a, 0x51, 0x9e, 0x09, 0x9a, 0x49, + 0x9d, 0x09, 0x9a, 0x40, 0xa6, 0x09, 0x9a, 0x39, 0xa5, 0x09, 0x9a, 0x31, + 0xa4, 0x09, 0x9a, 0x29, 0xa3, 0x09, 0x9a, 0x21, 0xa2, 0x09, 0x9a, 0x19, + 0xa1, 0x09, 0x9a, 0x11, 0xa0, 0x09, 0x9a, 0x09, 0x9f, 0x09, 0x9a, 0x01, + 0x9e, 0x09, 0x99, 0xf9, 0x9d, 0x09, 0x99, 0xf0, 0xa6, 0x09, 0x99, 0xe9, + 0xa5, 0x09, 0x99, 0xe1, 0xa4, 0x09, 0x99, 0xd9, 0xa3, 0x09, 0x99, 0xc3, + 0x01, 0xf7, 0x90, 0xa2, 0x09, 0x99, 0xb9, 0xa1, 0x09, 0x99, 0xb1, 0xa0, + 0x09, 0x99, 0xa9, 0x9f, 0x09, 0x99, 0xa1, 0x9e, 0x09, 0x99, 0x98, 0xa3, + 0x09, 0x99, 0x91, 0xa2, 0x09, 0x99, 0x89, 0xa1, 0x09, 0x99, 0x81, 0xa0, + 0x09, 0x99, 0x73, 0x01, 0xf7, 0x98, 0x9f, 0x09, 0x99, 0x63, 0x01, 0xf7, + 0x9c, 0x9e, 0x09, 0x99, 0x59, 0x9d, 0x09, 0x99, 0x50, 0xa6, 0x09, 0x99, + 0x49, 0xa5, 0x09, 0x99, 0x41, 0xa4, 0x09, 0x99, 0x39, 0xa3, 0x09, 0x99, + 0x31, 0xa2, 0x09, 0x99, 0x29, 0xa1, 0x09, 0x99, 0x21, 0xa0, 0x09, 0x99, + 0x19, 0x9f, 0x09, 0x99, 0x11, 0x9e, 0x09, 0x99, 0x09, 0x9d, 0x09, 0x99, + 0x00, 0xa6, 0x09, 0x98, 0xf9, 0xa5, 0x09, 0x98, 0xf1, 0xa4, 0x09, 0x98, + 0xe9, 0xa3, 0x09, 0x98, 0xdb, 0x01, 0xf7, 0xa0, 0xa2, 0x09, 0x98, 0xd1, + 0xa1, 0x09, 0x98, 0xc9, 0xa0, 0x09, 0x98, 0xc1, 0x9f, 0x09, 0x98, 0xb9, + 0x9e, 0x09, 0x98, 0xab, 0x01, 0xf7, 0xa4, 0x9d, 0x09, 0x98, 0xa0, 0xa6, + 0x09, 0x98, 0x93, 0x01, 0xf7, 0xa8, 0xa5, 0x09, 0x98, 0x83, 0x01, 0xf7, + 0xac, 0xa4, 0x09, 0x98, 0x73, 0x01, 0xf7, 0xb0, 0xa3, 0x09, 0x98, 0x69, + 0xa2, 0x09, 0x98, 0x61, 0xa1, 0x09, 0x98, 0x59, 0xa0, 0x09, 0x98, 0x4b, + 0x01, 0xf7, 0xb4, 0x9f, 0x09, 0x98, 0x41, 0x9e, 0x09, 0x98, 0x38, 0xa3, + 0x09, 0x98, 0x31, 0xa2, 0x09, 0x98, 0x29, 0xa1, 0x09, 0x98, 0x21, 0xa0, + 0x09, 0x98, 0x19, 0x9f, 0x09, 0x98, 0x11, 0x9e, 0x09, 0x98, 0x09, 0x9d, + 0x09, 0x98, 0x00, 0xa6, 0x09, 0x97, 0xf9, 0xa5, 0x09, 0x97, 0xf1, 0xa4, + 0x09, 0x97, 0xe9, 0xa3, 0x09, 0x97, 0xe1, 0xa2, 0x09, 0x97, 0xd3, 0x01, + 0xf7, 0xb8, 0xa1, 0x09, 0x97, 0xc9, 0xa0, 0x09, 0x97, 0xc1, 0x9f, 0x09, + 0x97, 0xb9, 0x9e, 0x09, 0x97, 0xb1, 0x9d, 0x09, 0x97, 0xa8, 0xa6, 0x09, + 0x97, 0xa1, 0xa5, 0x09, 0x97, 0x99, 0xa4, 0x09, 0x97, 0x91, 0xa3, 0x09, + 0x97, 0x7b, 0x01, 0xf7, 0xbc, 0xa2, 0x09, 0x97, 0x71, 0xa1, 0x09, 0x97, + 0x69, 0xa0, 0x09, 0x97, 0x61, 0x9f, 0x09, 0x97, 0x59, 0x9e, 0x09, 0x97, + 0x51, 0x9d, 0x09, 0x97, 0x48, 0xa6, 0x09, 0x97, 0x41, 0xa5, 0x09, 0x97, + 0x39, 0xa4, 0x09, 0x97, 0x2b, 0x01, 0xf7, 0xc4, 0xa3, 0x09, 0x97, 0x21, + 0xa2, 0x09, 0x97, 0x19, 0xa1, 0x09, 0x97, 0x03, 0x01, 0xf7, 0xc8, 0xa0, + 0x09, 0x96, 0xf9, 0x9f, 0x09, 0x96, 0xf1, 0x9e, 0x09, 0x96, 0xe9, 0x9d, + 0x09, 0x96, 0xe0, 0xa6, 0x09, 0x96, 0xd9, 0xa5, 0x09, 0x96, 0xd1, 0xa4, + 0x09, 0x96, 0xc9, 0xa3, 0x09, 0x96, 0xbb, 0x01, 0xf7, 0xd0, 0xa2, 0x09, + 0x96, 0xb1, 
0xa1, 0x09, 0x96, 0xa9, 0xa0, 0x09, 0x96, 0xa1, 0x9f, 0x09, + 0x96, 0x93, 0x01, 0xf7, 0xd4, 0x9e, 0x09, 0x96, 0x88, 0xa6, 0x09, 0x96, + 0x81, 0xa5, 0x09, 0x96, 0x79, 0xa4, 0x09, 0x96, 0x71, 0xa3, 0x09, 0x96, + 0x69, 0xa2, 0x09, 0x96, 0x61, 0xa1, 0x09, 0x96, 0x59, 0xa0, 0x09, 0x96, + 0x51, 0x9f, 0x09, 0x96, 0x49, 0x9e, 0x09, 0x96, 0x41, 0x9d, 0x09, 0x96, + 0x38, 0xa6, 0x09, 0x96, 0x31, 0xa5, 0x09, 0x96, 0x29, 0xa4, 0x09, 0x96, + 0x21, 0xa3, 0x09, 0x96, 0x13, 0x01, 0xf7, 0xd8, 0xa2, 0x09, 0x96, 0x09, + 0xa1, 0x09, 0x96, 0x01, 0xa0, 0x09, 0x95, 0xf9, 0x9f, 0x09, 0x95, 0xf1, + 0x9e, 0x09, 0x95, 0xe9, 0x9d, 0x09, 0x95, 0xda, 0x01, 0xf7, 0xdc, 0xa6, + 0x09, 0x95, 0xd1, 0xa5, 0x09, 0x95, 0xc9, 0xa4, 0x09, 0x95, 0xc1, 0xa3, + 0x09, 0x95, 0xb9, 0xa2, 0x09, 0x95, 0xb1, 0xa1, 0x09, 0x95, 0xa9, 0xa0, + 0x09, 0x95, 0x93, 0x01, 0xf7, 0xe0, 0x9f, 0x09, 0x95, 0x83, 0x01, 0xf7, + 0xe8, 0x9e, 0x09, 0x95, 0x78, 0x9e, 0x09, 0x95, 0x39, 0x9d, 0x09, 0x95, + 0x30, 0xa6, 0x09, 0x95, 0x29, 0xa5, 0x09, 0x95, 0x21, 0xa4, 0x09, 0x95, + 0x19, 0xa3, 0x09, 0x95, 0x11, 0xa2, 0x09, 0x95, 0x09, 0xa1, 0x09, 0x95, + 0x01, 0xa0, 0x09, 0x94, 0xf3, 0x01, 0xf7, 0xec, 0x9f, 0x09, 0x94, 0xe9, + 0x9e, 0x09, 0x94, 0xda, 0x01, 0xf7, 0xf0, 0x1f, 0xc1, 0xf7, 0xf4, 0x1e, + 0xc1, 0xf8, 0x03, 0x1d, 0x41, 0xf8, 0x34, 0xc2, 0xdc, 0x39, 0x09, 0x91, + 0xa9, 0x1e, 0xc1, 0xf8, 0x58, 0x1d, 0x41, 0xf8, 0x83, 0x21, 0xc1, 0xf8, + 0xaa, 0x20, 0xc1, 0xf8, 0xb6, 0x1f, 0xc1, 0xf8, 0xea, 0x1e, 0xc1, 0xf9, + 0x15, 0x1d, 0x41, 0xf9, 0x40, 0xa1, 0x09, 0x8f, 0x71, 0xa0, 0x09, 0x8f, + 0x69, 0x9f, 0x09, 0x8f, 0x61, 0x9e, 0x09, 0x8f, 0x59, 0x9d, 0x09, 0x8f, + 0x4a, 0x01, 0xf9, 0x64, 0xa6, 0x09, 0x8f, 0x41, 0xa5, 0x09, 0x8f, 0x39, + 0xa4, 0x09, 0x8f, 0x31, 0xa3, 0x09, 0x8f, 0x29, 0xa2, 0x09, 0x8f, 0x21, + 0xa1, 0x09, 0x8f, 0x19, 0xa0, 0x09, 0x8f, 0x03, 0x01, 0xf9, 0x68, 0x9f, + 0x09, 0x8e, 0xf9, 0x9e, 0x09, 0x8e, 0xeb, 0x01, 0xf9, 0x70, 0x9d, 0x09, + 0x8e, 0xe0, 0xa6, 0x09, 0x8e, 0xd9, 0xa5, 0x09, 0x8e, 0xcb, 0x01, 0xf9, + 0x74, 0xa4, 0x09, 0x8e, 0xc1, 0xa3, 0x09, 0x8e, 0xb9, 0xa2, 0x09, 0x8e, + 0xb1, 0xa1, 0x09, 0x8e, 0xa3, 0x01, 0xf9, 0x78, 0xa0, 0x09, 0x8e, 0x99, + 0x9f, 0x09, 0x8e, 0x8b, 0x01, 0xf9, 0x7c, 0x9e, 0x09, 0x8e, 0x81, 0x9d, + 0x09, 0x8e, 0x78, 0xa6, 0x09, 0x8e, 0x71, 0xa5, 0x09, 0x8e, 0x69, 0xa4, + 0x09, 0x8e, 0x5b, 0x01, 0xf9, 0x80, 0xa3, 0x09, 0x8e, 0x4b, 0x01, 0xf9, + 0x84, 0xa2, 0x09, 0x8e, 0x3b, 0x01, 0xf9, 0x88, 0xa1, 0x09, 0x8e, 0x31, + 0xa0, 0x09, 0x8e, 0x29, 0x9f, 0x09, 0x8d, 0xe3, 0x01, 0xf9, 0x8c, 0x9e, + 0x09, 0x8d, 0xd9, 0x9d, 0x09, 0x8d, 0xca, 0x01, 0xf9, 0xac, 0xa6, 0x09, + 0x8d, 0xc1, 0xa5, 0x09, 0x8d, 0xb9, 0xa4, 0x09, 0x8d, 0xb1, 0xa3, 0x09, + 0x8d, 0xa9, 0xa2, 0x09, 0x8d, 0xa1, 0xa1, 0x09, 0x8d, 0x99, 0xa0, 0x09, + 0x8d, 0x8b, 0x01, 0xf9, 0xb0, 0x9f, 0x09, 0x8d, 0x81, 0x9e, 0x09, 0x8d, + 0x6a, 0x01, 0xf9, 0xb4, 0x83, 0x09, 0x8d, 0x50, 0x83, 0x09, 0x8d, 0x28, + 0xa1, 0x09, 0x8b, 0xf1, 0xa0, 0x09, 0x8b, 0xe9, 0x9f, 0x09, 0x8b, 0xe1, + 0x9e, 0x09, 0x8b, 0xd9, 0x9d, 0x09, 0x8b, 0xd0, 0xa6, 0x09, 0x8b, 0xc9, + 0xa5, 0x09, 0x8b, 0xc1, 0xa4, 0x09, 0x8b, 0xb9, 0xa3, 0x09, 0x8b, 0xb1, + 0xa2, 0x09, 0x8b, 0xa3, 0x01, 0xf9, 0xbc, 0xa1, 0x09, 0x8b, 0x99, 0xa0, + 0x09, 0x8b, 0x8b, 0x01, 0xf9, 0xc0, 0x9f, 0x09, 0x8b, 0x81, 0x9e, 0x09, + 0x8b, 0x79, 0x9d, 0x09, 0x8b, 0x70, 0xa6, 0x09, 0x8b, 0x69, 0xa5, 0x09, + 0x8b, 0x61, 0xa4, 0x09, 0x8b, 0x53, 0x01, 0xf9, 0xc4, 0xa3, 0x09, 0x8b, + 0x43, 0x01, 0xf9, 0xc8, 0xa2, 0x09, 0x8b, 0x39, 0xa1, 0x09, 0x8b, 0x31, + 0xa0, 0x09, 0x8b, 0x29, 0x9f, 0x09, 0x8b, 0x21, 0x9e, 0x09, 0x8b, 0x19, + 0x9d, 0x09, 
0x8b, 0x10, 0xa6, 0x09, 0x8b, 0x09, 0xa5, 0x09, 0x8b, 0x01, + 0xa4, 0x09, 0x8a, 0xf9, 0xa3, 0x09, 0x8a, 0xeb, 0x01, 0xf9, 0xcc, 0xa2, + 0x09, 0x8a, 0xe1, 0xa1, 0x09, 0x8a, 0xd9, 0xa0, 0x09, 0x8a, 0xd1, 0x9f, + 0x09, 0x8a, 0xc9, 0x9e, 0x09, 0x8a, 0xc1, 0x9d, 0x09, 0x8a, 0xb2, 0x01, + 0xf9, 0xd0, 0xa6, 0x09, 0x8a, 0xa9, 0xa5, 0x09, 0x8a, 0xa1, 0xa4, 0x09, + 0x8a, 0x99, 0xa3, 0x09, 0x8a, 0x91, 0xa2, 0x09, 0x8a, 0x89, 0xa1, 0x09, + 0x8a, 0x81, 0xa0, 0x09, 0x8a, 0x79, 0x9f, 0x09, 0x8a, 0x71, 0x9e, 0x09, + 0x8a, 0x63, 0x01, 0xf9, 0xd4, 0x9d, 0x09, 0x8a, 0x58, 0xa6, 0x09, 0x8a, + 0x51, 0xa5, 0x09, 0x8a, 0x49, 0xa4, 0x09, 0x8a, 0x33, 0x01, 0xf9, 0xd8, + 0xa3, 0x09, 0x8a, 0x23, 0x01, 0xf9, 0xe0, 0xa2, 0x09, 0x8a, 0x19, 0xa1, + 0x09, 0x8a, 0x11, 0xa0, 0x09, 0x8a, 0x09, 0x9f, 0x09, 0x8a, 0x01, 0x9e, + 0x09, 0x89, 0xf8, 0xa0, 0x09, 0x89, 0xf1, 0x9f, 0x09, 0x89, 0xe9, 0x9e, + 0x09, 0x89, 0xcb, 0x01, 0xf9, 0xe4, 0x9d, 0x09, 0x89, 0xc0, 0xa6, 0x09, + 0x89, 0xb9, 0xa5, 0x09, 0x89, 0xb1, 0xa4, 0x09, 0x89, 0xa3, 0x01, 0xf9, + 0xf0, 0xa3, 0x09, 0x89, 0x93, 0x01, 0xf9, 0xf4, 0xa2, 0x09, 0x89, 0x83, + 0x01, 0xf9, 0xf8, 0xa1, 0x09, 0x89, 0x79, 0xa0, 0x09, 0x89, 0x71, 0x9f, + 0x09, 0x89, 0x69, 0x9e, 0x09, 0x89, 0x61, 0x9d, 0x09, 0x89, 0x58, 0xa6, + 0x09, 0x89, 0x51, 0xa5, 0x09, 0x89, 0x43, 0x01, 0xf9, 0xfc, 0xa4, 0x09, + 0x89, 0x33, 0x01, 0xfa, 0x00, 0xa3, 0x09, 0x89, 0x29, 0xa2, 0x09, 0x89, + 0x21, 0xa1, 0x09, 0x89, 0x19, 0xa0, 0x09, 0x89, 0x11, 0x9f, 0x09, 0x89, + 0x09, 0x9e, 0x09, 0x88, 0xfb, 0x01, 0xfa, 0x04, 0x9d, 0x09, 0x88, 0xf0, + 0xa6, 0x09, 0x88, 0xe9, 0xa5, 0x09, 0x88, 0xe1, 0xa4, 0x09, 0x88, 0xd9, + 0xa3, 0x09, 0x88, 0xd1, 0xa2, 0x09, 0x88, 0xc9, 0xa1, 0x09, 0x88, 0xc1, + 0xa0, 0x09, 0x88, 0xb9, 0x9f, 0x09, 0x88, 0xb1, 0x9e, 0x09, 0x88, 0xa3, + 0x01, 0xfa, 0x08, 0x9d, 0x09, 0x88, 0x98, 0xa6, 0x09, 0x88, 0x91, 0xa5, + 0x09, 0x88, 0x89, 0xa4, 0x09, 0x88, 0x81, 0xa3, 0x09, 0x88, 0x79, 0xa2, + 0x09, 0x88, 0x71, 0xa1, 0x09, 0x88, 0x69, 0xa0, 0x09, 0x88, 0x5b, 0x01, + 0xfa, 0x0c, 0x9f, 0x09, 0x88, 0x51, 0x9e, 0x09, 0x88, 0x49, 0x9d, 0x09, + 0x88, 0x40, 0xa6, 0x09, 0x88, 0x39, 0xa5, 0x09, 0x88, 0x31, 0xa4, 0x09, + 0x88, 0x29, 0xa3, 0x09, 0x88, 0x21, 0xa2, 0x09, 0x88, 0x19, 0xa1, 0x09, + 0x88, 0x11, 0xa0, 0x09, 0x88, 0x09, 0x9f, 0x09, 0x88, 0x01, 0x9e, 0x09, + 0x87, 0xf2, 0x01, 0xfa, 0x10, 0xa4, 0x09, 0x86, 0x4b, 0x01, 0xfa, 0x14, + 0xa3, 0x09, 0x86, 0x41, 0xa2, 0x09, 0x86, 0x39, 0xa1, 0x09, 0x86, 0x31, + 0xa0, 0x09, 0x86, 0x29, 0x9f, 0x09, 0x86, 0x21, 0x9e, 0x09, 0x86, 0x19, + 0x9d, 0x09, 0x86, 0x10, 0xa6, 0x09, 0x86, 0x09, 0xa5, 0x09, 0x86, 0x01, + 0xa4, 0x09, 0x85, 0xf9, 0xa3, 0x09, 0x85, 0xf1, 0xa2, 0x09, 0x85, 0xe9, + 0xa1, 0x09, 0x85, 0xdb, 0x01, 0xfa, 0x34, 0xa0, 0x09, 0x85, 0xd1, 0x9f, + 0x09, 0x85, 0xc3, 0x01, 0xfa, 0x38, 0x9e, 0x09, 0x85, 0xb9, 0x9d, 0x09, + 0x85, 0x6a, 0x01, 0xfa, 0x3c, 0xa6, 0x09, 0x85, 0x61, 0xa5, 0x09, 0x85, + 0x53, 0x01, 0xfa, 0x60, 0xa4, 0x09, 0x85, 0x49, 0xa3, 0x09, 0x85, 0x3b, + 0x01, 0xfa, 0x64, 0xa2, 0x09, 0x85, 0x31, 0xa1, 0x09, 0x85, 0x29, 0xa0, + 0x09, 0x85, 0x21, 0x9f, 0x09, 0x85, 0x19, 0x9e, 0x09, 0x85, 0x11, 0x9d, + 0x09, 0x85, 0x08, 0xa6, 0x09, 0x85, 0x01, 0xa5, 0x09, 0x84, 0xf9, 0xa4, + 0x09, 0x84, 0xf1, 0xa3, 0x09, 0x84, 0xe9, 0xa2, 0x09, 0x84, 0xe1, 0xa1, + 0x09, 0x84, 0xd3, 0x01, 0xfa, 0x68, 0xa0, 0x09, 0x84, 0xc9, 0x9f, 0x09, + 0x84, 0xc1, 0x9e, 0x09, 0x84, 0xb3, 0x01, 0xfa, 0x6c, 0x9d, 0x09, 0x84, + 0xa8, 0xa6, 0x09, 0x84, 0xa1, 0xa5, 0x09, 0x84, 0x99, 0xa4, 0x09, 0x84, + 0x8b, 0x01, 0xfa, 0x70, 0xa3, 0x09, 0x84, 0x81, 0xa2, 0x09, 0x84, 0x79, + 0xa1, 0x09, 
0x84, 0x71, 0xa0, 0x09, 0x84, 0x69, 0x9f, 0x09, 0x84, 0x61, + 0x9e, 0x09, 0x84, 0x59, 0x9d, 0x09, 0x84, 0x50, 0xa6, 0x09, 0x84, 0x49, + 0xa5, 0x09, 0x84, 0x41, 0xa4, 0x09, 0x84, 0x39, 0xa3, 0x09, 0x84, 0x31, + 0xa2, 0x09, 0x84, 0x29, 0xa1, 0x09, 0x84, 0x21, 0xa0, 0x09, 0x84, 0x19, + 0x9f, 0x09, 0x84, 0x11, 0x9e, 0x09, 0x84, 0x09, 0x9d, 0x09, 0x84, 0x00, + 0xa6, 0x09, 0x83, 0xf9, 0xa5, 0x09, 0x83, 0xeb, 0x01, 0xfa, 0x74, 0xa4, + 0x09, 0x83, 0xe1, 0xa3, 0x09, 0x83, 0xd9, 0xa2, 0x09, 0x83, 0xd1, 0xa1, + 0x09, 0x83, 0xc9, 0xa0, 0x09, 0x83, 0xc1, 0x9f, 0x09, 0x83, 0xb9, 0x9e, + 0x09, 0x83, 0xb0, 0xa1, 0x09, 0x83, 0xa9, 0xa0, 0x09, 0x83, 0xa1, 0x9f, + 0x09, 0x83, 0x99, 0x9e, 0x09, 0x83, 0x91, 0x9d, 0x09, 0x83, 0x88, 0xa6, + 0x09, 0x83, 0x81, 0xa5, 0x09, 0x83, 0x79, 0xa4, 0x09, 0x83, 0x71, 0xa3, + 0x09, 0x83, 0x69, 0xa2, 0x09, 0x83, 0x61, 0xa1, 0x09, 0x83, 0x59, 0xa0, + 0x09, 0x83, 0x51, 0x9f, 0x09, 0x83, 0x49, 0x9e, 0x09, 0x83, 0x41, 0x9d, + 0x09, 0x83, 0x32, 0x01, 0xfa, 0x78, 0xa6, 0x09, 0x83, 0x29, 0xa5, 0x09, + 0x83, 0x21, 0xa4, 0x09, 0x83, 0x19, 0xa3, 0x09, 0x83, 0x11, 0xa2, 0x09, + 0x83, 0x09, 0xa1, 0x09, 0x83, 0x01, 0xa0, 0x09, 0x82, 0xf9, 0x9f, 0x09, + 0x82, 0xdb, 0x01, 0xfa, 0x7c, 0x9e, 0x09, 0x82, 0xd0, 0xcb, 0x58, 0xc7, + 0x0f, 0xbd, 0x39, 0x46, 0x01, 0xfc, 0xc1, 0xfa, 0x88, 0x15, 0xc1, 0xfa, + 0x94, 0xd4, 0x3c, 0xb4, 0x0f, 0xbd, 0xa0, 0xc4, 0x18, 0x10, 0x00, 0x37, + 0xb9, 0xc2, 0x22, 0xcc, 0x00, 0x37, 0xb0, 0xc3, 0x0d, 0x14, 0x00, 0x37, + 0xa9, 0xc3, 0x09, 0x9e, 0x00, 0x37, 0xa0, 0xc4, 0x02, 0xde, 0x00, 0x37, + 0x99, 0xc2, 0x02, 0xa0, 0x00, 0x37, 0x90, 0x97, 0x00, 0x98, 0x4b, 0x01, + 0xfa, 0xa0, 0x47, 0x23, 0x34, 0xc1, 0xfa, 0xa6, 0x83, 0x00, 0x98, 0x43, + 0x01, 0xfa, 0xc9, 0x8b, 0x00, 0x98, 0x51, 0x87, 0x00, 0x98, 0x6b, 0x01, + 0xfa, 0xcd, 0x91, 0x00, 0x98, 0x73, 0x01, 0xfa, 0xd1, 0x19, 0xc1, 0xfa, + 0xd5, 0x09, 0xc1, 0xfa, 0xe7, 0x1b, 0x41, 0xfb, 0x05, 0x0a, 0xc1, 0xfb, + 0x1f, 0x83, 0x00, 0x90, 0x03, 0x01, 0xfb, 0x41, 0x97, 0x00, 0x90, 0x09, + 0x8b, 0x00, 0x90, 0x11, 0x87, 0x00, 0x90, 0x2b, 0x01, 0xfb, 0x45, 0x91, + 0x00, 0x90, 0x32, 0x01, 0xfb, 0x49, 0x04, 0xc1, 0xfb, 0x4d, 0x83, 0x00, + 0x93, 0x03, 0x01, 0xfb, 0x67, 0x97, 0x00, 0x93, 0x09, 0x8b, 0x00, 0x93, + 0x11, 0x87, 0x00, 0x93, 0x2b, 0x01, 0xfb, 0x6b, 0x91, 0x00, 0x93, 0x33, + 0x01, 0xfb, 0x6f, 0x19, 0x41, 0xfb, 0x73, 0x05, 0xc1, 0xfb, 0x82, 0x83, + 0x00, 0x93, 0xc3, 0x01, 0xfb, 0xa0, 0x97, 0x00, 0x93, 0xc9, 0x8b, 0x00, + 0x93, 0xd1, 0x87, 0x00, 0x93, 0xeb, 0x01, 0xfb, 0xa4, 0x91, 0x00, 0x93, + 0xf3, 0x01, 0xfb, 0xa8, 0xc2, 0x01, 0x4a, 0x00, 0x93, 0xf9, 0x0a, 0x41, + 0xfb, 0xac, 0x1c, 0xc1, 0xfb, 0xcf, 0x06, 0xc1, 0xfb, 0xe8, 0x83, 0x00, + 0x97, 0x83, 0x01, 0xfc, 0x0f, 0x97, 0x00, 0x97, 0x89, 0x8b, 0x00, 0x97, + 0x91, 0x87, 0x00, 0x97, 0xab, 0x01, 0xfc, 0x13, 0x91, 0x00, 0x97, 0xb3, + 0x01, 0xfc, 0x17, 0xc2, 0x01, 0x4a, 0x00, 0x97, 0xb8, 0x42, 0x00, 0x8e, + 0xc1, 0xfc, 0x1b, 0x83, 0x00, 0x93, 0x83, 0x01, 0xfc, 0x34, 0x97, 0x00, + 0x93, 0x89, 0x8b, 0x00, 0x93, 0x91, 0x87, 0x00, 0x93, 0xab, 0x01, 0xfc, + 0x38, 0x91, 0x00, 0x93, 0xb3, 0x01, 0xfc, 0x3c, 0xc2, 0x01, 0x4a, 0x00, + 0x93, 0xb9, 0x0a, 0xc1, 0xfc, 0x40, 0x15, 0xc1, 0xfc, 0x63, 0x1c, 0x41, + 0xfc, 0x83, 0x83, 0x00, 0x90, 0x43, 0x01, 0xfc, 0xa0, 0x97, 0x00, 0x90, + 0x49, 0x8b, 0x00, 0x90, 0x51, 0x87, 0x00, 0x90, 0x6b, 0x01, 0xfc, 0xa4, + 0x91, 0x00, 0x90, 0x73, 0x01, 0xfc, 0xa8, 0xc2, 0x01, 0x4a, 0x00, 0x90, + 0x78, 0x83, 0x00, 0x90, 0xc3, 0x01, 0xfc, 0xac, 0x97, 0x00, 0x90, 0xc9, + 0x8b, 0x00, 0x90, 0xd1, 0x87, 0x00, 0x90, 0xeb, 0x01, 0xfc, 0xb0, 0x91, + 0x00, 0x90, 
0xf3, 0x01, 0xfc, 0xb4, 0x19, 0xc1, 0xfc, 0xb8, 0xc2, 0x19, + 0x2c, 0x00, 0x9a, 0xc8, 0x1c, 0xc1, 0xfc, 0xc7, 0x83, 0x00, 0x91, 0x83, + 0x01, 0xfc, 0xe7, 0x97, 0x00, 0x91, 0x89, 0x8b, 0x00, 0x91, 0x91, 0x87, + 0x00, 0x91, 0xab, 0x01, 0xfc, 0xeb, 0x91, 0x00, 0x91, 0xb3, 0x01, 0xfc, + 0xf5, 0xc2, 0x01, 0x4a, 0x00, 0x91, 0xb9, 0x0a, 0xc1, 0xfc, 0xf9, 0x15, + 0x41, 0xfd, 0x1c, 0x83, 0x00, 0x91, 0x43, 0x01, 0xfd, 0x36, 0x97, 0x00, + 0x91, 0x49, 0x8b, 0x00, 0x91, 0x51, 0x87, 0x00, 0x91, 0x6b, 0x01, 0xfd, + 0x3a, 0x91, 0x00, 0x91, 0x73, 0x01, 0xfd, 0x3e, 0xc2, 0x01, 0x4a, 0x00, + 0x91, 0x79, 0xc2, 0x19, 0x2c, 0x00, 0x9a, 0xc0, 0x83, 0x00, 0x92, 0x03, + 0x01, 0xfd, 0x42, 0x97, 0x00, 0x92, 0x09, 0x8b, 0x00, 0x92, 0x11, 0x87, + 0x00, 0x92, 0x2b, 0x01, 0xfd, 0x46, 0x91, 0x00, 0x92, 0x33, 0x01, 0xfd, + 0x4a, 0x19, 0xc1, 0xfd, 0x4e, 0x0a, 0xc1, 0xfd, 0x60, 0x1b, 0x41, 0xfd, + 0x7e, 0x83, 0x00, 0x93, 0x43, 0x01, 0xfd, 0x98, 0x97, 0x00, 0x93, 0x49, + 0x8b, 0x00, 0x93, 0x51, 0x87, 0x00, 0x93, 0x6b, 0x01, 0xfd, 0x9c, 0x91, + 0x00, 0x93, 0x71, 0xc2, 0x01, 0x4a, 0x00, 0x93, 0x78, 0x83, 0x00, 0x94, + 0x03, 0x01, 0xfd, 0xa0, 0x97, 0x00, 0x94, 0x09, 0x8b, 0x00, 0x94, 0x11, + 0x87, 0x00, 0x94, 0x2b, 0x01, 0xfd, 0xa4, 0x91, 0x00, 0x94, 0x33, 0x01, + 0xfd, 0xa8, 0x19, 0xc1, 0xfd, 0xac, 0x1b, 0x41, 0xfd, 0xbe, 0x83, 0x00, + 0x94, 0x83, 0x01, 0xfd, 0xd8, 0x97, 0x00, 0x94, 0x89, 0x8b, 0x00, 0x94, + 0x91, 0x87, 0x00, 0x94, 0xab, 0x01, 0xfd, 0xdc, 0x91, 0x00, 0x94, 0xb3, + 0x01, 0xfd, 0xe0, 0xc2, 0x01, 0x4a, 0x00, 0x94, 0xb9, 0x1b, 0x41, 0xfd, + 0xe4, 0x83, 0x00, 0x95, 0x43, 0x01, 0xfe, 0x07, 0x97, 0x00, 0x95, 0x49, + 0x8b, 0x00, 0x95, 0x51, 0x87, 0x00, 0x95, 0x6b, 0x01, 0xfe, 0x0b, 0x91, + 0x00, 0x95, 0x73, 0x01, 0xfe, 0x0f, 0x19, 0xc1, 0xfe, 0x13, 0x1a, 0xc1, + 0xfe, 0x25, 0x1b, 0x41, 0xfe, 0x43, 0x83, 0x00, 0x96, 0x43, 0x01, 0xfe, + 0x5d, 0x97, 0x00, 0x96, 0x49, 0x8b, 0x00, 0x96, 0x51, 0x87, 0x00, 0x96, + 0x6b, 0x01, 0xfe, 0x61, 0x91, 0x00, 0x96, 0x72, 0x01, 0xfe, 0x65, 0x0a, + 0xc1, 0xfe, 0x69, 0x83, 0x00, 0x9a, 0x83, 0x01, 0xfe, 0x8c, 0x97, 0x00, + 0x9a, 0x89, 0x8b, 0x00, 0x9a, 0x91, 0x87, 0x00, 0x9a, 0xab, 0x01, 0xfe, + 0x90, 0x91, 0x00, 0x9a, 0xb3, 0x01, 0xfe, 0x94, 0x19, 0x41, 0xfe, 0x98, + 0x83, 0x00, 0x96, 0xc3, 0x01, 0xfe, 0xa7, 0x97, 0x00, 0x96, 0xc9, 0x8b, + 0x00, 0x96, 0xd1, 0x87, 0x00, 0x96, 0xeb, 0x01, 0xfe, 0xab, 0x91, 0x00, + 0x96, 0xf3, 0x01, 0xfe, 0xaf, 0xc2, 0x01, 0x4a, 0x00, 0x96, 0xf9, 0x0a, + 0xc1, 0xfe, 0xb3, 0x1c, 0x41, 0xfe, 0xd3, 0x83, 0x00, 0x97, 0x43, 0x01, + 0xfe, 0xed, 0x97, 0x00, 0x97, 0x49, 0x8b, 0x00, 0x97, 0x51, 0x87, 0x00, + 0x97, 0x6b, 0x01, 0xfe, 0xf1, 0x91, 0x00, 0x97, 0x72, 0x01, 0xfe, 0xf5, + 0x83, 0x00, 0x98, 0x03, 0x01, 0xfe, 0xf9, 0x97, 0x00, 0x98, 0x09, 0x8b, + 0x00, 0x98, 0x11, 0x87, 0x00, 0x98, 0x2b, 0x01, 0xfe, 0xfd, 0x91, 0x00, + 0x98, 0x33, 0x01, 0xff, 0x01, 0xc2, 0x01, 0x4a, 0x00, 0x98, 0x38, 0x83, + 0x00, 0x9a, 0x43, 0x01, 0xff, 0x05, 0x97, 0x00, 0x9a, 0x49, 0x8b, 0x00, + 0x9a, 0x51, 0x87, 0x00, 0x9a, 0x6b, 0x01, 0xff, 0x09, 0x91, 0x00, 0x9a, + 0x71, 0x19, 0xc1, 0xff, 0x0d, 0xc2, 0x19, 0x2c, 0x00, 0x9a, 0xd0, 0x4b, + 0x63, 0xff, 0xc1, 0xff, 0x1c, 0xd1, 0x36, 0x4b, 0x00, 0x9a, 0xf0, 0xc9, + 0x57, 0x20, 0x00, 0x9b, 0xe0, 0xc6, 0xce, 0xc9, 0x00, 0x9c, 0xc0, 0x48, + 0x6e, 0x42, 0xc1, 0xff, 0x28, 0x45, 0x00, 0x8c, 0x41, 0xff, 0x34, 0xc5, + 0x01, 0xa2, 0x01, 0x18, 0x09, 0xc5, 0xd8, 0x53, 0x0f, 0xa9, 0x31, 0xc4, + 0xe3, 0xdb, 0x0f, 0xa8, 0x61, 0xca, 0xa5, 0x94, 0x0f, 0xa5, 0x08, 0xc2, + 0x39, 0x8b, 0x08, 0x7f, 0xa9, 0xc3, 0x1e, 0x1b, 0x08, 0x7f, 0x40, 0xc3, + 0x11, 0xef, 
0x08, 0x7f, 0xa1, 0x03, 0x41, 0xff, 0x58, 0xc2, 0x00, 0x8e, + 0x08, 0x7f, 0x38, 0xc4, 0x36, 0xb5, 0x08, 0x7f, 0x01, 0xc3, 0x16, 0x5a, + 0x08, 0x7f, 0x78, 0x87, 0x08, 0x29, 0x29, 0xc4, 0x38, 0x2c, 0x08, 0x29, + 0x30, 0xd6, 0x2e, 0x6a, 0x01, 0x39, 0xb9, 0xcd, 0x0e, 0x61, 0x01, 0x39, + 0xa9, 0xca, 0x22, 0x51, 0x01, 0x39, 0xa0, 0xc2, 0x00, 0x55, 0x01, 0x10, + 0x71, 0xcb, 0x6d, 0x97, 0x00, 0x04, 0xb8, 0xcb, 0x98, 0xd1, 0x00, 0x00, + 0x23, 0x01, 0xff, 0x64, 0xc3, 0x09, 0x3f, 0x00, 0x00, 0x18, 0x43, 0x05, + 0xb2, 0xc1, 0xff, 0x6a, 0xcd, 0x76, 0x76, 0x01, 0x12, 0xe8, 0x00, 0x41, + 0xff, 0x82, 0xc4, 0x18, 0x10, 0x08, 0xed, 0x39, 0xc2, 0x22, 0xcc, 0x08, + 0xed, 0x30, 0xc3, 0x0d, 0x14, 0x08, 0xed, 0x29, 0xc3, 0x09, 0x9e, 0x08, + 0xed, 0x20, 0xc4, 0x02, 0xde, 0x08, 0xed, 0x19, 0xc2, 0x02, 0xa0, 0x08, + 0xed, 0x10, 0x03, 0xc1, 0xff, 0x8c, 0xc2, 0x01, 0x24, 0x08, 0xec, 0x99, + 0xc2, 0x02, 0xe0, 0x08, 0xec, 0x81, 0x97, 0x08, 0xec, 0x6b, 0x01, 0xff, + 0x98, 0x8b, 0x08, 0xec, 0x5a, 0x01, 0xff, 0x9c, 0xc2, 0x00, 0xd0, 0x08, + 0xec, 0x31, 0x83, 0x08, 0xec, 0x28, 0xc2, 0x01, 0x30, 0x08, 0xec, 0x21, + 0x83, 0x08, 0xeb, 0xd0, 0x06, 0xc1, 0xff, 0xa0, 0xc2, 0x00, 0xd0, 0x08, + 0xeb, 0xc9, 0x83, 0x08, 0xeb, 0xc0, 0xc2, 0x00, 0xd0, 0x08, 0xec, 0x09, + 0x83, 0x08, 0xec, 0x00, 0xc2, 0x00, 0xdb, 0x08, 0xeb, 0xf9, 0x83, 0x08, + 0xeb, 0xa8, 0x16, 0xc1, 0xff, 0xaa, 0xc2, 0x00, 0xd0, 0x08, 0xeb, 0xa1, + 0x83, 0x08, 0xeb, 0x98, 0xc2, 0x00, 0xd0, 0x08, 0xeb, 0xe1, 0x83, 0x08, + 0xeb, 0xd8, 0xc2, 0x00, 0xd0, 0x08, 0xeb, 0xb9, 0x83, 0x08, 0xeb, 0xb0, + 0xc2, 0x00, 0xd0, 0x08, 0xeb, 0x91, 0x83, 0x08, 0xeb, 0x88, 0xc2, 0x00, + 0xd0, 0x08, 0xeb, 0x79, 0x83, 0x08, 0xeb, 0x70, 0x97, 0x08, 0xeb, 0x59, + 0x8b, 0x08, 0xeb, 0x41, 0x83, 0x08, 0xeb, 0x08, 0x97, 0x08, 0xeb, 0x28, + 0x8b, 0x08, 0xeb, 0x18, 0xc5, 0x40, 0xe7, 0x00, 0x50, 0x19, 0xc4, 0x1e, + 0x97, 0x00, 0x52, 0x68, 0x83, 0x00, 0x50, 0x31, 0x8b, 0x00, 0x50, 0x81, + 0x97, 0x00, 0x50, 0xa0, 0x8b, 0x00, 0x50, 0x40, 0x97, 0x00, 0x50, 0x50, + 0x83, 0x00, 0x50, 0xa9, 0x0a, 0x41, 0xff, 0xb4, 0x83, 0x00, 0x50, 0xb9, + 0x0a, 0x41, 0xff, 0xbe, 0xc2, 0x01, 0x30, 0x00, 0x50, 0xc9, 0xc2, 0x19, + 0x2c, 0x00, 0x50, 0xf1, 0xc2, 0x00, 0xc1, 0x00, 0x51, 0x19, 0x83, 0x00, + 0x51, 0x40, 0x83, 0x00, 0x50, 0xd1, 0xc2, 0x00, 0xd0, 0x00, 0x50, 0xd8, + 0x83, 0x00, 0x50, 0xe1, 0xc2, 0x00, 0xd0, 0x00, 0x50, 0xe8, 0x16, 0xc1, + 0xff, 0xc8, 0x83, 0x00, 0x51, 0x21, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0x28, + 0x06, 0xc1, 0xff, 0xd2, 0x83, 0x00, 0x51, 0x31, 0xc2, 0x00, 0xd0, 0x00, + 0x51, 0x38, 0x83, 0x00, 0x51, 0x51, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0x58, + 0x83, 0x00, 0x51, 0x61, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0x68, 0x83, 0x00, + 0x51, 0x81, 0xc2, 0x00, 0x39, 0x00, 0x52, 0xe0, 0x83, 0x00, 0x51, 0x91, + 0xc2, 0x00, 0xdb, 0x00, 0x51, 0x98, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0xb1, + 0x83, 0x00, 0x51, 0xc0, 0x83, 0x00, 0x51, 0xf1, 0x8b, 0x00, 0x52, 0x41, + 0x97, 0x00, 0x52, 0x60, 0x8b, 0x00, 0x52, 0x00, 0x97, 0x00, 0x52, 0x10, + 0xc2, 0x02, 0xa0, 0x00, 0x53, 0x41, 0xc4, 0x02, 0xde, 0x00, 0x53, 0x48, + 0xc3, 0x09, 0x9e, 0x00, 0x53, 0x51, 0xc3, 0x0d, 0x14, 0x00, 0x53, 0x58, + 0xc2, 0x22, 0xcc, 0x00, 0x53, 0x61, 0xc4, 0x18, 0x10, 0x00, 0x53, 0x68, + 0xca, 0x1e, 0x8a, 0x00, 0x54, 0x09, 0xd1, 0x33, 0x57, 0x00, 0x57, 0xf0, + 0xc7, 0x14, 0x39, 0x00, 0x54, 0x11, 0xc7, 0x7a, 0x7f, 0x00, 0x55, 0xe8, + 0xc5, 0x40, 0xe7, 0x00, 0x54, 0x19, 0xc4, 0x1e, 0x97, 0x00, 0x56, 0x68, + 0xc4, 0xdb, 0xfb, 0x00, 0x57, 0xd1, 0xc5, 0xd7, 0x18, 0x00, 0x57, 0xd8, + 0xd4, 0x3a, 0x84, 0x00, 0x57, 0xe9, 0xd5, 0x33, 0x53, 0x00, 0x57, 0xf8, + 0x83, 0x00, 
0x54, 0x31, 0x8b, 0x00, 0x54, 0x81, 0x97, 0x00, 0x54, 0xa0, + 0x8b, 0x00, 0x54, 0x40, 0x97, 0x00, 0x54, 0x50, 0x47, 0xb2, 0x2e, 0xc1, + 0xff, 0xdc, 0x83, 0x00, 0x55, 0xa8, 0x83, 0x00, 0x54, 0xa9, 0xc2, 0x00, + 0xd0, 0x00, 0x54, 0xb0, 0x83, 0x00, 0x54, 0xb9, 0xc2, 0x00, 0xd0, 0x00, + 0x54, 0xc0, 0xc2, 0x01, 0x30, 0x00, 0x54, 0xc9, 0xc2, 0x19, 0x2c, 0x00, + 0x54, 0xf1, 0xc2, 0x00, 0xc1, 0x00, 0x55, 0x19, 0x83, 0x00, 0x55, 0x40, + 0x83, 0x00, 0x54, 0xd1, 0xc2, 0x00, 0xd0, 0x00, 0x54, 0xd8, 0x83, 0x00, + 0x54, 0xe1, 0xc2, 0x00, 0xd0, 0x00, 0x54, 0xe8, 0x16, 0xc1, 0xff, 0xea, + 0x83, 0x00, 0x55, 0x21, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x28, 0x06, 0xc1, + 0xff, 0xf4, 0x83, 0x00, 0x55, 0x31, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x38, + 0x83, 0x00, 0x55, 0x51, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x58, 0x83, 0x00, + 0x55, 0x61, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x68, 0x83, 0x00, 0x55, 0x91, + 0xc2, 0x00, 0xdb, 0x00, 0x55, 0x98, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0xb1, + 0xc2, 0x0d, 0xf6, 0x00, 0x55, 0xb9, 0x83, 0x00, 0x55, 0xc0, 0x87, 0x00, + 0x54, 0x69, 0x91, 0x00, 0x54, 0x88, 0x03, 0xc1, 0xff, 0xfe, 0x8b, 0x00, + 0x55, 0xfb, 0x02, 0x00, 0x0a, 0x97, 0x00, 0x56, 0x0b, 0x02, 0x00, 0x0e, + 0x48, 0xb2, 0x2d, 0xc2, 0x00, 0x12, 0x47, 0xc7, 0x7b, 0xc2, 0x00, 0x20, + 0x87, 0x00, 0x56, 0x39, 0x91, 0x00, 0x56, 0x58, 0xc2, 0x02, 0xa0, 0x00, + 0x57, 0x41, 0xc4, 0x02, 0xde, 0x00, 0x57, 0x48, 0xc3, 0x09, 0x9e, 0x00, + 0x57, 0x51, 0xc3, 0x0d, 0x14, 0x00, 0x57, 0x58, 0xc2, 0x22, 0xcc, 0x00, + 0x57, 0x61, 0xc4, 0x18, 0x10, 0x00, 0x57, 0x68, 0xc2, 0x0d, 0x10, 0x08, + 0x1a, 0x09, 0xc8, 0x0d, 0x03, 0x08, 0x1a, 0x50, 0x0f, 0xc2, 0x00, 0x28, + 0x42, 0x00, 0x74, 0xc2, 0x00, 0x34, 0x18, 0xc2, 0x00, 0x40, 0x06, 0xc2, + 0x00, 0x4c, 0x11, 0xc2, 0x00, 0x61, 0x48, 0x0b, 0x17, 0xc2, 0x00, 0x79, + 0x15, 0xc2, 0x00, 0x95, 0x12, 0xc2, 0x00, 0xad, 0x0d, 0xc2, 0x00, 0xce, + 0x0e, 0xc2, 0x00, 0xde, 0xcc, 0x56, 0x9a, 0x00, 0x1b, 0xa1, 0x1b, 0xc2, + 0x00, 0xf6, 0xcd, 0x2c, 0xb2, 0x00, 0x1b, 0xf1, 0x16, 0xc2, 0x01, 0x02, + 0x03, 0xc2, 0x01, 0x1e, 0xcb, 0x93, 0xa9, 0x00, 0x1e, 0x81, 0x14, 0xc2, + 0x01, 0x2e, 0x08, 0xc2, 0x01, 0x3a, 0xcb, 0x92, 0x3e, 0x08, 0x0c, 0x29, + 0xcb, 0x8c, 0xb3, 0x08, 0x0c, 0x41, 0xc9, 0xab, 0x7f, 0x08, 0x0c, 0x51, + 0x4d, 0x78, 0x4a, 0x42, 0x01, 0x46, 0xc4, 0xe3, 0x33, 0x0f, 0xa6, 0xb9, + 0xc5, 0x1c, 0xae, 0x0f, 0xa4, 0xd1, 0xc5, 0xd7, 0x1d, 0x0f, 0x9a, 0x79, + 0xc5, 0xd9, 0xfc, 0x0f, 0xca, 0xb8, 0x4a, 0x37, 0x44, 0xc2, 0x01, 0x58, + 0xcf, 0x65, 0xc1, 0x01, 0x55, 0x28, 0xc3, 0x02, 0xa3, 0x01, 0x16, 0xb9, + 0xcd, 0x78, 0x30, 0x01, 0x53, 0xd1, 0xd3, 0x43, 0x39, 0x01, 0x53, 0xe0, + 0x42, 0x00, 0x2a, 0xc2, 0x01, 0x64, 0x43, 0x00, 0x5f, 0x42, 0x01, 0x7f, + 0x45, 0x00, 0xd5, 0xc2, 0x01, 0x8b, 0x43, 0x02, 0x9c, 0x42, 0x01, 0x9d, + 0xd4, 0x00, 0xd3, 0x01, 0x55, 0x48, 0x48, 0xb2, 0x2d, 0xc2, 0x01, 0xa9, + 0x03, 0xc2, 0x01, 0xb7, 0xc2, 0x01, 0x24, 0x08, 0x9a, 0x59, 0xc2, 0x02, + 0xe0, 0x08, 0x9a, 0x39, 0x97, 0x08, 0x9a, 0x0b, 0x02, 0x01, 0xc3, 0x8b, + 0x08, 0x99, 0xfa, 0x02, 0x01, 0xc7, 0x18, 0xc2, 0x01, 0xcb, 0xc2, 0x00, + 0xd0, 0x08, 0x99, 0xc9, 0x15, 0xc2, 0x01, 0xdb, 0x0e, 0xc2, 0x01, 0xeb, + 0xc2, 0x00, 0x39, 0x08, 0x99, 0x81, 0xc2, 0x19, 0x2c, 0x08, 0x99, 0x79, + 0xc2, 0x01, 0xc3, 0x08, 0x99, 0x71, 0x04, 0xc2, 0x01, 0xf5, 0x12, 0xc2, + 0x01, 0xff, 0x06, 0xc2, 0x02, 0x09, 0x16, 0xc2, 0x02, 0x17, 0x10, 0xc2, + 0x02, 0x25, 0x0c, 0xc2, 0x02, 0x3b, 0x05, 0xc2, 0x02, 0x45, 0x09, 0xc2, + 0x02, 0x4f, 0x0d, 0xc2, 0x02, 0x59, 0x83, 0x08, 0x98, 0x2b, 0x02, 0x02, + 0x63, 0xc2, 0x01, 0x24, 0x08, 0x98, 0x99, 0x97, 0x08, 0x98, 0x4b, 0x02, + 0x02, 0x6f, 
0x8b, 0x08, 0x98, 0x3b, 0x02, 0x02, 0x73, 0xc2, 0x02, 0xe0, + 0x08, 0x98, 0x78, 0xc5, 0xd7, 0x3b, 0x08, 0x9a, 0xe9, 0x42, 0x07, 0xb2, + 0xc2, 0x02, 0x77, 0x03, 0xc2, 0x02, 0x83, 0xc5, 0x33, 0x5d, 0x08, 0x99, + 0xe1, 0x05, 0x42, 0x02, 0x8f, 0x46, 0x00, 0x8b, 0x42, 0x02, 0x9b, 0xc5, + 0x07, 0x62, 0x01, 0x12, 0x89, 0xca, 0x37, 0x4e, 0x01, 0x12, 0x70, 0x42, + 0x00, 0xdb, 0xc2, 0x02, 0xa5, 0x0a, 0xc2, 0x02, 0xaf, 0x03, 0xc2, 0x02, + 0xc3, 0x16, 0xc2, 0x02, 0xd3, 0x07, 0xc2, 0x02, 0xdd, 0xc2, 0x17, 0xb6, + 0x00, 0xe5, 0xb9, 0xc2, 0x02, 0x09, 0x00, 0xe5, 0xb1, 0xc2, 0x00, 0x28, + 0x00, 0xe5, 0x99, 0x0c, 0xc2, 0x02, 0xe7, 0xc3, 0xe6, 0x47, 0x00, 0xe5, + 0x71, 0x05, 0xc2, 0x02, 0xf3, 0x15, 0xc2, 0x03, 0x03, 0xc3, 0xe5, 0x69, + 0x00, 0xe5, 0x39, 0x09, 0xc2, 0x03, 0x0f, 0x0d, 0xc2, 0x03, 0x1b, 0x12, + 0xc2, 0x03, 0x27, 0xc2, 0x05, 0x1d, 0x00, 0xe5, 0x19, 0xc3, 0x82, 0x78, + 0x00, 0xe5, 0x01, 0x1c, 0xc2, 0x03, 0x33, 0xc2, 0x00, 0x45, 0x00, 0xe4, + 0xe9, 0xc3, 0x09, 0xe6, 0x00, 0xe4, 0xe1, 0xc3, 0x12, 0xb8, 0x00, 0xe4, + 0xd9, 0xc2, 0x00, 0x74, 0x00, 0xe4, 0xc1, 0xc3, 0x21, 0x7e, 0x00, 0xe4, + 0xa9, 0xc3, 0x62, 0xe1, 0x00, 0xe4, 0x99, 0xc3, 0x10, 0xd0, 0x00, 0xe4, + 0x88, 0x03, 0xc2, 0x03, 0x3f, 0xc3, 0x10, 0xd0, 0x00, 0x85, 0x09, 0x09, + 0xc2, 0x03, 0x49, 0xc3, 0x62, 0xe1, 0x00, 0x85, 0x19, 0xc2, 0x00, 0xc4, + 0x00, 0x85, 0x21, 0xc3, 0x21, 0x7e, 0x00, 0x85, 0x29, 0x1c, 0xc2, 0x03, + 0x55, 0x42, 0x01, 0x6f, 0xc2, 0x03, 0x61, 0xc2, 0x00, 0x74, 0x00, 0x85, + 0x41, 0x0d, 0xc2, 0x03, 0x69, 0xc3, 0x03, 0x03, 0x00, 0x85, 0x51, 0xc3, + 0x12, 0xb8, 0x00, 0x85, 0x59, 0xc3, 0x09, 0xe6, 0x00, 0x85, 0x61, 0xc2, + 0x00, 0x45, 0x00, 0x85, 0x69, 0x12, 0xc2, 0x03, 0x75, 0xc3, 0x82, 0x78, + 0x00, 0x85, 0x81, 0x15, 0xc2, 0x03, 0x81, 0xc2, 0x05, 0x1d, 0x00, 0x85, + 0x99, 0xc3, 0xe5, 0x69, 0x00, 0x85, 0xb9, 0x05, 0xc2, 0x03, 0x8d, 0x0c, + 0xc2, 0x03, 0x9d, 0xc3, 0xe6, 0x47, 0x00, 0x85, 0xf1, 0x0a, 0xc2, 0x03, + 0xa9, 0xc2, 0x00, 0x28, 0x00, 0x86, 0x19, 0xc2, 0x17, 0xb6, 0x00, 0x86, + 0x38, 0x03, 0xc2, 0x03, 0xbd, 0xc3, 0x10, 0xd0, 0x00, 0x86, 0x89, 0x09, + 0xc2, 0x03, 0xcd, 0xc3, 0x62, 0xe1, 0x00, 0x86, 0x99, 0x07, 0xc2, 0x03, + 0xd9, 0xc3, 0x21, 0x7e, 0x00, 0x86, 0xa9, 0x1c, 0xc2, 0x03, 0xe3, 0x16, + 0xc2, 0x03, 0xef, 0xc2, 0x00, 0x74, 0x00, 0x86, 0xc1, 0x0d, 0xc2, 0x03, + 0xf9, 0x42, 0x00, 0xdb, 0xc2, 0x04, 0x05, 0xc3, 0x12, 0xb8, 0x00, 0x86, + 0xd9, 0xc3, 0x09, 0xe6, 0x00, 0x86, 0xe1, 0xc2, 0x00, 0x45, 0x00, 0x86, + 0xe9, 0x12, 0xc2, 0x04, 0x0f, 0xc3, 0x82, 0x78, 0x00, 0x87, 0x01, 0x15, + 0xc2, 0x04, 0x1b, 0xc2, 0x05, 0x1d, 0x00, 0x87, 0x19, 0xc3, 0xe5, 0x69, + 0x00, 0x87, 0x39, 0x05, 0xc2, 0x04, 0x27, 0x0c, 0xc2, 0x04, 0x37, 0xc3, + 0xe6, 0x47, 0x00, 0x87, 0x71, 0x0a, 0xc2, 0x04, 0x43, 0xc2, 0x00, 0x28, + 0x00, 0x87, 0x99, 0xc2, 0x02, 0x09, 0x00, 0x87, 0xb1, 0xc2, 0x17, 0xb6, + 0x00, 0x87, 0xb8, 0x03, 0xc2, 0x04, 0x57, 0xc3, 0x10, 0xd0, 0x01, 0x68, + 0x09, 0x09, 0xc2, 0x04, 0x61, 0xc3, 0x62, 0xe1, 0x01, 0x68, 0x19, 0xc2, + 0x00, 0xc4, 0x01, 0x68, 0x21, 0xc3, 0x21, 0x7e, 0x01, 0x68, 0x29, 0x1c, + 0xc2, 0x04, 0x6d, 0x42, 0x01, 0x6f, 0xc2, 0x04, 0x79, 0xc2, 0x00, 0x74, + 0x01, 0x68, 0x41, 0x0d, 0xc2, 0x04, 0x81, 0xc3, 0x03, 0x03, 0x01, 0x68, + 0x51, 0xc3, 0x12, 0xb8, 0x01, 0x68, 0x59, 0xc3, 0x09, 0xe6, 0x01, 0x68, + 0x61, 0xc2, 0x00, 0x45, 0x01, 0x68, 0x69, 0x12, 0xc2, 0x04, 0x8d, 0xc3, + 0x82, 0x78, 0x01, 0x68, 0x81, 0x15, 0xc2, 0x04, 0x99, 0xc2, 0x05, 0x1d, + 0x01, 0x68, 0x99, 0xc3, 0xe5, 0x69, 0x01, 0x68, 0xb9, 0x05, 0xc2, 0x04, + 0xa5, 0x0c, 0xc2, 0x04, 0xb5, 0xc3, 0xe6, 0x47, 0x01, 0x68, 0xf1, 0x0a, + 0xc2, 0x04, 
0xc1, 0xc2, 0x00, 0x28, 0x01, 0x69, 0x19, 0xc2, 0x17, 0xb6, + 0x01, 0x69, 0x38, 0xc3, 0xe5, 0x4b, 0x01, 0x60, 0x01, 0x04, 0xc2, 0x04, + 0xd5, 0xc4, 0xdf, 0x83, 0x01, 0x60, 0x11, 0xc7, 0xc1, 0xf5, 0x01, 0x60, + 0x19, 0x06, 0xc2, 0x04, 0xe1, 0x1b, 0xc2, 0x04, 0xf3, 0x1c, 0xc2, 0x05, + 0x05, 0x8b, 0x01, 0x60, 0x5b, 0x02, 0x05, 0x11, 0xc4, 0xe1, 0x6b, 0x01, + 0x60, 0x69, 0x0e, 0xc2, 0x05, 0x23, 0xc7, 0x60, 0xdd, 0x01, 0x60, 0x79, + 0xc5, 0xdb, 0x78, 0x01, 0x60, 0x81, 0x11, 0xc2, 0x05, 0x2f, 0x12, 0xc2, + 0x05, 0x3b, 0xc5, 0xd7, 0xb3, 0x01, 0x60, 0x99, 0x15, 0xc2, 0x05, 0x45, + 0x16, 0xc2, 0x05, 0x5e, 0xc3, 0xc5, 0x6f, 0x01, 0x60, 0xb1, 0x08, 0xc2, + 0x05, 0x70, 0xc4, 0xdf, 0x9f, 0x01, 0x60, 0xc1, 0x05, 0x42, 0x05, 0x7c, + 0xc3, 0xe5, 0x4b, 0x01, 0x61, 0x81, 0x04, 0xc2, 0x05, 0x88, 0xc4, 0xdf, + 0x83, 0x01, 0x61, 0x91, 0xc7, 0xc1, 0xf5, 0x01, 0x61, 0x99, 0x06, 0xc2, + 0x05, 0x94, 0x1b, 0xc2, 0x05, 0xa6, 0x1c, 0xc2, 0x05, 0xb8, 0x8b, 0x01, + 0x61, 0xdb, 0x02, 0x05, 0xc4, 0xc4, 0xe1, 0x6b, 0x01, 0x61, 0xe9, 0x0e, + 0xc2, 0x05, 0xd6, 0xc7, 0x60, 0xdd, 0x01, 0x61, 0xf9, 0xc5, 0xdb, 0x78, + 0x01, 0x62, 0x01, 0x11, 0xc2, 0x05, 0xe2, 0x12, 0xc2, 0x05, 0xee, 0xc5, + 0xd7, 0xb3, 0x01, 0x62, 0x19, 0x15, 0xc2, 0x05, 0xf8, 0x16, 0xc2, 0x06, + 0x11, 0xc3, 0xc5, 0x6f, 0x01, 0x62, 0x31, 0x08, 0xc2, 0x06, 0x23, 0xc4, + 0xdf, 0x9f, 0x01, 0x62, 0x41, 0x05, 0x42, 0x06, 0x2f, 0xcb, 0x1e, 0x89, + 0x00, 0x58, 0x09, 0x03, 0xc2, 0x06, 0x3b, 0x42, 0x07, 0xb2, 0xc2, 0x06, + 0x47, 0xc5, 0x33, 0x5d, 0x00, 0x59, 0xe1, 0xc8, 0x7d, 0xa4, 0x00, 0x5a, + 0xa8, 0x83, 0x00, 0x58, 0x2b, 0x02, 0x06, 0x53, 0x8b, 0x00, 0x58, 0x3b, + 0x02, 0x06, 0x5f, 0x97, 0x00, 0x58, 0x4b, 0x02, 0x06, 0x63, 0x18, 0xc2, + 0x06, 0x67, 0x87, 0x00, 0x58, 0x79, 0x91, 0x00, 0x58, 0x99, 0x0d, 0xc2, + 0x06, 0x71, 0x09, 0xc2, 0x06, 0x7b, 0x10, 0xc2, 0x06, 0x85, 0x05, 0xc2, + 0x06, 0x9b, 0x0c, 0xc2, 0x06, 0xa5, 0x16, 0xc2, 0x06, 0xaf, 0x06, 0xc2, + 0x06, 0xbd, 0x12, 0xc2, 0x06, 0xcb, 0x04, 0xc2, 0x06, 0xd5, 0xc2, 0x01, + 0xc3, 0x00, 0x59, 0x71, 0x1b, 0xc2, 0x06, 0xdf, 0x14, 0xc2, 0x06, 0xe9, + 0x0e, 0xc2, 0x06, 0xf9, 0x15, 0xc2, 0x07, 0x03, 0xc2, 0x00, 0xd0, 0x00, + 0x59, 0xc9, 0xc2, 0x01, 0x4a, 0x00, 0x5b, 0x88, 0x03, 0xc2, 0x07, 0x13, + 0x8b, 0x00, 0x59, 0xfb, 0x02, 0x07, 0x1f, 0x97, 0x00, 0x5a, 0x0b, 0x02, + 0x07, 0x23, 0x48, 0xb2, 0x2d, 0xc2, 0x07, 0x27, 0x87, 0x00, 0x5a, 0x39, + 0x91, 0x00, 0x5a, 0x58, 0xcd, 0x74, 0xcd, 0x00, 0x5a, 0xb1, 0xcd, 0x73, + 0x0d, 0x00, 0x5a, 0xb8, 0xc4, 0x15, 0xe7, 0x00, 0x5b, 0x31, 0xc3, 0x05, + 0x14, 0x00, 0x5b, 0x39, 0x16, 0xc2, 0x07, 0x35, 0x08, 0xc2, 0x07, 0x41, + 0x15, 0xc2, 0x07, 0x4d, 0xc5, 0x06, 0xdb, 0x00, 0x5b, 0x71, 0xc4, 0x26, + 0x78, 0x00, 0x5b, 0x78, 0x44, 0x05, 0x14, 0xc2, 0x07, 0x59, 0x46, 0x02, + 0xdd, 0x42, 0x07, 0x71, 0x0a, 0xc2, 0x07, 0x7d, 0x19, 0xc2, 0x07, 0x8f, + 0xc2, 0x00, 0xc4, 0x0f, 0x68, 0x52, 0x02, 0x07, 0x9f, 0x11, 0xc2, 0x07, + 0xa5, 0x0b, 0x42, 0x07, 0xb7, 0x00, 0x42, 0x07, 0xc9, 0xc2, 0x22, 0xcc, + 0x0f, 0x68, 0x33, 0x02, 0x07, 0xd5, 0xc4, 0x18, 0x10, 0x0f, 0x68, 0x3a, + 0x02, 0x07, 0xe2, 0x9b, 0x0f, 0x68, 0x8b, 0x02, 0x07, 0xef, 0x00, 0x42, + 0x07, 0xf5, 0xc2, 0x0d, 0x10, 0x0f, 0x68, 0x93, 0x02, 0x08, 0x01, 0x00, + 0x42, 0x08, 0x07, 0xc2, 0x02, 0xa0, 0x0f, 0x69, 0x7b, 0x02, 0x08, 0x13, + 0xc4, 0x02, 0xde, 0x0f, 0x69, 0x81, 0xc2, 0x00, 0xc4, 0x0f, 0x69, 0xba, + 0x02, 0x08, 0x19, 0xc3, 0x09, 0x9e, 0x0f, 0x69, 0x8b, 0x02, 0x08, 0x1f, + 0xc3, 0x0d, 0x14, 0x0f, 0x69, 0x90, 0xc2, 0x22, 0xcc, 0x0f, 0x69, 0x9b, + 0x02, 0x08, 0x25, 0xc4, 0x18, 0x10, 0x0f, 0x69, 0xa0, 0xc6, 0x72, 0x26, + 0x01, 0x01, 
0x21, 0xd9, 0x11, 0xc9, 0x01, 0x71, 0x58, 0x42, 0x06, 0x62, + 0xc2, 0x08, 0x2b, 0x47, 0x0f, 0x81, 0xc2, 0x08, 0x37, 0x42, 0x00, 0x6b, + 0xc2, 0x08, 0x4f, 0x08, 0xc2, 0x08, 0x59, 0xc4, 0x04, 0x1f, 0x0f, 0xa8, + 0x99, 0x4d, 0x7f, 0x32, 0xc2, 0x08, 0x65, 0xca, 0x6c, 0x80, 0x0f, 0xa2, + 0x80, 0xd9, 0x1d, 0x56, 0x01, 0x3d, 0xf1, 0x4f, 0x66, 0x75, 0x42, 0x08, + 0x71, 0xce, 0x1c, 0x92, 0x0b, 0x7f, 0x19, 0xc9, 0xa9, 0xea, 0x0b, 0x7f, + 0x10, 0x4c, 0x11, 0xe2, 0xc2, 0x08, 0x7d, 0x4a, 0x51, 0x89, 0xc2, 0x08, + 0x8f, 0x47, 0x02, 0x0e, 0x42, 0x08, 0x9b, 0x46, 0xc9, 0x58, 0xc2, 0x08, + 0xf1, 0x4c, 0x86, 0x0d, 0x42, 0x09, 0x01, 0x47, 0x34, 0x2f, 0xc2, 0x09, + 0x0d, 0x4d, 0x29, 0xb9, 0xc2, 0x09, 0x22, 0x4f, 0x0b, 0x17, 0x42, 0x09, + 0x5d, 0x47, 0xc8, 0x07, 0xc2, 0x09, 0x98, 0x48, 0xb6, 0x6a, 0x42, 0x09, + 0xb7, 0x47, 0x34, 0x2f, 0xc2, 0x09, 0xd0, 0x47, 0x02, 0x0e, 0x42, 0x09, + 0xda, 0x15, 0xc2, 0x0a, 0x3c, 0x4b, 0x52, 0x39, 0x42, 0x0a, 0x48, 0x47, + 0x02, 0x0e, 0xc2, 0x0a, 0xbb, 0x48, 0x56, 0x9a, 0x42, 0x0b, 0x18, 0xcd, + 0x77, 0x6d, 0x00, 0xe3, 0xf9, 0xc6, 0x77, 0x74, 0x00, 0xe3, 0xf0, 0x8a, + 0x00, 0xe3, 0xb9, 0x98, 0x00, 0xe3, 0xb1, 0x84, 0x00, 0xe3, 0xa9, 0xc2, + 0x02, 0x10, 0x00, 0xe3, 0xa0, 0x91, 0x00, 0xe3, 0x99, 0x87, 0x00, 0xe3, + 0x71, 0x97, 0x00, 0xe3, 0x49, 0x8b, 0x00, 0xe3, 0x21, 0x83, 0x00, 0xe2, + 0xd2, 0x02, 0x0b, 0x2a, 0xc2, 0x01, 0xa3, 0x00, 0xe3, 0x91, 0x90, 0x00, + 0xe3, 0x89, 0xc2, 0x04, 0xcd, 0x00, 0xe3, 0x81, 0x92, 0x00, 0xe3, 0x78, + 0x9b, 0x00, 0xe3, 0x69, 0xc2, 0x1b, 0x88, 0x00, 0xe3, 0x61, 0x86, 0x00, + 0xe3, 0x59, 0x85, 0x00, 0xe3, 0x50, 0x94, 0x00, 0xe3, 0x41, 0xc2, 0x16, + 0x59, 0x00, 0xe3, 0x39, 0x8a, 0x00, 0xe3, 0x31, 0x95, 0x00, 0xe3, 0x28, + 0x03, 0xc2, 0x0b, 0x2e, 0x8e, 0x00, 0xe2, 0xf1, 0xc2, 0x00, 0x75, 0x00, + 0xe2, 0xe9, 0x89, 0x00, 0xe2, 0xe1, 0x96, 0x00, 0xe2, 0xd8, 0xc4, 0x18, + 0x10, 0x00, 0xe2, 0xb9, 0xc2, 0x22, 0xcc, 0x00, 0xe2, 0xb0, 0xc3, 0x0d, + 0x14, 0x00, 0xe2, 0xa9, 0xc3, 0x09, 0x9e, 0x00, 0xe2, 0xa0, 0xc4, 0x02, + 0xde, 0x00, 0xe2, 0x99, 0xc2, 0x02, 0xa0, 0x00, 0xe2, 0x90, 0x46, 0x01, + 0xfc, 0xc2, 0x0b, 0x3e, 0xcd, 0x56, 0x88, 0x01, 0x5d, 0xe0, 0xc9, 0xaa, + 0x56, 0x00, 0xb4, 0xc9, 0xc5, 0xd7, 0xa9, 0x00, 0xb4, 0xa9, 0xc5, 0xcc, + 0x96, 0x00, 0xb4, 0x98, 0xc3, 0x09, 0x38, 0x00, 0xb4, 0xc1, 0xc6, 0xcc, + 0x95, 0x00, 0xb4, 0xa0, 0xc7, 0xc7, 0x82, 0x00, 0xb4, 0xb9, 0x94, 0x00, + 0xb4, 0x91, 0xc3, 0x04, 0xa7, 0x00, 0xb4, 0x30, 0x94, 0x00, 0xb4, 0xb1, + 0xc2, 0x1b, 0x88, 0x00, 0xb4, 0x88, 0xc5, 0xd8, 0xad, 0x00, 0xb4, 0x71, + 0xc3, 0x14, 0xa7, 0x00, 0xb4, 0x20, 0xc6, 0xd1, 0x15, 0x00, 0xb4, 0x69, + 0xc3, 0x00, 0x44, 0x00, 0xb4, 0x28, 0xc4, 0xe2, 0xef, 0x00, 0xb4, 0x51, + 0xc3, 0x1f, 0x48, 0x00, 0xb4, 0x48, 0xc3, 0x00, 0x49, 0x08, 0x24, 0x01, + 0x83, 0x08, 0x24, 0xd8, 0xc2, 0x00, 0xd0, 0x08, 0x24, 0x29, 0xc3, 0xb8, + 0xac, 0x08, 0x24, 0x78, 0xc3, 0x0e, 0x66, 0x08, 0x24, 0x31, 0xc2, 0x00, + 0xd0, 0x08, 0x24, 0x50, 0x83, 0x08, 0x24, 0x41, 0xc4, 0xdf, 0xb7, 0x08, + 0x24, 0x48, 0x87, 0x08, 0x24, 0xe0, 0x91, 0x08, 0x24, 0xe8, 0xc2, 0x02, + 0xa0, 0x08, 0x25, 0x11, 0xc4, 0x02, 0xde, 0x08, 0x25, 0x18, 0xc3, 0x09, + 0x9e, 0x08, 0x25, 0x21, 0xc3, 0x0d, 0x14, 0x08, 0x25, 0x28, 0xc2, 0x22, + 0xcc, 0x08, 0x25, 0x31, 0xc4, 0x18, 0x10, 0x08, 0x25, 0x38, 0x8b, 0x08, + 0x25, 0x8b, 0x02, 0x0b, 0x4a, 0x8a, 0x08, 0x25, 0x98, 0x0a, 0xc2, 0x0b, + 0x4e, 0xc2, 0x00, 0x74, 0x08, 0x25, 0xc0, 0x83, 0x08, 0x25, 0xc9, 0xc2, + 0x19, 0x2c, 0x08, 0x25, 0xd0, 0x83, 0x08, 0x25, 0xe1, 0xc2, 0x19, 0x2c, + 0x08, 0x25, 0xf1, 0xc2, 0x00, 0xd0, 0x08, 0x26, 0x80, 0xc2, 0x00, 0x74, + 0x08, 0x26, 
0x18, 0x83, 0x08, 0x26, 0x31, 0xc2, 0x00, 0xd0, 0x08, 0x26, + 0x38, 0x83, 0x08, 0x26, 0x41, 0x15, 0x42, 0x0b, 0x64, 0x83, 0x08, 0x26, + 0x91, 0xc2, 0x00, 0xd0, 0x08, 0x26, 0x98, 0x8b, 0x08, 0x26, 0xcb, 0x02, + 0x0b, 0x6e, 0x8a, 0x08, 0x26, 0xd8, 0x0a, 0xc2, 0x0b, 0x72, 0xc2, 0x00, + 0x74, 0x08, 0x27, 0x00, 0x83, 0x08, 0x27, 0x09, 0xc2, 0x19, 0x2c, 0x08, + 0x27, 0x10, 0x83, 0x08, 0x27, 0x21, 0xc2, 0x19, 0x2c, 0x08, 0x27, 0x31, + 0xc2, 0x00, 0xd0, 0x08, 0x27, 0xc0, 0xc2, 0x00, 0x74, 0x08, 0x27, 0x58, + 0x83, 0x08, 0x27, 0x71, 0xc2, 0x00, 0xd0, 0x08, 0x27, 0x78, 0x83, 0x08, + 0x27, 0x81, 0x15, 0x42, 0x0b, 0x88, 0x83, 0x08, 0x27, 0xd1, 0xc2, 0x00, + 0xd0, 0x08, 0x27, 0xd8, 0xc2, 0x14, 0x49, 0x0e, 0x7e, 0x19, 0xc3, 0x9c, + 0x8d, 0x0e, 0x7a, 0xe1, 0xc6, 0xcd, 0x49, 0x0e, 0x7a, 0x90, 0xc8, 0xbb, + 0x92, 0x0e, 0x7c, 0x81, 0xc8, 0x93, 0xed, 0x0e, 0x7b, 0x80, 0xcf, 0x69, + 0x9f, 0x0e, 0x7a, 0xc8, 0xd0, 0x5f, 0x82, 0x0e, 0x7b, 0xa9, 0xc6, 0xcd, + 0x85, 0x0e, 0x7b, 0x68, 0x00, 0x42, 0x0b, 0x92, 0xc2, 0x25, 0xa1, 0x0e, + 0x7c, 0x09, 0xc2, 0x14, 0x49, 0x0e, 0x7a, 0x82, 0x02, 0x0b, 0xa2, 0x45, + 0xd6, 0xfa, 0xc2, 0x0b, 0xa8, 0xc4, 0xe1, 0xc7, 0x0e, 0x7c, 0x33, 0x02, + 0x0b, 0xcc, 0xc6, 0xce, 0x03, 0x0e, 0x7a, 0xb2, 0x02, 0x0b, 0xd0, 0x00, + 0x42, 0x0b, 0xd4, 0x4d, 0x75, 0xe7, 0xc2, 0x0b, 0xe0, 0x47, 0x87, 0x3a, + 0xc2, 0x0b, 0xf8, 0x16, 0xc2, 0x0c, 0x04, 0xc8, 0x4e, 0x4b, 0x0e, 0x7b, + 0x91, 0xc9, 0xa9, 0x48, 0x0e, 0x7b, 0x88, 0x47, 0x87, 0x3a, 0xc2, 0x0c, + 0x10, 0xc7, 0xc8, 0x69, 0x0e, 0x7d, 0x40, 0xc7, 0x2d, 0x19, 0x0e, 0x7a, + 0xe9, 0xc6, 0xcb, 0xdb, 0x0e, 0x7a, 0xa8, 0xcb, 0x93, 0x7d, 0x0e, 0x7b, + 0x51, 0xc8, 0x4e, 0x4b, 0x0e, 0x7b, 0x49, 0xc9, 0xa9, 0x48, 0x0e, 0x7b, + 0x41, 0xc8, 0xbf, 0x6a, 0x0e, 0x7b, 0x38, 0xc8, 0xbf, 0x8a, 0x0e, 0x7b, + 0x11, 0xc4, 0xca, 0xab, 0x0e, 0x7a, 0xf8, 0xc4, 0x78, 0xdc, 0x0e, 0x7a, + 0x03, 0x02, 0x0c, 0x22, 0xc5, 0xdb, 0xb4, 0x0e, 0x79, 0x49, 0xc6, 0xcd, + 0xeb, 0x0e, 0x79, 0x40, 0xca, 0x9b, 0x26, 0x0e, 0x79, 0xf9, 0xc6, 0xd2, + 0xad, 0x0e, 0x79, 0xc2, 0x02, 0x0c, 0x28, 0xc9, 0xb3, 0x56, 0x0e, 0x79, + 0xe9, 0xd4, 0x3e, 0xf8, 0x0e, 0x79, 0xa0, 0xc5, 0xbe, 0xad, 0x0e, 0x79, + 0xe1, 0xc6, 0x6d, 0xaa, 0x0e, 0x79, 0x19, 0x45, 0xda, 0x15, 0x42, 0x0c, + 0x2e, 0xce, 0x38, 0x5a, 0x0e, 0x79, 0xd9, 0xc4, 0xe0, 0xab, 0x0e, 0x79, + 0x59, 0xd3, 0x42, 0x68, 0x0e, 0x78, 0xd1, 0x49, 0xa9, 0xd8, 0x42, 0x0c, + 0x3a, 0xc7, 0xc5, 0xde, 0x0e, 0x79, 0xd1, 0xc7, 0xca, 0x5a, 0x0e, 0x79, + 0xa9, 0x90, 0x0e, 0x79, 0x08, 0x06, 0xc2, 0x0c, 0x46, 0x46, 0x75, 0x93, + 0x42, 0x0c, 0x55, 0xc8, 0x3f, 0x04, 0x0e, 0x79, 0x99, 0x07, 0x42, 0x0c, + 0x5f, 0xc5, 0xd6, 0x50, 0x0e, 0x79, 0x61, 0xc3, 0xe5, 0x72, 0x0e, 0x79, + 0x10, 0xc6, 0xc2, 0x7a, 0x0e, 0x78, 0xf9, 0x46, 0xcd, 0xdf, 0x42, 0x0c, + 0x6b, 0x15, 0xc2, 0x0c, 0x77, 0x43, 0x01, 0x55, 0x42, 0x0c, 0x83, 0x43, + 0x3d, 0xd0, 0xc2, 0x0c, 0x8f, 0x43, 0x01, 0x55, 0x42, 0x0c, 0x9b, 0x43, + 0x01, 0x55, 0xc2, 0x0c, 0xa7, 0x4d, 0x78, 0xd9, 0x42, 0x0c, 0xb3, 0xc5, + 0x40, 0xe7, 0x08, 0xd1, 0xc9, 0xc4, 0x1e, 0x97, 0x08, 0xd1, 0xa0, 0xce, + 0x1e, 0x74, 0x08, 0xd1, 0xb9, 0xc5, 0x1e, 0x8f, 0x08, 0xd1, 0xaa, 0x02, + 0x0c, 0xbf, 0xc2, 0x02, 0x41, 0x08, 0xd1, 0xf1, 0xc2, 0x00, 0xdb, 0x08, + 0xd1, 0xe9, 0xc2, 0x00, 0x39, 0x08, 0xd1, 0xe1, 0xc2, 0x19, 0x2c, 0x08, + 0xd1, 0xd8, 0xc2, 0x00, 0xd0, 0x08, 0xd1, 0x31, 0x83, 0x08, 0xd1, 0x28, + 0xc2, 0x00, 0xd0, 0x08, 0xd0, 0xb9, 0x83, 0x08, 0xd0, 0xb0, 0xc2, 0x00, + 0xd0, 0x08, 0xd1, 0x21, 0x83, 0x08, 0xd1, 0x18, 0xc2, 0x00, 0xd0, 0x08, + 0xd0, 0xa9, 0x83, 0x08, 0xd0, 0xa0, 0x97, 0x08, 0xd0, 0x41, 0x8b, 0x08, + 0xd0, 0x38, 
0x87, 0x08, 0xd0, 0x28, 0x87, 0x08, 0xd0, 0x10, 0xc9, 0xaf, + 0x1e, 0x01, 0x51, 0x09, 0xc5, 0xd5, 0x6a, 0x01, 0x51, 0x00, 0x03, 0xc2, + 0x0c, 0xc5, 0x12, 0xc2, 0x0c, 0xd4, 0xc5, 0xd5, 0x56, 0x05, 0x4e, 0x31, + 0x0e, 0xc2, 0x0c, 0xe0, 0xc5, 0xdb, 0x91, 0x05, 0x4e, 0x21, 0xcd, 0x79, + 0x9c, 0x05, 0x4e, 0xf1, 0xc9, 0xaa, 0xe6, 0x05, 0x4e, 0xf8, 0xc7, 0xc5, + 0x13, 0x05, 0x4e, 0x79, 0xc3, 0x1f, 0x62, 0x05, 0x4e, 0x00, 0xc2, 0x01, + 0x30, 0x05, 0x4c, 0x93, 0x02, 0x0c, 0xea, 0xc2, 0x00, 0xd0, 0x05, 0x4d, + 0x91, 0xc2, 0x0d, 0xf6, 0x05, 0x4d, 0x8b, 0x02, 0x0c, 0xf0, 0xc2, 0x01, + 0x4a, 0x05, 0x4d, 0x71, 0xc2, 0x00, 0xdb, 0x05, 0x4d, 0x69, 0xc2, 0x00, + 0x39, 0x05, 0x4d, 0x5b, 0x02, 0x0c, 0xf6, 0xc2, 0x19, 0x2c, 0x05, 0x4d, + 0x51, 0xc2, 0x01, 0xc3, 0x05, 0x4d, 0x49, 0xc2, 0x01, 0x5d, 0x05, 0x4d, + 0x3b, 0x02, 0x0c, 0xfc, 0xc2, 0x00, 0xb0, 0x05, 0x4d, 0x2b, 0x02, 0x0d, + 0x02, 0x10, 0xc2, 0x0d, 0x06, 0x06, 0xc2, 0x0d, 0x1f, 0x16, 0xc2, 0x0d, + 0x2f, 0xc2, 0x25, 0x3b, 0x05, 0x4c, 0xbb, 0x02, 0x0d, 0x3f, 0xc2, 0x00, + 0x64, 0x05, 0x4c, 0xab, 0x02, 0x0d, 0x45, 0xc2, 0x02, 0x2b, 0x05, 0x4c, + 0x7b, 0x02, 0x0d, 0x4b, 0x91, 0x05, 0x4c, 0x71, 0x83, 0x05, 0x4c, 0x23, + 0x02, 0x0d, 0x4f, 0x87, 0x05, 0x4c, 0x61, 0x97, 0x05, 0x4c, 0x41, 0x8b, + 0x05, 0x4c, 0x32, 0x02, 0x0d, 0x53, 0xc4, 0x02, 0xde, 0x05, 0x4e, 0x99, + 0xc2, 0x02, 0xa0, 0x05, 0x4e, 0x90, 0xc3, 0x09, 0x9e, 0x05, 0x4e, 0xa1, + 0xc3, 0x0d, 0x14, 0x05, 0x4e, 0xa8, 0xc2, 0x22, 0xcc, 0x05, 0x4e, 0xb1, + 0xc4, 0x18, 0x10, 0x05, 0x4e, 0xb8, 0x03, 0xc2, 0x0d, 0x5d, 0xc5, 0x0d, + 0xe4, 0x05, 0x4d, 0xa8, 0xc7, 0xc5, 0x91, 0x05, 0x4d, 0xc8, 0xc6, 0xcb, + 0xb1, 0x05, 0x4d, 0xb8, 0xc5, 0xda, 0x8d, 0x05, 0x4d, 0x98, 0xc5, 0x00, + 0x2c, 0x01, 0x2c, 0xeb, 0x02, 0x0d, 0x69, 0xc4, 0x00, 0x49, 0x01, 0x2c, + 0xc2, 0x02, 0x0d, 0x72, 0xc5, 0x00, 0x2c, 0x01, 0x2c, 0xb9, 0xc4, 0x00, + 0x49, 0x01, 0x2c, 0xb0, 0x1b, 0xc2, 0x0d, 0x78, 0x0c, 0xc2, 0x0d, 0x8d, + 0x14, 0xc2, 0x0d, 0xa9, 0x09, 0xc2, 0x0d, 0xcc, 0x1c, 0xc2, 0x0d, 0xf3, + 0x04, 0xc2, 0x0e, 0x1a, 0x06, 0xc2, 0x0e, 0x3d, 0x8b, 0x05, 0x0b, 0xfb, + 0x02, 0x0e, 0x60, 0x83, 0x05, 0x0c, 0x2b, 0x02, 0x0e, 0x73, 0x97, 0x05, + 0x0c, 0x9b, 0x02, 0x0e, 0x7b, 0x91, 0x05, 0x0c, 0x63, 0x02, 0x0e, 0x95, + 0x87, 0x05, 0x0c, 0x7a, 0x02, 0x0e, 0xa1, 0x0c, 0xc2, 0x0e, 0xa9, 0x9b, + 0x05, 0x1f, 0xc3, 0x02, 0x0e, 0xc5, 0x97, 0x05, 0x1f, 0x93, 0x02, 0x0e, + 0xd8, 0x91, 0x05, 0x1f, 0x73, 0x02, 0x0e, 0xf2, 0x8b, 0x05, 0x1f, 0x12, + 0x02, 0x0e, 0xfe, 0x9b, 0x05, 0x20, 0xa3, 0x02, 0x0f, 0x11, 0x97, 0x05, + 0x20, 0x73, 0x02, 0x0f, 0x24, 0x91, 0x05, 0x20, 0x53, 0x02, 0x0f, 0x3e, + 0x8b, 0x05, 0x1f, 0xf2, 0x02, 0x0f, 0x4a, 0x9b, 0x05, 0x1e, 0xe3, 0x02, + 0x0f, 0x5d, 0x97, 0x05, 0x1e, 0xb3, 0x02, 0x0f, 0x70, 0x87, 0x05, 0x1e, + 0x93, 0x02, 0x0f, 0x8a, 0x91, 0x05, 0x1e, 0x7b, 0x02, 0x0f, 0x92, 0x83, + 0x05, 0x1e, 0x43, 0x02, 0x0f, 0x9e, 0x14, 0x42, 0x0f, 0xaa, 0x0a, 0xc2, + 0x0f, 0xcd, 0x15, 0xc2, 0x0f, 0xf0, 0x8b, 0x05, 0x18, 0x5b, 0x02, 0x10, + 0x1a, 0x83, 0x05, 0x18, 0x93, 0x02, 0x10, 0x2d, 0x97, 0x05, 0x19, 0x03, + 0x02, 0x10, 0x39, 0x91, 0x05, 0x18, 0xcb, 0x02, 0x10, 0x53, 0x87, 0x05, + 0x18, 0xe3, 0x02, 0x10, 0x5f, 0x9b, 0x05, 0x19, 0x32, 0x02, 0x10, 0x67, + 0x0a, 0xc2, 0x10, 0x7a, 0x9b, 0x05, 0x16, 0x63, 0x02, 0x10, 0x9d, 0x87, + 0x05, 0x16, 0x13, 0x02, 0x10, 0xb0, 0x97, 0x05, 0x16, 0x33, 0x02, 0x10, + 0xb8, 0x8b, 0x05, 0x15, 0x83, 0x02, 0x10, 0xd2, 0x83, 0x05, 0x15, 0xc3, + 0x02, 0x10, 0xe5, 0x91, 0x05, 0x15, 0xfa, 0x02, 0x10, 0xf1, 0x87, 0x05, + 0x15, 0x03, 0x02, 0x10, 0xfd, 0x91, 0x05, 0x14, 0xeb, 0x02, 0x11, 0x05, + 0x97, 0x05, 
0x15, 0x23, 0x02, 0x11, 0x11, 0x83, 0x05, 0x14, 0xb3, 0x02, + 0x11, 0x2b, 0x8b, 0x05, 0x14, 0x7b, 0x02, 0x11, 0x37, 0x1c, 0xc2, 0x11, + 0x4a, 0x0a, 0xc2, 0x11, 0x74, 0x9b, 0x05, 0x15, 0x52, 0x02, 0x11, 0x97, + 0x87, 0x05, 0x14, 0x5b, 0x02, 0x11, 0xaa, 0x91, 0x05, 0x14, 0x43, 0x02, + 0x11, 0xb2, 0x97, 0x05, 0x00, 0xab, 0x02, 0x11, 0xba, 0x83, 0x05, 0x14, + 0x12, 0x02, 0x11, 0xc1, 0x87, 0x05, 0x13, 0xf3, 0x02, 0x11, 0xcd, 0x1a, + 0xc2, 0x11, 0xd5, 0x0b, 0xc2, 0x11, 0xfa, 0x83, 0x05, 0x13, 0x9b, 0x02, + 0x12, 0x05, 0xc2, 0x01, 0xba, 0x05, 0x13, 0xbb, 0x02, 0x12, 0x11, 0x91, + 0x05, 0x13, 0xdb, 0x02, 0x12, 0x1d, 0x0f, 0xc2, 0x12, 0x29, 0x10, 0xc2, + 0x12, 0x4c, 0x0e, 0x42, 0x12, 0x69, 0x8b, 0x05, 0x23, 0x9b, 0x02, 0x12, + 0x93, 0x97, 0x05, 0x24, 0x1b, 0x02, 0x12, 0xa6, 0x91, 0x05, 0x23, 0xfb, + 0x02, 0x12, 0xc0, 0x9b, 0x05, 0x24, 0x4a, 0x02, 0x12, 0xcc, 0x9b, 0x05, + 0x23, 0x6b, 0x02, 0x12, 0xdf, 0x8b, 0x05, 0x22, 0xfb, 0x02, 0x12, 0xf2, + 0x91, 0x05, 0x23, 0x4b, 0x02, 0x13, 0x05, 0xc2, 0x01, 0xba, 0x05, 0x23, + 0x32, 0x02, 0x13, 0x11, 0x09, 0xc2, 0x13, 0x15, 0x8b, 0x05, 0x05, 0x83, + 0x02, 0x13, 0x3a, 0x83, 0x05, 0x05, 0xbb, 0x02, 0x13, 0x4d, 0x97, 0x05, + 0x06, 0x2b, 0x02, 0x13, 0x59, 0x91, 0x05, 0x05, 0xfb, 0x02, 0x13, 0x73, + 0x87, 0x05, 0x06, 0x13, 0x02, 0x13, 0x7f, 0x9b, 0x05, 0x06, 0x5a, 0x02, + 0x13, 0x83, 0x96, 0x05, 0x00, 0x03, 0x02, 0x13, 0x8f, 0x9a, 0x05, 0x00, + 0x09, 0x92, 0x05, 0x00, 0x19, 0x87, 0x05, 0x00, 0x32, 0x02, 0x13, 0x95, + 0x96, 0x05, 0x00, 0x41, 0x9a, 0x05, 0x00, 0x49, 0x92, 0x05, 0x00, 0x58, + 0x9a, 0x05, 0x00, 0x61, 0x92, 0x05, 0x00, 0x70, 0x96, 0x05, 0x00, 0x79, + 0x9a, 0x05, 0x00, 0x81, 0x92, 0x05, 0x00, 0x90, 0x9a, 0x05, 0x00, 0x98, + 0x8b, 0x05, 0x00, 0xc3, 0x02, 0x13, 0xa1, 0x83, 0x05, 0x01, 0x03, 0x02, + 0x13, 0xb4, 0x97, 0x05, 0x01, 0x73, 0x02, 0x13, 0xc0, 0x91, 0x05, 0x01, + 0x3b, 0x02, 0x13, 0xda, 0x87, 0x05, 0x01, 0x53, 0x02, 0x13, 0xe6, 0x9b, + 0x05, 0x01, 0xa3, 0x02, 0x13, 0xee, 0x04, 0x42, 0x14, 0x01, 0x8b, 0x05, + 0x01, 0xd3, 0x02, 0x14, 0x2b, 0x83, 0x05, 0x02, 0x0b, 0x02, 0x14, 0x3e, + 0x97, 0x05, 0x02, 0x63, 0x02, 0x14, 0x4a, 0x91, 0x05, 0x02, 0x43, 0x02, + 0x14, 0x64, 0x9b, 0x05, 0x02, 0x92, 0x02, 0x14, 0x70, 0x8b, 0x05, 0x06, + 0x7b, 0x02, 0x14, 0x83, 0x83, 0x05, 0x06, 0x9b, 0x02, 0x14, 0x8f, 0x91, + 0x05, 0x06, 0xb3, 0x02, 0x14, 0x9b, 0x97, 0x05, 0x06, 0xd3, 0x02, 0x14, + 0xa3, 0x9b, 0x05, 0x07, 0x02, 0x02, 0x14, 0xb6, 0x8b, 0x05, 0x07, 0x23, + 0x02, 0x14, 0xc2, 0x83, 0x05, 0x07, 0x63, 0x02, 0x14, 0xd5, 0x91, 0x05, + 0x07, 0x83, 0x02, 0x14, 0xe1, 0x07, 0xc2, 0x14, 0xed, 0x97, 0x05, 0x07, + 0xb3, 0x02, 0x14, 0xf5, 0x9b, 0x05, 0x07, 0xe2, 0x02, 0x15, 0x08, 0x8b, + 0x05, 0x08, 0x13, 0x02, 0x15, 0x1b, 0x83, 0x05, 0x08, 0x4b, 0x02, 0x15, + 0x2e, 0x97, 0x05, 0x08, 0xb3, 0x02, 0x15, 0x3a, 0x91, 0x05, 0x08, 0x7b, + 0x02, 0x15, 0x54, 0x87, 0x05, 0x08, 0x93, 0x02, 0x15, 0x60, 0x06, 0x42, + 0x15, 0x68, 0x8b, 0x05, 0x08, 0xe3, 0x02, 0x15, 0x8b, 0x83, 0x05, 0x09, + 0x1b, 0x02, 0x15, 0x9e, 0x97, 0x05, 0x09, 0x93, 0x02, 0x15, 0xaa, 0x91, + 0x05, 0x09, 0x5b, 0x02, 0x15, 0xc4, 0x87, 0x05, 0x09, 0x72, 0x02, 0x15, + 0xd0, 0x8b, 0x05, 0x0d, 0xcb, 0x02, 0x15, 0xd8, 0x83, 0x05, 0x0e, 0x0b, + 0x02, 0x15, 0xeb, 0x97, 0x05, 0x0e, 0x83, 0x02, 0x15, 0xf7, 0x91, 0x05, + 0x0e, 0x4b, 0x02, 0x16, 0x11, 0x87, 0x05, 0x0e, 0x63, 0x02, 0x16, 0x1d, + 0x9b, 0x05, 0x0e, 0xb2, 0x02, 0x16, 0x25, 0x8b, 0x05, 0x0e, 0xe3, 0x02, + 0x16, 0x38, 0x83, 0x05, 0x0f, 0x23, 0x02, 0x16, 0x4b, 0x97, 0x05, 0x0f, + 0xa3, 0x02, 0x16, 0x57, 0x91, 0x05, 0x0f, 0x63, 0x02, 0x16, 0x71, 0x87, + 0x05, 0x0f, 
0x83, 0x02, 0x16, 0x7d, 0x09, 0x42, 0x16, 0x89, 0x8b, 0x05, + 0x0f, 0xd3, 0x02, 0x16, 0xac, 0x83, 0x05, 0x10, 0x0b, 0x02, 0x16, 0xbf, + 0x97, 0x05, 0x10, 0x83, 0x02, 0x16, 0xcb, 0x91, 0x05, 0x10, 0x43, 0x02, + 0x16, 0xe5, 0x87, 0x05, 0x10, 0x62, 0x02, 0x16, 0xf1, 0x8b, 0x05, 0x24, + 0x8b, 0x02, 0x16, 0xfd, 0xc2, 0x1d, 0xc1, 0x05, 0x24, 0xd0, 0xc2, 0x00, + 0x8d, 0x05, 0x24, 0x91, 0x87, 0x05, 0x26, 0x30, 0x1b, 0xc2, 0x17, 0x01, + 0xc3, 0xe4, 0xe8, 0x05, 0x25, 0xa1, 0xc3, 0xa9, 0x68, 0x05, 0x26, 0x28, + 0x9b, 0x05, 0x25, 0xe3, 0x02, 0x17, 0x0d, 0xc3, 0xe4, 0xe5, 0x05, 0x25, + 0xe9, 0xc2, 0x00, 0x7e, 0x05, 0x25, 0xf1, 0xc2, 0x01, 0x7f, 0x05, 0x26, + 0x18, 0xc2, 0x00, 0xba, 0x05, 0x24, 0xa9, 0x0a, 0x42, 0x17, 0x15, 0x09, + 0xc2, 0x17, 0x2b, 0xc2, 0x02, 0x37, 0x05, 0x24, 0xb9, 0x83, 0x05, 0x25, + 0x09, 0xc2, 0x01, 0xbb, 0x05, 0x25, 0xb0, 0x8b, 0x05, 0x24, 0xc1, 0xc2, + 0x00, 0x11, 0x05, 0x24, 0xe0, 0x1a, 0xc2, 0x17, 0x37, 0xc2, 0x00, 0xa2, + 0x05, 0x25, 0x68, 0xc3, 0x02, 0xaa, 0x05, 0x24, 0xd9, 0xc2, 0x00, 0x33, + 0x05, 0x25, 0x28, 0x91, 0x05, 0x24, 0xe9, 0xc2, 0x00, 0x8d, 0x05, 0x25, + 0x70, 0xc2, 0x00, 0xa4, 0x05, 0x24, 0xf1, 0xc2, 0x63, 0xd6, 0x05, 0x25, + 0x60, 0xc2, 0x00, 0xfe, 0x05, 0x25, 0x01, 0x97, 0x05, 0x25, 0x40, 0x17, + 0xc2, 0x17, 0x49, 0xc2, 0x01, 0xbb, 0x05, 0x25, 0x59, 0x83, 0x05, 0x25, + 0x91, 0xc4, 0xdf, 0x23, 0x05, 0x26, 0x20, 0xc3, 0x66, 0x20, 0x05, 0x25, + 0x21, 0x97, 0x05, 0x25, 0xc8, 0x0c, 0xc2, 0x17, 0x51, 0x91, 0x05, 0x25, + 0x98, 0xc2, 0x00, 0x33, 0x05, 0x25, 0x79, 0xc2, 0x02, 0x37, 0x05, 0x25, + 0x88, 0xd6, 0x30, 0x64, 0x08, 0x75, 0x88, 0xcf, 0x33, 0xad, 0x08, 0x75, + 0x80, 0x96, 0x08, 0x75, 0x49, 0x99, 0x08, 0x75, 0x31, 0xc2, 0x17, 0xb6, + 0x08, 0x74, 0xb9, 0xc3, 0x6b, 0x53, 0x08, 0x74, 0x00, 0xc2, 0x0c, 0x42, + 0x08, 0x75, 0x39, 0xc2, 0x00, 0xd0, 0x08, 0x74, 0x48, 0xc3, 0x48, 0x60, + 0x08, 0x74, 0xf1, 0xc2, 0x0f, 0x9b, 0x08, 0x74, 0xe8, 0xcf, 0x6b, 0x25, + 0x08, 0x74, 0xd8, 0xc4, 0xdf, 0xa3, 0x08, 0x74, 0xc1, 0x83, 0x08, 0x74, + 0x50, 0x87, 0x08, 0x74, 0xb1, 0x83, 0x08, 0x74, 0x7a, 0x02, 0x17, 0x61, + 0x83, 0x08, 0x74, 0xa9, 0xc2, 0x01, 0x7f, 0x08, 0x74, 0x20, 0x86, 0x08, + 0x74, 0xa1, 0x8e, 0x08, 0x74, 0x58, 0xc2, 0x01, 0x9d, 0x08, 0x74, 0x99, + 0xc3, 0x11, 0xef, 0x08, 0x74, 0x91, 0xc2, 0x00, 0x74, 0x08, 0x74, 0x89, + 0x87, 0x08, 0x74, 0x28, 0xc2, 0x00, 0xd0, 0x08, 0x74, 0x71, 0x83, 0x08, + 0x74, 0x68, 0x0a, 0xc2, 0x17, 0x65, 0xc2, 0x03, 0x4e, 0x08, 0x74, 0x30, + 0xc2, 0x01, 0x7f, 0x08, 0x74, 0x19, 0x87, 0x08, 0x74, 0x10, 0xc9, 0x1c, + 0x63, 0x00, 0x04, 0xa1, 0xc3, 0x16, 0x32, 0x70, 0x03, 0xf8, 0x83, 0x08, + 0xd5, 0xf9, 0x91, 0x08, 0xd5, 0xf1, 0x8b, 0x08, 0xd5, 0xe9, 0x87, 0x08, + 0xd5, 0xe0, 0x9b, 0x00, 0xc5, 0xfb, 0x02, 0x17, 0x71, 0x83, 0x00, 0xa7, + 0xaa, 0x02, 0x17, 0x77, 0x19, 0xc2, 0x17, 0x7b, 0x83, 0x00, 0xa8, 0xab, + 0x02, 0x17, 0x94, 0x91, 0x00, 0xa8, 0x9b, 0x02, 0x17, 0x9c, 0x8b, 0x00, + 0xa8, 0x8b, 0x02, 0x17, 0xa4, 0x87, 0x00, 0xa8, 0x80, 0x9b, 0x00, 0xc5, + 0xf1, 0x4c, 0x86, 0x01, 0xc2, 0x17, 0xa8, 0x91, 0x00, 0xa7, 0x90, 0x83, + 0x00, 0xa8, 0x03, 0x02, 0x17, 0xc0, 0x87, 0x00, 0xa7, 0xb1, 0x8b, 0x00, + 0xa7, 0xc3, 0x02, 0x17, 0xc4, 0x91, 0x00, 0xa7, 0xe2, 0x02, 0x17, 0xc8, + 0x8b, 0x00, 0xa7, 0x80, 0x47, 0xc6, 0x8d, 0xc2, 0x17, 0xcc, 0x9b, 0x00, + 0xc5, 0xe1, 0x46, 0xd3, 0x4f, 0xc2, 0x17, 0xd6, 0x83, 0x00, 0xa6, 0x42, + 0x02, 0x18, 0x02, 0x91, 0x00, 0xc6, 0x53, 0x02, 0x18, 0x06, 0x8b, 0x00, + 0xc6, 0x33, 0x02, 0x18, 0x0a, 0x87, 0x00, 0xa6, 0x49, 0x83, 0x00, 0xa6, + 0x5a, 0x02, 0x18, 0x0e, 0x9b, 0x00, 0xc5, 0xd9, 0x91, 0x00, 0xa6, 0x28, + 0x83, 0x00, 
0xb3, 0xab, 0x02, 0x18, 0x12, 0x91, 0x00, 0xb3, 0x9b, 0x02, + 0x18, 0x16, 0x8b, 0x00, 0xb3, 0x8a, 0x02, 0x18, 0x1a, 0x83, 0x00, 0xac, + 0x9b, 0x02, 0x18, 0x1e, 0x91, 0x00, 0xac, 0x8b, 0x02, 0x18, 0x29, 0x8b, + 0x00, 0xac, 0x7a, 0x02, 0x18, 0x2d, 0xc4, 0x4b, 0x20, 0x00, 0xab, 0xe1, + 0xc4, 0xe1, 0x1f, 0x00, 0xab, 0xda, 0x02, 0x18, 0x31, 0x8b, 0x00, 0xab, + 0x0b, 0x02, 0x18, 0x4a, 0x87, 0x00, 0xaa, 0xf8, 0x8b, 0x00, 0xa6, 0x18, + 0x46, 0x69, 0x75, 0xc2, 0x18, 0x4e, 0x83, 0x00, 0xa4, 0x8a, 0x02, 0x18, + 0xa6, 0x91, 0x00, 0xa4, 0xc3, 0x02, 0x18, 0xaa, 0x8b, 0x00, 0xa4, 0xa3, + 0x02, 0x18, 0xae, 0x87, 0x00, 0xa4, 0x91, 0x83, 0x00, 0xa4, 0xe2, 0x02, + 0x18, 0xb2, 0x91, 0x00, 0xa4, 0x70, 0x8b, 0x00, 0xa4, 0x60, 0x94, 0x00, + 0xc7, 0xa1, 0x8e, 0x00, 0xc7, 0x98, 0x99, 0x00, 0xb3, 0xfb, 0x02, 0x18, + 0xb6, 0x0d, 0xc2, 0x18, 0xc6, 0x10, 0xc2, 0x18, 0xd6, 0x83, 0x00, 0xad, + 0x99, 0x91, 0x00, 0xad, 0x91, 0x8b, 0x00, 0xad, 0x89, 0x87, 0x00, 0xad, + 0x81, 0x95, 0x00, 0xa8, 0x40, 0x91, 0x00, 0xac, 0x43, 0x02, 0x18, 0xe6, + 0xc2, 0x00, 0x28, 0x00, 0xc7, 0x41, 0x83, 0x00, 0xac, 0x49, 0x8b, 0x00, + 0xac, 0x39, 0x87, 0x00, 0xac, 0x30, 0x8a, 0x00, 0xab, 0x7b, 0x02, 0x18, + 0xea, 0x87, 0x00, 0xa3, 0x39, 0x8b, 0x00, 0xa3, 0x41, 0x91, 0x00, 0xa3, + 0x49, 0x83, 0x00, 0xa3, 0x50, 0x19, 0xc2, 0x19, 0x06, 0xc8, 0xbc, 0x52, + 0x00, 0xad, 0x73, 0x02, 0x19, 0x11, 0x83, 0x00, 0xab, 0x33, 0x02, 0x19, + 0x2a, 0x91, 0x00, 0xab, 0x23, 0x02, 0x19, 0x2e, 0x8b, 0x00, 0xab, 0x03, + 0x02, 0x19, 0x32, 0x87, 0x00, 0xaa, 0xf0, 0x9b, 0x00, 0xc5, 0xb9, 0x83, + 0x00, 0xa2, 0xb2, 0x02, 0x19, 0x36, 0x83, 0x00, 0xab, 0x99, 0x91, 0x00, + 0xab, 0x91, 0x8b, 0x00, 0xab, 0x89, 0x87, 0x00, 0xab, 0x80, 0x91, 0x00, + 0xa2, 0xeb, 0x02, 0x19, 0x3a, 0x8b, 0x00, 0xa2, 0xcb, 0x02, 0x19, 0x3e, + 0x87, 0x00, 0xa2, 0xb9, 0x83, 0x00, 0xa3, 0x0a, 0x02, 0x19, 0x42, 0x91, + 0x00, 0xa2, 0x88, 0x8b, 0x00, 0xa2, 0x78, 0x42, 0x00, 0x15, 0x42, 0x19, + 0x46, 0x9b, 0x00, 0xc5, 0x99, 0x83, 0x00, 0xa0, 0xc8, 0x91, 0x00, 0xa0, + 0xa2, 0x02, 0x19, 0x52, 0x8b, 0x00, 0xa0, 0x80, 0xc2, 0x00, 0x28, 0x00, + 0xc7, 0x01, 0x87, 0x00, 0xaa, 0x18, 0x83, 0x00, 0xc6, 0x9b, 0x02, 0x19, + 0x58, 0x91, 0x00, 0xc6, 0x8b, 0x02, 0x19, 0x5c, 0x8b, 0x00, 0xc6, 0x7b, + 0x02, 0x19, 0x60, 0xc2, 0x02, 0xe0, 0x00, 0xc6, 0x70, 0x9b, 0x00, 0xc6, + 0x29, 0x83, 0x00, 0xaa, 0x62, 0x02, 0x19, 0x64, 0x91, 0x00, 0xaa, 0x48, + 0x8b, 0x00, 0xaa, 0x38, 0x44, 0x10, 0x6a, 0xc2, 0x19, 0x68, 0x8b, 0x00, + 0xaa, 0xb0, 0x83, 0x00, 0xaa, 0xd2, 0x02, 0x19, 0x9a, 0x91, 0x00, 0xaa, + 0xc0, 0x95, 0x00, 0xc6, 0xd3, 0x02, 0x19, 0x9e, 0x90, 0x00, 0xc6, 0xcb, + 0x02, 0x19, 0xa2, 0x8f, 0x00, 0xc6, 0xc1, 0x85, 0x00, 0xc6, 0xb9, 0x8d, + 0x00, 0xc6, 0xb1, 0x96, 0x00, 0xc6, 0xa9, 0x92, 0x00, 0xc6, 0xa0, 0x9b, + 0x00, 0xc6, 0x21, 0x83, 0x00, 0xa9, 0x72, 0x02, 0x19, 0xa6, 0x9b, 0x00, + 0xc6, 0x19, 0x91, 0x00, 0xa9, 0x58, 0x83, 0x00, 0xa9, 0xcb, 0x02, 0x19, + 0xaa, 0x91, 0x00, 0xa9, 0xab, 0x02, 0x19, 0xae, 0x8b, 0x00, 0xa9, 0x8b, + 0x02, 0x19, 0xb2, 0x87, 0x00, 0xa9, 0x78, 0xc3, 0x4d, 0xc4, 0x00, 0xa9, + 0x61, 0xc3, 0x2b, 0xd4, 0x00, 0xa2, 0x91, 0x12, 0xc2, 0x19, 0xb6, 0xc3, + 0x90, 0xd8, 0x00, 0xa4, 0x79, 0xc2, 0x01, 0x24, 0x00, 0xa0, 0x39, 0x99, + 0x00, 0xa0, 0xe9, 0xc3, 0x15, 0xdb, 0x00, 0xa5, 0x49, 0xc3, 0x11, 0xf1, + 0x00, 0xa6, 0x31, 0xc3, 0x15, 0x31, 0x00, 0xa6, 0xc9, 0xc3, 0x19, 0xe1, + 0x00, 0xa7, 0x99, 0xc3, 0xd5, 0x5e, 0x00, 0xa3, 0x88, 0x8b, 0x00, 0xa9, + 0x48, 0x9b, 0x00, 0xc5, 0xe9, 0x83, 0x00, 0xa6, 0xda, 0x02, 0x19, 0xc2, + 0x83, 0x00, 0xad, 0x23, 0x02, 0x19, 0xc6, 0x91, 0x00, 0xad, 0x13, 0x02, + 0x19, 0xca, 
+    [large machine-generated hex byte table added by the upstream llvm-project import; the diff's added lines were collapsed together during extraction, and the raw byte values are elided here as they carry no reviewable content]
0xb8, 0x11, 0xc2, 0x35, 0xce, 0x83, 0x01, 0x9d, 0x4b, 0x02, + 0x35, 0xe0, 0x0b, 0xc2, 0x35, 0xea, 0x07, 0xc2, 0x35, 0xf4, 0x8a, 0x01, + 0x9e, 0xb9, 0x8f, 0x01, 0x9e, 0xc1, 0xc2, 0x4c, 0x90, 0x01, 0x9e, 0xc9, + 0x94, 0x01, 0x9e, 0xd1, 0x85, 0x01, 0x9b, 0xb1, 0x88, 0x01, 0x9c, 0x51, + 0x95, 0x01, 0x9d, 0x81, 0x98, 0x01, 0x9d, 0xa1, 0x99, 0x01, 0x9d, 0xd0, + 0x14, 0xc2, 0x36, 0x04, 0x98, 0x01, 0x96, 0x71, 0xc7, 0xc5, 0x6e, 0x01, + 0x98, 0x39, 0xc4, 0x90, 0x43, 0x01, 0x98, 0x40, 0xc5, 0xd4, 0x07, 0x01, + 0x98, 0x01, 0xc5, 0xdc, 0xa4, 0x01, 0x98, 0x09, 0xc4, 0xe4, 0x4f, 0x01, + 0x98, 0x11, 0xc3, 0x3d, 0x51, 0x01, 0x98, 0x19, 0x97, 0x01, 0x9b, 0x99, + 0x8f, 0x01, 0x9e, 0x11, 0xc7, 0x23, 0x58, 0x01, 0x9e, 0xf8, 0x83, 0x01, + 0x9c, 0x23, 0x02, 0x36, 0x0e, 0xc5, 0xd9, 0x0c, 0x01, 0x98, 0x91, 0xc3, + 0x1a, 0x05, 0x01, 0x98, 0xa3, 0x02, 0x36, 0x18, 0x42, 0x00, 0x33, 0xc2, + 0x36, 0x2a, 0xc4, 0x2b, 0x09, 0x01, 0x98, 0xe1, 0x11, 0xc2, 0x36, 0x36, + 0x89, 0x01, 0x9c, 0x79, 0x8d, 0x01, 0x9e, 0x69, 0x8f, 0x01, 0x9c, 0xf3, + 0x02, 0x36, 0x42, 0x96, 0x01, 0x9e, 0x79, 0x84, 0x01, 0x9c, 0x29, 0xc3, + 0x00, 0x64, 0x01, 0x9c, 0x49, 0xc2, 0xd4, 0x88, 0x01, 0x9c, 0x89, 0x8e, + 0x01, 0x9c, 0xc1, 0xc2, 0x00, 0xb0, 0x01, 0x9d, 0x51, 0x98, 0x01, 0x9d, + 0xc1, 0x99, 0x01, 0x9d, 0xf1, 0xc4, 0xe3, 0xb3, 0x01, 0x9e, 0x00, 0x03, + 0xc2, 0x36, 0x46, 0x0b, 0xc2, 0x36, 0x56, 0xc5, 0xd2, 0x2a, 0x01, 0x98, + 0xc3, 0x02, 0x36, 0x68, 0x9b, 0x01, 0x9e, 0x49, 0x84, 0x01, 0x9c, 0x39, + 0xc2, 0xd4, 0x88, 0x01, 0x9c, 0x99, 0xc2, 0x00, 0xb0, 0x01, 0x9d, 0x60, + 0x03, 0xc2, 0x36, 0x6e, 0xc6, 0xd3, 0x31, 0x01, 0x99, 0x09, 0x43, 0x00, + 0xc4, 0xc2, 0x36, 0x7a, 0x94, 0x01, 0x9e, 0xd9, 0x98, 0x01, 0x9e, 0xe0, + 0x83, 0x01, 0x9c, 0x0b, 0x02, 0x36, 0x82, 0xc4, 0x07, 0x9b, 0x01, 0x99, + 0x49, 0x88, 0x01, 0x9c, 0x59, 0x8f, 0x01, 0x9c, 0xd1, 0x95, 0x01, 0x9d, + 0x89, 0x98, 0x01, 0x9d, 0xa9, 0x99, 0x01, 0x9d, 0xd8, 0x03, 0xc2, 0x36, + 0x88, 0xc3, 0xcd, 0xc8, 0x01, 0x99, 0x89, 0xc7, 0xc6, 0x86, 0x01, 0x99, + 0xa1, 0xc4, 0xe2, 0xdb, 0x01, 0x99, 0xe1, 0xc5, 0xde, 0x07, 0x01, 0x99, + 0xf1, 0x93, 0x01, 0x9e, 0x18, 0x83, 0x01, 0x9c, 0x1b, 0x02, 0x36, 0x92, + 0x0b, 0xc2, 0x36, 0xa8, 0x07, 0xc2, 0x36, 0xbb, 0x42, 0x03, 0x53, 0xc2, + 0x36, 0xca, 0x89, 0x01, 0x9c, 0x71, 0x00, 0xc2, 0x36, 0xea, 0x84, 0x01, + 0x9c, 0x33, 0x02, 0x36, 0xfa, 0xc2, 0x00, 0x95, 0x01, 0x9e, 0xb1, 0xc2, + 0xd4, 0x88, 0x01, 0x9c, 0x91, 0x8e, 0x01, 0x9c, 0xb1, 0x8f, 0x01, 0x9c, + 0xe3, 0x02, 0x37, 0x00, 0xc2, 0x00, 0xb0, 0x01, 0x9d, 0x59, 0x95, 0x01, + 0x9d, 0x99, 0x98, 0x01, 0x9d, 0xbb, 0x02, 0x37, 0x04, 0x99, 0x01, 0x9d, + 0xea, 0x02, 0x37, 0x0a, 0x42, 0x04, 0xc6, 0xc2, 0x37, 0x10, 0xc3, 0x93, + 0x9b, 0x01, 0x9a, 0x80, 0x11, 0xc2, 0x37, 0x1c, 0x45, 0x0b, 0x12, 0x42, + 0x37, 0x28, 0xc6, 0x13, 0x52, 0x01, 0x36, 0xe9, 0xc2, 0x00, 0xa6, 0x0f, + 0x8d, 0x51, 0xc6, 0xd2, 0x1d, 0x0f, 0x8d, 0x19, 0x07, 0xc2, 0x37, 0x34, + 0xc2, 0x07, 0xa3, 0x0f, 0x8c, 0xc1, 0xc5, 0x0b, 0x0a, 0x01, 0x4e, 0x41, + 0xcb, 0x12, 0x2e, 0x01, 0x4e, 0x39, 0x86, 0x0f, 0x8a, 0x61, 0x95, 0x0f, + 0x8a, 0x68, 0xc2, 0x17, 0x28, 0x01, 0x35, 0xf9, 0x48, 0xbc, 0xc2, 0x42, + 0x37, 0x40, 0xc4, 0x03, 0x4e, 0x01, 0x15, 0x01, 0x19, 0xc2, 0x37, 0x52, + 0xc6, 0x02, 0xde, 0x0f, 0x8c, 0xd8, 0xc4, 0x1d, 0x1e, 0x01, 0x14, 0xf9, + 0x98, 0x0f, 0x8a, 0x58, 0xc3, 0x25, 0xd6, 0x01, 0x14, 0xf1, 0xc2, 0x52, + 0xdc, 0x0f, 0x8a, 0x70, 0x55, 0x30, 0x23, 0xc2, 0x37, 0x5e, 0xc3, 0x8d, + 0x08, 0x0f, 0x8c, 0x91, 0x8e, 0x0f, 0x8c, 0x88, 0xc2, 0x00, 0x6c, 0x0f, + 0x8d, 0x61, 0x95, 0x0f, 0x8c, 0xd0, 0xc2, 0x7e, 0x61, 0x0f, 0x8d, 0x59, + 0xd7, 0x28, 
0xcd, 0x0f, 0x8c, 0xc8, 0xc5, 0xd8, 0x62, 0x0f, 0x8d, 0x41, + 0xc2, 0x02, 0xbc, 0x0f, 0x8d, 0x39, 0x98, 0x0f, 0x8a, 0x51, 0x85, 0x0f, + 0x8d, 0x30, 0xd3, 0x40, 0x54, 0x0f, 0x8d, 0x21, 0x8d, 0x0f, 0x8c, 0xb8, + 0xcd, 0x77, 0x60, 0x0f, 0x8d, 0x01, 0x44, 0x09, 0x9e, 0xc2, 0x37, 0x78, + 0xc3, 0x02, 0xdf, 0x0f, 0x8c, 0x99, 0xd5, 0x35, 0x0c, 0x01, 0x4e, 0x28, + 0x89, 0x0f, 0x8c, 0xb1, 0xc2, 0x04, 0xe6, 0x0f, 0x8c, 0xa8, 0xc9, 0x2a, + 0xec, 0x01, 0x21, 0x30, 0xc2, 0x00, 0x74, 0x01, 0x20, 0x79, 0xc3, 0x00, + 0xa3, 0x01, 0x20, 0x70, 0xc4, 0x27, 0xe3, 0x01, 0x20, 0x11, 0xc7, 0xc3, + 0x92, 0x01, 0x20, 0x08, 0xc4, 0x6e, 0x67, 0x01, 0x21, 0x0b, 0x02, 0x37, + 0x82, 0x4d, 0x7e, 0xd7, 0x42, 0x37, 0x88, 0xc5, 0xd5, 0xc4, 0x01, 0x21, + 0x21, 0xd2, 0x49, 0x43, 0x01, 0x20, 0xa8, 0x45, 0x0a, 0x11, 0xc2, 0x37, + 0x98, 0xc5, 0xd6, 0x0f, 0x01, 0x20, 0x28, 0x49, 0xb3, 0xd4, 0xc2, 0x37, + 0xa2, 0xc2, 0x03, 0x4e, 0x00, 0x39, 0x08, 0x46, 0xcd, 0xbb, 0x42, 0x37, + 0xca, 0xc2, 0x39, 0x8b, 0x00, 0x39, 0x61, 0xc3, 0x1e, 0x1b, 0x00, 0x38, + 0xda, 0x02, 0x37, 0xdc, 0xc3, 0x11, 0xef, 0x00, 0x39, 0x59, 0xc4, 0x77, + 0x78, 0x00, 0x39, 0x41, 0xc6, 0x7e, 0x1b, 0x00, 0x39, 0x19, 0xd0, 0x58, + 0x42, 0x00, 0x38, 0x89, 0x47, 0xc9, 0x57, 0x42, 0x37, 0xe2, 0xc3, 0x04, + 0x5a, 0x00, 0x39, 0x51, 0xca, 0x9d, 0xc4, 0x00, 0x39, 0x38, 0xc3, 0x11, + 0x38, 0x00, 0x38, 0xf0, 0xc2, 0x00, 0x8e, 0x00, 0x38, 0xd0, 0xd2, 0x49, + 0x79, 0x00, 0x38, 0xb1, 0xc5, 0x49, 0x81, 0x00, 0x38, 0xa8, 0xc9, 0xad, + 0xa4, 0x00, 0x38, 0xa0, 0x00, 0xc2, 0x37, 0xf4, 0xcd, 0x75, 0x7f, 0x00, + 0x39, 0xe0, 0xca, 0xa1, 0x02, 0x00, 0x38, 0x69, 0xc9, 0xaa, 0x71, 0x00, + 0x38, 0x61, 0xc6, 0xaa, 0x74, 0x00, 0x38, 0x58, 0xc5, 0x05, 0x02, 0x00, + 0x39, 0xb9, 0xc5, 0x00, 0xd4, 0x00, 0x39, 0xb0, 0xc5, 0x00, 0x2c, 0x00, + 0x38, 0x39, 0xc4, 0x00, 0x49, 0x00, 0x38, 0x30, 0xc5, 0x33, 0x24, 0x00, + 0x38, 0x23, 0x02, 0x38, 0x00, 0xc9, 0x11, 0xf6, 0x00, 0x38, 0x10, 0xc5, + 0x33, 0x24, 0x00, 0x38, 0x1b, 0x02, 0x38, 0x06, 0xc9, 0x11, 0xf6, 0x00, + 0x38, 0x08, 0xc5, 0x00, 0xd4, 0x00, 0x39, 0xe9, 0xc5, 0x05, 0x02, 0x00, + 0x39, 0xf0, 0xc5, 0x00, 0xd4, 0x00, 0x3a, 0x19, 0xc5, 0x05, 0x02, 0x00, + 0x3a, 0x20, 0xc5, 0x00, 0xd4, 0x00, 0x3a, 0x29, 0xc5, 0x05, 0x02, 0x00, + 0x3a, 0x30, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x89, 0x91, 0x05, 0x40, 0x80, + 0x91, 0x05, 0x40, 0x91, 0xc2, 0x01, 0x23, 0x05, 0x40, 0x98, 0xd1, 0x52, + 0xff, 0x0f, 0xa8, 0x51, 0xce, 0x6f, 0x1c, 0x0f, 0xa8, 0x49, 0xd3, 0x23, + 0xc8, 0x0f, 0xa8, 0x38, 0x00, 0x42, 0x38, 0x0c, 0xcf, 0x09, 0xf8, 0x01, + 0x4b, 0xd9, 0x42, 0x06, 0x62, 0x42, 0x38, 0x21, 0xc3, 0x02, 0xa3, 0x01, + 0x55, 0xf1, 0xcf, 0x60, 0xf3, 0x01, 0x56, 0x01, 0xd9, 0x1f, 0x18, 0x01, + 0x56, 0x10, 0xc6, 0x0e, 0xa4, 0x01, 0x56, 0xb9, 0xde, 0x0e, 0x8c, 0x01, + 0x56, 0xc0, 0x52, 0x47, 0xdb, 0xc2, 0x38, 0x2d, 0xcf, 0x1d, 0xed, 0x01, + 0x03, 0xe8, 0xca, 0x0e, 0xbe, 0x01, 0x03, 0xe1, 0xc4, 0x00, 0x2d, 0x01, + 0x03, 0xc0, 0xc4, 0x18, 0x10, 0x01, 0x03, 0xb9, 0xc2, 0x22, 0xcc, 0x01, + 0x03, 0xb0, 0xc3, 0x0d, 0x14, 0x01, 0x03, 0xa9, 0xc3, 0x09, 0x9e, 0x01, + 0x03, 0xa0, 0xc2, 0x02, 0xa0, 0x00, 0x05, 0x91, 0xc4, 0x02, 0xde, 0x00, + 0x05, 0x98, 0xc6, 0xca, 0xf7, 0x00, 0xe6, 0x11, 0xc7, 0xc6, 0x2b, 0x00, + 0xe6, 0x08, 0x45, 0x21, 0xed, 0xc2, 0x38, 0x35, 0x83, 0x00, 0xdc, 0xb0, + 0xc2, 0x00, 0xd0, 0x00, 0xdd, 0xe9, 0x83, 0x00, 0xdc, 0xc0, 0xc2, 0x2c, + 0x43, 0x00, 0xdd, 0xe1, 0x83, 0x00, 0xdc, 0xe0, 0xc2, 0x2c, 0x43, 0x00, + 0xdd, 0xd9, 0x83, 0x00, 0xdc, 0xd8, 0xc2, 0x19, 0x2c, 0x00, 0xdd, 0x79, + 0x83, 0x00, 0xdc, 0xf0, 0xc2, 0x00, 0xd0, 0x00, 0xdd, 0x71, 0x83, 0x00, + 0xdc, 0x50, 
0x83, 0x00, 0xdc, 0xa1, 0xc2, 0x19, 0x2c, 0x00, 0xdc, 0x89, + 0xc2, 0x01, 0x30, 0x00, 0xdc, 0x68, 0x97, 0x00, 0xdc, 0x48, 0x87, 0x00, + 0xdc, 0x30, 0xc4, 0x18, 0x10, 0x00, 0xdd, 0xb9, 0xc2, 0x22, 0xcc, 0x00, + 0xdd, 0xb0, 0xc3, 0x0d, 0x14, 0x00, 0xdd, 0xa9, 0xc3, 0x09, 0x9e, 0x00, + 0xdd, 0xa0, 0xc4, 0x02, 0xde, 0x00, 0xdd, 0x99, 0xc2, 0x02, 0xa0, 0x00, + 0xdd, 0x90, 0xc2, 0x01, 0x4a, 0x00, 0xdd, 0x69, 0xc2, 0x01, 0xc3, 0x00, + 0xdd, 0x60, 0xc3, 0xd7, 0xd6, 0x00, 0xdd, 0x19, 0xc4, 0x89, 0x32, 0x00, + 0xdd, 0x10, 0xc5, 0xdb, 0xc3, 0x00, 0xdd, 0x51, 0x10, 0x42, 0x38, 0x3d, + 0xc7, 0xc6, 0x08, 0x00, 0xdd, 0x49, 0xc5, 0x0d, 0xe4, 0x00, 0xdd, 0x39, + 0xc7, 0xc3, 0xbc, 0x00, 0xdd, 0x31, 0xc4, 0xde, 0xff, 0x00, 0xdd, 0x29, + 0xc5, 0xd8, 0x9e, 0x00, 0xdd, 0x20, 0xcb, 0x0e, 0xbd, 0x01, 0x55, 0x81, + 0xcc, 0x24, 0x47, 0x01, 0x55, 0x90, 0xc8, 0x07, 0x5f, 0x01, 0x55, 0xb1, + 0xcf, 0x6a, 0x8f, 0x01, 0x55, 0xd0, 0xd1, 0x55, 0x52, 0x01, 0x14, 0x51, + 0xcb, 0x23, 0xa0, 0x01, 0x14, 0x33, 0x02, 0x38, 0x47, 0x46, 0x00, 0xd4, + 0x42, 0x38, 0x4d, 0xc6, 0x2d, 0xd0, 0x01, 0x56, 0x99, 0xc4, 0x0e, 0xa6, + 0x01, 0x56, 0xa8, 0xca, 0x22, 0x09, 0x0f, 0xb0, 0x1b, 0x02, 0x38, 0x65, + 0x0a, 0xc2, 0x38, 0x6b, 0x15, 0xc2, 0x38, 0x7d, 0xc4, 0x21, 0x23, 0x0f, + 0xcb, 0x90, 0xca, 0x22, 0x09, 0x0f, 0xb1, 0xd1, 0xd1, 0x55, 0x0e, 0x0f, + 0xb1, 0xd8, 0x47, 0xc2, 0x11, 0xc2, 0x38, 0x8c, 0x42, 0x0a, 0x8c, 0xc2, + 0x38, 0x98, 0xc3, 0x0d, 0xe5, 0x07, 0xf2, 0xa8, 0xc9, 0x81, 0x9c, 0x01, + 0x10, 0x53, 0x02, 0x38, 0xa2, 0xcf, 0x0f, 0x0a, 0x07, 0xf2, 0xb9, 0xc6, + 0xbc, 0x34, 0x07, 0xf2, 0xc1, 0xca, 0x0e, 0xbe, 0x07, 0xf3, 0x30, 0x4d, + 0x78, 0x7e, 0xc2, 0x38, 0xa8, 0x45, 0x00, 0x2d, 0xc2, 0x38, 0xc7, 0xce, + 0x61, 0xd5, 0x07, 0xf3, 0x40, 0xe0, 0x05, 0x07, 0x08, 0x59, 0xd9, 0xc4, + 0x1e, 0xc9, 0x00, 0x16, 0xe0, 0xc7, 0x2e, 0x21, 0x0f, 0xb7, 0x49, 0xc8, + 0x36, 0x21, 0x07, 0xf3, 0x01, 0xc7, 0x0b, 0x00, 0x07, 0xf3, 0x08, 0x43, + 0x00, 0x4b, 0xc2, 0x38, 0xd9, 0xcc, 0x8b, 0x11, 0x07, 0xf3, 0x20, 0xc8, + 0x60, 0xf4, 0x07, 0xf3, 0x11, 0xcb, 0x8e, 0x13, 0x07, 0xf3, 0x50, 0x9f, + 0x00, 0x04, 0x91, 0x9e, 0x00, 0x04, 0x88, 0xc3, 0x02, 0x9f, 0x00, 0x04, + 0x91, 0xc3, 0x05, 0x14, 0x00, 0x04, 0x88, 0xc5, 0xd7, 0xa4, 0x0f, 0xad, + 0xb0, 0xca, 0x37, 0x4e, 0x01, 0x13, 0xf1, 0xc5, 0x07, 0x62, 0x01, 0x13, + 0xe0, 0x4c, 0x24, 0x3b, 0xc2, 0x38, 0xeb, 0xcb, 0x0e, 0xbd, 0x01, 0x55, + 0x99, 0x44, 0x1f, 0xb2, 0xc2, 0x38, 0xf7, 0xcf, 0x6a, 0x8f, 0x01, 0x55, + 0xb8, 0xc3, 0x0d, 0xe5, 0x07, 0xf0, 0x99, 0xc3, 0x0a, 0x8c, 0x07, 0xf0, + 0x80, 0xcf, 0x0f, 0x0a, 0x07, 0xf0, 0xa9, 0xc6, 0xbc, 0x34, 0x07, 0xf1, + 0x89, 0xc6, 0xcb, 0x5d, 0x07, 0xf1, 0x90, 0x44, 0x00, 0x4a, 0xc2, 0x39, + 0x03, 0xc7, 0x80, 0x2f, 0x07, 0xf1, 0x98, 0xcb, 0x1a, 0x50, 0x07, 0xf1, + 0xb1, 0x05, 0xc2, 0x39, 0x31, 0xd6, 0x08, 0x88, 0x07, 0xf1, 0xd1, 0xd8, + 0x21, 0x83, 0x07, 0xf1, 0xe1, 0xd4, 0x38, 0xf4, 0x07, 0xf1, 0xf1, 0xce, + 0x25, 0xad, 0x07, 0xf2, 0x41, 0x46, 0x01, 0xfc, 0xc2, 0x39, 0x3d, 0xcd, + 0x0b, 0x91, 0x07, 0xf2, 0x00, 0xc5, 0x0a, 0x8a, 0x07, 0xf0, 0x89, 0xc9, + 0x11, 0xf6, 0x07, 0xf0, 0x90, 0xc3, 0x00, 0x3a, 0x0f, 0x85, 0x01, 0xca, + 0xa6, 0x98, 0x0f, 0x86, 0x78, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x09, 0xc6, + 0x78, 0x78, 0x0f, 0x85, 0x89, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x09, 0xc5, + 0xdd, 0x49, 0x0f, 0x86, 0x88, 0x46, 0xd2, 0xe9, 0xc2, 0x39, 0x49, 0x48, + 0xbe, 0x4a, 0xc2, 0x39, 0x61, 0x46, 0xa8, 0xfa, 0xc2, 0x39, 0x79, 0x45, + 0xdc, 0xf9, 0x42, 0x39, 0x91, 0x11, 0xc2, 0x39, 0xbb, 0x47, 0xc7, 0x2e, + 0x42, 0x39, 0xc7, 0x46, 0xd1, 0xf9, 0xc2, 0x39, 0xdf, 0x48, 0xb5, 0x32, + 0x42, 0x39, 
0xf7, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x41, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xc1, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x41, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xc0, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x49, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xc9, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x49, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xc8, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x59, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xd9, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x59, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xd8, 0x49, 0xae, 0x46, 0xc2, 0x3a, 0x0f, 0x47, 0x35, 0xce, + 0x42, 0x3a, 0x27, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x69, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xe9, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x69, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xe8, 0xc2, 0x02, 0xa0, 0x01, 0x5e, 0x99, 0xc4, 0x02, 0xde, + 0x01, 0x5e, 0xa0, 0xc3, 0x09, 0x9e, 0x01, 0x5e, 0xa9, 0xc3, 0x0d, 0x14, + 0x01, 0x5e, 0xb0, 0x43, 0x03, 0x35, 0xc2, 0x3a, 0x3f, 0x45, 0x00, 0x8c, + 0xc2, 0x3a, 0x51, 0xd1, 0x0e, 0xb7, 0x01, 0x53, 0x90, 0xcb, 0x90, 0xe9, + 0x0f, 0xae, 0xf9, 0xc3, 0x00, 0x33, 0x0f, 0xa6, 0x18, 0x45, 0x02, 0x6d, + 0xc2, 0x3a, 0x6d, 0xcc, 0x43, 0x07, 0x01, 0x10, 0x10, 0x9c, 0x01, 0x25, + 0xa9, 0x9b, 0x01, 0x25, 0xa1, 0x9a, 0x01, 0x25, 0x99, 0x99, 0x01, 0x25, + 0x91, 0x98, 0x01, 0x25, 0x89, 0x97, 0x01, 0x25, 0x81, 0x96, 0x01, 0x25, + 0x79, 0x95, 0x01, 0x25, 0x71, 0x94, 0x01, 0x25, 0x69, 0x93, 0x01, 0x25, + 0x61, 0x92, 0x01, 0x25, 0x59, 0x91, 0x01, 0x25, 0x51, 0x90, 0x01, 0x25, + 0x49, 0x8f, 0x01, 0x25, 0x41, 0x8e, 0x01, 0x25, 0x39, 0x8d, 0x01, 0x25, + 0x31, 0x8c, 0x01, 0x25, 0x29, 0x8b, 0x01, 0x25, 0x21, 0x8a, 0x01, 0x25, + 0x19, 0x89, 0x01, 0x25, 0x11, 0x88, 0x01, 0x25, 0x09, 0x87, 0x01, 0x25, + 0x01, 0x86, 0x01, 0x24, 0xf9, 0x85, 0x01, 0x24, 0xf1, 0x84, 0x01, 0x24, + 0xe9, 0x83, 0x01, 0x24, 0xe0, 0x99, 0x0f, 0x89, 0x31, 0x9a, 0x0f, 0x89, + 0x39, 0x9b, 0x0f, 0x89, 0x41, 0x9c, 0x0f, 0x89, 0x49, 0x83, 0x0f, 0x88, + 0x81, 0x84, 0x0f, 0x88, 0x89, 0x85, 0x0f, 0x88, 0x91, 0x86, 0x0f, 0x88, + 0x99, 0x87, 0x0f, 0x88, 0xa1, 0x88, 0x0f, 0x88, 0xa9, 0x89, 0x0f, 0x88, + 0xb1, 0x8a, 0x0f, 0x88, 0xb9, 0x8b, 0x0f, 0x88, 0xc1, 0x8c, 0x0f, 0x88, + 0xc9, 0x8d, 0x0f, 0x88, 0xd1, 0x8e, 0x0f, 0x88, 0xd9, 0x8f, 0x0f, 0x88, + 0xe1, 0x90, 0x0f, 0x88, 0xe9, 0x91, 0x0f, 0x88, 0xf1, 0x92, 0x0f, 0x88, + 0xf9, 0x93, 0x0f, 0x89, 0x01, 0x94, 0x0f, 0x89, 0x09, 0x95, 0x0f, 0x89, + 0x11, 0x96, 0x0f, 0x89, 0x19, 0x97, 0x0f, 0x89, 0x21, 0x98, 0x0f, 0x89, + 0x28, 0x42, 0x00, 0x28, 0xc2, 0x3a, 0x85, 0xc7, 0x52, 0xcc, 0x01, 0x24, + 0x01, 0xc2, 0x00, 0xc4, 0x01, 0x23, 0xe8, 0xc7, 0x1f, 0x6e, 0x01, 0x24, + 0x29, 0xc5, 0x66, 0xb1, 0x01, 0x23, 0xf0, 0xc8, 0x48, 0x23, 0x01, 0x24, + 0x21, 0xc6, 0x44, 0x9c, 0x01, 0x24, 0x18, 0xc6, 0x14, 0x07, 0x01, 0x24, + 0x11, 0xc7, 0x34, 0x37, 0x01, 0x24, 0x08, 0xc4, 0x18, 0x10, 0x01, 0x23, + 0xd1, 0xc2, 0x22, 0xcc, 0x01, 0x23, 0xc8, 0xc3, 0x0d, 0x14, 0x01, 0x23, + 0xc1, 0xc3, 0x09, 0x9e, 0x01, 0x23, 0xb8, 0xc4, 0x02, 0xde, 0x01, 0x23, + 0xb1, 0xc2, 0x02, 0xa0, 0x01, 0x23, 0xa8, 0xc5, 0x8e, 0xdf, 0x01, 0x90, + 0x03, 0x02, 0x3a, 0x91, 0xc6, 0xbb, 0xec, 0x01, 0x90, 0x52, 0x02, 0x3a, + 0x97, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0x78, 0xc5, 0xc0, 0x7d, 0x01, 0x90, + 0x13, 0x02, 0x3a, 0x9d, 0xc6, 0xc1, 0x86, 0x01, 0x90, 0x5a, 0x02, 0x3a, + 0xa3, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0x88, 0xc2, 0x00, 0xd3, 0x01, 0x90, + 0x90, 0xc4, 0x79, 0xf3, 0x01, 0x90, 0x2b, 0x02, 0x3a, 0xa9, 0xc6, 0xba, + 0x7c, 0x01, 0x90, 0x62, 0x02, 0x3a, 0xaf, 0xc2, 0x00, 0xd3, 0x01, 0x90, + 0xa0, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0xa8, 0xc4, 0xc6, 0x7a, 0x01, 0x90, + 0x43, 0x02, 0x3a, 0xb5, 0xc6, 0xc6, 0x79, 0x01, 0x90, 0x4a, 0x02, 0x3a, + 0xb9, 0xc2, 
0x00, 0xd3, 0x01, 0x90, 0xd8, 0xc2, 0x02, 0xa0, 0x01, 0x91, + 0x09, 0xc4, 0x02, 0xde, 0x01, 0x91, 0x11, 0xc2, 0x00, 0xc4, 0x01, 0x91, + 0x48, 0xc3, 0x09, 0x9e, 0x01, 0x91, 0x19, 0x0b, 0xc2, 0x3a, 0xbf, 0xc7, + 0xc8, 0x9a, 0x01, 0x92, 0x00, 0xc2, 0x22, 0xcc, 0x01, 0x91, 0x29, 0x07, + 0xc2, 0x3a, 0xd1, 0x17, 0xc2, 0x3a, 0xdd, 0x16, 0xc2, 0x3a, 0xe7, 0xc6, + 0xcc, 0xbf, 0x01, 0x91, 0x99, 0xc6, 0xca, 0xe5, 0x01, 0x91, 0xa8, 0xc4, + 0x00, 0x2d, 0x01, 0x91, 0x39, 0xc4, 0x61, 0xc1, 0x01, 0x91, 0x79, 0xc9, + 0xaf, 0x8a, 0x01, 0x91, 0xe8, 0xc3, 0x02, 0x6e, 0x01, 0x91, 0x41, 0xc3, + 0x00, 0xc2, 0x01, 0x91, 0xa0, 0xc3, 0x01, 0x54, 0x01, 0x91, 0x51, 0xc4, + 0x04, 0x87, 0x01, 0x91, 0x70, 0xc4, 0x03, 0xd7, 0x01, 0x91, 0x61, 0xc3, + 0x29, 0x82, 0x01, 0x91, 0x68, 0xcd, 0x7b, 0x3c, 0x01, 0x91, 0xb9, 0xc3, + 0x03, 0x15, 0x01, 0x91, 0xd0, 0xc7, 0x75, 0x78, 0x01, 0x91, 0xc9, 0x15, + 0xc2, 0x3a, 0xf3, 0xc3, 0x29, 0x43, 0x01, 0x92, 0x18, 0xd1, 0x01, 0x68, + 0x01, 0x57, 0x91, 0xce, 0x33, 0x92, 0x01, 0x57, 0x98, 0xc5, 0x26, 0xf7, + 0x08, 0xd7, 0xb9, 0xc4, 0x0d, 0xe5, 0x08, 0xd7, 0x9a, 0x02, 0x3a, 0xfd, + 0x45, 0x21, 0xed, 0xc2, 0x3b, 0x03, 0x83, 0x08, 0xd6, 0x98, 0x83, 0x08, + 0xd6, 0xd8, 0x83, 0x08, 0xd6, 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0xd6, 0xa1, + 0x83, 0x08, 0xd6, 0x68, 0xc2, 0x00, 0xd0, 0x08, 0xd6, 0x89, 0x83, 0x08, + 0xd6, 0x00, 0x83, 0x08, 0xd6, 0x81, 0xc2, 0x01, 0x30, 0x08, 0xd6, 0x38, + 0xc2, 0x00, 0xd0, 0x08, 0xd6, 0x79, 0x83, 0x08, 0xd6, 0x70, 0xc2, 0x00, + 0xd0, 0x08, 0xd6, 0x51, 0x83, 0x08, 0xd6, 0x08, 0xc5, 0x26, 0xf7, 0x08, + 0xd7, 0x71, 0xc4, 0x0d, 0xe5, 0x08, 0xd7, 0x5a, 0x02, 0x3b, 0x26, 0xc6, + 0x26, 0xf6, 0x08, 0xd7, 0x40, 0x16, 0xc2, 0x3b, 0x2c, 0x08, 0xc2, 0x3b, + 0x3c, 0xc3, 0x05, 0x14, 0x08, 0x43, 0xc8, 0xd3, 0x42, 0xb4, 0x08, 0x43, + 0xb9, 0x45, 0x02, 0x10, 0x42, 0x3b, 0x48, 0xc2, 0xbe, 0xd3, 0x0b, 0x5c, + 0x79, 0xc2, 0x19, 0x2d, 0x0b, 0x5c, 0x50, 0xc2, 0x24, 0x82, 0x0b, 0x5c, + 0x71, 0xc3, 0xa4, 0xa3, 0x0b, 0x5c, 0x41, 0xc2, 0x01, 0x24, 0x0b, 0x5c, + 0x10, 0x15, 0xc2, 0x3b, 0xb1, 0xc3, 0xe5, 0xcc, 0x0b, 0x5c, 0x28, 0xc2, + 0x19, 0x2d, 0x0b, 0x5c, 0x61, 0xc3, 0xe0, 0x95, 0x0b, 0x5b, 0xf0, 0x8f, + 0x0b, 0x5c, 0x49, 0xc2, 0xbe, 0xd3, 0x0b, 0x5c, 0x18, 0xc3, 0xe5, 0x33, + 0x0b, 0x5c, 0x01, 0xc2, 0x00, 0xfa, 0x0b, 0x5b, 0xf8, 0xc2, 0x20, 0xec, + 0x0b, 0x59, 0x79, 0xc3, 0xa6, 0x62, 0x0b, 0x59, 0x38, 0xc2, 0x20, 0xec, + 0x0b, 0x59, 0x61, 0x16, 0xc2, 0x3b, 0xc3, 0xc4, 0xe3, 0x17, 0x0b, 0x59, + 0x41, 0xc3, 0xdb, 0xb7, 0x0b, 0x59, 0x11, 0xc3, 0x20, 0xeb, 0x0b, 0x59, + 0x00, 0xc3, 0x57, 0x0c, 0x0b, 0x59, 0x49, 0xc3, 0x20, 0xeb, 0x0b, 0x59, + 0x29, 0xc2, 0x20, 0xec, 0x0b, 0x58, 0xf8, 0xc3, 0xe6, 0x53, 0x0b, 0x5b, + 0xa3, 0x02, 0x3b, 0xcf, 0xc7, 0xbf, 0xe1, 0x0b, 0x5a, 0x28, 0xca, 0xa7, + 0x6a, 0x0b, 0x5b, 0x99, 0xc4, 0x12, 0xc1, 0x0b, 0x59, 0xc8, 0xc5, 0xd5, + 0x51, 0x0b, 0x5b, 0x71, 0xc4, 0xdf, 0x2f, 0x0b, 0x5a, 0x08, 0xc2, 0x01, + 0x24, 0x0b, 0x5b, 0x21, 0x44, 0x19, 0x61, 0x42, 0x3b, 0xd5, 0x0a, 0xc2, + 0x3b, 0xed, 0xc9, 0xa8, 0x82, 0x0b, 0x59, 0xc0, 0x00, 0xc2, 0x3b, 0xf9, + 0x95, 0x0b, 0x5a, 0xd8, 0x98, 0x0b, 0x58, 0xd9, 0x84, 0x0b, 0x58, 0xd0, + 0x98, 0x0b, 0x58, 0x79, 0x84, 0x0b, 0x58, 0x70, 0x98, 0x0b, 0x58, 0x59, + 0x84, 0x0b, 0x58, 0x50, 0x98, 0x0b, 0x58, 0x29, 0x84, 0x0b, 0x58, 0x20, + 0x98, 0x0b, 0x58, 0xa9, 0x84, 0x0b, 0x58, 0xa0, 0x98, 0x0b, 0x58, 0x69, + 0x84, 0x0b, 0x58, 0x60, 0x98, 0x0b, 0x58, 0x89, 0x84, 0x0b, 0x58, 0x80, + 0x98, 0x0b, 0x58, 0x09, 0x84, 0x0b, 0x58, 0x00, 0xc5, 0x11, 0x55, 0x01, + 0x81, 0x00, 0x45, 0x00, 0x8c, 0xc2, 0x3c, 0x05, 0xc8, 0x7d, 0x5e, 0x0f, + 0xb2, 0x69, 
0x14, 0xc2, 0x3c, 0x21, 0xcd, 0x80, 0x5d, 0x0f, 0xb2, 0x39, + 0xcf, 0x63, 0x78, 0x0f, 0xc9, 0xc1, 0x43, 0x03, 0x35, 0xc2, 0x3c, 0x27, + 0xc8, 0xb5, 0xb2, 0x0f, 0xce, 0xb8, 0xc4, 0x02, 0xde, 0x08, 0x48, 0xd9, + 0x19, 0xc2, 0x3c, 0x33, 0xc2, 0x00, 0xc4, 0x08, 0x48, 0xb8, 0xc8, 0x0d, + 0x03, 0x08, 0x48, 0xc8, 0xc2, 0x20, 0xec, 0x08, 0x48, 0xa9, 0xc2, 0x00, + 0x3d, 0x08, 0x48, 0x40, 0xc3, 0x11, 0xef, 0x08, 0x48, 0xa1, 0xc3, 0x01, + 0x9d, 0x08, 0x48, 0x89, 0xc3, 0x7e, 0x1b, 0x08, 0x48, 0x70, 0xc2, 0x00, + 0x74, 0x08, 0x48, 0x79, 0xc2, 0x01, 0xd0, 0x08, 0x48, 0x00, 0x96, 0x08, + 0x48, 0x38, 0x83, 0x05, 0x42, 0x01, 0xc2, 0x00, 0xd0, 0x05, 0x42, 0x08, + 0x83, 0x05, 0x42, 0x11, 0xc2, 0x01, 0x30, 0x05, 0x43, 0x28, 0xc2, 0x01, + 0x30, 0x05, 0x42, 0x19, 0xc2, 0x19, 0x2c, 0x05, 0x42, 0x39, 0x83, 0x05, + 0x42, 0x59, 0xc2, 0x00, 0xc1, 0x05, 0x43, 0x60, 0x83, 0x05, 0x42, 0x23, + 0x02, 0x3c, 0x3d, 0xc2, 0x00, 0xd0, 0x05, 0x42, 0x28, 0x83, 0x05, 0x42, + 0x41, 0xc2, 0x00, 0xd0, 0x05, 0x42, 0x49, 0x15, 0xc2, 0x3c, 0x43, 0x16, + 0x42, 0x3c, 0x4d, 0x83, 0x05, 0x42, 0x51, 0xc2, 0x02, 0x1c, 0x05, 0x42, + 0x91, 0xc2, 0x0e, 0x9a, 0x05, 0x43, 0x58, 0x83, 0x05, 0x42, 0x61, 0xc2, + 0x00, 0xd0, 0x05, 0x42, 0x68, 0xc2, 0x00, 0xd0, 0x05, 0x42, 0xa1, 0x83, + 0x05, 0x42, 0xa8, 0xc6, 0x24, 0x9c, 0x05, 0x42, 0xb1, 0xc2, 0x00, 0xd0, + 0x05, 0x42, 0xd1, 0x83, 0x05, 0x42, 0xd8, 0xcb, 0x91, 0xf1, 0x05, 0x43, + 0x69, 0xcb, 0x8f, 0xaa, 0x05, 0x43, 0x80, 0x87, 0x05, 0x43, 0x30, 0xc8, + 0xbc, 0x12, 0x05, 0x43, 0x71, 0xc4, 0x0c, 0x2b, 0x05, 0x43, 0x78, 0x4f, + 0x5c, 0xf3, 0xc2, 0x3c, 0x57, 0xd2, 0x47, 0xc9, 0x05, 0x43, 0x90, 0xc9, + 0xb4, 0xf4, 0x08, 0x0e, 0x89, 0xc8, 0xbf, 0x22, 0x08, 0x0f, 0x90, 0xc5, + 0x61, 0xba, 0x08, 0x0e, 0x99, 0xcd, 0x76, 0x1b, 0x08, 0x0f, 0x11, 0x96, + 0x08, 0x0f, 0x60, 0xc2, 0x00, 0x50, 0x08, 0x0f, 0x23, 0x02, 0x3c, 0x69, + 0xc4, 0xe4, 0x9f, 0x08, 0x0f, 0x30, 0x99, 0x08, 0x0e, 0xd1, 0xc7, 0xc9, + 0xce, 0x08, 0x0f, 0x08, 0xc4, 0xd3, 0x73, 0x08, 0x0f, 0x38, 0xc3, 0x19, + 0x78, 0x08, 0x0e, 0xd9, 0x92, 0x08, 0x0f, 0x40, 0xc8, 0x74, 0xc4, 0x00, + 0x4a, 0x91, 0xc6, 0x74, 0xc6, 0x00, 0x4a, 0x88, 0x42, 0x07, 0xb2, 0xc2, + 0x3c, 0x6f, 0x03, 0xc2, 0x3c, 0x7b, 0xc5, 0x33, 0x5d, 0x00, 0x49, 0xe1, + 0xcb, 0x1e, 0x89, 0x00, 0x48, 0x0b, 0x02, 0x3c, 0x87, 0xd4, 0x39, 0xa8, + 0x00, 0x48, 0x01, 0x15, 0xc2, 0x3c, 0x8b, 0xc8, 0xbe, 0xca, 0x05, 0x47, + 0xc1, 0xd9, 0x1e, 0x82, 0x05, 0x47, 0xa1, 0xd0, 0x5a, 0x12, 0x00, 0x4b, + 0x88, 0x99, 0x00, 0x4a, 0x79, 0x97, 0x00, 0x4a, 0x61, 0x8b, 0x00, 0x4a, + 0x41, 0x83, 0x00, 0x49, 0xf1, 0x9b, 0x05, 0x47, 0xf8, 0xc2, 0x49, 0x0c, + 0x00, 0x49, 0xd9, 0x87, 0x00, 0x49, 0xd0, 0x91, 0x00, 0x4a, 0x51, 0x87, + 0x00, 0x4a, 0x30, 0x91, 0x00, 0x4a, 0x49, 0x87, 0x00, 0x4a, 0x29, 0xc6, + 0xcf, 0x2f, 0x00, 0x4a, 0xa8, 0x94, 0x00, 0x4a, 0x1b, 0x02, 0x3c, 0x97, + 0x8e, 0x00, 0x4b, 0x12, 0x02, 0x3c, 0x9b, 0x97, 0x00, 0x4a, 0x13, 0x02, + 0x3c, 0x9f, 0x87, 0x00, 0x4a, 0xb0, 0x8b, 0x00, 0x4a, 0x00, 0x83, 0x00, + 0x49, 0xc9, 0xc7, 0xc4, 0xb1, 0x00, 0x4b, 0xd0, 0x83, 0x00, 0x49, 0xc1, + 0xc2, 0x0d, 0xf6, 0x00, 0x49, 0xb9, 0x0a, 0x42, 0x3c, 0xa3, 0x83, 0x00, + 0x49, 0xa9, 0x47, 0xb2, 0x2e, 0x42, 0x3c, 0xad, 0x0e, 0xc2, 0x3c, 0xbb, + 0x83, 0x00, 0x49, 0x90, 0xc2, 0x00, 0x39, 0x00, 0x49, 0x89, 0x83, 0x00, + 0x49, 0x81, 0xc2, 0x00, 0xd0, 0x00, 0x4a, 0xe8, 0x83, 0x00, 0x49, 0x79, + 0xc2, 0x19, 0x2c, 0x00, 0x4a, 0xf8, 0xc9, 0xad, 0x53, 0x00, 0x4b, 0xc0, + 0xc2, 0x00, 0xd0, 0x00, 0x49, 0x69, 0x83, 0x00, 0x49, 0x61, 0xc2, 0x01, + 0x5d, 0x00, 0x4b, 0xf8, 0xc2, 0x00, 0xd0, 0x00, 0x49, 0x59, 0x83, 0x00, + 0x49, 0x50, 
0x10, 0xc2, 0x3c, 0xc5, 0x83, 0x00, 0x49, 0x41, 0xc2, 0x19, + 0x2c, 0x00, 0x48, 0xf1, 0xc2, 0x01, 0x30, 0x00, 0x48, 0xc8, 0xc2, 0x00, + 0xd0, 0x00, 0x49, 0x39, 0x83, 0x00, 0x49, 0x31, 0x06, 0x42, 0x3c, 0xcf, + 0xc2, 0x00, 0xd0, 0x00, 0x49, 0x29, 0x83, 0x00, 0x49, 0x21, 0x16, 0x42, + 0x3c, 0xdd, 0xc2, 0x00, 0xd0, 0x00, 0x48, 0xe9, 0x83, 0x00, 0x48, 0xe1, + 0xc2, 0x25, 0x3b, 0x00, 0x4b, 0xe0, 0xc2, 0x00, 0xd0, 0x00, 0x48, 0xd9, + 0x83, 0x00, 0x48, 0xd2, 0x02, 0x3c, 0xe7, 0x0a, 0xc2, 0x3c, 0xed, 0x83, + 0x00, 0x48, 0xb9, 0xc2, 0x01, 0x30, 0x00, 0x4b, 0xd9, 0xcb, 0x23, 0x34, + 0x00, 0x4b, 0xe8, 0x0a, 0xc2, 0x3c, 0xf7, 0x83, 0x00, 0x48, 0xa8, 0x97, + 0x00, 0x48, 0xa1, 0x8b, 0x00, 0x48, 0x81, 0x83, 0x00, 0x48, 0x31, 0x9b, + 0x05, 0x47, 0xf1, 0x99, 0x00, 0x4b, 0xa8, 0x87, 0x00, 0x4b, 0x99, 0xc2, + 0x49, 0x0c, 0x00, 0x4b, 0xa0, 0x97, 0x00, 0x48, 0x53, 0x02, 0x3d, 0x01, + 0x87, 0x00, 0x4b, 0xb0, 0x8b, 0x00, 0x48, 0x40, 0x83, 0x00, 0x4a, 0xd9, + 0xc2, 0x00, 0xd0, 0x00, 0x4b, 0xc8, 0xc4, 0x26, 0x78, 0x00, 0x4b, 0x79, + 0xc5, 0x06, 0xdb, 0x00, 0x4b, 0x71, 0x15, 0xc2, 0x3d, 0x05, 0x08, 0xc2, + 0x3d, 0x11, 0x16, 0xc2, 0x3d, 0x1d, 0xc3, 0x05, 0x14, 0x00, 0x4b, 0x39, + 0xc4, 0x15, 0xe7, 0x00, 0x4b, 0x30, 0x45, 0x2c, 0x86, 0xc2, 0x3d, 0x29, + 0x46, 0x2e, 0xee, 0xc2, 0x3d, 0x3f, 0xc2, 0x0c, 0x42, 0x08, 0x20, 0x61, + 0x11, 0xc2, 0x3d, 0x55, 0xc2, 0x14, 0x68, 0x08, 0x20, 0x71, 0xc3, 0x17, + 0x29, 0x08, 0x20, 0x79, 0x8a, 0x08, 0x20, 0x81, 0xc3, 0x6f, 0xb7, 0x08, + 0x20, 0x89, 0xc3, 0xb2, 0x36, 0x08, 0x20, 0x91, 0x16, 0xc2, 0x3d, 0x5d, + 0xc3, 0x80, 0x64, 0x08, 0x20, 0xa1, 0xc4, 0x46, 0xfd, 0x08, 0x20, 0xa9, + 0xc3, 0x30, 0xc1, 0x08, 0x20, 0xb1, 0xc3, 0x72, 0xc8, 0x08, 0x20, 0xb9, + 0xc3, 0x93, 0x51, 0x08, 0x20, 0xc1, 0x07, 0xc2, 0x3d, 0x69, 0xc3, 0x0a, + 0x85, 0x08, 0x20, 0xd1, 0x1c, 0x42, 0x3d, 0x91, 0x45, 0x2c, 0x86, 0xc2, + 0x3d, 0x9d, 0x46, 0x2e, 0xee, 0xc2, 0x3d, 0xb3, 0xc2, 0x0c, 0x42, 0x08, + 0x21, 0xa1, 0x11, 0xc2, 0x3d, 0xc9, 0xc2, 0x14, 0x68, 0x08, 0x21, 0xb1, + 0xc3, 0x17, 0x29, 0x08, 0x21, 0xb9, 0x8a, 0x08, 0x21, 0xc1, 0xc3, 0x6f, + 0xb7, 0x08, 0x21, 0xc9, 0xc3, 0xb2, 0x36, 0x08, 0x21, 0xd1, 0x16, 0xc2, + 0x3d, 0xd1, 0xc3, 0x80, 0x64, 0x08, 0x21, 0xe1, 0xc4, 0x46, 0xfd, 0x08, + 0x21, 0xe9, 0xc3, 0x30, 0xc1, 0x08, 0x21, 0xf1, 0xc3, 0x72, 0xc8, 0x08, + 0x21, 0xf9, 0xc3, 0x93, 0x51, 0x08, 0x22, 0x01, 0x07, 0xc2, 0x3d, 0xdd, + 0xc3, 0x0a, 0x85, 0x08, 0x22, 0x11, 0x1c, 0x42, 0x3e, 0x05, 0xc4, 0x00, + 0x49, 0x01, 0x1e, 0x61, 0xc5, 0x00, 0x2c, 0x01, 0x1d, 0xf8, 0xc4, 0x00, + 0x49, 0x01, 0x1e, 0x59, 0xc5, 0x00, 0x2c, 0x01, 0x1d, 0xf0, 0xc4, 0x8f, + 0x73, 0x0e, 0x98, 0x21, 0xc5, 0x73, 0xcb, 0x0e, 0x98, 0x18, 0xc9, 0x11, + 0xf6, 0x01, 0x24, 0x81, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x50, 0xc9, 0x11, + 0xf6, 0x01, 0x24, 0x79, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x48, 0x00, 0x42, + 0x3e, 0x11, 0x00, 0x42, 0x3e, 0x1d, 0x00, 0x42, 0x3e, 0x29, 0x00, 0x42, + 0x3e, 0x35, 0x00, 0x42, 0x3e, 0x41, 0x00, 0x42, 0x3e, 0x4d, 0xc9, 0x11, + 0xf6, 0x01, 0x24, 0x41, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x10, 0xc9, 0x11, + 0xf6, 0x0f, 0x88, 0x01, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x08, 0xc4, 0x26, + 0x78, 0x08, 0xca, 0xc9, 0xc5, 0x06, 0xdb, 0x08, 0xca, 0xc1, 0x15, 0xc2, + 0x3e, 0x59, 0x08, 0xc2, 0x3e, 0x65, 0x16, 0xc2, 0x3e, 0x71, 0xc3, 0x05, + 0x14, 0x08, 0xca, 0x89, 0xc4, 0x15, 0xe7, 0x08, 0xca, 0x80, 0x91, 0x08, + 0xc9, 0xc1, 0x03, 0xc2, 0x3e, 0x7d, 0x87, 0x08, 0xc9, 0xa9, 0x97, 0x08, + 0xc9, 0x9b, 0x02, 0x3e, 0x85, 0x8b, 0x08, 0xc9, 0x8a, 0x02, 0x3e, 0x89, + 0xc2, 0x00, 0xdb, 0x08, 0xc9, 0x71, 0x83, 0x08, 0xc9, 0x40, 0x83, 0x08, + 0xc9, 0x61, 
0xc2, 0x0d, 0xf6, 0x08, 0xc9, 0x59, 0xc2, 0x00, 0xd0, 0x08, + 0xc9, 0x50, 0xc2, 0x19, 0x2c, 0x08, 0xc9, 0x31, 0x83, 0x08, 0xc9, 0x28, + 0xc2, 0x00, 0xd0, 0x08, 0xc9, 0x19, 0x83, 0x08, 0xc9, 0x10, 0xc2, 0x00, + 0xd0, 0x08, 0xc9, 0x09, 0x83, 0x08, 0xc9, 0x00, 0x83, 0x08, 0xc8, 0xf9, + 0xc2, 0x00, 0xc1, 0x08, 0xc8, 0xd1, 0xc2, 0x19, 0x2c, 0x08, 0xc8, 0xa9, + 0xc2, 0x01, 0x30, 0x08, 0xc8, 0x80, 0xc2, 0x00, 0xd0, 0x08, 0xc8, 0xf1, + 0x83, 0x08, 0xc8, 0xe9, 0x06, 0x42, 0x3e, 0x8d, 0xc2, 0x00, 0xd0, 0x08, + 0xc8, 0xe1, 0x83, 0x08, 0xc8, 0xd9, 0xc2, 0x01, 0x6f, 0x08, 0xc8, 0xb0, + 0xc2, 0x00, 0xd0, 0x08, 0xc8, 0x91, 0x83, 0x08, 0xc8, 0x88, 0xc2, 0x00, + 0xd0, 0x08, 0xc8, 0x79, 0x83, 0x08, 0xc8, 0x70, 0xc2, 0x00, 0xd0, 0x08, + 0xc8, 0x69, 0x83, 0x08, 0xc8, 0x60, 0x97, 0x08, 0xc8, 0x28, 0x8b, 0x08, + 0xc8, 0x18, 0x83, 0x08, 0xc8, 0x08, 0xc4, 0x03, 0x03, 0x01, 0x10, 0xa9, + 0xc3, 0x00, 0xbb, 0x00, 0x07, 0xb8, 0xc4, 0x26, 0x78, 0x01, 0x3c, 0x91, + 0xc5, 0x06, 0xdb, 0x01, 0x3c, 0x89, 0x15, 0xc2, 0x3e, 0x97, 0x08, 0xc2, + 0x3e, 0xa3, 0x16, 0xc2, 0x3e, 0xaf, 0xc3, 0x05, 0x14, 0x01, 0x3c, 0x51, + 0xc4, 0x15, 0xe7, 0x0f, 0x88, 0x60, 0xc4, 0x18, 0x10, 0x01, 0x3b, 0xe1, + 0xc2, 0x22, 0xcc, 0x01, 0x3b, 0xd8, 0xc3, 0x0d, 0x14, 0x01, 0x3b, 0xd1, + 0xc3, 0x09, 0x9e, 0x01, 0x3b, 0xc8, 0xc4, 0x02, 0xde, 0x01, 0x3b, 0xc1, + 0xc2, 0x02, 0xa0, 0x01, 0x3b, 0xb8, 0xc4, 0x18, 0x10, 0x01, 0x3c, 0x31, + 0xc2, 0x22, 0xcc, 0x01, 0x3c, 0x28, 0xc3, 0x0d, 0x14, 0x01, 0x3c, 0x21, + 0xc3, 0x09, 0x9e, 0x01, 0x3c, 0x18, 0xc4, 0x02, 0xde, 0x01, 0x3c, 0x11, + 0xc2, 0x02, 0xa0, 0x01, 0x3c, 0x08, 0xcf, 0x66, 0x66, 0x01, 0x58, 0xb1, + 0xd0, 0x5d, 0x52, 0x01, 0x58, 0xb9, 0xce, 0x74, 0xda, 0x01, 0x58, 0xc1, + 0xd1, 0x53, 0xba, 0x01, 0x58, 0xc8, 0xc9, 0x33, 0xad, 0x0f, 0xc8, 0x50, + 0xc9, 0x33, 0xad, 0x0f, 0xc8, 0x58, 0x42, 0x00, 0x2c, 0xc2, 0x3e, 0xbb, + 0x42, 0x02, 0xa0, 0x42, 0x3e, 0xc7, 0xcf, 0x5b, 0xc3, 0x0f, 0xc2, 0x99, + 0xcc, 0x88, 0xdd, 0x0f, 0xc1, 0xd8, 0x45, 0x11, 0x3a, 0xc2, 0x3e, 0xd3, + 0x51, 0x01, 0x51, 0x42, 0x3e, 0xdf, 0xc4, 0x01, 0xa3, 0x01, 0x0c, 0x9b, + 0x02, 0x3e, 0xeb, 0xc5, 0xdb, 0x50, 0x01, 0x70, 0xa0, 0xda, 0x1b, 0xd0, + 0x0f, 0xc4, 0xb8, 0xcb, 0x82, 0xba, 0x01, 0x0f, 0x19, 0xcb, 0x82, 0x36, + 0x01, 0x0e, 0x98, 0xc5, 0x01, 0xa2, 0x01, 0x58, 0x39, 0xd3, 0x43, 0xe4, + 0x01, 0x5c, 0x58, 0xa3, 0x0f, 0x82, 0x99, 0x9d, 0x0f, 0x82, 0x69, 0x9e, + 0x0f, 0x82, 0x71, 0x9f, 0x0f, 0x82, 0x79, 0xa0, 0x0f, 0x82, 0x81, 0xa1, + 0x0f, 0x82, 0x89, 0xa2, 0x0f, 0x82, 0x90, 0xa3, 0x0f, 0x81, 0xf1, 0xa1, + 0x0f, 0x81, 0xe1, 0x9d, 0x0f, 0x81, 0xc1, 0x9e, 0x0f, 0x81, 0xc9, 0x9f, + 0x0f, 0x81, 0xd1, 0xa0, 0x0f, 0x81, 0xd9, 0xa2, 0x0f, 0x81, 0xe8, 0xa0, + 0x0f, 0x81, 0xa1, 0x9f, 0x0f, 0x81, 0x99, 0x9e, 0x0f, 0x81, 0x91, 0x9d, + 0x0f, 0x81, 0x89, 0xa1, 0x0f, 0x81, 0xa9, 0xa2, 0x0f, 0x81, 0xb1, 0xa3, + 0x0f, 0x81, 0xb8, 0x9d, 0x0f, 0x81, 0xf9, 0x9e, 0x0f, 0x82, 0x01, 0x9f, + 0x0f, 0x82, 0x09, 0xa0, 0x0f, 0x82, 0x11, 0xa1, 0x0f, 0x82, 0x19, 0xa2, + 0x0f, 0x82, 0x21, 0xa3, 0x0f, 0x82, 0x28, 0x9d, 0x0f, 0x82, 0x31, 0x9e, + 0x0f, 0x82, 0x39, 0x9f, 0x0f, 0x82, 0x41, 0xa0, 0x0f, 0x82, 0x49, 0xa1, + 0x0f, 0x82, 0x51, 0xa2, 0x0f, 0x82, 0x59, 0xa3, 0x0f, 0x82, 0x60, 0x9d, + 0x0f, 0x82, 0xa1, 0x9e, 0x0f, 0x82, 0xa9, 0x9f, 0x0f, 0x82, 0xb1, 0xa0, + 0x0f, 0x82, 0xb9, 0xa1, 0x0f, 0x82, 0xc1, 0xa2, 0x0f, 0x82, 0xc9, 0xa3, + 0x0f, 0x82, 0xd0, 0x9d, 0x0f, 0x82, 0xd9, 0x9e, 0x0f, 0x82, 0xe1, 0x9f, + 0x0f, 0x82, 0xe9, 0xa0, 0x0f, 0x82, 0xf1, 0xa1, 0x0f, 0x82, 0xf9, 0xa2, + 0x0f, 0x83, 0x01, 0xa3, 0x0f, 0x83, 0x08, 0x9d, 0x0f, 0x83, 0x19, 0x9e, + 0x0f, 0x83, 
0x21, 0x9f, 0x0f, 0x83, 0x29, 0xa0, 0x0f, 0x83, 0x31, 0xa1, + 0x0f, 0x83, 0x39, 0xa2, 0x0f, 0x83, 0x41, 0xa3, 0x0f, 0x83, 0x48, 0x9d, + 0x0f, 0x83, 0x51, 0x9e, 0x0f, 0x83, 0x59, 0x9f, 0x0f, 0x83, 0x61, 0xa0, + 0x0f, 0x83, 0x69, 0xa1, 0x0f, 0x83, 0x71, 0xa2, 0x0f, 0x83, 0x79, 0xa3, + 0x0f, 0x83, 0x80, 0x9d, 0x0f, 0x83, 0x89, 0x9e, 0x0f, 0x83, 0x91, 0x9f, + 0x0f, 0x83, 0x99, 0xa0, 0x0f, 0x83, 0xa1, 0xa1, 0x0f, 0x83, 0xa9, 0xa2, + 0x0f, 0x83, 0xb1, 0xa3, 0x0f, 0x83, 0xb8, 0x9d, 0x0f, 0x83, 0xc1, 0x9e, + 0x0f, 0x83, 0xc9, 0x9f, 0x0f, 0x83, 0xd1, 0xa0, 0x0f, 0x83, 0xd9, 0xa1, + 0x0f, 0x83, 0xe1, 0xa2, 0x0f, 0x83, 0xe9, 0xa3, 0x0f, 0x83, 0xf0, 0x9d, + 0x0f, 0x83, 0xf9, 0x9e, 0x0f, 0x84, 0x01, 0x9f, 0x0f, 0x84, 0x09, 0xa0, + 0x0f, 0x84, 0x11, 0xa1, 0x0f, 0x84, 0x19, 0xa2, 0x0f, 0x84, 0x21, 0xa3, + 0x0f, 0x84, 0x28, 0x9e, 0x0f, 0x84, 0x39, 0x9f, 0x0f, 0x84, 0x41, 0xa0, + 0x0f, 0x84, 0x49, 0xa1, 0x0f, 0x84, 0x51, 0xa2, 0x0f, 0x84, 0x59, 0xa3, + 0x0f, 0x84, 0x61, 0x9d, 0x0f, 0x84, 0x30, 0x9d, 0x0f, 0x84, 0x69, 0x9e, + 0x0f, 0x84, 0x71, 0x9f, 0x0f, 0x84, 0x79, 0xa0, 0x0f, 0x84, 0x81, 0xa1, + 0x0f, 0x84, 0x89, 0xa2, 0x0f, 0x84, 0x91, 0xa3, 0x0f, 0x84, 0x98, 0xc9, + 0xb0, 0x86, 0x01, 0x3d, 0xf9, 0x47, 0x20, 0x7d, 0xc2, 0x3e, 0xef, 0xca, + 0xa6, 0x8e, 0x01, 0x53, 0xa0, 0xc3, 0x01, 0x5d, 0x01, 0x1f, 0xc3, 0x02, + 0x3e, 0xfb, 0xc4, 0x02, 0x6d, 0x01, 0x00, 0xb0, 0xc4, 0x13, 0x85, 0x01, + 0x16, 0x99, 0xc6, 0xc4, 0x5e, 0x01, 0x57, 0x58, 0xc8, 0x06, 0xbf, 0x01, + 0x16, 0x91, 0xc4, 0x1e, 0x43, 0x01, 0x11, 0x60, 0x17, 0xc2, 0x3e, 0xff, + 0x46, 0x1f, 0x87, 0xc2, 0x3f, 0x17, 0x16, 0xc2, 0x3f, 0x23, 0xcf, 0x62, + 0xa6, 0x01, 0x57, 0xe8, 0x14, 0xc2, 0x3f, 0x2f, 0xc3, 0x25, 0xd6, 0x01, + 0x4f, 0xd0, 0xc5, 0xce, 0x22, 0x01, 0x01, 0x09, 0xc8, 0x32, 0xb8, 0x01, + 0x57, 0x50, 0xdd, 0x0f, 0xb9, 0x01, 0x00, 0xf9, 0xc5, 0x59, 0x93, 0x01, + 0x72, 0x00, 0x11, 0xc2, 0x3f, 0x3e, 0xdc, 0x13, 0x19, 0x01, 0x4c, 0xa8, + 0xc9, 0x00, 0xca, 0x01, 0x55, 0x0b, 0x02, 0x3f, 0x48, 0xcc, 0x07, 0xc7, + 0x01, 0x55, 0x10, 0x47, 0xc7, 0x4a, 0xc2, 0x3f, 0x4e, 0xcf, 0x60, 0x4e, + 0x01, 0x0a, 0x01, 0x48, 0x0b, 0x17, 0xc2, 0x3f, 0x5a, 0x46, 0x03, 0x13, + 0x42, 0x3f, 0x7f, 0x4c, 0x24, 0xe3, 0xc2, 0x3f, 0x8b, 0x48, 0x00, 0xda, + 0x42, 0x3f, 0x97, 0xc4, 0x1e, 0x97, 0x08, 0xc1, 0xc9, 0xc5, 0x40, 0xe7, + 0x08, 0xc1, 0xc0, 0x97, 0x08, 0xc1, 0xb1, 0x8b, 0x08, 0xc1, 0xa1, 0x83, + 0x08, 0xc1, 0x60, 0x94, 0x08, 0xc1, 0x90, 0x97, 0x08, 0xc1, 0x80, 0x8b, + 0x08, 0xc1, 0x70, 0xc2, 0x00, 0x39, 0x08, 0xc1, 0x59, 0x83, 0x08, 0xc1, + 0x20, 0x83, 0x08, 0xc1, 0x49, 0xc2, 0x0d, 0xf6, 0x08, 0xc1, 0x41, 0xc2, + 0x00, 0xd0, 0x08, 0xc1, 0x38, 0xc2, 0x00, 0xd0, 0x08, 0xc1, 0x09, 0x83, + 0x08, 0xc1, 0x00, 0xc2, 0x00, 0xd0, 0x08, 0xc0, 0xf9, 0x83, 0x08, 0xc0, + 0xf0, 0x83, 0x08, 0xc0, 0xe9, 0xc2, 0x00, 0xc1, 0x08, 0xc0, 0xc1, 0xc2, + 0x19, 0x2c, 0x08, 0xc0, 0x99, 0xc2, 0x01, 0x30, 0x08, 0xc0, 0x70, 0xc2, + 0x00, 0xd0, 0x08, 0xc0, 0xe1, 0x83, 0x08, 0xc0, 0xd9, 0x06, 0x42, 0x3f, + 0xa9, 0xc2, 0x00, 0xd0, 0x08, 0xc0, 0xd1, 0x83, 0x08, 0xc0, 0xc9, 0x16, + 0x42, 0x3f, 0xb3, 0xc2, 0x00, 0xd0, 0x08, 0xc0, 0x91, 0x83, 0x08, 0xc0, + 0x88, 0xc2, 0x00, 0xd0, 0x08, 0xc0, 0x81, 0x83, 0x08, 0xc0, 0x78, 0xc2, + 0x00, 0xd0, 0x08, 0xc0, 0x69, 0x83, 0x08, 0xc0, 0x60, 0xc2, 0x00, 0xd0, + 0x08, 0xc0, 0x59, 0x83, 0x08, 0xc0, 0x50, 0x97, 0x08, 0xc0, 0x49, 0x8b, + 0x08, 0xc0, 0x39, 0x83, 0x08, 0xc0, 0x08, 0x97, 0x08, 0xc0, 0x28, 0x8b, + 0x08, 0xc0, 0x18, 0x03, 0xc2, 0x3f, 0xbd, 0xc8, 0x00, 0x5f, 0x0d, 0xe4, + 0xc3, 0x02, 0x3f, 0xc9, 0xc4, 0x51, 0xb7, 0x0d, 0xe4, 0xb9, 0x0e, 0xc2, + 0x3f, 0xcf, 
0xc6, 0x02, 0xd1, 0x0d, 0xe4, 0xa9, 0xc3, 0x02, 0xa3, 0x0d, + 0xe4, 0xa1, 0xc5, 0x1f, 0x0c, 0x0d, 0xe4, 0x91, 0xcb, 0x8f, 0x94, 0x0d, + 0xe4, 0x88, 0xc7, 0x27, 0x9b, 0x0d, 0xe3, 0xa8, 0xc3, 0x02, 0x6e, 0x0d, + 0xe4, 0x31, 0xc9, 0xac, 0xf0, 0x0d, 0xe4, 0x18, 0xc5, 0xd9, 0x39, 0x0d, + 0xe3, 0xc3, 0x02, 0x3f, 0xdb, 0xc2, 0x00, 0x71, 0x0d, 0xe3, 0xc8, 0x99, + 0x0d, 0xe3, 0x00, 0xc3, 0x02, 0xe9, 0x0d, 0xe1, 0xb9, 0x95, 0x0d, 0xe1, + 0xb0, 0x92, 0x0d, 0xe1, 0xa3, 0x02, 0x3f, 0xe1, 0x96, 0x0d, 0xe1, 0x93, + 0x02, 0x3f, 0xe7, 0x8c, 0x0d, 0xe1, 0x03, 0x02, 0x3f, 0xed, 0x95, 0x0d, + 0xe1, 0x51, 0xc8, 0x33, 0xae, 0x0d, 0xe1, 0x2b, 0x02, 0x3f, 0xf3, 0x8d, + 0x0d, 0xe1, 0xfb, 0x02, 0x3f, 0xf9, 0x8f, 0x0d, 0xe1, 0xe1, 0x90, 0x0d, + 0xe1, 0xd8, 0x8c, 0x0d, 0xe0, 0xa9, 0xc2, 0x08, 0x06, 0x0d, 0xe0, 0x91, + 0x11, 0xc2, 0x3f, 0xff, 0xc2, 0x00, 0xd1, 0x0d, 0xe3, 0x41, 0x07, 0xc2, + 0x40, 0x07, 0x97, 0x0d, 0xe2, 0xc0, 0x90, 0x0d, 0xe1, 0x83, 0x02, 0x40, + 0x13, 0x95, 0x0d, 0xe1, 0x4b, 0x02, 0x40, 0x19, 0x8f, 0x0d, 0xe0, 0xfb, + 0x02, 0x40, 0x1f, 0xc8, 0x33, 0xae, 0x0d, 0xe1, 0x1a, 0x02, 0x40, 0x25, + 0x8f, 0x0d, 0xe0, 0xf3, 0x02, 0x40, 0x2b, 0x95, 0x0d, 0xe1, 0x41, 0xc8, + 0x33, 0xae, 0x0d, 0xe1, 0x10, 0x83, 0x0d, 0xe3, 0x21, 0x8b, 0x0d, 0xe3, + 0x19, 0x91, 0x0d, 0xe3, 0x11, 0x97, 0x0d, 0xe3, 0x08, 0x90, 0x0d, 0xe0, + 0xeb, 0x02, 0x40, 0x31, 0x95, 0x0d, 0xe1, 0x39, 0xc8, 0x33, 0xae, 0x0d, + 0xe1, 0x08, 0x97, 0x0d, 0xe2, 0xb1, 0x8b, 0x0d, 0xe2, 0x68, 0x97, 0x0d, + 0xe2, 0xa9, 0x8b, 0x0d, 0xe2, 0x78, 0x8f, 0x0d, 0xe0, 0x79, 0xc3, 0x02, + 0xe9, 0x0d, 0xe1, 0xe8, 0x8f, 0x0d, 0xe3, 0x31, 0x90, 0x0d, 0xe3, 0x28, + 0xc7, 0x1b, 0x02, 0x00, 0x04, 0x69, 0xde, 0x0e, 0x50, 0x0f, 0xbe, 0x40, + 0x00, 0x42, 0x40, 0x37, 0xcf, 0x09, 0xf8, 0x01, 0x5a, 0x09, 0xd0, 0x03, + 0xb7, 0x01, 0x5a, 0x38, 0xda, 0x1c, 0xa0, 0x01, 0x30, 0xc9, 0xdf, 0x0c, + 0x27, 0x0f, 0xac, 0x89, 0xca, 0x3f, 0x35, 0x01, 0x5f, 0xf0, 0xc4, 0x1e, + 0xc9, 0x01, 0x11, 0xeb, 0x02, 0x40, 0x49, 0xcb, 0x94, 0x59, 0x01, 0x01, + 0xb9, 0x46, 0xcf, 0x95, 0x42, 0x40, 0x4f, 0xd3, 0x46, 0xb6, 0x01, 0x0a, + 0x19, 0xc8, 0x52, 0x00, 0x01, 0x02, 0x78, 0xcb, 0x92, 0xd8, 0x01, 0x02, + 0x59, 0xc4, 0x18, 0x26, 0x01, 0x01, 0xa8, 0xc5, 0x18, 0x25, 0x01, 0x01, + 0xb3, 0x02, 0x40, 0x5b, 0xcf, 0x68, 0xbe, 0x01, 0x57, 0x68, 0xce, 0x55, + 0x99, 0x01, 0x4d, 0x28, 0xca, 0xa1, 0x34, 0x01, 0x33, 0xc9, 0xca, 0x9d, + 0xce, 0x01, 0x33, 0xc1, 0xca, 0x9d, 0x42, 0x01, 0x33, 0xb9, 0xca, 0xa1, + 0x48, 0x01, 0x33, 0xb1, 0xca, 0x9d, 0x9c, 0x01, 0x33, 0xa9, 0xca, 0xa0, + 0x58, 0x01, 0x33, 0xa1, 0xca, 0x9a, 0x7c, 0x01, 0x33, 0x98, 0x83, 0x05, + 0x4a, 0x71, 0x97, 0x05, 0x4a, 0x68, 0x97, 0x05, 0x4a, 0x61, 0x8b, 0x05, + 0x4a, 0x50, 0xc2, 0x25, 0x3b, 0x05, 0x4a, 0x29, 0x83, 0x05, 0x49, 0xd8, + 0xc2, 0x01, 0x30, 0x05, 0x4a, 0x19, 0x83, 0x05, 0x49, 0x90, 0xd1, 0x3f, + 0xe4, 0x0f, 0xdc, 0x59, 0xd0, 0x05, 0xb7, 0x01, 0x16, 0x60, 0x00, 0x42, + 0x40, 0x61, 0xd3, 0x01, 0xb4, 0x01, 0x00, 0xc9, 0xd0, 0x58, 0xd2, 0x01, + 0x71, 0x38, 0xca, 0x6f, 0xb9, 0x0f, 0xaf, 0x49, 0xc4, 0x21, 0xdf, 0x0f, + 0xab, 0x42, 0x02, 0x40, 0x79, 0x42, 0x00, 0xa9, 0xc2, 0x40, 0x7f, 0x09, + 0x42, 0x40, 0x8b, 0x49, 0x05, 0xcb, 0xc2, 0x40, 0x9a, 0xd6, 0x13, 0x1f, + 0x01, 0x4c, 0xa0, 0xcc, 0x06, 0xdb, 0x01, 0x2c, 0xa9, 0xcd, 0x15, 0x02, + 0x0f, 0xdc, 0x38, 0x42, 0x00, 0x5b, 0xc2, 0x40, 0xa6, 0xcc, 0x01, 0xdb, + 0x0f, 0xdc, 0x69, 0xcb, 0x96, 0x7f, 0x0f, 0xdd, 0x99, 0xc6, 0x9e, 0xf4, + 0x0f, 0xdd, 0xd0, 0x00, 0x42, 0x40, 0xb2, 0xca, 0xa2, 0x74, 0x01, 0x1d, + 0x01, 0xc9, 0x57, 0x36, 0x01, 0x1c, 0xf9, 0xca, 0xa3, 0x5a, 0x01, 0x1c, + 0xf0, 0xc7, 
0xb2, 0xec, 0x01, 0x4b, 0xe9, 0xd0, 0x4a, 0x77, 0x0f, 0xdc, + 0x48, 0x44, 0x01, 0x94, 0xc2, 0x40, 0xc4, 0xd3, 0x41, 0xf6, 0x01, 0x70, + 0x50, 0xcc, 0x86, 0xcd, 0x0f, 0xaf, 0x69, 0x44, 0x02, 0xdf, 0xc2, 0x40, + 0xd3, 0xde, 0x06, 0x69, 0x0f, 0xde, 0x18, 0xce, 0x01, 0xb9, 0x01, 0x00, + 0xe9, 0xcc, 0x8a, 0x09, 0x01, 0x4e, 0xd9, 0x03, 0xc2, 0x40, 0xdf, 0xcb, + 0x1a, 0x50, 0x01, 0x71, 0x48, 0xcb, 0x1a, 0x50, 0x01, 0x4c, 0x31, 0x05, + 0xc2, 0x40, 0xeb, 0xd2, 0x21, 0x89, 0x01, 0x80, 0xb9, 0xd6, 0x08, 0x88, + 0x01, 0x80, 0xc9, 0xce, 0x25, 0xad, 0x01, 0x80, 0xd8, 0x00, 0x42, 0x40, + 0xf7, 0x45, 0x01, 0x95, 0xc2, 0x41, 0x03, 0x44, 0x0b, 0x26, 0x42, 0x41, + 0x0f, 0xcd, 0x7e, 0x3b, 0x01, 0x0d, 0x01, 0x48, 0x01, 0x9a, 0x42, 0x41, + 0x1b, 0xcb, 0x6f, 0xff, 0x01, 0x0e, 0xe9, 0xca, 0x88, 0xdf, 0x0f, 0xc1, + 0xd0, 0xd0, 0x58, 0x62, 0x0f, 0xc2, 0x11, 0xc5, 0x01, 0xa2, 0x0f, 0xc2, + 0x30, 0x46, 0x01, 0x52, 0xc2, 0x41, 0x27, 0xc2, 0x02, 0x35, 0x0f, 0xd7, + 0x88, 0x45, 0x00, 0x8c, 0xc2, 0x41, 0x33, 0x16, 0xc2, 0x41, 0x6f, 0xd4, + 0x3b, 0x38, 0x01, 0x0e, 0x21, 0xc8, 0xae, 0xbc, 0x01, 0x0d, 0x33, 0x02, + 0x41, 0x7b, 0x03, 0x42, 0x41, 0x81, 0xc5, 0x01, 0xa2, 0x01, 0x0e, 0x93, + 0x02, 0x41, 0x8d, 0xca, 0x52, 0xc2, 0x01, 0x48, 0x68, 0xd3, 0x43, 0xe4, + 0x01, 0x5c, 0x51, 0xc5, 0x01, 0xa2, 0x01, 0x5c, 0xa8, 0xca, 0x50, 0x5e, + 0x00, 0x7e, 0xb8, 0xc7, 0x0d, 0x04, 0x01, 0x0b, 0x6b, 0x02, 0x41, 0x97, + 0xc8, 0x4b, 0x94, 0x01, 0x0b, 0x7a, 0x02, 0x41, 0x9d, 0xc3, 0x45, 0x6b, + 0x01, 0x0b, 0x63, 0x02, 0x41, 0xa3, 0xc2, 0x00, 0x5f, 0x01, 0x0b, 0x22, + 0x02, 0x41, 0xa7, 0xca, 0xa0, 0xda, 0x01, 0x0c, 0x28, 0xc9, 0x57, 0x20, + 0x01, 0x0c, 0x10, 0xc4, 0x22, 0x44, 0x01, 0x0b, 0x59, 0x91, 0x01, 0x0b, + 0x08, 0xc8, 0xbd, 0x82, 0x08, 0x0c, 0x81, 0xc8, 0x45, 0xf0, 0x08, 0x0c, + 0x98, 0x44, 0x1c, 0x74, 0xc2, 0x41, 0xab, 0xcf, 0x0c, 0x37, 0x0f, 0xac, + 0x80, 0xc8, 0x0d, 0x03, 0x08, 0x73, 0xc1, 0xc2, 0x0d, 0x10, 0x08, 0x73, + 0x78, 0xc8, 0x0d, 0x03, 0x08, 0x73, 0xb9, 0xc2, 0x0d, 0x10, 0x08, 0x73, + 0x70, 0xca, 0x37, 0x63, 0x08, 0x73, 0xb1, 0xc3, 0x45, 0x6b, 0x08, 0x73, + 0x68, 0xca, 0x9c, 0x5c, 0x08, 0x73, 0xa9, 0xc3, 0x0d, 0x0f, 0x08, 0x73, + 0x60, 0xcb, 0x13, 0xfa, 0x08, 0x73, 0xa1, 0xc4, 0x0d, 0x0e, 0x08, 0x73, + 0x58, 0xc9, 0x18, 0x05, 0x08, 0x73, 0x99, 0xc4, 0x18, 0x12, 0x08, 0x73, + 0x50, 0x4d, 0x7e, 0xbd, 0xc2, 0x41, 0xb1, 0xcd, 0x7e, 0x21, 0x00, 0xb5, + 0x00, 0x91, 0x00, 0xb7, 0x99, 0xce, 0x75, 0x12, 0x00, 0xb6, 0xf9, 0xc5, + 0xd4, 0xac, 0x00, 0xb6, 0xa9, 0x90, 0x00, 0xb5, 0x81, 0x87, 0x00, 0xb5, + 0x79, 0xc3, 0x05, 0x0d, 0x00, 0xb5, 0x48, 0x8a, 0x00, 0xb7, 0x93, 0x02, + 0x41, 0xc7, 0xc3, 0x13, 0x00, 0x00, 0xb7, 0x29, 0xd6, 0x2e, 0x28, 0x00, + 0xb6, 0x59, 0xc7, 0xc9, 0x5e, 0x00, 0xb6, 0x50, 0x43, 0x38, 0x85, 0x42, + 0x41, 0xcd, 0xcb, 0x96, 0xc1, 0x00, 0xb7, 0x41, 0xc2, 0x00, 0xbf, 0x00, + 0xb7, 0x09, 0xc2, 0x00, 0x75, 0x00, 0xb6, 0xeb, 0x02, 0x41, 0xd7, 0xc7, + 0xc5, 0x2f, 0x00, 0xb6, 0x39, 0xcc, 0x84, 0xf9, 0x00, 0xb6, 0x08, 0x4b, + 0x2e, 0x2e, 0xc2, 0x41, 0xdd, 0xd1, 0x55, 0xb8, 0x00, 0xb6, 0xd0, 0x07, + 0xc2, 0x41, 0xfb, 0xc3, 0x67, 0x02, 0x00, 0xb7, 0x19, 0xc6, 0xce, 0xf9, + 0x00, 0xb7, 0x10, 0xc2, 0x00, 0xb1, 0x00, 0xb7, 0x01, 0xc9, 0xaa, 0x0e, + 0x00, 0xb6, 0xb1, 0xc2, 0x00, 0x75, 0x00, 0xb5, 0xb1, 0xc2, 0x00, 0x8e, + 0x00, 0xb5, 0x38, 0xcb, 0x99, 0x97, 0x00, 0xb6, 0xf1, 0x46, 0xcb, 0xbd, + 0x42, 0x42, 0x05, 0xce, 0x72, 0x56, 0x00, 0xb6, 0x79, 0xd3, 0x42, 0xda, + 0x00, 0xb5, 0x30, 0xca, 0xa5, 0x08, 0x00, 0xb6, 0x49, 0xc3, 0x23, 0x1c, + 0x00, 0xb5, 0x59, 0xc3, 0x15, 0x66, 0x00, 0xb5, 0x51, 0xc6, 0xcb, 0xc9, + 0x00, 0xb5, 
0x40, 0x07, 0xc2, 0x42, 0x11, 0xc2, 0x00, 0xb1, 0x00, 0xb5, + 0xc0, 0xc5, 0xd9, 0x75, 0x00, 0xb5, 0xd9, 0xc6, 0xcf, 0xa1, 0x00, 0xb5, + 0xd0, 0xcb, 0x95, 0x4b, 0x00, 0xb5, 0xc8, 0x94, 0x00, 0xb5, 0x18, 0x87, + 0x05, 0x28, 0x03, 0x02, 0x42, 0x1b, 0x90, 0x05, 0x2f, 0x10, 0x87, 0x05, + 0x2f, 0x23, 0x02, 0x42, 0x1f, 0x8b, 0x05, 0x29, 0x33, 0x02, 0x42, 0x27, + 0x83, 0x05, 0x2a, 0x63, 0x02, 0x42, 0x2b, 0x91, 0x05, 0x2d, 0xeb, 0x02, + 0x42, 0x2f, 0x97, 0x05, 0x2c, 0xba, 0x02, 0x42, 0x37, 0x87, 0x05, 0x2f, + 0x33, 0x02, 0x42, 0x3b, 0x8b, 0x05, 0x29, 0x43, 0x02, 0x42, 0x46, 0x83, + 0x05, 0x2a, 0x73, 0x02, 0x42, 0x4a, 0x91, 0x05, 0x2d, 0xfb, 0x02, 0x42, + 0x4e, 0x97, 0x05, 0x2c, 0xca, 0x02, 0x42, 0x59, 0x87, 0x05, 0x2f, 0x43, + 0x02, 0x42, 0x5d, 0x8b, 0x05, 0x29, 0x51, 0x83, 0x05, 0x2a, 0x81, 0x91, + 0x05, 0x2e, 0x0b, 0x02, 0x42, 0x61, 0x97, 0x05, 0x2c, 0xd8, 0x0a, 0xc2, + 0x42, 0x65, 0x87, 0x05, 0x2f, 0x53, 0x02, 0x42, 0x7f, 0x8b, 0x05, 0x29, + 0x61, 0x83, 0x05, 0x2a, 0x91, 0x91, 0x05, 0x2e, 0x1b, 0x02, 0x42, 0x83, + 0x97, 0x05, 0x2c, 0xe8, 0x04, 0xc2, 0x42, 0x87, 0x42, 0x1f, 0xad, 0xc2, + 0x42, 0xa1, 0x87, 0x05, 0x30, 0x43, 0x02, 0x42, 0xbb, 0x8b, 0x05, 0x2a, + 0x31, 0x83, 0x05, 0x2b, 0x71, 0x91, 0x05, 0x2e, 0xf3, 0x02, 0x42, 0xbf, + 0x97, 0x05, 0x2d, 0xb8, 0x12, 0xc2, 0x42, 0xc3, 0x87, 0x05, 0x30, 0x1b, + 0x02, 0x42, 0xe0, 0x8b, 0x05, 0x2a, 0x19, 0x83, 0x05, 0x2b, 0x53, 0x02, + 0x42, 0xe4, 0x91, 0x05, 0x2e, 0xdb, 0x02, 0x42, 0xe8, 0x97, 0x05, 0x2d, + 0xa0, 0x04, 0xc2, 0x42, 0xec, 0x87, 0x05, 0x30, 0x33, 0x02, 0x43, 0x06, + 0x8b, 0x05, 0x2a, 0x29, 0x83, 0x05, 0x2b, 0x69, 0x91, 0x05, 0x2e, 0xeb, + 0x02, 0x43, 0x0e, 0x97, 0x05, 0x2d, 0xb0, 0x87, 0x05, 0x2f, 0x8b, 0x02, + 0x43, 0x12, 0x8b, 0x05, 0x29, 0x89, 0x83, 0x05, 0x2a, 0xc1, 0x91, 0x05, + 0x2e, 0x4b, 0x02, 0x43, 0x16, 0x97, 0x05, 0x2d, 0x10, 0x87, 0x05, 0x2f, + 0x93, 0x02, 0x43, 0x1a, 0x8b, 0x05, 0x29, 0x91, 0x83, 0x05, 0x2a, 0xc9, + 0x91, 0x05, 0x2e, 0x53, 0x02, 0x43, 0x1e, 0x97, 0x05, 0x2d, 0x18, 0x87, + 0x05, 0x2f, 0x9b, 0x02, 0x43, 0x22, 0x0a, 0xc2, 0x43, 0x26, 0x8b, 0x05, + 0x29, 0x99, 0x83, 0x05, 0x2a, 0xd1, 0x91, 0x05, 0x2e, 0x5b, 0x02, 0x43, + 0x40, 0x97, 0x05, 0x2d, 0x20, 0x0a, 0xc2, 0x43, 0x44, 0x87, 0x05, 0x2f, + 0xcb, 0x02, 0x43, 0x62, 0x8b, 0x05, 0x29, 0xc9, 0x83, 0x05, 0x2b, 0x01, + 0x91, 0x05, 0x2e, 0x8b, 0x02, 0x43, 0x66, 0x97, 0x05, 0x2d, 0x50, 0x87, + 0x05, 0x2f, 0xbb, 0x02, 0x43, 0x6a, 0x8b, 0x05, 0x29, 0xb9, 0x83, 0x05, + 0x2a, 0xf1, 0x91, 0x05, 0x2e, 0x7b, 0x02, 0x43, 0x74, 0x97, 0x05, 0x2d, + 0x40, 0x87, 0x05, 0x2f, 0xc3, 0x02, 0x43, 0x78, 0x8b, 0x05, 0x29, 0xc1, + 0x83, 0x05, 0x2a, 0xf9, 0x91, 0x05, 0x2e, 0x83, 0x02, 0x43, 0x7c, 0x97, + 0x05, 0x2d, 0x48, 0x06, 0xc2, 0x43, 0x80, 0x0c, 0xc2, 0x43, 0x9a, 0x89, + 0x05, 0x30, 0x5b, 0x02, 0x43, 0xb4, 0x87, 0x05, 0x30, 0x4b, 0x02, 0x43, + 0xca, 0x1b, 0xc2, 0x43, 0xce, 0x8b, 0x05, 0x2a, 0x39, 0x83, 0x05, 0x2b, + 0x79, 0x91, 0x05, 0x2e, 0xfb, 0x02, 0x43, 0xe8, 0x97, 0x05, 0x2d, 0xc0, + 0x87, 0x05, 0x2f, 0xdb, 0x02, 0x43, 0xec, 0x0a, 0xc2, 0x43, 0xf0, 0x8b, + 0x05, 0x29, 0xd9, 0x83, 0x05, 0x2b, 0x11, 0x91, 0x05, 0x2e, 0x9b, 0x02, + 0x44, 0x0a, 0x97, 0x05, 0x2d, 0x60, 0x87, 0x05, 0x2f, 0xeb, 0x02, 0x44, + 0x0e, 0x0a, 0xc2, 0x44, 0x12, 0x8b, 0x05, 0x29, 0xe9, 0x83, 0x05, 0x2b, + 0x21, 0x91, 0x05, 0x2e, 0xab, 0x02, 0x44, 0x2c, 0x97, 0x05, 0x2d, 0x70, + 0x87, 0x05, 0x2f, 0xfb, 0x02, 0x44, 0x30, 0x8b, 0x05, 0x29, 0xf9, 0x83, + 0x05, 0x2b, 0x31, 0x91, 0x05, 0x2e, 0xbb, 0x02, 0x44, 0x34, 0x97, 0x05, + 0x2d, 0x80, 0x87, 0x05, 0x30, 0x03, 0x02, 0x44, 0x38, 0x8b, 0x05, 0x2a, + 0x01, 0x83, 
0x05, 0x2b, 0x39, 0x91, 0x05, 0x2e, 0xc3, 0x02, 0x44, 0x3c, + 0x97, 0x05, 0x2d, 0x88, 0x87, 0x05, 0x30, 0x13, 0x02, 0x44, 0x40, 0x8b, + 0x05, 0x2a, 0x11, 0x83, 0x05, 0x2b, 0x49, 0x91, 0x05, 0x2e, 0xd3, 0x02, + 0x44, 0x44, 0x97, 0x05, 0x2d, 0x98, 0x90, 0x05, 0x29, 0x28, 0x90, 0x05, + 0x2a, 0x50, 0x91, 0x05, 0x2b, 0x8b, 0x02, 0x44, 0x48, 0x90, 0x05, 0x2d, + 0xd8, 0x90, 0x05, 0x2c, 0xb0, 0xc4, 0xe2, 0xaf, 0x05, 0x30, 0x99, 0xc2, + 0x04, 0xc6, 0x05, 0x30, 0xc0, 0xc4, 0xe2, 0xaf, 0x05, 0x30, 0xa1, 0xc3, + 0x38, 0x86, 0x05, 0x30, 0xe0, 0xc3, 0x00, 0x74, 0x05, 0x30, 0xa9, 0xc2, + 0x04, 0xc6, 0x05, 0x30, 0xc9, 0xc3, 0x08, 0x48, 0x05, 0x30, 0xe8, 0xc3, + 0x01, 0x95, 0x05, 0x30, 0xd1, 0x11, 0x42, 0x44, 0x4c, 0xc9, 0x57, 0x36, + 0x01, 0x1e, 0x81, 0x45, 0x00, 0x8c, 0x42, 0x44, 0x58, 0xc7, 0x33, 0xdf, + 0x00, 0x00, 0x5b, 0x02, 0x44, 0x64, 0xc4, 0x3b, 0x19, 0x01, 0x5b, 0xf8, + 0x00, 0x42, 0x44, 0x6a, 0xcb, 0x99, 0x1e, 0x01, 0x81, 0xa0, 0xcf, 0x15, + 0x36, 0x0f, 0xbd, 0xf9, 0xd2, 0x22, 0x49, 0x0f, 0xbe, 0x80, 0xc6, 0x02, + 0xd1, 0x0f, 0xbc, 0x41, 0xc6, 0x0b, 0x09, 0x0f, 0xbc, 0x90, 0xc6, 0x27, + 0x5e, 0x0f, 0xb3, 0xe1, 0xc6, 0x13, 0x52, 0x0f, 0xbd, 0x69, 0xd2, 0x4d, + 0x57, 0x0f, 0xbd, 0xc8, 0xce, 0x70, 0x5e, 0x00, 0xe7, 0x89, 0xcb, 0x95, + 0x98, 0x00, 0xe7, 0x5b, 0x02, 0x44, 0x76, 0xcc, 0x88, 0xc5, 0x00, 0xe7, + 0x51, 0xcc, 0x14, 0x41, 0x00, 0xe7, 0x48, 0xc8, 0x74, 0xc4, 0x00, 0xe7, + 0x31, 0xc6, 0x74, 0xc6, 0x00, 0xe7, 0x20, 0xca, 0xa5, 0x12, 0x00, 0xe7, + 0x40, 0xca, 0xa5, 0x12, 0x00, 0xe7, 0x38, 0xca, 0x9e, 0xe6, 0x00, 0xe7, + 0xc9, 0xc7, 0x02, 0x40, 0x00, 0xe6, 0xd0, 0xe0, 0x02, 0x27, 0x00, 0xe7, + 0x00, 0xca, 0xa4, 0x90, 0x00, 0xe6, 0xc8, 0x43, 0x00, 0x4b, 0xc2, 0x44, + 0x7c, 0xcc, 0x8b, 0x11, 0x70, 0x01, 0xe0, 0x4f, 0x0b, 0x17, 0xc2, 0x44, + 0x8e, 0x4d, 0x29, 0xb9, 0x42, 0x44, 0xf6, 0x42, 0x0a, 0x8c, 0xc2, 0x45, + 0x5e, 0xc3, 0x0d, 0xe5, 0x70, 0x01, 0xd0, 0xce, 0x25, 0xad, 0x70, 0x02, + 0xe9, 0xcb, 0x1a, 0x50, 0x70, 0x01, 0x49, 0xcd, 0x00, 0x32, 0x70, 0x03, + 0xe8, 0xc4, 0x26, 0x78, 0x70, 0x01, 0xc9, 0xc5, 0x06, 0xdb, 0x70, 0x01, + 0xc1, 0x15, 0xc2, 0x45, 0x68, 0x08, 0xc2, 0x45, 0x74, 0x16, 0xc2, 0x45, + 0x80, 0xc3, 0x05, 0x14, 0x70, 0x01, 0x89, 0xc4, 0x15, 0xe7, 0x70, 0x01, + 0x80, 0x83, 0x00, 0xbb, 0x41, 0xc2, 0x01, 0x30, 0x00, 0xbb, 0x28, 0xc9, + 0xa9, 0xc6, 0x00, 0xb8, 0xf8, 0x83, 0x00, 0xb8, 0x41, 0xc2, 0x01, 0x30, + 0x00, 0xb8, 0x28, 0x24, 0xc2, 0x45, 0x8c, 0x23, 0xc2, 0x45, 0xa8, 0x22, + 0xc2, 0x45, 0xd0, 0x21, 0xc2, 0x45, 0xf8, 0x20, 0xc2, 0x46, 0x20, 0x1f, + 0xc2, 0x46, 0x48, 0x1e, 0xc2, 0x46, 0x70, 0x1d, 0x42, 0x46, 0x98, 0xc4, + 0x26, 0x78, 0x0b, 0x56, 0x49, 0xc5, 0x06, 0xdb, 0x0b, 0x56, 0x41, 0x15, + 0xc2, 0x46, 0xc0, 0x08, 0xc2, 0x46, 0xcc, 0x16, 0xc2, 0x46, 0xd8, 0xc3, + 0x05, 0x14, 0x0b, 0x56, 0x09, 0xc4, 0x15, 0xe7, 0x0b, 0x56, 0x00, 0xc2, + 0x02, 0x1c, 0x0b, 0x55, 0xf1, 0x05, 0xc2, 0x46, 0xe4, 0x06, 0xc2, 0x46, + 0xee, 0x08, 0xc2, 0x46, 0xf8, 0xc2, 0x8d, 0x8f, 0x0b, 0x55, 0xd1, 0x16, + 0xc2, 0x47, 0x02, 0x0a, 0xc2, 0x47, 0x12, 0x09, 0xc2, 0x47, 0x1a, 0x15, + 0xc2, 0x47, 0x24, 0x10, 0xc2, 0x47, 0x2c, 0xc2, 0x00, 0x39, 0x0b, 0x55, + 0x91, 0x0e, 0xc2, 0x47, 0x42, 0x0f, 0xc2, 0x47, 0x4c, 0xc2, 0x01, 0x5d, + 0x0b, 0x55, 0x51, 0x12, 0xc2, 0x47, 0x60, 0xc2, 0x01, 0x4a, 0x0b, 0x55, + 0x31, 0xc2, 0x19, 0x2c, 0x0b, 0x55, 0x29, 0x0d, 0xc2, 0x47, 0x6a, 0x17, + 0xc2, 0x47, 0x74, 0x03, 0xc2, 0x47, 0x8c, 0x0b, 0xc2, 0x47, 0xa0, 0x07, + 0xc2, 0x47, 0xb0, 0x18, 0xc2, 0x47, 0xc0, 0x11, 0x42, 0x47, 0xd0, 0x18, + 0xc2, 0x47, 0xe0, 0x42, 0x14, 0x48, 0xc2, 0x47, 0xee, 0x0d, 0xc2, 0x48, + 0x00, 0x12, 
0xc2, 0x48, 0x0a, 0xc7, 0xb4, 0xa5, 0x08, 0xfe, 0xc1, 0x03, + 0xc2, 0x48, 0x14, 0xc6, 0xcd, 0xd9, 0x08, 0xfe, 0xb1, 0xc3, 0x1e, 0xe5, + 0x08, 0xfe, 0xa8, 0xcb, 0x97, 0x9d, 0x08, 0xff, 0x49, 0xcb, 0x97, 0xa8, + 0x08, 0xff, 0x40, 0x83, 0x00, 0x5c, 0x2b, 0x02, 0x48, 0x20, 0x8b, 0x00, + 0x5c, 0x3b, 0x02, 0x48, 0x2c, 0x97, 0x00, 0x5c, 0x4b, 0x02, 0x48, 0x30, + 0x87, 0x00, 0x5c, 0x73, 0x02, 0x48, 0x34, 0x91, 0x00, 0x5c, 0x93, 0x02, + 0x48, 0x38, 0xc2, 0x02, 0x2b, 0x00, 0x5c, 0xa9, 0x10, 0xc2, 0x48, 0x3c, + 0xc2, 0x00, 0x64, 0x00, 0x5c, 0xd1, 0xc2, 0x25, 0x3b, 0x00, 0x5c, 0xe1, + 0x16, 0xc2, 0x48, 0x50, 0xc2, 0x00, 0xb0, 0x00, 0x5d, 0x51, 0xc2, 0x01, + 0xc3, 0x00, 0x5d, 0x71, 0xc2, 0x19, 0x2c, 0x00, 0x5d, 0x79, 0x14, 0xc2, + 0x48, 0x5a, 0x0e, 0xc2, 0x48, 0x64, 0xc2, 0x02, 0x41, 0x00, 0x5d, 0xa9, + 0x15, 0xc2, 0x48, 0x6c, 0xc2, 0x00, 0xd0, 0x00, 0x5d, 0xc8, 0xc4, 0x15, + 0xe7, 0x00, 0x5f, 0x31, 0xc3, 0x05, 0x14, 0x00, 0x5f, 0x39, 0x16, 0xc2, + 0x48, 0x7c, 0x08, 0xc2, 0x48, 0x88, 0x15, 0xc2, 0x48, 0x94, 0xc5, 0x06, + 0xdb, 0x00, 0x5f, 0x71, 0xc4, 0x26, 0x78, 0x00, 0x5f, 0x78, 0xc8, 0x08, + 0x79, 0x08, 0xfe, 0x99, 0x44, 0x22, 0xcb, 0xc2, 0x48, 0xa0, 0xca, 0x1e, + 0x15, 0x08, 0xfe, 0x69, 0xca, 0xa3, 0xfa, 0x08, 0xfe, 0x30, 0x45, 0x27, + 0x7a, 0xc2, 0x48, 0xac, 0xc7, 0x08, 0x79, 0x08, 0xfe, 0x81, 0x08, 0xc2, + 0x48, 0xb4, 0x45, 0x06, 0xdb, 0xc2, 0x48, 0xc0, 0x16, 0xc2, 0x48, 0xca, + 0x44, 0x22, 0xcb, 0xc2, 0x48, 0xda, 0xd8, 0x22, 0xbb, 0x08, 0xfe, 0x08, + 0x83, 0x00, 0x5d, 0xf1, 0x8b, 0x00, 0x5e, 0x41, 0x97, 0x00, 0x5e, 0x60, + 0x8b, 0x00, 0x5e, 0x00, 0x97, 0x00, 0x5e, 0x10, 0x87, 0x00, 0x5e, 0x38, + 0x91, 0x00, 0x5e, 0x58, 0xc7, 0x0d, 0x04, 0x00, 0x5f, 0x89, 0xc8, 0x4b, + 0x94, 0x00, 0x5f, 0x90, 0xc4, 0x18, 0x10, 0x08, 0xb6, 0x39, 0xc2, 0x22, + 0xcc, 0x08, 0xb6, 0x30, 0xc3, 0x0d, 0x14, 0x08, 0xb6, 0x29, 0xc3, 0x09, + 0x9e, 0x08, 0xb6, 0x20, 0xc4, 0x02, 0xde, 0x08, 0xb6, 0x19, 0xc2, 0x02, + 0xa0, 0x08, 0xb6, 0x10, 0xca, 0x9e, 0xaa, 0x08, 0xb5, 0xc1, 0x97, 0x08, + 0xb4, 0x49, 0x8b, 0x08, 0xb4, 0x39, 0x83, 0x08, 0xb4, 0x08, 0xc2, 0x00, + 0x39, 0x08, 0xb5, 0x51, 0x83, 0x08, 0xb5, 0x20, 0x83, 0x08, 0xb5, 0x41, + 0xc2, 0x00, 0xd0, 0x08, 0xb5, 0x38, 0xc2, 0x00, 0xd0, 0x08, 0xb5, 0x09, + 0x83, 0x08, 0xb5, 0x00, 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0xf9, 0x83, 0x08, + 0xb4, 0xf0, 0x83, 0x08, 0xb4, 0xe9, 0xc2, 0x00, 0xc1, 0x08, 0xb4, 0xc1, + 0xc2, 0x19, 0x2c, 0x08, 0xb4, 0x99, 0xc2, 0x01, 0x30, 0x08, 0xb4, 0x70, + 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0xe1, 0x83, 0x08, 0xb4, 0xd9, 0x06, 0x42, + 0x48, 0xe6, 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0xd1, 0x83, 0x08, 0xb4, 0xc9, + 0x16, 0x42, 0x48, 0xf0, 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0x91, 0x83, 0x08, + 0xb4, 0x88, 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0x81, 0x83, 0x08, 0xb4, 0x78, + 0xc2, 0x00, 0xd0, 0x08, 0xb4, 0x69, 0x83, 0x08, 0xb4, 0x60, 0xc2, 0x00, + 0xd0, 0x08, 0xb4, 0x59, 0x83, 0x08, 0xb4, 0x50, 0x97, 0x08, 0xb4, 0x28, + 0x8b, 0x08, 0xb4, 0x18, 0xc4, 0x1e, 0x97, 0x08, 0xb5, 0xb1, 0xc5, 0x40, + 0xe7, 0x08, 0xb5, 0x60, 0x97, 0x08, 0xb5, 0xa9, 0x8b, 0x08, 0xb5, 0x99, + 0x83, 0x08, 0xb5, 0x68, 0x97, 0x08, 0xb5, 0x88, 0x8b, 0x08, 0xb5, 0x78, + 0xc3, 0x01, 0x95, 0x00, 0xd5, 0x61, 0xc2, 0x69, 0xa6, 0x00, 0xd5, 0x20, + 0xc5, 0xd7, 0x04, 0x00, 0xd5, 0x53, 0x02, 0x48, 0xfa, 0xc3, 0x29, 0xf7, + 0x00, 0xd5, 0x11, 0xc3, 0x1c, 0x9f, 0x00, 0xd3, 0x00, 0xc3, 0x04, 0xc6, + 0x00, 0xd5, 0x43, 0x02, 0x49, 0x00, 0xc3, 0x3f, 0x6f, 0x00, 0xd5, 0x19, + 0x44, 0xdf, 0xcf, 0x42, 0x49, 0x06, 0xc5, 0xd4, 0x98, 0x00, 0xd5, 0x39, + 0xc3, 0x71, 0xe5, 0x00, 0xd3, 0xd9, 0xc4, 0xe0, 0xe3, 0x00, 0xd3, 0xa2, + 0x02, 0x49, 
0x12, 0xd4, 0x3c, 0x78, 0x00, 0xd5, 0x31, 0xc6, 0xd1, 0x81, + 0x00, 0xd3, 0xd0, 0xc4, 0xde, 0xb7, 0x00, 0xd5, 0x08, 0x9f, 0x00, 0xd3, + 0xb1, 0x9e, 0x00, 0xd3, 0xa8, 0xc4, 0x18, 0x10, 0x00, 0xd4, 0xb9, 0xc2, + 0x22, 0xcc, 0x00, 0xd4, 0xb0, 0xc3, 0x0d, 0x14, 0x00, 0xd4, 0xa9, 0xc3, + 0x09, 0x9e, 0x00, 0xd4, 0xa0, 0xc4, 0x02, 0xde, 0x00, 0xd4, 0x99, 0xc2, + 0x02, 0xa0, 0x00, 0xd4, 0x90, 0xc4, 0x18, 0x10, 0x00, 0xd4, 0x39, 0xc2, + 0x22, 0xcc, 0x00, 0xd4, 0x30, 0xc3, 0x0d, 0x14, 0x00, 0xd4, 0x29, 0xc3, + 0x09, 0x9e, 0x00, 0xd4, 0x20, 0xc4, 0x02, 0xde, 0x00, 0xd4, 0x19, 0xc2, + 0x02, 0xa0, 0x00, 0xd4, 0x10, 0xc2, 0x0d, 0xf6, 0x00, 0xd2, 0xf1, 0xc2, + 0x01, 0x5d, 0x00, 0xd2, 0xe9, 0x0f, 0xc2, 0x49, 0x18, 0xd4, 0x3c, 0xf0, + 0x00, 0xd2, 0xd9, 0x0e, 0xc2, 0x49, 0x22, 0xc9, 0xb4, 0x2e, 0x00, 0xd2, + 0xc8, 0x42, 0x01, 0x31, 0xc2, 0x49, 0x2e, 0x91, 0x00, 0xd3, 0x81, 0x9b, + 0x00, 0xd3, 0x68, 0xc6, 0xd2, 0xbf, 0x00, 0xd3, 0x91, 0xc6, 0xc6, 0xb8, + 0x00, 0xd3, 0x20, 0x8b, 0x00, 0xd3, 0x89, 0x87, 0x00, 0xd3, 0x79, 0x83, + 0x00, 0xd3, 0x18, 0x97, 0x00, 0xd3, 0x53, 0x02, 0x49, 0x3a, 0x87, 0x00, + 0xd3, 0x38, 0x8b, 0x00, 0xd3, 0x30, 0x83, 0x00, 0xd2, 0x1b, 0x02, 0x49, + 0x3e, 0x43, 0x02, 0x5f, 0xc2, 0x49, 0x42, 0xc2, 0x00, 0xdb, 0x00, 0xd2, + 0x51, 0xc2, 0x0f, 0xe1, 0x00, 0xd2, 0x20, 0x97, 0x00, 0xd2, 0x80, 0x8b, + 0x00, 0xd2, 0x70, 0xc2, 0x00, 0xd0, 0x00, 0xd2, 0x49, 0x15, 0xc2, 0x49, + 0x70, 0xc2, 0x19, 0x2c, 0x00, 0xd2, 0x01, 0xc2, 0x00, 0x87, 0x00, 0xd1, + 0xd1, 0x12, 0xc2, 0x49, 0x80, 0x16, 0xc2, 0x49, 0x8a, 0xc5, 0x3c, 0xf5, + 0x00, 0xd1, 0x71, 0x05, 0xc2, 0x49, 0x94, 0x0d, 0x42, 0x49, 0x9e, 0xc2, + 0x0f, 0xe1, 0x00, 0xd2, 0x11, 0x83, 0x00, 0xd2, 0x0a, 0x02, 0x49, 0xae, + 0x83, 0x00, 0xd1, 0xb1, 0xc2, 0x19, 0x2c, 0x00, 0xd1, 0x61, 0xc2, 0x01, + 0x30, 0x00, 0xd1, 0x30, 0xa3, 0x00, 0xcb, 0xa1, 0xa2, 0x00, 0xcb, 0x99, + 0xa1, 0x00, 0xcb, 0x91, 0xa0, 0x00, 0xcb, 0x89, 0x9f, 0x00, 0xcb, 0x80, + 0xc2, 0x00, 0xd0, 0x00, 0xcb, 0x09, 0x83, 0x00, 0xca, 0x98, 0xc5, 0xd8, + 0x3f, 0x05, 0x56, 0xf9, 0x90, 0x05, 0x56, 0xd8, 0x8f, 0x05, 0x55, 0xf1, + 0x90, 0x05, 0x55, 0xe9, 0x9b, 0x05, 0x55, 0xe1, 0xc2, 0x0f, 0xe1, 0x05, + 0x55, 0xd9, 0x83, 0x05, 0x55, 0x88, 0x83, 0x05, 0x55, 0xd1, 0x87, 0x05, + 0x55, 0x9a, 0x02, 0x49, 0xba, 0x83, 0x05, 0x55, 0xc0, 0x91, 0x05, 0x55, + 0x79, 0xc2, 0x01, 0x23, 0x05, 0x55, 0x69, 0xc2, 0x17, 0xbd, 0x05, 0x55, + 0x59, 0xc2, 0x01, 0xc8, 0x05, 0x55, 0x49, 0xc2, 0x00, 0x79, 0x05, 0x55, + 0x39, 0xc2, 0x42, 0xcd, 0x05, 0x55, 0x29, 0xc2, 0x00, 0xa2, 0x05, 0x55, + 0x19, 0xc2, 0x01, 0x03, 0x05, 0x55, 0x09, 0x12, 0xc2, 0x49, 0xbe, 0xc2, + 0x00, 0x6b, 0x05, 0x54, 0xd9, 0x10, 0xc2, 0x49, 0xc8, 0x16, 0xc2, 0x49, + 0xd8, 0xc2, 0x00, 0x58, 0x05, 0x54, 0x99, 0x05, 0xc2, 0x49, 0xe2, 0xc2, + 0x0f, 0x7b, 0x05, 0x54, 0x39, 0x0d, 0xc2, 0x49, 0xec, 0xc2, 0x00, 0xfb, + 0x05, 0x54, 0x78, 0x91, 0x05, 0x55, 0x71, 0xc2, 0x01, 0x23, 0x05, 0x55, + 0x61, 0xc2, 0x17, 0xbd, 0x05, 0x55, 0x51, 0xc2, 0x01, 0xc8, 0x05, 0x55, + 0x41, 0xc2, 0x00, 0x79, 0x05, 0x55, 0x31, 0xc2, 0x42, 0xcd, 0x05, 0x55, + 0x21, 0xc2, 0x00, 0xa2, 0x05, 0x55, 0x11, 0xc2, 0x01, 0x03, 0x05, 0x55, + 0x01, 0x12, 0xc2, 0x49, 0xf4, 0xc2, 0x00, 0x6b, 0x05, 0x54, 0xd1, 0x10, + 0xc2, 0x49, 0xfe, 0x16, 0xc2, 0x4a, 0x0e, 0xc2, 0x00, 0x58, 0x05, 0x54, + 0x91, 0x05, 0xc2, 0x4a, 0x18, 0xc2, 0x0f, 0x7b, 0x05, 0x54, 0x31, 0x0d, + 0xc2, 0x4a, 0x22, 0xc2, 0x00, 0xfb, 0x05, 0x54, 0x70, 0xd2, 0x49, 0xe5, + 0x0f, 0xb2, 0xb1, 0xd2, 0x47, 0x15, 0x0f, 0xb2, 0xa0, 0xc4, 0x02, 0xde, + 0x01, 0x0c, 0x59, 0xc2, 0x02, 0xa0, 0x01, 0x0c, 0x50, 0x9b, 0x01, 0x0a, + 0x21, 0x8e, 
0x01, 0x0a, 0x11, 0x89, 0x01, 0x0a, 0x08, 0xd2, 0x49, 0xe5, + 0x0f, 0xb2, 0xb9, 0xd2, 0x47, 0x15, 0x0f, 0xb2, 0xa8, 0xc4, 0x00, 0x49, + 0x01, 0x34, 0xf9, 0xc5, 0x00, 0x2c, 0x01, 0x34, 0xf0, 0xc5, 0x00, 0x2c, + 0x0f, 0xaf, 0x39, 0xc4, 0x00, 0x49, 0x0f, 0xaf, 0x31, 0xc5, 0x05, 0x02, + 0x0f, 0xaf, 0x29, 0xc5, 0x00, 0xd4, 0x0f, 0xaf, 0x20, 0x4b, 0x03, 0x87, + 0xc2, 0x4a, 0x2a, 0xdf, 0x0d, 0x7c, 0x01, 0x5c, 0xc0, 0xe0, 0x0b, 0xe7, + 0x01, 0x5c, 0xc8, 0xe0, 0x07, 0xe7, 0x01, 0x3d, 0x18, 0xe0, 0x03, 0xc7, + 0x01, 0x5c, 0xd8, 0xc6, 0x13, 0x52, 0x0f, 0xbd, 0x41, 0xc4, 0x40, 0x89, + 0x01, 0x00, 0x48, 0xc5, 0xd6, 0x91, 0x00, 0x3d, 0x19, 0xc8, 0xb8, 0x1a, + 0x00, 0x3c, 0x79, 0xc4, 0xd8, 0x3b, 0x00, 0x3c, 0x70, 0x91, 0x00, 0x3d, + 0x01, 0xc7, 0xb4, 0xdb, 0x00, 0x3c, 0x99, 0xc3, 0x39, 0x6e, 0x00, 0x3c, + 0x63, 0x02, 0x4a, 0x36, 0xc3, 0x04, 0xc5, 0x00, 0x3c, 0xc0, 0x03, 0xc2, + 0x4a, 0x3c, 0xc5, 0xd7, 0x22, 0x00, 0x3c, 0x58, 0xc5, 0xd9, 0x20, 0x00, + 0x3c, 0xf1, 0x0a, 0xc2, 0x4a, 0x48, 0xc4, 0xe2, 0xd7, 0x00, 0x3c, 0x80, + 0xc3, 0x39, 0x6e, 0x00, 0x3c, 0xc9, 0xc2, 0x04, 0xc6, 0x00, 0x3c, 0x00, + 0x03, 0xc2, 0x4a, 0x54, 0x91, 0x00, 0x3d, 0x08, 0xc4, 0xe1, 0xff, 0x00, + 0x3c, 0x69, 0xc8, 0xb4, 0xda, 0x00, 0x3c, 0x28, 0xc4, 0xe1, 0x03, 0x00, + 0x3c, 0x39, 0xc3, 0x16, 0xc3, 0x00, 0x3d, 0x10, 0xc4, 0xd8, 0x3b, 0x00, + 0x3c, 0x31, 0xc3, 0x39, 0x6e, 0x00, 0x3c, 0xd0, 0xc4, 0x2b, 0xa7, 0x00, + 0x3c, 0x11, 0xc2, 0x04, 0xc6, 0x00, 0x3d, 0x88, 0x0d, 0xc2, 0x4a, 0x5e, + 0x10, 0xc2, 0x4a, 0x6a, 0x46, 0xcc, 0x6b, 0xc2, 0x4a, 0x7c, 0x15, 0xc2, + 0x4a, 0x91, 0x1b, 0xc2, 0x4a, 0x9d, 0x43, 0x5d, 0x85, 0xc2, 0x4a, 0xa9, + 0x16, 0xc2, 0x4a, 0xb5, 0xc9, 0xb4, 0x0a, 0x00, 0x70, 0xd1, 0x12, 0xc2, + 0x4a, 0xbf, 0x42, 0x01, 0x03, 0xc2, 0x4a, 0xcf, 0x0f, 0xc2, 0x4a, 0xde, + 0x14, 0xc2, 0x4a, 0xea, 0x0e, 0xc2, 0x4a, 0xf4, 0xc7, 0xc2, 0x5e, 0x00, + 0x71, 0x39, 0x43, 0x60, 0xe8, 0xc2, 0x4b, 0x04, 0xc5, 0xd9, 0xd9, 0x00, + 0x71, 0x69, 0xca, 0x9e, 0xbe, 0x00, 0x72, 0xd0, 0xc2, 0x02, 0xa0, 0x00, + 0x72, 0x91, 0xc4, 0x02, 0xde, 0x00, 0x72, 0x98, 0xc3, 0x09, 0x9e, 0x00, + 0x72, 0xa1, 0xc3, 0x0d, 0x14, 0x00, 0x72, 0xa8, 0xc2, 0x22, 0xcc, 0x00, + 0x72, 0xb1, 0xc4, 0x18, 0x10, 0x00, 0x72, 0xb8, 0x87, 0x0f, 0x15, 0x58, + 0x47, 0xc2, 0xe3, 0xc2, 0x4b, 0x10, 0x83, 0x0f, 0x14, 0x88, 0x91, 0x0f, + 0x15, 0x40, 0x97, 0x0f, 0x15, 0x18, 0xc2, 0x01, 0x30, 0x0f, 0x14, 0xc1, + 0x83, 0x0f, 0x14, 0xb8, 0xd0, 0x59, 0x72, 0x01, 0x4e, 0x69, 0xc8, 0x52, + 0x09, 0x01, 0x4e, 0x59, 0xc9, 0x16, 0x14, 0x01, 0x4e, 0x51, 0xcf, 0x13, + 0x5e, 0x0f, 0xb6, 0x30, 0xc4, 0x55, 0x73, 0x0e, 0x9a, 0x49, 0xc9, 0xaf, + 0x15, 0x0e, 0x99, 0xe0, 0xc5, 0xba, 0x65, 0x0e, 0x9a, 0x91, 0xc5, 0x08, + 0xe6, 0x0e, 0x9a, 0x70, 0xc6, 0xd0, 0x55, 0x0e, 0x99, 0xc1, 0x16, 0x42, + 0x4b, 0x24, 0xc7, 0xc0, 0x58, 0x0e, 0x99, 0xe9, 0xc4, 0x1d, 0xa8, 0x0e, + 0x99, 0x30, 0xc5, 0xd7, 0x63, 0x0e, 0x9a, 0x61, 0xc2, 0x00, 0x5f, 0x0e, + 0x99, 0x88, 0xc5, 0xd7, 0x7c, 0x0e, 0x99, 0x71, 0x0b, 0x42, 0x4b, 0x36, + 0xc5, 0x7c, 0xec, 0x01, 0x18, 0xa9, 0xc5, 0x36, 0xc0, 0x0f, 0xa6, 0xf2, + 0x02, 0x4b, 0x42, 0x49, 0x29, 0x29, 0xc2, 0x4b, 0x48, 0xca, 0x1e, 0x8a, + 0x00, 0x60, 0x08, 0xc7, 0x14, 0x39, 0x00, 0x60, 0x11, 0xc7, 0x7a, 0x7f, + 0x00, 0x61, 0xe8, 0xc5, 0x40, 0xe7, 0x00, 0x60, 0x19, 0xc4, 0x1e, 0x97, + 0x00, 0x62, 0x68, 0x83, 0x00, 0x60, 0x2b, 0x02, 0x4b, 0x54, 0x8b, 0x00, + 0x60, 0x3b, 0x02, 0x4b, 0x60, 0x97, 0x00, 0x60, 0x4b, 0x02, 0x4b, 0x64, + 0x18, 0xc2, 0x4b, 0x68, 0x87, 0x00, 0x60, 0x73, 0x02, 0x4b, 0x72, 0x91, + 0x00, 0x60, 0x93, 0x02, 0x4b, 0x76, 0x0d, 0xc2, 0x4b, 0x7a, 0x09, 0xc2, + 0x4b, 0x84, 
0x10, 0xc2, 0x4b, 0x8e, 0x05, 0xc2, 0x4b, 0xa7, 0x0c, 0xc2, + 0x4b, 0xb1, 0x16, 0xc2, 0x4b, 0xbb, 0x06, 0xc2, 0x4b, 0xcf, 0x12, 0xc2, + 0x4b, 0xe3, 0x04, 0xc2, 0x4b, 0xed, 0xc2, 0x01, 0xc3, 0x00, 0x61, 0x71, + 0xc2, 0x19, 0x2c, 0x00, 0x61, 0x79, 0x14, 0xc2, 0x4b, 0xf7, 0x0e, 0xc2, + 0x4b, 0xff, 0x15, 0xc2, 0x4c, 0x07, 0xc2, 0x00, 0xd0, 0x00, 0x61, 0xc8, + 0x83, 0x00, 0x61, 0xf1, 0x8b, 0x00, 0x62, 0x41, 0x97, 0x00, 0x62, 0x60, + 0x8b, 0x00, 0x62, 0x00, 0x97, 0x00, 0x62, 0x10, 0x94, 0x00, 0x62, 0x1b, + 0x02, 0x4c, 0x17, 0x8e, 0x00, 0x63, 0x12, 0x02, 0x4c, 0x1b, 0x87, 0x00, + 0x62, 0x38, 0x91, 0x00, 0x62, 0x58, 0xc2, 0x02, 0xa0, 0x00, 0x63, 0x41, + 0xc4, 0x02, 0xde, 0x00, 0x63, 0x48, 0xc3, 0x09, 0x9e, 0x00, 0x63, 0x51, + 0xc3, 0x0d, 0x14, 0x00, 0x63, 0x58, 0xc2, 0x22, 0xcc, 0x00, 0x63, 0x61, + 0xc4, 0x18, 0x10, 0x00, 0x63, 0x68, 0xd2, 0x15, 0xf0, 0x00, 0x63, 0xc9, + 0xd3, 0x45, 0xbf, 0x00, 0x63, 0xe0, 0x47, 0xc3, 0x99, 0xc2, 0x4c, 0x1f, + 0x49, 0xaa, 0x8c, 0x42, 0x4c, 0x2b, 0x46, 0x00, 0xd4, 0xc2, 0x4c, 0x37, + 0x45, 0x00, 0x8c, 0x42, 0x4c, 0x43, 0xc5, 0x00, 0xd4, 0x01, 0x70, 0xf1, + 0xc5, 0x05, 0x02, 0x01, 0x70, 0xf8, 0xc4, 0x18, 0x10, 0x08, 0xa6, 0xb9, + 0xc2, 0x22, 0xcc, 0x08, 0xa6, 0xb0, 0xc3, 0x0d, 0x14, 0x08, 0xa6, 0xa9, + 0xc3, 0x09, 0x9e, 0x08, 0xa6, 0xa0, 0xc4, 0x02, 0xde, 0x08, 0xa6, 0x99, + 0xc2, 0x02, 0xa0, 0x08, 0xa6, 0x90, 0xc7, 0x7a, 0x7f, 0x08, 0xa6, 0x21, + 0xc7, 0x14, 0x39, 0x08, 0xa6, 0x00, 0xc5, 0x40, 0xe7, 0x08, 0xa6, 0x09, + 0xc4, 0x1e, 0x97, 0x08, 0xa6, 0x10, 0x97, 0x08, 0xa5, 0xf1, 0x8b, 0x08, + 0xa5, 0xd9, 0x83, 0x08, 0xa5, 0x80, 0x91, 0x08, 0xa5, 0xe9, 0x87, 0x08, + 0xa5, 0xd0, 0x8e, 0x08, 0xa5, 0xbb, 0x02, 0x4c, 0x4f, 0x94, 0x08, 0xa5, + 0xaa, 0x02, 0x4c, 0x53, 0x97, 0x08, 0xa5, 0xa0, 0x8b, 0x08, 0xa5, 0x90, + 0x83, 0x08, 0xa5, 0x71, 0xc2, 0x0d, 0xf6, 0x08, 0xa5, 0x69, 0xc2, 0x00, + 0xd0, 0x08, 0xa5, 0x60, 0x83, 0x08, 0xa5, 0x59, 0x47, 0xb2, 0x2e, 0x42, + 0x4c, 0x57, 0xc2, 0x00, 0xd0, 0x08, 0xa5, 0x31, 0x83, 0x08, 0xa5, 0x28, + 0xc2, 0x00, 0xd0, 0x08, 0xa5, 0x21, 0x83, 0x08, 0xa5, 0x18, 0x83, 0x08, + 0xa5, 0x11, 0xc2, 0x00, 0xc1, 0x08, 0xa4, 0xe9, 0xc2, 0x19, 0x2c, 0x08, + 0xa4, 0xc1, 0xc2, 0x01, 0x30, 0x08, 0xa4, 0x98, 0xc2, 0x00, 0xd0, 0x08, + 0xa5, 0x09, 0x83, 0x08, 0xa5, 0x01, 0x06, 0x42, 0x4c, 0x65, 0xc2, 0x00, + 0xd0, 0x08, 0xa4, 0xf9, 0x83, 0x08, 0xa4, 0xf1, 0x16, 0x42, 0x4c, 0x6f, + 0xc2, 0x00, 0xd0, 0x08, 0xa4, 0xb9, 0x83, 0x08, 0xa4, 0xb0, 0xc2, 0x00, + 0xd0, 0x08, 0xa4, 0xa9, 0x83, 0x08, 0xa4, 0xa0, 0xc2, 0x00, 0xd0, 0x08, + 0xa4, 0x91, 0x83, 0x08, 0xa4, 0x88, 0xc2, 0x00, 0xd0, 0x08, 0xa4, 0x81, + 0x83, 0x08, 0xa4, 0x78, 0x97, 0x08, 0xa4, 0x71, 0x8b, 0x08, 0xa4, 0x61, + 0x83, 0x08, 0xa4, 0x10, 0x97, 0x08, 0xa4, 0x30, 0x8b, 0x08, 0xa4, 0x20, + 0xc7, 0xc2, 0xa4, 0x00, 0x7e, 0x21, 0xc7, 0xc4, 0xfe, 0x00, 0x7e, 0x2b, + 0x02, 0x4c, 0x79, 0x12, 0xc2, 0x4c, 0x7f, 0xc6, 0xcc, 0x47, 0x00, 0x7e, + 0x4a, 0x02, 0x4c, 0x8b, 0x44, 0xa9, 0xbe, 0xc2, 0x4c, 0x8f, 0xcd, 0x75, + 0xf4, 0x00, 0x7b, 0xf1, 0xc8, 0x85, 0x06, 0x00, 0x7b, 0xf8, 0xc7, 0xbe, + 0xe3, 0x00, 0x79, 0xf1, 0xc8, 0xb8, 0xd2, 0x00, 0x7c, 0x38, 0xc8, 0xbe, + 0xe2, 0x00, 0x79, 0xf9, 0xc7, 0x4f, 0xa6, 0x00, 0x7c, 0x48, 0xc7, 0xc1, + 0x3f, 0x00, 0x7c, 0x31, 0xc9, 0x8e, 0x8e, 0x00, 0x7c, 0x40, 0xcb, 0x95, + 0xda, 0x00, 0x7c, 0x51, 0xcb, 0x99, 0x08, 0x00, 0x7c, 0x58, 0xcb, 0x8e, + 0x8c, 0x00, 0x7c, 0x69, 0xc8, 0x4f, 0xa5, 0x00, 0x7c, 0x71, 0xd1, 0x4f, + 0x9c, 0x00, 0x7c, 0x78, 0x0d, 0xc2, 0x4c, 0x9b, 0x09, 0xc2, 0x4c, 0xab, + 0x10, 0xc2, 0x4c, 0xb5, 0x05, 0xc2, 0x4c, 0xcb, 0xc2, 0x25, 0x3b, 0x00, + 0x7c, 0xb9, 
0x16, 0xc2, 0x4c, 0xd5, 0x06, 0xc2, 0x4c, 0xe7, 0x12, 0xc2, + 0x4c, 0xf9, 0x04, 0xc2, 0x4d, 0x03, 0xc2, 0x01, 0xc3, 0x00, 0x7d, 0x41, + 0xc2, 0x01, 0x4a, 0x00, 0x7d, 0x69, 0x1c, 0xc2, 0x4d, 0x0d, 0xc2, 0x00, + 0x02, 0x00, 0x7d, 0x81, 0xc2, 0x19, 0x2c, 0x00, 0x7d, 0x89, 0xc2, 0x00, + 0x39, 0x00, 0x7d, 0x91, 0xc2, 0x00, 0xdb, 0x00, 0x7d, 0x99, 0x15, 0xc2, + 0x4d, 0x17, 0xc2, 0x00, 0xd0, 0x00, 0x7d, 0xb9, 0x83, 0x00, 0x7d, 0xc1, + 0x4b, 0x7f, 0xe8, 0x42, 0x4d, 0x27, 0x48, 0x16, 0x5f, 0xc2, 0x4d, 0x39, + 0xc5, 0x32, 0x89, 0x00, 0x78, 0xa0, 0xc2, 0x00, 0x45, 0x00, 0x79, 0xd1, + 0xc2, 0x02, 0x2c, 0x00, 0x79, 0xd8, 0xcf, 0x16, 0x5f, 0x00, 0x78, 0x21, + 0xdb, 0x16, 0x53, 0x00, 0x7e, 0x98, 0xcf, 0x16, 0x7a, 0x00, 0x78, 0x29, + 0xdb, 0x16, 0x6e, 0x00, 0x7e, 0xa0, 0xd4, 0x3f, 0x48, 0x00, 0x78, 0x31, + 0x4c, 0x82, 0xad, 0x42, 0x4d, 0x45, 0x0d, 0xc2, 0x4d, 0x51, 0xc9, 0xb5, + 0x0f, 0x00, 0x79, 0xa0, 0xc7, 0x16, 0x5f, 0x00, 0x78, 0x51, 0xcc, 0x2e, + 0x06, 0x00, 0x7e, 0x80, 0xc4, 0x01, 0xe2, 0x00, 0x78, 0x71, 0xc5, 0x32, + 0x89, 0x00, 0x7e, 0x92, 0x02, 0x4d, 0x5d, 0xc7, 0x70, 0x50, 0x00, 0x79, + 0xa9, 0xca, 0xa3, 0xe6, 0x00, 0x79, 0xb8, 0xc8, 0x32, 0x8b, 0x00, 0x78, + 0x79, 0xc7, 0xc1, 0x70, 0x00, 0x79, 0xc8, 0x83, 0x00, 0x7a, 0x01, 0xc2, + 0x00, 0xd0, 0x00, 0x7a, 0x09, 0xc3, 0x1d, 0x35, 0x00, 0x7b, 0x49, 0xc2, + 0x02, 0x2b, 0x00, 0x7b, 0x58, 0x83, 0x00, 0x7a, 0x11, 0xc2, 0x00, 0xd0, + 0x00, 0x7a, 0x18, 0xc2, 0x01, 0x30, 0x00, 0x7a, 0x21, 0xc2, 0x19, 0x2c, + 0x00, 0x7a, 0x49, 0xc2, 0x00, 0xc1, 0x00, 0x7a, 0x71, 0x83, 0x00, 0x7a, + 0x98, 0x83, 0x00, 0x7a, 0x29, 0xc2, 0x00, 0xd0, 0x00, 0x7a, 0x30, 0x16, + 0xc2, 0x4d, 0x63, 0x83, 0x00, 0x7a, 0x79, 0xc2, 0x00, 0xd0, 0x00, 0x7a, + 0x81, 0x15, 0x42, 0x4d, 0x6d, 0x06, 0xc2, 0x4d, 0x77, 0x83, 0x00, 0x7a, + 0x89, 0xc2, 0x00, 0xd0, 0x00, 0x7a, 0x91, 0x1c, 0x42, 0x4d, 0x81, 0x83, + 0x00, 0x7a, 0xa1, 0xc2, 0x00, 0xd0, 0x00, 0x7a, 0xa8, 0x83, 0x00, 0x7a, + 0xb1, 0xc2, 0x00, 0xd0, 0x00, 0x7a, 0xb8, 0xc2, 0x00, 0xd0, 0x00, 0x7a, + 0xf1, 0x83, 0x00, 0x7a, 0xf8, 0x83, 0x00, 0x7b, 0x11, 0xc2, 0x00, 0x39, + 0x00, 0x7b, 0x60, 0xc2, 0x00, 0xd0, 0x00, 0x7b, 0x21, 0xc2, 0x0d, 0xf6, + 0x00, 0x7b, 0x29, 0x83, 0x00, 0x7b, 0x30, 0xc2, 0x02, 0xa0, 0x00, 0x79, + 0x59, 0xc4, 0x02, 0xde, 0x00, 0x79, 0x60, 0xc3, 0x09, 0x9e, 0x00, 0x79, + 0x69, 0xc3, 0x0d, 0x14, 0x00, 0x79, 0x70, 0xc2, 0x22, 0xcc, 0x00, 0x79, + 0x79, 0xc4, 0x18, 0x10, 0x00, 0x79, 0x80, 0x94, 0x00, 0x7b, 0xb8, 0x8e, + 0x00, 0x7b, 0xc8, 0x84, 0x01, 0x69, 0x8b, 0x02, 0x4d, 0x8b, 0x89, 0x01, + 0x69, 0x9b, 0x02, 0x4d, 0x8f, 0x8c, 0x01, 0x69, 0xb1, 0x86, 0x01, 0x69, + 0xbb, 0x02, 0x4d, 0x96, 0x88, 0x01, 0x69, 0xe1, 0x8d, 0x01, 0x69, 0xeb, + 0x02, 0x4d, 0xa1, 0x8a, 0x01, 0x6a, 0x03, 0x02, 0x4d, 0xa8, 0x83, 0x01, + 0x6a, 0x21, 0x93, 0x01, 0x6a, 0x39, 0x9c, 0x01, 0x6b, 0x1b, 0x02, 0x4d, + 0xac, 0x8e, 0x01, 0x6a, 0x69, 0x8f, 0x01, 0x6a, 0x71, 0x90, 0x01, 0x6a, + 0x79, 0x92, 0x01, 0x6a, 0x91, 0x94, 0x01, 0x6a, 0xa3, 0x02, 0x4d, 0xb4, + 0x95, 0x01, 0x6a, 0xcb, 0x02, 0x4d, 0xb8, 0x96, 0x01, 0x6a, 0xe3, 0x02, + 0x4d, 0xc0, 0xc2, 0x11, 0xee, 0x01, 0x6a, 0xf1, 0x98, 0x01, 0x6b, 0x01, + 0x99, 0x01, 0x6b, 0x09, 0x9b, 0x01, 0x6b, 0x10, 0x9b, 0x01, 0x69, 0xd8, + 0x8d, 0x01, 0x69, 0xf3, 0x02, 0x4d, 0xc8, 0x8a, 0x01, 0x6a, 0x11, 0x93, + 0x01, 0x6a, 0x41, 0xc2, 0x25, 0xa1, 0x01, 0x6a, 0x61, 0x09, 0xc2, 0x4d, + 0xcc, 0xc2, 0x00, 0x75, 0x01, 0x6a, 0x88, 0xcb, 0x05, 0x1c, 0x01, 0x02, + 0xd1, 0xc6, 0x72, 0x26, 0x01, 0x01, 0x28, 0x0c, 0xc2, 0x4d, 0xd4, 0x0a, + 0xc2, 0x4d, 0xe0, 0x15, 0xc2, 0x4d, 0xec, 0x4b, 0x92, 0x75, 0xc2, 0x4e, + 0x00, 0x03, 
0xc2, 0x4e, 0x18, 0x16, 0xc2, 0x4e, 0x2e, 0x49, 0xab, 0xf4, + 0xc2, 0x4e, 0x3c, 0x4a, 0x60, 0x7b, 0xc2, 0x4e, 0x70, 0x0d, 0xc2, 0x4e, + 0xa4, 0x49, 0x0d, 0xff, 0xc2, 0x4e, 0xb0, 0x13, 0xc2, 0x4e, 0xd2, 0x49, + 0xb1, 0x0d, 0xc2, 0x4e, 0xdc, 0x04, 0xc2, 0x4f, 0x00, 0x14, 0xc2, 0x4f, + 0x0c, 0x0f, 0xc2, 0x4f, 0x16, 0x4e, 0x74, 0x6a, 0xc2, 0x4f, 0x22, 0x49, + 0xb2, 0x00, 0xc2, 0x4f, 0x2c, 0x56, 0x2b, 0xaa, 0xc2, 0x4f, 0x56, 0xd6, + 0x30, 0xd2, 0x07, 0xef, 0xc0, 0x4d, 0x7f, 0x8d, 0xc2, 0x4f, 0x5c, 0x45, + 0x02, 0x10, 0x42, 0x4f, 0x68, 0x4a, 0x9a, 0xea, 0xc2, 0x4f, 0xe9, 0xcc, + 0x27, 0x7f, 0x00, 0x46, 0x88, 0xd4, 0x39, 0xf8, 0x00, 0x47, 0xf9, 0xcb, + 0x3a, 0x01, 0x00, 0x32, 0xc0, 0xc7, 0xc3, 0xca, 0x00, 0x44, 0xe1, 0xc7, + 0x2b, 0x4a, 0x00, 0x32, 0x98, 0x06, 0xc2, 0x4f, 0xfb, 0x03, 0xc2, 0x50, + 0x03, 0xc3, 0x85, 0xf5, 0x0f, 0x70, 0x09, 0xc4, 0x30, 0xc1, 0x0f, 0x70, + 0x11, 0xc3, 0x7e, 0x89, 0x0f, 0x70, 0x29, 0x42, 0x02, 0x1c, 0xc2, 0x50, + 0x0f, 0xc3, 0x14, 0x4b, 0x0f, 0x70, 0x39, 0x16, 0xc2, 0x50, 0x19, 0xc3, + 0x2b, 0xb9, 0x0f, 0x70, 0x49, 0x0d, 0xc2, 0x50, 0x27, 0x0e, 0xc2, 0x50, + 0x33, 0xc4, 0x19, 0x60, 0x0f, 0x70, 0x61, 0xc4, 0x3a, 0x01, 0x0f, 0x70, + 0x69, 0x15, 0xc2, 0x50, 0x3f, 0xc3, 0x0f, 0x9a, 0x0f, 0x70, 0x91, 0xc3, + 0x72, 0xf0, 0x0f, 0x70, 0x99, 0x48, 0x10, 0xb4, 0xc2, 0x50, 0x57, 0x49, + 0x18, 0x67, 0xc2, 0x50, 0xa9, 0xc3, 0xb1, 0x0d, 0x0f, 0x70, 0x81, 0xc5, + 0x92, 0x75, 0x0f, 0x70, 0xd8, 0xc3, 0x0a, 0x8c, 0x00, 0x32, 0x7b, 0x02, + 0x50, 0xb5, 0xcc, 0x85, 0x29, 0x00, 0x30, 0x68, 0xd6, 0x2f, 0x9e, 0x00, + 0x47, 0xdb, 0x02, 0x50, 0xc2, 0xc7, 0xc0, 0x51, 0x00, 0x44, 0xf0, 0xc5, + 0x00, 0xd4, 0x00, 0x47, 0xc3, 0x02, 0x50, 0xc8, 0xc5, 0x05, 0x02, 0x00, + 0x47, 0xd0, 0xce, 0x71, 0x14, 0x00, 0x44, 0x41, 0x9b, 0x00, 0x30, 0x40, + 0xe0, 0x08, 0xc7, 0x00, 0x37, 0x60, 0xce, 0x6d, 0xe8, 0x00, 0x47, 0xb1, + 0xcd, 0x00, 0xfa, 0x07, 0xf3, 0xd1, 0xcb, 0x64, 0x7b, 0x07, 0xf3, 0xd8, + 0xce, 0x00, 0xf9, 0x07, 0xf3, 0xa0, 0x00, 0xc2, 0x50, 0xce, 0xc3, 0x13, + 0x00, 0x00, 0x32, 0x5a, 0x02, 0x50, 0xe0, 0x45, 0x08, 0xcb, 0xc2, 0x50, + 0xe6, 0x44, 0x05, 0x36, 0xc2, 0x51, 0x3a, 0x42, 0x00, 0x87, 0xc2, 0x51, + 0x50, 0xc3, 0x2b, 0xb9, 0x00, 0x37, 0x31, 0xc3, 0x7e, 0x89, 0x00, 0x37, + 0x29, 0xc5, 0x4d, 0x40, 0x00, 0x30, 0xd1, 0xc5, 0x52, 0x4a, 0x00, 0x30, + 0xc8, 0xc3, 0x2d, 0x2c, 0x00, 0x32, 0x93, 0x02, 0x51, 0x5c, 0xd8, 0x22, + 0xeb, 0x00, 0x44, 0xe9, 0xcc, 0x86, 0x9d, 0x00, 0x32, 0xb0, 0x4a, 0xa3, + 0xf0, 0xc2, 0x51, 0x60, 0xc4, 0x00, 0x9d, 0x07, 0xdd, 0xf9, 0x16, 0xc2, + 0x51, 0x6c, 0x42, 0x00, 0x58, 0xc2, 0x51, 0x78, 0x4a, 0x3b, 0x79, 0xc2, + 0x51, 0x84, 0xcb, 0x8f, 0x7e, 0x07, 0xde, 0x10, 0x15, 0xc2, 0x51, 0x90, + 0xc9, 0xac, 0x0f, 0x00, 0x30, 0xa1, 0x42, 0x00, 0x39, 0xc2, 0x51, 0x9a, + 0xcf, 0x6b, 0x70, 0x00, 0x30, 0x89, 0xc5, 0xda, 0xc4, 0x00, 0x30, 0x78, + 0x00, 0x42, 0x51, 0xa6, 0x45, 0xd9, 0x57, 0xc2, 0x51, 0xb2, 0x49, 0x04, + 0xf9, 0xc2, 0x51, 0xbe, 0x48, 0x05, 0x14, 0x42, 0x51, 0xca, 0xc5, 0x19, + 0x75, 0x00, 0x32, 0x03, 0x02, 0x51, 0xd6, 0xcb, 0x92, 0xee, 0x07, 0xf3, + 0x98, 0xc5, 0x4d, 0x40, 0x00, 0x47, 0x33, 0x02, 0x51, 0xdc, 0xc5, 0x52, + 0x4a, 0x00, 0x47, 0x2b, 0x02, 0x51, 0xe2, 0xc5, 0x63, 0x73, 0x00, 0x47, + 0x22, 0x02, 0x51, 0xe8, 0xc5, 0x00, 0xd4, 0x00, 0x32, 0xa1, 0xc5, 0x05, + 0x02, 0x00, 0x32, 0xa8, 0xce, 0x74, 0x5c, 0x00, 0x44, 0x81, 0xcf, 0x65, + 0xee, 0x00, 0x30, 0x70, 0xc9, 0x0e, 0x6e, 0x00, 0x32, 0xe1, 0xd6, 0x31, + 0x6c, 0x00, 0x32, 0xd9, 0xcd, 0x31, 0x75, 0x00, 0x32, 0xd0, 0xc9, 0x08, + 0xcb, 0x00, 0x37, 0x59, 0xc8, 0xb9, 0x12, 0x00, 0x37, 0x50, 0xc4, 0x44, + 0x78, 0x00, 
0x36, 0xe9, 0xc9, 0x5c, 0xe9, 0x00, 0x30, 0xe8, 0xc4, 0x18, + 0x10, 0x00, 0x33, 0x39, 0xc2, 0x22, 0xcc, 0x00, 0x33, 0x30, 0xc3, 0x0d, + 0x14, 0x00, 0x33, 0x29, 0xc3, 0x09, 0x9e, 0x00, 0x33, 0x20, 0xc4, 0x02, + 0xde, 0x00, 0x33, 0x19, 0xc2, 0x02, 0xa0, 0x00, 0x33, 0x10, 0xc3, 0xe6, + 0x1a, 0x07, 0xd8, 0xb9, 0xc3, 0x03, 0x0d, 0x07, 0xd8, 0xa9, 0xc3, 0x5f, + 0x44, 0x07, 0xd8, 0xa1, 0xc3, 0x2a, 0x91, 0x07, 0xd8, 0x98, 0xcc, 0x23, + 0x3f, 0x00, 0x2c, 0x41, 0xc2, 0x01, 0x48, 0x00, 0x2c, 0x10, 0x8a, 0x00, + 0x2c, 0x21, 0x90, 0x00, 0x2b, 0x78, 0xc3, 0xe5, 0xc0, 0x00, 0x2c, 0x19, + 0xc2, 0x16, 0x1c, 0x00, 0x2b, 0xd0, 0x91, 0x00, 0x2c, 0x09, 0x0a, 0xc2, + 0x51, 0xee, 0x83, 0x00, 0x2b, 0x70, 0xc2, 0x16, 0x1c, 0x00, 0x2c, 0x01, + 0x83, 0x00, 0x2b, 0xe0, 0xc3, 0xb8, 0x27, 0x00, 0x2b, 0xf9, 0x91, 0x00, + 0x2b, 0x49, 0xc9, 0xb0, 0x47, 0x00, 0x2b, 0x00, 0xc2, 0x04, 0xe6, 0x00, + 0x2b, 0xf1, 0x91, 0x00, 0x2b, 0xc0, 0xc2, 0x16, 0x1c, 0x00, 0x2b, 0xe9, + 0xc2, 0x00, 0xd0, 0x00, 0x2b, 0xb8, 0xc3, 0x64, 0x77, 0x00, 0x2b, 0xd9, + 0x83, 0x00, 0x2b, 0x88, 0xc3, 0x01, 0xe3, 0x00, 0x2b, 0x91, 0xc2, 0x03, + 0x4e, 0x00, 0x2b, 0x18, 0xc2, 0x01, 0x7f, 0x00, 0x2b, 0x51, 0x83, 0x00, + 0x2b, 0x30, 0x96, 0x00, 0x2b, 0x41, 0x8a, 0x00, 0x2b, 0x39, 0xc2, 0x11, + 0xee, 0x00, 0x2b, 0x28, 0x8a, 0x00, 0x2a, 0xa1, 0x90, 0x00, 0x29, 0xf8, + 0xc3, 0xe5, 0xc0, 0x00, 0x2a, 0x99, 0xc2, 0x16, 0x1c, 0x00, 0x2a, 0x50, + 0xc2, 0x01, 0x48, 0x00, 0x2a, 0x90, 0x91, 0x00, 0x2a, 0x89, 0x0a, 0xc2, + 0x51, 0xf8, 0x83, 0x00, 0x29, 0xf0, 0xc2, 0x16, 0x1c, 0x00, 0x2a, 0x81, + 0x83, 0x00, 0x2a, 0x60, 0xc3, 0xb8, 0x27, 0x00, 0x2a, 0x79, 0x91, 0x00, + 0x29, 0xc8, 0xc2, 0x04, 0xe6, 0x00, 0x2a, 0x71, 0x91, 0x00, 0x2a, 0x40, + 0xc2, 0x16, 0x1c, 0x00, 0x2a, 0x69, 0xc2, 0x00, 0xd0, 0x00, 0x2a, 0x38, + 0xc3, 0x64, 0x77, 0x00, 0x2a, 0x59, 0x83, 0x00, 0x2a, 0x08, 0xc3, 0x01, + 0xe3, 0x00, 0x2a, 0x11, 0xc2, 0x03, 0x4e, 0x00, 0x29, 0x98, 0xc2, 0x01, + 0x7f, 0x00, 0x29, 0xd1, 0x83, 0x00, 0x29, 0xb0, 0x96, 0x00, 0x29, 0xc1, + 0x8a, 0x00, 0x29, 0xb9, 0xc2, 0x11, 0xee, 0x00, 0x29, 0xa8, 0xc4, 0x14, + 0x74, 0x0f, 0x48, 0x09, 0xc2, 0x00, 0xd0, 0x0f, 0x48, 0x68, 0x83, 0x0f, + 0x48, 0x21, 0xc2, 0x01, 0x7f, 0x0f, 0x48, 0x38, 0xc9, 0xaf, 0x27, 0x0f, + 0x48, 0x29, 0xc2, 0x00, 0xd0, 0x0f, 0x49, 0x08, 0xc2, 0x01, 0x7f, 0x0f, + 0x48, 0x71, 0x83, 0x0f, 0x48, 0x90, 0xc2, 0x05, 0x1d, 0x0f, 0x48, 0x81, + 0xc2, 0x19, 0x2c, 0x0f, 0x48, 0xc9, 0xc2, 0x00, 0xd0, 0x0f, 0x48, 0xd8, + 0xc2, 0x0f, 0x9b, 0x0f, 0x48, 0x89, 0xc2, 0x00, 0xd0, 0x0f, 0x48, 0xe9, + 0xc2, 0x01, 0x53, 0x0f, 0x49, 0x00, 0x83, 0x0f, 0x48, 0xc1, 0xc2, 0x00, + 0x51, 0x0f, 0x48, 0xf0, 0x9f, 0x0f, 0xba, 0x19, 0xa0, 0x0f, 0xba, 0x20, + 0x02, 0x42, 0x52, 0x02, 0xc4, 0x1a, 0x05, 0x0f, 0xb8, 0xf1, 0xc6, 0x4c, + 0x49, 0x0f, 0xb9, 0x1a, 0x02, 0x52, 0x12, 0xc2, 0xe5, 0xfd, 0x0f, 0xbb, + 0x10, 0xc8, 0xb8, 0x5a, 0x0f, 0xba, 0xd0, 0x02, 0xc2, 0x52, 0x18, 0x44, + 0x00, 0x54, 0x42, 0x52, 0x24, 0xc2, 0xe5, 0xfd, 0x0f, 0xb9, 0xe0, 0xcc, + 0x8c, 0x85, 0x0f, 0xb9, 0x79, 0x02, 0x42, 0x52, 0x33, 0xc2, 0xe5, 0xfd, + 0x0f, 0xb8, 0xb8, 0x45, 0x3c, 0x54, 0xc2, 0x52, 0x3b, 0xc3, 0x00, 0x44, + 0x0f, 0xba, 0xf0, 0x44, 0x00, 0x54, 0x42, 0x52, 0x4d, 0xc2, 0xe5, 0xfd, + 0x0f, 0xba, 0xe8, 0xc5, 0xdd, 0x80, 0x0f, 0xb8, 0x43, 0x02, 0x52, 0x59, + 0xc5, 0xd7, 0x09, 0x0f, 0xb8, 0x32, 0x02, 0x52, 0x5f, 0xc2, 0xe5, 0xfd, + 0x0f, 0xb9, 0xb8, 0xa0, 0x0f, 0xb8, 0x91, 0x9f, 0x0f, 0xb8, 0x88, 0x9f, + 0x0a, 0x21, 0xd1, 0x9e, 0x0a, 0x21, 0xc9, 0x9d, 0x0a, 0x21, 0xc1, 0xa0, + 0x0a, 0x21, 0xd9, 0xa1, 0x0a, 0x21, 0xe1, 0xa2, 0x0a, 0x21, 0xe9, 0xa3, + 0x0a, 0x21, 
0xf1, 0xa4, 0x0a, 0x21, 0xf9, 0xa5, 0x0a, 0x22, 0x01, 0xa6, + 0x0a, 0x22, 0x08, 0xa6, 0x0a, 0x21, 0xb9, 0xa5, 0x0a, 0x21, 0xb1, 0xa4, + 0x0a, 0x21, 0xa9, 0xa3, 0x0a, 0x21, 0x93, 0x02, 0x52, 0x65, 0xa2, 0x0a, + 0x21, 0x83, 0x02, 0x52, 0x6d, 0xa1, 0x0a, 0x21, 0x79, 0xa0, 0x0a, 0x21, + 0x71, 0x9f, 0x0a, 0x21, 0x69, 0x9e, 0x0a, 0x21, 0x5b, 0x02, 0x52, 0x71, + 0x9d, 0x0a, 0x21, 0x50, 0xa6, 0x0a, 0x21, 0x43, 0x02, 0x52, 0x75, 0xa5, + 0x0a, 0x21, 0x39, 0xa4, 0x0a, 0x21, 0x31, 0xa3, 0x0a, 0x21, 0x29, 0xa2, + 0x0a, 0x21, 0x21, 0xa1, 0x0a, 0x21, 0x19, 0xa0, 0x0a, 0x21, 0x11, 0x9f, + 0x0a, 0x21, 0x09, 0x9e, 0x0a, 0x21, 0x01, 0x9d, 0x0a, 0x20, 0xf8, 0xa6, + 0x0a, 0x20, 0xf1, 0xa5, 0x0a, 0x20, 0xe9, 0xa4, 0x0a, 0x20, 0xe1, 0xa3, + 0x0a, 0x20, 0xd3, 0x02, 0x52, 0x79, 0xa2, 0x0a, 0x20, 0xc9, 0xa1, 0x0a, + 0x20, 0xc1, 0xa0, 0x0a, 0x20, 0xb9, 0x9f, 0x0a, 0x20, 0xb1, 0x9e, 0x0a, + 0x20, 0xa9, 0x9d, 0x0a, 0x20, 0xa0, 0xa6, 0x0a, 0x20, 0x99, 0xa5, 0x0a, + 0x20, 0x91, 0xa4, 0x0a, 0x20, 0x89, 0xa3, 0x0a, 0x20, 0x81, 0xa2, 0x0a, + 0x20, 0x79, 0xa1, 0x0a, 0x20, 0x71, 0xa0, 0x0a, 0x20, 0x69, 0x9f, 0x0a, + 0x20, 0x61, 0x9e, 0x0a, 0x20, 0x59, 0x9d, 0x0a, 0x20, 0x4a, 0x02, 0x52, + 0x7d, 0xa6, 0x0a, 0x20, 0x41, 0xa5, 0x0a, 0x20, 0x39, 0xa4, 0x0a, 0x20, + 0x31, 0xa3, 0x0a, 0x20, 0x29, 0xa2, 0x0a, 0x20, 0x21, 0xa1, 0x0a, 0x20, + 0x19, 0xa0, 0x0a, 0x20, 0x11, 0x9f, 0x0a, 0x20, 0x09, 0x9e, 0x0a, 0x20, + 0x00, 0x9d, 0x0a, 0x22, 0x11, 0x9e, 0x0a, 0x22, 0x19, 0x9f, 0x0a, 0x22, + 0x21, 0xa0, 0x0a, 0x22, 0x29, 0xa1, 0x0a, 0x22, 0x31, 0xa2, 0x0a, 0x22, + 0x39, 0xa3, 0x0a, 0x22, 0x43, 0x02, 0x52, 0x81, 0xa4, 0x0a, 0x22, 0x61, + 0xa5, 0x0a, 0x22, 0x69, 0xa6, 0x0a, 0x22, 0x70, 0x9d, 0x0a, 0x22, 0x79, + 0x9e, 0x0a, 0x22, 0x81, 0x9f, 0x0a, 0x22, 0x89, 0xa0, 0x0a, 0x22, 0x91, + 0xa1, 0x0a, 0x22, 0x99, 0xa2, 0x0a, 0x22, 0xa1, 0xa3, 0x0a, 0x22, 0xa9, + 0xa4, 0x0a, 0x22, 0xb1, 0xa5, 0x0a, 0x22, 0xb9, 0xa6, 0x0a, 0x22, 0xc0, + 0x9d, 0x0a, 0x22, 0xc9, 0x9e, 0x0a, 0x22, 0xd1, 0x9f, 0x0a, 0x22, 0xd9, + 0xa0, 0x0a, 0x22, 0xe1, 0xa1, 0x0a, 0x22, 0xe9, 0xa2, 0x0a, 0x22, 0xf1, + 0xa3, 0x0a, 0x22, 0xf9, 0xa4, 0x0a, 0x23, 0x01, 0xa5, 0x0a, 0x23, 0x09, + 0xa6, 0x0a, 0x23, 0x10, 0x9d, 0x0a, 0x23, 0x19, 0x9e, 0x0a, 0x23, 0x21, + 0x9f, 0x0a, 0x23, 0x29, 0xa0, 0x0a, 0x23, 0x31, 0xa1, 0x0a, 0x23, 0x39, + 0xa2, 0x0a, 0x23, 0x41, 0xa3, 0x0a, 0x23, 0x49, 0xa4, 0x0a, 0x23, 0x53, + 0x02, 0x52, 0x8d, 0xa5, 0x0a, 0x23, 0x63, 0x02, 0x52, 0x91, 0xa6, 0x0a, + 0x23, 0x70, 0x9d, 0x0a, 0x23, 0x7b, 0x02, 0x52, 0x95, 0x9e, 0x0a, 0x23, + 0x8b, 0x02, 0x52, 0x99, 0x9f, 0x0a, 0x23, 0x9b, 0x02, 0x52, 0x9d, 0xa0, + 0x0a, 0x23, 0xa9, 0xa1, 0x0a, 0x23, 0xb3, 0x02, 0x52, 0xa1, 0xa2, 0x0a, + 0x23, 0xd3, 0x02, 0x52, 0xad, 0xa3, 0x0a, 0x23, 0xe9, 0xa4, 0x0a, 0x23, + 0xf3, 0x02, 0x52, 0xb5, 0xa5, 0x0a, 0x24, 0x11, 0xa6, 0x0a, 0x24, 0x18, + 0x9d, 0x0a, 0x24, 0x23, 0x02, 0x52, 0xc1, 0x9e, 0x0a, 0x24, 0x39, 0x9f, + 0x0a, 0x24, 0x41, 0xa0, 0x0a, 0x24, 0x49, 0xa1, 0x0a, 0x24, 0x51, 0xa2, + 0x0a, 0x24, 0x5b, 0x02, 0x52, 0xc9, 0xa3, 0x0a, 0x24, 0x69, 0xa4, 0x0a, + 0x24, 0x71, 0xa5, 0x0a, 0x24, 0x79, 0xa6, 0x0a, 0x24, 0x80, 0x9d, 0x0a, + 0x24, 0x89, 0x9e, 0x0a, 0x24, 0x91, 0x9f, 0x0a, 0x24, 0x99, 0xa0, 0x0a, + 0x24, 0xa1, 0xa1, 0x0a, 0x24, 0xa9, 0xa2, 0x0a, 0x24, 0xb3, 0x02, 0x52, + 0xcd, 0xa3, 0x0a, 0x24, 0xc1, 0xa4, 0x0a, 0x24, 0xc9, 0xa5, 0x0a, 0x24, + 0xd1, 0xa6, 0x0a, 0x24, 0xd8, 0x9d, 0x0a, 0x24, 0xe1, 0x9e, 0x0a, 0x24, + 0xe9, 0x9f, 0x0a, 0x24, 0xf1, 0xa0, 0x0a, 0x24, 0xf9, 0xa1, 0x0a, 0x25, + 0x01, 0xa2, 0x0a, 0x25, 0x0b, 0x02, 0x52, 0xd1, 0xa3, 0x0a, 0x25, 0x19, + 0xa4, 0x0a, 
0x25, 0x21, 0xa5, 0x0a, 0x25, 0x29, 0xa6, 0x0a, 0x25, 0x30, + 0x9d, 0x0a, 0x25, 0x39, 0x9e, 0x0a, 0x25, 0x41, 0x9f, 0x0a, 0x25, 0x49, + 0xa0, 0x0a, 0x25, 0x51, 0xa1, 0x0a, 0x25, 0x59, 0xa2, 0x0a, 0x25, 0x61, + 0xa3, 0x0a, 0x25, 0x69, 0xa4, 0x0a, 0x25, 0x71, 0xa5, 0x0a, 0x25, 0x79, + 0xa6, 0x0a, 0x25, 0x80, 0x9d, 0x0a, 0x25, 0x89, 0x9e, 0x0a, 0x25, 0x91, + 0x9f, 0x0a, 0x25, 0x99, 0xa0, 0x0a, 0x25, 0xa1, 0xa1, 0x0a, 0x25, 0xa9, + 0xa2, 0x0a, 0x25, 0xb1, 0xa3, 0x0a, 0x25, 0xb9, 0xa4, 0x0a, 0x25, 0xc1, + 0xa5, 0x0a, 0x25, 0xc9, 0xa6, 0x0a, 0x25, 0xd0, 0x9d, 0x0a, 0x25, 0xd9, + 0x9e, 0x0a, 0x25, 0xe1, 0x9f, 0x0a, 0x25, 0xe9, 0xa0, 0x0a, 0x25, 0xf1, + 0xa1, 0x0a, 0x25, 0xf9, 0xa2, 0x0a, 0x26, 0x01, 0xa3, 0x0a, 0x26, 0x09, + 0xa4, 0x0a, 0x26, 0x11, 0xa5, 0x0a, 0x26, 0x19, 0xa6, 0x0a, 0x26, 0x20, + 0x9d, 0x0a, 0x26, 0x29, 0x9e, 0x0a, 0x26, 0x31, 0x9f, 0x0a, 0x26, 0x39, + 0xa0, 0x0a, 0x26, 0x41, 0xa1, 0x0a, 0x26, 0x49, 0xa2, 0x0a, 0x26, 0x51, + 0xa3, 0x0a, 0x26, 0x59, 0xa4, 0x0a, 0x26, 0x61, 0xa5, 0x0a, 0x26, 0x69, + 0xa6, 0x0a, 0x26, 0x70, 0x9d, 0x0a, 0x26, 0x79, 0x9e, 0x0a, 0x26, 0x81, + 0x9f, 0x0a, 0x26, 0x89, 0xa0, 0x0a, 0x26, 0x91, 0xa1, 0x0a, 0x26, 0x99, + 0xa2, 0x0a, 0x26, 0xa1, 0xa3, 0x0a, 0x26, 0xa9, 0xa4, 0x0a, 0x26, 0xb1, + 0xa5, 0x0a, 0x26, 0xb9, 0xa6, 0x0a, 0x26, 0xc0, 0x9d, 0x0a, 0x26, 0xc9, + 0x9e, 0x0a, 0x26, 0xd1, 0x9f, 0x0a, 0x26, 0xd9, 0xa0, 0x0a, 0x26, 0xe1, + 0xa1, 0x0a, 0x26, 0xe9, 0xa2, 0x0a, 0x26, 0xf1, 0xa3, 0x0a, 0x26, 0xf9, + 0xa4, 0x0a, 0x27, 0x01, 0xa5, 0x0a, 0x27, 0x09, 0xa6, 0x0a, 0x27, 0x10, + 0x9d, 0x0a, 0x27, 0x19, 0x9e, 0x0a, 0x27, 0x21, 0x9f, 0x0a, 0x27, 0x2b, + 0x02, 0x52, 0xd5, 0xa0, 0x0a, 0x27, 0x41, 0xa1, 0x0a, 0x27, 0x49, 0xa2, + 0x0a, 0x27, 0x51, 0xa3, 0x0a, 0x27, 0x59, 0xa4, 0x0a, 0x27, 0x63, 0x02, + 0x52, 0xdd, 0xa5, 0x0a, 0x27, 0x71, 0xa6, 0x0a, 0x27, 0x7a, 0x02, 0x52, + 0xe1, 0x9d, 0x0a, 0x27, 0x89, 0x9e, 0x0a, 0x27, 0x91, 0x9f, 0x0a, 0x27, + 0x99, 0xa0, 0x0a, 0x27, 0xa1, 0xa1, 0x0a, 0x27, 0xa9, 0xa2, 0x0a, 0x27, + 0xb3, 0x02, 0x52, 0xe5, 0xa3, 0x0a, 0x27, 0xc3, 0x02, 0x52, 0xe9, 0xa4, + 0x0a, 0x27, 0xd1, 0xa5, 0x0a, 0x27, 0xd9, 0xa6, 0x0a, 0x27, 0xe0, 0x9d, + 0x0a, 0x27, 0xe9, 0x9e, 0x0a, 0x27, 0xf1, 0x9f, 0x0a, 0x27, 0xf9, 0xa0, + 0x0a, 0x28, 0x01, 0xa1, 0x0a, 0x28, 0x09, 0xa2, 0x0a, 0x28, 0x11, 0xa3, + 0x0a, 0x28, 0x19, 0xa4, 0x0a, 0x28, 0x23, 0x02, 0x52, 0xed, 0xa5, 0x0a, + 0x28, 0x31, 0xa6, 0x0a, 0x28, 0x38, 0x9d, 0x0a, 0x28, 0x41, 0x9e, 0x0a, + 0x28, 0x49, 0x9f, 0x0a, 0x28, 0x51, 0xa0, 0x0a, 0x28, 0x59, 0xa1, 0x0a, + 0x28, 0x61, 0xa2, 0x0a, 0x28, 0x69, 0xa3, 0x0a, 0x28, 0x71, 0xa4, 0x0a, + 0x28, 0x79, 0xa5, 0x0a, 0x28, 0x81, 0xa6, 0x0a, 0x28, 0x88, 0x9d, 0x0a, + 0x28, 0x91, 0x9e, 0x0a, 0x28, 0x99, 0x9f, 0x0a, 0x28, 0xa1, 0xa0, 0x0a, + 0x28, 0xa9, 0xa1, 0x0a, 0x28, 0xb1, 0xa2, 0x0a, 0x28, 0xb9, 0xa3, 0x0a, + 0x28, 0xc1, 0xa4, 0x0a, 0x28, 0xc9, 0xa5, 0x0a, 0x28, 0xd1, 0xa6, 0x0a, + 0x28, 0xd8, 0x9d, 0x0a, 0x28, 0xe1, 0x9e, 0x0a, 0x28, 0xe9, 0x9f, 0x0a, + 0x28, 0xf1, 0xa0, 0x0a, 0x28, 0xf9, 0xa1, 0x0a, 0x29, 0x01, 0xa2, 0x0a, + 0x29, 0x09, 0xa3, 0x0a, 0x29, 0x11, 0xa4, 0x0a, 0x29, 0x19, 0xa5, 0x0a, + 0x29, 0x21, 0xa6, 0x0a, 0x29, 0x28, 0x9d, 0x0a, 0x29, 0x31, 0x9e, 0x0a, + 0x29, 0x39, 0x9f, 0x0a, 0x29, 0x41, 0xa0, 0x0a, 0x29, 0x49, 0xa1, 0x0a, + 0x29, 0x51, 0xa2, 0x0a, 0x29, 0x59, 0xa3, 0x0a, 0x29, 0x61, 0xa4, 0x0a, + 0x29, 0x6b, 0x02, 0x52, 0xf1, 0xa5, 0x0a, 0x29, 0x79, 0xa6, 0x0a, 0x29, + 0x80, 0x9d, 0x0a, 0x29, 0x89, 0x9e, 0x0a, 0x29, 0x91, 0x9f, 0x0a, 0x29, + 0x99, 0xa0, 0x0a, 0x29, 0xa1, 0xa1, 0x0a, 0x29, 0xa9, 0xa2, 0x0a, 0x29, + 0xb1, 0xa3, 
0x0a, 0x29, 0xb9, 0xa4, 0x0a, 0x29, 0xc1, 0xa5, 0x0a, 0x29, + 0xc9, 0xa6, 0x0a, 0x29, 0xd0, 0x9d, 0x0a, 0x29, 0xd9, 0x9e, 0x0a, 0x29, + 0xe1, 0x9f, 0x0a, 0x29, 0xe9, 0xa0, 0x0a, 0x29, 0xf1, 0xa1, 0x0a, 0x29, + 0xf9, 0xa2, 0x0a, 0x2a, 0x01, 0xa3, 0x0a, 0x2a, 0x09, 0xa4, 0x0a, 0x2a, + 0x11, 0xa5, 0x0a, 0x2a, 0x19, 0xa6, 0x0a, 0x2a, 0x22, 0x02, 0x52, 0xf5, + 0x9d, 0x0a, 0x2a, 0x31, 0x9e, 0x0a, 0x2a, 0x39, 0x9f, 0x0a, 0x2a, 0x41, + 0xa0, 0x0a, 0x2a, 0x49, 0xa1, 0x0a, 0x2a, 0x53, 0x02, 0x52, 0xf9, 0xa2, + 0x0a, 0x2a, 0x61, 0xa3, 0x0a, 0x2a, 0x69, 0xa4, 0x0a, 0x2a, 0x71, 0xa5, + 0x0a, 0x2a, 0x79, 0xa6, 0x0a, 0x2a, 0x82, 0x02, 0x52, 0xfd, 0x9d, 0x0a, + 0x2a, 0x91, 0x9e, 0x0a, 0x2a, 0x99, 0x9f, 0x0a, 0x2a, 0xa1, 0xa0, 0x0a, + 0x2a, 0xa9, 0xa1, 0x0a, 0x2a, 0xb1, 0xa2, 0x0a, 0x2a, 0xb9, 0xa3, 0x0a, + 0x2a, 0xc1, 0xa4, 0x0a, 0x2a, 0xc9, 0xa5, 0x0a, 0x2a, 0xd1, 0xa6, 0x0a, + 0x2a, 0xda, 0x02, 0x53, 0x01, 0x9d, 0x0a, 0x2a, 0xe9, 0x9e, 0x0a, 0x2a, + 0xf1, 0x9f, 0x0a, 0x2a, 0xf9, 0xa0, 0x0a, 0x2b, 0x01, 0xa1, 0x0a, 0x2b, + 0x09, 0xa2, 0x0a, 0x2b, 0x11, 0xa3, 0x0a, 0x2b, 0x19, 0xa4, 0x0a, 0x2b, + 0x21, 0xa5, 0x0a, 0x2b, 0x29, 0xa6, 0x0a, 0x2b, 0x30, 0x9d, 0x0a, 0x2b, + 0x39, 0x9e, 0x0a, 0x2b, 0x41, 0x9f, 0x0a, 0x2b, 0x49, 0xa0, 0x0a, 0x2b, + 0x51, 0xa1, 0x0a, 0x2b, 0x59, 0xa2, 0x0a, 0x2b, 0x61, 0xa3, 0x0a, 0x2b, + 0x69, 0xa4, 0x0a, 0x2b, 0x71, 0xa5, 0x0a, 0x2b, 0x79, 0xa6, 0x0a, 0x2b, + 0x82, 0x02, 0x53, 0x05, 0x9d, 0x0a, 0x2b, 0x91, 0x9e, 0x0a, 0x2b, 0x99, + 0x1f, 0xc2, 0x53, 0x09, 0xa0, 0x0a, 0x2b, 0xb9, 0xa1, 0x0a, 0x2b, 0xc1, + 0xa2, 0x0a, 0x2b, 0xc9, 0xa3, 0x0a, 0x2b, 0xd3, 0x02, 0x53, 0x15, 0xa4, + 0x0a, 0x2b, 0xf1, 0xa5, 0x0a, 0x2b, 0xf9, 0xa6, 0x0a, 0x2c, 0x00, 0x9d, + 0x0a, 0x2c, 0x09, 0x9e, 0x0a, 0x2c, 0x11, 0x9f, 0x0a, 0x2c, 0x19, 0xa0, + 0x0a, 0x2c, 0x21, 0xa1, 0x0a, 0x2c, 0x29, 0xa2, 0x0a, 0x2c, 0x31, 0xa3, + 0x0a, 0x2c, 0x39, 0xa4, 0x0a, 0x2c, 0x41, 0xa5, 0x0a, 0x2c, 0x49, 0xa6, + 0x0a, 0x2c, 0x50, 0x9d, 0x0a, 0x2c, 0x59, 0x9e, 0x0a, 0x2c, 0x61, 0x9f, + 0x0a, 0x2c, 0x69, 0xa0, 0x0a, 0x2c, 0x71, 0xa1, 0x0a, 0x2c, 0x79, 0xa2, + 0x0a, 0x2c, 0x81, 0xa3, 0x0a, 0x2c, 0x89, 0xa4, 0x0a, 0x2c, 0x91, 0xa5, + 0x0a, 0x2c, 0x99, 0xa6, 0x0a, 0x2c, 0xa2, 0x02, 0x53, 0x21, 0x9d, 0x0a, + 0x2c, 0xb1, 0x9e, 0x0a, 0x2c, 0xb9, 0x9f, 0x0a, 0x2c, 0xc1, 0xa0, 0x0a, + 0x2c, 0xc9, 0xa1, 0x0a, 0x2c, 0xd3, 0x02, 0x53, 0x25, 0xa2, 0x0a, 0x2c, + 0xe1, 0xa3, 0x0a, 0x2c, 0xe9, 0xa4, 0x0a, 0x2c, 0xf1, 0xa5, 0x0a, 0x2c, + 0xfb, 0x02, 0x53, 0x29, 0xa6, 0x0a, 0x2d, 0x08, 0x9d, 0x0a, 0x2d, 0x11, + 0x9e, 0x0a, 0x2d, 0x1b, 0x02, 0x53, 0x2d, 0x9f, 0x0a, 0x2d, 0x29, 0xa0, + 0x0a, 0x2d, 0x31, 0xa1, 0x0a, 0x2d, 0x39, 0xa2, 0x0a, 0x2d, 0x41, 0xa3, + 0x0a, 0x2d, 0x49, 0xa4, 0x0a, 0x2d, 0x51, 0xa5, 0x0a, 0x2d, 0x59, 0xa6, + 0x0a, 0x2d, 0x60, 0x9d, 0x0a, 0x2d, 0x69, 0x9e, 0x0a, 0x2d, 0x73, 0x02, + 0x53, 0x31, 0x9f, 0x0a, 0x2d, 0x81, 0x20, 0xc2, 0x53, 0x35, 0xa1, 0x0a, + 0x2d, 0x99, 0xa2, 0x0a, 0x2d, 0xa1, 0xa3, 0x0a, 0x2d, 0xab, 0x02, 0x53, + 0x3f, 0xa4, 0x0a, 0x2d, 0xb9, 0xa5, 0x0a, 0x2d, 0xc1, 0xa6, 0x0a, 0x2d, + 0xc8, 0x9d, 0x0a, 0x2d, 0xd1, 0x9e, 0x0a, 0x2d, 0xd9, 0x9f, 0x0a, 0x2d, + 0xe1, 0xc7, 0xc6, 0xa9, 0x0a, 0x2d, 0xe9, 0xa1, 0x0a, 0x2d, 0xf1, 0xa2, + 0x0a, 0x2d, 0xf9, 0xa3, 0x0a, 0x2e, 0x01, 0xa4, 0x0a, 0x2e, 0x09, 0xa5, + 0x0a, 0x2e, 0x11, 0xa6, 0x0a, 0x2e, 0x18, 0x9d, 0x0a, 0x2e, 0x21, 0x9e, + 0x0a, 0x2e, 0x29, 0x9f, 0x0a, 0x2e, 0x31, 0xa0, 0x0a, 0x2e, 0x39, 0xa1, + 0x0a, 0x2e, 0x41, 0xa2, 0x0a, 0x2e, 0x49, 0xa3, 0x0a, 0x2e, 0x51, 0xa4, + 0x0a, 0x2e, 0x59, 0xa5, 0x0a, 0x2e, 0x61, 0xa6, 0x0a, 0x2e, 0x68, 0x1d, + 0xc2, 0x53, 
0x43, 0x9e, 0x0a, 0x2e, 0x81, 0x9f, 0x0a, 0x2e, 0x89, 0xa0, + 0x0a, 0x2e, 0x91, 0xa1, 0x0a, 0x2e, 0x99, 0xa2, 0x0a, 0x2e, 0xa1, 0xa3, + 0x0a, 0x2e, 0xa9, 0xa4, 0x0a, 0x2e, 0xb1, 0xa5, 0x0a, 0x2e, 0xb9, 0xa6, + 0x0a, 0x2e, 0xc0, 0x9d, 0x0a, 0x2e, 0xc9, 0x9e, 0x0a, 0x2e, 0xd1, 0x9f, + 0x0a, 0x2e, 0xd9, 0xa0, 0x0a, 0x2e, 0xe1, 0xa1, 0x0a, 0x2e, 0xe9, 0xa2, + 0x0a, 0x2e, 0xf1, 0xa3, 0x0a, 0x2e, 0xf9, 0xa4, 0x0a, 0x2f, 0x01, 0xa5, + 0x0a, 0x2f, 0x09, 0xa6, 0x0a, 0x2f, 0x10, 0x9d, 0x0a, 0x2f, 0x19, 0x9e, + 0x0a, 0x2f, 0x21, 0x9f, 0x0a, 0x2f, 0x29, 0xa0, 0x0a, 0x2f, 0x31, 0xa1, + 0x0a, 0x2f, 0x39, 0xa2, 0x0a, 0x2f, 0x41, 0xa3, 0x0a, 0x2f, 0x49, 0xa4, + 0x0a, 0x2f, 0x51, 0xa5, 0x0a, 0x2f, 0x59, 0xa6, 0x0a, 0x2f, 0x60, 0x9d, + 0x0a, 0x2f, 0x69, 0x9e, 0x0a, 0x2f, 0x71, 0x9f, 0x0a, 0x2f, 0x79, 0xa0, + 0x0a, 0x2f, 0x81, 0xa1, 0x0a, 0x2f, 0x89, 0xa2, 0x0a, 0x2f, 0x91, 0xa3, + 0x0a, 0x2f, 0x99, 0xa4, 0x0a, 0x2f, 0xa1, 0xa5, 0x0a, 0x2f, 0xa9, 0xa6, + 0x0a, 0x2f, 0xb0, 0x9d, 0x0a, 0x2f, 0xbb, 0x02, 0x53, 0x4f, 0x9e, 0x0a, + 0x2f, 0xc9, 0x9f, 0x0a, 0x2f, 0xd1, 0xa0, 0x0a, 0x2f, 0xd9, 0xa1, 0x0a, + 0x2f, 0xe1, 0xa2, 0x0a, 0x2f, 0xe9, 0xa3, 0x0a, 0x2f, 0xf1, 0xa4, 0x0a, + 0x2f, 0xfb, 0x02, 0x53, 0x53, 0xa5, 0x0a, 0x30, 0x09, 0xa6, 0x0a, 0x30, + 0x10, 0x9d, 0x0a, 0x30, 0x19, 0x9e, 0x0a, 0x30, 0x21, 0x9f, 0x0a, 0x30, + 0x29, 0xa0, 0x0a, 0x30, 0x31, 0xa1, 0x0a, 0x30, 0x39, 0xa2, 0x0a, 0x30, + 0x41, 0xa3, 0x0a, 0x30, 0x49, 0xa4, 0x0a, 0x30, 0x51, 0xa5, 0x0a, 0x30, + 0x59, 0xa6, 0x0a, 0x30, 0x60, 0x9d, 0x0a, 0x30, 0x69, 0x9e, 0x0a, 0x30, + 0x71, 0x9f, 0x0a, 0x30, 0x79, 0xa0, 0x0a, 0x30, 0x81, 0xa1, 0x0a, 0x30, + 0x89, 0xa2, 0x0a, 0x30, 0x91, 0xa3, 0x0a, 0x30, 0x99, 0xa4, 0x0a, 0x30, + 0xa1, 0xa5, 0x0a, 0x30, 0xa9, 0xa6, 0x0a, 0x30, 0xb0, 0x9d, 0x0a, 0x30, + 0xb9, 0x9e, 0x0a, 0x30, 0xc1, 0x9f, 0x0a, 0x30, 0xc9, 0xa0, 0x0a, 0x30, + 0xd1, 0xa1, 0x0a, 0x30, 0xd9, 0xa2, 0x0a, 0x30, 0xe1, 0xa3, 0x0a, 0x30, + 0xe9, 0xa4, 0x0a, 0x30, 0xf1, 0xa5, 0x0a, 0x30, 0xf9, 0xa6, 0x0a, 0x31, + 0x00, 0x9d, 0x0a, 0x31, 0x09, 0x9e, 0x0a, 0x31, 0x11, 0x9f, 0x0a, 0x31, + 0x19, 0xa0, 0x0a, 0x31, 0x21, 0xa1, 0x0a, 0x31, 0x29, 0xa2, 0x0a, 0x31, + 0x31, 0xa3, 0x0a, 0x31, 0x39, 0xa4, 0x0a, 0x31, 0x40, 0x9e, 0x0a, 0x31, + 0x49, 0x9f, 0x0a, 0x31, 0x51, 0xa0, 0x0a, 0x31, 0x59, 0xa1, 0x0a, 0x31, + 0x61, 0xa2, 0x0a, 0x31, 0x69, 0xa3, 0x0a, 0x31, 0x71, 0xa4, 0x0a, 0x31, + 0x79, 0xa5, 0x0a, 0x31, 0x81, 0xa6, 0x0a, 0x31, 0x88, 0x9d, 0x0a, 0x31, + 0x91, 0x9e, 0x0a, 0x31, 0x99, 0x9f, 0x0a, 0x31, 0xa1, 0xa0, 0x0a, 0x31, + 0xa9, 0xa1, 0x0a, 0x31, 0xb1, 0xa2, 0x0a, 0x31, 0xb9, 0xa3, 0x0a, 0x31, + 0xc1, 0xa4, 0x0a, 0x31, 0xc9, 0xa5, 0x0a, 0x31, 0xd1, 0xa6, 0x0a, 0x31, + 0xd8, 0x9d, 0x0a, 0x31, 0xe1, 0x9e, 0x0a, 0x31, 0xe9, 0x9f, 0x0a, 0x31, + 0xf1, 0xa0, 0x0a, 0x31, 0xf9, 0xa1, 0x0a, 0x32, 0x01, 0xa2, 0x0a, 0x32, + 0x09, 0xa3, 0x0a, 0x32, 0x11, 0xa4, 0x0a, 0x32, 0x19, 0xa5, 0x0a, 0x32, + 0x21, 0xa6, 0x0a, 0x32, 0x28, 0xd1, 0x05, 0x75, 0x01, 0x5b, 0x79, 0xd4, + 0x3e, 0x1c, 0x01, 0x5c, 0x61, 0xd5, 0x36, 0x9b, 0x01, 0x5c, 0x69, 0xd3, + 0x44, 0xa2, 0x01, 0x5c, 0x71, 0xd2, 0x47, 0x93, 0x01, 0x5c, 0x78, 0xc8, + 0x2c, 0xb2, 0x01, 0x1b, 0x81, 0xc9, 0x24, 0x47, 0x01, 0x1b, 0x79, 0x05, + 0xc2, 0x53, 0x57, 0x06, 0xc2, 0x53, 0x63, 0x42, 0x02, 0xae, 0xc2, 0x53, + 0x76, 0xd0, 0x03, 0xb7, 0x01, 0x1a, 0x41, 0x42, 0x00, 0x49, 0xc2, 0x53, + 0x82, 0xcc, 0x07, 0xc7, 0x01, 0x1a, 0x21, 0xc9, 0x02, 0xfe, 0x01, 0x1a, + 0x11, 0xc5, 0x03, 0x02, 0x01, 0x1a, 0x09, 0xc3, 0xba, 0x27, 0x01, 0x19, + 0xd9, 0xc5, 0x00, 0xe2, 0x01, 0x19, 0xc0, 0xc9, 0x20, 0xa8, 0x01, 0x1b, + 0x09, 0xc3, 
0xba, 0x27, 0x01, 0x1a, 0xa9, 0xc7, 0x80, 0x70, 0x01, 0x1a, + 0x88, 0xcb, 0x95, 0xf0, 0x01, 0x1b, 0x89, 0xca, 0x94, 0xf4, 0x01, 0x1b, + 0x31, 0x45, 0x9a, 0x3d, 0x42, 0x53, 0x8e, 0xc5, 0x1e, 0xc8, 0x01, 0x1b, + 0x59, 0xc9, 0x20, 0xa8, 0x01, 0x1b, 0x11, 0xc5, 0x05, 0xa2, 0x01, 0x1a, + 0x90, 0xc8, 0x52, 0x09, 0x01, 0x1a, 0xc9, 0xc5, 0x05, 0xa2, 0x01, 0x1a, + 0x58, 0xc2, 0x00, 0xb1, 0x01, 0x1a, 0xf9, 0xc3, 0x05, 0xa4, 0x01, 0x19, + 0xe8, 0xc2, 0x00, 0xf1, 0x01, 0x12, 0x2b, 0x02, 0x53, 0x9a, 0xcb, 0x23, + 0xa0, 0x01, 0x53, 0x80, 0xc2, 0x0c, 0x43, 0x08, 0x59, 0x99, 0x87, 0x08, + 0x59, 0x88, 0xc2, 0x00, 0x5f, 0x08, 0x59, 0x21, 0xc2, 0x0c, 0x43, 0x08, + 0x59, 0x19, 0x87, 0x08, 0x59, 0x10, 0x87, 0x08, 0x58, 0x38, 0x90, 0x08, + 0x58, 0x29, 0x91, 0x08, 0x58, 0x18, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xc9, + 0xc8, 0x4b, 0x94, 0x08, 0x09, 0x10, 0xc3, 0x02, 0xdf, 0x08, 0x08, 0x4b, + 0x02, 0x53, 0xa0, 0xc4, 0x0d, 0x0e, 0x08, 0x08, 0x92, 0x02, 0x53, 0xa4, + 0xc9, 0x57, 0x20, 0x08, 0x09, 0x58, 0xc4, 0x18, 0x12, 0x08, 0x08, 0x8b, + 0x02, 0x53, 0xaa, 0x91, 0x08, 0x08, 0x42, 0x02, 0x53, 0xb0, 0xc2, 0x00, + 0x5f, 0x08, 0x08, 0x5b, 0x02, 0x53, 0xb4, 0xc3, 0x45, 0x6b, 0x08, 0x08, + 0xa2, 0x02, 0x53, 0xb8, 0xc2, 0x00, 0x33, 0x08, 0x08, 0x53, 0x02, 0x53, + 0xbe, 0xc3, 0x0d, 0x0f, 0x08, 0x08, 0x9a, 0x02, 0x53, 0xc2, 0x00, 0xc2, + 0x53, 0xc8, 0xc2, 0x0d, 0x10, 0x08, 0x08, 0xaa, 0x02, 0x53, 0xd4, 0x00, + 0xc2, 0x53, 0xda, 0xc2, 0x0d, 0x10, 0x08, 0x08, 0xb2, 0x02, 0x53, 0xe6, + 0xc7, 0x0d, 0x04, 0x08, 0x09, 0x01, 0xc8, 0x4b, 0x94, 0x08, 0x09, 0x48, + 0xc9, 0x57, 0x20, 0x08, 0x09, 0x90, 0xc7, 0x0d, 0x04, 0x08, 0x09, 0x09, + 0xc8, 0x4b, 0x94, 0x08, 0x09, 0x50, 0xc9, 0x57, 0x20, 0x08, 0x09, 0x98, + 0xcc, 0x14, 0xcd, 0x08, 0x09, 0xc1, 0xcd, 0x7e, 0xb0, 0x08, 0x09, 0xd8, + 0xca, 0x01, 0x68, 0x01, 0x28, 0x03, 0x02, 0x53, 0xec, 0x06, 0xc2, 0x53, + 0xf2, 0xc2, 0x02, 0xae, 0x01, 0x2b, 0xab, 0x02, 0x53, 0xfc, 0xc4, 0x00, + 0x49, 0x01, 0x2b, 0xa3, 0x02, 0x54, 0x02, 0xc5, 0x00, 0x2c, 0x01, 0x2b, + 0xb1, 0x44, 0x13, 0x1d, 0xc2, 0x54, 0x08, 0xc8, 0x00, 0x5f, 0x01, 0x28, + 0x13, 0x02, 0x54, 0x14, 0x4f, 0x61, 0x5c, 0xc2, 0x54, 0x1a, 0x4c, 0x52, + 0xbb, 0x42, 0x54, 0x26, 0x50, 0x5c, 0x42, 0xc2, 0x54, 0x32, 0xdd, 0x11, + 0x34, 0x01, 0x2a, 0x29, 0xdd, 0x11, 0xff, 0x01, 0x2a, 0x19, 0x50, 0x11, + 0x39, 0x42, 0x54, 0x44, 0x45, 0x02, 0x9a, 0x42, 0x54, 0x56, 0xd0, 0x5e, + 0x62, 0x01, 0x2b, 0xf0, 0xc2, 0x01, 0x48, 0x01, 0x2b, 0xdb, 0x02, 0x54, + 0x66, 0x4a, 0xa2, 0xa6, 0x42, 0x54, 0x6c, 0x45, 0x02, 0x9a, 0x42, 0x54, + 0x78, 0xc8, 0x00, 0x5f, 0x01, 0x28, 0x59, 0xca, 0x01, 0x68, 0x01, 0x28, + 0x48, 0xc8, 0x00, 0x5f, 0x01, 0x28, 0x39, 0xca, 0x01, 0x68, 0x01, 0x28, + 0x28, 0xc8, 0x00, 0x5f, 0x01, 0x2a, 0x8b, 0x02, 0x54, 0x8a, 0x47, 0x54, + 0x42, 0xc2, 0x54, 0x90, 0x49, 0x45, 0xd2, 0xc2, 0x54, 0xa2, 0xca, 0x01, + 0x68, 0x01, 0x2a, 0x80, 0x4b, 0x99, 0xb8, 0xc2, 0x54, 0xb4, 0x4b, 0x8e, + 0x76, 0xc2, 0x54, 0xc6, 0x4a, 0x5c, 0x42, 0xc2, 0x54, 0xd8, 0x4a, 0x11, + 0x39, 0x42, 0x54, 0xf0, 0xd1, 0x53, 0x43, 0x01, 0x2b, 0x59, 0xcb, 0x8d, + 0x84, 0x01, 0x2b, 0x11, 0xcc, 0x89, 0xd9, 0x01, 0x2a, 0xf8, 0xd1, 0x53, + 0x32, 0x01, 0x2b, 0x51, 0xcb, 0x8e, 0xce, 0x01, 0x2b, 0x09, 0xcc, 0x87, + 0xa5, 0x01, 0x2a, 0xf0, 0xd0, 0x32, 0x47, 0x01, 0x2a, 0x11, 0xca, 0xa2, + 0xce, 0x01, 0x29, 0x41, 0xcb, 0x98, 0xe7, 0x01, 0x29, 0x00, 0xd0, 0x32, + 0x71, 0x01, 0x29, 0xf9, 0xca, 0xa2, 0xe2, 0x01, 0x29, 0x29, 0xcb, 0x98, + 0xdc, 0x01, 0x28, 0xe8, 0xd1, 0x53, 0x43, 0x01, 0x2b, 0x41, 0xcb, 0x8d, + 0x84, 0x01, 0x2a, 0xe1, 0xcc, 0x89, 0xd9, 0x01, 0x2a, 0xc8, 0xd1, 0x53, + 0x32, 0x01, 
0x2b, 0x39, 0xcb, 0x8e, 0xce, 0x01, 0x2a, 0xd9, 0xcc, 0x87, + 0xa5, 0x01, 0x2a, 0xc0, 0xd5, 0x32, 0x6c, 0x01, 0x2a, 0x41, 0xd0, 0x32, + 0x71, 0x01, 0x29, 0xb9, 0x45, 0x00, 0x49, 0xc2, 0x55, 0x08, 0x46, 0x00, + 0x2c, 0x42, 0x55, 0x14, 0xd5, 0x32, 0x42, 0x01, 0x2a, 0x01, 0xd0, 0x32, + 0x47, 0x01, 0x29, 0xc1, 0x45, 0x00, 0x49, 0xc2, 0x55, 0x20, 0x46, 0x00, + 0x2c, 0x42, 0x55, 0x2c, 0xce, 0x72, 0xaa, 0x01, 0x2a, 0x49, 0xc8, 0x11, + 0xff, 0x01, 0x29, 0xc9, 0xca, 0x11, 0x34, 0x01, 0x29, 0x88, 0xce, 0x73, + 0x44, 0x01, 0x29, 0xf1, 0xc8, 0x11, 0x49, 0x01, 0x29, 0xb1, 0xca, 0x12, + 0x12, 0x01, 0x29, 0x70, 0xc5, 0x13, 0x67, 0x01, 0x18, 0xf9, 0xc3, 0x0a, + 0xea, 0x01, 0x18, 0x60, 0xc5, 0x13, 0x67, 0x01, 0x18, 0xf1, 0xc3, 0x0a, + 0xea, 0x01, 0x18, 0x68, 0x89, 0x01, 0x8d, 0x68, 0xc2, 0x1b, 0x88, 0x01, + 0x8d, 0x70, 0xc2, 0x1b, 0x88, 0x01, 0x8d, 0x78, 0x89, 0x01, 0x89, 0x21, + 0x90, 0x01, 0x8d, 0x48, 0x90, 0x01, 0x8d, 0x39, 0x89, 0x01, 0x8d, 0x90, + 0x89, 0x01, 0x89, 0x29, 0x90, 0x01, 0x8d, 0x28, 0x90, 0x01, 0x8d, 0x98, + 0xa2, 0x0f, 0xd8, 0xbb, 0x02, 0x55, 0x38, 0xa3, 0x0f, 0xd9, 0x38, 0xa0, + 0x0f, 0xd8, 0x33, 0x02, 0x55, 0x3c, 0xa2, 0x0f, 0xd8, 0x93, 0x02, 0x55, + 0x4e, 0xa1, 0x0f, 0xd8, 0x53, 0x02, 0x55, 0x52, 0xa3, 0x0f, 0xd9, 0x08, + 0xa3, 0x0f, 0xd9, 0x70, 0xa1, 0x0f, 0xd8, 0x63, 0x02, 0x55, 0x5d, 0xa3, + 0x0f, 0xd9, 0x19, 0xc2, 0x00, 0x22, 0x0f, 0xd9, 0x90, 0xa3, 0x0f, 0xd9, + 0x88, 0xa3, 0x0f, 0xd9, 0x49, 0xa2, 0x0f, 0xd8, 0xd2, 0x02, 0x55, 0x68, + 0xa3, 0x0f, 0xd9, 0x78, 0xa1, 0x0f, 0xd8, 0x6b, 0x02, 0x55, 0x6c, 0xa3, + 0x0f, 0xd9, 0x21, 0xa2, 0x0f, 0xd8, 0xa2, 0x02, 0x55, 0x77, 0xa2, 0x0f, + 0xd8, 0xc2, 0x02, 0x55, 0x7b, 0xa3, 0x0f, 0xd9, 0xa8, 0x45, 0xa6, 0x50, + 0xc2, 0x55, 0x7f, 0x46, 0x3b, 0x9d, 0xc2, 0x55, 0xb6, 0xd0, 0x5d, 0x02, + 0x01, 0x39, 0x61, 0xce, 0x71, 0x4c, 0x01, 0x37, 0x41, 0xc5, 0x02, 0xd2, + 0x01, 0x2e, 0x7b, 0x02, 0x55, 0xce, 0xc8, 0xb8, 0x3a, 0x01, 0x33, 0x18, + 0x4e, 0x70, 0xce, 0xc2, 0x55, 0xd2, 0xc7, 0x37, 0x27, 0x01, 0x38, 0x11, + 0xce, 0x73, 0xa6, 0x01, 0x38, 0x01, 0xc6, 0xcb, 0xcf, 0x01, 0x36, 0x39, + 0xc9, 0xb0, 0x1a, 0x01, 0x33, 0x01, 0x0f, 0xc2, 0x55, 0xde, 0xca, 0x50, + 0x80, 0x01, 0x30, 0xb9, 0xc3, 0x0e, 0x6b, 0x01, 0x30, 0x29, 0xcc, 0x83, + 0x01, 0x01, 0x30, 0x01, 0xc5, 0x0b, 0x0a, 0x01, 0x2d, 0x03, 0x02, 0x55, + 0xea, 0xd3, 0x40, 0x08, 0x0f, 0xab, 0x88, 0x44, 0xe1, 0x27, 0xc2, 0x55, + 0xee, 0xc4, 0x73, 0x5b, 0x01, 0x36, 0xf9, 0xd7, 0x28, 0x5a, 0x01, 0x36, + 0xb1, 0xc8, 0x36, 0xb4, 0x01, 0x30, 0x71, 0xd2, 0x49, 0xaf, 0x0f, 0xab, + 0xf8, 0x43, 0x01, 0x47, 0xc2, 0x56, 0x00, 0xc6, 0x3a, 0x1a, 0x01, 0x2e, + 0x33, 0x02, 0x56, 0x12, 0x14, 0x42, 0x56, 0x16, 0x44, 0x00, 0x2d, 0xc2, + 0x56, 0x22, 0xc8, 0x46, 0x71, 0x01, 0x2d, 0x61, 0xc6, 0xcd, 0x67, 0x0f, + 0x9f, 0xb0, 0x43, 0x00, 0x4a, 0xc2, 0x56, 0x34, 0x11, 0xc2, 0x56, 0x44, + 0x45, 0x17, 0x15, 0x42, 0x56, 0x50, 0x0e, 0xc2, 0x56, 0x5c, 0x11, 0x42, + 0x56, 0x68, 0xca, 0x9c, 0x20, 0x01, 0x35, 0xc1, 0x46, 0x01, 0xdc, 0x42, + 0x56, 0x74, 0xd9, 0x1f, 0x31, 0x01, 0x33, 0xd9, 0x12, 0x42, 0x56, 0x92, + 0x07, 0xc2, 0x56, 0xaa, 0xd5, 0x31, 0xc4, 0x0f, 0xad, 0x51, 0x11, 0x42, + 0x56, 0xb9, 0xcc, 0x88, 0x59, 0x01, 0x2d, 0x81, 0xc6, 0xc1, 0x01, 0x0f, + 0xac, 0x41, 0x42, 0x00, 0xc4, 0x42, 0x56, 0xc5, 0x46, 0x05, 0x87, 0xc2, + 0x56, 0xd1, 0x48, 0x4a, 0x54, 0x42, 0x56, 0xdd, 0xd0, 0x20, 0x66, 0x01, + 0x3d, 0xb1, 0xd0, 0x03, 0xb7, 0x01, 0x3d, 0xa9, 0xd0, 0x3c, 0x90, 0x01, + 0x3d, 0xa0, 0x85, 0x01, 0x09, 0x69, 0x9c, 0x01, 0x09, 0x41, 0x94, 0x01, + 0x08, 0xe1, 0x8b, 0x01, 0x08, 0x89, 0x8a, 0x01, 0x08, 0x60, 0xd0, 0x15, + 0x35, 0x01, 
0x3a, 0x48, 0x9a, 0x01, 0x38, 0xb9, 0x42, 0x00, 0x6b, 0xc2, + 0x56, 0xef, 0xc8, 0x8e, 0xa5, 0x0f, 0xaf, 0xa0, 0xc3, 0x45, 0xa1, 0x00, + 0xda, 0xdb, 0x02, 0x56, 0xfc, 0xc5, 0xda, 0x3d, 0x00, 0xdb, 0x00, 0xc8, + 0xb6, 0xb2, 0x00, 0xdb, 0xe8, 0x46, 0xce, 0x2d, 0xc2, 0x57, 0x02, 0x49, + 0xb3, 0x7a, 0x42, 0x57, 0x14, 0x48, 0xb5, 0xf2, 0xc2, 0x57, 0x20, 0x46, + 0xce, 0x33, 0x42, 0x57, 0x2c, 0xc4, 0x8f, 0x44, 0x00, 0xdb, 0x99, 0xc5, + 0xd7, 0x4a, 0x00, 0xdb, 0x91, 0x44, 0xac, 0xc3, 0xc2, 0x57, 0x38, 0xc7, + 0x7c, 0x94, 0x00, 0xdb, 0x79, 0xc5, 0xdb, 0xc8, 0x00, 0xdb, 0x61, 0xc5, + 0xd7, 0xd1, 0x00, 0xdb, 0x58, 0x03, 0xc2, 0x57, 0x4a, 0x07, 0xc2, 0x57, + 0x5f, 0xc3, 0x00, 0x74, 0x00, 0xdb, 0x31, 0xc3, 0x38, 0x86, 0x00, 0xdb, + 0x19, 0xc3, 0x08, 0x48, 0x00, 0xdb, 0x08, 0xc5, 0x60, 0xcc, 0x00, 0xda, + 0xf9, 0xc7, 0xc2, 0x65, 0x00, 0xda, 0xe8, 0xc4, 0x18, 0x10, 0x00, 0xda, + 0xb9, 0xc2, 0x22, 0xcc, 0x00, 0xda, 0xb0, 0xc3, 0x0d, 0x14, 0x00, 0xda, + 0xa9, 0xc3, 0x09, 0x9e, 0x00, 0xda, 0xa0, 0xc4, 0x02, 0xde, 0x00, 0xda, + 0x99, 0xc2, 0x02, 0xa0, 0x00, 0xda, 0x90, 0xcb, 0x98, 0x37, 0x00, 0xda, + 0x61, 0xcb, 0x91, 0x6d, 0x00, 0xda, 0x59, 0xc5, 0xd7, 0xbd, 0x00, 0xd8, + 0x81, 0xc4, 0xa2, 0x33, 0x00, 0xd8, 0x2a, 0x02, 0x57, 0x6b, 0xc7, 0xc7, + 0x20, 0x00, 0xda, 0x41, 0xc4, 0xa2, 0x33, 0x00, 0xd8, 0x78, 0xc9, 0xae, + 0x61, 0x00, 0xda, 0x39, 0x83, 0x00, 0xd9, 0x12, 0x02, 0x57, 0x71, 0xc9, + 0xa9, 0x5a, 0x00, 0xda, 0x31, 0x83, 0x00, 0xd8, 0x9a, 0x02, 0x57, 0x75, + 0x43, 0x20, 0x27, 0x42, 0x57, 0x81, 0xc6, 0xb5, 0xac, 0x00, 0xd8, 0x6a, + 0x02, 0x57, 0x8d, 0xc5, 0xc4, 0x7b, 0x00, 0xd8, 0x5a, 0x02, 0x57, 0x93, + 0xc8, 0xbf, 0xaa, 0x00, 0xd9, 0x50, 0xc6, 0xcb, 0x93, 0x00, 0xd9, 0x40, + 0x83, 0x00, 0xd9, 0x33, 0x02, 0x57, 0x99, 0xc2, 0x19, 0x2c, 0x00, 0xd8, + 0xe1, 0xc2, 0x01, 0x30, 0x00, 0xd8, 0xb8, 0x42, 0x00, 0x4d, 0x42, 0x57, + 0x9f, 0xc5, 0xd4, 0xa2, 0x00, 0xd8, 0xd8, 0xc5, 0xd4, 0x48, 0x00, 0xd8, + 0xc8, 0xc5, 0xd7, 0x4f, 0x00, 0xd8, 0xb0, 0xc7, 0xc2, 0x6c, 0x00, 0xd8, + 0x90, 0xc7, 0xc2, 0x6c, 0x00, 0xd8, 0x50, 0xc7, 0xc2, 0x6c, 0x00, 0xd8, + 0x40, 0xc7, 0xc2, 0x6c, 0x00, 0xda, 0x18, 0xc5, 0x25, 0x91, 0x00, 0xd9, + 0xf3, 0x02, 0x57, 0xab, 0xc5, 0xc2, 0x6e, 0x00, 0xd9, 0xa8, 0xc7, 0xc2, + 0x6c, 0x00, 0xd9, 0xe8, 0xc7, 0xc2, 0x6c, 0x00, 0xd9, 0xd8, 0xc5, 0xd7, + 0xcc, 0x00, 0xd9, 0xc8, 0xc5, 0xd9, 0x70, 0x00, 0xd9, 0xb8, 0xc6, 0x1e, + 0x89, 0x00, 0xd8, 0x09, 0xc5, 0xd6, 0xaa, 0x00, 0xd8, 0x00, 0xc9, 0xae, + 0xfa, 0x0b, 0x57, 0xa1, 0xc5, 0x28, 0xb0, 0x0b, 0x57, 0x80, 0xc9, 0xaf, + 0x81, 0x0b, 0x57, 0x99, 0xc5, 0x28, 0xb0, 0x0b, 0x57, 0x88, 0x87, 0x0b, + 0x57, 0x59, 0xc3, 0x1b, 0x88, 0x0b, 0x56, 0x80, 0xc2, 0x14, 0x68, 0x0b, + 0x57, 0x00, 0x91, 0x0b, 0x57, 0x48, 0xc3, 0x2d, 0x2f, 0x0b, 0x57, 0x30, + 0xc3, 0x26, 0x76, 0x0b, 0x57, 0x21, 0xc2, 0x02, 0x0a, 0x0b, 0x56, 0xa8, + 0x91, 0x0b, 0x56, 0xf1, 0xc3, 0xdf, 0xb7, 0x0b, 0x56, 0xb8, 0xc2, 0x02, + 0xaa, 0x0b, 0x56, 0xe9, 0xc2, 0x02, 0x98, 0x0b, 0x56, 0xb0, 0xc3, 0x62, + 0x26, 0x0b, 0x56, 0xc1, 0x83, 0x0b, 0x56, 0x88, 0x42, 0x00, 0x56, 0xc2, + 0x57, 0xaf, 0x42, 0x00, 0x5d, 0xc2, 0x57, 0xf0, 0x42, 0x00, 0xa9, 0xc2, + 0x58, 0x30, 0x42, 0x00, 0xee, 0xc2, 0x58, 0x65, 0x42, 0x01, 0x60, 0xc2, + 0x58, 0xa5, 0x42, 0x01, 0x31, 0x42, 0x58, 0xdd, 0xc2, 0xd0, 0x00, 0x05, + 0x36, 0x29, 0x87, 0x05, 0x36, 0x50, 0x87, 0x05, 0x36, 0x41, 0xc2, 0x10, + 0x11, 0x05, 0x36, 0xb8, 0x96, 0x05, 0x35, 0xd9, 0xc2, 0xd0, 0x00, 0x05, + 0x36, 0x21, 0x90, 0x05, 0x36, 0x90, 0xc3, 0xe5, 0xab, 0x05, 0x37, 0x71, + 0xc4, 0xe0, 0xa3, 0x05, 0x37, 0x78, 0x87, 0x05, 0x35, 0x29, 0xc2, 0xd0, + 0x00, 0x05, 
0x36, 0x81, 0x90, 0x05, 0x37, 0x08, 0x8b, 0x05, 0x35, 0x61, + 0xc2, 0x02, 0xe0, 0x05, 0x35, 0x68, 0x87, 0x05, 0x35, 0x31, 0x83, 0x05, + 0x35, 0x80, 0x96, 0x05, 0x37, 0x41, 0x90, 0x05, 0x37, 0x50, 0xc3, 0x7c, + 0x57, 0x05, 0x35, 0x91, 0xc3, 0x8b, 0xa9, 0x05, 0x35, 0xf1, 0xc2, 0x02, + 0xe0, 0x05, 0x36, 0x30, 0xc2, 0x10, 0x11, 0x05, 0x35, 0xe0, 0xc2, 0x02, + 0xe0, 0x05, 0x36, 0x39, 0xc2, 0x5d, 0xa1, 0x05, 0x37, 0x58, 0xc5, 0xde, + 0x75, 0x05, 0x36, 0x99, 0xc2, 0x01, 0x30, 0x05, 0x36, 0xa1, 0x83, 0x05, + 0x36, 0xa8, 0xc3, 0xd0, 0xd7, 0x05, 0x35, 0x79, 0x90, 0x05, 0x37, 0x10, + 0xc2, 0x00, 0xc4, 0x05, 0x37, 0x01, 0xc2, 0x04, 0xc6, 0x05, 0x37, 0x38, + 0xc2, 0x25, 0x9f, 0x05, 0x35, 0xb1, 0xc3, 0xd7, 0xe2, 0x05, 0x35, 0xc1, + 0x97, 0x05, 0x36, 0x01, 0x91, 0x05, 0x36, 0xb0, 0xc7, 0xc8, 0xd2, 0x05, + 0x37, 0x81, 0xc9, 0xb1, 0x16, 0x05, 0x37, 0x88, 0xc9, 0xab, 0x88, 0x01, + 0x5a, 0xd9, 0xcd, 0x7d, 0x2a, 0x01, 0x5a, 0xe8, 0x12, 0xc2, 0x59, 0x13, + 0xc5, 0xdd, 0x67, 0x00, 0xdf, 0xf1, 0xc8, 0xb8, 0x82, 0x00, 0xdf, 0xe0, + 0xd2, 0x48, 0x7d, 0x00, 0xdf, 0x78, 0x91, 0x00, 0xdf, 0x69, 0x8b, 0x00, + 0xdf, 0x58, 0x87, 0x00, 0xdf, 0x48, 0xc2, 0x01, 0x5d, 0x00, 0xdf, 0x19, + 0x83, 0x00, 0xde, 0xa2, 0x02, 0x59, 0x1f, 0xc2, 0x0e, 0x9a, 0x00, 0xdf, + 0x11, 0xc2, 0x19, 0x2c, 0x00, 0xdf, 0x01, 0xc2, 0x01, 0x30, 0x00, 0xde, + 0xe9, 0xca, 0x9d, 0x60, 0x00, 0xde, 0xb9, 0x83, 0x00, 0xde, 0x48, 0x4a, + 0x48, 0x83, 0xc2, 0x59, 0x25, 0x83, 0x00, 0xde, 0xc1, 0xca, 0x9b, 0x94, + 0x00, 0xde, 0xb0, 0xc7, 0xc8, 0xaf, 0x00, 0xde, 0x68, 0xc2, 0x00, 0xd0, + 0x00, 0x4c, 0xb3, 0x02, 0x59, 0x5f, 0x83, 0x00, 0x4c, 0xa8, 0x83, 0x00, + 0x4d, 0xc1, 0xc2, 0x0d, 0xf6, 0x00, 0x4d, 0xb9, 0xc2, 0x00, 0xd0, 0x00, + 0x4d, 0xb0, 0x83, 0x00, 0x4d, 0x83, 0x02, 0x59, 0x65, 0xc2, 0x00, 0x39, + 0x00, 0x4e, 0xe1, 0xc2, 0x00, 0xd0, 0x00, 0x4e, 0xe8, 0x83, 0x00, 0x4d, + 0x79, 0xc2, 0x19, 0x2c, 0x00, 0x4e, 0xf8, 0xc2, 0x00, 0xd0, 0x00, 0x4d, + 0x69, 0x83, 0x00, 0x4d, 0x60, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0x59, 0x83, + 0x00, 0x4d, 0x50, 0x83, 0x00, 0x4d, 0x41, 0xc2, 0x00, 0xc1, 0x00, 0x4d, + 0x19, 0xc2, 0x19, 0x2c, 0x00, 0x4c, 0xf1, 0xc2, 0x01, 0x30, 0x00, 0x4c, + 0xc8, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0x39, 0x83, 0x00, 0x4d, 0x31, 0x06, + 0x42, 0x59, 0x6b, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0x29, 0x83, 0x00, 0x4d, + 0x21, 0x16, 0x42, 0x59, 0x75, 0xc2, 0x00, 0xd0, 0x00, 0x4c, 0xe9, 0x83, + 0x00, 0x4c, 0xe0, 0xc2, 0x00, 0xd0, 0x00, 0x4c, 0xd9, 0x83, 0x00, 0x4c, + 0xd0, 0xc2, 0x00, 0xd0, 0x00, 0x4c, 0xc1, 0x83, 0x00, 0x4c, 0xb8, 0x97, + 0x00, 0x4c, 0xa1, 0x8b, 0x00, 0x4c, 0x81, 0x83, 0x00, 0x4c, 0x30, 0x8b, + 0x00, 0x4c, 0x40, 0x97, 0x00, 0x4c, 0x50, 0x47, 0xb2, 0x2e, 0xc2, 0x59, + 0x7f, 0xcd, 0x80, 0x36, 0x00, 0x4f, 0xe0, 0x42, 0x07, 0xb2, 0xc2, 0x59, + 0x8d, 0x03, 0xc2, 0x59, 0x99, 0xc5, 0x33, 0x5d, 0x00, 0x4d, 0xe1, 0xcb, + 0x1e, 0x89, 0x00, 0x4c, 0x08, 0x97, 0x00, 0x4e, 0x61, 0x8b, 0x00, 0x4e, + 0x41, 0x83, 0x00, 0x4d, 0xf0, 0x94, 0x00, 0x4e, 0x1b, 0x02, 0x59, 0xa5, + 0x8e, 0x00, 0x4f, 0x12, 0x02, 0x59, 0xa9, 0x97, 0x00, 0x4e, 0x10, 0x8b, + 0x00, 0x4e, 0x00, 0xc2, 0x02, 0xa0, 0x00, 0x4f, 0x41, 0xc4, 0x02, 0xde, + 0x00, 0x4f, 0x48, 0xc3, 0x09, 0x9e, 0x00, 0x4f, 0x51, 0xc3, 0x0d, 0x14, + 0x00, 0x4f, 0x58, 0xc2, 0x22, 0xcc, 0x00, 0x4f, 0x61, 0xc4, 0x18, 0x10, + 0x00, 0x4f, 0x68, 0xc3, 0x05, 0x14, 0x00, 0x4f, 0xa3, 0x02, 0x59, 0xad, + 0x16, 0xc2, 0x59, 0xb3, 0xc4, 0x09, 0x9d, 0x00, 0x4f, 0xb8, 0x1b, 0xc2, + 0x59, 0xbf, 0xc2, 0x00, 0x39, 0x00, 0xd0, 0x59, 0x83, 0x00, 0xd0, 0x51, + 0x09, 0x42, 0x59, 0xc9, 0xc2, 0x00, 0xb0, 0x00, 0xd0, 0x39, 0x83, 0x00, + 0xd0, 0x30, 
0xa4, 0x01, 0x42, 0x03, 0x02, 0x59, 0xd3, 0x9e, 0x01, 0x40, + 0x0b, 0x02, 0x59, 0xd7, 0x9f, 0x01, 0x40, 0x13, 0x02, 0x5a, 0x05, 0xa0, + 0x01, 0x40, 0x23, 0x02, 0x5a, 0x2c, 0xa1, 0x01, 0x40, 0x43, 0x02, 0x5a, + 0x4c, 0xa2, 0x01, 0x40, 0x83, 0x02, 0x5a, 0x65, 0xa3, 0x01, 0x41, 0x03, + 0x02, 0x5a, 0x77, 0xa5, 0x01, 0x44, 0x00, 0x00, 0x42, 0x5a, 0x82, 0xc2, + 0x0d, 0x10, 0x08, 0x83, 0x18, 0x9b, 0x08, 0x83, 0x10, 0xc4, 0x18, 0x10, + 0x08, 0x82, 0xc3, 0x02, 0x5a, 0x8e, 0xc2, 0x22, 0xcc, 0x08, 0x82, 0xba, + 0x02, 0x5a, 0x94, 0x0b, 0xc2, 0x5a, 0x9a, 0x11, 0x42, 0x5a, 0xa6, 0x0a, + 0xc2, 0x5a, 0xb2, 0x19, 0xc2, 0x5a, 0xbe, 0xc2, 0x00, 0xc4, 0x08, 0x82, + 0xd8, 0x49, 0x5c, 0x83, 0x42, 0x5a, 0xc8, 0xc2, 0x00, 0xdb, 0x08, 0x81, + 0xa1, 0x83, 0x08, 0x81, 0x70, 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x51, 0x83, + 0x08, 0x81, 0x48, 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x41, 0x83, 0x08, 0x81, + 0x38, 0x83, 0x08, 0x81, 0x31, 0xc2, 0x00, 0xc1, 0x08, 0x81, 0x09, 0xc2, + 0x19, 0x2c, 0x08, 0x80, 0xe1, 0xc2, 0x01, 0x30, 0x08, 0x80, 0xb8, 0xc2, + 0x00, 0xd0, 0x08, 0x81, 0x29, 0x83, 0x08, 0x81, 0x21, 0x06, 0x42, 0x5a, + 0xe0, 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x19, 0x83, 0x08, 0x81, 0x11, 0x16, + 0x42, 0x5a, 0xea, 0xc2, 0x00, 0xd0, 0x08, 0x80, 0xd9, 0x83, 0x08, 0x80, + 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0x80, 0xc9, 0x83, 0x08, 0x80, 0xc0, 0xc2, + 0x00, 0xd0, 0x08, 0x80, 0xb1, 0x83, 0x08, 0x80, 0xa8, 0xc2, 0x00, 0xd0, + 0x08, 0x80, 0xa1, 0x83, 0x08, 0x80, 0x98, 0x97, 0x08, 0x80, 0x91, 0x8b, + 0x08, 0x80, 0x81, 0x83, 0x08, 0x80, 0x30, 0x47, 0xb2, 0x2e, 0xc2, 0x5a, + 0xf4, 0x83, 0x08, 0x81, 0x78, 0x97, 0x08, 0x80, 0x50, 0x8b, 0x08, 0x80, + 0x40, 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x81, 0xc2, 0x0d, 0xf6, 0x08, 0x81, + 0x89, 0x83, 0x08, 0x81, 0x90, 0x91, 0x08, 0x82, 0x23, 0x02, 0x5b, 0x02, + 0x03, 0xc2, 0x5b, 0x08, 0x87, 0x08, 0x82, 0x11, 0x48, 0xb2, 0x2d, 0xc2, + 0x5b, 0x14, 0x97, 0x08, 0x81, 0xe3, 0x02, 0x5b, 0x22, 0x8b, 0x08, 0x81, + 0xd3, 0x02, 0x5b, 0x26, 0xce, 0x6e, 0x2e, 0x08, 0x81, 0xc8, 0xc4, 0x26, + 0x78, 0x08, 0x83, 0x79, 0xc5, 0x06, 0xdb, 0x08, 0x83, 0x71, 0x15, 0xc2, + 0x5b, 0x2a, 0x08, 0xc2, 0x5b, 0x36, 0x16, 0xc2, 0x5b, 0x42, 0xc3, 0x05, + 0x14, 0x08, 0x83, 0x39, 0xc4, 0x15, 0xe7, 0x08, 0x83, 0x30, 0xc4, 0x6e, + 0x13, 0x08, 0x82, 0x69, 0xc3, 0x02, 0x6e, 0x08, 0x82, 0x58, 0xc8, 0x3a, + 0x36, 0x08, 0x82, 0x51, 0x96, 0x08, 0x82, 0x48, 0x42, 0x00, 0xbd, 0xc2, + 0x5b, 0x4e, 0xc9, 0x79, 0x79, 0x0e, 0x83, 0x90, 0xc7, 0xc3, 0x1b, 0x0e, + 0x85, 0xa9, 0xc6, 0xc5, 0x06, 0x0e, 0x85, 0xa0, 0xc4, 0x99, 0xff, 0x0e, + 0x87, 0xa1, 0xc3, 0x2e, 0xd7, 0x0e, 0x83, 0xf8, 0x44, 0xe3, 0xbb, 0xc2, + 0x5b, 0x60, 0xc8, 0x9c, 0xe0, 0x0e, 0x80, 0xd8, 0x00, 0x42, 0x5b, 0x72, + 0xc5, 0xd6, 0xa5, 0x0e, 0x82, 0x10, 0x03, 0xc2, 0x5b, 0x7e, 0x11, 0x42, + 0x5b, 0x88, 0xc3, 0x03, 0x13, 0x0e, 0x83, 0xd1, 0xc9, 0xaa, 0x68, 0x0e, + 0x81, 0xb8, 0xc2, 0x00, 0xec, 0x0e, 0x87, 0x79, 0xc2, 0x01, 0x6c, 0x0e, + 0x87, 0x71, 0xc2, 0x00, 0x3c, 0x0e, 0x87, 0x69, 0xc2, 0x01, 0xdd, 0x0e, + 0x87, 0x61, 0xc2, 0x01, 0x30, 0x0e, 0x87, 0x59, 0xc3, 0x29, 0x6f, 0x0e, + 0x87, 0x51, 0xc2, 0x00, 0xb0, 0x0e, 0x87, 0x48, 0x90, 0x0e, 0x84, 0xb9, + 0xc9, 0x79, 0x79, 0x0e, 0x83, 0x98, 0x46, 0xce, 0xab, 0xc2, 0x5b, 0x94, + 0x46, 0xcb, 0x03, 0xc2, 0x5b, 0xa1, 0xc5, 0x4c, 0x93, 0x0e, 0x81, 0x18, + 0xc6, 0xd0, 0x01, 0x0e, 0x81, 0x99, 0xca, 0x6d, 0x0c, 0x0e, 0x80, 0x68, + 0xc5, 0xd6, 0x0a, 0x0e, 0x85, 0x09, 0xc4, 0xe1, 0x7f, 0x0e, 0x84, 0xd0, + 0xc5, 0xda, 0x42, 0x0e, 0x85, 0x01, 0x8b, 0x0e, 0x84, 0xf8, 0xc2, 0x00, + 0xba, 0x0e, 0x84, 0xf1, 0xc4, 0x01, 0x92, 0x0e, 0x84, 0xe8, 0x8b, 0x0e, + 0x84, 0xe1, 
0xc5, 0xda, 0x42, 0x0e, 0x84, 0xd8, 0xc7, 0xc8, 0x93, 0x0e, + 0x83, 0x11, 0xc2, 0x01, 0xc3, 0x0e, 0x82, 0xe0, 0xc9, 0xa9, 0xe1, 0x0e, + 0x80, 0xf8, 0x00, 0x42, 0x5b, 0xad, 0x00, 0x42, 0x5b, 0xb7, 0xc4, 0xcf, + 0x8b, 0x0e, 0x80, 0x40, 0x45, 0xda, 0xe2, 0xc2, 0x5b, 0xc1, 0xc4, 0xc8, + 0x2c, 0x0e, 0x80, 0x98, 0xc8, 0xbe, 0x2a, 0x0e, 0x87, 0x31, 0xc5, 0xcf, + 0x3c, 0x0e, 0x84, 0x92, 0x02, 0x5b, 0xd3, 0x46, 0xd0, 0x07, 0xc2, 0x5b, + 0xd9, 0xc4, 0xc2, 0xa0, 0x0e, 0x84, 0xc8, 0x16, 0xc2, 0x5b, 0xeb, 0xd5, + 0x35, 0x4b, 0x0e, 0x86, 0x91, 0xdc, 0x13, 0x35, 0x0e, 0x86, 0x89, 0xd1, + 0x4f, 0x58, 0x0e, 0x86, 0x80, 0xc9, 0x9c, 0xdf, 0x0e, 0x84, 0x00, 0x43, + 0x01, 0x92, 0xc2, 0x5b, 0xf7, 0xd5, 0x35, 0x4b, 0x0e, 0x86, 0xb1, 0xdc, + 0x13, 0x35, 0x0e, 0x86, 0xa9, 0xd1, 0x4f, 0x58, 0x0e, 0x86, 0xa0, 0xc3, + 0x2e, 0xd7, 0x0e, 0x83, 0xe9, 0xc4, 0x99, 0xff, 0x0e, 0x83, 0xe0, 0xc4, + 0xde, 0x8f, 0x0e, 0x82, 0x99, 0xc6, 0xd0, 0x19, 0x0e, 0x80, 0x52, 0x02, + 0x5c, 0x03, 0xc5, 0xda, 0x1a, 0x0e, 0x86, 0x39, 0xc9, 0xb1, 0x9d, 0x0e, + 0x85, 0xe0, 0x47, 0x1a, 0x0a, 0xc2, 0x5c, 0x09, 0xcb, 0x98, 0xbb, 0x0e, + 0x85, 0xf0, 0xca, 0xa2, 0xec, 0x0e, 0x86, 0x21, 0xc8, 0xba, 0x3a, 0x0e, + 0x86, 0x18, 0x10, 0xc2, 0x5c, 0x15, 0xc2, 0x01, 0x6c, 0x0e, 0x86, 0x01, + 0xc2, 0x00, 0x3c, 0x0e, 0x85, 0xf9, 0xc2, 0x01, 0xdd, 0x0e, 0x85, 0xe9, + 0xc2, 0x00, 0xb0, 0x0e, 0x85, 0xd0, 0xcf, 0x6b, 0x07, 0x0e, 0x85, 0xc8, + 0x44, 0x3b, 0xaf, 0xc2, 0x5c, 0x21, 0xc4, 0x65, 0xea, 0x0e, 0x85, 0xb8, + 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0x31, 0xc8, 0x9c, 0xe0, 0x0e, 0x80, 0xd0, + 0x47, 0xc9, 0x11, 0xc2, 0x5c, 0x2b, 0x44, 0x89, 0x3e, 0x42, 0x5c, 0x37, + 0x48, 0x6d, 0x79, 0xc2, 0x5c, 0x43, 0x42, 0x00, 0x2c, 0x42, 0x5c, 0x4f, + 0xce, 0x6d, 0x5c, 0x0e, 0x85, 0x29, 0xcc, 0x89, 0x3d, 0x0e, 0x85, 0x18, + 0xc6, 0xcf, 0x3b, 0x0e, 0x84, 0xb1, 0xc3, 0x1f, 0x1d, 0x0e, 0x84, 0x39, + 0x83, 0x0e, 0x81, 0x80, 0xc7, 0xc8, 0x31, 0x0e, 0x83, 0x81, 0x12, 0xc2, + 0x5c, 0x5b, 0xc7, 0xc4, 0xaa, 0x0e, 0x83, 0x69, 0x42, 0x00, 0xbd, 0x42, + 0x5c, 0x67, 0xcd, 0x78, 0xe6, 0x0e, 0x83, 0xc9, 0xc2, 0x01, 0xc3, 0x0e, + 0x81, 0x6a, 0x02, 0x5c, 0x71, 0xcf, 0x68, 0x0a, 0x0e, 0x84, 0x71, 0x16, + 0xc2, 0x5c, 0x7d, 0xcb, 0x8f, 0x52, 0x0e, 0x84, 0x59, 0xcc, 0x80, 0xd9, + 0x0e, 0x84, 0x50, 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0x41, 0xc5, 0xcc, 0xcc, + 0x0e, 0x80, 0x21, 0xcb, 0x6d, 0x0b, 0x0e, 0x80, 0x18, 0xc7, 0xc8, 0x31, + 0x0e, 0x83, 0x89, 0xcb, 0x94, 0x17, 0x0e, 0x83, 0x79, 0xc7, 0xc4, 0xaa, + 0x0e, 0x83, 0x61, 0x90, 0x0e, 0x81, 0xca, 0x02, 0x5c, 0x89, 0xc2, 0x00, + 0x45, 0x0e, 0x80, 0xb9, 0x8b, 0x0e, 0x80, 0x00, 0x47, 0xc1, 0xee, 0xc2, + 0x5c, 0x8f, 0xc6, 0xcf, 0x89, 0x0e, 0x80, 0x4a, 0x02, 0x5c, 0x9b, 0xc4, + 0x77, 0x35, 0x0e, 0x82, 0x68, 0x16, 0xc2, 0x5c, 0x9f, 0xc2, 0x01, 0xc3, + 0x0e, 0x82, 0x08, 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0xc1, 0xc5, 0xcc, 0xcc, + 0x0e, 0x80, 0x31, 0xcb, 0x6d, 0x0b, 0x0e, 0x80, 0x28, 0x94, 0x08, 0xe0, + 0x38, 0xd1, 0x51, 0xbc, 0x0f, 0xdc, 0xf9, 0xc2, 0x00, 0x49, 0x01, 0x2f, + 0xd0, 0x4e, 0x60, 0x6d, 0xc2, 0x5c, 0xa9, 0xcc, 0x80, 0xf1, 0x0f, 0xac, + 0x50, 0xc9, 0xb4, 0xac, 0x0f, 0xac, 0x61, 0xc5, 0xcd, 0x8c, 0x0f, 0xac, + 0x48, 0xd1, 0x51, 0xbc, 0x0f, 0xdc, 0xf1, 0xc2, 0x00, 0x49, 0x01, 0x2f, + 0xf8, 0x4e, 0x01, 0xf4, 0xc2, 0x5c, 0xb5, 0xdb, 0x17, 0x61, 0x01, 0x49, + 0xf0, 0x5b, 0x16, 0xa4, 0xc2, 0x5c, 0xc1, 0x46, 0x01, 0xc8, 0x42, 0x5c, + 0xcd, 0xce, 0x08, 0x79, 0x01, 0x2c, 0x31, 0xcd, 0x3f, 0xe8, 0x01, 0x2c, + 0x18, 0xc9, 0xae, 0xbb, 0x01, 0x3f, 0xf0, 0xc9, 0xae, 0xbb, 0x01, 0x3f, + 0xe0, 0xc9, 0xae, 0xbb, 0x01, 0x3f, 0xe8, 0xc9, 0xae, 0xbb, 0x01, 0x3f, + 0xd8, 0xcc, 
0x82, 0x35, 0x01, 0x3f, 0xd1, 0xc5, 0x01, 0xa2, 0x01, 0x3f, + 0xb8, 0xcf, 0x64, 0xd1, 0x01, 0x52, 0xe9, 0xcb, 0x98, 0x42, 0x01, 0x52, + 0xd9, 0x42, 0x00, 0x58, 0x42, 0x5c, 0xdf, 0xc7, 0x16, 0x16, 0x01, 0x52, + 0x89, 0x45, 0x00, 0x5a, 0x42, 0x5c, 0xeb, 0x42, 0x00, 0xa9, 0xc2, 0x5c, + 0xf7, 0x09, 0x42, 0x5d, 0x09, 0xd3, 0x16, 0x91, 0x01, 0x4c, 0x99, 0x49, + 0x05, 0xcb, 0x42, 0x5d, 0x18, 0x49, 0x01, 0xd3, 0xc2, 0x5d, 0x24, 0xcc, + 0x01, 0xdb, 0x0f, 0xdc, 0x61, 0xc6, 0x02, 0xd1, 0x0f, 0xc8, 0x3b, 0x02, + 0x5d, 0x2a, 0x42, 0x00, 0x5b, 0xc2, 0x5d, 0x30, 0xcb, 0x96, 0x7f, 0x0f, + 0xdd, 0x91, 0xc6, 0x9e, 0xf4, 0x0f, 0xdd, 0xc8, 0xd0, 0x5b, 0xc2, 0x0f, + 0xc2, 0xc1, 0xd1, 0x55, 0x30, 0x01, 0x0f, 0xf9, 0xc5, 0x01, 0xa2, 0x01, + 0x0c, 0xa3, 0x02, 0x5d, 0x3c, 0xcc, 0x82, 0x35, 0x01, 0x0e, 0xa3, 0x02, + 0x5d, 0x40, 0x19, 0xc2, 0x5d, 0x46, 0xcb, 0x94, 0x22, 0x01, 0x58, 0x61, + 0xd5, 0x01, 0x92, 0x01, 0x5b, 0x20, 0xcc, 0x06, 0xdb, 0x01, 0x2c, 0x79, + 0xcd, 0x15, 0x02, 0x01, 0x2c, 0x70, 0xd1, 0x3f, 0xe4, 0x01, 0x2c, 0x49, + 0xd0, 0x05, 0xb7, 0x01, 0x16, 0x58, 0x00, 0x42, 0x5d, 0x52, 0xd3, 0x01, + 0xb4, 0x01, 0x00, 0xc1, 0xd0, 0x58, 0xd2, 0x01, 0x71, 0x30, 0x00, 0x42, + 0x5d, 0x6a, 0x44, 0x02, 0xdf, 0xc2, 0x5d, 0x7c, 0xcc, 0x86, 0xcd, 0x0f, + 0xaf, 0x61, 0xde, 0x06, 0x69, 0x0f, 0xde, 0x08, 0x44, 0x01, 0x94, 0xc2, + 0x5d, 0x88, 0xd3, 0x41, 0xf6, 0x01, 0x70, 0x48, 0xd0, 0x4a, 0x77, 0x01, + 0x2c, 0x59, 0xc7, 0xb2, 0xec, 0x01, 0x4b, 0xe0, 0xca, 0xa2, 0x74, 0x01, + 0x1c, 0xe9, 0xc9, 0x57, 0x36, 0x01, 0x1c, 0xe1, 0xca, 0xa3, 0x5a, 0x01, + 0x1c, 0xd8, 0xce, 0x01, 0xb9, 0x01, 0x00, 0xe1, 0xcc, 0x8a, 0x09, 0x01, + 0x4e, 0xd1, 0xcb, 0x1a, 0x50, 0x01, 0x71, 0x41, 0xcd, 0x0b, 0x91, 0x01, + 0x80, 0x50, 0xcb, 0x1a, 0x50, 0x01, 0x4c, 0x29, 0x05, 0xc2, 0x5d, 0x94, + 0xd2, 0x21, 0x89, 0x01, 0x80, 0xb1, 0xd6, 0x08, 0x88, 0x01, 0x80, 0xc1, + 0xce, 0x25, 0xad, 0x01, 0x80, 0xd0, 0xd6, 0x08, 0x88, 0x01, 0x4c, 0xb9, + 0xd2, 0x21, 0x89, 0x01, 0x80, 0x80, 0x50, 0x58, 0xb2, 0xc2, 0x5d, 0xa0, + 0x4e, 0x6c, 0x36, 0x42, 0x5d, 0xac, 0xda, 0x1b, 0xd0, 0x0f, 0xc4, 0xa0, + 0x45, 0x01, 0x95, 0xc2, 0x5d, 0xb8, 0x44, 0x0b, 0x26, 0x42, 0x5d, 0xc4, + 0xcd, 0x7e, 0x3b, 0x01, 0x0c, 0xf1, 0x48, 0x01, 0x9a, 0x42, 0x5d, 0xd0, + 0x45, 0x00, 0x8c, 0xc2, 0x5d, 0xdc, 0x16, 0xc2, 0x5e, 0x12, 0xd5, 0x10, + 0x87, 0x01, 0x0e, 0x31, 0xc8, 0xae, 0xbc, 0x01, 0x0d, 0x23, 0x02, 0x5e, + 0x1e, 0x03, 0x42, 0x5e, 0x24, 0xc5, 0x01, 0xa2, 0x01, 0x0e, 0x83, 0x02, + 0x5e, 0x30, 0xca, 0x52, 0xc2, 0x01, 0x48, 0x60, 0xcb, 0x6f, 0xff, 0x01, + 0x0e, 0xe1, 0xca, 0x88, 0xdf, 0x0f, 0xc1, 0xc0, 0x46, 0x01, 0x52, 0xc2, + 0x5e, 0x3a, 0xc2, 0x02, 0x35, 0x0f, 0xd7, 0x90, 0xd0, 0x58, 0x62, 0x0f, + 0xc2, 0x01, 0xc5, 0x01, 0xa2, 0x0f, 0xc2, 0x20, 0xc5, 0x01, 0xa2, 0x01, + 0x58, 0x29, 0xd3, 0x43, 0xe4, 0x01, 0x5c, 0x40, 0xca, 0x50, 0x5e, 0x00, + 0x7e, 0xc0, 0xca, 0x37, 0x4e, 0x01, 0x13, 0x91, 0xc5, 0x07, 0x62, 0x01, + 0x13, 0x20, 0x4a, 0x33, 0xad, 0x42, 0x5e, 0x46, 0xe0, 0x09, 0xc7, 0x01, + 0x54, 0x58, 0x47, 0xc7, 0x35, 0xc2, 0x5e, 0x55, 0x53, 0x40, 0x1b, 0x42, + 0x5e, 0x61, 0xe0, 0x07, 0x07, 0x01, 0x54, 0x88, 0xc2, 0x00, 0xd0, 0x00, + 0xe2, 0x71, 0x83, 0x00, 0xe2, 0x68, 0xc2, 0x00, 0xd0, 0x00, 0xe0, 0xc1, + 0x83, 0x00, 0xe0, 0xb8, 0xc7, 0xc0, 0x97, 0x00, 0xe1, 0xf0, 0xd2, 0x4d, + 0x57, 0x0f, 0xbd, 0xa9, 0xc6, 0x13, 0x52, 0x0f, 0xbd, 0x49, 0xc4, 0x01, + 0xe3, 0x01, 0x2c, 0x88, 0x44, 0x00, 0x2d, 0xc2, 0x5e, 0x67, 0xc3, 0x14, + 0xa7, 0x0f, 0xb4, 0x40, 0xe0, 0x08, 0x87, 0x01, 0x3b, 0x90, 0x52, 0x11, + 0x92, 0xc2, 0x5e, 0x6d, 0x44, 0x0d, 0x14, 0x42, 0x5e, 0x79, 0xd7, 0x2a, + 0xb0, 0x0f, 
0xbe, 0x01, 0xd8, 0x22, 0x43, 0x0f, 0xbe, 0x90, 0xc7, 0x6f, + 0xbc, 0x0f, 0xaf, 0x88, 0x83, 0x05, 0x26, 0xe9, 0xc2, 0x00, 0xd0, 0x05, + 0x26, 0xf0, 0x44, 0x5d, 0xb5, 0xc2, 0x5e, 0x85, 0xc5, 0xdb, 0x87, 0x05, + 0x27, 0xc8, 0xc4, 0xb2, 0xf8, 0x00, 0x04, 0x50, 0xd6, 0x2e, 0xd8, 0x01, + 0x50, 0xa1, 0x45, 0x00, 0x8c, 0x42, 0x5e, 0xa3, 0x24, 0xc2, 0x5e, 0xaf, + 0x23, 0xc2, 0x5e, 0xc3, 0x42, 0xe5, 0x28, 0xc2, 0x5e, 0xdf, 0x04, 0xc2, + 0x5e, 0xff, 0xc4, 0xe4, 0xb7, 0x08, 0x30, 0xd9, 0x1e, 0xc2, 0x5f, 0x07, + 0x20, 0xc2, 0x5f, 0x19, 0x21, 0xc2, 0x5f, 0x39, 0x22, 0x42, 0x5f, 0x41, + 0x42, 0x00, 0x91, 0xc2, 0x5f, 0x69, 0x49, 0xa8, 0xca, 0xc2, 0x5f, 0x75, + 0x4a, 0xa2, 0xd8, 0x42, 0x5f, 0x7f, 0xc4, 0x18, 0x10, 0x00, 0xca, 0x69, + 0xc2, 0x22, 0xcc, 0x00, 0xca, 0x60, 0xc3, 0x0d, 0x14, 0x00, 0xca, 0x59, + 0xc3, 0x09, 0x9e, 0x00, 0xca, 0x50, 0xc4, 0x02, 0xde, 0x00, 0xca, 0x49, + 0xc2, 0x02, 0xa0, 0x00, 0xca, 0x40, 0xc3, 0x15, 0x31, 0x00, 0xca, 0x01, + 0xc4, 0xdf, 0x0f, 0x00, 0xc9, 0xd9, 0xc9, 0xac, 0xc3, 0x00, 0xc9, 0xd1, + 0xc9, 0xa9, 0x87, 0x00, 0xc9, 0xc8, 0xc2, 0x00, 0xdb, 0x00, 0xc9, 0xc1, + 0xc2, 0x00, 0x39, 0x00, 0xc9, 0xb9, 0xc2, 0x01, 0xc3, 0x00, 0xc9, 0xb1, + 0xc2, 0x00, 0xb0, 0x00, 0xc9, 0xa9, 0x10, 0xc2, 0x5f, 0x89, 0xc2, 0x01, + 0x6f, 0x00, 0xc9, 0x99, 0xc8, 0x14, 0x38, 0x00, 0xc9, 0x91, 0xc2, 0x02, + 0x2b, 0x00, 0xc9, 0x80, 0xc2, 0x01, 0x4a, 0x00, 0xc9, 0x59, 0xc2, 0x00, + 0x39, 0x00, 0xc9, 0x51, 0xc2, 0x19, 0x2c, 0x00, 0xc9, 0x48, 0x91, 0x00, + 0xc9, 0x43, 0x02, 0x5f, 0x93, 0x87, 0x00, 0xc9, 0x3b, 0x02, 0x5f, 0x97, + 0x83, 0x00, 0xc9, 0x03, 0x02, 0x5f, 0x9b, 0x97, 0x00, 0xc9, 0x11, 0x8b, + 0x00, 0xc9, 0x08, 0xc2, 0x00, 0x39, 0x00, 0xc8, 0xf1, 0xc2, 0x00, 0xd0, + 0x00, 0xc8, 0x61, 0x83, 0x00, 0xc8, 0x58, 0xc3, 0x2e, 0x0f, 0x00, 0xc8, + 0xe9, 0xc2, 0x00, 0xd0, 0x00, 0xc8, 0x21, 0x83, 0x00, 0xc8, 0x18, 0x83, + 0x00, 0xc8, 0xd9, 0xc2, 0x0d, 0xf6, 0x00, 0xc8, 0xd1, 0xc2, 0x00, 0xd0, + 0x00, 0xc8, 0xc8, 0x90, 0x00, 0xc8, 0x50, 0xc2, 0x00, 0xd0, 0x00, 0xc8, + 0x99, 0x83, 0x00, 0xc8, 0x90, 0xc2, 0x00, 0xd0, 0x00, 0xc8, 0x89, 0x83, + 0x00, 0xc8, 0x80, 0x83, 0x00, 0xc8, 0x79, 0xc2, 0x01, 0x30, 0x00, 0xc8, + 0x28, 0xc2, 0x00, 0xd0, 0x00, 0xc8, 0x71, 0x83, 0x00, 0xc8, 0x68, 0xc2, + 0x00, 0xd0, 0x00, 0xc8, 0x49, 0x83, 0x00, 0xc8, 0x40, 0xc2, 0x00, 0xd0, + 0x00, 0xc8, 0x39, 0x83, 0x00, 0xc8, 0x30, 0xc2, 0x00, 0xd0, 0x00, 0xc8, + 0x11, 0x83, 0x00, 0xc8, 0x08, 0x45, 0xdc, 0x72, 0xc2, 0x5f, 0xa3, 0x44, + 0x87, 0x22, 0x42, 0x5f, 0xaf, 0xc6, 0x0b, 0x09, 0x0f, 0xbf, 0x29, 0xc6, + 0x02, 0xd1, 0x0f, 0xa9, 0xa0, 0xc6, 0x02, 0xd1, 0x0f, 0xbf, 0x11, 0xc6, + 0x0b, 0x09, 0x0f, 0xbf, 0x48, 0x43, 0x02, 0x6f, 0xc2, 0x5f, 0xc1, 0x46, + 0x19, 0x02, 0x42, 0x5f, 0xcd, 0x43, 0x02, 0xa0, 0xc2, 0x5f, 0xdf, 0xdb, + 0x18, 0x54, 0x01, 0x57, 0xe0, 0xc6, 0x02, 0xd1, 0x0f, 0xbf, 0x09, 0xc6, + 0x0b, 0x09, 0x0f, 0xbf, 0x40, 0xc6, 0x02, 0xd1, 0x0f, 0xbf, 0x19, 0xc6, + 0x0b, 0x09, 0x0f, 0xbf, 0x50, 0x46, 0x02, 0x0f, 0xc2, 0x5f, 0xeb, 0x48, + 0x19, 0x9b, 0x42, 0x60, 0xa1, 0xcd, 0x78, 0x57, 0x00, 0xeb, 0xf1, 0xcd, + 0x7b, 0x63, 0x00, 0xeb, 0xd8, 0xc4, 0x74, 0x82, 0x01, 0x04, 0xa0, 0x96, + 0x00, 0xe8, 0xdb, 0x02, 0x60, 0xbd, 0x8e, 0x00, 0x14, 0xfb, 0x02, 0x60, + 0xc3, 0x87, 0x00, 0xe8, 0x3b, 0x02, 0x60, 0xc9, 0x9c, 0x00, 0xe9, 0x11, + 0x99, 0x00, 0xe9, 0x09, 0x98, 0x00, 0xe9, 0x01, 0x97, 0x00, 0xe8, 0xe1, + 0x94, 0x00, 0x14, 0x03, 0x02, 0x60, 0xd5, 0x92, 0x00, 0xe8, 0xc1, 0x91, + 0x00, 0xe8, 0x7b, 0x02, 0x60, 0xe7, 0x8f, 0x00, 0xe8, 0x69, 0x8d, 0x00, + 0xe8, 0x59, 0x8c, 0x00, 0xe8, 0x51, 0x86, 0x00, 0xe8, 0x29, 0x85, 0x00, + 0xe8, 0x21, 
0x84, 0x00, 0x14, 0xcb, 0x02, 0x60, 0xf5, 0x83, 0x00, 0xe8, + 0x03, 0x02, 0x60, 0xfb, 0x89, 0x00, 0x13, 0x13, 0x02, 0x60, 0xff, 0x8b, + 0x00, 0x13, 0x53, 0x02, 0x61, 0x05, 0x90, 0x00, 0x13, 0xa1, 0x9b, 0x00, + 0x14, 0x79, 0x8a, 0x00, 0x14, 0xe1, 0x88, 0x05, 0x39, 0x81, 0x95, 0x05, + 0x39, 0x89, 0x93, 0x05, 0x3d, 0x78, 0xca, 0x45, 0x1d, 0x0e, 0xf8, 0x78, + 0xc4, 0x00, 0x32, 0x0e, 0xf8, 0x71, 0xc6, 0x01, 0x73, 0x00, 0x0d, 0xf0, + 0xd4, 0x01, 0x13, 0x0e, 0xf8, 0x50, 0xd8, 0x23, 0x33, 0x00, 0x15, 0x11, + 0xc8, 0xba, 0xda, 0x00, 0x0d, 0x50, 0xc5, 0x01, 0x0e, 0x00, 0x14, 0xc1, + 0xca, 0x54, 0x9e, 0x00, 0x15, 0x60, 0x9b, 0x00, 0x02, 0xcb, 0x02, 0x61, + 0x0b, 0x8f, 0x00, 0x02, 0x6b, 0x02, 0x61, 0x17, 0x97, 0x00, 0x02, 0xab, + 0x02, 0x61, 0x23, 0x91, 0x00, 0x02, 0x7b, 0x02, 0x61, 0x2d, 0x8b, 0x00, + 0x02, 0x4b, 0x02, 0x61, 0x51, 0x87, 0x00, 0x02, 0x2b, 0x02, 0x61, 0x67, + 0x83, 0x00, 0x02, 0x0b, 0x02, 0x61, 0x8f, 0x95, 0x00, 0x02, 0x9b, 0x02, + 0x61, 0xc5, 0x9c, 0x00, 0x02, 0xd3, 0x02, 0x61, 0xe7, 0x9a, 0x00, 0x02, + 0xc3, 0x02, 0x61, 0xed, 0x99, 0x00, 0x02, 0xbb, 0x02, 0x61, 0xf3, 0x98, + 0x00, 0x02, 0xb3, 0x02, 0x61, 0xff, 0x96, 0x00, 0x02, 0xa3, 0x02, 0x62, + 0x1b, 0x94, 0x00, 0x02, 0x93, 0x02, 0x62, 0x40, 0x92, 0x00, 0x02, 0x83, + 0x02, 0x62, 0x50, 0x90, 0x00, 0x02, 0x73, 0x02, 0x62, 0x56, 0x8e, 0x00, + 0x02, 0x63, 0x02, 0x62, 0x60, 0x8d, 0x00, 0x02, 0x5b, 0x02, 0x62, 0x6a, + 0x8a, 0x00, 0x02, 0x43, 0x02, 0x62, 0x70, 0x89, 0x00, 0x02, 0x3b, 0x02, + 0x62, 0x88, 0x88, 0x00, 0x02, 0x33, 0x02, 0x62, 0xa0, 0x86, 0x00, 0x02, + 0x23, 0x02, 0x62, 0xa6, 0x85, 0x00, 0x02, 0x1b, 0x02, 0x62, 0xb3, 0x84, + 0x00, 0x02, 0x13, 0x02, 0x62, 0xd4, 0x8c, 0x00, 0x02, 0x53, 0x02, 0x62, + 0xe6, 0x93, 0x00, 0x02, 0x8a, 0x02, 0x62, 0xec, 0xc2, 0x00, 0x0b, 0x00, + 0x09, 0x91, 0xc2, 0x49, 0x0c, 0x00, 0x0a, 0x90, 0x42, 0x01, 0x7c, 0xc2, + 0x62, 0xf2, 0x43, 0xe5, 0xc3, 0x42, 0x62, 0xfe, 0xc3, 0x91, 0x00, 0x00, + 0x74, 0x31, 0xc3, 0x1c, 0x63, 0x00, 0x74, 0x49, 0xc3, 0xe5, 0xf0, 0x00, + 0x74, 0x61, 0x10, 0xc2, 0x63, 0x0a, 0x42, 0x02, 0x10, 0xc2, 0x63, 0x16, + 0x06, 0xc2, 0x63, 0x20, 0xc3, 0x39, 0x6d, 0x00, 0x75, 0x01, 0xc3, 0x12, + 0xad, 0x00, 0x75, 0x60, 0xc4, 0xdf, 0x43, 0x00, 0x74, 0xe1, 0xc3, 0x02, + 0x45, 0x00, 0x74, 0xf0, 0xc3, 0x02, 0x45, 0x00, 0x74, 0x51, 0xc4, 0xdf, + 0x43, 0x00, 0x75, 0x50, 0xc2, 0x00, 0xd0, 0x00, 0x75, 0x41, 0xc2, 0x0d, + 0xf6, 0x00, 0x75, 0x48, 0xc4, 0xdf, 0x43, 0x00, 0x74, 0xb1, 0xc3, 0x02, + 0x45, 0x00, 0x74, 0xb8, 0xc2, 0x00, 0x45, 0x00, 0x74, 0xe9, 0xc2, 0x0c, + 0x42, 0x00, 0x74, 0xf8, 0xc3, 0x00, 0x74, 0x00, 0x75, 0x19, 0xc3, 0x65, + 0xba, 0x00, 0x75, 0x28, 0xd1, 0x51, 0xbc, 0x0f, 0xdc, 0xe9, 0xc2, 0x00, + 0x49, 0x01, 0x2f, 0xc8, 0x55, 0x0a, 0x4c, 0xc2, 0x63, 0x2a, 0x48, 0x0a, + 0x53, 0xc2, 0x63, 0x3c, 0x4a, 0x13, 0xe3, 0x42, 0x63, 0x48, 0xc6, 0x04, + 0xe1, 0x0f, 0xda, 0x91, 0xc5, 0x00, 0x2c, 0x0f, 0xda, 0x98, 0xd1, 0x51, + 0xbc, 0x0f, 0xdc, 0xe1, 0xc2, 0x00, 0x49, 0x01, 0x2f, 0xc0, 0xc6, 0x04, + 0xe1, 0x0f, 0xda, 0xb9, 0xc5, 0x00, 0x2c, 0x0f, 0xda, 0xc0, 0x55, 0x16, + 0xaa, 0xc2, 0x63, 0x54, 0x48, 0x0a, 0x53, 0xc2, 0x63, 0x66, 0x4a, 0x13, + 0xe3, 0x42, 0x63, 0x72, 0xd5, 0x35, 0x60, 0x0f, 0xdc, 0xd1, 0xd0, 0x06, + 0xd7, 0x0f, 0xdc, 0x00, 0xe0, 0x08, 0x67, 0x0f, 0xdb, 0x50, 0xe0, 0x0a, + 0x27, 0x0f, 0xdc, 0x90, 0xe0, 0x01, 0xc7, 0x0f, 0xdc, 0x88, 0xd9, 0x1b, + 0xd1, 0x0f, 0xc4, 0xa9, 0xcb, 0x8a, 0x46, 0x01, 0x0f, 0x5b, 0x02, 0x63, + 0x7e, 0xc8, 0xae, 0xbc, 0x01, 0x0f, 0x52, 0x02, 0x63, 0x84, 0xca, 0x03, + 0xdd, 0x0f, 0xc4, 0x89, 0x48, 0x01, 0x9a, 0x42, 0x63, 0x8a, 0xd1, 0x53, + 0x98, 0x01, 
0x4a, 0x49, 0xd8, 0x05, 0xcf, 0x01, 0x5f, 0x68, 0x45, 0x00, + 0x8c, 0xc2, 0x63, 0x9f, 0xdc, 0x14, 0x15, 0x01, 0x0e, 0x29, 0xc8, 0xae, + 0xbc, 0x01, 0x0d, 0x29, 0xc6, 0x10, 0x9d, 0x01, 0x48, 0x91, 0xda, 0x1c, + 0x1e, 0x0f, 0xdd, 0xc0, 0xc5, 0x01, 0x4a, 0x01, 0x0d, 0xf9, 0x00, 0x42, + 0x63, 0xcf, 0xc5, 0x01, 0x4a, 0x01, 0x0d, 0xf1, 0x00, 0x42, 0x63, 0xe1, + 0xdb, 0x15, 0xb1, 0x01, 0x19, 0x21, 0xd2, 0x46, 0x6b, 0x01, 0x5d, 0xc8, + 0xd6, 0x31, 0x98, 0x01, 0x52, 0x41, 0xcc, 0x06, 0xbb, 0x01, 0x52, 0x30, + 0xca, 0xa4, 0xcc, 0x01, 0x52, 0x29, 0xc7, 0x80, 0x70, 0x01, 0x52, 0x11, + 0xca, 0x8d, 0xb1, 0x01, 0x52, 0x08, 0xcf, 0x15, 0x36, 0x0f, 0xbd, 0xf1, + 0x42, 0x00, 0xac, 0xc2, 0x63, 0xed, 0x48, 0x0a, 0xa9, 0x42, 0x63, 0xf3, + 0xc8, 0x00, 0xbf, 0x01, 0x3b, 0x11, 0xc6, 0x00, 0x91, 0x01, 0x3a, 0xb8, + 0xc6, 0x02, 0xd1, 0x0f, 0xbc, 0x39, 0xd6, 0x2e, 0xac, 0x01, 0x36, 0xd9, + 0xc6, 0x0b, 0x09, 0x0f, 0xbc, 0x88, 0xdd, 0x10, 0xc0, 0x0f, 0xb3, 0xd9, + 0xc5, 0x13, 0x53, 0x0f, 0xbd, 0x60, 0x4e, 0x47, 0x15, 0xc2, 0x64, 0x05, + 0x45, 0x20, 0x6c, 0x42, 0x64, 0x11, 0x45, 0x01, 0xb4, 0xc2, 0x64, 0x1d, + 0x42, 0x01, 0x0c, 0x42, 0x64, 0x29, 0x49, 0x01, 0xaa, 0xc2, 0x64, 0x35, + 0xc5, 0x01, 0xa2, 0x01, 0x3c, 0xd0, 0xc3, 0xe5, 0x8a, 0x0f, 0xb3, 0x21, + 0xc9, 0xb4, 0x91, 0x0f, 0xb2, 0xe0, 0xc9, 0x8e, 0x0a, 0x0f, 0xaa, 0x39, + 0xca, 0x9c, 0x48, 0x01, 0x5a, 0xa8, 0x48, 0x00, 0x29, 0xc2, 0x64, 0x41, + 0x00, 0x42, 0x64, 0x47, 0x50, 0x01, 0xa9, 0xc2, 0x64, 0x53, 0x51, 0x08, + 0xa9, 0x42, 0x64, 0x5f, 0xd7, 0x28, 0x88, 0x01, 0x3d, 0xd9, 0x46, 0x0a, + 0xef, 0x42, 0x64, 0x6b, 0xca, 0x22, 0x51, 0x0f, 0xbe, 0x99, 0xcd, 0x0e, + 0x61, 0x0f, 0xbe, 0xa0, 0x4b, 0x14, 0xd9, 0xc2, 0x64, 0x77, 0x00, 0x42, + 0x64, 0x89, 0xe0, 0x0c, 0x07, 0x01, 0x3d, 0x70, 0xd5, 0x03, 0xd2, 0x0f, + 0xc0, 0xc9, 0xdb, 0x17, 0x46, 0x0f, 0xc0, 0xe8, 0xe0, 0x0a, 0xa7, 0x01, + 0x3d, 0x40, 0xce, 0x6c, 0x60, 0x01, 0x3a, 0x31, 0xc7, 0xa7, 0xc7, 0x01, + 0x38, 0xa0, 0x46, 0x00, 0x8b, 0xc2, 0x64, 0x95, 0xc9, 0xb2, 0x48, 0x01, + 0x5a, 0xc8, 0xe0, 0x03, 0xa7, 0x01, 0x3d, 0x00, 0x45, 0x00, 0x5a, 0xc2, + 0x64, 0xa1, 0xc9, 0x99, 0x62, 0x0f, 0xa5, 0x91, 0x53, 0x08, 0xa7, 0x42, + 0x64, 0xad, 0xcb, 0x03, 0xbc, 0x01, 0x3c, 0xcb, 0x02, 0x64, 0xb9, 0x50, + 0x01, 0xa9, 0x42, 0x64, 0xbf, 0xc3, 0x05, 0x14, 0x0f, 0xc4, 0xe3, 0x02, + 0x64, 0xcb, 0xca, 0x9d, 0x2e, 0x0f, 0xc4, 0xe8, 0xcf, 0x15, 0x36, 0x0f, + 0xbd, 0x91, 0xd2, 0x22, 0x49, 0x0f, 0xbe, 0x50, 0xc6, 0x7c, 0x7b, 0x0f, + 0xa4, 0xe9, 0xc5, 0x01, 0xa2, 0x0f, 0xa4, 0xc1, 0xcf, 0x64, 0x68, 0x0f, + 0x9c, 0xa0, 0x9e, 0x0d, 0x85, 0x41, 0x9d, 0x0d, 0x85, 0x38, 0x9e, 0x0d, + 0x81, 0x09, 0x9d, 0x0d, 0x81, 0x00, 0xcd, 0x79, 0xb6, 0x07, 0xd8, 0xf9, + 0x47, 0x00, 0x58, 0xc2, 0x64, 0xcf, 0xc7, 0xc1, 0xaf, 0x00, 0x2f, 0x88, + 0x46, 0x00, 0x8b, 0x42, 0x64, 0xdb, 0x46, 0x00, 0x8b, 0x42, 0x64, 0xe7, + 0x46, 0x00, 0x8b, 0x42, 0x64, 0xf3, 0x46, 0x00, 0x8b, 0x42, 0x64, 0xff, + 0xc2, 0x04, 0xad, 0x00, 0x2f, 0x53, 0x02, 0x65, 0x0b, 0xc4, 0xd4, 0xda, + 0x00, 0x2f, 0x33, 0x02, 0x65, 0x11, 0xc2, 0x00, 0x3d, 0x00, 0x2e, 0xc2, + 0x02, 0x65, 0x17, 0xc3, 0x11, 0xef, 0x00, 0x2f, 0x4b, 0x02, 0x65, 0x1d, + 0xc5, 0xdc, 0x2c, 0x00, 0x2f, 0x0a, 0x02, 0x65, 0x23, 0xcc, 0x84, 0x75, + 0x07, 0xda, 0x40, 0xcc, 0x84, 0x75, 0x07, 0xda, 0x38, 0xc2, 0x00, 0x67, + 0x00, 0x2f, 0x1b, 0x02, 0x65, 0x29, 0xc3, 0xba, 0x37, 0x00, 0x2e, 0xd3, + 0x02, 0x65, 0x2f, 0xc5, 0xd4, 0xd9, 0x00, 0x2f, 0x29, 0xc3, 0x20, 0x18, + 0x00, 0x2e, 0xf9, 0xc3, 0x00, 0x4e, 0x00, 0x2e, 0xe8, 0xcc, 0x84, 0x75, + 0x07, 0xda, 0x00, 0xcc, 0x84, 0x75, 0x07, 0xd9, 0xf0, 0xcc, 0x84, 0x75, + 0x07, 0xd9, 
0xe0, 0x46, 0x00, 0x8b, 0x42, 0x65, 0x35, 0xcc, 0x84, 0x75, + 0x07, 0xd9, 0xb0, 0xcb, 0x91, 0xa4, 0x07, 0xd9, 0xa1, 0x96, 0x00, 0x2e, + 0xb8, 0xcc, 0x84, 0x75, 0x07, 0xd9, 0x98, 0xcc, 0x84, 0x75, 0x07, 0xd9, + 0x90, 0x0e, 0xc2, 0x65, 0x41, 0xc3, 0x16, 0x5a, 0x00, 0x2f, 0x10, 0xc3, + 0x22, 0x14, 0x07, 0xd9, 0x41, 0xc4, 0x5d, 0xe2, 0x07, 0xd9, 0x39, 0xc9, + 0xb4, 0xb5, 0x07, 0xd9, 0x31, 0xc5, 0xa2, 0x83, 0x07, 0xd9, 0x29, 0xc3, + 0xba, 0x37, 0x07, 0xd9, 0x21, 0xc2, 0x01, 0x7f, 0x07, 0xd9, 0x19, 0xc5, + 0x40, 0x9a, 0x07, 0xd9, 0x11, 0xc4, 0x06, 0x5a, 0x07, 0xd9, 0x08, 0xc5, + 0xcc, 0xe4, 0x00, 0x2d, 0xc3, 0x02, 0x65, 0x50, 0xc5, 0xd8, 0xfd, 0x00, + 0x2d, 0xd8, 0xc6, 0x44, 0x50, 0x00, 0x2e, 0x11, 0x0a, 0xc2, 0x65, 0x56, + 0xc4, 0xa0, 0x89, 0x00, 0x2d, 0xb0, 0xc4, 0xd5, 0xa7, 0x00, 0x2d, 0xcb, + 0x02, 0x65, 0x62, 0xc4, 0xd5, 0x84, 0x00, 0x2d, 0xa1, 0x45, 0xd5, 0xb5, + 0x42, 0x65, 0x68, 0xc6, 0xcb, 0x63, 0x00, 0x2f, 0xa1, 0xc3, 0x26, 0x1a, + 0x00, 0x2f, 0x98, 0xc3, 0x0f, 0x99, 0x00, 0x2c, 0xc1, 0x44, 0xe3, 0xeb, + 0x42, 0x65, 0x7a, 0x46, 0xcf, 0x7d, 0xc2, 0x65, 0x86, 0xc3, 0x1e, 0x95, + 0x00, 0x2c, 0xd8, 0xc7, 0xc5, 0xad, 0x00, 0x2c, 0xe8, 0xc7, 0xc5, 0xfa, + 0x00, 0x2d, 0x30, 0xce, 0x73, 0xc2, 0x02, 0x6e, 0x01, 0xcc, 0x83, 0x31, + 0x02, 0x6e, 0xe9, 0xc7, 0xc4, 0x8e, 0x02, 0x6f, 0x88, 0x14, 0xc2, 0x65, + 0x92, 0xcc, 0x8b, 0xf5, 0x02, 0x6e, 0xe0, 0xc3, 0x08, 0x93, 0x02, 0x6f, + 0x79, 0xc7, 0xc8, 0xf5, 0x02, 0x6f, 0xb8, 0x12, 0xc2, 0x65, 0x9e, 0xc6, + 0xd3, 0x37, 0x02, 0x6e, 0xc8, 0xc7, 0xc9, 0x3b, 0x01, 0x5e, 0x19, 0xc7, + 0xc2, 0xc0, 0x01, 0x59, 0x18, 0xc7, 0x33, 0xdf, 0x00, 0x00, 0x4b, 0x02, + 0x65, 0xa8, 0xc4, 0x3b, 0x19, 0x01, 0x5b, 0xf0, 0x95, 0x0f, 0x9e, 0xc0, + 0xc4, 0x18, 0x10, 0x08, 0x69, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0x69, 0xb0, + 0xc3, 0x0d, 0x14, 0x08, 0x69, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0x69, 0xa0, + 0xc4, 0x02, 0xde, 0x08, 0x69, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0x69, 0x90, + 0xc3, 0x0d, 0x23, 0x08, 0x69, 0x39, 0xc2, 0x00, 0xc1, 0x08, 0x69, 0x31, + 0xc4, 0x75, 0x13, 0x08, 0x69, 0x28, 0xc2, 0x19, 0x2c, 0x08, 0x68, 0xd9, + 0xc2, 0x01, 0x30, 0x08, 0x68, 0xd1, 0x83, 0x08, 0x68, 0xa8, 0x45, 0xd4, + 0x11, 0xc2, 0x65, 0xac, 0x83, 0x08, 0x68, 0x89, 0xc2, 0x00, 0xd0, 0x08, + 0x68, 0x40, 0xc2, 0x00, 0x39, 0x08, 0x68, 0x69, 0x83, 0x08, 0x68, 0x60, + 0xc2, 0x0e, 0x9a, 0x08, 0x68, 0x59, 0x83, 0x08, 0x68, 0x50, 0xc2, 0x01, + 0x6f, 0x08, 0x68, 0x21, 0x83, 0x08, 0x68, 0x18, 0x83, 0x08, 0x68, 0x79, + 0xc2, 0x00, 0xd0, 0x08, 0x68, 0x80, 0x83, 0x00, 0xb9, 0x41, 0xc2, 0x01, + 0x30, 0x00, 0xb9, 0x28, 0xc5, 0xd6, 0x8c, 0x00, 0x88, 0x2b, 0x02, 0x65, + 0xb8, 0x15, 0xc2, 0x65, 0xbc, 0xc5, 0x90, 0xe4, 0x00, 0x88, 0x93, 0x02, + 0x65, 0xcb, 0x12, 0xc2, 0x65, 0xd1, 0xc5, 0xb7, 0x9d, 0x00, 0x88, 0x5b, + 0x02, 0x65, 0xe9, 0xc5, 0xda, 0xe7, 0x00, 0x88, 0x33, 0x02, 0x65, 0xed, + 0x16, 0xc2, 0x65, 0xf1, 0x0d, 0xc2, 0x66, 0x00, 0xc5, 0xd9, 0x61, 0x00, + 0x88, 0x13, 0x02, 0x66, 0x15, 0x05, 0xc2, 0x66, 0x19, 0x42, 0x0c, 0x43, + 0xc2, 0x66, 0x2e, 0xc6, 0x92, 0x0c, 0x00, 0x8a, 0xf8, 0x49, 0xb4, 0x76, + 0xc2, 0x66, 0x3a, 0x49, 0xad, 0x02, 0x42, 0x66, 0x71, 0x0d, 0xc2, 0x66, + 0xb8, 0x15, 0xc2, 0x66, 0xcd, 0xc5, 0xd9, 0x61, 0x01, 0x89, 0xa3, 0x02, + 0x66, 0xdc, 0x16, 0xc2, 0x66, 0xe0, 0xc5, 0xd6, 0x8c, 0x01, 0x89, 0xcb, + 0x02, 0x66, 0xec, 0xc5, 0xda, 0xe7, 0x01, 0x8a, 0x0b, 0x02, 0x66, 0xf0, + 0x12, 0xc2, 0x66, 0xf4, 0x8b, 0x01, 0x8b, 0x1b, 0x02, 0x67, 0x09, 0x05, + 0xc2, 0x67, 0x0f, 0xc5, 0x90, 0xe4, 0x01, 0x8a, 0x71, 0x83, 0x01, 0x8a, + 0x7b, 0x02, 0x67, 0x1b, 0x1b, 0xc2, 0x67, 0x28, 0x87, 0x01, 0x8a, 0xa3, + 0x02, 0x67, 
0x48, 0x91, 0x01, 0x8a, 0xbb, 0x02, 0x67, 0x50, 0x19, 0xc2, + 0x67, 0x54, 0x97, 0x01, 0x8a, 0xe0, 0x19, 0xc2, 0x67, 0x66, 0x0a, 0xc2, + 0x67, 0x70, 0xc2, 0x00, 0xc4, 0x01, 0x81, 0xc0, 0xc3, 0x09, 0x9e, 0x01, + 0x81, 0x21, 0xc3, 0x0d, 0x14, 0x01, 0x81, 0x28, 0xc2, 0x22, 0xcc, 0x01, + 0x81, 0x31, 0xc4, 0x18, 0x10, 0x01, 0x81, 0x38, 0xc8, 0x0d, 0x03, 0x08, + 0x47, 0xf8, 0xc5, 0x28, 0xee, 0x08, 0x47, 0xf1, 0xc2, 0x00, 0xc4, 0x08, + 0x47, 0xe8, 0xc2, 0x39, 0x8b, 0x08, 0x47, 0xa9, 0xc3, 0x1e, 0x1b, 0x08, + 0x47, 0x40, 0xc3, 0x11, 0xef, 0x08, 0x47, 0xa1, 0x03, 0x42, 0x67, 0x7c, + 0xc2, 0x17, 0xb6, 0x08, 0x47, 0x79, 0xc4, 0x36, 0xb5, 0x08, 0x47, 0x00, + 0xc2, 0x00, 0x8e, 0x08, 0x47, 0x38, 0x19, 0xc2, 0x67, 0x88, 0x15, 0xc2, + 0x67, 0x90, 0x83, 0x07, 0xfb, 0x89, 0x8b, 0x07, 0xfb, 0x91, 0x97, 0x07, + 0xfb, 0x99, 0x87, 0x07, 0xfb, 0xa1, 0x91, 0x07, 0xfb, 0xa9, 0x0d, 0xc2, + 0x67, 0xaa, 0x16, 0xc2, 0x67, 0xbe, 0x90, 0x07, 0xfc, 0xeb, 0x02, 0x67, + 0xd2, 0x0a, 0xc2, 0x67, 0xe6, 0x0f, 0xc2, 0x67, 0xfa, 0x1b, 0xc2, 0x68, + 0x0e, 0x14, 0x42, 0x68, 0x1a, 0xc5, 0x8e, 0xdf, 0x07, 0xfd, 0x0b, 0x02, + 0x68, 0x2e, 0xc6, 0xbb, 0xec, 0x07, 0xfd, 0xd8, 0x44, 0x3a, 0xbf, 0xc2, + 0x68, 0x34, 0xc3, 0x39, 0x37, 0x07, 0xfd, 0xa8, 0x02, 0x42, 0x68, 0x52, + 0xc4, 0x79, 0xf3, 0x07, 0xfd, 0x93, 0x02, 0x68, 0x74, 0xc6, 0xba, 0x7c, + 0x07, 0xfd, 0xe8, 0xc4, 0xb7, 0x9e, 0x07, 0xfd, 0xb8, 0xc4, 0xc6, 0x7a, + 0x07, 0xfd, 0xc1, 0xc6, 0xc6, 0x79, 0x07, 0xfd, 0xd0, 0xc6, 0xc1, 0x86, + 0x07, 0xfd, 0xe1, 0xc5, 0xc0, 0x7d, 0x07, 0xfd, 0x38, 0x87, 0x07, 0xfe, + 0x18, 0x83, 0x07, 0xfe, 0x23, 0x02, 0x68, 0x7a, 0x87, 0x07, 0xfe, 0x5b, + 0x02, 0x68, 0x7e, 0x91, 0x07, 0xfe, 0x91, 0x97, 0x07, 0xfe, 0xb9, 0x8b, + 0x07, 0xfe, 0xd8, 0x91, 0x07, 0xfe, 0x31, 0x97, 0x07, 0xfe, 0xd0, 0x87, + 0x07, 0xfe, 0x78, 0x83, 0x07, 0xfe, 0x6b, 0x02, 0x68, 0x82, 0x87, 0x07, + 0xfe, 0xab, 0x02, 0x68, 0x86, 0x8b, 0x07, 0xfe, 0xb0, 0x02, 0x42, 0x68, + 0x8a, 0xc2, 0x0c, 0x43, 0x0d, 0x80, 0x09, 0xc2, 0x14, 0x68, 0x0d, 0x88, + 0xf8, 0x19, 0xc2, 0x68, 0x96, 0x83, 0x01, 0x82, 0x09, 0x8b, 0x01, 0x82, + 0x19, 0x97, 0x01, 0x82, 0x29, 0x87, 0x01, 0x82, 0x39, 0x91, 0x01, 0x82, + 0x49, 0xc2, 0x00, 0x16, 0x01, 0x83, 0x19, 0x1b, 0xc2, 0x68, 0xa6, 0x0d, + 0x42, 0x68, 0xb2, 0xcd, 0x78, 0xcc, 0x0f, 0xdc, 0xb1, 0xc5, 0x01, 0xc2, + 0x0f, 0xdd, 0x88, 0xe0, 0x08, 0xa7, 0x0f, 0xdd, 0xa0, 0xc5, 0x68, 0x6e, + 0x01, 0x11, 0xf1, 0xc9, 0xaf, 0x4b, 0x01, 0x72, 0x2a, 0x02, 0x68, 0xba, + 0xc6, 0xca, 0xcd, 0x07, 0xff, 0xc9, 0xc9, 0x1b, 0x0a, 0x07, 0xff, 0xd1, + 0xca, 0x7c, 0x02, 0x07, 0xff, 0xd8, 0x43, 0x13, 0x6d, 0xc2, 0x68, 0xc0, + 0x46, 0x00, 0xd4, 0xc2, 0x68, 0xc6, 0x45, 0x00, 0x8c, 0x42, 0x68, 0xd2, + 0x42, 0x05, 0x1d, 0xc2, 0x68, 0xe4, 0xc7, 0x80, 0x70, 0x01, 0x50, 0xd9, + 0xcc, 0x06, 0xbb, 0x01, 0x50, 0xc9, 0xca, 0x9d, 0xb0, 0x01, 0x50, 0xc1, + 0xd9, 0x1f, 0x4a, 0x01, 0x50, 0xb9, 0xcd, 0x75, 0xa6, 0x01, 0x50, 0x70, + 0xd6, 0x30, 0xa6, 0x01, 0x50, 0xa9, 0xd1, 0x56, 0x40, 0x01, 0x50, 0x78, + 0xc3, 0x05, 0x14, 0x08, 0x5b, 0xc3, 0x02, 0x68, 0xf0, 0x16, 0xc2, 0x68, + 0xf4, 0xc4, 0x09, 0x9d, 0x08, 0x5b, 0xd8, 0x16, 0xc2, 0x69, 0x04, 0x15, + 0xc2, 0x69, 0x10, 0xc2, 0x00, 0x67, 0x08, 0x5b, 0x79, 0xc3, 0x20, 0x18, + 0x08, 0x5b, 0x69, 0xc8, 0xb9, 0x7a, 0x08, 0x5b, 0x61, 0xc6, 0xcf, 0xd7, + 0x08, 0x5b, 0x59, 0xc4, 0xe0, 0xe7, 0x08, 0x5b, 0x51, 0xc4, 0x4a, 0xb9, + 0x08, 0x5b, 0x49, 0xc2, 0x01, 0x7f, 0x08, 0x5b, 0x23, 0x02, 0x69, 0x1a, + 0xc5, 0x4a, 0xb3, 0x08, 0x5b, 0x31, 0xcd, 0x7e, 0x89, 0x08, 0x5b, 0x29, + 0xc6, 0x40, 0x9a, 0x08, 0x5b, 0x19, 0xc5, 0x9c, 0xa2, 0x08, 0x5b, 0x11, + 0xc4, 0xe3, 
0x27, 0x08, 0x5b, 0x09, 0xc5, 0xa5, 0xfd, 0x08, 0x5b, 0x00, + 0xc3, 0x05, 0x14, 0x08, 0x5a, 0xc3, 0x02, 0x69, 0x20, 0x16, 0xc2, 0x69, + 0x24, 0xc4, 0x09, 0x9d, 0x08, 0x5a, 0xd8, 0x16, 0xc2, 0x69, 0x34, 0x15, + 0xc2, 0x69, 0x40, 0xc4, 0x5d, 0xe2, 0x08, 0x5a, 0x99, 0xc3, 0x00, 0x4e, + 0x08, 0x5a, 0x61, 0xc6, 0xcf, 0xd7, 0x08, 0x5a, 0x59, 0xc4, 0xe0, 0xe7, + 0x08, 0x5a, 0x51, 0xc4, 0x4a, 0xb9, 0x08, 0x5a, 0x49, 0xc2, 0x01, 0x7f, + 0x08, 0x5a, 0x23, 0x02, 0x69, 0x4a, 0xc5, 0x4a, 0xb3, 0x08, 0x5a, 0x31, + 0xc3, 0x7e, 0x89, 0x08, 0x5a, 0x29, 0xc6, 0x40, 0x9a, 0x08, 0x5a, 0x19, + 0xc5, 0x9c, 0xa2, 0x08, 0x5a, 0x11, 0xc4, 0xe3, 0x27, 0x08, 0x5a, 0x09, + 0x03, 0xc2, 0x69, 0x50, 0xc3, 0x20, 0x18, 0x08, 0x5a, 0x69, 0xc2, 0x00, + 0x67, 0x08, 0x5a, 0x81, 0xc4, 0xb9, 0x7e, 0x08, 0x5a, 0x90, 0xc3, 0x05, + 0x14, 0x00, 0x00, 0xf9, 0x16, 0xc2, 0x69, 0x5c, 0xc4, 0x09, 0x9d, 0x00, + 0x00, 0xe0, 0x4a, 0x0c, 0x8c, 0xc2, 0x69, 0x68, 0x49, 0x44, 0xee, 0xc2, + 0x69, 0x72, 0xc5, 0xdc, 0xa9, 0x0f, 0x65, 0x0b, 0x02, 0x69, 0x90, 0xc4, + 0x41, 0x55, 0x0f, 0x64, 0xf3, 0x02, 0x69, 0x96, 0xc4, 0x26, 0x78, 0x0f, + 0x63, 0xcb, 0x02, 0x69, 0x9c, 0xc5, 0x06, 0xdb, 0x0f, 0x63, 0xc3, 0x02, + 0x69, 0xa9, 0x15, 0xc2, 0x69, 0xb4, 0x08, 0xc2, 0x69, 0xc6, 0x16, 0xc2, + 0x69, 0xce, 0xc3, 0x05, 0x14, 0x0f, 0x63, 0x8a, 0x02, 0x69, 0xdf, 0xce, + 0x08, 0x73, 0x0f, 0x65, 0x79, 0x44, 0x05, 0x14, 0x42, 0x69, 0xe3, 0xc3, + 0x0d, 0x14, 0x0e, 0x9b, 0xb1, 0xc3, 0x09, 0x9e, 0x0e, 0x9b, 0xa8, 0xc4, + 0x02, 0xde, 0x0e, 0x9b, 0xa1, 0xc2, 0x02, 0xa0, 0x0e, 0x9b, 0x98, 0x0c, + 0xc2, 0x69, 0xef, 0xc8, 0xb6, 0x8a, 0x01, 0x96, 0x09, 0x42, 0x01, 0xc3, + 0xc2, 0x69, 0xf9, 0x03, 0xc2, 0x6a, 0x03, 0xc9, 0xa8, 0xee, 0x01, 0x96, + 0x41, 0xc7, 0xc9, 0x2d, 0x01, 0x96, 0x49, 0xc8, 0xbc, 0x22, 0x01, 0x96, + 0x51, 0x06, 0xc2, 0x6a, 0x0f, 0x45, 0xd6, 0x19, 0x42, 0x6a, 0x1b, 0xc5, + 0x00, 0x2c, 0x01, 0x7f, 0x81, 0xd0, 0x5d, 0x62, 0x01, 0x7f, 0x90, 0xc5, + 0x05, 0x02, 0x01, 0x7f, 0x89, 0xd0, 0x5d, 0x72, 0x01, 0x7f, 0x98, 0xc5, + 0x00, 0xd4, 0x01, 0x7f, 0xa9, 0xc5, 0x05, 0x02, 0x01, 0x7f, 0xb1, 0x0e, + 0xc2, 0x6a, 0x40, 0x46, 0x02, 0xae, 0x42, 0x6a, 0x4c, 0xc8, 0xbd, 0x1a, + 0x01, 0x8c, 0xa1, 0xc8, 0xb6, 0x72, 0x01, 0x8c, 0xd8, 0xc5, 0x01, 0xc2, + 0x01, 0x8c, 0xa9, 0xc7, 0x36, 0x55, 0x01, 0x8c, 0xe0, 0xc2, 0x00, 0xc4, + 0x08, 0x42, 0xdb, 0x02, 0x6a, 0x58, 0x19, 0xc2, 0x6a, 0x5e, 0xc4, 0x02, + 0xde, 0x08, 0x42, 0xd0, 0x00, 0x42, 0x6a, 0x68, 0xc2, 0x39, 0x8b, 0x08, + 0x42, 0xa9, 0xc3, 0x1e, 0x1b, 0x08, 0x42, 0x40, 0xc3, 0x11, 0xef, 0x08, + 0x42, 0xa1, 0x03, 0x42, 0x6a, 0x74, 0xc3, 0x16, 0x5a, 0x08, 0x42, 0x79, + 0xc4, 0x36, 0xb5, 0x08, 0x42, 0x00, 0xc2, 0x00, 0x8e, 0x08, 0x42, 0x38, + 0xca, 0xa7, 0x92, 0x0f, 0xd2, 0x43, 0x02, 0x6a, 0x80, 0xc4, 0xde, 0x83, + 0x01, 0x32, 0xb3, 0x02, 0x6a, 0x86, 0xc4, 0xe3, 0x93, 0x01, 0x32, 0xcb, + 0x02, 0x6a, 0x8c, 0x0d, 0xc2, 0x6a, 0x92, 0xc6, 0xca, 0xfd, 0x01, 0x32, + 0xbb, 0x02, 0x6a, 0xa4, 0xc5, 0xa8, 0xf7, 0x01, 0x32, 0xab, 0x02, 0x6a, + 0xaa, 0x47, 0x45, 0x86, 0x42, 0x6a, 0xb0, 0x00, 0x42, 0x6a, 0xcc, 0x46, + 0x00, 0x8b, 0x42, 0x6a, 0xd8, 0x03, 0xc2, 0x6a, 0xe4, 0xc5, 0xc2, 0xc2, + 0x01, 0x59, 0x08, 0xc7, 0xc6, 0xef, 0x01, 0x4e, 0xb1, 0xd0, 0x5a, 0x62, + 0x01, 0x59, 0x68, 0x00, 0x42, 0x6a, 0xf3, 0x00, 0x42, 0x6b, 0x05, 0xca, + 0x82, 0xd3, 0x01, 0x31, 0xd1, 0x44, 0x03, 0x15, 0x42, 0x6b, 0x14, 0xc9, + 0x8e, 0x0a, 0x0f, 0xaa, 0x31, 0xca, 0x9d, 0x1a, 0x01, 0x58, 0xe0, 0x00, + 0xc2, 0x6b, 0x1e, 0x4a, 0x01, 0xa9, 0x42, 0x6b, 0x2a, 0xe0, 0x0a, 0xc7, + 0x0f, 0xbd, 0x00, 0x00, 0x42, 0x6b, 0x3c, 0xc4, 0x5b, 0x26, 0x01, 0x36, + 0x09, 0xc3, 
0x12, 0xb8, 0x01, 0x36, 0x00, 0x4a, 0x03, 0x3d, 0xc2, 0x6b, + 0x54, 0x4a, 0x01, 0xa9, 0x42, 0x6b, 0x66, 0x46, 0x01, 0x94, 0xc2, 0x6b, + 0x72, 0xc7, 0xc4, 0x80, 0x01, 0x1f, 0x10, 0x11, 0xc2, 0x6b, 0x78, 0xc2, + 0x00, 0xb3, 0x01, 0x34, 0x82, 0x02, 0x6b, 0x84, 0xc4, 0x0e, 0x6a, 0x01, + 0x39, 0x39, 0xc4, 0x11, 0xa4, 0x01, 0x5e, 0x70, 0x4a, 0x03, 0x3d, 0xc2, + 0x6b, 0x8a, 0x4a, 0x01, 0xa9, 0x42, 0x6b, 0x96, 0xc5, 0x06, 0x82, 0x01, + 0x30, 0xe9, 0xce, 0x24, 0xd5, 0x0f, 0xa2, 0x30, 0xc8, 0x01, 0x92, 0x01, + 0x2d, 0x9b, 0x02, 0x6b, 0xa6, 0xce, 0x6c, 0x8a, 0x01, 0x2d, 0xa9, 0xc7, + 0xc6, 0x6a, 0x0f, 0xde, 0x50, 0x15, 0xc2, 0x6b, 0xac, 0xc7, 0x3a, 0x19, + 0x01, 0x59, 0x31, 0xc7, 0x0a, 0xe0, 0x01, 0x59, 0x40, 0xc4, 0x2b, 0xf1, + 0x0f, 0x9f, 0x89, 0xc5, 0xbb, 0xcd, 0x01, 0x59, 0x00, 0xc9, 0x46, 0x70, + 0x01, 0x2d, 0x79, 0xc3, 0x01, 0x5d, 0x01, 0x57, 0xf1, 0xc7, 0x5a, 0x6b, + 0x01, 0x59, 0x78, 0xc4, 0x18, 0x10, 0x0f, 0x17, 0xb9, 0xc2, 0x22, 0xcc, + 0x0f, 0x17, 0xb0, 0xc3, 0x0d, 0x14, 0x0f, 0x17, 0xa9, 0xc3, 0x09, 0x9e, + 0x0f, 0x17, 0xa0, 0xc4, 0x02, 0xde, 0x0f, 0x17, 0x99, 0xc2, 0x02, 0xa0, + 0x0f, 0x17, 0x90, 0xc2, 0x00, 0xec, 0x0f, 0x17, 0x78, 0xc2, 0x00, 0xec, + 0x0f, 0x17, 0x68, 0xc2, 0x14, 0x77, 0x0f, 0x17, 0x59, 0x83, 0x0f, 0x16, + 0x30, 0xc2, 0x00, 0xc4, 0x0f, 0x17, 0x50, 0xc2, 0x19, 0x2c, 0x0f, 0x17, + 0x49, 0xc2, 0x01, 0x30, 0x0f, 0x16, 0xe9, 0x83, 0x0f, 0x16, 0x48, 0x83, + 0x0f, 0x16, 0x03, 0x02, 0x6b, 0xbe, 0xc2, 0x00, 0x75, 0x0f, 0x17, 0x21, + 0x97, 0x0f, 0x16, 0xb0, 0x90, 0x0f, 0x17, 0x38, 0x90, 0x0f, 0x17, 0x32, + 0x02, 0x6b, 0xc5, 0xc2, 0x00, 0x75, 0x0f, 0x17, 0x28, 0xc2, 0x00, 0x39, + 0x0f, 0x17, 0x09, 0xc2, 0x0d, 0xf6, 0x0f, 0x17, 0x01, 0xc2, 0x00, 0xd0, + 0x0f, 0x16, 0x61, 0x83, 0x0f, 0x16, 0x58, 0xc3, 0x64, 0x58, 0x0f, 0x16, + 0xf9, 0x83, 0x0f, 0x16, 0x40, 0xc2, 0x00, 0xd0, 0x0f, 0x16, 0xc9, 0x83, + 0x0f, 0x16, 0xa0, 0xc2, 0x00, 0xd0, 0x0f, 0x16, 0x79, 0x83, 0x0f, 0x16, + 0x70, 0x83, 0x0f, 0x16, 0x51, 0xc2, 0x00, 0xd0, 0x0f, 0x16, 0x38, 0xc6, + 0x18, 0x10, 0x08, 0xc7, 0x81, 0xc4, 0xd2, 0x1d, 0x08, 0xc7, 0x78, 0xc4, + 0x45, 0x6a, 0x08, 0xc7, 0x71, 0xc4, 0x4a, 0x2e, 0x08, 0xc7, 0x68, 0xc5, + 0x0d, 0x0d, 0x08, 0xc7, 0x61, 0xc5, 0x28, 0xee, 0x08, 0xc7, 0x59, 0xc2, + 0x00, 0xc4, 0x08, 0xc7, 0x50, 0xc4, 0x18, 0x10, 0x08, 0xc7, 0x39, 0xc2, + 0x22, 0xcc, 0x08, 0xc7, 0x30, 0xc3, 0x0d, 0x14, 0x08, 0xc7, 0x29, 0xc3, + 0x09, 0x9e, 0x08, 0xc7, 0x20, 0xc4, 0x02, 0xde, 0x08, 0xc7, 0x19, 0xc2, + 0x02, 0xa0, 0x08, 0xc7, 0x10, 0xc2, 0x25, 0x9f, 0x08, 0xc6, 0xf1, 0xc3, + 0xe5, 0xed, 0x08, 0xc6, 0xe8, 0xc2, 0x00, 0xb1, 0x08, 0xc6, 0xe1, 0x11, + 0xc2, 0x6b, 0xc9, 0xc3, 0xbe, 0x83, 0x08, 0xc6, 0xc8, 0x8f, 0x08, 0xc6, + 0xb1, 0x96, 0x08, 0xc6, 0xa9, 0xc2, 0x00, 0x75, 0x08, 0xc6, 0x50, 0xc3, + 0x38, 0x86, 0x08, 0xc6, 0x99, 0xc3, 0x4f, 0x37, 0x08, 0xc6, 0x00, 0xc2, + 0x04, 0xcd, 0x08, 0xc6, 0x88, 0x10, 0x42, 0x6b, 0xd5, 0x85, 0x08, 0xc6, + 0x79, 0x97, 0x08, 0xc6, 0x38, 0x97, 0x08, 0xc6, 0x1b, 0x02, 0x6b, 0xdd, + 0x91, 0x08, 0xc6, 0x29, 0x83, 0x08, 0xc6, 0x20, 0xc2, 0x25, 0x9f, 0x08, + 0xc5, 0xf1, 0xc3, 0xe5, 0xed, 0x08, 0xc5, 0xe8, 0xc2, 0x00, 0xb1, 0x08, + 0xc5, 0xe1, 0x11, 0xc2, 0x6b, 0xe1, 0xc3, 0xbe, 0x83, 0x08, 0xc5, 0xc8, + 0x8f, 0x08, 0xc5, 0xb1, 0x96, 0x08, 0xc5, 0xa9, 0xc2, 0x00, 0x75, 0x08, + 0xc5, 0x50, 0xc3, 0x38, 0x86, 0x08, 0xc5, 0x99, 0xc3, 0x4f, 0x37, 0x08, + 0xc5, 0x00, 0xc2, 0x04, 0xcd, 0x08, 0xc5, 0x88, 0x10, 0x42, 0x6b, 0xed, + 0x85, 0x08, 0xc5, 0x79, 0x97, 0x08, 0xc5, 0x38, 0x97, 0x08, 0xc5, 0x1b, + 0x02, 0x6b, 0xf5, 0x91, 0x08, 0xc5, 0x29, 0x83, 0x08, 0xc5, 0x20, 0xd3, + 0x46, 0x7d, 
0x01, 0x39, 0x29, 0x43, 0x00, 0xbf, 0x42, 0x6b, 0xf9, 0xc4, + 0x01, 0xc3, 0x01, 0x02, 0xd9, 0xcb, 0x05, 0x1c, 0x01, 0x02, 0xc0, 0x12, + 0xc2, 0x6b, 0xff, 0xcc, 0x88, 0x1d, 0x0f, 0xc8, 0xa9, 0x16, 0xc2, 0x6c, + 0x11, 0x11, 0xc2, 0x6c, 0x1d, 0xcf, 0x60, 0x99, 0x0f, 0xb2, 0x29, 0xcc, + 0x87, 0x75, 0x0f, 0xb2, 0x21, 0xd0, 0x5a, 0xf2, 0x0f, 0xb0, 0xdb, 0x02, + 0x6c, 0x2f, 0x42, 0x00, 0x99, 0xc2, 0x6c, 0x35, 0xcf, 0x67, 0x0b, 0x0f, + 0xb1, 0x21, 0x0f, 0xc2, 0x6c, 0x41, 0xdb, 0x17, 0x7c, 0x0f, 0xc9, 0x59, + 0xda, 0x1b, 0xea, 0x0f, 0xcb, 0xa1, 0xce, 0x6d, 0x6a, 0x0f, 0xd7, 0x20, + 0xcf, 0x36, 0xc5, 0x01, 0x49, 0x61, 0xd0, 0x20, 0x66, 0x01, 0x49, 0x78, + 0xc4, 0x26, 0x78, 0x07, 0xf8, 0xc9, 0xc4, 0x15, 0xe7, 0x07, 0xf8, 0x81, + 0xc3, 0x05, 0x14, 0x07, 0xf8, 0x89, 0x16, 0xc2, 0x6c, 0x4d, 0x08, 0xc2, + 0x6c, 0x59, 0x15, 0xc2, 0x6c, 0x65, 0xc5, 0x06, 0xdb, 0x07, 0xf8, 0xc0, + 0xc3, 0x0d, 0xe5, 0x07, 0xf8, 0xd1, 0x42, 0x0a, 0x8c, 0x42, 0x6c, 0x71, + 0xcc, 0x8b, 0x11, 0x07, 0xf8, 0xe1, 0x43, 0x00, 0x4b, 0x42, 0x6c, 0x7b, + 0x4f, 0x0b, 0x17, 0xc2, 0x6c, 0x93, 0x4d, 0x29, 0xb9, 0x42, 0x6c, 0xfb, + 0xce, 0x25, 0xad, 0x07, 0xf9, 0xe9, 0xcd, 0x00, 0x32, 0x07, 0xfa, 0xe9, + 0xd1, 0x4f, 0x7a, 0x07, 0xfb, 0x01, 0xcb, 0x1a, 0x50, 0x07, 0xf8, 0x48, + 0xc9, 0xb2, 0xa2, 0x0f, 0x98, 0xd9, 0xc6, 0x00, 0x91, 0x0f, 0x98, 0x98, + 0x44, 0x1a, 0xce, 0xc2, 0x6d, 0x63, 0xc3, 0x01, 0xe2, 0x0b, 0x79, 0x90, + 0xa5, 0x0b, 0x7c, 0xc9, 0xa4, 0x0b, 0x7c, 0xc1, 0xa3, 0x0b, 0x7c, 0xb9, + 0xa2, 0x0b, 0x7c, 0xb1, 0xa1, 0x0b, 0x7c, 0xa9, 0xa0, 0x0b, 0x7c, 0xa1, + 0x9f, 0x0b, 0x7c, 0x98, 0x87, 0x0b, 0x7a, 0x49, 0x83, 0x0b, 0x79, 0xb9, + 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x71, 0xc2, 0x0d, 0xf6, 0x0b, 0x79, 0x50, + 0xc2, 0x19, 0x2c, 0x0b, 0x78, 0xe1, 0x83, 0x0b, 0x78, 0xd0, 0xca, 0x56, + 0xca, 0x0b, 0x7a, 0x80, 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x69, 0x83, 0x0b, + 0x79, 0x60, 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x21, 0x83, 0x0b, 0x79, 0x18, + 0xc2, 0x00, 0xd0, 0x0b, 0x78, 0xa9, 0x83, 0x0b, 0x78, 0xa0, 0xc2, 0x16, + 0x5a, 0x0b, 0x7a, 0x39, 0x83, 0x0b, 0x79, 0xc1, 0xc2, 0x00, 0xd0, 0x0b, + 0x79, 0x79, 0xc2, 0x02, 0x1c, 0x0b, 0x79, 0x58, 0xc2, 0x19, 0x2c, 0x0b, + 0x78, 0xe9, 0x83, 0x0b, 0x78, 0xd8, 0xc3, 0x90, 0x65, 0x0b, 0x79, 0xf9, + 0x10, 0xc2, 0x6d, 0x7b, 0xc2, 0x01, 0xc3, 0x0b, 0x78, 0x30, 0x15, 0xc2, + 0x6d, 0x85, 0xc2, 0x19, 0x2c, 0x0b, 0x7a, 0x01, 0x83, 0x0b, 0x79, 0xe8, + 0x83, 0x0b, 0x79, 0xe1, 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0xb0, 0x15, 0xc2, + 0x6d, 0x8f, 0x83, 0x0b, 0x78, 0x69, 0xc2, 0x01, 0x6f, 0x0b, 0x78, 0x60, + 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x49, 0x83, 0x0b, 0x79, 0x40, 0xc2, 0x19, + 0x2c, 0x0b, 0x78, 0xc9, 0x83, 0x0b, 0x78, 0xc0, 0x90, 0x0b, 0x7b, 0x62, + 0x02, 0x6d, 0x99, 0xc2, 0x00, 0x75, 0x0b, 0x7c, 0x30, 0x90, 0x0b, 0x7b, + 0x1a, 0x02, 0x6d, 0x9d, 0x94, 0x0b, 0x7b, 0xa8, 0x89, 0x0b, 0x7a, 0xf8, + 0x94, 0x0b, 0x7c, 0x11, 0x9b, 0x0b, 0x7b, 0x00, 0x87, 0x0b, 0x7b, 0xa0, + 0x89, 0x0b, 0x7a, 0xc0, 0x00, 0x42, 0x6d, 0xa1, 0xcd, 0x0e, 0x61, 0x0f, + 0xbe, 0x19, 0xca, 0x22, 0x51, 0x0f, 0xbe, 0x08, 0xc6, 0x0b, 0x09, 0x0f, + 0xbc, 0x79, 0xc6, 0x02, 0xd1, 0x01, 0x35, 0x50, 0xd0, 0x5c, 0x62, 0x0f, + 0xbc, 0x29, 0xcb, 0x85, 0x72, 0x01, 0x35, 0x58, 0x00, 0xc2, 0x6d, 0xad, + 0xe0, 0x0b, 0x87, 0x01, 0x3b, 0x68, 0x00, 0xc2, 0x6d, 0xb9, 0xe0, 0x0b, + 0x87, 0x01, 0x3b, 0x60, 0x49, 0x35, 0x21, 0xc2, 0x6d, 0xc5, 0xd3, 0x3c, + 0xb5, 0x0f, 0xbd, 0x81, 0x4c, 0x0e, 0x55, 0x42, 0x6d, 0xd1, 0xd1, 0x52, + 0x11, 0x01, 0x35, 0x61, 0xc4, 0x01, 0xe3, 0x01, 0x2c, 0x91, 0xc6, 0x13, + 0x52, 0x0f, 0xbd, 0x51, 0x43, 0x4d, 0x57, 0x42, 0x6d, 0xdd, 0xcf, 0x15, + 0x36, 0x0f, 
0xbd, 0xe1, 0xd2, 0x22, 0x49, 0x0f, 0xbe, 0x70, 0x9b, 0x0b, + 0x73, 0xfb, 0x02, 0x6d, 0xe9, 0x83, 0x0b, 0x73, 0x6b, 0x02, 0x6d, 0xed, + 0x91, 0x0b, 0x73, 0xeb, 0x02, 0x6d, 0xf7, 0x94, 0x0b, 0x73, 0xe1, 0x90, + 0x0b, 0x73, 0xdb, 0x02, 0x6d, 0xfb, 0x86, 0x0b, 0x73, 0xc9, 0x9a, 0x0b, + 0x73, 0xc1, 0x8a, 0x0b, 0x73, 0xb3, 0x02, 0x6e, 0x03, 0x93, 0x0b, 0x73, + 0xa9, 0x8e, 0x0b, 0x73, 0xa1, 0x97, 0x0b, 0x73, 0x91, 0x85, 0x0b, 0x73, + 0x89, 0x84, 0x0b, 0x73, 0x81, 0x87, 0x0b, 0x73, 0x79, 0x8c, 0x0b, 0x73, + 0x71, 0x8d, 0x0b, 0x73, 0x63, 0x02, 0x6e, 0x07, 0x8b, 0x0b, 0x73, 0x59, + 0x88, 0x0b, 0x73, 0x51, 0x89, 0x0b, 0x73, 0x49, 0x96, 0x0b, 0x73, 0x41, + 0x92, 0x0b, 0x73, 0x39, 0x9c, 0x0b, 0x73, 0x29, 0x99, 0x0b, 0x73, 0x19, + 0x98, 0x0b, 0x73, 0x11, 0x95, 0x0b, 0x73, 0x09, 0x8f, 0x0b, 0x73, 0x00, + 0x9b, 0x0b, 0x72, 0xfb, 0x02, 0x6e, 0x0b, 0x83, 0x0b, 0x72, 0x6b, 0x02, + 0x6e, 0x0f, 0x91, 0x0b, 0x72, 0xeb, 0x02, 0x6e, 0x19, 0x94, 0x0b, 0x72, + 0xe1, 0x90, 0x0b, 0x72, 0xdb, 0x02, 0x6e, 0x1d, 0x86, 0x0b, 0x72, 0xc9, + 0x9a, 0x0b, 0x72, 0xc1, 0x8a, 0x0b, 0x72, 0xb3, 0x02, 0x6e, 0x25, 0x93, + 0x0b, 0x72, 0xa9, 0x8e, 0x0b, 0x72, 0xa1, 0x97, 0x0b, 0x72, 0x91, 0x85, + 0x0b, 0x72, 0x89, 0x84, 0x0b, 0x72, 0x81, 0x87, 0x0b, 0x72, 0x79, 0x8c, + 0x0b, 0x72, 0x71, 0x8d, 0x0b, 0x72, 0x63, 0x02, 0x6e, 0x29, 0x8b, 0x0b, + 0x72, 0x59, 0x88, 0x0b, 0x72, 0x51, 0x89, 0x0b, 0x72, 0x49, 0x96, 0x0b, + 0x72, 0x41, 0x92, 0x0b, 0x72, 0x39, 0x9c, 0x0b, 0x72, 0x29, 0x99, 0x0b, + 0x72, 0x19, 0x98, 0x0b, 0x72, 0x11, 0x95, 0x0b, 0x72, 0x09, 0x8f, 0x0b, + 0x72, 0x00, 0xc4, 0x02, 0xde, 0x0b, 0x74, 0x1b, 0x02, 0x6e, 0x2d, 0xc2, + 0x02, 0xa0, 0x0b, 0x74, 0x12, 0x02, 0x6e, 0x33, 0xcf, 0x6b, 0x25, 0x0b, + 0x74, 0xa0, 0xc4, 0x18, 0x10, 0x0b, 0x74, 0x39, 0xc2, 0x22, 0xcc, 0x0b, + 0x74, 0x30, 0xc3, 0x0d, 0x14, 0x0b, 0x74, 0x29, 0xc3, 0x09, 0x9e, 0x0b, + 0x74, 0x20, 0xc7, 0x1f, 0x6e, 0x0b, 0x74, 0x91, 0xc5, 0x66, 0xb1, 0x0b, + 0x74, 0x58, 0xc8, 0x48, 0x23, 0x0b, 0x74, 0x89, 0xc6, 0x44, 0x9c, 0x0b, + 0x74, 0x80, 0xc6, 0x14, 0x07, 0x0b, 0x74, 0x79, 0xc7, 0x34, 0x37, 0x0b, + 0x74, 0x70, 0xc7, 0x52, 0xcc, 0x0b, 0x74, 0x69, 0xc5, 0x22, 0x43, 0x0b, + 0x74, 0x61, 0xc2, 0x00, 0xc4, 0x0b, 0x74, 0x50, 0xc6, 0x06, 0xaf, 0x01, + 0x1e, 0xb1, 0xc9, 0x67, 0xa7, 0x01, 0x1e, 0xa8, 0x24, 0xc2, 0x6e, 0x39, + 0x25, 0xc2, 0x6e, 0x75, 0x1f, 0xc2, 0x6e, 0xb1, 0x1e, 0xc2, 0x6e, 0xed, + 0x26, 0xc2, 0x6f, 0x29, 0x22, 0xc2, 0x6f, 0x65, 0x1d, 0xc2, 0x6f, 0xa1, + 0x21, 0xc2, 0x6f, 0xd7, 0x23, 0xc2, 0x70, 0x13, 0x20, 0x42, 0x70, 0x4f, + 0x26, 0xc2, 0x70, 0x8b, 0x20, 0xc2, 0x70, 0xbb, 0x1e, 0xc2, 0x70, 0xf7, + 0x23, 0xc2, 0x71, 0x33, 0x24, 0xc2, 0x71, 0x6f, 0x21, 0xc2, 0x71, 0xab, + 0x1d, 0xc2, 0x71, 0xe7, 0x22, 0xc2, 0x72, 0x23, 0x25, 0xc2, 0x72, 0x5f, + 0x1f, 0x42, 0x72, 0x9b, 0xc2, 0x02, 0xa0, 0x0f, 0x46, 0x41, 0xc4, 0x02, + 0xde, 0x0f, 0x46, 0x48, 0xc3, 0x09, 0x9e, 0x0f, 0x46, 0x51, 0xc3, 0x0d, + 0x14, 0x0f, 0x46, 0x58, 0xc2, 0x22, 0xcc, 0x0f, 0x46, 0x61, 0xc4, 0x18, + 0x10, 0x0f, 0x46, 0x68, 0x07, 0xc2, 0x72, 0xd7, 0xc8, 0x4b, 0x95, 0x0f, + 0x46, 0x98, 0x95, 0x0f, 0x46, 0x91, 0xca, 0xa2, 0x92, 0x0f, 0x46, 0xa8, + 0x16, 0xc2, 0x72, 0xe1, 0xcd, 0x76, 0xf8, 0x08, 0x4f, 0xf1, 0x07, 0xc2, + 0x72, 0xf3, 0x15, 0xc2, 0x72, 0xff, 0x08, 0xc2, 0x73, 0x0b, 0x44, 0x05, + 0x14, 0x42, 0x73, 0x17, 0xc4, 0x26, 0x78, 0x08, 0x4e, 0x43, 0x02, 0x73, + 0x23, 0xc5, 0x06, 0xdb, 0x08, 0x4e, 0x3b, 0x02, 0x73, 0x2d, 0x15, 0xc2, + 0x73, 0x37, 0x08, 0xc2, 0x73, 0x49, 0x16, 0xc2, 0x73, 0x51, 0xc3, 0x05, + 0x14, 0x08, 0x4e, 0x02, 0x02, 0x73, 0x62, 0x48, 0x3f, 0x14, 0xc2, 0x73, + 0x66, 0x46, 
0x02, 0x0f, 0x42, 0x73, 0x72, 0xc2, 0xe5, 0xfd, 0x08, 0x4c, + 0xf8, 0xc2, 0x0e, 0x9a, 0x08, 0x4c, 0xe9, 0x16, 0xc2, 0x73, 0xd1, 0xc2, + 0x0f, 0x9a, 0x08, 0x4c, 0xb9, 0x0d, 0xc2, 0x73, 0xe3, 0x15, 0xc2, 0x73, + 0xed, 0xc3, 0xe6, 0x71, 0x08, 0x4c, 0x91, 0x83, 0x08, 0x4c, 0x01, 0x87, + 0x08, 0x4c, 0x09, 0x8b, 0x08, 0x4c, 0x11, 0x91, 0x08, 0x4c, 0x19, 0xc2, + 0x19, 0x2c, 0x08, 0x4c, 0x21, 0xc2, 0x01, 0x4a, 0x08, 0x4c, 0x29, 0xc2, + 0x01, 0x5d, 0x08, 0x4c, 0x33, 0x02, 0x73, 0xf8, 0xc2, 0x00, 0xb0, 0x08, + 0x4c, 0x41, 0xc2, 0x01, 0xc3, 0x08, 0x4c, 0x49, 0x10, 0xc2, 0x73, 0xfe, + 0xc2, 0x00, 0x39, 0x08, 0x4c, 0x73, 0x02, 0x74, 0x0c, 0xc2, 0x00, 0xdb, + 0x08, 0x4c, 0x80, 0x47, 0x22, 0x04, 0xc2, 0x74, 0x12, 0xcc, 0x8b, 0x4d, + 0x01, 0x4c, 0xd8, 0xc3, 0x7f, 0x18, 0x05, 0x5f, 0x29, 0x03, 0xc2, 0x74, + 0x18, 0x97, 0x05, 0x57, 0x70, 0xc3, 0x7f, 0x18, 0x05, 0x5f, 0x21, 0x8b, + 0x05, 0x57, 0x58, 0x97, 0x05, 0x57, 0x61, 0xc3, 0x7f, 0x18, 0x05, 0x5f, + 0x40, 0xc7, 0xc9, 0xe3, 0x05, 0x5f, 0x10, 0xc3, 0x71, 0x83, 0x05, 0x5e, + 0x4b, 0x02, 0x74, 0x20, 0x83, 0x05, 0x5e, 0x2b, 0x02, 0x74, 0x26, 0xc2, + 0x00, 0xc1, 0x05, 0x57, 0x41, 0xc2, 0x19, 0x2c, 0x05, 0x57, 0x18, 0xc2, + 0x00, 0x71, 0x05, 0x5e, 0x3b, 0x02, 0x74, 0x2c, 0x16, 0xc2, 0x74, 0x32, + 0xc3, 0x18, 0xb0, 0x05, 0x5e, 0x50, 0x83, 0x05, 0x5e, 0x23, 0x02, 0x74, + 0x3c, 0xc3, 0x08, 0x09, 0x05, 0x5e, 0x80, 0xc2, 0x01, 0x25, 0x05, 0x5e, + 0x03, 0x02, 0x74, 0x42, 0xc3, 0x18, 0xb0, 0x05, 0x5e, 0x40, 0xc3, 0x08, + 0x09, 0x05, 0x5e, 0xd1, 0x83, 0x05, 0x5e, 0xa8, 0xc3, 0x18, 0xb0, 0x05, + 0x5e, 0xc9, 0x06, 0xc2, 0x74, 0x48, 0xc2, 0x00, 0x71, 0x05, 0x5e, 0xb8, + 0xc3, 0x18, 0xb0, 0x05, 0x5e, 0xc1, 0xc2, 0x01, 0x25, 0x05, 0x5e, 0x90, + 0xc2, 0x0d, 0xf6, 0x05, 0x57, 0x51, 0xc2, 0x00, 0xd0, 0x05, 0x57, 0x49, + 0xc2, 0x00, 0xc2, 0x05, 0x5e, 0x08, 0x83, 0x05, 0x57, 0x11, 0xc2, 0x00, + 0x71, 0x05, 0x5e, 0x30, 0xc7, 0xc9, 0xe3, 0x05, 0x5e, 0xe8, 0xc7, 0xc9, + 0xe3, 0x05, 0x5e, 0xe0, 0xc3, 0x08, 0x09, 0x05, 0x5e, 0x99, 0xc2, 0x00, + 0x71, 0x05, 0x5e, 0xb0, 0xc9, 0xb1, 0xc1, 0x0f, 0xb5, 0xa9, 0xc7, 0x61, + 0x82, 0x0f, 0xb4, 0xf1, 0xc8, 0xb7, 0xaa, 0x0f, 0xb5, 0x00, 0xc2, 0x00, + 0x74, 0x01, 0x34, 0x59, 0xc3, 0x01, 0x95, 0x01, 0x34, 0x50, 0xe0, 0x01, + 0x27, 0x08, 0xb3, 0x60, 0x46, 0x00, 0x8b, 0x42, 0x74, 0x52, 0xcf, 0x01, + 0x38, 0x08, 0xb3, 0x31, 0xc8, 0x00, 0xbf, 0x08, 0xb3, 0x28, 0xcf, 0x01, + 0x38, 0x08, 0xb3, 0x21, 0xc8, 0x00, 0xbf, 0x08, 0xb3, 0x00, 0xc4, 0x26, + 0x78, 0x00, 0xc0, 0xc9, 0xc5, 0x06, 0xdb, 0x00, 0xc0, 0xc1, 0x15, 0xc2, + 0x74, 0x5e, 0x08, 0xc2, 0x74, 0x6a, 0x16, 0xc2, 0x74, 0x76, 0xc3, 0x05, + 0x14, 0x00, 0xc0, 0x89, 0xc4, 0x15, 0xe7, 0x00, 0xc0, 0x80, 0x45, 0xc2, + 0x59, 0x42, 0x74, 0x82, 0x48, 0xb1, 0x71, 0xc2, 0x74, 0xa4, 0xc2, 0x00, + 0x75, 0x00, 0xc1, 0x48, 0x44, 0x62, 0x5b, 0xc2, 0x74, 0xf0, 0xc2, 0x0d, + 0xf6, 0x00, 0xc1, 0xe1, 0x83, 0x00, 0xc1, 0x90, 0x83, 0x00, 0xc1, 0xa3, + 0x02, 0x75, 0x61, 0x8b, 0x00, 0xc2, 0x10, 0x44, 0x14, 0x85, 0xc2, 0x75, + 0x67, 0xc2, 0x00, 0xd0, 0x00, 0xc1, 0x89, 0x83, 0x00, 0xc1, 0x80, 0xc2, + 0x00, 0x0a, 0x00, 0xc2, 0x09, 0xc2, 0x00, 0x39, 0x00, 0xc1, 0xf9, 0x83, + 0x00, 0xc1, 0xe8, 0xc2, 0x00, 0xd0, 0x00, 0xc2, 0x01, 0x83, 0x00, 0xc1, + 0x78, 0xc2, 0x00, 0xd0, 0x00, 0xc1, 0xd9, 0x83, 0x00, 0xc1, 0xd0, 0x87, + 0x00, 0xc1, 0x38, 0x87, 0x00, 0xc1, 0x30, 0x87, 0x00, 0xc1, 0x28, 0xc4, + 0x09, 0x9d, 0x00, 0xc0, 0x79, 0x16, 0xc2, 0x75, 0xc9, 0xc3, 0x05, 0x14, + 0x00, 0xc0, 0x58, 0x45, 0x09, 0x98, 0xc2, 0x75, 0xd5, 0xcb, 0x97, 0xf5, + 0x08, 0xb2, 0x11, 0xc4, 0x19, 0x53, 0x08, 0xb2, 0x08, 0xc4, 0xe3, 0x83, + 0x08, 0xb2, 
0x21, 0x03, 0xc2, 0x75, 0xf9, 0x42, 0x07, 0xb2, 0x42, 0x76, + 0x05, 0x03, 0xc2, 0x76, 0x11, 0x91, 0x08, 0xb1, 0xd9, 0x87, 0x08, 0xb1, + 0xc9, 0x48, 0xb2, 0x2d, 0xc2, 0x76, 0x1d, 0x97, 0x08, 0xb1, 0x9b, 0x02, + 0x76, 0x2b, 0x8b, 0x08, 0xb1, 0x8a, 0x02, 0x76, 0x2f, 0x0e, 0xc2, 0x76, + 0x33, 0xc2, 0x00, 0xd0, 0x08, 0xb1, 0x71, 0x15, 0xc2, 0x76, 0x3d, 0x18, + 0xc2, 0x76, 0x4d, 0xc2, 0x00, 0x39, 0x08, 0xb1, 0x41, 0xc2, 0x19, 0x2c, + 0x08, 0xb1, 0x39, 0xc2, 0x01, 0xc3, 0x08, 0xb1, 0x31, 0x04, 0xc2, 0x76, + 0x57, 0x12, 0xc2, 0x76, 0x61, 0x10, 0xc2, 0x76, 0x6b, 0x06, 0xc2, 0x76, + 0x81, 0x16, 0xc2, 0x76, 0x8f, 0x0c, 0xc2, 0x76, 0x9d, 0x05, 0xc2, 0x76, + 0xa7, 0x09, 0xc2, 0x76, 0xb1, 0x0d, 0xc2, 0x76, 0xbb, 0x83, 0x08, 0xb0, + 0x03, 0x02, 0x76, 0xc5, 0x91, 0x08, 0xb0, 0x61, 0x87, 0x08, 0xb0, 0x51, + 0x97, 0x08, 0xb0, 0x23, 0x02, 0x76, 0xd1, 0x8b, 0x08, 0xb0, 0x12, 0x02, + 0x76, 0xd5, 0x15, 0xc2, 0x76, 0xd9, 0x05, 0xc2, 0x76, 0xef, 0x14, 0xc2, + 0x77, 0x19, 0x0e, 0xc2, 0x77, 0x2f, 0x09, 0xc2, 0x77, 0x41, 0x04, 0xc2, + 0x77, 0x56, 0x06, 0xc2, 0x77, 0x62, 0x03, 0xc2, 0x77, 0x6c, 0x12, 0xc2, + 0x77, 0x7e, 0x16, 0xc2, 0x77, 0x8a, 0x17, 0xc2, 0x77, 0x96, 0x18, 0xc2, + 0x77, 0xa6, 0x0f, 0xc2, 0x77, 0xb2, 0x07, 0xc2, 0x77, 0xbc, 0x0a, 0xc2, + 0x77, 0xc8, 0x1b, 0xc2, 0x77, 0xd4, 0xca, 0x9c, 0xf2, 0x00, 0x17, 0xf0, + 0x89, 0x0e, 0xa1, 0xd3, 0x02, 0x77, 0xe0, 0x88, 0x0e, 0xa1, 0xc9, 0x87, + 0x0e, 0xa1, 0xc3, 0x02, 0x77, 0xe6, 0x86, 0x0e, 0xa1, 0xbb, 0x02, 0x77, + 0xf2, 0x85, 0x0e, 0xa1, 0xb3, 0x02, 0x77, 0xf8, 0x84, 0x0e, 0xa1, 0xab, + 0x02, 0x77, 0xfe, 0x83, 0x0e, 0xa1, 0xa3, 0x02, 0x78, 0x04, 0x91, 0x0e, + 0xa2, 0x13, 0x02, 0x78, 0x0a, 0x92, 0x0e, 0xa2, 0x1b, 0x02, 0x78, 0x0e, + 0x97, 0x0e, 0xa2, 0x43, 0x02, 0x78, 0x1e, 0x96, 0x0e, 0xa2, 0x3b, 0x02, + 0x78, 0x24, 0x95, 0x0e, 0xa2, 0x33, 0x02, 0x78, 0x33, 0x94, 0x0e, 0xa2, + 0x2b, 0x02, 0x78, 0x39, 0x9a, 0x0e, 0xa2, 0x5b, 0x02, 0x78, 0x3f, 0x90, + 0x0e, 0xa2, 0x0b, 0x02, 0x78, 0x43, 0x8f, 0x0e, 0xa2, 0x03, 0x02, 0x78, + 0x47, 0x8e, 0x0e, 0xa1, 0xfb, 0x02, 0x78, 0x4b, 0x8d, 0x0e, 0xa1, 0xf3, + 0x02, 0x78, 0x51, 0x8b, 0x0e, 0xa1, 0xe3, 0x02, 0x78, 0x57, 0x9c, 0x0e, + 0xa2, 0x6b, 0x02, 0x78, 0x5d, 0x9b, 0x0e, 0xa2, 0x61, 0x99, 0x0e, 0xa2, + 0x51, 0x98, 0x0e, 0xa2, 0x49, 0x93, 0x0e, 0xa2, 0x21, 0x8c, 0x0e, 0xa1, + 0xe9, 0x8a, 0x0e, 0xa1, 0xd8, 0xc8, 0x9c, 0x0e, 0x0e, 0xb8, 0xd9, 0xc9, + 0xaa, 0x9e, 0x0e, 0xb8, 0xc9, 0xd3, 0x43, 0x00, 0x0e, 0xb8, 0xa8, 0x91, + 0x0e, 0xa2, 0xe3, 0x02, 0x78, 0x63, 0x92, 0x0e, 0xa2, 0xeb, 0x02, 0x78, + 0x67, 0x85, 0x0e, 0xa2, 0x83, 0x02, 0x78, 0x77, 0x97, 0x0e, 0xa3, 0x13, + 0x02, 0x78, 0x7d, 0x96, 0x0e, 0xa3, 0x0b, 0x02, 0x78, 0x83, 0x95, 0x0e, + 0xa3, 0x03, 0x02, 0x78, 0x8f, 0x88, 0x0e, 0xa2, 0x9b, 0x02, 0x78, 0x95, + 0x94, 0x0e, 0xa2, 0xfb, 0x02, 0x78, 0x9b, 0x9a, 0x0e, 0xa3, 0x2b, 0x02, + 0x78, 0xa1, 0x90, 0x0e, 0xa2, 0xdb, 0x02, 0x78, 0xa5, 0x8f, 0x0e, 0xa2, + 0xd3, 0x02, 0x78, 0xa9, 0x8e, 0x0e, 0xa2, 0xcb, 0x02, 0x78, 0xad, 0x8d, + 0x0e, 0xa2, 0xc3, 0x02, 0x78, 0xb3, 0x8b, 0x0e, 0xa2, 0xb3, 0x02, 0x78, + 0xb9, 0x87, 0x0e, 0xa2, 0x93, 0x02, 0x78, 0xbf, 0x9c, 0x0e, 0xa3, 0x3b, + 0x02, 0x78, 0xcb, 0x86, 0x0e, 0xa2, 0x8b, 0x02, 0x78, 0xd1, 0x89, 0x0e, + 0xa2, 0xa3, 0x02, 0x78, 0xdd, 0x84, 0x0e, 0xa2, 0x7b, 0x02, 0x78, 0xe3, + 0x83, 0x0e, 0xa2, 0x73, 0x02, 0x78, 0xe9, 0x9b, 0x0e, 0xa3, 0x31, 0x99, + 0x0e, 0xa3, 0x21, 0x98, 0x0e, 0xa3, 0x19, 0x93, 0x0e, 0xa2, 0xf1, 0x8c, + 0x0e, 0xa2, 0xb8, 0x45, 0x03, 0x14, 0xc2, 0x78, 0xef, 0x46, 0x07, 0x2f, + 0x42, 0x79, 0x93, 0xc4, 0x26, 0x78, 0x0e, 0xbe, 0xb9, 0xc5, 0x06, 0xdb, + 0x0e, 0xbe, 
0xb1, 0x15, 0xc2, 0x79, 0x9f, 0x08, 0xc2, 0x79, 0xab, 0x16, + 0xc2, 0x79, 0xb7, 0xc3, 0x05, 0x14, 0x0e, 0xbe, 0x79, 0xc4, 0x15, 0xe7, + 0x0e, 0xbe, 0x70, 0x86, 0x0e, 0xa0, 0x1b, 0x02, 0x79, 0xc3, 0x91, 0x0e, + 0xa0, 0x73, 0x02, 0x79, 0xcf, 0x92, 0x0e, 0xa0, 0x7b, 0x02, 0x79, 0xd3, + 0x85, 0x0e, 0xa0, 0x13, 0x02, 0x79, 0xe3, 0x97, 0x0e, 0xa0, 0xa3, 0x02, + 0x79, 0xe9, 0x96, 0x0e, 0xa0, 0x9b, 0x02, 0x79, 0xef, 0x95, 0x0e, 0xa0, + 0x93, 0x02, 0x79, 0xfe, 0x94, 0x0e, 0xa0, 0x8b, 0x02, 0x7a, 0x04, 0x9a, + 0x0e, 0xa0, 0xbb, 0x02, 0x7a, 0x0a, 0x90, 0x0e, 0xa0, 0x6b, 0x02, 0x7a, + 0x0e, 0x8f, 0x0e, 0xa0, 0x63, 0x02, 0x7a, 0x12, 0x8e, 0x0e, 0xa0, 0x5b, + 0x02, 0x7a, 0x16, 0x8d, 0x0e, 0xa0, 0x53, 0x02, 0x7a, 0x1c, 0x8b, 0x0e, + 0xa0, 0x43, 0x02, 0x7a, 0x22, 0x87, 0x0e, 0xa0, 0x23, 0x02, 0x7a, 0x28, + 0x9c, 0x0e, 0xa0, 0xcb, 0x02, 0x7a, 0x34, 0x89, 0x0e, 0xa0, 0x33, 0x02, + 0x7a, 0x3a, 0x84, 0x0e, 0xa0, 0x0b, 0x02, 0x7a, 0x40, 0x83, 0x0e, 0xa0, + 0x03, 0x02, 0x7a, 0x46, 0x9b, 0x0e, 0xa0, 0xc1, 0x99, 0x0e, 0xa0, 0xb1, + 0x98, 0x0e, 0xa0, 0xa9, 0x93, 0x0e, 0xa0, 0x81, 0x8c, 0x0e, 0xa0, 0x49, + 0x8a, 0x0e, 0xa0, 0x39, 0x88, 0x0e, 0xa0, 0x28, 0x12, 0xc2, 0x7a, 0x4c, + 0xca, 0x9c, 0xac, 0x0e, 0xba, 0xa1, 0xcc, 0x8b, 0x65, 0x0e, 0xba, 0x91, + 0xcc, 0x89, 0xfd, 0x0e, 0xba, 0x89, 0xce, 0x10, 0x3e, 0x0e, 0xba, 0x81, + 0x46, 0x03, 0x13, 0xc2, 0x7a, 0x5e, 0xc5, 0xdb, 0xf0, 0x0e, 0xb9, 0xa9, + 0x48, 0x0b, 0x17, 0x42, 0x7b, 0x02, 0xc8, 0x9c, 0x0e, 0x0e, 0xb7, 0x09, + 0xc9, 0xaa, 0x9e, 0x0e, 0xb6, 0xf9, 0xd3, 0x43, 0x00, 0x0e, 0xb6, 0xd8, + 0x46, 0x03, 0x13, 0xc2, 0x7b, 0xa3, 0x48, 0x0b, 0x17, 0x42, 0x7c, 0x0b, + 0xc4, 0x26, 0x78, 0x0e, 0xbf, 0xf9, 0xc5, 0x06, 0xdb, 0x0e, 0xbf, 0xf1, + 0x15, 0xc2, 0x7c, 0x73, 0x08, 0xc2, 0x7c, 0x7f, 0x16, 0xc2, 0x7c, 0x8b, + 0xc3, 0x05, 0x14, 0x0e, 0xbf, 0xb9, 0xc4, 0x15, 0xe7, 0x0e, 0xbf, 0xb0, + 0x9c, 0x0e, 0xb5, 0x19, 0x9b, 0x0e, 0xb5, 0x11, 0x9a, 0x0e, 0xb5, 0x09, + 0x99, 0x0e, 0xb5, 0x01, 0x98, 0x0e, 0xb4, 0xf9, 0x97, 0x0e, 0xb4, 0xf1, + 0x96, 0x0e, 0xb4, 0xe9, 0x95, 0x0e, 0xb4, 0xe1, 0x94, 0x0e, 0xb4, 0xd9, + 0x93, 0x0e, 0xb4, 0xd1, 0x92, 0x0e, 0xb4, 0xc9, 0x91, 0x0e, 0xb4, 0xc1, + 0x90, 0x0e, 0xb4, 0xb9, 0x8f, 0x0e, 0xb4, 0xb1, 0x8e, 0x0e, 0xb4, 0xa9, + 0x8d, 0x0e, 0xb4, 0xa1, 0x8c, 0x0e, 0xb4, 0x99, 0x8b, 0x0e, 0xb4, 0x91, + 0x8a, 0x0e, 0xb4, 0x89, 0x89, 0x0e, 0xb4, 0x81, 0x88, 0x0e, 0xb4, 0x79, + 0x87, 0x0e, 0xb4, 0x71, 0x86, 0x0e, 0xb4, 0x69, 0x85, 0x0e, 0xb4, 0x61, + 0x84, 0x0e, 0xb4, 0x59, 0x83, 0x0e, 0xb4, 0x50, 0x9c, 0x0e, 0xb4, 0x49, + 0x9b, 0x0e, 0xb4, 0x41, 0x9a, 0x0e, 0xb4, 0x39, 0x99, 0x0e, 0xb4, 0x31, + 0x98, 0x0e, 0xb4, 0x29, 0x97, 0x0e, 0xb4, 0x21, 0x96, 0x0e, 0xb4, 0x19, + 0x95, 0x0e, 0xb4, 0x11, 0x94, 0x0e, 0xb4, 0x09, 0x93, 0x0e, 0xb4, 0x01, + 0x92, 0x0e, 0xb3, 0xf9, 0x91, 0x0e, 0xb3, 0xf1, 0x90, 0x0e, 0xb3, 0xe9, + 0x8f, 0x0e, 0xb3, 0xe1, 0x8e, 0x0e, 0xb3, 0xd9, 0x8d, 0x0e, 0xb3, 0xd1, + 0x8c, 0x0e, 0xb3, 0xc9, 0x8b, 0x0e, 0xb3, 0xc1, 0x8a, 0x0e, 0xb3, 0xb9, + 0x89, 0x0e, 0xb3, 0xb1, 0x88, 0x0e, 0xb3, 0xa9, 0x87, 0x0e, 0xb3, 0xa1, + 0x86, 0x0e, 0xb3, 0x99, 0x85, 0x0e, 0xb3, 0x91, 0x84, 0x0e, 0xb3, 0x89, + 0x83, 0x0e, 0xb3, 0x80, 0x45, 0x58, 0xc2, 0xc2, 0x7c, 0x97, 0x46, 0x09, + 0x97, 0xc2, 0x7c, 0xd1, 0x47, 0xc7, 0x4a, 0xc2, 0x7c, 0xf5, 0x46, 0x03, + 0x13, 0xc2, 0x7d, 0x01, 0x48, 0x0b, 0x17, 0x42, 0x7d, 0x69, 0x46, 0x03, + 0x13, 0xc2, 0x7d, 0xd1, 0x48, 0x0b, 0x17, 0x42, 0x7e, 0x2d, 0xc4, 0x26, + 0x78, 0x0e, 0xbf, 0x09, 0xc5, 0x06, 0xdb, 0x0e, 0xbf, 0x01, 0x15, 0xc2, + 0x7e, 0x75, 0x08, 0xc2, 0x7e, 0x81, 0x16, 0xc2, 0x7e, 0x8d, 0xc3, 0x05, + 0x14, 0x0e, 
0xbe, 0xc9, 0xc4, 0x15, 0xe7, 0x0e, 0xbe, 0xc0, 0x9c, 0x0e, + 0xab, 0x59, 0x9b, 0x0e, 0xab, 0x51, 0x9a, 0x0e, 0xab, 0x49, 0x99, 0x0e, + 0xab, 0x41, 0x98, 0x0e, 0xab, 0x39, 0x97, 0x0e, 0xab, 0x31, 0x96, 0x0e, + 0xab, 0x29, 0x95, 0x0e, 0xab, 0x21, 0x94, 0x0e, 0xab, 0x19, 0x93, 0x0e, + 0xab, 0x11, 0x92, 0x0e, 0xab, 0x09, 0x91, 0x0e, 0xab, 0x01, 0x90, 0x0e, + 0xaa, 0xf9, 0x8f, 0x0e, 0xaa, 0xf1, 0x8e, 0x0e, 0xaa, 0xe9, 0x8d, 0x0e, + 0xaa, 0xe1, 0x8c, 0x0e, 0xaa, 0xd9, 0x8b, 0x0e, 0xaa, 0xd1, 0x8a, 0x0e, + 0xaa, 0xc9, 0x89, 0x0e, 0xaa, 0xc1, 0x88, 0x0e, 0xaa, 0xb9, 0x87, 0x0e, + 0xaa, 0xb1, 0x86, 0x0e, 0xaa, 0xa9, 0x85, 0x0e, 0xaa, 0xa1, 0x84, 0x0e, + 0xaa, 0x99, 0x83, 0x0e, 0xaa, 0x90, 0x9b, 0x0e, 0xaa, 0x81, 0x9a, 0x0e, + 0xaa, 0x79, 0x99, 0x0e, 0xaa, 0x71, 0x98, 0x0e, 0xaa, 0x69, 0x97, 0x0e, + 0xaa, 0x61, 0x96, 0x0e, 0xaa, 0x59, 0x95, 0x0e, 0xaa, 0x51, 0x91, 0x0e, + 0xaa, 0x31, 0x8f, 0x0e, 0xaa, 0x21, 0x8e, 0x0e, 0xaa, 0x19, 0x8d, 0x0e, + 0xaa, 0x11, 0x8c, 0x0e, 0xaa, 0x09, 0x8b, 0x0e, 0xaa, 0x01, 0x89, 0x0e, + 0xa9, 0xf1, 0x88, 0x0e, 0xa9, 0xe9, 0x87, 0x0e, 0xa9, 0xe1, 0x86, 0x0e, + 0xa9, 0xd9, 0x84, 0x0e, 0xa9, 0xc9, 0x83, 0x0e, 0xa9, 0xc0, 0x46, 0x03, + 0x13, 0xc2, 0x7e, 0x99, 0x48, 0x0b, 0x17, 0x42, 0x7f, 0x01, 0xd5, 0x35, + 0x36, 0x01, 0x3f, 0x79, 0x46, 0x01, 0xfc, 0xc2, 0x7f, 0x55, 0xd4, 0x38, + 0xf4, 0x01, 0x3f, 0x59, 0xcd, 0x0b, 0x91, 0x01, 0x3f, 0x48, 0xd6, 0x08, + 0x88, 0x01, 0x3f, 0x61, 0xce, 0x25, 0xad, 0x01, 0x3f, 0x30, 0xc4, 0x18, + 0x10, 0x08, 0xea, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0xea, 0xb0, 0xc3, 0x0d, + 0x14, 0x08, 0xea, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0xea, 0xa0, 0xc4, 0x02, + 0xde, 0x08, 0xea, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0xea, 0x90, 0x03, 0xc2, + 0x7f, 0x61, 0x91, 0x08, 0xe9, 0xe9, 0x87, 0x08, 0xe9, 0xd1, 0xc9, 0xb2, + 0x2d, 0x08, 0xe9, 0xb1, 0x97, 0x08, 0xe9, 0xa3, 0x02, 0x7f, 0x6d, 0x8b, + 0x08, 0xe9, 0x92, 0x02, 0x7f, 0x71, 0xc2, 0x00, 0x39, 0x08, 0xe9, 0x81, + 0xc2, 0x00, 0xd0, 0x08, 0xe8, 0xe1, 0x83, 0x08, 0xe8, 0xd9, 0x16, 0x42, + 0x7f, 0x75, 0xc3, 0x2d, 0xfd, 0x08, 0xe9, 0x79, 0xc2, 0x00, 0xd0, 0x08, + 0xe8, 0xa1, 0x83, 0x08, 0xe8, 0x98, 0xc3, 0x1d, 0x35, 0x08, 0xe9, 0x71, + 0xc2, 0x00, 0xd0, 0x08, 0xe8, 0x69, 0x83, 0x08, 0xe8, 0x60, 0xc2, 0x00, + 0xdb, 0x08, 0xe9, 0x69, 0x83, 0x08, 0xe9, 0x38, 0x83, 0x08, 0xe9, 0x59, + 0xc2, 0x0d, 0xf6, 0x08, 0xe9, 0x51, 0xc2, 0x00, 0xd0, 0x08, 0xe9, 0x48, + 0xc2, 0x00, 0xd0, 0x08, 0xe9, 0x19, 0x83, 0x08, 0xe9, 0x10, 0xc2, 0x00, + 0xd0, 0x08, 0xe9, 0x09, 0x83, 0x08, 0xe9, 0x00, 0x83, 0x08, 0xe8, 0xf9, + 0xc2, 0x00, 0xc1, 0x08, 0xe8, 0xd1, 0xc2, 0x19, 0x2c, 0x08, 0xe8, 0xa9, + 0xc2, 0x01, 0x30, 0x08, 0xe8, 0x80, 0xc2, 0x00, 0xd0, 0x08, 0xe8, 0xf1, + 0x83, 0x08, 0xe8, 0xe9, 0x06, 0x42, 0x7f, 0x7f, 0xc2, 0x00, 0xd0, 0x08, + 0xe8, 0x91, 0x83, 0x08, 0xe8, 0x88, 0xc2, 0x00, 0xd0, 0x08, 0xe8, 0x79, + 0x83, 0x08, 0xe8, 0x70, 0x97, 0x08, 0xe8, 0x59, 0x8b, 0x08, 0xe8, 0x41, + 0x83, 0x08, 0xe8, 0x08, 0x97, 0x08, 0xe8, 0x28, 0x8b, 0x08, 0xe8, 0x18, + 0xcb, 0x1e, 0x89, 0x08, 0xe5, 0xb1, 0xc8, 0x14, 0x38, 0x08, 0xe5, 0xa8, + 0x83, 0x08, 0xe5, 0x79, 0xc2, 0x00, 0xd0, 0x08, 0xe5, 0x71, 0x15, 0xc2, + 0x7f, 0x89, 0xc2, 0x00, 0xdb, 0x08, 0xe5, 0x59, 0xc2, 0x00, 0x39, 0x08, + 0xe5, 0x51, 0xc2, 0x19, 0x2c, 0x08, 0xe5, 0x49, 0x1c, 0xc2, 0x7f, 0x93, + 0xc2, 0x01, 0x4a, 0x08, 0xe5, 0x29, 0x06, 0xc2, 0x7f, 0x9d, 0x16, 0xc2, + 0x7f, 0xa7, 0xc2, 0x01, 0xc3, 0x08, 0xe5, 0x09, 0xc2, 0x01, 0x5d, 0x08, + 0xe5, 0x01, 0x12, 0xc2, 0x7f, 0xb5, 0x10, 0xc2, 0x7f, 0xbf, 0xc2, 0x25, + 0x3b, 0x08, 0xe4, 0xc1, 0x05, 0xc2, 0x7f, 0xcf, 0xc2, 0x01, 0x30, 0x08, + 0xe4, 0xa1, 
0x0d, 0x42, 0x7f, 0xd9, 0x83, 0x08, 0xe4, 0x69, 0xc2, 0x00, + 0xd0, 0x08, 0xe4, 0x60, 0x83, 0x08, 0xe4, 0x39, 0xc2, 0x00, 0xd0, 0x08, + 0xe4, 0x30, 0xc2, 0x02, 0x1c, 0x08, 0xe4, 0x21, 0x83, 0x08, 0xe3, 0xe0, + 0x15, 0xc2, 0x7f, 0xe3, 0xc2, 0x00, 0xd0, 0x08, 0xe3, 0xd9, 0x83, 0x08, + 0xe3, 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0xe3, 0xf9, 0x83, 0x08, 0xe3, 0xf0, + 0x83, 0x08, 0xe3, 0xe9, 0xc2, 0x19, 0x2c, 0x08, 0xe3, 0xc9, 0xc2, 0x01, + 0x30, 0x08, 0xe3, 0xa8, 0xc2, 0x00, 0xd0, 0x08, 0xe3, 0xb9, 0x83, 0x08, + 0xe3, 0xb0, 0xc2, 0x00, 0xd0, 0x08, 0xe3, 0x99, 0x83, 0x08, 0xe3, 0x90, + 0xd7, 0x29, 0x29, 0x00, 0x68, 0x01, 0xca, 0x1e, 0x8a, 0x00, 0x68, 0x09, + 0xce, 0x71, 0x5a, 0x00, 0x69, 0xe0, 0xc7, 0x14, 0x39, 0x00, 0x68, 0x11, + 0xc7, 0x7a, 0x7f, 0x00, 0x69, 0xe8, 0x0b, 0xc2, 0x7f, 0xed, 0xd2, 0x48, + 0xb3, 0x00, 0x69, 0xd8, 0xcd, 0x80, 0x36, 0x00, 0x68, 0x21, 0x47, 0xb2, + 0x2e, 0xc2, 0x7f, 0xf9, 0x83, 0x00, 0x69, 0xa8, 0x83, 0x00, 0x68, 0x31, + 0x8b, 0x00, 0x68, 0x81, 0x97, 0x00, 0x68, 0xa1, 0xc9, 0xa9, 0x90, 0x00, + 0x6a, 0xf8, 0x8b, 0x00, 0x68, 0x40, 0x97, 0x00, 0x68, 0x50, 0x87, 0x00, + 0x68, 0x78, 0x91, 0x00, 0x68, 0x98, 0x83, 0x00, 0x68, 0xa9, 0xc2, 0x00, + 0xd0, 0x00, 0x68, 0xb0, 0x83, 0x00, 0x68, 0xb9, 0xc2, 0x00, 0xd0, 0x00, + 0x68, 0xc0, 0xc2, 0x01, 0x30, 0x00, 0x68, 0xc9, 0xc2, 0x19, 0x2c, 0x00, + 0x68, 0xf1, 0x10, 0xc2, 0x80, 0x07, 0x83, 0x00, 0x69, 0x40, 0x83, 0x00, + 0x68, 0xd1, 0x0a, 0x42, 0x80, 0x11, 0x83, 0x00, 0x68, 0xe1, 0xc2, 0x00, + 0xd0, 0x00, 0x68, 0xe8, 0x16, 0xc2, 0x80, 0x1b, 0x83, 0x00, 0x69, 0x21, + 0xc2, 0x00, 0xd0, 0x00, 0x69, 0x28, 0x06, 0xc2, 0x80, 0x2b, 0x83, 0x00, + 0x69, 0x31, 0xc2, 0x00, 0xd0, 0x00, 0x69, 0x39, 0xc7, 0xc7, 0x58, 0x00, + 0x6a, 0x70, 0x83, 0x00, 0x69, 0x51, 0xc2, 0x00, 0xd0, 0x00, 0x69, 0x58, + 0x83, 0x00, 0x69, 0x61, 0xc2, 0x00, 0xd0, 0x00, 0x69, 0x68, 0x83, 0x00, + 0x69, 0x81, 0xc2, 0x00, 0x39, 0x00, 0x69, 0x88, 0x83, 0x00, 0x69, 0x91, + 0x0e, 0x42, 0x80, 0x35, 0xc2, 0x00, 0xd0, 0x00, 0x69, 0xb1, 0xc2, 0x0d, + 0xf6, 0x00, 0x69, 0xb9, 0x83, 0x00, 0x69, 0xc0, 0x83, 0x00, 0x69, 0xf1, + 0x8b, 0x00, 0x6a, 0x41, 0x97, 0x00, 0x6a, 0x60, 0x8b, 0x00, 0x6a, 0x00, + 0x97, 0x00, 0x6a, 0x10, 0x94, 0x00, 0x6a, 0x1b, 0x02, 0x80, 0x3f, 0x8e, + 0x00, 0x6b, 0x12, 0x02, 0x80, 0x43, 0x87, 0x00, 0x6a, 0x38, 0x91, 0x00, + 0x6a, 0x58, 0xd8, 0x22, 0xbb, 0x00, 0x6a, 0xc1, 0x08, 0xc2, 0x80, 0x47, + 0x16, 0xc2, 0x80, 0x53, 0xc7, 0x08, 0x79, 0x00, 0x6b, 0x99, 0xc4, 0x01, + 0xce, 0x00, 0x6b, 0xa1, 0xc9, 0x67, 0x38, 0x00, 0x6b, 0xb1, 0xc6, 0x06, + 0xdb, 0x00, 0x6b, 0xb8, 0xca, 0xa3, 0xfa, 0x00, 0x6a, 0xd1, 0xca, 0x1e, + 0x15, 0x00, 0x6a, 0xe9, 0xc8, 0x08, 0x79, 0x00, 0x6b, 0xa9, 0xca, 0xa7, + 0x88, 0x00, 0x6b, 0xc0, 0xc4, 0x15, 0xe7, 0x00, 0x6b, 0x31, 0xc3, 0x05, + 0x14, 0x00, 0x6b, 0x39, 0x16, 0xc2, 0x80, 0x5f, 0x08, 0xc2, 0x80, 0x6b, + 0x15, 0xc2, 0x80, 0x77, 0xc5, 0x06, 0xdb, 0x00, 0x6b, 0x71, 0xc4, 0x26, + 0x78, 0x00, 0x6b, 0x78, 0xc7, 0x0d, 0x04, 0x00, 0x6b, 0x89, 0xc8, 0x4b, + 0x94, 0x00, 0x6b, 0x90, 0x96, 0x08, 0x57, 0xa3, 0x02, 0x80, 0x83, 0xd3, + 0x44, 0x43, 0x08, 0x57, 0x90, 0xc8, 0x0d, 0x03, 0x08, 0x57, 0x78, 0xc5, + 0x28, 0xee, 0x08, 0x57, 0x71, 0xc2, 0x00, 0xc4, 0x08, 0x57, 0x68, 0xc2, + 0x39, 0x8b, 0x08, 0x57, 0x21, 0xc6, 0xd2, 0xc5, 0x08, 0x56, 0xa9, 0xc3, + 0x1e, 0x1b, 0x08, 0x56, 0x70, 0xc4, 0x3e, 0x5a, 0x08, 0x57, 0x19, 0xc3, + 0x11, 0xef, 0x08, 0x57, 0x11, 0x03, 0x42, 0x80, 0x89, 0xc4, 0xe0, 0x03, + 0x08, 0x57, 0x01, 0xc3, 0x2d, 0x8a, 0x08, 0x56, 0xf0, 0xc3, 0x2d, 0x8a, + 0x08, 0x56, 0xf9, 0xc3, 0x00, 0xb6, 0x08, 0x56, 0x88, 0xc4, 0x40, 0x95, + 0x08, 0x56, 
0xd1, 0xc3, 0x16, 0x5a, 0x08, 0x56, 0xc9, 0xc4, 0x36, 0xb5, + 0x08, 0x56, 0x00, 0xc6, 0xd2, 0xc5, 0x08, 0x56, 0xa1, 0xc5, 0x40, 0x9b, + 0x08, 0x56, 0x28, 0xc4, 0xdc, 0xe6, 0x08, 0x56, 0x91, 0xc3, 0x00, 0xb6, + 0x08, 0x56, 0x80, 0xc2, 0x00, 0x8e, 0x08, 0x56, 0x68, 0xc5, 0xd6, 0x78, + 0x08, 0x56, 0x61, 0xc4, 0x40, 0x95, 0x08, 0x56, 0x58, 0xc5, 0xd6, 0x78, + 0x08, 0x56, 0x51, 0xc4, 0x40, 0x95, 0x08, 0x56, 0x48, 0xc5, 0xd5, 0xdd, + 0x08, 0x56, 0x21, 0xc4, 0x9c, 0xa3, 0x08, 0x56, 0x18, 0xc4, 0x9b, 0x90, + 0x08, 0x56, 0x11, 0xc3, 0x1e, 0x1b, 0x08, 0x56, 0x08, 0xc2, 0x00, 0x74, + 0x00, 0x42, 0xc1, 0x96, 0x00, 0x42, 0xab, 0x02, 0x80, 0x95, 0x95, 0x00, + 0x42, 0x73, 0x02, 0x80, 0x99, 0x94, 0x00, 0x42, 0x99, 0x93, 0x00, 0x42, + 0x91, 0x92, 0x00, 0x42, 0x81, 0x90, 0x00, 0x42, 0x69, 0x8f, 0x00, 0x42, + 0x61, 0x8e, 0x00, 0x42, 0x59, 0x8d, 0x00, 0x42, 0x53, 0x02, 0x80, 0xa1, + 0x9c, 0x00, 0x42, 0x31, 0x8a, 0x00, 0x42, 0x21, 0x86, 0x00, 0x42, 0x19, + 0x89, 0x00, 0x42, 0x11, 0x84, 0x00, 0x42, 0x08, 0x90, 0x00, 0x42, 0x79, + 0x96, 0x00, 0x42, 0x38, 0x14, 0xc2, 0x80, 0xa7, 0xc2, 0x00, 0xd0, 0x08, + 0x8b, 0x89, 0xc2, 0x0d, 0xf6, 0x08, 0x8b, 0x81, 0xc2, 0x02, 0x41, 0x08, + 0x8b, 0x79, 0xc2, 0x00, 0xdb, 0x08, 0x8b, 0x71, 0xc2, 0x01, 0xc3, 0x08, + 0x8b, 0x61, 0x04, 0xc2, 0x80, 0xb1, 0x12, 0xc2, 0x80, 0xbb, 0x10, 0xc2, + 0x80, 0xc5, 0x06, 0xc2, 0x80, 0xd5, 0x16, 0xc2, 0x80, 0xe3, 0x0c, 0xc2, + 0x80, 0xf1, 0x05, 0xc2, 0x80, 0xfb, 0x09, 0xc2, 0x81, 0x05, 0x0d, 0xc2, + 0x81, 0x0f, 0x91, 0x08, 0x8a, 0xa1, 0x87, 0x08, 0x8a, 0x99, 0x97, 0x08, + 0x8a, 0x91, 0x8b, 0x08, 0x8a, 0x89, 0x83, 0x08, 0x8a, 0x80, 0x05, 0xc2, + 0x81, 0x19, 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0xb8, 0x05, 0xc2, 0x81, 0x25, + 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0xa8, 0x05, 0xc2, 0x81, 0x31, 0xc7, 0xc0, + 0xcf, 0x0f, 0x80, 0xb0, 0x05, 0xc2, 0x81, 0x3d, 0xc7, 0xc0, 0xcf, 0x0f, + 0x80, 0xc0, 0x05, 0xc2, 0x81, 0x49, 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0x80, + 0x05, 0xc2, 0x81, 0x55, 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0x88, 0x05, 0xc2, + 0x81, 0x61, 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0x90, 0x05, 0xc2, 0x81, 0x6d, + 0xc7, 0xc0, 0xcf, 0x0f, 0x80, 0x98, 0x05, 0xc2, 0x81, 0x79, 0xc7, 0xc0, + 0xcf, 0x0f, 0x80, 0xa0, 0x46, 0x10, 0x79, 0xc2, 0x81, 0x85, 0xc4, 0xe3, + 0x7f, 0x0f, 0x9d, 0xe0, 0xcb, 0x8d, 0x0b, 0x0f, 0x9c, 0xc0, 0x9a, 0x01, + 0x38, 0xa9, 0xc4, 0x00, 0xba, 0x00, 0x06, 0xba, 0x02, 0x81, 0xeb, 0xc5, + 0x13, 0x84, 0x01, 0x14, 0x71, 0xce, 0x1f, 0x18, 0x01, 0x14, 0x68, 0xc2, + 0x00, 0xd0, 0x08, 0x95, 0x41, 0xc2, 0x00, 0x39, 0x08, 0x95, 0x39, 0x83, + 0x08, 0x95, 0x10, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0xf9, 0x83, 0x08, 0x94, + 0xe8, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0xe1, 0x83, 0x08, 0x94, 0xd8, 0x83, + 0x08, 0x94, 0xd1, 0xc2, 0x00, 0xc1, 0x08, 0x94, 0xa9, 0xc2, 0x19, 0x2c, + 0x08, 0x94, 0x78, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0xc9, 0x83, 0x08, 0x94, + 0xc1, 0x06, 0x42, 0x81, 0xef, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0xb9, 0x83, + 0x08, 0x94, 0xb1, 0x16, 0x42, 0x81, 0xff, 0x83, 0x08, 0x94, 0x61, 0xc2, + 0x25, 0x3b, 0x08, 0x94, 0x68, 0x83, 0x08, 0x94, 0x51, 0xc2, 0x00, 0xd0, + 0x08, 0x94, 0x58, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0x41, 0x83, 0x08, 0x94, + 0x30, 0xc2, 0x00, 0xd0, 0x08, 0x94, 0x29, 0x83, 0x08, 0x94, 0x20, 0xc3, + 0x4d, 0x47, 0x05, 0x4f, 0x29, 0x45, 0x28, 0xb1, 0xc2, 0x82, 0x09, 0x48, + 0xba, 0xb2, 0x42, 0x82, 0x19, 0xc3, 0x02, 0x9f, 0x05, 0x53, 0xc9, 0xc3, + 0x05, 0x14, 0x05, 0x53, 0xc1, 0xcb, 0x0f, 0x09, 0x05, 0x53, 0xb8, 0x44, + 0x3d, 0xbb, 0x42, 0x82, 0x25, 0x48, 0x68, 0x93, 0x42, 0x82, 0x69, 0x83, + 0x00, 0x80, 0x59, 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x60, 0x83, 0x00, 0x82, + 0x83, 0x02, 
0x82, 0x89, 0x4b, 0x91, 0x8e, 0x42, 0x82, 0x8f, 0xc2, 0x19, + 0x2c, 0x00, 0x80, 0x51, 0x83, 0x00, 0x80, 0x78, 0x83, 0x00, 0x80, 0x69, + 0xc2, 0x00, 0xd0, 0x00, 0x80, 0x70, 0x87, 0x00, 0x81, 0x41, 0xc3, 0x20, + 0xf1, 0x00, 0x82, 0xd1, 0xc3, 0xe5, 0xf0, 0x00, 0x82, 0xd9, 0x42, 0x3f, + 0x98, 0x42, 0x82, 0x9b, 0xc3, 0x00, 0xcf, 0x00, 0x83, 0x29, 0xc3, 0x09, + 0x0e, 0x00, 0x83, 0x30, 0xc3, 0x3a, 0x09, 0x00, 0x83, 0x71, 0xc3, 0xdf, + 0x5b, 0x00, 0x83, 0x79, 0xc4, 0xaa, 0x0d, 0x00, 0x83, 0x80, 0x94, 0x00, + 0x82, 0x98, 0x8e, 0x00, 0x82, 0xa8, 0x8b, 0x00, 0x84, 0xe8, 0xc6, 0x00, + 0xd3, 0x00, 0x84, 0x28, 0x45, 0x03, 0x14, 0xc2, 0x82, 0xa3, 0x83, 0x01, + 0x85, 0xa9, 0x8b, 0x01, 0x85, 0xb9, 0x97, 0x01, 0x85, 0xc9, 0x87, 0x01, + 0x85, 0xd9, 0x91, 0x01, 0x85, 0xe8, 0x47, 0x78, 0xc0, 0x42, 0x82, 0xe0, + 0x8b, 0x01, 0x86, 0xfb, 0x02, 0x82, 0xee, 0x83, 0x01, 0x86, 0xf1, 0x97, + 0x01, 0x87, 0x01, 0x87, 0x01, 0x87, 0x09, 0x91, 0x01, 0x87, 0x10, 0x83, + 0x01, 0x85, 0x59, 0x8b, 0x01, 0x85, 0x69, 0x97, 0x01, 0x85, 0x79, 0x87, + 0x01, 0x85, 0x89, 0x91, 0x01, 0x85, 0x98, 0x83, 0x01, 0x85, 0x61, 0x8b, + 0x01, 0x85, 0x71, 0x97, 0x01, 0x85, 0x81, 0x87, 0x01, 0x85, 0x91, 0x91, + 0x01, 0x85, 0xa0, 0x83, 0x01, 0x85, 0xb1, 0x8b, 0x01, 0x85, 0xc1, 0x97, + 0x01, 0x85, 0xd1, 0x87, 0x01, 0x85, 0xe1, 0x91, 0x01, 0x85, 0xf0, 0x83, + 0x01, 0x85, 0xf9, 0x8b, 0x01, 0x86, 0x09, 0x97, 0x01, 0x86, 0x21, 0x87, + 0x01, 0x86, 0x31, 0x91, 0x01, 0x86, 0x40, 0x83, 0x01, 0x86, 0x01, 0x8b, + 0x01, 0x86, 0x11, 0x97, 0x01, 0x86, 0x29, 0x87, 0x01, 0x86, 0x39, 0x91, + 0x01, 0x86, 0x48, 0x83, 0x01, 0x86, 0x51, 0x8b, 0x01, 0x86, 0x59, 0x97, + 0x01, 0x86, 0x61, 0x87, 0x01, 0x86, 0x69, 0x91, 0x01, 0x86, 0x70, 0x83, + 0x01, 0x86, 0x79, 0x8b, 0x01, 0x86, 0x91, 0x97, 0x01, 0x86, 0xa9, 0x87, + 0x01, 0x86, 0xc1, 0x91, 0x01, 0x86, 0xd8, 0x83, 0x01, 0x86, 0x81, 0x8b, + 0x01, 0x86, 0x99, 0x97, 0x01, 0x86, 0xb1, 0x87, 0x01, 0x86, 0xc9, 0x91, + 0x01, 0x86, 0xe0, 0x83, 0x01, 0x86, 0x89, 0x8b, 0x01, 0x86, 0xa1, 0x97, + 0x01, 0x86, 0xb9, 0x87, 0x01, 0x86, 0xd1, 0x91, 0x01, 0x86, 0xe8, 0x83, + 0x01, 0x87, 0x21, 0x97, 0x01, 0x87, 0x31, 0x91, 0x01, 0x87, 0x40, 0x83, + 0x01, 0x87, 0x49, 0x8b, 0x01, 0x87, 0x51, 0x97, 0x01, 0x87, 0x59, 0x87, + 0x01, 0x87, 0x61, 0x91, 0x01, 0x87, 0x68, 0x83, 0x01, 0x87, 0x79, 0x8b, + 0x01, 0x87, 0x81, 0x87, 0x01, 0x87, 0x89, 0x91, 0x01, 0x87, 0x90, 0x97, + 0x01, 0x87, 0xa1, 0x83, 0x01, 0x87, 0xb9, 0x8b, 0x01, 0x87, 0xc1, 0x87, + 0x01, 0x87, 0xc9, 0x91, 0x01, 0x87, 0xd0, 0xc4, 0x1e, 0x97, 0x08, 0x85, + 0xc9, 0xc5, 0x40, 0xe7, 0x08, 0x84, 0x10, 0xc2, 0x00, 0xd0, 0x08, 0x84, + 0xd9, 0xc3, 0x40, 0xe2, 0x08, 0x84, 0xd1, 0x83, 0x08, 0x84, 0xc8, 0xc2, + 0x00, 0xd0, 0x08, 0x84, 0xc1, 0x83, 0x08, 0x84, 0xb8, 0xd2, 0x4a, 0x87, + 0x00, 0x64, 0x01, 0xc6, 0xc3, 0x62, 0x00, 0x64, 0x20, 0xc7, 0x14, 0x39, + 0x00, 0x64, 0x11, 0xc7, 0x7a, 0x7f, 0x00, 0x65, 0xe8, 0xc5, 0x40, 0xe7, + 0x00, 0x64, 0x19, 0xc4, 0x1e, 0x97, 0x00, 0x66, 0x68, 0x83, 0x00, 0x64, + 0x2b, 0x02, 0x82, 0xf4, 0x8b, 0x00, 0x64, 0x3b, 0x02, 0x83, 0x00, 0x97, + 0x00, 0x64, 0x4b, 0x02, 0x83, 0x04, 0x18, 0xc2, 0x83, 0x08, 0x87, 0x00, + 0x64, 0x73, 0x02, 0x83, 0x12, 0x91, 0x00, 0x64, 0x93, 0x02, 0x83, 0x16, + 0x0d, 0xc2, 0x83, 0x1a, 0x09, 0xc2, 0x83, 0x24, 0x10, 0xc2, 0x83, 0x2e, + 0x05, 0xc2, 0x83, 0x47, 0x0c, 0xc2, 0x83, 0x51, 0x16, 0xc2, 0x83, 0x5b, + 0x06, 0xc2, 0x83, 0x69, 0x12, 0xc2, 0x83, 0x77, 0x04, 0xc2, 0x83, 0x81, + 0xc2, 0x01, 0xc3, 0x00, 0x65, 0x71, 0xc2, 0x19, 0x2c, 0x00, 0x65, 0x79, + 0x14, 0xc2, 0x83, 0x8b, 0x0e, 0xc2, 0x83, 0x95, 0x15, 0xc2, 0x83, 0x9d, + 0xc2, 0x00, 
0xd0, 0x00, 0x65, 0xc9, 0xc2, 0x00, 0x87, 0x00, 0x66, 0xf0, + 0x83, 0x00, 0x65, 0xf1, 0x8b, 0x00, 0x66, 0x41, 0x97, 0x00, 0x66, 0x60, + 0x8b, 0x00, 0x66, 0x00, 0x97, 0x00, 0x66, 0x10, 0x94, 0x00, 0x66, 0x1b, + 0x02, 0x83, 0xad, 0x8e, 0x00, 0x67, 0x12, 0x02, 0x83, 0xb1, 0x87, 0x00, + 0x66, 0x38, 0x91, 0x00, 0x66, 0x58, 0xc2, 0x02, 0xa0, 0x00, 0x67, 0x41, + 0xc4, 0x02, 0xde, 0x00, 0x67, 0x48, 0xc3, 0x09, 0x9e, 0x00, 0x67, 0x51, + 0xc3, 0x0d, 0x14, 0x00, 0x67, 0x58, 0xc2, 0x22, 0xcc, 0x00, 0x67, 0x61, + 0xc4, 0x18, 0x10, 0x00, 0x67, 0x68, 0xc2, 0x02, 0x6f, 0x01, 0x78, 0x03, + 0x02, 0x83, 0xb5, 0x12, 0xc2, 0x83, 0xbb, 0xc2, 0x18, 0xb3, 0x01, 0x7b, + 0xe0, 0x0b, 0xc2, 0x83, 0xc7, 0x07, 0xc2, 0x83, 0xd7, 0x03, 0xc2, 0x83, + 0xe7, 0xc3, 0x08, 0x48, 0x01, 0x7d, 0x3a, 0x02, 0x83, 0xf3, 0x11, 0xc2, + 0x83, 0xf9, 0x0b, 0xc2, 0x84, 0x1c, 0x14, 0xc2, 0x84, 0x2c, 0x07, 0x42, + 0x84, 0x3c, 0x0e, 0xc2, 0x84, 0x48, 0x07, 0xc2, 0x84, 0x52, 0x12, 0xc2, + 0x84, 0x68, 0x05, 0xc2, 0x84, 0x7e, 0xc4, 0x03, 0x14, 0x01, 0x79, 0x49, + 0x0a, 0xc2, 0x84, 0x8a, 0xc4, 0xb0, 0xd3, 0x01, 0x79, 0xc9, 0x16, 0xc2, + 0x84, 0x92, 0xc5, 0x0b, 0x0a, 0x01, 0x7a, 0x29, 0xc2, 0x05, 0x1d, 0x01, + 0x7a, 0x39, 0x03, 0xc2, 0x84, 0xa0, 0xc4, 0x49, 0x26, 0x01, 0x7b, 0x11, + 0x0b, 0xc2, 0x84, 0xb0, 0xc3, 0x56, 0x1d, 0x01, 0x7b, 0x51, 0xc4, 0x0d, + 0xed, 0x01, 0x7d, 0x98, 0x11, 0xc2, 0x84, 0xbc, 0xcf, 0x67, 0xec, 0x01, + 0x78, 0xb1, 0x07, 0xc2, 0x84, 0xc6, 0x03, 0x42, 0x84, 0xd0, 0xc2, 0x02, + 0xa0, 0x01, 0x78, 0x33, 0x02, 0x84, 0xe0, 0x03, 0xc2, 0x84, 0xe6, 0xc2, + 0x00, 0xc4, 0x01, 0x78, 0xb9, 0x42, 0x00, 0x33, 0xc2, 0x84, 0xf8, 0x14, + 0xc2, 0x85, 0x04, 0x0b, 0xc2, 0x85, 0x16, 0x11, 0x42, 0x85, 0x22, 0xc2, + 0x00, 0xd1, 0x01, 0x78, 0x41, 0x11, 0xc2, 0x85, 0x2e, 0x07, 0xc2, 0x85, + 0x3c, 0x0b, 0x42, 0x85, 0x48, 0x10, 0xc2, 0x85, 0x54, 0xc4, 0x00, 0x2d, + 0x01, 0x78, 0x59, 0x03, 0xc2, 0x85, 0x60, 0xc3, 0x18, 0x11, 0x01, 0x7e, + 0x8b, 0x02, 0x85, 0x6b, 0xc2, 0x0c, 0x43, 0x01, 0x7b, 0x61, 0xc9, 0xa9, + 0xf3, 0x01, 0x7e, 0x58, 0x11, 0xc2, 0x85, 0x71, 0x0e, 0xc2, 0x85, 0x8d, + 0xc4, 0xdf, 0xbb, 0x01, 0x79, 0x31, 0x03, 0xc2, 0x85, 0x9d, 0xc3, 0x25, + 0x4d, 0x01, 0x7d, 0x10, 0xc2, 0x00, 0x89, 0x01, 0x78, 0x71, 0x10, 0x42, + 0x85, 0xaf, 0xc4, 0x00, 0x27, 0x01, 0x78, 0x91, 0x14, 0xc2, 0x85, 0xbb, + 0xc3, 0x01, 0xc8, 0x01, 0x7b, 0xf1, 0xc2, 0x00, 0x2d, 0x01, 0x7c, 0xb8, + 0x14, 0xc2, 0x85, 0xc7, 0x11, 0xc2, 0x85, 0xd3, 0x07, 0xc2, 0x85, 0xdf, + 0x03, 0xc2, 0x85, 0xeb, 0x0a, 0xc2, 0x85, 0xfa, 0x42, 0x00, 0x74, 0x42, + 0x86, 0x06, 0x0b, 0xc2, 0x86, 0x0e, 0xc3, 0xbb, 0x1c, 0x01, 0x79, 0x39, + 0x03, 0xc2, 0x86, 0x20, 0xc2, 0x00, 0xa8, 0x01, 0x7c, 0xd1, 0xc2, 0x05, + 0x1d, 0x01, 0x7c, 0xd8, 0xc4, 0x46, 0xf6, 0x01, 0x78, 0xe1, 0xc2, 0x24, + 0xe2, 0x01, 0x7a, 0x21, 0x42, 0x01, 0xa3, 0xc2, 0x86, 0x2e, 0xc2, 0x02, + 0x35, 0x01, 0x7b, 0xe8, 0x91, 0x01, 0x79, 0x0b, 0x02, 0x86, 0x3a, 0x42, + 0x00, 0x39, 0xc2, 0x86, 0x46, 0xc3, 0x00, 0xfe, 0x01, 0x7d, 0x41, 0xc4, + 0xe0, 0x07, 0x01, 0x7e, 0x08, 0x0b, 0xc2, 0x86, 0x52, 0x11, 0xc2, 0x86, + 0x62, 0x14, 0xc2, 0x86, 0x7e, 0x03, 0xc2, 0x86, 0x90, 0x0e, 0xc2, 0x86, + 0x9c, 0xc3, 0x0e, 0x8b, 0x01, 0x7c, 0xb0, 0x11, 0xc2, 0x86, 0xae, 0xc2, + 0x00, 0x3d, 0x01, 0x7b, 0xc8, 0xc2, 0x00, 0x33, 0x01, 0x7a, 0x89, 0x0b, + 0xc2, 0x86, 0xb8, 0x03, 0xc2, 0x86, 0xd0, 0xc6, 0x14, 0xdb, 0x01, 0x7b, + 0xd9, 0xc3, 0x65, 0xba, 0x01, 0x7c, 0xe1, 0x0e, 0xc2, 0x86, 0xe2, 0x14, + 0x42, 0x86, 0xec, 0xc2, 0x00, 0x06, 0x01, 0x7a, 0xf9, 0x94, 0x01, 0x7b, + 0xc0, 0xc5, 0xd9, 0xf2, 0x01, 0x7c, 0xa9, 0xc6, 0xd0, 0xaf, 0x01, 0x7d, + 0x28, 0xa2, 
0x0c, 0x66, 0xa9, 0xa1, 0x0c, 0x66, 0xa1, 0xa0, 0x0c, 0x66, + 0x99, 0x9f, 0x0c, 0x66, 0x91, 0x9e, 0x0c, 0x66, 0x89, 0x9d, 0x0c, 0x66, + 0x80, 0x88, 0x0c, 0x66, 0x79, 0x87, 0x0c, 0x66, 0x71, 0x86, 0x0c, 0x66, + 0x69, 0x85, 0x0c, 0x66, 0x61, 0x84, 0x0c, 0x66, 0x59, 0x83, 0x0c, 0x66, + 0x51, 0xa6, 0x0c, 0x66, 0x49, 0xa5, 0x0c, 0x66, 0x41, 0xa4, 0x0c, 0x66, + 0x39, 0xa3, 0x0c, 0x66, 0x31, 0xa2, 0x0c, 0x66, 0x29, 0xa1, 0x0c, 0x66, + 0x21, 0xa0, 0x0c, 0x66, 0x19, 0x9f, 0x0c, 0x66, 0x11, 0x9e, 0x0c, 0x66, + 0x09, 0x9d, 0x0c, 0x66, 0x00, 0x88, 0x0c, 0x65, 0xf9, 0x87, 0x0c, 0x65, + 0xf1, 0x86, 0x0c, 0x65, 0xe9, 0x85, 0x0c, 0x65, 0xe1, 0x84, 0x0c, 0x65, + 0xd9, 0x83, 0x0c, 0x65, 0xd1, 0xa6, 0x0c, 0x65, 0xc9, 0xa5, 0x0c, 0x65, + 0xc1, 0xa4, 0x0c, 0x65, 0xb9, 0xa3, 0x0c, 0x65, 0xb1, 0xa2, 0x0c, 0x65, + 0xa9, 0xa1, 0x0c, 0x65, 0xa1, 0xa0, 0x0c, 0x65, 0x99, 0x9f, 0x0c, 0x65, + 0x91, 0x9e, 0x0c, 0x65, 0x89, 0x9d, 0x0c, 0x65, 0x80, 0x88, 0x0c, 0x65, + 0x79, 0x87, 0x0c, 0x65, 0x71, 0x86, 0x0c, 0x65, 0x69, 0x85, 0x0c, 0x65, + 0x61, 0x84, 0x0c, 0x65, 0x59, 0x83, 0x0c, 0x65, 0x51, 0xa6, 0x0c, 0x65, + 0x49, 0xa5, 0x0c, 0x65, 0x41, 0xa4, 0x0c, 0x65, 0x39, 0xa3, 0x0c, 0x65, + 0x31, 0xa2, 0x0c, 0x65, 0x29, 0xa1, 0x0c, 0x65, 0x21, 0xa0, 0x0c, 0x65, + 0x19, 0x9f, 0x0c, 0x65, 0x11, 0x9e, 0x0c, 0x65, 0x09, 0x9d, 0x0c, 0x65, + 0x00, 0x88, 0x0c, 0x64, 0xf9, 0x87, 0x0c, 0x64, 0xf1, 0x86, 0x0c, 0x64, + 0xe9, 0x85, 0x0c, 0x64, 0xe1, 0x84, 0x0c, 0x64, 0xd9, 0x83, 0x0c, 0x64, + 0xd1, 0xa6, 0x0c, 0x64, 0xc9, 0xa5, 0x0c, 0x64, 0xc1, 0xa4, 0x0c, 0x64, + 0xb9, 0xa3, 0x0c, 0x64, 0xb1, 0xa2, 0x0c, 0x64, 0xa9, 0xa1, 0x0c, 0x64, + 0xa1, 0xa0, 0x0c, 0x64, 0x99, 0x9f, 0x0c, 0x64, 0x91, 0x9e, 0x0c, 0x64, + 0x89, 0x9d, 0x0c, 0x64, 0x80, 0x88, 0x0c, 0x64, 0x79, 0x87, 0x0c, 0x64, + 0x71, 0x86, 0x0c, 0x64, 0x69, 0x85, 0x0c, 0x64, 0x61, 0x84, 0x0c, 0x64, + 0x59, 0x83, 0x0c, 0x64, 0x51, 0xa6, 0x0c, 0x64, 0x49, 0xa5, 0x0c, 0x64, + 0x41, 0xa4, 0x0c, 0x64, 0x39, 0xa3, 0x0c, 0x64, 0x31, 0xa2, 0x0c, 0x64, + 0x29, 0xa1, 0x0c, 0x64, 0x21, 0xa0, 0x0c, 0x64, 0x19, 0x9f, 0x0c, 0x64, + 0x11, 0x9e, 0x0c, 0x64, 0x09, 0x9d, 0x0c, 0x64, 0x00, 0x88, 0x0c, 0x63, + 0xf9, 0x87, 0x0c, 0x63, 0xf1, 0x86, 0x0c, 0x63, 0xe9, 0x85, 0x0c, 0x63, + 0xe1, 0x84, 0x0c, 0x63, 0xd9, 0x83, 0x0c, 0x63, 0xd1, 0xa6, 0x0c, 0x63, + 0xc9, 0xa5, 0x0c, 0x63, 0xc1, 0xa4, 0x0c, 0x63, 0xb9, 0xa3, 0x0c, 0x63, + 0xb1, 0xa2, 0x0c, 0x63, 0xa9, 0xa1, 0x0c, 0x63, 0xa1, 0xa0, 0x0c, 0x63, + 0x99, 0x9f, 0x0c, 0x63, 0x91, 0x9e, 0x0c, 0x63, 0x89, 0x9d, 0x0c, 0x63, + 0x80, 0x88, 0x0c, 0x63, 0x79, 0x87, 0x0c, 0x63, 0x71, 0x86, 0x0c, 0x63, + 0x69, 0x85, 0x0c, 0x63, 0x61, 0x84, 0x0c, 0x63, 0x59, 0x83, 0x0c, 0x63, + 0x51, 0xa6, 0x0c, 0x63, 0x49, 0xa5, 0x0c, 0x63, 0x41, 0xa4, 0x0c, 0x63, + 0x39, 0xa3, 0x0c, 0x63, 0x31, 0xa2, 0x0c, 0x63, 0x29, 0xa1, 0x0c, 0x63, + 0x21, 0xa0, 0x0c, 0x63, 0x19, 0x9f, 0x0c, 0x63, 0x11, 0x9e, 0x0c, 0x63, + 0x09, 0x9d, 0x0c, 0x63, 0x00, 0x88, 0x0c, 0x62, 0xf9, 0x87, 0x0c, 0x62, + 0xf1, 0x86, 0x0c, 0x62, 0xe9, 0x85, 0x0c, 0x62, 0xe1, 0x84, 0x0c, 0x62, + 0xd9, 0x83, 0x0c, 0x62, 0xd1, 0xa6, 0x0c, 0x62, 0xc9, 0xa5, 0x0c, 0x62, + 0xc1, 0xa4, 0x0c, 0x62, 0xb9, 0xa3, 0x0c, 0x62, 0xb1, 0xa2, 0x0c, 0x62, + 0xa9, 0xa1, 0x0c, 0x62, 0xa1, 0xa0, 0x0c, 0x62, 0x99, 0x9f, 0x0c, 0x62, + 0x91, 0x9e, 0x0c, 0x62, 0x89, 0x9d, 0x0c, 0x62, 0x80, 0x88, 0x0c, 0x62, + 0x79, 0x87, 0x0c, 0x62, 0x71, 0x86, 0x0c, 0x62, 0x69, 0x85, 0x0c, 0x62, + 0x61, 0x84, 0x0c, 0x62, 0x59, 0x83, 0x0c, 0x62, 0x51, 0xa6, 0x0c, 0x62, + 0x49, 0xa5, 0x0c, 0x62, 0x41, 0xa4, 0x0c, 0x62, 0x39, 0xa3, 0x0c, 0x62, + 0x31, 0xa2, 
0x0c, 0x62, 0x29, 0xa1, 0x0c, 0x62, 0x21, 0xa0, 0x0c, 0x62, + 0x19, 0x9f, 0x0c, 0x62, 0x11, 0x9e, 0x0c, 0x62, 0x09, 0x9d, 0x0c, 0x62, + 0x00, 0x88, 0x0c, 0x61, 0xf9, 0x87, 0x0c, 0x61, 0xf1, 0x86, 0x0c, 0x61, + 0xe9, 0x85, 0x0c, 0x61, 0xe1, 0x84, 0x0c, 0x61, 0xd9, 0x83, 0x0c, 0x61, + 0xd1, 0xa6, 0x0c, 0x61, 0xc9, 0xa5, 0x0c, 0x61, 0xc1, 0xa4, 0x0c, 0x61, + 0xb9, 0xa3, 0x0c, 0x61, 0xb1, 0xa2, 0x0c, 0x61, 0xa9, 0xa1, 0x0c, 0x61, + 0xa1, 0xa0, 0x0c, 0x61, 0x99, 0x9f, 0x0c, 0x61, 0x91, 0x9e, 0x0c, 0x61, + 0x89, 0x9d, 0x0c, 0x61, 0x80, 0x88, 0x0c, 0x61, 0x79, 0x87, 0x0c, 0x61, + 0x71, 0x86, 0x0c, 0x61, 0x69, 0x85, 0x0c, 0x61, 0x61, 0x84, 0x0c, 0x61, + 0x59, 0x83, 0x0c, 0x61, 0x51, 0xa6, 0x0c, 0x61, 0x49, 0xa5, 0x0c, 0x61, + 0x41, 0xa4, 0x0c, 0x61, 0x39, 0xa3, 0x0c, 0x61, 0x31, 0xa2, 0x0c, 0x61, + 0x29, 0xa1, 0x0c, 0x61, 0x21, 0xa0, 0x0c, 0x61, 0x19, 0x9f, 0x0c, 0x61, + 0x11, 0x9e, 0x0c, 0x61, 0x09, 0x9d, 0x0c, 0x61, 0x00, 0x88, 0x0c, 0x60, + 0xf9, 0x87, 0x0c, 0x60, 0xf1, 0x86, 0x0c, 0x60, 0xe9, 0x85, 0x0c, 0x60, + 0xe1, 0x84, 0x0c, 0x60, 0xd9, 0x83, 0x0c, 0x60, 0xd1, 0xa6, 0x0c, 0x60, + 0xc9, 0xa5, 0x0c, 0x60, 0xc1, 0xa4, 0x0c, 0x60, 0xb9, 0xa3, 0x0c, 0x60, + 0xb1, 0xa2, 0x0c, 0x60, 0xa9, 0xa1, 0x0c, 0x60, 0xa1, 0xa0, 0x0c, 0x60, + 0x99, 0x9f, 0x0c, 0x60, 0x91, 0x9e, 0x0c, 0x60, 0x89, 0x9d, 0x0c, 0x60, + 0x80, 0x88, 0x0c, 0x60, 0x79, 0x87, 0x0c, 0x60, 0x71, 0x86, 0x0c, 0x60, + 0x69, 0x85, 0x0c, 0x60, 0x61, 0x84, 0x0c, 0x60, 0x59, 0x83, 0x0c, 0x60, + 0x51, 0xa6, 0x0c, 0x60, 0x49, 0xa5, 0x0c, 0x60, 0x41, 0xa4, 0x0c, 0x60, + 0x39, 0xa3, 0x0c, 0x60, 0x31, 0xa2, 0x0c, 0x60, 0x29, 0xa1, 0x0c, 0x60, + 0x21, 0xa0, 0x0c, 0x60, 0x19, 0x9f, 0x0c, 0x60, 0x11, 0x9e, 0x0c, 0x60, + 0x09, 0x9d, 0x0c, 0x60, 0x00, 0x88, 0x0c, 0x5f, 0xf9, 0x87, 0x0c, 0x5f, + 0xf1, 0x86, 0x0c, 0x5f, 0xe9, 0x85, 0x0c, 0x5f, 0xe1, 0x84, 0x0c, 0x5f, + 0xd9, 0x83, 0x0c, 0x5f, 0xd1, 0xa6, 0x0c, 0x5f, 0xc9, 0xa5, 0x0c, 0x5f, + 0xc1, 0xa4, 0x0c, 0x5f, 0xb9, 0xa3, 0x0c, 0x5f, 0xb1, 0xa2, 0x0c, 0x5f, + 0xa9, 0xa1, 0x0c, 0x5f, 0xa1, 0xa0, 0x0c, 0x5f, 0x99, 0x9f, 0x0c, 0x5f, + 0x91, 0x9e, 0x0c, 0x5f, 0x89, 0x9d, 0x0c, 0x5f, 0x80, 0x88, 0x0c, 0x5f, + 0x79, 0x87, 0x0c, 0x5f, 0x71, 0x86, 0x0c, 0x5f, 0x69, 0x85, 0x0c, 0x5f, + 0x61, 0x84, 0x0c, 0x5f, 0x59, 0x83, 0x0c, 0x5f, 0x51, 0xa6, 0x0c, 0x5f, + 0x49, 0xa5, 0x0c, 0x5f, 0x41, 0xa4, 0x0c, 0x5f, 0x39, 0xa3, 0x0c, 0x5f, + 0x31, 0xa2, 0x0c, 0x5f, 0x29, 0xa1, 0x0c, 0x5f, 0x21, 0xa0, 0x0c, 0x5f, + 0x19, 0x9f, 0x0c, 0x5f, 0x11, 0x9e, 0x0c, 0x5f, 0x09, 0x9d, 0x0c, 0x5f, + 0x00, 0x88, 0x0c, 0x5e, 0xf9, 0x87, 0x0c, 0x5e, 0xf1, 0x86, 0x0c, 0x5e, + 0xe9, 0x85, 0x0c, 0x5e, 0xe1, 0x84, 0x0c, 0x5e, 0xd9, 0x83, 0x0c, 0x5e, + 0xd1, 0xa6, 0x0c, 0x5e, 0xc9, 0xa5, 0x0c, 0x5e, 0xc1, 0xa4, 0x0c, 0x5e, + 0xb9, 0xa3, 0x0c, 0x5e, 0xb1, 0xa2, 0x0c, 0x5e, 0xa9, 0xa1, 0x0c, 0x5e, + 0xa1, 0xa0, 0x0c, 0x5e, 0x99, 0x9f, 0x0c, 0x5e, 0x91, 0x9e, 0x0c, 0x5e, + 0x89, 0x9d, 0x0c, 0x5e, 0x80, 0x88, 0x0c, 0x5e, 0x79, 0x87, 0x0c, 0x5e, + 0x71, 0x86, 0x0c, 0x5e, 0x69, 0x85, 0x0c, 0x5e, 0x61, 0x84, 0x0c, 0x5e, + 0x59, 0x83, 0x0c, 0x5e, 0x51, 0xa6, 0x0c, 0x5e, 0x49, 0xa5, 0x0c, 0x5e, + 0x41, 0xa4, 0x0c, 0x5e, 0x39, 0xa3, 0x0c, 0x5e, 0x31, 0xa2, 0x0c, 0x5e, + 0x29, 0xa1, 0x0c, 0x5e, 0x21, 0xa0, 0x0c, 0x5e, 0x19, 0x9f, 0x0c, 0x5e, + 0x11, 0x9e, 0x0c, 0x5e, 0x09, 0x9d, 0x0c, 0x5e, 0x00, 0x88, 0x0c, 0x5d, + 0xf9, 0x87, 0x0c, 0x5d, 0xf1, 0x86, 0x0c, 0x5d, 0xe9, 0x85, 0x0c, 0x5d, + 0xe1, 0x84, 0x0c, 0x5d, 0xd9, 0x83, 0x0c, 0x5d, 0xd1, 0xa6, 0x0c, 0x5d, + 0xc9, 0xa5, 0x0c, 0x5d, 0xc1, 0xa4, 0x0c, 0x5d, 0xb9, 0xa3, 0x0c, 0x5d, + 0xb1, 0xa2, 
0x0c, 0x5d, 0xa9, 0xa1, 0x0c, 0x5d, 0xa1, 0xa0, 0x0c, 0x5d, + 0x99, 0x9f, 0x0c, 0x5d, 0x91, 0x9e, 0x0c, 0x5d, 0x89, 0x9d, 0x0c, 0x5d, + 0x80, 0x88, 0x0c, 0x5d, 0x79, 0x87, 0x0c, 0x5d, 0x71, 0x86, 0x0c, 0x5d, + 0x69, 0x85, 0x0c, 0x5d, 0x61, 0x84, 0x0c, 0x5d, 0x59, 0x83, 0x0c, 0x5d, + 0x51, 0xa6, 0x0c, 0x5d, 0x49, 0xa5, 0x0c, 0x5d, 0x41, 0xa4, 0x0c, 0x5d, + 0x39, 0xa3, 0x0c, 0x5d, 0x31, 0xa2, 0x0c, 0x5d, 0x29, 0xa1, 0x0c, 0x5d, + 0x21, 0xa0, 0x0c, 0x5d, 0x19, 0x9f, 0x0c, 0x5d, 0x11, 0x9e, 0x0c, 0x5d, + 0x09, 0x9d, 0x0c, 0x5d, 0x00, 0x88, 0x0c, 0x5c, 0xf9, 0x87, 0x0c, 0x5c, + 0xf1, 0x86, 0x0c, 0x5c, 0xe9, 0x85, 0x0c, 0x5c, 0xe1, 0x84, 0x0c, 0x5c, + 0xd9, 0x83, 0x0c, 0x5c, 0xd1, 0xa6, 0x0c, 0x5c, 0xc9, 0xa5, 0x0c, 0x5c, + 0xc1, 0xa4, 0x0c, 0x5c, 0xb9, 0xa3, 0x0c, 0x5c, 0xb1, 0xa2, 0x0c, 0x5c, + 0xa9, 0xa1, 0x0c, 0x5c, 0xa1, 0xa0, 0x0c, 0x5c, 0x99, 0x9f, 0x0c, 0x5c, + 0x91, 0x9e, 0x0c, 0x5c, 0x89, 0x9d, 0x0c, 0x5c, 0x80, 0x88, 0x0c, 0x5c, + 0x79, 0x87, 0x0c, 0x5c, 0x71, 0x86, 0x0c, 0x5c, 0x69, 0x85, 0x0c, 0x5c, + 0x61, 0x84, 0x0c, 0x5c, 0x59, 0x83, 0x0c, 0x5c, 0x51, 0xa6, 0x0c, 0x5c, + 0x49, 0xa5, 0x0c, 0x5c, 0x41, 0xa4, 0x0c, 0x5c, 0x39, 0xa3, 0x0c, 0x5c, + 0x31, 0xa2, 0x0c, 0x5c, 0x29, 0xa1, 0x0c, 0x5c, 0x21, 0xa0, 0x0c, 0x5c, + 0x19, 0x9f, 0x0c, 0x5c, 0x11, 0x9e, 0x0c, 0x5c, 0x09, 0x9d, 0x0c, 0x5c, + 0x00, 0x88, 0x0c, 0x5b, 0xf9, 0x87, 0x0c, 0x5b, 0xf1, 0x86, 0x0c, 0x5b, + 0xe9, 0x85, 0x0c, 0x5b, 0xe1, 0x84, 0x0c, 0x5b, 0xd9, 0x83, 0x0c, 0x5b, + 0xd1, 0xa6, 0x0c, 0x5b, 0xc9, 0xa5, 0x0c, 0x5b, 0xc1, 0xa4, 0x0c, 0x5b, + 0xb9, 0xa3, 0x0c, 0x5b, 0xb1, 0xa2, 0x0c, 0x5b, 0xa9, 0xa1, 0x0c, 0x5b, + 0xa1, 0xa0, 0x0c, 0x5b, 0x99, 0x9f, 0x0c, 0x5b, 0x91, 0x9e, 0x0c, 0x5b, + 0x89, 0x9d, 0x0c, 0x5b, 0x80, 0x88, 0x0c, 0x5b, 0x79, 0x87, 0x0c, 0x5b, + 0x71, 0x86, 0x0c, 0x5b, 0x69, 0x85, 0x0c, 0x5b, 0x61, 0x84, 0x0c, 0x5b, + 0x59, 0x83, 0x0c, 0x5b, 0x51, 0xa6, 0x0c, 0x5b, 0x49, 0xa5, 0x0c, 0x5b, + 0x41, 0xa4, 0x0c, 0x5b, 0x39, 0xa3, 0x0c, 0x5b, 0x31, 0xa2, 0x0c, 0x5b, + 0x29, 0xa1, 0x0c, 0x5b, 0x21, 0xa0, 0x0c, 0x5b, 0x19, 0x9f, 0x0c, 0x5b, + 0x11, 0x9e, 0x0c, 0x5b, 0x09, 0x9d, 0x0c, 0x5b, 0x00, 0x88, 0x0c, 0x5a, + 0xf9, 0x87, 0x0c, 0x5a, 0xf1, 0x86, 0x0c, 0x5a, 0xe9, 0x85, 0x0c, 0x5a, + 0xe1, 0x84, 0x0c, 0x5a, 0xd9, 0x83, 0x0c, 0x5a, 0xd1, 0xa6, 0x0c, 0x5a, + 0xc9, 0xa5, 0x0c, 0x5a, 0xc1, 0xa4, 0x0c, 0x5a, 0xb9, 0xa3, 0x0c, 0x5a, + 0xb1, 0xa2, 0x0c, 0x5a, 0xa9, 0xa1, 0x0c, 0x5a, 0xa1, 0xa0, 0x0c, 0x5a, + 0x99, 0x9f, 0x0c, 0x5a, 0x91, 0x9e, 0x0c, 0x5a, 0x89, 0x9d, 0x0c, 0x5a, + 0x80, 0x88, 0x0c, 0x5a, 0x79, 0x87, 0x0c, 0x5a, 0x71, 0x86, 0x0c, 0x5a, + 0x69, 0x85, 0x0c, 0x5a, 0x61, 0x84, 0x0c, 0x5a, 0x59, 0x83, 0x0c, 0x5a, + 0x51, 0xa6, 0x0c, 0x5a, 0x49, 0xa5, 0x0c, 0x5a, 0x41, 0xa4, 0x0c, 0x5a, + 0x39, 0xa3, 0x0c, 0x5a, 0x31, 0xa2, 0x0c, 0x5a, 0x29, 0xa1, 0x0c, 0x5a, + 0x21, 0xa0, 0x0c, 0x5a, 0x19, 0x9f, 0x0c, 0x5a, 0x11, 0x9e, 0x0c, 0x5a, + 0x09, 0x9d, 0x0c, 0x5a, 0x00, 0x88, 0x0c, 0x59, 0xf9, 0x87, 0x0c, 0x59, + 0xf1, 0x86, 0x0c, 0x59, 0xe9, 0x85, 0x0c, 0x59, 0xe1, 0x84, 0x0c, 0x59, + 0xd9, 0x83, 0x0c, 0x59, 0xd1, 0xa6, 0x0c, 0x59, 0xc9, 0xa5, 0x0c, 0x59, + 0xc1, 0xa4, 0x0c, 0x59, 0xb9, 0xa3, 0x0c, 0x59, 0xb1, 0xa2, 0x0c, 0x59, + 0xa9, 0xa1, 0x0c, 0x59, 0xa1, 0xa0, 0x0c, 0x59, 0x99, 0x9f, 0x0c, 0x59, + 0x91, 0x9e, 0x0c, 0x59, 0x89, 0x9d, 0x0c, 0x59, 0x80, 0x88, 0x0c, 0x59, + 0x79, 0x87, 0x0c, 0x59, 0x71, 0x86, 0x0c, 0x59, 0x69, 0x85, 0x0c, 0x59, + 0x61, 0x84, 0x0c, 0x59, 0x59, 0x83, 0x0c, 0x59, 0x51, 0xa6, 0x0c, 0x59, + 0x49, 0xa5, 0x0c, 0x59, 0x41, 0xa4, 0x0c, 0x59, 0x39, 0xa3, 0x0c, 0x59, + 0x31, 0xa2, 
0x0c, 0x59, 0x29, 0xa1, 0x0c, 0x59, 0x21, 0xa0, 0x0c, 0x59, + 0x19, 0x9f, 0x0c, 0x59, 0x11, 0x9e, 0x0c, 0x59, 0x09, 0x9d, 0x0c, 0x59, + 0x00, 0x88, 0x0c, 0x58, 0xf9, 0x87, 0x0c, 0x58, 0xf1, 0x86, 0x0c, 0x58, + 0xe9, 0x85, 0x0c, 0x58, 0xe1, 0x84, 0x0c, 0x58, 0xd9, 0x83, 0x0c, 0x58, + 0xd1, 0xa6, 0x0c, 0x58, 0xc9, 0xa5, 0x0c, 0x58, 0xc1, 0xa4, 0x0c, 0x58, + 0xb9, 0xa3, 0x0c, 0x58, 0xb1, 0xa2, 0x0c, 0x58, 0xa9, 0xa1, 0x0c, 0x58, + 0xa1, 0xa0, 0x0c, 0x58, 0x99, 0x9f, 0x0c, 0x58, 0x91, 0x9e, 0x0c, 0x58, + 0x89, 0x9d, 0x0c, 0x58, 0x80, 0x88, 0x0c, 0x58, 0x79, 0x87, 0x0c, 0x58, + 0x71, 0x86, 0x0c, 0x58, 0x69, 0x85, 0x0c, 0x58, 0x61, 0x84, 0x0c, 0x58, + 0x59, 0x83, 0x0c, 0x58, 0x51, 0xa6, 0x0c, 0x58, 0x49, 0xa5, 0x0c, 0x58, + 0x41, 0xa4, 0x0c, 0x58, 0x39, 0xa3, 0x0c, 0x58, 0x31, 0xa2, 0x0c, 0x58, + 0x29, 0xa1, 0x0c, 0x58, 0x21, 0xa0, 0x0c, 0x58, 0x19, 0x9f, 0x0c, 0x58, + 0x11, 0x9e, 0x0c, 0x58, 0x09, 0x9d, 0x0c, 0x58, 0x00, 0xc2, 0x00, 0xd0, + 0x08, 0x96, 0x59, 0xc2, 0x0e, 0x9a, 0x08, 0x96, 0x49, 0x83, 0x08, 0x96, + 0x40, 0xc2, 0x00, 0xd0, 0x08, 0x96, 0x39, 0x83, 0x08, 0x96, 0x30, 0xc4, + 0xdb, 0xfb, 0x08, 0x91, 0xf1, 0xc5, 0xd7, 0x18, 0x08, 0x91, 0xb8, 0xc2, + 0x0e, 0x9a, 0x08, 0x90, 0xe1, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0xb9, 0x83, + 0x08, 0x90, 0xb0, 0xc2, 0x00, 0xd0, 0x08, 0x90, 0xa9, 0x83, 0x08, 0x90, + 0xa0, 0x02, 0xc2, 0x86, 0xf8, 0x00, 0x42, 0x87, 0x06, 0x43, 0x13, 0x3a, + 0xc2, 0x87, 0x12, 0x43, 0x71, 0xed, 0xc2, 0x87, 0x1a, 0xc9, 0xb0, 0xc5, + 0x00, 0xcf, 0x00, 0x44, 0xdf, 0x37, 0xc2, 0x87, 0x26, 0x43, 0x93, 0x74, + 0x42, 0x87, 0x32, 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0x89, 0xc4, 0xe0, 0xaf, + 0x00, 0xcf, 0x08, 0x12, 0xc2, 0x87, 0x3e, 0x04, 0xc2, 0x87, 0x4d, 0xc4, + 0xda, 0x97, 0x00, 0xbf, 0x89, 0xc3, 0x18, 0x91, 0x00, 0xbf, 0x80, 0xc7, + 0xc6, 0x24, 0x00, 0xbe, 0xe9, 0xcc, 0x89, 0x31, 0x00, 0xbe, 0xe1, 0xc4, + 0xe0, 0x0b, 0x00, 0xbe, 0x78, 0xc6, 0xcd, 0xb5, 0x00, 0xbe, 0xd1, 0xc3, + 0x00, 0xd0, 0x00, 0xbe, 0xa1, 0xc6, 0xcd, 0x97, 0x00, 0xbe, 0x70, 0xc5, + 0xdc, 0x22, 0x00, 0xbe, 0xc1, 0x03, 0x42, 0x87, 0x59, 0xce, 0x71, 0xe6, + 0x00, 0xbe, 0xb1, 0xc4, 0xe4, 0x1f, 0x00, 0xbe, 0x90, 0xca, 0xa3, 0x50, + 0x00, 0xbe, 0x69, 0xc6, 0xcc, 0xdd, 0x00, 0xbe, 0x50, 0xc4, 0xe4, 0x17, + 0x00, 0xbe, 0x61, 0xc6, 0xd3, 0xd9, 0x00, 0xbe, 0x38, 0x97, 0x00, 0xbe, + 0x29, 0x8b, 0x00, 0xbe, 0x19, 0x87, 0x00, 0xbe, 0x11, 0x83, 0x00, 0xbd, + 0xb0, 0x91, 0x00, 0xbe, 0x21, 0x87, 0x00, 0xbd, 0xf0, 0x87, 0x00, 0xbe, + 0x01, 0x8b, 0x00, 0xbd, 0xc0, 0x83, 0x00, 0xbd, 0xf9, 0x9b, 0x00, 0xbd, + 0xd0, 0x83, 0x00, 0xbd, 0xe9, 0x97, 0x00, 0xbd, 0xe0, 0x97, 0x00, 0xbd, + 0x99, 0x8b, 0x00, 0xbd, 0x81, 0x83, 0x00, 0xbd, 0x21, 0x93, 0x00, 0xbd, + 0x18, 0xc3, 0x02, 0x9f, 0x00, 0xbd, 0x91, 0xc3, 0x05, 0x14, 0x00, 0xbd, + 0x88, 0x97, 0x00, 0xbd, 0x4b, 0x02, 0x87, 0x6b, 0x8d, 0x00, 0xbd, 0x40, + 0x8b, 0x00, 0xbd, 0x30, 0x91, 0x00, 0xbc, 0xb9, 0x83, 0x00, 0xbc, 0xa8, + 0x91, 0x00, 0xbc, 0x91, 0x83, 0x00, 0xbc, 0x80, 0x91, 0x00, 0xbc, 0x69, + 0x83, 0x00, 0xbc, 0x58, 0x91, 0x00, 0xbc, 0x41, 0x83, 0x00, 0xbc, 0x30, + 0x91, 0x00, 0xbc, 0x19, 0x83, 0x00, 0xbc, 0x08, 0xca, 0x97, 0xf6, 0x08, + 0x52, 0xb9, 0x96, 0x08, 0x52, 0x80, 0x91, 0x08, 0x50, 0x31, 0x87, 0x08, + 0x50, 0x29, 0xc9, 0xb2, 0x2d, 0x08, 0x50, 0x19, 0x97, 0x08, 0x50, 0x11, + 0x8b, 0x08, 0x50, 0x08, 0x16, 0xc2, 0x87, 0x6f, 0xc2, 0x00, 0xd0, 0x08, + 0x50, 0xd9, 0x83, 0x08, 0x50, 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0x50, 0xe9, + 0x83, 0x08, 0x50, 0xe0, 0xcb, 0x20, 0x9d, 0x0f, 0xb0, 0xd1, 0xcc, 0x1d, + 0x4a, 0x0f, 0xb0, 0xc8, 0xd7, 0x2a, 0xf5, 0x0f, 0xd2, 0x68, 0x49, 0x2a, + 0xf5, 0x42, 
0x87, 0x79, 0xc3, 0x00, 0x74, 0x0f, 0xd0, 0x03, 0x02, 0x87, + 0x85, 0xc5, 0x56, 0xa5, 0x0f, 0xd0, 0x22, 0x02, 0x87, 0x8b, 0x49, 0x2a, + 0xf5, 0x42, 0x87, 0x91, 0x49, 0x2a, 0xf5, 0x42, 0x87, 0x9d, 0x49, 0x2a, + 0xf5, 0x42, 0x87, 0xa9, 0x0d, 0xc2, 0x87, 0xb5, 0xc5, 0xa8, 0xf7, 0x0f, + 0xd1, 0x59, 0xc4, 0xde, 0x83, 0x0f, 0xd1, 0x61, 0xc6, 0xca, 0xfd, 0x0f, + 0xd1, 0x69, 0xc4, 0xe3, 0x93, 0x0f, 0xd1, 0x78, 0x43, 0x00, 0xbc, 0xc2, + 0x87, 0xc1, 0xc4, 0xe3, 0x5b, 0x08, 0xa2, 0x50, 0xcd, 0x80, 0x36, 0x08, + 0xa2, 0xf9, 0x47, 0xb2, 0x2e, 0x42, 0x87, 0xe9, 0x83, 0x08, 0xa1, 0x99, + 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x89, 0xc2, 0x0d, 0xf6, 0x08, 0xa1, 0x90, + 0x83, 0x08, 0xa1, 0x19, 0xc2, 0x00, 0xc1, 0x08, 0xa0, 0xf1, 0x1b, 0xc2, + 0x87, 0xf7, 0x09, 0xc2, 0x88, 0x01, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x20, + 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x11, 0x83, 0x08, 0xa1, 0x09, 0x06, 0x42, + 0x88, 0x0b, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x01, 0x83, 0x08, 0xa0, 0xf9, + 0x16, 0x42, 0x88, 0x15, 0xc2, 0x00, 0xd0, 0x08, 0xa0, 0xb9, 0x83, 0x08, + 0xa0, 0xb0, 0xc2, 0x00, 0xd0, 0x08, 0xa0, 0xa9, 0x83, 0x08, 0xa0, 0xa0, + 0xc2, 0x00, 0xd0, 0x08, 0xa0, 0x89, 0x83, 0x08, 0xa0, 0x80, 0xc2, 0x00, + 0xd0, 0x08, 0xa0, 0x79, 0x83, 0x08, 0xa0, 0x70, 0x97, 0x08, 0xa0, 0x69, + 0x8b, 0x08, 0xa0, 0x59, 0x83, 0x08, 0xa0, 0x08, 0x97, 0x08, 0xa0, 0x28, + 0x8b, 0x08, 0xa0, 0x18, 0x83, 0x08, 0xa1, 0x29, 0xc2, 0x00, 0xd0, 0x08, + 0xa1, 0x30, 0x83, 0x08, 0xa1, 0x39, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x40, + 0x83, 0x08, 0xa1, 0x49, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x50, 0x83, 0x08, + 0xa1, 0x61, 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x68, 0x83, 0x08, 0xa1, 0x71, + 0xc2, 0x00, 0xd0, 0x08, 0xa1, 0x78, 0xc5, 0x0a, 0x8a, 0x08, 0xa2, 0xd1, + 0xc5, 0x86, 0x20, 0x08, 0xa2, 0x60, 0xc4, 0x26, 0x78, 0x08, 0xa2, 0xc9, + 0xc5, 0x06, 0xdb, 0x08, 0xa2, 0xc1, 0x15, 0xc2, 0x88, 0x1f, 0x08, 0xc2, + 0x88, 0x2b, 0x16, 0xc2, 0x88, 0x37, 0xc3, 0x05, 0x14, 0x08, 0xa2, 0x89, + 0xc4, 0x15, 0xe7, 0x08, 0xa2, 0x80, 0x97, 0x08, 0xa2, 0x09, 0x8b, 0x08, + 0xa1, 0xf9, 0x83, 0x08, 0xa1, 0xa8, 0x8e, 0x08, 0xa1, 0xe3, 0x02, 0x88, + 0x43, 0x94, 0x08, 0xa1, 0xd2, 0x02, 0x88, 0x47, 0x97, 0x08, 0xa1, 0xc8, + 0x8b, 0x08, 0xa1, 0xb8, 0x98, 0x00, 0xce, 0xf8, 0xcd, 0x78, 0xf3, 0x00, + 0xce, 0xd1, 0x49, 0xac, 0xb1, 0x42, 0x88, 0x4b, 0xc4, 0x26, 0x78, 0x00, + 0xce, 0xc9, 0xc5, 0x06, 0xdb, 0x00, 0xce, 0xc1, 0x15, 0xc2, 0x88, 0x53, + 0x08, 0xc2, 0x88, 0x5f, 0x16, 0xc2, 0x88, 0x6b, 0xc3, 0x05, 0x14, 0x00, + 0xce, 0x89, 0xc4, 0x15, 0xe7, 0x00, 0xce, 0x80, 0x46, 0x26, 0xf7, 0xc2, + 0x88, 0x77, 0x44, 0x05, 0x36, 0xc2, 0x88, 0x92, 0x45, 0x08, 0xcb, 0x42, + 0x88, 0xe0, 0x0b, 0xc2, 0x89, 0x2e, 0x97, 0x00, 0xcd, 0x9b, 0x02, 0x89, + 0x36, 0x91, 0x00, 0xcd, 0xbb, 0x02, 0x89, 0x45, 0x03, 0xc2, 0x89, 0x50, + 0x87, 0x00, 0xcd, 0xa9, 0xcf, 0x6a, 0x35, 0x00, 0xcd, 0x80, 0x9c, 0x0f, + 0x8c, 0x49, 0x9b, 0x0f, 0x8c, 0x41, 0x9a, 0x0f, 0x8c, 0x39, 0x99, 0x0f, + 0x8c, 0x31, 0x98, 0x0f, 0x8c, 0x29, 0x97, 0x0f, 0x8c, 0x21, 0x96, 0x0f, + 0x8c, 0x19, 0x95, 0x0f, 0x8c, 0x11, 0x94, 0x0f, 0x8c, 0x09, 0x93, 0x0f, + 0x8c, 0x01, 0x92, 0x0f, 0x8b, 0xf9, 0x91, 0x0f, 0x8b, 0xf1, 0x90, 0x0f, + 0x8b, 0xe9, 0x8f, 0x0f, 0x8b, 0xe1, 0x8e, 0x0f, 0x8b, 0xd9, 0x8d, 0x0f, + 0x8b, 0xd1, 0x8c, 0x0f, 0x8b, 0xc9, 0x8b, 0x0f, 0x8b, 0xc1, 0x8a, 0x0f, + 0x8b, 0xb9, 0x89, 0x0f, 0x8b, 0xb1, 0x88, 0x0f, 0x8b, 0xa9, 0x87, 0x0f, + 0x8b, 0xa1, 0x86, 0x0f, 0x8b, 0x99, 0x85, 0x0f, 0x8b, 0x91, 0x84, 0x0f, + 0x8b, 0x89, 0x83, 0x0f, 0x8b, 0x80, 0x16, 0xc2, 0x89, 0x5f, 0xc8, 0x4b, + 0x5f, 0x01, 0x27, 0x99, 0x07, 0xc2, 0x89, 0x6b, 0x15, 0xc2, 0x89, 0x77, + 0x08, 0x42, 
0x89, 0x83, 0x9c, 0x0f, 0x8b, 0x49, 0x9b, 0x0f, 0x8b, 0x41, + 0x9a, 0x0f, 0x8b, 0x39, 0x99, 0x0f, 0x8b, 0x31, 0x98, 0x0f, 0x8b, 0x29, + 0x97, 0x0f, 0x8b, 0x21, 0x96, 0x0f, 0x8b, 0x19, 0x95, 0x0f, 0x8b, 0x11, + 0x94, 0x0f, 0x8b, 0x09, 0x93, 0x0f, 0x8b, 0x01, 0x92, 0x0f, 0x8a, 0xf9, + 0x91, 0x0f, 0x8a, 0xf1, 0x90, 0x0f, 0x8a, 0xe9, 0x8f, 0x0f, 0x8a, 0xe1, + 0x8e, 0x0f, 0x8a, 0xd9, 0x8d, 0x0f, 0x8a, 0xd1, 0x8c, 0x0f, 0x8a, 0xc9, + 0x8b, 0x0f, 0x8a, 0xc1, 0x8a, 0x0f, 0x8a, 0xb9, 0x89, 0x0f, 0x8a, 0xb1, + 0x88, 0x0f, 0x8a, 0xa9, 0x87, 0x0f, 0x8a, 0xa1, 0x86, 0x0f, 0x8a, 0x99, + 0x85, 0x0f, 0x8a, 0x91, 0x84, 0x0f, 0x8a, 0x89, 0x83, 0x0f, 0x8a, 0x80, + 0x97, 0x08, 0xce, 0xe9, 0x8b, 0x08, 0xce, 0xd9, 0x83, 0x08, 0xce, 0x88, + 0x94, 0x08, 0xce, 0xb8, 0x97, 0x08, 0xce, 0xa8, 0x8b, 0x08, 0xce, 0x98, + 0xc7, 0x7a, 0x7f, 0x08, 0xcf, 0x09, 0xc7, 0x14, 0x39, 0x08, 0xce, 0xf0, + 0xc4, 0x1e, 0x97, 0x08, 0xcf, 0x01, 0xc5, 0x40, 0xe7, 0x08, 0xce, 0xf8, + 0xc2, 0x00, 0x39, 0x08, 0xce, 0x81, 0x83, 0x08, 0xce, 0x40, 0xc2, 0x00, + 0xdb, 0x08, 0xce, 0x79, 0x83, 0x08, 0xce, 0x48, 0x83, 0x08, 0xce, 0x69, + 0xc2, 0x0d, 0xf6, 0x08, 0xce, 0x61, 0xc2, 0x00, 0xd0, 0x08, 0xce, 0x58, + 0x83, 0x08, 0xce, 0x51, 0xc8, 0xb2, 0x2e, 0x08, 0xcd, 0x32, 0x02, 0x89, + 0x8f, 0xc2, 0x00, 0xd0, 0x08, 0xce, 0x29, 0x83, 0x08, 0xce, 0x20, 0xc2, + 0x00, 0xd0, 0x08, 0xce, 0x19, 0x83, 0x08, 0xce, 0x10, 0x83, 0x08, 0xce, + 0x09, 0xc2, 0x00, 0xc1, 0x08, 0xcd, 0xe1, 0xc2, 0x19, 0x2c, 0x08, 0xcd, + 0xb9, 0xc2, 0x01, 0x30, 0x08, 0xcd, 0x90, 0xc2, 0x00, 0xd0, 0x08, 0xce, + 0x01, 0x83, 0x08, 0xcd, 0xf9, 0x06, 0x42, 0x89, 0x93, 0xc2, 0x00, 0xd0, + 0x08, 0xcd, 0xf1, 0x83, 0x08, 0xcd, 0xe9, 0x16, 0x42, 0x89, 0x9d, 0xc2, + 0x00, 0xd0, 0x08, 0xcd, 0xb1, 0x83, 0x08, 0xcd, 0xa8, 0xc2, 0x00, 0xd0, + 0x08, 0xcd, 0xa1, 0x83, 0x08, 0xcd, 0x98, 0xc2, 0x00, 0xd0, 0x08, 0xcd, + 0x89, 0x83, 0x08, 0xcd, 0x80, 0xc2, 0x00, 0xd0, 0x08, 0xcd, 0x79, 0x83, + 0x08, 0xcd, 0x70, 0x97, 0x08, 0xcd, 0x69, 0x8b, 0x08, 0xcd, 0x59, 0x83, + 0x08, 0xcd, 0x08, 0x97, 0x08, 0xcd, 0x28, 0x8b, 0x08, 0xcd, 0x18, 0xc8, + 0x0d, 0x03, 0x08, 0x45, 0x78, 0x19, 0xc2, 0x89, 0xa7, 0xc2, 0x00, 0xc4, + 0x08, 0x45, 0x69, 0xc4, 0x02, 0xde, 0x08, 0x45, 0x48, 0xc3, 0x0d, 0x14, + 0x08, 0x45, 0x61, 0xc3, 0x09, 0x9e, 0x08, 0x45, 0x50, 0xc2, 0x39, 0x8b, + 0x08, 0x44, 0xf1, 0xc3, 0x1e, 0x1b, 0x08, 0x44, 0x58, 0xc3, 0x11, 0xef, + 0x08, 0x44, 0xe9, 0x03, 0x42, 0x89, 0xb1, 0xc4, 0x3e, 0x5a, 0x08, 0x44, + 0xe1, 0xc3, 0x20, 0x18, 0x08, 0x44, 0xa1, 0xc3, 0x00, 0x4e, 0x08, 0x44, + 0x91, 0xc6, 0xcf, 0xd7, 0x08, 0x44, 0x81, 0xc4, 0xe0, 0xe7, 0x08, 0x44, + 0x71, 0xc4, 0x4a, 0xb9, 0x08, 0x44, 0x61, 0xc2, 0x01, 0x7f, 0x08, 0x44, + 0x31, 0xc4, 0xe3, 0x27, 0x08, 0x44, 0x11, 0xc5, 0xa5, 0xfd, 0x08, 0x44, + 0x00, 0xc3, 0x16, 0x5a, 0x08, 0x44, 0xb9, 0xc4, 0x36, 0xb5, 0x08, 0x44, + 0x08, 0xc2, 0x00, 0x8e, 0x08, 0x44, 0x50, 0x49, 0x01, 0xaa, 0xc2, 0x89, + 0xbd, 0xcc, 0x82, 0x35, 0x01, 0x0e, 0xb9, 0x03, 0xc2, 0x89, 0xcf, 0xcb, + 0x01, 0xfc, 0x01, 0x58, 0x01, 0xcb, 0x94, 0x22, 0x01, 0x58, 0x41, 0xd5, + 0x01, 0x92, 0x01, 0x5b, 0x3b, 0x02, 0x89, 0xde, 0xd0, 0x5b, 0xc2, 0x0f, + 0xc2, 0xa8, 0x03, 0xc2, 0x89, 0xe4, 0xcc, 0x82, 0x35, 0x01, 0x0e, 0xb1, + 0x49, 0x01, 0xaa, 0xc2, 0x89, 0xf3, 0xcb, 0x01, 0xfc, 0x01, 0x58, 0x09, + 0xcb, 0x94, 0x22, 0x01, 0x58, 0x49, 0xd5, 0x01, 0x92, 0x01, 0x5b, 0x33, + 0x02, 0x8a, 0x05, 0xd0, 0x5b, 0xc2, 0x0f, 0xc2, 0xa0, 0x49, 0x53, 0xa9, + 0xc2, 0x8a, 0x0b, 0x43, 0x00, 0xe3, 0xc2, 0x8a, 0x17, 0xd0, 0x5f, 0x92, + 0x05, 0x41, 0xb9, 0xca, 0xa6, 0xc0, 0x05, 0x41, 0xc0, 0xe0, 0x0c, 0x07, + 0x01, 0x3d, 
0x78, 0xd7, 0x27, 0xb9, 0x01, 0x17, 0x19, 0xd4, 0x3c, 0x50, + 0x01, 0x17, 0x10, 0xc9, 0x2d, 0xd0, 0x01, 0x14, 0x29, 0xc7, 0x3a, 0x20, + 0x01, 0x14, 0x20, 0xc2, 0x00, 0xdb, 0x0f, 0x08, 0xf1, 0x83, 0x0f, 0x08, + 0xe0, 0xc2, 0x8d, 0x8f, 0x0f, 0x08, 0x99, 0xc2, 0x0d, 0xf6, 0x0f, 0x08, + 0x69, 0x83, 0x0f, 0x08, 0x10, 0x84, 0x0d, 0x97, 0xd9, 0x83, 0x0d, 0x97, + 0xd1, 0xa6, 0x0d, 0x97, 0xc9, 0xa5, 0x0d, 0x97, 0xc1, 0xa4, 0x0d, 0x97, + 0xb9, 0xa3, 0x0d, 0x97, 0xb1, 0xa2, 0x0d, 0x97, 0xa9, 0xa1, 0x0d, 0x97, + 0xa1, 0xa0, 0x0d, 0x97, 0x99, 0x9f, 0x0d, 0x97, 0x91, 0x9e, 0x0d, 0x97, + 0x89, 0x9d, 0x0d, 0x97, 0x80, 0x88, 0x0d, 0x97, 0x79, 0x87, 0x0d, 0x97, + 0x71, 0x86, 0x0d, 0x97, 0x69, 0x83, 0x0d, 0x97, 0x51, 0xa6, 0x0d, 0x97, + 0x49, 0xa2, 0x0d, 0x97, 0x29, 0x85, 0x0d, 0x97, 0x61, 0x84, 0x0d, 0x97, + 0x59, 0xa5, 0x0d, 0x97, 0x41, 0xa4, 0x0d, 0x97, 0x39, 0xa3, 0x0d, 0x97, + 0x31, 0xa1, 0x0d, 0x97, 0x21, 0xa0, 0x0d, 0x97, 0x19, 0x9f, 0x0d, 0x97, + 0x11, 0x9e, 0x0d, 0x97, 0x09, 0x9d, 0x0d, 0x97, 0x00, 0x83, 0x0d, 0x95, + 0xd1, 0x88, 0x0d, 0x95, 0xf9, 0x87, 0x0d, 0x95, 0xf1, 0xa6, 0x0d, 0x95, + 0xc9, 0xa5, 0x0d, 0x95, 0xc1, 0xa4, 0x0d, 0x95, 0xb9, 0xa3, 0x0d, 0x95, + 0xb1, 0xa2, 0x0d, 0x95, 0xa9, 0xa1, 0x0d, 0x95, 0xa1, 0xa0, 0x0d, 0x95, + 0x99, 0x9f, 0x0d, 0x95, 0x91, 0x9e, 0x0d, 0x95, 0x89, 0x9d, 0x0d, 0x95, + 0x81, 0x84, 0x0d, 0x95, 0xd9, 0x85, 0x0d, 0x95, 0xe1, 0x86, 0x0d, 0x95, + 0xe8, 0x83, 0x0d, 0x94, 0xd1, 0xa6, 0x0d, 0x94, 0xc9, 0xa5, 0x0d, 0x94, + 0xc1, 0xa4, 0x0d, 0x94, 0xb9, 0xa3, 0x0d, 0x94, 0xb1, 0xa2, 0x0d, 0x94, + 0xa9, 0xa1, 0x0d, 0x94, 0xa1, 0xa0, 0x0d, 0x94, 0x99, 0x9f, 0x0d, 0x94, + 0x91, 0x9e, 0x0d, 0x94, 0x89, 0x9d, 0x0d, 0x94, 0x81, 0x88, 0x0d, 0x94, + 0xf9, 0x87, 0x0d, 0x94, 0xf1, 0x86, 0x0d, 0x94, 0xe9, 0x85, 0x0d, 0x94, + 0xe1, 0x84, 0x0d, 0x94, 0xd8, 0x88, 0x0d, 0x94, 0x79, 0x87, 0x0d, 0x94, + 0x71, 0x86, 0x0d, 0x94, 0x69, 0x85, 0x0d, 0x94, 0x61, 0x84, 0x0d, 0x94, + 0x59, 0x83, 0x0d, 0x94, 0x51, 0xa6, 0x0d, 0x94, 0x49, 0xa5, 0x0d, 0x94, + 0x41, 0xa4, 0x0d, 0x94, 0x39, 0xa3, 0x0d, 0x94, 0x31, 0xa2, 0x0d, 0x94, + 0x29, 0xa1, 0x0d, 0x94, 0x21, 0xa0, 0x0d, 0x94, 0x19, 0x9f, 0x0d, 0x94, + 0x11, 0x9e, 0x0d, 0x94, 0x09, 0x9d, 0x0d, 0x94, 0x00, 0x88, 0x0d, 0x93, + 0xf9, 0x87, 0x0d, 0x93, 0xf1, 0x86, 0x0d, 0x93, 0xe9, 0x85, 0x0d, 0x93, + 0xe1, 0x84, 0x0d, 0x93, 0xd9, 0x83, 0x0d, 0x93, 0xd1, 0xa6, 0x0d, 0x93, + 0xc9, 0xa5, 0x0d, 0x93, 0xc1, 0xa4, 0x0d, 0x93, 0xb9, 0xa3, 0x0d, 0x93, + 0xb1, 0xa2, 0x0d, 0x93, 0xa9, 0xa1, 0x0d, 0x93, 0xa1, 0xa0, 0x0d, 0x93, + 0x99, 0x9f, 0x0d, 0x93, 0x91, 0x9e, 0x0d, 0x93, 0x89, 0x9d, 0x0d, 0x93, + 0x80, 0x88, 0x0d, 0x93, 0x79, 0x87, 0x0d, 0x93, 0x71, 0x86, 0x0d, 0x93, + 0x69, 0x85, 0x0d, 0x93, 0x61, 0x84, 0x0d, 0x93, 0x59, 0x83, 0x0d, 0x93, + 0x51, 0xa6, 0x0d, 0x93, 0x49, 0xa5, 0x0d, 0x93, 0x41, 0xa4, 0x0d, 0x93, + 0x39, 0xa3, 0x0d, 0x93, 0x31, 0xa2, 0x0d, 0x93, 0x29, 0xa1, 0x0d, 0x93, + 0x21, 0xa0, 0x0d, 0x93, 0x19, 0x9f, 0x0d, 0x93, 0x11, 0x9e, 0x0d, 0x93, + 0x09, 0x9d, 0x0d, 0x93, 0x00, 0x88, 0x0d, 0x92, 0xf9, 0x87, 0x0d, 0x92, + 0xf1, 0x86, 0x0d, 0x92, 0xe9, 0x85, 0x0d, 0x92, 0xe1, 0x84, 0x0d, 0x92, + 0xd9, 0x83, 0x0d, 0x92, 0xd1, 0xa6, 0x0d, 0x92, 0xc9, 0xa5, 0x0d, 0x92, + 0xc1, 0xa4, 0x0d, 0x92, 0xb9, 0xa3, 0x0d, 0x92, 0xb1, 0xa2, 0x0d, 0x92, + 0xa9, 0xa1, 0x0d, 0x92, 0xa1, 0xa0, 0x0d, 0x92, 0x99, 0x9f, 0x0d, 0x92, + 0x91, 0x9e, 0x0d, 0x92, 0x89, 0x9d, 0x0d, 0x92, 0x80, 0x88, 0x0d, 0x92, + 0x79, 0x87, 0x0d, 0x92, 0x71, 0x86, 0x0d, 0x92, 0x69, 0x85, 0x0d, 0x92, + 0x61, 0x84, 0x0d, 0x92, 0x59, 0x83, 0x0d, 0x92, 0x51, 0xa6, 0x0d, 0x92, + 0x49, 0xa5, 
0x0d, 0x92, 0x41, 0xa4, 0x0d, 0x92, 0x39, 0xa3, 0x0d, 0x92, + 0x31, 0xa2, 0x0d, 0x92, 0x29, 0xa1, 0x0d, 0x92, 0x21, 0xa0, 0x0d, 0x92, + 0x19, 0x9f, 0x0d, 0x92, 0x11, 0x9e, 0x0d, 0x92, 0x09, 0x9d, 0x0d, 0x92, + 0x00, 0x88, 0x0d, 0x91, 0xf9, 0x87, 0x0d, 0x91, 0xf1, 0x86, 0x0d, 0x91, + 0xe9, 0x85, 0x0d, 0x91, 0xe1, 0x84, 0x0d, 0x91, 0xd9, 0x83, 0x0d, 0x91, + 0xd1, 0xa6, 0x0d, 0x91, 0xc9, 0xa5, 0x0d, 0x91, 0xc1, 0xa4, 0x0d, 0x91, + 0xb9, 0xa3, 0x0d, 0x91, 0xb1, 0xa2, 0x0d, 0x91, 0xa9, 0xa1, 0x0d, 0x91, + 0xa1, 0xa0, 0x0d, 0x91, 0x99, 0x9f, 0x0d, 0x91, 0x91, 0x9e, 0x0d, 0x91, + 0x89, 0x9d, 0x0d, 0x91, 0x80, 0x88, 0x0d, 0x91, 0x79, 0x87, 0x0d, 0x91, + 0x71, 0x86, 0x0d, 0x91, 0x69, 0x85, 0x0d, 0x91, 0x61, 0x84, 0x0d, 0x91, + 0x59, 0x83, 0x0d, 0x91, 0x51, 0xa6, 0x0d, 0x91, 0x49, 0xa5, 0x0d, 0x91, + 0x41, 0xa4, 0x0d, 0x91, 0x39, 0xa3, 0x0d, 0x91, 0x31, 0xa2, 0x0d, 0x91, + 0x29, 0xa1, 0x0d, 0x91, 0x21, 0xa0, 0x0d, 0x91, 0x19, 0x9f, 0x0d, 0x91, + 0x11, 0x9e, 0x0d, 0x91, 0x09, 0x9d, 0x0d, 0x91, 0x00, 0x88, 0x0d, 0x90, + 0xf9, 0x87, 0x0d, 0x90, 0xf1, 0x86, 0x0d, 0x90, 0xe9, 0x85, 0x0d, 0x90, + 0xe1, 0x84, 0x0d, 0x90, 0xd9, 0x83, 0x0d, 0x90, 0xd1, 0xa6, 0x0d, 0x90, + 0xc9, 0xa5, 0x0d, 0x90, 0xc1, 0xa4, 0x0d, 0x90, 0xb9, 0xa3, 0x0d, 0x90, + 0xb1, 0xa2, 0x0d, 0x90, 0xa9, 0xa1, 0x0d, 0x90, 0xa1, 0xa0, 0x0d, 0x90, + 0x99, 0x9f, 0x0d, 0x90, 0x91, 0x9e, 0x0d, 0x90, 0x89, 0x9d, 0x0d, 0x90, + 0x80, 0x88, 0x0d, 0x90, 0x79, 0x87, 0x0d, 0x90, 0x71, 0x86, 0x0d, 0x90, + 0x69, 0x85, 0x0d, 0x90, 0x61, 0x84, 0x0d, 0x90, 0x59, 0x83, 0x0d, 0x90, + 0x51, 0xa6, 0x0d, 0x90, 0x49, 0xa5, 0x0d, 0x90, 0x41, 0xa4, 0x0d, 0x90, + 0x39, 0xa3, 0x0d, 0x90, 0x31, 0xa2, 0x0d, 0x90, 0x29, 0xa1, 0x0d, 0x90, + 0x21, 0xa0, 0x0d, 0x90, 0x19, 0x9f, 0x0d, 0x90, 0x11, 0x9e, 0x0d, 0x90, + 0x09, 0x9d, 0x0d, 0x90, 0x00, 0x88, 0x0d, 0x96, 0xf9, 0x87, 0x0d, 0x96, + 0xf1, 0x86, 0x0d, 0x96, 0xe9, 0x85, 0x0d, 0x96, 0xe1, 0x84, 0x0d, 0x96, + 0xd9, 0x83, 0x0d, 0x96, 0xd1, 0xa6, 0x0d, 0x96, 0xc9, 0xa5, 0x0d, 0x96, + 0xc1, 0xa4, 0x0d, 0x96, 0xb9, 0xa3, 0x0d, 0x96, 0xb1, 0xa2, 0x0d, 0x96, + 0xa9, 0xa1, 0x0d, 0x96, 0xa1, 0xa0, 0x0d, 0x96, 0x99, 0x9f, 0x0d, 0x96, + 0x91, 0x9e, 0x0d, 0x96, 0x89, 0x9d, 0x0d, 0x96, 0x80, 0x88, 0x0d, 0x96, + 0x79, 0x87, 0x0d, 0x96, 0x71, 0x86, 0x0d, 0x96, 0x69, 0x85, 0x0d, 0x96, + 0x61, 0x84, 0x0d, 0x96, 0x59, 0x83, 0x0d, 0x96, 0x51, 0xa6, 0x0d, 0x96, + 0x49, 0xa5, 0x0d, 0x96, 0x41, 0xa4, 0x0d, 0x96, 0x39, 0xa3, 0x0d, 0x96, + 0x31, 0xa2, 0x0d, 0x96, 0x29, 0xa1, 0x0d, 0x96, 0x21, 0xa0, 0x0d, 0x96, + 0x19, 0x9f, 0x0d, 0x96, 0x11, 0x9e, 0x0d, 0x96, 0x09, 0x9d, 0x0d, 0x96, + 0x00, 0x88, 0x0d, 0x95, 0x79, 0x87, 0x0d, 0x95, 0x71, 0x86, 0x0d, 0x95, + 0x69, 0x85, 0x0d, 0x95, 0x61, 0x84, 0x0d, 0x95, 0x59, 0x83, 0x0d, 0x95, + 0x51, 0xa6, 0x0d, 0x95, 0x49, 0xa5, 0x0d, 0x95, 0x41, 0xa4, 0x0d, 0x95, + 0x39, 0xa3, 0x0d, 0x95, 0x31, 0xa2, 0x0d, 0x95, 0x29, 0xa1, 0x0d, 0x95, + 0x21, 0xa0, 0x0d, 0x95, 0x19, 0x9f, 0x0d, 0x95, 0x11, 0x9e, 0x0d, 0x95, + 0x09, 0x9d, 0x0d, 0x95, 0x00, 0x88, 0x0d, 0x8f, 0xf9, 0x87, 0x0d, 0x8f, + 0xf1, 0x86, 0x0d, 0x8f, 0xe9, 0x85, 0x0d, 0x8f, 0xe1, 0x84, 0x0d, 0x8f, + 0xd9, 0x83, 0x0d, 0x8f, 0xd1, 0xa6, 0x0d, 0x8f, 0xc9, 0xa5, 0x0d, 0x8f, + 0xc1, 0xa4, 0x0d, 0x8f, 0xb9, 0xa3, 0x0d, 0x8f, 0xb1, 0xa2, 0x0d, 0x8f, + 0xa9, 0xa1, 0x0d, 0x8f, 0xa1, 0xa0, 0x0d, 0x8f, 0x99, 0x9f, 0x0d, 0x8f, + 0x91, 0x9e, 0x0d, 0x8f, 0x89, 0x9d, 0x0d, 0x8f, 0x80, 0x88, 0x0d, 0x8f, + 0x79, 0x87, 0x0d, 0x8f, 0x71, 0x86, 0x0d, 0x8f, 0x69, 0x85, 0x0d, 0x8f, + 0x61, 0x84, 0x0d, 0x8f, 0x59, 0x83, 0x0d, 0x8f, 0x51, 0xa6, 0x0d, 0x8f, + 0x49, 0xa5, 
0x0d, 0x8f, 0x41, 0xa4, 0x0d, 0x8f, 0x39, 0xa3, 0x0d, 0x8f, + 0x31, 0xa2, 0x0d, 0x8f, 0x29, 0xa1, 0x0d, 0x8f, 0x21, 0xa0, 0x0d, 0x8f, + 0x19, 0x9f, 0x0d, 0x8f, 0x11, 0x9e, 0x0d, 0x8f, 0x09, 0x9d, 0x0d, 0x8f, + 0x00, 0x88, 0x0d, 0x8e, 0xf9, 0x87, 0x0d, 0x8e, 0xf1, 0x86, 0x0d, 0x8e, + 0xe9, 0x85, 0x0d, 0x8e, 0xe1, 0x84, 0x0d, 0x8e, 0xd9, 0x83, 0x0d, 0x8e, + 0xd1, 0xa6, 0x0d, 0x8e, 0xc9, 0xa5, 0x0d, 0x8e, 0xc1, 0xa4, 0x0d, 0x8e, + 0xb9, 0xa3, 0x0d, 0x8e, 0xb1, 0xa2, 0x0d, 0x8e, 0xa9, 0xa1, 0x0d, 0x8e, + 0xa1, 0xa0, 0x0d, 0x8e, 0x99, 0x9f, 0x0d, 0x8e, 0x91, 0x9e, 0x0d, 0x8e, + 0x89, 0x9d, 0x0d, 0x8e, 0x80, 0x88, 0x0d, 0x8e, 0x79, 0x87, 0x0d, 0x8e, + 0x71, 0x86, 0x0d, 0x8e, 0x69, 0x85, 0x0d, 0x8e, 0x61, 0x84, 0x0d, 0x8e, + 0x59, 0x83, 0x0d, 0x8e, 0x51, 0xa6, 0x0d, 0x8e, 0x49, 0xa5, 0x0d, 0x8e, + 0x41, 0xa4, 0x0d, 0x8e, 0x39, 0xa3, 0x0d, 0x8e, 0x31, 0xa2, 0x0d, 0x8e, + 0x29, 0xa1, 0x0d, 0x8e, 0x21, 0xa0, 0x0d, 0x8e, 0x19, 0x9f, 0x0d, 0x8e, + 0x11, 0x9e, 0x0d, 0x8e, 0x09, 0x9d, 0x0d, 0x8e, 0x00, 0x88, 0x0d, 0x8d, + 0xf9, 0x87, 0x0d, 0x8d, 0xf1, 0x86, 0x0d, 0x8d, 0xe9, 0x85, 0x0d, 0x8d, + 0xe1, 0x84, 0x0d, 0x8d, 0xd9, 0x83, 0x0d, 0x8d, 0xd1, 0xa6, 0x0d, 0x8d, + 0xc9, 0xa5, 0x0d, 0x8d, 0xc1, 0xa4, 0x0d, 0x8d, 0xb9, 0xa3, 0x0d, 0x8d, + 0xb1, 0xa2, 0x0d, 0x8d, 0xa9, 0xa1, 0x0d, 0x8d, 0xa1, 0xa0, 0x0d, 0x8d, + 0x99, 0x9f, 0x0d, 0x8d, 0x91, 0x9e, 0x0d, 0x8d, 0x89, 0x9d, 0x0d, 0x8d, + 0x80, 0x88, 0x0d, 0x8d, 0x79, 0x87, 0x0d, 0x8d, 0x71, 0x86, 0x0d, 0x8d, + 0x69, 0x85, 0x0d, 0x8d, 0x61, 0x84, 0x0d, 0x8d, 0x59, 0x83, 0x0d, 0x8d, + 0x51, 0xa6, 0x0d, 0x8d, 0x49, 0xa5, 0x0d, 0x8d, 0x41, 0xa4, 0x0d, 0x8d, + 0x39, 0xa3, 0x0d, 0x8d, 0x31, 0xa2, 0x0d, 0x8d, 0x29, 0xa1, 0x0d, 0x8d, + 0x21, 0xa0, 0x0d, 0x8d, 0x19, 0x9f, 0x0d, 0x8d, 0x11, 0x9e, 0x0d, 0x8d, + 0x09, 0x9d, 0x0d, 0x8d, 0x00, 0x88, 0x0d, 0x8c, 0xf9, 0x87, 0x0d, 0x8c, + 0xf1, 0x86, 0x0d, 0x8c, 0xe9, 0x85, 0x0d, 0x8c, 0xe1, 0x84, 0x0d, 0x8c, + 0xd9, 0x83, 0x0d, 0x8c, 0xd1, 0xa6, 0x0d, 0x8c, 0xc9, 0xa5, 0x0d, 0x8c, + 0xc1, 0xa4, 0x0d, 0x8c, 0xb9, 0xa3, 0x0d, 0x8c, 0xb1, 0xa2, 0x0d, 0x8c, + 0xa9, 0xa1, 0x0d, 0x8c, 0xa1, 0xa0, 0x0d, 0x8c, 0x99, 0x9f, 0x0d, 0x8c, + 0x91, 0x9e, 0x0d, 0x8c, 0x89, 0x9d, 0x0d, 0x8c, 0x80, 0x88, 0x0d, 0x8c, + 0x79, 0x87, 0x0d, 0x8c, 0x71, 0x86, 0x0d, 0x8c, 0x69, 0x85, 0x0d, 0x8c, + 0x61, 0x84, 0x0d, 0x8c, 0x59, 0x83, 0x0d, 0x8c, 0x51, 0xa6, 0x0d, 0x8c, + 0x49, 0xa5, 0x0d, 0x8c, 0x41, 0xa4, 0x0d, 0x8c, 0x39, 0xa3, 0x0d, 0x8c, + 0x31, 0xa2, 0x0d, 0x8c, 0x29, 0xa1, 0x0d, 0x8c, 0x21, 0xa0, 0x0d, 0x8c, + 0x19, 0x9f, 0x0d, 0x8c, 0x11, 0x9e, 0x0d, 0x8c, 0x09, 0x9d, 0x0d, 0x8c, + 0x00, 0x88, 0x0d, 0x8b, 0xf9, 0x87, 0x0d, 0x8b, 0xf1, 0x86, 0x0d, 0x8b, + 0xe9, 0x85, 0x0d, 0x8b, 0xe1, 0x84, 0x0d, 0x8b, 0xd9, 0x83, 0x0d, 0x8b, + 0xd1, 0xa6, 0x0d, 0x8b, 0xc9, 0xa5, 0x0d, 0x8b, 0xc1, 0xa4, 0x0d, 0x8b, + 0xb9, 0xa3, 0x0d, 0x8b, 0xb1, 0xa2, 0x0d, 0x8b, 0xa9, 0xa1, 0x0d, 0x8b, + 0xa1, 0xa0, 0x0d, 0x8b, 0x99, 0x9f, 0x0d, 0x8b, 0x91, 0x9e, 0x0d, 0x8b, + 0x89, 0x9d, 0x0d, 0x8b, 0x80, 0xcd, 0x79, 0x1a, 0x01, 0x24, 0xd9, 0xcd, + 0x7d, 0xac, 0x01, 0x24, 0x98, 0xcf, 0x69, 0x36, 0x01, 0x24, 0xb9, 0xc2, + 0x00, 0xbc, 0x00, 0x01, 0x18, 0xc2, 0x00, 0x39, 0x00, 0x3f, 0x51, 0xc3, + 0x1c, 0x63, 0x00, 0x3f, 0x49, 0xc2, 0x25, 0x3b, 0x00, 0x3f, 0x40, 0xc7, + 0xc3, 0xf4, 0x00, 0x3f, 0x38, 0xc7, 0xc3, 0xf4, 0x00, 0x3f, 0x00, 0xd0, + 0x5b, 0xa2, 0x01, 0x4d, 0xa1, 0xd1, 0x02, 0x56, 0x01, 0x4d, 0x99, 0xd2, + 0x4b, 0xdd, 0x01, 0x4d, 0x91, 0xc7, 0x80, 0x70, 0x01, 0x4d, 0x88, 0x43, + 0x00, 0xaf, 0x42, 0x8a, 0x23, 0x03, 0xc2, 0x8a, 0x2d, 0xcd, 0x79, 0xa9, + 0x0f, 0x98, 
0x68, 0xa5, 0x09, 0x87, 0xe9, 0xa4, 0x09, 0x87, 0xe1, 0xa3, + 0x09, 0x87, 0xd9, 0xa1, 0x09, 0x87, 0xcb, 0x02, 0x8a, 0x39, 0xa0, 0x09, + 0x87, 0xc1, 0x9f, 0x09, 0x87, 0xb9, 0x9e, 0x09, 0x87, 0xb1, 0x9d, 0x09, + 0x87, 0xa8, 0xa6, 0x09, 0x87, 0xa1, 0xa5, 0x09, 0x87, 0x93, 0x02, 0x8a, + 0x3d, 0xa4, 0x09, 0x87, 0x89, 0xa3, 0x09, 0x87, 0x81, 0xa2, 0x09, 0x87, + 0x79, 0xa1, 0x09, 0x87, 0x71, 0xa0, 0x09, 0x87, 0x69, 0x9f, 0x09, 0x87, + 0x61, 0x9e, 0x09, 0x87, 0x59, 0x9d, 0x09, 0x87, 0x4a, 0x02, 0x8a, 0x41, + 0xa6, 0x09, 0x87, 0x41, 0xa5, 0x09, 0x87, 0x39, 0xa4, 0x09, 0x87, 0x2b, + 0x02, 0x8a, 0x45, 0xa3, 0x09, 0x87, 0x1b, 0x02, 0x8a, 0x49, 0xa2, 0x09, + 0x87, 0x11, 0xa1, 0x09, 0x87, 0x09, 0xa0, 0x09, 0x87, 0x01, 0x9f, 0x09, + 0x86, 0xf9, 0x9e, 0x09, 0x86, 0xf1, 0x9d, 0x09, 0x86, 0xe8, 0xa6, 0x09, + 0x86, 0xdb, 0x02, 0x8a, 0x4d, 0xa5, 0x09, 0x86, 0xcb, 0x02, 0x8a, 0x51, + 0xa4, 0x09, 0x86, 0xc1, 0xa3, 0x09, 0x86, 0xb9, 0xa2, 0x09, 0x86, 0xb1, + 0xa1, 0x09, 0x86, 0xa9, 0xa0, 0x09, 0x86, 0xa1, 0x9f, 0x09, 0x86, 0x99, + 0x9e, 0x09, 0x86, 0x90, 0x83, 0x09, 0x82, 0xa8, 0x9e, 0x09, 0x94, 0xd1, + 0x9d, 0x09, 0x94, 0xba, 0x02, 0x8a, 0x55, 0xa6, 0x09, 0x94, 0xb1, 0xa5, + 0x09, 0x94, 0xa9, 0xa4, 0x09, 0x94, 0xa1, 0xa3, 0x09, 0x94, 0x99, 0xa2, + 0x09, 0x94, 0x91, 0xa1, 0x09, 0x94, 0x89, 0xa0, 0x09, 0x94, 0x81, 0x9f, + 0x09, 0x94, 0x79, 0x9e, 0x09, 0x94, 0x71, 0x9d, 0x09, 0x94, 0x68, 0xa6, + 0x09, 0x94, 0x61, 0xa5, 0x09, 0x94, 0x59, 0xa4, 0x09, 0x94, 0x51, 0xa3, + 0x09, 0x94, 0x2b, 0x02, 0x8a, 0x5d, 0xa2, 0x09, 0x94, 0x21, 0xa1, 0x09, + 0x94, 0x19, 0xa0, 0x09, 0x94, 0x0b, 0x02, 0x8a, 0x6d, 0x9f, 0x09, 0x94, + 0x01, 0x9e, 0x09, 0x93, 0xf9, 0x9d, 0x09, 0x93, 0xea, 0x02, 0x8a, 0x71, + 0xa6, 0x09, 0x93, 0xdb, 0x02, 0x8a, 0x75, 0xa5, 0x09, 0x93, 0xd1, 0xa4, + 0x09, 0x93, 0xc9, 0xa3, 0x09, 0x93, 0xc1, 0xa2, 0x09, 0x93, 0xb3, 0x02, + 0x8a, 0x79, 0xa1, 0x09, 0x93, 0xa3, 0x02, 0x8a, 0x7d, 0xa0, 0x09, 0x93, + 0x99, 0x9f, 0x09, 0x93, 0x91, 0x9e, 0x09, 0x93, 0x89, 0x9d, 0x09, 0x93, + 0x7a, 0x02, 0x8a, 0x81, 0xa6, 0x09, 0x93, 0x6b, 0x02, 0x8a, 0x85, 0xa5, + 0x09, 0x93, 0x61, 0xa4, 0x09, 0x93, 0x59, 0xa3, 0x09, 0x93, 0x51, 0xa2, + 0x09, 0x93, 0x49, 0xa1, 0x09, 0x93, 0x41, 0xa0, 0x09, 0x93, 0x39, 0x9f, + 0x09, 0x93, 0x31, 0x9e, 0x09, 0x93, 0x29, 0x9d, 0x09, 0x93, 0x0a, 0x02, + 0x8a, 0x89, 0xa6, 0x09, 0x93, 0x01, 0xa5, 0x09, 0x92, 0xf9, 0xa4, 0x09, + 0x92, 0xf1, 0xa3, 0x09, 0x92, 0xbb, 0x02, 0x8a, 0x95, 0xa2, 0x09, 0x92, + 0xab, 0x02, 0x8a, 0xad, 0xa1, 0x09, 0x92, 0xa1, 0xa0, 0x09, 0x92, 0x99, + 0x9f, 0x09, 0x92, 0x91, 0x9e, 0x09, 0x92, 0x82, 0x02, 0x8a, 0xb1, 0xc3, + 0x02, 0x39, 0x09, 0xa1, 0xa9, 0xc5, 0xdd, 0xd0, 0x09, 0xa1, 0x98, 0xc3, + 0x02, 0x39, 0x09, 0xa1, 0xa1, 0xc5, 0xdd, 0xd0, 0x09, 0xa1, 0x90, 0xa2, + 0x09, 0x8c, 0xd1, 0xa1, 0x09, 0x8c, 0xc9, 0xa0, 0x09, 0x8c, 0xc1, 0x9f, + 0x09, 0x8c, 0xb9, 0x9e, 0x09, 0x8c, 0xab, 0x02, 0x8a, 0xb5, 0x9d, 0x09, + 0x8c, 0x9a, 0x02, 0x8a, 0xb9, 0xa6, 0x09, 0x8c, 0x8b, 0x02, 0x8a, 0xbd, + 0xa5, 0x09, 0x8c, 0x81, 0xa4, 0x09, 0x8c, 0x79, 0xa3, 0x09, 0x8c, 0x71, + 0xa2, 0x09, 0x8c, 0x63, 0x02, 0x8a, 0xc1, 0xa1, 0x09, 0x8c, 0x59, 0xa0, + 0x09, 0x8c, 0x51, 0x9f, 0x09, 0x8c, 0x49, 0x9e, 0x09, 0x8c, 0x40, 0x83, + 0x09, 0x8c, 0x28, 0x83, 0x09, 0x9d, 0x70, 0xa6, 0x09, 0x9d, 0x61, 0xa5, + 0x09, 0x9d, 0x59, 0xa4, 0x09, 0x9d, 0x4b, 0x02, 0x8a, 0xc5, 0xa3, 0x09, + 0x9d, 0x41, 0xa2, 0x09, 0x9d, 0x39, 0xa1, 0x09, 0x9d, 0x31, 0xa0, 0x09, + 0x9d, 0x23, 0x02, 0x8a, 0xc9, 0x9f, 0x09, 0x9d, 0x19, 0x9e, 0x09, 0x9d, + 0x0b, 0x02, 0x8a, 0xcd, 0x9d, 0x09, 0x9c, 0xfa, 0x02, 0x8a, 0xd1, 0xa6, + 0x09, 0x9c, 
0xeb, 0x02, 0x8a, 0xd5, 0xa5, 0x09, 0x9c, 0xdb, 0x02, 0x8a, + 0xd9, 0xa4, 0x09, 0x9c, 0xd1, 0xa3, 0x09, 0x9c, 0xc9, 0xa2, 0x09, 0x9c, + 0xc1, 0xa1, 0x09, 0x9c, 0xb9, 0xa0, 0x09, 0x9c, 0xab, 0x02, 0x8a, 0xdd, + 0x9f, 0x09, 0x9c, 0xa1, 0x9e, 0x09, 0x9c, 0x99, 0x9d, 0x09, 0x9c, 0x32, + 0x02, 0x8a, 0xe1, 0xa6, 0x09, 0x9c, 0x29, 0xa5, 0x09, 0x9c, 0x21, 0xa4, + 0x09, 0x9c, 0x19, 0xa3, 0x09, 0x9c, 0x11, 0xa2, 0x09, 0x9c, 0x09, 0xa1, + 0x09, 0x9c, 0x01, 0xa0, 0x09, 0x9b, 0xf9, 0x9f, 0x09, 0x9b, 0xe3, 0x02, + 0x8b, 0x11, 0x9e, 0x09, 0x9b, 0xc3, 0x02, 0x8b, 0x19, 0x9d, 0x09, 0x9b, + 0xb8, 0xa6, 0x09, 0x9b, 0xb1, 0xa5, 0x09, 0x9b, 0xa9, 0xa4, 0x09, 0x9b, + 0x93, 0x02, 0x8b, 0x25, 0xa3, 0x09, 0x9b, 0x89, 0xa2, 0x09, 0x9b, 0x81, + 0xa1, 0x09, 0x9b, 0x79, 0xa0, 0x09, 0x9b, 0x71, 0x9f, 0x09, 0x9b, 0x63, + 0x02, 0x8b, 0x2d, 0x9e, 0x09, 0x9b, 0x12, 0x02, 0x8b, 0x31, 0x9f, 0x09, + 0xa1, 0x71, 0x9e, 0x09, 0xa1, 0x69, 0x9d, 0x09, 0xa1, 0x60, 0xa6, 0x09, + 0xa1, 0x59, 0xa5, 0x09, 0xa1, 0x51, 0xa4, 0x09, 0xa1, 0x49, 0xa3, 0x09, + 0xa1, 0x41, 0xa2, 0x09, 0xa1, 0x39, 0xa1, 0x09, 0xa1, 0x31, 0xa0, 0x09, + 0xa1, 0x29, 0x9f, 0x09, 0xa1, 0x21, 0x9e, 0x09, 0xa1, 0x19, 0x9d, 0x09, + 0xa1, 0x10, 0xa6, 0x09, 0xa1, 0x09, 0xa5, 0x09, 0xa1, 0x01, 0xa4, 0x09, + 0xa0, 0xf9, 0xa3, 0x09, 0xa0, 0xf1, 0xa2, 0x09, 0xa0, 0xe9, 0xa1, 0x09, + 0xa0, 0xe1, 0xa0, 0x09, 0xa0, 0xd9, 0x9f, 0x09, 0xa0, 0xd1, 0x9e, 0x09, + 0xa0, 0xc9, 0x9d, 0x09, 0xa0, 0xc0, 0xa6, 0x09, 0xa0, 0xb9, 0xa5, 0x09, + 0xa0, 0xb1, 0xa4, 0x09, 0xa0, 0x9b, 0x02, 0x8b, 0x55, 0xa3, 0x09, 0xa0, + 0x91, 0xa2, 0x09, 0xa0, 0x89, 0xa1, 0x09, 0xa0, 0x81, 0xa0, 0x09, 0xa0, + 0x79, 0x9f, 0x09, 0xa0, 0x71, 0x9e, 0x09, 0xa0, 0x68, 0xa6, 0x09, 0x82, + 0x71, 0xa5, 0x09, 0x82, 0x69, 0xa4, 0x09, 0x82, 0x61, 0xa3, 0x09, 0x82, + 0x59, 0xa2, 0x09, 0x82, 0x51, 0xa1, 0x09, 0x82, 0x49, 0xa0, 0x09, 0x82, + 0x41, 0x9f, 0x09, 0x82, 0x39, 0x9e, 0x09, 0x82, 0x31, 0x9d, 0x09, 0x82, + 0x28, 0xa6, 0x09, 0x82, 0x21, 0xa5, 0x09, 0x82, 0x19, 0xa4, 0x09, 0x82, + 0x11, 0xa3, 0x09, 0x82, 0x09, 0xa2, 0x09, 0x82, 0x01, 0xa1, 0x09, 0x81, + 0xf9, 0xa0, 0x09, 0x81, 0xf1, 0x9f, 0x09, 0x81, 0xe9, 0x9e, 0x09, 0x81, + 0xe1, 0x9d, 0x09, 0x81, 0xd8, 0xa6, 0x09, 0x81, 0xd1, 0xa5, 0x09, 0x81, + 0xc9, 0xa4, 0x09, 0x81, 0xc1, 0xa3, 0x09, 0x81, 0xb9, 0xa2, 0x09, 0x81, + 0xab, 0x02, 0x8b, 0x5d, 0xa1, 0x09, 0x81, 0xa1, 0xa0, 0x09, 0x81, 0x93, + 0x02, 0x8b, 0x61, 0x9f, 0x09, 0x81, 0x83, 0x02, 0x8b, 0x65, 0x9e, 0x09, + 0x81, 0x79, 0x9d, 0x09, 0x81, 0x6a, 0x02, 0x8b, 0x69, 0xa6, 0x09, 0x81, + 0x61, 0xa5, 0x09, 0x81, 0x59, 0xa4, 0x09, 0x81, 0x51, 0xa3, 0x09, 0x81, + 0x49, 0xa2, 0x09, 0x81, 0x41, 0xa1, 0x09, 0x81, 0x39, 0xa0, 0x09, 0x81, + 0x31, 0x9f, 0x09, 0x81, 0x23, 0x02, 0x8b, 0x6d, 0x9e, 0x09, 0x81, 0x19, + 0x9d, 0x09, 0x81, 0x10, 0xa6, 0x09, 0x81, 0x09, 0xa5, 0x09, 0x81, 0x01, + 0xa4, 0x09, 0x80, 0xf9, 0xa3, 0x09, 0x80, 0xf1, 0xa2, 0x09, 0x80, 0xe9, + 0xa1, 0x09, 0x80, 0xe1, 0xa0, 0x09, 0x80, 0xd9, 0x9f, 0x09, 0x80, 0xd1, + 0x9e, 0x09, 0x80, 0xc9, 0x9d, 0x09, 0x80, 0xc0, 0xa6, 0x09, 0x80, 0xb9, + 0xa5, 0x09, 0x80, 0xb1, 0xa4, 0x09, 0x80, 0xa3, 0x02, 0x8b, 0x71, 0xa3, + 0x09, 0x80, 0x99, 0xa2, 0x09, 0x80, 0x91, 0xa1, 0x09, 0x80, 0x83, 0x02, + 0x8b, 0x75, 0xa0, 0x09, 0x80, 0x79, 0x9f, 0x09, 0x80, 0x71, 0x9e, 0x09, + 0x80, 0x69, 0x9d, 0x09, 0x80, 0x60, 0xa6, 0x09, 0x80, 0x59, 0xa5, 0x09, + 0x80, 0x51, 0xa4, 0x09, 0x80, 0x49, 0xa3, 0x09, 0x80, 0x33, 0x02, 0x8b, + 0x79, 0xa2, 0x09, 0x80, 0x23, 0x02, 0x8b, 0x81, 0xa1, 0x09, 0x80, 0x19, + 0xa0, 0x09, 0x80, 0x11, 0x9f, 0x09, 0x80, 0x09, 0x9e, 0x09, 0x80, 0x00, + 0x8a, 0x09, 
0xa0, 0x61, 0x89, 0x09, 0xa0, 0x59, 0x88, 0x09, 0xa0, 0x51, + 0x87, 0x09, 0xa0, 0x49, 0x86, 0x09, 0xa0, 0x41, 0x85, 0x09, 0xa0, 0x39, + 0x84, 0x09, 0xa0, 0x31, 0x83, 0x09, 0xa0, 0x28, 0x8b, 0x09, 0xa0, 0x19, + 0x8a, 0x09, 0xa0, 0x11, 0x89, 0x09, 0xa0, 0x09, 0x88, 0x09, 0xa0, 0x01, + 0x87, 0x09, 0x9f, 0xf9, 0x86, 0x09, 0x9f, 0xf1, 0x85, 0x09, 0x9f, 0xe9, + 0x84, 0x09, 0x9f, 0xe1, 0x83, 0x09, 0x9f, 0xd8, 0x83, 0x09, 0x9f, 0x80, + 0x83, 0x09, 0x9f, 0x70, 0x84, 0x09, 0x9f, 0x61, 0x83, 0x09, 0x9f, 0x58, + 0x86, 0x09, 0x9f, 0x49, 0x85, 0x09, 0x9f, 0x41, 0x84, 0x09, 0x9f, 0x39, + 0x83, 0x09, 0x9f, 0x30, 0x83, 0x09, 0x9e, 0x68, 0x83, 0x09, 0x9e, 0x30, + 0x83, 0x09, 0x9e, 0x20, 0x83, 0x09, 0x9e, 0x00, 0x83, 0x09, 0x9d, 0xd8, + 0x83, 0x09, 0x9d, 0xc8, 0x83, 0x09, 0x9d, 0x90, 0x83, 0x09, 0x9a, 0xb8, + 0x83, 0x09, 0x9a, 0x98, 0x83, 0x09, 0x9a, 0x60, 0x84, 0x09, 0x99, 0xd1, + 0x83, 0x09, 0x99, 0xc8, 0x83, 0x09, 0x99, 0x78, 0x83, 0x09, 0x99, 0x68, + 0x83, 0x09, 0x98, 0xe0, 0x83, 0x09, 0x98, 0xb0, 0x83, 0x09, 0x98, 0x98, + 0x83, 0x09, 0x98, 0x88, 0x83, 0x09, 0x98, 0x78, 0x83, 0x09, 0x98, 0x50, + 0x83, 0x09, 0x97, 0xd8, 0x84, 0x09, 0x97, 0x89, 0x83, 0x09, 0x97, 0x80, + 0x83, 0x09, 0x97, 0x30, 0x84, 0x09, 0x97, 0x11, 0x83, 0x09, 0x97, 0x08, + 0x83, 0x09, 0x96, 0xc0, 0x83, 0x09, 0x96, 0x98, 0x83, 0x09, 0x96, 0x18, + 0x83, 0x09, 0x95, 0xe0, 0x84, 0x09, 0x95, 0xa1, 0x83, 0x09, 0x95, 0x98, + 0x83, 0x09, 0x95, 0x88, 0x83, 0x09, 0x94, 0xf8, 0x83, 0x09, 0x94, 0xe0, + 0x9f, 0x09, 0x92, 0x73, 0x02, 0x8b, 0x85, 0x9e, 0x09, 0x92, 0x69, 0x9d, + 0x09, 0x92, 0x60, 0xa6, 0x09, 0x92, 0x59, 0xa5, 0x09, 0x92, 0x4b, 0x02, + 0x8b, 0x89, 0xa4, 0x09, 0x92, 0x41, 0xa3, 0x09, 0x92, 0x39, 0xa2, 0x09, + 0x92, 0x31, 0xa1, 0x09, 0x92, 0x29, 0xa0, 0x09, 0x92, 0x21, 0x9f, 0x09, + 0x92, 0x19, 0x9e, 0x09, 0x92, 0x0b, 0x02, 0x8b, 0x8d, 0x9d, 0x09, 0x91, + 0xfa, 0x02, 0x8b, 0x91, 0xa6, 0x09, 0x91, 0xf1, 0xa5, 0x09, 0x91, 0xe9, + 0xa4, 0x09, 0x91, 0xe1, 0xa3, 0x09, 0x91, 0xd9, 0xa2, 0x09, 0x91, 0xd1, + 0xa1, 0x09, 0x91, 0xc9, 0xa0, 0x09, 0x91, 0xc1, 0x9f, 0x09, 0x91, 0xb9, + 0x9e, 0x09, 0x91, 0xb0, 0xa6, 0x09, 0x91, 0xa1, 0xa5, 0x09, 0x91, 0x99, + 0xa4, 0x09, 0x91, 0x8b, 0x02, 0x8b, 0x95, 0xa3, 0x09, 0x91, 0x81, 0xa2, + 0x09, 0x91, 0x79, 0xa1, 0x09, 0x91, 0x71, 0xa0, 0x09, 0x91, 0x69, 0x9f, + 0x09, 0x91, 0x61, 0x9e, 0x09, 0x91, 0x59, 0x9d, 0x09, 0x91, 0x50, 0xa6, + 0x09, 0x91, 0x49, 0xa5, 0x09, 0x91, 0x41, 0xa4, 0x09, 0x91, 0x39, 0xa3, + 0x09, 0x91, 0x31, 0xa2, 0x09, 0x91, 0x23, 0x02, 0x8b, 0x99, 0xa1, 0x09, + 0x91, 0x19, 0xa0, 0x09, 0x91, 0x11, 0x9f, 0x09, 0x91, 0x09, 0x9e, 0x09, + 0x91, 0x00, 0x9f, 0x09, 0x90, 0xf9, 0x9e, 0x09, 0x90, 0xf1, 0x9d, 0x09, + 0x90, 0xe8, 0xa6, 0x09, 0x90, 0xe1, 0xa5, 0x09, 0x90, 0xd9, 0xa4, 0x09, + 0x90, 0xcb, 0x02, 0x8b, 0x9d, 0xa3, 0x09, 0x90, 0xc1, 0xa2, 0x09, 0x90, + 0xb3, 0x02, 0x8b, 0xa1, 0xa1, 0x09, 0x90, 0xa3, 0x02, 0x8b, 0xa5, 0xa0, + 0x09, 0x90, 0x93, 0x02, 0x8b, 0xa9, 0x9f, 0x09, 0x90, 0x89, 0x9e, 0x09, + 0x90, 0x81, 0x9d, 0x09, 0x90, 0x78, 0xa6, 0x09, 0x90, 0x71, 0xa5, 0x09, + 0x90, 0x69, 0xa4, 0x09, 0x90, 0x61, 0xa3, 0x09, 0x90, 0x59, 0xa2, 0x09, + 0x90, 0x4b, 0x02, 0x8b, 0xad, 0xa1, 0x09, 0x90, 0x41, 0xa0, 0x09, 0x90, + 0x39, 0x9f, 0x09, 0x90, 0x31, 0x9e, 0x09, 0x90, 0x29, 0x9d, 0x09, 0x90, + 0x20, 0xa6, 0x09, 0x90, 0x19, 0xa5, 0x09, 0x90, 0x03, 0x02, 0x8b, 0xb1, + 0xa4, 0x09, 0x8f, 0xf9, 0xa3, 0x09, 0x8f, 0xf1, 0xa2, 0x09, 0x8f, 0xe9, + 0xa1, 0x09, 0x8f, 0xe1, 0xa0, 0x09, 0x8f, 0xd9, 0x9f, 0x09, 0x8f, 0xd1, + 0x9e, 0x09, 0x8f, 0xc9, 0x9d, 0x09, 0x8f, 0xc0, 0xa6, 0x09, 0x8f, 0xb9, + 0xa5, 0x09, 
0x8f, 0xb1, 0xa4, 0x09, 0x8f, 0xa9, 0xa3, 0x09, 0x8f, 0xa1, + 0xa2, 0x09, 0x8f, 0x99, 0xa1, 0x09, 0x8f, 0x91, 0xa0, 0x09, 0x8f, 0x89, + 0x9f, 0x09, 0x8f, 0x81, 0x9e, 0x09, 0x8f, 0x78, 0x83, 0x09, 0x8f, 0x50, + 0x84, 0x09, 0x8f, 0x11, 0x83, 0x09, 0x8f, 0x08, 0x83, 0x09, 0x8e, 0xf0, + 0x83, 0x09, 0x8e, 0xd0, 0x83, 0x09, 0x8e, 0xa8, 0x83, 0x09, 0x8e, 0x90, + 0x83, 0x09, 0x8e, 0x60, 0x83, 0x09, 0x8e, 0x50, 0x83, 0x09, 0x8e, 0x40, + 0x8a, 0x09, 0x8e, 0x21, 0x89, 0x09, 0x8e, 0x19, 0x88, 0x09, 0x8e, 0x11, + 0x87, 0x09, 0x8e, 0x09, 0x86, 0x09, 0x8e, 0x01, 0x85, 0x09, 0x8d, 0xf9, + 0x84, 0x09, 0x8d, 0xf1, 0x83, 0x09, 0x8d, 0xe8, 0x83, 0x09, 0x8d, 0xd0, + 0x83, 0x09, 0x8d, 0x90, 0x84, 0x09, 0x8d, 0x79, 0x83, 0x09, 0x8d, 0x70, + 0x83, 0x09, 0x8b, 0xa8, 0x83, 0x09, 0x8b, 0x90, 0x83, 0x09, 0x8b, 0x58, + 0x83, 0x09, 0x8b, 0x48, 0x83, 0x09, 0x8a, 0xf0, 0x83, 0x09, 0x8a, 0xb8, + 0x83, 0x09, 0x8a, 0x68, 0x84, 0x09, 0x8a, 0x41, 0x83, 0x09, 0x8a, 0x38, + 0x83, 0x09, 0x8a, 0x28, 0x85, 0x09, 0x89, 0xe1, 0x84, 0x09, 0x89, 0xd9, + 0x83, 0x09, 0x89, 0xd0, 0x83, 0x09, 0x89, 0xa8, 0x83, 0x09, 0x89, 0x98, + 0x83, 0x09, 0x89, 0x88, 0x83, 0x09, 0x89, 0x48, 0x83, 0x09, 0x89, 0x38, + 0x83, 0x09, 0x89, 0x00, 0x83, 0x09, 0x88, 0xa8, 0x83, 0x09, 0x88, 0x60, + 0x83, 0x09, 0x87, 0xf8, 0x8a, 0x09, 0x86, 0x89, 0x89, 0x09, 0x86, 0x81, + 0x88, 0x09, 0x86, 0x79, 0x87, 0x09, 0x86, 0x71, 0x86, 0x09, 0x86, 0x69, + 0x85, 0x09, 0x86, 0x61, 0x84, 0x09, 0x86, 0x59, 0x83, 0x09, 0x86, 0x50, + 0x83, 0x09, 0x85, 0xe0, 0x83, 0x09, 0x85, 0xc8, 0x8b, 0x09, 0x85, 0xb1, + 0x8a, 0x09, 0x85, 0xa9, 0x89, 0x09, 0x85, 0xa1, 0x88, 0x09, 0x85, 0x99, + 0x87, 0x09, 0x85, 0x91, 0x86, 0x09, 0x85, 0x89, 0x85, 0x09, 0x85, 0x81, + 0x84, 0x09, 0x85, 0x79, 0x83, 0x09, 0x85, 0x70, 0x83, 0x09, 0x85, 0x58, + 0x83, 0x09, 0x85, 0x40, 0x83, 0x09, 0x84, 0xd8, 0x83, 0x09, 0x84, 0xb8, + 0x83, 0x09, 0x84, 0x90, 0x83, 0x09, 0x83, 0xf0, 0x83, 0x09, 0x83, 0x38, + 0x85, 0x09, 0x82, 0xf1, 0x84, 0x09, 0x82, 0xe9, 0x83, 0x09, 0x82, 0xe0, + 0xc6, 0x02, 0xd1, 0x0f, 0xbc, 0x49, 0xc6, 0x0b, 0x09, 0x0f, 0xbc, 0x98, + 0xc6, 0x13, 0x52, 0x0f, 0xbd, 0x71, 0xd2, 0x4d, 0x57, 0x0f, 0xbd, 0xd0, + 0x45, 0x56, 0x42, 0x42, 0x8b, 0xb9, 0x83, 0x00, 0x95, 0x03, 0x02, 0x8b, + 0xe9, 0x97, 0x00, 0x95, 0x09, 0x8b, 0x00, 0x95, 0x11, 0x87, 0x00, 0x95, + 0x2b, 0x02, 0x8b, 0xed, 0x91, 0x00, 0x95, 0x33, 0x02, 0x8b, 0xf1, 0xc2, + 0x01, 0x4a, 0x00, 0x95, 0x38, 0x83, 0x00, 0x98, 0x58, 0x87, 0x00, 0x98, + 0x60, 0x83, 0x00, 0x98, 0x78, 0x83, 0x00, 0x98, 0x83, 0x02, 0x8b, 0xf5, + 0x8b, 0x00, 0x98, 0x91, 0x87, 0x00, 0x98, 0xaa, 0x02, 0x8b, 0xf9, 0x83, + 0x00, 0x98, 0xc3, 0x02, 0x8b, 0xfd, 0x97, 0x00, 0x98, 0xc9, 0x8b, 0x00, + 0x98, 0xd1, 0x87, 0x00, 0x98, 0xeb, 0x02, 0x8c, 0x01, 0x91, 0x00, 0x98, + 0xf1, 0x19, 0x42, 0x8c, 0x05, 0x83, 0x01, 0x6e, 0xc3, 0x02, 0x8c, 0x17, + 0x97, 0x01, 0x6e, 0xc9, 0x8b, 0x01, 0x6e, 0xd1, 0x87, 0x01, 0x6e, 0xeb, + 0x02, 0x8c, 0x1b, 0x91, 0x01, 0x6e, 0xf0, 0x19, 0xc2, 0x8c, 0x1f, 0x1b, + 0xc2, 0x8c, 0x2e, 0x83, 0x00, 0x90, 0x83, 0x02, 0x8c, 0x48, 0x97, 0x00, + 0x90, 0x89, 0x8b, 0x00, 0x90, 0x91, 0x87, 0x00, 0x90, 0xab, 0x02, 0x8c, + 0x4c, 0x91, 0x00, 0x90, 0xb0, 0x83, 0x00, 0x90, 0x18, 0x87, 0x00, 0x90, + 0x20, 0x83, 0x00, 0x90, 0x38, 0x91, 0x05, 0x59, 0x71, 0x87, 0x05, 0x59, + 0x6b, 0x02, 0x8c, 0x50, 0x83, 0x05, 0x59, 0x43, 0x02, 0x8c, 0x54, 0x8b, + 0x05, 0x59, 0x51, 0x97, 0x05, 0x59, 0x48, 0x83, 0x00, 0x93, 0x18, 0x87, + 0x00, 0x93, 0x20, 0x83, 0x01, 0x6c, 0x28, 0x83, 0x00, 0x93, 0x39, 0x8b, + 0x00, 0x9c, 0x29, 0x87, 0x00, 0x9c, 0x3a, 0x02, 0x8c, 0x58, 0x0a, 0xc2, + 0x8c, 0x5c, 
0x83, 0x01, 0x6d, 0x43, 0x02, 0x8c, 0x7a, 0x97, 0x01, 0x6d, + 0x49, 0x8b, 0x01, 0x6d, 0x51, 0x87, 0x01, 0x6d, 0x6b, 0x02, 0x8c, 0x7e, + 0x91, 0x01, 0x6d, 0x70, 0x83, 0x00, 0x93, 0xd8, 0x87, 0x00, 0x93, 0xe0, + 0x83, 0x01, 0x6c, 0x38, 0x83, 0x00, 0x99, 0x43, 0x02, 0x8c, 0x82, 0x97, + 0x00, 0x99, 0x49, 0x8b, 0x00, 0x99, 0x51, 0x87, 0x00, 0x99, 0x6b, 0x02, + 0x8c, 0x86, 0x91, 0x00, 0x99, 0x73, 0x02, 0x8c, 0x8a, 0xc2, 0x01, 0x4a, + 0x00, 0x99, 0x78, 0x91, 0x05, 0x58, 0xb1, 0x87, 0x05, 0x58, 0xab, 0x02, + 0x8c, 0x8e, 0xc2, 0x04, 0xc6, 0x05, 0x58, 0x99, 0x8b, 0x05, 0x58, 0x91, + 0x97, 0x05, 0x58, 0x88, 0x0a, 0xc2, 0x8c, 0x92, 0x83, 0x00, 0x97, 0xc3, + 0x02, 0x8c, 0xab, 0x97, 0x00, 0x97, 0xc9, 0x8b, 0x00, 0x97, 0xd1, 0x87, + 0x00, 0x97, 0xeb, 0x02, 0x8c, 0xaf, 0x91, 0x00, 0x97, 0xf3, 0x02, 0x8c, + 0xb3, 0xc2, 0x01, 0x4a, 0x00, 0x97, 0xf8, 0x83, 0x00, 0x97, 0x98, 0x87, + 0x00, 0x97, 0xa0, 0x83, 0x01, 0x6c, 0x60, 0x91, 0x05, 0x58, 0x31, 0x87, + 0x05, 0x58, 0x2b, 0x02, 0x8c, 0xb7, 0xc2, 0x04, 0xc6, 0x05, 0x58, 0x19, + 0x8b, 0x05, 0x58, 0x11, 0x97, 0x05, 0x58, 0x08, 0x83, 0x00, 0x93, 0x98, + 0x87, 0x00, 0x93, 0xa0, 0x83, 0x01, 0x6c, 0x30, 0x83, 0x00, 0x99, 0x03, + 0x02, 0x8c, 0xbb, 0x97, 0x00, 0x99, 0x09, 0x8b, 0x00, 0x99, 0x11, 0x87, + 0x00, 0x99, 0x2b, 0x02, 0x8c, 0xbf, 0x91, 0x00, 0x99, 0x33, 0x02, 0x8c, + 0xc3, 0xc2, 0x01, 0x4a, 0x00, 0x99, 0x38, 0x83, 0x00, 0x99, 0xc3, 0x02, + 0x8c, 0xc7, 0x97, 0x00, 0x99, 0xc9, 0x8b, 0x00, 0x99, 0xd1, 0x87, 0x00, + 0x99, 0xeb, 0x02, 0x8c, 0xcb, 0x91, 0x00, 0x99, 0xf1, 0xc2, 0x01, 0x4a, + 0x00, 0x99, 0xf8, 0x83, 0x00, 0x9a, 0x03, 0x02, 0x8c, 0xcf, 0x97, 0x00, + 0x9a, 0x09, 0x8b, 0x00, 0x9a, 0x11, 0x87, 0x00, 0x9a, 0x2b, 0x02, 0x8c, + 0xd3, 0x91, 0x00, 0x9a, 0x32, 0x02, 0x8c, 0xd7, 0x83, 0x00, 0x90, 0x58, + 0x87, 0x00, 0x90, 0x60, 0x83, 0x01, 0x6c, 0x00, 0x83, 0x00, 0x90, 0xd8, + 0x87, 0x00, 0x90, 0xe0, 0x83, 0x01, 0x6c, 0x08, 0x83, 0x00, 0x90, 0xf9, + 0x8b, 0x00, 0x9c, 0x09, 0x87, 0x00, 0x9c, 0x1a, 0x02, 0x8c, 0xdb, 0x83, + 0x00, 0x91, 0x03, 0x02, 0x8c, 0xdf, 0x97, 0x00, 0x91, 0x09, 0x8b, 0x00, + 0x91, 0x11, 0x87, 0x00, 0x91, 0x2b, 0x02, 0x8c, 0xe3, 0x91, 0x00, 0x91, + 0x31, 0xc2, 0x01, 0x4a, 0x00, 0x91, 0x38, 0x83, 0x00, 0x91, 0x98, 0x87, + 0x00, 0x91, 0xa1, 0x48, 0xbd, 0x4a, 0x42, 0x8c, 0xe7, 0x83, 0x01, 0x6c, + 0x18, 0x83, 0x00, 0x91, 0xc3, 0x02, 0x8c, 0xff, 0x97, 0x00, 0x91, 0xc9, + 0x8b, 0x00, 0x91, 0xd1, 0x87, 0x00, 0x91, 0xeb, 0x02, 0x8d, 0x03, 0x91, + 0x00, 0x91, 0xf3, 0x02, 0x8d, 0x07, 0xc2, 0x01, 0x4a, 0x00, 0x91, 0xf8, + 0x83, 0x01, 0x6d, 0x03, 0x02, 0x8d, 0x0b, 0x97, 0x01, 0x6d, 0x09, 0x8b, + 0x01, 0x6d, 0x11, 0x87, 0x01, 0x6d, 0x2b, 0x02, 0x8d, 0x0f, 0x91, 0x01, + 0x6d, 0x30, 0x83, 0x00, 0x91, 0x58, 0x87, 0x00, 0x91, 0x60, 0x83, 0x01, + 0x6c, 0x10, 0x83, 0x00, 0x92, 0x18, 0x87, 0x00, 0x92, 0x20, 0x83, 0x00, + 0x92, 0x38, 0x83, 0x00, 0x92, 0x43, 0x02, 0x8d, 0x13, 0x8b, 0x00, 0x92, + 0x51, 0x87, 0x00, 0x92, 0x6a, 0x02, 0x8d, 0x17, 0x83, 0x00, 0x92, 0x83, + 0x02, 0x8d, 0x1b, 0x97, 0x00, 0x92, 0x89, 0x8b, 0x00, 0x92, 0x91, 0x87, + 0x00, 0x92, 0xab, 0x02, 0x8d, 0x1f, 0x91, 0x00, 0x92, 0xb1, 0x19, 0x42, + 0x8d, 0x23, 0x83, 0x01, 0x6e, 0x03, 0x02, 0x8d, 0x35, 0x97, 0x01, 0x6e, + 0x09, 0x8b, 0x01, 0x6e, 0x11, 0x87, 0x01, 0x6e, 0x2b, 0x02, 0x8d, 0x39, + 0x91, 0x01, 0x6e, 0x30, 0x83, 0x00, 0x93, 0x58, 0x87, 0x00, 0x93, 0x60, + 0x83, 0x00, 0x94, 0x18, 0x87, 0x00, 0x94, 0x20, 0x83, 0x00, 0x94, 0x38, + 0x83, 0x00, 0x94, 0x43, 0x02, 0x8d, 0x3d, 0x8b, 0x00, 0x94, 0x51, 0x87, + 0x00, 0x94, 0x6a, 0x02, 0x8d, 0x41, 0x83, 0x01, 0x6e, 0x83, 0x02, 0x8d, + 0x45, 0x97, 
0x01, 0x6e, 0x89, 0x8b, 0x01, 0x6e, 0x91, 0x87, 0x01, 0x6e, + 0xab, 0x02, 0x8d, 0x49, 0x91, 0x01, 0x6e, 0xb0, 0x83, 0x00, 0x94, 0x98, + 0x87, 0x00, 0x94, 0xa0, 0x83, 0x01, 0x6c, 0x40, 0x83, 0x00, 0x94, 0xc3, + 0x02, 0x8d, 0x4d, 0x97, 0x00, 0x94, 0xc9, 0x8b, 0x00, 0x94, 0xd1, 0x87, + 0x00, 0x94, 0xeb, 0x02, 0x8d, 0x51, 0x91, 0x00, 0x94, 0xf3, 0x02, 0x8d, + 0x55, 0xc2, 0x01, 0x4a, 0x00, 0x94, 0xf8, 0x83, 0x00, 0x95, 0x58, 0x87, + 0x00, 0x95, 0x60, 0x83, 0x00, 0x95, 0x78, 0x83, 0x00, 0x95, 0x83, 0x02, + 0x8d, 0x59, 0x8b, 0x00, 0x95, 0x91, 0x87, 0x00, 0x95, 0xaa, 0x02, 0x8d, + 0x5d, 0x83, 0x00, 0x95, 0xc3, 0x02, 0x8d, 0x61, 0x97, 0x00, 0x95, 0xc9, + 0x8b, 0x00, 0x95, 0xd1, 0x87, 0x00, 0x95, 0xeb, 0x02, 0x8d, 0x65, 0x91, + 0x00, 0x95, 0xf1, 0x19, 0x42, 0x8d, 0x69, 0x83, 0x01, 0x6e, 0x43, 0x02, + 0x8d, 0x7b, 0x97, 0x01, 0x6e, 0x49, 0x8b, 0x01, 0x6e, 0x51, 0x87, 0x01, + 0x6e, 0x6b, 0x02, 0x8d, 0x7f, 0x91, 0x01, 0x6e, 0x70, 0x83, 0x00, 0x96, + 0x58, 0x87, 0x00, 0x96, 0x60, 0x83, 0x00, 0x96, 0x78, 0x83, 0x00, 0x99, + 0x83, 0x02, 0x8d, 0x83, 0x97, 0x00, 0x99, 0x89, 0x8b, 0x00, 0x99, 0x91, + 0x87, 0x00, 0x99, 0xab, 0x02, 0x8d, 0x8d, 0x91, 0x00, 0x99, 0xb3, 0x02, + 0x8d, 0x91, 0xc2, 0x01, 0x4a, 0x00, 0x99, 0xb8, 0x83, 0x00, 0x9a, 0x98, + 0x87, 0x00, 0x9a, 0xa0, 0x83, 0x01, 0x6c, 0x90, 0x83, 0x00, 0x9a, 0xb9, + 0x8b, 0x00, 0x9c, 0x69, 0x87, 0x00, 0x9c, 0x7a, 0x02, 0x8d, 0x95, 0x83, + 0x00, 0x96, 0xd8, 0x87, 0x00, 0x96, 0xe0, 0x83, 0x01, 0x6c, 0x58, 0x83, + 0x00, 0x97, 0x03, 0x02, 0x8d, 0x99, 0x97, 0x00, 0x97, 0x09, 0x8b, 0x00, + 0x97, 0x11, 0x87, 0x00, 0x97, 0x2b, 0x02, 0x8d, 0x9d, 0x91, 0x00, 0x97, + 0x31, 0xc2, 0x01, 0x4a, 0x00, 0x97, 0x38, 0x83, 0x01, 0x6d, 0x83, 0x02, + 0x8d, 0xa1, 0x97, 0x01, 0x6d, 0x89, 0x8b, 0x01, 0x6d, 0x91, 0x87, 0x01, + 0x6d, 0xab, 0x02, 0x8d, 0xa5, 0x91, 0x01, 0x6d, 0xb0, 0x83, 0x00, 0x97, + 0x58, 0x87, 0x00, 0x97, 0x60, 0x83, 0x00, 0x97, 0x78, 0x83, 0x00, 0x98, + 0x18, 0x87, 0x00, 0x98, 0x20, 0x83, 0x01, 0x6c, 0x70, 0x83, 0x00, 0x9a, + 0x58, 0x87, 0x00, 0x9a, 0x60, 0x83, 0x00, 0x9a, 0x79, 0x8b, 0x00, 0x9c, + 0x49, 0x87, 0x00, 0x9c, 0x5a, 0x02, 0x8d, 0xa9, 0xd5, 0x36, 0x47, 0x00, + 0x9a, 0xe9, 0xc4, 0x01, 0xc3, 0x00, 0x9a, 0xf8, 0xc7, 0x09, 0x0d, 0x01, + 0x3e, 0x91, 0xc9, 0x03, 0xc8, 0x01, 0x56, 0xc8, 0xd6, 0x2d, 0xba, 0x01, + 0x17, 0xc9, 0xc8, 0x52, 0x09, 0x01, 0x17, 0xc1, 0xc7, 0x80, 0x70, 0x01, + 0x17, 0xb1, 0xc9, 0x16, 0x14, 0x01, 0x17, 0xa9, 0x48, 0x00, 0x5f, 0xc2, + 0x8d, 0xad, 0xd6, 0x2c, 0x86, 0x01, 0x17, 0x90, 0xc3, 0x77, 0x79, 0x08, + 0x7f, 0x89, 0xc4, 0xdc, 0x2d, 0x08, 0x7f, 0x70, 0xc6, 0x06, 0xe1, 0x00, + 0x00, 0xb8, 0xc8, 0xb7, 0x72, 0x01, 0x16, 0xf9, 0xc8, 0xbf, 0x4a, 0x01, + 0x16, 0xf1, 0xcc, 0x07, 0xc7, 0x01, 0x16, 0xe9, 0xc9, 0x00, 0xca, 0x01, + 0x16, 0xe0, 0x03, 0xc2, 0x8d, 0xb3, 0x45, 0x00, 0x8c, 0x42, 0x8d, 0xc2, + 0x97, 0x08, 0xec, 0xa1, 0x8b, 0x08, 0xec, 0x89, 0x83, 0x08, 0xec, 0x50, + 0x97, 0x08, 0xec, 0x70, 0x8b, 0x08, 0xec, 0x60, 0xc2, 0x00, 0xd0, 0x08, + 0xec, 0x19, 0x83, 0x08, 0xec, 0x10, 0xc2, 0x00, 0xd0, 0x08, 0xeb, 0xf1, + 0x83, 0x08, 0xeb, 0xe8, 0x83, 0x00, 0x50, 0xb1, 0xc2, 0x00, 0xd0, 0x00, + 0x52, 0xc8, 0x83, 0x00, 0x50, 0xc1, 0xc2, 0x00, 0xd0, 0x00, 0x52, 0xd0, + 0x83, 0x00, 0x50, 0xf9, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0x00, 0x83, 0x00, + 0x51, 0x09, 0xc2, 0x00, 0xd0, 0x00, 0x51, 0x10, 0x94, 0x00, 0x54, 0x5b, + 0x02, 0x8d, 0xd8, 0x8e, 0x00, 0x54, 0x62, 0x02, 0x8d, 0xdc, 0x83, 0x00, + 0x54, 0xf9, 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x00, 0x83, 0x00, 0x55, 0x09, + 0xc2, 0x00, 0xd0, 0x00, 0x55, 0x10, 0x83, 0x00, 0x55, 0xf1, 0x8b, 0x00, + 0x56, 0x41, 
0x97, 0x00, 0x56, 0x60, 0x8b, 0x00, 0x56, 0x00, 0x97, 0x00, + 0x56, 0x10, 0x94, 0x00, 0x56, 0x1b, 0x02, 0x8d, 0xe0, 0x8e, 0x00, 0x57, + 0x12, 0x02, 0x8d, 0xe4, 0x87, 0x00, 0x56, 0x29, 0x91, 0x00, 0x56, 0x48, + 0xcd, 0x7c, 0xb5, 0x0e, 0x92, 0x29, 0xcc, 0x8c, 0x0d, 0x08, 0x0c, 0x08, + 0x5b, 0x17, 0x97, 0xc2, 0x8d, 0xe8, 0xcc, 0x81, 0x21, 0x08, 0x0c, 0x68, + 0x55, 0x37, 0xd6, 0xc2, 0x8e, 0x10, 0xc4, 0x28, 0x48, 0x00, 0xff, 0x78, + 0xc4, 0x59, 0x13, 0x00, 0xff, 0xf3, 0x02, 0x8e, 0x3d, 0x49, 0x63, 0xd3, + 0xc2, 0x8e, 0x43, 0xcb, 0x9a, 0x26, 0x08, 0x0b, 0xd8, 0xc3, 0x46, 0x46, + 0x00, 0xff, 0xe9, 0x43, 0x02, 0x6f, 0xc2, 0x8e, 0x4f, 0xc8, 0xb6, 0xa2, + 0x08, 0x0b, 0xe1, 0xca, 0xa4, 0xf4, 0x08, 0x0c, 0x20, 0x0e, 0xc2, 0x8e, + 0x5e, 0xca, 0x9c, 0x84, 0x00, 0x1e, 0x79, 0xcc, 0x89, 0xfd, 0x00, 0x1f, + 0xa1, 0x49, 0x11, 0x74, 0xc2, 0x8e, 0x6a, 0xda, 0x1a, 0x64, 0x00, 0x1f, + 0xf0, 0x45, 0x03, 0x14, 0xc2, 0x8e, 0x76, 0x56, 0x2c, 0xde, 0xc2, 0x8e, + 0x88, 0xcc, 0x86, 0x49, 0x08, 0x0c, 0x61, 0xcd, 0x79, 0xdd, 0x08, 0x0d, + 0x00, 0xc4, 0x7a, 0x04, 0x00, 0xfd, 0xfb, 0x02, 0x8e, 0xa6, 0xca, 0x94, + 0x91, 0x00, 0xfe, 0x01, 0xcd, 0x42, 0x94, 0x00, 0xfd, 0xf1, 0xc8, 0x9c, + 0x0e, 0x00, 0x1e, 0xb1, 0xc9, 0xaa, 0x9e, 0x00, 0x1e, 0xa8, 0xc6, 0x57, + 0xbc, 0x00, 0xfd, 0xe9, 0x03, 0xc2, 0x8e, 0xac, 0xd0, 0x5b, 0xe2, 0x08, + 0x0c, 0x10, 0x46, 0x02, 0x0f, 0xc2, 0x8e, 0xb8, 0xd1, 0x56, 0x95, 0x00, + 0x1b, 0xa9, 0x46, 0x10, 0x38, 0xc2, 0x8e, 0xd4, 0xc9, 0xab, 0x76, 0x08, + 0x0c, 0x18, 0xcc, 0x4a, 0x69, 0x00, 0x1b, 0xd1, 0xc8, 0xab, 0xe3, 0x08, + 0x0b, 0xc8, 0xc4, 0x63, 0xdd, 0x00, 0x1c, 0x21, 0x0a, 0xc2, 0x8e, 0xe0, + 0x43, 0x02, 0xa0, 0xc2, 0x8e, 0xec, 0xca, 0xa1, 0xf2, 0x08, 0x0b, 0xd1, + 0xd1, 0x54, 0x53, 0x08, 0x0c, 0x48, 0xc9, 0xae, 0x10, 0x00, 0x1c, 0x39, + 0x4a, 0xa2, 0x9c, 0xc2, 0x8e, 0xf8, 0x14, 0x42, 0x8f, 0x2a, 0x43, 0x60, + 0xe8, 0xc2, 0x8f, 0x36, 0xdd, 0x10, 0x2f, 0x00, 0x1f, 0xb0, 0xce, 0x71, + 0x84, 0x08, 0x0b, 0xf9, 0xce, 0x72, 0x64, 0x08, 0x0c, 0x00, 0xcb, 0x20, + 0xb6, 0x00, 0x1e, 0x91, 0xd5, 0x31, 0xee, 0x00, 0x1e, 0x99, 0xd9, 0x20, + 0xa8, 0x00, 0x1e, 0xa0, 0xca, 0x37, 0x4e, 0x01, 0x17, 0x39, 0xc5, 0x07, + 0x62, 0x01, 0x13, 0x48, 0xc9, 0x00, 0xca, 0x01, 0x13, 0xb9, 0x43, 0x00, + 0xe2, 0xc2, 0x8f, 0x42, 0xd0, 0x5a, 0x92, 0x01, 0x53, 0xf3, 0x02, 0x8f, + 0x4e, 0xcb, 0x1a, 0x1a, 0x01, 0x54, 0x30, 0xc9, 0x07, 0x5e, 0x01, 0x13, + 0x39, 0xd1, 0x51, 0x01, 0x01, 0x55, 0x20, 0xd0, 0x03, 0xb7, 0x01, 0x4b, + 0xc1, 0x06, 0xc2, 0x8f, 0x54, 0x15, 0xc2, 0x8f, 0x5a, 0x0e, 0x42, 0x8f, + 0x66, 0xd8, 0x24, 0x3b, 0x01, 0x54, 0x41, 0xcf, 0x62, 0xb5, 0x01, 0x54, + 0x50, 0x8e, 0x08, 0x9b, 0x13, 0x02, 0x8f, 0x6c, 0x94, 0x08, 0x9a, 0x1a, + 0x02, 0x8f, 0x70, 0x97, 0x08, 0x9a, 0x61, 0x8b, 0x08, 0x9a, 0x41, 0x83, + 0x08, 0x99, 0xf0, 0x97, 0x08, 0x9a, 0x10, 0x8b, 0x08, 0x9a, 0x00, 0x47, + 0xb2, 0x2e, 0xc2, 0x8f, 0x74, 0x45, 0x04, 0xaf, 0xc2, 0x8f, 0x82, 0x83, + 0x08, 0x99, 0xa8, 0x83, 0x08, 0x99, 0xc1, 0xc2, 0x0d, 0xf6, 0x08, 0x99, + 0xb9, 0xc2, 0x00, 0xd0, 0x08, 0x99, 0xb0, 0xc2, 0x00, 0xdb, 0x08, 0x99, + 0x99, 0x83, 0x08, 0x99, 0x90, 0xc2, 0x00, 0xd0, 0x08, 0x99, 0x69, 0x83, + 0x08, 0x99, 0x60, 0xc2, 0x00, 0xd0, 0x08, 0x99, 0x59, 0x83, 0x08, 0x99, + 0x50, 0xc2, 0x00, 0xd0, 0x08, 0x99, 0x39, 0x83, 0x08, 0x99, 0x31, 0x06, + 0x42, 0x8f, 0x8e, 0xc2, 0x00, 0xd0, 0x08, 0x99, 0x29, 0x16, 0xc2, 0x8f, + 0x98, 0x83, 0x08, 0x99, 0x20, 0xc2, 0x19, 0x2c, 0x08, 0x98, 0xf1, 0xc2, + 0x01, 0x30, 0x08, 0x98, 0xc9, 0xc2, 0x00, 0xc1, 0x08, 0x99, 0x19, 0x83, + 0x08, 0x99, 0x40, 0xc2, 0x00, 0xd0, 0x08, 0x98, 0xe9, 0x83, 0x08, 0x98, + 0xe0, 0xc2, 
0x00, 0xd0, 0x08, 0x98, 0xd9, 0x83, 0x08, 0x98, 0xd0, 0xc2, + 0x00, 0xd0, 0x08, 0x98, 0xc1, 0x83, 0x08, 0x98, 0xb8, 0xc2, 0x00, 0xd0, + 0x08, 0x98, 0xb1, 0x83, 0x08, 0x98, 0xa8, 0x97, 0x08, 0x98, 0xa1, 0x8b, + 0x08, 0x98, 0x81, 0x83, 0x08, 0x98, 0x30, 0x97, 0x08, 0x98, 0x50, 0x8b, + 0x08, 0x98, 0x40, 0xc4, 0x1e, 0x97, 0x08, 0x9a, 0x69, 0xc5, 0x40, 0xe7, + 0x08, 0x98, 0x18, 0xc7, 0x7a, 0x7f, 0x08, 0x99, 0xe9, 0xc7, 0x14, 0x39, + 0x08, 0x98, 0x10, 0xca, 0x1e, 0x8a, 0x08, 0x98, 0x09, 0xd7, 0x29, 0x29, + 0x08, 0x98, 0x00, 0x15, 0xc2, 0x8f, 0xa2, 0xdb, 0x17, 0x10, 0x0f, 0xc9, + 0x50, 0xc9, 0xb1, 0xee, 0x00, 0xe5, 0xf9, 0x95, 0x00, 0xe4, 0xd0, 0x03, + 0xc2, 0x8f, 0xae, 0xc2, 0x49, 0x0c, 0x00, 0xe5, 0xa9, 0xc2, 0x02, 0x0a, + 0x00, 0xe5, 0x91, 0x87, 0x00, 0xe5, 0x88, 0xc2, 0x00, 0xc4, 0x00, 0xe5, + 0xe9, 0xc2, 0x00, 0x74, 0x00, 0xe5, 0xd1, 0x90, 0x00, 0xe4, 0x80, 0xc9, + 0xb1, 0x04, 0x00, 0xe5, 0xc9, 0x03, 0x42, 0x8f, 0xb9, 0xc4, 0x8c, 0x72, + 0x00, 0xe5, 0xc1, 0x90, 0x00, 0xe4, 0xa0, 0xc3, 0x00, 0xd0, 0x00, 0xe5, + 0x79, 0xc2, 0x00, 0x71, 0x00, 0xe5, 0x58, 0x0a, 0xc2, 0x8f, 0xc1, 0xc2, + 0x00, 0x71, 0x00, 0xe5, 0x61, 0xc2, 0x00, 0xd1, 0x00, 0xe5, 0x50, 0xc3, + 0x11, 0xef, 0x00, 0xe5, 0x41, 0xc2, 0x00, 0xd1, 0x00, 0xe5, 0x08, 0xc3, + 0x00, 0xd0, 0x00, 0xe5, 0x31, 0xc2, 0x00, 0xd1, 0x00, 0xe4, 0x90, 0xc3, + 0x01, 0x50, 0x00, 0xe5, 0x29, 0xc2, 0x00, 0xd1, 0x00, 0xe4, 0xc8, 0xc3, + 0x01, 0x50, 0x00, 0xe5, 0x21, 0xc2, 0x00, 0xb1, 0x00, 0xe4, 0xf0, 0xc3, + 0x01, 0x50, 0x00, 0xe4, 0xf9, 0xc2, 0x00, 0xc4, 0x00, 0xe4, 0xb0, 0x90, + 0x00, 0x85, 0x01, 0xc2, 0x00, 0xc4, 0x00, 0x86, 0x68, 0xc2, 0x00, 0xd1, + 0x00, 0x85, 0x11, 0xc3, 0x00, 0xd0, 0x00, 0x85, 0xb0, 0xc2, 0x00, 0xc4, + 0x00, 0x85, 0x31, 0xc3, 0x01, 0x50, 0x00, 0x85, 0x78, 0x90, 0x00, 0x85, + 0x39, 0x94, 0x00, 0x85, 0x90, 0xc2, 0x00, 0xd1, 0x00, 0x85, 0x49, 0xc3, + 0x01, 0x50, 0x00, 0x85, 0xa8, 0xc2, 0x00, 0xb1, 0x00, 0x85, 0x71, 0xc3, + 0x01, 0x50, 0x00, 0x85, 0xa0, 0xc2, 0x00, 0xd1, 0x00, 0x85, 0x89, 0xc3, + 0x11, 0xef, 0x00, 0x85, 0xc0, 0x0a, 0xc2, 0x8f, 0xcd, 0xc2, 0x00, 0xd1, + 0x00, 0x85, 0xd1, 0xc2, 0x00, 0x71, 0x00, 0x85, 0xe0, 0xc2, 0x00, 0x71, + 0x00, 0x85, 0xd9, 0xc3, 0x00, 0xd0, 0x00, 0x85, 0xf8, 0x03, 0xc2, 0x8f, + 0xd9, 0x87, 0x00, 0x86, 0x09, 0xc2, 0x02, 0x0a, 0x00, 0x86, 0x11, 0xc2, + 0x49, 0x0c, 0x00, 0x86, 0x28, 0x90, 0x00, 0x86, 0x81, 0xc2, 0x00, 0x74, + 0x00, 0x87, 0xd1, 0xc2, 0x00, 0xc4, 0x00, 0x87, 0xe8, 0xc2, 0x00, 0xd1, + 0x00, 0x86, 0x91, 0xc3, 0x00, 0xd0, 0x00, 0x87, 0x30, 0x90, 0x00, 0x86, + 0xa1, 0xc4, 0x8c, 0x72, 0x00, 0x87, 0xc0, 0xc2, 0x00, 0xc4, 0x00, 0x86, + 0xb1, 0xc3, 0x01, 0x50, 0x00, 0x86, 0xf8, 0x03, 0xc2, 0x8f, 0xe1, 0xc9, + 0xb1, 0x04, 0x00, 0x87, 0xc8, 0xc2, 0x00, 0xd1, 0x00, 0x86, 0xc9, 0xc3, + 0x01, 0x50, 0x00, 0x87, 0x28, 0x95, 0x00, 0x86, 0xd1, 0xc9, 0xb1, 0xee, + 0x00, 0x87, 0xf8, 0xc2, 0x00, 0xb1, 0x00, 0x86, 0xf1, 0xc3, 0x01, 0x50, + 0x00, 0x87, 0x20, 0xc2, 0x00, 0xd1, 0x00, 0x87, 0x09, 0xc3, 0x11, 0xef, + 0x00, 0x87, 0x40, 0x0a, 0xc2, 0x8f, 0xe9, 0xc2, 0x00, 0xd1, 0x00, 0x87, + 0x51, 0xc2, 0x00, 0x71, 0x00, 0x87, 0x60, 0xc2, 0x00, 0x71, 0x00, 0x87, + 0x59, 0xc3, 0x00, 0xd0, 0x00, 0x87, 0x78, 0x03, 0xc2, 0x8f, 0xf5, 0x87, + 0x00, 0x87, 0x89, 0xc2, 0x02, 0x0a, 0x00, 0x87, 0x91, 0xc2, 0x49, 0x0c, + 0x00, 0x87, 0xa8, 0x90, 0x01, 0x68, 0x01, 0xc2, 0x00, 0xc4, 0x01, 0x69, + 0x68, 0xc2, 0x00, 0xd1, 0x01, 0x68, 0x11, 0xc3, 0x00, 0xd0, 0x01, 0x68, + 0xb0, 0xc2, 0x00, 0xc4, 0x01, 0x68, 0x31, 0xc3, 0x01, 0x50, 0x01, 0x68, + 0x78, 0x90, 0x01, 0x68, 0x39, 0x94, 0x01, 0x68, 0x90, 0xc2, 0x00, 0xd1, + 0x01, 0x68, 
0x49, 0xc3, 0x01, 0x50, 0x01, 0x68, 0xa8, 0xc2, 0x00, 0xb1, + 0x01, 0x68, 0x71, 0xc3, 0x01, 0x50, 0x01, 0x68, 0xa0, 0xc2, 0x00, 0xd1, + 0x01, 0x68, 0x89, 0xc3, 0x11, 0xef, 0x01, 0x68, 0xc0, 0x0a, 0xc2, 0x90, + 0x00, 0xc2, 0x00, 0xd1, 0x01, 0x68, 0xd1, 0xc2, 0x00, 0x71, 0x01, 0x68, + 0xe0, 0xc2, 0x00, 0x71, 0x01, 0x68, 0xd9, 0xc3, 0x00, 0xd0, 0x01, 0x68, + 0xf8, 0x03, 0xc2, 0x90, 0x0c, 0x87, 0x01, 0x69, 0x09, 0xc2, 0x02, 0x0a, + 0x01, 0x69, 0x11, 0xc2, 0x49, 0x0c, 0x01, 0x69, 0x28, 0xc3, 0xc8, 0x92, + 0x01, 0x60, 0x09, 0xc6, 0xc8, 0x01, 0x01, 0x61, 0x40, 0xc4, 0xe4, 0x2b, + 0x01, 0x60, 0x21, 0xc4, 0xdf, 0x03, 0x01, 0x60, 0x39, 0xc5, 0xdd, 0xb2, + 0x01, 0x60, 0x60, 0x07, 0xc2, 0x90, 0x14, 0xc3, 0x01, 0xbd, 0x01, 0x61, + 0x09, 0x97, 0x01, 0x61, 0x19, 0x91, 0x01, 0x61, 0x30, 0xc6, 0xd3, 0x73, + 0x01, 0x60, 0x31, 0xc5, 0xda, 0x24, 0x01, 0x60, 0x40, 0x42, 0x25, 0xa1, + 0xc2, 0x90, 0x1e, 0xcb, 0x98, 0x00, 0x01, 0x60, 0x51, 0x47, 0x1c, 0xa0, + 0x42, 0x90, 0x28, 0xc6, 0xc4, 0x49, 0x01, 0x60, 0x71, 0xcf, 0x60, 0xd5, + 0x01, 0x61, 0x70, 0xc2, 0x06, 0xc6, 0x01, 0x60, 0x89, 0xc2, 0x00, 0x16, + 0x01, 0x60, 0xc8, 0xc5, 0xcb, 0xee, 0x01, 0x60, 0x91, 0x87, 0x01, 0x60, + 0xd0, 0xc4, 0xe4, 0xc7, 0x01, 0x60, 0xa1, 0x0a, 0xc2, 0x90, 0x34, 0xc9, + 0xae, 0x19, 0x01, 0x61, 0x11, 0xc8, 0xae, 0x6b, 0x01, 0x61, 0x22, 0x02, + 0x90, 0x41, 0xc5, 0xd9, 0x52, 0x01, 0x60, 0xa9, 0xc2, 0x00, 0xba, 0x01, + 0x60, 0xe1, 0xcb, 0x97, 0x92, 0x01, 0x61, 0x68, 0xc4, 0xac, 0x24, 0x01, + 0x60, 0xb9, 0xc3, 0x02, 0x44, 0x01, 0x61, 0x50, 0xc5, 0x7b, 0xac, 0x01, + 0x60, 0xe9, 0xcd, 0x7b, 0xa4, 0x01, 0x61, 0x78, 0xc3, 0xc8, 0x92, 0x01, + 0x61, 0x89, 0xc6, 0xc8, 0x01, 0x01, 0x62, 0xc0, 0xc4, 0xe4, 0x2b, 0x01, + 0x61, 0xa1, 0xc4, 0xdf, 0x03, 0x01, 0x61, 0xb9, 0xc5, 0xdd, 0xb2, 0x01, + 0x61, 0xe0, 0x07, 0xc2, 0x90, 0x47, 0xc3, 0x01, 0xbd, 0x01, 0x62, 0x89, + 0x97, 0x01, 0x62, 0x99, 0x91, 0x01, 0x62, 0xb0, 0xc6, 0xd3, 0x73, 0x01, + 0x61, 0xb1, 0xc5, 0xda, 0x24, 0x01, 0x61, 0xc0, 0x42, 0x25, 0xa1, 0xc2, + 0x90, 0x51, 0xcb, 0x98, 0x00, 0x01, 0x61, 0xd1, 0x47, 0x1c, 0xa0, 0x42, + 0x90, 0x5b, 0xc6, 0xc4, 0x49, 0x01, 0x61, 0xf1, 0xcf, 0x60, 0xd5, 0x01, + 0x62, 0xf0, 0xc2, 0x06, 0xc6, 0x01, 0x62, 0x09, 0xc2, 0x00, 0x16, 0x01, + 0x62, 0x48, 0xc5, 0xcb, 0xee, 0x01, 0x62, 0x11, 0x87, 0x01, 0x62, 0x50, + 0xc4, 0xe4, 0xc7, 0x01, 0x62, 0x21, 0x0a, 0xc2, 0x90, 0x67, 0xc9, 0xae, + 0x19, 0x01, 0x62, 0x91, 0xc8, 0xae, 0x6b, 0x01, 0x62, 0xa2, 0x02, 0x90, + 0x74, 0xc5, 0xd9, 0x52, 0x01, 0x62, 0x29, 0xc2, 0x00, 0xba, 0x01, 0x62, + 0x61, 0xcb, 0x97, 0x92, 0x01, 0x62, 0xe8, 0xc4, 0xac, 0x24, 0x01, 0x62, + 0x39, 0xc3, 0x02, 0x44, 0x01, 0x62, 0xd0, 0xc5, 0x7b, 0xac, 0x01, 0x62, + 0x69, 0xcd, 0x7b, 0xa4, 0x01, 0x62, 0xf8, 0xc7, 0x14, 0x39, 0x00, 0x58, + 0x11, 0xc7, 0x7a, 0x7f, 0x00, 0x59, 0xe8, 0xc5, 0x40, 0xe7, 0x00, 0x58, + 0x19, 0xc4, 0x1e, 0x97, 0x00, 0x5a, 0x68, 0x83, 0x00, 0x58, 0x31, 0x8b, + 0x00, 0x58, 0x81, 0x97, 0x00, 0x58, 0xa0, 0x8b, 0x00, 0x58, 0x40, 0x97, + 0x00, 0x58, 0x50, 0x47, 0xb2, 0x2e, 0xc2, 0x90, 0x7a, 0x83, 0x00, 0x59, + 0xa8, 0x83, 0x00, 0x58, 0xa9, 0xc2, 0x00, 0xd0, 0x00, 0x58, 0xb0, 0x83, + 0x00, 0x58, 0xb9, 0xc2, 0x00, 0xd0, 0x00, 0x58, 0xc0, 0xc2, 0x01, 0x30, + 0x00, 0x58, 0xc9, 0xc2, 0x19, 0x2c, 0x00, 0x58, 0xf1, 0xc2, 0x00, 0xc1, + 0x00, 0x59, 0x19, 0x83, 0x00, 0x59, 0x40, 0x83, 0x00, 0x58, 0xd1, 0xc2, + 0x00, 0xd0, 0x00, 0x58, 0xd8, 0x83, 0x00, 0x58, 0xe1, 0xc2, 0x00, 0xd0, + 0x00, 0x58, 0xe8, 0x16, 0xc2, 0x90, 0x88, 0x83, 0x00, 0x59, 0x21, 0xc2, + 0x00, 0xd0, 0x00, 0x59, 0x28, 0x06, 0xc2, 0x90, 0x92, 0x83, 0x00, 0x59, + 0x31, 0xc2, 
0x00, 0xd0, 0x00, 0x59, 0x38, 0x83, 0x00, 0x59, 0x51, 0xc2, + 0x00, 0xd0, 0x00, 0x59, 0x58, 0x83, 0x00, 0x59, 0x61, 0xc2, 0x00, 0xd0, + 0x00, 0x59, 0x68, 0x83, 0x00, 0x59, 0x79, 0xc2, 0x19, 0x2c, 0x00, 0x5a, + 0xf8, 0x83, 0x00, 0x59, 0x81, 0xc2, 0x00, 0x39, 0x00, 0x5a, 0xe1, 0xc2, + 0x00, 0xd0, 0x00, 0x5a, 0xe8, 0x83, 0x00, 0x59, 0x91, 0xc2, 0x00, 0xdb, + 0x00, 0x59, 0x98, 0xc2, 0x00, 0xd0, 0x00, 0x59, 0xb1, 0xc2, 0x0d, 0xf6, + 0x00, 0x59, 0xb9, 0x83, 0x00, 0x59, 0xc0, 0x83, 0x00, 0x59, 0xf1, 0x8b, + 0x00, 0x5a, 0x41, 0x97, 0x00, 0x5a, 0x60, 0x8b, 0x00, 0x5a, 0x00, 0x97, + 0x00, 0x5a, 0x10, 0x94, 0x00, 0x5a, 0x1b, 0x02, 0x90, 0x9c, 0x8e, 0x00, + 0x5b, 0x12, 0x02, 0x90, 0xa0, 0xc2, 0x02, 0xa0, 0x00, 0x5b, 0x41, 0xc4, + 0x02, 0xde, 0x00, 0x5b, 0x48, 0xc3, 0x09, 0x9e, 0x00, 0x5b, 0x51, 0xc3, + 0x0d, 0x14, 0x00, 0x5b, 0x58, 0xc2, 0x22, 0xcc, 0x00, 0x5b, 0x61, 0xc4, + 0x18, 0x10, 0x00, 0x5b, 0x68, 0xc7, 0x08, 0x79, 0x00, 0x5b, 0x91, 0xc4, + 0x01, 0xce, 0x00, 0x5b, 0x99, 0xc9, 0x67, 0x38, 0x00, 0x5b, 0xa9, 0xc6, + 0x06, 0xdb, 0x00, 0x5b, 0xb0, 0xc8, 0x08, 0x79, 0x00, 0x5b, 0xa1, 0xca, + 0xa7, 0x88, 0x00, 0x5b, 0xb8, 0xc3, 0x02, 0xdf, 0x0f, 0x68, 0x1b, 0x02, + 0x90, 0xa4, 0xc4, 0x0d, 0x0e, 0x0f, 0x68, 0x62, 0x02, 0x90, 0xa8, 0x91, + 0x0f, 0x68, 0x13, 0x02, 0x90, 0xae, 0xc4, 0x18, 0x12, 0x0f, 0x68, 0x5a, + 0x02, 0x90, 0xb2, 0xc9, 0x57, 0x20, 0x0f, 0x69, 0x28, 0xc2, 0x00, 0x33, + 0x0f, 0x68, 0x23, 0x02, 0x90, 0xb8, 0xc3, 0x0d, 0x0f, 0x0f, 0x68, 0x6a, + 0x02, 0x90, 0xbc, 0xc2, 0x00, 0x5f, 0x0f, 0x68, 0x2b, 0x02, 0x90, 0xc2, + 0xc3, 0x45, 0x6b, 0x0f, 0x68, 0x72, 0x02, 0x90, 0xc6, 0xc7, 0x0d, 0x04, + 0x0f, 0x68, 0x99, 0xc8, 0x4b, 0x94, 0x0f, 0x68, 0xe0, 0xc2, 0x0d, 0x10, + 0x0f, 0x68, 0x7b, 0x02, 0x90, 0xcc, 0x00, 0x42, 0x90, 0xd2, 0xc2, 0x0d, + 0x10, 0x0f, 0x68, 0x83, 0x02, 0x90, 0xde, 0x00, 0x42, 0x90, 0xe4, 0xc9, + 0x57, 0x20, 0x0f, 0x69, 0x60, 0xc7, 0x0d, 0x04, 0x0f, 0x68, 0xd1, 0xc8, + 0x4b, 0x94, 0x0f, 0x69, 0x18, 0xc9, 0x57, 0x20, 0x0f, 0x69, 0x68, 0xc7, + 0x0d, 0x04, 0x0f, 0x68, 0xd9, 0xc8, 0x4b, 0x94, 0x0f, 0x69, 0x20, 0xc9, + 0x57, 0x20, 0x0f, 0x69, 0xd0, 0xc9, 0x57, 0x20, 0x0f, 0x69, 0xd8, 0xc8, + 0x0d, 0x03, 0x0f, 0x69, 0xc0, 0xc8, 0x0d, 0x03, 0x0f, 0x69, 0xc8, 0xc6, + 0x2d, 0xd0, 0x01, 0x3e, 0x21, 0xc4, 0x0e, 0xa6, 0x01, 0x3e, 0x18, 0xd8, + 0x21, 0x23, 0x01, 0x39, 0xe1, 0xc8, 0x0a, 0xff, 0x01, 0x39, 0x91, 0xca, + 0x22, 0x51, 0x01, 0x39, 0x59, 0xc5, 0x0d, 0x20, 0x01, 0x38, 0xd8, 0x9a, + 0x01, 0x21, 0x19, 0xc2, 0x01, 0x25, 0x0f, 0xa6, 0xb0, 0xc5, 0x5f, 0x98, + 0x0f, 0xae, 0x09, 0xca, 0x9e, 0xf0, 0x0f, 0xa6, 0x10, 0xcc, 0x81, 0x81, + 0x0f, 0xa7, 0x69, 0xcb, 0x9a, 0x5d, 0x0f, 0xa7, 0x60, 0xcd, 0x78, 0xa5, + 0x01, 0x1c, 0x81, 0xcd, 0x7a, 0x45, 0x01, 0x1c, 0x78, 0xc9, 0x3b, 0x79, + 0x08, 0x7c, 0x49, 0x44, 0x02, 0x9f, 0xc2, 0x90, 0xf0, 0xc3, 0x01, 0x5d, + 0x08, 0x7c, 0x30, 0x49, 0x04, 0xf9, 0xc2, 0x90, 0xfc, 0x44, 0x05, 0x18, + 0x42, 0x91, 0x08, 0x0e, 0xc2, 0x91, 0x14, 0xc3, 0xb5, 0x3e, 0x08, 0x7c, + 0x01, 0xc2, 0x00, 0x67, 0x08, 0x7b, 0xe1, 0x15, 0xc2, 0x91, 0x20, 0xc3, + 0x20, 0x18, 0x08, 0x7b, 0xd1, 0xc3, 0x00, 0x4e, 0x08, 0x7b, 0xc9, 0xc4, + 0xe0, 0xe7, 0x08, 0x7b, 0xb9, 0xc4, 0x4a, 0xb9, 0x08, 0x7b, 0xb1, 0xca, + 0x9b, 0x8a, 0x08, 0x7b, 0xa9, 0xc5, 0x4a, 0xb3, 0x08, 0x7b, 0xa1, 0xc3, + 0x7e, 0x89, 0x08, 0x7b, 0x99, 0xca, 0x9c, 0xa2, 0x08, 0x7b, 0x91, 0xc4, + 0xe3, 0x27, 0x08, 0x7b, 0x89, 0xc5, 0xa5, 0xfd, 0x08, 0x7b, 0x81, 0xc4, + 0x5d, 0xe2, 0x08, 0x7b, 0xf0, 0xd1, 0x53, 0xa9, 0x08, 0x79, 0x31, 0x47, + 0x34, 0x2f, 0xc2, 0x91, 0x2a, 0x0e, 0x42, 0x91, 0x3b, 0x43, 0x2f, 0x2a, + 0xc2, 0x91, 
0x47, 0x47, 0x02, 0x0e, 0x42, 0x91, 0x53, 0xc3, 0x09, 0x41, + 0x08, 0x67, 0xe1, 0x42, 0x02, 0x09, 0xc2, 0x91, 0xb0, 0xc3, 0x05, 0x14, + 0x08, 0x67, 0xd2, 0x02, 0x91, 0xbc, 0x97, 0x08, 0x67, 0x53, 0x02, 0x91, + 0xc0, 0x87, 0x08, 0x66, 0x4b, 0x02, 0x91, 0xce, 0x4a, 0xa7, 0x74, 0xc2, + 0x92, 0x2e, 0x4b, 0x95, 0x61, 0xc2, 0x92, 0x3a, 0xc8, 0xb9, 0x6a, 0x08, + 0x67, 0x19, 0x91, 0x08, 0x66, 0xdb, 0x02, 0x92, 0x46, 0x83, 0x08, 0x66, + 0x03, 0x02, 0x92, 0x50, 0x8b, 0x08, 0x66, 0x83, 0x02, 0x92, 0x64, 0xc7, + 0xc9, 0x9d, 0x08, 0x66, 0x50, 0x87, 0x08, 0x64, 0x4b, 0x02, 0x92, 0x68, + 0xc8, 0xb9, 0x6a, 0x08, 0x65, 0x19, 0x91, 0x08, 0x64, 0xdb, 0x02, 0x92, + 0xc8, 0x4a, 0xa7, 0x74, 0xc2, 0x92, 0xd2, 0x4b, 0x95, 0x61, 0xc2, 0x92, + 0xde, 0x97, 0x08, 0x65, 0x53, 0x02, 0x92, 0xea, 0x83, 0x08, 0x64, 0x03, + 0x02, 0x92, 0xf8, 0x8b, 0x08, 0x64, 0x83, 0x02, 0x93, 0x0c, 0xc7, 0xc9, + 0x9d, 0x08, 0x64, 0x50, 0xc4, 0xe1, 0xaf, 0x08, 0x62, 0x41, 0x91, 0x08, + 0x60, 0x33, 0x02, 0x93, 0x10, 0x83, 0x08, 0x60, 0x03, 0x02, 0x93, 0x23, + 0x07, 0xc2, 0x93, 0x52, 0x8b, 0x08, 0x60, 0x1a, 0x02, 0x93, 0x72, 0x83, + 0x08, 0x60, 0x0b, 0x02, 0x93, 0x7a, 0x87, 0x08, 0x60, 0x2b, 0x02, 0x93, + 0xad, 0x11, 0xc2, 0x93, 0xbf, 0x8b, 0x08, 0x60, 0x22, 0x02, 0x93, 0xca, + 0x16, 0xc2, 0x93, 0xce, 0xc3, 0x05, 0x14, 0x08, 0x54, 0xe8, 0x42, 0x02, + 0x1c, 0xc2, 0x93, 0xda, 0x16, 0xc2, 0x93, 0xe4, 0xc3, 0x2b, 0xb9, 0x08, + 0x54, 0xd1, 0x09, 0xc2, 0x93, 0xf4, 0x42, 0x0e, 0x9a, 0xc2, 0x94, 0x00, + 0x43, 0xe6, 0x2c, 0xc2, 0x94, 0x08, 0xc3, 0x7e, 0x89, 0x08, 0x54, 0x29, + 0xc3, 0x0f, 0x9a, 0x08, 0x54, 0x21, 0xc4, 0x19, 0x60, 0x08, 0x54, 0x19, + 0x0a, 0xc2, 0x94, 0x14, 0xc3, 0x0d, 0xff, 0x08, 0x54, 0x09, 0xc3, 0x72, + 0xf0, 0x08, 0x54, 0x39, 0xc3, 0x85, 0xf5, 0x08, 0x54, 0x41, 0x0d, 0xc2, + 0x94, 0x20, 0xc4, 0x3a, 0x01, 0x08, 0x54, 0x61, 0xc3, 0x0d, 0xf6, 0x08, + 0x54, 0x71, 0xc3, 0xb1, 0x0d, 0x08, 0x54, 0x81, 0x03, 0x42, 0x94, 0x2c, + 0xcd, 0x7a, 0xa0, 0x0f, 0xad, 0x99, 0x44, 0x19, 0xb0, 0x42, 0x94, 0x38, + 0xc2, 0x00, 0xd1, 0x08, 0x1a, 0x81, 0xc3, 0x2b, 0x88, 0x08, 0x1a, 0x89, + 0xc3, 0x46, 0xf6, 0x08, 0x1a, 0x91, 0x06, 0xc2, 0x94, 0x4a, 0x87, 0x08, + 0x1a, 0xa3, 0x02, 0x94, 0x54, 0x1c, 0xc2, 0x94, 0x58, 0x8b, 0x08, 0x1a, + 0xcb, 0x02, 0x94, 0x64, 0xc4, 0xe0, 0xb3, 0x08, 0x1a, 0xd1, 0xc3, 0x39, + 0xa6, 0x08, 0x1a, 0xd9, 0xc5, 0xdb, 0x19, 0x08, 0x1a, 0xe1, 0xc5, 0xdb, + 0x6e, 0x08, 0x1a, 0xe9, 0x18, 0xc2, 0x94, 0x6c, 0xc4, 0xcf, 0x74, 0x08, + 0x1a, 0xf9, 0xc3, 0x26, 0x92, 0x08, 0x1b, 0x01, 0x15, 0xc2, 0x94, 0x78, + 0x16, 0xc2, 0x94, 0x82, 0x97, 0x08, 0x1b, 0x19, 0xc5, 0xdd, 0x1c, 0x08, + 0x1b, 0x21, 0x1b, 0xc2, 0x94, 0x8e, 0x91, 0x08, 0x1b, 0x4b, 0x02, 0x94, + 0xa8, 0xc2, 0x00, 0xd0, 0x08, 0x1b, 0x60, 0xc2, 0x00, 0x51, 0x08, 0x18, + 0x09, 0x0d, 0xc2, 0x94, 0xac, 0xc2, 0x00, 0x06, 0x08, 0x18, 0x19, 0x87, + 0x08, 0x18, 0x23, 0x02, 0x94, 0xbe, 0xc2, 0x00, 0x5f, 0x08, 0x18, 0x29, + 0xc2, 0x0a, 0xe2, 0x08, 0x18, 0x31, 0xc2, 0x01, 0x7f, 0x08, 0x18, 0x39, + 0x16, 0xc2, 0x94, 0xe2, 0x8b, 0x08, 0x18, 0x4b, 0x02, 0x94, 0xec, 0x83, + 0x08, 0x18, 0x01, 0x91, 0x08, 0x18, 0x79, 0x12, 0xc2, 0x94, 0xf0, 0x15, + 0xc2, 0x94, 0xfa, 0x97, 0x08, 0x18, 0xb3, 0x02, 0x95, 0x06, 0xc3, 0x28, + 0x28, 0x08, 0x18, 0xe1, 0xc2, 0x0c, 0x43, 0x08, 0x19, 0x69, 0xcc, 0x82, + 0xc5, 0x08, 0x19, 0x70, 0xc3, 0x05, 0x14, 0x08, 0x19, 0x01, 0x42, 0x02, + 0x09, 0xc2, 0x95, 0x0a, 0xc3, 0x09, 0x41, 0x08, 0x19, 0x10, 0x83, 0x00, + 0xe2, 0xf8, 0x99, 0x00, 0xe3, 0x19, 0x8f, 0x00, 0xe3, 0x11, 0x8c, 0x00, + 0xe3, 0x09, 0x8d, 0x00, 0xe3, 0x00, 0xc7, 0x56, 0x8e, 0x01, 0x5d, 0xd1, + 0xd1, 0x56, 
0x84, 0x01, 0x5d, 0xd8, 0x90, 0x08, 0x25, 0x90, 0xc3, 0x1c, + 0x63, 0x08, 0x25, 0xb1, 0xc2, 0x02, 0x2b, 0x08, 0x25, 0xe9, 0xc2, 0x00, + 0xb0, 0x08, 0x26, 0x29, 0x16, 0x42, 0x95, 0x16, 0x83, 0x08, 0x26, 0x51, + 0xc2, 0x00, 0xd0, 0x08, 0x26, 0x60, 0x90, 0x08, 0x26, 0xd0, 0xc3, 0x1c, + 0x63, 0x08, 0x26, 0xf1, 0xc2, 0x02, 0x2b, 0x08, 0x27, 0x29, 0xc2, 0x00, + 0xb0, 0x08, 0x27, 0x69, 0x16, 0x42, 0x95, 0x20, 0x83, 0x08, 0x27, 0x91, + 0xc2, 0x00, 0xd0, 0x08, 0x27, 0xa0, 0x0d, 0xc2, 0x95, 0x2a, 0xcb, 0x93, + 0x7d, 0x0e, 0x7d, 0x89, 0xc8, 0x4e, 0x4b, 0x0e, 0x7d, 0x80, 0xc6, 0xca, + 0xa9, 0x0e, 0x7a, 0x88, 0x0d, 0xc2, 0x95, 0x36, 0x16, 0xc2, 0x95, 0x42, + 0x44, 0xe0, 0x6b, 0xc2, 0x95, 0x4e, 0x49, 0x75, 0xe7, 0xc2, 0x95, 0x5b, + 0xce, 0x69, 0xa0, 0x0e, 0x7c, 0xb9, 0x12, 0xc2, 0x95, 0x68, 0xce, 0x6d, + 0xa2, 0x0e, 0x7c, 0x98, 0x00, 0x42, 0x95, 0x72, 0x00, 0x42, 0x95, 0x87, + 0x42, 0x00, 0x97, 0xc2, 0x95, 0x93, 0xc8, 0xb8, 0x72, 0x0e, 0x7b, 0xf8, + 0xcb, 0x87, 0x3a, 0x0e, 0x7b, 0xe1, 0xce, 0x69, 0xa0, 0x0e, 0x7b, 0xd9, + 0xc8, 0x4e, 0x4b, 0x0e, 0x7b, 0xd1, 0xc8, 0xbf, 0x6a, 0x0e, 0x7b, 0xc8, + 0x45, 0x4e, 0x46, 0xc2, 0x95, 0x9f, 0xce, 0x69, 0xa0, 0x0e, 0x7b, 0xb8, + 0xc6, 0x6d, 0xaa, 0x0e, 0x7b, 0xa1, 0xca, 0x93, 0x7e, 0x0e, 0x7b, 0x98, + 0xcc, 0x84, 0x51, 0x0e, 0x7d, 0x59, 0xc7, 0xc8, 0x69, 0x0e, 0x7d, 0x51, + 0xc3, 0xe5, 0x9c, 0x0e, 0x7d, 0x48, 0xc8, 0xb8, 0x12, 0x0e, 0x79, 0x68, + 0xc8, 0xbb, 0xe2, 0x0e, 0x79, 0xc8, 0xc9, 0x78, 0xd9, 0x0e, 0x78, 0xc1, + 0x43, 0x01, 0x55, 0x42, 0x95, 0xab, 0xc5, 0x00, 0x2c, 0x0e, 0x78, 0x89, + 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x28, 0xc7, 0x93, 0xee, 0x0e, 0x79, 0xb3, + 0x02, 0x95, 0xb7, 0xc6, 0xcb, 0x33, 0x0e, 0x79, 0x30, 0x15, 0xc2, 0x95, + 0xbd, 0x43, 0x01, 0x55, 0x42, 0x95, 0xc9, 0xc3, 0xe5, 0x2d, 0x0e, 0x79, + 0x51, 0xc2, 0x01, 0xc8, 0x0e, 0x79, 0x00, 0x43, 0x01, 0x55, 0xc2, 0x95, + 0xd5, 0x4d, 0x78, 0xd9, 0x42, 0x95, 0xe1, 0xc6, 0x42, 0x68, 0x0e, 0x78, + 0xf1, 0x42, 0x00, 0xe7, 0x42, 0x95, 0xed, 0xc5, 0x00, 0x2c, 0x0e, 0x78, + 0x91, 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x30, 0xc6, 0x78, 0xdc, 0x0e, 0x78, + 0xe9, 0x4b, 0x8e, 0xfa, 0x42, 0x95, 0xf9, 0xc5, 0x00, 0x2c, 0x0e, 0x78, + 0xa1, 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x40, 0xc5, 0x00, 0x2c, 0x0e, 0x78, + 0x81, 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x20, 0xc5, 0x00, 0x2c, 0x0e, 0x78, + 0x69, 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x08, 0xce, 0x1e, 0x74, 0x08, 0xd1, + 0xb0, 0xc3, 0x0d, 0x18, 0x05, 0x4e, 0x53, 0x02, 0x96, 0x05, 0xc4, 0xe3, + 0x8f, 0x05, 0x4e, 0x18, 0xc6, 0xcd, 0xfd, 0x05, 0x4e, 0x39, 0xc6, 0x45, + 0xa6, 0x05, 0x4e, 0x60, 0x17, 0xc2, 0x96, 0x0b, 0xc5, 0x3a, 0xbc, 0x05, + 0x4e, 0x40, 0xc6, 0xcb, 0x27, 0x05, 0x4c, 0x98, 0x42, 0x00, 0x4d, 0x42, + 0x96, 0x17, 0xc6, 0xcb, 0x21, 0x05, 0x4d, 0x60, 0xc6, 0xcb, 0x27, 0x05, + 0x4d, 0x40, 0x00, 0x42, 0x96, 0x23, 0x83, 0x05, 0x4d, 0x23, 0x02, 0x96, + 0x2f, 0xc2, 0x19, 0x2c, 0x05, 0x4c, 0xd3, 0x02, 0x96, 0x35, 0xc2, 0x01, + 0x30, 0x05, 0x4c, 0xa2, 0x02, 0x96, 0x3b, 0x83, 0x05, 0x4d, 0x13, 0x02, + 0x96, 0x44, 0xc2, 0x0e, 0x9a, 0x05, 0x4c, 0xea, 0x02, 0x96, 0x4a, 0x83, + 0x05, 0x4d, 0x03, 0x02, 0x96, 0x50, 0xc2, 0x01, 0x6f, 0x05, 0x4c, 0xda, + 0x02, 0x96, 0x56, 0xca, 0x60, 0x26, 0x05, 0x4c, 0xc8, 0xc6, 0xcb, 0x27, + 0x05, 0x4c, 0xb0, 0x00, 0x42, 0x96, 0x5c, 0x8b, 0x05, 0x4c, 0x68, 0x8b, + 0x05, 0x4c, 0x39, 0xc5, 0xd5, 0x2e, 0x05, 0x4c, 0x28, 0xc4, 0x04, 0x15, + 0x05, 0x4d, 0xd1, 0xc4, 0xdf, 0x53, 0x05, 0x4d, 0xa0, 0xcf, 0x6a, 0xe9, + 0x01, 0x2c, 0xf2, 0x02, 0x96, 0x68, 0x45, 0x02, 0x9a, 0x42, 0x96, 0x6e, + 0x97, 0x05, 0x22, 0xdb, 0x02, 0x96, 0x7a, 0x91, 0x05, 0x22, 0xbb, 0x02, + 0x96, 0x8d, 
0x8b, 0x05, 0x22, 0x62, 0x02, 0x96, 0x99, 0x9b, 0x05, 0x22, + 0x33, 0x02, 0x96, 0xac, 0x97, 0x05, 0x22, 0x03, 0x02, 0x96, 0xbf, 0x91, + 0x05, 0x21, 0xeb, 0x02, 0x96, 0xd5, 0x8b, 0x05, 0x21, 0x9a, 0x02, 0x96, + 0xe1, 0x9b, 0x05, 0x1d, 0x3b, 0x02, 0x96, 0xf4, 0x97, 0x05, 0x1d, 0x0b, + 0x02, 0x97, 0x07, 0x87, 0x05, 0x1c, 0xeb, 0x02, 0x97, 0x1a, 0x91, 0x05, + 0x1c, 0xcb, 0x02, 0x97, 0x26, 0x83, 0x05, 0x1c, 0xb2, 0x02, 0x97, 0x2e, + 0xc2, 0x02, 0x0a, 0x05, 0x12, 0xf3, 0x02, 0x97, 0x3a, 0x83, 0x05, 0x13, + 0x13, 0x02, 0x97, 0x42, 0xc2, 0x01, 0xba, 0x05, 0x13, 0x33, 0x02, 0x97, + 0x4e, 0x91, 0x05, 0x13, 0x4b, 0x02, 0x97, 0x56, 0x87, 0x05, 0x13, 0x62, + 0x02, 0x97, 0x62, 0x8b, 0x05, 0x17, 0x7b, 0x02, 0x97, 0x6a, 0x83, 0x05, + 0x17, 0xb3, 0x02, 0x97, 0x7d, 0x97, 0x05, 0x17, 0xfb, 0x02, 0x97, 0x89, + 0x11, 0xc2, 0x97, 0x9f, 0x87, 0x05, 0x17, 0xeb, 0x02, 0x97, 0xa7, 0x9b, + 0x05, 0x18, 0x2a, 0x02, 0x97, 0xab, 0x8b, 0x05, 0x03, 0xc3, 0x02, 0x97, + 0xbe, 0x83, 0x05, 0x03, 0xfb, 0x02, 0x97, 0xd1, 0x91, 0x05, 0x04, 0x1b, + 0x02, 0x97, 0xdd, 0x97, 0x05, 0x04, 0x3b, 0x02, 0x97, 0xe9, 0x9b, 0x05, + 0x04, 0x6a, 0x02, 0x97, 0xfc, 0x8b, 0x05, 0x0a, 0x9b, 0x02, 0x98, 0x0f, + 0x83, 0x05, 0x0a, 0xcb, 0x02, 0x98, 0x22, 0x91, 0x05, 0x0a, 0xeb, 0x02, + 0x98, 0x2e, 0x87, 0x05, 0x0b, 0x03, 0x02, 0x98, 0x3a, 0x97, 0x05, 0x0b, + 0x22, 0x02, 0x98, 0x42, 0x96, 0x05, 0x0b, 0xe9, 0x9a, 0x05, 0x0b, 0xf1, + 0x92, 0x05, 0x0c, 0x01, 0x87, 0x05, 0x0c, 0x12, 0x02, 0x98, 0x55, 0x9a, + 0x05, 0x0c, 0x21, 0x92, 0x05, 0x0c, 0x30, 0x91, 0x05, 0x0c, 0x43, 0x02, + 0x98, 0x5d, 0x96, 0x05, 0x0c, 0x89, 0x9a, 0x05, 0x0c, 0x91, 0x92, 0x05, + 0x0c, 0xa1, 0x94, 0x05, 0x0c, 0xb2, 0x02, 0x98, 0x65, 0x96, 0x05, 0x0c, + 0x51, 0x9a, 0x05, 0x0c, 0x59, 0x92, 0x05, 0x0c, 0x68, 0x9a, 0x05, 0x0c, + 0x71, 0x92, 0x05, 0x0c, 0x80, 0x9b, 0x05, 0x21, 0x7b, 0x02, 0x98, 0x69, + 0x97, 0x05, 0x21, 0x4b, 0x02, 0x98, 0x75, 0x91, 0x05, 0x21, 0x2b, 0x02, + 0x98, 0x8f, 0x8b, 0x05, 0x20, 0xd2, 0x02, 0x98, 0x9b, 0x94, 0x05, 0x1f, + 0xdb, 0x02, 0x98, 0xae, 0x92, 0x05, 0x1f, 0xc9, 0x9a, 0x05, 0x1f, 0xb9, + 0x96, 0x05, 0x1f, 0xb0, 0x94, 0x05, 0x1f, 0xab, 0x02, 0x98, 0xb2, 0x92, + 0x05, 0x1f, 0x99, 0x9a, 0x05, 0x1f, 0x89, 0x96, 0x05, 0x1f, 0x81, 0x91, + 0x05, 0x1f, 0x52, 0x02, 0x98, 0xb6, 0x92, 0x05, 0x1f, 0x79, 0x9a, 0x05, + 0x1f, 0x69, 0x96, 0x05, 0x1f, 0x60, 0x87, 0x05, 0x1f, 0x33, 0x02, 0x98, + 0xc2, 0x92, 0x05, 0x1f, 0x19, 0x9a, 0x05, 0x1f, 0x09, 0x96, 0x05, 0x1f, + 0x00, 0x94, 0x05, 0x20, 0xbb, 0x02, 0x98, 0xce, 0x92, 0x05, 0x20, 0xa9, + 0x9a, 0x05, 0x20, 0x99, 0x96, 0x05, 0x20, 0x90, 0x94, 0x05, 0x20, 0x8b, + 0x02, 0x98, 0xd2, 0x92, 0x05, 0x20, 0x79, 0x9a, 0x05, 0x20, 0x69, 0x96, + 0x05, 0x20, 0x61, 0x91, 0x05, 0x20, 0x32, 0x02, 0x98, 0xd6, 0x92, 0x05, + 0x20, 0x59, 0x9a, 0x05, 0x20, 0x49, 0x96, 0x05, 0x20, 0x40, 0x87, 0x05, + 0x20, 0x13, 0x02, 0x98, 0xe2, 0x92, 0x05, 0x1f, 0xf9, 0x9a, 0x05, 0x1f, + 0xe9, 0x96, 0x05, 0x1f, 0xe0, 0x94, 0x05, 0x1e, 0xfb, 0x02, 0x98, 0xee, + 0x92, 0x05, 0x1e, 0xe9, 0x9a, 0x05, 0x1e, 0xd9, 0x96, 0x05, 0x1e, 0xd0, + 0x94, 0x05, 0x1e, 0xcb, 0x02, 0x98, 0xf2, 0x92, 0x05, 0x1e, 0xb9, 0x9a, + 0x05, 0x1e, 0xa9, 0x96, 0x05, 0x1e, 0xa1, 0x91, 0x05, 0x1e, 0x5a, 0x02, + 0x98, 0xf6, 0x92, 0x05, 0x1e, 0x99, 0x9a, 0x05, 0x1e, 0x88, 0x92, 0x05, + 0x1e, 0x81, 0x9a, 0x05, 0x1e, 0x71, 0x96, 0x05, 0x1e, 0x68, 0x92, 0x05, + 0x1e, 0x49, 0x9a, 0x05, 0x1e, 0x39, 0x96, 0x05, 0x1e, 0x30, 0x9b, 0x05, + 0x1c, 0x83, 0x02, 0x98, 0xfe, 0x97, 0x05, 0x1c, 0x53, 0x02, 0x99, 0x11, + 0x87, 0x05, 0x1c, 0x33, 0x02, 0x99, 0x2b, 0x91, 0x05, 0x1c, 0x13, 0x02, + 0x99, 0x37, 
0x83, 0x05, 0x1b, 0xea, 0x02, 0x99, 0x43, 0x9b, 0x05, 0x1e, + 0x13, 0x02, 0x99, 0x47, 0x97, 0x05, 0x1d, 0xe3, 0x02, 0x99, 0x5a, 0x87, + 0x05, 0x1d, 0xc3, 0x02, 0x99, 0x74, 0x91, 0x05, 0x1d, 0xa3, 0x02, 0x99, + 0x80, 0x83, 0x05, 0x1d, 0x6a, 0x02, 0x99, 0x8c, 0x9b, 0x05, 0x1a, 0x13, + 0x02, 0x99, 0x98, 0x8b, 0x05, 0x19, 0x63, 0x02, 0x99, 0xab, 0x83, 0x05, + 0x19, 0x9b, 0x02, 0x99, 0xbe, 0x91, 0x05, 0x19, 0xbb, 0x02, 0x99, 0xca, + 0x87, 0x05, 0x19, 0xd3, 0x02, 0x99, 0xd6, 0x97, 0x05, 0x19, 0xf2, 0x02, + 0x99, 0xde, 0x96, 0x05, 0x18, 0x49, 0x9a, 0x05, 0x18, 0x51, 0x92, 0x05, + 0x18, 0x61, 0x87, 0x05, 0x18, 0x72, 0x02, 0x99, 0xea, 0x96, 0x05, 0x18, + 0x81, 0x9a, 0x05, 0x18, 0x89, 0x92, 0x05, 0x18, 0x98, 0x91, 0x05, 0x18, + 0xab, 0x02, 0x99, 0xf2, 0x96, 0x05, 0x18, 0xf1, 0x9a, 0x05, 0x18, 0xf9, + 0x92, 0x05, 0x19, 0x09, 0x94, 0x05, 0x19, 0x1a, 0x02, 0x99, 0xfa, 0x96, + 0x05, 0x18, 0xb9, 0x9a, 0x05, 0x18, 0xc1, 0x92, 0x05, 0x18, 0xd0, 0x9a, + 0x05, 0x18, 0xd9, 0x92, 0x05, 0x18, 0xe8, 0x96, 0x05, 0x19, 0x21, 0x9a, + 0x05, 0x19, 0x29, 0x92, 0x05, 0x19, 0x39, 0x94, 0x05, 0x19, 0x4a, 0x02, + 0x99, 0xfe, 0x9b, 0x05, 0x1b, 0xc3, 0x02, 0x9a, 0x02, 0x97, 0x05, 0x1b, + 0x93, 0x02, 0x9a, 0x15, 0x87, 0x05, 0x1b, 0x7b, 0x02, 0x9a, 0x2b, 0x91, + 0x05, 0x1b, 0x5b, 0x02, 0x9a, 0x37, 0x83, 0x05, 0x1b, 0x1a, 0x02, 0x9a, + 0x43, 0x94, 0x05, 0x16, 0x7b, 0x02, 0x9a, 0x4f, 0x96, 0x05, 0x16, 0x51, + 0x9a, 0x05, 0x16, 0x59, 0x92, 0x05, 0x16, 0x68, 0x92, 0x05, 0x16, 0x19, + 0x9a, 0x05, 0x16, 0x08, 0x96, 0x05, 0x16, 0x21, 0x9a, 0x05, 0x16, 0x29, + 0x92, 0x05, 0x16, 0x39, 0x94, 0x05, 0x16, 0x4b, 0x02, 0x9a, 0x53, 0x91, + 0x05, 0x15, 0xda, 0x02, 0x9a, 0x57, 0x96, 0x05, 0x15, 0x71, 0x9a, 0x05, + 0x15, 0x79, 0x92, 0x05, 0x15, 0x89, 0x87, 0x05, 0x15, 0xa2, 0x02, 0x9a, + 0x5f, 0x96, 0x05, 0x15, 0xb1, 0x9a, 0x05, 0x15, 0xb9, 0x92, 0x05, 0x15, + 0xc8, 0x96, 0x05, 0x15, 0xe9, 0x9a, 0x05, 0x15, 0xf1, 0x92, 0x05, 0x16, + 0x00, 0x9a, 0x05, 0x14, 0xf9, 0x92, 0x05, 0x15, 0x08, 0x92, 0x05, 0x14, + 0xf1, 0x9a, 0x05, 0x14, 0xe1, 0x96, 0x05, 0x14, 0xd8, 0x91, 0x05, 0x14, + 0xcb, 0x02, 0x9a, 0x6b, 0x96, 0x05, 0x15, 0x11, 0x9a, 0x05, 0x15, 0x19, + 0x92, 0x05, 0x15, 0x29, 0x94, 0x05, 0x15, 0x3a, 0x02, 0x9a, 0x73, 0x92, + 0x05, 0x14, 0xb9, 0x9a, 0x05, 0x14, 0xa9, 0x96, 0x05, 0x14, 0xa0, 0x87, + 0x05, 0x14, 0x93, 0x02, 0x9a, 0x77, 0x92, 0x05, 0x14, 0x81, 0x9a, 0x05, + 0x14, 0x71, 0x96, 0x05, 0x14, 0x68, 0x91, 0x05, 0x16, 0xeb, 0x02, 0x9a, + 0x7f, 0x83, 0x05, 0x16, 0xd3, 0x02, 0x9a, 0x87, 0x8b, 0x05, 0x16, 0x93, + 0x02, 0x9a, 0x93, 0x87, 0x05, 0x17, 0x03, 0x02, 0x9a, 0xa6, 0x97, 0x05, + 0x17, 0x1b, 0x02, 0x9a, 0xae, 0x9b, 0x05, 0x17, 0x4a, 0x02, 0x9a, 0xbd, + 0x9b, 0x05, 0x1a, 0xeb, 0x02, 0x9a, 0xd0, 0x97, 0x05, 0x1a, 0xbb, 0x02, + 0x9a, 0xe3, 0x87, 0x05, 0x1a, 0x9b, 0x02, 0x9a, 0xfd, 0x91, 0x05, 0x1a, + 0x7b, 0x02, 0x9b, 0x09, 0x83, 0x05, 0x1a, 0x42, 0x02, 0x9b, 0x15, 0x96, + 0x05, 0x15, 0x41, 0x9a, 0x05, 0x15, 0x49, 0x92, 0x05, 0x15, 0x59, 0x94, + 0x05, 0x15, 0x6a, 0x02, 0x9b, 0x21, 0x92, 0x05, 0x14, 0x61, 0x9a, 0x05, + 0x14, 0x50, 0x92, 0x05, 0x14, 0x49, 0x9a, 0x05, 0x14, 0x38, 0x91, 0x05, + 0x14, 0x2a, 0x02, 0x9b, 0x25, 0x92, 0x05, 0x14, 0x19, 0x9a, 0x05, 0x14, + 0x09, 0x96, 0x05, 0x14, 0x00, 0x92, 0x05, 0x13, 0xf9, 0x9a, 0x05, 0x13, + 0xe8, 0x87, 0x05, 0x12, 0xdb, 0x02, 0x9b, 0x2d, 0x91, 0x05, 0x12, 0xc3, + 0x02, 0x9b, 0x35, 0xc2, 0x01, 0xba, 0x05, 0x12, 0xa3, 0x02, 0x9b, 0x41, + 0x83, 0x05, 0x12, 0x83, 0x02, 0x9b, 0x4d, 0x8b, 0x05, 0x12, 0x42, 0x02, + 0x9b, 0x59, 0x96, 0x05, 0x13, 0x71, 0x87, 0x05, 0x13, 0x82, 0x02, 0x9b, + 0x6c, 0x96, 
0x05, 0x13, 0x89, 0x9a, 0x05, 0x13, 0x91, 0x92, 0x05, 0x13, + 0xa0, 0x96, 0x05, 0x13, 0xa9, 0x9a, 0x05, 0x13, 0xb1, 0x92, 0x05, 0x13, + 0xc0, 0x96, 0x05, 0x13, 0xc9, 0x9a, 0x05, 0x13, 0xd1, 0x92, 0x05, 0x13, + 0xe0, 0x8b, 0x05, 0x04, 0x9b, 0x02, 0x9b, 0x70, 0x83, 0x05, 0x04, 0xd3, + 0x02, 0x9b, 0x83, 0x97, 0x05, 0x05, 0x2b, 0x02, 0x9b, 0x8f, 0x91, 0x05, + 0x05, 0x0b, 0x02, 0x9b, 0xa9, 0x9b, 0x05, 0x05, 0x52, 0x02, 0x9b, 0xb5, + 0x8b, 0x05, 0x0b, 0x53, 0x02, 0x9b, 0xc4, 0x83, 0x05, 0x0b, 0x93, 0x02, + 0x9b, 0xd7, 0x17, 0xc2, 0x9b, 0xe3, 0x11, 0xc2, 0x9b, 0xee, 0x87, 0x05, + 0x0b, 0xd2, 0x02, 0x9b, 0xfa, 0x8b, 0x05, 0x0c, 0xcb, 0x02, 0x9c, 0x02, + 0x83, 0x05, 0x0d, 0x03, 0x02, 0x9c, 0x15, 0x97, 0x05, 0x0d, 0x6b, 0x02, + 0x9c, 0x21, 0x91, 0x05, 0x0d, 0x33, 0x02, 0x9c, 0x3b, 0x87, 0x05, 0x0d, + 0x4b, 0x02, 0x9c, 0x43, 0x9b, 0x05, 0x0d, 0x9a, 0x02, 0x9c, 0x4b, 0x87, + 0x05, 0x23, 0xbb, 0x02, 0x9c, 0x5e, 0x92, 0x05, 0x23, 0xa1, 0x9a, 0x05, + 0x23, 0x91, 0x96, 0x05, 0x23, 0x88, 0x91, 0x05, 0x23, 0xdb, 0x02, 0x9c, + 0x6a, 0x96, 0x05, 0x24, 0x09, 0x9a, 0x05, 0x24, 0x11, 0x92, 0x05, 0x24, + 0x21, 0x94, 0x05, 0x24, 0x32, 0x02, 0x9c, 0x76, 0x96, 0x05, 0x23, 0xe9, + 0x9a, 0x05, 0x23, 0xf1, 0x92, 0x05, 0x24, 0x00, 0x96, 0x05, 0x24, 0x39, + 0x9a, 0x05, 0x24, 0x41, 0x92, 0x05, 0x24, 0x51, 0x94, 0x05, 0x24, 0x62, + 0x02, 0x9c, 0x7a, 0x94, 0x05, 0x23, 0x83, 0x02, 0x9c, 0x7e, 0x92, 0x05, + 0x23, 0x71, 0x9a, 0x05, 0x23, 0x61, 0x96, 0x05, 0x23, 0x58, 0x96, 0x05, + 0x22, 0xe9, 0x9a, 0x05, 0x22, 0xf1, 0x92, 0x05, 0x23, 0x01, 0x87, 0x05, + 0x23, 0x1a, 0x02, 0x9c, 0x82, 0x9a, 0x05, 0x23, 0x41, 0x92, 0x05, 0x23, + 0x51, 0x96, 0x05, 0x23, 0x38, 0x9a, 0x05, 0x23, 0x28, 0x97, 0x05, 0x12, + 0x13, 0x02, 0x9c, 0x8e, 0xc2, 0x02, 0x0a, 0x05, 0x11, 0x8b, 0x02, 0x9c, + 0xa8, 0x83, 0x05, 0x11, 0xa3, 0x02, 0x9c, 0xac, 0x91, 0x05, 0x11, 0xdb, + 0x02, 0x9c, 0xb8, 0x87, 0x05, 0x11, 0xf2, 0x02, 0x9c, 0xc4, 0x96, 0x05, + 0x05, 0x71, 0x9a, 0x05, 0x05, 0x79, 0x92, 0x05, 0x05, 0x89, 0x87, 0x05, + 0x05, 0x9a, 0x02, 0x9c, 0xcc, 0x96, 0x05, 0x05, 0xa9, 0x9a, 0x05, 0x05, + 0xb1, 0x92, 0x05, 0x05, 0xc0, 0x91, 0x05, 0x05, 0xdb, 0x02, 0x9c, 0xd4, + 0x96, 0x05, 0x06, 0x19, 0x9a, 0x05, 0x06, 0x21, 0x92, 0x05, 0x06, 0x31, + 0x94, 0x05, 0x06, 0x42, 0x02, 0x9c, 0xe0, 0x96, 0x05, 0x05, 0xe9, 0x9a, + 0x05, 0x05, 0xf1, 0x92, 0x05, 0x06, 0x00, 0x9a, 0x05, 0x06, 0x08, 0x96, + 0x05, 0x06, 0x49, 0x9a, 0x05, 0x06, 0x51, 0x92, 0x05, 0x06, 0x60, 0xcc, + 0x1c, 0x94, 0x05, 0x00, 0xa8, 0x96, 0x05, 0x00, 0x21, 0x9a, 0x05, 0x00, + 0x29, 0x92, 0x05, 0x00, 0x38, 0x96, 0x05, 0x00, 0xb1, 0x9a, 0x05, 0x00, + 0xb9, 0x92, 0x05, 0x00, 0xc9, 0x87, 0x05, 0x00, 0xe2, 0x02, 0x9c, 0xe4, + 0x96, 0x05, 0x00, 0xf1, 0x9a, 0x05, 0x00, 0xf9, 0x92, 0x05, 0x01, 0x08, + 0x91, 0x05, 0x01, 0x1b, 0x02, 0x9c, 0xf0, 0x96, 0x05, 0x01, 0x61, 0x9a, + 0x05, 0x01, 0x69, 0x92, 0x05, 0x01, 0x79, 0x94, 0x05, 0x01, 0x8a, 0x02, + 0x9c, 0xf8, 0x96, 0x05, 0x01, 0x29, 0x9a, 0x05, 0x01, 0x31, 0x92, 0x05, + 0x01, 0x40, 0x9a, 0x05, 0x01, 0x49, 0x92, 0x05, 0x01, 0x58, 0x96, 0x05, + 0x01, 0x91, 0x9a, 0x05, 0x01, 0x99, 0x92, 0x05, 0x01, 0xa9, 0x94, 0x05, + 0x01, 0xba, 0x02, 0x9c, 0xfc, 0x8b, 0x05, 0x02, 0xc3, 0x02, 0x9d, 0x00, + 0x83, 0x05, 0x03, 0x03, 0x02, 0x9d, 0x13, 0x97, 0x05, 0x03, 0x73, 0x02, + 0x9d, 0x1f, 0x91, 0x05, 0x03, 0x3b, 0x02, 0x9d, 0x39, 0x87, 0x05, 0x03, + 0x53, 0x02, 0x9d, 0x45, 0x9b, 0x05, 0x03, 0xa2, 0x02, 0x9d, 0x4d, 0x96, + 0x05, 0x01, 0xc1, 0x9a, 0x05, 0x01, 0xc9, 0x92, 0x05, 0x01, 0xd9, 0x87, + 0x05, 0x01, 0xea, 0x02, 0x9d, 0x59, 0x96, 0x05, 0x01, 0xf9, 0x9a, 0x05, + 0x02, 0x01, 
0x92, 0x05, 0x02, 0x10, 0x91, 0x05, 0x02, 0x23, 0x02, 0x9d, + 0x61, 0x96, 0x05, 0x02, 0x51, 0x9a, 0x05, 0x02, 0x59, 0x92, 0x05, 0x02, + 0x69, 0x94, 0x05, 0x02, 0x7a, 0x02, 0x9d, 0x69, 0x96, 0x05, 0x02, 0x31, + 0x9a, 0x05, 0x02, 0x39, 0x92, 0x05, 0x02, 0x48, 0x96, 0x05, 0x02, 0x81, + 0x9a, 0x05, 0x02, 0x89, 0x92, 0x05, 0x02, 0x99, 0x94, 0x05, 0x02, 0xaa, + 0x02, 0x9d, 0x6d, 0x96, 0x05, 0x06, 0x69, 0x9a, 0x05, 0x06, 0x71, 0x92, + 0x05, 0x06, 0x80, 0x96, 0x05, 0x06, 0x89, 0x9a, 0x05, 0x06, 0x91, 0x92, + 0x05, 0x06, 0xa0, 0x9a, 0x05, 0x06, 0xa9, 0x92, 0x05, 0x06, 0xb8, 0x96, + 0x05, 0x06, 0xc1, 0x9a, 0x05, 0x06, 0xc9, 0x92, 0x05, 0x06, 0xd9, 0x94, + 0x05, 0x06, 0xea, 0x02, 0x9d, 0x71, 0x96, 0x05, 0x06, 0xf1, 0x9a, 0x05, + 0x06, 0xf9, 0x92, 0x05, 0x07, 0x08, 0x96, 0x05, 0x07, 0x11, 0x9a, 0x05, + 0x07, 0x19, 0x92, 0x05, 0x07, 0x29, 0x87, 0x05, 0x07, 0x42, 0x02, 0x9d, + 0x75, 0x96, 0x05, 0x07, 0x51, 0x9a, 0x05, 0x07, 0x59, 0x92, 0x05, 0x07, + 0x68, 0x96, 0x05, 0x07, 0x71, 0x9a, 0x05, 0x07, 0x79, 0x92, 0x05, 0x07, + 0x88, 0x9a, 0x05, 0x07, 0x91, 0x92, 0x05, 0x07, 0x98, 0x96, 0x05, 0x07, + 0xa1, 0x9a, 0x05, 0x07, 0xa9, 0x92, 0x05, 0x07, 0xb9, 0x94, 0x05, 0x07, + 0xca, 0x02, 0x9d, 0x81, 0x96, 0x05, 0x07, 0xd1, 0x9a, 0x05, 0x07, 0xd9, + 0x92, 0x05, 0x07, 0xe9, 0x94, 0x05, 0x07, 0xfa, 0x02, 0x9d, 0x85, 0x96, + 0x05, 0x08, 0x01, 0x9a, 0x05, 0x08, 0x09, 0x92, 0x05, 0x08, 0x19, 0x87, + 0x05, 0x08, 0x2a, 0x02, 0x9d, 0x89, 0x96, 0x05, 0x08, 0x39, 0x9a, 0x05, + 0x08, 0x41, 0x92, 0x05, 0x08, 0x50, 0x91, 0x05, 0x08, 0x63, 0x02, 0x9d, + 0x91, 0x96, 0x05, 0x08, 0xa1, 0x9a, 0x05, 0x08, 0xa9, 0x92, 0x05, 0x08, + 0xb9, 0x94, 0x05, 0x08, 0xca, 0x02, 0x9d, 0x95, 0x96, 0x05, 0x08, 0x69, + 0x9a, 0x05, 0x08, 0x71, 0x92, 0x05, 0x08, 0x80, 0x9a, 0x05, 0x08, 0x89, + 0x92, 0x05, 0x08, 0x98, 0x8b, 0x05, 0x09, 0xc3, 0x02, 0x9d, 0x99, 0x83, + 0x05, 0x09, 0xfb, 0x02, 0x9d, 0xac, 0x97, 0x05, 0x0a, 0x6b, 0x02, 0x9d, + 0xb8, 0x91, 0x05, 0x0a, 0x33, 0x02, 0x9d, 0xd2, 0x87, 0x05, 0x0a, 0x4a, + 0x02, 0x9d, 0xde, 0x96, 0x05, 0x08, 0xd1, 0x9a, 0x05, 0x08, 0xd9, 0x92, + 0x05, 0x08, 0xe9, 0x87, 0x05, 0x08, 0xfa, 0x02, 0x9d, 0xe6, 0x96, 0x05, + 0x09, 0x09, 0x9a, 0x05, 0x09, 0x11, 0x92, 0x05, 0x09, 0x20, 0x91, 0x05, + 0x09, 0x3b, 0x02, 0x9d, 0xee, 0x96, 0x05, 0x09, 0x81, 0x9a, 0x05, 0x09, + 0x89, 0x92, 0x05, 0x09, 0x99, 0x94, 0x05, 0x09, 0xaa, 0x02, 0x9d, 0xfa, + 0x96, 0x05, 0x09, 0x49, 0x9a, 0x05, 0x09, 0x51, 0x92, 0x05, 0x09, 0x60, + 0x9a, 0x05, 0x09, 0x69, 0x92, 0x05, 0x09, 0x78, 0x96, 0x05, 0x0d, 0xb9, + 0x9a, 0x05, 0x0d, 0xc1, 0x92, 0x05, 0x0d, 0xd1, 0x87, 0x05, 0x0d, 0xea, + 0x02, 0x9d, 0xfe, 0x96, 0x05, 0x0d, 0xf9, 0x9a, 0x05, 0x0e, 0x01, 0x92, + 0x05, 0x0e, 0x10, 0x91, 0x05, 0x0e, 0x2b, 0x02, 0x9e, 0x0a, 0x96, 0x05, + 0x0e, 0x71, 0x9a, 0x05, 0x0e, 0x79, 0x92, 0x05, 0x0e, 0x89, 0x94, 0x05, + 0x0e, 0x9a, 0x02, 0x9e, 0x16, 0x96, 0x05, 0x0e, 0x39, 0x9a, 0x05, 0x0e, + 0x41, 0x92, 0x05, 0x0e, 0x50, 0x9a, 0x05, 0x0e, 0x59, 0x92, 0x05, 0x0e, + 0x68, 0x96, 0x05, 0x0e, 0xa1, 0x9a, 0x05, 0x0e, 0xa9, 0x92, 0x05, 0x0e, + 0xb9, 0x94, 0x05, 0x0e, 0xca, 0x02, 0x9e, 0x1a, 0x96, 0x05, 0x0e, 0xd1, + 0x9a, 0x05, 0x0e, 0xd9, 0x92, 0x05, 0x0e, 0xe9, 0x87, 0x05, 0x0f, 0x02, + 0x02, 0x9e, 0x1e, 0x96, 0x05, 0x0f, 0x11, 0x9a, 0x05, 0x0f, 0x19, 0x92, + 0x05, 0x0f, 0x28, 0x91, 0x05, 0x0f, 0x43, 0x02, 0x9e, 0x2a, 0x96, 0x05, + 0x0f, 0x91, 0x9a, 0x05, 0x0f, 0x99, 0x92, 0x05, 0x0f, 0xa9, 0x94, 0x05, + 0x0f, 0xba, 0x02, 0x9e, 0x36, 0x96, 0x05, 0x0f, 0x51, 0x9a, 0x05, 0x0f, + 0x59, 0x92, 0x05, 0x0f, 0x68, 0x96, 0x05, 0x0f, 0x71, 0x9a, 0x05, 0x0f, + 0x79, 0x92, 
0x05, 0x0f, 0x88, 0x8b, 0x05, 0x10, 0xb3, 0x02, 0x9e, 0x3a, + 0x83, 0x05, 0x10, 0xe3, 0x02, 0x9e, 0x49, 0x97, 0x05, 0x11, 0x63, 0x02, + 0x9e, 0x55, 0x91, 0x05, 0x11, 0x23, 0x02, 0x9e, 0x6f, 0x87, 0x05, 0x11, + 0x42, 0x02, 0x9e, 0x7b, 0x96, 0x05, 0x0f, 0xc1, 0x9a, 0x05, 0x0f, 0xc9, + 0x92, 0x05, 0x0f, 0xd9, 0x87, 0x05, 0x0f, 0xea, 0x02, 0x9e, 0x87, 0x96, + 0x05, 0x0f, 0xf9, 0x9a, 0x05, 0x10, 0x01, 0x92, 0x05, 0x10, 0x10, 0x91, + 0x05, 0x10, 0x23, 0x02, 0x9e, 0x8f, 0x96, 0x05, 0x10, 0x71, 0x9a, 0x05, + 0x10, 0x79, 0x92, 0x05, 0x10, 0x89, 0x94, 0x05, 0x10, 0x9a, 0x02, 0x9e, + 0x97, 0x96, 0x05, 0x10, 0x31, 0x9a, 0x05, 0x10, 0x39, 0x92, 0x05, 0x10, + 0x48, 0x96, 0x05, 0x10, 0x51, 0x9a, 0x05, 0x10, 0x59, 0x92, 0x05, 0x10, + 0x68, 0x87, 0x05, 0x25, 0xd8, 0xc2, 0x00, 0x7e, 0x05, 0x24, 0x99, 0xc2, + 0x00, 0x11, 0x05, 0x25, 0x38, 0x92, 0x05, 0x24, 0xa1, 0x96, 0x05, 0x25, + 0x18, 0x9b, 0x05, 0x25, 0x81, 0xc2, 0x00, 0x33, 0x05, 0x25, 0xd1, 0xc2, + 0x00, 0xfe, 0x05, 0x26, 0x01, 0xc2, 0x00, 0x11, 0x05, 0x26, 0x10, 0xc2, + 0x00, 0x11, 0x05, 0x24, 0xb1, 0xc2, 0x01, 0xba, 0x05, 0x25, 0x30, 0xc2, + 0x00, 0x8d, 0x05, 0x24, 0xc9, 0xc2, 0x01, 0xba, 0x05, 0x24, 0xf9, 0xc2, + 0x00, 0x11, 0x05, 0x25, 0xf8, 0x92, 0x05, 0x25, 0x11, 0x94, 0x05, 0x26, + 0x08, 0xc2, 0x00, 0xa4, 0x05, 0x25, 0x51, 0x9b, 0x05, 0x25, 0xa9, 0xc2, + 0x02, 0x0a, 0x05, 0x25, 0xb8, 0x8e, 0x08, 0x74, 0x60, 0xc3, 0x32, 0xce, + 0x08, 0x74, 0x41, 0xc2, 0x03, 0x4e, 0x08, 0x74, 0x38, 0x44, 0xe1, 0x77, + 0x42, 0x9e, 0x9b, 0x8b, 0x00, 0xa7, 0x70, 0x91, 0x00, 0xa8, 0xeb, 0x02, + 0x9e, 0xb9, 0x83, 0x00, 0xa9, 0x0b, 0x02, 0x9e, 0xc1, 0x8b, 0x00, 0xa8, + 0xcb, 0x02, 0x9e, 0xc5, 0x87, 0x00, 0xa8, 0xb8, 0x9b, 0x00, 0xc6, 0x09, + 0x83, 0x00, 0xa8, 0xb0, 0x9b, 0x00, 0xc6, 0x01, 0x91, 0x00, 0xa8, 0xa0, + 0x8b, 0x00, 0xa8, 0x90, 0xc2, 0x16, 0x1c, 0x00, 0xa4, 0x29, 0xc2, 0x14, + 0x77, 0x00, 0xa4, 0x31, 0xc2, 0x38, 0x2a, 0x00, 0xa4, 0x39, 0xc2, 0x02, + 0x98, 0x00, 0xa4, 0x40, 0x83, 0x00, 0xa8, 0x10, 0x8b, 0x00, 0xa7, 0xd0, + 0x91, 0x00, 0xa7, 0xf0, 0x43, 0x67, 0xcd, 0xc2, 0x9e, 0xc9, 0x0a, 0x42, + 0x9e, 0xde, 0xc4, 0xdf, 0x5b, 0x00, 0xa9, 0xe9, 0x19, 0xc2, 0x9e, 0xf3, + 0x15, 0xc2, 0x9e, 0xff, 0xc4, 0xe0, 0xc3, 0x00, 0xa4, 0x11, 0xc4, 0xe3, + 0x43, 0x00, 0xa5, 0x01, 0xc4, 0xda, 0xeb, 0x00, 0xa5, 0xd1, 0xc4, 0xe4, + 0x67, 0x00, 0xa6, 0x79, 0xc4, 0xde, 0xb6, 0x00, 0xa3, 0x28, 0x8b, 0x00, + 0xa6, 0x08, 0x91, 0x00, 0xc6, 0x60, 0x8b, 0x00, 0xc6, 0x40, 0x83, 0x00, + 0xa6, 0x68, 0x83, 0x00, 0xb3, 0xb0, 0x91, 0x00, 0xb3, 0xa0, 0x8b, 0x00, + 0xb3, 0x90, 0x8b, 0x00, 0xb3, 0x81, 0x83, 0x00, 0xac, 0xa2, 0x02, 0x9f, + 0x26, 0x91, 0x00, 0xac, 0x90, 0x8b, 0x00, 0xac, 0x80, 0x83, 0x00, 0xab, + 0xcb, 0x02, 0x9f, 0x2a, 0x91, 0x00, 0xab, 0xbb, 0x02, 0x9f, 0x2e, 0x8b, + 0x00, 0xab, 0xab, 0x02, 0x9f, 0x32, 0x87, 0x00, 0xab, 0xa0, 0x8b, 0x00, + 0xab, 0x18, 0x06, 0xc2, 0x9f, 0x36, 0x0c, 0xc2, 0x9f, 0x46, 0x09, 0xc2, + 0x9f, 0x67, 0x16, 0xc2, 0x9f, 0x89, 0x42, 0x11, 0xee, 0xc2, 0x9f, 0x99, + 0x1b, 0xc2, 0x9f, 0xb0, 0x0f, 0xc2, 0x9f, 0xc7, 0x10, 0xc2, 0x9f, 0xde, + 0x0d, 0xc2, 0x9f, 0xf9, 0x92, 0x00, 0xaf, 0x73, 0x02, 0xa0, 0x04, 0x8a, + 0x00, 0xa2, 0x5b, 0x02, 0xa0, 0x1b, 0x19, 0xc2, 0xa0, 0x29, 0x14, 0xc2, + 0xa0, 0x40, 0x0e, 0xc2, 0xa0, 0x57, 0xc2, 0x02, 0xe0, 0x00, 0xa0, 0x41, + 0x8b, 0x00, 0xa0, 0x4b, 0x02, 0xa0, 0x72, 0x9c, 0x00, 0xb2, 0x33, 0x02, + 0xa0, 0x78, 0x15, 0x42, 0xa0, 0x8f, 0x8b, 0x00, 0xa4, 0x50, 0x91, 0x00, + 0xa4, 0xd0, 0x8b, 0x00, 0xa4, 0xb0, 0x83, 0x00, 0xa4, 0xf0, 0x83, 0x00, + 0xad, 0xb9, 0x91, 0x00, 0xad, 0xb1, 0x8b, 0x00, 0xad, 0xa9, 0x87, 0x00, + 0xad, 0xa0, 
0x83, 0x00, 0xad, 0xf9, 0x91, 0x00, 0xad, 0xf1, 0x8b, 0x00, + 0xad, 0xe9, 0x87, 0x00, 0xad, 0xe0, 0x83, 0x00, 0xad, 0xd9, 0x91, 0x00, + 0xad, 0xd1, 0x8b, 0x00, 0xad, 0xc9, 0x87, 0x00, 0xad, 0xc0, 0x91, 0x00, + 0xc7, 0x48, 0x83, 0x00, 0xab, 0x73, 0x02, 0xa0, 0xad, 0x91, 0x00, 0xab, + 0x6b, 0x02, 0xa0, 0xb1, 0xc2, 0x00, 0x28, 0x00, 0xc7, 0x29, 0x8b, 0x00, + 0xab, 0x61, 0x87, 0x00, 0xab, 0x58, 0x83, 0x00, 0xc7, 0x23, 0x02, 0xa0, + 0xb5, 0x87, 0x00, 0xc7, 0x18, 0x83, 0x00, 0xad, 0x63, 0x02, 0xa0, 0xb9, + 0x91, 0x00, 0xad, 0x53, 0x02, 0xa0, 0xbd, 0x8b, 0x00, 0xad, 0x43, 0x02, + 0xa0, 0xc1, 0x87, 0x00, 0xad, 0x38, 0x83, 0x00, 0xab, 0x38, 0x91, 0x00, + 0xab, 0x28, 0x8b, 0x00, 0xab, 0x10, 0x8b, 0x00, 0xa2, 0x68, 0x91, 0x00, + 0xa2, 0xf8, 0x8b, 0x00, 0xa2, 0xd8, 0x83, 0x00, 0xa3, 0x18, 0x46, 0x92, + 0x9a, 0xc2, 0xa0, 0xc5, 0xc5, 0xbc, 0x9d, 0x00, 0xc6, 0xe8, 0x48, 0xba, + 0x1a, 0x42, 0xa1, 0x0c, 0x83, 0x00, 0xaa, 0x70, 0x91, 0x00, 0xc6, 0x90, + 0x8b, 0x00, 0xc6, 0x80, 0x8b, 0x00, 0xaa, 0x28, 0x14, 0xc2, 0xa1, 0x1b, + 0x15, 0xc2, 0xa1, 0x25, 0xc5, 0x31, 0xee, 0x00, 0xa0, 0xf9, 0xc5, 0x1f, + 0x0c, 0x00, 0xa1, 0x01, 0xd0, 0x58, 0x02, 0x00, 0xa1, 0x09, 0xcd, 0x7f, + 0x3f, 0x00, 0xa1, 0x11, 0x42, 0x00, 0x58, 0xc2, 0xa1, 0x31, 0xca, 0x3b, + 0x06, 0x00, 0xa1, 0x39, 0xc4, 0x25, 0xd5, 0x00, 0xa1, 0x48, 0x8b, 0x00, + 0xaa, 0xa0, 0x8a, 0x00, 0xc6, 0xd8, 0x19, 0x42, 0xa1, 0x3d, 0x8b, 0x00, + 0xa9, 0x38, 0x83, 0x00, 0xa9, 0xd8, 0x91, 0x00, 0xa9, 0xb8, 0x8b, 0x00, + 0xa9, 0x98, 0xc3, 0x14, 0x72, 0x00, 0xa2, 0x41, 0xc2, 0x01, 0x24, 0x00, + 0xa1, 0xa8, 0x8b, 0x00, 0xa6, 0xa0, 0x83, 0x00, 0xad, 0x28, 0x91, 0x00, + 0xad, 0x18, 0x8b, 0x00, 0xad, 0x08, 0x8b, 0x00, 0xa7, 0x00, 0x91, 0x00, + 0xa7, 0x20, 0x83, 0x00, 0xa7, 0x40, 0x8b, 0x00, 0xa5, 0x20, 0x94, 0x00, + 0xaa, 0x91, 0x8e, 0x00, 0xa7, 0x60, 0xca, 0xa5, 0x8a, 0x00, 0xa8, 0x48, + 0x8b, 0x00, 0xa5, 0x80, 0x91, 0x00, 0xa5, 0xa0, 0x83, 0x00, 0xa5, 0xc0, + 0x9b, 0x00, 0xc5, 0xc9, 0x83, 0x00, 0xa4, 0x00, 0x8b, 0x00, 0xa3, 0xc0, + 0x91, 0x00, 0xa3, 0xe0, 0x8b, 0x00, 0xa3, 0x60, 0x9b, 0x00, 0xc5, 0xb1, + 0x91, 0x00, 0xa2, 0x10, 0x83, 0x00, 0xa2, 0x30, 0x8b, 0x00, 0xa1, 0xf0, + 0x8b, 0x00, 0xa1, 0x80, 0x8b, 0x00, 0xab, 0xf0, 0x97, 0x08, 0x15, 0xd9, + 0x9f, 0x08, 0x16, 0x41, 0xa0, 0x08, 0x16, 0x80, 0xc3, 0x4b, 0x13, 0x08, + 0x2a, 0x79, 0xc2, 0x0c, 0x42, 0x08, 0x2a, 0xa8, 0xc2, 0x00, 0x71, 0x08, + 0x29, 0xb9, 0x83, 0x08, 0x29, 0xd8, 0x83, 0x08, 0x29, 0xcb, 0x02, 0xa1, + 0x4b, 0xc2, 0x69, 0xa6, 0x08, 0x2a, 0x49, 0x8b, 0x08, 0x2a, 0x50, 0x94, + 0x08, 0x2a, 0x11, 0xc2, 0x17, 0xb6, 0x08, 0x2b, 0x00, 0x9b, 0x08, 0x2a, + 0x59, 0x99, 0x08, 0x2a, 0xf8, 0x83, 0x08, 0x29, 0xeb, 0x02, 0xa1, 0x4f, + 0xc2, 0x69, 0xa6, 0x08, 0x2a, 0xe8, 0xc2, 0x02, 0xa0, 0x01, 0x74, 0x19, + 0xc4, 0x02, 0xde, 0x01, 0x74, 0x20, 0xce, 0x70, 0x88, 0x01, 0x75, 0x31, + 0xc3, 0x00, 0xbf, 0x01, 0x76, 0x30, 0xc3, 0xac, 0xc1, 0x01, 0x76, 0x61, + 0xc4, 0x8e, 0x34, 0x01, 0x77, 0x40, 0x89, 0x01, 0x8f, 0x08, 0x83, 0x05, + 0x5b, 0xb1, 0x87, 0x05, 0x5b, 0xc1, 0x8b, 0x05, 0x5b, 0xc9, 0x91, 0x05, + 0x5b, 0xd1, 0x97, 0x05, 0x5b, 0xd9, 0x98, 0x05, 0x5b, 0xe0, 0x83, 0x05, + 0x5d, 0xf9, 0x87, 0x00, 0x9f, 0xc1, 0x8b, 0x00, 0x9f, 0xc9, 0x91, 0x00, + 0x9f, 0xd1, 0x97, 0x00, 0x9f, 0xd9, 0x98, 0x00, 0x9f, 0xe0, 0x98, 0x05, + 0x5d, 0xf1, 0x97, 0x05, 0x5d, 0xe9, 0x91, 0x05, 0x5d, 0xe1, 0x8b, 0x05, + 0x5d, 0xd9, 0x87, 0x05, 0x5d, 0xd1, 0x83, 0x05, 0x5d, 0xc8, 0x15, 0xc2, + 0xa1, 0x53, 0x0e, 0xc2, 0xa1, 0x6b, 0x83, 0x05, 0x5d, 0x21, 0x8b, 0x05, + 0x5d, 0x41, 0x87, 0x05, 0x5d, 0x30, 0x91, 0x05, 0x5c, 0x99, 0x8b, 0x05, + 0x5c, 0x91, 
0x87, 0x05, 0x5c, 0x89, 0x83, 0x05, 0x5c, 0x73, 0x02, 0xa1, + 0x83, 0x97, 0x05, 0x5c, 0xa1, 0x98, 0x05, 0x5c, 0xa8, 0xc2, 0x00, 0xc1, + 0x05, 0x5c, 0x79, 0x83, 0x05, 0x5b, 0xe9, 0x87, 0x05, 0x5b, 0xf1, 0x8b, + 0x05, 0x5b, 0xf9, 0x91, 0x05, 0x5c, 0x01, 0x97, 0x05, 0x5c, 0x09, 0x98, + 0x05, 0x5c, 0x10, 0x97, 0x05, 0x5c, 0x69, 0x91, 0x05, 0x5c, 0x61, 0x8b, + 0x05, 0x5c, 0x59, 0x87, 0x05, 0x5c, 0x51, 0x83, 0x05, 0x5c, 0x49, 0x98, + 0x00, 0x9f, 0xe8, 0x98, 0x05, 0x5c, 0x41, 0x97, 0x05, 0x5c, 0x39, 0x91, + 0x05, 0x5c, 0x31, 0x8b, 0x05, 0x5c, 0x29, 0x87, 0x05, 0x5c, 0x21, 0x83, + 0x05, 0x5c, 0x18, 0x83, 0x05, 0x5c, 0xb1, 0x87, 0x05, 0x5c, 0xb9, 0x8b, + 0x05, 0x5c, 0xc1, 0x91, 0x05, 0x5c, 0xc9, 0x97, 0x05, 0x5c, 0xd1, 0x98, + 0x05, 0x5c, 0xd8, 0x83, 0x05, 0x5c, 0xe1, 0x87, 0x05, 0x5c, 0xf1, 0x8b, + 0x05, 0x5c, 0xf9, 0x91, 0x05, 0x5d, 0x01, 0x97, 0x05, 0x5d, 0x09, 0x98, + 0x05, 0x5d, 0x10, 0x83, 0x05, 0x5d, 0x19, 0x87, 0x05, 0x5d, 0x29, 0x8b, + 0x05, 0x5d, 0x39, 0x91, 0x05, 0x5d, 0x49, 0x97, 0x05, 0x5d, 0x51, 0x98, + 0x05, 0x5d, 0x59, 0xc2, 0x00, 0xdb, 0x05, 0x5d, 0x60, 0x83, 0x00, 0x9d, + 0x31, 0x87, 0x00, 0x9d, 0x41, 0x8b, 0x00, 0x9d, 0x49, 0x91, 0x00, 0x9d, + 0x51, 0x97, 0x00, 0x9d, 0x59, 0x98, 0x00, 0x9d, 0x60, 0x83, 0x00, 0x9d, + 0x69, 0x87, 0x00, 0x9d, 0x71, 0x8b, 0x00, 0x9d, 0x79, 0x91, 0x00, 0x9d, + 0x81, 0x97, 0x00, 0x9d, 0x89, 0x98, 0x00, 0x9d, 0x91, 0xc2, 0x00, 0xc1, + 0x00, 0x9d, 0xf8, 0x83, 0x00, 0x9d, 0x99, 0x87, 0x00, 0x9d, 0xa1, 0x8b, + 0x00, 0x9d, 0xa9, 0x91, 0x00, 0x9d, 0xb1, 0x97, 0x00, 0x9d, 0xb9, 0x98, + 0x00, 0x9d, 0xc0, 0x83, 0x00, 0x9d, 0xc9, 0x87, 0x00, 0x9d, 0xd1, 0x8b, + 0x00, 0x9d, 0xd9, 0x91, 0x00, 0x9d, 0xe1, 0x97, 0x00, 0x9d, 0xe9, 0x98, + 0x00, 0x9f, 0xa8, 0x83, 0x00, 0x9d, 0xf3, 0x02, 0xa1, 0x87, 0x87, 0x00, + 0x9e, 0x09, 0x8b, 0x00, 0x9e, 0x11, 0x91, 0x00, 0x9e, 0x19, 0x97, 0x00, + 0x9e, 0x21, 0x98, 0x00, 0x9e, 0x28, 0x83, 0x00, 0x9e, 0x31, 0x87, 0x00, + 0x9e, 0x39, 0x8b, 0x00, 0x9e, 0x41, 0x91, 0x00, 0x9e, 0x49, 0x97, 0x00, + 0x9e, 0x51, 0x98, 0x00, 0x9e, 0x58, 0x83, 0x00, 0x9e, 0x61, 0x87, 0x00, + 0x9e, 0x71, 0x8b, 0x00, 0x9e, 0x79, 0x91, 0x00, 0x9e, 0x81, 0x97, 0x00, + 0x9e, 0x89, 0x98, 0x00, 0x9e, 0x90, 0x83, 0x00, 0x9e, 0x99, 0x87, 0x00, + 0x9e, 0xa9, 0x8b, 0x00, 0x9e, 0xb9, 0x91, 0x00, 0x9e, 0xc9, 0x97, 0x00, + 0x9e, 0xd1, 0x98, 0x00, 0x9e, 0xd9, 0xc2, 0x00, 0xdb, 0x00, 0x9e, 0xe0, + 0x83, 0x00, 0x9e, 0xa1, 0x87, 0x00, 0x9e, 0xb1, 0x8b, 0x00, 0x9e, 0xc1, + 0x0e, 0xc2, 0xa1, 0x8b, 0x15, 0x42, 0xa1, 0xa3, 0x83, 0x00, 0x9f, 0x49, + 0x87, 0x00, 0x9f, 0x51, 0x8b, 0x00, 0x9f, 0x59, 0x91, 0x00, 0x9f, 0x61, + 0x97, 0x00, 0x9f, 0x69, 0x98, 0x00, 0x9f, 0x70, 0x83, 0x00, 0x9f, 0x79, + 0x87, 0x00, 0x9f, 0x81, 0x8b, 0x00, 0x9f, 0x89, 0x91, 0x00, 0x9f, 0x91, + 0x97, 0x00, 0x9f, 0x99, 0x98, 0x00, 0x9f, 0xa0, 0xc3, 0x0e, 0xa7, 0x00, + 0x04, 0x41, 0xd2, 0x49, 0x55, 0x00, 0x04, 0x48, 0xc3, 0x39, 0x6e, 0x08, + 0x88, 0xa1, 0xc2, 0x04, 0xc6, 0x08, 0x88, 0x98, 0xc3, 0x39, 0x6e, 0x08, + 0x88, 0x91, 0xc2, 0x04, 0xc6, 0x08, 0x88, 0x88, 0x8b, 0x08, 0x8a, 0x30, + 0x83, 0x08, 0x8a, 0x29, 0x97, 0x08, 0x89, 0x79, 0x8b, 0x08, 0x89, 0x68, + 0x8b, 0x08, 0x89, 0x80, 0x97, 0x08, 0x89, 0x58, 0x8b, 0x08, 0x89, 0x48, + 0xc4, 0x18, 0x10, 0x08, 0x89, 0xe9, 0xc2, 0x22, 0xcc, 0x08, 0x89, 0xe0, + 0xc3, 0x0d, 0x14, 0x08, 0x89, 0xd9, 0xc3, 0x09, 0x9e, 0x08, 0x89, 0xd0, + 0xc4, 0x02, 0xde, 0x08, 0x89, 0xc9, 0xc2, 0x02, 0xa0, 0x08, 0x89, 0xc0, + 0xc2, 0x0f, 0xe1, 0x05, 0x50, 0x51, 0x83, 0x05, 0x50, 0x58, 0xc2, 0x25, + 0x3b, 0x05, 0x50, 0x91, 0x83, 0x05, 0x50, 0x89, 0xc2, 0x0f, 0xe1, 0x05, + 0x50, 0x80, 
0x89, 0x05, 0x52, 0x10, 0xc4, 0x18, 0x12, 0x08, 0x7e, 0x51, + 0x91, 0x08, 0x7e, 0x30, 0xd7, 0x27, 0x74, 0x0f, 0xaa, 0x08, 0xce, 0x74, + 0x24, 0x01, 0x72, 0x81, 0xcd, 0x79, 0x5b, 0x01, 0x72, 0x88, 0xc3, 0x02, + 0x44, 0x0f, 0x01, 0x51, 0xc4, 0xac, 0x24, 0x0f, 0x00, 0xb8, 0x47, 0x1c, + 0xa0, 0xc2, 0xa1, 0xbb, 0xcb, 0x98, 0x00, 0x0f, 0x00, 0x51, 0xc3, 0x78, + 0xde, 0x0f, 0x00, 0x48, 0xc6, 0xc8, 0x01, 0x0f, 0x01, 0x41, 0xc3, 0xc8, + 0x92, 0x0f, 0x00, 0x08, 0x91, 0x0f, 0x01, 0x31, 0x97, 0x0f, 0x01, 0x19, + 0xc3, 0x01, 0xbd, 0x0f, 0x01, 0x09, 0x07, 0x42, 0xa1, 0xc7, 0xc8, 0xae, + 0x6b, 0x0f, 0x01, 0x21, 0x0a, 0xc2, 0xa1, 0xd1, 0xc4, 0xe4, 0xc7, 0x0f, + 0x00, 0xa0, 0xc2, 0x00, 0xba, 0x0f, 0x00, 0xe1, 0xc5, 0xd9, 0x52, 0x0f, + 0x00, 0xa8, 0xc5, 0xdd, 0xb2, 0x0f, 0x00, 0x61, 0xc4, 0xe4, 0x2b, 0x0f, + 0x00, 0x20, 0xc5, 0xda, 0x24, 0x0f, 0x00, 0x41, 0xc6, 0xd3, 0x73, 0x0f, + 0x00, 0x30, 0x48, 0x23, 0x26, 0xc2, 0xa1, 0xdb, 0xcb, 0x94, 0x90, 0x00, + 0x1a, 0x11, 0xc7, 0xc8, 0xd9, 0x00, 0x1a, 0x19, 0xcf, 0x63, 0xd2, 0x00, + 0x1a, 0x21, 0xcd, 0x4a, 0x68, 0x00, 0x1a, 0x28, 0x45, 0xda, 0x51, 0xc2, + 0xa1, 0xe5, 0x42, 0x00, 0x5f, 0xc2, 0xa1, 0xf1, 0xcc, 0x85, 0x59, 0x00, + 0x1a, 0x78, 0xcc, 0x89, 0x25, 0x01, 0x06, 0xd1, 0xcb, 0x02, 0x5c, 0x01, + 0x06, 0xa0, 0xcb, 0x8e, 0xe4, 0x00, 0xee, 0x49, 0xc6, 0x60, 0xb1, 0x00, + 0xee, 0x38, 0xc6, 0x09, 0x01, 0x00, 0x18, 0x0b, 0x02, 0xa1, 0xf9, 0xc9, + 0x2b, 0x5f, 0x00, 0x1a, 0x08, 0x00, 0xc2, 0xa1, 0xff, 0x19, 0x42, 0xa2, + 0x17, 0xc7, 0x20, 0x88, 0x01, 0x06, 0xc1, 0xc5, 0x00, 0xd4, 0x00, 0x18, + 0x51, 0xc5, 0x05, 0x02, 0x00, 0x19, 0x28, 0xd0, 0x2c, 0x60, 0x01, 0x07, + 0x29, 0xcd, 0x52, 0x59, 0x00, 0x18, 0xa0, 0x03, 0xc2, 0xa2, 0x1d, 0x4c, + 0x02, 0x56, 0xc2, 0xa2, 0x29, 0x42, 0x00, 0xd0, 0xc2, 0xa2, 0x35, 0x4c, + 0x1a, 0x50, 0xc2, 0xa2, 0x41, 0xca, 0x9a, 0x3d, 0x00, 0x18, 0xc0, 0xdb, + 0x0b, 0x6c, 0x01, 0x07, 0x69, 0xcd, 0x7a, 0x38, 0x01, 0x07, 0x50, 0xd6, + 0x2c, 0x5a, 0x01, 0x07, 0x59, 0xd5, 0x36, 0x86, 0x01, 0x06, 0x91, 0x15, + 0x42, 0xa2, 0x4d, 0x97, 0x00, 0x1b, 0x3b, 0x02, 0xa2, 0x59, 0x91, 0x00, + 0x1b, 0x33, 0x02, 0xa2, 0x5f, 0x83, 0x00, 0x1b, 0x1b, 0x02, 0xa2, 0x65, + 0x99, 0x00, 0xef, 0x8b, 0x02, 0xa2, 0x7d, 0x87, 0x00, 0x1b, 0x23, 0x02, + 0xa2, 0x83, 0x92, 0x00, 0xef, 0x71, 0x8e, 0x00, 0xee, 0xeb, 0x02, 0xa2, + 0x8f, 0x88, 0x00, 0xef, 0x5b, 0x02, 0xa2, 0x9b, 0x95, 0x00, 0xef, 0x23, + 0x02, 0xa2, 0xa1, 0x84, 0x00, 0xef, 0x43, 0x02, 0xa2, 0xa7, 0x9c, 0x00, + 0xef, 0x31, 0x94, 0x00, 0x1b, 0x63, 0x02, 0xa2, 0xad, 0x90, 0x00, 0xef, + 0x01, 0x8d, 0x00, 0xee, 0xe1, 0x89, 0x00, 0xee, 0xd1, 0x8b, 0x00, 0x1b, + 0x2b, 0x02, 0xa2, 0xb1, 0x85, 0x00, 0x1b, 0x43, 0x02, 0xa2, 0xb7, 0x96, + 0x00, 0x1b, 0x6b, 0x02, 0xa2, 0xbd, 0x86, 0x00, 0x1b, 0x49, 0x8a, 0x00, + 0x1b, 0x51, 0x8f, 0x00, 0x1b, 0x59, 0x98, 0x00, 0x1b, 0x71, 0x9a, 0x00, + 0x1b, 0x78, 0x94, 0x00, 0xef, 0x11, 0x90, 0x00, 0xef, 0x09, 0x8f, 0x00, + 0xee, 0xf9, 0x8e, 0x00, 0xee, 0xf1, 0x89, 0x00, 0xee, 0xd8, 0xc9, 0x0f, + 0x6e, 0x07, 0xf1, 0x03, 0x02, 0xa2, 0xc3, 0xca, 0x09, 0xb7, 0x07, 0xf1, + 0x0a, 0x02, 0xa2, 0xc9, 0xc5, 0x05, 0x02, 0x00, 0x19, 0x81, 0xc7, 0x20, + 0x88, 0x00, 0x19, 0xa1, 0xcf, 0x66, 0x57, 0x07, 0xf1, 0x49, 0xd0, 0x5d, + 0x42, 0x07, 0xf1, 0x50, 0x00, 0xc2, 0xa2, 0xcf, 0xd3, 0x41, 0x5e, 0x00, + 0xd5, 0x80, 0x00, 0xc2, 0xa3, 0x1f, 0x44, 0x00, 0xde, 0x42, 0xa3, 0x31, + 0xcb, 0x03, 0xbc, 0x00, 0xd5, 0x99, 0xcb, 0x9a, 0x3c, 0x00, 0x18, 0xf0, + 0xcd, 0x7a, 0x79, 0x05, 0x47, 0x89, 0x47, 0x02, 0x0e, 0xc2, 0xa3, 0x3d, + 0x46, 0x09, 0x97, 0x42, 0xa3, 0x63, 0xc5, 0x50, 0xb1, 0x01, 0x07, 0x11, + 0xc5, 0x0b, 
0x0a, 0x01, 0x06, 0xf0, 0xca, 0x02, 0xfd, 0x01, 0x07, 0x00, + 0xce, 0x74, 0xb0, 0x00, 0x24, 0x41, 0xcd, 0x33, 0xee, 0x05, 0x33, 0x88, + 0xc7, 0xc8, 0xee, 0x00, 0x24, 0x39, 0xcd, 0x7a, 0x04, 0x00, 0x24, 0x31, + 0x03, 0x42, 0xa3, 0x87, 0xc4, 0x90, 0x77, 0x00, 0x24, 0x1b, 0x02, 0xa3, + 0x93, 0xd0, 0x5c, 0xc2, 0x05, 0x33, 0x81, 0xd5, 0x33, 0xe6, 0x05, 0x33, + 0x90, 0x07, 0xc2, 0xa3, 0x97, 0x8b, 0x05, 0x33, 0xab, 0x02, 0xa3, 0xb2, + 0x97, 0x05, 0x33, 0xbb, 0x02, 0xa3, 0xbc, 0x1b, 0xc2, 0xa3, 0xc2, 0xc2, + 0x00, 0xd0, 0x01, 0x6f, 0x7b, 0x02, 0xa3, 0xd6, 0x15, 0xc2, 0xa3, 0xdc, + 0x91, 0x01, 0x6f, 0x53, 0x02, 0xa3, 0xe6, 0x04, 0xc2, 0xa3, 0xec, 0xc2, + 0x00, 0x5f, 0x01, 0x6f, 0x09, 0xc3, 0xc0, 0x19, 0x01, 0x6f, 0x11, 0x06, + 0xc2, 0xa3, 0xf6, 0x1c, 0xc2, 0xa4, 0x00, 0xc2, 0x02, 0x2b, 0x01, 0x6f, + 0x31, 0xc2, 0x00, 0x67, 0x01, 0x6f, 0x59, 0x16, 0xc2, 0xa4, 0x0a, 0xc3, + 0x28, 0x28, 0x01, 0x6f, 0x89, 0xc4, 0xe0, 0x1b, 0x01, 0x6f, 0xa1, 0x83, + 0x01, 0x6f, 0xb1, 0xcc, 0x82, 0x05, 0x01, 0x6f, 0xc9, 0xca, 0x51, 0x7f, + 0x01, 0x6f, 0xe8, 0xc6, 0x05, 0x01, 0x00, 0x19, 0x60, 0xc5, 0x00, 0xd4, + 0x00, 0x18, 0x9b, 0x02, 0xa4, 0x14, 0xc5, 0x05, 0x02, 0x00, 0x19, 0x30, + 0xc6, 0x05, 0x01, 0x07, 0xf1, 0x68, 0xcd, 0x42, 0x35, 0x00, 0x19, 0xa9, + 0xce, 0x2c, 0x62, 0x00, 0x19, 0xb8, 0xc7, 0xc1, 0x31, 0x00, 0xee, 0x59, + 0xc6, 0x05, 0x01, 0x00, 0x19, 0x70, 0xc5, 0x05, 0x02, 0x00, 0x19, 0x51, + 0xc5, 0x00, 0xd4, 0x00, 0x1a, 0x30, 0xc5, 0x00, 0xd4, 0x00, 0xef, 0xa9, + 0xc5, 0x05, 0x02, 0x00, 0x18, 0xe8, 0x4c, 0x83, 0x6d, 0xc2, 0xa4, 0x1a, + 0x42, 0x00, 0x38, 0x42, 0xa4, 0x26, 0xc5, 0x1d, 0x88, 0x00, 0xee, 0x61, + 0xc5, 0x1f, 0x0c, 0x00, 0xee, 0x31, 0xc5, 0x31, 0xee, 0x00, 0xee, 0x20, + 0xc5, 0x05, 0x02, 0x00, 0x19, 0x89, 0xc9, 0x0f, 0x6e, 0x07, 0xf1, 0x23, + 0x02, 0xa4, 0x35, 0xca, 0x09, 0xb7, 0x07, 0xf1, 0x2a, 0x02, 0xa4, 0x3b, + 0xc7, 0x20, 0x88, 0x00, 0xd5, 0xf1, 0xc5, 0x05, 0x02, 0x00, 0xd5, 0xe9, + 0xc5, 0x00, 0xd4, 0x00, 0xd5, 0xd8, 0xc4, 0x18, 0x10, 0x0e, 0x9b, 0x79, + 0xc2, 0x22, 0xcc, 0x0e, 0x9b, 0x70, 0xc3, 0x0d, 0x14, 0x0e, 0x9b, 0x69, + 0xc3, 0x09, 0x9e, 0x0e, 0x9b, 0x60, 0xc4, 0x02, 0xde, 0x0e, 0x9b, 0x59, + 0xc2, 0x02, 0xa0, 0x0e, 0x9b, 0x50, 0xc4, 0x18, 0x10, 0x0e, 0x9b, 0x31, + 0xc2, 0x22, 0xcc, 0x0e, 0x9b, 0x28, 0xc3, 0x0d, 0x14, 0x0e, 0x9b, 0x21, + 0xc3, 0x09, 0x9e, 0x0e, 0x9b, 0x18, 0xc4, 0x02, 0xde, 0x0e, 0x9b, 0x11, + 0xc2, 0x02, 0xa0, 0x0e, 0x9b, 0x08, 0xe0, 0x0a, 0x07, 0x01, 0x17, 0xd8, + 0xcc, 0x23, 0x9f, 0x01, 0x15, 0xa8, 0x0a, 0xc2, 0xa4, 0x41, 0xc3, 0x0b, + 0x65, 0x01, 0x64, 0xa9, 0xc2, 0x00, 0xba, 0x01, 0x64, 0xe8, 0xc3, 0x01, + 0x69, 0x00, 0x1f, 0x49, 0xc3, 0x00, 0xfe, 0x01, 0x64, 0x78, 0xc4, 0xd0, + 0x3f, 0x00, 0x1f, 0x59, 0xc3, 0x0a, 0x8c, 0x01, 0x64, 0x28, 0x0a, 0xc2, + 0xa4, 0x4b, 0xc2, 0x00, 0x59, 0x01, 0x64, 0x59, 0xc3, 0x07, 0x4a, 0x01, + 0x65, 0x29, 0xc4, 0x87, 0xf5, 0x01, 0x66, 0x08, 0xc2, 0x06, 0xdb, 0x00, + 0x1f, 0x79, 0xc4, 0xe2, 0x73, 0x01, 0x64, 0x39, 0x49, 0xa9, 0x00, 0x42, + 0xa4, 0x57, 0xc3, 0xe5, 0xe7, 0x01, 0x64, 0x09, 0xcc, 0x8c, 0x3d, 0x01, + 0x66, 0x48, 0xc5, 0xd6, 0xd7, 0x01, 0x64, 0x89, 0xc2, 0x20, 0xec, 0x01, + 0x65, 0x38, 0xc4, 0xe1, 0xcf, 0x01, 0x64, 0xb9, 0xca, 0xa7, 0x7e, 0x01, + 0x66, 0x88, 0xc2, 0x00, 0x59, 0x01, 0x65, 0x89, 0x43, 0x1d, 0xbb, 0x42, + 0xa4, 0x6f, 0x8b, 0x01, 0x65, 0x09, 0xc2, 0x00, 0xba, 0x01, 0x65, 0x78, + 0x8b, 0x01, 0x65, 0x59, 0xc2, 0x06, 0xdb, 0x00, 0x1f, 0x28, 0x4c, 0x1d, + 0xdd, 0xc2, 0xa4, 0x7b, 0xca, 0x9b, 0xa8, 0x01, 0x66, 0x18, 0xc2, 0x02, + 0xfa, 0x01, 0x67, 0x21, 0xc5, 0xd6, 0xe1, 0x01, 0x67, 0x48, 0xc6, 0xd1, + 0x21, 0x01, 
0x67, 0x39, 0xc9, 0xa9, 0x75, 0x01, 0x67, 0x50, 0xc3, 0x01, + 0x69, 0x00, 0x1f, 0x41, 0xc3, 0x00, 0xfe, 0x01, 0x64, 0x70, 0xc4, 0xd0, + 0x3f, 0x00, 0x1f, 0x51, 0xc3, 0x0a, 0x8c, 0x01, 0x64, 0x20, 0x0a, 0xc2, + 0xa4, 0x93, 0xc2, 0x00, 0x59, 0x01, 0x64, 0x51, 0xc3, 0x07, 0x4a, 0x01, + 0x65, 0x21, 0xc4, 0x87, 0xf5, 0x01, 0x66, 0x00, 0xc2, 0x06, 0xdb, 0x00, + 0x1f, 0x71, 0xc4, 0xe2, 0x73, 0x01, 0x64, 0x31, 0x49, 0xa9, 0x00, 0x42, + 0xa4, 0x9f, 0xc3, 0xe5, 0xe7, 0x01, 0x64, 0x01, 0xcc, 0x8c, 0x3d, 0x01, + 0x66, 0x40, 0xc5, 0xd6, 0xd7, 0x01, 0x64, 0x81, 0xc2, 0x20, 0xec, 0x01, + 0x65, 0x30, 0xc3, 0x0b, 0x65, 0x01, 0x64, 0xa1, 0xc2, 0x00, 0xba, 0x01, + 0x64, 0xe1, 0x0a, 0x42, 0xa4, 0xb7, 0xc4, 0xe1, 0xcf, 0x01, 0x64, 0xb1, + 0xca, 0xa7, 0x7e, 0x01, 0x66, 0x80, 0xc2, 0x00, 0x59, 0x01, 0x65, 0x81, + 0x43, 0x1d, 0xbb, 0x42, 0xa4, 0xc1, 0x8b, 0x01, 0x65, 0x01, 0xc2, 0x00, + 0xba, 0x01, 0x65, 0x70, 0x8b, 0x01, 0x65, 0x51, 0xc2, 0x06, 0xdb, 0x00, + 0x1f, 0x20, 0x4c, 0x1d, 0xdd, 0xc2, 0xa4, 0xcd, 0xca, 0x9b, 0xa8, 0x01, + 0x66, 0x10, 0xc5, 0xd6, 0xc8, 0x01, 0x67, 0x81, 0xc5, 0x3b, 0x5e, 0x01, + 0x67, 0x88, 0xc2, 0x02, 0xa0, 0x08, 0x17, 0x11, 0xc4, 0x02, 0xde, 0x08, + 0x17, 0x18, 0xc3, 0x09, 0x9e, 0x08, 0x17, 0x21, 0xc3, 0x0d, 0x14, 0x08, + 0x17, 0x28, 0xc2, 0x22, 0xcc, 0x08, 0x17, 0x31, 0xc4, 0x18, 0x10, 0x08, + 0x17, 0x38, 0xc2, 0x00, 0xc4, 0x08, 0x17, 0x51, 0x19, 0xc2, 0xa4, 0xe5, + 0x0a, 0x42, 0xa4, 0xf1, 0x11, 0xc2, 0xa4, 0xfd, 0x0b, 0x42, 0xa5, 0x09, + 0x42, 0x22, 0xcc, 0xc2, 0xa5, 0x15, 0x44, 0x18, 0x10, 0x42, 0xa5, 0x21, + 0x9b, 0x08, 0x17, 0x89, 0xc8, 0x0d, 0x03, 0x08, 0x17, 0xd0, 0xc2, 0x0d, + 0x10, 0x08, 0x17, 0x91, 0xc8, 0x0d, 0x03, 0x08, 0x17, 0xd8, 0xd2, 0x4a, + 0x09, 0x01, 0x52, 0x80, 0xcc, 0x23, 0x9f, 0x01, 0x56, 0x88, 0xcc, 0x23, + 0x9f, 0x01, 0x56, 0x90, 0xe0, 0x05, 0x67, 0x0f, 0xa8, 0x0a, 0x02, 0xa5, + 0x2d, 0x44, 0x22, 0x44, 0xc2, 0xa5, 0x33, 0x11, 0x42, 0xa5, 0x3f, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x29, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xc8, 0xc7, + 0xc1, 0xd9, 0x0f, 0xaa, 0xf1, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0x90, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x31, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xd0, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x19, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xb8, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x11, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xb0, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x09, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xa8, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x01, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xa0, 0xc7, + 0xc1, 0xd9, 0x0f, 0xaa, 0xf9, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0x98, 0x00, + 0xc2, 0xa5, 0x4b, 0xc9, 0xae, 0xe8, 0x01, 0x36, 0x90, 0x0d, 0xc2, 0xa5, + 0x5a, 0xc5, 0xd9, 0x61, 0x01, 0x93, 0x0b, 0x02, 0xa5, 0x6c, 0x16, 0xc2, + 0xa5, 0x72, 0xc5, 0xd6, 0x8c, 0x01, 0x93, 0x1b, 0x02, 0xa5, 0x84, 0xc5, + 0xda, 0xe7, 0x01, 0x93, 0x23, 0x02, 0xa5, 0x8a, 0x12, 0xc2, 0xa5, 0x90, + 0xc4, 0xad, 0x2b, 0x01, 0x93, 0x33, 0x02, 0xa5, 0xa2, 0xc5, 0xb7, 0x9d, + 0x01, 0x93, 0x3b, 0x02, 0xa5, 0xa8, 0x05, 0xc2, 0xa5, 0xac, 0xc5, 0x90, + 0xe4, 0x01, 0x93, 0x6a, 0x02, 0xa5, 0xbe, 0xc4, 0x0e, 0x6a, 0x01, 0x39, + 0x51, 0xc6, 0x1c, 0xb4, 0x01, 0x4d, 0xf0, 0x44, 0x09, 0x99, 0xc2, 0xa5, + 0xc4, 0x48, 0x30, 0xf3, 0x42, 0xa5, 0xe8, 0xca, 0x30, 0xb2, 0x01, 0x14, + 0xc9, 0x0e, 0x42, 0xa5, 0xf4, 0x4d, 0x29, 0xb9, 0xc2, 0xa5, 0xfa, 0x4f, + 0x0b, 0x17, 0x42, 0xa6, 0x62, 0x42, 0x00, 0x28, 0xc2, 0xa6, 0xca, 0x44, + 0x0d, 0x0d, 0xc2, 0xa6, 0xd9, 0xc2, 0x00, 0xc4, 0x01, 0x23, 0x4a, 0x02, + 0xa6, 0xe6, 0x44, 0x00, 0x2d, 0xc2, 0xa6, 0xec, 0xc5, 0x66, 0xb1, 0x01, + 0x23, 0x50, 0x45, 0x18, 0x10, 0xc2, 0xa6, 0xf8, 0x43, 0x22, 0xcc, 0x42, + 0xa7, 0x04, 
0x43, 0x14, 0x07, 0xc2, 0xa7, 0x10, 0x11, 0x42, 0xa7, 0x1d, + 0xc5, 0x03, 0xc7, 0x01, 0x1c, 0x50, 0xd6, 0x30, 0xe8, 0x01, 0x4d, 0xe1, + 0xc6, 0x01, 0xa1, 0x0f, 0x88, 0x70, 0xe0, 0x08, 0xe7, 0x01, 0x51, 0xb0, + 0x03, 0xc2, 0xa7, 0x2c, 0xc8, 0x2c, 0xb2, 0x01, 0x92, 0x21, 0x0d, 0xc2, + 0xa7, 0x44, 0x15, 0xc2, 0xa7, 0x50, 0xc3, 0x05, 0x14, 0x01, 0x94, 0x01, + 0x16, 0xc2, 0xa7, 0x74, 0x08, 0xc2, 0xa7, 0x86, 0x07, 0xc2, 0xa7, 0x96, + 0x10, 0xc2, 0xa7, 0xae, 0x0f, 0xc2, 0xa7, 0xb8, 0x19, 0xc2, 0xa7, 0xc8, + 0x0a, 0xc2, 0xa7, 0xd4, 0x05, 0xc2, 0xa7, 0xe0, 0x0e, 0xc2, 0xa7, 0xea, + 0xc5, 0xb9, 0xbc, 0x01, 0x94, 0xf1, 0xc4, 0xaa, 0xbb, 0x01, 0x95, 0x01, + 0x14, 0x42, 0xa7, 0xfc, 0x85, 0x0f, 0x89, 0x59, 0x94, 0x0f, 0x89, 0x60, + 0xc6, 0xcd, 0xf7, 0x01, 0x93, 0xe1, 0xc5, 0xde, 0x34, 0x01, 0x93, 0xe8, + 0x83, 0x01, 0x96, 0x81, 0x8b, 0x01, 0x96, 0x89, 0x97, 0x01, 0x96, 0x91, + 0x87, 0x01, 0x96, 0x99, 0x91, 0x01, 0x96, 0xa1, 0x0d, 0xc2, 0xa8, 0x06, + 0x15, 0xc2, 0xa8, 0x1a, 0x16, 0xc2, 0xa8, 0x2e, 0x10, 0xc2, 0xa8, 0x42, + 0x0a, 0xc2, 0xa8, 0x56, 0x0f, 0xc2, 0xa8, 0x6a, 0x1b, 0xc2, 0xa8, 0x7e, + 0x14, 0xc2, 0xa8, 0x8a, 0x19, 0x42, 0xa8, 0x9e, 0xe0, 0x02, 0xa7, 0x01, + 0x2e, 0xa8, 0xd4, 0x3d, 0x7c, 0x01, 0x2e, 0xa1, 0xca, 0x1e, 0x5f, 0x01, + 0x2e, 0x98, 0xcf, 0x63, 0x2d, 0x01, 0x2e, 0x91, 0xce, 0x66, 0x67, 0x01, + 0x2e, 0x80, 0xe0, 0x01, 0x67, 0x01, 0x4e, 0x18, 0xd8, 0x24, 0x83, 0x01, + 0x4e, 0x11, 0xcd, 0x76, 0x90, 0x01, 0x4d, 0xd8, 0x47, 0x03, 0x4c, 0x42, + 0xa8, 0xae, 0xd1, 0x51, 0xcd, 0x09, 0x1a, 0xf9, 0xc4, 0x58, 0xf5, 0x09, + 0x1a, 0xf0, 0xca, 0xa1, 0x20, 0x09, 0x1b, 0x38, 0x47, 0x03, 0x4c, 0xc2, + 0xa8, 0xb8, 0xc2, 0x0e, 0x9a, 0x09, 0x1a, 0x7a, 0x02, 0xa8, 0xfb, 0x00, + 0x42, 0xa9, 0x01, 0xa0, 0x09, 0x19, 0xb0, 0xc7, 0x6c, 0xd0, 0x09, 0x19, + 0x51, 0xcb, 0x94, 0x0c, 0x09, 0x19, 0x48, 0xc2, 0x02, 0xad, 0x09, 0x18, + 0x68, 0xda, 0x1b, 0x68, 0x09, 0x18, 0x81, 0xcc, 0x8b, 0x59, 0x09, 0x18, + 0x79, 0xd7, 0x29, 0x9c, 0x09, 0x18, 0x70, 0xc2, 0x00, 0x4e, 0x09, 0x1c, + 0xc3, 0x02, 0xa9, 0x0d, 0x97, 0x09, 0x19, 0x09, 0xc4, 0x55, 0x25, 0x09, + 0x19, 0x01, 0xc5, 0x03, 0x47, 0x09, 0x18, 0xf0, 0x47, 0x03, 0x4c, 0x42, + 0xa9, 0x13, 0xcd, 0x80, 0x84, 0x09, 0x1a, 0xd8, 0xc4, 0x38, 0xb4, 0x09, + 0x1a, 0xa9, 0xc2, 0x05, 0x52, 0x09, 0x1a, 0x9b, 0x02, 0xa9, 0x1f, 0x83, + 0x09, 0x1a, 0x90, 0xc7, 0x6c, 0xd0, 0x09, 0x18, 0xd3, 0x02, 0xa9, 0x23, + 0xc4, 0x39, 0xc8, 0x09, 0x18, 0xc9, 0x46, 0x03, 0x4d, 0xc2, 0xa9, 0x29, + 0xc6, 0xd0, 0x97, 0x09, 0x18, 0xa0, 0x47, 0x03, 0x4c, 0x42, 0xa9, 0x3e, + 0xd4, 0x39, 0x58, 0x09, 0x18, 0x50, 0xc9, 0xac, 0x18, 0x09, 0x29, 0xc8, + 0x47, 0x03, 0x4c, 0x42, 0xa9, 0x4a, 0x00, 0x42, 0xa9, 0x68, 0xc4, 0x39, + 0xc8, 0x09, 0x17, 0x79, 0x46, 0x03, 0x4d, 0xc2, 0xa9, 0x74, 0xc8, 0x0a, + 0xff, 0x09, 0x17, 0x60, 0x00, 0x42, 0xa9, 0x80, 0xca, 0x38, 0xae, 0x09, + 0x29, 0xc1, 0xc4, 0x39, 0xc8, 0x09, 0x16, 0xe0, 0xa1, 0x09, 0x16, 0xf2, + 0x02, 0xa9, 0x8f, 0x9f, 0x09, 0x16, 0xcb, 0x02, 0xa9, 0x95, 0xc3, 0x2b, + 0x88, 0x09, 0x16, 0xd1, 0xd2, 0x47, 0x27, 0x09, 0x16, 0xc0, 0x00, 0xc2, + 0xa9, 0x9b, 0xc2, 0x01, 0xe2, 0x09, 0x16, 0x03, 0x02, 0xa9, 0xb0, 0x90, + 0x09, 0x15, 0xf9, 0xc2, 0xe6, 0xab, 0x09, 0x15, 0xf0, 0xa3, 0x09, 0x15, + 0xbb, 0x02, 0xa9, 0xba, 0xc2, 0x38, 0x6a, 0x09, 0x15, 0xc9, 0xc2, 0xe5, + 0x8e, 0x09, 0x15, 0xc1, 0xa0, 0x09, 0x15, 0x72, 0x02, 0xa9, 0xc0, 0xc2, + 0x01, 0x6f, 0x09, 0x16, 0xb1, 0x94, 0x09, 0x16, 0x9b, 0x02, 0xa9, 0xc6, + 0xc3, 0x56, 0xa5, 0x09, 0x16, 0x91, 0x8f, 0x09, 0x16, 0x33, 0x02, 0xa9, + 0xca, 0x86, 0x09, 0x16, 0x1a, 0x02, 0xa9, 0xd0, 0x00, 0x42, 0xa9, 0xd6, + 0xd1, 0x56, 
0xb7, 0x09, 0x15, 0x50, 0xa6, 0x09, 0x17, 0x50, 0xc3, 0x02, + 0x2c, 0x09, 0x17, 0x40, 0x9f, 0x09, 0x17, 0x28, 0xc3, 0xe4, 0xe2, 0x09, + 0x12, 0x93, 0x02, 0xa9, 0xf1, 0xa6, 0x09, 0x1c, 0x80, 0x49, 0x38, 0x6c, + 0x42, 0xa9, 0xf7, 0x00, 0x42, 0xaa, 0x03, 0xc2, 0x4d, 0x4c, 0x09, 0x13, + 0x6b, 0x02, 0xaa, 0x15, 0x00, 0x42, 0xaa, 0x19, 0x9f, 0x09, 0x12, 0x39, + 0xc8, 0xb7, 0xa2, 0x09, 0x12, 0x28, 0x94, 0x09, 0x12, 0x21, 0x00, 0x42, + 0xaa, 0x34, 0xc7, 0x6c, 0xd0, 0x09, 0x12, 0x59, 0x46, 0x03, 0x4d, 0x42, + 0xaa, 0x46, 0x00, 0xc2, 0xaa, 0x50, 0xa0, 0x09, 0x11, 0xca, 0x02, 0xaa, + 0x65, 0xc5, 0x39, 0xc7, 0x09, 0x11, 0x78, 0x8a, 0x09, 0x1c, 0x60, 0x9f, + 0x09, 0x11, 0x38, 0xc4, 0x39, 0xc8, 0x09, 0x11, 0x11, 0xca, 0x38, 0xae, + 0x09, 0x11, 0x08, 0x00, 0x42, 0xaa, 0x69, 0xc9, 0xac, 0xa8, 0x09, 0x10, + 0xf2, 0x02, 0xaa, 0x83, 0x00, 0x42, 0xaa, 0x89, 0x24, 0xc2, 0xaa, 0x93, + 0x23, 0xc2, 0xaa, 0x9f, 0xc3, 0xe5, 0x7e, 0x09, 0x27, 0xf9, 0x21, 0xc2, + 0xaa, 0xbd, 0x20, 0xc2, 0xaa, 0xd5, 0x1f, 0xc2, 0xaa, 0xe3, 0x1e, 0xc2, + 0xaa, 0xf5, 0x1d, 0x42, 0xab, 0x01, 0x84, 0x09, 0x0d, 0xc3, 0x02, 0xab, + 0x2b, 0x94, 0x09, 0x0f, 0x62, 0x02, 0xab, 0x2f, 0xca, 0x51, 0xd4, 0x09, + 0x0f, 0xaa, 0x02, 0xab, 0x33, 0xca, 0x8c, 0xf6, 0x09, 0x0f, 0x98, 0x97, + 0x09, 0x0c, 0x3b, 0x02, 0xab, 0x39, 0x0d, 0xc2, 0xab, 0x5a, 0x04, 0xc2, + 0xab, 0x68, 0x16, 0xc2, 0xab, 0x74, 0x15, 0xc2, 0xab, 0x7e, 0x12, 0xc2, + 0xab, 0x95, 0x0e, 0xc2, 0xab, 0x9d, 0xcd, 0x05, 0x5a, 0x09, 0x1c, 0x11, + 0x09, 0xc2, 0xab, 0xa8, 0x83, 0x09, 0x0a, 0xc3, 0x02, 0xab, 0xbd, 0xc2, + 0x2e, 0x48, 0x09, 0x0c, 0x61, 0xc2, 0x17, 0x99, 0x09, 0x0b, 0xe9, 0x10, + 0xc2, 0xab, 0xd0, 0x0f, 0xc2, 0xab, 0xda, 0x0b, 0xc2, 0xab, 0xe8, 0x07, + 0x42, 0xab, 0xf2, 0x00, 0x42, 0xab, 0xfe, 0xa1, 0x09, 0x0c, 0xd9, 0x9f, + 0x09, 0x0c, 0xd0, 0x00, 0x42, 0xac, 0x0a, 0xcf, 0x6a, 0x17, 0x09, 0x0c, + 0xb0, 0xa2, 0x09, 0x0c, 0x9b, 0x02, 0xac, 0x16, 0xa1, 0x09, 0x0c, 0x91, + 0xa0, 0x09, 0x0c, 0x89, 0x9f, 0x09, 0x0c, 0x80, 0xcd, 0x7b, 0x8a, 0x09, + 0x0c, 0x70, 0xcd, 0x7a, 0x5f, 0x09, 0x0d, 0xa0, 0xc5, 0x39, 0xc7, 0x09, + 0x0d, 0x88, 0xcd, 0x77, 0xe2, 0x09, 0x0d, 0x70, 0xe0, 0x05, 0x47, 0x09, + 0x0d, 0x58, 0xc3, 0x68, 0xd0, 0x09, 0x0d, 0x43, 0x02, 0xac, 0x1c, 0x8a, + 0x09, 0x0d, 0x39, 0xc2, 0x00, 0x65, 0x09, 0x0d, 0x30, 0x97, 0x09, 0x0d, + 0x13, 0x02, 0xac, 0x22, 0xc3, 0x62, 0x19, 0x09, 0x0d, 0x08, 0xc3, 0x02, + 0x2c, 0x09, 0x09, 0x73, 0x02, 0xac, 0x26, 0x97, 0x09, 0x09, 0xb1, 0xc3, + 0x04, 0x65, 0x09, 0x09, 0xa9, 0xc3, 0x20, 0x18, 0x09, 0x09, 0xa1, 0xc3, + 0x56, 0x1d, 0x09, 0x09, 0x99, 0xc3, 0x1a, 0xe7, 0x09, 0x09, 0x91, 0xc4, + 0x04, 0x59, 0x09, 0x09, 0x89, 0xc3, 0x62, 0x19, 0x09, 0x09, 0x80, 0xc4, + 0x58, 0xf5, 0x09, 0x09, 0x53, 0x02, 0xac, 0x30, 0xc4, 0x39, 0xc8, 0x09, + 0x09, 0x58, 0x47, 0x03, 0x4c, 0x42, 0xac, 0x36, 0x00, 0x42, 0xac, 0x54, + 0x00, 0x42, 0xac, 0x66, 0x17, 0xc2, 0xac, 0x72, 0xa4, 0x09, 0x09, 0x30, + 0xca, 0xa6, 0x48, 0x09, 0x09, 0x20, 0x8a, 0x09, 0x08, 0x8b, 0x02, 0xac, + 0x7c, 0xc2, 0x00, 0x65, 0x09, 0x08, 0x80, 0xa0, 0x09, 0x08, 0x53, 0x02, + 0xac, 0x80, 0x9f, 0x09, 0x08, 0x42, 0x02, 0xac, 0x86, 0x00, 0x42, 0xac, + 0x8c, 0xcb, 0x47, 0xaa, 0x09, 0x08, 0x19, 0x46, 0x03, 0x4d, 0x42, 0xac, + 0x98, 0x47, 0x03, 0x4c, 0x42, 0xac, 0xa0, 0x00, 0x42, 0xac, 0xaa, 0x00, + 0x42, 0xac, 0xb6, 0xa0, 0x09, 0x07, 0xe0, 0x9f, 0x09, 0x07, 0xba, 0x02, + 0xac, 0xc2, 0xc2, 0x00, 0xc2, 0x09, 0x07, 0xa1, 0xda, 0x1a, 0xe6, 0x09, + 0x07, 0x98, 0xd6, 0x1a, 0xea, 0x09, 0x07, 0x88, 0x46, 0x03, 0x4d, 0xc2, + 0xac, 0xc6, 0x4e, 0x6c, 0xd0, 0x42, 0xad, 0x01, 0xc2, 0x5c, 0x27, 0x09, + 0x25, 0x58, 
0xc3, 0x0b, 0x64, 0x09, 0x25, 0x51, 0xc3, 0x51, 0xdb, 0x09, + 0x25, 0x49, 0x97, 0x09, 0x04, 0x99, 0x15, 0xc2, 0xad, 0x2b, 0xc2, 0x02, + 0x2f, 0x09, 0x04, 0x81, 0xc3, 0x1a, 0xf4, 0x09, 0x04, 0x79, 0xd1, 0x4e, + 0xe1, 0x09, 0x04, 0x70, 0xc7, 0x0b, 0x09, 0x09, 0x04, 0xe9, 0xcb, 0x96, + 0xed, 0x09, 0x04, 0xe1, 0xcb, 0x94, 0x38, 0x09, 0x04, 0xd9, 0x46, 0x03, + 0x4d, 0x42, 0xad, 0x37, 0x47, 0x03, 0x4c, 0xc2, 0xad, 0x46, 0xc2, 0x04, + 0x3d, 0x09, 0x04, 0x10, 0x47, 0x03, 0x4c, 0xc2, 0xad, 0x7e, 0x9f, 0x09, + 0x04, 0x00, 0xa1, 0x09, 0x04, 0x41, 0xa0, 0x09, 0x04, 0x2a, 0x02, 0xad, + 0x8a, 0xc7, 0x6c, 0xd0, 0x09, 0x03, 0xe9, 0xc4, 0x39, 0xc8, 0x09, 0x03, + 0xe1, 0xc7, 0xc6, 0x47, 0x09, 0x03, 0xd8, 0x9f, 0x09, 0x03, 0xb3, 0x02, + 0xad, 0x93, 0x47, 0x03, 0x4c, 0x42, 0xad, 0x99, 0xc9, 0xa3, 0x1e, 0x09, + 0x1b, 0xa8, 0xd3, 0x45, 0xac, 0x09, 0x03, 0xc0, 0x00, 0xc2, 0xad, 0xab, + 0xa0, 0x09, 0x1b, 0xa0, 0x03, 0x42, 0xad, 0xb7, 0x48, 0xb6, 0x2a, 0xc2, + 0xad, 0xbf, 0xcb, 0x94, 0x2d, 0x09, 0x02, 0x80, 0x9f, 0x09, 0x02, 0xa0, + 0xcb, 0x96, 0x95, 0x09, 0x02, 0x90, 0x47, 0x03, 0x4c, 0x42, 0xad, 0xd1, + 0xd0, 0x5d, 0xc2, 0x09, 0x24, 0x18, 0xc2, 0x7b, 0x95, 0x09, 0x02, 0x40, + 0xc2, 0x00, 0xb3, 0x09, 0x02, 0x31, 0xc9, 0xac, 0xba, 0x09, 0x02, 0x28, + 0xc8, 0x6a, 0x1e, 0x09, 0x02, 0x61, 0xc3, 0x1a, 0xf4, 0x09, 0x02, 0x59, + 0x83, 0x09, 0x02, 0x50, 0x46, 0x03, 0x4d, 0xc2, 0xad, 0xe3, 0xc4, 0x39, + 0xc8, 0x09, 0x00, 0xa8, 0x47, 0x03, 0x4c, 0x42, 0xae, 0x1a, 0xc3, 0xd1, + 0x2b, 0x09, 0x1b, 0x91, 0xc3, 0x04, 0x65, 0x09, 0x01, 0x60, 0xc3, 0x03, + 0x49, 0x09, 0x01, 0xf9, 0x9f, 0x09, 0x01, 0xf1, 0x00, 0x42, 0xae, 0x3c, + 0xca, 0x51, 0xd4, 0x09, 0x01, 0xa8, 0x4a, 0x9e, 0x64, 0xc2, 0xae, 0x4e, + 0xcb, 0x8f, 0x05, 0x09, 0x01, 0x79, 0xc7, 0xc6, 0x0f, 0x09, 0x01, 0x70, + 0xc3, 0x5d, 0xd1, 0x09, 0x01, 0x41, 0xc3, 0x04, 0x65, 0x09, 0x01, 0x39, + 0x0d, 0xc2, 0xae, 0x5a, 0xc2, 0x00, 0xd0, 0x09, 0x01, 0x21, 0xc4, 0x38, + 0xa9, 0x09, 0x01, 0x19, 0xc4, 0xe2, 0xab, 0x09, 0x01, 0x11, 0xc2, 0x00, + 0x65, 0x09, 0x01, 0x08, 0xcf, 0x68, 0x73, 0x09, 0x00, 0xf9, 0xc5, 0x9e, + 0x4b, 0x09, 0x00, 0xf0, 0x9f, 0x09, 0x1c, 0xa9, 0xc2, 0x00, 0x2d, 0x09, + 0x14, 0x52, 0x02, 0xae, 0x64, 0xcb, 0x94, 0x4e, 0x09, 0x14, 0x49, 0x46, + 0x03, 0x4d, 0x42, 0xae, 0x68, 0xc7, 0x0b, 0x09, 0x09, 0x0a, 0x91, 0xcb, + 0x96, 0xf8, 0x09, 0x0a, 0x89, 0xcb, 0x94, 0x43, 0x09, 0x0a, 0x81, 0xca, + 0x38, 0xae, 0x09, 0x0a, 0x78, 0x00, 0x42, 0xae, 0x85, 0xc7, 0x0b, 0x09, + 0x09, 0x0a, 0x21, 0xc3, 0x2b, 0x88, 0x09, 0x0a, 0x18, 0xcd, 0x77, 0xe2, + 0x09, 0x23, 0x70, 0xc2, 0x00, 0xd3, 0x09, 0x22, 0x49, 0xa1, 0x09, 0x22, + 0x41, 0xa0, 0x09, 0x22, 0x38, 0xcd, 0x77, 0xe2, 0x09, 0x23, 0x68, 0xa0, + 0x09, 0x22, 0x28, 0xc4, 0x45, 0x6a, 0x09, 0x23, 0x41, 0xc4, 0x4a, 0x2e, + 0x09, 0x23, 0x38, 0xcd, 0x77, 0xe2, 0x09, 0x23, 0x60, 0x00, 0xc2, 0xae, + 0x9d, 0xa0, 0x09, 0x22, 0x08, 0xcd, 0x77, 0xe2, 0x09, 0x23, 0x58, 0xc5, + 0x58, 0xf4, 0x09, 0x22, 0x70, 0xcd, 0x77, 0xe2, 0x09, 0x23, 0x50, 0xca, + 0x9d, 0x74, 0x09, 0x22, 0xe1, 0x43, 0x01, 0x50, 0x42, 0xae, 0xa5, 0xc3, + 0x5d, 0x9a, 0x09, 0x22, 0xa3, 0x02, 0xae, 0xad, 0xc3, 0x9f, 0x30, 0x09, + 0x21, 0xc8, 0xc5, 0x58, 0xf4, 0x09, 0x22, 0x68, 0x97, 0x09, 0x21, 0x11, + 0x9f, 0x09, 0x20, 0xc8, 0xcd, 0x77, 0xe2, 0x09, 0x23, 0x48, 0xc3, 0x5d, + 0x9a, 0x09, 0x22, 0x93, 0x02, 0xae, 0xb3, 0xc3, 0x9f, 0x30, 0x09, 0x21, + 0xc0, 0xc5, 0x58, 0xf4, 0x09, 0x22, 0x60, 0x00, 0xc2, 0xae, 0xb9, 0xa1, + 0x09, 0x21, 0xe8, 0x97, 0x09, 0x21, 0x81, 0x9f, 0x09, 0x21, 0x30, 0x97, + 0x09, 0x21, 0x09, 0x9f, 0x09, 0x20, 0xc0, 0xc3, 0x8f, 0x7a, 0x09, 0x23, + 0x19, 0xc3, 
0x02, 0x2c, 0x09, 0x23, 0x00, 0xc9, 0xad, 0xf5, 0x09, 0x22, + 0xf9, 0xc4, 0xdd, 0x63, 0x09, 0x22, 0xc0, 0xce, 0x54, 0x64, 0x09, 0x22, + 0xe9, 0xc4, 0x04, 0x59, 0x09, 0x22, 0xd0, 0xc3, 0x5d, 0x9a, 0x09, 0x22, + 0x79, 0xc3, 0x9f, 0x30, 0x09, 0x21, 0xa0, 0x97, 0x09, 0x20, 0xf1, 0x9f, + 0x09, 0x20, 0xa8, 0xce, 0x54, 0x64, 0x09, 0x22, 0xf1, 0xc4, 0x04, 0x59, + 0x09, 0x22, 0xd8, 0xc3, 0x5d, 0x9a, 0x09, 0x22, 0x81, 0xc3, 0x9f, 0x30, + 0x09, 0x21, 0xa8, 0xc5, 0x58, 0xf4, 0x09, 0x22, 0x50, 0x97, 0x09, 0x21, + 0x69, 0x9f, 0x09, 0x21, 0x18, 0x97, 0x09, 0x20, 0xf9, 0x9f, 0x09, 0x20, + 0xb0, 0xc3, 0x5d, 0x9a, 0x09, 0x22, 0x89, 0xc3, 0x9f, 0x30, 0x09, 0x21, + 0xb2, 0x02, 0xae, 0xc1, 0xc5, 0x58, 0xf4, 0x09, 0x22, 0x58, 0xc2, 0xe5, + 0xf7, 0x09, 0x21, 0xd9, 0xc2, 0xe6, 0x89, 0x09, 0x21, 0xd0, 0x97, 0x09, + 0x21, 0x73, 0x02, 0xae, 0xc7, 0x9f, 0x09, 0x21, 0x22, 0x02, 0xae, 0xcd, + 0x97, 0x09, 0x21, 0x01, 0x9f, 0x09, 0x20, 0xb8, 0xc3, 0x02, 0x9b, 0x01, + 0x16, 0x79, 0xc2, 0x00, 0xbf, 0x01, 0x16, 0x70, 0x84, 0x09, 0x7e, 0x70, + 0x84, 0x09, 0x7c, 0xd8, 0x06, 0xc2, 0xae, 0xd3, 0xc6, 0x60, 0xb1, 0x00, + 0x27, 0x78, 0xca, 0x91, 0xbb, 0x00, 0x22, 0xa0, 0xc3, 0x2d, 0x1a, 0x00, + 0xe4, 0x39, 0xc9, 0xa8, 0x3a, 0x00, 0xe4, 0x31, 0xc2, 0x00, 0xac, 0x00, + 0xe4, 0x20, 0x46, 0x00, 0x8b, 0x42, 0xae, 0xdf, 0x87, 0x00, 0x22, 0x31, + 0xc2, 0x01, 0x7f, 0x00, 0x22, 0xd9, 0xc2, 0x00, 0x28, 0x05, 0x34, 0x79, + 0xc2, 0x00, 0x40, 0x05, 0x34, 0x88, 0xc5, 0x13, 0xb4, 0x00, 0xe4, 0x01, + 0xc6, 0x9b, 0xd4, 0x00, 0x23, 0xd8, 0xc2, 0x0a, 0xe2, 0x00, 0x28, 0x89, + 0xc3, 0xe5, 0x2a, 0x05, 0x32, 0x29, 0xc2, 0x13, 0xc0, 0x05, 0x32, 0xa9, + 0xc3, 0x3b, 0x0f, 0x05, 0x33, 0x08, 0x46, 0x00, 0x8b, 0x42, 0xae, 0xeb, + 0x46, 0x00, 0x8b, 0x42, 0xaf, 0x03, 0xca, 0xa5, 0x12, 0x00, 0x26, 0x70, + 0xcf, 0x69, 0x54, 0x00, 0x25, 0x58, 0xca, 0xa5, 0xb2, 0x00, 0x24, 0x78, + 0x1c, 0xc2, 0xaf, 0x21, 0x87, 0x00, 0x22, 0xab, 0x02, 0xaf, 0x2b, 0xc2, + 0x01, 0x7f, 0x00, 0x22, 0xf9, 0xc2, 0x00, 0x28, 0x05, 0x34, 0x18, 0x91, + 0x05, 0x34, 0xc9, 0xcb, 0x98, 0xa5, 0x05, 0x33, 0x68, 0xc2, 0x04, 0xab, + 0x05, 0x32, 0x48, 0xc2, 0x00, 0xd0, 0x00, 0x25, 0xdb, 0x02, 0xaf, 0x31, + 0x44, 0x2e, 0xf0, 0xc2, 0xaf, 0x37, 0xc2, 0x00, 0x28, 0x05, 0x34, 0xb9, + 0x83, 0x00, 0x22, 0x41, 0xc3, 0x1c, 0x63, 0x00, 0x22, 0x48, 0xcf, 0x6b, + 0x16, 0x00, 0x26, 0xd8, 0xcc, 0x23, 0x3f, 0x00, 0x25, 0x88, 0xc2, 0x00, + 0x06, 0x05, 0x33, 0x19, 0x07, 0xc2, 0xaf, 0x42, 0xc4, 0x00, 0xba, 0x00, + 0x22, 0x60, 0x46, 0x00, 0x8b, 0x42, 0xaf, 0x4a, 0xc3, 0xe5, 0x2a, 0x00, + 0x27, 0x09, 0xc3, 0x28, 0x28, 0x00, 0x25, 0xeb, 0x02, 0xaf, 0x56, 0xc2, + 0x00, 0xd0, 0x00, 0x25, 0x48, 0xc9, 0x20, 0xa8, 0x00, 0x26, 0x99, 0xc5, + 0x1d, 0x88, 0x00, 0x26, 0x88, 0x87, 0x00, 0x28, 0xc9, 0x96, 0x00, 0x23, + 0x18, 0x46, 0x00, 0x8b, 0x42, 0xaf, 0x5c, 0x43, 0x5d, 0xc0, 0xc2, 0xaf, + 0x68, 0xc3, 0x78, 0xc9, 0x00, 0x24, 0x08, 0x46, 0x00, 0x8b, 0x42, 0xaf, + 0x8a, 0x46, 0x00, 0x8b, 0xc2, 0xaf, 0xa2, 0xc7, 0x8a, 0x86, 0x00, 0x22, + 0x50, 0x46, 0x00, 0x8b, 0x42, 0xaf, 0xb4, 0xc6, 0xc3, 0x77, 0x00, 0x27, + 0x4b, 0x02, 0xaf, 0xcf, 0xc8, 0xba, 0x0a, 0x00, 0x25, 0x08, 0xc9, 0x98, + 0xa7, 0x05, 0x33, 0x59, 0xc5, 0xc8, 0x02, 0x00, 0x23, 0x58, 0xcb, 0x90, + 0x70, 0x00, 0x23, 0xe8, 0xc9, 0x20, 0xa8, 0x00, 0x27, 0x29, 0xc6, 0x60, + 0xb1, 0x00, 0x27, 0x19, 0xc5, 0x1f, 0x0c, 0x00, 0x22, 0xe8, 0x46, 0x00, + 0x8b, 0x42, 0xaf, 0xd5, 0xd9, 0x1e, 0xff, 0x00, 0x23, 0xb8, 0x16, 0x42, + 0xaf, 0xe1, 0x47, 0x01, 0x32, 0xc2, 0xaf, 0xeb, 0xc4, 0xe4, 0xbf, 0x05, + 0x32, 0x08, 0x87, 0x00, 0x21, 0xb3, 0x02, 0xaf, 0xf7, 0xc2, 0x00, 0x28, + 0x05, 0x34, 
0x28, 0x46, 0x00, 0x8b, 0x42, 0xaf, 0xfd, 0x46, 0x00, 0x8b, + 0x42, 0xb0, 0x07, 0x46, 0x00, 0x8b, 0x42, 0xb0, 0x1f, 0xca, 0xa5, 0x12, + 0x00, 0x26, 0x68, 0xcf, 0x69, 0x54, 0x00, 0x25, 0x50, 0xca, 0xa5, 0xb2, + 0x00, 0x24, 0x70, 0x1c, 0xc2, 0xb0, 0x3d, 0x87, 0x00, 0x20, 0x2b, 0x02, + 0xb0, 0x47, 0xc2, 0x01, 0x7f, 0x00, 0x20, 0x79, 0xc2, 0x00, 0x28, 0x05, + 0x34, 0x10, 0x91, 0x05, 0x34, 0xc1, 0xcb, 0x98, 0xa5, 0x05, 0x33, 0x60, + 0xc2, 0x04, 0xab, 0x05, 0x32, 0x40, 0xc2, 0x00, 0xd0, 0x00, 0x25, 0xd3, + 0x02, 0xb0, 0x4d, 0x44, 0x2e, 0xf0, 0xc2, 0xb0, 0x53, 0x83, 0x00, 0x21, + 0x41, 0xc3, 0x1c, 0x63, 0x00, 0x21, 0x49, 0xc2, 0x00, 0x28, 0x05, 0x34, + 0xb0, 0xcf, 0x6b, 0x16, 0x00, 0x26, 0xd0, 0xcc, 0x23, 0x3f, 0x00, 0x25, + 0x80, 0xc4, 0x00, 0xba, 0x00, 0x21, 0x61, 0xc2, 0x00, 0x06, 0x05, 0x33, + 0x11, 0x07, 0x42, 0xb0, 0x5e, 0x46, 0x00, 0x8b, 0x42, 0xb0, 0x66, 0xc3, + 0xe5, 0x2a, 0x00, 0x27, 0x01, 0xc3, 0x28, 0x28, 0x00, 0x25, 0xe3, 0x02, + 0xb0, 0x72, 0xc2, 0x00, 0xd0, 0x00, 0x25, 0x40, 0xc9, 0x20, 0xa8, 0x00, + 0x26, 0x91, 0xc5, 0x1d, 0x88, 0x00, 0x26, 0x80, 0x87, 0x00, 0x28, 0xc1, + 0x96, 0x00, 0x23, 0x10, 0x46, 0x00, 0x8b, 0x42, 0xb0, 0x78, 0xc2, 0x0a, + 0xe2, 0x00, 0x28, 0x81, 0xc3, 0xe5, 0x2a, 0x05, 0x32, 0x21, 0xc2, 0x13, + 0xc0, 0x05, 0x32, 0xa1, 0xc3, 0x3b, 0x0f, 0x05, 0x33, 0x00, 0x43, 0x5d, + 0xc0, 0xc2, 0xb0, 0x84, 0xc3, 0x78, 0xc9, 0x00, 0x24, 0x00, 0x46, 0x00, + 0x8b, 0x42, 0xb0, 0xa6, 0x46, 0x00, 0x8b, 0xc2, 0xb0, 0xbe, 0xc7, 0x8a, + 0x86, 0x00, 0x21, 0x50, 0x46, 0x00, 0x8b, 0x42, 0xb0, 0xd0, 0x46, 0x00, + 0x8b, 0x42, 0xb0, 0xeb, 0x06, 0xc2, 0xb0, 0xf5, 0xc6, 0x60, 0xb1, 0x00, + 0x27, 0x70, 0xca, 0x91, 0xbb, 0x00, 0x20, 0x20, 0xc6, 0xc3, 0x77, 0x00, + 0x27, 0x43, 0x02, 0xb1, 0x01, 0xc8, 0xba, 0x0a, 0x00, 0x25, 0x00, 0xc9, + 0x98, 0xa7, 0x05, 0x33, 0x51, 0xc5, 0xc8, 0x02, 0x00, 0x23, 0x50, 0xcb, + 0x90, 0x70, 0x00, 0x23, 0xe0, 0xc9, 0x20, 0xa8, 0x00, 0x27, 0x21, 0xc6, + 0x60, 0xb1, 0x00, 0x27, 0x11, 0xc5, 0x1f, 0x0c, 0x00, 0x20, 0x68, 0x46, + 0x00, 0x8b, 0x42, 0xb1, 0x07, 0xd9, 0x1e, 0xff, 0x00, 0x23, 0xb0, 0x16, + 0x42, 0xb1, 0x13, 0x47, 0x01, 0x32, 0xc2, 0xb1, 0x1d, 0xc4, 0xe4, 0xbf, + 0x05, 0x32, 0x00, 0x87, 0x00, 0x20, 0xb3, 0x02, 0xb1, 0x29, 0xc2, 0x00, + 0x28, 0x05, 0x34, 0x20, 0x46, 0x00, 0x8b, 0x42, 0xb1, 0x2f, 0xc2, 0x01, + 0x7f, 0x00, 0x20, 0x59, 0x87, 0x00, 0x21, 0x31, 0xc2, 0x00, 0x28, 0x05, + 0x34, 0x71, 0xc2, 0x00, 0x40, 0x05, 0x34, 0x80, 0xe0, 0x01, 0xa7, 0x01, + 0x01, 0xc8, 0xc8, 0x4b, 0x94, 0x08, 0x8f, 0xa1, 0xc7, 0x0d, 0x04, 0x08, + 0x8f, 0x98, 0xc6, 0x18, 0x10, 0x08, 0x8f, 0x81, 0xc4, 0xd2, 0x1d, 0x08, + 0x8f, 0x78, 0xc4, 0x45, 0x6a, 0x08, 0x8f, 0x71, 0xc4, 0x4a, 0x2e, 0x08, + 0x8f, 0x68, 0xc5, 0x0d, 0x0d, 0x08, 0x8f, 0x61, 0xc5, 0x28, 0xee, 0x08, + 0x8f, 0x59, 0xc2, 0x00, 0xc4, 0x08, 0x8f, 0x50, 0xc4, 0x18, 0x10, 0x08, + 0x8f, 0x39, 0xc2, 0x22, 0xcc, 0x08, 0x8f, 0x30, 0xc3, 0x0d, 0x14, 0x08, + 0x8f, 0x29, 0xc3, 0x09, 0x9e, 0x08, 0x8f, 0x20, 0xc4, 0x02, 0xde, 0x08, + 0x8f, 0x19, 0xc2, 0x02, 0xa0, 0x08, 0x8f, 0x10, 0xc5, 0x69, 0xa7, 0x00, + 0x6c, 0x29, 0xc6, 0x8e, 0x9c, 0x00, 0x6c, 0x31, 0x07, 0xc2, 0xb1, 0x3b, + 0xc6, 0xd2, 0x47, 0x00, 0x6c, 0x99, 0xc6, 0xcc, 0xd1, 0x00, 0x6c, 0xb1, + 0x4a, 0xa1, 0xa2, 0xc2, 0xb1, 0x47, 0xcb, 0x8e, 0x97, 0x00, 0x6d, 0xc8, + 0xc5, 0x69, 0xa7, 0x00, 0x6c, 0x49, 0xc6, 0xd2, 0x47, 0x00, 0x6c, 0x51, + 0x42, 0x17, 0x99, 0xc2, 0xb1, 0x73, 0x42, 0x10, 0x37, 0x42, 0xb1, 0x7f, + 0xc5, 0x69, 0xa7, 0x00, 0x6c, 0x59, 0xc6, 0xcc, 0xd1, 0x00, 0x6c, 0x60, + 0xc5, 0x69, 0xa7, 0x00, 0x6c, 0x89, 0xc6, 0xd2, 0x3b, 0x00, 0x6c, 0x90, + 0xc5, 0x69, 
0xa7, 0x00, 0x6c, 0xa1, 0xc6, 0x69, 0xa6, 0x00, 0x6c, 0xa8, + 0x03, 0xc2, 0xb1, 0x8b, 0x49, 0xb0, 0xe9, 0x42, 0xb1, 0x97, 0xc7, 0xca, + 0x29, 0x00, 0x6c, 0xf9, 0xc7, 0xc7, 0xc1, 0x00, 0x6d, 0x31, 0x06, 0x42, + 0xb1, 0xa9, 0xca, 0x4b, 0x0d, 0x00, 0x6d, 0x21, 0x42, 0x0d, 0xf6, 0x42, + 0xb1, 0xb5, 0xc7, 0xc4, 0xdb, 0x00, 0x6d, 0x89, 0xc7, 0xc2, 0x18, 0x00, + 0x6d, 0xe9, 0xc7, 0xc1, 0xa8, 0x00, 0x6e, 0x18, 0xc2, 0x02, 0xa0, 0x00, + 0x6f, 0x41, 0xc4, 0x02, 0xde, 0x00, 0x6f, 0x48, 0xc3, 0x09, 0x9e, 0x00, + 0x6f, 0x51, 0xc3, 0x0d, 0x14, 0x00, 0x6f, 0x58, 0xc2, 0x22, 0xcc, 0x00, + 0x6f, 0x61, 0xc4, 0x18, 0x10, 0x00, 0x6f, 0x68, 0xca, 0xa7, 0x60, 0x00, + 0x6e, 0x81, 0xc8, 0xb7, 0x82, 0x00, 0x6e, 0x91, 0xc9, 0xaf, 0x42, 0x00, + 0x6e, 0xa0, 0xc2, 0x02, 0x41, 0x00, 0x6e, 0xcb, 0x02, 0xb1, 0xc1, 0xc5, + 0xd8, 0x21, 0x00, 0x6e, 0xd8, 0xca, 0x9c, 0xb6, 0x00, 0x6f, 0x91, 0xc9, + 0x93, 0x53, 0x00, 0x6f, 0x98, 0x1e, 0xc2, 0xb1, 0xc7, 0xa6, 0x0e, 0xd5, + 0x41, 0xa5, 0x0e, 0xd5, 0x39, 0xa4, 0x0e, 0xd5, 0x31, 0xa3, 0x0e, 0xd5, + 0x29, 0xa2, 0x0e, 0xd5, 0x21, 0xa1, 0x0e, 0xd5, 0x19, 0xa0, 0x0e, 0xd5, + 0x11, 0x9f, 0x0e, 0xd5, 0x08, 0x4b, 0x40, 0xb3, 0xc2, 0xb1, 0xe3, 0x4a, + 0x18, 0xa5, 0x42, 0xb1, 0xfe, 0xa3, 0x0e, 0xd4, 0xf9, 0xa2, 0x0e, 0xd4, + 0xf1, 0xa1, 0x0e, 0xd4, 0xe9, 0xa0, 0x0e, 0xd4, 0xe1, 0x9f, 0x0e, 0xd4, + 0xd8, 0x15, 0xc2, 0xb2, 0x16, 0x46, 0x17, 0x14, 0x42, 0xb2, 0x22, 0xc8, + 0x00, 0x6f, 0x0e, 0xd0, 0x48, 0xc9, 0x6e, 0x18, 0x0e, 0xd3, 0x71, 0xc5, + 0xda, 0x5b, 0x0e, 0xd3, 0x68, 0xc9, 0x65, 0x4f, 0x0e, 0xc8, 0xd1, 0x45, + 0x03, 0x14, 0x42, 0xb2, 0x2e, 0xc8, 0x3b, 0xec, 0x0e, 0xc8, 0xc1, 0xc6, + 0x24, 0x3b, 0x0e, 0xc8, 0xb0, 0xcc, 0x83, 0x61, 0x0e, 0xd4, 0x31, 0xc5, + 0xd8, 0x1c, 0x0e, 0xd4, 0x29, 0x42, 0x01, 0x7f, 0xc2, 0xb2, 0x3a, 0xc5, + 0xdb, 0x5a, 0x0e, 0xd4, 0x19, 0xc5, 0x48, 0x65, 0x0e, 0xd4, 0x10, 0xd0, + 0x60, 0x02, 0x0e, 0xd4, 0x01, 0xcf, 0x6a, 0xad, 0x0e, 0xd3, 0xf8, 0x47, + 0xc2, 0x2d, 0xc2, 0xb2, 0x46, 0xcb, 0x98, 0x0b, 0x0e, 0xd3, 0xb0, 0x00, + 0xc2, 0xb2, 0x62, 0xd2, 0x4d, 0xf9, 0x0e, 0xd2, 0x98, 0xd3, 0x40, 0xb3, + 0x0e, 0xd3, 0xa1, 0x4a, 0x18, 0xa5, 0x42, 0xb2, 0x6e, 0x47, 0x0f, 0x81, + 0xc2, 0xb2, 0x7a, 0xd3, 0x46, 0xef, 0x0e, 0xd2, 0xf1, 0xd4, 0x38, 0xcc, + 0x0e, 0xd2, 0xe9, 0x44, 0x08, 0xba, 0xc2, 0xb2, 0x86, 0xcc, 0x82, 0x95, + 0x0e, 0xd2, 0xd1, 0xd0, 0x5b, 0x22, 0x0e, 0xd2, 0xc8, 0xc7, 0x0b, 0xc8, + 0x0e, 0xc8, 0x39, 0xc8, 0x3b, 0xec, 0x0e, 0xc8, 0x31, 0xc6, 0x24, 0x3b, + 0x0e, 0xc8, 0x28, 0x00, 0x42, 0xb2, 0x92, 0xc3, 0x01, 0xc8, 0x0e, 0xd1, + 0x79, 0xc6, 0x04, 0xcb, 0x0e, 0xd1, 0x71, 0xc4, 0x08, 0xcb, 0x0e, 0xd1, + 0x68, 0xc7, 0xc4, 0xe9, 0x0e, 0xcc, 0x39, 0x49, 0xab, 0x01, 0x42, 0xb2, + 0xa4, 0x4b, 0x99, 0x4a, 0xc2, 0xb2, 0xb0, 0xc7, 0xc4, 0xe9, 0x0e, 0xca, + 0x89, 0x49, 0xab, 0x01, 0x42, 0xb2, 0xc2, 0x4a, 0x18, 0xa5, 0xc2, 0xb2, + 0xce, 0x4b, 0x40, 0xb3, 0x42, 0xb2, 0xdb, 0xca, 0x45, 0x02, 0x0e, 0xd1, + 0x01, 0xc4, 0x03, 0xc8, 0x0e, 0xd0, 0xf9, 0xc2, 0x02, 0xae, 0x0e, 0xd0, + 0xf0, 0xc4, 0x91, 0x78, 0x0e, 0xd0, 0xe9, 0x46, 0xca, 0xbb, 0x42, 0xb2, + 0xea, 0x44, 0x06, 0xa6, 0xc2, 0xb2, 0xf6, 0x45, 0x01, 0xce, 0xc2, 0xb3, + 0x02, 0xc6, 0x07, 0xa1, 0x0e, 0xd0, 0xb1, 0xc8, 0xba, 0x92, 0x0e, 0xd0, + 0xa9, 0xc4, 0x05, 0x75, 0x0e, 0xd0, 0xa0, 0xc4, 0x03, 0xc8, 0x0e, 0xd0, + 0x61, 0xc7, 0x81, 0x92, 0x0e, 0xd0, 0x59, 0xc2, 0x02, 0xae, 0x0e, 0xd0, + 0x50, 0x08, 0xc2, 0xb3, 0x0e, 0xc5, 0x01, 0x95, 0x0e, 0xc4, 0x2b, 0x02, + 0xb3, 0x20, 0x0a, 0xc2, 0xb3, 0x24, 0x05, 0xc2, 0xb3, 0x36, 0xc4, 0x38, + 0xc1, 0x0e, 0xc3, 0xba, 0x02, 0xb3, 0x4c, 0x48, 0x51, 0x1b, 0xc2, 0xb3, + 0x50, 0xc3, 
0x18, 0x26, 0x0e, 0xd0, 0x00, 0xc6, 0xd0, 0x37, 0x0e, 0xd1, + 0xa1, 0xc7, 0xa9, 0x6d, 0x0e, 0xd1, 0x98, 0xc3, 0xe5, 0x35, 0x0e, 0xd3, + 0x49, 0x48, 0x17, 0x7c, 0xc2, 0xb3, 0x5a, 0x19, 0xc2, 0xb3, 0x66, 0x58, + 0x22, 0x2b, 0xc2, 0xb3, 0x72, 0x15, 0xc2, 0xb3, 0x84, 0x45, 0xd9, 0x57, + 0xc2, 0xb3, 0x90, 0x45, 0xd8, 0x76, 0xc2, 0xb3, 0x9c, 0x05, 0xc2, 0xb3, + 0xa8, 0x46, 0xcb, 0x0f, 0xc2, 0xb3, 0xc0, 0x47, 0x2e, 0x48, 0xc2, 0xb3, + 0xd2, 0x04, 0xc2, 0xb3, 0xe4, 0x47, 0x2c, 0x2e, 0xc2, 0xb3, 0xf0, 0x47, + 0x00, 0x58, 0x42, 0xb4, 0x02, 0xc3, 0xe5, 0x35, 0x0e, 0xd3, 0x41, 0x48, + 0x17, 0x7c, 0xc2, 0xb4, 0x17, 0x19, 0xc2, 0xb4, 0x23, 0x4b, 0x22, 0x2b, + 0xc2, 0xb4, 0x2f, 0x45, 0xd9, 0x57, 0xc2, 0xb4, 0x3b, 0x45, 0xd8, 0x76, + 0xc2, 0xb4, 0x56, 0x05, 0xc2, 0xb4, 0x6e, 0x15, 0xc2, 0xb4, 0x86, 0x46, + 0xcb, 0x0f, 0xc2, 0xb4, 0x92, 0x47, 0x2e, 0x48, 0xc2, 0xb4, 0xa4, 0x04, + 0xc2, 0xb4, 0xb6, 0x47, 0x2c, 0x2e, 0xc2, 0xb4, 0xc2, 0x47, 0x00, 0x58, + 0x42, 0xb4, 0xd7, 0x48, 0x0b, 0xc8, 0xc2, 0xb4, 0xec, 0x48, 0xbf, 0xc2, + 0xc2, 0xb4, 0xf8, 0x45, 0xd5, 0xf1, 0x42, 0xb5, 0x0d, 0xd5, 0x37, 0x19, + 0x0e, 0xc9, 0x39, 0x43, 0x11, 0x49, 0xc2, 0xb5, 0x22, 0xcf, 0x65, 0x49, + 0x0e, 0xc9, 0x20, 0xc6, 0x00, 0x58, 0x0e, 0xd2, 0xc1, 0xc6, 0x24, 0x3b, + 0x0e, 0xd2, 0xb8, 0xc6, 0x13, 0x67, 0x0e, 0xd2, 0xb1, 0x46, 0x17, 0x8d, + 0x42, 0xb5, 0x2e, 0x00, 0x42, 0xb5, 0x40, 0x00, 0x42, 0xb5, 0x4c, 0xc9, + 0x46, 0x70, 0x0e, 0xd2, 0x53, 0x02, 0xb5, 0x58, 0xc4, 0x38, 0xc1, 0x0e, + 0xd2, 0x3b, 0x02, 0xb5, 0x5c, 0xc8, 0xbe, 0x0a, 0x0e, 0xd2, 0x31, 0xc7, + 0x27, 0xb2, 0x0e, 0xd2, 0x29, 0xc6, 0x02, 0xd1, 0x0e, 0xd2, 0x20, 0x00, + 0x42, 0xb5, 0x60, 0x00, 0x42, 0xb5, 0x6c, 0xc2, 0x02, 0xae, 0x0e, 0xd0, + 0x81, 0xc4, 0x03, 0xc8, 0x0e, 0xd0, 0x68, 0xcb, 0x90, 0xbd, 0x0e, 0xcf, + 0xdb, 0x02, 0xb5, 0x78, 0xc3, 0x01, 0xc8, 0x0e, 0xcf, 0xc0, 0xc5, 0x17, + 0x14, 0x0e, 0xcf, 0xb1, 0xc5, 0x03, 0x13, 0x0e, 0xcf, 0xa8, 0x97, 0x08, + 0xae, 0xe8, 0x8b, 0x08, 0xae, 0xd0, 0xd6, 0x2e, 0x96, 0x08, 0xae, 0xc1, + 0x83, 0x08, 0xac, 0xf0, 0xc2, 0x00, 0xd0, 0x08, 0xac, 0xc9, 0x83, 0x08, + 0xac, 0xc0, 0x8e, 0x08, 0xac, 0x43, 0x02, 0xb5, 0x7e, 0x94, 0x08, 0xac, + 0x32, 0x02, 0xb5, 0x82, 0xc2, 0x00, 0xd0, 0x08, 0xac, 0xd9, 0x83, 0x08, + 0xac, 0xd0, 0x45, 0x00, 0x8c, 0xc2, 0xb5, 0x86, 0xcb, 0x99, 0x76, 0x08, + 0xae, 0x7a, 0x02, 0xb5, 0xaa, 0xc3, 0x01, 0x5d, 0x08, 0xae, 0x29, 0xc3, + 0x02, 0xa3, 0x08, 0xae, 0x20, 0xc4, 0x1e, 0x97, 0x08, 0xad, 0xf9, 0xc5, + 0x40, 0xe7, 0x08, 0xad, 0xf0, 0x8e, 0x05, 0x45, 0xe8, 0x94, 0x05, 0x45, + 0xd8, 0x94, 0x05, 0x44, 0x43, 0x02, 0xb5, 0xb0, 0x8e, 0x05, 0x44, 0x52, + 0x02, 0xb5, 0xb4, 0x83, 0x05, 0x44, 0xe1, 0xc2, 0x00, 0xd0, 0x05, 0x44, + 0xe8, 0x83, 0x05, 0x44, 0xf1, 0xc2, 0x00, 0xd0, 0x05, 0x44, 0xf8, 0xc2, + 0x02, 0xa0, 0x05, 0x46, 0x91, 0xc4, 0x02, 0xde, 0x05, 0x46, 0x98, 0xc3, + 0x09, 0x9e, 0x05, 0x46, 0xa1, 0xc3, 0x0d, 0x14, 0x05, 0x46, 0xa8, 0xc2, + 0x22, 0xcc, 0x05, 0x46, 0xb1, 0xc4, 0x18, 0x10, 0x05, 0x46, 0xb8, 0xe0, + 0x0a, 0x87, 0x0f, 0xb3, 0xb0, 0x4b, 0x94, 0x85, 0xc2, 0xb5, 0xb8, 0xc7, + 0x1b, 0x0c, 0x08, 0x8e, 0x40, 0xc7, 0xc3, 0x61, 0x08, 0x8e, 0xd9, 0xd4, + 0x39, 0xa8, 0x08, 0x8e, 0x79, 0xc5, 0x33, 0x5d, 0x08, 0x8e, 0x51, 0xcb, + 0x93, 0xf6, 0x08, 0x8e, 0x19, 0xcb, 0x8f, 0xe1, 0x08, 0x8e, 0x11, 0x03, + 0xc2, 0xb5, 0xc0, 0x42, 0x07, 0xb2, 0xc2, 0xb5, 0xcc, 0xcb, 0x1e, 0x89, + 0x08, 0x8c, 0x00, 0xc4, 0x26, 0x78, 0x08, 0x8e, 0xc9, 0xc5, 0x06, 0xdb, + 0x08, 0x8e, 0xc1, 0x15, 0xc2, 0xb5, 0xd8, 0x08, 0xc2, 0xb5, 0xe4, 0x16, + 0xc2, 0xb5, 0xf0, 0xc3, 0x05, 0x14, 0x08, 0x8e, 0x89, 0xc4, 0x15, 0xe7, + 0x08, 0x8e, 
0x80, 0xcf, 0x61, 0x11, 0x08, 0x8e, 0x71, 0x03, 0xc2, 0xb5, + 0xfc, 0x91, 0x08, 0x8d, 0xf1, 0x87, 0x08, 0x8d, 0xe1, 0x48, 0xb2, 0x2d, + 0xc2, 0xb6, 0x08, 0x97, 0x08, 0x8d, 0xb3, 0x02, 0xb6, 0x16, 0x8b, 0x08, + 0x8d, 0xa2, 0x02, 0xb6, 0x1a, 0x83, 0x08, 0x8d, 0x89, 0xc2, 0x0d, 0xf6, + 0x08, 0x8d, 0x81, 0xc2, 0x00, 0xd0, 0x08, 0x8d, 0x78, 0x83, 0x08, 0x8d, + 0x71, 0x47, 0xb2, 0x2e, 0x42, 0xb6, 0x1e, 0xc2, 0x00, 0xdb, 0x08, 0x8d, + 0x69, 0x83, 0x08, 0x8d, 0x60, 0xc2, 0x00, 0xd0, 0x08, 0x8d, 0x41, 0x83, + 0x08, 0x8d, 0x38, 0xc2, 0x00, 0xd0, 0x08, 0x8d, 0x31, 0x83, 0x08, 0x8d, + 0x28, 0x83, 0x08, 0x8d, 0x21, 0xc2, 0x00, 0xc1, 0x08, 0x8c, 0xf9, 0xc2, + 0x19, 0x2c, 0x08, 0x8c, 0xd1, 0xc2, 0x01, 0x30, 0x08, 0x8c, 0xa8, 0xc2, + 0x00, 0xd0, 0x08, 0x8d, 0x19, 0x83, 0x08, 0x8d, 0x11, 0x06, 0x42, 0xb6, + 0x2c, 0xc2, 0x00, 0xd0, 0x08, 0x8d, 0x09, 0x83, 0x08, 0x8d, 0x01, 0x16, + 0x42, 0xb6, 0x36, 0xc2, 0x00, 0xd0, 0x08, 0x8c, 0xc9, 0x83, 0x08, 0x8c, + 0xc0, 0xc2, 0x00, 0xd0, 0x08, 0x8c, 0xb9, 0x83, 0x08, 0x8c, 0xb0, 0xc2, + 0x00, 0xd0, 0x08, 0x8c, 0xa1, 0x83, 0x08, 0x8c, 0x98, 0xc2, 0x00, 0xd0, + 0x08, 0x8c, 0x91, 0x83, 0x08, 0x8c, 0x88, 0x97, 0x08, 0x8c, 0x81, 0x8b, + 0x08, 0x8c, 0x71, 0x83, 0x08, 0x8c, 0x20, 0x97, 0x08, 0x8c, 0x40, 0x8b, + 0x08, 0x8c, 0x30, 0xc3, 0x00, 0x2d, 0x08, 0x22, 0xa1, 0xc2, 0x17, 0x28, + 0x08, 0x22, 0xf0, 0x96, 0x08, 0x23, 0x81, 0x94, 0x08, 0x23, 0xe8, 0x87, + 0x08, 0x23, 0xc1, 0xc3, 0x5d, 0x32, 0x08, 0x23, 0xe0, 0xcd, 0x55, 0x9a, + 0x01, 0x57, 0x41, 0xd5, 0x32, 0xab, 0x01, 0x57, 0x48, 0xe0, 0x06, 0x07, + 0x01, 0x5a, 0xf8, 0xc9, 0x1f, 0x5a, 0x01, 0x49, 0x31, 0xd4, 0x3c, 0x28, + 0x01, 0x49, 0x50, 0xc9, 0xb4, 0x5b, 0x01, 0x0f, 0x91, 0xc9, 0x1f, 0x5a, + 0x01, 0x49, 0x29, 0xd4, 0x3c, 0xa0, 0x01, 0x49, 0x49, 0xd9, 0x20, 0x5d, + 0x01, 0x49, 0x68, 0xca, 0x9d, 0x06, 0x01, 0x37, 0xb1, 0xc2, 0x01, 0xbb, + 0x01, 0x1e, 0x68, 0x0e, 0xc2, 0xb6, 0x40, 0x46, 0x02, 0xae, 0xc2, 0xb6, + 0x4c, 0xd0, 0x5d, 0x52, 0x01, 0x2f, 0x41, 0xd8, 0x24, 0x0b, 0x01, 0x2d, + 0x49, 0xda, 0x1c, 0xd4, 0x01, 0x2d, 0x31, 0xcd, 0x7a, 0x93, 0x01, 0x2d, + 0x29, 0xcf, 0x64, 0xe0, 0x01, 0x2d, 0x21, 0xd1, 0x4f, 0xbe, 0x01, 0x4f, + 0x01, 0xce, 0x74, 0xda, 0x01, 0x58, 0x91, 0xd1, 0x53, 0xba, 0x01, 0x58, + 0x98, 0xc5, 0x0a, 0xe2, 0x01, 0x18, 0x89, 0x89, 0x01, 0x9e, 0x90, 0x44, + 0x1a, 0x16, 0x42, 0xb6, 0x58, 0x44, 0x1a, 0x16, 0x42, 0xb6, 0x64, 0xc4, + 0x78, 0x47, 0x01, 0x98, 0x21, 0xc2, 0x00, 0x43, 0x01, 0x98, 0x28, 0x92, + 0x01, 0x14, 0x99, 0x8e, 0x01, 0x9c, 0x40, 0xc9, 0xad, 0x0b, 0x01, 0x9b, + 0xf8, 0x00, 0x42, 0xb6, 0x70, 0xd5, 0x35, 0x8a, 0x01, 0x56, 0x71, 0xc5, + 0xd5, 0x06, 0x01, 0x9a, 0x89, 0xc2, 0x00, 0x39, 0x01, 0x9a, 0x90, 0xc3, + 0x71, 0xec, 0x01, 0x9a, 0x99, 0xc5, 0xd8, 0xf8, 0x01, 0x9a, 0xa0, 0xc2, + 0x14, 0x48, 0x01, 0x9a, 0xa9, 0xc6, 0xcc, 0x0b, 0x01, 0x9a, 0xb0, 0xc7, + 0x04, 0x32, 0x01, 0x9d, 0x72, 0x02, 0xb6, 0x7c, 0xc3, 0x19, 0x86, 0x01, + 0x99, 0x50, 0xc6, 0xca, 0x8b, 0x01, 0x99, 0x91, 0xc4, 0xe1, 0x3f, 0x01, + 0x99, 0x99, 0xc3, 0x00, 0xea, 0x01, 0x99, 0xa8, 0xc7, 0xc8, 0x8c, 0x01, + 0x99, 0xb1, 0xc4, 0xde, 0xfb, 0x01, 0x99, 0xc8, 0x90, 0x01, 0x99, 0xf9, + 0x11, 0x42, 0xb6, 0x82, 0x83, 0x01, 0x9b, 0x88, 0xc3, 0x14, 0xc6, 0x01, + 0x99, 0x20, 0x00, 0x42, 0xb6, 0x8c, 0xd0, 0x5f, 0x32, 0x01, 0x5e, 0x81, + 0xc4, 0x0f, 0xd7, 0x01, 0x99, 0xe9, 0xc3, 0x2d, 0x61, 0x01, 0x9a, 0x00, + 0x03, 0xc2, 0xb6, 0x98, 0xc5, 0xd6, 0x64, 0x01, 0x9c, 0x00, 0xc7, 0xc4, + 0xbf, 0x01, 0x99, 0x71, 0x0d, 0x42, 0xb6, 0xa4, 0xc2, 0x00, 0xfb, 0x01, + 0x99, 0xb9, 0x10, 0xc2, 0xb6, 0xae, 0xc3, 0x90, 0x19, 0x01, 0x99, 0xd8, + 0x89, 0x01, 
0x96, 0x69, 0x47, 0xc0, 0xe4, 0x42, 0xb6, 0xba, 0xc3, 0x02, + 0x30, 0x01, 0x98, 0x59, 0x14, 0x42, 0xb6, 0xd8, 0xc6, 0xd2, 0x29, 0x01, + 0x98, 0xa9, 0xc7, 0xc3, 0xd1, 0x01, 0x98, 0xb1, 0xc5, 0xdb, 0xaa, 0x01, + 0x98, 0xb8, 0xc6, 0xcc, 0xfb, 0x01, 0x98, 0xd1, 0xc4, 0xe4, 0x27, 0x01, + 0x98, 0xd8, 0xc4, 0xdf, 0x4f, 0x01, 0x98, 0xe9, 0xc3, 0x79, 0x25, 0x01, + 0x98, 0xf0, 0x00, 0x42, 0xb6, 0xe4, 0xc3, 0x01, 0xe7, 0x01, 0x98, 0x71, + 0xc3, 0x51, 0xee, 0x01, 0x98, 0x79, 0x8e, 0x01, 0x9f, 0xf8, 0xc2, 0x01, + 0x30, 0x01, 0x98, 0x81, 0xc3, 0xe6, 0x5f, 0x01, 0x98, 0x89, 0xc5, 0xdc, + 0xae, 0x01, 0x98, 0x98, 0xc3, 0x0f, 0xd9, 0x01, 0x98, 0xc8, 0xc5, 0xd7, + 0xd6, 0x01, 0x98, 0xf9, 0xc6, 0xcb, 0xff, 0x01, 0x99, 0x00, 0x8b, 0x01, + 0x99, 0x11, 0x91, 0x01, 0x99, 0x18, 0xc2, 0x00, 0x10, 0x01, 0x99, 0x40, + 0xc5, 0xd6, 0xbe, 0x01, 0x99, 0x69, 0x94, 0x01, 0x9b, 0xa0, 0x0b, 0xc2, + 0xb6, 0xee, 0xc3, 0xe6, 0x71, 0x01, 0x9a, 0x29, 0xc4, 0xdf, 0x7b, 0x01, + 0x9a, 0x31, 0xc5, 0xda, 0x06, 0x01, 0x9a, 0x38, 0xc5, 0xdd, 0xdf, 0x01, + 0x9a, 0x41, 0xc2, 0x00, 0x2c, 0x01, 0x9a, 0x4b, 0x02, 0xb6, 0xfa, 0x8e, + 0x01, 0x9e, 0xa8, 0xc2, 0x01, 0x30, 0x01, 0x9a, 0x5b, 0x02, 0xb7, 0x00, + 0xc5, 0xc3, 0xd3, 0x01, 0x9a, 0x68, 0x88, 0x01, 0x9c, 0x61, 0x89, 0x01, + 0x9c, 0x69, 0x83, 0x01, 0x9c, 0x11, 0x8e, 0x01, 0x9c, 0xa9, 0x8f, 0x01, + 0x9c, 0xd9, 0x95, 0x01, 0x9d, 0x91, 0x98, 0x01, 0x9d, 0xb1, 0x99, 0x01, + 0x9d, 0xe0, 0x11, 0xc2, 0xb7, 0x06, 0xc7, 0x0b, 0x09, 0x01, 0x9d, 0x09, + 0xc5, 0xd9, 0x11, 0x01, 0x9d, 0x28, 0xc6, 0x03, 0x12, 0x01, 0x9e, 0xa0, + 0x00, 0x42, 0xb7, 0x15, 0xc5, 0x6d, 0xb4, 0x01, 0x9d, 0xc8, 0xc5, 0x6d, + 0xb4, 0x01, 0x9d, 0xf8, 0xc2, 0x00, 0x58, 0x01, 0x9a, 0x71, 0xc2, 0x17, + 0x99, 0x01, 0x9a, 0x78, 0x46, 0x19, 0xbb, 0xc2, 0xb7, 0x21, 0xc6, 0xd0, + 0xa3, 0x0f, 0x8d, 0x48, 0xce, 0x6e, 0x20, 0x0f, 0x8d, 0x29, 0x4f, 0x0b, + 0x17, 0x42, 0xb7, 0x2d, 0xcd, 0x7b, 0xf2, 0x0f, 0x8d, 0x09, 0xcb, 0x97, + 0x66, 0x0f, 0x8c, 0xe0, 0xc2, 0x00, 0x06, 0x0f, 0x90, 0x99, 0xc2, 0x0d, + 0xf6, 0x0f, 0x90, 0x11, 0xc4, 0xe0, 0xb7, 0x0f, 0x90, 0x08, 0xd2, 0x48, + 0xfb, 0x0f, 0x8d, 0x11, 0xc3, 0x28, 0xa9, 0x0f, 0x8c, 0xe8, 0x26, 0xc2, + 0xb7, 0x95, 0x22, 0xc2, 0xb7, 0xa1, 0x24, 0xc2, 0xb7, 0xd5, 0x23, 0xc2, + 0xb7, 0xf1, 0x25, 0xc2, 0xb8, 0x15, 0x42, 0xe6, 0x8f, 0x42, 0xb8, 0x27, + 0x8d, 0x0f, 0x8c, 0xf1, 0xcf, 0x05, 0x18, 0x01, 0x71, 0x60, 0xc9, 0x2a, + 0xec, 0x01, 0x21, 0x28, 0xc4, 0x09, 0x9d, 0x01, 0x20, 0xa1, 0x16, 0xc2, + 0xb8, 0x3d, 0xc3, 0x05, 0x14, 0x01, 0x20, 0x88, 0xc6, 0x01, 0xdb, 0x01, + 0x20, 0xc9, 0x16, 0x42, 0xb8, 0x49, 0xc3, 0x1d, 0x35, 0x00, 0x43, 0x51, + 0x42, 0x02, 0xa7, 0xc2, 0xb8, 0x58, 0xc2, 0x00, 0x39, 0x00, 0x43, 0x39, + 0xc3, 0x39, 0x6d, 0x00, 0x43, 0x31, 0x10, 0xc2, 0xb8, 0x62, 0xc3, 0x1f, + 0xdf, 0x00, 0x43, 0x19, 0xc2, 0x25, 0x3b, 0x00, 0x43, 0x08, 0xc7, 0xc2, + 0xf8, 0x00, 0x39, 0x79, 0xc6, 0xce, 0xdb, 0x00, 0x39, 0x71, 0xc5, 0xd7, + 0xae, 0x00, 0x39, 0x68, 0xc9, 0xad, 0xa4, 0x00, 0x38, 0xe0, 0xc2, 0x14, + 0xda, 0x00, 0x3a, 0x79, 0xc5, 0xdc, 0xe5, 0x00, 0x3a, 0x71, 0xc5, 0xd4, + 0x20, 0x00, 0x3a, 0x68, 0xc5, 0x05, 0x02, 0x00, 0x39, 0xd9, 0xc5, 0x00, + 0xd4, 0x00, 0x39, 0xd0, 0x48, 0x84, 0x8d, 0x42, 0xb8, 0x72, 0xcc, 0x84, + 0x8d, 0x00, 0x38, 0x40, 0xd1, 0x55, 0x52, 0x01, 0x14, 0x59, 0xcb, 0x23, + 0xa0, 0x01, 0x14, 0x3b, 0x02, 0xb8, 0x7e, 0x46, 0x00, 0xd4, 0x42, 0xb8, + 0x84, 0xc4, 0x0e, 0xa6, 0x01, 0x56, 0xa1, 0xc6, 0x2d, 0xd0, 0x01, 0x56, + 0xb0, 0x90, 0x01, 0x03, 0xf9, 0x8b, 0x01, 0x03, 0x88, 0x8f, 0x00, 0xdd, + 0xf9, 0x8d, 0x00, 0xdd, 0xf0, 0x09, 0xc2, 0xb8, 0x9c, 0xc5, 0xd4, 0xc0, + 0x00, 0xdc, 
0x00, 0xcf, 0x33, 0xad, 0x01, 0x56, 0x18, 0xcb, 0x0e, 0xbd, + 0x01, 0x56, 0x29, 0xce, 0x33, 0x92, 0x01, 0x56, 0x39, 0xcf, 0x6a, 0x8f, + 0x01, 0x56, 0x49, 0xcc, 0x24, 0x47, 0x01, 0x56, 0x58, 0x45, 0x02, 0x9a, + 0x42, 0xb8, 0xa8, 0xc3, 0x3b, 0x36, 0x0f, 0xb0, 0x39, 0xc4, 0x75, 0x6e, + 0x0f, 0xb0, 0x41, 0xd0, 0x55, 0x0f, 0x0f, 0xb0, 0x68, 0xcb, 0x1d, 0x4b, + 0x0f, 0xb0, 0x53, 0x02, 0xb8, 0xba, 0xc9, 0xb4, 0xd0, 0x0f, 0xb0, 0x70, + 0x45, 0x00, 0x8c, 0xc2, 0xb8, 0xc0, 0xc9, 0xb4, 0x49, 0x01, 0x10, 0x68, + 0x83, 0x07, 0xf2, 0x81, 0xc9, 0xb4, 0x64, 0x07, 0xf3, 0x58, 0x46, 0x00, + 0x8b, 0x42, 0xb8, 0xcc, 0xc3, 0x05, 0x14, 0x01, 0x0b, 0x83, 0x02, 0xb8, + 0xd8, 0x08, 0xc2, 0xb8, 0xdc, 0x16, 0xc2, 0xb8, 0xe6, 0x07, 0xc2, 0xb8, + 0xf6, 0xc4, 0x26, 0x78, 0x01, 0x0b, 0xc1, 0x15, 0x42, 0xb9, 0x02, 0xcb, + 0x1a, 0x50, 0x07, 0xf2, 0xd1, 0xd6, 0x08, 0x88, 0x07, 0xf2, 0xf1, 0xcd, + 0x00, 0x32, 0x07, 0xf2, 0xe0, 0xcb, 0x1a, 0x50, 0x07, 0xf2, 0xc9, 0xcd, + 0x00, 0x32, 0x07, 0xf2, 0xd9, 0xd6, 0x08, 0x88, 0x07, 0xf2, 0xe8, 0xcb, + 0x0e, 0xbd, 0x01, 0x55, 0x79, 0xcc, 0x24, 0x47, 0x01, 0x55, 0x88, 0xc8, + 0x07, 0x5f, 0x01, 0x55, 0xa9, 0xcf, 0x6a, 0x8f, 0x01, 0x55, 0xc8, 0xcb, + 0x1a, 0x50, 0x07, 0xf1, 0xa9, 0xd6, 0x08, 0x88, 0x07, 0xf1, 0xc9, 0xd8, + 0x21, 0x83, 0x07, 0xf1, 0xd9, 0xd4, 0x38, 0xf4, 0x07, 0xf1, 0xe9, 0xcd, + 0x0b, 0x91, 0x07, 0xf1, 0xf9, 0x46, 0x01, 0xfc, 0xc2, 0xb9, 0x0e, 0xce, + 0x25, 0xad, 0x07, 0xf2, 0x39, 0x05, 0x42, 0xb9, 0x1a, 0xcc, 0x00, 0x33, + 0x07, 0xf1, 0xc1, 0xcd, 0x69, 0x65, 0x07, 0xf2, 0x10, 0x4e, 0x21, 0x89, + 0xc2, 0xb9, 0x26, 0xce, 0x69, 0x64, 0x07, 0xf2, 0x20, 0xc6, 0xcf, 0x05, + 0x0f, 0x85, 0x11, 0xc6, 0x78, 0x78, 0x0f, 0x85, 0x91, 0xc8, 0xba, 0x2a, + 0x0f, 0x86, 0x11, 0xc5, 0xdd, 0x49, 0x0f, 0x86, 0x90, 0xc6, 0xcf, 0x05, + 0x0f, 0x85, 0x19, 0xc6, 0x78, 0x78, 0x0f, 0x85, 0x99, 0xc8, 0xba, 0x2a, + 0x0f, 0x86, 0x19, 0xc5, 0xdd, 0x49, 0x0f, 0x86, 0x98, 0xc6, 0xcf, 0x05, + 0x0f, 0x85, 0x51, 0xc6, 0x78, 0x78, 0x0f, 0x85, 0xd1, 0xc8, 0xba, 0x2a, + 0x0f, 0x86, 0x51, 0xc5, 0xdd, 0x49, 0x0f, 0x86, 0xd0, 0x9e, 0x0f, 0x87, + 0x0b, 0x02, 0xb9, 0x32, 0x9f, 0x0f, 0x87, 0x13, 0x02, 0xb9, 0x5a, 0xa0, + 0x0f, 0x87, 0x19, 0xa1, 0x0f, 0x87, 0x21, 0xa2, 0x0f, 0x87, 0x29, 0xa3, + 0x0f, 0x87, 0x31, 0xa4, 0x0f, 0x87, 0x39, 0xa5, 0x0f, 0x87, 0x41, 0xa6, + 0x0f, 0x87, 0x48, 0x46, 0xc5, 0x7d, 0xc2, 0xb9, 0x62, 0xc2, 0x00, 0x95, + 0x0f, 0x87, 0x00, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x29, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xa9, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x29, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xa8, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x31, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xb1, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x31, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xb0, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x39, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xb9, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x39, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xb8, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x61, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xe1, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x61, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xe0, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x71, 0xc6, 0x78, 0x78, + 0x0f, 0x85, 0xf1, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x71, 0xc5, 0xdd, 0x49, + 0x0f, 0x86, 0xf0, 0xc8, 0x01, 0x92, 0x01, 0x51, 0xc9, 0xd1, 0x51, 0x56, + 0x01, 0x51, 0x71, 0xd0, 0x5b, 0x92, 0x01, 0x51, 0x68, 0xce, 0x6b, 0x8e, + 0x01, 0x51, 0x41, 0x15, 0xc2, 0xb9, 0x7a, 0x46, 0x33, 0x92, 0xc2, 0xb9, + 0x86, 0xc9, 0x0e, 0x6e, 0x01, 0x51, 0x29, 0xd7, 0x26, 0x60, 0x01, 0x51, + 0x18, 0xc2, 0x02, 0xae, 0x00, 0x04, 0x61, 0xc8, 0xbd, 0x3a, 0x00, 0x04, + 0x61, 0xc4, 0x03, 0xc8, 0x00, 0x04, 0x59, 0xc7, 0x27, 0xb2, 0x00, 0x04, + 0x58, 0xc3, 
0x18, 0x13, 0x01, 0x24, 0x39, 0xc3, 0x22, 0x45, 0x01, 0x23, + 0xf8, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0x70, 0xc2, 0x00, 0xd3, 0x01, 0x90, + 0xc0, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0x80, 0xc2, 0x00, 0xd3, 0x01, 0x90, + 0xc8, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0x98, 0xc2, 0x00, 0xd3, 0x01, 0x90, + 0xd0, 0x00, 0x42, 0xb9, 0x92, 0xc2, 0x00, 0xd3, 0x01, 0x90, 0xb8, 0xc2, + 0x00, 0x5f, 0x01, 0x91, 0x21, 0xc2, 0x01, 0x19, 0x01, 0x91, 0x59, 0xc7, + 0xc4, 0xf0, 0x01, 0x91, 0xb0, 0xc3, 0x18, 0x11, 0x01, 0x91, 0x31, 0xc2, + 0x01, 0xd0, 0x01, 0x92, 0x10, 0x90, 0x01, 0x91, 0x81, 0xc7, 0xc8, 0x54, + 0x01, 0x91, 0xe0, 0xc3, 0x04, 0x20, 0x01, 0x91, 0x89, 0xc3, 0xe5, 0x0f, + 0x01, 0x91, 0xd8, 0xc5, 0x53, 0x93, 0x01, 0x91, 0xf1, 0x96, 0x01, 0x92, + 0x08, 0xc6, 0x26, 0xf6, 0x08, 0xd7, 0xb0, 0x9b, 0x08, 0xd7, 0x21, 0x90, + 0x08, 0xd7, 0x03, 0x02, 0xb9, 0x9a, 0x99, 0x08, 0xd7, 0x11, 0x8e, 0x08, + 0xd7, 0x09, 0x8f, 0x08, 0xd6, 0xf9, 0x96, 0x08, 0xd6, 0xf1, 0x8d, 0x08, + 0xd6, 0xe9, 0x92, 0x08, 0xd6, 0xe0, 0xc6, 0x26, 0xf6, 0x08, 0xd7, 0x68, + 0x19, 0xc2, 0xb9, 0x9e, 0xc2, 0x00, 0xc4, 0x08, 0x43, 0xf1, 0xc4, 0x02, + 0xde, 0x08, 0x43, 0xd8, 0xc3, 0x0d, 0x14, 0x08, 0x43, 0xe9, 0xc3, 0x09, + 0x9e, 0x08, 0x43, 0xe0, 0x16, 0xc2, 0xb9, 0xa8, 0x15, 0xc2, 0xb9, 0xb4, + 0xc4, 0x5d, 0xe2, 0x08, 0x43, 0xa1, 0xc4, 0xb9, 0x7e, 0x08, 0x43, 0x99, + 0xc2, 0x00, 0x67, 0x08, 0x43, 0x89, 0x03, 0xc2, 0xb9, 0xbe, 0xc3, 0x20, + 0x18, 0x08, 0x43, 0x71, 0xc9, 0xb3, 0x5f, 0x08, 0x43, 0x69, 0xc3, 0x00, + 0x4e, 0x08, 0x43, 0x61, 0xc6, 0xcf, 0xd7, 0x08, 0x43, 0x59, 0xc4, 0xe0, + 0xe7, 0x08, 0x43, 0x51, 0xc4, 0x4a, 0xb9, 0x08, 0x43, 0x49, 0xc2, 0x01, + 0x7f, 0x08, 0x43, 0x23, 0x02, 0xb9, 0xca, 0xc5, 0x4a, 0xb3, 0x08, 0x43, + 0x31, 0xc3, 0x7e, 0x89, 0x08, 0x43, 0x29, 0xc6, 0x40, 0x9a, 0x08, 0x43, + 0x19, 0xc5, 0x9c, 0xa2, 0x08, 0x43, 0x11, 0xc4, 0xe3, 0x27, 0x08, 0x43, + 0x08, 0xc2, 0x15, 0xb0, 0x0b, 0x5c, 0x69, 0xc2, 0x00, 0x03, 0x0b, 0x5c, + 0x31, 0xc4, 0x9f, 0x7d, 0x0b, 0x5b, 0xe8, 0xc3, 0xa6, 0x62, 0x0b, 0x59, + 0x59, 0xc3, 0x48, 0x8d, 0x0b, 0x58, 0xe8, 0xc5, 0xd6, 0x23, 0x0b, 0x5b, + 0xa8, 0xc4, 0xe0, 0x3f, 0x0b, 0x59, 0xf9, 0xc3, 0x49, 0x2f, 0x0b, 0x59, + 0xf1, 0xc3, 0x79, 0xe7, 0x0b, 0x59, 0xe9, 0xc5, 0xda, 0x38, 0x0b, 0x59, + 0xe0, 0xc3, 0x44, 0x23, 0x0b, 0x59, 0xd1, 0xc2, 0x00, 0x7a, 0x0b, 0x59, + 0xb8, 0xc8, 0xbe, 0x3a, 0x0b, 0x5b, 0x01, 0xc9, 0x4b, 0x94, 0x0b, 0x5a, + 0xe8, 0x04, 0xc2, 0xb9, 0xd0, 0xcc, 0x87, 0x09, 0x0f, 0xb2, 0x79, 0xcc, + 0x85, 0xc5, 0x0f, 0xb2, 0x71, 0xc9, 0xa8, 0x31, 0x0f, 0xce, 0xa9, 0xc5, + 0xda, 0x01, 0x0f, 0xd6, 0x28, 0xe0, 0x07, 0x67, 0x0f, 0xb2, 0x60, 0xcb, + 0x92, 0x6a, 0x0f, 0xce, 0xb1, 0xce, 0x6e, 0x12, 0x0f, 0xce, 0xc0, 0x91, + 0x08, 0x48, 0xd1, 0xc4, 0x18, 0x12, 0x08, 0x48, 0xc0, 0xc9, 0x1e, 0x8b, + 0x05, 0x43, 0x98, 0x83, 0x05, 0x42, 0x81, 0xc2, 0x00, 0xd0, 0x05, 0x42, + 0x88, 0x83, 0x05, 0x43, 0x49, 0xc2, 0x00, 0xd0, 0x05, 0x43, 0x50, 0xc2, + 0x01, 0x4a, 0x05, 0x43, 0x39, 0xc2, 0x19, 0x2c, 0x05, 0x43, 0x41, 0xc2, + 0x00, 0x39, 0x05, 0x43, 0x88, 0xd4, 0x38, 0xe0, 0x08, 0x0f, 0xe8, 0xc4, + 0x1e, 0x97, 0x00, 0x4a, 0x69, 0xc5, 0x40, 0xe7, 0x00, 0x48, 0x18, 0xc7, + 0x7a, 0x7f, 0x00, 0x49, 0xe9, 0xc7, 0x14, 0x39, 0x00, 0x48, 0x10, 0x00, + 0x42, 0xb9, 0xdc, 0xc6, 0xc3, 0x62, 0x05, 0x47, 0xe1, 0xd2, 0x4a, 0x87, + 0x05, 0x47, 0x90, 0x94, 0x00, 0x4a, 0x20, 0x8e, 0x00, 0x4b, 0x18, 0x87, + 0x00, 0x4a, 0xb8, 0x83, 0x00, 0x49, 0xb1, 0x44, 0x2e, 0xf0, 0x42, 0xb9, + 0xec, 0x8e, 0x00, 0x48, 0x63, 0x02, 0xb9, 0xf8, 0x94, 0x00, 0x48, 0x5a, + 0x02, 0xb9, 0xfc, 0xc2, 0x00, 0xdb, 0x00, 0x49, 0xa1, 0x83, 0x00, 0x49, + 0x98, 0xc2, 
0x00, 0xc1, 0x00, 0x49, 0x49, 0x83, 0x00, 0x49, 0x18, 0xc2, + 0x00, 0xd0, 0x00, 0x49, 0x11, 0x83, 0x00, 0x49, 0x09, 0x06, 0x42, 0xba, + 0x00, 0xc2, 0x00, 0xd0, 0x00, 0x49, 0x01, 0x83, 0x00, 0x48, 0xf8, 0x45, + 0xc7, 0x7d, 0x42, 0xba, 0x0a, 0x83, 0x00, 0x48, 0xc1, 0xc2, 0x00, 0xd0, + 0x00, 0x4a, 0xd0, 0x83, 0x00, 0x48, 0xb1, 0xc2, 0x00, 0xd0, 0x00, 0x4a, + 0xc8, 0x87, 0x00, 0x4b, 0xb8, 0xc4, 0x18, 0x10, 0x00, 0x4b, 0x69, 0xc2, + 0x22, 0xcc, 0x00, 0x4b, 0x60, 0xc3, 0x0d, 0x14, 0x00, 0x4b, 0x59, 0xc3, + 0x09, 0x9e, 0x00, 0x4b, 0x50, 0xc4, 0x02, 0xde, 0x00, 0x4b, 0x49, 0xc2, + 0x02, 0xa0, 0x00, 0x4b, 0x40, 0x8b, 0x08, 0x20, 0x01, 0x83, 0x08, 0x20, + 0x13, 0x02, 0xba, 0x16, 0x91, 0x08, 0x20, 0x23, 0x02, 0xba, 0x1a, 0x87, + 0x08, 0x20, 0x08, 0x8b, 0x08, 0x20, 0x31, 0x87, 0x08, 0x20, 0x39, 0x83, + 0x08, 0x20, 0x43, 0x02, 0xba, 0x1e, 0x91, 0x08, 0x20, 0x52, 0x02, 0xba, + 0x22, 0x99, 0x08, 0x20, 0x69, 0x8b, 0x08, 0x21, 0x30, 0xc2, 0x02, 0xe0, + 0x08, 0x20, 0x99, 0xc3, 0x0e, 0x65, 0x08, 0x20, 0xe0, 0x88, 0x08, 0x20, + 0xc9, 0xc2, 0x00, 0x8e, 0x08, 0x20, 0xd9, 0x95, 0x08, 0x20, 0xeb, 0x02, + 0xba, 0x26, 0x94, 0x08, 0x21, 0x09, 0x8e, 0x08, 0x21, 0x11, 0x8f, 0x08, + 0x21, 0x19, 0x90, 0x08, 0x21, 0x23, 0x02, 0xba, 0x2a, 0x99, 0x08, 0x21, + 0x38, 0xc2, 0x02, 0xe0, 0x08, 0x20, 0xf1, 0xc3, 0x0e, 0x65, 0x08, 0x21, + 0x00, 0x8b, 0x08, 0x21, 0x41, 0x87, 0x08, 0x21, 0x49, 0x83, 0x08, 0x21, + 0x53, 0x02, 0xba, 0x2e, 0x91, 0x08, 0x21, 0x62, 0x02, 0xba, 0x32, 0x8b, + 0x08, 0x21, 0x71, 0x87, 0x08, 0x21, 0x79, 0x83, 0x08, 0x21, 0x83, 0x02, + 0xba, 0x36, 0x91, 0x08, 0x21, 0x92, 0x02, 0xba, 0x3a, 0x99, 0x08, 0x21, + 0xa9, 0x8b, 0x08, 0x22, 0x70, 0xc2, 0x02, 0xe0, 0x08, 0x21, 0xd9, 0xc3, + 0x0e, 0x65, 0x08, 0x22, 0x20, 0x88, 0x08, 0x22, 0x09, 0xc2, 0x00, 0x8e, + 0x08, 0x22, 0x19, 0x95, 0x08, 0x22, 0x2b, 0x02, 0xba, 0x3e, 0x94, 0x08, + 0x22, 0x49, 0x8e, 0x08, 0x22, 0x51, 0x8f, 0x08, 0x22, 0x59, 0x90, 0x08, + 0x22, 0x63, 0x02, 0xba, 0x42, 0x99, 0x08, 0x22, 0x78, 0xc2, 0x02, 0xe0, + 0x08, 0x22, 0x31, 0xc3, 0x0e, 0x65, 0x08, 0x22, 0x40, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x71, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x40, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x69, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x38, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x61, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x30, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x59, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x28, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x51, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x20, 0xc9, 0x11, 0xf6, + 0x01, 0x24, 0x49, 0xc5, 0x0a, 0x8a, 0x0f, 0x88, 0x18, 0xc4, 0x18, 0x10, + 0x08, 0xca, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0xca, 0xb0, 0xc3, 0x0d, 0x14, + 0x08, 0xca, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0xca, 0xa0, 0xc4, 0x02, 0xde, + 0x08, 0xca, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0xca, 0x90, 0x8b, 0x08, 0xc9, + 0xb9, 0x83, 0x08, 0xc9, 0x80, 0x97, 0x08, 0xc9, 0xa0, 0x8b, 0x08, 0xc9, + 0x90, 0xc2, 0x00, 0xd0, 0x08, 0xc8, 0xc9, 0x83, 0x08, 0xc8, 0xc0, 0xc4, + 0x18, 0x10, 0x01, 0x3c, 0x81, 0xc2, 0x22, 0xcc, 0x01, 0x3c, 0x78, 0xc3, + 0x0d, 0x14, 0x01, 0x3c, 0x71, 0xc3, 0x09, 0x9e, 0x01, 0x3c, 0x68, 0xc4, + 0x02, 0xde, 0x01, 0x3c, 0x61, 0xc2, 0x02, 0xa0, 0x01, 0x3c, 0x58, 0x45, + 0x01, 0x95, 0xc2, 0xba, 0x46, 0xc9, 0x61, 0x53, 0x01, 0x48, 0x58, 0xcd, + 0x7e, 0x3b, 0x01, 0x0d, 0x09, 0x46, 0x01, 0x9a, 0x42, 0xba, 0x52, 0xc5, + 0x01, 0xa2, 0x0f, 0xc2, 0x39, 0xd0, 0x58, 0x62, 0x0f, 0xc2, 0x18, 0x44, + 0x00, 0x49, 0xc2, 0xba, 0x58, 0x45, 0x00, 0x2c, 0x42, 0xba, 0x62, 0x00, + 0x42, 0xba, 0x6c, 0xca, 0xa3, 0x64, 0x01, 0x27, 0xf1, 0x46, 0x09, 0x97, + 0x42, 0xba, 0x8a, 0x00, 0x42, 0xba, 0xa8, 0xc6, 0x2d, 0xd0, 0x01, 0x16, + 0x89, 0xc4, 
0x0e, 0xa6, 0x01, 0x16, 0x81, 0xc6, 0xb7, 0x74, 0x01, 0x55, + 0xe1, 0xcd, 0x6c, 0x99, 0x01, 0x72, 0x20, 0xc5, 0x13, 0x84, 0x01, 0x52, + 0x79, 0xcc, 0x06, 0xbb, 0x01, 0x52, 0x70, 0xcd, 0x68, 0xc0, 0x01, 0x57, + 0x61, 0xcb, 0x8d, 0x42, 0x01, 0x72, 0x48, 0xc3, 0x03, 0x4e, 0x01, 0x01, + 0x9b, 0x02, 0xba, 0xb4, 0xc6, 0xbf, 0x4c, 0x01, 0x55, 0xd8, 0x19, 0xc2, + 0xba, 0xba, 0x46, 0x19, 0xbb, 0x42, 0xba, 0xc4, 0xce, 0x55, 0x99, 0x01, + 0x55, 0x18, 0x46, 0x03, 0x13, 0xc2, 0xba, 0xd0, 0xc9, 0xb2, 0xbd, 0x01, + 0x0a, 0x28, 0x92, 0x01, 0x08, 0xcb, 0x02, 0xba, 0xe0, 0xc5, 0x51, 0x51, + 0x01, 0x09, 0xf1, 0x9c, 0x01, 0x09, 0x21, 0x94, 0x01, 0x08, 0xe9, 0x93, + 0x01, 0x08, 0xd1, 0x90, 0x01, 0x08, 0xa9, 0x8a, 0x01, 0x08, 0x69, 0x85, + 0x01, 0x08, 0x10, 0xc5, 0x51, 0x51, 0x01, 0x09, 0xe9, 0xc2, 0x0b, 0x19, + 0x01, 0x09, 0xe0, 0xc9, 0x00, 0xca, 0x01, 0x54, 0xc9, 0xcc, 0x07, 0xc7, + 0x01, 0x54, 0xd0, 0x4c, 0x24, 0xe3, 0xc2, 0xba, 0xe4, 0xd5, 0x38, 0x3f, + 0x01, 0x57, 0xc9, 0xd8, 0x23, 0x93, 0x01, 0x57, 0xd0, 0xc2, 0x00, 0xd0, + 0x08, 0xc0, 0xb9, 0x83, 0x08, 0xc0, 0xb0, 0xc2, 0x00, 0xd0, 0x08, 0xc0, + 0xa9, 0x83, 0x08, 0xc0, 0xa0, 0xc4, 0x01, 0xa3, 0x0d, 0xe4, 0xc9, 0xc4, + 0x31, 0xef, 0x0d, 0xe4, 0x80, 0xc7, 0x27, 0x9b, 0x0d, 0xe3, 0x98, 0xc3, + 0x02, 0x6e, 0x0d, 0xe4, 0xb1, 0xc9, 0xac, 0xf0, 0x0d, 0xe4, 0x98, 0xc5, + 0x01, 0x22, 0x0d, 0xe3, 0xe0, 0xc2, 0x00, 0x2b, 0x0d, 0xe1, 0xa8, 0xc2, + 0x00, 0x2b, 0x0d, 0xe1, 0x98, 0xc2, 0x00, 0x3f, 0x0d, 0xe1, 0x70, 0xc6, + 0x05, 0x01, 0x0d, 0xe1, 0x30, 0xc2, 0x00, 0x2b, 0x0d, 0xe2, 0x00, 0x90, + 0x0d, 0xe3, 0x49, 0x99, 0x0d, 0xe2, 0x10, 0x90, 0x0d, 0xe3, 0x39, 0x87, + 0x0d, 0xe2, 0x71, 0x8a, 0x0d, 0xe2, 0x60, 0xc2, 0x00, 0x3f, 0x0d, 0xe1, + 0x88, 0xc9, 0x33, 0xad, 0x0d, 0xe1, 0x78, 0xc2, 0x00, 0x3f, 0x0d, 0xe1, + 0x68, 0xd2, 0x4e, 0x9b, 0x0d, 0xe1, 0x20, 0xc2, 0x00, 0x3f, 0x0d, 0xe1, + 0x60, 0xc2, 0x00, 0x3f, 0x0d, 0xe1, 0x58, 0xd0, 0x5e, 0xe2, 0x01, 0x3e, + 0x41, 0xd6, 0x30, 0x7a, 0x01, 0x4f, 0xb9, 0xc8, 0x18, 0x67, 0x01, 0x4f, + 0xa8, 0xc7, 0x0e, 0xbc, 0x01, 0x16, 0x68, 0xc9, 0xb2, 0xfc, 0x0f, 0xac, + 0x99, 0xc7, 0xc4, 0x6b, 0x0f, 0xac, 0x90, 0xcf, 0x01, 0xb8, 0x01, 0x80, + 0xe8, 0xcc, 0x84, 0x99, 0x01, 0x1d, 0x31, 0xc9, 0x57, 0x36, 0x01, 0x1d, + 0x29, 0xcc, 0x80, 0xcd, 0x01, 0x1d, 0x21, 0x45, 0x00, 0x8c, 0x42, 0xba, + 0xf0, 0x46, 0x00, 0x8b, 0x42, 0xbb, 0x0e, 0xd6, 0x06, 0xd1, 0x0f, 0xdb, + 0xf9, 0xd6, 0x2d, 0x36, 0x0f, 0xdb, 0xf0, 0xc2, 0x00, 0x49, 0x01, 0x10, + 0xfb, 0x02, 0xbb, 0x1a, 0xc9, 0xb2, 0x75, 0x0f, 0xaf, 0x78, 0xcc, 0x8a, + 0xed, 0x01, 0x3f, 0xa1, 0xcc, 0x12, 0x2d, 0x01, 0x0f, 0xa0, 0x44, 0x04, + 0x91, 0xc2, 0xbb, 0x1e, 0xc3, 0x04, 0x20, 0x01, 0x2c, 0x80, 0xca, 0xa2, + 0x74, 0x01, 0x1d, 0x69, 0xcc, 0x82, 0xe9, 0x01, 0x1d, 0x61, 0xca, 0xa3, + 0x5a, 0x01, 0x1d, 0x58, 0xc2, 0x00, 0x49, 0x01, 0x15, 0xfb, 0x02, 0xbb, + 0x2a, 0xd6, 0x14, 0xf9, 0x0f, 0xdb, 0x70, 0xcd, 0x3f, 0xe8, 0x0f, 0xdc, + 0x41, 0xce, 0x08, 0x79, 0x0f, 0xdc, 0x50, 0xd6, 0x30, 0xfe, 0x01, 0x4b, + 0x81, 0xcc, 0x0b, 0x92, 0x01, 0x80, 0x58, 0xcc, 0x00, 0x33, 0x01, 0x4c, + 0x21, 0xcd, 0x69, 0x65, 0x01, 0x80, 0x78, 0xd9, 0x1b, 0xd1, 0x0f, 0xc4, + 0xb1, 0xc9, 0xb0, 0x8f, 0x01, 0x0f, 0x80, 0xca, 0x03, 0xdd, 0x0f, 0xc4, + 0x91, 0x48, 0x01, 0x9a, 0x42, 0xbb, 0x30, 0xc5, 0x01, 0xa2, 0x01, 0x0e, + 0xd9, 0xca, 0x52, 0xc2, 0x01, 0x48, 0x78, 0x46, 0x02, 0x5c, 0xc2, 0xbb, + 0x45, 0xd1, 0x52, 0xbb, 0x01, 0x48, 0x80, 0xd6, 0x2b, 0x94, 0x01, 0x0e, + 0x61, 0x4a, 0x01, 0x58, 0x42, 0xbb, 0x51, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, + 0xb1, 0x0e, 0xc2, 0xbb, 0x5d, 0x15, 0xc2, 0xbb, 0x69, 0x42, 0x00, 0x58, + 0xc2, 0xbb, 
0x75, 0xcf, 0x2c, 0x35, 0x01, 0x0f, 0xc1, 0xd0, 0x58, 0x12, + 0x01, 0x0d, 0xa1, 0xc4, 0x01, 0x23, 0x01, 0x0d, 0x51, 0x16, 0xc2, 0xbb, + 0x81, 0xca, 0x9e, 0x28, 0x01, 0x4a, 0x29, 0xd9, 0x1f, 0xf9, 0x0f, 0xc0, + 0x31, 0xcc, 0x84, 0xb1, 0x0f, 0xc4, 0xd0, 0x43, 0x10, 0x9e, 0xc2, 0xbb, + 0x90, 0x47, 0x25, 0xf3, 0x42, 0xbb, 0x9c, 0xd1, 0x56, 0x73, 0x01, 0x49, + 0x00, 0x45, 0x00, 0xd5, 0xc2, 0xbb, 0xac, 0x43, 0x02, 0x9c, 0x42, 0xbb, + 0xc4, 0x00, 0xc2, 0xbb, 0xca, 0xc5, 0x14, 0xa5, 0x01, 0x48, 0xe0, 0xc9, + 0x57, 0x20, 0x01, 0x0c, 0x40, 0xc4, 0xe4, 0x87, 0x01, 0x0c, 0x00, 0x00, + 0x42, 0xbb, 0xd6, 0x00, 0x42, 0xbb, 0xe2, 0xe0, 0x0b, 0xc7, 0x0f, 0xac, + 0xb0, 0x03, 0xc2, 0xbb, 0xee, 0xc2, 0x16, 0x1c, 0x00, 0xb7, 0xb1, 0xc2, + 0x00, 0xfa, 0x00, 0xb7, 0xa9, 0xc2, 0x07, 0xa3, 0x00, 0xb7, 0xa0, 0x49, + 0xad, 0x1d, 0x42, 0xbb, 0xf8, 0xc2, 0x00, 0xe7, 0x00, 0xb5, 0xa1, 0x83, + 0x00, 0xb5, 0x90, 0xc3, 0x72, 0x57, 0x00, 0xb6, 0xe0, 0xc2, 0x1d, 0xc1, + 0x00, 0xb7, 0x31, 0xc6, 0xd2, 0x35, 0x00, 0xb6, 0xc1, 0xc5, 0xd6, 0x82, + 0x00, 0xb6, 0x29, 0xc8, 0xbf, 0x3a, 0x00, 0xb5, 0xe1, 0xc5, 0x71, 0x4d, + 0x00, 0xb5, 0x60, 0xc3, 0x67, 0x02, 0x00, 0xb7, 0x21, 0x90, 0x00, 0xb5, + 0x98, 0x8e, 0x00, 0xb6, 0xd9, 0x92, 0x00, 0xb6, 0xa1, 0x90, 0x00, 0xb6, + 0x00, 0x94, 0x00, 0xb6, 0x21, 0xc9, 0xb3, 0xe6, 0x00, 0xb5, 0xb8, 0x90, + 0x05, 0x28, 0x08, 0x87, 0x05, 0x28, 0x11, 0x90, 0x05, 0x2f, 0x28, 0x90, + 0x05, 0x29, 0x38, 0x90, 0x05, 0x2a, 0x68, 0x91, 0x05, 0x2b, 0x99, 0x90, + 0x05, 0x2d, 0xf0, 0x90, 0x05, 0x2c, 0xc0, 0x87, 0x05, 0x28, 0x1b, 0x02, + 0xbc, 0x2e, 0x90, 0x05, 0x2f, 0x38, 0x90, 0x05, 0x29, 0x48, 0x90, 0x05, + 0x2a, 0x78, 0x91, 0x05, 0x2b, 0xa3, 0x02, 0xbc, 0x32, 0x90, 0x05, 0x2e, + 0x00, 0x90, 0x05, 0x2c, 0xd0, 0x87, 0x05, 0x28, 0x28, 0x91, 0x05, 0x2b, + 0xb0, 0x87, 0x05, 0x2f, 0x4b, 0x02, 0xbc, 0x36, 0x8b, 0x05, 0x29, 0x59, + 0x83, 0x05, 0x2a, 0x89, 0x91, 0x05, 0x2e, 0x13, 0x02, 0xbc, 0x3a, 0x97, + 0x05, 0x2c, 0xe0, 0x87, 0x05, 0x28, 0x38, 0x91, 0x05, 0x2b, 0xc0, 0x87, + 0x05, 0x2f, 0x5b, 0x02, 0xbc, 0x3e, 0x8b, 0x05, 0x29, 0x69, 0x83, 0x05, + 0x2a, 0x99, 0x91, 0x05, 0x2e, 0x23, 0x02, 0xbc, 0x42, 0x97, 0x05, 0x2c, + 0xf0, 0x87, 0x05, 0x2f, 0x73, 0x02, 0xbc, 0x46, 0x8b, 0x05, 0x29, 0x79, + 0x83, 0x05, 0x2a, 0xb1, 0x91, 0x05, 0x2e, 0x33, 0x02, 0xbc, 0x4a, 0x97, + 0x05, 0x2d, 0x00, 0x87, 0x05, 0x29, 0x08, 0x91, 0x05, 0x2c, 0x90, 0x87, + 0x05, 0x2f, 0x63, 0x02, 0xbc, 0x4e, 0x8b, 0x05, 0x29, 0x71, 0x83, 0x05, + 0x2a, 0xa3, 0x02, 0xbc, 0x56, 0x91, 0x05, 0x2e, 0x2b, 0x02, 0xbc, 0x5a, + 0x97, 0x05, 0x2c, 0xf8, 0x87, 0x05, 0x28, 0xf0, 0x90, 0x05, 0x2b, 0x58, + 0x91, 0x05, 0x2c, 0x78, 0x87, 0x05, 0x2f, 0x7b, 0x02, 0xbc, 0x5e, 0x8b, + 0x05, 0x29, 0x81, 0x83, 0x05, 0x2a, 0xb9, 0x91, 0x05, 0x2e, 0x3b, 0x02, + 0xbc, 0x66, 0x97, 0x05, 0x2d, 0x08, 0x87, 0x05, 0x29, 0x01, 0x90, 0x05, + 0x30, 0x38, 0x91, 0x05, 0x2c, 0x88, 0x87, 0x05, 0x28, 0x60, 0x91, 0x05, + 0x2b, 0xe8, 0x87, 0x05, 0x28, 0x68, 0x91, 0x05, 0x2b, 0xf0, 0x87, 0x05, + 0x28, 0x70, 0x87, 0x05, 0x2f, 0xa3, 0x02, 0xbc, 0x6e, 0x8b, 0x05, 0x29, + 0xa1, 0x83, 0x05, 0x2a, 0xd9, 0x91, 0x05, 0x2e, 0x63, 0x02, 0xbc, 0x72, + 0x97, 0x05, 0x2d, 0x28, 0x91, 0x05, 0x2b, 0xf8, 0x87, 0x05, 0x2f, 0xab, + 0x02, 0xbc, 0x76, 0x0a, 0xc2, 0xbc, 0x7a, 0x8b, 0x05, 0x29, 0xa9, 0x83, + 0x05, 0x2a, 0xe1, 0x91, 0x05, 0x2e, 0x6b, 0x02, 0xbc, 0x94, 0x97, 0x05, + 0x2d, 0x30, 0x87, 0x05, 0x28, 0xa0, 0x91, 0x05, 0x2c, 0x28, 0x87, 0x05, + 0x28, 0x91, 0xc8, 0x4a, 0xd9, 0x05, 0x30, 0x60, 0x91, 0x05, 0x2c, 0x18, + 0x87, 0x05, 0x28, 0x98, 0x91, 0x05, 0x2c, 0x20, 0x87, 0x05, 0x2f, 0xd3, + 0x02, 0xbc, 
0x98, 0x8b, 0x05, 0x29, 0xd1, 0x83, 0x05, 0x2b, 0x09, 0x91, + 0x05, 0x2e, 0x93, 0x02, 0xbc, 0x9c, 0x97, 0x05, 0x2d, 0x58, 0x87, 0x05, + 0x30, 0x0b, 0x02, 0xbc, 0xa6, 0x8b, 0x05, 0x2a, 0x09, 0x83, 0x05, 0x2b, + 0x41, 0x91, 0x05, 0x2e, 0xcb, 0x02, 0xbc, 0xaa, 0x97, 0x05, 0x2d, 0x90, + 0x09, 0xc2, 0xbc, 0xae, 0xc2, 0x00, 0xd1, 0x05, 0x2a, 0x59, 0xc2, 0x00, + 0x45, 0x05, 0x2d, 0xe1, 0xc2, 0x00, 0xc4, 0x05, 0x2f, 0x18, 0x87, 0x05, + 0x29, 0x10, 0x87, 0x05, 0x30, 0x53, 0x02, 0xbc, 0xc8, 0x8b, 0x05, 0x2a, + 0x41, 0x83, 0x05, 0x2b, 0x81, 0x91, 0x05, 0x2f, 0x03, 0x02, 0xbc, 0xcc, + 0x97, 0x05, 0x2d, 0xc8, 0x91, 0x05, 0x2c, 0x98, 0x87, 0x05, 0x28, 0xb0, + 0x87, 0x05, 0x2f, 0xe3, 0x02, 0xbc, 0xd0, 0x8b, 0x05, 0x29, 0xe1, 0x83, + 0x05, 0x2b, 0x19, 0x91, 0x05, 0x2e, 0xa3, 0x02, 0xbc, 0xd4, 0x97, 0x05, + 0x2d, 0x68, 0x91, 0x05, 0x2c, 0x38, 0x87, 0x05, 0x28, 0xc0, 0x87, 0x05, + 0x2f, 0xf3, 0x02, 0xbc, 0xd8, 0x8b, 0x05, 0x29, 0xf1, 0x83, 0x05, 0x2b, + 0x29, 0x91, 0x05, 0x2e, 0xb3, 0x02, 0xbc, 0xdc, 0x97, 0x05, 0x2d, 0x78, + 0x91, 0x05, 0x2c, 0x48, 0x87, 0x05, 0x28, 0xd0, 0x91, 0x05, 0x2c, 0x58, + 0x87, 0x05, 0x28, 0xd8, 0x91, 0x05, 0x2c, 0x60, 0x87, 0x05, 0x28, 0xe8, + 0x91, 0x05, 0x2c, 0x70, 0x90, 0x05, 0x2b, 0x90, 0xc3, 0x08, 0x48, 0x05, + 0x30, 0xd9, 0xc2, 0x37, 0xea, 0x05, 0x30, 0xf0, 0xca, 0x3b, 0x06, 0x01, + 0x1b, 0xf9, 0x47, 0x02, 0xd1, 0x42, 0xbc, 0xe0, 0xc4, 0xb2, 0xf8, 0x00, + 0x04, 0x50, 0xca, 0x99, 0x1f, 0x01, 0x81, 0x99, 0xca, 0x01, 0xc8, 0x01, + 0x81, 0xa8, 0xca, 0xa5, 0x12, 0x00, 0xe7, 0x60, 0xce, 0x25, 0xad, 0x70, + 0x02, 0xd9, 0xcb, 0x1a, 0x50, 0x70, 0x01, 0x41, 0xcd, 0x00, 0x32, 0x70, + 0x03, 0xd8, 0x9c, 0x70, 0x02, 0xd1, 0x9b, 0x70, 0x02, 0xc9, 0x9a, 0x70, + 0x02, 0xc1, 0x99, 0x70, 0x02, 0xb9, 0x98, 0x70, 0x02, 0xb1, 0x97, 0x70, + 0x02, 0xa9, 0x96, 0x70, 0x02, 0xa1, 0x95, 0x70, 0x02, 0x99, 0x94, 0x70, + 0x02, 0x91, 0x93, 0x70, 0x02, 0x89, 0x92, 0x70, 0x02, 0x81, 0x91, 0x70, + 0x02, 0x79, 0x90, 0x70, 0x02, 0x71, 0x8f, 0x70, 0x02, 0x69, 0x8e, 0x70, + 0x02, 0x61, 0x8d, 0x70, 0x02, 0x59, 0x8c, 0x70, 0x02, 0x51, 0x8b, 0x70, + 0x02, 0x49, 0x8a, 0x70, 0x02, 0x41, 0x89, 0x70, 0x02, 0x39, 0x88, 0x70, + 0x02, 0x31, 0x87, 0x70, 0x02, 0x29, 0x86, 0x70, 0x02, 0x21, 0x85, 0x70, + 0x02, 0x19, 0x84, 0x70, 0x02, 0x11, 0x83, 0x70, 0x02, 0x08, 0x9c, 0x70, + 0x03, 0xd1, 0x9b, 0x70, 0x03, 0xc9, 0x9a, 0x70, 0x03, 0xc1, 0x99, 0x70, + 0x03, 0xb9, 0x98, 0x70, 0x03, 0xb1, 0x97, 0x70, 0x03, 0xa9, 0x96, 0x70, + 0x03, 0xa1, 0x95, 0x70, 0x03, 0x99, 0x94, 0x70, 0x03, 0x91, 0x93, 0x70, + 0x03, 0x89, 0x92, 0x70, 0x03, 0x81, 0x91, 0x70, 0x03, 0x79, 0x90, 0x70, + 0x03, 0x71, 0x8f, 0x70, 0x03, 0x69, 0x8e, 0x70, 0x03, 0x61, 0x8d, 0x70, + 0x03, 0x59, 0x8c, 0x70, 0x03, 0x51, 0x8b, 0x70, 0x03, 0x49, 0x8a, 0x70, + 0x03, 0x41, 0x89, 0x70, 0x03, 0x39, 0x88, 0x70, 0x03, 0x31, 0x87, 0x70, + 0x03, 0x29, 0x86, 0x70, 0x03, 0x21, 0x85, 0x70, 0x03, 0x19, 0x84, 0x70, + 0x03, 0x11, 0x83, 0x70, 0x03, 0x08, 0xc9, 0xb4, 0x64, 0x70, 0x02, 0x01, + 0x83, 0x70, 0x01, 0x60, 0xc4, 0x18, 0x10, 0x70, 0x01, 0xb9, 0xc2, 0x22, + 0xcc, 0x70, 0x01, 0xb0, 0xc3, 0x0d, 0x14, 0x70, 0x01, 0xa9, 0xc3, 0x09, + 0x9e, 0x70, 0x01, 0xa0, 0xc4, 0x02, 0xde, 0x70, 0x01, 0x99, 0xc2, 0x02, + 0xa0, 0x70, 0x01, 0x90, 0x23, 0xc2, 0xbc, 0xec, 0x22, 0xc2, 0xbd, 0x10, + 0x21, 0xc2, 0xbd, 0x38, 0x20, 0xc2, 0xbd, 0x60, 0x1f, 0xc2, 0xbd, 0x88, + 0x1e, 0xc2, 0xbd, 0xb0, 0x1d, 0x42, 0xbd, 0xd8, 0x26, 0xc2, 0xbe, 0x00, + 0x25, 0xc2, 0xbe, 0x28, 0x24, 0xc2, 0xbe, 0x50, 0x23, 0xc2, 0xbe, 0x78, + 0x22, 0xc2, 0xbe, 0xa0, 0x21, 0xc2, 0xbe, 0xc8, 0x20, 0xc2, 0xbe, 0xf0, + 0x1f, 0xc2, 
0xbf, 0x18, 0x1e, 0xc2, 0xbf, 0x40, 0x1d, 0x42, 0xbf, 0x68, + 0x26, 0xc2, 0xbf, 0x90, 0x25, 0xc2, 0xbf, 0xb8, 0x24, 0xc2, 0xbf, 0xe0, + 0x23, 0xc2, 0xc0, 0x08, 0x22, 0xc2, 0xc0, 0x30, 0x21, 0xc2, 0xc0, 0x58, + 0x20, 0xc2, 0xc0, 0x80, 0x1f, 0xc2, 0xc0, 0xa8, 0x1e, 0xc2, 0xc0, 0xd0, + 0x1d, 0x42, 0xc0, 0xf8, 0x26, 0xc2, 0xc1, 0x20, 0x25, 0xc2, 0xc1, 0x48, + 0x24, 0xc2, 0xc1, 0x70, 0x23, 0xc2, 0xc1, 0x98, 0x22, 0xc2, 0xc1, 0xc0, + 0x21, 0xc2, 0xc1, 0xe8, 0x20, 0xc2, 0xc2, 0x10, 0x1f, 0xc2, 0xc2, 0x38, + 0x1e, 0xc2, 0xc2, 0x60, 0x1d, 0x42, 0xc2, 0x88, 0x26, 0xc2, 0xc2, 0xb0, + 0x25, 0xc2, 0xc2, 0xd8, 0x24, 0xc2, 0xc3, 0x00, 0x23, 0xc2, 0xc3, 0x28, + 0x22, 0xc2, 0xc3, 0x50, 0x21, 0xc2, 0xc3, 0x78, 0x20, 0xc2, 0xc3, 0xa0, + 0x1f, 0xc2, 0xc3, 0xc8, 0x1e, 0xc2, 0xc3, 0xf0, 0x1d, 0x42, 0xc4, 0x18, + 0x26, 0xc2, 0xc4, 0x40, 0x25, 0xc2, 0xc4, 0x68, 0x24, 0xc2, 0xc4, 0x90, + 0x23, 0xc2, 0xc4, 0xb8, 0x22, 0xc2, 0xc4, 0xe0, 0x21, 0xc2, 0xc5, 0x08, + 0x20, 0xc2, 0xc5, 0x30, 0x1f, 0xc2, 0xc5, 0x58, 0x1e, 0xc2, 0xc5, 0x80, + 0x1d, 0x42, 0xc5, 0xa8, 0x26, 0xc2, 0xc5, 0xd0, 0x25, 0xc2, 0xc5, 0xf8, + 0x24, 0xc2, 0xc6, 0x20, 0x23, 0xc2, 0xc6, 0x48, 0x22, 0xc2, 0xc6, 0x70, + 0x21, 0xc2, 0xc6, 0x98, 0x20, 0xc2, 0xc6, 0xc0, 0x1f, 0xc2, 0xc6, 0xe8, + 0x1e, 0xc2, 0xc7, 0x10, 0x1d, 0x42, 0xc7, 0x38, 0x26, 0xc2, 0xc7, 0x60, + 0x25, 0xc2, 0xc7, 0x88, 0x24, 0xc2, 0xc7, 0xb0, 0x23, 0xc2, 0xc7, 0xd8, + 0x22, 0xc2, 0xc8, 0x00, 0x21, 0xc2, 0xc8, 0x28, 0x20, 0xc2, 0xc8, 0x50, + 0x1f, 0xc2, 0xc8, 0x78, 0x1e, 0xc2, 0xc8, 0xa0, 0x1d, 0x42, 0xc8, 0xc8, + 0xc4, 0x18, 0x10, 0x0b, 0x56, 0x39, 0xc2, 0x22, 0xcc, 0x0b, 0x56, 0x30, + 0xc3, 0x0d, 0x14, 0x0b, 0x56, 0x29, 0xc3, 0x09, 0x9e, 0x0b, 0x56, 0x20, + 0xc4, 0x02, 0xde, 0x0b, 0x56, 0x19, 0xc2, 0x02, 0xa0, 0x0b, 0x56, 0x10, + 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0xe9, 0x83, 0x0b, 0x55, 0xa8, 0xc2, 0x00, + 0xd0, 0x0b, 0x55, 0xe1, 0x83, 0x0b, 0x55, 0x88, 0x83, 0x0b, 0x55, 0xd9, + 0xc7, 0xb4, 0x2f, 0x0b, 0x54, 0x80, 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0xc9, + 0xc2, 0x0d, 0xf6, 0x0b, 0x55, 0xb1, 0x83, 0x0b, 0x55, 0x80, 0x16, 0xc2, + 0xc8, 0xec, 0x83, 0x0b, 0x55, 0x68, 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0xb9, + 0x83, 0x0b, 0x55, 0x10, 0x0a, 0xc2, 0xc8, 0xf6, 0x83, 0x0b, 0x55, 0x20, + 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0x99, 0x83, 0x0b, 0x55, 0x61, 0xc2, 0x19, + 0x2c, 0x0b, 0x55, 0x41, 0xc2, 0x01, 0x30, 0x0b, 0x55, 0x18, 0x83, 0x0b, + 0x55, 0x71, 0xc7, 0xc6, 0xda, 0x0b, 0x54, 0x88, 0x83, 0x0b, 0x55, 0x59, + 0x9a, 0x0b, 0x54, 0xf9, 0x93, 0x0b, 0x54, 0xf1, 0x85, 0x0b, 0x54, 0xe9, + 0x9c, 0x0b, 0x54, 0xe0, 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0x49, 0x83, 0x0b, + 0x55, 0x38, 0xc2, 0x00, 0xd0, 0x0b, 0x55, 0x09, 0x83, 0x0b, 0x55, 0x00, + 0x0b, 0xc2, 0xc9, 0x00, 0x07, 0xc2, 0xc9, 0x14, 0x9a, 0x0b, 0x54, 0x39, + 0x93, 0x0b, 0x54, 0x31, 0x85, 0x0b, 0x54, 0x29, 0x9c, 0x0b, 0x54, 0x20, + 0x19, 0xc2, 0xc9, 0x24, 0x9a, 0x0b, 0x53, 0xb9, 0x93, 0x0b, 0x53, 0xb1, + 0x85, 0x0b, 0x53, 0xa9, 0x9c, 0x0b, 0x53, 0xa0, 0x9a, 0x0b, 0x54, 0x19, + 0x93, 0x0b, 0x54, 0x11, 0x85, 0x0b, 0x54, 0x09, 0x9c, 0x0b, 0x54, 0x00, + 0x9a, 0x0b, 0x53, 0xf9, 0x93, 0x0b, 0x53, 0xf1, 0x85, 0x0b, 0x53, 0xe9, + 0x9c, 0x0b, 0x53, 0xe0, 0x9a, 0x0b, 0x53, 0xd9, 0x93, 0x0b, 0x53, 0xd1, + 0x85, 0x0b, 0x53, 0xc9, 0x9c, 0x0b, 0x53, 0xc0, 0x9a, 0x0b, 0x53, 0x99, + 0x93, 0x0b, 0x53, 0x91, 0x85, 0x0b, 0x53, 0x89, 0x9c, 0x0b, 0x53, 0x80, + 0x03, 0xc2, 0xc9, 0x34, 0xc3, 0x29, 0x78, 0x08, 0xff, 0x19, 0x0b, 0x42, + 0xc9, 0x40, 0xc7, 0xc9, 0x1f, 0x08, 0xff, 0x81, 0xc7, 0xc9, 0xea, 0x08, + 0xfe, 0xe1, 0xc9, 0xb4, 0xa3, 0x08, 0xfe, 0xc8, 0x17, 0xc2, 0xc9, 0x4c, + 0xc4, 0xe2, 
0x47, 0x08, 0xfe, 0xe8, 0x03, 0xc2, 0xc9, 0x58, 0xc2, 0x00, + 0x45, 0x08, 0xfe, 0xf8, 0xc8, 0xbc, 0xaa, 0x08, 0xfe, 0xb9, 0xc7, 0x14, + 0x39, 0x00, 0x5c, 0x10, 0x83, 0x00, 0x5c, 0x31, 0x8b, 0x00, 0x5c, 0x81, + 0x97, 0x00, 0x5c, 0xa0, 0x8b, 0x00, 0x5c, 0x40, 0x97, 0x00, 0x5c, 0x50, + 0x87, 0x00, 0x5c, 0x78, 0x91, 0x00, 0x5c, 0x98, 0xc2, 0x01, 0x30, 0x00, + 0x5c, 0xc9, 0xc2, 0x19, 0x2c, 0x00, 0x5c, 0xf1, 0x10, 0xc2, 0xc9, 0x6a, + 0x83, 0x00, 0x5d, 0x40, 0xc2, 0x01, 0x6f, 0x00, 0x5c, 0xf9, 0x83, 0x00, + 0x5d, 0x20, 0x83, 0x00, 0x5d, 0x81, 0xc2, 0x00, 0x39, 0x00, 0x5d, 0x88, + 0x83, 0x00, 0x5d, 0x91, 0x0e, 0x42, 0xc9, 0x74, 0xc2, 0x00, 0xd0, 0x00, + 0x5d, 0xb1, 0xc2, 0x0d, 0xf6, 0x00, 0x5d, 0xb9, 0x83, 0x00, 0x5d, 0xc0, + 0xc2, 0x02, 0xa0, 0x00, 0x5f, 0x41, 0xc4, 0x02, 0xde, 0x00, 0x5f, 0x48, + 0xc3, 0x09, 0x9e, 0x00, 0x5f, 0x51, 0xc3, 0x0d, 0x14, 0x00, 0x5f, 0x58, + 0xc2, 0x22, 0xcc, 0x00, 0x5f, 0x61, 0xc4, 0x18, 0x10, 0x00, 0x5f, 0x68, + 0xc6, 0xa7, 0x8c, 0x08, 0xfe, 0x71, 0xc9, 0xaf, 0xdb, 0x08, 0xfe, 0x38, + 0x9f, 0x08, 0xfe, 0x91, 0x9e, 0x08, 0xfe, 0x88, 0xc4, 0x9c, 0x07, 0x08, + 0xfe, 0x79, 0xc7, 0xc7, 0x74, 0x08, 0xfe, 0x20, 0x8a, 0x08, 0xfe, 0x61, + 0xc4, 0x1e, 0x1a, 0x08, 0xfe, 0x10, 0xc4, 0x0f, 0x1f, 0x08, 0xfe, 0x59, + 0xc8, 0x1e, 0x16, 0x08, 0xfe, 0x41, 0x0a, 0x42, 0xc9, 0x7e, 0x46, 0xcf, + 0x4d, 0xc2, 0xc9, 0x8a, 0xc8, 0xaf, 0xd2, 0x08, 0xfe, 0x18, 0xc2, 0x00, + 0xd0, 0x08, 0xb4, 0xb9, 0x83, 0x08, 0xb4, 0xb0, 0xc2, 0x00, 0xd0, 0x08, + 0xb4, 0xa9, 0x83, 0x08, 0xb4, 0xa0, 0xc3, 0x71, 0xf0, 0x00, 0xd5, 0x58, + 0xc3, 0x71, 0xf0, 0x00, 0xd5, 0x48, 0xca, 0xa2, 0x2e, 0x00, 0xd3, 0xe1, + 0x46, 0x28, 0xb0, 0x42, 0xc9, 0x92, 0xc4, 0x68, 0x94, 0x00, 0xd2, 0xc0, + 0x83, 0x00, 0xd2, 0xe1, 0x46, 0x30, 0xa0, 0x42, 0xc9, 0x9e, 0xc5, 0x2c, + 0xf5, 0x00, 0xd2, 0xd1, 0xca, 0xa1, 0xc0, 0x00, 0xd2, 0xb8, 0xc5, 0x00, + 0xd4, 0x00, 0xd3, 0x99, 0xc5, 0x05, 0x02, 0x00, 0xd3, 0x60, 0x87, 0x00, + 0xd3, 0x40, 0x87, 0x00, 0xd2, 0x98, 0xc2, 0x00, 0xd0, 0x00, 0xd2, 0x61, + 0xc2, 0x19, 0x2c, 0x00, 0xd1, 0xf9, 0x12, 0xc2, 0xc9, 0xaa, 0xc2, 0x00, + 0x87, 0x00, 0xd1, 0xe1, 0x16, 0xc2, 0xc9, 0xb4, 0xc5, 0x3c, 0xf5, 0x00, + 0xd1, 0x81, 0x05, 0xc2, 0xc9, 0xbe, 0xc2, 0x0d, 0xf6, 0x00, 0xd1, 0x51, + 0x0d, 0x42, 0xc9, 0xc8, 0x83, 0x00, 0xd2, 0x41, 0xc2, 0x0d, 0xf6, 0x00, + 0xd2, 0x39, 0xc2, 0x00, 0xd0, 0x00, 0xd2, 0x30, 0xc2, 0x00, 0xd0, 0x00, + 0xd1, 0xc9, 0x83, 0x00, 0xd1, 0xc0, 0xc2, 0x00, 0xd0, 0x00, 0xd1, 0x99, + 0x83, 0x00, 0xd1, 0x90, 0xc2, 0x00, 0xd0, 0x00, 0xd1, 0x41, 0x83, 0x00, + 0xd1, 0x38, 0xc2, 0x8d, 0x8f, 0x00, 0xd1, 0x11, 0xc2, 0x00, 0xd0, 0x00, + 0xd1, 0x09, 0x83, 0x00, 0xd1, 0x00, 0xc2, 0x00, 0xc1, 0x00, 0xd1, 0x89, + 0xc2, 0x01, 0x6f, 0x00, 0xd1, 0x68, 0x83, 0x05, 0x55, 0xc8, 0xc2, 0x01, + 0x23, 0x05, 0x54, 0xf9, 0x91, 0x05, 0x54, 0xe8, 0x91, 0x05, 0x54, 0xc9, + 0xc2, 0x0f, 0x7b, 0x05, 0x54, 0x49, 0xc2, 0x42, 0xcd, 0x05, 0x54, 0x88, + 0xc2, 0x01, 0x23, 0x05, 0x54, 0xb9, 0x91, 0x05, 0x54, 0xa8, 0x91, 0x05, + 0x54, 0x59, 0xc2, 0x01, 0x23, 0x05, 0x54, 0x68, 0x0a, 0xc2, 0xc9, 0xd8, + 0x91, 0x05, 0x54, 0x08, 0xc2, 0x01, 0x23, 0x05, 0x54, 0xf1, 0x91, 0x05, + 0x54, 0xe0, 0x91, 0x05, 0x54, 0xc1, 0xc2, 0x0f, 0x7b, 0x05, 0x54, 0x41, + 0xc2, 0x42, 0xcd, 0x05, 0x54, 0x80, 0xc2, 0x01, 0x23, 0x05, 0x54, 0xb1, + 0x91, 0x05, 0x54, 0xa0, 0xc2, 0x01, 0x23, 0x05, 0x54, 0x61, 0x91, 0x05, + 0x54, 0x50, 0x0a, 0xc2, 0xc9, 0xe2, 0x91, 0x05, 0x54, 0x00, 0xd5, 0x03, + 0xd2, 0x01, 0x5c, 0xd1, 0xc9, 0x03, 0xde, 0x01, 0x3d, 0x10, 0xc2, 0x10, + 0x37, 0x00, 0x3c, 0xd8, 0xc4, 0xd9, 0x21, 0x00, 0x3c, 0xf9, 0xc6, 0xb4, + 0xdc, 0x00, 
0x3c, 0x88, 0xc4, 0xe2, 0xd7, 0x00, 0x3c, 0xe9, 0xc7, 0xb4, + 0xdb, 0x00, 0x3c, 0x08, 0xc6, 0xb4, 0xdc, 0x00, 0x3c, 0x91, 0x83, 0x00, + 0x3c, 0xe0, 0xc5, 0xd9, 0xd4, 0x00, 0x70, 0x09, 0x42, 0x01, 0x23, 0x42, + 0xc9, 0xec, 0xc6, 0xcf, 0x47, 0x00, 0x70, 0x39, 0x43, 0xcf, 0x48, 0xc2, + 0xc9, 0xf6, 0xc7, 0xc8, 0x38, 0x00, 0x72, 0x68, 0xc2, 0x00, 0xd1, 0x00, + 0x70, 0x43, 0x02, 0xca, 0x00, 0xc3, 0x00, 0x74, 0x00, 0x70, 0x49, 0xc2, + 0x49, 0x0c, 0x00, 0x70, 0x60, 0x42, 0x01, 0x7c, 0xc2, 0xca, 0x04, 0x44, + 0x14, 0x3d, 0x42, 0xca, 0x0e, 0x43, 0xe6, 0x14, 0xc2, 0xca, 0x2b, 0xc7, + 0xca, 0x68, 0x00, 0x72, 0x70, 0xc5, 0xdc, 0x90, 0x00, 0x70, 0x71, 0xc3, + 0x13, 0x4b, 0x00, 0x70, 0xa0, 0x42, 0x01, 0x7c, 0xc2, 0xca, 0x37, 0x0a, + 0x42, 0xca, 0x43, 0xc5, 0xd9, 0xc5, 0x00, 0x70, 0xd9, 0x0a, 0xc2, 0xca, + 0x4f, 0xc8, 0xb8, 0x7a, 0x00, 0x71, 0x78, 0xc3, 0x05, 0xad, 0x00, 0x70, + 0xeb, 0x02, 0xca, 0x5b, 0xc5, 0xd9, 0x7f, 0x00, 0x72, 0x78, 0xc4, 0x42, + 0x6d, 0x00, 0x71, 0x09, 0x42, 0x02, 0xfa, 0x42, 0xca, 0x5f, 0xc5, 0xd9, + 0xc0, 0x00, 0x71, 0x19, 0x97, 0x00, 0x71, 0x20, 0x42, 0x01, 0x7c, 0xc2, + 0xca, 0x6f, 0x97, 0x00, 0x71, 0x31, 0xca, 0xa4, 0xd6, 0x00, 0x72, 0x28, + 0xc3, 0x00, 0x7d, 0x00, 0x71, 0x59, 0xc6, 0xcc, 0x35, 0x00, 0x71, 0x70, + 0xc2, 0x10, 0x11, 0x0f, 0x15, 0x61, 0x87, 0x0f, 0x15, 0x3b, 0x02, 0xca, + 0x7b, 0x8b, 0x0f, 0x15, 0x12, 0x02, 0xca, 0x7f, 0xc6, 0x7b, 0xb6, 0x0e, + 0x98, 0xf1, 0xc3, 0x05, 0xaf, 0x0e, 0x98, 0xa9, 0xc7, 0xc5, 0x1a, 0x0e, + 0x98, 0x58, 0xc5, 0xdb, 0x55, 0x0e, 0x99, 0x61, 0xc6, 0xd0, 0x5b, 0x0e, + 0x98, 0xd8, 0xca, 0xa1, 0xfc, 0x0f, 0xab, 0xe0, 0xd1, 0x50, 0x9b, 0x00, + 0x60, 0x01, 0xce, 0x29, 0x32, 0x00, 0x60, 0x20, 0x83, 0x00, 0x60, 0x31, + 0x8b, 0x00, 0x60, 0x81, 0x97, 0x00, 0x60, 0xa0, 0x8b, 0x00, 0x60, 0x40, + 0x97, 0x00, 0x60, 0x50, 0x47, 0xb2, 0x2e, 0xc2, 0xca, 0x83, 0x83, 0x00, + 0x61, 0xa8, 0x87, 0x00, 0x60, 0x78, 0x91, 0x00, 0x60, 0x98, 0x83, 0x00, + 0x60, 0xa9, 0xc2, 0x00, 0xd0, 0x00, 0x60, 0xb0, 0x83, 0x00, 0x60, 0xb9, + 0xc2, 0x00, 0xd0, 0x00, 0x60, 0xc0, 0xc2, 0x01, 0x30, 0x00, 0x60, 0xc9, + 0xc2, 0x19, 0x2c, 0x00, 0x60, 0xf1, 0xc2, 0x00, 0xc1, 0x00, 0x61, 0x19, + 0x83, 0x00, 0x61, 0x42, 0x02, 0xca, 0x91, 0x83, 0x00, 0x60, 0xd1, 0xc2, + 0x00, 0xd0, 0x00, 0x60, 0xd8, 0x83, 0x00, 0x60, 0xe1, 0xc2, 0x00, 0xd0, + 0x00, 0x60, 0xe8, 0x16, 0xc2, 0xca, 0x97, 0x83, 0x00, 0x61, 0x21, 0xc2, + 0x00, 0xd0, 0x00, 0x61, 0x29, 0xc2, 0x0d, 0xf6, 0x00, 0x62, 0xc0, 0x06, + 0xc2, 0xca, 0xa1, 0x83, 0x00, 0x61, 0x31, 0xc2, 0x00, 0xd0, 0x00, 0x61, + 0x39, 0xc2, 0x02, 0x1c, 0x00, 0x62, 0xc8, 0x83, 0x00, 0x61, 0x51, 0xc2, + 0x00, 0xd0, 0x00, 0x61, 0x58, 0x83, 0x00, 0x61, 0x61, 0xc2, 0x00, 0xd0, + 0x00, 0x61, 0x68, 0x83, 0x00, 0x61, 0x81, 0x14, 0x42, 0xca, 0xab, 0x83, + 0x00, 0x61, 0x91, 0x0e, 0x42, 0xca, 0xb5, 0xc2, 0x00, 0xd0, 0x00, 0x61, + 0xb1, 0xc2, 0x0d, 0xf6, 0x00, 0x61, 0xb9, 0x83, 0x00, 0x61, 0xc0, 0x94, + 0x00, 0x62, 0x20, 0x8e, 0x00, 0x63, 0x18, 0xd2, 0x15, 0xf0, 0x00, 0x63, + 0xd1, 0xd3, 0x45, 0xbf, 0x00, 0x63, 0xe8, 0xd2, 0x15, 0xf0, 0x00, 0x63, + 0xd9, 0xd3, 0x45, 0xbf, 0x00, 0x63, 0xf0, 0xd0, 0x03, 0xb7, 0x01, 0x4b, + 0x91, 0xcf, 0x09, 0xf8, 0x01, 0x5a, 0x48, 0xcb, 0x93, 0x9e, 0x01, 0x53, + 0x59, 0xc9, 0x16, 0x14, 0x01, 0x53, 0x50, 0x8e, 0x08, 0xa5, 0xc0, 0x94, + 0x08, 0xa5, 0xb0, 0x8e, 0x08, 0xa4, 0x4b, 0x02, 0xca, 0xbf, 0x94, 0x08, + 0xa4, 0x3a, 0x02, 0xca, 0xc3, 0xc2, 0x00, 0xd0, 0x08, 0xa4, 0xe1, 0x83, + 0x08, 0xa4, 0xd8, 0xc2, 0x00, 0xd0, 0x08, 0xa4, 0xd1, 0x83, 0x08, 0xa4, + 0xc8, 0xca, 0xa5, 0xc6, 0x00, 0x7e, 0x38, 0xc9, 0xb3, 0x17, 0x00, 0x7e, + 0x31, 0xc6, 
0xcf, 0x83, 0x00, 0x7e, 0x40, 0x00, 0x42, 0xca, 0xc7, 0x45, + 0xda, 0xbf, 0xc2, 0xca, 0xd9, 0x44, 0xe3, 0xef, 0x42, 0xca, 0xe3, 0x83, + 0x00, 0x7c, 0x81, 0xc2, 0x00, 0xd0, 0x00, 0x7c, 0x89, 0xc3, 0x1d, 0x35, + 0x00, 0x7d, 0xc8, 0x83, 0x00, 0x7c, 0x91, 0xc2, 0x00, 0xd0, 0x00, 0x7c, + 0x98, 0xc2, 0x01, 0x30, 0x00, 0x7c, 0xa1, 0xc2, 0x19, 0x2c, 0x00, 0x7c, + 0xc9, 0xc2, 0x00, 0xc1, 0x00, 0x7c, 0xf1, 0x83, 0x00, 0x7d, 0x18, 0x83, + 0x00, 0x7c, 0xa9, 0xc2, 0x00, 0xd0, 0x00, 0x7c, 0xb0, 0x16, 0xc2, 0xca, + 0xed, 0x83, 0x00, 0x7c, 0xf9, 0xc2, 0x00, 0xd0, 0x00, 0x7d, 0x01, 0x15, + 0x42, 0xca, 0xf7, 0x06, 0xc2, 0xcb, 0x01, 0x83, 0x00, 0x7d, 0x09, 0xc2, + 0x00, 0xd0, 0x00, 0x7d, 0x11, 0x1c, 0x42, 0xcb, 0x0b, 0x83, 0x00, 0x7d, + 0x21, 0xc2, 0x00, 0xd0, 0x00, 0x7d, 0x28, 0x83, 0x00, 0x7d, 0x31, 0xc2, + 0x00, 0xd0, 0x00, 0x7d, 0x38, 0xc2, 0x00, 0xd0, 0x00, 0x7d, 0x71, 0x83, + 0x00, 0x7d, 0x78, 0xc2, 0x00, 0xd0, 0x00, 0x7d, 0xa1, 0xc2, 0x0d, 0xf6, + 0x00, 0x7d, 0xa9, 0x83, 0x00, 0x7d, 0xb0, 0xc2, 0x01, 0x4a, 0x00, 0x7d, + 0xd1, 0xc2, 0x19, 0x2c, 0x00, 0x7d, 0xd9, 0xc2, 0x00, 0x39, 0x00, 0x7d, + 0xe0, 0xcb, 0x90, 0x0d, 0x00, 0x78, 0x09, 0x44, 0xe3, 0xbf, 0x42, 0xcb, + 0x15, 0xcb, 0x98, 0xfd, 0x00, 0x78, 0x99, 0xcc, 0x7c, 0xc3, 0x00, 0x79, + 0xb0, 0xca, 0x9c, 0x52, 0x00, 0x78, 0x49, 0xd4, 0x39, 0x44, 0x00, 0x7e, + 0x80, 0xc5, 0x01, 0xe1, 0x00, 0x78, 0x80, 0x83, 0x00, 0x7a, 0x51, 0xc2, + 0x00, 0xd0, 0x00, 0x7a, 0x58, 0x83, 0x00, 0x7a, 0xc9, 0xc2, 0x00, 0xd0, + 0x00, 0x7a, 0xd0, 0x83, 0x00, 0x7a, 0x61, 0xc2, 0x00, 0xd0, 0x00, 0x7a, + 0x68, 0x83, 0x00, 0x7a, 0xd9, 0xc2, 0x00, 0xd0, 0x00, 0x7a, 0xe0, 0x8a, + 0x01, 0x69, 0x90, 0x8a, 0x01, 0x6a, 0xb2, 0x02, 0xcb, 0x21, 0x8a, 0x01, + 0x69, 0xc1, 0x86, 0x01, 0x69, 0xca, 0x02, 0xcb, 0x25, 0x8a, 0x01, 0x6a, + 0x2a, 0x02, 0xcb, 0x29, 0x8a, 0x01, 0x6a, 0x18, 0x8a, 0x01, 0x6a, 0x51, + 0x9c, 0x01, 0x6b, 0x28, 0x94, 0x01, 0x6a, 0xa8, 0x95, 0x01, 0x6a, 0xd1, + 0x8a, 0x01, 0x6a, 0xd8, 0x8a, 0x01, 0x6a, 0xe9, 0x96, 0x01, 0x6a, 0xf8, + 0x8a, 0x01, 0x6a, 0x30, 0x90, 0x01, 0x6a, 0x81, 0x8a, 0x01, 0x6a, 0xb8, + 0x49, 0x19, 0x61, 0xc2, 0xcb, 0x2d, 0xce, 0x6f, 0xd2, 0x07, 0xef, 0xd8, + 0x48, 0x19, 0x6b, 0xc2, 0xcb, 0x45, 0x48, 0xab, 0xf5, 0x42, 0xcb, 0x5d, + 0x0a, 0xc2, 0xcb, 0x7b, 0x49, 0xb2, 0x6c, 0xc2, 0xcb, 0x87, 0x03, 0xc2, + 0xcb, 0xaf, 0xd4, 0x39, 0x6c, 0x07, 0xef, 0xf0, 0x44, 0x2b, 0xb9, 0xc2, + 0xcb, 0xb9, 0x45, 0x19, 0x60, 0xc2, 0xcb, 0xc5, 0x46, 0x30, 0xc1, 0xc2, + 0xcb, 0xcf, 0x4d, 0x06, 0x5a, 0x42, 0xcb, 0xdb, 0x48, 0x92, 0x78, 0xc2, + 0xcb, 0xe7, 0x0e, 0xc2, 0xcb, 0xff, 0xd2, 0x4b, 0x29, 0x07, 0xef, 0x99, + 0xcb, 0x90, 0x65, 0x07, 0xef, 0xf8, 0x03, 0xc2, 0xcc, 0x11, 0x0a, 0xc2, + 0xcc, 0x1d, 0x48, 0xab, 0xf5, 0x42, 0xcc, 0x29, 0x0a, 0xc2, 0xcc, 0x5d, + 0x45, 0x19, 0x60, 0xc2, 0xcc, 0x67, 0x44, 0x2b, 0xb9, 0xc2, 0xcc, 0x7d, + 0x4d, 0x06, 0x5a, 0xc2, 0xcc, 0x89, 0x46, 0x50, 0xf0, 0xc2, 0xcc, 0x95, + 0x45, 0x30, 0xc1, 0xc2, 0xcc, 0xa1, 0xce, 0x72, 0xf0, 0x07, 0xe4, 0x89, + 0xcf, 0x69, 0x81, 0x07, 0xe4, 0x91, 0xcf, 0x60, 0x8a, 0x07, 0xe4, 0xa0, + 0x0a, 0xc2, 0xcc, 0xab, 0x44, 0x2b, 0xb9, 0xc2, 0xcc, 0xb7, 0x4d, 0x06, + 0x5a, 0xc2, 0xcc, 0xc3, 0x45, 0x19, 0x60, 0xc2, 0xcc, 0xcf, 0x46, 0x50, + 0xf0, 0xc2, 0xcc, 0xe5, 0x45, 0x30, 0xc1, 0xc2, 0xcc, 0xf1, 0xce, 0x72, + 0xf0, 0x07, 0xe4, 0x51, 0xcf, 0x69, 0x81, 0x07, 0xe4, 0x59, 0xcf, 0x60, + 0x8a, 0x07, 0xe4, 0x68, 0x48, 0x0f, 0x9b, 0xc2, 0xcc, 0xfb, 0x49, 0x19, + 0x6a, 0x42, 0xcd, 0x25, 0x44, 0x2b, 0xb9, 0xc2, 0xcd, 0x43, 0x45, 0x06, + 0x5a, 0xc2, 0xcd, 0x4f, 0x45, 0x19, 0x60, 0xc2, 0xcd, 0x67, 0x45, 0x50, + 0xf0, 0xc2, 
0xcd, 0x7d, 0x0a, 0xc2, 0xcd, 0x87, 0x45, 0x30, 0xc1, 0x42, + 0xcd, 0x93, 0x03, 0xc2, 0xcd, 0x9d, 0xcd, 0x7e, 0x55, 0x07, 0xea, 0x58, + 0x44, 0x2b, 0xb9, 0xc2, 0xcd, 0xa9, 0x4d, 0x06, 0x5a, 0xc2, 0xcd, 0xb5, + 0x45, 0x19, 0x60, 0xc2, 0xcd, 0xc1, 0x45, 0x50, 0xf0, 0xc2, 0xcd, 0xcb, + 0x45, 0x50, 0xf1, 0xc2, 0xcd, 0xd5, 0x46, 0x30, 0xc1, 0x42, 0xcd, 0xe1, + 0x48, 0xab, 0xf5, 0xc2, 0xcd, 0xed, 0xdc, 0x12, 0xa9, 0x07, 0xef, 0xe8, + 0x46, 0x2b, 0xba, 0xc2, 0xce, 0x21, 0x03, 0x42, 0xce, 0x27, 0x49, 0x19, + 0x61, 0xc2, 0xce, 0x3c, 0xd5, 0x38, 0x2a, 0x07, 0xef, 0xa0, 0x0b, 0xc2, + 0xce, 0x60, 0xcb, 0x64, 0x7b, 0x07, 0xe9, 0xd8, 0x46, 0x50, 0x13, 0xc2, + 0xce, 0x6c, 0x45, 0x50, 0xf0, 0xc2, 0xce, 0x78, 0x44, 0x19, 0x6a, 0xc2, + 0xce, 0x82, 0x46, 0x30, 0xc1, 0xc2, 0xce, 0x8c, 0x44, 0x72, 0xf0, 0xc2, + 0xce, 0x98, 0x4d, 0x06, 0x5a, 0xc2, 0xce, 0xa4, 0x44, 0x2b, 0xb9, 0x42, + 0xce, 0xb0, 0x60, 0x08, 0x07, 0x42, 0xce, 0xbc, 0xc5, 0x05, 0x02, 0x00, + 0x47, 0xc9, 0xc5, 0x00, 0xd4, 0x00, 0x47, 0xb8, 0x08, 0xc2, 0xce, 0xc6, + 0x09, 0xc2, 0xce, 0xd8, 0x0e, 0xc2, 0xce, 0xf9, 0x42, 0x1c, 0x52, 0xc2, + 0xcf, 0x08, 0x03, 0xc2, 0xcf, 0x18, 0x0d, 0xc2, 0xcf, 0x34, 0x16, 0xc2, + 0xcf, 0x50, 0xc3, 0xd5, 0x25, 0x00, 0x33, 0xf3, 0x02, 0xcf, 0x78, 0x1b, + 0xc2, 0xcf, 0x85, 0x14, 0xc2, 0xcf, 0x95, 0x42, 0x00, 0x51, 0xc2, 0xcf, + 0xb6, 0x97, 0x00, 0x36, 0x3b, 0x02, 0xcf, 0xc6, 0xc3, 0x0f, 0x9a, 0x00, + 0x32, 0x13, 0x02, 0xcf, 0xd0, 0x87, 0x00, 0x36, 0x83, 0x02, 0xcf, 0xd4, + 0x42, 0x02, 0x1c, 0xc2, 0xcf, 0xd8, 0x15, 0xc2, 0xcf, 0xe8, 0x06, 0xc2, + 0xd0, 0x15, 0xc2, 0x00, 0x5f, 0x00, 0x36, 0x5b, 0x02, 0xd0, 0x37, 0xc3, + 0x7e, 0x89, 0x00, 0x32, 0x43, 0x02, 0xd0, 0x42, 0x0f, 0xc2, 0xd0, 0x46, + 0xc2, 0x49, 0x0c, 0x00, 0x36, 0x33, 0x02, 0xd0, 0x55, 0x10, 0xc2, 0xd0, + 0x59, 0x0a, 0x42, 0xd0, 0x72, 0xd3, 0x43, 0xd1, 0x00, 0x46, 0x91, 0xc5, + 0x05, 0x02, 0x00, 0x46, 0x79, 0xc5, 0x00, 0xd4, 0x00, 0x46, 0x70, 0x11, + 0xc2, 0xd0, 0x88, 0x03, 0x42, 0xd0, 0x94, 0xc3, 0x00, 0x49, 0x0f, 0x70, + 0x01, 0xc2, 0x00, 0x74, 0x0f, 0x70, 0x78, 0xc2, 0x00, 0x74, 0x0f, 0x70, + 0x31, 0x8a, 0x0f, 0x70, 0xd0, 0x03, 0xc2, 0xd0, 0x9c, 0xc2, 0x16, 0x1c, + 0x0f, 0x70, 0xa9, 0x0a, 0x42, 0xd0, 0xa6, 0xc2, 0x0f, 0x9b, 0x0f, 0x70, + 0x51, 0xc3, 0x14, 0x4b, 0x0f, 0x70, 0xb8, 0xc2, 0x00, 0xc2, 0x0f, 0x70, + 0x59, 0x46, 0xce, 0x45, 0x42, 0xd0, 0xb2, 0xc3, 0x03, 0x26, 0x0f, 0x70, + 0x71, 0xc4, 0xdf, 0x93, 0x0f, 0x70, 0xa1, 0x49, 0x9f, 0xe0, 0xc2, 0xd1, + 0x16, 0xc2, 0x01, 0x9d, 0x0f, 0x70, 0x88, 0xc3, 0x85, 0xf5, 0x0f, 0x71, + 0x09, 0xc4, 0x30, 0xc1, 0x0f, 0x71, 0x11, 0x0a, 0xc2, 0xd1, 0x66, 0xc3, + 0x2b, 0xb9, 0x0f, 0x71, 0x49, 0x0d, 0xc2, 0xd1, 0x72, 0xc3, 0x0d, 0xff, + 0x0f, 0x71, 0x59, 0xc4, 0x19, 0x60, 0x0f, 0x71, 0x61, 0xc4, 0x3a, 0x01, + 0x0f, 0x71, 0x69, 0x15, 0xc2, 0xd1, 0x7e, 0xc3, 0x03, 0x0c, 0x0f, 0x71, + 0x79, 0xc3, 0xb1, 0x0d, 0x0f, 0x71, 0x81, 0xc3, 0x0f, 0x9a, 0x0f, 0x71, + 0x91, 0x16, 0xc2, 0xd1, 0x90, 0xc3, 0xb2, 0x00, 0x0f, 0x71, 0xc9, 0xc5, + 0x92, 0x75, 0x0f, 0x71, 0xd8, 0xda, 0x19, 0x60, 0x0f, 0x77, 0x81, 0xcc, + 0x88, 0x95, 0x0f, 0x77, 0x88, 0x00, 0xc2, 0xd1, 0x9c, 0xc3, 0x13, 0x00, + 0x00, 0x32, 0x62, 0x02, 0xd1, 0xae, 0xc9, 0x33, 0xad, 0x00, 0x47, 0xe0, + 0xc9, 0x33, 0xad, 0x00, 0x47, 0xe8, 0x45, 0x00, 0x8c, 0xc2, 0xd1, 0xb4, + 0xcd, 0x00, 0xfa, 0x07, 0xf3, 0xb1, 0xcb, 0x64, 0x7b, 0x07, 0xf3, 0xb8, + 0xce, 0x00, 0xf9, 0x07, 0xf3, 0x80, 0x19, 0xc2, 0xd1, 0xc6, 0x15, 0xc2, + 0xd1, 0xd2, 0x08, 0xc2, 0xd1, 0xe4, 0xc4, 0x3a, 0x01, 0x00, 0x37, 0x43, + 0x02, 0xd1, 0xf0, 0xc3, 0x0f, 0x9a, 0x00, 0x46, 0xb9, 0xc3, 0x03, 0x0c, + 0x00, 0x46, 
0xb1, 0x42, 0x02, 0x1c, 0xc2, 0xd1, 0xf6, 0xc3, 0x2b, 0xb9, + 0x00, 0x37, 0x3b, 0x02, 0xd2, 0x00, 0x0f, 0xc2, 0xd2, 0x06, 0xd4, 0x3c, + 0x3c, 0x00, 0x37, 0x09, 0xd8, 0x21, 0x53, 0x00, 0x37, 0x01, 0xcc, 0x8c, + 0x91, 0x00, 0x36, 0xf9, 0x16, 0xc2, 0xd2, 0x12, 0xc4, 0x30, 0xc1, 0x00, + 0x36, 0xd1, 0x0e, 0x42, 0xd2, 0x1e, 0xcf, 0x60, 0x7b, 0x00, 0x46, 0xc9, + 0x19, 0xc2, 0xd2, 0x2a, 0xc4, 0x19, 0x60, 0x00, 0x37, 0x69, 0xc4, 0xdf, + 0x6b, 0x00, 0x37, 0x18, 0xc7, 0xbf, 0x83, 0x00, 0x46, 0x49, 0xc3, 0x00, + 0xcf, 0x00, 0x30, 0xc0, 0x00, 0x42, 0xd2, 0x36, 0xc5, 0x05, 0x02, 0x07, + 0xde, 0x09, 0xc5, 0x00, 0xd4, 0x07, 0xde, 0x00, 0x48, 0x04, 0xe7, 0xc2, + 0xd2, 0x48, 0x4a, 0x0e, 0x7d, 0x42, 0xd2, 0x5a, 0xd7, 0x2b, 0x0c, 0x07, + 0xdd, 0xe1, 0x42, 0x00, 0x30, 0x42, 0xd2, 0x6c, 0xc5, 0x05, 0x02, 0x07, + 0xdd, 0xd9, 0xc5, 0x00, 0xd4, 0x07, 0xdd, 0xd0, 0x46, 0xd1, 0x0f, 0xc2, + 0xd2, 0x78, 0x03, 0x42, 0xd2, 0x84, 0xcf, 0x63, 0xf0, 0x00, 0x30, 0x99, + 0xd0, 0x5a, 0x82, 0x00, 0x30, 0x90, 0xcd, 0x00, 0xfa, 0x07, 0xf3, 0xe1, + 0xcb, 0x64, 0x7b, 0x07, 0xf3, 0xe8, 0xc5, 0x05, 0x02, 0x00, 0x47, 0x79, + 0xc5, 0x00, 0xd4, 0x00, 0x47, 0x60, 0xc5, 0x05, 0x02, 0x00, 0x47, 0x71, + 0xc5, 0x00, 0xd4, 0x00, 0x47, 0x58, 0xc5, 0x05, 0x02, 0x00, 0x47, 0x69, + 0xc5, 0x00, 0xd4, 0x00, 0x47, 0x50, 0x46, 0x00, 0x8b, 0x42, 0xd2, 0x9c, + 0xc3, 0x13, 0x00, 0x00, 0x47, 0x48, 0xc3, 0x13, 0x00, 0x00, 0x47, 0x40, + 0xc3, 0x13, 0x00, 0x00, 0x47, 0x38, 0x83, 0x00, 0x2b, 0xc9, 0xc2, 0x16, + 0x1c, 0x00, 0x2b, 0x98, 0x83, 0x00, 0x2a, 0x49, 0xc2, 0x16, 0x1c, 0x00, + 0x2a, 0x18, 0x9f, 0x0f, 0xbb, 0x31, 0xa0, 0x0f, 0xbb, 0x39, 0xa1, 0x0f, + 0xbb, 0x41, 0xa2, 0x0f, 0xbb, 0x48, 0xc2, 0xe5, 0xfd, 0x0f, 0xb9, 0x20, + 0xa1, 0x0f, 0xb9, 0xa9, 0x9f, 0x0f, 0xb9, 0x99, 0xa0, 0x0f, 0xb9, 0xa0, + 0xc8, 0x8c, 0x89, 0x0f, 0xb9, 0x83, 0x02, 0xd2, 0xae, 0xc4, 0x1a, 0x05, + 0x0f, 0xb8, 0xf8, 0x9f, 0x0f, 0xb8, 0x59, 0xa0, 0x0f, 0xb8, 0x60, 0x48, + 0xba, 0x6a, 0xc2, 0xd2, 0xb4, 0xc8, 0x8c, 0x89, 0x0f, 0xb9, 0x61, 0xc6, + 0x4c, 0x49, 0x0f, 0xb9, 0x10, 0xc8, 0x8c, 0x89, 0x0f, 0xb9, 0x69, 0xd2, + 0x4c, 0x49, 0x0f, 0xb9, 0x30, 0xc2, 0xe5, 0xfd, 0x0f, 0xb8, 0x48, 0xc2, + 0xe5, 0xfd, 0x0f, 0xb8, 0x38, 0x84, 0x0a, 0x21, 0xa1, 0x83, 0x0a, 0x21, + 0x98, 0x83, 0x0a, 0x21, 0x88, 0x83, 0x0a, 0x21, 0x60, 0x83, 0x0a, 0x21, + 0x48, 0x83, 0x0a, 0x20, 0xd8, 0x83, 0x0a, 0x20, 0x50, 0x83, 0x0a, 0x22, + 0x49, 0x84, 0x0a, 0x22, 0x51, 0x85, 0x0a, 0x22, 0x58, 0x83, 0x0a, 0x23, + 0x58, 0x83, 0x0a, 0x23, 0x68, 0x83, 0x0a, 0x23, 0x80, 0x83, 0x0a, 0x23, + 0x90, 0x83, 0x0a, 0x23, 0xa0, 0x83, 0x0a, 0x23, 0xb9, 0x84, 0x0a, 0x23, + 0xc1, 0x85, 0x0a, 0x23, 0xc8, 0x83, 0x0a, 0x23, 0xd9, 0x84, 0x0a, 0x23, + 0xe0, 0x83, 0x0a, 0x23, 0xf9, 0x84, 0x0a, 0x24, 0x01, 0x85, 0x0a, 0x24, + 0x08, 0x83, 0x0a, 0x24, 0x29, 0x84, 0x0a, 0x24, 0x30, 0x83, 0x0a, 0x24, + 0x60, 0x83, 0x0a, 0x24, 0xb8, 0x83, 0x0a, 0x25, 0x10, 0x83, 0x0a, 0x27, + 0x31, 0x84, 0x0a, 0x27, 0x38, 0x83, 0x0a, 0x27, 0x68, 0x83, 0x0a, 0x27, + 0x80, 0x83, 0x0a, 0x27, 0xb8, 0x83, 0x0a, 0x27, 0xc8, 0x83, 0x0a, 0x28, + 0x28, 0x83, 0x0a, 0x29, 0x70, 0x83, 0x0a, 0x2a, 0x28, 0x83, 0x0a, 0x2a, + 0x58, 0x83, 0x0a, 0x2a, 0x88, 0x83, 0x0a, 0x2a, 0xe0, 0x83, 0x0a, 0x2b, + 0x88, 0x83, 0x0a, 0x2b, 0xa1, 0x84, 0x0a, 0x2b, 0xa9, 0x85, 0x0a, 0x2b, + 0xb0, 0x83, 0x0a, 0x2b, 0xd9, 0x84, 0x0a, 0x2b, 0xe1, 0x85, 0x0a, 0x2b, + 0xe8, 0x83, 0x0a, 0x2c, 0xa8, 0x83, 0x0a, 0x2c, 0xd8, 0x83, 0x0a, 0x2d, + 0x00, 0x83, 0x0a, 0x2d, 0x20, 0x83, 0x0a, 0x2d, 0x78, 0xc9, 0xae, 0xa0, + 0x0a, 0x2d, 0x89, 0x83, 0x0a, 0x2d, 0x90, 0x83, 0x0a, 0x2d, 0xb0, 0xd4, + 0x3f, 0x0c, 
0x0a, 0x2e, 0x71, 0xd3, 0x44, 0xc8, 0x0a, 0x2e, 0x78, 0x83, + 0x0a, 0x2f, 0xc0, 0x83, 0x0a, 0x30, 0x00, 0xc4, 0x0d, 0xe4, 0x01, 0x1b, + 0x01, 0xc5, 0x02, 0xd2, 0x01, 0x19, 0xe0, 0x43, 0x01, 0x47, 0xc2, 0xd2, + 0xc0, 0xc2, 0x05, 0x03, 0x01, 0x1a, 0xa3, 0x02, 0xd2, 0xcc, 0x0b, 0x42, + 0xd2, 0xd2, 0xc6, 0xcd, 0xc1, 0x01, 0x1a, 0x99, 0xcb, 0x03, 0xbc, 0x01, + 0x1a, 0x80, 0xcd, 0x09, 0xfa, 0x01, 0x1a, 0x39, 0xc7, 0x00, 0xcc, 0x01, + 0x1a, 0x18, 0xc3, 0xba, 0x27, 0x01, 0x1a, 0x71, 0xc8, 0x52, 0x09, 0x01, + 0x1a, 0x50, 0xd0, 0x5b, 0xd2, 0x01, 0x12, 0x90, 0x00, 0x42, 0xd2, 0xde, + 0xc9, 0x57, 0x20, 0x08, 0x09, 0x68, 0xc9, 0x57, 0x20, 0x08, 0x09, 0x60, + 0x00, 0x42, 0xd2, 0xea, 0x00, 0x42, 0xd2, 0xf6, 0xc9, 0x57, 0x20, 0x08, + 0x09, 0x78, 0x00, 0x42, 0xd3, 0x02, 0xc9, 0x57, 0x20, 0x08, 0x09, 0x70, + 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xf1, 0xc8, 0x4b, 0x94, 0x08, 0x09, 0x38, + 0xc9, 0x57, 0x20, 0x08, 0x09, 0x80, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xf9, + 0xc8, 0x4b, 0x94, 0x08, 0x09, 0x40, 0xc9, 0x57, 0x20, 0x08, 0x09, 0x88, + 0xd5, 0x35, 0xf3, 0x0f, 0xdd, 0x78, 0x48, 0x1e, 0x57, 0xc2, 0xd3, 0x0e, + 0x11, 0x42, 0xd3, 0x26, 0x45, 0x02, 0x9a, 0x42, 0xd3, 0x35, 0xd0, 0x5e, + 0x32, 0x01, 0x2b, 0xe0, 0x47, 0x54, 0x42, 0xc2, 0xd3, 0x45, 0x49, 0x45, + 0xd2, 0x42, 0xd3, 0x51, 0x45, 0x02, 0x9a, 0x42, 0xd3, 0x5d, 0xc8, 0x00, + 0x5f, 0x01, 0x28, 0x51, 0xca, 0x01, 0x68, 0x01, 0x28, 0x40, 0xc8, 0x00, + 0x5f, 0x01, 0x28, 0x31, 0xca, 0x01, 0x68, 0x01, 0x28, 0x20, 0xce, 0x72, + 0xaa, 0x01, 0x2a, 0x51, 0xc8, 0x11, 0xff, 0x01, 0x29, 0xd1, 0xca, 0x11, + 0x34, 0x01, 0x29, 0x90, 0xce, 0x73, 0x44, 0x01, 0x29, 0xe9, 0xc8, 0x11, + 0x49, 0x01, 0x29, 0xa9, 0xca, 0x12, 0x12, 0x01, 0x29, 0x68, 0x0e, 0xc2, + 0xd3, 0x6f, 0xca, 0x01, 0x68, 0x01, 0x29, 0xd9, 0xc5, 0x00, 0x2c, 0x01, + 0x28, 0xb8, 0x45, 0x02, 0x9a, 0x42, 0xd3, 0x7b, 0xc8, 0x00, 0x5f, 0x01, + 0x2a, 0x79, 0xca, 0x01, 0x68, 0x01, 0x2a, 0x68, 0xca, 0x01, 0x68, 0x01, + 0x2a, 0x59, 0xc4, 0x00, 0x49, 0x01, 0x29, 0x59, 0xc5, 0x00, 0x2c, 0x01, + 0x29, 0x18, 0x45, 0x02, 0x9a, 0x42, 0xd3, 0x8d, 0xca, 0x01, 0x68, 0x01, + 0x2b, 0x49, 0xc4, 0x00, 0x49, 0x01, 0x2a, 0xe9, 0xc5, 0x00, 0x2c, 0x01, + 0x2a, 0xd0, 0xca, 0x01, 0x68, 0x01, 0x2b, 0x31, 0xc4, 0x00, 0x49, 0x01, + 0x2a, 0xb9, 0xc5, 0x00, 0x2c, 0x01, 0x2a, 0xa0, 0xd1, 0x53, 0x43, 0x01, + 0x2b, 0x29, 0xcb, 0x8d, 0x84, 0x01, 0x2a, 0xb1, 0xcc, 0x89, 0xd9, 0x01, + 0x2a, 0x98, 0xd1, 0x53, 0x32, 0x01, 0x2b, 0x21, 0xcb, 0x8e, 0xce, 0x01, + 0x2a, 0xa9, 0xcc, 0x87, 0xa5, 0x01, 0x2a, 0x90, 0xd3, 0x42, 0x7b, 0x01, + 0x2a, 0x39, 0xd0, 0x32, 0x71, 0x01, 0x29, 0x79, 0x45, 0x00, 0x49, 0xc2, + 0xd3, 0x9f, 0x46, 0x00, 0x2c, 0x42, 0xd3, 0xab, 0xd3, 0x41, 0xaa, 0x01, + 0x2a, 0x09, 0xd0, 0x32, 0x47, 0x01, 0x29, 0x81, 0x45, 0x00, 0x49, 0xc2, + 0xd3, 0xb7, 0x46, 0x00, 0x2c, 0x42, 0xd3, 0xc3, 0xca, 0x11, 0x34, 0x01, + 0x29, 0x51, 0xc5, 0x11, 0x39, 0x01, 0x28, 0xc8, 0xca, 0x11, 0x34, 0x01, + 0x29, 0x11, 0xc5, 0x11, 0x39, 0x01, 0x28, 0xa8, 0xca, 0x12, 0x12, 0x01, + 0x29, 0x31, 0xc5, 0x07, 0xeb, 0x01, 0x28, 0xd0, 0xca, 0x12, 0x12, 0x01, + 0x28, 0xf1, 0xc5, 0x07, 0xeb, 0x01, 0x28, 0xb0, 0xa3, 0x0f, 0xd9, 0xb0, + 0xa2, 0x0f, 0xd8, 0xab, 0x02, 0xd3, 0xcf, 0xa1, 0x0f, 0xd8, 0x73, 0x02, + 0xd3, 0xd3, 0xa3, 0x0f, 0xd9, 0x28, 0xa3, 0x0f, 0xd9, 0x80, 0xa3, 0x0f, + 0xd9, 0x41, 0xa2, 0x0f, 0xd8, 0xca, 0x02, 0xd3, 0xdb, 0xa3, 0x0f, 0xd9, + 0x51, 0xa2, 0x0f, 0xd8, 0xda, 0x02, 0xd3, 0xdf, 0xa3, 0x0f, 0xd9, 0xc8, + 0xa3, 0x0f, 0xd9, 0x59, 0xa2, 0x0f, 0xd8, 0xe2, 0x02, 0xd3, 0xe3, 0xa3, + 0x0f, 0xd9, 0x98, 0xa3, 0x0f, 0xd9, 0xb8, 0xca, 0xa7, 0x92, 0x0f, 0xd2, + 0x4b, 0x02, 
0xd3, 0xe7, 0x0d, 0xc2, 0xd3, 0xed, 0xc4, 0xe3, 0x93, 0x01, + 0x32, 0xfb, 0x02, 0xd3, 0xff, 0xc6, 0xca, 0xfd, 0x01, 0x32, 0xeb, 0x02, + 0xd4, 0x05, 0xc4, 0xde, 0x83, 0x01, 0x32, 0xe3, 0x02, 0xd4, 0x0b, 0xc5, + 0xa8, 0xf7, 0x01, 0x32, 0xdb, 0x02, 0xd4, 0x11, 0x47, 0x45, 0x86, 0x42, + 0xd4, 0x17, 0x4e, 0x6e, 0xe4, 0xc2, 0xd4, 0x33, 0x4e, 0x0e, 0x14, 0xc2, + 0xd4, 0x3f, 0x4c, 0x12, 0xe1, 0xc2, 0xd4, 0x4b, 0x4f, 0x61, 0x3e, 0x42, + 0xd4, 0x57, 0x00, 0x42, 0xd4, 0x63, 0xc6, 0x0b, 0x09, 0x0f, 0xbc, 0x69, + 0xc6, 0x02, 0xd1, 0x0f, 0xbc, 0x20, 0xca, 0x82, 0xd3, 0x01, 0x31, 0xd9, + 0x44, 0x03, 0x15, 0x42, 0xd4, 0x6f, 0x00, 0x42, 0xd4, 0x7f, 0xc6, 0x0b, + 0x09, 0x0f, 0xbc, 0x61, 0xc7, 0x3a, 0x19, 0x0f, 0xbc, 0xb9, 0xc7, 0x0a, + 0xe0, 0x0f, 0xbc, 0xe8, 0x4a, 0x01, 0xa9, 0xc2, 0xd4, 0x91, 0xd8, 0x24, + 0xcb, 0x0f, 0xad, 0x19, 0xdb, 0x03, 0xcc, 0x01, 0x5c, 0xf8, 0x00, 0x42, + 0xd4, 0xa9, 0x47, 0xbe, 0x33, 0xc2, 0xd4, 0xc7, 0xc5, 0xdd, 0x76, 0x0f, + 0x99, 0x10, 0x4a, 0x01, 0xa9, 0xc2, 0xd4, 0xd3, 0x46, 0x01, 0x4a, 0xc2, + 0xd4, 0xf5, 0x4a, 0x03, 0x3d, 0x42, 0xd5, 0x0a, 0x4a, 0x01, 0xa9, 0xc2, + 0xd5, 0x16, 0x00, 0xc2, 0xd5, 0x37, 0x46, 0x01, 0x4a, 0x42, 0xd5, 0x43, + 0x44, 0x00, 0x28, 0xc2, 0xd5, 0x4f, 0xc5, 0x0a, 0xe2, 0x01, 0x4f, 0x58, + 0xc6, 0x0b, 0x09, 0x01, 0x58, 0xd9, 0xc6, 0x02, 0xd1, 0x01, 0x59, 0x20, + 0xc6, 0x04, 0xa1, 0x01, 0x39, 0xf9, 0xc2, 0x00, 0xb3, 0x01, 0x34, 0x88, + 0xcf, 0x66, 0xde, 0x01, 0x39, 0x31, 0xc4, 0x18, 0xb3, 0x0f, 0xad, 0xf8, + 0x15, 0xc2, 0xd5, 0x5b, 0x06, 0xc2, 0xd5, 0x67, 0xd4, 0x3c, 0x14, 0x01, + 0x1f, 0xb3, 0x02, 0xd5, 0x76, 0xd7, 0x2a, 0x0f, 0x01, 0x1f, 0xab, 0x02, + 0xd5, 0x7c, 0x0e, 0x42, 0xd5, 0x82, 0x44, 0x00, 0x67, 0xc2, 0xd5, 0x91, + 0x4a, 0x01, 0xa9, 0xc2, 0xd5, 0x9d, 0xd8, 0x24, 0xcb, 0x0f, 0xad, 0x11, + 0xdb, 0x03, 0xcc, 0x01, 0x5c, 0xe8, 0xc3, 0x08, 0x7b, 0x0f, 0xad, 0x23, + 0x02, 0xd5, 0xb5, 0xc5, 0xc2, 0xc2, 0x01, 0x59, 0x10, 0xc7, 0xc6, 0xef, + 0x01, 0x4e, 0xb9, 0xd0, 0x5a, 0x62, 0x01, 0x59, 0x60, 0xc4, 0x2b, 0xf1, + 0x0f, 0x9f, 0x91, 0xc5, 0xbb, 0xcd, 0x01, 0x58, 0xf8, 0xc9, 0x46, 0x70, + 0x01, 0x2d, 0x71, 0xc7, 0x5a, 0x6b, 0x01, 0x59, 0x70, 0xc6, 0x0b, 0x09, + 0x01, 0x58, 0xe9, 0xc7, 0x3a, 0x19, 0x0f, 0xbc, 0xc1, 0xc7, 0x0a, 0xe0, + 0x0f, 0xbc, 0xf0, 0x9a, 0x01, 0x30, 0x83, 0x02, 0xd5, 0xbb, 0xcb, 0x8e, + 0xa2, 0x0f, 0xaf, 0xb0, 0xc8, 0xb6, 0xb2, 0x00, 0xdb, 0xf0, 0xc3, 0x00, + 0x74, 0x00, 0xdb, 0xe1, 0xc3, 0x38, 0x86, 0x00, 0xdb, 0xc9, 0xc3, 0x01, + 0x95, 0x00, 0xdb, 0xc0, 0xc2, 0x14, 0x49, 0x00, 0xdb, 0xd9, 0xc2, 0x06, + 0x4e, 0x00, 0xdb, 0xd0, 0xc2, 0x06, 0x4e, 0x00, 0xdb, 0xb9, 0xc2, 0x14, + 0x49, 0x00, 0xdb, 0xb0, 0xc2, 0x00, 0xb3, 0x00, 0xdb, 0xa9, 0xc2, 0x0b, + 0x47, 0x00, 0xdb, 0xa0, 0xc2, 0x01, 0x0f, 0x00, 0xdb, 0x73, 0x02, 0xd5, + 0xc1, 0xc2, 0x03, 0x66, 0x00, 0xdb, 0x6a, 0x02, 0xd5, 0xc7, 0xc2, 0x00, + 0x75, 0x00, 0xdb, 0x23, 0x02, 0xd5, 0xcd, 0xc3, 0x00, 0x74, 0x00, 0xdb, + 0x49, 0xc3, 0x0a, 0xe3, 0x00, 0xdb, 0x38, 0xc3, 0x38, 0x86, 0x00, 0xdb, + 0x41, 0xc2, 0x00, 0x75, 0x00, 0xdb, 0x10, 0xc7, 0xc2, 0x6c, 0x00, 0xd8, + 0x30, 0x00, 0x42, 0xd5, 0xd1, 0xc7, 0xc7, 0x20, 0x00, 0xda, 0x29, 0xca, + 0x60, 0x26, 0x00, 0xd8, 0xa0, 0xc2, 0x00, 0xb0, 0x00, 0xd9, 0x89, 0xc2, + 0x01, 0x30, 0x00, 0xd9, 0x80, 0xc7, 0xc2, 0x6c, 0x00, 0xd8, 0x70, 0xc7, + 0xc2, 0x6c, 0x00, 0xd8, 0x60, 0xc7, 0xbf, 0xef, 0x00, 0xd9, 0x08, 0xc3, + 0x1b, 0xe8, 0x00, 0xd9, 0x29, 0x45, 0x60, 0x22, 0x42, 0xd5, 0xe3, 0x00, + 0x42, 0xd5, 0xef, 0x0d, 0xc2, 0xd5, 0xfe, 0x97, 0x0b, 0x50, 0x21, 0xc4, + 0xdf, 0x8b, 0x0b, 0x51, 0xc1, 0x15, 0xc2, 0xd6, 0x1a, 0x16, 0xc2, 0xd6, + 0x34, 0x8f, 
0x0b, 0x50, 0x8b, 0x02, 0xd6, 0x3e, 0x14, 0xc2, 0xd6, 0x50, + 0x0e, 0xc2, 0xd6, 0x5c, 0x19, 0xc2, 0xd6, 0x6a, 0xc3, 0xe6, 0x0e, 0x0b, + 0x51, 0x59, 0x12, 0xc2, 0xd6, 0x74, 0x10, 0xc2, 0xd6, 0x7e, 0x1b, 0xc2, + 0xd6, 0xa9, 0xc2, 0x02, 0xe0, 0x0b, 0x50, 0x30, 0x09, 0xc2, 0xd6, 0xb3, + 0x19, 0xc2, 0xd6, 0xbd, 0x0d, 0xc2, 0xd6, 0xc7, 0x10, 0xc2, 0xd6, 0xdd, + 0x16, 0xc2, 0xd7, 0x0a, 0x12, 0xc2, 0xd7, 0x1a, 0x14, 0xc2, 0xd7, 0x37, + 0x15, 0xc2, 0xd7, 0x47, 0x0e, 0xc2, 0xd7, 0x61, 0x18, 0xc2, 0xd7, 0x73, + 0x0f, 0xc2, 0xd7, 0x7d, 0x08, 0xc2, 0xd7, 0xb5, 0x1b, 0xc2, 0xd7, 0xcc, + 0x8b, 0x0b, 0x4e, 0xc1, 0x91, 0x0b, 0x4e, 0xb9, 0x83, 0x0b, 0x4e, 0xa8, + 0x10, 0xc2, 0xd7, 0xe6, 0x0e, 0xc2, 0xd8, 0x06, 0x8f, 0x0b, 0x4a, 0x8b, + 0x02, 0xd8, 0x1c, 0x16, 0xc2, 0xd8, 0x42, 0x0d, 0xc2, 0xd8, 0x5d, 0x15, + 0xc2, 0xd8, 0x74, 0x08, 0xc2, 0xd8, 0x8c, 0x1b, 0xc2, 0xd8, 0x98, 0x14, + 0xc2, 0xd8, 0xa8, 0x12, 0xc2, 0xd8, 0xba, 0x42, 0x00, 0x09, 0xc2, 0xd8, + 0xce, 0x19, 0x42, 0xd8, 0xda, 0x0d, 0xc2, 0xd8, 0xe6, 0x15, 0xc2, 0xd8, + 0xfa, 0x16, 0xc2, 0xd9, 0x08, 0x12, 0xc2, 0xd9, 0x18, 0x0e, 0xc2, 0xd9, + 0x22, 0x10, 0xc2, 0xd9, 0x30, 0x0f, 0xc2, 0xd9, 0x52, 0x1b, 0xc2, 0xd9, + 0x6c, 0x19, 0xc2, 0xd9, 0x7c, 0xc2, 0x17, 0x99, 0x0b, 0x46, 0x19, 0x43, + 0x2c, 0xdc, 0xc2, 0xd9, 0x88, 0xc4, 0xe3, 0x03, 0x0b, 0x46, 0x01, 0xc3, + 0xe6, 0x29, 0x0b, 0x45, 0xe1, 0x09, 0x42, 0xd9, 0x92, 0x10, 0xc2, 0xd9, + 0x9e, 0x0f, 0xc2, 0xd9, 0xb6, 0x12, 0xc2, 0xd9, 0xd1, 0x47, 0xc0, 0x19, + 0xc2, 0xd9, 0xe9, 0x0d, 0xc2, 0xd9, 0xf3, 0x0e, 0xc2, 0xda, 0x03, 0x42, + 0x14, 0xda, 0xc2, 0xda, 0x13, 0x15, 0xc2, 0xda, 0x1d, 0x16, 0xc2, 0xda, + 0x3b, 0xc5, 0xd5, 0xfb, 0x0b, 0x43, 0xb1, 0xc4, 0xa6, 0xdc, 0x0b, 0x43, + 0x99, 0x1b, 0x42, 0xda, 0x47, 0xc3, 0x0a, 0x85, 0x0b, 0x42, 0x91, 0x15, + 0xc2, 0xda, 0x53, 0x16, 0xc2, 0xda, 0x6d, 0x0d, 0xc2, 0xda, 0x7d, 0x0f, + 0xc2, 0xda, 0x91, 0x10, 0xc2, 0xda, 0xb1, 0x0e, 0xc2, 0xda, 0xe7, 0x12, + 0xc2, 0xdb, 0x00, 0x17, 0xc2, 0xdb, 0x16, 0xc3, 0x00, 0x79, 0x0b, 0x41, + 0xd1, 0xc4, 0xe0, 0x17, 0x0b, 0x41, 0xc9, 0x09, 0x42, 0xdb, 0x22, 0xc7, + 0xc8, 0x5b, 0x00, 0xdf, 0xf9, 0xc9, 0xaf, 0xc0, 0x00, 0xdf, 0xe8, 0x49, + 0xa9, 0x09, 0x42, 0xdb, 0x2e, 0xc2, 0x00, 0xdb, 0x00, 0xde, 0xf9, 0xc2, + 0x19, 0x2c, 0x00, 0xde, 0xe1, 0xc2, 0x0d, 0xf6, 0x00, 0xde, 0xc9, 0xc2, + 0x01, 0xc3, 0x00, 0xde, 0xa9, 0xc2, 0x00, 0x39, 0x00, 0xde, 0x99, 0xc2, + 0x01, 0x30, 0x00, 0xde, 0x79, 0xc2, 0x01, 0x4a, 0x00, 0xde, 0x61, 0xc2, + 0x00, 0xb0, 0x00, 0xde, 0x41, 0xc2, 0x00, 0xd0, 0x00, 0xde, 0x19, 0x83, + 0x00, 0xde, 0x08, 0xc6, 0xcd, 0x7f, 0x00, 0x4e, 0x70, 0x46, 0x00, 0x8b, + 0x42, 0xdb, 0x40, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0x11, 0x83, 0x00, 0x4d, + 0x08, 0xc2, 0x00, 0xd0, 0x00, 0x4d, 0x01, 0x83, 0x00, 0x4c, 0xf8, 0x94, + 0x00, 0x4c, 0x5b, 0x02, 0xdb, 0x4c, 0x8e, 0x00, 0x4c, 0x62, 0x02, 0xdb, + 0x50, 0xc4, 0x1e, 0x97, 0x00, 0x4e, 0x69, 0xc5, 0x40, 0xe7, 0x00, 0x4c, + 0x18, 0xc7, 0x7a, 0x7f, 0x00, 0x4d, 0xe9, 0xc7, 0x14, 0x39, 0x00, 0x4c, + 0x10, 0x94, 0x00, 0x4e, 0x20, 0x8e, 0x00, 0x4f, 0x18, 0xda, 0x1c, 0x04, + 0x00, 0x4f, 0xc0, 0xc2, 0x02, 0xa0, 0x00, 0x4f, 0xa9, 0xc4, 0x02, 0xde, + 0x00, 0x4f, 0xb0, 0xc2, 0x00, 0x64, 0x00, 0xd0, 0x79, 0x83, 0x00, 0xd0, + 0x70, 0xc2, 0x02, 0x2b, 0x00, 0xd0, 0x19, 0x83, 0x00, 0xd0, 0x10, 0xa5, + 0x01, 0x46, 0x00, 0x9f, 0x01, 0x40, 0x1b, 0x02, 0xdb, 0x54, 0xa0, 0x01, + 0x40, 0x2b, 0x02, 0xdb, 0x7b, 0xa1, 0x01, 0x40, 0x4b, 0x02, 0xdb, 0x9b, + 0xa2, 0x01, 0x40, 0x8b, 0x02, 0xdb, 0xb4, 0xa3, 0x01, 0x41, 0x0b, 0x02, + 0xdb, 0xc6, 0xa5, 0x01, 0x44, 0x09, 0xa4, 0x01, 0x42, 0x0a, 0x02, 0xdb, + 0xd1, 0xa0, 
0x01, 0x40, 0x33, 0x02, 0xdb, 0xd5, 0xa1, 0x01, 0x40, 0x53, + 0x02, 0xdb, 0xf5, 0xa2, 0x01, 0x40, 0x93, 0x02, 0xdc, 0x0e, 0xa3, 0x01, + 0x41, 0x13, 0x02, 0xdc, 0x20, 0xa5, 0x01, 0x44, 0x11, 0xa4, 0x01, 0x42, + 0x12, 0x02, 0xdc, 0x2b, 0xa1, 0x01, 0x40, 0x63, 0x02, 0xdc, 0x2f, 0xa2, + 0x01, 0x40, 0xa3, 0x02, 0xdc, 0x48, 0xa3, 0x01, 0x41, 0x23, 0x02, 0xdc, + 0x5a, 0xa5, 0x01, 0x44, 0x21, 0xa4, 0x01, 0x42, 0x22, 0x02, 0xdc, 0x65, + 0xa2, 0x01, 0x40, 0xc3, 0x02, 0xdc, 0x69, 0xa3, 0x01, 0x41, 0x43, 0x02, + 0xdc, 0x7b, 0xa5, 0x01, 0x44, 0x41, 0xa4, 0x01, 0x42, 0x42, 0x02, 0xdc, + 0x86, 0xa3, 0x01, 0x41, 0x83, 0x02, 0xdc, 0x8a, 0xa5, 0x01, 0x44, 0x81, + 0xa4, 0x01, 0x42, 0x82, 0x02, 0xdc, 0x95, 0xa5, 0x01, 0x45, 0x01, 0xa4, + 0x01, 0x43, 0x02, 0x02, 0xdc, 0x99, 0xc8, 0x4b, 0x94, 0x08, 0x83, 0x29, + 0xc7, 0x0d, 0x04, 0x08, 0x83, 0x20, 0xc2, 0x0d, 0x10, 0x08, 0x83, 0x08, + 0xc2, 0x0d, 0x10, 0x08, 0x83, 0x00, 0xc3, 0x45, 0x6b, 0x08, 0x82, 0xf9, + 0xc2, 0x00, 0x5f, 0x08, 0x82, 0xb0, 0xc3, 0x0d, 0x0f, 0x08, 0x82, 0xf1, + 0xc2, 0x00, 0x33, 0x08, 0x82, 0xa8, 0xc4, 0x0d, 0x0e, 0x08, 0x82, 0xe9, + 0xc3, 0x02, 0xdf, 0x08, 0x82, 0xa0, 0xc4, 0x18, 0x12, 0x08, 0x82, 0xe1, + 0x91, 0x08, 0x82, 0x98, 0x42, 0x02, 0xa7, 0xc2, 0xdc, 0x9d, 0x46, 0x2e, + 0xee, 0xc2, 0xdc, 0xa7, 0xc4, 0xd8, 0xde, 0x08, 0x81, 0xb9, 0xc3, 0x7e, + 0x5e, 0x08, 0x81, 0xb0, 0xc2, 0x00, 0xd0, 0x08, 0x81, 0x01, 0x83, 0x08, + 0x80, 0xf8, 0xc2, 0x00, 0xd0, 0x08, 0x80, 0xf1, 0x83, 0x08, 0x80, 0xe8, + 0x8e, 0x08, 0x80, 0x6b, 0x02, 0xdc, 0xaf, 0x94, 0x08, 0x80, 0x5a, 0x02, + 0xdc, 0xb3, 0x4f, 0x66, 0x39, 0x42, 0xdc, 0xb7, 0x97, 0x08, 0x82, 0x29, + 0x8b, 0x08, 0x82, 0x19, 0x83, 0x08, 0x81, 0xc0, 0x8e, 0x08, 0x82, 0x03, + 0x02, 0xdc, 0xbf, 0x94, 0x08, 0x81, 0xf2, 0x02, 0xdc, 0xc3, 0x97, 0x08, + 0x81, 0xe8, 0x8b, 0x08, 0x81, 0xd8, 0xc4, 0x18, 0x10, 0x08, 0x83, 0x69, + 0xc2, 0x22, 0xcc, 0x08, 0x83, 0x60, 0xc3, 0x0d, 0x14, 0x08, 0x83, 0x59, + 0xc3, 0x09, 0x9e, 0x08, 0x83, 0x50, 0xc4, 0x02, 0xde, 0x08, 0x83, 0x49, + 0xc2, 0x02, 0xa0, 0x08, 0x83, 0x40, 0x44, 0xe3, 0xbb, 0xc2, 0xdc, 0xc7, + 0x4e, 0x6b, 0x44, 0xc2, 0xdc, 0xd3, 0xc8, 0x9c, 0xe0, 0x0e, 0x80, 0xb0, + 0xc4, 0x99, 0xff, 0x0e, 0x87, 0x99, 0xc4, 0xe4, 0xa7, 0x0e, 0x87, 0x89, + 0xc3, 0x2e, 0xd7, 0x0e, 0x82, 0x78, 0x44, 0xe3, 0xbb, 0xc2, 0xdc, 0xdf, + 0xc8, 0x9c, 0xe0, 0x0e, 0x80, 0xe0, 0x00, 0xc2, 0xdc, 0xf1, 0xc2, 0x01, + 0x6f, 0x0e, 0x81, 0x90, 0xc8, 0xbb, 0x0a, 0x0e, 0x82, 0xa1, 0xc8, 0xad, + 0x15, 0x0e, 0x82, 0x60, 0x42, 0x02, 0x32, 0xc2, 0xdc, 0xfb, 0x95, 0x0e, + 0x80, 0x8a, 0x02, 0xdd, 0x07, 0xc3, 0x63, 0x2b, 0x0e, 0x84, 0x21, 0xc8, + 0x9c, 0xe0, 0x0e, 0x81, 0x10, 0x16, 0xc2, 0xdd, 0x0b, 0xc7, 0xc3, 0x22, + 0x0e, 0x87, 0x18, 0x16, 0xc2, 0xdd, 0x17, 0xc7, 0xc3, 0x22, 0x0e, 0x86, + 0xf8, 0xc3, 0x63, 0x2b, 0x0e, 0x83, 0x29, 0xcc, 0x84, 0x5d, 0x0e, 0x81, + 0x59, 0xc8, 0x9c, 0xe0, 0x0e, 0x81, 0x50, 0x4f, 0x6b, 0x43, 0x42, 0xdd, + 0x23, 0xc7, 0xc0, 0xf9, 0x0e, 0x86, 0xe9, 0xc5, 0xcc, 0xcc, 0x0e, 0x86, + 0xe1, 0x46, 0xca, 0xf1, 0x42, 0xdd, 0x2f, 0x42, 0x00, 0x2c, 0xc2, 0xdd, + 0x3b, 0xcc, 0x2e, 0x8a, 0x0e, 0x86, 0x78, 0xd5, 0x35, 0xb4, 0x0e, 0x86, + 0xb9, 0xc8, 0x2e, 0x8e, 0x0e, 0x86, 0x68, 0xc6, 0xcc, 0xcb, 0x0e, 0x80, + 0x58, 0xc6, 0xd2, 0x5f, 0x0e, 0x86, 0x31, 0xc5, 0x1a, 0x11, 0x0e, 0x86, + 0x28, 0x42, 0x02, 0x32, 0xc2, 0xdd, 0x47, 0xc3, 0x09, 0xe5, 0x0e, 0x85, + 0xd8, 0xc2, 0x00, 0x45, 0x0e, 0x85, 0xc1, 0x83, 0x0e, 0x81, 0xa8, 0xce, + 0x6d, 0x78, 0x0e, 0x85, 0x99, 0xc5, 0x6d, 0x65, 0x0e, 0x85, 0x58, 0xcb, + 0x94, 0xbc, 0x0e, 0x85, 0x91, 0xc7, 0x6d, 0x63, 0x0e, 0x85, 0x10, 0xcd, + 0x7a, 0xfb, 
0x0e, 0x85, 0x49, 0xc5, 0x6d, 0x65, 0x0e, 0x85, 0x40, 0xc6, + 0x92, 0x38, 0x0e, 0x85, 0x39, 0xc9, 0x6d, 0x7d, 0x0e, 0x85, 0x30, 0xca, + 0x94, 0x18, 0x0e, 0x83, 0x71, 0xc8, 0xb9, 0x3a, 0x0e, 0x83, 0x58, 0xc3, + 0x63, 0x2b, 0x0e, 0x83, 0x19, 0x03, 0x42, 0xdd, 0x53, 0xc7, 0xc5, 0x05, + 0x0e, 0x83, 0xc1, 0x48, 0xbf, 0x1a, 0x42, 0xdd, 0x5f, 0xcf, 0x65, 0xdf, + 0x0e, 0x84, 0x69, 0xcc, 0x85, 0xb9, 0x0e, 0x84, 0x60, 0xc4, 0x77, 0x35, + 0x0e, 0x82, 0xd0, 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0xf9, 0xc8, 0x9c, 0xe0, + 0x0e, 0x81, 0xe8, 0x00, 0x42, 0xdd, 0x6b, 0xc9, 0xad, 0x14, 0x0e, 0x82, + 0x59, 0x8b, 0x0e, 0x82, 0x48, 0x5b, 0x18, 0xc0, 0xc2, 0xdd, 0x77, 0x46, + 0x02, 0xae, 0x42, 0xdd, 0x83, 0xc6, 0x0b, 0x09, 0x01, 0x3a, 0x89, 0xc6, + 0x02, 0xd1, 0x0f, 0xa9, 0xf0, 0xc6, 0x04, 0xe1, 0x0f, 0xda, 0x09, 0xc5, + 0x00, 0x2c, 0x0f, 0xda, 0x10, 0x55, 0x16, 0xaa, 0xc2, 0xdd, 0x95, 0x48, + 0x0a, 0x53, 0xc2, 0xdd, 0xa7, 0x4a, 0x13, 0xe3, 0x42, 0xdd, 0xb3, 0xc7, + 0x16, 0x16, 0x01, 0x52, 0x91, 0x45, 0x00, 0x5a, 0x42, 0xdd, 0xbf, 0xc7, + 0x80, 0x70, 0x01, 0x52, 0xf1, 0xc8, 0x52, 0x09, 0x01, 0x53, 0x00, 0x46, + 0x00, 0x2c, 0xc2, 0xdd, 0xcb, 0x46, 0x01, 0xc8, 0xc2, 0xdd, 0xd5, 0x46, + 0x02, 0xae, 0x42, 0xdd, 0xe1, 0xc9, 0xb2, 0x75, 0x0f, 0xaf, 0x71, 0xca, + 0x0b, 0x94, 0x01, 0x80, 0x42, 0x02, 0xdd, 0xed, 0xcc, 0x12, 0x2d, 0x01, + 0x59, 0x81, 0xcc, 0x8a, 0xed, 0x01, 0x59, 0x90, 0xe0, 0x09, 0xa7, 0x0f, + 0xdc, 0xa0, 0x46, 0x00, 0x8b, 0x42, 0xdd, 0xf3, 0x44, 0x04, 0x91, 0xc2, + 0xde, 0x03, 0xc3, 0x04, 0x20, 0x01, 0x2c, 0x60, 0x00, 0x42, 0xde, 0x0f, + 0x46, 0x00, 0x8b, 0x42, 0xde, 0x1b, 0xc9, 0xb0, 0x6b, 0x01, 0x0d, 0x69, + 0xca, 0x01, 0xfd, 0x01, 0x58, 0x20, 0xcc, 0x84, 0x99, 0x01, 0x1d, 0x19, + 0xc9, 0x57, 0x36, 0x01, 0x1d, 0x11, 0xcc, 0x80, 0xcd, 0x01, 0x1d, 0x09, + 0x45, 0x00, 0x8c, 0x42, 0xde, 0x27, 0xca, 0xa2, 0x74, 0x01, 0x1d, 0x49, + 0xcc, 0x82, 0xe9, 0x01, 0x1d, 0x41, 0xca, 0xa3, 0x5a, 0x01, 0x1d, 0x38, + 0xcd, 0x3f, 0xe8, 0x01, 0x2c, 0x69, 0xce, 0x08, 0x79, 0x01, 0x2c, 0x50, + 0xd6, 0x31, 0x40, 0x01, 0x4e, 0x79, 0xd6, 0x14, 0xf9, 0x0f, 0xdb, 0x60, + 0xcc, 0x00, 0x33, 0x01, 0x4c, 0x19, 0xcd, 0x69, 0x65, 0x01, 0x80, 0x70, + 0xcc, 0x84, 0x15, 0x01, 0x4a, 0x81, 0xca, 0xa4, 0x18, 0x01, 0x4a, 0x58, + 0xcc, 0x84, 0x15, 0x01, 0x4a, 0x51, 0xca, 0xa4, 0x18, 0x01, 0x4a, 0x70, + 0xca, 0x03, 0xdd, 0x0f, 0xc4, 0x81, 0x48, 0x01, 0x9a, 0x42, 0xde, 0x45, + 0xc5, 0x01, 0xa2, 0x01, 0x0e, 0xd1, 0xca, 0x52, 0xc2, 0x01, 0x48, 0x70, + 0x46, 0x02, 0x5c, 0xc2, 0xde, 0x5a, 0xd1, 0x52, 0xbb, 0x01, 0x59, 0xb8, + 0xd9, 0x1f, 0xf9, 0x0f, 0xc0, 0x21, 0x15, 0xc2, 0xde, 0x66, 0x42, 0x00, + 0x58, 0xc2, 0xde, 0x72, 0xcf, 0x2c, 0x35, 0x01, 0x0f, 0xb9, 0x0e, 0xc2, + 0xde, 0x7e, 0xc4, 0x01, 0x23, 0x01, 0x0d, 0x49, 0x16, 0xc2, 0xde, 0x8a, + 0xca, 0x9e, 0x28, 0x01, 0x4a, 0x31, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0xa1, + 0xcc, 0x84, 0xb1, 0x0f, 0xc4, 0xc0, 0x43, 0x10, 0x9e, 0xc2, 0xde, 0x99, + 0x47, 0x25, 0xf3, 0x42, 0xde, 0xa8, 0xd1, 0x56, 0x73, 0x01, 0x48, 0xf8, + 0x45, 0x00, 0xd5, 0xc2, 0xde, 0xb8, 0x43, 0x02, 0x9c, 0x42, 0xde, 0xd0, + 0x00, 0xc2, 0xde, 0xd6, 0xc5, 0x14, 0xa5, 0x01, 0x48, 0xd8, 0xd7, 0x2a, + 0x26, 0x01, 0x0e, 0x59, 0x4a, 0x01, 0x58, 0x42, 0xde, 0xe2, 0xc6, 0x0e, + 0xe0, 0x01, 0x53, 0xf9, 0xc5, 0x00, 0xd4, 0x01, 0x54, 0x0a, 0x02, 0xde, + 0xee, 0xc8, 0x23, 0xa0, 0x01, 0x54, 0x69, 0xd2, 0x09, 0xd5, 0x01, 0x54, + 0x78, 0xe0, 0x07, 0xc7, 0x01, 0x54, 0x98, 0xe0, 0x08, 0x87, 0x01, 0x3b, + 0x98, 0xc4, 0x11, 0xa4, 0x01, 0x5e, 0x61, 0xc4, 0x0e, 0x6a, 0x0f, 0xbe, + 0x20, 0xcf, 0x15, 0x36, 0x0f, 0xbd, 0x79, 0xd2, 0x22, 0x49, 0x0f, 0xbe, + 0x48, 0xc2, 
0x00, 0x43, 0x05, 0x27, 0xc1, 0xc3, 0xe4, 0xfa, 0x05, 0x27, + 0xd1, 0xc2, 0x00, 0x6b, 0x05, 0x27, 0xd9, 0xc2, 0x00, 0xc1, 0x05, 0x27, + 0xe1, 0xc3, 0xe6, 0x02, 0x05, 0x27, 0xe8, 0xdd, 0x10, 0xfa, 0x01, 0x50, + 0x99, 0xdc, 0x12, 0xfd, 0x01, 0x50, 0x90, 0x1e, 0xc2, 0xde, 0xf4, 0x1d, + 0xc2, 0xdf, 0x1e, 0xc7, 0xc8, 0x15, 0x08, 0x3a, 0xa1, 0xc5, 0xd6, 0x1e, + 0x08, 0x3a, 0xa8, 0x23, 0xc2, 0xdf, 0x52, 0x1d, 0xc2, 0xdf, 0x66, 0x1e, + 0xc2, 0xdf, 0x86, 0x1f, 0xc2, 0xdf, 0xae, 0x20, 0xc2, 0xdf, 0xd2, 0x21, + 0xc2, 0xdf, 0xde, 0x22, 0x42, 0xdf, 0xfe, 0x9d, 0x08, 0x3b, 0x01, 0x9e, + 0x08, 0x3b, 0x09, 0x9f, 0x08, 0x3b, 0x11, 0xa0, 0x08, 0x3b, 0x19, 0xa1, + 0x08, 0x3b, 0x21, 0xa2, 0x08, 0x3b, 0x29, 0xa3, 0x08, 0x3b, 0x31, 0xa4, + 0x08, 0x3b, 0x38, 0x1d, 0xc2, 0xe0, 0x22, 0x1e, 0x42, 0xe0, 0x46, 0xc6, + 0xcf, 0x41, 0x08, 0x32, 0x39, 0xc3, 0xe6, 0x50, 0x08, 0x32, 0x79, 0xc3, + 0xe6, 0x5c, 0x08, 0x32, 0x50, 0x1d, 0xc2, 0xe0, 0x6c, 0x1e, 0xc2, 0xe0, + 0x90, 0x1f, 0xc2, 0xe0, 0xb8, 0x20, 0xc2, 0xe0, 0xe0, 0x21, 0xc2, 0xe1, + 0x08, 0x22, 0xc2, 0xe1, 0x30, 0x23, 0xc2, 0xe1, 0x58, 0x24, 0x42, 0xe1, + 0x80, 0x1d, 0xc2, 0xe1, 0x88, 0x1e, 0x42, 0xe1, 0xc4, 0x1d, 0xc2, 0xe1, + 0xfa, 0x1e, 0xc2, 0xe2, 0x1a, 0x1f, 0xc2, 0xe2, 0x32, 0x20, 0xc2, 0xe2, + 0x56, 0x21, 0xc2, 0xe2, 0x7a, 0x22, 0xc2, 0xe2, 0x96, 0x23, 0xc2, 0xe2, + 0xba, 0x24, 0xc2, 0xe2, 0xd2, 0x25, 0xc2, 0xe2, 0xfa, 0x26, 0x42, 0xe3, + 0x22, 0x49, 0xae, 0x8e, 0xc2, 0xe3, 0x3a, 0x47, 0xc3, 0x7d, 0x42, 0xe3, + 0x62, 0x04, 0xc2, 0xe3, 0x8a, 0x48, 0xbf, 0x62, 0x42, 0xe3, 0x92, 0x1e, + 0xc2, 0xe3, 0xa2, 0xc9, 0xae, 0x2b, 0x08, 0x06, 0x90, 0x83, 0x00, 0xc9, + 0xa1, 0xc2, 0x01, 0x30, 0x00, 0xc9, 0x88, 0x91, 0x00, 0xc9, 0x28, 0x87, + 0x00, 0xc9, 0x18, 0x97, 0x00, 0xc9, 0x31, 0x8b, 0x00, 0xc9, 0x20, 0xc6, + 0x0b, 0x09, 0x0f, 0xbf, 0x59, 0xc6, 0x02, 0xd1, 0x0f, 0xbf, 0x20, 0xc7, + 0x3a, 0x19, 0x0f, 0xa9, 0xb9, 0xc6, 0x02, 0xd1, 0x0f, 0xa9, 0xa9, 0xc6, + 0x0b, 0x09, 0x0f, 0xbf, 0x30, 0xdf, 0x0d, 0x5d, 0x08, 0x59, 0xf9, 0xdd, + 0x11, 0xc5, 0x08, 0x59, 0xe8, 0xc7, 0x3a, 0x19, 0x0f, 0xa9, 0xb1, 0xc6, + 0x02, 0xd1, 0x0f, 0xbf, 0x01, 0xc6, 0x0b, 0x09, 0x0f, 0xbf, 0x38, 0xdf, + 0x0c, 0xa3, 0x08, 0x59, 0xf1, 0xdd, 0x05, 0x0a, 0x08, 0x59, 0xe0, 0x95, + 0x00, 0x03, 0x9b, 0x02, 0xe3, 0xb0, 0x85, 0x00, 0x03, 0x1b, 0x02, 0xe3, + 0xd4, 0x96, 0x00, 0x03, 0xa3, 0x02, 0xe3, 0xf8, 0x91, 0x00, 0x03, 0x7b, + 0x02, 0xe4, 0x32, 0x8b, 0x00, 0x03, 0x4b, 0x02, 0xe4, 0x56, 0x86, 0x00, + 0x03, 0x23, 0x02, 0xe4, 0x6a, 0x87, 0x00, 0x03, 0x2b, 0x02, 0xe4, 0x8b, + 0x94, 0x00, 0x03, 0x93, 0x02, 0xe4, 0xb9, 0x8e, 0x00, 0x03, 0x63, 0x02, + 0xe4, 0xd2, 0x88, 0x00, 0x03, 0x33, 0x02, 0xe5, 0x01, 0x9b, 0x00, 0x03, + 0xcb, 0x02, 0xe5, 0x10, 0x8f, 0x00, 0x03, 0x6b, 0x02, 0xe5, 0x1c, 0x97, + 0x00, 0x03, 0xab, 0x02, 0xe5, 0x2e, 0x83, 0x00, 0x03, 0x0b, 0x02, 0xe5, + 0x4b, 0x99, 0x00, 0x03, 0xbb, 0x02, 0xe5, 0x7c, 0x8a, 0x00, 0x03, 0x43, + 0x02, 0xe5, 0x82, 0x9c, 0x00, 0x03, 0xd3, 0x02, 0xe5, 0x9b, 0x9a, 0x00, + 0x03, 0xc3, 0x02, 0xe5, 0xa1, 0x98, 0x00, 0x03, 0xb3, 0x02, 0xe5, 0xa7, + 0x92, 0x00, 0x03, 0x83, 0x02, 0xe5, 0xc3, 0x90, 0x00, 0x03, 0x73, 0x02, + 0xe5, 0xcf, 0x8d, 0x00, 0x03, 0x5b, 0x02, 0xe5, 0xdd, 0x89, 0x00, 0x03, + 0x3b, 0x02, 0xe5, 0xe9, 0x84, 0x00, 0x03, 0x13, 0x02, 0xe6, 0x01, 0x8c, + 0x00, 0x03, 0x53, 0x02, 0xe6, 0x23, 0x93, 0x00, 0x03, 0x8a, 0x02, 0xe6, + 0x29, 0xc2, 0x00, 0x15, 0x07, 0xd8, 0x31, 0xc8, 0xb8, 0xe2, 0x07, 0xd8, + 0x29, 0x08, 0xc2, 0xe6, 0x35, 0xc2, 0x00, 0x0b, 0x00, 0x09, 0x99, 0xc2, + 0x49, 0x0c, 0x00, 0x0a, 0x98, 0x46, 0x45, 0x87, 0x42, 0xe6, 0x44, 0x46, + 0x00, 0x8b, 
0x42, 0xe6, 0x58, 0xc2, 0x25, 0xa1, 0x00, 0xe9, 0x19, 0xc2, + 0x00, 0x8e, 0x00, 0xe8, 0x30, 0x48, 0x10, 0x2f, 0xc2, 0xe6, 0x64, 0xcf, + 0x6a, 0x26, 0x05, 0x5a, 0x31, 0xc2, 0x01, 0xdf, 0x05, 0x3b, 0xb0, 0x97, + 0x00, 0xe8, 0xa9, 0xc5, 0xd4, 0x9d, 0x00, 0xe8, 0x81, 0x87, 0x00, 0x13, + 0xb0, 0xc7, 0xc3, 0x84, 0x00, 0xe8, 0x18, 0x87, 0x00, 0xe8, 0x08, 0xca, + 0x1f, 0x59, 0x00, 0x14, 0xd8, 0xc9, 0xab, 0xb5, 0x00, 0x14, 0x08, 0x46, + 0x00, 0x8b, 0xc2, 0xe6, 0x6c, 0xc3, 0x3c, 0x63, 0x00, 0x10, 0xe0, 0x45, + 0x04, 0xcc, 0xc2, 0xe6, 0xa3, 0x46, 0x00, 0x8b, 0x42, 0xe6, 0xaf, 0x00, + 0xc2, 0xe6, 0xc1, 0xc6, 0x10, 0x3f, 0x00, 0x0d, 0x88, 0x46, 0x00, 0x8b, + 0xc2, 0xe6, 0xcd, 0x91, 0x05, 0x3a, 0x71, 0xc4, 0x6d, 0xb5, 0x05, 0x3d, + 0xb1, 0xcb, 0x8e, 0xc3, 0x05, 0x3e, 0x01, 0x44, 0x05, 0x76, 0xc2, 0xe7, + 0x18, 0x8b, 0x00, 0x0d, 0x11, 0x97, 0x00, 0x11, 0x10, 0x46, 0x00, 0x8b, + 0xc2, 0xe7, 0x20, 0x95, 0x05, 0x3b, 0x61, 0x47, 0x67, 0x21, 0xc2, 0xe7, + 0x61, 0xc3, 0x01, 0xbb, 0x00, 0x0c, 0xb0, 0x46, 0x00, 0x8b, 0xc2, 0xe7, + 0x79, 0x4e, 0x73, 0x36, 0xc2, 0xe7, 0xbd, 0x96, 0x05, 0x3b, 0x53, 0x02, + 0xe7, 0xc9, 0xc2, 0x00, 0x75, 0x00, 0x0a, 0x51, 0xc2, 0x01, 0xe2, 0x00, + 0x0d, 0x49, 0xc2, 0x25, 0xa1, 0x00, 0x0d, 0xba, 0x02, 0xe7, 0xcd, 0x46, + 0x00, 0x8b, 0xc2, 0xe7, 0xd1, 0x87, 0x00, 0x06, 0x33, 0x02, 0xe8, 0x18, + 0x83, 0x05, 0x39, 0x91, 0x91, 0x05, 0x39, 0xa1, 0x97, 0x05, 0x39, 0xb1, + 0x98, 0x05, 0x39, 0xc3, 0x02, 0xe8, 0x1e, 0x9b, 0x05, 0x39, 0xe1, 0xca, + 0xa4, 0x72, 0x05, 0x3e, 0x11, 0xc4, 0xde, 0x3f, 0x01, 0x63, 0x69, 0xc8, + 0xbd, 0x8a, 0x00, 0x0c, 0x48, 0xc6, 0xa2, 0xbb, 0x00, 0xf4, 0xf1, 0x46, + 0x00, 0x8b, 0xc2, 0xe8, 0x24, 0xc7, 0xc8, 0xfc, 0x05, 0x3c, 0x59, 0x05, + 0xc2, 0xe8, 0x47, 0xc8, 0xbe, 0x02, 0x05, 0x3e, 0xc1, 0x45, 0x03, 0x14, + 0x42, 0xe8, 0x53, 0x46, 0x00, 0x8b, 0x42, 0xe8, 0x5f, 0x47, 0x01, 0x32, + 0x42, 0xe8, 0x83, 0x46, 0x00, 0x8b, 0xc2, 0xe8, 0x8f, 0xc3, 0x95, 0x51, + 0x00, 0x0f, 0xb8, 0x46, 0x00, 0x8b, 0xc2, 0xe8, 0xab, 0x9b, 0x05, 0x3b, + 0x01, 0xcb, 0x91, 0x15, 0x05, 0x3b, 0x11, 0xc3, 0x02, 0x39, 0x05, 0x3b, + 0x41, 0x47, 0xc8, 0xcb, 0x42, 0xe8, 0xbb, 0x46, 0x00, 0x8b, 0xc2, 0xe8, + 0xcd, 0x9c, 0x05, 0x39, 0x41, 0xc7, 0xc3, 0xa0, 0x05, 0x39, 0x51, 0xc4, + 0x2a, 0xcc, 0x00, 0x06, 0xf3, 0x02, 0xe8, 0xed, 0x46, 0x45, 0x87, 0xc2, + 0xe8, 0xf6, 0x44, 0x05, 0x14, 0x42, 0xe9, 0x1b, 0x00, 0xc2, 0xe9, 0x2d, + 0x48, 0x10, 0x2f, 0xc2, 0xe9, 0x39, 0xca, 0xa6, 0x66, 0x05, 0x3a, 0xe0, + 0x46, 0x00, 0x8b, 0x42, 0xe9, 0x4f, 0x46, 0x00, 0x8b, 0xc2, 0xe9, 0x6b, + 0x8c, 0x00, 0x0e, 0x50, 0x46, 0x00, 0x8b, 0xc2, 0xe9, 0x95, 0x8c, 0x00, + 0x0e, 0x38, 0x46, 0x00, 0x8b, 0x42, 0xe9, 0xbf, 0x46, 0x00, 0x8b, 0xc2, + 0xe9, 0xe8, 0xc4, 0xde, 0xa3, 0x00, 0x0f, 0xb1, 0xc3, 0x0a, 0xe3, 0x05, + 0x39, 0x31, 0xc5, 0xd3, 0x2c, 0x01, 0x63, 0xa8, 0x46, 0x00, 0x8b, 0xc2, + 0xea, 0x02, 0x47, 0x23, 0x34, 0xc2, 0xea, 0x30, 0xc4, 0x38, 0x2c, 0x00, + 0x0c, 0xa1, 0xc2, 0x00, 0xd0, 0x00, 0x0d, 0x10, 0x46, 0x00, 0x8b, 0x42, + 0xea, 0x42, 0x46, 0x00, 0x8b, 0xc2, 0xea, 0x54, 0x9c, 0x00, 0x0f, 0x8a, + 0x02, 0xea, 0x74, 0x46, 0x00, 0x8b, 0xc2, 0xea, 0x7a, 0xc2, 0x00, 0x0a, + 0x05, 0x3d, 0x99, 0xc8, 0xba, 0x4a, 0x05, 0x39, 0x63, 0x02, 0xea, 0xa2, + 0xc2, 0x00, 0x45, 0x05, 0x3b, 0x71, 0xcf, 0x67, 0x1a, 0x05, 0x3e, 0x80, + 0x46, 0x00, 0x8b, 0xc2, 0xea, 0xa8, 0xc3, 0x04, 0x87, 0x05, 0x3d, 0xa1, + 0xc7, 0xc9, 0xb9, 0x05, 0x3a, 0x30, 0x46, 0x00, 0x8b, 0x42, 0xea, 0xcc, + 0x46, 0x00, 0x8b, 0x42, 0xea, 0xd6, 0xc4, 0xdf, 0x43, 0x00, 0x74, 0x11, + 0xc3, 0x02, 0x45, 0x00, 0x74, 0x20, 0xc2, 0x0f, 0x7b, 0x00, 0x76, 0xf1, + 0xc3, 0x4d, 
0xc3, 0x00, 0x76, 0xf8, 0xc2, 0x19, 0x2c, 0x00, 0x74, 0x71, + 0xc2, 0x00, 0xc1, 0x00, 0x74, 0x98, 0x83, 0x00, 0x74, 0x79, 0xc2, 0x00, + 0xd0, 0x00, 0x74, 0x80, 0x06, 0xc2, 0xea, 0xe2, 0xc2, 0x00, 0xd0, 0x00, + 0x74, 0xc0, 0xc5, 0x00, 0x2c, 0x0f, 0xda, 0xa9, 0xc6, 0x04, 0xe1, 0x0f, + 0xda, 0xa1, 0xcc, 0x04, 0xcb, 0x0f, 0xdb, 0x38, 0x46, 0x01, 0xc8, 0xc2, + 0xea, 0xec, 0xd2, 0x4b, 0x83, 0x0f, 0xdb, 0x18, 0xd2, 0x4b, 0x83, 0x0f, + 0xdb, 0x11, 0x46, 0x01, 0xc8, 0x42, 0xea, 0xf8, 0xc6, 0x04, 0xe1, 0x0f, + 0xda, 0xc9, 0xc5, 0x00, 0x2c, 0x0f, 0xda, 0xd1, 0xcc, 0x04, 0xcb, 0x0f, + 0xda, 0xe0, 0x46, 0x02, 0xae, 0xc2, 0xeb, 0x04, 0xd2, 0x4c, 0x37, 0x0f, + 0xda, 0xf0, 0xd2, 0x4c, 0x37, 0x0f, 0xda, 0xe9, 0x46, 0x02, 0xae, 0x42, + 0xeb, 0x10, 0x46, 0x00, 0x8b, 0x42, 0xeb, 0x1c, 0xd4, 0x3e, 0x6c, 0x01, + 0x5d, 0xc0, 0xc5, 0x01, 0xa2, 0x01, 0x5b, 0x0b, 0x02, 0xeb, 0x28, 0xcc, + 0x82, 0xb9, 0x01, 0x5b, 0x59, 0xcd, 0x7c, 0xa8, 0x01, 0x5c, 0x28, 0xd5, + 0x03, 0xd2, 0x0f, 0xc0, 0xa9, 0xd8, 0x22, 0x5b, 0x0f, 0xc0, 0x49, 0xd9, + 0x1f, 0xf9, 0x0f, 0xc0, 0x29, 0x46, 0x03, 0x13, 0xc2, 0xeb, 0x2c, 0xcd, + 0x75, 0xa6, 0x01, 0x0e, 0xf1, 0x44, 0x08, 0xba, 0xc2, 0xeb, 0x38, 0xd1, + 0x01, 0x68, 0x01, 0x48, 0x49, 0xcc, 0x84, 0xb1, 0x0f, 0xc4, 0xc8, 0x47, + 0x13, 0x6d, 0xc2, 0xeb, 0x44, 0xc6, 0x10, 0x9d, 0x01, 0x4a, 0xc1, 0xc8, + 0xae, 0xbc, 0x01, 0x4b, 0x00, 0xc8, 0xae, 0xbc, 0x01, 0x4a, 0xe1, 0xc6, + 0x10, 0x9d, 0x01, 0x4a, 0xa0, 0xe0, 0x0a, 0xe7, 0x01, 0x3a, 0x58, 0xd6, + 0x2e, 0x6a, 0x01, 0x39, 0xc1, 0xca, 0x22, 0x51, 0x0f, 0xbe, 0x79, 0xcd, + 0x0e, 0x61, 0x0f, 0xbe, 0x88, 0xc3, 0xe5, 0x8a, 0x0f, 0xb3, 0x29, 0xc9, + 0xb4, 0x91, 0x0f, 0xb2, 0xe8, 0xc5, 0x01, 0xa2, 0x01, 0x3c, 0xc1, 0x49, + 0x01, 0xaa, 0x42, 0xeb, 0x4e, 0xdd, 0x0a, 0x8a, 0x01, 0x3a, 0xe1, 0x44, + 0x05, 0x9e, 0x42, 0xeb, 0x5a, 0xcf, 0x15, 0x36, 0x0f, 0xbd, 0xc1, 0xd2, + 0x22, 0x49, 0x0f, 0xbe, 0x60, 0xc3, 0xe5, 0x8a, 0x0f, 0xb3, 0x31, 0xc9, + 0xb4, 0x91, 0x0f, 0xb2, 0xf0, 0xe0, 0x0c, 0x07, 0x01, 0x3d, 0x68, 0x44, + 0x00, 0x58, 0xc2, 0xeb, 0x60, 0x44, 0x07, 0x69, 0x42, 0xeb, 0x66, 0xd0, + 0x08, 0x97, 0x01, 0x3b, 0x81, 0xd7, 0x0a, 0x90, 0x01, 0x3b, 0x70, 0xd5, + 0x03, 0xd2, 0x0f, 0xc0, 0xc1, 0xdb, 0x17, 0x46, 0x0f, 0xc0, 0xe0, 0xd1, + 0x56, 0x0d, 0x01, 0x3a, 0x19, 0xc8, 0x0a, 0xff, 0x01, 0x39, 0xe8, 0xd0, + 0x20, 0x66, 0x01, 0x3d, 0xc9, 0xd0, 0x03, 0xb7, 0x01, 0x3d, 0xc1, 0xd0, + 0x3c, 0x90, 0x01, 0x3d, 0xb8, 0x47, 0x3b, 0x9c, 0xc2, 0xeb, 0x6c, 0xc5, + 0x1c, 0xae, 0x01, 0x3b, 0x20, 0xd9, 0x1e, 0x1e, 0x01, 0x37, 0x19, 0xcd, + 0x78, 0x30, 0x01, 0x5a, 0xb8, 0xdd, 0x0a, 0x8a, 0x01, 0x3a, 0xf1, 0x44, + 0x05, 0x9e, 0x42, 0xeb, 0x78, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0xd9, 0xdb, + 0x17, 0x46, 0x0f, 0xc0, 0xf8, 0x46, 0x00, 0x8b, 0x42, 0xeb, 0x7e, 0xd0, + 0x08, 0x97, 0x01, 0x3b, 0x89, 0xd7, 0x0a, 0x90, 0x01, 0x3b, 0x78, 0x00, + 0x42, 0xeb, 0x8a, 0xc3, 0x4a, 0xb9, 0x00, 0x2f, 0x91, 0xc3, 0x04, 0xac, + 0x00, 0x2f, 0x80, 0xc4, 0xe4, 0x8b, 0x07, 0xda, 0x71, 0xc6, 0x64, 0xa4, + 0x07, 0xda, 0x20, 0xc4, 0xe4, 0x8b, 0x07, 0xda, 0x69, 0xc6, 0x64, 0xa4, + 0x07, 0xd9, 0xd8, 0xc4, 0xe4, 0x8b, 0x07, 0xda, 0x61, 0xc6, 0x64, 0xa4, + 0x07, 0xd9, 0x88, 0xc5, 0xd5, 0xa6, 0x07, 0xda, 0x59, 0xc6, 0x64, 0xa4, + 0x07, 0xd9, 0xa8, 0xcc, 0x84, 0x75, 0x07, 0xda, 0x50, 0xcc, 0x84, 0x75, + 0x07, 0xda, 0x30, 0xcc, 0x84, 0x75, 0x07, 0xd9, 0xc0, 0x46, 0x00, 0x8b, + 0x42, 0xeb, 0x96, 0xcc, 0x84, 0x75, 0x07, 0xda, 0x08, 0xcc, 0x84, 0x75, + 0x07, 0xda, 0x18, 0xcc, 0x84, 0x75, 0x07, 0xd9, 0xd0, 0xc6, 0x64, 0xa4, + 0x07, 0xd9, 0xc9, 0xc5, 0xd5, 0x83, 0x07, 0xd8, 0xe8, 0xc2, 0x00, 0x07, + 0x00, 0x2e, 
0x83, 0x02, 0xeb, 0xa3, 0x4a, 0x9f, 0x18, 0x42, 0xeb, 0xa9, + 0xc6, 0xcc, 0x59, 0x00, 0x2e, 0x38, 0xc6, 0x44, 0x50, 0x00, 0x2e, 0x09, + 0xc3, 0x62, 0x7d, 0x00, 0x2d, 0x80, 0xce, 0x6d, 0xda, 0x00, 0x2d, 0xd0, + 0xc6, 0xcc, 0xe3, 0x00, 0x2d, 0x99, 0xc5, 0x79, 0xbe, 0x00, 0x2d, 0x91, + 0xc5, 0xa0, 0x88, 0x00, 0x2d, 0x88, 0xc5, 0xd7, 0xfe, 0x00, 0x2c, 0xa9, + 0xc5, 0xcc, 0x5a, 0x00, 0x2c, 0xa0, 0xc6, 0xcc, 0xd7, 0x00, 0x2d, 0x49, + 0xc6, 0xd0, 0xe5, 0x00, 0x2d, 0x00, 0xc2, 0x4a, 0xce, 0x02, 0x6e, 0x31, + 0xce, 0x71, 0xa0, 0x02, 0x6f, 0x90, 0x11, 0xc2, 0xeb, 0xb5, 0xcc, 0x7f, + 0xdc, 0x02, 0x6e, 0xd8, 0x00, 0x42, 0xeb, 0xc1, 0xc2, 0x19, 0x2c, 0x08, + 0x68, 0xc9, 0xc2, 0x01, 0x4a, 0x08, 0x68, 0xb8, 0x02, 0x42, 0xeb, 0xcd, + 0x44, 0x3a, 0xbf, 0xc2, 0xeb, 0xf9, 0xc3, 0x39, 0x37, 0x00, 0x88, 0x4a, + 0x02, 0xec, 0x39, 0xc5, 0xd9, 0xca, 0x05, 0x4b, 0xd8, 0xc6, 0xba, 0x7c, + 0x00, 0x88, 0x8b, 0x02, 0xec, 0x3d, 0xc4, 0x79, 0xf3, 0x00, 0x88, 0x3b, + 0x02, 0xec, 0x41, 0xc6, 0xca, 0x0e, 0x00, 0x8a, 0x00, 0x02, 0x42, 0xec, + 0x45, 0x02, 0x42, 0xec, 0x6f, 0xc5, 0xc0, 0x7d, 0x00, 0x88, 0x1b, 0x02, + 0xec, 0x87, 0xc6, 0xc1, 0x86, 0x00, 0x88, 0x80, 0xc5, 0x8e, 0xdf, 0x00, + 0x88, 0x03, 0x02, 0xec, 0x8b, 0xc6, 0xbb, 0xec, 0x00, 0x88, 0x79, 0x47, + 0x79, 0xeb, 0x42, 0xec, 0x91, 0x02, 0x42, 0xec, 0xa7, 0xc4, 0xc6, 0x7a, + 0x00, 0x88, 0x63, 0x02, 0xec, 0xcb, 0x42, 0x00, 0x0a, 0xc2, 0xec, 0xd1, + 0x4a, 0xa3, 0x00, 0x42, 0xec, 0xe0, 0xc6, 0xb7, 0x9c, 0x00, 0x8a, 0x61, + 0xc9, 0x90, 0xe0, 0x00, 0x8a, 0xc8, 0xc6, 0x92, 0x0c, 0x00, 0x8b, 0x01, + 0x83, 0x00, 0x8b, 0x0b, 0x02, 0xec, 0xe8, 0x1b, 0xc2, 0xec, 0xf9, 0x87, + 0x00, 0x8b, 0x33, 0x02, 0xed, 0x1c, 0x91, 0x00, 0x8b, 0x4b, 0x02, 0xed, + 0x2a, 0x19, 0xc2, 0xed, 0x32, 0x97, 0x00, 0x8b, 0x73, 0x02, 0xed, 0x44, + 0x8b, 0x00, 0x8b, 0xab, 0x02, 0xed, 0x48, 0xca, 0xa6, 0x02, 0x00, 0x8d, + 0x10, 0x0d, 0xc2, 0xed, 0x4c, 0x15, 0xc2, 0xed, 0x61, 0xc5, 0xd9, 0x61, + 0x00, 0x8d, 0x5b, 0x02, 0xed, 0x70, 0x16, 0xc2, 0xed, 0x74, 0xc5, 0xd6, + 0x8c, 0x00, 0x8d, 0x7b, 0x02, 0xed, 0x83, 0xc5, 0xda, 0xe7, 0x00, 0x8d, + 0xbb, 0x02, 0xed, 0x87, 0x12, 0xc2, 0xed, 0x8b, 0xc5, 0xb7, 0x9d, 0x00, + 0x8d, 0xe3, 0x02, 0xed, 0xa6, 0x05, 0xc2, 0xed, 0xaa, 0xc5, 0x90, 0xe4, + 0x00, 0x8e, 0x13, 0x02, 0xed, 0xb9, 0x42, 0x0c, 0x43, 0x42, 0xed, 0xbd, + 0xc5, 0x8e, 0xdf, 0x01, 0x89, 0x8b, 0x02, 0xed, 0xcc, 0xc6, 0xbb, 0xec, + 0x01, 0x8a, 0x59, 0x47, 0x79, 0xeb, 0x42, 0xed, 0xd2, 0x44, 0x3a, 0xbf, + 0xc2, 0xed, 0xe2, 0xc3, 0x39, 0x37, 0x01, 0x8a, 0x2a, 0x02, 0xee, 0x12, + 0x02, 0x42, 0xee, 0x16, 0xc5, 0xc0, 0x7d, 0x01, 0x89, 0xb9, 0xc6, 0xc1, + 0x86, 0x01, 0x8a, 0x60, 0x02, 0x42, 0xee, 0x34, 0x02, 0x42, 0xee, 0x5d, + 0xc4, 0x79, 0xf3, 0x01, 0x8a, 0x13, 0x02, 0xee, 0x67, 0xc6, 0xba, 0x7c, + 0x01, 0x8a, 0x69, 0xc6, 0xca, 0x0e, 0x01, 0x8b, 0xf8, 0xc4, 0xb7, 0x9e, + 0x01, 0x8a, 0x38, 0xc4, 0xc6, 0x7a, 0x01, 0x8a, 0x41, 0xc6, 0xc6, 0x79, + 0x01, 0x8a, 0x50, 0x87, 0x01, 0x8a, 0x81, 0xc4, 0xa6, 0x08, 0x01, 0x8c, + 0x6a, 0x02, 0xee, 0x6b, 0x83, 0x01, 0x8a, 0x8b, 0x02, 0xee, 0x6f, 0x87, + 0x01, 0x8a, 0xb3, 0x02, 0xee, 0x73, 0x91, 0x01, 0x8a, 0xdb, 0x02, 0xee, + 0x83, 0x97, 0x01, 0x8b, 0x03, 0x02, 0xee, 0x87, 0x8b, 0x01, 0x8b, 0x10, + 0x91, 0x01, 0x8a, 0x99, 0x97, 0x01, 0x8b, 0x08, 0x87, 0x01, 0x8a, 0xd0, + 0x83, 0x01, 0x8a, 0xc3, 0x02, 0xee, 0x8b, 0x87, 0x01, 0x8a, 0xf3, 0x02, + 0xee, 0x8f, 0x8b, 0x01, 0x8a, 0xf8, 0x91, 0x01, 0x81, 0x11, 0xc4, 0x18, + 0x12, 0x01, 0x81, 0xc8, 0xc3, 0x02, 0xdf, 0x01, 0x81, 0x19, 0xc4, 0x0d, + 0x0e, 0x01, 0x81, 0xd0, 0xc3, 0x77, 0x79, 0x08, 0x47, 0x89, 0xc4, 0xdc, + 0x2d, 0x08, 
0x47, 0x70, 0x91, 0x07, 0xfb, 0x31, 0x83, 0x07, 0xfc, 0xe0, + 0x45, 0x03, 0x14, 0xc2, 0xee, 0x93, 0x83, 0x07, 0xfb, 0xd9, 0x97, 0x07, + 0xfb, 0xe9, 0x87, 0x07, 0xfb, 0xf1, 0x91, 0x07, 0xfb, 0xf9, 0x8b, 0x07, + 0xfb, 0xe0, 0x83, 0x07, 0xfb, 0xb1, 0x8b, 0x07, 0xfb, 0xb9, 0x87, 0x07, + 0xfb, 0xc9, 0x91, 0x07, 0xfb, 0xd1, 0x97, 0x07, 0xfb, 0xc0, 0x83, 0x07, + 0xfc, 0x01, 0x8b, 0x07, 0xfc, 0x09, 0x97, 0x07, 0xfc, 0x11, 0x87, 0x07, + 0xfc, 0x19, 0x91, 0x07, 0xfc, 0x20, 0x87, 0x07, 0xfc, 0x41, 0x91, 0x07, + 0xfc, 0x49, 0x83, 0x07, 0xfc, 0x29, 0x8b, 0x07, 0xfc, 0x31, 0x97, 0x07, + 0xfc, 0x38, 0x8b, 0x07, 0xfc, 0x59, 0x97, 0x07, 0xfc, 0x61, 0x87, 0x07, + 0xfc, 0x69, 0x83, 0x07, 0xfc, 0x51, 0x91, 0x07, 0xfc, 0x70, 0x8b, 0x07, + 0xfc, 0x81, 0x91, 0x07, 0xfc, 0x99, 0x83, 0x07, 0xfc, 0x79, 0x97, 0x07, + 0xfc, 0x89, 0x87, 0x07, 0xfc, 0x90, 0x83, 0x07, 0xfc, 0xa1, 0x97, 0x07, + 0xfc, 0xa9, 0x91, 0x07, 0xfc, 0xb0, 0x97, 0x07, 0xfc, 0xc9, 0x87, 0x07, + 0xfc, 0xd1, 0x91, 0x07, 0xfc, 0xd9, 0x83, 0x07, 0xfc, 0xb9, 0x8b, 0x07, + 0xfc, 0xc0, 0xc5, 0xd9, 0xca, 0x07, 0xfd, 0x18, 0xc6, 0x8e, 0xde, 0x07, + 0xfd, 0x11, 0xc5, 0x79, 0xf2, 0x07, 0xfd, 0x99, 0xc4, 0xad, 0x2b, 0x07, + 0xfd, 0xb1, 0xc5, 0xdb, 0xff, 0x07, 0xfd, 0xc9, 0xc6, 0xc0, 0x7c, 0x07, + 0xfd, 0x40, 0xc6, 0x8e, 0xde, 0x07, 0xfd, 0x51, 0xc5, 0xda, 0xe7, 0x07, + 0xfd, 0x59, 0x12, 0xc2, 0xee, 0xb1, 0xc4, 0xad, 0x2b, 0x07, 0xfd, 0x69, + 0xc7, 0xc1, 0x85, 0x07, 0xfd, 0x71, 0xc5, 0x90, 0xe4, 0x07, 0xfd, 0x80, + 0xc5, 0xd9, 0xca, 0x07, 0xfd, 0xa0, 0x87, 0x07, 0xfe, 0x28, 0x91, 0x07, + 0xfe, 0x50, 0x87, 0x07, 0xfe, 0x70, 0x91, 0x07, 0xfe, 0xa0, 0xc5, 0xdb, + 0xff, 0x07, 0xfd, 0x29, 0xc5, 0x90, 0xe4, 0x07, 0xfd, 0x30, 0x91, 0x0d, + 0x8a, 0x91, 0x87, 0x0d, 0x8a, 0x89, 0x8b, 0x0d, 0x8a, 0x81, 0x83, 0x01, + 0x84, 0x70, 0x83, 0x01, 0x84, 0x19, 0x97, 0x01, 0x84, 0x29, 0x91, 0x01, + 0x84, 0x38, 0x83, 0x01, 0x84, 0xa9, 0x87, 0x01, 0x84, 0xb0, 0xd2, 0x4a, + 0x99, 0x01, 0x72, 0x30, 0xe0, 0x06, 0xa7, 0x01, 0x52, 0x58, 0xcf, 0x62, + 0x97, 0x01, 0x52, 0x49, 0xc5, 0x13, 0x84, 0x01, 0x52, 0x38, 0xcb, 0x2a, + 0xa5, 0x01, 0x52, 0x21, 0xc7, 0x80, 0x70, 0x01, 0x52, 0x19, 0xc3, 0x02, + 0xa3, 0x01, 0x52, 0x00, 0xc6, 0x52, 0x0b, 0x01, 0x50, 0xe1, 0xc3, 0x00, + 0x44, 0x01, 0x50, 0xd0, 0x00, 0x42, 0xee, 0xbd, 0x19, 0xc2, 0xee, 0xc9, + 0xc2, 0x00, 0xc4, 0x08, 0x5b, 0xe1, 0xc4, 0x02, 0xde, 0x08, 0x5b, 0xd0, + 0xc2, 0x39, 0x8b, 0x08, 0x5b, 0x91, 0xc3, 0x1e, 0x1b, 0x08, 0x5b, 0x40, + 0xc3, 0x11, 0xef, 0x08, 0x5b, 0x89, 0x03, 0x42, 0xee, 0xd3, 0xc2, 0x00, + 0x8e, 0x08, 0x5b, 0x38, 0x00, 0x42, 0xee, 0xdf, 0x19, 0xc2, 0xee, 0xeb, + 0xc2, 0x00, 0xc4, 0x08, 0x5a, 0xe1, 0xc4, 0x02, 0xde, 0x08, 0x5a, 0xd0, + 0xc2, 0x39, 0x8b, 0x08, 0x5a, 0xa9, 0xc3, 0x1e, 0x1b, 0x08, 0x5a, 0x40, + 0xc3, 0x11, 0xef, 0x08, 0x5a, 0xa1, 0x03, 0x42, 0xee, 0xf5, 0xc2, 0x00, + 0x8e, 0x08, 0x5a, 0x38, 0xc4, 0x36, 0xb5, 0x08, 0x5a, 0x01, 0xc3, 0x16, + 0x5a, 0x08, 0x5a, 0x78, 0xc2, 0x02, 0xa0, 0x00, 0x00, 0xf1, 0xc4, 0x02, + 0xde, 0x00, 0x00, 0xe8, 0x16, 0xc2, 0xef, 0x01, 0xc3, 0x05, 0x14, 0x0f, + 0x65, 0x88, 0xc4, 0x26, 0x78, 0x0f, 0x65, 0x59, 0xc5, 0x06, 0xdb, 0x0f, + 0x65, 0x51, 0x15, 0xc2, 0xef, 0x0d, 0x08, 0xc2, 0xef, 0x19, 0x16, 0xc2, + 0xef, 0x25, 0xc3, 0x05, 0x14, 0x0f, 0x65, 0x18, 0xc2, 0x00, 0xd1, 0x0f, + 0x65, 0x10, 0xc2, 0x00, 0xd1, 0x0f, 0x64, 0xf8, 0xc2, 0x0d, 0x10, 0x0f, + 0x64, 0x13, 0x02, 0xef, 0x31, 0x00, 0x42, 0xef, 0x37, 0x9b, 0x0f, 0x64, + 0x0b, 0x02, 0xef, 0x43, 0x00, 0x42, 0xef, 0x49, 0xc4, 0x18, 0x10, 0x0f, + 0x63, 0xbb, 0x02, 0xef, 0x55, 0xc2, 0x22, 0xcc, 0x0f, 0x63, 0xb2, 0x02, + 0xef, 0x62, 
0x0b, 0xc2, 0xef, 0x6f, 0x11, 0x42, 0xef, 0x81, 0x0a, 0xc2, + 0xef, 0x93, 0x19, 0xc2, 0xef, 0xa5, 0xc2, 0x00, 0xc4, 0x0f, 0x63, 0xd2, + 0x02, 0xef, 0xb5, 0x00, 0x42, 0xef, 0xbb, 0xc4, 0x01, 0xce, 0x0f, 0x65, + 0x71, 0xc7, 0x08, 0x79, 0x0f, 0x65, 0x68, 0xc6, 0xcc, 0x2f, 0x01, 0x96, + 0x01, 0x17, 0x42, 0xef, 0xc7, 0xc3, 0x78, 0xc0, 0x01, 0x96, 0x11, 0x9b, + 0x01, 0x96, 0x20, 0xc4, 0xe3, 0xdf, 0x01, 0x96, 0x19, 0xc5, 0xd9, 0x4d, + 0x01, 0x96, 0x38, 0xc7, 0xc5, 0x44, 0x01, 0x96, 0x59, 0x43, 0x1a, 0xd3, + 0x42, 0xef, 0xd3, 0xc4, 0x15, 0xe7, 0x01, 0x9a, 0xc1, 0xc3, 0x05, 0x14, + 0x01, 0x9a, 0xc9, 0x16, 0xc2, 0xef, 0xf2, 0x08, 0xc2, 0xf0, 0x00, 0x15, + 0xc2, 0xf0, 0x0d, 0x07, 0xc2, 0xf0, 0x1f, 0xc4, 0x26, 0x78, 0x01, 0x9b, + 0x0a, 0x02, 0xf0, 0x2e, 0xc3, 0x00, 0x4a, 0x01, 0x7f, 0xb9, 0xc9, 0x03, + 0x68, 0x01, 0x7f, 0xd0, 0xc4, 0x00, 0x49, 0x01, 0x7f, 0xc1, 0xc5, 0x00, + 0x2c, 0x01, 0x7f, 0xc8, 0xc9, 0x57, 0x20, 0x08, 0x42, 0xf8, 0xc4, 0x18, + 0x12, 0x08, 0x42, 0xe1, 0x91, 0x08, 0x42, 0xc8, 0xc8, 0x4b, 0x94, 0x08, + 0x42, 0xf1, 0xc7, 0x0d, 0x04, 0x08, 0x42, 0xe8, 0xc4, 0xdc, 0x2d, 0x08, + 0x42, 0x71, 0xc3, 0x77, 0x79, 0x08, 0x42, 0x88, 0xd7, 0x2a, 0xf5, 0x0f, + 0xd2, 0x58, 0x49, 0x2a, 0xf5, 0x42, 0xf0, 0x34, 0x49, 0x2a, 0xf5, 0x42, + 0xf0, 0x40, 0xc5, 0x56, 0xa5, 0x01, 0x32, 0xc3, 0x02, 0xf0, 0x4c, 0xc3, + 0x00, 0x74, 0x01, 0x32, 0xa2, 0x02, 0xf0, 0x56, 0x49, 0x2a, 0xf5, 0x42, + 0xf0, 0x5c, 0x49, 0x2a, 0xf5, 0x42, 0xf0, 0x68, 0x0d, 0xc2, 0xf0, 0x74, + 0xc5, 0xa8, 0xf7, 0x0f, 0xd0, 0xf9, 0xc4, 0xde, 0x83, 0x0f, 0xd1, 0x01, + 0xc6, 0xca, 0xfd, 0x0f, 0xd1, 0x09, 0xc4, 0xe3, 0x93, 0x0f, 0xd1, 0x18, + 0xdd, 0x12, 0x1c, 0x0f, 0xbc, 0x51, 0x45, 0x00, 0x8c, 0x42, 0xf0, 0x80, + 0xcf, 0x61, 0x2f, 0x01, 0x3f, 0x19, 0xce, 0x6f, 0x2a, 0x01, 0x3f, 0x10, + 0xc2, 0x00, 0x61, 0x0f, 0xc8, 0x6b, 0x02, 0xf0, 0x98, 0x43, 0x11, 0x3c, + 0x42, 0xf0, 0x9e, 0x51, 0x0a, 0xc9, 0xc2, 0xf0, 0xaa, 0x45, 0x00, 0x8c, + 0xc2, 0xf0, 0xbc, 0xc6, 0x86, 0xfd, 0x0f, 0xa9, 0x98, 0x45, 0x00, 0x8c, + 0xc2, 0xf0, 0xd6, 0xcc, 0x85, 0xf5, 0x0f, 0x99, 0x2a, 0x02, 0xf0, 0xe2, + 0x15, 0xc2, 0xf0, 0xe8, 0xc7, 0x0a, 0xe0, 0x01, 0x59, 0x58, 0xca, 0xa6, + 0x34, 0x01, 0x36, 0xc9, 0x49, 0x01, 0xaa, 0x42, 0xf0, 0xf4, 0xc7, 0x46, + 0x3d, 0x01, 0x2e, 0x29, 0xce, 0x6c, 0x8a, 0x01, 0x2e, 0x19, 0xc8, 0x01, + 0x92, 0x01, 0x2e, 0x08, 0xd0, 0x5e, 0xa2, 0x01, 0x3e, 0x81, 0xc9, 0xaf, + 0xa5, 0x01, 0x36, 0x59, 0xc4, 0x22, 0xdc, 0x01, 0x33, 0x11, 0x51, 0x0a, + 0xc9, 0x42, 0xf1, 0x00, 0xc5, 0x06, 0x82, 0x01, 0x30, 0xf9, 0xcf, 0x66, + 0x84, 0x0f, 0xac, 0xb9, 0xce, 0x24, 0xd5, 0x0f, 0xa2, 0x38, 0xce, 0x6c, + 0x8a, 0x01, 0x2d, 0xf9, 0xc8, 0x01, 0x92, 0x01, 0x2d, 0xe8, 0xe0, 0x03, + 0x07, 0x01, 0x3e, 0x08, 0xc5, 0x04, 0xa2, 0x01, 0x3a, 0x01, 0xc3, 0x00, + 0x28, 0x0f, 0xa5, 0x70, 0x44, 0x00, 0x8b, 0x42, 0xf1, 0x12, 0xc5, 0x06, + 0x82, 0x01, 0x30, 0xf1, 0xce, 0x24, 0xd5, 0x0f, 0xa2, 0x48, 0x12, 0xc2, + 0xf1, 0x18, 0xce, 0x6c, 0x8a, 0x01, 0x2d, 0xc9, 0xc8, 0x01, 0x92, 0x01, + 0x2d, 0xb8, 0xc9, 0x33, 0xad, 0x01, 0x2f, 0x60, 0xcb, 0x51, 0x6d, 0x01, + 0x2f, 0xe9, 0xc5, 0x0b, 0x0a, 0x01, 0x2f, 0xd9, 0xc3, 0x0e, 0x6b, 0x01, + 0x5a, 0x80, 0x90, 0x0f, 0x17, 0x42, 0x02, 0xf1, 0x24, 0x89, 0x0f, 0x17, + 0x10, 0xc2, 0x01, 0xa3, 0x08, 0xc6, 0xd9, 0xc2, 0x01, 0xc8, 0x08, 0xc6, + 0xd0, 0x90, 0x08, 0xc6, 0x81, 0x9b, 0x08, 0xc6, 0x68, 0x8c, 0x08, 0xc6, + 0x70, 0xc2, 0x01, 0xa3, 0x08, 0xc5, 0xd9, 0xc2, 0x01, 0xc8, 0x08, 0xc5, + 0xd0, 0x90, 0x08, 0xc5, 0x81, 0x9b, 0x08, 0xc5, 0x68, 0x8c, 0x08, 0xc5, + 0x70, 0xe0, 0x04, 0x07, 0x01, 0x5c, 0xa0, 0xcc, 0x81, 0x2d, 0x0f, 0xcb, + 0xd1, 0xd7, 
0x2a, 0xc7, 0x0f, 0xcb, 0x99, 0xca, 0xa5, 0xbc, 0x0f, 0xd7, + 0x18, 0xcb, 0x85, 0x1e, 0x0f, 0xb0, 0x11, 0xca, 0x9b, 0x3a, 0x0f, 0xc8, + 0x90, 0xc9, 0xad, 0x65, 0x0f, 0xb2, 0x31, 0x44, 0x05, 0x76, 0xc2, 0xf1, + 0x28, 0xd1, 0x55, 0xfc, 0x0f, 0xc9, 0x40, 0x45, 0x02, 0x9a, 0x42, 0xf1, + 0x37, 0xc8, 0x6c, 0x12, 0x0f, 0xb0, 0x99, 0xc8, 0xb8, 0xb2, 0x0f, 0xc9, + 0x00, 0xcb, 0x92, 0xcd, 0x0f, 0xb1, 0xb9, 0xc6, 0xcc, 0x29, 0x0f, 0xce, + 0x80, 0xc2, 0x02, 0xa0, 0x07, 0xf8, 0x91, 0xc4, 0x02, 0xde, 0x07, 0xf8, + 0x98, 0xc3, 0x09, 0x9e, 0x07, 0xf8, 0xa1, 0xc3, 0x0d, 0x14, 0x07, 0xf8, + 0xa8, 0xc2, 0x22, 0xcc, 0x07, 0xf8, 0xb1, 0xc4, 0x18, 0x10, 0x07, 0xf8, + 0xb8, 0xc9, 0xb4, 0x64, 0x07, 0xf9, 0x01, 0x83, 0x07, 0xf8, 0x60, 0xce, + 0x25, 0xad, 0x07, 0xf9, 0xd9, 0xcd, 0x00, 0x32, 0x07, 0xfa, 0xd9, 0xd1, + 0x4f, 0x7a, 0x07, 0xfa, 0xf9, 0xcb, 0x1a, 0x50, 0x07, 0xf8, 0x40, 0x83, + 0x07, 0xf9, 0x09, 0x84, 0x07, 0xf9, 0x11, 0x85, 0x07, 0xf9, 0x19, 0x86, + 0x07, 0xf9, 0x21, 0x87, 0x07, 0xf9, 0x29, 0x88, 0x07, 0xf9, 0x31, 0x89, + 0x07, 0xf9, 0x39, 0x8a, 0x07, 0xf9, 0x41, 0x8b, 0x07, 0xf9, 0x49, 0x8c, + 0x07, 0xf9, 0x51, 0x8d, 0x07, 0xf9, 0x59, 0x8e, 0x07, 0xf9, 0x61, 0x8f, + 0x07, 0xf9, 0x69, 0x95, 0x07, 0xf9, 0x99, 0x96, 0x07, 0xf9, 0xa1, 0x97, + 0x07, 0xf9, 0xa9, 0x98, 0x07, 0xf9, 0xb1, 0x99, 0x07, 0xf9, 0xb9, 0x9a, + 0x07, 0xf9, 0xc1, 0x9b, 0x07, 0xf9, 0xc9, 0x9c, 0x07, 0xf9, 0xd1, 0x90, + 0x07, 0xf9, 0x71, 0x91, 0x07, 0xf9, 0x79, 0x92, 0x07, 0xf9, 0x81, 0x93, + 0x07, 0xf9, 0x89, 0x94, 0x07, 0xf9, 0x90, 0x83, 0x07, 0xfa, 0x09, 0x84, + 0x07, 0xfa, 0x11, 0x85, 0x07, 0xfa, 0x19, 0x87, 0x07, 0xfa, 0x29, 0x88, + 0x07, 0xfa, 0x31, 0x89, 0x07, 0xfa, 0x39, 0x8a, 0x07, 0xfa, 0x41, 0x8b, + 0x07, 0xfa, 0x49, 0x8c, 0x07, 0xfa, 0x51, 0x8d, 0x07, 0xfa, 0x59, 0x8e, + 0x07, 0xfa, 0x61, 0x8f, 0x07, 0xfa, 0x69, 0x90, 0x07, 0xfa, 0x71, 0x91, + 0x07, 0xfa, 0x79, 0x92, 0x07, 0xfa, 0x81, 0x93, 0x07, 0xfa, 0x89, 0x94, + 0x07, 0xfa, 0x91, 0x95, 0x07, 0xfa, 0x99, 0x96, 0x07, 0xfa, 0xa1, 0x97, + 0x07, 0xfa, 0xa9, 0x98, 0x07, 0xfa, 0xb1, 0x99, 0x07, 0xfa, 0xb9, 0x9a, + 0x07, 0xfa, 0xc1, 0x9b, 0x07, 0xfa, 0xc9, 0x9c, 0x07, 0xfa, 0xd1, 0x86, + 0x07, 0xfa, 0x20, 0xa5, 0x0b, 0x7c, 0xf9, 0xa3, 0x0b, 0x7c, 0xf1, 0xa2, + 0x0b, 0x7c, 0xe9, 0xa1, 0x0b, 0x7c, 0xe1, 0x9f, 0x0b, 0x7c, 0xd9, 0x9e, + 0x0b, 0x7c, 0xd0, 0xc2, 0x01, 0x30, 0x0b, 0x79, 0x29, 0x83, 0x0b, 0x78, + 0x98, 0xc2, 0x19, 0x2c, 0x0b, 0x7a, 0x09, 0x83, 0x0b, 0x79, 0xf0, 0x83, + 0x0b, 0x79, 0xc9, 0xc2, 0x00, 0xd0, 0x0b, 0x79, 0x80, 0x89, 0x0b, 0x7b, + 0x68, 0x89, 0x0b, 0x7b, 0x20, 0xcb, 0x1b, 0xd5, 0x01, 0x51, 0xd1, 0x45, + 0x00, 0x8c, 0x42, 0xf1, 0x43, 0xd6, 0x30, 0x0c, 0x01, 0x3b, 0xa9, 0xd4, + 0x1a, 0x50, 0x01, 0x3b, 0x48, 0xd6, 0x30, 0x0c, 0x01, 0x3b, 0xa1, 0xd4, + 0x1a, 0x50, 0x01, 0x3b, 0x40, 0xda, 0x1a, 0x4a, 0x01, 0x3b, 0x59, 0xd9, + 0x1d, 0xec, 0x01, 0x3b, 0x50, 0xca, 0x22, 0x51, 0x0f, 0xbe, 0x29, 0xcd, + 0x0e, 0x61, 0x0f, 0xbe, 0x38, 0xcf, 0x15, 0x36, 0x0f, 0xbd, 0xb1, 0xd2, + 0x22, 0x49, 0x0f, 0xbe, 0x58, 0x97, 0x0b, 0x73, 0x98, 0x8b, 0x0b, 0x73, + 0xf1, 0xc3, 0x7a, 0xd8, 0x0b, 0x73, 0x20, 0x87, 0x0b, 0x73, 0xd0, 0x89, + 0x0b, 0x73, 0xb9, 0x9b, 0x0b, 0x73, 0xb8, 0x92, 0x0b, 0x73, 0xb0, 0x92, + 0x0b, 0x73, 0x30, 0x97, 0x0b, 0x72, 0x98, 0x8b, 0x0b, 0x72, 0xf1, 0xc3, + 0x7a, 0xd8, 0x0b, 0x72, 0x20, 0x87, 0x0b, 0x72, 0xd0, 0x89, 0x0b, 0x72, + 0xb9, 0x9b, 0x0b, 0x72, 0xb8, 0x92, 0x0b, 0x72, 0xb0, 0x92, 0x0b, 0x72, + 0x30, 0xcf, 0x6b, 0x25, 0x0b, 0x74, 0xb0, 0xcf, 0x6b, 0x25, 0x0b, 0x74, + 0xa8, 0xc4, 0xe0, 0x37, 0x0f, 0x41, 0xd1, 0xc4, 0xe2, 0x23, 0x0f, 0x41, + 0xa1, 0xc5, 
0xd6, 0x14, 0x0f, 0x40, 0x29, 0xc4, 0xe2, 0x7b, 0x0f, 0x42, + 0xf1, 0xc5, 0xd4, 0x8e, 0x0f, 0x42, 0xe9, 0xc5, 0xd4, 0xf2, 0x0f, 0x44, + 0xc1, 0xc5, 0xd6, 0x28, 0x0f, 0x45, 0x09, 0xc6, 0xd2, 0x11, 0x0f, 0x45, + 0x59, 0xc5, 0xde, 0x43, 0x0f, 0x45, 0x61, 0xc4, 0xe2, 0x4f, 0x0f, 0x45, + 0xf8, 0xc5, 0xd6, 0x3c, 0x0f, 0x41, 0xc9, 0xc5, 0xd8, 0x03, 0x0f, 0x43, + 0x99, 0xc6, 0xd0, 0x49, 0x0f, 0x43, 0x79, 0xc4, 0xe2, 0x07, 0x0f, 0x43, + 0x01, 0xc4, 0xe4, 0x93, 0x0f, 0x42, 0xb9, 0xc5, 0xdc, 0xb3, 0x0f, 0x42, + 0x09, 0xc6, 0xce, 0x81, 0x0f, 0x43, 0xc9, 0xcb, 0x8d, 0xa5, 0x0f, 0x44, + 0x01, 0xc5, 0xd4, 0x61, 0x0f, 0x44, 0x79, 0xc4, 0xe3, 0x87, 0x0f, 0x45, + 0xe8, 0xc4, 0xe1, 0x93, 0x0f, 0x41, 0xc1, 0xc4, 0xe1, 0xbb, 0x0f, 0x41, + 0xb9, 0xc4, 0xe2, 0xa7, 0x0f, 0x41, 0xb1, 0xc4, 0xe1, 0x57, 0x0f, 0x41, + 0x81, 0xc4, 0xe4, 0x8f, 0x0f, 0x41, 0x79, 0xc4, 0xe2, 0x13, 0x0f, 0x42, + 0x61, 0xc4, 0xe1, 0xfb, 0x0f, 0x42, 0x59, 0xc4, 0xe2, 0xf7, 0x0f, 0x42, + 0x31, 0xc4, 0xe0, 0x2b, 0x0f, 0x42, 0x29, 0xc4, 0x38, 0x6b, 0x0f, 0x42, + 0x20, 0xc4, 0xe2, 0x27, 0x0f, 0x41, 0x71, 0xc3, 0xe4, 0xf7, 0x0f, 0x41, + 0x21, 0xc3, 0xd6, 0x5f, 0x0f, 0x41, 0x19, 0xc3, 0xe6, 0x26, 0x0f, 0x41, + 0x11, 0xc4, 0xe1, 0x37, 0x0f, 0x40, 0xe9, 0xc4, 0xb7, 0x12, 0x0f, 0x40, + 0xe1, 0xc4, 0xe3, 0x97, 0x0f, 0x40, 0xd9, 0xc4, 0xe2, 0x63, 0x0f, 0x42, + 0x01, 0xc4, 0xe1, 0xd7, 0x0f, 0x41, 0xf9, 0xc4, 0xe2, 0xff, 0x0f, 0x41, + 0xf0, 0xc4, 0xe1, 0xdb, 0x0f, 0x40, 0xf9, 0xc5, 0xd6, 0x2d, 0x0f, 0x40, + 0xc1, 0xc4, 0xd6, 0x96, 0x0f, 0x40, 0x21, 0xc4, 0xe3, 0x2f, 0x0f, 0x43, + 0x61, 0xc5, 0xd5, 0x0b, 0x0f, 0x42, 0x39, 0xc6, 0xd1, 0x9f, 0x0f, 0x43, + 0xb9, 0xc4, 0xe2, 0x33, 0x0f, 0x44, 0x69, 0xc5, 0xd5, 0x42, 0x0f, 0x45, + 0x01, 0xc6, 0xd0, 0x43, 0x0f, 0x45, 0x49, 0xc6, 0xd1, 0xb7, 0x0f, 0x46, + 0x18, 0xc5, 0xdd, 0x26, 0x0f, 0x40, 0xb9, 0xc5, 0xd4, 0x5c, 0x0f, 0x43, + 0xa1, 0xc5, 0xd7, 0x31, 0x0f, 0x43, 0x89, 0xc4, 0xe3, 0x53, 0x0f, 0x42, + 0x41, 0xc5, 0xd6, 0x37, 0x0f, 0x41, 0xd9, 0xc6, 0xd0, 0x13, 0x0f, 0x44, + 0x51, 0xc4, 0xe3, 0xe3, 0x0f, 0x44, 0x71, 0xc4, 0xd4, 0x61, 0x0f, 0x44, + 0x81, 0xc5, 0xd5, 0x9c, 0x0f, 0x45, 0x39, 0xc6, 0xd1, 0xd5, 0x0f, 0x46, + 0x08, 0xc5, 0xdc, 0x59, 0x0f, 0x40, 0xb1, 0xc5, 0xdd, 0x3a, 0x0f, 0x40, + 0xa9, 0xc5, 0xd4, 0x52, 0x0f, 0x40, 0xa1, 0xc4, 0xe1, 0xdf, 0x0f, 0x40, + 0x51, 0xc4, 0xe3, 0x77, 0x0f, 0x40, 0x49, 0xc4, 0xe2, 0xe7, 0x0f, 0x40, + 0x41, 0xc4, 0xe0, 0x7f, 0x0f, 0x40, 0x11, 0xc4, 0xe1, 0x2f, 0x0f, 0x40, + 0x09, 0xc4, 0xe0, 0xbf, 0x0f, 0x40, 0x00, 0xc5, 0xdd, 0xad, 0x0f, 0x40, + 0x91, 0xc4, 0xd2, 0x6b, 0x0f, 0x40, 0x71, 0xc4, 0xe1, 0xf7, 0x0f, 0x40, + 0x31, 0xc5, 0xd4, 0xa7, 0x0f, 0x43, 0x69, 0xc5, 0xdd, 0xcb, 0x0f, 0x43, + 0x59, 0xc4, 0xe0, 0xcb, 0x0f, 0x43, 0x49, 0xc6, 0xd3, 0xaf, 0x0f, 0x43, + 0xb1, 0xc6, 0xce, 0xed, 0x0f, 0x43, 0xc1, 0xc6, 0xd0, 0x61, 0x0f, 0x44, + 0xb1, 0xc6, 0xcf, 0x71, 0x0f, 0x45, 0x10, 0xc5, 0xd4, 0x34, 0x0f, 0x40, + 0x89, 0xc5, 0xd6, 0x96, 0x0f, 0x40, 0x19, 0xc4, 0xe0, 0x83, 0x0f, 0x42, + 0x89, 0xc4, 0xe2, 0x1b, 0x0f, 0x42, 0x51, 0xc4, 0xe1, 0xf3, 0x0f, 0x44, + 0x61, 0xc4, 0xe1, 0xb3, 0x0f, 0x44, 0x91, 0xc5, 0xd5, 0x10, 0x0f, 0x44, + 0xa1, 0xc6, 0xd0, 0x8b, 0x0f, 0x45, 0x99, 0xc5, 0xd5, 0x97, 0x0f, 0x45, + 0xa1, 0xc6, 0xd1, 0x33, 0x0f, 0x46, 0x20, 0xc5, 0xde, 0x52, 0x0f, 0x43, + 0x29, 0xc5, 0xdd, 0xf8, 0x0f, 0x43, 0x21, 0xc5, 0xd5, 0x33, 0x0f, 0x43, + 0x19, 0xc4, 0xe3, 0x3b, 0x0f, 0x42, 0xe1, 0xc4, 0xe2, 0x7f, 0x0f, 0x42, + 0xd9, 0xc4, 0xe2, 0xbf, 0x0f, 0x42, 0xd1, 0xc4, 0xe0, 0x33, 0x0f, 0x42, + 0xa9, 0xc4, 0xdf, 0xaf, 0x0f, 0x42, 0xa1, 0xc4, 0xe1, 0x4f, 0x0f, 0x42, + 0x99, 0xc4, 
0xe3, 0xff, 0x0f, 0x42, 0x68, 0xc5, 0xd8, 0x80, 0x0f, 0x41, + 0xa9, 0xc4, 0xe2, 0x93, 0x0f, 0x41, 0x61, 0xc5, 0xd5, 0xb0, 0x0f, 0x40, + 0x79, 0xc5, 0xd7, 0x9f, 0x0f, 0x43, 0xa9, 0xc5, 0xd7, 0xef, 0x0f, 0x43, + 0x09, 0xc5, 0xde, 0x66, 0x0f, 0x44, 0x31, 0xc6, 0xd2, 0xa1, 0x0f, 0x45, + 0x89, 0xc5, 0xd4, 0x7f, 0x0f, 0x45, 0xb0, 0xc5, 0xd8, 0x7b, 0x0f, 0x41, + 0x99, 0xc4, 0xe2, 0xb7, 0x0f, 0x41, 0x59, 0xc4, 0xe0, 0xc7, 0x0f, 0x41, + 0x51, 0xc4, 0xe1, 0xcb, 0x0f, 0x41, 0x49, 0xc4, 0xe2, 0x67, 0x0f, 0x41, + 0x09, 0xc5, 0xdd, 0xa8, 0x0f, 0x40, 0x99, 0xc5, 0xde, 0x6b, 0x0f, 0x43, + 0x91, 0xc5, 0xd7, 0x59, 0x0f, 0x42, 0xf9, 0xc5, 0xd5, 0x47, 0x0f, 0x44, + 0xf9, 0xc6, 0xd3, 0x61, 0x0f, 0x45, 0xc0, 0xc4, 0xe1, 0xe3, 0x0f, 0x41, + 0x91, 0xc5, 0xd5, 0xba, 0x0f, 0x40, 0x69, 0xc4, 0xe2, 0x3f, 0x0f, 0x40, + 0x61, 0xc5, 0xd4, 0x4d, 0x0f, 0x43, 0x31, 0xc4, 0xe0, 0x87, 0x0f, 0x42, + 0x79, 0xc9, 0xac, 0xe7, 0x0f, 0x41, 0xe9, 0xc7, 0xc3, 0xb5, 0x0f, 0x43, + 0xd1, 0xc4, 0xe0, 0xcf, 0x0f, 0x44, 0x21, 0xc6, 0xcf, 0xb9, 0x0f, 0x45, + 0x21, 0xc5, 0xde, 0x16, 0x0f, 0x45, 0x90, 0xc5, 0xd4, 0xde, 0x0f, 0x41, + 0x89, 0xc4, 0xe3, 0x5f, 0x0f, 0x41, 0x39, 0xc4, 0xe0, 0x93, 0x0f, 0x41, + 0x29, 0xc5, 0xde, 0x5c, 0x0f, 0x43, 0x39, 0xc5, 0xdd, 0xc6, 0x0f, 0x42, + 0x81, 0xc4, 0xe2, 0x03, 0x0f, 0x44, 0x29, 0xc6, 0xd3, 0xb5, 0x0f, 0x44, + 0x39, 0xc6, 0xd0, 0xc7, 0x0f, 0x44, 0x41, 0xca, 0x9a, 0xc2, 0x0f, 0x44, + 0xe1, 0xc6, 0xd3, 0xcd, 0x0f, 0x46, 0x00, 0xc4, 0xe2, 0x0f, 0x0f, 0x41, + 0x69, 0xc5, 0xdc, 0xc2, 0x0f, 0x40, 0x39, 0xc4, 0xe3, 0x1f, 0x0f, 0x43, + 0x41, 0xc9, 0xa9, 0x24, 0x0f, 0x42, 0x91, 0xc7, 0xc5, 0xc2, 0x0f, 0x44, + 0x59, 0xc6, 0xce, 0x99, 0x0f, 0x44, 0xc9, 0xc5, 0xd6, 0x32, 0x0f, 0x44, + 0xd1, 0xc4, 0xe0, 0xdf, 0x0f, 0x45, 0x69, 0xc5, 0xd8, 0x35, 0x0f, 0x45, + 0xe1, 0xc6, 0xd1, 0x3f, 0x0f, 0x46, 0x10, 0xc3, 0xe5, 0x66, 0x0f, 0x41, + 0x41, 0xc5, 0xd8, 0x6c, 0x0f, 0x40, 0x81, 0xc4, 0xe3, 0xfb, 0x0f, 0x43, + 0x71, 0xc5, 0xd4, 0xe8, 0x0f, 0x42, 0xc1, 0xc6, 0xce, 0x9f, 0x0f, 0x43, + 0xd9, 0xc5, 0xd6, 0xa0, 0x0f, 0x44, 0x99, 0xca, 0xa0, 0x12, 0x0f, 0x44, + 0xf1, 0xc5, 0xd3, 0xf8, 0x0f, 0x45, 0x41, 0xc6, 0xd0, 0xbb, 0x0f, 0x45, + 0xb9, 0xc5, 0xd5, 0xbf, 0x0f, 0x45, 0xf0, 0xc3, 0xe5, 0xd2, 0x0f, 0x41, + 0x31, 0xc5, 0xd7, 0xea, 0x0f, 0x41, 0x01, 0xc5, 0xdc, 0x18, 0x0f, 0x43, + 0x11, 0xc5, 0xdd, 0xe9, 0x0f, 0x42, 0xb1, 0xc5, 0xd5, 0xab, 0x0f, 0x42, + 0x49, 0xcc, 0x89, 0xa9, 0x0f, 0x44, 0x09, 0xc5, 0xd4, 0xb6, 0x0f, 0x44, + 0x89, 0xcb, 0x8e, 0x81, 0x0f, 0x44, 0xe9, 0xc5, 0xd3, 0xee, 0x0f, 0x45, + 0x19, 0xc5, 0xd7, 0xf4, 0x0f, 0x45, 0x50, 0xc5, 0xdd, 0x94, 0x0f, 0x40, + 0xf1, 0xc6, 0xd3, 0xa3, 0x0f, 0x40, 0xc9, 0xc5, 0xd8, 0x0d, 0x0f, 0x42, + 0x71, 0xc4, 0x92, 0x28, 0x0f, 0x41, 0xe1, 0xc7, 0xc1, 0x46, 0x0f, 0x43, + 0xe1, 0xc7, 0xc8, 0x85, 0x0f, 0x43, 0xf1, 0xc4, 0xe2, 0xbb, 0x0f, 0x44, + 0x19, 0xc5, 0xd5, 0x38, 0x0f, 0x45, 0x29, 0xc5, 0xd4, 0x57, 0x0f, 0x45, + 0xa9, 0xc4, 0xe1, 0x53, 0x0f, 0x45, 0xd8, 0xc6, 0xce, 0x51, 0x0f, 0x40, + 0xd1, 0xc4, 0xd3, 0xaf, 0x0f, 0x43, 0x51, 0xc4, 0xe0, 0xdb, 0x0f, 0x42, + 0x19, 0xc5, 0xdd, 0xc1, 0x0f, 0x42, 0x11, 0xcb, 0x92, 0x28, 0x0f, 0x44, + 0x11, 0xc6, 0xd3, 0x55, 0x0f, 0x44, 0x49, 0xc6, 0xd2, 0x89, 0x0f, 0x44, + 0xb9, 0xc6, 0xd0, 0x85, 0x0f, 0x44, 0xd9, 0xc4, 0xdf, 0xf7, 0x0f, 0x45, + 0xc9, 0xc4, 0xe3, 0x3f, 0x0f, 0x45, 0xd0, 0xc5, 0xd7, 0x68, 0x0f, 0x40, + 0x59, 0xc6, 0xd3, 0x43, 0x0f, 0x43, 0x81, 0xc4, 0xd4, 0xe8, 0x0f, 0x42, + 0xc9, 0xc6, 0xd0, 0xdf, 0x0f, 0x43, 0xe9, 0xc7, 0xc7, 0x90, 0x0f, 0x43, + 0xf9, 0xc5, 0xd4, 0xc5, 0x0f, 0x44, 0xa9, 0xc5, 0xd6, 0x4b, 0x0f, 0x45, + 0x31, 0xc5, 
0xd8, 0xd5, 0x0f, 0x45, 0x71, 0xc5, 0xde, 0x20, 0x0f, 0x45, + 0x79, 0xc5, 0xd6, 0x69, 0x0f, 0x45, 0x80, 0xc3, 0x57, 0x39, 0x0f, 0x46, + 0x81, 0x10, 0x42, 0xf1, 0x5b, 0xcb, 0x71, 0xb1, 0x08, 0x4f, 0xf9, 0xcd, + 0x7c, 0x9b, 0x08, 0x4f, 0xc1, 0xcb, 0x8d, 0xf2, 0x08, 0x4f, 0xb8, 0xcd, + 0x7d, 0x85, 0x08, 0x4f, 0xe9, 0xce, 0x71, 0xae, 0x08, 0x4d, 0xe0, 0xcd, + 0x71, 0xaf, 0x08, 0x4f, 0xe1, 0xcb, 0x91, 0x83, 0x08, 0x4f, 0xd8, 0xcc, + 0x8c, 0x79, 0x08, 0x4f, 0xd1, 0xcc, 0x86, 0xa9, 0x08, 0x4f, 0xc8, 0xc7, + 0x71, 0xb4, 0x08, 0x4f, 0xb1, 0xc4, 0x01, 0xce, 0x08, 0x4d, 0xe8, 0x00, + 0xc2, 0xf1, 0x65, 0xcb, 0x92, 0xb7, 0x08, 0x4f, 0x60, 0x00, 0xc2, 0xf1, + 0x74, 0xca, 0x92, 0xb8, 0x08, 0x4f, 0x58, 0xc4, 0x18, 0x10, 0x08, 0x4e, + 0x33, 0x02, 0xf1, 0x83, 0xc2, 0x22, 0xcc, 0x08, 0x4e, 0x2a, 0x02, 0xf1, + 0x90, 0x0b, 0xc2, 0xf1, 0x9d, 0x11, 0x42, 0xf1, 0xaf, 0x0a, 0xc2, 0xf1, + 0xc1, 0x19, 0xc2, 0xf1, 0xd3, 0xc2, 0x00, 0xc4, 0x08, 0x4e, 0x4a, 0x02, + 0xf1, 0xe3, 0x00, 0x42, 0xf1, 0xe9, 0xc3, 0xe5, 0xb1, 0x08, 0x4d, 0xf9, + 0xc3, 0x64, 0x84, 0x08, 0x4d, 0xf0, 0xc2, 0x0e, 0x9a, 0x08, 0x4d, 0xb9, + 0x16, 0xc2, 0xf1, 0xf8, 0xc2, 0x0f, 0x9a, 0x08, 0x4d, 0x99, 0x0d, 0xc2, + 0xf2, 0x04, 0x15, 0xc2, 0xf2, 0x0e, 0x83, 0x08, 0x4d, 0x03, 0x02, 0xf2, + 0x16, 0xc3, 0xe6, 0x71, 0x08, 0x4d, 0x71, 0xc2, 0x00, 0xdb, 0x08, 0x4d, + 0x61, 0xc2, 0x00, 0x39, 0x08, 0x4d, 0x59, 0x10, 0xc2, 0xf2, 0x1c, 0xc2, + 0x01, 0xc3, 0x08, 0x4d, 0x41, 0xc2, 0x00, 0xb0, 0x08, 0x4d, 0x39, 0xc2, + 0x01, 0x5d, 0x08, 0x4d, 0x31, 0xc2, 0x01, 0x4a, 0x08, 0x4d, 0x29, 0xc2, + 0x19, 0x2c, 0x08, 0x4d, 0x21, 0x91, 0x08, 0x4d, 0x19, 0x8b, 0x08, 0x4d, + 0x11, 0x87, 0x08, 0x4d, 0x08, 0x91, 0x08, 0x4c, 0xe1, 0x87, 0x08, 0x4c, + 0xd3, 0x02, 0xf2, 0x24, 0x83, 0x08, 0x4c, 0xc2, 0x02, 0xf2, 0x2a, 0x83, + 0x08, 0x4c, 0xb1, 0xc2, 0x00, 0xd0, 0x08, 0x4c, 0x88, 0x87, 0x08, 0x4c, + 0xa9, 0x83, 0x08, 0x4c, 0x9a, 0x02, 0xf2, 0x30, 0xc2, 0xe5, 0xfd, 0x08, + 0x4c, 0x38, 0x83, 0x08, 0x4c, 0x53, 0x02, 0xf2, 0x36, 0x87, 0x08, 0x4c, + 0x62, 0x02, 0xf2, 0x3c, 0xc2, 0xe5, 0xfd, 0x08, 0x4c, 0x78, 0x60, 0x03, + 0x27, 0x42, 0xf2, 0x42, 0x97, 0x05, 0x57, 0x79, 0x8b, 0x05, 0x57, 0x68, + 0xc7, 0xc9, 0xe3, 0x05, 0x5f, 0x08, 0xc7, 0xc9, 0xe3, 0x05, 0x5e, 0xf8, + 0xc7, 0xc9, 0xe3, 0x05, 0x5f, 0x00, 0xc2, 0x00, 0xd0, 0x05, 0x57, 0x29, + 0x83, 0x05, 0x57, 0x20, 0xc7, 0xc9, 0xe3, 0x05, 0x5e, 0xf0, 0xc7, 0xc9, + 0xe3, 0x05, 0x5e, 0xd8, 0xc2, 0x00, 0xd0, 0x05, 0x57, 0x39, 0x83, 0x05, + 0x57, 0x30, 0xcf, 0x01, 0x38, 0x08, 0xb3, 0x59, 0xc8, 0x00, 0xbf, 0x08, + 0xb3, 0x50, 0xc4, 0x18, 0x10, 0x00, 0xc0, 0xb9, 0xc2, 0x22, 0xcc, 0x00, + 0xc0, 0xb0, 0xc3, 0x0d, 0x14, 0x00, 0xc0, 0xa9, 0xc3, 0x09, 0x9e, 0x00, + 0xc0, 0xa0, 0xc4, 0x02, 0xde, 0x00, 0xc0, 0x99, 0xc2, 0x02, 0xa0, 0x00, + 0xc0, 0x90, 0x49, 0xb1, 0x70, 0xc2, 0xf2, 0x5a, 0xc3, 0xb4, 0xa6, 0x00, + 0xc3, 0xb9, 0xc2, 0x00, 0x87, 0x00, 0xc3, 0xb1, 0xc2, 0x00, 0x39, 0x00, + 0xc3, 0xa9, 0xc2, 0x02, 0x2b, 0x00, 0xc3, 0xa1, 0x8b, 0x00, 0xc3, 0x98, + 0x06, 0xc2, 0xf2, 0x8e, 0x45, 0x01, 0xce, 0xc2, 0xf2, 0x9b, 0x83, 0x00, + 0xc4, 0x3b, 0x02, 0xf2, 0xa5, 0x1c, 0xc2, 0xf2, 0xaf, 0xc3, 0x1d, 0x35, + 0x00, 0xc4, 0xa1, 0x12, 0xc2, 0xf2, 0xb9, 0x16, 0xc2, 0xf2, 0xc3, 0x10, + 0xc2, 0xf2, 0xd1, 0xc2, 0x00, 0x64, 0x00, 0xc4, 0x59, 0xc2, 0x02, 0x2b, + 0x00, 0xc4, 0x49, 0x8b, 0x00, 0xc4, 0x43, 0x02, 0xf2, 0xdd, 0xc6, 0x8c, + 0xa2, 0x00, 0xc4, 0x29, 0xc7, 0x62, 0x18, 0x00, 0xc4, 0x19, 0xcb, 0x96, + 0x32, 0x00, 0xc4, 0x08, 0x03, 0xc2, 0xf2, 0xe3, 0x06, 0xc2, 0xf2, 0xef, + 0xc3, 0x27, 0x57, 0x00, 0xc2, 0xd9, 0x0c, 0xc2, 0xf2, 0xf9, 0xc3, 0x39, + 0x6e, 0x00, 
+ [... remainder of this machine-generated table of hexadecimal byte constants omitted; the hunk continues with the same auto-generated data ...]
0x39, 0x8b, 0x01, 0x65, 0x68, 0xc3, 0x07, 0x4a, 0x00, 0x1f, + 0x69, 0xc2, 0x06, 0xdb, 0x00, 0x1f, 0x18, 0xc4, 0x06, 0x5a, 0x01, 0x65, + 0x99, 0xc4, 0xca, 0x0b, 0x01, 0x65, 0xc9, 0xc2, 0x00, 0xec, 0x01, 0x65, + 0xd9, 0xc4, 0x01, 0x68, 0x01, 0x66, 0x58, 0x47, 0xc1, 0x7e, 0xc3, 0x0c, + 0xc2, 0x47, 0x96, 0x0a, 0x43, 0x0c, 0xea, 0xc3, 0xd1, 0x8c, 0x01, 0x65, + 0xb9, 0xc2, 0x00, 0xec, 0x01, 0x65, 0xe9, 0xc4, 0x9b, 0xae, 0x01, 0x67, + 0x61, 0xc6, 0xd0, 0x3d, 0x01, 0x67, 0x70, 0xc3, 0x07, 0x4a, 0x00, 0x1f, + 0x61, 0xc2, 0x06, 0xdb, 0x00, 0x1f, 0x10, 0xc4, 0x06, 0x5a, 0x01, 0x65, + 0x91, 0xc4, 0xca, 0x0b, 0x01, 0x65, 0xc1, 0xc2, 0x00, 0xec, 0x01, 0x65, + 0xd1, 0xc4, 0x01, 0x68, 0x01, 0x66, 0x50, 0x8b, 0x01, 0x65, 0x61, 0xc2, + 0x06, 0xdb, 0x00, 0x1f, 0x30, 0x47, 0xc1, 0x7e, 0xc3, 0x0c, 0xfa, 0x47, + 0x96, 0x0a, 0x43, 0x0d, 0x22, 0xc3, 0xd1, 0x8c, 0x01, 0x65, 0xb1, 0xc2, + 0x00, 0xec, 0x01, 0x65, 0xe1, 0xc4, 0x9b, 0xae, 0x01, 0x67, 0x59, 0xc6, + 0xd0, 0x3d, 0x01, 0x67, 0x68, 0xc4, 0x18, 0x12, 0x08, 0x17, 0x59, 0xc9, + 0x18, 0x05, 0x08, 0x17, 0xa0, 0xc4, 0x0d, 0x0e, 0x08, 0x17, 0x61, 0xcb, + 0x13, 0xfa, 0x08, 0x17, 0xa8, 0xc3, 0x0d, 0x0f, 0x08, 0x17, 0x69, 0xca, + 0x9c, 0x5c, 0x08, 0x17, 0xb0, 0xc3, 0x45, 0x6b, 0x08, 0x17, 0x71, 0xca, + 0x37, 0x63, 0x08, 0x17, 0xb8, 0xc2, 0x0d, 0x10, 0x08, 0x17, 0x79, 0xc8, + 0x0d, 0x03, 0x08, 0x17, 0xc0, 0xc8, 0x0d, 0x03, 0x08, 0x17, 0xc9, 0xc2, + 0x0d, 0x10, 0x08, 0x17, 0x80, 0xd9, 0x20, 0x76, 0x0f, 0xa8, 0x10, 0xc7, + 0xc1, 0xd9, 0x0f, 0xab, 0x39, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0xd8, 0xc7, + 0xc1, 0xd9, 0x0f, 0xaa, 0xe9, 0xc7, 0xc7, 0x6d, 0x0f, 0xaa, 0x88, 0xc6, + 0xd0, 0xfd, 0x0f, 0xc8, 0x13, 0x03, 0x0d, 0x32, 0xc6, 0xcb, 0xf3, 0x0f, + 0xaa, 0x00, 0xc5, 0x8e, 0xdf, 0x01, 0x93, 0x03, 0x03, 0x0d, 0x38, 0xc6, + 0xbb, 0xec, 0x01, 0x93, 0x52, 0x03, 0x0d, 0x3e, 0xc2, 0x00, 0xd3, 0x01, + 0x93, 0x78, 0xc5, 0xc0, 0x7d, 0x01, 0x93, 0x13, 0x03, 0x0d, 0x44, 0xc6, + 0xc1, 0x86, 0x01, 0x93, 0x5a, 0x03, 0x0d, 0x4a, 0xc2, 0x00, 0xd3, 0x01, + 0x93, 0x88, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0x90, 0xc4, 0x79, 0xf3, 0x01, + 0x93, 0x2b, 0x03, 0x0d, 0x50, 0xc6, 0xba, 0x7c, 0x01, 0x93, 0x62, 0x03, + 0x0d, 0x56, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xa0, 0x00, 0x43, 0x0d, 0x5c, + 0xc4, 0xc6, 0x7a, 0x01, 0x93, 0x43, 0x03, 0x0d, 0x64, 0xc6, 0xc6, 0x79, + 0x01, 0x93, 0x4a, 0x03, 0x0d, 0x6a, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xd8, + 0xc4, 0x15, 0xe7, 0x01, 0x27, 0x51, 0xc4, 0x26, 0x78, 0x01, 0x23, 0x41, + 0xc5, 0x06, 0xdb, 0x01, 0x23, 0x39, 0x15, 0xc3, 0x0d, 0x70, 0x08, 0xc3, + 0x0d, 0x7c, 0x16, 0xc3, 0x0d, 0x88, 0xc3, 0x05, 0x14, 0x01, 0x23, 0x00, + 0xc4, 0x03, 0x03, 0x01, 0x14, 0xc1, 0xc3, 0x00, 0xbb, 0x01, 0x51, 0xc0, + 0xe0, 0x02, 0xe7, 0x0f, 0x88, 0x78, 0x9c, 0x01, 0x27, 0x49, 0x9b, 0x01, + 0x27, 0x41, 0x9a, 0x01, 0x27, 0x39, 0x99, 0x01, 0x27, 0x31, 0x98, 0x01, + 0x27, 0x29, 0x97, 0x01, 0x27, 0x21, 0x96, 0x01, 0x27, 0x19, 0x95, 0x01, + 0x27, 0x11, 0x94, 0x01, 0x27, 0x09, 0x93, 0x01, 0x27, 0x01, 0x92, 0x01, + 0x26, 0xf9, 0x91, 0x01, 0x26, 0xf1, 0x90, 0x01, 0x26, 0xe9, 0x8f, 0x01, + 0x26, 0xe1, 0x8e, 0x01, 0x26, 0xd9, 0x8d, 0x01, 0x26, 0xd1, 0x8c, 0x01, + 0x26, 0xc9, 0x8b, 0x01, 0x26, 0xc1, 0x8a, 0x01, 0x26, 0xb9, 0x89, 0x01, + 0x26, 0xb1, 0x88, 0x01, 0x26, 0xa9, 0x87, 0x01, 0x26, 0xa1, 0x86, 0x01, + 0x26, 0x99, 0x85, 0x01, 0x26, 0x91, 0x84, 0x01, 0x26, 0x89, 0x83, 0x01, + 0x26, 0x80, 0x9c, 0x01, 0x26, 0x79, 0x9b, 0x01, 0x26, 0x71, 0x9a, 0x01, + 0x26, 0x69, 0x99, 0x01, 0x26, 0x61, 0x98, 0x01, 0x26, 0x59, 0x97, 0x01, + 0x26, 0x51, 0x96, 0x01, 0x26, 0x49, 0x95, 0x01, 0x26, 0x41, 0x94, 0x01, + 0x26, 0x39, 
0x93, 0x01, 0x26, 0x31, 0x92, 0x01, 0x26, 0x29, 0x91, 0x01, + 0x26, 0x21, 0x90, 0x01, 0x26, 0x19, 0x8f, 0x01, 0x26, 0x11, 0x8e, 0x01, + 0x26, 0x09, 0x8d, 0x01, 0x26, 0x01, 0x8c, 0x01, 0x25, 0xf9, 0x8b, 0x01, + 0x25, 0xf1, 0x8a, 0x01, 0x25, 0xe9, 0x89, 0x01, 0x25, 0xe1, 0x88, 0x01, + 0x25, 0xd9, 0x87, 0x01, 0x25, 0xd1, 0x86, 0x01, 0x25, 0xc9, 0x85, 0x01, + 0x25, 0xc1, 0x84, 0x01, 0x25, 0xb9, 0x83, 0x01, 0x25, 0xb0, 0xc3, 0x18, + 0x13, 0x01, 0x23, 0x9b, 0x03, 0x0d, 0x94, 0xc3, 0x22, 0x45, 0x01, 0x23, + 0x58, 0xc3, 0x03, 0x26, 0x01, 0x23, 0x61, 0x9b, 0x01, 0x92, 0xd2, 0x03, + 0x0d, 0x98, 0xd0, 0x55, 0xa8, 0x01, 0x92, 0x40, 0xc3, 0x03, 0x26, 0x01, + 0x23, 0x89, 0xd1, 0x55, 0xa7, 0x01, 0x92, 0x78, 0xc3, 0x03, 0x26, 0x01, + 0x23, 0x81, 0xd1, 0x55, 0xa7, 0x01, 0x92, 0x70, 0xc3, 0x03, 0x26, 0x01, + 0x23, 0x79, 0xd1, 0x55, 0xa7, 0x01, 0x92, 0x68, 0xc3, 0x03, 0x26, 0x01, + 0x23, 0x71, 0x9b, 0x01, 0x95, 0xfa, 0x03, 0x0d, 0x9c, 0xc6, 0x34, 0x38, + 0x01, 0x23, 0x69, 0xc3, 0x0d, 0x0f, 0x01, 0x95, 0xaa, 0x03, 0x0d, 0xa2, + 0xc5, 0xdc, 0x13, 0x0f, 0x92, 0x89, 0xc8, 0xb9, 0xfa, 0x0f, 0x92, 0x81, + 0xc8, 0xb6, 0xda, 0x01, 0x94, 0xf9, 0xc7, 0xba, 0x63, 0x01, 0x95, 0x78, + 0xcb, 0x90, 0x2e, 0x01, 0x92, 0x29, 0xc3, 0x81, 0x06, 0x01, 0x92, 0x38, + 0xc5, 0xdc, 0xef, 0x01, 0x92, 0x31, 0xc2, 0x22, 0xcc, 0x01, 0x94, 0x29, + 0x07, 0xc3, 0x0d, 0xa6, 0x17, 0xc3, 0x0d, 0xb2, 0x16, 0xc3, 0x0d, 0xc2, + 0xc6, 0xcc, 0xbf, 0x01, 0x94, 0x99, 0xc6, 0xca, 0xe5, 0x01, 0x94, 0xa8, + 0xc2, 0x02, 0xa0, 0x01, 0x94, 0x09, 0xc4, 0x02, 0xde, 0x01, 0x94, 0x11, + 0xc2, 0x00, 0xc4, 0x01, 0x94, 0x48, 0xc3, 0x09, 0x9e, 0x01, 0x94, 0x19, + 0x0b, 0xc3, 0x0d, 0xce, 0xc5, 0x1b, 0xbd, 0x01, 0x94, 0xd8, 0xc4, 0x00, + 0x2d, 0x01, 0x94, 0x39, 0xc4, 0x61, 0xc1, 0x01, 0x94, 0x79, 0xc8, 0xbc, + 0xca, 0x01, 0x94, 0xe9, 0xc9, 0xaf, 0x8a, 0x01, 0x95, 0x68, 0x0b, 0xc3, + 0x0d, 0xe0, 0xc3, 0x00, 0xc2, 0x01, 0x94, 0xa0, 0xc3, 0x01, 0x54, 0x01, + 0x94, 0x51, 0x07, 0xc3, 0x0d, 0xec, 0xc3, 0x04, 0x85, 0x01, 0x94, 0xd0, + 0xc4, 0x03, 0xd7, 0x01, 0x94, 0x61, 0xc3, 0x29, 0x82, 0x01, 0x94, 0x68, + 0xc3, 0x04, 0xad, 0x01, 0x94, 0x91, 0xc3, 0x00, 0x2d, 0x01, 0x95, 0x20, + 0x11, 0xc3, 0x0d, 0xf8, 0xc5, 0x04, 0xe2, 0x01, 0x95, 0x28, 0xc4, 0xdd, + 0x72, 0x01, 0x94, 0xc1, 0xc2, 0x00, 0x27, 0x01, 0x95, 0x31, 0xc3, 0x00, + 0x4a, 0x01, 0x95, 0x38, 0x07, 0xc3, 0x0e, 0x0a, 0xc4, 0x00, 0x2d, 0x01, + 0x95, 0x40, 0x83, 0x01, 0x96, 0xa9, 0x8b, 0x01, 0x96, 0xb1, 0x97, 0x01, + 0x96, 0xb9, 0x87, 0x01, 0x96, 0xc1, 0x91, 0x01, 0x96, 0xc8, 0x83, 0x01, + 0x96, 0xd1, 0x8b, 0x01, 0x96, 0xd9, 0x97, 0x01, 0x96, 0xe1, 0x87, 0x01, + 0x96, 0xe9, 0x91, 0x01, 0x96, 0xf0, 0x83, 0x01, 0x96, 0xf9, 0x8b, 0x01, + 0x97, 0x01, 0x97, 0x01, 0x97, 0x09, 0x87, 0x01, 0x97, 0x11, 0x91, 0x01, + 0x97, 0x18, 0x83, 0x01, 0x97, 0x21, 0x8b, 0x01, 0x97, 0x29, 0x97, 0x01, + 0x97, 0x31, 0x87, 0x01, 0x97, 0x39, 0x91, 0x01, 0x97, 0x40, 0x83, 0x01, + 0x97, 0x49, 0x8b, 0x01, 0x97, 0x51, 0x97, 0x01, 0x97, 0x59, 0x87, 0x01, + 0x97, 0x61, 0x91, 0x01, 0x97, 0x68, 0x83, 0x01, 0x97, 0x71, 0x8b, 0x01, + 0x97, 0x79, 0x97, 0x01, 0x97, 0x81, 0x87, 0x01, 0x97, 0x89, 0x91, 0x01, + 0x97, 0x90, 0x83, 0x01, 0x97, 0x99, 0x97, 0x01, 0x97, 0xa1, 0x91, 0x01, + 0x97, 0xa8, 0x83, 0x01, 0x97, 0xb1, 0x8b, 0x01, 0x97, 0xb9, 0x97, 0x01, + 0x97, 0xc1, 0x87, 0x01, 0x97, 0xc9, 0x91, 0x01, 0x97, 0xd0, 0x83, 0x01, + 0x97, 0xd9, 0x8b, 0x01, 0x97, 0xe1, 0x87, 0x01, 0x97, 0xe9, 0x91, 0x01, + 0x97, 0xf0, 0xcf, 0x64, 0xc2, 0x09, 0x2a, 0x19, 0x83, 0x09, 0x1b, 0x60, + 0x0e, 0xc3, 0x0e, 0x14, 0x06, 0xc3, 0x0e, 0x1e, 0x17, 0xc3, 0x0e, 0x2a, + 0xc2, 0x00, 
0x16, 0x09, 0x1a, 0x59, 0x15, 0xc3, 0x0e, 0x3a, 0xc2, 0x00, + 0xb0, 0x09, 0x1a, 0x41, 0xc3, 0x0f, 0xd6, 0x09, 0x1a, 0x39, 0xc2, 0x06, + 0x52, 0x09, 0x1a, 0x29, 0x0b, 0xc3, 0x0e, 0x46, 0xc2, 0x00, 0xd0, 0x09, + 0x1a, 0x09, 0x09, 0xc3, 0x0e, 0x56, 0xc3, 0x01, 0x5d, 0x09, 0x19, 0xd1, + 0x83, 0x09, 0x19, 0xc2, 0x03, 0x0e, 0x61, 0xc8, 0x03, 0x4c, 0x09, 0x1a, + 0x80, 0x46, 0x03, 0x4d, 0xc3, 0x0e, 0x67, 0xc8, 0x1d, 0x6f, 0x09, 0x29, + 0xe0, 0xc8, 0x4e, 0xea, 0x09, 0x18, 0xf8, 0xc2, 0x00, 0xb0, 0x09, 0x19, + 0x29, 0xc6, 0x45, 0xad, 0x09, 0x19, 0x20, 0x94, 0x09, 0x1a, 0xa0, 0xca, + 0x8d, 0x2d, 0x09, 0x18, 0xd8, 0xcf, 0x65, 0xd0, 0x09, 0x18, 0xbb, 0x03, + 0x0e, 0x7b, 0xc2, 0x02, 0x2f, 0x09, 0x18, 0xb1, 0xc3, 0x62, 0x19, 0x09, + 0x18, 0xa8, 0xca, 0x64, 0xc2, 0x09, 0x29, 0xd9, 0xc9, 0x5d, 0x99, 0x09, + 0x29, 0xd0, 0xc2, 0x04, 0x3d, 0x09, 0x17, 0xc9, 0xc4, 0x0b, 0x46, 0x09, + 0x17, 0xc1, 0x42, 0x01, 0xe2, 0xc3, 0x0e, 0x81, 0xc3, 0x6c, 0x49, 0x09, + 0x17, 0xa9, 0xc2, 0x01, 0x2d, 0x09, 0x17, 0xa0, 0xc7, 0x0b, 0x09, 0x09, + 0x17, 0x91, 0x42, 0x00, 0x9a, 0x43, 0x0e, 0x89, 0xc2, 0x02, 0x2f, 0x09, + 0x17, 0x71, 0xc2, 0x00, 0x0a, 0x09, 0x17, 0x68, 0xc8, 0xb6, 0xe2, 0x09, + 0x18, 0x1b, 0x03, 0x0e, 0x8f, 0xca, 0x38, 0xae, 0x09, 0x18, 0x10, 0xcf, + 0x69, 0x90, 0x09, 0x16, 0xf8, 0x46, 0x25, 0xd4, 0x43, 0x0e, 0x95, 0x45, + 0x25, 0xd5, 0xc3, 0x0e, 0xa1, 0xc8, 0xb6, 0xea, 0x09, 0x29, 0x93, 0x03, + 0x0e, 0xb3, 0xc2, 0x06, 0x47, 0x09, 0x15, 0xd8, 0xc3, 0x0d, 0xff, 0x09, + 0x16, 0x11, 0x9f, 0x09, 0x16, 0x08, 0xc5, 0x58, 0xf4, 0x09, 0x29, 0x88, + 0x47, 0x03, 0x4c, 0x43, 0x0e, 0xb7, 0x00, 0x43, 0x0e, 0xe0, 0x47, 0x03, + 0x4c, 0x43, 0x0e, 0xec, 0x47, 0x03, 0x4c, 0x43, 0x0f, 0x21, 0x46, 0x03, + 0x4d, 0xc3, 0x0f, 0x2b, 0xc4, 0x39, 0xc8, 0x09, 0x15, 0x43, 0x03, 0x0f, + 0x6e, 0xc8, 0xb6, 0xf2, 0x09, 0x15, 0x39, 0xc7, 0xb7, 0xa3, 0x09, 0x14, + 0xa0, 0x47, 0x03, 0x4c, 0x43, 0x0f, 0x74, 0xd0, 0x5f, 0xa2, 0x09, 0x12, + 0x89, 0xc7, 0x5d, 0x9b, 0x09, 0x12, 0x80, 0xd6, 0x2a, 0xf6, 0x09, 0x1c, + 0x99, 0xd6, 0x2b, 0x7e, 0x09, 0x16, 0xa9, 0xc4, 0x58, 0xf5, 0x09, 0x16, + 0xa0, 0x00, 0x43, 0x0f, 0xb8, 0xcc, 0x81, 0xf9, 0x09, 0x13, 0x5b, 0x03, + 0x0f, 0xc7, 0xc8, 0x20, 0x13, 0x09, 0x13, 0x51, 0xc4, 0x58, 0xf5, 0x09, + 0x13, 0x49, 0x4c, 0x20, 0x1c, 0x43, 0x0f, 0xcd, 0xcd, 0x76, 0x0e, 0x09, + 0x12, 0x19, 0xce, 0x75, 0x2e, 0x09, 0x12, 0x11, 0xc8, 0x1d, 0x6f, 0x09, + 0x12, 0x08, 0xc2, 0x04, 0x3d, 0x09, 0x12, 0x51, 0x83, 0x09, 0x12, 0x48, + 0xc9, 0xaf, 0x66, 0x09, 0x11, 0xb3, 0x03, 0x0f, 0xe8, 0xcd, 0x7a, 0x2b, + 0x09, 0x11, 0xc1, 0x46, 0x03, 0x4d, 0x43, 0x0f, 0xee, 0x00, 0x43, 0x0f, + 0xfe, 0x16, 0xc3, 0x10, 0x0a, 0xce, 0x73, 0x98, 0x09, 0x28, 0xc9, 0x15, + 0xc3, 0x10, 0x16, 0xcc, 0x8a, 0x15, 0x09, 0x10, 0x99, 0xcc, 0x83, 0xc1, + 0x09, 0x10, 0x90, 0xcd, 0x1a, 0xf3, 0x09, 0x10, 0xf8, 0xc7, 0x6c, 0xd0, + 0x09, 0x10, 0xd1, 0x11, 0x43, 0x10, 0x25, 0xc2, 0xe6, 0x8b, 0x09, 0x28, + 0xc1, 0xc2, 0xae, 0x2b, 0x09, 0x28, 0xb8, 0xc2, 0xe6, 0x79, 0x09, 0x28, + 0x6b, 0x03, 0x10, 0x31, 0xc2, 0xe1, 0xa2, 0x09, 0x28, 0x61, 0xc2, 0xe6, + 0x87, 0x09, 0x28, 0x0b, 0x03, 0x10, 0x37, 0xc2, 0x71, 0x49, 0x09, 0x28, + 0x00, 0x26, 0xc3, 0x10, 0x3d, 0xc2, 0xe6, 0x7b, 0x09, 0x27, 0xd1, 0xc2, + 0xe4, 0xef, 0x09, 0x27, 0xc9, 0x22, 0xc3, 0x10, 0x4d, 0x21, 0x43, 0x10, + 0x55, 0xc2, 0xe6, 0xa7, 0x09, 0x27, 0x79, 0x25, 0xc3, 0x10, 0x60, 0x21, + 0x43, 0x10, 0x68, 0x23, 0xc3, 0x10, 0x74, 0xc2, 0xe6, 0x83, 0x09, 0x27, + 0x39, 0x1f, 0xc3, 0x10, 0x7c, 0x1e, 0x43, 0x10, 0x88, 0xc2, 0xe4, 0xf2, + 0x09, 0x27, 0x09, 0xc2, 0xe6, 0x4a, 0x09, 0x27, 0x00, 0xc2, 0xe6, 0xa9, + 0x09, 0x26, 
0xf9, 0x25, 0xc3, 0x10, 0x90, 0xd4, 0x3c, 0xc8, 0x09, 0x26, + 0xe1, 0xc2, 0xe5, 0x48, 0x09, 0x26, 0xd9, 0x22, 0xc3, 0x10, 0x9a, 0xc2, + 0xe6, 0x4a, 0x09, 0x26, 0xc1, 0x1f, 0xc3, 0x10, 0xa2, 0xc2, 0xe6, 0x4f, + 0x09, 0x26, 0xa8, 0x00, 0x43, 0x10, 0xaa, 0x00, 0x43, 0x10, 0xb6, 0xc8, + 0x38, 0x76, 0x09, 0x0f, 0xb0, 0x94, 0x09, 0x26, 0x9b, 0x03, 0x10, 0xc8, + 0xc4, 0xdd, 0x2c, 0x09, 0x26, 0x91, 0xc2, 0x01, 0xe2, 0x09, 0x0c, 0x59, + 0xcc, 0x82, 0x11, 0x09, 0x0c, 0x51, 0x86, 0x09, 0x0c, 0x49, 0x9f, 0x09, + 0x0c, 0x40, 0x83, 0x09, 0x26, 0x8b, 0x03, 0x10, 0xcc, 0x8b, 0x09, 0x0b, + 0x82, 0x03, 0x10, 0xd0, 0x97, 0x09, 0x26, 0x81, 0x8b, 0x09, 0x0a, 0xf9, + 0x03, 0x43, 0x10, 0xd4, 0x97, 0x09, 0x1c, 0x31, 0xc2, 0x00, 0xb1, 0x09, + 0x0c, 0x30, 0x0a, 0xc3, 0x10, 0xe2, 0xc4, 0xdf, 0x77, 0x09, 0x0c, 0x29, + 0xc2, 0x00, 0x2d, 0x09, 0x0c, 0x21, 0x83, 0x09, 0x0b, 0xf2, 0x03, 0x10, + 0xf7, 0x83, 0x09, 0x1c, 0x21, 0x8b, 0x09, 0x0b, 0xe0, 0x97, 0x09, 0x0b, + 0x9b, 0x03, 0x10, 0xfb, 0x8b, 0x09, 0x0b, 0x90, 0x97, 0x09, 0x0b, 0x5b, + 0x03, 0x10, 0xff, 0x8b, 0x09, 0x0b, 0x3b, 0x03, 0x11, 0x09, 0x83, 0x09, + 0x0b, 0x12, 0x03, 0x11, 0x18, 0x42, 0x01, 0xe2, 0xc3, 0x11, 0x29, 0xc4, + 0x99, 0xe3, 0x09, 0x1b, 0xf1, 0x86, 0x09, 0x0a, 0xca, 0x03, 0x11, 0x31, + 0xc2, 0x05, 0x1d, 0x09, 0x0b, 0xd9, 0x87, 0x09, 0x0b, 0xd0, 0x8b, 0x09, + 0x0b, 0xc3, 0x03, 0x11, 0x37, 0x87, 0x09, 0x0b, 0xa2, 0x03, 0x11, 0x3d, + 0x8f, 0x09, 0x0b, 0x71, 0xc2, 0x04, 0x2b, 0x09, 0x0b, 0x68, 0xc3, 0x05, + 0x4e, 0x09, 0x0b, 0x09, 0xc4, 0x9e, 0x4c, 0x09, 0x0b, 0x00, 0x4c, 0x87, + 0x99, 0xc3, 0x11, 0x43, 0xe0, 0x03, 0x47, 0x09, 0x0c, 0xe8, 0xcc, 0x83, + 0xcd, 0x09, 0x0c, 0xc9, 0xc9, 0x8d, 0x2e, 0x09, 0x0c, 0xc0, 0xca, 0xa7, + 0x4c, 0x09, 0x0c, 0xa0, 0xcc, 0x8a, 0x21, 0x09, 0x0d, 0x48, 0x86, 0x09, + 0x0d, 0x18, 0xd2, 0x05, 0x54, 0x09, 0x26, 0x79, 0x9f, 0x09, 0x09, 0x78, + 0xc5, 0x39, 0xc7, 0x09, 0x26, 0x70, 0xc2, 0x04, 0x3d, 0x09, 0x09, 0xe9, + 0xc4, 0x81, 0x55, 0x09, 0x09, 0xe1, 0xc6, 0x45, 0xad, 0x09, 0x09, 0xd9, + 0xc3, 0x01, 0xce, 0x09, 0x09, 0xd1, 0xc2, 0x00, 0xd1, 0x09, 0x09, 0xc8, + 0xd4, 0x38, 0xa4, 0x09, 0x26, 0x69, 0xce, 0x6c, 0x44, 0x09, 0x09, 0x09, + 0x46, 0x03, 0x4d, 0x43, 0x11, 0x49, 0x46, 0x03, 0x4d, 0xc3, 0x11, 0x55, + 0xc4, 0x39, 0xc8, 0x09, 0x08, 0xe8, 0xc2, 0x01, 0xe2, 0x09, 0x09, 0x41, + 0x90, 0x09, 0x09, 0x38, 0x00, 0x43, 0x11, 0x70, 0x47, 0x03, 0x4c, 0x43, + 0x11, 0x7a, 0xc5, 0x39, 0xc7, 0x09, 0x08, 0x48, 0xcc, 0x83, 0xd9, 0x09, + 0x08, 0x31, 0xc8, 0xb6, 0xfa, 0x09, 0x08, 0x28, 0x97, 0x09, 0x08, 0x11, + 0x87, 0x09, 0x08, 0x08, 0x97, 0x09, 0x26, 0x51, 0xc3, 0x51, 0xdb, 0x09, + 0x07, 0xf8, 0xd6, 0x2a, 0xf6, 0x09, 0x26, 0x49, 0xcd, 0x7a, 0x11, 0x09, + 0x07, 0x78, 0x46, 0x03, 0x4d, 0xc3, 0x11, 0x98, 0xc8, 0xb6, 0x22, 0x09, + 0x07, 0x68, 0x00, 0x43, 0x11, 0xe1, 0x15, 0xc3, 0x11, 0xf3, 0xc3, 0x6c, + 0x49, 0x09, 0x1b, 0xb9, 0x17, 0xc3, 0x11, 0xfd, 0x0e, 0xc3, 0x12, 0x05, + 0x0d, 0xc3, 0x12, 0x14, 0xc8, 0x6a, 0x1e, 0x09, 0x05, 0x59, 0xc2, 0x00, + 0xd0, 0x09, 0x05, 0x4b, 0x03, 0x12, 0x23, 0xc9, 0x75, 0x04, 0x09, 0x05, + 0x3b, 0x03, 0x12, 0x29, 0xc3, 0x62, 0x19, 0x09, 0x05, 0x31, 0x83, 0x09, + 0x05, 0x12, 0x03, 0x12, 0x2f, 0xc2, 0x06, 0x62, 0x09, 0x25, 0xa1, 0xc2, + 0x00, 0x4e, 0x09, 0x25, 0x93, 0x03, 0x12, 0x3c, 0xc2, 0x00, 0xdb, 0x09, + 0x25, 0x83, 0x03, 0x12, 0x40, 0xc8, 0x6a, 0x1e, 0x09, 0x25, 0x79, 0xc2, + 0x00, 0x0a, 0x09, 0x25, 0x71, 0xc3, 0x02, 0x2c, 0x09, 0x25, 0x68, 0xc2, + 0x01, 0x7f, 0x09, 0x04, 0x91, 0xc2, 0x00, 0x65, 0x09, 0x04, 0x88, 0xc2, + 0x00, 0x4e, 0x09, 0x04, 0xd1, 0xc4, 0x5d, 0x99, 0x09, 0x04, 0xc2, 0x03, + 0x12, 0x44, 
0x15, 0xc3, 0x12, 0x4a, 0xc2, 0x0b, 0x19, 0x09, 0x25, 0x31, + 0xc2, 0x00, 0xec, 0x09, 0x25, 0x29, 0x0f, 0xc3, 0x12, 0x56, 0x0e, 0xc3, + 0x12, 0x66, 0x0d, 0xc3, 0x12, 0x70, 0xc8, 0x6a, 0x1e, 0x09, 0x24, 0xc9, + 0x0a, 0xc3, 0x12, 0x7a, 0x09, 0xc3, 0x12, 0x82, 0xc5, 0x9e, 0x4b, 0x09, + 0x24, 0x91, 0x06, 0xc3, 0x12, 0x8d, 0x03, 0x43, 0x12, 0x99, 0xc3, 0x04, + 0x65, 0x09, 0x1b, 0xb1, 0xc4, 0x73, 0x32, 0x09, 0x03, 0xf8, 0xc5, 0x39, + 0xc7, 0x09, 0x04, 0x32, 0x03, 0x12, 0xa8, 0xc9, 0xaa, 0xdd, 0x09, 0x24, + 0x60, 0xc5, 0xdd, 0x2b, 0x09, 0x24, 0x59, 0xc3, 0x04, 0x2a, 0x09, 0x24, + 0x51, 0xc3, 0x04, 0x65, 0x09, 0x03, 0xa8, 0xc9, 0x51, 0xd5, 0x09, 0x24, + 0x49, 0x4d, 0x68, 0xcd, 0x43, 0x12, 0xae, 0xa1, 0x09, 0x03, 0x89, 0xa0, + 0x09, 0x03, 0x80, 0xc9, 0xaa, 0x20, 0x09, 0x24, 0x39, 0xc2, 0x05, 0x1d, + 0x09, 0x02, 0x79, 0xc2, 0x00, 0x03, 0x09, 0x02, 0x70, 0xc2, 0x02, 0x1c, + 0x09, 0x24, 0x31, 0xc2, 0x00, 0xec, 0x09, 0x24, 0x29, 0xc3, 0x58, 0xf1, + 0x09, 0x24, 0x20, 0x42, 0x01, 0xe2, 0xc3, 0x12, 0xef, 0xc3, 0x20, 0x18, + 0x09, 0x1b, 0x83, 0x03, 0x12, 0xfb, 0xcf, 0x65, 0xd0, 0x09, 0x00, 0xa1, + 0xc5, 0x03, 0x47, 0x09, 0x00, 0x91, 0x0b, 0xc3, 0x13, 0x01, 0xc2, 0x00, + 0xd0, 0x09, 0x00, 0x79, 0x42, 0x01, 0x30, 0xc3, 0x13, 0x0d, 0xc9, 0x75, + 0x04, 0x09, 0x00, 0x61, 0xc4, 0x05, 0x4d, 0x09, 0x00, 0x58, 0x83, 0x09, + 0x1b, 0x89, 0xc4, 0x38, 0xb4, 0x09, 0x00, 0xd9, 0xc4, 0x55, 0x25, 0x09, + 0x00, 0xd1, 0xca, 0xa7, 0xb0, 0x09, 0x00, 0xc9, 0xc9, 0x5d, 0x99, 0x09, + 0x00, 0xc1, 0xc5, 0xd8, 0xa8, 0x09, 0x00, 0xb8, 0x49, 0x0d, 0x2d, 0xc3, + 0x13, 0x17, 0xc9, 0xa1, 0x21, 0x09, 0x01, 0xd1, 0xc9, 0x83, 0xac, 0x09, + 0x01, 0xc8, 0xc7, 0x0b, 0x09, 0x09, 0x01, 0x89, 0xd5, 0x37, 0xeb, 0x09, + 0x01, 0x80, 0x8b, 0x09, 0x01, 0x31, 0xc3, 0xe1, 0x68, 0x09, 0x01, 0x28, + 0x00, 0x43, 0x13, 0x24, 0x97, 0x09, 0x14, 0x3b, 0x03, 0x13, 0x30, 0x8b, + 0x09, 0x14, 0x2b, 0x03, 0x13, 0x34, 0x87, 0x09, 0x14, 0x21, 0x04, 0xc3, + 0x13, 0x38, 0x83, 0x09, 0x14, 0x02, 0x03, 0x13, 0x40, 0xc4, 0x39, 0xc8, + 0x09, 0x0a, 0x51, 0x42, 0x00, 0x9a, 0xc3, 0x13, 0x44, 0xc2, 0x00, 0x2c, + 0x09, 0x0a, 0x41, 0xc3, 0xe3, 0x01, 0x09, 0x0a, 0x38, 0x84, 0x09, 0x22, + 0x19, 0x83, 0x09, 0x22, 0x10, 0x97, 0x09, 0x21, 0x89, 0x9f, 0x09, 0x21, + 0x38, 0xcd, 0x77, 0xe2, 0x09, 0x22, 0xa8, 0xcd, 0x77, 0xe2, 0x09, 0x22, + 0x98, 0x84, 0x09, 0x21, 0xf9, 0x83, 0x09, 0x21, 0xf0, 0xcd, 0x77, 0xe2, + 0x09, 0x21, 0xb8, 0xcd, 0x77, 0xe2, 0x09, 0x21, 0x78, 0xcd, 0x77, 0xe2, + 0x09, 0x21, 0x28, 0xcb, 0x97, 0xc9, 0x00, 0x27, 0x99, 0xc8, 0x20, 0xa9, + 0x00, 0x27, 0x88, 0xc9, 0x25, 0xfa, 0x00, 0x25, 0x69, 0xcb, 0x99, 0xc3, + 0x05, 0x34, 0x58, 0xc9, 0x25, 0xfa, 0x00, 0x29, 0x79, 0xcb, 0x99, 0xc3, + 0x00, 0x29, 0x09, 0xc4, 0x01, 0x23, 0x00, 0x28, 0x99, 0xc4, 0x14, 0xa6, + 0x00, 0x26, 0x30, 0xc9, 0x6d, 0x45, 0x00, 0x29, 0x49, 0xcb, 0x99, 0xc3, + 0x00, 0x29, 0x19, 0xc4, 0x14, 0xa6, 0x00, 0x26, 0x51, 0xc4, 0x01, 0x23, + 0x00, 0x26, 0x41, 0xc9, 0x25, 0xfa, 0x00, 0x25, 0x18, 0xc2, 0x01, 0x7f, + 0x00, 0x29, 0x59, 0x87, 0x05, 0x34, 0x48, 0xc2, 0x01, 0xc8, 0x05, 0x32, + 0x18, 0xcf, 0x69, 0x54, 0x00, 0x29, 0x38, 0x8b, 0x00, 0x21, 0xcb, 0x03, + 0x13, 0x4a, 0x97, 0x00, 0x22, 0xf0, 0x8e, 0x05, 0x33, 0x29, 0x8f, 0x05, + 0x33, 0x38, 0xc9, 0x25, 0xfa, 0x00, 0x29, 0x29, 0xcb, 0x99, 0xc3, 0x00, + 0x25, 0x38, 0xcf, 0x69, 0x54, 0x00, 0x25, 0xf8, 0xc9, 0x20, 0xa8, 0x00, + 0x27, 0xc9, 0xc8, 0xbd, 0x7a, 0x05, 0x32, 0x88, 0xc3, 0xe6, 0x68, 0x00, + 0x28, 0x79, 0xc3, 0xc7, 0xce, 0x00, 0x28, 0x69, 0xc3, 0xd0, 0xbd, 0x00, + 0x28, 0x59, 0xc3, 0xe5, 0xde, 0x00, 0x28, 0x49, 0x06, 0xc3, 0x13, 0x50, + 0xc3, 0xe5, 
0x3c, 0x00, 0x28, 0x28, 0xc4, 0x01, 0x23, 0x00, 0x26, 0x21, + 0xc6, 0x01, 0x73, 0x00, 0x24, 0xf9, 0xc9, 0x25, 0xfa, 0x00, 0x24, 0xd9, + 0xcf, 0x2c, 0x35, 0x00, 0x24, 0xe8, 0xc6, 0x01, 0x73, 0x00, 0x27, 0xf9, + 0xc4, 0x01, 0x23, 0x00, 0x27, 0xe9, 0xc9, 0x25, 0xfa, 0x00, 0x25, 0x98, + 0xc6, 0x01, 0x73, 0x00, 0x24, 0x9b, 0x03, 0x13, 0x60, 0xc9, 0x25, 0xfa, + 0x00, 0x27, 0xb9, 0xc6, 0x5e, 0xdc, 0x00, 0x24, 0x89, 0xcb, 0x99, 0xc3, + 0x00, 0x24, 0xa8, 0xcf, 0x6b, 0x16, 0x00, 0x27, 0x58, 0xc5, 0x1d, 0x88, + 0x00, 0x26, 0xb9, 0xc5, 0x1f, 0x0c, 0x00, 0x22, 0x80, 0x83, 0x05, 0x32, + 0x39, 0x46, 0x30, 0x28, 0x43, 0x13, 0x66, 0xc8, 0x20, 0xa9, 0x00, 0x26, + 0xf9, 0xc8, 0x25, 0xfb, 0x00, 0x24, 0xc8, 0x46, 0x00, 0x8b, 0x43, 0x13, + 0x86, 0xcf, 0x2c, 0x35, 0x00, 0x25, 0xc9, 0x06, 0x43, 0x13, 0x90, 0xc9, + 0x25, 0xfa, 0x00, 0x29, 0x71, 0xcb, 0x99, 0xc3, 0x00, 0x29, 0x01, 0xc4, + 0x01, 0x23, 0x00, 0x28, 0x91, 0xc4, 0x14, 0xa6, 0x00, 0x26, 0x28, 0xc9, + 0x6d, 0x45, 0x00, 0x29, 0x41, 0xcb, 0x99, 0xc3, 0x00, 0x29, 0x11, 0xc4, + 0x14, 0xa6, 0x00, 0x26, 0x49, 0xc4, 0x01, 0x23, 0x00, 0x26, 0x39, 0xc9, + 0x25, 0xfa, 0x00, 0x25, 0x10, 0xc2, 0x01, 0x7f, 0x00, 0x29, 0x51, 0x87, + 0x05, 0x34, 0x40, 0xc2, 0x01, 0xc8, 0x05, 0x32, 0x10, 0xcf, 0x69, 0x54, + 0x00, 0x29, 0x30, 0x8b, 0x00, 0x20, 0xcb, 0x03, 0x13, 0x9c, 0x97, 0x00, + 0x20, 0x70, 0x8e, 0x05, 0x33, 0x21, 0x8f, 0x05, 0x33, 0x30, 0xc9, 0x25, + 0xfa, 0x00, 0x29, 0x21, 0xcb, 0x99, 0xc3, 0x00, 0x25, 0x30, 0xcf, 0x69, + 0x54, 0x00, 0x25, 0xf0, 0xc9, 0x20, 0xa8, 0x00, 0x27, 0xc1, 0xc8, 0xbd, + 0x7a, 0x05, 0x32, 0x80, 0xc3, 0xe6, 0x68, 0x00, 0x28, 0x71, 0xc3, 0xc7, + 0xce, 0x00, 0x28, 0x61, 0xc3, 0xd0, 0xbd, 0x00, 0x28, 0x51, 0xc3, 0xe5, + 0xde, 0x00, 0x28, 0x41, 0x06, 0xc3, 0x13, 0xa2, 0xc3, 0xe5, 0x3c, 0x00, + 0x28, 0x20, 0xc4, 0x01, 0x23, 0x00, 0x26, 0x19, 0xc9, 0x25, 0xfa, 0x00, + 0x24, 0xd1, 0xcf, 0x2c, 0x35, 0x00, 0x24, 0xe1, 0xc6, 0x01, 0x73, 0x00, + 0x24, 0xf0, 0xc6, 0x01, 0x73, 0x00, 0x27, 0xf1, 0xc4, 0x01, 0x23, 0x00, + 0x27, 0xe1, 0xc9, 0x25, 0xfa, 0x00, 0x25, 0x90, 0xc6, 0x01, 0x73, 0x00, + 0x24, 0x93, 0x03, 0x13, 0xb2, 0xc9, 0x25, 0xfa, 0x00, 0x27, 0xb1, 0xc6, + 0x5e, 0xdc, 0x00, 0x24, 0x81, 0xcb, 0x99, 0xc3, 0x00, 0x24, 0xa0, 0x06, + 0xc3, 0x13, 0xb8, 0xcf, 0x2c, 0x35, 0x00, 0x25, 0xc0, 0xcb, 0x97, 0xc9, + 0x00, 0x27, 0x91, 0xc8, 0x20, 0xa9, 0x00, 0x27, 0x80, 0xcf, 0x6b, 0x16, + 0x00, 0x27, 0x50, 0xc5, 0x1d, 0x88, 0x00, 0x26, 0xb1, 0xc5, 0x1f, 0x0c, + 0x00, 0x20, 0x00, 0x83, 0x05, 0x32, 0x31, 0x46, 0x30, 0x28, 0x43, 0x13, + 0xc4, 0xc8, 0x20, 0xa9, 0x00, 0x26, 0xf1, 0xc8, 0x25, 0xfb, 0x00, 0x24, + 0xc0, 0x46, 0x00, 0x8b, 0x43, 0x13, 0xe4, 0xc9, 0x25, 0xfa, 0x00, 0x25, + 0x61, 0xcb, 0x99, 0xc3, 0x05, 0x34, 0x50, 0xc5, 0x69, 0xa7, 0x00, 0x6c, + 0x39, 0xc6, 0xd2, 0x3b, 0x00, 0x6c, 0x40, 0xc7, 0xc6, 0x32, 0x00, 0x6c, + 0xd1, 0xc7, 0xca, 0x29, 0x00, 0x6c, 0xe1, 0xc7, 0xc7, 0xdd, 0x00, 0x6d, + 0x01, 0xc7, 0xc7, 0xc1, 0x00, 0x6d, 0x11, 0x16, 0xc3, 0x13, 0xee, 0x06, + 0xc3, 0x13, 0xfa, 0xc7, 0xc8, 0x1c, 0x00, 0x6d, 0xa1, 0xc7, 0x8e, 0x9b, + 0x00, 0x6d, 0xb0, 0xc5, 0x69, 0xa7, 0x00, 0x6c, 0x69, 0xc6, 0xcc, 0xd1, + 0x00, 0x6c, 0x70, 0xc5, 0x69, 0xa7, 0x00, 0x6c, 0x79, 0xc6, 0xcc, 0xd1, + 0x00, 0x6c, 0x80, 0x4a, 0x9b, 0x62, 0xc3, 0x14, 0x06, 0xc5, 0x69, 0xa7, + 0x00, 0x6d, 0xc0, 0xc7, 0xc4, 0xdb, 0x00, 0x6d, 0x59, 0xc7, 0xc1, 0xa8, + 0x00, 0x6e, 0x11, 0xc7, 0xc2, 0x18, 0x00, 0x6e, 0x28, 0xc7, 0xc4, 0x25, + 0x00, 0x6d, 0x61, 0xc6, 0x8e, 0x9c, 0x00, 0x6d, 0x98, 0xd2, 0x4d, 0xc3, + 0x00, 0x6d, 0x29, 0xc5, 0x69, 0xa7, 0x00, 0x6e, 0x08, 0x45, 0xd7, 0x40, + 0x43, 0x14, 
0x32, 0xa3, 0x0e, 0xd5, 0x79, 0xa2, 0x0e, 0xd5, 0x71, 0xa1, + 0x0e, 0xd5, 0x69, 0xa0, 0x0e, 0xd5, 0x61, 0x9f, 0x0e, 0xd5, 0x59, 0x9e, + 0x0e, 0xd5, 0x51, 0x9d, 0x0e, 0xd5, 0x48, 0xcb, 0x57, 0x45, 0x0e, 0xcf, + 0x0b, 0x03, 0x14, 0x44, 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x03, 0x03, 0x14, + 0x4a, 0xc6, 0x24, 0x3b, 0x0e, 0xce, 0xfa, 0x03, 0x14, 0x50, 0x48, 0x0c, + 0x8c, 0xc3, 0x14, 0x56, 0xc6, 0x00, 0x58, 0x0e, 0xcd, 0x1b, 0x03, 0x14, + 0x60, 0xc6, 0x24, 0x3b, 0x0e, 0xcd, 0x12, 0x03, 0x14, 0x66, 0xc9, 0x65, + 0x4f, 0x0e, 0xc8, 0xf9, 0x45, 0x03, 0x14, 0x43, 0x14, 0x6c, 0xc8, 0x3b, + 0xec, 0x0e, 0xc8, 0xe9, 0xc6, 0x24, 0x3b, 0x0e, 0xc8, 0xd8, 0xc8, 0x3b, + 0xec, 0x0e, 0xc8, 0xc9, 0xc6, 0x24, 0x3b, 0x0e, 0xc8, 0xb8, 0xc7, 0xc3, + 0x0d, 0x0e, 0xd4, 0x21, 0xc4, 0x00, 0x2d, 0x0e, 0xd4, 0x08, 0xa4, 0x0e, + 0xd3, 0xe9, 0xa3, 0x0e, 0xd3, 0xe1, 0xa2, 0x0e, 0xd3, 0xd9, 0xa1, 0x0e, + 0xd3, 0xd1, 0xa0, 0x0e, 0xd3, 0xc9, 0x9f, 0x0e, 0xd3, 0xc1, 0x9e, 0x0e, + 0xd3, 0xb8, 0xd0, 0x58, 0x82, 0x0e, 0xd2, 0xa9, 0xd0, 0x5a, 0xe2, 0x0e, + 0xd2, 0xa0, 0xcb, 0x93, 0xca, 0x0e, 0xd3, 0x99, 0xd0, 0x5b, 0x12, 0x0e, + 0xd3, 0x90, 0xcc, 0x35, 0xa8, 0x0e, 0xd3, 0x01, 0xcc, 0x5b, 0x22, 0x0e, + 0xd2, 0xf8, 0xd5, 0x35, 0x9f, 0x0e, 0xd2, 0xe1, 0xcc, 0x86, 0x31, 0x0e, + 0xd2, 0xd8, 0xc9, 0xb0, 0x50, 0x0e, 0xd3, 0x39, 0x43, 0x01, 0x55, 0xc3, + 0x14, 0x78, 0xc8, 0x51, 0x1b, 0x0e, 0xd3, 0x10, 0x4a, 0x18, 0xa5, 0xc3, + 0x14, 0x8a, 0x4b, 0x40, 0xb3, 0x43, 0x14, 0x9c, 0xc6, 0x2c, 0x2e, 0x0e, + 0xca, 0xa1, 0xc6, 0x00, 0x58, 0x0e, 0xca, 0x99, 0xc6, 0x24, 0x3b, 0x0e, + 0xca, 0x90, 0x4b, 0x40, 0xb3, 0xc3, 0x14, 0xae, 0x4a, 0x18, 0xa5, 0x43, + 0x14, 0xc0, 0x05, 0xc3, 0x14, 0xd2, 0xc8, 0x45, 0x27, 0x0e, 0xd1, 0x0a, + 0x03, 0x14, 0xde, 0xc6, 0x3b, 0x9c, 0x0e, 0xd1, 0x41, 0xc8, 0x45, 0x27, + 0x0e, 0xd1, 0x22, 0x03, 0x14, 0xe2, 0xc8, 0x3b, 0xec, 0x0e, 0xd0, 0xc1, + 0xc6, 0x24, 0x3b, 0x0e, 0xd0, 0xb8, 0xcd, 0x76, 0xd1, 0x0e, 0xd0, 0xe1, + 0xc5, 0x05, 0x74, 0x0e, 0xd0, 0xd0, 0xc6, 0x07, 0xa1, 0x0e, 0xd0, 0xd9, + 0xc4, 0x05, 0x75, 0x0e, 0xd0, 0xc8, 0xc3, 0x1d, 0xb1, 0x0e, 0xc8, 0x1b, + 0x03, 0x14, 0xe6, 0xc3, 0x00, 0xfd, 0x0e, 0xc2, 0xd2, 0x03, 0x14, 0xea, + 0x00, 0x43, 0x14, 0xee, 0xc4, 0x09, 0x39, 0x0e, 0xc3, 0xeb, 0x03, 0x15, + 0x0c, 0xc3, 0x01, 0x24, 0x0e, 0xc3, 0x5a, 0x03, 0x15, 0x10, 0x17, 0xc3, + 0x15, 0x14, 0xc3, 0xc9, 0xd8, 0x0e, 0xc3, 0x33, 0x03, 0x15, 0x24, 0xc5, + 0x02, 0xd2, 0x0e, 0xc3, 0xb2, 0x03, 0x15, 0x28, 0x00, 0x43, 0x15, 0x2c, + 0xc7, 0x05, 0x79, 0x0e, 0xd0, 0x31, 0x02, 0x43, 0x15, 0x50, 0x54, 0x3a, + 0x70, 0xc3, 0x15, 0x5c, 0xc6, 0xc1, 0xb7, 0x0e, 0xc9, 0x48, 0x59, 0x20, + 0x2b, 0xc3, 0x15, 0x68, 0x44, 0x1f, 0x0e, 0x43, 0x15, 0x74, 0x46, 0x17, + 0x14, 0xc3, 0x15, 0x84, 0x47, 0x01, 0xdb, 0xc3, 0x15, 0x90, 0x46, 0x03, + 0x13, 0x43, 0x15, 0x9c, 0xcf, 0x64, 0x4a, 0x0e, 0xcf, 0x11, 0x46, 0x2d, + 0x11, 0x43, 0x15, 0xa8, 0xc5, 0x03, 0x13, 0x0e, 0xce, 0xd9, 0x48, 0x20, + 0x37, 0x43, 0x15, 0xb4, 0xc5, 0x03, 0x13, 0x0e, 0xce, 0xd1, 0x48, 0x20, + 0x37, 0x43, 0x15, 0xc0, 0x45, 0x0e, 0xd5, 0xc3, 0x15, 0xcc, 0xc4, 0x6b, + 0x03, 0x0e, 0xcb, 0xb9, 0x46, 0x35, 0x01, 0xc3, 0x15, 0xed, 0xc4, 0x0d, + 0x21, 0x0e, 0xcb, 0x70, 0xc5, 0x17, 0x14, 0x0e, 0xcc, 0x01, 0xc6, 0x01, + 0xdb, 0x0e, 0xcb, 0xf9, 0xc5, 0x03, 0x13, 0x0e, 0xcb, 0xf0, 0xc5, 0x17, + 0x14, 0x0e, 0xcb, 0xe9, 0xc6, 0x01, 0xdb, 0x0e, 0xcb, 0xe1, 0xc5, 0x03, + 0x13, 0x0e, 0xcb, 0xd8, 0x43, 0x32, 0x37, 0xc3, 0x15, 0xff, 0xc3, 0x02, + 0x39, 0x0e, 0xcb, 0x98, 0x4c, 0x8b, 0xc5, 0xc3, 0x16, 0x11, 0xca, 0x91, + 0x42, 0x0e, 0xcb, 0x81, 0xd1, 0x51, 0x12, 0x0e, 0xcb, 0x78, 0xcb, 0x57, + 0x45, 0x0e, 
0xcb, 0x63, 0x03, 0x16, 0x1d, 0xca, 0x91, 0x42, 0x0e, 0xcb, + 0x59, 0xc8, 0x45, 0x27, 0x0e, 0xcb, 0x50, 0x47, 0x3a, 0x70, 0xc3, 0x16, + 0x23, 0xc6, 0xc1, 0xb7, 0x0e, 0xc9, 0x40, 0x52, 0x47, 0xed, 0xc3, 0x16, + 0x2f, 0x44, 0x1f, 0x0e, 0x43, 0x16, 0x3b, 0x47, 0x01, 0xdb, 0xc3, 0x16, + 0x4d, 0x46, 0x03, 0x13, 0x43, 0x16, 0x59, 0x48, 0x20, 0x37, 0xc3, 0x16, + 0x65, 0xc5, 0x03, 0x13, 0x0e, 0xcc, 0xab, 0x03, 0x16, 0x71, 0xc5, 0x17, + 0x14, 0x0e, 0xcc, 0xb9, 0xc6, 0x01, 0xdb, 0x0e, 0xcc, 0xb0, 0x48, 0x20, + 0x37, 0xc3, 0x16, 0x77, 0xc5, 0x17, 0x14, 0x0e, 0xcc, 0xa1, 0xc6, 0x01, + 0xdb, 0x0e, 0xcc, 0x99, 0xc5, 0x03, 0x13, 0x0e, 0xcc, 0x90, 0x44, 0x0e, + 0xd5, 0xc3, 0x16, 0x83, 0x45, 0x6b, 0x03, 0xc3, 0x16, 0x8d, 0x46, 0x35, + 0x01, 0xc3, 0x16, 0x9f, 0xc4, 0x0d, 0x21, 0x0e, 0xc9, 0x98, 0xc6, 0x64, + 0x4a, 0x0e, 0xcd, 0x29, 0x46, 0x2d, 0x11, 0x43, 0x16, 0xb7, 0xc5, 0x17, + 0x14, 0x0e, 0xca, 0x51, 0xc6, 0x01, 0xdb, 0x0e, 0xca, 0x49, 0xc5, 0x03, + 0x13, 0x0e, 0xca, 0x40, 0xc5, 0x17, 0x14, 0x0e, 0xca, 0x39, 0xc6, 0x01, + 0xdb, 0x0e, 0xca, 0x31, 0xc5, 0x03, 0x13, 0x0e, 0xca, 0x28, 0x43, 0x32, + 0x37, 0xc3, 0x16, 0xc3, 0x44, 0x0a, 0x0f, 0x43, 0x16, 0xd5, 0xcb, 0x57, + 0x45, 0x0e, 0xc9, 0xb3, 0x03, 0x16, 0xe7, 0xca, 0x91, 0x42, 0x0e, 0xc9, + 0xa9, 0xd1, 0x51, 0x12, 0x0e, 0xc9, 0xa0, 0xcb, 0x57, 0x45, 0x0e, 0xc9, + 0x8b, 0x03, 0x16, 0xed, 0xca, 0x91, 0x42, 0x0e, 0xc9, 0x81, 0xc8, 0x45, + 0x27, 0x0e, 0xc9, 0x78, 0x48, 0xbf, 0xc2, 0xc3, 0x16, 0xf3, 0x45, 0xd5, + 0xf1, 0x43, 0x17, 0x08, 0xc5, 0x17, 0x14, 0x0e, 0xca, 0xdb, 0x03, 0x17, + 0x1d, 0xc6, 0x01, 0xdb, 0x0e, 0xca, 0xd1, 0xc5, 0x03, 0x13, 0x0e, 0xca, + 0xc8, 0xc5, 0x17, 0x14, 0x0e, 0xca, 0xbb, 0x03, 0x17, 0x23, 0xc6, 0x01, + 0xdb, 0x0e, 0xca, 0xb1, 0xc5, 0x03, 0x13, 0x0e, 0xca, 0xa8, 0x45, 0x11, + 0x17, 0xc3, 0x17, 0x29, 0xca, 0x65, 0x4e, 0x0e, 0xc9, 0x18, 0xc7, 0xc1, + 0xb6, 0x0e, 0xd1, 0xe9, 0xc7, 0x27, 0xb2, 0x0e, 0xd1, 0xe1, 0xc7, 0x81, + 0x92, 0x0e, 0xd1, 0xd8, 0xc6, 0xcf, 0x23, 0x0e, 0xd2, 0x91, 0xc7, 0x27, + 0xb2, 0x0e, 0xd2, 0x88, 0xc8, 0xbe, 0x0a, 0x0e, 0xd2, 0x79, 0xc7, 0x27, + 0xb2, 0x0e, 0xd2, 0x70, 0x00, 0x43, 0x17, 0x3b, 0x00, 0x43, 0x17, 0x47, + 0xc4, 0x05, 0x75, 0x0e, 0xd2, 0x19, 0xc8, 0xbe, 0x0a, 0x0e, 0xd2, 0x10, + 0xc4, 0x05, 0x75, 0x0e, 0xd2, 0x01, 0xc8, 0xbe, 0x0a, 0x0e, 0xd1, 0xf8, + 0xcc, 0x57, 0x44, 0x0e, 0xcf, 0xe0, 0x8e, 0x08, 0xac, 0x48, 0x94, 0x08, + 0xac, 0x38, 0x4c, 0x8b, 0x71, 0xc3, 0x17, 0x53, 0xd2, 0x4b, 0x3b, 0x08, + 0xae, 0xa1, 0xd3, 0x44, 0xb5, 0x08, 0xae, 0x99, 0x43, 0x01, 0x92, 0xc3, + 0x17, 0x65, 0xd0, 0x58, 0x22, 0x08, 0xae, 0x89, 0x50, 0x5d, 0x32, 0x43, + 0x17, 0x71, 0xca, 0x83, 0x03, 0x08, 0xae, 0x80, 0x94, 0x05, 0x44, 0x48, + 0x8e, 0x05, 0x44, 0x58, 0x9f, 0x08, 0x8e, 0xf9, 0x9e, 0x08, 0x8e, 0xf0, + 0xc7, 0x7a, 0x7f, 0x08, 0x8e, 0x09, 0xc7, 0x14, 0x39, 0x08, 0x8c, 0x08, + 0xc4, 0x1e, 0x97, 0x08, 0x8e, 0x01, 0xc5, 0x40, 0xe7, 0x08, 0x8c, 0x10, + 0xc4, 0x18, 0x10, 0x08, 0x8e, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0x8e, 0xb0, + 0xc3, 0x0d, 0x14, 0x08, 0x8e, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0x8e, 0xa0, + 0xc4, 0x02, 0xde, 0x08, 0x8e, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0x8e, 0x90, + 0x97, 0x08, 0x8d, 0xf9, 0x8b, 0x08, 0x8d, 0xe9, 0x83, 0x08, 0x8d, 0x98, + 0x8e, 0x08, 0x8d, 0xd3, 0x03, 0x17, 0x83, 0x94, 0x08, 0x8d, 0xc2, 0x03, + 0x17, 0x87, 0x97, 0x08, 0x8d, 0xb8, 0x8b, 0x08, 0x8d, 0xa8, 0x8e, 0x08, + 0x8c, 0x5b, 0x03, 0x17, 0x8b, 0x94, 0x08, 0x8c, 0x4a, 0x03, 0x17, 0x8f, + 0xc2, 0x00, 0xd0, 0x08, 0x8c, 0xf1, 0x83, 0x08, 0x8c, 0xe8, 0xc2, 0x00, + 0xd0, 0x08, 0x8c, 0xe1, 0x83, 0x08, 0x8c, 0xd8, 0x45, 0x00, 0x27, 0xc3, + 0x17, 0x93, 
0xce, 0x66, 0x67, 0x01, 0x2f, 0x38, 0x45, 0x00, 0x49, 0xc3, + 0x17, 0x9f, 0x46, 0x00, 0x2c, 0x43, 0x17, 0xab, 0xcc, 0x24, 0x47, 0x01, + 0x17, 0x29, 0xc8, 0x07, 0x5f, 0x01, 0x14, 0x90, 0xcc, 0x24, 0x47, 0x01, + 0x17, 0x21, 0xc8, 0x07, 0x5f, 0x01, 0x14, 0x88, 0xc7, 0x0b, 0x09, 0x01, + 0x9d, 0x01, 0xc5, 0xd9, 0x11, 0x01, 0x9d, 0x20, 0xc8, 0x0b, 0x08, 0x01, + 0x9d, 0x78, 0xc2, 0x17, 0x99, 0x01, 0x9a, 0x09, 0x90, 0x01, 0x9a, 0x10, + 0xc7, 0x0b, 0x09, 0x01, 0x9b, 0xc1, 0xc5, 0xd9, 0x11, 0x01, 0x9b, 0xc8, + 0xc5, 0xd9, 0x34, 0x01, 0x99, 0x59, 0xc2, 0x00, 0x16, 0x01, 0x99, 0x60, + 0xc3, 0x9f, 0x30, 0x01, 0x99, 0x79, 0x91, 0x01, 0x99, 0x80, 0xc3, 0xa9, + 0x98, 0x01, 0x99, 0xc1, 0xc2, 0x06, 0x62, 0x01, 0x99, 0xd0, 0xc5, 0xd6, + 0xff, 0x01, 0x97, 0xf9, 0xc6, 0xd1, 0xff, 0x01, 0x9b, 0xd9, 0xc6, 0xd1, + 0xcf, 0x01, 0x9b, 0xe1, 0xc7, 0xc5, 0x52, 0x01, 0x9b, 0xe9, 0xc5, 0xdb, + 0x05, 0x01, 0x9b, 0xf0, 0xc4, 0x89, 0x91, 0x01, 0x98, 0x61, 0xc4, 0xe4, + 0x6f, 0x01, 0x98, 0x68, 0x05, 0xc3, 0x17, 0xb7, 0xc7, 0x0b, 0x09, 0x01, + 0x9d, 0x10, 0xc4, 0xdd, 0xdf, 0x01, 0x9a, 0x19, 0xc2, 0x17, 0x99, 0x01, + 0x9a, 0x20, 0xc5, 0xd8, 0x26, 0x01, 0x9a, 0x50, 0xc3, 0x0f, 0xd9, 0x01, + 0x9a, 0x60, 0xc2, 0x02, 0x2e, 0x01, 0x9e, 0x09, 0xc5, 0x04, 0x34, 0x01, + 0x9d, 0x3a, 0x03, 0x17, 0xc3, 0xc7, 0x0b, 0x09, 0x01, 0x9c, 0xf9, 0xc5, + 0xd9, 0x11, 0x01, 0x9d, 0x18, 0xc2, 0x00, 0xbf, 0x01, 0x3e, 0x79, 0xc3, + 0x02, 0x9b, 0x01, 0x3e, 0x70, 0x95, 0x0f, 0x8a, 0x11, 0x94, 0x0f, 0x8a, + 0x09, 0x93, 0x0f, 0x8a, 0x01, 0x92, 0x0f, 0x89, 0xf9, 0x91, 0x0f, 0x89, + 0xf1, 0x90, 0x0f, 0x89, 0xe9, 0x8f, 0x0f, 0x89, 0xe1, 0x8e, 0x0f, 0x89, + 0xd9, 0x8d, 0x0f, 0x89, 0xd1, 0x8c, 0x0f, 0x89, 0xc9, 0x8b, 0x0f, 0x89, + 0xc1, 0x8a, 0x0f, 0x89, 0xb9, 0x89, 0x0f, 0x89, 0xb1, 0x88, 0x0f, 0x89, + 0xa9, 0x87, 0x0f, 0x89, 0xa1, 0x86, 0x0f, 0x89, 0x99, 0x83, 0x0f, 0x89, + 0x81, 0x84, 0x0f, 0x89, 0x89, 0x85, 0x0f, 0x89, 0x91, 0x96, 0x0f, 0x8a, + 0x19, 0x97, 0x0f, 0x8a, 0x21, 0x98, 0x0f, 0x8a, 0x29, 0x99, 0x0f, 0x8a, + 0x31, 0x9a, 0x0f, 0x8a, 0x39, 0x9b, 0x0f, 0x8a, 0x41, 0x9c, 0x0f, 0x8a, + 0x48, 0xc3, 0xe5, 0xbd, 0x0f, 0x91, 0xd9, 0xc3, 0xe5, 0xe1, 0x0f, 0x91, + 0x58, 0xc3, 0xe5, 0x8d, 0x0f, 0x91, 0xd1, 0x1f, 0xc3, 0x17, 0xc9, 0x21, + 0xc3, 0x17, 0xdb, 0x20, 0xc3, 0x17, 0xe7, 0xc3, 0xe4, 0xdf, 0x0f, 0x91, + 0x61, 0xc3, 0xe5, 0x09, 0x0f, 0x91, 0x21, 0xc3, 0xe5, 0xb7, 0x0f, 0x90, + 0xf1, 0xc3, 0xe6, 0x35, 0x0f, 0x90, 0xe9, 0x26, 0xc3, 0x17, 0xf3, 0xc3, + 0xe5, 0x42, 0x0f, 0x90, 0x88, 0x22, 0xc3, 0x17, 0xff, 0xc3, 0xe5, 0x12, + 0x0f, 0x91, 0x99, 0xc3, 0xe5, 0x1b, 0x0f, 0x91, 0x91, 0xc3, 0xe4, 0xf1, + 0x0f, 0x91, 0x09, 0xc3, 0xe5, 0xf6, 0x0f, 0x90, 0xd0, 0x42, 0xe4, 0xef, + 0xc3, 0x18, 0x0b, 0xc3, 0xe5, 0x27, 0x0f, 0x91, 0xa9, 0x1f, 0xc3, 0x18, + 0x13, 0x20, 0xc3, 0x18, 0x25, 0xc3, 0xe6, 0x38, 0x0f, 0x91, 0x31, 0x22, + 0xc3, 0x18, 0x31, 0xc3, 0xe5, 0x48, 0x0f, 0x90, 0xc8, 0xc3, 0xe4, 0xee, + 0x0f, 0x91, 0x81, 0xc3, 0xe5, 0x7b, 0x0f, 0x91, 0x19, 0xc3, 0xe5, 0x1e, + 0x0f, 0x90, 0xb0, 0xc2, 0x81, 0x20, 0x0f, 0x91, 0x69, 0x1d, 0xc3, 0x18, + 0x3d, 0xc2, 0xd5, 0x96, 0x0f, 0x90, 0xc1, 0xc2, 0x8c, 0x54, 0x0f, 0x90, + 0xa0, 0xc4, 0x02, 0xde, 0x01, 0x20, 0x99, 0xc2, 0x02, 0xa0, 0x01, 0x20, + 0x90, 0xcb, 0x98, 0xd1, 0x01, 0x20, 0x23, 0x03, 0x18, 0x45, 0xc3, 0x09, + 0x3f, 0x01, 0x20, 0x18, 0xc2, 0x00, 0xdb, 0x00, 0x43, 0x49, 0x83, 0x00, + 0x43, 0x40, 0x10, 0xc3, 0x18, 0x4b, 0xc2, 0x19, 0x2c, 0x00, 0x43, 0x11, + 0xc2, 0x01, 0x30, 0x00, 0x43, 0x00, 0xc4, 0x00, 0x49, 0x00, 0x38, 0x49, + 0xc5, 0x00, 0x2c, 0x00, 0x38, 0x48, 0xcf, 0x33, 0xad, 0x01, 0x56, 0x20, + 0xcb, 0x0e, 
0xbd, 0x01, 0x56, 0x31, 0xce, 0x33, 0x92, 0x01, 0x56, 0x41, + 0xcf, 0x6a, 0x8f, 0x01, 0x56, 0x51, 0xcc, 0x24, 0x47, 0x01, 0x56, 0x60, + 0xc5, 0xd4, 0xcf, 0x00, 0xdc, 0x11, 0xc5, 0xd9, 0xcf, 0x00, 0xdc, 0x08, + 0xca, 0x6c, 0x10, 0x0f, 0xb0, 0x29, 0xcc, 0x1d, 0x4a, 0x0f, 0xb0, 0x21, + 0xd3, 0x41, 0x38, 0x0f, 0xb0, 0x30, 0x45, 0x02, 0x9a, 0x43, 0x18, 0x55, + 0xc7, 0x80, 0x70, 0x01, 0x17, 0xf1, 0x48, 0x00, 0x5f, 0x43, 0x18, 0x61, + 0xc7, 0x80, 0x70, 0x01, 0x17, 0xb9, 0x48, 0x00, 0x5f, 0x43, 0x18, 0x67, + 0x00, 0x43, 0x18, 0x6d, 0x0b, 0xc3, 0x18, 0x79, 0xc3, 0x09, 0x9e, 0x01, + 0x0b, 0x98, 0x19, 0xc3, 0x18, 0x88, 0xc2, 0x00, 0xc4, 0x01, 0x0b, 0xc9, + 0xc4, 0x02, 0xde, 0x01, 0x0b, 0x90, 0xc5, 0x66, 0xb1, 0x01, 0x0b, 0xd1, + 0xc4, 0x00, 0x2d, 0x01, 0x0b, 0xb8, 0xc4, 0x18, 0x10, 0x01, 0x0b, 0xb1, + 0xc2, 0x22, 0xcc, 0x01, 0x0b, 0xa8, 0xce, 0x69, 0x64, 0x07, 0xf2, 0x19, + 0xd2, 0x21, 0x89, 0x07, 0xf0, 0xb8, 0xcc, 0x00, 0x33, 0x07, 0xf1, 0xb9, + 0xcd, 0x69, 0x65, 0x07, 0xf2, 0x08, 0xc4, 0x00, 0x3b, 0x07, 0xf0, 0xc1, + 0xc4, 0xe0, 0xeb, 0x07, 0xf0, 0xc0, 0x9d, 0x0f, 0x87, 0x51, 0x9e, 0x0f, + 0x87, 0x59, 0x9f, 0x0f, 0x87, 0x61, 0xa0, 0x0f, 0x87, 0x69, 0xa1, 0x0f, + 0x87, 0x71, 0xa2, 0x0f, 0x87, 0x79, 0xa3, 0x0f, 0x87, 0x81, 0xa4, 0x0f, + 0x87, 0x89, 0xa5, 0x0f, 0x87, 0x91, 0xa6, 0x0f, 0x87, 0x98, 0x9d, 0x0f, + 0x87, 0xa1, 0x9e, 0x0f, 0x87, 0xa8, 0xc6, 0xcf, 0x05, 0x0f, 0x85, 0x21, + 0xc6, 0x78, 0x78, 0x0f, 0x85, 0xa1, 0xc8, 0xba, 0x2a, 0x0f, 0x86, 0x21, + 0xc5, 0xdd, 0x49, 0x0f, 0x86, 0xa0, 0xcc, 0x82, 0x4d, 0x01, 0x51, 0x39, + 0xd1, 0x4b, 0xde, 0x01, 0x51, 0x10, 0xc5, 0x05, 0x02, 0x01, 0x51, 0x31, + 0xc5, 0x00, 0xd4, 0x01, 0x51, 0x20, 0x83, 0x01, 0x90, 0xb1, 0x97, 0x01, + 0x90, 0xe0, 0x89, 0x08, 0xd7, 0x18, 0xc4, 0x18, 0x12, 0x08, 0x43, 0xf9, + 0x91, 0x08, 0x43, 0xd0, 0xc2, 0x39, 0x8b, 0x08, 0x43, 0xb1, 0xc3, 0x1e, + 0x1b, 0x08, 0x43, 0x40, 0xc3, 0x11, 0xef, 0x08, 0x43, 0xa9, 0x03, 0x43, + 0x18, 0x92, 0xc3, 0x16, 0x5a, 0x08, 0x43, 0x81, 0xc4, 0x36, 0xb5, 0x08, + 0x43, 0x00, 0xc2, 0x00, 0x8e, 0x08, 0x43, 0x38, 0xc3, 0x03, 0x15, 0x01, + 0x37, 0xc9, 0xc9, 0xa8, 0x8b, 0x0f, 0xa3, 0x88, 0xc8, 0x7a, 0x7e, 0x05, + 0x47, 0xb9, 0x16, 0xc3, 0x18, 0x9e, 0xc6, 0x1e, 0x95, 0x05, 0x47, 0x98, + 0x91, 0x00, 0x48, 0x91, 0x87, 0x00, 0x48, 0x71, 0x83, 0x00, 0x48, 0x20, + 0x8e, 0x00, 0x4b, 0x08, 0x94, 0x00, 0x4b, 0x00, 0xc2, 0x00, 0xd0, 0x00, + 0x4a, 0xe1, 0x83, 0x00, 0x4b, 0xf0, 0x91, 0x00, 0x48, 0x89, 0x87, 0x00, + 0x48, 0x69, 0x83, 0x00, 0x4b, 0x90, 0x8a, 0x08, 0x20, 0x18, 0x91, 0x08, + 0x20, 0x28, 0x8a, 0x08, 0x20, 0x48, 0x91, 0x08, 0x20, 0x58, 0x8a, 0x08, + 0x20, 0xf8, 0x89, 0x08, 0x21, 0x28, 0x8a, 0x08, 0x21, 0x58, 0x91, 0x08, + 0x21, 0x68, 0x8a, 0x08, 0x21, 0x88, 0x91, 0x08, 0x21, 0x98, 0x8a, 0x08, + 0x22, 0x38, 0x89, 0x08, 0x22, 0x68, 0xca, 0x03, 0xdd, 0x0f, 0xc4, 0x99, + 0x48, 0x01, 0x9a, 0x43, 0x18, 0xaa, 0xe0, 0x05, 0xc7, 0x01, 0x5f, 0x78, + 0xc5, 0x01, 0x4a, 0x01, 0x0e, 0x19, 0x00, 0x43, 0x18, 0xc5, 0xc5, 0x01, + 0x4a, 0x01, 0x0e, 0x11, 0x00, 0x43, 0x18, 0xd7, 0x45, 0x00, 0x8c, 0xc3, + 0x18, 0xe3, 0xda, 0x1b, 0x34, 0x01, 0x0f, 0xa9, 0xc8, 0xae, 0xbc, 0x01, + 0x0d, 0x39, 0xc6, 0x10, 0x9d, 0x01, 0x48, 0x99, 0xda, 0x1c, 0x1e, 0x0f, + 0xdd, 0xb8, 0xc4, 0x26, 0x78, 0x01, 0x27, 0xe9, 0xc5, 0x06, 0xdb, 0x01, + 0x27, 0xe1, 0x15, 0xc3, 0x19, 0x19, 0x08, 0xc3, 0x19, 0x25, 0x16, 0xc3, + 0x19, 0x31, 0xc3, 0x05, 0x14, 0x01, 0x27, 0xa8, 0x47, 0x00, 0x58, 0xc3, + 0x19, 0x3d, 0xce, 0x34, 0xd4, 0x01, 0x57, 0x18, 0xcf, 0x01, 0xb8, 0x01, + 0x80, 0xf0, 0x02, 0xc3, 0x19, 0x49, 0xc5, 0x27, 0xf9, 0x01, 0x00, 0xb8, + 0xc2, 0x00, 
0xbf, 0x01, 0x52, 0xa1, 0xc3, 0x02, 0x9b, 0x01, 0x52, 0x98, + 0x8c, 0x01, 0x0a, 0x49, 0x8b, 0x01, 0x0a, 0x41, 0x87, 0x01, 0x0a, 0x39, + 0x86, 0x01, 0x0a, 0x30, 0x8b, 0x01, 0x09, 0xf8, 0xc9, 0x00, 0xca, 0x01, + 0x54, 0xd9, 0xcc, 0x07, 0xc7, 0x01, 0x54, 0xe0, 0xc5, 0x78, 0x04, 0x01, + 0x02, 0x31, 0x48, 0xbc, 0xfa, 0xc3, 0x19, 0x55, 0xc8, 0x52, 0x09, 0x01, + 0x4c, 0x61, 0xc6, 0x01, 0x73, 0x01, 0x72, 0xb1, 0xcd, 0x75, 0xa6, 0x01, + 0x72, 0xc0, 0xd1, 0x52, 0xff, 0x0f, 0xab, 0x51, 0xce, 0x6f, 0x1c, 0x0f, + 0xab, 0x48, 0x00, 0x43, 0x19, 0x61, 0xc6, 0x02, 0xd1, 0x01, 0x2e, 0xb9, + 0xc4, 0x0e, 0x6a, 0x01, 0x5f, 0x48, 0xd4, 0x3f, 0x5c, 0x01, 0x4e, 0x70, + 0xc5, 0x01, 0xa2, 0x01, 0x5b, 0x13, 0x03, 0x19, 0x82, 0xcc, 0x82, 0xb9, + 0x01, 0x5b, 0x61, 0xcd, 0x7c, 0xa8, 0x01, 0x5c, 0x30, 0x45, 0x00, 0x8c, + 0xc3, 0x19, 0x86, 0xc8, 0xae, 0xbc, 0x01, 0x48, 0x28, 0x44, 0x03, 0xc8, + 0xc3, 0x19, 0x96, 0x42, 0x02, 0xae, 0x43, 0x19, 0xa0, 0xd7, 0x22, 0x5c, + 0x0f, 0xc0, 0x51, 0xc3, 0x7e, 0x79, 0x01, 0x0d, 0x60, 0x45, 0x03, 0x14, + 0xc3, 0x19, 0xaa, 0xc5, 0x01, 0x74, 0x01, 0x0c, 0xd8, 0xd4, 0x2d, 0x64, + 0x01, 0x0f, 0xd9, 0xc9, 0xb3, 0xf8, 0x01, 0x48, 0x88, 0xc3, 0x14, 0xa7, + 0x01, 0x0d, 0x1b, 0x03, 0x19, 0xb6, 0x43, 0x00, 0x7e, 0x43, 0x19, 0xbc, + 0xc2, 0x00, 0xb1, 0x01, 0x0f, 0x29, 0xcc, 0x56, 0x78, 0x01, 0x48, 0xf0, + 0x9a, 0x01, 0x4a, 0x39, 0xcc, 0x07, 0xc7, 0x01, 0x5a, 0x19, 0xc8, 0xb7, + 0x52, 0x01, 0x5a, 0x20, 0xcf, 0x6a, 0x8f, 0x01, 0x4b, 0xa9, 0xce, 0x33, + 0x92, 0x01, 0x4b, 0xa1, 0xd5, 0x36, 0xef, 0x01, 0x4a, 0x11, 0x48, 0x61, + 0xd4, 0x43, 0x19, 0xc8, 0xe0, 0x06, 0xc7, 0x0f, 0xdd, 0xb0, 0x45, 0x00, + 0x8c, 0xc3, 0x19, 0xd4, 0xc8, 0xae, 0xbc, 0x01, 0x48, 0x38, 0xc8, 0x4b, + 0x94, 0x01, 0x0c, 0x39, 0xca, 0xa7, 0xce, 0x01, 0x0c, 0x30, 0xc8, 0x4b, + 0x94, 0x01, 0x0c, 0x09, 0xc7, 0x0d, 0x04, 0x01, 0x0b, 0x70, 0xc3, 0x23, + 0x1c, 0x00, 0xb7, 0xc1, 0x85, 0x00, 0xb7, 0xb8, 0xc2, 0x1d, 0xc1, 0x00, + 0xb7, 0x39, 0xc6, 0xd2, 0x35, 0x00, 0xb6, 0xc9, 0xc9, 0x25, 0x3a, 0x00, + 0xb6, 0x99, 0xc5, 0x72, 0x5f, 0x00, 0xb6, 0x81, 0xc5, 0x2e, 0x39, 0x00, + 0xb6, 0x61, 0xc4, 0x05, 0xf1, 0x00, 0xb6, 0x31, 0xc6, 0x57, 0x17, 0x00, + 0xb5, 0xf9, 0xc8, 0xbf, 0x3a, 0x00, 0xb5, 0xe9, 0xc5, 0x71, 0x4d, 0x00, + 0xb5, 0x68, 0x90, 0x05, 0x28, 0x20, 0x90, 0x05, 0x2b, 0xa8, 0x87, 0x05, + 0x28, 0x30, 0x91, 0x05, 0x2b, 0xb8, 0x87, 0x05, 0x28, 0x40, 0x91, 0x05, + 0x2b, 0xc8, 0x87, 0x05, 0x28, 0x50, 0x91, 0x05, 0x2b, 0xd8, 0x87, 0x05, + 0x28, 0x49, 0x90, 0x05, 0x2f, 0x68, 0x90, 0x05, 0x2a, 0xa8, 0x91, 0x05, + 0x2b, 0xd0, 0x87, 0x05, 0x28, 0x59, 0x90, 0x05, 0x2f, 0x80, 0x91, 0x05, + 0x2b, 0xe1, 0x90, 0x05, 0x2e, 0x40, 0x87, 0x05, 0x28, 0x78, 0x91, 0x05, + 0x2c, 0x00, 0x87, 0x05, 0x28, 0x80, 0x87, 0x05, 0x2f, 0xb3, 0x03, 0x19, + 0xe0, 0x8b, 0x05, 0x29, 0xb1, 0x83, 0x05, 0x2a, 0xe9, 0x91, 0x05, 0x2e, + 0x73, 0x03, 0x19, 0xe4, 0x97, 0x05, 0x2d, 0x38, 0x91, 0x05, 0x2c, 0x08, + 0x87, 0x05, 0x28, 0xa8, 0x91, 0x05, 0x2c, 0x31, 0x43, 0x00, 0x5c, 0x43, + 0x19, 0xe8, 0x87, 0x05, 0x28, 0xe0, 0x91, 0x05, 0x2c, 0x68, 0x87, 0x05, + 0x30, 0x23, 0x03, 0x1a, 0x06, 0x8b, 0x05, 0x2a, 0x21, 0x83, 0x05, 0x2b, + 0x61, 0x91, 0x05, 0x2e, 0xe3, 0x03, 0x1a, 0x0e, 0x97, 0x05, 0x2d, 0xa8, + 0x87, 0x05, 0x29, 0x18, 0x91, 0x05, 0x2c, 0xa0, 0x87, 0x05, 0x28, 0xb8, + 0x91, 0x05, 0x2c, 0x40, 0x87, 0x05, 0x28, 0xc8, 0x91, 0x05, 0x2c, 0x50, + 0xc5, 0x00, 0xd4, 0x01, 0x57, 0x79, 0xc5, 0x05, 0x02, 0x01, 0x57, 0x80, + 0xa5, 0x0c, 0x57, 0xf9, 0xa4, 0x0c, 0x57, 0xf1, 0xa3, 0x0c, 0x57, 0xe9, + 0xa2, 0x0c, 0x57, 0xe1, 0xa1, 0x0c, 0x57, 0xd9, 0xa0, 0x0c, 0x57, 0xd1, + 0x9f, 0x0c, 
0x57, 0xc9, 0x9e, 0x0c, 0x57, 0xc1, 0x9d, 0x0c, 0x57, 0xb8, + 0xa6, 0x0c, 0x57, 0xb1, 0xa5, 0x0c, 0x57, 0xa9, 0xa4, 0x0c, 0x57, 0xa1, + 0xa3, 0x0c, 0x57, 0x99, 0xa2, 0x0c, 0x57, 0x91, 0xa1, 0x0c, 0x57, 0x89, + 0xa0, 0x0c, 0x57, 0x81, 0x9f, 0x0c, 0x57, 0x79, 0x9e, 0x0c, 0x57, 0x71, + 0x9d, 0x0c, 0x57, 0x68, 0xa6, 0x0c, 0x57, 0x61, 0xa5, 0x0c, 0x57, 0x59, + 0xa4, 0x0c, 0x57, 0x51, 0xa3, 0x0c, 0x57, 0x49, 0xa2, 0x0c, 0x57, 0x41, + 0xa1, 0x0c, 0x57, 0x39, 0xa0, 0x0c, 0x57, 0x31, 0x9f, 0x0c, 0x57, 0x29, + 0x9e, 0x0c, 0x57, 0x21, 0x9d, 0x0c, 0x57, 0x18, 0xa6, 0x0c, 0x57, 0x11, + 0xa5, 0x0c, 0x57, 0x09, 0xa4, 0x0c, 0x57, 0x01, 0xa3, 0x0c, 0x56, 0xf9, + 0xa2, 0x0c, 0x56, 0xf1, 0xa1, 0x0c, 0x56, 0xe9, 0xa0, 0x0c, 0x56, 0xe1, + 0x9f, 0x0c, 0x56, 0xd9, 0x9e, 0x0c, 0x56, 0xd1, 0x9d, 0x0c, 0x56, 0xc8, + 0xa6, 0x0c, 0x56, 0xc1, 0xa5, 0x0c, 0x56, 0xb9, 0xa4, 0x0c, 0x56, 0xb1, + 0xa3, 0x0c, 0x56, 0xa9, 0xa2, 0x0c, 0x56, 0xa1, 0xa1, 0x0c, 0x56, 0x99, + 0xa0, 0x0c, 0x56, 0x91, 0x9f, 0x0c, 0x56, 0x89, 0x9e, 0x0c, 0x56, 0x81, + 0x9d, 0x0c, 0x56, 0x78, 0xa6, 0x0c, 0x56, 0x71, 0xa5, 0x0c, 0x56, 0x69, + 0xa4, 0x0c, 0x56, 0x61, 0xa3, 0x0c, 0x56, 0x59, 0xa2, 0x0c, 0x56, 0x51, + 0xa1, 0x0c, 0x56, 0x49, 0xa0, 0x0c, 0x56, 0x41, 0x9f, 0x0c, 0x56, 0x39, + 0x9e, 0x0c, 0x56, 0x31, 0x9d, 0x0c, 0x56, 0x28, 0xa6, 0x0c, 0x56, 0x21, + 0xa5, 0x0c, 0x56, 0x19, 0xa4, 0x0c, 0x56, 0x11, 0xa3, 0x0c, 0x56, 0x09, + 0xa2, 0x0c, 0x56, 0x01, 0xa1, 0x0c, 0x55, 0xf9, 0xa0, 0x0c, 0x55, 0xf1, + 0x9f, 0x0c, 0x55, 0xe9, 0x9e, 0x0c, 0x55, 0xe1, 0x9d, 0x0c, 0x55, 0xd8, + 0xa6, 0x0c, 0x55, 0xd1, 0xa5, 0x0c, 0x55, 0xc9, 0xa4, 0x0c, 0x55, 0xc1, + 0xa3, 0x0c, 0x55, 0xb9, 0xa2, 0x0c, 0x55, 0xb1, 0xa1, 0x0c, 0x55, 0xa9, + 0xa0, 0x0c, 0x55, 0xa1, 0x9f, 0x0c, 0x55, 0x99, 0x9e, 0x0c, 0x55, 0x91, + 0x9d, 0x0c, 0x55, 0x88, 0xa6, 0x0c, 0x55, 0x81, 0xa5, 0x0c, 0x55, 0x79, + 0xa4, 0x0c, 0x55, 0x71, 0xa3, 0x0c, 0x55, 0x69, 0xa2, 0x0c, 0x55, 0x61, + 0xa1, 0x0c, 0x55, 0x59, 0xa0, 0x0c, 0x55, 0x51, 0x9f, 0x0c, 0x55, 0x49, + 0x9e, 0x0c, 0x55, 0x41, 0x9d, 0x0c, 0x55, 0x38, 0xa6, 0x0c, 0x55, 0x31, + 0xa5, 0x0c, 0x55, 0x29, 0xa4, 0x0c, 0x55, 0x21, 0xa3, 0x0c, 0x55, 0x19, + 0xa2, 0x0c, 0x55, 0x11, 0xa1, 0x0c, 0x55, 0x09, 0xa0, 0x0c, 0x55, 0x01, + 0x9f, 0x0c, 0x54, 0xf9, 0x9e, 0x0c, 0x54, 0xf1, 0x9d, 0x0c, 0x54, 0xe8, + 0xa6, 0x0c, 0x54, 0xe1, 0xa5, 0x0c, 0x54, 0xd9, 0xa4, 0x0c, 0x54, 0xd1, + 0xa3, 0x0c, 0x54, 0xc9, 0xa2, 0x0c, 0x54, 0xc1, 0xa1, 0x0c, 0x54, 0xb9, + 0xa0, 0x0c, 0x54, 0xb1, 0x9f, 0x0c, 0x54, 0xa9, 0x9e, 0x0c, 0x54, 0xa1, + 0x9d, 0x0c, 0x54, 0x98, 0xa6, 0x0c, 0x54, 0x91, 0xa5, 0x0c, 0x54, 0x89, + 0xa4, 0x0c, 0x54, 0x81, 0xa3, 0x0c, 0x54, 0x79, 0xa2, 0x0c, 0x54, 0x71, + 0xa1, 0x0c, 0x54, 0x69, 0xa0, 0x0c, 0x54, 0x61, 0x9f, 0x0c, 0x54, 0x59, + 0x9e, 0x0c, 0x54, 0x51, 0x9d, 0x0c, 0x54, 0x48, 0xa6, 0x0c, 0x54, 0x41, + 0xa5, 0x0c, 0x54, 0x39, 0xa4, 0x0c, 0x54, 0x31, 0xa3, 0x0c, 0x54, 0x29, + 0xa2, 0x0c, 0x54, 0x21, 0xa1, 0x0c, 0x54, 0x19, 0xa0, 0x0c, 0x54, 0x11, + 0x9f, 0x0c, 0x54, 0x09, 0x9e, 0x0c, 0x54, 0x01, 0x9d, 0x0c, 0x53, 0xf8, + 0xa6, 0x0c, 0x53, 0xf1, 0xa5, 0x0c, 0x53, 0xe9, 0xa4, 0x0c, 0x53, 0xe1, + 0xa3, 0x0c, 0x53, 0xd9, 0xa2, 0x0c, 0x53, 0xd1, 0xa1, 0x0c, 0x53, 0xc9, + 0xa0, 0x0c, 0x53, 0xc1, 0x9f, 0x0c, 0x53, 0xb9, 0x9e, 0x0c, 0x53, 0xb1, + 0x9d, 0x0c, 0x53, 0xa8, 0xa6, 0x0c, 0x53, 0xa1, 0xa5, 0x0c, 0x53, 0x99, + 0xa4, 0x0c, 0x53, 0x91, 0xa3, 0x0c, 0x53, 0x89, 0xa2, 0x0c, 0x53, 0x81, + 0xa1, 0x0c, 0x53, 0x79, 0xa0, 0x0c, 0x53, 0x71, 0x9f, 0x0c, 0x53, 0x69, + 0x9e, 0x0c, 0x53, 0x61, 0x9d, 0x0c, 0x53, 0x58, 0xa6, 0x0c, 0x53, 0x51, + 0xa5, 0x0c, 
0x53, 0x49, 0xa4, 0x0c, 0x53, 0x41, 0xa3, 0x0c, 0x53, 0x39, + 0xa2, 0x0c, 0x53, 0x31, 0xa1, 0x0c, 0x53, 0x29, 0xa0, 0x0c, 0x53, 0x21, + 0x9f, 0x0c, 0x53, 0x19, 0x9e, 0x0c, 0x53, 0x11, 0x9d, 0x0c, 0x53, 0x08, + 0xa6, 0x0c, 0x53, 0x01, 0xa5, 0x0c, 0x52, 0xf9, 0xa4, 0x0c, 0x52, 0xf1, + 0xa3, 0x0c, 0x52, 0xe9, 0xa2, 0x0c, 0x52, 0xe1, 0xa1, 0x0c, 0x52, 0xd9, + 0xa0, 0x0c, 0x52, 0xd1, 0x9f, 0x0c, 0x52, 0xc9, 0x9e, 0x0c, 0x52, 0xc1, + 0x9d, 0x0c, 0x52, 0xb8, 0xa6, 0x0c, 0x52, 0xb1, 0xa5, 0x0c, 0x52, 0xa9, + 0xa4, 0x0c, 0x52, 0xa1, 0xa3, 0x0c, 0x52, 0x99, 0xa2, 0x0c, 0x52, 0x91, + 0xa1, 0x0c, 0x52, 0x89, 0xa0, 0x0c, 0x52, 0x81, 0x9f, 0x0c, 0x52, 0x79, + 0x9e, 0x0c, 0x52, 0x71, 0x9d, 0x0c, 0x52, 0x68, 0xa6, 0x0c, 0x52, 0x61, + 0xa5, 0x0c, 0x52, 0x59, 0xa4, 0x0c, 0x52, 0x51, 0xa3, 0x0c, 0x52, 0x49, + 0xa2, 0x0c, 0x52, 0x41, 0xa1, 0x0c, 0x52, 0x39, 0xa0, 0x0c, 0x52, 0x31, + 0x9f, 0x0c, 0x52, 0x29, 0x9e, 0x0c, 0x52, 0x21, 0x9d, 0x0c, 0x52, 0x18, + 0xa6, 0x0c, 0x52, 0x11, 0xa5, 0x0c, 0x52, 0x09, 0xa4, 0x0c, 0x52, 0x01, + 0xa3, 0x0c, 0x51, 0xf9, 0xa2, 0x0c, 0x51, 0xf1, 0xa1, 0x0c, 0x51, 0xe9, + 0xa0, 0x0c, 0x51, 0xe1, 0x9f, 0x0c, 0x51, 0xd9, 0x9e, 0x0c, 0x51, 0xd1, + 0x9d, 0x0c, 0x51, 0xc8, 0xa6, 0x0c, 0x51, 0xc1, 0xa5, 0x0c, 0x51, 0xb9, + 0xa4, 0x0c, 0x51, 0xb1, 0xa3, 0x0c, 0x51, 0xa9, 0xa2, 0x0c, 0x51, 0xa1, + 0xa1, 0x0c, 0x51, 0x99, 0xa0, 0x0c, 0x51, 0x91, 0x9f, 0x0c, 0x51, 0x89, + 0x9e, 0x0c, 0x51, 0x81, 0x9d, 0x0c, 0x51, 0x78, 0xa6, 0x0c, 0x51, 0x71, + 0xa5, 0x0c, 0x51, 0x69, 0xa4, 0x0c, 0x51, 0x61, 0xa3, 0x0c, 0x51, 0x59, + 0xa2, 0x0c, 0x51, 0x51, 0xa1, 0x0c, 0x51, 0x49, 0xa0, 0x0c, 0x51, 0x41, + 0x9f, 0x0c, 0x51, 0x39, 0x9e, 0x0c, 0x51, 0x31, 0x9d, 0x0c, 0x51, 0x28, + 0xa6, 0x0c, 0x51, 0x21, 0xa5, 0x0c, 0x51, 0x19, 0xa4, 0x0c, 0x51, 0x11, + 0xa3, 0x0c, 0x51, 0x09, 0xa2, 0x0c, 0x51, 0x01, 0xa1, 0x0c, 0x50, 0xf9, + 0xa0, 0x0c, 0x50, 0xf1, 0x9f, 0x0c, 0x50, 0xe9, 0x9e, 0x0c, 0x50, 0xe1, + 0x9d, 0x0c, 0x50, 0xd8, 0xa6, 0x0c, 0x50, 0xd1, 0xa5, 0x0c, 0x50, 0xc9, + 0xa4, 0x0c, 0x50, 0xc1, 0xa3, 0x0c, 0x50, 0xb9, 0xa2, 0x0c, 0x50, 0xb1, + 0xa1, 0x0c, 0x50, 0xa9, 0xa0, 0x0c, 0x50, 0xa1, 0x9f, 0x0c, 0x50, 0x99, + 0x9e, 0x0c, 0x50, 0x91, 0x9d, 0x0c, 0x50, 0x88, 0xa6, 0x0c, 0x50, 0x81, + 0xa5, 0x0c, 0x50, 0x79, 0xa4, 0x0c, 0x50, 0x71, 0xa3, 0x0c, 0x50, 0x69, + 0xa2, 0x0c, 0x50, 0x61, 0xa1, 0x0c, 0x50, 0x59, 0xa0, 0x0c, 0x50, 0x51, + 0x9f, 0x0c, 0x50, 0x49, 0x9e, 0x0c, 0x50, 0x41, 0x9d, 0x0c, 0x50, 0x38, + 0xa6, 0x0c, 0x50, 0x31, 0xa5, 0x0c, 0x50, 0x29, 0xa4, 0x0c, 0x50, 0x21, + 0xa3, 0x0c, 0x50, 0x19, 0xa2, 0x0c, 0x50, 0x11, 0xa1, 0x0c, 0x50, 0x09, + 0xa0, 0x0c, 0x50, 0x01, 0x9f, 0x0c, 0x4f, 0xf9, 0x9e, 0x0c, 0x4f, 0xf1, + 0x9d, 0x0c, 0x4f, 0xe8, 0xa6, 0x0c, 0x4f, 0xe1, 0xa5, 0x0c, 0x4f, 0xd9, + 0xa4, 0x0c, 0x4f, 0xd1, 0xa3, 0x0c, 0x4f, 0xc9, 0xa2, 0x0c, 0x4f, 0xc1, + 0xa1, 0x0c, 0x4f, 0xb9, 0xa0, 0x0c, 0x4f, 0xb1, 0x9f, 0x0c, 0x4f, 0xa9, + 0x9e, 0x0c, 0x4f, 0xa1, 0x9d, 0x0c, 0x4f, 0x98, 0xa6, 0x0c, 0x4f, 0x91, + 0xa5, 0x0c, 0x4f, 0x89, 0xa4, 0x0c, 0x4f, 0x81, 0xa3, 0x0c, 0x4f, 0x79, + 0xa2, 0x0c, 0x4f, 0x71, 0xa1, 0x0c, 0x4f, 0x69, 0xa0, 0x0c, 0x4f, 0x61, + 0x9f, 0x0c, 0x4f, 0x59, 0x9e, 0x0c, 0x4f, 0x51, 0x9d, 0x0c, 0x4f, 0x48, + 0xa6, 0x0c, 0x4f, 0x41, 0xa5, 0x0c, 0x4f, 0x39, 0xa4, 0x0c, 0x4f, 0x31, + 0xa3, 0x0c, 0x4f, 0x29, 0xa2, 0x0c, 0x4f, 0x21, 0xa1, 0x0c, 0x4f, 0x19, + 0xa0, 0x0c, 0x4f, 0x11, 0x9f, 0x0c, 0x4f, 0x09, 0x9e, 0x0c, 0x4f, 0x01, + 0x9d, 0x0c, 0x4e, 0xf8, 0xa6, 0x0c, 0x4e, 0xf1, 0xa5, 0x0c, 0x4e, 0xe9, + 0xa4, 0x0c, 0x4e, 0xe1, 0xa3, 0x0c, 0x4e, 0xd9, 0xa2, 0x0c, 0x4e, 0xd1, + 0xa1, 0x0c, 
0x4e, 0xc9, 0xa0, 0x0c, 0x4e, 0xc1, 0x9f, 0x0c, 0x4e, 0xb9, + 0x9e, 0x0c, 0x4e, 0xb1, 0x9d, 0x0c, 0x4e, 0xa8, 0xa6, 0x0c, 0x4e, 0xa1, + 0xa5, 0x0c, 0x4e, 0x99, 0xa4, 0x0c, 0x4e, 0x91, 0xa3, 0x0c, 0x4e, 0x89, + 0xa2, 0x0c, 0x4e, 0x81, 0xa1, 0x0c, 0x4e, 0x79, 0xa0, 0x0c, 0x4e, 0x71, + 0x9f, 0x0c, 0x4e, 0x69, 0x9e, 0x0c, 0x4e, 0x61, 0x9d, 0x0c, 0x4e, 0x58, + 0xa6, 0x0c, 0x4e, 0x51, 0xa5, 0x0c, 0x4e, 0x49, 0xa4, 0x0c, 0x4e, 0x41, + 0xa3, 0x0c, 0x4e, 0x39, 0xa2, 0x0c, 0x4e, 0x31, 0xa1, 0x0c, 0x4e, 0x29, + 0xa0, 0x0c, 0x4e, 0x21, 0x9f, 0x0c, 0x4e, 0x19, 0x9e, 0x0c, 0x4e, 0x11, + 0x9d, 0x0c, 0x4e, 0x08, 0xa6, 0x0c, 0x4e, 0x01, 0xa5, 0x0c, 0x4d, 0xf9, + 0xa4, 0x0c, 0x4d, 0xf1, 0xa3, 0x0c, 0x4d, 0xe9, 0xa2, 0x0c, 0x4d, 0xe1, + 0xa1, 0x0c, 0x4d, 0xd9, 0xa0, 0x0c, 0x4d, 0xd1, 0x9f, 0x0c, 0x4d, 0xc9, + 0x9e, 0x0c, 0x4d, 0xc1, 0x9d, 0x0c, 0x4d, 0xb8, 0xa6, 0x0c, 0x4d, 0xb1, + 0xa5, 0x0c, 0x4d, 0xa9, 0xa4, 0x0c, 0x4d, 0xa1, 0xa3, 0x0c, 0x4d, 0x99, + 0xa2, 0x0c, 0x4d, 0x91, 0xa1, 0x0c, 0x4d, 0x89, 0xa0, 0x0c, 0x4d, 0x81, + 0x9f, 0x0c, 0x4d, 0x79, 0x9e, 0x0c, 0x4d, 0x71, 0x9d, 0x0c, 0x4d, 0x68, + 0xa6, 0x0c, 0x4d, 0x61, 0xa5, 0x0c, 0x4d, 0x59, 0xa4, 0x0c, 0x4d, 0x51, + 0xa3, 0x0c, 0x4d, 0x49, 0xa2, 0x0c, 0x4d, 0x41, 0xa1, 0x0c, 0x4d, 0x39, + 0xa0, 0x0c, 0x4d, 0x31, 0x9f, 0x0c, 0x4d, 0x29, 0x9e, 0x0c, 0x4d, 0x21, + 0x9d, 0x0c, 0x4d, 0x18, 0xa6, 0x0c, 0x4d, 0x11, 0xa5, 0x0c, 0x4d, 0x09, + 0xa4, 0x0c, 0x4d, 0x01, 0xa3, 0x0c, 0x4c, 0xf9, 0xa2, 0x0c, 0x4c, 0xf1, + 0xa1, 0x0c, 0x4c, 0xe9, 0xa0, 0x0c, 0x4c, 0xe1, 0x9f, 0x0c, 0x4c, 0xd9, + 0x9e, 0x0c, 0x4c, 0xd1, 0x9d, 0x0c, 0x4c, 0xc8, 0xa6, 0x0c, 0x4c, 0xc1, + 0xa5, 0x0c, 0x4c, 0xb9, 0xa4, 0x0c, 0x4c, 0xb1, 0xa3, 0x0c, 0x4c, 0xa9, + 0xa2, 0x0c, 0x4c, 0xa1, 0xa1, 0x0c, 0x4c, 0x99, 0xa0, 0x0c, 0x4c, 0x91, + 0x9f, 0x0c, 0x4c, 0x89, 0x9e, 0x0c, 0x4c, 0x81, 0x9d, 0x0c, 0x4c, 0x78, + 0xa6, 0x0c, 0x4c, 0x71, 0xa5, 0x0c, 0x4c, 0x69, 0xa4, 0x0c, 0x4c, 0x61, + 0xa3, 0x0c, 0x4c, 0x59, 0xa2, 0x0c, 0x4c, 0x51, 0xa1, 0x0c, 0x4c, 0x49, + 0xa0, 0x0c, 0x4c, 0x41, 0x9f, 0x0c, 0x4c, 0x39, 0x9e, 0x0c, 0x4c, 0x31, + 0x9d, 0x0c, 0x4c, 0x28, 0xa6, 0x0c, 0x4c, 0x21, 0xa5, 0x0c, 0x4c, 0x19, + 0xa4, 0x0c, 0x4c, 0x11, 0xa3, 0x0c, 0x4c, 0x09, 0xa2, 0x0c, 0x4c, 0x01, + 0xa1, 0x0c, 0x4b, 0xf9, 0xa0, 0x0c, 0x4b, 0xf1, 0x9f, 0x0c, 0x4b, 0xe9, + 0x9e, 0x0c, 0x4b, 0xe1, 0x9d, 0x0c, 0x4b, 0xd8, 0xa6, 0x0c, 0x4b, 0xd1, + 0xa5, 0x0c, 0x4b, 0xc9, 0xa4, 0x0c, 0x4b, 0xc1, 0xa3, 0x0c, 0x4b, 0xb9, + 0xa2, 0x0c, 0x4b, 0xb1, 0xa1, 0x0c, 0x4b, 0xa9, 0xa0, 0x0c, 0x4b, 0xa1, + 0x9f, 0x0c, 0x4b, 0x99, 0x9e, 0x0c, 0x4b, 0x91, 0x9d, 0x0c, 0x4b, 0x88, + 0xa6, 0x0c, 0x4b, 0x81, 0xa5, 0x0c, 0x4b, 0x79, 0xa4, 0x0c, 0x4b, 0x71, + 0xa3, 0x0c, 0x4b, 0x69, 0xa2, 0x0c, 0x4b, 0x61, 0xa1, 0x0c, 0x4b, 0x59, + 0xa0, 0x0c, 0x4b, 0x51, 0x9f, 0x0c, 0x4b, 0x49, 0x9e, 0x0c, 0x4b, 0x41, + 0x9d, 0x0c, 0x4b, 0x38, 0xa6, 0x0c, 0x4b, 0x31, 0xa5, 0x0c, 0x4b, 0x29, + 0xa4, 0x0c, 0x4b, 0x21, 0xa3, 0x0c, 0x4b, 0x19, 0xa2, 0x0c, 0x4b, 0x11, + 0xa1, 0x0c, 0x4b, 0x09, 0xa0, 0x0c, 0x4b, 0x01, 0x9f, 0x0c, 0x4a, 0xf9, + 0x9e, 0x0c, 0x4a, 0xf1, 0x9d, 0x0c, 0x4a, 0xe8, 0xa6, 0x0c, 0x4a, 0xe1, + 0xa5, 0x0c, 0x4a, 0xd9, 0xa4, 0x0c, 0x4a, 0xd1, 0xa3, 0x0c, 0x4a, 0xc9, + 0xa2, 0x0c, 0x4a, 0xc1, 0xa1, 0x0c, 0x4a, 0xb9, 0xa0, 0x0c, 0x4a, 0xb1, + 0x9f, 0x0c, 0x4a, 0xa9, 0x9e, 0x0c, 0x4a, 0xa1, 0x9d, 0x0c, 0x4a, 0x98, + 0xa6, 0x0c, 0x4a, 0x91, 0xa5, 0x0c, 0x4a, 0x89, 0xa4, 0x0c, 0x4a, 0x81, + 0xa3, 0x0c, 0x4a, 0x79, 0xa2, 0x0c, 0x4a, 0x71, 0xa1, 0x0c, 0x4a, 0x69, + 0xa0, 0x0c, 0x4a, 0x61, 0x9f, 0x0c, 0x4a, 0x59, 0x9e, 0x0c, 0x4a, 0x51, + 0x9d, 0x0c, 
0x4a, 0x48, 0xa6, 0x0c, 0x4a, 0x41, 0xa5, 0x0c, 0x4a, 0x39, + 0xa4, 0x0c, 0x4a, 0x31, 0xa3, 0x0c, 0x4a, 0x29, 0xa2, 0x0c, 0x4a, 0x21, + 0xa1, 0x0c, 0x4a, 0x19, 0xa0, 0x0c, 0x4a, 0x11, 0x9f, 0x0c, 0x4a, 0x09, + 0x9e, 0x0c, 0x4a, 0x01, 0x9d, 0x0c, 0x49, 0xf8, 0xa6, 0x0c, 0x49, 0xf1, + 0xa5, 0x0c, 0x49, 0xe9, 0xa4, 0x0c, 0x49, 0xe1, 0xa3, 0x0c, 0x49, 0xd9, + 0xa2, 0x0c, 0x49, 0xd1, 0xa1, 0x0c, 0x49, 0xc9, 0xa0, 0x0c, 0x49, 0xc1, + 0x9f, 0x0c, 0x49, 0xb9, 0x9e, 0x0c, 0x49, 0xb1, 0x9d, 0x0c, 0x49, 0xa8, + 0xa6, 0x0c, 0x49, 0xa1, 0xa5, 0x0c, 0x49, 0x99, 0xa4, 0x0c, 0x49, 0x91, + 0xa3, 0x0c, 0x49, 0x89, 0xa2, 0x0c, 0x49, 0x81, 0xa1, 0x0c, 0x49, 0x79, + 0xa0, 0x0c, 0x49, 0x71, 0x9f, 0x0c, 0x49, 0x69, 0x9e, 0x0c, 0x49, 0x61, + 0x9d, 0x0c, 0x49, 0x58, 0xa6, 0x0c, 0x49, 0x51, 0xa5, 0x0c, 0x49, 0x49, + 0xa4, 0x0c, 0x49, 0x41, 0xa3, 0x0c, 0x49, 0x39, 0xa2, 0x0c, 0x49, 0x31, + 0xa1, 0x0c, 0x49, 0x29, 0xa0, 0x0c, 0x49, 0x21, 0x9f, 0x0c, 0x49, 0x19, + 0x9e, 0x0c, 0x49, 0x11, 0x9d, 0x0c, 0x49, 0x08, 0xa6, 0x0c, 0x49, 0x01, + 0xa5, 0x0c, 0x48, 0xf9, 0xa4, 0x0c, 0x48, 0xf1, 0xa3, 0x0c, 0x48, 0xe9, + 0xa2, 0x0c, 0x48, 0xe1, 0xa1, 0x0c, 0x48, 0xd9, 0xa0, 0x0c, 0x48, 0xd1, + 0x9f, 0x0c, 0x48, 0xc9, 0x9e, 0x0c, 0x48, 0xc1, 0x9d, 0x0c, 0x48, 0xb8, + 0xa6, 0x0c, 0x48, 0xb1, 0xa5, 0x0c, 0x48, 0xa9, 0xa4, 0x0c, 0x48, 0xa1, + 0xa3, 0x0c, 0x48, 0x99, 0xa2, 0x0c, 0x48, 0x91, 0xa1, 0x0c, 0x48, 0x89, + 0xa0, 0x0c, 0x48, 0x81, 0x9f, 0x0c, 0x48, 0x79, 0x9e, 0x0c, 0x48, 0x71, + 0x9d, 0x0c, 0x48, 0x68, 0xa6, 0x0c, 0x48, 0x61, 0xa5, 0x0c, 0x48, 0x59, + 0xa4, 0x0c, 0x48, 0x51, 0xa3, 0x0c, 0x48, 0x49, 0xa2, 0x0c, 0x48, 0x41, + 0xa1, 0x0c, 0x48, 0x39, 0xa0, 0x0c, 0x48, 0x31, 0x9f, 0x0c, 0x48, 0x29, + 0x9e, 0x0c, 0x48, 0x21, 0x9d, 0x0c, 0x48, 0x18, 0xa6, 0x0c, 0x48, 0x11, + 0xa5, 0x0c, 0x48, 0x09, 0xa4, 0x0c, 0x48, 0x01, 0xa3, 0x0c, 0x47, 0xf9, + 0xa2, 0x0c, 0x47, 0xf1, 0xa1, 0x0c, 0x47, 0xe9, 0xa0, 0x0c, 0x47, 0xe1, + 0x9f, 0x0c, 0x47, 0xd9, 0x9e, 0x0c, 0x47, 0xd1, 0x9d, 0x0c, 0x47, 0xc8, + 0xa6, 0x0c, 0x47, 0xc1, 0xa5, 0x0c, 0x47, 0xb9, 0xa4, 0x0c, 0x47, 0xb1, + 0xa3, 0x0c, 0x47, 0xa9, 0xa2, 0x0c, 0x47, 0xa1, 0xa1, 0x0c, 0x47, 0x99, + 0xa0, 0x0c, 0x47, 0x91, 0x9f, 0x0c, 0x47, 0x89, 0x9e, 0x0c, 0x47, 0x81, + 0x9d, 0x0c, 0x47, 0x78, 0xa6, 0x0c, 0x47, 0x71, 0xa5, 0x0c, 0x47, 0x69, + 0xa4, 0x0c, 0x47, 0x61, 0xa3, 0x0c, 0x47, 0x59, 0xa2, 0x0c, 0x47, 0x51, + 0xa1, 0x0c, 0x47, 0x49, 0xa0, 0x0c, 0x47, 0x41, 0x9f, 0x0c, 0x47, 0x39, + 0x9e, 0x0c, 0x47, 0x31, 0x9d, 0x0c, 0x47, 0x28, 0xa6, 0x0c, 0x47, 0x21, + 0xa5, 0x0c, 0x47, 0x19, 0xa4, 0x0c, 0x47, 0x11, 0xa3, 0x0c, 0x47, 0x09, + 0xa2, 0x0c, 0x47, 0x01, 0xa1, 0x0c, 0x46, 0xf9, 0xa0, 0x0c, 0x46, 0xf1, + 0x9f, 0x0c, 0x46, 0xe9, 0x9e, 0x0c, 0x46, 0xe1, 0x9d, 0x0c, 0x46, 0xd8, + 0xa6, 0x0c, 0x46, 0xd1, 0xa5, 0x0c, 0x46, 0xc9, 0xa4, 0x0c, 0x46, 0xc1, + 0xa3, 0x0c, 0x46, 0xb9, 0xa2, 0x0c, 0x46, 0xb1, 0xa1, 0x0c, 0x46, 0xa9, + 0xa0, 0x0c, 0x46, 0xa1, 0x9f, 0x0c, 0x46, 0x99, 0x9e, 0x0c, 0x46, 0x91, + 0x9d, 0x0c, 0x46, 0x88, 0xa6, 0x0c, 0x46, 0x81, 0xa5, 0x0c, 0x46, 0x79, + 0xa4, 0x0c, 0x46, 0x71, 0xa3, 0x0c, 0x46, 0x69, 0xa2, 0x0c, 0x46, 0x61, + 0xa1, 0x0c, 0x46, 0x59, 0xa0, 0x0c, 0x46, 0x51, 0x9f, 0x0c, 0x46, 0x49, + 0x9e, 0x0c, 0x46, 0x41, 0x9d, 0x0c, 0x46, 0x38, 0xa6, 0x0c, 0x46, 0x31, + 0xa5, 0x0c, 0x46, 0x29, 0xa4, 0x0c, 0x46, 0x21, 0xa3, 0x0c, 0x46, 0x19, + 0xa2, 0x0c, 0x46, 0x11, 0xa1, 0x0c, 0x46, 0x09, 0xa0, 0x0c, 0x46, 0x01, + 0x9f, 0x0c, 0x45, 0xf9, 0x9e, 0x0c, 0x45, 0xf1, 0x9d, 0x0c, 0x45, 0xe8, + 0xa6, 0x0c, 0x45, 0xe1, 0xa5, 0x0c, 0x45, 0xd9, 0xa4, 0x0c, 0x45, 0xd1, + 0xa3, 0x0c, 
0x45, 0xc9, 0xa2, 0x0c, 0x45, 0xc1, 0xa1, 0x0c, 0x45, 0xb9, + 0xa0, 0x0c, 0x45, 0xb1, 0x9f, 0x0c, 0x45, 0xa9, 0x9e, 0x0c, 0x45, 0xa1, + 0x9d, 0x0c, 0x45, 0x98, 0xa6, 0x0c, 0x45, 0x91, 0xa5, 0x0c, 0x45, 0x89, + 0xa4, 0x0c, 0x45, 0x81, 0xa3, 0x0c, 0x45, 0x79, 0xa2, 0x0c, 0x45, 0x71, + 0xa1, 0x0c, 0x45, 0x69, 0xa0, 0x0c, 0x45, 0x61, 0x9f, 0x0c, 0x45, 0x59, + 0x9e, 0x0c, 0x45, 0x51, 0x9d, 0x0c, 0x45, 0x48, 0xa6, 0x0c, 0x45, 0x41, + 0xa5, 0x0c, 0x45, 0x39, 0xa4, 0x0c, 0x45, 0x31, 0xa3, 0x0c, 0x45, 0x29, + 0xa2, 0x0c, 0x45, 0x21, 0xa1, 0x0c, 0x45, 0x19, 0xa0, 0x0c, 0x45, 0x11, + 0x9f, 0x0c, 0x45, 0x09, 0x9e, 0x0c, 0x45, 0x01, 0x9d, 0x0c, 0x44, 0xf8, + 0xa6, 0x0c, 0x44, 0xf1, 0xa5, 0x0c, 0x44, 0xe9, 0xa4, 0x0c, 0x44, 0xe1, + 0xa3, 0x0c, 0x44, 0xd9, 0xa2, 0x0c, 0x44, 0xd1, 0xa1, 0x0c, 0x44, 0xc9, + 0xa0, 0x0c, 0x44, 0xc1, 0x9f, 0x0c, 0x44, 0xb9, 0x9e, 0x0c, 0x44, 0xb1, + 0x9d, 0x0c, 0x44, 0xa8, 0xa6, 0x0c, 0x44, 0xa1, 0xa5, 0x0c, 0x44, 0x99, + 0xa4, 0x0c, 0x44, 0x91, 0xa3, 0x0c, 0x44, 0x89, 0xa2, 0x0c, 0x44, 0x81, + 0xa1, 0x0c, 0x44, 0x79, 0xa0, 0x0c, 0x44, 0x71, 0x9f, 0x0c, 0x44, 0x69, + 0x9e, 0x0c, 0x44, 0x61, 0x9d, 0x0c, 0x44, 0x58, 0xa6, 0x0c, 0x44, 0x51, + 0xa5, 0x0c, 0x44, 0x49, 0xa4, 0x0c, 0x44, 0x41, 0xa3, 0x0c, 0x44, 0x39, + 0xa2, 0x0c, 0x44, 0x31, 0xa1, 0x0c, 0x44, 0x29, 0xa0, 0x0c, 0x44, 0x21, + 0x9f, 0x0c, 0x44, 0x19, 0x9e, 0x0c, 0x44, 0x11, 0x9d, 0x0c, 0x44, 0x08, + 0xa6, 0x0c, 0x44, 0x01, 0xa5, 0x0c, 0x43, 0xf9, 0xa4, 0x0c, 0x43, 0xf1, + 0xa3, 0x0c, 0x43, 0xe9, 0xa2, 0x0c, 0x43, 0xe1, 0xa1, 0x0c, 0x43, 0xd9, + 0xa0, 0x0c, 0x43, 0xd1, 0x9f, 0x0c, 0x43, 0xc9, 0x9e, 0x0c, 0x43, 0xc1, + 0x9d, 0x0c, 0x43, 0xb8, 0xa6, 0x0c, 0x43, 0xb1, 0xa5, 0x0c, 0x43, 0xa9, + 0xa4, 0x0c, 0x43, 0xa1, 0xa3, 0x0c, 0x43, 0x99, 0xa2, 0x0c, 0x43, 0x91, + 0xa1, 0x0c, 0x43, 0x89, 0xa0, 0x0c, 0x43, 0x81, 0x9f, 0x0c, 0x43, 0x79, + 0x9e, 0x0c, 0x43, 0x71, 0x9d, 0x0c, 0x43, 0x68, 0xa6, 0x0c, 0x43, 0x61, + 0xa5, 0x0c, 0x43, 0x59, 0xa4, 0x0c, 0x43, 0x51, 0xa3, 0x0c, 0x43, 0x49, + 0xa2, 0x0c, 0x43, 0x41, 0xa1, 0x0c, 0x43, 0x39, 0xa0, 0x0c, 0x43, 0x31, + 0x9f, 0x0c, 0x43, 0x29, 0x9e, 0x0c, 0x43, 0x21, 0x9d, 0x0c, 0x43, 0x18, + 0xa6, 0x0c, 0x43, 0x11, 0xa5, 0x0c, 0x43, 0x09, 0xa4, 0x0c, 0x43, 0x01, + 0xa3, 0x0c, 0x42, 0xf9, 0xa2, 0x0c, 0x42, 0xf1, 0xa1, 0x0c, 0x42, 0xe9, + 0xa0, 0x0c, 0x42, 0xe1, 0x9f, 0x0c, 0x42, 0xd9, 0x9e, 0x0c, 0x42, 0xd1, + 0x9d, 0x0c, 0x42, 0xc8, 0xa6, 0x0c, 0x42, 0xc1, 0xa5, 0x0c, 0x42, 0xb9, + 0xa4, 0x0c, 0x42, 0xb1, 0xa3, 0x0c, 0x42, 0xa9, 0xa2, 0x0c, 0x42, 0xa1, + 0xa1, 0x0c, 0x42, 0x99, 0xa0, 0x0c, 0x42, 0x91, 0x9f, 0x0c, 0x42, 0x89, + 0x9e, 0x0c, 0x42, 0x81, 0x9d, 0x0c, 0x42, 0x78, 0xa6, 0x0c, 0x42, 0x71, + 0xa5, 0x0c, 0x42, 0x69, 0xa4, 0x0c, 0x42, 0x61, 0xa3, 0x0c, 0x42, 0x59, + 0xa2, 0x0c, 0x42, 0x51, 0xa1, 0x0c, 0x42, 0x49, 0xa0, 0x0c, 0x42, 0x41, + 0x9f, 0x0c, 0x42, 0x39, 0x9e, 0x0c, 0x42, 0x31, 0x9d, 0x0c, 0x42, 0x28, + 0xa6, 0x0c, 0x42, 0x21, 0xa5, 0x0c, 0x42, 0x19, 0xa4, 0x0c, 0x42, 0x11, + 0xa3, 0x0c, 0x42, 0x09, 0xa2, 0x0c, 0x42, 0x01, 0xa1, 0x0c, 0x41, 0xf9, + 0xa0, 0x0c, 0x41, 0xf1, 0x9f, 0x0c, 0x41, 0xe9, 0x9e, 0x0c, 0x41, 0xe1, + 0x9d, 0x0c, 0x41, 0xd8, 0xa6, 0x0c, 0x41, 0xd1, 0xa5, 0x0c, 0x41, 0xc9, + 0xa4, 0x0c, 0x41, 0xc1, 0xa3, 0x0c, 0x41, 0xb9, 0xa2, 0x0c, 0x41, 0xb1, + 0xa1, 0x0c, 0x41, 0xa9, 0xa0, 0x0c, 0x41, 0xa1, 0x9f, 0x0c, 0x41, 0x99, + 0x9e, 0x0c, 0x41, 0x91, 0x9d, 0x0c, 0x41, 0x88, 0xa6, 0x0c, 0x41, 0x81, + 0xa5, 0x0c, 0x41, 0x79, 0xa4, 0x0c, 0x41, 0x71, 0xa3, 0x0c, 0x41, 0x69, + 0xa2, 0x0c, 0x41, 0x61, 0xa1, 0x0c, 0x41, 0x59, 0xa0, 0x0c, 0x41, 0x51, + 0x9f, 0x0c, 
0x41, 0x49, 0x9e, 0x0c, 0x41, 0x41, 0x9d, 0x0c, 0x41, 0x38, + 0xa6, 0x0c, 0x41, 0x31, 0xa5, 0x0c, 0x41, 0x29, 0xa4, 0x0c, 0x41, 0x21, + 0xa3, 0x0c, 0x41, 0x19, 0xa2, 0x0c, 0x41, 0x11, 0xa1, 0x0c, 0x41, 0x09, + 0xa0, 0x0c, 0x41, 0x01, 0x9f, 0x0c, 0x40, 0xf9, 0x9e, 0x0c, 0x40, 0xf1, + 0x9d, 0x0c, 0x40, 0xe8, 0xa6, 0x0c, 0x40, 0xe1, 0xa5, 0x0c, 0x40, 0xd9, + 0xa4, 0x0c, 0x40, 0xd1, 0xa3, 0x0c, 0x40, 0xc9, 0xa2, 0x0c, 0x40, 0xc1, + 0xa1, 0x0c, 0x40, 0xb9, 0xa0, 0x0c, 0x40, 0xb1, 0x9f, 0x0c, 0x40, 0xa9, + 0x9e, 0x0c, 0x40, 0xa1, 0x9d, 0x0c, 0x40, 0x98, 0xa6, 0x0c, 0x40, 0x91, + 0xa5, 0x0c, 0x40, 0x89, 0xa4, 0x0c, 0x40, 0x81, 0xa3, 0x0c, 0x40, 0x79, + 0xa2, 0x0c, 0x40, 0x71, 0xa1, 0x0c, 0x40, 0x69, 0xa0, 0x0c, 0x40, 0x61, + 0x9f, 0x0c, 0x40, 0x59, 0x9e, 0x0c, 0x40, 0x51, 0x9d, 0x0c, 0x40, 0x48, + 0xa6, 0x0c, 0x40, 0x41, 0xa5, 0x0c, 0x40, 0x39, 0xa4, 0x0c, 0x40, 0x31, + 0xa3, 0x0c, 0x40, 0x29, 0xa2, 0x0c, 0x40, 0x21, 0xa1, 0x0c, 0x40, 0x19, + 0xa0, 0x0c, 0x40, 0x11, 0x9f, 0x0c, 0x40, 0x09, 0x9e, 0x0c, 0x40, 0x00, + 0xc2, 0x01, 0x6f, 0x0b, 0x55, 0xc1, 0x83, 0x0b, 0x55, 0x78, 0x83, 0x0b, + 0x55, 0xa1, 0x44, 0x2e, 0xf0, 0x43, 0x1a, 0x12, 0x17, 0xc3, 0x1a, 0x1e, + 0x9a, 0x0b, 0x54, 0x79, 0x93, 0x0b, 0x54, 0x71, 0x85, 0x0b, 0x54, 0x69, + 0x9c, 0x0b, 0x54, 0x60, 0x9a, 0x0b, 0x54, 0xb9, 0x93, 0x0b, 0x54, 0xb1, + 0x9c, 0x0b, 0x54, 0xa9, 0x85, 0x0b, 0x54, 0xa0, 0x9a, 0x0b, 0x54, 0x59, + 0x93, 0x0b, 0x54, 0x51, 0x85, 0x0b, 0x54, 0x49, 0x9c, 0x0b, 0x54, 0x40, + 0xc8, 0xb5, 0x2a, 0x08, 0xff, 0x89, 0xc6, 0xce, 0x27, 0x08, 0xff, 0x00, + 0xc5, 0x40, 0xe7, 0x00, 0x5c, 0x19, 0xc4, 0x1e, 0x97, 0x00, 0x5e, 0x68, + 0xc3, 0x7c, 0xc4, 0x08, 0xff, 0x11, 0xc4, 0xc9, 0xed, 0x08, 0xfe, 0xd0, + 0xc4, 0x70, 0x1e, 0x08, 0xff, 0x09, 0xc3, 0x00, 0xc1, 0x08, 0xfe, 0xf1, + 0xc6, 0xd1, 0x27, 0x08, 0xfe, 0xd8, 0x83, 0x00, 0x5d, 0x19, 0xc2, 0x00, + 0xc1, 0x00, 0x5d, 0x48, 0x83, 0x00, 0x5d, 0x99, 0xc2, 0x00, 0xdb, 0x00, + 0x5d, 0xa0, 0xcb, 0x8b, 0xe9, 0x08, 0xfe, 0x29, 0xd9, 0x1e, 0x05, 0x08, + 0xfe, 0x00, 0x9f, 0x08, 0xfe, 0x51, 0x9e, 0x08, 0xfe, 0x48, 0xa2, 0x00, + 0xd3, 0xc9, 0xa1, 0x00, 0xd3, 0xc1, 0xa0, 0x00, 0xd3, 0xb8, 0xc2, 0x00, + 0xdb, 0x00, 0xd2, 0xb1, 0xc2, 0x00, 0x39, 0x00, 0xd2, 0xa8, 0xc2, 0x00, + 0xd0, 0x00, 0xd1, 0xe9, 0x83, 0x00, 0xd1, 0xd8, 0xc2, 0x00, 0xd0, 0x00, + 0xd1, 0xa9, 0x83, 0x00, 0xd1, 0xa0, 0xc2, 0x00, 0xd0, 0x00, 0xd1, 0x59, + 0x83, 0x00, 0xd1, 0x48, 0xc2, 0x00, 0xd0, 0x00, 0xd1, 0x29, 0xc2, 0x8d, + 0x8f, 0x00, 0xd1, 0x21, 0x83, 0x00, 0xd1, 0x18, 0xc2, 0x01, 0x23, 0x05, + 0x54, 0x29, 0x91, 0x05, 0x54, 0x18, 0xc2, 0x01, 0x23, 0x05, 0x54, 0x21, + 0x91, 0x05, 0x54, 0x10, 0x00, 0xc3, 0x1a, 0x2e, 0xc3, 0x9b, 0x00, 0x00, + 0x72, 0xd8, 0xc2, 0x00, 0xc4, 0x00, 0x70, 0x99, 0x97, 0x00, 0x70, 0xc8, + 0x89, 0x00, 0x70, 0x50, 0x15, 0xc3, 0x1a, 0x3a, 0xc4, 0xde, 0xf3, 0x00, + 0x71, 0x48, 0x83, 0x00, 0x71, 0x83, 0x03, 0x1a, 0x4a, 0x8b, 0x00, 0x71, + 0xa3, 0x03, 0x1a, 0x5c, 0x97, 0x00, 0x71, 0xc3, 0x03, 0x1a, 0x60, 0x87, + 0x00, 0x72, 0x01, 0x91, 0x00, 0x72, 0x10, 0xc3, 0x00, 0x74, 0x00, 0x70, + 0x69, 0xc2, 0x06, 0x4e, 0x00, 0x71, 0x10, 0xc5, 0xd4, 0x25, 0x00, 0x70, + 0x79, 0xc3, 0x97, 0x59, 0x00, 0x70, 0xa8, 0x42, 0x01, 0x7c, 0xc3, 0x1a, + 0x6b, 0xc9, 0xb1, 0x3a, 0x00, 0x72, 0x60, 0x42, 0x01, 0x7c, 0xc3, 0x1a, + 0x7d, 0xc5, 0xd4, 0x2f, 0x00, 0x71, 0xd0, 0x90, 0x00, 0x70, 0xf8, 0x00, + 0xc3, 0x1a, 0x89, 0xc5, 0xd4, 0x98, 0x00, 0x72, 0x31, 0xc6, 0xd3, 0x07, + 0x00, 0x72, 0x38, 0xc4, 0x04, 0x15, 0x00, 0x71, 0x29, 0xc5, 0xdb, 0xf5, + 0x00, 0x71, 0x60, 0x91, 0x0f, 0x15, 0x48, 0x97, 0x0f, 0x15, 0x20, 0x94, + 0x00, 0x60, 
0x5b, 0x03, 0x1a, 0x9f, 0x8e, 0x00, 0x60, 0x62, 0x03, 0x1a, + 0xa3, 0xcb, 0x90, 0x44, 0x00, 0x62, 0xe8, 0x83, 0x00, 0x60, 0xf9, 0xc2, + 0x00, 0xd0, 0x00, 0x61, 0x00, 0x83, 0x00, 0x61, 0x09, 0xc2, 0x00, 0xd0, + 0x00, 0x61, 0x10, 0x83, 0x00, 0x61, 0x89, 0xc2, 0x00, 0x39, 0x00, 0x62, + 0xd0, 0x83, 0x00, 0x61, 0x99, 0xc2, 0x00, 0xdb, 0x00, 0x61, 0xa0, 0x8e, + 0x08, 0xa4, 0x50, 0x94, 0x08, 0xa4, 0x40, 0xcb, 0x97, 0x19, 0x00, 0x7e, + 0x51, 0xcb, 0x8f, 0x47, 0x00, 0x7e, 0x59, 0xcb, 0x97, 0xd4, 0x00, 0x7e, + 0x60, 0x09, 0xc3, 0x1a, 0xa7, 0xc8, 0xbc, 0x82, 0x00, 0x78, 0xf8, 0x09, + 0xc3, 0x1a, 0xb9, 0xc9, 0xa9, 0xbd, 0x00, 0x7e, 0x70, 0x83, 0x00, 0x7c, + 0xd1, 0xc2, 0x00, 0xd0, 0x00, 0x7c, 0xd8, 0x83, 0x00, 0x7d, 0x49, 0xc2, + 0x00, 0xd0, 0x00, 0x7d, 0x50, 0x83, 0x00, 0x7c, 0xe1, 0xc2, 0x00, 0xd0, + 0x00, 0x7c, 0xe8, 0x83, 0x00, 0x7d, 0x59, 0xc2, 0x00, 0xd0, 0x00, 0x7d, + 0x60, 0xcc, 0x85, 0x05, 0x00, 0x78, 0x11, 0xcd, 0x75, 0x99, 0x00, 0x78, + 0x18, 0x8a, 0x01, 0x69, 0xa0, 0x8a, 0x01, 0x69, 0xd0, 0x8a, 0x01, 0x69, + 0xf8, 0x4d, 0x06, 0x5a, 0xc3, 0x1a, 0xcb, 0x45, 0x19, 0x60, 0xc3, 0x1a, + 0xd7, 0x44, 0x19, 0x6a, 0xc3, 0x1a, 0xe1, 0x44, 0x2b, 0xb9, 0x43, 0x1a, + 0xeb, 0x44, 0x2b, 0xb9, 0xc3, 0x1a, 0xf7, 0x4d, 0x06, 0x5a, 0xc3, 0x1b, + 0x03, 0x45, 0x19, 0x60, 0xc3, 0x1b, 0x0f, 0x45, 0x30, 0xc1, 0x43, 0x1b, + 0x19, 0xd1, 0x4f, 0xad, 0x07, 0xe2, 0xa1, 0xda, 0x1c, 0xba, 0x07, 0xe2, + 0x99, 0x45, 0x19, 0x60, 0xc3, 0x1b, 0x23, 0x46, 0x30, 0xc1, 0xc3, 0x1b, + 0x2d, 0xdd, 0x10, 0xa3, 0x07, 0xe6, 0xc8, 0x49, 0xb2, 0x6c, 0xc3, 0x1b, + 0x39, 0x4a, 0xa7, 0xe2, 0x43, 0x1b, 0x61, 0x4d, 0x06, 0x5a, 0xc3, 0x1b, + 0x79, 0x45, 0x19, 0x60, 0xc3, 0x1b, 0x85, 0x45, 0x50, 0xf0, 0xc3, 0x1b, + 0x95, 0x0a, 0xc3, 0x1b, 0xa5, 0x45, 0x30, 0xc1, 0xc3, 0x1b, 0xb1, 0x44, + 0x72, 0xf0, 0xc3, 0x1b, 0xc1, 0x44, 0x2b, 0xb9, 0x43, 0x1b, 0xcd, 0x47, + 0x06, 0xb4, 0xc3, 0x1b, 0xd9, 0x0e, 0x43, 0x1b, 0xfd, 0xcd, 0x00, 0xfa, + 0x07, 0xe7, 0xd1, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0xb0, 0x0b, 0xc3, 0x1c, + 0x07, 0x45, 0x00, 0x8c, 0x43, 0x1c, 0x13, 0xcc, 0x00, 0xfb, 0x07, 0xe1, + 0x59, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xe0, 0xca, 0x26, 0xf7, 0x07, 0xe8, + 0xa9, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xc8, 0x4d, 0x06, 0x5a, 0xc3, 0x1c, + 0x25, 0x45, 0x19, 0x60, 0xc3, 0x1c, 0x31, 0x45, 0x30, 0xc1, 0xc3, 0x1c, + 0x3b, 0x44, 0x2b, 0xb9, 0x43, 0x1c, 0x45, 0x43, 0x06, 0x5c, 0xc3, 0x1c, + 0x51, 0x43, 0x14, 0x6d, 0xc3, 0x1c, 0x5d, 0xd1, 0x51, 0x9a, 0x07, 0xef, + 0x90, 0x47, 0x0e, 0x9d, 0xc3, 0x1c, 0x6d, 0xd2, 0x47, 0x81, 0x07, 0xea, + 0x70, 0x48, 0xab, 0xf5, 0xc3, 0x1c, 0x85, 0x46, 0x38, 0xb9, 0x43, 0x1c, + 0xb5, 0x44, 0x2b, 0xb9, 0xc3, 0x1c, 0xbb, 0x4d, 0x06, 0x5a, 0xc3, 0x1c, + 0xc7, 0xcf, 0x60, 0x8a, 0x07, 0xe3, 0x99, 0x45, 0x19, 0x60, 0xc3, 0x1c, + 0xd3, 0xcf, 0x69, 0x81, 0x07, 0xe3, 0x89, 0xce, 0x72, 0xf0, 0x07, 0xe3, + 0x81, 0x45, 0x50, 0xf0, 0xc3, 0x1c, 0xe9, 0x0a, 0xc3, 0x1c, 0xf3, 0x45, + 0x30, 0xc1, 0x43, 0x1c, 0xff, 0x43, 0x2b, 0xba, 0xc3, 0x1d, 0x09, 0x03, + 0x43, 0x1d, 0x15, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x81, 0x0b, 0xc3, 0x1d, + 0x21, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x99, 0x45, 0x00, 0x8c, 0x43, 0x1d, + 0x2d, 0xcd, 0x00, 0xfa, 0x07, 0xe2, 0xd1, 0xca, 0x26, 0xf7, 0x07, 0xe4, + 0xb0, 0xcd, 0x00, 0xfa, 0x07, 0xe2, 0xc9, 0xca, 0x26, 0xf7, 0x07, 0xe4, + 0xa8, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0xb9, 0xcb, 0x10, 0xb5, 0x07, 0xe6, + 0xe0, 0x0b, 0xc3, 0x1d, 0x39, 0xd3, 0x43, 0x72, 0x07, 0xed, 0x78, 0x43, + 0x2b, 0xba, 0xc3, 0x1d, 0x45, 0x43, 0x02, 0x98, 0x43, 0x1d, 0x51, 0xcd, + 0x00, 0xfa, 0x07, 0xe2, 0x81, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x78, 0xcd, + 0x00, 0xfa, 
0x07, 0xe2, 0x79, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x70, 0x0b, + 0xc3, 0x1d, 0x5b, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x61, 0x45, 0x00, 0x8c, + 0xc3, 0x1d, 0x67, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x70, 0xcc, 0x00, 0xfb, + 0x07, 0xe2, 0x69, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0xa0, 0x0b, 0xc3, 0x1d, + 0x73, 0x45, 0x00, 0x8c, 0x43, 0x1d, 0x7f, 0x45, 0x19, 0x60, 0xc3, 0x1d, + 0x97, 0x44, 0x0d, 0xff, 0xc3, 0x1d, 0xad, 0x44, 0x2b, 0xb9, 0xc3, 0x1d, + 0xbd, 0x45, 0x06, 0x5a, 0xc3, 0x1d, 0xc9, 0x46, 0x50, 0xf0, 0xc3, 0x1d, + 0xdb, 0x45, 0x50, 0xf1, 0xc3, 0x1d, 0xe7, 0x46, 0x30, 0xc1, 0x43, 0x1d, + 0xf3, 0x46, 0x50, 0x13, 0xc3, 0x1d, 0xff, 0xd1, 0x54, 0x31, 0x07, 0xe0, + 0xd1, 0x46, 0x30, 0xc1, 0xc3, 0x1e, 0x0b, 0x4d, 0x06, 0x5a, 0xc3, 0x1e, + 0x17, 0x44, 0x2b, 0xb9, 0x43, 0x1e, 0x23, 0xca, 0x26, 0xf7, 0x07, 0xe4, + 0x39, 0xcd, 0x00, 0xfa, 0x07, 0xe2, 0x20, 0x48, 0x06, 0x5f, 0xc3, 0x1e, + 0x2f, 0x45, 0x00, 0x8c, 0xc3, 0x1e, 0x3b, 0xcd, 0x00, 0xfa, 0x07, 0xf7, + 0xd9, 0xca, 0x26, 0xf7, 0x07, 0xf7, 0xe0, 0xca, 0x26, 0xf7, 0x07, 0xe4, + 0x29, 0x0b, 0xc3, 0x1e, 0x47, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x69, 0x45, + 0x00, 0x8c, 0x43, 0x1e, 0x53, 0x0b, 0xc3, 0x1e, 0x5f, 0x4a, 0x74, 0x6e, + 0x43, 0x1e, 0x6b, 0x43, 0x02, 0x98, 0xc3, 0x1e, 0x77, 0xcf, 0x64, 0xef, + 0x07, 0xe6, 0x68, 0x0b, 0xc3, 0x1e, 0x81, 0x45, 0x00, 0x8c, 0x43, 0x1e, + 0x8d, 0x47, 0x0f, 0x9c, 0xc3, 0x1e, 0x9f, 0x4a, 0xa6, 0xca, 0x43, 0x1e, + 0xb7, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0xe9, 0xcd, 0x00, 0xfa, 0x07, 0xe1, + 0x90, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0xe1, 0xcd, 0x00, 0xfa, 0x07, 0xe1, + 0x88, 0x0b, 0xc3, 0x1e, 0xbd, 0xd3, 0x43, 0x72, 0x07, 0xee, 0x08, 0x0b, + 0xc3, 0x1e, 0xc9, 0x4a, 0x74, 0x6e, 0x43, 0x1e, 0xd5, 0xcc, 0x00, 0xfb, + 0x07, 0xe1, 0x71, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xf8, 0xcc, 0x00, 0xfb, + 0x07, 0xe1, 0x69, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xf0, 0x44, 0x2b, 0xb9, + 0xc3, 0x1e, 0xe1, 0x4d, 0x06, 0x5a, 0xc3, 0x1e, 0xed, 0xcf, 0x60, 0x8a, + 0x07, 0xe3, 0x69, 0x45, 0x19, 0x60, 0xc3, 0x1e, 0xf9, 0xcf, 0x69, 0x81, + 0x07, 0xe3, 0x59, 0xce, 0x72, 0xf0, 0x07, 0xe3, 0x51, 0x45, 0x50, 0xf0, + 0xc3, 0x1f, 0x09, 0x0a, 0xc3, 0x1f, 0x13, 0x46, 0x30, 0xc1, 0x43, 0x1f, + 0x1f, 0xe0, 0x07, 0x27, 0x07, 0xe2, 0xe0, 0xce, 0x6d, 0x32, 0x07, 0xea, + 0x0b, 0x03, 0x1f, 0x2b, 0x46, 0xd2, 0x23, 0xc3, 0x1f, 0x35, 0xd2, 0x4e, + 0xad, 0x07, 0xef, 0xb0, 0xd1, 0x4f, 0xad, 0x07, 0xe2, 0x51, 0x45, 0x06, + 0x5a, 0xc3, 0x1f, 0x41, 0x45, 0x19, 0x60, 0xc3, 0x1f, 0x4d, 0x45, 0x50, + 0xf0, 0xc3, 0x1f, 0x5d, 0x44, 0x19, 0x6a, 0xc3, 0x1f, 0x67, 0x45, 0x30, + 0xc1, 0x43, 0x1f, 0x71, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x41, 0xcb, 0x10, + 0xb5, 0x07, 0xe5, 0xc8, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x29, 0xcb, 0x10, + 0xb5, 0x07, 0xe5, 0xb8, 0x0b, 0xc3, 0x1f, 0x7b, 0x4a, 0x74, 0x6e, 0x43, + 0x1f, 0x87, 0x0b, 0xc3, 0x1f, 0x93, 0x45, 0x00, 0x8c, 0x43, 0x1f, 0x9f, + 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x11, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xa0, + 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x81, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x60, + 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x19, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x38, + 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x21, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x40, + 0x0b, 0xc3, 0x1f, 0xab, 0xca, 0x26, 0xf7, 0x07, 0xdf, 0xd0, 0xc8, 0xbf, + 0x82, 0x00, 0x36, 0x63, 0x03, 0x1f, 0xb7, 0xc2, 0x16, 0x1c, 0x00, 0x32, + 0x0a, 0x03, 0x1f, 0xbb, 0xc3, 0x1a, 0xe0, 0x00, 0x46, 0x41, 0xc4, 0x92, + 0x76, 0x00, 0x31, 0xd3, 0x03, 0x1f, 0xbf, 0xc2, 0x0f, 0x9b, 0x00, 0x35, + 0x7b, 0x03, 0x1f, 0xc3, 0xc3, 0xe5, 0x03, 0x00, 0x35, 0x9a, 0x03, 0x1f, + 0xc7, 0xc2, 0x00, 0xc2, 0x00, 0x32, 0x23, 0x03, 0x1f, 0xcb, 0xc7, 0xca, + 0x6f, 0x00, 
0x45, 0x68, 0xc2, 0x00, 0x4f, 0x00, 0x31, 0x63, 0x03, 0x1f, + 0xcf, 0x8a, 0x00, 0x34, 0xc2, 0x03, 0x1f, 0xd3, 0x47, 0xbd, 0x8a, 0xc3, + 0x1f, 0xd7, 0xc2, 0x00, 0x74, 0x00, 0x31, 0xcb, 0x03, 0x1f, 0xec, 0xc3, + 0x00, 0x49, 0x00, 0x31, 0x3b, 0x03, 0x1f, 0xf0, 0x87, 0x00, 0x36, 0xa8, + 0xc4, 0xe2, 0xe3, 0x00, 0x35, 0x4b, 0x03, 0x1f, 0xf4, 0x03, 0xc3, 0x1f, + 0xf8, 0x47, 0x06, 0x53, 0xc3, 0x20, 0x05, 0xc3, 0x14, 0x4b, 0x00, 0x31, + 0x72, 0x03, 0x20, 0x17, 0xc4, 0xe3, 0x1b, 0x00, 0x34, 0x33, 0x03, 0x20, + 0x1b, 0xc3, 0x2f, 0xc8, 0x00, 0x33, 0xcb, 0x03, 0x20, 0x28, 0xc2, 0x16, + 0x1c, 0x00, 0x31, 0x53, 0x03, 0x20, 0x35, 0xc2, 0x02, 0x98, 0x00, 0x31, + 0xbb, 0x03, 0x20, 0x42, 0x0a, 0x43, 0x20, 0x46, 0x00, 0xc3, 0x20, 0x5e, + 0xc2, 0x16, 0x1c, 0x00, 0x35, 0x32, 0x03, 0x20, 0x74, 0xc2, 0x16, 0x1c, + 0x00, 0x32, 0x53, 0x03, 0x20, 0x78, 0x97, 0x00, 0x36, 0x42, 0x03, 0x20, + 0x7c, 0xc2, 0x16, 0x1c, 0x00, 0x31, 0x8b, 0x03, 0x20, 0x80, 0xcb, 0x96, + 0x53, 0x00, 0x45, 0x61, 0xc4, 0x3a, 0x01, 0x00, 0x35, 0xdb, 0x03, 0x20, + 0x84, 0xc3, 0x72, 0xf0, 0x00, 0x34, 0x8a, 0x03, 0x20, 0x88, 0x8a, 0x00, + 0x31, 0x43, 0x03, 0x20, 0x8c, 0xc2, 0x16, 0x1c, 0x00, 0x33, 0xda, 0x03, + 0x20, 0x99, 0x42, 0x00, 0x2d, 0xc3, 0x20, 0x9d, 0x00, 0x43, 0x20, 0xa3, + 0x00, 0x43, 0x20, 0xb8, 0x00, 0x43, 0x20, 0xce, 0xc2, 0x00, 0x74, 0x00, + 0x31, 0x93, 0x03, 0x20, 0xde, 0x8a, 0x00, 0x31, 0xc2, 0x03, 0x20, 0xe2, + 0xcb, 0x8c, 0xdf, 0x00, 0x45, 0x89, 0xc2, 0x01, 0x9d, 0x00, 0x31, 0xab, + 0x03, 0x20, 0xe6, 0xc4, 0xdf, 0x93, 0x00, 0x31, 0xa3, 0x03, 0x20, 0xea, + 0xc8, 0xba, 0x32, 0x00, 0x35, 0x51, 0xc3, 0x03, 0x26, 0x00, 0x31, 0x9b, + 0x03, 0x20, 0xee, 0xcf, 0x07, 0x2a, 0x00, 0x33, 0x80, 0x03, 0xc3, 0x20, + 0xf2, 0x42, 0x0e, 0x9a, 0xc3, 0x21, 0x09, 0xc2, 0x03, 0x66, 0x00, 0x34, + 0x73, 0x03, 0x21, 0x19, 0xc3, 0x2b, 0xb9, 0x00, 0x34, 0x23, 0x03, 0x21, + 0x1d, 0x47, 0x3b, 0xc4, 0x43, 0x21, 0x21, 0x00, 0xc3, 0x21, 0x33, 0x8a, + 0x00, 0x35, 0x22, 0x03, 0x21, 0x3f, 0x00, 0x43, 0x21, 0x43, 0xc3, 0x12, + 0xc2, 0x00, 0x32, 0x2b, 0x03, 0x21, 0x55, 0xc3, 0x01, 0xc4, 0x00, 0x30, + 0xe0, 0x00, 0x43, 0x21, 0x59, 0x89, 0x00, 0x35, 0x6b, 0x03, 0x21, 0x65, + 0xc3, 0x01, 0x54, 0x00, 0x32, 0x33, 0x03, 0x21, 0x72, 0xc3, 0x2b, 0xb9, + 0x00, 0x34, 0x1a, 0x03, 0x21, 0x76, 0x03, 0xc3, 0x21, 0x7a, 0xc2, 0x16, + 0x1c, 0x00, 0x32, 0x3b, 0x03, 0x21, 0x8a, 0xc9, 0xae, 0xd6, 0x00, 0x33, + 0xa2, 0x03, 0x21, 0x8e, 0x4c, 0x73, 0x54, 0xc3, 0x21, 0x92, 0x46, 0x3b, + 0xc5, 0x43, 0x21, 0xfa, 0x8e, 0x0f, 0x70, 0x19, 0x86, 0x0f, 0x70, 0xc8, + 0x8a, 0x0f, 0x70, 0x41, 0x45, 0x14, 0xa8, 0x43, 0x22, 0x12, 0xc2, 0x16, + 0x1c, 0x0f, 0x70, 0xb1, 0xc2, 0x00, 0x65, 0x0f, 0x70, 0xc0, 0x03, 0xc3, + 0x22, 0x50, 0xc3, 0x85, 0xf5, 0x0f, 0x74, 0x09, 0xc4, 0x30, 0xc1, 0x0f, + 0x74, 0x11, 0x42, 0x0e, 0x9a, 0xc3, 0x22, 0x5c, 0x0a, 0xc3, 0x22, 0x64, + 0xc3, 0x7e, 0x89, 0x0f, 0x74, 0x29, 0x42, 0x02, 0x1c, 0xc3, 0x22, 0x70, + 0x16, 0xc3, 0x22, 0x7a, 0xc3, 0x2b, 0xb9, 0x0f, 0x74, 0x49, 0xc3, 0x0d, + 0xff, 0x0f, 0x74, 0x59, 0xc4, 0x19, 0x60, 0x0f, 0x74, 0x61, 0xc4, 0x3a, + 0x01, 0x0f, 0x74, 0x69, 0x15, 0xc3, 0x22, 0x8a, 0xc3, 0xb1, 0x0d, 0x0f, + 0x74, 0x81, 0xc3, 0x0f, 0x9a, 0x0f, 0x74, 0x91, 0xc3, 0x72, 0xf0, 0x0f, + 0x74, 0x99, 0xc4, 0x14, 0x4a, 0x0f, 0x74, 0xb9, 0xc5, 0x92, 0x75, 0x0f, + 0x74, 0xd8, 0xc3, 0x85, 0xf5, 0x0f, 0x73, 0x09, 0xc4, 0x30, 0xc1, 0x0f, + 0x73, 0x11, 0x0a, 0xc3, 0x22, 0x9c, 0x16, 0xc3, 0x22, 0xa8, 0xc3, 0x2b, + 0xb9, 0x0f, 0x73, 0x49, 0x0d, 0xc3, 0x22, 0xba, 0xc4, 0x19, 0x60, 0x0f, + 0x73, 0x61, 0xc4, 0x3a, 0x01, 0x0f, 0x73, 0x69, 0x15, 0xc3, 0x22, 0xc6, + 0xc3, 0x03, 
0x0c, 0x0f, 0x73, 0x79, 0xc3, 0xb1, 0x0d, 0x0f, 0x73, 0x81, + 0xc3, 0x0f, 0x9a, 0x0f, 0x73, 0x91, 0x06, 0xc3, 0x22, 0xd8, 0xc3, 0x74, + 0x6a, 0x0f, 0x73, 0xd1, 0xc5, 0x92, 0x75, 0x0f, 0x73, 0xd8, 0xc2, 0x16, + 0x1c, 0x0f, 0x71, 0x21, 0xc2, 0x02, 0x98, 0x0f, 0x71, 0x38, 0xc2, 0x0f, + 0x9b, 0x0f, 0x71, 0x51, 0xc3, 0x14, 0x4b, 0x0f, 0x71, 0xb8, 0xc3, 0x03, + 0x26, 0x0f, 0x71, 0x71, 0xc2, 0x01, 0x9d, 0x0f, 0x71, 0x89, 0xc4, 0xdf, + 0x93, 0x0f, 0x71, 0xa0, 0xc2, 0x16, 0x1c, 0x0f, 0x71, 0xa9, 0xc3, 0x64, + 0x77, 0x0f, 0x71, 0xb0, 0xc8, 0x33, 0xae, 0x00, 0x47, 0xf1, 0xcd, 0x00, + 0xfa, 0x07, 0xf3, 0xc1, 0xcb, 0x64, 0x7b, 0x07, 0xf3, 0xc8, 0xce, 0x00, + 0xf9, 0x07, 0xf3, 0x90, 0xc9, 0x16, 0x14, 0x00, 0x47, 0xa9, 0xc4, 0x00, + 0x9d, 0x00, 0x47, 0xa1, 0xc8, 0x02, 0x9f, 0x00, 0x32, 0xf0, 0xc2, 0x39, + 0x8b, 0x00, 0x47, 0x99, 0x44, 0x1d, 0xc8, 0x43, 0x22, 0xe4, 0xc9, 0xad, + 0xbf, 0x00, 0x47, 0x09, 0xc2, 0x01, 0x9d, 0x00, 0x46, 0xa9, 0xc3, 0x03, + 0x26, 0x00, 0x36, 0xe0, 0xce, 0x6f, 0x7e, 0x00, 0x47, 0x01, 0xc8, 0xbf, + 0x82, 0x00, 0x46, 0x50, 0xcb, 0x60, 0x7f, 0x00, 0x46, 0xc0, 0x8a, 0x00, + 0x46, 0x69, 0xc2, 0x00, 0x74, 0x00, 0x30, 0xb8, 0xdb, 0x17, 0xcd, 0x00, + 0x46, 0x58, 0xc4, 0x41, 0xc1, 0x00, 0x37, 0x21, 0x45, 0x30, 0xc2, 0x43, + 0x22, 0xfa, 0xc9, 0x02, 0xde, 0x00, 0x36, 0xd9, 0xc2, 0x02, 0x98, 0x00, + 0x30, 0xa8, 0xc7, 0xca, 0x7d, 0x00, 0x36, 0xc9, 0x48, 0x19, 0x9b, 0x43, + 0x23, 0x06, 0xc2, 0x39, 0x8b, 0x00, 0x46, 0x99, 0x44, 0x1d, 0xc8, 0x43, + 0x23, 0x18, 0xc5, 0x05, 0x02, 0x00, 0x46, 0x81, 0xcd, 0x00, 0xfa, 0x07, + 0xf3, 0xf1, 0xcb, 0x64, 0x7b, 0x07, 0xf3, 0xf8, 0x4b, 0x05, 0x29, 0xc3, + 0x23, 0x22, 0xc5, 0x05, 0x02, 0x07, 0xdd, 0xa9, 0xc5, 0x00, 0xd4, 0x07, + 0xdd, 0xa0, 0x53, 0x26, 0x03, 0xc3, 0x23, 0x2e, 0xc5, 0x05, 0x02, 0x07, + 0xdd, 0xb9, 0xc5, 0x00, 0xd4, 0x07, 0xdd, 0xb0, 0xc5, 0x05, 0x02, 0x07, + 0xdd, 0x99, 0xc5, 0x00, 0xd4, 0x07, 0xdd, 0x90, 0xd0, 0x5f, 0xd2, 0x00, + 0x37, 0xf1, 0xc9, 0x36, 0x20, 0x00, 0x37, 0xe8, 0xda, 0x1d, 0x22, 0x00, + 0x30, 0x81, 0xc4, 0xe4, 0x0f, 0x00, 0x30, 0x21, 0xc3, 0xa8, 0x39, 0x00, + 0x30, 0x19, 0xc3, 0x39, 0x71, 0x00, 0x30, 0x08, 0xce, 0x04, 0xf9, 0x00, + 0x44, 0x29, 0x4b, 0x97, 0x5b, 0xc3, 0x23, 0x3a, 0xce, 0x71, 0x06, 0x07, + 0xf3, 0x88, 0xc2, 0xe5, 0xfd, 0x0f, 0xb9, 0x88, 0xc8, 0x8c, 0x89, 0x0f, + 0xb9, 0x71, 0xc6, 0x4c, 0x49, 0x0f, 0xb9, 0x38, 0xcb, 0x03, 0xbc, 0x01, + 0x1a, 0xb9, 0xc6, 0xcd, 0xc1, 0x01, 0x1a, 0x60, 0xc2, 0x01, 0x6f, 0x01, + 0x1a, 0x68, 0xc5, 0x3a, 0x1b, 0x01, 0x19, 0xd1, 0xc4, 0x07, 0xb2, 0x01, + 0x19, 0xc8, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xd9, 0xc8, 0x4b, 0x94, 0x08, + 0x09, 0x20, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xd1, 0xc8, 0x4b, 0x94, 0x08, + 0x09, 0x18, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xe9, 0xc8, 0x4b, 0x94, 0x08, + 0x09, 0x30, 0xc7, 0x0d, 0x04, 0x08, 0x08, 0xe1, 0xc8, 0x4b, 0x94, 0x08, + 0x09, 0x28, 0xc7, 0x3a, 0x19, 0x0f, 0xdd, 0x71, 0x47, 0x04, 0xcb, 0xc3, + 0x23, 0x46, 0x46, 0x02, 0xae, 0xc3, 0x23, 0x52, 0xc5, 0x0d, 0x20, 0x01, + 0x2b, 0x98, 0xc2, 0x01, 0x48, 0x01, 0x2b, 0xbb, 0x03, 0x23, 0x64, 0x4a, + 0xa2, 0xa6, 0x43, 0x23, 0x6a, 0x0a, 0xc3, 0x23, 0x76, 0xc4, 0x00, 0x49, + 0x01, 0x28, 0xc1, 0xc5, 0x00, 0x2c, 0x01, 0x28, 0xa0, 0xc5, 0x00, 0x2c, + 0x01, 0x2b, 0x81, 0xc4, 0x00, 0x49, 0x01, 0x2b, 0x78, 0xc4, 0x00, 0x49, + 0x01, 0x2b, 0x71, 0xc5, 0x00, 0x2c, 0x01, 0x2b, 0x68, 0xca, 0x01, 0x68, + 0x01, 0x29, 0xe1, 0xc4, 0x00, 0x49, 0x01, 0x29, 0x21, 0xc5, 0x00, 0x2c, + 0x01, 0x28, 0xe0, 0xc9, 0x12, 0x0d, 0x01, 0x2b, 0xf9, 0xc3, 0x00, 0x4a, + 0x01, 0x28, 0xd8, 0xca, 0x01, 0x68, 0x01, 0x29, 0x99, 0xc4, 0x00, 0x49, + 0x01, 0x28, 
0x99, 0xc5, 0x00, 0x2c, 0x01, 0x28, 0x78, 0xca, 0x01, 0x68, + 0x01, 0x2b, 0x61, 0xc4, 0x00, 0x49, 0x01, 0x2b, 0x19, 0xc5, 0x00, 0x2c, + 0x01, 0x2b, 0x00, 0xc8, 0x11, 0xff, 0x01, 0x29, 0x49, 0xc5, 0x11, 0x39, + 0x01, 0x28, 0x88, 0xc8, 0x11, 0xff, 0x01, 0x29, 0x09, 0xc5, 0x11, 0x39, + 0x01, 0x28, 0x68, 0xc8, 0x11, 0x49, 0x01, 0x29, 0x39, 0xc5, 0x07, 0xeb, + 0x01, 0x28, 0x90, 0xc8, 0x11, 0x49, 0x01, 0x28, 0xf9, 0xc5, 0x07, 0xeb, + 0x01, 0x28, 0x70, 0xa3, 0x0f, 0xd9, 0xa0, 0xa3, 0x0f, 0xd9, 0x61, 0xa2, + 0x0f, 0xd8, 0xe8, 0xa3, 0x0f, 0xd9, 0xc0, 0xa3, 0x0f, 0xd9, 0xd0, 0xa3, + 0x0f, 0xd9, 0xd8, 0xd7, 0x2a, 0xf5, 0x0f, 0xd2, 0x60, 0xc5, 0x56, 0xa5, + 0x01, 0x32, 0xf3, 0x03, 0x23, 0x82, 0xc3, 0x00, 0x74, 0x01, 0x32, 0xd2, + 0x03, 0x23, 0x8c, 0x49, 0x2a, 0xf5, 0x43, 0x23, 0x92, 0x49, 0x2a, 0xf5, + 0x43, 0x23, 0x9e, 0x49, 0x2a, 0xf5, 0x43, 0x23, 0xaa, 0x49, 0x2a, 0xf5, + 0x43, 0x23, 0xb6, 0x0d, 0xc3, 0x23, 0xc2, 0xc5, 0xa8, 0xf7, 0x0f, 0xd1, + 0x29, 0xc4, 0xde, 0x83, 0x0f, 0xd1, 0x31, 0xc6, 0xca, 0xfd, 0x0f, 0xd1, + 0x39, 0xc4, 0xe3, 0x93, 0x0f, 0xd1, 0x48, 0xcf, 0x14, 0x22, 0x01, 0x5d, + 0x71, 0xcd, 0x1b, 0x41, 0x01, 0x5d, 0x60, 0xcf, 0x09, 0xf8, 0x01, 0x5d, + 0x41, 0xd0, 0x03, 0xb7, 0x01, 0x5d, 0x48, 0xcf, 0x09, 0xf8, 0x01, 0x5d, + 0x51, 0xd0, 0x03, 0xb7, 0x01, 0x5d, 0x58, 0xcd, 0x1b, 0x41, 0x01, 0x5d, + 0x69, 0xcf, 0x14, 0x22, 0x01, 0x5d, 0x78, 0x45, 0x00, 0x8c, 0xc3, 0x23, + 0xce, 0xca, 0xa0, 0x62, 0x01, 0x1f, 0xd0, 0x15, 0xc3, 0x23, 0xe0, 0xc7, + 0x3a, 0x19, 0x01, 0x59, 0x49, 0xc7, 0x0a, 0xe0, 0x01, 0x59, 0x50, 0xc8, + 0xbe, 0xa2, 0x01, 0x1f, 0xc9, 0xc6, 0x86, 0xfd, 0x0f, 0xa9, 0x91, 0xc7, + 0x5e, 0xa7, 0x01, 0x5e, 0x00, 0xd8, 0x23, 0xab, 0x0f, 0xbc, 0x19, 0xce, + 0x6c, 0x8a, 0x01, 0x2d, 0xf1, 0xc8, 0x01, 0x92, 0x01, 0x2d, 0xe1, 0xcf, + 0x65, 0x94, 0x01, 0x1f, 0x60, 0xcd, 0x7d, 0xc6, 0x01, 0x3a, 0xb1, 0xc4, + 0x22, 0xdc, 0x01, 0x33, 0x31, 0xcf, 0x6a, 0x44, 0x01, 0x4f, 0x51, 0xc7, + 0x5e, 0xa7, 0x01, 0x5e, 0x09, 0xc8, 0xb5, 0xe2, 0x01, 0x5e, 0xf0, 0xc4, + 0x5b, 0x26, 0x01, 0x36, 0x19, 0xc3, 0x12, 0xb8, 0x01, 0x36, 0x10, 0xd8, + 0x23, 0xab, 0x0f, 0xbc, 0x11, 0x12, 0xc3, 0x23, 0xec, 0xce, 0x6c, 0x8a, + 0x01, 0x2d, 0xc1, 0xc8, 0x01, 0x92, 0x01, 0x2d, 0xb3, 0x03, 0x23, 0xf8, + 0xcf, 0x65, 0x94, 0x01, 0x1f, 0x4a, 0x03, 0x23, 0xfe, 0xc5, 0x01, 0xa2, + 0x01, 0x3d, 0x0b, 0x03, 0x24, 0x04, 0xc6, 0x1c, 0xb4, 0x01, 0x02, 0x69, + 0xd5, 0x03, 0xd2, 0x01, 0x5c, 0xf0, 0xc5, 0x06, 0x82, 0x01, 0x30, 0xd9, + 0xce, 0x24, 0xd5, 0x0f, 0xac, 0xe8, 0xd8, 0x23, 0xab, 0x0f, 0xbc, 0x01, + 0xc7, 0x46, 0x3d, 0x01, 0x2e, 0x21, 0xce, 0x6c, 0x8a, 0x01, 0x2e, 0x11, + 0xc8, 0x01, 0x92, 0x01, 0x2e, 0x01, 0xcf, 0x65, 0x94, 0x01, 0x1f, 0x52, + 0x03, 0x24, 0x0a, 0xca, 0xa6, 0x34, 0x01, 0x36, 0xc1, 0x49, 0x01, 0xaa, + 0x43, 0x24, 0x10, 0xc6, 0x1c, 0xb4, 0x01, 0x02, 0x61, 0xd5, 0x03, 0xd2, + 0x01, 0x5c, 0xe0, 0xcd, 0x2f, 0x72, 0x01, 0x2f, 0x19, 0xce, 0x23, 0xb5, + 0x01, 0x2f, 0x10, 0x45, 0x03, 0x14, 0xc3, 0x24, 0x1c, 0xc5, 0x0b, 0x0a, + 0x01, 0x2f, 0xe0, 0xd5, 0x2e, 0xad, 0x01, 0x1f, 0xbb, 0x03, 0x24, 0x2e, + 0xc6, 0x3a, 0x1a, 0x01, 0x59, 0x28, 0xc8, 0x5e, 0xa6, 0x01, 0x5e, 0x28, + 0xc8, 0x5e, 0xa6, 0x01, 0x5e, 0x40, 0xd5, 0x32, 0x03, 0x01, 0x1f, 0xa3, + 0x03, 0x24, 0x34, 0xc6, 0x0a, 0xe1, 0x01, 0x59, 0x38, 0xce, 0x23, 0xb5, + 0x01, 0x2f, 0x29, 0xcd, 0x2f, 0x72, 0x01, 0x2f, 0x20, 0xce, 0x6c, 0x8a, + 0x01, 0x2d, 0xa1, 0xc8, 0x01, 0x92, 0x01, 0x2d, 0x91, 0xcf, 0x65, 0x94, + 0x01, 0x1f, 0x59, 0xd8, 0x23, 0xab, 0x0f, 0xbc, 0x08, 0xc5, 0x22, 0xdb, + 0x01, 0x33, 0x28, 0x46, 0x00, 0x8b, 0x43, 0x24, 0x3a, 0xcd, 0x7c, 0x8e, + 0x00, 0xdb, 
0x88, 0xcd, 0x7c, 0x8e, 0x00, 0xdb, 0x80, 0x00, 0x43, 0x24, + 0x54, 0xc4, 0xb5, 0x3e, 0x00, 0xd9, 0x19, 0xcf, 0x60, 0x21, 0x00, 0xd8, + 0xf1, 0xc5, 0xdc, 0x9f, 0x00, 0xd8, 0xe8, 0xc9, 0x60, 0x27, 0x00, 0xd9, + 0x01, 0xc9, 0xb3, 0x8c, 0x00, 0xd8, 0xf8, 0xc4, 0xa1, 0x14, 0x00, 0xd9, + 0xfb, 0x03, 0x24, 0x60, 0xc6, 0xc2, 0x6d, 0x00, 0xda, 0x00, 0x97, 0x0b, + 0x50, 0x29, 0x83, 0x0b, 0x50, 0x19, 0xc2, 0x00, 0xb0, 0x0b, 0x51, 0xb1, + 0x91, 0x0b, 0x51, 0x79, 0x07, 0xc3, 0x24, 0x66, 0xc3, 0x17, 0x29, 0x0b, + 0x50, 0xb0, 0xc4, 0xbf, 0xf1, 0x0b, 0x51, 0xb9, 0x0a, 0xc3, 0x24, 0x6e, + 0xc3, 0xd7, 0xe2, 0x0b, 0x50, 0xa9, 0x8b, 0x0b, 0x50, 0xa1, 0xc2, 0x5d, + 0xa1, 0x0b, 0x50, 0x90, 0xc2, 0x00, 0x3d, 0x0b, 0x51, 0xa9, 0x03, 0x43, + 0x24, 0x7c, 0x04, 0xc3, 0x24, 0x84, 0x91, 0x0b, 0x51, 0x99, 0x83, 0x0b, + 0x51, 0x91, 0xc4, 0xe2, 0x9f, 0x0b, 0x50, 0x68, 0x07, 0xc3, 0x24, 0x90, + 0x97, 0x0b, 0x51, 0x19, 0x0b, 0x43, 0x24, 0x9e, 0xc2, 0x7f, 0xc0, 0x0b, + 0x51, 0x71, 0x8b, 0x0b, 0x51, 0x69, 0x83, 0x0b, 0x50, 0x50, 0x83, 0x0b, + 0x51, 0x61, 0xc2, 0x0f, 0xe1, 0x0b, 0x51, 0x08, 0xc3, 0x8b, 0xa9, 0x0b, + 0x51, 0x51, 0x07, 0x43, 0x24, 0xa8, 0x09, 0xc3, 0x24, 0xb2, 0x8b, 0x0b, + 0x51, 0x21, 0xc3, 0x14, 0x09, 0x0b, 0x51, 0x01, 0xc3, 0x01, 0xe2, 0x0b, + 0x50, 0xf1, 0x0c, 0xc3, 0x24, 0xbe, 0x97, 0x0b, 0x50, 0xcb, 0x03, 0x24, + 0xca, 0xc3, 0x4f, 0x43, 0x0b, 0x50, 0x79, 0xc2, 0x16, 0x5a, 0x0b, 0x50, + 0x48, 0x83, 0x0b, 0x50, 0xe9, 0xc2, 0x7f, 0xc0, 0x0b, 0x50, 0xd8, 0x0a, + 0xc3, 0x24, 0xd0, 0x42, 0x00, 0x51, 0x43, 0x24, 0xe0, 0x17, 0xc3, 0x24, + 0xea, 0xc3, 0xd7, 0xe2, 0x0b, 0x4c, 0xf0, 0xc4, 0xe3, 0xf7, 0x0b, 0x4b, + 0xa1, 0x8b, 0x0b, 0x4f, 0xf1, 0x91, 0x0b, 0x4f, 0xc9, 0x07, 0xc3, 0x24, + 0xf2, 0x17, 0x43, 0x24, 0xfa, 0x09, 0xc3, 0x25, 0x0a, 0x06, 0xc3, 0x25, + 0x29, 0x42, 0x01, 0xe2, 0xc3, 0x25, 0x37, 0x83, 0x0b, 0x4f, 0xb3, 0x03, + 0x25, 0x41, 0x0c, 0xc3, 0x25, 0x45, 0x16, 0xc3, 0x25, 0x4f, 0x1c, 0xc3, + 0x25, 0x5b, 0x43, 0x70, 0x51, 0xc3, 0x25, 0x67, 0xc3, 0xbc, 0x2f, 0x0b, + 0x4d, 0x40, 0x03, 0xc3, 0x25, 0x73, 0x11, 0xc3, 0x25, 0x88, 0x07, 0xc3, + 0x25, 0x93, 0x17, 0x43, 0x25, 0x9e, 0x97, 0x0b, 0x4d, 0x03, 0x03, 0x25, + 0xab, 0x03, 0xc3, 0x25, 0xb7, 0x8b, 0x0b, 0x4f, 0xbb, 0x03, 0x25, 0xc4, + 0x07, 0xc3, 0x25, 0xc8, 0x91, 0x0b, 0x4c, 0xc2, 0x03, 0x25, 0xd2, 0x03, + 0xc3, 0x25, 0xd8, 0xc3, 0xd7, 0xe2, 0x0b, 0x4f, 0x79, 0xc5, 0xd4, 0x39, + 0x0b, 0x4c, 0x10, 0xc2, 0x00, 0x7a, 0x0b, 0x4b, 0x69, 0x0a, 0xc3, 0x25, + 0xe0, 0xc4, 0xb5, 0x1a, 0x0b, 0x4c, 0xd9, 0x07, 0xc3, 0x25, 0xf3, 0xc2, + 0x04, 0xc6, 0x0b, 0x4c, 0x28, 0x11, 0xc3, 0x25, 0xfb, 0x03, 0xc3, 0x26, + 0x07, 0x97, 0x0b, 0x4f, 0x69, 0xc5, 0xdc, 0xea, 0x0b, 0x4d, 0x98, 0xc2, + 0x00, 0x7a, 0x0b, 0x4b, 0x51, 0x07, 0x43, 0x26, 0x15, 0x42, 0x00, 0x51, + 0xc3, 0x26, 0x1f, 0xc2, 0x00, 0x45, 0x0b, 0x4f, 0xf9, 0x83, 0x0b, 0x4f, + 0xdb, 0x03, 0x26, 0x29, 0xc2, 0x00, 0xc4, 0x0b, 0x4f, 0xd1, 0x8b, 0x0b, + 0x4f, 0x73, 0x03, 0x26, 0x38, 0xc2, 0x07, 0xb2, 0x0b, 0x4e, 0x49, 0xc3, + 0x8b, 0xa9, 0x0b, 0x4e, 0x31, 0xc4, 0xe0, 0x4f, 0x0b, 0x4d, 0x79, 0x42, + 0x1f, 0xad, 0x43, 0x26, 0x3e, 0x83, 0x0b, 0x4d, 0xdb, 0x03, 0x26, 0x48, + 0x17, 0xc3, 0x26, 0x4c, 0xc2, 0x02, 0xe0, 0x0b, 0x4f, 0x59, 0xc2, 0x00, + 0x7a, 0x0b, 0x4e, 0x98, 0x17, 0xc3, 0x26, 0x57, 0x43, 0x8a, 0x2d, 0xc3, + 0x26, 0x6b, 0x42, 0x2c, 0x43, 0xc3, 0x26, 0x77, 0x0b, 0xc3, 0x26, 0x88, + 0xc2, 0x00, 0xb6, 0x0b, 0x4d, 0x60, 0x09, 0xc3, 0x26, 0x92, 0x15, 0xc3, + 0x26, 0x9a, 0x16, 0xc3, 0x26, 0xaa, 0x06, 0xc3, 0x26, 0xb4, 0x8b, 0x0b, + 0x4a, 0xd9, 0x97, 0x0b, 0x4a, 0xb9, 0x1b, 0xc3, 0x26, 0xc4, 0x0c, 0x43, + 0x26, 0xda, 
0x07, 0xc3, 0x26, 0xf3, 0xc2, 0x7f, 0xc0, 0x0b, 0x4a, 0xf9, + 0xc2, 0x01, 0xdf, 0x0b, 0x48, 0xf1, 0xc3, 0x8f, 0x8a, 0x0b, 0x47, 0xb0, + 0x03, 0xc3, 0x27, 0x01, 0x07, 0xc3, 0x27, 0x0d, 0x04, 0xc3, 0x27, 0x17, + 0xc3, 0x9c, 0xc7, 0x0b, 0x4a, 0xf1, 0x97, 0x0b, 0x4a, 0x99, 0x08, 0xc3, + 0x27, 0x26, 0x42, 0x1f, 0xad, 0xc3, 0x27, 0x39, 0xc3, 0x07, 0x85, 0x0b, + 0x48, 0xc8, 0x07, 0xc3, 0x27, 0x4b, 0x97, 0x0b, 0x48, 0x8b, 0x03, 0x27, + 0x55, 0x8b, 0x0b, 0x4b, 0x09, 0xc2, 0x7f, 0xc0, 0x0b, 0x4a, 0x61, 0xc2, + 0x10, 0x11, 0x0b, 0x4a, 0x58, 0x97, 0x0b, 0x4a, 0x4b, 0x03, 0x27, 0x5b, + 0xc3, 0x17, 0x29, 0x0b, 0x4a, 0xb1, 0x07, 0xc3, 0x27, 0x69, 0xc4, 0xde, + 0xb3, 0x0b, 0x49, 0x08, 0x17, 0xc3, 0x27, 0x71, 0x03, 0xc3, 0x27, 0x7f, + 0x0a, 0xc3, 0x27, 0x87, 0xc2, 0x01, 0xbb, 0x0b, 0x49, 0x21, 0xc5, 0x8b, + 0xa8, 0x0b, 0x48, 0x60, 0xc8, 0xb5, 0xc2, 0x0b, 0x48, 0xa1, 0xc2, 0x04, + 0xc6, 0x0b, 0x4b, 0x28, 0xc6, 0xcb, 0x1b, 0x0b, 0x48, 0x29, 0x17, 0xc3, + 0x27, 0x9b, 0xc2, 0x00, 0xc4, 0x0b, 0x48, 0x68, 0x43, 0x03, 0x27, 0xc3, + 0x27, 0xa5, 0xc2, 0x25, 0x9f, 0x0b, 0x4a, 0x71, 0xc3, 0x7c, 0x57, 0x0b, + 0x49, 0x38, 0x17, 0xc3, 0x27, 0xb1, 0x07, 0xc3, 0x27, 0xbb, 0xc2, 0x00, + 0xb6, 0x0b, 0x49, 0xa9, 0xc2, 0x00, 0x7e, 0x0b, 0x49, 0x68, 0xc4, 0x8b, + 0xa8, 0x0b, 0x4a, 0x41, 0xc2, 0x04, 0xc6, 0x0b, 0x48, 0x90, 0xc4, 0xb5, + 0x1a, 0x0b, 0x47, 0xd9, 0xc2, 0x00, 0xb6, 0x0b, 0x47, 0x90, 0x07, 0xc3, + 0x27, 0xc5, 0x17, 0xc3, 0x27, 0xd3, 0xc2, 0x04, 0xc6, 0x0b, 0x45, 0x49, + 0xc5, 0x5c, 0x98, 0x0b, 0x45, 0x40, 0x0a, 0xc3, 0x27, 0xdd, 0x07, 0xc3, + 0x27, 0xe9, 0xc4, 0xa1, 0xee, 0x0b, 0x45, 0x78, 0x07, 0xc3, 0x27, 0xf5, + 0x42, 0x00, 0x8d, 0xc3, 0x27, 0xff, 0xc6, 0xcf, 0xc5, 0x0b, 0x45, 0x60, + 0xc2, 0x00, 0xc4, 0x0b, 0x47, 0x79, 0x0b, 0x43, 0x28, 0x0b, 0xc2, 0x14, + 0xbe, 0x0b, 0x47, 0x69, 0x97, 0x0b, 0x46, 0x69, 0x03, 0x43, 0x28, 0x15, + 0x03, 0xc3, 0x28, 0x1d, 0x09, 0xc3, 0x28, 0x27, 0x0c, 0xc3, 0x28, 0x3b, + 0x06, 0xc3, 0x28, 0x49, 0x15, 0xc3, 0x28, 0x5f, 0x16, 0xc3, 0x28, 0x79, + 0x1c, 0xc3, 0x28, 0x89, 0xd0, 0x5c, 0x92, 0x0b, 0x44, 0xc8, 0xc3, 0x8b, + 0xa9, 0x0b, 0x47, 0x39, 0xc3, 0x8f, 0x8a, 0x0b, 0x47, 0x31, 0x04, 0xc3, + 0x28, 0x93, 0x03, 0xc3, 0x28, 0xa6, 0xc6, 0xd1, 0x93, 0x0b, 0x45, 0xc0, + 0x17, 0xc3, 0x28, 0xae, 0xc2, 0x04, 0xc6, 0x0b, 0x46, 0xc9, 0xc3, 0x92, + 0xb4, 0x0b, 0x45, 0x38, 0xc2, 0x02, 0xae, 0x0b, 0x46, 0x89, 0xc7, 0xc5, + 0xbb, 0x0b, 0x44, 0x90, 0xc5, 0xdb, 0x0a, 0x0b, 0x46, 0x09, 0x9a, 0x0b, + 0x45, 0x88, 0x42, 0x00, 0xd0, 0xc3, 0x28, 0xbe, 0xc4, 0xe1, 0xb7, 0x0b, + 0x44, 0xc0, 0x09, 0xc3, 0x28, 0xc8, 0x15, 0xc3, 0x28, 0xd8, 0x1b, 0xc3, + 0x28, 0xe4, 0xc7, 0xc2, 0x81, 0x0b, 0x43, 0x29, 0xcb, 0x8f, 0x89, 0x0b, + 0x43, 0x20, 0x08, 0xc3, 0x28, 0xf0, 0x83, 0x0b, 0x44, 0x63, 0x03, 0x28, + 0xfc, 0x04, 0xc3, 0x29, 0x02, 0x42, 0x2c, 0x43, 0xc3, 0x29, 0x18, 0xc7, + 0xc5, 0xd7, 0x0b, 0x43, 0xf8, 0xc2, 0x00, 0x8d, 0x0b, 0x43, 0x39, 0xc6, + 0xcb, 0x4b, 0x0b, 0x44, 0x09, 0xc4, 0xdb, 0x8e, 0x0b, 0x43, 0x91, 0xc5, + 0xd9, 0x48, 0x0b, 0x43, 0x08, 0xc4, 0xdc, 0xeb, 0x0b, 0x43, 0x31, 0x90, + 0x0b, 0x43, 0x78, 0x0b, 0xc3, 0x29, 0x22, 0x42, 0x2c, 0x43, 0xc3, 0x29, + 0x2c, 0xc2, 0x00, 0xc2, 0x0b, 0x43, 0x00, 0xc2, 0x00, 0x3d, 0x0b, 0x44, + 0x49, 0x03, 0xc3, 0x29, 0x3e, 0xc8, 0xb6, 0x02, 0x0b, 0x42, 0xd8, 0x87, + 0x0b, 0x44, 0x29, 0xc2, 0xd0, 0x00, 0x0b, 0x44, 0x18, 0xc2, 0x0f, 0xe1, + 0x0b, 0x43, 0xe9, 0xc6, 0xcd, 0x3d, 0x0b, 0x43, 0xb9, 0x42, 0x01, 0x7f, + 0xc3, 0x29, 0x4a, 0xc5, 0xdd, 0xee, 0x0b, 0x42, 0xd1, 0xc3, 0x8f, 0x8a, + 0x0b, 0x42, 0xc8, 0xc3, 0x76, 0x32, 0x0b, 0x43, 0xc1, 0x42, 0x03, 0x53, + 0x43, 0x29, 
0x56, 0xcc, 0x82, 0x71, 0x0b, 0x43, 0x11, 0xc5, 0xdc, 0x9a, + 0x0b, 0x42, 0xf0, 0x11, 0xc3, 0x29, 0x62, 0x0a, 0xc3, 0x29, 0x70, 0xc3, + 0x40, 0xe6, 0x0b, 0x41, 0x19, 0xc2, 0x5d, 0xa1, 0x0b, 0x40, 0xa9, 0xc6, + 0xce, 0x69, 0x0b, 0x40, 0x88, 0x42, 0x2c, 0x43, 0xc3, 0x29, 0x7e, 0x17, + 0xc3, 0x29, 0x8a, 0xc8, 0xb7, 0xb2, 0x0b, 0x40, 0x30, 0xc3, 0xe5, 0x6c, + 0x0b, 0x41, 0xd9, 0x03, 0xc3, 0x29, 0x96, 0xc3, 0x8f, 0x91, 0x0b, 0x41, + 0xa9, 0x07, 0x43, 0x29, 0xa0, 0x03, 0xc3, 0x29, 0xaa, 0x42, 0x01, 0x5d, + 0xc3, 0x29, 0xba, 0x11, 0xc3, 0x29, 0xc4, 0xcb, 0x92, 0xac, 0x0b, 0x41, + 0x29, 0xc5, 0xd1, 0x93, 0x0b, 0x41, 0x21, 0xc9, 0xb5, 0x18, 0x0b, 0x40, + 0x80, 0x03, 0xc3, 0x29, 0xd0, 0xc2, 0x00, 0xc4, 0x0b, 0x42, 0xa1, 0x42, + 0x01, 0xe2, 0xc3, 0x29, 0xda, 0x1b, 0xc3, 0x29, 0xe4, 0xc3, 0xe4, 0x60, + 0x0b, 0x42, 0x39, 0x09, 0xc3, 0x29, 0xf1, 0x0d, 0xc3, 0x2a, 0x03, 0x16, + 0xc3, 0x2a, 0x0f, 0x42, 0x0e, 0x9a, 0xc3, 0x2a, 0x1e, 0xc3, 0x3d, 0xb5, + 0x0b, 0x41, 0x61, 0x1c, 0x43, 0x2a, 0x2a, 0x97, 0x0b, 0x42, 0x9b, 0x03, + 0x2a, 0x36, 0xc5, 0x8e, 0x46, 0x0b, 0x41, 0xc1, 0xc6, 0xd0, 0xa9, 0x0b, + 0x40, 0xc1, 0xc4, 0xe1, 0x8f, 0x0b, 0x40, 0xb8, 0x03, 0xc3, 0x2a, 0x3c, + 0xc2, 0x02, 0xae, 0x0b, 0x41, 0x69, 0xc2, 0x00, 0x3d, 0x0b, 0x41, 0x51, + 0x43, 0x01, 0x55, 0x43, 0x2a, 0x52, 0xc6, 0xcc, 0xe9, 0x0b, 0x42, 0x21, + 0xc8, 0xbb, 0xaa, 0x0b, 0x41, 0x00, 0x45, 0xcf, 0x0c, 0xc3, 0x2a, 0x5e, + 0xc8, 0xbe, 0x62, 0x0b, 0x40, 0x08, 0xc2, 0x0d, 0xf6, 0x00, 0xde, 0xd1, + 0xc2, 0x00, 0xc1, 0x00, 0xde, 0x51, 0xc2, 0x00, 0xd0, 0x00, 0xde, 0x20, + 0xcf, 0x67, 0x92, 0x00, 0x4f, 0x81, 0xce, 0x6e, 0x74, 0x00, 0x4f, 0x88, + 0x94, 0x00, 0x4f, 0x00, 0x8e, 0x00, 0x4f, 0x08, 0xa0, 0x01, 0x40, 0x3b, + 0x03, 0x2a, 0x6a, 0xa1, 0x01, 0x40, 0x5b, 0x03, 0x2a, 0x8a, 0xa2, 0x01, + 0x40, 0x9b, 0x03, 0x2a, 0xa3, 0xa3, 0x01, 0x41, 0x1b, 0x03, 0x2a, 0xb5, + 0xa5, 0x01, 0x44, 0x19, 0xa4, 0x01, 0x42, 0x1a, 0x03, 0x2a, 0xc0, 0xa1, + 0x01, 0x40, 0x6b, 0x03, 0x2a, 0xc4, 0xa2, 0x01, 0x40, 0xab, 0x03, 0x2a, + 0xdd, 0xa3, 0x01, 0x41, 0x2b, 0x03, 0x2a, 0xef, 0xa5, 0x01, 0x44, 0x29, + 0xa4, 0x01, 0x42, 0x2a, 0x03, 0x2a, 0xfa, 0xa2, 0x01, 0x40, 0xcb, 0x03, + 0x2a, 0xfe, 0xa3, 0x01, 0x41, 0x4b, 0x03, 0x2b, 0x10, 0xa5, 0x01, 0x44, + 0x49, 0xa4, 0x01, 0x42, 0x4a, 0x03, 0x2b, 0x1b, 0xa3, 0x01, 0x41, 0x8b, + 0x03, 0x2b, 0x1f, 0xa5, 0x01, 0x44, 0x89, 0xa4, 0x01, 0x42, 0x8a, 0x03, + 0x2b, 0x2a, 0xa5, 0x01, 0x45, 0x09, 0xa4, 0x01, 0x43, 0x0a, 0x03, 0x2b, + 0x2e, 0xa5, 0x01, 0x46, 0x08, 0xa1, 0x01, 0x40, 0x73, 0x03, 0x2b, 0x32, + 0xa2, 0x01, 0x40, 0xb3, 0x03, 0x2b, 0x4b, 0xa3, 0x01, 0x41, 0x33, 0x03, + 0x2b, 0x5d, 0xa5, 0x01, 0x44, 0x31, 0xa4, 0x01, 0x42, 0x32, 0x03, 0x2b, + 0x68, 0xa2, 0x01, 0x40, 0xd3, 0x03, 0x2b, 0x6c, 0xa3, 0x01, 0x41, 0x53, + 0x03, 0x2b, 0x7e, 0xa5, 0x01, 0x44, 0x51, 0xa4, 0x01, 0x42, 0x52, 0x03, + 0x2b, 0x89, 0xa3, 0x01, 0x41, 0x93, 0x03, 0x2b, 0x8d, 0xa5, 0x01, 0x44, + 0x91, 0xa4, 0x01, 0x42, 0x92, 0x03, 0x2b, 0x98, 0xa5, 0x01, 0x45, 0x11, + 0xa4, 0x01, 0x43, 0x12, 0x03, 0x2b, 0x9c, 0xa5, 0x01, 0x46, 0x10, 0xa2, + 0x01, 0x40, 0xe3, 0x03, 0x2b, 0xa0, 0xa3, 0x01, 0x41, 0x63, 0x03, 0x2b, + 0xb2, 0xa5, 0x01, 0x44, 0x61, 0xa4, 0x01, 0x42, 0x62, 0x03, 0x2b, 0xbd, + 0xa3, 0x01, 0x41, 0xa3, 0x03, 0x2b, 0xc1, 0xa5, 0x01, 0x44, 0xa1, 0xa4, + 0x01, 0x42, 0xa2, 0x03, 0x2b, 0xcc, 0xa5, 0x01, 0x45, 0x21, 0xa4, 0x01, + 0x43, 0x22, 0x03, 0x2b, 0xd0, 0xa5, 0x01, 0x46, 0x20, 0xa3, 0x01, 0x41, + 0xc3, 0x03, 0x2b, 0xd4, 0xa5, 0x01, 0x44, 0xc1, 0xa4, 0x01, 0x42, 0xc2, + 0x03, 0x2b, 0xdf, 0xa5, 0x01, 0x45, 0x41, 0xa4, 0x01, 0x43, 0x42, 0x03, + 0x2b, 0xe3, 
0xa5, 0x01, 0x46, 0x40, 0xa5, 0x01, 0x45, 0x81, 0xa4, 0x01, + 0x43, 0x82, 0x03, 0x2b, 0xe7, 0xa5, 0x01, 0x46, 0x80, 0xa5, 0x01, 0x47, + 0x00, 0x83, 0x08, 0x83, 0xa9, 0xc2, 0x00, 0xdb, 0x08, 0x81, 0xa8, 0x91, + 0x08, 0x83, 0x91, 0x87, 0x08, 0x83, 0x88, 0x8e, 0x08, 0x80, 0x70, 0x94, + 0x08, 0x80, 0x60, 0x91, 0x08, 0x83, 0xa1, 0x87, 0x08, 0x83, 0x98, 0x8e, + 0x08, 0x82, 0x08, 0x94, 0x08, 0x81, 0xf8, 0xc4, 0x99, 0xff, 0x0e, 0x87, + 0xa9, 0xc3, 0x2e, 0xd7, 0x0e, 0x84, 0x78, 0xc5, 0xa9, 0xe5, 0x0e, 0x84, + 0x89, 0xc8, 0xb2, 0xd8, 0x0e, 0x84, 0x80, 0xc4, 0x99, 0xff, 0x0e, 0x87, + 0x91, 0xc4, 0xe4, 0xa7, 0x0e, 0x87, 0x81, 0xc3, 0x2e, 0xd7, 0x0e, 0x82, + 0x70, 0xc3, 0x63, 0x2b, 0x0e, 0x84, 0x19, 0x03, 0x43, 0x2b, 0xeb, 0xd0, + 0x32, 0xc5, 0x0e, 0x85, 0x69, 0xcd, 0x77, 0x2c, 0x0e, 0x82, 0x90, 0x00, + 0x43, 0x2b, 0xf7, 0xc9, 0xb0, 0x35, 0x0e, 0x87, 0x29, 0xc7, 0xc5, 0x83, + 0x0e, 0x87, 0x20, 0xc9, 0xb0, 0x35, 0x0e, 0x87, 0x09, 0xc7, 0xc5, 0x83, + 0x0e, 0x87, 0x00, 0xc5, 0xa9, 0xe5, 0x0e, 0x84, 0xa9, 0x49, 0xb2, 0xd8, + 0x43, 0x2c, 0x03, 0xc5, 0xd9, 0x3e, 0x0e, 0x86, 0xd9, 0xc4, 0x80, 0xbc, + 0x0e, 0x86, 0xd0, 0xd5, 0x35, 0xb4, 0x0e, 0x86, 0x99, 0xc8, 0x2e, 0x8e, + 0x0e, 0x86, 0x70, 0xc3, 0x2e, 0xd7, 0x0e, 0x86, 0x11, 0xc4, 0x99, 0xff, + 0x0e, 0x86, 0x08, 0xc3, 0x15, 0x30, 0x0e, 0x82, 0x19, 0xc7, 0x9c, 0xe1, + 0x0e, 0x81, 0xb0, 0xc2, 0x6d, 0x08, 0x0e, 0x83, 0xb9, 0xc2, 0x00, 0xfb, + 0x0e, 0x83, 0xb0, 0xc3, 0x63, 0x2b, 0x0e, 0x82, 0xf1, 0xc8, 0x9c, 0xe0, + 0x0e, 0x81, 0xf0, 0xc6, 0x04, 0xe1, 0x0f, 0xd9, 0xe1, 0xc5, 0x00, 0x2c, + 0x0f, 0xd9, 0xe8, 0x55, 0x0a, 0x4c, 0xc3, 0x2c, 0x0f, 0x48, 0x0a, 0x53, + 0xc3, 0x2c, 0x21, 0x4a, 0x13, 0xe3, 0x43, 0x2c, 0x2d, 0xc6, 0x04, 0xe1, + 0x0f, 0xda, 0x19, 0xc5, 0x00, 0x2c, 0x0f, 0xda, 0x21, 0xcc, 0x04, 0xcb, + 0x0f, 0xda, 0x30, 0x46, 0x02, 0xae, 0xc3, 0x2c, 0x39, 0xd2, 0x4c, 0x37, + 0x0f, 0xda, 0x40, 0xd2, 0x4c, 0x37, 0x0f, 0xda, 0x39, 0x46, 0x02, 0xae, + 0x43, 0x2c, 0x45, 0xc7, 0x80, 0x70, 0x01, 0x53, 0x11, 0xc8, 0x52, 0x09, + 0x01, 0x53, 0x18, 0x16, 0xc3, 0x2c, 0x51, 0xd0, 0x57, 0xa2, 0x01, 0x3e, + 0xd0, 0x49, 0x09, 0xb3, 0xc3, 0x2c, 0x5d, 0xd0, 0x06, 0xd7, 0x0f, 0xdb, + 0xe0, 0x49, 0x09, 0xb3, 0xc3, 0x2c, 0x63, 0xd0, 0x06, 0xd7, 0x0f, 0xdb, + 0xe8, 0xc9, 0x33, 0xad, 0x01, 0x4c, 0x88, 0x16, 0xc3, 0x2c, 0x69, 0xc9, + 0x3b, 0x79, 0x0f, 0xc8, 0x19, 0xc3, 0x02, 0xa3, 0x0f, 0xc8, 0x30, 0xc6, + 0x02, 0xd1, 0x01, 0x2e, 0xb1, 0xc4, 0x0e, 0x6a, 0x01, 0x5f, 0x40, 0x45, + 0x00, 0x8c, 0xc3, 0x2c, 0x75, 0xd4, 0x3b, 0x4c, 0x01, 0x4a, 0x40, 0xc6, + 0x01, 0x73, 0x01, 0x0e, 0x71, 0xcf, 0x2c, 0x35, 0x01, 0x48, 0x20, 0xc5, + 0x78, 0x04, 0x01, 0x02, 0x29, 0x48, 0xbc, 0xfa, 0xc3, 0x2c, 0x87, 0xc8, + 0x52, 0x09, 0x01, 0x4c, 0x59, 0xc6, 0x01, 0x73, 0x01, 0x72, 0xa9, 0xcd, + 0x75, 0xa6, 0x01, 0x72, 0xb8, 0xc5, 0x01, 0xa2, 0x01, 0x5b, 0x03, 0x03, + 0x2c, 0x93, 0xcc, 0x82, 0xb9, 0x01, 0x5b, 0x51, 0xcd, 0x7c, 0xa8, 0x01, + 0x5c, 0x20, 0x45, 0x00, 0x8c, 0xc3, 0x2c, 0x97, 0xc8, 0xae, 0xbc, 0x01, + 0x59, 0xb0, 0x45, 0x03, 0x14, 0xc3, 0x2c, 0xa7, 0xc5, 0x01, 0x74, 0x01, + 0x0c, 0xd0, 0xd4, 0x2d, 0x64, 0x01, 0x0f, 0xd1, 0xc9, 0xb3, 0xf8, 0x01, + 0x59, 0xc0, 0xc3, 0x7e, 0x79, 0x01, 0x0d, 0x59, 0xd7, 0x22, 0x5c, 0x0f, + 0xc0, 0x40, 0xc3, 0x14, 0xa7, 0x01, 0x0d, 0x13, 0x03, 0x2c, 0xb3, 0x43, + 0x00, 0x7e, 0x43, 0x2c, 0xb9, 0xc2, 0x00, 0xb1, 0x01, 0x0f, 0x23, 0x03, + 0x2c, 0xc5, 0xcc, 0x56, 0x78, 0x01, 0x48, 0xe8, 0xc6, 0x0e, 0xa4, 0x01, + 0x4b, 0xd1, 0xc9, 0x00, 0xca, 0x01, 0x4b, 0xb9, 0x9a, 0x01, 0x59, 0xf0, + 0xce, 0x33, 0x92, 0x01, 0x4b, 0x99, 0xd6, 0x2f, 0x5c, 0x01, 0x4a, 0x19, + 0x48, 0x61, 
0xd4, 0xc3, 0x2c, 0xcb, 0xcf, 0x6a, 0x8f, 0x01, 0x5a, 0x50, + 0xe0, 0x06, 0xc7, 0x0f, 0xdd, 0xa8, 0x45, 0x00, 0x8c, 0xc3, 0x2c, 0xd7, + 0xc8, 0xae, 0xbc, 0x01, 0x48, 0x30, 0x44, 0x03, 0xc8, 0xc3, 0x2c, 0xe3, + 0x42, 0x02, 0xae, 0x43, 0x2c, 0xed, 0xc6, 0x00, 0x2b, 0x01, 0x54, 0x18, + 0xc3, 0xe5, 0xea, 0x08, 0x3a, 0x71, 0xc3, 0x52, 0x99, 0x08, 0x3a, 0x69, + 0xc3, 0xdf, 0xaf, 0x08, 0x3a, 0x79, 0xc7, 0xc0, 0xc8, 0x08, 0x3a, 0x81, + 0xc5, 0xd6, 0x5f, 0x08, 0x3a, 0x89, 0xc4, 0xe2, 0x8b, 0x08, 0x3a, 0x91, + 0xc4, 0xe1, 0xd3, 0x08, 0x3a, 0x98, 0x26, 0xc3, 0x2c, 0xf7, 0xc3, 0xb6, + 0x4a, 0x08, 0x3a, 0x39, 0xc3, 0xd8, 0x0d, 0x08, 0x3a, 0x31, 0xc3, 0xd3, + 0xaf, 0x08, 0x3a, 0x29, 0xc3, 0xe2, 0x7b, 0x08, 0x3a, 0x21, 0xc3, 0xe6, + 0x32, 0x08, 0x3a, 0x19, 0xc3, 0xe6, 0x65, 0x08, 0x3a, 0x11, 0xc3, 0xe1, + 0x37, 0x08, 0x3a, 0x09, 0xc3, 0xc7, 0x9e, 0x08, 0x3a, 0x00, 0x9e, 0x08, + 0x39, 0x99, 0x9f, 0x08, 0x39, 0xa1, 0xa0, 0x08, 0x39, 0xa9, 0xa1, 0x08, + 0x39, 0xb1, 0x9d, 0x08, 0x39, 0x90, 0x9d, 0x08, 0x38, 0x19, 0x9e, 0x08, + 0x38, 0x21, 0x9f, 0x08, 0x38, 0x29, 0xa0, 0x08, 0x38, 0x31, 0xa1, 0x08, + 0x38, 0x39, 0xa3, 0x08, 0x38, 0x41, 0xa5, 0x08, 0x38, 0x49, 0xa6, 0x08, + 0x38, 0x50, 0x9d, 0x08, 0x38, 0x59, 0x9e, 0x08, 0x38, 0x61, 0x9f, 0x08, + 0x38, 0x69, 0xa0, 0x08, 0x38, 0x71, 0xa1, 0x08, 0x38, 0x79, 0xa2, 0x08, + 0x38, 0x81, 0xa3, 0x08, 0x38, 0x89, 0xa4, 0x08, 0x38, 0x91, 0xa5, 0x08, + 0x38, 0x99, 0xa6, 0x08, 0x38, 0xa0, 0x9d, 0x08, 0x38, 0xa9, 0x9e, 0x08, + 0x38, 0xb1, 0x9f, 0x08, 0x38, 0xb9, 0xa0, 0x08, 0x38, 0xc1, 0xa1, 0x08, + 0x38, 0xc9, 0xa3, 0x08, 0x38, 0xd1, 0xa4, 0x08, 0x38, 0xd9, 0xa5, 0x08, + 0x38, 0xe1, 0xa6, 0x08, 0x38, 0xe8, 0xa1, 0x08, 0x38, 0xf1, 0xa4, 0x08, + 0x38, 0xf9, 0xa5, 0x08, 0x39, 0x00, 0x9d, 0x08, 0x39, 0x09, 0x9f, 0x08, + 0x39, 0x11, 0xa0, 0x08, 0x39, 0x19, 0xa1, 0x08, 0x39, 0x21, 0xa2, 0x08, + 0x39, 0x29, 0xa3, 0x08, 0x39, 0x31, 0xa5, 0x08, 0x39, 0x39, 0xa6, 0x08, + 0x39, 0x40, 0xa0, 0x08, 0x39, 0x59, 0xa1, 0x08, 0x39, 0x61, 0xa2, 0x08, + 0x39, 0x69, 0xa3, 0x08, 0x39, 0x71, 0xa4, 0x08, 0x39, 0x79, 0xa5, 0x08, + 0x39, 0x81, 0x9e, 0x08, 0x39, 0x49, 0x9f, 0x08, 0x39, 0x51, 0xa6, 0x08, + 0x39, 0x88, 0x1d, 0xc3, 0x2d, 0x01, 0x1e, 0xc3, 0x2d, 0x25, 0x1f, 0xc3, + 0x2d, 0x39, 0x20, 0xc3, 0x2d, 0x66, 0x21, 0xc3, 0x2d, 0x7e, 0x22, 0xc3, + 0x2d, 0x9e, 0x23, 0xc3, 0x2d, 0xc2, 0x24, 0xc3, 0x2d, 0xda, 0x25, 0x43, + 0x2d, 0xf6, 0xc2, 0x8c, 0x53, 0x08, 0x32, 0x41, 0x1f, 0xc3, 0x2e, 0x0e, + 0x42, 0xd5, 0xf8, 0xc3, 0x2e, 0x1a, 0xc2, 0xe6, 0x8a, 0x08, 0x32, 0x81, + 0xc2, 0xe6, 0x7f, 0x08, 0x32, 0x89, 0x25, 0xc3, 0x2e, 0x22, 0xc2, 0xe6, + 0x86, 0x08, 0x32, 0xa0, 0x9e, 0x08, 0x32, 0xa9, 0x9f, 0x08, 0x32, 0xb1, + 0xa0, 0x08, 0x32, 0xb9, 0xa1, 0x08, 0x32, 0xc1, 0xa2, 0x08, 0x32, 0xc9, + 0xa3, 0x08, 0x32, 0xd1, 0xa4, 0x08, 0x32, 0xd9, 0xa5, 0x08, 0x32, 0xe1, + 0x26, 0x43, 0x2e, 0x2a, 0x9d, 0x08, 0x33, 0x01, 0x9e, 0x08, 0x33, 0x09, + 0x9f, 0x08, 0x33, 0x11, 0x20, 0xc3, 0x2e, 0x36, 0xa1, 0x08, 0x33, 0x31, + 0xa2, 0x08, 0x33, 0x39, 0xa3, 0x08, 0x33, 0x41, 0xa4, 0x08, 0x33, 0x49, + 0xa5, 0x08, 0x33, 0x51, 0xa6, 0x08, 0x33, 0x58, 0x9d, 0x08, 0x33, 0x61, + 0x9e, 0x08, 0x33, 0x69, 0x9f, 0x08, 0x33, 0x71, 0xa0, 0x08, 0x33, 0x79, + 0xa1, 0x08, 0x33, 0x81, 0xa2, 0x08, 0x33, 0x89, 0xa3, 0x08, 0x33, 0x91, + 0xa4, 0x08, 0x33, 0x99, 0xa5, 0x08, 0x33, 0xa1, 0xa6, 0x08, 0x33, 0xa8, + 0x9d, 0x08, 0x33, 0xb1, 0x9e, 0x08, 0x33, 0xb9, 0x9f, 0x08, 0x33, 0xc1, + 0xa0, 0x08, 0x33, 0xc9, 0xa1, 0x08, 0x33, 0xd1, 0xa2, 0x08, 0x33, 0xd9, + 0xa3, 0x08, 0x33, 0xe1, 0xa4, 0x08, 0x33, 0xe9, 0xa5, 0x08, 0x33, 0xf1, + 0xa6, 0x08, 
0x33, 0xf8, 0x9d, 0x08, 0x34, 0x01, 0x9e, 0x08, 0x34, 0x09, + 0x9f, 0x08, 0x34, 0x11, 0xa0, 0x08, 0x34, 0x19, 0xa1, 0x08, 0x34, 0x21, + 0xa2, 0x08, 0x34, 0x29, 0xa3, 0x08, 0x34, 0x31, 0xa4, 0x08, 0x34, 0x39, + 0xa5, 0x08, 0x34, 0x41, 0xa6, 0x08, 0x34, 0x48, 0x9d, 0x08, 0x34, 0x51, + 0x9e, 0x08, 0x34, 0x59, 0x9f, 0x08, 0x34, 0x61, 0xa0, 0x08, 0x34, 0x69, + 0xa3, 0x08, 0x34, 0x81, 0xa4, 0x08, 0x34, 0x89, 0xa5, 0x08, 0x34, 0x91, + 0xa6, 0x08, 0x34, 0x99, 0xa1, 0x08, 0x34, 0x71, 0xa2, 0x08, 0x34, 0x78, + 0x9d, 0x08, 0x34, 0xa1, 0x9e, 0x08, 0x34, 0xa9, 0x9f, 0x08, 0x34, 0xb1, + 0xa0, 0x08, 0x34, 0xb9, 0xa1, 0x08, 0x34, 0xc1, 0xa2, 0x08, 0x34, 0xc9, + 0xa3, 0x08, 0x34, 0xd1, 0xa4, 0x08, 0x34, 0xd9, 0xa5, 0x08, 0x34, 0xe1, + 0xa6, 0x08, 0x34, 0xe8, 0x9d, 0x08, 0x34, 0xf1, 0x9e, 0x08, 0x34, 0xf8, + 0xc5, 0xdc, 0xb8, 0x08, 0x35, 0x01, 0xc5, 0xd5, 0x15, 0x08, 0x35, 0x09, + 0xc5, 0xd4, 0x1b, 0x08, 0x35, 0x11, 0xc5, 0xd8, 0x58, 0x08, 0x35, 0x19, + 0xc5, 0xd6, 0xd2, 0x08, 0x35, 0x21, 0xc5, 0xd6, 0xeb, 0x08, 0x35, 0x29, + 0xc5, 0xd7, 0x77, 0x08, 0x35, 0x31, 0xc5, 0xd5, 0x74, 0x08, 0x35, 0x39, + 0xc5, 0xdd, 0x9e, 0x08, 0x35, 0x41, 0xc5, 0xd9, 0xbb, 0x08, 0x35, 0x48, + 0xc5, 0xdc, 0xb8, 0x08, 0x35, 0x51, 0xc5, 0xd5, 0x15, 0x08, 0x35, 0x59, + 0xc5, 0xd4, 0x1b, 0x08, 0x35, 0x61, 0xc5, 0xd8, 0x58, 0x08, 0x35, 0x69, + 0xc5, 0xd6, 0xd2, 0x08, 0x35, 0x71, 0xc5, 0xd6, 0xeb, 0x08, 0x35, 0x79, + 0xc5, 0xd7, 0x77, 0x08, 0x35, 0x81, 0xc5, 0xd5, 0x74, 0x08, 0x35, 0x89, + 0xc5, 0xdd, 0x9e, 0x08, 0x35, 0x90, 0x9e, 0x08, 0x35, 0x99, 0x9f, 0x08, + 0x35, 0xa1, 0xa0, 0x08, 0x35, 0xa9, 0xa1, 0x08, 0x35, 0xb1, 0xa2, 0x08, + 0x35, 0xb9, 0xa3, 0x08, 0x35, 0xc1, 0xa5, 0x08, 0x35, 0xc9, 0xa6, 0x08, + 0x35, 0xd0, 0x9d, 0x08, 0x35, 0xd9, 0x9e, 0x08, 0x35, 0xe1, 0x9f, 0x08, + 0x35, 0xe9, 0xa0, 0x08, 0x35, 0xf1, 0xa2, 0x08, 0x35, 0xf9, 0xa3, 0x08, + 0x36, 0x00, 0x9d, 0x08, 0x36, 0x09, 0x9e, 0x08, 0x36, 0x11, 0xa0, 0x08, + 0x36, 0x19, 0xa1, 0x08, 0x36, 0x21, 0xa2, 0x08, 0x36, 0x29, 0xa3, 0x08, + 0x36, 0x31, 0xa4, 0x08, 0x36, 0x39, 0xa5, 0x08, 0x36, 0x41, 0xa6, 0x08, + 0x36, 0x48, 0x9d, 0x08, 0x36, 0x51, 0x9e, 0x08, 0x36, 0x59, 0x9f, 0x08, + 0x36, 0x61, 0xa1, 0x08, 0x36, 0x69, 0xa2, 0x08, 0x36, 0x71, 0xa3, 0x08, + 0x36, 0x79, 0xa4, 0x08, 0x36, 0x81, 0xa5, 0x08, 0x36, 0x89, 0xa6, 0x08, + 0x36, 0x90, 0x9d, 0x08, 0x36, 0x99, 0x9e, 0x08, 0x36, 0xa1, 0x9f, 0x08, + 0x36, 0xa9, 0xa2, 0x08, 0x36, 0xb1, 0xa4, 0x08, 0x36, 0xb9, 0xa5, 0x08, + 0x36, 0xc1, 0xa6, 0x08, 0x36, 0xc8, 0x9d, 0x08, 0x36, 0xd1, 0x9e, 0x08, + 0x36, 0xd9, 0x9f, 0x08, 0x36, 0xe1, 0xa0, 0x08, 0x36, 0xe9, 0xa1, 0x08, + 0x36, 0xf1, 0xa2, 0x08, 0x36, 0xf9, 0xa3, 0x08, 0x37, 0x01, 0xa4, 0x08, + 0x37, 0x09, 0xa6, 0x08, 0x37, 0x10, 0xa0, 0x08, 0x37, 0x19, 0xa1, 0x08, + 0x37, 0x21, 0xa2, 0x08, 0x37, 0x29, 0xa3, 0x08, 0x37, 0x31, 0xa5, 0x08, + 0x37, 0x39, 0xa6, 0x08, 0x37, 0x40, 0x9d, 0x08, 0x37, 0x49, 0x9e, 0x08, + 0x37, 0x51, 0x9f, 0x08, 0x37, 0x59, 0xa0, 0x08, 0x37, 0x61, 0xa1, 0x08, + 0x37, 0x69, 0xa2, 0x08, 0x37, 0x71, 0xa3, 0x08, 0x37, 0x79, 0xa4, 0x08, + 0x37, 0x81, 0xa5, 0x08, 0x37, 0x89, 0xa6, 0x08, 0x37, 0x90, 0x9d, 0x08, + 0x37, 0x99, 0x9e, 0x08, 0x37, 0xa1, 0x9f, 0x08, 0x37, 0xa9, 0xa0, 0x08, + 0x37, 0xb1, 0xa1, 0x08, 0x37, 0xb9, 0xa2, 0x08, 0x37, 0xc1, 0xa3, 0x08, + 0x37, 0xc9, 0xa4, 0x08, 0x37, 0xd1, 0xa5, 0x08, 0x37, 0xd9, 0xa6, 0x08, + 0x37, 0xe0, 0x9e, 0x08, 0x37, 0xe9, 0x9f, 0x08, 0x37, 0xf1, 0xa1, 0x08, + 0x37, 0xf9, 0xa2, 0x08, 0x38, 0x01, 0xa3, 0x08, 0x38, 0x09, 0xa5, 0x08, + 0x38, 0x10, 0x1d, 0xc3, 0x2e, 0x42, 0x1e, 0xc3, 0x2e, 0x78, 0x22, 0xc3, + 0x2e, 0xa8, 
0x21, 0xc3, 0x2e, 0xde, 0x23, 0xc3, 0x2f, 0x0e, 0x25, 0xc3, + 0x2f, 0x3e, 0x24, 0xc3, 0x2f, 0x56, 0x1f, 0xc3, 0x2f, 0x8c, 0x20, 0xc3, + 0x2f, 0xc2, 0x26, 0x43, 0x2f, 0xf2, 0x1e, 0xc3, 0x2f, 0xfe, 0xc2, 0xe1, + 0x2e, 0x08, 0x02, 0x91, 0xc2, 0x00, 0x20, 0x08, 0x02, 0x99, 0x21, 0xc3, + 0x30, 0x06, 0xc2, 0x00, 0x22, 0x08, 0x02, 0xb1, 0x23, 0xc3, 0x30, 0x0e, + 0xc2, 0x3c, 0xc8, 0x08, 0x02, 0xc9, 0x25, 0x43, 0x30, 0x16, 0x1e, 0xc3, + 0x30, 0x26, 0x1f, 0x43, 0x30, 0x4a, 0xc3, 0xe5, 0xba, 0x08, 0x06, 0xf1, + 0x1f, 0xc3, 0x30, 0x5a, 0xc3, 0xe6, 0x4a, 0x08, 0x07, 0xd0, 0x1f, 0xc3, + 0x30, 0x6c, 0x20, 0xc3, 0x30, 0x78, 0xc8, 0xbe, 0x92, 0x08, 0x05, 0x20, + 0x46, 0x00, 0x8b, 0xc3, 0x30, 0x84, 0x05, 0xc3, 0x30, 0xb3, 0x0b, 0xc3, + 0x30, 0xc2, 0x03, 0xc3, 0x30, 0xce, 0xc8, 0xbf, 0x12, 0x05, 0x5a, 0x29, + 0xd1, 0x52, 0x66, 0x00, 0x14, 0x29, 0xc6, 0xa2, 0xbb, 0x00, 0x06, 0xf8, + 0x46, 0x00, 0x8b, 0xc3, 0x30, 0xda, 0xc2, 0x00, 0x0a, 0x05, 0x5a, 0x9b, + 0x03, 0x31, 0x08, 0x46, 0x17, 0x8d, 0xc3, 0x31, 0x0e, 0xc8, 0xba, 0x4a, + 0x05, 0x39, 0x6b, 0x03, 0x31, 0x1e, 0xc2, 0x00, 0x45, 0x05, 0x3b, 0x78, + 0xcb, 0x8d, 0x37, 0x00, 0x15, 0x3b, 0x03, 0x31, 0x24, 0x17, 0xc3, 0x31, + 0x2a, 0x46, 0x00, 0x8b, 0xc3, 0x31, 0x34, 0x0a, 0xc3, 0x31, 0x63, 0x11, + 0xc3, 0x31, 0x72, 0xc9, 0xab, 0x40, 0x00, 0x15, 0x33, 0x03, 0x31, 0x7e, + 0xd3, 0x45, 0x14, 0x00, 0x15, 0x41, 0x9c, 0x05, 0x39, 0x49, 0xc7, 0xc3, + 0xa0, 0x05, 0x39, 0x59, 0xcb, 0x98, 0x8f, 0x01, 0x63, 0xb8, 0x46, 0x00, + 0x8b, 0xc3, 0x31, 0x84, 0x44, 0x05, 0x76, 0xc3, 0x31, 0xda, 0x91, 0x05, + 0x3a, 0x79, 0xc4, 0x6d, 0xb5, 0x05, 0x3d, 0xb9, 0xcb, 0x8e, 0xc3, 0x05, + 0x3e, 0x09, 0x8b, 0x00, 0x0d, 0x19, 0x97, 0x00, 0x11, 0x18, 0x46, 0x00, + 0x8b, 0xc3, 0x31, 0xe8, 0x42, 0x01, 0xbb, 0xc3, 0x32, 0x32, 0x10, 0xc3, + 0x32, 0x3f, 0x95, 0x05, 0x3b, 0x68, 0x07, 0xc3, 0x32, 0x4b, 0x46, 0x00, + 0x8b, 0xc3, 0x32, 0x5a, 0x9c, 0x00, 0x0f, 0x9b, 0x03, 0x32, 0x87, 0x11, + 0xc3, 0x32, 0x8b, 0xc2, 0x01, 0xdf, 0x05, 0x3b, 0x89, 0xc9, 0xb2, 0x09, + 0x00, 0x11, 0xc0, 0xc2, 0x25, 0xa1, 0x00, 0x14, 0x93, 0x03, 0x32, 0x97, + 0xc2, 0x00, 0x75, 0x00, 0x0a, 0x5b, 0x03, 0x32, 0x9b, 0xc2, 0x01, 0xe2, + 0x00, 0x14, 0x1b, 0x03, 0x32, 0xa1, 0x46, 0x00, 0x8b, 0xc3, 0x32, 0xa7, + 0x4e, 0x73, 0x36, 0xc3, 0x32, 0xfd, 0x96, 0x05, 0x3b, 0x5a, 0x03, 0x33, + 0x09, 0x00, 0xc3, 0x33, 0x0d, 0x48, 0x10, 0x2f, 0xc3, 0x33, 0x19, 0xc8, + 0xb7, 0xda, 0x00, 0x13, 0x21, 0xc2, 0x01, 0xdf, 0x05, 0x3b, 0xaa, 0x03, + 0x33, 0x46, 0x46, 0x00, 0x8b, 0xc3, 0x33, 0x4c, 0x07, 0xc3, 0x33, 0x93, + 0xc5, 0xb8, 0xe3, 0x00, 0x0b, 0xfb, 0x03, 0x33, 0xa2, 0xc9, 0xab, 0x40, + 0x00, 0x15, 0x51, 0xc9, 0xa8, 0x67, 0x00, 0x15, 0x59, 0xc2, 0x01, 0xdf, + 0x05, 0x3b, 0x91, 0xd1, 0x4f, 0x47, 0x00, 0x0c, 0xd9, 0x8c, 0x00, 0x0e, + 0x48, 0xcb, 0x92, 0x5f, 0x00, 0x15, 0x4b, 0x03, 0x33, 0xa8, 0x46, 0x00, + 0x8b, 0x43, 0x33, 0xae, 0x46, 0x00, 0x8b, 0xc3, 0x33, 0xcc, 0xc3, 0x3c, + 0x63, 0x00, 0x10, 0xe8, 0x45, 0x04, 0xcc, 0xc3, 0x34, 0x07, 0x46, 0x00, + 0x8b, 0xc3, 0x34, 0x13, 0xc2, 0x01, 0xdf, 0x05, 0x3b, 0x98, 0x00, 0xc3, + 0x34, 0x37, 0xc6, 0x10, 0x3f, 0x00, 0x14, 0x53, 0x03, 0x34, 0x46, 0x87, + 0x00, 0xeb, 0x59, 0x91, 0x05, 0x5b, 0x19, 0x8b, 0x05, 0x5a, 0x81, 0x8f, + 0x05, 0x3b, 0xc0, 0x00, 0xc3, 0x34, 0x4c, 0xc4, 0xde, 0x3f, 0x00, 0x12, + 0x8b, 0x03, 0x34, 0x58, 0x87, 0x00, 0x07, 0x33, 0x03, 0x34, 0x5e, 0x83, + 0x05, 0x39, 0x99, 0x91, 0x05, 0x39, 0xa9, 0x97, 0x05, 0x39, 0xb9, 0x98, + 0x05, 0x39, 0xcb, 0x03, 0x34, 0x64, 0x9b, 0x05, 0x39, 0xe9, 0xca, 0xa4, + 0x72, 0x05, 0x3e, 0x18, 0x46, 0x00, 0x8b, 0x43, 0x34, 0x6a, 0x46, 0x00, + 0x8b, 0xc3, 
0x34, 0x8c, 0xc3, 0x0a, 0xe3, 0x05, 0x39, 0x3b, 0x03, 0x34, + 0xb2, 0x98, 0x00, 0x0c, 0xa9, 0xc5, 0xd3, 0x2c, 0x01, 0x63, 0xb0, 0x46, + 0x00, 0x8b, 0x43, 0x34, 0xb8, 0x46, 0x00, 0x8b, 0x43, 0x34, 0xe8, 0x46, + 0x00, 0x8b, 0xc3, 0x34, 0xf8, 0x9b, 0x05, 0x3b, 0x09, 0xcb, 0x91, 0x15, + 0x05, 0x3b, 0x19, 0xc3, 0x02, 0x39, 0x05, 0x3b, 0x49, 0x47, 0xc8, 0xcb, + 0x43, 0x35, 0x1a, 0x46, 0x00, 0x8b, 0xc3, 0x35, 0x2c, 0xc2, 0x00, 0x0a, + 0x00, 0x13, 0xc0, 0x00, 0xc3, 0x35, 0x54, 0xc2, 0x01, 0xdf, 0x05, 0x3b, + 0xa1, 0x8c, 0x00, 0x0e, 0x60, 0x46, 0x00, 0x8b, 0xc3, 0x35, 0x60, 0xc2, + 0x00, 0x39, 0x00, 0x09, 0xc0, 0x46, 0x00, 0x8b, 0xc3, 0x35, 0x8f, 0x47, + 0x23, 0x34, 0xc3, 0x35, 0xc3, 0xc4, 0x38, 0x2c, 0x00, 0x13, 0x19, 0xc2, + 0x00, 0xd0, 0x00, 0x0d, 0x18, 0x46, 0x00, 0x8b, 0xc3, 0x35, 0xd5, 0xcc, + 0x8b, 0x95, 0x00, 0xe8, 0xb9, 0x03, 0xc3, 0x36, 0x05, 0x4b, 0x8d, 0x58, + 0xc3, 0x36, 0x11, 0xc7, 0xc9, 0xb9, 0x05, 0x3a, 0x39, 0xc3, 0x04, 0x87, + 0x05, 0x3d, 0xa8, 0x46, 0x00, 0x8b, 0x43, 0x36, 0x1c, 0x46, 0x00, 0x8b, + 0xc3, 0x36, 0x26, 0xc9, 0xae, 0xf1, 0x00, 0x11, 0xc8, 0x88, 0x07, 0xd8, + 0x03, 0x03, 0x36, 0x3b, 0x8e, 0x07, 0xd8, 0x11, 0x8b, 0x07, 0xd8, 0x08, + 0x8d, 0x0e, 0xf8, 0x81, 0x89, 0x0e, 0xf8, 0x11, 0x94, 0x00, 0xe8, 0xd1, + 0x8f, 0x05, 0x3f, 0xd1, 0x87, 0x01, 0x63, 0xd8, 0xc4, 0xa8, 0x1a, 0x0e, + 0xf8, 0x21, 0xc6, 0x01, 0x73, 0x00, 0xe8, 0x60, 0x94, 0x00, 0xe8, 0xc9, + 0x90, 0x00, 0xe8, 0x70, 0xc4, 0xb0, 0x8b, 0x00, 0xf7, 0xf1, 0xc5, 0x1e, + 0xc8, 0x00, 0xf7, 0xc1, 0xc4, 0x01, 0x23, 0x00, 0x0d, 0x9b, 0x03, 0x36, + 0x43, 0x06, 0xc3, 0x36, 0x49, 0xc5, 0x1f, 0x0c, 0x00, 0xf7, 0x91, 0xc5, + 0x31, 0xee, 0x00, 0x06, 0xe9, 0xca, 0x08, 0xf6, 0x00, 0x0b, 0xb1, 0xc6, + 0x60, 0xb1, 0x00, 0x11, 0x91, 0xc6, 0x01, 0x73, 0x00, 0x12, 0x70, 0x47, + 0xc0, 0x2e, 0xc3, 0x36, 0x55, 0xc8, 0xba, 0x02, 0x05, 0x3e, 0xb0, 0x44, + 0x05, 0x18, 0xc3, 0x36, 0x5f, 0xc5, 0x31, 0xee, 0x00, 0xf1, 0xf1, 0xc4, + 0x01, 0x23, 0x01, 0x63, 0x70, 0x45, 0x00, 0x8c, 0xc3, 0x36, 0x6b, 0xc3, + 0x01, 0x5d, 0x00, 0x12, 0x20, 0x42, 0x01, 0x23, 0xc3, 0x36, 0xb5, 0x05, + 0xc3, 0x36, 0xc4, 0x06, 0xc3, 0x36, 0xd3, 0x0f, 0xc3, 0x36, 0xe0, 0xc5, + 0x1e, 0xc8, 0x00, 0x06, 0xab, 0x03, 0x36, 0xef, 0xc6, 0x01, 0x73, 0x00, + 0x06, 0xc3, 0x03, 0x36, 0xf5, 0xc5, 0x1f, 0x0c, 0x00, 0x06, 0x91, 0xc5, + 0x31, 0xee, 0x00, 0x06, 0x99, 0x42, 0x01, 0xc8, 0xc3, 0x36, 0xfb, 0xc5, + 0x1d, 0x88, 0x00, 0x0a, 0x71, 0xc6, 0xcc, 0x8f, 0x00, 0x0f, 0x53, 0x03, + 0x37, 0x07, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x70, 0x91, 0x00, 0x0c, 0x31, + 0x87, 0x00, 0x0c, 0x80, 0x06, 0xc3, 0x37, 0x0d, 0xca, 0x9e, 0x5a, 0x00, + 0xf6, 0x41, 0xc5, 0x1e, 0xc8, 0x00, 0x09, 0x43, 0x03, 0x37, 0x1a, 0xc5, + 0x1f, 0x0c, 0x00, 0x06, 0x61, 0xc5, 0x31, 0xee, 0x00, 0x06, 0x69, 0x05, + 0xc3, 0x37, 0x20, 0xc6, 0x60, 0xb1, 0x00, 0x09, 0x51, 0xc5, 0x1d, 0x88, + 0x00, 0x09, 0x61, 0xc6, 0xcc, 0x8f, 0x00, 0x09, 0x71, 0xc6, 0x01, 0x73, + 0x00, 0x0c, 0xb9, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x50, 0x88, 0x05, 0x3b, + 0xd9, 0x89, 0x05, 0x3b, 0xe9, 0x94, 0x05, 0x3c, 0x11, 0x95, 0x05, 0x3c, + 0x21, 0x96, 0x05, 0x3c, 0x31, 0x86, 0x05, 0x3b, 0xc8, 0x05, 0xc3, 0x37, + 0x2c, 0xc5, 0x1e, 0xc8, 0x00, 0xf5, 0xe3, 0x03, 0x37, 0x44, 0xca, 0x9e, + 0x5a, 0x00, 0xf5, 0xd1, 0x06, 0xc3, 0x37, 0x4a, 0xc6, 0x60, 0xb1, 0x00, + 0x08, 0x93, 0x03, 0x37, 0x54, 0xc5, 0x1f, 0x0c, 0x00, 0x06, 0x41, 0xc5, + 0x31, 0xee, 0x00, 0x06, 0x49, 0xc5, 0x1d, 0x88, 0x00, 0x08, 0xa1, 0xc6, + 0xcc, 0x8f, 0x00, 0x08, 0xc1, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x31, 0xc6, + 0x01, 0x73, 0x00, 0x12, 0x30, 0xc3, 0x00, 0x49, 0x05, 0x39, 0x11, 0xc2, + 0x00, 0x74, 
0x05, 0x39, 0x20, 0x8a, 0x00, 0x06, 0x80, 0x00, 0x43, 0x37, + 0x5a, 0xc5, 0x1d, 0x88, 0x00, 0x08, 0x13, 0x03, 0x37, 0x66, 0x05, 0xc3, + 0x37, 0x6c, 0xca, 0x9e, 0x5a, 0x00, 0xf5, 0x11, 0x06, 0xc3, 0x37, 0x7b, + 0x45, 0x00, 0x9d, 0xc3, 0x37, 0x88, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x11, + 0xc5, 0x1f, 0x0c, 0x00, 0x06, 0x01, 0xc5, 0x31, 0xee, 0x00, 0x06, 0x09, + 0xc5, 0x1e, 0xc8, 0x00, 0x06, 0x19, 0xc6, 0x60, 0xb1, 0x00, 0x08, 0x01, + 0xc6, 0xcc, 0x8f, 0x00, 0x08, 0x21, 0xc6, 0x01, 0x73, 0x00, 0x11, 0xd0, + 0x46, 0x00, 0x8b, 0x43, 0x37, 0x97, 0xd4, 0x3e, 0x6c, 0x05, 0x39, 0xd0, + 0x44, 0x05, 0x18, 0xc3, 0x37, 0xa3, 0x05, 0xc3, 0x37, 0xb2, 0xc5, 0x31, + 0xee, 0x00, 0x0a, 0xd3, 0x03, 0x37, 0xcd, 0xce, 0x38, 0xe6, 0x05, 0x3d, + 0x41, 0xc4, 0x01, 0x23, 0x05, 0x3e, 0x29, 0x15, 0x43, 0x37, 0xd3, 0xc6, + 0xbb, 0x8c, 0x05, 0x3d, 0x61, 0xc3, 0x74, 0x83, 0x00, 0x0c, 0x78, 0xd0, + 0x5f, 0x12, 0x00, 0x12, 0x51, 0xc9, 0xb1, 0xca, 0x05, 0x3d, 0x70, 0xca, + 0x64, 0x13, 0x00, 0xf4, 0xa1, 0x06, 0xc3, 0x37, 0xdf, 0x05, 0xc3, 0x37, + 0xeb, 0xcc, 0x51, 0x28, 0x05, 0x3e, 0x31, 0xc5, 0x31, 0xee, 0x00, 0x0b, + 0xc9, 0x15, 0xc3, 0x37, 0xf7, 0xc4, 0x01, 0x23, 0x00, 0x11, 0x20, 0xc8, + 0x20, 0xa9, 0x00, 0xf4, 0x61, 0xc8, 0x16, 0x15, 0x00, 0xf4, 0x50, 0x06, + 0xc3, 0x38, 0x03, 0xc5, 0x31, 0xee, 0x00, 0xf4, 0x11, 0xc5, 0x1f, 0x0c, + 0x00, 0xf4, 0x01, 0xc4, 0x01, 0x23, 0x01, 0x63, 0x91, 0xca, 0x08, 0xf6, + 0x00, 0x0b, 0xa0, 0x06, 0xc3, 0x38, 0x0f, 0xc5, 0x1e, 0xc8, 0x00, 0xf3, + 0xe1, 0xc4, 0x01, 0x23, 0x00, 0x0d, 0x90, 0xc2, 0x10, 0x11, 0x05, 0x3c, + 0xd1, 0xc2, 0x49, 0x0c, 0x05, 0x3c, 0xe1, 0xc2, 0x0f, 0xe1, 0x05, 0x3c, + 0xf0, 0x05, 0xc3, 0x38, 0x1b, 0xca, 0x64, 0x13, 0x00, 0xf3, 0x71, 0x06, + 0xc3, 0x38, 0x33, 0xc6, 0x01, 0x73, 0x00, 0x0b, 0x31, 0xc4, 0x01, 0x23, + 0x00, 0x0d, 0x61, 0xce, 0x01, 0x19, 0x00, 0x0d, 0x70, 0xcc, 0x23, 0x3f, + 0x05, 0x3b, 0x22, 0x03, 0x38, 0x3f, 0xc9, 0x67, 0x20, 0x05, 0x3b, 0xf1, + 0x8e, 0x05, 0x3c, 0x01, 0x8a, 0x05, 0x3c, 0x69, 0x8d, 0x05, 0x3d, 0x81, + 0x96, 0x05, 0x3d, 0x89, 0x8f, 0x00, 0x0c, 0xe1, 0x98, 0x00, 0x12, 0x29, + 0x83, 0x01, 0x63, 0x7a, 0x03, 0x38, 0x45, 0xc3, 0x22, 0xcb, 0x00, 0x0c, + 0x21, 0xc3, 0x02, 0x9f, 0x00, 0x0d, 0x39, 0xc4, 0x0d, 0x13, 0x00, 0x0d, + 0xe0, 0x45, 0x00, 0x8c, 0xc3, 0x38, 0x4b, 0xc7, 0xa6, 0x69, 0x05, 0x3a, + 0xd0, 0xca, 0x9a, 0xe0, 0x05, 0x39, 0xf1, 0xc6, 0x21, 0xa3, 0x05, 0x3d, + 0x59, 0x87, 0x00, 0x0c, 0x71, 0xc6, 0xd3, 0x2b, 0x05, 0x3f, 0xa8, 0xc9, + 0x16, 0x14, 0x00, 0xf2, 0xb1, 0xc5, 0x31, 0xee, 0x00, 0xf2, 0xa1, 0x15, + 0xc3, 0x38, 0x79, 0xc4, 0x01, 0x23, 0x00, 0x0d, 0x21, 0xc8, 0xbe, 0x9a, + 0x05, 0x3a, 0x90, 0x05, 0xc3, 0x38, 0x88, 0x0e, 0xc3, 0x38, 0x9a, 0x06, + 0xc3, 0x38, 0xac, 0xc5, 0x1f, 0x0c, 0x00, 0x0f, 0xc1, 0xc5, 0x1e, 0xc8, + 0x00, 0x06, 0x89, 0xc5, 0x31, 0xee, 0x00, 0x0a, 0x19, 0xce, 0x38, 0xe6, + 0x05, 0x3d, 0x21, 0xce, 0x6e, 0x04, 0x00, 0x0e, 0x58, 0x05, 0xc3, 0x38, + 0xb8, 0xca, 0x64, 0x13, 0x00, 0xf1, 0xd1, 0x42, 0x00, 0x58, 0xc3, 0x38, + 0xca, 0xcb, 0x8f, 0xb5, 0x05, 0x3a, 0x41, 0xc5, 0x31, 0xee, 0x00, 0x09, + 0xc9, 0x47, 0x04, 0xcb, 0xc3, 0x38, 0xd9, 0x15, 0xc3, 0x38, 0xe5, 0x04, + 0x43, 0x38, 0xf1, 0xca, 0x64, 0x13, 0x00, 0xf1, 0xa1, 0x06, 0xc3, 0x38, + 0xfd, 0xc5, 0x31, 0xee, 0x00, 0xf1, 0x81, 0xc6, 0x01, 0x73, 0x05, 0x3a, + 0x03, 0x03, 0x39, 0x0f, 0x05, 0xc3, 0x39, 0x15, 0xce, 0x38, 0xe6, 0x05, + 0x3d, 0x11, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0xc0, 0xcb, 0x97, 0x2f, 0x00, + 0xf1, 0x51, 0x05, 0xc3, 0x39, 0x21, 0x06, 0xc3, 0x39, 0x33, 0xc6, 0x01, + 0x73, 0x00, 0x09, 0x31, 0xc4, 0x01, 0x23, 0x05, 0x3d, 0x50, 0xc6, 0x60, + 0xb1, 0x00, 
0xf1, 0x01, 0xc5, 0x31, 0xee, 0x00, 0x0f, 0xa1, 0x05, 0xc3, + 0x39, 0x45, 0xc5, 0x1d, 0x88, 0x00, 0x08, 0xf1, 0xc9, 0x16, 0x14, 0x00, + 0x09, 0x01, 0xce, 0x38, 0xe6, 0x05, 0x3d, 0x01, 0xc4, 0x01, 0x23, 0x00, + 0x0c, 0x99, 0xc6, 0x01, 0x73, 0x00, 0x0f, 0x20, 0x97, 0x05, 0x3d, 0xf1, + 0x8b, 0x05, 0x3d, 0xe1, 0x83, 0x05, 0x3d, 0xd1, 0xc4, 0x00, 0xf0, 0x00, + 0x12, 0x08, 0xc9, 0x16, 0x14, 0x00, 0xf0, 0xf1, 0xc6, 0x01, 0x73, 0x05, + 0x3c, 0xc1, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x88, 0x05, 0xc3, 0x39, 0x57, + 0xca, 0x64, 0x13, 0x00, 0xf0, 0x71, 0x44, 0x05, 0x18, 0xc3, 0x39, 0x69, + 0x15, 0xc3, 0x39, 0x75, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x51, 0xc6, 0xcf, + 0xcb, 0x00, 0x0c, 0x58, 0xcb, 0x8e, 0x60, 0x00, 0x0e, 0x20, 0x05, 0xc3, + 0x39, 0x8a, 0xc5, 0x31, 0xee, 0x00, 0x08, 0x31, 0xc9, 0x16, 0x14, 0x00, + 0x08, 0x51, 0xc3, 0x01, 0x5d, 0x05, 0x3c, 0x91, 0xcc, 0x51, 0x28, 0x05, + 0x3e, 0x21, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x39, 0xc6, 0x01, 0x73, 0x00, + 0x11, 0xd8, 0xcb, 0x8e, 0x3f, 0x05, 0x39, 0x70, 0xca, 0x64, 0x13, 0x00, + 0xf0, 0x31, 0x44, 0x05, 0x18, 0xc3, 0x39, 0x9f, 0xc8, 0xbe, 0x9a, 0x05, + 0x3c, 0xb1, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x09, 0xc6, 0xcf, 0xcb, 0x00, + 0x0c, 0x11, 0xc6, 0x01, 0x73, 0x00, 0x12, 0x18, 0x05, 0xc3, 0x39, 0xab, + 0xc6, 0x01, 0x73, 0x00, 0x12, 0x40, 0xd8, 0x25, 0xeb, 0x05, 0x3a, 0xb1, + 0xcf, 0x3e, 0xad, 0x05, 0x3a, 0xc0, 0x83, 0x00, 0x74, 0x89, 0xc2, 0x00, + 0xd0, 0x00, 0x74, 0x90, 0xc6, 0x04, 0xe1, 0x0f, 0xda, 0xb1, 0xcc, 0x04, + 0xcb, 0x0f, 0xdb, 0x28, 0xcc, 0x04, 0xcb, 0x0f, 0xdb, 0x21, 0xc5, 0x00, + 0x2c, 0x0f, 0xdb, 0x30, 0xc6, 0x04, 0xe1, 0x0f, 0xda, 0xd9, 0xcc, 0x04, + 0xcb, 0x0f, 0xdb, 0x00, 0xcc, 0x04, 0xcb, 0x0f, 0xda, 0xf9, 0xc5, 0x00, + 0x2c, 0x0f, 0xdb, 0x08, 0xcc, 0x07, 0xbb, 0x01, 0x0f, 0x69, 0xce, 0x0e, + 0xf1, 0x01, 0x0f, 0x60, 0x00, 0x43, 0x39, 0xb7, 0xd2, 0x05, 0xd4, 0x0f, + 0xc0, 0x09, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0x88, 0xca, 0x03, 0x87, 0x01, + 0x0d, 0x89, 0xc9, 0x01, 0x88, 0x01, 0x0d, 0x80, 0x06, 0xc3, 0x39, 0xc9, + 0xdf, 0x0d, 0x3e, 0x01, 0x4b, 0x18, 0xc3, 0xe5, 0x8a, 0x0f, 0xb3, 0x39, + 0xc9, 0xb4, 0x91, 0x0f, 0xb2, 0xf8, 0xe0, 0x0a, 0x87, 0x01, 0x3a, 0xd8, + 0xe0, 0x0b, 0x27, 0x01, 0x3b, 0x00, 0xe0, 0x0b, 0x27, 0x01, 0x3a, 0xf8, + 0xdc, 0x12, 0xe1, 0x01, 0x3d, 0x31, 0xde, 0x0e, 0x14, 0x01, 0x3d, 0x28, + 0xe0, 0x0a, 0x87, 0x01, 0x3a, 0xe8, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0xd1, + 0xdb, 0x17, 0x46, 0x0f, 0xc0, 0xf0, 0xc4, 0x01, 0xce, 0x0f, 0xc4, 0xf1, + 0xc5, 0x06, 0x67, 0x0f, 0xc4, 0xf8, 0xc6, 0x64, 0xa4, 0x07, 0xda, 0x4b, + 0x03, 0x39, 0xcf, 0x15, 0x43, 0x39, 0xd5, 0x46, 0x00, 0x8b, 0x43, 0x39, + 0xe1, 0xc9, 0x60, 0xf3, 0x07, 0xd9, 0x49, 0xc4, 0x40, 0x95, 0x07, 0xd9, + 0x00, 0xc8, 0x4c, 0xcc, 0x02, 0x6e, 0x69, 0xc3, 0x00, 0x28, 0x02, 0x6f, + 0x08, 0xc3, 0x0e, 0xa7, 0x00, 0x04, 0x41, 0xd2, 0x49, 0x55, 0x00, 0x04, + 0x48, 0x0d, 0xc3, 0x39, 0xf3, 0x15, 0xc3, 0x3a, 0x05, 0xc5, 0x79, 0xf2, + 0x05, 0x4b, 0x49, 0xc5, 0xda, 0xe7, 0x05, 0x4b, 0x41, 0xc6, 0xc0, 0x7c, + 0x05, 0x4b, 0x31, 0xc5, 0xd9, 0x61, 0x00, 0x88, 0xc1, 0xc5, 0x90, 0xe4, + 0x00, 0x88, 0xd1, 0xc5, 0xdb, 0xff, 0x05, 0x4b, 0x68, 0xcb, 0x90, 0xde, + 0x05, 0x4b, 0xe1, 0x16, 0xc3, 0x3a, 0x11, 0xc5, 0xdb, 0xff, 0x00, 0x88, + 0x6b, 0x03, 0x3a, 0x1d, 0xc4, 0xad, 0x2b, 0x00, 0x88, 0x53, 0x03, 0x3a, + 0x23, 0xc6, 0x8e, 0xde, 0x00, 0x88, 0x09, 0xc5, 0x79, 0xf2, 0x00, 0x88, + 0x41, 0xc5, 0xd9, 0x61, 0x00, 0x88, 0xa1, 0xc5, 0xd6, 0x8c, 0x00, 0x88, + 0xc9, 0xc5, 0xb7, 0x9d, 0x00, 0x8a, 0x39, 0xc5, 0x90, 0xe4, 0x00, 0x8a, + 0xc0, 0x02, 0x43, 0x3a, 0x29, 0x02, 0x43, 0x3a, 0x5d, 0x02, 0x43, 0x3a, + 0x69, 0xc5, 
0x90, 0xe4, 0x05, 0x4b, 0xb9, 0xc5, 0xd6, 0x8c, 0x05, 0x4b, + 0xb1, 0xc6, 0x8e, 0xde, 0x00, 0x8a, 0x09, 0x16, 0xc3, 0x3a, 0x8b, 0xc5, + 0xda, 0xe7, 0x00, 0x8a, 0x19, 0x12, 0xc3, 0x3a, 0x97, 0xc4, 0xad, 0x2b, + 0x00, 0x8a, 0x29, 0x05, 0x43, 0x3a, 0xa9, 0xc4, 0xad, 0x2b, 0x05, 0x4b, + 0x89, 0xc6, 0xc0, 0x7c, 0x05, 0x4b, 0x81, 0xc6, 0x8e, 0xde, 0x05, 0x4b, + 0x79, 0xc5, 0x79, 0xf2, 0x00, 0x88, 0xe0, 0x02, 0x43, 0x3a, 0xb5, 0xc7, + 0xc0, 0x7b, 0x00, 0x8a, 0xd0, 0xc5, 0xd6, 0x8c, 0x00, 0x88, 0xd9, 0xc5, + 0xda, 0xe7, 0x00, 0x88, 0xe9, 0x12, 0xc3, 0x3a, 0xd9, 0xca, 0xa7, 0x2e, + 0x00, 0x89, 0x60, 0xc6, 0x8e, 0xde, 0x00, 0x88, 0x99, 0xc6, 0xc0, 0x7c, + 0x00, 0x88, 0xa9, 0xc5, 0x79, 0xf2, 0x00, 0x88, 0xb1, 0xc4, 0xad, 0x2b, + 0x00, 0x8a, 0xd9, 0xc5, 0xdb, 0xff, 0x00, 0x8a, 0xe1, 0xc5, 0x90, 0xe4, + 0x00, 0x8a, 0xe8, 0xc6, 0xd1, 0x03, 0x00, 0x8a, 0x68, 0xc4, 0xc6, 0x7b, + 0x00, 0x88, 0x73, 0x03, 0x3a, 0xe5, 0x45, 0xd5, 0x1f, 0x43, 0x3a, 0xe9, + 0x15, 0xc3, 0x3a, 0xf1, 0x05, 0x43, 0x3a, 0xfd, 0x87, 0x00, 0x8b, 0x11, + 0x02, 0xc3, 0x3b, 0x09, 0xc4, 0xa6, 0x08, 0x00, 0x8c, 0xf2, 0x03, 0x3b, + 0x17, 0x83, 0x00, 0x8b, 0x1b, 0x03, 0x3b, 0x1b, 0x87, 0x00, 0x8b, 0x43, + 0x03, 0x3b, 0x23, 0x91, 0x00, 0x8b, 0x6b, 0x03, 0x3b, 0x2a, 0x97, 0x00, + 0x8b, 0x93, 0x03, 0x3b, 0x2e, 0x8b, 0x00, 0x8b, 0xa2, 0x03, 0x3b, 0x32, + 0x91, 0x00, 0x8b, 0x2b, 0x03, 0x3b, 0x38, 0x97, 0x00, 0x8b, 0x9a, 0x03, + 0x3b, 0x3c, 0x87, 0x00, 0x8b, 0x61, 0x02, 0x43, 0x3b, 0x40, 0x83, 0x00, + 0x8b, 0x53, 0x03, 0x3b, 0x56, 0x87, 0x00, 0x8b, 0x83, 0x03, 0x3b, 0x5a, + 0x8b, 0x00, 0x8b, 0x88, 0x02, 0x43, 0x3b, 0x5e, 0x02, 0x43, 0x3b, 0x7e, + 0xc5, 0x8e, 0xdf, 0x00, 0x8d, 0x43, 0x03, 0x3b, 0x9e, 0xc6, 0xbb, 0xec, + 0x00, 0x8d, 0xf9, 0x47, 0x79, 0xeb, 0x43, 0x3b, 0xa2, 0x44, 0x3a, 0xbf, + 0xc3, 0x3b, 0xb2, 0xc3, 0x39, 0x37, 0x00, 0x8d, 0xd2, 0x03, 0x3b, 0xf7, + 0x02, 0x43, 0x3b, 0xfb, 0xc5, 0xc0, 0x7d, 0x00, 0x8d, 0x73, 0x03, 0x3c, + 0x21, 0xc6, 0xc1, 0x86, 0x00, 0x8e, 0x00, 0x02, 0x43, 0x3c, 0x25, 0x02, + 0x43, 0x3c, 0x50, 0xc4, 0x79, 0xf3, 0x00, 0x8d, 0xc3, 0x03, 0x3c, 0x74, + 0xc6, 0xba, 0x7c, 0x00, 0x8e, 0x0b, 0x03, 0x3c, 0x78, 0xc6, 0xca, 0x0e, + 0x00, 0x8f, 0x5a, 0x03, 0x3c, 0x7c, 0x02, 0x43, 0x3c, 0x80, 0xc4, 0xc6, + 0x7a, 0x00, 0x8d, 0xeb, 0x03, 0x3c, 0x8a, 0xc6, 0xc6, 0x79, 0x00, 0x8d, + 0xf0, 0x02, 0x43, 0x3c, 0x8e, 0xc6, 0xb7, 0x9c, 0x00, 0x8f, 0x83, 0x03, + 0x3c, 0xa6, 0xc9, 0x90, 0xe0, 0x00, 0x8f, 0xc8, 0xc5, 0xd9, 0xca, 0x01, + 0x89, 0x98, 0xc5, 0xda, 0xe7, 0x01, 0x8b, 0x89, 0x12, 0xc3, 0x3c, 0xaa, + 0xca, 0xa7, 0x2e, 0x01, 0x8b, 0xc8, 0xc6, 0x8e, 0xde, 0x01, 0x89, 0x91, + 0xc6, 0xc0, 0x7c, 0x01, 0x89, 0xc1, 0xc5, 0x79, 0xf2, 0x01, 0x8a, 0x19, + 0xc4, 0xad, 0x2b, 0x01, 0x8a, 0x31, 0xc5, 0xdb, 0xff, 0x01, 0x8a, 0x49, + 0xc5, 0xd9, 0x61, 0x01, 0x8b, 0x29, 0xc5, 0xb7, 0x9d, 0x01, 0x8c, 0x01, + 0xc5, 0x90, 0xe4, 0x01, 0x8c, 0x28, 0x02, 0x43, 0x3c, 0xb6, 0xc5, 0xdb, + 0xff, 0x01, 0x89, 0xa9, 0xc5, 0x90, 0xe4, 0x01, 0x89, 0xb1, 0xc6, 0xc0, + 0x7c, 0x01, 0x8b, 0x31, 0xc4, 0xad, 0x2b, 0x01, 0x8b, 0x39, 0xc7, 0xca, + 0x0d, 0x01, 0x8b, 0x40, 0xc6, 0x8e, 0xde, 0x01, 0x89, 0xd3, 0x03, 0x3c, + 0xd4, 0xc5, 0xda, 0xe7, 0x01, 0x89, 0xd9, 0x12, 0xc3, 0x3c, 0xda, 0xc4, + 0xad, 0x2b, 0x01, 0x89, 0xe9, 0x16, 0xc3, 0x3c, 0xef, 0xc5, 0x90, 0xe4, + 0x01, 0x8a, 0x01, 0xcb, 0x90, 0xde, 0x01, 0x8b, 0x68, 0x12, 0xc3, 0x3c, + 0xfb, 0xc4, 0xad, 0x2b, 0x01, 0x8b, 0x78, 0x02, 0x43, 0x3d, 0x07, 0x87, + 0x01, 0x8c, 0x70, 0x87, 0x01, 0x8a, 0x90, 0x91, 0x01, 0x8a, 0xab, 0x03, + 0x3d, 0x20, 0xc6, 0xb7, 0x9c, 0x01, 0x8c, 0x0a, 0x03, 0x3d, 0x26, 0x02, + 0x43, 0x3d, 
0x2a, 0x02, 0x43, 0x3d, 0x37, 0x87, 0x01, 0x8a, 0xc8, 0x91, + 0x01, 0x8a, 0xe8, 0x83, 0x07, 0xfb, 0x39, 0x8b, 0x07, 0xfb, 0x41, 0x97, + 0x07, 0xfb, 0x49, 0x87, 0x07, 0xfb, 0x51, 0x91, 0x07, 0xfb, 0x59, 0x1b, + 0xc3, 0x3d, 0x44, 0xc2, 0x00, 0x16, 0x07, 0xfb, 0x78, 0xc4, 0x79, 0xf3, + 0x07, 0xfd, 0x61, 0xc6, 0xba, 0x7c, 0x07, 0xfd, 0x78, 0xc8, 0x4b, 0x94, + 0x08, 0x5b, 0xf9, 0xc7, 0x0d, 0x04, 0x08, 0x5b, 0xf0, 0xc4, 0x18, 0x12, + 0x08, 0x5b, 0xe9, 0x91, 0x08, 0x5b, 0xc8, 0xc3, 0x77, 0x79, 0x08, 0x5b, + 0x81, 0xc4, 0xdc, 0x2d, 0x08, 0x5b, 0x70, 0xc8, 0x4b, 0x94, 0x08, 0x5a, + 0xf9, 0xc7, 0x0d, 0x04, 0x08, 0x5a, 0xf0, 0xc4, 0x18, 0x12, 0x08, 0x5a, + 0xe9, 0x91, 0x08, 0x5a, 0xc8, 0xc4, 0xdc, 0x2d, 0x08, 0x5a, 0x71, 0xc3, + 0x77, 0x79, 0x08, 0x5a, 0x88, 0xcb, 0x57, 0x1e, 0x0f, 0x65, 0x99, 0xc2, + 0x02, 0xa0, 0x0f, 0x65, 0x90, 0xc4, 0x18, 0x10, 0x0f, 0x65, 0x49, 0xc2, + 0x22, 0xcc, 0x0f, 0x65, 0x40, 0xc3, 0x0d, 0x14, 0x0f, 0x65, 0x39, 0xc3, + 0x09, 0x9e, 0x0f, 0x65, 0x30, 0xc4, 0x02, 0xde, 0x0f, 0x65, 0x29, 0xc2, + 0x02, 0xa0, 0x0f, 0x65, 0x20, 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xe8, 0xc8, + 0x4b, 0x94, 0x0f, 0x64, 0xa1, 0xc7, 0x0d, 0x04, 0x0f, 0x64, 0x58, 0xc9, + 0x57, 0x20, 0x0f, 0x64, 0xe0, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x99, 0xc7, + 0x0d, 0x04, 0x0f, 0x64, 0x50, 0xc2, 0x0d, 0x10, 0x0f, 0x64, 0x03, 0x03, + 0x3d, 0x50, 0x00, 0x43, 0x3d, 0x56, 0xc2, 0x0d, 0x10, 0x0f, 0x63, 0xfb, + 0x03, 0x3d, 0x62, 0x00, 0x43, 0x3d, 0x68, 0xc3, 0x45, 0x6b, 0x0f, 0x63, + 0xf3, 0x03, 0x3d, 0x74, 0xc2, 0x00, 0x5f, 0x0f, 0x63, 0xaa, 0x03, 0x3d, + 0x7a, 0xc3, 0x0d, 0x0f, 0x0f, 0x63, 0xeb, 0x03, 0x3d, 0x7e, 0xc2, 0x00, + 0x33, 0x0f, 0x63, 0xa2, 0x03, 0x3d, 0x84, 0xc4, 0x0d, 0x0e, 0x0f, 0x63, + 0xe3, 0x03, 0x3d, 0x88, 0xc3, 0x02, 0xdf, 0x0f, 0x63, 0x9a, 0x03, 0x3d, + 0x8e, 0xc4, 0x18, 0x12, 0x0f, 0x63, 0xdb, 0x03, 0x3d, 0x92, 0x91, 0x0f, + 0x63, 0x92, 0x03, 0x3d, 0x98, 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xa8, 0xc8, + 0x4b, 0x94, 0x0f, 0x64, 0x61, 0xc7, 0x0d, 0x04, 0x0f, 0x64, 0x18, 0xc2, + 0x02, 0x6f, 0x01, 0x96, 0x29, 0xc2, 0x00, 0x35, 0x01, 0x96, 0x30, 0xc3, + 0x05, 0x14, 0x01, 0x9f, 0x01, 0x16, 0xc3, 0x3d, 0x9c, 0x08, 0xc3, 0x3d, + 0xaa, 0x15, 0xc3, 0x3d, 0xb7, 0x07, 0xc3, 0x3d, 0xc9, 0xc4, 0x26, 0x78, + 0x01, 0x9f, 0x42, 0x03, 0x3d, 0xd8, 0x19, 0xc3, 0x3d, 0xde, 0x0a, 0xc3, + 0x3d, 0xe6, 0xc2, 0x00, 0xc4, 0x01, 0x9b, 0x10, 0xc3, 0x09, 0x9e, 0x01, + 0x9a, 0xe3, 0x03, 0x3d, 0xf2, 0x0b, 0x43, 0x3d, 0xf8, 0xc2, 0x22, 0xcc, + 0x01, 0x9a, 0xf3, 0x03, 0x3e, 0x04, 0xc4, 0x18, 0x10, 0x01, 0x9a, 0xfa, + 0x03, 0x3e, 0x0a, 0xc4, 0x00, 0x2d, 0x01, 0x9b, 0x03, 0x03, 0x3e, 0x10, + 0xc5, 0x66, 0xb1, 0x01, 0x9b, 0x18, 0xc4, 0x14, 0x09, 0x01, 0x9b, 0x58, + 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xa9, 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x58, + 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x71, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xc0, + 0x49, 0x2a, 0xf5, 0xc3, 0x3e, 0x16, 0x02, 0x43, 0x3e, 0x2c, 0x49, 0x2a, + 0xf5, 0x43, 0x3e, 0x3e, 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x61, 0xdb, 0x18, + 0x03, 0x0f, 0xd1, 0xb0, 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x51, 0xdb, 0x18, + 0x03, 0x0f, 0xd1, 0xa0, 0xc3, 0x00, 0x74, 0x0f, 0xd0, 0xf1, 0xc5, 0x56, + 0xa5, 0x0f, 0xd1, 0x10, 0xc8, 0x02, 0x9f, 0x01, 0x34, 0x39, 0x42, 0x00, + 0x58, 0xc3, 0x3e, 0x4a, 0x46, 0x02, 0xae, 0xc3, 0x3e, 0x56, 0x46, 0x01, + 0xc8, 0x43, 0x3e, 0x62, 0xc5, 0x22, 0xdb, 0x01, 0x33, 0x08, 0xca, 0xa7, + 0xc4, 0x01, 0x38, 0x29, 0xdc, 0x13, 0x51, 0x0f, 0xde, 0x00, 0xcd, 0x77, + 0xd5, 0x0f, 0xbc, 0xa9, 0xcc, 0x51, 0x6c, 0x01, 0x2d, 0x19, 0xd1, 0x51, + 0x67, 0x0f, 0xbc, 0xa0, 0x14, 0xc3, 0x3e, 0x6e, 0x0e, 0xc3, 0x3e, 0x7a, + 0x46, 0x02, 
0xae, 0xc3, 0x3e, 0x86, 0xd7, 0x27, 0xe7, 0x01, 0x2f, 0x59, + 0xd4, 0x3d, 0x68, 0x01, 0x1c, 0x28, 0xc4, 0x5d, 0x32, 0x01, 0x31, 0xe1, + 0xcb, 0x93, 0x3b, 0x0f, 0x99, 0x20, 0xca, 0xa1, 0xac, 0x0f, 0x99, 0x30, + 0xc5, 0x0b, 0x0a, 0x01, 0x2d, 0x59, 0xc3, 0x0e, 0x6b, 0x01, 0x5a, 0x90, + 0xc5, 0x06, 0x82, 0x01, 0x30, 0xe1, 0xce, 0x24, 0xd5, 0x0f, 0xa2, 0x40, + 0xcd, 0x4a, 0x56, 0x01, 0x2e, 0x41, 0xd2, 0x4a, 0x51, 0x0f, 0xbc, 0xd1, + 0xce, 0x74, 0xa2, 0x0f, 0xbc, 0xd8, 0xe0, 0x08, 0x27, 0x01, 0x37, 0xf8, + 0xc6, 0x46, 0x3e, 0x01, 0x2d, 0xd9, 0xc7, 0xbb, 0xcb, 0x01, 0x5a, 0xa0, + 0x89, 0x0f, 0x17, 0x18, 0xc5, 0x00, 0xa2, 0x0f, 0xb1, 0x73, 0x03, 0x3e, + 0x92, 0xd8, 0x23, 0x4b, 0x0f, 0xd7, 0x10, 0xd3, 0x41, 0x38, 0x0f, 0xb0, + 0xe9, 0xcb, 0x91, 0x78, 0x0f, 0xb0, 0xe0, 0xcb, 0x93, 0x9e, 0x01, 0x51, + 0x61, 0xcc, 0x8b, 0xd1, 0x01, 0x51, 0x59, 0xc9, 0x0e, 0x6e, 0x01, 0x51, + 0x51, 0xcb, 0x52, 0x5b, 0x01, 0x51, 0x48, 0x95, 0x0f, 0x46, 0x89, 0xca, + 0xa2, 0x92, 0x0f, 0x46, 0xa0, 0xc7, 0x0d, 0x04, 0x08, 0x4e, 0xd3, 0x03, + 0x3e, 0x96, 0xc8, 0x4b, 0x94, 0x08, 0x4f, 0x18, 0xc7, 0x0d, 0x04, 0x08, + 0x4e, 0xcb, 0x03, 0x3e, 0x9c, 0xc8, 0x4b, 0x94, 0x08, 0x4f, 0x10, 0x00, + 0xc3, 0x3e, 0xa2, 0xc2, 0x0d, 0x10, 0x08, 0x4e, 0x7a, 0x03, 0x3e, 0xb1, + 0x00, 0xc3, 0x3e, 0xb7, 0xc2, 0x0d, 0x10, 0x08, 0x4e, 0x72, 0x03, 0x3e, + 0xc6, 0xc2, 0x00, 0x5f, 0x08, 0x4e, 0x23, 0x03, 0x3e, 0xcc, 0xc3, 0x45, + 0x6b, 0x08, 0x4e, 0x6a, 0x03, 0x3e, 0xd0, 0xc2, 0x00, 0x33, 0x08, 0x4e, + 0x1b, 0x03, 0x3e, 0xd6, 0xc3, 0x0d, 0x0f, 0x08, 0x4e, 0x62, 0x03, 0x3e, + 0xda, 0xc3, 0x02, 0xdf, 0x08, 0x4e, 0x13, 0x03, 0x3e, 0xe0, 0xc4, 0x0d, + 0x0e, 0x08, 0x4e, 0x5a, 0x03, 0x3e, 0xe4, 0x91, 0x08, 0x4e, 0x0b, 0x03, + 0x3e, 0xea, 0xc4, 0x18, 0x12, 0x08, 0x4e, 0x52, 0x03, 0x3e, 0xee, 0xc9, + 0x57, 0x20, 0x08, 0x4f, 0x20, 0xc7, 0x0d, 0x04, 0x08, 0x4e, 0x93, 0x03, + 0x3e, 0xf4, 0xc8, 0x4b, 0x94, 0x08, 0x4e, 0xd8, 0x91, 0x08, 0x4d, 0xb1, + 0x87, 0x08, 0x4d, 0xa9, 0x83, 0x08, 0x4d, 0xa0, 0x83, 0x08, 0x4d, 0x91, + 0xc2, 0x00, 0xd0, 0x08, 0x4d, 0x68, 0x87, 0x08, 0x4d, 0x89, 0x83, 0x08, + 0x4d, 0x78, 0xc9, 0x87, 0xed, 0x08, 0x4d, 0x80, 0x87, 0x08, 0x4d, 0x51, + 0x83, 0x08, 0x4d, 0x48, 0xc2, 0xe5, 0xfd, 0x08, 0x4c, 0xd8, 0xc2, 0xe5, + 0xfd, 0x08, 0x4c, 0xc8, 0xc2, 0xe5, 0xfd, 0x08, 0x4c, 0xa0, 0xc2, 0xe5, + 0xfd, 0x08, 0x4c, 0x58, 0xc2, 0xe5, 0xfd, 0x08, 0x4c, 0x68, 0x49, 0x3d, + 0x54, 0xc3, 0x3e, 0xfa, 0x4a, 0x2c, 0x4a, 0xc3, 0x3f, 0x06, 0x49, 0x45, + 0xd2, 0xc3, 0x3f, 0x12, 0x47, 0x54, 0x42, 0x43, 0x3f, 0x1e, 0xc3, 0x64, + 0x58, 0x00, 0xc5, 0x51, 0xc3, 0x39, 0x6d, 0x00, 0xc5, 0x41, 0x1c, 0xc3, + 0x3f, 0x2a, 0x05, 0xc3, 0x3f, 0x34, 0xc3, 0x1d, 0x35, 0x00, 0xc5, 0x11, + 0x06, 0xc3, 0x3f, 0x3e, 0x16, 0xc3, 0x3f, 0x4a, 0xc3, 0xe5, 0xf0, 0x00, + 0xc4, 0xe9, 0xc3, 0x20, 0xf1, 0x00, 0xc4, 0xd9, 0xc3, 0x91, 0x00, 0x00, + 0xc4, 0xd0, 0x83, 0x00, 0xc4, 0x8b, 0x03, 0x3f, 0x54, 0xc2, 0x0e, 0x9a, + 0x00, 0xc4, 0x70, 0xc2, 0x19, 0x2c, 0x00, 0xc5, 0x39, 0x97, 0x00, 0xc5, + 0x30, 0x8a, 0x00, 0xc4, 0xb9, 0xcb, 0x97, 0x71, 0x00, 0xc4, 0x00, 0x83, + 0x00, 0xc4, 0xb1, 0xc2, 0x00, 0xd0, 0x00, 0xc4, 0xa8, 0xc2, 0x00, 0xd0, + 0x00, 0xc4, 0x99, 0x83, 0x00, 0xc4, 0x90, 0x83, 0x00, 0xc4, 0x81, 0x16, + 0xc3, 0x3f, 0x60, 0xcb, 0x8c, 0x9d, 0x00, 0xc4, 0x30, 0xc2, 0x00, 0xc1, + 0x00, 0xc4, 0x79, 0xc2, 0x01, 0x30, 0x00, 0xc4, 0x50, 0xcf, 0x62, 0x10, + 0x00, 0xc4, 0x20, 0x48, 0xb1, 0x71, 0xc3, 0x3f, 0x6a, 0xc2, 0x00, 0x75, + 0x00, 0xc2, 0x50, 0xc2, 0x02, 0x1c, 0x00, 0xc2, 0xe1, 0x83, 0x00, 0xc2, + 0x88, 0xc2, 0x01, 0x94, 0x00, 0xc2, 0xd1, 0x83, 0x00, 0xc2, 0x98, 0x83, + 0x00, 0xc2, 
0xc0, 0xc2, 0x0d, 0xf6, 0x00, 0xc2, 0xa1, 0x83, 0x00, 0xc2, + 0x80, 0x87, 0x00, 0xc2, 0x48, 0x87, 0x00, 0xc2, 0x40, 0xc2, 0x00, 0xd0, + 0x00, 0xc3, 0x91, 0x83, 0x00, 0xc3, 0x78, 0xc2, 0x0d, 0xf6, 0x00, 0xc3, + 0x71, 0x83, 0x00, 0xc3, 0x40, 0x83, 0x00, 0xc3, 0x68, 0x83, 0x00, 0xc3, + 0x60, 0x87, 0x00, 0xc3, 0x00, 0x9b, 0x00, 0xc2, 0xf8, 0xc4, 0x18, 0x10, + 0x08, 0xb2, 0xb9, 0xc2, 0x22, 0xcc, 0x08, 0xb2, 0xb0, 0xc3, 0x0d, 0x14, + 0x08, 0xb2, 0xa9, 0xc3, 0x09, 0x9e, 0x08, 0xb2, 0xa0, 0xc4, 0x02, 0xde, + 0x08, 0xb2, 0x99, 0xc2, 0x02, 0xa0, 0x08, 0xb2, 0x90, 0x8e, 0x08, 0xb1, + 0xc0, 0x94, 0x08, 0xb1, 0xb0, 0x8e, 0x08, 0xb0, 0x43, 0x03, 0x3f, 0x76, + 0x94, 0x08, 0xb0, 0x32, 0x03, 0x3f, 0x7a, 0xc2, 0x00, 0xd0, 0x08, 0xb0, + 0xd9, 0x83, 0x08, 0xb0, 0xd0, 0xc2, 0x00, 0xd0, 0x08, 0xb0, 0xc9, 0x83, + 0x08, 0xb0, 0xc0, 0x96, 0x00, 0xea, 0xbb, 0x03, 0x3f, 0x7e, 0x87, 0x00, + 0xea, 0x4b, 0x03, 0x3f, 0xab, 0x9c, 0x00, 0xed, 0xdb, 0x03, 0x3f, 0xc3, + 0x98, 0x00, 0xea, 0xdb, 0x03, 0x3f, 0xc9, 0x85, 0x00, 0xec, 0xe3, 0x03, + 0x3f, 0xcf, 0x97, 0x00, 0xea, 0xc3, 0x03, 0x3f, 0xe7, 0x95, 0x00, 0x17, + 0x13, 0x03, 0x3f, 0xf1, 0x92, 0x00, 0xea, 0xb3, 0x03, 0x40, 0x01, 0x84, + 0x00, 0xea, 0x3b, 0x03, 0x40, 0x07, 0x47, 0x01, 0x56, 0xc3, 0x40, 0x1f, + 0x8f, 0x00, 0xea, 0x83, 0x03, 0x40, 0x2b, 0x8e, 0x00, 0x17, 0x0b, 0x03, + 0x40, 0x31, 0x8c, 0x00, 0x15, 0x93, 0x03, 0x40, 0x52, 0x0b, 0xc3, 0x40, + 0x58, 0x86, 0x00, 0xea, 0x43, 0x03, 0x40, 0x64, 0x88, 0x00, 0xed, 0x03, + 0x03, 0x40, 0x80, 0x94, 0x00, 0x15, 0x9b, 0x03, 0x40, 0x86, 0x89, 0x00, + 0xea, 0x6b, 0x03, 0x40, 0x98, 0x83, 0x00, 0xea, 0x1b, 0x03, 0x40, 0xaa, + 0x91, 0x00, 0xea, 0x93, 0x03, 0x40, 0xba, 0x8d, 0x00, 0xea, 0x79, 0x8a, + 0x00, 0x15, 0x83, 0x03, 0x40, 0xc6, 0x99, 0x00, 0x15, 0xb9, 0x9b, 0x00, + 0x15, 0xc1, 0x9a, 0x00, 0x17, 0x19, 0x93, 0x08, 0x3d, 0x28, 0xd5, 0x33, + 0x14, 0x08, 0x3c, 0x11, 0xd0, 0x33, 0x19, 0x08, 0x3c, 0x08, 0xc9, 0x3d, + 0x18, 0x05, 0x39, 0x01, 0xc8, 0xae, 0xfb, 0x05, 0x39, 0x08, 0xc3, 0x63, + 0x85, 0x00, 0x17, 0xe9, 0xcf, 0x63, 0x00, 0x05, 0x3c, 0x50, 0xc2, 0x00, + 0xc4, 0x00, 0xeb, 0xc1, 0xc9, 0xa8, 0x3a, 0x05, 0x34, 0xe1, 0xc9, 0x84, + 0xc0, 0x05, 0x34, 0xe8, 0x99, 0x00, 0xea, 0x11, 0x97, 0x00, 0xea, 0x09, + 0x96, 0x00, 0xea, 0x01, 0x94, 0x00, 0xe9, 0xfb, 0x03, 0x40, 0xd5, 0x92, + 0x00, 0xe9, 0xf1, 0x91, 0x00, 0xe9, 0xe3, 0x03, 0x40, 0xdb, 0x90, 0x00, + 0xe9, 0xd1, 0x8f, 0x00, 0xe9, 0xc9, 0x8e, 0x00, 0xe9, 0xc1, 0x8d, 0x00, + 0xe9, 0xb9, 0x8c, 0x00, 0xe9, 0xb1, 0x8b, 0x00, 0xe9, 0xa9, 0x8a, 0x00, + 0xe9, 0xa3, 0x03, 0x40, 0xdf, 0x89, 0x00, 0xe9, 0x99, 0x87, 0x00, 0xe9, + 0x89, 0x86, 0x00, 0xe9, 0x81, 0x84, 0x00, 0xe9, 0x73, 0x03, 0x40, 0xe5, + 0x83, 0x00, 0xe9, 0x63, 0x03, 0x40, 0xeb, 0x85, 0x05, 0x3f, 0x91, 0x88, + 0x05, 0x3f, 0x99, 0x93, 0x05, 0x3f, 0xa1, 0x98, 0x01, 0x63, 0xe8, 0x43, + 0x03, 0x35, 0xc3, 0x40, 0xef, 0x44, 0x10, 0xd1, 0x43, 0x41, 0x07, 0xcf, + 0x61, 0x89, 0x00, 0x16, 0x91, 0xce, 0x0f, 0x6e, 0x00, 0x16, 0x98, 0xc4, + 0x32, 0xbc, 0x05, 0x5b, 0x59, 0xc9, 0x0f, 0x73, 0x00, 0x15, 0xf1, 0xc9, + 0x03, 0xde, 0x00, 0x16, 0x18, 0x47, 0x10, 0x30, 0xc3, 0x41, 0x1f, 0x16, + 0x43, 0x41, 0x2e, 0xc8, 0x4d, 0x8d, 0x05, 0x38, 0xd9, 0xca, 0x3e, 0xe4, + 0x05, 0x38, 0xe1, 0xd0, 0x0f, 0x09, 0x05, 0x38, 0xe9, 0xd9, 0x1d, 0x6f, + 0x05, 0x38, 0xf1, 0xc5, 0x33, 0x24, 0x00, 0x17, 0xc0, 0xc4, 0x32, 0xbc, + 0x05, 0x5b, 0x51, 0xc9, 0x0f, 0x73, 0x00, 0x15, 0xf9, 0xc9, 0x03, 0xde, + 0x00, 0x16, 0x10, 0x00, 0xc3, 0x41, 0x34, 0xd5, 0x34, 0xf7, 0x05, 0x38, + 0xd0, 0xcc, 0x23, 0x3f, 0x08, 0x3d, 0x98, 0xc9, 0x3d, 0x18, 0x00, 0x17, + 0xc9, 0xc8, 
0xae, 0xfb, 0x00, 0x17, 0xd8, 0x45, 0x00, 0x5a, 0xc3, 0x41, + 0x74, 0x43, 0x11, 0x19, 0xc3, 0x41, 0x80, 0x42, 0x00, 0x30, 0x43, 0x41, + 0x8c, 0xc9, 0x03, 0xde, 0x00, 0x16, 0x21, 0xc4, 0x32, 0xbc, 0x00, 0x16, + 0xa0, 0x06, 0xc3, 0x41, 0x9e, 0xc8, 0x68, 0x56, 0x00, 0x16, 0xb8, 0x45, + 0x08, 0xcb, 0xc3, 0x41, 0xa8, 0x44, 0x05, 0x36, 0x43, 0x41, 0xba, 0xc9, + 0x3d, 0x18, 0x00, 0x17, 0xd1, 0xc8, 0xae, 0xfb, 0x00, 0x17, 0xe0, 0x47, + 0x19, 0x7a, 0xc3, 0x41, 0xcc, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x99, 0xc8, + 0x4e, 0x93, 0x00, 0x17, 0x30, 0xc3, 0x11, 0x7e, 0x0e, 0xb7, 0xd1, 0xc5, + 0xd8, 0x8f, 0x0e, 0xb7, 0x80, 0xc7, 0x00, 0x90, 0x0e, 0xb7, 0x98, 0xc3, + 0x11, 0x7e, 0x0e, 0xb8, 0xa1, 0xc5, 0xd8, 0x8f, 0x0e, 0xb8, 0x50, 0x8c, + 0x0e, 0xb5, 0x29, 0x8b, 0x0e, 0xb5, 0x20, 0xc3, 0x04, 0x87, 0x0e, 0xb6, + 0x38, 0x8b, 0x0e, 0xb6, 0x78, 0xc6, 0x10, 0x3f, 0x0e, 0xb6, 0xb0, 0xc6, + 0x51, 0x50, 0x0e, 0xbe, 0x59, 0xc4, 0xdb, 0x4c, 0x0e, 0xb6, 0x28, 0x0f, + 0x43, 0x41, 0xd8, 0xc2, 0x00, 0xba, 0x0e, 0xb6, 0xc9, 0xc2, 0x00, 0x0a, + 0x0e, 0xb6, 0xb9, 0x8b, 0x0e, 0xb6, 0x88, 0xc2, 0x00, 0x0a, 0x0e, 0xb6, + 0xc0, 0xc2, 0x20, 0xec, 0x0e, 0xb6, 0xa9, 0xc4, 0x89, 0xfe, 0x0e, 0xb6, + 0x48, 0xc4, 0x1a, 0x73, 0x0e, 0xb6, 0xa0, 0xca, 0x91, 0x2c, 0x0e, 0xb6, + 0x98, 0xc2, 0x01, 0x23, 0x0e, 0xb6, 0x90, 0x97, 0x0e, 0xb6, 0x70, 0x97, + 0x0e, 0xb6, 0x68, 0xc4, 0xdd, 0x9a, 0x0e, 0xb6, 0x60, 0xc4, 0x8b, 0x66, + 0x0e, 0xb6, 0x58, 0xc3, 0x01, 0xbb, 0x0e, 0xb6, 0x50, 0xc2, 0x01, 0x6f, + 0x0e, 0xb6, 0x41, 0xc6, 0x10, 0x3f, 0x0e, 0xb6, 0x30, 0xc4, 0x38, 0x2c, + 0x0e, 0xb6, 0x20, 0xc3, 0x04, 0x87, 0x0e, 0xb6, 0x18, 0xc4, 0xde, 0x3f, + 0x0e, 0xb6, 0x10, 0x9c, 0x0e, 0xa8, 0x19, 0x9b, 0x0e, 0xa8, 0x11, 0x9a, + 0x0e, 0xa8, 0x09, 0x99, 0x0e, 0xa8, 0x01, 0x98, 0x0e, 0xa7, 0xf9, 0x97, + 0x0e, 0xa7, 0xf1, 0x96, 0x0e, 0xa7, 0xe9, 0x95, 0x0e, 0xa7, 0xe1, 0x94, + 0x0e, 0xa7, 0xd9, 0x93, 0x0e, 0xa7, 0xd1, 0x92, 0x0e, 0xa7, 0xc9, 0x91, + 0x0e, 0xa7, 0xc1, 0x90, 0x0e, 0xa7, 0xb9, 0x8f, 0x0e, 0xa7, 0xb1, 0x8e, + 0x0e, 0xa7, 0xa9, 0x8d, 0x0e, 0xa7, 0xa1, 0x8c, 0x0e, 0xa7, 0x99, 0x8b, + 0x0e, 0xa7, 0x91, 0x8a, 0x0e, 0xa7, 0x89, 0x89, 0x0e, 0xa7, 0x81, 0x88, + 0x0e, 0xa7, 0x79, 0x87, 0x0e, 0xa7, 0x71, 0x86, 0x0e, 0xa7, 0x69, 0x85, + 0x0e, 0xa7, 0x61, 0x84, 0x0e, 0xa7, 0x59, 0x83, 0x0e, 0xa7, 0x50, 0x9c, + 0x0e, 0xa7, 0x49, 0x9b, 0x0e, 0xa7, 0x41, 0x9a, 0x0e, 0xa7, 0x39, 0x99, + 0x0e, 0xa7, 0x31, 0x98, 0x0e, 0xa7, 0x29, 0x97, 0x0e, 0xa7, 0x21, 0x96, + 0x0e, 0xa7, 0x19, 0x95, 0x0e, 0xa7, 0x11, 0x94, 0x0e, 0xa7, 0x09, 0x93, + 0x0e, 0xa7, 0x01, 0x92, 0x0e, 0xa6, 0xf9, 0x91, 0x0e, 0xa6, 0xf1, 0x90, + 0x0e, 0xa6, 0xe9, 0x8f, 0x0e, 0xa6, 0xe1, 0x8e, 0x0e, 0xa6, 0xd9, 0x8d, + 0x0e, 0xa6, 0xd1, 0x8c, 0x0e, 0xa6, 0xc9, 0x8b, 0x0e, 0xa6, 0xc1, 0x8a, + 0x0e, 0xa6, 0xb9, 0x89, 0x0e, 0xa6, 0xb1, 0x88, 0x0e, 0xa6, 0xa9, 0x87, + 0x0e, 0xa6, 0xa1, 0x86, 0x0e, 0xa6, 0x99, 0x85, 0x0e, 0xa6, 0x91, 0x84, + 0x0e, 0xa6, 0x89, 0x83, 0x0e, 0xa6, 0x80, 0xc3, 0x11, 0x7e, 0x0e, 0xb6, + 0x01, 0xc5, 0xd8, 0x8f, 0x0e, 0xb5, 0xb0, 0xc7, 0x00, 0x90, 0x0e, 0xb5, + 0xc8, 0x0f, 0x43, 0x41, 0xe4, 0xc2, 0x00, 0xba, 0x0e, 0xba, 0x69, 0xc2, + 0x00, 0x0a, 0x0e, 0xba, 0x59, 0x8b, 0x0e, 0xba, 0x28, 0xc2, 0x00, 0x0a, + 0x0e, 0xba, 0x60, 0xc6, 0x10, 0x3f, 0x0e, 0xba, 0x50, 0xc2, 0x20, 0xec, + 0x0e, 0xba, 0x49, 0xc4, 0x89, 0xfe, 0x0e, 0xb9, 0xe8, 0xc4, 0x1a, 0x73, + 0x0e, 0xba, 0x40, 0xca, 0x91, 0x2c, 0x0e, 0xba, 0x38, 0xc2, 0x01, 0x23, + 0x0e, 0xba, 0x30, 0x8b, 0x0e, 0xba, 0x18, 0x97, 0x0e, 0xba, 0x10, 0x97, + 0x0e, 0xba, 0x08, 0xc4, 0xdd, 0x9a, 0x0e, 0xba, 0x00, 0xc4, 0x8b, 0x66, + 0x0e, 0xb9, 
0xf8, 0xc3, 0x01, 0xbb, 0x0e, 0xb9, 0xf0, 0xc2, 0x01, 0x6f, + 0x0e, 0xb9, 0xe1, 0xc6, 0x10, 0x3f, 0x0e, 0xb9, 0xd0, 0xc3, 0x04, 0x87, + 0x0e, 0xb9, 0xd8, 0xc4, 0xdb, 0x4c, 0x0e, 0xb9, 0xc8, 0xc4, 0x38, 0x2c, + 0x0e, 0xb9, 0xc0, 0xc3, 0x04, 0x87, 0x0e, 0xb9, 0xb8, 0xc4, 0xde, 0x3f, + 0x0e, 0xb9, 0xb0, 0x0f, 0x43, 0x41, 0xf0, 0xc2, 0x00, 0xba, 0x0e, 0xb9, + 0x99, 0xc2, 0x00, 0x0a, 0x0e, 0xb9, 0x89, 0x8b, 0x0e, 0xb9, 0x58, 0xc2, + 0x00, 0x0a, 0x0e, 0xb9, 0x90, 0xc6, 0x10, 0x3f, 0x0e, 0xb9, 0x80, 0xc2, + 0x20, 0xec, 0x0e, 0xb9, 0x79, 0xc4, 0x89, 0xfe, 0x0e, 0xb9, 0x1a, 0x03, + 0x41, 0xfc, 0xc4, 0x1a, 0x73, 0x0e, 0xb9, 0x70, 0xc2, 0x01, 0x23, 0x0e, + 0xb9, 0x60, 0x8b, 0x0e, 0xb9, 0x48, 0x97, 0x0e, 0xb9, 0x40, 0x97, 0x0e, + 0xb9, 0x38, 0xc4, 0xdd, 0x9a, 0x0e, 0xb9, 0x30, 0xc4, 0x8b, 0x66, 0x0e, + 0xb9, 0x28, 0xc3, 0x01, 0xbb, 0x0e, 0xb9, 0x20, 0xc2, 0x01, 0x6f, 0x0e, + 0xb9, 0x11, 0xc6, 0x10, 0x3f, 0x0e, 0xb9, 0x00, 0xc3, 0x04, 0x87, 0x0e, + 0xb9, 0x08, 0xc4, 0xdb, 0x4c, 0x0e, 0xb8, 0xf8, 0xc4, 0x38, 0x2c, 0x0e, + 0xb8, 0xf0, 0xc3, 0x04, 0x87, 0x0e, 0xb8, 0xe8, 0xc4, 0xde, 0x3f, 0x0e, + 0xb8, 0xe0, 0xc4, 0x26, 0x78, 0x0e, 0xbf, 0xa9, 0xc5, 0x06, 0xdb, 0x0e, + 0xbf, 0xa1, 0x15, 0xc3, 0x42, 0x02, 0x08, 0xc3, 0x42, 0x0e, 0x16, 0xc3, + 0x42, 0x1a, 0xc3, 0x05, 0x14, 0x0e, 0xbf, 0x69, 0xc4, 0x15, 0xe7, 0x0e, + 0xbf, 0x60, 0x12, 0xc3, 0x42, 0x26, 0xca, 0x9c, 0xac, 0x0e, 0xbe, 0x41, + 0xcc, 0x8b, 0x65, 0x0e, 0xbe, 0x31, 0xcc, 0x89, 0xfd, 0x0e, 0xbe, 0x29, + 0xce, 0x10, 0x3e, 0x0e, 0xbe, 0x21, 0x46, 0x03, 0x13, 0xc3, 0x42, 0x38, + 0xc5, 0xdb, 0xf0, 0x0e, 0xbd, 0x49, 0x48, 0x0b, 0x17, 0x43, 0x42, 0xdc, + 0xc8, 0x9c, 0x0e, 0x0e, 0xbc, 0x79, 0xc9, 0xaa, 0x9e, 0x0e, 0xbc, 0x69, + 0xd3, 0x43, 0x00, 0x0e, 0xbc, 0x48, 0x91, 0x0e, 0xaf, 0xe3, 0x03, 0x43, + 0x7d, 0x92, 0x0e, 0xaf, 0xeb, 0x03, 0x43, 0x81, 0x85, 0x0e, 0xaf, 0x83, + 0x03, 0x43, 0x91, 0x97, 0x0e, 0xb0, 0x13, 0x03, 0x43, 0x97, 0x96, 0x0e, + 0xb0, 0x0b, 0x03, 0x43, 0x9d, 0x95, 0x0e, 0xb0, 0x03, 0x03, 0x43, 0xa9, + 0x88, 0x0e, 0xaf, 0x9b, 0x03, 0x43, 0xaf, 0x94, 0x0e, 0xaf, 0xfb, 0x03, + 0x43, 0xb5, 0x9a, 0x0e, 0xb0, 0x2b, 0x03, 0x43, 0xbb, 0x90, 0x0e, 0xaf, + 0xdb, 0x03, 0x43, 0xbf, 0x8f, 0x0e, 0xaf, 0xd3, 0x03, 0x43, 0xc3, 0x8e, + 0x0e, 0xaf, 0xcb, 0x03, 0x43, 0xc7, 0x8d, 0x0e, 0xaf, 0xc3, 0x03, 0x43, + 0xcd, 0x8b, 0x0e, 0xaf, 0xb3, 0x03, 0x43, 0xd3, 0x87, 0x0e, 0xaf, 0x93, + 0x03, 0x43, 0xd9, 0x9c, 0x0e, 0xb0, 0x3b, 0x03, 0x43, 0xe5, 0x86, 0x0e, + 0xaf, 0x8b, 0x03, 0x43, 0xeb, 0x89, 0x0e, 0xaf, 0xa3, 0x03, 0x43, 0xf1, + 0x84, 0x0e, 0xaf, 0x7b, 0x03, 0x43, 0xf7, 0x83, 0x0e, 0xaf, 0x73, 0x03, + 0x43, 0xfd, 0x9b, 0x0e, 0xb0, 0x31, 0x99, 0x0e, 0xb0, 0x21, 0x98, 0x0e, + 0xb0, 0x19, 0x93, 0x0e, 0xaf, 0xf1, 0x8c, 0x0e, 0xaf, 0xb9, 0x8a, 0x0e, + 0xaf, 0xa8, 0x91, 0x0e, 0xaf, 0x13, 0x03, 0x44, 0x03, 0x92, 0x0e, 0xaf, + 0x1b, 0x03, 0x44, 0x07, 0x85, 0x0e, 0xae, 0xb3, 0x03, 0x44, 0x17, 0x97, + 0x0e, 0xaf, 0x43, 0x03, 0x44, 0x1d, 0x96, 0x0e, 0xaf, 0x3b, 0x03, 0x44, + 0x23, 0x95, 0x0e, 0xaf, 0x33, 0x03, 0x44, 0x32, 0x94, 0x0e, 0xaf, 0x2b, + 0x03, 0x44, 0x38, 0x9a, 0x0e, 0xaf, 0x5b, 0x03, 0x44, 0x3e, 0x90, 0x0e, + 0xaf, 0x0b, 0x03, 0x44, 0x42, 0x8f, 0x0e, 0xaf, 0x03, 0x03, 0x44, 0x46, + 0x8e, 0x0e, 0xae, 0xfb, 0x03, 0x44, 0x4a, 0x8d, 0x0e, 0xae, 0xf3, 0x03, + 0x44, 0x50, 0x8b, 0x0e, 0xae, 0xe3, 0x03, 0x44, 0x56, 0x87, 0x0e, 0xae, + 0xc3, 0x03, 0x44, 0x5c, 0x9c, 0x0e, 0xaf, 0x6b, 0x03, 0x44, 0x68, 0x86, + 0x0e, 0xae, 0xbb, 0x03, 0x44, 0x6e, 0x89, 0x0e, 0xae, 0xd3, 0x03, 0x44, + 0x74, 0x84, 0x0e, 0xae, 0xab, 0x03, 0x44, 0x7a, 0x83, 0x0e, 0xae, 0xa3, + 0x03, 0x44, 
0x80, 0x9b, 0x0e, 0xaf, 0x61, 0x99, 0x0e, 0xaf, 0x51, 0x98, + 0x0e, 0xaf, 0x49, 0x93, 0x0e, 0xaf, 0x21, 0x8c, 0x0e, 0xae, 0xe9, 0x8a, + 0x0e, 0xae, 0xd9, 0x88, 0x0e, 0xae, 0xc8, 0xc4, 0x18, 0x10, 0x0e, 0xbf, + 0x49, 0xc2, 0x22, 0xcc, 0x0e, 0xbf, 0x40, 0xc3, 0x0d, 0x14, 0x0e, 0xbf, + 0x39, 0xc3, 0x09, 0x9e, 0x0e, 0xbf, 0x30, 0xc4, 0x02, 0xde, 0x0e, 0xbf, + 0x29, 0xc2, 0x02, 0xa0, 0x0e, 0xbf, 0x20, 0x9c, 0x0e, 0xb1, 0xd9, 0x9b, + 0x0e, 0xb1, 0xd1, 0x9a, 0x0e, 0xb1, 0xc9, 0x99, 0x0e, 0xb1, 0xc1, 0x98, + 0x0e, 0xb1, 0xb9, 0x97, 0x0e, 0xb1, 0xb1, 0x96, 0x0e, 0xb1, 0xa9, 0x95, + 0x0e, 0xb1, 0xa1, 0x94, 0x0e, 0xb1, 0x99, 0x93, 0x0e, 0xb1, 0x91, 0x92, + 0x0e, 0xb1, 0x89, 0x91, 0x0e, 0xb1, 0x81, 0x90, 0x0e, 0xb1, 0x79, 0x8f, + 0x0e, 0xb1, 0x71, 0x8e, 0x0e, 0xb1, 0x69, 0x8d, 0x0e, 0xb1, 0x61, 0x8c, + 0x0e, 0xb1, 0x59, 0x8b, 0x0e, 0xb1, 0x51, 0x8a, 0x0e, 0xb1, 0x49, 0x89, + 0x0e, 0xb1, 0x41, 0x88, 0x0e, 0xb1, 0x39, 0x87, 0x0e, 0xb1, 0x31, 0x86, + 0x0e, 0xb1, 0x29, 0x85, 0x0e, 0xb1, 0x21, 0x84, 0x0e, 0xb1, 0x19, 0x83, + 0x0e, 0xb1, 0x10, 0x9c, 0x0e, 0xb1, 0x09, 0x9b, 0x0e, 0xb1, 0x01, 0x9a, + 0x0e, 0xb0, 0xf9, 0x99, 0x0e, 0xb0, 0xf1, 0x98, 0x0e, 0xb0, 0xe9, 0x97, + 0x0e, 0xb0, 0xe1, 0x96, 0x0e, 0xb0, 0xd9, 0x95, 0x0e, 0xb0, 0xd1, 0x94, + 0x0e, 0xb0, 0xc9, 0x93, 0x0e, 0xb0, 0xc1, 0x92, 0x0e, 0xb0, 0xb9, 0x91, + 0x0e, 0xb0, 0xb1, 0x90, 0x0e, 0xb0, 0xa9, 0x8f, 0x0e, 0xb0, 0xa1, 0x8e, + 0x0e, 0xb0, 0x99, 0x8d, 0x0e, 0xb0, 0x91, 0x8c, 0x0e, 0xb0, 0x89, 0x8b, + 0x0e, 0xb0, 0x81, 0x8a, 0x0e, 0xb0, 0x79, 0x89, 0x0e, 0xb0, 0x71, 0x88, + 0x0e, 0xb0, 0x69, 0x87, 0x0e, 0xb0, 0x61, 0x86, 0x0e, 0xb0, 0x59, 0x85, + 0x0e, 0xb0, 0x51, 0x84, 0x0e, 0xb0, 0x49, 0x83, 0x0e, 0xb0, 0x40, 0xc2, + 0x00, 0xd0, 0x08, 0xe5, 0x19, 0x83, 0x08, 0xe5, 0x10, 0x94, 0x00, 0x6b, + 0x00, 0x8e, 0x00, 0x6b, 0x08, 0x8f, 0x00, 0x6a, 0xa1, 0x9b, 0x00, 0x6a, + 0xa9, 0x8e, 0x00, 0x6b, 0xeb, 0x03, 0x44, 0x86, 0x90, 0x00, 0x6b, 0xdb, + 0x03, 0x44, 0x8d, 0xc2, 0x01, 0xa3, 0x00, 0x6b, 0xe1, 0x8d, 0x00, 0x6b, + 0xf8, 0xc2, 0x00, 0xd0, 0x08, 0x8b, 0x09, 0x83, 0x08, 0x8b, 0x00, 0xc2, + 0x00, 0xd0, 0x08, 0x8a, 0xf9, 0x83, 0x08, 0x8a, 0xf0, 0xc4, 0x57, 0xbc, + 0x0e, 0x8f, 0x51, 0x46, 0xd1, 0x8d, 0x43, 0x44, 0x91, 0xc3, 0x01, 0x69, + 0x0e, 0x8f, 0x49, 0xc8, 0xb7, 0x7a, 0x0e, 0x8e, 0xb3, 0x03, 0x44, 0xb7, + 0x46, 0x1f, 0x87, 0xc3, 0x44, 0xbd, 0x07, 0xc3, 0x44, 0xc7, 0xc5, 0xd9, + 0xb1, 0x0e, 0x8c, 0x69, 0x0b, 0xc3, 0x44, 0xd3, 0x0a, 0x43, 0x44, 0xdd, + 0x07, 0xc3, 0x44, 0xe9, 0x11, 0xc3, 0x44, 0xf5, 0xc4, 0xdf, 0xe7, 0x0e, + 0x8c, 0x79, 0xd3, 0x42, 0x1c, 0x0e, 0x8a, 0xb1, 0xcc, 0x81, 0x75, 0x0e, + 0x8a, 0x20, 0xc7, 0xc8, 0x46, 0x0e, 0x8e, 0xc3, 0x03, 0x45, 0x04, 0x46, + 0xce, 0xcf, 0xc3, 0x45, 0x0a, 0xc3, 0x05, 0x9f, 0x0e, 0x8c, 0xbb, 0x03, + 0x45, 0x16, 0x94, 0x0e, 0x8c, 0xb3, 0x03, 0x45, 0x1a, 0x0a, 0xc3, 0x45, + 0x20, 0xcd, 0x79, 0xd0, 0x0e, 0x88, 0xb8, 0x0e, 0xc3, 0x45, 0x2c, 0x14, + 0xc3, 0x45, 0x36, 0x11, 0xc3, 0x45, 0x42, 0xd0, 0x5c, 0x22, 0x0e, 0x8a, + 0x29, 0xc7, 0xc8, 0x4d, 0x0e, 0x89, 0xa9, 0xc5, 0xac, 0x87, 0x0e, 0x89, + 0x09, 0xc6, 0xd3, 0x1f, 0x0e, 0x88, 0x98, 0xc4, 0x01, 0x2e, 0x0e, 0x8e, + 0x99, 0xcc, 0x8b, 0x89, 0x0e, 0x8a, 0xb8, 0x14, 0xc3, 0x45, 0x4c, 0x49, + 0xad, 0xad, 0xc3, 0x45, 0x58, 0xc5, 0xac, 0x87, 0x0e, 0x88, 0xf2, 0x03, + 0x45, 0x64, 0xc5, 0xc3, 0x54, 0x0e, 0x8d, 0xdb, 0x03, 0x45, 0x6a, 0xc5, + 0xc0, 0x9e, 0x0e, 0x8d, 0xb1, 0xc4, 0xe0, 0x2f, 0x0e, 0x8c, 0x81, 0x4d, + 0x7a, 0x1e, 0xc3, 0x45, 0x6e, 0x44, 0x1f, 0x19, 0x43, 0x45, 0x7a, 0x14, + 0xc3, 0x45, 0x86, 0x45, 0x3f, 0x0e, 0x43, 0x45, 0x90, 0xc4, 0xcb, 0x41, + 0x0e, 0x8d, 
0xbb, 0x03, 0x45, 0xa8, 0xcf, 0x65, 0x76, 0x0e, 0x88, 0x30, + 0x44, 0xa1, 0xbe, 0xc3, 0x45, 0xac, 0x11, 0xc3, 0x45, 0xb8, 0x0b, 0xc3, + 0x45, 0xc4, 0x44, 0xb3, 0xb1, 0xc3, 0x45, 0xce, 0xc5, 0xac, 0x87, 0x0e, + 0x89, 0x13, 0x03, 0x45, 0xda, 0xc6, 0xcf, 0xef, 0x0e, 0x88, 0x82, 0x03, + 0x45, 0xe0, 0x03, 0xc3, 0x45, 0xe6, 0x07, 0xc3, 0x46, 0x01, 0x46, 0x00, + 0x59, 0xc3, 0x46, 0x0d, 0x49, 0xac, 0x84, 0x43, 0x46, 0x1f, 0xcf, 0x68, + 0xa0, 0x0e, 0x8d, 0x99, 0x45, 0xa6, 0x7b, 0x43, 0x46, 0x27, 0x43, 0x01, + 0xd0, 0xc3, 0x46, 0x33, 0xc9, 0xb4, 0x9a, 0x0e, 0x8d, 0x30, 0x43, 0x02, + 0x9c, 0xc3, 0x46, 0x45, 0x46, 0x06, 0xdc, 0x43, 0x46, 0x63, 0xca, 0xa3, + 0xbe, 0x0e, 0x8d, 0x39, 0xcc, 0x81, 0xb1, 0x0e, 0x8a, 0xc9, 0xcd, 0x77, + 0xae, 0x0e, 0x8a, 0xc1, 0x47, 0x83, 0xf2, 0x43, 0x46, 0x6f, 0x4f, 0x63, + 0x3c, 0xc3, 0x46, 0x7b, 0x42, 0x02, 0x6f, 0xc3, 0x46, 0xa2, 0x46, 0xb7, + 0xd4, 0x43, 0x46, 0xae, 0x0b, 0xc3, 0x46, 0xba, 0x07, 0x43, 0x46, 0xc6, + 0xc4, 0x03, 0xc8, 0x0e, 0x8c, 0x21, 0xc2, 0x02, 0xae, 0x0e, 0x8c, 0x18, + 0x46, 0x15, 0x04, 0xc3, 0x46, 0xd2, 0x4b, 0x90, 0x02, 0x43, 0x46, 0xe4, + 0x43, 0x03, 0x35, 0xc3, 0x46, 0xf0, 0x45, 0x00, 0x8c, 0x43, 0x47, 0x08, + 0x9f, 0x00, 0x84, 0x59, 0xa0, 0x00, 0x84, 0x60, 0xc2, 0x00, 0xd0, 0x05, + 0x53, 0x71, 0x83, 0x05, 0x53, 0x68, 0x83, 0x05, 0x53, 0x59, 0xc2, 0x19, + 0x2c, 0x05, 0x53, 0x28, 0xc2, 0x00, 0xd0, 0x05, 0x53, 0x51, 0x06, 0x43, + 0x47, 0x14, 0xc2, 0x00, 0xd0, 0x05, 0x53, 0x39, 0x83, 0x05, 0x53, 0x30, + 0xc2, 0x00, 0xd0, 0x05, 0x53, 0x21, 0x83, 0x05, 0x53, 0x18, 0xc2, 0x00, + 0xd0, 0x05, 0x53, 0x11, 0x83, 0x05, 0x53, 0x08, 0xc2, 0x00, 0xd0, 0x05, + 0x4f, 0xf1, 0x83, 0x05, 0x4f, 0xe8, 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0xe1, + 0x83, 0x05, 0x4f, 0xd9, 0x06, 0x43, 0x47, 0x1e, 0xc2, 0x00, 0xc1, 0x05, + 0x4f, 0x79, 0xc2, 0x19, 0x2c, 0x05, 0x4f, 0x38, 0xc2, 0x00, 0xd0, 0x05, + 0x4f, 0x61, 0x83, 0x05, 0x4f, 0x58, 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0x51, + 0x83, 0x05, 0x4f, 0x48, 0x04, 0xc3, 0x47, 0x28, 0x10, 0xc3, 0x47, 0x32, + 0xc3, 0xe5, 0xf0, 0x05, 0x4f, 0x11, 0x83, 0x00, 0x81, 0x11, 0x0d, 0xc3, + 0x47, 0x42, 0x09, 0xc3, 0x47, 0x4c, 0x05, 0xc3, 0x47, 0x56, 0xc2, 0x02, + 0x1c, 0x00, 0x83, 0xc9, 0xc2, 0x0e, 0x9a, 0x00, 0x83, 0xd9, 0xc3, 0x17, + 0xb2, 0x00, 0x83, 0xe9, 0xc2, 0x00, 0x87, 0x00, 0x83, 0xf1, 0xc3, 0x00, + 0xcf, 0x00, 0x84, 0x01, 0xc2, 0x00, 0xd0, 0x00, 0x84, 0x08, 0x97, 0x01, + 0x8f, 0xa0, 0x91, 0x0d, 0x8b, 0x31, 0x87, 0x0d, 0x8b, 0x29, 0x8b, 0x0d, + 0x8b, 0x21, 0x83, 0x01, 0x87, 0x70, 0x97, 0x01, 0x86, 0x19, 0x91, 0x01, + 0x8f, 0x98, 0x83, 0x01, 0x87, 0x19, 0x97, 0x01, 0x87, 0x29, 0x91, 0x01, + 0x87, 0x38, 0x83, 0x01, 0x87, 0xa9, 0x87, 0x01, 0x87, 0xb1, 0x97, 0x01, + 0x8f, 0x80, 0x8b, 0x01, 0x8f, 0x89, 0x97, 0x01, 0x8f, 0x90, 0x83, 0x01, + 0x8f, 0xa9, 0x8b, 0x01, 0x8f, 0xb1, 0x97, 0x01, 0x8f, 0xb9, 0x87, 0x01, + 0x8f, 0xc1, 0x91, 0x01, 0x8f, 0xc8, 0x83, 0x01, 0x8f, 0xd9, 0x8b, 0x01, + 0x8f, 0xe1, 0x97, 0x01, 0x8f, 0xe9, 0x87, 0x01, 0x8f, 0xf1, 0x91, 0x01, + 0x8f, 0xf8, 0x87, 0x0d, 0x89, 0x09, 0x8b, 0x0d, 0x89, 0x00, 0x4f, 0x60, + 0x3f, 0xc3, 0x47, 0x60, 0x45, 0x28, 0xb1, 0x43, 0x47, 0x7c, 0x94, 0x00, + 0x64, 0x5b, 0x03, 0x47, 0x94, 0x8e, 0x00, 0x64, 0x62, 0x03, 0x47, 0x98, + 0xcb, 0x90, 0x44, 0x00, 0x66, 0xe8, 0x83, 0x00, 0x64, 0xf9, 0xc2, 0x00, + 0xd0, 0x00, 0x65, 0x00, 0x83, 0x00, 0x65, 0x09, 0xc2, 0x00, 0xd0, 0x00, + 0x65, 0x10, 0x83, 0x00, 0x65, 0x99, 0xc2, 0x00, 0xdb, 0x00, 0x66, 0xf0, + 0xc4, 0x14, 0xdd, 0x01, 0x7d, 0x81, 0x88, 0x01, 0x7d, 0xa0, 0x44, 0x00, + 0xde, 0x43, 0x47, 0x9c, 0x8a, 0x01, 0x7b, 0x59, 0xc8, 0x92, 0xfa, 0x01, + 0x7d, 0x20, 
0xc2, 0x01, 0xe2, 0x01, 0x78, 0x19, 0xc2, 0x00, 0x5f, 0x01, + 0x7d, 0x50, 0xc2, 0x00, 0xb1, 0x01, 0x7b, 0x69, 0xc3, 0x5f, 0x44, 0x01, + 0x7c, 0xa0, 0x44, 0xdf, 0x4b, 0xc3, 0x47, 0xa8, 0xc2, 0x01, 0xbb, 0x01, + 0x79, 0xb8, 0xc2, 0x02, 0x37, 0x01, 0x7b, 0xd1, 0xc2, 0x02, 0xa7, 0x01, + 0x7c, 0xc8, 0x92, 0x01, 0x79, 0xd9, 0xc2, 0x00, 0xc2, 0x01, 0x7a, 0x98, + 0x92, 0x01, 0x7a, 0x63, 0x03, 0x47, 0xb4, 0xc2, 0x02, 0x6f, 0x01, 0x7b, + 0x78, 0x90, 0x01, 0x7c, 0x99, 0xc2, 0x00, 0x40, 0x01, 0x7d, 0xd0, 0xc2, + 0x00, 0x61, 0x01, 0x79, 0xe1, 0x86, 0x01, 0x7d, 0xc0, 0xc4, 0xe3, 0x23, + 0x01, 0x79, 0xe9, 0xcc, 0x70, 0x8a, 0x01, 0x7a, 0xc8, 0xc2, 0x00, 0x8e, + 0x01, 0x78, 0xe9, 0x10, 0x43, 0x47, 0xba, 0xc3, 0x0e, 0x6b, 0x01, 0x7c, + 0x29, 0xc4, 0x03, 0x0e, 0x01, 0x7d, 0x00, 0xc2, 0x00, 0x8e, 0x01, 0x78, + 0xf8, 0x90, 0x01, 0x7a, 0x91, 0x99, 0x01, 0x7a, 0xb0, 0xca, 0x63, 0x9a, + 0x01, 0x7c, 0x78, 0x44, 0x23, 0x70, 0xc3, 0x47, 0xc4, 0x43, 0x71, 0xed, + 0x43, 0x47, 0xd0, 0x44, 0xdf, 0x37, 0xc3, 0x47, 0xdc, 0x43, 0x93, 0x74, + 0x43, 0x47, 0xe8, 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xd9, 0xc4, 0xe0, 0xaf, + 0x00, 0xcf, 0x58, 0x04, 0xc3, 0x47, 0xf4, 0x44, 0x71, 0xec, 0xc3, 0x48, + 0x00, 0x45, 0xda, 0x97, 0x43, 0x48, 0x0c, 0xc3, 0x38, 0x5b, 0x00, 0xcf, + 0xa9, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x28, 0x02, 0x43, 0x48, 0x18, 0xce, + 0x2a, 0xfe, 0x0f, 0xd0, 0xa9, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xf8, 0xd2, + 0x4a, 0x2d, 0x0f, 0xd0, 0x41, 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0xc9, 0xdf, + 0x0d, 0x00, 0x0f, 0xd0, 0xe9, 0x16, 0x43, 0x48, 0x28, 0xc7, 0x7a, 0x7f, + 0x08, 0xa2, 0x39, 0xc7, 0x14, 0x39, 0x08, 0xa2, 0x20, 0xc5, 0x40, 0xe7, + 0x08, 0xa2, 0x29, 0xc4, 0x1e, 0x97, 0x08, 0xa2, 0x10, 0x8e, 0x08, 0xa0, + 0x48, 0x94, 0x08, 0xa0, 0x38, 0x89, 0x00, 0xce, 0x10, 0xc2, 0x00, 0xe4, + 0x00, 0xcd, 0x59, 0x83, 0x00, 0xcc, 0x60, 0xc2, 0x02, 0x41, 0x00, 0xcd, + 0x49, 0x83, 0x00, 0xcc, 0x30, 0xc2, 0x02, 0x41, 0x00, 0xcd, 0x41, 0x83, + 0x00, 0xcc, 0x28, 0xc2, 0x00, 0xd0, 0x00, 0xcc, 0xc1, 0x83, 0x00, 0xcc, + 0xb8, 0x83, 0x00, 0xcc, 0x99, 0xc2, 0x01, 0x30, 0x00, 0xcc, 0x38, 0xc2, + 0x00, 0xd0, 0x00, 0xcc, 0x91, 0x83, 0x00, 0xcc, 0x89, 0xc2, 0x0d, 0xf6, + 0x00, 0xcc, 0x58, 0xc2, 0x00, 0xe4, 0x00, 0xcd, 0x51, 0x83, 0x00, 0xcc, + 0x48, 0xc2, 0x02, 0x41, 0x00, 0xcd, 0x39, 0x83, 0x00, 0xcc, 0x18, 0xc2, + 0x02, 0x41, 0x00, 0xcd, 0x31, 0x83, 0x00, 0xcc, 0x10, 0xc2, 0x00, 0xd0, + 0x00, 0xcc, 0xa9, 0x83, 0x00, 0xcc, 0xa0, 0x83, 0x00, 0xcc, 0x81, 0xc2, + 0x01, 0x30, 0x00, 0xcc, 0x20, 0xc2, 0x00, 0xd0, 0x00, 0xcc, 0x79, 0x83, + 0x00, 0xcc, 0x71, 0xc2, 0x0d, 0xf6, 0x00, 0xcc, 0x40, 0x9b, 0x00, 0xcd, + 0xf8, 0x9b, 0x00, 0xcd, 0xf0, 0x9b, 0x00, 0xcd, 0xd8, 0xc3, 0x18, 0x13, + 0x01, 0x27, 0xa1, 0xc3, 0x22, 0x45, 0x01, 0x27, 0x60, 0x00, 0x43, 0x48, + 0x34, 0x00, 0x43, 0x48, 0x46, 0xc7, 0x08, 0x79, 0x05, 0x41, 0x81, 0xc4, + 0x01, 0xce, 0x05, 0x41, 0x89, 0xc9, 0x67, 0x38, 0x05, 0x41, 0x99, 0xc6, + 0x06, 0xdb, 0x05, 0x41, 0xa0, 0xc8, 0x08, 0x79, 0x05, 0x41, 0x91, 0xca, + 0xa7, 0x88, 0x05, 0x41, 0xa8, 0xc2, 0x02, 0xe0, 0x0f, 0x3f, 0xf1, 0x8b, + 0x0f, 0x3f, 0xe8, 0xc2, 0x02, 0xe0, 0x0f, 0x3f, 0xe1, 0x8b, 0x0f, 0x3f, + 0xd8, 0x87, 0x0f, 0x3f, 0xd3, 0x03, 0x48, 0x5e, 0x8b, 0x0f, 0x3f, 0xc0, + 0x87, 0x0f, 0x3f, 0xbb, 0x03, 0x48, 0x62, 0x8b, 0x0f, 0x3f, 0xa8, 0xc2, + 0x02, 0xe0, 0x0f, 0x3f, 0xa1, 0x8b, 0x0f, 0x3f, 0x98, 0x87, 0x0f, 0x3f, + 0x93, 0x03, 0x48, 0x66, 0x8b, 0x0f, 0x3f, 0x80, 0xc2, 0x02, 0xe0, 0x0f, + 0x3f, 0x71, 0x8b, 0x0f, 0x3f, 0x68, 0x83, 0x00, 0x98, 0xf8, 0x87, 0x01, + 0x6c, 0xa8, 0x87, 0x0f, 0x3f, 0x50, 0x87, 0x0f, 0x3f, 0x20, 0x83, 0x0f, + 0x3f, 0x18, 
0x91, 0x05, 0x59, 0x31, 0x87, 0x05, 0x59, 0x2b, 0x03, 0x48, + 0x6a, 0x83, 0x05, 0x59, 0x03, 0x03, 0x48, 0x6e, 0x8b, 0x05, 0x59, 0x11, + 0x97, 0x05, 0x59, 0x08, 0x83, 0x01, 0x6d, 0xd8, 0x87, 0x01, 0x6d, 0xe0, + 0x87, 0x05, 0x58, 0x60, 0x83, 0x00, 0x92, 0xd8, 0x87, 0x00, 0x92, 0xe0, + 0x83, 0x00, 0x96, 0x18, 0x87, 0x00, 0x96, 0x20, 0x83, 0x00, 0x96, 0x83, + 0x03, 0x48, 0x72, 0x97, 0x00, 0x96, 0x89, 0x8b, 0x00, 0x96, 0x91, 0x87, + 0x00, 0x96, 0xab, 0x03, 0x48, 0x76, 0x91, 0x00, 0x96, 0xb0, 0xd1, 0x50, + 0xbd, 0x01, 0x4f, 0x20, 0xd0, 0x03, 0xb7, 0x01, 0x4b, 0x89, 0xce, 0x33, + 0x92, 0x01, 0x53, 0x99, 0xc9, 0x60, 0xf3, 0x01, 0x53, 0x89, 0xcf, 0x09, + 0xf8, 0x01, 0x5a, 0x00, 0xe0, 0x04, 0xe7, 0x01, 0x53, 0xb8, 0xa1, 0x0e, + 0x92, 0x09, 0xa0, 0x0e, 0x92, 0x01, 0x9f, 0x0e, 0x91, 0xf9, 0x9e, 0x0e, + 0x91, 0xf1, 0x9d, 0x0e, 0x91, 0xe8, 0xa6, 0x0e, 0x91, 0xe1, 0xa5, 0x0e, + 0x91, 0xd9, 0xa4, 0x0e, 0x91, 0xd1, 0xa2, 0x0e, 0x91, 0xc9, 0xa0, 0x0e, + 0x91, 0xc1, 0x9f, 0x0e, 0x91, 0xb9, 0x9d, 0x0e, 0x91, 0xb0, 0xa6, 0x0e, + 0x91, 0xa9, 0xa5, 0x0e, 0x91, 0xa1, 0xa4, 0x0e, 0x91, 0x99, 0xa3, 0x0e, + 0x91, 0x91, 0x9f, 0x0e, 0x91, 0x89, 0x9d, 0x0e, 0x91, 0x80, 0xa6, 0x0e, + 0x91, 0x79, 0xa4, 0x0e, 0x91, 0x71, 0xa3, 0x0e, 0x91, 0x69, 0xa2, 0x0e, + 0x91, 0x61, 0xa1, 0x0e, 0x91, 0x59, 0xa0, 0x0e, 0x91, 0x50, 0xa6, 0x0e, + 0x91, 0x49, 0xa5, 0x0e, 0x91, 0x41, 0xa4, 0x0e, 0x91, 0x39, 0xa1, 0x0e, + 0x91, 0x31, 0xa0, 0x0e, 0x91, 0x29, 0x9f, 0x0e, 0x91, 0x21, 0x9e, 0x0e, + 0x91, 0x18, 0xa1, 0x0e, 0x90, 0xe1, 0xa0, 0x0e, 0x90, 0xd9, 0x9f, 0x0e, + 0x90, 0xd1, 0x9e, 0x0e, 0x90, 0xc9, 0x9d, 0x0e, 0x90, 0xc0, 0xa1, 0x0e, + 0x90, 0xb9, 0xa0, 0x0e, 0x90, 0xb1, 0x9f, 0x0e, 0x90, 0xa9, 0x9e, 0x0e, + 0x90, 0xa1, 0x9d, 0x0e, 0x90, 0x98, 0xa6, 0x0e, 0x90, 0x91, 0xa5, 0x0e, + 0x90, 0x89, 0xa4, 0x0e, 0x90, 0x81, 0xa3, 0x0e, 0x90, 0x79, 0xa2, 0x0e, + 0x90, 0x71, 0xa1, 0x0e, 0x90, 0x69, 0xa0, 0x0e, 0x90, 0x61, 0x9f, 0x0e, + 0x90, 0x59, 0x9e, 0x0e, 0x90, 0x51, 0x9d, 0x0e, 0x90, 0x48, 0xcb, 0x94, + 0x90, 0x00, 0xfe, 0xf9, 0xc4, 0xe3, 0xab, 0x00, 0xfe, 0xf1, 0xc5, 0x28, + 0x47, 0x00, 0xfe, 0xe8, 0xc4, 0xe3, 0xab, 0x00, 0xff, 0x71, 0xc5, 0x28, + 0x47, 0x00, 0xff, 0x69, 0xcb, 0x94, 0x90, 0x00, 0xfe, 0x08, 0xcf, 0x6b, + 0x25, 0x08, 0x0b, 0xb0, 0x42, 0x00, 0x7a, 0xc3, 0x48, 0x7a, 0xc3, 0x79, + 0xe7, 0x00, 0x1d, 0x0b, 0x03, 0x48, 0x8c, 0xc7, 0x78, 0x4a, 0x00, 0x1d, + 0x2b, 0x03, 0x48, 0x92, 0xc4, 0x29, 0xc6, 0x00, 0x1c, 0xcb, 0x03, 0x48, + 0x98, 0x07, 0xc3, 0x48, 0x9e, 0x03, 0xc3, 0x48, 0xb0, 0xc4, 0x89, 0xfe, + 0x00, 0x1b, 0x81, 0x12, 0xc3, 0x48, 0xbf, 0xc3, 0xe5, 0xb4, 0x00, 0x1b, + 0xf9, 0xc4, 0x93, 0xa9, 0x00, 0x1c, 0x91, 0xc5, 0x51, 0x51, 0x00, 0x1c, + 0x99, 0xc5, 0xdb, 0x4b, 0x00, 0x1c, 0xa1, 0xc4, 0xde, 0x9b, 0x00, 0x1c, + 0xb1, 0x16, 0xc3, 0x48, 0xd5, 0xc5, 0x8b, 0x65, 0x00, 0x1c, 0xd1, 0xc5, + 0xdd, 0x99, 0x00, 0x1c, 0xd9, 0xc2, 0x14, 0x48, 0x00, 0x1c, 0xe1, 0xc2, + 0x06, 0xc6, 0x00, 0x1c, 0xe9, 0xc2, 0x07, 0x49, 0x00, 0x1c, 0xf1, 0x15, + 0xc3, 0x48, 0xe1, 0xc3, 0x11, 0xee, 0x00, 0x1d, 0x38, 0x42, 0x00, 0x7a, + 0xc3, 0x48, 0xf3, 0xc7, 0x78, 0x4a, 0x00, 0x1e, 0x2b, 0x03, 0x49, 0x05, + 0xc3, 0x79, 0xe7, 0x00, 0x1e, 0x0b, 0x03, 0x49, 0x0b, 0xc4, 0x29, 0xc6, + 0x00, 0x1d, 0xcb, 0x03, 0x49, 0x11, 0x07, 0xc3, 0x49, 0x17, 0x03, 0xc3, + 0x49, 0x29, 0xc4, 0x89, 0xfe, 0x00, 0x1b, 0x89, 0xc4, 0x93, 0xa9, 0x00, + 0x1d, 0x91, 0xc5, 0x51, 0x51, 0x00, 0x1d, 0x99, 0x06, 0xc3, 0x49, 0x38, + 0xc4, 0xde, 0x9b, 0x00, 0x1d, 0xb1, 0x16, 0xc3, 0x49, 0x44, 0x0d, 0xc3, + 0x49, 0x50, 0xc5, 0xdd, 0x99, 0x00, 0x1d, 0xd9, 0xc2, 0x14, 0x48, 0x00, + 0x1d, 0xe1, 
0xc2, 0x06, 0xc6, 0x00, 0x1d, 0xe9, 0xc2, 0x07, 0x49, 0x00, + 0x1d, 0xf1, 0x12, 0xc3, 0x49, 0x5c, 0xcb, 0x91, 0x2b, 0x00, 0x1e, 0x11, + 0x15, 0xc3, 0x49, 0x72, 0xc3, 0x11, 0xee, 0x00, 0x1e, 0x38, 0xd3, 0x1a, + 0x6b, 0x00, 0x1b, 0xd9, 0xda, 0x1a, 0x64, 0x00, 0x1b, 0xe8, 0xcb, 0x94, + 0x90, 0x00, 0xfe, 0x79, 0xc4, 0xe3, 0xab, 0x00, 0xfe, 0x71, 0xc5, 0x28, + 0x47, 0x00, 0xfe, 0x68, 0x4d, 0x37, 0xb4, 0xc3, 0x49, 0x88, 0xc5, 0xd6, + 0xe6, 0x00, 0x1e, 0xd1, 0xc4, 0x87, 0xf5, 0x00, 0x1f, 0x00, 0xcd, 0x7f, + 0xc1, 0x08, 0x0b, 0xc1, 0xca, 0x71, 0x88, 0x08, 0x0b, 0xf0, 0x44, 0x05, + 0x14, 0xc3, 0x49, 0xa4, 0x42, 0x02, 0x09, 0xc3, 0x49, 0xba, 0x44, 0x57, + 0x1d, 0x43, 0x49, 0xcc, 0xd1, 0x52, 0x88, 0x08, 0x0a, 0xc1, 0x48, 0xb9, + 0xaa, 0x43, 0x49, 0xdc, 0x48, 0xbd, 0x62, 0xc3, 0x49, 0xee, 0x4a, 0x9f, + 0xea, 0x43, 0x4a, 0x01, 0xc3, 0x02, 0x9f, 0x08, 0x0a, 0xdb, 0x03, 0x4a, + 0x10, 0xcc, 0x37, 0x61, 0x08, 0x0b, 0x60, 0xd4, 0x3d, 0xf4, 0x08, 0x0a, + 0xe9, 0xd5, 0x37, 0x58, 0x08, 0x0b, 0x78, 0xc6, 0x0e, 0xe0, 0x01, 0x54, + 0x01, 0xc5, 0x00, 0xd4, 0x01, 0x54, 0x12, 0x03, 0x4a, 0x16, 0xc8, 0x23, + 0xa0, 0x01, 0x54, 0x71, 0xcf, 0x02, 0x78, 0x01, 0x54, 0x80, 0xe0, 0x00, + 0xc7, 0x01, 0x54, 0xa0, 0x8e, 0x08, 0x9b, 0x08, 0x94, 0x08, 0x9b, 0x00, + 0xc6, 0x42, 0xd4, 0x00, 0xe5, 0xf0, 0xc6, 0x42, 0xd4, 0x00, 0x87, 0xf0, + 0x97, 0x01, 0x60, 0xf9, 0x8b, 0x01, 0x61, 0x00, 0xc3, 0x87, 0xc2, 0x01, + 0x61, 0x60, 0x97, 0x01, 0x62, 0x79, 0x8b, 0x01, 0x62, 0x80, 0xc3, 0x87, + 0xc2, 0x01, 0x62, 0xe0, 0x94, 0x00, 0x5b, 0x00, 0x8e, 0x00, 0x5b, 0x08, + 0xc7, 0x0d, 0x04, 0x0f, 0x68, 0xa9, 0xc8, 0x4b, 0x94, 0x0f, 0x68, 0xf0, + 0xc7, 0x0d, 0x04, 0x0f, 0x68, 0xa1, 0xc8, 0x4b, 0x94, 0x0f, 0x68, 0xe8, + 0xc7, 0x0d, 0x04, 0x0f, 0x68, 0xb1, 0xc8, 0x4b, 0x94, 0x0f, 0x68, 0xf8, + 0xc7, 0x0d, 0x04, 0x0f, 0x68, 0xb9, 0xc8, 0x4b, 0x94, 0x0f, 0x69, 0x00, + 0xc4, 0xdc, 0x2d, 0x08, 0x7b, 0xd9, 0xc3, 0x77, 0x79, 0x08, 0x7b, 0xe8, + 0xc8, 0x0d, 0x03, 0x08, 0x79, 0x28, 0x0a, 0xc3, 0x4a, 0x1c, 0x19, 0xc3, + 0x4a, 0x28, 0xc2, 0x00, 0xc4, 0x08, 0x79, 0x10, 0xc3, 0x0d, 0x14, 0x08, + 0x79, 0x09, 0xc3, 0x09, 0x9e, 0x08, 0x79, 0x00, 0x46, 0x26, 0xf7, 0xc3, + 0x4a, 0x32, 0xc3, 0xb5, 0x3e, 0x08, 0x78, 0xd1, 0x15, 0xc3, 0x4a, 0x5f, + 0xd0, 0x5d, 0xe2, 0x08, 0x78, 0xc1, 0xc2, 0x00, 0x67, 0x08, 0x78, 0xa1, + 0x03, 0xc3, 0x4a, 0x69, 0xc3, 0x20, 0x18, 0x08, 0x78, 0x71, 0xc3, 0x00, + 0x4e, 0x08, 0x78, 0x69, 0xc6, 0xcf, 0xd7, 0x08, 0x78, 0x61, 0xc4, 0xe0, + 0xe7, 0x08, 0x78, 0x59, 0xc4, 0x4a, 0xb9, 0x08, 0x78, 0x51, 0xc2, 0x01, + 0x7f, 0x08, 0x78, 0x2b, 0x03, 0x4a, 0x73, 0xc5, 0x4a, 0xb3, 0x08, 0x78, + 0x41, 0xc3, 0x7e, 0x89, 0x08, 0x78, 0x39, 0xc5, 0x9c, 0xa2, 0x08, 0x78, + 0x21, 0xc4, 0xe3, 0x27, 0x08, 0x78, 0x10, 0xc5, 0x45, 0x69, 0x08, 0x53, + 0xf1, 0xc3, 0x05, 0x14, 0x08, 0x53, 0xe8, 0x0a, 0xc3, 0x4a, 0x79, 0xc3, + 0x1e, 0x1b, 0x08, 0x53, 0xb9, 0xc2, 0x39, 0x8b, 0x08, 0x53, 0x48, 0x42, + 0x00, 0xd0, 0xc3, 0x4a, 0x85, 0xc5, 0x40, 0x9b, 0x08, 0x53, 0xa8, 0xc4, + 0xdf, 0xc3, 0x08, 0x53, 0xb1, 0xc4, 0x9c, 0xa3, 0x08, 0x53, 0xa0, 0xc3, + 0x11, 0xef, 0x08, 0x53, 0x31, 0x03, 0x43, 0x4a, 0x91, 0xc2, 0x00, 0x8e, + 0x08, 0x53, 0x10, 0xc3, 0x00, 0xb6, 0x08, 0x53, 0x59, 0xc4, 0x9b, 0x90, + 0x08, 0x53, 0x68, 0xc3, 0x00, 0x49, 0x08, 0x53, 0x89, 0xc2, 0x17, 0xb6, + 0x08, 0x53, 0x90, 0xc7, 0x0d, 0x04, 0x08, 0x67, 0xf1, 0xc8, 0x4b, 0x94, + 0x08, 0x67, 0xf8, 0x96, 0x08, 0x67, 0x3b, 0x03, 0x4a, 0xa1, 0x9b, 0x08, + 0x66, 0xd1, 0x85, 0x08, 0x66, 0x28, 0x95, 0x08, 0x67, 0x80, 0x8a, 0x08, + 0x67, 0x49, 0x95, 0x08, 0x66, 0x30, 0x9b, 0x08, 0x67, 0x40, 0x9c, 0x08, + 0x67, 0x28, 
0x92, 0x08, 0x67, 0x08, 0x9b, 0x08, 0x66, 0xb8, 0x9b, 0x08, + 0x66, 0x70, 0x96, 0x08, 0x65, 0x3b, 0x03, 0x4a, 0xa7, 0x9b, 0x08, 0x64, + 0xd1, 0x85, 0x08, 0x64, 0x28, 0x9b, 0x08, 0x65, 0x40, 0x9c, 0x08, 0x65, + 0x28, 0x92, 0x08, 0x65, 0x08, 0x9b, 0x08, 0x64, 0xb8, 0x9b, 0x08, 0x64, + 0x70, 0x95, 0x08, 0x64, 0x31, 0x8a, 0x08, 0x65, 0x48, 0x95, 0x08, 0x65, + 0x80, 0x8d, 0x08, 0x60, 0xe0, 0x96, 0x08, 0x62, 0x29, 0x95, 0x08, 0x61, + 0xf1, 0x94, 0x08, 0x61, 0xe1, 0x90, 0x08, 0x61, 0x21, 0x8e, 0x08, 0x61, + 0x01, 0x8d, 0x08, 0x60, 0xd1, 0x9b, 0x08, 0x60, 0xc1, 0x86, 0x08, 0x60, + 0x99, 0x89, 0x08, 0x60, 0x79, 0x84, 0x08, 0x60, 0x58, 0x8a, 0x08, 0x61, + 0xf8, 0x85, 0x08, 0x61, 0x41, 0x96, 0x08, 0x61, 0x31, 0x9b, 0x08, 0x61, + 0x51, 0x89, 0x08, 0x61, 0x68, 0x96, 0x08, 0x62, 0x31, 0x90, 0x08, 0x61, + 0x2b, 0x03, 0x4a, 0xad, 0x8d, 0x08, 0x60, 0xd9, 0x9b, 0x08, 0x60, 0xc9, + 0x89, 0x08, 0x60, 0x81, 0x84, 0x08, 0x60, 0x60, 0x96, 0x08, 0x61, 0x39, + 0x85, 0x08, 0x61, 0x49, 0x9b, 0x08, 0x61, 0x58, 0x8d, 0x08, 0x60, 0xe8, + 0xc2, 0x16, 0x1c, 0x08, 0x54, 0xd9, 0xc2, 0x00, 0x65, 0x08, 0x54, 0xc8, + 0x83, 0x08, 0x1d, 0x03, 0x03, 0x4a, 0xb1, 0x8b, 0x08, 0x1d, 0x09, 0x97, + 0x08, 0x1d, 0x11, 0x0d, 0xc3, 0x4a, 0xba, 0x09, 0xc3, 0x4a, 0xc2, 0x1a, + 0xc3, 0x4a, 0xca, 0xc2, 0x00, 0x64, 0x08, 0x1d, 0x41, 0x0c, 0xc3, 0x4a, + 0xd4, 0x16, 0xc3, 0x4a, 0xdc, 0x06, 0xc3, 0x4a, 0xea, 0xc2, 0x00, 0xb0, + 0x08, 0x1d, 0x89, 0x04, 0xc3, 0x4a, 0xf9, 0xc2, 0x00, 0x87, 0x08, 0x1d, + 0x99, 0x10, 0xc3, 0x4b, 0x06, 0x0f, 0xc3, 0x4b, 0x0e, 0xc2, 0x19, 0x2c, + 0x08, 0x1d, 0xc9, 0x18, 0xc3, 0x4b, 0x1a, 0x14, 0xc3, 0x4b, 0x22, 0xc2, + 0x00, 0xdb, 0x08, 0x1d, 0xf1, 0x15, 0xc3, 0x4b, 0x2a, 0xc2, 0x02, 0x1c, + 0x08, 0x1e, 0x01, 0xc2, 0x00, 0xd0, 0x08, 0x1e, 0x18, 0xc3, 0x05, 0x14, + 0x08, 0x1e, 0x89, 0x16, 0xc3, 0x4b, 0x3a, 0xc7, 0x0d, 0x04, 0x08, 0x1e, + 0xa8, 0xc3, 0xd3, 0x4c, 0x08, 0x1a, 0xb1, 0xc3, 0x02, 0x44, 0x08, 0x1a, + 0xc0, 0xc3, 0xc1, 0x4b, 0x08, 0x1b, 0x29, 0xc5, 0xdc, 0xf4, 0x08, 0x1b, + 0x30, 0x97, 0x08, 0x1b, 0x41, 0x8b, 0x08, 0x1b, 0x80, 0x96, 0x08, 0x1b, + 0x88, 0x8a, 0x08, 0x18, 0x71, 0x95, 0x08, 0x18, 0xf8, 0x95, 0x08, 0x18, + 0xd8, 0xce, 0x69, 0xa0, 0x0e, 0x7d, 0xa1, 0xc8, 0x4e, 0x4b, 0x0e, 0x7d, + 0x98, 0xc7, 0x4e, 0x43, 0x0e, 0x7d, 0xab, 0x03, 0x4b, 0x44, 0xc7, 0xa6, + 0x73, 0x0e, 0x7c, 0xa0, 0xce, 0x69, 0xa0, 0x0e, 0x7c, 0xc9, 0xc9, 0x92, + 0x8d, 0x0e, 0x7c, 0xc0, 0xc9, 0xac, 0xd5, 0x0e, 0x7d, 0x71, 0xc9, 0x92, + 0x8d, 0x0e, 0x7d, 0x69, 0xc8, 0xbc, 0xa2, 0x0e, 0x7d, 0x60, 0xca, 0xa6, + 0x70, 0x0e, 0x7d, 0x2b, 0x03, 0x4b, 0x48, 0xc9, 0x92, 0x8d, 0x0e, 0x7d, + 0x1a, 0x03, 0x4b, 0x4e, 0xd6, 0x2d, 0x0a, 0x0e, 0x7d, 0x00, 0xc9, 0x92, + 0x8d, 0x0e, 0x7c, 0xeb, 0x03, 0x4b, 0x54, 0xca, 0xa6, 0x70, 0x0e, 0x7c, + 0xe0, 0xcc, 0x87, 0x39, 0x0e, 0x7c, 0xf0, 0xc7, 0x92, 0x8f, 0x0e, 0x7c, + 0xb1, 0xcb, 0x92, 0x8b, 0x0e, 0x7c, 0xa8, 0xc8, 0x94, 0x9e, 0x0e, 0x7c, + 0x3b, 0x03, 0x4b, 0x5a, 0xd0, 0x5d, 0xb2, 0x0e, 0x7c, 0x71, 0xc5, 0xd4, + 0xca, 0x0e, 0x7c, 0x69, 0xc7, 0x78, 0xdb, 0x0e, 0x7c, 0x42, 0x03, 0x4b, + 0x60, 0xcb, 0x95, 0x56, 0x0e, 0x7c, 0x60, 0xc6, 0x78, 0xdc, 0x0e, 0x78, + 0xd9, 0x4b, 0x8e, 0xfa, 0x43, 0x4b, 0x66, 0xc5, 0x00, 0x2c, 0x0e, 0x78, + 0xa9, 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x48, 0xc8, 0xbc, 0x4a, 0x05, 0x4c, + 0x58, 0xc5, 0x00, 0x2c, 0x01, 0x2c, 0xe1, 0xc4, 0x00, 0x49, 0x01, 0x2c, + 0xd8, 0xc5, 0x00, 0x2c, 0x01, 0x2c, 0xd1, 0xd4, 0x3d, 0x54, 0x01, 0x2c, + 0xc8, 0x92, 0x05, 0x22, 0xa1, 0x9a, 0x05, 0x22, 0x90, 0x92, 0x05, 0x22, + 0x89, 0x9a, 0x05, 0x22, 0x79, 0x96, 0x05, 0x22, 0x70, 0x9a, 0x05, 0x22, + 0x40, 0x9a, 
0x05, 0x22, 0x10, 0x9a, 0x05, 0x21, 0xc8, 0x92, 0x05, 0x21, + 0xc1, 0x9a, 0x05, 0x21, 0xb1, 0x96, 0x05, 0x21, 0xa8, 0x9a, 0x05, 0x1d, + 0x48, 0x9a, 0x05, 0x1d, 0x18, 0x9a, 0x05, 0x17, 0x89, 0x92, 0x05, 0x17, + 0x98, 0x9a, 0x05, 0x17, 0xc0, 0x9a, 0x05, 0x18, 0x08, 0x9a, 0x05, 0x18, + 0x38, 0x9a, 0x05, 0x03, 0xd1, 0x92, 0x05, 0x03, 0xe0, 0x9a, 0x05, 0x04, + 0x48, 0x9a, 0x05, 0x04, 0x78, 0x9a, 0x05, 0x0a, 0xa8, 0x9a, 0x05, 0x0b, + 0x30, 0x9a, 0x05, 0x21, 0x58, 0x92, 0x05, 0x21, 0x11, 0x9a, 0x05, 0x21, + 0x00, 0x92, 0x05, 0x20, 0xf9, 0x9a, 0x05, 0x20, 0xe9, 0x96, 0x05, 0x20, + 0xe0, 0x9a, 0x05, 0x1c, 0x90, 0x9a, 0x05, 0x1c, 0x60, 0x9a, 0x05, 0x1b, + 0xf0, 0x9a, 0x05, 0x1e, 0x20, 0x9a, 0x05, 0x1d, 0xf0, 0x92, 0x05, 0x1d, + 0x89, 0x9a, 0x05, 0x1d, 0x78, 0x9a, 0x05, 0x1a, 0x20, 0x9a, 0x05, 0x19, + 0x71, 0x92, 0x05, 0x19, 0x80, 0x9a, 0x05, 0x1b, 0xd0, 0x9a, 0x05, 0x1b, + 0xa0, 0x92, 0x05, 0x1b, 0x41, 0x9a, 0x05, 0x1b, 0x31, 0x96, 0x05, 0x1b, + 0x28, 0x92, 0x05, 0x16, 0xb9, 0x9a, 0x05, 0x16, 0xa9, 0x96, 0x05, 0x16, + 0xa0, 0x9a, 0x05, 0x17, 0x28, 0x9a, 0x05, 0x17, 0x58, 0x9a, 0x05, 0x1a, + 0xf8, 0x9a, 0x05, 0x1a, 0xc8, 0x9a, 0x05, 0x1a, 0x51, 0x92, 0x05, 0x1a, + 0x60, 0x96, 0x05, 0x12, 0x51, 0x9a, 0x05, 0x12, 0x59, 0x92, 0x05, 0x12, + 0x68, 0x9a, 0x05, 0x04, 0xa9, 0x92, 0x05, 0x04, 0xb8, 0x9a, 0x05, 0x04, + 0xe1, 0x92, 0x05, 0x04, 0xf0, 0x9a, 0x05, 0x05, 0x38, 0x9a, 0x05, 0x05, + 0x60, 0x96, 0x05, 0x0b, 0x61, 0x9a, 0x05, 0x0b, 0x69, 0x92, 0x05, 0x0b, + 0x78, 0x9a, 0x05, 0x0b, 0xa0, 0x9a, 0x05, 0x0c, 0xd9, 0x92, 0x05, 0x0c, + 0xe8, 0x9a, 0x05, 0x0d, 0x11, 0x92, 0x05, 0x0d, 0x20, 0x9a, 0x05, 0x0d, + 0x78, 0x9a, 0x05, 0x0d, 0xa8, 0x9a, 0x05, 0x12, 0x20, 0x9a, 0x05, 0x11, + 0xb1, 0x92, 0x05, 0x11, 0xc0, 0x96, 0x05, 0x02, 0xd1, 0x9a, 0x05, 0x02, + 0xd9, 0x92, 0x05, 0x02, 0xe8, 0x9a, 0x05, 0x03, 0x11, 0x92, 0x05, 0x03, + 0x20, 0x9a, 0x05, 0x03, 0x80, 0x9a, 0x05, 0x09, 0xd1, 0x92, 0x05, 0x09, + 0xe0, 0x9a, 0x05, 0x0a, 0x09, 0x92, 0x05, 0x0a, 0x18, 0x9a, 0x05, 0x0a, + 0x78, 0x9a, 0x05, 0x10, 0xb9, 0x92, 0x05, 0x10, 0xc8, 0x96, 0x05, 0x10, + 0xf1, 0x9a, 0x05, 0x10, 0xf9, 0x92, 0x05, 0x11, 0x08, 0x9a, 0x05, 0x11, + 0x70, 0x97, 0x00, 0xb0, 0xab, 0x03, 0x4b, 0x72, 0x8b, 0x00, 0xb0, 0xd0, + 0x91, 0x00, 0xae, 0x13, 0x03, 0x4b, 0x76, 0x83, 0x00, 0xae, 0x19, 0x8b, + 0x00, 0xae, 0x09, 0x87, 0x00, 0xae, 0x00, 0x91, 0x00, 0xac, 0xcb, 0x03, + 0x4b, 0x7a, 0xc2, 0x00, 0x28, 0x00, 0xc7, 0x51, 0x83, 0x00, 0xac, 0xd1, + 0x8b, 0x00, 0xac, 0xc1, 0x87, 0x00, 0xac, 0xb8, 0x83, 0x08, 0xd5, 0xd8, + 0x91, 0x08, 0xd5, 0xc8, 0x8b, 0x08, 0xd5, 0xb8, 0x83, 0x08, 0xd5, 0xa8, + 0x91, 0x08, 0xd5, 0x98, 0x8b, 0x08, 0xd5, 0x88, 0x83, 0x00, 0xa8, 0x70, + 0x10, 0xc3, 0x4b, 0x7e, 0x87, 0x00, 0xa2, 0x98, 0x83, 0x00, 0xb1, 0x69, + 0x8b, 0x00, 0xb1, 0x61, 0x87, 0x00, 0xb1, 0x53, 0x03, 0x4b, 0x8a, 0x91, + 0x00, 0xb1, 0x49, 0x97, 0x00, 0xb1, 0x40, 0x97, 0x00, 0xb2, 0x41, 0x91, + 0x00, 0xb2, 0x49, 0x87, 0x00, 0xb2, 0x53, 0x03, 0x4b, 0x8e, 0x8b, 0x00, + 0xb2, 0x61, 0x83, 0x00, 0xb2, 0x68, 0x87, 0x00, 0xb0, 0xc0, 0x97, 0x00, + 0xb0, 0xe1, 0x91, 0x00, 0xb0, 0xe9, 0x87, 0x00, 0xb0, 0xf3, 0x03, 0x4b, + 0x92, 0x8b, 0x00, 0xb1, 0x01, 0x83, 0x00, 0xb1, 0x08, 0x83, 0x00, 0xc7, + 0x81, 0x97, 0x00, 0xc7, 0x68, 0x83, 0x00, 0xc7, 0x78, 0x87, 0x00, 0xaf, + 0x90, 0x83, 0x00, 0xae, 0x49, 0x8b, 0x00, 0xae, 0x41, 0x87, 0x00, 0xae, + 0x33, 0x03, 0x4b, 0x96, 0x91, 0x00, 0xae, 0x29, 0x97, 0x00, 0xae, 0x20, + 0x15, 0xc3, 0x4b, 0x9a, 0x83, 0x00, 0xaf, 0x39, 0x8b, 0x00, 0xaf, 0x31, + 0x87, 0x00, 0xaf, 0x23, 0x03, 0x4b, 0xb1, 0x91, 0x00, 0xaf, 0x19, 0x97, + 0x00, 0xaf, 
0x10, 0x83, 0x00, 0xb3, 0x01, 0x8b, 0x00, 0xb2, 0xf9, 0x87, + 0x00, 0xb2, 0xeb, 0x03, 0x4b, 0xb5, 0x97, 0x00, 0xb2, 0xd9, 0x91, 0x00, + 0xb2, 0xe0, 0x83, 0x00, 0xaf, 0x09, 0x8b, 0x00, 0xaf, 0x01, 0x87, 0x00, + 0xae, 0xf3, 0x03, 0x4b, 0xb9, 0x91, 0x00, 0xae, 0xe9, 0x97, 0x00, 0xae, + 0xe0, 0x0a, 0xc3, 0x4b, 0xbd, 0x97, 0x00, 0xb1, 0xd1, 0x91, 0x00, 0xb1, + 0xd9, 0x87, 0x00, 0xb1, 0xe3, 0x03, 0x4b, 0xd4, 0x8b, 0x00, 0xb1, 0xf1, + 0x83, 0x00, 0xb1, 0xf8, 0x87, 0x00, 0xb3, 0x20, 0x87, 0x00, 0xb0, 0x88, + 0x87, 0x00, 0xb0, 0x58, 0x87, 0x00, 0xb0, 0x28, 0x83, 0x00, 0xb0, 0x01, + 0x8b, 0x00, 0xaf, 0xf9, 0x87, 0x00, 0xaf, 0xeb, 0x03, 0x4b, 0xd8, 0x91, + 0x00, 0xaf, 0xe1, 0x97, 0x00, 0xaf, 0xd8, 0x83, 0x00, 0xaf, 0xd1, 0x8b, + 0x00, 0xaf, 0xc9, 0x87, 0x00, 0xaf, 0xbb, 0x03, 0x4b, 0xdc, 0x91, 0x00, + 0xaf, 0xb1, 0x97, 0x00, 0xaf, 0xa8, 0x87, 0x00, 0xaf, 0x58, 0x83, 0x00, + 0xae, 0xd9, 0x8b, 0x00, 0xae, 0xd1, 0x87, 0x00, 0xae, 0xc3, 0x03, 0x4b, + 0xe0, 0x91, 0x00, 0xae, 0xb9, 0x97, 0x00, 0xae, 0xb0, 0x87, 0x00, 0xae, + 0x98, 0x87, 0x00, 0xae, 0x68, 0x83, 0x00, 0xb1, 0x99, 0x8b, 0x00, 0xb1, + 0x91, 0x87, 0x00, 0xb1, 0x83, 0x03, 0x4b, 0xe4, 0x91, 0x00, 0xb1, 0x79, + 0x97, 0x00, 0xb1, 0x70, 0x87, 0x00, 0xb1, 0x28, 0x87, 0x00, 0xb2, 0x18, + 0x87, 0x00, 0xb2, 0x88, 0x97, 0x00, 0xb2, 0xa1, 0x91, 0x00, 0xb2, 0xa9, + 0x87, 0x00, 0xb2, 0xb3, 0x03, 0x4b, 0xe8, 0x8b, 0x00, 0xb2, 0xc1, 0x83, + 0x00, 0xb2, 0xc8, 0x83, 0x00, 0xaa, 0x6b, 0x03, 0x4b, 0xec, 0x91, 0x00, + 0xaa, 0x53, 0x03, 0x4b, 0xf0, 0x87, 0x00, 0xaa, 0x21, 0x19, 0x43, 0x4b, + 0xf4, 0x83, 0x00, 0xac, 0x69, 0x91, 0x00, 0xac, 0x61, 0x8b, 0x00, 0xac, + 0x59, 0x87, 0x00, 0xac, 0x51, 0xc3, 0x14, 0x72, 0x00, 0xaa, 0x78, 0xc4, + 0xdf, 0xc7, 0x00, 0xab, 0x49, 0x19, 0x43, 0x4c, 0x0d, 0x19, 0x43, 0x4c, + 0x26, 0x42, 0x15, 0xa6, 0xc3, 0x4c, 0x3f, 0x19, 0x43, 0x4c, 0x58, 0x19, + 0x43, 0x4c, 0x71, 0x91, 0x00, 0xa4, 0xcb, 0x03, 0x4c, 0x8a, 0x8b, 0x00, + 0xa4, 0xab, 0x03, 0x4c, 0x8e, 0x87, 0x00, 0xa4, 0x99, 0x83, 0x00, 0xa4, + 0xea, 0x03, 0x4c, 0x92, 0x83, 0x00, 0xa0, 0xc3, 0x03, 0x4c, 0x96, 0x91, + 0x00, 0xa0, 0x9b, 0x03, 0x4c, 0x9a, 0x8b, 0x00, 0xa0, 0x7b, 0x03, 0x4c, + 0x9e, 0x87, 0x00, 0xa0, 0x68, 0x83, 0x00, 0xa3, 0xfb, 0x03, 0x4c, 0xa2, + 0x87, 0x00, 0xa3, 0xa9, 0x8b, 0x00, 0xa3, 0xbb, 0x03, 0x4c, 0xa6, 0x91, + 0x00, 0xa3, 0xda, 0x03, 0x4c, 0xaa, 0x19, 0x43, 0x4c, 0xae, 0x87, 0x00, + 0xa6, 0x51, 0x83, 0x00, 0xa6, 0x62, 0x03, 0x4c, 0xc7, 0x19, 0xc3, 0x4c, + 0xcb, 0x83, 0x00, 0xac, 0xf1, 0x91, 0x00, 0xac, 0xe9, 0x8b, 0x00, 0xac, + 0xe1, 0x87, 0x00, 0xac, 0xd8, 0xcd, 0x61, 0x8b, 0x00, 0xa1, 0x19, 0xc2, + 0x00, 0x75, 0x00, 0xa1, 0x20, 0xc5, 0x31, 0xee, 0x00, 0xa1, 0x29, 0xd6, + 0x2e, 0xee, 0x00, 0xa1, 0x30, 0x91, 0x00, 0xc6, 0x68, 0x8b, 0x00, 0xc6, + 0x48, 0x8b, 0x0f, 0x01, 0x01, 0x97, 0x0f, 0x00, 0xf8, 0xc8, 0xb5, 0x5a, + 0x0e, 0x92, 0x19, 0xc6, 0xcd, 0xe5, 0x0e, 0x92, 0x10, 0xc2, 0x00, 0xb0, + 0x08, 0x9b, 0xa1, 0xc2, 0x07, 0xb2, 0x08, 0x9b, 0x99, 0xc2, 0x00, 0xc1, + 0x08, 0x9b, 0x91, 0xc2, 0x02, 0x2b, 0x08, 0x9b, 0x89, 0x83, 0x08, 0x9b, + 0x80, 0xc3, 0x22, 0xcb, 0x08, 0x9b, 0x61, 0x08, 0xc3, 0x4c, 0xe6, 0x16, + 0xc3, 0x4c, 0xf2, 0xc3, 0x05, 0x14, 0x08, 0x9b, 0x39, 0xc4, 0x15, 0xe7, + 0x08, 0x9b, 0x30, 0xcb, 0x8e, 0xef, 0x00, 0xee, 0x41, 0xc6, 0x60, 0xb1, + 0x00, 0xee, 0x28, 0xc6, 0x09, 0x01, 0x00, 0x18, 0x03, 0x03, 0x4c, 0xfe, + 0xc9, 0x2b, 0x5f, 0x00, 0x1a, 0x00, 0x00, 0xc3, 0x4d, 0x04, 0x45, 0x03, + 0xe3, 0x43, 0x4d, 0x10, 0xcb, 0x95, 0xe5, 0x01, 0x06, 0x89, 0x48, 0xbc, + 0x3a, 0x43, 0x4d, 0x1a, 0xcb, 0x93, 0xe0, 0x00, 0xd6, 0x21, 0xcb, 0x92, + 0xe3, 0x00, 
0xd6, 0x10, 0x00, 0xc3, 0x4d, 0x26, 0x45, 0x03, 0xe3, 0x43, + 0x4d, 0x32, 0xc5, 0x00, 0xd4, 0x00, 0x18, 0xd1, 0xc5, 0x05, 0x02, 0x00, + 0x1a, 0x48, 0xc5, 0x05, 0x02, 0x00, 0x18, 0xe1, 0xc5, 0x00, 0xd4, 0x00, + 0x1a, 0x88, 0xc9, 0x20, 0xa8, 0x00, 0xef, 0xa1, 0xdb, 0x19, 0x11, 0x00, + 0xef, 0x80, 0xc9, 0x20, 0xa8, 0x00, 0xef, 0x99, 0xdb, 0x19, 0x11, 0x00, + 0xef, 0x68, 0xc7, 0xa6, 0x69, 0x00, 0xef, 0x19, 0xc5, 0x05, 0x02, 0x00, + 0xee, 0x50, 0x86, 0x00, 0xee, 0xc1, 0x96, 0x00, 0xd6, 0x71, 0x94, 0x00, + 0xd6, 0x69, 0x89, 0x00, 0xd6, 0x60, 0xce, 0x42, 0x34, 0x01, 0x07, 0x31, + 0x45, 0x02, 0x6d, 0x43, 0x4d, 0x3e, 0xc6, 0x05, 0x01, 0x00, 0xef, 0xe0, + 0x49, 0x60, 0xf4, 0xc3, 0x4d, 0x4a, 0xd0, 0x57, 0x92, 0x00, 0xd5, 0xe0, + 0xce, 0x6d, 0xf6, 0x00, 0xd5, 0xc1, 0xc7, 0x7d, 0xa5, 0x00, 0x19, 0xf8, + 0xc8, 0x65, 0xaa, 0x00, 0x1a, 0xd1, 0xd4, 0x3c, 0x64, 0x00, 0x1b, 0x10, + 0xc6, 0x05, 0x01, 0x00, 0x1a, 0xe0, 0xc6, 0x05, 0x01, 0x00, 0x1a, 0xf8, + 0x00, 0x43, 0x4d, 0x56, 0xc5, 0x00, 0x48, 0x00, 0xef, 0xd0, 0x00, 0x43, + 0x4d, 0x62, 0xc4, 0x18, 0x10, 0x05, 0x47, 0x39, 0xc2, 0x22, 0xcc, 0x05, + 0x47, 0x30, 0xc3, 0x0d, 0x14, 0x05, 0x47, 0x29, 0xc3, 0x09, 0x9e, 0x05, + 0x47, 0x20, 0xc4, 0x02, 0xde, 0x05, 0x47, 0x19, 0xc2, 0x02, 0xa0, 0x05, + 0x47, 0x10, 0xc9, 0x0f, 0x6e, 0x07, 0xf1, 0x71, 0xca, 0x09, 0xb7, 0x07, + 0xf1, 0x78, 0xc3, 0xe6, 0x62, 0x01, 0x6f, 0xa8, 0x87, 0x05, 0x34, 0xf9, + 0x83, 0x01, 0x6f, 0xe1, 0xc7, 0xc8, 0x00, 0x01, 0x6f, 0xf8, 0x83, 0x01, + 0x6f, 0x91, 0xc3, 0x1c, 0x63, 0x01, 0x6f, 0x98, 0xc6, 0x05, 0x01, 0x00, + 0x19, 0x78, 0xc3, 0x03, 0x0c, 0x01, 0x65, 0xa9, 0xc3, 0xb8, 0xf8, 0x01, + 0x65, 0xf9, 0x42, 0x01, 0xe2, 0xc3, 0x4d, 0x6e, 0xc3, 0x26, 0x1a, 0x01, + 0x66, 0x39, 0x0a, 0xc3, 0x4d, 0x7a, 0xc6, 0xd0, 0x3d, 0x01, 0x66, 0xb9, + 0xc3, 0xe5, 0x24, 0x01, 0x66, 0xc8, 0xc5, 0xda, 0x9c, 0x01, 0x66, 0xe9, + 0x10, 0xc3, 0x4d, 0x8d, 0xc3, 0xe4, 0xf4, 0x01, 0x67, 0x18, 0xc3, 0x03, + 0x0c, 0x01, 0x65, 0xa1, 0xc3, 0xb8, 0xf8, 0x01, 0x65, 0xf1, 0x42, 0x01, + 0xe2, 0xc3, 0x4d, 0x99, 0xc3, 0x26, 0x1a, 0x01, 0x66, 0x31, 0x0a, 0xc3, + 0x4d, 0xa5, 0xc6, 0xd0, 0x3d, 0x01, 0x66, 0xb1, 0xc3, 0xe5, 0x24, 0x01, + 0x66, 0xc0, 0xc5, 0xda, 0x9c, 0x01, 0x66, 0xe1, 0x10, 0xc3, 0x4d, 0xb8, + 0xc3, 0xe4, 0xf4, 0x01, 0x67, 0x10, 0x46, 0x00, 0x8b, 0x43, 0x4d, 0xc4, + 0xc2, 0x00, 0xd3, 0x01, 0x93, 0x70, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xc0, + 0xc2, 0x00, 0xd3, 0x01, 0x93, 0x80, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xc8, + 0xc2, 0x00, 0xd3, 0x01, 0x93, 0x98, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xd0, + 0x83, 0x01, 0x93, 0xa9, 0x97, 0x01, 0x93, 0xf0, 0xc2, 0x00, 0xd3, 0x01, + 0x93, 0xb0, 0xc2, 0x00, 0xd3, 0x01, 0x93, 0xb8, 0xc4, 0x18, 0x10, 0x01, + 0x23, 0x31, 0xc2, 0x22, 0xcc, 0x01, 0x23, 0x28, 0xc3, 0x0d, 0x14, 0x01, + 0x23, 0x21, 0xc3, 0x09, 0x9e, 0x01, 0x23, 0x18, 0xc4, 0x02, 0xde, 0x01, + 0x23, 0x11, 0xc2, 0x02, 0xa0, 0x01, 0x23, 0x08, 0x00, 0x43, 0x4d, 0xd0, + 0x00, 0x43, 0x4d, 0xee, 0xd0, 0x55, 0xa8, 0x01, 0x92, 0x60, 0x00, 0x43, + 0x4e, 0x0c, 0xc3, 0x18, 0x11, 0x01, 0x94, 0x31, 0xc4, 0xe3, 0x8b, 0x01, + 0x94, 0xc8, 0x90, 0x01, 0x94, 0x81, 0xc6, 0xd2, 0x8f, 0x01, 0x94, 0xe1, + 0xc7, 0xc8, 0x54, 0x01, 0x95, 0x60, 0xc3, 0x04, 0x20, 0x01, 0x94, 0x89, + 0xc3, 0xe5, 0x0f, 0x01, 0x95, 0x58, 0xc2, 0x00, 0x5f, 0x01, 0x94, 0x21, + 0xc2, 0x01, 0x19, 0x01, 0x94, 0x59, 0xc7, 0xc4, 0xf0, 0x01, 0x94, 0xb0, + 0xc2, 0x02, 0x6f, 0x01, 0x94, 0x41, 0xc3, 0x00, 0x2e, 0x01, 0x95, 0x80, + 0xc3, 0x01, 0x6f, 0x01, 0x94, 0x71, 0xc6, 0xca, 0xc1, 0x01, 0x95, 0x48, + 0xcc, 0x7b, 0x3d, 0x01, 0x94, 0xb9, 0xc2, 0x18, 0x8b, 0x01, 0x95, 0x11, + 0xc5, 0xc7, 
0xc8, 0x01, 0x95, 0x18, 0x15, 0xc3, 0x4e, 0x2a, 0xc6, 0xce, + 0x75, 0x01, 0x95, 0x50, 0x17, 0xc3, 0x4e, 0x34, 0xc6, 0xcd, 0x79, 0x09, + 0x29, 0xf8, 0xc4, 0xe1, 0x9f, 0x09, 0x29, 0xf1, 0xc2, 0x05, 0x1d, 0x09, + 0x19, 0xd8, 0xc4, 0xdc, 0xae, 0x09, 0x1a, 0x71, 0x86, 0x09, 0x1a, 0x69, + 0xc9, 0xab, 0x25, 0x09, 0x1a, 0x60, 0xc3, 0x69, 0x97, 0x09, 0x1a, 0x51, + 0xc2, 0x01, 0x7f, 0x09, 0x1a, 0x48, 0xc2, 0x01, 0xe2, 0x09, 0x1a, 0x21, + 0x8f, 0x09, 0x1a, 0x19, 0xc2, 0x04, 0x2b, 0x09, 0x1a, 0x10, 0x97, 0x09, + 0x1a, 0x01, 0x83, 0x09, 0x19, 0xe2, 0x03, 0x4e, 0x3c, 0xc5, 0xcb, 0x88, + 0x09, 0x19, 0xc8, 0x17, 0xc3, 0x4e, 0x4a, 0xc3, 0x20, 0x18, 0x09, 0x19, + 0x81, 0xc2, 0x00, 0xd0, 0x09, 0x19, 0x79, 0x03, 0x43, 0x4e, 0x55, 0xc5, + 0x39, 0xc7, 0x09, 0x18, 0xc0, 0x97, 0x09, 0x17, 0xb9, 0x87, 0x09, 0x17, + 0xb0, 0xe0, 0x04, 0x47, 0x09, 0x17, 0x88, 0xda, 0x1a, 0xe6, 0x09, 0x18, + 0x20, 0xcb, 0x8d, 0xc6, 0x09, 0x29, 0xb9, 0xcc, 0x84, 0x21, 0x09, 0x29, + 0xb0, 0xc3, 0x40, 0xe7, 0x09, 0x29, 0xa9, 0xc4, 0xe3, 0xa3, 0x09, 0x29, + 0xa1, 0xc4, 0xc5, 0xa3, 0x09, 0x29, 0x98, 0x00, 0x43, 0x4e, 0x5f, 0x97, + 0x09, 0x15, 0xab, 0x03, 0x4e, 0x6b, 0xc3, 0x05, 0x9e, 0x09, 0x15, 0xa1, + 0xc4, 0x5d, 0xd2, 0x09, 0x15, 0x99, 0xc2, 0x02, 0x6f, 0x09, 0x15, 0x91, + 0xc4, 0x38, 0xa9, 0x09, 0x15, 0x89, 0xc3, 0x62, 0x19, 0x09, 0x15, 0x81, + 0x83, 0x09, 0x15, 0x78, 0xd6, 0x2b, 0xd6, 0x09, 0x16, 0xa9, 0xc4, 0x58, + 0xf5, 0x09, 0x16, 0xa0, 0xc3, 0x13, 0x51, 0x09, 0x16, 0x89, 0xc3, 0x49, + 0x41, 0x09, 0x16, 0x81, 0xc3, 0x65, 0x57, 0x09, 0x16, 0x79, 0xc6, 0xd0, + 0x97, 0x09, 0x16, 0x71, 0xc3, 0x04, 0x2a, 0x09, 0x16, 0x63, 0x03, 0x4e, + 0x71, 0xc3, 0x1a, 0xf4, 0x09, 0x16, 0x59, 0xc3, 0x03, 0x30, 0x09, 0x16, + 0x51, 0x04, 0xc3, 0x4e, 0x77, 0x83, 0x09, 0x16, 0x38, 0xc2, 0x03, 0x4e, + 0x09, 0x16, 0x29, 0x83, 0x09, 0x16, 0x20, 0x42, 0x01, 0x6f, 0xc3, 0x4e, + 0x83, 0x15, 0xc3, 0x4e, 0x8d, 0xc2, 0x00, 0xc4, 0x09, 0x29, 0x71, 0xc8, + 0x6a, 0x1e, 0x09, 0x1c, 0xb1, 0x17, 0xc3, 0x4e, 0x97, 0xc3, 0x20, 0x18, + 0x09, 0x14, 0xf1, 0xc2, 0x02, 0x2f, 0x09, 0x14, 0xe9, 0xc3, 0x81, 0xc8, + 0x09, 0x14, 0xe1, 0x0d, 0xc3, 0x4e, 0xad, 0xc2, 0x00, 0xd0, 0x09, 0x14, + 0xc9, 0xc2, 0x05, 0xc3, 0x09, 0x14, 0xbb, 0x03, 0x4e, 0xb9, 0x83, 0x09, + 0x14, 0xb0, 0xc9, 0xa9, 0xab, 0x09, 0x29, 0x68, 0x97, 0x09, 0x29, 0x53, + 0x03, 0x4e, 0xbd, 0xcc, 0x36, 0x5c, 0x09, 0x29, 0x49, 0x0f, 0xc3, 0x4e, + 0xd5, 0xc7, 0xc9, 0x26, 0x09, 0x29, 0x39, 0xc5, 0xdd, 0x0d, 0x09, 0x29, + 0x31, 0xc2, 0x00, 0x0a, 0x09, 0x29, 0x29, 0x09, 0xc3, 0x4e, 0xe1, 0xc8, + 0xb9, 0x4a, 0x09, 0x29, 0x11, 0xc3, 0x15, 0x2e, 0x09, 0x1c, 0x89, 0xc3, + 0x04, 0x65, 0x09, 0x12, 0xd3, 0x03, 0x4e, 0xec, 0x10, 0xc3, 0x4e, 0xf2, + 0x03, 0x43, 0x4e, 0xfc, 0xcf, 0x68, 0xcd, 0x09, 0x13, 0xc3, 0x03, 0x4f, + 0x09, 0x4a, 0xa4, 0xa4, 0x43, 0x4f, 0x0f, 0xd1, 0x56, 0xea, 0x09, 0x13, + 0x60, 0xc3, 0x5d, 0xd1, 0x09, 0x13, 0x41, 0xc3, 0x13, 0x51, 0x09, 0x13, + 0x33, 0x03, 0x4f, 0x4b, 0xc4, 0x4a, 0x0f, 0x09, 0x13, 0x29, 0xc3, 0x1a, + 0xf4, 0x09, 0x13, 0x20, 0x47, 0x03, 0x4c, 0x43, 0x4f, 0x51, 0xc2, 0x02, + 0x1c, 0x09, 0x11, 0xa9, 0xc3, 0x51, 0xdb, 0x09, 0x11, 0xa1, 0x83, 0x09, + 0x11, 0x98, 0x46, 0x03, 0x4d, 0xc3, 0x4f, 0x63, 0xc4, 0x39, 0xc8, 0x09, + 0x11, 0xe8, 0x45, 0x03, 0x4e, 0xc3, 0x4f, 0x76, 0xc3, 0x58, 0xf6, 0x09, + 0x10, 0x88, 0xc6, 0x6c, 0xd1, 0x09, 0x10, 0xab, 0x03, 0x4f, 0xc6, 0xc6, + 0x0b, 0x0a, 0x09, 0x10, 0xa0, 0xcd, 0x7c, 0x0c, 0x09, 0x10, 0xc9, 0xc9, + 0xb2, 0x3f, 0x09, 0x10, 0xc0, 0x47, 0x03, 0x4c, 0x43, 0x4f, 0xcc, 0x47, + 0x03, 0x4c, 0x43, 0x4f, 0xf7, 0xa2, 0x09, 0x27, 0xf1, 0xa0, 0x09, 0x27, + 0xe9, 0x9f, 
0x09, 0x27, 0xe1, 0x9d, 0x09, 0x27, 0xd8, 0xa4, 0x09, 0x27, + 0xc1, 0x9d, 0x09, 0x27, 0xb8, 0xa6, 0x09, 0x27, 0x8b, 0x03, 0x50, 0x1d, + 0x9e, 0x09, 0x27, 0x80, 0xa1, 0x09, 0x27, 0x71, 0xa0, 0x09, 0x27, 0x68, + 0xa5, 0x09, 0x27, 0x61, 0xa4, 0x09, 0x27, 0x59, 0xa0, 0x09, 0x27, 0x50, + 0xa3, 0x09, 0x27, 0x49, 0xa2, 0x09, 0x27, 0x40, 0xa5, 0x09, 0x27, 0x31, + 0xa2, 0x09, 0x27, 0x29, 0x9d, 0x09, 0x27, 0x20, 0xa6, 0x09, 0x27, 0x19, + 0x9d, 0x09, 0x27, 0x10, 0xce, 0x71, 0x3e, 0x09, 0x26, 0xf1, 0x9d, 0x09, + 0x26, 0xe8, 0x9e, 0x09, 0x26, 0xd1, 0x9d, 0x09, 0x26, 0xc8, 0xa2, 0x09, + 0x26, 0xb9, 0x9e, 0x09, 0x26, 0xb0, 0x46, 0x03, 0x4d, 0xc3, 0x50, 0x23, + 0xc7, 0x0b, 0x09, 0x09, 0x0f, 0x58, 0xc4, 0x39, 0xc8, 0x09, 0x0f, 0x7b, + 0x03, 0x50, 0x6d, 0xc9, 0xa6, 0x49, 0x09, 0x0f, 0x6a, 0x03, 0x50, 0x73, + 0x9f, 0x09, 0x1c, 0x38, 0x8d, 0x09, 0x0b, 0x78, 0x86, 0x09, 0x0b, 0x88, + 0x94, 0x09, 0x0a, 0xf1, 0xc3, 0x03, 0x47, 0x09, 0x0a, 0xe9, 0x86, 0x09, + 0x0a, 0xe0, 0x97, 0x09, 0x0c, 0x1b, 0x03, 0x50, 0x79, 0xc2, 0x02, 0xfb, + 0x09, 0x0c, 0x11, 0x87, 0x09, 0x0c, 0x09, 0x83, 0x09, 0x0c, 0x00, 0x94, + 0x09, 0x0b, 0xf8, 0x8f, 0x09, 0x1c, 0x18, 0x86, 0x09, 0x1c, 0x09, 0xc2, + 0xe6, 0x97, 0x09, 0x0b, 0x60, 0xc2, 0x01, 0xe2, 0x09, 0x1c, 0x03, 0x03, + 0x50, 0x7d, 0xc2, 0x38, 0x6a, 0x09, 0x0b, 0x40, 0x94, 0x09, 0x0b, 0x2b, + 0x03, 0x50, 0x81, 0xc7, 0x5d, 0x9b, 0x09, 0x0b, 0x21, 0x8e, 0x09, 0x0b, + 0x18, 0xa0, 0x09, 0x1b, 0xf9, 0x9f, 0x09, 0x0a, 0xd8, 0xc9, 0xaa, 0xf8, + 0x09, 0x0a, 0xd0, 0xcb, 0x97, 0xdf, 0x09, 0x0b, 0xc8, 0x46, 0x25, 0xd4, + 0x43, 0x50, 0x87, 0xe0, 0x03, 0x47, 0x09, 0x0c, 0xf0, 0xc3, 0x51, 0xdb, + 0x09, 0x09, 0x01, 0xca, 0xa3, 0x82, 0x09, 0x08, 0xf8, 0xc8, 0x6a, 0x1e, + 0x09, 0x26, 0x61, 0xcd, 0x79, 0x68, 0x09, 0x08, 0xe1, 0xc3, 0x20, 0x18, + 0x09, 0x08, 0xd9, 0xc3, 0x32, 0xbf, 0x09, 0x08, 0xca, 0x03, 0x50, 0x99, + 0x16, 0xc3, 0x50, 0x9f, 0xcd, 0x47, 0xaa, 0x09, 0x08, 0x90, 0xc2, 0x00, + 0xb0, 0x09, 0x08, 0x79, 0xcb, 0x92, 0x12, 0x09, 0x08, 0x71, 0xc3, 0x04, + 0x2a, 0x09, 0x08, 0x69, 0xc9, 0x5d, 0x99, 0x09, 0x08, 0x61, 0xca, 0xa3, + 0xb4, 0x09, 0x08, 0x58, 0xc4, 0xde, 0xe7, 0x09, 0x26, 0x41, 0x15, 0xc3, + 0x50, 0xab, 0x10, 0xc3, 0x50, 0xb9, 0x0f, 0xc3, 0x50, 0xc9, 0x0e, 0xc3, + 0x50, 0xd9, 0x0d, 0xc3, 0x50, 0xe6, 0x0a, 0xc3, 0x50, 0xf7, 0x09, 0xc3, + 0x51, 0x07, 0x07, 0xc3, 0x51, 0x15, 0x06, 0xc3, 0x51, 0x29, 0x04, 0xc3, + 0x51, 0x38, 0x03, 0xc3, 0x51, 0x45, 0x97, 0x09, 0x07, 0x53, 0x03, 0x51, + 0x61, 0xc4, 0x38, 0xb4, 0x09, 0x07, 0x49, 0xc2, 0x00, 0xb0, 0x09, 0x07, + 0x11, 0x0b, 0x43, 0x51, 0x68, 0xcd, 0x79, 0xc3, 0x09, 0x07, 0xd1, 0xc9, + 0xaf, 0x78, 0x09, 0x07, 0xc9, 0xc4, 0x58, 0xf5, 0x09, 0x07, 0xc0, 0x97, + 0x09, 0x25, 0xa9, 0xc2, 0x01, 0x7f, 0x09, 0x1b, 0xc0, 0x86, 0x09, 0x05, + 0xa1, 0x9f, 0x09, 0x05, 0x98, 0x97, 0x09, 0x05, 0x91, 0x8b, 0x09, 0x05, + 0x89, 0x83, 0x09, 0x05, 0x7a, 0x03, 0x51, 0x74, 0xc2, 0x36, 0x6f, 0x09, + 0x05, 0x71, 0xc5, 0x45, 0xae, 0x09, 0x05, 0x62, 0x03, 0x51, 0x7a, 0xc5, + 0x39, 0xc7, 0x09, 0x05, 0x50, 0xc5, 0x39, 0xc7, 0x09, 0x05, 0x40, 0x90, + 0x09, 0x05, 0x29, 0xc9, 0xaa, 0xef, 0x09, 0x05, 0x1a, 0x03, 0x51, 0x80, + 0x95, 0x09, 0x25, 0x98, 0x8e, 0x09, 0x25, 0x88, 0xc5, 0x58, 0xf4, 0x09, + 0x04, 0xc8, 0xc6, 0x6a, 0x20, 0x09, 0x25, 0x41, 0xc2, 0x01, 0x7f, 0x09, + 0x25, 0x38, 0x8b, 0x09, 0x25, 0x21, 0xc2, 0x00, 0xcb, 0x09, 0x25, 0x19, + 0xc3, 0x02, 0x2c, 0x09, 0x25, 0x10, 0xcc, 0x84, 0xed, 0x09, 0x25, 0x09, + 0x03, 0x43, 0x51, 0x86, 0x17, 0xc3, 0x51, 0x93, 0xc5, 0x45, 0xae, 0x09, + 0x24, 0xd0, 0x8b, 0x09, 0x24, 0xc1, 0x83, 0x09, 0x24, 0xb8, 0x8b, 0x09, + 0x24, 0xa3, 
0x03, 0x51, 0xa0, 0x83, 0x09, 0x24, 0x98, 0xc2, 0x05, 0x1d, + 0x09, 0x24, 0x89, 0xc2, 0x00, 0x74, 0x09, 0x24, 0x80, 0xc2, 0x01, 0xe2, + 0x09, 0x24, 0x73, 0x03, 0x51, 0xac, 0xc4, 0x99, 0xe3, 0x09, 0x24, 0x68, + 0xc5, 0x39, 0xc7, 0x09, 0x04, 0x38, 0x17, 0xc3, 0x51, 0xb2, 0xc4, 0x38, + 0xb4, 0x09, 0x03, 0x59, 0xc2, 0x00, 0xba, 0x09, 0x03, 0x51, 0xcc, 0x36, + 0x5c, 0x09, 0x03, 0x49, 0xc2, 0x02, 0x6f, 0x09, 0x03, 0x41, 0x0e, 0xc3, + 0x51, 0xbe, 0xc3, 0x32, 0xbf, 0x09, 0x03, 0x19, 0xc2, 0x01, 0x29, 0x09, + 0x03, 0x0b, 0x03, 0x51, 0xc9, 0xc2, 0x00, 0xd0, 0x09, 0x03, 0x01, 0x09, + 0xc3, 0x51, 0xcf, 0x04, 0xc3, 0x51, 0xe3, 0x03, 0x43, 0x51, 0xed, 0xc2, + 0x5d, 0xd4, 0x09, 0x24, 0x09, 0xc3, 0x26, 0x1a, 0x09, 0x00, 0x98, 0xc5, + 0x58, 0xf4, 0x09, 0x24, 0x00, 0xc3, 0x0f, 0xd6, 0x09, 0x00, 0x89, 0xc7, + 0x6a, 0x1f, 0x09, 0x00, 0x80, 0xc7, 0x5d, 0x9b, 0x09, 0x00, 0x71, 0x8e, + 0x09, 0x00, 0x68, 0xc8, 0x0d, 0x2d, 0x09, 0x01, 0xe3, 0x03, 0x51, 0xf9, + 0x16, 0x43, 0x51, 0xff, 0xce, 0x71, 0xca, 0x09, 0x14, 0x71, 0x46, 0x03, + 0x4d, 0x43, 0x52, 0x05, 0x9f, 0x09, 0x14, 0x40, 0x84, 0x09, 0x14, 0x30, + 0x97, 0x09, 0x14, 0x19, 0x8b, 0x09, 0x14, 0x10, 0x84, 0x09, 0x14, 0x08, + 0xe0, 0x04, 0x27, 0x09, 0x0a, 0x48, 0xca, 0xa5, 0x12, 0x00, 0x24, 0x58, + 0xc3, 0xe5, 0x3c, 0x00, 0x28, 0x39, 0xc2, 0x1c, 0x52, 0x00, 0x28, 0x19, + 0x87, 0x00, 0x28, 0x08, 0xc9, 0x20, 0xb1, 0x00, 0x27, 0xd8, 0xc3, 0x2d, + 0x1a, 0x05, 0x32, 0x99, 0x83, 0x05, 0x32, 0xb9, 0xd1, 0x51, 0x78, 0x05, + 0x32, 0xe9, 0x87, 0x00, 0x23, 0x29, 0xca, 0x51, 0x7f, 0x00, 0x23, 0x49, + 0xc7, 0xc8, 0x00, 0x00, 0x23, 0x68, 0x06, 0xc3, 0x52, 0x17, 0xc5, 0x1d, + 0x88, 0x00, 0x26, 0x10, 0xc8, 0x25, 0xfb, 0x00, 0x25, 0xb9, 0xc8, 0x20, + 0xa9, 0x00, 0x27, 0xa8, 0xca, 0xa5, 0x12, 0x00, 0x24, 0x50, 0xc3, 0xe5, + 0x3c, 0x00, 0x28, 0x31, 0xc2, 0x1c, 0x52, 0x00, 0x28, 0x11, 0x87, 0x00, + 0x28, 0x00, 0xc9, 0x20, 0xb1, 0x00, 0x27, 0xd0, 0xc8, 0x20, 0xa9, 0x00, + 0x27, 0xa1, 0xc8, 0x25, 0xfb, 0x00, 0x25, 0xb0, 0xc3, 0x2d, 0x1a, 0x05, + 0x32, 0x91, 0x83, 0x05, 0x32, 0xb1, 0xd1, 0x51, 0x78, 0x05, 0x32, 0xe1, + 0x87, 0x00, 0x23, 0x21, 0xca, 0x51, 0x7f, 0x00, 0x23, 0x41, 0xc7, 0xc8, + 0x00, 0x00, 0x23, 0x60, 0x06, 0xc3, 0x52, 0x23, 0xc5, 0x1d, 0x88, 0x00, + 0x26, 0x08, 0xc7, 0xc7, 0xeb, 0x00, 0x6d, 0x39, 0xc6, 0x8e, 0x9c, 0x00, + 0x6d, 0x68, 0xc7, 0xc4, 0x25, 0x00, 0x6d, 0x49, 0xc6, 0x8e, 0x9c, 0x00, + 0x6d, 0x78, 0xc7, 0xc6, 0x32, 0x00, 0x6c, 0xd9, 0xc7, 0xca, 0x29, 0x00, + 0x6c, 0xe9, 0xc7, 0xc7, 0xdd, 0x00, 0x6d, 0x09, 0xc7, 0xc7, 0xc1, 0x00, + 0x6d, 0x19, 0x16, 0xc3, 0x52, 0x2f, 0x06, 0xc3, 0x52, 0x3b, 0xc7, 0xc8, + 0x1c, 0x00, 0x6d, 0xa9, 0xc7, 0x8e, 0x9b, 0x00, 0x6d, 0xb8, 0xca, 0x63, + 0xc8, 0x00, 0x6e, 0xe1, 0xcf, 0x63, 0xc3, 0x00, 0x6e, 0xe9, 0xcb, 0x93, + 0x51, 0x00, 0x6e, 0xf0, 0x49, 0x20, 0x36, 0x43, 0x52, 0x47, 0x49, 0x20, + 0x36, 0x43, 0x52, 0x53, 0x49, 0x20, 0x36, 0x43, 0x52, 0x5f, 0x4c, 0x87, + 0x45, 0xc3, 0x52, 0x6b, 0x87, 0x0e, 0xcd, 0x20, 0x49, 0x20, 0x36, 0x43, + 0x52, 0x77, 0x49, 0x20, 0x36, 0x43, 0x52, 0x83, 0xc8, 0x3b, 0xec, 0x0e, + 0xc8, 0xf1, 0xc6, 0x24, 0x3b, 0x0e, 0xc8, 0xe0, 0xc4, 0x17, 0x93, 0x0e, + 0xd3, 0x2b, 0x03, 0x52, 0x8f, 0xc6, 0x5a, 0xfc, 0x0e, 0xd3, 0x1a, 0x03, + 0x52, 0x95, 0xcb, 0x57, 0x45, 0x0e, 0xcc, 0x31, 0xc6, 0x00, 0x58, 0x0e, + 0xcc, 0x29, 0xc6, 0x24, 0x3b, 0x0e, 0xcc, 0x20, 0xcb, 0x57, 0x45, 0x0e, + 0xcc, 0x19, 0xc6, 0x00, 0x58, 0x0e, 0xcc, 0x11, 0xc6, 0x24, 0x3b, 0x0e, + 0xcc, 0x08, 0xcb, 0x57, 0x45, 0x0e, 0xca, 0x81, 0xc6, 0x00, 0x58, 0x0e, + 0xca, 0x79, 0xc6, 0x24, 0x3b, 0x0e, 0xca, 0x70, 0xcb, 0x57, 0x45, 0x0e, + 0xca, 0x69, 
0xc6, 0x00, 0x58, 0x0e, 0xca, 0x61, 0xc6, 0x24, 0x3b, 0x0e, + 0xca, 0x58, 0xc7, 0x04, 0x12, 0x0e, 0xd1, 0x49, 0xc5, 0x19, 0x2f, 0x0e, + 0xd1, 0x38, 0x00, 0x43, 0x52, 0x9b, 0x00, 0x43, 0x52, 0xa7, 0x00, 0x43, + 0x52, 0xb3, 0x00, 0x43, 0x52, 0xe3, 0xc5, 0x06, 0x82, 0x0e, 0xc0, 0x2b, + 0x03, 0x53, 0x02, 0xd2, 0x13, 0x89, 0x0e, 0xc6, 0xa3, 0x03, 0x53, 0x06, + 0x45, 0x00, 0x9d, 0xc3, 0x53, 0x0a, 0x47, 0x13, 0x95, 0x43, 0x53, 0x16, + 0x00, 0x43, 0x53, 0x25, 0x00, 0x43, 0x53, 0x68, 0x92, 0x0e, 0xc3, 0x6b, + 0x03, 0x53, 0x80, 0xc6, 0xbc, 0x5c, 0x0e, 0xc3, 0xaa, 0x03, 0x53, 0x84, + 0x00, 0x43, 0x53, 0x88, 0x00, 0x43, 0x53, 0xa9, 0xcb, 0x13, 0x90, 0x0e, + 0xc5, 0x91, 0xc9, 0xad, 0x9b, 0x0e, 0xc4, 0xa9, 0x46, 0x0e, 0xce, 0xc3, + 0x53, 0xc4, 0xc8, 0xbc, 0x62, 0x0e, 0xc3, 0xc9, 0xd3, 0x46, 0x57, 0x0e, + 0xc2, 0xb1, 0xc5, 0x06, 0x82, 0x0e, 0xc0, 0x18, 0x4b, 0x40, 0xb3, 0xc3, + 0x53, 0xd0, 0x4a, 0x18, 0xa5, 0x43, 0x53, 0xdc, 0xc6, 0x00, 0x58, 0x0e, + 0xcf, 0xa1, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x98, 0xc6, 0x00, 0x58, 0x0e, + 0xcf, 0x81, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x78, 0xc5, 0x17, 0x14, 0x0e, + 0xce, 0xf1, 0x15, 0xc3, 0x53, 0xee, 0x48, 0x20, 0x37, 0x43, 0x53, 0xfa, + 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x61, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x48, + 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x59, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x40, + 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x51, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x38, + 0xca, 0x91, 0x42, 0x0e, 0xcb, 0x49, 0x49, 0x45, 0x27, 0x43, 0x54, 0x06, + 0x46, 0x20, 0xe5, 0xc3, 0x54, 0x1b, 0x48, 0xb7, 0x3a, 0x43, 0x54, 0x27, + 0x46, 0x20, 0xe5, 0xc3, 0x54, 0x33, 0x48, 0xb7, 0x3a, 0x43, 0x54, 0x45, + 0xc8, 0xbb, 0x22, 0x0e, 0xce, 0xc9, 0xc5, 0x17, 0x14, 0x0e, 0xce, 0xbb, + 0x03, 0x54, 0x51, 0xc6, 0x01, 0xdb, 0x0e, 0xce, 0xb1, 0xc5, 0x03, 0x13, + 0x0e, 0xce, 0xa9, 0x48, 0x20, 0x37, 0x43, 0x54, 0x57, 0xc5, 0x17, 0x14, + 0x0e, 0xcb, 0xb1, 0xc6, 0x01, 0xdb, 0x0e, 0xcb, 0xa9, 0xc5, 0x03, 0x13, + 0x0e, 0xcb, 0xa0, 0xc5, 0x17, 0x14, 0x0e, 0xcb, 0xd1, 0xc6, 0x01, 0xdb, + 0x0e, 0xcb, 0xc9, 0xc5, 0x03, 0x13, 0x0e, 0xcb, 0xc0, 0xca, 0x91, 0x42, + 0x0e, 0xcb, 0x91, 0xc8, 0x51, 0x1b, 0x0e, 0xcb, 0x88, 0xcb, 0x91, 0x41, + 0x0e, 0xcb, 0x68, 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x91, 0xc6, 0x24, 0x3b, + 0x0e, 0xcf, 0x88, 0xc6, 0x00, 0x58, 0x0e, 0xcf, 0x71, 0xc6, 0x24, 0x3b, + 0x0e, 0xcf, 0x68, 0x4e, 0x6d, 0x86, 0xc3, 0x54, 0x63, 0x48, 0x20, 0x37, + 0xc3, 0x54, 0x75, 0x46, 0x0e, 0xd4, 0x43, 0x54, 0x81, 0xc6, 0x00, 0x58, + 0x0e, 0xcf, 0x31, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x20, 0xc6, 0x00, 0x58, + 0x0e, 0xcf, 0x29, 0xc6, 0x24, 0x3b, 0x0e, 0xcf, 0x18, 0xc5, 0xdd, 0x17, + 0x0e, 0xcd, 0x79, 0xca, 0x9e, 0x8c, 0x0e, 0xcd, 0x40, 0xc7, 0x00, 0x57, + 0x0e, 0xcc, 0xc0, 0xc5, 0xdd, 0x17, 0x0e, 0xcd, 0x71, 0xca, 0x9e, 0x8c, + 0x0e, 0xcd, 0x38, 0x00, 0xc3, 0x54, 0x8d, 0x48, 0xbb, 0x7a, 0x43, 0x54, + 0x9d, 0xc5, 0x17, 0x14, 0x0e, 0xca, 0x09, 0xc6, 0x01, 0xdb, 0x0e, 0xca, + 0x01, 0xc5, 0x03, 0x13, 0x0e, 0xc9, 0xf8, 0xc8, 0x5a, 0x49, 0x0e, 0xc9, + 0xf1, 0xc5, 0x17, 0x14, 0x0e, 0xc9, 0xe9, 0xc6, 0x01, 0xdb, 0x0e, 0xc9, + 0xe1, 0xc5, 0x03, 0x13, 0x0e, 0xc9, 0xd8, 0xca, 0x91, 0x42, 0x0e, 0xc9, + 0x71, 0x49, 0x45, 0x27, 0x43, 0x54, 0xa9, 0xc5, 0x17, 0x14, 0x0e, 0xca, + 0x21, 0xc6, 0x01, 0xdb, 0x0e, 0xca, 0x19, 0xc5, 0x03, 0x13, 0x0e, 0xca, + 0x10, 0xc5, 0x17, 0x14, 0x0e, 0xc9, 0xd1, 0xc6, 0x01, 0xdb, 0x0e, 0xc9, + 0xc9, 0xc5, 0x03, 0x13, 0x0e, 0xc9, 0xc0, 0xcb, 0x91, 0x41, 0x0e, 0xc9, + 0xb8, 0xcb, 0x91, 0x41, 0x0e, 0xc9, 0x90, 0xc5, 0x17, 0x14, 0x0e, 0xcb, + 0x1b, 0x03, 0x54, 0xbe, 0xc6, 0x01, 0xdb, 0x0e, 0xcb, 0x11, 0xc5, 0x03, + 0x13, 0x0e, 
0xcb, 0x08, 0xc5, 0x17, 0x14, 0x0e, 0xca, 0xfb, 0x03, 0x54, + 0xc4, 0xc6, 0x01, 0xdb, 0x0e, 0xca, 0xf1, 0xc5, 0x03, 0x13, 0x0e, 0xca, + 0xe8, 0xc2, 0x00, 0x15, 0x0e, 0xca, 0xe0, 0xc2, 0x00, 0x15, 0x0e, 0xca, + 0xc0, 0x4c, 0x8b, 0xc5, 0xc3, 0x54, 0xca, 0xc5, 0x03, 0x13, 0x0e, 0xc9, + 0x11, 0xc5, 0x17, 0x14, 0x0e, 0xc9, 0x08, 0xc4, 0x94, 0xa5, 0x0e, 0xd2, + 0x61, 0xc8, 0xbe, 0x0a, 0x0e, 0xd2, 0x58, 0xc4, 0x94, 0xa5, 0x0e, 0xd2, + 0x49, 0xc8, 0xbe, 0x0a, 0x0e, 0xd2, 0x40, 0xcf, 0x63, 0x96, 0x08, 0xae, + 0xb9, 0xce, 0x6f, 0x8c, 0x08, 0xae, 0xb1, 0xc4, 0x5d, 0x32, 0x08, 0xae, + 0xa8, 0xcd, 0x44, 0xbb, 0x08, 0xae, 0x91, 0x49, 0xb1, 0x43, 0x43, 0x54, + 0xd6, 0xd0, 0x5f, 0x22, 0x08, 0xae, 0x71, 0xd0, 0x58, 0x22, 0x08, 0xae, + 0x69, 0xc9, 0x44, 0xbf, 0x08, 0xae, 0x60, 0x8e, 0x08, 0x8d, 0xd8, 0x94, + 0x08, 0x8d, 0xc8, 0x8e, 0x08, 0x8c, 0x60, 0x94, 0x08, 0x8c, 0x50, 0xd9, + 0x1e, 0x50, 0x01, 0x2f, 0x51, 0xd8, 0x25, 0x5b, 0x01, 0x58, 0xa8, 0xd3, + 0x1e, 0x56, 0x01, 0x2f, 0x49, 0xd3, 0x43, 0x26, 0x01, 0x2d, 0x38, 0xd2, + 0x47, 0x6f, 0x01, 0x2d, 0x41, 0xd3, 0x1e, 0x56, 0x01, 0x58, 0xa0, 0xc6, + 0x0b, 0x18, 0x01, 0x9e, 0x71, 0xc4, 0xd9, 0x12, 0x01, 0x9d, 0x30, 0xc8, + 0x0b, 0x08, 0x01, 0x9d, 0x40, 0xc2, 0xe5, 0xa5, 0x0f, 0x91, 0xc9, 0xc2, + 0xe6, 0x91, 0x0f, 0x91, 0x01, 0xc2, 0x83, 0xe4, 0x0f, 0x90, 0xe0, 0xc2, + 0x71, 0x49, 0x0f, 0x91, 0xa1, 0xc2, 0xe6, 0x81, 0x0f, 0x91, 0x28, 0xc2, + 0xe6, 0xa5, 0x0f, 0x91, 0x71, 0xc2, 0x09, 0x02, 0x0f, 0x90, 0x90, 0xc2, + 0xe6, 0x85, 0x0f, 0x90, 0xb9, 0xc2, 0xe6, 0x93, 0x0f, 0x90, 0xa8, 0xc2, + 0xe0, 0x7e, 0x0f, 0x91, 0xc1, 0xc2, 0x7e, 0x13, 0x0f, 0x91, 0x10, 0xa5, + 0x0f, 0x91, 0xb9, 0xa6, 0x0f, 0x91, 0xb0, 0xc2, 0xe6, 0x5a, 0x0f, 0x91, + 0x89, 0xc2, 0xe5, 0x7d, 0x0f, 0x91, 0x39, 0xc2, 0xe6, 0x8d, 0x0f, 0x90, + 0x80, 0xc2, 0x3c, 0xd4, 0x0f, 0x91, 0x79, 0xc2, 0xe5, 0x7e, 0x0f, 0x91, + 0x40, 0xc2, 0xae, 0x95, 0x0f, 0x90, 0xf9, 0xc2, 0xe6, 0xa3, 0x0f, 0x90, + 0xd8, 0xa6, 0x0f, 0x91, 0x51, 0x9d, 0x0f, 0x91, 0x48, 0xc6, 0x06, 0xe1, + 0x01, 0x20, 0xb8, 0xc2, 0x00, 0xc1, 0x00, 0x43, 0x29, 0x83, 0x00, 0x43, + 0x20, 0xd3, 0x43, 0x5f, 0x0f, 0xc9, 0x69, 0xcc, 0x87, 0xf9, 0x0f, 0xcb, + 0x80, 0xe0, 0x0a, 0x07, 0x01, 0x17, 0xe0, 0xe0, 0x0a, 0x07, 0x01, 0x17, + 0xa0, 0xc8, 0x4b, 0x94, 0x01, 0x0b, 0xf9, 0xc7, 0x0d, 0x04, 0x01, 0x0b, + 0xe8, 0xc2, 0x00, 0x5f, 0x01, 0x0b, 0xa3, 0x03, 0x54, 0xe2, 0xc3, 0x45, + 0x6b, 0x01, 0x0b, 0xe0, 0xc4, 0x22, 0x44, 0x01, 0x0b, 0xd9, 0x91, 0x01, + 0x0b, 0x88, 0xc3, 0x77, 0x79, 0x08, 0x43, 0x91, 0xc4, 0xdc, 0x2d, 0x08, + 0x43, 0x78, 0xc4, 0x02, 0xde, 0x05, 0x47, 0xb1, 0xc2, 0x02, 0xa0, 0x05, + 0x47, 0xa8, 0xc5, 0x01, 0xa2, 0x01, 0x5b, 0x1b, 0x03, 0x54, 0xe8, 0xcc, + 0x2e, 0x48, 0x01, 0x5a, 0x69, 0xcc, 0x82, 0xb9, 0x01, 0x5b, 0x69, 0xcd, + 0x7c, 0xa8, 0x01, 0x5c, 0x38, 0x47, 0x13, 0x6d, 0xc3, 0x54, 0xec, 0xc6, + 0x10, 0x9d, 0x01, 0x4a, 0xc9, 0xc8, 0xae, 0xbc, 0x01, 0x4b, 0x08, 0xc8, + 0xae, 0xbc, 0x01, 0x4a, 0xe9, 0xc6, 0x10, 0x9d, 0x01, 0x4a, 0xa8, 0xd8, + 0x22, 0x5b, 0x0f, 0xc0, 0x59, 0x46, 0x03, 0x13, 0xc3, 0x54, 0xf6, 0xcd, + 0x75, 0xa6, 0x01, 0x0e, 0xf9, 0xd0, 0x59, 0x42, 0x01, 0x0d, 0xa9, 0x44, + 0x08, 0xba, 0xc3, 0x55, 0x02, 0xd1, 0x01, 0x68, 0x01, 0x48, 0x41, 0xd9, + 0x1f, 0xf9, 0x0f, 0xc0, 0x39, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0xb9, 0xcc, + 0x84, 0xb1, 0x0f, 0xc4, 0xd8, 0xc4, 0x18, 0x10, 0x01, 0x27, 0xd9, 0xc2, + 0x22, 0xcc, 0x01, 0x27, 0xd0, 0xc3, 0x0d, 0x14, 0x01, 0x27, 0xc9, 0xc3, + 0x09, 0x9e, 0x01, 0x27, 0xc0, 0xc4, 0x02, 0xde, 0x01, 0x27, 0xb9, 0xc2, + 0x02, 0xa0, 0x01, 0x27, 0xb0, 0xcf, 0x05, 0x98, 0x01, 0x15, 0x59, 0xce, + 0x34, 0xd4, 
0x01, 0x57, 0x28, 0xd0, 0x0f, 0xc6, 0x01, 0x00, 0xf1, 0xd9, + 0x0f, 0xbd, 0x01, 0x72, 0x10, 0xca, 0x9f, 0xa4, 0x01, 0x4c, 0x81, 0xcd, + 0x7f, 0x80, 0x01, 0x4c, 0x70, 0x45, 0x00, 0x8c, 0xc3, 0x55, 0x0e, 0xd3, + 0x41, 0x71, 0x01, 0x4c, 0xe1, 0xc7, 0x00, 0x38, 0x01, 0x80, 0x4b, 0x03, + 0x55, 0x1a, 0xd3, 0x19, 0x81, 0x01, 0x70, 0x01, 0xda, 0x19, 0x7a, 0x01, + 0x70, 0x08, 0x00, 0x43, 0x55, 0x20, 0xcf, 0x2c, 0x35, 0x01, 0x48, 0x01, + 0xd6, 0x2d, 0x62, 0x01, 0x48, 0x09, 0x16, 0x43, 0x55, 0x32, 0xc5, 0x01, + 0x4a, 0x01, 0x0e, 0x09, 0x00, 0x43, 0x55, 0x41, 0xc5, 0x01, 0x4a, 0x01, + 0x0e, 0x01, 0x00, 0x43, 0x55, 0x59, 0xd2, 0x05, 0xd4, 0x0f, 0xc0, 0x11, + 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0x90, 0x46, 0x00, 0x8b, 0x43, 0x55, 0x6b, + 0xc9, 0x03, 0xc8, 0x01, 0x58, 0x71, 0xc7, 0x09, 0x0d, 0x01, 0x58, 0x78, + 0xcf, 0x6a, 0x8f, 0x01, 0x5a, 0x41, 0xce, 0x33, 0x92, 0x01, 0x5a, 0x60, + 0xc6, 0x01, 0x73, 0x01, 0x0e, 0x79, 0xcf, 0x2c, 0x35, 0x01, 0x48, 0x18, + 0x87, 0x05, 0x28, 0x88, 0x91, 0x05, 0x2c, 0x10, 0xc2, 0x00, 0x87, 0x05, + 0x30, 0x81, 0xc2, 0x02, 0x2b, 0x05, 0x30, 0x89, 0xc3, 0x19, 0xe1, 0x05, + 0x30, 0x91, 0xc2, 0x01, 0xc3, 0x05, 0x31, 0x51, 0xc2, 0x00, 0x58, 0x05, + 0x31, 0x58, 0x87, 0x05, 0x28, 0xf9, 0x90, 0x05, 0x30, 0x28, 0x91, 0x05, + 0x2c, 0x80, 0xc3, 0xe5, 0x36, 0x0b, 0x54, 0x99, 0xc3, 0xe5, 0x06, 0x0b, + 0x54, 0x90, 0x9a, 0x0b, 0x54, 0xd9, 0x93, 0x0b, 0x54, 0xd1, 0x85, 0x0b, + 0x54, 0xc9, 0x9c, 0x0b, 0x54, 0xc0, 0x42, 0x06, 0x46, 0xc3, 0x55, 0x77, + 0xc7, 0xc4, 0x02, 0x00, 0x70, 0x30, 0x91, 0x00, 0x70, 0x59, 0xc3, 0x14, + 0x6b, 0x00, 0x71, 0x41, 0xc2, 0x00, 0xe4, 0x00, 0x71, 0x50, 0x83, 0x00, + 0x71, 0x91, 0x8f, 0x00, 0x71, 0x99, 0x87, 0x00, 0x72, 0x09, 0x46, 0xce, + 0x87, 0x43, 0x55, 0x8f, 0x8b, 0x00, 0x71, 0xa8, 0x87, 0x00, 0x71, 0xb3, + 0x03, 0x55, 0x9b, 0x97, 0x00, 0x71, 0xc8, 0x42, 0x00, 0x8e, 0xc3, 0x55, + 0x9f, 0xca, 0xa5, 0x30, 0x00, 0x70, 0x89, 0xc7, 0xc6, 0xbe, 0x00, 0x70, + 0x90, 0x42, 0x00, 0xb7, 0xc3, 0x55, 0xaf, 0xc7, 0xc1, 0x54, 0x00, 0x71, + 0x00, 0xc8, 0xb9, 0x42, 0x00, 0x71, 0x89, 0xc2, 0x13, 0x4c, 0x00, 0x72, + 0x41, 0x16, 0xc3, 0x55, 0xbb, 0xc8, 0xb5, 0x3a, 0x00, 0x72, 0x58, 0x94, + 0x00, 0x63, 0x00, 0x8e, 0x00, 0x63, 0x08, 0xc3, 0xad, 0xf4, 0x00, 0x78, + 0xd1, 0xc4, 0x97, 0x19, 0x00, 0x78, 0xd9, 0xc3, 0x60, 0x54, 0x00, 0x78, + 0xe0, 0xc3, 0xad, 0xf4, 0x00, 0x78, 0xe9, 0xc4, 0x97, 0x19, 0x00, 0x78, + 0xf1, 0xc3, 0x60, 0x54, 0x00, 0x7e, 0x78, 0xcd, 0x00, 0xfa, 0x07, 0xe8, + 0x09, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0xe8, 0x0b, 0xc3, 0x55, 0xc7, 0x45, + 0x00, 0x8c, 0x43, 0x55, 0xd3, 0x0b, 0xc3, 0x55, 0xe5, 0x45, 0x00, 0x8c, + 0x43, 0x55, 0xf1, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0xf1, 0xcd, 0x00, 0xfa, + 0x07, 0xe8, 0x10, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x01, 0xca, 0x26, 0xf7, + 0x07, 0xe8, 0xe0, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xf9, 0xca, 0x26, 0xf7, + 0x07, 0xe8, 0xd8, 0x0b, 0xc3, 0x55, 0xfd, 0x45, 0x00, 0x8c, 0x43, 0x56, + 0x09, 0x0b, 0xc3, 0x56, 0x15, 0xd3, 0x43, 0x72, 0x07, 0xed, 0xf8, 0x0b, + 0xc3, 0x56, 0x21, 0x45, 0x00, 0x8c, 0x43, 0x56, 0x2d, 0xcc, 0x00, 0xfb, + 0x07, 0xe2, 0x89, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0xb8, 0x44, 0x2b, 0xb9, + 0xc3, 0x56, 0x39, 0x0a, 0xc3, 0x56, 0x45, 0x45, 0x19, 0x60, 0xc3, 0x56, + 0x51, 0x4d, 0x06, 0x5a, 0xc3, 0x56, 0x67, 0x45, 0x30, 0xc1, 0xc3, 0x56, + 0x73, 0x45, 0x50, 0xf0, 0xc3, 0x56, 0x89, 0x44, 0x72, 0xf0, 0x43, 0x56, + 0x99, 0x45, 0x4d, 0x40, 0xc3, 0x56, 0xa5, 0x45, 0x52, 0x4a, 0xc3, 0x56, + 0xaf, 0x46, 0xd2, 0xa7, 0xc3, 0x56, 0xb9, 0xde, 0x07, 0x29, 0x07, 0xe3, + 0x18, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xd9, 0xca, 0x26, 0xf7, 0x07, 0xe8, + 0xb8, 0x0b, 
0xc3, 0x56, 0xc5, 0x45, 0x00, 0x8c, 0xc3, 0x56, 0xd1, 0xcb, + 0x64, 0x7b, 0x07, 0xe7, 0x38, 0x0b, 0xc3, 0x56, 0xe3, 0xcb, 0x64, 0x7b, + 0x07, 0xe9, 0xb1, 0x45, 0x00, 0x8c, 0x43, 0x56, 0xef, 0x43, 0x02, 0x98, + 0xc3, 0x56, 0xfb, 0x43, 0x2b, 0xba, 0x43, 0x57, 0x0b, 0x0b, 0xc3, 0x57, + 0x17, 0xcb, 0x64, 0x7b, 0x07, 0xe9, 0xa1, 0x45, 0x00, 0x8c, 0x43, 0x57, + 0x23, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x51, 0xcd, 0x00, 0xfa, 0x07, 0xe8, + 0x70, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xe1, 0xca, 0x26, 0xf7, 0x07, 0xe8, + 0xc0, 0x45, 0x19, 0x60, 0xc3, 0x57, 0x2f, 0x44, 0x19, 0x6a, 0xc3, 0x57, + 0x39, 0x44, 0x72, 0xf0, 0xc3, 0x57, 0x43, 0xd1, 0x50, 0xf0, 0x07, 0xe5, + 0x91, 0x4d, 0x06, 0x5a, 0xc3, 0x57, 0x4f, 0x44, 0x2b, 0xb9, 0x43, 0x57, + 0x5b, 0x42, 0x00, 0xdb, 0xc3, 0x57, 0x67, 0x03, 0x43, 0x57, 0x71, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0x61, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xe8, 0xce, + 0x43, 0x77, 0x07, 0xeb, 0xd1, 0xd7, 0x26, 0xea, 0x07, 0xeb, 0xd9, 0xcf, + 0x67, 0x65, 0x07, 0xeb, 0xc8, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xb9, 0xca, + 0x26, 0xf7, 0x07, 0xe8, 0x98, 0x0b, 0xc3, 0x57, 0x7d, 0x45, 0x00, 0x8c, + 0x43, 0x57, 0x89, 0x0b, 0xc3, 0x57, 0x9b, 0x4a, 0x74, 0x6e, 0x43, 0x57, + 0xa7, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0xa1, 0xcd, 0x00, 0xfa, 0x07, 0xe7, + 0xc0, 0x5e, 0x0d, 0xba, 0xc3, 0x57, 0xb3, 0x4e, 0x6e, 0xba, 0x43, 0x57, + 0xbf, 0x0b, 0xc3, 0x57, 0xcb, 0xcc, 0x82, 0xa1, 0x07, 0xea, 0x69, 0xcf, + 0x65, 0x1c, 0x07, 0xef, 0xb8, 0x44, 0x2b, 0xb9, 0xc3, 0x57, 0xd5, 0x4d, + 0x06, 0x5a, 0xc3, 0x57, 0xe1, 0x45, 0x19, 0x60, 0xc3, 0x57, 0xed, 0x45, + 0x50, 0xf1, 0x43, 0x57, 0xfd, 0x44, 0x2b, 0xb9, 0xc3, 0x58, 0x09, 0x4d, + 0x06, 0x5a, 0xc3, 0x58, 0x15, 0xcf, 0x60, 0x8a, 0x07, 0xe3, 0xc9, 0x45, + 0x19, 0x60, 0xc3, 0x58, 0x21, 0xcf, 0x69, 0x81, 0x07, 0xe3, 0xb9, 0xce, + 0x72, 0xf0, 0x07, 0xe3, 0xb1, 0xd2, 0x4a, 0xbd, 0x07, 0xe0, 0x89, 0xcf, + 0x64, 0x77, 0x07, 0xe7, 0x30, 0xe0, 0x07, 0x27, 0x07, 0xe2, 0xd8, 0xca, + 0x26, 0xf7, 0x07, 0xe3, 0xa9, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x80, 0xca, + 0x26, 0xf7, 0x07, 0xe3, 0xa1, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x78, 0xca, + 0x26, 0xf7, 0x07, 0xe3, 0x91, 0x0b, 0xc3, 0x58, 0x31, 0xcb, 0x64, 0x7b, + 0x07, 0xe7, 0x19, 0x45, 0x00, 0x8c, 0x43, 0x58, 0x3d, 0x0b, 0xc3, 0x58, + 0x5b, 0x45, 0x00, 0x8c, 0x43, 0x58, 0x67, 0x43, 0x02, 0x98, 0xc3, 0x58, + 0x79, 0x43, 0x2b, 0xba, 0x43, 0x58, 0x83, 0x0b, 0xc3, 0x58, 0x8f, 0x45, + 0x00, 0x8c, 0x43, 0x58, 0x9b, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x89, 0xcc, + 0x10, 0xb4, 0x07, 0xe6, 0xf0, 0x4f, 0x08, 0x0b, 0xc3, 0x58, 0xad, 0x42, + 0x00, 0x8f, 0x43, 0x58, 0xf5, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0xc1, 0xcb, + 0x10, 0xb5, 0x07, 0xe6, 0xe8, 0x45, 0x19, 0x60, 0xc3, 0x58, 0xff, 0xce, + 0x43, 0x77, 0x07, 0xed, 0x80, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0xa9, 0xcb, + 0x10, 0xb5, 0x07, 0xe6, 0xd0, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x79, 0xcc, + 0x10, 0xb4, 0x07, 0xe6, 0xb0, 0x0b, 0xc3, 0x59, 0x0b, 0x45, 0x00, 0x8c, + 0x43, 0x59, 0x17, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x71, 0xcb, 0x10, 0xb5, + 0x07, 0xe6, 0xa8, 0xce, 0x43, 0x77, 0x07, 0xec, 0xd1, 0xd7, 0x26, 0xea, + 0x07, 0xec, 0xd8, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x59, 0xcb, 0x10, 0xb5, + 0x07, 0xe6, 0x90, 0xd7, 0x26, 0xea, 0x07, 0xec, 0xc9, 0x44, 0x19, 0x6a, + 0xc3, 0x59, 0x29, 0xce, 0x43, 0x77, 0x07, 0xee, 0x39, 0x45, 0x19, 0x60, + 0x43, 0x59, 0x35, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x61, 0xca, 0x26, 0xf7, + 0x07, 0xe4, 0x11, 0x0b, 0xc3, 0x59, 0x41, 0x45, 0x00, 0x8c, 0x43, 0x59, + 0x4d, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x59, 0xca, 0x26, 0xf7, 0x07, 0xe4, + 0x09, 0x0b, 0x43, 0x59, 0x59, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x21, 0xcd, + 0x00, 0xfa, 
0x07, 0xe1, 0xf0, 0x48, 0x06, 0x5f, 0xc3, 0x59, 0x65, 0xca, + 0x26, 0xf7, 0x07, 0xe4, 0x01, 0xcd, 0x00, 0xfa, 0x07, 0xe1, 0xb8, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0xd1, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x30, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0xc9, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x28, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0xc1, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x20, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0xd9, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x60, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0xc9, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x58, 0xca, + 0x26, 0xf7, 0x07, 0xe8, 0xf9, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x18, 0xca, + 0x26, 0xf7, 0x07, 0xe9, 0x01, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x20, 0xca, + 0x26, 0xf7, 0x07, 0xe4, 0x31, 0xcd, 0x00, 0xfa, 0x07, 0xe2, 0x18, 0x4c, + 0x82, 0x59, 0xc3, 0x59, 0x71, 0x46, 0x08, 0x09, 0x43, 0x59, 0x7d, 0xcc, + 0x00, 0xfb, 0x07, 0xe2, 0x11, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x60, 0x44, + 0x19, 0x6a, 0xc3, 0x59, 0x89, 0xce, 0x43, 0x77, 0x07, 0xed, 0x68, 0xcc, + 0x00, 0xfb, 0x07, 0xe2, 0x09, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x58, 0xca, + 0x26, 0xf7, 0x07, 0xec, 0x29, 0xcc, 0x10, 0xb4, 0x07, 0xec, 0x30, 0x0b, + 0xc3, 0x59, 0x95, 0x45, 0x00, 0x8c, 0x43, 0x59, 0xa1, 0xcc, 0x00, 0xfb, + 0x07, 0xe1, 0xf9, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x48, 0x45, 0x30, 0xc1, + 0xc3, 0x59, 0xb3, 0x45, 0x19, 0x60, 0xc3, 0x59, 0xbf, 0xce, 0x43, 0x77, + 0x07, 0xed, 0x60, 0x44, 0x2b, 0xb9, 0xc3, 0x59, 0xcb, 0x4d, 0x06, 0x5a, + 0xc3, 0x59, 0xd7, 0x45, 0x19, 0x60, 0xc3, 0x59, 0xe3, 0x45, 0x50, 0xf1, + 0x43, 0x59, 0xed, 0xe0, 0x00, 0xe7, 0x07, 0xef, 0x88, 0xcc, 0x00, 0xfb, + 0x07, 0xe1, 0x81, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x08, 0xcc, 0x00, 0xfb, + 0x07, 0xe1, 0x79, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x00, 0xca, 0x26, 0xf7, + 0x07, 0xeb, 0xe1, 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0xe8, 0xca, 0x26, 0xf7, + 0x07, 0xe3, 0x79, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x50, 0xca, 0x26, 0xf7, + 0x07, 0xe3, 0x71, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x48, 0xca, 0x26, 0xf7, + 0x07, 0xe3, 0x61, 0x0b, 0xc3, 0x59, 0xf9, 0xcb, 0x64, 0x7b, 0x07, 0xe7, + 0x08, 0x0b, 0xc3, 0x5a, 0x05, 0xd3, 0x43, 0x72, 0x07, 0xec, 0xf0, 0x43, + 0x02, 0x98, 0xc3, 0x5a, 0x11, 0x43, 0x2b, 0xba, 0x43, 0x5a, 0x1b, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0x29, 0xcb, 0x10, 0xb5, 0x07, 0xe4, 0xe0, 0xc2, + 0x04, 0xc6, 0x07, 0xea, 0x11, 0x17, 0x43, 0x5a, 0x27, 0xc8, 0xb8, 0x52, + 0x07, 0xea, 0x79, 0xc7, 0x6d, 0x34, 0x07, 0xea, 0x00, 0xd5, 0x1c, 0xbf, + 0x07, 0xe2, 0x49, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x40, 0x0b, 0xc3, 0x5a, + 0x34, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x49, 0xd3, 0x43, 0x72, 0x07, 0xed, + 0x88, 0x0b, 0xc3, 0x5a, 0x40, 0x45, 0x00, 0x8c, 0x43, 0x5a, 0x4c, 0x0b, + 0xc3, 0x5a, 0x5e, 0x45, 0x00, 0x8c, 0x43, 0x5a, 0x6a, 0x0b, 0xc3, 0x5a, + 0x7c, 0x45, 0x00, 0x8c, 0x43, 0x5a, 0x88, 0xcc, 0x00, 0xfb, 0x07, 0xe1, + 0x21, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xb0, 0xca, 0x26, 0xf7, 0x07, 0xeb, + 0x79, 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0x80, 0xcc, 0x00, 0xfb, 0x07, 0xe1, + 0x19, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xa8, 0xd7, 0x26, 0xea, 0x07, 0xeb, + 0x71, 0xce, 0x43, 0x77, 0x07, 0xed, 0x58, 0xcb, 0x10, 0xb5, 0x07, 0xdf, + 0xd9, 0xcc, 0x00, 0xfb, 0x07, 0xdf, 0xc8, 0x00, 0x43, 0x5a, 0xa0, 0x00, + 0x43, 0x5a, 0xb6, 0x00, 0x43, 0x5a, 0xcc, 0x00, 0x43, 0x5a, 0xe2, 0x00, + 0x43, 0x5a, 0xf8, 0x00, 0x43, 0x5b, 0x08, 0x00, 0x43, 0x5b, 0x1e, 0x00, + 0x43, 0x5b, 0x34, 0xc3, 0x0f, 0x9a, 0x00, 0x45, 0xe3, 0x03, 0x5b, 0x40, + 0xc4, 0x3a, 0x01, 0x00, 0x45, 0xe9, 0xc3, 0xb1, 0x0d, 0x00, 0x45, 0xd8, + 0x00, 0x43, 0x5b, 0x46, 0x00, 0x43, 0x5b, 0x5c, 0x00, 0x43, 0x5b, 0x75, + 0x88, 0x00, 0x32, 0x1b, 0x03, 0x5b, 0x8b, 0xca, 0xa2, 0x1a, 0x00, 0x31, + 0x00, 0xc2, 
0x13, 0xc0, 0x00, 0x36, 0x4b, 0x03, 0x5b, 0x8f, 0xc2, 0x49, + 0x0c, 0x00, 0x36, 0x2a, 0x03, 0x5b, 0x93, 0x00, 0x43, 0x5b, 0x97, 0x00, + 0xc3, 0x5b, 0xa7, 0xc2, 0x16, 0x1c, 0x00, 0x34, 0x3a, 0x03, 0x5b, 0xbd, + 0x00, 0xc3, 0x5b, 0xc1, 0xc2, 0x16, 0x1c, 0x00, 0x33, 0xd2, 0x03, 0x5b, + 0xd7, 0x00, 0xc3, 0x5b, 0xdb, 0xc2, 0x16, 0x1c, 0x00, 0x33, 0xfa, 0x03, + 0x5b, 0xef, 0x00, 0x43, 0x5b, 0xf3, 0xc6, 0xd0, 0x91, 0x00, 0x44, 0x31, + 0xc2, 0x00, 0x65, 0x00, 0x31, 0x83, 0x03, 0x5c, 0x09, 0xc2, 0x16, 0x1c, + 0x00, 0x31, 0x5a, 0x03, 0x5c, 0x0d, 0x4b, 0x88, 0x05, 0xc3, 0x5c, 0x11, + 0xcb, 0x64, 0x7b, 0x07, 0xda, 0xc9, 0x0b, 0xc3, 0x5c, 0x1b, 0xca, 0x26, + 0xf7, 0x07, 0xda, 0xb8, 0x00, 0x43, 0x5c, 0x27, 0x00, 0x43, 0x5c, 0x37, + 0x00, 0x43, 0x5c, 0x56, 0x00, 0x43, 0x5c, 0x62, 0x00, 0x43, 0x5c, 0x74, + 0x00, 0x43, 0x5c, 0x84, 0x00, 0xc3, 0x5c, 0x90, 0xc2, 0x16, 0x1c, 0x00, + 0x34, 0x02, 0x03, 0x5c, 0xa6, 0x00, 0x43, 0x5c, 0xaa, 0x60, 0x06, 0x47, + 0x43, 0x5c, 0xba, 0xd0, 0x5f, 0x02, 0x00, 0x33, 0xbb, 0x03, 0x5c, 0xc6, + 0xca, 0x26, 0xf7, 0x07, 0xde, 0xc1, 0xcd, 0x00, 0xfa, 0x07, 0xde, 0xb8, + 0x45, 0x00, 0x8c, 0xc3, 0x5c, 0xcc, 0xca, 0x26, 0xf7, 0x07, 0xf6, 0xb1, + 0x0b, 0xc3, 0x5c, 0xd8, 0xcb, 0x64, 0x7b, 0x07, 0xf6, 0xc0, 0xcb, 0x64, + 0x7b, 0x07, 0xdf, 0x39, 0x0b, 0xc3, 0x5c, 0xe4, 0xca, 0x26, 0xf7, 0x07, + 0xdf, 0x28, 0x00, 0x43, 0x5c, 0xf0, 0x00, 0x43, 0x5d, 0x02, 0x00, 0x43, + 0x5d, 0x12, 0x00, 0x43, 0x5d, 0x28, 0x00, 0x43, 0x5d, 0x3e, 0x8e, 0x00, + 0x31, 0x7b, 0x03, 0x5d, 0x54, 0xc3, 0x01, 0xce, 0x00, 0x34, 0x63, 0x03, + 0x5d, 0x58, 0x86, 0x00, 0x31, 0xb2, 0x03, 0x5d, 0x5c, 0x8e, 0x00, 0x34, + 0x43, 0x03, 0x5d, 0x60, 0xc3, 0x01, 0xce, 0x00, 0x34, 0x6a, 0x03, 0x5d, + 0x64, 0x00, 0x43, 0x5d, 0x68, 0x00, 0x43, 0x5d, 0x74, 0xc3, 0xb1, 0x0d, + 0x00, 0x35, 0x09, 0xc3, 0x0f, 0x9a, 0x00, 0x33, 0x79, 0xc3, 0x85, 0xf5, + 0x00, 0x33, 0x70, 0xca, 0x26, 0xf7, 0x07, 0xde, 0xf9, 0xcd, 0x00, 0xfa, + 0x07, 0xde, 0xf0, 0x00, 0x43, 0x5d, 0x84, 0x45, 0x00, 0x8c, 0xc3, 0x5d, + 0x94, 0xcd, 0x00, 0xfa, 0x07, 0xf7, 0x69, 0xca, 0x26, 0xf7, 0x07, 0xf7, + 0x70, 0x00, 0x43, 0x5d, 0xb5, 0xca, 0x26, 0xf7, 0x07, 0xde, 0xd1, 0xcd, + 0x00, 0xfa, 0x07, 0xde, 0xc8, 0x00, 0xc3, 0x5d, 0xcb, 0xc3, 0xe6, 0x23, + 0x00, 0x35, 0x8a, 0x03, 0x5d, 0xdb, 0x00, 0x43, 0x5d, 0xdf, 0x00, 0x43, + 0x5d, 0xfe, 0x8a, 0x00, 0x31, 0x6b, 0x03, 0x5e, 0x0e, 0xc3, 0x08, 0x0b, + 0x00, 0x31, 0x0a, 0x03, 0x5e, 0x12, 0x00, 0x43, 0x5e, 0x18, 0x00, 0x43, + 0x5e, 0x40, 0x16, 0xc3, 0x5e, 0x52, 0x15, 0xc3, 0x5e, 0x62, 0xc3, 0x72, + 0xf0, 0x0f, 0x75, 0x99, 0xc3, 0x0f, 0x9a, 0x0f, 0x75, 0x91, 0xc3, 0xb1, + 0x0d, 0x0f, 0x75, 0x81, 0xc3, 0x03, 0x0c, 0x0f, 0x75, 0x79, 0xc4, 0x3a, + 0x01, 0x0f, 0x75, 0x69, 0xc4, 0x19, 0x60, 0x0f, 0x75, 0x61, 0xc3, 0x0d, + 0xff, 0x0f, 0x75, 0x59, 0xc3, 0x2b, 0xb9, 0x0f, 0x75, 0x49, 0xc3, 0x14, + 0x4b, 0x0f, 0x75, 0x39, 0x42, 0x02, 0x1c, 0xc3, 0x5e, 0x74, 0xc3, 0x7e, + 0x89, 0x0f, 0x75, 0x29, 0x42, 0x0e, 0x9a, 0xc3, 0x5e, 0x7e, 0xc4, 0x30, + 0xc1, 0x0f, 0x75, 0x11, 0xc3, 0x85, 0xf5, 0x0f, 0x75, 0x09, 0xc4, 0x14, + 0x4a, 0x0f, 0x75, 0xb9, 0xc5, 0x92, 0x75, 0x0f, 0x75, 0xd8, 0xc3, 0x85, + 0xf5, 0x0f, 0x70, 0xe1, 0xc4, 0x3a, 0x01, 0x0f, 0x70, 0xe9, 0xc3, 0xb1, + 0x0d, 0x0f, 0x70, 0xf1, 0xc3, 0x0f, 0x9a, 0x0f, 0x70, 0xf8, 0xc4, 0x30, + 0xc1, 0x0f, 0x72, 0x11, 0xc3, 0x14, 0x4b, 0x0f, 0x72, 0x39, 0xc3, 0x2b, + 0xb9, 0x0f, 0x72, 0x49, 0xc3, 0x0d, 0xff, 0x0f, 0x72, 0x59, 0xc4, 0x3a, + 0x01, 0x0f, 0x72, 0x69, 0x15, 0xc3, 0x5e, 0x86, 0xc3, 0x03, 0x0c, 0x0f, + 0x72, 0x79, 0xc3, 0x0f, 0x9a, 0x0f, 0x72, 0x91, 0xc4, 0x14, 0x4a, 0x0f, + 0x72, 0xb9, 
0x06, 0xc3, 0x5e, 0x98, 0xc5, 0x92, 0x75, 0x0f, 0x72, 0xd8, + 0xc3, 0x00, 0x49, 0x0f, 0x74, 0x01, 0xc2, 0x00, 0x74, 0x0f, 0x74, 0x78, + 0x8e, 0x0f, 0x74, 0x19, 0x86, 0x0f, 0x74, 0xc8, 0xc2, 0x16, 0x1c, 0x0f, + 0x74, 0x21, 0xc2, 0x02, 0x98, 0x0f, 0x74, 0x38, 0xc2, 0x00, 0x74, 0x0f, + 0x74, 0x31, 0x8a, 0x0f, 0x74, 0xd0, 0xc2, 0x02, 0x98, 0x0f, 0x74, 0x41, + 0xc2, 0x16, 0x1c, 0x0f, 0x74, 0xa9, 0x0a, 0x43, 0x5e, 0xa4, 0xc3, 0x03, + 0x26, 0x0f, 0x74, 0x71, 0xc2, 0x01, 0x9d, 0x0f, 0x74, 0x89, 0xc4, 0xdf, + 0x93, 0x0f, 0x74, 0xa0, 0xc2, 0x16, 0x1c, 0x0f, 0x73, 0x21, 0xc2, 0x02, + 0x98, 0x0f, 0x73, 0x38, 0xc2, 0x02, 0x98, 0x0f, 0x73, 0x41, 0xc2, 0x16, + 0x1c, 0x0f, 0x73, 0xa9, 0xc3, 0x64, 0x77, 0x0f, 0x73, 0xb0, 0xc2, 0x0f, + 0x9b, 0x0f, 0x73, 0x51, 0xc3, 0x14, 0x4b, 0x0f, 0x73, 0xb8, 0xc3, 0x03, + 0x26, 0x0f, 0x73, 0x71, 0xc2, 0x01, 0x9d, 0x0f, 0x73, 0x89, 0xc4, 0xdf, + 0x93, 0x0f, 0x73, 0xa0, 0xc2, 0x01, 0x9d, 0x0f, 0x73, 0xc9, 0x47, 0x3b, + 0xc4, 0x43, 0x5e, 0xb0, 0xc5, 0xda, 0xd3, 0x00, 0x46, 0xf9, 0xc3, 0xe5, + 0x63, 0x00, 0x46, 0xf1, 0x42, 0x0d, 0xf6, 0xc3, 0x5e, 0xbc, 0x03, 0x43, + 0x5e, 0xc6, 0xcc, 0x00, 0xfb, 0x00, 0x37, 0x11, 0xcb, 0x10, 0xb5, 0x00, + 0x36, 0xc0, 0xde, 0x0f, 0x9a, 0x00, 0x36, 0xb9, 0xde, 0x0d, 0xf6, 0x00, + 0x36, 0xb1, 0xd6, 0x2f, 0x88, 0x00, 0x30, 0xb0, 0xc7, 0xc9, 0xf8, 0x00, + 0x44, 0xd9, 0x0b, 0x43, 0x5e, 0xe4, 0xc5, 0x05, 0x02, 0x07, 0xdd, 0xf1, + 0xc5, 0x00, 0xd4, 0x07, 0xdd, 0xe8, 0xc5, 0x05, 0x02, 0x07, 0xdd, 0xc9, + 0xc5, 0x00, 0xd4, 0x07, 0xdd, 0xc0, 0xc3, 0x7e, 0x89, 0x00, 0x44, 0x21, + 0xc5, 0x08, 0x09, 0x00, 0x44, 0x18, 0x49, 0x04, 0xd2, 0xc3, 0x5e, 0xf0, + 0x48, 0x0a, 0x53, 0x43, 0x5e, 0xfc, 0x51, 0x13, 0xe3, 0xc3, 0x5f, 0x0e, + 0xd3, 0x43, 0x98, 0x01, 0x2b, 0x91, 0xd3, 0x43, 0xbe, 0x01, 0x2b, 0x88, + 0x45, 0x02, 0x9a, 0x43, 0x5f, 0x20, 0xc8, 0x00, 0x5f, 0x01, 0x2a, 0x71, + 0xca, 0x01, 0x68, 0x01, 0x2a, 0x60, 0xc9, 0xb0, 0x3e, 0x01, 0x2b, 0xe9, + 0xc9, 0x01, 0x69, 0x01, 0x29, 0xa0, 0x49, 0x2a, 0xf5, 0xc3, 0x5f, 0x32, + 0x02, 0x43, 0x5f, 0x48, 0x49, 0x2a, 0xf5, 0x43, 0x5f, 0x5a, 0xce, 0x2a, + 0xfe, 0x0f, 0xd0, 0xa1, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xf0, 0xce, 0x2a, + 0xfe, 0x0f, 0xd0, 0x91, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xe0, 0xce, 0x2a, + 0xfe, 0x0f, 0xd0, 0x89, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xd8, 0xce, 0x2a, + 0xfe, 0x0f, 0xd0, 0x81, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0xd0, 0xc3, 0x00, + 0x74, 0x0f, 0xd1, 0x21, 0xc5, 0x56, 0xa5, 0x0f, 0xd1, 0x40, 0xce, 0x6f, + 0x38, 0x01, 0x34, 0x49, 0xcf, 0x6a, 0x9e, 0x01, 0x34, 0x41, 0xca, 0x3e, + 0xe4, 0x01, 0x4f, 0x68, 0xc5, 0x0b, 0x0a, 0x01, 0x2d, 0x51, 0xc3, 0x0e, + 0x6b, 0x01, 0x5a, 0x88, 0xc6, 0x46, 0x3e, 0x01, 0x2d, 0xd1, 0xc7, 0xbb, + 0xcb, 0x01, 0x5a, 0x98, 0xd9, 0x20, 0x44, 0x01, 0x1f, 0x78, 0xd2, 0x1c, + 0x40, 0x01, 0x1f, 0x68, 0xc4, 0x01, 0x9b, 0x01, 0x3d, 0x20, 0xd2, 0x1c, + 0x40, 0x01, 0x1f, 0x70, 0xc5, 0x06, 0x82, 0x01, 0x30, 0xd1, 0xce, 0x24, + 0xd5, 0x0f, 0xac, 0xe0, 0xc6, 0x0b, 0x09, 0x01, 0x2f, 0xf1, 0xc7, 0x3a, + 0x19, 0x0f, 0xbc, 0xc9, 0xc7, 0x0a, 0xe0, 0x0f, 0xbc, 0xf8, 0xc8, 0x5e, + 0xa6, 0x01, 0x5e, 0x30, 0xc8, 0x5e, 0xa6, 0x01, 0x5e, 0x38, 0x9a, 0x01, + 0x30, 0x91, 0xc5, 0x6b, 0x02, 0x01, 0x30, 0x89, 0x04, 0xc3, 0x5f, 0x66, + 0xc8, 0x8e, 0xa5, 0x0f, 0xaf, 0xa9, 0xc7, 0xc0, 0xba, 0x01, 0x5d, 0xe8, + 0xc4, 0xe0, 0x97, 0x00, 0xdb, 0x51, 0xc6, 0xcf, 0x59, 0x00, 0xdb, 0x28, + 0xc7, 0xc2, 0x6c, 0x00, 0xda, 0x08, 0x90, 0x0b, 0x51, 0x31, 0x96, 0x0b, + 0x50, 0xb8, 0x91, 0x0b, 0x51, 0x49, 0x97, 0x0b, 0x50, 0xe1, 0xc2, 0x25, + 0x9f, 0x0b, 0x50, 0x98, 0x83, 0x0b, 0x50, 0x71, 0x87, 0x0b, 0x50, 0x40, + 0xc2, 0x04, 
0xc6, 0x0b, 0x51, 0xa1, 0xc2, 0x00, 0xc4, 0x0b, 0x51, 0x80, + 0x90, 0x0b, 0x51, 0x89, 0xc2, 0xd0, 0x00, 0x0b, 0x51, 0x29, 0x87, 0x0b, + 0x50, 0x38, 0xc2, 0x02, 0xe0, 0x0b, 0x50, 0x61, 0x8b, 0x0b, 0x50, 0x58, + 0x87, 0x0b, 0x51, 0x11, 0xc2, 0xd0, 0x00, 0x0b, 0x50, 0xf8, 0xc2, 0x01, + 0x30, 0x0b, 0x51, 0x41, 0xc5, 0xde, 0x75, 0x0b, 0x51, 0x38, 0xc3, 0x8b, + 0xa9, 0x0b, 0x50, 0xd1, 0xc3, 0x7c, 0x57, 0x0b, 0x50, 0x80, 0xc2, 0x10, + 0x11, 0x0b, 0x50, 0xc0, 0xc2, 0x00, 0x7a, 0x0b, 0x50, 0x11, 0x07, 0xc3, + 0x5f, 0x72, 0xc5, 0xd8, 0xe9, 0x0b, 0x4d, 0x10, 0xc2, 0xd0, 0x00, 0x0b, + 0x4d, 0xa9, 0x96, 0x0b, 0x4d, 0x48, 0x91, 0x0b, 0x4b, 0xa9, 0x87, 0x0b, + 0x4f, 0x50, 0x17, 0xc3, 0x5f, 0x7a, 0x96, 0x0b, 0x4d, 0xb8, 0x96, 0x0b, + 0x4e, 0x61, 0xc2, 0x00, 0x3d, 0x0b, 0x4d, 0x59, 0xc2, 0x00, 0x11, 0x0b, + 0x4b, 0xd0, 0x0d, 0xc3, 0x5f, 0x84, 0x83, 0x0b, 0x4f, 0x91, 0xc3, 0x8b, + 0xa9, 0x0b, 0x4f, 0x03, 0x03, 0x5f, 0x95, 0x09, 0xc3, 0x5f, 0x99, 0xc6, + 0xce, 0xa5, 0x0b, 0x4d, 0x19, 0x11, 0x43, 0x5f, 0xa1, 0xc2, 0x05, 0x1d, + 0x0b, 0x4b, 0x81, 0x03, 0xc3, 0x5f, 0xa9, 0x0b, 0x43, 0x5f, 0xb3, 0x17, + 0xc3, 0x5f, 0xbd, 0xc3, 0x8f, 0x8a, 0x0b, 0x4b, 0xe0, 0x87, 0x0b, 0x4e, + 0x28, 0x07, 0xc3, 0x5f, 0xc7, 0xc5, 0xc0, 0x3e, 0x0b, 0x4c, 0x50, 0xc2, + 0x00, 0xb6, 0x0b, 0x4e, 0x71, 0xc2, 0x01, 0xdf, 0x0b, 0x4d, 0xe0, 0xc2, + 0x92, 0xb5, 0x0b, 0x4e, 0x09, 0xc2, 0x5c, 0x9b, 0x0b, 0x4d, 0x38, 0xc7, + 0x0b, 0xc8, 0x0b, 0x4e, 0x01, 0xc7, 0xc8, 0xa1, 0x0b, 0x4d, 0x68, 0x8f, + 0x0b, 0x4b, 0x91, 0x93, 0x0b, 0x4e, 0xe1, 0x83, 0x0b, 0x4e, 0xdb, 0x03, + 0x5f, 0xd4, 0xc8, 0xbc, 0x02, 0x0b, 0x4c, 0x78, 0x91, 0x0b, 0x4b, 0xcb, + 0x03, 0x5f, 0xd8, 0x93, 0x0b, 0x4e, 0xb0, 0x90, 0x0b, 0x50, 0x01, 0x97, + 0x0b, 0x4f, 0xea, 0x03, 0x5f, 0xdc, 0x8f, 0x0b, 0x4d, 0x53, 0x03, 0x5f, + 0xe2, 0xc2, 0x10, 0x11, 0x0b, 0x4c, 0xb0, 0x03, 0xc3, 0x5f, 0xe8, 0x87, + 0x0b, 0x4f, 0x49, 0x8f, 0x0b, 0x4c, 0x88, 0x83, 0x0b, 0x4b, 0x63, 0x03, + 0x5f, 0xf0, 0x42, 0x00, 0xaf, 0x43, 0x5f, 0xf4, 0x07, 0x43, 0x60, 0x00, + 0x17, 0xc3, 0x60, 0x0a, 0xc2, 0x00, 0x4f, 0x0b, 0x4c, 0x20, 0xc2, 0x00, + 0x45, 0x0b, 0x4e, 0x10, 0x93, 0x0b, 0x4b, 0x71, 0x87, 0x0b, 0x4f, 0x80, + 0x91, 0x0b, 0x4f, 0x9b, 0x03, 0x60, 0x12, 0xc2, 0x14, 0xbe, 0x0b, 0x4e, + 0xf1, 0xc5, 0x8b, 0xa8, 0x0b, 0x4d, 0x20, 0x96, 0x0b, 0x4c, 0x81, 0x87, + 0x0b, 0x4b, 0xb0, 0x11, 0xc3, 0x60, 0x16, 0x93, 0x0b, 0x4f, 0xc1, 0x8f, + 0x0b, 0x4b, 0xd8, 0x92, 0x0b, 0x4b, 0x49, 0x93, 0x0b, 0x4e, 0xc9, 0xc2, + 0x00, 0xc2, 0x0b, 0x4c, 0xf8, 0x87, 0x0b, 0x4f, 0x61, 0xc3, 0x8b, 0xa9, + 0x0b, 0x4c, 0xe8, 0xc2, 0x01, 0xdf, 0x0b, 0x4b, 0x41, 0x87, 0x0b, 0x4d, + 0x30, 0x93, 0x0b, 0x4f, 0xe1, 0x87, 0x0b, 0x4d, 0xc3, 0x03, 0x60, 0x1e, + 0x92, 0x0b, 0x4c, 0x58, 0xc2, 0x02, 0xe0, 0x0b, 0x4e, 0x18, 0xc2, 0x00, + 0xc4, 0x0b, 0x4d, 0x29, 0x83, 0x0b, 0x4c, 0x38, 0x93, 0x0b, 0x50, 0x08, + 0x00, 0xc3, 0x60, 0x22, 0x87, 0x0b, 0x4d, 0xa2, 0x03, 0x60, 0x32, 0x90, + 0x0b, 0x4f, 0x29, 0x93, 0x0b, 0x4f, 0x21, 0xc3, 0xb5, 0x1b, 0x0b, 0x4f, + 0x09, 0xc2, 0x00, 0xe2, 0x0b, 0x4d, 0x90, 0xc5, 0x00, 0x99, 0x0b, 0x4f, + 0x19, 0xc8, 0xb7, 0x2a, 0x0b, 0x4f, 0x10, 0x9a, 0x0b, 0x4e, 0xf9, 0xc2, + 0x10, 0x11, 0x0b, 0x4c, 0xbb, 0x03, 0x60, 0x36, 0x8f, 0x0b, 0x4d, 0xf0, + 0x96, 0x0b, 0x4d, 0x71, 0xc2, 0x02, 0xe0, 0x0b, 0x4c, 0xa0, 0x09, 0xc3, + 0x60, 0x3a, 0x0d, 0x43, 0x60, 0x50, 0xc2, 0x01, 0xdf, 0x0b, 0x4a, 0x01, + 0x0a, 0xc3, 0x60, 0x6e, 0x43, 0x8f, 0x8a, 0x43, 0x60, 0x7a, 0x07, 0xc3, + 0x60, 0x82, 0xc2, 0x5d, 0xa1, 0x0b, 0x4b, 0x10, 0xc2, 0x00, 0xc2, 0x0b, + 0x49, 0xb9, 0x07, 0xc3, 0x60, 0x8c, 0xc2, 0x00, 0x45, 0x0b, 0x48, 0xc0, + 0x8b, 0x0b, 
0x4a, 0x69, 0xc2, 0x0f, 0xe1, 0x0b, 0x49, 0x79, 0xc2, 0x00, + 0x3d, 0x0b, 0x49, 0x11, 0xc2, 0x00, 0xc2, 0x0b, 0x47, 0xd0, 0xc3, 0xdf, + 0x8c, 0x0b, 0x4a, 0x39, 0x42, 0x2c, 0x43, 0xc3, 0x60, 0x96, 0xc2, 0x00, + 0xb6, 0x0b, 0x48, 0x11, 0x8b, 0x0b, 0x47, 0x9a, 0x03, 0x60, 0xa0, 0x17, + 0xc3, 0x60, 0xa6, 0xc3, 0xd0, 0xd7, 0x0b, 0x4a, 0x79, 0x96, 0x0b, 0x49, + 0x80, 0xc5, 0xda, 0x33, 0x0b, 0x4a, 0x11, 0xc5, 0xd9, 0x9d, 0x0b, 0x48, + 0x50, 0x17, 0xc3, 0x60, 0xb0, 0xc3, 0xd0, 0xd7, 0x0b, 0x4a, 0x80, 0xc2, + 0x04, 0xc6, 0x0b, 0x49, 0x03, 0x03, 0x60, 0xb8, 0xc2, 0x01, 0xba, 0x0b, + 0x47, 0x88, 0xc3, 0x8f, 0x8a, 0x0b, 0x49, 0x91, 0x42, 0x2c, 0x43, 0xc3, + 0x60, 0xbe, 0x91, 0x0b, 0x48, 0xea, 0x03, 0x60, 0xc8, 0xc3, 0x8f, 0x8a, + 0x0b, 0x48, 0xe1, 0xc3, 0x5c, 0x9f, 0x0b, 0x48, 0xd1, 0xc4, 0xe4, 0x1b, + 0x0b, 0x48, 0xb0, 0x17, 0xc3, 0x60, 0xcc, 0xc3, 0xd0, 0xd7, 0x0b, 0x49, + 0x40, 0xc2, 0x01, 0xbb, 0x0b, 0x49, 0xe8, 0x93, 0x0b, 0x49, 0xf9, 0x90, + 0x0b, 0x49, 0xd1, 0xc2, 0x00, 0x7a, 0x0b, 0x48, 0x30, 0x17, 0xc3, 0x60, + 0xda, 0x96, 0x0b, 0x48, 0x20, 0xc2, 0x10, 0x11, 0x0b, 0x49, 0xc9, 0x97, + 0x0b, 0x4a, 0x91, 0x87, 0x0b, 0x48, 0x18, 0x93, 0x0b, 0x4b, 0x21, 0x92, + 0x0b, 0x48, 0x38, 0xc2, 0x7f, 0xc0, 0x0b, 0x4a, 0xe1, 0x97, 0x0b, 0x4a, + 0xc1, 0x07, 0xc3, 0x60, 0xee, 0xc2, 0x25, 0x9f, 0x0b, 0x4a, 0xa0, 0x11, + 0xc3, 0x60, 0xf6, 0xc3, 0xe5, 0x00, 0x0b, 0x49, 0x28, 0xc4, 0xb5, 0xd8, + 0x0b, 0x4b, 0x01, 0xc3, 0x1a, 0x7c, 0x0b, 0x4a, 0x50, 0x93, 0x0b, 0x4a, + 0xe9, 0xc2, 0x00, 0xa4, 0x0b, 0x48, 0xd8, 0x87, 0x0b, 0x4a, 0xd1, 0xc4, + 0xc3, 0x35, 0x0b, 0x49, 0x70, 0x42, 0x00, 0xbd, 0xc3, 0x60, 0xfe, 0x17, + 0xc3, 0x61, 0x0a, 0x96, 0x0b, 0x46, 0x48, 0xca, 0x9c, 0x7a, 0x0b, 0x46, + 0xa9, 0x96, 0x0b, 0x46, 0x70, 0xc2, 0x14, 0xbe, 0x0b, 0x47, 0x41, 0xc3, + 0xdf, 0x8c, 0x0b, 0x46, 0xd8, 0xc4, 0xdf, 0x1b, 0x0b, 0x46, 0xe1, 0xc2, + 0xd0, 0x00, 0x0b, 0x45, 0x50, 0x96, 0x0b, 0x47, 0x81, 0xc5, 0xd7, 0xe0, + 0x0b, 0x45, 0xd0, 0xc4, 0xd2, 0x85, 0x0b, 0x46, 0x31, 0xc5, 0xda, 0x56, + 0x0b, 0x45, 0x70, 0x90, 0x0b, 0x47, 0x71, 0xc5, 0xd6, 0x87, 0x0b, 0x44, + 0xe0, 0x8f, 0x0b, 0x46, 0x29, 0x92, 0x0b, 0x45, 0xb0, 0x93, 0x0b, 0x47, + 0x61, 0xc6, 0xcb, 0xe7, 0x0b, 0x45, 0x90, 0xc2, 0x5c, 0x9b, 0x0b, 0x47, + 0x59, 0x09, 0xc3, 0x61, 0x18, 0xc2, 0x00, 0x7a, 0x0b, 0x46, 0x81, 0x0d, + 0x43, 0x61, 0x25, 0x07, 0xc3, 0x61, 0x31, 0x03, 0xc3, 0x61, 0x3d, 0xc3, + 0xdf, 0x8c, 0x0b, 0x45, 0x68, 0x03, 0xc3, 0x61, 0x47, 0x42, 0x2c, 0x43, + 0xc3, 0x61, 0x4f, 0xc3, 0x83, 0xad, 0x0b, 0x45, 0x59, 0xc4, 0xc8, 0xbe, + 0x0b, 0x44, 0xe8, 0x17, 0xc3, 0x61, 0x59, 0xc2, 0x00, 0x7a, 0x0b, 0x46, + 0x99, 0xc3, 0x88, 0xcf, 0x0b, 0x45, 0xf9, 0x83, 0x0b, 0x45, 0xf1, 0xc5, + 0xb5, 0x19, 0x0b, 0x45, 0x28, 0x07, 0xc3, 0x61, 0x63, 0xc2, 0x04, 0xc6, + 0x0b, 0x45, 0xa1, 0xc6, 0xd0, 0x79, 0x0b, 0x44, 0xd0, 0xc3, 0x47, 0x4a, + 0x0b, 0x45, 0x19, 0x83, 0x0b, 0x44, 0x80, 0x03, 0xc3, 0x61, 0x6d, 0x07, + 0xc3, 0x61, 0x79, 0x8b, 0x0b, 0x46, 0xeb, 0x03, 0x61, 0x89, 0x17, 0x43, + 0x61, 0x93, 0x07, 0xc3, 0x61, 0x9d, 0x00, 0x43, 0x61, 0xa9, 0xc3, 0xe5, + 0x00, 0x0b, 0x47, 0x21, 0xc7, 0xc5, 0xd0, 0x0b, 0x45, 0x11, 0x8f, 0x0b, + 0x44, 0x88, 0x92, 0x0b, 0x45, 0x01, 0xc3, 0x82, 0x78, 0x0b, 0x44, 0xb0, + 0x09, 0xc3, 0x61, 0xb5, 0xc2, 0x00, 0x7a, 0x0b, 0x44, 0x71, 0xca, 0x9c, + 0xc0, 0x0b, 0x43, 0xa0, 0xc2, 0x00, 0xc4, 0x0b, 0x44, 0x59, 0xc4, 0xc1, + 0x3b, 0x0b, 0x42, 0xb8, 0xc5, 0xda, 0xfb, 0x0b, 0x44, 0x01, 0xc7, 0xc1, + 0x4d, 0x0b, 0x43, 0x68, 0xc9, 0xac, 0x69, 0x0b, 0x43, 0x59, 0xc4, 0x96, + 0xdd, 0x0b, 0x43, 0xe0, 0x43, 0x7c, 0x4f, 0x43, 0x61, 0xca, 0xc3, 0x8f, + 0x91, 0x0b, 
0x44, 0x21, 0xc4, 0x85, 0xb7, 0x0b, 0x43, 0xf1, 0xca, 0x9a, + 0xd6, 0x0b, 0x43, 0x61, 0x03, 0x43, 0x61, 0xd6, 0xc8, 0xb7, 0xe2, 0x0b, + 0x44, 0x11, 0x93, 0x0b, 0x43, 0xc8, 0x93, 0x0b, 0x44, 0x69, 0xc3, 0x12, + 0xc2, 0x0b, 0x42, 0xe8, 0xc3, 0x7c, 0x57, 0x0b, 0x44, 0x31, 0xc4, 0xde, + 0xab, 0x0b, 0x43, 0x81, 0xc3, 0xe5, 0x5d, 0x0b, 0x43, 0x70, 0xc4, 0xb3, + 0x92, 0x0b, 0x43, 0x89, 0xcc, 0x83, 0x91, 0x0b, 0x43, 0x18, 0xc6, 0xcf, + 0xfb, 0x0b, 0x43, 0x51, 0xc6, 0xd3, 0x25, 0x0b, 0x43, 0x48, 0xc5, 0xda, + 0x10, 0x0b, 0x43, 0x41, 0xc9, 0xa9, 0x63, 0x0b, 0x42, 0xc0, 0x96, 0x0b, + 0x42, 0x59, 0x93, 0x0b, 0x41, 0xe1, 0xc4, 0xe4, 0x5f, 0x0b, 0x41, 0x80, + 0xcc, 0x8a, 0x2d, 0x0b, 0x42, 0x01, 0x0b, 0xc3, 0x61, 0xe2, 0x17, 0x43, + 0x61, 0xee, 0xc3, 0xb5, 0x1b, 0x0b, 0x42, 0x51, 0xc6, 0xd1, 0x99, 0x0b, + 0x41, 0x88, 0xc3, 0x48, 0xc4, 0x0b, 0x41, 0x71, 0xc7, 0xb1, 0xde, 0x0b, + 0x40, 0x60, 0x93, 0x0b, 0x42, 0x81, 0xc2, 0x00, 0x87, 0x0b, 0x41, 0x38, + 0x96, 0x0b, 0x41, 0x99, 0xc8, 0xb8, 0xfa, 0x0b, 0x40, 0x98, 0x07, 0xc3, + 0x61, 0xf8, 0xc7, 0xc9, 0x49, 0x0b, 0x41, 0xe9, 0xc5, 0xda, 0x0b, 0x0b, + 0x40, 0x78, 0x93, 0x0b, 0x42, 0xb1, 0xc3, 0x16, 0x59, 0x0b, 0x42, 0x40, + 0x42, 0x00, 0x7a, 0xc3, 0x62, 0x11, 0xca, 0xa1, 0xe8, 0x0b, 0x40, 0xf0, + 0x93, 0x0b, 0x42, 0xa9, 0xc6, 0xb7, 0xb4, 0x0b, 0x40, 0x20, 0x83, 0x0b, + 0x42, 0x89, 0xc3, 0x8f, 0x8a, 0x0b, 0x42, 0x68, 0x8b, 0x0b, 0x42, 0x7b, + 0x03, 0x62, 0x1d, 0xc2, 0x00, 0x3d, 0x0b, 0x42, 0x48, 0xc3, 0x53, 0x54, + 0x0b, 0x42, 0x29, 0x43, 0xe6, 0x05, 0xc3, 0x62, 0x23, 0xc4, 0x08, 0x6b, + 0x0b, 0x40, 0x68, 0xc5, 0x9c, 0x7f, 0x0b, 0x42, 0x19, 0xc4, 0x09, 0x91, + 0x0b, 0x40, 0xa0, 0xc2, 0x00, 0xb6, 0x0b, 0x41, 0xfb, 0x03, 0x62, 0x2f, + 0xc5, 0xdc, 0xbd, 0x0b, 0x40, 0x90, 0xc9, 0xb2, 0xc6, 0x0b, 0x41, 0xa1, + 0xc9, 0x82, 0x74, 0x0b, 0x41, 0x48, 0xc7, 0xc3, 0x30, 0x0b, 0x40, 0xf9, + 0xc6, 0xb7, 0xb4, 0x0b, 0x40, 0x38, 0xc3, 0x48, 0xc4, 0x0b, 0x41, 0x78, + 0x03, 0xc3, 0x62, 0x33, 0xc9, 0x82, 0x74, 0x0b, 0x41, 0x41, 0xc5, 0xda, + 0x65, 0x0b, 0x40, 0xe9, 0xc4, 0x99, 0x41, 0x0b, 0x40, 0xd8, 0x4d, 0x7c, + 0x4d, 0xc3, 0x62, 0x3d, 0x4b, 0x98, 0xf2, 0x43, 0x62, 0x49, 0xc6, 0xcf, + 0x0b, 0x0b, 0x41, 0x09, 0xc3, 0x82, 0x78, 0x0b, 0x40, 0xe0, 0xa1, 0x01, + 0x40, 0x7b, 0x03, 0x62, 0x55, 0xa2, 0x01, 0x40, 0xbb, 0x03, 0x62, 0x6e, + 0xa3, 0x01, 0x41, 0x3b, 0x03, 0x62, 0x80, 0xa5, 0x01, 0x44, 0x39, 0xa4, + 0x01, 0x42, 0x3a, 0x03, 0x62, 0x8b, 0xa2, 0x01, 0x40, 0xdb, 0x03, 0x62, + 0x8f, 0xa3, 0x01, 0x41, 0x5b, 0x03, 0x62, 0xa1, 0xa5, 0x01, 0x44, 0x59, + 0xa4, 0x01, 0x42, 0x5a, 0x03, 0x62, 0xac, 0xa3, 0x01, 0x41, 0x9b, 0x03, + 0x62, 0xb0, 0xa5, 0x01, 0x44, 0x99, 0xa4, 0x01, 0x42, 0x9a, 0x03, 0x62, + 0xbb, 0xa5, 0x01, 0x45, 0x19, 0xa4, 0x01, 0x43, 0x1a, 0x03, 0x62, 0xbf, + 0xa5, 0x01, 0x46, 0x18, 0xa2, 0x01, 0x40, 0xeb, 0x03, 0x62, 0xc3, 0xa3, + 0x01, 0x41, 0x6b, 0x03, 0x62, 0xd5, 0xa5, 0x01, 0x44, 0x69, 0xa4, 0x01, + 0x42, 0x6a, 0x03, 0x62, 0xe0, 0xa3, 0x01, 0x41, 0xab, 0x03, 0x62, 0xe4, + 0xa5, 0x01, 0x44, 0xa9, 0xa4, 0x01, 0x42, 0xaa, 0x03, 0x62, 0xef, 0xa5, + 0x01, 0x45, 0x29, 0xa4, 0x01, 0x43, 0x2a, 0x03, 0x62, 0xf3, 0xa5, 0x01, + 0x46, 0x28, 0xa3, 0x01, 0x41, 0xcb, 0x03, 0x62, 0xf7, 0xa5, 0x01, 0x44, + 0xc9, 0xa4, 0x01, 0x42, 0xca, 0x03, 0x63, 0x02, 0xa5, 0x01, 0x45, 0x49, + 0xa4, 0x01, 0x43, 0x4a, 0x03, 0x63, 0x06, 0xa5, 0x01, 0x46, 0x48, 0xa5, + 0x01, 0x45, 0x89, 0xa4, 0x01, 0x43, 0x8a, 0x03, 0x63, 0x0a, 0xa5, 0x01, + 0x46, 0x88, 0xa5, 0x01, 0x47, 0x08, 0xa2, 0x01, 0x40, 0xf3, 0x03, 0x63, + 0x0e, 0xa3, 0x01, 0x41, 0x73, 0x03, 0x63, 0x20, 0xa5, 0x01, 0x44, 0x71, + 0xa4, 0x01, 
0x42, 0x72, 0x03, 0x63, 0x2b, 0xa3, 0x01, 0x41, 0xb3, 0x03, + 0x63, 0x2f, 0xa5, 0x01, 0x44, 0xb1, 0xa4, 0x01, 0x42, 0xb2, 0x03, 0x63, + 0x3a, 0xa5, 0x01, 0x45, 0x31, 0xa4, 0x01, 0x43, 0x32, 0x03, 0x63, 0x3e, + 0xa5, 0x01, 0x46, 0x30, 0xa3, 0x01, 0x41, 0xd3, 0x03, 0x63, 0x42, 0xa5, + 0x01, 0x44, 0xd1, 0xa4, 0x01, 0x42, 0xd2, 0x03, 0x63, 0x4d, 0xa5, 0x01, + 0x45, 0x51, 0xa4, 0x01, 0x43, 0x52, 0x03, 0x63, 0x51, 0xa5, 0x01, 0x46, + 0x50, 0xa5, 0x01, 0x45, 0x91, 0xa4, 0x01, 0x43, 0x92, 0x03, 0x63, 0x55, + 0xa5, 0x01, 0x46, 0x90, 0xa5, 0x01, 0x47, 0x10, 0xa3, 0x01, 0x41, 0xe3, + 0x03, 0x63, 0x59, 0xa5, 0x01, 0x44, 0xe1, 0xa4, 0x01, 0x42, 0xe2, 0x03, + 0x63, 0x64, 0xa5, 0x01, 0x45, 0x61, 0xa4, 0x01, 0x43, 0x62, 0x03, 0x63, + 0x68, 0xa5, 0x01, 0x46, 0x60, 0xa5, 0x01, 0x45, 0xa1, 0xa4, 0x01, 0x43, + 0xa2, 0x03, 0x63, 0x6c, 0xa5, 0x01, 0x46, 0xa0, 0xa5, 0x01, 0x47, 0x20, + 0xa5, 0x01, 0x45, 0xc1, 0xa4, 0x01, 0x43, 0xc2, 0x03, 0x63, 0x70, 0xa5, + 0x01, 0x46, 0xc0, 0xa5, 0x01, 0x47, 0x40, 0xa5, 0x01, 0x47, 0x80, 0xc3, + 0x15, 0x30, 0x0e, 0x84, 0x11, 0xc7, 0x9c, 0xe1, 0x0e, 0x84, 0x08, 0xc3, + 0x63, 0x2b, 0x0e, 0x82, 0x89, 0xc5, 0xcc, 0xcc, 0x0e, 0x80, 0x90, 0xc3, + 0x2e, 0xd7, 0x0e, 0x84, 0xa1, 0xc4, 0x99, 0xff, 0x0e, 0x84, 0x98, 0xc6, + 0x04, 0xe1, 0x0f, 0xd9, 0xf1, 0xc5, 0x00, 0x2c, 0x0f, 0xd9, 0xf9, 0xcc, + 0x04, 0xcb, 0x0f, 0xda, 0x88, 0x46, 0x01, 0xc8, 0xc3, 0x63, 0x74, 0xd2, + 0x4b, 0x83, 0x0f, 0xda, 0x68, 0xd2, 0x4b, 0x83, 0x0f, 0xda, 0x61, 0x46, + 0x01, 0xc8, 0x43, 0x63, 0x80, 0xc6, 0x04, 0xe1, 0x0f, 0xda, 0x29, 0xcc, + 0x04, 0xcb, 0x0f, 0xda, 0x50, 0xcc, 0x04, 0xcb, 0x0f, 0xda, 0x49, 0xc5, + 0x00, 0x2c, 0x0f, 0xda, 0x58, 0xd4, 0x35, 0x61, 0x0f, 0xdc, 0xd9, 0xc3, + 0x00, 0x3a, 0x01, 0x3e, 0xd8, 0xe0, 0x08, 0x67, 0x0f, 0xdb, 0x48, 0xe0, + 0x08, 0x67, 0x0f, 0xdb, 0x58, 0xc7, 0x02, 0xa0, 0x0f, 0xc8, 0x29, 0xc9, + 0x02, 0xde, 0x0f, 0xc8, 0x20, 0xd6, 0x2d, 0x62, 0x01, 0x0f, 0xe1, 0xcf, + 0x2c, 0x35, 0x01, 0x0f, 0xc9, 0xc6, 0x01, 0x73, 0x01, 0x0d, 0x70, 0xcd, + 0x7f, 0x80, 0x01, 0x4c, 0x79, 0xca, 0x9f, 0xa4, 0x01, 0x4c, 0x68, 0x00, + 0x43, 0x63, 0x8c, 0xcf, 0x2c, 0x35, 0x01, 0x59, 0xa1, 0xd6, 0x2d, 0x62, + 0x01, 0x59, 0xa9, 0x16, 0x43, 0x63, 0x9e, 0xd2, 0x05, 0xd4, 0x0f, 0xc0, + 0x01, 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0x80, 0x46, 0x00, 0x8b, 0x43, 0x63, + 0xad, 0xc9, 0x03, 0xc8, 0x01, 0x58, 0x81, 0xc7, 0x09, 0x0d, 0x01, 0x58, + 0x88, 0xdd, 0x10, 0x86, 0x01, 0x0d, 0xc8, 0xcf, 0x6a, 0x8f, 0x01, 0x5a, + 0x11, 0xce, 0x33, 0x92, 0x01, 0x5a, 0x58, 0xc6, 0x01, 0x73, 0x01, 0x0e, + 0x69, 0xcf, 0x2c, 0x35, 0x01, 0x48, 0x10, 0xc5, 0x01, 0x4a, 0x01, 0x0d, + 0xe9, 0x00, 0x43, 0x63, 0xb9, 0xc5, 0x01, 0x4a, 0x01, 0x0d, 0xe1, 0x00, + 0x43, 0x63, 0xd1, 0x02, 0xc3, 0x63, 0xe3, 0xc2, 0x00, 0x48, 0x08, 0x3a, + 0x40, 0x9e, 0x08, 0x30, 0x01, 0x9f, 0x08, 0x30, 0x09, 0xa0, 0x08, 0x30, + 0x11, 0xa1, 0x08, 0x30, 0x19, 0xa2, 0x08, 0x30, 0x21, 0xa3, 0x08, 0x30, + 0x29, 0xa4, 0x08, 0x30, 0x31, 0xa5, 0x08, 0x30, 0x39, 0xa6, 0x08, 0x30, + 0x40, 0x9d, 0x08, 0x30, 0x49, 0xa0, 0x08, 0x30, 0x59, 0xa3, 0x08, 0x30, + 0x61, 0xa4, 0x08, 0x30, 0x69, 0x9e, 0x08, 0x30, 0x50, 0x9d, 0x08, 0x30, + 0x71, 0x9e, 0x08, 0x30, 0x7b, 0x03, 0x63, 0xfb, 0x9f, 0x08, 0x30, 0x93, + 0x03, 0x64, 0x03, 0xa0, 0x08, 0x30, 0xab, 0x03, 0x64, 0x0b, 0xa1, 0x08, + 0x30, 0xb9, 0xa3, 0x08, 0x30, 0xc1, 0xa4, 0x08, 0x30, 0xc9, 0xa5, 0x08, + 0x30, 0xd1, 0xa6, 0x08, 0x30, 0xe0, 0x9d, 0x08, 0x30, 0xe9, 0x9e, 0x08, + 0x30, 0xf1, 0xa1, 0x08, 0x30, 0xf9, 0xa4, 0x08, 0x31, 0x01, 0xa5, 0x08, + 0x31, 0x09, 0xa6, 0x08, 0x31, 0x10, 0x9d, 0x08, 0x31, 0x19, 0x9e, 0x08, + 0x31, 0x21, 
0xa1, 0x08, 0x31, 0x29, 0xa2, 0x08, 0x31, 0x31, 0xa3, 0x08, + 0x31, 0x39, 0xa4, 0x08, 0x31, 0x41, 0xa5, 0x08, 0x31, 0x49, 0xa6, 0x08, + 0x31, 0x50, 0x9d, 0x08, 0x31, 0x59, 0x9e, 0x08, 0x31, 0x61, 0xa0, 0x08, + 0x31, 0x69, 0xa1, 0x08, 0x31, 0x71, 0xa2, 0x08, 0x31, 0x79, 0xa3, 0x08, + 0x31, 0x81, 0xa4, 0x08, 0x31, 0x89, 0xa5, 0x08, 0x31, 0x91, 0xa6, 0x08, + 0x31, 0x98, 0x9d, 0x08, 0x31, 0xa1, 0x9e, 0x08, 0x31, 0xa9, 0xa2, 0x08, + 0x31, 0xb1, 0xa3, 0x08, 0x31, 0xb9, 0xa4, 0x08, 0x31, 0xc1, 0xa6, 0x08, + 0x31, 0xc8, 0x9d, 0x08, 0x31, 0xd1, 0xa0, 0x08, 0x31, 0xd9, 0xa1, 0x08, + 0x31, 0xe1, 0xa3, 0x08, 0x31, 0xe9, 0xa4, 0x08, 0x31, 0xf1, 0xa5, 0x08, + 0x31, 0xf9, 0xa6, 0x08, 0x32, 0x00, 0x9d, 0x08, 0x32, 0x09, 0x9e, 0x08, + 0x32, 0x11, 0x9f, 0x08, 0x32, 0x19, 0xa3, 0x08, 0x32, 0x29, 0xa4, 0x08, + 0x32, 0x31, 0xa2, 0x08, 0x32, 0x20, 0x9f, 0x08, 0x32, 0x59, 0xa0, 0x08, + 0x32, 0x61, 0x9d, 0x08, 0x32, 0x48, 0x83, 0x08, 0x32, 0x69, 0x84, 0x08, + 0x32, 0x70, 0x9d, 0x08, 0x32, 0x91, 0xa5, 0x08, 0x32, 0x98, 0x83, 0x08, + 0x32, 0xe9, 0x84, 0x08, 0x32, 0xf1, 0x85, 0x08, 0x32, 0xf8, 0x83, 0x08, + 0x33, 0x19, 0x84, 0x08, 0x33, 0x21, 0x85, 0x08, 0x33, 0x28, 0xc3, 0xe5, + 0x90, 0x08, 0x00, 0x01, 0xc4, 0xe1, 0xa3, 0x08, 0x00, 0xc9, 0xc4, 0xe0, + 0x1f, 0x08, 0x00, 0xf1, 0xc4, 0xe2, 0xfb, 0x08, 0x01, 0x99, 0xc4, 0xe3, + 0x47, 0x08, 0x01, 0xa9, 0xc4, 0xe1, 0x13, 0x08, 0x00, 0x29, 0xc4, 0xae, + 0x2d, 0x08, 0x00, 0x39, 0xc4, 0xdf, 0xd3, 0x08, 0x01, 0x59, 0xc4, 0xe2, + 0x2b, 0x08, 0x01, 0x70, 0xc4, 0xe2, 0xcf, 0x08, 0x00, 0x41, 0xc4, 0xe0, + 0x27, 0x08, 0x00, 0xa9, 0xc4, 0xe0, 0xf3, 0x08, 0x01, 0x09, 0xc4, 0xe2, + 0x6f, 0x08, 0x01, 0xe1, 0xc3, 0xe5, 0xd5, 0x08, 0x00, 0x21, 0xc4, 0xe3, + 0x6b, 0x08, 0x00, 0xb9, 0xc4, 0xe1, 0xab, 0x08, 0x01, 0x19, 0xc4, 0xdf, + 0xcb, 0x08, 0x01, 0x80, 0xc4, 0xe0, 0xd3, 0x08, 0x00, 0x49, 0xc4, 0xdf, + 0xef, 0x08, 0x00, 0xe1, 0xc4, 0xe3, 0x67, 0x08, 0x00, 0xe9, 0xc4, 0xe3, + 0xe3, 0x08, 0x01, 0x11, 0xc4, 0xe2, 0xd3, 0x08, 0x01, 0xb9, 0xc4, 0xe1, + 0x87, 0x08, 0x00, 0x51, 0xc4, 0xe0, 0x3b, 0x08, 0x01, 0x51, 0xc4, 0xe2, + 0x43, 0x08, 0x01, 0x89, 0xc4, 0xe2, 0x17, 0x08, 0x01, 0x90, 0xc4, 0xe2, + 0xcb, 0x08, 0x00, 0x81, 0xc4, 0xe4, 0xcf, 0x08, 0x01, 0xc9, 0xc4, 0xc5, + 0xa6, 0x08, 0x01, 0xd1, 0xc4, 0xe0, 0x9f, 0x08, 0x02, 0x09, 0xc5, 0xdc, + 0x31, 0x08, 0x02, 0x29, 0xc4, 0xe2, 0x87, 0x08, 0x00, 0x31, 0xc4, 0xe3, + 0x2b, 0x08, 0x00, 0x59, 0xc4, 0xe1, 0x5b, 0x08, 0x01, 0x78, 0xc4, 0xe1, + 0x9b, 0x08, 0x00, 0x89, 0xc4, 0xe2, 0x1f, 0x08, 0x01, 0xb1, 0xc5, 0xd4, + 0xbb, 0x08, 0x02, 0x39, 0xc5, 0xdc, 0xd6, 0x08, 0x02, 0x51, 0xc5, 0xd8, + 0x8a, 0x08, 0x02, 0x59, 0xc3, 0x71, 0x3e, 0x08, 0x00, 0x19, 0xc4, 0xe2, + 0xb3, 0x08, 0x00, 0x71, 0xc4, 0xe4, 0xdb, 0x08, 0x01, 0x40, 0xc4, 0xe0, + 0xbb, 0x08, 0x00, 0x99, 0xc4, 0xdc, 0x27, 0x08, 0x00, 0xa1, 0xc4, 0xe2, + 0x8f, 0x08, 0x02, 0x11, 0xc5, 0xd5, 0x29, 0x08, 0x02, 0x60, 0xc4, 0xe0, + 0x23, 0x08, 0x00, 0xb1, 0xc4, 0xdf, 0xe3, 0x08, 0x00, 0xf9, 0xc4, 0xe1, + 0xef, 0x08, 0x01, 0x21, 0xc4, 0xe3, 0x73, 0x08, 0x01, 0xc1, 0xc4, 0xe2, + 0xdf, 0x08, 0x01, 0xe9, 0xc5, 0xdc, 0xe0, 0x08, 0x02, 0x19, 0xc5, 0xd8, + 0xa3, 0x08, 0x02, 0x41, 0xc4, 0xd0, 0x73, 0x08, 0x00, 0x79, 0xc4, 0xe4, + 0x0b, 0x08, 0x00, 0x90, 0xc4, 0xe2, 0xc3, 0x08, 0x00, 0xd1, 0xc4, 0xe0, + 0xef, 0x08, 0x01, 0x29, 0xc4, 0xe4, 0x83, 0x08, 0x01, 0xf9, 0xc5, 0xde, + 0x2f, 0x08, 0x02, 0x31, 0xc3, 0xe4, 0xeb, 0x08, 0x00, 0x11, 0xc4, 0xe0, + 0x0f, 0x08, 0x00, 0xc1, 0xc4, 0xe2, 0x5b, 0x08, 0x01, 0x49, 0xc4, 0xe1, + 0xa7, 0x08, 0x01, 0x61, 0xc4, 0xe2, 0x97, 0x08, 0x02, 0x00, 0xc4, 0xe3, + 0xd7, 0x08, 
0x00, 0xd9, 0xc4, 0xe2, 0x2f, 0x08, 0x01, 0x01, 0xc4, 0xe2, + 0x53, 0x08, 0x01, 0xa1, 0xc5, 0xd8, 0x12, 0x08, 0x02, 0x49, 0xc3, 0xe2, + 0x0f, 0x08, 0x00, 0x09, 0xc4, 0xe1, 0xc3, 0x08, 0x00, 0x69, 0xc4, 0xdf, + 0xd7, 0x08, 0x01, 0x31, 0xc4, 0xe1, 0x5f, 0x08, 0x01, 0x68, 0xc5, 0xd4, + 0xb1, 0x08, 0x02, 0x69, 0xc5, 0xdd, 0x30, 0x08, 0x02, 0x20, 0xa5, 0x08, + 0x02, 0x81, 0xa6, 0x08, 0x02, 0x88, 0xa4, 0x08, 0x02, 0xa1, 0xa6, 0x08, + 0x02, 0xa8, 0xa0, 0x08, 0x02, 0xb9, 0xa1, 0x08, 0x02, 0xc0, 0x9f, 0x08, + 0x02, 0xd1, 0xa0, 0x08, 0x02, 0xd9, 0xa3, 0x08, 0x02, 0xe1, 0xa6, 0x08, + 0x02, 0xe8, 0x1d, 0xc3, 0x64, 0x0f, 0x1f, 0xc3, 0x64, 0x35, 0x20, 0xc3, + 0x64, 0x53, 0x21, 0xc3, 0x64, 0x63, 0x22, 0xc3, 0x64, 0x7d, 0x23, 0xc3, + 0x64, 0xa1, 0x24, 0xc3, 0x64, 0xcd, 0x25, 0xc3, 0x64, 0xf5, 0x26, 0x43, + 0x65, 0x11, 0x1f, 0xc3, 0x65, 0x1b, 0x20, 0xc3, 0x65, 0x27, 0x21, 0xc3, + 0x65, 0x45, 0x22, 0x43, 0x65, 0x6d, 0x1d, 0xc3, 0x65, 0x93, 0x1e, 0xc3, + 0x65, 0xbb, 0x1f, 0xc3, 0x65, 0xe3, 0xc2, 0xc9, 0x2b, 0x08, 0x07, 0xc8, + 0xc6, 0xd0, 0x73, 0x08, 0x04, 0x99, 0xc8, 0xb6, 0x4a, 0x08, 0x04, 0xa0, + 0xc6, 0xd2, 0xe3, 0x08, 0x04, 0xc9, 0xc7, 0xc5, 0x59, 0x08, 0x04, 0xc0, + 0x05, 0xc3, 0x65, 0xfb, 0x44, 0x05, 0x18, 0xc3, 0x66, 0x1c, 0xc5, 0x31, + 0xee, 0x00, 0x0a, 0xdb, 0x03, 0x66, 0x2b, 0xcc, 0x51, 0x28, 0x00, 0xec, + 0x51, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0xa1, 0xc4, 0x01, 0x23, 0x00, 0x14, + 0x11, 0xce, 0x38, 0xe6, 0x05, 0x3d, 0x49, 0x15, 0x43, 0x66, 0x31, 0xc3, + 0x74, 0x83, 0x00, 0x12, 0xcb, 0x03, 0x66, 0x3d, 0x45, 0x07, 0x30, 0x43, + 0x66, 0x43, 0x47, 0x39, 0xfa, 0xc3, 0x66, 0x51, 0xc7, 0xbe, 0x03, 0x05, + 0x3e, 0xc8, 0xc7, 0xca, 0x22, 0x05, 0x5b, 0x01, 0xc6, 0xc8, 0xfd, 0x05, + 0x3c, 0x60, 0xce, 0x01, 0x19, 0x0e, 0xf8, 0xe9, 0x05, 0xc3, 0x66, 0x68, + 0xc5, 0x31, 0xee, 0x00, 0x08, 0x39, 0xc9, 0x16, 0x14, 0x00, 0x08, 0x59, + 0xc3, 0x01, 0x5d, 0x05, 0x3c, 0x99, 0xcc, 0x51, 0x28, 0x05, 0x3c, 0xa1, + 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x41, 0xc6, 0x01, 0x73, 0x00, 0x11, 0xe0, + 0x4a, 0xa3, 0xc8, 0x43, 0x66, 0x83, 0xcf, 0x61, 0xe3, 0x00, 0x12, 0xf1, + 0x11, 0xc3, 0x66, 0x8f, 0xc9, 0x67, 0x20, 0x05, 0x3e, 0x88, 0xcb, 0x8e, + 0x3f, 0x05, 0x39, 0x78, 0x46, 0x00, 0x8b, 0x43, 0x66, 0x9b, 0x45, 0x45, + 0x88, 0xc3, 0x66, 0xa7, 0x8f, 0x05, 0x3b, 0xb8, 0xc4, 0x01, 0x23, 0x00, + 0x0d, 0x6b, 0x03, 0x67, 0x00, 0x06, 0xc3, 0x67, 0x06, 0x05, 0xc3, 0x67, + 0x12, 0xca, 0x64, 0x13, 0x00, 0xf3, 0x79, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, + 0xa9, 0xce, 0x01, 0x19, 0x00, 0x14, 0x41, 0xcc, 0x51, 0x28, 0x00, 0x0d, + 0x59, 0xc6, 0x01, 0x73, 0x00, 0x0b, 0x38, 0xd3, 0x3f, 0x83, 0x00, 0xeb, + 0xd1, 0xc3, 0x00, 0xbf, 0x00, 0x07, 0xf2, 0x03, 0x67, 0x30, 0xc8, 0xad, + 0x81, 0x00, 0xe8, 0xb1, 0x43, 0x02, 0x6f, 0x43, 0x67, 0x39, 0xd4, 0x01, + 0x13, 0x05, 0x5b, 0x38, 0xce, 0x01, 0x19, 0x0e, 0xf8, 0xd9, 0x42, 0x01, + 0x23, 0xc3, 0x67, 0x4b, 0x05, 0xc3, 0x67, 0x5a, 0x06, 0xc3, 0x67, 0x69, + 0xc6, 0x60, 0xb1, 0x00, 0x0a, 0x6b, 0x03, 0x67, 0x76, 0xc5, 0x1e, 0xc8, + 0x00, 0x07, 0xab, 0x03, 0x67, 0x7c, 0xc6, 0x01, 0x73, 0x00, 0x07, 0xc3, + 0x03, 0x67, 0x82, 0xc5, 0x1f, 0x0c, 0x00, 0x07, 0x91, 0xc5, 0x31, 0xee, + 0x00, 0x07, 0x99, 0x42, 0x01, 0xc8, 0xc3, 0x67, 0x88, 0xc5, 0x1d, 0x88, + 0x00, 0x0a, 0x79, 0xc6, 0xcc, 0x8f, 0x00, 0x0f, 0x5b, 0x03, 0x67, 0x9a, + 0xce, 0x1d, 0x93, 0x00, 0x10, 0x78, 0x91, 0x00, 0x12, 0xa3, 0x03, 0x67, + 0xa0, 0x87, 0x00, 0x12, 0xda, 0x03, 0x67, 0xaa, 0xc6, 0x01, 0x73, 0x00, + 0x13, 0x43, 0x03, 0x67, 0xb0, 0x06, 0xc3, 0x67, 0xb6, 0xca, 0x9e, 0x5a, + 0x00, 0xf6, 0x49, 0xc5, 0x1e, 0xc8, 0x00, 0x09, 0x4b, 0x03, 0x67, 0xc3, + 0xce, 0x01, 
0x19, 0x00, 0xec, 0xb1, 0xc5, 0x1f, 0x0c, 0x00, 0x07, 0x61, + 0xc5, 0x31, 0xee, 0x00, 0x07, 0x69, 0x05, 0xc3, 0x67, 0xc9, 0xc6, 0x60, + 0xb1, 0x00, 0x09, 0x59, 0xc5, 0x1d, 0x88, 0x00, 0x09, 0x69, 0xc6, 0xcc, + 0x8f, 0x00, 0x09, 0x79, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x58, 0x83, 0x00, + 0x13, 0x4b, 0x03, 0x67, 0xd5, 0xc7, 0xca, 0x53, 0x05, 0x5b, 0x08, 0x46, + 0x51, 0xbb, 0xc3, 0x67, 0xdb, 0x47, 0x1d, 0x71, 0x43, 0x67, 0xf3, 0xca, + 0x9a, 0x86, 0x00, 0x15, 0x23, 0x03, 0x67, 0xff, 0xc3, 0x80, 0x9f, 0x00, + 0xf4, 0xf8, 0x05, 0xc3, 0x68, 0x05, 0xca, 0x64, 0x13, 0x00, 0xf0, 0x79, + 0x44, 0x05, 0x18, 0xc3, 0x68, 0x1d, 0xc4, 0x01, 0x23, 0x00, 0x12, 0xbb, + 0x03, 0x68, 0x29, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x09, 0xcc, 0x1e, 0xc1, + 0x00, 0xeb, 0x69, 0x15, 0xc3, 0x68, 0x2f, 0x16, 0x43, 0x68, 0x3b, 0x00, + 0x43, 0x68, 0x47, 0x45, 0x00, 0x5a, 0xc3, 0x68, 0x56, 0x46, 0x3b, 0xc5, + 0x43, 0x68, 0x69, 0x00, 0x43, 0x68, 0x74, 0x46, 0x00, 0x8b, 0x43, 0x68, + 0x80, 0x46, 0x00, 0x8b, 0x43, 0x68, 0x8c, 0x05, 0xc3, 0x68, 0xa7, 0xc5, + 0x1e, 0xc8, 0x00, 0xf5, 0xeb, 0x03, 0x68, 0xbf, 0xca, 0x9e, 0x5a, 0x00, + 0xf5, 0xd9, 0x06, 0xc3, 0x68, 0xc5, 0xc6, 0x60, 0xb1, 0x00, 0x08, 0x9b, + 0x03, 0x68, 0xcf, 0xce, 0x01, 0x19, 0x00, 0xec, 0x91, 0xc8, 0xbe, 0x9a, + 0x05, 0x59, 0xa1, 0xc5, 0x1f, 0x0c, 0x00, 0x07, 0x41, 0xc5, 0x31, 0xee, + 0x00, 0x07, 0x49, 0xc5, 0x1d, 0x88, 0x00, 0x08, 0xa9, 0xc6, 0xcc, 0x8f, + 0x00, 0x08, 0xc9, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x39, 0xc6, 0x01, 0x73, + 0x00, 0x12, 0x39, 0xc5, 0x22, 0x9e, 0x01, 0x63, 0xc0, 0xc3, 0x00, 0x49, + 0x05, 0x39, 0x19, 0xc2, 0x00, 0x74, 0x05, 0x39, 0x28, 0x8a, 0x00, 0x07, + 0x80, 0x44, 0x00, 0x8c, 0xc3, 0x68, 0xd5, 0xc7, 0xa6, 0x69, 0x05, 0x3a, + 0xd8, 0x87, 0x00, 0x12, 0xc3, 0x03, 0x68, 0xdf, 0x8d, 0x0e, 0xf8, 0x19, + 0xc8, 0xbb, 0x8a, 0x0e, 0xf8, 0x09, 0x85, 0x01, 0x0c, 0x23, 0x03, 0x68, + 0xe5, 0xc6, 0x21, 0xa3, 0x00, 0x12, 0xe3, 0x03, 0x68, 0xeb, 0xcf, 0x61, + 0x6b, 0x00, 0x13, 0xf9, 0xc6, 0xd3, 0x2b, 0x05, 0x3f, 0xb0, 0xc8, 0xa6, + 0x68, 0x05, 0x3a, 0xe8, 0x04, 0xc3, 0x68, 0xf1, 0xc8, 0x61, 0x72, 0x0e, + 0xf8, 0x89, 0x05, 0xc3, 0x69, 0x00, 0xca, 0x64, 0x13, 0x00, 0xf1, 0xd9, + 0x42, 0x00, 0x58, 0xc3, 0x69, 0x18, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x29, + 0x47, 0x04, 0xcb, 0xc3, 0x69, 0x27, 0xcf, 0x68, 0x64, 0x05, 0x59, 0xb9, + 0xce, 0x01, 0x19, 0x00, 0x13, 0x6b, 0x03, 0x69, 0x39, 0xcb, 0x8f, 0xb5, + 0x05, 0x3a, 0x49, 0xc5, 0x31, 0xee, 0x00, 0x09, 0xd1, 0xc6, 0x01, 0x73, + 0x00, 0x0a, 0x10, 0xc2, 0x25, 0xa1, 0x00, 0x13, 0x73, 0x03, 0x69, 0x3f, + 0xc5, 0xd9, 0x07, 0x05, 0x59, 0xa8, 0x46, 0x00, 0x8b, 0x43, 0x69, 0x45, + 0xcb, 0x90, 0x5a, 0x0e, 0xf8, 0x00, 0xc9, 0x16, 0x14, 0x00, 0xf0, 0xf9, + 0xcc, 0x51, 0x28, 0x00, 0xec, 0x11, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0x71, + 0xc6, 0x01, 0x73, 0x05, 0x3c, 0xc9, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0x90, + 0xc4, 0xb0, 0x8b, 0x00, 0xf7, 0xf9, 0xc5, 0x1e, 0xc8, 0x00, 0xf7, 0xc9, + 0xc4, 0x01, 0x23, 0x00, 0x0d, 0xa3, 0x03, 0x69, 0x4f, 0x06, 0xc3, 0x69, + 0x55, 0xc5, 0x1f, 0x0c, 0x00, 0xf7, 0x99, 0xca, 0x9e, 0xe6, 0x00, 0xf4, + 0xc9, 0x15, 0xc3, 0x69, 0x61, 0xc5, 0x31, 0xee, 0x00, 0x07, 0xe9, 0xca, + 0x08, 0xf6, 0x00, 0x0b, 0xb9, 0xc6, 0x60, 0xb1, 0x00, 0x11, 0x98, 0x47, + 0xc0, 0x2e, 0xc3, 0x69, 0x6d, 0xc8, 0xba, 0x02, 0x05, 0x3e, 0xb8, 0x44, + 0x05, 0x18, 0xc3, 0x69, 0x77, 0xc5, 0x31, 0xee, 0x00, 0xf1, 0xf9, 0xcc, + 0x51, 0x28, 0x00, 0xec, 0x31, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0x79, 0xcc, + 0x4d, 0x15, 0x05, 0x59, 0xd1, 0xc4, 0x01, 0x23, 0x00, 0x13, 0x88, 0x45, + 0x00, 0x8c, 0xc3, 0x69, 0x83, 0xc3, 0x01, 0x5d, 0x00, 0x14, 0x4a, 0x03, + 0x69, 0xcf, 
0xcc, 0x23, 0x3f, 0x00, 0xeb, 0xf8, 0x45, 0x00, 0x8c, 0xc3, + 0x69, 0xd5, 0xce, 0x74, 0x78, 0x05, 0x59, 0x88, 0xd4, 0x01, 0x13, 0x00, + 0xec, 0x80, 0x46, 0x00, 0x8b, 0x43, 0x6a, 0x1a, 0xd4, 0x3e, 0x6c, 0x05, + 0x39, 0xd8, 0xca, 0x9e, 0xe6, 0x00, 0xf4, 0xc1, 0x06, 0xc3, 0x6a, 0x26, + 0xc5, 0x31, 0xee, 0x00, 0xf4, 0x19, 0xc5, 0x1f, 0x0c, 0x00, 0xf4, 0x09, + 0xca, 0x08, 0xf6, 0x00, 0x0b, 0xa9, 0xc4, 0x01, 0x23, 0x01, 0x63, 0x98, + 0xca, 0x64, 0x13, 0x00, 0xf4, 0xb1, 0xcb, 0x97, 0x2f, 0x00, 0xf1, 0x59, + 0x05, 0xc3, 0x6a, 0x32, 0x06, 0xc3, 0x6a, 0x44, 0xc4, 0x01, 0x23, 0x00, + 0x13, 0x31, 0xc6, 0x01, 0x73, 0x00, 0x09, 0x39, 0xcc, 0x51, 0x28, 0x05, + 0x3c, 0xa8, 0xca, 0x1f, 0x59, 0x00, 0x13, 0x38, 0xca, 0x64, 0x13, 0x00, + 0xf4, 0xa9, 0x06, 0xc3, 0x6a, 0x56, 0x05, 0xc3, 0x6a, 0x62, 0xcc, 0x51, + 0x28, 0x00, 0xec, 0x71, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0xb1, 0xce, 0x01, + 0x19, 0x00, 0x14, 0x81, 0xc5, 0x31, 0xee, 0x00, 0x0b, 0xd1, 0x15, 0xc3, + 0x6a, 0x74, 0xc4, 0x01, 0x23, 0x00, 0x11, 0x28, 0x06, 0xc3, 0x6a, 0x80, + 0xcc, 0x51, 0x28, 0x00, 0xec, 0x69, 0x42, 0x01, 0xc8, 0x43, 0x6a, 0x8c, + 0x06, 0xc3, 0x6a, 0x9b, 0xc5, 0x1e, 0xc8, 0x00, 0xf3, 0xe9, 0xcc, 0x51, + 0x28, 0x00, 0xec, 0x61, 0xc4, 0x01, 0x23, 0x00, 0x14, 0x59, 0xca, 0x9f, + 0x4a, 0x01, 0x63, 0x89, 0xc4, 0x00, 0x32, 0x01, 0x63, 0xa0, 0xc2, 0x10, + 0x11, 0x05, 0x3c, 0xd9, 0xc2, 0x49, 0x0c, 0x05, 0x3c, 0xe9, 0xc2, 0x0f, + 0xe1, 0x05, 0x3c, 0xf8, 0xc9, 0x16, 0x14, 0x00, 0xf2, 0xb9, 0xc5, 0x31, + 0xee, 0x00, 0xf2, 0xa9, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x41, 0x15, 0xc3, + 0x6a, 0xa7, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0x89, 0xc8, 0xbe, 0x9a, 0x05, + 0x3a, 0x99, 0xc4, 0x01, 0x23, 0x00, 0x0d, 0x28, 0x45, 0x00, 0x8c, 0xc3, + 0x6a, 0xb6, 0xd6, 0x2d, 0x78, 0x00, 0x0a, 0x48, 0xca, 0x64, 0x13, 0x00, + 0xf1, 0xa9, 0x06, 0xc3, 0x6a, 0xec, 0xc5, 0x31, 0xee, 0x00, 0xf1, 0x89, + 0xcc, 0x51, 0x28, 0x00, 0xec, 0x21, 0xc6, 0x01, 0x73, 0x05, 0x3a, 0x0b, + 0x03, 0x6a, 0xfe, 0x05, 0xc3, 0x6b, 0x04, 0xce, 0x38, 0xe6, 0x05, 0x3d, + 0x19, 0xc4, 0x01, 0x23, 0x00, 0x0c, 0xc8, 0xc6, 0x60, 0xb1, 0x00, 0xf1, + 0x09, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x19, 0xc5, 0x31, 0xee, 0x00, 0x0f, + 0xa9, 0xc4, 0x01, 0x23, 0x00, 0x13, 0x01, 0x05, 0xc3, 0x6b, 0x10, 0xc5, + 0x1d, 0x88, 0x00, 0x08, 0xf9, 0xc9, 0x16, 0x14, 0x00, 0x09, 0x09, 0xce, + 0x38, 0xe6, 0x05, 0x3d, 0x09, 0xc6, 0x01, 0x73, 0x00, 0x0f, 0x28, 0x8b, + 0x05, 0x3d, 0xe9, 0x83, 0x05, 0x3d, 0xd9, 0x97, 0x05, 0x3d, 0xf9, 0xc4, + 0x00, 0xf0, 0x00, 0x12, 0x10, 0xca, 0x64, 0x13, 0x00, 0xf0, 0x39, 0x44, + 0x05, 0x18, 0xc3, 0x6b, 0x22, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x01, 0xcc, + 0x1e, 0xc1, 0x00, 0xeb, 0x61, 0xc8, 0xbe, 0x9a, 0x05, 0x3c, 0xb9, 0xc6, + 0x01, 0x73, 0x00, 0x0c, 0x01, 0xc6, 0xcf, 0xcb, 0x00, 0x0c, 0x19, 0xc4, + 0x01, 0x23, 0x00, 0x12, 0x98, 0xca, 0xa4, 0x9a, 0x05, 0x5a, 0x69, 0x45, + 0x7b, 0x4a, 0x43, 0x6b, 0x2e, 0x91, 0x05, 0x59, 0xeb, 0x03, 0x6b, 0x3c, + 0x87, 0x05, 0x59, 0x90, 0x05, 0xc3, 0x6b, 0x42, 0xc6, 0x01, 0x73, 0x00, + 0x12, 0x48, 0xc4, 0x01, 0x23, 0x00, 0x15, 0x03, 0x03, 0x6b, 0x54, 0xd8, + 0x25, 0xeb, 0x05, 0x3a, 0xb9, 0xcf, 0x3e, 0xad, 0x05, 0x3a, 0xc8, 0x8e, + 0x07, 0xd8, 0x21, 0x8b, 0x07, 0xd8, 0x18, 0xc6, 0x00, 0xd3, 0x00, 0xf7, + 0xb0, 0x43, 0x05, 0x19, 0xc3, 0x6b, 0x5a, 0xc8, 0x20, 0xa9, 0x00, 0x0b, + 0xc0, 0x98, 0x00, 0xf7, 0xe1, 0xc2, 0x02, 0xa7, 0x00, 0xf7, 0xd0, 0xc5, + 0x05, 0x02, 0x00, 0xf2, 0x11, 0xc5, 0x00, 0xd4, 0x00, 0xf2, 0x00, 0x42, + 0x01, 0x23, 0xc3, 0x6b, 0x66, 0x06, 0xc3, 0x6b, 0x75, 0xc6, 0x60, 0xb1, + 0x00, 0x0b, 0x53, 0x03, 0x6b, 0x82, 0xc5, 0x1e, 0xc8, 0x00, 0x0b, 0x43, + 0x03, 0x6b, 
0x88, 0x05, 0xc3, 0x6b, 0x8c, 0xc5, 0x1f, 0x0c, 0x00, 0x06, + 0xc9, 0xc5, 0x31, 0xee, 0x00, 0x06, 0xd1, 0xc6, 0x01, 0x73, 0x05, 0x3d, + 0xc1, 0xc5, 0x1d, 0x88, 0x00, 0x0b, 0x61, 0xca, 0x9e, 0xe6, 0x00, 0x0b, + 0x71, 0xce, 0x1d, 0x93, 0x00, 0x10, 0xb1, 0xc6, 0xcc, 0x8f, 0x00, 0x0b, + 0x90, 0xc2, 0x00, 0xc0, 0x00, 0x0d, 0x03, 0x03, 0x6b, 0x9b, 0xc8, 0x9e, + 0x5c, 0x00, 0xf6, 0x70, 0xc9, 0x08, 0xf7, 0x00, 0x06, 0xa3, 0x03, 0x6b, + 0xa1, 0xc4, 0x65, 0xe2, 0x00, 0x0e, 0x88, 0x11, 0xc3, 0x6b, 0xa7, 0xc8, + 0x20, 0xa9, 0x00, 0x06, 0xb2, 0x03, 0x6b, 0xb3, 0xc5, 0x60, 0xb2, 0x00, + 0x0a, 0x63, 0x03, 0x6b, 0xb9, 0xcb, 0x1e, 0xc2, 0x00, 0x0c, 0xf8, 0x45, + 0x02, 0x9a, 0x43, 0x6b, 0xbf, 0xca, 0x9b, 0xda, 0x00, 0x0f, 0xf0, 0xd1, + 0x53, 0x76, 0x05, 0x3a, 0x51, 0xc2, 0x00, 0x11, 0x05, 0x3a, 0x60, 0xcb, + 0x98, 0x58, 0x00, 0x0f, 0x60, 0x11, 0xc3, 0x6b, 0xd1, 0xc8, 0x20, 0xa9, + 0x00, 0x06, 0x7a, 0x03, 0x6b, 0xdd, 0xc6, 0x05, 0x01, 0x00, 0xf1, 0x60, + 0xc9, 0x08, 0xf7, 0x00, 0x06, 0x71, 0xc4, 0x65, 0xe2, 0x00, 0x0e, 0x78, + 0xc9, 0x08, 0xf7, 0x00, 0x06, 0x53, 0x03, 0x6b, 0xe3, 0xc6, 0xbd, 0xf4, + 0x00, 0x11, 0x43, 0x03, 0x6b, 0xe7, 0xc4, 0x65, 0xe2, 0x00, 0x08, 0xd0, + 0xc6, 0x05, 0x01, 0x00, 0xf0, 0xd0, 0x11, 0xc3, 0x6b, 0xed, 0xc8, 0x20, + 0xa9, 0x00, 0x06, 0x58, 0x45, 0x02, 0x9a, 0x43, 0x6b, 0xf9, 0xc8, 0x0f, + 0xbd, 0x00, 0x0d, 0xc1, 0xca, 0x8e, 0x61, 0x00, 0x0f, 0x70, 0x45, 0x02, + 0x9a, 0x43, 0x6c, 0x05, 0xc9, 0x08, 0xf7, 0x00, 0x06, 0x13, 0x03, 0x6c, + 0x23, 0xc4, 0x65, 0xe2, 0x00, 0x0e, 0x68, 0x11, 0xc3, 0x6c, 0x29, 0xc8, + 0x20, 0xa9, 0x00, 0x06, 0x22, 0x03, 0x6c, 0x35, 0xc5, 0x05, 0x02, 0x00, + 0xf0, 0x01, 0xc5, 0x00, 0xd4, 0x00, 0x06, 0x2a, 0x03, 0x6c, 0x3b, 0xc5, + 0x31, 0xee, 0x00, 0x0f, 0xe1, 0xc6, 0x60, 0xb1, 0x00, 0x0f, 0x10, 0xc5, + 0x05, 0x02, 0x00, 0xf3, 0x13, 0x03, 0x6c, 0x41, 0xc5, 0x00, 0xd4, 0x00, + 0xf3, 0x00, 0xc4, 0x65, 0xe2, 0x00, 0x0b, 0x03, 0x03, 0x6c, 0x47, 0xc9, + 0x08, 0xf7, 0x00, 0x0a, 0xe1, 0xc6, 0xbd, 0xf4, 0x00, 0x0a, 0xf1, 0xca, + 0xa7, 0x1a, 0x00, 0x10, 0xc0, 0xce, 0x16, 0x0f, 0x00, 0xf3, 0x20, 0xd3, + 0x42, 0x2f, 0x05, 0x3e, 0x49, 0xc9, 0xb4, 0xeb, 0x01, 0x63, 0xf0, 0x43, + 0x05, 0x19, 0xc3, 0x6c, 0x4d, 0xc8, 0x25, 0xfb, 0x01, 0x63, 0x58, 0xc9, + 0x08, 0xf7, 0x00, 0xf4, 0x81, 0xc4, 0x65, 0xe2, 0x00, 0x0b, 0xe8, 0xc5, + 0x01, 0x74, 0x00, 0x0d, 0xa9, 0xc9, 0xb4, 0xeb, 0x01, 0x63, 0xf8, 0x43, + 0x05, 0x19, 0xc3, 0x6c, 0x59, 0xc8, 0x20, 0xa9, 0x00, 0xf4, 0x20, 0xc8, + 0x0e, 0x6f, 0x00, 0xf3, 0xf1, 0xce, 0x3e, 0xae, 0x05, 0x3a, 0xf0, 0xcf, + 0x68, 0x82, 0x00, 0xf3, 0x81, 0xc6, 0xbd, 0xf4, 0x00, 0x0b, 0x11, 0xc4, + 0x65, 0xe2, 0x00, 0x0b, 0x21, 0xca, 0xa7, 0x1a, 0x00, 0x10, 0xd0, 0x43, + 0x05, 0x19, 0xc3, 0x6c, 0x65, 0xce, 0x3e, 0xae, 0x00, 0x11, 0xf0, 0xd2, + 0x25, 0xf1, 0x05, 0x3b, 0x30, 0xc4, 0xde, 0x3f, 0x01, 0x63, 0x80, 0xca, + 0x64, 0x13, 0x00, 0xf2, 0xf1, 0x42, 0x00, 0x58, 0xc3, 0x6c, 0x71, 0xce, + 0x38, 0xe6, 0x05, 0x3d, 0x31, 0xc5, 0x31, 0xee, 0x00, 0x0a, 0xa1, 0x05, + 0xc3, 0x6c, 0x7d, 0xce, 0x1d, 0x93, 0x00, 0x10, 0x91, 0xc6, 0x01, 0x73, + 0x00, 0x12, 0x61, 0xc4, 0x14, 0xa6, 0x01, 0x63, 0x20, 0xc5, 0x01, 0x74, + 0x01, 0x63, 0x1b, 0x03, 0x6c, 0x89, 0xcc, 0x89, 0x01, 0x05, 0x3a, 0xa0, + 0xcf, 0x68, 0x82, 0x00, 0xf2, 0x51, 0xc6, 0xbd, 0xf4, 0x00, 0x0a, 0x29, + 0xc4, 0x65, 0xe2, 0x00, 0x0a, 0x38, 0xc9, 0x64, 0x14, 0x00, 0xf2, 0x41, + 0xc8, 0x6d, 0x46, 0x00, 0x0c, 0xe9, 0xcd, 0x7b, 0x08, 0x00, 0x11, 0x00, + 0x43, 0x05, 0x19, 0xc3, 0x6c, 0x8f, 0xc8, 0x25, 0xfb, 0x05, 0x3c, 0x80, + 0xcf, 0x68, 0x82, 0x00, 0xf1, 0xe1, 0xc6, 0xbd, 0xf4, 0x00, 0x09, 0xd9, + 0xc4, 0x65, 
0xe2, 0x00, 0x09, 0xe8, 0xc7, 0x0e, 0x70, 0x00, 0xf1, 0xb3, + 0x03, 0x6c, 0x9b, 0xc8, 0xa7, 0x26, 0x01, 0x63, 0x00, 0xc3, 0x02, 0xa3, + 0x00, 0x09, 0xf9, 0xc5, 0x1e, 0xc8, 0x01, 0x63, 0x10, 0xc5, 0x01, 0x74, + 0x00, 0x0a, 0x09, 0xcd, 0x6e, 0x05, 0x00, 0x0e, 0x40, 0xc2, 0x00, 0xb1, + 0x00, 0x11, 0xe9, 0xc3, 0x3a, 0xe6, 0x05, 0x3d, 0x68, 0xc8, 0x0e, 0x6f, + 0x00, 0xf1, 0x91, 0xce, 0x3e, 0xae, 0x05, 0x3a, 0x11, 0xc8, 0x25, 0xfb, + 0x01, 0x63, 0x48, 0xd4, 0x3e, 0xa8, 0x05, 0x3a, 0x20, 0xc6, 0xbd, 0xf4, + 0x00, 0x09, 0xb1, 0xc4, 0x65, 0xe2, 0x00, 0x0f, 0x40, 0xc6, 0xbd, 0xf4, + 0x00, 0xf1, 0x41, 0xc9, 0x08, 0xf7, 0x00, 0x09, 0x21, 0xc4, 0x65, 0xe2, + 0x00, 0x10, 0xf0, 0xc8, 0x20, 0xa9, 0x00, 0xf1, 0x31, 0x43, 0x05, 0x19, + 0xc3, 0x6c, 0xa1, 0xc8, 0x25, 0xfb, 0x01, 0x63, 0x38, 0xc9, 0x08, 0xf7, + 0x00, 0x08, 0xe1, 0xc6, 0xbd, 0xf4, 0x00, 0x09, 0x11, 0xc4, 0x65, 0xe2, + 0x00, 0x0f, 0x30, 0xcf, 0x68, 0x82, 0x00, 0xf0, 0x91, 0xc6, 0xbd, 0xf4, + 0x00, 0xf0, 0x81, 0xc4, 0x65, 0xe2, 0x00, 0x08, 0x70, 0xc5, 0x05, 0x02, + 0x00, 0xf0, 0x61, 0xc5, 0x00, 0xd4, 0x00, 0xf0, 0x50, 0xcd, 0x77, 0xa1, + 0x00, 0x0f, 0x93, 0x03, 0x6c, 0xad, 0xc5, 0x01, 0x74, 0x00, 0x08, 0x81, + 0xd3, 0x42, 0x2f, 0x05, 0x3e, 0x38, 0xc6, 0xbd, 0xf4, 0x00, 0x06, 0x3b, + 0x03, 0x6c, 0xb3, 0xc9, 0x08, 0xf7, 0x00, 0x08, 0x41, 0xc4, 0x65, 0xe2, + 0x00, 0x08, 0x60, 0xc5, 0x05, 0x02, 0x00, 0xf0, 0x21, 0xc5, 0x00, 0xd4, + 0x00, 0xf0, 0x10, 0xc9, 0x08, 0xf7, 0x00, 0x09, 0xa1, 0xcb, 0x4d, 0x16, + 0x05, 0x3d, 0x90, 0x45, 0x00, 0x8c, 0xc3, 0x6c, 0xb9, 0xc6, 0x10, 0x9d, + 0x01, 0x5b, 0x89, 0x4c, 0x14, 0x15, 0x43, 0x6c, 0xe3, 0xe0, 0x01, 0x47, + 0x01, 0x4b, 0x70, 0x46, 0x05, 0x39, 0x43, 0x6c, 0xe9, 0xc6, 0x44, 0x50, + 0x07, 0xd9, 0x59, 0xc7, 0x44, 0x4f, 0x07, 0xd9, 0x50, 0xc5, 0x64, 0xae, + 0x07, 0xd9, 0x81, 0xc5, 0x79, 0xbe, 0x07, 0xd9, 0x71, 0xc6, 0xcc, 0xe3, + 0x07, 0xd9, 0x78, 0xcc, 0x79, 0xeb, 0x05, 0x4b, 0x59, 0xc5, 0x8e, 0xdf, + 0x05, 0x4b, 0x21, 0xc6, 0xbb, 0xec, 0x05, 0x4b, 0x70, 0xc3, 0x39, 0x37, + 0x05, 0x4b, 0x61, 0x44, 0x3a, 0xbf, 0x43, 0x6c, 0xf5, 0xc6, 0xc1, 0x86, + 0x05, 0x4b, 0xc9, 0xc5, 0xc0, 0x7d, 0x00, 0x88, 0x20, 0xc6, 0xce, 0xb1, + 0x05, 0x4b, 0xc0, 0xc6, 0xd1, 0x57, 0x05, 0x4b, 0xa8, 0x0d, 0xc3, 0x6d, + 0x07, 0xc5, 0xd9, 0x61, 0x00, 0x89, 0x71, 0x16, 0xc3, 0x6d, 0x13, 0xc5, + 0xd6, 0x8c, 0x00, 0x89, 0x81, 0xc5, 0xda, 0xe7, 0x00, 0x89, 0x89, 0x12, + 0xc3, 0x6d, 0x1f, 0xc9, 0xad, 0x26, 0x00, 0x89, 0xa1, 0xc5, 0xb7, 0x9d, + 0x00, 0x89, 0xa9, 0x05, 0xc3, 0x6d, 0x2e, 0xc5, 0x90, 0xe4, 0x00, 0x89, + 0xd8, 0xc5, 0x90, 0xe4, 0x05, 0x4b, 0xd1, 0xc5, 0x79, 0xf2, 0x00, 0x8a, + 0xb0, 0xc5, 0x90, 0xe4, 0x05, 0x4b, 0xa1, 0x0d, 0xc3, 0x6d, 0x3a, 0x15, + 0xc3, 0x6d, 0x46, 0xc5, 0xd9, 0x61, 0x00, 0x88, 0xf9, 0x16, 0xc3, 0x6d, + 0x55, 0x05, 0xc3, 0x6d, 0x61, 0xc7, 0xba, 0x7b, 0x00, 0x89, 0x50, 0xc5, + 0xc0, 0x7d, 0x00, 0x8a, 0x11, 0xc6, 0xc1, 0x86, 0x00, 0x8a, 0x50, 0xc4, + 0x79, 0xf3, 0x00, 0x8a, 0x21, 0xc6, 0xca, 0x0e, 0x00, 0x8a, 0x31, 0xc6, + 0xba, 0x7c, 0x00, 0x8a, 0x58, 0xc4, 0xc6, 0x7a, 0x00, 0x8a, 0x41, 0xc6, + 0xc6, 0x79, 0x00, 0x8a, 0x48, 0xc5, 0xdb, 0xff, 0x05, 0x4b, 0x19, 0xc4, + 0xad, 0x2b, 0x05, 0x4b, 0x11, 0xc5, 0x79, 0xf2, 0x05, 0x4b, 0x09, 0xc5, + 0xda, 0xe7, 0x05, 0x4b, 0x01, 0xc6, 0x8e, 0xde, 0x00, 0x88, 0xb9, 0xc5, + 0xd6, 0x8c, 0x00, 0x8a, 0xf0, 0xc4, 0x79, 0xf3, 0x00, 0x89, 0x59, 0xc6, + 0xba, 0x7c, 0x00, 0x8a, 0xb8, 0x02, 0x43, 0x6d, 0x6d, 0x15, 0xc3, 0x6d, + 0x79, 0x05, 0x43, 0x6d, 0x85, 0xc3, 0x39, 0x37, 0x00, 0x89, 0xf1, 0x44, + 0x3a, 0xbf, 0x43, 0x6d, 0x91, 0xc4, 0xc6, 0x7a, 0x00, 0x8a, 0x81, 0xc6, + 0xc6, 0x79, 
0x00, 0x8a, 0xa8, 0x91, 0x00, 0x8b, 0xb1, 0x97, 0x00, 0x8b, + 0xb9, 0xc2, 0x2c, 0x43, 0x00, 0x8d, 0x18, 0x02, 0x43, 0x6d, 0x9d, 0x87, + 0x00, 0x8b, 0x21, 0x02, 0x43, 0x6d, 0xb0, 0x91, 0x00, 0x8b, 0x3a, 0x03, + 0x6d, 0xbe, 0x02, 0x43, 0x6d, 0xc2, 0x02, 0x43, 0x6d, 0xdb, 0xc2, 0x27, + 0x51, 0x00, 0x8c, 0xb8, 0x02, 0x43, 0x6d, 0xfe, 0x02, 0x43, 0x6e, 0x0c, + 0x87, 0x00, 0x8c, 0x03, 0x03, 0x6e, 0x1f, 0x1b, 0xc3, 0x6e, 0x23, 0x91, + 0x00, 0x8c, 0x13, 0x03, 0x6e, 0x31, 0x97, 0x00, 0x8c, 0x18, 0x87, 0x00, + 0x8b, 0x58, 0x91, 0x00, 0x8b, 0x78, 0x83, 0x00, 0x8c, 0x4b, 0x03, 0x6e, + 0x37, 0xc5, 0xd9, 0xa7, 0x00, 0x8c, 0x59, 0xc2, 0x0c, 0x43, 0x00, 0x8c, + 0x63, 0x03, 0x6e, 0x3b, 0x97, 0x00, 0x8c, 0x69, 0xc3, 0xe5, 0xc9, 0x06, + 0xbd, 0xb0, 0x83, 0x00, 0x8c, 0xc3, 0x03, 0x6e, 0x3f, 0x1b, 0xc3, 0x6e, + 0x45, 0x91, 0x00, 0x8c, 0xd3, 0x03, 0x6e, 0x5b, 0x97, 0x00, 0x8c, 0xd9, + 0xc2, 0x2c, 0x43, 0x00, 0x8c, 0xe1, 0x8b, 0x06, 0xbe, 0x20, 0x02, 0x43, + 0x6e, 0x61, 0xc5, 0xda, 0xe7, 0x00, 0x8f, 0x11, 0x12, 0xc3, 0x6e, 0x8e, + 0xc5, 0xd6, 0x8c, 0x06, 0xbe, 0xe8, 0xc6, 0x8e, 0xde, 0x00, 0x8d, 0x49, + 0xc4, 0xad, 0x2b, 0x00, 0x8d, 0xdb, 0x03, 0x6e, 0x9a, 0xc5, 0xd6, 0x8c, + 0x00, 0x8e, 0x83, 0x03, 0x6e, 0x9e, 0xc8, 0xb7, 0x9a, 0x00, 0x8f, 0x71, + 0xc5, 0xb7, 0x9d, 0x00, 0x8f, 0x71, 0xc5, 0xd9, 0x61, 0x00, 0x8f, 0xf9, + 0xc6, 0xc0, 0x7c, 0x06, 0xbe, 0x6b, 0x03, 0x6e, 0xa4, 0xc5, 0xda, 0xe7, + 0x06, 0xbf, 0x01, 0xc5, 0x79, 0xf2, 0x06, 0xbf, 0x31, 0xc5, 0xdb, 0xff, + 0x06, 0xbf, 0xc8, 0x02, 0x43, 0x6e, 0xaa, 0x05, 0xc3, 0x6e, 0xcc, 0xc5, + 0x90, 0xe4, 0x00, 0x8d, 0x69, 0xc6, 0x8e, 0xde, 0x00, 0x8e, 0x29, 0x16, + 0xc3, 0x6e, 0xd8, 0xc4, 0xad, 0x2b, 0x00, 0x8e, 0x39, 0xc7, 0xca, 0x0d, + 0x00, 0x8e, 0x41, 0xc5, 0xd6, 0x8c, 0x06, 0xbe, 0x58, 0x02, 0x43, 0x6e, + 0xe4, 0x0d, 0xc3, 0x6f, 0x09, 0xc5, 0xda, 0xe7, 0x00, 0x8d, 0x8b, 0x03, + 0x6f, 0x1e, 0x12, 0xc3, 0x6f, 0x22, 0x15, 0xc3, 0x6f, 0x37, 0x16, 0xc3, + 0x6f, 0x43, 0xc5, 0x90, 0xe4, 0x00, 0x8d, 0xb1, 0xc5, 0xd9, 0x61, 0x00, + 0x8e, 0x69, 0x42, 0x0c, 0x43, 0x43, 0x6f, 0x52, 0xc6, 0x8e, 0xde, 0x00, + 0x8e, 0xd1, 0xc5, 0xd6, 0x8c, 0x00, 0x8e, 0xd9, 0x12, 0xc3, 0x6f, 0x61, + 0x15, 0xc3, 0x6f, 0x70, 0x05, 0xc3, 0x6f, 0x7c, 0xc5, 0x90, 0xe4, 0x00, + 0x8f, 0x09, 0xc5, 0xd9, 0x61, 0x06, 0xbe, 0xf0, 0x02, 0x43, 0x6f, 0x88, + 0x02, 0x43, 0x6f, 0xb6, 0x02, 0x43, 0x6f, 0xc8, 0x0d, 0xc3, 0x6f, 0xd4, + 0xcb, 0x8e, 0xd9, 0x00, 0x8f, 0x68, 0x02, 0x43, 0x6f, 0xe0, 0xc5, 0xd9, + 0x61, 0x00, 0x8f, 0xa9, 0xc5, 0xd6, 0x8c, 0x00, 0x8f, 0xb1, 0xc5, 0xda, + 0xe7, 0x00, 0x8f, 0xb9, 0xc5, 0x79, 0xf2, 0x00, 0x8f, 0xc0, 0x02, 0x43, + 0x6f, 0xec, 0xc4, 0x79, 0xf3, 0x01, 0x8b, 0xc1, 0xc6, 0xba, 0x7c, 0x01, + 0x8c, 0x20, 0xc6, 0x8e, 0xde, 0x01, 0x8b, 0xd1, 0xc5, 0xd9, 0x61, 0x01, + 0x8b, 0xd9, 0xc6, 0xc0, 0x7c, 0x01, 0x8b, 0xe1, 0xc5, 0x79, 0xf2, 0x01, + 0x8b, 0xe9, 0xc5, 0xdb, 0xff, 0x01, 0x8b, 0xf0, 0xc5, 0xd9, 0xca, 0x01, + 0x8b, 0x48, 0xc4, 0x79, 0xf3, 0x01, 0x89, 0xe3, 0x03, 0x70, 0x06, 0xc6, + 0xba, 0x7c, 0x01, 0x89, 0xf9, 0xc6, 0xca, 0x0e, 0x01, 0x8b, 0x60, 0xc6, + 0xc1, 0x86, 0x01, 0x89, 0xf1, 0xc5, 0xc0, 0x7d, 0x01, 0x8b, 0x50, 0xc4, + 0x79, 0xf3, 0x01, 0x8b, 0x71, 0xc6, 0xca, 0x0e, 0x01, 0x8b, 0x80, 0xc4, + 0xad, 0x2b, 0x01, 0x8a, 0x23, 0x03, 0x70, 0x0c, 0xc6, 0x8e, 0xde, 0x01, + 0x8b, 0x91, 0x16, 0xc3, 0x70, 0x10, 0xc5, 0xdb, 0xff, 0x01, 0x8b, 0xb0, + 0xc8, 0x90, 0xe1, 0x01, 0x8c, 0x30, 0x02, 0x43, 0x70, 0x1c, 0xc2, 0x19, + 0x2c, 0x01, 0x8c, 0x3b, 0x03, 0x70, 0x28, 0x8b, 0x01, 0x8c, 0x48, 0xc2, + 0x0c, 0x43, 0x01, 0x8c, 0x5b, 0x03, 0x70, 0x2c, 0x8b, 0x01, 0x8c, 0x60, + 0x83, 0x07, 
0xfb, 0x61, 0x97, 0x07, 0xfb, 0x69, 0x91, 0x07, 0xfb, 0x70, + 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xd8, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x91, + 0xc7, 0x0d, 0x04, 0x0f, 0x64, 0x48, 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xd0, + 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x89, 0xc7, 0x0d, 0x04, 0x0f, 0x64, 0x40, + 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xc8, 0x00, 0x43, 0x70, 0x30, 0xc9, 0x57, + 0x20, 0x0f, 0x64, 0xc0, 0x00, 0x43, 0x70, 0x3c, 0xc9, 0x57, 0x20, 0x0f, + 0x64, 0xb8, 0x00, 0x43, 0x70, 0x48, 0xc9, 0x57, 0x20, 0x0f, 0x64, 0xb0, + 0x00, 0x43, 0x70, 0x54, 0x19, 0xc3, 0x70, 0x60, 0x0a, 0xc3, 0x70, 0x68, + 0xc2, 0x00, 0xc4, 0x01, 0x9f, 0x48, 0xc3, 0x09, 0x9e, 0x01, 0x9f, 0x1b, + 0x03, 0x70, 0x74, 0x0b, 0x43, 0x70, 0x7a, 0xc2, 0x22, 0xcc, 0x01, 0x9f, + 0x2b, 0x03, 0x70, 0x86, 0xc4, 0x18, 0x10, 0x01, 0x9f, 0x32, 0x03, 0x70, + 0x8c, 0xc4, 0x00, 0x2d, 0x01, 0x9f, 0x3b, 0x03, 0x70, 0x92, 0xc5, 0x66, + 0xb1, 0x01, 0x9f, 0x50, 0xc4, 0x14, 0x09, 0x01, 0x9f, 0x90, 0x91, 0x01, + 0x9a, 0xd1, 0x07, 0x43, 0x70, 0x98, 0xc3, 0x02, 0xdf, 0x01, 0x9a, 0xd9, + 0xc6, 0x52, 0xcd, 0x01, 0x9b, 0x28, 0xc4, 0x14, 0x09, 0x01, 0x9b, 0x30, + 0xc2, 0x00, 0x5f, 0x01, 0x9a, 0xe9, 0xc5, 0x14, 0x08, 0x01, 0x9b, 0x38, + 0xc4, 0x14, 0x09, 0x01, 0x9b, 0x40, 0xc4, 0x14, 0x09, 0x01, 0x9b, 0x48, + 0xc3, 0x03, 0x26, 0x01, 0x9b, 0x50, 0xd2, 0x4a, 0x2d, 0x0f, 0xd0, 0x31, + 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x69, 0xdf, 0x0d, 0x00, 0x0f, 0xd0, 0xd9, + 0x16, 0x43, 0x70, 0xa7, 0xc5, 0xa8, 0xf7, 0x0f, 0xd2, 0x71, 0xc4, 0xde, + 0x83, 0x0f, 0xd2, 0x79, 0xc6, 0xca, 0xfd, 0x0f, 0xd2, 0x80, 0xce, 0x2a, + 0xfe, 0x0f, 0xd0, 0x49, 0xdb, 0x18, 0x03, 0x0f, 0xd1, 0x98, 0xc7, 0x02, + 0x54, 0x01, 0x34, 0x31, 0xc8, 0x3e, 0xe6, 0x01, 0x4f, 0x60, 0xce, 0x3d, + 0x7c, 0x01, 0x2f, 0xb9, 0xcd, 0x02, 0xb4, 0x01, 0x2f, 0xa0, 0xce, 0x3d, + 0x7c, 0x01, 0x2f, 0xb1, 0xcd, 0x02, 0xb4, 0x01, 0x2f, 0xa8, 0xce, 0x61, + 0x30, 0x01, 0x3f, 0x29, 0xce, 0x13, 0x5f, 0x01, 0x2d, 0x10, 0xcd, 0x6f, + 0x2b, 0x01, 0x3f, 0x21, 0x45, 0x00, 0x27, 0x43, 0x70, 0xb3, 0xce, 0x3d, + 0x7c, 0x01, 0x2f, 0x99, 0xcd, 0x02, 0xb4, 0x01, 0x2f, 0x80, 0x00, 0x43, + 0x70, 0xbf, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0xa8, 0xc9, 0x57, 0x20, 0x08, + 0x4f, 0xa0, 0xc7, 0x0d, 0x04, 0x08, 0x4e, 0xc3, 0x03, 0x70, 0xcb, 0xc8, + 0x4b, 0x94, 0x08, 0x4f, 0x08, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x50, 0xc7, + 0x0d, 0x04, 0x08, 0x4e, 0xbb, 0x03, 0x70, 0xd1, 0xc8, 0x4b, 0x94, 0x08, + 0x4f, 0x00, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x48, 0x00, 0x43, 0x70, 0xd7, + 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x40, 0x00, 0x43, 0x70, 0xe6, 0xc9, 0x57, + 0x20, 0x08, 0x4f, 0x38, 0x00, 0x43, 0x70, 0xf5, 0xc9, 0x57, 0x20, 0x08, + 0x4f, 0x30, 0x00, 0x43, 0x71, 0x04, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x28, + 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x68, 0xc4, 0x03, 0xc8, 0x01, 0x4d, 0x79, + 0xc2, 0x02, 0xae, 0x01, 0x4d, 0x68, 0xc4, 0x03, 0xc8, 0x01, 0x4d, 0x71, + 0xc2, 0x02, 0xae, 0x01, 0x4d, 0x60, 0xc4, 0x00, 0x49, 0x01, 0x4d, 0x59, + 0xc5, 0x00, 0x2c, 0x01, 0x4d, 0x50, 0xc4, 0x00, 0x49, 0x01, 0x4d, 0x49, + 0xc5, 0x00, 0x2c, 0x01, 0x4d, 0x40, 0x83, 0x00, 0xc5, 0x29, 0xc2, 0x00, + 0xd0, 0x00, 0xc5, 0x20, 0xc2, 0x19, 0x2c, 0x00, 0xc5, 0x19, 0x83, 0x00, + 0xc4, 0xe0, 0xc2, 0x00, 0xd0, 0x00, 0xc5, 0x09, 0xc3, 0x40, 0xe2, 0x00, + 0xc4, 0xf8, 0x83, 0x00, 0xc5, 0x01, 0xc2, 0x01, 0x6f, 0x00, 0xc4, 0xf0, + 0xc5, 0x7c, 0x16, 0x00, 0xc5, 0x49, 0xc4, 0xe4, 0xa3, 0x00, 0xc4, 0x10, + 0xc2, 0x00, 0xd0, 0x00, 0xc4, 0x69, 0x83, 0x00, 0xc4, 0x60, 0xc3, 0xb4, + 0xa6, 0x00, 0xc4, 0xc9, 0xc2, 0x01, 0x6f, 0x00, 0xc4, 0xc0, 0x8e, 0x08, + 0xb0, 0x48, 0x94, 0x08, 0xb0, 0x38, 0xc4, 0x89, 0xfe, 0x00, 0xed, 0xf9, + 0x46, 0x45, 
0x87, 0xc3, 0x71, 0x13, 0x46, 0x00, 0x8b, 0xc3, 0x71, 0x45, + 0xc9, 0xad, 0x80, 0x00, 0xea, 0xa1, 0xd3, 0x45, 0x14, 0x08, 0x3d, 0x59, + 0xc9, 0xab, 0x40, 0x08, 0x3d, 0x63, 0x03, 0x71, 0x51, 0xcb, 0x8d, 0x37, + 0x08, 0x3d, 0x70, 0xc2, 0x25, 0xa1, 0x00, 0xed, 0xf1, 0xc2, 0x01, 0xe2, + 0x00, 0xed, 0xa1, 0xc2, 0x00, 0x8e, 0x00, 0xec, 0xf1, 0xc2, 0x00, 0x75, + 0x00, 0xea, 0x88, 0x46, 0x00, 0x8b, 0x43, 0x71, 0x57, 0x46, 0x00, 0x8b, + 0x43, 0x71, 0x63, 0x47, 0x0b, 0x18, 0xc3, 0x71, 0x6f, 0xca, 0x45, 0x1d, + 0x00, 0xec, 0xe9, 0xc2, 0x00, 0x0a, 0x00, 0xeb, 0x09, 0x46, 0x17, 0x8d, + 0x43, 0x71, 0xa8, 0xc6, 0x10, 0x3f, 0x00, 0xed, 0xb9, 0x00, 0x43, 0x71, + 0xb4, 0x46, 0x00, 0x8b, 0xc3, 0x71, 0xc0, 0x05, 0xc3, 0x71, 0xcc, 0xc9, + 0xa8, 0x94, 0x00, 0xea, 0xc8, 0xc2, 0x00, 0x0a, 0x00, 0xed, 0x90, 0xc7, + 0xc3, 0x76, 0x00, 0xed, 0x89, 0xc3, 0x04, 0x87, 0x00, 0xea, 0xe9, 0xcc, + 0x8b, 0x95, 0x00, 0xea, 0xa9, 0xca, 0x1f, 0x59, 0x08, 0x3c, 0x28, 0xce, + 0x01, 0x19, 0x00, 0xed, 0x79, 0xc9, 0x6d, 0x45, 0x00, 0xed, 0x70, 0xca, + 0x1f, 0x59, 0x00, 0xed, 0x60, 0x46, 0x00, 0x8b, 0xc3, 0x71, 0xd8, 0xca, + 0x9f, 0xcc, 0x05, 0x3f, 0xc9, 0xc9, 0xab, 0x40, 0x08, 0x3c, 0xc9, 0xc9, + 0xa8, 0x67, 0x08, 0x3c, 0xd1, 0xc3, 0xe6, 0x41, 0x08, 0x3c, 0xf2, 0x03, + 0x71, 0xf9, 0xd2, 0x4d, 0x0f, 0x00, 0xed, 0x40, 0xc3, 0x01, 0xbb, 0x00, + 0xed, 0x29, 0xcc, 0x23, 0x3f, 0x00, 0xed, 0x20, 0xd4, 0x3b, 0xc4, 0x00, + 0xed, 0x0b, 0x03, 0x71, 0xff, 0x07, 0xc3, 0x72, 0x05, 0x46, 0x00, 0x8b, + 0xc3, 0x72, 0x11, 0xc9, 0xa8, 0x67, 0x08, 0x3c, 0x3a, 0x03, 0x72, 0x20, + 0xcb, 0x92, 0x5f, 0x08, 0x3c, 0x80, 0x48, 0x10, 0x2f, 0xc3, 0x72, 0x26, + 0xc8, 0xb7, 0xda, 0x08, 0x3c, 0x89, 0x46, 0x00, 0x8b, 0x43, 0x72, 0x36, + 0x45, 0x29, 0xb4, 0xc3, 0x72, 0x42, 0xc4, 0x38, 0x2c, 0x00, 0x17, 0x01, + 0xca, 0x1f, 0x59, 0x08, 0x3c, 0x98, 0xc2, 0x00, 0x74, 0x00, 0xea, 0xe1, + 0xc4, 0xde, 0x3f, 0x00, 0xea, 0x29, 0x87, 0x08, 0x3c, 0x18, 0x44, 0x05, + 0x76, 0xc3, 0x72, 0x4e, 0xcc, 0x23, 0x3f, 0x08, 0x3d, 0x10, 0xc3, 0x0a, + 0xe3, 0x05, 0x5a, 0xe3, 0x03, 0x72, 0x56, 0x46, 0x00, 0x8b, 0x43, 0x72, + 0x5c, 0x48, 0x10, 0x2f, 0x43, 0x72, 0x68, 0x97, 0x00, 0xe9, 0xe8, 0xcc, + 0x23, 0x3f, 0x05, 0x3f, 0xc0, 0xc7, 0xc3, 0x84, 0x00, 0xe9, 0x78, 0x87, + 0x00, 0xe9, 0x68, 0xc4, 0x2a, 0xa0, 0x05, 0x38, 0x01, 0xc5, 0xdb, 0xcd, + 0x05, 0x38, 0x11, 0xc2, 0x00, 0xe3, 0x05, 0x38, 0x21, 0xc2, 0x17, 0x99, + 0x05, 0x38, 0x30, 0xc4, 0x2a, 0xa0, 0x05, 0x38, 0x09, 0xc5, 0xdb, 0xcd, + 0x05, 0x38, 0x19, 0xc2, 0x00, 0xe3, 0x05, 0x38, 0x29, 0xc2, 0x17, 0x99, + 0x05, 0x38, 0x38, 0xcc, 0x23, 0x33, 0x00, 0x16, 0x0b, 0x03, 0x72, 0x70, + 0xc5, 0x0a, 0x8a, 0x00, 0x15, 0xe8, 0xe0, 0x01, 0x07, 0x08, 0x3d, 0xc8, + 0xcd, 0x36, 0x86, 0x00, 0x16, 0x61, 0xc6, 0x60, 0xb1, 0x00, 0x16, 0x69, + 0xcc, 0x1f, 0x0c, 0x00, 0x16, 0x71, 0xcc, 0x83, 0x0d, 0x00, 0x16, 0x79, + 0x42, 0x00, 0x58, 0xc3, 0x72, 0x76, 0x44, 0x00, 0x49, 0xc3, 0x72, 0x82, + 0xd9, 0x1d, 0x6f, 0x05, 0x38, 0xf9, 0x16, 0xc3, 0x72, 0x91, 0xcc, 0x4d, + 0x8d, 0x00, 0x17, 0x81, 0x42, 0x00, 0x2c, 0xc3, 0x72, 0x9d, 0xd1, 0x08, + 0xf6, 0x05, 0x3c, 0x40, 0xc5, 0x18, 0x25, 0x00, 0x15, 0xd1, 0xca, 0x2d, + 0x84, 0x00, 0x17, 0x70, 0xc9, 0x03, 0xde, 0x00, 0x16, 0x29, 0xc4, 0x32, + 0xbc, 0x00, 0x16, 0xa8, 0xcc, 0x07, 0xbb, 0x05, 0x38, 0xb9, 0xc5, 0x03, + 0x02, 0x05, 0x38, 0xc1, 0xce, 0x0e, 0xf1, 0x05, 0x38, 0xc8, 0x00, 0xc3, + 0x72, 0xa9, 0x44, 0x04, 0xce, 0x43, 0x72, 0xbb, 0x47, 0x19, 0x7a, 0xc3, + 0x72, 0xc7, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x91, 0xc8, 0x4e, 0x93, 0x00, + 0x17, 0x28, 0x47, 0x19, 0x7a, 0xc3, 0x72, 0xd3, 0xd2, 0x4e, 0x89, 0x05, + 0x38, 0xb1, 
0xc8, 0x4e, 0x93, 0x00, 0x17, 0x48, 0xc8, 0x4e, 0x93, 0x05, + 0x38, 0x49, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x70, 0xc3, 0x11, 0x7e, 0x0e, + 0xb6, 0xd1, 0xc5, 0xd8, 0x8f, 0x0e, 0xb6, 0x80, 0xc3, 0x11, 0x7e, 0x0e, + 0xba, 0x71, 0xc5, 0xd8, 0x8f, 0x0e, 0xba, 0x20, 0xc3, 0x11, 0x7e, 0x0e, + 0xb9, 0xa1, 0xc5, 0xd8, 0x8f, 0x0e, 0xb9, 0x50, 0xc7, 0x00, 0x90, 0x0e, + 0xb9, 0x68, 0xc4, 0x18, 0x10, 0x0e, 0xbf, 0x99, 0xc2, 0x22, 0xcc, 0x0e, + 0xbf, 0x90, 0xc3, 0x0d, 0x14, 0x0e, 0xbf, 0x89, 0xc3, 0x09, 0x9e, 0x0e, + 0xbf, 0x80, 0xc4, 0x02, 0xde, 0x0e, 0xbf, 0x79, 0xc2, 0x02, 0xa0, 0x0e, + 0xbf, 0x70, 0xc8, 0x9c, 0x0e, 0x0e, 0xbe, 0x49, 0xc9, 0xaa, 0x9e, 0x0e, + 0xbe, 0x39, 0xd3, 0x43, 0x00, 0x0e, 0xbe, 0x18, 0x91, 0x0e, 0xb3, 0x23, + 0x03, 0x72, 0xdf, 0x92, 0x0e, 0xb3, 0x2b, 0x03, 0x72, 0xe3, 0x85, 0x0e, + 0xb2, 0xc3, 0x03, 0x72, 0xf3, 0x97, 0x0e, 0xb3, 0x53, 0x03, 0x72, 0xf9, + 0x96, 0x0e, 0xb3, 0x4b, 0x03, 0x72, 0xff, 0x95, 0x0e, 0xb3, 0x43, 0x03, + 0x73, 0x0b, 0x88, 0x0e, 0xb2, 0xdb, 0x03, 0x73, 0x11, 0x94, 0x0e, 0xb3, + 0x3b, 0x03, 0x73, 0x17, 0x9a, 0x0e, 0xb3, 0x6b, 0x03, 0x73, 0x1d, 0x90, + 0x0e, 0xb3, 0x1b, 0x03, 0x73, 0x21, 0x8f, 0x0e, 0xb3, 0x13, 0x03, 0x73, + 0x25, 0x8e, 0x0e, 0xb3, 0x0b, 0x03, 0x73, 0x29, 0x8d, 0x0e, 0xb3, 0x03, + 0x03, 0x73, 0x2f, 0x8b, 0x0e, 0xb2, 0xf3, 0x03, 0x73, 0x35, 0x87, 0x0e, + 0xb2, 0xd3, 0x03, 0x73, 0x3b, 0x9c, 0x0e, 0xb3, 0x7b, 0x03, 0x73, 0x47, + 0x86, 0x0e, 0xb2, 0xcb, 0x03, 0x73, 0x4d, 0x89, 0x0e, 0xb2, 0xe3, 0x03, + 0x73, 0x53, 0x84, 0x0e, 0xb2, 0xbb, 0x03, 0x73, 0x59, 0x83, 0x0e, 0xb2, + 0xb3, 0x03, 0x73, 0x5f, 0x9b, 0x0e, 0xb3, 0x71, 0x99, 0x0e, 0xb3, 0x61, + 0x98, 0x0e, 0xb3, 0x59, 0x93, 0x0e, 0xb3, 0x31, 0x8c, 0x0e, 0xb2, 0xf9, + 0x8a, 0x0e, 0xb2, 0xe8, 0x91, 0x0e, 0xb2, 0x53, 0x03, 0x73, 0x65, 0x92, + 0x0e, 0xb2, 0x5b, 0x03, 0x73, 0x69, 0x85, 0x0e, 0xb1, 0xf3, 0x03, 0x73, + 0x79, 0x97, 0x0e, 0xb2, 0x83, 0x03, 0x73, 0x7f, 0x96, 0x0e, 0xb2, 0x7b, + 0x03, 0x73, 0x85, 0x95, 0x0e, 0xb2, 0x73, 0x03, 0x73, 0x94, 0x94, 0x0e, + 0xb2, 0x6b, 0x03, 0x73, 0x9a, 0x9a, 0x0e, 0xb2, 0x9b, 0x03, 0x73, 0xa0, + 0x90, 0x0e, 0xb2, 0x4b, 0x03, 0x73, 0xa4, 0x8f, 0x0e, 0xb2, 0x43, 0x03, + 0x73, 0xa8, 0x8e, 0x0e, 0xb2, 0x3b, 0x03, 0x73, 0xac, 0x8d, 0x0e, 0xb2, + 0x33, 0x03, 0x73, 0xb2, 0x8b, 0x0e, 0xb2, 0x23, 0x03, 0x73, 0xb8, 0x87, + 0x0e, 0xb2, 0x03, 0x03, 0x73, 0xbe, 0x9c, 0x0e, 0xb2, 0xab, 0x03, 0x73, + 0xca, 0x86, 0x0e, 0xb1, 0xfb, 0x03, 0x73, 0xd0, 0x89, 0x0e, 0xb2, 0x13, + 0x03, 0x73, 0xd6, 0x84, 0x0e, 0xb1, 0xeb, 0x03, 0x73, 0xdc, 0x83, 0x0e, + 0xb1, 0xe3, 0x03, 0x73, 0xe2, 0x9b, 0x0e, 0xb2, 0xa1, 0x99, 0x0e, 0xb2, + 0x91, 0x98, 0x0e, 0xb2, 0x89, 0x93, 0x0e, 0xb2, 0x61, 0x8c, 0x0e, 0xb2, + 0x29, 0x8a, 0x0e, 0xb2, 0x19, 0x88, 0x0e, 0xb2, 0x08, 0x0f, 0x43, 0x73, + 0xe8, 0xc2, 0x00, 0xba, 0x0e, 0xbc, 0x39, 0xc2, 0x00, 0x0a, 0x0e, 0xbc, + 0x29, 0x8b, 0x0e, 0xbb, 0xf8, 0xc2, 0x00, 0x0a, 0x0e, 0xbc, 0x30, 0xc6, + 0x10, 0x3f, 0x0e, 0xbc, 0x20, 0xc2, 0x20, 0xec, 0x0e, 0xbc, 0x19, 0xc4, + 0x89, 0xfe, 0x0e, 0xbb, 0xb8, 0xc4, 0x1a, 0x73, 0x0e, 0xbc, 0x10, 0xca, + 0x91, 0x2c, 0x0e, 0xbc, 0x08, 0xc2, 0x01, 0x23, 0x0e, 0xbc, 0x00, 0x8b, + 0x0e, 0xbb, 0xe8, 0x97, 0x0e, 0xbb, 0xe0, 0x97, 0x0e, 0xbb, 0xd8, 0xc4, + 0xdd, 0x9a, 0x0e, 0xbb, 0xd0, 0xc4, 0x8b, 0x66, 0x0e, 0xbb, 0xc8, 0xc3, + 0x01, 0xbb, 0x0e, 0xbb, 0xc0, 0xc2, 0x01, 0x6f, 0x0e, 0xbb, 0xb1, 0xc6, + 0x10, 0x3f, 0x0e, 0xbb, 0xa0, 0xc3, 0x04, 0x87, 0x0e, 0xbb, 0xa8, 0xc4, + 0xdb, 0x4c, 0x0e, 0xbb, 0x98, 0xc4, 0x38, 0x2c, 0x0e, 0xbb, 0x90, 0xc3, + 0x04, 0x87, 0x0e, 0xbb, 0x88, 0xc4, 0xde, 0x3f, 0x0e, 0xbb, 0x80, 0x0f, + 0x43, 0x73, 
0xf4, 0xc2, 0x00, 0xba, 0x0e, 0xbb, 0x69, 0xc2, 0x00, 0x0a, + 0x0e, 0xbb, 0x59, 0x8b, 0x0e, 0xbb, 0x28, 0xc2, 0x00, 0x0a, 0x0e, 0xbb, + 0x60, 0xc6, 0x10, 0x3f, 0x0e, 0xbb, 0x50, 0xc2, 0x20, 0xec, 0x0e, 0xbb, + 0x49, 0xc4, 0x89, 0xfe, 0x0e, 0xba, 0xea, 0x03, 0x74, 0x00, 0xc4, 0x1a, + 0x73, 0x0e, 0xbb, 0x40, 0xc2, 0x01, 0x23, 0x0e, 0xbb, 0x30, 0x8b, 0x0e, + 0xbb, 0x18, 0x97, 0x0e, 0xbb, 0x10, 0x97, 0x0e, 0xbb, 0x08, 0xc4, 0xdd, + 0x9a, 0x0e, 0xbb, 0x00, 0xc4, 0x8b, 0x66, 0x0e, 0xba, 0xf8, 0xc3, 0x01, + 0xbb, 0x0e, 0xba, 0xf0, 0xc2, 0x01, 0x6f, 0x0e, 0xba, 0xe1, 0xc6, 0x10, + 0x3f, 0x0e, 0xba, 0xd0, 0xc3, 0x04, 0x87, 0x0e, 0xba, 0xd8, 0xc4, 0xdb, + 0x4c, 0x0e, 0xba, 0xc8, 0xc4, 0x38, 0x2c, 0x0e, 0xba, 0xc0, 0xc3, 0x04, + 0x87, 0x0e, 0xba, 0xb8, 0xc4, 0xde, 0x3f, 0x0e, 0xba, 0xb0, 0x8e, 0x00, + 0x6b, 0xf2, 0x03, 0x74, 0x06, 0x90, 0x00, 0x6b, 0xd0, 0x08, 0xc3, 0x74, + 0x0a, 0x07, 0xc3, 0x74, 0x16, 0x52, 0x48, 0xa1, 0xc3, 0x74, 0x22, 0xc9, + 0xb2, 0xe1, 0x0e, 0x8f, 0x19, 0xca, 0xa6, 0x7a, 0x0e, 0x8f, 0x11, 0xcf, + 0x61, 0xc5, 0x0e, 0x8f, 0x09, 0xc6, 0xcb, 0x39, 0x0e, 0x8e, 0xf0, 0xc7, + 0xc8, 0xe7, 0x0e, 0x8e, 0xd8, 0x84, 0x0e, 0x8e, 0x91, 0x49, 0x32, 0x9d, + 0x43, 0x74, 0x2e, 0x42, 0x02, 0x2f, 0xc3, 0x74, 0x3a, 0xc3, 0x61, 0xff, + 0x0e, 0x88, 0x58, 0x1a, 0xc3, 0x74, 0x46, 0xcc, 0x82, 0x29, 0x0e, 0x88, + 0x00, 0x44, 0x28, 0xcb, 0xc3, 0x74, 0x4e, 0xcb, 0x96, 0x11, 0x0e, 0x88, + 0x28, 0xcc, 0x81, 0x69, 0x0e, 0x8e, 0xe9, 0x44, 0xa1, 0x2c, 0x43, 0x74, + 0x5a, 0xc7, 0xc7, 0xc8, 0x0e, 0x8e, 0xcb, 0x03, 0x74, 0x66, 0xc5, 0xda, + 0x4c, 0x0e, 0x8e, 0xa0, 0xca, 0xa5, 0xe4, 0x0e, 0x8e, 0xe0, 0x5b, 0x15, + 0x0f, 0xc3, 0x74, 0x6c, 0x59, 0x15, 0x11, 0x43, 0x74, 0x7b, 0x00, 0x43, + 0x74, 0x8a, 0x46, 0x01, 0x94, 0x43, 0x74, 0x96, 0x4c, 0x8b, 0xe9, 0xc3, + 0x74, 0xa2, 0xce, 0x74, 0x94, 0x0e, 0x88, 0xc0, 0x0b, 0xc3, 0x74, 0xae, + 0x4f, 0x60, 0x5d, 0x43, 0x74, 0xba, 0xc3, 0xe6, 0x2f, 0x0e, 0x8e, 0x79, + 0xc7, 0xb2, 0x1d, 0x0e, 0x8c, 0x90, 0x0f, 0xc3, 0x74, 0xc6, 0xc2, 0x0e, + 0x9a, 0x0e, 0x88, 0x60, 0x48, 0xbb, 0xc2, 0xc3, 0x74, 0xd2, 0x49, 0xb1, + 0x67, 0x43, 0x74, 0xde, 0xc4, 0x03, 0xc8, 0x0e, 0x8d, 0x91, 0xc2, 0x02, + 0xae, 0x0e, 0x8d, 0x88, 0x48, 0xb7, 0xd2, 0x43, 0x74, 0xea, 0x00, 0x43, + 0x74, 0xf6, 0xc5, 0x02, 0xc2, 0x0e, 0x8a, 0x99, 0xc5, 0x01, 0xfc, 0x0e, + 0x8a, 0x90, 0xc5, 0x5e, 0x2d, 0x0e, 0x89, 0xd1, 0xd0, 0x5e, 0x22, 0x0e, + 0x89, 0x48, 0x07, 0xc3, 0x75, 0x02, 0x42, 0x00, 0x3a, 0x43, 0x75, 0x0c, + 0xc6, 0x2c, 0xfc, 0x0e, 0x8b, 0xc9, 0xc4, 0xdf, 0x3b, 0x0e, 0x8b, 0xb9, + 0xc3, 0x1e, 0x19, 0x0e, 0x8b, 0xa9, 0xc4, 0xd8, 0xf4, 0x0e, 0x8b, 0x98, + 0x00, 0x43, 0x75, 0x16, 0xc5, 0x02, 0xc2, 0x0e, 0x8e, 0x01, 0xc5, 0x01, + 0xfc, 0x0e, 0x8d, 0xf8, 0xc3, 0x08, 0x7c, 0x0e, 0x8c, 0x89, 0x43, 0xb1, + 0x5e, 0x43, 0x75, 0x22, 0x10, 0xc3, 0x75, 0x2e, 0xcd, 0x7d, 0x03, 0x0e, + 0x88, 0xd0, 0xc4, 0x03, 0xc8, 0x0e, 0x89, 0x69, 0xc2, 0x02, 0xae, 0x0e, + 0x89, 0x60, 0x48, 0xb7, 0xd2, 0x43, 0x75, 0x3a, 0xc6, 0x05, 0x01, 0x0e, + 0x88, 0x88, 0xc2, 0x15, 0x10, 0x0e, 0x8d, 0xa3, 0x03, 0x75, 0x46, 0xc5, + 0xd6, 0x5a, 0x0e, 0x88, 0x51, 0xc7, 0xc9, 0x65, 0x0e, 0x88, 0x49, 0xcc, + 0x81, 0xd5, 0x0e, 0x88, 0x20, 0xca, 0x9e, 0xa0, 0x0e, 0x8d, 0x49, 0xc9, + 0xb2, 0x1b, 0x0e, 0x8c, 0x98, 0xc4, 0x35, 0x36, 0x0e, 0x89, 0x59, 0xc5, + 0xa2, 0xba, 0x0e, 0x89, 0x51, 0xc7, 0x44, 0x3c, 0x0e, 0x88, 0x08, 0x9f, + 0x0e, 0x89, 0x31, 0x9e, 0x0e, 0x89, 0x28, 0xc4, 0x23, 0x2e, 0x0e, 0x8a, + 0xe9, 0xc4, 0x2c, 0x0d, 0x0e, 0x89, 0xd8, 0xca, 0xa1, 0x2a, 0x0e, 0x8d, + 0x81, 0xc4, 0x23, 0x2e, 0x0e, 0x8a, 0xf1, 0xc4, 0x2c, 0x0d, 0x0e, 0x89, + 0xe0, 0xc9, 
0xab, 0x13, 0x0e, 0x8d, 0x41, 0xc6, 0x2c, 0xfc, 0x0e, 0x8b, + 0xd1, 0xc4, 0xdf, 0x3b, 0x0e, 0x8b, 0xc1, 0xc3, 0x1e, 0x19, 0x0e, 0x8b, + 0xb1, 0xc4, 0xd8, 0xf4, 0x0e, 0x8b, 0xa0, 0xc4, 0x23, 0x2e, 0x0e, 0x8b, + 0x01, 0xc4, 0x2c, 0x0d, 0x0e, 0x89, 0xf0, 0xc4, 0x03, 0xc8, 0x0e, 0x89, + 0x79, 0xc2, 0x02, 0xae, 0x0e, 0x89, 0x70, 0x9e, 0x0e, 0x8c, 0xdb, 0x03, + 0x75, 0x4c, 0xa6, 0x0e, 0x8d, 0x19, 0xa5, 0x0e, 0x8d, 0x11, 0xa4, 0x0e, + 0x8d, 0x09, 0xa3, 0x0e, 0x8d, 0x01, 0xa2, 0x0e, 0x8c, 0xf9, 0xa1, 0x0e, + 0x8c, 0xf1, 0xa0, 0x0e, 0x8c, 0xe9, 0x9f, 0x0e, 0x8c, 0xe0, 0x57, 0x28, + 0xe4, 0xc3, 0x75, 0x54, 0xcb, 0x74, 0x97, 0x0e, 0x88, 0xb0, 0xc5, 0xd7, + 0x6d, 0x0e, 0x89, 0xb9, 0xc4, 0xe2, 0x4b, 0x0e, 0x89, 0xb0, 0xc9, 0xa8, + 0x79, 0x0e, 0x8c, 0x61, 0xcf, 0x61, 0xf2, 0x0e, 0x88, 0x38, 0x44, 0x61, + 0xf8, 0xc3, 0x75, 0x60, 0xd3, 0x44, 0x30, 0x0e, 0x88, 0x18, 0xc4, 0x23, + 0x2e, 0x0e, 0x8a, 0xf9, 0xc4, 0x2c, 0x0d, 0x0e, 0x89, 0xe9, 0x45, 0x2b, + 0x5f, 0x43, 0x75, 0x6c, 0xc5, 0xd7, 0x6d, 0x0e, 0x89, 0xc9, 0xc4, 0xe2, + 0x4b, 0x0e, 0x89, 0xc0, 0xc8, 0x01, 0x92, 0x01, 0x51, 0xd9, 0xcd, 0x76, + 0x35, 0x01, 0x51, 0xb9, 0xd1, 0x51, 0x56, 0x01, 0x51, 0xa9, 0xd0, 0x5b, + 0x92, 0x01, 0x51, 0xa0, 0xc8, 0x52, 0x09, 0x01, 0x51, 0x89, 0xc9, 0x16, + 0x14, 0x01, 0x51, 0x80, 0xc2, 0x00, 0xd0, 0x05, 0x53, 0x49, 0x83, 0x05, + 0x53, 0x40, 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0x71, 0x83, 0x05, 0x4f, 0x68, + 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0x21, 0x83, 0x00, 0x83, 0xf8, 0xc2, 0x00, + 0xc1, 0x05, 0x4f, 0x19, 0xc2, 0x19, 0x2c, 0x00, 0x83, 0xd1, 0x83, 0x00, + 0x83, 0xe0, 0x83, 0x00, 0x83, 0xa9, 0xc2, 0x00, 0xd0, 0x00, 0x83, 0xb0, + 0x83, 0x00, 0x83, 0xb9, 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0x00, 0x83, 0x00, + 0x83, 0xc1, 0xc2, 0x00, 0xd0, 0x05, 0x4f, 0x08, 0xa5, 0x0d, 0x7f, 0xf1, + 0xa4, 0x0d, 0x7f, 0xe9, 0xa2, 0x0d, 0x7f, 0xd9, 0xa1, 0x0d, 0x7f, 0xd1, + 0xa0, 0x0d, 0x7f, 0xc9, 0x9f, 0x0d, 0x7f, 0xc1, 0x9e, 0x0d, 0x7f, 0xb8, + 0xa5, 0x0d, 0x7f, 0xb1, 0xa4, 0x0d, 0x7f, 0xa9, 0xa2, 0x0d, 0x7f, 0x99, + 0xa1, 0x0d, 0x7f, 0x91, 0xa0, 0x0d, 0x7f, 0x89, 0x9f, 0x0d, 0x7f, 0x80, + 0x94, 0x00, 0x67, 0x00, 0x8e, 0x00, 0x67, 0x08, 0xc5, 0xde, 0x4d, 0x01, + 0x79, 0xa1, 0xc4, 0xb6, 0xdb, 0x01, 0x7b, 0x40, 0xc5, 0x8c, 0xf0, 0x01, + 0x79, 0x99, 0xca, 0xa3, 0x14, 0x01, 0x7d, 0x58, 0xc4, 0x2a, 0xa0, 0x01, + 0x7c, 0x48, 0xc4, 0x03, 0x0b, 0x01, 0x79, 0x69, 0x86, 0x01, 0x7d, 0x48, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xf9, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x78, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xe1, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x60, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xf1, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x70, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xe9, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x68, + 0x44, 0xdf, 0x37, 0xc3, 0x75, 0x78, 0x43, 0x93, 0x74, 0x43, 0x75, 0x84, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xb9, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x38, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xb1, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x30, + 0x04, 0xc3, 0x75, 0x90, 0xc3, 0x71, 0xec, 0x00, 0xbf, 0xb9, 0xc4, 0xda, + 0x97, 0x00, 0xbf, 0xb0, 0x4b, 0x18, 0x04, 0xc3, 0x75, 0x9c, 0xdc, 0x13, + 0xf9, 0x0f, 0xd2, 0x38, 0xc9, 0x1f, 0x5a, 0x01, 0x49, 0x21, 0xd4, 0x3c, + 0x8c, 0x01, 0x49, 0x41, 0x49, 0x0d, 0x20, 0x43, 0x75, 0xa8, 0x43, 0x01, + 0x7b, 0xc3, 0x75, 0xb4, 0xc9, 0x1f, 0x5a, 0x01, 0x49, 0x19, 0xd4, 0x39, + 0xd0, 0x01, 0x49, 0x39, 0xd9, 0x20, 0x5d, 0x01, 0x49, 0x90, 0x87, 0x0f, + 0x3f, 0xc8, 0x87, 0x0f, 0x3f, 0xb0, 0x87, 0x0f, 0x3f, 0x88, 0x87, 0x05, + 0x59, 0x20, 0x83, 0x05, 0x59, 0x18, 0x83, 0x00, 0x96, 0x98, 0x87, 0x00, + 0x96, 0xa0, 0xc3, 0x11, 0x7e, 0x00, 0x1d, 0x4b, 0x03, 0x75, 0xc0, 0xc5, + 0xd8, 0x8f, 
0x00, 0x1c, 0xfa, 0x03, 0x75, 0xc6, 0xcb, 0x8f, 0x9f, 0x00, + 0xff, 0x60, 0x46, 0x00, 0x8b, 0x43, 0x75, 0xcc, 0x46, 0x00, 0x8b, 0x43, + 0x75, 0xe6, 0xc2, 0x01, 0x6f, 0x00, 0x1c, 0xbb, 0x03, 0x76, 0x09, 0xc6, + 0x10, 0x3f, 0x00, 0x1c, 0xaa, 0x03, 0x76, 0x0f, 0xc4, 0xde, 0x3f, 0x00, + 0x1c, 0x8b, 0x03, 0x76, 0x15, 0xcc, 0x87, 0xed, 0x00, 0x1b, 0x90, 0xd1, + 0x51, 0x45, 0x00, 0x1b, 0xb1, 0x8b, 0x00, 0x1d, 0x01, 0xc2, 0x00, 0x0a, + 0x00, 0x1d, 0x31, 0xc2, 0x00, 0xba, 0x00, 0x1d, 0x40, 0xc4, 0x89, 0xfe, + 0x00, 0x1c, 0xc1, 0xc2, 0x20, 0xec, 0x00, 0x1d, 0x20, 0xc4, 0x1a, 0x73, + 0x00, 0x1d, 0x19, 0xc2, 0x01, 0x23, 0x00, 0x1f, 0xb9, 0xc2, 0x00, 0xd1, + 0x00, 0x1f, 0xd0, 0xc3, 0x11, 0x7e, 0x00, 0x1e, 0x4b, 0x03, 0x76, 0x1b, + 0xc5, 0xd8, 0x8f, 0x00, 0x1d, 0xfa, 0x03, 0x76, 0x21, 0x46, 0x00, 0x8b, + 0x43, 0x76, 0x27, 0x46, 0x00, 0x8b, 0x43, 0x76, 0x45, 0x46, 0x00, 0x8b, + 0x43, 0x76, 0x51, 0xc2, 0x01, 0x6f, 0x00, 0x1d, 0xbb, 0x03, 0x76, 0x6f, + 0xc6, 0x10, 0x3f, 0x00, 0x1d, 0xaa, 0x03, 0x76, 0x75, 0xc4, 0xde, 0x3f, + 0x00, 0x1d, 0x8b, 0x03, 0x76, 0x7b, 0x47, 0x78, 0xc0, 0x43, 0x76, 0x81, + 0xc4, 0xdb, 0x4c, 0x00, 0x1d, 0xa1, 0xc6, 0x51, 0x50, 0x00, 0x1e, 0xe8, + 0xc4, 0x89, 0xfe, 0x00, 0x1d, 0xc1, 0xc2, 0x20, 0xec, 0x00, 0x1e, 0x20, + 0xc4, 0x8b, 0x66, 0x00, 0x1d, 0xd1, 0xc4, 0x78, 0xc8, 0x00, 0x1e, 0xf8, + 0x8b, 0x00, 0x1e, 0x01, 0xc2, 0x00, 0x0a, 0x00, 0x1e, 0x31, 0xc2, 0x00, + 0xba, 0x00, 0x1e, 0x41, 0xd1, 0x51, 0x45, 0x00, 0x1b, 0xb8, 0xc4, 0x1a, + 0x73, 0x00, 0x1e, 0x19, 0xc5, 0xd6, 0xe6, 0x00, 0x1e, 0xd9, 0xc2, 0x01, + 0x23, 0x00, 0x1f, 0xc1, 0x03, 0x43, 0x76, 0x8d, 0x12, 0xc3, 0x76, 0x97, + 0xc3, 0x79, 0xe7, 0x00, 0xe9, 0x49, 0xc5, 0xdd, 0x99, 0x00, 0xe9, 0x39, + 0xc5, 0x51, 0x51, 0x00, 0xe9, 0x31, 0xc5, 0x9b, 0xd5, 0x05, 0x5b, 0x28, + 0xc7, 0x08, 0x79, 0x08, 0x0a, 0x01, 0x0a, 0xc3, 0x76, 0xa1, 0xc7, 0x3e, + 0x00, 0x08, 0x0a, 0x11, 0x49, 0x57, 0x21, 0x43, 0x76, 0xad, 0xc2, 0x00, + 0x5f, 0x08, 0x0a, 0x1b, 0x03, 0x76, 0xb9, 0xc3, 0x45, 0x6b, 0x08, 0x0a, + 0x22, 0x03, 0x76, 0xbd, 0x16, 0xc3, 0x76, 0xc1, 0xc7, 0x67, 0xc7, 0x08, + 0x0a, 0x81, 0xc4, 0x45, 0x6f, 0x08, 0x0a, 0xb8, 0xc3, 0x05, 0x14, 0x08, + 0x0a, 0xd1, 0xc3, 0x09, 0x41, 0x08, 0x0b, 0x11, 0xc5, 0x45, 0x69, 0x08, + 0x0b, 0x40, 0xc3, 0x05, 0x14, 0x08, 0x0a, 0xcb, 0x03, 0x76, 0xcd, 0x16, + 0xc3, 0x76, 0xd1, 0x42, 0x02, 0x09, 0x43, 0x76, 0xe1, 0x42, 0x02, 0x09, + 0xc3, 0x76, 0xed, 0xc3, 0x09, 0x41, 0x08, 0x0b, 0x02, 0x03, 0x76, 0xff, + 0xc9, 0x3d, 0xff, 0x08, 0x0a, 0xf0, 0xc5, 0x00, 0x48, 0x01, 0x54, 0x20, + 0xc4, 0x0d, 0x0e, 0x08, 0x79, 0x21, 0xc3, 0x02, 0xdf, 0x08, 0x78, 0xf8, + 0xc4, 0x18, 0x12, 0x08, 0x79, 0x19, 0x91, 0x08, 0x78, 0xf0, 0xc3, 0xb5, + 0x3e, 0x08, 0x78, 0xdb, 0x03, 0x77, 0x05, 0xc5, 0xd9, 0xde, 0x08, 0x78, + 0xb3, 0x03, 0x77, 0x0b, 0xc3, 0x20, 0x18, 0x08, 0x78, 0x7b, 0x03, 0x77, + 0x11, 0xc2, 0x01, 0x7f, 0x08, 0x78, 0x31, 0xc4, 0xe3, 0x27, 0x08, 0x78, + 0x19, 0xc5, 0xa5, 0xfd, 0x08, 0x78, 0x08, 0xc3, 0x11, 0xef, 0x08, 0x78, + 0xc9, 0x03, 0x43, 0x77, 0x17, 0x0e, 0xc3, 0x77, 0x23, 0xc3, 0x16, 0x5a, + 0x08, 0x78, 0x90, 0xc2, 0x00, 0x8e, 0x08, 0x78, 0x48, 0xc3, 0x1e, 0x1b, + 0x08, 0x53, 0xe1, 0xc2, 0x39, 0x8b, 0x08, 0x53, 0xd8, 0xc4, 0x40, 0x9c, + 0x08, 0x53, 0xc9, 0xc3, 0x77, 0x79, 0x08, 0x53, 0x98, 0x96, 0x08, 0x53, + 0x51, 0xc3, 0x77, 0x79, 0x08, 0x53, 0x71, 0xc4, 0xdc, 0x2d, 0x08, 0x53, + 0x78, 0xcc, 0x89, 0xb5, 0x08, 0x67, 0x88, 0xcc, 0x89, 0xb5, 0x08, 0x65, + 0x88, 0x89, 0x08, 0x61, 0x70, 0xc9, 0xb1, 0x28, 0x08, 0x1e, 0x42, 0x03, + 0x77, 0x2f, 0x83, 0x08, 0x1d, 0x19, 0x97, 0x08, 0x1d, 0x20, 0x83, 0x08, + 0x1d, 0x29, 
0x97, 0x08, 0x1d, 0x30, 0x83, 0x08, 0x1d, 0x39, 0xcb, 0x95, + 0x09, 0x08, 0x1e, 0x58, 0x83, 0x08, 0x1d, 0x49, 0x8b, 0x08, 0x1d, 0x50, + 0x83, 0x08, 0x1d, 0x59, 0x97, 0x08, 0x1d, 0x61, 0xc2, 0x00, 0xd0, 0x08, + 0x1d, 0x80, 0x83, 0x08, 0x1d, 0x6b, 0x03, 0x77, 0x3b, 0x8b, 0x08, 0x1d, + 0x71, 0x97, 0x08, 0x1d, 0x78, 0x83, 0x08, 0x1d, 0x93, 0x03, 0x77, 0x44, + 0xc6, 0xcc, 0x11, 0x08, 0x1e, 0x78, 0x83, 0x08, 0x1d, 0xa1, 0x97, 0x08, + 0x1d, 0xa8, 0x83, 0x08, 0x1d, 0xb1, 0x8b, 0x08, 0x1d, 0xb9, 0x97, 0x08, + 0x1d, 0xc0, 0x83, 0x08, 0x1d, 0xd1, 0x8b, 0x08, 0x1d, 0xd8, 0x83, 0x08, + 0x1d, 0xe1, 0x97, 0x08, 0x1d, 0xe8, 0x83, 0x08, 0x1d, 0xf9, 0xc2, 0x00, + 0xd0, 0x08, 0x1e, 0x09, 0xc2, 0x0d, 0xf6, 0x08, 0x1e, 0x10, 0x19, 0xc3, + 0x77, 0x4a, 0xc2, 0x00, 0xc4, 0x08, 0x1e, 0x98, 0x00, 0x43, 0x77, 0x54, + 0xca, 0xa2, 0x6a, 0x0e, 0x7d, 0x30, 0x46, 0x00, 0x8b, 0x43, 0x77, 0x66, + 0xcc, 0x87, 0x39, 0x0e, 0x7c, 0xf8, 0x43, 0x94, 0x9b, 0x43, 0x77, 0x72, + 0xcb, 0x94, 0x9b, 0x0e, 0x7c, 0x50, 0xc5, 0x00, 0x2c, 0x0e, 0x78, 0xb1, + 0xc4, 0x00, 0x49, 0x0e, 0x78, 0x50, 0x97, 0x00, 0xc7, 0x88, 0x91, 0x00, + 0xc7, 0x60, 0x91, 0x00, 0xc7, 0x58, 0xc5, 0x01, 0x6f, 0x00, 0xc7, 0xa9, + 0xc5, 0xdb, 0xa5, 0x00, 0xc7, 0x70, 0x87, 0x00, 0xb1, 0x58, 0x87, 0x00, + 0xb2, 0x58, 0x87, 0x00, 0xb0, 0xf8, 0x87, 0x00, 0xae, 0x38, 0x83, 0x00, + 0xb3, 0x61, 0x8b, 0x00, 0xb3, 0x59, 0x87, 0x00, 0xb3, 0x4b, 0x03, 0x77, + 0x7e, 0x91, 0x00, 0xb3, 0x41, 0x97, 0x00, 0xb3, 0x38, 0x87, 0x00, 0xaf, + 0x28, 0x87, 0x00, 0xb2, 0xf0, 0x87, 0x00, 0xae, 0xf8, 0x8b, 0x00, 0xb1, + 0xc1, 0x87, 0x00, 0xb1, 0xb3, 0x03, 0x77, 0x82, 0x91, 0x00, 0xb1, 0xa9, + 0x97, 0x00, 0xb1, 0xa1, 0x83, 0x00, 0xb1, 0xc8, 0x87, 0x00, 0xb1, 0xe8, + 0x87, 0x00, 0xaf, 0xf0, 0x87, 0x00, 0xaf, 0xc0, 0x87, 0x00, 0xae, 0xc8, + 0x87, 0x00, 0xb1, 0x88, 0x87, 0x00, 0xb2, 0xb8, 0x83, 0x00, 0xc7, 0x10, + 0x91, 0x00, 0xc7, 0x08, 0x87, 0x00, 0xa6, 0xe9, 0x8b, 0x00, 0xa6, 0xfb, + 0x03, 0x77, 0x86, 0x91, 0x00, 0xa7, 0x1b, 0x03, 0x77, 0x8a, 0x83, 0x00, + 0xa7, 0x3a, 0x03, 0x77, 0x8e, 0x8b, 0x00, 0xa2, 0xd3, 0x03, 0x77, 0x92, + 0x87, 0x00, 0xa2, 0xc1, 0x91, 0x00, 0xa2, 0xf3, 0x03, 0x77, 0x96, 0x83, + 0x00, 0xa3, 0x12, 0x03, 0x77, 0x9a, 0x83, 0x00, 0xa9, 0xd3, 0x03, 0x77, + 0x9e, 0x91, 0x00, 0xa9, 0xb3, 0x03, 0x77, 0xa2, 0x8b, 0x00, 0xa9, 0x93, + 0x03, 0x77, 0xa6, 0x87, 0x00, 0xa9, 0x80, 0x83, 0x00, 0xa9, 0x13, 0x03, + 0x77, 0xaa, 0x8b, 0x00, 0xa8, 0xd3, 0x03, 0x77, 0xae, 0x87, 0x00, 0xa8, + 0xc1, 0x91, 0x00, 0xa8, 0xf2, 0x03, 0x77, 0xb2, 0x83, 0x00, 0xa8, 0x0b, + 0x03, 0x77, 0xb6, 0x87, 0x00, 0xa7, 0xb9, 0x8b, 0x00, 0xa7, 0xcb, 0x03, + 0x77, 0xba, 0x91, 0x00, 0xa7, 0xea, 0x03, 0x77, 0xbe, 0x83, 0x00, 0xa2, + 0x2b, 0x03, 0x77, 0xc2, 0x91, 0x00, 0xa2, 0x0b, 0x03, 0x77, 0xc6, 0x8b, + 0x00, 0xa1, 0xeb, 0x03, 0x77, 0xca, 0x87, 0x00, 0xa1, 0xd8, 0x91, 0x00, + 0xa4, 0xd8, 0x8b, 0x00, 0xa4, 0xb8, 0x83, 0x00, 0xa4, 0xf8, 0x83, 0x00, + 0xa0, 0xd0, 0x91, 0x00, 0xa0, 0xa8, 0x8b, 0x00, 0xa0, 0x88, 0x83, 0x00, + 0xa4, 0x08, 0x8b, 0x00, 0xa3, 0xc8, 0x91, 0x00, 0xa3, 0xe8, 0x87, 0x00, + 0xa5, 0x69, 0x8b, 0x00, 0xa5, 0x7b, 0x03, 0x77, 0xce, 0x91, 0x00, 0xa5, + 0x9b, 0x03, 0x77, 0xd2, 0x83, 0x00, 0xa5, 0xba, 0x03, 0x77, 0xd6, 0x83, + 0x00, 0xa6, 0x70, 0x83, 0x00, 0xb3, 0xe3, 0x03, 0x77, 0xda, 0x91, 0x00, + 0xb3, 0xd3, 0x03, 0x77, 0xde, 0x8b, 0x00, 0xb3, 0xc3, 0x03, 0x77, 0xe2, + 0xc2, 0x02, 0xe0, 0x00, 0xb3, 0xb8, 0xc3, 0x0d, 0x14, 0x08, 0x9b, 0x59, + 0xc3, 0x09, 0x9e, 0x08, 0x9b, 0x50, 0xc4, 0x02, 0xde, 0x08, 0x9b, 0x49, + 0xc2, 0x02, 0xa0, 0x08, 0x9b, 0x40, 0xc6, 0x05, 0x01, 0x00, 0x18, 0xb0, + 0xc5, 0x05, 
0x02, 0x01, 0x07, 0x79, 0xc5, 0x00, 0xd4, 0x01, 0x06, 0xb8, + 0x03, 0xc3, 0x77, 0xe6, 0xc5, 0x05, 0x02, 0x00, 0x1a, 0xa8, 0xc5, 0x05, + 0x02, 0x00, 0x19, 0xc9, 0xc5, 0x00, 0xd4, 0x00, 0x1a, 0xb8, 0xc5, 0x05, + 0x02, 0x01, 0x07, 0x71, 0xc5, 0x00, 0xd4, 0x01, 0x06, 0xb0, 0xc5, 0x00, + 0xd4, 0x00, 0xef, 0xf1, 0xc5, 0x05, 0x02, 0x00, 0x1a, 0xa0, 0xc5, 0x00, + 0xd4, 0x00, 0x18, 0x71, 0xc5, 0x05, 0x02, 0x00, 0x1a, 0x40, 0xc5, 0x05, + 0x02, 0x00, 0xd6, 0x51, 0xc5, 0x00, 0xd4, 0x00, 0xd6, 0x48, 0xc9, 0x0f, + 0x6e, 0x07, 0xf1, 0x11, 0xca, 0x09, 0xb7, 0x07, 0xf1, 0x18, 0xc4, 0x00, + 0x49, 0x00, 0xef, 0xc1, 0xc5, 0x00, 0x2c, 0x00, 0x1a, 0xc0, 0xc2, 0x06, + 0xdb, 0x01, 0x66, 0x29, 0xc3, 0x07, 0x4a, 0x01, 0x66, 0xd8, 0xc3, 0x01, + 0x69, 0x01, 0x66, 0x69, 0x83, 0x01, 0x66, 0x7b, 0x03, 0x77, 0xf2, 0xc2, + 0x06, 0xdb, 0x01, 0x66, 0x98, 0xc2, 0x04, 0x2b, 0x01, 0x66, 0xf9, 0xc2, + 0x16, 0x5a, 0x01, 0x67, 0x08, 0xc2, 0x06, 0xdb, 0x01, 0x66, 0x21, 0xc3, + 0x07, 0x4a, 0x01, 0x66, 0xd0, 0xc3, 0x01, 0x69, 0x01, 0x66, 0x61, 0x83, + 0x01, 0x66, 0x73, 0x03, 0x77, 0xf6, 0xc2, 0x06, 0xdb, 0x01, 0x66, 0x90, + 0xc2, 0x04, 0x2b, 0x01, 0x66, 0xf1, 0xc2, 0x16, 0x5a, 0x01, 0x67, 0x00, + 0xc8, 0x02, 0x9f, 0x0f, 0xc8, 0x09, 0xc9, 0x3b, 0x79, 0x0f, 0xc8, 0x00, + 0x42, 0x00, 0x45, 0xc3, 0x77, 0xfa, 0x16, 0xc3, 0x78, 0x04, 0x08, 0xc3, + 0x78, 0x10, 0x15, 0xc3, 0x78, 0x1c, 0xc5, 0x06, 0xdb, 0x01, 0x92, 0xc1, + 0xc4, 0x26, 0x78, 0x01, 0x92, 0xc8, 0x42, 0x00, 0x45, 0xc3, 0x78, 0x28, + 0x16, 0xc3, 0x78, 0x32, 0x08, 0xc3, 0x78, 0x3e, 0x15, 0xc3, 0x78, 0x4a, + 0xc5, 0x06, 0xdb, 0x01, 0x95, 0x99, 0xc4, 0x26, 0x78, 0x01, 0x95, 0xa0, + 0x42, 0x00, 0x45, 0xc3, 0x78, 0x56, 0x16, 0xc3, 0x78, 0x60, 0x08, 0xc3, + 0x78, 0x6c, 0x15, 0xc3, 0x78, 0x78, 0xc5, 0x06, 0xdb, 0x01, 0x95, 0xe9, + 0xc4, 0x26, 0x78, 0x01, 0x95, 0xf0, 0x96, 0x01, 0x95, 0x09, 0xc5, 0x53, + 0x93, 0x01, 0x95, 0x70, 0xa0, 0x09, 0x2a, 0x01, 0x8f, 0x09, 0x1a, 0x30, + 0x94, 0x09, 0x19, 0xf9, 0xc7, 0x5d, 0x9b, 0x09, 0x19, 0xf1, 0x8e, 0x09, + 0x19, 0xe8, 0x86, 0x09, 0x29, 0xe9, 0x9f, 0x09, 0x19, 0x8a, 0x03, 0x78, + 0x84, 0x8e, 0x09, 0x19, 0x71, 0x46, 0x25, 0xd4, 0x43, 0x78, 0x8a, 0xd9, + 0x1f, 0xe0, 0x09, 0x15, 0xe9, 0xd9, 0x1a, 0xe7, 0x09, 0x15, 0xe0, 0xc7, + 0x25, 0xd4, 0x09, 0x15, 0xb0, 0xc5, 0x39, 0xc7, 0x09, 0x16, 0x68, 0xc4, + 0x96, 0x9c, 0x09, 0x16, 0x49, 0xc2, 0x00, 0x65, 0x09, 0x16, 0x40, 0xc2, + 0x38, 0xb6, 0x09, 0x29, 0x81, 0x84, 0x09, 0x15, 0x08, 0x0a, 0xc3, 0x78, + 0x96, 0xc2, 0x00, 0x65, 0x09, 0x14, 0xf8, 0xc2, 0x01, 0xe2, 0x09, 0x15, + 0x31, 0x94, 0x09, 0x15, 0x29, 0x8f, 0x09, 0x15, 0x21, 0x84, 0x09, 0x15, + 0x19, 0x9f, 0x09, 0x15, 0x10, 0xc2, 0x00, 0x33, 0x09, 0x14, 0xd9, 0xc2, + 0x06, 0x4e, 0x09, 0x14, 0xd0, 0x84, 0x09, 0x14, 0xc0, 0xc4, 0xdc, 0xae, + 0x09, 0x29, 0x61, 0xc7, 0x65, 0xd1, 0x09, 0x29, 0x59, 0xc2, 0x01, 0xe2, + 0x09, 0x12, 0xf9, 0xca, 0xa0, 0xb2, 0x09, 0x12, 0xf0, 0xc3, 0x02, 0x2c, + 0x09, 0x29, 0x41, 0xd0, 0x5e, 0x12, 0x09, 0x12, 0xb8, 0x17, 0xc3, 0x78, + 0xa2, 0x8b, 0x09, 0x1c, 0x92, 0x03, 0x78, 0xaa, 0x47, 0x25, 0xd4, 0x43, + 0x78, 0xb0, 0xc2, 0x05, 0x1d, 0x09, 0x12, 0xc9, 0x87, 0x09, 0x12, 0xc0, + 0xc2, 0x01, 0xe2, 0x09, 0x12, 0xa3, 0x03, 0x78, 0xbf, 0x90, 0x09, 0x12, + 0x98, 0xc2, 0x02, 0xad, 0x09, 0x13, 0xc8, 0xc2, 0x5d, 0xd4, 0x09, 0x13, + 0xb9, 0xc5, 0xda, 0x7e, 0x09, 0x13, 0xb1, 0xc2, 0x02, 0x6f, 0x09, 0x13, + 0xa9, 0xc2, 0x00, 0xdb, 0x09, 0x13, 0xa1, 0xc4, 0xe1, 0x67, 0x09, 0x13, + 0x99, 0xc8, 0x6a, 0x1e, 0x09, 0x13, 0x91, 0xc3, 0x6c, 0x49, 0x09, 0x13, + 0x89, 0xc3, 0x84, 0x21, 0x09, 0x13, 0x81, 0xc2, 0x01, 0x2d, 0x09, 0x13, + 0x79, 0xc6, 
0xcb, 0x87, 0x09, 0x13, 0x70, 0xd9, 0x20, 0x12, 0x09, 0x13, + 0x38, 0xc3, 0x32, 0xbf, 0x09, 0x29, 0x09, 0xc2, 0x01, 0x30, 0x09, 0x29, + 0x01, 0xc9, 0xb1, 0x8b, 0x09, 0x11, 0xb8, 0xc2, 0x02, 0x1c, 0x09, 0x1c, + 0x69, 0xc2, 0x01, 0xdd, 0x09, 0x11, 0xe1, 0x83, 0x09, 0x11, 0xd2, 0x03, + 0x78, 0xc5, 0x16, 0xc3, 0x78, 0xcb, 0xc3, 0x0b, 0x64, 0x09, 0x28, 0xe3, + 0x03, 0x78, 0xd7, 0x0a, 0xc3, 0x78, 0xdd, 0xc4, 0x04, 0x59, 0x09, 0x28, + 0xd1, 0x15, 0xc3, 0x78, 0xe9, 0xc4, 0x73, 0x32, 0x09, 0x10, 0x03, 0x03, + 0x78, 0xf3, 0x10, 0xc3, 0x78, 0xf7, 0xca, 0xa7, 0xb0, 0x09, 0x10, 0x59, + 0x42, 0x00, 0xdb, 0xc3, 0x78, 0xff, 0x0d, 0xc3, 0x79, 0x0b, 0xc2, 0x03, + 0x4e, 0x09, 0x10, 0x21, 0xc9, 0x5d, 0x99, 0x09, 0x10, 0x11, 0xc3, 0x62, + 0x19, 0x09, 0x0f, 0xf9, 0xc2, 0x00, 0x65, 0x09, 0x0f, 0xf0, 0xca, 0x8d, + 0x2d, 0x09, 0x1c, 0x48, 0x17, 0xc3, 0x79, 0x15, 0xcd, 0x7b, 0x56, 0x09, + 0x28, 0xa1, 0xd5, 0x36, 0x5c, 0x09, 0x28, 0x99, 0xc2, 0x00, 0xec, 0x09, + 0x28, 0x91, 0xc3, 0x04, 0x2a, 0x09, 0x28, 0x83, 0x03, 0x79, 0x1f, 0xc2, + 0x01, 0x30, 0x09, 0x28, 0x79, 0xc3, 0xd5, 0x59, 0x09, 0x28, 0x70, 0x17, + 0xc3, 0x79, 0x25, 0x16, 0xc3, 0x79, 0x33, 0xc2, 0x00, 0xdb, 0x09, 0x28, + 0x31, 0xc3, 0xaa, 0xfe, 0x09, 0x28, 0x29, 0xce, 0x75, 0x04, 0x09, 0x28, + 0x21, 0xc3, 0x62, 0x19, 0x09, 0x28, 0x19, 0xc3, 0x02, 0x2c, 0x09, 0x28, + 0x10, 0x47, 0x03, 0x4c, 0x43, 0x79, 0x3d, 0xca, 0x9e, 0x00, 0x09, 0x26, + 0xa1, 0x09, 0xc3, 0x79, 0x55, 0x97, 0x09, 0x0f, 0x2b, 0x03, 0x79, 0x69, + 0x16, 0xc3, 0x79, 0x7f, 0x15, 0xc3, 0x79, 0x89, 0xc2, 0x02, 0x6f, 0x09, + 0x0e, 0xd9, 0x0f, 0xc3, 0x79, 0x93, 0x0e, 0xc3, 0x79, 0xa0, 0x0d, 0xc3, + 0x79, 0xb3, 0x0b, 0xc3, 0x79, 0xbe, 0x0a, 0xc3, 0x79, 0xcb, 0xc2, 0x00, + 0xc4, 0x09, 0x0e, 0x19, 0xc3, 0x14, 0x96, 0x09, 0x0e, 0x11, 0x04, 0xc3, + 0x79, 0xd8, 0x83, 0x09, 0x0d, 0xca, 0x03, 0x79, 0xe2, 0xd4, 0x39, 0xbc, + 0x09, 0x0f, 0x80, 0xc9, 0xa6, 0x17, 0x09, 0x0f, 0x70, 0x8e, 0x09, 0x1c, + 0x28, 0x00, 0x43, 0x79, 0xf6, 0xd1, 0x55, 0x1f, 0x09, 0x0b, 0x30, 0xc2, + 0x00, 0xac, 0x09, 0x0b, 0xb9, 0xc2, 0x04, 0x2b, 0x09, 0x0b, 0xb1, 0xc2, + 0x05, 0xc3, 0x09, 0x0b, 0xa8, 0xcf, 0x6a, 0x17, 0x09, 0x08, 0xd0, 0x45, + 0x03, 0x4e, 0xc3, 0x7a, 0x02, 0xc3, 0x58, 0xf6, 0x09, 0x08, 0xa8, 0x0a, + 0xc3, 0x7a, 0x14, 0xc2, 0x01, 0xdf, 0x09, 0x07, 0x41, 0x03, 0x43, 0x7a, + 0x1f, 0x87, 0x09, 0x26, 0x23, 0x03, 0x7a, 0x27, 0xc2, 0x05, 0x1d, 0x09, + 0x07, 0x02, 0x03, 0x7a, 0x2d, 0xc3, 0x5d, 0xd1, 0x09, 0x26, 0x19, 0x8b, + 0x09, 0x06, 0xf9, 0xc9, 0xa7, 0xb1, 0x09, 0x06, 0xf0, 0xc2, 0x53, 0x31, + 0x09, 0x26, 0x11, 0x83, 0x09, 0x06, 0xea, 0x03, 0x7a, 0x33, 0x17, 0xc3, + 0x7a, 0x3a, 0xc2, 0x02, 0xfb, 0x09, 0x06, 0xd3, 0x03, 0x7a, 0x46, 0x03, + 0x43, 0x7a, 0x4c, 0x03, 0xc3, 0x7a, 0x56, 0xc3, 0xc5, 0xa4, 0x09, 0x06, + 0xa9, 0xc9, 0xaa, 0x44, 0x09, 0x06, 0xa0, 0x83, 0x09, 0x25, 0xdb, 0x03, + 0x7a, 0x63, 0x8b, 0x09, 0x06, 0x6a, 0x03, 0x7a, 0x70, 0xc3, 0x1a, 0x52, + 0x09, 0x25, 0xd1, 0x90, 0x09, 0x06, 0x4b, 0x03, 0x7a, 0x7d, 0x8e, 0x09, + 0x06, 0x3a, 0x03, 0x7a, 0x83, 0x17, 0xc3, 0x7a, 0x89, 0x8b, 0x09, 0x06, + 0x23, 0x03, 0x7a, 0x93, 0x83, 0x09, 0x06, 0x18, 0x03, 0xc3, 0x7a, 0x99, + 0xc2, 0x00, 0x33, 0x09, 0x06, 0x0a, 0x03, 0x7a, 0xa9, 0xc2, 0x01, 0xe2, + 0x09, 0x05, 0xeb, 0x03, 0x7a, 0xaf, 0x90, 0x09, 0x05, 0xe3, 0x03, 0x7a, + 0xb6, 0xd0, 0x58, 0xf2, 0x09, 0x05, 0xd9, 0x46, 0x25, 0xd4, 0x43, 0x7a, + 0xbc, 0x86, 0x09, 0x07, 0x5a, 0x03, 0x7a, 0xce, 0xd3, 0x40, 0xa0, 0x09, + 0x06, 0xb9, 0xc7, 0x6a, 0x1f, 0x09, 0x06, 0xb0, 0xcb, 0x8c, 0xf5, 0x09, + 0x05, 0x80, 0xc8, 0x0b, 0x08, 0x09, 0x05, 0x68, 0xca, 0x8c, 0xf6, 0x09, + 0x05, 0x20, 
0x8f, 0x09, 0x24, 0xfb, 0x03, 0x7a, 0xd4, 0xc5, 0xdc, 0x36, + 0x09, 0x24, 0xf0, 0xc4, 0x5d, 0xd2, 0x09, 0x24, 0xe3, 0x03, 0x7a, 0xda, + 0x94, 0x09, 0x24, 0xd8, 0xc2, 0x01, 0xe2, 0x09, 0x24, 0xb1, 0xc7, 0xc4, + 0x4f, 0x09, 0x24, 0xa8, 0xc8, 0x10, 0x61, 0x09, 0x24, 0x78, 0x47, 0x5d, + 0xd5, 0xc3, 0x7a, 0xe0, 0xc2, 0x01, 0xe2, 0x09, 0x03, 0x68, 0x97, 0x09, + 0x03, 0x2b, 0x03, 0x7a, 0xec, 0x83, 0x09, 0x03, 0x20, 0xc8, 0x36, 0x68, + 0x09, 0x03, 0x10, 0xc2, 0x04, 0x3d, 0x09, 0x02, 0xf9, 0x8b, 0x09, 0x02, + 0xeb, 0x03, 0x7a, 0xf6, 0x83, 0x09, 0x02, 0xda, 0x03, 0x7a, 0xfc, 0x8b, + 0x09, 0x02, 0xd1, 0xc4, 0x4f, 0x68, 0x09, 0x02, 0xc8, 0xc3, 0x01, 0xc3, + 0x09, 0x02, 0xc1, 0xca, 0x97, 0xbe, 0x09, 0x02, 0xb8, 0xdf, 0x0d, 0x1f, + 0x09, 0x01, 0xe8, 0xe0, 0x0b, 0x47, 0x09, 0x01, 0xd8, 0xc2, 0x02, 0x1c, + 0x09, 0x14, 0x69, 0xc2, 0x04, 0x3d, 0x09, 0x14, 0x61, 0xc3, 0x45, 0xb0, + 0x09, 0x14, 0x58, 0xc8, 0x20, 0xa9, 0x00, 0x26, 0xe9, 0xc8, 0x25, 0xfb, + 0x00, 0x24, 0xb8, 0xc8, 0x20, 0xa9, 0x00, 0x26, 0xe1, 0xc8, 0x25, 0xfb, + 0x00, 0x24, 0xb0, 0xc7, 0xc7, 0xeb, 0x00, 0x6d, 0x41, 0xc6, 0x8e, 0x9c, + 0x00, 0x6d, 0x70, 0xc7, 0xc4, 0x25, 0x00, 0x6d, 0x51, 0xc6, 0x8e, 0x9c, + 0x00, 0x6d, 0x80, 0xc5, 0x20, 0xe5, 0x0e, 0xce, 0xa1, 0xc7, 0xb7, 0x3a, + 0x0e, 0xce, 0x28, 0xc5, 0x20, 0xe5, 0x0e, 0xce, 0x99, 0xc7, 0xb7, 0x3a, + 0x0e, 0xce, 0x20, 0xc5, 0x20, 0xe5, 0x0e, 0xce, 0x91, 0xc7, 0xb7, 0x3a, + 0x0e, 0xce, 0x18, 0xc5, 0xdd, 0x17, 0x0e, 0xcd, 0x99, 0xca, 0x9e, 0x8c, + 0x0e, 0xcd, 0x60, 0xc5, 0xdd, 0x17, 0x0e, 0xcd, 0x91, 0xca, 0x9e, 0x8c, + 0x0e, 0xcd, 0x58, 0xc5, 0xdd, 0x17, 0x0e, 0xcd, 0x89, 0xca, 0x9e, 0x8c, + 0x0e, 0xcd, 0x50, 0xc9, 0x51, 0x1a, 0x0e, 0xd3, 0x30, 0xc9, 0x51, 0x1a, + 0x0e, 0xd3, 0x20, 0xcb, 0x57, 0x45, 0x0e, 0xd1, 0x19, 0xc6, 0x00, 0x58, + 0x0e, 0xd1, 0x10, 0xcb, 0x57, 0x45, 0x0e, 0xd1, 0x31, 0xc6, 0x00, 0x58, + 0x0e, 0xd1, 0x28, 0xc4, 0x0e, 0x65, 0x0e, 0xc8, 0x21, 0xc5, 0x0e, 0xce, + 0x0e, 0xc7, 0xab, 0x03, 0x7b, 0x02, 0xc5, 0x06, 0x82, 0x0e, 0xc0, 0x03, + 0x03, 0x7b, 0x06, 0x47, 0x04, 0xcb, 0xc3, 0x7b, 0x0a, 0x45, 0x00, 0x9d, + 0xc3, 0x7b, 0x2f, 0x47, 0x13, 0x95, 0xc3, 0x7b, 0x5c, 0xdb, 0x18, 0xdb, + 0x0e, 0xc2, 0x50, 0x46, 0xd1, 0x5d, 0xc3, 0x7b, 0x84, 0x46, 0x0e, 0xce, + 0xc3, 0x7b, 0x99, 0xc4, 0x0e, 0x65, 0x0e, 0xc2, 0xe3, 0x03, 0x7b, 0xab, + 0xd4, 0x3a, 0xac, 0x0e, 0xc2, 0xd9, 0x08, 0x43, 0x7b, 0xaf, 0x00, 0x43, + 0x7b, 0xc1, 0x00, 0x43, 0x7b, 0xd9, 0xc6, 0x13, 0x95, 0x0e, 0xc5, 0x99, + 0xdd, 0x11, 0x17, 0x0e, 0xc5, 0x68, 0xc5, 0x06, 0x82, 0x0e, 0xc5, 0x1b, + 0x03, 0x7b, 0xe5, 0xc2, 0x02, 0xae, 0x0e, 0xc4, 0xb0, 0xc5, 0x06, 0x82, + 0x0e, 0xc0, 0x23, 0x03, 0x7b, 0xee, 0xc6, 0x04, 0xcb, 0x0e, 0xc6, 0x2b, + 0x03, 0x7b, 0xf2, 0xc4, 0x00, 0x9d, 0x0e, 0xc5, 0x3b, 0x03, 0x7b, 0xf8, + 0xc6, 0x13, 0x95, 0x0e, 0xc4, 0x53, 0x03, 0x7b, 0xfe, 0x46, 0x0e, 0xce, + 0xc3, 0x7c, 0x02, 0xc8, 0xbc, 0x62, 0x0e, 0xc4, 0x11, 0xc4, 0x05, 0x75, + 0x0e, 0xc3, 0xdb, 0x03, 0x7c, 0x11, 0xc5, 0x03, 0x13, 0x0e, 0xc3, 0xf1, + 0x08, 0x43, 0x7c, 0x15, 0x47, 0x04, 0xcb, 0xc3, 0x7c, 0x21, 0x52, 0x3c, + 0x00, 0xc3, 0x7c, 0x30, 0xca, 0x4c, 0x69, 0x0e, 0xc5, 0xc9, 0xc8, 0xbc, + 0x5a, 0x0e, 0xc3, 0x50, 0x00, 0x43, 0x7c, 0x42, 0x00, 0x43, 0x7c, 0x6f, + 0xde, 0x0e, 0xc8, 0x0e, 0xc7, 0x49, 0xdc, 0x13, 0x89, 0x0e, 0xc6, 0xb3, + 0x03, 0x7c, 0x81, 0x46, 0x0e, 0xce, 0xc3, 0x7c, 0x87, 0xc8, 0xbc, 0x62, + 0x0e, 0xc3, 0x41, 0xd6, 0x18, 0xdb, 0x0e, 0xc2, 0x48, 0x47, 0x04, 0xcb, + 0xc3, 0x7c, 0x93, 0xc5, 0x06, 0x82, 0x0e, 0xc0, 0x0b, 0x03, 0x7c, 0xa2, + 0xcb, 0x13, 0x90, 0x0e, 0xc5, 0x89, 0x47, 0x13, 0x95, 0x43, 0x7c, 0xa6, + 0xc7, 0x27, 
0xb2, 0x0e, 0xc3, 0xd1, 0xc4, 0x0e, 0xe2, 0x0e, 0xc3, 0xc0, + 0xc5, 0x0e, 0xd4, 0x0e, 0xd0, 0x29, 0xc8, 0x45, 0x27, 0x0e, 0xd0, 0x18, + 0xc5, 0x0e, 0xd4, 0x0e, 0xd0, 0x21, 0xc4, 0x00, 0x70, 0x0e, 0xd0, 0x11, + 0xc8, 0x45, 0x27, 0x0e, 0xd0, 0x08, 0xc4, 0x03, 0x14, 0x0e, 0xce, 0xe9, + 0xc4, 0xa2, 0x4c, 0x0e, 0xce, 0xe0, 0x46, 0x20, 0xe5, 0xc3, 0x7c, 0xb2, + 0x48, 0xb7, 0x3a, 0x43, 0x7c, 0xbe, 0xc5, 0x17, 0x14, 0x0e, 0xcb, 0x3b, + 0x03, 0x7c, 0xca, 0xc6, 0x01, 0xdb, 0x0e, 0xcb, 0x31, 0xc5, 0x03, 0x13, + 0x0e, 0xcb, 0x28, 0x46, 0x17, 0x14, 0xc3, 0x7c, 0xd0, 0x46, 0x03, 0x13, + 0x43, 0x7c, 0xdc, 0x46, 0x17, 0x14, 0xc3, 0x7c, 0xe8, 0x46, 0x03, 0x13, + 0x43, 0x7c, 0xf4, 0x47, 0x2c, 0x2e, 0xc3, 0x7d, 0x00, 0xcc, 0x8a, 0x39, + 0x0e, 0xce, 0x49, 0xcc, 0x81, 0xe1, 0x0e, 0xce, 0x40, 0x46, 0x17, 0x14, + 0xc3, 0x7d, 0x0c, 0x46, 0x03, 0x13, 0x43, 0x7d, 0x18, 0xc2, 0x00, 0x15, + 0x0e, 0xce, 0xc0, 0x46, 0x20, 0xe5, 0xc3, 0x7d, 0x24, 0x48, 0xb7, 0x3a, + 0x43, 0x7d, 0x30, 0xc5, 0x17, 0x14, 0x0e, 0xcd, 0xb1, 0xc6, 0x01, 0xdb, + 0x0e, 0xcd, 0xa9, 0xc5, 0x03, 0x13, 0x0e, 0xcd, 0xa0, 0xc5, 0xdd, 0x17, + 0x0e, 0xcd, 0x81, 0xca, 0x9e, 0x8c, 0x0e, 0xcd, 0x48, 0x47, 0x2c, 0x2e, + 0xc3, 0x7d, 0x3c, 0x47, 0x00, 0x58, 0x43, 0x7d, 0x4e, 0x0a, 0xc3, 0x7d, + 0x60, 0x42, 0x00, 0x8e, 0xc3, 0x7d, 0x6c, 0x48, 0x15, 0x02, 0x43, 0x7d, + 0x78, 0xc6, 0x01, 0xdb, 0x0e, 0xcd, 0x09, 0xc5, 0x03, 0x13, 0x0e, 0xcd, + 0x00, 0xc5, 0x17, 0x14, 0x0e, 0xc9, 0x63, 0x03, 0x7d, 0x8d, 0xc6, 0x01, + 0xdb, 0x0e, 0xc9, 0x59, 0xc5, 0x03, 0x13, 0x0e, 0xc9, 0x50, 0xc2, 0x00, + 0x15, 0x0e, 0xcb, 0x20, 0xc2, 0x00, 0x15, 0x0e, 0xcb, 0x00, 0xc5, 0x03, + 0x13, 0x0e, 0xc9, 0x31, 0xc5, 0x17, 0x14, 0x0e, 0xc9, 0x28, 0xd0, 0x59, + 0x02, 0x08, 0xae, 0x59, 0xd2, 0x48, 0x8f, 0x08, 0xae, 0x50, 0xc8, 0x0d, + 0x03, 0x01, 0x0b, 0xf0, 0x00, 0x43, 0x7d, 0x93, 0xdf, 0x0d, 0x3e, 0x01, + 0x4b, 0x79, 0x06, 0x43, 0x7d, 0xa5, 0xd2, 0x05, 0xd4, 0x0f, 0xc0, 0x19, + 0xd5, 0x03, 0xd2, 0x0f, 0xc0, 0x98, 0xca, 0x03, 0x87, 0x01, 0x0d, 0x99, + 0xc9, 0x01, 0x88, 0x01, 0x0d, 0x90, 0xd6, 0x2e, 0x3e, 0x01, 0x1b, 0xe1, + 0xc3, 0x13, 0x1d, 0x01, 0x15, 0xf0, 0xc9, 0x33, 0xad, 0x01, 0x4c, 0x90, + 0x45, 0x00, 0x8c, 0xc3, 0x7d, 0xab, 0xc6, 0x10, 0x9d, 0x01, 0x5b, 0x91, + 0x44, 0x00, 0x9a, 0x43, 0x7d, 0xd5, 0xc3, 0x14, 0xa7, 0x01, 0x48, 0xb3, + 0x03, 0x7d, 0xdb, 0xd2, 0x05, 0xd5, 0x01, 0x5f, 0x70, 0xcf, 0x62, 0x3d, + 0x01, 0x4b, 0x69, 0x46, 0x00, 0xd4, 0xc3, 0x7d, 0xe1, 0xc6, 0x10, 0x9d, + 0x01, 0x4a, 0xb9, 0xc8, 0xae, 0xbc, 0x01, 0x4a, 0xf8, 0x46, 0x00, 0xd4, + 0xc3, 0x7d, 0xe7, 0xc8, 0xae, 0xbc, 0x01, 0x4a, 0xd9, 0xc6, 0x10, 0x9d, + 0x01, 0x4a, 0x98, 0xcf, 0x2c, 0x35, 0x01, 0x48, 0xa1, 0xd6, 0x2d, 0x62, + 0x01, 0x48, 0xa8, 0xc2, 0x02, 0xfa, 0x00, 0x70, 0x11, 0xc3, 0x05, 0x21, + 0x00, 0x70, 0x19, 0xc3, 0x0c, 0x26, 0x00, 0x70, 0x21, 0xc2, 0x00, 0x45, + 0x00, 0x70, 0x28, 0xc3, 0x93, 0x9b, 0x00, 0x72, 0x19, 0xc4, 0xcb, 0x97, + 0x00, 0x72, 0x20, 0x87, 0x00, 0x71, 0xb8, 0x03, 0xc3, 0x7d, 0xef, 0xc3, + 0x38, 0x86, 0x00, 0x70, 0xb1, 0xc3, 0x08, 0x48, 0x00, 0x70, 0xc0, 0xc3, + 0x38, 0x86, 0x00, 0x70, 0xe1, 0xc2, 0x00, 0xd1, 0x00, 0x70, 0xf0, 0xc2, + 0x01, 0x23, 0x00, 0x72, 0x49, 0xc2, 0x00, 0x2c, 0x00, 0x72, 0x50, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0xb1, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x40, 0x44, + 0x19, 0x6a, 0xc3, 0x7d, 0xf9, 0xce, 0x43, 0x77, 0x07, 0xed, 0x29, 0xd7, + 0x26, 0xea, 0x07, 0xed, 0x38, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0xa9, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0x38, 0xd7, 0x26, 0xea, 0x07, 0xed, 0x31, 0xce, + 0x43, 0x77, 0x07, 0xed, 0xf0, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0xc1, 0xcb, + 0x10, 0xb5, 
0x07, 0xe5, 0x50, 0xce, 0x43, 0x77, 0x07, 0xea, 0xd1, 0xd7, + 0x26, 0xea, 0x07, 0xea, 0xd8, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0xb9, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0x48, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x91, 0xcb, + 0x10, 0xb5, 0x07, 0xe6, 0xc0, 0xd1, 0x30, 0xc1, 0x07, 0xec, 0x99, 0xd1, + 0x50, 0x13, 0x07, 0xec, 0xa0, 0xcd, 0x00, 0xfa, 0x07, 0xe7, 0xf1, 0xca, + 0x26, 0xf7, 0x07, 0xe8, 0xd0, 0x43, 0x2b, 0xba, 0xc3, 0x7e, 0x05, 0x43, + 0x02, 0x98, 0x43, 0x7e, 0x11, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x49, 0xca, + 0x26, 0xf7, 0x07, 0xe9, 0x41, 0x0b, 0xc3, 0x7e, 0x27, 0x45, 0x00, 0x8c, + 0x43, 0x7e, 0x33, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0xc9, 0xcd, 0x00, 0xfa, + 0x07, 0xe7, 0xe8, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x29, 0x0b, 0xc3, 0x7e, + 0x3f, 0xd3, 0x43, 0x72, 0x07, 0xeb, 0x49, 0xcb, 0x64, 0x7b, 0x07, 0xe9, + 0xb8, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x39, 0x0b, 0xc3, 0x7e, 0x4b, 0xcb, + 0x64, 0x7b, 0x07, 0xe9, 0xc8, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x49, 0xcd, + 0x00, 0xfa, 0x07, 0xe8, 0x68, 0x00, 0xc3, 0x7e, 0x57, 0xd1, 0x56, 0x51, + 0x07, 0xe2, 0xf8, 0x00, 0xc3, 0x7e, 0x63, 0xd1, 0x56, 0x51, 0x07, 0xe2, + 0xf0, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x91, 0xcd, 0x00, 0xfa, 0x07, 0xe3, + 0x00, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0xf9, 0xcb, 0x10, 0xb5, 0x07, 0xe5, + 0x80, 0x44, 0x19, 0x6a, 0xc3, 0x7e, 0x6f, 0xd1, 0x30, 0xc1, 0x07, 0xeb, + 0x09, 0x45, 0x19, 0x60, 0x43, 0x7e, 0x7b, 0xcc, 0x00, 0xfb, 0x07, 0xe0, + 0xf1, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x78, 0xd7, 0x26, 0xea, 0x07, 0xed, + 0x41, 0xce, 0x43, 0x77, 0x07, 0xee, 0x30, 0x0b, 0xc3, 0x7e, 0x87, 0xcb, + 0x64, 0x7b, 0x07, 0xe9, 0xa9, 0xd6, 0x30, 0xbc, 0x07, 0xea, 0xe0, 0xcc, + 0x10, 0xb4, 0x07, 0xe9, 0x89, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x40, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0xe1, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x68, 0xd0, + 0x50, 0xf1, 0x07, 0xea, 0xe9, 0xd7, 0x26, 0xea, 0x07, 0xea, 0xf0, 0x0b, + 0xc3, 0x7e, 0x93, 0x4a, 0x74, 0x6e, 0x43, 0x7e, 0x9f, 0x0b, 0xc3, 0x7e, + 0xab, 0x45, 0x00, 0x8c, 0x43, 0x7e, 0xb7, 0xcd, 0x00, 0xfa, 0x07, 0xe8, + 0x79, 0xca, 0x26, 0xf7, 0x07, 0xe9, 0x58, 0xca, 0x26, 0xf7, 0x07, 0xe9, + 0x09, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x28, 0xca, 0x26, 0xf7, 0x07, 0xe9, + 0x11, 0xcd, 0x00, 0xfa, 0x07, 0xe8, 0x30, 0x43, 0x12, 0xad, 0xc3, 0x7e, + 0xc3, 0x00, 0x43, 0x7e, 0xcd, 0xcd, 0x77, 0x53, 0x07, 0xee, 0x79, 0xcf, + 0x30, 0xd9, 0x07, 0xef, 0xa8, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x51, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0xd8, 0xce, 0x43, 0x77, 0x07, 0xed, 0xb1, 0x45, + 0x19, 0x60, 0xc3, 0x7e, 0xd9, 0xd7, 0x26, 0xea, 0x07, 0xeb, 0xc0, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0x49, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0xd0, 0xca, + 0x26, 0xf7, 0x07, 0xeb, 0xa9, 0xcc, 0x10, 0xb4, 0x07, 0xee, 0x20, 0xcd, + 0x00, 0xfa, 0x07, 0xe2, 0xe9, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x80, 0xca, + 0x26, 0xf7, 0x07, 0xe9, 0xe1, 0xcd, 0x00, 0xfa, 0x07, 0xe9, 0xe8, 0x49, + 0x82, 0xa3, 0xc3, 0x7e, 0xe5, 0x0f, 0x43, 0x7e, 0xef, 0xcd, 0x00, 0xfa, + 0x07, 0xe7, 0xb1, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0x90, 0xcd, 0x00, 0xfa, + 0x07, 0xe7, 0xa9, 0xca, 0x26, 0xf7, 0x07, 0xe8, 0x88, 0x0b, 0xc3, 0x7e, + 0xfb, 0xcb, 0x64, 0x7b, 0x07, 0xe9, 0xd1, 0x45, 0x00, 0x8c, 0x43, 0x7f, + 0x07, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x31, 0xcb, 0x10, 0xb5, 0x07, 0xe5, + 0xc0, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0xd9, 0xcd, 0x00, 0xfa, 0x07, 0xe0, + 0xa0, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0xd1, 0xcd, 0x00, 0xfa, 0x07, 0xe0, + 0x98, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0xc1, 0x0b, 0xc3, 0x7f, 0x19, 0xcb, + 0x64, 0x7b, 0x07, 0xe7, 0x28, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x71, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0x20, 0xd1, 0x30, 0xc1, 0x07, 0xea, 0xa9, 0xd0, + 0x50, 0xf1, 
0x07, 0xea, 0xb1, 0xd1, 0x50, 0xf0, 0x07, 0xea, 0xb9, 0xce, + 0x43, 0x77, 0x07, 0xed, 0x19, 0xd7, 0x26, 0xea, 0x07, 0xed, 0x20, 0xcc, + 0x00, 0xfb, 0x07, 0xe0, 0x69, 0xcb, 0x10, 0xb5, 0x07, 0xe5, 0x18, 0xd1, + 0x50, 0x13, 0x07, 0xea, 0xa1, 0xce, 0x43, 0x77, 0x07, 0xed, 0x09, 0xd7, + 0x26, 0xea, 0x07, 0xed, 0x10, 0x0b, 0xc3, 0x7f, 0x25, 0x45, 0x00, 0x8c, + 0x43, 0x7f, 0x31, 0xcc, 0x10, 0xb4, 0x07, 0xe5, 0x29, 0xcb, 0x64, 0x7b, + 0x07, 0xe7, 0x20, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x59, 0xcb, 0x10, 0xb5, + 0x07, 0xe5, 0x08, 0xd1, 0x50, 0x13, 0x07, 0xea, 0x81, 0xce, 0x43, 0x77, + 0x07, 0xec, 0xf9, 0xd7, 0x26, 0xea, 0x07, 0xed, 0x00, 0x1b, 0xc3, 0x7f, + 0x3d, 0x03, 0xc3, 0x7f, 0x49, 0xcf, 0x60, 0x8a, 0x07, 0xe3, 0x39, 0x45, + 0x19, 0x60, 0xc3, 0x7f, 0x55, 0xcf, 0x69, 0x81, 0x07, 0xe3, 0x29, 0xce, + 0x72, 0xf0, 0x07, 0xe3, 0x21, 0x0a, 0xc3, 0x7f, 0x65, 0x46, 0x30, 0xc1, + 0xc3, 0x7f, 0x71, 0x42, 0x00, 0x5d, 0xc3, 0x7f, 0x7d, 0x43, 0x94, 0xf6, + 0xc3, 0x7f, 0x87, 0x42, 0x03, 0x53, 0xc3, 0x7f, 0x93, 0x44, 0xdf, 0x2b, + 0xc3, 0x7f, 0x9f, 0xd1, 0x50, 0xf0, 0x07, 0xe4, 0xc8, 0x0b, 0xc3, 0x7f, + 0xab, 0xd3, 0x43, 0x72, 0x07, 0xed, 0x70, 0xca, 0x26, 0xf7, 0x07, 0xec, + 0xe1, 0xcc, 0x10, 0xb4, 0x07, 0xec, 0xe8, 0xcc, 0x00, 0xfb, 0x07, 0xe2, + 0x61, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x98, 0xd1, 0x50, 0x13, 0x07, 0xec, + 0xa9, 0xd7, 0x26, 0xea, 0x07, 0xec, 0xb1, 0xce, 0x43, 0x77, 0x07, 0xed, + 0x98, 0xcc, 0x10, 0xb4, 0x07, 0xed, 0xc1, 0xca, 0x26, 0xf7, 0x07, 0xed, + 0xe8, 0xca, 0x26, 0xf7, 0x07, 0xec, 0xb9, 0xcc, 0x10, 0xb4, 0x07, 0xec, + 0xc0, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0xe1, 0xcb, 0x10, 0xb5, 0x07, 0xe6, + 0x40, 0x45, 0x19, 0x60, 0xc3, 0x7f, 0xb7, 0xce, 0x43, 0x77, 0x07, 0xed, + 0xb8, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0xd9, 0xcb, 0x10, 0xb5, 0x07, 0xe6, + 0x38, 0xca, 0x26, 0xf7, 0x07, 0xe4, 0x19, 0xcd, 0x00, 0xfa, 0x07, 0xe1, + 0xe8, 0xcd, 0x00, 0xfa, 0x07, 0xf7, 0xa9, 0xca, 0x26, 0xf7, 0x07, 0xf7, + 0xb0, 0x46, 0x05, 0x34, 0xc3, 0x7f, 0xc3, 0x46, 0x00, 0xd4, 0x43, 0x7f, + 0xcf, 0xca, 0x26, 0xf7, 0x07, 0xec, 0x39, 0xcc, 0x10, 0xb4, 0x07, 0xec, + 0x40, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x01, 0xcb, 0x10, 0xb5, 0x07, 0xe6, + 0x50, 0x45, 0x19, 0x60, 0xc3, 0x7f, 0xdb, 0xce, 0x43, 0x77, 0x07, 0xec, + 0x09, 0xd7, 0x26, 0xea, 0x07, 0xec, 0x10, 0xca, 0x26, 0xf7, 0x07, 0xec, + 0x21, 0xcc, 0x10, 0xb4, 0x07, 0xec, 0x18, 0xcc, 0x10, 0xb4, 0x07, 0xed, + 0xd1, 0xca, 0x26, 0xf7, 0x07, 0xed, 0xe0, 0xca, 0x26, 0xf7, 0x07, 0xe3, + 0xf9, 0xcd, 0x00, 0xfa, 0x07, 0xe1, 0xb0, 0xca, 0x26, 0xf7, 0x07, 0xe3, + 0xf1, 0xcd, 0x00, 0xfa, 0x07, 0xe1, 0xa8, 0x0b, 0xc3, 0x7f, 0xe7, 0x45, + 0x00, 0x8c, 0x43, 0x7f, 0xf3, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x99, 0xcb, + 0x10, 0xb5, 0x07, 0xe6, 0x10, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x41, 0xcb, + 0x10, 0xb5, 0x07, 0xe4, 0xf8, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x39, 0xcb, + 0x10, 0xb5, 0x07, 0xe4, 0xf0, 0x0b, 0xc3, 0x80, 0x05, 0xd3, 0x43, 0x72, + 0x07, 0xee, 0x10, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x11, 0xcc, 0x10, 0xb4, + 0x07, 0xe5, 0x00, 0x8f, 0x07, 0xea, 0x1b, 0x03, 0x80, 0x11, 0xc3, 0x3a, + 0x09, 0x07, 0xea, 0x28, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x41, 0xcb, 0x10, + 0xb5, 0x07, 0xe6, 0x88, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x39, 0xcb, 0x10, + 0xb5, 0x07, 0xe6, 0x80, 0xd1, 0x30, 0xc1, 0x07, 0xec, 0x71, 0xd1, 0x50, + 0x13, 0x07, 0xec, 0x79, 0xce, 0x43, 0x77, 0x07, 0xed, 0xc8, 0xcc, 0x00, + 0xfb, 0x07, 0xe2, 0x31, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x78, 0xd1, 0x30, + 0xc1, 0x07, 0xec, 0x49, 0xd1, 0x50, 0x13, 0x07, 0xec, 0x51, 0xce, 0x43, + 0x77, 0x07, 0xec, 0x58, 0xcc, 0x00, 0xfb, 0x07, 0xe2, 0x29, 0xcb, 0x10, + 0xb5, 0x07, 
0xe6, 0x70, 0xd0, 0x50, 0xf1, 0x07, 0xec, 0x61, 0xd1, 0x50, + 0x13, 0x07, 0xec, 0x69, 0xce, 0x43, 0x77, 0x07, 0xee, 0x01, 0xd1, 0x50, + 0xf0, 0x07, 0xec, 0x90, 0xcb, 0x64, 0x7b, 0x07, 0xdf, 0xf9, 0x0b, 0xc3, + 0x80, 0x17, 0xca, 0x26, 0xf7, 0x07, 0xdf, 0xe9, 0x45, 0x00, 0x8c, 0x43, + 0x80, 0x23, 0x45, 0x00, 0x8c, 0xc3, 0x80, 0x33, 0x0b, 0xc3, 0x80, 0x3d, + 0xca, 0x26, 0xf7, 0x07, 0xf6, 0x91, 0xcb, 0x64, 0x7b, 0x07, 0xf6, 0xa0, + 0x45, 0x00, 0x8c, 0xc3, 0x80, 0x49, 0x0b, 0xc3, 0x80, 0x55, 0xca, 0x26, + 0xf7, 0x07, 0xf6, 0x71, 0xcb, 0x64, 0x7b, 0x07, 0xf6, 0x80, 0x45, 0x00, + 0x8c, 0xc3, 0x80, 0x61, 0xcb, 0x64, 0x7b, 0x07, 0xdc, 0xa9, 0x0b, 0xc3, + 0x80, 0x71, 0xca, 0x26, 0xf7, 0x07, 0xdc, 0x98, 0xcb, 0x64, 0x7b, 0x07, + 0xdc, 0xc9, 0x0b, 0xc3, 0x80, 0x7d, 0xca, 0x26, 0xf7, 0x07, 0xdc, 0xb8, + 0x45, 0x00, 0x8c, 0xc3, 0x80, 0x89, 0x0b, 0xc3, 0x80, 0xa1, 0xca, 0x26, + 0xf7, 0x07, 0xf6, 0xf1, 0xcb, 0x64, 0x7b, 0x07, 0xf7, 0x00, 0x46, 0x02, + 0xd8, 0xc3, 0x80, 0xad, 0x0b, 0xc3, 0x80, 0xb9, 0xca, 0x26, 0xf7, 0x07, + 0xf4, 0xf1, 0xcb, 0x64, 0x7b, 0x07, 0xf5, 0x00, 0xca, 0x26, 0xf7, 0x07, + 0xdc, 0x59, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x50, 0xd6, 0x2c, 0xc8, 0x00, + 0x46, 0x20, 0x46, 0x02, 0xd8, 0xc3, 0x80, 0xc5, 0xcb, 0x64, 0x7b, 0x07, + 0xf6, 0x61, 0x0b, 0xc3, 0x80, 0xd1, 0xca, 0x26, 0xf7, 0x07, 0xf6, 0x50, + 0x19, 0xc3, 0x80, 0xdd, 0xc7, 0x06, 0x5f, 0x00, 0x32, 0x4b, 0x03, 0x80, + 0xec, 0xcd, 0x00, 0xfa, 0x07, 0xf4, 0x69, 0xca, 0x26, 0xf7, 0x07, 0xf4, + 0x70, 0x45, 0x00, 0x8c, 0xc3, 0x80, 0xf0, 0xcb, 0x64, 0x7b, 0x07, 0xdc, + 0x89, 0x0b, 0xc3, 0x81, 0x00, 0xca, 0x26, 0xf7, 0x07, 0xdc, 0x78, 0x00, + 0x43, 0x81, 0x0c, 0x00, 0x43, 0x81, 0x22, 0x00, 0x43, 0x81, 0x2e, 0x0b, + 0xc3, 0x81, 0x3a, 0xca, 0x26, 0xf7, 0x07, 0xf5, 0x31, 0xcb, 0x64, 0x7b, + 0x07, 0xf5, 0x40, 0x45, 0x00, 0x8c, 0xc3, 0x81, 0x46, 0xcb, 0x64, 0x7b, + 0x07, 0xdb, 0xe9, 0x0b, 0xc3, 0x81, 0x52, 0xca, 0x26, 0xf7, 0x07, 0xdb, + 0xd8, 0x00, 0x43, 0x81, 0x5e, 0xcc, 0x88, 0x05, 0x00, 0x46, 0x01, 0xcb, + 0x64, 0x7b, 0x07, 0xdb, 0x49, 0x0b, 0xc3, 0x81, 0x6e, 0xca, 0x26, 0xf7, + 0x07, 0xdb, 0x38, 0x00, 0x43, 0x81, 0x7a, 0x45, 0x00, 0x8c, 0xc3, 0x81, + 0x8a, 0x0f, 0xc3, 0x81, 0x9c, 0x0b, 0xc3, 0x81, 0xab, 0xca, 0x26, 0xf7, + 0x07, 0xf4, 0xb0, 0x00, 0x43, 0x81, 0xb7, 0x45, 0x00, 0x8c, 0xc3, 0x81, + 0xc7, 0x0b, 0xc3, 0x81, 0xd1, 0xca, 0x26, 0xf7, 0x07, 0xf6, 0x11, 0xcb, + 0x64, 0x7b, 0x07, 0xf6, 0x20, 0x00, 0x43, 0x81, 0xdd, 0x00, 0x43, 0x81, + 0xe9, 0x98, 0x00, 0x45, 0xf1, 0xca, 0xa6, 0xd4, 0x00, 0x45, 0xb8, 0xcb, + 0x10, 0xb5, 0x07, 0xda, 0xc1, 0xcc, 0x00, 0xfb, 0x07, 0xda, 0xb0, 0xcb, + 0x64, 0x7b, 0x07, 0xdb, 0x89, 0x0b, 0xc3, 0x81, 0xf9, 0xca, 0x26, 0xf7, + 0x07, 0xdb, 0x78, 0x45, 0x00, 0x8c, 0xc3, 0x82, 0x05, 0xc6, 0x17, 0xce, + 0x00, 0x36, 0x93, 0x03, 0x82, 0x18, 0x0b, 0xc3, 0x82, 0x1c, 0xca, 0x26, + 0xf7, 0x07, 0xf7, 0x91, 0xcb, 0x64, 0x7b, 0x07, 0xf7, 0xa0, 0xca, 0x26, + 0xf7, 0x07, 0xde, 0xe1, 0xcd, 0x00, 0xfa, 0x07, 0xde, 0xd8, 0x45, 0x00, + 0x8c, 0xc3, 0x82, 0x28, 0xcd, 0x00, 0xfa, 0x07, 0xf5, 0x69, 0xca, 0x26, + 0xf7, 0x07, 0xf5, 0x70, 0xcb, 0x64, 0x7b, 0x07, 0xdd, 0x19, 0x0b, 0xc3, + 0x82, 0x59, 0xca, 0x26, 0xf7, 0x07, 0xdd, 0x08, 0xca, 0x26, 0xf7, 0x07, + 0xdc, 0x69, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x60, 0x45, 0x00, 0x8c, 0xc3, + 0x82, 0x65, 0x0b, 0xc3, 0x82, 0x81, 0xca, 0x26, 0xf7, 0x07, 0xf4, 0x81, + 0xcb, 0x64, 0x7b, 0x07, 0xf4, 0x90, 0x00, 0x43, 0x82, 0x8d, 0xcb, 0x64, + 0x7b, 0x07, 0xda, 0xa9, 0x0b, 0xc3, 0x82, 0x9d, 0xca, 0x26, 0xf7, 0x07, + 0xda, 0x98, 0xcb, 0x64, 0x7b, 0x07, 0xdf, 0x49, 0xcc, 0x10, 0xb4, 0x07, + 0xdf, 0x40, 
0xce, 0x00, 0xf9, 0x07, 0xde, 0xe8, 0x44, 0x05, 0x18, 0xc3, + 0x82, 0xa9, 0xd0, 0x0e, 0x7c, 0x00, 0x35, 0x40, 0xcb, 0x10, 0xb5, 0x07, + 0xf6, 0xb9, 0xcc, 0x00, 0xfb, 0x07, 0xf6, 0xa8, 0xcb, 0x10, 0xb5, 0x07, + 0xdf, 0x31, 0xcc, 0x00, 0xfb, 0x07, 0xdf, 0x20, 0xd5, 0x35, 0x75, 0x00, + 0x45, 0x91, 0xcd, 0x00, 0xfa, 0x07, 0xf5, 0x79, 0xca, 0x26, 0xf7, 0x07, + 0xf5, 0x80, 0x0b, 0xc3, 0x82, 0xb8, 0xca, 0x26, 0xf7, 0x07, 0xf6, 0x31, + 0xcb, 0x64, 0x7b, 0x07, 0xf6, 0x40, 0x46, 0x02, 0xd8, 0xc3, 0x82, 0xc4, + 0x0b, 0xc3, 0x82, 0xd0, 0xca, 0x26, 0xf7, 0x07, 0xf5, 0xd1, 0xcb, 0x64, + 0x7b, 0x07, 0xf5, 0xe0, 0xce, 0x6d, 0xe8, 0x00, 0x37, 0xd1, 0x0b, 0xc3, + 0x82, 0xdc, 0xca, 0x26, 0xf7, 0x07, 0xf5, 0xb1, 0xcb, 0x64, 0x7b, 0x07, + 0xf5, 0xc0, 0x45, 0x00, 0x8c, 0xc3, 0x82, 0xe8, 0x0b, 0xc3, 0x83, 0x0a, + 0xca, 0x26, 0xf7, 0x07, 0xf5, 0x91, 0xcb, 0x64, 0x7b, 0x07, 0xf5, 0xa0, + 0x00, 0x43, 0x83, 0x16, 0x00, 0x43, 0x83, 0x28, 0x00, 0x43, 0x83, 0x34, + 0x00, 0x43, 0x83, 0x4a, 0x00, 0x43, 0x83, 0x56, 0xca, 0x26, 0xf7, 0x07, + 0xdc, 0x39, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x30, 0xcb, 0x64, 0x7b, 0x07, + 0xdb, 0xa9, 0x0b, 0xc3, 0x83, 0x62, 0xca, 0x26, 0xf7, 0x07, 0xdb, 0x98, + 0xcb, 0x64, 0x7b, 0x07, 0xdb, 0x69, 0x0b, 0xc3, 0x83, 0x6e, 0xca, 0x26, + 0xf7, 0x07, 0xdb, 0x58, 0x44, 0x05, 0x18, 0xc3, 0x83, 0x7a, 0xce, 0x1e, + 0x29, 0x00, 0x36, 0x51, 0xc4, 0x00, 0x9d, 0x00, 0x36, 0x21, 0xcb, 0x08, + 0x09, 0x00, 0x31, 0x23, 0x03, 0x83, 0x86, 0x5d, 0x10, 0x12, 0x43, 0x83, + 0x8a, 0x45, 0x00, 0x8c, 0xc3, 0x83, 0x96, 0x0b, 0xc3, 0x83, 0xa2, 0xca, + 0x26, 0xf7, 0x07, 0xf7, 0x11, 0xcb, 0x64, 0x7b, 0x07, 0xf7, 0x20, 0xcb, + 0x64, 0x7b, 0x07, 0xde, 0xb1, 0x0b, 0xc3, 0x83, 0xae, 0xca, 0x26, 0xf7, + 0x07, 0xde, 0xa0, 0x00, 0x43, 0x83, 0xba, 0x45, 0x00, 0x8c, 0xc3, 0x83, + 0xca, 0xc6, 0x3a, 0x06, 0x00, 0x35, 0xd3, 0x03, 0x83, 0xe6, 0x0b, 0xc3, + 0x83, 0xea, 0xca, 0x26, 0xf7, 0x07, 0xf7, 0x31, 0xcb, 0x64, 0x7b, 0x07, + 0xf7, 0x40, 0xcb, 0x64, 0x7b, 0x07, 0xdb, 0xc9, 0x0b, 0xc3, 0x83, 0xf6, + 0xca, 0x26, 0xf7, 0x07, 0xdb, 0xb8, 0x00, 0x43, 0x84, 0x02, 0xce, 0x00, + 0xf9, 0x07, 0xf4, 0x00, 0xcb, 0x98, 0x6e, 0x00, 0x35, 0xf3, 0x03, 0x84, + 0x18, 0xc4, 0xe0, 0x63, 0x00, 0x36, 0x0b, 0x03, 0x84, 0x1c, 0x45, 0x00, + 0x8c, 0xc3, 0x84, 0x20, 0x0b, 0xc3, 0x84, 0x2f, 0xca, 0x26, 0xf7, 0x07, + 0xf7, 0x51, 0xcb, 0x64, 0x7b, 0x07, 0xf7, 0x60, 0xc3, 0x2b, 0xb9, 0x00, + 0x33, 0xc1, 0xc4, 0x06, 0x5a, 0x00, 0x33, 0xa9, 0xc3, 0x7e, 0x89, 0x00, + 0x33, 0xb0, 0xc2, 0x16, 0x1c, 0x0f, 0x75, 0xa9, 0xc2, 0x02, 0x98, 0x0f, + 0x75, 0x41, 0x0a, 0x43, 0x84, 0x3b, 0xc4, 0xdf, 0x93, 0x0f, 0x75, 0xa1, + 0xc2, 0x01, 0x9d, 0x0f, 0x75, 0x89, 0xc3, 0x03, 0x26, 0x0f, 0x75, 0x70, + 0xc2, 0x00, 0x74, 0x0f, 0x75, 0x31, 0x8a, 0x0f, 0x75, 0xd0, 0x8e, 0x0f, + 0x75, 0x19, 0x86, 0x0f, 0x75, 0xc8, 0xc3, 0x03, 0x26, 0x0f, 0x72, 0x71, + 0xc2, 0x01, 0x9d, 0x0f, 0x72, 0x89, 0xc4, 0xdf, 0x93, 0x0f, 0x72, 0xa0, + 0xc2, 0x01, 0x9d, 0x0f, 0x72, 0xc9, 0x47, 0x3b, 0xc4, 0x43, 0x84, 0x47, + 0xc2, 0x16, 0x1c, 0x0f, 0x74, 0xb1, 0xc2, 0x00, 0x65, 0x0f, 0x74, 0xc0, + 0xc3, 0x85, 0xf5, 0x0f, 0x73, 0xe1, 0xc3, 0xb1, 0x0d, 0x0f, 0x73, 0xf0, + 0xc3, 0x33, 0x5f, 0x00, 0x46, 0xe9, 0x8a, 0x00, 0x46, 0x60, 0xc6, 0xcb, + 0x9f, 0x00, 0x46, 0xe1, 0xc7, 0xc1, 0xc4, 0x00, 0x46, 0xd9, 0xcb, 0x92, + 0x49, 0x00, 0x46, 0xd1, 0xc5, 0xd6, 0x55, 0x00, 0x46, 0xa1, 0xc5, 0xde, + 0x57, 0x00, 0x44, 0xc0, 0xc5, 0xdb, 0xaf, 0x00, 0x44, 0xd1, 0xc6, 0xcb, + 0x6f, 0x00, 0x44, 0xc8, 0x4b, 0x13, 0xdd, 0xc3, 0x84, 0x53, 0xcc, 0x04, + 0xdb, 0x0f, 0xdd, 0x18, 0xdc, 0x13, 0xdd, 0x0f, 0xdd, 0x3b, 0x03, 0x84, + 0x59, 0xcc, 
0x04, 0xdb, 0x0f, 0xdd, 0x12, 0x03, 0x84, 0x5f, 0xc4, 0x00, + 0x49, 0x0f, 0xdd, 0x03, 0x03, 0x84, 0x65, 0xc5, 0x00, 0x2c, 0x0f, 0xdd, + 0x0a, 0x03, 0x84, 0x69, 0xca, 0x01, 0x68, 0x01, 0x29, 0x61, 0xc4, 0x00, + 0x49, 0x01, 0x28, 0x81, 0xc5, 0x00, 0x2c, 0x01, 0x28, 0x60, 0x16, 0xc3, + 0x84, 0x6d, 0xd2, 0x4a, 0x2d, 0x0f, 0xd0, 0x39, 0xce, 0x2a, 0xfe, 0x0f, + 0xd0, 0x99, 0xdf, 0x0d, 0x00, 0x0f, 0xd0, 0xe0, 0xc5, 0xa8, 0xf7, 0x0f, + 0xd2, 0x89, 0xc4, 0xde, 0x83, 0x0f, 0xd2, 0x91, 0xc6, 0xca, 0xfd, 0x0f, + 0xd2, 0x98, 0xce, 0x2a, 0xfe, 0x0f, 0xd0, 0x79, 0xdb, 0x18, 0x03, 0x0f, + 0xd1, 0xc8, 0x44, 0x1d, 0xba, 0xc3, 0x84, 0x79, 0xc5, 0xc0, 0x74, 0x0f, + 0xaf, 0x98, 0x17, 0xc3, 0x84, 0x85, 0x96, 0x0b, 0x4d, 0xd0, 0x9a, 0x0b, + 0x4f, 0x31, 0xc2, 0x10, 0x11, 0x0b, 0x4c, 0xd0, 0x83, 0x0b, 0x4b, 0x9b, + 0x03, 0x84, 0x93, 0x17, 0xc3, 0x84, 0x99, 0x42, 0x2c, 0x43, 0x43, 0x84, + 0xa1, 0x96, 0x0b, 0x4f, 0x88, 0x17, 0xc3, 0x84, 0xab, 0x07, 0x43, 0x84, + 0xbb, 0x93, 0x0b, 0x4c, 0x01, 0x92, 0x0b, 0x4b, 0xe8, 0x42, 0x01, 0x31, + 0xc3, 0x84, 0xca, 0x92, 0x0b, 0x4b, 0x30, 0xc2, 0x5c, 0x9b, 0x0b, 0x4d, + 0x81, 0x93, 0x0b, 0x4c, 0x70, 0xc2, 0x00, 0x11, 0x0b, 0x4b, 0x79, 0x87, + 0x0b, 0x4c, 0x08, 0x87, 0x0b, 0x4e, 0xa3, 0x03, 0x84, 0xd6, 0xc2, 0xd0, + 0x00, 0x0b, 0x4c, 0x18, 0x93, 0x0b, 0x4d, 0x08, 0x90, 0x0b, 0x4b, 0x38, + 0xc3, 0x8b, 0xaa, 0x0b, 0x4c, 0xe0, 0xc2, 0x10, 0x11, 0x0b, 0x4c, 0xc8, + 0x87, 0x0b, 0x4b, 0x89, 0x93, 0x0b, 0x4e, 0x50, 0x8f, 0x0b, 0x4b, 0xc0, + 0xc5, 0xdb, 0x32, 0x0b, 0x4e, 0xd1, 0xc5, 0xd9, 0xa2, 0x0b, 0x4e, 0x88, + 0x96, 0x0b, 0x4e, 0x69, 0xc2, 0x00, 0xe2, 0x0b, 0x4d, 0x88, 0x9a, 0x0b, + 0x4f, 0x39, 0x96, 0x0b, 0x4d, 0xe8, 0x93, 0x0b, 0x4f, 0xa0, 0x90, 0x0b, + 0x4b, 0x59, 0x96, 0x0b, 0x4c, 0x60, 0x8f, 0x0b, 0x4b, 0xf0, 0xc6, 0xcb, + 0x7b, 0x0b, 0x4f, 0xa9, 0xc4, 0x05, 0x2e, 0x0b, 0x4e, 0x91, 0x8b, 0x0b, + 0x4e, 0x40, 0x96, 0x0b, 0x4e, 0x20, 0x96, 0x0b, 0x4e, 0x78, 0xc3, 0xc5, + 0xd2, 0x0b, 0x4a, 0x29, 0x03, 0xc3, 0x84, 0xdc, 0xc3, 0xd7, 0xe2, 0x0b, + 0x49, 0xd9, 0xc4, 0xc2, 0x61, 0x0b, 0x49, 0x98, 0xc3, 0x8f, 0x8a, 0x0b, + 0x49, 0xe1, 0xc3, 0x17, 0x29, 0x0b, 0x48, 0x99, 0x42, 0x2c, 0x43, 0xc3, + 0x84, 0xe9, 0xc2, 0x00, 0xb6, 0x0b, 0x47, 0xf1, 0xc2, 0x05, 0x1d, 0x0b, + 0x47, 0xe0, 0xc2, 0x00, 0xa4, 0x0b, 0x4a, 0x31, 0xc2, 0x02, 0xe0, 0x0b, + 0x47, 0xc0, 0x96, 0x0b, 0x49, 0x59, 0x92, 0x0b, 0x48, 0xf8, 0xc2, 0x01, + 0xdf, 0x0b, 0x49, 0xc1, 0x87, 0x0b, 0x4a, 0xc8, 0x87, 0x0b, 0x48, 0xa9, + 0xc2, 0xd0, 0x00, 0x0b, 0x48, 0x48, 0xc3, 0x7c, 0x57, 0x0b, 0x48, 0x71, + 0x96, 0x0b, 0x47, 0xb8, 0xc2, 0x02, 0xe0, 0x0b, 0x47, 0xa8, 0x8f, 0x0b, + 0x4a, 0x21, 0xc3, 0x48, 0xc4, 0x0b, 0x48, 0xb8, 0x90, 0x0b, 0x49, 0xf1, + 0x96, 0x0b, 0x48, 0x58, 0xc6, 0x18, 0x0e, 0x0b, 0x4b, 0x18, 0xc2, 0x10, + 0x11, 0x0b, 0x49, 0x51, 0x96, 0x0b, 0x48, 0x40, 0x90, 0x0b, 0x47, 0xa0, + 0x90, 0x0b, 0x4a, 0x09, 0xc3, 0xb5, 0x1b, 0x0b, 0x49, 0x19, 0x96, 0x0b, + 0x48, 0x00, 0x92, 0x0b, 0x49, 0x61, 0x8f, 0x0b, 0x49, 0x31, 0xc8, 0xb7, + 0xba, 0x0b, 0x48, 0x79, 0xc7, 0xc3, 0x37, 0x0b, 0x47, 0xf8, 0x17, 0xc3, + 0x84, 0xf5, 0x87, 0x0b, 0x47, 0xe8, 0x92, 0x0b, 0x49, 0xb1, 0x8f, 0x0b, + 0x49, 0xa0, 0xc3, 0xc9, 0xd8, 0x0b, 0x47, 0x49, 0xc7, 0xc7, 0x66, 0x0b, + 0x47, 0x50, 0x8f, 0x0b, 0x47, 0x11, 0x15, 0xc3, 0x84, 0xff, 0xc3, 0xe6, + 0x08, 0x0b, 0x45, 0x08, 0x97, 0x0b, 0x46, 0x53, 0x03, 0x85, 0x0b, 0xc2, + 0x00, 0xc4, 0x0b, 0x44, 0x98, 0xc2, 0x5c, 0x9b, 0x0b, 0x44, 0xa9, 0xc9, + 0xb1, 0xdc, 0x0b, 0x44, 0x78, 0xc2, 0xd0, 0x00, 0x0b, 0x47, 0x29, 0xc3, + 0xd0, 0xd7, 0x0b, 0x46, 0x40, 0x8f, 0x0b, 0x46, 0x79, 0xc2, 0x00, 0x4f, + 0x0b, 0x46, 
0x20, 0x92, 0x0b, 0x46, 0xd1, 0x8f, 0x0b, 0x46, 0xb8, 0x96, + 0x0b, 0x45, 0xe9, 0xc5, 0xdb, 0x14, 0x0b, 0x44, 0xa0, 0x90, 0x0b, 0x46, + 0xb1, 0xc7, 0xc7, 0x43, 0x0b, 0x46, 0x38, 0x90, 0x0b, 0x46, 0xa1, 0xc5, + 0xda, 0x6f, 0x0b, 0x45, 0xc8, 0x42, 0x01, 0x31, 0xc3, 0x85, 0x21, 0xc3, + 0x16, 0x59, 0x0b, 0x46, 0xf8, 0x17, 0xc3, 0x85, 0x2d, 0xc3, 0x82, 0x78, + 0x0b, 0x46, 0x11, 0xc5, 0xd8, 0xee, 0x0b, 0x44, 0xb8, 0xc5, 0xd6, 0x87, + 0x0b, 0x45, 0xb9, 0x96, 0x0b, 0x45, 0x30, 0xc3, 0x7c, 0x57, 0x0b, 0x46, + 0x61, 0x87, 0x0b, 0x45, 0x20, 0xc3, 0x8e, 0x97, 0x0b, 0x46, 0xf1, 0xc2, + 0x00, 0xba, 0x0b, 0x46, 0x58, 0xc5, 0xda, 0x10, 0x0b, 0x46, 0xc1, 0xc7, + 0xc6, 0x71, 0x0b, 0x45, 0x98, 0xc6, 0xd1, 0x1b, 0x0b, 0x43, 0xa9, 0xc3, + 0x76, 0x32, 0x0b, 0x44, 0x51, 0xc3, 0x8f, 0x91, 0x0b, 0x43, 0xd2, 0x03, + 0x85, 0x35, 0xc3, 0xe5, 0x93, 0x0b, 0x44, 0x41, 0xc6, 0xce, 0xbd, 0x0b, + 0x44, 0x38, 0xc4, 0x9c, 0x80, 0x0b, 0x42, 0xf9, 0xc7, 0xca, 0x4c, 0x0b, + 0x42, 0xe0, 0xc3, 0x82, 0x78, 0x0b, 0x41, 0xf1, 0xca, 0xa5, 0x80, 0x0b, + 0x40, 0x40, 0x8f, 0x0b, 0x41, 0xb9, 0xc7, 0xc1, 0x38, 0x0b, 0x40, 0x28, + 0x8f, 0x0b, 0x42, 0x73, 0x03, 0x85, 0x3b, 0xc2, 0x00, 0xba, 0x0b, 0x42, + 0x31, 0xc3, 0x16, 0x59, 0x0b, 0x41, 0x91, 0xc4, 0x2c, 0x42, 0x0b, 0x40, + 0xd0, 0xc3, 0x4e, 0x64, 0x0b, 0x41, 0xb1, 0xc3, 0xe5, 0x5d, 0x0b, 0x41, + 0x30, 0xcc, 0x8b, 0xa1, 0x0b, 0x42, 0x08, 0xc5, 0xd9, 0xe3, 0x0b, 0x40, + 0xb1, 0xc5, 0xb7, 0xb5, 0x0b, 0x40, 0x00, 0x00, 0x43, 0x85, 0x4d, 0x8f, + 0x0b, 0x42, 0x61, 0xc3, 0x82, 0x78, 0x0b, 0x42, 0x10, 0xc2, 0x01, 0x5d, + 0x0b, 0x40, 0x51, 0xc5, 0xa9, 0x67, 0x0b, 0x40, 0x48, 0xc2, 0x01, 0x5d, + 0x0b, 0x40, 0x19, 0xc5, 0xa9, 0x67, 0x0b, 0x40, 0x10, 0xa2, 0x01, 0x40, + 0xfb, 0x03, 0x85, 0x59, 0xa3, 0x01, 0x41, 0x7b, 0x03, 0x85, 0x6b, 0xa5, + 0x01, 0x44, 0x79, 0xa4, 0x01, 0x42, 0x7a, 0x03, 0x85, 0x76, 0xa3, 0x01, + 0x41, 0xbb, 0x03, 0x85, 0x7a, 0xa5, 0x01, 0x44, 0xb9, 0xa4, 0x01, 0x42, + 0xba, 0x03, 0x85, 0x85, 0xa5, 0x01, 0x45, 0x39, 0xa4, 0x01, 0x43, 0x3a, + 0x03, 0x85, 0x89, 0xa5, 0x01, 0x46, 0x38, 0xa3, 0x01, 0x41, 0xdb, 0x03, + 0x85, 0x8d, 0xa5, 0x01, 0x44, 0xd9, 0xa4, 0x01, 0x42, 0xda, 0x03, 0x85, + 0x98, 0xa5, 0x01, 0x45, 0x59, 0xa4, 0x01, 0x43, 0x5a, 0x03, 0x85, 0x9c, + 0xa5, 0x01, 0x46, 0x58, 0xa5, 0x01, 0x45, 0x99, 0xa4, 0x01, 0x43, 0x9a, + 0x03, 0x85, 0xa0, 0xa5, 0x01, 0x46, 0x98, 0xa5, 0x01, 0x47, 0x18, 0xa3, + 0x01, 0x41, 0xeb, 0x03, 0x85, 0xa4, 0xa5, 0x01, 0x44, 0xe9, 0xa4, 0x01, + 0x42, 0xea, 0x03, 0x85, 0xaf, 0xa5, 0x01, 0x45, 0x69, 0xa4, 0x01, 0x43, + 0x6a, 0x03, 0x85, 0xb3, 0xa5, 0x01, 0x46, 0x68, 0xa5, 0x01, 0x45, 0xa9, + 0xa4, 0x01, 0x43, 0xaa, 0x03, 0x85, 0xb7, 0xa5, 0x01, 0x46, 0xa8, 0xa5, + 0x01, 0x47, 0x28, 0xa5, 0x01, 0x45, 0xc9, 0xa4, 0x01, 0x43, 0xca, 0x03, + 0x85, 0xbb, 0xa5, 0x01, 0x46, 0xc8, 0xa5, 0x01, 0x47, 0x48, 0xa5, 0x01, + 0x47, 0x88, 0xa3, 0x01, 0x41, 0xf3, 0x03, 0x85, 0xbf, 0xa5, 0x01, 0x44, + 0xf1, 0xa4, 0x01, 0x42, 0xf2, 0x03, 0x85, 0xca, 0xa5, 0x01, 0x45, 0x71, + 0xa4, 0x01, 0x43, 0x72, 0x03, 0x85, 0xce, 0xa5, 0x01, 0x46, 0x70, 0xa5, + 0x01, 0x45, 0xb1, 0xa4, 0x01, 0x43, 0xb2, 0x03, 0x85, 0xd2, 0xa5, 0x01, + 0x46, 0xb0, 0xa5, 0x01, 0x47, 0x30, 0xa5, 0x01, 0x45, 0xd1, 0xa4, 0x01, + 0x43, 0xd2, 0x03, 0x85, 0xd6, 0xa5, 0x01, 0x46, 0xd0, 0xa5, 0x01, 0x47, + 0x50, 0xa5, 0x01, 0x47, 0x90, 0xa5, 0x01, 0x45, 0xe1, 0xa4, 0x01, 0x43, + 0xe2, 0x03, 0x85, 0xda, 0xa5, 0x01, 0x46, 0xe0, 0xa5, 0x01, 0x47, 0x60, + 0xa5, 0x01, 0x47, 0xa0, 0xa5, 0x01, 0x47, 0xc0, 0xc6, 0x04, 0xe1, 0x0f, + 0xda, 0x01, 0xcc, 0x04, 0xcb, 0x0f, 0xda, 0x78, 0xcc, 0x04, 0xcb, 0x0f, + 0xda, 0x71, 
0xc5, 0x00, 0x2c, 0x0f, 0xda, 0x80, 0x45, 0x00, 0x8c, 0xc3, + 0x85, 0xde, 0xc6, 0x10, 0x9d, 0x01, 0x5b, 0x81, 0x45, 0x03, 0x55, 0x43, + 0x86, 0x08, 0xc3, 0x14, 0xa7, 0x01, 0x59, 0xdb, 0x03, 0x86, 0x0e, 0xd2, + 0x05, 0xd5, 0x01, 0x5f, 0x60, 0xcf, 0x2c, 0x35, 0x01, 0x59, 0xc9, 0xd6, + 0x2d, 0x62, 0x01, 0x59, 0xd0, 0xcf, 0x62, 0x3d, 0x01, 0x4b, 0x59, 0x47, + 0x92, 0xe3, 0xc3, 0x86, 0x14, 0xc8, 0xae, 0xbc, 0x01, 0x4a, 0xf1, 0xc6, + 0x10, 0x9d, 0x01, 0x4a, 0xb0, 0x46, 0x00, 0xd4, 0xc3, 0x86, 0x1a, 0xc8, + 0xae, 0xbc, 0x01, 0x4a, 0xd1, 0xc6, 0x10, 0x9d, 0x01, 0x4a, 0x90, 0xc4, + 0xe1, 0xbf, 0x08, 0x3a, 0x61, 0xc4, 0xe2, 0xc7, 0x08, 0x3a, 0x59, 0xc4, + 0xe0, 0x7b, 0x08, 0x3a, 0x51, 0xc4, 0xe1, 0x2b, 0x08, 0x3a, 0x48, 0x88, + 0x08, 0x30, 0x81, 0x8f, 0x08, 0x30, 0x88, 0x88, 0x08, 0x30, 0x99, 0x8f, + 0x08, 0x30, 0xa0, 0x8f, 0x08, 0x30, 0xb0, 0xc5, 0xdc, 0x27, 0x08, 0x04, + 0x01, 0xc7, 0xc5, 0xa6, 0x08, 0x04, 0x09, 0xc6, 0xcf, 0x65, 0x08, 0x04, + 0x11, 0x23, 0xc3, 0x86, 0x24, 0x24, 0xc3, 0x86, 0x30, 0x25, 0xc3, 0x86, + 0x3c, 0x26, 0xc3, 0x86, 0x48, 0x22, 0x43, 0x86, 0x54, 0xc7, 0xc6, 0x1d, + 0x08, 0x04, 0x71, 0xc8, 0xb7, 0x12, 0x08, 0x04, 0x79, 0xc7, 0xc9, 0xc0, + 0x08, 0x04, 0x81, 0xc7, 0xc1, 0x23, 0x08, 0x04, 0x89, 0xc9, 0xa9, 0x1b, + 0x08, 0x04, 0x90, 0xc5, 0xdd, 0x03, 0x08, 0x04, 0xa9, 0xc6, 0xd1, 0xb1, + 0x08, 0x04, 0xb1, 0x9f, 0x08, 0x04, 0xb8, 0xc8, 0xba, 0xea, 0x08, 0x04, + 0xd1, 0xc6, 0xd2, 0x17, 0x08, 0x04, 0xd9, 0x9f, 0x08, 0x04, 0xe1, 0xc6, + 0xd2, 0x6b, 0x08, 0x04, 0xe9, 0xa3, 0x08, 0x04, 0xf0, 0x9d, 0x08, 0x04, + 0xf9, 0xc6, 0xd3, 0x01, 0x08, 0x05, 0x01, 0x9f, 0x08, 0x05, 0x09, 0xa0, + 0x08, 0x05, 0x11, 0xa1, 0x08, 0x05, 0x19, 0xa4, 0x08, 0x05, 0x29, 0xa5, + 0x08, 0x05, 0x31, 0xc7, 0xc5, 0x8a, 0x08, 0x05, 0x38, 0x9d, 0x08, 0x05, + 0x41, 0x9e, 0x08, 0x05, 0x49, 0xc9, 0xaf, 0x5d, 0x08, 0x05, 0x51, 0xc8, + 0xbe, 0x1a, 0x08, 0x05, 0x59, 0xa1, 0x08, 0x05, 0x61, 0xa2, 0x08, 0x05, + 0x69, 0xa3, 0x08, 0x05, 0x71, 0xa4, 0x08, 0x05, 0x79, 0xa5, 0x08, 0x05, + 0x81, 0xa6, 0x08, 0x05, 0x88, 0x9d, 0x08, 0x05, 0x91, 0x9f, 0x08, 0x05, + 0xa1, 0xc7, 0xc8, 0xa8, 0x08, 0x05, 0xa9, 0xa1, 0x08, 0x05, 0xb1, 0xa4, + 0x08, 0x05, 0xc1, 0xa5, 0x08, 0x05, 0xc9, 0xa6, 0x08, 0x05, 0xd1, 0x9e, + 0x08, 0x05, 0x99, 0xc6, 0xd0, 0xd3, 0x08, 0x05, 0xb8, 0x9d, 0x08, 0x05, + 0xd9, 0x9e, 0x08, 0x05, 0xe1, 0x9f, 0x08, 0x05, 0xe9, 0xa0, 0x08, 0x05, + 0xf1, 0xa1, 0x08, 0x05, 0xf9, 0xa2, 0x08, 0x06, 0x01, 0xa6, 0x08, 0x06, + 0x08, 0x9d, 0x08, 0x06, 0x11, 0xc8, 0xb7, 0xea, 0x08, 0x06, 0x18, 0xcb, + 0x8d, 0x00, 0x08, 0x06, 0x21, 0xc9, 0xaa, 0x32, 0x08, 0x06, 0x28, 0xc7, + 0xc6, 0x40, 0x08, 0x06, 0x31, 0xc7, 0xc7, 0x9e, 0x08, 0x06, 0x39, 0x9f, + 0x08, 0x06, 0x41, 0xc7, 0xc1, 0x2a, 0x08, 0x06, 0x49, 0xa1, 0x08, 0x06, + 0x51, 0xa3, 0x08, 0x06, 0x58, 0xc9, 0xad, 0x2f, 0x08, 0x06, 0x69, 0xcf, + 0x6b, 0x61, 0x08, 0x06, 0x71, 0xc7, 0xc2, 0x26, 0x08, 0x06, 0x79, 0xa2, + 0x08, 0x06, 0x81, 0xa3, 0x08, 0x06, 0x89, 0xa5, 0x08, 0x06, 0x99, 0xa6, + 0x08, 0x06, 0xa1, 0xd1, 0x52, 0x99, 0x08, 0x06, 0x60, 0x9e, 0x08, 0x06, + 0xa9, 0x9f, 0x08, 0x06, 0xb1, 0xa0, 0x08, 0x06, 0xb9, 0xc6, 0xcf, 0x5f, + 0x08, 0x06, 0xc1, 0xa2, 0x08, 0x06, 0xc9, 0xa3, 0x08, 0x06, 0xd1, 0xa4, + 0x08, 0x06, 0xd9, 0xa5, 0x08, 0x06, 0xe1, 0xa6, 0x08, 0x06, 0xe8, 0x9d, + 0x08, 0x06, 0xf9, 0x9e, 0x08, 0x07, 0x01, 0x9f, 0x08, 0x07, 0x09, 0xa0, + 0x08, 0x07, 0x11, 0xa1, 0x08, 0x07, 0x19, 0xa2, 0x08, 0x07, 0x21, 0xa4, + 0x08, 0x07, 0x31, 0xa5, 0x08, 0x07, 0x39, 0xa6, 0x08, 0x07, 0x41, 0xa3, + 0x08, 0x07, 0x28, 0x9d, 0x08, 0x07, 0x49, 0x9e, 0x08, 0x07, 0x51, 0x9f, + 0x08, 0x07, 
0x59, 0xa0, 0x08, 0x07, 0x61, 0xa1, 0x08, 0x07, 0x69, 0xa2, + 0x08, 0x07, 0x71, 0xa4, 0x08, 0x07, 0x81, 0xa3, 0x08, 0x07, 0x79, 0xa5, + 0x08, 0x07, 0x89, 0xa6, 0x08, 0x07, 0x90, 0x9e, 0x08, 0x07, 0x99, 0x9f, + 0x08, 0x07, 0xa1, 0xa3, 0x08, 0x07, 0xa9, 0xa4, 0x08, 0x07, 0xb1, 0xa5, + 0x08, 0x07, 0xb9, 0xa6, 0x08, 0x07, 0xc0, 0xc3, 0x00, 0x33, 0x0e, 0xf8, + 0xf1, 0xc4, 0x65, 0xe2, 0x00, 0x0b, 0x0b, 0x03, 0x86, 0x66, 0xc9, 0x08, + 0xf7, 0x00, 0x0a, 0xe9, 0xca, 0xa7, 0x1a, 0x00, 0x10, 0xc9, 0xc6, 0xbd, + 0xf4, 0x00, 0x0a, 0xf8, 0xc5, 0x05, 0x02, 0x00, 0xf3, 0x1b, 0x03, 0x86, + 0x6c, 0xc5, 0x00, 0xd4, 0x00, 0xf3, 0x08, 0xce, 0x16, 0x0f, 0x00, 0xf3, + 0x28, 0xd3, 0x42, 0x2f, 0x05, 0x3e, 0x51, 0xc9, 0xb4, 0xeb, 0x00, 0x11, + 0xf8, 0x46, 0x00, 0x8b, 0x43, 0x86, 0x72, 0x94, 0x05, 0x5a, 0x5b, 0x03, + 0x86, 0x7e, 0x89, 0x00, 0x13, 0x0a, 0x03, 0x86, 0x84, 0xc8, 0xb7, 0xc2, + 0x00, 0xe8, 0xf9, 0xcd, 0x7c, 0x26, 0x00, 0xe8, 0xf1, 0x97, 0x00, 0xe8, + 0xe9, 0x91, 0x00, 0xe8, 0x8a, 0x03, 0x86, 0x8a, 0xc6, 0xbd, 0xf4, 0x00, + 0x07, 0x3b, 0x03, 0x86, 0x96, 0xc9, 0x08, 0xf7, 0x00, 0x08, 0x49, 0xc4, + 0x65, 0xe2, 0x00, 0x08, 0x69, 0xc3, 0x00, 0x33, 0x00, 0x12, 0xa8, 0xca, + 0xa3, 0xaa, 0x05, 0x5a, 0xa9, 0xca, 0x4c, 0x63, 0x05, 0x5a, 0xa0, 0xc4, + 0x6d, 0xb5, 0x00, 0x13, 0xb9, 0xc5, 0x21, 0xa4, 0x00, 0x14, 0xd0, 0xce, + 0x01, 0x19, 0x0e, 0xf8, 0xe1, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0xb8, 0x94, + 0x00, 0x13, 0xcb, 0x03, 0x86, 0x9c, 0x96, 0x00, 0x14, 0x3b, 0x03, 0x86, + 0xa2, 0x9b, 0x00, 0x14, 0x73, 0x03, 0x86, 0xa8, 0x89, 0x00, 0xeb, 0xb9, + 0x11, 0xc3, 0x86, 0xae, 0x8b, 0x00, 0xe8, 0x4b, 0x03, 0x86, 0xc4, 0x83, + 0x00, 0x12, 0x83, 0x03, 0x86, 0xca, 0xc2, 0x03, 0xd4, 0x05, 0x5a, 0x89, + 0x8a, 0x00, 0x13, 0x2b, 0x03, 0x86, 0xd4, 0x8f, 0x00, 0x13, 0x7b, 0x03, + 0x86, 0xdd, 0x98, 0x00, 0x14, 0x61, 0x99, 0x00, 0x14, 0x69, 0x8d, 0x00, + 0x14, 0xf1, 0x8e, 0x05, 0x3c, 0x09, 0xc5, 0xdb, 0x4b, 0x00, 0x0c, 0x69, + 0x87, 0x00, 0x0e, 0xe8, 0xd3, 0x42, 0xed, 0x0e, 0xf8, 0x48, 0x42, 0x01, + 0x94, 0xc3, 0x86, 0xe3, 0x43, 0x05, 0x19, 0x43, 0x86, 0xef, 0xcf, 0x68, + 0x82, 0x00, 0xf3, 0x89, 0xc6, 0xbd, 0xf4, 0x00, 0x0b, 0x19, 0xc4, 0x65, + 0xe2, 0x00, 0x0b, 0x29, 0xca, 0xa7, 0x1a, 0x00, 0x10, 0xd9, 0xc3, 0x00, + 0x33, 0x00, 0x11, 0xb0, 0xcc, 0x23, 0x3f, 0x05, 0x3b, 0x2a, 0x03, 0x86, + 0xfb, 0xc3, 0x22, 0xcb, 0x00, 0x0c, 0x29, 0xc3, 0x02, 0x9f, 0x00, 0x0d, + 0x41, 0xc4, 0x0d, 0x13, 0x00, 0x0d, 0xe8, 0xc2, 0x00, 0xc0, 0x00, 0x0d, + 0x0b, 0x03, 0x87, 0x01, 0xc8, 0x9e, 0x5c, 0x00, 0xf6, 0x78, 0xc9, 0x08, + 0xf7, 0x00, 0x07, 0xa3, 0x03, 0x87, 0x07, 0xc4, 0x65, 0xe2, 0x00, 0x0e, + 0x90, 0x11, 0xc3, 0x87, 0x0d, 0xc8, 0x20, 0xa9, 0x00, 0x07, 0xb2, 0x03, + 0x87, 0x19, 0x45, 0x02, 0x9a, 0x43, 0x87, 0x1f, 0x45, 0x02, 0x9a, 0x43, + 0x87, 0x2b, 0xca, 0x9b, 0xda, 0x00, 0x0f, 0xf8, 0xd1, 0x53, 0x76, 0x05, + 0x3a, 0x59, 0xc2, 0x00, 0x11, 0x05, 0x3a, 0x69, 0xcd, 0x7d, 0xfa, 0x01, + 0x63, 0xd0, 0xcb, 0x98, 0x58, 0x00, 0x0f, 0x68, 0x46, 0x00, 0x8b, 0xc3, + 0x87, 0x3d, 0x87, 0x05, 0x5b, 0x10, 0xd4, 0x01, 0x13, 0x00, 0xec, 0x98, + 0xd3, 0x42, 0xed, 0x0e, 0xf8, 0xd0, 0x11, 0xc3, 0x87, 0x49, 0xc8, 0x20, + 0xa9, 0x00, 0x07, 0x7a, 0x03, 0x87, 0x55, 0xc6, 0x05, 0x01, 0x00, 0xf1, + 0x68, 0xc9, 0x08, 0xf7, 0x00, 0x07, 0x71, 0xc4, 0x65, 0xe2, 0x00, 0x0e, + 0x80, 0xcc, 0x23, 0x3f, 0x00, 0xeb, 0xe0, 0x89, 0x00, 0xeb, 0xc9, 0x88, + 0x05, 0x3b, 0xe1, 0x94, 0x05, 0x3c, 0x19, 0x95, 0x05, 0x3c, 0x29, 0x96, + 0x05, 0x3c, 0x39, 0x86, 0x05, 0x3b, 0xd0, 0xc5, 0xde, 0x3e, 0x05, 0x5b, + 0x21, 0xc2, 0x49, 0x0c, 0x05, 0x5a, 0x00, 0x46, 0x00, 0x8b, 0x43, 0x87, + 0x5b, 0xcf, 
0x68, 0x82, 0x00, 0xf0, 0x99, 0xc6, 0xbd, 0xf4, 0x00, 0xf0, + 0x89, 0xc4, 0x65, 0xe2, 0x00, 0x08, 0x79, 0xc3, 0x00, 0x33, 0x00, 0x11, + 0x08, 0xc5, 0x05, 0x02, 0x00, 0xf0, 0x69, 0xc5, 0x00, 0xd4, 0x00, 0xf0, + 0x58, 0xc9, 0xa9, 0x2d, 0x00, 0xec, 0x88, 0xd3, 0x42, 0x2f, 0x05, 0x3e, + 0x41, 0xc5, 0x01, 0x74, 0x00, 0x08, 0x88, 0xc5, 0xcf, 0xcc, 0x00, 0x0c, + 0x61, 0xc3, 0x14, 0xa7, 0x00, 0x12, 0xb0, 0xc7, 0x45, 0x16, 0x00, 0x15, + 0x1b, 0x03, 0x87, 0x67, 0xca, 0x8e, 0x61, 0x00, 0x0e, 0x30, 0x94, 0x05, + 0x5a, 0x43, 0x03, 0x87, 0x6d, 0xc5, 0x42, 0xe8, 0x05, 0x3e, 0x99, 0xc4, + 0x95, 0x50, 0x05, 0x3e, 0xa8, 0x8c, 0x00, 0x11, 0xbb, 0x03, 0x87, 0x73, + 0x8b, 0x00, 0x09, 0x88, 0x45, 0x00, 0x8c, 0xc3, 0x87, 0x7c, 0xc8, 0x0f, + 0xbd, 0x00, 0x0d, 0xc8, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0xa1, 0xcc, 0x4d, + 0x15, 0x05, 0x59, 0xe0, 0xca, 0xa7, 0x24, 0x0e, 0xf8, 0x5b, 0x03, 0x87, + 0x92, 0xce, 0x01, 0x19, 0x00, 0xec, 0xc1, 0xcc, 0x51, 0x28, 0x00, 0xec, + 0x59, 0xc4, 0x00, 0x32, 0x00, 0x14, 0x30, 0xc9, 0x08, 0xf7, 0x00, 0x07, + 0x53, 0x03, 0x87, 0x98, 0xc6, 0xbd, 0xf4, 0x00, 0x11, 0x4b, 0x03, 0x87, + 0x9c, 0xc4, 0x65, 0xe2, 0x00, 0x08, 0xd8, 0xc6, 0x05, 0x01, 0x00, 0xf0, + 0xd8, 0x11, 0xc3, 0x87, 0xa2, 0xc8, 0x20, 0xa9, 0x00, 0x07, 0x58, 0x45, + 0x02, 0x9a, 0x43, 0x87, 0xae, 0x00, 0xc3, 0x87, 0xba, 0xca, 0x4b, 0x1f, + 0x05, 0x5a, 0x38, 0xc2, 0x00, 0x75, 0x0e, 0xf8, 0x38, 0xc9, 0x33, 0xad, + 0x05, 0x39, 0xf8, 0x46, 0x00, 0x8b, 0x43, 0x87, 0xfb, 0xc3, 0x3a, 0xe6, + 0x00, 0x13, 0x63, 0x03, 0x88, 0x07, 0xc2, 0x00, 0xb1, 0x00, 0x0c, 0xd0, + 0xcf, 0x68, 0x82, 0x00, 0xf1, 0xe9, 0xc6, 0xbd, 0xf4, 0x00, 0x09, 0xe1, + 0xc4, 0x65, 0xe2, 0x00, 0x09, 0xf1, 0xc3, 0x00, 0x33, 0x00, 0x11, 0xa0, + 0xc7, 0x0e, 0x70, 0x00, 0xf1, 0xbb, 0x03, 0x88, 0x0d, 0x45, 0x00, 0x5a, + 0x43, 0x88, 0x13, 0xc4, 0x00, 0x9d, 0x05, 0x59, 0xc9, 0xc5, 0x1e, 0xc8, + 0x00, 0x13, 0x59, 0xc3, 0x02, 0xa3, 0x00, 0x0a, 0x00, 0xc9, 0xaa, 0xcb, + 0x05, 0x3c, 0x70, 0xd4, 0x01, 0x13, 0x0e, 0xf8, 0x28, 0xcb, 0x8f, 0xb5, + 0x00, 0xf4, 0xe9, 0x06, 0x43, 0x88, 0x1f, 0xc6, 0x00, 0xd3, 0x00, 0xf7, + 0xb8, 0x43, 0x05, 0x19, 0xc3, 0x88, 0x2b, 0xc8, 0x20, 0xa9, 0x00, 0x07, + 0xf8, 0xce, 0x36, 0x39, 0x05, 0x5a, 0xd1, 0xc5, 0x01, 0x74, 0x00, 0x12, + 0x78, 0x98, 0x00, 0xf7, 0xe9, 0xc2, 0x02, 0xa7, 0x00, 0xf7, 0xd8, 0xc5, + 0x05, 0x02, 0x00, 0xf2, 0x19, 0xc5, 0x00, 0xd4, 0x00, 0xf2, 0x08, 0x42, + 0x01, 0x23, 0xc3, 0x88, 0x37, 0x06, 0xc3, 0x88, 0x46, 0xc6, 0x60, 0xb1, + 0x00, 0x0b, 0x5b, 0x03, 0x88, 0x53, 0xc5, 0x1e, 0xc8, 0x00, 0x0b, 0x4b, + 0x03, 0x88, 0x59, 0x05, 0xc3, 0x88, 0x5d, 0x14, 0xc3, 0x88, 0x6c, 0xc9, + 0x6d, 0x45, 0x05, 0x5a, 0x91, 0x15, 0xc3, 0x88, 0x78, 0xc5, 0x1f, 0x0c, + 0x00, 0x07, 0xc9, 0xc5, 0x31, 0xee, 0x00, 0x07, 0xd1, 0xc5, 0x1d, 0x88, + 0x00, 0x0b, 0x69, 0xc6, 0xcc, 0x8f, 0x00, 0x0b, 0x99, 0xce, 0x1d, 0x93, + 0x00, 0x10, 0xb8, 0xd5, 0x36, 0x32, 0x05, 0x5a, 0x78, 0xc5, 0x1d, 0x88, + 0x00, 0x08, 0x1b, 0x03, 0x88, 0x84, 0x05, 0xc3, 0x88, 0x8a, 0xca, 0x9e, + 0x5a, 0x00, 0xf5, 0x19, 0x06, 0xc3, 0x88, 0x99, 0x14, 0xc3, 0x88, 0xa6, + 0xce, 0x1d, 0x93, 0x00, 0x10, 0x19, 0xc5, 0x1f, 0x0c, 0x00, 0x07, 0x01, + 0xc5, 0x31, 0xee, 0x00, 0x07, 0x09, 0xc5, 0x1e, 0xc8, 0x00, 0x07, 0x19, + 0xc6, 0x60, 0xb1, 0x00, 0x08, 0x09, 0xc6, 0xcc, 0x8f, 0x00, 0x08, 0x29, + 0xc6, 0x01, 0x73, 0x01, 0x63, 0x28, 0xc5, 0x31, 0xee, 0x00, 0x0f, 0xe9, + 0xc6, 0x60, 0xb1, 0x00, 0x0f, 0x18, 0x43, 0x05, 0x19, 0xc3, 0x88, 0xb0, + 0xc8, 0x20, 0xa9, 0x00, 0xf4, 0x28, 0xc6, 0xbd, 0xf4, 0x00, 0xf1, 0x49, + 0xc9, 0x08, 0xf7, 0x00, 0x09, 0x29, 0xc4, 0x65, 0xe2, 0x00, 0x10, 0xf8, + 0xc8, 0x20, 
0xa9, 0x00, 0xf1, 0x39, 0x43, 0x05, 0x19, 0xc3, 0x88, 0xbc, + 0xc8, 0x25, 0xfb, 0x01, 0x63, 0x40, 0x43, 0x05, 0x19, 0xc3, 0x88, 0xc8, + 0xc8, 0x25, 0xfb, 0x01, 0x63, 0x60, 0xc9, 0x08, 0xf7, 0x00, 0xf4, 0x89, + 0xc3, 0x00, 0x33, 0x00, 0x14, 0x89, 0xc4, 0x65, 0xe2, 0x00, 0x0b, 0xf0, + 0xc5, 0x01, 0x74, 0x00, 0x0d, 0xb1, 0xc9, 0xb4, 0xeb, 0x00, 0x12, 0x00, + 0xc8, 0x20, 0xa9, 0x00, 0xf4, 0x69, 0xc8, 0x16, 0x15, 0x00, 0xf4, 0x58, + 0xcb, 0x95, 0xae, 0x05, 0x5a, 0xbb, 0x03, 0x88, 0xd4, 0xcc, 0x4c, 0x61, + 0x05, 0x5a, 0xb0, 0xc8, 0x0e, 0x6f, 0x00, 0xf3, 0xf9, 0xce, 0x3e, 0xae, + 0x05, 0x3a, 0xf8, 0xc5, 0x01, 0x74, 0x00, 0xeb, 0xeb, 0x03, 0x88, 0xd8, + 0xcc, 0x89, 0x01, 0x05, 0x3a, 0xa8, 0x05, 0xc3, 0x88, 0xde, 0x0e, 0xc3, + 0x88, 0xfc, 0x06, 0xc3, 0x89, 0x0e, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x39, + 0xcc, 0x1e, 0xc1, 0x00, 0xeb, 0x81, 0xc5, 0x1f, 0x0c, 0x00, 0x0f, 0xc9, + 0xce, 0x01, 0x19, 0x00, 0x13, 0x99, 0xc5, 0x1e, 0xc8, 0x00, 0x07, 0x89, + 0xc5, 0x31, 0xee, 0x00, 0x0a, 0x21, 0xce, 0x38, 0xe6, 0x05, 0x3d, 0x28, + 0xc8, 0x0e, 0x6f, 0x00, 0xf1, 0x99, 0xce, 0x3e, 0xae, 0x05, 0x3a, 0x19, + 0xc8, 0x25, 0xfb, 0x01, 0x63, 0x50, 0xd4, 0x3e, 0xa8, 0x05, 0x3a, 0x28, + 0xc6, 0xbd, 0xf4, 0x00, 0x09, 0xb9, 0xc4, 0x65, 0xe2, 0x00, 0x0f, 0x48, + 0xc9, 0x08, 0xf7, 0x00, 0x08, 0xe9, 0xc6, 0xbd, 0xf4, 0x00, 0x09, 0x19, + 0xc4, 0x65, 0xe2, 0x00, 0x0f, 0x38, 0xc5, 0x05, 0x02, 0x00, 0xf0, 0x29, + 0xc5, 0x00, 0xd4, 0x00, 0xf0, 0x18, 0x87, 0x05, 0x59, 0x99, 0xc5, 0xde, + 0x3e, 0x05, 0x59, 0x81, 0x91, 0x00, 0x13, 0xa8, 0xcc, 0x23, 0x3f, 0x05, + 0x59, 0xf0, 0xcb, 0x4d, 0x16, 0x00, 0x14, 0xe9, 0xc9, 0x08, 0xf7, 0x00, + 0x09, 0xa9, 0xc4, 0x65, 0xe2, 0x00, 0x0f, 0x80, 0xc5, 0x41, 0x20, 0x00, + 0x12, 0x58, 0xc5, 0x05, 0x02, 0x00, 0xf7, 0xa1, 0xc5, 0x00, 0xd4, 0x00, + 0xf4, 0x70, 0xc2, 0x00, 0xc0, 0x00, 0x0d, 0x7b, 0x03, 0x89, 0x1a, 0xc8, + 0x9e, 0x5c, 0x00, 0xf7, 0x30, 0x11, 0xc3, 0x89, 0x20, 0xc8, 0x20, 0xa9, + 0x00, 0x06, 0xe2, 0x03, 0x89, 0x2c, 0xce, 0x74, 0xe8, 0x00, 0xf3, 0xd0, + 0x00, 0x43, 0x89, 0x30, 0xc9, 0x08, 0xf7, 0x00, 0x06, 0xdb, 0x03, 0x89, + 0x3c, 0xc4, 0x65, 0xe2, 0x00, 0x0e, 0x98, 0x45, 0x02, 0x9a, 0x43, 0x89, + 0x42, 0x45, 0x02, 0x9a, 0x43, 0x89, 0x60, 0x42, 0x00, 0x30, 0xc3, 0x89, + 0x7e, 0x45, 0x00, 0x5a, 0x43, 0x89, 0x8d, 0xcb, 0x98, 0x58, 0x00, 0x11, + 0x50, 0x45, 0x02, 0x9a, 0x43, 0x89, 0x99, 0xc9, 0x20, 0xa8, 0x00, 0xf2, + 0x71, 0xc5, 0x31, 0xee, 0x00, 0xf2, 0x61, 0xc6, 0x60, 0xb1, 0x00, 0x11, + 0x60, 0x42, 0x00, 0x30, 0xc3, 0x89, 0xa5, 0xca, 0x1f, 0x07, 0x00, 0x10, + 0x40, 0xca, 0x9b, 0xda, 0x00, 0xf1, 0x70, 0x00, 0x43, 0x89, 0xb1, 0xca, + 0x9b, 0x80, 0x00, 0xf0, 0xe0, 0x42, 0x00, 0x30, 0xc3, 0x89, 0xbd, 0xca, + 0x1f, 0x07, 0x00, 0x10, 0x20, 0xc5, 0x31, 0xee, 0x00, 0xf0, 0xb1, 0xc5, + 0x1f, 0x0c, 0x00, 0xf0, 0xa0, 0xc9, 0x0e, 0x6e, 0x00, 0xf5, 0xb1, 0xc5, + 0x1e, 0xc8, 0x00, 0xf5, 0xa1, 0xca, 0x9e, 0x5a, 0x00, 0xf5, 0x91, 0xc5, + 0x1f, 0x0c, 0x00, 0xf5, 0x81, 0xc5, 0x31, 0xee, 0x00, 0xf5, 0x70, 0x45, + 0x02, 0x9a, 0x43, 0x89, 0xc9, 0x42, 0x00, 0x30, 0xc3, 0x89, 0xe7, 0xca, + 0x1f, 0x07, 0x00, 0x10, 0x00, 0xcb, 0x98, 0x58, 0x00, 0x0e, 0xf0, 0xca, + 0x9b, 0xda, 0x00, 0x0f, 0xd0, 0xce, 0x16, 0x0f, 0x00, 0xf3, 0x40, 0xce, + 0x16, 0x0f, 0x00, 0xf3, 0x30, 0xc5, 0x05, 0x02, 0x00, 0xf4, 0x91, 0xc5, + 0x00, 0xd4, 0x00, 0x0b, 0xd8, 0xc5, 0x05, 0x02, 0x00, 0xf4, 0x41, 0xc5, + 0x00, 0xd4, 0x00, 0xf4, 0x30, 0xc5, 0x05, 0x02, 0x00, 0xf3, 0x61, 0xc5, + 0x00, 0xd4, 0x00, 0xf3, 0x50, 0x42, 0x00, 0x30, 0xc3, 0x89, 0xf6, 0xca, + 0x1f, 0x07, 0x00, 0x10, 0x80, 0xc6, 0xbd, 0xf4, 0x00, 0x0a, 0xb1, 0xc4, + 0x65, 0xe2, 
0x00, 0x0a, 0xc0, 0xd2, 0x25, 0xf1, 0x05, 0x3a, 0x80, 0xc5, + 0x05, 0x02, 0x00, 0xf2, 0x31, 0xc5, 0x00, 0xd4, 0x00, 0xf2, 0x20, 0xcb, + 0x98, 0x58, 0x00, 0xf1, 0xc0, 0xc5, 0x05, 0x02, 0x00, 0xf1, 0x21, 0xc5, + 0x00, 0xd4, 0x00, 0xf1, 0x10, 0xcb, 0x8e, 0x60, 0x00, 0x0e, 0x28, 0xca, + 0x9b, 0xda, 0x00, 0xf0, 0x40, 0xd0, 0x57, 0xc2, 0x0f, 0xc1, 0x89, 0xcb, + 0x57, 0xc7, 0x0f, 0xc1, 0x69, 0xca, 0xa0, 0x08, 0x0f, 0xc1, 0x49, 0x49, + 0xa8, 0xdc, 0xc3, 0x8a, 0x05, 0xd8, 0x24, 0xb3, 0x01, 0x5b, 0xd9, 0xcc, + 0x84, 0x09, 0x0f, 0xc1, 0x09, 0xcc, 0x82, 0x1d, 0x0f, 0xc1, 0x28, 0xe0, + 0x03, 0xe7, 0x01, 0x5c, 0x08, 0xc6, 0x44, 0x50, 0x07, 0xd9, 0x69, 0xc7, + 0x44, 0x4f, 0x07, 0xd9, 0x60, 0xc5, 0x79, 0xf2, 0x05, 0x4b, 0x51, 0xc6, + 0xc0, 0x7c, 0x05, 0x4b, 0x39, 0xc6, 0x8e, 0xde, 0x05, 0x4b, 0x28, 0xc5, + 0x8e, 0xdf, 0x00, 0x89, 0x69, 0xc6, 0xbb, 0xec, 0x00, 0x89, 0xc0, 0xc5, + 0xc0, 0x7d, 0x00, 0x89, 0x79, 0xc6, 0xc1, 0x86, 0x00, 0x89, 0xc8, 0xc4, + 0x79, 0xf3, 0x00, 0x89, 0x93, 0x03, 0x8a, 0x11, 0xc6, 0xba, 0x7c, 0x00, + 0x89, 0xd0, 0xc4, 0xc6, 0x7a, 0x00, 0x89, 0xb1, 0xc6, 0xc6, 0x79, 0x00, + 0x89, 0xb8, 0xc6, 0xbb, 0xec, 0x05, 0x4b, 0x99, 0xc5, 0x8e, 0xdf, 0x00, + 0x88, 0xf0, 0xc3, 0x39, 0x37, 0x00, 0x89, 0x0b, 0x03, 0x8a, 0x17, 0xc8, + 0xad, 0x27, 0x00, 0x89, 0x28, 0xc5, 0xc0, 0x7d, 0x00, 0x89, 0x01, 0xc6, + 0xc1, 0x86, 0x00, 0x89, 0x48, 0xc4, 0xc6, 0x7a, 0x00, 0x89, 0x39, 0xc6, + 0xc6, 0x79, 0x00, 0x89, 0x40, 0xc7, 0xbb, 0xeb, 0x00, 0x8a, 0x91, 0xc5, + 0x90, 0xe4, 0x00, 0x8a, 0x98, 0xc3, 0x39, 0x37, 0x00, 0x89, 0xe1, 0x44, + 0x3a, 0xbf, 0x43, 0x8a, 0x1b, 0xc4, 0xc6, 0x7a, 0x00, 0x8a, 0x71, 0xc6, + 0xc6, 0x79, 0x00, 0x8a, 0xa0, 0xc4, 0xad, 0x2b, 0x00, 0x89, 0xf9, 0xc5, + 0xdb, 0xff, 0x00, 0x8a, 0x88, 0x87, 0x06, 0xbe, 0x33, 0x03, 0x8a, 0x27, + 0x97, 0x00, 0x8d, 0x01, 0x8b, 0x00, 0x8d, 0x09, 0x83, 0x06, 0xbe, 0x28, + 0x91, 0x00, 0x8b, 0xc1, 0xc2, 0x42, 0xcd, 0x00, 0x8b, 0xc9, 0x97, 0x00, + 0x8d, 0x20, 0x02, 0x43, 0x8a, 0x2b, 0x1b, 0xc3, 0x8a, 0x39, 0x91, 0x00, + 0x8c, 0x39, 0x8b, 0x00, 0x8c, 0x41, 0x83, 0x06, 0xbd, 0x93, 0x03, 0x8a, + 0x46, 0xc2, 0x02, 0x66, 0x06, 0xbd, 0xa0, 0x83, 0x00, 0x8c, 0x73, 0x03, + 0x8a, 0x4a, 0x87, 0x00, 0x8c, 0x83, 0x03, 0x8a, 0x4e, 0xc2, 0x0c, 0x43, + 0x00, 0x8c, 0x93, 0x03, 0x8a, 0x52, 0x97, 0x00, 0x8c, 0x99, 0x8b, 0x00, + 0x8c, 0xa1, 0x91, 0x06, 0xbd, 0xc0, 0x91, 0x00, 0x8b, 0xd1, 0x97, 0x00, + 0x8b, 0xd9, 0xc2, 0x2c, 0x43, 0x00, 0x8b, 0xe0, 0x97, 0x00, 0x8c, 0xa9, + 0x87, 0x06, 0xbd, 0xdb, 0x03, 0x8a, 0x56, 0x83, 0x06, 0xbd, 0xc9, 0x91, + 0x06, 0xbd, 0xe0, 0x91, 0x00, 0x8b, 0xf8, 0x87, 0x00, 0x8c, 0x0b, 0x03, + 0x8a, 0x5e, 0x83, 0x00, 0x8d, 0x32, 0x03, 0x8a, 0x62, 0xc2, 0x09, 0x90, + 0x06, 0xbd, 0x88, 0x87, 0x00, 0x8c, 0x50, 0x91, 0x06, 0xbd, 0xa8, 0xc4, + 0xa6, 0x08, 0x00, 0x8c, 0xe8, 0x83, 0x00, 0x8c, 0xcb, 0x03, 0x8a, 0x66, + 0x87, 0x06, 0xbe, 0x03, 0x03, 0x8a, 0x70, 0x91, 0x06, 0xbe, 0x11, 0x97, + 0x06, 0xbe, 0x18, 0xc2, 0x09, 0x90, 0x06, 0xbe, 0x08, 0xc4, 0xad, 0x2b, + 0x00, 0x8d, 0x53, 0x03, 0x8a, 0x74, 0xc5, 0xd6, 0x8c, 0x00, 0x8e, 0x19, + 0xc5, 0xd9, 0x61, 0x00, 0x8f, 0xd1, 0xc5, 0x79, 0xf2, 0x00, 0x8f, 0xd9, + 0xc7, 0xc6, 0x78, 0x00, 0x8f, 0xe1, 0xc7, 0xbb, 0xeb, 0x00, 0x8f, 0xe9, + 0xc5, 0x90, 0xe4, 0x00, 0x8f, 0xf0, 0xc4, 0x79, 0xf3, 0x00, 0x8f, 0x31, + 0xc6, 0xba, 0x7c, 0x00, 0x8f, 0xa0, 0x02, 0x43, 0x8a, 0x7a, 0xc8, 0xbb, + 0xea, 0x06, 0xbe, 0xb8, 0xc6, 0xd1, 0x57, 0x06, 0xbe, 0x70, 0x0d, 0xc3, + 0x8a, 0x86, 0x16, 0xc3, 0x8a, 0x92, 0xc5, 0xd6, 0x8c, 0x00, 0x8f, 0x49, + 0x12, 0xc3, 0x8a, 0x9e, 0xc5, 0xda, 0xe7, 0x06, 0xbf, 0x51, 0x05, 0xc3, + 0x8a, 0xaa, 
0xc5, 0x90, 0xe4, 0x06, 0xbf, 0x90, 0xc4, 0xc6, 0x7a, 0x00, + 0x8d, 0x61, 0xc6, 0xc6, 0x79, 0x06, 0xbe, 0x60, 0xc5, 0xc0, 0x7d, 0x00, + 0x8e, 0x31, 0xc6, 0xc1, 0x86, 0x00, 0x8e, 0x48, 0xc6, 0x8e, 0xde, 0x00, + 0x8e, 0x51, 0xc5, 0xd6, 0x8c, 0x00, 0x8e, 0x59, 0xc5, 0x79, 0xf2, 0x06, + 0xbe, 0x79, 0xc4, 0xad, 0x2b, 0x06, 0xbe, 0x83, 0x03, 0x8a, 0xb6, 0x05, + 0xc3, 0x8a, 0xbc, 0xc7, 0xc1, 0x85, 0x06, 0xbe, 0xa0, 0xc5, 0x8e, 0xdf, + 0x00, 0x8d, 0x83, 0x03, 0x8a, 0xc8, 0xcc, 0x79, 0xeb, 0x00, 0x8e, 0xa9, + 0xc6, 0xbb, 0xec, 0x00, 0x8e, 0xc0, 0x02, 0x43, 0x8a, 0xcc, 0xc4, 0x79, + 0xf3, 0x00, 0x8d, 0x93, 0x03, 0x8a, 0xde, 0xc6, 0xba, 0x7c, 0x00, 0x8d, + 0xa9, 0xc6, 0xca, 0x0e, 0x00, 0x8e, 0xb8, 0xc3, 0x39, 0x37, 0x00, 0x8d, + 0x99, 0x44, 0x3a, 0xbf, 0x43, 0x8a, 0xe2, 0xc6, 0xc1, 0x86, 0x00, 0x8d, + 0xa1, 0xc5, 0xc0, 0x7d, 0x00, 0x8e, 0x72, 0x03, 0x8a, 0xee, 0xc9, 0x90, + 0xe0, 0x00, 0x8e, 0xcb, 0x03, 0x8a, 0xf4, 0xc6, 0xb7, 0x9c, 0x06, 0xbe, + 0xd8, 0xc4, 0x79, 0xf3, 0x00, 0x8e, 0xe3, 0x03, 0x8a, 0xfa, 0xc6, 0xca, + 0x0e, 0x00, 0x8e, 0xf8, 0xc3, 0x39, 0x37, 0x00, 0x8e, 0xe9, 0x44, 0x3a, + 0xbf, 0x43, 0x8b, 0x00, 0xc6, 0xc6, 0x79, 0x00, 0x8f, 0x01, 0xc4, 0xc6, + 0x7a, 0x06, 0xbf, 0x10, 0xc4, 0xad, 0x2b, 0x00, 0x8d, 0xcb, 0x03, 0x8b, + 0x0c, 0xc5, 0xd6, 0x8c, 0x00, 0x8f, 0x1b, 0x03, 0x8b, 0x12, 0xc7, 0xba, + 0x7b, 0x00, 0x8f, 0x21, 0xc5, 0x90, 0xe4, 0x00, 0x8f, 0x29, 0xc6, 0xc0, + 0x7c, 0x06, 0xbf, 0x19, 0xc5, 0xda, 0xe7, 0x06, 0xbf, 0x29, 0x05, 0x43, + 0x8b, 0x18, 0xc5, 0x79, 0xf2, 0x00, 0x8f, 0x99, 0xc4, 0xad, 0x2b, 0x06, + 0xbf, 0xd1, 0xc7, 0xc1, 0x85, 0x06, 0xbf, 0xd8, 0xc5, 0x79, 0xf2, 0x06, + 0xbf, 0x99, 0xcd, 0x79, 0xea, 0x06, 0xbf, 0xa0, 0xc5, 0x8e, 0xdf, 0x00, + 0x8f, 0x61, 0xc6, 0xbb, 0xec, 0x00, 0x8f, 0x78, 0xc5, 0x79, 0xf2, 0x06, + 0xbf, 0xb9, 0xca, 0xa7, 0x2e, 0x06, 0xbf, 0xc0, 0x0d, 0xc3, 0x8b, 0x24, + 0x15, 0xc3, 0x8b, 0x30, 0xc7, 0xca, 0x0d, 0x00, 0x8f, 0x91, 0xc5, 0xda, + 0xe7, 0x06, 0xbf, 0xa9, 0xc5, 0x90, 0xe4, 0x06, 0xbf, 0xb0, 0xc5, 0xd9, + 0xca, 0x01, 0x8b, 0x58, 0x02, 0x43, 0x8b, 0x3c, 0xc5, 0xc0, 0x7d, 0x01, + 0x8b, 0x99, 0xc6, 0xc1, 0x86, 0x01, 0x8b, 0xb8, 0xc4, 0xad, 0x2b, 0x01, + 0x8c, 0x11, 0xc7, 0xca, 0x0d, 0x01, 0x8c, 0x18, 0x87, 0x01, 0x8c, 0x40, + 0x91, 0x01, 0x8c, 0x50, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x81, 0xc7, 0x0d, + 0x04, 0x0f, 0x64, 0x38, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x79, 0xc7, 0x0d, + 0x04, 0x0f, 0x64, 0x30, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x71, 0xc7, 0x0d, + 0x04, 0x0f, 0x64, 0x28, 0xc8, 0x4b, 0x94, 0x0f, 0x64, 0x69, 0xc7, 0x0d, + 0x04, 0x0f, 0x64, 0x20, 0x91, 0x01, 0x9f, 0x09, 0x07, 0x43, 0x8b, 0x48, + 0xc3, 0x02, 0xdf, 0x01, 0x9f, 0x11, 0x43, 0x0d, 0x0e, 0x43, 0x8b, 0x57, + 0xc4, 0x14, 0x09, 0x01, 0x9f, 0x68, 0xc2, 0x00, 0x5f, 0x01, 0x9f, 0x21, + 0xc5, 0x14, 0x08, 0x01, 0x9f, 0x70, 0xc4, 0x14, 0x09, 0x01, 0x9f, 0x78, + 0xc4, 0x14, 0x09, 0x01, 0x9f, 0x80, 0xc3, 0x03, 0x26, 0x01, 0x9f, 0x88, + 0xc3, 0x22, 0x45, 0x01, 0x9b, 0x21, 0xc3, 0x18, 0x13, 0x01, 0x9b, 0x62, + 0x03, 0x8b, 0x64, 0x4b, 0x18, 0x04, 0xc3, 0x8b, 0x68, 0xdc, 0x13, 0xf9, + 0x0f, 0xd2, 0x28, 0xce, 0x3d, 0x7c, 0x01, 0x2f, 0x91, 0xcd, 0x02, 0xb4, + 0x01, 0x2f, 0x88, 0xce, 0x6c, 0x0c, 0x0f, 0xb1, 0x81, 0xc8, 0xba, 0xaa, + 0x0f, 0xc9, 0x70, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x98, 0xc9, 0x57, 0x20, + 0x08, 0x4f, 0x90, 0xc7, 0x0d, 0x04, 0x08, 0x4e, 0xb3, 0x03, 0x8b, 0x74, + 0xc8, 0x4b, 0x94, 0x08, 0x4e, 0xf8, 0xc7, 0x0d, 0x04, 0x08, 0x4e, 0xab, + 0x03, 0x8b, 0x7a, 0xc8, 0x4b, 0x94, 0x08, 0x4e, 0xf0, 0xc7, 0x0d, 0x04, + 0x08, 0x4e, 0xa3, 0x03, 0x8b, 0x80, 0xc8, 0x4b, 0x94, 0x08, 0x4e, 0xe8, + 0xc7, 0x0d, 
0x04, 0x08, 0x4e, 0x9b, 0x03, 0x8b, 0x86, 0xc8, 0x4b, 0x94, + 0x08, 0x4e, 0xe0, 0x98, 0x00, 0xed, 0xd1, 0x8f, 0x00, 0xea, 0xd3, 0x03, + 0x8b, 0x8c, 0x8a, 0x00, 0xed, 0x19, 0x83, 0x00, 0xea, 0x23, 0x03, 0x8b, + 0x92, 0x8b, 0x00, 0xea, 0x71, 0xc6, 0x21, 0xa3, 0x00, 0xea, 0x61, 0x99, + 0x05, 0x5b, 0x49, 0x94, 0x00, 0x15, 0xa3, 0x03, 0x8b, 0x9c, 0x9b, 0x08, + 0x3d, 0x02, 0x03, 0x8b, 0xa2, 0xcc, 0x51, 0x28, 0x00, 0xed, 0xa9, 0xce, + 0x01, 0x19, 0x08, 0x3d, 0x78, 0xd4, 0x01, 0x13, 0x08, 0x3d, 0x68, 0xc4, + 0x00, 0x32, 0x00, 0xed, 0xe9, 0xce, 0x01, 0x19, 0x00, 0xed, 0xe0, 0xc4, + 0x01, 0x23, 0x00, 0xed, 0xc9, 0xca, 0x9f, 0x4a, 0x08, 0x3d, 0x80, 0x97, + 0x00, 0xed, 0xc1, 0x90, 0x00, 0xed, 0x81, 0x8e, 0x00, 0xed, 0x5b, 0x03, + 0x8b, 0xa8, 0x8b, 0x00, 0xed, 0x33, 0x03, 0x8b, 0xae, 0x84, 0x08, 0x3c, + 0x21, 0xc2, 0x04, 0xc6, 0x08, 0x3c, 0x01, 0x9b, 0x08, 0x3d, 0x91, 0x89, + 0x08, 0x3c, 0x93, 0x03, 0x8b, 0xba, 0x8a, 0x08, 0x3c, 0xb1, 0xc2, 0x49, + 0x0c, 0x08, 0x3d, 0x19, 0x94, 0x08, 0x3d, 0x50, 0xcf, 0x61, 0xe3, 0x08, + 0x3c, 0x79, 0xc5, 0x9b, 0xd5, 0x08, 0x3d, 0x20, 0xc3, 0x01, 0x5d, 0x00, + 0xed, 0xb1, 0xce, 0x6d, 0x40, 0x05, 0x5a, 0xf8, 0xc4, 0x01, 0x23, 0x00, + 0xed, 0x99, 0xc4, 0x00, 0x32, 0x08, 0x3d, 0xd0, 0xc6, 0xbb, 0x8c, 0x00, + 0xed, 0x11, 0xc3, 0x74, 0x83, 0x00, 0xea, 0x50, 0xcc, 0x51, 0x28, 0x00, + 0xed, 0x51, 0xce, 0x01, 0x19, 0x00, 0xed, 0x4b, 0x03, 0x8b, 0xc0, 0xcc, + 0x1e, 0xc1, 0x05, 0x5a, 0xf1, 0xcf, 0x68, 0x64, 0x05, 0x5a, 0xe9, 0xc4, + 0xa8, 0x1a, 0x08, 0x3c, 0xd8, 0xd4, 0x01, 0x13, 0x08, 0x3c, 0xf8, 0xc9, + 0x20, 0xb1, 0x08, 0x3c, 0xc0, 0xc3, 0x80, 0x9f, 0x00, 0xea, 0xf9, 0xca, + 0x9a, 0x86, 0x08, 0x3c, 0x50, 0xc4, 0x01, 0x23, 0x08, 0x3c, 0x63, 0x03, + 0x8b, 0xc6, 0xc4, 0x14, 0xa6, 0x08, 0x3c, 0x58, 0x46, 0x00, 0x8b, 0x43, + 0x8b, 0xcc, 0xc6, 0x21, 0xa3, 0x00, 0xec, 0xf9, 0x87, 0x08, 0x3c, 0x71, + 0xcc, 0x23, 0x33, 0x00, 0x17, 0x20, 0xc4, 0x14, 0xa6, 0x08, 0x3d, 0x41, + 0xc8, 0x61, 0x72, 0x08, 0x3d, 0x48, 0xc3, 0x1c, 0x8d, 0x00, 0xeb, 0x01, + 0xc5, 0x51, 0x51, 0x00, 0xea, 0xf0, 0x91, 0x00, 0xea, 0x99, 0x87, 0x00, + 0xea, 0x58, 0xca, 0x1f, 0x59, 0x08, 0x3c, 0xb8, 0xc4, 0x01, 0x23, 0x00, + 0x15, 0x89, 0xc6, 0x01, 0x73, 0x08, 0x3c, 0xa8, 0x90, 0x00, 0xe9, 0xd9, + 0x87, 0x00, 0xe9, 0x90, 0xcc, 0x23, 0x3f, 0x08, 0x3d, 0xa0, 0x45, 0x19, + 0x7c, 0xc3, 0x8b, 0xd8, 0xcc, 0x3e, 0xe6, 0x00, 0x17, 0x78, 0xce, 0x4e, + 0x8d, 0x05, 0x38, 0xa9, 0xc6, 0x01, 0xa1, 0x00, 0x17, 0xfa, 0x03, 0x8b, + 0xe4, 0xc7, 0x4e, 0x94, 0x00, 0x17, 0x41, 0xc4, 0x1e, 0xc9, 0x00, 0x17, + 0xb8, 0xcd, 0x2f, 0xa1, 0x00, 0x17, 0x91, 0xc2, 0x00, 0x75, 0x00, 0x17, + 0x98, 0x47, 0x19, 0x7a, 0xc3, 0x8b, 0xea, 0xd2, 0x4e, 0x89, 0x05, 0x38, + 0xa1, 0xc8, 0x4e, 0x93, 0x00, 0x17, 0x38, 0xcc, 0x1f, 0x0c, 0x00, 0x17, + 0xa1, 0x47, 0x00, 0x58, 0x43, 0x8b, 0xf6, 0xc8, 0x4e, 0x93, 0x05, 0x38, + 0x41, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x68, 0xc8, 0x4e, 0x93, 0x05, 0x38, + 0x61, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x88, 0x0f, 0x43, 0x8c, 0x02, 0xc2, + 0x00, 0xba, 0x0e, 0xbe, 0x09, 0xc2, 0x00, 0x0a, 0x0e, 0xbd, 0xf9, 0x8b, + 0x0e, 0xbd, 0xc8, 0xc2, 0x00, 0x0a, 0x0e, 0xbe, 0x00, 0xc6, 0x10, 0x3f, + 0x0e, 0xbd, 0xf0, 0xc2, 0x20, 0xec, 0x0e, 0xbd, 0xe9, 0xc4, 0x89, 0xfe, + 0x0e, 0xbd, 0x88, 0xc4, 0x1a, 0x73, 0x0e, 0xbd, 0xe0, 0xca, 0x91, 0x2c, + 0x0e, 0xbd, 0xd8, 0xc2, 0x01, 0x23, 0x0e, 0xbd, 0xd0, 0x8b, 0x0e, 0xbd, + 0xb8, 0x97, 0x0e, 0xbd, 0xb0, 0x97, 0x0e, 0xbd, 0xa8, 0xc4, 0xdd, 0x9a, + 0x0e, 0xbd, 0xa0, 0xc4, 0x8b, 0x66, 0x0e, 0xbd, 0x98, 0xc3, 0x01, 0xbb, + 0x0e, 0xbd, 0x90, 0xc2, 0x01, 0x6f, 0x0e, 0xbd, 0x81, 0xc6, 0x10, 0x3f, + 0x0e, 0xbd, 
0x70, 0xc3, 0x04, 0x87, 0x0e, 0xbd, 0x78, 0xc4, 0xdb, 0x4c, + 0x0e, 0xbd, 0x68, 0xc4, 0x38, 0x2c, 0x0e, 0xbd, 0x60, 0xc3, 0x04, 0x87, + 0x0e, 0xbd, 0x58, 0xc4, 0xde, 0x3f, 0x0e, 0xbd, 0x50, 0x0f, 0x43, 0x8c, + 0x0e, 0xc2, 0x00, 0xba, 0x0e, 0xbd, 0x39, 0xc2, 0x00, 0x0a, 0x0e, 0xbd, + 0x29, 0x8b, 0x0e, 0xbc, 0xf8, 0xc2, 0x00, 0x0a, 0x0e, 0xbd, 0x30, 0xc6, + 0x10, 0x3f, 0x0e, 0xbd, 0x20, 0xc2, 0x20, 0xec, 0x0e, 0xbd, 0x19, 0xc4, + 0x89, 0xfe, 0x0e, 0xbc, 0xba, 0x03, 0x8c, 0x1a, 0xc4, 0x1a, 0x73, 0x0e, + 0xbd, 0x10, 0xc2, 0x01, 0x23, 0x0e, 0xbd, 0x00, 0x8b, 0x0e, 0xbc, 0xe8, + 0x97, 0x0e, 0xbc, 0xe0, 0x97, 0x0e, 0xbc, 0xd8, 0xc4, 0xdd, 0x9a, 0x0e, + 0xbc, 0xd0, 0xc4, 0x8b, 0x66, 0x0e, 0xbc, 0xc8, 0xc3, 0x01, 0xbb, 0x0e, + 0xbc, 0xc0, 0xc2, 0x01, 0x6f, 0x0e, 0xbc, 0xb1, 0xc6, 0x10, 0x3f, 0x0e, + 0xbc, 0xa0, 0xc3, 0x04, 0x87, 0x0e, 0xbc, 0xa8, 0xc4, 0xdb, 0x4c, 0x0e, + 0xbc, 0x98, 0xc4, 0x38, 0x2c, 0x0e, 0xbc, 0x90, 0xc3, 0x04, 0x87, 0x0e, + 0xbc, 0x88, 0xc4, 0xde, 0x3f, 0x0e, 0xbc, 0x80, 0xc3, 0x11, 0x7e, 0x0e, + 0xbc, 0x41, 0xc5, 0xd8, 0x8f, 0x0e, 0xbb, 0xf0, 0xc3, 0x11, 0x7e, 0x0e, + 0xbb, 0x71, 0xc5, 0xd8, 0x8f, 0x0e, 0xbb, 0x20, 0xc7, 0x00, 0x90, 0x0e, + 0xbb, 0x38, 0x8e, 0x00, 0x6a, 0xb0, 0xc8, 0xb3, 0xb1, 0x0e, 0x8f, 0x41, + 0xc9, 0xaf, 0xae, 0x0e, 0x8f, 0x00, 0x50, 0x59, 0xd2, 0xc3, 0x8c, 0x20, + 0xcb, 0x94, 0xdd, 0x0e, 0x8e, 0xf8, 0xc2, 0x02, 0xae, 0x0e, 0x8f, 0x29, + 0xc4, 0x03, 0xc8, 0x0e, 0x8f, 0x20, 0xc5, 0x02, 0xc2, 0x0e, 0x8a, 0x39, + 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x30, 0x47, 0xc3, 0x53, 0xc3, 0x8c, 0x2c, + 0x47, 0xc6, 0x94, 0x43, 0x8c, 0x3e, 0x16, 0xc3, 0x8c, 0x50, 0x02, 0x43, + 0x8c, 0x5c, 0xc4, 0x03, 0xc8, 0x0e, 0x89, 0x89, 0xc2, 0x02, 0xae, 0x0e, + 0x89, 0x80, 0xc7, 0xc4, 0x9c, 0x0e, 0x8d, 0x79, 0xc4, 0x01, 0xc3, 0x0e, + 0x8d, 0x70, 0xc7, 0xc8, 0xe7, 0x0e, 0x8e, 0xd0, 0xca, 0x68, 0x19, 0x0e, + 0x8e, 0x5b, 0x03, 0x8c, 0x68, 0xc8, 0x68, 0x1b, 0x0e, 0x8e, 0x50, 0xc8, + 0x68, 0x1b, 0x0e, 0x8e, 0x3b, 0x03, 0x8c, 0x6e, 0xca, 0x68, 0x19, 0x0e, + 0x8e, 0x40, 0xc2, 0x02, 0xae, 0x0e, 0x8c, 0xd1, 0xc5, 0x03, 0x02, 0x0e, + 0x8c, 0xc8, 0x55, 0x32, 0x96, 0xc3, 0x8c, 0x74, 0x4a, 0x32, 0x9c, 0x43, + 0x8c, 0x80, 0xc4, 0x23, 0x2e, 0x0e, 0x8b, 0x11, 0xc4, 0x2c, 0x0d, 0x0e, + 0x8a, 0x00, 0xc5, 0xdb, 0xeb, 0x0e, 0x8e, 0xb9, 0xc3, 0x30, 0xf3, 0x0e, + 0x8e, 0xa8, 0xc5, 0x02, 0xc2, 0x0e, 0x8a, 0xd9, 0xc5, 0x01, 0xfc, 0x0e, + 0x8a, 0xd0, 0x47, 0x1d, 0xd4, 0xc3, 0x8c, 0x98, 0xc8, 0xb9, 0x62, 0x0e, + 0x89, 0xa0, 0xc6, 0xd1, 0xe1, 0x0e, 0x8e, 0x89, 0xc6, 0xcb, 0x39, 0x0e, + 0x8e, 0x80, 0xc8, 0xbc, 0x72, 0x0e, 0x8c, 0xa9, 0xc5, 0x03, 0x02, 0x0e, + 0x8c, 0xa0, 0xc5, 0xd7, 0x6d, 0x0e, 0x89, 0x01, 0xc4, 0xe2, 0x4b, 0x0e, + 0x88, 0xf8, 0xc4, 0x2c, 0x0d, 0x0e, 0x8e, 0x29, 0xc5, 0x02, 0xc2, 0x0e, + 0x8d, 0xe0, 0x18, 0xc3, 0x8c, 0xd7, 0xc8, 0xbe, 0x42, 0x0e, 0x88, 0x90, + 0xc3, 0x00, 0x3c, 0x0e, 0x88, 0xa9, 0x87, 0x0e, 0x88, 0xa0, 0xcf, 0x68, + 0x19, 0x0e, 0x8e, 0x11, 0xcd, 0x68, 0x1b, 0x0e, 0x8e, 0x08, 0xd0, 0x5b, + 0x02, 0x0e, 0x88, 0xe9, 0xca, 0x74, 0x98, 0x0e, 0x88, 0xc8, 0x4e, 0x6d, + 0xbe, 0xc3, 0x8c, 0xe4, 0xca, 0x44, 0x39, 0x0e, 0x88, 0x10, 0xc5, 0xd7, + 0x6d, 0x0e, 0x89, 0x21, 0xc4, 0xe2, 0x4b, 0x0e, 0x89, 0x18, 0xc4, 0x63, + 0xf2, 0x0e, 0x8d, 0xa8, 0x9e, 0x0e, 0x8d, 0x29, 0x9d, 0x0e, 0x8d, 0x20, + 0xc4, 0x23, 0x2e, 0x0e, 0x8b, 0x21, 0xc4, 0x2c, 0x0d, 0x0e, 0x8a, 0x10, + 0x4a, 0xa1, 0xde, 0xc3, 0x8c, 0xf0, 0xc5, 0x02, 0xa2, 0x0e, 0x88, 0x40, + 0xc4, 0x35, 0x36, 0x0e, 0x89, 0x99, 0xc5, 0xa2, 0xba, 0x0e, 0x89, 0x90, + 0xc3, 0x38, 0x5b, 0x00, 0xcf, 0xc9, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x48, + 0xc3, 0x38, 
0x5b, 0x00, 0xcf, 0xc1, 0xc4, 0xe0, 0xaf, 0x00, 0xcf, 0x40, + 0xc3, 0xdf, 0x37, 0x00, 0xbf, 0xc9, 0xc2, 0x06, 0xdb, 0x00, 0xbf, 0xc0, + 0xd3, 0x45, 0x4d, 0x0f, 0xd1, 0x91, 0xcf, 0x18, 0x0f, 0x0f, 0xd2, 0x18, + 0xd0, 0x3c, 0x90, 0x01, 0x49, 0x71, 0xd0, 0x3c, 0x2c, 0x01, 0x49, 0x88, + 0xc6, 0x13, 0x66, 0x01, 0x0f, 0x89, 0xc8, 0xb8, 0xca, 0x01, 0x0d, 0xc0, + 0x46, 0x00, 0x8b, 0x43, 0x8c, 0xfc, 0x46, 0x00, 0x8b, 0x43, 0x8d, 0x1b, + 0xc4, 0xe3, 0xab, 0x00, 0xff, 0x59, 0x18, 0xc3, 0x8d, 0x3f, 0xc6, 0x60, + 0xb1, 0x00, 0xff, 0x49, 0x06, 0xc3, 0x8d, 0x4b, 0xc5, 0x63, 0xdc, 0x00, + 0x1c, 0x70, 0xc4, 0xe3, 0xab, 0x00, 0xfe, 0xd9, 0x18, 0xc3, 0x8d, 0x5a, + 0xc6, 0x60, 0xb1, 0x00, 0xfe, 0xc9, 0x06, 0xc3, 0x8d, 0x66, 0xc5, 0xd8, + 0xc1, 0x00, 0xf9, 0xc3, 0x03, 0x8d, 0x75, 0xc5, 0x63, 0xdc, 0x00, 0x1c, + 0x50, 0x46, 0x00, 0x8b, 0x43, 0x8d, 0x7b, 0x46, 0x00, 0x8b, 0x43, 0x8d, + 0x9a, 0x46, 0x00, 0x8b, 0x43, 0x8d, 0xbe, 0x46, 0x00, 0x8b, 0x43, 0x8d, + 0xe1, 0x46, 0x00, 0x8b, 0x43, 0x8e, 0x0c, 0x06, 0xc3, 0x8e, 0x30, 0x12, + 0xc3, 0x8e, 0x42, 0xc6, 0x60, 0xb1, 0x00, 0xff, 0x09, 0x18, 0xc3, 0x8e, + 0x51, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0xd9, 0xc5, 0x63, 0xdc, 0x00, 0x1e, + 0x68, 0xc5, 0x6c, 0xa6, 0x00, 0xff, 0x29, 0xc5, 0xd8, 0xc1, 0x00, 0xff, + 0x20, 0x06, 0xc3, 0x8e, 0x5d, 0x12, 0xc3, 0x8e, 0x6f, 0xc6, 0x60, 0xb1, + 0x00, 0xfe, 0x89, 0x18, 0xc3, 0x8e, 0x7e, 0xc4, 0xe3, 0xab, 0x00, 0xfb, + 0xb9, 0xc5, 0x63, 0xdc, 0x00, 0x1d, 0x78, 0x46, 0x00, 0x8b, 0x43, 0x8e, + 0x8a, 0x46, 0x00, 0x8b, 0x43, 0x8e, 0xb5, 0x46, 0x00, 0x8b, 0x43, 0x8e, + 0xd9, 0xc5, 0x78, 0xc7, 0x00, 0x1e, 0xc9, 0xc5, 0x87, 0xf4, 0x00, 0x1b, + 0x98, 0x90, 0x00, 0x1f, 0xd9, 0xc3, 0x87, 0xf6, 0x00, 0x1f, 0x08, 0xc2, + 0x00, 0xba, 0x00, 0xe9, 0x51, 0x8b, 0x00, 0xe9, 0x40, 0xc3, 0x01, 0xcf, + 0x08, 0x0a, 0x09, 0x47, 0x0d, 0x05, 0x43, 0x8f, 0x05, 0xc7, 0xb9, 0xdb, + 0x08, 0x0a, 0x69, 0xc7, 0x67, 0xc7, 0x08, 0x0a, 0xa0, 0x00, 0x43, 0x8f, + 0x11, 0x00, 0x43, 0x8f, 0x24, 0xc6, 0xb9, 0xdc, 0x08, 0x0a, 0x49, 0xcf, + 0x67, 0xbf, 0x08, 0x0a, 0xa8, 0x00, 0x43, 0x8f, 0x2e, 0xc2, 0x02, 0xa0, + 0x08, 0x0a, 0xe1, 0xc2, 0x00, 0xc4, 0x08, 0x0b, 0x21, 0x0a, 0x43, 0x8f, + 0x3a, 0xc3, 0x45, 0x6b, 0x08, 0x0b, 0x49, 0x43, 0x00, 0xc7, 0x43, 0x8f, + 0x46, 0xc2, 0x00, 0x5f, 0x08, 0x0a, 0xfb, 0x03, 0x8f, 0x52, 0xc3, 0x45, + 0x6b, 0x08, 0x0b, 0x32, 0x03, 0x8f, 0x58, 0xcf, 0x6b, 0x25, 0x08, 0x0b, + 0x08, 0xd3, 0x41, 0x12, 0x08, 0x78, 0xe0, 0xd3, 0x41, 0x12, 0x08, 0x78, + 0xb8, 0xd3, 0x41, 0x12, 0x08, 0x78, 0x80, 0xc3, 0x77, 0x79, 0x08, 0x78, + 0xa9, 0xc4, 0xdc, 0x2d, 0x08, 0x78, 0x88, 0xcc, 0x85, 0xdd, 0x08, 0x78, + 0x99, 0xc3, 0x36, 0xb6, 0x08, 0x78, 0x00, 0xc2, 0xe5, 0xfd, 0x08, 0x1e, + 0x49, 0xc2, 0x00, 0xd0, 0x08, 0x1e, 0x50, 0xc7, 0xc1, 0x8c, 0x08, 0x1e, + 0x62, 0x03, 0x8f, 0x5e, 0xc2, 0x01, 0x30, 0x08, 0x1e, 0x70, 0x91, 0x08, + 0x1e, 0x91, 0xc4, 0x18, 0x12, 0x08, 0x1e, 0xa0, 0xc7, 0xca, 0x06, 0x0e, + 0x7d, 0xf1, 0x44, 0xe0, 0x6b, 0xc3, 0x8f, 0x64, 0xc9, 0x92, 0x8d, 0x0e, + 0x7d, 0xb0, 0xd0, 0x58, 0xe2, 0x0e, 0x7d, 0x21, 0xd0, 0x2d, 0x10, 0x0e, + 0x7d, 0x08, 0xcb, 0x93, 0xeb, 0x0e, 0x7c, 0x79, 0xc7, 0x78, 0xdb, 0x0e, + 0x7c, 0x48, 0x87, 0x00, 0xb3, 0x50, 0x87, 0x00, 0xb1, 0xb8, 0x8b, 0x00, + 0xa7, 0x08, 0x91, 0x00, 0xa7, 0x28, 0x83, 0x00, 0xa7, 0x48, 0x8b, 0x00, + 0xa2, 0xe0, 0x91, 0x00, 0xa3, 0x00, 0x83, 0x00, 0xa3, 0x20, 0x83, 0x00, + 0xa9, 0xe0, 0x91, 0x00, 0xa9, 0xc0, 0x8b, 0x00, 0xa9, 0xa0, 0x83, 0x00, + 0xa9, 0x20, 0x8b, 0x00, 0xa8, 0xe0, 0x91, 0x00, 0xa9, 0x00, 0x83, 0x00, + 0xa8, 0x18, 0x8b, 0x00, 0xa7, 0xd8, 0x91, 0x00, 0xa7, 0xf8, 0x83, 0x00, + 0xa2, 0x38, 
0x91, 0x00, 0xa2, 0x18, 0x8b, 0x00, 0xa1, 0xf8, 0x8b, 0x00, + 0xa5, 0x88, 0x91, 0x00, 0xa5, 0xa8, 0x83, 0x00, 0xa5, 0xc8, 0x83, 0x00, + 0xb3, 0xe8, 0x91, 0x00, 0xb3, 0xd8, 0x8b, 0x00, 0xb3, 0xc8, 0x43, 0x02, + 0x9c, 0xc3, 0x8f, 0x71, 0xc4, 0x00, 0xd5, 0x00, 0x1a, 0x80, 0x96, 0x01, + 0x66, 0xa8, 0x96, 0x01, 0x66, 0xa0, 0xcd, 0x0d, 0xad, 0x01, 0x92, 0x49, + 0x87, 0x01, 0x92, 0x88, 0xc2, 0x02, 0xa0, 0x01, 0x92, 0x91, 0xc4, 0x02, + 0xde, 0x01, 0x92, 0x98, 0xc3, 0x09, 0x9e, 0x01, 0x92, 0xa1, 0xc3, 0x0d, + 0x14, 0x01, 0x92, 0xa8, 0xc2, 0x22, 0xcc, 0x01, 0x92, 0xb1, 0xc4, 0x18, + 0x10, 0x01, 0x92, 0xb8, 0xcd, 0x0d, 0xad, 0x01, 0x92, 0x51, 0x87, 0x01, + 0x92, 0xd8, 0xc2, 0x02, 0xa0, 0x01, 0x92, 0xe1, 0xc4, 0x02, 0xde, 0x01, + 0x92, 0xe8, 0xc3, 0x09, 0x9e, 0x01, 0x92, 0xf1, 0xc3, 0x0d, 0x14, 0x01, + 0x92, 0xf8, 0xc2, 0x22, 0xcc, 0x01, 0x95, 0x89, 0xc4, 0x18, 0x10, 0x01, + 0x95, 0x90, 0xcd, 0x0d, 0xad, 0x01, 0x92, 0x59, 0x87, 0x01, 0x95, 0xb0, + 0xc2, 0x02, 0xa0, 0x01, 0x95, 0xb9, 0xc4, 0x02, 0xde, 0x01, 0x95, 0xc0, + 0xc3, 0x09, 0x9e, 0x01, 0x95, 0xc9, 0xc3, 0x0d, 0x14, 0x01, 0x95, 0xd0, + 0xc2, 0x22, 0xcc, 0x01, 0x95, 0xd9, 0xc4, 0x18, 0x10, 0x01, 0x95, 0xe0, + 0x46, 0x25, 0xd4, 0x43, 0x8f, 0x7d, 0xc2, 0x00, 0xc1, 0x09, 0x19, 0x69, + 0xc2, 0x00, 0xd0, 0x09, 0x19, 0x60, 0xc9, 0xb4, 0x6d, 0x09, 0x29, 0x79, + 0xc2, 0x02, 0xfb, 0x09, 0x15, 0x00, 0x8e, 0x09, 0x29, 0x21, 0x86, 0x09, + 0x12, 0xb0, 0xc2, 0x01, 0xe2, 0x09, 0x29, 0x18, 0xc2, 0x01, 0xe2, 0x09, + 0x12, 0xe3, 0x03, 0x8f, 0x89, 0xc3, 0x01, 0xb2, 0x09, 0x12, 0xd8, 0xc9, + 0x40, 0xaa, 0x09, 0x12, 0xa8, 0xc8, 0xb5, 0x8a, 0x09, 0x11, 0xd8, 0xc3, + 0x38, 0xb5, 0x09, 0x28, 0xf1, 0xc3, 0x0b, 0x47, 0x09, 0x10, 0x80, 0xd2, + 0x36, 0x5f, 0x09, 0x28, 0xe8, 0xc2, 0x00, 0x65, 0x09, 0x28, 0xd9, 0xcb, + 0x8d, 0x2c, 0x09, 0x10, 0x18, 0xc2, 0x06, 0x47, 0x09, 0x1c, 0x59, 0x0b, + 0x43, 0x8f, 0x8f, 0x00, 0x43, 0x8f, 0x9b, 0x97, 0x09, 0x10, 0x69, 0x87, + 0x09, 0x10, 0x60, 0xc3, 0x03, 0x49, 0x09, 0x10, 0x51, 0xc9, 0x40, 0xaa, + 0x09, 0x10, 0x48, 0x8b, 0x09, 0x10, 0x41, 0x42, 0x01, 0x9d, 0x43, 0x8f, + 0xa7, 0xc4, 0xdc, 0xae, 0x09, 0x28, 0xb1, 0x86, 0x09, 0x28, 0xa8, 0xc5, + 0x39, 0xc7, 0x09, 0x28, 0x88, 0xc4, 0xdc, 0xae, 0x09, 0x28, 0x59, 0x86, + 0x09, 0x28, 0x51, 0x9f, 0x09, 0x28, 0x48, 0x87, 0x09, 0x28, 0x41, 0xc2, + 0x00, 0xb1, 0x09, 0x28, 0x38, 0xca, 0xa6, 0xfc, 0x09, 0x27, 0xb1, 0x49, + 0x36, 0x5c, 0xc3, 0x8f, 0xb2, 0xc3, 0x04, 0x2a, 0x09, 0x27, 0x99, 0xc2, + 0x08, 0x6d, 0x09, 0x27, 0x90, 0x8b, 0x09, 0x1c, 0x41, 0xc2, 0x04, 0x3d, + 0x09, 0x0e, 0x33, 0x03, 0x8f, 0xbe, 0x83, 0x09, 0x0e, 0x22, 0x03, 0x8f, + 0xc4, 0xc2, 0x01, 0xe2, 0x09, 0x0f, 0x51, 0x86, 0x09, 0x0f, 0x49, 0xca, + 0xa0, 0xb2, 0x09, 0x0f, 0x41, 0x46, 0x25, 0xd4, 0x43, 0x8f, 0xc8, 0xd8, + 0x25, 0xd3, 0x09, 0x0f, 0x21, 0x03, 0x43, 0x8f, 0xd2, 0xc2, 0x01, 0xdf, + 0x09, 0x0f, 0x09, 0x0a, 0x43, 0x8f, 0xdc, 0xc3, 0x5d, 0xd1, 0x09, 0x0e, + 0xd1, 0x87, 0x09, 0x0e, 0xc2, 0x03, 0x8f, 0xf1, 0x97, 0x09, 0x0e, 0xb3, + 0x03, 0x8f, 0xf7, 0xc3, 0x04, 0x5a, 0x09, 0x0e, 0xa9, 0xc4, 0x03, 0x48, + 0x09, 0x0e, 0xa0, 0x17, 0xc3, 0x8f, 0xfb, 0x8b, 0x09, 0x0e, 0x7a, 0x03, + 0x90, 0x06, 0x8f, 0x09, 0x0e, 0x63, 0x03, 0x90, 0x0a, 0xc7, 0x6a, 0x1f, + 0x09, 0x0e, 0x58, 0xcb, 0x8d, 0x21, 0x09, 0x0e, 0x51, 0x83, 0x09, 0x0e, + 0x42, 0x03, 0x90, 0x10, 0x8b, 0x09, 0x0e, 0x09, 0xc2, 0x01, 0x9d, 0x09, + 0x0e, 0x00, 0xcc, 0x83, 0xa9, 0x09, 0x0d, 0xf9, 0x90, 0x09, 0x0d, 0xf1, + 0x8e, 0x09, 0x0d, 0xe9, 0x46, 0x25, 0xd4, 0x43, 0x90, 0x14, 0xcd, 0x47, + 0xaa, 0x09, 0x0b, 0x51, 0xc8, 0x54, 0x29, 0x09, 0x0b, 0x48, 0xd2, 0x47, + 0xa5, 0x09, 
0x26, 0x59, 0xc4, 0x38, 0xb4, 0x09, 0x08, 0xa1, 0xc3, 0x62, + 0x19, 0x09, 0x08, 0x98, 0x0b, 0xc3, 0x90, 0x26, 0x87, 0x09, 0x07, 0x2a, + 0x03, 0x90, 0x2e, 0x94, 0x09, 0x07, 0x21, 0x8e, 0x09, 0x07, 0x18, 0x46, + 0x25, 0xd4, 0x43, 0x90, 0x34, 0xc9, 0x20, 0x12, 0x09, 0x07, 0x08, 0x8f, + 0x09, 0x26, 0x02, 0x03, 0x90, 0x40, 0xd0, 0x5d, 0xd2, 0x09, 0x25, 0xf9, + 0xc9, 0xaa, 0x17, 0x09, 0x06, 0xe0, 0xc9, 0xaa, 0xef, 0x09, 0x06, 0xd8, + 0xc4, 0x45, 0xaf, 0x09, 0x06, 0xc9, 0x8d, 0x09, 0x06, 0xc0, 0x46, 0x25, + 0xd4, 0xc3, 0x90, 0x46, 0x8e, 0x09, 0x06, 0x92, 0x03, 0x90, 0x50, 0x94, + 0x09, 0x06, 0x63, 0x03, 0x90, 0x56, 0xc7, 0x5d, 0x9b, 0x09, 0x06, 0x58, + 0xca, 0x9c, 0x66, 0x09, 0x06, 0x81, 0xa1, 0x09, 0x06, 0x72, 0x03, 0x90, + 0x5c, 0xd0, 0x5d, 0x92, 0x09, 0x06, 0x50, 0xc8, 0xaa, 0xef, 0x09, 0x06, + 0x40, 0x48, 0x6c, 0xd6, 0xc3, 0x90, 0x62, 0x84, 0x09, 0x06, 0x30, 0x42, + 0x00, 0x47, 0x43, 0x90, 0x6e, 0xc4, 0x38, 0x68, 0x09, 0x25, 0xb1, 0xc9, + 0xaa, 0x5f, 0x09, 0x06, 0x01, 0x86, 0x09, 0x05, 0xf8, 0xc8, 0xaa, 0x60, + 0x09, 0x06, 0x10, 0x9f, 0x09, 0x1b, 0xd2, 0x03, 0x90, 0x7a, 0xd0, 0x5b, + 0x42, 0x09, 0x1b, 0xc8, 0xc3, 0x04, 0x2a, 0x09, 0x05, 0xd1, 0xc2, 0x00, + 0xd0, 0x09, 0x05, 0xc9, 0xca, 0xa4, 0x4a, 0x09, 0x05, 0xc0, 0xc8, 0xb5, + 0x92, 0x09, 0x07, 0x60, 0xca, 0x51, 0xd4, 0x09, 0x25, 0x00, 0xcc, 0x5d, + 0xd6, 0x09, 0x24, 0xe8, 0xc4, 0x4a, 0x0f, 0x09, 0x1b, 0x99, 0xc4, 0xe0, + 0x5f, 0x09, 0x03, 0x60, 0x8f, 0x09, 0x03, 0x39, 0xcb, 0x97, 0xbe, 0x09, + 0x03, 0x30, 0xc2, 0x38, 0x6a, 0x09, 0x02, 0xf0, 0xca, 0x97, 0xbe, 0x09, + 0x02, 0xe0, 0x00, 0x43, 0x90, 0x80, 0x00, 0x43, 0x90, 0xa4, 0x14, 0xc3, + 0x90, 0xd8, 0xc6, 0x13, 0x95, 0x0e, 0xc6, 0x61, 0x46, 0x0e, 0xce, 0xc3, + 0x90, 0xe4, 0xc2, 0x02, 0xae, 0x0e, 0xc6, 0x33, 0x03, 0x90, 0xfa, 0xc4, + 0x03, 0xc8, 0x0e, 0xc6, 0x21, 0xcf, 0x62, 0x2e, 0x0e, 0xc0, 0xe0, 0xc5, + 0x0e, 0xce, 0x0e, 0xc5, 0xc1, 0xc5, 0x06, 0x82, 0x0e, 0xc5, 0xb9, 0xc6, + 0x04, 0xcb, 0x0e, 0xc5, 0xa3, 0x03, 0x91, 0x00, 0xc6, 0x13, 0x95, 0x0e, + 0xc5, 0x81, 0xce, 0x3a, 0x9d, 0x0e, 0xc5, 0x79, 0xc2, 0x02, 0xae, 0x0e, + 0xc5, 0x71, 0xc4, 0x03, 0xc8, 0x0e, 0xc5, 0x58, 0xc5, 0x06, 0x82, 0x0e, + 0xc5, 0x03, 0x03, 0x91, 0x04, 0x16, 0xc3, 0x91, 0x0a, 0xc4, 0x18, 0xf2, + 0x0e, 0xc4, 0xc1, 0xce, 0x3a, 0x9d, 0x0e, 0xc4, 0xb9, 0xc2, 0x02, 0xae, + 0x0e, 0xc4, 0x91, 0xc4, 0x03, 0xc8, 0x0e, 0xc4, 0x72, 0x03, 0x91, 0x16, + 0xc6, 0x13, 0x95, 0x0e, 0xc3, 0x29, 0xc6, 0x04, 0xe1, 0x0e, 0xc3, 0x13, + 0x03, 0x91, 0x1a, 0xd0, 0x5a, 0x02, 0x0e, 0xc3, 0x08, 0xc7, 0x27, 0xb2, + 0x0e, 0xc3, 0x01, 0xc4, 0x18, 0xf2, 0x0e, 0xc2, 0xf9, 0xc4, 0x0e, 0xe2, + 0x0e, 0xc2, 0xe8, 0x00, 0x43, 0x91, 0x23, 0xd2, 0x26, 0x32, 0x0e, 0xc2, + 0x63, 0x03, 0x91, 0x32, 0xcb, 0x18, 0xdc, 0x0e, 0xc2, 0x22, 0x03, 0x91, + 0x36, 0xc5, 0x0e, 0xce, 0x0e, 0xc7, 0xa3, 0x03, 0x91, 0x3a, 0xcb, 0x13, + 0x90, 0x0e, 0xc6, 0x1b, 0x03, 0x91, 0x3e, 0x47, 0x04, 0xcb, 0x43, 0x91, + 0x44, 0xc2, 0x00, 0x74, 0x0e, 0xc6, 0x99, 0xc3, 0x00, 0xa3, 0x0e, 0xc6, + 0x90, 0xd2, 0x4c, 0x6d, 0x0e, 0xc4, 0xfa, 0x03, 0x91, 0x50, 0x00, 0x43, + 0x91, 0x56, 0xcc, 0x13, 0x8f, 0x0e, 0xc6, 0x88, 0xdd, 0x11, 0xa8, 0x0e, + 0xc5, 0x60, 0x00, 0x43, 0x91, 0x71, 0xd3, 0x40, 0xff, 0x0e, 0xc4, 0x21, + 0xc4, 0x0e, 0xe2, 0x0e, 0xc4, 0x02, 0x03, 0x91, 0x80, 0x00, 0x43, 0x91, + 0x86, 0xd7, 0x26, 0x32, 0x0e, 0xc2, 0xa9, 0xd5, 0x18, 0xdc, 0x0e, 0xc2, + 0x58, 0xd5, 0x13, 0x90, 0x0e, 0xc6, 0xd3, 0x03, 0x91, 0x92, 0xc5, 0x0e, + 0xce, 0x0e, 0xc6, 0x50, 0xc5, 0x16, 0xca, 0x0e, 0xc5, 0xf9, 0xc2, 0x00, + 0x74, 0x0e, 0xc5, 0xf1, 0xc3, 0x00, 0xa3, 0x0e, 0xc5, 0xe8, 0xc5, 0x06, + 0x82, 0x0e, 
0xc0, 0x13, 0x03, 0x91, 0x96, 0xd2, 0x13, 0x89, 0x0e, 0xc6, + 0x81, 0x46, 0x0e, 0xce, 0xc3, 0x91, 0x9a, 0xc4, 0x05, 0x75, 0x0e, 0xc3, + 0x63, 0x03, 0x91, 0xa6, 0xc8, 0xbc, 0x62, 0x0e, 0xc3, 0x89, 0xd3, 0x46, + 0x57, 0x0e, 0xc2, 0x9a, 0x03, 0x91, 0xaa, 0xd5, 0x37, 0x04, 0x0e, 0xc6, + 0x79, 0xd4, 0x3c, 0x00, 0x0e, 0xc5, 0xe1, 0xc4, 0x05, 0x75, 0x0e, 0xc3, + 0xa0, 0xc5, 0x37, 0x20, 0x0e, 0xc6, 0xb8, 0xc7, 0x27, 0xb2, 0x0e, 0xc3, + 0x49, 0xc4, 0x0e, 0xe2, 0x0e, 0xc3, 0x38, 0xcb, 0x13, 0x90, 0x0e, 0xc6, + 0x73, 0x03, 0x91, 0xb0, 0xc2, 0x02, 0xae, 0x0e, 0xc6, 0x38, 0x00, 0x43, + 0x91, 0xb6, 0xc5, 0x06, 0x82, 0x0e, 0xc5, 0x09, 0xc2, 0x02, 0xae, 0x0e, + 0xc4, 0xa0, 0xc5, 0x17, 0x14, 0x0e, 0xce, 0x89, 0xc5, 0x03, 0x13, 0x0e, + 0xce, 0x80, 0xc5, 0x17, 0x14, 0x0e, 0xce, 0x11, 0xc5, 0x03, 0x13, 0x0e, + 0xce, 0x08, 0xc2, 0x00, 0x15, 0x0e, 0xcb, 0x40, 0xc6, 0x00, 0x58, 0x0e, + 0xce, 0x79, 0xc6, 0x24, 0x3b, 0x0e, 0xce, 0x68, 0xc6, 0x00, 0x58, 0x0e, + 0xce, 0x71, 0xc6, 0x24, 0x3b, 0x0e, 0xce, 0x60, 0xc6, 0x00, 0x58, 0x0e, + 0xce, 0x01, 0xc6, 0x24, 0x3b, 0x0e, 0xcd, 0xf0, 0xc6, 0x00, 0x58, 0x0e, + 0xcd, 0xf9, 0xc6, 0x24, 0x3b, 0x0e, 0xcd, 0xe8, 0xcc, 0x8a, 0xf9, 0x0e, + 0xce, 0x59, 0xcc, 0x82, 0x89, 0x0e, 0xce, 0x50, 0xc6, 0x2c, 0x2e, 0x0e, + 0xcd, 0xe1, 0xc6, 0x00, 0x58, 0x0e, 0xcd, 0xd0, 0xc6, 0x2c, 0x2e, 0x0e, + 0xcd, 0xd9, 0xc6, 0x00, 0x58, 0x0e, 0xcd, 0xc8, 0xc5, 0x17, 0x14, 0x0e, + 0xce, 0x39, 0xc5, 0x03, 0x13, 0x0e, 0xce, 0x30, 0xc5, 0x17, 0x14, 0x0e, + 0xcd, 0xc1, 0xc5, 0x03, 0x13, 0x0e, 0xcd, 0xb8, 0xc5, 0x17, 0x14, 0x0e, + 0xcc, 0xf1, 0xc6, 0x01, 0xdb, 0x0e, 0xcc, 0xe9, 0xc5, 0x03, 0x13, 0x0e, + 0xcc, 0xe0, 0xc5, 0x17, 0x14, 0x0e, 0xcc, 0xd9, 0xc6, 0x01, 0xdb, 0x0e, + 0xcc, 0xd1, 0xc5, 0x03, 0x13, 0x0e, 0xcc, 0xc8, 0x47, 0x20, 0x38, 0xc3, + 0x91, 0xd1, 0x4b, 0x27, 0x7b, 0x43, 0x91, 0xdd, 0xcb, 0x93, 0x1a, 0x0e, + 0xcc, 0xf9, 0x53, 0x41, 0xd0, 0x43, 0x91, 0xf2, 0xc5, 0x17, 0x14, 0x0e, + 0xcc, 0x53, 0x03, 0x91, 0xfe, 0xc6, 0x01, 0xdb, 0x0e, 0xcc, 0x49, 0xc5, + 0x03, 0x13, 0x0e, 0xcc, 0x40, 0xc2, 0x00, 0x15, 0x0e, 0xc9, 0x68, 0x45, + 0x00, 0x8c, 0xc3, 0x92, 0x04, 0xc6, 0x10, 0x9d, 0x01, 0x5b, 0x99, 0x4a, + 0x01, 0x88, 0x43, 0x92, 0x2e, 0xe0, 0x01, 0x47, 0x01, 0x4b, 0x28, 0xd0, + 0x57, 0xc2, 0x0f, 0xc1, 0x91, 0xcb, 0x57, 0xc7, 0x0f, 0xc1, 0x71, 0xca, + 0xa0, 0x08, 0x0f, 0xc1, 0x51, 0x47, 0x00, 0x58, 0xc3, 0x92, 0x34, 0x49, + 0xa8, 0xdc, 0xc3, 0x92, 0x40, 0xcc, 0x84, 0x09, 0x0f, 0xc1, 0x11, 0xcc, + 0x82, 0x1d, 0x0f, 0xc1, 0x30, 0xe0, 0x01, 0x87, 0x01, 0x5c, 0x10, 0x46, + 0x00, 0x8b, 0x43, 0x92, 0x4c, 0xe0, 0x09, 0x67, 0x01, 0x4b, 0x48, 0x0e, + 0xc3, 0x92, 0x58, 0x14, 0x43, 0x92, 0x64, 0x90, 0x00, 0x70, 0x81, 0xc3, + 0x00, 0xd0, 0x00, 0x70, 0xb8, 0xca, 0x26, 0xf7, 0x07, 0xea, 0xc1, 0xcc, + 0x10, 0xb4, 0x07, 0xea, 0xc8, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x51, 0xcc, + 0x10, 0xb4, 0x07, 0xe9, 0x90, 0x0b, 0xc3, 0x92, 0x6a, 0xca, 0x26, 0xf7, + 0x07, 0xe9, 0x31, 0xcb, 0x64, 0x7b, 0x07, 0xe9, 0xc1, 0x45, 0x00, 0x8c, + 0x43, 0x92, 0x76, 0xcb, 0x10, 0xb5, 0x07, 0xe9, 0x81, 0xcc, 0x00, 0xfb, + 0x07, 0xe8, 0x60, 0x45, 0x50, 0xf0, 0xc3, 0x92, 0x82, 0x45, 0x19, 0x60, + 0x43, 0x92, 0x8e, 0xcb, 0x10, 0xb5, 0x07, 0xe9, 0x69, 0xcc, 0x00, 0xfb, + 0x07, 0xe8, 0x48, 0xcb, 0x10, 0xb5, 0x07, 0xe9, 0x79, 0xcc, 0x00, 0xfb, + 0x07, 0xe8, 0x58, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0xa1, 0xcd, 0x00, 0xfa, + 0x07, 0xe3, 0x10, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x99, 0xcd, 0x00, 0xfa, + 0x07, 0xe3, 0x08, 0xca, 0x26, 0xf7, 0x07, 0xea, 0xf9, 0xcc, 0x10, 0xb4, + 0x07, 0xeb, 0x00, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x11, 0xcc, 0x10, 0xb4, + 0x07, 0xeb, 
0x18, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0xe9, 0xcb, 0x10, 0xb5, + 0x07, 0xe5, 0x70, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x09, 0xcb, 0x10, 0xb5, + 0x07, 0xe5, 0x98, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x31, 0xcc, 0x10, 0xb4, + 0x07, 0xee, 0x28, 0xcc, 0x00, 0xfb, 0x07, 0xe1, 0x01, 0xcb, 0x10, 0xb5, + 0x07, 0xe5, 0x88, 0x44, 0x19, 0x6a, 0xc3, 0x92, 0x9a, 0xce, 0x43, 0x77, + 0x07, 0xed, 0x48, 0xd3, 0x40, 0x41, 0x07, 0xea, 0x31, 0x0a, 0x43, 0x92, + 0xa6, 0x47, 0xa6, 0xcd, 0xc3, 0x92, 0xb2, 0xcd, 0x00, 0xfa, 0x07, 0xef, + 0xc8, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0xb1, 0xcc, 0x10, 0xb4, 0x07, 0xeb, + 0xb8, 0x8f, 0x07, 0xea, 0x39, 0xcd, 0x76, 0x28, 0x07, 0xea, 0x50, 0xca, + 0x82, 0xa3, 0x07, 0xea, 0x41, 0xcc, 0x82, 0xa1, 0x07, 0xea, 0x48, 0xcc, + 0x00, 0xfb, 0x07, 0xe1, 0x39, 0xcb, 0x10, 0xb5, 0x07, 0xe9, 0x98, 0x44, + 0x19, 0x6a, 0xc3, 0x92, 0xb8, 0xd1, 0x50, 0x13, 0x07, 0xeb, 0x99, 0xce, + 0x43, 0x77, 0x07, 0xeb, 0xa0, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x91, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0x30, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x61, 0xcb, + 0x10, 0xb5, 0x07, 0xe5, 0x10, 0x45, 0x30, 0xc1, 0xc3, 0x92, 0xc4, 0xd1, + 0x50, 0x13, 0x07, 0xea, 0x98, 0x43, 0x2b, 0xba, 0xc3, 0x92, 0xd0, 0x42, + 0x03, 0x53, 0x43, 0x92, 0xdc, 0x44, 0x06, 0x5b, 0xc3, 0x92, 0xe8, 0x42, + 0x00, 0x5d, 0x43, 0x92, 0xfa, 0xca, 0x26, 0xf7, 0x07, 0xe3, 0x31, 0x0b, + 0xc3, 0x93, 0x06, 0xcb, 0x64, 0x7b, 0x07, 0xe6, 0xf8, 0x44, 0x50, 0xf2, + 0xc3, 0x93, 0x12, 0x43, 0x2b, 0xba, 0x43, 0x93, 0x1e, 0xcc, 0x00, 0xfb, + 0x07, 0xe0, 0x01, 0xcb, 0x10, 0xb5, 0x07, 0xe4, 0xb8, 0x0b, 0xc3, 0x93, + 0x2a, 0xca, 0x26, 0xf7, 0x07, 0xdf, 0xb8, 0xca, 0x26, 0xf7, 0x07, 0xdf, + 0x99, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x90, 0xca, 0x26, 0xf7, 0x07, 0xdf, + 0x89, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x80, 0xca, 0x26, 0xf7, 0x07, 0xdf, + 0x79, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x70, 0xcc, 0x00, 0xfb, 0x07, 0xe2, + 0xb1, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0xd8, 0xca, 0x26, 0xf7, 0x07, 0xed, + 0xd9, 0xcc, 0x10, 0xb4, 0x07, 0xee, 0x18, 0xcd, 0x00, 0xfa, 0x07, 0xf7, + 0xc9, 0xca, 0x26, 0xf7, 0x07, 0xf7, 0xd0, 0xcd, 0x00, 0xfa, 0x07, 0xf7, + 0xb9, 0xca, 0x26, 0xf7, 0x07, 0xf7, 0xc0, 0xca, 0x26, 0xf7, 0x07, 0xec, + 0x01, 0xcc, 0x10, 0xb4, 0x07, 0xed, 0xa8, 0xcc, 0x00, 0xfb, 0x07, 0xe1, + 0xa1, 0xcb, 0x10, 0xb5, 0x07, 0xe6, 0x18, 0x44, 0x19, 0x6a, 0xc3, 0x93, + 0x36, 0xcf, 0x67, 0x65, 0x07, 0xeb, 0xf9, 0xce, 0x43, 0x77, 0x07, 0xed, + 0x90, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x31, 0xcb, 0x10, 0xb5, 0x07, 0xe4, + 0xe8, 0xc2, 0x04, 0xc6, 0x07, 0xea, 0x20, 0xcb, 0x10, 0xb5, 0x07, 0xdf, + 0xf1, 0xcc, 0x00, 0xfb, 0x07, 0xdf, 0xe0, 0x16, 0xc3, 0x93, 0x42, 0xca, + 0x35, 0x7a, 0x00, 0x31, 0xe9, 0x5c, 0x10, 0x12, 0x43, 0x93, 0x4e, 0x44, + 0x05, 0x18, 0xc3, 0x93, 0x58, 0x16, 0x43, 0x93, 0x67, 0xcc, 0x00, 0xfb, + 0x07, 0xf6, 0x89, 0xcb, 0x10, 0xb5, 0x07, 0xf6, 0x98, 0xd0, 0x0e, 0x7c, + 0x00, 0x46, 0x19, 0xc9, 0x0e, 0x6e, 0x00, 0x37, 0xe0, 0xcc, 0x00, 0xfb, + 0x07, 0xf6, 0x69, 0xcb, 0x10, 0xb5, 0x07, 0xf6, 0x78, 0xcf, 0x67, 0xb0, + 0x00, 0x45, 0x81, 0x16, 0xc3, 0x93, 0x73, 0xc4, 0x00, 0x9d, 0x00, 0x35, + 0x80, 0xcb, 0x10, 0xb5, 0x07, 0xdc, 0xa1, 0xcc, 0x00, 0xfb, 0x07, 0xdc, + 0x90, 0xcb, 0x10, 0xb5, 0x07, 0xdc, 0xc1, 0xcc, 0x00, 0xfb, 0x07, 0xdc, + 0xb0, 0x46, 0x03, 0x13, 0xc3, 0x93, 0x7f, 0x42, 0x00, 0x58, 0xc3, 0x93, + 0x89, 0x4b, 0x0e, 0x7c, 0xc3, 0x93, 0x95, 0xc3, 0x01, 0x5d, 0x00, 0x3b, + 0x50, 0xcc, 0x00, 0xfb, 0x07, 0xf6, 0xe9, 0xcb, 0x10, 0xb5, 0x07, 0xf6, + 0xf8, 0x4a, 0x0e, 0x7d, 0xc3, 0x93, 0xa1, 0xcd, 0x04, 0xe7, 0x00, 0x45, + 0x10, 0xcc, 0x00, 0xfb, 0x07, 0xf4, 0xe9, 0xcb, 0x10, 0xb5, 0x07, 0xf4, + 0xf8, 0x4a, 
0x0e, 0x7d, 0xc3, 0x93, 0xad, 0x48, 0x04, 0xe7, 0x43, 0x93, + 0xbf, 0xcc, 0x00, 0xfb, 0x07, 0xf6, 0x49, 0xcb, 0x10, 0xb5, 0x07, 0xf6, + 0x58, 0x44, 0x00, 0x8d, 0xc3, 0x93, 0xcb, 0xc4, 0x3e, 0x06, 0x00, 0x33, + 0x8a, 0x03, 0x94, 0x01, 0x00, 0x43, 0x94, 0x05, 0xc7, 0x31, 0x5f, 0x00, + 0x46, 0x11, 0x16, 0xc3, 0x94, 0x11, 0xc9, 0x16, 0x14, 0x00, 0x3b, 0x10, + 0xcc, 0x00, 0xfb, 0x07, 0xdc, 0x71, 0xcb, 0x10, 0xb5, 0x07, 0xdc, 0x80, + 0x45, 0x00, 0x8c, 0xc3, 0x94, 0x1d, 0x0b, 0xc3, 0x94, 0x2d, 0xcb, 0x64, + 0x7b, 0x07, 0xf6, 0xe1, 0xca, 0x26, 0xf7, 0x07, 0xf6, 0xd0, 0xca, 0x26, + 0xf7, 0x07, 0xdf, 0x19, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x10, 0xca, 0x26, + 0xf7, 0x07, 0xdf, 0x09, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x00, 0xcc, 0x00, + 0xfb, 0x07, 0xf5, 0x29, 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0x38, 0xc7, 0x31, + 0x5f, 0x00, 0x46, 0x09, 0xc9, 0x16, 0x14, 0x00, 0x35, 0xf8, 0xcb, 0x10, + 0xb5, 0x07, 0xdb, 0xe1, 0xcc, 0x00, 0xfb, 0x07, 0xdb, 0xd0, 0xcb, 0x64, + 0x7b, 0x07, 0xdc, 0x09, 0x0b, 0xc3, 0x94, 0x39, 0xca, 0x26, 0xf7, 0x07, + 0xdb, 0xf8, 0xcb, 0x10, 0xb5, 0x07, 0xdb, 0x41, 0xcc, 0x00, 0xfb, 0x07, + 0xdb, 0x30, 0x0b, 0xc3, 0x94, 0x45, 0xca, 0x26, 0xf7, 0x07, 0xda, 0xf9, + 0xcb, 0x64, 0x7b, 0x07, 0xdb, 0x08, 0x46, 0x03, 0x13, 0xc3, 0x94, 0x51, + 0xc4, 0x00, 0x9d, 0x00, 0x33, 0xe1, 0xda, 0x1b, 0x4e, 0x00, 0x33, 0xe8, + 0xc6, 0xcb, 0x51, 0x00, 0x31, 0x4b, 0x03, 0x94, 0x5b, 0xca, 0x64, 0x7c, + 0x07, 0xf4, 0xc0, 0xcc, 0x00, 0xfb, 0x07, 0xf4, 0xa9, 0xcb, 0x10, 0xb5, + 0x07, 0xf4, 0xb8, 0xcb, 0x64, 0x7b, 0x07, 0xdb, 0x29, 0x0b, 0xc3, 0x94, + 0x5f, 0xca, 0x26, 0xf7, 0x07, 0xdb, 0x18, 0x16, 0xc3, 0x94, 0x6b, 0xc9, + 0x0e, 0x6e, 0x00, 0x44, 0x58, 0xcc, 0x00, 0xfb, 0x07, 0xf6, 0x09, 0xcb, + 0x10, 0xb5, 0x07, 0xf6, 0x18, 0xcd, 0x00, 0xfa, 0x07, 0xf5, 0x59, 0xca, + 0x26, 0xf7, 0x07, 0xf5, 0x60, 0x0b, 0xc3, 0x94, 0x77, 0xca, 0x26, 0xf7, + 0x07, 0xf4, 0xd1, 0xcb, 0x64, 0x7b, 0x07, 0xf4, 0xe0, 0xcb, 0x10, 0xb5, + 0x07, 0xdb, 0x81, 0xcc, 0x00, 0xfb, 0x07, 0xdb, 0x70, 0x16, 0xc3, 0x94, + 0x83, 0xc7, 0x31, 0x5f, 0x00, 0x36, 0x71, 0xcb, 0x08, 0x09, 0x00, 0x31, + 0x32, 0x03, 0x94, 0x95, 0x00, 0x43, 0x94, 0x99, 0xcc, 0x00, 0xfb, 0x07, + 0xf7, 0x89, 0xcb, 0x10, 0xb5, 0x07, 0xf7, 0x98, 0x15, 0xc3, 0x94, 0xab, + 0xc4, 0xb0, 0x8b, 0x00, 0x45, 0x51, 0xca, 0x35, 0x7a, 0x00, 0x37, 0x79, + 0xcf, 0x3b, 0x79, 0x00, 0x34, 0xc9, 0x49, 0x04, 0xf9, 0xc3, 0x94, 0xb7, + 0xc9, 0x0e, 0x6e, 0x00, 0x34, 0xa3, 0x03, 0x94, 0xc3, 0xc4, 0x00, 0x9d, + 0x00, 0x34, 0x99, 0xcb, 0x08, 0x09, 0x00, 0x3b, 0x60, 0xcc, 0x00, 0xfb, + 0x07, 0xdd, 0x01, 0xcb, 0x10, 0xb5, 0x07, 0xdd, 0x10, 0x46, 0x03, 0x13, + 0xc3, 0x94, 0xc9, 0xcb, 0x08, 0x09, 0x00, 0x45, 0x09, 0xd6, 0x31, 0x56, + 0x00, 0x3a, 0xa9, 0x16, 0xc3, 0x94, 0xd6, 0xde, 0x0e, 0x6e, 0x00, 0x3a, + 0x88, 0xcc, 0x00, 0xfb, 0x07, 0xf4, 0x79, 0xcb, 0x10, 0xb5, 0x07, 0xf4, + 0x88, 0xcb, 0x64, 0x7b, 0x07, 0xda, 0xe9, 0x0b, 0xc3, 0x94, 0xe2, 0xca, + 0x26, 0xf7, 0x07, 0xda, 0xd8, 0xcb, 0x10, 0xb5, 0x07, 0xda, 0xa1, 0xcc, + 0x00, 0xfb, 0x07, 0xda, 0x90, 0xc5, 0x05, 0x02, 0x00, 0x45, 0x2b, 0x03, + 0x94, 0xee, 0xc5, 0x00, 0xd4, 0x00, 0x35, 0x38, 0xcc, 0x00, 0xfb, 0x07, + 0xf6, 0x29, 0xcb, 0x10, 0xb5, 0x07, 0xf6, 0x38, 0x4a, 0x0e, 0x7d, 0xc3, + 0x94, 0xf4, 0xcd, 0x04, 0xfa, 0x00, 0x34, 0xe8, 0xcc, 0x00, 0xfb, 0x07, + 0xf5, 0xc9, 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0xd8, 0xcc, 0x00, 0xfb, 0x07, + 0xf5, 0xa9, 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0xb8, 0x16, 0xc3, 0x95, 0x00, + 0xd7, 0x29, 0x57, 0x00, 0x34, 0xd1, 0xca, 0x35, 0x7a, 0x00, 0x3b, 0xf1, + 0x46, 0x09, 0x3f, 0xc3, 0x95, 0x0f, 0xcf, 0x3b, 0x79, 0x00, 0x3a, 0xe1, + 0x44, 0x03, 
0x13, 0x43, 0x95, 0x15, 0xcc, 0x00, 0xfb, 0x07, 0xf5, 0x89, + 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0x98, 0x45, 0x00, 0x8c, 0xc3, 0x95, 0x1b, + 0xcd, 0x00, 0xfa, 0x07, 0xf5, 0x49, 0xca, 0x26, 0xf7, 0x07, 0xf5, 0x50, + 0xca, 0x26, 0xf7, 0x07, 0xdc, 0x29, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x20, + 0xce, 0x6d, 0xe8, 0x00, 0x37, 0xd9, 0x0b, 0xc3, 0x95, 0x3a, 0xca, 0x26, + 0xf7, 0x07, 0xf5, 0xf1, 0xcb, 0x64, 0x7b, 0x07, 0xf6, 0x00, 0xca, 0x26, + 0xf7, 0x07, 0xdc, 0x49, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x40, 0xca, 0x26, + 0xf7, 0x07, 0xdc, 0x19, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0x10, 0xcb, 0x10, + 0xb5, 0x07, 0xdb, 0xa1, 0xcc, 0x00, 0xfb, 0x07, 0xdb, 0x90, 0xcb, 0x10, + 0xb5, 0x07, 0xdb, 0x61, 0xcc, 0x00, 0xfb, 0x07, 0xdb, 0x50, 0xc6, 0x1b, + 0xd1, 0x00, 0x45, 0x59, 0xc5, 0x00, 0xd4, 0x00, 0x36, 0x78, 0x00, 0x43, + 0x95, 0x46, 0xc8, 0xbf, 0x42, 0x00, 0x3b, 0xc1, 0xca, 0x9f, 0x72, 0x00, + 0x3b, 0xc8, 0xd0, 0x0e, 0x7c, 0x00, 0x45, 0x39, 0x44, 0x05, 0x18, 0x43, + 0x95, 0x52, 0xcc, 0x00, 0xfb, 0x07, 0xf7, 0x09, 0xcb, 0x10, 0xb5, 0x07, + 0xf7, 0x18, 0xcb, 0x10, 0xb5, 0x07, 0xde, 0xa9, 0xcc, 0x00, 0xfb, 0x07, + 0xde, 0x98, 0xcb, 0x64, 0x7b, 0x07, 0xdc, 0xe9, 0x0b, 0xc3, 0x95, 0x5e, + 0xca, 0x26, 0xf7, 0x07, 0xdc, 0xd8, 0xd0, 0x31, 0x56, 0x00, 0x44, 0x49, + 0x16, 0xc3, 0x95, 0x6a, 0xc4, 0x00, 0x9d, 0x00, 0x35, 0xe1, 0xc9, 0x0e, + 0x6e, 0x00, 0x35, 0xc9, 0x46, 0x03, 0x13, 0x43, 0x95, 0x76, 0x00, 0x43, + 0x95, 0x80, 0xcc, 0x00, 0xfb, 0x07, 0xf7, 0x29, 0xcb, 0x10, 0xb5, 0x07, + 0xf7, 0x38, 0xcb, 0x10, 0xb5, 0x07, 0xdb, 0xc1, 0xcc, 0x00, 0xfb, 0x07, + 0xdb, 0xb0, 0x45, 0x00, 0x8c, 0xc3, 0x95, 0x8c, 0x0b, 0xc3, 0x95, 0xa8, + 0xca, 0x26, 0xf7, 0x07, 0xf5, 0x11, 0xcb, 0x64, 0x7b, 0x07, 0xf5, 0x20, + 0x00, 0x43, 0x95, 0xb4, 0x00, 0x43, 0x95, 0xc4, 0xc9, 0xab, 0xeb, 0x00, + 0x36, 0x03, 0x03, 0x95, 0xda, 0xca, 0x35, 0x7a, 0x00, 0x37, 0xf8, 0xcc, + 0x00, 0xfb, 0x07, 0xf7, 0x49, 0xcb, 0x10, 0xb5, 0x07, 0xf7, 0x58, 0xc2, + 0x16, 0x1c, 0x0f, 0x75, 0xb1, 0xc2, 0x00, 0x65, 0x0f, 0x75, 0xc0, 0xc4, + 0x3a, 0x01, 0x0f, 0x72, 0xe9, 0xc3, 0x0f, 0x9a, 0x0f, 0x72, 0xf8, 0xe0, + 0x0a, 0x47, 0x0f, 0xdd, 0x68, 0xd0, 0x04, 0xd7, 0x0f, 0xdd, 0x60, 0xd0, + 0x13, 0xe9, 0x0f, 0xdd, 0x30, 0x00, 0x43, 0x95, 0xde, 0x00, 0x43, 0x95, + 0xed, 0x4b, 0x18, 0x04, 0xc3, 0x95, 0xfc, 0xdc, 0x13, 0xf9, 0x0f, 0xd2, + 0x30, 0xc5, 0x6b, 0x02, 0x0f, 0xaf, 0xc9, 0xc8, 0x8e, 0xa5, 0x0f, 0xaf, + 0xb8, 0xc2, 0x10, 0x11, 0x0b, 0x4e, 0x39, 0x90, 0x0b, 0x4c, 0xa9, 0x9a, + 0x0b, 0x4c, 0x40, 0xc3, 0x14, 0x83, 0x0b, 0x4d, 0xc8, 0x8f, 0x0b, 0x4e, + 0x59, 0x92, 0x0b, 0x4d, 0xb0, 0xc3, 0x7c, 0x57, 0x0b, 0x4c, 0x49, 0x9a, + 0x0b, 0x4b, 0xf8, 0x92, 0x0b, 0x4e, 0x81, 0xcb, 0x99, 0x3f, 0x0b, 0x4c, + 0x99, 0xc3, 0x82, 0x78, 0x0b, 0x4c, 0x30, 0xc3, 0x8b, 0xa9, 0x0b, 0x4d, + 0xfb, 0x03, 0x96, 0x08, 0xc3, 0xd0, 0xd7, 0x0b, 0x4c, 0x68, 0xc8, 0xb9, + 0xd2, 0x0b, 0x4e, 0xe9, 0xc8, 0xbb, 0x72, 0x0b, 0x4c, 0x90, 0xc6, 0xcc, + 0xa7, 0x0b, 0x4f, 0x40, 0x92, 0x0b, 0x4a, 0x19, 0xc2, 0x00, 0xc2, 0x0b, + 0x49, 0x8a, 0x03, 0x96, 0x0c, 0xc3, 0x8b, 0xaa, 0x0b, 0x49, 0x49, 0xc2, + 0x00, 0x2c, 0x0b, 0x48, 0x80, 0x9a, 0x0b, 0x4a, 0xa9, 0xc2, 0x10, 0x11, + 0x0b, 0x48, 0x08, 0xc3, 0xd7, 0xe2, 0x0b, 0x47, 0x01, 0xc6, 0xd2, 0x83, + 0x0b, 0x44, 0xf8, 0xc3, 0x49, 0x27, 0x0b, 0x46, 0x91, 0x8f, 0x0b, 0x45, + 0xd9, 0xc2, 0x00, 0x45, 0x0b, 0x45, 0xa9, 0xc8, 0xb9, 0x5a, 0x0b, 0x45, + 0x80, 0xc6, 0xce, 0x15, 0x0b, 0x47, 0x19, 0xcc, 0x8b, 0xb9, 0x0b, 0x44, + 0xf0, 0x9a, 0x0b, 0x47, 0x09, 0x8f, 0x0b, 0x44, 0xd8, 0xc6, 0x17, 0x13, + 0x0b, 0x43, 0xd8, 0xc4, 0x61, 0x79, 0x0b, 0x41, 0x59, 0xc4, 0xde, 0xc7, + 0x0b, 0x40, 
0x71, 0xc6, 0xcd, 0x43, 0x0b, 0x40, 0x58, 0xc4, 0xe4, 0x7b, + 0x0b, 0x41, 0x11, 0xc4, 0xe4, 0x9b, 0x0b, 0x40, 0xc8, 0xa3, 0x01, 0x41, + 0xfb, 0x03, 0x96, 0x12, 0xa5, 0x01, 0x44, 0xf9, 0xa4, 0x01, 0x42, 0xfa, + 0x03, 0x96, 0x1d, 0xa5, 0x01, 0x45, 0x79, 0xa4, 0x01, 0x43, 0x7a, 0x03, + 0x96, 0x21, 0xa5, 0x01, 0x46, 0x78, 0xa5, 0x01, 0x45, 0xb9, 0xa4, 0x01, + 0x43, 0xba, 0x03, 0x96, 0x25, 0xa5, 0x01, 0x46, 0xb8, 0xa5, 0x01, 0x47, + 0x38, 0xa5, 0x01, 0x45, 0xd9, 0xa4, 0x01, 0x43, 0xda, 0x03, 0x96, 0x29, + 0xa5, 0x01, 0x46, 0xd8, 0xa5, 0x01, 0x47, 0x58, 0xa5, 0x01, 0x47, 0x98, + 0xa5, 0x01, 0x45, 0xe9, 0xa4, 0x01, 0x43, 0xea, 0x03, 0x96, 0x2d, 0xa5, + 0x01, 0x46, 0xe8, 0xa5, 0x01, 0x47, 0x68, 0xa5, 0x01, 0x47, 0xa8, 0xa5, + 0x01, 0x47, 0xc8, 0xa5, 0x01, 0x45, 0xf1, 0xa4, 0x01, 0x43, 0xf2, 0x03, + 0x96, 0x31, 0xa5, 0x01, 0x46, 0xf0, 0xa5, 0x01, 0x47, 0x70, 0xa5, 0x01, + 0x47, 0xb0, 0xa5, 0x01, 0x47, 0xd0, 0xa5, 0x01, 0x47, 0xe0, 0xd0, 0x57, + 0xc2, 0x0f, 0xc1, 0x81, 0xcb, 0x57, 0xc7, 0x0f, 0xc1, 0x61, 0x49, 0xa8, + 0xdc, 0xc3, 0x96, 0x35, 0x47, 0x00, 0x58, 0xc3, 0x96, 0x41, 0xcc, 0x84, + 0x09, 0x0f, 0xc1, 0x01, 0xcc, 0x82, 0x1d, 0x0f, 0xc1, 0x21, 0xca, 0xa0, + 0x08, 0x0f, 0xc1, 0x40, 0xe0, 0x03, 0x87, 0x01, 0x5c, 0x00, 0x46, 0x00, + 0x8b, 0x43, 0x96, 0x4d, 0xe0, 0x06, 0xe7, 0x01, 0x4b, 0x38, 0x0e, 0xc3, + 0x96, 0x59, 0xdf, 0x0c, 0xc2, 0x01, 0x4b, 0x30, 0xc5, 0xdb, 0xbe, 0x08, + 0x04, 0x39, 0xc5, 0xdc, 0x81, 0x08, 0x04, 0x30, 0xca, 0x9d, 0x88, 0x08, + 0x04, 0x41, 0xc9, 0xa9, 0xb4, 0x08, 0x04, 0x48, 0xc5, 0xdc, 0x77, 0x08, + 0x04, 0x51, 0xc6, 0xd3, 0x67, 0x08, 0x04, 0x58, 0xc5, 0xdc, 0x04, 0x08, + 0x04, 0x61, 0xc6, 0xd3, 0x6d, 0x08, 0x04, 0x68, 0xc6, 0xcc, 0xb3, 0x08, + 0x04, 0x19, 0xc6, 0xd2, 0x0b, 0x08, 0x04, 0x21, 0xca, 0xa7, 0x38, 0x08, + 0x04, 0x28, 0xce, 0x16, 0x0f, 0x00, 0xf3, 0x38, 0xce, 0x16, 0x0f, 0x00, + 0xf3, 0x48, 0xce, 0x01, 0x19, 0x00, 0xec, 0xa9, 0xc4, 0x01, 0x23, 0x00, + 0x12, 0xd0, 0xca, 0xa2, 0xb0, 0x05, 0x5a, 0x60, 0xd2, 0x4d, 0x0f, 0x05, + 0x59, 0xb0, 0xcc, 0x23, 0x3f, 0x00, 0xe8, 0x99, 0xc5, 0xd4, 0x9d, 0x00, + 0xe8, 0x90, 0xca, 0x9b, 0xda, 0x00, 0xf0, 0x48, 0x46, 0x00, 0x8b, 0x43, + 0x96, 0x65, 0xca, 0x45, 0x1d, 0x0e, 0xf8, 0x68, 0xca, 0xa8, 0x14, 0x0e, + 0xf8, 0x30, 0x87, 0x00, 0xe8, 0xa3, 0x03, 0x96, 0x86, 0xc5, 0x21, 0xa4, + 0x00, 0xe8, 0x41, 0xc7, 0xc5, 0xc9, 0x05, 0x5a, 0x1a, 0x03, 0x96, 0x8c, + 0xc8, 0x67, 0x21, 0x05, 0x3b, 0xf8, 0x87, 0x00, 0xe8, 0x11, 0xc4, 0xde, + 0x3f, 0x00, 0x12, 0x90, 0xce, 0x61, 0x6c, 0x00, 0x15, 0x72, 0x03, 0x96, + 0x92, 0xce, 0x74, 0x86, 0x00, 0x13, 0x80, 0xc6, 0x20, 0xab, 0x00, 0xf4, + 0xb9, 0xcc, 0x3e, 0xb0, 0x01, 0x63, 0x30, 0xc5, 0x05, 0x02, 0x00, 0xf3, + 0x69, 0xc5, 0x00, 0xd4, 0x00, 0xf3, 0x58, 0xd2, 0x25, 0xf1, 0x05, 0x3b, + 0x38, 0x45, 0x02, 0x9a, 0x43, 0x96, 0x98, 0x45, 0x02, 0x9a, 0x43, 0x96, + 0xb6, 0x42, 0x00, 0x30, 0xc3, 0x96, 0xd4, 0x45, 0x00, 0x5a, 0x43, 0x96, + 0xe3, 0xcb, 0x98, 0x58, 0x00, 0x11, 0x58, 0xc5, 0x31, 0xee, 0x00, 0xf2, + 0x99, 0xc5, 0x1f, 0x0c, 0x00, 0xf2, 0x88, 0xc9, 0x20, 0xa8, 0x00, 0xf2, + 0x79, 0xc5, 0x31, 0xee, 0x00, 0xf2, 0x69, 0xc6, 0x60, 0xb1, 0x00, 0x11, + 0x68, 0xce, 0x01, 0x19, 0x00, 0xec, 0xb9, 0xc6, 0x01, 0x73, 0x05, 0x59, + 0xf8, 0xc7, 0x0e, 0x70, 0x00, 0xf6, 0x59, 0xca, 0x1f, 0x07, 0x00, 0x10, + 0x48, 0xca, 0x9b, 0xda, 0x00, 0xf1, 0x78, 0xce, 0x01, 0x19, 0x0e, 0xf8, + 0xc9, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0x90, 0x46, 0x00, 0x8b, 0x43, 0x96, + 0xef, 0xd2, 0x4d, 0x0f, 0x05, 0x5a, 0x50, 0xcc, 0x23, 0x3f, 0x00, 0x12, + 0xfa, 0x03, 0x96, 0xfb, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0xc1, 0xce, 0x01, + 0x19, 0x00, 
0xec, 0xd1, 0x05, 0xc3, 0x97, 0x01, 0xc4, 0x14, 0xa6, 0x00, + 0x0d, 0xd0, 0xc9, 0xaa, 0x95, 0x0e, 0xf8, 0x60, 0x00, 0x43, 0x97, 0x0d, + 0xca, 0x9b, 0x80, 0x00, 0xf0, 0xe8, 0x42, 0x00, 0x30, 0xc3, 0x97, 0x19, + 0xca, 0x1f, 0x07, 0x00, 0x10, 0x28, 0xc5, 0x31, 0xee, 0x00, 0xf0, 0xb9, + 0xc5, 0x1f, 0x0c, 0x00, 0xf0, 0xa8, 0xc8, 0x61, 0x72, 0x00, 0x13, 0xf3, + 0x03, 0x97, 0x25, 0x0e, 0xc3, 0x97, 0x2b, 0x42, 0x00, 0x58, 0xc3, 0x97, + 0x37, 0xcc, 0x51, 0x28, 0x00, 0xec, 0x49, 0xcc, 0x1e, 0xc1, 0x00, 0xeb, + 0x91, 0x05, 0xc3, 0x97, 0x43, 0xc4, 0x14, 0xa6, 0x00, 0x13, 0xe9, 0xce, + 0x38, 0xe6, 0x05, 0x3d, 0x39, 0xc5, 0x31, 0xee, 0x00, 0x0a, 0xa9, 0xce, + 0x1d, 0x93, 0x00, 0x10, 0x99, 0xc6, 0x01, 0x73, 0x00, 0x12, 0x68, 0xce, + 0x01, 0x19, 0x00, 0xec, 0xa1, 0xc4, 0x01, 0x23, 0x00, 0x12, 0xe8, 0xd1, + 0x51, 0x23, 0x0e, 0xf8, 0x98, 0xcb, 0x98, 0x58, 0x00, 0xf1, 0xc8, 0xcc, + 0x1e, 0xc1, 0x05, 0x59, 0xc1, 0xc3, 0x01, 0x5d, 0x01, 0x63, 0x08, 0xce, + 0x3e, 0xae, 0x00, 0xf4, 0xe1, 0xc8, 0x16, 0x15, 0x00, 0xf4, 0xd8, 0xc5, + 0x05, 0x02, 0x00, 0xf7, 0xa9, 0xc5, 0x00, 0xd4, 0x00, 0xf4, 0x78, 0xc2, + 0x00, 0xc0, 0x00, 0x0d, 0x83, 0x03, 0x97, 0x55, 0xc8, 0x9e, 0x5c, 0x00, + 0xf7, 0x38, 0x11, 0xc3, 0x97, 0x5b, 0xc8, 0x20, 0xa9, 0x00, 0x07, 0xe2, + 0x03, 0x97, 0x67, 0xce, 0x74, 0xe8, 0x00, 0xf3, 0xd8, 0x00, 0x43, 0x97, + 0x6b, 0xc9, 0x08, 0xf7, 0x00, 0x07, 0xdb, 0x03, 0x97, 0x77, 0xc4, 0x65, + 0xe2, 0x00, 0x0e, 0xa0, 0xcd, 0x01, 0x1a, 0x00, 0xec, 0xc9, 0xc9, 0x9e, + 0xe7, 0x00, 0x0b, 0x78, 0xce, 0x36, 0x39, 0x05, 0x5a, 0x71, 0xc5, 0x01, + 0x74, 0x05, 0x3d, 0xc8, 0x45, 0x02, 0x9a, 0x43, 0x97, 0x7d, 0xc9, 0x08, + 0xf7, 0x00, 0x07, 0x13, 0x03, 0x97, 0x9b, 0xc4, 0x65, 0xe2, 0x00, 0x0e, + 0x70, 0x11, 0xc3, 0x97, 0xa1, 0xc8, 0x20, 0xa9, 0x00, 0x07, 0x22, 0x03, + 0x97, 0xad, 0x0b, 0xc3, 0x97, 0xb3, 0xcd, 0x01, 0x1a, 0x00, 0xec, 0x78, + 0xc5, 0x05, 0x02, 0x00, 0xf4, 0x49, 0xc5, 0x00, 0xd4, 0x00, 0xf4, 0x38, + 0xc5, 0x05, 0x02, 0x00, 0xf1, 0x29, 0xc5, 0x00, 0xd4, 0x00, 0xf1, 0x18, + 0xc5, 0x05, 0x02, 0x00, 0xf4, 0x99, 0xc5, 0x00, 0xd4, 0x00, 0x0b, 0xe0, + 0x00, 0x43, 0x97, 0xbf, 0xd2, 0x25, 0xf1, 0x05, 0x3a, 0x88, 0xcf, 0x68, + 0x82, 0x00, 0xf2, 0x59, 0xcb, 0x4d, 0x16, 0x05, 0x59, 0xd9, 0xc6, 0xbd, + 0xf4, 0x00, 0x0a, 0x31, 0xc4, 0x65, 0xe2, 0x00, 0x0a, 0x41, 0xc3, 0x00, + 0x33, 0x00, 0x11, 0xa8, 0xc9, 0x64, 0x14, 0x00, 0xf2, 0x49, 0xc8, 0x6d, + 0x46, 0x00, 0x13, 0x91, 0xcd, 0x7b, 0x08, 0x00, 0x0c, 0xf0, 0x43, 0x05, + 0x19, 0xc3, 0x97, 0xcb, 0xc8, 0x25, 0xfb, 0x05, 0x3c, 0x88, 0x45, 0x02, + 0x9a, 0x43, 0x97, 0xd7, 0xc7, 0x0e, 0x70, 0x00, 0xf7, 0x21, 0x45, 0x00, + 0x5a, 0x43, 0x97, 0xf5, 0x00, 0x43, 0x98, 0x01, 0xc9, 0x9b, 0xdb, 0x00, + 0xf3, 0xc1, 0xc5, 0x05, 0x02, 0x00, 0xf3, 0xa0, 0xc6, 0x05, 0x01, 0x00, + 0xf3, 0xb0, 0xc9, 0x0e, 0x6e, 0x00, 0xf7, 0x11, 0xc5, 0x1e, 0xc8, 0x00, + 0xf7, 0x01, 0xca, 0x9e, 0x5a, 0x00, 0xf6, 0xf1, 0xc5, 0x1f, 0x0c, 0x00, + 0xf6, 0xe1, 0xc5, 0x31, 0xee, 0x00, 0xf6, 0xd0, 0xc9, 0x0e, 0x6e, 0x00, + 0xf6, 0xc1, 0xc5, 0x1e, 0xc8, 0x00, 0xf6, 0xb1, 0xca, 0x9e, 0x5a, 0x00, + 0xf6, 0xa1, 0xc5, 0x1f, 0x0c, 0x00, 0xf6, 0x91, 0xc5, 0x31, 0xee, 0x00, + 0xf6, 0x80, 0xc5, 0x05, 0x02, 0x00, 0xf6, 0x61, 0xc5, 0x00, 0xd4, 0x00, + 0x11, 0x72, 0x03, 0x98, 0x0d, 0xc5, 0x31, 0xee, 0x00, 0x0a, 0x81, 0xc5, + 0x1f, 0x0c, 0x00, 0x10, 0x60, 0xc5, 0x31, 0xee, 0x00, 0xf2, 0x91, 0xc5, + 0x1f, 0x0c, 0x00, 0xf2, 0x80, 0xc5, 0x05, 0x02, 0x00, 0xf6, 0x51, 0xc5, + 0x00, 0xd4, 0x00, 0x09, 0x80, 0x44, 0x02, 0x9b, 0xc3, 0x98, 0x13, 0xc5, + 0x05, 0x02, 0x00, 0xf0, 0xc0, 0xc5, 0x05, 0x02, 0x00, 0xf5, 0xc1, 0xc5, + 0x00, 0xd4, 
0x00, 0x08, 0xb0, 0xc9, 0x0e, 0x6e, 0x00, 0xf5, 0x61, 0xc5, + 0x1e, 0xc8, 0x00, 0xf5, 0x51, 0xca, 0x9e, 0x5a, 0x00, 0xf5, 0x41, 0xc5, + 0x1f, 0x0c, 0x00, 0xf5, 0x31, 0xc5, 0x31, 0xee, 0x00, 0xf5, 0x20, 0xc5, + 0x05, 0x02, 0x00, 0xf5, 0x01, 0xc5, 0x00, 0xd4, 0x00, 0x11, 0x32, 0x03, + 0x98, 0x31, 0xc5, 0x05, 0x02, 0x00, 0xf2, 0xd3, 0x03, 0x98, 0x37, 0xc5, + 0x00, 0xd4, 0x00, 0xf2, 0xc0, 0xca, 0x03, 0x87, 0x01, 0x5d, 0x19, 0xc9, + 0x01, 0x88, 0x01, 0x5d, 0x10, 0xc7, 0xc2, 0x03, 0x00, 0x89, 0x98, 0x02, + 0x43, 0x98, 0x3d, 0xc4, 0xad, 0x2b, 0x00, 0x89, 0xe9, 0xc5, 0xdb, 0xff, + 0x00, 0x8a, 0x78, 0x91, 0x00, 0x8c, 0xf8, 0x91, 0x00, 0x8b, 0xe9, 0x97, + 0x00, 0x8b, 0xf1, 0xc2, 0x19, 0x2c, 0x00, 0x8d, 0x28, 0x83, 0x00, 0x8c, + 0x23, 0x03, 0x98, 0x53, 0xc2, 0x02, 0x66, 0x00, 0x8c, 0x30, 0x87, 0x06, + 0xbd, 0x98, 0x87, 0x06, 0xbd, 0xb8, 0x91, 0x00, 0x8c, 0x78, 0x91, 0x00, + 0x8c, 0x88, 0x97, 0x00, 0x8c, 0xb1, 0x91, 0x06, 0xbd, 0xd0, 0x91, 0x06, + 0xbd, 0x80, 0x87, 0x00, 0x8d, 0x38, 0xc2, 0x37, 0xea, 0x06, 0xbd, 0xe9, + 0x87, 0x06, 0xbd, 0xf0, 0x91, 0x06, 0xbd, 0xf8, 0xc7, 0xc2, 0x03, 0x00, + 0x8e, 0x20, 0xc6, 0x8e, 0xde, 0x06, 0xbf, 0x61, 0xc6, 0xc0, 0x7c, 0x06, + 0xbf, 0x68, 0xc5, 0x8e, 0xdf, 0x00, 0x8f, 0x39, 0xcc, 0x79, 0xeb, 0x06, + 0xbf, 0x58, 0xc5, 0xc0, 0x7d, 0x00, 0x8f, 0x41, 0xc6, 0xc1, 0x86, 0x06, + 0xbf, 0x88, 0xc4, 0x79, 0xf3, 0x00, 0x8f, 0x51, 0xc6, 0xca, 0x0e, 0x06, + 0xbf, 0x70, 0xc4, 0xc6, 0x7a, 0x06, 0xbf, 0x79, 0xc6, 0xc6, 0x79, 0x06, + 0xbf, 0x80, 0xc7, 0xc2, 0x03, 0x06, 0xbe, 0x88, 0xc4, 0xc6, 0x7a, 0x06, + 0xbe, 0x91, 0xc6, 0xc6, 0x79, 0x06, 0xbe, 0x98, 0x02, 0x43, 0x98, 0x57, + 0xc6, 0x8e, 0xde, 0x00, 0x8e, 0x89, 0xc4, 0xad, 0x2b, 0x00, 0x8e, 0x91, + 0xc5, 0x90, 0xe4, 0x06, 0xbe, 0xc0, 0x02, 0x43, 0x98, 0x63, 0xc4, 0xad, + 0x2b, 0x00, 0x8e, 0xb1, 0xc6, 0x8e, 0xde, 0x06, 0xbe, 0xa8, 0xc6, 0xce, + 0xb1, 0x00, 0x8e, 0x78, 0xc6, 0xce, 0xb1, 0x06, 0xbe, 0xe0, 0xc5, 0xd9, + 0xca, 0x06, 0xbf, 0x08, 0xc4, 0xad, 0x2b, 0x00, 0x8e, 0xf1, 0xc5, 0xd9, + 0x61, 0x06, 0xbe, 0xf8, 0xc7, 0xc0, 0x7b, 0x06, 0xbf, 0x38, 0xc8, 0xba, + 0x7a, 0x06, 0xbf, 0x20, 0xc4, 0xc6, 0x7a, 0x06, 0xbf, 0x41, 0xc6, 0xc6, + 0x79, 0x06, 0xbf, 0x48, 0xc5, 0x8e, 0xdf, 0x00, 0x8f, 0x61, 0xc6, 0xbb, + 0xec, 0x00, 0x8f, 0x78, 0xca, 0x8e, 0xda, 0x00, 0x8f, 0x69, 0xc3, 0x39, + 0x37, 0x00, 0x8f, 0x88, 0xc6, 0x8e, 0xde, 0x01, 0x8b, 0xa1, 0xc6, 0xc0, + 0x7c, 0x01, 0x8b, 0xa8, 0xc3, 0x22, 0x45, 0x01, 0x9f, 0x59, 0xc3, 0x18, + 0x13, 0x01, 0x9f, 0x9a, 0x03, 0x98, 0x7b, 0xc3, 0x03, 0x26, 0x01, 0x9f, + 0x61, 0x9b, 0x01, 0x9f, 0xea, 0x03, 0x98, 0x7f, 0x02, 0x43, 0x98, 0x85, + 0xd3, 0x45, 0x4d, 0x0f, 0xd1, 0x81, 0xcf, 0x18, 0x0f, 0x0f, 0xd1, 0xb8, + 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x88, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x80, + 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x78, 0xc9, 0x57, 0x20, 0x08, 0x4f, 0x70, + 0xce, 0x74, 0x86, 0x00, 0xed, 0x68, 0xc4, 0xde, 0x3f, 0x00, 0xec, 0xd9, + 0x87, 0x00, 0xea, 0x30, 0x46, 0x00, 0x8b, 0x43, 0x98, 0x95, 0xca, 0xa8, + 0x14, 0x08, 0x3d, 0x08, 0xca, 0xa8, 0x14, 0x08, 0x3c, 0xe0, 0xcc, 0x23, + 0x3f, 0x00, 0xed, 0x39, 0xc9, 0xab, 0xb5, 0x00, 0x15, 0xb0, 0xca, 0x1f, + 0x59, 0x08, 0x3c, 0xa0, 0xc9, 0xaa, 0xcb, 0x08, 0x3c, 0xe8, 0xc9, 0xa9, + 0x2d, 0x08, 0x3c, 0x68, 0xc4, 0x00, 0x32, 0x08, 0x3c, 0x49, 0xce, 0x01, + 0x19, 0x08, 0x3c, 0x40, 0xc8, 0x4e, 0x93, 0x05, 0x38, 0x59, 0xd2, 0x4e, + 0x89, 0x05, 0x38, 0x80, 0xc4, 0x01, 0x9b, 0x00, 0x17, 0x88, 0xc8, 0x4e, + 0x93, 0x05, 0x38, 0x51, 0xd2, 0x4e, 0x89, 0x05, 0x38, 0x78, 0xcc, 0x1f, + 0x0c, 0x00, 0x17, 0xa9, 0xcc, 0x83, 0x0d, 0x00, 0x17, 0xb0, 0xc3, 0x11, + 0x7e, 0x0e, 
0xbe, 0x11, 0xc5, 0xd8, 0x8f, 0x0e, 0xbd, 0xc0, 0xc3, 0x11, + 0x7e, 0x0e, 0xbd, 0x41, 0xc5, 0xd8, 0x8f, 0x0e, 0xbc, 0xf0, 0xc7, 0x00, + 0x90, 0x0e, 0xbd, 0x08, 0xc2, 0x02, 0xae, 0x0e, 0x8f, 0x39, 0xc4, 0x03, + 0xc8, 0x0e, 0x8f, 0x30, 0xc4, 0x2c, 0x0d, 0x0e, 0x8e, 0x31, 0xc5, 0x02, + 0xc2, 0x0e, 0x8d, 0xf1, 0xc5, 0x01, 0xfc, 0x0e, 0x8d, 0xe8, 0xc4, 0x2c, + 0x0d, 0x0e, 0x8e, 0x21, 0xc5, 0x02, 0xc2, 0x0e, 0x8d, 0xd1, 0xc5, 0x01, + 0xfc, 0x0e, 0x8d, 0xc8, 0x49, 0xaf, 0xd2, 0xc3, 0x98, 0xa4, 0x46, 0x67, + 0x3c, 0x43, 0x98, 0xb0, 0xd0, 0x5b, 0x02, 0x0e, 0x88, 0xe1, 0xca, 0x74, + 0x98, 0x0e, 0x88, 0xd8, 0x4c, 0x7e, 0x07, 0x43, 0x98, 0xbc, 0xcd, 0x7e, + 0x07, 0x0e, 0x8e, 0x48, 0xc5, 0x02, 0xc2, 0x0e, 0x8a, 0xa9, 0xc5, 0x01, + 0xfc, 0x0e, 0x8a, 0xa0, 0x43, 0x11, 0x49, 0xc3, 0x98, 0xc8, 0x45, 0x11, + 0x17, 0xc3, 0x98, 0xda, 0x46, 0x00, 0x2c, 0xc3, 0x98, 0xe6, 0x45, 0x00, + 0x49, 0x43, 0x98, 0xf2, 0x15, 0xc3, 0x98, 0xfe, 0xc8, 0xbe, 0xfa, 0x0e, + 0x8d, 0x61, 0xc6, 0xcd, 0x9d, 0x0e, 0x8d, 0x59, 0x42, 0x00, 0x58, 0xc3, + 0x99, 0x14, 0x16, 0xc3, 0x99, 0x26, 0xc4, 0x93, 0xd1, 0x0e, 0x8c, 0x49, + 0x42, 0x01, 0x09, 0xc3, 0x99, 0x30, 0xc3, 0x07, 0x30, 0x0e, 0x8c, 0x31, + 0xc5, 0xdb, 0x69, 0x0e, 0x8c, 0x11, 0x03, 0xc3, 0x99, 0x3c, 0xc7, 0xc2, + 0x73, 0x0e, 0x8b, 0xfa, 0x03, 0x99, 0x4b, 0xc2, 0x00, 0xfa, 0x0e, 0x8d, + 0xc3, 0x03, 0x99, 0x51, 0x87, 0x0e, 0x8a, 0xe0, 0xa0, 0x0e, 0x8b, 0x61, + 0x9f, 0x0e, 0x8b, 0x59, 0x9e, 0x0e, 0x8b, 0x50, 0xa0, 0x0e, 0x88, 0x79, + 0x9f, 0x0e, 0x88, 0x71, 0x9e, 0x0e, 0x88, 0x68, 0x12, 0xc3, 0x99, 0x57, + 0xc4, 0xe3, 0xab, 0x00, 0xff, 0xd9, 0xc5, 0x28, 0x47, 0x00, 0xff, 0xd1, + 0xc5, 0x6c, 0xa6, 0x00, 0xfb, 0x4b, 0x03, 0x99, 0x66, 0xc5, 0x63, 0xdc, + 0x00, 0x1c, 0x78, 0xc4, 0xe3, 0xab, 0x00, 0xff, 0xc9, 0xc5, 0x28, 0x47, + 0x00, 0xff, 0xc1, 0xc5, 0x6c, 0xa6, 0x00, 0xfa, 0x4b, 0x03, 0x99, 0x6c, + 0xc5, 0xd8, 0xc1, 0x00, 0xfa, 0x43, 0x03, 0x99, 0x72, 0xc5, 0x63, 0xdc, + 0x00, 0x1c, 0x60, 0xc4, 0x28, 0x48, 0x00, 0xff, 0x51, 0xc5, 0xd6, 0x41, + 0x00, 0xff, 0x40, 0xc4, 0x59, 0x13, 0x00, 0xfa, 0xcb, 0x03, 0x99, 0x78, + 0xc8, 0x63, 0xd3, 0x00, 0x1d, 0x58, 0xc4, 0x28, 0x48, 0x00, 0xfe, 0xd1, + 0xc5, 0xd6, 0x41, 0x00, 0xfe, 0xc0, 0xc4, 0x59, 0x13, 0x00, 0xf9, 0xcb, + 0x03, 0x99, 0x7e, 0xc8, 0x63, 0xd3, 0x00, 0x1d, 0x50, 0x45, 0x02, 0x9a, + 0x43, 0x99, 0x84, 0x12, 0xc3, 0x99, 0x96, 0xc4, 0xe3, 0xab, 0x00, 0xfe, + 0x59, 0xc5, 0x28, 0x47, 0x00, 0xfe, 0x51, 0xc5, 0x6c, 0xa6, 0x00, 0xf9, + 0x4b, 0x03, 0x99, 0xa5, 0xc5, 0x63, 0xdc, 0x00, 0x1c, 0x48, 0xc4, 0xe3, + 0xab, 0x00, 0xfe, 0x49, 0xc5, 0x28, 0x47, 0x00, 0xfe, 0x41, 0xc5, 0x6c, + 0xa6, 0x00, 0xf8, 0xcb, 0x03, 0x99, 0xab, 0xc5, 0xd8, 0xc1, 0x00, 0xf8, + 0xc3, 0x03, 0x99, 0xb1, 0xc5, 0x63, 0xdc, 0x00, 0x1c, 0x40, 0x12, 0xc3, + 0x99, 0xb7, 0xc4, 0xe3, 0xab, 0x00, 0xfd, 0xd9, 0x18, 0xc3, 0x99, 0xc6, + 0xc6, 0x60, 0xb1, 0x00, 0xfd, 0xc9, 0xc5, 0x6c, 0xa6, 0x00, 0xf8, 0x4b, + 0x03, 0x99, 0xd2, 0xc5, 0x63, 0xdc, 0x00, 0x1c, 0x30, 0x12, 0xc3, 0x99, + 0xd8, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0xeb, 0x03, 0x99, 0xea, 0xcd, 0x4a, + 0x68, 0x00, 0xff, 0x99, 0xc5, 0x28, 0x47, 0x00, 0xfb, 0xe3, 0x03, 0x99, + 0xf0, 0xc5, 0x6c, 0xa6, 0x00, 0xfb, 0x0b, 0x03, 0x99, 0xf6, 0xc5, 0x63, + 0xdc, 0x00, 0x1e, 0x70, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0xc9, 0xc5, 0x28, + 0x47, 0x00, 0xfb, 0xc1, 0xc5, 0x6c, 0xa6, 0x00, 0xfa, 0x0b, 0x03, 0x99, + 0xfc, 0xc5, 0xd8, 0xc1, 0x00, 0xfa, 0x03, 0x03, 0x9a, 0x02, 0xc5, 0x63, + 0xdc, 0x00, 0x1e, 0x60, 0xc8, 0x63, 0xd3, 0x00, 0x1e, 0x5b, 0x03, 0x9a, + 0x08, 0xc4, 0x59, 0x13, 0x00, 0xfa, 0x8a, 0x03, 0x9a, 0x0e, 0xca, 0x94, + 0x91, 0x00, 
0xff, 0x31, 0xc4, 0x7a, 0x04, 0x00, 0xfa, 0x82, 0x03, 0x9a, + 0x14, 0xc5, 0xd6, 0x41, 0x00, 0xff, 0x01, 0xc4, 0x28, 0x48, 0x00, 0xfb, + 0xd0, 0xc8, 0x63, 0xd3, 0x00, 0x1e, 0x53, 0x03, 0x9a, 0x1a, 0xc4, 0x59, + 0x13, 0x00, 0xf9, 0x8a, 0x03, 0x9a, 0x20, 0xca, 0x94, 0x91, 0x00, 0xfe, + 0xb1, 0xc4, 0x7a, 0x04, 0x00, 0xf9, 0x82, 0x03, 0x9a, 0x26, 0xc5, 0xd6, + 0x41, 0x00, 0xfe, 0x81, 0xc4, 0x28, 0x48, 0x00, 0xfb, 0xb0, 0x12, 0xc3, + 0x9a, 0x2c, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0xab, 0x03, 0x9a, 0x3e, 0xcd, + 0x4a, 0x68, 0x00, 0xfe, 0x19, 0xc5, 0x28, 0x47, 0x00, 0xfb, 0xa3, 0x03, + 0x9a, 0x44, 0xc5, 0x6c, 0xa6, 0x00, 0xf9, 0x0b, 0x03, 0x9a, 0x4a, 0xc5, + 0x63, 0xdc, 0x00, 0x1d, 0x70, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0x99, 0xc5, + 0x28, 0x47, 0x00, 0xfb, 0x91, 0xc5, 0x6c, 0xa6, 0x00, 0xf8, 0x8b, 0x03, + 0x9a, 0x50, 0xc5, 0xd8, 0xc1, 0x00, 0xf8, 0x83, 0x03, 0x9a, 0x56, 0xc5, + 0x63, 0xdc, 0x00, 0x1d, 0x68, 0x12, 0xc3, 0x9a, 0x5c, 0xc4, 0xe3, 0xab, + 0x00, 0xfb, 0x8b, 0x03, 0x9a, 0x6e, 0xcd, 0x4a, 0x68, 0x00, 0xfd, 0x99, + 0x18, 0xc3, 0x9a, 0x74, 0xc6, 0x60, 0xb1, 0x00, 0xfd, 0x89, 0xc5, 0x6c, + 0xa6, 0x00, 0xf8, 0x0b, 0x03, 0x9a, 0x83, 0xc5, 0x63, 0xdc, 0x00, 0x1d, + 0x60, 0xc7, 0xb9, 0xdb, 0x08, 0x0a, 0x59, 0xc7, 0x67, 0xc7, 0x08, 0x0a, + 0x90, 0xc7, 0x0d, 0x04, 0x08, 0x0a, 0x2b, 0x03, 0x9a, 0x89, 0x16, 0xc3, + 0x9a, 0x8d, 0xc7, 0x67, 0xc7, 0x08, 0x0a, 0x78, 0x16, 0xc3, 0x9a, 0x9c, + 0xc7, 0x67, 0xc7, 0x08, 0x0a, 0x88, 0xc7, 0x0d, 0x04, 0x08, 0x0b, 0x51, + 0xc8, 0x4b, 0x94, 0x08, 0x0b, 0x88, 0xc4, 0x0d, 0x0e, 0x08, 0x0b, 0x29, + 0xcb, 0x13, 0xfa, 0x08, 0x0b, 0x58, 0xc8, 0x4b, 0x94, 0x08, 0x0b, 0x91, + 0xc7, 0x0d, 0x04, 0x08, 0x0b, 0x70, 0xc8, 0x0d, 0x03, 0x08, 0x0b, 0x68, + 0xcf, 0x6b, 0x25, 0x08, 0x0b, 0x38, 0xc2, 0xe5, 0xfd, 0x08, 0x1e, 0x68, + 0x11, 0xc3, 0x9a, 0xab, 0xc4, 0x69, 0xaa, 0x0e, 0x7d, 0xca, 0x03, 0x9a, + 0xbd, 0xd4, 0x3e, 0xe4, 0x00, 0xef, 0xf9, 0xd2, 0x4d, 0x8d, 0x00, 0x1a, + 0xb0, 0xc2, 0x01, 0x2d, 0x09, 0x19, 0x99, 0xc3, 0x02, 0x2c, 0x09, 0x19, + 0x90, 0xc9, 0x40, 0xaa, 0x09, 0x12, 0xe8, 0xca, 0x9c, 0x98, 0x09, 0x10, + 0x79, 0xc9, 0x40, 0xaa, 0x09, 0x10, 0x70, 0xc8, 0xaa, 0xf0, 0x09, 0x1c, + 0x51, 0xc4, 0x58, 0xf5, 0x09, 0x10, 0x08, 0xa0, 0x09, 0x10, 0x33, 0x03, + 0x9a, 0xc3, 0x9f, 0x09, 0x10, 0x28, 0xcc, 0x36, 0x65, 0x09, 0x27, 0xa9, + 0xc3, 0x36, 0x6e, 0x09, 0x27, 0xa0, 0xc9, 0xab, 0x25, 0x09, 0x0e, 0x38, + 0x94, 0x09, 0x0e, 0x28, 0xc8, 0x65, 0xd0, 0x09, 0x0f, 0x39, 0x83, 0x09, + 0x0f, 0x30, 0xc2, 0x38, 0xb6, 0x09, 0x0f, 0x19, 0x89, 0x09, 0x0f, 0x10, + 0xc2, 0x5d, 0xd4, 0x09, 0x0e, 0xfb, 0x03, 0x9a, 0xc9, 0x4e, 0x72, 0x8e, + 0xc3, 0x9a, 0xcf, 0xca, 0xa6, 0x16, 0x09, 0x0e, 0xe0, 0xc8, 0xa7, 0xb2, + 0x09, 0x0e, 0xc8, 0x8e, 0x09, 0x0e, 0xb8, 0x8e, 0x09, 0x0e, 0x93, 0x03, + 0x9a, 0xdb, 0xa0, 0x09, 0x0e, 0x88, 0x90, 0x09, 0x0e, 0x80, 0x46, 0x25, + 0xd4, 0x43, 0x9a, 0xe1, 0x8e, 0x09, 0x0e, 0x48, 0xc3, 0x1d, 0x23, 0x09, + 0x0d, 0xe1, 0xc3, 0x1a, 0xf4, 0x09, 0x0d, 0xd9, 0xca, 0xa4, 0x4a, 0x09, + 0x0d, 0xd0, 0x8f, 0x09, 0x26, 0x39, 0x86, 0x09, 0x07, 0x38, 0xc9, 0xab, + 0xd0, 0x09, 0x07, 0x30, 0xc2, 0x04, 0x2b, 0x09, 0x26, 0x31, 0xc2, 0x8d, + 0xc6, 0x09, 0x26, 0x28, 0xca, 0x51, 0xd4, 0x09, 0x26, 0x08, 0x83, 0x09, + 0x25, 0xf1, 0xcc, 0x81, 0x15, 0x09, 0x06, 0x88, 0xc8, 0xaa, 0xef, 0x09, + 0x06, 0x98, 0x46, 0x25, 0xd4, 0x43, 0x9a, 0xed, 0xc7, 0x25, 0xd4, 0x09, + 0x06, 0x78, 0xc6, 0x45, 0xad, 0x09, 0x25, 0xc9, 0xc8, 0x6a, 0x1e, 0x09, + 0x25, 0xc0, 0xc4, 0x39, 0xc8, 0x09, 0x25, 0xb9, 0xc9, 0xa6, 0x49, 0x09, + 0x06, 0x28, 0xc9, 0xab, 0x37, 0x09, 0x05, 0xf0, 0x45, 0x03, 0x55, 0xc3, + 0x9a, 0xf9, 
0x46, 0x1f, 0x67, 0xc3, 0x9b, 0x05, 0x48, 0x0b, 0xc8, 0xc3, + 0x9b, 0x1b, 0xc7, 0x27, 0xb2, 0x0e, 0xc7, 0xd1, 0x45, 0x13, 0x6f, 0xc3, + 0x9b, 0x30, 0xc4, 0x0e, 0x65, 0x0e, 0xc7, 0xb0, 0x46, 0x0e, 0xce, 0xc3, + 0x9b, 0x42, 0x14, 0xc3, 0x9b, 0x64, 0xc6, 0x04, 0xcb, 0x0e, 0xc0, 0x73, + 0x03, 0x9b, 0x70, 0xc6, 0x58, 0xac, 0x0e, 0xc0, 0x5b, 0x03, 0x9b, 0x74, + 0xd0, 0x58, 0xa2, 0x0e, 0xc0, 0x9b, 0x03, 0x9b, 0x78, 0xc4, 0x18, 0xf2, + 0x0e, 0xc0, 0x33, 0x03, 0x9b, 0x7e, 0xc6, 0xcc, 0x41, 0x0e, 0xc0, 0x50, + 0xca, 0x13, 0x91, 0x0e, 0xc6, 0x69, 0xcd, 0x3a, 0x9e, 0x0e, 0xc6, 0x40, + 0xc6, 0x13, 0x95, 0x0e, 0xc6, 0x59, 0x47, 0xc6, 0xcc, 0xc3, 0x9b, 0x84, + 0x05, 0xc3, 0x9b, 0x90, 0xcf, 0x64, 0xb3, 0x0e, 0xc1, 0x80, 0xcb, 0x4d, + 0x82, 0x0e, 0xc6, 0x48, 0x00, 0x43, 0x9b, 0x9c, 0xc6, 0x0e, 0xcd, 0x0e, + 0xc4, 0xe0, 0xc4, 0x0e, 0xcf, 0x0e, 0xc4, 0xd1, 0xcc, 0x86, 0xe5, 0x0e, + 0xc4, 0xc8, 0x00, 0x43, 0x9b, 0xa8, 0xcb, 0x4d, 0x82, 0x0e, 0xc3, 0x1a, + 0x03, 0x9b, 0xb4, 0xca, 0x4d, 0x83, 0x0e, 0xc2, 0xf1, 0xd3, 0x46, 0x57, + 0x0e, 0xc2, 0x6a, 0x03, 0x9b, 0xba, 0x00, 0x43, 0x9b, 0xbe, 0x00, 0x43, + 0x9b, 0xd9, 0x00, 0x43, 0x9b, 0xee, 0xc4, 0x0c, 0x4d, 0x0e, 0xc6, 0x10, + 0xc6, 0x13, 0x95, 0x0e, 0xc5, 0x41, 0xc4, 0x00, 0x9d, 0x0e, 0xc4, 0x48, + 0xc4, 0x0c, 0x4d, 0x0e, 0xc4, 0xf0, 0xc5, 0x0e, 0xce, 0x0e, 0xc7, 0x83, + 0x03, 0x9b, 0xfa, 0xc6, 0x58, 0xac, 0x0e, 0xc6, 0xd9, 0xcb, 0x13, 0x90, + 0x0e, 0xc6, 0x09, 0x47, 0x04, 0xcb, 0x43, 0x9b, 0xfe, 0xc5, 0x06, 0x82, + 0x0e, 0xc5, 0x13, 0x03, 0x9c, 0x0d, 0xc5, 0x0e, 0xce, 0x0e, 0xc4, 0xd8, + 0xcf, 0x69, 0xdb, 0x0e, 0xc4, 0x18, 0xc8, 0xbc, 0x62, 0x0e, 0xc4, 0x09, + 0x46, 0x0e, 0xce, 0x43, 0x9c, 0x13, 0x00, 0x43, 0x9c, 0x1f, 0x00, 0x43, + 0x9c, 0x2b, 0xc7, 0x27, 0xb2, 0x0e, 0xc3, 0x99, 0xc4, 0x0e, 0xe2, 0x0e, + 0xc3, 0x78, 0x00, 0x43, 0x9c, 0x3a, 0xc5, 0x05, 0x74, 0x0e, 0xc2, 0xa0, + 0xc5, 0x18, 0xf1, 0x0e, 0xc6, 0xa8, 0xcb, 0x13, 0x90, 0x0e, 0xc5, 0xd9, + 0xc6, 0x04, 0xcb, 0x0e, 0xc0, 0x7b, 0x03, 0x9c, 0x46, 0xc5, 0x58, 0xac, + 0x0e, 0xc0, 0x69, 0xc4, 0x18, 0xf2, 0x0e, 0xc0, 0x38, 0xc5, 0xdd, 0x17, + 0x0e, 0xcd, 0x69, 0xca, 0x9e, 0x8c, 0x0e, 0xcd, 0x30, 0xc5, 0x17, 0x14, + 0x0e, 0xcc, 0x73, 0x03, 0x9c, 0x4a, 0xc6, 0x01, 0xdb, 0x0e, 0xcc, 0x69, + 0xc5, 0x03, 0x13, 0x0e, 0xcc, 0x60, 0xc6, 0x01, 0xdb, 0x0e, 0xcc, 0x89, + 0xc5, 0x03, 0x13, 0x0e, 0xcc, 0x80, 0xc2, 0x00, 0x15, 0x0e, 0xcc, 0x58, + 0xcb, 0x57, 0xc7, 0x0f, 0xc1, 0x79, 0xca, 0xa0, 0x08, 0x0f, 0xc1, 0x59, + 0x49, 0xa8, 0xdc, 0xc3, 0x9c, 0x50, 0xd8, 0x24, 0xb3, 0x01, 0x5b, 0xe9, + 0xcc, 0x84, 0x09, 0x0f, 0xc1, 0x19, 0xcc, 0x82, 0x1d, 0x0f, 0xc1, 0x39, + 0xd0, 0x57, 0xc2, 0x0f, 0xc1, 0x98, 0xe0, 0x09, 0x47, 0x01, 0x5c, 0x18, + 0xcf, 0x2c, 0x35, 0x01, 0x5b, 0xe1, 0xd1, 0x01, 0x68, 0x01, 0x5b, 0xe0, + 0xc7, 0x09, 0x0d, 0x01, 0x5d, 0x29, 0xc9, 0x03, 0xc8, 0x01, 0x5d, 0x38, + 0xcf, 0x2c, 0x35, 0x01, 0x48, 0xb9, 0xd6, 0x2d, 0x62, 0x01, 0x48, 0xc0, + 0xc8, 0x62, 0x44, 0x01, 0x4b, 0x61, 0xdd, 0x10, 0xdd, 0x01, 0x4b, 0x40, + 0xe0, 0x06, 0xe7, 0x01, 0x4b, 0x20, 0xcc, 0x00, 0xfb, 0x07, 0xe8, 0x51, + 0xcb, 0x10, 0xb5, 0x07, 0xe9, 0x70, 0x45, 0x19, 0x60, 0xc3, 0x9c, 0x5c, + 0xce, 0x43, 0x77, 0x07, 0xed, 0x50, 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0x59, + 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x50, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x61, + 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0x68, 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0x29, + 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x20, 0xdc, 0x14, 0x69, 0x07, 0xea, 0x61, + 0xd2, 0x49, 0x9d, 0x07, 0xef, 0xd0, 0xe0, 0x00, 0xe7, 0x07, 0xef, 0x80, + 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x89, 0xcc, 0x10, 0xb4, 0x07, 0xeb, 0x90, + 0xca, 0x26, 
0xf7, 0x07, 0xea, 0x89, 0xcc, 0x10, 0xb4, 0x07, 0xea, 0x90, + 0xca, 0x26, 0xf7, 0x07, 0xe3, 0x49, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x20, + 0xca, 0x26, 0xf7, 0x07, 0xdf, 0xa9, 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0xa0, + 0x48, 0x06, 0x5f, 0xc3, 0x9c, 0x68, 0xca, 0x26, 0xf7, 0x07, 0xdf, 0x59, + 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x50, 0xca, 0x26, 0xf7, 0x07, 0xdf, 0x69, + 0xcd, 0x00, 0xfa, 0x07, 0xdf, 0x60, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x11, + 0xcb, 0x10, 0xb5, 0x07, 0xe4, 0xd0, 0xcc, 0x00, 0xfb, 0x07, 0xe0, 0x09, + 0xcb, 0x10, 0xb5, 0x07, 0xe4, 0xc0, 0xcb, 0x64, 0x7b, 0x07, 0xe7, 0x01, + 0xcc, 0x10, 0xb4, 0x07, 0xe4, 0xd8, 0xcb, 0x10, 0xb5, 0x07, 0xdf, 0xc1, + 0xcc, 0x00, 0xfb, 0x07, 0xdf, 0xb0, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0xf1, + 0xcc, 0x10, 0xb4, 0x07, 0xed, 0xa0, 0xcf, 0x0e, 0x7d, 0x00, 0x31, 0xf9, + 0xcd, 0x04, 0xe7, 0x00, 0x31, 0xf0, 0xca, 0x09, 0x9d, 0x00, 0x3b, 0xb9, + 0x16, 0x43, 0x9c, 0x74, 0xc5, 0x05, 0x02, 0x00, 0x35, 0x1b, 0x03, 0x9c, + 0x80, 0xcb, 0x98, 0xb0, 0x00, 0x35, 0x10, 0x4a, 0x0e, 0x7d, 0xc3, 0x9c, + 0x86, 0xcd, 0x04, 0xfa, 0x00, 0x3b, 0x00, 0xcf, 0x0e, 0x7d, 0x00, 0x35, + 0xa1, 0xcd, 0x04, 0xfa, 0x00, 0x35, 0x90, 0xd7, 0x2b, 0x3a, 0x00, 0x46, + 0x39, 0x98, 0x00, 0x35, 0xa8, 0xc8, 0xa7, 0x26, 0x00, 0x45, 0x31, 0xc7, + 0x16, 0x16, 0x00, 0x35, 0xb0, 0xc5, 0x05, 0x02, 0x00, 0x35, 0xc1, 0xc5, + 0x00, 0xd4, 0x00, 0x35, 0xb8, 0xc5, 0x05, 0x02, 0x00, 0x46, 0x31, 0xc5, + 0x00, 0xd4, 0x00, 0x46, 0x28, 0xc5, 0x05, 0x02, 0x00, 0x45, 0x99, 0xc5, + 0x00, 0xd4, 0x00, 0x35, 0x01, 0xd8, 0x26, 0x03, 0x00, 0x3a, 0xf0, 0xc5, + 0x00, 0xd4, 0x00, 0x3a, 0xe9, 0xd0, 0x25, 0x7b, 0x00, 0x3a, 0xf8, 0x49, + 0xb2, 0xab, 0xc3, 0x9c, 0x92, 0xd3, 0x45, 0x3a, 0x00, 0x43, 0x93, 0x03, + 0x9c, 0xba, 0xc9, 0x16, 0x14, 0x00, 0x43, 0xd1, 0xd2, 0x4e, 0x53, 0x00, + 0x43, 0x99, 0x4b, 0x5e, 0x02, 0xc3, 0x9c, 0xc0, 0x46, 0x08, 0x09, 0xc3, + 0x9c, 0xcc, 0xcb, 0x82, 0x59, 0x00, 0x31, 0x13, 0x03, 0x9c, 0xde, 0x5d, + 0x10, 0x12, 0x43, 0x9c, 0xe2, 0x00, 0x43, 0x9c, 0xee, 0xcd, 0x00, 0xfa, + 0x07, 0xf7, 0x79, 0xca, 0x26, 0xf7, 0x07, 0xf7, 0x80, 0x48, 0x04, 0xe7, + 0xc3, 0x9c, 0xfa, 0x4a, 0x0e, 0x7d, 0x43, 0x9d, 0x06, 0x44, 0x05, 0x18, + 0xc3, 0x9d, 0x18, 0x16, 0xc3, 0x9d, 0x24, 0xc4, 0x00, 0x9d, 0x00, 0x35, + 0x58, 0xcb, 0x10, 0xb5, 0x07, 0xf6, 0xd9, 0xcc, 0x00, 0xfb, 0x07, 0xf6, + 0xc8, 0xcb, 0x10, 0xb5, 0x07, 0xdc, 0x01, 0xcc, 0x00, 0xfb, 0x07, 0xdb, + 0xf0, 0xcb, 0x10, 0xb5, 0x07, 0xdb, 0x01, 0xcc, 0x00, 0xfb, 0x07, 0xda, + 0xf0, 0x98, 0x00, 0x45, 0xf9, 0xc9, 0xad, 0xda, 0x00, 0x45, 0xc0, 0x00, + 0x43, 0x9d, 0x30, 0xcb, 0x10, 0xb5, 0x07, 0xdb, 0x21, 0xcc, 0x00, 0xfb, + 0x07, 0xdb, 0x10, 0xcd, 0x04, 0xe7, 0x00, 0x45, 0x19, 0x4a, 0x0e, 0x7d, + 0x43, 0x9d, 0x42, 0xcc, 0x00, 0xfb, 0x07, 0xf4, 0xc9, 0xcb, 0x10, 0xb5, + 0x07, 0xf4, 0xd8, 0x52, 0x16, 0x02, 0xc3, 0x9d, 0x4e, 0xcf, 0x67, 0xce, + 0x00, 0x36, 0x89, 0xc3, 0x14, 0xa7, 0x00, 0x36, 0x68, 0x00, 0x43, 0x9d, + 0x60, 0x45, 0x00, 0x8c, 0xc3, 0x9d, 0x70, 0xca, 0x26, 0xf7, 0x07, 0xdd, + 0x79, 0xcd, 0x00, 0xfa, 0x07, 0xdd, 0x70, 0x45, 0x03, 0x14, 0xc3, 0x9d, + 0x7f, 0xc5, 0x01, 0x74, 0x00, 0x3a, 0xd8, 0xc5, 0x00, 0xd4, 0x00, 0x34, + 0xb9, 0xd0, 0x25, 0x7b, 0x00, 0x3b, 0x58, 0xce, 0x16, 0x0f, 0x00, 0x34, + 0xb0, 0xca, 0xa6, 0xd4, 0x00, 0x45, 0xb1, 0x98, 0x00, 0x3a, 0xb2, 0x03, + 0x9d, 0x92, 0xdb, 0x16, 0x02, 0x00, 0x3a, 0xa1, 0x4a, 0x0e, 0x7d, 0x43, + 0x9d, 0x98, 0xcb, 0x10, 0xb5, 0x07, 0xda, 0xe1, 0xcc, 0x00, 0xfb, 0x07, + 0xda, 0xd0, 0xd2, 0x49, 0xc1, 0x00, 0x45, 0xa8, 0xc5, 0x05, 0x02, 0x00, + 0x45, 0x79, 0xc5, 0x00, 0xd4, 0x00, 0x34, 0xf0, 0xcf, 0x67, 0xce, 0x00, + 0x34, 0xdb, 
0x03, 0x9d, 0xa7, 0xd8, 0x25, 0x73, 0x00, 0x3b, 0x68, 0xe0, + 0x0a, 0x67, 0x00, 0x3b, 0xe8, 0xe0, 0x02, 0x87, 0x00, 0x3b, 0x80, 0x16, + 0xc3, 0x9d, 0xad, 0x49, 0x1d, 0x6f, 0xc3, 0x9d, 0xb9, 0xcf, 0x3b, 0x79, + 0x00, 0x34, 0x81, 0xc9, 0x0e, 0x6e, 0x00, 0x34, 0x53, 0x03, 0x9d, 0xc3, + 0xc4, 0x00, 0x9d, 0x00, 0x34, 0x48, 0xcc, 0x00, 0xfb, 0x07, 0xf5, 0xe9, + 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0xf8, 0xcd, 0x00, 0xfa, 0x07, 0xf4, 0x29, + 0xca, 0x26, 0xf7, 0x07, 0xf4, 0x30, 0xc5, 0x00, 0xd4, 0x00, 0x3b, 0x29, + 0xc5, 0x05, 0x02, 0x00, 0x3b, 0x30, 0xcb, 0x10, 0xb5, 0x07, 0xdc, 0xe1, + 0xcc, 0x00, 0xfb, 0x07, 0xdc, 0xd0, 0xcf, 0x0e, 0x7d, 0x00, 0x35, 0xe9, + 0xcd, 0x04, 0xfa, 0x00, 0x3b, 0x38, 0xc3, 0x02, 0x97, 0x00, 0x3b, 0x41, + 0x98, 0x00, 0x3b, 0x48, 0xcd, 0x00, 0xfa, 0x07, 0xdc, 0xf1, 0xca, 0x26, + 0xf7, 0x07, 0xdc, 0xf8, 0xd6, 0x31, 0x56, 0x00, 0x44, 0x51, 0x16, 0xc3, + 0x9d, 0xc9, 0xcb, 0x08, 0x09, 0x00, 0x34, 0x09, 0x46, 0x09, 0x3f, 0xc3, + 0x9d, 0xd5, 0x58, 0x24, 0x9b, 0x43, 0x9d, 0xdb, 0xcc, 0x00, 0xfb, 0x07, + 0xf5, 0x09, 0xcb, 0x10, 0xb5, 0x07, 0xf5, 0x18, 0xcb, 0x64, 0x7b, 0x07, + 0xdd, 0x69, 0x0b, 0xc3, 0x9d, 0xe5, 0xca, 0x26, 0xf7, 0x07, 0xdd, 0x58, + 0xcb, 0x64, 0x7b, 0x07, 0xdd, 0x49, 0x0b, 0xc3, 0x9d, 0xf1, 0xca, 0x26, + 0xf7, 0x07, 0xdd, 0x39, 0xd0, 0x5f, 0x02, 0x00, 0x36, 0x10, 0x00, 0x43, + 0x9d, 0xfd, 0xcf, 0x04, 0xd8, 0x0f, 0xdd, 0x23, 0x03, 0x9e, 0x09, 0xe0, + 0x04, 0xc7, 0x0f, 0xdd, 0x40, 0xcf, 0x04, 0xd8, 0x0f, 0xdd, 0x2b, 0x03, + 0x9e, 0x0f, 0xdf, 0x0c, 0xe1, 0x0f, 0xdd, 0x48, 0xd3, 0x45, 0x4d, 0x0f, + 0xd1, 0x89, 0xcf, 0x18, 0x0f, 0x0f, 0xd1, 0xe8, 0x96, 0x0b, 0x4b, 0xb8, + 0xc2, 0x10, 0x11, 0x0b, 0x47, 0xc8, 0xa5, 0x01, 0x45, 0xf9, 0xa4, 0x01, + 0x43, 0xfa, 0x03, 0x9e, 0x15, 0xa5, 0x01, 0x46, 0xf8, 0xa5, 0x01, 0x47, + 0x78, 0xa5, 0x01, 0x47, 0xb8, 0xa5, 0x01, 0x47, 0xd8, 0xa5, 0x01, 0x47, + 0xe8, 0xa5, 0x01, 0x47, 0xf0, 0xc7, 0x09, 0x0d, 0x01, 0x5d, 0x21, 0xc9, + 0x03, 0xc8, 0x01, 0x5d, 0x30, 0xcf, 0x2c, 0x35, 0x01, 0x5b, 0xd1, 0xd1, + 0x01, 0x68, 0x01, 0x5b, 0xd0, 0xcf, 0x2c, 0x35, 0x01, 0x59, 0xe1, 0xd6, + 0x2d, 0x62, 0x01, 0x59, 0xe8, 0xc8, 0x62, 0x44, 0x01, 0x4b, 0x51, 0xdf, + 0x09, 0x68, 0x01, 0x4b, 0x10, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0xa9, 0xc8, + 0x74, 0x8c, 0x00, 0x13, 0xd3, 0x03, 0x9e, 0x19, 0xcc, 0x1e, 0xc1, 0x05, + 0x5b, 0x41, 0xc4, 0x01, 0x23, 0x00, 0x13, 0xd9, 0xc4, 0x14, 0xa6, 0x01, + 0x63, 0xc8, 0x46, 0x00, 0x8b, 0x43, 0x9e, 0x1f, 0xcc, 0x23, 0x3f, 0x05, + 0x5a, 0x20, 0xc9, 0xa9, 0x2d, 0x00, 0x15, 0x78, 0xc9, 0x0e, 0x6e, 0x00, + 0xf7, 0x19, 0xc5, 0x1e, 0xc8, 0x00, 0xf7, 0x09, 0xca, 0x9e, 0x5a, 0x00, + 0xf6, 0xf9, 0xc5, 0x1f, 0x0c, 0x00, 0xf6, 0xe9, 0xc5, 0x31, 0xee, 0x00, + 0xf6, 0xd8, 0xc9, 0x0e, 0x6e, 0x00, 0xf6, 0xc9, 0xc5, 0x1e, 0xc8, 0x00, + 0xf6, 0xb9, 0xca, 0x9e, 0x5a, 0x00, 0xf6, 0xa9, 0xc5, 0x1f, 0x0c, 0x00, + 0xf6, 0x99, 0xc5, 0x31, 0xee, 0x00, 0xf6, 0x88, 0xc5, 0x05, 0x02, 0x00, + 0xf6, 0x69, 0xc5, 0x00, 0xd4, 0x00, 0x11, 0x7a, 0x03, 0x9e, 0x2b, 0xc5, + 0x31, 0xee, 0x00, 0x0a, 0x89, 0xc5, 0x1f, 0x0c, 0x00, 0x10, 0x68, 0xce, + 0x01, 0x19, 0x05, 0x5b, 0x31, 0xc4, 0x00, 0x32, 0x00, 0x15, 0x28, 0xc9, + 0x20, 0xb1, 0x00, 0x14, 0x20, 0xc3, 0x00, 0x33, 0x00, 0x14, 0x99, 0xc4, + 0x65, 0xe2, 0x00, 0x0f, 0x78, 0x44, 0x02, 0x9b, 0xc3, 0x9e, 0x31, 0xc5, + 0x05, 0x02, 0x00, 0xf0, 0xc8, 0xc5, 0x05, 0x02, 0x00, 0xf5, 0xc9, 0xc5, + 0x00, 0xd4, 0x00, 0x08, 0xb8, 0x45, 0x02, 0x9a, 0x43, 0x9e, 0x4f, 0xc9, + 0x64, 0x14, 0x00, 0xf2, 0xf9, 0xc7, 0x74, 0x8d, 0x00, 0x13, 0xe0, 0x42, + 0x00, 0x30, 0xc3, 0x9e, 0x5b, 0xca, 0x1f, 0x07, 0x00, 0x10, 0x88, 0xcb, + 0x4d, 0x16, 
0x05, 0x5a, 0x49, 0xc6, 0xbd, 0xf4, 0x00, 0x0a, 0xb9, 0xc4, + 0x65, 0xe2, 0x00, 0x0a, 0xc8, 0x45, 0x02, 0x9a, 0x43, 0x9e, 0x6a, 0xc7, + 0x0e, 0x70, 0x00, 0xf7, 0x29, 0x45, 0x00, 0x5a, 0x43, 0x9e, 0x88, 0x00, + 0x43, 0x9e, 0x94, 0xc9, 0x9b, 0xdb, 0x00, 0xf3, 0xc9, 0xc5, 0x05, 0x02, + 0x00, 0xf3, 0xa8, 0xc6, 0x05, 0x01, 0x00, 0xf3, 0xb8, 0xc9, 0x0e, 0x6e, + 0x00, 0xf5, 0xb9, 0xc5, 0x1e, 0xc8, 0x00, 0xf5, 0xa9, 0xca, 0x9e, 0x5a, + 0x00, 0xf5, 0x99, 0xc5, 0x1f, 0x0c, 0x00, 0xf5, 0x89, 0xc5, 0x31, 0xee, + 0x00, 0xf5, 0x78, 0x45, 0x02, 0x9a, 0x43, 0x9e, 0xa0, 0x42, 0x00, 0x30, + 0xc3, 0x9e, 0xbe, 0xca, 0x1f, 0x07, 0x00, 0x10, 0x08, 0xcb, 0x98, 0x58, + 0x00, 0x0e, 0xf8, 0xcd, 0x61, 0x8b, 0x00, 0xf4, 0xd1, 0x43, 0x00, 0x75, + 0x43, 0x9e, 0xcd, 0xca, 0x25, 0x08, 0x05, 0x5a, 0xc9, 0xd2, 0x4c, 0x5b, + 0x05, 0x5a, 0xc0, 0xc5, 0x05, 0x02, 0x00, 0xf2, 0x39, 0xc5, 0x00, 0xd4, + 0x00, 0xf2, 0x28, 0xc9, 0x0e, 0x6e, 0x00, 0xf7, 0x81, 0xc5, 0x1e, 0xc8, + 0x00, 0xf7, 0x71, 0xca, 0x9e, 0x5a, 0x00, 0xf7, 0x61, 0xc5, 0x1f, 0x0c, + 0x00, 0xf7, 0x51, 0xc5, 0x31, 0xee, 0x00, 0xf7, 0x40, 0xc5, 0x31, 0xee, + 0x00, 0x0b, 0x81, 0xc5, 0x1f, 0x0c, 0x00, 0x10, 0xa0, 0xc5, 0x05, 0x02, + 0x00, 0xf3, 0x91, 0x44, 0x02, 0x9b, 0x43, 0x9e, 0xdc, 0xcb, 0x98, 0x58, + 0x00, 0x11, 0x80, 0xc9, 0x0e, 0x6e, 0x00, 0xf6, 0x31, 0xc5, 0x1e, 0xc8, + 0x00, 0xf6, 0x21, 0xca, 0x9e, 0x5a, 0x00, 0xf6, 0x11, 0xc5, 0x1f, 0x0c, + 0x00, 0xf6, 0x01, 0xc5, 0x31, 0xee, 0x00, 0xf5, 0xf0, 0xcb, 0x98, 0x58, + 0x00, 0x0f, 0x00, 0xcb, 0x98, 0x58, 0x00, 0xf2, 0xe0, 0x16, 0xc3, 0x9e, + 0xf4, 0xc6, 0x8e, 0xde, 0x00, 0x89, 0x11, 0xc5, 0x79, 0xf2, 0x00, 0x89, + 0x21, 0xc5, 0xdb, 0xff, 0x00, 0x89, 0x30, 0x87, 0x00, 0x8c, 0x28, 0xc4, + 0xad, 0x2b, 0x00, 0x8e, 0x61, 0xc5, 0x90, 0xe4, 0x06, 0xbe, 0xb0, 0xc4, + 0xad, 0x2b, 0x00, 0x8e, 0x99, 0xc5, 0x90, 0xe4, 0x00, 0x8e, 0xa1, 0xc6, + 0xc0, 0x7c, 0x06, 0xbe, 0xc9, 0xc7, 0xba, 0x7b, 0x06, 0xbe, 0xd0, 0x02, + 0x43, 0x9f, 0x00, 0xc4, 0xe4, 0x57, 0x01, 0x9f, 0xf0, 0xc3, 0x05, 0x14, + 0x01, 0x9b, 0x69, 0x16, 0xc3, 0x9f, 0x1e, 0xc4, 0x09, 0x9d, 0x01, 0x9b, + 0x80, 0xc4, 0x01, 0x23, 0x00, 0x15, 0xa9, 0xc8, 0x74, 0x8c, 0x08, 0x3d, + 0x32, 0x03, 0x9f, 0x2a, 0xc4, 0x23, 0x2e, 0x0e, 0x8b, 0x19, 0xc4, 0x2c, + 0x0d, 0x0e, 0x8a, 0x08, 0xc4, 0x23, 0x2e, 0x0e, 0x8b, 0x09, 0xc4, 0x2c, + 0x0d, 0x0e, 0x89, 0xf8, 0xa0, 0x0e, 0x8e, 0x71, 0x9f, 0x0e, 0x8e, 0x69, + 0x9e, 0x0e, 0x8e, 0x60, 0x46, 0x00, 0x2c, 0xc3, 0x9f, 0x30, 0xc5, 0x02, + 0xc2, 0x0e, 0x8a, 0x49, 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x40, 0xc5, 0x02, + 0xc2, 0x0e, 0x8a, 0x79, 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x70, 0xc5, 0x02, + 0xc2, 0x0e, 0x8a, 0x69, 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x60, 0xc5, 0x02, + 0xc2, 0x0e, 0x8a, 0x59, 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x50, 0xcd, 0x7f, + 0x18, 0x0e, 0x8d, 0x69, 0xc4, 0xe4, 0xd3, 0x0e, 0x8c, 0x41, 0x16, 0xc3, + 0x9f, 0x3c, 0xd0, 0x5f, 0x42, 0x0e, 0x8b, 0x30, 0xc6, 0xd1, 0xc3, 0x0e, + 0x8d, 0x51, 0xcb, 0x91, 0x62, 0x0e, 0x8c, 0x51, 0xc2, 0x00, 0x8d, 0x0e, + 0x8c, 0x28, 0x14, 0xc3, 0x9f, 0x48, 0xc5, 0xd9, 0xac, 0x0e, 0x8b, 0xe8, + 0xc2, 0x00, 0x7e, 0x0e, 0x8c, 0x39, 0x43, 0xe5, 0x96, 0x43, 0x9f, 0x54, + 0xc5, 0x09, 0x02, 0x0e, 0x8b, 0xdb, 0x03, 0x9f, 0x68, 0xcf, 0x65, 0x67, + 0x0e, 0x8b, 0x68, 0xc9, 0xab, 0x1c, 0x0e, 0x8c, 0x00, 0xc5, 0x5e, 0x2d, + 0x0e, 0x8e, 0x18, 0xcd, 0x42, 0x94, 0x00, 0xff, 0xe1, 0xc4, 0x7a, 0x04, + 0x00, 0xfb, 0x42, 0x03, 0x9f, 0x6e, 0x45, 0x02, 0x9a, 0x43, 0x9f, 0x74, + 0x45, 0x02, 0x9a, 0x43, 0x9f, 0x8a, 0x45, 0x02, 0x9a, 0x43, 0x9f, 0x96, + 0x45, 0x02, 0x9a, 0x43, 0x9f, 0xa2, 0x45, 0x02, 0x9a, 0x43, 0x9f, 0xb4, + 0xcb, 0x94, 
0x90, 0x00, 0xf9, 0xf1, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0xe1, + 0xc5, 0x28, 0x47, 0x00, 0xf9, 0xd0, 0xcd, 0x42, 0x94, 0x00, 0xfe, 0x61, + 0xc4, 0x7a, 0x04, 0x00, 0xf9, 0x42, 0x03, 0x9f, 0xc6, 0x45, 0x02, 0x9a, + 0x43, 0x9f, 0xcc, 0x45, 0x02, 0x9a, 0x43, 0x9f, 0xe2, 0x45, 0x02, 0x9a, + 0x43, 0x9f, 0xee, 0xcd, 0x42, 0x94, 0x00, 0xfd, 0xe1, 0xc4, 0x7a, 0x04, + 0x00, 0xf8, 0x42, 0x03, 0x9f, 0xfa, 0xc4, 0x28, 0x48, 0x00, 0xfd, 0xd1, + 0xc5, 0xd6, 0x41, 0x00, 0xfd, 0xc0, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x00, + 0xca, 0x94, 0x91, 0x00, 0xff, 0xb3, 0x03, 0xa0, 0x16, 0xc4, 0x7a, 0x04, + 0x00, 0xfb, 0x02, 0x03, 0xa0, 0x1c, 0xd2, 0x4a, 0x63, 0x00, 0xff, 0xa0, + 0xd2, 0x4a, 0x63, 0x00, 0xff, 0x90, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x22, + 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x43, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x4f, + 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x5b, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x73, + 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x85, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0x97, + 0x45, 0x02, 0x9a, 0x43, 0xa0, 0xaf, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0xc1, + 0xca, 0x94, 0x91, 0x00, 0xfe, 0x33, 0x03, 0xa0, 0xd3, 0xc4, 0x7a, 0x04, + 0x00, 0xf9, 0x02, 0x03, 0xa0, 0xd9, 0xd2, 0x4a, 0x63, 0x00, 0xfe, 0x20, + 0xd2, 0x4a, 0x63, 0x00, 0xfe, 0x10, 0x45, 0x02, 0x9a, 0x43, 0xa0, 0xdf, + 0x45, 0x02, 0x9a, 0x43, 0xa1, 0x00, 0x45, 0x02, 0x9a, 0x43, 0xa1, 0x0c, + 0xca, 0x94, 0x91, 0x00, 0xfd, 0xb3, 0x03, 0xa1, 0x18, 0xc4, 0x7a, 0x04, + 0x00, 0xf8, 0x02, 0x03, 0xa1, 0x1e, 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0xa0, + 0xc4, 0x28, 0x48, 0x00, 0xfb, 0x83, 0x03, 0xa1, 0x24, 0xc5, 0xd6, 0x41, + 0x00, 0xfd, 0x80, 0x45, 0x02, 0x9a, 0x43, 0xa1, 0x2a, 0x00, 0x43, 0xa1, + 0x4b, 0xc7, 0x33, 0xe6, 0x08, 0x0a, 0x33, 0x03, 0xa1, 0x57, 0xc6, 0xb9, + 0xdc, 0x08, 0x0a, 0x40, 0xc7, 0x33, 0xe6, 0x08, 0x0a, 0x3b, 0x03, 0xa1, + 0x5d, 0xc6, 0xb9, 0xdc, 0x08, 0x0a, 0x50, 0xca, 0xa6, 0x70, 0x0e, 0x7d, + 0xe3, 0x03, 0xa1, 0x63, 0xc9, 0x92, 0x8d, 0x0e, 0x7d, 0xd2, 0x03, 0xa1, + 0x69, 0xd6, 0x2d, 0x0a, 0x0e, 0x7d, 0xb8, 0xc9, 0x40, 0xaa, 0x09, 0x10, + 0x38, 0xca, 0xa3, 0x1e, 0x09, 0x0f, 0x00, 0xc4, 0x58, 0xf5, 0x09, 0x0e, + 0xf1, 0xca, 0x9e, 0x46, 0x09, 0x0e, 0xe8, 0xcf, 0x6a, 0xbc, 0x09, 0x0e, + 0x98, 0xc2, 0x10, 0x37, 0x09, 0x0e, 0x71, 0xc2, 0x00, 0xd0, 0x09, 0x0e, + 0x68, 0xc2, 0x02, 0x6f, 0x09, 0x25, 0xe9, 0xc2, 0x01, 0xdd, 0x09, 0x25, + 0xe0, 0xd4, 0x3a, 0x98, 0x0e, 0xc8, 0x11, 0xcb, 0x92, 0xa1, 0x0e, 0xc7, + 0xf8, 0xcc, 0x18, 0xdb, 0x0e, 0xc8, 0x09, 0x16, 0xc3, 0xa1, 0x6f, 0xc9, + 0xad, 0x9b, 0x0e, 0xc4, 0x99, 0xca, 0xa1, 0x70, 0x0e, 0xc0, 0x40, 0xcb, + 0x13, 0x90, 0x0e, 0xc7, 0xe9, 0xcb, 0x13, 0x89, 0x0e, 0xc7, 0xe1, 0xcc, + 0x85, 0x95, 0x0e, 0xc7, 0xda, 0x03, 0xa1, 0x7b, 0xc4, 0x18, 0xf2, 0x0e, + 0xc7, 0xc9, 0xc9, 0x13, 0x9c, 0x0e, 0xc7, 0xc1, 0xc8, 0x1e, 0x56, 0x0e, + 0xc7, 0xb8, 0x05, 0xc3, 0xa1, 0x81, 0xc4, 0x01, 0x23, 0x0e, 0xc7, 0x33, + 0x03, 0xa1, 0x8e, 0x4e, 0x6b, 0xd4, 0xc3, 0xa1, 0x94, 0xc4, 0x0e, 0xe2, + 0x0e, 0xc6, 0xe3, 0x03, 0xa1, 0xa0, 0x47, 0xc6, 0xcc, 0x43, 0xa1, 0xa4, + 0xca, 0x13, 0x91, 0x0e, 0xc5, 0xd1, 0xcd, 0x3a, 0x9e, 0x0e, 0xc0, 0x48, + 0x00, 0x43, 0xa1, 0xb0, 0x00, 0x43, 0xa1, 0xe5, 0x47, 0x0e, 0xcd, 0x43, + 0xa1, 0xf4, 0xcc, 0x8a, 0x5d, 0x0e, 0xc0, 0xe8, 0xc8, 0x64, 0xba, 0x0e, + 0xc2, 0x11, 0x4a, 0x9b, 0x1c, 0x43, 0xa2, 0x00, 0x4d, 0x76, 0xc4, 0xc3, + 0xa2, 0x0c, 0xce, 0x70, 0xdc, 0x0e, 0xc1, 0xb0, 0xcf, 0x3a, 0x9d, 0x0e, + 0xc5, 0xb1, 0xc9, 0x13, 0x9c, 0x0e, 0xc5, 0xa8, 0xce, 0x70, 0xea, 0x0e, + 0xc4, 0x89, 0x47, 0xc6, 0x63, 0x43, 0xa2, 0x18, 0xc5, 0x18, 0xf1, 0x0e, + 0xc3, 0x20, 0x00, 0x43, 0xa2, 0x24, 0xc6, 0x58, 0xac, 0x0e, 0xc2, 0xbb, + 0x03, 0xa2, 
0x30, 0xcd, 0x27, 0xac, 0x0e, 0xc2, 0x91, 0xc4, 0x18, 0xf2, + 0x0e, 0xc2, 0x81, 0xc9, 0xb4, 0x40, 0x0e, 0xc2, 0x70, 0xc9, 0x13, 0x9c, + 0x0e, 0xc2, 0x3b, 0x03, 0xa2, 0x34, 0xc6, 0x58, 0xac, 0x0e, 0xc2, 0x31, + 0xc4, 0x18, 0xf2, 0x0e, 0xc2, 0x28, 0xc2, 0x00, 0x74, 0x0e, 0xc7, 0x99, + 0xc3, 0x00, 0xa3, 0x0e, 0xc7, 0x90, 0x00, 0x43, 0xa2, 0x3a, 0xc6, 0x13, + 0x95, 0x0e, 0xc5, 0x31, 0xc4, 0x00, 0x9d, 0x0e, 0xc4, 0x42, 0x03, 0xa2, + 0x4a, 0xc6, 0x0e, 0xcd, 0x0e, 0xc4, 0xe8, 0xc4, 0x0e, 0xe2, 0x0e, 0xc3, + 0xf9, 0xc7, 0x27, 0xb2, 0x0e, 0xc3, 0xe0, 0xc2, 0x00, 0x74, 0x0e, 0xc6, + 0xc9, 0xc3, 0x00, 0xa3, 0x0e, 0xc6, 0xc0, 0xc5, 0x0e, 0xce, 0x0e, 0xc7, + 0x63, 0x03, 0xa2, 0x50, 0xcb, 0x13, 0x90, 0x0e, 0xc6, 0x00, 0x46, 0x0e, + 0xce, 0xc3, 0xa2, 0x56, 0xc8, 0xbc, 0x62, 0x0e, 0xc3, 0x80, 0x00, 0x43, + 0xa2, 0x62, 0xc2, 0x00, 0x15, 0x0e, 0xcc, 0x78, 0xca, 0x03, 0x87, 0x01, + 0x5d, 0x09, 0xc9, 0x01, 0x88, 0x01, 0x5d, 0x00, 0xcc, 0x10, 0xb4, 0x07, + 0xeb, 0x41, 0xca, 0x26, 0xf7, 0x07, 0xeb, 0x38, 0xca, 0x26, 0xf7, 0x07, + 0xe3, 0x41, 0xcd, 0x00, 0xfa, 0x07, 0xe0, 0x18, 0xca, 0x9f, 0x72, 0x00, + 0x3b, 0xb1, 0xc8, 0xbf, 0x42, 0x00, 0x3b, 0xa8, 0xd5, 0x0e, 0x77, 0x00, + 0x45, 0x20, 0xc5, 0x05, 0x02, 0x00, 0x35, 0x29, 0xd6, 0x2d, 0x8e, 0x00, + 0x3b, 0x08, 0x45, 0x00, 0x49, 0xc3, 0xa2, 0x7a, 0x14, 0xc3, 0xa2, 0x86, + 0xd2, 0x4d, 0x33, 0x00, 0x43, 0xab, 0x03, 0xa2, 0x92, 0xcf, 0x63, 0x69, + 0x00, 0x43, 0x8b, 0x03, 0xa2, 0x98, 0xc5, 0x4d, 0x40, 0x00, 0x43, 0xa1, + 0xc5, 0x63, 0x73, 0x00, 0x43, 0x80, 0x45, 0x02, 0x9a, 0x43, 0xa2, 0x9e, + 0xc5, 0x05, 0x02, 0x00, 0x33, 0x99, 0xc5, 0x00, 0xd4, 0x00, 0x33, 0x90, + 0xc5, 0x05, 0x02, 0x00, 0x31, 0x2b, 0x03, 0xa2, 0xaa, 0xc5, 0x00, 0xd4, + 0x00, 0x31, 0x1a, 0x03, 0xa2, 0xae, 0x00, 0x43, 0xa2, 0xb2, 0xc8, 0xbf, + 0x42, 0x00, 0x3b, 0x99, 0xca, 0x9f, 0x72, 0x00, 0x3b, 0xa0, 0xca, 0x26, + 0xf7, 0x07, 0xda, 0x89, 0xcd, 0x00, 0xfa, 0x07, 0xda, 0x80, 0xd0, 0x05, + 0x29, 0x00, 0x44, 0x69, 0xc5, 0x00, 0xd4, 0x00, 0x31, 0xd8, 0xc5, 0x05, + 0x02, 0x00, 0x31, 0xe1, 0xc5, 0x00, 0xd4, 0x00, 0x3b, 0x19, 0xd6, 0x2d, + 0x8e, 0x00, 0x3b, 0x20, 0xc5, 0x05, 0x02, 0x00, 0x45, 0xa1, 0xc5, 0x00, + 0xd4, 0x00, 0x35, 0x60, 0xcf, 0x67, 0xce, 0x00, 0x35, 0x71, 0xcd, 0x04, + 0xe7, 0x00, 0x3b, 0xf8, 0xc4, 0xe0, 0x63, 0x00, 0x36, 0x19, 0xcd, 0x00, + 0xfa, 0x07, 0xf4, 0x99, 0xca, 0x26, 0xf7, 0x07, 0xf4, 0xa0, 0xc5, 0x05, + 0x02, 0x00, 0x44, 0x61, 0xc5, 0x00, 0xd4, 0x00, 0x34, 0xf8, 0xd0, 0x59, + 0xa2, 0x00, 0x45, 0xd1, 0xc9, 0x16, 0x14, 0x00, 0x45, 0x49, 0xcb, 0x08, + 0x09, 0x00, 0x45, 0x40, 0x0b, 0xc3, 0xa2, 0xbe, 0xca, 0x26, 0xf7, 0x07, + 0xf4, 0x51, 0xcb, 0x64, 0x7b, 0x07, 0xf4, 0x60, 0xcb, 0x08, 0x09, 0x00, + 0x36, 0x9b, 0x03, 0xa2, 0xca, 0x5d, 0x10, 0x12, 0x43, 0xa2, 0xce, 0xca, + 0x59, 0xa8, 0x00, 0x45, 0xc9, 0x98, 0x00, 0x34, 0x93, 0x03, 0xa2, 0xda, + 0xde, 0x02, 0x89, 0x00, 0x3b, 0x88, 0xc6, 0x05, 0x01, 0x00, 0x45, 0x00, + 0xd6, 0x2d, 0x8e, 0x00, 0x3a, 0x93, 0x03, 0xa2, 0xe0, 0xd2, 0x4b, 0xcb, + 0x00, 0x3a, 0x80, 0xd5, 0x0e, 0x77, 0x00, 0x34, 0xe0, 0x4a, 0x0e, 0x7d, + 0xc3, 0xa2, 0xe6, 0x46, 0x02, 0xa0, 0x43, 0xa2, 0xf2, 0x98, 0x00, 0x37, + 0x71, 0xcd, 0x31, 0x5f, 0x00, 0x3a, 0xd0, 0xce, 0x05, 0x39, 0x00, 0x34, + 0x58, 0x4a, 0x0e, 0x7d, 0xc3, 0xa2, 0xf8, 0x48, 0x04, 0xe7, 0x43, 0xa3, + 0x04, 0xe0, 0x09, 0x87, 0x00, 0x3b, 0xe0, 0xc5, 0x05, 0x02, 0x00, 0x3b, + 0x71, 0x03, 0x43, 0xa3, 0x10, 0xcb, 0x10, 0xb5, 0x07, 0xdd, 0x61, 0xcc, + 0x00, 0xfb, 0x07, 0xdd, 0x50, 0xcb, 0x10, 0xb5, 0x07, 0xdd, 0x41, 0xcc, + 0x00, 0xfb, 0x07, 0xdd, 0x30, 0xca, 0x26, 0xf7, 0x07, 0xdd, 0x29, 0xcd, + 0x00, 0xfa, 
0x07, 0xdd, 0x20, 0xd0, 0x13, 0xe9, 0x0f, 0xdd, 0x58, 0xcf, + 0x0a, 0x48, 0x0f, 0xdd, 0x50, 0xa5, 0x01, 0x47, 0xf8, 0xd3, 0x42, 0xed, + 0x0e, 0xf8, 0x40, 0xd1, 0x01, 0x68, 0x05, 0x5a, 0x11, 0xc6, 0x01, 0x73, + 0x05, 0x5a, 0x08, 0xcb, 0x98, 0x58, 0x00, 0x11, 0x88, 0xc9, 0x0e, 0x6e, + 0x00, 0xf6, 0x39, 0xc5, 0x1e, 0xc8, 0x00, 0xf6, 0x29, 0xca, 0x9e, 0x5a, + 0x00, 0xf6, 0x19, 0xc5, 0x1f, 0x0c, 0x00, 0xf6, 0x09, 0xc5, 0x31, 0xee, + 0x00, 0xf5, 0xf8, 0xcc, 0x51, 0x28, 0x0e, 0xf8, 0xb1, 0xcc, 0x1e, 0xc1, + 0x00, 0xeb, 0x98, 0xc5, 0x05, 0x02, 0x00, 0xf2, 0xdb, 0x03, 0xa3, 0x1c, + 0xc5, 0x00, 0xd4, 0x00, 0xf2, 0xc8, 0xc9, 0x0e, 0x6e, 0x00, 0xf7, 0x89, + 0xc5, 0x1e, 0xc8, 0x00, 0xf7, 0x79, 0xca, 0x9e, 0x5a, 0x00, 0xf7, 0x69, + 0xc5, 0x1f, 0x0c, 0x00, 0xf7, 0x59, 0xc5, 0x31, 0xee, 0x00, 0xf7, 0x48, + 0xc5, 0x31, 0xee, 0x00, 0x0b, 0x89, 0xc5, 0x1f, 0x0c, 0x00, 0x10, 0xa8, + 0xc5, 0x05, 0x02, 0x00, 0xf3, 0x99, 0x44, 0x02, 0x9b, 0x43, 0xa3, 0x22, + 0xc9, 0x0e, 0x6e, 0x00, 0xf5, 0x69, 0xc5, 0x1e, 0xc8, 0x00, 0xf5, 0x59, + 0xca, 0x9e, 0x5a, 0x00, 0xf5, 0x49, 0xc5, 0x1f, 0x0c, 0x00, 0xf5, 0x39, + 0xc5, 0x31, 0xee, 0x00, 0xf5, 0x28, 0xc5, 0x05, 0x02, 0x00, 0xf5, 0x09, + 0xc5, 0x00, 0xd4, 0x00, 0x11, 0x3a, 0x03, 0xa3, 0x3a, 0xc5, 0x05, 0x02, + 0x00, 0xf0, 0x09, 0xc5, 0x00, 0xd4, 0x00, 0x07, 0x2a, 0x03, 0xa3, 0x40, + 0xc6, 0x60, 0xb1, 0x00, 0x0e, 0xa9, 0xc5, 0x31, 0xee, 0x00, 0x0e, 0xb9, + 0xc5, 0x8e, 0x66, 0x00, 0x0e, 0xc9, 0xc5, 0x1f, 0x0c, 0x00, 0x0e, 0xd8, + 0xc6, 0xc1, 0x86, 0x05, 0x4b, 0x91, 0xc5, 0xc0, 0x7d, 0x00, 0x89, 0x18, + 0xc3, 0x05, 0x14, 0x01, 0x9f, 0xa1, 0x16, 0xc3, 0xa3, 0x46, 0x08, 0xc3, + 0xa3, 0x52, 0x15, 0xc3, 0xa3, 0x5e, 0xc5, 0x06, 0xdb, 0x01, 0x9f, 0xd9, + 0xc4, 0x26, 0x78, 0x01, 0x9f, 0xe0, 0xc2, 0x02, 0xa0, 0x01, 0x9b, 0x71, + 0xc4, 0x02, 0xde, 0x01, 0x9b, 0x78, 0xd3, 0x42, 0xed, 0x08, 0x3d, 0x38, + 0xc5, 0x02, 0xc2, 0x0e, 0x8a, 0x89, 0xc5, 0x01, 0xfc, 0x0e, 0x8a, 0x80, + 0x45, 0xab, 0x1f, 0xc3, 0xa3, 0x6a, 0xc2, 0x00, 0x4f, 0x0e, 0x8b, 0x28, + 0xcb, 0x90, 0xf4, 0x0e, 0x8c, 0x59, 0x46, 0x6d, 0xc6, 0x43, 0xa3, 0x74, + 0xa2, 0x0e, 0x8b, 0x91, 0xa1, 0x0e, 0x8b, 0x89, 0xa0, 0x0e, 0x8b, 0x81, + 0x9f, 0x0e, 0x8b, 0x79, 0x9e, 0x0e, 0x8b, 0x70, 0xc9, 0xab, 0x1c, 0x0e, + 0x8c, 0x08, 0x45, 0x02, 0x9a, 0x43, 0xa3, 0x80, 0x12, 0xc3, 0xa3, 0x96, + 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0x6b, 0x03, 0xa3, 0xa5, 0xc5, 0x28, 0x47, + 0x00, 0xfb, 0x5a, 0x03, 0xa3, 0xab, 0xc4, 0xe3, 0xab, 0x00, 0xfa, 0x69, + 0xc5, 0x28, 0x47, 0x00, 0xfa, 0x58, 0xc4, 0xe3, 0xab, 0x00, 0xfa, 0x61, + 0xc5, 0x28, 0x47, 0x00, 0xfa, 0x50, 0xcb, 0x94, 0x90, 0x00, 0xfa, 0xf9, + 0xc4, 0xe3, 0xab, 0x00, 0xfa, 0xe9, 0xc5, 0x28, 0x47, 0x00, 0xfa, 0xd8, + 0xcb, 0x94, 0x90, 0x00, 0xf9, 0xf9, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0xe9, + 0xc5, 0x28, 0x47, 0x00, 0xf9, 0xd8, 0x45, 0x02, 0x9a, 0x43, 0xa3, 0xb1, + 0x12, 0xc3, 0xa3, 0xc7, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0x6b, 0x03, 0xa3, + 0xd6, 0xc5, 0x28, 0x47, 0x00, 0xf9, 0x5a, 0x03, 0xa3, 0xdc, 0xc4, 0xe3, + 0xab, 0x00, 0xf8, 0xe9, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0xd8, 0xc4, 0xe3, + 0xab, 0x00, 0xf8, 0xe1, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0xd0, 0x45, 0x02, + 0x9a, 0x43, 0xa3, 0xe2, 0x12, 0xc3, 0xa3, 0xf8, 0xc4, 0xe3, 0xab, 0x00, + 0xf8, 0x6b, 0x03, 0xa4, 0x07, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0x5a, 0x03, + 0xa4, 0x0d, 0xd2, 0x4a, 0x63, 0x00, 0xff, 0xb8, 0x45, 0x02, 0x9a, 0x43, + 0xa4, 0x13, 0xcb, 0x94, 0x90, 0x00, 0xfb, 0x3b, 0x03, 0xa4, 0x34, 0xc4, + 0xe3, 0xab, 0x00, 0xfb, 0x2b, 0x03, 0xa4, 0x3a, 0xc5, 0x28, 0x47, 0x00, + 0xfb, 0x1b, 0x03, 0xa4, 0x40, 0xcd, 0x4a, 0x68, 0x00, 0xfd, 0x08, 0xc4, + 0xe3, 0xab, 
0x00, 0xfa, 0x29, 0xc5, 0x28, 0x47, 0x00, 0xfa, 0x18, 0xc4, + 0xe3, 0xab, 0x00, 0xfa, 0x21, 0xc5, 0x28, 0x47, 0x00, 0xfa, 0x10, 0xcb, + 0x94, 0x90, 0x00, 0xff, 0x39, 0xc4, 0xe3, 0xab, 0x00, 0xff, 0x19, 0xc5, + 0x28, 0x47, 0x00, 0xff, 0x11, 0xc5, 0x63, 0xdc, 0x00, 0x1d, 0x80, 0xcb, + 0x94, 0x90, 0x00, 0xfa, 0xb9, 0xc4, 0xe3, 0xab, 0x00, 0xfa, 0xa9, 0xc5, + 0x28, 0x47, 0x00, 0xfa, 0x98, 0xcb, 0x94, 0x90, 0x00, 0xfa, 0xb1, 0xc4, + 0xe3, 0xab, 0x00, 0xfa, 0xa1, 0xc5, 0x28, 0x47, 0x00, 0xfa, 0x90, 0xcb, + 0x94, 0x90, 0x00, 0xfe, 0xb9, 0xc4, 0xe3, 0xab, 0x00, 0xfe, 0x99, 0xc5, + 0x28, 0x47, 0x00, 0xfe, 0x91, 0xc5, 0x63, 0xdc, 0x00, 0x1c, 0x80, 0xcb, + 0x94, 0x90, 0x00, 0xf9, 0xb9, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0xa9, 0xc5, + 0x28, 0x47, 0x00, 0xf9, 0x98, 0xcb, 0x94, 0x90, 0x00, 0xf9, 0xb1, 0xc4, + 0xe3, 0xab, 0x00, 0xf9, 0xa1, 0xc5, 0x28, 0x47, 0x00, 0xf9, 0x90, 0xd2, + 0x4a, 0x63, 0x00, 0xfe, 0x38, 0x45, 0x02, 0x9a, 0x43, 0xa4, 0x46, 0xcb, + 0x94, 0x90, 0x00, 0xf9, 0x3b, 0x03, 0xa4, 0x67, 0xc4, 0xe3, 0xab, 0x00, + 0xf9, 0x2b, 0x03, 0xa4, 0x6d, 0xc5, 0x28, 0x47, 0x00, 0xf9, 0x1b, 0x03, + 0xa4, 0x73, 0xcd, 0x4a, 0x68, 0x00, 0xfc, 0x88, 0xc4, 0xe3, 0xab, 0x00, + 0xf8, 0xa9, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0x98, 0xc4, 0xe3, 0xab, 0x00, + 0xf8, 0xa1, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0x90, 0xd2, 0x4a, 0x63, 0x00, + 0xfd, 0xb8, 0x45, 0x02, 0x9a, 0x43, 0xa4, 0x79, 0xd2, 0x4a, 0x63, 0x00, + 0xfd, 0x90, 0xcb, 0x94, 0x90, 0x00, 0xf8, 0x3b, 0x03, 0xa4, 0x9a, 0xc4, + 0xe3, 0xab, 0x00, 0xf8, 0x2b, 0x03, 0xa4, 0xa0, 0xc5, 0x28, 0x47, 0x00, + 0xf8, 0x1b, 0x03, 0xa4, 0xa6, 0xcd, 0x4a, 0x68, 0x00, 0xfc, 0x08, 0xc7, + 0xb9, 0xdb, 0x08, 0x0a, 0x61, 0xc7, 0x67, 0xc7, 0x08, 0x0a, 0x98, 0xc8, + 0xb9, 0xda, 0x08, 0x0a, 0x70, 0xc8, 0x67, 0xc6, 0x08, 0x0a, 0xb0, 0xca, + 0xa2, 0x6a, 0x0e, 0x7d, 0xe8, 0x46, 0x00, 0x8b, 0x43, 0xa4, 0xac, 0xcc, + 0x89, 0x61, 0x0e, 0xc8, 0x01, 0xca, 0x92, 0xa2, 0x0e, 0xc7, 0xf0, 0xc9, + 0x67, 0x79, 0x0e, 0xc1, 0x60, 0xc5, 0x02, 0xd2, 0x0e, 0xc7, 0x5b, 0x03, + 0xa4, 0xb8, 0x17, 0x43, 0xa4, 0xbe, 0x4a, 0x6d, 0x50, 0x43, 0xa4, 0xc8, + 0xc4, 0x18, 0xf2, 0x0e, 0xc7, 0x29, 0xc8, 0x45, 0x27, 0x0e, 0xc7, 0x20, + 0x00, 0x43, 0xa4, 0xd4, 0xcc, 0x85, 0x41, 0x0e, 0xc1, 0xd9, 0xcd, 0x7e, + 0x96, 0x0e, 0xc1, 0xd0, 0x05, 0xc3, 0xa4, 0xe6, 0xc6, 0x13, 0x95, 0x0e, + 0xc5, 0x21, 0x14, 0xc3, 0xa4, 0xf5, 0xc5, 0x0e, 0xce, 0x0e, 0xc0, 0xf3, + 0x03, 0xa5, 0x04, 0xd7, 0x27, 0xa2, 0x0e, 0xc1, 0x39, 0xc6, 0x58, 0xac, + 0x0e, 0xc0, 0x93, 0x03, 0xa5, 0x08, 0xc4, 0x18, 0xf2, 0x0e, 0xc0, 0x83, + 0x03, 0xa5, 0x0e, 0xd3, 0x45, 0x27, 0x0e, 0xc1, 0x00, 0xc9, 0x6d, 0x53, + 0x0e, 0xc0, 0xa3, 0x03, 0xa5, 0x14, 0xc3, 0x01, 0xc8, 0x0e, 0xc0, 0x60, + 0xc9, 0x13, 0x9c, 0x0e, 0xc1, 0x29, 0xc4, 0x0e, 0xe2, 0x0e, 0xc1, 0x20, + 0xc7, 0x1a, 0xc5, 0x0e, 0xc2, 0x09, 0xc2, 0x02, 0xae, 0x0e, 0xc2, 0x00, + 0xc6, 0x58, 0xac, 0x0e, 0xc1, 0xc9, 0xc2, 0x02, 0xae, 0x0e, 0xc1, 0xc0, + 0xc6, 0x3b, 0x9c, 0x0e, 0xc4, 0x81, 0xc8, 0x45, 0x27, 0x0e, 0xc4, 0x78, + 0xc4, 0x18, 0xf2, 0x0e, 0xc2, 0x89, 0xc9, 0xb4, 0x40, 0x0e, 0xc2, 0x78, + 0x00, 0x43, 0xa5, 0x1a, 0xc6, 0xcd, 0x73, 0x0e, 0xc2, 0x40, 0x15, 0xc3, + 0xa5, 0x26, 0xc5, 0x17, 0x14, 0x0e, 0xc7, 0x79, 0xc4, 0x05, 0x75, 0x0e, + 0xc7, 0x70, 0xca, 0x13, 0x9b, 0x0e, 0xc4, 0x68, 0xc5, 0x05, 0x74, 0x0e, + 0xc7, 0x68, 0xc7, 0x27, 0xb2, 0x0e, 0xc3, 0x91, 0xc4, 0x0e, 0xe2, 0x0e, + 0xc3, 0x70, 0x45, 0x0d, 0x20, 0xc3, 0xa5, 0x32, 0xc6, 0x13, 0x95, 0x0e, + 0xc5, 0x29, 0xc4, 0x00, 0x9d, 0x0e, 0xc4, 0x39, 0xc5, 0x0e, 0xce, 0x0e, + 0xc0, 0xf8, 0xc5, 0x08, 0x09, 0x00, 0x44, 0x11, 0xc9, 0x4d, 0xde, 0x00, + 0x43, 0xc0, 
0x45, 0x00, 0x2d, 0xc3, 0xa5, 0x3e, 0x49, 0x9a, 0xeb, 0x43, + 0xa5, 0x4a, 0x45, 0x02, 0x9a, 0x43, 0xa5, 0x56, 0x45, 0x02, 0x9a, 0x43, + 0xa5, 0x62, 0xc9, 0xaf, 0x6f, 0x00, 0x43, 0xf9, 0xc9, 0x16, 0x14, 0x00, + 0x43, 0xe0, 0x00, 0x43, 0xa5, 0x6e, 0x00, 0x43, 0xa5, 0x7a, 0xcd, 0x00, + 0xfa, 0x07, 0xf4, 0x09, 0xca, 0x26, 0xf7, 0x07, 0xf4, 0x10, 0xcc, 0x00, + 0xfb, 0x07, 0xf4, 0x49, 0xcb, 0x10, 0xb5, 0x07, 0xf4, 0x58, 0x00, 0x43, + 0xa5, 0x86, 0xca, 0x9f, 0x72, 0x00, 0x3b, 0xd9, 0xc8, 0xbf, 0x42, 0x00, + 0x3b, 0xd0, 0xc6, 0x05, 0x01, 0x00, 0x34, 0xa8, 0xd3, 0x1e, 0x24, 0x00, + 0x3a, 0x98, 0xc5, 0x05, 0x02, 0x00, 0x45, 0x71, 0xcf, 0x1b, 0x59, 0x00, + 0x34, 0x78, 0xe0, 0x05, 0x27, 0x00, 0x3a, 0xc8, 0xc5, 0x00, 0xd4, 0x00, + 0x34, 0x29, 0xd6, 0x2d, 0x8e, 0x00, 0x3a, 0xc0, 0xce, 0x73, 0x6e, 0x00, + 0x34, 0x11, 0xc5, 0x00, 0xd4, 0x00, 0x3a, 0xb8, 0xcb, 0x02, 0x9c, 0x00, + 0x3b, 0x79, 0xc4, 0x00, 0xd5, 0x00, 0x3b, 0x90, 0xcb, 0x98, 0x58, 0x00, + 0xf2, 0xe8, 0xc6, 0x60, 0xb1, 0x00, 0x0e, 0xb1, 0xc5, 0x31, 0xee, 0x00, + 0x0e, 0xc1, 0xc5, 0x8e, 0x66, 0x00, 0x0e, 0xd1, 0xc5, 0x1f, 0x0c, 0x00, + 0x0e, 0xe0, 0xcb, 0x98, 0x58, 0x00, 0x0f, 0x08, 0xca, 0x9b, 0xda, 0x00, + 0x0f, 0xd8, 0xc2, 0x02, 0xa0, 0x01, 0x9f, 0xa9, 0xc4, 0x02, 0xde, 0x01, + 0x9f, 0xb0, 0xc3, 0x09, 0x9e, 0x01, 0x9f, 0xb9, 0xc3, 0x0d, 0x14, 0x01, + 0x9f, 0xc0, 0xc2, 0x22, 0xcc, 0x01, 0x9f, 0xc9, 0xc4, 0x18, 0x10, 0x01, + 0x9f, 0xd0, 0xc6, 0xd2, 0xb9, 0x0e, 0x8b, 0xf1, 0x91, 0x0e, 0x8b, 0xe0, + 0xa0, 0x0e, 0x8b, 0x49, 0x9f, 0x0e, 0x8b, 0x41, 0x9e, 0x0e, 0x8b, 0x38, + 0x12, 0xc3, 0xa5, 0x92, 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0x63, 0x03, 0xa5, + 0xa1, 0xc5, 0x28, 0x47, 0x00, 0xfb, 0x52, 0x03, 0xa5, 0xa7, 0xca, 0x94, + 0x91, 0x00, 0xfb, 0x7b, 0x03, 0xa5, 0xad, 0xcd, 0x42, 0x94, 0x00, 0xfd, + 0x48, 0xd3, 0x42, 0x8e, 0x00, 0xfd, 0x68, 0xd3, 0x42, 0x8e, 0x00, 0xfd, + 0x58, 0x12, 0xc3, 0xa5, 0xb3, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0x63, 0x03, + 0xa5, 0xc2, 0xc5, 0x28, 0x47, 0x00, 0xf9, 0x52, 0x03, 0xa5, 0xc8, 0xca, + 0x94, 0x91, 0x00, 0xf9, 0x7b, 0x03, 0xa5, 0xce, 0xcd, 0x42, 0x94, 0x00, + 0xfc, 0xc8, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0xe8, 0xd3, 0x42, 0x8e, 0x00, + 0xfc, 0xd8, 0x12, 0xc3, 0xa5, 0xd4, 0xc4, 0xe3, 0xab, 0x00, 0xf8, 0x63, + 0x03, 0xa5, 0xe3, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0x52, 0x03, 0xa5, 0xe9, + 0xca, 0x94, 0x91, 0x00, 0xf8, 0x7b, 0x03, 0xa5, 0xef, 0xcd, 0x42, 0x94, + 0x00, 0xfc, 0x48, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0x68, 0xd3, 0x42, 0x8e, + 0x00, 0xfc, 0x58, 0xcb, 0x94, 0x90, 0x00, 0xfb, 0x33, 0x03, 0xa5, 0xf5, + 0xc4, 0xe3, 0xab, 0x00, 0xfb, 0x23, 0x03, 0xa5, 0xfb, 0xc5, 0x28, 0x47, + 0x00, 0xfb, 0x13, 0x03, 0xa6, 0x01, 0xcd, 0x4a, 0x68, 0x00, 0xfd, 0x00, + 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0x38, 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0x28, + 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0x18, 0xcb, 0x94, 0x90, 0x00, 0xf9, 0x33, + 0x03, 0xa6, 0x07, 0xc4, 0xe3, 0xab, 0x00, 0xf9, 0x23, 0x03, 0xa6, 0x0d, + 0xc5, 0x28, 0x47, 0x00, 0xf9, 0x13, 0x03, 0xa6, 0x13, 0xcd, 0x4a, 0x68, + 0x00, 0xfc, 0x80, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0xb8, 0xd2, 0x4a, 0x63, + 0x00, 0xfc, 0xa8, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x98, 0xcb, 0x94, 0x90, + 0x00, 0xf8, 0x33, 0x03, 0xa6, 0x19, 0xc4, 0xe3, 0xab, 0x00, 0xf8, 0x23, + 0x03, 0xa6, 0x1f, 0xc5, 0x28, 0x47, 0x00, 0xf8, 0x13, 0x03, 0xa6, 0x25, + 0xcd, 0x4a, 0x68, 0x00, 0xfc, 0x00, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x38, + 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x28, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x18, + 0xd0, 0x58, 0xe2, 0x0e, 0x7d, 0xd9, 0xd0, 0x2d, 0x10, 0x0e, 0x7d, 0xc0, + 0xcb, 0x6d, 0x51, 0x0e, 0xc1, 0xe0, 0x14, 0xc3, 0xa6, 0x2b, 0xce, 0x6d, + 0x4e, 0x0e, 
0xc1, 0xb8, 0xc6, 0x58, 0xac, 0x0e, 0xc2, 0x19, 0xc2, 0x02, + 0xae, 0x0e, 0xc1, 0x88, 0x46, 0x06, 0x82, 0xc3, 0xa6, 0x37, 0xc9, 0xb3, + 0xc2, 0x0e, 0xc7, 0x11, 0x46, 0x0e, 0xce, 0x43, 0xa6, 0x43, 0x44, 0x0d, + 0x21, 0xc3, 0xa6, 0x55, 0xc8, 0x13, 0x9d, 0x0e, 0xc0, 0xaa, 0x03, 0xa6, + 0x64, 0xc3, 0x00, 0x74, 0x0e, 0xc4, 0x33, 0x03, 0xa6, 0x68, 0xce, 0x3a, + 0x9e, 0x0e, 0xc0, 0x88, 0x00, 0x43, 0xa6, 0x6c, 0xd2, 0x4d, 0x7b, 0x0e, + 0xc1, 0x18, 0xcf, 0x69, 0xea, 0x0e, 0xc1, 0x08, 0xcb, 0x4d, 0x82, 0x0e, + 0xc1, 0x30, 0xc8, 0xbc, 0x62, 0x0e, 0xc2, 0xc9, 0xca, 0x4d, 0x83, 0x0e, + 0xc2, 0xc0, 0xc4, 0x03, 0x14, 0x0e, 0xc7, 0x89, 0xc3, 0x06, 0xa7, 0x0e, + 0xc6, 0xe8, 0xc7, 0x13, 0x94, 0x0e, 0xc5, 0x51, 0xc2, 0x00, 0xa8, 0x0e, + 0xc0, 0xd8, 0xc5, 0x08, 0x09, 0x00, 0x44, 0x09, 0xc9, 0x4d, 0xde, 0x00, + 0x43, 0xb8, 0xc5, 0x05, 0x02, 0x00, 0x43, 0xc9, 0xc5, 0x00, 0xd4, 0x00, + 0x43, 0xb0, 0xc9, 0xaf, 0x6f, 0x00, 0x44, 0x01, 0xc9, 0x16, 0x14, 0x00, + 0x43, 0xe8, 0xc9, 0xaf, 0x6f, 0x00, 0x43, 0xf1, 0xc9, 0x16, 0x14, 0x00, + 0x43, 0xd8, 0xca, 0x26, 0xf7, 0x07, 0xf4, 0x41, 0xcd, 0x00, 0xfa, 0x07, + 0xf4, 0x38, 0xcd, 0x00, 0xfa, 0x07, 0xf4, 0x19, 0xca, 0x26, 0xf7, 0x07, + 0xf4, 0x20, 0xca, 0x26, 0xf7, 0x07, 0xdd, 0x89, 0xcd, 0x00, 0xfa, 0x07, + 0xdd, 0x80, 0xca, 0x94, 0x91, 0x00, 0xfb, 0x73, 0x03, 0xa6, 0x83, 0xcd, + 0x42, 0x94, 0x00, 0xfd, 0x40, 0xd3, 0x42, 0x8e, 0x00, 0xfd, 0x60, 0xd3, + 0x42, 0x8e, 0x00, 0xfd, 0x50, 0xd3, 0x42, 0x8e, 0x00, 0xfd, 0x78, 0xca, + 0x94, 0x91, 0x00, 0xf9, 0x73, 0x03, 0xa6, 0x89, 0xcd, 0x42, 0x94, 0x00, + 0xfc, 0xc0, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0xe0, 0xd3, 0x42, 0x8e, 0x00, + 0xfc, 0xd0, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0xf8, 0xca, 0x94, 0x91, 0x00, + 0xf8, 0x73, 0x03, 0xa6, 0x8f, 0xcd, 0x42, 0x94, 0x00, 0xfc, 0x40, 0xd3, + 0x42, 0x8e, 0x00, 0xfc, 0x60, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0x50, 0xd3, + 0x42, 0x8e, 0x00, 0xfc, 0x78, 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0x30, 0xd2, + 0x4a, 0x63, 0x00, 0xfd, 0x20, 0xd2, 0x4a, 0x63, 0x00, 0xfd, 0x10, 0xd2, + 0x4a, 0x63, 0x00, 0xfc, 0xb0, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0xa0, 0xd2, + 0x4a, 0x63, 0x00, 0xfc, 0x90, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x30, 0xd2, + 0x4a, 0x63, 0x00, 0xfc, 0x20, 0xd2, 0x4a, 0x63, 0x00, 0xfc, 0x10, 0x49, + 0x0e, 0xd7, 0xc3, 0xa6, 0x95, 0xc5, 0xbc, 0x5d, 0x0e, 0xc7, 0x38, 0xc5, + 0x58, 0xac, 0x0e, 0xc7, 0x19, 0xc4, 0x18, 0xf2, 0x0e, 0xc7, 0x08, 0xc4, + 0x18, 0xf2, 0x0e, 0xc7, 0x01, 0xc9, 0x13, 0x9c, 0x0e, 0xc6, 0xf9, 0xc8, + 0x1e, 0x56, 0x0e, 0xc6, 0xf0, 0xc7, 0x13, 0x94, 0x0e, 0xc5, 0x49, 0xc2, + 0x00, 0xa8, 0x0e, 0xc0, 0xd2, 0x03, 0xa6, 0xa1, 0x00, 0x43, 0xa6, 0xa7, + 0x00, 0x43, 0xa6, 0xcb, 0xc6, 0x77, 0x82, 0x0e, 0xc1, 0xfb, 0x03, 0xa6, + 0xd7, 0x05, 0xc3, 0xa6, 0xdd, 0x0a, 0xc3, 0xa6, 0xef, 0xc4, 0x18, 0xf2, + 0x0e, 0xc1, 0x10, 0xd3, 0x42, 0x8e, 0x00, 0xfd, 0x70, 0xd3, 0x42, 0x8e, + 0x00, 0xfc, 0xf0, 0xd3, 0x42, 0x8e, 0x00, 0xfc, 0x70, 0xc5, 0x16, 0xca, + 0x0e, 0xc7, 0x51, 0xc6, 0x0e, 0xe0, 0x0e, 0xc7, 0x40, 0xcb, 0x4d, 0x82, + 0x0e, 0xc1, 0x98, 0xc6, 0xcc, 0x41, 0x0e, 0xc0, 0xc3, 0x03, 0xa6, 0xfb, + 0x46, 0x0e, 0xce, 0xc3, 0xa7, 0x01, 0xc6, 0x58, 0xac, 0x0e, 0xc0, 0xcb, + 0x03, 0xa7, 0x10, 0xcb, 0x99, 0xad, 0x0e, 0xc0, 0xb9, 0xca, 0xa1, 0x70, + 0x0e, 0xc0, 0xb0, 0xc9, 0x13, 0x9c, 0x0e, 0xc4, 0x61, 0xc4, 0x18, 0xf2, + 0x0e, 0xc4, 0x58, 0xc4, 0x0c, 0x4d, 0x0e, 0xc1, 0xf0, 0xcf, 0x62, 0xd3, + 0x0e, 0xc1, 0xe9, 0xc6, 0x20, 0x7d, 0x0e, 0xc1, 0x49, 0xc5, 0x70, 0xdc, + 0x0e, 0xc1, 0x40, 0xc5, 0x58, 0xad, 0x0e, 0xc1, 0x59, 0xc5, 0x64, 0xb4, + 0x0e, 0xc1, 0x50, 0xce, 0x27, 0xab, 0x0e, 0xc1, 0xa8, 0xc7, 0x27, 0xb2, + 0x0e, 0xc1, 
0xa1, 0xc4, 0x0e, 0xe2, 0x0e, 0xc1, 0x6a, 0x03, 0xa7, 0x16,
+ 0xcb, 0x4d, 0x82, 0x0e, 0xc1, 0x90, 0x00, 0x43, 0xa7, 0x1a, 0xc4, 0x18,
+ 0xf2, 0x0e, 0xc1, 0x79, 0xc9, 0x13, 0x9c, 0x0e, 0xc1, 0x70, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0};
+const uint8_t *UnicodeNameToCodepointIndex = UnicodeNameToCodepointIndex_;
+const std::size_t UnicodeNameToCodepointIndexSize = 239405;
+const std::size_t UnicodeNameToCodepointLargestNameSize = 74;
+
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
diff --git a/llvm/lib/Support/Unix/COM.inc b/llvm/lib/Support/Unix/COM.inc
index 03a690ac3766..d97b59ac02cf 100644
--- a/llvm/lib/Support/Unix/COM.inc
+++ b/llvm/lib/Support/Unix/COM.inc
@@ -21,6 +21,6 @@ namespace sys {
 InitializeCOMRAII::InitializeCOMRAII(COMThreadingMode Threading,
                                      bool SpeedOverMemory) {}
 
-InitializeCOMRAII::~InitializeCOMRAII() {}
+InitializeCOMRAII::~InitializeCOMRAII() = default;
 }
 }
diff --git a/llvm/lib/Support/Unix/Memory.inc b/llvm/lib/Support/Unix/Memory.inc
index b83477e0e4cc..5e008069dd98 100644
--- a/llvm/lib/Support/Unix/Memory.inc
+++ b/llvm/lib/Support/Unix/Memory.inc
@@ -16,6 +16,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Process.h"
+#include "llvm/Support/Valgrind.h"
 
 #ifdef HAVE_SYS_MMAN_H
 #include <sys/mman.h>
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 788460d657fe..2ae7c6dc47e0 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -194,7 +194,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
 
 /// GetMainExecutable - Return the path to the main executable, given the
 /// value of argv[0] from program startup.
-std::string getMainExecutable(const char *argv0, void *MainAddr) {
+std::string getMainExecutableImpl(const char *argv0, void *MainAddr) {
 #if defined(__APPLE__)
   // On OS X the executable path is saved to the stack by dyld. Reading it
   // from there is much faster than calling dladdr, especially for large
@@ -874,12 +874,14 @@ void mapped_file_region::unmapImpl() {
 
 void mapped_file_region::dontNeedImpl() {
   assert(Mode == mapped_file_region::readonly);
+  if (!Mapping)
+    return;
 #if defined(__MVS__) || defined(_AIX)
   // If we don't have madvise, or it isn't beneficial, treat this as a no-op.
-  return;
+#elif defined(POSIX_MADV_DONTNEED)
+  ::posix_madvise(Mapping, Size, POSIX_MADV_DONTNEED);
 #else
-  if (Mapping)
-    ::madvise(Mapping, Size, MADV_DONTNEED);
+  ::madvise(Mapping, Size, MADV_DONTNEED);
 #endif
 }
 
@@ -948,7 +950,15 @@ ErrorOr<basic_file_status> directory_entry::status() const {
   return s;
 }
 
-#if !defined(F_GETPATH)
+//
+// FreeBSD optionally provides /proc/self/fd, but it is incompatible with
+// Linux. The thing to use is realpath.
+//
+#if !defined(__FreeBSD__)
+#define TRY_PROC_SELF_FD
+#endif
+
+#if !defined(F_GETPATH) && defined(TRY_PROC_SELF_FD)
 static bool hasProcSelfFD() {
   // If we have a /proc filesystem mounted, we can quickly establish the
   // real name of the file with readlink
@@ -1135,6 +1145,7 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
     RealPath->append(Buffer, Buffer + strlen(Buffer));
 #else
   char Buffer[PATH_MAX];
+#if defined(TRY_PROC_SELF_FD)
   if (hasProcSelfFD()) {
     char ProcPath[64];
     snprintf(ProcPath, sizeof(ProcPath), "/proc/self/fd/%d", ResultFD);
@@ -1142,13 +1153,16 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
     if (CharCount > 0)
       RealPath->append(Buffer, Buffer + CharCount);
   } else {
+#endif
     SmallString<128> Storage;
     StringRef P = Name.toNullTerminatedStringRef(Storage);
 
     // Use ::realpath to get the real path name
     if (::realpath(P.begin(), Buffer) != nullptr)
       RealPath->append(Buffer, Buffer + strlen(Buffer));
+#if defined(TRY_PROC_SELF_FD)
   }
+#endif
 #endif
   return std::error_code();
 }
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index d3d9fb7d7187..3c2d118977c5 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -331,6 +331,23 @@ extern "C" int tigetnum(char *capname);
 static ManagedStatic<sys::Mutex> TermColorMutex;
 #endif
 
+bool checkTerminalEnvironmentForColors() {
+  if (const char *TermStr = std::getenv("TERM")) {
+    return StringSwitch<bool>(TermStr)
+        .Case("ansi", true)
+        .Case("cygwin", true)
+        .Case("linux", true)
+        .StartsWith("screen", true)
+        .StartsWith("xterm", true)
+        .StartsWith("vt100", true)
+        .StartsWith("rxvt", true)
+        .EndsWith("color", true)
+        .Default(false);
+  }
+
+  return false;
+}
+
 static bool terminalHasColors(int fd) {
 #ifdef LLVM_ENABLE_TERMINFO
   // First, acquire a global lock because these C routines are thread hostile.
@@ -356,7 +373,8 @@ static bool terminalHasColors(int fd) {
   //
   // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
   // the terminfo says that no colors are supported.
-  bool HasColors = tigetnum(const_cast<char *>("colors")) > 0;
+  int colors_ti = tigetnum(const_cast<char *>("colors"));
+  bool HasColors = colors_ti >= 0 ? colors_ti : checkTerminalEnvironmentForColors();
 
   // Now extract the structure allocated by setupterm and free its memory
   // through a really silly dance.
@@ -364,27 +382,12 @@ static bool terminalHasColors(int fd) {
   (void)del_curterm(termp); // Drop any errors here.
 
   // Return true if we found a color capabilities for the current terminal.
-  if (HasColors)
-    return true;
+  return HasColors;
 #else
   // When the terminfo database is not available, check if the current terminal
   // is one of terminals that are known to support ANSI color escape codes.
-  if (const char *TermStr = std::getenv("TERM")) {
-    return StringSwitch<bool>(TermStr)
-        .Case("ansi", true)
-        .Case("cygwin", true)
-        .Case("linux", true)
-        .StartsWith("screen", true)
-        .StartsWith("xterm", true)
-        .StartsWith("vt100", true)
-        .StartsWith("rxvt", true)
-        .EndsWith("color", true)
-        .Default(false);
-  }
+  return checkTerminalEnvironmentForColors();
 #endif
-
-  // Otherwise, be conservative.
-  return false;
 }
 
 bool Process::FileDescriptorHasColors(int fd) {
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 575e2aab1eab..bf145bffe8bf 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -79,8 +79,8 @@ using namespace llvm;
 
-static RETSIGTYPE SignalHandler(int Sig); // defined below.
-static RETSIGTYPE InfoSignalHandler(int Sig); // defined below.
+static void SignalHandler(int Sig);     // defined below.
+static void InfoSignalHandler(int Sig); // defined below.
 
 using SignalHandlerFunctionType = void (*)();
 /// The function to call if ctrl-c is pressed.
@@ -362,7 +362,7 @@ void sys::CleanupOnSignal(uintptr_t Context) {
 }
 
 // The signal handler that runs.
-static RETSIGTYPE SignalHandler(int Sig) {
+static void SignalHandler(int Sig) {
   // Restore the signal behavior to default, so that the program actually
   // crashes when we return and the signal reissues.  This also ensures that if
   // we crash in our signal handler that the program will terminate immediately
@@ -406,7 +406,7 @@ static RETSIGTYPE SignalHandler(int Sig) {
 #endif
 }
 
-static RETSIGTYPE InfoSignalHandler(int Sig) {
+static void InfoSignalHandler(int Sig) {
   SaveAndRestore<int> SaveErrnoDuringASignalHandler(errno);
   if (SignalHandlerFunctionType CurrentInfoFunction = InfoSignalFunction)
     CurrentInfoFunction();
@@ -432,6 +432,10 @@ void llvm::sys::SetOneShotPipeSignalFunction(void (*Handler)()) {
 }
 
 void llvm::sys::DefaultOneShotPipeSignalHandler() {
+  // UNIX03 conformance requires a non-zero exit code and an error message
+  // to stderr when writing to a closed stdout fails.
+  errs() << "error: write on a pipe with no reader\n";
+
   // Send a special return code that drivers can check for, from sysexits.h.
   exit(EX_IOERR);
 }
diff --git a/llvm/lib/Support/Unix/ThreadLocal.inc b/llvm/lib/Support/Unix/ThreadLocal.inc
index a402ae980424..0a958a2b952f 100644
--- a/llvm/lib/Support/Unix/ThreadLocal.inc
+++ b/llvm/lib/Support/Unix/ThreadLocal.inc
@@ -17,8 +17,6 @@
 
 #include "llvm/Config/config.h"
 
-#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_GETSPECIFIC)
-
 #include <cassert>
 #include <pthread.h>
 #include <stdlib.h>
@@ -58,13 +56,3 @@ void ThreadLocalImpl::removeInstance() {
 }
 
 }
-#else
-namespace llvm {
-using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() : data() { }
-ThreadLocalImpl::~ThreadLocalImpl() { }
-void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void *>(d);}
-void *ThreadLocalImpl::getInstance() { return data; }
-void ThreadLocalImpl::removeInstance() { setInstance(0); }
-}
-#endif
diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index 5de1cf071ba9..99f64b4f553d 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -18,6 +18,7 @@
 #if defined(__APPLE__)
 #include <mach/mach_init.h>
 #include <mach/mach_port.h>
+#include <pthread/qos.h>
 #endif
 
 #include <pthread.h>
@@ -258,27 +259,29 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
   //   SCHED_OTHER   the standard round-robin time-sharing policy;
   return !pthread_setschedparam(
              pthread_self(),
-             Priority == ThreadPriority::Background ? SCHED_IDLE : SCHED_OTHER,
+             // FIXME: consider SCHED_BATCH for Low
+             Priority == ThreadPriority::Default ? SCHED_OTHER : SCHED_IDLE,
              &priority)
              ? SetThreadPriorityResult::SUCCESS
              : SetThreadPriorityResult::FAILURE;
 #elif defined(__APPLE__)
-  // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/getpriority.2.html
-  // When setting a thread into background state the scheduling priority is set
-  // to lowest value, disk and network IO are throttled. Network IO will be
-  // throttled for any sockets the thread opens after going into background
-  // state. Any previously opened sockets are not affected.
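// Editor's note: the Darwin hunk below replaces the old
// setpriority(PRIO_DARWIN_THREAD, ...) call with QoS classes, which is why
// <pthread/qos.h> is now included above. A minimal sketch of the underlying
// call, assuming only the Apple pthread QoS API:

#include <pthread/qos.h>

// Ask the scheduler to treat the calling thread as background-class work;
// the second argument is a relative priority within the QoS band (0 keeps
// the default). Returns true on success (the call returns 0).
static bool demoteSelfToBackground() {
  return pthread_set_qos_class_self_np(QOS_CLASS_BACKGROUND, 0) == 0;
}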
- - // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/getiopolicy_np.3.html - // I/Os with THROTTLE policy are called THROTTLE I/Os. If a THROTTLE I/O - // request occurs within a small time window (usually a fraction of a second) - // of another NORMAL I/O request, the thread that issues the THROTTLE I/O is - // forced to sleep for a certain interval. This slows down the thread that - // issues the THROTTLE I/O so that NORMAL I/Os can utilize most of the disk - // I/O bandwidth. - return !setpriority(PRIO_DARWIN_THREAD, 0, - Priority == ThreadPriority::Background ? PRIO_DARWIN_BG - : 0) + // https://developer.apple.com/documentation/apple-silicon/tuning-your-code-s-performance-for-apple-silicon + // + // Background - Applies to work that isn’t visible to the user and may take significant + // time to complete. Examples include indexing, backing up, or synchronizing data. This + // class emphasizes energy efficiency. + // + // Utility - Applies to work that takes anywhere from a few seconds to a few minutes to + // complete. Examples include downloading a document or importing data. This class + // offers a balance between responsiveness, performance, and energy efficiency. + const auto qosClass = [&](){ + switch (Priority) { + case ThreadPriority::Background: return QOS_CLASS_BACKGROUND; + case ThreadPriority::Low: return QOS_CLASS_UTILITY; + case ThreadPriority::Default: return QOS_CLASS_DEFAULT; + } + }(); + return !pthread_set_qos_class_self_np(qosClass, 0) ? SetThreadPriorityResult::SUCCESS : SetThreadPriorityResult::FAILURE; #endif diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index f15e301874c4..21f0c39bfd6e 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -151,6 +151,10 @@ bool FileSystem::exists(const Twine &Path) { return Status && Status->exists(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void FileSystem::dump() const { print(dbgs(), PrintType::RecursiveContents); } +#endif + #ifndef NDEBUG static bool isTraversalComponent(StringRef Component) { return Component.equals("..") || Component.equals("."); @@ -273,6 +277,10 @@ public: std::error_code getRealPath(const Twine &Path, SmallVectorImpl &Output) const override; +protected: + void printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const override; + private: // If this FS has its own working dir, use it to make Path absolute. // The returned twine is safe to use as long as both Storage and Path live. 
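// Editor's note: the printImpl() overrides being added to this file give
// every filesystem a self-description for debugging, with dump() above
// printing the recursive form. A sketch, assuming the public print() entry
// point that dump() forwards to has this shape:

#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"

void describeFS(llvm::vfs::FileSystem &FS) {
  // For the FS returned by vfs::getRealFileSystem() this prints
  // "RealFileSystem using process CWD"; overlays recurse, indenting one
  // level per nested filesystem.
  FS.print(llvm::outs(), llvm::vfs::FileSystem::PrintType::RecursiveContents);
}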
@@ -354,6 +362,17 @@ RealFileSystem::getRealPath(const Twine &Path, return llvm::sys::fs::real_path(adjustPath(Path, Storage), Output); } +void RealFileSystem::printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "RealFileSystem using "; + if (WD) + OS << "own"; + else + OS << "process"; + OS << " CWD\n"; +} + IntrusiveRefCntPtr vfs::getRealFileSystem() { static IntrusiveRefCntPtr FS(new RealFileSystem(true)); return FS; @@ -459,6 +478,19 @@ OverlayFileSystem::getRealPath(const Twine &Path, return errc::no_such_file_or_directory; } +void OverlayFileSystem::printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "OverlayFileSystem\n"; + if (Type == PrintType::Summary) + return; + + if (Type == PrintType::Contents) + Type = PrintType::Summary; + for (auto FS : overlays_range()) + FS->print(OS, Type, IndentLevel + 1); +} + llvm::vfs::detail::DirIterImpl::~DirIterImpl() = default; namespace { @@ -467,28 +499,25 @@ namespace { class CombiningDirIterImpl : public llvm::vfs::detail::DirIterImpl { using FileSystemPtr = llvm::IntrusiveRefCntPtr; - /// File systems to check for entries in. Processed in reverse order. - SmallVector FSList; - /// The directory iterator for the current filesystem. + /// Iterators to combine, processed in reverse order. + SmallVector IterList; + /// The iterator currently being traversed. directory_iterator CurrentDirIter; - /// The path of the directory to iterate the entries of. - std::string DirPath; /// The set of names already returned as entries. llvm::StringSet<> SeenNames; - /// Sets \c CurrentDirIter to an iterator of \c DirPath in the next file - /// system in the list, or leaves it as is (at its end position) if we've - /// already gone through them all. - std::error_code incrementFS() { - while (!FSList.empty()) { - std::error_code EC; - CurrentDirIter = FSList.back()->dir_begin(DirPath, EC); - FSList.pop_back(); - if (EC && EC != errc::no_such_file_or_directory) - return EC; + /// Sets \c CurrentDirIter to the next iterator in the list, or leaves it as + /// is (at its end position) if we've already gone through them all. 
+ std::error_code incrementIter(bool IsFirstTime) { + while (!IterList.empty()) { + CurrentDirIter = IterList.back(); + IterList.pop_back(); if (CurrentDirIter != directory_iterator()) break; // found } + + if (IsFirstTime && CurrentDirIter == directory_iterator()) + return errc::no_such_file_or_directory; return {}; } @@ -499,7 +528,7 @@ class CombiningDirIterImpl : public llvm::vfs::detail::DirIterImpl { if (!IsFirstTime) CurrentDirIter.increment(EC); if (!EC && CurrentDirIter == directory_iterator()) - EC = incrementFS(); + EC = incrementIter(IsFirstTime); return EC; } @@ -520,23 +549,24 @@ class CombiningDirIterImpl : public llvm::vfs::detail::DirIterImpl { public: CombiningDirIterImpl(ArrayRef FileSystems, std::string Dir, - std::error_code &EC) - : FSList(FileSystems.begin(), FileSystems.end()), - DirPath(std::move(Dir)) { - if (!FSList.empty()) { - CurrentDirIter = FSList.back()->dir_begin(DirPath, EC); - FSList.pop_back(); - if (!EC || EC == errc::no_such_file_or_directory) - EC = incrementImpl(true); + std::error_code &EC) { + for (auto FS : FileSystems) { + std::error_code FEC; + directory_iterator Iter = FS->dir_begin(Dir, FEC); + if (FEC && FEC != errc::no_such_file_or_directory) { + EC = FEC; + return; + } + if (!FEC) + IterList.push_back(Iter); } + EC = incrementImpl(true); } - CombiningDirIterImpl(directory_iterator FirstIter, FileSystemPtr Fallback, - std::string FallbackDir, std::error_code &EC) - : FSList({Fallback}), CurrentDirIter(FirstIter), - DirPath(std::move(FallbackDir)) { - if (!EC || EC == errc::no_such_file_or_directory) - EC = incrementImpl(true); + CombiningDirIterImpl(ArrayRef DirIters, + std::error_code &EC) + : IterList(DirIters.begin(), DirIters.end()) { + EC = incrementImpl(true); } std::error_code increment() override { return incrementImpl(false); } @@ -546,8 +576,11 @@ public: directory_iterator OverlayFileSystem::dir_begin(const Twine &Dir, std::error_code &EC) { - return directory_iterator( + directory_iterator Combined = directory_iterator( std::make_shared(FSList, Dir.str(), EC)); + if (EC) + return {}; + return Combined; } void ProxyFileSystem::anchor() {} @@ -557,10 +590,15 @@ namespace vfs { namespace detail { -enum InMemoryNodeKind { IME_File, IME_Directory, IME_HardLink }; +enum InMemoryNodeKind { + IME_File, + IME_Directory, + IME_HardLink, + IME_SymbolicLink, +}; /// The in memory file system is a tree of Nodes. Every node can either be a -/// file , hardlink or a directory. +/// file, symlink, hardlink or a directory. class InMemoryNode { InMemoryNodeKind Kind; std::string FileName; @@ -629,6 +667,30 @@ public: } }; +class InMemorySymbolicLink : public InMemoryNode { + std::string TargetPath; + Status Stat; + +public: + InMemorySymbolicLink(StringRef Path, StringRef TargetPath, Status Stat) + : InMemoryNode(Path, IME_SymbolicLink), TargetPath(std::move(TargetPath)), + Stat(Stat) {} + + std::string toString(unsigned Indent) const override { + return std::string(Indent, ' ') + "SymbolicLink to -> " + TargetPath; + } + + Status getStatus(const Twine &RequestedName) const override { + return Status::copyWithNewName(Stat, RequestedName); + } + + StringRef getTargetPath() const { return TargetPath; } + + static bool classof(const InMemoryNode *N) { + return N->getKind() == IME_SymbolicLink; + } +}; + /// Adapt a InMemoryFile for VFS' File interface. The goal is to make /// \p InMemoryFileAdaptor mimic as much as possible the behavior of /// \p RealFile. 
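// Editor's note: with InMemorySymbolicLink above, the in-memory VFS can model
// symlinks, including chains and links that cross directories. A small usage
// sketch of the new API (paths hypothetical; addSymbolicLink() is defined
// later in this file):

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/VirtualFileSystem.h"
using namespace llvm;

void demoInMemorySymlink() {
  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> FS(new vfs::InMemoryFileSystem);
  FS->addFile("/real/a.txt", /*ModificationTime=*/0,
              MemoryBuffer::getMemBuffer("hello"));
  FS->addSymbolicLink("/link.txt", "/real/a.txt", /*ModificationTime=*/0,
                      /*User=*/None, /*Group=*/None, /*Perms=*/None);
  // status() and openFileForRead() follow the final symlink, so this
  // describes /real/a.txt; cyclic chains give up after MaxSymlinkDepth.
  ErrorOr<vfs::Status> St = FS->status("/link.txt");
  (void)St;
}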
@@ -677,7 +739,7 @@ public: UniqueID getUniqueID() const { return Stat.getUniqueID(); } - InMemoryNode *getChild(StringRef Name) { + InMemoryNode *getChild(StringRef Name) const { auto I = Entries.find(Name); if (I != Entries.end()) return I->second.get(); @@ -773,10 +835,10 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, detail::InMemoryDirectory *Dir = Root.get(); auto I = llvm::sys::path::begin(Path), E = sys::path::end(Path); - const auto ResolvedUser = User.getValueOr(0); - const auto ResolvedGroup = Group.getValueOr(0); - const auto ResolvedType = Type.getValueOr(sys::fs::file_type::regular_file); - const auto ResolvedPerms = Perms.getValueOr(sys::fs::all_all); + const auto ResolvedUser = User.value_or(0); + const auto ResolvedGroup = Group.value_or(0); + const auto ResolvedType = Type.value_or(sys::fs::file_type::regular_file); + const auto ResolvedPerms = Perms.value_or(sys::fs::all_all); // Any intermediate directories we create should be accessible by // the owner, even if Perms says otherwise for the final path. const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all; @@ -864,22 +926,23 @@ bool InMemoryFileSystem::addFileNoOwn(const Twine &P, time_t ModificationTime, }); } -static ErrorOr -lookupInMemoryNode(const InMemoryFileSystem &FS, detail::InMemoryDirectory *Dir, - const Twine &P) { +detail::NamedNodeOrError +InMemoryFileSystem::lookupNode(const Twine &P, bool FollowFinalSymlink, + size_t SymlinkDepth) const { SmallString<128> Path; P.toVector(Path); // Fix up relative paths. This just prepends the current working directory. - std::error_code EC = FS.makeAbsolute(Path); + std::error_code EC = makeAbsolute(Path); assert(!EC); (void)EC; - if (FS.useNormalizedPaths()) + if (useNormalizedPaths()) llvm::sys::path::remove_dots(Path, /*remove_dot_dot=*/true); + const detail::InMemoryDirectory *Dir = Root.get(); if (Path.empty()) - return Dir; + return detail::NamedNodeOrError(Path, Dir); auto I = llvm::sys::path::begin(Path), E = llvm::sys::path::end(Path); while (true) { @@ -888,43 +951,99 @@ lookupInMemoryNode(const InMemoryFileSystem &FS, detail::InMemoryDirectory *Dir, if (!Node) return errc::no_such_file_or_directory; + if (auto Symlink = dyn_cast(Node)) { + // If we're at the end of the path, and we're not following through + // terminal symlinks, then we're done. + if (I == E && !FollowFinalSymlink) + return detail::NamedNodeOrError(Path, Symlink); + + if (SymlinkDepth > InMemoryFileSystem::MaxSymlinkDepth) + return errc::no_such_file_or_directory; + + SmallString<128> TargetPath = Symlink->getTargetPath(); + if (std::error_code EC = makeAbsolute(TargetPath)) + return EC; + + // Keep going with the target. We always want to follow symlinks here + // because we're either at the end of a path that we want to follow, or + // not at the end of a path, in which case we need to follow the symlink + // regardless. + auto Target = + lookupNode(TargetPath, /*FollowFinalSymlink=*/true, SymlinkDepth + 1); + if (!Target || I == E) + return Target; + + if (!isa(*Target)) + return errc::no_such_file_or_directory; + + // Otherwise, continue on the search in the symlinked directory. + Dir = cast(*Target); + continue; + } + // Return the file if it's at the end of the path. if (auto File = dyn_cast(Node)) { if (I == E) - return File; + return detail::NamedNodeOrError(Path, File); return errc::no_such_file_or_directory; } // If Node is HardLink then return the resolved file. 
if (auto File = dyn_cast(Node)) { if (I == E) - return &File->getResolvedFile(); + return detail::NamedNodeOrError(Path, &File->getResolvedFile()); return errc::no_such_file_or_directory; } // Traverse directories. Dir = cast(Node); if (I == E) - return Dir; + return detail::NamedNodeOrError(Path, Dir); } } -bool InMemoryFileSystem::addHardLink(const Twine &FromPath, - const Twine &ToPath) { - auto FromNode = lookupInMemoryNode(*this, Root.get(), FromPath); - auto ToNode = lookupInMemoryNode(*this, Root.get(), ToPath); +bool InMemoryFileSystem::addHardLink(const Twine &NewLink, + const Twine &Target) { + auto NewLinkNode = lookupNode(NewLink, /*FollowFinalSymlink=*/false); + // Whether symlinks in the hardlink target are followed is + // implementation-defined in POSIX. + // We're following symlinks here to be consistent with macOS. + auto TargetNode = lookupNode(Target, /*FollowFinalSymlink=*/true); // FromPath must not have been added before. ToPath must have been added // before. Resolved ToPath must be a File. - if (!ToNode || FromNode || !isa(*ToNode)) + if (!TargetNode || NewLinkNode || !isa(*TargetNode)) return false; - return addFile(FromPath, 0, nullptr, None, None, None, None, + return addFile(NewLink, 0, nullptr, None, None, None, None, [&](detail::NewInMemoryNodeInfo NNI) { return std::make_unique( - NNI.Path.str(), *cast(*ToNode)); + NNI.Path.str(), + *cast(*TargetNode)); + }); +} + +bool InMemoryFileSystem::addSymbolicLink(const Twine &NewLink, + const Twine &Target, + time_t ModificationTime, + Optional User, + Optional Group, + Optional Perms) { + auto NewLinkNode = lookupNode(NewLink, /*FollowFinalSymlink=*/false); + if (NewLinkNode) + return false; + + SmallString<128> NewLinkStr, TargetStr; + NewLink.toVector(NewLinkStr); + Target.toVector(TargetStr); + + return addFile(NewLinkStr, ModificationTime, nullptr, User, Group, + sys::fs::file_type::symlink_file, Perms, + [&](detail::NewInMemoryNodeInfo NNI) { + return std::make_unique( + NewLinkStr, TargetStr, NNI.makeStatus()); }); } llvm::ErrorOr InMemoryFileSystem::status(const Twine &Path) { - auto Node = lookupInMemoryNode(*this, Root.get(), Path); + auto Node = lookupNode(Path, /*FollowFinalSymlink=*/true); if (Node) return (*Node)->getStatus(Path); return Node.getError(); @@ -932,7 +1051,7 @@ llvm::ErrorOr InMemoryFileSystem::status(const Twine &Path) { llvm::ErrorOr> InMemoryFileSystem::openFileForRead(const Twine &Path) { - auto Node = lookupInMemoryNode(*this, Root.get(), Path); + auto Node = lookupNode(Path,/*FollowFinalSymlink=*/true); if (!Node) return Node.getError(); @@ -946,10 +1065,9 @@ InMemoryFileSystem::openFileForRead(const Twine &Path) { return make_error_code(llvm::errc::invalid_argument); } -namespace { - /// Adaptor from InMemoryDir::iterator to directory_iterator. 
-class InMemoryDirIterator : public llvm::vfs::detail::DirIterImpl { +class InMemoryFileSystem::DirIterator : public llvm::vfs::detail::DirIterImpl { + const InMemoryFileSystem *FS; detail::InMemoryDirectory::const_iterator I; detail::InMemoryDirectory::const_iterator E; std::string RequestedDirName; @@ -967,6 +1085,13 @@ class InMemoryDirIterator : public llvm::vfs::detail::DirIterImpl { case detail::IME_Directory: Type = sys::fs::file_type::directory_file; break; + case detail::IME_SymbolicLink: + if (auto SymlinkTarget = + FS->lookupNode(Path, /*FollowFinalSymlink=*/true)) { + Path = SymlinkTarget.getName(); + Type = (*SymlinkTarget)->getStatus(Path).getType(); + } + break; } CurrentEntry = directory_entry(std::string(Path.str()), Type); } else { @@ -977,11 +1102,12 @@ class InMemoryDirIterator : public llvm::vfs::detail::DirIterImpl { } public: - InMemoryDirIterator() = default; + DirIterator() = default; - explicit InMemoryDirIterator(const detail::InMemoryDirectory &Dir, - std::string RequestedDirName) - : I(Dir.begin()), E(Dir.end()), + DirIterator(const InMemoryFileSystem *FS, + const detail::InMemoryDirectory &Dir, + std::string RequestedDirName) + : FS(FS), I(Dir.begin()), E(Dir.end()), RequestedDirName(std::move(RequestedDirName)) { setCurrentEntry(); } @@ -993,22 +1119,20 @@ public: } }; -} // namespace - directory_iterator InMemoryFileSystem::dir_begin(const Twine &Dir, std::error_code &EC) { - auto Node = lookupInMemoryNode(*this, Root.get(), Dir); + auto Node = lookupNode(Dir, /*FollowFinalSymlink=*/true); if (!Node) { EC = Node.getError(); - return directory_iterator(std::make_shared()); + return directory_iterator(std::make_shared()); } if (auto *DirNode = dyn_cast(*Node)) return directory_iterator( - std::make_shared(*DirNode, Dir.str())); + std::make_shared(this, *DirNode, Dir.str())); EC = make_error_code(llvm::errc::not_a_directory); - return directory_iterator(std::make_shared()); + return directory_iterator(std::make_shared()); } std::error_code InMemoryFileSystem::setCurrentWorkingDirectory(const Twine &P) { @@ -1046,6 +1170,12 @@ std::error_code InMemoryFileSystem::isLocal(const Twine &Path, bool &Result) { return {}; } +void InMemoryFileSystem::printImpl(raw_ostream &OS, PrintType PrintContents, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "InMemoryFileSystem\n"; +} + } // namespace vfs } // namespace llvm @@ -1079,6 +1209,14 @@ static llvm::SmallString<256> canonicalize(llvm::StringRef Path) { return result; } +/// Whether the error and entry specify a file/directory that was not found. +static bool isFileNotFound(std::error_code EC, + RedirectingFileSystem::Entry *E = nullptr) { + if (E && !isa(E)) + return false; + return EC == llvm::errc::no_such_file_or_directory; +} + } // anonymous namespace @@ -1255,49 +1393,93 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir, ErrorOr Result = lookupPath(Path); if (!Result) { - EC = Result.getError(); - if (shouldFallBackToExternalFS(EC)) + if (Redirection != RedirectKind::RedirectOnly && + isFileNotFound(Result.getError())) return ExternalFS->dir_begin(Path, EC); + + EC = Result.getError(); return {}; } // Use status to make sure the path exists and refers to a directory. 
ErrorOr S = status(Path, Dir, *Result); if (!S) { - if (shouldFallBackToExternalFS(S.getError(), Result->E)) + if (Redirection != RedirectKind::RedirectOnly && + isFileNotFound(S.getError(), Result->E)) return ExternalFS->dir_begin(Dir, EC); + EC = S.getError(); return {}; } + if (!S->isDirectory()) { - EC = std::error_code(static_cast(errc::not_a_directory), - std::system_category()); + EC = errc::not_a_directory; return {}; } // Create the appropriate directory iterator based on whether we found a // DirectoryRemapEntry or DirectoryEntry. - directory_iterator DirIter; + directory_iterator RedirectIter; + std::error_code RedirectEC; if (auto ExtRedirect = Result->getExternalRedirect()) { auto RE = cast(Result->E); - DirIter = ExternalFS->dir_begin(*ExtRedirect, EC); + RedirectIter = ExternalFS->dir_begin(*ExtRedirect, RedirectEC); if (!RE->useExternalName(UseExternalNames)) { // Update the paths in the results to use the virtual directory's path. - DirIter = + RedirectIter = directory_iterator(std::make_shared( - std::string(Path), DirIter)); + std::string(Path), RedirectIter)); } } else { auto DE = cast(Result->E); - DirIter = directory_iterator(std::make_shared( - Path, DE->contents_begin(), DE->contents_end(), EC)); + RedirectIter = + directory_iterator(std::make_shared( + Path, DE->contents_begin(), DE->contents_end(), RedirectEC)); + } + + if (RedirectEC) { + if (RedirectEC != errc::no_such_file_or_directory) { + EC = RedirectEC; + return {}; + } + RedirectIter = {}; } - if (!shouldUseExternalFS()) - return DirIter; - return directory_iterator(std::make_shared( - DirIter, ExternalFS, std::string(Path), EC)); + if (Redirection == RedirectKind::RedirectOnly) { + EC = RedirectEC; + return RedirectIter; + } + + std::error_code ExternalEC; + directory_iterator ExternalIter = ExternalFS->dir_begin(Path, ExternalEC); + if (ExternalEC) { + if (ExternalEC != errc::no_such_file_or_directory) { + EC = ExternalEC; + return {}; + } + ExternalIter = {}; + } + + SmallVector Iters; + switch (Redirection) { + case RedirectKind::Fallthrough: + Iters.push_back(ExternalIter); + Iters.push_back(RedirectIter); + break; + case RedirectKind::Fallback: + Iters.push_back(RedirectIter); + Iters.push_back(ExternalIter); + break; + default: + llvm_unreachable("unhandled RedirectKind"); + } + + directory_iterator Combined{ + std::make_shared(Iters, EC)}; + if (EC) + return {}; + return Combined; } void RedirectingFileSystem::setExternalContentsPrefixDir(StringRef PrefixDir) { @@ -1309,7 +1491,16 @@ StringRef RedirectingFileSystem::getExternalContentsPrefixDir() const { } void RedirectingFileSystem::setFallthrough(bool Fallthrough) { - IsFallthrough = Fallthrough; + if (Fallthrough) { + Redirection = RedirectingFileSystem::RedirectKind::Fallthrough; + } else { + Redirection = RedirectingFileSystem::RedirectKind::RedirectOnly; + } +} + +void RedirectingFileSystem::setRedirection( + RedirectingFileSystem::RedirectKind Kind) { + Redirection = Kind; } std::vector RedirectingFileSystem::getRoots() const { @@ -1319,34 +1510,59 @@ std::vector RedirectingFileSystem::getRoots() const { return R; } -void RedirectingFileSystem::dump(raw_ostream &OS) const { +void RedirectingFileSystem::printImpl(raw_ostream &OS, PrintType Type, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "RedirectingFileSystem (UseExternalNames: " + << (UseExternalNames ? 
"true" : "false") << ")\n"; + if (Type == PrintType::Summary) + return; + for (const auto &Root : Roots) - dumpEntry(OS, Root.get()); + printEntry(OS, Root.get(), IndentLevel); + + printIndent(OS, IndentLevel); + OS << "ExternalFS:\n"; + ExternalFS->print(OS, Type == PrintType::Contents ? PrintType::Summary : Type, + IndentLevel + 1); } -void RedirectingFileSystem::dumpEntry(raw_ostream &OS, - RedirectingFileSystem::Entry *E, - int NumSpaces) const { - StringRef Name = E->getName(); - for (int i = 0, e = NumSpaces; i < e; ++i) - OS << " "; - OS << "'" << Name.str().c_str() << "'" - << "\n"; +void RedirectingFileSystem::printEntry(raw_ostream &OS, + RedirectingFileSystem::Entry *E, + unsigned IndentLevel) const { + printIndent(OS, IndentLevel); + OS << "'" << E->getName() << "'"; - if (E->getKind() == RedirectingFileSystem::EK_Directory) { - auto *DE = dyn_cast(E); - assert(DE && "Should be a directory"); + switch (E->getKind()) { + case EK_Directory: { + auto *DE = cast(E); + OS << "\n"; for (std::unique_ptr &SubEntry : llvm::make_range(DE->contents_begin(), DE->contents_end())) - dumpEntry(OS, SubEntry.get(), NumSpaces + 2); + printEntry(OS, SubEntry.get(), IndentLevel + 1); + break; + } + case EK_DirectoryRemap: + case EK_File: { + auto *RE = cast(E); + OS << " -> '" << RE->getExternalContentsPath() << "'"; + switch (RE->getUseName()) { + case NK_NotSet: + break; + case NK_External: + OS << " (UseExternalName: true)"; + break; + case NK_Virtual: + OS << " (UseExternalName: false)"; + break; + } + OS << "\n"; + break; + } } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RedirectingFileSystem::dump() const { dump(dbgs()); } -#endif - /// A helper class to hold the common YAML parsing state. class llvm::vfs::RedirectingFileSystemParser { yaml::Stream &Stream; @@ -1388,6 +1604,23 @@ class llvm::vfs::RedirectingFileSystemParser { return false; } + Optional + parseRedirectKind(yaml::Node *N) { + SmallString<12> Storage; + StringRef Value; + if (!parseScalarString(N, Value, Storage)) + return None; + + if (Value.equals_insensitive("fallthrough")) { + return RedirectingFileSystem::RedirectKind::Fallthrough; + } else if (Value.equals_insensitive("fallback")) { + return RedirectingFileSystem::RedirectKind::Fallback; + } else if (Value.equals_insensitive("redirect-only")) { + return RedirectingFileSystem::RedirectKind::RedirectOnly; + } + return None; + } + struct KeyStatus { bool Required; bool Seen = false; @@ -1731,6 +1964,7 @@ public: KeyStatusPair("use-external-names", false), KeyStatusPair("overlay-relative", false), KeyStatusPair("fallthrough", false), + KeyStatusPair("redirecting-with", false), KeyStatusPair("roots", true), }; @@ -1789,8 +2023,34 @@ public: if (!parseScalarBool(I.getValue(), FS->UseExternalNames)) return false; } else if (Key == "fallthrough") { - if (!parseScalarBool(I.getValue(), FS->IsFallthrough)) + if (Keys["redirecting-with"].Seen) { + error(I.getValue(), + "'fallthrough' and 'redirecting-with' are mutually exclusive"); + return false; + } + + bool ShouldFallthrough = false; + if (!parseScalarBool(I.getValue(), ShouldFallthrough)) + return false; + + if (ShouldFallthrough) { + FS->Redirection = RedirectingFileSystem::RedirectKind::Fallthrough; + } else { + FS->Redirection = RedirectingFileSystem::RedirectKind::RedirectOnly; + } + } else if (Key == "redirecting-with") { + if (Keys["fallthrough"].Seen) { + error(I.getValue(), + "'fallthrough' and 'redirecting-with' are mutually exclusive"); + return false; + } + + if (auto Kind = 
parseRedirectKind(I.getValue())) { + FS->Redirection = *Kind; + } else { + error(I.getValue(), "expected valid redirect kind"); return false; + } } else { llvm_unreachable("key missing from Keys"); } @@ -1923,13 +2183,6 @@ RedirectingFileSystem::LookupResult::LookupResult( } } -bool RedirectingFileSystem::shouldFallBackToExternalFS( - std::error_code EC, RedirectingFileSystem::Entry *E) const { - if (E && !isa(E)) - return false; - return shouldUseExternalFS() && EC == llvm::errc::no_such_file_or_directory; -} - std::error_code RedirectingFileSystem::makeCanonical(SmallVectorImpl &Path) const { if (std::error_code EC = makeAbsolute(Path)) @@ -2001,9 +2254,16 @@ RedirectingFileSystem::lookupPathImpl( static Status getRedirectedFileStatus(const Twine &OriginalPath, bool UseExternalNames, Status ExternalStatus) { + // The path has been mapped by some nested VFS and exposes an external path, + // don't override it with the original path. + if (ExternalStatus.ExposesExternalVFSPath) + return ExternalStatus; + Status S = ExternalStatus; if (!UseExternalNames) S = Status::copyWithNewName(S, OriginalPath); + else + S.ExposesExternalVFSPath = true; S.IsVFSMapped = true; return S; } @@ -2032,11 +2292,13 @@ ErrorOr RedirectingFileSystem::status( ErrorOr RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath, const Twine &OriginalPath) const { - if (auto Result = ExternalFS->status(CanonicalPath)) { - return Result.get().copyWithNewName(Result.get(), OriginalPath); - } else { - return Result.getError(); - } + auto Result = ExternalFS->status(CanonicalPath); + + // The path has been mapped by some nested VFS, don't override it with the + // original path. + if (!Result || Result->ExposesExternalVFSPath) + return Result; + return Status::copyWithNewName(Result.get(), OriginalPath); } ErrorOr RedirectingFileSystem::status(const Twine &OriginalPath) { @@ -2046,17 +2308,31 @@ ErrorOr RedirectingFileSystem::status(const Twine &OriginalPath) { if (std::error_code EC = makeCanonical(CanonicalPath)) return EC; + if (Redirection == RedirectKind::Fallback) { + // Attempt to find the original file first, only falling back to the + // mapped file if that fails. + ErrorOr S = getExternalStatus(CanonicalPath, OriginalPath); + if (S) + return S; + } + ErrorOr Result = lookupPath(CanonicalPath); if (!Result) { - if (shouldFallBackToExternalFS(Result.getError())) { + // Was not able to map file, fallthrough to using the original path if + // that was the specified redirection type. + if (Redirection == RedirectKind::Fallthrough && + isFileNotFound(Result.getError())) return getExternalStatus(CanonicalPath, OriginalPath); - } return Result.getError(); } ErrorOr S = status(CanonicalPath, OriginalPath, *Result); - if (!S && shouldFallBackToExternalFS(S.getError(), Result->E)) { + if (!S && Redirection == RedirectKind::Fallthrough && + isFileNotFound(S.getError(), Result->E)) { + // Mapped the file but it wasn't found in the underlying filesystem, + // fallthrough to using the original path if that was the specified + // redirection type. return getExternalStatus(CanonicalPath, OriginalPath); } @@ -2092,7 +2368,9 @@ public: ErrorOr> File::getWithPath(ErrorOr> Result, const Twine &P) { - if (!Result) + // See \c getRedirectedFileStatus - don't update path if it's exposing an + // external path. 
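// Editor's note: the 'redirecting-with' key parsed above makes all three
// RedirectKinds spellable in overlay YAML, and is mutually exclusive with
// the older boolean 'fallthrough' key. A minimal overlay using it (all
// paths hypothetical):
//
//   {
//     'version': 0,
//     'redirecting-with': 'fallback',
//     'roots': [
//       { 'name': '/vfs/include', 'type': 'directory-remap',
//         'external-contents': '/real/include' }
//     ]
//   }
//
// With 'fallback' the original (external) path is tried first and the
// mapping is consulted only if that lookup fails; 'fallthrough' keeps the
// historical order; 'redirect-only' never consults the external FS.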
+ if (!Result || (*Result)->status()->ExposesExternalVFSPath) return Result; ErrorOr> F = std::move(*Result); @@ -2110,13 +2388,24 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) { if (std::error_code EC = makeCanonical(CanonicalPath)) return EC; + if (Redirection == RedirectKind::Fallback) { + // Attempt to find the original file first, only falling back to the + // mapped file if that fails. + auto F = File::getWithPath(ExternalFS->openFileForRead(CanonicalPath), + OriginalPath); + if (F) + return F; + } + ErrorOr Result = lookupPath(CanonicalPath); if (!Result) { - if (shouldFallBackToExternalFS(Result.getError())) + // Was not able to map file, fallthrough to using the original path if + // that was the specified redirection type. + if (Redirection == RedirectKind::Fallthrough && + isFileNotFound(Result.getError())) return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath), OriginalPath); - return Result.getError(); } @@ -2133,9 +2422,14 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) { auto ExternalFile = File::getWithPath( ExternalFS->openFileForRead(CanonicalRemappedPath), ExtRedirect); if (!ExternalFile) { - if (shouldFallBackToExternalFS(ExternalFile.getError(), Result->E)) + if (Redirection == RedirectKind::Fallthrough && + isFileNotFound(ExternalFile.getError(), Result->E)) { + // Mapped the file but it wasn't found in the underlying filesystem, + // fallthrough to using the original path if that was the specified + // redirection type. return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath), OriginalPath); + } return ExternalFile; } @@ -2143,7 +2437,8 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) { if (!ExternalStatus) return ExternalStatus.getError(); - // FIXME: Update the status with the name and VFSMapped. + // Otherwise, the file was successfully remapped. Mark it as such. Also + // replace the underlying path if the external name is being used. Status S = getRedirectedFileStatus( OriginalPath, RE->useExternalName(UseExternalNames), *ExternalStatus); return std::unique_ptr( @@ -2151,18 +2446,30 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) { } std::error_code -RedirectingFileSystem::getRealPath(const Twine &Path_, +RedirectingFileSystem::getRealPath(const Twine &OriginalPath, SmallVectorImpl &Output) const { - SmallString<256> Path; - Path_.toVector(Path); + SmallString<256> CanonicalPath; + OriginalPath.toVector(CanonicalPath); - if (std::error_code EC = makeCanonical(Path)) + if (std::error_code EC = makeCanonical(CanonicalPath)) return EC; - ErrorOr Result = lookupPath(Path); + if (Redirection == RedirectKind::Fallback) { + // Attempt to find the original file first, only falling back to the + // mapped file if that fails. + std::error_code EC = ExternalFS->getRealPath(CanonicalPath, Output); + if (!EC) + return EC; + } + + ErrorOr Result = + lookupPath(CanonicalPath); if (!Result) { - if (shouldFallBackToExternalFS(Result.getError())) - return ExternalFS->getRealPath(Path, Output); + // Was not able to map file, fallthrough to using the original path if + // that was the specified redirection type. + if (Redirection == RedirectKind::Fallthrough && + isFileNotFound(Result.getError())) + return ExternalFS->getRealPath(CanonicalPath, Output); return Result.getError(); } @@ -2170,16 +2477,21 @@ RedirectingFileSystem::getRealPath(const Twine &Path_, // path in the external file system. 
if (auto ExtRedirect = Result->getExternalRedirect()) { auto P = ExternalFS->getRealPath(*ExtRedirect, Output); - if (!P && shouldFallBackToExternalFS(P, Result->E)) { - return ExternalFS->getRealPath(Path, Output); + if (P && Redirection == RedirectKind::Fallthrough && + isFileNotFound(P, Result->E)) { + // Mapped the file but it wasn't found in the underlying filesystem, + // fallthrough to using the original path if that was the specified + // redirection type. + return ExternalFS->getRealPath(CanonicalPath, Output); } return P; } - // If we found a DirectoryEntry, still fall back to ExternalFS if allowed, - // because directories don't have a single external contents path. - return shouldUseExternalFS() ? ExternalFS->getRealPath(Path, Output) - : llvm::errc::invalid_argument; + // If we found a DirectoryEntry, still fallthrough to the original path if + // allowed, because directories don't have a single external contents path. + if (Redirection == RedirectKind::Fallthrough) + return ExternalFS->getRealPath(CanonicalPath, Output); + return llvm::errc::invalid_argument; } std::unique_ptr @@ -2355,14 +2667,14 @@ void JSONWriter::write(ArrayRef Entries, OS << "{\n" " 'version': 0,\n"; - if (IsCaseSensitive.hasValue()) + if (IsCaseSensitive) OS << " 'case-sensitive': '" << (IsCaseSensitive.getValue() ? "true" : "false") << "',\n"; - if (UseExternalNames.hasValue()) + if (UseExternalNames) OS << " 'use-external-names': '" << (UseExternalNames.getValue() ? "true" : "false") << "',\n"; bool UseOverlayRelative = false; - if (IsOverlayRelative.hasValue()) { + if (IsOverlayRelative) { UseOverlayRelative = IsOverlayRelative.getValue(); OS << " 'overlay-relative': '" << (UseOverlayRelative ? "true" : "false") << "',\n"; diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 5f1a364ea1a8..433c62900a3f 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -130,7 +130,7 @@ namespace fs { const file_t kInvalidFile = INVALID_HANDLE_VALUE; -std::string getMainExecutable(const char *argv0, void *MainExecAddr) { +std::string getMainExecutableImpl(const char *argv0, void *MainExecAddr) { SmallVector PathName; PathName.resize_for_overwrite(PathName.capacity()); DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.size()); diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc index dfaab1613de1..b0c55a77bc93 100644 --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -156,9 +156,10 @@ static std::error_code WildcardExpand(StringRef Arg, // Don't expand Arg if it does not contain any wildcard characters. This is // the common case. Also don't wildcard expand /?. Always treat it as an - // option. + // option. Paths that start with \\?\ are absolute paths, and aren't + // expected to be used with wildcard expressions. if (Arg.find_first_of("*?") == StringRef::npos || Arg == "/?" || - Arg == "-?") { + Arg == "-?" 
|| Arg.startswith("\\\\?\\")) { Args.push_back(Arg.data()); return EC; } @@ -247,7 +248,7 @@ windows::GetCommandLineArguments(SmallVectorImpl &Args, SmallVector TmpArgs; StringSaver Saver(Alloc); - cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); + cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); for (const char *Arg : TmpArgs) { EC = WildcardExpand(Arg, Args, Saver); @@ -255,6 +256,9 @@ windows::GetCommandLineArguments(SmallVectorImpl &Args, return EC; } + if (Args.size() == 0) + return std::make_error_code(std::errc::invalid_argument); + SmallVector Arg0(Args[0], Args[0] + strlen(Args[0])); SmallVector Filename; sys::path::remove_filename(Arg0); diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index ee633411584f..58de140a60d1 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Errc.h" @@ -18,12 +19,12 @@ #include "llvm/Support/Windows/WindowsSupport.h" #include "llvm/Support/WindowsError.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include #include #include +#include //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only Win32 specific code diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 32186bbe5160..32477de5184b 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -159,6 +159,10 @@ static fpSymInitialize fSymInitialize; typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID); static fpEnumerateLoadedModules fEnumerateLoadedModules; +static bool isDebugHelpInitialized() { + return fStackWalk64 && fSymInitialize && fSymSetOptions && fMiniDumpWriteDump; +} + static bool load64BitDebugHelp(void) { HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll"); if (hLib) { @@ -181,7 +185,7 @@ static bool load64BitDebugHelp(void) { fEnumerateLoadedModules = (fpEnumerateLoadedModules) ::GetProcAddress(hLib, "EnumerateLoadedModules64"); } - return fStackWalk64 && fSymInitialize && fSymSetOptions && fMiniDumpWriteDump; + return isDebugHelpInitialized(); } using namespace llvm; @@ -296,6 +300,12 @@ static bool findModulesAndOffsets(void **StackTrace, int Depth, static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, HANDLE hThread, STACKFRAME64 &StackFrame, CONTEXT *Context) { + // It's possible that DbgHelp.dll hasn't been loaded yet (e.g. if this + // function is called before the main program called `llvm::InitLLVM`). + // In this case just return, not stacktrace will be printed. + if (!isDebugHelpInitialized()) + return; + // Initialize the symbol handler. fSymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_LOAD_LINES); fSymInitialize(hProcess, NULL, TRUE); @@ -327,24 +337,24 @@ static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, OS << format("0x%08lX", static_cast(PC)); #endif -// Print the parameters. Assume there are four. 
-#if defined(_M_X64) || defined(_M_ARM64) - OS << format(" (0x%016llX 0x%016llX 0x%016llX 0x%016llX)", - StackFrame.Params[0], StackFrame.Params[1], StackFrame.Params[2], - StackFrame.Params[3]); -#elif defined(_M_IX86) || defined(_M_ARM) - OS << format(" (0x%08lX 0x%08lX 0x%08lX 0x%08lX)", - static_cast(StackFrame.Params[0]), - static_cast(StackFrame.Params[1]), - static_cast(StackFrame.Params[2]), - static_cast(StackFrame.Params[3])); -#endif // Verify the PC belongs to a module in this process. if (!fSymGetModuleBase64(hProcess, PC)) { OS << " \n"; continue; } + IMAGEHLP_MODULE64 M; + memset(&M, 0, sizeof(IMAGEHLP_MODULE64)); + M.SizeOfStruct = sizeof(IMAGEHLP_MODULE64); + if (fSymGetModuleInfo64(hProcess, fSymGetModuleBase64(hProcess, PC), &M)) { + DWORD64 const disp = PC - M.BaseOfImage; + OS << format(", %s(0x%016llX) + 0x%llX byte(s)", + static_cast(M.ImageName), M.BaseOfImage, + static_cast(disp)); + } else { + OS << ", "; + } + // Print the symbol name. char buffer[512]; IMAGEHLP_SYMBOL64 *symbol = reinterpret_cast(buffer); @@ -359,20 +369,16 @@ static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, } buffer[511] = 0; - if (dwDisp > 0) - OS << format(", %s() + 0x%llX bytes(s)", (const char*)symbol->Name, - dwDisp); - else - OS << format(", %s", (const char*)symbol->Name); + OS << format(", %s() + 0x%llX byte(s)", static_cast(symbol->Name), + static_cast(dwDisp)); // Print the source file and line number information. IMAGEHLP_LINE64 line = {}; DWORD dwLineDisp; line.SizeOfStruct = sizeof(line); if (fSymGetLineFromAddr64(hProcess, PC, &dwLineDisp, &line)) { - OS << format(", %s, line %lu", line.FileName, line.LineNumber); - if (dwLineDisp > 0) - OS << format(" + 0x%lX byte(s)", dwLineDisp); + OS << format(", %s, line %lu + 0x%lX byte(s)", line.FileName, + line.LineNumber, dwLineDisp); } OS << '\n'; @@ -811,6 +817,12 @@ void sys::CleanupOnSignal(uintptr_t Context) { static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) { Cleanup(true); + // Write out the exception code. + if (ep && ep->ExceptionRecord) + llvm::errs() << format("Exception Code: 0x%08X", + ep->ExceptionRecord->ExceptionCode) + << "\n"; + // We'll automatically write a Minidump file here to help diagnose // the nasty sorts of crashes that aren't 100% reproducible from a set of // inputs (or in the event that the user is unable or unwilling to provide a diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index 7b48ca8fb1fb..11f34817dbbf 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -27,8 +27,8 @@ namespace llvm { HANDLE llvm_execute_on_thread_impl(unsigned(__stdcall *ThreadFunc)(void *), void *Arg, llvm::Optional StackSizeInBytes) { - HANDLE hThread = (HANDLE)::_beginthreadex( - NULL, StackSizeInBytes.getValueOr(0), ThreadFunc, Arg, 0, NULL); + HANDLE hThread = (HANDLE)::_beginthreadex(NULL, StackSizeInBytes.value_or(0), + ThreadFunc, Arg, 0, NULL); if (!hThread) { ReportLastErrorFatal("_beginthreadex failed"); @@ -120,8 +120,10 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { // End background processing mode. The system restores the resource scheduling // priorities of the thread as they were before the thread entered background // processing mode. + // + // FIXME: consider THREAD_PRIORITY_BELOW_NORMAL for Low return SetThreadPriority(GetCurrentThread(), - Priority == ThreadPriority::Background + Priority != ThreadPriority::Default ? 
THREAD_MODE_BACKGROUND_BEGIN : THREAD_MODE_BACKGROUND_END) ? SetThreadPriorityResult::SUCCESS diff --git a/llvm/lib/Support/WithColor.cpp b/llvm/lib/Support/WithColor.cpp index b1aa709862d8..abc9fb3e5d60 100644 --- a/llvm/lib/Support/WithColor.cpp +++ b/llvm/lib/Support/WithColor.cpp @@ -33,6 +33,14 @@ struct CreateUseColor { static ManagedStatic, CreateUseColor> UseColor; void llvm::initWithColorOptions() { *UseColor; } +static bool DefaultAutoDetectFunction(const raw_ostream &OS) { + return *UseColor == cl::BOU_UNSET ? OS.has_colors() + : *UseColor == cl::BOU_TRUE; +} + +WithColor::AutoDetectFunctionType WithColor::AutoDetectFunction = + DefaultAutoDetectFunction; + WithColor::WithColor(raw_ostream &OS, HighlightColor Color, ColorMode Mode) : OS(OS), Mode(Mode) { // Detect color from terminal type unless the user passed the --color option. @@ -127,8 +135,7 @@ bool WithColor::colorsEnabled() { case ColorMode::Disable: return false; case ColorMode::Auto: - return *UseColor == cl::BOU_UNSET ? OS.has_colors() - : *UseColor == cl::BOU_TRUE; + return AutoDetectFunction(OS); } llvm_unreachable("All cases handled above."); } @@ -159,3 +166,12 @@ void WithColor::defaultWarningHandler(Error Warning) { WithColor::warning() << Info.message() << '\n'; }); } + +WithColor::AutoDetectFunctionType WithColor::defaultAutoDetectFunction() { + return DefaultAutoDetectFunction; +} + +void WithColor::setAutoDetectFunction( + AutoDetectFunctionType NewAutoDetectFunction) { + AutoDetectFunction = NewAutoDetectFunction; +} diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index 200261d3ed5c..578ce228079b 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -392,6 +392,9 @@ private: /// Pos is whitespace or a new line bool isBlankOrBreak(StringRef::iterator Position); + /// Return true if the line is a line break, false otherwise. + bool isLineEmpty(StringRef Line); + /// Consume a single b-break[28] if it's present at the current position. /// /// Return false if the code unit at the current position isn't a line break. @@ -470,6 +473,18 @@ private: /// Scan a block scalar starting with | or >. bool scanBlockScalar(bool IsLiteral); + /// Scan a block scalar style indicator and header. + /// + /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that + /// YAML does not consider the style indicator to be a part of the header. + /// + /// Return false if an error occurred. + bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, + unsigned &IndentIndicator, bool &IsDone); + + /// Scan a style indicator in a block scalar header. + char scanBlockStyleIndicator(); + /// Scan a chomping indicator in a block scalar header. 
char scanBlockChompingIndicator(); @@ -1034,6 +1049,13 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) { *Position == '\n'; } +bool Scanner::isLineEmpty(StringRef Line) { + for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) + if (!isBlankOrBreak(Position)) + return false; + return true; +} + bool Scanner::consumeLineBreakIfPresent() { auto Next = skip_b_break(Current); if (Next == Current) @@ -1516,6 +1538,25 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) { return true; } +bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, + char &ChompingIndicator, + unsigned &IndentIndicator, + bool &IsDone) { + StyleIndicator = scanBlockStyleIndicator(); + if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) + return false; + return true; +} + +char Scanner::scanBlockStyleIndicator() { + char Indicator = ' '; + if (Current != End && (*Current == '>' || *Current == '|')) { + Indicator = *Current; + skip(1); + } + return Indicator; +} + char Scanner::scanBlockChompingIndicator() { char Indicator = ' '; if (Current != End && (*Current == '+' || *Current == '-')) { @@ -1654,19 +1695,19 @@ bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, } bool Scanner::scanBlockScalar(bool IsLiteral) { - // Eat '|' or '>' assert(*Current == '|' || *Current == '>'); - skip(1); - + char StyleIndicator; char ChompingIndicator; unsigned BlockIndent; bool IsDone = false; - if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) + if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent, + IsDone)) return false; if (IsDone) return true; + bool IsFolded = StyleIndicator == '>'; - auto Start = Current; + const auto *Start = Current; unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; unsigned LineBreaks = 0; if (BlockIndent == 0) { @@ -1687,6 +1728,22 @@ bool Scanner::scanBlockScalar(bool IsLiteral) { auto LineStart = Current; advanceWhile(&Scanner::skip_nb_char); if (LineStart != Current) { + if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) { + // The folded style "folds" any single line break between content into a + // single space, except when that content is "empty" (only contains + // whitespace) in which case the line break is left as-is. + if (LineBreaks == 1) { + Str.append(LineBreaks, + isLineEmpty(StringRef(LineStart, Current - LineStart)) + ? '\n' + : ' '); + } + // If we saw a single line break, we are completely replacing it and so + // want `LineBreaks == 0`. Otherwise this decrement accounts for the + // fact that the first line break is "trimmed", only being used to + // signal a sequence of line breaks which should not be folded. 
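// Editor's note: the folding logic added here means a '>' block scalar joins
// adjacent non-empty lines with a space, while a blank line still produces a
// newline. A sketch against the yaml::Stream API, assuming BlockScalarNode
// exposes the decoded value via getValue():

#include "llvm/Support/Casting.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
using namespace llvm;

void demoFoldedScalar() {
  SourceMgr SM;
  yaml::Stream Stream("--- >\n  one\n  two\n\n  three\n", SM);
  yaml::Document &Doc = *Stream.begin();
  if (auto *BS = dyn_cast_or_null<yaml::BlockScalarNode>(Doc.getRoot())) {
    // Expected folded result: "one two\nthree\n" -- the single break becomes
    // a space, the blank line survives as a newline, and default (clip)
    // chomping keeps one trailing '\n'.
    StringRef Folded = BS->getValue();
    (void)Folded;
  }
}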
+ LineBreaks--; + } Str.append(LineBreaks, '\n'); Str.append(StringRef(LineStart, Current - LineStart)); LineBreaks = 0; @@ -1840,11 +1897,11 @@ bool Scanner::fetchMoreTokens() { Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, std::error_code *EC) - : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} + : scanner(new Scanner(Input, SM, ShowColors, EC)) {} Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, std::error_code *EC) - : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} + : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} Stream::~Stream() = default; diff --git a/llvm/lib/Support/Z3Solver.cpp b/llvm/lib/Support/Z3Solver.cpp index 9485536d1312..b49d8d2afbb3 100644 --- a/llvm/lib/Support/Z3Solver.cpp +++ b/llvm/lib/Support/Z3Solver.cpp @@ -6,16 +6,18 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" #include "llvm/Support/SMTAPI.h" -#include using namespace llvm; #if LLVM_WITH_Z3 +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Twine.h" + +#include + #include namespace { diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 69d4fe96bee8..98ceea3c3c7a 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -408,7 +408,7 @@ raw_ostream &raw_ostream::operator<<(const FormattedBytes &FB) { const size_t Size = Bytes.size(); HexPrintStyle HPS = FB.Upper ? HexPrintStyle::Upper : HexPrintStyle::Lower; uint64_t OffsetWidth = 0; - if (FB.FirstByteOffset.hasValue()) { + if (FB.FirstByteOffset) { // Figure out how many nibbles are needed to print the largest offset // represented by this data set, so that we can align the offset field // to the right width. @@ -428,7 +428,7 @@ raw_ostream &raw_ostream::operator<<(const FormattedBytes &FB) { while (!Bytes.empty()) { indent(FB.IndentLevel); - if (FB.FirstByteOffset.hasValue()) { + if (FB.FirstByteOffset) { uint64_t Offset = FB.FirstByteOffset.getValue(); llvm::write_hex(*this, Offset + LineIndex, HPS, OffsetWidth); *this << ": "; diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index ee2a1d87a267..24d01121820b 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -249,10 +249,10 @@ static char nuls[10]; /* place to point scanner in event of error */ */ #define PEEK() (*p->next) #define PEEK2() (*(p->next+1)) -#define MORE() (p->next < p->end) -#define MORE2() (p->next+1 < p->end) +#define MORE() (p->end - p->next > 0) +#define MORE2() (p->end - p->next > 1) #define SEE(c) (MORE() && PEEK() == (c)) -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? 
(NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -800,15 +800,17 @@ p_bracket(struct parse *p) int invert = 0; /* Dept of Truly Sickening Special-Case Kludges */ - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { - EMIT(OBOW, 0); - NEXTn(6); - return; - } - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { - EMIT(OEOW, 0); - NEXTn(6); - return; + if (p->end - p->next > 5) { + if (strncmp(p->next, "[:<:]]", 6) == 0) { + EMIT(OBOW, 0); + NEXTn(6); + return; + } + if (strncmp(p->next, "[:>:]]", 6) == 0) { + EMIT(OEOW, 0); + NEXTn(6); + return; + } } if ((cs = allocset(p)) == NULL) { diff --git a/llvm/lib/Support/regengine.inc b/llvm/lib/Support/regengine.inc index 41787aff1242..02680e23ddb8 100644 --- a/llvm/lib/Support/regengine.inc +++ b/llvm/lib/Support/regengine.inc @@ -53,6 +53,7 @@ #define at sat #define match smat #define nope snope +#define step_back sstep_back #endif #ifdef LNAMES #define matcher lmatcher @@ -65,6 +66,7 @@ #define at lat #define match lmat #define nope lnope +#define step_back lstep_back #endif /* another structure passed up and down to avoid zillions of parameters */ @@ -288,6 +290,38 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, return(0); } +/* Step back from "stop" to a position where the strip startst..stopst might + * match. This can always conservatively return "stop - 1", but may return an + * earlier position if matches at later positions are impossible. */ +static const char * +step_back(struct re_guts *g, const char *start, const char *stop, sopno startst, + sopno stopst) +{ + /* Always step back at least one character. */ + assert(stop > start); + const char *res = stop - 1; + + /* Check whether the strip startst..stropst starts with a fixed character, + * ignoring any closing parentheses. If not, return a conservative result. */ + for (;;) { + if (startst >= stopst) + return res; + if (OP(g->strip[startst]) != ORPAREN) + break; + startst++; + } + if (OP(g->strip[startst]) != OCHAR) + return res; + + /* Find the character that starts the following match. */ + char ch = OPND(g->strip[startst]); + for (; res != start; --res) { + if (*res == ch) + break; + } + return res; +} + /* - dissect - figure out what matched what, no back references */ @@ -358,7 +392,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ - stp = rest - 1; + stp = step_back(m->g, sp, rest, es, stopst); assert(stp >= sp); /* it did work */ } ssub = ss + 1; @@ -383,7 +417,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, if (tail == stop) break; /* yes! 
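// Editor's note: the new TableGenParseFile() entry point above lets library
// clients run the TableGen parser over an existing SourceMgr without going
// through TableGenMain(). A hedged usage sketch (buffer name and contents
// hypothetical):

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Parser.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;

static bool parseTableGenBuffer(StringRef Text, RecordKeeper &Records) {
  SourceMgr SrcMgr;
  SrcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(Text, "demo.td"),
                            SMLoc());
  // Returns true on error, mirroring TGParser::ParseFile(); on success the
  // parsed classes and defs are available through Records, and the source
  // buffers are handed back to SrcMgr as the comment above describes.
  return TableGenParseFile(SrcMgr, Records);
}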
*/ /* no -- try a shorter match for this one */ - stp = rest - 1; + stp = step_back(m->g, sp, rest, es, stopst); assert(stp >= sp); /* it did work */ } ssub = ss + 1; @@ -1032,3 +1066,4 @@ pchar(int ch) #undef at #undef match #undef nope +#undef step_back diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp index e9dceed2c4ae..9a3f5faa336b 100644 --- a/llvm/lib/Support/xxhash.cpp +++ b/llvm/lib/Support/xxhash.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/Endian.h" #include -#include using namespace llvm; using namespace support; diff --git a/llvm/lib/TableGen/Error.cpp b/llvm/lib/TableGen/Error.cpp index 6104573b4b25..ebe9129ebaeb 100644 --- a/llvm/lib/TableGen/Error.cpp +++ b/llvm/lib/TableGen/Error.cpp @@ -157,8 +157,8 @@ void PrintFatalError(const RecordVal *RecVal, const Twine &Msg) { // Check an assertion: Obtain the condition value and be sure it is true. // If not, print a nonfatal error along with the message. void CheckAssert(SMLoc Loc, Init *Condition, Init *Message) { - auto *CondValue = dyn_cast_or_null( - Condition->convertInitializerTo(IntRecTy::get())); + auto *CondValue = dyn_cast_or_null(Condition->convertInitializerTo( + IntRecTy::get(Condition->getRecordKeeper()))); if (!CondValue) PrintError(Loc, "assert condition must of type bit, bits, or int."); else if (!CondValue->getValue()) { diff --git a/llvm/lib/TableGen/Parser.cpp b/llvm/lib/TableGen/Parser.cpp new file mode 100644 index 000000000000..818ded19432b --- /dev/null +++ b/llvm/lib/TableGen/Parser.cpp @@ -0,0 +1,39 @@ +//===- Parser.cpp - Top-Level TableGen Parser implementation --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/TableGen/Parser.h" +#include "TGParser.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/TableGen/Record.h" + +using namespace llvm; + +bool llvm::TableGenParseFile(SourceMgr &InputSrcMgr, RecordKeeper &Records) { + // Initialize the global TableGen source manager by temporarily taking control + // of the input buffer in `SrcMgr`. This is kind of a hack, but allows for + // preserving TableGen's current awkward diagnostic behavior. If we can remove + // this reliance, we could drop all of this. + SrcMgr = SourceMgr(); + SrcMgr.takeSourceBuffersFrom(InputSrcMgr); + SrcMgr.setIncludeDirs(InputSrcMgr.getIncludeDirs()); + SrcMgr.setDiagHandler(InputSrcMgr.getDiagHandler(), + InputSrcMgr.getDiagContext()); + + // Setup the record keeper and try to parse the file. + auto *MainFileBuffer = SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID()); + Records.saveInputFilename(MainFileBuffer->getBufferIdentifier().str()); + + TGParser Parser(SrcMgr, /*Macros=*/None, Records); + bool ParseResult = Parser.ParseFile(); + + // After parsing, reclaim the source manager buffers from TableGen's global + // manager. 
+  InputSrcMgr.takeSourceBuffersFrom(SrcMgr);
+  SrcMgr = SourceMgr();
+  return ParseResult;
+}
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 58d8c9936896..6c205104d569 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -24,7 +24,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
@@ -46,14 +45,17 @@ using namespace llvm;
 namespace llvm {
 namespace detail {
-/// This class contains all of the contextual static state of the Record
-/// classes. This allows for better lifetime management and control of the used
-/// static data.
-struct RecordContext {
-  RecordContext()
-      : AnyRecord(0), TrueBitInit(true, &SharedBitRecTy),
+/// This class represents the internal implementation of the RecordKeeper.
+/// It contains all of the contextual static state of the Record classes. It is
+/// kept out-of-line to simplify dependencies, and also to make it easier for
+/// internal classes to access the uniquer state of the keeper.
+struct RecordKeeperImpl {
+  RecordKeeperImpl(RecordKeeper &RK)
+      : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK),
+        SharedDagRecTy(RK), AnyRecord(RK, 0), TheUnsetInit(RK),
+        TrueBitInit(true, &SharedBitRecTy),
         FalseBitInit(false, &SharedBitRecTy), StringInitStringPool(Allocator),
-        StringInitCodePool(Allocator), LastRecordID(0) {}
+        StringInitCodePool(Allocator), AnonCounter(0), LastRecordID(0) {}
 
   BumpPtrAllocator Allocator;
   std::vector<BitsRecTy *> SharedBitsRecTys;
@@ -77,6 +79,7 @@ struct RecordContext {
   FoldingSet<TernOpInit> TheTernOpInitPool;
   FoldingSet<FoldOpInit> TheFoldOpInitPool;
   FoldingSet<IsAOpInit> TheIsAOpInitPool;
+  FoldingSet<ExistsOpInit> TheExistsOpInitPool;
   DenseMap<std::pair<RecTy *, Init *>, VarInit *> TheVarInitPool;
   DenseMap<std::pair<TypedInit *, unsigned>, VarBitInit *> TheVarBitInitPool;
   DenseMap<std::pair<TypedInit *, unsigned>, VarListElementInit *>
@@ -85,14 +88,14 @@ struct RecordContext {
   DenseMap<std::pair<Init *, StringInit *>, FieldInit *> TheFieldInitPool;
   FoldingSet<CondOpInit> TheCondOpInitPool;
   FoldingSet<DagInit> TheDagInitPool;
+  FoldingSet<RecordRecTy> RecordTypePool;
 
+  unsigned AnonCounter;
   unsigned LastRecordID;
 };
 } // namespace detail
 } // namespace llvm
 
-ManagedStatic<detail::RecordContext> Context;
-
 //===----------------------------------------------------------------------===//
 // Type implementations
 //===----------------------------------------------------------------------===//
@@ -103,7 +106,7 @@ LLVM_DUMP_METHOD void RecTy::dump() const { print(errs()); }
 
 ListRecTy *RecTy::getListTy() {
   if (!ListTy)
-    ListTy = new(Context->Allocator) ListRecTy(this);
+    ListTy = new (RK.getImpl().Allocator) ListRecTy(this);
   return ListTy;
 }
 
@@ -114,7 +117,9 @@ bool RecTy::typeIsConvertibleTo(const RecTy *RHS) const {
 
 bool RecTy::typeIsA(const RecTy *RHS) const { return this == RHS; }
 
-BitRecTy *BitRecTy::get() { return &Context->SharedBitRecTy; }
+BitRecTy *BitRecTy::get(RecordKeeper &RK) {
+  return &RK.getImpl().SharedBitRecTy;
+}
 
 bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{
   if (RecTy::typeIsConvertibleTo(RHS) || RHS->getRecTyKind() == IntRecTyKind)
@@ -124,12 +129,13 @@ bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{
   return false;
 }
 
-BitsRecTy *BitsRecTy::get(unsigned Sz) {
-  if (Sz >= Context->SharedBitsRecTys.size())
-    Context->SharedBitsRecTys.resize(Sz + 1);
-  BitsRecTy *&Ty = Context->SharedBitsRecTys[Sz];
+BitsRecTy *BitsRecTy::get(RecordKeeper &RK, unsigned Sz) {
+  detail::RecordKeeperImpl &RKImpl = RK.getImpl();
+  if (Sz >= RKImpl.SharedBitsRecTys.size())
+
+    RKImpl.SharedBitsRecTys.resize(Sz + 1);
+  BitsRecTy *&Ty = RKImpl.SharedBitsRecTys[Sz];
   if (!Ty)
-    Ty = new (Context->Allocator) BitsRecTy(Sz);
+    Ty = new (RKImpl.Allocator) BitsRecTy(RK, Sz);
   return Ty;
 }
 
@@ -150,14 +156,18 @@ bool BitsRecTy::typeIsA(const RecTy *RHS) const {
   return false;
 }
 
-IntRecTy *IntRecTy::get() { return &Context->SharedIntRecTy; }
+IntRecTy *IntRecTy::get(RecordKeeper &RK) {
+  return &RK.getImpl().SharedIntRecTy;
+}
 
 bool IntRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
   RecTyKind kind = RHS->getRecTyKind();
   return kind==BitRecTyKind || kind==BitsRecTyKind || kind==IntRecTyKind;
 }
 
-StringRecTy *StringRecTy::get() { return &Context->SharedStringRecTy; }
+StringRecTy *StringRecTy::get(RecordKeeper &RK) {
+  return &RK.getImpl().SharedStringRecTy;
+}
 
 std::string StringRecTy::getAsString() const {
   return "string";
@@ -184,7 +194,9 @@ bool ListRecTy::typeIsA(const RecTy *RHS) const {
   return false;
 }
 
-DagRecTy *DagRecTy::get() { return &Context->SharedDagRecTy; }
+DagRecTy *DagRecTy::get(RecordKeeper &RK) {
+  return &RK.getImpl().SharedDagRecTy;
+}
 
 std::string DagRecTy::getAsString() const {
   return "dag";
@@ -197,12 +209,13 @@ static void ProfileRecordRecTy(FoldingSetNodeID &ID,
   ID.AddPointer(R);
 }
 
-RecordRecTy *RecordRecTy::get(ArrayRef<Record *> UnsortedClasses) {
+RecordRecTy *RecordRecTy::get(RecordKeeper &RK,
+                              ArrayRef<Record *> UnsortedClasses) {
+  detail::RecordKeeperImpl &RKImpl = RK.getImpl();
   if (UnsortedClasses.empty())
-    return &Context->AnyRecord;
+    return &RKImpl.AnyRecord;
 
-  FoldingSet<RecordRecTy> &ThePool =
-      UnsortedClasses[0]->getRecords().RecordTypePool;
+  FoldingSet<RecordRecTy> &ThePool = RKImpl.RecordTypePool;
 
   SmallVector<Record *, 4> Classes(UnsortedClasses.begin(),
                                    UnsortedClasses.end());
@@ -227,14 +240,18 @@ RecordRecTy *RecordRecTy::get(ArrayRef<Record *> UnsortedClasses) {
   }
 #endif
 
-  void *Mem = Context->Allocator.Allocate(
+  void *Mem = RKImpl.Allocator.Allocate(
       totalSizeToAlloc<Record *>(Classes.size()), alignof(RecordRecTy));
-  RecordRecTy *Ty = new(Mem) RecordRecTy(Classes.size());
+  RecordRecTy *Ty = new (Mem) RecordRecTy(RK, Classes.size());
   std::uninitialized_copy(Classes.begin(), Classes.end(),
                           Ty->getTrailingObjects<Record *>());
   ThePool.InsertNode(Ty, IP);
   return Ty;
 }
+RecordRecTy *RecordRecTy::get(Record *Class) {
+  assert(Class && "unexpected null class");
+  return get(Class->getRecords(), Class);
+}
 
 void RecordRecTy::Profile(FoldingSetNodeID &ID) const {
   ProfileRecordRecTy(ID, getClasses());
@@ -294,7 +311,7 @@ static RecordRecTy *resolveRecordTypes(RecordRecTy *T1, RecordRecTy *T2) {
     }
   }
 
-  return RecordRecTy::get(CommonSuperClasses);
+  return RecordRecTy::get(T1->getRecordKeeper(), CommonSuperClasses);
 }
 
 RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
@@ -333,7 +350,15 @@ void Init::anchor() {}
 LLVM_DUMP_METHOD void Init::dump() const { return print(errs()); }
 #endif
 
-UnsetInit *UnsetInit::get() { return &Context->TheUnsetInit; }
+RecordKeeper &Init::getRecordKeeper() const {
+  if (auto *TyInit = dyn_cast<TypedInit>(this))
+    return TyInit->getType()->getRecordKeeper();
+  return cast<UnsetInit>(this)->getRecordKeeper();
+}
+
+UnsetInit *UnsetInit::get(RecordKeeper &RK) {
+  return &RK.getImpl().TheUnsetInit;
+}
 
 Init *UnsetInit::getCastTo(RecTy *Ty) const {
   return const_cast<UnsetInit *>(this);
 }
 
@@ -343,8 +368,8 @@ Init *UnsetInit::convertInitializerTo(RecTy *Ty) const {
   return const_cast<UnsetInit *>(this);
 }
 
-BitInit *BitInit::get(bool V) {
-  return V ? &Context->TrueBitInit : &Context->FalseBitInit;
+BitInit *BitInit::get(RecordKeeper &RK, bool V) {
+  return V ?
&RK.getImpl().TrueBitInit : &RK.getImpl().FalseBitInit; } Init *BitInit::convertInitializerTo(RecTy *Ty) const { @@ -352,12 +377,12 @@ Init *BitInit::convertInitializerTo(RecTy *Ty) const { return const_cast(this); if (isa(Ty)) - return IntInit::get(getValue()); + return IntInit::get(getRecordKeeper(), getValue()); if (auto *BRT = dyn_cast(Ty)) { // Can only convert single bit. if (BRT->getNumBits() == 1) - return BitsInit::get(const_cast(this)); + return BitsInit::get(getRecordKeeper(), const_cast(this)); } return nullptr; @@ -371,20 +396,21 @@ ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef Range) { ID.AddPointer(I); } -BitsInit *BitsInit::get(ArrayRef Range) { +BitsInit *BitsInit::get(RecordKeeper &RK, ArrayRef Range) { FoldingSetNodeID ID; ProfileBitsInit(ID, Range); + detail::RecordKeeperImpl &RKImpl = RK.getImpl(); void *IP = nullptr; - if (BitsInit *I = Context->TheBitsInitPool.FindNodeOrInsertPos(ID, IP)) + if (BitsInit *I = RKImpl.TheBitsInitPool.FindNodeOrInsertPos(ID, IP)) return I; - void *Mem = Context->Allocator.Allocate( - totalSizeToAlloc(Range.size()), alignof(BitsInit)); - BitsInit *I = new(Mem) BitsInit(Range.size()); + void *Mem = RKImpl.Allocator.Allocate(totalSizeToAlloc(Range.size()), + alignof(BitsInit)); + BitsInit *I = new (Mem) BitsInit(RK, Range.size()); std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); - Context->TheBitsInitPool.InsertNode(I, IP); + RKImpl.TheBitsInitPool.InsertNode(I, IP); return I; } @@ -412,7 +438,7 @@ Init *BitsInit::convertInitializerTo(RecTy *Ty) const { Result |= static_cast(Bit->getValue()) << i; else return nullptr; - return IntInit::get(Result); + return IntInit::get(getRecordKeeper(), Result); } return nullptr; @@ -427,7 +453,7 @@ BitsInit::convertInitializerBitRange(ArrayRef Bits) const { return nullptr; NewBits[i] = getBit(Bits[i]); } - return BitsInit::get(NewBits); + return BitsInit::get(getRecordKeeper(), NewBits); } bool BitsInit::isConcrete() const { @@ -482,15 +508,15 @@ Init *BitsInit::resolveReferences(Resolver &R) const { } if (Changed) - return BitsInit::get(NewBits); + return BitsInit::get(getRecordKeeper(), NewBits); return const_cast(this); } -IntInit *IntInit::get(int64_t V) { - IntInit *&I = Context->TheIntInitPool[V]; +IntInit *IntInit::get(RecordKeeper &RK, int64_t V) { + IntInit *&I = RK.getImpl().TheIntInitPool[V]; if (!I) - I = new (Context->Allocator) IntInit(V); + I = new (RK.getImpl().Allocator) IntInit(RK, V); return I; } @@ -511,7 +537,7 @@ Init *IntInit::convertInitializerTo(RecTy *Ty) const { if (isa(Ty)) { int64_t Val = getValue(); if (Val != 0 && Val != 1) return nullptr; // Only accept 0 or 1 for a bit! - return BitInit::get(Val != 0); + return BitInit::get(getRecordKeeper(), Val != 0); } if (auto *BRT = dyn_cast(Ty)) { @@ -522,9 +548,10 @@ Init *IntInit::convertInitializerTo(RecTy *Ty) const { SmallVector NewBits(BRT->getNumBits()); for (unsigned i = 0; i != BRT->getNumBits(); ++i) - NewBits[i] = BitInit::get(Value & ((i < 64) ? (1LL << i) : 0)); + NewBits[i] = + BitInit::get(getRecordKeeper(), Value & ((i < 64) ? 
(1LL << i) : 0)); - return BitsInit::get(NewBits); + return BitsInit::get(getRecordKeeper(), NewBits); } return nullptr; @@ -538,17 +565,18 @@ IntInit::convertInitializerBitRange(ArrayRef Bits) const { if (Bits[i] >= 64) return nullptr; - NewBits[i] = BitInit::get(Value & (INT64_C(1) << Bits[i])); + NewBits[i] = + BitInit::get(getRecordKeeper(), Value & (INT64_C(1) << Bits[i])); } - return BitsInit::get(NewBits); + return BitsInit::get(getRecordKeeper(), NewBits); } -AnonymousNameInit *AnonymousNameInit::get(unsigned V) { - return new (Context->Allocator) AnonymousNameInit(V); +AnonymousNameInit *AnonymousNameInit::get(RecordKeeper &RK, unsigned V) { + return new (RK.getImpl().Allocator) AnonymousNameInit(RK, V); } StringInit *AnonymousNameInit::getNameInit() const { - return StringInit::get(getAsString()); + return StringInit::get(getRecordKeeper(), getAsString()); } std::string AnonymousNameInit::getAsString() const { @@ -565,12 +593,13 @@ Init *AnonymousNameInit::resolveReferences(Resolver &R) const { return New; } -StringInit *StringInit::get(StringRef V, StringFormat Fmt) { - auto &InitMap = Fmt == SF_String ? Context->StringInitStringPool - : Context->StringInitCodePool; +StringInit *StringInit::get(RecordKeeper &RK, StringRef V, StringFormat Fmt) { + detail::RecordKeeperImpl &RKImpl = RK.getImpl(); + auto &InitMap = Fmt == SF_String ? RKImpl.StringInitStringPool + : RKImpl.StringInitCodePool; auto &Entry = *InitMap.insert(std::make_pair(V, nullptr)).first; if (!Entry.second) - Entry.second = new (Context->Allocator) StringInit(Entry.getKey(), Fmt); + Entry.second = new (RKImpl.Allocator) StringInit(RK, Entry.getKey(), Fmt); return Entry.second; } @@ -595,19 +624,20 @@ ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { FoldingSetNodeID ID; ProfileListInit(ID, Range, EltTy); + detail::RecordKeeperImpl &RK = EltTy->getRecordKeeper().getImpl(); void *IP = nullptr; - if (ListInit *I = Context->TheListInitPool.FindNodeOrInsertPos(ID, IP)) + if (ListInit *I = RK.TheListInitPool.FindNodeOrInsertPos(ID, IP)) return I; assert(Range.empty() || !isa(Range[0]) || cast(Range[0])->getType()->typeIsConvertibleTo(EltTy)); - void *Mem = Context->Allocator.Allocate( - totalSizeToAlloc(Range.size()), alignof(ListInit)); + void *Mem = RK.Allocator.Allocate(totalSizeToAlloc(Range.size()), + alignof(ListInit)); ListInit *I = new (Mem) ListInit(Range.size(), EltTy); std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); - Context->TheListInitPool.InsertNode(I, IP); + RK.TheListInitPool.InsertNode(I, IP); return I; } @@ -714,7 +744,7 @@ std::string ListInit::getAsString() const { } Init *OpInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get()) + if (getType() == BitRecTy::get(getRecordKeeper())) return const_cast(this); return VarBitInit::get(const_cast(this), Bit); } @@ -730,12 +760,13 @@ UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) { FoldingSetNodeID ID; ProfileUnOpInit(ID, Opc, LHS, Type); + detail::RecordKeeperImpl &RK = Type->getRecordKeeper().getImpl(); void *IP = nullptr; - if (UnOpInit *I = Context->TheUnOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (UnOpInit *I = RK.TheUnOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - UnOpInit *I = new (Context->Allocator) UnOpInit(Opc, LHS, Type); - Context->TheUnOpInitPool.InsertNode(I, IP); + UnOpInit *I = new (RK.Allocator) UnOpInit(Opc, LHS, Type); + RK.TheUnOpInitPool.InsertNode(I, IP); return I; } @@ -744,6 +775,7 @@ void UnOpInit::Profile(FoldingSetNodeID &ID) const { } Init 
*UnOpInit::Fold(Record *CurRec, bool IsFinal) const { + RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { case CAST: if (isa(getType())) { @@ -751,11 +783,11 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { return LHSs; if (DefInit *LHSd = dyn_cast(LHS)) - return StringInit::get(LHSd->getAsString()); + return StringInit::get(RK, LHSd->getAsString()); - if (IntInit *LHSi = - dyn_cast_or_null(LHS->convertInitializerTo(IntRecTy::get()))) - return StringInit::get(LHSi->getAsString()); + if (IntInit *LHSi = dyn_cast_or_null( + LHS->convertInitializerTo(IntRecTy::get(RK)))) + return StringInit::get(RK, LHSi->getAsString()); } else if (isa(getType())) { if (StringInit *Name = dyn_cast(LHS)) { @@ -800,9 +832,9 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { break; case NOT: - if (IntInit *LHSi = - dyn_cast_or_null(LHS->convertInitializerTo(IntRecTy::get()))) - return IntInit::get(LHSi->getValue() ? 0 : 1); + if (IntInit *LHSi = dyn_cast_or_null( + LHS->convertInitializerTo(IntRecTy::get(RK)))) + return IntInit::get(RK, LHSi->getValue() ? 0 : 1); break; case HEAD: @@ -823,20 +855,20 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { case SIZE: if (ListInit *LHSl = dyn_cast(LHS)) - return IntInit::get(LHSl->size()); + return IntInit::get(RK, LHSl->size()); if (DagInit *LHSd = dyn_cast(LHS)) - return IntInit::get(LHSd->arg_size()); + return IntInit::get(RK, LHSd->arg_size()); if (StringInit *LHSs = dyn_cast(LHS)) - return IntInit::get(LHSs->getValue().size()); + return IntInit::get(RK, LHSs->getValue().size()); break; case EMPTY: if (ListInit *LHSl = dyn_cast(LHS)) - return IntInit::get(LHSl->empty()); + return IntInit::get(RK, LHSl->empty()); if (DagInit *LHSd = dyn_cast(LHS)) - return IntInit::get(LHSd->arg_empty()); + return IntInit::get(RK, LHSd->arg_empty()); if (StringInit *LHSs = dyn_cast(LHS)) - return IntInit::get(LHSs->getValue().empty()); + return IntInit::get(RK, LHSs->getValue().empty()); break; case GETDAGOP: @@ -893,12 +925,13 @@ BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, Init *RHS, RecTy *Type) { FoldingSetNodeID ID; ProfileBinOpInit(ID, Opc, LHS, RHS, Type); + detail::RecordKeeperImpl &RK = LHS->getRecordKeeper().getImpl(); void *IP = nullptr; - if (BinOpInit *I = Context->TheBinOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (BinOpInit *I = RK.TheBinOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - BinOpInit *I = new (Context->Allocator) BinOpInit(Opc, LHS, RHS, Type); - Context->TheBinOpInitPool.InsertNode(I, IP); + BinOpInit *I = new (RK.Allocator) BinOpInit(Opc, LHS, RHS, Type); + RK.TheBinOpInitPool.InsertNode(I, IP); return I; } @@ -910,15 +943,15 @@ static StringInit *ConcatStringInits(const StringInit *I0, const StringInit *I1) { SmallString<80> Concat(I0->getValue()); Concat.append(I1->getValue()); - return StringInit::get(Concat, - StringInit::determineFormat(I0->getFormat(), - I1->getFormat())); + return StringInit::get( + I0->getRecordKeeper(), Concat, + StringInit::determineFormat(I0->getFormat(), I1->getFormat())); } static StringInit *interleaveStringList(const ListInit *List, const StringInit *Delim) { if (List->size() == 0) - return StringInit::get(""); + return StringInit::get(List->getRecordKeeper(), ""); StringInit *Element = dyn_cast(List->getElement(0)); if (!Element) return nullptr; @@ -933,30 +966,29 @@ static StringInit *interleaveStringList(const ListInit *List, Result.append(Element->getValue()); Fmt = StringInit::determineFormat(Fmt, Element->getFormat()); } - return StringInit::get(Result, Fmt); + 
return StringInit::get(List->getRecordKeeper(), Result, Fmt); } static StringInit *interleaveIntList(const ListInit *List, const StringInit *Delim) { + RecordKeeper &RK = List->getRecordKeeper(); if (List->size() == 0) - return StringInit::get(""); - IntInit *Element = - dyn_cast_or_null(List->getElement(0) - ->convertInitializerTo(IntRecTy::get())); + return StringInit::get(RK, ""); + IntInit *Element = dyn_cast_or_null( + List->getElement(0)->convertInitializerTo(IntRecTy::get(RK))); if (!Element) return nullptr; SmallString<80> Result(Element->getAsString()); for (unsigned I = 1, E = List->size(); I < E; ++I) { Result.append(Delim->getValue()); - IntInit *Element = - dyn_cast_or_null(List->getElement(I) - ->convertInitializerTo(IntRecTy::get())); + IntInit *Element = dyn_cast_or_null( + List->getElement(I)->convertInitializerTo(IntRecTy::get(RK))); if (!Element) return nullptr; Result.append(Element->getAsString()); } - return StringInit::get(Result); + return StringInit::get(RK, Result); } Init *BinOpInit::getStrConcat(Init *I0, Init *I1) { @@ -964,7 +996,8 @@ Init *BinOpInit::getStrConcat(Init *I0, Init *I1) { if (const StringInit *I0s = dyn_cast(I0)) if (const StringInit *I1s = dyn_cast(I1)) return ConcatStringInits(I0s, I1s); - return BinOpInit::get(BinOpInit::STRCONCAT, I0, I1, StringRecTy::get()); + return BinOpInit::get(BinOpInit::STRCONCAT, I0, I1, + StringRecTy::get(I0->getRecordKeeper())); } static ListInit *ConcatListInits(const ListInit *LHS, @@ -1003,7 +1036,7 @@ Init *BinOpInit::Fold(Record *CurRec) const { } Init *Op = LOp ? LOp : ROp; if (!Op) - Op = UnsetInit::get(); + Op = UnsetInit::get(getRecordKeeper()); SmallVector Args; SmallVector ArgNames; @@ -1067,10 +1100,10 @@ Init *BinOpInit::Fold(Record *CurRec) const { case GE: case GT: { // First see if we have two bit, bits, or int. - IntInit *LHSi = - dyn_cast_or_null(LHS->convertInitializerTo(IntRecTy::get())); - IntInit *RHSi = - dyn_cast_or_null(RHS->convertInitializerTo(IntRecTy::get())); + IntInit *LHSi = dyn_cast_or_null( + LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); + IntInit *RHSi = dyn_cast_or_null( + RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); if (LHSi && RHSi) { bool Result; @@ -1083,7 +1116,7 @@ Init *BinOpInit::Fold(Record *CurRec) const { case GT: Result = LHSi->getValue() > RHSi->getValue(); break; default: llvm_unreachable("unhandled comparison"); } - return BitInit::get(Result); + return BitInit::get(getRecordKeeper(), Result); } // Next try strings. @@ -1101,7 +1134,7 @@ Init *BinOpInit::Fold(Record *CurRec) const { case GT: Result = LHSs->getValue() > RHSs->getValue(); break; default: llvm_unreachable("unhandled comparison"); } - return BitInit::get(Result); + return BitInit::get(getRecordKeeper(), Result); } // Finally, !eq and !ne can be used with records. @@ -1109,8 +1142,8 @@ Init *BinOpInit::Fold(Record *CurRec) const { DefInit *LHSd = dyn_cast(LHS); DefInit *RHSd = dyn_cast(RHS); if (LHSd && RHSd) - return BitInit::get((getOpcode() == EQ) ? LHSd == RHSd - : LHSd != RHSd); + return BitInit::get(getRecordKeeper(), + (getOpcode() == EQ) ? 
LHSd == RHSd : LHSd != RHSd); } break; @@ -1138,10 +1171,10 @@ Init *BinOpInit::Fold(Record *CurRec) const { case SHL: case SRA: case SRL: { - IntInit *LHSi = - dyn_cast_or_null(LHS->convertInitializerTo(IntRecTy::get())); - IntInit *RHSi = - dyn_cast_or_null(RHS->convertInitializerTo(IntRecTy::get())); + IntInit *LHSi = dyn_cast_or_null( + LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); + IntInit *RHSi = dyn_cast_or_null( + RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); if (LHSi && RHSi) { int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue(); int64_t Result; @@ -1157,7 +1190,7 @@ Init *BinOpInit::Fold(Record *CurRec) const { case SRA: Result = LHSv >> RHSv; break; case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break; } - return IntInit::get(Result); + return IntInit::get(getRecordKeeper(), Result); } break; } @@ -1218,12 +1251,13 @@ TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS, FoldingSetNodeID ID; ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type); + detail::RecordKeeperImpl &RK = LHS->getRecordKeeper().getImpl(); void *IP = nullptr; - if (TernOpInit *I = Context->TheTernOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (TernOpInit *I = RK.TheTernOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - TernOpInit *I = new (Context->Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type); - Context->TheTernOpInitPool.InsertNode(I, IP); + TernOpInit *I = new (RK.Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type); + RK.TheTernOpInitPool.InsertNode(I, IP); return I; } @@ -1296,8 +1330,9 @@ static Init *FilterHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, Init *Include = ItemApply(LHS, Item, RHS, CurRec); if (!Include) return nullptr; - if (IntInit *IncludeInt = dyn_cast_or_null( - Include->convertInitializerTo(IntRecTy::get()))) { + if (IntInit *IncludeInt = + dyn_cast_or_null(Include->convertInitializerTo( + IntRecTy::get(LHS->getRecordKeeper())))) { if (IncludeInt->getValue()) NewList.push_back(Item); } else { @@ -1311,6 +1346,7 @@ static Init *FilterHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, } Init *TernOpInit::Fold(Record *CurRec) const { + RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { case SUBST: { DefInit *LHSd = dyn_cast(LHS); @@ -1351,7 +1387,7 @@ Init *TernOpInit::Fold(Record *CurRec) const { idx = found + MHSs->getValue().size(); } - return StringInit::get(Val); + return StringInit::get(RK, Val); } break; } @@ -1370,7 +1406,7 @@ Init *TernOpInit::Fold(Record *CurRec) const { case IF: { if (IntInit *LHSi = dyn_cast_or_null( - LHS->convertInitializerTo(IntRecTy::get()))) { + LHS->convertInitializerTo(IntRecTy::get(RK)))) { if (LHSi->getValue()) return MHS; return RHS; @@ -1391,8 +1427,8 @@ Init *TernOpInit::Fold(Record *CurRec) const { SmallVector, 8> Children; unsigned Size = MHSl ? MHSl->size() : RHSl->size(); for (unsigned i = 0; i != Size; ++i) { - Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get(); - Init *Name = RHSl ? RHSl->getElement(i) : UnsetInit::get(); + Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get(RK); + Init *Name = RHSl ? 
RHSl->getElement(i) : UnsetInit::get(RK); if (!isa(Name) && !isa(Name)) return const_cast(this); Children.emplace_back(Node, dyn_cast(Name)); @@ -1417,7 +1453,7 @@ Init *TernOpInit::Fold(Record *CurRec) const { std::to_string(Start)); if (Length < 0) PrintError(CurRec->getLoc(), "!substr length must be nonnegative"); - return StringInit::get(LHSs->getValue().substr(Start, Length), + return StringInit::get(RK, LHSs->getValue().substr(Start, Length), LHSs->getFormat()); } break; @@ -1437,8 +1473,8 @@ Init *TernOpInit::Fold(Record *CurRec) const { std::to_string(Start)); auto I = LHSs->getValue().find(MHSs->getValue(), Start); if (I == std::string::npos) - return IntInit::get(-1); - return IntInit::get(I); + return IntInit::get(RK, -1); + return IntInit::get(RK, I); } break; } @@ -1452,7 +1488,7 @@ Init *TernOpInit::resolveReferences(Resolver &R) const { if (getOpcode() == IF && lhs != LHS) { if (IntInit *Value = dyn_cast_or_null( - lhs->convertInitializerTo(IntRecTy::get()))) { + lhs->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) { // Short-circuit if (Value->getValue()) return MHS->resolveReferences(R); @@ -1506,17 +1542,16 @@ static void ProfileFoldOpInit(FoldingSetNodeID &ID, Init *Start, Init *List, FoldOpInit *FoldOpInit::get(Init *Start, Init *List, Init *A, Init *B, Init *Expr, RecTy *Type) { - FoldingSetNodeID ID; ProfileFoldOpInit(ID, Start, List, A, B, Expr, Type); + detail::RecordKeeperImpl &RK = Start->getRecordKeeper().getImpl(); void *IP = nullptr; - if (FoldOpInit *I = Context->TheFoldOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (FoldOpInit *I = RK.TheFoldOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - FoldOpInit *I = - new (Context->Allocator) FoldOpInit(Start, List, A, B, Expr, Type); - Context->TheFoldOpInitPool.InsertNode(I, IP); + FoldOpInit *I = new (RK.Allocator) FoldOpInit(Start, List, A, B, Expr, Type); + RK.TheFoldOpInitPool.InsertNode(I, IP); return I; } @@ -1575,12 +1610,13 @@ IsAOpInit *IsAOpInit::get(RecTy *CheckType, Init *Expr) { FoldingSetNodeID ID; ProfileIsAOpInit(ID, CheckType, Expr); + detail::RecordKeeperImpl &RK = Expr->getRecordKeeper().getImpl(); void *IP = nullptr; - if (IsAOpInit *I = Context->TheIsAOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (IsAOpInit *I = RK.TheIsAOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - IsAOpInit *I = new (Context->Allocator) IsAOpInit(CheckType, Expr); - Context->TheIsAOpInitPool.InsertNode(I, IP); + IsAOpInit *I = new (RK.Allocator) IsAOpInit(CheckType, Expr); + RK.TheIsAOpInitPool.InsertNode(I, IP); return I; } @@ -1592,17 +1628,17 @@ Init *IsAOpInit::Fold() const { if (TypedInit *TI = dyn_cast(Expr)) { // Is the expression type known to be (a subclass of) the desired type? if (TI->getType()->typeIsConvertibleTo(CheckType)) - return IntInit::get(1); + return IntInit::get(getRecordKeeper(), 1); if (isa(CheckType)) { // If the target type is not a subclass of the expression type, or if // the expression has fully resolved to a record, we know that it can't // be of the required type. if (!CheckType->typeIsConvertibleTo(TI->getType()) || isa(Expr)) - return IntInit::get(0); + return IntInit::get(getRecordKeeper(), 0); } else { // We treat non-record types as not castable. 
-      return IntInit::get(0);
+      return IntInit::get(getRecordKeeper(), 0);
     }
   }
   return const_cast<IsAOpInit *>(this);
@@ -1625,6 +1661,81 @@ std::string IsAOpInit::getAsString() const {
       .str();
 }
 
+static void ProfileExistsOpInit(FoldingSetNodeID &ID, RecTy *CheckType,
+                                Init *Expr) {
+  ID.AddPointer(CheckType);
+  ID.AddPointer(Expr);
+}
+
+ExistsOpInit *ExistsOpInit::get(RecTy *CheckType, Init *Expr) {
+  FoldingSetNodeID ID;
+  ProfileExistsOpInit(ID, CheckType, Expr);
+
+  detail::RecordKeeperImpl &RK = Expr->getRecordKeeper().getImpl();
+  void *IP = nullptr;
+  if (ExistsOpInit *I = RK.TheExistsOpInitPool.FindNodeOrInsertPos(ID, IP))
+    return I;
+
+  ExistsOpInit *I = new (RK.Allocator) ExistsOpInit(CheckType, Expr);
+  RK.TheExistsOpInitPool.InsertNode(I, IP);
+  return I;
+}
+
+void ExistsOpInit::Profile(FoldingSetNodeID &ID) const {
+  ProfileExistsOpInit(ID, CheckType, Expr);
+}
+
+Init *ExistsOpInit::Fold(Record *CurRec, bool IsFinal) const {
+  if (StringInit *Name = dyn_cast<StringInit>(Expr)) {
+    if (!CurRec && !IsFinal)
+      return const_cast<ExistsOpInit *>(this);
+
+    // Self-references are allowed, but their resolution is delayed until
+    // the final resolve to ensure that we get the correct type for them.
+    auto *Anonymous = dyn_cast<AnonymousNameInit>(CurRec->getNameInit());
+    if (Name == CurRec->getNameInit() ||
+        (Anonymous && Name == Anonymous->getNameInit())) {
+      if (!IsFinal)
+        return const_cast<ExistsOpInit *>(this);
+
+      // No doubt that there exists a record, so we should check if types are
+      // compatible.
+      return IntInit::get(getRecordKeeper(),
+                          CurRec->getType()->typeIsA(CheckType));
+    }
+
+    // Look up all defined records to see if we can find one.
+    Record *D = CheckType->getRecordKeeper().getDef(Name->getValue());
+    if (!D) {
+      if (IsFinal)
+        return IntInit::get(getRecordKeeper(), 0);
+      return const_cast<ExistsOpInit *>(this);
+    }
+
+    // Check if types are compatible.
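+    // typeIsA() requires the found def's type to be (a subtype of) the
+    // requested check type, so !exists folds to 0 for defs of unrelated
+    // classes rather than reporting an error.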
+ return IntInit::get(getRecordKeeper(), + DefInit::get(D)->getType()->typeIsA(CheckType)); + } + return const_cast(this); +} + +Init *ExistsOpInit::resolveReferences(Resolver &R) const { + Init *NewExpr = Expr->resolveReferences(R); + if (Expr != NewExpr || R.isFinal()) + return get(CheckType, NewExpr)->Fold(R.getCurrentRecord(), R.isFinal()); + return const_cast(this); +} + +Init *ExistsOpInit::getBit(unsigned Bit) const { + return VarBitInit::get(const_cast(this), Bit); +} + +std::string ExistsOpInit::getAsString() const { + return (Twine("!exists<") + CheckType->getAsString() + ">(" + + Expr->getAsString() + ")") + .str(); +} + RecTy *TypedInit::getFieldType(StringInit *FieldName) const { if (RecordRecTy *RecordType = dyn_cast(getType())) { for (Record *Rec : RecordType->getClasses()) { @@ -1642,7 +1753,7 @@ TypedInit::convertInitializerTo(RecTy *Ty) const { if (isa(getType()) && isa(Ty) && cast(Ty)->getNumBits() == 1) - return BitsInit::get({const_cast(this)}); + return BitsInit::get(getRecordKeeper(), {const_cast(this)}); return nullptr; } @@ -1660,7 +1771,7 @@ Init *TypedInit::convertInitializerBitRange(ArrayRef Bits) const { NewBits.push_back(VarBitInit::get(const_cast(this), Bit)); } - return BitsInit::get(NewBits); + return BitsInit::get(getRecordKeeper(), NewBits); } Init *TypedInit::getCastTo(RecTy *Ty) const { @@ -1698,14 +1809,15 @@ Init *TypedInit::convertInitListSlice(ArrayRef Elements) const { VarInit *VarInit::get(StringRef VN, RecTy *T) { - Init *Value = StringInit::get(VN); + Init *Value = StringInit::get(T->getRecordKeeper(), VN); return VarInit::get(Value, T); } VarInit *VarInit::get(Init *VN, RecTy *T) { - VarInit *&I = Context->TheVarInitPool[std::make_pair(T, VN)]; + detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl(); + VarInit *&I = RK.TheVarInitPool[std::make_pair(T, VN)]; if (!I) - I = new (Context->Allocator) VarInit(VN, T); + I = new (RK.Allocator) VarInit(VN, T); return I; } @@ -1715,7 +1827,7 @@ StringRef VarInit::getName() const { } Init *VarInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get()) + if (getType() == BitRecTy::get(getRecordKeeper())) return const_cast(this); return VarBitInit::get(const_cast(this), Bit); } @@ -1727,9 +1839,10 @@ Init *VarInit::resolveReferences(Resolver &R) const { } VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) { - VarBitInit *&I = Context->TheVarBitInitPool[std::make_pair(T, B)]; + detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl(); + VarBitInit *&I = RK.TheVarBitInitPool[std::make_pair(T, B)]; if (!I) - I = new(Context->Allocator) VarBitInit(T, B); + I = new (RK.Allocator) VarBitInit(T, B); return I; } @@ -1746,10 +1859,10 @@ Init *VarBitInit::resolveReferences(Resolver &R) const { } VarListElementInit *VarListElementInit::get(TypedInit *T, unsigned E) { - VarListElementInit *&I = - Context->TheVarListElementInitPool[std::make_pair(T, E)]; + detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl(); + VarListElementInit *&I = RK.TheVarListElementInitPool[std::make_pair(T, E)]; if (!I) - I = new (Context->Allocator) VarListElementInit(T, E); + I = new (RK.Allocator) VarListElementInit(T, E); return I; } @@ -1771,7 +1884,7 @@ Init *VarListElementInit::resolveReferences(Resolver &R) const { } Init *VarListElementInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get()) + if (getType() == BitRecTy::get(getRecordKeeper())) return const_cast(this); return VarBitInit::get(const_cast(this), Bit); } @@ -1808,20 +1921,25 @@ static void ProfileVarDefInit(FoldingSetNodeID 
&ID, ID.AddPointer(I); } +VarDefInit::VarDefInit(Record *Class, unsigned N) + : TypedInit(IK_VarDefInit, RecordRecTy::get(Class)), Class(Class), + NumArgs(N) {} + VarDefInit *VarDefInit::get(Record *Class, ArrayRef Args) { FoldingSetNodeID ID; ProfileVarDefInit(ID, Class, Args); + detail::RecordKeeperImpl &RK = Class->getRecords().getImpl(); void *IP = nullptr; - if (VarDefInit *I = Context->TheVarDefInitPool.FindNodeOrInsertPos(ID, IP)) + if (VarDefInit *I = RK.TheVarDefInitPool.FindNodeOrInsertPos(ID, IP)) return I; - void *Mem = Context->Allocator.Allocate(totalSizeToAlloc(Args.size()), - alignof(VarDefInit)); + void *Mem = RK.Allocator.Allocate(totalSizeToAlloc(Args.size()), + alignof(VarDefInit)); VarDefInit *I = new (Mem) VarDefInit(Class, Args.size()); std::uninitialized_copy(Args.begin(), Args.end(), I->getTrailingObjects()); - Context->TheVarDefInitPool.InsertNode(I, IP); + RK.TheVarDefInitPool.InsertNode(I, IP); return I; } @@ -1927,14 +2045,15 @@ std::string VarDefInit::getAsString() const { } FieldInit *FieldInit::get(Init *R, StringInit *FN) { - FieldInit *&I = Context->TheFieldInitPool[std::make_pair(R, FN)]; + detail::RecordKeeperImpl &RK = R->getRecordKeeper().getImpl(); + FieldInit *&I = RK.TheFieldInitPool[std::make_pair(R, FN)]; if (!I) - I = new (Context->Allocator) FieldInit(R, FN); + I = new (RK.Allocator) FieldInit(R, FN); return I; } Init *FieldInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get()) + if (getType() == BitRecTy::get(getRecordKeeper())) return const_cast(this); return VarBitInit::get(const_cast(this), Bit); } @@ -1992,20 +2111,20 @@ void CondOpInit::Profile(FoldingSetNodeID &ID) const { ValType); } -CondOpInit * -CondOpInit::get(ArrayRef CondRange, - ArrayRef ValRange, RecTy *Ty) { +CondOpInit *CondOpInit::get(ArrayRef CondRange, + ArrayRef ValRange, RecTy *Ty) { assert(CondRange.size() == ValRange.size() && "Number of conditions and values must match!"); FoldingSetNodeID ID; ProfileCondOpInit(ID, CondRange, ValRange, Ty); + detail::RecordKeeperImpl &RK = Ty->getRecordKeeper().getImpl(); void *IP = nullptr; - if (CondOpInit *I = Context->TheCondOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (CondOpInit *I = RK.TheCondOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; - void *Mem = Context->Allocator.Allocate( + void *Mem = RK.Allocator.Allocate( totalSizeToAlloc(2 * CondRange.size()), alignof(BitsInit)); CondOpInit *I = new(Mem) CondOpInit(CondRange.size(), Ty); @@ -2013,7 +2132,7 @@ CondOpInit::get(ArrayRef CondRange, I->getTrailingObjects()); std::uninitialized_copy(ValRange.begin(), ValRange.end(), I->getTrailingObjects()+CondRange.size()); - Context->TheCondOpInitPool.InsertNode(I, IP); + RK.TheCondOpInitPool.InsertNode(I, IP); return I; } @@ -2041,16 +2160,18 @@ Init *CondOpInit::resolveReferences(Resolver &R) const { } Init *CondOpInit::Fold(Record *CurRec) const { + RecordKeeper &RK = getRecordKeeper(); for ( unsigned i = 0; i < NumConds; ++i) { Init *Cond = getCond(i); Init *Val = getVal(i); if (IntInit *CondI = dyn_cast_or_null( - Cond->convertInitializerTo(IntRecTy::get()))) { + Cond->convertInitializerTo(IntRecTy::get(RK)))) { if (CondI->getValue()) return Val->convertInitializerTo(getValType()); - } else - return const_cast(this); + } else { + return const_cast(this); + } } PrintFatalError(CurRec->getLoc(), @@ -2120,11 +2241,12 @@ DagInit *DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, FoldingSetNodeID ID; ProfileDagInit(ID, V, VN, ArgRange, NameRange); + detail::RecordKeeperImpl &RK = V->getRecordKeeper().getImpl(); 
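+  // Dag inits are now uniqued in the pool owned by the RecordKeeper of the
+  // dag operator V, replacing the lookup in the old global context.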
void *IP = nullptr; - if (DagInit *I = Context->TheDagInitPool.FindNodeOrInsertPos(ID, IP)) + if (DagInit *I = RK.TheDagInitPool.FindNodeOrInsertPos(ID, IP)) return I; - void *Mem = Context->Allocator.Allocate( + void *Mem = RK.Allocator.Allocate( totalSizeToAlloc(ArgRange.size(), NameRange.size()), alignof(BitsInit)); DagInit *I = new (Mem) DagInit(V, VN, ArgRange.size(), NameRange.size()); @@ -2132,7 +2254,7 @@ DagInit *DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, I->getTrailingObjects()); std::uninitialized_copy(NameRange.begin(), NameRange.end(), I->getTrailingObjects()); - Context->TheDagInitPool.InsertNode(I, IP); + RK.TheDagInitPool.InsertNode(I, IP); return I; } @@ -2209,7 +2331,7 @@ std::string DagInit::getAsString() const { RecordVal::RecordVal(Init *N, RecTy *T, FieldKind K) : Name(N), TyAndKind(T, K) { - setValue(UnsetInit::get()); + setValue(UnsetInit::get(N->getRecordKeeper())); assert(Value && "Cannot create unset value for current type!"); } @@ -2217,7 +2339,7 @@ RecordVal::RecordVal(Init *N, RecTy *T, FieldKind K) // a source location. RecordVal::RecordVal(Init *N, SMLoc Loc, RecTy *T, FieldKind K) : Name(N), Loc(Loc), TyAndKind(T, K) { - setValue(UnsetInit::get()); + setValue(UnsetInit::get(N->getRecordKeeper())); assert(Value && "Cannot create unset value for current type!"); } @@ -2226,7 +2348,7 @@ StringRef RecordVal::getName() const { } std::string RecordVal::getPrintType() const { - if (getType() == StringRecTy::get()) { + if (getType() == StringRecTy::get(getRecordKeeper())) { if (auto *StrInit = dyn_cast(Value)) { if (StrInit->hasCodeFormat()) return "code"; @@ -2252,7 +2374,7 @@ bool RecordVal::setValue(Init *V) { Bits.reserve(BTy->getNumBits()); for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I) Bits.push_back(Value->getBit(I)); - Value = BitsInit::get(Bits); + Value = BitsInit::get(V->getRecordKeeper(), Bits); } } } @@ -2277,7 +2399,7 @@ bool RecordVal::setValue(Init *V, SMLoc NewLoc) { Bits.reserve(BTy->getNumBits()); for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I) Bits.push_back(Value->getBit(I)); - Value = BitsInit::get(Bits); + Value = BitsInit::get(getRecordKeeper(), Bits); } } } @@ -2313,16 +2435,20 @@ void Record::checkName() { RecordRecTy *Record::getType() { SmallVector DirectSCs; getDirectSuperClasses(DirectSCs); - return RecordRecTy::get(DirectSCs); + return RecordRecTy::get(TrackedRecords, DirectSCs); } DefInit *Record::getDefInit() { - if (!CorrespondingDefInit) - CorrespondingDefInit = new (Context->Allocator) DefInit(this); + if (!CorrespondingDefInit) { + CorrespondingDefInit = + new (TrackedRecords.getImpl().Allocator) DefInit(this); + } return CorrespondingDefInit; } -unsigned Record::getNewUID() { return Context->LastRecordID++; } +unsigned Record::getNewUID(RecordKeeper &RK) { + return RK.getImpl().LastRecordID++; +} void Record::setName(Init *NewName) { Name = NewName; @@ -2472,7 +2598,7 @@ Init *Record::getValueInit(StringRef FieldName) const { StringRef Record::getValueAsString(StringRef FieldName) const { llvm::Optional S = getValueAsOptionalString(FieldName); - if (!S.hasValue()) + if (!S) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); return S.getValue(); @@ -2671,6 +2797,10 @@ void Record::checkUnusedTemplateArgs() { } } +RecordKeeper::RecordKeeper() + : Impl(std::make_unique(*this)) {} +RecordKeeper::~RecordKeeper() = default; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; } 
#endif @@ -2689,7 +2819,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) { /// GetNewAnonymousName - Generate a unique anonymous name that can be used as /// an identifier. Init *RecordKeeper::getNewAnonymousName() { - return AnonymousNameInit::get(AnonCounter++); + return AnonymousNameInit::get(*this, getImpl().AnonCounter++); } // These functions implement the phase timing facility. Starting a timer @@ -2733,11 +2863,10 @@ void RecordKeeper::stopBackendTimer() { } } -// We cache the record vectors for single classes. Many backends request -// the same vectors multiple times. -std::vector RecordKeeper::getAllDerivedDefinitions( - StringRef ClassName) const { - +std::vector +RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const { + // We cache the record vectors for single classes. Many backends request + // the same vectors multiple times. auto Pair = ClassRecordsMap.try_emplace(ClassName); if (Pair.second) Pair.first->second = getAllDerivedDefinitions(makeArrayRef(ClassName)); @@ -2768,6 +2897,12 @@ std::vector RecordKeeper::getAllDerivedDefinitions( return Defs; } +std::vector +RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) const { + return getClass(ClassName) ? getAllDerivedDefinitions(ClassName) + : std::vector(); +} + Init *MapResolver::resolve(Init *VarName) { auto It = Map.find(VarName); if (It == Map.end()) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 25079fe33edb..2a4ee4473b56 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -55,10 +55,8 @@ TGLexer::TGLexer(SourceMgr &SM, ArrayRef Macros) : SrcMgr(SM) { std::make_unique>()); // Put all macros defined in the command line into the DefinedMacros set. - std::for_each(Macros.begin(), Macros.end(), - [this](const std::string &MacroName) { - DefinedMacros.insert(MacroName); - }); + for (const std::string &MacroName : Macros) + DefinedMacros.insert(MacroName); } SMLoc TGLexer::getLoc() const { @@ -586,6 +584,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("find", tgtok::XFind) .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. + .Case("exists", tgtok::XExists) .Default(tgtok::Error); return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 857ba09782e8..459ba0f4af64 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -56,6 +56,7 @@ namespace tgtok { XListConcat, XListSplat, XStrConcat, XInterleave, XSubstr, XFind, XCast, XSubst, XForEach, XFilter, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetDagOp, XGetDagOp, + XExists, // Boolean literals. TrueVal, FalseVal, @@ -337,7 +338,7 @@ private: // // The method returns true upon reaching the first non-whitespace symbol // or EOF, CurPtr is set to point to this symbol. The method returns false, - // if an error occured during skipping of a C-style comment. + // if an error occurred during skipping of a C-style comment. bool prepSkipLineBegin(); // Skip any whitespaces or comments after a preprocessing directive. @@ -345,7 +346,7 @@ private: // or end of the file. If there is a multiline C-style comment // after the preprocessing directive, the method skips // the comment, so the final CurPtr may point to one of the next lines. 
- // The method returns false, if an error occured during skipping + // The method returns false, if an error occurred during skipping // C- or C++-style comment, or a non-whitespace symbol appears // after the preprocessing directive. // diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 90646a0c642d..acf93dc3d792 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -112,14 +112,15 @@ static void checkConcrete(Record &R) { /// Return an Init with a qualifier prefix referring /// to CurRec's name. -static Init *QualifyName(Record &CurRec, MultiClass *CurMultiClass, - Init *Name, StringRef Scoper) { - Init *NewName = - BinOpInit::getStrConcat(CurRec.getNameInit(), StringInit::get(Scoper)); +static Init *QualifyName(Record &CurRec, MultiClass *CurMultiClass, Init *Name, + StringRef Scoper) { + RecordKeeper &RK = CurRec.getRecords(); + Init *NewName = BinOpInit::getStrConcat(CurRec.getNameInit(), + StringInit::get(RK, Scoper)); NewName = BinOpInit::getStrConcat(NewName, Name); if (CurMultiClass && Scoper != "::") { Init *Prefix = BinOpInit::getStrConcat(CurMultiClass->Rec.getNameInit(), - StringInit::get("::")); + StringInit::get(RK, "::")); NewName = BinOpInit::getStrConcat(Prefix, NewName); } @@ -131,7 +132,8 @@ static Init *QualifyName(Record &CurRec, MultiClass *CurMultiClass, /// Return the qualified version of the implicit 'NAME' template argument. static Init *QualifiedNameOfImplicitName(Record &Rec, MultiClass *MC = nullptr) { - return QualifyName(Rec, MC, StringInit::get("NAME"), MC ? "::" : ":"); + return QualifyName(Rec, MC, StringInit::get(Rec.getRecords(), "NAME"), + MC ? "::" : ":"); } static Init *QualifiedNameOfImplicitName(MultiClass *MC) { @@ -187,7 +189,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, "' is not a bits type"); // Convert the incoming value to a bits type of the appropriate size... - Init *BI = V->getCastTo(BitsRecTy::get(BitList.size())); + Init *BI = V->getCastTo(BitsRecTy::get(Records, BitList.size())); if (!BI) return Error(Loc, "Initializer is not compatible with bit range"); @@ -206,7 +208,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, if (!NewBits[i]) NewBits[i] = CurVal->getBit(i); - V = BitsInit::get(NewBits); + V = BitsInit::get(Records, NewBits); } if (RV->setValue(V, Loc)) { @@ -262,8 +264,8 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { Init *Name; if (CurRec->isClass()) - Name = - VarInit::get(QualifiedNameOfImplicitName(*CurRec), StringRecTy::get()); + Name = VarInit::get(QualifiedNameOfImplicitName(*CurRec), + StringRecTy::get(Records)); else Name = CurRec->getNameInit(); R.set(QualifiedNameOfImplicitName(*SC), Name); @@ -333,9 +335,9 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, } } - TemplateArgs.emplace_back( - QualifiedNameOfImplicitName(SMC), - VarInit::get(QualifiedNameOfImplicitName(CurMC), StringRecTy::get())); + TemplateArgs.emplace_back(QualifiedNameOfImplicitName(SMC), + VarInit::get(QualifiedNameOfImplicitName(CurMC), + StringRecTy::get(Records))); // Add all of the defs in the subclass into the current multiclass. return resolve(SMC->Entries, TemplateArgs, false, &CurMC->Entries); @@ -540,7 +542,7 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { // These are all of the tokens that can begin an object body. // Some of these can also begin values but we disallow those cases // because they are unlikely to be useful. 
- return UnsetInit::get(); + return UnsetInit::get(Records); default: break; } @@ -549,7 +551,7 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { if (CurMultiClass) CurRec = &CurMultiClass->Rec; - Init *Name = ParseValue(CurRec, StringRecTy::get(), ParseNameMode); + Init *Name = ParseValue(CurRec, StringRecTy::get(Records), ParseNameMode); if (!Name) return nullptr; @@ -558,8 +560,8 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { HasReferenceResolver R(NameStr); Name->resolveReferences(R); if (!R.found()) - Name = BinOpInit::getStrConcat(VarInit::get(NameStr, StringRecTy::get()), - Name); + Name = BinOpInit::getStrConcat( + VarInit::get(NameStr, StringRecTy::get(Records)), Name); } return Name; @@ -812,12 +814,21 @@ RecTy *TGParser::ParseType() { switch (Lex.getCode()) { default: TokError("Unknown token when expecting a type"); return nullptr; case tgtok::String: - case tgtok::Code: Lex.Lex(); return StringRecTy::get(); - case tgtok::Bit: Lex.Lex(); return BitRecTy::get(); - case tgtok::Int: Lex.Lex(); return IntRecTy::get(); - case tgtok::Dag: Lex.Lex(); return DagRecTy::get(); + case tgtok::Code: + Lex.Lex(); + return StringRecTy::get(Records); + case tgtok::Bit: + Lex.Lex(); + return BitRecTy::get(Records); + case tgtok::Int: + Lex.Lex(); + return IntRecTy::get(Records); + case tgtok::Dag: + Lex.Lex(); + return DagRecTy::get(Records); case tgtok::Id: - if (Record *R = ParseClassID()) return RecordRecTy::get(R); + if (Record *R = ParseClassID()) + return RecordRecTy::get(R); TokError("unknown class name"); return nullptr; case tgtok::Bits: { @@ -835,7 +846,7 @@ RecTy *TGParser::ParseType() { return nullptr; } Lex.Lex(); // Eat '>' - return BitsRecTy::get(Val); + return BitsRecTy::get(Records, Val); } case tgtok::List: { if (Lex.Lex() != tgtok::less) { // Eat 'bits' @@ -878,7 +889,7 @@ Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMLoc NameLoc, RV->setUsed(true); return VarInit::get(TemplateArgName, RV->getType()); } else if (Name->getValue() == "NAME") { - return VarInit::get(TemplateArgName, StringRecTy::get()); + return VarInit::get(TemplateArgName, StringRecTy::get(Records)); } } @@ -947,7 +958,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XNOT: Lex.Lex(); // eat the operation Code = UnOpInit::NOT; - Type = IntRecTy::get(); + Type = IntRecTy::get(Records); break; case tgtok::XHead: Lex.Lex(); // eat the operation @@ -960,12 +971,12 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XSize: Lex.Lex(); Code = UnOpInit::SIZE; - Type = IntRecTy::get(); + Type = IntRecTy::get(Records); break; case tgtok::XEmpty: Lex.Lex(); // eat the operation Code = UnOpInit::EMPTY; - Type = IntRecTy::get(); + Type = IntRecTy::get(Records); break; case tgtok::XGetDagOp: Lex.Lex(); // eat the operation @@ -985,7 +996,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { // but keep parsing, to consume the operand } } else { - Type = RecordRecTy::get({}); + Type = RecordRecTy::get(Records, {}); } Code = UnOpInit::GETDAGOP; break; @@ -1085,6 +1096,52 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { return (IsAOpInit::get(Type, LHS))->Fold(); } + case tgtok::XExists: { + // Value ::= !exists '<' Type '>' '(' Value ')' + Lex.Lex(); // eat the operation + + RecTy *Type = ParseOperatorType(); + if (!Type) + return nullptr; + + if (!consume(tgtok::l_paren)) { + TokError("expected '(' after type of !exists"); + return nullptr; + } + + SMLoc ExprLoc = Lex.getLoc(); + 
Init *Expr = ParseValue(CurRec); + if (!Expr) + return nullptr; + + TypedInit *ExprType = dyn_cast(Expr); + if (!ExprType) { + Error(ExprLoc, "expected string type argument in !exists operator"); + return nullptr; + } + + RecordRecTy *RecType = dyn_cast(ExprType->getType()); + if (RecType) { + Error(ExprLoc, + "expected string type argument in !exists operator, please " + "use !isa instead"); + return nullptr; + } + + StringRecTy *SType = dyn_cast(ExprType->getType()); + if (!SType) { + Error(ExprLoc, "expected string type argument in !exists operator"); + return nullptr; + } + + if (!consume(tgtok::r_paren)) { + TokError("expected ')' in !exists"); + return nullptr; + } + + return (ExistsOpInit::get(Type, Expr))->Fold(CurRec); + } + case tgtok::XConcat: case tgtok::XADD: case tgtok::XSUB: @@ -1143,8 +1200,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { llvm_unreachable("Unhandled code!"); case tgtok::XConcat: case tgtok::XSetDagOp: - Type = DagRecTy::get(); - ArgType = DagRecTy::get(); + Type = DagRecTy::get(Records); + ArgType = DagRecTy::get(Records); break; case tgtok::XAND: case tgtok::XOR: @@ -1155,8 +1212,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XADD: case tgtok::XSUB: case tgtok::XMUL: - Type = IntRecTy::get(); - ArgType = IntRecTy::get(); + Type = IntRecTy::get(Records); + ArgType = IntRecTy::get(Records); break; case tgtok::XEq: case tgtok::XNe: @@ -1164,7 +1221,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XLt: case tgtok::XGe: case tgtok::XGt: - Type = BitRecTy::get(); + Type = BitRecTy::get(Records); // ArgType for the comparison operators is not yet known. break; case tgtok::XListConcat: @@ -1175,11 +1232,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { // Can't do any typechecking until we parse the first argument. break; case tgtok::XStrConcat: - Type = StringRecTy::get(); - ArgType = StringRecTy::get(); + Type = StringRecTy::get(Records); + ArgType = StringRecTy::get(Records); break; case tgtok::XInterleave: - Type = StringRecTy::get(); + Type = StringRecTy::get(Records); // The first argument type is not yet known. } @@ -1253,9 +1310,9 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { break; case BinOpInit::EQ: case BinOpInit::NE: - if (!ArgType->typeIsConvertibleTo(IntRecTy::get()) && - !ArgType->typeIsConvertibleTo(StringRecTy::get()) && - !ArgType->typeIsConvertibleTo(RecordRecTy::get({}))) { + if (!ArgType->typeIsConvertibleTo(IntRecTy::get(Records)) && + !ArgType->typeIsConvertibleTo(StringRecTy::get(Records)) && + !ArgType->typeIsConvertibleTo(RecordRecTy::get(Records, {}))) { Error(InitLoc, Twine("expected bit, bits, int, string, or record; " "got value of type '") + ArgType->getAsString() + "'"); @@ -1266,8 +1323,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case BinOpInit::LT: case BinOpInit::GE: case BinOpInit::GT: - if (!ArgType->typeIsConvertibleTo(IntRecTy::get()) && - !ArgType->typeIsConvertibleTo(StringRecTy::get())) { + if (!ArgType->typeIsConvertibleTo(IntRecTy::get(Records)) && + !ArgType->typeIsConvertibleTo(StringRecTy::get(Records))) { Error(InitLoc, Twine("expected bit, bits, int, or string; " "got value of type '") + ArgType->getAsString() + "'"); @@ -1277,8 +1334,9 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case BinOpInit::INTERLEAVE: switch (InitList.size()) { case 1: // First argument must be a list of strings or integers. 
-        if (ArgType != StringRecTy::get()->getListTy() &&
-            !ArgType->typeIsConvertibleTo(IntRecTy::get()->getListTy())) {
+        if (ArgType != StringRecTy::get(Records)->getListTy() &&
+            !ArgType->typeIsConvertibleTo(
+                IntRecTy::get(Records)->getListTy())) {
           Error(InitLoc, Twine("expected list of string, int, bits, or bit; "
                                "got value of type '") +
                              ArgType->getAsString() + "'");
@@ -1323,7 +1381,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
     case BinOpInit::SETDAGOP:
       // After parsing the first dag argument, switch to expecting
       // a record, with no restriction on its superclasses.
-      ArgType = RecordRecTy::get({});
+      ArgType = RecordRecTy::get(Records, {});
       break;
     default:
       break;
@@ -1383,7 +1441,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
   default: llvm_unreachable("Unhandled code!");
   case tgtok::XDag:
     Code = TernOpInit::DAG;
-    Type = DagRecTy::get();
+    Type = DagRecTy::get(Records);
     ItemType = nullptr;
     break;
   case tgtok::XIf:
@@ -1445,7 +1503,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
       Error(RHSLoc, "could not determine type of the name list in !dag");
       return nullptr;
     }
-    if (RHSt && StringRecTy::get()->getListTy() != RHSt->getType()) {
+    if (RHSt && StringRecTy::get(Records)->getListTy() != RHSt->getType()) {
       Error(RHSLoc, Twine("expected list, got type '") +
                         RHSt->getType()->getAsString() + "'");
       return nullptr;
    }
@@ -1465,16 +1523,16 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
   if (TypedInit *MHSt = dyn_cast<TypedInit>(MHS))
     MHSTy = MHSt->getType();
   if (BitsInit *MHSbits = dyn_cast<BitsInit>(MHS))
-    MHSTy = BitsRecTy::get(MHSbits->getNumBits());
+    MHSTy = BitsRecTy::get(Records, MHSbits->getNumBits());
   if (isa<BitInit>(MHS))
-    MHSTy = BitRecTy::get();
+    MHSTy = BitRecTy::get(Records);
 
   if (TypedInit *RHSt = dyn_cast<TypedInit>(RHS))
     RHSTy = RHSt->getType();
   if (BitsInit *RHSbits = dyn_cast<BitsInit>(RHS))
-    RHSTy = BitsRecTy::get(RHSbits->getNumBits());
+    RHSTy = BitsRecTy::get(Records, RHSbits->getNumBits());
   if (isa<BitInit>(RHS))
-    RHSTy = BitRecTy::get();
+    RHSTy = BitRecTy::get(Records);
 
   // For UnsetInit, the type is inferred from the other operand.
if (isa(MHS)) @@ -1569,7 +1627,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { return nullptr; } - Init *A = StringInit::get(Lex.getCurStrVal()); + Init *A = StringInit::get(Records, Lex.getCurStrVal()); if (CurRec && CurRec->getValue(A)) { TokError((Twine("left !foldl variable '") + A->getAsString() + "' already defined") @@ -1587,7 +1645,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { return nullptr; } - Init *B = StringInit::get(Lex.getCurStrVal()); + Init *B = StringInit::get(Records, Lex.getCurStrVal()); if (CurRec && CurRec->getValue(B)) { TokError((Twine("right !foldl variable '") + B->getAsString() + "' already defined") @@ -1679,7 +1737,7 @@ RecTy *TGParser::ParseOperatorType() { /// Substr ::= !substr(string, start-int [, length-int]) => string Init *TGParser::ParseOperationSubstr(Record *CurRec, RecTy *ItemType) { TernOpInit::TernaryOp Code = TernOpInit::SUBSTR; - RecTy *Type = StringRecTy::get(); + RecTy *Type = StringRecTy::get(Records); Lex.Lex(); // eat the operation @@ -1710,7 +1768,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, RecTy *ItemType) { if (!RHS) return nullptr; } else { - RHS = IntInit::get(std::numeric_limits::max()); + RHS = IntInit::get(Records, std::numeric_limits::max()); } if (!consume(tgtok::r_paren)) { @@ -1767,7 +1825,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, RecTy *ItemType) { /// Substr ::= !find(string, string [, start-int]) => int Init *TGParser::ParseOperationFind(Record *CurRec, RecTy *ItemType) { TernOpInit::TernaryOp Code = TernOpInit::FIND; - RecTy *Type = IntRecTy::get(); + RecTy *Type = IntRecTy::get(Records); Lex.Lex(); // eat the operation @@ -1798,7 +1856,7 @@ Init *TGParser::ParseOperationFind(Record *CurRec, RecTy *ItemType) { if (!RHS) return nullptr; } else { - RHS = IntInit::get(0); + RHS = IntInit::get(Records, 0); } if (!consume(tgtok::r_paren)) { @@ -1868,7 +1926,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, RecTy *ItemType) { return nullptr; } - Init *LHS = StringInit::get(Lex.getCurStrVal()); + Init *LHS = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the ID. if (CurRec && CurRec->getValue(LHS)) { @@ -1908,7 +1966,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, RecTy *ItemType) { if (ListRecTy *OutListTy = dyn_cast(ItemType)) { ExprEltType = (Operation == tgtok::XForEach) ? 
OutListTy->getElementType() - : IntRecTy::get(); + : IntRecTy::get(Records); } else { Error(OpLoc, "expected value of type '" + @@ -2028,9 +2086,9 @@ Init *TGParser::ParseOperationCond(Record *CurRec, RecTy *ItemType) { if (TypedInit *Vt = dyn_cast<TypedInit>(V)) VTy = Vt->getType(); if (BitsInit *Vbits = dyn_cast<BitsInit>(V)) - VTy = BitsRecTy::get(Vbits->getNumBits()); + VTy = BitsRecTy::get(Records, Vbits->getNumBits()); if (isa<BitInit>(V)) - VTy = BitRecTy::get(); + VTy = BitRecTy::get(Records); if (Type == nullptr) { if (!isa<UnsetInit>(V)) @@ -2084,23 +2142,23 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, default: TokError("Unknown or reserved token when parsing a value"); break; case tgtok::TrueVal: - R = IntInit::get(1); + R = IntInit::get(Records, 1); Lex.Lex(); break; case tgtok::FalseVal: - R = IntInit::get(0); + R = IntInit::get(Records, 0); Lex.Lex(); break; case tgtok::IntVal: - R = IntInit::get(Lex.getCurIntVal()); + R = IntInit::get(Records, Lex.getCurIntVal()); Lex.Lex(); break; case tgtok::BinaryIntVal: { auto BinaryVal = Lex.getCurBinaryIntVal(); SmallVector<Init*, 16> Bits(BinaryVal.second); for (unsigned i = 0, e = BinaryVal.second; i != e; ++i) - Bits[i] = BitInit::get(BinaryVal.first & (1LL << i)); - R = BitsInit::get(Bits); + Bits[i] = BitInit::get(Records, BinaryVal.first & (1LL << i)); + R = BitsInit::get(Records, Bits); Lex.Lex(); break; } @@ -2114,20 +2172,20 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, Lex.Lex(); } - R = StringInit::get(Val); + R = StringInit::get(Records, Val); break; } case tgtok::CodeFragment: - R = StringInit::get(Lex.getCurStrVal(), StringInit::SF_Code); + R = StringInit::get(Records, Lex.getCurStrVal(), StringInit::SF_Code); Lex.Lex(); break; case tgtok::question: - R = UnsetInit::get(); + R = UnsetInit::get(Records); Lex.Lex(); break; case tgtok::Id: { SMLoc NameLoc = Lex.getLoc(); - StringInit *Name = StringInit::get(Lex.getCurStrVal()); + StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); if (Lex.Lex() != tgtok::less) // consume the Id. return ParseIDValue(CurRec, Name, NameLoc, Mode); // Value ::= IDValue @@ -2202,7 +2260,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, // Fallthrough to try convert this to a bit. } // All other values must be convertible to just a single bit. - Init *Bit = Vals[i]->getCastTo(BitRecTy::get()); + Init *Bit = Vals[i]->getCastTo(BitRecTy::get(Records)); if (!Bit) { Error(BraceLoc, "Element #" + Twine(i) + " (" + Vals[i]->getAsString() + ") is not convertible to a bit"); @@ -2211,7 +2269,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, NewBits.push_back(Bit); } std::reverse(NewBits.begin(), NewBits.end()); - return BitsInit::get(NewBits); + return BitsInit::get(Records, NewBits); } case tgtok::l_square: { // Value ::= '[' ValueList ']' Lex.Lex(); // eat the '[' @@ -2322,7 +2380,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, TokError("expected variable name in dag operator"); return nullptr; } - OperatorName = StringInit::get(Lex.getCurStrVal()); + OperatorName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the VarName.
} @@ -2346,6 +2404,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, case tgtok::XEmpty: case tgtok::XCast: case tgtok::XGetDagOp: // Value ::= !unop '(' Value ')' + case tgtok::XExists: case tgtok::XIsA: case tgtok::XConcat: case tgtok::XDag: @@ -2451,7 +2510,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { TokError("expected field identifier after '.'"); return nullptr; } - StringInit *FieldName = StringInit::get(Lex.getCurStrVal()); + StringInit *FieldName = StringInit::get(Records, Lex.getCurStrVal()); if (!Result->getFieldType(FieldName)) { TokError("Cannot access field '" + Lex.getCurStrVal() + "' of value '" + Result->getAsString() + "'"); @@ -2494,9 +2553,9 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { // Create a !strconcat() operation, first casting each operand to // a string if necessary. - if (LHS->getType() != StringRecTy::get()) { + if (LHS->getType() != StringRecTy::get(Records)) { auto CastLHS = dyn_cast<TypedInit>( - UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get()) + UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get(Records)) ->Fold(CurRec)); if (!CastLHS) { Error(PasteLoc, @@ -2518,7 +2577,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { // because they are unlikely to be useful. // Trailing paste, concat with an empty string. - RHS = StringInit::get(""); + RHS = StringInit::get(Records, ""); break; default: @@ -2531,9 +2590,9 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { return nullptr; } - if (RHS->getType() != StringRecTy::get()) { + if (RHS->getType() != StringRecTy::get(Records)) { auto CastRHS = dyn_cast<TypedInit>( - UnOpInit::get(UnOpInit::CAST, RHS, StringRecTy::get()) + UnOpInit::get(UnOpInit::CAST, RHS, StringRecTy::get(Records)) ->Fold(CurRec)); if (!CastRHS) { Error(PasteLoc, @@ -2566,8 +2625,8 @@ void TGParser::ParseDagArgList( // DagArg ::= VARNAME if (Lex.getCode() == tgtok::VarName) { // A missing value is treated like '?'. - StringInit *VarName = StringInit::get(Lex.getCurStrVal()); - Result.emplace_back(UnsetInit::get(), VarName); + StringInit *VarName = StringInit::get(Records, Lex.getCurStrVal()); + Result.emplace_back(UnsetInit::get(Records), VarName); Lex.Lex(); } else { // DagArg ::= Value (':' VARNAME)? @@ -2585,7 +2644,7 @@ void TGParser::ParseDagArgList( Result.clear(); return; } - VarName = StringInit::get(Lex.getCurStrVal()); + VarName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the VarName. } @@ -2692,7 +2751,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, } SMLoc IdLoc = Lex.getLoc(); - Init *DeclName = StringInit::get(Str); + Init *DeclName = StringInit::get(Records, Str); Lex.Lex(); bool BadField; @@ -2745,7 +2804,7 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { return nullptr; } - Init *DeclName = StringInit::get(Lex.getCurStrVal()); + Init *DeclName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // If a value is present, parse it.
@@ -2799,10 +2858,10 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { if (!Ranges.empty()) { assert(!IterType && "Type already initialized?"); - IterType = IntRecTy::get(); + IterType = IntRecTy::get(Records); std::vector<Init *> Values; for (unsigned R : Ranges) - Values.push_back(IntInit::get(R)); + Values.push_back(IntInit::get(Records, R)); ForeachListValue = ListInit::get(Values, IterType); } @@ -2879,7 +2938,7 @@ bool TGParser::ParseBodyItem(Record *CurRec) { return TokError("expected field identifier after let"); SMLoc IdLoc = Lex.getLoc(); - StringInit *FieldName = StringInit::get(Lex.getCurStrVal()); + StringInit *FieldName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the field name. SmallVector<unsigned, 16> BitList; @@ -2898,7 +2957,7 @@ bool TGParser::ParseBodyItem(Record *CurRec) { if (!BitList.empty() && isa<BitsRecTy>(Type)) { // When assigning to a subset of a 'bits' object, expect the RHS to have // the type of that subset instead of the type of the whole object. - Type = BitsRecTy::get(BitList.size()); + Type = BitsRecTy::get(Records, BitList.size()); } Init *Val = ParseValue(CurRec, Type); @@ -3056,7 +3115,7 @@ bool TGParser::ParseDefset() { if (Lex.getCode() != tgtok::Id) return TokError("expected identifier"); - StringInit *DeclName = StringInit::get(Lex.getCurStrVal()); + StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); if (Records.getGlobal(DeclName->getValue())) return TokError("def or global variable of this name already exists"); @@ -3093,7 +3152,7 @@ bool TGParser::ParseDefvar() { if (Lex.getCode() != tgtok::Id) return TokError("expected identifier"); - StringInit *DeclName = StringInit::get(Lex.getCurStrVal()); + StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); if (CurLocalScope) { if (CurLocalScope->varAlreadyDefined(DeclName->getValue())) return TokError("local variable of this name already exists"); @@ -3201,10 +3260,10 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) { // loop, over a list of length 0 or 1 depending on the condition, and with no // iteration variable being assigned. - ListInit *EmptyList = ListInit::get({}, BitRecTy::get()); + ListInit *EmptyList = ListInit::get({}, BitRecTy::get(Records)); ListInit *SingletonList = - ListInit::get({BitInit::get(true)}, BitRecTy::get()); - RecTy *BitListTy = ListRecTy::get(BitRecTy::get()); + ListInit::get({BitInit::get(Records, true)}, BitRecTy::get(Records)); + RecTy *BitListTy = ListRecTy::get(BitRecTy::get(Records)); // The foreach containing the then-clause selects SingletonList if // the condition is true. @@ -3369,7 +3428,7 @@ void TGParser::ParseLetList(SmallVectorImpl<LetRecord> &Result) { return; } - StringInit *Name = StringInit::get(Lex.getCurStrVal()); + StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); SMLoc NameLoc = Lex.getLoc(); Lex.Lex(); // Eat the identifier.
@@ -3570,7 +3629,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { if (CurMultiClass) DefmName = BinOpInit::getStrConcat( VarInit::get(QualifiedNameOfImplicitName(CurMultiClass), - StringRecTy::get()), + StringRecTy::get(Records)), DefmName); } diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 00883c858d58..d4b928c62fd7 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -45,7 +45,7 @@ namespace llvm { void dump() const; - RecordsEntry() {} + RecordsEntry() = default; RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {} RecordsEntry(std::unique_ptr<ForeachLoop> Loop) : Loop(std::move(Loop)) {} diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 4d1464901777..a6065d4ed9ec 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -16,6 +16,8 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Support/DataTypes.h" #include "llvm/Target/TargetMachine.h" @@ -71,6 +73,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64BranchTargetsPass(PassRegistry&); +void initializeAArch64CFIFixupPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64CompressJumpTablesPass(PassRegistry&); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 9a04b28a8b8f..f092c039b58e 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -64,6 +64,10 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">; +def FeatureLDAPR : SubtargetFeature<"ldapr", "HasLDAPR", "true", + "Use LDAPR to lower atomic loads; experimental until we " + "have more testing/a formal correctness proof">; + def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", "Enable out of line atomics to support LSE instructions">; @@ -154,6 +158,10 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", "Has zero-cycle zeroing instructions for generic registers">; +// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0", +// as movi is more efficient across all cores. Newer cores can eliminate +// fmovs early and there is no difference with movi, but this is not true for +// all implementations.
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", "Has no zero-cycle zeroing instructions for FP registers">; @@ -168,7 +176,7 @@ def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround", "The zero-cycle floating-point zeroing instruction has a bug">; def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", + "RequiresStrictAlign", "true", "Disallow all unaligned memory " "access">; @@ -190,11 +198,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature< "Prefer likely predicted branches over selects">; def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", - "CustomAsCheapAsMove", "true", + "HasCustomCheapAsMoveHandling", "true", "Use custom handling of cheap instructions">; def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move", - "ExynosAsCheapAsMove", "true", + "HasExynosCheapAsMoveHandling", "true", "Use Exynos specific handling of cheap instructions", [FeatureCustomCheapAsMoveHandling]>; @@ -202,12 +210,16 @@ def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", - "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; + "IsMisaligned128StoreSlow", "true", "Misaligned 128 bit stores are slow">; def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", - "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">; + "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">; + +def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address", + "IsStoreAddressAscend", "false", + "Schedule vector stores by ascending address">; -def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", +def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow", "true", "STR of Q register with register offset is slow">; def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< @@ -246,6 +258,10 @@ def FeatureFuseCryptoEOR : SubtargetFeature< "fuse-crypto-eor", "HasFuseCryptoEOR", "true", "CPU fuses AES/PMULL and EOR operations">; +def FeatureFuseAdrpAdd : SubtargetFeature< + "fuse-adrp-add", "HasFuseAdrpAdd", "true", + "CPU fuses adrp+add operations">; + def FeatureFuseLiterals : SubtargetFeature< "fuse-literals", "HasFuseLiterals", "true", "CPU fuses literal generation operations">; @@ -438,13 +454,8 @@ def FeatureEnhancedCounterVirtualization : def FeatureRME : SubtargetFeature<"rme", "HasRME", "true", "Enable Realm Management Extension">; -// A subset of SVE(2) instructions are legal in Streaming SVE execution mode -// defined by SME. 
-def FeatureStreamingSVE : SubtargetFeature<"streaming-sve", - "HasStreamingSVE", "true", - "Enable subset of SVE(2) instructions for Streaming SVE execution mode">; def FeatureSME : SubtargetFeature<"sme", "HasSME", "true", - "Enable Scalable Matrix Extension (SME)", [FeatureStreamingSVE, FeatureBF16]>; + "Enable Scalable Matrix Extension (SME)", [FeatureBF16, FeatureUseScalarIncVL]>; def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true", "Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>; @@ -464,6 +475,11 @@ def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; +def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", + "NoBTIAtReturnTwice", "true", + "Don't place a BTI instruction " + "after a return-twice">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -534,7 +550,18 @@ def HasV8_0rOps : SubtargetFeature< FeaturePAuth, FeatureRCPC, //v8.4 FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI, - FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>; + FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, + // Not mandatory in v8.0-R, but included here on the grounds that it + // only enables names of system registers + FeatureSpecRestrict + ]>; + +// Only intended to be used by disassemblers. +def FeatureAll + : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions", []>; + +class AssemblerPredicateWithAll + : AssemblerPredicate<(any_of FeatureAll, cond), name>; //===----------------------------------------------------------------------===// // Register File Description @@ -552,6 +579,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -596,7 +624,7 @@ class AArch64Unsupported { list F; } def SVEUnsupported : AArch64Unsupported { let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, - HasSVE2BitPerm, HasSVEorStreamingSVE, HasSVE2orStreamingSVE]; + HasSVE2BitPerm, HasSVEorSME, HasSVE2orSME]; } def PAUnsupported : AArch64Unsupported { @@ -621,6 +649,7 @@ include "AArch64SchedThunderX2T99.td" include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -649,6 +678,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeatureFuseAES, FeatureBalanceFPOps, FeatureCustomCheapAsMoveHandling, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; @@ -657,11 +687,13 @@ def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", "Cortex-A65 ARM processors", [ FeatureFuseAES, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseLiterals]>; def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureFuseLiterals]>; def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", @@ -802,6 +834,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseArithmeticLogic, FeatureFuseCCSelect, FeatureFuseCryptoEOR, + FeatureFuseAdrpAdd, FeatureFuseLiterals, 
FeatureZCRegMove, FeatureZCZeroing]>; @@ -813,13 +846,15 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureFuseAddress, FeatureFuseAES, FeatureFuseCCSelect, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureLSLFast, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; -def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", - "Samsung Exynos-M3 processors", +// Re-uses some scheduling and tunings from the ExynosM3 proc family. +def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", + "Samsung Exynos-M4 processors", [FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureExynosCheapAsMoveHandling, @@ -828,6 +863,7 @@ def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureLSLFast, FeaturePostRAScheduler, @@ -934,6 +970,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -947,13 +993,14 @@ def ProcessorFeatures { FeatureFP16FML]; list A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC, FeatureSSBS, FeatureRAS]; + FeatureRCPC, FeatureSSBS, FeatureRAS, + FeaturePerfMon]; list A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC, FeatureSSBS]; + FeatureRCPC, FeatureSSBS, FeaturePerfMon]; list A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC]; + FeatureRCPC, FeaturePerfMon, FeatureSSBS]; list A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon, FeatureSPE, @@ -968,14 +1015,15 @@ def ProcessorFeatures { FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8]; list R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSSBS, FeaturePredRes, - FeatureSB, FeatureSpecRestrict]; + FeatureSB]; list X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, - FeatureSPE, FeatureFullFP16, FeatureDotProd]; + FeatureSPE, FeatureFullFP16, FeatureDotProd, + FeatureSSBS]; list X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd, - FeaturePAuth]; + FeaturePAuth, FeatureSSBS]; list X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, @@ -1012,13 +1060,15 @@ def ProcessorFeatures { FeatureRDM]; list NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, - FeatureRCPC, FeatureSSBS]; + FeatureRCPC, FeatureSSBS, FeaturePerfMon]; list NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, - FeatureRCPC, FeatureSPE, FeatureSSBS]; + FeatureRCPC, FeatureSPE, FeatureSSBS, + FeaturePerfMon]; list NeoverseN2 = [HasV8_5aOps, FeatureBF16, FeatureETE, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, - 
FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto]; + FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto, + FeaturePerfMon]; list Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, @@ -1041,17 +1091,20 @@ def ProcessorFeatures { list TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not // affect code generated by the compiler and can be used only by explicitly // mentioning the new system register names in assembly. - list Generic = [FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureETE]; + list Generic = [FeatureFPARMv8, FeatureNEON, FeatureETE]; } - +// FeatureFuseAdrpAdd is enabled under Generic to allow linker merging +// optimizations. def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic, - [FeatureFuseAES, FeaturePostRAScheduler]>; + [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>; def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53, [TuneA35]>; def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53, @@ -1178,6 +1231,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index 4cdf5f144437..37a65b64a885 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -223,6 +223,7 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { if (isFirstInstructionInSequence(PrevInstr) && isSecondInstructionInSequence(CurrInstr)) { LLVM_DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n"); + (void) Idx; Sequences.push_back(CurrInstr); } } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index b54a0eaba7d1..ef4860979dd3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -132,7 +132,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { AArch64FI = MF.getInfo(); - STI = static_cast(&MF.getSubtarget()); + STI = &MF.getSubtarget(); SetupMachineFunction(MF); @@ -143,10 +143,10 @@ public: int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; - OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass(Scl); - OutStreamer->EmitCOFFSymbolType(Type); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(CurrentFnSym); + OutStreamer->emitCOFFSymbolStorageClass(Scl); + OutStreamer->emitCOFFSymbolType(Type); + OutStreamer->endCOFFSymbolDef(); } // Emit the rest of the function body. @@ -204,10 +204,10 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { // Emit an absolute @feat.00 symbol. 
This appears to be some kind of // compiler features bitfield read by link.exe. MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); - OutStreamer->BeginCOFFSymbolDef(S); - OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(S); + OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->endCOFFSymbolDef(); int64_t Feat00Flags = 0; if (M.getModuleFlag("cfguard")) { @@ -251,7 +251,7 @@ void AArch64AsmPrinter::emitFunctionHeaderComment() { const AArch64FunctionInfo *FI = MF->getInfo(); Optional OutlinerString = FI->getOutliningStyle(); if (OutlinerString != None) - OutStreamer->GetCommentOS() << ' ' << OutlinerString; + OutStreamer->getCommentOS() << ' ' << OutlinerString; } void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) @@ -378,10 +378,10 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) { bool CompileKernel = (AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1; - OutStreamer->SwitchSection(OutContext.getELFSection( + OutStreamer->switchSection(OutContext.getELFSection( ".text.hot", ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, - Sym->getName(), /*IsComdat=*/true)); + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(), + /*IsComdat=*/true)); OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); @@ -827,7 +827,7 @@ void AArch64AsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); - OutStreamer->SwitchSection(ReadOnlySec); + OutStreamer->switchSection(ReadOnlySec); auto AFI = MF->getInfo(); for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { @@ -865,7 +865,7 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall || - STI->getRegisterInfo()->hasSVEArgsOrReturn(MF)) { + MF->getInfo()->isSVECC()) { auto *TS = static_cast(OutStreamer->getTargetStreamer()); TS->emitDirectiveVariantPCS(CurrentFnSym); @@ -1129,7 +1129,8 @@ void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { + if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() && + STI->hasNEON()) { // Convert H/S register to corresponding D register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) DestReg = AArch64::D0 + (DestReg - AArch64::H0); @@ -1262,7 +1263,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { break; case AArch64::DBG_VALUE: - case AArch64::DBG_VALUE_LIST: { + case AArch64::DBG_VALUE_LIST: if (isVerbose() && OutStreamer->hasRawTextSupport()) { SmallString<128> TmpStr; raw_svector_ostream OS(TmpStr); @@ -1282,8 +1283,18 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->emitCFIBKeyFrame(); return; - } - } + } + + case AArch64::EMITMTETAGGED: { + ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); + if (ExceptionHandlingType != 
ExceptionHandling::DwarfCFI && + ExceptionHandlingType != ExceptionHandling::ARM) + return; + + if (getFunctionCFISectionType(*MF) != CFISection::None) + OutStreamer->emitCFIMTETaggedFrame(); + return; + } // Tail calls use pseudo instructions so they have the proper code-gen // attributes (isCall, isReturn, etc.). We lower them to the real diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index f26151536a58..c0da242a26de 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -82,9 +82,9 @@ def CC_AArch64_AAPCS : CallingConv<[ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCPassIndirect>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCAssignToReg<[P0, P1, P2, P3]>>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCPassIndirect>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -149,7 +149,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCAssignToReg<[P0, P1, P2, P3]>> ]>; diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index ac243347b24d..d12689970dc5 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -528,10 +528,8 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { // count as MultiUser or block optimization. This is especially important on // arm64_32, where any memory operation is likely to be an explicit use of // xN and an implicit use of wN (the base address register). - if (!UsesSeen.count(Idx)) { + if (UsesSeen.insert(Idx).second) handleUse(MI, MO, LOHInfos[Idx]); - UsesSeen.insert(Idx); - } } } @@ -559,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { // Walk the basic block backwards and update the per register state machine // in the process. for (const MachineInstr &MI : - instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { + instructionsWithoutDebug(MBB.instr_rbegin(), MBB.instr_rend())) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AArch64::ADDXri: diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1994e0eb7fb9..18c111255e53 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -217,7 +217,7 @@ def AArch64PostLegalizerLoweringHelper // Post-legalization combines which are primarily optimizations. 
def AArch64PostLegalizerCombinerHelper : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", - [copy_prop, erase_undef_store, combines_for_extload, + [copy_prop, combines_for_extload, sext_trunc_sextload, mutate_anyext_to_zext, hoist_logic_op_with_same_opcode_hands, redundant_and, xor_of_and_with_same_reg, @@ -228,6 +228,6 @@ def AArch64PostLegalizerCombinerHelper select_combines, fold_merge_to_zext, constant_fold, identity_combines, ptr_add_immed_chain, overlapping_and, - split_store_zero_128]> { + split_store_zero_128, undef_combines]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 82e8df3b73f9..343f888b7552 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -247,8 +247,8 @@ void SSACCmpConv::updateTailPHIs() { for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { // PHI operands are (Reg, MBB) at (oi-2, oi-1). if (I.getOperand(oi - 1).getMBB() == CmpBB) { - I.RemoveOperand(oi - 1); - I.RemoveOperand(oi - 2); + I.removeOperand(oi - 1); + I.removeOperand(oi - 2); } } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index b0f739cc26e6..910f8cdede75 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -86,6 +86,7 @@ private: unsigned N); bool expandCALL_RVMARKER(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -759,6 +760,37 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( return true; } +bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + // Expand CALL_BTI pseudo to: + // - a branch to the call target + // - a BTI instruction + // Mark the sequence as a bundle, to avoid passes moving other code in + // between. + + MachineInstr &MI = *MBBI; + MachineOperand &CallTarget = MI.getOperand(0); + assert((CallTarget.isGlobal() || CallTarget.isReg()) && + "invalid operand for regular call"); + unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR; + MachineInstr *Call = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); + Call->addOperand(CallTarget); + + MachineInstr *BTI = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT)) + // BTI J so that setjmp can BR to this.
+ .addImm(36) + .getInstr(); + + if (MI.shouldUpdateCallSiteInfo()) + MBB.getParent()->moveCallSiteInfo(&MI, Call); + + MI.eraseFromParent(); + finalizeBundle(MBB, Call->getIterator(), std::next(BTI->getIterator())); + return true; +} + bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { Register CtxReg = MBBI->getOperand(0).getReg(); @@ -1238,6 +1270,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); case AArch64::BLR_RVMARKER: return expandCALL_RVMARKER(MBB, MBBI); + case AArch64::BLR_BTI: + return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); } diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 793663ef97d7..6de374125466 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -813,7 +813,7 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { } bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { - auto &ST = static_cast(Fn.getSubtarget()); + auto &ST = Fn.getSubtarget(); if (ST.getProcFamily() != AArch64Subtarget::Falkor) return false; diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index c67fa62c7a92..49fffa01a974 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -14,6 +14,7 @@ #include "AArch64.h" #include "AArch64CallingConvention.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -282,8 +283,7 @@ public: explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { - Subtarget = - &static_cast(FuncInfo.MF->getSubtarget()); + Subtarget = &FuncInfo.MF->getSubtarget(); Context = &FuncInfo.Fn->getContext(); } @@ -3127,6 +3127,13 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (!Callee && !Symbol) return false; + // Allow SelectionDAG isel to handle calls to functions like setjmp that need + // a bti instruction following the call. + if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget->noBTIAtReturnTwice() && + MF->getInfo()->branchTargetEnforcement()) + return false; + // Allow SelectionDAG isel to handle tail calls. if (IsTailCall) return false; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a4d20735e2b1..78babdf9f1f0 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -117,6 +117,72 @@ // // FIXME: also explain the redzone concept. // +// An example of the prologue: +// +// .globl __foo +// .align 2 +// __foo: +// Ltmp0: +// .cfi_startproc +// .cfi_personality 155, ___gxx_personality_v0 +// Leh_func_begin: +// .cfi_lsda 16, Lexception33 +// +// stp xa,bx, [sp, -#offset]! +// ... +// stp x28, x27, [sp, #offset-32] +// stp fp, lr, [sp, #offset-16] +// add fp, sp, #offset - 16 +// sub sp, sp, #1360 +// +// The Stack: +// +-------------------------------------------+ +// 10000 | ........ | ........ | ........ | ........ | +// 10004 | ........ | ........ | ........ | ........ 
| +// +-------------------------------------------+ +// 10008 | ........ | ........ | ........ | ........ | +// 1000c | ........ | ........ | ........ | ........ | +// +===========================================+ +// 10010 | X28 Register | +// 10014 | X28 Register | +// +-------------------------------------------+ +// 10018 | X27 Register | +// 1001c | X27 Register | +// +===========================================+ +// 10020 | Frame Pointer | +// 10024 | Frame Pointer | +// +-------------------------------------------+ +// 10028 | Link Register | +// 1002c | Link Register | +// +===========================================+ +// 10030 | ........ | ........ | ........ | ........ | +// 10034 | ........ | ........ | ........ | ........ | +// +-------------------------------------------+ +// 10038 | ........ | ........ | ........ | ........ | +// 1003c | ........ | ........ | ........ | ........ | +// +-------------------------------------------+ +// +// [sp] = 10030 :: >>initial value<< +// sp = 10020 :: stp fp, lr, [sp, #-16]! +// fp = sp == 10020 :: mov fp, sp +// [sp] == 10020 :: stp x28, x27, [sp, #-16]! +// sp == 10010 :: >>final value<< +// +// The frame pointer (w29) points to address 10020. If we use an offset of +// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 +// for w27, and -32 for w28: +// +// Ltmp1: +// .cfi_def_cfa w29, 16 +// Ltmp2: +// .cfi_offset w30, -8 +// Ltmp3: +// .cfi_offset w29, -16 +// Ltmp4: +// .cfi_offset w27, -24 +// Ltmp5: +// .cfi_offset w28, -32 +// //===----------------------------------------------------------------------===// #include "AArch64FrameLowering.h" @@ -126,6 +192,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -154,7 +221,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -187,7 +253,7 @@ static cl::opt OrderFrameObjects("aarch64-order-frame-objects", cl::init(true), cl::Hidden); cl::opt EnableHomogeneousPrologEpilog( - "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden, + "homogeneous-prolog-epilog", cl::Hidden, cl::desc("Emit homogeneous prologue and epilogue for the size " "optimization (default = off)")); @@ -233,6 +299,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(MachineFunction &MF); static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); +static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. 
@@ -440,137 +507,309 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -// Convenience function to create a DWARF expression for -// Expr + NumBytes + NumVGScaledBytes * AArch64::VG -static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, - int NumBytes, int NumVGScaledBytes, unsigned VG, - llvm::raw_string_ostream &Comment) { - uint8_t buffer[16]; +void AArch64FrameLowering::emitCalleeSavedGPRLocations( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); - if (NumBytes) { - Expr.push_back(dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); - Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); - } + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; - if (NumVGScaledBytes) { - Expr.push_back((uint8_t)dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); - Expr.push_back((uint8_t)dwarf::DW_OP_bregx); - Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); - Expr.push_back(0); + for (const auto &Info : CSI) { + if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) + continue; - Expr.push_back((uint8_t)dwarf::DW_OP_mul); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); + assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); - Comment << (NumVGScaledBytes < 0 ? " - " : " + ") - << std::abs(NumVGScaledBytes) << " * VG"; + int64_t Offset = + MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } -// Creates an MCCFIInstruction: -// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } -MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( - const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const { - int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes, - NumVGScaledBytes); +void AArch64FrameLowering::emitCalleeSavedSVELocations( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + + for (const auto &Info : CSI) { + if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + continue; + + // Not all unwinders may know about SVE registers, so assume the lowest + // common denominator.
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + unsigned Reg = Info.getReg(); + if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) + continue; + + StackOffset Offset = + StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI)); - std::string CommentBuffer = "sp"; - llvm::raw_string_ostream Comment(CommentBuffer); + unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } +} - // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG) - SmallString<64> Expr; - Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31)); - Expr.push_back(0); - appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedGPRLocations(MBB, MBBI); + emitCalleeSavedSVELocations(MBB, MBBI); +} - // Wrap this into DW_CFA_def_cfa. - SmallString<64> DefCfaExpr; - DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); - uint8_t buffer[16]; - DefCfaExpr.append(buffer, - buffer + encodeULEB128(Expr.size(), buffer)); - DefCfaExpr.append(Expr.str()); - return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), - Comment.str()); +static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + unsigned DwarfReg) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg)); + BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex); } -MCCFIInstruction AArch64FrameLowering::createCfaOffset( - const TargetRegisterInfo &TRI, unsigned Reg, - const StackOffset &OffsetFromDefCFA) const { - int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( - OffsetFromDefCFA, NumBytes, NumVGScaledBytes); +void AArch64FrameLowering::resetCFIToInitialState( + MachineBasicBlock &MBB) const { - unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + MachineFunction &MF = *MBB.getParent(); + const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const auto &TRI = + static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo()); + const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); - // Non-scalable offsets can use DW_CFA_offset directly. - if (!NumVGScaledBytes) - return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION); + DebugLoc DL; - std::string CommentBuffer; - llvm::raw_string_ostream Comment(CommentBuffer); - Comment << printReg(Reg, &TRI) << " @ cfa"; + // Reset the CFA to `SP + 0`. + MachineBasicBlock::iterator InsertPt = MBB.begin(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0)); + BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex); - // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) - SmallString<64> OffsetExpr; - appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); + // Flip the RA sign state.
+ if (MFI.shouldSignReturnAddress()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex); + } - // Wrap this into DW_CFA_expression - SmallString<64> CfaExpr; - CfaExpr.push_back(dwarf::DW_CFA_expression); - uint8_t buffer[16]; - CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); - CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); - CfaExpr.append(OffsetExpr.str()); + // Shadow call stack uses X18, reset it. + if (needsShadowCallStackPrologueEpilogue(MF)) + insertCFISameValue(CFIDesc, MF, MBB, InsertPt, + TRI.getDwarfRegNum(AArch64::X18, true)); - return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); + // Emit .cfi_same_value for callee-saved registers. + const std::vector<CalleeSavedInfo> &CSI = + MF.getFrameInfo().getCalleeSavedInfo(); + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + if (!TRI.regNeedsCFI(Reg, Reg)) + continue; + insertCFISameValue(CFIDesc, MF, MBB, InsertPt, + TRI.getDwarfRegNum(Reg, true)); + } } -void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { +static void emitCalleeSavedRestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool SVE) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetInstrInfo *TII = STI.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + for (const auto &Info : CSI) { - Register Reg = Info.getReg(); + if (SVE != + (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + continue; - // Not all unwinders may know about SVE registers, so assume the lowest - // common demoninator.
- unsigned NewReg; - if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg)) - Reg = NewReg; - else + unsigned Reg = Info.getReg(); + if (SVE && + !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) continue; - StackOffset Offset; - if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) { - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); - } else { - Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea()); - } - unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( + nullptr, TRI.getDwarfRegNum(Info.getReg(), true))); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(MachineInstr::FrameDestroy); + } +} + +void AArch64FrameLowering::emitCalleeSavedGPRRestores( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBB, MBBI, false); +} + +void AArch64FrameLowering::emitCalleeSavedSVERestores( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBB, MBBI, true); +} + +static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { + switch (Reg.id()) { + default: + // The called routine is expected to preserve r19-r28; + // r29 and r30 are used as the frame pointer and link register, respectively. + return 0; + + // GPRs +#define CASE(n) \ + case AArch64::W##n: \ + case AArch64::X##n: \ + return AArch64::X##n + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + CASE(17); + CASE(18); +#undef CASE + + // FPRs +#define CASE(n) \ + case AArch64::B##n: \ + case AArch64::H##n: \ + case AArch64::S##n: \ + case AArch64::D##n: \ + case AArch64::Q##n: \ + return HasSVE ? AArch64::Z##n : AArch64::Q##n + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + CASE(17); + CASE(18); + CASE(19); + CASE(20); + CASE(21); + CASE(22); + CASE(23); + CASE(24); + CASE(25); + CASE(26); + CASE(27); + CASE(28); + CASE(29); + CASE(30); + CASE(31); +#undef CASE + } +} + +void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const { + // Insertion point. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + + // Fake a debug loc. + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + const MachineFunction &MF = *MBB.getParent(); + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); + + BitVector GPRsToZero(TRI.getNumRegs()); + BitVector FPRsToZero(TRI.getNumRegs()); + bool HasSVE = STI.hasSVE(); + for (MCRegister Reg : RegsToZero.set_bits()) { + if (TRI.isGeneralPurposeRegister(MF, Reg)) { + // For GPRs, we only care to clear out the 64-bit register.
+ if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) + GPRsToZero.set(XReg); + } else if (AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg)) { + // For FPRs, zero the full Q register (or the Z register when SVE is available). + if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) + FPRsToZero.set(XReg); + } + } + + const AArch64InstrInfo &TII = *STI.getInstrInfo(); + + // Zero out GPRs. + for (MCRegister Reg : GPRsToZero.set_bits()) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0); + + // Zero out FP/vector registers. + for (MCRegister Reg : FPRsToZero.set_bits()) + if (HasSVE) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::DUP_ZI_D), Reg) + .addImm(0) + .addImm(0); + else + BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVIv2d_ns), Reg).addImm(0); + + if (HasSVE) { + for (MCRegister PReg : + {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4, + AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9, + AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14, + AArch64::P15}) { + if (RegsToZero[PReg]) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg); + } } } @@ -881,16 +1120,9 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, - bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) { - // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions and associated CFI instruction. - while (MBBI->getOpcode() == AArch64::STRXpost || - MBBI->getOpcode() == AArch64::LDRXpre || - MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) { - if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION) - assert(MBBI->getOperand(0).getReg() != AArch64::SP); - ++MBBI; - } + bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, + MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, + int CFAOffset = 0) { unsigned NewOpc; switch (MBBI->getOpcode()) { default: @@ -949,12 +1181,14 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // If the first store isn't right where we want SP then we can't fold the // update in so create a normal arithmetic instruction instead. + MachineFunction &MF = *MBB.getParent(); if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(CSStackSizeInc), TII, - InProlog ? MachineInstr::FrameSetup - : MachineInstr::FrameDestroy); + StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, + false, false, nullptr, EmitCFI, + StackOffset::getFixed(CFAOffset)); + return std::prev(MBBI); } @@ -981,8 +1215,15 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // Generate a new SEH code that corresponds to the new instruction. if (NeedsWinCFI) { *HasWinCFI = true; - InsertSEH(*MIB, *TII, - InProlog ?
MachineInstr::FrameSetup : MachineInstr::FrameDestroy); + InsertSEH(*MIB, *TII, FrameFlag); + } + + if (EmitCFI) { + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(FrameFlag); } return std::prev(MBB.erase(MBBI)); @@ -998,16 +1239,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, return; unsigned Opc = MI.getOpcode(); - - // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions and associated CFI instruction. - if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre || - Opc == AArch64::CFI_INSTRUCTION) { - if (Opc != AArch64::CFI_INSTRUCTION) - assert(MI.getOperand(0).getReg() != AArch64::SP); - return; - } - unsigned Scale; switch (Opc) { case AArch64::STPXi: @@ -1049,38 +1280,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, } } -static void adaptForLdStOpt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator FirstSPPopI, - MachineBasicBlock::iterator LastPopI) { - // Sometimes (when we restore in the same order as we save), we can end up - // with code like this: - // - // ldp x26, x25, [sp] - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // add sp, sp, #64 - // - // In this case, it is always better to put the first ldp at the end, so - // that the load-store optimizer can run and merge the ldp and the add into - // a post-index ldp. - // If we managed to grab the first pop instruction, move it to the end. - if (ReverseCSRRestoreSeq) - MBB.splice(FirstSPPopI, &MBB, LastPopI); - // We should end up with something like this now: - // - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // ldp x26, x25, [sp] - // add sp, sp, #64 - // - // and the load-store optimizer can merge the last two instructions into: - // - // ldp x26, x25, [sp], #64 - // -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget().isTargetWindows(); } @@ -1099,6 +1298,80 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { } } +static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) { + if (!(llvm::any_of( + MF.getFrameInfo().getCalleeSavedInfo(), + [](const auto &Info) { return Info.getReg() == AArch64::LR; }) && + MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack))) + return false; + + if (!MF.getSubtarget().isXRegisterReserved(18)) + report_fatal_error("Must reserve x18 to use shadow call stack"); + + return true; +} + +static void emitShadowCallStackPrologue(const TargetInstrInfo &TII, + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool NeedsWinCFI, + bool NeedsUnwindInfo) { + // Shadow call stack prolog: str x30, [x18], #8 + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost)) + .addReg(AArch64::X18, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::X18) + .addImm(8) + .setMIFlag(MachineInstr::FrameSetup); + + // This instruction also makes x18 live-in to the entry block. + MBB.addLiveIn(AArch64::X18); + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsUnwindInfo) { + // Emit a CFI instruction that causes 8 to be subtracted from the value of + // x18 when unwinding past this frame. 
+    static const char CFIInst[] = {
+        dwarf::DW_CFA_val_expression,
+        18, // register
+        2,  // length
+        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+        static_cast<char>(-8) & 0x7f, // addend (sleb128)
+    };
+    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
+        nullptr, StringRef(CFIInst, sizeof(CFIInst))));
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
+                                        MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const DebugLoc &DL) {
+  // Shadow call stack epilog: ldr x30, [x18, #-8]!
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
+      .addReg(AArch64::X18, RegState::Define)
+      .addReg(AArch64::LR, RegState::Define)
+      .addReg(AArch64::X18)
+      .addImm(-8)
+      .setMIFlag(MachineInstr::FrameDestroy);
+
+  if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameDestroy);
+  }
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -1109,8 +1382,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  bool needsFrameMoves =
-      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool EmitCFI = AFI->needsDwarfUnwindInfo();
   bool HasFP = hasFP(MF);
   bool NeedsWinCFI = needsWinCFI(MF);
   bool HasWinCFI = false;
@@ -1128,8 +1400,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   DebugLoc DL;
 
   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
-  if (MFnI.shouldSignReturnAddress()) {
+  if (needsShadowCallStackPrologueEpilogue(MF))
+    emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
+                                MFnI.needsDwarfUnwindInfo());
 
+  if (MFnI.shouldSignReturnAddress()) {
     unsigned PACI;
     if (MFnI.shouldSignWithBKey()) {
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
@@ -1145,12 +1420,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         .addReg(AArch64::LR)
         .addReg(AArch64::SP, RegState::InternalRead);
     MI.setMIFlag(MachineInstr::FrameSetup);
-
-    unsigned CFIIndex =
-        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
-    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex)
-        .setMIFlags(MachineInstr::FrameSetup);
+    if (EmitCFI) {
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+  if (EmitCFI && MFnI.isMTETagged()) {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
+        .setMIFlag(MachineInstr::FrameSetup);
   }
 
   // We signal the presence of a Swift extended frame to external tools by
@@ -1227,7 +1507,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                       StackOffset::getFixed(-NumBytes), TII,
                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
-    if (needsFrameMoves) {
+    if (EmitCFI) {
       // Label used to tie together the PROLOG_LABEL and the MachineMoves.
       MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
       // Encode the stack size of the leaf function.
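// [Editor's aside, not part of the patch] The CFIInst escape in
// emitShadowCallStackPrologue above hand-assembles the DWARF expression
// "x18 = x18 - 8". A minimal stand-alone sketch (illustrative names only) of
// why `static_cast<char>(-8) & 0x7f` is the complete SLEB128 encoding of the
// addend: values in [-64, 63] fit in a single 7-bit group with the
// continuation bit (bit 7) clear.
#include <cassert>
#include <cstdint>

constexpr uint8_t sleb128SingleByte(int8_t V) {
  // Low 7 bits of the two's-complement value; bit 6 doubles as the sign bit.
  return static_cast<uint8_t>(V) & 0x7f;
}

int main() {
  assert(sleb128SingleByte(-8) == 0x78); // 0b0111'1000: sign bit set, no cont.
  return 0;
}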
@@ -1261,14 +1541,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(-NumBytes), TII,
-                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
+                    EmitCFI);
     NumBytes = 0;
   } else if (HomPrologEpilog) {
     // Stack has been already adjusted.
     NumBytes -= PrologueSaveSize;
   } else if (PrologueSaveSize != 0) {
     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
-        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
+        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
+        EmitCFI);
     NumBytes -= PrologueSaveSize;
   }
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -1322,8 +1604,27 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                       StackOffset::getFixed(FPOffset), TII,
                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
     }
+    if (EmitCFI) {
+      // Define the current CFA rule to use the provided FP.
+      const int OffsetToFirstCalleeSaveFromFP =
+          AFI->getCalleeSaveBaseToFrameRecordOffset() -
+          AFI->getCalleeSavedStackSize();
+      Register FramePtr = RegInfo->getFrameRegister(MF);
+      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+          nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
   }
 
+  // Now emit the moves for whatever callee saved regs we have (including FP,
+  // LR if those are saved). Frame instructions for SVE registers are emitted
+  // later, after the instructions which actually save the SVE regs.
+  if (EmitCFI)
+    emitCalleeSavedGPRLocations(MBB, MBBI);
+
   if (windowsRequiresStackProbe(MF, NumBytes)) {
     uint64_t NumWords = NumBytes >> 4;
     if (NeedsWinCFI) {
@@ -1436,14 +1737,21 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   // Allocate space for the callee saves (if any).
-  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
-                  -AllocateBefore, TII,
-                  MachineInstr::FrameSetup);
+  emitFrameOffset(
+      MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
+      MachineInstr::FrameSetup, false, false, nullptr,
+      EmitCFI && !HasFP && AllocateBefore,
+      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
+
+  if (EmitCFI)
+    emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
 
   // Finally allocate remaining SVE stack space.
   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-                  -AllocateAfter, TII,
-                  MachineInstr::FrameSetup);
+                  -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
+                  nullptr, EmitCFI && !HasFP && AllocateAfter,
+                  AllocateBefore + StackOffset::getFixed(
+                                       (int64_t)MFI.getStackSize() - NumBytes));
 
   // Allocate space for the rest of the frame.
   if (NumBytes) {
@@ -1458,14 +1766,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     }
 
     // If we're a leaf function, try using the red zone.
-    if (!canUseRedZone(MF))
+    if (!canUseRedZone(MF)) {
      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
      // the correct value here, as NumBytes also includes padding bytes,
      // which shouldn't be counted here.
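// [Editor's aside, not part of the patch] A worked example of the
// "FixedObject - OffsetToFirstCalleeSaveFromFP" arithmetic in the EmitCFI
// block above, under assumed values: a plain frame with no fixed objects
// (FixedObject == 0) whose only callee saves are fp and lr
// (CalleeSavedStackSize == 16), with the frame record at the callee-save base
// (CalleeSaveBaseToFrameRecordOffset == 0):
#include <cassert>

int main() {
  const int CalleeSaveBaseToFrameRecordOffset = 0; // assumed layout
  const int CalleeSavedStackSize = 16;             // fp + lr
  const int FixedObject = 0;                       // no Win64 varargs area
  const int OffsetToFirstCalleeSaveFromFP =
      CalleeSaveBaseToFrameRecordOffset - CalleeSavedStackSize; // -16
  // Yields the classic ".cfi_def_cfa w29, 16" for an fp/lr-only frame record.
  assert(FixedObject - OffsetToFirstCalleeSaveFromFP == 16);
  return 0;
}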
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - + emitFrameOffset( + MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + } if (NeedsRealignment) { const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); assert(NrBitsToZero > 1); @@ -1532,109 +1843,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MBB.addLiveIn(AArch64::X1); } } - - if (needsFrameMoves) { - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - const int OffsetToFirstCalleeSaveFromFP = - AFI->getCalleeSaveBaseToFrameRecordOffset() - - AFI->getCalleeSavedStackSize(); - Register FramePtr = RegInfo->getFrameRegister(MF); - - // Define the current CFA rule to use the provided FP. 
-      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
-      unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
-    } else {
-      unsigned CFIIndex;
-      if (SVEStackSize) {
-        const TargetSubtargetInfo &STI = MF.getSubtarget();
-        const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-        StackOffset TotalSize =
-            SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
-        CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
-      } else {
-        // Encode the stack size of the leaf function.
-        CFIIndex = MF.addFrameInst(
-            MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
-      }
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
-    }
-
-    // Now emit the moves for whatever callee saved regs we have (including FP,
-    // LR if those are saved).
-    emitCalleeSavedFrameMoves(MBB, MBBI);
-  }
 }
 
 static void InsertReturnAddressAuth(MachineFunction &MF,
@@ -1653,7 +1861,8 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
   // The AUTIASP instruction assembles to a hint instruction before v8.3a so
   // this instruction can safely be used for any v8a architecture.
   // From v8.3a onwards there are optimised authenticate LR and return
-  // instructions, namely RETA{A,B}, that can be used instead.
+  // instructions, namely RETA{A,B}, that can be used instead. In this case the
+  // DW_CFA_AARCH64_negate_ra_state can't be emitted.
   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
     BuildMI(MBB, MBBI, DL,
@@ -1665,6 +1874,12 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
         MBB, MBBI, DL,
         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
         .setMIFlag(MachineInstr::FrameDestroy);
+
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameDestroy);
   }
 }
 
@@ -1686,6 +1901,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
   bool NeedsWinCFI = needsWinCFI(MF);
+  bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
   bool HasWinCFI = false;
   bool IsFunclet = false;
   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
@@ -1695,6 +1911,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     IsFunclet = isFuncletReturnInstr(*MBBI);
   }
 
+  auto FinishingTouches = make_scope_exit([&]() {
+    InsertReturnAddressAuth(MF, MBB);
+    if (needsShadowCallStackPrologueEpilogue(MF))
+      emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
+    if (EmitCFI)
+      emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
+  });
+
   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                                : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -1707,36 +1931,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
   // How much of the stack used by incoming arguments this function is expected
   // to restore in this particular epilogue.
   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
-
-  // The stack frame should be like below,
-  //
-  //      ----------------------                     ---
-  //      |                    |                      |
-  //      | BytesInStackArgArea|              CalleeArgStackSize
-  //      | (NumReusableBytes) |                (of tail call)
-  //      |                    |                     ---
-  //      |                    |                      |
-  //      ---------------------|        ---           |
-  //      |                    |         |            |
-  //      |   CalleeSavedReg   |         |            |
-  //      | (CalleeSavedStackSize)|      |            |
-  //      |                    |         |            |
-  //      ---------------------|         |         NumBytes
-  //      |                    |     StackSize  (StackAdjustUp)
-  //      |   LocalStackSize   |         |            |
-  //      | (covering callee   |         |            |
-  //      |       args)        |         |            |
-  //      |                    |         |            |
-  //      ----------------------        ---          ---
-  //
-  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
-  //             = StackSize + ArgumentPopSize
-  //
-  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
-  // it as the 2nd argument of AArch64ISD::TC_RETURN.
-
-  auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
-
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
 
@@ -1771,9 +1965,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
+  bool CombineAfterCSRBump = false;
   if (!CombineSPBump && PrologueSaveSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
-    while (AArch64InstrInfo::isSEHInstruction(*Pop))
+    while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
+           AArch64InstrInfo::isSEHInstruction(*Pop))
       Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
     // ldp's offset is 0.
     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
     // If the offset is 0 and the AfterCSR pop is not actually trying to
     // allocate more stack for arguments (in space that an untimely interrupt
     // may clobber), convert it to a post-index ldp.
-    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
+    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
       convertCalleeSaveRestoreToSPPrePostIncDec(
-          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
-    else {
+          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
+          MachineInstr::FrameDestroy, PrologueSaveSize);
+    } else {
       // If not, make sure to emit an add after the last ldp.
       // We're doing this by transferring the size to be restored from the
       // adjustment *before* the CSR pops to the adjustment *after* the CSR
      // pops.
       AfterCSRPopSize += PrologueSaveSize;
+      CombineAfterCSRBump = true;
     }
   }
 
@@ -1822,15 +2020,27 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 
   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
-    // We need to reset FP to its untagged state on return. Bit 60 is currently
-    // used to show the presence of an extended frame.
-
-    // BIC x29, x29, #0x1000_0000_0000_0000
-    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
-            AArch64::FP)
-        .addUse(AArch64::FP)
-        .addImm(0x10fe)
-        .setMIFlag(MachineInstr::FrameDestroy);
+    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+    case SwiftAsyncFramePointerMode::DeploymentBased:
+      // Avoid the reload as it is GOT relative, and instead fall back to the
+      // hardcoded value below. This allows a mismatch between the OS and
+      // application without immediately terminating on the difference.
+      LLVM_FALLTHROUGH;
+    case SwiftAsyncFramePointerMode::Always:
+      // We need to reset FP to its untagged state on return. Bit 60 is
+      // currently used to show the presence of an extended frame.
+
+      // BIC x29, x29, #0x1000_0000_0000_0000
+      BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
+              AArch64::FP)
+          .addUse(AArch64::FP)
+          .addImm(0x10fe)
+          .setMIFlag(MachineInstr::FrameDestroy);
+      break;
+
+    case SwiftAsyncFramePointerMode::Never:
+      break;
+    }
   }
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
 
@@ -1838,10 +2048,22 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+
+    // When we are about to restore the CSRs, the CFA register is SP again.
+    if (EmitCFI && hasFP(MF)) {
+      const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
+      BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameDestroy);
+    }
+
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
-                    &HasWinCFI);
+                    &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
     if (HasWinCFI)
       BuildMI(MBB, MBB.getFirstTerminator(), DL,
               TII->get(AArch64::SEH_EpilogEnd))
@@ -1873,30 +2095,44 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
   // Deallocate the SVE area.
   if (SVEStackSize) {
-    if (AFI->isStackRealigned()) {
-      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
+    // If we have stack realignment or variable sized objects on the stack,
+    // restore the stack pointer from the frame pointer prior to SVE CSR
+    // restoration.
+    if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
+      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
         // Set SP to start of SVE callee-save area from which they can
         // be reloaded. The code below will deallocate the stack space
        // by moving FP -> SP.
         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
                         StackOffset::getScalable(-CalleeSavedSize), TII,
                         MachineInstr::FrameDestroy);
+      }
     } else {
       if (AFI->getSVECalleeSavedStackSize()) {
         // Deallocate the non-SVE locals first before we can deallocate (and
        // restore callee saves) from the SVE area.
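// [Editor's aside, not part of the patch] The ANDXri in the Always case above
// is the alias "BIC x29, x29, #0x1000_0000_0000_0000": it clears bit 60 of
// the frame pointer, the bit used to advertise a Swift extended frame
// (0x10fe is the AArch64 logical-immediate encoding carried by the
// instruction, not the mask itself). The mask arithmetic in plain C++, with a
// hypothetical tagged frame-pointer value:
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t ExtendedFrameBit = 1ULL << 60;
  uint64_t FP = 0x1000000000ULL | ExtendedFrameBit; // assumed tagged fp value
  FP &= ~ExtendedFrameBit;                          // what the single BIC does
  assert(FP == 0x1000000000ULL);                    // bit 60 cleared
  return 0;
}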
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy); + emitFrameOffset( + MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, + false, false, nullptr, EmitCFI && !hasFP(MF), + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); NumBytes = 0; } emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy); + DeallocateBefore, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy); + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); } + if (EmitCFI) + emitCalleeSavedSVERestores(MBB, RestoreEnd); } if (!hasFP(MF)) { @@ -1906,23 +2142,24 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (RedZone && AfterCSRPopSize == 0) return; + // Pop the local variables off the stack. If there are no callee-saved + // registers, it means we are actually positioned at the terminator and can + // combine stack increment for the locals and the stack increment for + // callee-popped arguments into (possibly) a single instruction and be done. bool NoCalleeSaveRestore = PrologueSaveSize == 0; int64_t StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += AfterCSRPopSize; + emitFrameOffset( + MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(StackRestoreBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize)); + // If we were able to combine the local stack pop with the argument pop, // then we're done. - bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0; - - // If we're done after this, make sure to help the load store optimizer. - if (Done) - adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); - - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(StackRestoreBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - if (Done) { + if (NoCalleeSaveRestore || AfterCSRPopSize == 0) { if (HasWinCFI) { BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1948,29 +2185,29 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + // When we are about to restore the CSRs, the CFA register is SP again. + if (EmitCFI && hasFP(MF)) { + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize)); + BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); + } + // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save // code in the prologue. 
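// [Editor's aside, not part of the patch] Bookkeeping behind the epilogue CFI
// above: once the CFA rule is switched back to SP at offset PrologueSaveSize,
// folding the final callee-save pop into a post-index ldp
// (convertCalleeSaveRestoreToSPPrePostIncDec) emits
// ".cfi_def_cfa_offset CFAOffset - CSStackSizeInc". A tiny check with an
// assumed 16-byte fp/lr pair:
#include <cassert>

int main() {
  const int PrologueSaveSize = 16;             // hypothetical: one ldp of fp, lr
  const int CFAOffset = PrologueSaveSize;      // CFA rule before the pop
  const int CSStackSizeInc = PrologueSaveSize; // SP grows by this in epilogue
  assert(CFAOffset - CSStackSizeInc == 0);     // back to .cfi_def_cfa_offset 0
  return 0;
}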
if (AfterCSRPopSize) { assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " "interrupt may have clobbered"); - // Find an insertion point for the first ldp so that it goes before the - // shadow call stack epilog instruction. This ensures that the restore of - // lr from x18 is placed after the restore from sp. - auto FirstSPPopI = MBB.getFirstTerminator(); - while (FirstSPPopI != Begin) { - auto Prev = std::prev(FirstSPPopI); - if (Prev->getOpcode() != AArch64::LDRXpre || - Prev->getOperand(0).getReg() == AArch64::SP) - break; - FirstSPPopI = Prev; - } - adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); - - emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AfterCSRPopSize), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset( + MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0)); } if (HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -2061,8 +2298,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // right thing for the emergency spill slot. bool UseFP = false; if (AFI->hasStackFrame() && !isSVE) { - // We shouldn't prefer using the FP when there is an SVE area - // in between the FP and the non-SVE locals/spills. + // We shouldn't prefer using the FP to access fixed-sized stack objects when + // there are scalable (SVE) objects in between the FP and the fixed-sized + // objects. PreferFP &= !SVEStackSize; // Note: Keeping the following as multiple 'if' statements rather than @@ -2083,7 +2321,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // offsets is smaller than for positive ones. If an offset is available // via the FP and the SP, use whichever is closest. bool FPOffsetFits = !ForSimm || FPOffset >= -256; - PreferFP |= Offset > -FPOffset; + PreferFP |= Offset > -FPOffset && !SVEStackSize; if (MFI.hasVarSizedObjects()) { // If we have variable sized objects, we can use either FP or BP, as the @@ -2270,7 +2508,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, ArrayRef CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, - bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { + bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -2349,15 +2587,6 @@ static void computeCalleeSaveRegisterPairs( } } - // If either of the registers to be saved is the lr register, it means that - // we also need to save lr in the shadow call stack. - if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) && - MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { - if (!MF.getSubtarget().isXRegisterReserved(18)) - report_fatal_error("Must reserve x18 to use shadow call stack"); - NeedShadowCallStackProlog = true; - } - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. 
@@ -2476,43 +2705,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; SmallVector RegPairs; - bool NeedShadowCallStackProlog = false; - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog, hasFP(MF)); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (NeedShadowCallStackProlog) { - // Shadow call stack prolog: str x30, [x18], #8 - BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR) - .addReg(AArch64::X18) - .addImm(8) - .setMIFlag(MachineInstr::FrameSetup); - - if (NeedsWinCFI) - BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - - // Emit a CFI instruction that causes 8 to be subtracted from the value of - // x18 when unwinding past this frame. - static const char CFIInst[] = { - dwarf::DW_CFA_val_expression, - 18, // register - 2, // length - static_cast(unsigned(dwarf::DW_OP_breg18)), - static_cast(-8) & 0x7f, // addend (sleb128) - }; - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( - nullptr, StringRef(CFIInst, sizeof(CFIInst)))); - BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - - // This instruction also makes x18 live-in to the entry block. - MBB.addLiveIn(AArch64::X18); - } + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (homogeneousPrologEpilog(MF)) { auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) .setMIFlag(MachineInstr::FrameSetup); @@ -2622,7 +2817,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } bool AArch64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -2630,14 +2825,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( SmallVector RegPairs; bool NeedsWinCFI = needsWinCFI(MF); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); - bool NeedShadowCallStackProlog = false; - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog, hasFP(MF)); + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) { + auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -2694,7 +2887,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( std::swap(Reg1, Reg2); std::swap(FrameIdxReg1, FrameIdxReg2); } - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc)); if (RPI.isPaired()) { MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( @@ -2711,6 +2904,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); + + return MIB->getIterator(); }; // SVE objects are always restored in reverse order. 
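// [Editor's aside, not part of the patch] In the ReverseCSRRestoreSeq path of
// the next hunk, the restores are emitted in reverse order and the first
// instruction emitted is then spliced to the end of the sequence. That
// appears to recreate what the removed adaptForLdStOpt did: the ldp that
// reloads from [sp] ends up adjacent to the SP increment, so the load/store
// optimizer can merge the pair into a post-index ldp. A std::list analogy of
// the splice:
#include <cassert>
#include <iterator>
#include <list>
#include <string>

int main() {
  std::list<std::string> MBB = {"ldp x26, x25, [sp]", "ldp x24, x23, [sp, #16]",
                                "add sp, sp, #32"};
  auto First = MBB.begin();              // first restore emitted
  auto InsertPt = std::prev(MBB.end());  // position of the SP update
  MBB.splice(InsertPt, MBB, First);      // move it next to "add sp"
  assert((MBB == std::list<std::string>{"ldp x24, x23, [sp, #16]",
                                        "ldp x26, x25, [sp]",
                                        "add sp, sp, #32"}));
  return 0;
}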
@@ -2718,31 +2913,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( if (RPI.isScalable()) EmitMI(RPI); - if (ReverseCSRRestoreSeq) { - for (const RegPairInfo &RPI : reverse(RegPairs)) - if (!RPI.isScalable()) - EmitMI(RPI); - } else if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) + if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); for (auto &RPI : RegPairs) { MIB.addReg(RPI.Reg1, RegState::Define); MIB.addReg(RPI.Reg2, RegState::Define); } return true; - } else - for (const RegPairInfo &RPI : RegPairs) - if (!RPI.isScalable()) - EmitMI(RPI); - - if (NeedShadowCallStackProlog) { - // Shadow call stack epilog: ldr x30, [x18, #-8]! - BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR, RegState::Define) - .addReg(AArch64::X18) - .addImm(-8) - .setMIFlag(MachineInstr::FrameDestroy); + } + + if (ReverseCSRRestoreSeq) { + MachineBasicBlock::iterator First = MBB.end(); + for (const RegPairInfo &RPI : reverse(RegPairs)) { + if (RPI.isScalable()) + continue; + MachineBasicBlock::iterator It = EmitMI(RPI); + if (First == MBB.end()) + First = It; + } + if (First != MBB.end()) + MBB.splice(MBBI, &MBB, First); + } else { + for (const RegPairInfo &RPI : RegPairs) { + if (RPI.isScalable()) + continue; + (void)EmitMI(RPI); + } } return true; @@ -2941,6 +3138,15 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( // stack slots for them. MachineFrameInfo &MFI = MF.getFrameInfo(); auto *AFI = MF.getInfo(); + + bool UsesWinAAPCS = isTargetWindows(MF); + if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) { + int FrameIdx = MFI.CreateStackObject(8, Align(16), true); + AFI->setSwiftAsyncContextFrameIdx(FrameIdx); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } + for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -2954,7 +3160,8 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; // Grab 8 bytes below FP for the extended asynchronous frame info. - if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) { + if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS && + Reg == AArch64::FP) { FrameIdx = MFI.CreateStackObject(8, Alignment, true); AFI->setSwiftAsyncContextFrameIdx(FrameIdx); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; @@ -3190,7 +3397,7 @@ public: // instructions. May skip if the replacement is not profitable. May invalidate // the input iterator and replace it with a valid one. 
void emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast); + const AArch64FrameLowering *TFI, bool TryMergeSPUpdate); }; void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { @@ -3329,7 +3536,8 @@ void mergeMemRefs(const SmallVectorImpl &TSE, } void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { + const AArch64FrameLowering *TFI, + bool TryMergeSPUpdate) { if (TagStores.empty()) return; TagStoreInstr &FirstTagStore = TagStores[0]; @@ -3359,8 +3567,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, emitUnrolled(InsertI); } else { MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { + int64_t TotalOffset = 0; + if (TryMergeSPUpdate) { // See if we can merge base register update into the STGloop. // This is done in AArch64LoadStoreOptimizer for "normal" stores, // but STGloop is way too unusual for that, and also it only @@ -3505,7 +3713,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, for (auto &Instr : Instrs) { if (EndOffset && *EndOffset != Instr.Offset) { // Found a gap. - TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false); TSE.clear(); } @@ -3513,7 +3721,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, EndOffset = Instr.Offset + Instr.Size; } - TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + // Multiple FP/SP updates in a loop cannot be described by CFI instructions. + TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ + !MBB->getParent() + ->getInfo() + ->needsAsyncDwarfUnwindInfo()); return InsertI; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 31f57cbc49f2..f59860a24d9b 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -29,6 +29,8 @@ public: void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; + void resetCFIToInitialState(MachineBasicBlock &MBB) const override; + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -141,13 +143,20 @@ private: int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; - MCCFIInstruction - createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI, - const StackOffset &OffsetFromSP) const; - MCCFIInstruction createCfaOffset(const TargetRegisterInfo &MRI, unsigned DwarfReg, - const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; + void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVELocations(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + + /// Emit target zero call-used regs. 
+ void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 899f069abdd4..82fe5772c99d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -159,6 +159,22 @@ public: return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); } + bool SelectExtractHigh(SDValue N, SDValue &Res) { + if (Subtarget->isLittleEndian() && N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0); + if (N->getOpcode() != ISD::EXTRACT_SUBVECTOR || + !isa(N->getOperand(1))) + return false; + EVT VT = N->getValueType(0); + EVT LVT = N->getOperand(0).getValueType(); + unsigned Index = N->getConstantOperandVal(1); + if (!VT.is64BitVector() || !LVT.is128BitVector() || + Index != VT.getVectorNumElements()) + return false; + Res = N->getOperand(0); + return true; + } + bool SelectDupZeroOrUndef(SDValue N) { switch(N->getOpcode()) { case ISD::UNDEF: @@ -204,6 +220,11 @@ public: return SelectSVEAddSubImm(N, VT, Imm, Shift); } + template + bool SelectSVECpyDupImm(SDValue N, SDValue &Imm, SDValue &Shift) { + return SelectSVECpyDupImm(N, VT, Imm, Shift); + } + template bool SelectSVELogicalImm(SDValue N, SDValue &Imm) { return SelectSVELogicalImm(N, VT, Imm, Invert); @@ -219,6 +240,16 @@ public: return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm); } + bool SelectSVEShiftSplatImmR(SDValue N, SDValue &Imm) { + if (N->getOpcode() != ISD::SPLAT_VECTOR) + return false; + + EVT EltVT = N->getValueType(0).getVectorElementType(); + return SelectSVEShiftImm(N->getOperand(0), /* Low */ 1, + /* High */ EltVT.getFixedSizeInBits(), + /* AllowSaturation */ true, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. template bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -257,6 +288,15 @@ public: return false; } + template bool ImmToTile(SDValue N, SDValue &Imm) { + if (auto *CI = dyn_cast(N)) { + uint64_t C = CI->getZExtValue(); + Imm = CurDAG->getRegister(BaseReg + C, MVT::Other); + return true; + } + return false; + } + /// Form sequences of consecutive 64/128-bit registers for use in NEON /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have /// between 1 and 4 elements. 
If it contains a single element that is returned @@ -300,6 +340,11 @@ public: return SelectSVERegRegAddrMode(N, Scale, Base, Offset); } + template + bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { + return SelectSMETileSlice(N, Scale, Vector, Offset); + } + void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); @@ -357,10 +402,8 @@ private: bool SelectCMP_SWAP(SDNode *N); - bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift); - bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); - + bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); @@ -370,6 +413,8 @@ private: bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, SDValue &Offset); + bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector, + SDValue &Offset); bool SelectAllActivePredicate(SDValue N); }; @@ -822,9 +867,17 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, Reg = N.getOperand(0); - // Don't match if free 32-bit -> 64-bit zext can be used instead. - if (Ext == AArch64_AM::UXTW && - Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode())) + // Don't match if free 32-bit -> 64-bit zext can be used instead. Use the + // isDef32 as a heuristic for when the operand is likely to be a 32bit def. + auto isDef32 = [](SDValue N) { + unsigned Opc = N.getOpcode(); + return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && + Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && + Opc != ISD::AssertZext && Opc != ISD::AssertAlign && + Opc != ISD::FREEZE; + }; + if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 && + isDef32(Reg)) return false; } @@ -1852,6 +1905,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, VT = Opd0->getValueType(0); } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) { Opd0 = Op0->getOperand(0); + ClampMSB = (VT == MVT::i32); } else if (BiggerPattern) { // Let's pretend a 0 shift right has been performed. // The resulting code will be at least as good as the original one @@ -2710,8 +2764,16 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, // shift the needed bits into place. SDLoc DL(N); unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + uint64_t LsrImm = LSB; + if (Src->hasOneUse() && + isOpcWithIntImmediate(Src.getNode(), ISD::SRL, LsrImm) && + (LsrImm + LSB) < BitWidth) { + Src = Src->getOperand(0); + LsrImm += LSB; + } + SDNode *LSR = CurDAG->getMachineNode( - ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT), + ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LsrImm, DL, VT), CurDAG->getTargetConstant(BitWidth - 1, DL, VT)); // BFXIL is an alias of BFM, so translate to BFM operands. @@ -2827,15 +2889,15 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { SDValue Add1 = ShiftAmt->getOperand(1); uint64_t Add0Imm; uint64_t Add1Imm; - // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X - // to avoid the ADD/SUB. 
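// [Editor's aside, not part of the patch] The rewrites in tryShiftAmountMod
// below are sound because AArch64 variable shifts only consume the shift
// amount modulo the operation size, so for 64-bit shifts:
//   (N - X) == -X (mod 64) when N ==  0 (mod 64)  -> emit a NEG
//   (N - X) == ~X (mod 64) when N == -1 (mod 64)  -> emit a NOT (~X == -X - 1)
// A quick exhaustive check of both identities:
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 1024; ++X) {
    assert(((64 - X) & 63) == ((0 - X) & 63)); // N == 0 (mod 64)
    assert(((63 - X) & 63) == (~X & 63));      // N == -1 (mod 64)
  }
  return 0;
}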
- if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) + if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) { + // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X + // to avoid the ADD/SUB. NewShiftAmt = Add0; - // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to - // generate a NEG instead of a SUB of a constant. - else if (ShiftAmt->getOpcode() == ISD::SUB && - isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 && - (Add0Imm % Size == 0)) { + } else if (ShiftAmt->getOpcode() == ISD::SUB && + isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 && + (Add0Imm % Size == 0)) { + // If we are shifting by N-X where N == 0 mod Size, then just shift by -X + // to generate a NEG instead of a SUB from a constant. unsigned NegOpc; unsigned ZeroReg; EVT SubVT = ShiftAmt->getValueType(0); @@ -2852,6 +2914,26 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { MachineSDNode *Neg = CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1); NewShiftAmt = SDValue(Neg, 0); + } else if (ShiftAmt->getOpcode() == ISD::SUB && + isIntImmediate(Add0, Add0Imm) && (Add0Imm % Size == Size - 1)) { + // If we are shifting by N-X where N == -1 mod Size, then just shift by ~X + // to generate a NOT instead of a SUB from a constant. + unsigned NotOpc; + unsigned ZeroReg; + EVT SubVT = ShiftAmt->getValueType(0); + if (SubVT == MVT::i32) { + NotOpc = AArch64::ORNWrr; + ZeroReg = AArch64::WZR; + } else { + assert(SubVT == MVT::i64); + NotOpc = AArch64::ORNXrr; + ZeroReg = AArch64::XZR; + } + SDValue Zero = + CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT); + MachineSDNode *Not = + CurDAG->getMachineNode(NotOpc, DL, SubVT, Zero, Add1); + NewShiftAmt = SDValue(Not, 0); } else return false; } else { @@ -3108,72 +3190,81 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { return true; } -bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base, - SDValue &Offset) { - auto C = dyn_cast(N); - if (!C) +bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, + SDValue &Shift) { + if (!isa(N)) return false; - auto Ty = N->getValueType(0); - - int64_t Imm = C->getSExtValue(); SDLoc DL(N); - - if ((Imm >= -128) && (Imm <= 127)) { - Base = CurDAG->getTargetConstant(Imm, DL, Ty); - Offset = CurDAG->getTargetConstant(0, DL, Ty); - return true; - } - - if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) { - Base = CurDAG->getTargetConstant(Imm/256, DL, Ty); - Offset = CurDAG->getTargetConstant(8, DL, Ty); + uint64_t Val = cast(N) + ->getAPIntValue() + .trunc(VT.getFixedSizeInBits()) + .getZExtValue(); + + switch (VT.SimpleTy) { + case MVT::i8: + // All immediates are supported. + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); return true; + case MVT::i16: + case MVT::i32: + case MVT::i64: + // Support 8bit unsigned immediates. + if (Val <= 255) { + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); + return true; + } + // Support 16bit unsigned immediates that are a multiple of 256. 
+ if (Val <= 65280 && Val % 256 == 0) { + Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32); + return true; + } + break; + default: + break; } return false; } -bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) { - if (auto CNode = dyn_cast(N)) { - const int64_t ImmVal = CNode->getSExtValue(); - SDLoc DL(N); +bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, + SDValue &Shift) { + if (!isa(N)) + return false; - switch (VT.SimpleTy) { - case MVT::i8: - // Can always select i8s, no shift, mask the immediate value to - // deal with sign-extended value from lowering. + SDLoc DL(N); + int64_t Val = cast(N) + ->getAPIntValue() + .trunc(VT.getFixedSizeInBits()) + .getSExtValue(); + + switch (VT.SimpleTy) { + case MVT::i8: + // All immediates are supported. + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); + return true; + case MVT::i16: + case MVT::i32: + case MVT::i64: + // Support 8bit signed immediates. + if (Val >= -128 && Val <= 127) { Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal & 0xFF, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); + return true; + } + // Support 16bit signed immediates that are a multiple of 256. + if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) { + Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32); return true; - case MVT::i16: - // i16 values get sign-extended to 32-bits during lowering. - if ((ImmVal & 0xFF) == ImmVal) { - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); - return true; - } else if ((ImmVal & 0xFF) == 0) { - Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32); - return true; - } - break; - case MVT::i32: - case MVT::i64: - // Range of immediate won't trigger signedness problems for 32/64b. 
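// [Editor's aside, not part of the patch] SelectSVEAddSubImm and
// SelectSVECpyDupImm both split a constant into an 8-bit payload plus a left
// shift of 0 or 8, matching the "#imm, lsl #sh" immediate forms. A
// hypothetical stand-alone version of the unsigned (ADD/SUB) rule; the signed
// (CPY/DUP) rule is analogous over [-128, 127] and [-32768, 32512]:
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

std::optional<std::pair<uint8_t, unsigned>> splitAddSubImm(uint64_t Val) {
  if (Val <= 255) // fits the 8-bit field directly, no shift
    return std::pair<uint8_t, unsigned>{static_cast<uint8_t>(Val), 0u};
  if (Val <= 65280 && Val % 256 == 0) // 8-bit value shifted left by 8
    return std::pair<uint8_t, unsigned>{static_cast<uint8_t>(Val >> 8), 8u};
  return std::nullopt; // not encodable in this immediate form
}

int main() {
  assert((splitAddSubImm(42).value() == std::pair<uint8_t, unsigned>{42, 0u}));
  assert((splitAddSubImm(0x1200).value() ==
          std::pair<uint8_t, unsigned>{0x12, 8u}));
  assert(!splitAddSubImm(257).has_value());
  return 0;
}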
- if ((ImmVal & 0xFF) == ImmVal) { - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); - return true; - } else if ((ImmVal & 0xFF00) == ImmVal) { - Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32); - return true; - } - break; - default: - break; } + break; + default: + break; } return false; @@ -3901,7 +3992,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H, true); return; @@ -3922,7 +4013,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H, true); return; @@ -3943,7 +4034,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H, true); return; @@ -4267,7 +4358,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4284,7 +4375,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4301,7 +4392,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4911,7 +5002,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4928,7 +5019,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4945,7 +5036,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 || VT == 
MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -5033,6 +5124,10 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { const unsigned IntNo = cast(Root->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::aarch64_sme_ldr || + IntNo == Intrinsic::aarch64_sme_str) + return MVT::nxv16i8; + if (IntNo != Intrinsic::aarch64_sve_prf) return EVT(); @@ -5051,12 +5146,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &OffImm) { const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); const DataLayout &DL = CurDAG->getDataLayout(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); - return true; + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) { + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + return false; } if (MemVT == EVT()) @@ -5083,7 +5185,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); @@ -5149,3 +5254,30 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) { return TLI->isAllActivePredicate(*CurDAG, N); } + +bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale, + SDValue &Base, SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) { + Base = N; + Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + // Process an ADD node. 
+ const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + if (auto C = dyn_cast(RHS)) { + int64_t ImmOff = C->getSExtValue(); + unsigned MaxSize = (1 << Scale) - 1; + + if (ImmOff < 0 || ImmOff > MaxSize) + return false; + + Base = LHS; + Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c539c8617d99..abfe2d507111 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -208,6 +208,7 @@ static bool isMergePassthruOpcode(unsigned Opc) { case AArch64ISD::BSWAP_MERGE_PASSTHRU: case AArch64ISD::REVH_MERGE_PASSTHRU: case AArch64ISD::REVW_MERGE_PASSTHRU: + case AArch64ISD::REVD_MERGE_PASSTHRU: case AArch64ISD::CTLZ_MERGE_PASSTHRU: case AArch64ISD::CTPOP_MERGE_PASSTHRU: case AArch64ISD::DUP_MERGE_PASSTHRU: @@ -289,8 +290,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8bf16); } - if (Subtarget->hasSVE()) { + if (Subtarget->hasSVE() || Subtarget->hasSME()) { // Add legal sve predicate types + addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); @@ -324,50 +326,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (useSVEForFixedLengthVectorVT(VT)) addRegisterClass(VT, &AArch64::ZPRRegClass); } - - for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - } - - for (auto VT : - { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, - MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) - setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); - - for (auto VT : - { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, - MVT::nxv2f64 }) { - setCondCodeAction(ISD::SETO, VT, Expand); - setCondCodeAction(ISD::SETOLT, VT, Expand); - setCondCodeAction(ISD::SETLT, VT, Expand); - setCondCodeAction(ISD::SETOLE, VT, Expand); - setCondCodeAction(ISD::SETLE, VT, Expand); - setCondCodeAction(ISD::SETULT, VT, Expand); - setCondCodeAction(ISD::SETULE, VT, Expand); - setCondCodeAction(ISD::SETUGE, VT, Expand); - setCondCodeAction(ISD::SETUGT, VT, Expand); - setCondCodeAction(ISD::SETUEQ, VT, Expand); - setCondCodeAction(ISD::SETUNE, VT, Expand); - - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); - } } // Compute derived properties from the register classes @@ -389,7 +347,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Custom); @@ -448,6 +406,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently + // aren't handled. // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. @@ -508,16 +468,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. - setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); - // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); @@ -568,6 +518,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); + setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); + setOperationAction(ISD::ADDCARRY, MVT::i64, Custom); + setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); + setOperationAction(ISD::SUBCARRY, MVT::i64, Custom); + setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom); + setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom); + setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom); + setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); @@ -581,64 +540,41 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, else setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::v4f16, Expand); - setOperationAction(ISD::FREM, MVT::v8f16, Expand); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::v4f16, Expand); - setOperationAction(ISD::FPOW, MVT::v8f16, Expand); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::v4f16, Expand); - setOperationAction(ISD::FCOS, MVT::v8f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::v4f16, Expand); - setOperationAction(ISD::FSIN, MVT::v8f16, Expand); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - 
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
- setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
- setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
- setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+ for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
+ ISD::FEXP, ISD::FEXP2, ISD::FLOG,
+ ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM,
+ ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
+ ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
+ ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::v4f16, Expand);
+ setOperationAction(Op, MVT::v8f16, Expand);
+ }

if (!Subtarget->hasFullFP16()) {
- setOperationAction(ISD::SELECT, MVT::f16, Promote);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
- setOperationAction(ISD::SETCC, MVT::f16, Promote);
- setOperationAction(ISD::BR_CC, MVT::f16, Promote);
- setOperationAction(ISD::FADD, MVT::f16, Promote);
- setOperationAction(ISD::FSUB, MVT::f16, Promote);
- setOperationAction(ISD::FMUL, MVT::f16, Promote);
- setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
- setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FABS, MVT::f16, Promote);
- setOperationAction(ISD::FCEIL, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
- setOperationAction(ISD::FRINT, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
- setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
+ for (auto Op :
+ {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
+ ISD::BR_CC, ISD::FADD, ISD::FSUB,
+ ISD::FMUL, ISD::FDIV, ISD::FMA,
+ ISD::FNEG, ISD::FABS, ISD::FCEIL,
+ ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
+ ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
+ ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
+ ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
+ setOperationAction(Op, MVT::f16, Promote);
+
+ // Round-to-integer ops need custom lowering for fp16, as Promote doesn't
+ // work because the result type is integer.
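// (A sketch of the Custom expansion these nodes get, assuming the same
// widen-to-f32 strategy used elsewhere in this patch; the SSA names are
// illustrative only:
//   %ext = STRICT_FP_EXTEND {f32, ch} (%chain, %x:f16)
//   %res = STRICT_LROUND   {i64, ch} (%ext:ch, %ext)
// The integer result is produced directly, which Promote cannot express.)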
+ for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, + ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); // promote v4f16 to v4f32 when that is known to be safe. setOperationAction(ISD::FADD, MVT::v4f16, Promote); @@ -691,37 +627,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. - for (MVT Ty : {MVT::f32, MVT::f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - setOperationAction(ISD::FMINNUM, Ty, Legal); - setOperationAction(ISD::FMAXNUM, Ty, Legal); - setOperationAction(ISD::FMINIMUM, Ty, Legal); - setOperationAction(ISD::FMAXIMUM, Ty, Legal); - setOperationAction(ISD::LROUND, Ty, Legal); - setOperationAction(ISD::LLROUND, Ty, Legal); - setOperationAction(ISD::LRINT, Ty, Legal); - setOperationAction(ISD::LLRINT, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, + ISD::FRINT, ISD::FTRUNC, ISD::FROUND, + ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, + ISD::LLROUND, ISD::LRINT, ISD::LLRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, + ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, + ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Basic strict FP operations are legal + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Strict conversion to a larger type is legal + for (auto VT : {MVT::f32, MVT::f64}) + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); setOperationAction(ISD::PREFETCH, MVT::Other, Custom); @@ -891,47 +825,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. 
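// (The brace-list registrations below rely on the ArrayRef overload of
// setTargetDAGCombine; a minimal sketch of that batching pattern, with
// simplified types, purely for illustration:
//   void setTargetDAGCombine(ArrayRef<ISD::NodeType> NTs) {
//     for (ISD::NodeType NT : NTs)
//       setTargetDAGCombine(NT);  // same per-opcode effect, one call site
//   }
// The behaviour of the registered combines is unchanged.)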
- setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::ABS); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::FP_TO_UINT); - setTargetDAGCombine(ISD::FP_TO_SINT_SAT); - setTargetDAGCombine(ISD::FP_TO_UINT_SAT); - setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, + ISD::UINT_TO_FP}); + + setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::FDIV}); // Try and combine setcc with csel setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::VECTOR_SPLICE); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::INSERT_SUBVECTOR); - setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, + ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG, + ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, + ISD::INSERT_SUBVECTOR, ISD::STORE}); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::VECREDUCE_ADD); - setTargetDAGCombine(ISD::STEP_VECTOR); + setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, + ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); + + setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); setTargetDAGCombine(ISD::FP_EXTEND); @@ -980,43 +900,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); - setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, 
Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); + for (auto Op : + {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC, + ISD::BR_CC, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FDIV, ISD::FMA, + ISD::FNEG, ISD::FABS, ISD::FCEIL, + ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, + ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, + ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, + ISD::STRICT_FMAXIMUM}) + setOperationAction(Op, MVT::v1f64, Expand); + + for (auto Op : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, + ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, + ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) + setOperationAction(Op, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. @@ -1024,14 +930,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP}) + for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) + setOperationAction(Op, VT, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); @@ -1088,6 +992,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + setOperationAction(ISD::AVGFLOORS, VT, Legal); + setOperationAction(ISD::AVGFLOORU, VT, Legal); + setOperationAction(ISD::AVGCEILS, VT, Legal); + setOperationAction(ISD::AVGCEILU, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); } @@ -1141,31 +1049,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. 
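// (For reference, the standard ISD-to-instruction mappings behind this:
//   FFLOOR -> FRINTM, FCEIL -> FRINTP, FTRUNC -> FRINTZ, FROUND -> FRINTA,
//   FROUNDEVEN -> FRINTN, FNEARBYINT -> FRINTI, FRINT -> FRINTX.)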
- for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, + ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, + ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, + ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { + for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + for (MVT Ty : {MVT::v4f16, MVT::v8f16}) + setOperationAction(Op, Ty, Legal); } - if (Subtarget->hasSVE()) - setOperationAction(ISD::VSCALE, MVT::i32, Custom); - setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); @@ -1174,6 +1069,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + + // ADDP custom lowering + for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::ADD, VT, Custom); + // FADDP custom lowering + for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) + setOperationAction(ISD::FADD, VT, Custom); + } + + if (Subtarget->hasSME()) { + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); } if (Subtarget->hasSVE()) { @@ -1194,7 +1100,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1224,6 +1130,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); + + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); } // Illegal unpacked integer vector types. @@ -1234,10 +1149,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Legalize unpacked bitcasts to REINTERPRET_CAST. 
for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, - MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) + MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) setOperationAction(ISD::BITCAST, VT, Custom); - for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { + for (auto VT : + { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, + MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + + for (auto VT : + {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1269,18 +1190,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); } - for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { - for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) { - // Avoid marking truncating FP stores as legal to prevent the - // DAGCombiner from creating unsupported truncating stores. + // Firstly, exclude all scalable vector extending loads/truncating stores, + // include both integer and floating scalable vector. + for (MVT VT : MVT::scalable_vector_valuetypes()) { + for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); - // SVE does not have floating-point extending loads. setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } + // Then, selectively enable those which we directly support. + setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); + setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); + setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); + setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); + setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); + setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); + for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); + setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); + setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); + setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); + } + // SVE supports truncating stores of 64 and 128-bit vectors setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); @@ -1295,7 +1231,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); @@ -1326,6 +1262,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, 
Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETONE, VT, Expand); } for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { @@ -1334,13 +1293,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); } - setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + // NEON doesn't support integer divides, but SVE does + for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, + MVT::v4i32, MVT::v1i64, MVT::v2i64}) { + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + } + + // NEON doesn't support 64-bit vector integer muls, but SVE does. + setOperationAction(ISD::MUL, MVT::v1i64, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. 
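// (Roughly what Custom means for the divides above, assuming the usual
// fixed-to-scalable wrapping done by this file's helpers; a sketch only:
//   nxv4i32 %wl = convertToScalableVector(v4i32 %lhs)
//   nxv4i32 %wr = convertToScalableVector(v4i32 %rhs)
//   nxv4i32 %wd = SDIV_PRED(%pg, %wl, %wr)   ; predicated SVE SDIV
//   v4i32   %d  = convertFromScalableVector(%wd))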
if (Subtarget->useSVEForFixedLengthVectors()) { @@ -1367,32 +1336,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHS, MVT::v1i64, Custom); setOperationAction(ISD::MULHS, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v1i64, Custom); setOperationAction(ISD::MULHU, MVT::v2i64, Custom); - setOperationAction(ISD::SDIV, MVT::v8i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i8, Custom); - setOperationAction(ISD::SDIV, MVT::v4i16, Custom); - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v2i32, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); - setOperationAction(ISD::SDIV, MVT::v1i64, Custom); - setOperationAction(ISD::SDIV, MVT::v2i64, Custom); setOperationAction(ISD::SMAX, MVT::v1i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Custom); setOperationAction(ISD::SMIN, MVT::v1i64, Custom); setOperationAction(ISD::SMIN, MVT::v2i64, Custom); - setOperationAction(ISD::UDIV, MVT::v8i8, Custom); - setOperationAction(ISD::UDIV, MVT::v16i8, Custom); - setOperationAction(ISD::UDIV, MVT::v4i16, Custom); - setOperationAction(ISD::UDIV, MVT::v8i16, Custom); - setOperationAction(ISD::UDIV, MVT::v2i32, Custom); - setOperationAction(ISD::UDIV, MVT::v4i32, Custom); - setOperationAction(ISD::UDIV, MVT::v1i64, Custom); - setOperationAction(ISD::UDIV, MVT::v2i64, Custom); setOperationAction(ISD::UMAX, MVT::v1i64, Custom); setOperationAction(ISD::UMAX, MVT::v2i64, Custom); setOperationAction(ISD::UMIN, MVT::v1i64, Custom); @@ -1426,6 +1377,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); + + setOperationAction(ISD::VSCALE, MVT::i32, Custom); } if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { @@ -1434,6 +1387,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); + + IsStrictFPEnabled = true; } void AArch64TargetLowering::addTypeForNEON(MVT VT) { @@ -1490,10 +1445,10 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + for (unsigned Opcode : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Opcode, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); @@ -1503,14 +1458,39 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); - // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. + // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP + // NEON types. 
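// (Reminder of the semantics being kept apart here: fminnum(2.0, NaN)
// returns 2.0, quietly ignoring the NaN, while fminimum(2.0, NaN) returns
// NaN, propagating it; AArch64 has native instructions for both flavours.)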
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
- {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
+ {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
+ ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
+ ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
+ ISD::STRICT_FSQRT})
setOperationAction(Opcode, VT, Legal);

+ // Strict fp extend and trunc are legal
+ if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
+ if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
+
+ // FIXME: We could potentially make use of the vector comparison instructions
+ // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
+ // complications:
+ // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
+ // so we would need to expand when the condition code doesn't match the
+ // kind of comparison.
+ // * Some kinds of comparison require more than one FCMXY instruction so
+ // would need to be expanded instead.
+ // * The lowering of the non-strict versions involves target-specific ISD
+ // nodes so we would likely need to add strict versions of all of them and
+ // handle them appropriately.
+ setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -1526,9 +1506,11 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
if (!Subtarget->hasSVE())
return true;

- // We can only support legal predicate result types.
+ // We can only support legal predicate result types. We can use the SVE
+ // whilelo instruction for generating fixed-width predicates too.
if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
- ResVT != MVT::nxv16i1)
+ ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
+ ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
return true;

// The whilelo instruction only works with i32 or i64 scalar inputs.
@@ -1559,7 +1541,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setCondCodeAction(ISD::SETUGE, VT, Expand);
setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);
- setCondCodeAction(ISD::SETUNE, VT, Expand);
+ setCondCodeAction(ISD::SETONE, VT, Expand);
}

// Mark integer truncating stores/extending loads as having custom lowering
@@ -1830,11 +1812,21 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
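/// For example, for the AArch64ISD::DUP case below, every lane of the result
/// carries the known bits of the splatted scalar, truncated when the scalar
/// is wider than the vector element.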
void AArch64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, KnownBits &Known, - const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; + case AArch64ISD::DUP: { + SDValue SrcOp = Op.getOperand(0); + Known = DAG.computeKnownBits(SrcOp, Depth + 1); + if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { + assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && + "Expected DUP implicit truncation"); + Known = Known.trunc(Op.getScalarValueSizeInBits()); + } + break; + } case AArch64ISD::CSEL: { KnownBits Known2; Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); @@ -2006,7 +1998,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) - MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) MAKE_CASE(AArch64ISD::MULHU_PRED) @@ -2016,7 +2007,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::SMIN_PRED) MAKE_CASE(AArch64ISD::SRA_PRED) MAKE_CASE(AArch64ISD::SRL_PRED) - MAKE_CASE(AArch64ISD::SUB_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) @@ -2061,6 +2051,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::DUPLANE16) MAKE_CASE(AArch64ISD::DUPLANE32) MAKE_CASE(AArch64ISD::DUPLANE64) + MAKE_CASE(AArch64ISD::DUPLANE128) MAKE_CASE(AArch64ISD::MOVI) MAKE_CASE(AArch64ISD::MOVIshift) MAKE_CASE(AArch64ISD::MOVIedit) @@ -2108,10 +2099,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMLTz) MAKE_CASE(AArch64ISD::SADDV) MAKE_CASE(AArch64ISD::UADDV) - MAKE_CASE(AArch64ISD::SRHADD) - MAKE_CASE(AArch64ISD::URHADD) - MAKE_CASE(AArch64ISD::SHADD) - MAKE_CASE(AArch64ISD::UHADD) MAKE_CASE(AArch64ISD::SDOT) MAKE_CASE(AArch64ISD::UDOT) MAKE_CASE(AArch64ISD::SMINV) @@ -2150,6 +2137,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FMINNMV_PRED) MAKE_CASE(AArch64ISD::FMUL_PRED) MAKE_CASE(AArch64ISD::FSUB_PRED) + MAKE_CASE(AArch64ISD::RDSVL) MAKE_CASE(AArch64ISD::BIC) MAKE_CASE(AArch64ISD::BIT) MAKE_CASE(AArch64ISD::CBZ) @@ -2267,10 +2255,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) + MAKE_CASE(AArch64ISD::ADDP) + MAKE_CASE(AArch64ISD::SADDLP) MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER) MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) @@ -2278,6 +2269,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) + MAKE_CASE(AArch64ISD::CALL_BTI) } #undef MAKE_CASE return nullptr; @@ -2351,6 +2343,92 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } +MachineBasicBlock * 
+AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.add(MI.getOperand(1)); // slice index register + MIB.add(MI.getOperand(2)); // slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // base + MIB.add(MI.getOperand(5)); // offset + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); + + MIB.addReg(AArch64::ZA, RegState::Define); + MIB.add(MI.getOperand(0)); // Vector select register + MIB.add(MI.getOperand(1)); // Vector select offset + MIB.add(MI.getOperand(2)); // Base + MIB.add(MI.getOperand(1)); // Offset, same as vector select offset + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + MIB.add(MI.getOperand(4)); // zm + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // Slice index register + MIB.add(MI.getOperand(2)); // Slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M)); + MIB.add(MI.getOperand(0)); // Mask + + unsigned Mask = MI.getOperand(0).getImm(); + for (unsigned I = 0; I < 8; I++) { + if (Mask & (1 << I)) + MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine); + } + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -2366,9 +2444,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) - // has implicit def. Add this implicit dead def here as a workaround. 
- MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true, - true, false, true)); + // has implicit def. This def is early-clobber as it will be set at + // the moment of the call and earlier than any use is read. + // Add this implicit dead def here as a workaround. + MI.addOperand(*MI.getMF(), + MachineOperand::CreateReg( + AArch64::LR, /*isDef*/ true, + /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, + /*isUndef*/ false, /*isEarlyClobber*/ true)); LLVM_FALLTHROUGH; case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: @@ -2376,6 +2459,108 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_H: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_S: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_D: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_Q: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_B: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_H: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_S: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_D: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_Q: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); + case AArch64::LDR_ZA_PSEUDO: + return EmitFill(MI, BB); + case AArch64::BFMOPA_MPPZZ_PSEUDO: + return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::BFMOPS_MPPZZ_PSEUDO: + return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPAL_MPPZZ_PSEUDO: + return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPSL_MPPZZ_PSEUDO: + return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::FMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::FMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::FMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::SMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::UMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::UMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SUMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SUMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::USMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::USMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case 
AArch64::SMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::UMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::UMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::USMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::USMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
+ BB);
+ case AArch64::ZERO_M_PSEUDO:
+ return EmitZero(MI, BB);
}
}

@@ -2596,7 +2781,17 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
bool IsSignaling) {
EVT VT = LHS.getValueType();
assert(VT != MVT::f128);
- assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
+
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
+
+ if (VT == MVT::f16 && !FullFP16) {
+ LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
+ {Chain, LHS});
+ RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
+ {LHS.getValue(1), RHS});
+ Chain = RHS.getValue(1);
+ VT = MVT::f32;
+ }

unsigned Opcode = IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}

@@ -2605,8 +2800,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
@@ -2714,8 +2908,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
@@ -3282,40 +3475,68 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
return Op;
}

-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
+// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
+// sets 'C' bit to 0.
+static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
+ SDLoc DL(Value);
+ EVT VT = Value.getValueType();
+ SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
+ SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
+ SDValue Cmp =
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ return Cmp.getValue(1);
+}

- // Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
+// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
+static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
+ bool Invert) {
+ assert(Flag.getResNo() == 1);
+ SDLoc DL(Flag);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ unsigned Cond = Invert ?
AArch64CC::LO : AArch64CC::HS; + SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); +} + +// Value is 1 if 'V' bit of NZCV is 1, else 0 +static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) { + assert(Flag.getResNo() == 1); + SDLoc DL(Flag); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); +} + +// This lowering is inefficient, but it will get cleaned up by +// `foldOverflowCheck` +static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, + bool IsSigned) { + EVT VT0 = Op.getValue(0).getValueType(); + EVT VT1 = Op.getValue(1).getValueType(); + + if (VT0 != MVT::i32 && VT0 != MVT::i64) return SDValue(); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); + bool InvertCarry = Opcode == AArch64ISD::SBCS; + SDValue OpLHS = Op.getOperand(0); + SDValue OpRHS = Op.getOperand(1); + SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: - llvm_unreachable("Invalid code"); - case ISD::ADDC: - Opc = AArch64ISD::ADDS; - break; - case ISD::SUBC: - Opc = AArch64ISD::SUBS; - break; - case ISD::ADDE: - Opc = AArch64ISD::ADCS; - ExtraOp = true; - break; - case ISD::SUBE: - Opc = AArch64ISD::SBCS; - ExtraOp = true; - break; - } + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(VT0, VT1); + + SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, + OpRHS, OpCarryIn); + + SDValue OutFlag = + IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) + : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), - Op.getOperand(2)); + return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -3417,7 +3638,8 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. - EVT InVT = Op.getOperand(0).getValueType(); + bool IsStrict = Op->isStrictFPOpcode(); + EVT InVT = Op.getOperand(IsStrict ? 
1 : 0).getValueType(); EVT VT = Op.getValueType(); if (VT.isScalableVector()) { @@ -3437,6 +3659,12 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); + if (IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); @@ -3446,6 +3674,13 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { SDLoc dl(Op); + if (IsStrict) { + InVT = InVT.changeVectorElementTypeToInteger(); + SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); + } SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); @@ -3457,10 +3692,30 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); + if (IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. + if (NumElts == 1) { + SDLoc dl(Op); + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + // Type changing conversions are illegal. return Op; } @@ -3475,8 +3730,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, // f16 conversions are promoted to f32 when full fp16 is not supported. if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { - assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); + if (IsStrict) { + SDValue Ext = + DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {Op.getOperand(0), SrcVal}); + return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); @@ -3507,7 +3768,7 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, "Saturation width cannot exceed result width"); // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. - // Currently, the `llvm.fpto[su]i.sat.*` instrinsics don't accept scalable + // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable // types, so this is hard to reach. 
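// (The non-native saturating cases below reduce to a conversion at the
// native width plus integer clamping; e.g. for fptosi.sat.i8 from f32,
// with the i8 bounds shown:
//   %cvt = fp_to_sint i32 %x
//   %lo  = smin %cvt, 127
//   %res = smax %lo, -128)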
if (DstVT.isScalableVector()) return SDValue(); @@ -3545,17 +3806,14 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( - APInt::getSignedMaxValue(SatWidth).sextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( - APInt::getSignedMinValue(SatWidth).sextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( - APInt::getAllOnesValue(SatWidth).zextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); } @@ -3604,14 +3862,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( - APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); + APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( - APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); + APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( - APInt::getAllOnesValue(SatWidth).zextOrSelf(DstWidth), DL, DstVT); + APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); } @@ -3623,9 +3881,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. + bool IsStrict = Op->isStrictFPOpcode(); EVT VT = Op.getValueType(); SDLoc dl(Op); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); EVT InVT = In.getValueType(); unsigned Opc = Op.getOpcode(); bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; @@ -3653,6 +3912,13 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); + if (IsStrict) { + In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, + {Op.getOperand(0), In}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, + {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); + } In = DAG.getNode(Opc, dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } @@ -3661,9 +3927,24 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); + if (IsStrict) + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); return DAG.getNode(Opc, dl, VT, In); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. 
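// e.g. (v1f64 (sint_to_fp (v1i64 %v))) becomes an EXTRACT_VECTOR_ELT of
// lane 0 followed by a scalar SCVTF, instead of an unsupported v1i64
// vector conversion.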
+ if (VT.getVectorNumElements() == 1) { + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + In, DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + return Op; } @@ -3676,10 +3957,15 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); // f16 conversions are promoted to f32 when full fp16 is not supported. - if (Op.getValueType() == MVT::f16 && - !Subtarget->hasFullFP16()) { - assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); + if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); + if (IsStrict) { + SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other}, + {Op.getOperand(0), SrcVal}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other}, + {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)}); + } return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), @@ -3742,6 +4028,14 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) + return SDValue(); + if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && "Expected int->fp bitcast!"); @@ -3964,7 +4258,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. @@ -4059,10 +4353,26 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { case AArch64ISD::SETCC_MERGE_ZERO: return Reinterpret; case ISD::INTRINSIC_WO_CHAIN: - if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue) + switch (InOp.getConstantOperandVal(0)) { + case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: return Reinterpret; + } } + // Splat vectors of one will generate ptrue instructions + if (ISD::isConstantSplatVectorAllOnes(InOp.getNode())) + return Reinterpret; + // Otherwise, zero the newly introduced lanes. 
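// e.g. reinterpreting nxv4i1 as nxv16i1 leaves three out of every four
// lanes unspecified, so the result is ANDed with a reinterpreted PTRUE of
// the narrower predicate width to force those lanes to zero.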
SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); SDValue MaskReinterpret = @@ -4073,12 +4383,12 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); + SDLoc DL(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::aarch64_mops_memset_tag: { auto Node = cast(Op.getNode()); - SDLoc DL(Op); SDValue Chain = Node->getChain(); SDValue Dst = Op.getOperand(2); SDValue Val = Op.getOperand(3); @@ -4100,6 +4410,15 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, // changed. return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); } + case Intrinsic::aarch64_sme_get_pstatesm: { + SDValue Chain = Op.getOperand(0); + SDValue MRS = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), + Chain, DAG.getConstant(AArch64SysReg::SVCR, DL, MVT::i64)); + SDValue Mask = DAG.getConstant(/* PSTATE.SM */ 1, DL, MVT::i64); + SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, MRS, Mask); + return DAG.getMergeValues({And, Chain}, DL); + } } } @@ -4196,6 +4515,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sme_cntsb: + return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + case Intrinsic::aarch64_sme_cntsh: { + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One); + } + case Intrinsic::aarch64_sme_cntsw: { + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, + DAG.getConstant(2, dl, MVT::i32)); + } + case Intrinsic::aarch64_sme_cntsd: { + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, + DAG.getConstant(3, dl, MVT::i32)); + } case Intrinsic::aarch64_sve_cnt: { SDValue Data = Op.getOperand(3); // CTPOP only supports integer operands. 
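A quick illustration of the cntsb/cntsh/cntsw/cntsd lowering above: RDSVL #1 returns the streaming vector length (SVL) in bytes, and the element counts are simple right shifts of that value, mirroring the ISD::SRL nodes. A minimal standalone sketch; the 512-bit SVL is an assumed example value, not something the patch fixes:

#include <cassert>
#include <cstdint>

// RDSVL #1 yields the streaming vector length in bytes; each count is that
// value shifted right by log2(element size in bytes), exactly as the
// ISD::SRL nodes in the lowering compute.
static uint64_t cntsb(uint64_t SVLBytes) { return SVLBytes; }
static uint64_t cntsh(uint64_t SVLBytes) { return SVLBytes >> 1; }
static uint64_t cntsw(uint64_t SVLBytes) { return SVLBytes >> 2; }
static uint64_t cntsd(uint64_t SVLBytes) { return SVLBytes >> 3; }

int main() {
  const uint64_t SVLBytes = 64; // assumed 512-bit streaming vector length
  assert(cntsb(SVLBytes) == 64 && cntsh(SVLBytes) == 32);
  assert(cntsw(SVLBytes) == 16 && cntsd(SVLBytes) == 8);
  return 0;
}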
@@ -4300,6 +4639,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_revw: return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revd: + return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtb: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), @@ -4336,7 +4678,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), Op.getOperand(1)); - case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -4382,9 +4723,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, IntNo == Intrinsic::aarch64_neon_shadd); bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || IntNo == Intrinsic::aarch64_neon_urhadd); - unsigned Opcode = - IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); + unsigned Opcode = IsSignedAdd + ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) + : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } @@ -4395,8 +4736,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::aarch64_neon_saddlp: case Intrinsic::aarch64_neon_uaddlp: { - unsigned Opcode = AArch64ISD::UADDLP; + unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp + ? AArch64ISD::UADDLP + : AArch64ISD::SADDLP; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); } case Intrinsic::aarch64_neon_sdot: @@ -4428,19 +4772,26 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { return false; } -bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { - if (VT.getVectorElementType() == MVT::i32 && - VT.getVectorElementCount().getKnownMinValue() >= 4 && - !VT.isFixedLengthVector()) - return true; +bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT, + EVT DataVT) const { + // SVE only supports implicit extension of 32-bit indices. + if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) + return false; - return false; + // Indices cannot be smaller than the main data type. + if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) + return false; + + // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit + // element container type, which would violate the previous clause. 
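+  // (Illustrative.) e.g. nxv2f32 data: the elements occupy 64-bit containers,
+  // so an i32 index would be narrower than the effective element size and the
+  // extend must be kept; nxv4f32 (and any fixed-length type) is fine.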
+  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
 }
 
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   return ExtVal.getValueType().isScalableVector() ||
-         useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
-                                      /*OverrideNEON=*/true);
+         useSVEForFixedLengthVectorVT(
+             ExtVal.getValueType(),
+             /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
 }
 
 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -4466,29 +4817,6 @@ unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
   return AddrModes.find(Key)->second;
 }
 
-unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
-  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
-      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
-       AArch64ISD::SST1_PRED},
-      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
-       AArch64ISD::SST1_UXTW_PRED},
-      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
-       AArch64ISD::SST1_PRED},
-      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
-       AArch64ISD::SST1_SXTW_PRED},
-      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
-       AArch64ISD::SST1_SCALED_PRED},
-      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
-       AArch64ISD::SST1_UXTW_SCALED_PRED},
-      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
-       AArch64ISD::SST1_SCALED_PRED},
-      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
-       AArch64ISD::SST1_SXTW_SCALED_PRED},
-  };
-  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
-  return AddrModes.find(Key)->second;
-}
-
 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
   switch (Opcode) {
   default:
@@ -4511,267 +4839,184 @@ unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
   }
 }
 
-bool getGatherScatterIndexIsExtended(SDValue Index) {
-  unsigned Opcode = Index.getOpcode();
-  if (Opcode == ISD::SIGN_EXTEND_INREG)
-    return true;
+SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
 
-  if (Opcode == ISD::AND) {
-    SDValue Splat = Index.getOperand(1);
-    if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
-      return false;
-    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
-    if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
-      return false;
-    return true;
+  SDLoc DL(Op);
+  SDValue Chain = MGT->getChain();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue Mask = MGT->getMask();
+  SDValue BasePtr = MGT->getBasePtr();
+  SDValue Index = MGT->getIndex();
+  SDValue Scale = MGT->getScale();
+  EVT VT = Op.getValueType();
+  EVT MemVT = MGT->getMemoryVT();
+  ISD::LoadExtType ExtType = MGT->getExtensionType();
+  ISD::MemIndexType IndexType = MGT->getIndexType();
+
+  // SVE supports zero (and so undef) passthrough values only; everything else
+  // must be handled manually by an explicit select on the load's output.
+  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
+    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
+    SDValue Load =
+        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
+                            MGT->getMemOperand(), IndexType, ExtType);
+    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
+    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
+  }
+
+  bool IsScaled = MGT->isIndexScaled();
+  bool IsSigned = MGT->isIndexSigned();
+
+  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
+  // must be calculated beforehand.
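+  // (Illustrative.) e.g. an i32 gather (scalar store size 4) asked to scale
+  // by 8: the index is pre-shifted left by log2(8) below and the gather is
+  // re-emitted with Scale == 1.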
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
+    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
+    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
+
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
+                               MGT->getMemOperand(), IndexType, ExtType);
+  }
+
+  // Lower fixed length gather to a scalable equivalent.
+  if (VT.isFixedLengthVector()) {
+    assert(Subtarget->useSVEForFixedLengthVectors() &&
+           "Cannot lower when not using SVE for fixed vectors!");
+
+    // NOTE: Handle floating-point as if integer then bitcast the result.
+    EVT DataVT = VT.changeVectorElementTypeToInteger();
+    MemVT = MemVT.changeVectorElementTypeToInteger();
+
+    // Find the smallest integer fixed length vector we can use for the gather.
+    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
+    if (DataVT.getVectorElementType() == MVT::i64 ||
+        Index.getValueType().getVectorElementType() == MVT::i64 ||
+        Mask.getValueType().getVectorElementType() == MVT::i64)
+      PromotedVT = VT.changeVectorElementType(MVT::i64);
+
+    // Promote vector operands except for passthrough, which we know is either
+    // undef or zero, and thus best constructed directly.
+    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
+
+    // A promoted result type forces the need for an extending load.
+    if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
+      ExtType = ISD::EXTLOAD;
+
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
+
+    // Convert fixed length vector operands to scalable.
+    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
+    Index = convertToScalableVector(DAG, ContainerVT, Index);
+    Mask = convertFixedMaskToScalableVector(Mask, DAG);
+    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
                                   : DAG.getConstant(0, DL, ContainerVT);
+
+    // Emit equivalent scalable vector gather.
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    SDValue Load =
+        DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
+                            Ops, MGT->getMemOperand(), IndexType, ExtType);
+
+    // Extract fixed length data then convert to the required result type.
+    SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
+    Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
+    if (VT.isFloatingPoint())
+      Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+
+    return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
+  }
 
-  return false;
+  // Everything else is legal.
+  return Op;
 }
 
-// If the base pointer of a masked gather or scatter is null, we
-// may be able to swap BasePtr & Index and use the vector + register
-// or vector + immediate addressing mode, e.g.
-// VECTOR + REGISTER: -// getelementptr nullptr, (splat(%offset)) + %indices) -// -> getelementptr %offset, %indices -// VECTOR + IMMEDIATE: -// getelementptr nullptr, (splat(#x)) + %indices) -// -> getelementptr #x, %indices -void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, - unsigned &Opcode, bool IsGather, - SelectionDAG &DAG) { - if (!isNullConstant(BasePtr)) - return; - - // FIXME: This will not match for fixed vector type codegen as the nodes in - // question will have fixed<->scalable conversions around them. This should be - // moved to a DAG combine or complex pattern so that is executes after all of - // the fixed vector insert and extracts have been removed. This deficiency - // will result in a sub-optimal addressing mode being used, i.e. an ADD not - // being folded into the scatter/gather. - ConstantSDNode *Offset = nullptr; - if (Index.getOpcode() == ISD::ADD) - if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { - if (isa(SplatVal)) - Offset = cast(SplatVal); - else { - BasePtr = SplatVal; - Index = Index->getOperand(0); - return; - } - } - - unsigned NewOp = - IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; - - if (!Offset) { - std::swap(BasePtr, Index); - Opcode = NewOp; - return; - } - - uint64_t OffsetVal = Offset->getZExtValue(); - unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; - auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); - - if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { - // Index is out of range for the immediate addressing mode - BasePtr = ConstOffset; - Index = Index->getOperand(0); - return; - } - - // Immediate is in range - Opcode = NewOp; - BasePtr = Index->getOperand(0); - Index = ConstOffset; -} +SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, + SelectionDAG &DAG) const { + MaskedScatterSDNode *MSC = cast(Op); -SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, - SelectionDAG &DAG) const { SDLoc DL(Op); - MaskedGatherSDNode *MGT = cast(Op); - assert(MGT && "Can only custom lower gather load nodes"); - - bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector(); - - SDValue Index = MGT->getIndex(); - SDValue Chain = MGT->getChain(); - SDValue PassThru = MGT->getPassThru(); - SDValue Mask = MGT->getMask(); - SDValue BasePtr = MGT->getBasePtr(); - ISD::LoadExtType ExtTy = MGT->getExtensionType(); + SDValue Chain = MSC->getChain(); + SDValue StoreVal = MSC->getValue(); + SDValue Mask = MSC->getMask(); + SDValue BasePtr = MSC->getBasePtr(); + SDValue Index = MSC->getIndex(); + SDValue Scale = MSC->getScale(); + EVT VT = StoreVal.getValueType(); + EVT MemVT = MSC->getMemoryVT(); + ISD::MemIndexType IndexType = MSC->getIndexType(); + bool Truncating = MSC->isTruncatingStore(); - ISD::MemIndexType IndexType = MGT->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool IdxNeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; - - EVT VT = PassThru.getSimpleValueType(); - EVT IndexVT = Index.getSimpleValueType(); - EVT MemVT = MGT->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); + bool IsScaled = MSC->isIndexScaled(); + bool IsSigned = MSC->isIndexSigned(); - if (VT.getVectorElementType() == MVT::bf16 && - 
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
-    return SDValue();
 
+  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
+  // must be calculated beforehand.
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
+    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
+    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
 
-  if (IsFixedLength) {
-    assert(Subtarget->useSVEForFixedLengthVectors() &&
-           "Cannot lower when not using SVE for fixed vectors");
-    if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
-      IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
-      MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
-    } else {
-      MemVT = getContainerForFixedLengthVector(DAG, MemVT);
-      IndexVT = MemVT.changeTypeToInteger();
-    }
-    InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
-    Mask = DAG.getNode(
-        ISD::SIGN_EXTEND, DL,
-        VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
+    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
+                                MSC->getMemOperand(), IndexType, Truncating);
   }
 
-  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
-    PassThru = SDValue();
+  // Lower fixed length scatter to a scalable equivalent.
+  if (VT.isFixedLengthVector()) {
+    assert(Subtarget->useSVEForFixedLengthVectors() &&
+           "Cannot lower when not using SVE for fixed vectors!");
 
-  if (VT.isFloatingPoint() && !IsFixedLength) {
-    // Handle FP data by using an integer gather and casting the result.
-    if (PassThru) {
-      EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
-      PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
+    // Once bitcast we treat floating-point scatters as if integer.
+    if (VT.isFloatingPoint()) {
+      VT = VT.changeVectorElementTypeToInteger();
+      MemVT = MemVT.changeVectorElementTypeToInteger();
+      StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
     }
-    InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
-  }
 
-  SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
+    // Find the smallest integer fixed length vector we can use for the scatter.
+    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
+    if (VT.getVectorElementType() == MVT::i64 ||
+        Index.getValueType().getVectorElementType() == MVT::i64 ||
+        Mask.getValueType().getVectorElementType() == MVT::i64)
+      PromotedVT = VT.changeVectorElementType(MVT::i64);
 
-  if (getGatherScatterIndexIsExtended(Index))
-    Index = Index.getOperand(0);
+    // Promote vector operands.
+    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
+    StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
 
-  unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
-  selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
-                              /*isGather=*/true, DAG);
+    // A promoted value type forces the need for a truncating store.
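+    // (Illustrative.) e.g. v8i8 data promoted to a v8i32 working type must
+    // still be written out as i8 elements, i.e. a truncating scatter.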
+ if (PromotedVT != VT) + Truncating = true; - if (ResNeedsSignExtend) - Opcode = getSignExtendedGatherOpcode(Opcode); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); - if (IsFixedLength) { - if (Index.getSimpleValueType().isFixedLengthVector()) - Index = convertToScalableVector(DAG, IndexVT, Index); - if (BasePtr.getSimpleValueType().isFixedLengthVector()) - BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); + // Convert fixed length vector operands to scalable. + MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); + Index = convertToScalableVector(DAG, ContainerVT, Index); Mask = convertFixedMaskToScalableVector(Mask, DAG); - } - - SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT}; - SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops); - Chain = Result.getValue(1); - - if (IsFixedLength) { - Result = convertFromScalableVector( - DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()), - Result); - Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result); - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - - if (PassThru) - Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru); - } else { - if (PassThru) - Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru); - - if (VT.isFloatingPoint()) - Result = getSVESafeBitCast(VT, Result, DAG); - } - - return DAG.getMergeValues({Result, Chain}, DL); -} - -SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - MaskedScatterSDNode *MSC = cast(Op); - assert(MSC && "Can only custom lower scatter store nodes"); - - bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector(); - - SDValue Index = MSC->getIndex(); - SDValue Chain = MSC->getChain(); - SDValue StoreVal = MSC->getValue(); - SDValue Mask = MSC->getMask(); - SDValue BasePtr = MSC->getBasePtr(); - - ISD::MemIndexType IndexType = MSC->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool NeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - - EVT VT = StoreVal.getSimpleValueType(); - EVT IndexVT = Index.getSimpleValueType(); - SDVTList VTs = DAG.getVTList(MVT::Other); - EVT MemVT = MSC->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); - - if (VT.getVectorElementType() == MVT::bf16 && - !static_cast(DAG.getSubtarget()).hasBF16()) - return SDValue(); + StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); - if (IsFixedLength) { - assert(Subtarget->useSVEForFixedLengthVectors() && - "Cannot lower when not using SVE for fixed vectors"); - if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { - IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); - MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); - } else { - MemVT = getContainerForFixedLengthVector(DAG, MemVT); - IndexVT = MemVT.changeTypeToInteger(); - } - InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); - - StoreVal = - DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal); - StoreVal = DAG.getNode( - ISD::ANY_EXTEND, DL, - VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); - StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); - Mask = DAG.getNode( - ISD::SIGN_EXTEND, DL, - VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); - } else if 
(VT.isFloatingPoint()) { - // Handle FP data by casting the data so an integer scatter can be used. - EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); - StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG); - InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); - } - - if (getGatherScatterIndexIsExtended(Index)) - Index = Index.getOperand(0); - - unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend); - selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, - /*isGather=*/false, DAG); - - if (IsFixedLength) { - if (Index.getSimpleValueType().isFixedLengthVector()) - Index = convertToScalableVector(DAG, IndexVT, Index); - if (BasePtr.getSimpleValueType().isFixedLengthVector()) - BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); - Mask = convertFixedMaskToScalableVector(Mask, DAG); + // Emit equivalent scalable vector scatter. + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, + MSC->getMemOperand(), IndexType, Truncating); } - SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; - return DAG.getNode(Opcode, DL, VTs, Ops); + // Everything else is legal. + return Op; } SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -4780,7 +5025,9 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { assert(LoadNode && "Expected custom lowering of a masked load node"); EVT VT = Op->getValueType(0); - if (useSVEForFixedLengthVectorVT(VT, true)) + if (useSVEForFixedLengthVectorVT( + VT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerFixedLengthVectorMLoadToSVE(Op, DAG); SDValue PassThru = LoadNode->getPassThru(); @@ -4847,7 +5094,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT MemVT = StoreNode->getMemoryVT(); if (VT.isVector()) { - if (useSVEForFixedLengthVectorVT(VT, true)) + if (useSVEForFixedLengthVectorVT( + VT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerFixedLengthVectorStoreToSVE(Op, DAG); unsigned AS = StoreNode->getAddressSpace(); @@ -5007,6 +5256,22 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { Cmp.getValue(1)); } +static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + + AArch64CC::CondCode CC; + if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { + SDLoc dl(Op); + SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -5026,6 +5291,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); + case ISD::BRCOND: + return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: @@ -5046,11 +5313,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADDCARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/); + case ISD::SUBCARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false 
/*unsigned*/); + case ISD::SADDO_CARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/); + case ISD::SSUBO_CARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -5165,11 +5435,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::MUL: return LowerMUL(Op, DAG); case ISD::MULHS: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED); case ISD::MULHU: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: @@ -5234,11 +5502,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFixedLengthVectorLoadToSVE(Op, DAG); return LowerLOAD(Op, DAG); case ISD::ADD: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); case ISD::AND: - return LowerToScalableOp(Op, DAG); case ISD::SUB: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); + return LowerToScalableOp(Op, DAG); case ISD::FMAXIMUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); case ISD::FMAXNUM: @@ -5260,12 +5526,23 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::BSWAP: return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); case ISD::CTLZ: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::VECTOR_SPLICE: return LowerVECTOR_SPLICE(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert(Op.getOperand(1).getValueType() == MVT::f16 && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } @@ -5275,10 +5552,7 @@ bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( EVT VT, bool OverrideNEON) const { - if (!Subtarget->useSVEForFixedLengthVectors()) - return false; - - if (!VT.isFixedLengthVector()) + if (!VT.isFixedLengthVector() || !VT.isSimple()) return false; // Don't use SVE for vectors we cannot scalarize if required. @@ -5300,12 +5574,16 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( // All SVE implementations support NEON sized vectors. if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) - return true; + return Subtarget->hasSVE(); // Ensure NEON MVTs only belong to a single register class. if (VT.getFixedSizeInBits() <= 128) return false; + // Ensure wider than NEON code generation is enabled. + if (!Subtarget->useSVEForFixedLengthVectors()) + return false; + // Don't use SVE for types that don't fit. 
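   // (Illustrative.) e.g. with a minimum configured SVE width of 256 bits, a
   // 512-bit v16i32 fails the check below and is left to generic splitting.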
   if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
     return false;
 
@@ -5322,6 +5600,36 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
 // Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
+static unsigned getIntrinsicID(const SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  default:
+    return Intrinsic::not_intrinsic;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return IID;
+    return Intrinsic::not_intrinsic;
+  }
+  }
+}
+
+bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+                                                SDValue N1) const {
+  if (!N0.hasOneUse())
+    return false;
+
+  unsigned IID = getIntrinsicID(N1.getNode());
+  // Avoid reassociating expressions that can be lowered to smlal/umlal.
+  if (IID == Intrinsic::aarch64_neon_umull ||
+      N1.getOpcode() == AArch64ISD::UMULL ||
+      IID == Intrinsic::aarch64_neon_smull ||
+      N1.getOpcode() == AArch64ISD::SMULL)
+    return N0.getOpcode() != ISD::ADD;
+
+  return true;
+}
+
 /// Selects the correct CCAssignFn for a given CallingConvention value.
 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                      bool IsVarArg) const {
@@ -5368,8 +5676,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  const Function &F = MF.getFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
+  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
+                DAG.getTargetLoweringInfo(), MF.getDataLayout());
+  if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
+    FuncInfo->setIsSVECC(true);
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -5383,7 +5699,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
   // LocVT.
   unsigned NumArgs = Ins.size();
-  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
+  Function::const_arg_iterator CurOrigArg = F.arg_begin();
   unsigned CurArgIdx = 0;
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
@@ -5454,11 +5770,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
         RC = &AArch64::FPR128RegClass;
       else if (RegVT.isScalableVector() &&
-               RegVT.getVectorElementType() == MVT::i1)
+               RegVT.getVectorElementType() == MVT::i1) {
+        FuncInfo->setIsSVECC(true);
         RC = &AArch64::PPRRegClass;
-      else if (RegVT.isScalableVector())
+      } else if (RegVT.isScalableVector()) {
+        FuncInfo->setIsSVECC(true);
        RC = &AArch64::ZPRRegClass;
-      else
+      } else
        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
       // Transform the arguments in physical registers into virtual ones.
@@ -5580,7 +5898,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       // i1 arguments are zero-extended to i8 by the caller. Emit a
       // hint to reflect this.
       if (Ins[i].isOrigArg()) {
-        Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex());
+        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
         if (OrigArg->getType()->isIntegerTy(1)) {
           if (!Ins[i].Flags.isZExt()) {
             ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
@@ -5595,7 +5913,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
-  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   if (isVarArg) {
     if (!Subtarget->isTargetDarwin() || IsWin64) {
       // The AAPCS variadic function ABI is identical to the non-variadic
@@ -5843,14 +6160,62 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
   }
 }
 
+static void analyzeCallOperands(const AArch64TargetLowering &TLI,
+                                const AArch64Subtarget *Subtarget,
+                                const TargetLowering::CallLoweringInfo &CLI,
+                                CCState &CCInfo) {
+  const SelectionDAG &DAG = CLI.DAG;
+  CallingConv::ID CalleeCC = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+
+  unsigned NumArgs = Outs.size();
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ArgVT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+
+    bool UseVarArgCC = false;
+    if (IsVarArg) {
+      // On Windows, the fixed arguments in a vararg call are passed in GPRs
+      // too, so use the vararg CC to force them to integer registers.
+      if (IsCalleeWin64) {
+        UseVarArgCC = true;
+      } else {
+        UseVarArgCC = !Outs[i].IsFixed;
+      }
+    } else {
+      // Get type of the original argument.
+      EVT ActualVT =
+          TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
+                           /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ArgVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ArgVT = MVT::i16;
+    }
+
+    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
+    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+}
+
 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
-    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+    const CallLoweringInfo &CLI) const {
+  CallingConv::ID CalleeCC = CLI.CallConv;
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
+  SDValue Callee = CLI.Callee;
+  bool IsVarArg = CLI.IsVarArg;
+  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  const SelectionDAG &DAG = CLI.DAG;
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -5860,7 +6225,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // The check for matching callee-saved regs will determine whether it is
   // eligible for TCO.
   if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
-      AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
+      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
     CallerCC = CallingConv::AArch64_SVE_VectorCall;
 
   bool CCMatch = CallerCC == CalleeCC;
@@ -5915,30 +6280,14 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // I want anyone implementing a new calling convention to think long and hard
   // about this assert.
-  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
          "Unexpected variadic calling convention");
 
   LLVMContext &C = *DAG.getContext();
-  if (isVarArg && !Outs.empty()) {
-    // At least two cases here: if caller is fastcc then we can't have any
-    // memory arguments (we'd be expected to clean up the stack afterwards). If
-    // caller is C then we could potentially use its argument area.
-
-    // FIXME: for now we take the most conservative of these in both cases:
-    // disallow all variadic memory operands.
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
-
-    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
-    for (const CCValAssign &ArgLoc : ArgLocs)
-      if (!ArgLoc.isRegLoc())
-        return false;
-  }
-
   // Check that the call results are passed in the same way.
   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
-                                  CCAssignFnForCall(CalleeCC, isVarArg),
-                                  CCAssignFnForCall(CallerCC, isVarArg)))
+                                  CCAssignFnForCall(CalleeCC, IsVarArg),
+                                  CCAssignFnForCall(CallerCC, IsVarArg)))
     return false;
   // The callee has to preserve all registers the caller needs to preserve.
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -5958,9 +6307,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     return true;
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
+
+  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
+
+  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
+    // When we are musttail, additional checks have already been done and we
+    // can safely skip this check.
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards). If
+    // caller is C then we could potentially use its argument area.
 
-  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
+    for (const CCValAssign &ArgLoc : ArgLocs)
+      if (!ArgLoc.isRegLoc())
+        return false;
+  }
 
   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
@@ -6051,7 +6413,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
   bool &IsTailCall = CLI.IsTailCall;
-  CallingConv::ID CallConv = CLI.CallConv;
+  CallingConv::ID &CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
 
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   bool IsSibCall = false;
-  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
+  bool GuardWithBTI = false;
+
+  if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
+      !Subtarget->noBTIAtReturnTwice()) {
+    GuardWithBTI = FuncInfo->branchTargetEnforcement();
+  }
 
   // Check callee args/returns for SVE registers and set calling convention
   // accordingly.
@@ -6079,8 +6446,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (IsTailCall) {
     // Check if it's really possible to do a tail call.
- IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); + IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: @@ -6101,56 +6467,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - if (!Outs[i].IsFixed && ArgVT.isScalableVector()) + if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); - - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - bool UseVarArgCC = !Outs[i].IsFixed; - // On Windows, the fixed arguments in a vararg call are passed in GPRs - // too, so use the vararg CC to force them to integer registers. - if (IsCalleeWin64) - UseVarArgCC = true; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(DAG.getDataLayout(), - CLI.getArgs()[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - ValVT = MVT::i8; - else if (ActualMVT == MVT::i16) - ValVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; } } + analyzeCallOperands(*this, Subtarget, CLI, CCInfo); + // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -6536,7 +6863,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); Ops.insert(Ops.begin() + 1, GA); - } + } else if (GuardWithBTI) + CallOpc = AArch64ISD::CALL_BTI; // Returns a chain and a flag for retval copy to use. 
Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); @@ -7313,103 +7641,88 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->hasNEON()) + return SDValue(); + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (VT.isScalableVector()) { - if (VT != SrcVT) - return SDValue(); + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); - // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK) - // - // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU; - // maybe useful for copysign operations with mismatched VTs. - // - // IntVT here is chosen so it's a legal type with the same element width - // as the input. - EVT IntVT = + if (VT.isScalableVector()) + IntVT = getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); - unsigned NumBits = VT.getScalarSizeInBits(); - SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT); - SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT); - SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask, - getSVESafeBitCast(IntVT, In2, DAG)); - SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask, - getSVESafeBitCast(IntVT, In1, DAG)); - SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude); - return getSVESafeBitCast(VT, IntResult, DAG); - } - if (!Subtarget->hasNEON()) + if (VT != In2.getValueType()) return SDValue(); - if (SrcVT.bitsLT(VT)) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT.bitsGT(VT)) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); + auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { + if (VT.isScalableVector()) + return getSVESafeBitCast(VT, Op, DAG); - EVT VecVT; - uint64_t EltMask; - SDValue VecVal1, VecVal2; + return DAG.getBitcast(VT, Op); + }; - auto setVecVal = [&] (int Idx) { + SDValue VecVal1, VecVal2; + EVT VecVT; + auto SetVecVal = [&](int Idx = -1) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In2); + VecVal1 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); + VecVal2 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + VecVal1 = BitCast(VecVT, In1, DAG); + VecVal2 = BitCast(VecVT, In2, DAG); } }; - - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; - setVecVal(AArch64::ssub); - } else if (VT == MVT::f64 || VT == MVT::v2f64) { + if (VT.isVector()) { + VecVT = IntVT; + SetVecVal(); + } else if (VT == MVT::f64) { VecVT = MVT::v2i64; - - // We want to materialize a mask with the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = 0; - - setVecVal(AArch64::dsub); - } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { - VecVT = (VT == MVT::v4f16 ? 
MVT::v4i16 : MVT::v8i16); - EltMask = 0x8000ULL; - setVecVal(AArch64::hsub); + SetVecVal(AArch64::dsub); + } else if (VT == MVT::f32) { + VecVT = MVT::v4i32; + SetVecVal(AArch64::ssub); + } else if (VT == MVT::f16) { + VecVT = MVT::v8i16; + SetVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } - SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); + unsigned BitWidth = In1.getScalarValueSizeInBits(); + SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. + // We want to materialize a mask with every bit but the high bit set, but the + // AdvSIMD immediate moves cannot materialize that in a single instruction for + // 64-bit elements. Instead, materialize all bits set and then negate that. if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); } - SDValue Sel = - DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - + SDValue BSP = + DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); if (VT == MVT::f16) - return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); + if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); + + return BitCast(VT, BSP, DAG); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { @@ -7485,7 +7798,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); + useSVEForFixedLengthVectorVT( + VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); SDLoc DL(Op); SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); @@ -7517,22 +7831,19 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, } if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { + useSVEForFixedLengthVectorVT( + VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); case ISD::SMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); case ISD::SMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); case ISD::UMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); case 
ISD::UMIN:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
     }
   }
 
@@ -7547,9 +7858,9 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
   EVT VT = Op.getValueType();
 
   if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
-                               true);
+      useSVEForFixedLengthVectorVT(
+          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
 
   SDLoc DL(Op);
   SDValue REVB;
@@ -8990,12 +9301,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     if (V.isUndef())
       continue;
     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-             !isa<ConstantSDNode>(V.getOperand(1))) {
+             !isa<ConstantSDNode>(V.getOperand(1)) ||
+             V.getOperand(0).getValueType().isScalableVector()) {
       LLVM_DEBUG(
          dbgs() << "Reshuffle failed: "
                    "a shuffle can only come from building a vector from "
-                   "various elements of other vectors, provided their "
-                   "indices are constant\n");
+                   "various elements of other fixed-width vectors, provided "
+                   "their indices are constant\n");
       return SDValue();
     }
 
@@ -9011,10 +9323,72 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     Source->MaxElt = std::max(Source->MaxElt, EltNo);
   }
 
+  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
+  // better than moving to/from gpr registers for larger vectors.
+  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
+    // Construct a mask for the tbl. We may need to adjust the index for types
+    // larger than i8.
+    SmallVector<int, 16> Mask;
+    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
+    for (unsigned I = 0; I < NumElts; ++I) {
+      SDValue V = Op.getOperand(I);
+      if (V.isUndef()) {
+        for (unsigned OF = 0; OF < OutputFactor; OF++)
+          Mask.push_back(-1);
+        continue;
+      }
+      // Set the Mask lanes adjusted for the size of the input and output
+      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
+      // output element, adjusted in their positions per input and output types.
+      unsigned Lane = V.getConstantOperandVal(1);
+      for (unsigned S = 0; S < Sources.size(); S++) {
+        if (V.getOperand(0) == Sources[S].Vec) {
+          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
+          unsigned InputBase = 16 * S + Lane * InputSize / 8;
+          for (unsigned OF = 0; OF < OutputFactor; OF++)
+            Mask.push_back(InputBase + OF);
+          break;
+        }
+      }
+    }
+
+    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
+    // v16i8, and the TBLMask.
+    SmallVector<SDValue, 16> TBLOperands;
+    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
+                                              ? Intrinsic::aarch64_neon_tbl3
+                                              : Intrinsic::aarch64_neon_tbl4,
+                                          dl, MVT::i32));
+    for (unsigned i = 0; i < Sources.size(); i++) {
+      SDValue Src = Sources[i].Vec;
+      EVT SrcVT = Src.getValueType();
+      Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
+      assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
+             "Expected a legally typed vector");
+      if (SrcVT.is64BitVector())
+        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
+                          DAG.getUNDEF(MVT::v8i8));
+      TBLOperands.push_back(Src);
+    }
+
+    SmallVector<SDValue, 16> TBLMask;
+    for (unsigned i = 0; i < Mask.size(); i++)
+      TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
+    assert((Mask.size() == 8 || Mask.size() == 16) &&
+           "Expected a v8i8 or v16i8 Mask");
+    TBLOperands.push_back(DAG.getBuildVector(
+        Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
+
+    SDValue Shuffle =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+                    Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
+    return DAG.getBitcast(VT, Shuffle);
+  }
+
   if (Sources.size() > 2) {
-    LLVM_DEBUG(
-        dbgs() << "Reshuffle failed: currently only do something sane when at "
-                  "most two source vectors are involved\n");
+    LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
                      << "sensible when at most two source vectors are "
                      << "involved\n");
     return SDValue();
  }
 
@@ -9039,8 +9413,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
 
   for (auto &Src : Sources) {
     EVT SrcVT = Src.ShuffleVec.getValueType();
-    uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
-    if (SrcVTSize == VTSize)
+    TypeSize SrcVTSize = SrcVT.getSizeInBits();
+    if (SrcVTSize == TypeSize::Fixed(VTSize))
       continue;
 
     // This stage of the search produces a source with the same element type as
@@ -9049,7 +9423,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
 
-    if (SrcVTSize < VTSize) {
+    if (SrcVTSize.getFixedValue() < VTSize) {
       assert(2 * SrcVTSize == VTSize);
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
@@ -9059,7 +9433,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
       continue;
     }
 
-    if (SrcVTSize != 2 * VTSize) {
+    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: result vector too small to extract\n");
      return SDValue();
@@ -9205,6 +9579,56 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   return true;
 }
 
+// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+// v4i32s. This is really a truncate, which we can construct out of (legal)
+// concats and truncate nodes.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+  if (V.getValueType() != MVT::v16i8)
+    return SDValue();
+  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
+
+  for (unsigned X = 0; X < 4; X++) {
+    // Check the first item in each group is an extract from lane 0 of a v4i32
+    // or v4i16.
+    SDValue BaseExt = V.getOperand(X * 4);
+    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
+         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
+        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
+        BaseExt.getConstantOperandVal(1) != 0)
+      return SDValue();
+    SDValue Base = BaseExt.getOperand(0);
+    // And check the other items are extracts from the same vector.
+    for (unsigned Y = 1; Y < 4; Y++) {
+      SDValue Ext = V.getOperand(X * 4 + Y);
+      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+          Ext.getOperand(0) != Base ||
+          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+          Ext.getConstantOperandVal(1) != Y)
+        return SDValue();
+    }
+  }
+
+  // Turn the buildvector into a series of truncates and concats, which will
+  // become uzp1's. Any v4i32s we found get truncated to v4i16, which are
+  // concatenated together to produce 2 v8i16. These are both truncated and
+  // concatenated together.
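+  // (Illustrative.) For v4i32 inputs a,b,c,d the result is
+  //   t0 = concat(trunc(a):v4i16, trunc(b):v4i16)   ; v8i16
+  //   t1 = concat(trunc(c):v4i16, trunc(d):v4i16)   ; v8i16
+  //   r  = concat(trunc(t0):v8i8, trunc(t1):v8i8)   ; v16i8
+  // where each trunc keeps the low half of every lane, i.e. a uzp1.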
+  SDLoc DL(V);
+  SDValue Trunc[4] = {
+      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
+      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
+  for (int I = 0; I < 4; I++)
+    if (Trunc[I].getValueType() == MVT::v4i32)
+      Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]);
+  SDValue Concat0 =
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
+  SDValue Concat1 =
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
+  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
+  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+}
+
 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
 /// element width than the vector lane type. If that is the case the function
 /// returns true and writes the value of the DUP instruction lane operand into
@@ -9534,8 +9958,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
-/// the specified operations to build the shuffle.
-static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+/// the specified operations to build the shuffle. ID is the perfect-shuffle
+/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
+/// table entry and LHS/RHS are the immediate inputs for this stage of the
+/// shuffle.
+static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
+                                      SDValue V2, unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
                                       const SDLoc &dl) {
   unsigned OpNum = (PFEntry >> 26) & 0x0F;
@@ -9552,12 +9980,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
     OP_VEXT1,
     OP_VEXT2,
     OP_VEXT3,
-    OP_VUZPL, // VUZP, left result
-    OP_VUZPR, // VUZP, right result
-    OP_VZIPL, // VZIP, left result
-    OP_VZIPR, // VZIP, right result
-    OP_VTRNL, // VTRN, left result
-    OP_VTRNR  // VTRN, right result
+    OP_VUZPL,  // VUZP, left result
+    OP_VUZPR,  // VUZP, right result
+    OP_VZIPL,  // VZIP, left result
+    OP_VZIPR,  // VZIP, right result
+    OP_VTRNL,  // VTRN, left result
+    OP_VTRNR,  // VTRN, right result
+    OP_MOVLANE // Move lane. RHSID is the lane to move into
   };
 
   if (OpNum == OP_COPY) {
@@ -9567,9 +9996,71 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
     return RHS;
   }
 
+  if (OpNum == OP_MOVLANE) {
+    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+      Elt = 3 - Elt;
+      while (Elt > 0) {
+        ID /= 9;
+        Elt--;
+      }
+      return (ID % 9 == 8) ? -1 : ID % 9;
+    };
+
+    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
+    // get the lane to move from via the PFID, which is always from the
+    // original vectors (V1 or V2).
+    SDValue OpLHS = GeneratePerfectShuffle(
+        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+    EVT VT = OpLHS.getValueType();
+    assert(RHSID < 8 && "Expected a lane index for RHSID!");
+    unsigned ExtLane = 0;
+    SDValue Input;
+
+    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+    // convert into a higher type.
+    if (RHSID & 0x4) {
+      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+      if (MaskElt == -1)
+        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
MaskElt : (MaskElt - 2); + Input = MaskElt < 2 ? V1 : V2; + if (VT.getScalarSizeInBits() == 16) { + Input = DAG.getBitcast(MVT::v2f32, Input); + OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS); + } else { + assert(VT.getScalarSizeInBits() == 32 && + "Expected 16 or 32 bit shuffle elemements"); + Input = DAG.getBitcast(MVT::v2f64, Input); + OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS); + } + } else { + int MaskElt = getPFIDLane(ID, RHSID); + assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); + ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); + Input = MaskElt < 4 ? V1 : V2; + // Be careful about creating illegal types. Use f16 instead of i16. + if (VT == MVT::v4i16) { + Input = DAG.getBitcast(MVT::v4f16, Input); + OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS); + } + } + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + Input.getValueType().getVectorElementType(), + Input, DAG.getVectorIdxConstant(ExtLane, dl)); + SDValue Ins = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS, + Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl)); + return DAG.getBitcast(VT, Ins); + } + SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, + RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, + RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { @@ -9648,14 +10139,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - SmallVector TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); - } + bool Swap = false; + if (V1.isUndef() || isZerosVector(V1.getNode())) { + std::swap(V1, V2); + Swap = true; } + // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill + // out of range values with 0s. We do need to make sure that any out-of-range + // values are really out-of-range for a v16i8 vector. + bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueSizeInBits() == 128) { @@ -9663,11 +10156,23 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, IndexLen = 16; } + SmallVector TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + if (Swap) + Offset = Offset < IndexLen ? 
Offset + IndexLen : Offset - IndexLen; + if (IsUndefOrZero && Offset >= IndexLen) + Offset = 255; + TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); + } + } + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->isUndef()) { + if (IsUndefOrZero) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( @@ -9732,6 +10237,10 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, if (ExtIdxInBits % CastedEltBitWidth != 0) return false; + // Can't handle cases where vector size is not 128-bit + if (!Extract.getOperand(0).getValueType().is128BitVector()) + return false; + // Update the lane value by offsetting with the scaled extract index. LaneC += ExtIdxInBits / CastedEltBitWidth; @@ -10014,10 +10523,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG, + dl); } return GenerateTBL(Op, ShuffleMask, DAG); @@ -10025,56 +10532,33 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); EVT VT = Op.getValueType(); - EVT ElemVT = VT.getScalarType(); - SDValue SplatVal = Op.getOperand(0); if (useSVEForFixedLengthVectorVT(VT)) return LowerToScalableOp(Op, DAG); - // Extend input splat value where needed to fit into a GPR (32b or 64b only) - // FPRs don't have this restriction. - switch (ElemVT.getSimpleVT().SimpleTy) { - case MVT::i1: { - // The only legal i1 vectors are SVE vectors, so we can use SVE-specific - // lowering code. - if (auto *ConstVal = dyn_cast(SplatVal)) { - // We can hande the zero case during isel. - if (ConstVal->isZero()) - return Op; - if (ConstVal->isOne()) - return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); - } - // The general case of i1. There isn't any natural way to do this, - // so we use some trickery with whilelo. - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, - DAG.getValueType(MVT::i1)); - SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, - MVT::i64); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, - DAG.getConstant(0, dl, MVT::i64), SplatVal); - } - case MVT::i8: - case MVT::i16: - case MVT::i32: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - break; - case MVT::i64: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - break; - case MVT::f16: - case MVT::bf16: - case MVT::f32: - case MVT::f64: - // Fine as is - break; - default: - report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); - } + assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && + "Unexpected vector type!"); + + // We can handle the constant cases during isel. + if (isa(Op.getOperand(0))) + return Op; - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); + // There isn't a natural way to handle the general i1 case, so we use some + // trickery with whilelo. 
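Why the whilelo trick below yields a correct i1 splat can be modelled with plain integers. This sketch is an illustration only and assumes the usual WHILELO semantics, namely that lane i is active iff base + i < limit as an unsigned compare; the helper name is invented for this note. Sign-extending the single i1 bit to 64 bits produces either 0 or all-ones, which makes every lane, or no lane, active:

#include <cassert>
#include <cstdint>
#include <vector>

// Model of WHILELO: lane I is active iff Base + I < Limit (unsigned).
static std::vector<bool> whilelo(uint64_t Base, uint64_t Limit,
                                 unsigned NumLanes) {
  std::vector<bool> Pred(NumLanes);
  for (unsigned I = 0; I < NumLanes; ++I)
    Pred[I] = Base + I < Limit;
  return Pred;
}

int main() {
  const unsigned Lanes = 8; // e.g. nxv8i1 with vscale == 1
  for (bool Splat : {false, true}) {
    // Sign-extend the i1 splat value: 0 -> 0, 1 -> all-ones.
    uint64_t V = Splat ? ~UINT64_C(0) : 0;
    std::vector<bool> P = whilelo(0, V, Lanes);
    for (bool B : P)
      assert(B == Splat); // every lane equals the splatted bit
  }
  return 0;
}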
+  SDLoc DL(Op);
+  SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
+  SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
+                         DAG.getValueType(MVT::i1));
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  if (VT == MVT::nxv1i1)
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
+                       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
+                                   Zero, SplatVal),
+                       Zero);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
 }

 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
@@ -10090,18 +10574,17 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
     return SDValue();

   // The DUPQ operation is independent of element type so normalise to i64s.
-  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
   SDValue Idx128 = Op.getOperand(2);

   // DUPQ can be used when idx is in range.
   auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
   if (CIdx && (CIdx->getZExtValue() <= 3)) {
     SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
-    SDNode *DUPQ =
-        DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
-    return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+    return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
   }

+  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+
   // The ACLE says this must produce the same result as:
   //   svtbl(data, svadd_x(svptrue_b64(),
   //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
@@ -10358,20 +10841,6 @@ static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
   return true;
 }

-static unsigned getIntrinsicID(const SDNode *N) {
-  unsigned Opcode = N->getOpcode();
-  switch (Opcode) {
-  default:
-    return Intrinsic::not_intrinsic;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-    if (IID < Intrinsic::num_intrinsics)
-      return IID;
-    return Intrinsic::not_intrinsic;
-  }
-  }
-}
-
 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
@@ -10822,6 +11291,12 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
     return SDValue();
   }

+  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+  // v4i32s. This is really a truncate, which we can construct out of (legal)
+  // concats and truncate nodes.
+  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+    return M;
+
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
@@ -11121,29 +11596,36 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
   if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
     return SDValue();

-  EVT WideVT;
-  SDValue ExtVec;
+  // Here narrow and wide refer to the vector element types. After "casting"
+  // both vectors must have the same bit length, and so, because the subvector
+  // has fewer elements, those elements need to be bigger.
+  EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
+  EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());

+  // NOP cast operands to the largest legal vector of the same element count.
   if (VT.isFloatingPoint()) {
-    // The InVT type should be legal. We can safely cast the unpacked
-    // subvector from InVT -> VT.
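The UUNPK/UZP1 sequence built in the INSERT_SUBVECTOR hunk that continues below can be modelled at the value level. In the sketch that follows (an illustration under the stated assumptions, not LLVM code; all helper names are invented) uunpklo/uunpkhi widen one half of a vector, and uzp1 keeps the even-indexed narrow elements of the concatenation, which for little-endian lanes amounts to a truncating concatenation. Replacing either half of a vector then falls out directly:

#include <cassert>
#include <cstdint>
#include <vector>

using V16 = std::vector<uint16_t>;
using V32 = std::vector<uint32_t>;

// Models of the SVE unpacks: widen the low or high half of the vector.
static V32 uunpklo(const V16 &V) {
  return V32(V.begin(), V.begin() + V.size() / 2);
}
static V32 uunpkhi(const V16 &V) {
  return V32(V.begin() + V.size() / 2, V.end());
}
// UZP1 keeps the even-indexed narrow elements of the concatenation; viewed on
// wide lanes (little endian) that is a truncating concatenation.
static V16 uzp1(const V32 &A, const V32 &B) {
  V16 R;
  for (uint32_t E : A)
    R.push_back(static_cast<uint16_t>(E));
  for (uint32_t E : B)
    R.push_back(static_cast<uint16_t>(E));
  return R;
}

int main() {
  V16 Vec0 = {0, 1, 2, 3, 4, 5, 6, 7}; // vector being updated
  V32 Vec1 = {100, 101, 102, 103};     // subvector, already widened

  // Insert at index 0: the lower half is replaced, the upper half preserved.
  V16 InsertLo = uzp1(Vec1, uunpkhi(Vec0));
  assert((InsertLo == V16{100, 101, 102, 103, 4, 5, 6, 7}));

  // Insert at the midpoint: lower half preserved, upper half replaced.
  V16 InsertHi = uzp1(uunpklo(Vec0), Vec1);
  assert((InsertHi == V16{0, 1, 2, 3, 100, 101, 102, 103}));
  return 0;
}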
- WideVT = VT; - ExtVec = getSVESafeBitCast(VT, Vec1, DAG); + Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); + Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); } else { - // Extend elements of smaller vector... - WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); - ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + // Legal integer vectors are already their largest so Vec0 is fine as is. + Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); } + // To replace the top/bottom half of vector V with vector SubV we widen the + // preserved half of V, concatenate this to SubV (the order depending on the + // half being replaced) and then narrow the result. + SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); - } else if (Idx == InVT.getVectorMinNumElements()) { + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); + } else { + assert(Idx == InVT.getVectorMinNumElements() && + "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } - return SDValue(); + return getSVESafeBitCast(VT, Narrow, DAG); } if (Idx == 0 && isPackedVectorType(VT, DAG)) { @@ -11249,21 +11731,8 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) + unsigned Cost = getPerfectShuffleCost(M); + if (Cost <= 1) return true; } @@ -11360,9 +11829,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); - case ISD::SHL: if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); @@ -11405,7 +11871,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, return NegShiftLeft; } - return SDValue(); + llvm_unreachable("unexpected shift opcode"); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, @@ -11525,8 +11991,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } - const bool FullFP16 = - static_cast(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated @@ -11594,7 +12059,8 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, (Op.getOpcode() != ISD::VECREDUCE_ADD && SrcVT.getVectorElementType() == MVT::i64); if (SrcVT.isScalableVector() || - useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { + useSVEForFixedLengthVectorVT( + SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) { if (SrcVT.getVectorElementType() == MVT::i1) return LowerPredReductionToSVE(Op, DAG); @@ -11659,7 +12125,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue 
Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { - auto &Subtarget = static_cast(DAG.getSubtarget()); + auto &Subtarget = DAG.getSubtarget(); if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); @@ -11676,7 +12142,7 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { - auto &Subtarget = static_cast(DAG.getSubtarget()); + auto &Subtarget = DAG.getSubtarget(); if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); @@ -11772,8 +12238,8 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SDLoc DL(Op); APInt MulImm = cast(Op.getOperand(0))->getAPIntValue(); - return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), - DL, VT); + return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL, + VT); } /// Set the IntrinsicInfo for the `aarch64_sve_st` intrinsics. @@ -11867,23 +12333,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { - PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Type *ValTy = I.getParamElementType(0); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Type *ValTy = I.getParamElementType(1); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -11906,22 +12372,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_sve_ldnt1: { - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Type *ElTy = cast(I.getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { - PointerType *PtrTy = cast(I.getArgOperand(2)->getType()); + Type *ElTy = + cast(I.getArgOperand(0)->getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; return true; } @@ -12007,8 +12474,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { Instruction *User = I->user_back(); - if (User && - !(User->getOpcode() == Instruction::FSub || + if (!(User->getOpcode() == 
Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; @@ -12194,9 +12660,6 @@ static bool isSplatShuffle(Value *V) { /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). bool AArch64TargetLowering::shouldSinkOperands( Instruction *I, SmallVectorImpl &Ops) const { - if (!I->getType()->isVectorTy()) - return false; - if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { case Intrinsic::aarch64_neon_smull: @@ -12208,6 +12671,12 @@ bool AArch64TargetLowering::shouldSinkOperands( } LLVM_FALLTHROUGH; + case Intrinsic::fma: + if (isa(I->getType()) && + cast(I->getType())->getElementType()->isHalfTy() && + !Subtarget->hasFullFP16()) + return false; + LLVM_FALLTHROUGH; case Intrinsic::aarch64_neon_sqdmull: case Intrinsic::aarch64_neon_sqdmulh: case Intrinsic::aarch64_neon_sqrdmulh: @@ -12217,7 +12686,52 @@ bool AArch64TargetLowering::shouldSinkOperands( if (isSplatShuffle(II->getOperand(1))) Ops.push_back(&II->getOperandUse(1)); return !Ops.empty(); - + case Intrinsic::aarch64_sme_write_horiz: + case Intrinsic::aarch64_sme_write_vert: + case Intrinsic::aarch64_sme_writeq_horiz: + case Intrinsic::aarch64_sme_writeq_vert: { + auto *Idx = dyn_cast(II->getOperand(1)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(1)); + return true; + } + case Intrinsic::aarch64_sme_read_horiz: + case Intrinsic::aarch64_sme_read_vert: + case Intrinsic::aarch64_sme_readq_horiz: + case Intrinsic::aarch64_sme_readq_vert: + case Intrinsic::aarch64_sme_ld1b_vert: + case Intrinsic::aarch64_sme_ld1h_vert: + case Intrinsic::aarch64_sme_ld1w_vert: + case Intrinsic::aarch64_sme_ld1d_vert: + case Intrinsic::aarch64_sme_ld1q_vert: + case Intrinsic::aarch64_sme_st1b_vert: + case Intrinsic::aarch64_sme_st1h_vert: + case Intrinsic::aarch64_sme_st1w_vert: + case Intrinsic::aarch64_sme_st1d_vert: + case Intrinsic::aarch64_sme_st1q_vert: + case Intrinsic::aarch64_sme_ld1b_horiz: + case Intrinsic::aarch64_sme_ld1h_horiz: + case Intrinsic::aarch64_sme_ld1w_horiz: + case Intrinsic::aarch64_sme_ld1d_horiz: + case Intrinsic::aarch64_sme_ld1q_horiz: + case Intrinsic::aarch64_sme_st1b_horiz: + case Intrinsic::aarch64_sme_st1h_horiz: + case Intrinsic::aarch64_sme_st1w_horiz: + case Intrinsic::aarch64_sme_st1d_horiz: + case Intrinsic::aarch64_sme_st1q_horiz: { + auto *Idx = dyn_cast(II->getOperand(3)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(3)); + return true; + } + case Intrinsic::aarch64_neon_pmull: + if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) + return false; + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; case Intrinsic::aarch64_neon_pmull64: if (!areOperandsOfVmullHighP64(II->getArgOperand(0), II->getArgOperand(1))) @@ -12225,12 +12739,14 @@ bool AArch64TargetLowering::shouldSinkOperands( Ops.push_back(&II->getArgOperandUse(0)); Ops.push_back(&II->getArgOperandUse(1)); return true; - default: return false; } } + if (!I->getType()->isVectorTy()) + return false; + switch (I->getOpcode()) { case Instruction::Sub: case Instruction::Add: { @@ -12745,12 +13261,15 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, assert(VT.isScalableVector() && "Can only lower scalable vectors"); unsigned N, Opcode; - static std::map> IntrinsicMap = { - {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, - {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, - 
{Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
-
-  std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+  static const std::pair<unsigned, std::pair<unsigned, unsigned>>
+      IntrinsicMap[] = {
+          {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+          {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+          {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+  std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
+                          return P.first == Intrinsic;
+                        })->second;
   assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
          "invalid tuple vector type!");
@@ -12850,7 +13369,7 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
 // if the folding leads to worse code.
 bool AArch64TargetLowering::isMulAddWithConstProfitable(
-    const SDValue &AddNode, const SDValue &ConstNode) const {
+    SDValue AddNode, SDValue ConstNode) const {
   // Let the DAGCombiner decide for vector types and large types.
   const EVT VT = AddNode.getValueType();
   if (VT.isVector() || VT.getScalarSizeInBits() > 64)
@@ -13025,6 +13544,28 @@ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
   return true;
 }

+bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
+    const SDNode *N, CombineLevel Level) const {
+  assert(((N->getOpcode() == ISD::SHL &&
+           N->getOperand(0).getOpcode() == ISD::SRL) ||
+          (N->getOpcode() == ISD::SRL &&
+           N->getOperand(0).getOpcode() == ISD::SHL)) &&
+         "Expected shift-shift mask");
+  // Don't allow multiuse shift folding with the same shift amount.
+  if (!N->getOperand(0)->hasOneUse())
+    return false;
+
+  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
+    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
+  }
+
+  return true;
+}
+
 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                               Type *Ty) const {
   assert(Ty->isIntegerTy());
@@ -13221,6 +13762,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
 }

+// Given an (integer) vecreduce, we know the order of the inputs does not
+// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
+// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
+// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
+static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+  auto DetectAddExtract = [&](SDValue A) {
+    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
+    // UADDLP(x) if found.
+    if (A.getOpcode() != ISD::ADD)
+      return SDValue();
+    EVT VT = A.getValueType();
+    SDValue Op0 = A.getOperand(0);
+    SDValue Op1 = A.getOperand(1);
+    if (Op0.getOpcode() != Op1.getOpcode() ||
+        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
+         Op0.getOpcode() != ISD::SIGN_EXTEND))
+      return SDValue();
+    SDValue Ext0 = Op0.getOperand(0);
+    SDValue Ext1 = Op1.getOperand(0);
+    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Ext0.getOperand(0) != Ext1.getOperand(0))
+      return SDValue();
+    // Check that the type is twice the add types, and the extracts are from
+    // upper/lower parts of the same source.
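The soundness of the UADDLP rewrite sketched in the comment above rests on the reduction being order-blind: add(zext(lo), zext(hi)) and UADDLP(x) place pair sums in different lanes, but the total over all lanes is identical, and UADDV only consumes that total. A small standalone check (values invented for this note, unsigned flavour only):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<uint16_t> X = {1, 2, 3, 40000, 5, 60000, 7, 8}; // a v8i16 input

  // add(zext(extract_lo(x)), zext(extract_hi(x))): lane i is lo[i] + hi[i].
  std::vector<uint32_t> AddExt(4);
  for (int I = 0; I < 4; ++I)
    AddExt[I] = uint32_t(X[I]) + uint32_t(X[I + 4]);

  // UADDLP(x): lane i is x[2i] + x[2i+1].
  std::vector<uint32_t> Uaddlp(4);
  for (int I = 0; I < 4; ++I)
    Uaddlp[I] = uint32_t(X[2 * I]) + uint32_t(X[2 * I + 1]);

  // The lanes differ, but the reduction consumed by UADDV matches.
  uint32_t Sum1 = std::accumulate(AddExt.begin(), AddExt.end(), 0u);
  uint32_t Sum2 = std::accumulate(Uaddlp.begin(), Uaddlp.end(), 0u);
  assert(Sum1 == Sum2);
  return 0;
}

The signed variant with SIGN_EXTEND and SADDLP follows the same argument.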
+ if (Ext0.getOperand(0).getValueType().getVectorNumElements() != + VT.getVectorNumElements() * 2) + return SDValue(); + if ((Ext0.getConstantOperandVal(1) != 0 && + Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) && + (Ext1.getConstantOperandVal(1) != 0 && + Ext0.getConstantOperandVal(1) != VT.getVectorNumElements())) + return SDValue(); + unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP + : AArch64ISD::SADDLP; + return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0)); + }; + + SDValue A = N->getOperand(0); + if (SDValue R = DetectAddExtract(A)) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); + if (A.getOpcode() == ISD::ADD) { + if (SDValue R = DetectAddExtract(A.getOperand(0))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(1))); + if (SDValue R = DetectAddExtract(A.getOperand(1))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(0))); + } + return SDValue(); +} + + static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -13279,6 +13875,60 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } +SDValue +AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N, 0); // Lower SREM as SREM + + EVT VT = N->getValueType(0); + + // For scalable and fixed types, mark them as cheap so we can handle it much + // later. This allows us to handle larger than legal types. + if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) + return SDValue(N, 0); + + // fold (srem X, pow2) + if ((VT != MVT::i32 && VT != MVT::i64) || + !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) + return SDValue(); + + unsigned Lg2 = Divisor.countTrailingZeros(); + if (Lg2 == 0) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue CCVal, CSNeg; + if (Lg2 == 1) { + SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL); + SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); + CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); + + Created.push_back(Cmp.getNode()); + Created.push_back(And.getNode()); + } else { + SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); + SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); + SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); + CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, + Negs.getValue(1)); + + Created.push_back(Negs.getNode()); + Created.push_back(AndPos.getNode()); + Created.push_back(AndNeg.getNode()); + } + + return CSNeg; +} + static bool IsSVECntIntrinsic(SDValue S) { switch(getIntrinsicID(S.getNode())) { default: @@ -13300,11 +13950,10 @@ static bool IsSVECntIntrinsic(SDValue S) { /// operations need a bit more inspection to get this information. 
/// /// \param Extend The SDNode from the DAG that represents the extend operation -/// \param DAG The SelectionDAG hosting the \p Extend node /// /// \returns The type representing the \p Extend source type, or \p MVT::Other /// if no valid type can be determined -static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { +static EVT calculatePreExtendType(SDValue Extend) { switch (Extend.getOpcode()) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -13337,102 +13986,90 @@ static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { default: return MVT::Other; } - - llvm_unreachable("Code path unhandled in calculatePreExtendType!"); } -/// Combines a dup(sext/zext) node pattern into sext/zext(dup) -/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, - SelectionDAG &DAG) { - - ShuffleVectorSDNode *ShuffleNode = - dyn_cast(VectorShuffle.getNode()); - if (!ShuffleNode) - return SDValue(); - - // Ensuring the mask is zero before continuing - if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) - return SDValue(); - - SDValue InsertVectorElt = VectorShuffle.getOperand(0); - - if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - SDValue InsertLane = InsertVectorElt.getOperand(2); - ConstantSDNode *Constant = dyn_cast(InsertLane.getNode()); - // Ensures the insert is inserting into lane 0 - if (!Constant || Constant->getZExtValue() != 0) +/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern +/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector +/// SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) { + EVT VT = BV.getValueType(); + if (BV.getOpcode() != ISD::BUILD_VECTOR && + BV.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); - SDValue Extend = InsertVectorElt.getOperand(1); + // Use the first item in the buildvector/shuffle to get the size of the + // extend, and make sure it looks valid. + SDValue Extend = BV->getOperand(0); unsigned ExtendOpcode = Extend.getOpcode(); - bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) return SDValue(); - - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendType = calculatePreExtendType(Extend, DAG); - - if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && - TargetType != MVT::v2i64) || - (PreExtendType == MVT::Other)) + // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure + // calculatePreExtendType will work without issue. 
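The underlying identity for the combine defined in this hunk is that a scalar extend commutes with a splat or shuffle of already-extended values: splat(sext(x)) equals sext(splat(x)) lane for lane, which is what later lets a mul of two such operands become a widening multiply (smull/umull). A minimal standalone check (values invented for this note, not LLVM code):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  int8_t X = -7;
  const int Lanes = 8;

  // dup(sext(x)): extend first, then broadcast.
  std::vector<int16_t> A(Lanes, int16_t(X));

  // sext(dup(x)): broadcast the narrow value, then extend each lane.
  std::vector<int8_t> Narrow(Lanes, X);
  std::vector<int16_t> B(Lanes);
  for (int I = 0; I < Lanes; ++I)
    B[I] = int16_t(Narrow[I]);

  assert(A == B); // identical vectors, so the two DAG shapes are interchangeable
  return 0;
}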
+ if (BV.getOpcode() == ISD::VECTOR_SHUFFLE && + ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND) return SDValue(); // Restrict valid pre-extend data type - if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && - PreExtendType != MVT::i32) - return SDValue(); - - EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); - - if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) + EVT PreExtendType = calculatePreExtendType(Extend); + if (PreExtendType == MVT::Other || + PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2) return SDValue(); - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) - return SDValue(); - - SDLoc DL(VectorShuffle); - - SDValue InsertVectorNode = DAG.getNode( - InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), - DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), - DAG.getConstant(0, DL, MVT::i64)); - - std::vector ShuffleMask(TargetType.getVectorNumElements()); - - SDValue VectorShuffleNode = - DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, - DAG.getUNDEF(PreExtendVT), ShuffleMask); - - SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, TargetType, VectorShuffleNode); + // Make sure all other operands are equally extended + for (SDValue Op : drop_begin(BV->ops())) { + if (Op.isUndef()) + continue; + unsigned Opc = Op.getOpcode(); + bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::AssertSext; + if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) + return SDValue(); + } - return ExtendNode; + SDValue NBV; + SDLoc DL(BV); + if (BV.getOpcode() == ISD::BUILD_VECTOR) { + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); + EVT PreExtendLegalType = + PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; + SmallVector NewOps; + for (SDValue Op : BV->ops()) + NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType) + : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, + PreExtendLegalType)); + NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); + } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType()); + NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0), + BV.getOperand(1).isUndef() + ? DAG.getUNDEF(PreExtendVT) + : BV.getOperand(1).getOperand(0), + cast(BV)->getMask()); + } + return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { // If the value type isn't a vector, none of the operands are going to be dups - if (!Mul->getValueType(0).isVector()) + EVT VT = Mul->getValueType(0); + if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); - SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) return SDValue(); SDLoc DL(Mul); - return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), - Op0 ? 
Op0 : Mul->getOperand(0), + return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0), Op1 ? Op1 : Mul->getOperand(1)); } @@ -13649,7 +14286,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getPointerInfo(), LN0->getAlign(), LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them @@ -13676,8 +14313,10 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || - Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -13713,7 +14352,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == ISD::FP_TO_SINT_SAT || N->getOpcode() == ISD::FP_TO_UINT_SAT) { EVT SatVT = cast(N->getOperand(1))->getVT(); - if (SatVT.getScalarSizeInBits() != IntBits) + if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) return SDValue(); } @@ -13956,15 +14595,85 @@ static SDValue tryCombineToBSL(SDNode *N, return SDValue(); } +// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to +// convert to csel(ccmp(.., cc0)), depending on cc1: + +// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) +// => +// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0)) +// +// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) +// => +// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0)) +static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue CSel0 = N->getOperand(0); + SDValue CSel1 = N->getOperand(1); + + if (CSel0.getOpcode() != AArch64ISD::CSEL || + CSel1.getOpcode() != AArch64ISD::CSEL) + return SDValue(); + + if (!CSel0->hasOneUse() || !CSel1->hasOneUse()) + return SDValue(); + + if (!isNullConstant(CSel0.getOperand(0)) || + !isOneConstant(CSel0.getOperand(1)) || + !isNullConstant(CSel1.getOperand(0)) || + !isOneConstant(CSel1.getOperand(1))) + return SDValue(); + + SDValue Cmp0 = CSel0.getOperand(3); + SDValue Cmp1 = CSel1.getOperand(3); + AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2); + AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2); + if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) + return SDValue(); + if (Cmp1.getOpcode() != AArch64ISD::SUBS && + Cmp0.getOpcode() == AArch64ISD::SUBS) { + std::swap(Cmp0, Cmp1); + std::swap(CC0, CC1); + } + + if (Cmp1.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue CCmp; + + if (N->getOpcode() == ISD::AND) { + AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); + SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); + } else { + SDLoc DL(N); + AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); + SDValue Condition = DAG.getConstant(CC0, DL, MVT_CC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); + SDValue NZCVOp = 
DAG.getConstant(NZCV, DL, MVT::i32); + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); + } + return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), + CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32), + CCmp); +} + static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (SDValue R = performANDORCSELCombine(N, DAG)) + return R; + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; @@ -14015,7 +14724,7 @@ static SDValue performSVEAndCombine(SDNode *N, SDValue UnpkOp = Src->getOperand(0); SDValue Dup = N->getOperand(1); - if (Dup.getOpcode() != AArch64ISD::DUP) + if (Dup.getOpcode() != ISD::SPLAT_VECTOR) return SDValue(); SDLoc DL(N); @@ -14038,8 +14747,7 @@ static SDValue performSVEAndCombine(SDNode *N, // Otherwise, make sure we propagate the AND to the operand // of the unpack - Dup = DAG.getNode(AArch64ISD::DUP, DL, - UnpkOp->getValueType(0), + Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0), DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); SDValue And = DAG.getNode(ISD::AND, DL, @@ -14097,20 +14805,34 @@ static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); - if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + + if (SDValue R = performANDORCSELCombine(N, DAG)) + return R; + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + // Although NEON has no EORV instruction, when only the least significant bit + // is required the operation is synonymous with ADDV. + if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) && + LHS.getOperand(0).getValueType().isFixedLengthVector() && + LHS.hasOneUse()) { + SDLoc DL(N); + SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0)); + return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS); + } + if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); // The combining code below works only for NEON vectors. In particular, it // does not work for SVE when dealing with vectors wider than 128 bits. - if (!(VT.is64BitVector() || VT.is128BitVector())) + if (!VT.is64BitVector() && !VT.is128BitVector()) return SDValue(); - BuildVectorSDNode *BVN = - dyn_cast(N->getOperand(1).getNode()); + BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); if (!BVN) return SDValue(); @@ -14141,107 +14863,125 @@ static SDValue performANDCombine(SDNode *N, return SDValue(); } -// Attempt to form urhadd(OpA, OpB) from -// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) -// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). -// The original form of the first expression is -// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the -// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). -// Before this function is called the srl will have been lowered to -// AArch64ISD::VLSHR. 
-// This pass can also recognize signed variants of the patterns that use sign -// extension instead of zero extension and form a srhadd(OpA, OpB) or a -// shadd(OpA, OpB) from them. -static SDValue -performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - EVT VT = N->getValueType(0); +static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { + switch (Opcode) { + case ISD::STRICT_FADD: + case ISD::FADD: + return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; + case ISD::ADD: + return VT == MVT::i64; + default: + return false; + } +} - // Since we are looking for a right shift by a constant value of 1 and we are - // operating on types at least 16 bits in length (sign/zero extended OpA and - // OpB, which are at least 8 bits), it follows that the truncate will always - // discard the shifted-in bit and therefore the right shift will be logical - // regardless of the signedness of OpA and OpB. - SDValue Shift = N->getOperand(0); - if (Shift.getOpcode() != AArch64ISD::VLSHR) +static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, + AArch64CC::CondCode Cond); + +static bool isPredicateCCSettingOp(SDValue N) { + if ((N.getOpcode() == ISD::SETCC) || + (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && + (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || + // get_active_lane_mask is lowered to a whilelo instruction. + N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask))) + return true; + + return false; +} + +// Materialize : i1 = extract_vector_elt t37, Constant:i64<0> +// ... into: "ptrue p, all" + PTEST +static SDValue +performFirstTrueTestVectorCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + // Make sure PTEST can be legalised with illegal types. + if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) return SDValue(); - // Is the right shift using an immediate value of 1? - uint64_t ShiftAmount = Shift.getConstantOperandVal(1); - if (ShiftAmount != 1) + SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType(); + + if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 || + !isNullConstant(N->getOperand(1))) return SDValue(); - SDValue ExtendOpA, ExtendOpB; - SDValue ShiftOp0 = Shift.getOperand(0); - unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); - if (ShiftOp0Opc == ISD::SUB) { + // Restricted the DAG combine to only cases where we're extracting from a + // flag-setting operation. + if (!isPredicateCCSettingOp(N0)) + return SDValue(); - SDValue Xor = ShiftOp0.getOperand(1); - if (Xor.getOpcode() != ISD::XOR) - return SDValue(); + // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0 + SelectionDAG &DAG = DCI.DAG; + SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all); + return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE); +} - // Is the XOR using a constant amount of all ones in the right hand side? 
- uint64_t C; - if (!isAllConstantBuildVector(Xor.getOperand(1), C)) - return SDValue(); +// Materialize : Idx = (add (mul vscale, NumEls), -1) +// i1 = extract_vector_elt t37, Constant:i64 +// ... into: "ptrue p, all" + PTEST +static SDValue +performLastTrueTestVectorCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + // Make sure PTEST is legal types. + if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) + return SDValue(); - unsigned ElemSizeInBits = VT.getScalarSizeInBits(); - APInt CAsAPInt(ElemSizeInBits, C); - if (CAsAPInt != APInt::getAllOnes(ElemSizeInBits)) - return SDValue(); + SDValue N0 = N->getOperand(0); + EVT OpVT = N0.getValueType(); - ExtendOpA = Xor.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(0); - } else if (ShiftOp0Opc == ISD::ADD) { - ExtendOpA = ShiftOp0.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(1); - } else + if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) return SDValue(); - unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); - unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); - if (!(ExtendOpAOpc == ExtendOpBOpc && - (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + // Idx == (add (mul vscale, NumEls), -1) + SDValue Idx = N->getOperand(1); + if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1))) return SDValue(); - // Is the result of the right shift being truncated to the same value type as - // the original operands, OpA and OpB? - SDValue OpA = ExtendOpA.getOperand(0); - SDValue OpB = ExtendOpB.getOperand(0); - EVT OpAVT = OpA.getValueType(); - assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); - if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + SDValue VS = Idx.getOperand(0); + if (VS.getOpcode() != ISD::VSCALE) return SDValue(); - SDLoc DL(N); - bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; - bool IsRHADD = ShiftOp0Opc == ISD::SUB; - unsigned HADDOpc = IsSignExtend - ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD); - SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); + unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue(); + if (VS.getConstantOperandVal(0) != NumEls) + return SDValue(); - return ResultHADD; + // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 
1 : 0 + SelectionDAG &DAG = DCI.DAG; + SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all); + return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE); } -static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { - switch (Opcode) { - case ISD::FADD: - return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; - case ISD::ADD: - return VT == MVT::i64; - default: - return false; - } -} +static SDValue +performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget)) + return Res; + if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget)) + return Res; -static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { + SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); ConstantSDNode *ConstantN1 = dyn_cast(N1); EVT VT = N->getValueType(0); - const bool FullFP16 = - static_cast(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); + bool IsStrict = N0->isStrictFPOpcode(); + + // extract(dup x) -> x + if (N0.getOpcode() == AArch64ISD::DUP) + return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Rewrite for pairwise fadd pattern // (f32 (extract_vector_elt @@ -14250,11 +14990,14 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { // -> // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) // (extract_vector_elt (vXf32 Other) 1)) + // For strict_fadd we need to make sure the old strict_fadd can be deleted, so + // we can only do this when it's used only by the extract_vector_elt. if (ConstantN1 && ConstantN1->getZExtValue() == 0 && - hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { + hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) && + (!IsStrict || N0.hasOneUse())) { SDLoc DL(N0); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); + SDValue N00 = N0->getOperand(IsStrict ? 1 : 0); + SDValue N01 = N0->getOperand(IsStrict ? 2 : 1); ShuffleVectorSDNode *Shuffle = dyn_cast(N01); SDValue Other = N00; @@ -14267,11 +15010,23 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { if (Shuffle && Shuffle->getMaskElt(0) == 1 && Other == Shuffle->getOperand(0)) { - return DAG.getNode(N0->getOpcode(), DL, VT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(0, DL, MVT::i64)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(1, DL, MVT::i64))); + SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(0, DL, MVT::i64)); + SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(1, DL, MVT::i64)); + if (!IsStrict) + return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2); + + // For strict_fadd we need uses of the final extract_vector to be replaced + // with the strict_fadd, but we also need uses of the chain output of the + // original strict_fadd to use the chain output of the new strict_fadd as + // otherwise it may not be deleted. 
+      SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
+                                {VT, MVT::Other},
+                                {N0->getOperand(0), Extract1, Extract2});
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
+      return SDValue(N, 0);
     }
   }
@@ -14321,25 +15076,61 @@ static SDValue performConcatVectorsCombine(SDNode *N,
     }
   }

+  if (N->getOperand(0).getValueType() == MVT::v4i8) {
+    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
+    // loads to prevent having to go through the v4i8 load legalization that
+    // needs to extend each element into a larger type.
+    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
+          if (V.getValueType() != MVT::v4i8)
+            return false;
+          if (V.isUndef())
+            return true;
+          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
+          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
+                 LD->getExtensionType() == ISD::NON_EXTLOAD;
+        })) {
+      EVT NVT =
+          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+      SmallVector<SDValue> Ops;
+
+      for (unsigned i = 0; i < N->getNumOperands(); i++) {
+        SDValue V = N->getOperand(i);
+        if (V.isUndef())
+          Ops.push_back(DAG.getUNDEF(MVT::f32));
+        else {
+          LoadSDNode *LD = cast<LoadSDNode>(V);
+          SDValue NewLoad =
+              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
+                          LD->getMemOperand());
+          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+          Ops.push_back(NewLoad);
+        }
+      }
+      return DAG.getBitcast(N->getValueType(0),
+                            DAG.getBuildVector(NVT, dl, Ops));
+    }
+  }
+
+
   // Wait 'til after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

-  // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
-  // subvectors from the same original vectors. Combine these into a single
-  // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
-  // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
-  //                                       extract_subvector (v16i8 OpB,
-  //                                       <0>))),
-  //                        (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
-  //                                       extract_subvector (v16i8 OpB,
-  //                                       <8>)))))
+  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
+  // extracted subvectors from the same original vectors. Combine these into a
+  // single avg that operates on the two original vectors.
+  // avgceil is the target independent name for rhadd, avgfloor is a hadd.
+ // Example: + // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>), + // extract_subvector (v16i8 OpB, <0>))), + // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>), + // extract_subvector (v16i8 OpB, <8>))))) // -> - // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) + // (v16i8(avgceils(v16i8 OpA, v16i8 OpB))) if (N->getNumOperands() == 2 && N0Opc == N1Opc && - (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || - N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { + (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || + N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -14410,6 +15201,29 @@ static SDValue performConcatVectorsCombine(SDNode *N, RHS)); } +static SDValue +performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue V = N->getOperand(0); + + // NOTE: This combine exists in DAGCombiner, but that version's legality check + // blocks this combine because the non-const case requires custom lowering. + // + // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const) + if (V.getOpcode() == ISD::SPLAT_VECTOR) + if (isa(V.getOperand(0))) + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0)); + + return SDValue(); +} + static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -14470,33 +15284,34 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - // Yep, no additional predication needed. Perform the transform. - SDValue IID = N->getOperand(0); - SDValue Shift = N->getOperand(2); - SDValue Vec = Op1.getOperand(0); - SDValue Lane = Op1.getOperand(1); - EVT ResTy = N->getValueType(0); - EVT VecResTy; - SDLoc DL(N); + if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); - // The vector width should be 128 bits by the time we get here, even - // if it started as 64 bits (the extract_vector handling will have - // done so). - assert(Vec.getValueSizeInBits() == 128 && - "unexpected vector size on extract_vector_elt!"); - if (Vec.getValueType() == MVT::v4i32) - VecResTy = MVT::v4f32; - else if (Vec.getValueType() == MVT::v2i64) - VecResTy = MVT::v2f64; - else - llvm_unreachable("unexpected vector type!"); + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); - SDValue Convert = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); - } - return SDValue(); + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). Bail if it is not. 
+ if (Vec.getValueSizeInBits() != 128) + return SDValue(); + + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + return SDValue(); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } // AArch64 high-vector "long" operations are formed by performing the non-high @@ -14515,6 +15330,11 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + MVT VT = N.getSimpleValueType(); + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N.getConstantOperandVal(1) == 0) + N = N.getOperand(0); + switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: @@ -14535,18 +15355,19 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { return SDValue(); } - MVT NarrowTy = N.getSimpleValueType(); - if (!NarrowTy.is64BitVector()) + if (!VT.is64BitVector()) return SDValue(); - MVT ElementTy = NarrowTy.getVectorElementType(); - unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); + SDLoc DL(N); + unsigned NumElems = VT.getVectorNumElements(); + if (N.getValueType().is64BitVector()) { + MVT ElementTy = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); + N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops()); + } - SDLoc dl(N); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, - DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), - DAG.getConstant(NumElems, dl, MVT::i64)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N, + DAG.getConstant(NumElems, DL, MVT::i64)); } static bool isEssentiallyExtractHighSubvector(SDValue N) { @@ -14696,7 +15517,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { } // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) -static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Only scalar integer and vector types. 
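The ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) fold introduced here is valid because two horizontal sums followed by a scalar add equal one horizontal sum of the element-wise add, including under modular wraparound, so only one cross-lane reduction is needed. A standalone model (values invented for this note):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<uint32_t> A = {1, 2, 3, 4};
  std::vector<uint32_t> B = {10, 20, 30, 40};

  // UADDV(a) + UADDV(b): two reductions, then a scalar add.
  uint32_t Separate = std::accumulate(A.begin(), A.end(), 0u) +
                      std::accumulate(B.begin(), B.end(), 0u);

  // UADDV(ADD(a, b)): one element-wise add, then a single reduction.
  std::vector<uint32_t> Sum(4);
  for (int I = 0; I < 4; ++I)
    Sum[I] = A[I] + B[I]; // modular add, matching the vector ADD

  uint32_t Fused = std::accumulate(Sum.begin(), Sum.end(), 0u);
  assert(Separate == Fused);
  return 0;
}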
   if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
@@ -14708,28 +15529,103 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
     return SDValue();
 
-  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
-  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
-  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
-    return SDValue();
+  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
+    return SDValue();
+
+  SDValue Op1 = LHS->getOperand(0);
+  SDValue Op2 = RHS->getOperand(0);
+  EVT OpVT1 = Op1.getValueType();
+  EVT OpVT2 = Op2.getValueType();
+  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
+      Op2.getOpcode() != AArch64ISD::UADDV ||
+      OpVT1.getVectorElementType() != VT)
+    return SDValue();
+
+  SDValue Val1 = Op1.getOperand(0);
+  SDValue Val2 = Op2.getOperand(0);
+  EVT ValVT = Val1->getValueType(0);
+  SDLoc DL(N);
+  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Perform the scalar expression combine in the form of:
+///   CSEL(c, 1, cc) + b  => CSINC(b+c, b, cc)
+///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
+static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Handle commutativity.
+  if (LHS.getOpcode() != AArch64ISD::CSEL &&
+      LHS.getOpcode() != AArch64ISD::CSNEG) {
+    std::swap(LHS, RHS);
+    if (LHS.getOpcode() != AArch64ISD::CSEL &&
+        LHS.getOpcode() != AArch64ISD::CSNEG) {
+      return SDValue();
+    }
+  }
+
+  if (!LHS.hasOneUse())
+    return SDValue();
+
+  AArch64CC::CondCode AArch64CC =
+      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
+
+  // The CSEL must include a constant one operand, and the CSNEG must include
+  // a one or negative-one operand.
+  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
+  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+  if (!CTVal || !CFVal)
+    return SDValue();
+
+  if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
+        (CTVal->isOne() || CFVal->isOne())) &&
+      !(LHS.getOpcode() == AArch64ISD::CSNEG &&
+        (CTVal->isOne() || CFVal->isAllOnes())))
+    return SDValue();
+
+  // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
+  if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
+      !CFVal->isOne()) {
+    std::swap(CTVal, CFVal);
+    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+  }
+
+  SDLoc DL(N);
+  // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
+  if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
+      !CFVal->isAllOnes()) {
+    APInt C = -1 * CFVal->getAPIntValue();
+    CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
+    CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
+    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+  }
+
+  // The fold might be neutral for larger constants, as the immediate needs to
+  // be materialized in a register.
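The scalar identity performAddCSelIntoCSinc relies on can be verified exhaustively over small values. A standalone C++ model of the CSEL/CSNEG/CSINC semantics — an illustration, not LLVM code:

    #include <cassert>
    #include <cstdint>

    // Models of the AArch64 conditional-select nodes.
    static int64_t csel(int64_t t, int64_t f, bool cc) { return cc ? t : f; }
    static int64_t csinc(int64_t t, int64_t f, bool cc) { return cc ? t : f + 1; }
    static int64_t csneg(int64_t t, int64_t f, bool cc) { return cc ? t : -f; }

    int main() {
      for (int64_t c = -4; c <= 4; ++c)
        for (int64_t b = -4; b <= 4; ++b)
          for (bool cc : {false, true}) {
            // CSEL(c, 1, cc) + b == CSINC(b+c, b, cc)
            assert(csel(c, 1, cc) + b == csinc(b + c, b, cc));
            // CSNEG(c, -1, cc) + b == CSINC(b+c, b, cc)
            assert(csneg(c, -1, cc) + b == csinc(b + c, b, cc));
          }
    }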
+  APInt ADDC = CTVal->getAPIntValue();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
+    return SDValue();
+
+  assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
+          (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
+         "Unexpected constant value");
 
-  SDValue Op1 = LHS->getOperand(0);
-  SDValue Op2 = RHS->getOperand(0);
-  EVT OpVT1 = Op1.getValueType();
-  EVT OpVT2 = Op2.getValueType();
-  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
-      Op2.getOpcode() != AArch64ISD::UADDV ||
-      OpVT1.getVectorElementType() != VT)
-    return SDValue();
+  SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
+  SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
+  SDValue Cmp = LHS.getOperand(3);
 
-  SDValue Val1 = Op1.getOperand(0);
-  SDValue Val2 = Op2.getOperand(0);
-  EVT ValVT = Val1->getValueType(0);
-  SDLoc DL(N);
-  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
-                     DAG.getConstant(0, DL, MVT::i64));
+  return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
 }
 
 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
@@ -14755,6 +15651,49 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
                      Dot.getOperand(2));
 }
 
+static bool isNegatedInteger(SDValue Op) {
+  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
+}
+
+static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
+}
+
+// Try to fold
+//
+// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
+//
+// The fold lets a csel be matched as a csneg without generating a redundant
+// neg instruction; this includes the negation of the csel expansion of an abs
+// node lowered by lowerABS.
+static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
+  if (!isNegatedInteger(SDValue(N, 0)))
+    return SDValue();
+
+  SDValue CSel = N->getOperand(1);
+  if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
+    return SDValue();
+
+  SDValue N0 = CSel.getOperand(0);
+  SDValue N1 = CSel.getOperand(1);
+
+  // If neither operand is already a negation, the fold is not worthwhile: it
+  // would introduce two additional negations while removing only one.
+  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
+    return SDValue();
+
+  SDValue N0N = getNegatedInteger(N0, DAG);
+  SDValue N1N = getNegatedInteger(N1, DAG);
+
+  SDLoc DL(N);
+  EVT VT = CSel.getValueType();
+  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
+                     CSel.getOperand(3));
+}
+
 // The basic add/sub long vector instructions have variants with "2" on the end
 // which act on the high-half of their inputs.
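performNegCSelCombine above distributes a negation over both select arms; as a quick standalone sanity check (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    static int64_t csel(int64_t t, int64_t f, bool cc) { return cc ? t : f; }

    int main() {
      for (int64_t x = -3; x <= 3; ++x)
        for (int64_t y = -3; y <= 3; ++y)
          for (bool cc : {false, true})
            // (neg (csel X, Y)) == (csel (neg X), (neg Y))
            assert(-csel(x, y, cc) == csel(-x, -y, cc));
    }

The identity always holds; the fold only pays off when at least one arm is already a negation, hence the early bail-out in the code.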
They are normally matched by
 // patterns like:
@@ -14808,14 +15747,120 @@ static SDValue performAddSubLongCombine(SDNode *N,
   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
 }
 
+static bool isCMP(SDValue Op) {
+  return Op.getOpcode() == AArch64ISD::SUBS &&
+         !Op.getNode()->hasAnyUseOfValue(0);
+}
+
+// (CSEL 1 0 CC Cond) => CC
+// (CSEL 0 1 CC Cond) => !CC
+static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
+  if (Op.getOpcode() != AArch64ISD::CSEL)
+    return None;
+  auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
+  if (CC == AArch64CC::AL || CC == AArch64CC::NV)
+    return None;
+  SDValue OpLHS = Op.getOperand(0);
+  SDValue OpRHS = Op.getOperand(1);
+  if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
+    return CC;
+  if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
+    return getInvertedCondCode(CC);
+
+  return None;
+}
+
+// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
+// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
+static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
+  SDValue CmpOp = Op->getOperand(2);
+  if (!isCMP(CmpOp))
+    return SDValue();
+
+  if (IsAdd) {
+    if (!isOneConstant(CmpOp.getOperand(1)))
+      return SDValue();
+  } else {
+    if (!isNullConstant(CmpOp.getOperand(0)))
+      return SDValue();
+  }
+
+  SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
+  auto CC = getCSETCondCode(CsetOp);
+  if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
+    return SDValue();
+
+  return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
+                     Op->getOperand(0), Op->getOperand(1),
+                     CsetOp.getOperand(3));
+}
+
+// (ADC x 0 cond) => (CINC x HS cond)
+static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Cond = N->getOperand(2);
+
+  if (!isNullConstant(RHS))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // (CINC x cc cond) <=> (CSINC x x !cc cond)
+  SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
+  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
+}
+
+// Transform vector add(zext i8 to i32, zext i8 to i32)
+//  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
+// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
+// extends.
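The narrowing performed by performVectorAddSubExtCombine (defined just below) is sound because two extended i8 values summed at i16 can never overflow i16, and the non-negative i16 result sign-extends to the same i32 value. A standalone check for the unsigned case — illustrative C++ only:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          int32_t wide = int32_t(a) + int32_t(b); // add(zext to i32, zext to i32)
          int16_t narrow = int16_t(a + b);        // add(zext to i16, zext to i16); max 510 fits
          assert(int32_t(narrow) == wide);        // sext(i16 sum) == i32 sum
        }
    }

The signed case works the same way: two sign-extended i8 values sum to at most [-256, 254], which also fits i16.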
+static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
+      (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+       N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
+      (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+       N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
+      N->getOperand(0).getOperand(0).getValueType() !=
+          N->getOperand(1).getOperand(0).getValueType())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0).getOperand(0);
+  SDValue N1 = N->getOperand(1).getOperand(0);
+  EVT InVT = N0.getValueType();
+
+  EVT S1 = InVT.getScalarType();
+  EVT S2 = VT.getScalarType();
+  if ((S2 == MVT::i32 && S1 == MVT::i8) ||
+      (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
+    SDLoc DL(N);
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
+                                  VT.getVectorElementCount());
+    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
+    SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
+    SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
+  }
+  return SDValue();
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
   // Try to change sum of two reductions.
-  if (SDValue Val = performUADDVCombine(N, DAG))
+  if (SDValue Val = performAddUADDVCombine(N, DAG))
     return Val;
   if (SDValue Val = performAddDotCombine(N, DAG))
     return Val;
+  if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
+    return Val;
+  if (SDValue Val = performNegCSelCombine(N, DAG))
+    return Val;
+  if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
+    return Val;
 
   return performAddSubLongCombine(N, DCI, DAG);
 }
@@ -15176,6 +16221,9 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
       return false;
   }
 
+  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
+    return true;
+
   // "ptrue p.<Ty>, all" can be considered all active when <Ty> is the same size
   // or smaller than the implicit element type represented by N.
   // NOTE: A larger element count implies a smaller element type.
@@ -15186,8 +16234,7 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
   // If we're compiling for a specific vector-length, we can check if the
   // pattern's VL equals that of the scalable vector at runtime.
   if (N.getOpcode() == AArch64ISD::PTRUE) {
-    const auto &Subtarget =
-        static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
     unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
     unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
     if (MaxSVESize && MinSVESize == MaxSVESize) {
@@ -15233,6 +16280,39 @@ static SDValue performIntrinsicCombine(SDNode *N,
   switch (IID) {
   default:
     break;
+  case Intrinsic::get_active_lane_mask: {
+    SDValue Res = SDValue();
+    EVT VT = N->getValueType(0);
+    if (VT.isFixedLengthVector()) {
+      // We can use the SVE whilelo instruction to lower this intrinsic by
+      // creating the appropriate sequence of scalable vector operations and
+      // then extracting a fixed-width subvector from the scalable vector.
+
+      SDLoc DL(N);
+      SDValue ID =
+          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+
+      EVT WhileVT = EVT::getVectorVT(
+          *DAG.getContext(), MVT::i1,
+          ElementCount::getScalable(VT.getVectorNumElements()));
+
+      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
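Stepping back from the code for a moment: the semantics that this whilelo-based lowering of llvm.get.active.lane.mask implements are simply "lane i is active while Base + i < N". A standalone scalar model — illustrative C++ only, not LLVM code:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static std::vector<bool> activeLaneMask(uint64_t base, uint64_t n,
                                            unsigned lanes) {
      std::vector<bool> mask(lanes);
      for (unsigned i = 0; i < lanes; ++i)
        mask[i] = base + i < n; // what SVE's WHILELO computes per lane
      return mask;
    }

    int main() {
      auto m = activeLaneMask(/*base=*/6, /*n=*/10, /*lanes=*/8);
      for (unsigned i = 0; i < 8; ++i)
        assert(m[i] == (i < 4)); // lanes 0..3 cover elements 6..9; rest inactive
    }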
+ EVT PromVT = getPromotedVTForPredicate(WhileVT); + + // Get the fixed-width equivalent of PromVT for extraction. + EVT ExtVT = + EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), + VT.getVectorElementCount()); + + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, + N->getOperand(1), N->getOperand(2)); + Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); + } + return Res; + } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); @@ -15261,7 +16341,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: + return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_umull: + return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); @@ -15350,6 +16434,10 @@ static SDValue performIntrinsicCombine(SDNode *N, return convertMergedOpToPredOp(N, ISD::XOR, DAG, true); case Intrinsic::aarch64_sve_orr: return convertMergedOpToPredOp(N, ISD::OR, DAG, true); + case Intrinsic::aarch64_sve_sabd: + return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true); + case Intrinsic::aarch64_sve_uabd: + return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true); case Intrinsic::aarch64_sve_sqadd: return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); case Intrinsic::aarch64_sve_sqsub: @@ -15538,7 +16626,7 @@ static SDValue performExtendCombine(SDNode *N, static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts) { assert(!St.isTruncatingStore() && "cannot split truncating vector store"); - unsigned OrigAlignment = St.getAlignment(); + Align OrigAlignment = St.getAlign(); unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Create scalar stores. 
This is at least as good as the code sequence for a @@ -15563,7 +16651,7 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, unsigned Offset = EltOffset; while (--NumVecElts) { - unsigned Alignment = MinAlign(OrigAlignment, Offset); + Align Alignment = commonAlignment(OrigAlignment, Offset); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); @@ -15636,10 +16724,6 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); - if (VT == MVT::nxv8bf16 && - !static_cast(DAG.getSubtarget()).hasBF16()) - return SDValue(); - EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); @@ -15667,9 +16751,6 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); - if (VT == MVT::nxv8bf16 && - !static_cast(DAG.getSubtarget()).hasBF16()) - return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) @@ -15692,10 +16773,6 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { EVT HwSrcVt = getSVEContainerType(DataVT); SDValue InputVT = DAG.getValueType(DataVT); - if (DataVT == MVT::nxv8bf16 && - !static_cast(DAG.getSubtarget()).hasBF16()) - return SDValue(); - if (DataVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); @@ -15722,10 +16799,6 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); - if (DataVT == MVT::nxv8bf16 && - !static_cast(DAG.getSubtarget()).hasBF16()) - return SDValue(); - if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); @@ -15912,8 +16985,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. 
- if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || - S->getAlignment() <= 2) + if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) || + S->getAlign() <= Align(2)) return SDValue(); // If we get a splat of a scalar convert this vector store to a store of @@ -15934,11 +17007,11 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->getAlignment(), S->getMemOperand()->getFlags()); + S->getAlign(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->getAlignment(), + S->getPointerInfo(), S->getAlign(), S->getMemOperand()->getFlags()); } @@ -15970,6 +17043,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { SDValue Op1 = N->getOperand(1); EVT ResVT = N->getValueType(0); + // uzp1(x, undef) -> concat(truncate(x), undef) + if (Op1.getOpcode() == ISD::UNDEF) { + EVT BCVT = MVT::Other, HalfVT = MVT::Other; + switch (ResVT.getSimpleVT().SimpleTy) { + default: + break; + case MVT::v16i8: + BCVT = MVT::v8i16; + HalfVT = MVT::v8i8; + break; + case MVT::v8i16: + BCVT = MVT::v4i32; + HalfVT = MVT::v4i16; + break; + case MVT::v4i32: + BCVT = MVT::v2i64; + HalfVT = MVT::v2i32; + break; + } + if (BCVT != MVT::Other) { + SDValue BC = DAG.getBitcast(BCVT, Op0); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc, + DAG.getUNDEF(HalfVT)); + } + } + // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { @@ -16267,6 +17367,152 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +/// \return true if part of the index was folded into the Base. +static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, + SDLoc DL, SelectionDAG &DAG) { + // This function assumes a vector of i64 indices. + EVT IndexVT = Index.getValueType(); + if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) + return false; + + // Simplify: + // BasePtr = Ptr + // Index = X + splat(Offset) + // -> + // BasePtr = Ptr + Offset * scale. 
+  //   Index = X
+  if (Index.getOpcode() == ISD::ADD) {
+    if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
+      Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
+      BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
+      Index = Index.getOperand(0);
+      return true;
+    }
+  }
+
+  // Simplify:
+  //   BasePtr = Ptr
+  //   Index = (X + splat(Offset)) << splat(Shift)
+  // ->
+  //   BasePtr = Ptr + (Offset << Shift) * scale
+  //   Index = X << splat(Shift)
+  if (Index.getOpcode() == ISD::SHL &&
+      Index.getOperand(0).getOpcode() == ISD::ADD) {
+    SDValue Add = Index.getOperand(0);
+    SDValue ShiftOp = Index.getOperand(1);
+    SDValue OffsetOp = Add.getOperand(1);
+    if (auto Shift = DAG.getSplatValue(ShiftOp))
+      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
+        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
+        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
+        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
+        Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
+                            Add.getOperand(0), ShiftOp);
+        return true;
+      }
+  }
+
+  return false;
+}
+
+// Analyse the specified address returning true if a more optimal addressing
+// mode is available. When returning true all parameters are updated to reflect
+// their recommended values.
+static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
+                                     SDValue &BasePtr, SDValue &Index,
+                                     SelectionDAG &DAG) {
+  // Try to iteratively fold parts of the index into the base pointer to
+  // simplify the index as much as possible.
+  bool Changed = false;
+  while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
+    Changed = true;
+
+  // Only consider element types that are pointer sized as smaller types can
+  // be easily promoted.
+  EVT IndexVT = Index.getValueType();
+  if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
+    return Changed;
+
+  // Match:
+  //   Index = step(const)
+  int64_t Stride = 0;
+  if (Index.getOpcode() == ISD::STEP_VECTOR)
+    Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
+
+  // Match:
+  //   Index = step(const) << shift(const)
+  else if (Index.getOpcode() == ISD::SHL &&
+           Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
+    SDValue RHS = Index.getOperand(1);
+    if (auto *Shift =
+            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
+      int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
+      Stride = Step << Shift->getZExtValue();
+    }
+  }
+
+  // Return early because no supported pattern is found.
+  if (Stride == 0)
+    return Changed;
+
+  if (Stride < std::numeric_limits<int32_t>::min() ||
+      Stride > std::numeric_limits<int32_t>::max())
+    return Changed;
+
+  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  unsigned MaxVScale =
+      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
+  int64_t LastElementOffset =
+      IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
+
+  if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
+      LastElementOffset > std::numeric_limits<int32_t>::max())
+    return Changed;
+
+  EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
+  // Stride does not scale explicitly by 'Scale', because it happens in
+  // the gather/scatter addressing mode.
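Both foldIndexIntoBase rewrites above are plain distributivity over the gather/scatter address computation Base + Index[i] * Scale. A standalone numeric check — illustrative C++ only:

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t base = 0x1000, scale = 8, off = 3, shift = 2;
      for (int64_t x = 0; x <= 16; ++x) {
        // Base + (X + Off) * Scale == (Base + Off * Scale) + X * Scale
        assert(base + (x + off) * scale == (base + off * scale) + x * scale);
        // Base + ((X + Off) << Shift) * Scale
        //   == (Base + ((Off << Shift) * Scale)) + (X << Shift) * Scale
        assert(base + ((x + off) << shift) * scale ==
               (base + ((off << shift) * scale)) + (x << shift) * scale);
      }
    }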
+  Index = DAG.getNode(ISD::STEP_VECTOR, SDLoc(N), NewIndexVT,
+                      DAG.getTargetConstant(Stride, SDLoc(N), MVT::i32));
+  return true;
+}
+
+static SDValue performMaskedGatherScatterCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
+  assert(MGS && "Can only combine gather load or scatter store nodes");
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDLoc DL(MGS);
+  SDValue Chain = MGS->getChain();
+  SDValue Scale = MGS->getScale();
+  SDValue Index = MGS->getIndex();
+  SDValue Mask = MGS->getMask();
+  SDValue BasePtr = MGS->getBasePtr();
+  ISD::MemIndexType IndexType = MGS->getIndexType();
+
+  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
+    return SDValue();
+
+  // Here we catch such cases early and change MGATHER's IndexType to allow
+  // the use of an Index that's more legalisation friendly.
+  if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
+    SDValue PassThru = MGT->getPassThru();
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(
+        DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+        Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
+  }
+  auto *MSC = cast<MaskedScatterSDNode>(MGS);
+  SDValue Data = MSC->getValue();
+  SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
+  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
+                              Ops, MSC->getMemOperand(), IndexType,
+                              MSC->isTruncatingStore());
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -16723,6 +17969,47 @@ static SDValue performBRCONDCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
+  unsigned CC = N->getConstantOperandVal(2);
+  SDValue SUBS = N->getOperand(3);
+  SDValue Zero, CTTZ;
+
+  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
+    Zero = N->getOperand(0);
+    CTTZ = N->getOperand(1);
+  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
+    Zero = N->getOperand(1);
+    CTTZ = N->getOperand(0);
+  } else
+    return SDValue();
+
+  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
+      (CTTZ.getOpcode() == ISD::TRUNCATE &&
+       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
+    return SDValue();
+
+  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
+         "Illegal type in CTTZ folding");
+
+  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
+    return SDValue();
+
+  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
+                  ? CTTZ.getOperand(0).getOperand(0)
+                  : CTTZ.getOperand(0);
+
+  if (X != SUBS.getOperand(0))
+    return SDValue();
+
+  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
+                          ? CTTZ.getOperand(0).getValueSizeInBits()
+                          : CTTZ.getValueSizeInBits();
+  SDValue BitWidthMinusOne =
+      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
+  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
+                     BitWidthMinusOne);
+}
+
 // Optimize CSEL instructions
 static SDValue performCSELCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -16731,6 +18018,11 @@ static SDValue performCSELCombine(SDNode *N,
   if (N->getOperand(0) == N->getOperand(1))
     return N->getOperand(0);
 
+  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
+  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
+  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
+    return Folded;
+
   return performCONDCombine(N, DCI, DAG, 2, 3);
 }
 
@@ -16739,14 +18031,14 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
 
   // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
   if (Cond == ISD::SETNE && isOneConstant(RHS) &&
       LHS->getOpcode() == AArch64ISD::CSEL &&
       isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
       LHS->hasOneUse()) {
-    SDLoc DL(N);
-
     // Invert CSEL's condition.
     auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
     auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
@@ -16757,9 +18049,48 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
         DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
                     LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
                     LHS.getOperand(3));
-    return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
+    return DAG.getZExtOrTrunc(CSEL, DL, VT);
+  }
+
+  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
+  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
+      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
+      LHS->hasOneUse()) {
+    EVT TstVT = LHS->getValueType(0);
+    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
+      // This pattern will be optimised better in emitComparison.
+      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
+      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
+                                DAG.getConstant(TstImm, DL, TstVT));
+      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
+    }
+  }
+
+  return SDValue();
+}
+
+// Replace a flag-setting operator (eg ANDS) with the generic version
+// (eg AND) if the flag is unused.
+static SDValue performFlagSettingCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
+                                         unsigned GenericOpcode) {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // If the flag result isn't used, convert back to a generic opcode.
+  if (!N->hasAnyUseOfValue(1)) {
+    SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
+    return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
+                                  DL);
   }
 
+  // Combine identical generic nodes into this node, re-using the result.
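The foldCSELofCTTZ rewrite above relies on AArch64's CTTZ (RBIT + CLZ) returning the full bit width for a zero input, so masking with bitwidth-1 reproduces the select-zero behaviour. A standalone check using C++20's std::countr_zero, which follows the same convention — illustrative only:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 1u, 2u, 0x80000000u, 0xdeadbeefu}) {
        // CSEL 0, cttz(X), eq(X, 0)  ==  cttz(X) & (bitwidth - 1)
        uint32_t selected = (x == 0) ? 0u : uint32_t(std::countr_zero(x));
        uint32_t masked = uint32_t(std::countr_zero(x)) & 31u;
        assert(selected == masked); // countr_zero(0) == 32, and 32 & 31 == 0
      }
    }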
+ if (SDNode *Generic = DCI.DAG.getNodeIfExists( + GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) + DCI.CombineTo(Generic, SDValue(N, 0)); + return SDValue(); } @@ -16801,27 +18132,46 @@ static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue +performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); + SelectionDAG &DAG = DCI.DAG; SDValue Pred = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); ISD::CondCode Cond = cast(N->getOperand(3))->get(); - // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne - // => inner setcc_merge_zero - if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && - LHS->getOpcode() == ISD::SIGN_EXTEND && - LHS->getOperand(0)->getValueType(0) == N->getValueType(0) && - LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && - LHS->getOperand(0)->getOperand(0) == Pred) - return LHS->getOperand(0); - if (SDValue V = performSetCCPunpkCombine(N, DAG)) return V; + if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && + LHS->getOpcode() == ISD::SIGN_EXTEND && + LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) { + // setcc_merge_zero( + // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) + // => setcc_merge_zero(pred, ...) + if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && + LHS->getOperand(0)->getOperand(0) == Pred) + return LHS->getOperand(0); + + // setcc_merge_zero( + // all_active, extend(nxvNi1 ...), != splat(0)) + // -> nxvNi1 ... + if (isAllActivePredicate(DAG, Pred)) + return LHS->getOperand(0); + + // setcc_merge_zero( + // pred, extend(nxvNi1 ...), != splat(0)) + // -> nxvNi1 and(pred, ...) + if (DCI.isAfterLegalizeDAG()) + // Do this after legalization to allow more folds on setcc_merge_zero + // to be recognized. + return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), + LHS->getOperand(0), Pred); + } + return SDValue(); } @@ -16928,12 +18278,53 @@ static SDValue performTBZCombine(SDNode *N, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } +// Swap vselect operands where it may allow a predicated operation to achieve +// the `sel`. 
+// +// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) +// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) +static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { + auto SelectA = N->getOperand(1); + auto SelectB = N->getOperand(2); + auto NTy = N->getValueType(0); + + if (!NTy.isScalableVector()) + return SDValue(); + SDValue SetCC = N->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + switch (SelectB.getOpcode()) { + default: + return SDValue(); + case ISD::FMUL: + case ISD::FSUB: + case ISD::FADD: + break; + } + if (SelectA != SelectB.getOperand(0)) + return SDValue(); + + ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); + ISD::CondCode InverseCC = + ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType()); + auto InverseSetCC = + DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0), + SetCC.getOperand(1), InverseCC); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy, + {InverseSetCC, SelectB, SelectA}); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + if (auto SwapResult = trySwapVSelectOperands(N, DAG)) + return SwapResult; + SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); @@ -17064,6 +18455,24 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } +static SDValue performDUPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the + // 128bit vector version. + if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { + EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); + if (SDNode *LN = DCI.DAG.getNodeIfExists( + N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) { + SDLoc DL(N); + return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), + DCI.DAG.getConstant(0, DL, MVT::i64)); + } + } + + return performPostLD1Combine(N, DCI, false); +} + /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -17104,13 +18513,14 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, // Check whether folding this offset is legal. It must not go out of bounds of // the referenced object to avoid violating the code model, and must be - // smaller than 2^21 because this is the largest offset expressible in all - // object formats. + // smaller than 2^20 because this is the largest offset expressible in all + // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF + // stores an immediate signed 21 bit offset.) // // This check also prevents us from folding negative offsets, which will end // up being treated in the same way as large positive ones. They could also // cause code model violations, and aren't really common enough to matter. 
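The bound used above is just the positive half of a signed 21-bit immediate field; a one-off check — illustrative C++ only:

    #include <cassert>
    #include <cstdint>

    int main() {
      // A signed 21-bit field encodes [-2^20, 2^20 - 1], so the largest
      // universally encodable positive offset is (1 << 20) - 1; the combine
      // therefore rejects Offset >= (1 << 20), and negative offsets with it.
      int64_t hi = (int64_t(1) << 20) - 1;
      int64_t lo = -(int64_t(1) << 20);
      assert(hi == 1048575 && lo == -1048576);
    }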
- if (Offset >= (1 << 21)) + if (Offset >= (1 << 20)) return SDValue(); const GlobalValue *GV = GN->getGlobal(); @@ -17621,7 +19031,7 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return performPostLD1Combine(N, DCI, true); } -SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { EVT Ty = N->getValueType(0); if (Ty.isInteger()) return SDValue(); @@ -17643,9 +19053,9 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getBitcast(Ty, Trunc); } -SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { +static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -17675,6 +19085,31 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget, + bool fixedSVEVectorVT) { + EVT VT = N->getValueType(0); + + // Don't expand for SVE2 + if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME()) + return SDValue(); + + // Don't expand for NEON + if (VT.isFixedLengthVector() && !fixedSVEVectorVT) + return SDValue(); + + SDLoc DL(N); + + SDValue Mask = N->getOperand(0); + SDValue In1 = N->getOperand(1); + SDValue In2 = N->getOperand(2); + + SDValue InvMask = DAG.getNOT(DL, Mask, VT); + SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1); + SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2); + return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17685,6 +19120,22 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ADD: case ISD::SUB: return performAddSubCombine(N, DCI, DAG); + case AArch64ISD::ANDS: + return performFlagSettingCombine(N, DCI, ISD::AND); + case AArch64ISD::ADC: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) + return R; + return foldADCToCINC(N, DAG); + case AArch64ISD::SBC: + return foldOverflowCheck(N, DAG, /* IsAdd */ false); + case AArch64ISD::ADCS: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) + return R; + return performFlagSettingCombine(N, DCI, AArch64ISD::ADC); + case AArch64ISD::SBCS: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false)) + return R; + return performFlagSettingCombine(N, DCI, AArch64ISD::SBC); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: @@ -17711,10 +19162,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::TRUNCATE: - return performVectorTruncateCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); + case ISD::EXTRACT_SUBVECTOR: + return performExtractSubvectorCombine(N, DCI, DAG); case ISD::INSERT_SUBVECTOR: return performInsertSubvectorCombine(N, DCI, DAG); case ISD::SELECT: @@ -17729,6 +19180,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); + case ISD::MGATHER: + case ISD::MSCATTER: + return performMaskedGatherScatterCombine(N, DCI, DAG); case 
ISD::VECTOR_SPLICE:
     return performSVESpliceCombine(N, DAG);
   case ISD::FP_EXTEND:
@@ -17741,7 +19195,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
@@ -17752,7 +19206,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
   case AArch64ISD::SETCC_MERGE_ZERO:
-    return performSetccMergeZeroCombine(N, DAG);
+    return performSetccMergeZeroCombine(N, DCI);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
@@ -17773,12 +19227,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performVectorShiftCombine(N, *this, DCI);
   case AArch64ISD::SUNPKLO:
     return performSunpkloCombine(N, DAG);
+  case AArch64ISD::BSP:
+    return performBSPExpandForSVE(
+        N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0)));
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
-    return performExtractVectorEltCombine(N, DAG);
+    return performExtractVectorEltCombine(N, DCI, Subtarget);
   case ISD::VECREDUCE_ADD:
     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+  case AArch64ISD::UADDV:
+    return performUADDVCombine(N, DAG);
+  case AArch64ISD::SMULL:
+  case AArch64ISD::UMULL:
+    return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -18152,6 +19614,15 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
   if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
     assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
            "Expected fp->int bitcast!");
+
+    // Bitcasting between unpacked vector types of different element counts is
+    // not a NOP because the live elements are laid out differently.
+    //                01234567
+    // e.g. nxv2i32 = XX??XX??
+    //      nxv4f16 = X?X?X?X?
+    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
+      return;
+
     SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
     return;
@@ -18169,6 +19640,53 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
 }
 
+static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG,
+                               const AArch64Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector() ||
+      (VT.getScalarType().isFloatingPoint() &&
+       !N->getFlags().hasAllowReassociation()) ||
+      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
+    return;
+
+  SDValue X = N->getOperand(0);
+  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
+  if (!Shuf) {
+    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+    X = N->getOperand(1);
+    if (!Shuf)
+      return;
+  }
+
+  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
+    return;
+
+  // Check the mask is 1,0,3,2,5,4,...
+  ArrayRef<int> Mask = Shuf->getMask();
+  for (int I = 0, E = Mask.size(); I < E; I++)
+    if (Mask[I] != (I % 2 == 0 ?
+                       I + 1 : I - 1))
+      return;
+
+  SDLoc DL(N);
+  auto LoHi = DAG.SplitVector(X, DL);
+  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
+  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
+                             LoHi.first, LoHi.second);
+
+  // Shuffle the elements back into order.
+  SmallVector<int> NMask;
+  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
+    NMask.push_back(I);
+    NMask.push_back(I);
+  }
+  Results.push_back(
+      DAG.getVectorShuffle(VT, DL,
+                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
                                        DAG.getUNDEF(LoHi.first.getValueType())),
+                           DAG.getUNDEF(VT), NMask));
+}
+
 static void ReplaceReductionResults(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG, unsigned InterOp,
@@ -18346,6 +19864,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
+  case ISD::ADD:
+  case ISD::FADD:
+    ReplaceAddWithADDP(N, Results, DAG, Subtarget);
+    return;
   case ISD::CTPOP:
     if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
@@ -18406,8 +19928,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
     ReplaceExtractSubVectorResults(N, Results, DAG);
     return;
   case ISD::INSERT_SUBVECTOR:
-    // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
-    // to common code for result type legalisation
+  case ISD::CONCAT_VECTORS:
+    // Custom lowering has been requested for INSERT_SUBVECTOR and
+    // CONCAT_VECTORS -- but delegate to common code for result type
+    // legalisation
     return;
   case ISD::INTRINSIC_WO_CHAIN: {
     EVT VT = N->getValueType(0);
@@ -18485,11 +20009,11 @@ bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
 
   if (auto LI = dyn_cast<LoadInst>(I))
     return LI->getType()->getPrimitiveSizeInBits() == 128 &&
-           LI->getAlignment() >= 16;
+           LI->getAlign() >= Align(16);
 
   if (auto SI = dyn_cast<StoreInst>(I))
     return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
-           SI->getAlignment() >= 16;
+           SI->getAlign() >= Align(16);
 
   return false;
 }
@@ -18502,12 +20026,12 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic(
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.
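ReplaceAddWithADDP above matches add(x, shuffle(x, <1,0,3,2,...>)): every pair of adjacent lanes ends up holding the same pairwise sum, which is one ADDP result with each element duplicated. A standalone scalar model — illustrative C++ only:

    #include <cassert>

    int main() {
      int x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      int sum[8], addp[4];
      for (int i = 0; i < 8; ++i)
        sum[i] = x[i] + x[i ^ 1];          // x + shuffle(x, <1,0,3,2,5,4,7,6>)
      for (int i = 0; i < 4; ++i)
        addp[i] = x[2 * i] + x[2 * i + 1]; // pairwise add (ADDP)
      for (int i = 0; i < 8; ++i)
        assert(sum[i] == addp[i / 2]);     // each ADDP lane appears twice
    }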
-bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLoweringBase::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - if (Size != 128) - return false; - - return !isOpSuitableForLDPSTP(SI); + if (Size != 128 || isOpSuitableForLDPSTP(SI)) + return AtomicExpansionKind::None; + return AtomicExpansionKind::Expand; } // Loads and stores less than 128-bits are already atomic; ones above that @@ -18627,7 +20151,10 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); - Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); + CallInst *CI = Builder.CreateCall(Ldxr, Addr); + CI->addParamAttr( + 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); } @@ -18668,10 +20195,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); Val = Builder.CreateBitCast(Val, IntValTy); - return Builder.CreateCall(Stxr, - {Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr}); + CallInst *CI = Builder.CreateCall( + Stxr, {Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr}); + CI->addParamAttr(1, Attribute::get(Builder.getContext(), + Attribute::ElementType, Val->getType())); + return CI; } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( @@ -18993,8 +20523,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use // AArch64SVEPredPattern::all, which can enable the use of unpredicated // variants of instructions when available. 
- const auto &Subtarget = - static_cast(DAG.getSubtarget()); + const auto &Subtarget = DAG.getSubtarget(); unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); if (MaxSVESize && MinSVESize == MaxSVESize && @@ -19080,22 +20609,23 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( MemVT = MemVT.changeTypeToInteger(); } - auto NewLoad = DAG.getMaskedLoad( + SDValue NewLoad = DAG.getMaskedLoad( LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); + SDValue Result = NewLoad; if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { EVT ExtendVT = ContainerVT.changeVectorElementType( Load->getMemoryVT().getVectorElementType()); - NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG); - NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, - Pg, NewLoad, DAG.getUNDEF(ContainerVT)); + Result = getSVESafeBitCast(ExtendVT, Result, DAG); + Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, + Pg, Result, DAG.getUNDEF(ContainerVT)); } - auto Result = convertFromScalableVector(DAG, VT, NewLoad); - SDValue MergedValues[2] = {Result, Load->getChain()}; + Result = convertFromScalableVector(DAG, VT, Result); + SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } @@ -19143,19 +20673,20 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( IsPassThruZeroOrUndef = true; } - auto NewLoad = DAG.getMaskedLoad( + SDValue NewLoad = DAG.getMaskedLoad( ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); + SDValue Result = NewLoad; if (!IsPassThruZeroOrUndef) { SDValue OldPassThru = convertToScalableVector(DAG, ContainerVT, Load->getPassThru()); - NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru); + Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru); } - auto Result = convertFromScalableVector(DAG, VT, NewLoad); - SDValue MergedValues[2] = {Result, Load->getChain()}; + Result = convertFromScalableVector(DAG, VT, Result); + SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } @@ -19232,7 +20763,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( // Scalable vector i32/i64 DIV is supported. if (EltVT == MVT::i32 || EltVT == MVT::i64) - return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, PredOpcode); // Scalable vector i8/i16 DIV is not supported. Promote it to i32. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); @@ -19387,13 +20918,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( // NOTE: The results for inactive lanes are undefined. SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, - unsigned NewOp, - bool OverrideNEON) const { + unsigned NewOp) const { EVT VT = Op.getValueType(); SDLoc DL(Op); auto Pg = getPredicateForVector(DAG, DL, VT); - if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) { + if (VT.isFixedLengthVector()) { + assert(isTypeLegal(VT) && "Expected only legal fixed-width types"); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); // Create list of operands by converting existing ones to scalable types. 
@@ -19411,8 +20942,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, continue; } - assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && - "Only fixed length vectors are supported!"); + assert(isTypeLegal(V.getValueType()) && + "Expected only legal fixed-width types"); Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); } @@ -19543,7 +21074,9 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, SDValue VecOp = ScalarOp.getOperand(0); EVT SrcVT = VecOp.getValueType(); - if (useSVEForFixedLengthVectorVT(SrcVT, true)) { + if (useSVEForFixedLengthVectorVT( + SrcVT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); } @@ -19950,6 +21483,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); + // Safe bitcasting between unpacked vector types of different element counts + // is currently unsupported because the following is missing the necessary + // work to ensure the result's elements live where they're supposed to within + // an SVE register. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + assert((VT.getVectorElementCount() == InVT.getVectorElementCount() || + VT == PackedVT || InVT == PackedInVT) && + "Unexpected bitcast!"); + // Pack input if required. if (InVT != PackedInVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); @@ -20016,6 +21560,13 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { + return Op.getOpcode() == AArch64ISD::DUP || + (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || + TargetLowering::isTargetCanonicalConstantNode(Op); +} + bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2138c0ffe70a..06ea918ea32e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -55,6 +55,8 @@ enum NodeType : unsigned { // x29, x29` marker instruction. CALL_RVMARKER, + CALL_BTI, // Function call followed by a BTI instruction. + // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. TLSDESC_CALLSEQ, @@ -79,7 +81,6 @@ enum NodeType : unsigned { // Predicated instructions where inactive lanes produce undefined results. 
ABDS_PRED, ABDU_PRED, - ADD_PRED, FADD_PRED, FDIV_PRED, FMA_PRED, @@ -98,7 +99,6 @@ enum NodeType : unsigned { SMIN_PRED, SRA_PRED, SRL_PRED, - SUB_PRED, UDIV_PRED, UMAX_PRED, UMIN_PRED, @@ -158,6 +158,7 @@ enum NodeType : unsigned { DUPLANE16, DUPLANE32, DUPLANE64, + DUPLANE128, // Vector immedate moves MOVI, @@ -232,15 +233,10 @@ enum NodeType : unsigned { SADDV, UADDV, - // Vector halving addition - SHADD, - UHADD, - - // Vector rounding halving addition - SRHADD, - URHADD, - - // Unsigned Add Long Pairwise + // Add Pairwise of two vectors + ADDP, + // Add Long Pairwise + SADDLP, UADDLP, // udot/sdot instructions @@ -411,6 +407,10 @@ enum NodeType : unsigned { SSTNT1_PRED, SSTNT1_INDEX_PRED, + // SME + RDSVL, + REVD_MERGE_PASSTHRU, + // Asserts that a function argument (i32) is zero-extended to i8 by // the caller ASSERT_ZEXT_BOOL, @@ -462,23 +462,6 @@ enum NodeType : unsigned { } // end namespace AArch64ISD -namespace { - -// Any instruction that defines a 32-bit result zeros out the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper -// 32 bits, they're probably just qualifying a CopyFromReg. -static inline bool isDef32(const SDNode &N) { - unsigned Opc = N.getOpcode(); - return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && - Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && - Opc != ISD::AssertZext && Opc != ISD::AssertAlign && - Opc != ISD::FREEZE; -} - -} // end anonymous namespace - namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits /// 23:22 of FPCR. @@ -501,6 +484,11 @@ public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op + /// (op x, y), c1) where N0 is (op x, c1) and N1 is y. + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; @@ -573,6 +561,17 @@ public: MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -610,8 +609,8 @@ public: bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; - bool isMulAddWithConstProfitable(const SDValue &AddNode, - const SDValue &ConstNode) const override; + bool isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const override; bool shouldConsiderGEPOffsetSplit() const override; @@ -651,6 +650,10 @@ public: bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. 
+  bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+                                         CombineLevel Level) const override;
+
   /// Returns true if it is beneficial to convert a load of a constant
   /// to just the constant itself.
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -680,7 +683,8 @@ public:
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
-  bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
@@ -898,11 +902,8 @@ private:
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
-  bool isEligibleForTailCallOptimization(
-      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-      const SmallVectorImpl<ISD::OutputArg> &Outs,
-      const SmallVectorImpl<SDValue> &OutVals,
-      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+  bool
+  isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const;
 
   /// Finds the incoming stack arguments which overlap the given fixed stack
   /// object and incorporates their load into the current chain. This prevents
@@ -980,8 +981,8 @@ private:
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
-                              bool OverrideNEON = false) const;
+  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+                              unsigned NewOp) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -1052,6 +1053,8 @@ private:
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
+  SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                        SmallVectorImpl<SDNode *> &Created) const override;
   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                           int &ExtraSteps, bool &UseOneConst,
                           bool Reciprocal) const override;
@@ -1093,7 +1096,7 @@ private:
   }
 
   bool shouldExtendGSIndex(EVT VT, EVT &EltTy) const override;
-  bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+  bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
@@ -1129,6 +1132,8 @@ private:
                                          TargetLoweringOpt &TLO,
                                          unsigned Depth) const override;
 
+  bool isTargetCanonicalConstantNode(SDValue Op) const override;
+
   // Normally SVE is only used for byte size vectors that do not fit within a
   // NEON vector. This changes when OverrideNEON is true, allowing SVE to be
   // used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index b220929514f9..c477a44b13b2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -27,22 +27,43 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>;
 // supported, but when they're relaxed and anything can be used, all the
 // standard modes would be valid and may give efficiency gains.
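For context on the pattern split introduced in the next hunks: the three atomic-load orderings being distinguished map naturally onto AArch64's load flavours. The following standalone C++ illustrates how a compiler typically lowers them (LDAPR requires the RCPC extension; without it, acquire loads also use LDAR):

    #include <atomic>
    #include <cstdint>

    uint64_t load_relaxed(const std::atomic<uint64_t> &a) {
      return a.load(std::memory_order_relaxed); // plain LDR
    }
    uint64_t load_acquire(const std::atomic<uint64_t> &a) {
      return a.load(std::memory_order_acquire); // LDAPR with RCPC, else LDAR
    }
    uint64_t load_seq_cst(const std::atomic<uint64_t> &a) {
      return a.load(std::memory_order_seq_cst); // always LDAR
    }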
+// An atomic load operation that does not need either acquire or release +// semantics. +class relaxed_load + : PatFrag<(ops node:$ptr), (base node:$ptr)> { + let IsAtomic = 1; + let IsAtomicOrderingAcquireOrStronger = 0; +} + // An atomic load operation that actually needs acquire semantics. class acquiring_load : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; - let IsAtomicOrderingAcquireOrStronger = 1; + let IsAtomicOrderingAcquire = 1; } -// An atomic load operation that does not need either acquire or release -// semantics. -class relaxed_load +// An atomic load operation that needs sequential consistency. +class seq_cst_load : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; - let IsAtomicOrderingAcquireOrStronger = 0; + let IsAtomicOrderingSequentiallyConsistent = 1; +} + +// RCPC extension, currently opt-in under a separate feature. +let Predicates = [HasLDAPR] in { + // v8.3 Release Consistent Processor Consistent support, optional in v8.2. + // 8-bit loads + def : Pat<(acquiring_load GPR64sp:$ptr), (LDAPRB GPR64sp:$ptr)>; + // 16-bit loads + def : Pat<(acquiring_load GPR64sp:$ptr), (LDAPRH GPR64sp:$ptr)>; + // 32-bit loads + def : Pat<(acquiring_load GPR64sp:$ptr), (LDAPRW GPR64sp:$ptr)>; + // 64-bit loads + def : Pat<(acquiring_load GPR64sp:$ptr), (LDAPRX GPR64sp:$ptr)>; } // 8-bit loads +def : Pat<(seq_cst_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)), @@ -58,6 +79,7 @@ def : Pat<(relaxed_load (LDURBBi GPR64sp:$Rn, simm9:$offset)>; // 16-bit loads +def : Pat<(seq_cst_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)), @@ -73,6 +95,7 @@ def : Pat<(relaxed_load (LDURHHi GPR64sp:$Rn, simm9:$offset)>; // 32-bit loads +def : Pat<(seq_cst_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)), @@ -88,6 +111,7 @@ def : Pat<(relaxed_load (LDURWi GPR64sp:$Rn, simm9:$offset)>; // 64-bit loads +def : Pat<(seq_cst_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)), @@ -490,7 +514,8 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch), let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch", mayLoad = 1, mayStore = 1 in { -class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch), +class cmp_swap_128 : Pseudo<(outs GPR64common:$RdLo, GPR64common:$RdHi, + GPR32common:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 4c1e41b7efee..78bc1b8c6f02 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -109,15 +109,19 @@ class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag : PatFrag<(ops node:$LHS), res>; -// Helper fragment for an extract of the high portion of a 128-bit vector.
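The cmp_swap_128 pseudo above (its outputs now constrained to GPR64common) backs 16-byte compare-and-swap. At the source level that is a double-word CAS like the sketch below; whether it is lock-free, and whether it lowers to CASP or an LDXP/STXP loop, depends on LSE availability, and some toolchains need -latomic:

    #include <atomic>
    #include <cstdint>

    struct alignas(16) Pair { uint64_t Lo, Hi; }; // 16 bytes, 16-byte aligned

    bool cas128(std::atomic<Pair> &A, Pair &Expected, Pair Desired) {
      return A.compare_exchange_strong(Expected, Desired);
    }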
+// Helper fragment for an extract of the high portion of a 128-bit vector. The +// ComplexPattern match both extract_subvector and bitcast(extract_subvector(..)). def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; + ComplexPattern; def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; + ComplexPattern; def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + ComplexPattern; + +def extract_high_dup_v8i16 : + BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>; +def extract_high_dup_v4i32 : + BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>; //===----------------------------------------------------------------------===// // Asm Operand Classes. @@ -1178,6 +1182,13 @@ def fpimm32XForm : SDNodeXFormgetTargetConstant(enc, SDLoc(N), MVT::i32); }]>; +def fpimm32SIMDModImmType4XForm : SDNodeXFormgetValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>; + def fpimm64XForm : SDNodeXFormgetValueAPF(); uint32_t enc = AArch64_AM::getFP64Imm(InVal); @@ -1199,6 +1210,13 @@ def fpimm32 : Operand, let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } + +def fpimm32SIMDModImmType4 : FPImmLeaf { +} + def fpimm64 : Operand, FPImmLeaf, GISDNodeXFormEquiv; def gi_fpimm64 : GICustomOperandRenderer<"renderFPImm64">, GISDNodeXFormEquiv; +def gi_fpimm32SIMDModImmType4 : + GICustomOperandRenderer<"renderFPImm32SIMDModImmType4">, + GISDNodeXFormEquiv; // Vector lane operands class AsmVectorIndex : AsmOperandClass { @@ -1261,8 +1282,12 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -defm VectorIndex0 : VectorIndex; + } +} defm VectorIndex1 : VectorIndex; defm VectorIndexB : VectorIndex, ImmLeaf { let ParserMatchClass = Imm0_0Operand; let PrintMethod = "printMatrixIndex"; + let OperandNamespace = "AArch64"; + let OperandType = "OPERAND_IMPLICIT_IMM_0"; } def sme_elm_idx0_1 : Operand, ImmLeaf opc1, string insn> { //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", []>, +class ExceptionGeneration op1, bits<2> ll, string asm, + list pattern = []> + : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", pattern>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; @@ -4542,6 +4570,7 @@ let Predicates = [HasFPARMv8] in { // Floating point to integer conversion //--- +let mayRaiseFPException = 1 in class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, string asm, list pattern> @@ -4561,7 +4590,7 @@ class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, let Inst{4-0} = Rd; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, Operand immType, string asm, list pattern> @@ -4683,7 +4712,7 @@ multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, // Integer to floating point conversion //--- -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +let mayStore = 0, mayLoad = 0, 
hasSideEffects = 0, mayRaiseFPException = 1 in class BaseIntegerToFP pattern> @@ -4701,6 +4730,7 @@ class BaseIntegerToFP @@ -4937,6 +4967,7 @@ multiclass UnscaledConversion { // Floating point conversion //--- +let mayRaiseFPException = 1 in class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, RegisterClass srcType, string asm, list pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, @@ -4963,15 +4994,15 @@ multiclass FPConversion { // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR64:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR32:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>; + [(set FPR64:$Rd, (any_fpextend FPR32:$Rn))]>; // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, @@ -4999,8 +5030,9 @@ class BaseSingleOperandFPData opcode, RegisterClass regtype, } multiclass SingleOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - + SDPatternOperator node = null_frag, + int fpexceptions = 1> { + let mayRaiseFPException = fpexceptions in { def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; @@ -5013,8 +5045,14 @@ multiclass SingleOperandFPData opcode, string asm, def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> { let Inst{23-22} = 0b01; // 64-bit size flag } + } } +multiclass SingleOperandFPDataNoException opcode, string asm, + SDPatternOperator node = null_frag> + : SingleOperandFPData; + +let mayRaiseFPException = 1 in multiclass SingleOperandFPNo16 opcode, string asm, SDPatternOperator node = null_frag>{ @@ -5035,7 +5073,7 @@ multiclass FRIntNNT opcode, string asm, SDPatternOperator node = null_fr // Two operand floating point data processing //--- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseTwoOperandFPData opcode, RegisterClass regtype, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), @@ -5075,7 +5113,8 @@ multiclass TwoOperandFPData opcode, string asm, } } -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { +multiclass TwoOperandFPDataNeg opcode, string asm, + SDPatternOperator node> { def Hrr : BaseTwoOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag @@ -5098,6 +5137,7 @@ multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { // Three operand floating point data processing //--- +let mayRaiseFPException = 1 in class BaseThreeOperandFPData pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), @@ -5142,7 +5182,7 @@ multiclass ThreeOperandFPData pat> @@ -5161,7 +5201,7 @@ class BaseOneOperandFPComparison pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, @@ -5218,7 +5258,7 @@ multiclass FPComparison pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), @@ -5544,6 +5584,7 @@ multiclass SIMDThreeSameVectorB opc, string asm, } // As above, but only floating point elements supported. 
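Much of this hunk threads mayRaiseFPException through the FP instruction classes and switches fpextend patterns to any_fpextend so they also cover the strict (constrained) variants. The flag matters because under strict FP semantics the FPSR side effects of FCVT-class instructions are observable, so they must not be reordered, speculated, or deleted. A user-level illustration in C++ (assumes a platform that honours FENV_ACCESS; not every compiler supports the pragma):

    #include <cfenv>
    #include <cstdio>

    #pragma STDC FENV_ACCESS ON

    int main() {
      std::feclearexcept(FE_ALL_EXCEPT);
      volatile double Big = 1e300;
      volatile float Narrowed = static_cast<float>(Big); // FCVT overflows float
      if (std::fetestexcept(FE_OVERFLOW))
        std::puts("fcvt raised FE_OVERFLOW");
      (void)Narrowed;
    }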
+let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFP opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -5565,6 +5606,7 @@ multiclass SIMDThreeSameVectorFP opc, [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFPCmp opc, string asm, SDPatternOperator OpNode> { @@ -5587,6 +5629,7 @@ multiclass SIMDThreeSameVectorFPCmp opc, [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFPTied opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -5614,6 +5657,7 @@ multiclass SIMDThreeSameVectorFPTied opc, } // As above, but D and B sized elements unsupported. +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorHS opc, string asm, SDPatternOperator OpNode> { def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, @@ -5718,6 +5762,7 @@ multiclass SIMDThreeSameVectorDot size, string asm, string kind1, string kind2, RegisterOperand RegType, ValueType AccumType, ValueType InputType, @@ -5986,7 +6031,9 @@ multiclass SIMDTwoVectorBH opc, string asm, // Supports H, S and D element sizes, uses high bit of the size field // as an extra opcode bit. multiclass SIMDTwoVectorFP opc, string asm, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, + int fpexceptions = 1> { + let mayRaiseFPException = fpexceptions in { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, asm, ".4h", ".4h", @@ -6004,9 +6051,15 @@ multiclass SIMDTwoVectorFP opc, string asm, def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + } } +multiclass SIMDTwoVectorFPNoException opc, string asm, + SDPatternOperator OpNode> + : SIMDTwoVectorFP; + // Supports only S and D element sizes +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorSD opc, string asm, SDPatternOperator OpNode = null_frag> { @@ -6036,7 +6089,7 @@ multiclass SIMDTwoVectorS opc, string asm, [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } - +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorFPToInt opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -6058,6 +6111,7 @@ multiclass SIMDTwoVectorFPToInt opc, string asm, [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorIntToFP opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -6209,6 +6263,7 @@ multiclass SIMDCmpTwoVector opc, string asm, multiclass SIMDFPCmpTwoVector opc, string asm, SDNode OpNode> { + let mayRaiseFPException = 1 in { let Predicates = [HasNEON, HasFullFP16] in { def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, asm, ".4h", "0.0", @@ -6226,6 +6281,7 @@ multiclass SIMDFPCmpTwoVector opc, def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; + } let Predicates = [HasNEON, HasFullFP16] in { def : InstAlias opc, (!cast(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDFPCvtTwoVector size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, @@ -6275,7 +6331,7 @@ class BaseSIMDFPCvtTwoVector size, 
bits<5> opcode, let Inst{4-0} = Rd; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, @@ -6457,8 +6513,8 @@ multiclass SIMDDifferentThreeVectorBD opc, string asm, asm#"2", ".1q", ".2d", ".2d", []>; } - def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } @@ -6471,8 +6527,8 @@ multiclass SIMDLongThreeVectorHS opc, string asm, def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector opc, string asm, def v4i32_v2i64 : BaseSIMDDifferentThreeVector; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorBHSabdl opc, string asm, @@ -6495,8 +6551,8 @@ multiclass SIMDLongThreeVectorBHSabdl opc, string asm, V128, V128, V128, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector opc, string asm, V128, V128, V128, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))))]>; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector opc, string asm, V128, V128, V128, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))))]>; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDLongThreeVectorTiedBHSabal opc, @@ -6535,8 +6591,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal opc, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied opc, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (add (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))))))]>; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied opc, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))))))]>; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; } multiclass SIMDLongThreeVectorBHS opc, string asm, @@ -6574,8 +6630,8 @@ multiclass SIMDLongThreeVectorBHS opc, string asm, def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + [(set (v8i16 V128:$Rd), (OpNode 
(extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector opc, string asm, def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector opc, string asm, def v4i32_v2i64 : BaseSIMDDifferentThreeVector; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorTiedBHS opc, @@ -6609,8 +6665,8 @@ multiclass SIMDLongThreeVectorTiedBHS opc, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied opc, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied opc, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, @@ -6651,8 +6707,8 @@ multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))))]>; + (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied opc, string asm, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), - (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))))]>; + (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDWideThreeVectorBHS opc, string asm, @@ -6679,7 +6735,7 @@ multiclass SIMDWideThreeVectorBHS opc, string asm, V128, V128, V128, asm#"2", ".8h", ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector opc, string asm, V128, V128, V128, asm#"2", ".4s", ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector opc, string asm, V128, V128, V128, asm#"2", ".2d", ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } //---------------------------------------------------------------------------- @@ -6876,7 +6932,7 @@ multiclass SIMDThreeScalarHSTied opc, string asm> { multiclass SIMDFPThreeScalar opc, string asm, SDPatternOperator OpNode = null_frag, Predicate pred = HasNEON> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in { let Predicates = [pred] in { def NAME#64 : 
BaseSIMDThreeScalar; @@ -6895,7 +6951,7 @@ multiclass SIMDFPThreeScalar opc, string asm, multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in { def NAME#64 : BaseSIMDThreeScalar; def NAME#32 : BaseSIMDThreeScalar size, bits<2> size2, bits<5> opcode, let Inst{4-0} = Rd; } +let mayRaiseFPException = 1 in class SIMDInexactCvtTwoScalar opcode, string asm> : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, @@ -7048,11 +7105,13 @@ multiclass SIMDCmpTwoScalarD opc, string asm, multiclass SIMDFPCmpTwoScalar opc, string asm, SDPatternOperator OpNode> { + let mayRaiseFPException = 1 in { def v1i64rz : BaseSIMDCmpTwoScalar; def v1i32rz : BaseSIMDCmpTwoScalar; let Predicates = [HasNEON, HasFullFP16] in { def v1i16rz : BaseSIMDCmpTwoScalar; } + } def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; @@ -7076,6 +7135,7 @@ multiclass SIMDTwoScalarD opc, string asm, (!cast(NAME # "v1i64") FPR64:$Rn)>; } +let mayRaiseFPException = 1 in multiclass SIMDFPTwoScalar opc, string asm, Predicate pred = HasNEON> { let Predicates = [pred] in { @@ -7087,6 +7147,7 @@ multiclass SIMDFPTwoScalar opc, string asm, } } +let mayRaiseFPException = 1 in multiclass SIMDFPTwoScalarCVT opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDTwoScalar opc, string asm> { asm, ".2d">; } +let mayRaiseFPException = 1 in multiclass SIMDFPPairwiseScalar opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, @@ -7232,6 +7294,7 @@ multiclass SIMDAcrossLanesHSD opcode, string asm> { asm, ".4s", []>; } +let mayRaiseFPException = 1 in multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, Intrinsic intOp> { let Predicates = [HasNEON, HasFullFP16] in { @@ -7351,7 +7414,7 @@ class SIMDMovAlias { let Inst{20-16} = 0b00001; } @@ -7398,7 +7461,7 @@ multiclass SMov { multiclass UMov { // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) // streaming mode. 
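For orientation, two source-level anchors for the element-access machinery being reworked in this file, written with ACLE intrinsics from arm_neon.h (AArch64 targets only; the instruction comments show the expected, not guaranteed, selection):

    #include <arm_neon.h>

    // UMov multiclass: lane extraction to a general register. Index 0 stays
    // legal in SME streaming mode, hence the HasNEONorSME predicate below.
    unsigned lane0(uint8x16_t V) { return vgetq_lane_u8(V, 0); } // UMOV Wd, Vn.B[0]

    // extract_high_* fragments: the "2" forms of widening ops read the high
    // half of a 128-bit register directly, with no separate extract step.
    int32x4_t mulHigh(int16x8_t A, int16x8_t B) {
      return vmull_high_s16(A, B); // SMULL2 Vd.4S, Vn.8H, Vm.8H
    }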
- let Predicates = [HasNEONorStreamingSVE] in { + let Predicates = [HasNEONorSME] in { def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> { let Inst{20-16} = 0b00001; } @@ -8048,6 +8111,7 @@ multiclass SIMDThreeSameVectorBF16DotI { ".2h", V128, v4f32, v8bf16>; } +let mayRaiseFPException = 1 in class SIMDBF16MLAL : BaseSIMDThreeSameVectorTied let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } +let mayRaiseFPException = 1 in class SIMDBF16MLALIndex : I<(outs V128:$dst), (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, @@ -8095,18 +8160,21 @@ class SIMDThreeSameVectorBF16MatrixMul ", $Rm", ".8h", "}"); } +let mayRaiseFPException = 1 in class SIMD_BFCVTN : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, "bfcvtn", ".4h", ".4s", [(set (v8bf16 V128:$Rd), (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; +let mayRaiseFPException = 1 in class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, "bfcvtn2", ".8h", ".4s", [(set (v8bf16 V128:$dst), (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; +let mayRaiseFPException = 1 in class BF16ToSinglePrecision : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, @@ -8160,6 +8228,7 @@ multiclass SIMDThreeSameVectorDotIndex size, string as } // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) +let mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorFMLIndex opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, @@ -8187,6 +8256,7 @@ multiclass SIMDThreeSameVectorFMLIndex opc, string asm, V128, v4f32, v8f16, OpNode>; } +let mayRaiseFPException = 1 in multiclass SIMDFPIndexed opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -8369,6 +8439,7 @@ multiclass SIMDFPIndexedTiedPatterns { V128:$Rm, VectorIndexD:$idx)>; } +let mayRaiseFPException = 1 in multiclass SIMDFPIndexedTied opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, @@ -8701,9 +8772,8 @@ multiclass SIMDIndexedLongSD opc, string asm, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8728,9 +8798,8 @@ multiclass SIMDIndexedLongSD opc, string asm, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8793,10 +8862,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8825,10 +8892,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, 
[(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), (v2i64 (int_aarch64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8881,9 +8946,8 @@ multiclass SIMDVectorIndexedLongSD opc, string asm, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8908,9 +8972,8 @@ multiclass SIMDVectorIndexedLongSD opc, string asm, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8940,9 +9003,8 @@ multiclass SIMDVectorIndexedLongSDTied opc, string asm, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8967,9 +9029,8 @@ multiclass SIMDVectorIndexedLongSDTied opc, string asm, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -9654,7 +9715,7 @@ multiclass SIMDVectorLShiftLongBHSD opc, string asm, V128, V128, vecshiftL8, asm#"2", ".8h", ".16b", [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), vecshiftL8:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } @@ -9670,7 +9731,7 @@ multiclass SIMDVectorLShiftLongBHSD opc, string asm, V128, V128, vecshiftL16, asm#"2", ".4s", ".8h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), vecshiftL16:$imm))]> { bits<4> imm; let Inst{19-16} = imm; @@ -9687,7 +9748,7 @@ multiclass SIMDVectorLShiftLongBHSD opc, string asm, V128, V128, vecshiftL32, asm#"2", ".2d", ".4s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), vecshiftL32:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } @@ -10671,7 +10732,7 @@ def complexrotateopodd : Operand, TImmLeaf= 0 && Imm < let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; let PrintMethod = "printComplexRotationOp<180, 90>"; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, RegisterOperand 
regtype, Operand rottype, string asm, string kind, list pattern> @@ -10742,7 +10803,7 @@ multiclass SIMDThreeSameVectorComplexHSD opcode, Operand rottype, } } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorTiedComplex size, bits<3> opcode, RegisterOperand regtype, @@ -10814,7 +10875,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD opcode, } } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDIndexedTiedComplex size, bit opc1, bit opc2, RegisterOperand dst_reg, RegisterOperand lhs_reg, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index a9191924129c..835a7b6cc81d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -1094,7 +1095,10 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, return true; default:; } - return isSEHInstruction(MI); + if (isSEHInstruction(MI)) + return true; + auto Next = std::next(MI.getIterator()); + return Next != MBB->end() && Next->isCFIInstruction(); } /// analyzeCompare - For a comparison instruction, return the source registers @@ -1435,7 +1439,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; const MCInstrDesc &MCID = get(NewOpc); CmpInstr.setDesc(MCID); - CmpInstr.RemoveOperand(DeadNZCVIdx); + CmpInstr.removeOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -1547,27 +1551,6 @@ findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { } } -namespace { - -struct UsedNZCV { - bool N = false; - bool Z = false; - bool C = false; - bool V = false; - - UsedNZCV() = default; - - UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { - this->N |= UsedFlags.N; - this->Z |= UsedFlags.Z; - this->C |= UsedFlags.C; - this->V |= UsedFlags.V; - return *this; - } -}; - -} // end anonymous namespace - /// Find a condition code used by the instruction. /// Returns AArch64CC::Invalid if either the instruction does not use condition /// codes or we don't optimize CmpInstr in the presence of such instructions. @@ -1622,15 +1605,15 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { return UsedFlags; } -/// \returns Conditions flags used after \p CmpInstr in its MachineBB if they -/// are not containing C or V flags and NZCV flags are not alive in successors -/// of the same \p CmpInstr and \p MI parent. \returns None otherwise. +/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. +/// \returns None otherwise. /// /// Collect instructions using those flags in \p CCUseInstrs if provided.
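With the C/V screening hoisted out of examineCFlagsUse, each caller now rejects uses of C or V itself (both canInstrSubstituteCmpInstr and canCmpInstrBeRemoved below check NZCVUsed->C || NZCVUsed->V). Those two flags are special because a flag-setting SUBS can stand in for a CMP only for consumers that read N or Z. A miniature model of the AArch64 flag computation (illustrative C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    struct NZCV { bool N, Z, C, V; };

    static NZCV subs32(uint32_t A, uint32_t B) {
      uint32_t R = A - B;
      bool N = R >> 31;                    // result negative
      bool Z = R == 0;                     // result zero
      bool C = A >= B;                     // no borrow occurred
      bool V = ((A ^ B) & (A ^ R)) >> 31;  // signed overflow
      return {N, Z, C, V};
    }

    int main() {
      assert(subs32(5, 5).Z);              // EQ/NE consult only Z
      NZCV F = subs32(0x80000000u, 1);     // INT_MIN - 1 overflows
      assert(F.V && !F.N);                 // signed LT is N != V, so V matters
    }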
-static Optional -examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, - const TargetRegisterInfo &TRI, - SmallVectorImpl *CCUseInstrs = nullptr) { +Optional +llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl *CCUseInstrs) { MachineBasicBlock *CmpParent = CmpInstr.getParent(); if (MI.getParent() != CmpParent) return None; @@ -1652,8 +1635,6 @@ examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) break; } - if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V) - return None; return NZCVUsedAfterCmp; } @@ -1684,7 +1665,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) return false; - if (!examineCFlagsUse(MI, CmpInstr, TRI)) + Optional NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); + if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V) return false; AccessKind AccessToCheck = AK_Write; @@ -1773,7 +1755,7 @@ static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); // Condition flags are not used in CmpInstr basic block successors and only // Z or N flags allowed to be used after CmpInstr within its basic block - if (!NZCVUsedAfterCmp) + if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) return false; // Z or N flag used after CmpInstr must correspond to the flag used in MI if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || @@ -2270,6 +2252,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LD1SW_D_IMM: case AArch64::LD1D_IMM: + case AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST1B_IMM: case AArch64::ST1B_H_IMM: case AArch64::ST1B_S_IMM: @@ -2281,6 +2276,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::ST1W_D_IMM: case AArch64::ST1D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case AArch64::ST4D_IMM: + case AArch64::LD1RB_IMM: case AArch64::LD1RB_H_IMM: case AArch64::LD1RB_S_IMM: @@ -2897,6 +2905,45 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -8; MaxOffset = 7; break; + case AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + Scale = TypeSize::Scalable(32); + Width = SVEMaxBytesPerVector * 2; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + Scale = TypeSize::Scalable(48); + Width = SVEMaxBytesPerVector * 3; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case 
AArch64::ST4D_IMM: + Scale = TypeSize::Scalable(64); + Width = SVEMaxBytesPerVector * 4; + MinOffset = -8; + MaxOffset = 7; + break; case AArch64::LD1B_H_IMM: case AArch64::LD1SB_H_IMM: case AArch64::LD1H_S_IMM: @@ -3105,6 +3152,86 @@ bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { return isPreLd(MI) || isPreSt(MI); } +bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + case AArch64::STGPi: + return true; + } +} + +const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 + : 1; + return MI.getOperand(Idx); +} + +const MachineOperand & +AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 + : 2; + return MI.getOperand(Idx); +} + +static const TargetRegisterClass *getRegClass(const MachineInstr &MI, + Register Reg) { + if (MI.getParent() == nullptr) + return nullptr; + const MachineFunction *MF = MI.getParent()->getParent(); + return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; +} + +bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { + auto IsQFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg); + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass; + }; + return llvm::any_of(MI.operands(), IsQFPR); +} + +bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { + auto IsFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg); + + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass || + TRC == &AArch64::FPR64RegClass || + TRC == &AArch64::FPR64_loRegClass || + TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || + TRC == &AArch64::FPR8RegClass; + }; + return llvm::any_of(MI.operands(), IsFPR); +} + // Scale the unscaled offsets. Returns false if the unscaled offset can't be // scaled. static bool scaleOffset(unsigned Opc, int64_t &Offset) { @@ -3370,7 +3497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Predicate register by ORRing with itself. if (AArch64::PPRRegClass.contains(DestReg) && AArch64::PPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVE() && "Unexpected SVE register."); + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) .addReg(SrcReg) // Pg .addReg(SrcReg) @@ -3381,7 +3509,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register by ORRing with itself. 
if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVE() && "Unexpected SVE register."); + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -3391,6 +3520,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register pair by copying the individual sub-registers. if (AArch64::ZPR2RegClass.contains(DestReg) && AArch64::ZPR2RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, Indices); @@ -3400,6 +3531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register triple by copying the individual sub-registers. if (AArch64::ZPR3RegClass.contains(DestReg) && AArch64::ZPR3RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -3410,6 +3543,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register quad by copying the individual sub-registers. if (AArch64::ZPR4RegClass.contains(DestReg) && AArch64::ZPR4RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2, AArch64::zsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -3979,6 +4114,119 @@ void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( } } +// Convenience function to create a DWARF expression for +// Expr + NumBytes + NumVGScaledBytes * AArch64::VG +static void appendVGScaledOffsetExpr(SmallVectorImpl &Expr, int NumBytes, + int NumVGScaledBytes, unsigned VG, + llvm::raw_string_ostream &Comment) { + uint8_t buffer[16]; + + if (NumBytes) { + Expr.push_back(dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + } + + if (NumVGScaledBytes) { + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (NumVGScaledBytes < 0 ? 
" - " : " + ") + << std::abs(NumVGScaledBytes) << " * VG"; + } +} + +// Creates an MCCFIInstruction: +// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } +static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, + unsigned Reg, + const StackOffset &Offset) { + int64_t NumBytes, NumVGScaledBytes; + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, + NumVGScaledBytes); + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + + if (Reg == AArch64::SP) + Comment << "sp"; + else if (Reg == AArch64::FP) + Comment << "fp"; + else + Comment << printReg(Reg, &TRI); + + // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> Expr; + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); + Expr.push_back(0); + appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_def_cfa. + SmallString<64> DefCfaExpr; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + uint8_t buffer[16]; + DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); + DefCfaExpr.append(Expr.str()); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), + Comment.str()); +} + +MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, + unsigned FrameReg, unsigned Reg, + const StackOffset &Offset, + bool LastAdjustmentWasScalable) { + if (Offset.getScalable()) + return createDefCFAExpression(TRI, Reg, Offset); + + if (FrameReg == Reg && !LastAdjustmentWasScalable) + return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); +} + +MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, + unsigned Reg, + const StackOffset &OffsetFromDefCFA) { + int64_t NumBytes, NumVGScaledBytes; + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( + OffsetFromDefCFA, NumBytes, NumVGScaledBytes); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + + // Non-scalable offsets can use DW_CFA_offset directly. + if (!NumVGScaledBytes) + return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; + + // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> OffsetExpr; + appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_expression + SmallString<64> CfaExpr; + CfaExpr.push_back(dwarf::DW_CFA_expression); + uint8_t buffer[16]; + CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); + CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); + CfaExpr.append(OffsetExpr.str()); + + return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); +} + // Helper function to emit a frame offset adjustment from a given // pointer (SrcReg), stored into DestReg. This function is explicit // in that it requires the opcode. 
@@ -3988,7 +4236,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, - bool *HasWinCFI) { + bool *HasWinCFI, bool EmitCFAOffset, + StackOffset CFAOffset, unsigned FrameReg) { int Sign = 1; unsigned MaxEncoding, ShiftSize; switch (Opc) { @@ -4013,6 +4262,13 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, llvm_unreachable("Unsupported opcode"); } + // `Offset` can be in bytes or in "scalable bytes". + int VScale = 1; + if (Opc == AArch64::ADDVL_XXI) + VScale = 16; + else if (Opc == AArch64::ADDPL_XXI) + VScale = 2; + // FIXME: If the offset won't fit in 24-bits, compute the offset into a // scratch register. If DestReg is a virtual register, use it as the // scratch register; otherwise, create a new virtual register (to be @@ -4050,6 +4306,26 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); MBI = MBI.setMIFlag(Flag); + auto Change = + VScale == 1 + ? StackOffset::getFixed(ThisVal << LocalShiftSize) + : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); + if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) + CFAOffset += Change; + else + CFAOffset -= Change; + if (EmitCFAOffset && DestReg == TmpReg) { + MachineFunction &MF = *MBB.getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + + unsigned CFIIndex = MF.addFrameInst( + createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(Flag); + } + if (NeedsWinCFI) { assert(Sign == 1 && "SEH directives should always have a positive sign"); int Imm = (int)(ThisVal << LocalShiftSize); @@ -4086,7 +4362,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, - bool NeedsWinCFI, bool *HasWinCFI) { + bool NeedsWinCFI, bool *HasWinCFI, + bool EmitCFAOffset, StackOffset CFAOffset, + unsigned FrameReg) { int64_t Bytes, NumPredicateVectors, NumDataVectors; AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( Offset, Bytes, NumPredicateVectors, NumDataVectors); @@ -4101,8 +4379,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; } emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, - NeedsWinCFI, HasWinCFI); + NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, + FrameReg); + CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) + ? 
StackOffset::getFixed(-Bytes) + : StackOffset::getFixed(Bytes); SrcReg = DestReg; + FrameReg = DestReg; } assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && @@ -4112,14 +4395,17 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, if (NumDataVectors) { emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, - AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr, + EmitCFAOffset, CFAOffset, FrameReg); + CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); SrcReg = DestReg; } if (NumPredicateVectors) { assert(DestReg != AArch64::SP && "Unaligned access to SP"); emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, - AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr, + EmitCFAOffset, CFAOffset, FrameReg); } } @@ -4151,6 +4437,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); return nullptr; } + // Nothing can be folded with copy from/to NZCV. + if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) + return nullptr; } // Handle the case where a copy is being spilled or filled but the source @@ -4577,6 +4866,10 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, return false; } + if (isCombineInstrSettingFlag(CombineOpc) && + MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return false; + return true; } @@ -4919,6 +5212,10 @@ static bool getFMULPatterns(MachineInstr &Root, MachineInstr *MI = nullptr; if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); + // Ignore No-op COPYs in FMUL(COPY(DUP(..))) + if (MI && MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getReg().isVirtual()) + MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); if (MI && MI->getOpcode() == Opcode) { Patterns.push_back(Pattern); return true; } @@ -5073,6 +5370,42 @@ bool AArch64InstrInfo::isThroughputPattern( } // end switch (Pattern) return false; } + +/// Find other MI combine patterns. +static bool getMiscPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) +{ + // A - (B + C) ==> (A - B) - C or (A - C) - B + unsigned Opc = Root.getOpcode(); + MachineBasicBlock &MBB = *Root.getParent(); + + switch (Opc) { + case AArch64::SUBWrr: + case AArch64::SUBSWrr: + case AArch64::SUBXrr: + case AArch64::SUBSXrr: + // Found candidate root. + break; + default: + return false; + } + + if (isCombineInstrSettingFlag(Opc) && + Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return false; + + if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { + Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); + Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); + return true; + } + + return false; +} + /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector.
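The SUBADD_OP1/OP2 patterns recognized by getMiscPatterns above rest on an exact identity of wrapping machine integers, so the rewrite is unconditionally sound (the flag-setting SUBS forms are only considered when NZCV is dead, per the findRegisterDefOperandIdx check); the machine combiner then keeps whichever shape its depth and resource model prefers. In plain C++:

    #include <cassert>

    int main() {
      unsigned A = 100, B = 37, C = 5;
      assert(A - (B + C) == (A - B) - C); // SUBADD_OP1
      assert(A - (B + C) == (A - C) - B); // SUBADD_OP2
      // Holds for all values, including wrap-around:
      assert(0u - (0x80000000u + 0x80000001u) ==
             (0u - 0x80000000u) - 0x80000001u);
    }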
Pattern should be sorted in priority order since the @@ -5090,6 +5423,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getFMAPatterns(Root, Patterns)) return true; + // Other patterns + if (getMiscPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -5190,6 +5527,9 @@ genIndexedMultiply(MachineInstr &Root, MachineInstr *Dup = MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); + if (Dup->getOpcode() == TargetOpcode::COPY) + Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); + Register DupSrcReg = Dup->getOperand(1).getReg(); MRI.clearKillFlags(DupSrcReg); MRI.constrainRegClass(DupSrcReg, RC); @@ -5337,6 +5677,53 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, return MUL; } +/// Do the following transformation +/// A - (B + C) ==> (A - B) - C +/// A - (B + C) ==> (A - C) - B +static void +genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + unsigned IdxOpd1, + DenseMap &InstrIdxForVirtReg) { + assert(IdxOpd1 == 1 || IdxOpd1 == 2); + unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1; + MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + + Register ResultReg = Root.getOperand(0).getReg(); + Register RegA = Root.getOperand(1).getReg(); + bool RegAIsKill = Root.getOperand(1).isKill(); + Register RegB = AddMI->getOperand(IdxOpd1).getReg(); + bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); + Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); + bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); + Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); + + unsigned Opcode = Root.getOpcode(); + if (Opcode == AArch64::SUBSWrr) + Opcode = AArch64::SUBWrr; + else if (Opcode == AArch64::SUBSXrr) + Opcode = AArch64::SUBXrr; + else + assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && + "Unexpected instruction opcode."); + + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegA, getKillRegState(RegAIsKill)) + .addReg(RegB, getKillRegState(RegBIsKill)); + MachineInstrBuilder MIB2 = + BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), ResultReg) + .addReg(NewVR, getKillRegState(true)) + .addReg(RegC, getKillRegState(RegCIsKill)); + + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + InsInstrs.push_back(MIB1); + InsInstrs.push_back(MIB2); + DelInstrs.push_back(AddMI); +} + /// When getMachineCombinerPatterns() finds potential patterns, /// this function generates the instructions that could replace the /// original code sequence @@ -5359,6 +5746,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence( TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); return; + case MachineCombinerPattern::SUBADD_OP1: + // A - (B + C) + // ==> (A - B) - C + genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, + InstrIdxForVirtReg); + break; + case MachineCombinerPattern::SUBADD_OP2: + // A - (B + C) + // ==> (A - C) - B + genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, + InstrIdxForVirtReg); + break; case MachineCombinerPattern::MULADDW_OP1: case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 @@ -6214,6 +6613,14 @@ void AArch64InstrInfo::genAlternativeCodeSequence( if (MUL) DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); + + // Set the flags on the inserted 
instructions to be the merged flags of the + // instructions that we have combined. + uint16_t Flags = Root.getFlags(); + if (MUL) + Flags = Root.mergeFlagsWith(*MUL); + for (auto *MI : InsInstrs) + MI->setFlags(Flags); } /// Replace csincr-branch sequence by simple conditional branch @@ -6526,13 +6933,12 @@ enum MachineOutlinerMBBFlags { UnsafeRegsDead = 0x8 }; -unsigned -AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { - assert(C.LRUWasSet && "LRU wasn't set?"); +Register +AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { MachineFunction *MF = C.getMF(); - const AArch64RegisterInfo *ARI = static_cast( - MF->getSubtarget().getRegisterInfo()); - + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(&TRI); // Check if there is an available register across the sequence that we can // use. for (unsigned Reg : AArch64::GPR64RegClass) { @@ -6540,12 +6946,11 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { Reg != AArch64::LR && // LR is not reserved, but don't use it. Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. Reg != AArch64::X17 && // Ditto for X17. - C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && + C.isAvailableInsideSeq(Reg, TRI)) return Reg; } - - // No suitable register. Return 0. - return 0u; + return Register(); } static bool @@ -6691,10 +7096,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( unsigned FlagsSetInAll = 0xF; // Compute liveness information for each candidate, and set FlagsSetInAll. - std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), - [&FlagsSetInAll](outliner::Candidate &C) { - FlagsSetInAll &= C.Flags; - }); + for (outliner::Candidate &C : RepeatedSequenceLocs) + FlagsSetInAll &= C.Flags; // According to the AArch64 Procedure Call Standard, the following are // undefined on entry/exit from a function call: @@ -6712,10 +7115,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // to compute liveness here. if (C.Flags & UnsafeRegsDead) return false; - C.initLRU(TRI); - LiveRegUnits LRU = C.LRU; - return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || - !LRU.available(AArch64::NZCV)); + return C.isAnyUnavailableAcrossOrOutOfSeq( + {AArch64::W16, AArch64::W17, AArch64::NZCV}, TRI); }; // Are there any candidates where those registers are live? @@ -6752,12 +7153,10 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; - MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); - for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); - Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { - if (MBBI->isCFIInstruction()) + for (auto &I : make_range(RepeatedSequenceLocs[0].front(), + std::next(RepeatedSequenceLocs[0].back()))) { + if (I.isCFIInstruction()) CFICount++; - MBBI++; } // We compare the number of found CFI Instructions to the number of CFI @@ -6860,8 +7259,6 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Check if we have to save LR. for (outliner::Candidate &C : RepeatedSequenceLocs) { - C.initLRU(TRI); - // If we have a noreturn caller, then we're going to be conservative and // say that we have to save LR. 
@@ -6860,8 +7259,6 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Check if we have to save LR. for (outliner::Candidate &C : RepeatedSequenceLocs) { - C.initLRU(TRI); - // If we have a noreturn caller, then we're going to be conservative and // say that we have to save LR. If we don't have a ret at the end of the // block, then we can't reason about liveness accurately. @@ -6872,7 +7269,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); // Is LR available? If so, we don't need a save. - if (C.LRU.available(AArch64::LR) && !IsNoReturn) { + if (C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) && !IsNoReturn) { NumBytesNoStackCalls += 4; C.setCallInfo(MachineOutlinerNoLRSave, 4); CandidatesWithoutStackFixups.push_back(C); @@ -6888,7 +7285,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Is SP used in the sequence at all? If not, we don't have to modify // the stack, so we are guaranteed to get the same frame. - else if (C.UsedInSequence.available(AArch64::SP)) { + else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { NumBytesNoStackCalls += 12; C.setCallInfo(MachineOutlinerDefault, 12); CandidatesWithoutStackFixups.push_back(C); @@ -6957,11 +7354,12 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // LR to (ie one extra stack save/restore). // if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { - erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { + erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { return (std::any_of( C.front(), std::next(C.back()), [](const MachineInstr &MI) { return MI.isCall(); })) && - (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); + (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || + !findRegisterToSaveLRTo(C)); }); } } @@ -7032,7 +7430,7 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( // modify the stack. Check if hasRedZone is true or unknown; if yes, don't // outline from it. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - if (!AFI || AFI->hasRedZone().getValueOr(true)) + if (!AFI || AFI->hasRedZone().value_or(true)) return false; // FIXME: Teach the outliner to generate/handle Windows unwind info. @@ -7053,8 +7451,8 @@ bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, "Suitable Machine Function for outlining must track liveness"); LiveRegUnits LRU(getRegisterInfo()); - std::for_each(MBB.rbegin(), MBB.rend(), - [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); + for (MachineInstr &MI : llvm::reverse(MBB)) + LRU.accumulate(MI); // Check if each of the unsafe registers are available... bool W16AvailableInBlock = LRU.available(AArch64::W16);
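The getValueOr -> value_or rename in the hasRedZone() check above tracks llvm::Optional converging on std::optional spellings; what matters for the outliner is the conservative default when the red-zone state is unknown. A minimal sketch of those semantics, with std::optional standing in for llvm::Optional:

#include <cassert>
#include <optional>

int main() {
  std::optional<bool> HasRedZone;     // unset: red-zone state unknown
  assert(HasRedZone.value_or(true));  // unknown => assume a red zone, skip MF
  HasRedZone = false;                 // proven red-zone free
  assert(!HasRedZone.value_or(true)); // now outlining may proceed
  return 0;
}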
@@ -7333,14 +7731,17 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(AArch64::SP, RegState::InternalRead); MI.setMIFlag(MachineInstr::FrameSetup); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } // If v8.3a features are available we can replace a RET instruction by - // RETAA or RETAB and omit the AUT instructions + // RETAA or RETAB and omit the AUT instructions. In this case the + // DW_CFA_AARCH64_negate_ra_state can't be emitted. if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && MBBAUT->getOpcode() == AArch64::RET) { BuildMI(MBB, MBBAUT, DL, @@ -7353,6 +7754,11 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP : AArch64::AUTIBSP)) .setMIFlag(MachineInstr::FrameDestroy); + unsigned CFIIndexAuth = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndexAuth) + .setMIFlags(MachineInstr::FrameDestroy); } } } @@ -7424,24 +7830,26 @@ void AArch64InstrInfo::buildOutlinedFrame( .addImm(-16); It = MBB.insert(It, STRXpre); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const MCRegisterInfo *MRI = STI.getRegisterInfo(); - unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); - - // Add a CFI saying the stack was moved 16 B down. - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); - BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameSetup); - - // Add a CFI saying that the LR that we want to find is now 16 B higher than - // before. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); - BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); + if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCRegisterInfo *MRI = STI.getRegisterInfo(); + unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); + + // Add a CFI saying the stack was moved 16 B down. + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); + BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + + // Add a CFI saying that the LR that we want to find is now 16 B higher + // than before. + int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); + BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + } // Insert a restore before the terminator for the function. MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) @@ -7495,7 +7903,7 @@ void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, - MachineFunction &MF, const outliner::Candidate &C) const { + MachineFunction &MF, outliner::Candidate &C) const { // Are we tail calling? if (C.CallConstructionID == MachineOutlinerTailCall) { @@ -7526,8 +7934,8 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( if (C.CallConstructionID == MachineOutlinerRegSave) { // FIXME: This logic should be sunk into a target-specific interface so that // we don't have to recompute the register. - unsigned Reg = findRegisterToSaveLRTo(C); - assert(Reg != 0 && "No callee-saved register available?"); + Register Reg = findRegisterToSaveLRTo(C); + assert(Reg && "No callee-saved register available?"); // LR has to be a live in so that we can save it. if (!MBB.isLiveIn(AArch64::LR))
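With findRegisterToSaveLRTo now returning Register rather than unsigned, failure is signalled by a default-constructed (invalid) register, which is what assert(Reg && ...) in insertOutlinedCall checks. A simplified stand-in (not the real llvm::Register, whose boolean test goes through a conversion to unsigned) showing the assumed behaviour:

#include <cassert>

class Register {
  unsigned Reg = 0; // 0 doubles as "no register", matching LLVM's convention
public:
  Register() = default;
  explicit Register(unsigned R) : Reg(R) {}
  explicit operator bool() const { return Reg != 0; }
};

int main() {
  Register NoReg;     // what the search returns when nothing is free
  Register Found(20); // placeholder encoding for a usable GPR
  assert(!NoReg && Found);
  return 0;
}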
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 1054bea40e68..b7a6ac301cdc 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -103,6 +103,21 @@ public: /// Returns whether the instruction is a pre-indexed load/store. static bool isPreLdSt(const MachineInstr &MI); + /// Returns whether the instruction is a paired load/store. + static bool isPairedLdSt(const MachineInstr &MI); + + /// Returns the base register operand of a load/store. + static const MachineOperand &getLdStBaseOp(const MachineInstr &MI); + + /// Returns the immediate offset operand of a load/store. + static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); + + /// Returns whether the instruction is FP or NEON. + static bool isFpOrNEON(const MachineInstr &MI); + + /// Returns whether the instruction is in Q form (128-bit operands). + static bool isQForm(const MachineInstr &MI); + /// Returns the index for the immediate for a given instruction. static unsigned getLoadStoreImmIdx(unsigned Opc); @@ -283,7 +298,7 @@ public: MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const override; + outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; /// Returns the vector element size (B, H, S or D) of an SVE opcode. uint64_t getElementSizeForOpcode(unsigned Opc) const; @@ -347,7 +362,7 @@ private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. - unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; + Register findRegisterToSaveLRTo(outliner::Candidate &C) const; /// Remove a ptest of a predicate-generating operation that already sets, or /// can be made to set, the condition codes in an identical manner @@ -356,12 +371,45 @@ private: const MachineRegisterInfo *MRI) const; }; +struct UsedNZCV { + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = default; + + UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. +/// \returns None otherwise. +/// +/// Collect instructions using those flags in \p CCUseInstrs if provided. +Optional<UsedNZCV> +examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr); + /// Return true if there is an instruction /after/ \p DefMI and before \p UseMI /// which either reads or clobbers NZCV. bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI); +MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, + unsigned Reg, const StackOffset &Offset, + bool LastAdjustmentWasScalable = true); +MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, + const StackOffset &OffsetFromDefCFA); + /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg /// plus Offset.
This is intended to be used from within the prolog/epilog /// insertion (PEI) pass, where a virtual scratch register may be allocated @@ -371,7 +419,9 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false, bool NeedsWinCFI = false, - bool *HasWinCFI = nullptr); + bool *HasWinCFI = nullptr, bool EmitCFAOffset = false, + StackOffset InitialOffset = {}, + unsigned FrameReg = AArch64::SP); /// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the /// FP. Return false if the offset could not be handled directly in MI, and diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 83bf89ff97c5..3802a45ad6c1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -14,196 +14,196 @@ // ARM Instruction Predicate Definitions. // def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; + AssemblerPredicateWithAll<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; + AssemblerPredicateWithAll<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; + AssemblerPredicateWithAll<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; + AssemblerPredicateWithAll<(all_of HasV8_4aOps), "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; + AssemblerPredicateWithAll<(all_of HasV8_5aOps), "armv8.5a">; def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, - AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; + AssemblerPredicateWithAll<(all_of HasV8_6aOps), "armv8.6a">; def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, - AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; + AssemblerPredicateWithAll<(all_of HasV8_7aOps), "armv8.7a">; def HasV9_0a : Predicate<"Subtarget->hasV9_0aOps()">, - AssemblerPredicate<(all_of HasV9_0aOps), "armv9-a">; + AssemblerPredicateWithAll<(all_of HasV9_0aOps), "armv9-a">; def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">, - AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">; + AssemblerPredicateWithAll<(all_of HasV9_1aOps), "armv9.1a">; def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">, - AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">; + AssemblerPredicateWithAll<(all_of HasV9_2aOps), "armv9.2a">; def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">, - AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">; + AssemblerPredicateWithAll<(all_of HasV9_3aOps), "armv9.3a">; def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">, - AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">; + AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">; def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">, - AssemblerPredicate<(all_of FeatureEL2VMSA), "el2vmsa">; + AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">; def HasEL3 : Predicate<"Subtarget->hasEL3()">, - AssemblerPredicate<(all_of FeatureEL3), "el3">; + AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">; def HasVH : Predicate<"Subtarget->hasVH()">, - AssemblerPredicate<(all_of FeatureVH), "vh">; + AssemblerPredicateWithAll<(all_of FeatureVH), "vh">; def 
HasLOR : Predicate<"Subtarget->hasLOR()">, - AssemblerPredicate<(all_of FeatureLOR), "lor">; + AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">; def HasPAuth : Predicate<"Subtarget->hasPAuth()">, - AssemblerPredicate<(all_of FeaturePAuth), "pauth">; + AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; def HasJS : Predicate<"Subtarget->hasJS()">, - AssemblerPredicate<(all_of FeatureJS), "jsconv">; + AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, - AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">; + AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, - AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">; + AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">; def HasNV : Predicate<"Subtarget->hasNV()">, - AssemblerPredicate<(all_of FeatureNV), "nv">; + AssemblerPredicateWithAll<(all_of FeatureNV), "nv">; def HasMPAM : Predicate<"Subtarget->hasMPAM()">, - AssemblerPredicate<(all_of FeatureMPAM), "mpam">; + AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">; def HasDIT : Predicate<"Subtarget->hasDIT()">, - AssemblerPredicate<(all_of FeatureDIT), "dit">; + AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, - AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">; + AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">; def HasAM : Predicate<"Subtarget->hasAM()">, - AssemblerPredicate<(all_of FeatureAM), "am">; + AssemblerPredicateWithAll<(all_of FeatureAM), "am">; def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, - AssemblerPredicate<(all_of FeatureSEL2), "sel2">; + AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, - AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; + AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">; def HasFlagM : Predicate<"Subtarget->hasFlagM()">, - AssemblerPredicate<(all_of FeatureFlagM), "flagm">; + AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, - AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; + AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">; + AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<(all_of FeatureNEON), "neon">; + AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<(all_of FeatureCrypto), "crypto">; + AssemblerPredicateWithAll<(all_of FeatureCrypto), "crypto">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, - AssemblerPredicate<(all_of FeatureSM4), "sm4">; + AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">; def HasSHA3 : Predicate<"Subtarget->hasSHA3()">, - AssemblerPredicate<(all_of FeatureSHA3), "sha3">; + AssemblerPredicateWithAll<(all_of FeatureSHA3), "sha3">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<(all_of FeatureSHA2), "sha2">; + AssemblerPredicateWithAll<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<(all_of FeatureAES), "aes">; + AssemblerPredicateWithAll<(all_of FeatureAES), "aes">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<(all_of 
FeatureDotProd), "dotprod">; + AssemblerPredicateWithAll<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<(all_of FeatureCRC), "crc">; + AssemblerPredicateWithAll<(all_of FeatureCRC), "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, - AssemblerPredicate<(all_of FeatureLSE), "lse">; + AssemblerPredicateWithAll<(all_of FeatureLSE), "lse">; def HasNoLSE : Predicate<"!Subtarget->hasLSE()">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<(all_of FeatureRAS), "ras">; + AssemblerPredicateWithAll<(all_of FeatureRAS), "ras">; def HasRDM : Predicate<"Subtarget->hasRDM()">, - AssemblerPredicate<(all_of FeatureRDM), "rdm">; + AssemblerPredicateWithAll<(all_of FeatureRDM), "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">; + AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">; + AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, - AssemblerPredicate<(all_of FeatureSPE), "spe">; + AssemblerPredicateWithAll<(all_of FeatureSPE), "spe">; def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, - AssemblerPredicate<(all_of FeatureFuseAES), + AssemblerPredicateWithAll<(all_of FeatureFuseAES), "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, - AssemblerPredicate<(all_of FeatureSVE), "sve">; + AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">; def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, - AssemblerPredicate<(all_of FeatureSVE2), "sve2">; + AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, - AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">; + AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">; def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, - AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">; + AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, - AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">; + AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; + AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; def HasSME : Predicate<"Subtarget->hasSME()">, - AssemblerPredicate<(all_of FeatureSME), "sme">; + AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">, - AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">; + AssemblerPredicateWithAll<(all_of FeatureSMEF64), "sme-f64">; def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">, - AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">; -def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(all_of FeatureStreamingSVE), "streaming-sve">; + AssemblerPredicateWithAll<(all_of FeatureSMEI64), "sme-i64">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
-def HasSVEorStreamingSVE - : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE), - "streaming-sve or sve">; -def HasSVE2orStreamingSVE - : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE), - "streaming-sve or sve2">; +def HasSVEorSME + : Predicate<"Subtarget->hasSVE() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), + "sve or sme">; +def HasSVE2orSME + : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), + "sve2 or sme">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. -def HasNEONorStreamingSVE - : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE), - "streaming-sve or neon">; +def HasNEONorSME + : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME), + "neon or sme">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, - AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; + AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">; +def HasLDAPR : Predicate<"Subtarget->hasLDAPR()">, + AssemblerPredicateWithAll<(all_of FeatureLDAPR), "ldapr">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, - AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">; + AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">; def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, - AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">; + AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">; def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<(all_of FeatureSB), "sb">; + AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; def HasPredRes : Predicate<"Subtarget->hasPredRes()">, - AssemblerPredicate<(all_of FeaturePredRes), "predres">; + AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">; def HasCCDP : Predicate<"Subtarget->hasCCDP()">, - AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">; + AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">; def HasBTI : Predicate<"Subtarget->hasBTI()">, - AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">; + AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, - AssemblerPredicate<(all_of FeatureMTE), "mte">; + AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">; def HasTME : Predicate<"Subtarget->hasTME()">, - AssemblerPredicate<(all_of FeatureTME), "tme">; + AssemblerPredicateWithAll<(all_of FeatureTME), "tme">; def HasETE : Predicate<"Subtarget->hasETE()">, - AssemblerPredicate<(all_of FeatureETE), "ete">; + AssemblerPredicateWithAll<(all_of FeatureETE), "ete">; def HasTRBE : Predicate<"Subtarget->hasTRBE()">, - AssemblerPredicate<(all_of FeatureTRBE), "trbe">; + AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, - AssemblerPredicate<(all_of FeatureBF16), "bf16">; + AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">; def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, - AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">; def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, - 
AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, - AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; def HasXS : Predicate<"Subtarget->hasXS()">, - AssemblerPredicate<(all_of FeatureXS), "xs">; + AssemblerPredicateWithAll<(all_of FeatureXS), "xs">; def HasWFxT : Predicate<"Subtarget->hasWFxT()">, - AssemblerPredicate<(all_of FeatureWFxT), "wfxt">; + AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">; def HasLS64 : Predicate<"Subtarget->hasLS64()">, - AssemblerPredicate<(all_of FeatureLS64), "ls64">; + AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">; def HasBRBE : Predicate<"Subtarget->hasBRBE()">, - AssemblerPredicate<(all_of FeatureBRBE), "brbe">; + AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">; def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, - AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; + AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">; def HasHBC : Predicate<"Subtarget->hasHBC()">, - AssemblerPredicate<(all_of FeatureHBC), "hbc">; + AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">; def HasMOPS : Predicate<"Subtarget->hasMOPS()">, - AssemblerPredicate<(all_of FeatureMOPS), "mops">; + AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -350,49 +350,49 @@ def nonext_masked_load : cast(N)->isUnindexed() && !cast(N)->isNonTemporal(); }]>; -// sign extending masked load fragments. -def asext_masked_load : +// Any/Zero extending masked load fragments. +def azext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), (masked_ld node:$ptr, undef, node:$pred, node:$def),[{ return (cast(N)->getExtensionType() == ISD::EXTLOAD || - cast(N)->getExtensionType() == ISD::SEXTLOAD) && + cast(N)->getExtensionType() == ISD::ZEXTLOAD) && cast(N)->isUnindexed(); }]>; -def asext_masked_load_i8 : +def azext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def asext_masked_load_i16 : +def azext_masked_load_i16 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; -def asext_masked_load_i32 : +def azext_masked_load_i32 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; -// zero extending masked load fragments. -def zext_masked_load : +// Sign extending masked load fragments. 
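The azext fragments above accept both ISD::EXTLOAD and ISD::ZEXTLOAD: an any-extending load leaves the high bits unspecified, so an instruction that zero-extends satisfies both contracts, while the sign-extending fragments that follow must match ISD::SEXTLOAD exactly. A standalone model of that classification, with a plain enum standing in for ISD::LoadExtType:

#include <cassert>

enum LoadExtType { NON_EXTLOAD, EXTLOAD, SEXTLOAD, ZEXTLOAD };

// The azext_masked_load predicate shape: any- or zero-extending.
bool matchesAZExt(LoadExtType ET) { return ET == EXTLOAD || ET == ZEXTLOAD; }
// The sext_masked_load predicate shape: sign-extending only.
bool matchesSExt(LoadExtType ET) { return ET == SEXTLOAD; }

int main() {
  assert(matchesAZExt(EXTLOAD) && matchesAZExt(ZEXTLOAD));
  assert(!matchesAZExt(SEXTLOAD) && matchesSExt(SEXTLOAD));
  return 0;
}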
+def sext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ - return cast(N)->getExtensionType() == ISD::ZEXTLOAD && + return cast(N)->getExtensionType() == ISD::SEXTLOAD && cast(N)->isUnindexed(); }]>; -def zext_masked_load_i8 : +def sext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def zext_masked_load_i16 : +def sext_masked_load_i16 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; -def zext_masked_load_i32 : +def sext_masked_load_i32 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; @@ -443,6 +443,58 @@ def non_temporal_store : cast(N)->isNonTemporal(); }]>; +multiclass masked_gather_scatter { + // offsets = (signed)Index << sizeof(elt) + def NAME#_signed_scaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return Signed && MGS->isIndexScaled(); + }]>; + // offsets = (signed)Index + def NAME#_signed_unscaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return Signed && !MGS->isIndexScaled(); + }]>; + // offsets = (unsigned)Index << sizeof(elt) + def NAME#_unsigned_scaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return !Signed && MGS->isIndexScaled(); + }]>; + // offsets = (unsigned)Index + def NAME#_unsigned_unscaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return !Signed && !MGS->isIndexScaled(); + }]>; +} + +defm nonext_masked_gather : masked_gather_scatter; +defm azext_masked_gather_i8 : masked_gather_scatter; +defm azext_masked_gather_i16 : masked_gather_scatter; +defm azext_masked_gather_i32 : masked_gather_scatter; +defm sext_masked_gather_i8 : masked_gather_scatter; +defm sext_masked_gather_i16 : masked_gather_scatter; +defm sext_masked_gather_i32 : masked_gather_scatter; + +defm nontrunc_masked_scatter : masked_gather_scatter; +defm trunc_masked_scatter_i8 : masked_gather_scatter; +defm trunc_masked_scatter_i16 : masked_gather_scatter; +defm trunc_masked_scatter_i32 : masked_gather_scatter; + // top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise def top16Zero: PatLeaf<(i32 GPR32:$src), [{ return SDValue(N,0)->getValueType(0) == MVT::i32 && @@ -473,6 +525,11 @@ def AArch64call 
: SDNode<"AArch64ISD::CALL", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def AArch64call_bti : SDNode<"AArch64ISD::CALL_BTI", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER", SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, @@ -526,6 +583,7 @@ def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; +def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>; def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>; @@ -612,8 +670,10 @@ def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>; def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; -def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; -def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull, + [SDNPCommutative]>; +def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull, + [SDNPCommutative]>; def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>; @@ -630,11 +690,6 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; -def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; -def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; -def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>; -def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; - def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), [(abdu node:$lhs, node:$rhs), (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; @@ -642,10 +697,21 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), [(abds node:$lhs, node:$rhs), (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; +def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>; def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>; +def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>; +def AArch64addp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_addp node:$Rn, node:$Rm)]>; def AArch64uaddlp : PatFrags<(ops node:$src), [(AArch64uaddlp_n node:$src), (int_aarch64_neon_uaddlp node:$src)]>; +def AArch64saddlp : PatFrags<(ops node:$src), + [(AArch64saddlp_n node:$src), + (int_aarch64_neon_saddlp node:$src)]>; +def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>; def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -669,6 +735,22 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue]>; + +// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands +// have no common bits. 
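The add_and_or_is_add PatFrags defined next relies on the identity that CurDAG->haveNoCommonBitsSet guards: when the operands share no set bits, the OR generates no carries and equals the sum, so an 'or' can be selected as if it were an 'add'. A quick standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0xF0, B = 0x0F; // disjoint bit patterns
  assert((A & B) == 0);        // no common bits set
  assert((A | B) == A + B);    // hence or == add (0xFF either way)
  return 0;
}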
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), + [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ + if (N->getOpcode() == ISD::ADD) + return true; + return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); +}]> { + let GISelPredicateCode = [{ + // Only handle G_ADD for now. FIXME. build capability to compute whether + // operands of G_OR have common bits set or not. + return MI.getOpcode() == TargetOpcode::G_ADD; + }]; +} + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -939,7 +1021,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot VectorIndexS:$idx)>; } -let Predicates = [HasNEONorStreamingSVE, HasBF16] in { +let Predicates = [HasNEONorSME, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; } @@ -1025,6 +1107,15 @@ def : EOR3_pattern; def : EOR3_pattern; def : EOR3_pattern; +class BCAX_pattern + : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))), + (BCAX (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>; + +def : BCAX_pattern; +def : BCAX_pattern; +def : BCAX_pattern; +def : BCAX_pattern; + def : SHA3_pattern; def : SHA3_pattern; def : SHA3_pattern; @@ -2073,6 +2164,10 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; +def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), + (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), + (REV16Xr GPR64:$Rn)>; + //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. //===----------------------------------------------------------------------===// @@ -2320,6 +2415,8 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { PseudoInstExpansion<(BLR GPR64:$Rn)>; def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>, Sched<[WriteBrReg]>; + def BLR_BTI : Pseudo<(outs), (ins variable_ops), []>, + Sched<[WriteBrReg]>; } // isCall def : Pat<(AArch64call GPR64:$Rn), @@ -2333,6 +2430,10 @@ def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn), (BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>, Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call_bti GPR64:$Rn), + (BLR_BTI GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -2359,6 +2460,10 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { // augmentation string. def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {} +// Pseudo instruction to tell the streamer to emit a 'G' character into the +// augmentation string. +def EMITMTETAGGED : Pseudo<(outs), (ins), []>, Sched<[]> {} + // FIXME: maybe the scratch register used shouldn't be fixed to X1? // FIXME: can "hasSideEffects be dropped? // This gets lowered to an instruction sequence which takes 16 bytes @@ -2409,7 +2514,8 @@ def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; // Exception generation instructions. 
//===----------------------------------------------------------------------===// let isTrap = 1 in { -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def BRK : ExceptionGeneration<0b001, 0b00, "brk", + [(int_aarch64_break timm32_0_65535:$imm)]>; } def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; @@ -3891,24 +3997,24 @@ defm : FPToIntegerPats; let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lround f16:$Rn)), + def : Pat<(i32 (any_lround f16:$Rn)), (!cast(FCVTASUWHr) f16:$Rn)>; - def : Pat<(i64 (lround f16:$Rn)), + def : Pat<(i64 (any_lround f16:$Rn)), (!cast(FCVTASUXHr) f16:$Rn)>; - def : Pat<(i64 (llround f16:$Rn)), + def : Pat<(i64 (any_llround f16:$Rn)), (!cast(FCVTASUXHr) f16:$Rn)>; } -def : Pat<(i32 (lround f32:$Rn)), +def : Pat<(i32 (any_lround f32:$Rn)), (!cast(FCVTASUWSr) f32:$Rn)>; -def : Pat<(i32 (lround f64:$Rn)), +def : Pat<(i32 (any_lround f64:$Rn)), (!cast(FCVTASUWDr) f64:$Rn)>; -def : Pat<(i64 (lround f32:$Rn)), +def : Pat<(i64 (any_lround f32:$Rn)), (!cast(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (lround f64:$Rn)), +def : Pat<(i64 (any_lround f64:$Rn)), (!cast(FCVTASUXDr) f64:$Rn)>; -def : Pat<(i64 (llround f32:$Rn)), +def : Pat<(i64 (any_llround f32:$Rn)), (!cast(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (llround f64:$Rn)), +def : Pat<(i64 (any_llround f64:$Rn)), (!cast(FCVTASUXDr) f64:$Rn)>; //===----------------------------------------------------------------------===// @@ -3949,20 +4055,20 @@ defm FCVT : FPConversion<"fcvt">; // Floating point single operand instructions. //===----------------------------------------------------------------------===// -defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; -defm FMOV : SingleOperandFPData<0b0000, "fmov">; -defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; -defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; -defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>; -defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; +defm FABS : SingleOperandFPDataNoException<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPDataNoException<0b0000, "fmov">; +defm FNEG : SingleOperandFPDataNoException<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", any_fround>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", any_fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", any_ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", any_froundeven>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", any_fceil>; -defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; +defm FRINTX : SingleOperandFPData<0b1110, "frintx", any_frint>; +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", any_ftrunc>; let SchedRW = [WriteFDiv] in { -defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", any_fsqrt>; } let Predicates = [HasFRInt3264] in { @@ -3972,44 +4078,48 @@ let Predicates = [HasFRInt3264] in { defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>; } // HasFRInt3264 +// Emitting strict_lrint as two instructions is valid as any exceptions that +// occur will happen in exactly one of the instructions (e.g. 
if the input is +// not an integer the inexact exception will happen in the FRINTX but not then +// in the FCVTZS as the output of FRINTX is an integer). let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lrint f16:$Rn)), + def : Pat<(i32 (any_lrint f16:$Rn)), (FCVTZSUWHr (!cast(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (lrint f16:$Rn)), + def : Pat<(i64 (any_lrint f16:$Rn)), (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (llrint f16:$Rn)), + def : Pat<(i64 (any_llrint f16:$Rn)), (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; } -def : Pat<(i32 (lrint f32:$Rn)), +def : Pat<(i32 (any_lrint f32:$Rn)), (FCVTZSUWSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i32 (lrint f64:$Rn)), +def : Pat<(i32 (any_lrint f64:$Rn)), (FCVTZSUWDr (!cast(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (lrint f32:$Rn)), +def : Pat<(i64 (any_lrint f32:$Rn)), (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (lrint f64:$Rn)), +def : Pat<(i64 (any_lrint f64:$Rn)), (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (llrint f32:$Rn)), +def : Pat<(i64 (any_llrint f32:$Rn)), (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (llrint f64:$Rn)), +def : Pat<(i64 (any_llrint f64:$Rn)), (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; //===----------------------------------------------------------------------===// // Floating point two operand instructions. //===----------------------------------------------------------------------===// -defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +defm FADD : TwoOperandFPData<0b0010, "fadd", any_fadd>; let SchedRW = [WriteFDiv] in { -defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +defm FDIV : TwoOperandFPData<0b0001, "fdiv", any_fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", any_fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", any_fmaximum>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", any_fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", any_fminimum>; let SchedRW = [WriteFMul] in { -defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; -defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +defm FMUL : TwoOperandFPData<0b0000, "fmul", any_fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>; } -defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; +defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>; def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; @@ -4024,13 +4134,13 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), // Floating point three operand instructions. 
//===----------------------------------------------------------------------===// -defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", any_fma>; defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", - TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; + TriOpFrag<(any_fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", - TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; + TriOpFrag<(fneg (any_fma node:$LHS, node:$MHS, node:$RHS))> >; defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", - TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + TriOpFrag<(any_fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; // The following def pats catch the case where the LHS of an FMA is negated. // The TriOpFrag above catches the case where the middle operand is negated. @@ -4159,25 +4269,25 @@ def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (zext (v8i8 V64:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))))), +def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))), + (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)), - (zext (extract_high_v8i16 V128:$opB))))), +def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))), + (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), (zext (v2i32 V64:$opB))))), (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)), - (zext (extract_high_v4i32 V128:$opB))))), +def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), + (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; @@ -4189,7 +4299,7 @@ defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; -defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; +defm FABS : SIMDTwoVectorFPNoException<0, 1, 0b01111, "fabs", fabs>; def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))), (CMLTv8i8rz V64:$Rn)>; @@ -4219,9 +4329,9 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), (i64 4)))), (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; defm FCVTMS : 
SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; @@ -4233,16 +4343,16 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))), +def : Pat<(v2f32 (any_fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (any_fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (any_fpround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. multiclass SIMDTwoVectorFPToIntSatPats { @@ -4272,15 +4382,15 @@ def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; -defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FNEG : SIMDTwoVectorFPNoException<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", any_fround>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", any_fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", any_ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", any_froundeven>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", any_fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", any_frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", any_ftrunc>; let Predicates = [HasFRInt3264] in { defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>; @@ -4290,7 +4400,7 @@ let Predicates = [HasFRInt3264] in { } // HasFRInt3264 defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", any_fsqrt>; defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; @@ -4312,9 
+4422,9 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; + BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; @@ -4324,7 +4434,7 @@ defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>; defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; @@ -4348,15 +4458,15 @@ def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; multiclass SIMDVectorLShiftLongBySizeBHSPats { def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), (SHLLv8i8 V64:$Rn)>; - def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), + def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 (v16i8 V128:$Rn)))), (i32 8)), (SHLLv16i8 V128:$Rn)>; def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), (SHLLv4i16 V64:$Rn)>; - def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), + def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 (v8i16 V128:$Rn)))), (i32 16)), (SHLLv8i16 V128:$Rn)>; def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), (SHLLv2i32 V64:$Rn)>; - def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), + def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 (v4i32 V128:$Rn)))), (i32 32)), (SHLLv4i32 V128:$Rn)>; } @@ -4426,7 +4536,7 @@ def : Pat<(v8i16 (concat_vectors //===----------------------------------------------------------------------===// defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; -defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", AArch64addp>; defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; @@ -4447,33 +4557,33 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast("FABD"#VT) VT:$Rn, V } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp", AArch64faddp>; +defm 
FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", any_fdiv>; defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", any_fmaxnum>; defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", any_fmaximum>; defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", any_fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", any_fsub>; // MLA and MLS are generated in MachineCombine defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; @@ -4484,7 +4594,7 @@ defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; @@ -4496,14 +4606,14 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>; defm SRSHL : 
SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; @@ -4513,7 +4623,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", @@ -4753,11 +4863,13 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -let Predicates = [HasFullFP16] in { +let Predicates = [HasNEON, HasFullFP16] in { def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>; } +let Predicates = [HasNEON] in { def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>; def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>; +} defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", @@ -4765,9 +4877,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorStreamingSVE>; -defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorStreamingSVE>; -defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorStreamingSVE>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -4862,9 +4974,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : 
SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorStreamingSVE>; -defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorStreamingSVE>; -defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorStreamingSVE>; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; @@ -4980,23 +5092,21 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. let Predicates = [HasNEON] in { -def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))), +def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; -def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))), +def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>; -def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))), +def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>; -def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))), +def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; let Predicates = [HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))), +def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; -def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))), +def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } -} - // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. @@ -5083,6 +5193,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. +} // let Predicates = [HasNEON] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. 
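The round-trip patterns above (any_sint_to_fp of any_fp_to_sint, plus the unsigned pair) are what let a float -> int -> float truncation stay entirely inside the FP/SIMD register file. A minimal C++ sketch of source code expected to exercise them, assuming an AArch64 clang at -O2 (illustrative only; the exact assembly depends on compiler version and flags):

    // Expected to select "fcvtzs d0, d0; scvtf d0, d0" via the
    // FCVTZSv1i64/SCVTFv1i64 patterns above, avoiding a GPR round trip.
    double truncate_to_integer(double X) {
      return static_cast<double>(static_cast<long long>(X));
    }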
@@ -5102,10 +5213,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", @@ -5123,10 +5234,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", @@ -5161,74 +5272,15 @@ multiclass Neon_mul_acc_widen_patterns; } -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -// Additional patterns for SMULL and UMULL -multiclass Neon_mul_widen_patterns { - def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mul_widen_patterns; -defm : Neon_mul_widen_patterns; - -// Patterns for smull2/umull2. 
-multiclass Neon_mul_high_patterns { - def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))), - (INST8B V128:$Rn, V128:$Rm)>; - def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))), - (INST4H V128:$Rn, V128:$Rm)>; - def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))), - (INST2S V128:$Rn, V128:$Rm)>; -} - -defm : Neon_mul_high_patterns; -defm : Neon_mul_high_patterns; - -// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL -multiclass Neon_mulacc_widen_patterns { - def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; - // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; @@ -5392,19 +5444,22 @@ defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; +// Only the lower half of the result of the inner FADDP is used in the patterns +// below, so the second operand does not matter. Re-use the first input +// operand, so no additional dependencies need to be introduced. 
let Predicates = [HasFullFP16] in { def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))), (FADDPv2i16p (EXTRACT_SUBREG - (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))), + (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn), dsub))>; def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))), - (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>; + (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>; } def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))), (FADDPv2i32p (EXTRACT_SUBREG - (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))), + (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))), (FADDPv2i32p V64:$Rn)>; @@ -5856,24 +5911,28 @@ defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; -// Patterns for uaddv(uaddlp(x)) ==> uaddlv -def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, - (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))), - (i64 0))), (i64 0))), - (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (UADDLVv8i8v V64:$op), hsub), ssub)>; -def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp - (v16i8 V128:$op))))), (i64 0))), - (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (UADDLVv16i8v V128:$op), hsub), ssub)>; -def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (UADDLVv8i16v V128:$op), ssub)>; - -// Patterns for addp(uaddlp(x))) ==> uaddlv -def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>; -def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>; +multiclass SIMDAcrossLaneLongPairIntrinsic { + // Patterns for addv(addlp(x)) ==> addlv + def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, + (v4i16 (AArch64uaddv (v4i16 (addlp (v8i8 V64:$op))))), + (i64 0))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast(Opc#"v8i8v") V64:$op), hsub), ssub)>; + def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (addlp (v16i8 V128:$op))))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast(Opc#"v16i8v") V128:$op), hsub), ssub)>; + def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast(Opc#"v8i16v") V128:$op), ssub)>; + + // Patterns for addp(addlp(x))) ==> addlv + def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast(Opc#"v4i16v") V64:$op), ssub)>; + def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (!cast(Opc#"v4i32v") V128:$op), dsub)>; +} + +defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>; +defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. 
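The SIMDAcrossLaneLongPairIntrinsic multiclass above folds a pairwise widening add followed by an across-vector reduction into a single long reduction, and the refactoring extends the fold from the unsigned form (UADDLV) to the signed one (SADDLV). A small ACLE-level sketch of the shape being matched, assuming <arm_neon.h> and an AArch64 compiler (illustrative demo code, not from the patch):

    #include <arm_neon.h>

    // vpaddl_u8 corresponds to AArch64uaddlp and vaddv_u16 to the across-vector
    // add; with the patterns above the pair should collapse into a single
    // "uaddlv h0, v0.8b", i.e. the same result as vaddlv_u8(V).
    uint16_t sum_bytes(uint8x8_t V) {
      return vaddv_u16(vpaddl_u8(V));
    }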
@@ -6185,6 +6244,14 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; let isReMaterializable = 1, isAsCheapAsAMove = 1 in defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; +let Predicates = [HasNEON] in { + // Using the MOVI to materialize fp constants. + def : Pat<(f32 fpimm32SIMDModImmType4:$in), + (EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in), + (i32 24)), + ssub)>; +} + def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; @@ -6273,18 +6340,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns { // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit @@ -6363,22 +6430,22 @@ multiclass FMLSIndexedAfterNegPatterns { } defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >; defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>; -def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv4i32_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), +def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), (FMULv2i64_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), (i64 0))>; @@ -6397,11 +6464,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, 
node:$RHS))>>; defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>; defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", @@ -6412,11 +6478,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>; // A scalar sqdmull with the second operand being a vector lane can be // handled directly with the indexed instruction encoding. @@ -6425,22 +6490,6 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), VectorIndexS:$idx)), (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; -// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands -// have no common bits. -def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), - [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ - if (N->getOpcode() == ISD::ADD) - return true; - return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); -}]> { - let GISelPredicateCode = [{ - // Only handle G_ADD for now. FIXME. build capability to compute whether - // operands of G_OR have common bits set or not. - return MI.getOpcode() == TargetOpcode::G_ADD; - }]; -} - - //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- @@ -6480,7 +6529,7 @@ def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -// Patterns for FP16 Instrinsics - requires reg copy to/from as i16s not supported. +// Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported. 
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)), (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; @@ -6787,7 +6836,7 @@ class SExtLoadi8CVTf32Pat dsub)), 0), ssub)))>, - Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -6807,7 +6856,8 @@ class SExtLoadi16CVTf32Pat INST, hsub), 0), - ssub)))>, Requires<[NotForCodeSize]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -6841,7 +6891,7 @@ class SExtLoadi16CVTf64Pat dsub)), 0), dsub)))>, - Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -6860,7 +6910,8 @@ class SExtLoadi32CVTf64Pat INST, ssub), 0), - dsub)))>, Requires<[NotForCodeSize]>; + dsub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; @@ -7216,14 +7267,6 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0 //---------------------------------------------------------------------------- // FIXME: Like for X86, these should go in their own separate .td file. -def def32 : PatLeaf<(i32 GPR32:$src), [{ - return isDef32(*N); -}]>; - -// In the case of a 32-bit def that is known to implicitly zero-extend, -// we can use a SUBREG_TO_REG. -def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; - // For an anyext, we don't care what the high bits are, so we can perform an // INSERT_SUBREF into an IMPLICIT_DEF. 
def : Pat<(i64 (anyext GPR32:$src)), @@ -7387,99 +7430,16 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), // // Natural vector casts (64 bit) -def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in + foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in + def : Pat<(VT (AArch64NvCast (VT2 FPR64:$src))), + (VT FPR64:$src)>; // Natural vector casts (128 bit) -def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2f64 
(AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + def : Pat<(VT (AArch64NvCast (VT2 FPR128:$src))), + (VT FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; @@ -8093,17 
+8053,17 @@ defm : InsertSubvectorUndef; def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), +def : Pat<(f64 (any_fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, // so we match on v4f32 here, not v2f32. This will also catch adding // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; -def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), - (vector_extract (v8f16 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 6aefc1fdb599..eaf39fc0dbb1 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -9,6 +9,12 @@ // This file contains a pass that performs load / store related peephole // optimizations. This pass should be run after register allocation. // +// The pass runs after the PrologEpilogInserter where we emit the CFI +// instructions. In order to preserve the correctness of the unwind information, +// the pass should not change the order of any two instructions, one of which +// has the FrameSetup/FrameDestroy flag or, alternatively, it should apply an ad-hoc fix +// to unwind information. +// //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" @@ -31,6 +37,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -549,26 +556,6 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPDi: - case AArch64::LDPQi: - case AArch64::LDPWi: - case AArch64::LDPXi: - case AArch64::STPSi: - case AArch64::STPDi: - case AArch64::STPQi: - case AArch64::STPWi: - case AArch64::STPXi: - case AArch64::STGPi: - return true; - } -} - static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { unsigned OpcA = FirstMI.getOpcode(); @@ -603,7 +590,7 @@ static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { // Returns the scale and offset range of pre/post indexed variants of MI. static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, int &MinOffset, int &MaxOffset) { - bool IsPaired = isPairedLdSt(MI); + bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI); bool IsTagStore = isTagStore(MI); // ST*G and all paired ldst have the same scale in pre/post-indexed variants // as in the "unsigned offset" variant. 
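To make the new file-header comment concrete: once PrologEpilogInserter has emitted CFI directives, moving a frame-related instruction past another instruction can leave the unwind tables describing the wrong machine state. A hypothetical helper (not part of the patch) sketching the conservative rule the comment states:

    #include "llvm/CodeGen/MachineInstr.h"
    using namespace llvm;

    // Two instructions may be freely reordered only when neither carries a
    // frame flag; otherwise the pass must either give up or patch the unwind
    // information itself (as mergeUpdateInsn later does via maybeMoveCFI).
    static bool mayReorderWithoutCFIFixup(const MachineInstr &A,
                                          const MachineInstr &B) {
      auto IsFrameRelated = [](const MachineInstr &MI) {
        return MI.getFlag(MachineInstr::FrameSetup) ||
               MI.getFlag(MachineInstr::FrameDestroy);
      };
      return !IsFrameRelated(A) && !IsFrameRelated(B);
    }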
@@ -625,17 +612,8 @@ static MachineOperand &getLdStRegOp(MachineInstr &MI, bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI); if (IsPreLdSt) PairedRegOp += 1; - unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; - return MI.getOperand(Idx); -} - -static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { - unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1; - return MI.getOperand(Idx); -} - -static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { - unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2; + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; return MI.getOperand(Idx); } @@ -645,12 +623,14 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = TII->getMemScale(LoadInst); int StoreSize = TII->getMemScale(StoreInst); - int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst) - ? getLdStOffsetOp(StoreInst).getImm() - : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst) - ? getLdStOffsetOp(LoadInst).getImm() - : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + int UnscaledStOffset = + TII->hasUnscaledLdStOffset(StoreInst) + ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() + : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = + TII->hasUnscaledLdStOffset(LoadInst) + ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() + : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } @@ -729,7 +709,7 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { case AArch64::STPWi: case AArch64::STPXi: // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) return false; return true; @@ -763,17 +743,18 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); + MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI) + : AArch64InstrInfo::getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI; - if (getLdStOffsetOp(*I).getImm() == - getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) + if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() == + AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) RtMI = &*MergeMI; else RtMI = &*I; - int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); // Change the scaled offset from small to large type. 
if (IsScaled) { assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); @@ -923,6 +904,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, assert(all_of(MI.operands(), [this, &RenameReg](const MachineOperand &MOP) { return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || + MOP.isUndef() || !TRI->regsOverlap(MOP.getReg(), *RenameReg); }) && "Rename register used between paired instruction, trashing the " @@ -936,10 +918,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired) + : AArch64InstrInfo::getLdStBaseOp(*I); - int Offset = getLdStOffsetOp(*I).getImm(); - int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm(); + int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm(); bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode()); if (IsUnscaled != PairedIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. If @@ -974,7 +957,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = &*I; Rt2MI = &*Paired; } - int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); // Scale the immediate offset, if necessary. if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) { assert(!(OffsetImm % TII->getMemScale(*RtMI)) && @@ -1132,12 +1115,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) && "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); - int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(*LoadI).getImm() - : getLdStOffsetOp(*LoadI).getImm() * LoadSize; - int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(*StoreI).getImm() - : getLdStOffsetOp(*StoreI).getImm() * StoreSize; + int UnscaledLdOffset = + IsUnscaled + ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() + : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize; + int UnscaledStOffset = + IsUnscaled + ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() + : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; Register DestReg = IsStoreXReg ? Register(TRI->getMatchingSuperReg( @@ -1235,7 +1220,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; - Register BaseReg = getLdStBaseOp(LoadMI).getReg(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. @@ -1264,7 +1249,8 @@ bool AArch64LoadStoreOpt::findMatchingStore( // Also we can't handle stores without an immediate offset operand, // while the operand might be the address for a global variable. 
if (MI.mayStore() && isMatchingStore(LoadMI, MI) && - BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() && + BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() && + AArch64InstrInfo::getLdStOffsetOp(MI).isImm() && isLdOffsetInRangeOfSt(LoadMI, MI, TII) && ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { StoreI = MBBI; @@ -1467,18 +1453,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, return true; } -// Check if we can find a physical register for renaming. This register must: -// * not be defined up to FirstMI (checking DefinedInBB) -// * not used between the MI and the defining instruction of the register to -// rename (checked using UsedInBetween). +// Check if we can find a physical register for renaming \p Reg. This register +// must: +// * not be defined already in \p DefinedInBB; DefinedInBB must contain all +// defined registers up to the point where the renamed register will be used, +// * not used in \p UsedInBetween; UsedInBetween must contain all accessed +// registers in the range the rename register will be used, // * is available in all used register classes (checked using RequiredClasses). static Optional tryToFindRegisterToRename( - MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB, + const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween, SmallPtrSetImpl &RequiredClasses, const TargetRegisterInfo *TRI) { - auto &MF = *FirstMI.getParent()->getParent(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const MachineRegisterInfo &RegInfo = MF.getRegInfo(); // Checks if any sub- or super-register of PR is callee saved. auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) { @@ -1499,7 +1486,7 @@ static Optional tryToFindRegisterToRename( }); }; - auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); + auto *RegClass = TRI->getMinimalPhysRegClass(Reg); for (const MCPhysReg &PR : *RegClass) { if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && @@ -1530,8 +1517,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); Register Reg = getLdStRegOp(FirstMI).getReg(); - Register BaseReg = getLdStBaseOp(FirstMI).getReg(); - int Offset = getLdStOffsetOp(FirstMI).getImm(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1566,7 +1553,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, Flags.setSExtIdx(-1); if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && - getLdStOffsetOp(MI).isImm()) { + AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) { assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. @@ -1574,8 +1561,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. 
- Register MIBaseReg = getLdStBaseOp(MI).getReg(); - int MIOffset = getLdStOffsetOp(MI).getImm(); + Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg(); + int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI); if (IsUnscaled != MIIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. @@ -1606,15 +1593,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // can't be paired: bail and keep looking. if (IsPreLdSt) { bool IsOutOfBounds = MIOffset != TII->getMemScale(MI); - bool IsBaseRegUsed = - !UsedRegUnits.available(getLdStBaseOp(MI).getReg()); - bool IsBaseRegModified = - !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg()); + bool IsBaseRegUsed = !UsedRegUnits.available( + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); + bool IsBaseRegModified = !ModifiedRegUnits.available( + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); // If the stored value and the address of the second instruction is // the same, it needs to be using the updated register and therefore // it must not be folded. - bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(), - getLdStBaseOp(MI).getReg()); + bool IsMIRegTheSame = + TRI->regsOverlap(getLdStRegOp(MI).getReg(), + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified || IsMIRegTheSame) { LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, @@ -1722,8 +1710,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, if (*MaybeCanRename) { Optional MaybeRenameReg = tryToFindRegisterToRename( - FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses, - TRI); + *FirstMI.getParent()->getParent(), Reg, DefinedInBB, + UsedInBetween, RequiredClasses, TRI); if (MaybeRenameReg) { Flags.setRenameReg(*MaybeRenameReg); Flags.setMergeForward(true); @@ -1760,6 +1748,28 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; } +static MachineBasicBlock::iterator +maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) { + auto End = MI.getParent()->end(); + if (MaybeCFI == End || + MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION || + !(MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) || + AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP) + return End; + + const MachineFunction &MF = *MI.getParent()->getParent(); + unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex(); + const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex]; + switch (CFI.getOperation()) { + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpDefCfaOffset: + return MaybeCFI; + default: + return End; + } +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, @@ -1769,6 +1779,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, "Unexpected base register update instruction to merge!"); MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator NextI = next_nodbg(I, E); + + // If updating the SP and the following instruction is CFA offset related CFI + // instruction move it after the merged instruction. + MachineBasicBlock::iterator CFI = + IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E; + // Return the instruction following the merged instruction, which is // the instruction following our unmerged load. 
Unless that's the add/sub // instruction we're merging, in which case it's the one after that. @@ -1786,12 +1802,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, MachineInstrBuilder MIB; int Scale, MinOffset, MaxOffset; getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); - if (!isPairedLdSt(*I)) { + if (!AArch64InstrInfo::isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) - .add(getLdStBaseOp(*I)) + .add(AArch64InstrInfo::getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); @@ -1801,12 +1817,15 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I, 0)) .add(getLdStRegOp(*I, 1)) - .add(getLdStBaseOp(*I)) + .add(AArch64InstrInfo::getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); } - (void)MIB; + if (CFI != E) { + MachineBasicBlock *MBB = I->getParent(); + MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI); + } if (IsPreIdx) { ++NumPreFolded; @@ -1888,8 +1907,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - Register BaseReg = getLdStBaseOp(MemMI).getReg(); - int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * + TII->getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions // can't be formed if the memory instruction doesn't have the offset we're @@ -1904,7 +1924,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // behavior in this case unlike normal stores, and always performs writeback // after reading the source register value. if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { - bool IsPairedInsn = isPairedLdSt(MemMI); + bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { Register DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) @@ -1965,8 +1985,8 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator MBBI = I; MachineFunction &MF = *MemMI.getMF(); - Register BaseReg = getLdStBaseOp(MemMI).getReg(); - int Offset = getLdStOffsetOp(MemMI).getImm(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. @@ -1975,7 +1995,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // If the base register overlaps a destination register, we can't // merge the update. if (!isTagStore(MemMI)) { - bool IsPairedInsn = isPairedLdSt(MemMI); + bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 
2 : 1; i != e; ++i) { Register DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) @@ -2045,7 +2065,7 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( // Make sure this is a reg+imm. // FIXME: It is possible to extend it to handle reg+reg cases. - if (!getLdStOffsetOp(MI).isImm()) + if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) return false; // Look backward up to LdStLimit instructions. @@ -2099,7 +2119,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { // range, plus allow an extra one in case we find a later insn that matches // with Offset-1) bool IsUnscaled = TII->hasUnscaledLdStOffset(MI); - int Offset = getLdStOffsetOp(MI).getImm(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1; // Allow one more for offset. if (Offset > 0) @@ -2166,7 +2186,8 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // The immediate in the load/store is scaled by the size of the memory // operation. The immediate in the add we're looking for, // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI); + int UnscaledOffset = + AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI); // Look forward to try to find a pre-index instruction. For example, // ldr x1, [x0, #64] @@ -2268,7 +2289,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; - Subtarget = &static_cast(Fn.getSubtarget()); + Subtarget = &Fn.getSubtarget(); TII = static_cast(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); AA = &getAnalysis().getAAResults(); diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 1fc5617b49f6..5c7fb0deecd0 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -60,12 +60,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + using OpcodePair = std::pair; template using SplitAndOpcFunc = - std::function(T, unsigned, T &, T &)>; + std::function(T, unsigned, T &, T &)>; using BuildMIFunc = - std::function; + std::function; /// For instructions where an immediate operand could be split into two /// separate immediate instructions, use the splitTwoPartImm two handle the @@ -83,20 +84,19 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { /// %dst = ri %tmp (encode half IMM) [...] 
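The doc comment above describes splitting one wide immediate into two 12-bit halves so that a MOVi32imm feeding an add/sub can be rewritten as two immediate-form instructions. A standalone sketch of that arithmetic, using a hypothetical helper name (the in-tree splitAddSubImm is the authoritative version):

    #include <cstdint>
    #include <optional>
    #include <utility>

    // Returns {Imm0, Imm1} such that Imm == (Imm0 << 12) + Imm1 with both
    // halves encodable as 12-bit add/sub immediates; nullopt when a single
    // immediate instruction suffices or the value needs a full MOV sequence.
    std::optional<std::pair<uint64_t, uint64_t>> splitImm24(uint64_t Imm) {
      if ((Imm >> 24) != 0)
        return std::nullopt;           // too wide for two shifted immediates
      uint64_t Imm1 = Imm & 0xfffULL;  // low half, plain #imm
      uint64_t Imm0 = Imm >> 12;       // high half, #imm with LSL #12
      if (Imm0 == 0 || Imm1 == 0)
        return std::nullopt;           // one immediate instruction handles it
      return std::make_pair(Imm0, Imm1);
    }

For example, adding 0x123456 to a register then becomes "add x0, x0, #0x123, lsl #12" followed by "add x0, x0, #0x456".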
template bool splitTwoPartImm(MachineInstr &MI, - SmallSetVector &ToBeRemoved, SplitAndOpcFunc SplitAndOpc, BuildMIFunc BuildInstr); bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, MachineInstr *&SubregToRegMI); template - bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, - SmallSetVector &ToBeRemoved); + bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI); template - bool visitAND(unsigned Opc, MachineInstr &MI, - SmallSetVector &ToBeRemoved); - bool visitORR(MachineInstr &MI, - SmallSetVector &ToBeRemoved); + bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + + template + bool visitAND(unsigned Opc, MachineInstr &MI); + bool visitORR(MachineInstr &MI); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -157,8 +157,7 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { template bool AArch64MIPeepholeOpt::visitAND( - unsigned Opc, MachineInstr &MI, - SmallSetVector &ToBeRemoved) { + unsigned Opc, MachineInstr &MI) { // Try below transformation. // // MOVi32imm + ANDWrr ==> ANDWri + ANDWri @@ -170,28 +169,27 @@ bool AArch64MIPeepholeOpt::visitAND( // mov + and instructions. return splitTwoPartImm( - MI, ToBeRemoved, - [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { + MI, + [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) - return Opc; + return std::make_pair(Opc, Opc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) .addReg(NewTmpReg) .addImm(Imm1); }); } -bool AArch64MIPeepholeOpt::visitORR( - MachineInstr &MI, SmallSetVector &ToBeRemoved) { +bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { // Check this ORR comes from below zero-extend pattern. // // def : Pat<(i64 (zext GPR32:$src)), @@ -216,19 +214,38 @@ bool AArch64MIPeepholeOpt::visitORR( // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is // real AArch64 instruction and if it is not, do not process the opcode // conservatively. - if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) + if (SrcMI->getOpcode() == TargetOpcode::COPY && + SrcMI->getOperand(1).getReg().isVirtual()) { + const TargetRegisterClass *RC = + MRI->getRegClass(SrcMI->getOperand(1).getReg()); + + // A COPY from an FPR will become a FMOVSWr, so do so now so that we know + // that the upper bits are zero. 
+ if (RC != &AArch64::FPR32RegClass && + ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) || + SrcMI->getOperand(1).getSubReg() != AArch64::ssub)) + return false; + Register CpySrc = SrcMI->getOperand(1).getReg(); + if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) { + CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), + TII->get(TargetOpcode::COPY), CpySrc) + .add(SrcMI->getOperand(1)); + } + BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), + TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg()) + .addReg(CpySrc); + SrcMI->eraseFromParent(); + } + else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) return false; Register DefReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(2).getReg(); MRI->replaceRegWith(DefReg, SrcReg); MRI->clearKillFlags(SrcReg); - // replaceRegWith changes MI's definition register. Keep it for SSA form until - // deleting MI. - MI.getOperand(0).setReg(DefReg); - ToBeRemoved.insert(&MI); - LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); + MI.eraseFromParent(); return true; } @@ -255,8 +272,7 @@ static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { template bool AArch64MIPeepholeOpt::visitADDSUB( - unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, - SmallSetVector &ToBeRemoved) { + unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) { // Try below transformation. // // MOVi32imm + ADDWrr ==> ADDWri + ADDWri @@ -271,25 +287,65 @@ bool AArch64MIPeepholeOpt::visitADDSUB( // multiple `mov` + `and/sub` instructions. return splitTwoPartImm( - MI, ToBeRemoved, + MI, [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> Optional { + T &Imm1) -> Optional { if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) - return PosOpc; + return std::make_pair(PosOpc, PosOpc); if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) - return NegOpc; + return std::make_pair(NegOpc, NegOpc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0) .addImm(12); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1) + .addImm(0); + }); +} + +template +bool AArch64MIPeepholeOpt::visitADDSSUBS( + OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) { + // Try the same transformation as ADDSUB but with additional requirement + // that the condition code usages are only for Equal and Not Equal + return splitTwoPartImm( + MI, + [PosOpcs, NegOpcs, &MI, &TRI = TRI, &MRI = MRI]( + T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { + OpcodePair OP; + if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) + OP = PosOpcs; + else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) + OP = NegOpcs; + else + return None; + // Check conditional uses last since it is expensive for scanning + // proceeding instructions + MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg()); + Optional NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI); + if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) + return None; + return OP; + }, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, + unsigned 
@@ -255,8 +272,7 @@ static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
 
 template <typename T>
 bool AArch64MIPeepholeOpt::visitADDSUB(
-    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
-    SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
   // Try the below transformation.
   //
   // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
   //
@@ -271,25 +287,65 @@ bool AArch64MIPeepholeOpt::visitADDSUB(
   // multiple `mov` + `and/sub` instructions.
 
   return splitTwoPartImm<T>(
-      MI, ToBeRemoved,
+      MI,
       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
-                       T &Imm1) -> Optional<unsigned> {
+                       T &Imm1) -> Optional<OpcodePair> {
         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
-          return PosOpc;
+          return std::make_pair(PosOpc, PosOpc);
         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
-          return NegOpc;
+          return std::make_pair(NegOpc, NegOpc);
         return None;
       },
-      [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
                    Register NewDstReg) {
         DebugLoc DL = MI.getDebugLoc();
         MachineBasicBlock *MBB = MI.getParent();
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
             .addReg(SrcReg)
             .addImm(Imm0)
             .addImm(12);
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
+            .addReg(NewTmpReg)
+            .addImm(Imm1)
+            .addImm(0);
+      });
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSSUBS(
+    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
+  // Try the same transformation as ADDSUB, but with the additional requirement
+  // that the condition code is only used for Equal and Not Equal comparisons.
+  return splitTwoPartImm<T>(
+      MI,
+      [PosOpcs, NegOpcs, &MI, &TRI = TRI, &MRI = MRI](
+          T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> {
+        OpcodePair OP;
+        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
+          OP = PosOpcs;
+        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
+          OP = NegOpcs;
+        else
+          return None;
+        // Check the conditional uses last, since scanning the instructions in
+        // between is expensive.
+        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+        Optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
+        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
+          return None;
+        return OP;
+      },
+      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
+                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
+                   Register NewDstReg) {
+        DebugLoc DL = MI.getDebugLoc();
+        MachineBasicBlock *MBB = MI.getParent();
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
+            .addReg(SrcReg)
+            .addImm(Imm0)
+            .addImm(12);
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
             .addReg(NewTmpReg)
             .addImm(Imm1)
             .addImm(0);
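Both visitADDSUB and visitADDSSUBS rely on the same decomposition: the immediate must split as (Imm0 << 12) + Imm1 with two non-zero 12-bit halves, matching the shifted-immediate forms of add/sub. A minimal stand-alone sketch of that check (it leaves out the pass's extra test that the constant cannot already be materialized by a single mov):

#include <cstdint>
#include <cstdio>

// True if Imm == (Imm0 << 12) + Imm1 with both halves non-zero 12-bit values,
// i.e. it can be folded into two add/sub shifted-immediate instructions.
static bool splitTwoPartImm(uint64_t Imm, uint64_t &Imm0, uint64_t &Imm1) {
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~0xffffffULL) != 0)
    return false;
  Imm0 = (Imm >> 12) & 0xfff; // becomes "add ..., #Imm0, lsl #12"
  Imm1 = Imm & 0xfff;         // becomes "add ..., #Imm1"
  return true;
}

int main() {
  uint64_t Hi, Lo;
  if (splitTwoPartImm(0x123456, Hi, Lo))
    printf("add x0, x1, #0x%llx, lsl #12 ; add x0, x0, #0x%llx\n",
           (unsigned long long)Hi, (unsigned long long)Lo);
  return 0;
}

For the flag-setting variants only the second instruction may set flags, and splitting the operation changes the carry and overflow that a single ADDS/SUBS of the full immediate would have produced, while Z (and N) of the final instruction remain meaningful. That is why the first lambda above rejects any consumer of C or V and keeps the transform only for Equal/Not-Equal style uses.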
@@ -338,7 +394,7 @@ bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
 
 template <typename T>
 bool AArch64MIPeepholeOpt::splitTwoPartImm(
-    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+    MachineInstr &MI,
     SplitAndOpcFunc SplitAndOpc, BuildMIFunc BuildInstr) {
   unsigned RegSize = sizeof(T) * 8;
   assert((RegSize == 32 || RegSize == 64) &&
@@ -357,39 +413,63 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm(
   // number since it was sign extended when we assign to the 64-bit Imm.
   if (SubregToRegMI)
     Imm &= 0xFFFFFFFF;
-  unsigned Opcode;
+  OpcodePair Opcode;
   if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
-    Opcode = R.getValue();
+    Opcode = *R;
   else
     return false;
 
-  // Create new ADD/SUB MIs.
+  // Create new MIs using the first and second opcodes. The opcodes might
+  // differ for flag-setting operations that should set flags only on the
+  // second instruction:
+  //   NewTmpReg = Opcode.first  SrcReg    Imm0
+  //   NewDstReg = Opcode.second NewTmpReg Imm1
+
+  // Determine the register classes for the destinations and register operands.
   MachineFunction *MF = MI.getMF();
-  const TargetRegisterClass *RC =
-      TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
-  const TargetRegisterClass *ORC =
-      TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
+  const TargetRegisterClass *FirstInstrDstRC =
+      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
+  const TargetRegisterClass *FirstInstrOperandRC =
+      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
+  const TargetRegisterClass *SecondInstrDstRC =
+      (Opcode.first == Opcode.second)
+          ? FirstInstrDstRC
+          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
+  const TargetRegisterClass *SecondInstrOperandRC =
+      (Opcode.first == Opcode.second)
+          ? FirstInstrOperandRC
+          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
+
+  // Get the old destination register and create the new ones.
   Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
-  Register NewTmpReg = MRI->createVirtualRegister(RC);
-  Register NewDstReg = MRI->createVirtualRegister(RC);
-
-  MRI->constrainRegClass(SrcReg, RC);
-  MRI->constrainRegClass(NewTmpReg, ORC);
-  MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
-
+  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
+  // If DstReg is not virtual (likely WZR or XZR), reuse that same destination
+  // register.
+  Register NewDstReg = DstReg.isVirtual()
+                           ? MRI->createVirtualRegister(SecondInstrDstRC)
+                           : DstReg;
+
+  // Constrain the registers based on their new uses.
+  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
+  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
+  if (DstReg != NewDstReg)
+    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
+
+  // Call the delegating operation to build the instructions.
   BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
 
-  MRI->replaceRegWith(DstReg, NewDstReg);
   // replaceRegWith changes MI's definition register. Keep it for SSA form until
-  // deleting MI.
-  MI.getOperand(0).setReg(DstReg);
+  // deleting MI, but only if we made a new destination register.
+  if (DstReg != NewDstReg) {
+    MRI->replaceRegWith(DstReg, NewDstReg);
+    MI.getOperand(0).setReg(DstReg);
+  }
 
-  // Record the MIs that need to be removed.
-  ToBeRemoved.insert(&MI);
+  // Remove the MIs that are now dead.
+  MI.eraseFromParent();
   if (SubregToRegMI)
-    ToBeRemoved.insert(SubregToRegMI);
-  ToBeRemoved.insert(MovMI);
+    SubregToRegMI->eraseFromParent();
+  MovMI->eraseFromParent();
 
   return true;
 }
 
@@ -407,45 +487,57 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   assert(MRI->isSSA() && "Expected to be run on SSA form!");
 
   bool Changed = false;
-  SmallSetVector<MachineInstr *, 8> ToBeRemoved;
 
   for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
+    for (MachineInstr &MI : make_early_inc_range(MBB)) {
       switch (MI.getOpcode()) {
       default:
        break;
      case AArch64::ANDWrr:
-        Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
+        Changed = visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
-        Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
+        Changed = visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
-        Changed = visitORR(MI, ToBeRemoved);
+        Changed = visitORR(MI);
        break;
      case AArch64::ADDWrr:
-        Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
-                                        ToBeRemoved);
+        Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
-        Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
-                                        ToBeRemoved);
+        Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
-        Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
-                                        ToBeRemoved);
+        Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
-        Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
-                                        ToBeRemoved);
+        Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
+        break;
+      case AArch64::ADDSWrr:
+        Changed = visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
+                                          {AArch64::SUBWri, AArch64::SUBSWri},
+                                          MI);
+        break;
+      case AArch64::SUBSWrr:
+        Changed = visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
+                                          {AArch64::ADDWri, AArch64::ADDSWri},
+                                          MI);
+        break;
+      case AArch64::ADDSXrr:
+        Changed = visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
+                                          {AArch64::SUBXri, AArch64::SUBSXri},
+                                          MI);
+        break;
+      case AArch64::SUBSXrr:
+        Changed = visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
+                                          {AArch64::ADDXri, AArch64::ADDSXri},
+                                          MI);
        break;
      }
    }
  }
 
-  for (MachineInstr *MI : ToBeRemoved)
-    MI->eraseFromParent();
-
   return Changed;
 }
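Erasing instructions from inside this loop is only safe because make_early_inc_range advances the iterator before handing each element to the loop body. A plain C++ sketch of the same idiom over a std::list (not LLVM's implementation):

#include <cstdio>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3, 4, 5};
  for (auto It = Block.begin(); It != Block.end();) {
    auto Cur = It++;    // advance first, as llvm::make_early_inc_range does
    if (*Cur % 2 == 0)
      Block.erase(Cur); // erasing the visited element leaves It valid
  }
  for (int V : Block)
    printf("%d ", V);   // prints: 1 3 5
  printf("\n");
  return 0;
}

The visited instruction, or an earlier one such as the feeding mov, may be erased freely; erasing the not-yet-visited next instruction would still invalidate the prefetched iterator.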
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 6950675c5d53..a2ab2b855d80 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -15,8 +15,11 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64InstrInfo.h"
-#include <llvm/IR/Metadata.h>
-#include <llvm/IR/Module.h>
+#include "AArch64Subtarget.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
 
 using namespace llvm;
 
@@ -30,7 +33,7 @@ void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
 
 void AArch64FunctionInfo::initializeBaseYamlFields(
     const yaml::AArch64FunctionInfo &YamlMFI) {
-  if (YamlMFI.HasRedZone.hasValue())
+  if (YamlMFI.HasRedZone)
     HasRedZone = YamlMFI.HasRedZone;
 }
 
@@ -77,15 +80,17 @@ static bool ShouldSignWithBKey(const Function &F) {
   return Key.equals_insensitive("b_key");
 }
 
-AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
+AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF_) : MF(&MF_) {
   // If we already know that the function doesn't have a redzone, set
   // HasRedZone here.
-  if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+  if (MF->getFunction().hasFnAttribute(Attribute::NoRedZone))
     HasRedZone = false;
 
-  const Function &F = MF.getFunction();
+  const Function &F = MF->getFunction();
   std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F);
   SignWithBKey = ShouldSignWithBKey(F);
+  // TODO: skip functions that have no instrumented allocas for optimization
+  IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag);
 
   if (!F.hasFnAttribute("branch-target-enforcement")) {
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
@@ -101,6 +106,15 @@ AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
   BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
 }
 
+MachineFunctionInfo *AArch64FunctionInfo::clone(
+    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+    const {
+  AArch64FunctionInfo *InfoClone = DestMF.cloneInfo<AArch64FunctionInfo>(*this);
+  InfoClone->MF = &DestMF;
+  return InfoClone;
+}
+
 bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
   if (!SignReturnAddress)
     return false;
@@ -111,6 +125,27 @@
 
 bool AArch64FunctionInfo::shouldSignReturnAddress() const {
   return shouldSignReturnAddress(llvm::any_of(
-      MF.getFrameInfo().getCalleeSavedInfo(),
+      MF->getFrameInfo().getCalleeSavedInfo(),
       [](const auto &Info) { return Info.getReg() == AArch64::LR; }));
 }
+
+bool AArch64FunctionInfo::needsDwarfUnwindInfo() const {
+  if (!NeedsDwarfUnwindInfo)
+    NeedsDwarfUnwindInfo = MF->needsFrameMoves() &&
+                           !MF->getTarget().getMCAsmInfo()->usesWindowsCFI();
+
+  return *NeedsDwarfUnwindInfo;
+}
+
+bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo() const {
+  if (!NeedsAsyncDwarfUnwindInfo) {
+    const Function &F = MF->getFunction();
+    // The check for "minsize" is because epilogue unwind info is not emitted
+    // (yet) for homogeneous epilogues, outlined functions, and functions
+    // that code was outlined from.
+    NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo() &&
+                                F.getUWTableKind() == UWTableKind::Async &&
+                                !F.hasMinSize();
+  }
+  return *NeedsAsyncDwarfUnwindInfo;
+}
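needsDwarfUnwindInfo and needsAsyncDwarfUnwindInfo follow the same lazily computed, memoized pattern: an Optional<bool> starts out empty, is filled on the first query, and answers from the cache afterwards, which is why the fields are mutable inside const methods. A stand-alone sketch of the pattern with std::optional and a stubbed-out computation (hypothetical names, not the LLVM classes):

#include <cstdio>
#include <optional>

struct FuncInfo {
  mutable std::optional<bool> NeedsUnwindInfo; // empty until first query

  bool computeExpensively() const { return true; } // stand-in for real checks

  bool needsUnwindInfo() const {
    if (!NeedsUnwindInfo)                  // first call: compute and cache
      NeedsUnwindInfo = computeExpensively();
    return *NeedsUnwindInfo;               // later calls: cached answer
  }
};

int main() {
  FuncInfo FI;
  printf("%d\n", FI.needsUnwindInfo());
  return 0;
}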
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index e5e08e6c00d6..f070f989a5b7 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
@@ -36,7 +37,7 @@ class MachineInstr;
 /// contains private AArch64-specific information for each MachineFunction.
 class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// Backreference to the machine function.
-  MachineFunction &MF;
+  MachineFunction *MF;
 
   /// Number of bytes of arguments this function has on the stack. If the callee
   /// is expected to restore the argument stack this should be a multiple of 16,
@@ -115,7 +116,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// SRetReturnReg - sret lowering includes returning the value of the
   /// returned struct in a register. This field holds the virtual register into
   /// which the sret argument is passed.
-  unsigned SRetReturnReg = 0;
+  Register SRetReturnReg;
+
   /// SVE stack sizes (for predicates and data vectors) are maintained here
   /// rather than in FrameInfo, as the placement and Stack IDs are target
   /// specific.
@@ -173,9 +175,29 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// The stack slot where the Swift asynchronous context is stored.
   int SwiftAsyncContextFrameIdx = std::numeric_limits<int>::max();
 
+  bool IsMTETagged = false;
+
+  /// True if the function has a Scalable Vector or Scalable Predicate
+  /// register argument or return type.
+  bool IsSVECC = false;
+
+  /// True if the function needs unwind information.
+  mutable Optional<bool> NeedsDwarfUnwindInfo;
+
+  /// True if the function needs asynchronous unwind information.
+  mutable Optional<bool> NeedsAsyncDwarfUnwindInfo;
+
 public:
   explicit AArch64FunctionInfo(MachineFunction &MF);
 
+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override;
+
+  bool isSVECC() const { return IsSVECC; }
+  void setIsSVECC(bool s) { IsSVECC = s; }
+
   void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
 
   unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
@@ -395,6 +417,7 @@ public:
   bool shouldSignReturnAddress(bool SpillsLR) const;
 
   bool shouldSignWithBKey() const { return SignWithBKey; }
+  bool isMTETagged() const { return IsMTETagged; }
 
   bool branchTargetEnforcement() const { return BranchTargetEnforcement; }
 
@@ -408,6 +431,9 @@ public:
   }
   int getSwiftAsyncContextFrameIdx() const { return SwiftAsyncContextFrameIdx; }
 
+  bool needsDwarfUnwindInfo() const;
+  bool needsAsyncDwarfUnwindInfo() const;
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
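The header change above turns the MF backreference into a pointer precisely so that the new clone() can retarget it at the destination function while member-wise copying every cached field. A minimal sketch of that shape (hypothetical types, not the LLVM classes):

#include <cstdio>

struct Function {};

struct Info {
  Function *F; // a pointer, so a clone can be re-pointed after copying
  explicit Info(Function &Fn) : F(&Fn) {}

  Info cloneFor(Function &Dest) const {
    Info Copy = *this; // member-wise copy keeps all cached fields
    Copy.F = &Dest;    // retarget the backreference, as clone() does above
    return Copy;
  }
};

int main() {
  Function A, B;
  Info IA(A);
  Info IB = IA.cloneFor(B);
  printf("%d\n", IB.F == &B); // 1
  return 0;
}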
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
new file mode 100644
index 000000000000..6c8845ee8598
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -0,0 +1,82 @@
+//===- AArch64MachineScheduler.cpp - MI Scheduler for AArch64 -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+
+using namespace llvm;
+
+static bool needReorderStoreMI(const MachineInstr *MI) {
+  if (!MI)
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STURQi:
+  case AArch64::STRQui:
+    if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+      return false;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPQi:
+    return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm();
+  }
+
+  return false;
+}
+
+// Return true if two stores with the same base address may overlap writes.
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+                            int64_t &Off0, int64_t &Off1) {
+  const MachineOperand &Base0 = AArch64InstrInfo::getLdStBaseOp(MI0);
+  const MachineOperand &Base1 = AArch64InstrInfo::getLdStBaseOp(MI1);
+
+  // The writes may overlap if the two stores do not share the same base.
+  if (!Base0.isIdenticalTo(Base1))
+    return true;
+
+  int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+  int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+  Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI0).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() * StoreSize0;
+  Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI1).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+  const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+  int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+  int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+  return llabs(Off0 - Off1) < StoreSize;
+}
+
+bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                              SchedCandidate &TryCand) {
+  bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
+
+  if (Cand.isValid()) {
+    MachineInstr *Instr0 = TryCand.SU->getInstr();
+    MachineInstr *Instr1 = Cand.SU->getInstr();
+
+    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+      return OriginalResult;
+
+    int64_t Off0, Off1;
+    // With the same base address and non-overlapping writes,
+    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+      TryCand.Reason = NodeOrder;
+      // order them by ascending offsets.
+      return Off0 < Off1;
+    }
+  }
+
+  return OriginalResult;
+}
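mayOverlapWrite boils down to interval arithmetic on byte offsets: unscaled opcodes already carry byte offsets, scaled ones are multiplied by the access size, and the distance between the two offsets is compared against the width of the lower-addressed store (doubled for store-pair instructions). A simplified stand-alone sketch that uses one width for both stores (assumed values in main):

#include <cstdio>

// Two stores at byte offsets Off0/Off1 from the same base overlap iff their
// distance is smaller than the width of the lower-addressed store.
static bool mayOverlap(long Off0, long Off1, long StoreSize) {
  long Dist = Off0 < Off1 ? Off1 - Off0 : Off0 - Off1;
  return Dist < StoreSize;
}

int main() {
  printf("%d\n", mayOverlap(0, 16, 16)); // 0: two 16-byte q-stores, adjacent
  printf("%d\n", mayOverlap(0, 8, 16));  // 1: second starts inside the first
  return 0;
}

When the writes are disjoint, tryCandidate overrides the generic post-RA order and prefers the store with the lower offset, so store addresses ascend.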
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.h b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h
new file mode 100644
index 000000000000..23df015986d1
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h
@@ -0,0 +1,33 @@
+//===- AArch64MachineScheduler.h - Custom AArch64 MI scheduler --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom AArch64 MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// A MachineSchedStrategy implementation for AArch64 post RA scheduling.
+class AArch64PostRASchedStrategy : public PostGenericScheduler {
+public:
+  AArch64PostRASchedStrategy(const MachineSchedContext *C) :
+    PostGenericScheduler(C) {}
+
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index e8217eaf6ed5..c7657f37d16d 100644
--- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -157,16 +157,19 @@ static bool isCryptoEORPair(const MachineInstr *FirstMI,
   return false;
 }
 
-/// Literal generation.
-static bool isLiteralsPair(const MachineInstr *FirstMI,
-                           const MachineInstr &SecondMI) {
+static bool isAdrpAddPair(const MachineInstr *FirstMI,
+                          const MachineInstr &SecondMI) {
   // Assume the 1st instr to be a wildcard if it is unspecified.
-
-  // PC relative address.
   if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::ADRP) &&
       SecondMI.getOpcode() == AArch64::ADDXri)
     return true;
+  return false;
+}
 
+/// Literal generation.
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+                           const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
   // 32 bit immediate.
   if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZWi) &&
       (SecondMI.getOpcode() == AArch64::MOVKWi &&
@@ -397,6 +400,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
     return true;
   if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstMI, SecondMI))
     return true;
+  if (ST.hasFuseAdrpAdd() && isAdrpAddPair(FirstMI, SecondMI))
+    return true;
   if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
     return true;
   if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index f443cd03935c..4555f1a3ebb0 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -14,6577 +14,6608 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
 
+#include "llvm/ADT/ArrayRef.h"
+
 // 31 entries have cost 0
-// 242 entries have cost 1
-// 1447 entries have cost 2
-// 3602 entries have cost 3
-// 1237 entries have cost 4
-// 2 entries have cost 5
+// 756 entries have cost 1
+// 3690 entries have cost 2
+// 2084 entries have cost 3
 
 // This table is 6561*4 = 26244 bytes in size.
-static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 
3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 
<3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 
2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 
vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 
2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> - 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, 
// <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, 
LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // 
<0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS - 
2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 
3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 , - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 
vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, - 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 
<2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: 
Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 , - 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 
<0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: 
Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: 
Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> - 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> - 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> - 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS - 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> - 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> - 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> - 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS - 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> - 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> - 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> - 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS - 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2598826490U, // <1,3,3,6>: Cost 3 vext1 , <6,2,7,3> - 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> - 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS - 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS - 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> - 3832761290U, // <1,3,4,2>: 
Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> - 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> - 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS - 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> - 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> - 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS - 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS - 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> - 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> - 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> - 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS - 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> - 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> - 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS - 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS - 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> - 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> - 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> - 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> - 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> - 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, - 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> - 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> - 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> - 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> - 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> - 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> - 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> - 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> - 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> - 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> - 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS - 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> - 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> - 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS - 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS - 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> - 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS - 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS - 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> - 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS - 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> - 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> - 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> - 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> - 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> - 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> - 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> - 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> - 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> - 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> - 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> - 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS - 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> - 2892402217U, // 
<1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS - 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> - 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> - 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> - 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> - 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> - 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> - 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> - 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS - 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS - 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> - 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> - 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> - 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> - 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> - 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> - 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> - 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> - 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> - 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> - 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> - 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> - 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> - 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS - 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> - 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> - 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS - 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS - 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> - 2598913774U, // <1,4,5,2>: Cost 3 vext1 , <2,3,u,1> - 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> - 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS - 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS - 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS - 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> - 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> - 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> - 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS - 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> - 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> - 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS - 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> - 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> - 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> - 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> - 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS - 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> - 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> - 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> - 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> - 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS - 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS - 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> - 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> - 
2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, - 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> - 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> - 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> - 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> - 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> - 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> - 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> - 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS - 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> - 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> - 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> - 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> - 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> - 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> - 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> - 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS - 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> - 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> - 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> - 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> - 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> - 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> - 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> - 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> - 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS - 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> - 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> - 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> - 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> - 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> - 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> - 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> - 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> - 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS - 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS - 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> - 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> - 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> - 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> - 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS - 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> - 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS - 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS - 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> - 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> - 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> - 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS - 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> - 2667794530U, // <1,5,5,6>: Cost 3 vext2 , <5,6,7,0> - 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> - 2557187886U, // 
<1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS - 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> - 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> - 2667794938U, // <1,5,6,2>: Cost 3 vext2 , <6,2,7,3> - 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> - 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> - 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> - 2667795256U, // <1,5,6,6>: Cost 3 vext2 , <6,6,6,6> - 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> - 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> - 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS - 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> - 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> - 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> - 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS - 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> - 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> - 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> - 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS - 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, - 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, - 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, - 1594054682U, // <1,5,u,4>: Cost 2 vext2 , - 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, - 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS - 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS - 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> - 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS - 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> - 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> - 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> - 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> - 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> - 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS - 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS - 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> - 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> - 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> - 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> - 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> - 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> - 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> - 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS - 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> - 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> - 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> - 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> - 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS - 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> - 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> - 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS - 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS - 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> - 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> - 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> - 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS - 
4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, - 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> - 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> - 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> - 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> - 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> - 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> - 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> - 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS - 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS - 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS - 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS - 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> - 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> - 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> - 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> - 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> - 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> - 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> - 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS - 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS - 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> - 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> - 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> - 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> - 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS - 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> - 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> - 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> - 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> - 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> - 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> - 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> - 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> - 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> - 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> - 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> - 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS - 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> - 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> - 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS - 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> - 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, - 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> - 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS - 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, - 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> - 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> - 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS - 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> - 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> - 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> - 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> - 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> - 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> - 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS - 2593128550U, // <1,7,1,0>: Cost 3 
vext1 <7,1,7,1>, LHS - 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> - 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> - 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> - 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS - 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> - 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> - 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> - 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> - 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS - 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> - 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> - 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> - 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS - 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> - 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> - 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> - 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS - 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS - 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> - 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> - 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> - 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS - 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> - 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> - 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> - 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS - 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> - 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> - 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> - 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> - 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> - 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS - 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> - 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> - 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS - 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS - 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> - 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> - 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> - 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS - 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> - 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> - 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS - 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS - 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> - 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> - 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> - 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> - 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS - 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> - 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> - 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> - 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> - 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> - 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> - 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> - 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> - 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS - 
2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> - 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> - 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> - 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> - 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS - 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS - 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> - 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> - 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS - 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> - 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> - 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> - 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS - 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> - 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS - 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> - 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> - 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> - 1658771190U, // <1,u,0,5>: Cost 2 vext3 , - 2736789248U, // <1,u,0,6>: Cost 3 vext3 , - 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> - 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS - 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS - 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS - 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> - 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS - 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS - 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS - 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 115726126U, // <1,u,3,2>: Cost 1 vrev LHS - 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS - 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS - 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> - 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS - 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS - 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> - 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> - 2263217967U, // <1,u,4,2>: Cost 3 vrev - 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> - 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS - 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS - 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> - 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, - 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS - 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS - 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> - 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2551434550U, // <1,u,5,4>: 
Cost 3 vext1 <0,1,u,5>, RHS - 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS - 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS - 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> - 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, - 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> - 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, - 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> - 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, - 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> - 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> - 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> - 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> - 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> - 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS - 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> - 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, - 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> - 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> - 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS - 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS - 115767091U, // <1,u,u,2>: Cost 1 vrev LHS - 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS - 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS - 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS - 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS - 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS - 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> - 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> - 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> - 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> - 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS - 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> - 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> - 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> - 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> - 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> - 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> - 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS - 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS - 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> - 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> - 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> - 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS - 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS - 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> - 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> - 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> - 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS - 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> - 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> - 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> - 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS - 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> - 2953625764U, // 
<2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> - 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> - 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS - 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> - 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> - 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> - 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> - 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS - 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> - 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> - 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> - 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS - 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> - 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> - 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS - 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> - 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> - 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> - 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> - 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> - 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> - 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> - 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> - 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS - 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> - 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> - 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> - 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> - 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> - 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> - 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> - 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> - 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> - 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> - 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> - 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> - 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> - 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> - 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS - 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> - 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> - 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS - 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS - 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS - 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS - 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> - 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> - 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS - 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> - 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> - 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> - 
1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> - 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS - 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> - 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> - 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> - 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS - 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> - 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> - 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> - 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> - 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS - 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> - 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> - 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> - 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS - 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> - 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> - 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> - 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> - 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS - 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> - 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> - 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS - 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> - 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> - 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> - 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> - 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> - 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> - 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS - 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS - 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS - 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> - 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> - 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS - 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> - 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> - 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> - 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS - 3740913668U, // <2,1,5,5>: Cost 4 vext2 , <5,5,5,5> - 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> - 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS - 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> - 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS - 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> - 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> - 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS - 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS - 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> - 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> - 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> - 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS - 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> - 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> - 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> - 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> - 3740915046U, 
// <2,1,7,4>: Cost 4 vext2 , <7,4,5,6>
-  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
-  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
-  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
[... roughly 1,200 further deleted PerfectShuffleTable entries, covering shuffle masks <2,1,7,u> through <3,7,6,2>; every deleted line has the same machine-generated shape as the samples above and below, a 32-bit table value followed by a "// <mask>: Cost N <op>, <operands>" comment ...]
-  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 , - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, - 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, - 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, - 
1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, - 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 
vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // 
<4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS 
- 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> - 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> - 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> - 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS - 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> - 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> - 2699208469U, // 
<4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> - 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS - 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> - 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> - 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS - 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS - 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> - 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> - 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS - 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS - 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> - 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> - 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS - 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS - 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS - 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> - 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> - 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS - 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS - 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> - 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> - 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> - 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS - 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> - 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> - 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> - 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> - 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> - 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> - 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> - 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS - 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> - 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> - 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS - 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS - 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> - 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS - 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> - 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> - 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> - 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> - 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> - 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> - 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> - 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> - 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> - 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> - 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> - 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> - 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> - 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> - 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> - 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> - 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> - 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> - 3626770534U, // <4,3,2,0>: 
Cost 4 vext1 <0,4,3,2>, LHS - 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> - 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> - 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> - 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS - 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> - 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> - 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> - 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> - 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> - 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> - 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> - 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> - 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> - 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> - 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> - 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> - 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> - 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> - 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> - 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> - 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> - 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> - 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> - 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> - 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS - 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> - 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> - 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> - 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS - 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> - 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> - 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> - 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS - 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS - 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> - 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> - 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> - 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS - 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> - 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> - 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> - 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> - 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> - 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> - 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> - 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> - 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> - 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> - 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> - 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> - 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> - 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS - 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> - 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> - 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> - 2559053110U, // 
<4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS - 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> - 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> - 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> - 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> - 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> - 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> - 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> - 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> - 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> - 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> - 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS - 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> - 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> - 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> - 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> - 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> - 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> - 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> - 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> - 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> - 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> - 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> - 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> - 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> - 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> - 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> - 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> - 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> - 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> - 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 
2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, 
<3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, 
// <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 
vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, 
<2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS - 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 , - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, - 1561118822U, // 
[Elided: a long run of machine-generated perfect-shuffle cost-table entries
removed by this hunk (PerfectShuffleTable data used by LLVM's ARM/AArch64
vector-shuffle lowering). In the original patch each entry sits on its own
"-"-prefixed line of the form

-  2618935446U,	// <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>

i.e. a 32-bit table value followed by a comment giving the four-element
shuffle mask, the instruction-count cost, and the NEON operation (copy, vrev,
vdup, vext1-3, vuzp, vzip, vtrn) with its one or two operand masks ("LHS" and
"RHS" name the unmodified input vectors). The run shown here covers the mask
groups <4,u,...> through <6,5,5,...>; a decoding sketch follows.]
2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 
<4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS - 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> 
- 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> - 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> - 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> - 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> - 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> - 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> - 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> - 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> - 
1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> - 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> - 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> - 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, - 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS - 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, - 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, - 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, - 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS - 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, - 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, - 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS - 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS - 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, - 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> - 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 1193130221U, // <6,u,0,7>: Cost 2 vrev - 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS - 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS - 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, - 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS - 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS - 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> - 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, - 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> - 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, - 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, - 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 1661245476U, // <6,u,4,2>: Cost 2 vext3 , - 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, - 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS - 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1661614161U, // <6,u,4,7>: Cost 2 vext3 , - 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS - 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS - 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> - 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, - 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> 
- 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS - 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, - 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS - 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> - 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS - 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS - 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS - 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> - 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> - 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS - 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS - 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> - 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> - 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS - 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS - 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS - 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS - 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, - 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS - 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS - 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS - 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS - 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS - 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> - 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> - 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> - 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> - 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> - 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> - 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> - 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> - 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS - 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> - 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS - 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> - 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS - 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> - 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> - 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> - 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> - 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> - 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> - 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> - 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> - 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> - 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> - 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> - 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> - 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> - 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> - 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> - 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> - 
3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> - 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> - 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> - 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> - 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> - 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> - 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> - 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS - 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> - 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> - 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> - 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS - 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> - 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> - 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> - 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> - 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> - 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> - 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> - 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> - 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> - 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> - 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> - 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> - 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> - 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> - 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> - 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> - 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS - 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS - 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> - 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> - 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> - 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> - 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> - 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> - 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> - 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> - 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS - 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> - 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> - 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS - 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> - 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, - 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS - 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS - 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS - 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS - 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> - 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS - 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> - 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> - 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> - 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> - 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> - 1638318920U, // <7,1,1,3>: 
Cost 2 vext3 RHS, <1,1,3,3> - 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> - 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> - 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> - 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> - 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> - 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> - 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> - 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> - 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> - 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> - 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> - 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> - 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> - 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> - 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> - 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> - 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> - 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> - 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> - 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> - 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> - 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> - 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> - 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> - 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> - 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> - 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> - 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS - 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> - 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> - 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS - 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS - 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> - 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> - 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> - 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS - 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> - 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> - 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> - 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> - 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> - 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> - 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> - 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> - 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> - 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> - 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> - 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> - 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> - 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> - 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> - 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS - 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS - 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> - 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> - 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> - 3121938539U, // <7,1,7,u>: 
Cost 3 vtrnr <5,7,5,7>, LHS - 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS - 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> - 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> - 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> - 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS - 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> - 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> - 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS - 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> - 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> - 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> - 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> - 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> - 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> - 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> - 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> - 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> - 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> - 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> - 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> - 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> - 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> - 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> - 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> - 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> - 2602718850U, // <7,2,1,7>: Cost 3 vext1 , <7,u,1,2> - 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> - 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> - 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> - 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> - 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> - 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> - 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> - 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> - 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> - 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> - 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> - 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> - 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> - 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> - 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> - 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> - 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> - 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> - 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> - 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> - 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> - 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> - 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> - 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> - 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> - 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> - 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> - 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> - 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> - 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> - 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> - 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> - 3766486907U, // <7,2,5,5>: Cost 4 vext3 
<1,3,5,7>, <2,5,5,7> - 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> - 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> - 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> - 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS - 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> - 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> - 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> - 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS - 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> - 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> - 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> - 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> - 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> - 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> - 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> - 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS - 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS - 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> - 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> - 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> - 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS - 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> - 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> - 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> - 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> - 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> - 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> - 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> - 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> - 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> - 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> - 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> - 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> - 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> - 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> - 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> - 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> - 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> - 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> - 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> - 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> - 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> - 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> - 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> - 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> - 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> - 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> - 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> - 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> - 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> - 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> - 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> - 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> - 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> - 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> - 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> - 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> - 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> - 2637031791U, 
// <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> - 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> - 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> - 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> - 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> - 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> - 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> - 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> - 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> - 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> - 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> - 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> - 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> - 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> - 2602819686U, // <7,3,5,0>: Cost 3 vext1 , LHS - 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> - 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> - 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> - 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> - 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> - 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> - 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> - 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> - 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> - 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> - 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> - 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> - 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> - 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> - 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> - 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> - 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> - 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> - 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> - 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> - 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> - 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> - 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> - 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> - 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> - 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> - 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> - 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> - 1593153452U, // <7,3,u,2>: Cost 2 vext2 , - 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> - 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> - 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> - 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> - 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> - 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> - 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> - 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> - 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> - 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, 
<4,0,7,1> - 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> - 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> - 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> - 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> - 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> - 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> - 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> - 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> - 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> - 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> - 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> - 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> - 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> - 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> - 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> - 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> - 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> - 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> - 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> - 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> - 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> - 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> - 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> - 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> - 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> - 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> - 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> - 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> - 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> - 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> - 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> - 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> - 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> - 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> - 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS - 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> - 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> - 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> - 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS - 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> - 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS - 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> - 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS - 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> - 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> - 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> - 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> - 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> - 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> - 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> - 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> - 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> - 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> - 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> - 2651641140U, // <7,4,7,4>: Cost 
3 vext2 <5,6,7,4>, <7,4,0,1> - 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> - 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> - 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> - 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS - 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> - 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> - 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> - 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> - 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS - 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> - 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS - 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS - 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS - 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> - 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> - 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> - 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> - 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> - 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> - 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> - 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> - 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> - 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> - 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> - 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> - 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> - 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> - 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> - 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> - 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> - 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> - 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> - 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> - 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS - 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> - 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> - 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> - 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS - 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> - 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> - 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> - 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> - 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS - 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> - 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> - 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> - 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS - 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS - 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> - 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> - 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> - 2652974800U, // <7,5,5,1>: Cost 3 
vext2 <5,u,7,5>, <5,1,7,3> - 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> - 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> - 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> - 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> - 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> - 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> - 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> - 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> - 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> - 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> - 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> - 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> - 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS - 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> - 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> - 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> - 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS - 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> - 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> - 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS - 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> - 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> - 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS - 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> - 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> - 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> - 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> - 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> - 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> - 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> - 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> - 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> - 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> - 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> - 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS - 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> - 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> - 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> - 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> - 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS - 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> - 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> - 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> - 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> - 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> - 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> - 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> - 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> - 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> - 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> - 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638470147U, // <7,6,2,u>: Cost 2 vext3 
RHS, <6,2,u,3> - 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> - 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> - 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> - 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> - 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> - 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> - 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> - 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> - 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> - 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> - 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> - 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> - 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> - 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> - 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS - 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> - 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> - 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS - 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS - 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> - 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> - 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> - 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> - 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> - 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> - 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> - 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> - 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> - 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> - 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> - 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> - 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> - 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> - 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> - 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> - 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> - 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> - 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> - 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> - 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> - 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> - 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> - 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> - 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> - 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> - 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> - 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> - 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS - 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> - 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> - 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> - 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> - 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> - 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> - 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> - 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> - 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> - 
2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> - 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> - 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> - 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> - 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> - 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> - 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS - 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> - 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> - 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> - 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> - 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> - 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> - 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> - 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> - 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> - 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> - 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> - 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> - 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> - 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> - 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> - 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> - 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> - 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> - 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> - 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> - 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> - 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> - 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> - 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> - 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> - 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> - 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> - 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> - 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> - 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> - 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> - 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS - 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> - 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> - 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> - 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> - 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> - 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> - 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> - 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> - 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS - 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> - 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> - 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> - 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS - 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> - 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> - 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> - 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> - 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS - 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> - 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> - 
2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> - 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS - 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> - 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> - 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS - 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS - 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> - 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> - 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> - 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS - 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> - 1595840756U, // <7,7,u,6>: Cost 2 vext2 , - 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS - 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, - 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, - 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, - 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, - 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, - 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, - 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS - 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, - 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS - 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS - 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, - 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS - 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, - 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> - 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, - 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, - 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, - 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, - 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, - 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, - 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, - 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, - 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, - 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, - 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, - 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, - 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, - 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, - 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, - 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, - 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, - 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, - 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, - 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, - 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, - 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, - 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS - 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> - 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, - 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, - 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> - 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> - 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS - 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, - 564582572U, // <7,u,5,u>: 
Cost 1 vext3 RHS, RHS - 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, - 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, - 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> - 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, - 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, - 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> - 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, - 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, - 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, - 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, - 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, - 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> - 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, - 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> - 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, - 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS - 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, - 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, - 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS - 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, - 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, - 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, - 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS - 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS - 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 1611489290U, // : Cost 2 vext3 LHS, <0,0,1,1> - 1611489300U, // : Cost 2 vext3 LHS, <0,0,2,2> - 2568054923U, // : Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1481706806U, // : Cost 2 vext1 <0,u,0,0>, RHS - 2555449040U, // : Cost 3 vext1 <0,u,0,0>, <5,1,7,3> - 2591282078U, // : Cost 3 vext1 <6,u,0,0>, <6,u,0,0> - 2591945711U, // : Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // : Cost 1 vdup0 LHS - 1493655654U, // : Cost 2 vext1 <2,u,0,1>, LHS - 1860550758U, // : Cost 2 vzipl LHS, LHS - 537747563U, // : Cost 1 vext3 LHS, LHS - 2625135576U, // : Cost 3 vext2 <1,2,u,0>, <1,3,1,3> - 1493658934U, // : Cost 2 vext1 <2,u,0,1>, RHS - 2625135760U, // : Cost 3 vext2 <1,2,u,0>, <1,5,3,7> - 1517548447U, // : Cost 2 vext1 <6,u,0,1>, <6,u,0,1> - 2591290362U, // : Cost 3 vext1 <6,u,0,1>, <7,0,1,2> - 537747612U, // : Cost 1 vext3 LHS, LHS - 1611489444U, // : Cost 2 vext3 LHS, <0,2,0,2> - 2685231276U, // : Cost 3 vext3 LHS, <0,2,1,1> - 1994768486U, // : Cost 2 vtrnl LHS, LHS - 2685231294U, // : Cost 3 vext3 LHS, <0,2,3,1> - 1611489484U, // : Cost 2 vext3 LHS, <0,2,4,6> - 2712068310U, // : Cost 3 vext3 RHS, <0,2,5,7> - 2625136570U, // : Cost 3 vext2 <1,2,u,0>, <2,6,3,7> - 2591962097U, // : Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1611489516U, // : Cost 2 vext3 LHS, <0,2,u,2> - 2954067968U, // : Cost 3 vzipr LHS, <0,0,0,0> - 2685231356U, // : Cost 3 vext3 LHS, <0,3,1,0> - 72589981U, // : Cost 1 vrev LHS - 2625137052U, // : Cost 3 vext2 <1,2,u,0>, <3,3,3,3> - 2625137154U, // : Cost 3 vext2 <1,2,u,0>, <3,4,5,6> - 2639071848U, // : Cost 3 vext2 <3,5,u,0>, <3,5,u,0> - 2639735481U, // : Cost 3 vext2 <3,6,u,0>, <3,6,u,0> - 2597279354U, // : Cost 3 vext1 <7,u,0,3>, <7,u,0,3> - 73032403U, // : Cost 1 vrev LHS - 2687074636U, // : Cost 3 vext3 <0,4,0,u>, <0,4,0,u> - 1611489618U, // : Cost 2 vext3 LHS, <0,4,1,5> - 1611489628U, // : Cost 2 vext3 LHS, <0,4,2,6> - 3629222038U, // : Cost 4 vext1 <0,u,0,4>, <3,0,1,2> - 2555481398U, // : Cost 3 vext1 <0,u,0,4>, RHS - 1551396150U, // : Cost 2 vext2 <1,2,u,0>, RHS - 2651680116U, // : Cost 3 vext2 <5,6,u,0>, <4,6,4,6> - 2646150600U, // : Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1611932050U, // : Cost 2 
vext3 LHS, <0,4,u,6> - 2561458278U, // : Cost 3 vext1 <1,u,0,5>, LHS - 1863532646U, // : Cost 2 vzipl RHS, LHS - 2712068526U, // : Cost 3 vext3 RHS, <0,5,2,7> - 2649689976U, // : Cost 3 vext2 <5,3,u,0>, <5,3,u,0> - 2220237489U, // : Cost 3 vrev <0,u,4,5> - 2651680772U, // : Cost 3 vext2 <5,6,u,0>, <5,5,5,5> - 1577939051U, // : Cost 2 vext2 <5,6,u,0>, <5,6,u,0> - 2830077238U, // : Cost 3 vuzpr <1,u,3,0>, RHS - 1579266317U, // : Cost 2 vext2 <5,u,u,0>, <5,u,u,0> - 2555494502U, // : Cost 3 vext1 <0,u,0,6>, LHS - 2712068598U, // : Cost 3 vext3 RHS, <0,6,1,7> - 1997750374U, // : Cost 2 vtrnl RHS, LHS - 2655662673U, // : Cost 3 vext2 <6,3,u,0>, <6,3,u,0> - 2555497782U, // : Cost 3 vext1 <0,u,0,6>, RHS - 2651681459U, // : Cost 3 vext2 <5,6,u,0>, <6,5,0,u> - 2651681592U, // : Cost 3 vext2 <5,6,u,0>, <6,6,6,6> - 2651681614U, // : Cost 3 vext2 <5,6,u,0>, <6,7,0,1> - 1997750428U, // : Cost 2 vtrnl RHS, LHS - 2567446630U, // : Cost 3 vext1 <2,u,0,7>, LHS - 2567447446U, // : Cost 3 vext1 <2,u,0,7>, <1,2,3,0> - 2567448641U, // : Cost 3 vext1 <2,u,0,7>, <2,u,0,7> - 2573421338U, // : Cost 3 vext1 <3,u,0,7>, <3,u,0,7> - 2567449910U, // : Cost 3 vext1 <2,u,0,7>, RHS - 2651682242U, // : Cost 3 vext2 <5,6,u,0>, <7,5,6,u> - 2591339429U, // : Cost 3 vext1 <6,u,0,7>, <6,u,0,7> - 2651682412U, // : Cost 3 vext2 <5,6,u,0>, <7,7,7,7> - 2567452462U, // : Cost 3 vext1 <2,u,0,7>, LHS - 135053414U, // : Cost 1 vdup0 LHS - 1611489938U, // : Cost 2 vext3 LHS, <0,u,1,1> - 537748125U, // : Cost 1 vext3 LHS, LHS - 2685674148U, // : Cost 3 vext3 LHS, <0,u,3,1> - 1611932338U, // : Cost 2 vext3 LHS, <0,u,4,6> - 1551399066U, // : Cost 2 vext2 <1,2,u,0>, RHS - 1517605798U, // : Cost 2 vext1 <6,u,0,u>, <6,u,0,u> - 2830077481U, // : Cost 3 vuzpr <1,u,3,0>, RHS - 537748179U, // : Cost 1 vext3 LHS, LHS - 1544101961U, // : Cost 2 vext2 <0,0,u,1>, <0,0,u,1> - 1558036582U, // : Cost 2 vext2 <2,3,u,1>, LHS - 2619171051U, // : Cost 3 vext2 <0,2,u,1>, <0,2,u,1> - 1611490038U, // : Cost 2 vext3 LHS, <1,0,3,2> - 2555522358U, // : Cost 3 vext1 <0,u,1,0>, RHS - 2712068871U, // : Cost 3 vext3 RHS, <1,0,5,1> - 2591355815U, // : Cost 3 vext1 <6,u,1,0>, <6,u,1,0> - 2597328512U, // : Cost 3 vext1 <7,u,1,0>, <7,u,1,0> - 1611490083U, // : Cost 2 vext3 LHS, <1,0,u,2> - 1481785446U, // : Cost 2 vext1 <0,u,1,1>, LHS - 202162278U, // : Cost 1 vdup1 LHS - 2555528808U, // : Cost 3 vext1 <0,u,1,1>, <2,2,2,2> - 1611490120U, // : Cost 2 vext3 LHS, <1,1,3,3> - 1481788726U, // : Cost 2 vext1 <0,u,1,1>, RHS - 2689876828U, // : Cost 3 vext3 LHS, <1,1,5,5> - 2591364008U, // : Cost 3 vext1 <6,u,1,1>, <6,u,1,1> - 2592691274U, // : Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // : Cost 1 vdup1 LHS - 1499709542U, // : Cost 2 vext1 <3,u,1,2>, LHS - 2689876871U, // : Cost 3 vext3 LHS, <1,2,1,3> - 2631116445U, // : Cost 3 vext2 <2,2,u,1>, <2,2,u,1> - 835584U, // : Cost 0 copy LHS - 1499712822U, // : Cost 2 vext1 <3,u,1,2>, RHS - 2689876907U, // : Cost 3 vext3 LHS, <1,2,5,3> - 2631780282U, // : Cost 3 vext2 <2,3,u,1>, <2,6,3,7> - 1523603074U, // : Cost 2 vext1 <7,u,1,2>, <7,u,1,2> - 835584U, // : Cost 0 copy LHS - 1487773798U, // : Cost 2 vext1 <1,u,1,3>, LHS - 1611490264U, // : Cost 2 vext3 LHS, <1,3,1,3> - 2685232094U, // : Cost 3 vext3 LHS, <1,3,2,0> - 2018746470U, // : Cost 2 vtrnr LHS, LHS - 1487777078U, // : Cost 2 vext1 <1,u,1,3>, RHS - 1611490304U, // : Cost 2 vext3 LHS, <1,3,5,7> - 2685674505U, // : Cost 3 vext3 LHS, <1,3,6,7> - 2640407307U, // : Cost 3 vext2 <3,7,u,1>, <3,7,u,1> - 1611490327U, // : Cost 2 vext3 LHS, <1,3,u,3> - 1567992749U, // : Cost 2 vext2 
<4,0,u,1>, <4,0,u,1> - 2693121070U, // : Cost 3 vext3 <1,4,1,u>, <1,4,1,u> - 2693194807U, // : Cost 3 vext3 <1,4,2,u>, <1,4,2,u> - 1152386432U, // : Cost 2 vrev <1,u,3,4> - 2555555126U, // : Cost 3 vext1 <0,u,1,4>, RHS - 1558039862U, // : Cost 2 vext2 <2,3,u,1>, RHS - 2645716371U, // : Cost 3 vext2 <4,6,u,1>, <4,6,u,1> - 2597361284U, // : Cost 3 vext1 <7,u,1,4>, <7,u,1,4> - 1152755117U, // : Cost 2 vrev <1,u,u,4> - 1481818214U, // : Cost 2 vext1 <0,u,1,5>, LHS - 2555560694U, // : Cost 3 vext1 <0,u,1,5>, <1,0,3,2> - 2555561576U, // : Cost 3 vext1 <0,u,1,5>, <2,2,2,2> - 1611490448U, // : Cost 2 vext3 LHS, <1,5,3,7> - 1481821494U, // : Cost 2 vext1 <0,u,1,5>, RHS - 2651025435U, // : Cost 3 vext2 <5,5,u,1>, <5,5,u,1> - 2651689068U, // : Cost 3 vext2 <5,6,u,1>, <5,6,u,1> - 2823966006U, // : Cost 3 vuzpr <0,u,1,1>, RHS - 1611932861U, // : Cost 2 vext3 LHS, <1,5,u,7> - 2555568230U, // : Cost 3 vext1 <0,u,1,6>, LHS - 2689877199U, // : Cost 3 vext3 LHS, <1,6,1,7> - 2712069336U, // : Cost 3 vext3 RHS, <1,6,2,7> - 2685232353U, // : Cost 3 vext3 LHS, <1,6,3,7> - 2555571510U, // : Cost 3 vext1 <0,u,1,6>, RHS - 2689877235U, // : Cost 3 vext3 LHS, <1,6,5,7> - 2657661765U, // : Cost 3 vext2 <6,6,u,1>, <6,6,u,1> - 1584583574U, // : Cost 2 vext2 <6,7,u,1>, <6,7,u,1> - 1585247207U, // : Cost 2 vext2 <6,u,u,1>, <6,u,u,1> - 2561548390U, // : Cost 3 vext1 <1,u,1,7>, LHS - 2561549681U, // : Cost 3 vext1 <1,u,1,7>, <1,u,1,7> - 2573493926U, // : Cost 3 vext1 <3,u,1,7>, <2,3,0,1> - 2042962022U, // : Cost 2 vtrnr RHS, LHS - 2561551670U, // : Cost 3 vext1 <1,u,1,7>, RHS - 2226300309U, // : Cost 3 vrev <1,u,5,7> - 2658325990U, // : Cost 3 vext2 <6,7,u,1>, <7,6,1,u> - 2658326124U, // : Cost 3 vext2 <6,7,u,1>, <7,7,7,7> - 2042962027U, // : Cost 2 vtrnr RHS, LHS - 1481842790U, // : Cost 2 vext1 <0,u,1,u>, LHS - 202162278U, // : Cost 1 vdup1 LHS - 2685674867U, // : Cost 3 vext3 LHS, <1,u,2,0> - 835584U, // : Cost 0 copy LHS - 1481846070U, // : Cost 2 vext1 <0,u,1,u>, RHS - 1611933077U, // : Cost 2 vext3 LHS, <1,u,5,7> - 2685674910U, // : Cost 3 vext3 LHS, <1,u,6,7> - 1523652232U, // : Cost 2 vext1 <7,u,1,u>, <7,u,1,u> - 835584U, // : Cost 0 copy LHS - 1544110154U, // : Cost 2 vext2 <0,0,u,2>, <0,0,u,2> - 1545437286U, // : Cost 2 vext2 <0,2,u,2>, LHS - 1545437420U, // : Cost 2 vext2 <0,2,u,2>, <0,2,u,2> - 2685232589U, // : Cost 3 vext3 LHS, <2,0,3,0> - 2619179346U, // : Cost 3 vext2 <0,2,u,2>, <0,4,1,5> - 2712069606U, // : Cost 3 vext3 RHS, <2,0,5,7> - 2689877484U, // : Cost 3 vext3 LHS, <2,0,6,4> - 2659656273U, // : Cost 3 vext2 <7,0,u,2>, <0,7,2,u> - 1545437853U, // : Cost 2 vext2 <0,2,u,2>, LHS - 1550082851U, // : Cost 2 vext2 <1,0,u,2>, <1,0,u,2> - 2619179828U, // : Cost 3 vext2 <0,2,u,2>, <1,1,1,1> - 2619179926U, // : Cost 3 vext2 <0,2,u,2>, <1,2,3,0> - 2685232671U, // : Cost 3 vext3 LHS, <2,1,3,1> - 2555604278U, // : Cost 3 vext1 <0,u,2,1>, RHS - 2619180176U, // : Cost 3 vext2 <0,2,u,2>, <1,5,3,7> - 2689877564U, // : Cost 3 vext3 LHS, <2,1,6,3> - 2602718850U, // : Cost 3 vext1 , <7,u,1,2> - 1158703235U, // : Cost 2 vrev <2,u,u,1> - 1481867366U, // : Cost 2 vext1 <0,u,2,2>, LHS - 2555609846U, // : Cost 3 vext1 <0,u,2,2>, <1,0,3,2> - 269271142U, // : Cost 1 vdup2 LHS - 1611490930U, // : Cost 2 vext3 LHS, <2,2,3,3> - 1481870646U, // : Cost 2 vext1 <0,u,2,2>, RHS - 2689877640U, // : Cost 3 vext3 LHS, <2,2,5,7> - 2619180986U, // : Cost 3 vext2 <0,2,u,2>, <2,6,3,7> - 2593436837U, // : Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // : Cost 1 vdup2 LHS - 408134301U, // : Cost 1 vext1 LHS, LHS - 1481876214U, // : Cost 2 
vext1 LHS, <1,0,3,2> - 1481877096U, // : Cost 2 vext1 LHS, <2,2,2,2> - 1880326246U, // : Cost 2 vzipr LHS, LHS - 408137014U, // : Cost 1 vext1 LHS, RHS - 1529654992U, // : Cost 2 vext1 LHS, <5,1,7,3> - 1529655802U, // : Cost 2 vext1 LHS, <6,2,7,3> - 1529656314U, // : Cost 2 vext1 LHS, <7,0,1,2> - 408139566U, // : Cost 1 vext1 LHS, LHS - 1567853468U, // : Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 2561598362U, // : Cost 3 vext1 <1,u,2,4>, <1,2,3,4> - 2555627214U, // : Cost 3 vext1 <0,u,2,4>, <2,3,4,5> - 2685232918U, // : Cost 3 vext3 LHS, <2,4,3,5> - 2555628854U, // : Cost 3 vext1 <0,u,2,4>, RHS - 1545440566U, // : Cost 2 vext2 <0,2,u,2>, RHS - 1571982740U, // : Cost 2 vext2 <4,6,u,2>, <4,6,u,2> - 2592125957U, // : Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1545440809U, // : Cost 2 vext2 <0,2,u,2>, RHS - 2555633766U, // : Cost 3 vext1 <0,u,2,5>, LHS - 2561606550U, // : Cost 3 vext1 <1,u,2,5>, <1,2,3,0> - 2689877856U, // : Cost 3 vext3 LHS, <2,5,2,7> - 2685233000U, // : Cost 3 vext3 LHS, <2,5,3,6> - 1158441059U, // : Cost 2 vrev <2,u,4,5> - 2645725188U, // : Cost 3 vext2 <4,6,u,2>, <5,5,5,5> - 2689877892U, // : Cost 3 vext3 LHS, <2,5,6,7> - 2823900470U, // : Cost 3 vuzpr <0,u,0,2>, RHS - 1158736007U, // : Cost 2 vrev <2,u,u,5> - 1481900134U, // : Cost 2 vext1 <0,u,2,6>, LHS - 2555642614U, // : Cost 3 vext1 <0,u,2,6>, <1,0,3,2> - 2555643496U, // : Cost 3 vext1 <0,u,2,6>, <2,2,2,2> - 1611491258U, // : Cost 2 vext3 LHS, <2,6,3,7> - 1481903414U, // : Cost 2 vext1 <0,u,2,6>, RHS - 2689877964U, // : Cost 3 vext3 LHS, <2,6,5,7> - 2689877973U, // : Cost 3 vext3 LHS, <2,6,6,7> - 2645726030U, // : Cost 3 vext2 <4,6,u,2>, <6,7,0,1> - 1611933671U, // : Cost 2 vext3 LHS, <2,6,u,7> - 1585919033U, // : Cost 2 vext2 <7,0,u,2>, <7,0,u,2> - 2573566710U, // : Cost 3 vext1 <3,u,2,7>, <1,0,3,2> - 2567596115U, // : Cost 3 vext1 <2,u,2,7>, <2,u,2,7> - 1906901094U, // : Cost 2 vzipr RHS, LHS - 2555653430U, // : Cost 3 vext1 <0,u,2,7>, RHS - 2800080230U, // : Cost 3 vuzpl LHS, <7,4,5,6> - 2980643164U, // : Cost 3 vzipr RHS, <0,4,2,6> - 2645726828U, // : Cost 3 vext2 <4,6,u,2>, <7,7,7,7> - 1906901099U, // : Cost 2 vzipr RHS, LHS - 408175266U, // : Cost 1 vext1 LHS, LHS - 1545443118U, // : Cost 2 vext2 <0,2,u,2>, LHS - 269271142U, // : Cost 1 vdup2 LHS - 1611491416U, // : Cost 2 vext3 LHS, <2,u,3,3> - 408177974U, // : Cost 1 vext1 LHS, RHS - 1545443482U, // : Cost 2 vext2 <0,2,u,2>, RHS - 1726339226U, // : Cost 2 vuzpl LHS, RHS - 1529697274U, // : Cost 2 vext1 LHS, <7,0,1,2> - 408180526U, // : Cost 1 vext1 LHS, LHS - 1544781824U, // : Cost 2 vext2 LHS, <0,0,0,0> - 471040156U, // : Cost 1 vext2 LHS, LHS - 1544781988U, // : Cost 2 vext2 LHS, <0,2,0,2> - 2618523900U, // : Cost 3 vext2 LHS, <0,3,1,0> - 1544782162U, // : Cost 2 vext2 LHS, <0,4,1,5> - 2238188352U, // : Cost 3 vrev <3,u,5,0> - 2623169023U, // : Cost 3 vext2 LHS, <0,6,2,7> - 2238335826U, // : Cost 3 vrev <3,u,7,0> - 471040669U, // : Cost 1 vext2 LHS, LHS - 1544782582U, // : Cost 2 vext2 LHS, <1,0,3,2> - 1544782644U, // : Cost 2 vext2 LHS, <1,1,1,1> - 1544782742U, // : Cost 2 vext2 LHS, <1,2,3,0> - 1544782808U, // : Cost 2 vext2 LHS, <1,3,1,3> - 2618524733U, // : Cost 3 vext2 LHS, <1,4,3,5> - 1544782992U, // : Cost 2 vext2 LHS, <1,5,3,7> - 2618524897U, // : Cost 3 vext2 LHS, <1,6,3,7> - 2703517987U, // : Cost 3 vext3 <3,1,7,u>, <3,1,7,u> - 1544783213U, // : Cost 2 vext2 LHS, <1,u,1,3> - 1529716838U, // : Cost 2 vext1 , LHS - 1164167966U, // : Cost 2 vrev <3,u,1,2> - 1544783464U, // : Cost 2 vext2 LHS, <2,2,2,2> - 1544783526U, // : Cost 2 vext2 LHS, <2,3,0,1> - 1529720118U, 
// : Cost 2 vext1 , RHS - 2618525544U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544783802U, // : Cost 2 vext2 LHS, <2,6,3,7> - 2704181620U, // : Cost 3 vext3 <3,2,7,u>, <3,2,7,u> - 1544783931U, // : Cost 2 vext2 LHS, <2,u,0,1> - 1544784022U, // : Cost 2 vext2 LHS, <3,0,1,2> - 1487922559U, // : Cost 2 vext1 <1,u,3,3>, <1,u,3,3> - 1493895256U, // : Cost 2 vext1 <2,u,3,3>, <2,u,3,3> - 336380006U, // : Cost 1 vdup3 LHS - 1544784386U, // : Cost 2 vext2 LHS, <3,4,5,6> - 2824054478U, // : Cost 3 vuzpr LHS, <2,3,4,5> - 2238286668U, // : Cost 3 vrev <3,u,6,3> - 2954069136U, // : Cost 3 vzipr LHS, <1,5,3,7> - 336380006U, // : Cost 1 vdup3 LHS - 1487929446U, // : Cost 2 vext1 <1,u,3,4>, LHS - 1487930752U, // : Cost 2 vext1 <1,u,3,4>, <1,u,3,4> - 2623171644U, // : Cost 3 vext2 LHS, <4,2,6,0> - 2561673366U, // : Cost 3 vext1 <1,u,3,4>, <3,0,1,2> - 1487932726U, // : Cost 2 vext1 <1,u,3,4>, RHS - 471043382U, // : Cost 1 vext2 LHS, RHS - 1592561012U, // : Cost 2 vext2 LHS, <4,6,4,6> - 2238368598U, // : Cost 3 vrev <3,u,7,4> - 471043625U, // : Cost 1 vext2 LHS, RHS - 2555707494U, // : Cost 3 vext1 <0,u,3,5>, LHS - 1574645465U, // : Cost 2 vext2 <5,1,u,3>, <5,1,u,3> - 2567653106U, // : Cost 3 vext1 <2,u,3,5>, <2,3,u,5> - 2555709954U, // : Cost 3 vext1 <0,u,3,5>, <3,4,5,6> - 1592561606U, // : Cost 2 vext2 LHS, <5,4,7,6> - 1592561668U, // : Cost 2 vext2 LHS, <5,5,5,5> - 1592561762U, // : Cost 2 vext2 LHS, <5,6,7,0> - 1750314294U, // : Cost 2 vuzpr LHS, RHS - 1750314295U, // : Cost 2 vuzpr LHS, RHS - 2623172897U, // : Cost 3 vext2 LHS, <6,0,1,2> - 2561688962U, // : Cost 3 vext1 <1,u,3,6>, <1,u,3,6> - 1581281795U, // : Cost 2 vext2 <6,2,u,3>, <6,2,u,3> - 2706541204U, // : Cost 3 vext3 <3,6,3,u>, <3,6,3,u> - 2623173261U, // : Cost 3 vext2 LHS, <6,4,5,6> - 1164495686U, // : Cost 2 vrev <3,u,5,6> - 1592562488U, // : Cost 2 vext2 LHS, <6,6,6,6> - 1592562510U, // : Cost 2 vext2 LHS, <6,7,0,1> - 1164716897U, // : Cost 2 vrev <3,u,u,6> - 1487954022U, // : Cost 2 vext1 <1,u,3,7>, LHS - 1487955331U, // : Cost 2 vext1 <1,u,3,7>, <1,u,3,7> - 1493928028U, // : Cost 2 vext1 <2,u,3,7>, <2,u,3,7> - 2561697942U, // : Cost 3 vext1 <1,u,3,7>, <3,0,1,2> - 1487957302U, // : Cost 2 vext1 <1,u,3,7>, RHS - 2707352311U, // : Cost 3 vext3 <3,7,5,u>, <3,7,5,u> - 2655024623U, // : Cost 3 vext2 <6,2,u,3>, <7,6,2,u> - 1592563308U, // : Cost 2 vext2 LHS, <7,7,7,7> - 1487959854U, // : Cost 2 vext1 <1,u,3,7>, LHS - 1544787667U, // : Cost 2 vext2 LHS, - 471045934U, // : Cost 1 vext2 LHS, LHS - 1549432709U, // : Cost 2 vext2 LHS, - 336380006U, // : Cost 1 vdup3 LHS - 1544788031U, // : Cost 2 vext2 LHS, - 471046298U, // : Cost 1 vext2 LHS, RHS - 1549433040U, // : Cost 2 vext2 LHS, - 1750314537U, // : Cost 2 vuzpr LHS, RHS - 471046501U, // : Cost 1 vext2 LHS, LHS - 2625167360U, // : Cost 3 vext2 <1,2,u,4>, <0,0,0,0> - 1551425638U, // : Cost 2 vext2 <1,2,u,4>, LHS - 2619195630U, // : Cost 3 vext2 <0,2,u,4>, <0,2,u,4> - 2619343104U, // : Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2625167698U, // : Cost 3 vext2 <1,2,u,4>, <0,4,1,5> - 1638329234U, // : Cost 2 vext3 RHS, <4,0,5,1> - 1638329244U, // : Cost 2 vext3 RHS, <4,0,6,2> - 3787803556U, // : Cost 4 vext3 RHS, <4,0,7,1> - 1551426205U, // : Cost 2 vext2 <1,2,u,4>, LHS - 2555748454U, // : Cost 3 vext1 <0,u,4,1>, LHS - 2625168180U, // : Cost 3 vext2 <1,2,u,4>, <1,1,1,1> - 1551426503U, // : Cost 2 vext2 <1,2,u,4>, <1,2,u,4> - 2625168344U, // : Cost 3 vext2 <1,2,u,4>, <1,3,1,3> - 2555751734U, // : Cost 3 vext1 <0,u,4,1>, RHS - 1860554038U, // : Cost 2 vzipl LHS, RHS - 2689879022U, // : Cost 3 vext3 LHS, <4,1,6,3> 
- 2592248852U, // : Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1555408301U, // : Cost 2 vext2 <1,u,u,4>, <1,u,u,4> - 2555756646U, // : Cost 3 vext1 <0,u,4,2>, LHS - 2625168943U, // : Cost 3 vext2 <1,2,u,4>, <2,1,4,u> - 2625169000U, // : Cost 3 vext2 <1,2,u,4>, <2,2,2,2> - 2619197134U, // : Cost 3 vext2 <0,2,u,4>, <2,3,4,5> - 2555759926U, // : Cost 3 vext1 <0,u,4,2>, RHS - 2712071222U, // : Cost 3 vext3 RHS, <4,2,5,3> - 1994771766U, // : Cost 2 vtrnl LHS, RHS - 2592257045U, // : Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1994771784U, // : Cost 2 vtrnl LHS, RHS - 2625169558U, // : Cost 3 vext2 <1,2,u,4>, <3,0,1,2> - 2567709594U, // : Cost 3 vext1 <2,u,4,3>, <1,2,3,4> - 2567710817U, // : Cost 3 vext1 <2,u,4,3>, <2,u,4,3> - 2625169820U, // : Cost 3 vext2 <1,2,u,4>, <3,3,3,3> - 2625169922U, // : Cost 3 vext2 <1,2,u,4>, <3,4,5,6> - 2954069710U, // : Cost 3 vzipr LHS, <2,3,4,5> - 2954068172U, // : Cost 3 vzipr LHS, <0,2,4,6> - 3903849472U, // : Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> - 2954068174U, // : Cost 3 vzipr LHS, <0,2,4,u> - 1505919078U, // : Cost 2 vext1 <4,u,4,4>, LHS - 2567717831U, // : Cost 3 vext1 <2,u,4,4>, <1,2,u,4> - 2567719010U, // : Cost 3 vext1 <2,u,4,4>, <2,u,4,4> - 2570373542U, // : Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 161926454U, // : Cost 1 vdup0 RHS - 1551428918U, // : Cost 2 vext2 <1,2,u,4>, RHS - 1638329572U, // : Cost 2 vext3 RHS, <4,4,6,6> - 2594927963U, // : Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // : Cost 1 vdup0 RHS - 1493983334U, // : Cost 2 vext1 <2,u,4,5>, LHS - 2689879301U, // : Cost 3 vext3 LHS, <4,5,1,3> - 1493985379U, // : Cost 2 vext1 <2,u,4,5>, <2,u,4,5> - 2567727254U, // : Cost 3 vext1 <2,u,4,5>, <3,0,1,2> - 1493986614U, // : Cost 2 vext1 <2,u,4,5>, RHS - 1863535926U, // : Cost 2 vzipl RHS, RHS - 537750838U, // : Cost 1 vext3 LHS, RHS - 2830110006U, // : Cost 3 vuzpr <1,u,3,4>, RHS - 537750856U, // : Cost 1 vext3 LHS, RHS - 1482047590U, // : Cost 2 vext1 <0,u,4,6>, LHS - 2555790070U, // : Cost 3 vext1 <0,u,4,6>, <1,0,3,2> - 2555790952U, // : Cost 3 vext1 <0,u,4,6>, <2,2,2,2> - 2555791510U, // : Cost 3 vext1 <0,u,4,6>, <3,0,1,2> - 1482050870U, // : Cost 2 vext1 <0,u,4,6>, RHS - 2689879422U, // : Cost 3 vext3 LHS, <4,6,5,7> - 1997753654U, // : Cost 2 vtrnl RHS, RHS - 2712071562U, // : Cost 3 vext3 RHS, <4,6,7,1> - 1482053422U, // : Cost 2 vext1 <0,u,4,6>, LHS - 2567741542U, // : Cost 3 vext1 <2,u,4,7>, LHS - 2567742362U, // : Cost 3 vext1 <2,u,4,7>, <1,2,3,4> - 2567743589U, // : Cost 3 vext1 <2,u,4,7>, <2,u,4,7> - 2573716286U, // : Cost 3 vext1 <3,u,4,7>, <3,u,4,7> - 2567744822U, // : Cost 3 vext1 <2,u,4,7>, RHS - 2712071624U, // : Cost 3 vext3 RHS, <4,7,5,0> - 96808489U, // : Cost 1 vrev RHS - 2651715180U, // : Cost 3 vext2 <5,6,u,4>, <7,7,7,7> - 96955963U, // : Cost 1 vrev RHS - 1482063974U, // : Cost 2 vext1 <0,u,4,u>, LHS - 1551431470U, // : Cost 2 vext2 <1,2,u,4>, LHS - 1494009958U, // : Cost 2 vext1 <2,u,4,u>, <2,u,4,u> - 2555807894U, // : Cost 3 vext1 <0,u,4,u>, <3,0,1,2> - 161926454U, // : Cost 1 vdup0 RHS - 1551431834U, // : Cost 2 vext2 <1,2,u,4>, RHS - 537751081U, // : Cost 1 vext3 LHS, RHS - 2830110249U, // : Cost 3 vuzpr <1,u,3,4>, RHS - 537751099U, // : Cost 1 vext3 LHS, RHS - 2631811072U, // : Cost 3 vext2 <2,3,u,5>, <0,0,0,0> - 1558069350U, // : Cost 2 vext2 <2,3,u,5>, LHS - 2619203823U, // : Cost 3 vext2 <0,2,u,5>, <0,2,u,5> - 2619867456U, // : Cost 3 vext2 <0,3,u,5>, <0,3,u,5> - 1546273106U, // : Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2733010539U, // : Cost 3 vext3 LHS, <5,0,5,1> - 2597622682U, // : Cost 3 vext1 <7,u,5,0>, <6,7,u,5> - 1176539396U, // : 
Cost 2 vrev <5,u,7,0> - 1558069917U, // : Cost 2 vext2 <2,3,u,5>, LHS - 1505968230U, // : Cost 2 vext1 <4,u,5,1>, LHS - 2624512887U, // : Cost 3 vext2 <1,1,u,5>, <1,1,u,5> - 2631811990U, // : Cost 3 vext2 <2,3,u,5>, <1,2,3,0> - 2618541056U, // : Cost 3 vext2 <0,1,u,5>, <1,3,5,7> - 1505971510U, // : Cost 2 vext1 <4,u,5,1>, RHS - 2627167419U, // : Cost 3 vext2 <1,5,u,5>, <1,5,u,5> - 2579714554U, // : Cost 3 vext1 <4,u,5,1>, <6,2,7,3> - 1638330064U, // : Cost 2 vext3 RHS, <5,1,7,3> - 1638477529U, // : Cost 2 vext3 RHS, <5,1,u,3> - 2561802342U, // : Cost 3 vext1 <1,u,5,2>, LHS - 2561803264U, // : Cost 3 vext1 <1,u,5,2>, <1,3,5,7> - 2631149217U, // : Cost 3 vext2 <2,2,u,5>, <2,2,u,5> - 1558071026U, // : Cost 2 vext2 <2,3,u,5>, <2,3,u,5> - 2561805622U, // : Cost 3 vext1 <1,u,5,2>, RHS - 2714062607U, // : Cost 3 vext3 RHS, <5,2,5,3> - 2631813050U, // : Cost 3 vext2 <2,3,u,5>, <2,6,3,7> - 3092335926U, // : Cost 3 vtrnr <0,u,0,2>, RHS - 1561389191U, // : Cost 2 vext2 <2,u,u,5>, <2,u,u,5> - 2561810534U, // : Cost 3 vext1 <1,u,5,3>, LHS - 2561811857U, // : Cost 3 vext1 <1,u,5,3>, <1,u,5,3> - 2631813474U, // : Cost 3 vext2 <2,3,u,5>, <3,2,5,u> - 2631813532U, // : Cost 3 vext2 <2,3,u,5>, <3,3,3,3> - 2619869698U, // : Cost 3 vext2 <0,3,u,5>, <3,4,5,6> - 3001847002U, // : Cost 3 vzipr LHS, <4,4,5,5> - 2954070530U, // : Cost 3 vzipr LHS, <3,4,5,6> - 2018749750U, // : Cost 2 vtrnr LHS, RHS - 2018749751U, // : Cost 2 vtrnr LHS, RHS - 2573762662U, // : Cost 3 vext1 <3,u,5,4>, LHS - 2620017634U, // : Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 2573764338U, // : Cost 3 vext1 <3,u,5,4>, <2,3,u,5> - 2573765444U, // : Cost 3 vext1 <3,u,5,4>, <3,u,5,4> - 1570680053U, // : Cost 2 vext2 <4,4,u,5>, <4,4,u,5> - 1558072630U, // : Cost 2 vext2 <2,3,u,5>, RHS - 2645749143U, // : Cost 3 vext2 <4,6,u,5>, <4,6,u,5> - 1638330310U, // : Cost 2 vext3 RHS, <5,4,7,6> - 1558072873U, // : Cost 2 vext2 <2,3,u,5>, RHS - 1506000998U, // : Cost 2 vext1 <4,u,5,5>, LHS - 2561827984U, // : Cost 3 vext1 <1,u,5,5>, <1,5,3,7> - 2579744360U, // : Cost 3 vext1 <4,u,5,5>, <2,2,2,2> - 2579744918U, // : Cost 3 vext1 <4,u,5,5>, <3,0,1,2> - 1506004278U, // : Cost 2 vext1 <4,u,5,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 2712072206U, // : Cost 3 vext3 RHS, <5,5,6,6> - 1638330392U, // : Cost 2 vext3 RHS, <5,5,7,7> - 229035318U, // : Cost 1 vdup1 RHS - 1500037222U, // : Cost 2 vext1 <3,u,5,6>, LHS - 2561836436U, // : Cost 3 vext1 <1,u,5,6>, <1,u,5,6> - 2567809133U, // : Cost 3 vext1 <2,u,5,6>, <2,u,5,6> - 1500040006U, // : Cost 2 vext1 <3,u,5,6>, <3,u,5,6> - 1500040502U, // : Cost 2 vext1 <3,u,5,6>, RHS - 2714062935U, // : Cost 3 vext3 RHS, <5,6,5,7> - 2712072288U, // : Cost 3 vext3 RHS, <5,6,6,7> - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 1488101478U, // : Cost 2 vext1 <1,u,5,7>, LHS - 1488102805U, // : Cost 2 vext1 <1,u,5,7>, <1,u,5,7> - 2561844840U, // : Cost 3 vext1 <1,u,5,7>, <2,2,2,2> - 2561845398U, // : Cost 3 vext1 <1,u,5,7>, <3,0,1,2> - 1488104758U, // : Cost 2 vext1 <1,u,5,7>, RHS - 1638330536U, // : Cost 2 vext3 RHS, <5,7,5,7> - 2712072362U, // : Cost 3 vext3 RHS, <5,7,6,0> - 2042965302U, // : Cost 2 vtrnr RHS, RHS - 1488107310U, // : Cost 2 vext1 <1,u,5,7>, LHS - 1488109670U, // : Cost 2 vext1 <1,u,5,u>, LHS - 1488110998U, // : Cost 2 vext1 <1,u,5,u>, <1,u,5,u> - 2561853032U, // : Cost 3 vext1 <1,u,5,u>, <2,2,2,2> - 1500056392U, // : Cost 2 vext1 <3,u,5,u>, <3,u,5,u> - 1488112950U, // : Cost 2 vext1 <1,u,5,u>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 2954111490U, // : Cost 3 vzipr LHS, <3,4,5,6> - 27705344U, // : 
Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 2619211776U, // : Cost 3 vext2 <0,2,u,6>, <0,0,0,0> - 1545470054U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1545470192U, // : Cost 2 vext2 <0,2,u,6>, <0,2,u,6> - 2255958969U, // : Cost 3 vrev <6,u,3,0> - 1546797458U, // : Cost 2 vext2 <0,4,u,6>, <0,4,u,6> - 2720624971U, // : Cost 3 vext3 <6,0,5,u>, <6,0,5,u> - 2256180180U, // : Cost 3 vrev <6,u,6,0> - 2960682294U, // : Cost 3 vzipr <1,2,u,0>, RHS - 1545470621U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1182004127U, // : Cost 2 vrev <6,u,0,1> - 2619212596U, // : Cost 3 vext2 <0,2,u,6>, <1,1,1,1> - 2619212694U, // : Cost 3 vext2 <0,2,u,6>, <1,2,3,0> - 2619212760U, // : Cost 3 vext2 <0,2,u,6>, <1,3,1,3> - 2626511979U, // : Cost 3 vext2 <1,4,u,6>, <1,4,u,6> - 2619212944U, // : Cost 3 vext2 <0,2,u,6>, <1,5,3,7> - 2714063264U, // : Cost 3 vext3 RHS, <6,1,6,3> - 2967326006U, // : Cost 3 vzipr <2,3,u,1>, RHS - 1182594023U, // : Cost 2 vrev <6,u,u,1> - 1506050150U, // : Cost 2 vext1 <4,u,6,2>, LHS - 2579792630U, // : Cost 3 vext1 <4,u,6,2>, <1,0,3,2> - 2619213416U, // : Cost 3 vext2 <0,2,u,6>, <2,2,2,2> - 2619213478U, // : Cost 3 vext2 <0,2,u,6>, <2,3,0,1> - 1506053430U, // : Cost 2 vext1 <4,u,6,2>, RHS - 2633148309U, // : Cost 3 vext2 <2,5,u,6>, <2,5,u,6> - 2619213754U, // : Cost 3 vext2 <0,2,u,6>, <2,6,3,7> - 1638330874U, // : Cost 2 vext3 RHS, <6,2,7,3> - 1638478339U, // : Cost 2 vext3 RHS, <6,2,u,3> - 2619213974U, // : Cost 3 vext2 <0,2,u,6>, <3,0,1,2> - 2255836074U, // : Cost 3 vrev <6,u,1,3> - 2255909811U, // : Cost 3 vrev <6,u,2,3> - 2619214236U, // : Cost 3 vext2 <0,2,u,6>, <3,3,3,3> - 1564715549U, // : Cost 2 vext2 <3,4,u,6>, <3,4,u,6> - 2639121006U, // : Cost 3 vext2 <3,5,u,6>, <3,5,u,6> - 3001847012U, // : Cost 3 vzipr LHS, <4,4,6,6> - 1880329526U, // : Cost 2 vzipr LHS, RHS - 1880329527U, // : Cost 2 vzipr LHS, RHS - 2567864422U, // : Cost 3 vext1 <2,u,6,4>, LHS - 2733011558U, // : Cost 3 vext3 LHS, <6,4,1,3> - 2567866484U, // : Cost 3 vext1 <2,u,6,4>, <2,u,6,4> - 2638458005U, // : Cost 3 vext2 <3,4,u,6>, <4,3,6,u> - 1570540772U, // : Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1545473334U, // : Cost 2 vext2 <0,2,u,6>, RHS - 1572015512U, // : Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // : Cost 2 vrev <6,u,u,5> - 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // : Cost 1 vdup2 RHS - 1638331202U, // : Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // : Cost 1 vdup2 RHS - 432349286U, // : Cost 1 vext1 RHS, LHS - 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // : Cost 1 vext1 RHS, RHS - 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // : Cost 2 vzipr RHS, RHS - 
432355118U, // : Cost 1 vext1 RHS, LHS - 432357478U, // : Cost 1 vext1 RHS, LHS - 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // : Cost 1 vext1 RHS, RHS - 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // : Cost 1 vdup2 RHS - 1880370486U, // : Cost 2 vzipr LHS, RHS - 432363310U, // : Cost 1 vext1 RHS, LHS - 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // : Cost 1 vext2 RHS, LHS - 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // : Cost 1 vext2 RHS, LHS - 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // : Cost 2 vrev <7,u,1,2> - 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // : Cost 1 vext2 RHS, RHS - 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // : Cost 1 vext2 RHS, RHS - 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // : Cost 2 vext1 , LHS - 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // : Cost 2 vext1 , RHS - 1188386474U, // : Cost 2 vrev <7,u,5,6> - 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // : Cost 
2 vext2 RHS, <7,0,1,2> - 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // : Cost 1 vdup3 RHS - 363253046U, // : Cost 1 vdup3 RHS - 1571362515U, // : Cost 2 vext2 RHS, - 497620782U, // : Cost 1 vext2 RHS, LHS - 1571362693U, // : Cost 2 vext2 RHS, - 1571362748U, // : Cost 2 vext2 RHS, - 1571362879U, // : Cost 2 vext2 RHS, - 497621146U, // : Cost 1 vext2 RHS, RHS - 1571363024U, // : Cost 2 vext2 RHS, - 363253046U, // : Cost 1 vdup3 RHS - 497621349U, // : Cost 1 vext2 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 471081121U, // : Cost 1 vext2 LHS, LHS - 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // : Cost 2 vext3 LHS, - 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // : Cost 2 vext3 RHS, - 1194457487U, // : Cost 2 vrev - 471081629U, // : Cost 1 vext2 LHS, LHS - 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // : Cost 1 vdup1 LHS - 537753390U, // : Cost 1 vext3 LHS, LHS - 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // : Cost 2 vext3 RHS, - 537753444U, // : Cost 1 vext3 LHS, LHS - 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // : Cost 2 vrev - 269271142U, // : Cost 1 vdup2 LHS - 835584U, // : Cost 0 copy LHS - 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // : Cost 2 vext3 RHS, - 835584U, // : Cost 0 copy LHS - 408576723U, // : Cost 1 vext1 LHS, LHS - 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // : Cost 1 vrev LHS - 336380006U, // : Cost 1 vdup3 LHS - 408579382U, // : Cost 1 vext1 LHS, RHS - 1616140271U, // : Cost 2 vext3 LHS, - 1530098170U, // : Cost 2 vext1 LHS, <6,2,7,3> - 1880329544U, // : Cost 2 vzipr LHS, RHS - 408581934U, // : Cost 1 vext1 LHS, LHS - 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // : Cost 2 vext3 LHS, - 1194195311U, // : Cost 2 vrev - 161926454U, // : Cost 1 vdup0 RHS - 471084342U, // : Cost 1 vext2 LHS, RHS - 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // : Cost 2 vext3 RHS, - 471084585U, // : Cost 1 vext2 LHS, RHS - 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // : Cost 2 vext3 LHS, - 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 537753754U, // : Cost 1 vext3 LHS, RHS - 1750355254U, // : Cost 2 vuzpr LHS, RHS - 537753772U, // : Cost 1 vext3 LHS, RHS - 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // : Cost 2 vext3 LHS, - 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // : Cost 2 vrev - 296144182U, // : Cost 1 vdup2 RHS - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 432496742U, // : Cost 1 vext1 RHS, LHS - 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // : Cost 2 vext1 <2,u,u,7>, 
<2,u,u,7> - 1906901148U, // : Cost 2 vzipr RHS, LHS - 432500283U, // : Cost 1 vext1 RHS, RHS - 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // : Cost 1 vrev RHS - 363253046U, // : Cost 1 vdup3 RHS - 432502574U, // : Cost 1 vext1 RHS, LHS - 408617688U, // : Cost 1 vext1 LHS, LHS - 471086894U, // : Cost 1 vext2 LHS, LHS - 537753957U, // : Cost 1 vext3 LHS, LHS - 835584U, // : Cost 0 copy LHS - 408620342U, // : Cost 1 vext1 LHS, RHS - 471087258U, // : Cost 1 vext2 LHS, RHS - 537753997U, // : Cost 1 vext3 LHS, RHS - 27705344U, // : Cost 0 copy RHS - 835584U, // : Cost 0 copy LHS - 0 -}; +static const unsigned PerfectShuffleTable[6561 + 1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 2080972802U, // <0,0,0,1>: Cost 2 ins <0,0,u,1>, lane 2 + 1679065190U, // <0,0,0,2>: Cost 2 vuzpl <0,2,0,2>, LHS + 2085707777U, // <0,0,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2080440323U, // <0,0,0,5>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <0,0,0,6>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <0,0,0,7>: Cost 2 ins <0,0,0,u>, lane 3 + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 1812774912U, // <0,0,1,0>: Cost 2 vzipl LHS, <0,0,0,0> + 739033190U, // <0,0,1,1>: Cost 1 vzipl LHS, LHS + 1812775076U, // <0,0,1,2>: Cost 2 vzipl LHS, <0,2,0,2> + 2080514051U, // <0,0,1,3>: Cost 2 ins <0,0,1,u>, lane 3 + 1812816210U, // <0,0,1,4>: Cost 2 vzipl LHS, <0,4,1,5> + 2085797889U, // <0,0,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2080514051U, // <0,0,1,6>: Cost 2 ins <0,0,1,u>, lane 3 + 2080514051U, // <0,0,1,7>: Cost 2 ins <0,0,1,u>, lane 3 + 739033757U, // <0,0,1,u>: Cost 1 vzipl LHS, LHS + 1946992640U, // <0,0,2,0>: Cost 2 vtrnl LHS, <0,0,0,0> + 1946992650U, // <0,0,2,1>: Cost 2 vtrnl LHS, <0,0,1,1> + 873250918U, // <0,0,2,2>: Cost 1 vtrnl LHS, LHS + 1012113409U, // <0,0,2,3>: Cost 1 ins LHS, lane 1 + 1946992844U, // <0,0,2,4>: Cost 2 vtrnl LHS, <0,2,4,6> + 2080587779U, // <0,0,2,5>: Cost 2 ins <0,0,2,u>, lane 3 + 2085879809U, // <0,0,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2080587779U, // <0,0,2,7>: Cost 2 ins <0,0,2,u>, lane 3 + 873250972U, // <0,0,2,u>: Cost 1 vtrnl LHS, LHS + 2080964610U, // <0,0,3,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,3,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128388096U, // <0,0,3,2>: Cost 2 ins , lane 0 + 2013437973U, // <0,0,3,3>: Cost 2 vtrnr <0,0,2,3>, <0,0,2,3> + 3154739202U, // <0,0,3,4>: Cost 3 ins <0,0,u,4>, lane 2 + 2752809474U, // <0,0,3,5>: Cost 3 vuzpl <0,2,0,2>, <3,4,5,6> + 3154755586U, // <0,0,3,6>: Cost 3 ins <0,0,u,6>, lane 2 + 2818573312U, // <0,0,3,7>: Cost 3 vuzpr <0,0,0,0>, <1,3,5,7> + 2080972802U, // <0,0,3,u>: Cost 2 ins <0,0,u,1>, lane 2 + 2080964610U, // <0,0,4,0>: Cost 2 ins <0,0,u,0>, lane 2 + 1814708326U, // <0,0,4,1>: Cost 2 vzipl <0,4,1,5>, LHS + 1947828326U, // <0,0,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS + 2086002689U, // <0,0,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 1947828428U, // <0,0,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 2081030149U, // <0,0,4,5>: Cost 2 ins <0,0,u,u>, lane 5 + 1679068470U, // <0,0,4,6>: Cost 2 vuzpl <0,2,0,2>, RHS + 3154477059U, // <0,0,4,7>: Cost 3 ins <0,0,4,u>, lane 3 + 1679068488U, // <0,0,4,u>: Cost 2 vuzpl <0,2,0,2>, RHS + 2080964610U, // <0,0,5,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2128527360U, // <0,0,5,1>: Cost 2 ins , lane 0 + 2080980994U, // <0,0,5,2>: Cost 2 ins <0,0,u,2>, lane 2 + 2086076417U, // <0,0,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3202293760U, // <0,0,5,4>: Cost 3 ins , lane 0 + 1947213953U, // <0,0,5,5>: Cost 2 vtrnl <0,1,5,3>, <0,1,5,3> + 
2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 1744833846U, // <0,0,5,7>: Cost 2 vuzpr <0,0,0,0>, RHS + 2128527360U, // <0,0,5,u>: Cost 2 ins , lane 0 + 2080964610U, // <0,0,6,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,6,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128609280U, // <0,0,6,2>: Cost 2 ins , lane 0 + 2086150145U, // <0,0,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3202367488U, // <0,0,6,4>: Cost 3 ins , lane 0 + 2617250536U, // <0,0,6,5>: Cost 3 vext2 <0,0,0,0>, <6,5,6,7> + 1947287690U, // <0,0,6,6>: Cost 2 vtrnl <0,1,6,3>, <0,1,6,3> + 2081030149U, // <0,0,6,7>: Cost 2 ins <0,0,u,u>, lane 5 + 2080972802U, // <0,0,6,u>: Cost 2 ins <0,0,u,1>, lane 2 + 2080964610U, // <0,0,7,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,7,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2080980994U, // <0,0,7,2>: Cost 2 ins <0,0,u,2>, lane 2 + 2086223873U, // <0,0,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3154739202U, // <0,0,7,4>: Cost 3 ins <0,0,u,4>, lane 2 + 2617251265U, // <0,0,7,5>: Cost 3 vext2 <0,0,0,0>, <7,5,6,7> + 3154755586U, // <0,0,7,6>: Cost 3 ins <0,0,u,6>, lane 2 + 1947361427U, // <0,0,7,7>: Cost 2 vtrnl <0,1,7,3>, <0,1,7,3> + 2080972802U, // <0,0,7,u>: Cost 2 ins <0,0,u,1>, lane 2 + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 743678054U, // <0,0,u,1>: Cost 1 vzipl LHS, LHS + 873693286U, // <0,0,u,2>: Cost 1 vtrnl LHS, LHS + 1012113409U, // <0,0,u,3>: Cost 1 ins LHS, lane 1 + 1947435212U, // <0,0,u,4>: Cost 2 vtrnl LHS, <0,2,4,6> + 2085797889U, // <0,0,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1679071386U, // <0,0,u,6>: Cost 2 vuzpl <0,2,0,2>, RHS + 2080514051U, // <0,0,u,7>: Cost 2 ins <0,0,1,u>, lane 3 + 873693340U, // <0,0,u,u>: Cost 1 vtrnl LHS, LHS + 2085683201U, // <0,1,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 1007951877U, // <0,1,0,1>: Cost 1 ins LHS, lane 5 + 1680490598U, // <0,1,0,2>: Cost 2 vuzpl <0,4,1,5>, LHS + 1007910914U, // <0,1,0,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,0,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // <0,1,0,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // <0,1,0,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <0,1,0,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007951877U, // <0,1,0,u>: Cost 1 ins LHS, lane 5 + 1812775670U, // <0,1,1,0>: Cost 2 vzipl LHS, <1,0,3,2> + 1812775732U, // <0,1,1,1>: Cost 2 vzipl LHS, <1,1,1,1> + 1812775830U, // <0,1,1,2>: Cost 2 vzipl LHS, <1,2,3,0> + 1007910914U, // <0,1,1,3>: Cost 1 ins LHS, lane 2 + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 1812817040U, // <0,1,1,5>: Cost 2 vzipl LHS, <1,5,3,7> + 2081677314U, // <0,1,1,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <0,1,1,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <0,1,1,u>: Cost 1 ins LHS, lane 2 + 1007509507U, // <0,1,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,1>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,2>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1007509507U, // <0,1,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,6>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2133680132U, // <0,1,3,0>: Cost 2 ins , lane 4 + 2081636354U, // <0,1,3,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2133696516U, // <0,1,3,2>: Cost 2 ins , lane 4 + 1007910914U, // <0,1,3,3>: Cost 1 ins LHS, lane 2 + 2133712900U, // <0,1,3,4>: Cost 2 ins , lane 4 + 2081669122U, // <0,1,3,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // <0,1,3,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2133737476U, // <0,1,3,7>: Cost 2 
ins , lane 4 + 1007910914U, // <0,1,3,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,4,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2081636354U, // <0,1,4,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // <0,1,4,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,4,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,4,4>: Cost 2 ins <0,1,u,4>, lane 2 + 1007951877U, // <0,1,4,5>: Cost 1 ins LHS, lane 5 + 1680493878U, // <0,1,4,6>: Cost 2 vuzpl <0,4,1,5>, RHS + 2081685506U, // <0,1,4,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <0,1,4,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,5,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2133835780U, // <0,1,5,1>: Cost 2 ins , lane 4 + 2081644546U, // <0,1,5,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,5,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,5,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2133868548U, // <0,1,5,5>: Cost 2 ins , lane 4 + 2133876740U, // <0,1,5,6>: Cost 2 ins , lane 4 + 2133884932U, // <0,1,5,7>: Cost 2 ins , lane 4 + 1007910914U, // <0,1,5,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,6,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2081636354U, // <0,1,6,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2133917700U, // <0,1,6,2>: Cost 2 ins , lane 4 + 1007910914U, // <0,1,6,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,6,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // <0,1,6,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2133950468U, // <0,1,6,6>: Cost 2 ins , lane 4 + 1060216836U, // <0,1,6,7>: Cost 1 ins RHS, lane 4 + 1007910914U, // <0,1,6,u>: Cost 1 ins LHS, lane 2 + 2133975044U, // <0,1,7,0>: Cost 2 ins , lane 4 + 2081636354U, // <0,1,7,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // <0,1,7,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,7,3>: Cost 1 ins LHS, lane 2 + 2134007812U, // <0,1,7,4>: Cost 2 ins , lane 4 + 2081669122U, // <0,1,7,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2134024196U, // <0,1,7,6>: Cost 2 ins , lane 4 + 2134032388U, // <0,1,7,7>: Cost 2 ins , lane 4 + 1007910914U, // <0,1,7,u>: Cost 1 ins LHS, lane 2 + 1007509507U, // <0,1,u,0>: Cost 1 ins LHS, lane 3 + 1007951877U, // <0,1,u,1>: Cost 1 ins LHS, lane 5 + 1007509507U, // <0,1,u,2>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1007509507U, // <0,1,u,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,u,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,u,6>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,u,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 1678557184U, // <0,2,0,0>: Cost 2 vuzpl LHS, <0,0,0,0> + 1678598154U, // <0,2,0,1>: Cost 2 vuzpl LHS, <0,0,1,1> + 604815462U, // <0,2,0,2>: Cost 1 vuzpl LHS, LHS + 2081767427U, // <0,2,0,3>: Cost 2 ins <0,2,0,u>, lane 3 + 1678598348U, // <0,2,0,4>: Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // <0,2,0,5>: Cost 2 ins <0,2,0,u>, lane 3 + 2082340866U, // <0,2,0,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2081767427U, // <0,2,0,7>: Cost 2 ins <0,2,0,u>, lane 3 + 604815516U, // <0,2,0,u>: Cost 1 vuzpl LHS, LHS + 2752340940U, // <0,2,1,0>: Cost 3 vuzpl LHS, <1,3,0,0> + 1678558004U, // <0,2,1,1>: Cost 2 vuzpl LHS, <1,1,1,1> + 1812776552U, // <0,2,1,2>: Cost 2 vzipl LHS, <2,2,2,2> + 1678557942U, // <0,2,1,3>: Cost 2 vuzpl LHS, <1,0,3,2> + 2752340982U, // <0,2,1,4>: Cost 3 vuzpl LHS, <1,3,4,6> + 1678599168U, // <0,2,1,5>: Cost 2 vuzpl LHS, <1,3,5,7> + 1812817850U, // <0,2,1,6>: Cost 2 vzipl LHS, <2,6,3,7> + 2860466282U, // <0,2,1,7>: Cost 3 vuzpr <7,0,1,2>, <0,1,2,7> + 1678598947U, // <0,2,1,u>: Cost 2 vuzpl LHS, <1,0,u,2> + 1678558886U, // <0,2,2,0>: Cost 2 vuzpl LHS, <2,3,0,1> + 2085838849U, // 
<0,2,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 1678558824U, // <0,2,2,2>: Cost 2 vuzpl LHS, <2,2,2,2> + 1012113409U, // <0,2,2,3>: Cost 1 ins LHS, lane 1 + 1678558926U, // <0,2,2,4>: Cost 2 vuzpl LHS, <2,3,4,5> + 2085871617U, // <0,2,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // <0,2,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // <0,2,2,7>: Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // <0,2,2,u>: Cost 1 ins LHS, lane 1 + 2129698816U, // <0,2,3,0>: Cost 2 ins , lane 0 + 1678559382U, // <0,2,3,1>: Cost 2 vuzpl LHS, <3,0,1,2> + 2082308098U, // <0,2,3,2>: Cost 2 ins <0,2,u,2>, lane 2 + 1678559644U, // <0,2,3,3>: Cost 2 vuzpl LHS, <3,3,3,3> + 2129731584U, // <0,2,3,4>: Cost 2 ins , lane 0 + 1678559746U, // <0,2,3,5>: Cost 2 vuzpl LHS, <3,4,5,6> + 2082340866U, // <0,2,3,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2824782848U, // <0,2,3,7>: Cost 3 vuzpr <1,0,3,2>, <1,3,5,7> + 1678559445U, // <0,2,3,u>: Cost 2 vuzpl LHS, <3,0,u,2> + 2082062339U, // <0,2,4,0>: Cost 2 ins <0,2,4,u>, lane 3 + 2082062339U, // <0,2,4,1>: Cost 2 ins <0,2,4,u>, lane 3 + 2082308098U, // <0,2,4,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2082062339U, // <0,2,4,3>: Cost 2 ins <0,2,4,u>, lane 3 + 2082062339U, // <0,2,4,4>: Cost 2 ins <0,2,4,u>, lane 3 + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 604818742U, // <0,2,4,6>: Cost 1 vuzpl LHS, RHS + 2082062339U, // <0,2,4,7>: Cost 2 ins <0,2,4,u>, lane 3 + 604818760U, // <0,2,4,u>: Cost 1 vuzpl LHS, RHS + 3105260438U, // <0,2,5,0>: Cost 3 vtrnr <3,0,4,5>, <1,2,3,0> + 1678561408U, // <0,2,5,1>: Cost 2 vuzpl LHS, <5,7,1,3> + 2082308098U, // <0,2,5,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2086076417U, // <0,2,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2756947554U, // <0,2,5,4>: Cost 3 vuzpl LHS, <5,0,4,1> + 1678561284U, // <0,2,5,5>: Cost 2 vuzpl LHS, <5,5,5,5> + 2082340866U, // <0,2,5,6>: Cost 2 ins <0,2,u,6>, lane 2 + 1751043382U, // <0,2,5,7>: Cost 2 vuzpr <1,0,3,2>, RHS + 1751043383U, // <0,2,5,u>: Cost 2 vuzpr <1,0,3,2>, RHS + 1678562126U, // <0,2,6,0>: Cost 2 vuzpl LHS, <6,7,0,1> + 2756948257U, // <0,2,6,1>: Cost 3 vuzpl LHS, <6,0,1,2> + 2082308098U, // <0,2,6,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2086150145U, // <0,2,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 1678562166U, // <0,2,6,4>: Cost 2 vuzpl LHS, <6,7,4,5> + 2756948621U, // <0,2,6,5>: Cost 3 vuzpl LHS, <6,4,5,6> + 2082340866U, // <0,2,6,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2082357253U, // <0,2,6,7>: Cost 2 ins <0,2,u,u>, lane 5 + 2082308098U, // <0,2,6,u>: Cost 2 ins <0,2,u,2>, lane 2 + 3099378582U, // <0,2,7,0>: Cost 3 vtrnr <2,0,5,7>, <1,2,3,0> + 1678562298U, // <0,2,7,1>: Cost 2 vuzpl LHS, <7,0,1,2> + 2082308098U, // <0,2,7,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2130018304U, // <0,2,7,3>: Cost 2 ins , lane 0 + 2645136742U, // <0,2,7,4>: Cost 3 vext2 <4,6,0,2>, <7,4,5,6> + 1678562662U, // <0,2,7,5>: Cost 2 vuzpl LHS, <7,4,5,6> + 2082340866U, // <0,2,7,6>: Cost 2 ins <0,2,u,6>, lane 2 + 1678562924U, // <0,2,7,7>: Cost 2 vuzpl LHS, <7,7,7,7> + 2082308098U, // <0,2,7,u>: Cost 2 ins <0,2,u,2>, lane 2 + 1947436710U, // <0,2,u,0>: Cost 2 vtrnl LHS, <2,3,0,1> + 1678603987U, // <0,2,u,1>: Cost 2 vuzpl LHS, + 604821294U, // <0,2,u,2>: Cost 1 vuzpl LHS, LHS + 1012113409U, // <0,2,u,3>: Cost 1 ins LHS, lane 1 + 1947436750U, // <0,2,u,4>: Cost 2 vtrnl LHS, <2,3,4,5> + 1678604351U, // <0,2,u,5>: Cost 2 vuzpl LHS, + 604821658U, // <0,2,u,6>: Cost 1 vuzpl LHS, RHS + 1751043625U, // <0,2,u,7>: Cost 2 vuzpr <1,0,3,2>, RHS + 604821348U, // <0,2,u,u>: Cost 1 vuzpl LHS, LHS + 2085683201U, // <0,3,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2130149376U, // 
<0,3,0,1>: Cost 2 ins , lane 0 + 2085699585U, // <0,3,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1745002517U, // <0,3,0,3>: Cost 2 vuzpr <0,0,2,3>, <0,0,2,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 3021244930U, // <0,3,0,5>: Cost 3 vtrnl <0,2,0,2>, <3,4,5,6> + 3159474177U, // <0,3,0,6>: Cost 3 ins <0,u,0,6>, lane 1 + 2952791184U, // <0,3,0,7>: Cost 3 vzipr <0,0,0,0>, <1,5,3,7> + 2130149376U, // <0,3,0,u>: Cost 2 ins , lane 0 + 1812777110U, // <0,3,1,0>: Cost 2 vzipl LHS, <3,0,1,2> + 2085765121U, // <0,3,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2886519105U, // <0,3,1,2>: Cost 3 vzipl LHS, <3,2,2,2> + 1812777372U, // <0,3,1,3>: Cost 2 vzipl LHS, <3,3,3,3> + 1812777474U, // <0,3,1,4>: Cost 2 vzipl LHS, <3,4,5,6> + 2085797889U, // <0,3,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 3159547905U, // <0,3,1,6>: Cost 3 ins <0,u,1,6>, lane 1 + 2966733968U, // <0,3,1,7>: Cost 3 vzipr <2,3,0,1>, <1,5,3,7> + 1812777758U, // <0,3,1,u>: Cost 2 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1946994838U, // <0,3,2,1>: Cost 2 vtrnl LHS, <3,0,1,2> + 2085847041U, // <0,3,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,3,2,3>: Cost 1 ins LHS, lane 1 + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 1946995202U, // <0,3,2,5>: Cost 2 vtrnl LHS, <3,4,5,6> + 2085879809U, // <0,3,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // <0,3,2,7>: Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // <0,3,2,u>: Cost 1 ins LHS, lane 1 + 2887747734U, // <0,3,3,0>: Cost 3 vzipl <0,3,1,0>, <3,0,1,2> + 2753022102U, // <0,3,3,1>: Cost 3 vuzpl <0,2,3,1>, <3,0,1,2> + 2965422838U, // <0,3,3,2>: Cost 3 vzipr <2,1,0,3>, <1,0,3,2> + 2130386944U, // <0,3,3,3>: Cost 2 ins , lane 0 + 2887780866U, // <0,3,3,4>: Cost 3 vzipl <0,3,1,4>, <3,4,5,6> + 2753055234U, // <0,3,3,5>: Cost 3 vuzpl <0,2,3,5>, <3,4,5,6> + 2752375389U, // <0,3,3,6>: Cost 3 vuzpl <0,1,3,3>, <3,5,6,7> + 3204161536U, // <0,3,3,7>: Cost 3 ins , lane 0 + 2130386944U, // <0,3,3,u>: Cost 2 ins , lane 0 + 2888452246U, // <0,3,4,0>: Cost 3 vzipl <0,4,1,5>, <3,0,1,2> + 3021572246U, // <0,3,4,1>: Cost 3 vtrnl <0,2,4,6>, <3,0,1,2> + 3021572257U, // <0,3,4,2>: Cost 3 vtrnl <0,2,4,6>, <3,0,2,4> + 2086002689U, // <0,3,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2888452610U, // <0,3,4,4>: Cost 3 vzipl <0,4,1,5>, <3,4,5,6> + 2130477056U, // <0,3,4,5>: Cost 2 ins , lane 0 + 2086027265U, // <0,3,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 2818747621U, // <0,3,4,7>: Cost 3 vuzpr <0,0,2,3>, <4,4,6,7> + 2130477056U, // <0,3,4,u>: Cost 2 ins , lane 0 + 3204251648U, // <0,3,5,0>: Cost 3 ins , lane 0 + 3204259840U, // <0,3,5,1>: Cost 3 ins , lane 0 + 2961457910U, // <0,3,5,2>: Cost 3 vzipr <1,4,0,5>, <1,0,3,2> + 2086076417U, // <0,3,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3204292608U, // <0,3,5,5>: Cost 3 ins , lane 0 + 2653769826U, // <0,3,5,6>: Cost 3 vext2 <6,1,0,3>, <5,6,7,0> + 2130567168U, // <0,3,5,7>: Cost 2 ins , lane 0 + 2130567168U, // <0,3,5,u>: Cost 2 ins , lane 0 + 2854506594U, // <0,3,6,0>: Cost 3 vuzpr <6,0,1,3>, <5,6,7,0> + 2653770090U, // <0,3,6,1>: Cost 3 vext2 <6,1,0,3>, <6,1,0,3> + 3204341760U, // <0,3,6,2>: Cost 3 ins , lane 0 + 2086150145U, // <0,3,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3204358144U, // <0,3,6,4>: Cost 3 ins , lane 0 + 3204366336U, // <0,3,6,5>: Cost 3 ins , lane 0 + 3204374528U, // <0,3,6,6>: Cost 3 ins , lane 0 + 2130640896U, // <0,3,6,7>: Cost 2 ins , lane 0 + 2086150145U, // <0,3,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2968109974U, // <0,3,7,0>: Cost 3 vzipr <2,5,0,7>, <1,2,3,0> 
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 2660406420U, // <0,3,7,2>: Cost 3 vext2 <7,2,0,3>, <7,2,0,3> + 2086223873U, // <0,3,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3204431872U, // <0,3,7,4>: Cost 3 ins , lane 0 + 3204440064U, // <0,3,7,5>: Cost 3 ins , lane 0 + 2752378305U, // <0,3,7,6>: Cost 3 vuzpl <0,1,3,3>, <7,5,6,7> + 3204456448U, // <0,3,7,7>: Cost 3 ins , lane 0 + 2086223873U, // <0,3,7,u>: Cost 2 ins <0,u,7,3>, lane 1 + 1817421974U, // <0,3,u,0>: Cost 2 vzipl LHS, <3,0,1,2> + 1947437206U, // <0,3,u,1>: Cost 2 vtrnl LHS, <3,0,1,2> + 2085699585U, // <0,3,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,3,u,3>: Cost 1 ins LHS, lane 1 + 1817422338U, // <0,3,u,4>: Cost 2 vzipl LHS, <3,4,5,6> + 1947437570U, // <0,3,u,5>: Cost 2 vtrnl LHS, <3,4,5,6> + 2085879809U, // <0,3,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2130567168U, // <0,3,u,7>: Cost 2 ins , lane 0 + 1012113409U, // <0,3,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,4,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2083684357U, // <0,4,0,1>: Cost 2 ins <0,4,u,u>, lane 5 + 1679392870U, // <0,4,0,2>: Cost 2 vuzpl <0,2,4,6>, LHS + 2085707777U, // <0,4,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1679392972U, // <0,4,0,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 2083659778U, // <0,4,0,5>: Cost 2 ins <0,4,u,5>, lane 2 + 1947503926U, // <0,4,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS + 3156836355U, // <0,4,0,7>: Cost 3 ins <0,4,0,u>, lane 3 + 1947503944U, // <0,4,0,u>: Cost 2 vtrnl <0,2,0,2>, RHS + 2083168259U, // <0,4,1,0>: Cost 2 ins <0,4,1,u>, lane 3 + 2085765121U, // <0,4,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2083168259U, // <0,4,1,2>: Cost 2 ins <0,4,1,u>, lane 3 + 2083168259U, // <0,4,1,3>: Cost 2 ins <0,4,1,u>, lane 3 + 2083168259U, // <0,4,1,4>: Cost 2 ins <0,4,1,u>, lane 3 + 739036470U, // <0,4,1,5>: Cost 1 vzipl LHS, RHS + 1948929334U, // <0,4,1,6>: Cost 2 vtrnl <0,4,1,5>, RHS + 2083168259U, // <0,4,1,7>: Cost 2 ins <0,4,1,u>, lane 3 + 739036713U, // <0,4,1,u>: Cost 1 vzipl LHS, RHS + 2083241987U, // <0,4,2,0>: Cost 2 ins <0,4,2,u>, lane 3 + 2083241987U, // <0,4,2,1>: Cost 2 ins <0,4,2,u>, lane 3 + 2085847041U, // <0,4,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,4,2,3>: Cost 1 ins LHS, lane 1 + 2083241987U, // <0,4,2,4>: Cost 2 ins <0,4,2,u>, lane 3 + 1813286198U, // <0,4,2,5>: Cost 2 vzipl <0,2,0,2>, RHS + 873254198U, // <0,4,2,6>: Cost 1 vtrnl LHS, RHS + 2083241987U, // <0,4,2,7>: Cost 2 ins <0,4,2,u>, lane 3 + 873254216U, // <0,4,2,u>: Cost 1 vtrnl LHS, RHS + 3020811514U, // <0,4,3,0>: Cost 3 vtrnl <0,1,3,3>, <4,5,0,1> + 2753136790U, // <0,4,3,1>: Cost 3 vuzpl <0,2,4,6>, <3,0,1,2> + 2753136801U, // <0,4,3,2>: Cost 3 vuzpl <0,2,4,6>, <3,0,2,4> + 2085928961U, // <0,4,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3204800512U, // <0,4,3,4>: Cost 3 ins , lane 0 + 2083659778U, // <0,4,3,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2083667970U, // <0,4,3,6>: Cost 2 ins <0,4,u,6>, lane 2 + 3087183077U, // <0,4,3,7>: Cost 3 vtrnr <0,0,2,3>, <4,4,6,7> + 2083659778U, // <0,4,3,u>: Cost 2 ins <0,4,u,5>, lane 2 + 2753137995U, // <0,4,4,0>: Cost 3 vuzpl <0,2,4,6>, <4,6,0,1> + 2888453090U, // <0,4,4,1>: Cost 3 vzipl <0,4,1,5>, <4,1,5,0> + 2888535100U, // <0,4,4,2>: Cost 3 vzipl <0,4,2,6>, <4,2,6,0> + 2086002689U, // <0,4,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2131132416U, // <0,4,4,4>: Cost 2 ins , lane 0 + 1814711606U, // <0,4,4,5>: Cost 2 vzipl <0,4,1,5>, RHS + 1679396150U, // <0,4,4,6>: Cost 2 vuzpl <0,2,4,6>, RHS + 3157131267U, // <0,4,4,7>: Cost 3 ins <0,4,4,u>, lane 3 + 1679396168U, // <0,4,4,u>: Cost 2 vuzpl <0,2,4,6>, RHS + 2568388710U, // 
<0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3204931584U, // <0,4,5,2>: Cost 3 ins , lane 0 + 2086076417U, // <0,4,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2131214336U, // <0,4,5,5>: Cost 2 ins , lane 0 + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2830699830U, // <0,4,5,7>: Cost 3 vuzpr <2,0,2,4>, RHS + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2712227146U, // <0,4,6,0>: Cost 3 vext3 <4,6,0,0>, <4,6,0,0> + 2753138977U, // <0,4,6,1>: Cost 3 vuzpl <0,2,4,6>, <6,0,1,2> + 2753138988U, // <0,4,6,2>: Cost 3 vuzpl <0,2,4,6>, <6,0,2,4> + 2086150145U, // <0,4,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 2712522094U, // <0,4,6,4>: Cost 3 vext3 <4,6,4,0>, <4,6,4,0> + 2083659778U, // <0,4,6,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2131296256U, // <0,4,6,6>: Cost 2 ins , lane 0 + 2083684357U, // <0,4,6,7>: Cost 2 ins <0,4,u,u>, lane 5 + 2083659778U, // <0,4,6,u>: Cost 2 ins <0,4,u,5>, lane 2 + 3021106426U, // <0,4,7,0>: Cost 3 vtrnl <0,1,7,3>, <4,5,0,1> + 2860487502U, // <0,4,7,1>: Cost 3 vuzpr <7,0,1,4>, <6,7,0,1> + 3157377026U, // <0,4,7,2>: Cost 3 ins <0,4,u,2>, lane 2 + 2086223873U, // <0,4,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3205095424U, // <0,4,7,4>: Cost 3 ins , lane 0 + 2083659778U, // <0,4,7,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2131369984U, // <0,4,7,6>: Cost 2 ins , lane 0 + 2752452204U, // <0,4,7,7>: Cost 3 vuzpl <0,1,4,3>, <7,7,7,7> + 2083659778U, // <0,4,7,u>: Cost 2 ins <0,4,u,5>, lane 2 + 2083168259U, // <0,4,u,0>: Cost 2 ins <0,4,1,u>, lane 3 + 2083684357U, // <0,4,u,1>: Cost 2 ins <0,4,u,u>, lane 5 + 1679398702U, // <0,4,u,2>: Cost 2 vuzpl <0,2,4,6>, LHS + 1012113409U, // <0,4,u,3>: Cost 1 ins LHS, lane 1 + 1679392972U, // <0,4,u,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 743681334U, // <0,4,u,5>: Cost 1 vzipl LHS, RHS + 873696566U, // <0,4,u,6>: Cost 1 vtrnl LHS, RHS + 2083168259U, // <0,4,u,7>: Cost 2 ins <0,4,1,u>, lane 3 + 873696584U, // <0,4,u,u>: Cost 1 vtrnl LHS, RHS + 2085683201U, // <0,5,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2131476480U, // <0,5,0,1>: Cost 2 ins , lane 0 + 2085699585U, // <0,5,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 2085707777U, // <0,5,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 3159457793U, // <0,5,0,4>: Cost 3 ins <0,u,0,4>, lane 1 + 1678778497U, // <0,5,0,5>: Cost 2 vuzpl <0,1,5,3>, <0,1,5,3> + 3159474177U, // <0,5,0,6>: Cost 3 ins <0,u,0,6>, lane 1 + 2013269302U, // <0,5,0,7>: Cost 2 vtrnr <0,0,0,0>, RHS + 2085699585U, // <0,5,0,u>: Cost 2 ins <0,u,0,2>, lane 1 + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2085765121U, // <0,5,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 3159515137U, // <0,5,1,2>: Cost 3 ins <0,u,1,2>, lane 1 + 2085781505U, // <0,5,1,3>: Cost 2 ins <0,u,1,3>, lane 1 + 1812778950U, // <0,5,1,4>: Cost 2 vzipl LHS, <5,4,7,6> + 2085797889U, // <0,5,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1812779106U, // <0,5,1,6>: Cost 2 vzipl LHS, <5,6,7,0> + 2013351222U, // <0,5,1,7>: Cost 2 vtrnr <0,0,1,1>, RHS + 2085765121U, // <0,5,1,u>: Cost 2 ins <0,u,1,1>, lane 1 + 2085830657U, // <0,5,2,0>: Cost 2 ins <0,u,2,0>, lane 1 + 1946996864U, // <0,5,2,1>: Cost 2 vtrnl LHS, <5,7,1,3> + 2085847041U, // <0,5,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,5,2,3>: Cost 1 ins LHS, lane 1 + 2085863425U, // <0,5,2,4>: Cost 2 ins <0,u,2,4>, lane 1 + 1946996740U, // <0,5,2,5>: Cost 2 vtrnl LHS, <5,5,5,5> + 2085879809U, // <0,5,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2019478838U, // <0,5,2,7>: Cost 2 vtrnr <1,0,3,2>, RHS + 1012113409U, // <0,5,2,u>: 
Cost 1 ins LHS, lane 1 + 2637858966U, // <0,5,3,0>: Cost 3 vext2 <3,4,0,5>, <3,0,1,2> + 3205439488U, // <0,5,3,1>: Cost 3 ins , lane 0 + 3087183153U, // <0,5,3,2>: Cost 3 vtrnr <0,0,2,3>, <4,5,6,2> + 2085928961U, // <0,5,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3205472256U, // <0,5,3,5>: Cost 3 ins , lane 0 + 3205480448U, // <0,5,3,6>: Cost 3 ins , lane 0 + 2131746816U, // <0,5,3,7>: Cost 2 ins , lane 0 + 2131746816U, // <0,5,3,u>: Cost 2 ins , lane 0 + 2888453704U, // <0,5,4,0>: Cost 3 vzipl <0,4,1,5>, <5,0,1,2> + 3159728129U, // <0,5,4,1>: Cost 3 ins <0,u,4,1>, lane 1 + 3159736321U, // <0,5,4,2>: Cost 3 ins <0,u,4,2>, lane 1 + 2086002689U, // <0,5,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2888454068U, // <0,5,4,4>: Cost 3 vzipl <0,4,1,5>, <5,4,5,6> + 2131804160U, // <0,5,4,5>: Cost 2 ins , lane 0 + 2086027265U, // <0,5,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 2131820544U, // <0,5,4,7>: Cost 2 ins , lane 0 + 2086027265U, // <0,5,4,u>: Cost 2 ins <0,u,4,6>, lane 1 + 3205578752U, // <0,5,5,0>: Cost 3 ins , lane 0 + 2997291922U, // <0,5,5,1>: Cost 3 vzipr <7,4,0,5>, <4,0,5,1> + 2752523939U, // <0,5,5,2>: Cost 3 vuzpl <0,1,5,3>, <5,1,2,3> + 2086076417U, // <0,5,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3205611520U, // <0,5,5,4>: Cost 3 ins , lane 0 + 2131877888U, // <0,5,5,5>: Cost 2 ins , lane 0 + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2131894272U, // <0,5,5,7>: Cost 2 ins , lane 0 + 2086076417U, // <0,5,5,u>: Cost 2 ins <0,u,5,3>, lane 1 + 2131910656U, // <0,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <0,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <0,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <0,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <0,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <0,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <0,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <0,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <0,5,6,u>: Cost 1 ins RHS, lane 0 + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 2086223873U, // <0,5,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2132041728U, // <0,5,7,7>: Cost 2 ins , lane 0 + 2132041728U, // <0,5,7,u>: Cost 2 ins , lane 0 + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2085765121U, // <0,5,u,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2085699585U, // <0,5,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,5,u,3>: Cost 1 ins LHS, lane 1 + 1817423814U, // <0,5,u,4>: Cost 2 vzipl LHS, <5,4,7,6> + 2085797889U, // <0,5,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,5,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1058226176U, // <0,5,u,7>: Cost 1 ins RHS, lane 0 + 1012113409U, // <0,5,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,6,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2085691393U, // <0,6,0,1>: Cost 2 ins <0,u,0,1>, lane 1 + 2132148224U, // <0,6,0,2>: Cost 2 ins , lane 0 + 2085707777U, // <0,6,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 1678852234U, // <0,6,0,6>: Cost 2 vuzpl <0,1,6,3>, <0,1,6,3> + 1879051574U, // <0,6,0,7>: Cost 2 vzipr <0,0,0,0>, RHS + 2132148224U, // <0,6,0,u>: Cost 2 ins , lane 0 + 2993278336U, // <0,6,1,0>: Cost 3 
vzipr <6,7,0,1>, <4,6,6,0> + 2085765121U, // <0,6,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 1812779514U, // <0,6,1,2>: Cost 2 vzipl LHS, <6,2,7,3> + 2085781505U, // <0,6,1,3>: Cost 2 ins <0,u,1,3>, lane 1 + 3159531521U, // <0,6,1,4>: Cost 3 ins <0,u,1,4>, lane 1 + 2085797889U, // <0,6,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1812779832U, // <0,6,1,6>: Cost 2 vzipl LHS, <6,6,6,6> + 1892994358U, // <0,6,1,7>: Cost 2 vzipr <2,3,0,1>, RHS + 1892994359U, // <0,6,1,u>: Cost 2 vzipr <2,3,0,1>, RHS + 1946997582U, // <0,6,2,0>: Cost 2 vtrnl LHS, <6,7,0,1> + 2085838849U, // <0,6,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 2085847041U, // <0,6,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,6,2,3>: Cost 1 ins LHS, lane 1 + 1946997622U, // <0,6,2,4>: Cost 2 vtrnl LHS, <6,7,4,5> + 2085871617U, // <0,6,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // <0,6,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1880395062U, // <0,6,2,7>: Cost 2 vzipr <0,2,0,2>, RHS + 1012113409U, // <0,6,2,u>: Cost 1 ins LHS, lane 1 + 3122942050U, // <0,6,3,0>: Cost 3 vtrnr <6,0,1,3>, <5,6,7,0> + 2250527010U, // <0,6,3,1>: Cost 3 vrev <6,0,1,3> + 3206111232U, // <0,6,3,2>: Cost 3 ins , lane 0 + 2085928961U, // <0,6,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3206127616U, // <0,6,3,4>: Cost 3 ins , lane 0 + 3206135808U, // <0,6,3,5>: Cost 3 ins , lane 0 + 3206144000U, // <0,6,3,6>: Cost 3 ins , lane 0 + 2132410368U, // <0,6,3,7>: Cost 2 ins , lane 0 + 2132410368U, // <0,6,3,u>: Cost 2 ins , lane 0 + 2888536380U, // <0,6,4,0>: Cost 3 vzipl <0,4,2,6>, <6,0,4,2> + 3021574433U, // <0,6,4,1>: Cost 3 vtrnl <0,2,4,6>, <6,0,1,2> + 3021574444U, // <0,6,4,2>: Cost 3 vtrnl <0,2,4,6>, <6,0,2,4> + 2086002689U, // <0,6,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2086019073U, // <0,6,4,5>: Cost 2 ins <0,u,4,5>, lane 1 + 2132475904U, // <0,6,4,6>: Cost 2 ins , lane 0 + 2954153270U, // <0,6,4,7>: Cost 3 vzipr <0,2,0,4>, RHS + 2132475904U, // <0,6,4,u>: Cost 2 ins , lane 0 + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3206250496U, // <0,6,5,1>: Cost 3 ins , lane 0 + 3206258688U, // <0,6,5,2>: Cost 3 ins , lane 0 + 2086076417U, // <0,6,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3206275072U, // <0,6,5,4>: Cost 3 ins , lane 0 + 3206283264U, // <0,6,5,5>: Cost 3 ins , lane 0 + 3206291456U, // <0,6,5,6>: Cost 3 ins , lane 0 + 2961460534U, // <0,6,5,7>: Cost 3 vzipr <1,4,0,5>, RHS + 2086076417U, // <0,6,5,u>: Cost 2 ins <0,u,5,3>, lane 1 + 2724172540U, // <0,6,6,0>: Cost 3 vext3 <6,6,0,0>, <6,6,0,0> + 2889838972U, // <0,6,6,1>: Cost 3 vzipl <0,6,2,3>, <6,1,2,3> + 2997300124U, // <0,6,6,2>: Cost 3 vzipr <7,4,0,6>, <4,0,6,2> + 2086150145U, // <0,6,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3206348800U, // <0,6,6,4>: Cost 3 ins , lane 0 + 2889839336U, // <0,6,6,5>: Cost 3 vzipl <0,6,2,3>, <6,5,6,7> + 2132623360U, // <0,6,6,6>: Cost 2 ins , lane 0 + 2132631552U, // <0,6,6,7>: Cost 2 ins , lane 0 + 2086150145U, // <0,6,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2132647936U, // <0,6,7,0>: Cost 2 ins , lane 0 + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3206406144U, // <0,6,7,2>: Cost 3 ins , lane 0 + 2086223873U, // <0,6,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 2132680704U, // <0,6,7,4>: Cost 2 ins , lane 0 + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3206438912U, // <0,6,7,6>: Cost 3 ins , lane 0 + 2132705280U, // <0,6,7,7>: Cost 2 ins , lane 0 + 2132647936U, // <0,6,7,u>: Cost 2 ins , lane 0 + 2132647936U, // <0,6,u,0>: Cost 2 ins , lane 0 + 2085765121U, // <0,6,u,1>: Cost 2 ins <0,u,1,1>, 
lane 1 + 2132148224U, // <0,6,u,2>: Cost 2 ins , lane 0 + 1012113409U, // <0,6,u,3>: Cost 1 ins LHS, lane 1 + 2132680704U, // <0,6,u,4>: Cost 2 ins , lane 0 + 2085797889U, // <0,6,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,6,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1880444214U, // <0,6,u,7>: Cost 2 vzipr <0,2,0,u>, RHS + 1012113409U, // <0,6,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,7,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2132803584U, // <0,7,0,1>: Cost 2 ins , lane 0 + 2085699585U, // <0,7,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 2085707777U, // <0,7,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 2580516150U, // <0,7,0,4>: Cost 3 vext1 <5,0,7,0>, RHS + 2580516476U, // <0,7,0,5>: Cost 3 vext1 <5,0,7,0>, <5,0,7,0> + 2586489173U, // <0,7,0,6>: Cost 3 vext1 <6,0,7,0>, <6,0,7,0> + 1678925971U, // <0,7,0,7>: Cost 2 vuzpl <0,1,7,3>, <0,1,7,3> + 2132803584U, // <0,7,0,u>: Cost 2 ins , lane 0 + 1812780026U, // <0,7,1,0>: Cost 2 vzipl LHS, <7,0,1,2> + 2085765121U, // <0,7,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 2132893696U, // <0,7,1,3>: Cost 2 ins , lane 0 + 1812780390U, // <0,7,1,4>: Cost 2 vzipl LHS, <7,4,5,6> + 2085797889U, // <0,7,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2586497366U, // <0,7,1,6>: Cost 3 vext1 <6,0,7,1>, <6,0,7,1> + 1812780652U, // <0,7,1,7>: Cost 2 vzipl LHS, <7,7,7,7> + 2085765121U, // <0,7,1,u>: Cost 2 ins <0,u,1,1>, lane 1 + 2085830657U, // <0,7,2,0>: Cost 2 ins <0,u,2,0>, lane 1 + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 2085847041U, // <0,7,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,7,2,3>: Cost 1 ins LHS, lane 1 + 2085863425U, // <0,7,2,4>: Cost 2 ins <0,u,2,4>, lane 1 + 1946998118U, // <0,7,2,5>: Cost 2 vtrnl LHS, <7,4,5,6> + 2085879809U, // <0,7,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1946998380U, // <0,7,2,7>: Cost 2 vtrnl LHS, <7,7,7,7> + 1012113409U, // <0,7,2,u>: Cost 1 ins LHS, lane 1 + 2989314146U, // <0,7,3,0>: Cost 3 vzipr <6,1,0,3>, <5,6,7,0> + 3206766592U, // <0,7,3,1>: Cost 3 ins , lane 0 + 3020813397U, // <0,7,3,2>: Cost 3 vtrnl <0,1,3,3>, <7,1,2,3> + 2085928961U, // <0,7,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3206791168U, // <0,7,3,4>: Cost 3 ins , lane 0 + 3206799360U, // <0,7,3,5>: Cost 3 ins , lane 0 + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3206815744U, // <0,7,3,7>: Cost 3 ins , lane 0 + 2085928961U, // <0,7,3,u>: Cost 2 ins <0,u,3,3>, lane 1 + 3206832128U, // <0,7,4,0>: Cost 3 ins , lane 0 + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 2086002689U, // <0,7,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 3206864896U, // <0,7,4,4>: Cost 3 ins , lane 0 + 2133131264U, // <0,7,4,5>: Cost 2 ins , lane 0 + 2086027265U, // <0,7,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 3020887660U, // <0,7,4,7>: Cost 3 vtrnl <0,1,4,3>, <7,7,7,7> + 2133131264U, // <0,7,4,u>: Cost 2 ins , lane 0 + 2993311842U, // <0,7,5,0>: Cost 3 vzipr <6,7,0,5>, <5,6,7,0> + 3206914048U, // <0,7,5,1>: Cost 3 ins , lane 0 + 3020960853U, // <0,7,5,2>: Cost 3 vtrnl <0,1,5,3>, <7,1,2,3> + 2086076417U, // <0,7,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3206946816U, // <0,7,5,5>: Cost 3 ins , lane 0 + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2133221376U, // <0,7,5,7>: Cost 2 ins , lane 0 + 2133221376U, // <0,7,5,u>: Cost 2 ins , lane 0 + 2854834274U, // <0,7,6,0>: Cost 3 vuzpr <6,0,5,7>, <5,6,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3206995968U, // <0,7,6,2>: Cost 3 ins , lane 
0 + 2086150145U, // <0,7,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3207012352U, // <0,7,6,4>: Cost 3 ins , lane 0 + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3207028736U, // <0,7,6,6>: Cost 3 ins , lane 0 + 2133295104U, // <0,7,6,7>: Cost 2 ins , lane 0 + 2086150145U, // <0,7,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2992001122U, // <0,7,7,0>: Cost 3 vzipr <6,5,0,7>, <5,6,7,0> + 3207061504U, // <0,7,7,1>: Cost 3 ins , lane 0 + 2752672853U, // <0,7,7,2>: Cost 3 vuzpl <0,1,7,3>, <7,1,2,3> + 2086223873U, // <0,7,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3207086080U, // <0,7,7,4>: Cost 3 ins , lane 0 + 3207094272U, // <0,7,7,5>: Cost 3 ins , lane 0 + 2663093724U, // <0,7,7,6>: Cost 3 vext2 <7,6,0,7>, <7,6,0,7> + 2133368832U, // <0,7,7,7>: Cost 2 ins , lane 0 + 2086223873U, // <0,7,7,u>: Cost 2 ins <0,u,7,3>, lane 1 + 1817424890U, // <0,7,u,0>: Cost 2 vzipl LHS, <7,0,1,2> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2085699585U, // <0,7,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,7,u,3>: Cost 1 ins LHS, lane 1 + 1817425254U, // <0,7,u,4>: Cost 2 vzipl LHS, <7,4,5,6> + 2085797889U, // <0,7,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,7,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2133221376U, // <0,7,u,7>: Cost 2 ins , lane 0 + 1012113409U, // <0,7,u,u>: Cost 1 ins LHS, lane 1 + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1007951877U, // <0,u,0,1>: Cost 1 ins LHS, lane 5 + 605257830U, // <0,u,0,2>: Cost 1 vuzpl LHS, LHS + 1007910914U, // <0,u,0,3>: Cost 1 ins LHS, lane 2 + 1678999756U, // <0,u,0,4>: Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // <0,u,0,5>: Cost 2 ins <0,2,0,u>, lane 3 + 1947506842U, // <0,u,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS + 2081767427U, // <0,u,0,7>: Cost 2 ins <0,2,0,u>, lane 3 + 605257884U, // <0,u,0,u>: Cost 1 vuzpl LHS, LHS + 1812821715U, // <0,u,1,0>: Cost 2 vzipl LHS, + 739039022U, // <0,u,1,1>: Cost 1 vzipl LHS, LHS + 1813264264U, // <0,u,1,2>: Cost 2 vzipl LHS, + 1007910914U, // <0,u,1,3>: Cost 1 ins LHS, lane 2 + 1812822079U, // <0,u,1,4>: Cost 2 vzipl LHS, + 739039386U, // <0,u,1,5>: Cost 1 vzipl LHS, RHS + 1813264592U, // <0,u,1,6>: Cost 2 vzipl LHS, + 1892994376U, // <0,u,1,7>: Cost 2 vzipr <2,3,0,1>, RHS + 739039589U, // <0,u,1,u>: Cost 1 vzipl LHS, LHS + 1007509507U, // <0,u,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,u,2,1>: Cost 1 ins LHS, lane 3 + 873256750U, // <0,u,2,2>: Cost 1 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1007509507U, // <0,u,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,u,2,5>: Cost 1 ins LHS, lane 3 + 873257114U, // <0,u,2,6>: Cost 1 vtrnl LHS, RHS + 1007509507U, // <0,u,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2133680132U, // <0,u,3,0>: Cost 2 ins , lane 4 + 1679001750U, // <0,u,3,1>: Cost 2 vuzpl LHS, <3,0,1,2> + 2128388096U, // <0,u,3,2>: Cost 2 ins , lane 0 + 1007910914U, // <0,u,3,3>: Cost 1 ins LHS, lane 2 + 2133712900U, // <0,u,3,4>: Cost 2 ins , lane 4 + 1679002114U, // <0,u,3,5>: Cost 2 vuzpl LHS, <3,4,5,6> + 2082340866U, // <0,u,3,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2133737476U, // <0,u,3,7>: Cost 2 ins , lane 4 + 1007910914U, // <0,u,3,u>: Cost 1 ins LHS, lane 2 + 2082062339U, // <0,u,4,0>: Cost 2 ins <0,2,4,u>, lane 3 + 1814714158U, // <0,u,4,1>: Cost 2 vzipl <0,4,1,5>, LHS + 1947834158U, // <0,u,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS + 1007910914U, // <0,u,4,3>: Cost 1 ins LHS, lane 2 + 1947828428U, // <0,u,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 1007951877U, // <0,u,4,5>: Cost 1 ins LHS, lane 5 + 605261110U, // <0,u,4,6>: Cost 1 vuzpl LHS, RHS + 
2082062339U, // <0,u,4,7>: Cost 2 ins <0,2,4,u>, lane 3 + 605261128U, // <0,u,4,u>: Cost 1 vuzpl LHS, RHS + 2080964610U, // <0,u,5,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2128527360U, // <0,u,5,1>: Cost 2 ins , lane 0 + 2080980994U, // <0,u,5,2>: Cost 2 ins <0,0,u,2>, lane 2 + 1007910914U, // <0,u,5,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,u,5,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2133868548U, // <0,u,5,5>: Cost 2 ins , lane 4 + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 1751092534U, // <0,u,5,7>: Cost 2 vuzpr <1,0,3,u>, RHS + 1007910914U, // <0,u,5,u>: Cost 1 ins LHS, lane 2 + 1679004494U, // <0,u,6,0>: Cost 2 vuzpl LHS, <6,7,0,1> + 2080972802U, // <0,u,6,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128609280U, // <0,u,6,2>: Cost 2 ins , lane 0 + 1007910914U, // <0,u,6,3>: Cost 1 ins LHS, lane 2 + 1679004534U, // <0,u,6,4>: Cost 2 vuzpl LHS, <6,7,4,5> + 2083659778U, // <0,u,6,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2133950468U, // <0,u,6,6>: Cost 2 ins , lane 4 + 1060216836U, // <0,u,6,7>: Cost 1 ins RHS, lane 4 + 1007910914U, // <0,u,6,u>: Cost 1 ins LHS, lane 2 + 2133975044U, // <0,u,7,0>: Cost 2 ins , lane 4 + 2080972802U, // <0,u,7,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2080980994U, // <0,u,7,2>: Cost 2 ins <0,0,u,2>, lane 2 + 1007910914U, // <0,u,7,3>: Cost 1 ins LHS, lane 2 + 2134007812U, // <0,u,7,4>: Cost 2 ins , lane 4 + 2083659778U, // <0,u,7,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2134024196U, // <0,u,7,6>: Cost 2 ins , lane 4 + 2134032388U, // <0,u,7,7>: Cost 2 ins , lane 4 + 1007910914U, // <0,u,7,u>: Cost 1 ins LHS, lane 2 + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 743683886U, // <0,u,u,1>: Cost 1 vzipl LHS, LHS + 605263662U, // <0,u,u,2>: Cost 1 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1007509507U, // <0,u,u,4>: Cost 1 ins LHS, lane 3 + 743684250U, // <0,u,u,5>: Cost 1 vzipl LHS, RHS + 605264026U, // <0,u,u,6>: Cost 1 vuzpl LHS, RHS + 1007509507U, // <0,u,u,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2128150528U, // <1,0,0,0>: Cost 2 ins , lane 0 + 1818148966U, // <1,0,0,1>: Cost 2 vzipl <1,0,3,2>, LHS + 2086952962U, // <1,0,0,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2891891026U, // <1,0,0,4>: Cost 3 vzipl <1,0,3,2>, <0,4,1,5> + 3165437953U, // <1,0,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3160154115U, // <1,0,0,6>: Cost 3 ins <1,0,0,u>, lane 3 + 3160154115U, // <1,0,0,7>: Cost 3 ins <1,0,0,u>, lane 3 + 1818149533U, // <1,0,0,u>: Cost 2 vzipl <1,0,3,2>, LHS + 1141522514U, // <1,0,1,0>: Cost 2 vrev <0,1,0,1> + 1818656870U, // <1,0,1,1>: Cost 2 vzipl <1,1,1,1>, LHS + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 2091753473U, // <1,0,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1477070134U, // <1,0,1,4>: Cost 2 vext1 <0,1,0,1>, RHS + 2760770560U, // <1,0,1,5>: Cost 3 vuzpl <1,5,0,2>, <1,3,5,7> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3165528065U, // <1,0,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 1819459686U, // <1,0,2,1>: Cost 2 vzipl <1,2,3,0>, LHS + 2128314368U, // <1,0,2,2>: Cost 2 ins , lane 0 + 2087002117U, // <1,0,2,3>: Cost 2 ins <1,0,u,u>, lane 5 + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 2970798548U, // <1,0,2,5>: Cost 3 vzipr <3,0,1,2>, <3,4,0,5> + 3165593601U, // <1,0,2,6>: Cost 3 ins <1,u,2,6>, lane 1 + 2592625730U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,1,0,2> + 1819460253U, // <1,0,2,u>: Cost 2 vzipl <1,2,3,0>, 
LHS + 2014101504U, // <1,0,3,0>: Cost 2 vtrnr LHS, <0,0,0,0> + 2014101514U, // <1,0,3,1>: Cost 2 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2091900929U, // <1,0,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // <1,0,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // <1,0,3,5>: Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // <1,0,3,6>: Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // <1,0,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2667752338U, // <1,0,4,0>: Cost 3 vext2 , <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2086952962U, // <1,0,4,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2819383641U, // <1,0,4,3>: Cost 3 vuzpr <0,1,2,0>, <0,4,2,3> + 2894569810U, // <1,0,4,4>: Cost 3 vzipl <1,4,3,5>, <0,4,1,5> + 2087002117U, // <1,0,4,5>: Cost 2 ins <1,0,u,u>, lane 5 + 2758102326U, // <1,0,4,6>: Cost 3 vuzpl <1,1,0,0>, RHS + 2819386597U, // <1,0,4,7>: Cost 3 vuzpr <0,1,2,0>, <4,4,6,7> + 2086952962U, // <1,0,4,u>: Cost 2 ins <1,0,u,2>, lane 2 + 2955558912U, // <1,0,5,0>: Cost 3 vzipr <0,4,1,5>, <0,0,0,0> + 1821507686U, // <1,0,5,1>: Cost 2 vzipl <1,5,3,7>, LHS + 1954545766U, // <1,0,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS + 3165790209U, // <1,0,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 1141850234U, // <1,0,5,4>: Cost 2 vrev <0,1,4,5> + 3165806593U, // <1,0,5,5>: Cost 3 ins <1,u,5,5>, lane 1 + 3202310144U, // <1,0,5,6>: Cost 3 ins , lane 0 + 2092081153U, // <1,0,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 1954545820U, // <1,0,5,u>: Cost 2 vtrnl <1,3,5,7>, LHS + 3202334720U, // <1,0,6,0>: Cost 3 ins , lane 0 + 2895765606U, // <1,0,6,1>: Cost 3 vzipl <1,6,1,7>, LHS + 2128609280U, // <1,0,6,2>: Cost 2 ins , lane 0 + 2819383803U, // <1,0,6,3>: Cost 3 vuzpr <0,1,2,0>, <0,6,2,3> + 2896060754U, // <1,0,6,4>: Cost 3 vzipl <1,6,5,7>, <0,4,1,5> + 2215673988U, // <1,0,6,5>: Cost 3 vrev <0,1,5,6> + 3165888513U, // <1,0,6,6>: Cost 3 ins <1,u,6,6>, lane 1 + 2087002117U, // <1,0,6,7>: Cost 2 ins <1,0,u,u>, lane 5 + 2128609280U, // <1,0,6,u>: Cost 2 ins , lane 0 + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 2974156454U, // <1,0,7,1>: Cost 3 vzipr <3,5,1,7>, <2,3,0,1> + 2086952962U, // <1,0,7,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2861265024U, // <1,0,7,3>: Cost 3 vuzpr <7,1,3,0>, <5,7,1,3> + 3202441216U, // <1,0,7,4>: Cost 3 ins , lane 0 + 3165954049U, // <1,0,7,5>: Cost 3 ins <1,u,7,5>, lane 1 + 1142014094U, // <1,0,7,6>: Cost 2 vrev <0,1,6,7> + 3165970433U, // <1,0,7,7>: Cost 3 ins <1,u,7,7>, lane 1 + 2086952962U, // <1,0,7,u>: Cost 2 ins <1,0,u,2>, lane 2 + 2014142464U, // <1,0,u,0>: Cost 2 vtrnr LHS, <0,0,0,0> + 2014142474U, // <1,0,u,1>: Cost 2 vtrnr LHS, <0,0,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2091753473U, // <1,0,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2091909121U, // <1,0,u,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // <1,0,u,5>: Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // <1,0,u,6>: Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // <1,0,u,7>: Cost 2 ins <1,u,3,7>, lane 1 + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1818149622U, // <1,1,0,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 1684439142U, // <1,1,0,2>: Cost 2 vuzpl <1,1,1,1>, LHS + 2087624706U, // <1,1,0,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2891891856U, // <1,1,0,5>: Cost 3 vzipl <1,0,3,2>, <1,5,3,7> + 3161391106U, // <1,1,0,6>: Cost 3 ins <1,1,u,6>, lane 2 + 3161399298U, // <1,1,0,7>: Cost 3 ins <1,1,u,7>, lane 2 + 1548894866U, // <1,1,0,u>: 
Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2087149571U, // <1,1,1,2>: Cost 2 ins <1,1,1,u>, lane 3 + 1751548006U, // <1,1,1,3>: Cost 2 vuzpr <1,1,1,1>, LHS + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2087149571U, // <1,1,1,5>: Cost 2 ins <1,1,1,u>, lane 3 + 2087149571U, // <1,1,1,6>: Cost 2 ins <1,1,1,u>, lane 3 + 2087149571U, // <1,1,1,7>: Cost 2 ins <1,1,1,u>, lane 3 + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2128961536U, // <1,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <1,1,2,1>: Cost 2 ins , lane 0 + 1819460502U, // <1,1,2,2>: Cost 2 vzipl <1,2,3,0>, <1,2,3,0> + 1055244288U, // <1,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <1,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <1,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <1,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <1,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <1,1,2,u>: Cost 1 ins LHS, lane 0 + 2091876353U, // <1,1,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2014102324U, // <1,1,3,1>: Cost 2 vtrnr LHS, <1,1,1,1> + 2091892737U, // <1,1,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 940359782U, // <1,1,3,3>: Cost 1 vtrnr LHS, LHS + 2091909121U, // <1,1,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2087297027U, // <1,1,3,5>: Cost 2 ins <1,1,3,u>, lane 3 + 2087297027U, // <1,1,3,6>: Cost 2 ins <1,1,3,u>, lane 3 + 2091933697U, // <1,1,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 940359787U, // <1,1,3,u>: Cost 1 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2087608322U, // <1,1,4,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2894496662U, // <1,1,4,2>: Cost 3 vzipl <1,4,2,5>, <1,2,3,0> + 2087624706U, // <1,1,4,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2014109799U, // <1,1,4,4>: Cost 2 vtrnr <0,1,2,4>, <0,1,2,4> + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 1684442422U, // <1,1,4,6>: Cost 2 vuzpl <1,1,1,1>, RHS + 3161399298U, // <1,1,4,7>: Cost 3 ins <1,1,u,7>, lane 2 + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 3028288624U, // <1,1,5,0>: Cost 3 vtrnl <1,3,5,7>, <1,5,0,2> + 2087608322U, // <1,1,5,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2955561110U, // <1,1,5,2>: Cost 3 vzipr <0,4,1,5>, <3,0,1,2> + 2087624706U, // <1,1,5,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2955558925U, // <1,1,5,4>: Cost 3 vzipr <0,4,1,5>, <0,0,1,4> + 1881817426U, // <1,1,5,5>: Cost 2 vzipr <0,4,1,5>, <0,4,1,5> + 2670415970U, // <1,1,5,6>: Cost 3 vext2 , <5,6,7,0> + 1751551286U, // <1,1,5,7>: Cost 2 vuzpr <1,1,1,1>, RHS + 1751551287U, // <1,1,5,u>: Cost 2 vuzpr <1,1,1,1>, RHS + 3165839361U, // <1,1,6,0>: Cost 3 ins <1,u,6,0>, lane 1 + 2087608322U, // <1,1,6,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2973485206U, // <1,1,6,2>: Cost 3 vzipr <3,4,1,6>, <3,0,1,2> + 2087624706U, // <1,1,6,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2221572948U, // <1,1,6,4>: Cost 3 vrev <1,1,4,6> + 2955567442U, // <1,1,6,5>: Cost 3 vzipr <0,4,1,6>, <0,4,1,5> + 2014126185U, // <1,1,6,6>: Cost 2 vtrnr <0,1,2,6>, <0,1,2,6> + 2087665669U, // <1,1,6,7>: Cost 2 ins <1,1,u,u>, lane 5 + 2087624706U, // <1,1,6,u>: Cost 2 ins <1,1,u,3>, lane 2 + 2670416890U, // <1,1,7,0>: Cost 3 vext2 , <7,0,1,2> + 2087608322U, // <1,1,7,1>: Cost 2 ins <1,1,u,1>, lane 2 + 3203088384U, // <1,1,7,2>: Cost 3 ins , lane 0 + 2129354752U, // <1,1,7,3>: Cost 2 ins , lane 0 + 2670417254U, // <1,1,7,4>: Cost 3 vext2 , <7,4,5,6> + 2221654878U, // <1,1,7,5>: Cost 3 vrev <1,1,5,7> + 3161391106U, // <1,1,7,6>: Cost 3 ins <1,1,u,6>, lane 2 + 2014134378U, // <1,1,7,7>: Cost 2 vtrnr <0,1,2,7>, <0,1,2,7> + 2129354752U, // <1,1,7,u>: 
Cost 2 ins , lane 0 + 1818149622U, // <1,1,u,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 1684444974U, // <1,1,u,2>: Cost 2 vuzpl <1,1,1,1>, LHS + 940400742U, // <1,1,u,3>: Cost 1 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 1684445338U, // <1,1,u,6>: Cost 2 vuzpl <1,1,1,1>, RHS + 1751551529U, // <1,1,u,7>: Cost 2 vuzpr <1,1,1,1>, RHS + 940400747U, // <1,1,u,u>: Cost 1 vtrnr LHS, LHS + 2088263682U, // <1,2,0,0>: Cost 2 ins <1,2,u,0>, lane 2 + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2129494016U, // <1,2,0,2>: Cost 2 ins , lane 0 + 2954854502U, // <1,2,0,3>: Cost 3 vzipr <0,3,1,0>, LHS + 2088296450U, // <1,2,0,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3165437953U, // <1,2,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 2891892666U, // <1,2,0,6>: Cost 3 vzipl <1,0,3,2>, <2,6,3,7> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2088263682U, // <1,2,1,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2091737089U, // <1,2,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 1745657957U, // <1,2,1,2>: Cost 2 vuzpr <0,1,2,2>, <0,1,2,2> + 1884438630U, // <1,2,1,3>: Cost 2 vzipr <0,u,1,1>, LHS + 2088296450U, // <1,2,1,4>: Cost 2 ins <1,2,u,4>, lane 2 + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2958180700U, // <1,2,1,6>: Cost 3 vzipr <0,u,1,1>, <0,4,2,6> + 3165528065U, // <1,2,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1884438635U, // <1,2,1,u>: Cost 2 vzipr <0,u,1,1>, LHS + 2088263682U, // <1,2,2,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2893235754U, // <1,2,2,1>: Cost 3 vzipl <1,2,3,4>, <2,1,4,3> + 2129641472U, // <1,2,2,2>: Cost 2 ins , lane 0 + 1897054310U, // <1,2,2,3>: Cost 2 vzipr <3,0,1,2>, LHS + 2088296450U, // <1,2,2,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3165585409U, // <1,2,2,5>: Cost 3 ins <1,u,2,5>, lane 1 + 2893203386U, // <1,2,2,6>: Cost 3 vzipl <1,2,3,0>, <2,6,3,7> + 2994684010U, // <1,2,2,7>: Cost 3 vzipr <7,0,1,2>, <0,1,2,7> + 1897054315U, // <1,2,2,u>: Cost 2 vzipr <3,0,1,2>, LHS + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 2014101708U, // <1,2,3,6>: Cost 2 vtrnr LHS, <0,2,4,6> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2088263682U, // <1,2,4,0>: Cost 2 ins <1,2,u,0>, lane 2 + 3162013698U, // <1,2,4,1>: Cost 3 ins <1,2,u,1>, lane 2 + 3162021890U, // <1,2,4,2>: Cost 3 ins <1,2,u,2>, lane 2 + 2954887270U, // <1,2,4,3>: Cost 3 vzipr <0,3,1,4>, LHS + 2088296450U, // <1,2,4,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2129821696U, // <1,2,4,6>: Cost 2 ins , lane 0 + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2088263682U, // <1,2,5,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 2955558932U, // <1,2,5,2>: Cost 3 vzipr <0,4,1,5>, <0,0,2,2> + 1881817190U, // <1,2,5,3>: Cost 2 vzipr <0,4,1,5>, LHS + 2088296450U, // <1,2,5,4>: Cost 2 ins <1,2,u,4>, lane 2 + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2955559260U, // <1,2,5,6>: Cost 3 vzipr <0,4,1,5>, <0,4,2,6> + 2092081153U, // <1,2,5,7>: 
Cost 2 ins <1,u,5,7>, lane 1 + 1881817195U, // <1,2,5,u>: Cost 2 vzipr <0,4,1,5>, LHS + 2088263682U, // <1,2,6,0>: Cost 2 ins <1,2,u,0>, lane 2 + 3162013698U, // <1,2,6,1>: Cost 3 ins <1,2,u,1>, lane 2 + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2954240102U, // <1,2,6,3>: Cost 3 vzipr <0,2,1,6>, LHS + 2088296450U, // <1,2,6,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3162046466U, // <1,2,6,5>: Cost 3 ins <1,2,u,5>, lane 2 + 2895914938U, // <1,2,6,6>: Cost 3 vzipl <1,6,3,7>, <2,6,3,7> + 2088329221U, // <1,2,6,7>: Cost 2 ins <1,2,u,u>, lane 5 + 2088263682U, // <1,2,6,u>: Cost 2 ins <1,2,u,0>, lane 2 + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 3203743744U, // <1,2,7,1>: Cost 3 ins , lane 0 + 3203751936U, // <1,2,7,2>: Cost 3 ins , lane 0 + 2130018304U, // <1,2,7,3>: Cost 2 ins , lane 0 + 2088296450U, // <1,2,7,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3203776512U, // <1,2,7,5>: Cost 3 ins , lane 0 + 3203784704U, // <1,2,7,6>: Cost 3 ins , lane 0 + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2014142668U, // <1,2,u,6>: Cost 2 vtrnr LHS, <0,2,4,6> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 1745666048U, // <1,3,0,0>: Cost 2 vuzpr LHS, <0,0,0,0> + 1746108426U, // <1,3,0,1>: Cost 2 vuzpr LHS, <0,0,1,1> + 1745666806U, // <1,3,0,2>: Cost 2 vuzpr LHS, <1,0,3,2> + 2088951810U, // <1,3,0,3>: Cost 2 ins <1,3,u,3>, lane 2 + 2819850253U, // <1,3,0,4>: Cost 3 vuzpr LHS, <0,0,1,4> + 2758984055U, // <1,3,0,5>: Cost 3 vuzpl <1,2,3,0>, <0,4,5,6> + 2867183658U, // <1,3,0,6>: Cost 3 vuzpr LHS, <0,0,4,6> + 2088984578U, // <1,3,0,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745668252U, // <1,3,0,u>: Cost 2 vuzpr LHS, <3,0,1,u> + 2088476675U, // <1,3,1,0>: Cost 2 ins <1,3,1,u>, lane 3 + 1745666868U, // <1,3,1,1>: Cost 2 vuzpr LHS, <1,1,1,1> + 2088476675U, // <1,3,1,2>: Cost 2 ins <1,3,1,u>, lane 3 + 671924326U, // <1,3,1,3>: Cost 1 vuzpr LHS, LHS + 2088476675U, // <1,3,1,4>: Cost 2 ins <1,3,1,u>, lane 3 + 2088476675U, // <1,3,1,5>: Cost 2 ins <1,3,1,u>, lane 3 + 2088476675U, // <1,3,1,6>: Cost 2 ins <1,3,1,u>, lane 3 + 2088984578U, // <1,3,1,7>: Cost 2 ins <1,3,u,7>, lane 2 + 671924331U, // <1,3,1,u>: Cost 1 vuzpr LHS, LHS + 1745666966U, // <1,3,2,0>: Cost 2 vuzpr LHS, <1,2,3,0> + 2819408044U, // <1,3,2,1>: Cost 3 vuzpr LHS, <0,2,1,1> + 1745666212U, // <1,3,2,2>: Cost 2 vuzpr LHS, <0,2,0,2> + 1746110066U, // <1,3,2,3>: Cost 2 vuzpr LHS, <2,2,3,3> + 1745666970U, // <1,3,2,4>: Cost 2 vuzpr LHS, <1,2,3,4> + 2819408066U, // <1,3,2,5>: Cost 3 vuzpr LHS, <0,2,3,5> + 1745666252U, // <1,3,2,6>: Cost 2 vuzpr LHS, <0,2,4,6> + 2088984578U, // <1,3,2,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745666218U, // <1,3,2,u>: Cost 2 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1745667750U, // <1,3,3,1>: Cost 2 vuzpr LHS, <2,3,0,1> + 2091892737U, // <1,3,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 1745667032U, // <1,3,3,3>: Cost 2 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 1745667790U, // <1,3,3,5>: Cost 2 vuzpr LHS, <2,3,4,5> + 2819408868U, // <1,3,3,6>: Cost 3 vuzpr LHS, <1,3,2,6> + 
2014102528U, // <1,3,3,7>: Cost 2 vtrnr LHS, <1,3,5,7> + 1745667037U, // <1,3,3,u>: Cost 2 vuzpr LHS, <1,3,1,u> + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2759019375U, // <1,3,4,1>: Cost 3 vuzpl <1,2,3,4>, <4,0,1,2> + 2759019466U, // <1,3,4,2>: Cost 3 vuzpl <1,2,3,4>, <4,1,2,3> + 2088951810U, // <1,3,4,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1793445072U, // <1,3,4,4>: Cost 2 vuzpr LHS, <4,4,4,4> + 1746108754U, // <1,3,4,5>: Cost 2 vuzpr LHS, <0,4,1,5> + 1745668610U, // <1,3,4,6>: Cost 2 vuzpr LHS, <3,4,5,6> + 2088984578U, // <1,3,4,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745668612U, // <1,3,4,u>: Cost 2 vuzpr LHS, <3,4,5,u> + 2088771587U, // <1,3,5,0>: Cost 2 ins <1,3,5,u>, lane 3 + 2088771587U, // <1,3,5,1>: Cost 2 ins <1,3,5,u>, lane 3 + 2088771587U, // <1,3,5,2>: Cost 2 ins <1,3,5,u>, lane 3 + 2088951810U, // <1,3,5,3>: Cost 2 ins <1,3,u,3>, lane 2 + 2088771587U, // <1,3,5,4>: Cost 2 ins <1,3,5,u>, lane 3 + 1793445892U, // <1,3,5,5>: Cost 2 vuzpr LHS, <5,5,5,5> + 2088771587U, // <1,3,5,6>: Cost 2 ins <1,3,5,u>, lane 3 + 671927606U, // <1,3,5,7>: Cost 1 vuzpr LHS, RHS + 671927607U, // <1,3,5,u>: Cost 1 vuzpr LHS, RHS + 1793445986U, // <1,3,6,0>: Cost 2 vuzpr LHS, <5,6,7,0> + 2867185561U, // <1,3,6,1>: Cost 3 vuzpr LHS, <2,6,0,1> + 1793445196U, // <1,3,6,2>: Cost 2 vuzpr LHS, <4,6,0,2> + 2088951810U, // <1,3,6,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1793445990U, // <1,3,6,4>: Cost 2 vuzpr LHS, <5,6,7,4> + 2849642738U, // <1,3,6,5>: Cost 3 vuzpr <5,1,7,3>, + 1793445236U, // <1,3,6,6>: Cost 2 vuzpr LHS, <4,6,4,6> + 1746110394U, // <1,3,6,7>: Cost 2 vuzpr LHS, <2,6,3,7> + 1746110395U, // <1,3,6,u>: Cost 2 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 1793446734U, // <1,3,7,1>: Cost 2 vuzpr LHS, <6,7,0,1> + 2867187830U, // <1,3,7,2>: Cost 3 vuzpr LHS, <5,7,0,2> + 1793446016U, // <1,3,7,3>: Cost 2 vuzpr LHS, <5,7,1,3> + 2849637679U, // <1,3,7,4>: Cost 3 vuzpr <5,1,7,3>, <1,7,3,4> + 1793446774U, // <1,3,7,5>: Cost 2 vuzpr LHS, <6,7,4,5> + 2867185674U, // <1,3,7,6>: Cost 3 vuzpr LHS, <2,7,3,6> + 1793446056U, // <1,3,7,7>: Cost 2 vuzpr LHS, <5,7,5,7> + 1793446021U, // <1,3,7,u>: Cost 2 vuzpr LHS, <5,7,1,u> + 1746109820U, // <1,3,u,0>: Cost 2 vuzpr LHS, <1,u,3,0> + 2014144166U, // <1,3,u,1>: Cost 2 vtrnr LHS, <2,3,0,1> + 1745668894U, // <1,3,u,2>: Cost 2 vuzpr LHS, <3,u,1,2> + 671924893U, // <1,3,u,3>: Cost 1 vuzpr LHS, LHS + 1746109824U, // <1,3,u,4>: Cost 2 vuzpr LHS, <1,u,3,4> + 2014144206U, // <1,3,u,5>: Cost 2 vtrnr LHS, <2,3,4,5> + 1745668934U, // <1,3,u,6>: Cost 2 vuzpr LHS, <3,u,5,6> + 671927849U, // <1,3,u,7>: Cost 1 vuzpr LHS, RHS + 671924898U, // <1,3,u,u>: Cost 1 vuzpr LHS, LHS + 3165396993U, // <1,4,0,0>: Cost 3 ins <1,u,0,0>, lane 1 + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2758434918U, // <1,4,0,2>: Cost 3 vuzpl <1,1,4,5>, LHS + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 3165429761U, // <1,4,0,4>: Cost 3 ins <1,u,0,4>, lane 1 + 1818152246U, // <1,4,0,5>: Cost 2 vzipl <1,0,3,2>, RHS + 3026537782U, // <1,4,0,6>: Cost 3 vtrnl <1,1,0,0>, RHS + 3162808323U, // <1,4,0,7>: Cost 3 ins <1,4,0,u>, lane 3 + 1818152489U, // <1,4,0,u>: Cost 2 vzipl <1,0,3,2>, RHS + 3204620288U, // <1,4,1,0>: Cost 3 ins , lane 0 + 2091737089U, // <1,4,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3204636672U, // <1,4,1,2>: Cost 3 ins , lane 0 + 2091753473U, // <1,4,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1745674343U, // <1,4,1,4>: Cost 2 vuzpr <0,1,2,4>, <0,1,2,4> + 1818660150U, // <1,4,1,5>: Cost 2 vzipl <1,1,1,1>, RHS + 1952877878U, // 
<1,4,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS + 3165528065U, // <1,4,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1818660393U, // <1,4,1,u>: Cost 2 vzipl <1,1,1,1>, RHS + 2893237103U, // <1,4,2,0>: Cost 3 vzipl <1,2,3,4>, <4,0,1,2> + 2893237194U, // <1,4,2,1>: Cost 3 vzipl <1,2,3,4>, <4,1,2,3> + 3165560833U, // <1,4,2,2>: Cost 3 ins <1,u,2,2>, lane 1 + 2130976768U, // <1,4,2,3>: Cost 2 ins , lane 0 + 2893237467U, // <1,4,2,4>: Cost 3 vzipl <1,2,3,4>, <4,4,5,6> + 1819462966U, // <1,4,2,5>: Cost 2 vzipl <1,2,3,0>, RHS + 2131001344U, // <1,4,2,6>: Cost 2 ins , lane 0 + 3165601793U, // <1,4,2,7>: Cost 3 ins <1,u,2,7>, lane 1 + 1819463209U, // <1,4,2,u>: Cost 2 vzipl <1,2,3,0>, RHS + 2091876353U, // <1,4,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 3027454831U, // <1,4,3,1>: Cost 3 vtrnl <1,2,3,4>, <4,0,1,2> + 2091892737U, // <1,4,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <1,4,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2061880528U, // <1,4,3,4>: Cost 2 vtrnr LHS, <4,4,4,4> + 2014101842U, // <1,4,3,5>: Cost 2 vtrnr LHS, <0,4,1,5> + 2014101852U, // <1,4,3,6>: Cost 2 vtrnr LHS, <0,4,2,6> + 2091933697U, // <1,4,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 2014101845U, // <1,4,3,u>: Cost 2 vtrnr LHS, <0,4,1,u> + 2557100134U, // <1,4,4,0>: Cost 3 vext1 <1,1,4,4>, LHS + 2557100882U, // <1,4,4,1>: Cost 3 vext1 <1,1,4,4>, <1,1,4,4> + 3165708289U, // <1,4,4,2>: Cost 3 ins <1,u,4,2>, lane 1 + 2819416409U, // <1,4,4,3>: Cost 3 vuzpr <0,1,2,4>, <0,4,2,3> + 2131132416U, // <1,4,4,4>: Cost 2 ins , lane 0 + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2758438198U, // <1,4,4,6>: Cost 3 vuzpl <1,1,4,5>, RHS + 2819419365U, // <1,4,4,7>: Cost 3 vuzpr <0,1,2,4>, <4,4,6,7> + 2131132416U, // <1,4,4,u>: Cost 2 ins , lane 0 + 1477394554U, // <1,4,5,0>: Cost 2 vext1 <0,1,4,5>, <0,1,4,5> + 2955558949U, // <1,4,5,1>: Cost 3 vzipr <0,4,1,5>, <0,0,4,1> + 3204931584U, // <1,4,5,2>: Cost 3 ins , lane 0 + 3165790209U, // <1,4,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 1477397814U, // <1,4,5,4>: Cost 2 vext1 <0,1,4,5>, RHS + 1821510966U, // <1,4,5,5>: Cost 2 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2092081153U, // <1,4,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117268U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,4,6> + 3165855745U, // <1,4,6,2>: Cost 3 ins <1,u,6,2>, lane 1 + 2569062662U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,1,4,6> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 2895768886U, // <1,4,6,5>: Cost 3 vzipl <1,6,1,7>, RHS + 2131296256U, // <1,4,6,6>: Cost 2 ins , lane 0 + 2131304448U, // <1,4,6,7>: Cost 2 ins , lane 0 + 2131296256U, // <1,4,6,u>: Cost 2 ins , lane 0 + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3165921281U, // <1,4,7,1>: Cost 3 ins <1,u,7,1>, lane 1 + 3205079040U, // <1,4,7,2>: Cost 3 ins , lane 0 + 2861297792U, // <1,4,7,3>: Cost 3 vuzpr <7,1,3,4>, <5,7,1,3> + 2669778278U, // <1,4,7,4>: Cost 3 vext2 , <7,4,5,6> + 3205103616U, // <1,4,7,5>: Cost 3 ins , lane 0 + 2131369984U, // <1,4,7,6>: Cost 2 ins , lane 0 + 3165970433U, // <1,4,7,7>: Cost 3 ins <1,u,7,7>, lane 1 + 2131369984U, // <1,4,7,u>: Cost 2 ins , lane 0 + 2091876353U, // <1,4,u,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2091737089U, // <1,4,u,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2091892737U, // <1,4,u,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091753473U, // <1,4,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2061921488U, // <1,4,u,4>: Cost 2 vtrnr LHS, <4,4,4,4> + 2014142802U, // <1,4,u,5>: Cost 2 vtrnr 
LHS, <0,4,1,5> + 2014142812U, // <1,4,u,6>: Cost 2 vtrnr LHS, <0,4,2,6> + 2091933697U, // <1,4,u,7>: Cost 2 ins <1,u,3,7>, lane 1 + 2014142805U, // <1,4,u,u>: Cost 2 vtrnr LHS, <0,4,1,u> + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 1686110310U, // <1,5,0,2>: Cost 2 vuzpl <1,3,5,7>, LHS + 3163471875U, // <1,5,0,3>: Cost 3 ins <1,5,0,u>, lane 3 + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 3165437953U, // <1,5,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3164045314U, // <1,5,0,6>: Cost 3 ins <1,5,u,6>, lane 2 + 2090311682U, // <1,5,0,7>: Cost 2 ins <1,5,u,7>, lane 2 + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2091737089U, // <1,5,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2091753473U, // <1,5,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 1686111232U, // <1,5,1,5>: Cost 2 vuzpl <1,3,5,7>, <1,3,5,7> + 2958181456U, // <1,5,1,6>: Cost 3 vzipr <0,u,1,1>, <1,4,5,6> + 2019986742U, // <1,5,1,7>: Cost 2 vtrnr <1,1,1,1>, RHS + 2019986743U, // <1,5,1,u>: Cost 2 vtrnr <1,1,1,1>, RHS + 2759853734U, // <1,5,2,0>: Cost 3 vuzpl <1,3,5,7>, <2,3,0,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2090319877U, // <1,5,2,3>: Cost 2 ins <1,5,u,u>, lane 5 + 2759853774U, // <1,5,2,4>: Cost 3 vuzpl <1,3,5,7>, <2,3,4,5> + 2994687194U, // <1,5,2,5>: Cost 3 vzipr <7,0,1,2>, <4,4,5,5> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 2090311682U, // <1,5,2,7>: Cost 2 ins <1,5,u,7>, lane 2 + 2090319877U, // <1,5,2,u>: Cost 2 ins <1,5,u,u>, lane 5 + 2091876353U, // <1,5,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2089951235U, // <1,5,3,1>: Cost 2 ins <1,5,3,u>, lane 3 + 2091892737U, // <1,5,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <1,5,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // <1,5,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2061881348U, // <1,5,3,5>: Cost 2 vtrnr LHS, <5,5,5,5> + 2089951235U, // <1,5,3,6>: Cost 2 ins <1,5,3,u>, lane 3 + 940363062U, // <1,5,3,7>: Cost 1 vtrnr LHS, RHS + 940363063U, // <1,5,3,u>: Cost 1 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3164012546U, // <1,5,4,2>: Cost 3 ins <1,5,u,2>, lane 2 + 3163766787U, // <1,5,4,3>: Cost 3 ins <1,5,4,u>, lane 3 + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 1686113590U, // <1,5,4,6>: Cost 2 vuzpl <1,3,5,7>, RHS + 2090311682U, // <1,5,4,7>: Cost 2 ins <1,5,u,7>, lane 2 + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2955561954U, // <1,5,5,0>: Cost 3 vzipr <0,4,1,5>, <4,1,5,0> + 2955561874U, // <1,5,5,1>: Cost 3 vzipr <0,4,1,5>, <4,0,5,1> + 3165782017U, // <1,5,5,2>: Cost 3 ins <1,u,5,2>, lane 1 + 2955559851U, // <1,5,5,3>: Cost 3 vzipr <0,4,1,5>, <1,2,5,3> + 2955561958U, // <1,5,5,4>: Cost 3 vzipr <0,4,1,5>, <4,1,5,4> + 2131877888U, // <1,5,5,5>: Cost 2 ins , lane 0 + 2955561474U, // <1,5,5,6>: Cost 3 vzipr <0,4,1,5>, <3,4,5,6> + 2092081153U, // <1,5,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 2092081153U, // <1,5,5,u>: Cost 2 ins <1,u,5,7>, lane 1 + 2131910656U, // <1,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <1,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <1,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // 
<1,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <1,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <1,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <1,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <1,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <1,5,6,u>: Cost 1 ins RHS, lane 0 + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199198U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,5,7> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2759857248U, // <1,5,7,3>: Cost 3 vuzpl <1,3,5,7>, <7,1,3,5> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2759857510U, // <1,5,7,5>: Cost 3 vuzpl <1,3,5,7>, <7,4,5,6> + 2593035086U, // <1,5,7,6>: Cost 3 vext1 <7,1,5,7>, <6,7,0,1> + 2132041728U, // <1,5,7,7>: Cost 2 ins , lane 0 + 2132041728U, // <1,5,7,u>: Cost 2 ins , lane 0 + 2091876353U, // <1,5,u,0>: Cost 2 ins <1,u,3,0>, lane 1 + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 1686116142U, // <1,5,u,2>: Cost 2 vuzpl <1,3,5,7>, LHS + 2091753473U, // <1,5,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1594054682U, // <1,5,u,4>: Cost 2 vext2 , + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 1686116506U, // <1,5,u,6>: Cost 2 vuzpl <1,3,5,7>, RHS + 940404022U, // <1,5,u,7>: Cost 1 vtrnr LHS, RHS + 940404023U, // <1,5,u,u>: Cost 1 vtrnr LHS, RHS + 3205873664U, // <1,6,0,0>: Cost 3 ins , lane 0 + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2132148224U, // <1,6,0,2>: Cost 2 ins , lane 0 + 3087819259U, // <1,6,0,3>: Cost 3 vtrnr <0,1,2,0>, <0,6,2,3> + 2620023123U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,6> + 3165437953U, // <1,6,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3164708866U, // <1,6,0,6>: Cost 3 ins <1,6,u,6>, lane 2 + 2954857782U, // <1,6,0,7>: Cost 3 vzipr <0,3,1,0>, RHS + 2132148224U, // <1,6,0,u>: Cost 2 ins , lane 0 + 3205947392U, // <1,6,1,0>: Cost 3 ins , lane 0 + 2091737089U, // <1,6,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3005959068U, // <1,6,1,2>: Cost 3 vzipr , <4,0,6,2> + 2091753473U, // <1,6,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3205988352U, // <1,6,1,5>: Cost 3 ins , lane 0 + 1745690729U, // <1,6,1,6>: Cost 2 vuzpr <0,1,2,6>, <0,1,2,6> + 1884441910U, // <1,6,1,7>: Cost 2 vzipr <0,u,1,1>, RHS + 1884441911U, // <1,6,1,u>: Cost 2 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 2994687442U, // <1,6,2,1>: Cost 3 vzipr <7,0,1,2>, <4,7,6,1> + 2994686876U, // <1,6,2,2>: Cost 3 vzipr <7,0,1,2>, <4,0,6,2> + 2132303872U, // <1,6,2,3>: Cost 2 ins , lane 0 + 3206053888U, // <1,6,2,4>: Cost 3 ins , lane 0 + 3165585409U, // <1,6,2,5>: Cost 3 ins <1,u,2,5>, lane 1 + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 1897057590U, // <1,6,2,7>: Cost 2 vzipr <3,0,1,2>, RHS + 1897057591U, // <1,6,2,u>: Cost 2 vzipr <3,0,1,2>, RHS + 2061881442U, // <1,6,3,0>: Cost 2 vtrnr LHS, <5,6,7,0> + 2987396400U, // <1,6,3,1>: Cost 3 vzipr <5,7,1,3>, <4,5,6,1> + 2061880652U, // <1,6,3,2>: Cost 2 vtrnr LHS, <4,6,0,2> + 2091900929U, // <1,6,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2061881446U, // <1,6,3,4>: Cost 2 vtrnr LHS, <5,6,7,4> + 3118078194U, // <1,6,3,5>: Cost 3 vtrnr <5,1,7,3>, + 2061880692U, // <1,6,3,6>: Cost 2 vtrnr LHS, <4,6,4,6> + 2014103482U, // <1,6,3,7>: Cost 2 vtrnr LHS, <2,6,3,7> + 2014103483U, // <1,6,3,u>: Cost 2 vtrnr LHS, <2,6,3,u> + 3206168576U, // <1,6,4,0>: Cost 3 ins , lane 0 + 2761256201U, // <1,6,4,1>: Cost 3 vuzpl <1,5,6,7>, <4,5,1,7> + 3164676098U, // <1,6,4,2>: Cost 3 ins <1,6,u,2>, lane 2 + 3087852027U, // <1,6,4,3>: Cost 3 
vtrnr <0,1,2,4>, <0,6,2,3> + 3206201344U, // <1,6,4,4>: Cost 3 ins , lane 0 + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2132475904U, // <1,6,4,6>: Cost 2 ins , lane 0 + 2954890550U, // <1,6,4,7>: Cost 3 vzipr <0,3,1,4>, RHS + 2132475904U, // <1,6,4,u>: Cost 2 ins , lane 0 + 3164659714U, // <1,6,5,0>: Cost 3 ins <1,6,u,0>, lane 2 + 3206250496U, // <1,6,5,1>: Cost 3 ins , lane 0 + 3003337628U, // <1,6,5,2>: Cost 3 vzipr , <4,0,6,2> + 3165790209U, // <1,6,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 3206275072U, // <1,6,5,4>: Cost 3 ins , lane 0 + 3206283264U, // <1,6,5,5>: Cost 3 ins , lane 0 + 3003337956U, // <1,6,5,6>: Cost 3 vzipr , <4,4,6,6> + 1881820470U, // <1,6,5,7>: Cost 2 vzipr <0,4,1,5>, RHS + 1881820471U, // <1,6,5,u>: Cost 2 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 2557264742U, // <1,6,6,1>: Cost 3 vext1 <1,1,6,6>, <1,1,6,6> + 3165855745U, // <1,6,6,2>: Cost 3 ins <1,u,6,2>, lane 1 + 2819432955U, // <1,6,6,3>: Cost 3 vuzpr <0,1,2,6>, <0,6,2,3> + 3206348800U, // <1,6,6,4>: Cost 3 ins , lane 0 + 3206356992U, // <1,6,6,5>: Cost 3 ins , lane 0 + 2132623360U, // <1,6,6,6>: Cost 2 ins , lane 0 + 2132631552U, // <1,6,6,7>: Cost 2 ins , lane 0 + 2132623360U, // <1,6,6,u>: Cost 2 ins , lane 0 + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 3206406144U, // <1,6,7,2>: Cost 3 ins , lane 0 + 3206414336U, // <1,6,7,3>: Cost 3 ins , lane 0 + 2132680704U, // <1,6,7,4>: Cost 2 ins , lane 0 + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2725507979U, // <1,6,7,6>: Cost 3 vext3 <6,u,0,1>, <6,7,6,u> + 2132705280U, // <1,6,7,7>: Cost 2 ins , lane 0 + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2091737089U, // <1,6,u,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2061921612U, // <1,6,u,2>: Cost 2 vtrnr LHS, <4,6,0,2> + 2091753473U, // <1,6,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2061922406U, // <1,6,u,4>: Cost 2 vtrnr LHS, <5,6,7,4> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2061921652U, // <1,6,u,6>: Cost 2 vtrnr LHS, <4,6,4,6> + 2014144442U, // <1,6,u,7>: Cost 2 vtrnr LHS, <2,6,3,7> + 2014144443U, // <1,6,u,u>: Cost 2 vtrnr LHS, <2,6,3,u> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2132803584U, // <1,7,0,1>: Cost 2 ins , lane 0 + 3206553600U, // <1,7,0,2>: Cost 3 ins , lane 0 + 2257286235U, // <1,7,0,3>: Cost 3 vrev <7,1,3,0> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3206578176U, // <1,7,0,5>: Cost 3 ins , lane 0 + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 3165380610U, // <1,7,0,7>: Cost 3 ins <1,7,u,7>, lane 2 + 2132803584U, // <1,7,0,u>: Cost 2 ins , lane 0 + 2581184614U, // <1,7,1,0>: Cost 3 vext1 <5,1,7,1>, LHS + 2091737089U, // <1,7,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3206627328U, // <1,7,1,2>: Cost 3 ins , lane 0 + 2132893696U, // <1,7,1,3>: Cost 2 ins , lane 0 + 2581187894U, // <1,7,1,4>: Cost 3 vext1 <5,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 1745698922U, // <1,7,1,7>: Cost 2 vuzpr <0,1,2,7>, <0,1,2,7> + 2132893696U, // <1,7,1,u>: Cost 2 ins , lane 0 + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 2994687370U, // <1,7,2,1>: Cost 3 vzipr <7,0,1,2>, <4,6,7,1> + 3206701056U, // <1,7,2,2>: Cost 3 ins , lane 0 + 2132967424U, // <1,7,2,3>: Cost 2 ins , lane 0 + 2587168054U, // <1,7,2,4>: Cost 3 
vext1 <6,1,7,2>, RHS + 3206725632U, // <1,7,2,5>: Cost 3 ins , lane 0 + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 2994688024U, // <1,7,2,7>: Cost 3 vzipr <7,0,1,2>, <5,5,7,7> + 2132967424U, // <1,7,2,u>: Cost 2 ins , lane 0 + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2061882190U, // <1,7,3,1>: Cost 2 vtrnr LHS, <6,7,0,1> + 2091892737U, // <1,7,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2061881472U, // <1,7,3,3>: Cost 2 vtrnr LHS, <5,7,1,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2061881512U, // <1,7,3,7>: Cost 2 vtrnr LHS, <5,7,5,7> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3165331458U, // <1,7,4,1>: Cost 3 ins <1,7,u,1>, lane 2 + 2644585539U, // <1,7,4,2>: Cost 3 vext2 <4,5,1,7>, <4,2,6,7> + 2257319007U, // <1,7,4,3>: Cost 3 vrev <7,1,3,4> + 3206864896U, // <1,7,4,4>: Cost 3 ins , lane 0 + 2133131264U, // <1,7,4,5>: Cost 2 ins , lane 0 + 3206881280U, // <1,7,4,6>: Cost 3 ins , lane 0 + 3165380610U, // <1,7,4,7>: Cost 3 ins <1,7,u,7>, lane 2 + 2133131264U, // <1,7,4,u>: Cost 2 ins , lane 0 + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 3028292602U, // <1,7,5,1>: Cost 3 vtrnl <1,3,5,7>, <7,0,1,2> + 3165782017U, // <1,7,5,2>: Cost 3 ins <1,u,5,2>, lane 1 + 3028292704U, // <1,7,5,3>: Cost 3 vtrnl <1,3,5,7>, <7,1,3,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 3028292966U, // <1,7,5,5>: Cost 3 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 2133221376U, // <1,7,5,7>: Cost 2 ins , lane 0 + 2133221376U, // <1,7,5,u>: Cost 2 ins , lane 0 + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3206995968U, // <1,7,6,2>: Cost 3 ins , lane 0 + 3165347842U, // <1,7,6,3>: Cost 3 ins <1,7,u,3>, lane 2 + 2257409130U, // <1,7,6,4>: Cost 3 vrev <7,1,4,6> + 3207020544U, // <1,7,6,5>: Cost 3 ins , lane 0 + 3207028736U, // <1,7,6,6>: Cost 3 ins , lane 0 + 2133295104U, // <1,7,6,7>: Cost 2 ins , lane 0 + 2133295104U, // <1,7,6,u>: Cost 2 ins , lane 0 + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 2861470542U, // <1,7,7,1>: Cost 3 vuzpr <7,1,5,7>, <6,7,0,1> + 3165929473U, // <1,7,7,2>: Cost 3 ins <1,u,7,2>, lane 1 + 2998046416U, // <1,7,7,3>: Cost 3 vzipr <7,5,1,7>, <5,1,7,3> + 3207086080U, // <1,7,7,4>: Cost 3 ins , lane 0 + 2257491060U, // <1,7,7,5>: Cost 3 vrev <7,1,5,7> + 3207102464U, // <1,7,7,6>: Cost 3 ins , lane 0 + 2133368832U, // <1,7,7,7>: Cost 2 ins , lane 0 + 2133368832U, // <1,7,7,u>: Cost 2 ins , lane 0 + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2061923150U, // <1,7,u,1>: Cost 2 vtrnr LHS, <6,7,0,1> + 2091892737U, // <1,7,u,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2061922432U, // <1,7,u,3>: Cost 2 vtrnr LHS, <5,7,1,3> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2061922472U, // <1,7,u,7>: Cost 2 vtrnr LHS, <5,7,5,7> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1745707008U, // <1,u,0,0>: Cost 2 vuzpr LHS, <0,0,0,0> + 1745707018U, // <1,u,0,1>: Cost 2 vuzpr LHS, <0,0,1,1> + 1745707028U, // <1,u,0,2>: Cost 2 vuzpr LHS, <0,0,2,2> + 2087624706U, // <1,u,0,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1546297685U, // <1,u,0,4>: Cost 2 vext2 
<0,4,1,u>, <0,4,1,u> + 1818155162U, // <1,u,0,5>: Cost 2 vzipl <1,0,3,2>, RHS + 2891897040U, // <1,u,0,6>: Cost 3 vzipl <1,0,3,2>, + 2088984578U, // <1,u,0,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745707025U, // <1,u,0,u>: Cost 2 vuzpr LHS, <0,0,1,u> + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 671965286U, // <1,u,1,3>: Cost 1 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1818663066U, // <1,u,1,5>: Cost 2 vzipl <1,1,1,1>, RHS + 1952880794U, // <1,u,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS + 1884441928U, // <1,u,1,7>: Cost 2 vzipr <0,u,1,1>, RHS + 671965291U, // <1,u,1,u>: Cost 1 vuzpr LHS, LHS + 1745707926U, // <1,u,2,0>: Cost 2 vuzpr LHS, <1,2,3,0> + 1819465518U, // <1,u,2,1>: Cost 2 vzipl <1,2,3,0>, LHS + 1745707172U, // <1,u,2,2>: Cost 2 vuzpr LHS, <0,2,0,2> + 1055244288U, // <1,u,2,3>: Cost 1 ins LHS, lane 0 + 1745707930U, // <1,u,2,4>: Cost 2 vuzpr LHS, <1,2,3,4> + 1819465882U, // <1,u,2,5>: Cost 2 vzipl <1,2,3,0>, RHS + 1745707212U, // <1,u,2,6>: Cost 2 vuzpr LHS, <0,2,4,6> + 1897057608U, // <1,u,2,7>: Cost 2 vzipr <3,0,1,2>, RHS + 1055244288U, // <1,u,2,u>: Cost 1 ins LHS, lane 0 + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 2014102162U, // <1,u,3,1>: Cost 2 vtrnr LHS, <0,u,1,1> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 940360349U, // <1,u,3,3>: Cost 1 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 2014102166U, // <1,u,3,5>: Cost 2 vtrnr LHS, <0,u,1,5> + 2014102176U, // <1,u,3,6>: Cost 2 vtrnr LHS, <0,u,2,6> + 940363305U, // <1,u,3,7>: Cost 1 vtrnr LHS, RHS + 940360354U, // <1,u,3,u>: Cost 1 vtrnr LHS, LHS + 2088263682U, // <1,u,4,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2087608322U, // <1,u,4,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2086952962U, // <1,u,4,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2087624706U, // <1,u,4,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1793486032U, // <1,u,4,4>: Cost 2 vuzpr LHS, <4,4,4,4> + 1745707346U, // <1,u,4,5>: Cost 2 vuzpr LHS, <0,4,1,5> + 1745707356U, // <1,u,4,6>: Cost 2 vuzpr LHS, <0,4,2,6> + 2088984578U, // <1,u,4,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745707349U, // <1,u,4,u>: Cost 2 vuzpr LHS, <0,4,1,u> + 2088263682U, // <1,u,5,0>: Cost 2 ins <1,2,u,0>, lane 2 + 1821513518U, // <1,u,5,1>: Cost 2 vzipl <1,5,3,7>, LHS + 1954551598U, // <1,u,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS + 1881817244U, // <1,u,5,3>: Cost 2 vzipr <0,4,1,5>, LHS + 2088296450U, // <1,u,5,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1821513882U, // <1,u,5,5>: Cost 2 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 671968566U, // <1,u,5,7>: Cost 1 vuzpr LHS, RHS + 671968567U, // <1,u,5,u>: Cost 1 vuzpr LHS, RHS + 1793486946U, // <1,u,6,0>: Cost 2 vuzpr LHS, <5,6,7,0> + 2087608322U, // <1,u,6,1>: Cost 2 ins <1,1,u,1>, lane 2 + 1793486156U, // <1,u,6,2>: Cost 2 vuzpr LHS, <4,6,0,2> + 2087624706U, // <1,u,6,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1793486950U, // <1,u,6,4>: Cost 2 vuzpr LHS, <5,6,7,4> + 2131951616U, // <1,u,6,5>: Cost 2 ins , lane 0 + 1793486196U, // <1,u,6,6>: Cost 2 vuzpr LHS, <4,6,4,6> + 1058226176U, // <1,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <1,u,6,u>: Cost 1 ins RHS, lane 0 + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 1793487694U, // <1,u,7,1>: Cost 2 vuzpr LHS, <6,7,0,1> + 2086952962U, // <1,u,7,2>: Cost 2 ins <1,0,u,2>, lane 2 + 1793486976U, // <1,u,7,3>: Cost 2 vuzpr LHS, <5,7,1,3> + 2088296450U, // <1,u,7,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1793487734U, // <1,u,7,5>: Cost 2 vuzpr 
LHS, <6,7,4,5> + 2131369984U, // <1,u,7,6>: Cost 2 ins , lane 0 + 1793487016U, // <1,u,7,7>: Cost 2 vuzpr LHS, <5,7,5,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 671965853U, // <1,u,u,3>: Cost 1 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1745707670U, // <1,u,u,5>: Cost 2 vuzpr LHS, <0,u,1,5> + 1745707680U, // <1,u,u,6>: Cost 2 vuzpr LHS, <0,u,2,6> + 671968809U, // <1,u,u,7>: Cost 1 vuzpr LHS, RHS + 671965858U, // <1,u,u,u>: Cost 1 vuzpr LHS, LHS + 2128150528U, // <2,0,0,0>: Cost 2 ins , lane 0 + 2097635329U, // <2,0,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 1691664486U, // <2,0,0,2>: Cost 2 vuzpl <2,3,0,1>, LHS + 2826094014U, // <2,0,0,3>: Cost 3 vuzpr <1,2,3,0>, <2,0,1,3> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 2826094772U, // <2,0,0,5>: Cost 3 vuzpr <1,2,3,0>, <3,0,4,5> + 3171418113U, // <2,0,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 3094529510U, // <2,0,0,7>: Cost 3 vtrnr <1,2,3,0>, <2,0,5,7> + 1691664540U, // <2,0,0,u>: Cost 2 vuzpl <2,3,0,1>, LHS + 2215927971U, // <2,0,1,0>: Cost 3 vrev <0,2,0,1> + 2128232448U, // <2,0,1,1>: Cost 2 ins , lane 0 + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 1752350822U, // <2,0,1,3>: Cost 2 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 2765407232U, // <2,0,1,5>: Cost 3 vuzpl <2,3,0,1>, <1,3,5,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3166707714U, // <2,0,1,7>: Cost 3 ins <2,0,u,7>, lane 2 + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1142194340U, // <2,0,2,0>: Cost 2 vrev <0,2,0,2> + 1825374310U, // <2,0,2,1>: Cost 2 vzipl <2,2,2,2>, LHS + 1959592038U, // <2,0,2,2>: Cost 2 vtrnl <2,2,2,2>, LHS + 2128322560U, // <2,0,2,3>: Cost 2 ins , lane 0 + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2599259856U, // <2,0,2,5>: Cost 3 vext1 , <5,1,7,3> + 3088351274U, // <2,0,2,6>: Cost 3 vtrnr <0,2,0,2>, <0,0,4,6> + 2599261178U, // <2,0,2,7>: Cost 3 vext1 , <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 1879883776U, // <2,0,3,0>: Cost 2 vzipr LHS, <0,0,0,0> + 1879885478U, // <2,0,3,1>: Cost 2 vzipr LHS, <2,3,0,1> + 1879883940U, // <2,0,3,2>: Cost 2 vzipr LHS, <0,2,0,2> + 2097872897U, // <2,0,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2958270630U, // <2,0,3,4>: Cost 3 vzipr LHS, <0,2,0,4> + 2826094286U, // <2,0,3,5>: Cost 3 vuzpr <1,2,3,0>, <2,3,4,5> + 2958270794U, // <2,0,3,6>: Cost 3 vzipr LHS, <0,4,0,6> + 2097905665U, // <2,0,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883946U, // <2,0,3,u>: Cost 2 vzipr LHS, <0,2,0,u> + 2215952550U, // <2,0,4,0>: Cost 3 vrev <0,2,0,4> + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 1960427622U, // <2,0,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS + 3171688449U, // <2,0,4,3>: Cost 3 ins <2,u,4,3>, lane 1 + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2097963009U, // <2,0,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 1691667766U, // <2,0,4,6>: Cost 2 vuzpl <2,3,0,1>, RHS + 3171721217U, // <2,0,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 1691667784U, // <2,0,4,u>: Cost 2 vuzpl <2,3,0,1>, RHS + 3033596068U, // <2,0,5,0>: Cost 3 vtrnl <2,2,5,7>, <0,2,0,2> + 2128527360U, // <2,0,5,1>: Cost 2 ins , lane 0 + 2955632804U, // <2,0,5,2>: Cost 3 vzipr <0,4,2,5>, <0,2,0,2> + 2216181954U, // <2,0,5,3>: Cost 3 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 2867900420U, // <2,0,5,5>: Cost 3 vuzpr , <5,5,5,5> + 
3202310144U, // <2,0,5,6>: Cost 3 ins , lane 0 + 1752354102U, // <2,0,5,7>: Cost 2 vuzpr <1,2,3,0>, RHS + 1752354103U, // <2,0,5,u>: Cost 2 vuzpr <1,2,3,0>, RHS + 3088678912U, // <2,0,6,0>: Cost 3 vtrnr <0,2,4,6>, <0,0,0,0> + 1828143206U, // <2,0,6,1>: Cost 2 vzipl <2,6,3,7>, LHS + 2128609280U, // <2,0,6,2>: Cost 2 ins , lane 0 + 3171835905U, // <2,0,6,3>: Cost 3 ins <2,u,6,3>, lane 1 + 1142522060U, // <2,0,6,4>: Cost 2 vrev <0,2,4,6> + 3171852289U, // <2,0,6,5>: Cost 3 ins <2,u,6,5>, lane 1 + 2867899764U, // <2,0,6,6>: Cost 3 vuzpr , <4,6,4,6> + 2128650240U, // <2,0,6,7>: Cost 2 ins , lane 0 + 1142817008U, // <2,0,6,u>: Cost 2 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 2867901262U, // <2,0,7,1>: Cost 3 vuzpr , <6,7,0,1> + 2956976292U, // <2,0,7,2>: Cost 3 vzipr <0,6,2,7>, <0,2,0,2> + 2867900544U, // <2,0,7,3>: Cost 3 vuzpr , <5,7,1,3> + 3171917825U, // <2,0,7,4>: Cost 3 ins <2,u,7,4>, lane 1 + 2867901302U, // <2,0,7,5>: Cost 3 vuzpr , <6,7,4,5> + 3166699522U, // <2,0,7,6>: Cost 3 ins <2,0,u,6>, lane 2 + 2867900584U, // <2,0,7,7>: Cost 3 vuzpr , <5,7,5,7> + 2867900549U, // <2,0,7,u>: Cost 3 vuzpr , <5,7,1,u> + 1879924736U, // <2,0,u,0>: Cost 2 vzipr LHS, <0,0,0,0> + 1879926438U, // <2,0,u,1>: Cost 2 vzipr LHS, <2,3,0,1> + 1879924900U, // <2,0,u,2>: Cost 2 vzipr LHS, <0,2,0,2> + 1752351389U, // <2,0,u,3>: Cost 2 vuzpr <1,2,3,0>, LHS + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2097963009U, // <2,0,u,5>: Cost 2 ins <2,u,4,5>, lane 1 + 1691670682U, // <2,0,u,6>: Cost 2 vuzpl <2,3,0,1>, RHS + 1752354345U, // <2,0,u,7>: Cost 2 vuzpr <1,2,3,0>, RHS + 1879924906U, // <2,0,u,u>: Cost 2 vzipr LHS, <0,2,0,u> + 2763497636U, // <2,1,0,0>: Cost 3 vuzpl <2,0,1,2>, <0,2,0,2> + 2097635329U, // <2,1,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 2820130966U, // <2,1,0,2>: Cost 3 vuzpr <0,2,3,1>, <3,0,1,2> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2767487180U, // <2,1,0,4>: Cost 3 vuzpl <2,6,1,3>, <0,2,4,6> + 3033842688U, // <2,1,0,5>: Cost 3 vtrnl <2,3,0,1>, <1,3,5,7> + 3171418113U, // <2,1,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 3171426305U, // <2,1,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551546028U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, <0,2,1,1> + 2128896000U, // <2,1,1,1>: Cost 2 ins , lane 0 + 2954938518U, // <2,1,1,2>: Cost 3 vzipr <0,3,2,1>, <3,0,1,2> + 2128912384U, // <2,1,1,3>: Cost 2 ins , lane 0 + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3202670592U, // <2,1,1,5>: Cost 3 ins , lane 0 + 3202678784U, // <2,1,1,6>: Cost 3 ins , lane 0 + 2953612553U, // <2,1,1,7>: Cost 3 vzipr <0,1,2,1>, <4,5,1,7> + 2128896000U, // <2,1,1,u>: Cost 2 ins , lane 0 + 2128961536U, // <2,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <2,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <2,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <2,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <2,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <2,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <2,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <2,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <2,1,2,u>: Cost 1 ins LHS, lane 0 + 2953625609U, // <2,1,3,0>: Cost 3 vzipr LHS, <0,0,1,0> + 1879883786U, // <2,1,3,1>: Cost 2 vzipr LHS, <0,0,1,1> + 1879885974U, // <2,1,3,2>: Cost 2 vzipr LHS, <3,0,1,2> + 1879884760U, // <2,1,3,3>: Cost 2 vzipr LHS, <1,3,1,3> + 2953625856U, // <2,1,3,4>: Cost 3 vzipr LHS, <0,3,1,4> + 1879884114U, // <2,1,3,5>: Cost 2 vzipr LHS, <0,4,1,5> + 2958270641U, // <2,1,3,6>: Cost 3 vzipr 
LHS, <0,2,1,6> + 2097905665U, // <2,1,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883793U, // <2,1,3,u>: Cost 2 vzipr LHS, <0,0,1,u> + 3171663873U, // <2,1,4,0>: Cost 3 ins <2,u,4,0>, lane 1 + 3094561588U, // <2,1,4,1>: Cost 3 vtrnr <1,2,3,4>, <1,1,1,1> + 2900378522U, // <2,1,4,2>: Cost 3 vzipl <2,4,1,3>, <1,2,3,4> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3171696641U, // <2,1,4,4>: Cost 3 ins <2,u,4,4>, lane 1 + 2097963009U, // <2,1,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 2763500854U, // <2,1,4,6>: Cost 3 vuzpl <2,0,1,2>, RHS + 3171721217U, // <2,1,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 2020819051U, // <2,1,4,u>: Cost 2 vtrnr <1,2,3,4>, LHS + 2551578800U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, <0,2,1,5> + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 2901001110U, // <2,1,5,2>: Cost 3 vzipl <2,5,0,7>, <1,2,3,0> + 2129207296U, // <2,1,5,3>: Cost 2 ins , lane 0 + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3202965504U, // <2,1,5,5>: Cost 3 ins , lane 0 + 3171786753U, // <2,1,5,6>: Cost 3 ins <2,u,5,6>, lane 1 + 2819910966U, // <2,1,5,7>: Cost 3 vuzpr <0,2,0,1>, RHS + 2129207296U, // <2,1,5,u>: Cost 2 ins , lane 0 + 2551586993U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, <0,2,1,6> + 3088679732U, // <2,1,6,1>: Cost 3 vtrnr <0,2,4,6>, <1,1,1,1> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 2014937190U, // <2,1,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 2955641170U, // <2,1,6,5>: Cost 3 vzipr <0,4,2,6>, <0,4,1,5> + 2901886177U, // <2,1,6,6>: Cost 3 vzipl <2,6,3,7>, <1,6,3,7> + 2129313792U, // <2,1,6,7>: Cost 2 ins , lane 0 + 2014937195U, // <2,1,6,u>: Cost 2 vtrnr <0,2,4,6>, LHS + 3171885057U, // <2,1,7,0>: Cost 3 ins <2,u,7,0>, lane 1 + 3203080192U, // <2,1,7,1>: Cost 3 ins , lane 0 + 3001439874U, // <2,1,7,2>: Cost 3 vzipr , <7,u,1,2> + 2129354752U, // <2,1,7,3>: Cost 2 ins , lane 0 + 3171917825U, // <2,1,7,4>: Cost 3 ins <2,u,7,4>, lane 1 + 3203112960U, // <2,1,7,5>: Cost 3 ins , lane 0 + 2222392248U, // <2,1,7,6>: Cost 3 vrev <1,2,6,7> + 3171942401U, // <2,1,7,7>: Cost 3 ins <2,u,7,7>, lane 1 + 2129354752U, // <2,1,7,u>: Cost 2 ins , lane 0 + 2128961536U, // <2,1,u,0>: Cost 2 ins , lane 0 + 1879924746U, // <2,1,u,1>: Cost 2 vzipr LHS, <0,0,1,1> + 1879926934U, // <2,1,u,2>: Cost 2 vzipr LHS, <3,0,1,2> + 1055244288U, // <2,1,u,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <2,1,u,4>: Cost 2 ins , lane 0 + 1879925074U, // <2,1,u,5>: Cost 2 vzipr LHS, <0,4,1,5> + 2129010688U, // <2,1,u,6>: Cost 2 ins , lane 0 + 2097905665U, // <2,1,u,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1055244288U, // <2,1,u,u>: Cost 1 ins LHS, lane 0 + 2020787094U, // <2,2,0,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 1691156582U, // <2,2,0,2>: Cost 2 vuzpl <2,2,2,2>, LHS + 2094260226U, // <2,2,0,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2819917256U, // <2,2,0,4>: Cost 3 vuzpr <0,2,0,2>, <2,0,2,4> + 3168018434U, // <2,2,0,5>: Cost 3 ins <2,2,u,5>, lane 2 + 2819915818U, // <2,2,0,6>: Cost 3 vuzpr <0,2,0,2>, <0,0,4,6> + 3171426305U, // <2,2,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 1879867492U, // <2,2,1,1>: Cost 2 vzipr <0,1,2,1>, <0,1,2,1> + 2094252034U, // <2,2,1,2>: Cost 2 ins <2,2,u,2>, lane 2 + 1746174054U, // <2,2,1,3>: Cost 2 vuzpr <0,2,0,2>, LHS + 3167526915U, // <2,2,1,4>: Cost 3 ins <2,2,1,u>, lane 3 + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 
3203342336U, // <2,2,1,6>: Cost 3 ins , lane 0 + 3168034818U, // <2,2,1,7>: Cost 3 ins <2,2,u,7>, lane 2 + 1746174059U, // <2,2,1,u>: Cost 2 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2093858819U, // <2,2,2,1>: Cost 2 ins <2,2,2,u>, lane 3 + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 1884520550U, // <2,2,2,3>: Cost 2 vzipr <0,u,2,2>, LHS + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2093858819U, // <2,2,2,5>: Cost 2 ins <2,2,2,u>, lane 3 + 2093858819U, // <2,2,2,6>: Cost 2 ins <2,2,2,u>, lane 3 + 2093858819U, // <2,2,2,7>: Cost 2 ins <2,2,2,u>, lane 3 + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2129698816U, // <2,2,3,0>: Cost 2 ins , lane 0 + 2093932547U, // <2,2,3,1>: Cost 2 ins <2,2,3,u>, lane 3 + 1879885416U, // <2,2,3,2>: Cost 2 vzipr LHS, <2,2,2,2> + 806142054U, // <2,2,3,3>: Cost 1 vzipr LHS, LHS + 2129731584U, // <2,2,3,4>: Cost 2 ins , lane 0 + 2093932547U, // <2,2,3,5>: Cost 2 ins <2,2,3,u>, lane 3 + 1884528988U, // <2,2,3,6>: Cost 2 vzipr LHS, <0,4,2,6> + 2097905665U, // <2,2,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 806142059U, // <2,2,3,u>: Cost 1 vzipr LHS, LHS + 2551644344U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, <0,2,2,4> + 3171672065U, // <2,2,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2094252034U, // <2,2,4,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,2,4,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2020819866U, // <2,2,4,4>: Cost 2 vtrnr <1,2,3,4>, <1,2,3,4> + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 1691159862U, // <2,2,4,6>: Cost 2 vuzpl <2,2,2,2>, RHS + 3171721217U, // <2,2,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3167821827U, // <2,2,5,0>: Cost 3 ins <2,2,5,u>, lane 3 + 2670497488U, // <2,2,5,1>: Cost 3 vext2 , <5,1,7,3> + 2094252034U, // <2,2,5,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,2,5,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 1879900264U, // <2,2,5,5>: Cost 2 vzipr <0,1,2,5>, <0,1,2,5> + 2670497890U, // <2,2,5,6>: Cost 3 vext2 , <5,6,7,0> + 1746177334U, // <2,2,5,7>: Cost 2 vuzpr <0,2,0,2>, RHS + 1746177335U, // <2,2,5,u>: Cost 2 vuzpr <0,2,0,2>, RHS + 3088679830U, // <2,2,6,0>: Cost 3 vtrnr <0,2,4,6>, <1,2,3,0> + 3171819521U, // <2,2,6,1>: Cost 3 ins <2,u,6,1>, lane 1 + 2094252034U, // <2,2,6,2>: Cost 2 ins <2,2,u,2>, lane 2 + 1881899110U, // <2,2,6,3>: Cost 2 vzipr <0,4,2,6>, LHS + 3088679078U, // <2,2,6,4>: Cost 3 vtrnr <0,2,4,6>, <0,2,0,4> + 3171852289U, // <2,2,6,5>: Cost 3 ins <2,u,6,5>, lane 1 + 2014937292U, // <2,2,6,6>: Cost 2 vtrnr <0,2,4,6>, <0,2,4,6> + 2094301189U, // <2,2,6,7>: Cost 2 ins <2,2,u,u>, lane 5 + 1881899115U, // <2,2,6,u>: Cost 2 vzipr <0,4,2,6>, LHS + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 2867696462U, // <2,2,7,1>: Cost 3 vuzpr , <6,7,0,1> + 2094252034U, // <2,2,7,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2130018304U, // <2,2,7,3>: Cost 2 ins , lane 0 + 2670499174U, // <2,2,7,4>: Cost 3 vext2 , <7,4,5,6> + 2228291208U, // <2,2,7,5>: Cost 3 vrev <2,2,5,7> + 3203784704U, // <2,2,7,6>: Cost 3 ins , lane 0 + 1879916650U, // <2,2,7,7>: Cost 2 vzipr <0,1,2,7>, <0,1,2,7> + 2130018304U, // <2,2,7,u>: Cost 2 ins , lane 0 + 2020787094U, // <2,2,u,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0> + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 806183014U, // <2,2,u,3>: Cost 1 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 1879925084U, // 
<2,2,u,6>: Cost 2 vzipr LHS, <0,4,2,6> + 1746177577U, // <2,2,u,7>: Cost 2 vuzpr <0,2,0,2>, RHS + 806183019U, // <2,2,u,u>: Cost 1 vzipr LHS, LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2094374915U, // <2,3,0,3>: Cost 2 ins <2,3,0,u>, lane 3 + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // <2,3,0,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2094374915U, // <2,3,0,6>: Cost 2 ins <2,3,0,u>, lane 3 + 2094374915U, // <2,3,0,7>: Cost 2 ins <2,3,0,u>, lane 3 + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2094956546U, // <2,3,1,7>: Cost 2 ins <2,3,u,7>, lane 2 + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2094522371U, // <2,3,2,0>: Cost 2 ins <2,3,2,u>, lane 3 + 2094907394U, // <2,3,2,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1059889156U, // <2,3,2,3>: Cost 1 ins LHS, lane 4 + 2094522371U, // <2,3,2,4>: Cost 2 ins <2,3,2,u>, lane 3 + 2094940162U, // <2,3,2,5>: Cost 2 ins <2,3,u,5>, lane 2 + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2094956546U, // <2,3,2,7>: Cost 2 ins <2,3,u,7>, lane 2 + 1059889156U, // <2,3,2,u>: Cost 1 ins LHS, lane 4 + 1879884694U, // <2,3,3,0>: Cost 2 vzipr LHS, <1,2,3,0> + 2094907394U, // <2,3,3,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1879884534U, // <2,3,3,2>: Cost 2 vzipr LHS, <1,0,3,2> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1879884698U, // <2,3,3,4>: Cost 2 vzipr LHS, <1,2,3,4> + 2094940162U, // <2,3,3,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2953627415U, // <2,3,3,6>: Cost 3 vzipr LHS, <2,4,3,6> + 1884529808U, // <2,3,3,7>: Cost 2 vzipr LHS, <1,5,3,7> + 1879884702U, // <2,3,3,u>: Cost 2 vzipr LHS, <1,2,3,u> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2094669827U, // <2,3,4,2>: Cost 2 ins <2,3,4,u>, lane 3 + 2094669827U, // <2,3,4,3>: Cost 2 ins <2,3,4,u>, lane 3 + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1691241782U, // <2,3,4,6>: Cost 2 vuzpl <2,2,3,3>, RHS + 2094669827U, // <2,3,4,7>: Cost 2 ins <2,3,4,u>, lane 3 + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726274U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, <0,2,3,5> + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860843U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,1,3> + 2094923778U, // <2,3,5,3>: Cost 2 ins <2,3,u,3>, lane 2 + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1758350646U, // <2,3,5,7>: Cost 2 vuzpr <2,2,3,3>, RHS + 1758350647U, // <2,3,5,u>: Cost 2 vuzpr <2,2,3,3>, RHS + 2094817283U, // <2,3,6,0>: Cost 2 ins <2,3,6,u>, lane 3 + 2094907394U, // <2,3,6,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2094923778U, // <2,3,6,3>: Cost 2 ins <2,3,u,3>, lane 2 + 2094817283U, // <2,3,6,4>: Cost 2 ins <2,3,6,u>, lane 3 + 2094940162U, // <2,3,6,5>: Cost 2 ins <2,3,u,5>, 
lane 2 + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // <2,3,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <2,3,6,u>: Cost 1 ins RHS, lane 4 + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2094907394U, // <2,3,7,1>: Cost 2 ins <2,3,u,1>, lane 2 + 2974892790U, // <2,3,7,2>: Cost 3 vzipr <3,6,2,7>, <1,0,3,2> + 2133999620U, // <2,3,7,3>: Cost 2 ins , lane 4 + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2094940162U, // <2,3,7,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2134024196U, // <2,3,7,6>: Cost 2 ins , lane 4 + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1879925654U, // <2,3,u,0>: Cost 2 vzipr LHS, <1,2,3,0> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1879925494U, // <2,3,u,2>: Cost 2 vzipr LHS, <1,0,3,2> + 1059889156U, // <2,3,u,3>: Cost 1 ins LHS, lane 4 + 1879925658U, // <2,3,u,4>: Cost 2 vzipr LHS, <1,2,3,4> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, + 1060216836U, // <2,3,u,7>: Cost 1 ins RHS, lane 4 + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2826125312U, // <2,4,0,0>: Cost 3 vuzpr <1,2,3,4>, <0,0,0,0> + 2097635329U, // <2,4,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 1691992166U, // <2,4,0,2>: Cost 2 vuzpl <2,3,4,5>, LHS + 3171393537U, // <2,4,0,3>: Cost 3 ins <2,u,0,3>, lane 1 + 2765734092U, // <2,4,0,4>: Cost 3 vuzpl <2,3,4,5>, <0,2,4,6> + 3094528338U, // <2,4,0,5>: Cost 3 vtrnr <1,2,3,0>, <0,4,1,5> + 1960103222U, // <2,4,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS + 3171426305U, // <2,4,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1960103240U, // <2,4,0,u>: Cost 2 vtrnl <2,3,0,1>, RHS + 3204620288U, // <2,4,1,0>: Cost 3 ins , lane 0 + 2826126132U, // <2,4,1,1>: Cost 3 vuzpr <1,2,3,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 1752383590U, // <2,4,1,3>: Cost 2 vuzpr <1,2,3,4>, LHS + 3204653056U, // <2,4,1,4>: Cost 3 ins , lane 0 + 2130919424U, // <2,4,1,5>: Cost 2 ins , lane 0 + 3031936310U, // <2,4,1,6>: Cost 3 vtrnl <2,0,1,2>, RHS + 3169361922U, // <2,4,1,7>: Cost 3 ins <2,4,u,7>, lane 2 + 1752383595U, // <2,4,1,u>: Cost 2 vuzpr <1,2,3,4>, LHS + 2826126230U, // <2,4,2,0>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,0> + 3171524609U, // <2,4,2,1>: Cost 3 ins <2,u,2,1>, lane 1 + 2097790977U, // <2,4,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2130976768U, // <2,4,2,3>: Cost 2 ins , lane 0 + 1752384410U, // <2,4,2,4>: Cost 2 vuzpr <1,2,3,4>, <1,2,3,4> + 1825377590U, // <2,4,2,5>: Cost 2 vzipl <2,2,2,2>, RHS + 1959595318U, // <2,4,2,6>: Cost 2 vtrnl <2,2,2,2>, RHS + 3171573761U, // <2,4,2,7>: Cost 3 ins <2,u,2,7>, lane 1 + 1825377833U, // <2,4,2,u>: Cost 2 vzipl <2,2,2,2>, RHS + 2826127049U, // <2,4,3,0>: Cost 3 vuzpr <1,2,3,4>, <2,3,4,0> + 2958270501U, // <2,4,3,1>: Cost 3 vzipr LHS, <0,0,4,1> + 2958270502U, // <2,4,3,2>: Cost 3 vzipr LHS, <0,0,4,2> + 2097872897U, // <2,4,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 1927662800U, // <2,4,3,4>: Cost 2 vzipr LHS, <4,4,4,4> + 1879885518U, // <2,4,3,5>: Cost 2 vzipr LHS, <2,3,4,5> + 1879883980U, // <2,4,3,6>: Cost 2 vzipr LHS, <0,2,4,6> + 2097905665U, // <2,4,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883982U, // <2,4,3,u>: Cost 2 vzipr LHS, <0,2,4,u> + 2563735654U, // <2,4,4,0>: Cost 3 vext1 <2,2,4,4>, LHS + 2826127824U, // <2,4,4,1>: Cost 3 vuzpr <1,2,3,4>, <3,4,0,1> + 2826127834U, // <2,4,4,2>: Cost 3 vuzpr <1,2,3,4>, <3,4,1,2> + 2826127106U, // <2,4,4,3>: Cost 3 vuzpr <1,2,3,4>, <2,4,1,3> + 2131132416U, // <2,4,4,4>: Cost 2 ins , lane 0 + 2097963009U, // <2,4,4,5>: Cost 2 
ins <2,u,4,5>, lane 1 + 1691995446U, // <2,4,4,6>: Cost 2 vuzpl <2,3,4,5>, RHS + 3094562602U, // <2,4,4,7>: Cost 3 vtrnr <1,2,3,4>, <2,4,5,7> + 1691995464U, // <2,4,4,u>: Cost 2 vuzpl <2,3,4,5>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2765737726U, // <2,4,5,3>: Cost 3 vuzpl <2,3,4,5>, <5,2,3,4> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 2131214336U, // <2,4,5,5>: Cost 2 ins , lane 0 + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1752386870U, // <2,4,5,7>: Cost 2 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066380U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, <0,2,4,6> + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 1828146486U, // <2,4,6,5>: Cost 2 vzipl <2,6,3,7>, RHS + 2131296256U, // <2,4,6,6>: Cost 2 ins , lane 0 + 2131304448U, // <2,4,6,7>: Cost 2 ins , lane 0 + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 2867934030U, // <2,4,7,1>: Cost 3 vuzpr , <6,7,0,1> + 3169320962U, // <2,4,7,2>: Cost 3 ins <2,4,u,2>, lane 2 + 2867933312U, // <2,4,7,3>: Cost 3 vuzpr , <5,7,1,3> + 3205095424U, // <2,4,7,4>: Cost 3 ins , lane 0 + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2131369984U, // <2,4,7,6>: Cost 2 ins , lane 0 + 2867933352U, // <2,4,7,7>: Cost 3 vuzpr , <5,7,5,7> + 2131369984U, // <2,4,7,u>: Cost 2 ins , lane 0 + 1478082766U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, <0,2,4,u> + 2097635329U, // <2,4,u,1>: Cost 2 ins <2,u,0,1>, lane 1 + 1691997998U, // <2,4,u,2>: Cost 2 vuzpl <2,3,4,5>, LHS + 1752384157U, // <2,4,u,3>: Cost 2 vuzpr <1,2,3,4>, LHS + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 1879926478U, // <2,4,u,5>: Cost 2 vzipr LHS, <2,3,4,5> + 1879924940U, // <2,4,u,6>: Cost 2 vzipr LHS, <0,2,4,6> + 1752387113U, // <2,4,u,7>: Cost 2 vuzpr <1,2,3,4>, RHS + 1879924942U, // <2,4,u,u>: Cost 2 vzipr LHS, <0,2,4,u> + 2765160612U, // <2,5,0,0>: Cost 3 vuzpl <2,2,5,7>, <0,2,0,2> + 2097635329U, // <2,5,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 3136335876U, // <2,5,0,5>: Cost 3 vtrnr , <5,5,5,5> + 3171418113U, // <2,5,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 2020789558U, // <2,5,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS + 2020789559U, // <2,5,0,u>: Cost 2 vtrnr <1,2,3,0>, RHS + 2599616614U, // <2,5,1,0>: Cost 3 vext1 , LHS + 3205292032U, // <2,5,1,1>: Cost 3 ins , lane 0 + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 2599620736U, // <2,5,1,5>: Cost 3 vext1 , <5,7,1,3> + 3205332992U, // <2,5,1,6>: Cost 3 ins , lane 0 + 2131599360U, // <2,5,1,7>: Cost 2 ins , lane 0 + 2131599360U, // <2,5,1,u>: Cost 2 ins , lane 0 + 3171516417U, // <2,5,2,0>: Cost 3 ins <2,u,2,0>, lane 1 + 3006040978U, // <2,5,2,1>: Cost 3 vzipr , <4,0,5,1> + 2097790977U, // <2,5,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2131640320U, // <2,5,2,3>: Cost 2 ins , lane 0 + 2632034061U, // <2,5,2,4>: Cost 3 vext2 
<2,4,2,5>, <2,4,2,5> + 2820014256U, // <2,5,2,5>: Cost 3 vuzpr <0,2,1,5>, <0,2,1,5> + 2958264834U, // <2,5,2,6>: Cost 3 vzipr <0,u,2,2>, <3,4,5,6> + 2014612790U, // <2,5,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS + 2014612791U, // <2,5,2,u>: Cost 2 vtrnr <0,2,0,2>, RHS + 2958273506U, // <2,5,3,0>: Cost 3 vzipr LHS, <4,1,5,0> + 1927662482U, // <2,5,3,1>: Cost 2 vzipr LHS, <4,0,5,1> + 2899955454U, // <2,5,3,2>: Cost 3 vzipl <2,3,4,5>, <5,2,3,4> + 2097872897U, // <2,5,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 1927662810U, // <2,5,3,5>: Cost 2 vzipr LHS, <4,4,5,5> + 1879886338U, // <2,5,3,6>: Cost 2 vzipr LHS, <3,4,5,6> + 1879884800U, // <2,5,3,7>: Cost 2 vzipr LHS, <1,3,5,7> + 1879884801U, // <2,5,3,u>: Cost 2 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3171672065U, // <2,5,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 3034173182U, // <2,5,4,3>: Cost 3 vtrnl <2,3,4,5>, <5,2,3,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2097963009U, // <2,5,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 2820164098U, // <2,5,4,6>: Cost 3 vuzpr <0,2,3,5>, <3,4,5,6> + 2020822326U, // <2,5,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS + 2020822327U, // <2,5,4,u>: Cost 2 vtrnr <1,2,3,4>, RHS + 2599649382U, // <2,5,5,0>: Cost 3 vext1 , LHS + 3003411346U, // <2,5,5,1>: Cost 3 vzipr , <4,0,5,1> + 2563819142U, // <2,5,5,2>: Cost 3 vext1 <2,2,5,5>, <2,2,5,5> + 2953642113U, // <2,5,5,3>: Cost 3 vzipr <0,1,2,5>, <0,1,5,3> + 2599652662U, // <2,5,5,4>: Cost 3 vext1 , RHS + 2131877888U, // <2,5,5,5>: Cost 2 ins , lane 0 + 2954971650U, // <2,5,5,6>: Cost 3 vzipr <0,3,2,5>, <3,4,5,6> + 2131894272U, // <2,5,5,7>: Cost 2 ins , lane 0 + 2131877888U, // <2,5,5,u>: Cost 2 ins , lane 0 + 2131910656U, // <2,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <2,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <2,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <2,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <2,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <2,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <2,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <2,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <2,5,6,u>: Cost 1 ins RHS, lane 0 + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2712244352U, // <2,5,7,1>: Cost 3 vext3 <4,6,0,2>, <5,7,1,3> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 2953658497U, // <2,5,7,3>: Cost 3 vzipr <0,1,2,7>, <0,1,5,3> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712244392U, // <2,5,7,5>: Cost 3 vext3 <4,6,0,2>, <5,7,5,7> + 2712244396U, // <2,5,7,6>: Cost 3 vext3 <4,6,0,2>, <5,7,6,2> + 2132041728U, // <2,5,7,7>: Cost 2 ins , lane 0 + 2132041728U, // <2,5,7,u>: Cost 2 ins , lane 0 + 2131910656U, // <2,5,u,0>: Cost 2 ins , lane 0 + 1927703442U, // <2,5,u,1>: Cost 2 vzipr LHS, <4,0,5,1> + 2097790977U, // <2,5,u,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2097872897U, // <2,5,u,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2131943424U, // <2,5,u,4>: Cost 2 ins , lane 0 + 1927703770U, // <2,5,u,5>: Cost 2 vzipr LHS, <4,4,5,5> + 1879927298U, // <2,5,u,6>: Cost 2 vzipr LHS, <3,4,5,6> + 1058226176U, // <2,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <2,5,u,u>: Cost 1 ins RHS, lane 0 + 2820243456U, // <2,6,0,0>: Cost 3 vuzpr <0,2,4,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2132148224U, // <2,6,0,2>: Cost 2 ins , lane 0 + 3171393537U, // <2,6,0,3>: Cost 3 ins <2,u,0,3>, lane 1 + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, 
<0,4,2,6> + 3170672642U, // <2,6,0,5>: Cost 3 ins <2,6,u,5>, lane 2 + 3136335220U, // <2,6,0,6>: Cost 3 vtrnr , <4,6,4,6> + 2096947202U, // <2,6,0,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2820244276U, // <2,6,1,1>: Cost 3 vuzpr <0,2,4,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 1746501734U, // <2,6,1,3>: Cost 2 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3205996544U, // <2,6,1,6>: Cost 3 ins , lane 0 + 2096947202U, // <2,6,1,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1746501739U, // <2,6,1,u>: Cost 2 vuzpr <0,2,4,6>, LHS + 2820244374U, // <2,6,2,0>: Cost 3 vuzpr <0,2,4,6>, <1,2,3,0> + 3171524609U, // <2,6,2,1>: Cost 3 ins <2,u,2,1>, lane 1 + 2097790977U, // <2,6,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2096955397U, // <2,6,2,3>: Cost 2 ins <2,6,u,u>, lane 5 + 2820243622U, // <2,6,2,4>: Cost 3 vuzpr <0,2,4,6>, <0,2,0,4> + 3171557377U, // <2,6,2,5>: Cost 3 ins <2,u,2,5>, lane 1 + 1746501836U, // <2,6,2,6>: Cost 2 vuzpr <0,2,4,6>, <0,2,4,6> + 1884523830U, // <2,6,2,7>: Cost 2 vzipr <0,u,2,2>, RHS + 1884523831U, // <2,6,2,u>: Cost 2 vzipr <0,u,2,2>, RHS + 2096586755U, // <2,6,3,0>: Cost 2 ins <2,6,3,u>, lane 3 + 2096586755U, // <2,6,3,1>: Cost 2 ins <2,6,3,u>, lane 3 + 1927662492U, // <2,6,3,2>: Cost 2 vzipr LHS, <4,0,6,2> + 2097872897U, // <2,6,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2096586755U, // <2,6,3,4>: Cost 2 ins <2,6,3,u>, lane 3 + 2096586755U, // <2,6,3,5>: Cost 2 ins <2,6,3,u>, lane 3 + 1927662820U, // <2,6,3,6>: Cost 2 vzipr LHS, <4,4,6,6> + 806145334U, // <2,6,3,7>: Cost 1 vzipr LHS, RHS + 806145335U, // <2,6,3,u>: Cost 1 vzipr LHS, RHS + 2820245292U, // <2,6,4,0>: Cost 3 vuzpr <0,2,4,6>, <2,4,6,0> + 3171672065U, // <2,6,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2820243782U, // <2,6,4,2>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,2> + 3171688449U, // <2,6,4,3>: Cost 3 ins <2,u,4,3>, lane 1 + 2820243784U, // <2,6,4,4>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2132475904U, // <2,6,4,6>: Cost 2 ins , lane 0 + 2096947202U, // <2,6,4,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3170476035U, // <2,6,5,0>: Cost 3 ins <2,6,5,u>, lane 3 + 2667876048U, // <2,6,5,1>: Cost 3 vext2 , <5,1,7,3> + 3206258688U, // <2,6,5,2>: Cost 3 ins , lane 0 + 3170656258U, // <2,6,5,3>: Cost 3 ins <2,6,u,3>, lane 2 + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2868023300U, // <2,6,5,5>: Cost 3 vuzpr , <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 , <5,6,7,0> + 1746505014U, // <2,6,5,7>: Cost 2 vuzpr <0,2,4,6>, RHS + 1746505015U, // <2,6,5,u>: Cost 2 vuzpr <0,2,4,6>, RHS + 2955643964U, // <2,6,6,0>: Cost 3 vzipr <0,4,2,6>, <4,2,6,0> + 2820246859U, // <2,6,6,1>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,1> + 2820246860U, // <2,6,6,2>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,2> + 2820245412U, // <2,6,6,3>: Cost 3 vuzpr <0,2,4,6>, <2,6,1,3> + 2955643968U, // <2,6,6,4>: Cost 3 vzipr <0,4,2,6>, <4,2,6,4> + 2820246899U, // <2,6,6,5>: Cost 3 vuzpr <0,2,4,6>, <4,6,4,5> + 2132623360U, // <2,6,6,6>: Cost 2 ins , lane 0 + 1881902390U, // <2,6,6,7>: Cost 2 vzipr <0,4,2,6>, RHS + 1881902391U, // <2,6,6,u>: Cost 2 vzipr <0,4,2,6>, RHS + 2132647936U, // <2,6,7,0>: Cost 2 ins , lane 0 + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 3124596044U, // <2,6,7,2>: Cost 3 vtrnr <6,2,5,7>, <4,6,0,2> + 
2868023424U, // <2,6,7,3>: Cost 3 vuzpr , <5,7,1,3> + 2132680704U, // <2,6,7,4>: Cost 2 ins , lane 0 + 2252181996U, // <2,6,7,5>: Cost 3 vrev <6,2,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2132705280U, // <2,6,7,7>: Cost 2 ins , lane 0 + 2132647936U, // <2,6,7,u>: Cost 2 ins , lane 0 + 2096586755U, // <2,6,u,0>: Cost 2 ins <2,6,3,u>, lane 3 + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 1927703452U, // <2,6,u,2>: Cost 2 vzipr LHS, <4,0,6,2> + 1746502301U, // <2,6,u,3>: Cost 2 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 , + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 1927703780U, // <2,6,u,6>: Cost 2 vzipr LHS, <4,4,6,6> + 806186294U, // <2,6,u,7>: Cost 1 vzipr LHS, RHS + 806186295U, // <2,6,u,u>: Cost 1 vzipr LHS, RHS + 2581839974U, // <2,7,0,0>: Cost 3 vext1 <5,2,7,0>, LHS + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2581843254U, // <2,7,0,4>: Cost 3 vext1 <5,2,7,0>, RHS + 2581843742U, // <2,7,0,5>: Cost 3 vext1 <5,2,7,0>, <5,2,7,0> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 3136336040U, // <2,7,0,7>: Cost 3 vtrnr , <5,7,5,7> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3206619136U, // <2,7,1,1>: Cost 3 ins , lane 0 + 3206627328U, // <2,7,1,2>: Cost 3 ins , lane 0 + 2132893696U, // <2,7,1,3>: Cost 2 ins , lane 0 + 2599767350U, // <2,7,1,4>: Cost 3 vext1 , RHS + 3206651904U, // <2,7,1,5>: Cost 3 ins , lane 0 + 3171344386U, // <2,7,1,6>: Cost 3 ins <2,7,u,6>, lane 2 + 2599769082U, // <2,7,1,7>: Cost 3 vext1 , <7,0,1,2> + 2132893696U, // <2,7,1,u>: Cost 2 ins , lane 0 + 2581856358U, // <2,7,2,0>: Cost 3 vext1 <5,2,7,2>, LHS + 3136131918U, // <2,7,2,1>: Cost 3 vtrnr , <6,7,0,1> + 2097790977U, // <2,7,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2132967424U, // <2,7,2,3>: Cost 2 ins , lane 0 + 2581859638U, // <2,7,2,4>: Cost 3 vext1 <5,2,7,2>, RHS + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 1770548291U, // <2,7,2,7>: Cost 2 vuzpr <4,2,6,7>, <4,2,6,7> + 2097790977U, // <2,7,2,u>: Cost 2 ins <2,u,2,2>, lane 1 + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 1927663312U, // <2,7,3,3>: Cost 2 vzipr LHS, <5,1,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 1927663640U, // <2,7,3,7>: Cost 2 vzipr LHS, <5,5,7,7> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2581872742U, // <2,7,4,0>: Cost 3 vext1 <5,2,7,4>, LHS + 2581873562U, // <2,7,4,1>: Cost 3 vext1 <5,2,7,4>, <1,2,3,4> + 3171680257U, // <2,7,4,2>: Cost 3 ins <2,u,4,2>, lane 1 + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 2581876022U, // <2,7,4,4>: Cost 3 vext1 <5,2,7,4>, RHS + 2133131264U, // <2,7,4,5>: Cost 2 ins , lane 0 + 2712245609U, // <2,7,4,6>: Cost 3 vext3 <4,6,0,2>, <7,4,6,0> + 3136368808U, // <2,7,4,7>: Cost 3 vtrnr , <5,7,5,7> + 2133131264U, // <2,7,4,u>: Cost 2 ins , lane 0 + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3206914048U, // <2,7,5,1>: Cost 3 ins , lane 0 + 2844290353U, // <2,7,5,2>: Cost 3 vuzpr <4,2,6,7>, <4,5,6,2> + 2991469050U, // 
<2,7,5,3>: Cost 3 vzipr <6,4,2,5>, <6,2,7,3> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 , RHS + 3206946816U, // <2,7,5,5>: Cost 3 ins , lane 0 + 3206955008U, // <2,7,5,6>: Cost 3 ins , lane 0 + 2133221376U, // <2,7,5,7>: Cost 2 ins , lane 0 + 2133221376U, // <2,7,5,u>: Cost 2 ins , lane 0 + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3136459598U, // <2,7,6,1>: Cost 3 vtrnr , <6,7,0,1> + 2901890250U, // <2,7,6,2>: Cost 3 vzipl <2,6,3,7>, <7,2,6,3> + 3136458880U, // <2,7,6,3>: Cost 3 vtrnr , <5,7,1,3> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 2133295104U, // <2,7,6,7>: Cost 2 ins , lane 0 + 2133295104U, // <2,7,6,u>: Cost 2 ins , lane 0 + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3207061504U, // <2,7,7,1>: Cost 3 ins , lane 0 + 2563983002U, // <2,7,7,2>: Cost 3 vext1 <2,2,7,7>, <2,2,7,7> + 2998784506U, // <2,7,7,3>: Cost 3 vzipr <7,6,2,7>, <6,2,7,3> + 2599816502U, // <2,7,7,4>: Cost 3 vext1 , RHS + 3207094272U, // <2,7,7,5>: Cost 3 ins , lane 0 + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2133368832U, // <2,7,7,7>: Cost 2 ins , lane 0 + 2133368832U, // <2,7,7,u>: Cost 2 ins , lane 0 + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2097790977U, // <2,7,u,2>: Cost 2 ins <2,u,2,2>, lane 1 + 1927704272U, // <2,7,u,3>: Cost 2 vzipr LHS, <5,1,7,3> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2133131264U, // <2,7,u,5>: Cost 2 ins , lane 0 + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 1927704600U, // <2,7,u,7>: Cost 2 vzipr LHS, <5,5,7,7> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2020786845U, // <2,u,0,3>: Cost 2 vtrnr <1,2,3,0>, LHS + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // <2,u,0,5>: Cost 2 ins <2,3,u,5>, lane 2 + 1960106138U, // <2,u,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS + 2020789801U, // <2,u,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2096947202U, // <2,u,1,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328556U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, <0,2,u,2> + 1825380142U, // <2,u,2,1>: Cost 2 vzipl <2,2,2,2>, LHS + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1055244288U, // <2,u,2,3>: Cost 1 ins LHS, lane 0 + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 1825380506U, // <2,u,2,5>: Cost 2 vzipl <2,2,2,2>, RHS + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2014613033U, // <2,u,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS + 1055244288U, // <2,u,2,u>: Cost 1 ins LHS, lane 0 + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1879885550U, // <2,u,3,1>: Cost 2 vzipr LHS, <2,3,u,1> + 1879884012U, // <2,u,3,2>: Cost 2 vzipr LHS, <0,2,u,2> + 806142108U, // <2,u,3,3>: Cost 1 vzipr LHS, LHS + 
1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 1879885554U, // <2,u,3,5>: Cost 2 vzipr LHS, <2,3,u,5> + 1879884016U, // <2,u,3,6>: Cost 2 vzipr LHS, <0,2,u,6> + 806145352U, // <2,u,3,7>: Cost 1 vzipr LHS, RHS + 806142113U, // <2,u,3,u>: Cost 1 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 1960433454U, // <2,u,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS + 2020819613U, // <2,u,4,3>: Cost 2 vtrnr <1,2,3,4>, LHS + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1691610422U, // <2,u,4,6>: Cost 2 vuzpl <2,2,u,3>, RHS + 2020822569U, // <2,u,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2094252034U, // <2,u,5,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,u,5,3>: Cost 2 ins <2,2,u,3>, lane 2 + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1746226486U, // <2,u,5,7>: Cost 2 vuzpr <0,2,0,u>, RHS + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361328U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, <0,2,u,6> + 1828149038U, // <2,u,6,1>: Cost 2 vzipl <2,6,3,7>, LHS + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2014937757U, // <2,u,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 1828149402U, // <2,u,6,5>: Cost 2 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // <2,u,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <2,u,6,u>: Cost 1 ins RHS, lane 4 + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2094907394U, // <2,u,7,1>: Cost 2 ins <2,3,u,1>, lane 2 + 2094252034U, // <2,u,7,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2129354752U, // <2,u,7,3>: Cost 2 ins , lane 0 + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2094940162U, // <2,u,7,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2134024196U, // <2,u,7,6>: Cost 2 ins , lane 4 + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1879925699U, // <2,u,u,0>: Cost 2 vzipr LHS, <1,2,u,0> + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 806183068U, // <2,u,u,3>: Cost 1 vzipr LHS, LHS + 1879925703U, // <2,u,u,4>: Cost 2 vzipr LHS, <1,2,u,4> + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1879924976U, // <2,u,u,6>: Cost 2 vzipr LHS, <0,2,u,6> + 806186312U, // <2,u,u,7>: Cost 1 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2960312624U, // <3,0,0,3>: Cost 3 vzipr <1,2,3,0>, <3,2,0,3> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3177381889U, // <3,0,0,5>: Cost 3 ins <3,u,0,5>, lane 1 + 3177390081U, // <3,0,0,6>: Cost 3 ins <3,u,0,6>, lane 1 + 3177398273U, // <3,0,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2128232448U, // <3,0,1,1>: Cost 2 ins , lane 0 + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2098429955U, // <3,0,1,3>: Cost 2 ins <3,0,1,u>, lane 3 + 1490341174U, // <3,0,1,4>: Cost 2 vext1 
<2,3,0,1>, RHS + 2098429955U, // <3,0,1,5>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,1,6>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,1,7>: Cost 2 ins <3,0,1,u>, lane 3 + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2128314368U, // <3,0,2,2>: Cost 2 ins , lane 0 + 2098946053U, // <3,0,2,3>: Cost 2 ins <3,0,u,u>, lane 5 + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2959000610U, // <3,0,2,5>: Cost 3 vzipr <1,0,3,2>, <1,4,0,5> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 3177545729U, // <3,0,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2820636924U, // <3,0,3,0>: Cost 3 vuzpr <0,3,1,0>, <0,3,1,0> + 1832091750U, // <3,0,3,1>: Cost 2 vzipl <3,3,3,3>, LHS + 1966309478U, // <3,0,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS + 2103844865U, // <3,0,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 2772716034U, // <3,0,3,5>: Cost 3 vuzpl <3,5,0,2>, <3,4,5,6> + 3177611265U, // <3,0,3,6>: Cost 3 ins <3,u,3,6>, lane 1 + 3177619457U, // <3,0,3,7>: Cost 3 ins <3,u,3,7>, lane 1 + 1832092317U, // <3,0,3,u>: Cost 2 vzipl <3,3,3,3>, LHS + 2689835334U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,2> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 2906669312U, // <3,0,4,3>: Cost 3 vzipl <3,4,5,6>, <0,3,1,4> + 2689835373U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,5> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2769382710U, // <3,0,4,6>: Cost 3 vuzpl <3,0,0,0>, RHS + 3177693185U, // <3,0,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 3101278208U, // <3,0,5,0>: Cost 3 vtrnr <2,3,4,5>, <0,0,0,0> + 2128527360U, // <3,0,5,1>: Cost 2 ins , lane 0 + 1967145062U, // <3,0,5,2>: Cost 2 vtrnl <3,4,5,6>, LHS + 3040886978U, // <3,0,5,3>: Cost 3 vtrnl <3,4,5,6>, <0,2,3,5> + 3040886988U, // <3,0,5,4>: Cost 3 vtrnl <3,4,5,6>, <0,2,4,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> + 2104016897U, // <3,0,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2820640054U, // <3,0,5,7>: Cost 3 vuzpr <0,3,1,0>, RHS + 1967145116U, // <3,0,5,u>: Cost 2 vtrnl <3,4,5,6>, LHS + 3202334720U, // <3,0,6,0>: Cost 3 ins , lane 0 + 2907635814U, // <3,0,6,1>: Cost 3 vzipl <3,6,0,7>, LHS + 2128609280U, // <3,0,6,2>: Cost 2 ins , lane 0 + 3177807873U, // <3,0,6,3>: Cost 3 ins <3,u,6,3>, lane 1 + 3202367488U, // <3,0,6,4>: Cost 3 ins , lane 0 + 3172663298U, // <3,0,6,5>: Cost 3 ins <3,0,u,5>, lane 2 + 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> + 2098946053U, // <3,0,6,7>: Cost 2 ins <3,0,u,u>, lane 5 + 2128609280U, // <3,0,6,u>: Cost 2 ins , lane 0 + 3095396352U, // <3,0,7,0>: Cost 3 vtrnr <1,3,5,7>, <0,0,0,0> + 3095396362U, // <3,0,7,1>: Cost 3 vtrnr <1,3,5,7>, <0,0,1,1> + 2098896898U, // <3,0,7,2>: Cost 2 ins <3,0,u,2>, lane 2 + 3177881601U, // <3,0,7,3>: Cost 3 ins <3,u,7,3>, lane 1 + 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> + 3177897985U, // <3,0,7,5>: Cost 3 ins <3,u,7,5>, lane 1 + 3202457600U, // <3,0,7,6>: Cost 3 ins , lane 0 + 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> + 2098896898U, // <3,0,7,u>: Cost 2 ins <3,0,u,2>, lane 2 + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2098429955U, // <3,0,u,3>: Cost 2 ins <3,0,1,u>, lane 3 + 
1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2098429955U, // <3,0,u,6>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,u,7>: Cost 2 ins <3,0,1,u>, lane 3 + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201468U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, <0,3,1,0> + 2128822272U, // <3,1,0,1>: Cost 2 ins , lane 0 + 1695727718U, // <3,1,0,2>: Cost 2 vuzpl <3,0,1,2>, LHS + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2960310610U, // <3,1,0,5>: Cost 3 vzipr <1,2,3,0>, <0,4,1,5> + 2832516572U, // <3,1,0,6>: Cost 3 vuzpr <2,3,0,1>, <2,0,4,6> + 3177398273U, // <3,1,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2103689217U, // <3,1,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3177463809U, // <3,1,1,6>: Cost 3 ins <3,u,1,6>, lane 1 + 3100952848U, // <3,1,1,7>: Cost 3 vtrnr <2,3,0,1>, <3,1,5,7> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2128961536U, // <3,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <3,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <3,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <3,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <3,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <3,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <3,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <3,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <3,1,2,u>: Cost 1 ins LHS, lane 0 + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2021326950U, // <3,1,3,3>: Cost 2 vtrnr <1,3,1,3>, LHS + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2832516096U, // <3,1,3,7>: Cost 3 vuzpr <2,3,0,1>, <1,3,5,7> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234240U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, <0,3,1,4> + 2960343050U, // <3,1,4,1>: Cost 3 vzipr <1,2,3,4>, <0,0,1,1> + 2960345238U, // <3,1,4,2>: Cost 3 vzipr <1,2,3,4>, <3,0,1,2> + 2129133568U, // <3,1,4,3>: Cost 2 ins , lane 0 + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2129149952U, // <3,1,4,5>: Cost 2 ins , lane 0 + 1695730998U, // <3,1,4,6>: Cost 2 vuzpl <3,0,1,2>, RHS + 3177693185U, // <3,1,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1695731016U, // <3,1,4,u>: Cost 2 vuzpl <3,0,1,2>, RHS + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 2961678674U, // <3,1,5,5>: Cost 3 vzipr <1,4,3,5>, <0,4,1,5> + 2104016897U, // <3,1,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1758776630U, // <3,1,5,7>: Cost 2 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2907783926U, // <3,1,6,0>: Cost 3 vzipl <3,6,2,7>, <1,0,3,2> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2222752740U, // <3,1,6,2>: Cost 3 vrev <1,3,2,6> + 2129281024U, // <3,1,6,3>: Cost 2 ins , lane 
0 + 2222900214U, // <3,1,6,4>: Cost 3 vrev <1,3,4,6> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2868350324U, // <3,1,6,6>: Cost 3 vuzpr , <4,6,4,6> + 2129313792U, // <3,1,6,7>: Cost 2 ins , lane 0 + 2129281024U, // <3,1,6,u>: Cost 2 ins , lane 0 + 3177857025U, // <3,1,7,0>: Cost 3 ins <3,u,7,0>, lane 1 + 3095397172U, // <3,1,7,1>: Cost 3 vtrnr <1,3,5,7>, <1,1,1,1> + 2962360470U, // <3,1,7,2>: Cost 3 vzipr <1,5,3,7>, <3,0,1,2> + 2021654630U, // <3,1,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS + 3177889793U, // <3,1,7,4>: Cost 3 ins <3,u,7,4>, lane 1 + 1149240320U, // <3,1,7,5>: Cost 2 vrev <1,3,5,7> + 2223055881U, // <3,1,7,6>: Cost 3 vrev <1,3,6,7> + 2868351144U, // <3,1,7,7>: Cost 3 vuzpr , <5,7,5,7> + 2021654635U, // <3,1,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 1695733550U, // <3,1,u,2>: Cost 2 vuzpl <3,0,1,2>, LHS + 1055244288U, // <3,1,u,3>: Cost 1 ins LHS, lane 0 + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 1695733914U, // <3,1,u,6>: Cost 2 vuzpl <3,0,1,2>, RHS + 1758776873U, // <3,1,u,7>: Cost 2 vuzpr <2,3,0,1>, RHS + 1055244288U, // <3,1,u,u>: Cost 1 ins LHS, lane 0 + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2129494016U, // <3,2,0,2>: Cost 2 ins , lane 0 + 1886568550U, // <3,2,0,3>: Cost 2 vzipr <1,2,3,0>, LHS + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2960311348U, // <3,2,0,5>: Cost 3 vzipr <1,2,3,0>, <1,4,2,5> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 3177398273U, // <3,2,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2103689217U, // <3,2,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3177472001U, // <3,2,1,7>: Cost 3 ins <3,u,1,7>, lane 1 + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836685U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,3> + 3177545729U, // <3,2,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 1611450042U, // <3,2,3,2>: Cost 2 vext3 LHS, <2,3,2,3> + 1885929574U, // <3,2,3,3>: Cost 2 vzipr <1,1,3,3>, LHS + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 1611450082U, // <3,2,3,6>: Cost 2 vext3 LHS, <2,3,6,7> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280674U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,3,2,4> + 2960343060U, // <3,2,4,2>: Cost 3 vzipr <1,2,3,4>, <0,0,2,2> + 1886601318U, 
// <3,2,4,3>: Cost 2 vzipr <1,2,3,4>, LHS + 2960344034U, // <3,2,4,4>: Cost 3 vzipr <1,2,3,4>, <1,3,2,4> + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2129821696U, // <3,2,4,6>: Cost 2 ins , lane 0 + 3177693185U, // <3,2,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316170U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, <0,3,2,5> + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> + 2104016897U, // <3,2,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2826554678U, // <3,2,5,7>: Cost 3 vuzpr <1,3,0,2>, RHS + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2129977344U, // <3,2,6,7>: Cost 2 ins , lane 0 + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 3095397270U, // <3,2,7,0>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,0> + 3203743744U, // <3,2,7,1>: Cost 3 ins , lane 0 + 3095396516U, // <3,2,7,2>: Cost 3 vtrnr <1,3,5,7>, <0,2,0,2> + 1888616550U, // <3,2,7,3>: Cost 2 vzipr <1,5,3,7>, LHS + 3095397274U, // <3,2,7,4>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,4> + 3095396528U, // <3,2,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,2,1,5> + 1155286754U, // <3,2,7,6>: Cost 2 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> + 1888616555U, // <3,2,7,u>: Cost 2 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2129494016U, // <3,2,u,2>: Cost 2 ins , lane 0 + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2129821696U, // <3,2,u,6>: Cost 2 ins , lane 0 + 2129977344U, // <3,2,u,7>: Cost 2 ins , lane 0 + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 1886569366U, // <3,3,0,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 1697874022U, // <3,3,0,2>: Cost 2 vuzpl <3,3,3,3>, LHS + 2100895746U, // <3,3,0,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 3041151490U, // <3,3,0,5>: Cost 3 vtrnl <3,5,0,2>, <3,4,5,6> + 3177390081U, // <3,3,0,6>: Cost 3 ins <3,u,0,6>, lane 1 + 2960311440U, // <3,3,0,7>: Cost 3 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2103689217U, // <3,3,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1752891494U, // <3,3,1,3>: Cost 2 vuzpr <1,3,1,3>, LHS + 2826635515U, // <3,3,1,4>: Cost 3 vuzpr <1,3,1,3>, <3,1,3,4> + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3177463809U, // <3,3,1,6>: Cost 3 ins <3,u,1,6>, lane 1 + 3100951552U, // <3,3,1,7>: Cost 3 vtrnr <2,3,0,1>, <1,3,5,7> + 1752891499U, // <3,3,1,u>: Cost 2 vuzpr <1,3,1,3>, LHS + 2959000470U, // <3,3,2,0>: Cost 3 vzipr <1,0,3,2>, <1,2,3,0> + 2959000471U, // <3,3,2,1>: Cost 3 vzipr <1,0,3,2>, <1,2,3,1> + 1885258486U, // 
<3,3,2,2>: Cost 2 vzipr <1,0,3,2>, <1,0,3,2> + 2130313216U, // <3,3,2,3>: Cost 2 ins , lane 0 + 2959000474U, // <3,3,2,4>: Cost 3 vzipr <1,0,3,2>, <1,2,3,4> + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2959000720U, // <3,3,2,7>: Cost 3 vzipr <1,0,3,2>, <1,5,3,7> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2100568067U, // <3,3,3,1>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,2>: Cost 2 ins <3,3,3,u>, lane 3 + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2100568067U, // <3,3,3,5>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,6>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,7>: Cost 2 ins <3,3,3,u>, lane 3 + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2960343958U, // <3,3,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,3,0> + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2960343798U, // <3,3,4,2>: Cost 3 vzipr <1,2,3,4>, <1,0,3,2> + 2100895746U, // <3,3,4,3>: Cost 2 ins <3,3,u,3>, lane 2 + 1886602138U, // <3,3,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 1697877302U, // <3,3,4,6>: Cost 2 vuzpl <3,3,3,3>, RHS + 2960344208U, // <3,3,4,7>: Cost 3 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2100895746U, // <3,3,5,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 2027538126U, // <3,3,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5> + 2104016897U, // <3,3,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1752894774U, // <3,3,5,7>: Cost 2 vuzpr <1,3,1,3>, RHS + 1752894775U, // <3,3,5,u>: Cost 2 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3204333568U, // <3,3,6,1>: Cost 3 ins , lane 0 + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2100895746U, // <3,3,6,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2234845608U, // <3,3,6,4>: Cost 3 vrev <3,3,4,6> + 3204366336U, // <3,3,6,5>: Cost 3 ins , lane 0 + 1967893085U, // <3,3,6,6>: Cost 2 vtrnl <3,5,6,7>, <3,5,6,7> + 2130640896U, // <3,3,6,7>: Cost 2 ins , lane 0 + 2100895746U, // <3,3,6,u>: Cost 2 ins <3,3,u,3>, lane 2 + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2962359030U, // <3,3,7,2>: Cost 3 vzipr <1,5,3,7>, <1,0,3,2> + 2100895746U, // <3,3,7,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 3095398094U, // <3,3,7,5>: Cost 3 vtrnr <1,3,5,7>, <2,3,4,5> + 3174662146U, // <3,3,7,6>: Cost 3 ins <3,3,u,6>, lane 2 + 2021655552U, // <3,3,7,7>: Cost 2 vtrnr <1,3,5,7>, <1,3,5,7> + 2021655552U, // <3,3,7,u>: Cost 2 vtrnr <1,3,5,7>, <1,3,5,7> + 1886569366U, // <3,3,u,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0> + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1697879854U, // <3,3,u,2>: Cost 2 vuzpl <3,3,3,3>, LHS + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 1697880218U, // <3,3,u,6>: Cost 2 vuzpl <3,3,3,3>, RHS + 1752895017U, // <3,3,u,7>: Cost 2 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, 
<0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3177365505U, // <3,4,0,3>: Cost 3 ins <3,u,0,3>, lane 1 + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1829948726U, // <3,4,0,5>: Cost 2 vzipl <3,0,1,2>, RHS + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3177398273U, // <3,4,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2820669542U, // <3,4,1,3>: Cost 3 vuzpr <0,3,1,4>, LHS + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2130919424U, // <3,4,1,5>: Cost 2 ins , lane 0 + 1964166454U, // <3,4,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS + 3177472001U, // <3,4,1,7>: Cost 3 ins <3,u,1,7>, lane 1 + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3204694016U, // <3,4,2,0>: Cost 3 ins , lane 0 + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2101600261U, // <3,4,2,3>: Cost 2 ins <3,4,u,u>, lane 5 + 2826716058U, // <3,4,2,4>: Cost 3 vuzpr <1,3,2,4>, <1,2,3,4> + 2959001294U, // <3,4,2,5>: Cost 3 vzipr <1,0,3,2>, <2,3,4,5> + 2131001344U, // <3,4,2,6>: Cost 2 ins , lane 0 + 3177545729U, // <3,4,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 2101600261U, // <3,4,2,u>: Cost 2 ins <3,4,u,u>, lane 5 + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2103844865U, // <3,4,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2820669696U, // <3,4,3,4>: Cost 3 vuzpr <0,3,1,4>, <0,3,1,4> + 1832095030U, // <3,4,3,5>: Cost 2 vzipl <3,3,3,3>, RHS + 1966312758U, // <3,4,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS + 3177619457U, // <3,4,3,7>: Cost 3 ins <3,u,3,7>, lane 1 + 1832095273U, // <3,4,3,u>: Cost 2 vzipl <3,3,3,3>, RHS + 2960344777U, // <3,4,4,0>: Cost 3 vzipr <1,2,3,4>, <2,3,4,0> + 2960344778U, // <3,4,4,1>: Cost 3 vzipr <1,2,3,4>, <2,3,4,1> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2960344618U, // <3,4,4,3>: Cost 3 vzipr <1,2,3,4>, <2,1,4,3> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3177693185U, // <3,4,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2101379075U, // <3,4,5,1>: Cost 2 ins <3,4,5,u>, lane 3 + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2101379075U, // <3,4,5,3>: Cost 2 ins <3,4,5,u>, lane 3 + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2131214336U, // <3,4,5,5>: Cost 2 ins , lane 0 + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2101379075U, // <3,4,5,7>: Cost 2 ins <3,4,5,u>, lane 3 + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 1659227468U, // <3,4,6,0>: Cost 2 vext3 LHS, <4,6,0,2> + 2689838422U, // <3,4,6,1>: Cost 3 vext3 LHS, <4,6,1,3> + 2564417231U, // <3,4,6,2>: Cost 3 vext1 <2,3,4,6>, <2,3,4,6> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2131296256U, // <3,4,6,6>: Cost 2 ins , lane 0 + 2101600261U, // <3,4,6,7>: Cost 2 ins 
<3,4,u,u>, lane 5 + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> + 2659972191U, // <3,4,7,1>: Cost 3 vext2 <7,1,3,4>, <7,1,3,4> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3177881601U, // <3,4,7,3>: Cost 3 ins <3,u,7,3>, lane 1 + 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> + 3095396690U, // <3,4,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,4,1,5> + 2131369984U, // <3,4,7,6>: Cost 2 ins , lane 0 + 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> + 2131369984U, // <3,4,7,u>: Cost 2 ins , lane 0 + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2101600261U, // <3,4,u,3>: Cost 2 ins <3,4,u,u>, lane 5 + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2101379075U, // <3,4,u,7>: Cost 2 ins <3,4,5,u>, lane 3 + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 2832842752U, // <3,5,0,0>: Cost 3 vuzpr <2,3,4,5>, <0,0,0,0> + 2131476480U, // <3,5,0,1>: Cost 2 ins , lane 0 + 1698709606U, // <3,5,0,2>: Cost 2 vuzpl <3,4,5,6>, LHS + 2772451522U, // <3,5,0,3>: Cost 3 vuzpl <3,4,5,6>, <0,2,3,5> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2960310647U, // <3,5,0,6>: Cost 3 vzipr <1,2,3,0>, <0,4,5,6> + 2131525632U, // <3,5,0,7>: Cost 2 ins , lane 0 + 1698709660U, // <3,5,0,u>: Cost 2 vuzpl <3,4,5,6>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 2832843572U, // <3,5,1,1>: Cost 3 vuzpr <2,3,4,5>, <1,1,1,1> + 2103689217U, // <3,5,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1759101030U, // <3,5,1,3>: Cost 2 vuzpr <2,3,4,5>, LHS + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2772452352U, // <3,5,1,5>: Cost 3 vuzpl <3,4,5,6>, <1,3,5,7> + 3205332992U, // <3,5,1,6>: Cost 3 ins , lane 0 + 2027212086U, // <3,5,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS + 2027212087U, // <3,5,1,u>: Cost 2 vtrnr <2,3,0,1>, RHS + 2832843670U, // <3,5,2,0>: Cost 3 vuzpr <2,3,4,5>, <1,2,3,0> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 2832842916U, // <3,5,2,2>: Cost 3 vuzpr <2,3,4,5>, <0,2,0,2> + 2131640320U, // <3,5,2,3>: Cost 2 ins , lane 0 + 2832842936U, // <3,5,2,4>: Cost 3 vuzpr <2,3,4,5>, <0,2,2,4> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 2959002114U, // <3,5,2,6>: Cost 3 vzipr <1,0,3,2>, <3,4,5,6> + 2131673088U, // <3,5,2,7>: Cost 2 ins , lane 0 + 2131640320U, // <3,5,2,u>: Cost 2 ins , lane 0 + 2772453922U, // <3,5,3,0>: Cost 3 vuzpl <3,4,5,6>, <3,5,0,2> + 2832844454U, // <3,5,3,1>: Cost 3 vuzpr <2,3,4,5>, <2,3,0,1> + 3177578497U, // <3,5,3,2>: Cost 3 ins <3,u,3,2>, lane 1 + 2103844865U, // <3,5,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 1759102670U, // <3,5,3,5>: Cost 2 vuzpr <2,3,4,5>, <2,3,4,5> + 2959673858U, // <3,5,3,6>: Cost 3 vzipr <1,1,3,3>, <3,4,5,6> + 2021330230U, // <3,5,3,7>: Cost 2 vtrnr <1,3,1,3>, RHS + 2021330231U, // <3,5,3,u>: Cost 2 vtrnr <1,3,1,3>, RHS + 2832845308U, // <3,5,4,0>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,0> + 2732969871U, // <3,5,4,1>: Cost 3 vext3 LHS, <5,4,1,5> + 2832844536U, // <3,5,4,2>: Cost 3 vuzpr <2,3,4,5>, <2,4,0,2> + 3177660417U, // <3,5,4,3>: Cost 3 ins <3,u,4,3>, lane 1 + 2832845312U, // <3,5,4,4>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,4> + 2131804160U, // <3,5,4,5>: Cost 2 ins , lane 0 + 
1698712886U, // <3,5,4,6>: Cost 2 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1698712904U, // <3,5,4,u>: Cost 2 vuzpl <3,4,5,6>, RHS + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2832846074U, // <3,5,5,1>: Cost 3 vuzpr <2,3,4,5>, <4,5,0,1> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2832845356U, // <3,5,5,3>: Cost 3 vuzpr <2,3,4,5>, <3,5,1,3> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2104016897U, // <3,5,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1759104310U, // <3,5,5,7>: Cost 2 vuzpr <2,3,4,5>, RHS + 1759104311U, // <3,5,5,u>: Cost 2 vuzpr <2,3,4,5>, RHS + 2131910656U, // <3,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <3,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <3,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <3,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <3,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <3,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <3,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <3,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,5,6,u>: Cost 1 ins RHS, lane 0 + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 3095397528U, // <3,5,7,6>: Cost 3 vtrnr <1,3,5,7>, <1,5,4,6> + 2021657910U, // <3,5,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 1698715438U, // <3,5,u,2>: Cost 2 vuzpl <3,4,5,6>, LHS + 1759101597U, // <3,5,u,3>: Cost 2 vuzpr <2,3,4,5>, LHS + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 1698715802U, // <3,5,u,6>: Cost 2 vuzpl <3,4,5,6>, RHS + 1058226176U, // <3,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,5,u,u>: Cost 1 ins RHS, lane 0 + 2732970264U, // <3,6,0,0>: Cost 3 vext3 LHS, <6,0,0,2> + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2132148224U, // <3,6,0,2>: Cost 2 ins , lane 0 + 3177365505U, // <3,6,0,3>: Cost 3 ins <3,u,0,3>, lane 1 + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 1886571830U, // <3,6,0,7>: Cost 2 vzipr <1,2,3,0>, RHS + 1886571831U, // <3,6,0,u>: Cost 2 vzipr <1,2,3,0>, RHS + 2720878954U, // <3,6,1,0>: Cost 3 vext3 <6,1,0,3>, <6,1,0,3> + 3205955584U, // <3,6,1,1>: Cost 3 ins , lane 0 + 2103689217U, // <3,6,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 2826731622U, // <3,6,1,3>: Cost 3 vuzpr <1,3,2,6>, LHS + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3205988352U, // <3,6,1,5>: Cost 3 ins , lane 0 + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2954349878U, // <3,6,1,7>: Cost 3 vzipr <0,2,3,1>, RHS + 2103689217U, // <3,6,1,u>: Cost 2 ins <3,u,1,2>, lane 1 + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 2132303872U, // <3,6,2,3>: Cost 2 ins , lane 0 + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 
<2,5,3,6>, <2,5,3,6> + 2826731724U, // <3,6,2,6>: Cost 3 vuzpr <1,3,2,6>, <0,2,4,6> + 1885261110U, // <3,6,2,7>: Cost 2 vzipr <1,0,3,2>, RHS + 1885261111U, // <3,6,2,u>: Cost 2 vzipr <1,0,3,2>, RHS + 3136876642U, // <3,6,3,0>: Cost 3 vtrnr , <5,6,7,0> + 3206103040U, // <3,6,3,1>: Cost 3 ins , lane 0 + 3001478044U, // <3,6,3,2>: Cost 3 vzipr , <4,0,6,2> + 2103844865U, // <3,6,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3206135808U, // <3,6,3,5>: Cost 3 ins , lane 0 + 1699457629U, // <3,6,3,6>: Cost 2 vuzpl <3,5,6,7>, <3,5,6,7> + 1885932854U, // <3,6,3,7>: Cost 2 vzipr <1,1,3,3>, RHS + 1885932855U, // <3,6,3,u>: Cost 2 vzipr <1,1,3,3>, RHS + 2732970588U, // <3,6,4,0>: Cost 3 vext3 LHS, <6,4,0,2> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970604U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,0> + 2906673714U, // <3,6,4,3>: Cost 3 vzipl <3,4,5,6>, <6,3,4,5> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2132475904U, // <3,6,4,6>: Cost 2 ins , lane 0 + 1886604598U, // <3,6,4,7>: Cost 2 vzipr <1,2,3,4>, RHS + 1886604599U, // <3,6,4,u>: Cost 2 vzipr <1,2,3,4>, RHS + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3206250496U, // <3,6,5,1>: Cost 3 ins , lane 0 + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 3040891442U, // <3,6,5,4>: Cost 3 vtrnl <3,4,5,6>, <6,3,4,5> + 3206283264U, // <3,6,5,5>: Cost 3 ins , lane 0 + 2104016897U, // <3,6,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2954382646U, // <3,6,5,7>: Cost 3 vzipr <0,2,3,5>, RHS + 2104016897U, // <3,6,5,u>: Cost 2 ins <3,u,5,6>, lane 1 + 2732970748U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,0> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2732970768U, // <3,6,6,2>: Cost 3 vext3 LHS, <6,6,2,2> + 3177807873U, // <3,6,6,3>: Cost 3 ins <3,u,6,3>, lane 1 + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 1611453282U, // <3,6,7,2>: Cost 2 vext3 LHS, <6,7,2,3> + 2968996198U, // <3,6,7,3>: Cost 3 vzipr <2,6,3,7>, <3,2,6,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2968995633U, // <3,6,7,5>: Cost 3 vzipr <2,6,3,7>, <2,4,6,5> + 1611453322U, // <3,6,7,6>: Cost 2 vext3 LHS, <6,7,6,7> + 1888619830U, // <3,6,7,7>: Cost 2 vzipr <1,5,3,7>, RHS + 1888619831U, // <3,6,7,u>: Cost 2 vzipr <1,5,3,7>, RHS + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2132148224U, // <3,6,u,2>: Cost 2 ins , lane 0 + 2132303872U, // <3,6,u,3>: Cost 2 ins , lane 0 + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 2132475904U, // <3,6,u,6>: Cost 2 ins , lane 0 + 1885310262U, // <3,6,u,7>: Cost 2 vzipr <1,0,3,u>, RHS + 1885310263U, // <3,6,u,u>: Cost 2 vzipr <1,0,3,u>, RHS + 2826960896U, // <3,7,0,0>: Cost 3 vuzpr <1,3,5,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2826960916U, // <3,7,0,2>: Cost 3 vuzpr <1,3,5,7>, <0,0,2,2> + 3002117840U, // <3,7,0,3>: Cost 3 vzipr , <5,1,7,3> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 
<1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2826961716U, // <3,7,1,1>: Cost 3 vuzpr <1,3,5,7>, <1,1,1,1> + 2103689217U, // <3,7,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1753219174U, // <3,7,1,3>: Cost 2 vuzpr <1,3,5,7>, LHS + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1753219179U, // <3,7,1,u>: Cost 2 vuzpr <1,3,5,7>, LHS + 2826961814U, // <3,7,2,0>: Cost 3 vuzpr <1,3,5,7>, <1,2,3,0> + 3206692864U, // <3,7,2,1>: Cost 3 ins , lane 0 + 2826961060U, // <3,7,2,2>: Cost 3 vuzpr <1,3,5,7>, <0,2,0,2> + 2132967424U, // <3,7,2,3>: Cost 2 ins , lane 0 + 2826961818U, // <3,7,2,4>: Cost 3 vuzpr <1,3,5,7>, <1,2,3,4> + 2826961072U, // <3,7,2,5>: Cost 3 vuzpr <1,3,5,7>, <0,2,1,5> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2826962598U, // <3,7,3,1>: Cost 3 vuzpr <1,3,5,7>, <2,3,0,1> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2103844865U, // <3,7,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2826962638U, // <3,7,3,5>: Cost 3 vuzpr <1,3,5,7>, <2,3,4,5> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 1753220096U, // <3,7,3,7>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7> + 1753220096U, // <3,7,3,u>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3002150608U, // <3,7,4,3>: Cost 3 vzipr , <5,1,7,3> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2826961244U, // <3,7,4,6>: Cost 3 vuzpr <1,3,5,7>, <0,4,2,6> + 2732971383U, // <3,7,4,7>: Cost 3 vext3 LHS, <7,4,7,5> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2826963551U, // <3,7,5,0>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,0> + 2826963552U, // <3,7,5,1>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,1> + 2826962032U, // <3,7,5,2>: Cost 3 vuzpr <1,3,5,7>, <1,5,0,2> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2826963555U, // <3,7,5,4>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,4> + 2826962044U, // <3,7,5,5>: Cost 3 vuzpr <1,3,5,7>, <1,5,1,5> + 2104016897U, // <3,7,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1753222454U, // <3,7,5,7>: Cost 2 vuzpr <1,3,5,7>, RHS + 1753222455U, // <3,7,5,u>: Cost 2 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2133295104U, // <3,7,6,7>: Cost 2 ins , lane 0 + 2133295104U, // <3,7,6,u>: Cost 2 ins , lane 0 + 2962362223U, // <3,7,7,0>: Cost 3 vzipr <1,5,3,7>, 
<5,3,7,0> + 2826965109U, // <3,7,7,1>: Cost 3 vuzpr <1,3,5,7>, <5,7,0,1> + 2968998474U, // <3,7,7,2>: Cost 3 vzipr <2,6,3,7>, <6,3,7,2> + 2826963662U, // <3,7,7,3>: Cost 3 vuzpr <1,3,5,7>, <3,7,1,3> + 2962362227U, // <3,7,7,4>: Cost 3 vzipr <1,5,3,7>, <5,3,7,4> + 2826965149U, // <3,7,7,5>: Cost 3 vuzpr <1,3,5,7>, <5,7,4,5> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2826962300U, // <3,7,u,0>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,0> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2103689217U, // <3,7,u,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1753219741U, // <3,7,u,3>: Cost 2 vuzpr <1,3,5,7>, LHS + 2826962304U, // <3,7,u,4>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,4> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 , + 1753222697U, // <3,7,u,7>: Cost 2 vuzpr <1,3,5,7>, RHS + 1753219746U, // <3,7,u,u>: Cost 2 vuzpr <1,3,5,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, + 1696243814U, // <3,u,0,2>: Cost 2 vuzpl <3,0,u,2>, LHS + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, + 1829951642U, // <3,u,0,5>: Cost 2 vzipl <3,0,1,2>, RHS + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, + 1886571848U, // <3,u,0,7>: Cost 2 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 1964169370U, // <3,u,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS + 2027212329U, // <3,u,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1659672428U, // <3,u,2,0>: Cost 2 vext3 LHS, + 2128969728U, // <3,u,2,1>: Cost 2 ins , lane 0 + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1055244288U, // <3,u,2,3>: Cost 1 ins LHS, lane 0 + 1659672468U, // <3,u,2,4>: Cost 2 vext3 LHS, + 2129002496U, // <3,u,2,5>: Cost 2 ins , lane 0 + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1885261128U, // <3,u,2,7>: Cost 2 vzipr <1,0,3,2>, RHS + 1055244288U, // <3,u,2,u>: Cost 1 ins LHS, lane 0 + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, + 1616541639U, // <3,u,3,1>: Cost 2 vext3 LHS, + 1966315310U, // <3,u,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, + 1966315674U, // <3,u,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS + 1885932872U, // <3,u,3,7>: Cost 2 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2960344003U, // <3,u,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,u,0> + 1832933166U, // <3,u,4,1>: Cost 2 vzipl <3,4,5,6>, LHS + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, + 1886601372U, // <3,u,4,3>: Cost 2 vzipr <1,2,3,4>, LHS + 1886602138U, // <3,u,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, + 1696247094U, // <3,u,4,6>: Cost 2 vuzpl <3,0,u,2>, RHS + 1886604616U, // <3,u,4,7>: Cost 2 vzipr <1,2,3,4>, RHS + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2128527360U, // <3,u,5,1>: Cost 2 ins , lane 0 + 1490962162U, // <3,u,5,2>: Cost 2 
vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 2027538126U, // <3,u,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1752935734U, // <3,u,5,7>: Cost 2 vuzpr <1,3,1,u>, RHS + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 1663875248U, // <3,u,6,0>: Cost 2 vext3 LHS, + 2131918848U, // <3,u,6,1>: Cost 2 ins , lane 0 + 2128609280U, // <3,u,6,2>: Cost 2 ins , lane 0 + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, + 1663875288U, // <3,u,6,4>: Cost 2 vext3 LHS, + 2131951616U, // <3,u,6,5>: Cost 2 ins , lane 0 + 2131296256U, // <3,u,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <3,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,u,6,u>: Cost 1 ins RHS, lane 0 + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2098896898U, // <3,u,7,2>: Cost 2 ins <3,0,u,2>, lane 2 + 2021655197U, // <3,u,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659230515U, // <3,u,7,5>: Cost 2 vext3 LHS, + 2131369984U, // <3,u,7,6>: Cost 2 ins , lane 0 + 2021658153U, // <3,u,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS + 2021655202U, // <3,u,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 1055244288U, // <3,u,u,3>: Cost 1 ins LHS, lane 0 + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1058226176U, // <3,u,u,7>: Cost 1 ins RHS, lane 0 + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2128150528U, // <4,0,0,0>: Cost 2 ins , lane 0 + 2104860674U, // <4,0,0,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1705607270U, // <4,0,0,2>: Cost 2 vuzpl <4,6,0,2>, LHS + 3178070019U, // <4,0,0,3>: Cost 3 ins <4,0,0,u>, lane 3 + 2909946194U, // <4,0,0,4>: Cost 3 vzipl <4,0,5,1>, <0,4,1,5> + 3178070019U, // <4,0,0,5>: Cost 3 ins <4,0,0,u>, lane 3 + 3183362049U, // <4,0,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,0,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 1705607324U, // <4,0,0,u>: Cost 2 vuzpl <4,6,0,2>, LHS + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2128232448U, // <4,0,1,1>: Cost 2 ins , lane 0 + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2833612902U, // <4,0,1,3>: Cost 3 vuzpr <2,4,6,0>, LHS + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2779350016U, // <4,0,1,5>: Cost 3 vuzpl <4,6,0,2>, <1,3,5,7> + 3202015232U, // <4,0,1,6>: Cost 3 ins , lane 0 + 2109702145U, // <4,0,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 2104860674U, // <4,0,2,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2128314368U, // <4,0,2,2>: Cost 2 ins , lane 0 + 2104918021U, // <4,0,2,3>: Cost 2 ins <4,0,u,u>, lane 5 + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3044622465U, // <4,0,2,5>: Cost 3 vtrnl <4,1,2,3>, <0,1,5,3> + 2833613004U, // <4,0,2,6>: Cost 3 vuzpr <2,4,6,0>, <0,2,4,6> + 2109775873U, // <4,0,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2104860674U, // <4,0,2,u>: Cost 2 ins <4,0,u,1>, lane 2 + 3202113536U, // <4,0,3,0>: Cost 3 ins , lane 0 + 2104860674U, // <4,0,3,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2128388096U, // <4,0,3,2>: Cost 2 ins , lane 0 + 2779351452U, // <4,0,3,3>: Cost 3 vuzpl <4,6,0,2>, <3,3,3,3> + 3178627074U, // <4,0,3,4>: Cost 3 ins 
<4,0,u,4>, lane 2 + 2839512782U, // <4,0,3,5>: Cost 3 vuzpr <3,4,5,0>, <2,3,4,5> + 3178643458U, // <4,0,3,6>: Cost 3 ins <4,0,u,6>, lane 2 + 2109849601U, // <4,0,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2104860674U, // <4,0,3,u>: Cost 2 ins <4,0,u,1>, lane 2 + 1705610572U, // <4,0,4,0>: Cost 2 vuzpl <4,6,0,2>, <4,6,0,2> + 2104860674U, // <4,0,4,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1974370406U, // <4,0,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS + 3178364931U, // <4,0,4,3>: Cost 3 ins <4,0,4,u>, lane 3 + 2109898753U, // <4,0,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2104918021U, // <4,0,4,5>: Cost 2 ins <4,0,u,u>, lane 5 + 1705610550U, // <4,0,4,6>: Cost 2 vuzpl <4,6,0,2>, RHS + 2109923329U, // <4,0,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 1705610568U, // <4,0,4,u>: Cost 2 vuzpl <4,6,0,2>, RHS + 1839644672U, // <4,0,5,0>: Cost 2 vzipl RHS, <0,0,0,0> + 765902950U, // <4,0,5,1>: Cost 1 vzipl RHS, LHS + 1839644836U, // <4,0,5,2>: Cost 2 vzipl RHS, <0,2,0,2> + 2104696835U, // <4,0,5,3>: Cost 2 ins <4,0,5,u>, lane 3 + 1839645010U, // <4,0,5,4>: Cost 2 vzipl RHS, <0,4,1,5> + 2109980673U, // <4,0,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 2104696835U, // <4,0,5,6>: Cost 2 ins <4,0,5,u>, lane 3 + 2104696835U, // <4,0,5,7>: Cost 2 ins <4,0,5,u>, lane 3 + 765903517U, // <4,0,5,u>: Cost 1 vzipl RHS, LHS + 1973862400U, // <4,0,6,0>: Cost 2 vtrnl RHS, <0,0,0,0> + 1973862410U, // <4,0,6,1>: Cost 2 vtrnl RHS, <0,0,1,1> + 900120678U, // <4,0,6,2>: Cost 1 vtrnl RHS, LHS + 2104770563U, // <4,0,6,3>: Cost 2 ins <4,0,6,u>, lane 3 + 1973862604U, // <4,0,6,4>: Cost 2 vtrnl RHS, <0,2,4,6> + 2104770563U, // <4,0,6,5>: Cost 2 ins <4,0,6,u>, lane 3 + 2110062593U, // <4,0,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,0,6,7>: Cost 1 ins RHS, lane 1 + 900120732U, // <4,0,6,u>: Cost 1 vtrnl RHS, LHS + 3202408448U, // <4,0,7,0>: Cost 3 ins , lane 0 + 2104860674U, // <4,0,7,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2104868866U, // <4,0,7,2>: Cost 2 ins <4,0,u,2>, lane 2 + 3114049557U, // <4,0,7,3>: Cost 3 vtrnr <4,4,6,7>, <0,0,2,3> + 3178627074U, // <4,0,7,4>: Cost 3 ins <4,0,u,4>, lane 2 + 2779354470U, // <4,0,7,5>: Cost 3 vuzpl <4,6,0,2>, <7,4,5,6> + 2779354473U, // <4,0,7,6>: Cost 3 vuzpl <4,6,0,2>, <7,4,6,0> + 2110144513U, // <4,0,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2104860674U, // <4,0,7,u>: Cost 2 ins <4,0,u,1>, lane 2 + 1974009856U, // <4,0,u,0>: Cost 2 vtrnl RHS, <0,0,0,0> + 767893606U, // <4,0,u,1>: Cost 1 vzipl RHS, LHS + 900268134U, // <4,0,u,2>: Cost 1 vtrnl RHS, LHS + 2104918021U, // <4,0,u,3>: Cost 2 ins <4,0,u,u>, lane 5 + 1974010060U, // <4,0,u,4>: Cost 2 vtrnl RHS, <0,2,4,6> + 2104918021U, // <4,0,u,5>: Cost 2 ins <4,0,u,u>, lane 5 + 1705613466U, // <4,0,u,6>: Cost 2 vuzpl <4,6,0,2>, RHS + 1036328961U, // <4,0,u,7>: Cost 1 ins RHS, lane 1 + 900268188U, // <4,0,u,u>: Cost 1 vtrnl RHS, LHS + 2600640614U, // <4,1,0,0>: Cost 3 vext1 , LHS + 2128822272U, // <4,1,0,1>: Cost 2 ins , lane 0 + 2109587457U, // <4,1,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2128838656U, // <4,1,0,3>: Cost 2 ins , lane 0 + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3047785472U, // <4,1,0,5>: Cost 3 vtrnl <4,6,0,2>, <1,3,5,7> + 3183362049U, // <4,1,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,1,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2109587457U, // <4,1,0,u>: Cost 2 ins <4,u,0,2>, lane 1 + 3202629632U, // <4,1,1,0>: Cost 3 ins , lane 0 + 2128896000U, // <4,1,1,1>: Cost 2 ins , lane 0 + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2128912384U, // <4,1,1,3>: Cost 2 ins , lane 0 + 3202662400U, // <4,1,1,4>: Cost 3 ins , 
lane 0 + 2958401874U, // <4,1,1,5>: Cost 3 vzipr <0,u,4,1>, <0,4,1,5> + 2778801323U, // <4,1,1,6>: Cost 3 vuzpl <4,5,1,7>, <1,5,6,7> + 2109702145U, // <4,1,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2128896000U, // <4,1,1,u>: Cost 2 ins , lane 0 + 2128961536U, // <4,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <4,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <4,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <4,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <4,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <4,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <4,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <4,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <4,1,2,u>: Cost 1 ins LHS, lane 0 + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2129059840U, // <4,1,3,3>: Cost 2 ins , lane 0 + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2109849601U, // <4,1,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2129059840U, // <4,1,3,u>: Cost 2 ins , lane 0 + 2600673382U, // <4,1,4,0>: Cost 3 vext1 , LHS + 1705061641U, // <4,1,4,1>: Cost 2 vuzpl <4,5,1,7>, <4,5,1,7> + 2912641946U, // <4,1,4,2>: Cost 3 vzipl <4,4,5,6>, <1,2,3,4> + 2040135782U, // <4,1,4,3>: Cost 2 vtrnr <4,4,4,4>, LHS + 2109898753U, // <4,1,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2129149952U, // <4,1,4,5>: Cost 2 ins , lane 0 + 2109915137U, // <4,1,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 2109923329U, // <4,1,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2109915137U, // <4,1,4,u>: Cost 2 ins <4,u,4,6>, lane 1 + 1479164242U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, <0,4,1,5> + 1839645492U, // <4,1,5,1>: Cost 2 vzipl RHS, <1,1,1,1> + 1839645590U, // <4,1,5,2>: Cost 2 vzipl RHS, <1,2,3,0> + 2016034918U, // <4,1,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839645840U, // <4,1,5,5>: Cost 2 vzipl RHS, <1,5,3,7> + 3089776763U, // <4,1,5,6>: Cost 3 vtrnr <0,4,1,5>, <0,1,4,6> + 2109997057U, // <4,1,5,7>: Cost 2 ins <4,u,5,7>, lane 1 + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2110013441U, // <4,1,6,0>: Cost 2 ins <4,u,6,0>, lane 1 + 1973863220U, // <4,1,6,1>: Cost 2 vtrnl RHS, <1,1,1,1> + 2110029825U, // <4,1,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2016116838U, // <4,1,6,3>: Cost 2 vtrnr <0,4,2,6>, LHS + 2110046209U, // <4,1,6,4>: Cost 2 ins <4,u,6,4>, lane 1 + 1973863424U, // <4,1,6,5>: Cost 2 vtrnl RHS, <1,3,5,7> + 2110062593U, // <4,1,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,1,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,1,6,u>: Cost 1 ins RHS, lane 1 + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3203080192U, // <4,1,7,1>: Cost 3 ins , lane 0 + 3203088384U, // <4,1,7,2>: Cost 3 ins , lane 0 + 2129354752U, // <4,1,7,3>: Cost 2 ins , lane 0 + 2664666470U, // <4,1,7,4>: Cost 3 vext2 <7,u,4,1>, <7,4,5,6> + 3203112960U, // <4,1,7,5>: Cost 3 ins , lane 0 + 3114049641U, // <4,1,7,6>: Cost 3 vtrnr <4,4,6,7>, <0,1,2,6> + 2110144513U, // <4,1,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2129354752U, // <4,1,7,u>: Cost 2 ins , lane 0 + 1479188821U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, <0,4,1,u> + 1974010676U, // <4,1,u,1>: Cost 2 vtrnl RHS, <1,1,1,1> + 1841636246U, // <4,1,u,2>: Cost 2 vzipl RHS, <1,2,3,0> + 1055244288U, // <4,1,u,3>: Cost 1 ins LHS, lane 0 + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 1974010880U, // 
<4,1,u,5>: Cost 2 vtrnl RHS, <1,3,5,7> + 2109915137U, // <4,1,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,1,u,7>: Cost 1 ins RHS, lane 1 + 1055244288U, // <4,1,u,u>: Cost 1 ins LHS, lane 0 + 3047786150U, // <4,2,0,0>: Cost 3 vtrnl <4,6,0,2>, <2,3,0,1> + 2109579265U, // <4,2,0,1>: Cost 2 ins <4,u,0,1>, lane 1 + 2129494016U, // <4,2,0,2>: Cost 2 ins , lane 0 + 2967019622U, // <4,2,0,3>: Cost 3 vzipr <2,3,4,0>, LHS + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 2909947747U, // <4,2,0,5>: Cost 3 vzipl <4,0,5,1>, <2,5,3,1> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 2109628417U, // <4,2,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2129494016U, // <4,2,0,u>: Cost 2 ins , lane 0 + 3203293184U, // <4,2,1,0>: Cost 3 ins , lane 0 + 3203301376U, // <4,2,1,1>: Cost 3 ins , lane 0 + 3203309568U, // <4,2,1,2>: Cost 3 ins , lane 0 + 2821242982U, // <4,2,1,3>: Cost 3 vuzpr <0,4,0,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3203334144U, // <4,2,1,5>: Cost 3 ins , lane 0 + 3203342336U, // <4,2,1,6>: Cost 3 ins , lane 0 + 2109702145U, // <4,2,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2109702145U, // <4,2,1,u>: Cost 2 ins <4,u,1,7>, lane 1 + 2229208824U, // <4,2,2,0>: Cost 3 vrev <2,4,0,2> + 2911397400U, // <4,2,2,1>: Cost 3 vzipl <4,2,6,7>, <2,1,2,3> + 2129641472U, // <4,2,2,2>: Cost 2 ins , lane 0 + 2129649664U, // <4,2,2,3>: Cost 2 ins , lane 0 + 2697954940U, // <4,2,2,4>: Cost 3 vext3 <2,2,4,4>, <2,2,4,4> + 2911397764U, // <4,2,2,5>: Cost 3 vzipl <4,2,6,7>, <2,5,6,7> + 2821243084U, // <4,2,2,6>: Cost 3 vuzpr <0,4,0,2>, <0,2,4,6> + 2109775873U, // <4,2,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2129641472U, // <4,2,2,u>: Cost 2 ins , lane 0 + 2129698816U, // <4,2,3,0>: Cost 2 ins , lane 0 + 2229290754U, // <4,2,3,1>: Cost 3 vrev <2,4,1,3> + 3203457024U, // <4,2,3,2>: Cost 3 ins , lane 0 + 2129723392U, // <4,2,3,3>: Cost 2 ins , lane 0 + 2129731584U, // <4,2,3,4>: Cost 2 ins , lane 0 + 2833188558U, // <4,2,3,5>: Cost 3 vuzpr <2,4,0,2>, <2,3,4,5> + 3203489792U, // <4,2,3,6>: Cost 3 ins , lane 0 + 2109849601U, // <4,2,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2129698816U, // <4,2,3,u>: Cost 2 ins , lane 0 + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 1702448074U, // <4,2,4,2>: Cost 2 vuzpl <4,1,2,3>, <4,1,2,3> + 1905918054U, // <4,2,4,3>: Cost 2 vzipr <4,4,4,4>, LHS + 2109898753U, // <4,2,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2109906945U, // <4,2,4,5>: Cost 2 ins <4,u,4,5>, lane 1 + 2129821696U, // <4,2,4,6>: Cost 2 ins , lane 0 + 2109923329U, // <4,2,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2129821696U, // <4,2,4,u>: Cost 2 ins , lane 0 + 3089777558U, // <4,2,5,0>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,0> + 2109947905U, // <4,2,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 1839646312U, // <4,2,5,2>: Cost 2 vzipl RHS, <2,2,2,2> + 1893318758U, // <4,2,5,3>: Cost 2 vzipr <2,3,4,5>, LHS + 3089777562U, // <4,2,5,4>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,4> + 2109980673U, // <4,2,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 1839646650U, // <4,2,5,6>: Cost 2 vzipl RHS, <2,6,3,7> + 2109997057U, // <4,2,5,7>: Cost 2 ins <4,u,5,7>, lane 1 + 1893318763U, // <4,2,5,u>: Cost 2 vzipr <2,3,4,5>, LHS + 1479246172U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, <0,4,2,6> + 2110021633U, // <4,2,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 1973864040U, // <4,2,6,2>: Cost 2 vtrnl RHS, <2,2,2,2> + 1880719462U, // <4,2,6,3>: Cost 2 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2110054401U, // <4,2,6,5>: Cost 2 ins 
<4,u,6,5>, lane 1 + 2110062593U, // <4,2,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,2,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,2,6,u>: Cost 1 ins RHS, lane 1 + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3203743744U, // <4,2,7,1>: Cost 3 ins , lane 0 + 3203751936U, // <4,2,7,2>: Cost 3 ins , lane 0 + 2130018304U, // <4,2,7,3>: Cost 2 ins , lane 0 + 3102032794U, // <4,2,7,4>: Cost 3 vtrnr <2,4,5,7>, <1,2,3,4> + 2229618474U, // <4,2,7,5>: Cost 3 vrev <2,4,5,7> + 3203784704U, // <4,2,7,6>: Cost 3 ins , lane 0 + 2110144513U, // <4,2,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2130018304U, // <4,2,7,u>: Cost 2 ins , lane 0 + 1479262558U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, <0,4,2,u> + 2109947905U, // <4,2,u,1>: Cost 2 ins <4,u,5,1>, lane 1 + 1974011496U, // <4,2,u,2>: Cost 2 vtrnl RHS, <2,2,2,2> + 1880735846U, // <4,2,u,3>: Cost 2 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2109980673U, // <4,2,u,5>: Cost 2 ins <4,u,5,5>, lane 1 + 1841637306U, // <4,2,u,6>: Cost 2 vzipl RHS, <2,6,3,7> + 1036328961U, // <4,2,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,2,u,u>: Cost 1 ins RHS, lane 1 + 3203883008U, // <4,3,0,0>: Cost 3 ins , lane 0 + 2130149376U, // <4,3,0,1>: Cost 2 ins , lane 0 + 2109587457U, // <4,3,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 3047786908U, // <4,3,0,3>: Cost 3 vtrnl <4,6,0,2>, <3,3,3,3> + 2967020442U, // <4,3,0,4>: Cost 3 vzipr <2,3,4,0>, <1,2,3,4> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3183362049U, // <4,3,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,3,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2130149376U, // <4,3,0,u>: Cost 2 ins , lane 0 + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3203964928U, // <4,3,1,1>: Cost 3 ins , lane 0 + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2130239488U, // <4,3,1,3>: Cost 2 ins , lane 0 + 2967028634U, // <4,3,1,4>: Cost 3 vzipr <2,3,4,1>, <1,2,3,4> + 3203997696U, // <4,3,1,5>: Cost 3 ins , lane 0 + 2821398633U, // <4,3,1,6>: Cost 3 vuzpr <0,4,2,3>, <0,1,2,6> + 2109702145U, // <4,3,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2130239488U, // <4,3,1,u>: Cost 2 ins , lane 0 + 3204030464U, // <4,3,2,0>: Cost 3 ins , lane 0 + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3204046848U, // <4,3,2,2>: Cost 3 ins , lane 0 + 2130313216U, // <4,3,2,3>: Cost 2 ins , lane 0 + 2833269658U, // <4,3,2,4>: Cost 3 vuzpr <2,4,1,3>, <1,2,3,4> + 3101624014U, // <4,3,2,5>: Cost 3 vtrnr <2,4,0,2>, <2,3,4,5> + 3204079616U, // <4,3,2,6>: Cost 3 ins , lane 0 + 2109775873U, // <4,3,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2130313216U, // <4,3,2,u>: Cost 2 ins , lane 0 + 3204104192U, // <4,3,3,0>: Cost 3 ins , lane 0 + 2779564182U, // <4,3,3,1>: Cost 3 vuzpl <4,6,3,1>, <3,0,1,2> + 2636810580U, // <4,3,3,2>: Cost 3 vext2 <3,2,4,3>, <3,2,4,3> + 2130386944U, // <4,3,3,3>: Cost 2 ins , lane 0 + 2965717914U, // <4,3,3,4>: Cost 3 vzipr <2,1,4,3>, <1,2,3,4> + 2779597314U, // <4,3,3,5>: Cost 3 vuzpl <4,6,3,5>, <3,4,5,6> + 2778950237U, // <4,3,3,6>: Cost 3 vuzpl <4,5,3,7>, <3,5,6,7> + 2109849601U, // <4,3,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2130386944U, // <4,3,3,u>: Cost 2 ins , lane 0 + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 3183624193U, // <4,3,4,2>: Cost 3 ins <4,u,4,2>, lane 1 + 1747657049U, // <4,3,4,3>: Cost 2 vuzpr <0,4,2,3>, <0,4,2,3> + 2109898753U, // <4,3,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2130477056U, // <4,3,4,5>: Cost 2 ins , lane 0 + 2109915137U, // 
<4,3,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 2109923329U, // <4,3,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2130477056U, // <4,3,4,u>: Cost 2 ins , lane 0 + 1839646870U, // <4,3,5,0>: Cost 2 vzipl RHS, <3,0,1,2> + 2109947905U, // <4,3,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 2967061238U, // <4,3,5,2>: Cost 3 vzipr <2,3,4,5>, <1,0,3,2> + 1839647132U, // <4,3,5,3>: Cost 2 vzipl RHS, <3,3,3,3> + 1839647234U, // <4,3,5,4>: Cost 2 vzipl RHS, <3,4,5,6> + 2109980673U, // <4,3,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 2913389176U, // <4,3,5,6>: Cost 3 vzipl RHS, <3,6,0,7> + 2130567168U, // <4,3,5,7>: Cost 2 ins , lane 0 + 1839647518U, // <4,3,5,u>: Cost 2 vzipl RHS, <3,u,1,2> + 2110013441U, // <4,3,6,0>: Cost 2 ins <4,u,6,0>, lane 1 + 1973864598U, // <4,3,6,1>: Cost 2 vtrnl RHS, <3,0,1,2> + 2110029825U, // <4,3,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 1973864860U, // <4,3,6,3>: Cost 2 vtrnl RHS, <3,3,3,3> + 2110046209U, // <4,3,6,4>: Cost 2 ins <4,u,6,4>, lane 1 + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 2110062593U, // <4,3,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,3,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,3,6,u>: Cost 1 ins RHS, lane 1 + 3204399104U, // <4,3,7,0>: Cost 3 ins , lane 0 + 3204407296U, // <4,3,7,1>: Cost 3 ins , lane 0 + 2660701368U, // <4,3,7,2>: Cost 3 vext2 <7,2,4,3>, <7,2,4,3> + 3204423680U, // <4,3,7,3>: Cost 3 ins , lane 0 + 2968404890U, // <4,3,7,4>: Cost 3 vzipr <2,5,4,7>, <1,2,3,4> + 3204440064U, // <4,3,7,5>: Cost 3 ins , lane 0 + 2235664908U, // <4,3,7,6>: Cost 3 vrev <3,4,6,7> + 2110144513U, // <4,3,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2110144513U, // <4,3,7,u>: Cost 2 ins <4,u,7,7>, lane 1 + 1841637526U, // <4,3,u,0>: Cost 2 vzipl RHS, <3,0,1,2> + 1974012054U, // <4,3,u,1>: Cost 2 vtrnl RHS, <3,0,1,2> + 2109587457U, // <4,3,u,2>: Cost 2 ins <4,u,0,2>, lane 1 + 1974012316U, // <4,3,u,3>: Cost 2 vtrnl RHS, <3,3,3,3> + 1841637890U, // <4,3,u,4>: Cost 2 vzipl RHS, <3,4,5,6> + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2109915137U, // <4,3,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,3,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,3,u,u>: Cost 1 ins RHS, lane 1 + 1974046028U, // <4,4,0,0>: Cost 2 vtrnl <4,6,0,2>, <4,6,0,2> + 2107572229U, // <4,4,0,1>: Cost 2 ins <4,4,u,u>, lane 5 + 1705934950U, // <4,4,0,2>: Cost 2 vuzpl <4,6,4,6>, LHS + 3180724227U, // <4,4,0,3>: Cost 3 ins <4,4,0,u>, lane 3 + 2107539458U, // <4,4,0,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,0,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1974046006U, // <4,4,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS + 2109628417U, // <4,4,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 1974046024U, // <4,4,0,u>: Cost 2 vtrnl <4,6,0,2>, RHS + 3204620288U, // <4,4,1,0>: Cost 3 ins , lane 0 + 1836665802U, // <4,4,1,1>: Cost 2 vzipl <4,1,2,3>, <4,1,2,3> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 1771700326U, // <4,4,1,3>: Cost 2 vuzpr <4,4,4,4>, LHS + 2107539458U, // <4,4,1,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2130919424U, // <4,4,1,5>: Cost 2 ins , lane 0 + 2107555842U, // <4,4,1,6>: Cost 2 ins <4,4,u,6>, lane 2 + 2109702145U, // <4,4,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2130919424U, // <4,4,1,u>: Cost 2 ins , lane 0 + 2779678374U, // <4,4,2,0>: Cost 3 vuzpl <4,6,4,6>, <2,3,0,1> + 3044625673U, // <4,4,2,1>: Cost 3 vtrnl <4,1,2,3>, <4,5,1,7> + 1970883530U, // <4,4,2,2>: Cost 2 vtrnl <4,1,2,3>, <4,1,2,3> + 2107572229U, // <4,4,2,3>: Cost 2 ins <4,4,u,u>, lane 5 + 2107539458U, // <4,4,2,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,2,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131001344U, 
// <4,4,2,6>: Cost 2 ins , lane 0 + 2109775873U, // <4,4,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2107572229U, // <4,4,2,u>: Cost 2 ins <4,4,u,u>, lane 5 + 3181248514U, // <4,4,3,0>: Cost 3 ins <4,4,u,0>, lane 2 + 2779678870U, // <4,4,3,1>: Cost 3 vuzpl <4,6,4,6>, <3,0,1,2> + 3181264898U, // <4,4,3,2>: Cost 3 ins <4,4,u,2>, lane 2 + 1880031352U, // <4,4,3,3>: Cost 2 vzipr <0,1,4,3>, <0,1,4,3> + 2107539458U, // <4,4,3,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,3,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2107555842U, // <4,4,3,6>: Cost 2 ins <4,4,u,6>, lane 2 + 2109849601U, // <4,4,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2107547650U, // <4,4,3,u>: Cost 2 ins <4,4,u,5>, lane 2 + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2107277315U, // <4,4,4,1>: Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // <4,4,4,2>: Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // <4,4,4,3>: Cost 2 ins <4,4,4,u>, lane 3 + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 2107547650U, // <4,4,4,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1705938230U, // <4,4,4,6>: Cost 2 vuzpl <4,6,4,6>, RHS + 2109923329U, // <4,4,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 1839647634U, // <4,4,5,0>: Cost 2 vzipl RHS, <4,0,5,1> + 2109947905U, // <4,4,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 2107351043U, // <4,4,5,2>: Cost 2 ins <4,4,5,u>, lane 3 + 2107351043U, // <4,4,5,3>: Cost 2 ins <4,4,5,u>, lane 3 + 1839647952U, // <4,4,5,4>: Cost 2 vzipl RHS, <4,4,4,4> + 765906230U, // <4,4,5,5>: Cost 1 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2107351043U, // <4,4,5,7>: Cost 2 ins <4,4,5,u>, lane 3 + 765906473U, // <4,4,5,u>: Cost 1 vzipl RHS, RHS + 1973865804U, // <4,4,6,0>: Cost 2 vtrnl RHS, <4,6,0,2> + 2107424771U, // <4,4,6,1>: Cost 2 ins <4,4,6,u>, lane 3 + 2110029825U, // <4,4,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2107424771U, // <4,4,6,3>: Cost 2 ins <4,4,6,u>, lane 3 + 1973865680U, // <4,4,6,4>: Cost 2 vtrnl RHS, <4,4,4,4> + 1973865362U, // <4,4,6,5>: Cost 2 vtrnl RHS, <4,0,5,1> + 900123958U, // <4,4,6,6>: Cost 1 vtrnl RHS, RHS + 1036328961U, // <4,4,6,7>: Cost 1 ins RHS, lane 1 + 900123976U, // <4,4,6,u>: Cost 1 vtrnl RHS, RHS + 3181248514U, // <4,4,7,0>: Cost 3 ins <4,4,u,0>, lane 2 + 2779681786U, // <4,4,7,1>: Cost 3 vuzpl <4,6,4,6>, <7,0,1,2> + 3181264898U, // <4,4,7,2>: Cost 3 ins <4,4,u,2>, lane 2 + 2845442636U, // <4,4,7,3>: Cost 3 vuzpr <4,4,4,4>, <0,7,2,3> + 2107539458U, // <4,4,7,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,7,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131369984U, // <4,4,7,6>: Cost 2 ins , lane 0 + 2040311013U, // <4,4,7,7>: Cost 2 vtrnr <4,4,6,7>, <4,4,6,7> + 2107547650U, // <4,4,7,u>: Cost 2 ins <4,4,u,5>, lane 2 + 1974013260U, // <4,4,u,0>: Cost 2 vtrnl RHS, <4,6,0,2> + 2107572229U, // <4,4,u,1>: Cost 2 ins <4,4,u,u>, lane 5 + 1705940782U, // <4,4,u,2>: Cost 2 vuzpl <4,6,4,6>, LHS + 2107572229U, // <4,4,u,3>: Cost 2 ins <4,4,u,u>, lane 5 + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 767896886U, // <4,4,u,5>: Cost 1 vzipl RHS, RHS + 900271414U, // <4,4,u,6>: Cost 1 vtrnl RHS, RHS + 1036328961U, // <4,4,u,7>: Cost 1 ins RHS, lane 1 + 900271432U, // <4,4,u,u>: Cost 1 vtrnl RHS, RHS + 2108170242U, // <4,5,0,0>: Cost 2 ins <4,5,u,0>, lane 2 + 1034493957U, // <4,5,0,1>: Cost 1 ins RHS, lane 5 + 1707294822U, // <4,5,0,2>: Cost 2 vuzpl <4,u,5,1>, LHS + 2108194818U, // <4,5,0,3>: Cost 2 ins <4,5,u,3>, lane 2 + 2108203010U, // <4,5,0,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2108211202U, // <4,5,0,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2108219394U, // <4,5,0,6>: 
Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,0,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <4,5,0,u>: Cost 1 ins RHS, lane 5 + 2108170242U, // <4,5,1,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2133540868U, // <4,5,1,1>: Cost 2 ins , lane 4 + 2133549060U, // <4,5,1,2>: Cost 2 ins , lane 4 + 1747599462U, // <4,5,1,3>: Cost 2 vuzpr <0,4,1,5>, LHS + 2108203010U, // <4,5,1,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2133573636U, // <4,5,1,5>: Cost 2 ins , lane 4 + 2108219394U, // <4,5,1,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,1,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,1,u>: Cost 1 ins RHS, lane 2 + 2108170242U, // <4,5,2,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2108178434U, // <4,5,2,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2133622788U, // <4,5,2,2>: Cost 2 ins , lane 4 + 1059889156U, // <4,5,2,3>: Cost 1 ins LHS, lane 4 + 2108203010U, // <4,5,2,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2108211202U, // <4,5,2,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2133655556U, // <4,5,2,6>: Cost 2 ins , lane 4 + 1034485762U, // <4,5,2,7>: Cost 1 ins RHS, lane 2 + 1059889156U, // <4,5,2,u>: Cost 1 ins LHS, lane 4 + 2133680132U, // <4,5,3,0>: Cost 2 ins , lane 4 + 2108178434U, // <4,5,3,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2133696516U, // <4,5,3,2>: Cost 2 ins , lane 4 + 2133704708U, // <4,5,3,3>: Cost 2 ins , lane 4 + 2133712900U, // <4,5,3,4>: Cost 2 ins , lane 4 + 2108211202U, // <4,5,3,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2108219394U, // <4,5,3,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,3,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,3,u>: Cost 1 ins RHS, lane 2 + 2108170242U, // <4,5,4,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2108178434U, // <4,5,4,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // <4,5,4,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2108194818U, // <4,5,4,3>: Cost 2 ins <4,5,u,3>, lane 2 + 2109898753U, // <4,5,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 1034493957U, // <4,5,4,5>: Cost 1 ins RHS, lane 5 + 1707298102U, // <4,5,4,6>: Cost 2 vuzpl <4,u,5,1>, RHS + 1034485762U, // <4,5,4,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <4,5,4,u>: Cost 1 ins RHS, lane 5 + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 1839656656U, // <4,5,5,1>: Cost 2 vzipl RHS, <5,1,7,3> + 2108186626U, // <4,5,5,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2108194818U, // <4,5,5,3>: Cost 2 ins <4,5,u,3>, lane 2 + 1839648710U, // <4,5,5,4>: Cost 2 vzipl RHS, <5,4,7,6> + 1839648772U, // <4,5,5,5>: Cost 2 vzipl RHS, <5,5,5,5> + 1839648866U, // <4,5,5,6>: Cost 2 vzipl RHS, <5,6,7,0> + 1034485762U, // <4,5,5,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,5,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,5,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,1>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,2>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,3>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,5>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2133975044U, // <4,5,7,0>: Cost 2 ins , lane 4 + 2108178434U, // <4,5,7,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // <4,5,7,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2133999620U, // <4,5,7,3>: Cost 2 ins , lane 4 + 2134007812U, // <4,5,7,4>: Cost 2 ins , lane 4 + 2108211202U, // <4,5,7,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2134024196U, // <4,5,7,6>: Cost 2 ins , lane 4 + 1034485762U, // <4,5,7,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,7,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,5,u,0>: Cost 1 ins RHS, 
lane 3 + 1034493957U, // <4,5,u,1>: Cost 1 ins RHS, lane 5 + 1034346499U, // <4,5,u,2>: Cost 1 ins RHS, lane 3 + 1059889156U, // <4,5,u,3>: Cost 1 ins LHS, lane 4 + 1034346499U, // <4,5,u,4>: Cost 1 ins RHS, lane 3 + 1034493957U, // <4,5,u,5>: Cost 1 ins RHS, lane 5 + 1034346499U, // <4,5,u,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 1705426944U, // <4,6,0,0>: Cost 2 vuzpl RHS, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 631685222U, // <4,6,0,2>: Cost 1 vuzpl RHS, LHS + 2108309507U, // <4,6,0,3>: Cost 2 ins <4,6,0,u>, lane 3 + 1705427148U, // <4,6,0,4>: Cost 2 vuzpl RHS, <0,2,4,6> + 2108309507U, // <4,6,0,5>: Cost 2 ins <4,6,0,u>, lane 3 + 2108882946U, // <4,6,0,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2108309507U, // <4,6,0,7>: Cost 2 ins <4,6,0,u>, lane 3 + 631685276U, // <4,6,0,u>: Cost 1 vuzpl RHS, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 1705427764U, // <4,6,1,1>: Cost 2 vuzpl RHS, <1,1,1,1> + 2108850178U, // <4,6,1,2>: Cost 2 ins <4,6,u,2>, lane 2 + 1747681382U, // <4,6,1,3>: Cost 2 vuzpr <0,4,2,6>, LHS + 2779169619U, // <4,6,1,4>: Cost 3 vuzpl RHS, <1,1,4,5> + 1705427968U, // <4,6,1,5>: Cost 2 vuzpl RHS, <1,3,5,7> + 2108882946U, // <4,6,1,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2109702145U, // <4,6,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 1747681387U, // <4,6,1,u>: Cost 2 vuzpr <0,4,2,6>, LHS + 1705428646U, // <4,6,2,0>: Cost 2 vuzpl RHS, <2,3,0,1> + 2779170237U, // <4,6,2,1>: Cost 3 vuzpl RHS, <2,0,1,2> + 1705428584U, // <4,6,2,2>: Cost 2 vuzpl RHS, <2,2,2,2> + 1705428594U, // <4,6,2,3>: Cost 2 vuzpl RHS, <2,2,3,3> + 1705428686U, // <4,6,2,4>: Cost 2 vuzpl RHS, <2,3,4,5> + 2839560386U, // <4,6,2,5>: Cost 3 vuzpr <3,4,5,6>, <0,2,3,5> + 2108882946U, // <4,6,2,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2109775873U, // <4,6,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 1705428639U, // <4,6,2,u>: Cost 2 vuzpl RHS, <2,2,u,3> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 1705429142U, // <4,6,3,1>: Cost 2 vuzpl RHS, <3,0,1,2> + 2108850178U, // <4,6,3,2>: Cost 2 ins <4,6,u,2>, lane 2 + 1705429404U, // <4,6,3,3>: Cost 2 vuzpl RHS, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 1705429506U, // <4,6,3,5>: Cost 2 vuzpl RHS, <3,4,5,6> + 2108882946U, // <4,6,3,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2132410368U, // <4,6,3,7>: Cost 2 ins , lane 0 + 1705429205U, // <4,6,3,u>: Cost 2 vuzpl RHS, <3,0,u,2> + 1705430348U, // <4,6,4,0>: Cost 2 vuzpl RHS, <4,6,0,2> + 2108604419U, // <4,6,4,1>: Cost 2 ins <4,6,4,u>, lane 3 + 2108850178U, // <4,6,4,2>: Cost 2 ins <4,6,u,2>, lane 2 + 2108604419U, // <4,6,4,3>: Cost 2 ins <4,6,4,u>, lane 3 + 1705430224U, // <4,6,4,4>: Cost 2 vuzpl RHS, <4,4,4,4> + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 631688502U, // <4,6,4,6>: Cost 1 vuzpl RHS, RHS + 2108604419U, // <4,6,4,7>: Cost 2 ins <4,6,4,u>, lane 3 + 631688520U, // <4,6,4,u>: Cost 1 vuzpl RHS, RHS + 2839563567U, // <4,6,5,0>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,0> + 1705439360U, // <4,6,5,1>: Cost 2 vuzpl RHS, <5,7,1,3> + 1839657466U, // <4,6,5,2>: Cost 2 vzipl RHS, <6,2,7,3> + 2839563570U, // <4,6,5,3>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,3> + 2839563571U, // <4,6,5,4>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,4> + 1705431044U, // <4,6,5,5>: Cost 2 vuzpl RHS, <5,5,5,5> + 1839649592U, // <4,6,5,6>: Cost 2 vzipl RHS, <6,6,6,6> + 1747684662U, // <4,6,5,7>: Cost 2 vuzpr <0,4,2,6>, RHS + 1747684663U, // <4,6,5,u>: Cost 2 vuzpr <0,4,2,6>, RHS + 1705431886U, // <4,6,6,0>: Cost 2 vuzpl RHS, 
<6,7,0,1> + 2110021633U, // <4,6,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 2110029825U, // <4,6,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2110038017U, // <4,6,6,3>: Cost 2 ins <4,u,6,3>, lane 1 + 1705431926U, // <4,6,6,4>: Cost 2 vuzpl RHS, <6,7,4,5> + 2110054401U, // <4,6,6,5>: Cost 2 ins <4,u,6,5>, lane 1 + 1705431864U, // <4,6,6,6>: Cost 2 vuzpl RHS, <6,6,6,6> + 1036328961U, // <4,6,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,6,6,u>: Cost 1 ins RHS, lane 1 + 2132647936U, // <4,6,7,0>: Cost 2 ins , lane 0 + 1705432058U, // <4,6,7,1>: Cost 2 vuzpl RHS, <7,0,1,2> + 2108850178U, // <4,6,7,2>: Cost 2 ins <4,6,u,2>, lane 2 + 2779173980U, // <4,6,7,3>: Cost 3 vuzpl RHS, <7,1,3,1> + 2132680704U, // <4,6,7,4>: Cost 2 ins , lane 0 + 1705432422U, // <4,6,7,5>: Cost 2 vuzpl RHS, <7,4,5,6> + 2108882946U, // <4,6,7,6>: Cost 2 ins <4,6,u,6>, lane 2 + 1705432684U, // <4,6,7,7>: Cost 2 vuzpl RHS, <7,7,7,7> + 1705432121U, // <4,6,7,u>: Cost 2 vuzpl RHS, <7,0,u,2> + 1705433020U, // <4,6,u,0>: Cost 2 vuzpl RHS, + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 631691054U, // <4,6,u,2>: Cost 1 vuzpl RHS, LHS + 1747681949U, // <4,6,u,3>: Cost 2 vuzpr <0,4,2,6>, LHS + 1705433060U, // <4,6,u,4>: Cost 2 vuzpl RHS, + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 631691418U, // <4,6,u,6>: Cost 1 vuzpl RHS, RHS + 1036328961U, // <4,6,u,7>: Cost 1 ins RHS, lane 1 + 631691108U, // <4,6,u,u>: Cost 1 vuzpl RHS, LHS + 3206537216U, // <4,7,0,0>: Cost 3 ins , lane 0 + 2132803584U, // <4,7,0,1>: Cost 2 ins , lane 0 + 2109587457U, // <4,7,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2845614101U, // <4,7,0,3>: Cost 3 vuzpr <4,4,6,7>, <0,0,2,3> + 3206569984U, // <4,7,0,4>: Cost 3 ins , lane 0 + 3047789926U, // <4,7,0,5>: Cost 3 vtrnl <4,6,0,2>, <7,4,5,6> + 3047789929U, // <4,7,0,6>: Cost 3 vtrnl <4,6,0,2>, <7,4,6,0> + 2109628417U, // <4,7,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2132803584U, // <4,7,0,u>: Cost 2 ins , lane 0 + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3206619136U, // <4,7,1,1>: Cost 3 ins , lane 0 + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 2132893696U, // <4,7,1,3>: Cost 2 ins , lane 0 + 3206643712U, // <4,7,1,4>: Cost 3 ins , lane 0 + 3206651904U, // <4,7,1,5>: Cost 3 ins , lane 0 + 2988265414U, // <4,7,1,6>: Cost 3 vzipr <5,u,4,1>, <5,4,7,6> + 2109702145U, // <4,7,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2132893696U, // <4,7,1,u>: Cost 2 ins , lane 0 + 3206684672U, // <4,7,2,0>: Cost 3 ins , lane 0 + 3206692864U, // <4,7,2,1>: Cost 3 ins , lane 0 + 3206701056U, // <4,7,2,2>: Cost 3 ins , lane 0 + 2132967424U, // <4,7,2,3>: Cost 2 ins , lane 0 + 2833597338U, // <4,7,2,4>: Cost 3 vuzpr <2,4,5,7>, <1,2,3,4> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3206733824U, // <4,7,2,6>: Cost 3 ins , lane 0 + 2109775873U, // <4,7,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2132967424U, // <4,7,2,u>: Cost 2 ins , lane 0 + 3206758400U, // <4,7,3,0>: Cost 3 ins , lane 0 + 3206766592U, // <4,7,3,1>: Cost 3 ins , lane 0 + 3047388245U, // <4,7,3,2>: Cost 3 vtrnl <4,5,3,7>, <7,1,2,3> + 3206782976U, // <4,7,3,3>: Cost 3 ins , lane 0 + 2989609062U, // <4,7,3,4>: Cost 3 vzipr <6,1,4,3>, <5,6,7,4> + 3206799360U, // <4,7,3,5>: Cost 3 ins , lane 0 + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 2109849601U, // <4,7,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2109849601U, // <4,7,3,u>: Cost 2 ins <4,u,3,7>, lane 1 + 2583199846U, // <4,7,4,0>: Cost 3 vext1 <5,4,7,4>, LHS + 3048117242U, // <4,7,4,1>: Cost 3 vtrnl <4,6,4,6>, <7,0,1,2> + 3183624193U, // <4,7,4,2>: Cost 3 ins 
<4,u,4,2>, lane 1 + 2979659923U, // <4,7,4,3>: Cost 3 vzipr <4,4,4,4>, <0,1,7,3> + 2109898753U, // <4,7,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2133131264U, // <4,7,4,5>: Cost 2 ins , lane 0 + 2109915137U, // <4,7,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1771875557U, // <4,7,4,7>: Cost 2 vuzpr <4,4,6,7>, <4,4,6,7> + 2133131264U, // <4,7,4,u>: Cost 2 ins , lane 0 + 1839649786U, // <4,7,5,0>: Cost 2 vzipl RHS, <7,0,1,2> + 2109947905U, // <4,7,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 2913391781U, // <4,7,5,2>: Cost 3 vzipl RHS, <7,2,2,2> + 2913391843U, // <4,7,5,3>: Cost 3 vzipl RHS, <7,3,0,1> + 1839650150U, // <4,7,5,4>: Cost 2 vzipl RHS, <7,4,5,6> + 2109980673U, // <4,7,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 2913392145U, // <4,7,5,6>: Cost 3 vzipl RHS, <7,6,6,6> + 1839650412U, // <4,7,5,7>: Cost 2 vzipl RHS, <7,7,7,7> + 1839650434U, // <4,7,5,u>: Cost 2 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 1973867514U, // <4,7,6,1>: Cost 2 vtrnl RHS, <7,0,1,2> + 2110029825U, // <4,7,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2110038017U, // <4,7,6,3>: Cost 2 ins <4,u,6,3>, lane 1 + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1973867878U, // <4,7,6,5>: Cost 2 vtrnl RHS, <7,4,5,6> + 2110062593U, // <4,7,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,7,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,7,6,u>: Cost 1 ins RHS, lane 1 + 2914587642U, // <4,7,7,0>: Cost 3 vzipl <4,7,5,0>, <7,0,1,2> + 2779862010U, // <4,7,7,1>: Cost 3 vuzpl <4,6,7,1>, <7,0,1,2> + 2779247701U, // <4,7,7,2>: Cost 3 vuzpl <4,5,7,7>, <7,1,2,3> + 3207077888U, // <4,7,7,3>: Cost 3 ins , lane 0 + 2914620774U, // <4,7,7,4>: Cost 3 vzipl <4,7,5,4>, <7,4,5,6> + 2779895142U, // <4,7,7,5>: Cost 3 vuzpl <4,6,7,5>, <7,4,5,6> + 2992295878U, // <4,7,7,6>: Cost 3 vzipr <6,5,4,7>, <5,4,7,6> + 2133368832U, // <4,7,7,7>: Cost 2 ins , lane 0 + 2133368832U, // <4,7,7,u>: Cost 2 ins , lane 0 + 1841640442U, // <4,7,u,0>: Cost 2 vzipl RHS, <7,0,1,2> + 1974014970U, // <4,7,u,1>: Cost 2 vtrnl RHS, <7,0,1,2> + 2109587457U, // <4,7,u,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2132893696U, // <4,7,u,3>: Cost 2 ins , lane 0 + 1841640806U, // <4,7,u,4>: Cost 2 vzipl RHS, <7,4,5,6> + 1974015334U, // <4,7,u,5>: Cost 2 vtrnl RHS, <7,4,5,6> + 2109915137U, // <4,7,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,7,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,7,u,u>: Cost 1 ins RHS, lane 1 + 1705574400U, // <4,u,0,0>: Cost 2 vuzpl RHS, <0,0,0,0> + 1034493957U, // <4,u,0,1>: Cost 1 ins RHS, lane 5 + 631832678U, // <4,u,0,2>: Cost 1 vuzpl RHS, LHS + 2108309507U, // <4,u,0,3>: Cost 2 ins <4,6,0,u>, lane 3 + 1705574604U, // <4,u,0,4>: Cost 2 vuzpl RHS, <0,2,4,6> + 2107547650U, // <4,u,0,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1974048922U, // <4,u,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS + 1034485762U, // <4,u,0,7>: Cost 1 ins RHS, lane 2 + 631832732U, // <4,u,0,u>: Cost 1 vuzpl RHS, LHS + 2108170242U, // <4,u,1,0>: Cost 2 ins <4,5,u,0>, lane 2 + 1705575220U, // <4,u,1,1>: Cost 2 vuzpl RHS, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1747624038U, // <4,u,1,3>: Cost 2 vuzpr <0,4,1,u>, LHS + 2107539458U, // <4,u,1,4>: Cost 2 ins <4,4,u,4>, lane 2 + 1705575424U, // <4,u,1,5>: Cost 2 vuzpl RHS, <1,3,5,7> + 2107555842U, // <4,u,1,6>: Cost 2 ins <4,4,u,6>, lane 2 + 1034485762U, // <4,u,1,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,1,u>: Cost 1 ins RHS, lane 2 + 1705576102U, // <4,u,2,0>: Cost 2 vuzpl RHS, <2,3,0,1> + 2104860674U, // <4,u,2,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1705576040U, // <4,u,2,2>: Cost 2 vuzpl 
RHS, <2,2,2,2> + 1055244288U, // <4,u,2,3>: Cost 1 ins LHS, lane 0 + 1705576142U, // <4,u,2,4>: Cost 2 vuzpl RHS, <2,3,4,5> + 2107547650U, // <4,u,2,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131001344U, // <4,u,2,6>: Cost 2 ins , lane 0 + 1034485762U, // <4,u,2,7>: Cost 1 ins RHS, lane 2 + 1055244288U, // <4,u,2,u>: Cost 1 ins LHS, lane 0 + 2129698816U, // <4,u,3,0>: Cost 2 ins , lane 0 + 1705576598U, // <4,u,3,1>: Cost 2 vuzpl RHS, <3,0,1,2> + 2128388096U, // <4,u,3,2>: Cost 2 ins , lane 0 + 1705576860U, // <4,u,3,3>: Cost 2 vuzpl RHS, <3,3,3,3> + 2129731584U, // <4,u,3,4>: Cost 2 ins , lane 0 + 1705576962U, // <4,u,3,5>: Cost 2 vuzpl RHS, <3,4,5,6> + 2107555842U, // <4,u,3,6>: Cost 2 ins <4,4,u,6>, lane 2 + 1034485762U, // <4,u,3,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,3,u>: Cost 1 ins RHS, lane 2 + 1705577804U, // <4,u,4,0>: Cost 2 vuzpl RHS, <4,6,0,2> + 2104860674U, // <4,u,4,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1974376238U, // <4,u,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS + 2108604419U, // <4,u,4,3>: Cost 2 ins <4,6,4,u>, lane 3 + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1034493957U, // <4,u,4,5>: Cost 1 ins RHS, lane 5 + 631835958U, // <4,u,4,6>: Cost 1 vuzpl RHS, RHS + 1034485762U, // <4,u,4,7>: Cost 1 ins RHS, lane 2 + 631835976U, // <4,u,4,u>: Cost 1 vuzpl RHS, RHS + 1839650515U, // <4,u,5,0>: Cost 2 vzipl RHS, + 765908782U, // <4,u,5,1>: Cost 1 vzipl RHS, LHS + 1839650693U, // <4,u,5,2>: Cost 2 vzipl RHS, + 2016035485U, // <4,u,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS + 1839650879U, // <4,u,5,4>: Cost 2 vzipl RHS, + 765909146U, // <4,u,5,5>: Cost 1 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 1034485762U, // <4,u,5,7>: Cost 1 ins RHS, lane 2 + 765909349U, // <4,u,5,u>: Cost 1 vzipl RHS, LHS + 1034346499U, // <4,u,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,1>: Cost 1 ins RHS, lane 3 + 900126510U, // <4,u,6,2>: Cost 1 vtrnl RHS, LHS + 1034346499U, // <4,u,6,3>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,5>: Cost 1 ins RHS, lane 3 + 900126874U, // <4,u,6,6>: Cost 1 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2133975044U, // <4,u,7,0>: Cost 2 ins , lane 4 + 1705579514U, // <4,u,7,1>: Cost 2 vuzpl RHS, <7,0,1,2> + 2104868866U, // <4,u,7,2>: Cost 2 ins <4,0,u,2>, lane 2 + 2129354752U, // <4,u,7,3>: Cost 2 ins , lane 0 + 2134007812U, // <4,u,7,4>: Cost 2 ins , lane 4 + 1705579878U, // <4,u,7,5>: Cost 2 vuzpl RHS, <7,4,5,6> + 2131369984U, // <4,u,7,6>: Cost 2 ins , lane 0 + 1034485762U, // <4,u,7,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,7,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,u,u,0>: Cost 1 ins RHS, lane 3 + 767899438U, // <4,u,u,1>: Cost 1 vzipl RHS, LHS + 631838510U, // <4,u,u,2>: Cost 1 vuzpl RHS, LHS + 1055244288U, // <4,u,u,3>: Cost 1 ins LHS, lane 0 + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 767899802U, // <4,u,u,5>: Cost 1 vzipl RHS, RHS + 631838874U, // <4,u,u,6>: Cost 1 vuzpl RHS, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2128150528U, // <5,0,0,0>: Cost 2 ins , lane 0 + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 2846220309U, // <5,0,0,3>: Cost 3 vuzpr <4,5,6,0>, <0,0,2,3> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 2583318482U, // <5,0,0,5>: Cost 3 vext1 <5,5,0,0>, <5,5,0,0> + 3189334017U, // <5,0,0,6>: Cost 3 ins <5,u,0,6>, lane 1 + 2846223265U, // <5,0,0,7>: 
Cost 3 vuzpr <4,5,6,0>, <4,0,6,7> + 2128150528U, // <5,0,0,u>: Cost 2 ins , lane 0 + 1503608934U, // <5,0,1,0>: Cost 2 vext1 <4,5,0,1>, LHS + 1843003494U, // <5,0,1,1>: Cost 2 vzipl <5,1,7,3>, LHS + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2115641345U, // <5,0,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 1611612282U, // <5,0,1,4>: Cost 2 vext3 <0,1,4,5>, <0,1,4,5> + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3202015232U, // <5,0,1,6>: Cost 3 ins , lane 0 + 3189415937U, // <5,0,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2128314368U, // <5,0,2,2>: Cost 2 ins , lane 0 + 2128322560U, // <5,0,2,3>: Cost 2 ins , lane 0 + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 3189481473U, // <5,0,2,6>: Cost 3 ins <5,u,2,6>, lane 1 + 2595280262U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,5,0,2> + 2128314368U, // <5,0,2,u>: Cost 2 ins , lane 0 + 3202113536U, // <5,0,3,0>: Cost 3 ins , lane 0 + 2918047846U, // <5,0,3,1>: Cost 3 vzipl <5,3,7,0>, LHS + 2128388096U, // <5,0,3,2>: Cost 2 ins , lane 0 + 3189530625U, // <5,0,3,3>: Cost 3 ins <5,u,3,3>, lane 1 + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 2785315330U, // <5,0,3,5>: Cost 3 vuzpl <5,6,0,1>, <3,4,5,6> + 3202162688U, // <5,0,3,6>: Cost 3 ins , lane 0 + 2840323072U, // <5,0,3,7>: Cost 3 vuzpr <3,5,7,0>, <1,3,5,7> + 2128388096U, // <5,0,3,u>: Cost 2 ins , lane 0 + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3184336899U, // <5,0,4,3>: Cost 3 ins <5,0,4,u>, lane 3 + 2687345005U, // <5,0,4,4>: Cost 3 vext3 <0,4,4,5>, <0,4,4,5> + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2846222850U, // <5,0,4,6>: Cost 3 vuzpr <4,5,6,0>, <3,4,5,6> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1845019293U, // <5,0,4,u>: Cost 2 vzipl <5,4,7,6>, LHS + 1772481839U, // <5,0,5,0>: Cost 2 vuzpr <4,5,6,0>, <4,5,6,0> + 1845526630U, // <5,0,5,1>: Cost 2 vzipl <5,5,5,5>, LHS + 1979744358U, // <5,0,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS + 3189678081U, // <5,0,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 2919268690U, // <5,0,5,4>: Cost 3 vzipl <5,5,5,5>, <0,4,1,5> + 2115952641U, // <5,0,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 3202310144U, // <5,0,5,6>: Cost 3 ins , lane 0 + 2115969025U, // <5,0,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1845527197U, // <5,0,5,u>: Cost 2 vzipl <5,5,5,5>, LHS + 2973777920U, // <5,0,6,0>: Cost 3 vzipr <3,4,5,6>, <0,0,0,0> + 1846296678U, // <5,0,6,1>: Cost 2 vzipl <5,6,7,0>, LHS + 2128609280U, // <5,0,6,2>: Cost 2 ins , lane 0 + 3189751809U, // <5,0,6,3>: Cost 3 ins <5,u,6,3>, lane 1 + 2920038738U, // <5,0,6,4>: Cost 3 vzipl <5,6,7,0>, <0,4,1,5> + 2920038866U, // <5,0,6,5>: Cost 3 vzipl <5,6,7,0>, <0,5,6,7> + 3189776385U, // <5,0,6,6>: Cost 3 ins <5,u,6,6>, lane 1 + 2128650240U, // <5,0,6,7>: Cost 2 ins , lane 0 + 1846297245U, // <5,0,6,u>: Cost 2 vzipl <5,6,7,0>, LHS + 2040971264U, // <5,0,7,0>: Cost 2 vtrnr RHS, <0,0,0,0> + 2040971274U, // <5,0,7,1>: Cost 2 vtrnr RHS, <0,0,1,1> + 2040971284U, // <5,0,7,2>: Cost 2 vtrnr RHS, <0,0,2,2> + 2116083713U, // <5,0,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, // <5,0,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 3114715316U, // <5,0,7,5>: Cost 3 vtrnr RHS, <3,0,4,5> + 
2116108289U, // <5,0,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // <5,0,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 2040971281U, // <5,0,7,u>: Cost 2 vtrnr RHS, <0,0,1,u> + 2040979456U, // <5,0,u,0>: Cost 2 vtrnr RHS, <0,0,0,0> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2115641345U, // <5,0,u,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2116091905U, // <5,0,u,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2115952641U, // <5,0,u,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2116108289U, // <5,0,u,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2115969025U, // <5,0,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 1712324710U, // <5,1,0,2>: Cost 2 vuzpl <5,7,1,3>, LHS + 2111512578U, // <5,1,0,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2977710418U, // <5,1,0,5>: Cost 3 vzipr <4,1,5,0>, <0,4,1,5> + 3185278978U, // <5,1,0,6>: Cost 3 ins <5,1,u,6>, lane 2 + 3184705539U, // <5,1,0,7>: Cost 3 ins <5,1,0,u>, lane 3 + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2128896000U, // <5,1,1,1>: Cost 2 ins , lane 0 + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2115641345U, // <5,1,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3189407745U, // <5,1,1,6>: Cost 3 ins <5,u,1,6>, lane 1 + 2982367283U, // <5,1,1,7>: Cost 3 vzipr <4,u,5,1>, <5,6,1,7> + 2115641345U, // <5,1,1,u>: Cost 2 ins <5,u,1,3>, lane 1 + 2128961536U, // <5,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <5,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <5,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <5,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <5,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <5,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <5,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <5,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <5,1,2,u>: Cost 1 ins LHS, lane 0 + 2571468902U, // <5,1,3,0>: Cost 3 vext1 <3,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 2571470542U, // <5,1,3,2>: Cost 3 vext1 <3,5,1,3>, <2,3,4,5> + 2129059840U, // <5,1,3,3>: Cost 2 ins , lane 0 + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 2595361654U, // <5,1,3,6>: Cost 3 vext1 <7,5,1,3>, <6,7,4,5> + 2840331264U, // <5,1,3,7>: Cost 3 vuzpr <3,5,7,1>, <1,3,5,7> + 2129059840U, // <5,1,3,u>: Cost 2 ins , lane 0 + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2111512578U, // <5,1,4,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 1712327990U, // <5,1,4,6>: Cost 2 vuzpl <5,7,1,3>, RHS + 3185000451U, // <5,1,4,7>: Cost 3 ins <5,1,4,u>, lane 3 + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 1712328832U, // <5,1,5,1>: Cost 2 vuzpl <5,7,1,3>, <5,7,1,3> + 2982398102U, // <5,1,5,2>: Cost 3 vzipr <4,u,5,5>, <3,0,1,2> + 2046853222U, // <5,1,5,3>: Cost 2 vtrnr <5,5,5,5>, LHS + 2687124631U, // <5,1,5,4>: 
Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2115952641U, // <5,1,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2115969025U, // <5,1,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 2046853227U, // <5,1,5,u>: Cost 2 vtrnr <5,5,5,5>, LHS + 2920039158U, // <5,1,6,0>: Cost 3 vzipl <5,6,7,0>, <1,0,3,2> + 2961834642U, // <5,1,6,1>: Cost 3 vzipr <1,4,5,6>, <0,u,1,1> + 2973780118U, // <5,1,6,2>: Cost 3 vzipr <3,4,5,6>, <3,0,1,2> + 2111512578U, // <5,1,6,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2224227480U, // <5,1,6,4>: Cost 3 vrev <1,5,4,6> + 2973778258U, // <5,1,6,5>: Cost 3 vzipr <3,4,5,6>, <0,4,1,5> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2111553541U, // <5,1,6,7>: Cost 2 ins <5,1,u,u>, lane 5 + 2111512578U, // <5,1,6,u>: Cost 2 ins <5,1,u,3>, lane 2 + 2116059137U, // <5,1,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2040972084U, // <5,1,7,1>: Cost 2 vtrnr RHS, <1,1,1,1> + 2111479811U, // <5,1,7,2>: Cost 2 ins <5,1,7,u>, lane 3 + 967229542U, // <5,1,7,3>: Cost 1 vtrnr RHS, LHS + 2116091905U, // <5,1,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2111479811U, // <5,1,7,5>: Cost 2 ins <5,1,7,u>, lane 3 + 2116108289U, // <5,1,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // <5,1,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 967229547U, // <5,1,7,u>: Cost 1 vtrnr RHS, LHS + 2116059137U, // <5,1,u,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2040980276U, // <5,1,u,1>: Cost 2 vtrnr RHS, <1,1,1,1> + 1712330542U, // <5,1,u,2>: Cost 2 vuzpl <5,7,1,3>, LHS + 967237734U, // <5,1,u,3>: Cost 1 vtrnr RHS, LHS + 2116091905U, // <5,1,u,4>: Cost 2 ins <5,u,7,4>, lane 1 + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 1712330906U, // <5,1,u,6>: Cost 2 vuzpl <5,7,1,3>, RHS + 2115969025U, // <5,1,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 967237739U, // <5,1,u,u>: Cost 1 vtrnr RHS, LHS + 2786132132U, // <5,2,0,0>: Cost 3 vuzpl <5,7,2,2>, <0,2,0,2> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2129494016U, // <5,2,0,2>: Cost 2 ins , lane 0 + 2973728870U, // <5,2,0,3>: Cost 3 vzipr <3,4,5,0>, LHS + 2786164940U, // <5,2,0,4>: Cost 3 vuzpl <5,7,2,6>, <0,2,4,6> + 2782158977U, // <5,2,0,5>: Cost 3 vuzpl <5,1,2,3>, <0,1,5,3> + 3185942530U, // <5,2,0,6>: Cost 3 ins <5,2,u,6>, lane 2 + 3114658883U, // <5,2,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,2,6,7> + 2129494016U, // <5,2,0,u>: Cost 2 ins , lane 0 + 3054503590U, // <5,2,1,0>: Cost 3 vtrnl <5,7,1,3>, <2,3,0,1> + 3203301376U, // <5,2,1,1>: Cost 3 ins , lane 0 + 2982363156U, // <5,2,1,2>: Cost 3 vzipr <4,u,5,1>, <0,0,2,2> + 1908621414U, // <5,2,1,3>: Cost 2 vzipr <4,u,5,1>, LHS + 3054503630U, // <5,2,1,4>: Cost 3 vtrnl <5,7,1,3>, <2,3,4,5> + 2601390208U, // <5,2,1,5>: Cost 3 vext1 , <5,7,1,3> + 2982363484U, // <5,2,1,6>: Cost 3 vzipr <4,u,5,1>, <0,4,2,6> + 3189415937U, // <5,2,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1908621419U, // <5,2,1,u>: Cost 2 vzipr <4,u,5,1>, LHS + 3203366912U, // <5,2,2,0>: Cost 3 ins , lane 0 + 3203375104U, // <5,2,2,1>: Cost 3 ins , lane 0 + 2129641472U, // <5,2,2,2>: Cost 2 ins , lane 0 + 2129649664U, // <5,2,2,3>: Cost 2 ins , lane 0 + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 2698036870U, // <5,2,2,5>: Cost 3 vext3 <2,2,5,5>, <2,2,5,5> + 3189481473U, // <5,2,2,6>: Cost 3 ins <5,u,2,6>, lane 1 + 2846239811U, // <5,2,2,7>: Cost 3 vuzpr <4,5,6,2>, <4,2,6,7> + 2129641472U, // <5,2,2,u>: Cost 2 ins , lane 0 + 2129698816U, // <5,2,3,0>: Cost 2 ins , lane 0 + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 2129723392U, // 
<5,2,3,3>: Cost 2 ins , lane 0 + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2717943511U, // <5,2,3,5>: Cost 3 vext3 <5,5,5,5>, <2,3,5,5> + 3203489792U, // <5,2,3,6>: Cost 3 ins , lane 0 + 2827879424U, // <5,2,3,7>: Cost 3 vuzpr <1,5,0,2>, <1,3,5,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 3203514368U, // <5,2,4,0>: Cost 3 ins , lane 0 + 3189587969U, // <5,2,4,1>: Cost 3 ins <5,u,4,1>, lane 1 + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 3203547136U, // <5,2,4,4>: Cost 3 ins , lane 0 + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2129821696U, // <5,2,4,6>: Cost 2 ins , lane 0 + 2846239973U, // <5,2,4,7>: Cost 3 vuzpr <4,5,6,2>, <4,4,6,7> + 2129821696U, // <5,2,4,u>: Cost 2 ins , lane 0 + 3053487782U, // <5,2,5,0>: Cost 3 vtrnl <5,5,5,5>, <2,3,0,1> + 3203596288U, // <5,2,5,1>: Cost 3 ins , lane 0 + 1772498225U, // <5,2,5,2>: Cost 2 vuzpr <4,5,6,2>, <4,5,6,2> + 1908654182U, // <5,2,5,3>: Cost 2 vzipr <4,u,5,5>, LHS + 3053487822U, // <5,2,5,4>: Cost 3 vtrnl <5,5,5,5>, <2,3,4,5> + 2115952641U, // <5,2,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2982396252U, // <5,2,5,6>: Cost 3 vzipr <4,u,5,5>, <0,4,2,6> + 2115969025U, // <5,2,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1908654187U, // <5,2,5,u>: Cost 2 vzipr <4,u,5,5>, LHS + 3203661824U, // <5,2,6,0>: Cost 3 ins , lane 0 + 3189735425U, // <5,2,6,1>: Cost 3 ins <5,u,6,1>, lane 1 + 2973777940U, // <5,2,6,2>: Cost 3 vzipr <3,4,5,6>, <0,0,2,2> + 1900036198U, // <5,2,6,3>: Cost 2 vzipr <3,4,5,6>, LHS + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 2973778186U, // <5,2,6,5>: Cost 3 vzipr <3,4,5,6>, <0,3,2,5> + 2973778268U, // <5,2,6,6>: Cost 3 vzipr <3,4,5,6>, <0,4,2,6> + 2129977344U, // <5,2,6,7>: Cost 2 ins , lane 0 + 1900036203U, // <5,2,6,u>: Cost 2 vzipr <3,4,5,6>, LHS + 2040972182U, // <5,2,7,0>: Cost 2 vtrnr RHS, <1,2,3,0> + 3114713251U, // <5,2,7,1>: Cost 3 vtrnr RHS, <0,2,0,1> + 2040971428U, // <5,2,7,2>: Cost 2 vtrnr RHS, <0,2,0,2> + 1887436902U, // <5,2,7,3>: Cost 2 vzipr <1,3,5,7>, LHS + 2040972186U, // <5,2,7,4>: Cost 2 vtrnr RHS, <1,2,3,4> + 2961178728U, // <5,2,7,5>: Cost 3 vzipr <1,3,5,7>, <0,1,2,5> + 2040971468U, // <5,2,7,6>: Cost 2 vtrnr RHS, <0,2,4,6> + 2116116481U, // <5,2,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 1887436907U, // <5,2,7,u>: Cost 2 vzipr <1,3,5,7>, LHS + 2040980374U, // <5,2,u,0>: Cost 2 vtrnr RHS, <1,2,3,0> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2040979620U, // <5,2,u,2>: Cost 2 vtrnr RHS, <0,2,0,2> + 1887445094U, // <5,2,u,3>: Cost 2 vzipr <1,3,5,u>, LHS + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2115952641U, // <5,2,u,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2040979660U, // <5,2,u,6>: Cost 2 vtrnr RHS, <0,2,4,6> + 2115969025U, // <5,2,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1887445099U, // <5,2,u,u>: Cost 2 vzipr <1,3,5,u>, LHS + 3203883008U, // <5,3,0,0>: Cost 3 ins , lane 0 + 2130149376U, // <5,3,0,1>: Cost 2 ins , lane 0 + 2782904422U, // <5,3,0,2>: Cost 3 vuzpl <5,2,3,4>, LHS + 3186581506U, // <5,3,0,3>: Cost 3 ins <5,3,u,3>, lane 2 + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3053750786U, // <5,3,0,5>: Cost 3 vtrnl <5,6,0,1>, <3,4,5,6> + 2618302971U, // <5,3,0,6>: Cost 3 vext2 <0,1,5,3>, <0,6,2,3> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2130149376U, // <5,3,0,u>: Cost 2 ins , lane 0 + 2982364054U, // <5,3,1,0>: Cost 3 vzipr <4,u,5,1>, <1,2,3,0> + 3054504086U, // <5,3,1,1>: Cost 3 vtrnl <5,7,1,3>, <3,0,1,2> 
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2130239488U, // <5,3,1,3>: Cost 2 ins , lane 0 + 2982364058U, // <5,3,1,4>: Cost 3 vzipr <4,u,5,1>, <1,2,3,4> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3189407745U, // <5,3,1,6>: Cost 3 ins <5,u,1,6>, lane 1 + 2964448400U, // <5,3,1,7>: Cost 3 vzipr <1,u,5,1>, <1,5,3,7> + 2130239488U, // <5,3,1,u>: Cost 2 ins , lane 0 + 2235845154U, // <5,3,2,0>: Cost 3 vrev <3,5,0,2> + 3204038656U, // <5,3,2,1>: Cost 3 ins , lane 0 + 3204046848U, // <5,3,2,2>: Cost 3 ins , lane 0 + 2130313216U, // <5,3,2,3>: Cost 2 ins , lane 0 + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3204079616U, // <5,3,2,6>: Cost 3 ins , lane 0 + 3096314880U, // <5,3,2,7>: Cost 3 vtrnr <1,5,0,2>, <1,3,5,7> + 2130313216U, // <5,3,2,u>: Cost 2 ins , lane 0 + 3204104192U, // <5,3,3,0>: Cost 3 ins , lane 0 + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3204120576U, // <5,3,3,2>: Cost 3 ins , lane 0 + 2130386944U, // <5,3,3,3>: Cost 2 ins , lane 0 + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3189555201U, // <5,3,3,6>: Cost 3 ins <5,u,3,6>, lane 1 + 2971763856U, // <5,3,3,7>: Cost 3 vzipr <3,1,5,3>, <1,5,3,7> + 2130386944U, // <5,3,3,u>: Cost 2 ins , lane 0 + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 2642193381U, // <5,3,4,1>: Cost 3 vext2 <4,1,5,3>, <4,1,5,3> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2130477056U, // <5,3,4,5>: Cost 2 ins , lane 0 + 2846247426U, // <5,3,4,6>: Cost 3 vuzpr <4,5,6,3>, <3,4,5,6> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2130477056U, // <5,3,4,u>: Cost 2 ins , lane 0 + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 3053488278U, // <5,3,5,1>: Cost 3 vtrnl <5,5,5,5>, <3,0,1,2> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 1748320682U, // <5,3,5,3>: Cost 2 vuzpr <0,5,2,3>, <0,5,2,3> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2115952641U, // <5,3,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 3204300800U, // <5,3,5,6>: Cost 3 ins , lane 0 + 2130567168U, // <5,3,5,7>: Cost 2 ins , lane 0 + 2130567168U, // <5,3,5,u>: Cost 2 ins , lane 0 + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3204333568U, // <5,3,6,1>: Cost 3 ins , lane 0 + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 2973778114U, // <5,3,6,5>: Cost 3 vzipr <3,4,5,6>, <0,2,3,5> + 2973779816U, // <5,3,6,6>: Cost 3 vzipr <3,4,5,6>, <2,5,3,6> + 2130640896U, // <5,3,6,7>: Cost 2 ins , lane 0 + 2130640896U, // <5,3,6,u>: Cost 2 ins , lane 0 + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2961179382U, // <5,3,7,2>: Cost 3 vzipr <1,3,5,7>, <1,0,3,2> + 2040972248U, // <5,3,7,3>: Cost 2 vtrnr RHS, <1,3,1,3> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2040973006U, // <5,3,7,5>: Cost 2 vtrnr RHS, <2,3,4,5> + 2116108289U, // <5,3,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2040972288U, // <5,3,7,7>: Cost 2 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // 
<5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2961187574U, // <5,3,u,2>: Cost 3 vzipr <1,3,5,u>, <1,0,3,2> + 2040980440U, // <5,3,u,3>: Cost 2 vtrnr RHS, <1,3,1,3> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2040981198U, // <5,3,u,5>: Cost 2 vtrnr RHS, <2,3,4,5> + 2116108289U, // <5,3,u,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2040980480U, // <5,3,u,7>: Cost 2 vtrnr RHS, <1,3,5,7> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3189284865U, // <5,4,0,0>: Cost 3 ins <5,u,0,0>, lane 1 + 2113544197U, // <5,4,0,1>: Cost 2 ins <5,4,u,u>, lane 5 + 2781626470U, // <5,4,0,2>: Cost 3 vuzpl <5,0,4,1>, LHS + 2242022676U, // <5,4,0,3>: Cost 3 vrev <4,5,3,0> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2113527810U, // <5,4,0,6>: Cost 2 ins <5,4,u,6>, lane 2 + 3114659045U, // <5,4,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,4,6,7> + 2113544197U, // <5,4,0,u>: Cost 2 ins <5,4,u,u>, lane 5 + 1168067834U, // <5,4,1,0>: Cost 2 vrev <4,5,0,1> + 3189366785U, // <5,4,1,1>: Cost 3 ins <5,u,1,1>, lane 1 + 3204636672U, // <5,4,1,2>: Cost 3 ins , lane 0 + 2115641345U, // <5,4,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2982366416U, // <5,4,1,4>: Cost 3 vzipr <4,u,5,1>, <4,4,4,4> + 1843006774U, // <5,4,1,5>: Cost 2 vzipl <5,1,7,3>, RHS + 1980763446U, // <5,4,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS + 3189415937U, // <5,4,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1843007017U, // <5,4,1,u>: Cost 2 vzipl <5,1,7,3>, RHS + 3204694016U, // <5,4,2,0>: Cost 3 ins , lane 0 + 2241891588U, // <5,4,2,1>: Cost 3 vrev <4,5,1,2> + 3189448705U, // <5,4,2,2>: Cost 3 ins <5,u,2,2>, lane 1 + 2113544197U, // <5,4,2,3>: Cost 2 ins <5,4,u,u>, lane 5 + 3204726784U, // <5,4,2,4>: Cost 3 ins , lane 0 + 2973746894U, // <5,4,2,5>: Cost 3 vzipr <3,4,5,2>, <2,3,4,5> + 2131001344U, // <5,4,2,6>: Cost 2 ins , lane 0 + 3114675429U, // <5,4,2,7>: Cost 3 vtrnr <4,5,6,2>, <4,4,6,7> + 2113544197U, // <5,4,2,u>: Cost 2 ins <5,4,u,u>, lane 5 + 3204767744U, // <5,4,3,0>: Cost 3 ins , lane 0 + 2241899781U, // <5,4,3,1>: Cost 3 vrev <4,5,1,3> + 1168231694U, // <5,4,3,2>: Cost 2 vrev <4,5,2,3> + 3189530625U, // <5,4,3,3>: Cost 3 ins <5,u,3,3>, lane 1 + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 2978399950U, // <5,4,3,5>: Cost 3 vzipr <4,2,5,3>, <2,3,4,5> + 2113527810U, // <5,4,3,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2840355840U, // <5,4,3,7>: Cost 3 vuzpr <3,5,7,4>, <1,3,5,7> + 2113527810U, // <5,4,3,u>: Cost 2 ins <5,4,u,6>, lane 2 + 2918763410U, // <5,4,4,0>: Cost 3 vzipl <5,4,7,6>, <4,0,5,1> + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3186991107U, // <5,4,4,2>: Cost 3 ins <5,4,4,u>, lane 3 + 3186991107U, // <5,4,4,3>: Cost 3 ins <5,4,4,u>, lane 3 + 2131132416U, // <5,4,4,4>: Cost 2 ins , lane 0 + 1845022006U, // <5,4,4,5>: Cost 2 vzipl <5,4,7,6>, RHS + 2113527810U, // <5,4,4,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1845022249U, // <5,4,4,u>: Cost 2 vzipl <5,4,7,6>, RHS + 1503936614U, // <5,4,5,0>: Cost 2 vext1 <4,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3189678081U, // <5,4,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 1168395554U, // <5,4,5,4>: Cost 2 vrev <4,5,4,5> + 1845529910U, // <5,4,5,5>: Cost 2 vzipl <5,5,5,5>, RHS + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 2115969025U, // <5,4,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 
2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771800U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,5,4,6> + 3189743617U, // <5,4,6,2>: Cost 3 ins <5,u,6,2>, lane 1 + 2571717194U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,5,4,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 1846299958U, // <5,4,6,5>: Cost 2 vzipl <5,6,7,0>, RHS + 2131296256U, // <5,4,6,6>: Cost 2 ins , lane 0 + 2113544197U, // <5,4,6,7>: Cost 2 ins <5,4,u,u>, lane 5 + 1846300201U, // <5,4,6,u>: Cost 2 vzipl <5,6,7,0>, RHS + 2116059137U, // <5,4,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2113470467U, // <5,4,7,1>: Cost 2 ins <5,4,7,u>, lane 3 + 2113470467U, // <5,4,7,2>: Cost 2 ins <5,4,7,u>, lane 3 + 2116083713U, // <5,4,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2040974544U, // <5,4,7,4>: Cost 2 vtrnr RHS, <4,4,4,4> + 2040971602U, // <5,4,7,5>: Cost 2 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2116116481U, // <5,4,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2116059137U, // <5,4,u,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2113544197U, // <5,4,u,1>: Cost 2 ins <5,4,u,u>, lane 5 + 2113470467U, // <5,4,u,2>: Cost 2 ins <5,4,7,u>, lane 3 + 2115641345U, // <5,4,u,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2040982736U, // <5,4,u,4>: Cost 2 vtrnr RHS, <4,4,4,4> + 2040979794U, // <5,4,u,5>: Cost 2 vtrnr RHS, <0,4,1,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2115969025U, // <5,4,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2040917295U, // <5,5,0,0>: Cost 2 vtrnr <4,5,6,0>, <4,5,6,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 1711308902U, // <5,5,0,2>: Cost 2 vuzpl <5,5,5,5>, LHS + 3187908610U, // <5,5,0,3>: Cost 3 ins <5,5,u,3>, lane 2 + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2114183170U, // <5,5,0,5>: Cost 2 ins <5,5,u,5>, lane 2 + 3187933186U, // <5,5,0,6>: Cost 3 ins <5,5,u,6>, lane 2 + 2114199554U, // <5,5,0,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 1908624922U, // <5,5,1,1>: Cost 2 vzipr <4,u,5,1>, <4,u,5,1> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 1778417766U, // <5,5,1,3>: Cost 2 vuzpr <5,5,5,5>, LHS + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2114183170U, // <5,5,1,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2982365698U, // <5,5,1,6>: Cost 3 vzipr <4,u,5,1>, <3,4,5,6> + 2114199554U, // <5,5,1,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1778417771U, // <5,5,1,u>: Cost 2 vuzpr <5,5,5,5>, LHS + 2785052326U, // <5,5,2,0>: Cost 3 vuzpl <5,5,5,5>, <2,3,0,1> + 3205365760U, // <5,5,2,1>: Cost 3 ins , lane 0 + 2040933681U, // <5,5,2,2>: Cost 2 vtrnr <4,5,6,2>, <4,5,6,2> + 2114207749U, // <5,5,2,3>: Cost 2 ins <5,5,u,u>, lane 5 + 2785052366U, // <5,5,2,4>: Cost 3 vuzpl <5,5,5,5>, <2,3,4,5> + 2114183170U, // <5,5,2,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 2114199554U, // <5,5,2,7>: Cost 2 ins <5,5,u,7>, lane 2 + 2114207749U, // <5,5,2,u>: Cost 2 ins <5,5,u,u>, lane 5 + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 2785052822U, // <5,5,3,1>: Cost 3 vuzpl <5,5,5,5>, <3,0,1,2> + 3187900418U, // <5,5,3,2>: Cost 3 ins <5,5,u,2>, lane 2 + 1880105089U, // <5,5,3,3>: Cost 2 vzipr <0,1,5,3>, <0,1,5,3> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2114183170U, // <5,5,3,5>: Cost 2 ins <5,5,u,5>, lane 2 + 3205480448U, // <5,5,3,6>: Cost 3 ins , lane 0 + 2131746816U, // <5,5,3,7>: Cost 
2 ins , lane 0 + 2131746816U, // <5,5,3,u>: Cost 2 ins , lane 0 + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2716987279U, // <5,5,4,1>: Cost 3 vext3 <5,4,1,5>, <5,4,1,5> + 3187900418U, // <5,5,4,2>: Cost 3 ins <5,5,u,2>, lane 2 + 3187908610U, // <5,5,4,3>: Cost 3 ins <5,5,u,3>, lane 2 + 1845022662U, // <5,5,4,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 1711312182U, // <5,5,4,6>: Cost 2 vuzpl <5,5,5,5>, RHS + 2114199554U, // <5,5,4,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2113986563U, // <5,5,5,1>: Cost 2 ins <5,5,5,u>, lane 3 + 2113986563U, // <5,5,5,2>: Cost 2 ins <5,5,5,u>, lane 3 + 2113986563U, // <5,5,5,3>: Cost 2 ins <5,5,5,u>, lane 3 + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2113986563U, // <5,5,5,6>: Cost 2 ins <5,5,5,u>, lane 3 + 1778421046U, // <5,5,5,7>: Cost 2 vuzpr <5,5,5,5>, RHS + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2131910656U, // <5,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <5,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <5,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <5,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <5,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <5,5,6,5>: Cost 2 ins , lane 0 + 1900038658U, // <5,5,6,6>: Cost 2 vzipr <3,4,5,6>, <3,4,5,6> + 1058226176U, // <5,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <5,5,6,u>: Cost 1 ins RHS, lane 0 + 2116059137U, // <5,5,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2114134019U, // <5,5,7,1>: Cost 2 ins <5,5,7,u>, lane 3 + 2114134019U, // <5,5,7,2>: Cost 2 ins <5,5,7,u>, lane 3 + 2116083713U, // <5,5,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, // <5,5,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2040975364U, // <5,5,7,5>: Cost 2 vtrnr RHS, <5,5,5,5> + 2116108289U, // <5,5,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 967232822U, // <5,5,7,7>: Cost 1 vtrnr RHS, RHS + 967232823U, // <5,5,7,u>: Cost 1 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 1711314734U, // <5,5,u,2>: Cost 2 vuzpl <5,5,5,5>, LHS + 1778418333U, // <5,5,u,3>: Cost 2 vuzpr <5,5,5,5>, LHS + 1845022662U, // <5,5,u,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 1711315098U, // <5,5,u,6>: Cost 2 vuzpl <5,5,5,5>, RHS + 967241014U, // <5,5,u,7>: Cost 1 vtrnr RHS, RHS + 967241015U, // <5,5,u,u>: Cost 1 vtrnr RHS, RHS + 2114805762U, // <5,6,0,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2132148224U, // <5,6,0,2>: Cost 2 ins , lane 0 + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2114838530U, // <5,6,0,4>: Cost 2 ins <5,6,u,4>, lane 2 + 3188588546U, // <5,6,0,5>: Cost 3 ins <5,6,u,5>, lane 2 + 3188596738U, // <5,6,0,6>: Cost 3 ins <5,6,u,6>, lane 2 + 2973732150U, // <5,6,0,7>: Cost 3 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2114805762U, // <5,6,1,0>: Cost 2 ins <5,6,u,0>, lane 2 + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2115641345U, // <5,6,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2114838530U, // <5,6,1,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 2982366436U, // <5,6,1,6>: Cost 3 vzipr <4,u,5,1>, <4,4,6,6> + 1908624694U, // <5,6,1,7>: Cost 2 vzipr <4,u,5,1>, RHS 
+ 1908624695U, // <5,6,1,u>: Cost 2 vzipr <4,u,5,1>, RHS + 2114805762U, // <5,6,2,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3188555778U, // <5,6,2,1>: Cost 3 ins <5,6,u,1>, lane 2 + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2114871301U, // <5,6,2,3>: Cost 2 ins <5,6,u,u>, lane 5 + 2114838530U, // <5,6,2,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2964458806U, // <5,6,2,7>: Cost 3 vzipr <1,u,5,2>, RHS + 2114805762U, // <5,6,2,u>: Cost 2 ins <5,6,u,0>, lane 2 + 2114805762U, // <5,6,3,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3206103040U, // <5,6,3,1>: Cost 3 ins , lane 0 + 3206111232U, // <5,6,3,2>: Cost 3 ins , lane 0 + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2783119874U, // <5,6,3,5>: Cost 3 vuzpl <5,2,6,3>, <3,4,5,6> + 3206144000U, // <5,6,3,6>: Cost 3 ins , lane 0 + 2132410368U, // <5,6,3,7>: Cost 2 ins , lane 0 + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2114805762U, // <5,6,4,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3189587969U, // <5,6,4,1>: Cost 3 ins <5,u,4,1>, lane 1 + 2918765050U, // <5,6,4,2>: Cost 3 vzipl <5,4,7,6>, <6,2,7,3> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2114838530U, // <5,6,4,4>: Cost 2 ins <5,6,u,4>, lane 2 + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2132475904U, // <5,6,4,6>: Cost 2 ins , lane 0 + 2972437814U, // <5,6,4,7>: Cost 3 vzipr <3,2,5,4>, RHS + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2114805762U, // <5,6,5,0>: Cost 2 ins <5,6,u,0>, lane 2 + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 2982398876U, // <5,6,5,2>: Cost 3 vzipr <4,u,5,5>, <4,0,6,2> + 3189678081U, // <5,6,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 2114838530U, // <5,6,5,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2115952641U, // <5,6,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 1772530997U, // <5,6,5,6>: Cost 2 vuzpr <4,5,6,6>, <4,5,6,6> + 1908657462U, // <5,6,5,7>: Cost 2 vzipr <4,u,5,5>, RHS + 1908657463U, // <5,6,5,u>: Cost 2 vzipr <4,u,5,5>, RHS + 2114805762U, // <5,6,6,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3189735425U, // <5,6,6,1>: Cost 3 ins <5,u,6,1>, lane 1 + 2920043002U, // <5,6,6,2>: Cost 3 vzipl <5,6,7,0>, <6,2,7,3> + 2973781298U, // <5,6,6,3>: Cost 3 vzipr <3,4,5,6>, <4,5,6,3> + 2114838530U, // <5,6,6,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2973781138U, // <5,6,6,5>: Cost 3 vzipr <3,4,5,6>, <4,3,6,5> + 2132623360U, // <5,6,6,6>: Cost 2 ins , lane 0 + 1900039478U, // <5,6,6,7>: Cost 2 vzipr <3,4,5,6>, RHS + 1900039479U, // <5,6,6,u>: Cost 2 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1887440182U, // <5,6,7,7>: Cost 2 vzipr <1,3,5,7>, RHS + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, 
<6,2,7,3> + 1887448374U, // <5,6,u,7>: Cost 2 vzipr <1,3,5,u>, RHS + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 1772535808U, // <5,7,0,0>: Cost 2 vuzpr RHS, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 1772535828U, // <5,7,0,2>: Cost 2 vuzpr RHS, <0,0,2,2> + 2115493890U, // <5,7,0,3>: Cost 2 ins <5,7,u,3>, lane 2 + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2846279860U, // <5,7,0,5>: Cost 3 vuzpr RHS, <3,0,4,5> + 2846277674U, // <5,7,0,6>: Cost 3 vuzpr RHS, <0,0,4,6> + 2115526658U, // <5,7,0,7>: Cost 2 ins <5,7,u,7>, lane 2 + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2115018755U, // <5,7,1,0>: Cost 2 ins <5,7,1,u>, lane 3 + 1772536628U, // <5,7,1,1>: Cost 2 vuzpr RHS, <1,1,1,1> + 2115018755U, // <5,7,1,2>: Cost 2 ins <5,7,1,u>, lane 3 + 698794086U, // <5,7,1,3>: Cost 1 vuzpr RHS, LHS + 2115018755U, // <5,7,1,4>: Cost 2 ins <5,7,1,u>, lane 3 + 2115018755U, // <5,7,1,5>: Cost 2 ins <5,7,1,u>, lane 3 + 2115018755U, // <5,7,1,6>: Cost 2 ins <5,7,1,u>, lane 3 + 2115526658U, // <5,7,1,7>: Cost 2 ins <5,7,u,7>, lane 2 + 698794091U, // <5,7,1,u>: Cost 1 vuzpr RHS, LHS + 1772536726U, // <5,7,2,0>: Cost 2 vuzpr RHS, <1,2,3,0> + 2846277795U, // <5,7,2,1>: Cost 3 vuzpr RHS, <0,2,0,1> + 1772535972U, // <5,7,2,2>: Cost 2 vuzpr RHS, <0,2,0,2> + 1772537458U, // <5,7,2,3>: Cost 2 vuzpr RHS, <2,2,3,3> + 1772536730U, // <5,7,2,4>: Cost 2 vuzpr RHS, <1,2,3,4> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 1772536012U, // <5,7,2,6>: Cost 2 vuzpr RHS, <0,2,4,6> + 2115526658U, // <5,7,2,7>: Cost 2 ins <5,7,u,7>, lane 2 + 1772535978U, // <5,7,2,u>: Cost 2 vuzpr RHS, <0,2,0,u> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 1772537510U, // <5,7,3,1>: Cost 2 vuzpr RHS, <2,3,0,1> + 2846278606U, // <5,7,3,2>: Cost 3 vuzpr RHS, <1,3,0,2> + 1772536792U, // <5,7,3,3>: Cost 2 vuzpr RHS, <1,3,1,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 1772537550U, // <5,7,3,5>: Cost 2 vuzpr RHS, <2,3,4,5> + 2846278628U, // <5,7,3,6>: Cost 3 vuzpr RHS, <1,3,2,6> + 1772536832U, // <5,7,3,7>: Cost 2 vuzpr RHS, <1,3,5,7> + 1772536797U, // <5,7,3,u>: Cost 2 vuzpr RHS, <1,3,1,u> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 2846277958U, // <5,7,4,2>: Cost 3 vuzpr RHS, <0,4,0,2> + 2115493890U, // <5,7,4,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1772539088U, // <5,7,4,4>: Cost 2 vuzpr RHS, <4,4,4,4> + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 1772536156U, // <5,7,4,6>: Cost 2 vuzpr RHS, <0,4,2,6> + 2115526658U, // <5,7,4,7>: Cost 2 ins <5,7,u,7>, lane 2 + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2115313667U, // <5,7,5,0>: Cost 2 ins <5,7,5,u>, lane 3 + 2115313667U, // <5,7,5,1>: Cost 2 ins <5,7,5,u>, lane 3 + 2115313667U, // <5,7,5,2>: Cost 2 ins <5,7,5,u>, lane 3 + 2115493890U, // <5,7,5,3>: Cost 2 ins <5,7,u,3>, lane 2 + 2115313667U, // <5,7,5,4>: Cost 2 ins <5,7,5,u>, lane 3 + 1772539908U, // <5,7,5,5>: Cost 2 vuzpr RHS, <5,5,5,5> + 2115313667U, // <5,7,5,6>: Cost 2 ins <5,7,5,u>, lane 3 + 698797366U, // <5,7,5,7>: Cost 1 vuzpr RHS, RHS + 698797367U, // <5,7,5,u>: Cost 1 vuzpr RHS, RHS + 1772540002U, // <5,7,6,0>: Cost 2 vuzpr RHS, <5,6,7,0> + 2846279577U, // <5,7,6,1>: Cost 3 vuzpr RHS, <2,6,0,1> + 1772539212U, // <5,7,6,2>: Cost 2 vuzpr RHS, <4,6,0,2> + 2115493890U, // <5,7,6,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1772540006U, // <5,7,6,4>: Cost 2 vuzpr RHS, <5,6,7,4> + 2846279617U, // <5,7,6,5>: Cost 3 vuzpr RHS, 
<2,6,4,5> + 1772539252U, // <5,7,6,6>: Cost 2 vuzpr RHS, <4,6,4,6> + 1772537786U, // <5,7,6,7>: Cost 2 vuzpr RHS, <2,6,3,7> + 1772537787U, // <5,7,6,u>: Cost 2 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 1772540750U, // <5,7,7,1>: Cost 2 vuzpr RHS, <6,7,0,1> + 2846281846U, // <5,7,7,2>: Cost 3 vuzpr RHS, <5,7,0,2> + 1772540032U, // <5,7,7,3>: Cost 2 vuzpr RHS, <5,7,1,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1772540790U, // <5,7,7,5>: Cost 2 vuzpr RHS, <6,7,4,5> + 2116108289U, // <5,7,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 1772540072U, // <5,7,7,7>: Cost 2 vuzpr RHS, <5,7,5,7> + 1772540037U, // <5,7,7,u>: Cost 2 vuzpr RHS, <5,7,1,u> + 1772537212U, // <5,7,u,0>: Cost 2 vuzpr RHS, <1,u,3,0> + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 1772536458U, // <5,7,u,2>: Cost 2 vuzpr RHS, <0,u,0,2> + 698794653U, // <5,7,u,3>: Cost 1 vuzpr RHS, LHS + 1772537216U, // <5,7,u,4>: Cost 2 vuzpr RHS, <1,u,3,4> + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 1772536480U, // <5,7,u,6>: Cost 2 vuzpr RHS, <0,u,2,6> + 698797609U, // <5,7,u,7>: Cost 1 vuzpr RHS, RHS + 698794658U, // <5,7,u,u>: Cost 1 vuzpr RHS, LHS + 1772544000U, // <5,u,0,0>: Cost 2 vuzpr RHS, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1772544020U, // <5,u,0,2>: Cost 2 vuzpr RHS, <0,0,2,2> + 2111512578U, // <5,u,0,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2114838530U, // <5,u,0,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2114183170U, // <5,u,0,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2113527810U, // <5,u,0,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2114199554U, // <5,u,0,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2114805762U, // <5,u,1,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1772544820U, // <5,u,1,1>: Cost 2 vuzpr RHS, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 698802278U, // <5,u,1,3>: Cost 1 vuzpr RHS, LHS + 2114838530U, // <5,u,1,4>: Cost 2 ins <5,6,u,4>, lane 2 + 1843009690U, // <5,u,1,5>: Cost 2 vzipl <5,1,7,3>, RHS + 1980766362U, // <5,u,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS + 1908624712U, // <5,u,1,7>: Cost 2 vzipr <4,u,5,1>, RHS + 698802283U, // <5,u,1,u>: Cost 1 vuzpr RHS, LHS + 1772544918U, // <5,u,2,0>: Cost 2 vuzpr RHS, <1,2,3,0> + 2128969728U, // <5,u,2,1>: Cost 2 ins , lane 0 + 1772544164U, // <5,u,2,2>: Cost 2 vuzpr RHS, <0,2,0,2> + 1055244288U, // <5,u,2,3>: Cost 1 ins LHS, lane 0 + 1772544922U, // <5,u,2,4>: Cost 2 vuzpr RHS, <1,2,3,4> + 2129002496U, // <5,u,2,5>: Cost 2 ins , lane 0 + 1772544204U, // <5,u,2,6>: Cost 2 vuzpr RHS, <0,2,4,6> + 2114199554U, // <5,u,2,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1055244288U, // <5,u,2,u>: Cost 1 ins LHS, lane 0 + 2129698816U, // <5,u,3,0>: Cost 2 ins , lane 0 + 1772545702U, // <5,u,3,1>: Cost 2 vuzpr RHS, <2,3,0,1> + 2128388096U, // <5,u,3,2>: Cost 2 ins , lane 0 + 1772544984U, // <5,u,3,3>: Cost 2 vuzpr RHS, <1,3,1,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 1772545742U, // <5,u,3,5>: Cost 2 vuzpr RHS, <2,3,4,5> + 2113527810U, // <5,u,3,6>: Cost 2 ins <5,4,u,6>, lane 2 + 1772545024U, // <5,u,3,7>: Cost 2 vuzpr RHS, <1,3,5,7> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 2114805762U, // <5,u,4,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1845024558U, // <5,u,4,1>: Cost 2 vzipl <5,4,7,6>, LHS + 2642897979U, // <5,u,4,2>: Cost 3 vext2 <4,2,5,u>, <4,2,5,u> + 2111512578U, // <5,u,4,3>: Cost 2 ins <5,1,u,3>, lane 2 + 1772547280U, // <5,u,4,4>: Cost 2 vuzpr RHS, <4,4,4,4> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS 
+ 1772544348U, // <5,u,4,6>: Cost 2 vuzpr RHS, <0,4,2,6> + 2114199554U, // <5,u,4,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1845532462U, // <5,u,5,1>: Cost 2 vzipl <5,5,5,5>, LHS + 1979750190U, // <5,u,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS + 1908654236U, // <5,u,5,3>: Cost 2 vzipr <4,u,5,5>, LHS + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 698805558U, // <5,u,5,7>: Cost 1 vuzpr RHS, RHS + 698805559U, // <5,u,5,u>: Cost 1 vuzpr RHS, RHS + 1772548194U, // <5,u,6,0>: Cost 2 vuzpr RHS, <5,6,7,0> + 1846302510U, // <5,u,6,1>: Cost 2 vzipl <5,6,7,0>, LHS + 1772547404U, // <5,u,6,2>: Cost 2 vuzpr RHS, <4,6,0,2> + 1900036252U, // <5,u,6,3>: Cost 2 vzipr <3,4,5,6>, LHS + 1772548198U, // <5,u,6,4>: Cost 2 vuzpr RHS, <5,6,7,4> + 1846302874U, // <5,u,6,5>: Cost 2 vzipl <5,6,7,0>, RHS + 1772547444U, // <5,u,6,6>: Cost 2 vuzpr RHS, <4,6,4,6> + 1058226176U, // <5,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <5,u,6,u>: Cost 1 ins RHS, lane 0 + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 2040971914U, // <5,u,7,2>: Cost 2 vtrnr RHS, <0,u,0,2> + 967230109U, // <5,u,7,3>: Cost 1 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 2040971926U, // <5,u,7,5>: Cost 2 vtrnr RHS, <0,u,1,5> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 967233065U, // <5,u,7,7>: Cost 1 vtrnr RHS, RHS + 967230114U, // <5,u,7,u>: Cost 1 vtrnr RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 698802845U, // <5,u,u,3>: Cost 1 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 698805801U, // <5,u,u,7>: Cost 1 vuzpr RHS, RHS + 698802850U, // <5,u,u,u>: Cost 1 vuzpr RHS, LHS + 2128150528U, // <6,0,0,0>: Cost 2 ins , lane 0 + 2121523201U, // <6,0,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 1718206566U, // <6,0,0,2>: Cost 2 vuzpl <6,7,0,1>, LHS + 2852933922U, // <6,0,0,3>: Cost 3 vuzpr <5,6,7,0>, <6,0,1,3> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 2852934680U, // <6,0,0,5>: Cost 3 vuzpr <5,6,7,0>, <7,0,4,5> + 2852934690U, // <6,0,0,6>: Cost 3 vuzpr <5,6,7,0>, <7,0,5,6> + 2852933962U, // <6,0,0,7>: Cost 3 vuzpr <5,6,7,0>, <6,0,5,7> + 1718206620U, // <6,0,0,u>: Cost 2 vuzpl <6,7,0,1>, LHS + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 2128232448U, // <6,0,1,1>: Cost 2 ins , lane 0 + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1779187814U, // <6,0,1,3>: Cost 2 vuzpr <5,6,7,0>, LHS + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2791949566U, // <6,0,1,7>: Cost 3 vuzpl <6,7,0,1>, <1,6,7,0> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504280678U, // <6,0,2,0>: Cost 2 vext1 <4,6,0,2>, LHS + 1849639014U, // <6,0,2,1>: Cost 2 vzipl <6,2,7,3>, LHS + 2128314368U, // <6,0,2,2>: Cost 2 ins , lane 0 + 2128322560U, // <6,0,2,3>: Cost 2 ins , lane 0 + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 2578026192U, // <6,0,2,5>: Cost 3 vext1 <4,6,0,2>, <5,1,7,3> + 2578026792U, // <6,0,2,6>: Cost 3 vext1 <4,6,0,2>, <6,0,2,0> + 
2578027514U, // <6,0,2,7>: Cost 3 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3202113536U, // <6,0,3,0>: Cost 3 ins , lane 0 + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2128388096U, // <6,0,3,2>: Cost 2 ins , lane 0 + 2852930520U, // <6,0,3,3>: Cost 3 vuzpr <5,6,7,0>, <1,3,1,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 2852931278U, // <6,0,3,5>: Cost 3 vuzpr <5,6,7,0>, <2,3,4,5> + 3190587394U, // <6,0,3,6>: Cost 3 ins <6,0,u,6>, lane 2 + 2852930560U, // <6,0,3,7>: Cost 3 vuzpr <5,6,7,0>, <1,3,5,7> + 2128388096U, // <6,0,3,u>: Cost 2 ins , lane 0 + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3195576321U, // <6,0,4,3>: Cost 3 ins <6,u,4,3>, lane 1 + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2121850881U, // <6,0,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718209846U, // <6,0,4,6>: Cost 2 vuzpl <6,7,0,1>, RHS + 3195609089U, // <6,0,4,7>: Cost 3 ins <6,u,4,7>, lane 1 + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3202260992U, // <6,0,5,0>: Cost 3 ins , lane 0 + 2128527360U, // <6,0,5,1>: Cost 2 ins , lane 0 + 3056156774U, // <6,0,5,2>: Cost 3 vtrnl <6,0,5,7>, LHS + 3190562818U, // <6,0,5,3>: Cost 3 ins <6,0,u,3>, lane 2 + 3058802892U, // <6,0,5,4>: Cost 3 vtrnl <6,4,5,6>, <0,2,4,6> + 2852933636U, // <6,0,5,5>: Cost 3 vuzpr <5,6,7,0>, <5,5,5,5> + 2852932908U, // <6,0,5,6>: Cost 3 vuzpr <5,6,7,0>, <4,5,5,6> + 1779191094U, // <6,0,5,7>: Cost 2 vuzpr <5,6,7,0>, RHS + 1779191095U, // <6,0,5,u>: Cost 2 vuzpr <5,6,7,0>, RHS + 1779191906U, // <6,0,6,0>: Cost 2 vuzpr <5,6,7,0>, <5,6,7,0> + 1852244070U, // <6,0,6,1>: Cost 2 vzipl <6,6,6,6>, LHS + 1986461798U, // <6,0,6,2>: Cost 2 vtrnl <6,6,6,6>, LHS + 3195723777U, // <6,0,6,3>: Cost 3 ins <6,u,6,3>, lane 1 + 2852933734U, // <6,0,6,4>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,4> + 3195740161U, // <6,0,6,5>: Cost 3 ins <6,u,6,5>, lane 1 + 2122006529U, // <6,0,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2128650240U, // <6,0,6,7>: Cost 2 ins , lane 0 + 1852244637U, // <6,0,6,u>: Cost 2 vzipl <6,6,6,6>, LHS + 1906753536U, // <6,0,7,0>: Cost 2 vzipr RHS, <0,0,0,0> + 1906755238U, // <6,0,7,1>: Cost 2 vzipr RHS, <2,3,0,1> + 1906753700U, // <6,0,7,2>: Cost 2 vzipr RHS, <0,2,0,2> + 2122055681U, // <6,0,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 2980496418U, // <6,0,7,5>: Cost 3 vzipr RHS, <1,4,0,5> + 2980495690U, // <6,0,7,6>: Cost 3 vzipr RHS, <0,4,0,6> + 2122088449U, // <6,0,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 1906753706U, // <6,0,7,u>: Cost 2 vzipr RHS, <0,2,0,u> + 1906761728U, // <6,0,u,0>: Cost 2 vzipr RHS, <0,0,0,0> + 1906763430U, // <6,0,u,1>: Cost 2 vzipr RHS, <2,3,0,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1779188381U, // <6,0,u,3>: Cost 2 vuzpr <5,6,7,0>, LHS + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2121850881U, // <6,0,u,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718212762U, // <6,0,u,6>: Cost 2 vuzpl <6,7,0,1>, RHS + 1779191337U, // <6,0,u,7>: Cost 2 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2121523201U, // <6,1,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 2846673046U, // <6,1,0,2>: Cost 3 vuzpr <4,6,3,1>, <3,0,1,2> + 2047623270U, // <6,1,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS + 2787385548U, // <6,1,0,4>: Cost 3 vuzpl <6,0,1,2>, 
<0,2,4,6> + 3060384768U, // <6,1,0,5>: Cost 3 vtrnl <6,7,0,1>, <1,3,5,7> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 3060385022U, // <6,1,0,7>: Cost 3 vtrnl <6,7,0,1>, <1,6,7,0> + 2047623275U, // <6,1,0,u>: Cost 2 vtrnr <5,6,7,0>, LHS + 2578088038U, // <6,1,1,0>: Cost 3 vext1 <4,6,1,1>, LHS + 2128896000U, // <6,1,1,1>: Cost 2 ins , lane 0 + 2981778426U, // <6,1,1,2>: Cost 3 vzipr <4,7,6,1>, <7,0,1,2> + 2128912384U, // <6,1,1,3>: Cost 2 ins , lane 0 + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3202670592U, // <6,1,1,5>: Cost 3 ins , lane 0 + 2691482470U, // <6,1,1,6>: Cost 3 vext3 <1,1,6,6>, <1,1,6,6> + 2980449545U, // <6,1,1,7>: Cost 3 vzipr <4,5,6,1>, <4,5,1,7> + 2128896000U, // <6,1,1,u>: Cost 2 ins , lane 0 + 2128961536U, // <6,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <6,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <6,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <6,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <6,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <6,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <6,1,2,6>: Cost 2 ins , lane 0 + 2129018880U, // <6,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <6,1,2,u>: Cost 1 ins LHS, lane 0 + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 2129059840U, // <6,1,3,3>: Cost 2 ins , lane 0 + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 2953923849U, // <6,1,3,7>: Cost 3 vzipr <0,1,6,3>, <4,5,1,7> + 2129059840U, // <6,1,3,u>: Cost 2 ins , lane 0 + 2788724044U, // <6,1,4,0>: Cost 3 vuzpl <6,2,1,3>, <4,6,0,2> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3195568129U, // <6,1,4,2>: Cost 3 ins <6,u,4,2>, lane 1 + 2047656038U, // <6,1,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS + 2791378292U, // <6,1,4,4>: Cost 3 vuzpl <6,6,1,3>, <4,6,4,6> + 2121850881U, // <6,1,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 2834506076U, // <6,1,4,6>: Cost 3 vuzpr <2,6,0,1>, <0,4,2,6> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2047656043U, // <6,1,4,u>: Cost 2 vtrnr <5,6,7,4>, LHS + 2578120806U, // <6,1,5,0>: Cost 3 vext1 <4,6,1,5>, LHS + 2578121728U, // <6,1,5,1>: Cost 3 vext1 <4,6,1,5>, <1,3,5,7> + 3202940928U, // <6,1,5,2>: Cost 3 ins , lane 0 + 2129207296U, // <6,1,5,3>: Cost 2 ins , lane 0 + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3202965504U, // <6,1,5,5>: Cost 3 ins , lane 0 + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 2834509110U, // <6,1,5,7>: Cost 3 vuzpr <2,6,0,1>, RHS + 2129207296U, // <6,1,5,u>: Cost 2 ins , lane 0 + 2925986550U, // <6,1,6,0>: Cost 3 vzipl <6,6,6,6>, <1,0,3,2> + 2834507673U, // <6,1,6,1>: Cost 3 vuzpr <2,6,0,1>, <2,6,0,1> + 2982480022U, // <6,1,6,2>: Cost 3 vzipr <4,u,6,6>, <3,0,1,2> + 2041479270U, // <6,1,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS + 2602020150U, // <6,1,6,4>: Cost 3 vext1 , RHS + 2982478162U, // <6,1,6,5>: Cost 3 vzipr <4,u,6,6>, <0,4,1,5> + 2122006529U, // <6,1,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2129313792U, // <6,1,6,7>: Cost 2 ins , lane 0 + 2041479275U, // <6,1,6,u>: Cost 2 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 1906753546U, // <6,1,7,1>: Cost 2 vzipr RHS, <0,0,1,1> + 1906755734U, // <6,1,7,2>: Cost 2 vzipr RHS, <3,0,1,2> + 2029469798U, // <6,1,7,3>: Cost 2 vtrnr <2,6,3,7>, LHS + 2560224566U, // 
<6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 1906753874U, // <6,1,7,5>: Cost 2 vzipr RHS, <0,4,1,5> + 2980495537U, // <6,1,7,6>: Cost 3 vzipr RHS, <0,2,1,6> + 2122088449U, // <6,1,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 2029469803U, // <6,1,7,u>: Cost 2 vtrnr <2,6,3,7>, LHS + 2128961536U, // <6,1,u,0>: Cost 2 ins , lane 0 + 1906761738U, // <6,1,u,1>: Cost 2 vzipr RHS, <0,0,1,1> + 1906763926U, // <6,1,u,2>: Cost 2 vzipr RHS, <3,0,1,2> + 1055244288U, // <6,1,u,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <6,1,u,4>: Cost 2 ins , lane 0 + 1906762066U, // <6,1,u,5>: Cost 2 vzipr RHS, <0,4,1,5> + 2129010688U, // <6,1,u,6>: Cost 2 ins , lane 0 + 2122088449U, // <6,1,u,7>: Cost 2 ins <6,u,7,7>, lane 1 + 1055244288U, // <6,1,u,u>: Cost 1 ins LHS, lane 0 + 2846457856U, // <6,2,0,0>: Cost 3 vuzpr <4,6,0,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2129494016U, // <6,2,0,2>: Cost 2 ins , lane 0 + 2118148098U, // <6,2,0,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3195297793U, // <6,2,0,5>: Cost 3 ins <6,u,0,5>, lane 1 + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3195314177U, // <6,2,0,7>: Cost 3 ins <6,u,0,7>, lane 1 + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2846458676U, // <6,2,1,1>: Cost 3 vuzpr <4,6,0,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 1772716134U, // <6,2,1,3>: Cost 2 vuzpr <4,6,0,2>, LHS + 3191414787U, // <6,2,1,4>: Cost 3 ins <6,2,1,u>, lane 3 + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 3114885324U, // <6,2,1,6>: Cost 3 vtrnr <4,6,0,1>, <0,2,4,6> + 3191922690U, // <6,2,1,7>: Cost 3 ins <6,2,u,7>, lane 2 + 1772716139U, // <6,2,1,u>: Cost 2 vuzpr <4,6,0,2>, LHS + 2846458774U, // <6,2,2,0>: Cost 3 vuzpr <4,6,0,2>, <1,2,3,0> + 3195412481U, // <6,2,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2129641472U, // <6,2,2,2>: Cost 2 ins , lane 0 + 1908703334U, // <6,2,2,3>: Cost 2 vzipr <4,u,6,2>, LHS + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3195445249U, // <6,2,2,5>: Cost 3 ins <6,u,2,5>, lane 1 + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 2846462444U, // <6,2,2,7>: Cost 3 vuzpr <4,6,0,2>, <6,2,5,7> + 1908703339U, // <6,2,2,u>: Cost 2 vzipr <4,u,6,2>, LHS + 2129698816U, // <6,2,3,0>: Cost 2 ins , lane 0 + 2230618020U, // <6,2,3,1>: Cost 3 vrev <2,6,1,3> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2129723392U, // <6,2,3,3>: Cost 2 ins , lane 0 + 2129731584U, // <6,2,3,4>: Cost 2 ins , lane 0 + 2846459598U, // <6,2,3,5>: Cost 3 vuzpr <4,6,0,2>, <2,3,4,5> + 2966528348U, // <6,2,3,6>: Cost 3 vzipr <2,2,6,3>, <0,4,2,6> + 2846458880U, // <6,2,3,7>: Cost 3 vuzpr <4,6,0,2>, <1,3,5,7> + 2129698816U, // <6,2,3,u>: Cost 2 ins , lane 0 + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3191873538U, // <6,2,4,1>: Cost 3 ins <6,2,u,1>, lane 2 + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2118148098U, // <6,2,4,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2129821696U, // <6,2,4,6>: Cost 2 ins , lane 0 + 3195609089U, // <6,2,4,7>: Cost 3 ins <6,u,4,7>, lane 1 + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3191709699U, // <6,2,5,0>: Cost 3 ins <6,2,5,u>, lane 3 + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3203604480U, // <6,2,5,2>: Cost 3 ins , lane 0 + 2118148098U, 
// <6,2,5,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2846461956U, // <6,2,5,5>: Cost 3 vuzpr <4,6,0,2>, <5,5,5,5> + 3115213004U, // <6,2,5,6>: Cost 3 vtrnr <4,6,4,5>, <0,2,4,6> + 1772719414U, // <6,2,5,7>: Cost 2 vuzpr <4,6,0,2>, RHS + 1772719415U, // <6,2,5,u>: Cost 2 vuzpr <4,6,0,2>, RHS + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 3195707393U, // <6,2,6,1>: Cost 3 ins <6,u,6,1>, lane 1 + 1772719436U, // <6,2,6,2>: Cost 2 vuzpr <4,6,0,2>, <4,6,0,2> + 1908736102U, // <6,2,6,3>: Cost 2 vzipr <4,u,6,6>, LHS + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 3195740161U, // <6,2,6,5>: Cost 3 ins <6,u,6,5>, lane 1 + 2122006529U, // <6,2,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2118189061U, // <6,2,6,7>: Cost 2 ins <6,2,u,u>, lane 5 + 1908736107U, // <6,2,6,u>: Cost 2 vzipr <4,u,6,6>, LHS + 2118115331U, // <6,2,7,0>: Cost 2 ins <6,2,7,u>, lane 3 + 2118115331U, // <6,2,7,1>: Cost 2 ins <6,2,7,u>, lane 3 + 1906753556U, // <6,2,7,2>: Cost 2 vzipr RHS, <0,0,2,2> + 833011814U, // <6,2,7,3>: Cost 1 vzipr RHS, LHS + 2118115331U, // <6,2,7,4>: Cost 2 ins <6,2,7,u>, lane 3 + 2118115331U, // <6,2,7,5>: Cost 2 ins <6,2,7,u>, lane 3 + 1906753884U, // <6,2,7,6>: Cost 2 vzipr RHS, <0,4,2,6> + 2122088449U, // <6,2,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 833011819U, // <6,2,7,u>: Cost 1 vzipr RHS, LHS + 2129698816U, // <6,2,u,0>: Cost 2 ins , lane 0 + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 1906761748U, // <6,2,u,2>: Cost 2 vzipr RHS, <0,0,2,2> + 833020006U, // <6,2,u,3>: Cost 1 vzipr RHS, LHS + 2129731584U, // <6,2,u,4>: Cost 2 ins , lane 0 + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 1906762076U, // <6,2,u,6>: Cost 2 vzipr RHS, <0,4,2,6> + 1772719657U, // <6,2,u,7>: Cost 2 vuzpr <4,6,0,2>, RHS + 833020011U, // <6,2,u,u>: Cost 1 vzipr RHS, LHS + 3203883008U, // <6,3,0,0>: Cost 3 ins , lane 0 + 2130149376U, // <6,3,0,1>: Cost 2 ins , lane 0 + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3121365976U, // <6,3,0,3>: Cost 3 vtrnr <5,6,7,0>, <1,3,1,3> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 3121366734U, // <6,3,0,5>: Cost 3 vtrnr <5,6,7,0>, <2,3,4,5> + 3195305985U, // <6,3,0,6>: Cost 3 ins <6,u,0,6>, lane 1 + 3121366016U, // <6,3,0,7>: Cost 3 vtrnr <5,6,7,0>, <1,3,5,7> + 2130149376U, // <6,3,0,u>: Cost 2 ins , lane 0 + 2578235494U, // <6,3,1,0>: Cost 3 vext1 <4,6,3,1>, LHS + 3203964928U, // <6,3,1,1>: Cost 3 ins , lane 0 + 3203973120U, // <6,3,1,2>: Cost 3 ins , lane 0 + 2130239488U, // <6,3,1,3>: Cost 2 ins , lane 0 + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3203997696U, // <6,3,1,5>: Cost 3 ins , lane 0 + 2822725737U, // <6,3,1,6>: Cost 3 vuzpr <0,6,2,3>, <0,1,2,6> + 2970494906U, // <6,3,1,7>: Cost 3 vzipr <2,u,6,1>, <2,6,3,7> + 2130239488U, // <6,3,1,u>: Cost 2 ins , lane 0 + 2982445974U, // <6,3,2,0>: Cost 3 vzipr <4,u,6,2>, <1,2,3,0> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 2630985357U, // <6,3,2,2>: Cost 3 vext2 <2,2,6,3>, <2,2,6,3> + 2130313216U, // <6,3,2,3>: Cost 2 ins , lane 0 + 2982445978U, // <6,3,2,4>: Cost 3 vzipr <4,u,6,2>, <1,2,3,4> + 3114895054U, // <6,3,2,5>: Cost 3 vtrnr <4,6,0,2>, <2,3,4,5> + 2834596044U, // <6,3,2,6>: Cost 3 vuzpr <2,6,1,3>, <0,2,4,6> + 3114894336U, // <6,3,2,7>: Cost 3 vtrnr <4,6,0,2>, <1,3,5,7> + 2130313216U, // <6,3,2,u>: Cost 2 ins , lane 0 + 2578251878U, // <6,3,3,0>: Cost 3 vext1 <4,6,3,3>, LHS + 2792163478U, // <6,3,3,1>: Cost 3 vuzpl <6,7,3,0>, <3,0,1,2> + 2636958054U, // <6,3,3,2>: 
Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2130386944U, // <6,3,3,3>: Cost 2 ins , lane 0 + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 2792196610U, // <6,3,3,5>: Cost 3 vuzpl <6,7,3,4>, <3,4,5,6> + 2590200602U, // <6,3,3,6>: Cost 3 vext1 <6,6,3,3>, <6,6,3,3> + 2972501946U, // <6,3,3,7>: Cost 3 vzipr <3,2,6,3>, <2,6,3,7> + 2130386944U, // <6,3,3,u>: Cost 2 ins , lane 0 + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2705050078U, // <6,3,4,1>: Cost 3 vext3 <3,4,1,6>, <3,4,1,6> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2846540124U, // <6,3,4,6>: Cost 3 vuzpr <4,6,1,3>, <0,4,2,6> + 3121398784U, // <6,3,4,7>: Cost 3 vtrnr <5,6,7,4>, <1,3,5,7> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 2578268262U, // <6,3,5,0>: Cost 3 vext1 <4,6,3,5>, LHS + 3204259840U, // <6,3,5,1>: Cost 3 ins , lane 0 + 2648903448U, // <6,3,5,2>: Cost 3 vext2 <5,2,6,3>, <5,2,6,3> + 2578270722U, // <6,3,5,3>: Cost 3 vext1 <4,6,3,5>, <3,4,5,6> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3204292608U, // <6,3,5,5>: Cost 3 ins , lane 0 + 3204300800U, // <6,3,5,6>: Cost 3 ins , lane 0 + 2130567168U, // <6,3,5,7>: Cost 2 ins , lane 0 + 2130567168U, // <6,3,5,u>: Cost 2 ins , lane 0 + 2982478742U, // <6,3,6,0>: Cost 3 vzipr <4,u,6,6>, <1,2,3,0> + 3115222694U, // <6,3,6,1>: Cost 3 vtrnr <4,6,4,6>, <2,3,0,1> + 2982478582U, // <6,3,6,2>: Cost 3 vzipr <4,u,6,6>, <1,0,3,2> + 1748984315U, // <6,3,6,3>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3> + 2982478746U, // <6,3,6,4>: Cost 3 vzipr <4,u,6,6>, <1,2,3,4> + 3115222734U, // <6,3,6,5>: Cost 3 vtrnr <4,6,4,6>, <2,3,4,5> + 2122006529U, // <6,3,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2130640896U, // <6,3,6,7>: Cost 2 ins , lane 0 + 1748984315U, // <6,3,6,u>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3> + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 1906754376U, // <6,3,7,3>: Cost 2 vzipr RHS, <1,1,3,3> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 3103213262U, // <6,3,7,5>: Cost 3 vtrnr <2,6,3,7>, <2,3,4,5> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 1906754704U, // <6,3,7,7>: Cost 2 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2130149376U, // <6,3,u,1>: Cost 2 ins , lane 0 + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 1906762568U, // <6,3,u,3>: Cost 2 vzipr RHS, <1,1,3,3> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2122006529U, // <6,3,u,6>: Cost 2 ins <6,u,6,6>, lane 1 + 1906762896U, // <6,3,u,7>: Cost 2 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 2242465098U, // <6,4,0,0>: Cost 3 vrev <4,6,0,0> + 2121523201U, // <6,4,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 1718534246U, // <6,4,0,2>: Cost 2 vuzpl <6,7,4,5>, LHS + 3195281409U, // <6,4,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 1986645302U, // <6,4,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS + 3195314177U, // <6,4,0,7>: Cost 3 ins <6,u,0,7>, lane 1 + 1986645320U, // <6,4,0,u>: Cost 2 vtrnl <6,7,0,1>, RHS 
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 2242547028U, // <6,4,1,1>: Cost 3 vrev <4,6,1,1> + 3204636672U, // <6,4,1,2>: Cost 3 ins , lane 0 + 1779220582U, // <6,4,1,3>: Cost 2 vuzpr <5,6,7,4>, LHS + 3059813748U, // <6,4,1,4>: Cost 3 vtrnl <6,6,1,3>, <4,6,4,6> + 2130919424U, // <6,4,1,5>: Cost 2 ins , lane 0 + 3102941532U, // <6,4,1,6>: Cost 3 vtrnr <2,6,0,1>, <0,4,2,6> + 2242989450U, // <6,4,1,7>: Cost 3 vrev <4,6,7,1> + 1779220587U, // <6,4,1,u>: Cost 2 vuzpr <5,6,7,4>, LHS + 1168739660U, // <6,4,2,0>: Cost 2 vrev <4,6,0,2> + 3195412481U, // <6,4,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2242628958U, // <6,4,2,2>: Cost 3 vrev <4,6,2,2> + 2130976768U, // <6,4,2,3>: Cost 2 ins , lane 0 + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 1849642294U, // <6,4,2,5>: Cost 2 vzipl <6,2,7,3>, RHS + 2131001344U, // <6,4,2,6>: Cost 2 ins , lane 0 + 3195461633U, // <6,4,2,7>: Cost 3 ins <6,u,2,7>, lane 1 + 1169329556U, // <6,4,2,u>: Cost 2 vrev <4,6,u,2> + 3195478017U, // <6,4,3,0>: Cost 3 ins <6,u,3,0>, lane 1 + 2242563414U, // <6,4,3,1>: Cost 3 vrev <4,6,1,3> + 2242637151U, // <6,4,3,2>: Cost 3 vrev <4,6,2,3> + 2242710888U, // <6,4,3,3>: Cost 3 vrev <4,6,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 2846623438U, // <6,4,3,5>: Cost 3 vuzpr <4,6,2,4>, <2,3,4,5> + 2965864652U, // <6,4,3,6>: Cost 3 vzipr <2,1,6,3>, <0,2,4,6> + 2852963328U, // <6,4,3,7>: Cost 3 vuzpr <5,6,7,4>, <1,3,5,7> + 2243079573U, // <6,4,3,u>: Cost 3 vrev <4,6,u,3> + 2242497870U, // <6,4,4,0>: Cost 3 vrev <4,6,0,4> + 2852967732U, // <6,4,4,1>: Cost 3 vuzpr <5,6,7,4>, <7,4,0,1> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 2852967014U, // <6,4,4,3>: Cost 3 vuzpr <5,6,7,4>, <6,4,1,3> + 2131132416U, // <6,4,4,4>: Cost 2 ins , lane 0 + 2121850881U, // <6,4,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718537526U, // <6,4,4,6>: Cost 2 vuzpl <6,7,4,5>, RHS + 2852967054U, // <6,4,4,7>: Cost 3 vuzpr <5,6,7,4>, <6,4,5,7> + 1718537544U, // <6,4,4,u>: Cost 2 vuzpl <6,7,4,5>, RHS + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 2242579800U, // <6,4,5,1>: Cost 3 vrev <4,6,1,5> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2242727274U, // <6,4,5,3>: Cost 3 vrev <4,6,3,5> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2131214336U, // <6,4,5,5>: Cost 2 ins , lane 0 + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1779223862U, // <6,4,5,7>: Cost 2 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1169067380U, // <6,4,6,4>: Cost 2 vrev <4,6,4,6> + 1852247350U, // <6,4,6,5>: Cost 2 vzipl <6,6,6,6>, RHS + 1986465078U, // <6,4,6,6>: Cost 2 vtrnl <6,6,6,6>, RHS + 2131304448U, // <6,4,6,7>: Cost 2 ins , lane 0 + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 2980495398U, // <6,4,7,2>: Cost 3 vzipr RHS, <0,0,4,2> + 2122055681U, // <6,4,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 1906756816U, // <6,4,7,4>: Cost 2 vzipr RHS, <4,4,4,4> + 1906755278U, // <6,4,7,5>: Cost 2 vzipr RHS, <2,3,4,5> + 1906753740U, // <6,4,7,6>: Cost 2 vzipr RHS, <0,2,4,6> + 2122088449U, // <6,4,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 1906753742U, // <6,4,7,u>: Cost 2 vzipr RHS, <0,2,4,u> + 
1168788818U, // <6,4,u,0>: Cost 2 vrev <4,6,0,u> + 2121523201U, // <6,4,u,1>: Cost 2 ins <6,u,0,1>, lane 1 + 1718540078U, // <6,4,u,2>: Cost 2 vuzpl <6,7,4,5>, LHS + 1779221149U, // <6,4,u,3>: Cost 2 vuzpr <5,6,7,4>, LHS + 1906765008U, // <6,4,u,4>: Cost 2 vzipr RHS, <4,4,4,4> + 1906763470U, // <6,4,u,5>: Cost 2 vzipr RHS, <2,3,4,5> + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1779224105U, // <6,4,u,7>: Cost 2 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3195256833U, // <6,5,0,0>: Cost 3 ins <6,u,0,0>, lane 1 + 2121523201U, // <6,5,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 2787721318U, // <6,5,0,2>: Cost 3 vuzpl <6,0,5,7>, LHS + 3195281409U, // <6,5,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2790367436U, // <6,5,0,4>: Cost 3 vuzpl <6,4,5,6>, <0,2,4,6> + 3121369092U, // <6,5,0,5>: Cost 3 vtrnr <5,6,7,0>, <5,5,5,5> + 2980440578U, // <6,5,0,6>: Cost 3 vzipr <4,5,6,0>, <3,4,5,6> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 2047626551U, // <6,5,0,u>: Cost 2 vtrnr <5,6,7,0>, RHS + 2578382950U, // <6,5,1,0>: Cost 3 vext1 <4,6,5,1>, LHS + 3205292032U, // <6,5,1,1>: Cost 3 ins , lane 0 + 3195346945U, // <6,5,1,2>: Cost 3 ins <6,u,1,2>, lane 1 + 2834833510U, // <6,5,1,3>: Cost 3 vuzpr <2,6,4,5>, LHS + 2578386296U, // <6,5,1,4>: Cost 3 vext1 <4,6,5,1>, <4,6,5,1> + 2578387072U, // <6,5,1,5>: Cost 3 vext1 <4,6,5,1>, <5,7,1,3> + 2922205282U, // <6,5,1,6>: Cost 3 vzipl <6,1,0,3>, <5,6,7,0> + 2131599360U, // <6,5,1,7>: Cost 2 ins , lane 0 + 2131599360U, // <6,5,1,u>: Cost 2 ins , lane 0 + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 2982448018U, // <6,5,2,1>: Cost 3 vzipr <4,u,6,2>, <4,0,5,1> + 3195420673U, // <6,5,2,2>: Cost 3 ins <6,u,2,2>, lane 1 + 2131640320U, // <6,5,2,3>: Cost 2 ins , lane 0 + 2578394489U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, <4,6,5,2> + 3114897412U, // <6,5,2,5>: Cost 3 vtrnr <4,6,0,2>, <5,5,5,5> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 2041154870U, // <6,5,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS + 2041154871U, // <6,5,2,u>: Cost 2 vtrnr <4,6,0,2>, RHS + 3195478017U, // <6,5,3,0>: Cost 3 ins <6,u,3,0>, lane 1 + 3205439488U, // <6,5,3,1>: Cost 3 ins , lane 0 + 3091164465U, // <6,5,3,2>: Cost 3 vtrnr <0,6,2,3>, <4,5,6,2> + 3195502593U, // <6,5,3,3>: Cost 3 ins <6,u,3,3>, lane 1 + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3205472256U, // <6,5,3,5>: Cost 3 ins , lane 0 + 2980465154U, // <6,5,3,6>: Cost 3 vzipr <4,5,6,3>, <3,4,5,6> + 2131746816U, // <6,5,3,7>: Cost 2 ins , lane 0 + 2131746816U, // <6,5,3,u>: Cost 2 ins , lane 0 + 2789051724U, // <6,5,4,0>: Cost 3 vuzpl <6,2,5,7>, <4,6,0,2> + 3060715648U, // <6,5,4,1>: Cost 3 vtrnl <6,7,4,5>, <5,7,1,3> + 3195568129U, // <6,5,4,2>: Cost 3 ins <6,u,4,2>, lane 1 + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2791705972U, // <6,5,4,4>: Cost 3 vuzpl <6,6,5,7>, <4,6,4,6> + 2121850881U, // <6,5,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 2834833756U, // <6,5,4,6>: Cost 3 vuzpr <2,6,4,5>, <0,4,2,6> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3006363382U, // <6,5,5,1>: Cost 3 vzipr , + 3205595136U, // <6,5,5,2>: Cost 3 ins , lane 0 + 2980479105U, // <6,5,5,3>: Cost 3 vzipr <4,5,6,5>, <0,1,5,3> + 2578419068U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, <4,6,5,5> + 2131877888U, // <6,5,5,5>: Cost 2 ins , lane 0 + 2979154434U, // <6,5,5,6>: Cost 3 vzipr <4,3,6,5>, <3,4,5,6> + 2131894272U, // <6,5,5,7>: Cost 2 
ins , lane 0 + 2131877888U, // <6,5,5,u>: Cost 2 ins , lane 0 + 2131910656U, // <6,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <6,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <6,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <6,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <6,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <6,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <6,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <6,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <6,5,6,u>: Cost 1 ins RHS, lane 0 + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 1906756498U, // <6,5,7,1>: Cost 2 vzipr RHS, <4,0,5,1> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 2122055681U, // <6,5,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 1906756826U, // <6,5,7,5>: Cost 2 vzipr RHS, <4,4,5,5> + 1906756098U, // <6,5,7,6>: Cost 2 vzipr RHS, <3,4,5,6> + 2029473078U, // <6,5,7,7>: Cost 2 vtrnr <2,6,3,7>, RHS + 2029473079U, // <6,5,7,u>: Cost 2 vtrnr <2,6,3,7>, RHS + 2131910656U, // <6,5,u,0>: Cost 2 ins , lane 0 + 1906764690U, // <6,5,u,1>: Cost 2 vzipr RHS, <4,0,5,1> + 2131927040U, // <6,5,u,2>: Cost 2 ins , lane 0 + 2122055681U, // <6,5,u,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2131943424U, // <6,5,u,4>: Cost 2 ins , lane 0 + 1906765018U, // <6,5,u,5>: Cost 2 vzipr RHS, <4,4,5,5> + 1906764290U, // <6,5,u,6>: Cost 2 vzipr RHS, <3,4,5,6> + 1058226176U, // <6,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <6,5,u,u>: Cost 1 ins RHS, lane 0 + 2047627362U, // <6,6,0,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0> + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 1718026342U, // <6,6,0,2>: Cost 2 vuzpl <6,6,6,6>, LHS + 3195281409U, // <6,6,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3195297793U, // <6,6,0,5>: Cost 3 ins <6,u,0,5>, lane 1 + 2120826882U, // <6,6,0,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,6,0,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 1906707760U, // <6,6,1,1>: Cost 2 vzipr <4,5,6,1>, <4,5,6,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 1773043814U, // <6,6,1,3>: Cost 2 vuzpr <4,6,4,6>, LHS + 3194068995U, // <6,6,1,4>: Cost 3 ins <6,6,1,u>, lane 3 + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2120826882U, // <6,6,1,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,6,1,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1773043819U, // <6,6,1,u>: Cost 2 vuzpr <4,6,4,6>, LHS + 3114896750U, // <6,6,2,0>: Cost 3 vtrnr <4,6,0,2>, <4,6,4,0> + 3195412481U, // <6,6,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2041154892U, // <6,6,2,2>: Cost 2 vtrnr <4,6,0,2>, <4,6,0,2> + 2120843269U, // <6,6,2,3>: Cost 2 ins <6,6,u,u>, lane 5 + 3114897510U, // <6,6,2,4>: Cost 3 vtrnr <4,6,0,2>, <5,6,7,4> + 3195445249U, // <6,6,2,5>: Cost 3 ins <6,u,2,5>, lane 1 + 2120826882U, // <6,6,2,6>: Cost 2 ins <6,6,u,6>, lane 2 + 1908706614U, // <6,6,2,7>: Cost 2 vzipr <4,u,6,2>, RHS + 1908706615U, // <6,6,2,u>: Cost 2 vzipr <4,u,6,2>, RHS + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 2846787238U, // <6,6,3,1>: Cost 3 vuzpr <4,6,4,6>, <2,3,0,1> + 3206111232U, // <6,6,3,2>: Cost 3 ins , lane 0 + 1880178826U, // <6,6,3,3>: Cost 2 vzipr <0,1,6,3>, <0,1,6,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 2846787278U, // <6,6,3,5>: Cost 3 vuzpr <4,6,4,6>, <2,3,4,5> + 2120826882U, // <6,6,3,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2132410368U, // 
<6,6,3,7>: Cost 2 ins , lane 0 + 2132410368U, // <6,6,3,u>: Cost 2 ins , lane 0 + 2846790288U, // <6,6,4,0>: Cost 3 vuzpr <4,6,4,6>, <6,4,6,0> + 3194527746U, // <6,6,4,1>: Cost 3 ins <6,6,u,1>, lane 2 + 2846788778U, // <6,6,4,2>: Cost 3 vuzpr <4,6,4,6>, <4,4,0,2> + 3195576321U, // <6,6,4,3>: Cost 3 ins <6,u,4,3>, lane 1 + 2047660134U, // <6,6,4,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 1718029622U, // <6,6,4,6>: Cost 2 vuzpl <6,6,6,6>, RHS + 2120835074U, // <6,6,4,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3194363907U, // <6,6,5,0>: Cost 3 ins <6,6,5,u>, lane 3 + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3206258688U, // <6,6,5,2>: Cost 3 ins , lane 0 + 3194544130U, // <6,6,5,3>: Cost 3 ins <6,6,u,3>, lane 2 + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 1906740532U, // <6,6,5,5>: Cost 2 vzipr <4,5,6,5>, <4,5,6,5> + 2120826882U, // <6,6,5,6>: Cost 2 ins <6,6,u,6>, lane 2 + 1773047094U, // <6,6,5,7>: Cost 2 vuzpr <4,6,4,6>, RHS + 1773047095U, // <6,6,5,u>: Cost 2 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2120695811U, // <6,6,6,1>: Cost 2 ins <6,6,6,u>, lane 3 + 2120695811U, // <6,6,6,2>: Cost 2 ins <6,6,6,u>, lane 3 + 2120695811U, // <6,6,6,3>: Cost 2 ins <6,6,6,u>, lane 3 + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2120695811U, // <6,6,6,5>: Cost 2 ins <6,6,6,u>, lane 3 + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 1908739382U, // <6,6,6,7>: Cost 2 vzipr <4,u,6,6>, RHS + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2132647936U, // <6,6,7,0>: Cost 2 ins , lane 0 + 2120769539U, // <6,6,7,1>: Cost 2 ins <6,6,7,u>, lane 3 + 1908747164U, // <6,6,7,2>: Cost 2 vzipr RHS, <4,0,6,2> + 2122055681U, // <6,6,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2132680704U, // <6,6,7,4>: Cost 2 ins , lane 0 + 2120769539U, // <6,6,7,5>: Cost 2 ins <6,6,7,u>, lane 3 + 1906758456U, // <6,6,7,6>: Cost 2 vzipr RHS, <6,6,6,6> + 833015094U, // <6,6,7,7>: Cost 1 vzipr RHS, RHS + 833015095U, // <6,6,7,u>: Cost 1 vzipr RHS, RHS + 2047627362U, // <6,6,u,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0> + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 1906764700U, // <6,6,u,2>: Cost 2 vzipr RHS, <4,0,6,2> + 1773044381U, // <6,6,u,3>: Cost 2 vuzpr <4,6,4,6>, LHS + 2047660134U, // <6,6,u,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4> + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 833023286U, // <6,6,u,7>: Cost 1 vzipr RHS, RHS + 833023287U, // <6,6,u,u>: Cost 1 vzipr RHS, RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2120916995U, // <6,7,0,3>: Cost 2 ins <6,7,0,u>, lane 3 + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2120916995U, // <6,7,0,6>: Cost 2 ins <6,7,0,u>, lane 3 + 2120916995U, // <6,7,0,7>: Cost 2 ins <6,7,0,u>, lane 3 + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1761034342U, // <6,7,1,3>: Cost 2 vuzpr <2,6,3,7>, LHS + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2121498626U, // 
<6,7,1,7>: Cost 2 ins <6,7,u,7>, lane 2 + 1761034347U, // <6,7,1,u>: Cost 2 vuzpr <2,6,3,7>, LHS + 2121064451U, // <6,7,2,0>: Cost 2 ins <6,7,2,u>, lane 3 + 2121449474U, // <6,7,2,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1059889156U, // <6,7,2,3>: Cost 1 ins LHS, lane 4 + 2121064451U, // <6,7,2,4>: Cost 2 ins <6,7,2,u>, lane 3 + 2121482242U, // <6,7,2,5>: Cost 2 ins <6,7,u,5>, lane 2 + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2121498626U, // <6,7,2,7>: Cost 2 ins <6,7,u,7>, lane 2 + 1059889156U, // <6,7,2,u>: Cost 1 ins LHS, lane 4 + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2121449474U, // <6,7,3,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2133696516U, // <6,7,3,2>: Cost 2 ins , lane 4 + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2121482242U, // <6,7,3,5>: Cost 2 ins <6,7,u,5>, lane 2 + 2834777789U, // <6,7,3,6>: Cost 3 vuzpr <2,6,3,7>, <2,3,2,6> + 2133737476U, // <6,7,3,7>: Cost 2 ins , lane 4 + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // <6,7,4,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2121211907U, // <6,7,4,2>: Cost 2 ins <6,7,4,u>, lane 3 + 2121211907U, // <6,7,4,3>: Cost 2 ins <6,7,4,u>, lane 3 + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203276U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,0,2> + 2121211907U, // <6,7,4,7>: Cost 2 ins <6,7,4,u>, lane 3 + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2121465858U, // <6,7,5,3>: Cost 2 ins <6,7,u,3>, lane 2 + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1761037622U, // <6,7,5,7>: Cost 2 vuzpr <2,6,3,7>, RHS + 1761037623U, // <6,7,5,u>: Cost 2 vuzpr <2,6,3,7>, RHS + 2121359363U, // <6,7,6,0>: Cost 2 ins <6,7,6,u>, lane 3 + 2121449474U, // <6,7,6,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2121465858U, // <6,7,6,3>: Cost 2 ins <6,7,u,3>, lane 2 + 2121359363U, // <6,7,6,4>: Cost 2 ins <6,7,6,u>, lane 3 + 2121482242U, // <6,7,6,5>: Cost 2 ins <6,7,u,5>, lane 2 + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1060216836U, // <6,7,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <6,7,6,u>: Cost 1 ins RHS, lane 4 + 1906757730U, // <6,7,7,0>: Cost 2 vzipr RHS, <5,6,7,0> + 2121449474U, // <6,7,7,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 1906758138U, // <6,7,7,3>: Cost 2 vzipr RHS, <6,2,7,3> + 1906757734U, // <6,7,7,4>: Cost 2 vzipr RHS, <5,6,7,4> + 2121482242U, // <6,7,7,5>: Cost 2 ins <6,7,u,5>, lane 2 + 1906757574U, // <6,7,7,6>: Cost 2 vzipr RHS, <5,4,7,6> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1906757738U, // <6,7,7,u>: Cost 2 vzipr RHS, <5,6,7,u> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, + 1059889156U, // <6,7,u,3>: Cost 1 ins LHS, lane 4 + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, + 1060216836U, // <6,7,u,7>: Cost 1 ins RHS, lane 4 + 
497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2047623837U, // <6,u,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 1986648218U, // <6,u,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS + 2047626793U, // <6,u,0,7>: Cost 2 vtrnr <5,6,7,0>, RHS + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1761042534U, // <6,u,1,3>: Cost 2 vuzpr <2,6,3,u>, LHS + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2120826882U, // <6,u,1,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,u,1,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 1849644846U, // <6,u,2,1>: Cost 2 vzipl <6,2,7,3>, LHS + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1055244288U, // <6,u,2,3>: Cost 1 ins LHS, lane 0 + 1504873876U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, <4,6,u,2> + 1849645210U, // <6,u,2,5>: Cost 2 vzipl <6,2,7,3>, RHS + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2041155113U, // <6,u,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS + 1055244288U, // <6,u,2,u>: Cost 1 ins LHS, lane 0 + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2121449474U, // <6,u,3,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2128388096U, // <6,u,3,2>: Cost 2 ins , lane 0 + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2121482242U, // <6,u,3,5>: Cost 2 ins <6,7,u,5>, lane 2 + 2120826882U, // <6,u,3,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2131746816U, // <6,u,3,7>: Cost 2 ins , lane 0 + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // <6,u,4,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1986975534U, // <6,u,4,2>: Cost 2 vtrnl <6,7,4,5>, LHS + 2047656605U, // <6,u,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220812U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,0,2> + 2047659561U, // <6,u,4,7>: Cost 2 vtrnr <5,6,7,4>, RHS + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2118148098U, // <6,u,5,3>: Cost 2 ins <6,2,u,3>, lane 2 + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1761045814U, // <6,u,5,7>: Cost 2 vuzpr <2,6,3,u>, RHS + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 1852249902U, // <6,u,6,1>: Cost 2 vzipl <6,6,6,6>, LHS + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2041479837U, // <6,u,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS + 1504906648U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, <4,6,u,6> + 1852250266U, // <6,u,6,5>: Cost 2 vzipl <6,6,6,6>, RHS + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1058226176U, // <6,u,6,7>: Cost 1 ins RHS, lane 0 + 
1058226176U, // <6,u,6,u>: Cost 1 ins RHS, lane 0 + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 1906753609U, // <6,u,7,1>: Cost 2 vzipr RHS, <0,0,u,1> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 833011868U, // <6,u,7,3>: Cost 1 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 1906753937U, // <6,u,7,5>: Cost 2 vzipr RHS, <0,4,u,5> + 1906753776U, // <6,u,7,6>: Cost 2 vzipr RHS, <0,2,u,6> + 833015112U, // <6,u,7,7>: Cost 1 vzipr RHS, RHS + 833011873U, // <6,u,7,u>: Cost 1 vzipr RHS, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 833020060U, // <6,u,u,3>: Cost 1 vzipr RHS, LHS + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 833023304U, // <6,u,u,7>: Cost 1 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 2987152532U, // <7,0,0,3>: Cost 3 vzipr <5,6,7,0>, <7,2,0,3> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2987152210U, // <7,0,0,5>: Cost 3 vzipr <5,6,7,0>, <6,7,0,5> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 2987152050U, // <7,0,0,7>: Cost 3 vzipr <5,6,7,0>, <6,5,0,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2128232448U, // <7,0,1,1>: Cost 2 ins , lane 0 + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2122317827U, // <7,0,1,3>: Cost 2 ins <7,0,1,u>, lane 3 + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2122317827U, // <7,0,1,5>: Cost 2 ins <7,0,1,u>, lane 3 + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2122317827U, // <7,0,1,7>: Cost 2 ins <7,0,1,u>, lane 3 + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2128314368U, // <7,0,2,2>: Cost 2 ins , lane 0 + 2122833925U, // <7,0,2,3>: Cost 2 ins <7,0,u,u>, lane 5 + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2712060126U, // <7,0,2,6>: Cost 3 vext3 RHS, <0,2,6,6> + 3201433601U, // <7,0,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2983854080U, // <7,0,3,0>: Cost 3 vzipr <5,1,7,3>, <0,0,0,0> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2128388096U, // <7,0,3,2>: Cost 2 ins , lane 0 + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 3196559362U, // <7,0,3,6>: Cost 3 ins <7,0,u,6>, lane 2 + 3201507329U, // <7,0,3,7>: Cost 3 ins <7,u,3,7>, lane 1 + 2128388096U, // <7,0,3,u>: Cost 2 ins , lane 0 + 2712060230U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,2> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3201548289U, // <7,0,4,3>: Cost 3 ins <7,u,4,3>, lane 1 + 2712060269U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,5> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606348U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,0,2> + 3201581057U, // <7,0,4,7>: Cost 3 ins <7,u,4,7>, lane 1 + 
1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2647625340U, // <7,0,5,0>: Cost 3 vext2 <5,0,7,0>, <5,0,7,0> + 2128527360U, // <7,0,5,1>: Cost 2 ins , lane 0 + 1991032934U, // <7,0,5,2>: Cost 2 vtrnl <7,4,5,6>, LHS + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2847477046U, // <7,0,5,7>: Cost 3 vuzpr <4,7,5,0>, RHS + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2985869312U, // <7,0,6,0>: Cost 3 vzipr <5,4,7,6>, <0,0,0,0> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2128609280U, // <7,0,6,2>: Cost 2 ins , lane 0 + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3202367488U, // <7,0,6,4>: Cost 3 ins , lane 0 + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2122833925U, // <7,0,6,7>: Cost 2 ins <7,0,u,u>, lane 5 + 2128609280U, // <7,0,6,u>: Cost 2 ins , lane 0 + 2847477192U, // <7,0,7,0>: Cost 3 vuzpr <4,7,5,0>, <4,7,5,0> + 1858961510U, // <7,0,7,1>: Cost 2 vzipl <7,7,7,7>, LHS + 1993179238U, // <7,0,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS + 3201769473U, // <7,0,7,3>: Cost 3 ins <7,u,7,3>, lane 1 + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2128060417U, // <7,0,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1858962077U, // <7,0,7,u>: Cost 2 vzipl <7,7,7,7>, LHS + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2122317827U, // <7,0,u,3>: Cost 2 ins <7,0,1,u>, lane 3 + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2122317827U, // <7,0,u,7>: Cost 2 ins <7,0,1,u>, lane 3 + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2712060634U, // <7,1,0,0>: Cost 3 vext3 RHS, <1,0,0,1> + 2128822272U, // <7,1,0,1>: Cost 2 ins , lane 0 + 1719615590U, // <7,1,0,2>: Cost 2 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2859062268U, // <7,1,0,4>: Cost 3 vuzpr <6,7,0,1>, <7,0,1,4> + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2859061568U, // <7,1,0,6>: Cost 3 vuzpr <6,7,0,1>, <6,0,4,6> + 3201286145U, // <7,1,0,7>: Cost 3 ins <7,u,0,7>, lane 1 + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060714U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,0> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 2127577089U, // <7,1,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 2859057294U, // <7,1,1,7>: Cost 3 vuzpr <6,7,0,1>, <0,1,6,7> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2128961536U, // <7,1,2,0>: Cost 2 ins , lane 0 + 2128969728U, // <7,1,2,1>: Cost 2 ins , lane 0 + 2128977920U, // <7,1,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <7,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <7,1,2,4>: Cost 2 ins , lane 0 + 2129002496U, // <7,1,2,5>: Cost 2 ins , lane 0 + 2129010688U, // <7,1,2,6>: Cost 2 ins , lane 0 + 
2129018880U, // <7,1,2,7>: Cost 2 ins , lane 0 + 1055244288U, // <7,1,2,u>: Cost 1 ins LHS, lane 0 + 1510998118U, // <7,1,3,0>: Cost 2 vext1 <5,7,1,3>, LHS + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2047869030U, // <7,1,3,3>: Cost 2 vtrnr <5,7,1,3>, LHS + 1511001398U, // <7,1,3,4>: Cost 2 vext1 <5,7,1,3>, RHS + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2983859604U, // <7,1,3,7>: Cost 3 vzipr <5,1,7,3>, <7,5,1,7> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2129133568U, // <7,1,4,3>: Cost 2 ins , lane 0 + 2859060432U, // <7,1,4,4>: Cost 3 vuzpr <6,7,0,1>, <4,4,4,4> + 2129149952U, // <7,1,4,5>: Cost 2 ins , lane 0 + 1719618870U, // <7,1,4,6>: Cost 2 vuzpl <7,0,1,2>, RHS + 2793360778U, // <7,1,4,7>: Cost 3 vuzpl <7,0,1,2>, <4,6,7,1> + 1719618888U, // <7,1,4,u>: Cost 2 vuzpl <7,0,1,2>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3202940928U, // <7,1,5,2>: Cost 3 ins , lane 0 + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 2985861458U, // <7,1,5,5>: Cost 3 vzipr <5,4,7,5>, <0,4,1,5> + 2127904769U, // <7,1,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1785318710U, // <7,1,5,7>: Cost 2 vuzpr <6,7,0,1>, RHS + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 2653606230U, // <7,1,6,0>: Cost 3 vext2 <6,0,7,1>, <6,0,7,1> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2129281024U, // <7,1,6,3>: Cost 2 ins , lane 0 + 2859061350U, // <7,1,6,4>: Cost 3 vuzpr <6,7,0,1>, <5,6,7,4> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 2859060596U, // <7,1,6,6>: Cost 3 vuzpr <6,7,0,1>, <4,6,4,6> + 2129313792U, // <7,1,6,7>: Cost 2 ins , lane 0 + 2129281024U, // <7,1,6,u>: Cost 2 ins , lane 0 + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 1785320270U, // <7,1,7,1>: Cost 2 vuzpr <6,7,0,1>, <6,7,0,1> + 2986543254U, // <7,1,7,2>: Cost 3 vzipr <5,5,7,7>, <3,0,1,2> + 2048196710U, // <7,1,7,3>: Cost 2 vtrnr <5,7,5,7>, LHS + 2793362538U, // <7,1,7,4>: Cost 3 vuzpl <7,0,1,2>, <7,1,4,6> + 2986541394U, // <7,1,7,5>: Cost 3 vzipr <5,5,7,7>, <0,4,1,5> + 3201794049U, // <7,1,7,6>: Cost 3 ins <7,u,7,6>, lane 1 + 2128060417U, // <7,1,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 2048196715U, // <7,1,7,u>: Cost 2 vtrnr <5,7,5,7>, LHS + 1511039078U, // <7,1,u,0>: Cost 2 vext1 <5,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 1719621422U, // <7,1,u,2>: Cost 2 vuzpl <7,0,1,2>, LHS + 1055244288U, // <7,1,u,3>: Cost 1 ins LHS, lane 0 + 1511042358U, // <7,1,u,4>: Cost 2 vext1 <5,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 1719621786U, // <7,1,u,6>: Cost 2 vuzpl <7,0,1,2>, RHS + 1785318953U, // <7,1,u,7>: Cost 2 vuzpr <6,7,0,1>, RHS + 1055244288U, // <7,1,u,u>: Cost 1 ins LHS, lane 0 + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2129494016U, // <7,2,0,2>: Cost 2 ins , lane 0 + 1913405542U, // <7,2,0,3>: Cost 2 vzipr <5,6,7,0>, LHS + 2712061400U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,2> + 2696725990U, // <7,2,0,5>: 
Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 2927577066U, // <7,2,0,7>: Cost 3 vzipl <7,0,1,2>, <2,7,0,1> + 1913405547U, // <7,2,0,u>: Cost 2 vzipr <5,6,7,0>, LHS + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3203301376U, // <7,2,1,1>: Cost 3 ins , lane 0 + 2127577089U, // <7,2,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2974548070U, // <7,2,1,3>: Cost 3 vzipr <3,5,7,1>, LHS + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3203334144U, // <7,2,1,5>: Cost 3 ins , lane 0 + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 , <7,u,1,2> + 2127577089U, // <7,2,1,u>: Cost 2 ins <7,u,1,2>, lane 1 + 2712061524U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,0> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061564U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,4> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061581U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,3> + 3201433601U, // <7,2,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 1638319802U, // <7,2,3,2>: Cost 2 vext3 RHS, <2,3,2,3> + 1910112358U, // <7,2,3,3>: Cost 2 vzipr <5,1,7,3>, LHS + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 1625048802U, // <7,2,3,6>: Cost 2 vext3 <2,3,6,7>, <2,3,6,7> + 2990495214U, // <7,2,3,7>: Cost 3 vzipr <6,2,7,3>, <7,6,2,7> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061688U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,2> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 1913438310U, // <7,2,4,3>: Cost 2 vzipr <5,6,7,4>, LHS + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2129821696U, // <7,2,4,6>: Cost 2 ins , lane 0 + 3201581057U, // <7,2,4,7>: Cost 3 ins <7,u,4,7>, lane 1 + 1913438315U, // <7,2,4,u>: Cost 2 vzipr <5,6,7,4>, LHS + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3203596288U, // <7,2,5,1>: Cost 3 ins , lane 0 + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3203629056U, // <7,2,5,5>: Cost 3 ins , lane 0 + 2127904769U, // <7,2,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 2853096758U, // <7,2,5,7>: Cost 3 vuzpr <5,7,0,2>, RHS + 2127904769U, // <7,2,5,u>: Cost 2 ins <7,u,5,6>, lane 1 + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 2129977344U, // <7,2,6,7>: Cost 2 ins , lane 0 + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 3121939350U, // <7,2,7,0>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,0> + 3203743744U, // <7,2,7,1>: Cost 3 ins , lane 0 + 1720366165U, // <7,2,7,2>: Cost 2 vuzpl <7,1,2,3>, <7,1,2,3> + 1912799334U, // <7,2,7,3>: Cost 2 vzipr 
<5,5,7,7>, LHS + 3121939354U, // <7,2,7,4>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,4> + 3203776512U, // <7,2,7,5>: Cost 3 ins , lane 0 + 2986541404U, // <7,2,7,6>: Cost 3 vzipr <5,5,7,7>, <0,4,2,6> + 2128060417U, // <7,2,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1912799339U, // <7,2,7,u>: Cost 2 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 2129494016U, // <7,2,u,2>: Cost 2 ins , lane 0 + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2129821696U, // <7,2,u,6>: Cost 2 ins , lane 0 + 2129977344U, // <7,2,u,7>: Cost 2 ins , lane 0 + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2712062119U, // <7,3,0,3>: Cost 3 vext3 RHS, <3,0,3,1> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 2985157776U, // <7,3,0,7>: Cost 3 vzipr <5,3,7,0>, <1,5,3,7> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2127577089U, // <7,3,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1779433574U, // <7,3,1,3>: Cost 2 vuzpr <5,7,1,3>, LHS + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 2853179064U, // <7,3,1,6>: Cost 3 vuzpr <5,7,1,3>, <5,1,4,6> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 1779433579U, // <7,3,1,u>: Cost 2 vuzpr <5,7,1,3>, LHS + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2130313216U, // <7,3,2,3>: Cost 2 ins , lane 0 + 2712062292U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,3> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2130313216U, // <7,3,2,u>: Cost 2 ins , lane 0 + 2712062334U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,0> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2990491658U, // <7,3,3,6>: Cost 3 vzipr <6,2,7,3>, <2,7,3,6> + 2972574864U, // <7,3,3,7>: Cost 3 vzipr <3,2,7,3>, <1,5,3,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2987180790U, // <7,3,4,2>: Cost 3 vzipr <5,6,7,4>, <1,0,3,2> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062455U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,4> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313164U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,0,2> + 2985190544U, // <7,3,4,7>: Cost 3 vzipr <5,3,7,4>, <1,5,3,7> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2712062498U, // <7,3,5,0>: Cost 3 vext3 RHS, <3,5,0,2> + 1574571728U, // <7,3,5,1>: Cost 2 vext2 
<5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2127904769U, // <7,3,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779436854U, // <7,3,5,7>: Cost 2 vuzpr <5,7,1,3>, RHS + 1779436855U, // <7,3,5,u>: Cost 2 vuzpr <5,7,1,3>, RHS + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2853178744U, // <7,3,6,1>: Cost 3 vuzpr <5,7,1,3>, <4,6,5,1> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3204366336U, // <7,3,6,5>: Cost 3 ins , lane 0 + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2130640896U, // <7,3,6,7>: Cost 2 ins , lane 0 + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 1779437696U, // <7,3,7,3>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2237582070U, // <7,3,7,5>: Cost 3 vrev <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2128060417U, // <7,3,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1779437696U, // <7,3,7,u>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 , + 1779434141U, // <7,3,u,3>: Cost 2 vuzpr <5,7,1,3>, LHS + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2127904769U, // <7,3,u,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779437097U, // <7,3,u,7>: Cost 2 vuzpr <5,7,1,3>, RHS + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2714053478U, // <7,4,0,0>: Cost 3 vext3 RHS, <4,0,0,2> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3201253377U, // <7,4,0,3>: Cost 3 ins <7,u,0,3>, lane 1 + 2714053512U, // <7,4,0,4>: Cost 3 vext3 RHS, <4,0,4,0> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 2927578568U, // <7,4,0,7>: Cost 3 vzipl <7,0,1,2>, <4,7,5,0> + 1640311726U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,2> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2127577089U, // <7,4,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 3127495888U, // <7,4,1,4>: Cost 3 vtrnr <6,7,0,1>, <4,4,4,4> + 2130919424U, // <7,4,1,5>: Cost 2 ins , lane 0 + 1988054326U, // <7,4,1,6>: Cost 2 vtrnl <7,0,1,2>, RHS + 3061796234U, // <7,4,1,7>: Cost 3 vtrnl <7,0,1,2>, <4,6,7,1> + 1988054344U, // <7,4,1,u>: Cost 2 vtrnl <7,0,1,2>, RHS + 3204694016U, // <7,4,2,0>: Cost 3 ins , lane 0 + 3199172610U, // <7,4,2,1>: Cost 3 ins <7,4,u,1>, lane 2 + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2125488133U, // <7,4,2,3>: Cost 2 ins <7,4,u,u>, lane 5 + 2853258138U, // <7,4,2,4>: Cost 3 vuzpr <5,7,2,4>, <1,2,3,4> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2131001344U, // <7,4,2,6>: Cost 2 ins , lane 0 + 3201433601U, // <7,4,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 2125488133U, // <7,4,2,u>: 
Cost 2 ins <7,4,u,u>, lane 5 + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3201458177U, // <7,4,3,1>: Cost 3 ins <7,u,3,1>, lane 1 + 3204784128U, // <7,4,3,2>: Cost 3 ins , lane 0 + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2983857360U, // <7,4,3,4>: Cost 3 vzipr <5,1,7,3>, <4,4,4,4> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2125471746U, // <7,4,3,6>: Cost 2 ins <7,4,u,6>, lane 2 + 3201507329U, // <7,4,3,7>: Cost 3 ins <7,u,3,7>, lane 1 + 2125471746U, // <7,4,3,u>: Cost 2 ins <7,4,u,6>, lane 2 + 2714053800U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,0> + 3201531905U, // <7,4,4,1>: Cost 3 ins <7,u,4,1>, lane 1 + 3201540097U, // <7,4,4,2>: Cost 3 ins <7,u,4,2>, lane 1 + 2987185336U, // <7,4,4,3>: Cost 3 vzipr <5,6,7,4>, <7,2,4,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2987185664U, // <7,4,4,7>: Cost 3 vzipr <5,6,7,4>, <7,6,4,7> + 1640312054U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,6> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2125266947U, // <7,4,5,1>: Cost 2 ins <7,4,5,u>, lane 3 + 2125266947U, // <7,4,5,2>: Cost 2 ins <7,4,5,u>, lane 3 + 2125266947U, // <7,4,5,3>: Cost 2 ins <7,4,5,u>, lane 3 + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2131214336U, // <7,4,5,5>: Cost 2 ins , lane 0 + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2125266947U, // <7,4,5,7>: Cost 2 ins <7,4,5,u>, lane 3 + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 1638468940U, // <7,4,6,0>: Cost 2 vext3 RHS, <4,6,0,2> + 2712063318U, // <7,4,6,1>: Cost 3 vext3 RHS, <4,6,1,3> + 2712210780U, // <7,4,6,2>: Cost 3 vext3 RHS, <4,6,2,0> + 2712210790U, // <7,4,6,3>: Cost 3 vext3 RHS, <4,6,3,1> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2131296256U, // <7,4,6,6>: Cost 2 ins , lane 0 + 2125488133U, // <7,4,6,7>: Cost 2 ins <7,4,u,u>, lane 5 + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 2794279930U, // <7,4,7,1>: Cost 3 vuzpl <7,1,4,6>, <7,0,1,2> + 3201761281U, // <7,4,7,2>: Cost 3 ins <7,u,7,2>, lane 1 + 3201769473U, // <7,4,7,3>: Cost 3 ins <7,u,7,3>, lane 1 + 2847509964U, // <7,4,7,4>: Cost 3 vuzpr <4,7,5,4>, <4,7,5,4> + 1858964790U, // <7,4,7,5>: Cost 2 vzipl <7,7,7,7>, RHS + 1993182518U, // <7,4,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS + 2128060417U, // <7,4,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1858965033U, // <7,4,7,u>: Cost 2 vzipl <7,7,7,7>, RHS + 1640312302U, // <7,4,u,0>: Cost 2 vext3 RHS, <4,u,0,2> + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2127577089U, // <7,4,u,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2125488133U, // <7,4,u,3>: Cost 2 ins <7,4,u,u>, lane 5 + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2125266947U, // <7,4,u,7>: Cost 2 ins <7,4,5,u>, lane 3 + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2131476480U, // <7,5,0,1>: Cost 2 ins , lane 0 + 1722597478U, // <7,5,0,2>: Cost 2 vuzpl <7,4,5,6>, LHS + 3201253377U, // <7,5,0,3>: Cost 3 ins <7,u,0,3>, lane 1 + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2987150554U, // <7,5,0,5>: Cost 3 vzipr <5,6,7,0>, <4,4,5,5> + 2987149826U, // <7,5,0,6>: Cost 3 vzipr <5,6,7,0>, <3,4,5,6> + 2131525632U, // 
<7,5,0,7>: Cost 2 ins , lane 0 + 1722597532U, // <7,5,0,u>: Cost 2 vuzpl <7,4,5,6>, LHS + 2714054287U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 2249183358U, // <7,5,1,1>: Cost 3 vrev <5,7,1,1> + 2127577089U, // <7,5,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1785643110U, // <7,5,1,3>: Cost 2 vuzpr <6,7,4,5>, LHS + 2714054327U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 3127496708U, // <7,5,1,5>: Cost 3 vtrnr <6,7,0,1>, <5,5,5,5> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2249117814U, // <7,5,2,0>: Cost 3 vrev <5,7,0,2> + 2714054379U, // <7,5,2,1>: Cost 3 vext3 RHS, <5,2,1,3> + 2249265288U, // <7,5,2,2>: Cost 3 vrev <5,7,2,2> + 2131640320U, // <7,5,2,3>: Cost 2 ins , lane 0 + 2859385754U, // <7,5,2,4>: Cost 3 vuzpr <6,7,4,5>, <1,2,3,4> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2712063768U, // <7,5,2,6>: Cost 3 vext3 RHS, <5,2,6,3> + 2131673088U, // <7,5,2,7>: Cost 2 ins , lane 0 + 2131640320U, // <7,5,2,u>: Cost 2 ins , lane 0 + 3201449985U, // <7,5,3,0>: Cost 3 ins <7,u,3,0>, lane 1 + 1175457920U, // <7,5,3,1>: Cost 2 vrev <5,7,1,3> + 2249273481U, // <7,5,3,2>: Cost 3 vrev <5,7,2,3> + 2249347218U, // <7,5,3,3>: Cost 3 vrev <5,7,3,3> + 3201482753U, // <7,5,3,4>: Cost 3 ins <7,u,3,4>, lane 1 + 2983857370U, // <7,5,3,5>: Cost 3 vzipr <5,1,7,3>, <4,4,5,5> + 2983856642U, // <7,5,3,6>: Cost 3 vzipr <5,1,7,3>, <3,4,5,6> + 2047872310U, // <7,5,3,7>: Cost 2 vtrnr <5,7,1,3>, RHS + 2047872311U, // <7,5,3,u>: Cost 2 vtrnr <5,7,1,3>, RHS + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 2987182994U, // <7,5,4,1>: Cost 3 vzipr <5,6,7,4>, <4,0,5,1> + 2249281674U, // <7,5,4,2>: Cost 3 vrev <5,7,2,4> + 3201548289U, // <7,5,4,3>: Cost 3 ins <7,u,4,3>, lane 1 + 2579074508U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, <4,7,5,4> + 2131804160U, // <7,5,4,5>: Cost 2 ins , lane 0 + 1722600758U, // <7,5,4,6>: Cost 2 vuzpl <7,4,5,6>, RHS + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2714054620U, // <7,5,5,1>: Cost 3 vext3 RHS, <5,5,1,1> + 3201613825U, // <7,5,5,2>: Cost 3 ins <7,u,5,2>, lane 1 + 2649657204U, // <7,5,5,3>: Cost 3 vext2 <5,3,7,5>, <5,3,7,5> + 2714054651U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,5> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2127904769U, // <7,5,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2131910656U, // <7,5,6,0>: Cost 2 ins , lane 0 + 2131918848U, // <7,5,6,1>: Cost 2 ins , lane 0 + 2131927040U, // <7,5,6,2>: Cost 2 ins , lane 0 + 2131935232U, // <7,5,6,3>: Cost 2 ins , lane 0 + 2131943424U, // <7,5,6,4>: Cost 2 ins , lane 0 + 2131951616U, // <7,5,6,5>: Cost 2 ins , lane 0 + 2131959808U, // <7,5,6,6>: Cost 2 ins , lane 0 + 1058226176U, // <7,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,5,6,u>: Cost 1 ins RHS, lane 0 + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 1638469760U, // <7,5,7,1>: Cost 2 vext3 RHS, <5,7,1,3> + 2712211590U, // <7,5,7,2>: Cost 3 vext3 RHS, <5,7,2,0> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2048199990U, // <7,5,7,7>: Cost 2 vtrnr <5,7,5,7>, RHS + 
1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 1638469841U, // <7,5,u,1>: Cost 2 vext3 RHS, <5,u,1,3> + 1722603310U, // <7,5,u,2>: Cost 2 vuzpl <7,4,5,6>, LHS + 1785643677U, // <7,5,u,3>: Cost 2 vuzpr <6,7,4,5>, LHS + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 1722603674U, // <7,5,u,6>: Cost 2 vuzpl <7,4,5,6>, RHS + 1058226176U, // <7,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,5,u,u>: Cost 1 ins RHS, lane 0 + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2132148224U, // <7,6,0,2>: Cost 2 ins , lane 0 + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2987151292U, // <7,6,0,5>: Cost 3 vzipr <5,6,7,0>, <5,4,6,5> + 2987150564U, // <7,6,0,6>: Cost 3 vzipr <5,6,7,0>, <4,4,6,6> + 1913408822U, // <7,6,0,7>: Cost 2 vzipr <5,6,7,0>, RHS + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2127577089U, // <7,6,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2841329766U, // <7,6,1,3>: Cost 3 vuzpr <3,7,2,6>, LHS + 2579123666U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, <4,7,6,1> + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2974551350U, // <7,6,1,7>: Cost 3 vzipr <3,5,7,1>, RHS + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2714055117U, // <7,6,2,2>: Cost 3 vext3 RHS, <6,2,2,3> + 2132303872U, // <7,6,2,3>: Cost 2 ins , lane 0 + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2714055152U, // <7,6,2,6>: Cost 3 vext3 RHS, <6,2,6,2> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 3121614200U, // <7,6,3,1>: Cost 3 vtrnr <5,7,1,3>, <4,6,5,1> + 1181504354U, // <7,6,3,2>: Cost 2 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 3206135808U, // <7,6,3,5>: Cost 3 ins , lane 0 + 2983857380U, // <7,6,3,6>: Cost 3 vzipr <5,1,7,3>, <4,4,6,6> + 1910115638U, // <7,6,3,7>: Cost 2 vzipr <5,1,7,3>, RHS + 1910115639U, // <7,6,3,u>: Cost 2 vzipr <5,1,7,3>, RHS + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2714055276U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,0> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2650328272U, // <7,6,4,4>: Cost 3 vext2 <5,4,7,6>, <4,4,4,4> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2132475904U, // <7,6,4,6>: Cost 2 ins , lane 0 + 1913441590U, // <7,6,4,7>: Cost 2 vzipr <5,6,7,4>, RHS + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3201622017U, // <7,6,5,3>: Cost 3 ins <7,u,5,3>, lane 1 + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2127904769U, // <7,6,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 
2971929910U, // <7,6,5,7>: Cost 3 vzipr <3,1,7,5>, RHS + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2712212245U, // <7,6,6,2>: Cost 3 vext3 RHS, <6,6,2,7> + 3201695745U, // <7,6,6,3>: Cost 3 ins <7,u,6,3>, lane 1 + 2714055461U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,5> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 1638323042U, // <7,6,7,2>: Cost 2 vext3 RHS, <6,7,2,3> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 1638323082U, // <7,6,7,6>: Cost 2 vext3 RHS, <6,7,6,7> + 1912802614U, // <7,6,7,7>: Cost 2 vzipr <5,5,7,7>, RHS + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2132148224U, // <7,6,u,2>: Cost 2 ins , lane 0 + 2132303872U, // <7,6,u,3>: Cost 2 ins , lane 0 + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2132475904U, // <7,6,u,6>: Cost 2 ins , lane 0 + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 1913409634U, // <7,7,0,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 1724743782U, // <7,7,0,2>: Cost 2 vuzpl <7,7,7,7>, LHS + 2987151056U, // <7,7,0,3>: Cost 3 vzipr <5,6,7,0>, <5,1,7,3> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2987151302U, // <7,7,0,6>: Cost 3 vzipr <5,6,7,0>, <5,4,7,6> + 2127470594U, // <7,7,0,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2053755726U, // <7,7,1,1>: Cost 2 vtrnr <6,7,0,1>, <6,7,0,1> + 2127577089U, // <7,7,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1779761254U, // <7,7,1,3>: Cost 2 vuzpr <5,7,5,7>, LHS + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2127470594U, // <7,7,1,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1779761259U, // <7,7,1,u>: Cost 2 vuzpr <5,7,5,7>, LHS + 2853503894U, // <7,7,2,0>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,0> + 3206692864U, // <7,7,2,1>: Cost 3 ins , lane 0 + 1988801621U, // <7,7,2,2>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3> + 2132967424U, // <7,7,2,3>: Cost 2 ins , lane 0 + 2853503898U, // <7,7,2,4>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,4> + 3206725632U, // <7,7,2,5>: Cost 3 ins , lane 0 + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2127470594U, // <7,7,2,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1988801621U, // <7,7,2,u>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3121615694U, // <7,7,3,1>: Cost 3 vtrnr <5,7,1,3>, <6,7,0,1> + 3201171458U, // <7,7,3,2>: Cost 3 ins <7,7,u,2>, lane 2 + 1910116048U, // <7,7,3,3>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2639055462U, // <7,7,3,5>: Cost 3 vext2 
<3,5,7,7>, <3,5,7,7> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2127470594U, // <7,7,3,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1910116048U, // <7,7,3,u>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3062715386U, // <7,7,4,1>: Cost 3 vtrnl <7,1,4,6>, <7,0,1,2> + 3201540097U, // <7,7,4,2>: Cost 3 ins <7,u,4,2>, lane 1 + 2987183824U, // <7,7,4,3>: Cost 3 vzipr <5,6,7,4>, <5,1,7,3> + 1913442406U, // <7,7,4,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 1724747062U, // <7,7,4,6>: Cost 2 vuzpl <7,7,7,7>, RHS + 2127470594U, // <7,7,4,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2853508547U, // <7,7,5,0>: Cost 3 vuzpr <5,7,5,7>, <7,5,7,0> + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3201613825U, // <7,7,5,2>: Cost 3 ins <7,u,5,2>, lane 1 + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2127904769U, // <7,7,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779764534U, // <7,7,5,7>: Cost 2 vuzpr <5,7,5,7>, RHS + 1779764535U, // <7,7,5,u>: Cost 2 vuzpr <5,7,5,7>, RHS + 2985873506U, // <7,7,6,0>: Cost 3 vzipr <5,4,7,6>, <5,6,7,0> + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2985873104U, // <7,7,6,3>: Cost 3 vzipr <5,4,7,6>, <5,1,7,3> + 2985873510U, // <7,7,6,4>: Cost 3 vzipr <5,4,7,6>, <5,6,7,4> + 2985873511U, // <7,7,6,5>: Cost 3 vzipr <5,4,7,6>, <5,6,7,5> + 1912131526U, // <7,7,6,6>: Cost 2 vzipr <5,4,7,6>, <5,4,7,6> + 2133295104U, // <7,7,6,7>: Cost 2 ins , lane 0 + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2127405059U, // <7,7,7,1>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,2>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,3>: Cost 2 ins <7,7,7,u>, lane 3 + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2127405059U, // <7,7,7,5>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,6>: Cost 2 ins <7,7,7,u>, lane 3 + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1913409634U, // <7,7,u,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0> + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 1724749614U, // <7,7,u,2>: Cost 2 vuzpl <7,7,7,7>, LHS + 1779761821U, // <7,7,u,3>: Cost 2 vuzpr <5,7,5,7>, LHS + 1913442406U, // <7,7,u,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4> + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1724749978U, // <7,7,u,6>: Cost 2 vuzpl <7,7,7,7>, RHS + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, + 1720131686U, // <7,u,0,2>: Cost 2 vuzpl <7,0,u,2>, LHS + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, + 1853839514U, // <7,u,0,5>: Cost 2 vzipl <7,0,1,2>, RHS + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, + 1913408840U, // <7,u,0,7>: Cost 2 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 2128232448U, // <7,u,1,1>: Cost 2 ins , lane 0 + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2122317827U, // 
<7,u,1,5>: Cost 2 ins <7,0,1,u>, lane 3 + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1662211948U, // <7,u,2,0>: Cost 2 vext3 RHS, + 2128969728U, // <7,u,2,1>: Cost 2 ins , lane 0 + 2128314368U, // <7,u,2,2>: Cost 2 ins , lane 0 + 1055244288U, // <7,u,2,3>: Cost 1 ins LHS, lane 0 + 1662211988U, // <7,u,2,4>: Cost 2 vext3 RHS, + 2129002496U, // <7,u,2,5>: Cost 2 ins , lane 0 + 2131001344U, // <7,u,2,6>: Cost 2 ins , lane 0 + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, + 1055244288U, // <7,u,2,u>: Cost 1 ins LHS, lane 0 + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, + 1638324167U, // <7,u,3,1>: Cost 2 vext3 RHS, + 2128388096U, // <7,u,3,2>: Cost 2 ins , lane 0 + 1910112412U, // <7,u,3,3>: Cost 2 vzipr <5,1,7,3>, LHS + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, + 2125471746U, // <7,u,3,6>: Cost 2 ins <7,4,u,6>, lane 2 + 1910115656U, // <7,u,3,7>: Cost 2 vzipr <5,1,7,3>, RHS + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, + 1856821038U, // <7,u,4,1>: Cost 2 vzipl <7,4,5,6>, LHS + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, + 1913438364U, // <7,u,4,3>: Cost 2 vzipr <5,6,7,4>, LHS + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, + 1720134966U, // <7,u,4,6>: Cost 2 vuzpl <7,0,u,2>, RHS + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 1991038766U, // <7,u,5,2>: Cost 2 vtrnl <7,4,5,6>, LHS + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 1662359728U, // <7,u,6,0>: Cost 2 vext3 RHS, + 2131918848U, // <7,u,6,1>: Cost 2 ins , lane 0 + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, + 1662359768U, // <7,u,6,4>: Cost 2 vext3 RHS, + 2131951616U, // <7,u,6,5>: Cost 2 ins , lane 0 + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1058226176U, // <7,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,u,6,u>: Cost 1 ins RHS, lane 0 + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, + 1640462603U, // <7,u,7,1>: Cost 2 vext3 RHS, + 1993185070U, // <7,u,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS + 1912799388U, // <7,u,7,3>: Cost 2 vzipr <5,5,7,7>, LHS + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, + 1640462643U, // <7,u,7,5>: Cost 2 vext3 RHS, + 1993185434U, // <7,u,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1055244288U, // <7,u,u,3>: Cost 1 ins LHS, lane 0 + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 1058226176U, // <7,u,u,7>: Cost 1 ins RHS, lane 0 + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 1611489290U, // : Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // : Cost 2 vext3 LHS, <0,0,2,2> + 
2085707777U, // : Cost 2 ins <0,u,0,3>, lane 1 + 1481706806U, // : Cost 2 vext1 <0,u,0,0>, RHS + 2080440323U, // : Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // : Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // : Cost 2 ins <0,0,0,u>, lane 3 + 135053414U, // : Cost 1 vdup0 LHS + 1493655654U, // : Cost 2 vext1 <2,u,0,1>, LHS + 786808934U, // : Cost 1 vzipl LHS, LHS + 537747563U, // : Cost 1 vext3 LHS, LHS + 1756332134U, // : Cost 2 vuzpr <1,u,3,0>, LHS + 1493658934U, // : Cost 2 vext1 <2,u,0,1>, RHS + 2085797889U, // : Cost 2 ins <0,u,1,5>, lane 1 + 1517548447U, // : Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2080514051U, // : Cost 2 ins <0,0,1,u>, lane 3 + 537747612U, // : Cost 1 vext3 LHS, LHS + 1611489444U, // : Cost 2 vext3 LHS, <0,2,0,2> + 1994768394U, // : Cost 2 vtrnl LHS, <0,0,1,1> + 921026662U, // : Cost 1 vtrnl LHS, LHS + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1611489484U, // : Cost 2 vext3 LHS, <0,2,4,6> + 2080587779U, // : Cost 2 ins <0,0,2,u>, lane 3 + 2085879809U, // : Cost 2 ins <0,u,2,6>, lane 1 + 2080587779U, // : Cost 2 ins <0,0,2,u>, lane 3 + 921026716U, // : Cost 1 vtrnl LHS, LHS + 1880326144U, // : Cost 2 vzipr LHS, <0,0,0,0> + 1880327846U, // : Cost 2 vzipr LHS, <2,3,0,1> + 72589981U, // : Cost 1 vrev LHS + 2091900929U, // : Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // : Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // : Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // : Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // : Cost 2 ins <1,u,3,7>, lane 1 + 73032403U, // : Cost 1 vrev LHS + 1705610572U, // : Cost 2 vuzpl <4,6,0,2>, <4,6,0,2> + 1611489618U, // : Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // : Cost 2 vext3 LHS, <0,4,2,6> + 2086002689U, // : Cost 2 ins <0,u,4,3>, lane 1 + 1947828428U, // : Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 1551396150U, // : Cost 2 vext2 <1,2,u,0>, RHS + 1726844214U, // : Cost 2 vuzpl , RHS + 2109923329U, // : Cost 2 ins <4,u,4,7>, lane 1 + 1611932050U, // : Cost 2 vext3 LHS, <0,4,u,6> + 1863532544U, // : Cost 2 vzipl RHS, <0,0,0,0> + 789790822U, // : Cost 1 vzipl RHS, LHS + 1996349542U, // : Cost 2 vtrnl , LHS + 2104696835U, // : Cost 2 ins <4,0,5,u>, lane 3 + 1863532882U, // : Cost 2 vzipl RHS, <0,4,1,5> + 2109980673U, // : Cost 2 ins <4,u,5,5>, lane 1 + 1577939051U, // : Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 1756335414U, // : Cost 2 vuzpr <1,u,3,0>, RHS + 789791389U, // : Cost 1 vzipl RHS, LHS + 1997750272U, // : Cost 2 vtrnl RHS, <0,0,0,0> + 1997750282U, // : Cost 2 vtrnl RHS, <0,0,1,1> + 924008550U, // : Cost 1 vtrnl RHS, LHS + 2104770563U, // : Cost 2 ins <4,0,6,u>, lane 3 + 1146503858U, // : Cost 2 vrev <0,u,4,6> + 2104770563U, // : Cost 2 ins <4,0,6,u>, lane 3 + 2110062593U, // : Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // : Cost 1 ins RHS, lane 1 + 924008604U, // : Cost 1 vtrnl RHS, LHS + 1906900992U, // : Cost 2 vzipr RHS, <0,0,0,0> + 1906902694U, // : Cost 2 vzipr RHS, <2,3,0,1> + 1906901156U, // : Cost 2 vzipr RHS, <0,2,0,2> + 2116083713U, // : Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, // : Cost 2 ins <5,u,7,4>, lane 1 + 2980643874U, // : Cost 3 vzipr RHS, <1,4,0,5> + 2116108289U, // : Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // : Cost 2 ins <5,u,7,7>, lane 1 + 1906901162U, // : Cost 2 vzipr RHS, <0,2,0,u> + 135053414U, // : Cost 1 vdup0 LHS + 791453798U, // : Cost 1 vzipl LHS, LHS + 537748125U, // : Cost 1 vext3 LHS, LHS + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1611932338U, // : Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // : Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // : Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 1036328961U, // : 
Cost 1 ins RHS, lane 1 + 537748179U, // : Cost 1 vext3 LHS, LHS + 1818149622U, // : Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 1007951877U, // : Cost 1 ins LHS, lane 5 + 1725587558U, // : Cost 2 vuzpl , LHS + 1007910914U, // : Cost 1 ins LHS, lane 2 + 2081660930U, // : Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // : Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // : Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // : Cost 2 ins <0,1,u,7>, lane 2 + 1007951877U, // : Cost 1 ins LHS, lane 5 + 1481786002U, // : Cost 2 vext1 <0,u,1,1>, <0,u,1,1> + 202162278U, // : Cost 1 vdup1 LHS + 1860551574U, // : Cost 2 vzipl LHS, <1,2,3,0> + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1481788726U, // : Cost 2 vext1 <0,u,1,1>, RHS + 1860551824U, // : Cost 2 vzipl LHS, <1,5,3,7> + 2081677314U, // : Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // : Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 835584U, // : Cost 0 copy LHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 835584U, // : Cost 0 copy LHS + 1487773798U, // : Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // : Cost 2 vext3 LHS, <1,3,1,3> + 1880328342U, // : Cost 2 vzipr LHS, <3,0,1,2> + 945004646U, // : Cost 1 vtrnr LHS, LHS + 1487777078U, // : Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // : Cost 2 vext3 LHS, <1,3,5,7> + 2087297027U, // : Cost 2 ins <1,1,3,u>, lane 3 + 2133737476U, // : Cost 2 ins , lane 4 + 945004651U, // : Cost 1 vtrnr LHS, LHS + 1567992749U, // : Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2081636354U, // : Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // : Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // : Cost 1 ins LHS, lane 2 + 2081660930U, // : Cost 2 ins <0,1,u,4>, lane 2 + 1007951877U, // : Cost 1 ins LHS, lane 5 + 1725590838U, // : Cost 2 vuzpl , RHS + 2081685506U, // : Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1481818774U, // : Cost 2 vext1 <0,u,1,5>, <0,u,1,5> + 1863533364U, // : Cost 2 vzipl RHS, <1,1,1,1> + 1863533462U, // : Cost 2 vzipl RHS, <1,2,3,0> + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1481821494U, // : Cost 2 vext1 <0,u,1,5>, RHS + 1863533712U, // : Cost 2 vzipl RHS, <1,5,3,7> + 2133876740U, // : Cost 2 ins , lane 4 + 1750224182U, // : Cost 2 vuzpr <0,u,1,1>, RHS + 1007910914U, // : Cost 1 ins LHS, lane 2 + 2081628162U, // : Cost 2 ins <0,1,u,0>, lane 2 + 1997751092U, // : Cost 2 vtrnl RHS, <1,1,1,1> + 2133917700U, // : Cost 2 ins , lane 4 + 1007910914U, // : Cost 1 ins LHS, lane 2 + 2081660930U, // : Cost 2 ins <0,1,u,4>, lane 2 + 1997751296U, // : Cost 2 vtrnl RHS, <1,3,5,7> + 2133950468U, // : Cost 2 ins , lane 4 + 1060216836U, // : Cost 1 ins RHS, lane 4 + 1007910914U, // : Cost 1 ins LHS, lane 2 + 2133975044U, // : Cost 2 ins , lane 4 + 1906901002U, // : Cost 2 vzipr RHS, <0,0,1,1> + 1906903190U, // : Cost 2 vzipr RHS, <3,0,1,2> + 969220198U, // : Cost 1 vtrnr RHS, LHS + 2134007812U, // : Cost 2 ins , lane 4 + 1152558485U, // : Cost 2 vrev <1,u,5,7> + 2134024196U, // : Cost 2 ins , lane 4 + 2134032388U, // : Cost 2 ins , lane 4 + 969220203U, // : Cost 1 vtrnr RHS, LHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007951877U, // : Cost 1 ins LHS, lane 5 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 835584U, // : Cost 0 copy LHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 
1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 835584U, // : Cost 0 copy LHS + 1726332928U, // : Cost 2 vuzpl LHS, <0,0,0,0> + 1545437286U, // : Cost 2 vext2 <0,2,u,2>, LHS + 652591206U, // : Cost 1 vuzpl LHS, LHS + 1886937190U, // : Cost 2 vzipr <1,2,u,0>, LHS + 1726333132U, // : Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // : Cost 2 ins <0,2,0,u>, lane 3 + 2082340866U, // : Cost 2 ins <0,2,u,6>, lane 2 + 2081767427U, // : Cost 2 ins <0,2,0,u>, lane 3 + 652591260U, // : Cost 1 vuzpl LHS, LHS + 1550082851U, // : Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 1726333748U, // : Cost 2 vuzpl LHS, <1,1,1,1> + 1860552296U, // : Cost 2 vzipl LHS, <2,2,2,2> + 1750155366U, // : Cost 2 vuzpr <0,u,0,2>, LHS + 2088296450U, // : Cost 2 ins <1,2,u,4>, lane 2 + 1726333952U, // : Cost 2 vuzpl LHS, <1,3,5,7> + 1860552634U, // : Cost 2 vzipl LHS, <2,6,3,7> + 2109702145U, // : Cost 2 ins <4,u,1,7>, lane 1 + 1750155371U, // : Cost 2 vuzpr <0,u,0,2>, LHS + 1481867932U, // : Cost 2 vext1 <0,u,2,2>, <0,u,2,2> + 2085838849U, // : Cost 2 ins <0,u,2,1>, lane 1 + 269271142U, // : Cost 1 vdup2 LHS + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1481870646U, // : Cost 2 vext1 <0,u,2,2>, RHS + 2085871617U, // : Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // : Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // : Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 408134301U, // : Cost 1 vext1 LHS, LHS + 1481876214U, // : Cost 2 vext1 LHS, <1,0,3,2> + 1880326164U, // : Cost 2 vzipr LHS, <0,0,2,2> + 806584422U, // : Cost 1 vzipr LHS, LHS + 408137014U, // : Cost 1 vext1 LHS, RHS + 1726335490U, // : Cost 2 vuzpl LHS, <3,4,5,6> + 1880326492U, // : Cost 2 vzipr LHS, <0,4,2,6> + 1529656314U, // : Cost 2 vext1 LHS, <7,0,1,2> + 806584427U, // : Cost 1 vzipr LHS, LHS + 1726336332U, // : Cost 2 vuzpl LHS, <4,6,0,2> + 2082062339U, // : Cost 2 ins <0,2,4,u>, lane 3 + 2082308098U, // : Cost 2 ins <0,2,u,2>, lane 2 + 1886969958U, // : Cost 2 vzipr <1,2,u,4>, LHS + 1726336208U, // : Cost 2 vuzpl LHS, <4,4,4,4> + 1545440566U, // : Cost 2 vext2 <0,2,u,2>, RHS + 652594486U, // : Cost 1 vuzpl LHS, RHS + 2082062339U, // : Cost 2 ins <0,2,4,u>, lane 3 + 652594504U, // : Cost 1 vuzpl LHS, RHS + 2088263682U, // : Cost 2 ins <1,2,u,0>, lane 2 + 1726337152U, // : Cost 2 vuzpl LHS, <5,7,1,3> + 1863534184U, // : Cost 2 vzipl RHS, <2,2,2,2> + 1884987494U, // : Cost 2 vzipr <0,u,u,5>, LHS + 1158441059U, // : Cost 2 vrev <2,u,4,5> + 1726337028U, // : Cost 2 vuzpl LHS, <5,5,5,5> + 1863534522U, // : Cost 2 vzipl RHS, <2,6,3,7> + 1750158646U, // : Cost 2 vuzpr <0,u,0,2>, RHS + 1750158647U, // : Cost 2 vuzpr <0,u,0,2>, RHS + 1481900704U, // : Cost 2 vext1 <0,u,2,6>, <0,u,2,6> + 2110021633U, // : Cost 2 ins <4,u,6,1>, lane 1 + 1997751912U, // : Cost 2 vtrnl RHS, <2,2,2,2> + 1611491258U, // : Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // : Cost 2 vext1 <0,u,2,6>, RHS + 2110054401U, // : Cost 2 ins <4,u,6,5>, lane 1 + 1726337848U, // : Cost 2 vuzpl LHS, <6,6,6,6> + 1036328961U, // : Cost 1 ins RHS, lane 1 + 1036328961U, // : Cost 1 ins RHS, lane 1 + 2042962838U, // : Cost 2 vtrnr RHS, <1,2,3,0> + 1726338042U, // : Cost 2 vuzpl LHS, <7,0,1,2> + 1906901012U, // : Cost 2 vzipr RHS, <0,0,2,2> + 833159270U, // : Cost 1 vzipr RHS, LHS + 2042962842U, // : Cost 2 vtrnr RHS, <1,2,3,4> + 1726338406U, // : Cost 2 vuzpl LHS, <7,4,5,6> + 1906901340U, // : Cost 2 vzipr RHS, <0,4,2,6> + 1726338668U, // : Cost 2 vuzpl LHS, <7,7,7,7> + 833159275U, // : Cost 1 vzipr RHS, LHS + 408175266U, // : Cost 1 vext1 LHS, LHS + 1545443118U, 
// : Cost 2 vext2 <0,2,u,2>, LHS + 652597038U, // : Cost 1 vuzpl LHS, LHS + 806625382U, // : Cost 1 vzipr LHS, LHS + 408177974U, // : Cost 1 vext1 LHS, RHS + 1545443482U, // : Cost 2 vext2 <0,2,u,2>, RHS + 652597402U, // : Cost 1 vuzpl LHS, RHS + 1036328961U, // : Cost 1 ins RHS, lane 1 + 806625387U, // : Cost 1 vzipr LHS, LHS + 1544781824U, // : Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // : Cost 1 vext2 LHS, LHS + 1544781988U, // : Cost 2 vext2 LHS, <0,2,0,2> + 2088951810U, // : Cost 2 ins <1,3,u,3>, lane 2 + 1544782162U, // : Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // : Cost 2 ins <2,3,u,5>, lane 2 + 2094374915U, // : Cost 2 ins <2,3,0,u>, lane 3 + 2088984578U, // : Cost 2 ins <1,3,u,7>, lane 2 + 471040669U, // : Cost 1 vext2 LHS, LHS + 1544782582U, // : Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // : Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // : Cost 2 vext2 LHS, <1,2,3,0> + 676569190U, // : Cost 1 vuzpr LHS, LHS + 1860553218U, // : Cost 2 vzipl LHS, <3,4,5,6> + 1544782992U, // : Cost 2 vext2 LHS, <1,5,3,7> + 2088476675U, // : Cost 2 ins <1,3,1,u>, lane 3 + 2088984578U, // : Cost 2 ins <1,3,u,7>, lane 2 + 676569195U, // : Cost 1 vuzpr LHS, LHS + 1750311830U, // : Cost 2 vuzpr LHS, <1,2,3,0> + 1164167966U, // : Cost 2 vrev <3,u,1,2> + 1544783464U, // : Cost 2 vext2 LHS, <2,2,2,2> + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1750311834U, // : Cost 2 vuzpr LHS, <1,2,3,4> + 1994770946U, // : Cost 2 vtrnl LHS, <3,4,5,6> + 1544783802U, // : Cost 2 vext2 LHS, <2,6,3,7> + 2088984578U, // : Cost 2 ins <1,3,u,7>, lane 2 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1544784022U, // : Cost 2 vext2 LHS, <3,0,1,2> + 1750312614U, // : Cost 2 vuzpr LHS, <2,3,0,1> + 1880326902U, // : Cost 2 vzipr LHS, <1,0,3,2> + 336380006U, // : Cost 1 vdup3 LHS + 1544784386U, // : Cost 2 vext2 LHS, <3,4,5,6> + 1750312654U, // : Cost 2 vuzpr LHS, <2,3,4,5> + 2100568067U, // : Cost 2 ins <3,3,3,u>, lane 3 + 1880327312U, // : Cost 2 vzipr LHS, <1,5,3,7> + 336380006U, // : Cost 1 vdup3 LHS + 1487929446U, // : Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // : Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2094669827U, // : Cost 2 ins <2,3,4,u>, lane 3 + 2088951810U, // : Cost 2 ins <1,3,u,3>, lane 2 + 1487932726U, // : Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // : Cost 1 vext2 LHS, RHS + 1750311260U, // : Cost 2 vuzpr LHS, <0,4,2,6> + 2088984578U, // : Cost 2 ins <1,3,u,7>, lane 2 + 471043625U, // : Cost 1 vext2 LHS, RHS + 1863534742U, // : Cost 2 vzipl RHS, <3,0,1,2> + 1574645465U, // : Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2088771587U, // : Cost 2 ins <1,3,5,u>, lane 3 + 1863535004U, // : Cost 2 vzipl RHS, <3,3,3,3> + 1592561606U, // : Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // : Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // : Cost 2 vext2 LHS, <5,6,7,0> + 676572470U, // : Cost 1 vuzpr LHS, RHS + 676572471U, // : Cost 1 vuzpr LHS, RHS + 1798090850U, // : Cost 2 vuzpr LHS, <5,6,7,0> + 1997752470U, // : Cost 2 vtrnl RHS, <3,0,1,2> + 1581281795U, // : Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 1997752732U, // : Cost 2 vtrnl RHS, <3,3,3,3> + 1798090854U, // : Cost 2 vuzpr LHS, <5,6,7,4> + 1164495686U, // : Cost 2 vrev <3,u,5,6> + 1592562488U, // : Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // : Cost 1 ins RHS, lane 4 + 1060216836U, // : Cost 1 ins RHS, lane 4 + 1487954022U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // : Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // : Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 1906901832U, // : Cost 2 vzipr RHS, <1,1,3,3> + 1487957302U, // : Cost 2 vext1 <1,u,3,7>, RHS + 2042963662U, // : Cost 2 
vtrnr RHS, <2,3,4,5> + 2134024196U, // : Cost 2 ins , lane 4 + 1906902160U, // : Cost 2 vzipr RHS, <1,5,3,7> + 1487959854U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // : Cost 2 vext2 LHS, + 471045934U, // : Cost 1 vext2 LHS, LHS + 1880367862U, // : Cost 2 vzipr LHS, <1,0,3,2> + 676569757U, // : Cost 1 vuzpr LHS, LHS + 1544788031U, // : Cost 2 vext2 LHS, + 471046298U, // : Cost 1 vext2 LHS, RHS + 1750311584U, // : Cost 2 vuzpr LHS, <0,u,2,6> + 676572713U, // : Cost 1 vuzpr LHS, RHS + 471046501U, // : Cost 1 vext2 LHS, LHS + 1974046028U, // : Cost 2 vtrnl <4,6,0,2>, <4,6,0,2> + 1551425638U, // : Cost 2 vext2 <1,2,u,4>, LHS + 1727168614U, // : Cost 2 vuzpl , LHS + 2085707777U, // : Cost 2 ins <0,u,0,3>, lane 1 + 1679392972U, // : Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 1638329234U, // : Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // : Cost 2 vext3 RHS, <4,0,6,2> + 2109628417U, // : Cost 2 ins <4,u,0,7>, lane 1 + 1551426205U, // : Cost 2 vext2 <1,2,u,4>, LHS + 1860553618U, // : Cost 2 vzipl LHS, <4,0,5,1> + 2085765121U, // : Cost 2 ins <0,u,1,1>, lane 1 + 1551426503U, // : Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 1756364902U, // : Cost 2 vuzpr <1,u,3,4>, LHS + 1860553936U, // : Cost 2 vzipl LHS, <4,4,4,4> + 786812214U, // : Cost 1 vzipl LHS, RHS + 1994026294U, // : Cost 2 vtrnl , RHS + 2083168259U, // : Cost 2 ins <0,4,1,u>, lane 3 + 786812457U, // : Cost 1 vzipl LHS, RHS + 1170066926U, // : Cost 2 vrev <4,u,0,2> + 2083241987U, // : Cost 2 ins <0,4,2,u>, lane 3 + 2085847041U, // : Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1994771664U, // : Cost 2 vtrnl LHS, <4,4,4,4> + 1994771346U, // : Cost 2 vtrnl LHS, <4,0,5,1> + 921029942U, // : Cost 1 vtrnl LHS, RHS + 2083241987U, // : Cost 2 ins <0,4,2,u>, lane 3 + 921029960U, // : Cost 1 vtrnl LHS, RHS + 2091876353U, // : Cost 2 ins <1,u,3,0>, lane 1 + 2954070192U, // : Cost 3 vzipr LHS, <3,0,4,1> + 2091892737U, // : Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // : Cost 2 ins <1,u,3,3>, lane 1 + 1928105168U, // : Cost 2 vzipr LHS, <4,4,4,4> + 1880327886U, // : Cost 2 vzipr LHS, <2,3,4,5> + 1880326348U, // : Cost 2 vzipr LHS, <0,2,4,6> + 2091933697U, // : Cost 2 ins <1,u,3,7>, lane 1 + 1880326350U, // : Cost 2 vzipr LHS, <0,2,4,u> + 1505919078U, // : Cost 2 vext1 <4,u,4,4>, LHS + 2107277315U, // : Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // : Cost 2 ins <4,4,4,u>, lane 3 + 2086002689U, // : Cost 2 ins <0,u,4,3>, lane 1 + 161926454U, // : Cost 1 vdup0 RHS + 1551428918U, // : Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // : Cost 2 vext3 RHS, <4,4,6,6> + 2109923329U, // : Cost 2 ins <4,u,4,7>, lane 1 + 161926454U, // : Cost 1 vdup0 RHS + 1493983334U, // : Cost 2 vext1 <2,u,4,5>, LHS + 2101379075U, // : Cost 2 ins <3,4,5,u>, lane 3 + 1493985379U, // : Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2101379075U, // : Cost 2 ins <3,4,5,u>, lane 3 + 1493986614U, // : Cost 2 vext1 <2,u,4,5>, RHS + 789794102U, // : Cost 1 vzipl RHS, RHS + 537750838U, // : Cost 1 vext3 LHS, RHS + 1756368182U, // : Cost 2 vuzpr <1,u,3,4>, RHS + 537750856U, // : Cost 1 vext3 LHS, RHS + 1482048178U, // : Cost 2 vext1 <0,u,4,6>, <0,u,4,6> + 2107424771U, // : Cost 2 ins <4,4,6,u>, lane 3 + 2110029825U, // : Cost 2 ins <4,u,6,2>, lane 1 + 2107424771U, // : Cost 2 ins <4,4,6,u>, lane 3 + 1482050870U, // : Cost 2 vext1 <0,u,4,6>, RHS + 1997753234U, // : Cost 2 vtrnl RHS, <4,0,5,1> + 924011830U, // : Cost 1 vtrnl RHS, RHS + 1036328961U, // : Cost 1 ins RHS, lane 1 + 924011848U, // : Cost 1 vtrnl RHS, RHS + 2116059137U, // : Cost 2 ins <5,u,7,0>, lane 1 + 2113470467U, // 
: Cost 2 ins <5,4,7,u>, lane 3 + 2113470467U, // : Cost 2 ins <5,4,7,u>, lane 3 + 2116083713U, // : Cost 2 ins <5,u,7,3>, lane 1 + 1906904272U, // : Cost 2 vzipr RHS, <4,4,4,4> + 1906902734U, // : Cost 2 vzipr RHS, <2,3,4,5> + 96808489U, // : Cost 1 vrev RHS + 2116116481U, // : Cost 2 ins <5,u,7,7>, lane 1 + 96955963U, // : Cost 1 vrev RHS + 1482064564U, // : Cost 2 vext1 <0,u,4,u>, <0,u,4,u> + 1551431470U, // : Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // : Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 1012113409U, // : Cost 1 ins LHS, lane 1 + 161926454U, // : Cost 1 vdup0 RHS + 791457078U, // : Cost 1 vzipl LHS, RHS + 537751081U, // : Cost 1 vext3 LHS, RHS + 1036328961U, // : Cost 1 ins RHS, lane 1 + 537751099U, // : Cost 1 vext3 LHS, RHS + 2085683201U, // : Cost 2 ins <0,u,0,0>, lane 1 + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1727914086U, // : Cost 2 vuzpl , LHS + 2085707777U, // : Cost 2 ins <0,u,0,3>, lane 1 + 1546273106U, // : Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 1678778497U, // : Cost 2 vuzpl <0,1,5,3>, <0,1,5,3> + 2108219394U, // : Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1505968230U, // : Cost 2 vext1 <4,u,5,1>, LHS + 1860554448U, // : Cost 2 vzipl LHS, <5,1,7,3> + 2103689217U, // : Cost 2 ins <3,u,1,2>, lane 1 + 1750253670U, // : Cost 2 vuzpr <0,u,1,5>, LHS + 1505971738U, // : Cost 2 vext1 <4,u,5,1>, <4,u,5,1> + 1860554756U, // : Cost 2 vzipl LHS, <5,5,5,5> + 1860554850U, // : Cost 2 vzipl LHS, <5,6,7,0> + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1034485762U, // : Cost 1 ins RHS, lane 2 + 2085830657U, // : Cost 2 ins <0,u,2,0>, lane 1 + 1994772608U, // : Cost 2 vtrnl LHS, <5,7,1,3> + 2085847041U, // : Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 2085863425U, // : Cost 2 ins <0,u,2,4>, lane 1 + 1994772484U, // : Cost 2 vtrnl LHS, <5,5,5,5> + 2085879809U, // : Cost 2 ins <0,u,2,6>, lane 1 + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 2091876353U, // : Cost 2 ins <1,u,3,0>, lane 1 + 1176121553U, // : Cost 2 vrev <5,u,1,3> + 2091892737U, // : Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // : Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // : Cost 2 ins <1,u,3,4>, lane 1 + 1928105178U, // : Cost 2 vzipr LHS, <4,4,5,5> + 1880328706U, // : Cost 2 vzipr LHS, <3,4,5,6> + 945007926U, // : Cost 1 vtrnr LHS, RHS + 945007927U, // : Cost 1 vtrnr LHS, RHS + 2108170242U, // : Cost 2 ins <4,5,u,0>, lane 2 + 2108178434U, // : Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // : Cost 2 ins <4,5,u,2>, lane 2 + 2086002689U, // : Cost 2 ins <0,u,4,3>, lane 1 + 1845022662U, // : Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1727917366U, // : Cost 2 vuzpl , RHS + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1506000998U, // : Cost 2 vext1 <4,u,5,5>, LHS + 1863536336U, // : Cost 2 vzipl RHS, <5,1,7,3> + 2108186626U, // : Cost 2 ins <4,5,u,2>, lane 2 + 2086076417U, // : Cost 2 ins <0,u,5,3>, lane 1 + 1506004510U, // : Cost 2 vext1 <4,u,5,5>, <4,u,5,5> + 229035318U, // : Cost 1 vdup1 RHS + 1863536738U, // : Cost 2 vzipl RHS, <5,6,7,0> + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1034485762U, // : Cost 1 ins RHS, lane 2 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 
ins RHS, lane 3 + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 1488101478U, // : Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // : Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2114134019U, // : Cost 2 ins <5,5,7,u>, lane 3 + 2133999620U, // : Cost 2 ins , lane 4 + 1488104758U, // : Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // : Cost 2 vext3 RHS, <5,7,5,7> + 1906903554U, // : Cost 2 vzipr RHS, <3,4,5,6> + 969223478U, // : Cost 1 vtrnr RHS, RHS + 969223479U, // : Cost 1 vtrnr RHS, RHS + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034493957U, // : Cost 1 ins RHS, lane 5 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 1729314816U, // : Cost 2 vuzpl RHS, <0,0,0,0> + 1545470054U, // : Cost 2 vext2 <0,2,u,6>, LHS + 655573094U, // : Cost 1 vuzpl RHS, LHS + 2108309507U, // : Cost 2 ins <4,6,0,u>, lane 3 + 1546797458U, // : Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2108309507U, // : Cost 2 ins <4,6,0,u>, lane 3 + 2108882946U, // : Cost 2 ins <4,6,u,6>, lane 2 + 1886940470U, // : Cost 2 vzipr <1,2,u,0>, RHS + 655573148U, // : Cost 1 vuzpl RHS, LHS + 1182004127U, // : Cost 2 vrev <6,u,0,1> + 1729315636U, // : Cost 2 vuzpl RHS, <1,1,1,1> + 1860555258U, // : Cost 2 vzipl LHS, <6,2,7,3> + 1750335590U, // : Cost 2 vuzpr <0,u,2,6>, LHS + 2114838530U, // : Cost 2 ins <5,6,u,4>, lane 2 + 1729315840U, // : Cost 2 vuzpl RHS, <1,3,5,7> + 1860555576U, // : Cost 2 vzipl LHS, <6,6,6,6> + 1884958006U, // : Cost 2 vzipr <0,u,u,1>, RHS + 1750335595U, // : Cost 2 vuzpr <0,u,2,6>, LHS + 1506050150U, // : Cost 2 vext1 <4,u,6,2>, LHS + 2085838849U, // : Cost 2 ins <0,u,2,1>, lane 1 + 1729316456U, // : Cost 2 vuzpl RHS, <2,2,2,2> + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1506053668U, // : Cost 2 vext1 <4,u,6,2>, <4,u,6,2> + 2085871617U, // : Cost 2 ins <0,u,2,5>, lane 1 + 1994773304U, // : Cost 2 vtrnl LHS, <6,6,6,6> + 1880984886U, // : Cost 2 vzipr <0,2,u,2>, RHS + 1012113409U, // : Cost 1 ins LHS, lane 1 + 2066526306U, // : Cost 2 vtrnr LHS, <5,6,7,0> + 1729317014U, // : Cost 2 vuzpl RHS, <3,0,1,2> + 1928104860U, // : Cost 2 vzipr LHS, <4,0,6,2> + 1729317276U, // : Cost 2 vuzpl RHS, <3,3,3,3> + 1564715549U, // : Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 1729317378U, // : Cost 2 vuzpl RHS, <3,4,5,6> + 1928105188U, // : Cost 2 vzipr LHS, <4,4,6,6> + 806587702U, // : Cost 1 vzipr LHS, RHS + 806587703U, // : Cost 1 vzipr LHS, RHS + 1729318220U, // : Cost 2 vuzpl RHS, <4,6,0,2> + 2108604419U, // : Cost 2 ins <4,6,4,u>, lane 3 + 2108850178U, // : Cost 2 ins <4,6,u,2>, lane 2 + 2108604419U, // : Cost 2 ins <4,6,4,u>, lane 3 + 1729318096U, // : Cost 2 vuzpl RHS, <4,4,4,4> + 1545473334U, // : Cost 2 vext2 <0,2,u,6>, RHS + 655576374U, // : Cost 1 vuzpl RHS, RHS + 1886973238U, // : Cost 2 vzipr <1,2,u,4>, RHS + 655576392U, // : Cost 1 vuzpl RHS, RHS + 2114805762U, // : Cost 2 ins <5,6,u,0>, lane 2 + 1729319040U, // : Cost 2 vuzpl RHS, <5,7,1,3> + 1863537146U, // : Cost 2 vzipl RHS, <6,2,7,3> + 2086076417U, // : Cost 2 ins <0,u,5,3>, lane 1 + 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 1729318916U, // : Cost 2 vuzpl RHS, <5,5,5,5> + 1863537464U, // : Cost 2 vzipl RHS, <6,6,6,6> + 1750338870U, // : Cost 2 vuzpr <0,u,2,6>, RHS + 1750338871U, // : Cost 2 vuzpr <0,u,2,6>, RHS + 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS + 2110021633U, // : Cost 2 ins <4,u,6,1>, lane 1 + 2110029825U, // 
: Cost 2 ins <4,u,6,2>, lane 1 + 2086150145U, // : Cost 2 ins <0,u,6,3>, lane 1 + 1506086440U, // : Cost 2 vext1 <4,u,6,6>, <4,u,6,6> + 2110054401U, // : Cost 2 ins <4,u,6,5>, lane 1 + 296144182U, // : Cost 1 vdup2 RHS + 1036328961U, // : Cost 1 ins RHS, lane 1 + 1036328961U, // : Cost 1 ins RHS, lane 1 + 432349286U, // : Cost 1 vext1 RHS, LHS + 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> + 1906903964U, // : Cost 2 vzipr RHS, <4,0,6,2> + 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // : Cost 1 vext1 RHS, RHS + 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> + 1906904292U, // : Cost 2 vzipr RHS, <4,4,6,6> + 833162550U, // : Cost 1 vzipr RHS, RHS + 833162551U, // : Cost 1 vzipr RHS, RHS + 432357478U, // : Cost 1 vext1 RHS, LHS + 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS + 655578926U, // : Cost 1 vuzpl RHS, LHS + 1012113409U, // : Cost 1 ins LHS, lane 1 + 432361002U, // : Cost 1 vext1 RHS, RHS + 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS + 655579290U, // : Cost 1 vuzpl RHS, RHS + 806628662U, // : Cost 1 vzipr LHS, RHS + 806628663U, // : Cost 1 vzipr LHS, RHS + 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // : Cost 1 vext2 RHS, LHS + 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> + 2115493890U, // : Cost 2 ins <5,7,u,3>, lane 2 + 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2120916995U, // : Cost 2 ins <6,7,0,u>, lane 3 + 2115526658U, // : Cost 2 ins <5,7,u,7>, lane 2 + 497615517U, // : Cost 1 vext2 RHS, LHS + 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> + 700784742U, // : Cost 1 vuzpr RHS, LHS + 1860556134U, // : Cost 2 vzipl LHS, <7,4,5,6> + 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2115018755U, // : Cost 2 ins <5,7,1,u>, lane 3 + 1860556396U, // : Cost 2 vzipl LHS, <7,7,7,7> + 700784747U, // : Cost 1 vuzpr RHS, LHS + 1774527382U, // : Cost 2 vuzpr RHS, <1,2,3,0> + 1188058754U, // : Cost 2 vrev <7,u,1,2> + 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1774527386U, // : Cost 2 vuzpr RHS, <1,2,3,4> + 1994773862U, // : Cost 2 vtrnl LHS, <7,4,5,6> + 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 1994774124U, // : Cost 2 vtrnl LHS, <7,7,7,7> + 1012113409U, // : Cost 1 ins LHS, lane 1 + 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> + 1774528166U, // : Cost 2 vuzpr RHS, <2,3,0,1> + 2091892737U, // : Cost 2 ins <1,u,3,2>, lane 1 + 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> + 1774528206U, // : Cost 2 vuzpr RHS, <2,3,4,5> + 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 1774527488U, // : Cost 2 vuzpr RHS, <1,3,5,7> + 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // : Cost 2 ins <6,7,u,1>, lane 2 + 2121211907U, // : Cost 2 ins <6,7,4,u>, lane 3 + 2115493890U, // : Cost 2 ins <5,7,u,3>, lane 2 + 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // : Cost 1 vext2 RHS, RHS + 1571360076U, // : Cost 2 vext2 RHS, <4,6,0,2> + 2115526658U, // : Cost 2 ins <5,7,u,7>, lane 2 + 497618473U, // : Cost 1 vext2 RHS, RHS + 1863537658U, // : Cost 2 vzipl RHS, <7,0,1,2> + 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> + 2115313667U, // : Cost 2 ins <5,7,5,u>, lane 3 + 2115493890U, // : Cost 2 ins <5,7,u,3>, lane 2 + 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // : Cost 2 vext2 
RHS, <5,6,7,0> + 700788022U, // : Cost 1 vuzpr RHS, RHS + 700788023U, // : Cost 1 vuzpr RHS, RHS + 1774530658U, // : Cost 2 vuzpr RHS, <5,6,7,0> + 1997755386U, // : Cost 2 vtrnl RHS, <7,0,1,2> + 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> + 2115493890U, // : Cost 2 ins <5,7,u,3>, lane 2 + 1774530662U, // : Cost 2 vuzpr RHS, <5,6,7,4> + 1188386474U, // : Cost 2 vrev <7,u,5,6> + 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> + 1036328961U, // : Cost 1 ins RHS, lane 1 + 1036328961U, // : Cost 1 ins RHS, lane 1 + 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> + 1774531406U, // : Cost 2 vuzpr RHS, <6,7,0,1> + 2127405059U, // : Cost 2 ins <7,7,7,u>, lane 3 + 1906904784U, // : Cost 2 vzipr RHS, <5,1,7,3> + 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> + 1774531446U, // : Cost 2 vuzpr RHS, <6,7,4,5> + 1906905030U, // : Cost 2 vzipr RHS, <5,4,7,6> + 363253046U, // : Cost 1 vdup3 RHS + 363253046U, // : Cost 1 vdup3 RHS + 1571362515U, // : Cost 2 vext2 RHS, + 497620782U, // : Cost 1 vext2 RHS, LHS + 1571362693U, // : Cost 2 vext2 RHS, + 700785309U, // : Cost 1 vuzpr RHS, LHS + 1571362879U, // : Cost 2 vext2 RHS, + 497621146U, // : Cost 1 vext2 RHS, RHS + 1571363024U, // : Cost 2 vext2 RHS, + 700788265U, // : Cost 1 vuzpr RHS, RHS + 497621349U, // : Cost 1 vext2 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 471081121U, // : Cost 1 vext2 LHS, LHS + 653033574U, // : Cost 1 vuzpl LHS, LHS + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1995282586U, // : Cost 2 vtrnl , RHS + 1034485762U, // : Cost 1 ins RHS, lane 2 + 471081629U, // : Cost 1 vext2 LHS, LHS + 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> + 786814766U, // : Cost 1 vzipl LHS, LHS + 537753390U, // : Cost 1 vext3 LHS, LHS + 676610150U, // : Cost 1 vuzpr LHS, LHS + 1482304822U, // : Cost 2 vext1 <0,u,u,1>, RHS + 786815130U, // : Cost 1 vzipl LHS, RHS + 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1034485762U, // : Cost 1 ins RHS, lane 2 + 537753444U, // : Cost 1 vext3 LHS, LHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 921032494U, // : Cost 1 vtrnl LHS, LHS + 835584U, // : Cost 0 copy LHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 1007509507U, // : Cost 1 ins LHS, lane 3 + 921032858U, // : Cost 1 vtrnl LHS, RHS + 1007509507U, // : Cost 1 ins LHS, lane 3 + 835584U, // : Cost 0 copy LHS + 408576723U, // : Cost 1 vext1 LHS, LHS + 1880327918U, // : Cost 2 vzipr LHS, <2,3,u,1> + 120371557U, // : Cost 1 vrev LHS + 806584476U, // : Cost 1 vzipr LHS, LHS + 408579382U, // : Cost 1 vext1 LHS, RHS + 1880327922U, // : Cost 2 vzipr LHS, <2,3,u,5> + 1880326384U, // : Cost 2 vzipr LHS, <0,2,u,6> + 806587720U, // : Cost 1 vzipr LHS, RHS + 806584481U, // : Cost 1 vzipr LHS, LHS + 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // : Cost 2 vext3 LHS, + 1007910914U, // : Cost 1 ins LHS, lane 2 + 161926454U, // : Cost 1 vdup0 RHS + 471084342U, // : Cost 1 vext2 LHS, RHS + 653036854U, // : Cost 1 vuzpl LHS, RHS + 1034485762U, // : Cost 1 ins RHS, lane 2 + 471084585U, // : Cost 1 vext2 LHS, RHS + 1482334933U, // : Cost 2 vext1 <0,u,u,5>, <0,u,u,5> + 789796654U, // : Cost 1 vzipl RHS, LHS + 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1482337590U, // : Cost 2 vext1 <0,u,u,5>, RHS + 789797018U, // : Cost 1 vzipl RHS, RHS + 537753754U, // : Cost 1 vext3 LHS, RHS + 676613430U, // : Cost 1 vuzpr LHS, 
RHS + 537753772U, // : Cost 1 vext3 LHS, RHS + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 924014382U, // : Cost 1 vtrnl RHS, LHS + 1007910914U, // : Cost 1 ins LHS, lane 2 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 1034346499U, // : Cost 1 ins RHS, lane 3 + 924014746U, // : Cost 1 vtrnl RHS, RHS + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 432496742U, // : Cost 1 vext1 RHS, LHS + 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 833159324U, // : Cost 1 vzipr RHS, LHS + 432500283U, // : Cost 1 vext1 RHS, RHS + 1906901393U, // : Cost 2 vzipr RHS, <0,4,u,5> + 120699277U, // : Cost 1 vrev RHS + 833162568U, // : Cost 1 vzipr RHS, RHS + 833159329U, // : Cost 1 vzipr RHS, LHS + 408617688U, // : Cost 1 vext1 LHS, LHS + 471086894U, // : Cost 1 vext2 LHS, LHS + 537753957U, // : Cost 1 vext3 LHS, LHS + 835584U, // : Cost 0 copy LHS + 408620342U, // : Cost 1 vext1 LHS, RHS + 471087258U, // : Cost 1 vext2 LHS, RHS + 537753997U, // : Cost 1 vext3 LHS, RHS + 27705344U, // : Cost 0 copy RHS + 835584U, // : Cost 0 copy LHS + 0}; + +static unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) { + assert(M.size() == 4 && "Expected a 4 entry perfect shuffle"); + + // Special case zero-cost nop copies, from either LHS or RHS. + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index(); + })) + return 0; + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index() + 4; + })) + return 0; + + // Get the four mask elements from the 2 inputs. Perfect shuffles encode undef + // elements with value 8. + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + assert(M[i] < 8 && "Expected a maximum entry of 8 for shuffle mask"); + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + // And extract the cost from the upper bits. The cost is encoded as Cost-1.
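+ // For example, the table entry 1638469841U above has bits 31:30 equal to + // 0b01 and so decodes to a cost of 1 + 1 = 2, matching the "Cost 2" text in + // its comment; entries annotated "Cost 1" carry 0b00 in those bits.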
+ return (PFEntry >> 30) + 1; +} #endif diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index d1b901e58d27..f7c06b9fb71b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -32,6 +33,8 @@ using namespace llvm; +#define GET_CC_REGISTER_LISTS +#include "AArch64GenCallingConv.inc" #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" @@ -63,14 +66,6 @@ bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg, return true; } -bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) { - const Function &F = MF->getFunction(); - return isa<ScalableVectorType>(F.getReturnType()) || - any_of(F.args(), [](const Argument &Arg) { - return isa<ScalableVectorType>(Arg.getType()); - }); -} - const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); @@ -108,7 +103,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. return CSR_AArch64_AAPCS_X18_SaveList; - if (hasSVEArgsOrReturn(MF)) + if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } @@ -335,6 +330,13 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) markSuperRegs(Reserved, AArch64::W16); + // SME tiles are not allocatable.
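+ // Every tile view of the accumulator (ZAB0, ZAH0-ZAH1, ZAS0-ZAS3, ZAD0-ZAD7 + // and ZAQ0-ZAQ15) is a subregister of ZA, so a single walk of the + // subregister iterator below reserves them all.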
+ if (MF.getSubtarget<AArch64Subtarget>().hasSME()) { + for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true); + SubReg.isValid(); ++SubReg) + Reserved.set(*SubReg); + } + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } @@ -417,6 +419,68 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } +bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const { + CallingConv::ID CC = MF.getFunction().getCallingConv(); + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + bool IsVarArg = STI.isCallingConvWin64(MF.getFunction().getCallingConv()); + + auto HasReg = [](ArrayRef<MCRegister> RegList, MCRegister Reg) { + return llvm::any_of(RegList, + [Reg](const MCRegister R) { return R == Reg; }); + }; + + switch (CC) { + default: + report_fatal_error("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return HasReg(CC_AArch64_WebKit_JS_ArgRegs, Reg); + case CallingConv::GHC: + return HasReg(CC_AArch64_GHC_ArgRegs, Reg); + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: + case CallingConv::Swift: + case CallingConv::SwiftTail: + case CallingConv::Tail: + if (STI.isTargetWindows() && IsVarArg) + return HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg); + if (!STI.isTargetDarwin()) { + switch (CC) { + default: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + case CallingConv::Swift: + case CallingConv::SwiftTail: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg) || + HasReg(CC_AArch64_AAPCS_Swift_ArgRegs, Reg); + } + } + if (!IsVarArg) { + switch (CC) { + default: + return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg); + case CallingConv::Swift: + case CallingConv::SwiftTail: + return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg) || + HasReg(CC_AArch64_DarwinPCS_Swift_ArgRegs, Reg); + } + } + if (STI.isTargetILP32()) + return HasReg(CC_AArch64_DarwinPCS_ILP32_VarArg_ArgRegs, Reg); + return HasReg(CC_AArch64_DarwinPCS_VarArg_ArgRegs, Reg); + case CallingConv::Win64: + if (IsVarArg) + return HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg); + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + case CallingConv::CFGuard_Check: + return HasReg(CC_AArch64_Win64_CFGuard_Check_ArgRegs, Reg); + case CallingConv::AArch64_VectorCall: + case CallingConv::AArch64_SVE_VectorCall: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + } +} + Register AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -588,23 +652,31 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, // Create a scratch register for the frame index elimination in an instruction. // This function has special handling of stack tagging loop pseudos, in which -// case it can also change the instruction opcode (but not the operands). +// case it can also change the instruction opcode. static Register -createScratchRegisterForInstruction(MachineInstr &MI, +createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum, const AArch64InstrInfo *TII) { // ST*Gloop have a reserved scratch register in operand 1. Use it, and also // replace the instruction with the writeback variant because it will now // satisfy the operand constraints for it.
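// The frame index operand is then rewritten to the scratch register and tied // to the writeback def, so the register allocator must assign both to the // same physical register.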
- if (MI.getOpcode() == AArch64::STGloop) { - MI.setDesc(TII->get(AArch64::STGloop_wback)); - return MI.getOperand(1).getReg(); - } else if (MI.getOpcode() == AArch64::STZGloop) { - MI.setDesc(TII->get(AArch64::STZGloop_wback)); - return MI.getOperand(1).getReg(); + Register ScratchReg; + if (MI.getOpcode() == AArch64::STGloop || + MI.getOpcode() == AArch64::STZGloop) { + assert(FIOperandNum == 3 && + "Wrong frame index operand for STGloop/STZGloop"); + unsigned Op = MI.getOpcode() == AArch64::STGloop ? AArch64::STGloop_wback + : AArch64::STZGloop_wback; + ScratchReg = MI.getOperand(1).getReg(); + MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true); + MI.setDesc(TII->get(Op)); + MI.tieOperands(1, 3); } else { - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); + ScratchReg = + MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); } + return ScratchReg; } void AArch64RegisterInfo::getOffsetOpcodes( @@ -721,9 +793,9 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - Register ScratchReg = createScratchRegisterForInstruction(MI, TII); + Register ScratchReg = + createScratchRegisterForInstruction(MI, FIOperandNum, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 0c871ac089a7..12dd70fa4aa8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -42,8 +42,6 @@ public: void UpdateCustomCallPreservedMask(MachineFunction &MF, const uint32_t **Mask) const; - static bool hasSVEArgsOrReturn(const MachineFunction *MF); - /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const; @@ -120,6 +118,9 @@ public: bool hasBasePointer(const MachineFunction &MF) const; unsigned getBaseRegister() const; + bool isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const override; + // Debug information queries. 
Register getFrameRegister(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 70daf5abf81d..7a2b165570cb 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -871,7 +871,7 @@ class ZPRRegOp class PPRClass<int lastreg> : RegisterClass< "AArch64", - [ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16, + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", 0, lastreg)> { let Size = 16; } @@ -1212,26 +1212,28 @@ let SubRegIndices = [zasubb] in { // SME Register Classes -// Accumulator array -def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> { - let Size = 2048; -} +let isAllocatable = 0 in { + // Accumulator array + def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> { + let Size = 2048; + } -// Accumulator array as single tiles -def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> { - let Size = 2048; -} -def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> { - let Size = 1024; -} -def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> { - let Size = 512; -} -def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> { - let Size = 256; -} -def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> { - let Size = 128; + // Accumulator array as single tiles + def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> { + let Size = 2048; + } + def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> { + let Size = 1024; + } + def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> { + let Size = 512; + } + def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> { + let Size = 256; + } + def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> { + let Size = 128; + } } // SME Register Operands @@ -1385,3 +1387,12 @@ def svcr_op : Operand<i32> { return AArch64SVCR::lookupSVCRByEncoding(MCOp.getImm()) != nullptr; }]; } + +//===----------------------------------------------------------------------===// +// Register categories.
+// + +def GeneralPurposeRegisters : RegisterCategory<[GPR64, GPR32]>; + +def FIXED_REGS : RegisterClass<"AArch64", [i64], 64, (add FP, SP, VG, FFR)>; +def FixedRegisters : RegisterCategory<[CCR, FIXED_REGS]>; diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index c4965e7146ff..364ce687fd55 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -360,8 +360,8 @@ AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB, assert(ImpSPOpIdx != -1); int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx); int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx); - BL->RemoveOperand(FirstOpIdxToRemove); - BL->RemoveOperand(SecondOpIdxToRemove); + BL->removeOperand(FirstOpIdxToRemove); + BL->removeOperand(SecondOpIdxToRemove); // Now copy over the implicit operands from the original BLR BL->copyImplicitOps(MF, BLR); MF.moveCallSiteInfo(&BLR, BL); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index aacace64e998..e595d20c8d4e 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -14,9 +14,18 @@ // Add vector elements horizontally or vertically to ZA tile. //===----------------------------------------------------------------------===// +def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; +def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>; + let Predicates = [HasSME] in { +def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>; +def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>; +def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>; + def ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha">; def ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva">; + +def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; } let Predicates = [HasSMEI64] in { @@ -29,41 +38,41 @@ let Predicates = [HasSME] in { // Outer products //===----------------------------------------------------------------------===// -defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">; -defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">; +defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa", int_aarch64_sme_mopa_wide>; +defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops", int_aarch64_sme_mops_wide>; -def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">; -def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">; +defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSMEF64] in { -def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">; -def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">; +defm FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSME] in { -defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">; -defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">; - -def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">; -def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">; -def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">; -def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">; -def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, 
"sumopa">; -def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">; -def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">; -def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">; +defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa", int_aarch64_sme_mopa_wide>; +defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops", int_aarch64_sme_mops_wide>; + +defm SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa", int_aarch64_sme_smopa_wide>; +defm SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops", int_aarch64_sme_smops_wide>; +defm UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa", int_aarch64_sme_umopa_wide>; +defm UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops", int_aarch64_sme_umops_wide>; +defm SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa", int_aarch64_sme_sumopa_wide>; +defm SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops", int_aarch64_sme_sumops_wide>; +defm USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa", int_aarch64_sme_usmopa_wide>; +defm USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops", int_aarch64_sme_usmops_wide>; } let Predicates = [HasSMEI64] in { -def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">; -def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">; -def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">; -def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">; -def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">; -def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">; -def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">; -def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">; +defm SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa", int_aarch64_sme_smopa_wide>; +defm SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops", int_aarch64_sme_smops_wide>; +defm UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa", int_aarch64_sme_umopa_wide>; +defm UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops", int_aarch64_sme_umops_wide>; +defm SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa", int_aarch64_sme_sumopa_wide>; +defm SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops", int_aarch64_sme_sumops_wide>; +defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme_usmopa_wide>; +defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>; } let Predicates = [HasSME] in { @@ -129,15 +138,21 @@ def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; +// Read and write TPIDR2_EL0 +def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), + (MSR 0xde85, GPR64:$val)>; +def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), + (MRS 0xde85)>; + //===----------------------------------------------------------------------===// // SVE2 instructions //===----------------------------------------------------------------------===// -def REVD_ZPmZ : sve2_int_perm_revd<"revd">; +defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>; -defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>; -defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>; +defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp>; +defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp>; -defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">; +defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; } // End let Predicates = [HasSME] diff --git 
a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1d162610de9c..68ff1b78e84b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -165,8 +165,8 @@ def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; def SDT_AArch64Arith : SDTypeProfile<1, 3, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, - SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3> + SDTCisVec<0>, SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, + SDTCisSameAs<2,3>, SDTCisSameNumEltsAs<0,1> ]>; def SDT_AArch64FMA : SDTypeProfile<1, 4, [ @@ -175,7 +175,6 @@ def SDT_AArch64FMA : SDTypeProfile<1, 4, [ ]>; // Predicated operations with the result of inactive lanes being unspecified. -def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; @@ -194,7 +193,6 @@ def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; -def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; @@ -235,6 +233,7 @@ def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revd_mt : SDNode<"AArch64ISD::REVD_MERGE_PASSTHRU", SDT_AArch64Arith>; // These are like the above but we don't yet have need for ISD nodes. They allow // a single pattern to match intrinsic and ISD operand layouts. 
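// A minimal sketch of the idiom described above, with hypothetical names
// (int_aarch64_sve_example / AArch64example_node are illustrative only and
// not defined in this patch): listing both operand layouts in one PatFrags
// lets a single instruction pattern match whichever form appears in the DAG.
def AArch64example_mt : PatFrags<(ops node:$pg, node:$op, node:$pt),
                                 [(int_aarch64_sve_example node:$pt, node:$pg, node:$op),
                                  (AArch64example_node node:$pg, node:$op, node:$pt)]>;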
@@ -242,6 +241,26 @@ def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_ def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>; def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>; +def AArch64fmul_m1 : EitherVSelectOrPassthruPatFrags; +def AArch64fadd_m1 : EitherVSelectOrPassthruPatFrags; +def AArch64fsub_m1 : EitherVSelectOrPassthruPatFrags; + +def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>; + +def AArch64uaba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_uaba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64uabd_p (SVEAllActive), node:$op2, node:$op3))]>; + +def AArch64usra : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_usra node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64lsr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>; + +def AArch64ssra : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_ssra node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64asr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1> @@ -282,6 +301,14 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>; +// FMAs with a negated multiplication operand can be commuted. 
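// In other words fma(-a, b, c) == fma(b, -a, c), which is why the PatFrags
// below lists both operand orders; either placement of the fneg selects to
// the same FMLS pattern.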
+def AArch64fmls_p : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(AArch64fma_p node:$pred, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op2, node:$op3), + (AArch64fma_p node:$pred, node:$op2, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op3)]>; + +def AArch64fsubr_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), + (AArch64fsub_p node:$pg, node:$op2, node:$op1)>; + def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ return N->getFlags().hasNoSignedZeros(); @@ -295,11 +322,14 @@ def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>; def AArch64bic : PatFrags<(ops node:$op1, node:$op2), - [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))), - (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))), + [(and node:$op1, (xor node:$op2, (splat_vector (i32 -1)))), + (and node:$op1, (xor node:$op2, (splat_vector (i64 -1)))), (and node:$op1, (xor node:$op2, (SVEAllActive))), (AArch64bic_node node:$op1, node:$op2)]>; +def AArch64subr : PatFrag<(ops node:$op1, node:$op2), + (sub node:$op2, node:$op1)>; + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; @@ -308,7 +338,7 @@ let Predicates = [HasSVE] in { def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; @@ -325,25 +355,27 @@ let Predicates = [HasSVEorStreamingSVE] in { defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; - defm ADD_ZPZZ : sve_int_bin_pred_bhsd; - defm SUB_ZPZZ : sve_int_bin_pred_bhsd; -} // End HasSVEorStreamingSVE + defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", int_aarch64_sve_orr, DestructiveBinaryComm>; + defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", int_aarch64_sve_eor, DestructiveBinaryComm>; + defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", "AND_ZPZZ", int_aarch64_sve_and, DestructiveBinaryComm>; + defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", "BIC_ZPZZ", int_aarch64_sve_bic, DestructiveBinary>; +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { - defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; - defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; - defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; - defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; + defm ORR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm EOR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm AND_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm BIC_ZPZZ : 
sve_int_bin_pred_zeroing_bhsd; +} // End HasSVEorSME, UseExperimentalZeroingPseudos +let Predicates = [HasSVEorSME] in { defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; - defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>; + defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>; defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; @@ -440,11 +472,11 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", "FMINNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>; defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", "FMAX_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", "FMIN_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>; - + defm FADD_ZPZI : sve_fp_2op_i_p_zds_hfd; defm FSUB_ZPZI : sve_fp_2op_i_p_zds_hfd; defm FMUL_ZPZI : sve_fp_2op_i_p_zds_hfd; - defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd; + defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd; defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_hfd; defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_hfd; defm FMAX_ZPZI : sve_fp_2op_i_p_zds_hfd; @@ -461,9 +493,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMIN_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd; } - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>; - defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>; + defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>; defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; @@ -484,9 +516,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMIN_ZPZZ : sve_fp_bin_pred_hfd; defm FABD_ZPZZ : sve_fp_bin_pred_hfd; defm FDIV_ZPZZ : sve_fp_bin_pred_hfd; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; @@ -499,28 +531,28 @@ let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVEorSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>; 
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>; defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>; defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>; defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>; @@ -545,7 +577,7 @@ let Predicates = [HasSVEorStreamingSVE] in { (!cast("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = Za + -Zn * Zm - def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)), + def : Pat<(Ty (AArch64fmls_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)), (!cast("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = -Za + Zn * Zm @@ -576,26 +608,26 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : fma; defm : fma; defm : fma; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>; defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>; defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>; defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // SVE floating point reductions. 
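// FADDA folds the elements into the scalar strictly in order, so it cannot
// reassociate; unlike the "fast" reductions that follow (FADDV etc.) it is
// also not legal in streaming mode, which is why it stays under HasSVE alone.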
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; @@ -613,7 +645,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; + defm DUP_ZR : sve_int_perm_dup_r<"dup", splat_vector>; defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) @@ -621,61 +653,67 @@ let Predicates = [HasSVEorStreamingSVE] in { defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>; // Duplicate FP scalar into all vector elements - def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv8f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv4f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv2f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), + def : Pat<(nxv4f32 (splat_vector (f32 FPR32:$src))), (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; - def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), + def : Pat<(nxv2f32 (splat_vector (f32 FPR32:$src))), (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; - def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), + def : Pat<(nxv2f64 (splat_vector (f64 FPR64:$src))), (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))), + def : Pat<(nxv8bf16 (splat_vector (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4bf16 (splat_vector (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv2bf16 (splat_vector (bf16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; // Duplicate +0.0 into all vector elements - def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; - def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; - def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv8f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f64 (splat_vector (f64 fpimm0))), (DUP_ZI_D 0, 0)>; + def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; // Duplicate Int 
immediate into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv16i8 (splat_vector (i32 (SVECpyDupImm8Pat i32:$a, i32:$b)))), (DUP_ZI_B $a, $b)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv8i16 (splat_vector (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)))), (DUP_ZI_H $a, $b)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv4i32 (splat_vector (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)))), (DUP_ZI_S $a, $b)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))), + def : Pat<(nxv2i64 (splat_vector (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)))), (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. - def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv2f32 (splat_vector (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv4f32 (splat_vector (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))), + def : Pat<(nxv2f64 (splat_vector (f64 fpimm:$val))), (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>; // Duplicate FP immediate into all vector elements let AddedComplexity = 2 in { - def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv8f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv4f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv2f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)), + def : Pat<(nxv4f32 (splat_vector fpimm32:$imm8)), (FDUP_ZI_S fpimm32:$imm8)>; - def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)), + def : Pat<(nxv2f32 (splat_vector fpimm32:$imm8)), (FDUP_ZI_S fpimm32:$imm8)>; - def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)), + def : Pat<(nxv2f64 (splat_vector fpimm64:$imm8)), (FDUP_ZI_D fpimm64:$imm8)>; } @@ -683,13 +721,13 @@ let Predicates = [HasSVEorStreamingSVE] in { defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; @@ -710,16 +748,21 @@ let Predicates = [HasSVEorStreamingSVE] in { defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; + // Define pattern for `nxv1i1 splat_vector(1)`. + // We do this here instead of in ISelLowering such that PatFrag's can still + // recognize a splat. 
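// (PTRUE_D 31) materializes an all-true nxv2i1 (predicate pattern 31 means
// ALL), and punpklo keeps its low half, which is exactly an all-true nxv1i1.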
+ def : Pat<(nxv1i1 immAllOnesV), (PUNPKLO_PP (PTRUE_D 31))>; + defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; @@ -831,7 +874,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>; defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>; defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // non-faulting continuous load with reg+immediate @@ -871,7 +914,7 @@ let Predicates = [HasSVE] in { defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // LD(2|3|4) structured loads with reg+immediate defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>; defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>; @@ -899,7 +942,7 @@ let Predicates = [HasSVEorStreamingSVE] in { def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>; def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>; def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. 
@@ -1013,9 +1056,95 @@ let Predicates = [HasSVE] in { defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + + multiclass sve_masked_gather_x2_scaled { + // base + vector of scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))), + (!cast(Inst # _SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))), + (!cast(Inst # _SXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), + (!cast(Inst # _UXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_gather_x2_unscaled { + // vector of pointers + immediate offset (includes zero) + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs))), + (!cast(Inst # _IMM) PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>; + // base + vector of offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))), + (!cast(Inst) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))), + (!cast(Inst # _SXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), + (!cast(Inst # _UXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_gather_x4 { + def : Pat<(Ty (Load (SVEDup0Undef), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs))), + (Inst PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + defm : sve_masked_gather_x2_scaled; + + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + defm : sve_masked_gather_x2_unscaled; + + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : 
sve_masked_gather_x4; + defm : sve_masked_gather_x4; + + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; + defm : sve_masked_gather_x4; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>; @@ -1051,7 +1180,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>; defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Scatters using unpacked, unscaled 32-bit offsets, e.g. @@ -1100,12 +1229,87 @@ let Predicates = [HasSVE] in { // Scatters using scaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; + defm SST1H_D : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; + + multiclass sve_masked_scatter_x2_scaled { + // base + vector of scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)), + (!cast(Inst # _SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)), + (!cast(Inst # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))), + (!cast(Inst # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_scatter_x2_unscaled { + // vector of pointers + immediate offset (includes zero) + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs)), + (!cast(Inst # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>; + // base + vector of offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)), + (!cast(Inst) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)), + (!cast(Inst # _SXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and 
(nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))), + (!cast(Inst # _UXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_scatter_x4 { + def : Pat<(Store (Ty ZPR:$data), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs)), + (Inst ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + defm : sve_masked_scatter_x2_scaled; + + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + defm : sve_masked_scatter_x2_unscaled; + + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; + defm : sve_masked_scatter_x4; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>; @@ -1161,7 +1365,7 @@ let Predicates = [HasSVEorStreamingSVE] in { // Contiguous prefetch (register + register) def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>; def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>; - def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; + def PRFW_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; multiclass sve_prefetch { @@ -1184,9 +1388,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : sve_prefetch; defm : sve_prefetch; - defm : sve_prefetch; + defm : sve_prefetch; defm : sve_prefetch; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Gather prefetch using scaled 32-bit offsets, e.g. @@ -1249,7 +1453,7 @@ let Predicates = [HasSVE] in { // Patterns to generate adr instruction. 
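// ADR forms per-lane addresses: a base vector plus an (optionally extended
// and shifted) offset vector, so the explicit add/and/shl trees matched
// below collapse into a single instruction, e.g.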
// adr z0.d, [z0.d, z0.d, uxtw] def : Pat<(add nxv2i64:$Op1, - (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))))), + (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), (ADR_UXTW_ZZZ_D_0 $Op1, $Op2)>; // adr z0.d, [z0.d, z0.d, sxtw] def : Pat<(add nxv2i64:$Op1, @@ -1262,7 +1466,7 @@ let Predicates = [HasSVE] in { def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), Ty:$Op2, - (Ty (AArch64dup (ShiftTy ShiftAmt)))))), + (Ty (splat_vector (ShiftTy ShiftAmt)))))), (DestAdrIns $Op1, $Op2)>; } defm : adrShiftPat; @@ -1277,14 +1481,14 @@ let Predicates = [HasSVE] in { multiclass adrXtwShiftPat { def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), - (Ty (and Ty:$Op2, (Ty (AArch64dup (i64 0xFFFFFFFF))))), - (Ty (AArch64dup (i64 ShiftAmt)))))), + (Ty (and Ty:$Op2, (Ty (splat_vector (i64 0xFFFFFFFF))))), + (Ty (splat_vector (i64 ShiftAmt)))))), (!cast("ADR_UXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), (Ty (sext_inreg Ty:$Op2, nxv2i32)), - (Ty (AArch64dup (i64 ShiftAmt)))))), + (Ty (splat_vector (i64 ShiftAmt)))))), (!cast("ADR_SXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; } defm : adrXtwShiftPat; @@ -1292,7 +1496,7 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; @@ -1310,6 +1514,10 @@ let Predicates = [HasSVEorStreamingSVE] in { defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; // Extract lo/hi halves of legal predicate types. + def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP PPR:$Ps)>; + def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP PPR:$Ps)>; def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP PPR:$Ps)>; def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), @@ -1400,6 +1608,8 @@ let Predicates = [HasSVEorStreamingSVE] in { (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; // Concatenate two predicates. 
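// Unpacked predicates keep each lane's flag in the low bit of a wider
// granule, so a uzp1 at the next-narrower element size picks up exactly
// those granules from its two operands, in effect concatenating them.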
+ def : Pat<(nxv2i1 (concat_vectors nxv1i1:$p1, nxv1i1:$p2)), + (UZP1_PPP_D $p1, $p2)>; def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), (UZP1_PPP_S $p1, $p2)>; def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), @@ -1475,7 +1685,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETUNE, SETNE, SETUNE, SETNE>; defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -1485,7 +1695,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>; defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>; defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETUNE, SETNE, SETUNE, SETNE>; defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; @@ -1522,7 +1732,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>; defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>; -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>; defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>; defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>; @@ -1619,16 +1829,16 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ASR_ZPZI : sve_int_shift_pred_bhsd; defm LSR_ZPZI : sve_int_shift_pred_bhsd; defm LSL_ZPZI : sve_int_shift_pred_bhsd; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVEorSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; @@ -1679,60 +1889,61 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 
0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), - (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + //These patterns exist to improve the code quality of conversions on unpacked types. + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. // This is ignored by the pattern below where it is matched by (i64 timm0_1) - def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), - (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Floating-point -> signed integer - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + // Signed integer -> Floating-point + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg), + def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 (SVEAllActive):$Pg), (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))), - (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))), - (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Floating-point -> unsigned integer - def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + // Unsigned integer -> Floating-point + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg), + def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive):$Pg), (and (nxv4i32 ZPR:$Zs), - (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), - 
(UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv4i32 (splat_vector (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), - (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), + (UCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), - (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), + (UCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>; @@ -1743,27 +1954,27 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasBF16, HasSVEorStreamingSVE] in { +let Predicates = [HasBF16, HasSVEorSME] in { defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; -} // End HasBF16, HasSVEorStreamingSVE +} // End HasBF16, HasSVEorSME let Predicates = [HasBF16, HasSVE] in { defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>; } // End HasBF16, HasSVE -let Predicates = [HasBF16, HasSVEorStreamingSVE] in { - defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; - defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; - defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; - defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; +let Predicates = [HasBF16, HasSVEorSME] in { + defm BFMLALB_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; + defm BFMLALT_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; + defm BFMLALB_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; + defm BFMLALT_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; -} // End HasBF16, HasSVEorStreamingSVE +} // End HasBF16, HasSVEorSME -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -1875,7 +2086,7 @@ let Predicates = [HasSVEorStreamingSVE] in { let AddedComplexity = 1 in { class LD1RPat : - Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))), + Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), (load (ptrue 31), GPR64:$base, $offset)>; } @@ -1963,22 +2174,22 @@ let Predicates = [HasSVEorStreamingSVE] in { 
GPR32:$op, sub_32), $imm), sub_32))>; - def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), (INCH_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), (INCW_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))), (INCD_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), (DECH_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), (DECW_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))), (DECD_ZPiI ZPR:$op, 31, $imm)>; } - let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL], AddedComplexity = 5 in { + let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in { def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))), (INCH_XPiI GPR64:$op, 31, $imm)>; def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))), @@ -2098,15 +2309,23 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS 
PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; // These allow casting from/to unpacked floating-point types. def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; @@ -2145,12 +2364,12 @@ let Predicates = [HasSVEorStreamingSVE] in { } // 2-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; defm : pred_load; defm : pred_load; defm : pred_load; @@ -2158,18 +2377,18 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : pred_load; // 4-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; defm : pred_load; defm : pred_load; defm : pred_load; defm : pred_load; // 8-element contiguous loads - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; defm : pred_load; defm : pred_load; defm : pred_load; @@ -2397,7 +2616,7 @@ let Predicates = [HasSVEorStreamingSVE] in { // 16-element contiguous loads defm : ld1; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { multiclass ldnf1 { @@ -2482,7 +2701,7 @@ let Predicates = [HasSVE] in { defm : ldff1; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { multiclass st1 { // reg + reg @@ -2716,7 +2935,7 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; } -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE, HasMatMulInt8] in { defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; @@ -2724,11 +2943,11 @@ let Predicates = [HasSVE, HasMatMulInt8] in { defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; } // End HasSVE, HasMatMulInt8 -let Predicates = [HasSVEorStreamingSVE, HasMatMulInt8] in { +let Predicates = [HasSVEorSME, HasMatMulInt8] in { defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>; defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; -} // End HasSVEorStreamingSVE, HasMatMulInt8 +} // End HasSVEorSME, HasMatMulInt8 let Predicates = [HasSVE, HasMatMulFP32] in { defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>; @@ -2746,16 +2965,16 @@ let Predicates = [HasSVE, HasMatMulFP64] in { defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; } // End HasSVE, HasMatMulFP64 -let Predicates = [HasSVEorStreamingSVE, HasMatMulFP64] in { +let Predicates = [HasSVEorSME, HasMatMulFP64] in { defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>; defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; defm 
TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; -} // End HasSVEorStreamingSVE, HasMatMulFP64 +} // End HasSVEorSME, HasMatMulFP64 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 integer multiply-add (indexed) defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; @@ -2903,17 +3122,17 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME -let Predicates = [HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in { defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; -} // End HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVE2orSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 predicated shifts defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>; defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>; @@ -2960,18 +3179,18 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>; - defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>; - defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>; - defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", AArch64ssra>; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", AArch64usra>; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra, int_aarch64_sve_srshr>; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra, int_aarch64_sve_urshr>; // SVE2 complex integer add defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>; defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; // SVE2 integer absolute difference and accumulate - defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; - defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", AArch64saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", AArch64uaba>; // SVE2 integer absolute difference and accumulate long defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; @@ -3026,7 +3245,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>; defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>; defm SQXTUNT_ZZ : 
sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 character match @@ -3034,7 +3253,7 @@ let Predicates = [HasSVE2] in { defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 bitwise exclusive-or interleaved defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; @@ -3049,7 +3268,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 histogram generation (segment) @@ -3059,7 +3278,7 @@ let Predicates = [HasSVE2] in { defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 floating-point base 2 logarithm as integer defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; @@ -3091,7 +3310,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 bitwise ternary operations defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; - defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>; defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; @@ -3101,7 +3320,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 non-temporal gather loads @@ -3120,10 +3339,10 @@ let Predicates = [HasSVE2] in { defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 non-temporal scatter stores @@ -3137,7 +3356,7 @@ let Predicates = [HasSVE2] in { defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>; @@ -3156,7 +3375,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; defm WHILERW_PXX 
  : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
 
 let Predicates = [HasSVE2AES] in {
   // SVE2 crypto destructive binary operations
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 009219ce3c54..c6b112d0d2f1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the machine model for the ARM Cortex-A55 processors.
+// This file defines the machine model for the ARM Cortex-A55 processors. Note
+// that this schedule is currently used as the default for -mcpu=generic. As a
+// result, some of the modelling decisions made do not precisely model the
+// Cortex-A55, instead aiming to be a good compromise between different CPUs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -149,8 +152,31 @@ def : WriteRes { let Latency = 3; }
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 3; }
 def : WriteRes { let Latency = 3; }
-def : WriteRes { let Latency = 4; }
-def : WriteRes { let Latency = 4; let BeginGroup = 1; }
+
+// NEON
+class CortexA55WriteVd<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let Latency = n;
+}
+class CortexA55WriteVq<int n, ProcResourceKind res> : SchedWriteRes<[res, res]> {
+  let Latency = n;
+  let BeginGroup = 1;
+}
+def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>;
+def : SchedAlias<WriteVd, CortexA55WriteVd<4, CortexA55UnitFPALU>>;
+def : SchedAlias<WriteVq, CortexA55WriteVq<4, CortexA55UnitFPALU>>;
 
 // FP ALU specific new schedwrite definitions
 def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;}
@@ -358,4 +384,99 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
 def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
 def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
 
+// 4.15. Advanced SIMD integer instructions
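+//
+// (Illustrative note, not from the vendored source: the CortexA55WriteVd and
+// CortexA55WriteVq classes above encode the 64-bit and 128-bit vector forms.
+// For example, CortexA55WriteAluVq_3 is shorthand for
+//   SchedWriteRes<[CortexA55UnitFPALU, CortexA55UnitFPALU]>
+// with Latency = 3 and BeginGroup = 1: a Q-form ALU operation occupies both
+// 64-bit FPALU slots and must begin a new issue group, while the Vd forms
+// occupy a single slot.)
+//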
+// ASIMD absolute diff
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
+// ASIMD absolute diff accum
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ABAL?v")>;
+// ASIMD absolute diff long
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>;
+// ASIMD arith #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
+           "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
+           "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
+// ASIMD arith #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
+           "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
+           "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
+           "ADDPv(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
+           "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
+           "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
+           "ADDPv(16i8|2i64|4i32|8i16)$")>;
+// ASIMD arith #3
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "SADDLv", "UADDLv", "SADDWv",
+           "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
+// ASIMD arith #5
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "RADDHNv", "RSUBHNv")>;
+// ASIMD arith, reduce
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ADDVv", "SADDLVv", "UADDLVv")>;
+// ASIMD compare #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
+// ASIMD compare #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
+// ASIMD logical #1
+def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8",
+           "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
+def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8",
+           "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
+// ASIMD max/min, basic
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>;
+// ASIMD max/min, reduce
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>;
+// ASIMD multiply, by element
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
+           "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply
+def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>;
+def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>;
+// ASIMD multiply accumulate
+def : InstRW<[CortexA55WriteMlaVd_4], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteMlaVq_4], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
+def : InstRW<[CortexA55WriteMlaIxVq_4], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply accumulate half
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>;
+// ASIMD multiply accumulate long
+def : InstRW<[CortexA55WriteMlaLVq_4], (instregex "[SU]ML[AS]Lv")>;
+// ASIMD multiply accumulate long #2
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>;
+// ASIMD dot product
+def :
InstRW<[CortexA55WriteDotVd_4], (instregex "[SU]DOTv8i8")>; +def : InstRW<[CortexA55WriteDotVq_4], (instregex "[SU]DOTv16i8")>; +// ASIMD dot product, by scalar +def : InstRW<[CortexA55WriteDotScVq_4], (instregex "[SU]DOTlanev")>; +// ASIMD multiply long +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>; +// ASIMD polynomial (8x8) multiply long +def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>; +// ASIMD pairwise add and accumulate +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ADALPv")>; +// ASIMD shift accumulate +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +// ASIMD shift accumulate #2 +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]RSRA[vd]")>; +// ASIMD shift by immed +def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv", + "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; +// ASIMD shift by immed +// SXTL and UXTL are aliases for SHLL +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>; +// ASIMD shift by immed #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)", + "RSHRNv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)", + "RSHRNv(16i8|4i32|8i16)")>; +// ASIMD shift by register +def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>; +// ASIMD shift by register #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>; + } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index fa10d056b7f7..6b053f1969b4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel { list UnsupportedFeatures = [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth, - HasSVE2orStreamingSVE]; + HasSVE2orSME]; let FullInstRWOverlapCheck = 0; } @@ -3348,7 +3348,7 @@ def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>; def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>; // [351] "prfw $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRR)>; // [352] "prfw $prfop, $Pg, [$Rn, $Zm]"; def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>; @@ -3554,7 +3554,7 @@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>; // [421] "st1b $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; // [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>; @@ -3566,7 +3566,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>; // [425] "st1d $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, 
SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D, SST1D_SCALED, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>;
 
 // [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]";
 def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>;
@@ -3578,7 +3578,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>;
 def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>;
 
 // [429] "st1h $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D, SST1H_D_SCALED, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>;
 
 // [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]";
 def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>;
@@ -3590,7 +3590,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>;
 def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>;
 
 // [433] "st1w $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D, SST1W_D_SCALED, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>;
 
 // [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]";
 def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
new file mode 100644
index 000000000000..32f7299fbf87
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
@@ -0,0 +1,1136 @@
+//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Ampere Computing Ampere-1 to
+// support instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// The Ampere-1 core is an out-of-order micro-architecture. The front
+// end has branch prediction, with a 10-cycle recovery time from a
+// mispredicted branch. Instructions coming out of the front end are
+// decoded into internal micro-ops (uops).
+
+def Ampere1Model : SchedMachineModel {
+  let IssueWidth            =   4;  // 4-way decode and dispatch
+  let MicroOpBufferSize     = 174;  // micro-op re-order buffer size
+  let LoadLatency           =   4;  // Optimistic load latency
+  let MispredictPenalty     =  10;  // Branch mispredict penalty
+  let LoopMicroOpBufferSize =  32;  // Instruction queue size
+  let CompleteModel = 1;
+
+  list<SubtargetFeature> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                           SMEUnsupported.F);
+}
+
+let SchedModel = Ampere1Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Ampere-1.
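+//
+// (Illustrative note, not from the vendored source: each ProcResource below
+// models an issue port, and a ProcResGroup lets a write consume whichever
+// member unit is free. A hypothetical two-uop write needing one FP/vector
+// pipe from the X/Y group plus one store-address pipe could be declared as:
+//   def ExampleWrite_2cyc_1XY_1S : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS]> {
+//     let Latency = 2;     // result ready after two cycles
+//     let NumMicroOps = 2; // one XY uop plus one S uop
+//   }
+// ExampleWrite_2cyc_1XY_1S is an invented name; the real defs below follow
+// the same pattern.)
+//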
+// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. + +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : 
SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : 
SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, 
+ Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for 
short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. + +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 18; +} // 32-bit Divide +def : WriteRes { + let Latency = 34; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 4; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 5; +} // Floating-point compare. +def : WriteRes { + let Latency = 6; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 5; +} // Floating-point multiply. +def : WriteRes { + let Latency = 34; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 5; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. + +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 4; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1. 
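+//
+// (Illustrative note, not from the vendored source: the SchedWriteVariant
+// defs above choose a write per instruction instance via MCSchedPredicate;
+// the SchedVar arms are assumed here to map the cheap-LSL case to the
+// single-uop Ampere1Write_1cyc_1AB and the general shifted case to a two-uop
+// write. Under that assumption, with AmpereCheapLSL from
+// AArch64SchedPredAmpere.td:
+//   add x0, x1, x2           // no shift    -> single uop, 1-cycle latency
+//   add x0, x1, x2, lsl #3   // LSL by 1-4  -> still a single uop
+//   add x0, x1, x2, lsl #5   // LSL by 5    -> two-uop, higher-latency variant
+// which matches the "short shifts (LSL shift <= 4) are a single uop" note.)
+//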
+ +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : 
InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures 
from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : 
InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex 
"EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- 
dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 000000000000..8552c07bda56 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - AArch64 
Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td index fcda2394bacf..ee7cc1f5095b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -109,10 +109,7 @@ def ExynosScaledIdxFn : TIIPredicate<"isExynosScaledAddr", def ExynosScaledIdxPred : MCSchedPredicate; // Identify FP instructions. -def ExynosFPPred : MCSchedPredicate>; +def ExynosFPPred : MCSchedPredicate; // Identify 128-bit NEON instructions. def ExynosQFormPred : MCSchedPredicate; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index fc13b23b4cf8..4473f3a53845 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,152 +53,23 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } // Generic predicates. - -// Identify whether an instruction is the 16-bit NEON form based on its result. -def CheckHForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, H0>, - CheckRegOperand<0, H1>, - CheckRegOperand<0, H2>, - CheckRegOperand<0, H3>, - CheckRegOperand<0, H4>, - CheckRegOperand<0, H5>, - CheckRegOperand<0, H6>, - CheckRegOperand<0, H7>, - CheckRegOperand<0, H8>, - CheckRegOperand<0, H9>, - CheckRegOperand<0, H10>, - CheckRegOperand<0, H11>, - CheckRegOperand<0, H12>, - CheckRegOperand<0, H13>, - CheckRegOperand<0, H14>, - CheckRegOperand<0, H15>, - CheckRegOperand<0, H16>, - CheckRegOperand<0, H17>, - CheckRegOperand<0, H18>, - CheckRegOperand<0, H19>, - CheckRegOperand<0, H20>, - CheckRegOperand<0, H21>, - CheckRegOperand<0, H22>, - CheckRegOperand<0, H23>, - CheckRegOperand<0, H24>, - CheckRegOperand<0, H25>, - CheckRegOperand<0, H26>, - CheckRegOperand<0, H27>, - CheckRegOperand<0, H28>, - CheckRegOperand<0, H29>, - CheckRegOperand<0, H30>, - CheckRegOperand<0, H31>]>]>; - -// Identify whether an instruction is the 32-bit NEON form based on its result. 
-def CheckSForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, S0>, - CheckRegOperand<0, S1>, - CheckRegOperand<0, S2>, - CheckRegOperand<0, S3>, - CheckRegOperand<0, S4>, - CheckRegOperand<0, S5>, - CheckRegOperand<0, S6>, - CheckRegOperand<0, S7>, - CheckRegOperand<0, S8>, - CheckRegOperand<0, S9>, - CheckRegOperand<0, S10>, - CheckRegOperand<0, S11>, - CheckRegOperand<0, S12>, - CheckRegOperand<0, S13>, - CheckRegOperand<0, S14>, - CheckRegOperand<0, S15>, - CheckRegOperand<0, S16>, - CheckRegOperand<0, S17>, - CheckRegOperand<0, S18>, - CheckRegOperand<0, S19>, - CheckRegOperand<0, S20>, - CheckRegOperand<0, S21>, - CheckRegOperand<0, S22>, - CheckRegOperand<0, S23>, - CheckRegOperand<0, S24>, - CheckRegOperand<0, S25>, - CheckRegOperand<0, S26>, - CheckRegOperand<0, S27>, - CheckRegOperand<0, S28>, - CheckRegOperand<0, S29>, - CheckRegOperand<0, S30>, - CheckRegOperand<0, S31>]>]>; - -// Identify whether an instruction is the 64-bit NEON form based on its result. -def CheckDForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, D0>, - CheckRegOperand<0, D1>, - CheckRegOperand<0, D2>, - CheckRegOperand<0, D3>, - CheckRegOperand<0, D4>, - CheckRegOperand<0, D5>, - CheckRegOperand<0, D6>, - CheckRegOperand<0, D7>, - CheckRegOperand<0, D8>, - CheckRegOperand<0, D9>, - CheckRegOperand<0, D10>, - CheckRegOperand<0, D11>, - CheckRegOperand<0, D12>, - CheckRegOperand<0, D13>, - CheckRegOperand<0, D14>, - CheckRegOperand<0, D15>, - CheckRegOperand<0, D16>, - CheckRegOperand<0, D17>, - CheckRegOperand<0, D18>, - CheckRegOperand<0, D19>, - CheckRegOperand<0, D20>, - CheckRegOperand<0, D21>, - CheckRegOperand<0, D22>, - CheckRegOperand<0, D23>, - CheckRegOperand<0, D24>, - CheckRegOperand<0, D25>, - CheckRegOperand<0, D26>, - CheckRegOperand<0, D27>, - CheckRegOperand<0, D28>, - CheckRegOperand<0, D29>, - CheckRegOperand<0, D30>, - CheckRegOperand<0, D31>]>]>; +// Identify whether an instruction is NEON or floating point +def CheckFpOrNEON : CheckFunctionPredicateWithTII< + "AArch64_MC::isFpOrNEON", + "AArch64InstrInfo::isFpOrNEON" +>; // Identify whether an instruction is the 128-bit NEON form based on its result. -def CheckQForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, Q0>, - CheckRegOperand<0, Q1>, - CheckRegOperand<0, Q2>, - CheckRegOperand<0, Q3>, - CheckRegOperand<0, Q4>, - CheckRegOperand<0, Q5>, - CheckRegOperand<0, Q6>, - CheckRegOperand<0, Q7>, - CheckRegOperand<0, Q8>, - CheckRegOperand<0, Q9>, - CheckRegOperand<0, Q10>, - CheckRegOperand<0, Q11>, - CheckRegOperand<0, Q12>, - CheckRegOperand<0, Q13>, - CheckRegOperand<0, Q14>, - CheckRegOperand<0, Q15>, - CheckRegOperand<0, Q16>, - CheckRegOperand<0, Q17>, - CheckRegOperand<0, Q18>, - CheckRegOperand<0, Q19>, - CheckRegOperand<0, Q20>, - CheckRegOperand<0, Q21>, - CheckRegOperand<0, Q22>, - CheckRegOperand<0, Q23>, - CheckRegOperand<0, Q24>, - CheckRegOperand<0, Q25>, - CheckRegOperand<0, Q26>, - CheckRegOperand<0, Q27>, - CheckRegOperand<0, Q28>, - CheckRegOperand<0, Q29>, - CheckRegOperand<0, Q30>, - CheckRegOperand<0, Q31>]>]>; +def CheckQForm : CheckFunctionPredicateWithTII< + "AArch64_MC::isQForm", + "AArch64InstrInfo::isQForm" +>; // Identify arithmetic instructions with extend. 
def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx,
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 77fca22a5f55..6ecfc97a4273 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -25,7 +25,8 @@ def TSV110Model : SchedMachineModel {
   let CompleteModel = 1;

   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
-                                                    PAUnsupported.F);
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
 }

 // Define each kind of processor resource and number available on the TSV110,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 893269c1a7ef..677797a6797b 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -91,7 +91,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(

 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
@@ -100,38 +100,6 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
                     Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
   }
-
-  // Check to see if there is a specialized entry-point for memory zeroing.
-  ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
-  ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
-  const char *bzeroName =
-      (V && V->isZero())
-          ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
-          : nullptr;
-  // For small size (< 256), it is not beneficial to use bzero
-  // instead of memset.
-  if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {
-    const AArch64TargetLowering &TLI = *STI.getTargetLowering();
-
-    EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
-    Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext());
-    TargetLowering::ArgListTy Args;
-    TargetLowering::ArgListEntry Entry;
-    Entry.Node = Dst;
-    Entry.Ty = IntPtrTy;
-    Args.push_back(Entry);
-    Entry.Node = Size;
-    Args.push_back(Entry);
-    TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(dl)
-        .setChain(Chain)
-        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                      DAG.getExternalSymbol(bzeroName, IntPtr),
-                      std::move(Args))
-        .setDiscardResult();
-    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-    return CallResult.second;
-  }
   return SDValue();
 }

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 47fe3bf7dcf5..73f93724d6fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -34,7 +34,7 @@ public:
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
   SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl,
                                    SDValue Chain,
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 566c7a16db23..24816bc9e9bd 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -42,20 +42,23 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemoryTaggingSupport.h"
 #include <cassert>
 #include <iterator>
+#include <memory>
 #include <utility>

 using namespace llvm;
@@ -63,12 +66,12 @@ using namespace llvm;
 #define DEBUG_TYPE "aarch64-stack-tagging"

 static cl::opt<bool> ClMergeInit(
-    "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+    "stack-tagging-merge-init", cl::Hidden, cl::init(true),
     cl::desc("merge stack variable initializers with tagging when possible"));

 static cl::opt<bool>
     ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden,
-                     cl::init(true), cl::ZeroOrMore,
+                     cl::init(true),
                      cl::desc("Use Stack Safety analysis results"));

 static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
@@ -78,6 +81,12 @@ static cl::opt<unsigned>
     ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272),
                          cl::Hidden);

+static cl::opt<size_t> ClMaxLifetimes(
+    "stack-tagging-max-lifetimes-for-alloca", cl::Hidden, cl::init(3),
+    cl::ReallyHidden,
+    cl::desc("How many lifetime ends to handle for a single alloca."),
+    cl::Optional);
+
 static const Align kTagGranuleSize = Align(16);

 namespace {
@@ -283,15 +292,6 @@ public:
 };

 class AArch64StackTagging : public FunctionPass {
-  struct AllocaInfo {
-    AllocaInst *AI;
-    TrackingVH<Instruction> OldAI; // Track through RAUW to replace debug uses.
-    SmallVector<IntrinsicInst *, 2> LifetimeStart;
-    SmallVector<IntrinsicInst *, 2> LifetimeEnd;
-    SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
-    int Tag; // -1 for non-tagged allocations
-  };
-
   const bool MergeInit;
   const bool UseStackSafety;

@@ -307,7 +307,6 @@ public:
   }

   bool isInterestingAlloca(const AllocaInst &AI);
-  void alignAndPadAlloca(AllocaInfo &Info);

   void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr,
                  uint64_t Size);
@@ -316,9 +315,9 @@ public:
   Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr,
                                    uint64_t Size, InitializerBuilder &IB);

-  Instruction *
-  insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
-                          const DominatorTree *DT);
+  Instruction *insertBaseTaggedPointer(
+      const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas,
+      const DominatorTree *DT);
   bool runOnFunction(Function &F) override;

   StringRef getPassName() const override { return "AArch64 Stack Tagging"; }
@@ -419,7 +418,7 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
   bool IsInteresting =
       AI.getAllocatedType()->isSized() && AI.isStaticAlloca() &&
       // alloca() may be called with 0 size, ignore it.
-      AI.getAllocationSizeInBits(*DL).getValue() > 0 &&
+      *AI.getAllocationSizeInBits(*DL) > 0 &&
       // inalloca allocas are not treated as static, and we don't want
       // dynamic alloca instrumentation for them as well.
       !AI.isUsedWithInAlloca() &&
@@ -460,15 +459,13 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
 }

 Instruction *AArch64StackTagging::insertBaseTaggedPointer(
-    const MapVector<AllocaInst *, AllocaInfo> &Allocas,
+    const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument,
     const DominatorTree *DT) {
   BasicBlock *PrologueBB = nullptr;
   // Try sinking IRG as deep as possible to avoid hurting shrink wrap.
-  for (auto &I : Allocas) {
-    const AllocaInfo &Info = I.second;
+  for (auto &I : AllocasToInstrument) {
+    const memtag::AllocaInfo &Info = I.second;
     AllocaInst *AI = Info.AI;
-    if (Info.Tag < 0)
-      continue;
     if (!PrologueBB) {
       PrologueBB = AI->getParent();
       continue;
     }
@@ -486,40 +483,6 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
   return Base;
 }

-void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
-  const Align NewAlignment =
-      max(MaybeAlign(Info.AI->getAlign()), kTagGranuleSize);
-  Info.AI->setAlignment(NewAlignment);
-
-  uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
-  uint64_t AlignedSize = alignTo(Size, kTagGranuleSize);
-  if (Size == AlignedSize)
-    return;
-
-  // Add padding to the alloca.
-  Type *AllocatedType =
-      Info.AI->isArrayAllocation()
-          ?
ArrayType::get( - Info.AI->getAllocatedType(), - cast(Info.AI->getArraySize())->getZExtValue()) - : Info.AI->getAllocatedType(); - Type *PaddingType = - ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); - Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType); - auto *NewAI = new AllocaInst( - TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); - NewAI->takeName(Info.AI); - NewAI->setAlignment(Info.AI->getAlign()); - NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); - NewAI->setSwiftError(Info.AI->isSwiftError()); - NewAI->copyMetadata(*Info.AI); - - auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI); - Info.AI->replaceAllUsesWith(NewPtr); - Info.AI->eraseFromParent(); - Info.AI = NewAI; -} - // FIXME: check for MTE extension bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) @@ -532,76 +495,21 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (MergeInit) AA = &getAnalysis().getAAResults(); - MapVector Allocas; // need stable iteration order - SmallVector RetVec; - SmallVector UnrecognizedLifetimes; - - for (auto &BB : *F) { - for (Instruction &I : BB) { - if (auto *AI = dyn_cast(&I)) { - Allocas[AI].AI = AI; - Allocas[AI].OldAI = AI; - continue; - } - - if (auto *DVI = dyn_cast(&I)) { - for (Value *V : DVI->location_ops()) - if (auto *AI = dyn_cast_or_null(V)) - if (Allocas[AI].DbgVariableIntrinsics.empty() || - Allocas[AI].DbgVariableIntrinsics.back() != DVI) - Allocas[AI].DbgVariableIntrinsics.push_back(DVI); - continue; - } - - auto *II = dyn_cast(&I); - if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end)) { - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); - if (!AI) { - UnrecognizedLifetimes.push_back(&I); - continue; - } - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - Allocas[AI].LifetimeStart.push_back(II); - else - Allocas[AI].LifetimeEnd.push_back(II); - } - - if (isa(&I)) - RetVec.push_back(&I); - } - } + memtag::StackInfoBuilder SIB( + [this](const AllocaInst &AI) { return isInterestingAlloca(AI); }); + for (Instruction &I : instructions(F)) + SIB.visit(I); + memtag::StackInfo &SInfo = SIB.get(); - if (Allocas.empty()) + if (SInfo.AllocasToInstrument.empty()) return false; - int NextTag = 0; - int NumInterestingAllocas = 0; - for (auto &I : Allocas) { - AllocaInfo &Info = I.second; - assert(Info.AI); - - if (!isInterestingAlloca(*Info.AI)) { - Info.Tag = -1; - continue; - } - - alignAndPadAlloca(Info); - NumInterestingAllocas++; - Info.Tag = NextTag; - NextTag = (NextTag + 1) % 16; - } - - if (NumInterestingAllocas == 0) - return true; - std::unique_ptr DeleteDT; DominatorTree *DT = nullptr; if (auto *P = getAnalysisIfAvailable()) DT = &P->getDomTree(); - if (DT == nullptr && (NumInterestingAllocas > 1 || - !F->hasFnAttribute(Attribute::OptimizeNone))) { + if (DT == nullptr) { DeleteDT = std::make_unique(*F); DT = DeleteDT.get(); } @@ -611,38 +519,57 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (auto *P = getAnalysisIfAvailable()) PDT = &P->getPostDomTree(); - if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + if (PDT == nullptr) { DeletePDT = std::make_unique(*F); PDT = DeletePDT.get(); } + std::unique_ptr DeleteLI; + LoopInfo *LI = nullptr; + if (auto *LIWP = getAnalysisIfAvailable()) { + LI = &LIWP->getLoopInfo(); + } else { + DeleteLI = std::make_unique(*DT); + LI = DeleteLI.get(); + } + SetTagFunc = 
Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - Instruction *Base = insertBaseTaggedPointer(Allocas, DT); + Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT); - for (auto &I : Allocas) { - const AllocaInfo &Info = I.second; + int NextTag = 0; + for (auto &I : SInfo.AllocasToInstrument) { + memtag::AllocaInfo &Info = I.second; + assert(Info.AI && isInterestingAlloca(*Info.AI)); + TrackingVH OldAI = Info.AI; + memtag::alignAndPadAlloca(Info, kTagGranuleSize); AllocaInst *AI = Info.AI; - if (Info.Tag < 0) - continue; - + int Tag = NextTag; + NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). IRBuilder<> IRB(Info.AI->getNextNode()); Function *TagP = Intrinsic::getDeclaration( F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, - ConstantInt::get(IRB.getInt64Ty(), Info.Tag)}); + ConstantInt::get(IRB.getInt64Ty(), Tag)}); if (Info.AI->hasName()) TagPCall->setName(Info.AI->getName() + ".tag"); Info.AI->replaceAllUsesWith(TagPCall); TagPCall->setOperand(0, Info.AI); - if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && - Info.LifetimeEnd.size() == 1) { + // Calls to functions that may return twice (e.g. setjmp) confuse the + // postdominator analysis, and will leave us to keep memory tagged after + // function return. Work around this by always untagging at every return + // statement if return_twice functions are called. + bool StandardLifetime = + SInfo.UnrecognizedLifetimes.empty() && + memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI, + ClMaxLifetimes) && + !SInfo.CallsReturnTwice; + if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = cast(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); @@ -650,14 +577,16 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); }; if (!DT || !PDT || - !forAllReachableExits(*DT, *PDT, Start, Info.LifetimeEnd, RetVec, - TagEnd)) - End->eraseFromParent(); + !memtag::forAllReachableExits(*DT, *PDT, *LI, Start, Info.LifetimeEnd, + SInfo.RetVec, TagEnd)) { + for (auto *End : Info.LifetimeEnd) + End->eraseFromParent(); + } } else { - uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; + uint64_t Size = *Info.AI->getAllocationSizeInBits(*DL) / 8; Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); tagAlloca(AI, &*IRB.GetInsertPoint(), Ptr, Size); - for (auto &RI : RetVec) { + for (auto &RI : SInfo.RetVec) { untagAlloca(AI, RI, Size); } // We may have inserted tag/untag outside of any lifetime interval. @@ -670,12 +599,12 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { // Fixup debug intrinsics to point to the new alloca. for (auto DVI : Info.DbgVariableIntrinsics) - DVI->replaceVariableLocationOp(Info.OldAI, Info.AI); + DVI->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime - // instrinsics have to go. - for (auto &I : UnrecognizedLifetimes) + // intrinsics have to go. 
+ for (auto &I : SInfo.UnrecognizedLifetimes) I->eraseFromParent(); return true; diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index cae6d65bed2d..7e91dc1b6385 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -50,7 +50,6 @@ cl::opt ClUncheckedLdSt( static cl::opt ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true), - cl::ZeroOrMore, cl::desc("Apply first slot optimization for stack tagging " "(eliminate ADDG Rt, Rn, 0, 0).")); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8a7e20237271..15005304383d 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -21,6 +21,7 @@ #include "GISel/AArch64RegisterBankInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/AArch64TargetParser.h" @@ -51,6 +52,16 @@ static cl::opt static cl::opt UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen.")); +static cl::opt OverrideVectorInsertExtractBaseCost( + "aarch64-insert-extract-base-cost", + cl::desc("Base cost of vector insert/extract element"), cl::Hidden); + +unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { + if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) + return OverrideVectorInsertExtractBaseCost; + return VectorInsertExtractBaseCost; +} + AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( StringRef FS, StringRef CPUString, StringRef TuneCPUString) { // Determine default and user-specified characteristics @@ -78,14 +89,17 @@ void AArch64Subtarget::initializeProperties() { CacheLineSize = 64; break; case CortexA35: - break; case CortexA53: case CortexA55: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; break; case CortexA57: MaxInterleaveFactor = 4; PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; break; case CortexA65: PrefFunctionLogAlignment = 3; @@ -93,6 +107,10 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: case CortexA73: case CortexA75: + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; + break; case CortexA76: case CortexA77: case CortexA78: @@ -101,12 +119,21 @@ void AArch64Subtarget::initializeProperties() { case CortexX1: case CortexX1C: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; break; case CortexA510: + PrefFunctionLogAlignment = 4; + VScaleForTuning = 1; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; + break; case CortexA710: case CortexX2: PrefFunctionLogAlignment = 4; VScaleForTuning = 1; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; break; case A64FX: CacheLineSize = 256; @@ -221,6 +248,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. 
MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } @@ -352,6 +385,8 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { if (!UseAddressTopByteIgnored) return false; + if (TargetTriple.isDriverKit()) + return true; if (TargetTriple.isiOS()) { return TargetTriple.getiOSVersion() >= VersionTuple(8); } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 7b2bbad30f85..c92e3e44de31 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -22,7 +22,7 @@ #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include @@ -40,6 +40,7 @@ public: enum ARMProcFamilyEnum : uint8_t { Others, A64FX, + Ampere1, AppleA7, AppleA10, AppleA11, @@ -87,191 +88,14 @@ protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_0aOps = false; - bool HasV8_1aOps = false; - bool HasV8_2aOps = false; - bool HasV8_3aOps = false; - bool HasV8_4aOps = false; - bool HasV8_5aOps = false; - bool HasV8_6aOps = false; - bool HasV8_7aOps = false; - bool HasV8_8aOps = false; - bool HasV9_0aOps = false; - bool HasV9_1aOps = false; - bool HasV9_2aOps = false; - bool HasV9_3aOps = false; - bool HasV8_0rOps = false; - - bool HasCONTEXTIDREL2 = false; - bool HasEL2VMSA = false; - bool HasEL3 = false; - bool HasFPARMv8 = false; - bool HasNEON = false; - bool HasCrypto = false; - bool HasDotProd = false; - bool HasCRC = false; - bool HasLSE = false; - bool HasLSE2 = false; - bool HasRAS = false; - bool HasRDM = false; - bool HasPerfMon = false; - bool HasFullFP16 = false; - bool HasFP16FML = false; - bool HasSPE = false; - - bool FixCortexA53_835769 = false; - - // ARMv8.1 extensions - bool HasVH = false; - bool HasPAN = false; - bool HasLOR = false; - - // ARMv8.2 extensions - bool HasPsUAO = false; - bool HasPAN_RWV = false; - bool HasCCPP = false; - - // SVE extensions - bool HasSVE = false; - bool UseExperimentalZeroingPseudos = false; - bool UseScalarIncVL = false; - - // Armv8.2 Crypto extensions - bool HasSM4 = false; - bool HasSHA3 = false; - bool HasSHA2 = false; - bool HasAES = false; - - // ARMv8.3 extensions - bool HasPAuth = false; - bool HasJS = false; - bool HasCCIDX = false; - bool HasComplxNum = false; - - // ARMv8.4 extensions - bool HasNV = false; - bool HasMPAM = false; - bool HasDIT = false; - bool HasTRACEV8_4 = false; - bool HasAM = false; - bool HasSEL2 = false; - bool HasTLB_RMI = false; - bool HasFlagM = false; - bool HasRCPC_IMMO = false; - - bool HasLSLFast = false; - bool HasRCPC = false; - bool HasAggressiveFMA = false; - - // Armv8.5-A Extensions - bool HasAlternativeNZCV = false; - bool HasFRInt3264 = false; - bool HasSpecRestrict = false; - bool HasSSBS = false; - bool HasSB = false; - bool HasPredRes = false; - bool HasCCDP = false; - bool HasBTI = false; - bool HasRandGen = false; - bool HasMTE = false; - bool HasTME = false; - - // Armv8.6-A Extensions - bool HasBF16 = false; - bool HasMatMulInt8 = false; - bool HasMatMulFP32 = false; - bool HasMatMulFP64 = false; - bool HasAMVS = false; - bool 
HasFineGrainedTraps = false; - bool HasEnhancedCounterVirtualization = false; - - // Armv8.7-A Extensions - bool HasXS = false; - bool HasWFxT = false; - bool HasHCX = false; - bool HasLS64 = false; - - // Armv8.8-A Extensions - bool HasHBC = false; - bool HasMOPS = false; - - // Arm SVE2 extensions - bool HasSVE2 = false; - bool HasSVE2AES = false; - bool HasSVE2SM4 = false; - bool HasSVE2SHA3 = false; - bool HasSVE2BitPerm = false; - - // Armv9-A Extensions - bool HasRME = false; - - // Arm Scalable Matrix Extension (SME) - bool HasSME = false; - bool HasSMEF64 = false; - bool HasSMEI64 = false; - bool HasStreamingSVE = false; - - // AppleA7 system register. - bool HasAppleA7SysReg = false; - - // Future architecture extensions. - bool HasETE = false; - bool HasTRBE = false; - bool HasBRBE = false; - bool HasSPE_EEF = false; - - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove = false; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing = false; - bool HasZeroCycleZeroingGP = false; - bool HasZeroCycleZeroingFPWorkaround = false; - - // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". - // as movi is more efficient across all cores. Newer cores can eliminate - // fmovs early and there is no difference with movi, but this not true for - // all implementations. - bool HasZeroCycleZeroingFP = true; - - // StrictAlign - Disallow unaligned memory accesses. - bool StrictAlign = false; - - // NegativeImmediates - transform instructions with negative immediates - bool NegativeImmediates = true; - // Enable 64-bit vectorization in SLP. unsigned MinVectorRegisterBitWidth = 64; - bool OutlineAtomics = false; - bool PredictableSelectIsExpensive = false; - bool BalanceFPOps = false; - bool CustomAsCheapAsMove = false; - bool ExynosAsCheapAsMove = false; - bool UsePostRAScheduler = false; - bool Misaligned128StoreIsSlow = false; - bool Paired128IsSlow = false; - bool STRQroIsSlow = false; - bool UseAlternateSExtLoadCVTF32Pattern = false; - bool HasArithmeticBccFusion = false; - bool HasArithmeticCbzFusion = false; - bool HasCmpBccFusion = false; - bool HasFuseAddress = false; - bool HasFuseAES = false; - bool HasFuseArithmeticLogic = false; - bool HasFuseCCSelect = false; - bool HasFuseCryptoEOR = false; - bool HasFuseLiterals = false; - bool DisableLatencySchedHeuristic = false; - bool UseRSqrt = false; - bool Force32BitJumpTables = false; - bool UseEL1ForTP = false; - bool UseEL2ForTP = false; - bool UseEL3ForTP = false; - bool AllowTaggedGlobals = false; - bool HardenSlsRetBr = false; - bool HardenSlsBlr = false; - bool HardenSlsNoComdat = false; +// Bool members corresponding to the SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "AArch64GenSubtargetInfo.inc" + uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -282,7 +106,6 @@ protected: unsigned PrefLoopLogAlignment = 0; unsigned MaxBytesForLoopAlignment = 0; unsigned MaxJumpTableSize = 0; - unsigned WideningBaseCost = 0; // ReserveXRegister[i] - X#i is not available as a general purpose register. 
BitVector ReserveXRegister; @@ -331,6 +154,11 @@ public: unsigned MinSVEVectorSizeInBitsOverride = 0, unsigned MaxSVEVectorSizeInBitsOverride = 0); +// Getters for SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const { return ATTRIBUTE; } +#include "AArch64GenSubtargetInfo.inc" + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -351,9 +179,7 @@ public: const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } - bool enablePostRAScheduler() const override { - return UsePostRAScheduler; - } + bool enablePostRAScheduler() const override { return usePostRAScheduler(); } /// Returns ARM processor family. /// Avoid this function! CPU specifics should be kept local to this class @@ -363,30 +189,6 @@ public: return ARMProcFamily; } - bool hasV8_0aOps() const { return HasV8_0aOps; } - bool hasV8_1aOps() const { return HasV8_1aOps; } - bool hasV8_2aOps() const { return HasV8_2aOps; } - bool hasV8_3aOps() const { return HasV8_3aOps; } - bool hasV8_4aOps() const { return HasV8_4aOps; } - bool hasV8_5aOps() const { return HasV8_5aOps; } - bool hasV9_0aOps() const { return HasV9_0aOps; } - bool hasV9_1aOps() const { return HasV9_1aOps; } - bool hasV9_2aOps() const { return HasV9_2aOps; } - bool hasV9_3aOps() const { return HasV9_3aOps; } - bool hasV8_0rOps() const { return HasV8_0rOps; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; } - - bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; } - - bool hasZeroCycleZeroingFPWorkaround() const { - return HasZeroCycleZeroingFPWorkaround; - } - - bool requiresStrictAlign() const { return StrictAlign; } - bool isXRaySupported() const override { return true; } unsigned getMinVectorRegisterBitWidth() const { @@ -399,63 +201,16 @@ public: return CustomCallSavedXRegs[i]; } bool hasCustomCallingConv() const { return CustomCallSavedXRegs.any(); } - bool hasFPARMv8() const { return HasFPARMv8; } - bool hasNEON() const { return HasNEON; } - bool hasCrypto() const { return HasCrypto; } - bool hasDotProd() const { return HasDotProd; } - bool hasCRC() const { return HasCRC; } - bool hasLSE() const { return HasLSE; } - bool hasLSE2() const { return HasLSE2; } - bool hasRAS() const { return HasRAS; } - bool hasRDM() const { return HasRDM; } - bool hasSM4() const { return HasSM4; } - bool hasSHA3() const { return HasSHA3; } - bool hasSHA2() const { return HasSHA2; } - bool hasAES() const { return HasAES; } - bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; } - bool balanceFPOps() const { return BalanceFPOps; } - bool predictableSelectIsExpensive() const { - return PredictableSelectIsExpensive; - } - bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } - bool hasExynosCheapAsMoveHandling() const { return ExynosAsCheapAsMove; } - bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } - bool isPaired128Slow() const { return Paired128IsSlow; } - bool isSTRQroSlow() const { return STRQroIsSlow; } - bool useAlternateSExtLoadCVTF32Pattern() const { - return UseAlternateSExtLoadCVTF32Pattern; - } - bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } - bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } - bool hasCmpBccFusion() const { return HasCmpBccFusion; 
} - bool hasFuseAddress() const { return HasFuseAddress; } - bool hasFuseAES() const { return HasFuseAES; } - bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; } - bool hasFuseCCSelect() const { return HasFuseCCSelect; } - bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; } - bool hasFuseLiterals() const { return HasFuseLiterals; } /// Return true if the CPU supports any kind of instruction fusion. bool hasFusion() const { return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || - hasFuseAES() || hasFuseArithmeticLogic() || - hasFuseCCSelect() || hasFuseLiterals(); + hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() || + hasFuseAdrpAdd() || hasFuseLiterals(); } - bool hardenSlsRetBr() const { return HardenSlsRetBr; } - bool hardenSlsBlr() const { return HardenSlsBlr; } - bool hardenSlsNoComdat() const { return HardenSlsNoComdat; } - - bool useEL1ForTP() const { return UseEL1ForTP; } - bool useEL2ForTP() const { return UseEL2ForTP; } - bool useEL3ForTP() const { return UseEL3ForTP; } - - bool useRSqrt() const { return UseRSqrt; } - bool force32BitJumpTables() const { return Force32BitJumpTables; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } - unsigned getVectorInsertExtractBaseCost() const { - return VectorInsertExtractBaseCost; - } + unsigned getVectorInsertExtractBaseCost() const; unsigned getCacheLineSize() const override { return CacheLineSize; } unsigned getPrefetchDistance() const override { return PrefetchDistance; } unsigned getMinPrefetchStride(unsigned NumMemAccesses, @@ -478,60 +233,10 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } - unsigned getWideningBaseCost() const { return WideningBaseCost; } - - bool useExperimentalZeroingPseudos() const { - return UseExperimentalZeroingPseudos; - } - - bool useScalarIncVL() const { return UseScalarIncVL; } - /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. 
bool supportsAddressTopByteIgnored() const; - bool hasPerfMon() const { return HasPerfMon; } - bool hasFullFP16() const { return HasFullFP16; } - bool hasFP16FML() const { return HasFP16FML; } - bool hasSPE() const { return HasSPE; } - bool hasLSLFast() const { return HasLSLFast; } - bool hasSVE() const { return HasSVE; } - bool hasSVE2() const { return HasSVE2; } - bool hasRCPC() const { return HasRCPC; } - bool hasAggressiveFMA() const { return HasAggressiveFMA; } - bool hasAlternativeNZCV() const { return HasAlternativeNZCV; } - bool hasFRInt3264() const { return HasFRInt3264; } - bool hasSpecRestrict() const { return HasSpecRestrict; } - bool hasSSBS() const { return HasSSBS; } - bool hasSB() const { return HasSB; } - bool hasPredRes() const { return HasPredRes; } - bool hasCCDP() const { return HasCCDP; } - bool hasBTI() const { return HasBTI; } - bool hasRandGen() const { return HasRandGen; } - bool hasMTE() const { return HasMTE; } - bool hasTME() const { return HasTME; } - // Arm SVE2 extensions - bool hasSVE2AES() const { return HasSVE2AES; } - bool hasSVE2SM4() const { return HasSVE2SM4; } - bool hasSVE2SHA3() const { return HasSVE2SHA3; } - bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } - bool hasMatMulInt8() const { return HasMatMulInt8; } - bool hasMatMulFP32() const { return HasMatMulFP32; } - bool hasMatMulFP64() const { return HasMatMulFP64; } - - // Armv8.6-A Extensions - bool hasBF16() const { return HasBF16; } - bool hasFineGrainedTraps() const { return HasFineGrainedTraps; } - bool hasEnhancedCounterVirtualization() const { - return HasEnhancedCounterVirtualization; - } - - // Arm Scalable Matrix Extension (SME) - bool hasSME() const { return HasSME; } - bool hasSMEF64() const { return HasSMEF64; } - bool hasSMEI64() const { return HasSMEI64; } - bool hasStreamingSVE() const { return HasStreamingSVE; } - bool isLittleEndian() const { return IsLittle; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } @@ -552,42 +257,6 @@ public: bool useAA() const override; - bool outlineAtomics() const { return OutlineAtomics; } - - bool hasVH() const { return HasVH; } - bool hasPAN() const { return HasPAN; } - bool hasLOR() const { return HasLOR; } - - bool hasPsUAO() const { return HasPsUAO; } - bool hasPAN_RWV() const { return HasPAN_RWV; } - bool hasCCPP() const { return HasCCPP; } - - bool hasPAuth() const { return HasPAuth; } - bool hasJS() const { return HasJS; } - bool hasCCIDX() const { return HasCCIDX; } - bool hasComplxNum() const { return HasComplxNum; } - - bool hasNV() const { return HasNV; } - bool hasMPAM() const { return HasMPAM; } - bool hasDIT() const { return HasDIT; } - bool hasTRACEV8_4() const { return HasTRACEV8_4; } - bool hasAM() const { return HasAM; } - bool hasAMVS() const { return HasAMVS; } - bool hasXS() const { return HasXS; } - bool hasWFxT() const { return HasWFxT; } - bool hasHCX() const { return HasHCX; } - bool hasLS64() const { return HasLS64; } - bool hasSEL2() const { return HasSEL2; } - bool hasTLB_RMI() const { return HasTLB_RMI; } - bool hasFlagM() const { return HasFlagM; } - bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } - bool hasEL2VMSA() const { return HasEL2VMSA; } - bool hasEL3() const { return HasEL3; } - bool hasHBC() const { return HasHBC; } - bool hasMOPS() const { return HasMOPS; } - - bool fixCortexA53_835769() const { return FixCortexA53_835769; } - bool addrSinkUsingGEPs() const override { // Keeping GEPs inbounds is important for exploiting AArch64 // addressing-modes in ILP32 mode. 
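// Illustrative sketch, not part of this patch: how the GET_SUBTARGETINFO_MACRO
// X-macro replaces the deleted hand-written members and getters above. The
// feature name below is only an example; the real entries are emitted by
// TableGen into AArch64GenSubtargetInfo.inc, one invocation per
// SubtargetFeature, along the lines of
//
//   GET_SUBTARGETINFO_MACRO(HasNEON, false, hasNEON)
//
// so that the two #include sites expand to, respectively,
//
//   bool HasNEON = false;                      // member definition
//   bool hasNEON() const { return HasNEON; }   // getter
//
// which is why the long runs of "bool HasFoo = false;" members and
// "bool hasFoo() const" getters can be deleted wholesale in this diff.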
@@ -623,8 +292,6 @@ public: bool enableEarlyIfConversion() const override; - bool enableAdvancedRASplitCost() const override { return false; } - std::unique_ptr getCustomPBQPConstraints() const override; bool isCallingConvWin64(CallingConv::ID CC) const { diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index cce5813fe6e9..f3788175c48d 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,23 +18,23 @@ include "llvm/TableGen/SearchableTable.td" //===----------------------------------------------------------------------===// def HasCCPP : Predicate<"Subtarget->hasCCPP()">, - AssemblerPredicate<(all_of FeatureCCPP), "ccpp">; + AssemblerPredicateWithAll<(all_of FeatureCCPP), "ccpp">; def HasPAN : Predicate<"Subtarget->hasPAN()">, - AssemblerPredicate<(all_of FeaturePAN), + AssemblerPredicateWithAll<(all_of FeaturePAN), "ARM v8.1 Privileged Access-Never extension">; def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, - AssemblerPredicate<(all_of FeaturePsUAO), + AssemblerPredicateWithAll<(all_of FeaturePsUAO), "ARM v8.2 UAO PState extension (psuao)">; def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, - AssemblerPredicate<(all_of FeaturePAN_RWV), + AssemblerPredicateWithAll<(all_of FeaturePAN_RWV), "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; def HasCONTEXTIDREL2 : Predicate<"Subtarget->hasCONTEXTIDREL2()">, - AssemblerPredicate<(all_of FeatureCONTEXTIDREL2), + AssemblerPredicateWithAll<(all_of FeatureCONTEXTIDREL2), "Target contains CONTEXTIDR_EL2 RW operand">; //===----------------------------------------------------------------------===// @@ -631,6 +631,7 @@ def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>; def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>; def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; +def : ROSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; @@ -977,7 +978,6 @@ def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; -def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 4af28fc070dd..3f9795f5198b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -12,6 +12,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" @@ -21,7 +22,9 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/CFIFixup.h" #include "llvm/CodeGen/CSEConfigBase.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include 
"llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -31,6 +34,7 @@ #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -59,6 +63,11 @@ static cl::opt cl::desc("Enable the conditional branch tuning pass"), cl::init(true), cl::Hidden); +static cl::opt EnableAArch64CopyPropagation( + "aarch64-enable-copy-propagation", + cl::desc("Enable the copy propagation with AArch64 copy instr"), + cl::init(true), cl::Hidden); + static cl::opt EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -265,7 +274,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, // On ELF platforms the default static relocation model has a smart enough // linker to cope with referencing external symbols defined in a shared // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC) + if (!RM || *RM == Reloc::DynamicNoPIC) return Reloc::Static; return *RM; } @@ -354,6 +363,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports the debug entry values. setSupportsDebugEntryValues(true); + + // AArch64 supports fixing up the DWARF unwind information. + if (!getMCAsmInfo()->usesWindowsCFI()) + setCFIFixup(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; @@ -379,7 +392,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { if (VScaleRangeAttr.isValid()) { Optional VScaleMax = VScaleRangeAttr.getVScaleRangeMax(); MinSVEVectorSize = VScaleRangeAttr.getVScaleRangeMin() * 128; - MaxSVEVectorSize = VScaleMax ? VScaleMax.getValue() * 128 : 0; + MaxSVEVectorSize = VScaleMax ? *VScaleMax * 128 : 0; } else { MinSVEVectorSize = SVEVectorBitsMinOpt; MaxSVEVectorSize = SVEVectorBitsMaxOpt; @@ -468,15 +481,17 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget(); + ScheduleDAGMI *DAG = + new ScheduleDAGMI(C, std::make_unique(C), + /* RemoveKillFlags=*/true); if (ST.hasFusion()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). - ScheduleDAGMI *DAG = createGenericSchedPostRA(C); DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } - return nullptr; + return DAG; } void addIRPasses() override; @@ -504,7 +519,7 @@ public: } // end anonymous namespace TargetTransformInfo -AArch64TargetMachine::getTargetTransformInfo(const Function &F) { +AArch64TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AArch64TTIImpl(this, F)); } @@ -531,6 +546,7 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -574,6 +590,9 @@ void AArch64PassConfig::addIRPasses() { // Add Control Flow Guard checks. 
if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); + + if (TM->Options.JMCInstrument) + addPass(createJMCInstrumenterPass()); } // Pass Pipeline Configuration @@ -759,6 +778,10 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() >= CodeGenOpt::Aggressive && + EnableAArch64CopyPropagation) + addPass(createMachineCopyPropagationPass(true)); + addPass(createAArch64A53Fix835769()); if (EnableBranchTargets) @@ -804,8 +827,7 @@ AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { bool AArch64TargetMachine::parseMachineFunctionInfo( const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const { - const auto &YamlMFI = - reinterpret_cast(MFI); + const auto &YamlMFI = static_cast(MFI); MachineFunction &MF = PFS.MF; MF.getInfo()->initializeBaseYamlFields(YamlMFI); return false; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 7d314bce99b1..beb109502ff9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -41,7 +41,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile* getObjFileLowering() const override { return TLOF.get(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b2ffdf949d8b..41c7a8c5042f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -8,6 +8,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" +#include "AArch64PerfectShuffle.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" @@ -15,8 +16,8 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" @@ -50,6 +51,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const { + assert(K != TargetTransformInfo::RGK_Scalar); + return K == TargetTransformInfo::RGK_FixedWidthVector; +} + /// Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. 
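// Illustrative sketch, not part of this patch: why a fraction of a larger
// immediate can legitimately cost zero. AArch64 materializes a 64-bit
// constant with one MOVZ plus up to three MOVKs, one per non-zero 16-bit
// chunk, so a simplified per-fragment model (hypothetical helper, assumed
// behavior only) is:
//
//   unsigned approxImmCost(uint64_t Val) {
//     unsigned Cost = 0;
//     for (unsigned Shift = 0; Shift < 64; Shift += 16)
//       if ((Val >> Shift) & 0xFFFF) // this chunk needs its own MOVZ/MOVK
//         ++Cost;
//     return Cost;                   // an all-zero fragment costs 0
//   }
//
// The in-tree helper instead asks AArch64_IMM::expandMOVImm for the actual
// instruction sequence and counts its length.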
@@ -370,6 +377,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return Entry->Cost; break; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (ICA.getArgTypes().empty()) + break; + bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; + auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + EVT MTy = TLI->getValueType(DL, RetTy); + // Check for the legal types, which are where the size of the input and the + // output are the same, or we are using cvt f64->i32 or f32->i64. + if ((LT.second == MVT::f32 || LT.second == MVT::f64 || + LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || + LT.second == MVT::v2f64) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || + (LT.second == MVT::f64 && MTy == MVT::i32) || + (LT.second == MVT::f32 && MTy == MVT::i64))) + return LT.first; + // Similarly for fp16 sizes + if (ST->hasFullFP16() && + ((LT.second == MVT::f16 && MTy == MVT::i32) || + ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) + return LT.first; + + // Otherwise we use a legal convert followed by a min+max + if ((LT.second.getScalarType() == MVT::f32 || + LT.second.getScalarType() == MVT::f64 || + (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && + LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { + Type *LegalTy = + Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); + if (LT.second.isVector()) + LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); + InstructionCost Cost = 1; + IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(IsSigned ? 
Intrinsic::smax : Intrinsic::umax, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + return LT.first * Cost; + } + break; + } default: break; } @@ -525,6 +575,14 @@ static Optional instCombineConvertFromSVBool(InstCombiner &IC, return IC.replaceInstUsesWith(II, EarliestReplacement); } +static Optional instCombineSVESel(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(&II); + auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), + II.getOperand(2)); + return IC.replaceInstUsesWith(II, Select); +} + static Optional instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II) { IntrinsicInst *Pg = dyn_cast(II.getArgOperand(1)); @@ -594,8 +652,7 @@ static Optional instCombineSVECmpNE(InstCombiner &IC, return None; auto *VecIns = dyn_cast(DupQLane->getArgOperand(0)); - if (!VecIns || - VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) + if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) return None; // Where the vector insert is a fixed constant vector insert into undef at @@ -862,12 +919,14 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { if (isAllActivePredicate(Pred)) { LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); + Load->copyMetadata(II); return IC.replaceInstUsesWith(II, Load); } CallInst *MaskedLoad = Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), Pred, ConstantAggregateZero::get(VecTy)); + MaskedLoad->copyMetadata(II); return IC.replaceInstUsesWith(II, MaskedLoad); } @@ -883,12 +942,14 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); if (isAllActivePredicate(Pred)) { - Builder.CreateStore(VecOp, VecPtr); + StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); + Store->copyMetadata(II); return IC.eraseInstFromFunction(II); } - Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL), - Pred); + CallInst *MaskedStore = Builder.CreateMaskedStore( + VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); + MaskedStore->copyMetadata(II); return IC.eraseInstFromFunction(II); } @@ -1069,7 +1130,6 @@ static Optional instCombineLD1GatherIndex(InstCombiner &IC, Value *BasePtr = II.getOperand(1); Value *Index = II.getOperand(2); Type *Ty = II.getType(); - Type *BasePtrTy = BasePtr->getType(); Value *PassThru = ConstantAggregateZero::get(Ty); // Contiguous gather => masked load. @@ -1085,8 +1145,8 @@ static Optional instCombineLD1GatherIndex(InstCombiner &IC, BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); Type *VecPtrTy = PointerType::getUnqual(Ty); - Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr, - IndexBase); + Value *Ptr = Builder.CreateGEP( + cast(Ty)->getElementType(), BasePtr, IndexBase); Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); CallInst *MaskedLoad = Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); @@ -1104,10 +1164,9 @@ static Optional instCombineST1ScatterIndex(InstCombiner &IC, Value *BasePtr = II.getOperand(2); Value *Index = II.getOperand(3); Type *Ty = Val->getType(); - Type *BasePtrTy = BasePtr->getType(); // Contiguous scatter => masked store. 
- // (sve.ld1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) + // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) // => (masked.store Value (gep BasePtr IndexBase) Align Mask) Value *IndexBase; if (match(Index, m_Intrinsic( @@ -1118,8 +1177,8 @@ static Optional instCombineST1ScatterIndex(InstCombiner &IC, Align Alignment = BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); - Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr, - IndexBase); + Value *Ptr = Builder.CreateGEP( + cast(Ty)->getElementType(), BasePtr, IndexBase); Type *VecPtrTy = PointerType::getUnqual(Ty); Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); @@ -1165,6 +1224,52 @@ static Optional instCombineSVESDIV(InstCombiner &IC, return None; } +static Optional instCombineMaxMinNM(InstCombiner &IC, + IntrinsicInst &II) { + Value *A = II.getArgOperand(0); + Value *B = II.getArgOperand(1); + if (A == B) + return IC.replaceInstUsesWith(II, A); + + return None; +} + +static Optional instCombineSVESrshl(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(&II); + Value *Pred = II.getOperand(0); + Value *Vec = II.getOperand(1); + Value *Shift = II.getOperand(2); + + // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. + Value *AbsPred, *MergedValue; + if (!match(Vec, m_Intrinsic( + m_Value(MergedValue), m_Value(AbsPred), m_Value())) && + !match(Vec, m_Intrinsic( + m_Value(MergedValue), m_Value(AbsPred), m_Value()))) + + return None; + + // Transform is valid if any of the following are true: + // * The ABS merge value is an undef or non-negative + // * The ABS predicate is all active + // * The ABS predicate and the SRSHL predicates are the same + if (!isa(MergedValue) && + !match(MergedValue, m_NonNegative()) && + AbsPred != Pred && !isAllActivePredicate(AbsPred)) + return None; + + // Only valid when the shift amount is non-negative, otherwise the rounding + // behaviour of SRSHL cannot be ignored. + if (!match(Shift, m_NonNegative())) + return None; + + auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, + {Pred, Vec, Shift}); + + return IC.replaceInstUsesWith(II, LSL); +} + Optional AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -1172,6 +1277,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, switch (IID) { default: break; + case Intrinsic::aarch64_neon_fmaxnm: + case Intrinsic::aarch64_neon_fminnm: + return instCombineMaxMinNM(IC, II); case Intrinsic::aarch64_sve_convert_from_svbool: return instCombineConvertFromSVBool(IC, II); case Intrinsic::aarch64_sve_dup: @@ -1227,6 +1335,10 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEST1(IC, II, DL); case Intrinsic::aarch64_sve_sdiv: return instCombineSVESDIV(IC, II); + case Intrinsic::aarch64_sve_sel: + return instCombineSVESel(IC, II); + case Intrinsic::aarch64_sve_srshl: + return instCombineSVESrshl(IC, II); } return None; @@ -1262,7 +1374,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, ArrayRef Args) { // A helper that returns a vector type from the given type. The number of - // elements in type Ty determine the vector width. + // elements in type Ty determines the vector width. 
auto toVectorTy = [&](Type *ArgTy) { return VectorType::get(ArgTy->getScalarType(), cast(DstTy)->getElementCount()); @@ -1277,26 +1389,32 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the // instructions. // - // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // TODO: Add additional widening operations (e.g., shl, etc.) once we // verify that their extending operands are eliminated during code // generation. switch (Opcode) { case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Mul: // SMULL(2), UMULL(2) break; default: return false; } // To be a widening instruction (either the "wide" or "long" versions), the - // second operand must be a sign- or zero extend having a single user. We - // only consider extends having a single user because they may otherwise not - // be eliminated. + // second operand must be a sign- or zero extend. if (Args.size() != 2 || - (!isa(Args[1]) && !isa(Args[1])) || - !Args[1]->hasOneUse()) + (!isa(Args[1]) && !isa(Args[1]))) return false; auto *Extend = cast(Args[1]); + auto *Arg0 = dyn_cast(Args[0]); + + // A mul only has a mull version (not like addw). Both operands need to be + // extending and the same type. + if (Opcode == Instruction::Mul && + (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || + Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) + return false; // Legalize the destination type and ensure it can be used in a widening // operation. @@ -1334,7 +1452,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // If the cast is observable, and it is used by a widening instruction (e.g., // uaddl, saddw, etc.), it may be free. 
- if (I && I->hasOneUse()) { + if (I && I->hasOneUser()) { auto *SingleUser = cast(*I->user_begin()); SmallVector Operands(SingleUser->operand_values()); if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { @@ -1606,6 +1724,36 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost); + static const TypeConversionCostTblEntry FP16Tbl[] = { + {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, + {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, + {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs + {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, + {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn + {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, + {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, + {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs + {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, + {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn + {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, + {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs + {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, + {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs + {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, + {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf + {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf + {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf + {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf + }; + + if (ST->hasFullFP16()) + if (const auto *Entry = ConvertCostTableLookup( + FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost); + return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } @@ -1723,24 +1871,12 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); - - // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), - // add in the widening overhead specified by the sub-target. Since the - // extends feeding widening instructions are performed automatically, they - // aren't present in the generated code and have a zero cost. By adding a - // widening overhead here, we attach the total cost of the combined operation - // to the widening instruction. - InstructionCost Cost = 0; - if (isWideningInstruction(Ty, Opcode, Args)) - Cost += ST->getWideningBaseCost(); - int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { default: - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::SDIV: if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -1748,26 +1884,22 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. 
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + InstructionCost Cost = getArithmeticInstrCost( + Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info, + Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + Cost += getArithmeticInstrCost( + Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info, + Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return Cost; } LLVM_FALLTHROUGH; - case ISD::UDIV: + case ISD::UDIV: { if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { auto VT = TLI->getValueType(DL, Ty); if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { @@ -1787,9 +1919,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } } - Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + InstructionCost Cost = BaseT::getArithmeticInstrCost( + Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); if (Ty->isVectorTy()) { // On AArch64, vector divisions are not supported natively and are // expanded into scalar divisions of each pair of elements. @@ -1804,27 +1935,31 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( Cost += Cost; } return Cost; - + } case ISD::MUL: - if (LT.second != MVT::v2i64) - return (Cost + 1) * LT.first; // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive // as elements are extracted from the vectors and the muls scalarized. // As getScalarizationOverhead is a bit too pessimistic, we estimate the // cost for a i64 vector directly here, which is: - // - four i64 extracts, - // - two i64 inserts, and - // - two muls. - // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with - // LT.first = 2 the cost is 16. - return LT.first * 8; + // - four 2-cost i64 extracts, + // - two 2-cost i64 inserts, and + // - two 1-cost muls. + // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with + // LT.first = 2 the cost is 28. If both operands are extensions it will not + // need to scalarize so the cost can be cheaper (smull or umull). + if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + return LT.first; + return LT.first * 14; case ISD::ADD: case ISD::XOR: case ISD::OR: case ISD::AND: + case ISD::SRL: + case ISD::SRA: + case ISD::SHL: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. - return (Cost + 1) * LT.first; + return LT.first; case ISD::FADD: case ISD::FSUB: @@ -1834,11 +1969,10 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // These nodes are marked as 'custom' just to lower them to SVE. // We know said lowering will incur no additional cost. 
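The revised v2i64 mul cost is pure scalarization arithmetic: four lane extracts and two lane inserts at an assumed cost of 2 each, plus two scalar multiplies at cost 1, for every legalized <2 x i64> piece. A small sketch of that arithmetic under exactly those per-operation costs (the constants come from the comment in the hunk, not from querying a cost model):

  #include <cassert>

  // Cost of scalarizing <2 x i64> multiplies, per the breakdown in the
  // comment: 4 extracts (cost 2 each) + 2 inserts (cost 2 each) + 2 muls
  // (cost 1 each) = 14 per piece. LTFirst is the legalization factor, i.e.
  // how many legal <2 x i64> pieces the original vector splits into.
  unsigned v2i64MulCost(unsigned LTFirst) {
    const unsigned Extracts = 4 * 2;
    const unsigned Inserts = 2 * 2;
    const unsigned Muls = 2 * 1;
    return LTFirst * (Extracts + Inserts + Muls);
  }

  int main() {
    assert(v2i64MulCost(1) == 14); // v2i64
    assert(v2i64MulCost(2) == 28); // v4i64 legalized into two v2i64
  }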
if (!Ty->getScalarType()->isFP128Ty()) - return (Cost + 2) * LT.first; + return 2 * LT.first; - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); } } @@ -1946,6 +2080,10 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return Options; } +bool AArch64TTIImpl::prefersVectorizedAddressing() const { + return ST->hasSVE(); +} + InstructionCost AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -2559,11 +2697,97 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp) { + VectorType *SubTp, + ArrayRef Args) { + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + // If we have a Mask, and the LT is being legalized somehow, split the Mask + // into smaller vectors and sum the cost of each shuffle. + if (!Mask.empty() && isa(Tp) && LT.second.isVector() && + Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && + cast(Tp)->getNumElements() > + LT.second.getVectorNumElements() && + !Index && !SubTp) { + unsigned TpNumElts = cast(Tp)->getNumElements(); + assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); + unsigned LTNumElts = LT.second.getVectorNumElements(); + unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; + VectorType *NTp = + VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); + InstructionCost Cost; + for (unsigned N = 0; N < NumVecs; N++) { + SmallVector NMask; + // Split the existing mask into chunks of size LTNumElts. Track the source + // sub-vectors to ensure the result has at most 2 inputs. + unsigned Source1, Source2; + unsigned NumSources = 0; + for (unsigned E = 0; E < LTNumElts; E++) { + int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] + : UndefMaskElem; + if (MaskElt < 0) { + NMask.push_back(UndefMaskElem); + continue; + } + + // Calculate which source from the input this comes from and whether it + // is new to us. + unsigned Source = MaskElt / LTNumElts; + if (NumSources == 0) { + Source1 = Source; + NumSources = 1; + } else if (NumSources == 1 && Source != Source1) { + Source2 = Source; + NumSources = 2; + } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { + NumSources++; + } + + // Add to the new mask. For the NumSources>2 case these are not correct, + // but are only used for the modular lane number. + if (Source == Source1) + NMask.push_back(MaskElt % LTNumElts); + else if (Source == Source2) + NMask.push_back(MaskElt % LTNumElts + LTNumElts); + else + NMask.push_back(MaskElt % LTNumElts); + } + // If the sub-mask has at most 2 input sub-vectors then re-cost it using + // getShuffleCost. If not then cost it using the worst case. + if (NumSources <= 2) + Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + NTp, NMask, 0, nullptr, Args); + else if (any_of(enumerate(NMask), [&](const auto &ME) { + return ME.value() % LTNumElts == ME.index(); + })) + Cost += LTNumElts - 1; + else + Cost += LTNumElts; + } + return Cost; + } + Kind = improveShuffleKindFromMask(Kind, Mask); + + // Check for broadcast loads. 
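The mask-splitting loop above decomposes a shuffle whose type legalizes to several vectors into legal-width chunks, then re-costs each chunk by how many distinct source sub-vectors it reads: one or two sources can be re-costed as real shuffles, more fall back to a per-element worst case. A compact stand-alone model of the source-counting step (countChunkSources is an illustrative name; the real code also rewrites the lane indices into the sub-mask):

  #include <cstdio>
  #include <set>
  #include <vector>

  // For one legal-width chunk of a wide shuffle mask, count how many distinct
  // source sub-vectors of width ChunkElts the chunk reads from. Chunks with
  // <= 2 sources can be re-costed as single-source/two-source shuffles; more
  // sources fall back to a worst-case per-element estimate.
  unsigned countChunkSources(const std::vector<int> &Mask, unsigned Begin,
                             unsigned ChunkElts) {
    std::set<unsigned> Sources;
    for (unsigned E = 0; E < ChunkElts && Begin + E < Mask.size(); ++E) {
      int Elt = Mask[Begin + E];
      if (Elt < 0)
        continue; // undef lane, no source
      Sources.insert(static_cast<unsigned>(Elt) / ChunkElts);
    }
    return Sources.size();
  }

  int main() {
    // A v8 mask legalized as two v4 chunks: the first chunk reads sub-vectors
    // 0 and 1 (two-source shuffle), the second reads only sub-vector 1.
    std::vector<int> Mask = {0, 4, 1, 5, 6, 7, 6, 7};
    std::printf("%u %u\n", countChunkSources(Mask, 0, 4),
                countChunkSources(Mask, 4, 4)); // prints "2 1"
  }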
+ if (Kind == TTI::SK_Broadcast) { + bool IsLoad = !Args.empty() && isa(Args[0]); + if (IsLoad && LT.second.isVector() && + isLegalBroadcastLoad(Tp->getElementType(), + LT.second.getVectorElementCount())) + return 0; // broadcast is handled by ld1r + } + + // If we have 4 elements for the shuffle and a Mask, get the cost straight + // from the perfect shuffle tables. + if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && + (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && + all_of(Mask, [](int E) { return E < 8; })) + return getPerfectShuffleCost(Mask); + if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { + static const CostTblEntry ShuffleTbl[] = { // Broadcast shuffle kinds can be performed with 'dup'. { TTI::SK_Broadcast, MVT::v8i8, 1 }, @@ -2618,6 +2842,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. + { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64 + { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64 + { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64 // Broadcast shuffle kinds for scalable vectors { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, @@ -2655,11 +2885,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_Reverse, MVT::nxv4i1, 1 }, { TTI::SK_Reverse, MVT::nxv2i1, 1 }, }; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; } + if (Kind == TTI::SK_Splice && isa(Tp)) return getSpliceCost(Tp, Index); + + // Inserting a subvector can often be done with either a D, S or H register + // move, so long as the inserted vector is "aligned". + if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && + LT.second.getSizeInBits() <= 128 && SubTp) { + std::pair SubLT = + TLI->getTypeLegalizationCost(DL, SubTp); + if (SubLT.second.isVector()) { + int NumElts = LT.second.getVectorNumElements(); + int NumSubElts = SubLT.second.getVectorNumElements(); + if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) + return SubLT.first; + } + } + return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a6029b9f2445..d0aacb457a39 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -135,6 +135,8 @@ public: return ST->getVScaleForTuning(); } + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; + /// Try to return an estimate cost factor that can be used as a multiplier /// when scalarizing an operation for a vector with ElementCount \p VF. 
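The zero-cost broadcast above is gated by the isLegalBroadcastLoad hook added to the header later in this patch: ld1r covers fixed-length NEON vectors with 8/16/32/64-bit elements whose total size is at least 64 bits. A plain-C++ restatement of that predicate, with NEON availability and scalability folded into booleans for the sketch (ld1rLegal is an illustrative name):

  #include <cassert>

  // Restates the ld1r legality rule: NEON only, no scalable vectors, element
  // width in {8,16,32,64}, and the whole vector at least 64 bits wide.
  bool ld1rLegal(bool HasNEON, bool Scalable, unsigned ElementBits,
                 unsigned NumElements) {
    if (!HasNEON || Scalable)
      return false;
    switch (ElementBits) {
    case 8: case 16: case 32: case 64:
      return NumElements * ElementBits >= 64;
    default:
      return false;
    }
  }

  int main() {
    assert(ld1rLegal(true, false, 8, 8));  // v8i8 is 64 bits: ok
    assert(!ld1rLegal(true, false, 8, 4)); // v4i8 is 32 bits: too narrow
    assert(!ld1rLegal(true, true, 32, 4)); // scalable vectors are rejected
  }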
/// For scalable vectors this currently takes the most pessimistic view based @@ -148,6 +150,8 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); + bool prefersVectorizedAddressing() const; + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); @@ -278,6 +282,23 @@ public: return isLegalMaskedGatherScatter(DataType); } + bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const { + // Return true if we can generate a `ld1r` splat load instruction. + if (!ST->hasNEON() || NumElements.isScalable()) + return false; + switch (unsigned ElementBits = ElementTy->getScalarSizeInBits()) { + case 8: + case 16: + case 32: + case 64: { + // We accept bit-widths >= 64bits and elements {8,16,32,64} bits. + unsigned VectorBits = NumElements.getFixedValue() * ElementBits; + return VectorBits >= 64; + } + } + return false; + } + bool isLegalNTStore(Type *DataType, Align Alignment) { // NOTE: The logic below is mostly geared towards LV, which calls it with // vectors with 2 elements. We might want to improve that, if other @@ -330,7 +351,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef Args = None); /// @} }; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 33ed7ae9780e..ade23f643538 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -127,7 +127,7 @@ private: return Prefix; } - PrefixInfo() : Active(false), Predicated(false) {} + PrefixInfo() = default; bool isActive() const { return Active; } bool isPredicated() const { return Predicated; } unsigned getElementSize() const { @@ -141,8 +141,8 @@ private: } private: - bool Active; - bool Predicated; + bool Active = false; + bool Predicated = false; unsigned ElementSize; unsigned Dst; unsigned Pg; @@ -157,7 +157,8 @@ private: bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); - AArch64CC::CondCode parseCondCodeString(StringRef Cond); + AArch64CC::CondCode parseCondCodeString(StringRef Cond, + std::string &Suggestion); bool parseCondCode(OperandVector &Operands, bool invertCondCode); unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); bool parseRegister(OperandVector &Operands); @@ -189,6 +190,7 @@ private: bool parseDirectiveUnreq(SMLoc L); bool parseDirectiveCFINegateRAState(); bool parseDirectiveCFIBKeyFrame(); + bool parseDirectiveCFIMTETaggedFrame(); bool parseDirectiveVariantPCS(SMLoc L); @@ -2425,7 +2427,7 @@ static Optional> parseVectorKind(StringRef Suffix, } static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) { - return parseVectorKind(Suffix, VectorKind).hasValue(); + return parseVectorKind(Suffix, VectorKind).has_value(); } static unsigned matchSVEDataVectorRegName(StringRef Name) { @@ -2758,8 +2760,8 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { } auto PRFM = LookupByEncoding(MCE->getValue()); - Operands.push_back(AArch64Operand::CreatePrefetch( - prfop, PRFM.getValueOr(""), S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, PRFM.value_or(""), + S, getContext())); return MatchOperand_Success; } @@ -3029,8 +3031,10 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) { return 
MatchOperand_Success; } -/// parseCondCodeString - Parse a Condition Code string. -AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { +/// parseCondCodeString - Parse a Condition Code string, optionally returning a +/// suggestion to help common typos. +AArch64CC::CondCode +AArch64AsmParser::parseCondCodeString(StringRef Cond, std::string &Suggestion) { AArch64CC::CondCode CC = StringSwitch(Cond.lower()) .Case("eq", AArch64CC::EQ) .Case("ne", AArch64CC::NE) @@ -3053,7 +3057,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { .Default(AArch64CC::Invalid); if (CC == AArch64CC::Invalid && - getSTI().getFeatureBits()[AArch64::FeatureSVE]) + getSTI().getFeatureBits()[AArch64::FeatureSVE]) { CC = StringSwitch(Cond.lower()) .Case("none", AArch64CC::EQ) .Case("any", AArch64CC::NE) @@ -3067,6 +3071,9 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { .Case("tstop", AArch64CC::LT) .Default(AArch64CC::Invalid); + if (CC == AArch64CC::Invalid && Cond.lower() == "nfirst") + Suggestion = "nfrst"; + } return CC; } @@ -3078,9 +3085,14 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands, assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); StringRef Cond = Tok.getString(); - AArch64CC::CondCode CC = parseCondCodeString(Cond); - if (CC == AArch64CC::Invalid) - return TokError("invalid condition code"); + std::string Suggestion; + AArch64CC::CondCode CC = parseCondCodeString(Cond, Suggestion); + if (CC == AArch64CC::Invalid) { + std::string Msg = "invalid condition code"; + if (!Suggestion.empty()) + Msg += ", did you mean " + Suggestion + "?"; + return TokError(Msg); + } Lex(); // Eat identifier token. if (invertCondCode) { @@ -3910,7 +3922,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) { const MCRegisterInfo *RI = getContext().getRegisterInfo(); unsigned PrevReg = FirstReg; - unsigned Count = 1; SmallSet DRegs; AArch64Operand::ComputeRegsForAlias(FirstReg, DRegs, ElementWidth); @@ -3942,7 +3953,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) { } PrevReg = Reg; - ++Count; } if (parseToken(AsmToken::RCurly, "'}' expected")) @@ -4545,9 +4555,14 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + (Head.data() - Name.data())); - AArch64CC::CondCode CC = parseCondCodeString(Head); - if (CC == AArch64CC::Invalid) - return Error(SuffixLoc, "invalid condition code"); + std::string Suggestion; + AArch64CC::CondCode CC = parseCondCodeString(Head, Suggestion); + if (CC == AArch64CC::Invalid) { + std::string Msg = "invalid condition code"; + if (!Suggestion.empty()) + Msg += ", did you mean " + Suggestion + "?"; + return Error(SuffixLoc, Msg); + } Operands.push_back(AArch64Operand::CreateToken(".", SuffixLoc, getContext(), /*IsSuffix=*/true)); Operands.push_back( @@ -6024,6 +6039,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveCFINegateRAState(); else if (IDVal == ".cfi_b_key_frame") parseDirectiveCFIBKeyFrame(); + else if (IDVal == ".cfi_mte_tagged_frame") + parseDirectiveCFIMTETaggedFrame(); else if (IDVal == ".arch_extension") parseDirectiveArchExtension(Loc); else if (IDVal == ".variant_pcs") @@ -6198,12 +6215,11 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { if (Extension.Features.none()) report_fatal_error("unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? 
(~Features & Extension.Features) - : ( Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); break; } } @@ -6217,8 +6233,7 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { StringRef Name = getParser().parseStringToEndOfStatement().trim(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.arch_extension' directive")) + if (parseEOL()) return true; bool EnableFeature = true; @@ -6236,12 +6251,11 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { if (Extension.Features.none()) return Error(ExtLoc, "unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? (~Features & Extension.Features) - : (Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); return false; } @@ -6281,7 +6295,6 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions); - FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { // Advance source location past '+'. CurLoc = incrementLoc(CurLoc, 1); @@ -6301,12 +6314,12 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { if (Extension.Features.none()) report_fatal_error("unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? (~Features & Extension.Features) - : ( Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset Features = STI.getFeatureBits(); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); FoundExtension = true; break; @@ -6401,12 +6414,10 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { if (Idx + 1 == NbArgs) break; - if (parseToken(AsmToken::Comma, - "unexpected token in '" + Twine(IDVal) + "' directive")) + if (parseComma()) return true; } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '" + Twine(IDVal) + "' directive")) + if (parseEOL()) return true; getStreamer().emitLOHDirective((MCLOHType)Kind, Args); @@ -6416,7 +6427,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { /// parseDirectiveLtorg /// ::= .ltorg | .pool bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; getTargetStreamer().emitCurrentConstantPool(); return false; @@ -6474,8 +6485,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { return Error(SRegLoc, "register name or alias expected"); // Shouldn't be anything else. 
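In the .cpu handler, the FeatureBitset is now re-read from the subtarget on every loop iteration; the earlier shape captured it once before the loop, so a second '+ext' was computed against stale bits. A toy model of why the re-read matters, with std::bitset standing in for FeatureBitset and plain XOR toggling standing in for SetFeatureBitsTransitively/ToggleFeature (implied-feature handling is deliberately ignored in this sketch):

  #include <bitset>
  #include <cassert>

  using Features = std::bitset<8>;

  // Toy subtarget: toggling applies the given delta to the current state.
  struct Subtarget {
    Features Bits;
    Features toggle(Features Delta) { Bits ^= Delta; return Bits; }
  };

  int main() {
    Subtarget STI;
    Features ExtA("00000011"), ExtB("00000110"); // overlapping extensions

    // Correct: recompute the missing bits against the *current* state for
    // each extension, as the patched loop does.
    for (Features Ext : {ExtA, ExtB})
      STI.toggle(~STI.Bits & Ext); // enable only the not-yet-set bits
    assert(STI.Bits == (ExtA | ExtB));

    // Buggy variant (the pre-patch shape): a snapshot taken before the loop
    // makes the second toggle flip a bit ExtA already enabled.
    Subtarget Stale;
    Features Snapshot = Stale.Bits;
    for (Features Ext : {ExtA, ExtB})
      Stale.toggle(~Snapshot & Ext);
    assert(Stale.Bits != (ExtA | ExtB)); // bit 1 toggled twice: back off
  }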
- if (parseToken(AsmToken::EndOfStatement, - "unexpected input in .req directive")) + if (parseEOL()) return true; auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); @@ -6496,7 +6506,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { } bool AArch64AsmParser::parseDirectiveCFINegateRAState() { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; getStreamer().emitCFINegateRAState(); return false; @@ -6505,31 +6515,31 @@ bool AArch64AsmParser::parseDirectiveCFINegateRAState() { /// parseDirectiveCFIBKeyFrame /// ::= .cfi_b_key bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.cfi_b_key_frame'")) + if (parseEOL()) return true; getStreamer().emitCFIBKeyFrame(); return false; } +/// parseDirectiveCFIMTETaggedFrame +/// ::= .cfi_mte_tagged_frame +bool AArch64AsmParser::parseDirectiveCFIMTETaggedFrame() { + if (parseEOL()) + return true; + getStreamer().emitCFIMTETaggedFrame(); + return false; +} + /// parseDirectiveVariantPCS /// ::= .variant_pcs symbolname bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) { - const AsmToken &Tok = getTok(); - if (Tok.isNot(AsmToken::Identifier)) + StringRef Name; + if (getParser().parseIdentifier(Name)) return TokError("expected symbol name"); - - StringRef SymbolName = Tok.getIdentifier(); - - MCSymbol *Sym = getContext().lookupSymbol(SymbolName); - if (!Sym) - return TokError("unknown symbol"); - - Lex(); // Eat the symbol - if (parseEOL()) return true; - getTargetStreamer().emitDirectiveVariantPCS(Sym); + getTargetStreamer().emitDirectiveVariantPCS( + getContext().getOrCreateSymbol(Name)); return false; } @@ -6880,7 +6890,7 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // as a literal token. if (Op.isTokenEqual("za")) return Match_Success; - break; + return Match_InvalidOperand; } if (!Op.isImm()) return Match_InvalidOperand; diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 9ce00f76d9c7..1b65589416c3 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -16,9 +16,10 @@ #include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm-c/Disassembler.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -37,213 +38,226 @@ using DecodeStatus = MCDisassembler::DecodeStatus; // Forward declare these because the autogenerated code will reference them. // Definitions are further down. 
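The .variant_pcs rewrite above also changes symbol semantics: lookupSymbol rejected a name the assembler had not seen yet, while getOrCreateSymbol makes a forward reference legal. A minimal map-based sketch of that difference (SymbolTable, lookup and getOrCreate are illustrative stand-ins for MCContext):

  #include <cassert>
  #include <map>
  #include <string>

  struct Symbol { std::string Name; };

  struct SymbolTable {
    std::map<std::string, Symbol> Table;

    // Lookup-only: nullptr when the assembler has not seen the name yet,
    // which is what made ".variant_pcs f" before "f:" an error.
    Symbol *lookup(const std::string &Name) {
      auto It = Table.find(Name);
      return It == Table.end() ? nullptr : &It->second;
    }

    // Get-or-create: forward references become legal, matching the patched
    // directive handler.
    Symbol *getOrCreate(const std::string &Name) {
      return &Table.try_emplace(Name, Symbol{Name}).first->second;
    }
  };

  int main() {
    SymbolTable Ctx;
    assert(Ctx.lookup("f") == nullptr);      // pre-patch behaviour: reject
    assert(Ctx.getOrCreate("f") != nullptr); // post-patch: create and attach
  }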
-static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - 
const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, - unsigned RegMask, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, 
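The long run of prototype updates in this file replaces the opaque const void *Decoder parameter with a typed const MCDisassembler *, which deletes the static_cast boilerplate at every use site (visible where tryAddingSymbolicOperand is called further down). A before/after sketch of the pattern with stand-in types (Disassembler and tryAddSymbolicOperand are illustrative, not the MC API):

  // Stand-in for the disassembler type involved.
  struct Disassembler {
    bool tryAddSymbolicOperand(long) const { return false; }
  };

  // Pre-patch shape: every decoder received an untyped pointer and had to
  // cast it back before use.
  static bool decodeOld(unsigned Imm, const void *Decoder) {
    const auto *Dis = static_cast<const Disassembler *>(Decoder);
    return Dis->tryAddSymbolicOperand(Imm);
  }

  // Post-patch shape: the table-generated callers pass the typed pointer, so
  // the cast (and the chance of casting to the wrong type) disappears.
  static bool decodeNew(unsigned Imm, const Disassembler *Decoder) {
    return Decoder->tryAddSymbolicOperand(Imm);
  }

  int main() {
    Disassembler D;
    decodeOld(4, &D);
    decodeNew(4, &D);
  }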
- const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); 
static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); template static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template -static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -270,7 +284,8 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { static MCDisassembler *createAArch64Disassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new AArch64Disassembler(STI, Ctx); + + return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo()); } DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, @@ -295,67 +310,37 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, DecodeStatus Result = decodeInstruction(Table, MI, Insn, Address, this, STI); - switch (MI.getOpcode()) { - default: - break; + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + // For Scalable Matrix Extension (SME) instructions that have an implicit - // operand for the accumulator (ZA) which isn't encoded, manually insert - // operand. - case AArch64::LDR_ZA: - case AArch64::STR_ZA: { - MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZA)); - // Spill and fill instructions have a single immediate used for both the - // vector select offset and optional memory offset. Replicate the decoded - // immediate. + // operand for the accumulator (ZA) or implicit immediate zero which isn't + // encoded, manually insert operand. 
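The replacement strategy, in the loop immediately below, walks the instruction's operand descriptions instead of switching over dozens of opcodes: any operand that the table marks as an implicit, unencoded register or zero immediate is materialized at its declared position. A simplified stand-alone model of that loop (OperandDesc and Implicit are illustrative stand-ins for the MCOI operand info):

  #include <cassert>
  #include <vector>

  // Illustrative operand description: which implicit value, if any, must be
  // materialized because the encoding carries no bits for it.
  enum class Implicit { None, RegZA, Imm0 };
  struct OperandDesc { Implicit Kind; };

  using Operand = int; // toy operand; real code uses MCOperand
  const Operand RegZAOp = -1;

  // Insert implicit operands at their declared positions, in order, mirroring
  // the data-driven loop: positions shift as we insert, which is why the loop
  // indexes the description and inserts at MI.begin() + i.
  void insertImplicitOps(std::vector<Operand> &MI,
                         const std::vector<OperandDesc> &Desc) {
    for (unsigned I = 0; I < Desc.size(); ++I) {
      if (Desc[I].Kind == Implicit::RegZA)
        MI.insert(MI.begin() + I, RegZAOp);
      else if (Desc[I].Kind == Implicit::Imm0)
        MI.insert(MI.begin() + I, 0);
    }
  }

  int main() {
    // Decoded operands {7, 9}; the description says operand 0 is an implicit
    // ZA register and operand 3 an implicit zero immediate.
    std::vector<Operand> MI = {7, 9};
    insertImplicitOps(MI, {{Implicit::RegZA},
                           {Implicit::None},
                           {Implicit::None},
                           {Implicit::Imm0}});
    assert((MI == std::vector<Operand>{RegZAOp, 7, 9, 0}));
  }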
+ for (unsigned i = 0; i < Desc.getNumOperands(); i++) { + if (Desc.OpInfo[i].OperandType == MCOI::OPERAND_REGISTER) { + switch (Desc.OpInfo[i].RegClass) { + default: + break; + case AArch64::MPRRegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA)); + break; + case AArch64::MPR8RegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0)); + break; + } + } else if (Desc.OpInfo[i].OperandType == + AArch64::OPERAND_IMPLICIT_IMM_0) { + MI.insert(MI.begin() + i, MCOperand::createImm(0)); + } + } + + if (MI.getOpcode() == AArch64::LDR_ZA || + MI.getOpcode() == AArch64::STR_ZA) { + // Spill and fill instructions have a single immediate used for both + // the vector select offset and optional memory offset. Replicate + // the decoded immediate. const MCOperand &Imm4Op = MI.getOperand(2); assert(Imm4Op.isImm() && "Unexpected operand type!"); MI.addOperand(Imm4Op); - break; - } - case AArch64::LD1_MXIPXX_H_B: - case AArch64::LD1_MXIPXX_V_B: - case AArch64::ST1_MXIPXX_H_B: - case AArch64::ST1_MXIPXX_V_B: - case AArch64::INSERT_MXIPZ_H_B: - case AArch64::INSERT_MXIPZ_V_B: - // e.g. - // MOVA ZA0.B[, ], /M, .B - // ^ insert implicit 8-bit element tile - MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZAB0)); - break; - case AArch64::EXTRACT_ZPMXI_H_B: - case AArch64::EXTRACT_ZPMXI_V_B: - // MOVA .B, /M, ZA0.B[, ] - // ^ insert implicit 8-bit element tile - MI.insert(MI.begin()+2, MCOperand::createReg(AArch64::ZAB0)); - break; - case AArch64::LD1_MXIPXX_H_Q: - case AArch64::LD1_MXIPXX_V_Q: - case AArch64::ST1_MXIPXX_H_Q: - case AArch64::ST1_MXIPXX_V_Q: - // 128-bit load/store have implicit zero vector index. - MI.insert(MI.begin()+2, MCOperand::createImm(0)); - break; - // 128-bit mova have implicit zero vector index. - case AArch64::INSERT_MXIPZ_H_Q: - case AArch64::INSERT_MXIPZ_V_Q: - MI.insert(MI.begin()+2, MCOperand::createImm(0)); - break; - case AArch64::EXTRACT_ZPMXI_H_Q: - case AArch64::EXTRACT_ZPMXI_V_Q: - MI.addOperand(MCOperand::createImm(0)); - break; - case AArch64::SMOVvi8to32_idx0: - case AArch64::SMOVvi8to64_idx0: - case AArch64::SMOVvi16to32_idx0: - case AArch64::SMOVvi16to64_idx0: - case AArch64::SMOVvi32to64_idx0: - case AArch64::UMOVvi8_idx0: - case AArch64::UMOVvi16_idx0: - case AArch64::UMOVvi32_idx0: - case AArch64::UMOVvi64_idx0: - MI.addOperand(MCOperand::createImm(0)); - break; } if (Result != MCDisassembler::Fail) @@ -400,7 +385,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() { static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -410,9 +395,9 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); @@ -420,7 +405,7 @@ static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -432,7 +417,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus 
DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -444,7 +429,7 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -456,7 +441,7 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -466,9 +451,9 @@ static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 30) return Fail; @@ -481,7 +466,7 @@ static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -491,10 +476,9 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 22) return Fail; if (RegNo & 1) @@ -509,7 +493,7 @@ static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -518,10 +502,10 @@ static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 3) return Fail; @@ -534,7 +518,7 @@ static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -546,7 +530,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -558,7 +542,7 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -570,7 +554,7 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void 
*Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -578,7 +562,7 @@ static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -586,7 +570,7 @@ static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -597,7 +581,7 @@ static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -608,7 +592,7 @@ static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -617,10 +601,10 @@ static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, - unsigned RegMask, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, + uint64_t Address, + const MCDisassembler *Decoder) { if (RegMask > 0xFF) return Fail; Inst.addOperand(MCOperand::createImm(RegMask)); @@ -641,7 +625,8 @@ static const SmallVector, 5> template static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned LastReg = (1 << NumBitsForTile) - 1; if (RegNo > LastReg) return Fail; @@ -651,7 +636,8 @@ static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; @@ -663,7 +649,7 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; @@ -672,7 +658,8 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -682,7 +669,8 @@ static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -693,7 +681,7 @@ static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) 
{ if (RegNo > 31) return Fail; unsigned Register = @@ -703,7 +691,8 @@ static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -713,7 +702,8 @@ static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -724,7 +714,7 @@ static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -735,7 +725,7 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { // scale{5} is asserted as 1 in tblgen. Imm |= 0x20; Inst.addOperand(MCOperand::createImm(64 - Imm)); @@ -744,29 +734,29 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(64 - Imm)); return Success; } static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { int64_t ImmVal = Imm; - const AArch64Disassembler *Dis = - static_cast(Decoder); // Sign-extend 19-bit immediate. if (ImmVal & (1 << (19 - 1))) ImmVal |= ~((1LL << 19) - 1); - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr, - Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand( + Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(ImmVal)); return Success; } static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); Inst.addOperand(MCOperand::createImm(Imm & 1)); return Success; @@ -774,7 +764,7 @@ static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); // Every system register in the encoding space is valid with the syntax @@ -784,7 +774,7 @@ static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); return Success; @@ -792,7 +782,7 @@ static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // This decoder exists to add the dummy Lane operand to the MCInst, which must // be 1 in assembly but has no other real manifestation. 
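DecodePCRelLabel19 above, like the adr, test-and-branch and unconditional-branch decoders later in the file, sign-extends an N-bit field with the same two-line idiom. A generic restatement with a couple of checks (signExtend is an illustrative helper, not an LLVM API):

  #include <cassert>
  #include <cstdint>

  // Sign-extend the low N bits of Value: if the sign bit (bit N-1) is set,
  // fill every bit from N upward with ones. This is exactly the
  //   if (imm & (1 << (N - 1))) imm |= ~((1LL << N) - 1);
  // pattern used by the decoders.
  template <unsigned N> int64_t signExtend(int64_t Value) {
    static_assert(N > 0 && N < 64, "field width must fit in int64_t");
    if (Value & (1LL << (N - 1)))
      Value |= ~((1LL << N) - 1);
    return Value;
  }

  int main() {
    assert(signExtend<19>(0x7FFFF) == -1);      // all ones -> -1
    assert(signExtend<19>(0x3FFFF) == 0x3FFFF); // sign bit clear -> unchanged
    assert(signExtend<26>(1LL << 25) == -(1LL << 25)); // most negative value
  }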
unsigned Rd = fieldFromInstruction(Insn, 0, 5); @@ -826,66 +816,74 @@ static DecodeStatus DecodeVecShiftLImm(MCInst &Inst, unsigned Imm, } static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 64); } static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); } static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 32); } static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); } static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 16); } static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); } static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 8); } static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 64); } static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 32); } static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 16); } static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 8); } -static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rm = fieldFromInstruction(insn, 16, 5); @@ -947,7 +945,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned imm = fieldFromInstruction(insn, 5, 16); unsigned shift = fieldFromInstruction(insn, 21, 2); @@ -978,14 +976,12 @@ static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rt 
= fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned offset = fieldFromInstruction(insn, 10, 12); - const AArch64Disassembler *Dis = - static_cast(Decoder); switch (Inst.getOpcode()) { default: @@ -1034,14 +1030,14 @@ static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, } DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(offset)); return Success; } static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); int64_t offset = fieldFromInstruction(insn, 12, 9); @@ -1237,9 +1233,9 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rt2 = fieldFromInstruction(insn, 10, 5); @@ -1322,7 +1318,7 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rt2 = fieldFromInstruction(insn, 10, 5); @@ -1456,7 +1452,7 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 | @@ -1489,7 +1485,7 @@ static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rm = fieldFromInstruction(insn, 16, 5); @@ -1546,7 +1542,7 @@ static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Datasize = fieldFromInstruction(insn, 31, 1); @@ -1577,7 +1573,7 @@ static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned cmode = fieldFromInstruction(insn, 12, 4); unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; @@ -1616,7 +1612,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus 
 DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
-                            const void *Decoder) {
+                            const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
   unsigned cmode = fieldFromInstruction(insn, 12, 4);
   unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
@@ -1633,26 +1629,26 @@ static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
 }
 
 static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
-                                         uint64_t Addr, const void *Decoder) {
+                                         uint64_t Addr,
+                                         const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
   int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
   imm |= fieldFromInstruction(insn, 29, 2);
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
 
   // Sign-extend the 21-bit immediate.
   if (imm & (1 << (21 - 1)))
     imm |= ~((1LL << 21) - 1);
 
   DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-  if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+  if (!Decoder->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 0, 4))
     Inst.addOperand(MCOperand::createImm(imm));
 
   return Success;
 }
 
 static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
-                                         uint64_t Addr, const void *Decoder) {
+                                         uint64_t Addr,
+                                         const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
   unsigned Rn = fieldFromInstruction(insn, 5, 5);
   unsigned Imm = fieldFromInstruction(insn, 10, 14);
@@ -1661,8 +1657,6 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
   unsigned ShifterVal = (Imm >> 12) & 3;
   unsigned ImmVal = Imm & 0xFFF;
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
 
   if (ShifterVal != 0 && ShifterVal != 1)
     return Fail;
@@ -1681,7 +1675,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
     DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
   }
 
-  if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+  if (!Decoder->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 0, 4))
     Inst.addOperand(MCOperand::createImm(ImmVal));
   Inst.addOperand(MCOperand::createImm(12 * ShifterVal));
   return Success;
@@ -1689,24 +1683,22 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
 
 static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
                                               uint64_t Addr,
-                                              const void *Decoder) {
+                                              const MCDisassembler *Decoder) {
   int64_t imm = fieldFromInstruction(insn, 0, 26);
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
 
   // Sign-extend the 26-bit immediate.
   if (imm & (1 << (26 - 1)))
     imm |= ~((1LL << 26) - 1);
 
-  if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
+  if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4))
     Inst.addOperand(MCOperand::createImm(imm));
 
   return Success;
 }
 
-static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn,
-                                                  uint64_t Addr,
-                                                  const void *Decoder) {
+static DecodeStatus
+DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+                              const MCDisassembler *Decoder) {
   uint64_t op1 = fieldFromInstruction(insn, 16, 3);
   uint64_t op2 = fieldFromInstruction(insn, 5, 3);
   uint64_t crm = fieldFromInstruction(insn, 8, 4);
@@ -1726,22 +1718,20 @@ static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn,
   Inst.addOperand(MCOperand::createImm(pstate_field));
   Inst.addOperand(MCOperand::createImm(crm));
 
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
   auto PState = AArch64PState::lookupPStateByEncoding(pstate_field);
-  if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+  if (PState &&
+      PState->haveFeatures(Decoder->getSubtargetInfo().getFeatureBits()))
     return Success;
   return Fail;
 }
 
 static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
-                                        uint64_t Addr, const void *Decoder) {
+                                        uint64_t Addr,
+                                        const MCDisassembler *Decoder) {
   uint64_t Rt = fieldFromInstruction(insn, 0, 5);
   uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
   bit |= fieldFromInstruction(insn, 19, 5);
   int64_t dst = fieldFromInstruction(insn, 5, 14);
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
 
   // Sign-extend 14-bit immediate.
   if (dst & (1 << (14 - 1)))
@@ -1752,17 +1742,16 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
   else
     DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
   Inst.addOperand(MCOperand::createImm(bit));
-  if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
+  if (!Decoder->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 0, 4))
     Inst.addOperand(MCOperand::createImm(dst));
 
   return Success;
 }
 
-static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
-                                                        unsigned RegClassID,
-                                                        unsigned RegNo,
-                                                        uint64_t Addr,
-                                                        const void *Decoder) {
+static DecodeStatus
+DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegClassID,
+                                    unsigned RegNo, uint64_t Addr,
+                                    const MCDisassembler *Decoder) {
   // Register number must be even (see CASP instruction)
   if (RegNo & 0x1)
     return Fail;
@@ -1772,27 +1761,25 @@ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
   return Success;
 }
 
-static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
-                                                      unsigned RegNo,
-                                                      uint64_t Addr,
-                                                      const void *Decoder) {
+static DecodeStatus
+DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+                                  const MCDisassembler *Decoder) {
   return DecodeGPRSeqPairsClassRegisterClass(Inst,
                                              AArch64::WSeqPairsClassRegClassID,
                                              RegNo, Addr, Decoder);
 }
 
-static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
-                                                      unsigned RegNo,
-                                                      uint64_t Addr,
-                                                      const void *Decoder) {
+static DecodeStatus
+DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+                                  const MCDisassembler *Decoder) {
   return DecodeGPRSeqPairsClassRegisterClass(Inst,
                                              AArch64::XSeqPairsClassRegClassID,
                                              RegNo, Addr, Decoder);
 }
 
-static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
-                                                   uint64_t Addr,
-                                                   const void *Decoder) {
+static DecodeStatus
+DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+                               const MCDisassembler *Decoder) {
   unsigned Zdn = fieldFromInstruction(insn, 0, 5);
   unsigned imm = fieldFromInstruction(insn, 5, 13);
   if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
@@ -1808,7 +1795,7 @@ static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
 
 template <int Bits>
 static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
-                               const void *Decoder) {
+                               const MCDisassembler *Decoder) {
   if (Imm & ~((1LL << Bits) - 1))
     return Fail;
 
@@ -1822,8 +1809,8 @@ static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
 
 // Decode 8-bit signed/unsigned immediate for a given element width.
 template <int ElementWidth>
-static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
-                                     uint64_t Addr, const void *Decoder) {
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,
+                                     const MCDisassembler *Decoder) {
   unsigned Val = (uint8_t)Imm;
   unsigned Shift = (Imm & 0x100) ? 8 : 0;
   if (ElementWidth == 8 && Shift)
@@ -1835,13 +1822,14 @@ static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
 
 // Decode uimm4 ranged from 1-16.
 static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
-                                       uint64_t Addr, const void *Decoder) {
+                                       uint64_t Addr,
+                                       const MCDisassembler *Decoder) {
   Inst.addOperand(MCOperand::createImm(Imm + 1));
   return Success;
 }
 
 static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
-                                 const void *Decoder) {
+                                 const MCDisassembler *Decoder) {
   if (AArch64SVCR::lookupSVCRByEncoding(Imm)) {
     Inst.addOperand(MCOperand::createImm(Imm));
     return Success;
@@ -1851,7 +1839,7 @@ static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
 
 static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
                                               uint64_t Addr,
-                                              const void *Decoder) {
+                                              const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
   unsigned Rs = fieldFromInstruction(insn, 16, 5);
   unsigned Rn = fieldFromInstruction(insn, 5, 5);
@@ -1876,7 +1864,7 @@ static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
 
 static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
                                               uint64_t Addr,
-                                              const void *Decoder) {
+                                              const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
   unsigned Rm = fieldFromInstruction(insn, 16, 5);
   unsigned Rn = fieldFromInstruction(insn, 5, 5);
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 374a89edcb74..6761d449a7f4 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -13,13 +13,17 @@
 #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
 
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstrInfo.h"
 
 namespace llvm {
 
 class AArch64Disassembler : public MCDisassembler {
+  std::unique_ptr<MCInstrInfo const> const MCII;
+
 public:
-  AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
-      : MCDisassembler(STI, Ctx) {}
+  AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                      MCInstrInfo const *MCII)
+      : MCDisassembler(STI, Ctx), MCII(MCII) {}
 
   ~AArch64Disassembler() override = default;
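The hunks above are the mechanical heart of the disassembler refactor: every decoder callback now receives the const MCDisassembler * it belongs to instead of an opaque const void *, so the hand-written decoders can drop their static_cast boilerplate and query feature bits or the symbolizer through the base class. A minimal sketch of a decoder in the new style follows; the opcode-free body, the field layout, and the FeatureSVE check are invented for illustration, while Success/Fail, fieldFromInstruction, and the tryAddingSymbolicOperand signature are the ones visible in the hunks above:

  // Hypothetical decoder in the post-refactor shape (not from the patch).
  static DecodeStatus DecodeExampleImm(MCInst &Inst, uint32_t insn,
                                       uint64_t Addr,
                                       const MCDisassembler *Decoder) {
    unsigned imm = fieldFromInstruction(insn, 5, 16); // illustrative field
    // Subtarget feature bits are reachable straight off the base class now:
    if (!Decoder->getSubtargetInfo().getFeatureBits()[AArch64::FeatureSVE])
      return Fail;
    // So is the symbolizer hook; note the new OpSize argument (0 = unknown).
    if (!Decoder->tryAddingSymbolicOperand(Inst, imm, Addr, /*IsBranch=*/false,
                                           /*Offset=*/0, /*OpSize=*/0,
                                           /*InstSize=*/4))
      Inst.addOperand(MCOperand::createImm(imm));
    return Success;
  }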
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 5b6f06f8dbb4..11964b2075e5 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -60,7 +60,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) {
 /// an operand to the MCInst and Fail otherwise.
 bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
     MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
-    bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+    bool IsBranch, uint64_t Offset, uint64_t OpSize, uint64_t InstSize) {
   if (!SymbolLookUp)
     return false;
   // FIXME: This method shares a lot of code with
@@ -73,8 +73,8 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
   SymbolicOp.Value = Value;
   uint64_t ReferenceType;
   const char *ReferenceName;
-  if (!GetOpInfo ||
-      !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+  if (!GetOpInfo || !GetOpInfo(DisInfo, Address, /*Offset=*/0, OpSize, InstSize,
+                               1, &SymbolicOp)) {
     if (IsBranch) {
       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
       const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index dc72331660cc..ca677db49739 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -29,7 +29,8 @@ public:
   bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
                                 int64_t Value, uint64_t Address, bool IsBranch,
-                                uint64_t Offset, uint64_t InstSize) override;
+                                uint64_t Offset, uint64_t OpSize,
+                                uint64_t InstSize) override;
 };
 
 } // namespace llvm
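For out-of-tree symbolizers, the visible API change is the widened hook: tryAddingSymbolicOperand gains a uint64_t OpSize parameter between Offset and InstSize, forwarded to the GetOpInfo callback so a client can tell how wide the operand field being symbolized is. A sketch of how a custom MCSymbolizer subclass would now override it; MySymbolizer is hypothetical, the ctor is inherited, and a real subclass must also implement the pure virtual tryAddingPcLoadReferenceComment:

  // Hypothetical client symbolizer, abbreviated to the changed hook.
  class MySymbolizer : public MCSymbolizer {
  public:
    using MCSymbolizer::MCSymbolizer;
    bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
                                  int64_t Value, uint64_t Address,
                                  bool IsBranch, uint64_t Offset,
                                  uint64_t OpSize, uint64_t InstSize) override {
      // OpSize is the new parameter; callers pass 0 when it is unknown.
      return false; // defer to the default immediate-operand handling
    }
  };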
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 097b93e4fcca..89e1d85a6085 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -18,6 +18,7 @@
 #include "AArch64Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -1058,10 +1059,10 @@ bool AArch64CallLowering::lowerTailCall(
 
   // If Callee is a reg, since it is used by a target specific instruction,
   // it must have a register class matching the constraint of that instruction.
-  if (Info.Callee.isReg())
+  if (MIB->getOperand(0).isReg())
     constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
                              *MF.getSubtarget().getRegBankInfo(), *MIB,
-                             MIB->getDesc(), Info.Callee, 0);
+                             MIB->getDesc(), MIB->getOperand(0), 0);
 
   MF.getFrameInfo().setHasTailCall();
   Info.LoweredTailCall = true;
@@ -1127,14 +1128,39 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
+
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  unsigned Opc = 0;
+  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
+  // be expanded to the call, directly followed by a special marker sequence and
+  // a call to an ObjC library function.
+  if (Info.CB && objcarc::hasAttachedCallOpBundle(Info.CB))
+    Opc = AArch64::BLR_RVMARKER;
+  // A call to a returns twice function like setjmp must be followed by a bti
+  // instruction.
+  else if (Info.CB &&
+           Info.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
+           !Subtarget.noBTIAtReturnTwice() &&
+           MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
+    Opc = AArch64::BLR_BTI;
+  else
+    Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
 
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+  unsigned CalleeOpNo = 0;
+
+  if (Opc == AArch64::BLR_RVMARKER) {
+    // Add a target global address for the retainRV/claimRV runtime function
+    // just before the call target.
+    Function *ARCFn = *objcarc::getAttachedARCFunction(Info.CB);
+    MIB.addGlobalAddress(ARCFn);
+    ++CalleeOpNo;
+  }
+
   MIB.add(Info.Callee);
 
   // Tell the call which registers are clobbered.
   const uint32_t *Mask;
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const auto *TRI = Subtarget.getRegisterInfo();
 
   AArch64OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg,
@@ -1160,10 +1186,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   // If Callee is a reg, since it is used by a target specific
   // instruction, it must have a register class matching the
   // constraint of that instruction.
-  if (Info.Callee.isReg())
+  if (MIB->getOperand(CalleeOpNo).isReg())
     constrainOperandRegClass(MF, *TRI, MRI, *Subtarget.getInstrInfo(),
                              *Subtarget.getRegBankInfo(), *MIB, MIB->getDesc(),
-                             Info.Callee, 0);
+                             MIB->getOperand(CalleeOpNo), CalleeOpNo);
 
   // Finally we can copy the returned value back into its virtual-register. In
   // symmetry with the arguments, the physical register must be an
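GlobalISel's lowerCall now mirrors what SelectionDAG has long done for ObjC attachedcall bundles and for returns_twice callees under branch-target enforcement. The decision reads as a three-way ladder; restated out of context below for review purposes (the opcodes, helpers, and attributes are the ones in the hunk above, but the free-function wrapper itself is just exposition, not code from the patch):

  // Exposition-only restatement of the new opcode choice in lowerCall.
  static unsigned pickCallOpcode(const CallLowering::CallLoweringInfo &Info,
                                 const AArch64Subtarget &Subtarget,
                                 MachineFunction &MF) {
    // ObjC retainRV/claimRV: a pseudo keeps the marker sequence and the
    // runtime call glued to the call site until late expansion.
    if (Info.CB && objcarc::hasAttachedCallOpBundle(Info.CB))
      return AArch64::BLR_RVMARKER;
    // setjmp-like callees: the second "return" lands right after the call,
    // so that point must be a valid BTI landing pad.
    if (Info.CB &&
        Info.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
        !Subtarget.noBTIAtReturnTwice() &&
        MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
      return AArch64::BLR_BTI;
    return getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall=*/false);
  }

Note also that once BLR_RVMARKER prepends the runtime function as operand 0, the callee is no longer operand 0, which is why the register-class constraining in the hunk switches from Info.Callee to MIB->getOperand(CalleeOpNo).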
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 703e356f016d..9a65687735fe 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -21,13 +21,16 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -38,9 +41,9 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
-#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -62,6 +65,7 @@ namespace {
 
 #include "AArch64GenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATE_BITSET
+
 class AArch64InstructionSelector : public InstructionSelector {
 public:
   AArch64InstructionSelector(const AArch64TargetMachine &TM,
@@ -293,6 +297,20 @@ private:
   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
 
+  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
+  /// In some cases this is even possible with OR operations in the expression.
+  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
+                                MachineIRBuilder &MIB) const;
+  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
+                                          CmpInst::Predicate CC,
+                                          AArch64CC::CondCode Predicate,
+                                          AArch64CC::CondCode OutCC,
+                                          MachineIRBuilder &MIB) const;
+  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
+                                   bool Negate, Register CCOp,
+                                   AArch64CC::CondCode Predicate,
+                                   MachineIRBuilder &MIB) const;
+
   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
   /// \p IsNegative is true if the test should be "not zero".
   /// This will also optimize the test bit instruction when possible.
@@ -419,12 +437,16 @@ private:
                       int OpIdx = -1) const;
   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
+  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
+                                    const MachineInstr &MI,
+                                    int OpIdx = -1) const;
 
   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
 
   // Optimization methods.
-  bool tryOptSelect(MachineInstr &MI);
+  bool tryOptSelect(GSelect &Sel);
+  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                       MachineOperand &Predicate,
                                       MachineIRBuilder &MIRBuilder) const;
@@ -485,9 +507,11 @@ AArch64InstructionSelector::AArch64InstructionSelector(
 
 // FIXME: This should be target-independent, inferred from the types declared
 // for each class in the bank.
+//
+/// Given a register bank, and a type, return the smallest register class that
+/// can represent that combination.
 static const TargetRegisterClass *
 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
-                         const RegisterBankInfo &RBI,
                          bool GetAllRegSet = false) {
   if (RB.getID() == AArch64::GPRRegBankID) {
     if (Ty.getSizeInBits() <= 32)
@@ -828,39 +852,6 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
   return GenericOpc;
 }
 
-#ifndef NDEBUG
-/// Helper function that verifies that we have a valid copy at the end of
-/// selectCopy. Verifies that the source and dest have the expected sizes and
-/// then returns true.
-static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
-                        const MachineRegisterInfo &MRI,
-                        const TargetRegisterInfo &TRI,
-                        const RegisterBankInfo &RBI) {
-  const Register DstReg = I.getOperand(0).getReg();
-  const Register SrcReg = I.getOperand(1).getReg();
-  const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
-  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
-
-  // Make sure the size of the source and dest line up.
-  assert(
-      (DstSize == SrcSize ||
-       // Copies are a mean to setup initial types, the number of
-       // bits may not exactly match.
-       (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
-       // Copies are a mean to copy bits around, as long as we are
-       // on the same register class, that's fine. Otherwise, that
-       // means we need some SUBREG_TO_REG or AND & co.
-       (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
-      "Copy with different width?!");
-
-  // Check the size of the destination.
-  assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
-         "GPRs cannot get more than 64-bit width values");
-
-  return true;
-}
-#endif
-
 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
 /// to \p *To.
 ///
@@ -935,31 +926,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
     return false;
   }
 
-  // A couple helpers below, for making sure that the copy we produce is valid.
-
-  // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
-  // to verify that the src and dst are the same size, since that's handled by
-  // the SUBREG_TO_REG.
-  bool KnownValid = false;
-
-  // Returns true, or asserts if something we don't expect happens. Instead of
-  // returning true, we return isValidCopy() to ensure that we verify the
-  // result.
-  auto CheckCopy = [&]() {
-    // If we have a bitcast or something, we can't have physical registers.
-    assert((I.isCopy() ||
-            (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
-             !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
-           "No phys reg on generic operator!");
-    bool ValidCopy = true;
-#ifndef NDEBUG
-    ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
-    assert(ValidCopy && "Invalid copy.");
-#endif
-    (void)KnownValid;
-    return ValidCopy;
-  };
-
   // Is this a copy? If so, then we may need to insert a subregister copy.
   if (I.isCopy()) {
     // Yes. Check if there's anything to fix up.
@@ -1004,15 +970,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
           .addImm(SubReg);
       MachineOperand &RegOp = I.getOperand(1);
       RegOp.setReg(PromoteReg);
-
-      // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
-      KnownValid = true;
     }
 
     // If the destination is a physical register, then there's nothing to
     // change, so we're done.
     if (Register::isPhysicalRegister(DstReg))
-      return CheckCopy();
+      return true;
   }
 
   // No need to constrain SrcReg. It will get constrained when we hit another
@@ -1032,7 +995,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
   }
 
   I.setDesc(TII.get(AArch64::COPY));
-  return CheckCopy();
+  return true;
 }
 
 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
@@ -1309,6 +1272,90 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
   }
 }
 
+/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
+static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
+                                    AArch64CC::CondCode &CondCode,
+                                    AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  case CmpInst::FCMP_OEQ:
+    CondCode = AArch64CC::EQ;
+    break;
+  case CmpInst::FCMP_OGT:
+    CondCode = AArch64CC::GT;
+    break;
+  case CmpInst::FCMP_OGE:
+    CondCode = AArch64CC::GE;
+    break;
+  case CmpInst::FCMP_OLT:
+    CondCode = AArch64CC::MI;
+    break;
+  case CmpInst::FCMP_OLE:
+    CondCode = AArch64CC::LS;
+    break;
+  case CmpInst::FCMP_ONE:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case CmpInst::FCMP_ORD:
+    CondCode = AArch64CC::VC;
+    break;
+  case CmpInst::FCMP_UNO:
+    CondCode = AArch64CC::VS;
+    break;
+  case CmpInst::FCMP_UEQ:
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case CmpInst::FCMP_UGT:
+    CondCode = AArch64CC::HI;
+    break;
+  case CmpInst::FCMP_UGE:
+    CondCode = AArch64CC::PL;
+    break;
+  case CmpInst::FCMP_ULT:
+    CondCode = AArch64CC::LT;
+    break;
+  case CmpInst::FCMP_ULE:
+    CondCode = AArch64CC::LE;
+    break;
+  case CmpInst::FCMP_UNE:
+    CondCode = AArch64CC::NE;
+    break;
+  }
+}
+
+/// Convert an IR fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
+                                     AArch64CC::CondCode &CondCode,
+                                     AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
+    assert(CondCode2 == AArch64CC::AL);
+    break;
+  case CmpInst::FCMP_ONE:
+    // (a one b)
+    // == ((a olt b) || (a ogt b))
+    // == ((a ord b) && (a une b))
+    CondCode = AArch64CC::VC;
+    CondCode2 = AArch64CC::NE;
+    break;
+  case CmpInst::FCMP_UEQ:
+    // (a ueq b)
+    // == ((a uno b) || (a oeq b))
+    // == ((a ule b) && (a uge b))
+    CondCode = AArch64CC::PL;
+    CondCode2 = AArch64CC::LE;
+    break;
+  }
+}
+
 /// Return a register which can be used as a bit to test in a TB(N)Z.
 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                               MachineRegisterInfo &MRI) {
@@ -1703,7 +1750,6 @@
 static Optional<int64_t> getVectorShiftImm(Register Reg,
                                            MachineRegisterInfo &MRI) {
   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
   MachineInstr *OpMI = MRI.getVRegDef(Reg);
-  assert(OpMI && "Expected to find a vreg def for vector shift operand");
   return getAArch64VectorSplatScalar(*OpMI, MRI);
 }
 
@@ -1810,7 +1856,7 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
   unsigned Opc = 0;
   unsigned NegOpc = 0;
   const TargetRegisterClass *RC =
-      getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
+      getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
   if (Ty == LLT::fixed_vector(2, 64)) {
     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
     NegOpc = AArch64::NEGv2i64;
@@ -2266,6 +2312,16 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
     I.eraseFromParent();
     return true;
   }
+  case TargetOpcode::G_FENCE: {
+    if (I.getOperand(1).getImm() == 0)
+      BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CompilerBarrier))
+          .addImm(I.getOperand(0).getImm());
+    else
+      BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::DMB))
+          .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
+    I.eraseFromParent();
+    return true;
+  }
   default:
     return false;
   }
@@ -2279,8 +2335,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
   MachineFunction &MF = *MBB.getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  const AArch64Subtarget *Subtarget =
-      &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+  const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
   if (Subtarget->requiresStrictAlign()) {
     // We don't support this feature yet.
     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
@@ -2312,7 +2367,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
-    DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
+    DefRC = getRegClassForTypeOnBank(DefTy, RB);
     if (!DefRC) {
       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
       return false;
@@ -2488,7 +2543,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
 
     // The case when we have 0.0 is covered by tablegen. Reject it here so we
     // can be sure tablegen works correctly and isn't rescued by this code.
-    // 0.0 is not covered by tablegen for FP128. So we will handle this 
+    // 0.0 is not covered by tablegen for FP128. So we will handle this
     // scenario in the code here.
     if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
       return false;
@@ -2510,7 +2565,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     }
 
     if (isFP) {
-      const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize);
+      const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
       // For 16, 64, and 128b values, emit a constant pool load.
       switch (DefSize) {
       default:
@@ -2735,12 +2790,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
         return false;
 
       if (isa<GLoad>(LdSt)) {
-        static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
-                                     AArch64::LDARW, AArch64::LDARX};
+        static constexpr unsigned LDAPROpcodes[] = {
+            AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
+        static constexpr unsigned LDAROpcodes[] = {
+            AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
+        ArrayRef<unsigned> Opcodes =
+            STI.hasLDAPR() && Order != AtomicOrdering::SequentiallyConsistent
+                ? LDAPROpcodes
+                : LDAROpcodes;
         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
       } else {
-        static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
-                                     AArch64::STLRW, AArch64::STLRX};
+        static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
+                                               AArch64::STLRW, AArch64::STLRX};
         Register ValReg = LdSt.getReg(0);
         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
           // Emit a subreg copy of 32 bits.
@@ -2774,7 +2835,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
-      auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+      auto *RC = getRegClassForTypeOnBank(MemTy, RB);
       if (!getSubRegForClass(RC, TRI, SubReg))
         return false;
 
@@ -2790,7 +2851,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     if (RB.getID() == AArch64::FPRRegBankID) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
-      auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+      auto *RC = getRegClassForTypeOnBank(MemTy, RB);
       if (!getSubRegForClass(RC, TRI, SubReg))
         return false;
       Register OldDst = LdSt.getReg(0);
@@ -2804,7 +2865,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
           .addImm(0)
           .addUse(NewDst)
           .addImm(SubReg);
-      auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI);
+      auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
       RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
       MIB.setInstr(LdSt);
     }
@@ -2934,8 +2995,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
         ShiftTy.getSizeInBits() == 64) {
       assert(!ShiftTy.isVector() && "unexpected vector shift ty");
-      assert(MRI.getVRegDef(ShiftReg) &&
-             "could not find a vreg definition for shift amount");
       // Insert a subregister copy to implement a 64->32 trunc
       auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
                        .addReg(ShiftReg, 0, AArch64::sub_32);
@@ -2944,10 +3003,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       }
     }
     LLVM_FALLTHROUGH;
-  case TargetOpcode::G_FADD:
-  case TargetOpcode::G_FSUB:
-  case TargetOpcode::G_FMUL:
-  case TargetOpcode::G_FDIV:
   case TargetOpcode::G_OR: {
     // Reject the various things we don't support yet.
     if (unsupportedBinOp(I, RBI, MRI, TRI))
       return false;
@@ -3026,13 +3081,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     }
 
     if (DstRB.getID() == AArch64::GPRRegBankID) {
-      const TargetRegisterClass *DstRC =
-          getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+      const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
       if (!DstRC)
         return false;
 
-      const TargetRegisterClass *SrcRC =
-          getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
+      const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
       if (!SrcRC)
         return false;
 
@@ -3270,6 +3323,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
 
     I.setDesc(TII.get(NewOpc));
     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+    I.setFlags(MachineInstr::NoFPExcept);
     return true;
   }
 
@@ -3291,17 +3345,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return selectCopy(I, TII, MRI, TRI, RBI);
 
   case TargetOpcode::G_SELECT: {
-    if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
+    auto &Sel = cast<GSelect>(I);
+    if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) {
       LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
                         << ", expected: " << LLT::scalar(1) << '\n');
       return false;
     }
 
-    const Register CondReg = I.getOperand(1).getReg();
-    const Register TReg = I.getOperand(2).getReg();
-    const Register FReg = I.getOperand(3).getReg();
+    const Register CondReg = Sel.getCondReg();
+    const Register TReg = Sel.getTrueReg();
+    const Register FReg = Sel.getFalseReg();
 
-    if (tryOptSelect(I))
+    if (tryOptSelect(Sel))
       return true;
 
     // Make sure to use an unused vreg instead of wzr, so that the peephole
@@ -3310,9 +3365,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
-    if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
+    if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
       return false;
-    I.eraseFromParent();
+    Sel.eraseFromParent();
    return true;
   }
   case TargetOpcode::G_ICMP: {
@@ -3357,8 +3412,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
     const Register DstReg = I.getOperand(0).getReg();
     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
-    const TargetRegisterClass *DstRC =
-        getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+    const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
     return true;
   }
@@ -3871,7 +3925,7 @@ bool AArch64InstructionSelector::selectVectorICmp(
 
   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
   const TargetRegisterClass *SrcRC =
-      getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
+      getRegClassForTypeOnBank(SrcTy, VecRB, true);
   if (!SrcRC) {
     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
     return false;
@@ -4037,7 +4091,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
   }
 
   const TargetRegisterClass *DstRC =
-      getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
+      getRegClassForTypeOnBank(ScalarTy, DstRB, true);
   if (!DstRC) {
     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
     return nullptr;
@@ -4046,7 +4100,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
   const LLT &VecTy = MRI.getType(VecReg);
   const TargetRegisterClass *VecRC =
-      getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
+      getRegClassForTypeOnBank(VecTy, VecRB, true);
   if (!VecRC) {
     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
     return nullptr;
@@ -4205,9 +4259,9 @@ bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
   } else {
     // No. We have to perform subregister inserts. For each insert, create an
     // implicit def and a subregister insert, and save the register we create.
-    const TargetRegisterClass *RC =
-        getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
-                              WideTy.getScalarSizeInBits() * NumElts);
+    const TargetRegisterClass *RC = getRegClassForTypeOnBank(
+        LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
+        *RBI.getRegBank(SrcReg, MRI, TRI));
     unsigned SubReg = 0;
     bool Found = getSubRegForClass(RC, TRI, SubReg);
     (void)Found;
@@ -4594,6 +4648,7 @@ AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
   // Partially build the compare. Decide if we need to add a use for the
   // third operand based off whether or not we're comparing against 0.0.
   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
+  CmpMI.setMIFlags(MachineInstr::NoFPExcept);
   if (!ShouldUseImm)
     CmpMI.addUse(RHS);
   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
@@ -4632,7 +4687,7 @@ MachineInstr *AArch64InstructionSelector::emitVectorConcat(
   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
   const TargetRegisterClass *DstRC =
-      getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+      getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
 
   MachineInstr *WidenedOp1 =
       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
@@ -4701,7 +4756,256 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
   }
 }
 
+/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
+/// expressed as a conjunction.
+/// \param CanNegate Set to true if we can negate the whole sub-tree just by
+///                  changing the conditions on the CMP tests.
+///                  (this means we can call emitConjunctionRec() with
+///                   Negate==true on this sub-tree)
+/// \param MustBeFirst Set to true if this subtree needs to be negated and we
+///                    cannot do the negation naturally. We are required to
+///                    emit the subtree first in this case.
+/// \param WillNegate Is true if are called when the result of this
+///                   subexpression must be negated. This happens when the
+///                   outer expression is an OR. We can use this fact to know
+///                   that we have a double negation (or (or ...) ...) that
+///                   can be implemented for free.
+static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
+                               bool WillNegate, MachineRegisterInfo &MRI,
+                               unsigned Depth = 0) {
+  if (!MRI.hasOneNonDBGUse(Val))
+    return false;
+  MachineInstr *ValDef = MRI.getVRegDef(Val);
+  unsigned Opcode = ValDef->getOpcode();
+  if (Opcode == TargetOpcode::G_TRUNC) {
+    // Look through a trunc.
+    Val = ValDef->getOperand(1).getReg();
+    ValDef = MRI.getVRegDef(Val);
+    Opcode = ValDef->getOpcode();
+  }
+  if (isa<GAnyCmp>(ValDef)) {
+    CanNegate = true;
+    MustBeFirst = false;
+    return true;
+  }
+  // Protect against exponential runtime and stack overflow.
+  if (Depth > 6)
+    return false;
+  if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
+    bool IsOR = Opcode == TargetOpcode::G_OR;
+    Register O0 = ValDef->getOperand(1).getReg();
+    Register O1 = ValDef->getOperand(2).getReg();
+    bool CanNegateL;
+    bool MustBeFirstL;
+    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
+      return false;
+    bool CanNegateR;
+    bool MustBeFirstR;
+    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
+      return false;
+
+    if (MustBeFirstL && MustBeFirstR)
+      return false;
+
+    if (IsOR) {
+      // For an OR expression we need to be able to naturally negate at least
+      // one side or we cannot do the transformation at all.
+      if (!CanNegateL && !CanNegateR)
+        return false;
+      // If we the result of the OR will be negated and we can naturally negate
+      // the leaves, then this sub-tree as a whole negates naturally.
+      CanNegate = WillNegate && CanNegateL && CanNegateR;
+      // If we cannot naturally negate the whole sub-tree, then this must be
+      // emitted first.
+      MustBeFirst = !CanNegate;
+    } else {
+      assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
+      // We cannot naturally negate an AND operation.
+      CanNegate = false;
+      MustBeFirst = MustBeFirstL || MustBeFirstR;
+    }
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
+    Register LHS, Register RHS, CmpInst::Predicate CC,
+    AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
+    MachineIRBuilder &MIB) const {
+  // TODO: emit CMN as an optimization.
+  auto &MRI = *MIB.getMRI();
+  LLT OpTy = MRI.getType(LHS);
+  assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
+  unsigned CCmpOpc;
+  if (CmpInst::isIntPredicate(CC)) {
+    CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
+  } else {
+    switch (OpTy.getSizeInBits()) {
+    case 16:
+      CCmpOpc = AArch64::FCCMPHrr;
+      break;
+    case 32:
+      CCmpOpc = AArch64::FCCMPSrr;
+      break;
+    case 64:
+      CCmpOpc = AArch64::FCCMPDrr;
+      break;
+    default:
+      return nullptr;
+    }
+  }
+  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+  auto CCmp =
+      MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate);
+  constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
+  return &*CCmp;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
+    Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
+    AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
+  // We're at a tree leaf, produce a conditional comparison operation.
+  auto &MRI = *MIB.getMRI();
+  MachineInstr *ValDef = MRI.getVRegDef(Val);
+  unsigned Opcode = ValDef->getOpcode();
+  if (Opcode == TargetOpcode::G_TRUNC) {
+    // Look through a trunc.
+    Val = ValDef->getOperand(1).getReg();
+    ValDef = MRI.getVRegDef(Val);
+    Opcode = ValDef->getOpcode();
+  }
+  if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
+    Register LHS = Cmp->getLHSReg();
+    Register RHS = Cmp->getRHSReg();
+    CmpInst::Predicate CC = Cmp->getCond();
+    if (Negate)
+      CC = CmpInst::getInversePredicate(CC);
+    if (isa<GICmp>(Cmp)) {
+      OutCC = changeICMPPredToAArch64CC(CC);
+    } else {
+      // Handle special FP cases.
+      AArch64CC::CondCode ExtraCC;
+      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+      // Some floating point conditions can't be tested with a single condition
+      // code. Construct an additional comparison in this case.
+      if (ExtraCC != AArch64CC::AL) {
+        MachineInstr *ExtraCmp;
+        if (!CCOp)
+          ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
+        else
+          ExtraCmp =
+              emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
+        CCOp = ExtraCmp->getOperand(0).getReg();
+        Predicate = ExtraCC;
+      }
+    }
+
+    // Produce a normal comparison if we are first in the chain
+    if (!CCOp) {
+      auto Dst = MRI.cloneVirtualRegister(LHS);
+      if (isa<GICmp>(Cmp))
+        return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
+      return emitFPCompare(Cmp->getOperand(2).getReg(),
+                           Cmp->getOperand(3).getReg(), MIB);
+    }
+    // Otherwise produce a ccmp.
+    return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
+  }
+  assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
+
+  bool IsOR = Opcode == TargetOpcode::G_OR;
+
+  Register LHS = ValDef->getOperand(1).getReg();
+  bool CanNegateL;
+  bool MustBeFirstL;
+  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
+  assert(ValidL && "Valid conjunction/disjunction tree");
+  (void)ValidL;
+
+  Register RHS = ValDef->getOperand(2).getReg();
+  bool CanNegateR;
+  bool MustBeFirstR;
+  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
+  assert(ValidR && "Valid conjunction/disjunction tree");
+  (void)ValidR;
+
+  // Swap sub-tree that must come first to the right side.
+  if (MustBeFirstL) {
+    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
+    std::swap(LHS, RHS);
+    std::swap(CanNegateL, CanNegateR);
+    std::swap(MustBeFirstL, MustBeFirstR);
+  }
+
+  bool NegateR;
+  bool NegateAfterR;
+  bool NegateL;
+  bool NegateAfterAll;
+  if (Opcode == TargetOpcode::G_OR) {
+    // Swap the sub-tree that we can negate naturally to the left.
+    if (!CanNegateL) {
+      assert(CanNegateR && "at least one side must be negatable");
+      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
+      assert(!Negate);
+      std::swap(LHS, RHS);
+      NegateR = false;
+      NegateAfterR = true;
+    } else {
+      // Negate the left sub-tree if possible, otherwise negate the result.
+      NegateR = CanNegateR;
+      NegateAfterR = !CanNegateR;
+    }
+    NegateL = true;
+    NegateAfterAll = !Negate;
+  } else {
+    assert(Opcode == TargetOpcode::G_AND &&
+           "Valid conjunction/disjunction tree");
+    assert(!Negate && "Valid conjunction/disjunction tree");
+
+    NegateL = false;
+    NegateR = false;
+    NegateAfterR = false;
+    NegateAfterAll = false;
+  }
+
+  // Emit sub-trees.
+  AArch64CC::CondCode RHSCC;
+  MachineInstr *CmpR =
+      emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
+  if (NegateAfterR)
+    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+  MachineInstr *CmpL = emitConjunctionRec(
+      LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
+  if (NegateAfterAll)
+    OutCC = AArch64CC::getInvertedCondCode(OutCC);
+  return CmpL;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConjunction(
+    Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
+  bool DummyCanNegate;
+  bool DummyMustBeFirst;
+  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
+                          *MIB.getMRI()))
+    return nullptr;
+  return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
+}
+
+bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
+                                                         MachineInstr &CondMI) {
+  AArch64CC::CondCode AArch64CC;
+  MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
+  if (!ConjMI)
+    return false;
+
+  emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
+  SelI.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
   MachineRegisterInfo &MRI = *MIB.getMRI();
   // We want to recognize this pattern:
   //
@@ -4750,12 +5054,12 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
   }
 
   // Is the condition defined by a compare?
-  if (!CondDef)
-    return false;
-
   unsigned CondOpc = CondDef->getOpcode();
-  if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
+  if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
+    if (tryOptSelectConjunction(I, *CondDef))
+      return true;
     return false;
+  }
 
   AArch64CC::CondCode CondCode;
   if (CondOpc == TargetOpcode::G_ICMP) {
@@ -5081,7 +5385,7 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
   // the original size to get the result we want.
   Register DemoteVec = InsMI->getOperand(0).getReg();
   const TargetRegisterClass *RC =
-      getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
+      getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
     return false;
@@ -5198,12 +5502,11 @@ bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
       }))
     return false;
   unsigned SubReg;
-  const TargetRegisterClass *EltRC =
-      getMinClassForRegBank(EltRB, EltTy.getSizeInBits());
+  const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
   if (!EltRC)
     return false;
   const TargetRegisterClass *DstRC =
-      getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits());
+      getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
   if (!DstRC)
     return false;
   if (!getSubRegForClass(EltRC, TRI, SubReg))
@@ -5261,7 +5564,7 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
   if (DstSize < 128) {
     // Force this to be FPR using the destination vector.
     const TargetRegisterClass *RC =
-        getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
+        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
     if (!RC)
       return false;
     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
@@ -5528,7 +5831,7 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     uint64_t Key = I.getOperand(3).getImm();
     Register DiscReg = I.getOperand(4).getReg();
     auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
-    bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue();
+    bool IsDiscZero = DiscVal && DiscVal->isNullValue();
 
     if (Key > 3)
       return false;
@@ -5777,8 +6080,6 @@ AArch64InstructionSelector::selectExtendedSHL(
   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
 
   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
-  if (!OffsetInst)
-    return None;
 
   unsigned OffsetOpc = OffsetInst->getOpcode();
   bool LookedThroughZExt = false;
@@ -5932,7 +6233,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
 
   // We need a GEP.
   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
-  if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
+  if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
     return None;
 
   // If this is used more than once, let's not bother folding.
@@ -6112,14 +6413,12 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
     return None;
 
   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
-  if (!RootDef)
-    return None;
 
   MachineOperand &OffImm = RootDef->getOperand(2);
   if (!OffImm.isReg())
     return None;
   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
-  if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
+  if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
     return None;
   int64_t RHSC;
   MachineOperand &RHSOp1 = RHS->getOperand(1);
@@ -6187,9 +6486,6 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
     return None;
 
   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
-  if (!RootDef)
-    return None;
-
   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
     return {{
         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
@@ -6210,27 +6506,26 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
     MachineOperand &RHS = RootDef->getOperand(2);
     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
-    if (LHSDef && RHSDef) {
-      int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
-      unsigned Scale = Log2_32(Size);
-      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
-        if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
-          return {{
-              [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
-              [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
-          }};
+    int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
+    unsigned Scale = Log2_32(Size);
+    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+      if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
         return {{
-            [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+            [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
         }};
-      }
+
+      return {{
+          [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+          [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
+      }};
     }
   }
 
   // Before falling back to our general case, check if the unscaled
   // instructions can handle this. If so, that's preferable.
-  if (selectAddrModeUnscaled(Root, Size).hasValue())
+  if (selectAddrModeUnscaled(Root, Size))
     return None;
 
   return {{
@@ -6269,8 +6564,6 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
   // Check if the operand is defined by an instruction which corresponds to
   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
-  if (!ShiftInst)
-    return None;
 
   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
   if (ShType == AArch64_AM::InvalidShiftExtend)
     return None;
@@ -6425,7 +6718,7 @@ AArch64InstructionSelector::selectArithExtendedRegister(
     // to.
     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
-      if (ExtInst && isDef32(*ExtInst))
+      if (isDef32(*ExtInst))
        return None;
     }
   }
@@ -6450,7 +6743,7 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
   Optional<int64_t> CstVal =
       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
   assert(CstVal && "Expected constant value");
-  MIB.addImm(CstVal.getValue());
+  MIB.addImm(*CstVal);
 }
 
 void AArch64InstructionSelector::renderLogicalImm32(
@@ -6498,6 +6791,17 @@ void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
 }
 
+void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
+    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+         "Expected G_FCONSTANT");
+  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
+                                                      .getFPImm()
+                                                      ->getValueAPF()
+                                                      .bitcastToAPInt()
+                                                      .getZExtValue()));
+}
+
 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
     const MachineInstr &MI, unsigned NumBytes) const {
   if (!MI.mayLoadOrStore())
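The emitConjunction machinery added in this file ports the long-standing SelectionDAG CCMP emission to GlobalISel: a single-use G_AND/G_OR tree of compares feeding a select becomes one flag-setting compare followed by conditional compares, where each CCMP's NZCV immediate is chosen so that the inverted out-condition holds on the "predicate failed" path. A rough illustration, plus a self-contained check of that NZCV trick (the register choices are invented, and the asm is what one would expect for a 32-bit "a == b && c < d" select, not output quoted from the patch):

  // Expected shape for: select((a == b) && (c < d)), t, f
  //   subs wzr, w0, w1        ; flags <- (a == b)
  //   ccmp w2, w3, #0, eq     ; if eq: flags <- (c < d); else NZCV <- 0000
  //   csel w0, w4, w5, lt
  #include <cassert>
  int main() {
    struct { bool N, Z, C, V; } Failed = {false, false, false, false}; // #0
    bool LT = Failed.N != Failed.V; // AArch64 "lt"
    bool GE = Failed.N == Failed.V; // the inverted code, per getInvertedCondCode
    assert(!LT && GE);              // a failed chain cleanly selects "f"
    return 0;
  }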
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e9df7e001d38..74ec9373ce9e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -169,7 +169,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarize(0);
 
   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
-      .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
+      .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
       .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
@@ -180,7 +180,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
       .widenScalarToNextPow2(0, /*Min = */ 32)
       .clampScalar(0, s32, s64)
-      .lowerIf(typeIs(1, s1));
+      .lower();
 
   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
       .legalFor({s64, v8s16, v16s8, v4s32})
@@ -308,7 +308,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // These extends are also legal
       .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
       .widenScalarToNextPow2(0, /* MinSize = */8)
-      .lowerIfMemSizeNotPow2()
+      .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
       .narrowScalarIf([=](const LegalityQuery &Query) {
        // Clamp extending load results to 32-bits.
@@ -317,10 +317,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                        Query.Types[0].getSizeInBits() > 32;
                },
                changeTo(0, s32))
-      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
-      .lowerIf([=](const LegalityQuery &Query) {
-        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
-      })
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
       .clampMaxNumElements(0, s32, 4)
@@ -536,7 +532,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
       .lowerIf(
-          all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1), typeIs(2, p0)));
+          all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
 
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
       .customIf([](const LegalityQuery &Query) {
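Dropping s1 from the G_SREM/G_UREM lowerFor list and switching G_SMULO/G_UMULO to a bare .lower() relies on the surrounding widen/clamp steps to normalize the types before the lowering rule fires; the lowering itself is unchanged. For reference, the generic LegalizerHelper expands a rem in terms of div/mul/sub, which rests on the usual truncating-division identity. A plain C++ stand-in for the emitted G_SDIV/G_MUL/G_SUB sequence, checkable in isolation (this is a sketch of the generic lowering's arithmetic, not code from this patch):

  #include <cassert>
  int loweredSRem(int a, int b) {
    int q = a / b;   // G_SDIV: truncates toward zero, matching G_SREM's quotient
    int m = q * b;   // G_MUL
    return a - m;    // G_SUB: a % b == a - (a / b) * b
  }
  int main() {
    assert(loweredSRem(7, 3) == 7 % 3);
    assert(loweredSRem(-7, 3) == -7 % 3);
    assert(loweredSRem(7, -3) == 7 % -3);
    return 0;
  }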
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 3dec980a819a..ba206bac68d1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -20,11 +20,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -133,7 +135,7 @@ bool matchAArch64MulConstCombine(
   if (!Const)
     return false;
 
-  const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits());
+  APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
   // The following code is ported from AArch64ISelLowering.
   // Multiplication of a power of two plus/minus one can be done more
   // cheaply as as shift+add/sub. For now, this is true unilaterally. If
@@ -258,7 +260,7 @@ void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
   //   %d(s64) = G_ZEXT %a(s32)
   Observer.changingInstr(MI);
   MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
-  MI.RemoveOperand(2);
+  MI.removeOperand(2);
   Observer.changedInstr(MI);
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 3ff67d188822..d7959a82c484 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -58,7 +58,7 @@ struct ShuffleVectorPseudo {
   ShuffleVectorPseudo(unsigned Opc, Register Dst,
                       std::initializer_list<SrcOp> SrcOps)
       : Opc(Opc), Dst(Dst), SrcOps(SrcOps){};
-  ShuffleVectorPseudo() {}
+  ShuffleVectorPseudo() = default;
 };
 
 /// Check if a vector shuffle corresponds to a REV instruction with the
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
index cc45c6642ac5..ce6f15a799b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
@@ -149,7 +149,7 @@ bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
                           "op in fcmp range: "
                        << II);
       II.setDesc(TII->get(NewOpc));
-      II.RemoveOperand(DeadNZCVIdx);
+      II.removeOperand(DeadNZCVIdx);
       // Changing the opcode can result in differing regclass requirements,
       // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
       // Constrain the regclasses, possibly introducing a copy.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index d3f4130d2ba1..275949c5ee64 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -13,6 +13,7 @@
 
 #include "AArch64GlobalISelUtils.h"
 #include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -162,13 +163,14 @@ static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
 
   // Check whether folding this offset is legal. It must not go out of bounds of
   // the referenced object to avoid violating the code model, and must be
-  // smaller than 2^21 because this is the largest offset expressible in all
-  // object formats.
+  // smaller than 2^20 because this is the largest offset expressible in all
+  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
+  // stores an immediate signed 21 bit offset.)
   //
   // This check also prevents us from folding negative offsets, which will end
   // up being treated in the same way as large positive ones. They could also
   // cause code model violations, and aren't really common enough to matter.
- if (NewOffset >= (1 << 21)) + if (NewOffset >= (1 << 20)) return false; Type *T = GV->getValueType(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 515a5c63a559..f0b311289c41 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -12,20 +12,19 @@ //===----------------------------------------------------------------------===// #include "AArch64RegisterBankInfo.h" -#include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -42,8 +41,8 @@ using namespace llvm; -AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) - : AArch64GenRegisterBankInfo() { +AArch64RegisterBankInfo::AArch64RegisterBankInfo( + const TargetRegisterInfo &TRI) { static llvm::once_flag InitializeRegisterBankFlag; static auto InitializeRegisterBankOnce = [&]() { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h index 2d76e48d7df2..01ef0bd92d50 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "AArch64GenRegisterBank.inc" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index dbb8e85713cb..e4b547e17f64 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -22,10 +22,10 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -470,7 +470,7 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, // We are properly aligned, so write NOPs as requested. 
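// Note: 0xd503201f is the A64 NOP encoding, and "\x1f\x20\x03\xd5" is that
// 32-bit word spelled out in little-endian byte order. The replacement below
// assumes (correctly for A64) that instruction encodings are stored
// little-endian even on big-endian data configurations such as aarch64_be,
// which is why a fixed byte string can stand in for the endian-aware write.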
   Count /= 4;
   for (uint64_t i = 0; i != Count; ++i)
-    support::endian::write<uint32_t>(OS, 0xd503201f, Endian);
+    OS.write("\x1f\x20\x03\xd5", 4);
   return true;
 }
@@ -592,17 +592,18 @@ public:
       if (XReg != AArch64::FP)
         return CU::UNWIND_ARM64_MODE_DWARF;
-      assert(XReg == AArch64::FP && "Invalid frame pointer!");
-      assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+      if (i + 2 >= e)
+        return CU::UNWIND_ARM64_MODE_DWARF;
       const MCCFIInstruction &LRPush = Instrs[++i];
-      assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
-             "Link register not pushed!");
+      if (LRPush.getOperation() != MCCFIInstruction::OpOffset)
+        return CU::UNWIND_ARM64_MODE_DWARF;
       const MCCFIInstruction &FPPush = Instrs[++i];
-      assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
-             "Frame pointer not pushed!");
+      if (FPPush.getOperation() != MCCFIInstruction::OpOffset)
+        return CU::UNWIND_ARM64_MODE_DWARF;
 
-      assert(FPPush.getOffset() + 8 == LRPush.getOffset());
+      if (FPPush.getOffset() + 8 != LRPush.getOffset())
+        return CU::UNWIND_ARM64_MODE_DWARF;
       CurOffset = FPPush.getOffset();
 
       unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true);
@@ -611,8 +612,8 @@ public:
       LRReg = getXRegFromWReg(LRReg);
       FPReg = getXRegFromWReg(FPReg);
 
-      assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
-             "Pushing invalid registers for frame!");
+      if (LRReg != AArch64::LR || FPReg != AArch64::FP)
+        return CU::UNWIND_ARM64_MODE_DWARF;
 
       // Indicate that the function has a frame.
       CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
@@ -620,7 +621,8 @@ public:
       break;
     }
     case MCCFIInstruction::OpDefCfaOffset: {
-      assert(StackSize == 0 && "We already have the CFA offset!");
+      if (StackSize != 0)
+        return CU::UNWIND_ARM64_MODE_DWARF;
       StackSize = std::abs(Inst.getOffset());
       break;
     }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 78c0e90b1384..46edb12959d2 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -254,6 +254,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
 }
 
 void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) {
+  getStreamer().getAssembler().registerSymbol(*Symbol);
   cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
 }
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index ee0870d9ef7a..5d2ba7ef02c0 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1340,11 +1340,6 @@ void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
   O << getRegisterName(Even) << ", " << getRegisterName(Odd);
 }
 
-static const unsigned MatrixZADRegisterTable[] = {
-  AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
-  AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7
-};
-
 void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum,
                                              const MCSubtargetInfo &STI,
                                              raw_ostream &O) {
@@ -1362,7 +1357,7 @@ void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum,
     unsigned Reg = RegMask & (1 << I);
     if (Reg == 0)
       continue;
-    O << getRegisterName(MatrixZADRegisterTable[I]);
+    O << getRegisterName(AArch64::ZAD0 + I);
     if (Printed + 1 != NumRegs)
       O << ", ";
     ++Printed;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index ad97071434df..2901e5c0fe4d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -16,6 +16,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -677,7 +678,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( #include "AArch64GenMCCodeEmitter.inc" MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new AArch64MCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 844bd6bbada9..cb39c2a11487 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index c1186ae804d2..34e3b2cf58e4 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -52,21 +52,14 @@ static MCSubtargetInfo * createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { if (CPU.empty()) { CPU = "generic"; + if (FS.empty()) + FS = "+v8a"; if (TT.isArm64e()) CPU = "apple-a12"; } - // Most of the NEON instruction set isn't supported in streaming mode on SME - // targets, disable NEON unless explicitly requested. 
- bool RequestedNEON = FS.contains("neon"); - bool RequestedStreamingSVE = FS.contains("streaming-sve"); - MCSubtargetInfo *STI = - createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); - if (RequestedStreamingSVE && !RequestedNEON && - STI->hasFeature(AArch64::FeatureNEON)) - STI->ToggleFeature(AArch64::FeatureNEON); - return STI; + return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); } void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { @@ -243,6 +236,31 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { MRI->mapLLVMRegToCVReg(I.Reg, static_cast(I.CVReg)); } +bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + return llvm::any_of(MI, [&](const MCOperand &Op) { + return Op.isReg() && FPR128.contains(Op.getReg()); + }); +} + +bool AArch64_MC::isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + const auto &FPR64 = AArch64MCRegisterClasses[AArch64::FPR64RegClassID]; + const auto &FPR32 = AArch64MCRegisterClasses[AArch64::FPR32RegClassID]; + const auto &FPR16 = AArch64MCRegisterClasses[AArch64::FPR16RegClassID]; + const auto &FPR8 = AArch64MCRegisterClasses[AArch64::FPR8RegClassID]; + + auto IsFPR = [&](const MCOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + return FPR128.contains(Reg) || FPR64.contains(Reg) || FPR32.contains(Reg) || + FPR16.contains(Reg) || FPR8.contains(Reg); + }; + + return llvm::any_of(MI, IsFPR); +} + static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitAArch64MCRegisterInfo(X, AArch64::LR); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 66cb7a37a958..049c49796dc6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" #include @@ -22,6 +23,7 @@ class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCInst; class MCInstrInfo; class MCInstPrinter; class MCRegisterInfo; @@ -33,7 +35,6 @@ class MCTargetStreamer; class Target; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAArch64leAsmBackend(const Target &T, const MCSubtargetInfo &STI, @@ -60,8 +61,16 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, namespace AArch64_MC { void initLLVMToCVRegMapping(MCRegisterInfo *MRI); +bool isQForm(const MCInst &MI, const MCInstrInfo *MCII); +bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII); } +namespace AArch64 { +enum OperandType { + OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET, +}; +} // namespace AArch64 + } // End llvm namespace // Defines symbolic names for AArch64 registers. 
This defines a mapping from diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 92552c3d41d5..1a8071ac1b33 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -76,7 +76,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { return; } MCSection *Cur = OutStreamer.getCurrentSectionOnly(); - OutStreamer.SwitchSection(Nt); + OutStreamer.switchSection(Nt); // Emit the note header. OutStreamer.emitValueToAlignment(Align(8).value()); @@ -92,7 +92,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { OutStreamer.emitIntValue(0, 4); // pad OutStreamer.endSection(Nt); - OutStreamer.SwitchSection(Cur); + OutStreamer.switchSection(Cur); } void AArch64TargetStreamer::emitInst(uint32_t Inst) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 0072af4cc16e..46ffa50b3e6e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index b688165d3a7b..820d940c1ed2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -8,6 +8,7 @@ #include "AArch64WinCOFFStreamer.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCWin64EH.h" @@ -26,14 +27,14 @@ public: std::unique_ptr OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitWinEHHandlerData(SMLoc Loc) override; - void EmitWindowsUnwindTables() override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void emitWinEHHandlerData(SMLoc Loc) override; + void emitWindowsUnwindTables() override; + void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; void finishImpl() override; }; -void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { - MCStreamer::EmitWinEHHandlerData(Loc); +void AArch64WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! 
@@ -41,11 +42,11 @@ void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { /* HandlerData = */ true); } -void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { +void AArch64WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) { EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); } -void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { +void AArch64WinCOFFStreamer::emitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; EHStreamer.Emit(*this); @@ -53,7 +54,7 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { void AArch64WinCOFFStreamer::finishImpl() { emitFrames(nullptr); - EmitWindowsUnwindTables(); + emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); } @@ -71,10 +72,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinUnwindCode(unsigned UnwindCode, WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); if (!CurFrame) return; - MCSymbol *Label = S.emitCFILabel(); - auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); + auto Inst = WinEH::Instruction(UnwindCode, /*Label=*/nullptr, Reg, Offset); if (InEpilogCFI) - CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); else CurFrame->Instructions.push_back(Inst); } @@ -176,7 +176,8 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIPrologEnd() { MCSymbol *Label = S.emitCFILabel(); CurFrame->PrologEnd = Label; - WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); auto it = CurFrame->Instructions.begin(); CurFrame->Instructions.insert(it, Inst); } @@ -198,9 +199,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() { return; InEpilogCFI = false; - MCSymbol *Label = S.emitCFILabel(); - WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); - CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); CurrentEpilog = nullptr; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 41f2cead4cf8..2744e81f99f1 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -10,14 +10,36 @@ // //===----------------------------------------------------------------------===// +def imm_to_tile8 : ComplexPattern", []>; +def imm_to_tile16 : ComplexPattern", []>; +def imm_to_tile32 : ComplexPattern", []>; +def imm_to_tile64 : ComplexPattern", []>; +def imm_to_tile128 : ComplexPattern", []>; + +def tileslice8 : ComplexPattern", []>; +def tileslice16 : ComplexPattern", []>; +def tileslice32 : ComplexPattern", []>; +def tileslice64 : ComplexPattern", []>; +def tileslice128 : ComplexPattern", []>; // nop + +def am_sme_indexed_b4 :ComplexPattern", [], [SDNPWantRoot]>; + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// +class sme_outer_product_pseudo + : Pseudo<(outs), (ins i64imm:$tile, PPR3bAny:$pn, PPR3bAny:$pm, + zpr_ty:$zn, zpr_ty:$zm), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + class sme_fp_outer_product_inst : I<(outs za_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, 
zpr_ty:$Zn, zpr_ty:$Zm), + (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -34,26 +56,42 @@ class sme_fp_outer_product_inst - : sme_fp_outer_product_inst { - bits<2> ZAda; - let Inst{1-0} = ZAda; - let Inst{2} = 0b0; +multiclass sme_outer_product_fp32 { + def NAME : sme_fp_outer_product_inst { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)), + (!cast(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -class sme_outer_product_fp64 - : sme_fp_outer_product_inst { - bits<3> ZAda; - let Inst{2-0} = ZAda; +multiclass sme_outer_product_fp64 { + def NAME : sme_fp_outer_product_inst { + bits<3> ZAda; + let Inst{2-0} = ZAda; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)), + (!cast(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>; } class sme_int_outer_product_inst : I<(outs za_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), + (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -72,26 +110,44 @@ class sme_int_outer_product_inst opc, string mnemonic> - : sme_int_outer_product_inst { - bits<2> ZAda; - let Inst{1-0} = ZAda; - let Inst{2} = 0b0; +multiclass sme_int_outer_product_i32 opc, string mnemonic, + SDPatternOperator op> { + def NAME : sme_int_outer_product_inst { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm), + (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)), + (!cast(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -class sme_int_outer_product_i64 opc, string mnemonic> - : sme_int_outer_product_inst { - bits<3> ZAda; - let Inst{2-0} = ZAda; +multiclass sme_int_outer_product_i64 opc, string mnemonic, + SDPatternOperator op> { + def NAME : sme_int_outer_product_inst { + bits<3> ZAda; + let Inst{2-0} = ZAda; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)), + (!cast(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>; } class sme_outer_product_widening_inst : I<(outs TileOp32:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm), + (ins TileOp32:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -109,14 +165,28 @@ class sme_outer_product_widening_inst let Inst{4} = S; let Inst{3-2} = 0b00; let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; } -multiclass sme_bf16_outer_product { - def : sme_outer_product_widening_inst<0b0, S, mnemonic>; +multiclass sme_bf16_outer_product { + def NAME : sme_outer_product_widening_inst<0b0, S, mnemonic>; + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)), + (!cast(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -multiclass sme_f16_outer_product { - def : sme_outer_product_widening_inst<0b1, S, mnemonic>; +multiclass sme_f16_outer_product { + def NAME : 
sme_outer_product_widening_inst<0b1, S, mnemonic>; + + def NAME # _PSEUDO : sme_outer_product_pseudo; + + def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)), + (!cast(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } //===----------------------------------------------------------------------===// @@ -126,7 +196,7 @@ multiclass sme_f16_outer_product { class sme_add_vector_to_tile_inst : I<(outs tile_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), + (ins tile_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn", "", []>, Sched<[]> { bits<3> Pm; @@ -140,6 +210,8 @@ class sme_add_vector_to_tile_inst @@ -225,6 +297,33 @@ multiclass sme_mem_ld_ss_aliases { defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">; } +multiclass sme_mem_ld_ss_patterns { + // base, tileslice + def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile, + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>; + + // reg + reg, tileslice + let AddedComplexity = 1 in { + def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset), + tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm))), + (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>; + } +} + +class sme_load_pseudo + : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx, + i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base, GPR64:$offset), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayLoad = 1; +} + multiclass sme_mem_ld_v_ss { def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b", !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -264,6 +363,40 @@ multiclass sme_mem_ld_v_ss { } defm : sme_mem_ld_ss_aliases; + + // Pseudo instructions for lowering intrinsics, using immediates instead of + // tile registers. 
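The _PSEUDO definitions that follow carry the tile as a plain immediate and set usesCustomInserter; per the comments in this patch, AArch64ISelLowering.cpp later rewrites each pseudo into the real instruction with a concrete tile register. A simplified sketch of the immediate-to-register step, assuming contiguous register enum values as the AArch64::ZAD0 + I printer change earlier in this patch does (names illustrative, not LLVM API):

// With contiguous enum values (Base, Base+1, ...), a tile-index immediate
// selects the concrete tile register by simple addition.
unsigned selectTileReg(unsigned BaseTileReg, unsigned TileImm) {
  return BaseTileReg + TileImm;
}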
+ def _PSEUDO_B : sme_load_pseudo; + def _PSEUDO_H : sme_load_pseudo; + def _PSEUDO_S : sme_load_pseudo; + def _PSEUDO_D : sme_load_pseudo; + def _PSEUDO_Q : sme_load_pseudo; + + defm : sme_mem_ld_ss_patterns(NAME # _PSEUDO_B), + !if(is_col, int_aarch64_sme_ld1b_vert, + int_aarch64_sme_ld1b_horiz), + sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0, + tileslice8>; + defm : sme_mem_ld_ss_patterns(NAME # _PSEUDO_H), + !if(is_col, int_aarch64_sme_ld1h_vert, + int_aarch64_sme_ld1h_horiz), + imm0_1, imm0_7, am_sve_regreg_lsl1, + tileslice16>; + defm : sme_mem_ld_ss_patterns(NAME # _PSEUDO_S), + !if(is_col, int_aarch64_sme_ld1w_vert, + int_aarch64_sme_ld1w_horiz), + imm0_3, imm0_3, am_sve_regreg_lsl2, + tileslice32>; + defm : sme_mem_ld_ss_patterns(NAME # _PSEUDO_D), + !if(is_col, int_aarch64_sme_ld1d_vert, + int_aarch64_sme_ld1d_horiz), + imm0_7, imm0_1, am_sve_regreg_lsl3, + tileslice64>; + defm : sme_mem_ld_ss_patterns(NAME # _PSEUDO_Q), + !if(is_col, int_aarch64_sme_ld1q_vert, + int_aarch64_sme_ld1q_horiz), + imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4, + tileslice128>; } multiclass sme_mem_ld_ss { @@ -310,6 +443,25 @@ multiclass sme_mem_st_ss_aliases { defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>; } +multiclass sme_mem_st_ss_patterns { + // base, tileslice + def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst $tile, $idx, $imm, $pg, $base, XZR)>; + + // reg + reg, tileslice + let AddedComplexity = 1 in { + def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset), + (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst $tile, $idx, $imm, $pg, $base, $offset)>; + } +} + multiclass sme_mem_st_v_ss { def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b", !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -349,6 +501,32 @@ multiclass sme_mem_st_v_ss { } defm : sme_mem_st_ss_aliases; + + defm : sme_mem_st_ss_patterns(NAME # _B), + !if(is_col, int_aarch64_sme_st1b_vert, + int_aarch64_sme_st1b_horiz), + imm0_15, imm_to_tile8, am_sve_regreg_lsl0, + tileslice8>; + defm : sme_mem_st_ss_patterns(NAME # _H), + !if(is_col, int_aarch64_sme_st1h_vert, + int_aarch64_sme_st1h_horiz), + imm0_7, imm_to_tile16, am_sve_regreg_lsl1, + tileslice16>; + defm : sme_mem_st_ss_patterns(NAME # _S), + !if(is_col, int_aarch64_sme_st1w_vert, + int_aarch64_sme_st1w_horiz), + imm0_3, imm_to_tile32, am_sve_regreg_lsl2, + tileslice32>; + defm : sme_mem_st_ss_patterns(NAME # _D), + !if(is_col, int_aarch64_sme_st1d_vert, + int_aarch64_sme_st1d_horiz), + imm0_1, imm_to_tile64, am_sve_regreg_lsl3, + tileslice64>; + defm : sme_mem_st_ss_patterns(NAME # _Q), + !if(is_col, int_aarch64_sme_st1q_vert, + int_aarch64_sme_st1q_horiz), + sme_elm_idx0_0, imm_to_tile128, + am_sve_regreg_lsl4, tileslice128>; } multiclass sme_mem_st_ss { @@ -360,7 +538,7 @@ multiclass sme_mem_st_ss { // SME Save and Restore Array //===----------------------------------------------------------------------===// -class sme_spill_fill_inst +class sme_spill_fill_base : I, Sched<[]> { @@ -375,33 +553,61 @@ class sme_spill_fill_inst let Inst{9-5} = Rn; let Inst{4} = 0b0; let Inst{3-0} = imm4; - - let mayLoad = !not(isStore); - let mayStore = isStore; } -multiclass sme_spill_fill { - def NAME : sme_spill_fill_inst; - +let mayStore = 1 in +class sme_spill_inst + : sme_spill_fill_base<0b1, (outs), + (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; 
+let mayLoad = 1 in +class sme_fill_inst + : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), + (ins MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; +multiclass sme_spill { + def NAME : sme_spill_inst; def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; -} - -multiclass sme_spill { - defm NAME : sme_spill_fill<0b1, (outs), - (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, - sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), - opcodestr>; + // base + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), + (!cast(NAME) ZA, $idx, 0, $base, 0)>; + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, + (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)), + (!cast(NAME) ZA, $idx, 0, $base, $imm4)>; + } } multiclass sme_fill { - defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt), - (ins MatrixIndexGPR32Op12_15:$Rv, - sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), - opcodestr>; + def NAME : sme_fill_inst; + def : InstAlias(NAME) MatrixOp:$ZAt, + MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO + : Pseudo<(outs), + (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + GPR64sp:$base), []>, + Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayLoad = 1; + } + // base + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), + (!cast(NAME # _PSEUDO) $idx, 0, $base)>; + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, + (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)), + (!cast(NAME # _PSEUDO) $idx, $imm4, $base)>; + } } //===----------------------------------------------------------------------===// @@ -429,8 +635,12 @@ class sme_vector_to_tile_inst sz, MatrixTileVectorOperand tile_ty bit is_col, Operand imm_ty, ZPRRegOp zpr_ty, string mnemonic> : sme_vector_to_tile_base; + (ins tile_ty:$_ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), + mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">{ + + let Constraints = "$ZAd = $_ZAd"; +} + multiclass sme_vector_to_tile_aliases; } +multiclass sme_vector_to_tile_patterns { + def : Pat<(op imm_ty:$tile, MatrixIndexGPR32Op12_15:$idx, + (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), + (inst imm_ty:$tile, $idx, 0, $pg, $zn)>; + let AddedComplexity = 1 in { + def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm)), + (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), + (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>; + } +} + +class sme_mova_insert_pseudo + : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx, + i64imm:$imm, PPR3bAny:$pg, ZPRAny:$zn), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + multiclass sme_vector_v_to_tile { def _B : sme_vector_to_tile_inst<0b0, 0b00, !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -478,6 +712,14 @@ multiclass sme_vector_v_to_tile { let Inst{3-0} = ZAd; } + // Pseudo instructions for lowering intrinsics, using immediates instead of + // tile registers. 
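Several definitions above gain a tied operand (Constraints = "$ZAda = $_ZAda" and the like) because writing a single tile slice is a read-modify-write: the untouched slices must be treated as an input to the instruction. A scalar model of that behaviour, using a fixed 4x4 tile purely for illustration (real SME tiles are scalable):

#include <cstdint>

// Stand-in for a ZA tile.
struct Tile { uint32_t Row[4][4]; };

// Writing one row leaves every other row unchanged, which is exactly what
// tying the output tile to an input operand expresses to the compiler.
static Tile writeRow(Tile ZA, unsigned R, const uint32_t (&Zn)[4]) {
  for (unsigned I = 0; I != 4; ++I)
    ZA.Row[R][I] = Zn[I];
  return ZA;
}

int main() {
  Tile ZA = {};
  const uint32_t Zn[4] = {1, 2, 3, 4};
  ZA = writeRow(ZA, 2, Zn); // rows 0, 1 and 3 still hold their old contents
  return ZA.Row[2][0] == 1 ? 0 : 1;
}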
+ def _PSEUDO_B : sme_mova_insert_pseudo; + def _PSEUDO_H : sme_mova_insert_pseudo; + def _PSEUDO_S : sme_mova_insert_pseudo; + def _PSEUDO_D : sme_mova_insert_pseudo; + def _PSEUDO_Q : sme_mova_insert_pseudo; + defm : sme_vector_to_tile_aliases(NAME # _B), !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -498,6 +740,62 @@ multiclass sme_vector_v_to_tile { !if(is_col, TileVectorOpV128, TileVectorOpH128), ZPR128, sme_elm_idx0_0>; + + defvar op = !if(is_col, int_aarch64_sme_write_vert, + int_aarch64_sme_write_horiz); + + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_B), + nxv16i8, nxv16i1, sme_elm_idx0_0, imm0_15, + op, tileslice8>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_H), + nxv8i16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_H), + nxv8f16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_H), + nxv8bf16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_S), + nxv4i32, nxv4i1, sme_elm_idx0_3, imm0_3, + op, tileslice32>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_S), + nxv4f32, nxv4i1, sme_elm_idx0_3, imm0_3, + op, tileslice32>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_D), + nxv2i64, nxv2i1, sme_elm_idx0_7, imm0_1, + op, tileslice64>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_D), + nxv2f64, nxv2i1, sme_elm_idx0_7, imm0_1, + op, tileslice64>; + + defvar opq = !if(is_col, int_aarch64_sme_writeq_vert, + int_aarch64_sme_writeq_horiz); + + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv16i8, nxv16i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv8i16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv8f16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv8bf16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv4i32, nxv4i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv4f32, nxv4i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv2i64, nxv2i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns(NAME # _PSEUDO_Q), + nxv2f64, nxv2i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; } multiclass sme_vector_to_tile { @@ -526,8 +824,11 @@ class sme_tile_to_vector_inst sz, ZPRRegOp zpr_ty, MatrixTileVectorOperand tile_ty, bit is_col, Operand imm_ty, string mnemonic> : sme_tile_to_vector_base; + (ins zpr_ty:$_Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), + mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]"> { + + let Constraints = "$Zd = $_Zd"; +} multiclass sme_tile_to_vector_aliases; } +multiclass sme_tile_to_vector_patterns { + def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg), + (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx)), + (inst $passthru, $pg, $tile, $idx, 0)>; + let AddedComplexity = 1 in { + def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg), + (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm)))), + (inst $passthru, $pg, $tile, $idx, $imm)>; + } +} + multiclass sme_tile_to_vector_v { def 
_B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8, !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -589,6 +907,62 @@ multiclass sme_tile_to_vector_v { defm : sme_tile_to_vector_aliases(NAME # _Q), ZPR128, !if(is_col, TileVectorOpV128, TileVectorOpH128), sme_elm_idx0_0>; + + defvar op = !if(is_col, int_aarch64_sme_read_vert, + int_aarch64_sme_read_horiz); + + defm : sme_tile_to_vector_patterns(NAME # _B), + nxv16i8, nxv16i1, imm0_15, + imm_to_tile8, tileslice8, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8i16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8f16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8bf16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _S), + nxv4i32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns(NAME # _S), + nxv4f32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns(NAME # _D), + nxv2i64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + defm : sme_tile_to_vector_patterns(NAME # _D), + nxv2f64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + + defvar opq = !if(is_col, int_aarch64_sme_readq_vert, + int_aarch64_sme_readq_horiz); + + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv16i8, nxv16i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8i16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8f16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8bf16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv4i32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv4f32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv2i64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv2f64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; } multiclass sme_tile_to_vector { @@ -600,8 +974,11 @@ multiclass sme_tile_to_vector { // SME Zero //===----------------------------------------------------------------------===// +// NOTE: This definition isn't really correct because there are outputs, i.e. +// the tile registers being zeroed. We fix this up in a custom inserter that +// marks the appropriate registers as being implicitly defined. 
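The class that follows encodes the tile list as an 8-bit immediate mask over the eight 64-bit tiles, as the 0b10111011 alias for {za0.s, za1.s, za3.s} further down illustrates. A sketch of how such a mask can be decoded into the implicitly defined registers the NOTE describes (assumed mechanics; BaseZAD0 stands in for AArch64::ZAD0):

#include <vector>

// Each set bit I in the mask selects 64-bit tile ZAD0 + I; a custom inserter
// can mark these as implicit defs, since the instruction itself no longer
// declares any outputs.
static std::vector<unsigned> tilesFromMask(unsigned BaseZAD0, unsigned Mask) {
  std::vector<unsigned> Defs;
  for (unsigned I = 0; I != 8; ++I)
    if (Mask & (1u << I))
      Defs.push_back(BaseZAD0 + I);
  return Defs;
}

// 0xBB == 0b10111011 selects six 64-bit tiles ({za0.s, za1.s, za3.s}).
int main() { return tilesFromMask(0, 0xBB).size() == 6 ? 0 : 1; }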
class sme_zero_inst - : I<(outs MatrixTileList:$imm), (ins), + : I<(outs), (ins MatrixTileList:$imm), mnemonic, "\t$imm", "", []>, Sched<[]> { bits<8> imm; let Inst{31-8} = 0b110000000000100000000000; @@ -626,6 +1003,15 @@ multiclass sme_zero { def : InstAlias<"zero\t\\{za0.s,za1.s,za3.s\\}", (!cast(NAME) 0b10111011), 1>; def : InstAlias<"zero\t\\{za0.s,za2.s,za3.s\\}", (!cast(NAME) 0b11011101), 1>; def : InstAlias<"zero\t\\{za1.s,za2.s,za3.s\\}", (!cast(NAME) 0b11101110), 1>; + + def NAME # _PSEUDO : Pseudo<(outs), (ins i64imm:$tilelist), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + + def : Pat<(int_aarch64_sme_zero imm:$imm), + (!cast(NAME # _PSEUDO) imm:$imm)>; } //===----------------------------------------------------------------------===// @@ -651,6 +1037,15 @@ class sve2_int_perm_revd let ElementSize = ZPR128.ElementSize; } +multiclass sve2_int_perm_revd { + def NAME : sve2_int_perm_revd; + + def : SVE_1_Op_Passthru_Pat(NAME)>; + def : SVE_1_Op_Passthru_Pat(NAME)>; + def : SVE_1_Op_Passthru_Pat(NAME)>; + def : SVE_1_Op_Passthru_Pat(NAME)>; +} + class sve2_clamp sz, bit U, ZPRRegOp zpr_ty> : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd), asm, "\t$Zd, $Zn, $Zm", "", []>, @@ -672,11 +1067,16 @@ class sve2_clamp sz, bit U, ZPRRegOp zpr_ty> let ElementSize = zpr_ty.ElementSize; } -multiclass sve2_clamp { +multiclass sve2_clamp { def _B : sve2_clamp; def _H : sve2_clamp; def _S : sve2_clamp; def _D : sve2_clamp; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve2_int_perm_sel_p @@ -699,7 +1099,7 @@ class sve2_int_perm_sel_p let Inst{3-0} = Pd; } -multiclass sve2_int_perm_sel_p { +multiclass sve2_int_perm_sel_p { def _B : sve2_int_perm_sel_p { bits<4> imm; let Inst{23-22} = imm{3-2}; @@ -723,4 +1123,32 @@ multiclass sve2_int_perm_sel_p { let Inst{22} = 0b1; let Inst{20-18} = 0b000; } + + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast(NAME # _B) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast(NAME # _H) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast(NAME # _S) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast(NAME # _D) $Pn, $Pm, $idx, 0)>; + + let AddedComplexity = 1 in { + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))), + (!cast(NAME # _B) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))), + (!cast(NAME # _H) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))), + (!cast(NAME # _S) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))), + (!cast(NAME # _D) $Pn, $Pm, $idx, $imm)>; + } } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 9d4bdbe5d053..3631536a32b9 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td 
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -199,6 +199,11 @@ def SVEAddSubImm16Pat : ComplexPattern", [ def SVEAddSubImm32Pat : ComplexPattern", []>; def SVEAddSubImm64Pat : ComplexPattern", []>; +def SVECpyDupImm8Pat : ComplexPattern", []>; +def SVECpyDupImm16Pat : ComplexPattern", []>; +def SVECpyDupImm32Pat : ComplexPattern", []>; +def SVECpyDupImm64Pat : ComplexPattern", []>; + def SVELogicalImm8Pat : ComplexPattern", []>; def SVELogicalImm16Pat : ComplexPattern", []>; def SVELogicalImm32Pat : ComplexPattern", []>; @@ -209,14 +214,6 @@ def SVELogicalImm16NotPat : ComplexPattern", []>; def SVELogicalImm64NotPat : ComplexPattern", []>; -def SVE8BitLslImm32 : ComplexPattern; -def SVE8BitLslImm64 : ComplexPattern; -class SVE8BitLslImm { - ComplexPattern Pat = !cond( - !eq(ty, i32): SVE8BitLslImm32, - !eq(ty, i64): SVE8BitLslImm64); -} - def SVEArithUImm8Pat : ComplexPattern", []>; def SVEArithUImm16Pat : ComplexPattern", []>; def SVEArithUImm32Pat : ComplexPattern", []>; @@ -234,6 +231,8 @@ def SVEShiftImmR16 : ComplexPattern", [] def SVEShiftImmR32 : ComplexPattern", []>; def SVEShiftImmR64 : ComplexPattern", []>; +def SVEShiftSplatImmR : ComplexPattern; + def SVEAllActive : ComplexPattern; class SVEExactFPImm : AsmOperandClass { @@ -335,9 +334,14 @@ multiclass sve_int_ptrue opc, string asm, SDPatternOperator op> { def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>; + + def : Pat<(nxv16i1 immAllOnesV), (PTRUE_B 31)>; + def : Pat<(nxv8i1 immAllOnesV), (PTRUE_H 31)>; + def : Pat<(nxv4i1 immAllOnesV), (PTRUE_S 31)>; + def : Pat<(nxv2i1 immAllOnesV), (PTRUE_D 31)>; } //===----------------------------------------------------------------------===// @@ -370,24 +374,27 @@ class SVE_1_Op_Passthru_Round_Pat; -class SVE_1_Op_Imm_OptLsl_Reverse_Pat - : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), - (inst $Op1, i32:$imm, i32:$shift)>; +multiclass SVE_1_Op_PassthruUndef_Round_Pat{ + def : Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), (vtd undef))), + (inst (IMPLICIT_DEF), $Op1, $Op2)>; + def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (i64 timm0_1), vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; +} class SVE_1_Op_Imm_OptLsl_Pat - : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))), + : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm, i32:$shift)))))), (inst $Op1, i32:$imm, i32:$shift)>; class SVE_1_Op_Imm_Arith_All_Active - : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm)))))), (inst $Op1, i32:$imm)>; class SVE_1_Op_Imm_Log_Pat - : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))), + : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i64:$imm)))))), (inst $Op1, i64:$imm)>; class SVE_2_Op_Pat -: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))), (inst $Pg, $Rn, i32:$imm)>; class SVE_Shift_DupImm_All_Active_Pat -: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (splat_vector (it (cast 
i32:$imm)))))), (inst $Rn, i32:$imm)>; class SVE_2_Op_Fp_Imm_Pat -: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (AArch64dup (it immL))))), +: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (splat_vector (it immL))))), (inst $Pg, $Zs1, imm)>; class SVE_2_Op_Fp_Imm_Pat_Zero : Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Zs1, (SVEDup0)), - (vt (AArch64dup (it immL))))), + (vt (splat_vector (it immL))))), (inst $Pg, $Zs1, imm)>; +// Used to re-order the operands of BSP when lowering to BSL. BSP has the order: +// mask, in1, in2 whereas BSL for SVE2 has them ordered in1, in2, mask +class SVE_3_Op_BSP_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)), + (inst $Op2, $Op3, $Op1)>; + +class SVE_Shift_Add_All_Active_Pat +: Pat<(vtd (add vt1:$Op1, (op (pt (SVEAllActive)), vt2:$Op2, vt3:$Op3))), + (inst $Op1, $Op2, $Op3)>; + +//===----------------------------------------------------------------------===// +// SVE pattern match helpers. +//===----------------------------------------------------------------------===// + +// Matches either an intrinsic, or a predicated operation with an all active predicate +class EitherVSelectOrPassthruPatFrags +: PatFrags<(ops node:$Pg, node:$Op1, node:$Op2), [ + (intrinsic node:$Pg, node:$Op1, node:$Op2), + (vselect node:$Pg, (sdnode (SVEAllActive), node:$Op1, node:$Op2), node:$Op1), + ]>; + // // Pseudo -> Instruction mappings // @@ -612,10 +643,11 @@ class sve_int_pfalse opc, string asm> multiclass sve_int_pfalse opc, string asm> { def NAME : sve_int_pfalse; - def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast(NAME))>; - def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast(NAME))>; - def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast(NAME))>; - def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast(NAME))>; + def : Pat<(nxv16i1 immAllZerosV), (!cast(NAME))>; + def : Pat<(nxv8i1 immAllZerosV), (!cast(NAME))>; + def : Pat<(nxv4i1 immAllZerosV), (!cast(NAME))>; + def : Pat<(nxv2i1 immAllZerosV), (!cast(NAME))>; + def : Pat<(nxv1i1 immAllZerosV), (!cast(NAME))>; } class sve_int_ptest opc, string asm> @@ -885,6 +917,8 @@ class sve_int_count opc, string asm> let Inst{10} = opc{0}; let Inst{9-5} = pattern; let Inst{4-0} = Rd; + + let isReMaterializable = 1; } multiclass sve_int_count opc, string asm, SDPatternOperator op> { @@ -965,7 +999,7 @@ class sve_int_pred_pattern_a opc, string asm> multiclass sve_int_pred_pattern_a opc, string asm, SDPatternOperator op, SDPatternOperator opcnt> { - let Predicates = [HasSVEorStreamingSVE] in { + let Predicates = [HasSVEorSME] in { def NAME : sve_int_pred_pattern_a; def : InstAlias opc, string asm, (!cast(NAME) GPR64:$Rdn, 0b11111, 1), 2>; } - let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL] in { + let Predicates = [HasSVEorSME, UseScalarIncVL] in { def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), (!cast(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; @@ -1170,28 +1204,45 @@ multiclass sve_int_perm_dup_i { (!cast(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; // Duplicate extracted element of vector into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), + def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), (!cast(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8i16 (splat_vector (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast(NAME # 
_H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4i32 (splat_vector (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2i64 (splat_vector (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8f16 (splat_vector (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8bf16 (splat_vector (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4f16 (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f16 (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4f32 (splat_vector (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f32 (AArch64dup (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f32 (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f64 (splat_vector (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + + def : Pat<(nxv16i8 (AArch64duplane128 nxv16i8:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8i16 (AArch64duplane128 nxv8i16:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv4i32 (AArch64duplane128 nxv4i32:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv2i64 (AArch64duplane128 nxv2i64:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8f16 (AArch64duplane128 nxv8f16:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv4f32 (AArch64duplane128 nxv4f32:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv2f64 (AArch64duplane128 nxv2f64:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8bf16 (AArch64duplane128 nxv8bf16:$Op1, i64:$imm)), + (!cast(NAME # _Q) $Op1, $imm)>; } class sve_int_perm_tbl sz8_64, bits<2> opc, string asm, ZPRRegOp zprty, @@ -1631,6 +1682,7 @@ multiclass sve_int_pred_log opc, string asm, SDPatternOperator op, def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; def : 
SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_B>; def : SVE_2_Op_AllActive_Pat { def : InstAlias<"mov $Zd, $imm", (!cast(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>; - def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))), + def : Pat<(nxv2i64 (splat_vector (i64 logical_imm64:$imm))), (!cast(NAME) logical_imm64:$imm)>; } @@ -2478,7 +2530,7 @@ multiclass sve2_fp_mla_long opc, string asm, SDPatternOperator op> { // SVE Stack Allocation Group //===----------------------------------------------------------------------===// -class sve_int_arith_vl +class sve_int_arith_vl : I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6), asm, "\t$Rd, $Rn, $imm6", "", @@ -2490,12 +2542,13 @@ class sve_int_arith_vl let Inst{22} = opc; let Inst{21} = 0b1; let Inst{20-16} = Rn; - let Inst{15-11} = 0b01010; + let Inst{15-12} = 0b0101; + let Inst{11} = streaming_sve; let Inst{10-5} = imm6; let Inst{4-0} = Rd; } -class sve_int_read_vl_a opc2, string asm> +class sve_int_read_vl_a opc2, string asm, bit streaming_sve = 0b0> : I<(outs GPR64:$Rd), (ins simm6_32b:$imm6), asm, "\t$Rd, $imm6", "", @@ -2506,9 +2559,12 @@ class sve_int_read_vl_a opc2, string asm> let Inst{22} = op; let Inst{21} = 0b1; let Inst{20-16} = opc2{4-0}; - let Inst{15-11} = 0b01010; + let Inst{15-12} = 0b0101; + let Inst{11} = streaming_sve; let Inst{10-5} = imm6; let Inst{4-0} = Rd; + + let isReMaterializable = 1; } //===----------------------------------------------------------------------===// @@ -2589,8 +2645,8 @@ multiclass sve_fp_2op_p_zd opc, string asm, SDPatternOperator int_op, SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { - def NAME : sve_fp_2op_p_zd; - + def NAME : sve_fp_2op_p_zd, + SVEPseudo2Instr; // convert vt1 to a packed type for the intrinsic patterns defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, !eq(!cast(vt1), "nxv4f16"): nxv8f16, @@ -2604,8 +2660,11 @@ multiclass sve_fp_2op_p_zd opc, string asm, 1 : vt3); def : SVE_3_Op_Pat(NAME)>; - def : SVE_1_Op_Passthru_Pat(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF)>; } multiclass sve_fp_2op_p_zdr opc, string asm, @@ -2614,7 +2673,8 @@ multiclass sve_fp_2op_p_zdr opc, string asm, SDPatternOperator int_op, SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { - def NAME : sve_fp_2op_p_zd; + def NAME : sve_fp_2op_p_zd, + SVEPseudo2Instr; // convert vt1 to a packed type for the intrinsic patterns defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, @@ -2623,8 +2683,11 @@ multiclass sve_fp_2op_p_zdr opc, string asm, 1 : vt1); def : SVE_3_Op_Pat(NAME)>; - def : SVE_1_Op_Passthru_Round_Pat(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Round_Pat(NAME # _UNDEF)>; } multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { @@ -2726,11 +2789,19 @@ class sve_int_bin_pred_arit_log sz8_64, bits<2> fmt, bits<3> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_log opc, string asm, SDPatternOperator op> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_log opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum 
flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>, + SVEPseudo2Instr; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; @@ -3756,7 +3827,8 @@ class sve2_int_bin_accum_shift_imm tsz8_64, bits<2> opc, string asm, } multiclass sve2_int_bin_accum_shift_imm_right opc, string asm, - SDPatternOperator op> { + SDPatternOperator op, + SDPatternOperator shift_op = null_frag> { def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3773,6 +3845,11 @@ multiclass sve2_int_bin_accum_shift_imm_right opc, string asm, def : SVE_3_Op_Imm_Pat(NAME # _H)>; def : SVE_3_Op_Imm_Pat(NAME # _S)>; def : SVE_3_Op_Imm_Pat(NAME # _D)>; + + def : SVE_Shift_Add_All_Active_Pat(NAME # _B)>; + def : SVE_Shift_Add_All_Active_Pat(NAME # _H)>; + def : SVE_Shift_Add_All_Active_Pat(NAME # _S)>; + def : SVE_Shift_Add_All_Active_Pat(NAME # _D)>; } class sve2_int_cadd sz, bit opc, string asm, ZPRRegOp zprty> @@ -4331,18 +4408,6 @@ multiclass sve_int_arith_imm0 opc, string asm, SDPatternOperator op> { def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; } -multiclass sve_int_arith_imm0_subr opc, string asm, SDPatternOperator op> { - def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; - def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; - def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; - def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>; - - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _D)>; -} - class sve_int_arith_imm sz8_64, bits<6> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm), @@ -4458,7 +4523,8 @@ class sve2_int_bitwise_ternary_op_d opc, string asm> let ElementSize = ElementSizeNone; } -multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op> { +multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op, + SDPatternOperator ir_op = null_frag> { def NAME : sve2_int_bitwise_ternary_op_d; def : InstAlias opc, string asm, SDPatternOperato def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; + + + def : SVE_3_Op_BSP_Pat(NAME)>; + def : SVE_3_Op_BSP_Pat(NAME)>; + def : SVE_3_Op_BSP_Pat(NAME)>; + def : SVE_3_Op_BSP_Pat(NAME)>; } class sve2_int_rotate_right_imm tsz8_64, string asm, @@ -4578,29 +4650,28 @@ class sve_int_dup_imm_pred sz8_64, bit m, string asm, } multiclass sve_int_dup_imm_pred_merge_inst< - bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, - ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm, + ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> { let Constraints = "$Zd = $_Zd" in def NAME : sve_int_dup_imm_pred; def : InstAlias<"mov $Zd, $Pg/m, $imm", (!cast(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; - def : Pat<(intty - (vselect predty:$Pg, - (intty (AArch64dup (scalarty 
(SVE8BitLslImm.Pat i32:$imm, i32:$shift)))), - intty:$Zd)), - (!cast(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; + def : Pat<(vselect predty:$Pg, + (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))), + ZPR:$Zd), + (!cast(NAME) $Zd, $Pg, $imm, $shift)>; } multiclass sve_int_dup_imm_pred_merge { - defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, - i32, cpy_imm8_opt_lsl_i8>; - defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, - i32, cpy_imm8_opt_lsl_i16>; - defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, - i32, cpy_imm8_opt_lsl_i32>; - defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, - i64, cpy_imm8_opt_lsl_i64>; + defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8, + nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>; + defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16, + nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>; + defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32, + nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>; + defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64, + nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; @@ -4608,11 +4679,24 @@ multiclass sve_int_dup_imm_pred_merge { (!cast(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; + + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)), + (!cast(NAME # _H) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f16 ZPR:$Zd)), + (!cast(NAME # _S) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f16 ZPR:$Zd)), + (!cast(NAME # _D) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f32 ZPR:$Zd)), + (!cast(NAME # _S) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f32 ZPR:$Zd)), + (!cast(NAME # _D) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f64 ZPR:$Zd)), + (!cast(NAME # _D) $Zd, $Pg, 0, 0)>; } multiclass sve_int_dup_imm_pred_zero_inst< - bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, - ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm, + ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> { def NAME : sve_int_dup_imm_pred; def : InstAlias<"mov $Zd, $Pg/z, $imm", @@ -4623,22 +4707,21 @@ multiclass sve_int_dup_imm_pred_zero_inst< (!cast(NAME) PPRAny:$Ps1, -1, 0)>; def : Pat<(intty (anyext (predty PPRAny:$Ps1))), (!cast(NAME) PPRAny:$Ps1, 1, 0)>; - def : Pat<(intty - (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm.Pat i32:$imm, i32:$shift)))), - (intty (AArch64dup (scalarty 0))))), - (!cast(NAME) $Pg, i32:$imm, i32:$shift)>; + def : Pat<(vselect predty:$Pg, + (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))), + (intty (splat_vector (scalarty 0)))), + (!cast(NAME) $Pg, $imm, $shift)>; } multiclass sve_int_dup_imm_pred_zero { - defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, - i32, cpy_imm8_opt_lsl_i8>; - defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, - i32, cpy_imm8_opt_lsl_i16>; - defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, - i32, cpy_imm8_opt_lsl_i32>; - defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, 
nxv2i1, - i64, cpy_imm8_opt_lsl_i64>; + defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8, + nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>; + defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16, + nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>; + defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32, + nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>; + defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64, + nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>; } //===----------------------------------------------------------------------===// @@ -4690,6 +4773,10 @@ multiclass SVE_SETCC_Pat_With_Zero; def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)), (cmp $Op1, $Op2)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op1, (SVEDup0), cc))), + (cmp $Pg, $Op1)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), (SVEDup0), intvt:$Op1, invcc))), + (cmp $Pg, $Op1)>; } multiclass sve_int_cmp_0 opc, string asm, CondCode cc, CondCode invcc> { @@ -4761,14 +4848,26 @@ multiclass SVE_SETCC_Imm_Pat { def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), - (intvt ZPR:$Zs1), - (intvt (AArch64dup (immtype:$imm))), - cc)), + (intvt ZPR:$Zs1), + (intvt (splat_vector (immtype:$imm))), + cc)), (cmp $Pg, $Zs1, immtype:$imm)>; def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), - (intvt (AArch64dup (immtype:$imm))), - (intvt ZPR:$Zs1), - commuted_cc)), + (intvt (splat_vector (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (and predvt:$Pg, + (AArch64setcc_z (predvt (AArch64ptrue 31)), + (intvt ZPR:$Zs1), + (intvt (splat_vector (immtype:$imm))), + cc))), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (and predvt:$Pg, + (AArch64setcc_z (predvt (AArch64ptrue 31)), + (intvt (splat_vector (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc))), (cmp $Pg, $Zs1, immtype:$imm)>; } @@ -5148,6 +5247,8 @@ class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, let Inst{15-10} = 0b010000; let Inst{9-5} = imm5; let Inst{4-0} = Zd; + + let isReMaterializable = 1; } multiclass sve_int_index_ii { @@ -5166,13 +5267,13 @@ multiclass sve_int_index_ii { (!cast(NAME # "_D") (i64 0), simm5_64b:$imm5b)>; // add(step_vector(step), dup(X)) -> index(X, step). 
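The patterns that follow implement this fold. As a scalar sanity check of the equivalence (a standalone sketch, not LLVM code): lane i of add(step_vector(step), splat_vector(base)) is base + i*step, which is exactly what the SVE INDEX instruction produces for lane i.

#include <cassert>
#include <cstdint>

int64_t indexLane(int64_t base, int64_t step, unsigned i) {
  return base + static_cast<int64_t>(i) * step; // INDEX Zd, #base, #step
}

int main() {
  const int64_t base = 3, step = -2; // both fit the simm5 immediate forms
  for (unsigned i = 0; i < 8; ++i) {
    int64_t stepVecLane = static_cast<int64_t>(i) * step; // step_vector(step)
    assert(stepVecLane + base == indexLane(base, step, i)); // add(..., splat)
  }
}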
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast(NAME # "_B") simm5_8b:$imm5, (!cast("trunc_imm") $imm5b))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast(NAME # "_H") simm5_16b:$imm5, (!cast("trunc_imm") $imm5b))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>; - def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>; } @@ -5211,35 +5312,35 @@ multiclass sve_int_index_ir(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; // add(step_vector(step), dup(X)) -> index(X, step). - def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast(NAME # "_B") simm5_8b:$imm5, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast(NAME # "_H") simm5_16b:$imm5, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast(NAME # "_S") simm5_32b:$imm5, (!cast("MOVi32imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast(NAME # "_D") simm5_64b:$imm5, (!cast("MOVi64imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; // mul(step_vector(1), dup(Y)) -> index(0, Y). 
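Why the patterns below require (AArch64ptrue 31): a merging predicated multiply only equals a plain lane-wise multiply when every lane is active, so the fold to INDEX(0, Y) is valid only under an all-true predicate. A scalar model of that reasoning (illustrative, not LLVM code):

#include <cassert>

long predicatedMul(bool laneActive, long inactiveVal, long a, long b) {
  return laneActive ? a * b : inactiveVal; // merging predication per lane
}

int main() {
  const long y = 5;
  for (long i = 0; i < 4; ++i) {
    // All-active lane: mul(step_vector(1), splat(y)) lane i is i*y, which is
    // lane i of INDEX(0, y).
    assert(predicatedMul(true, -1, i * 1, y) == 0 + i * y);
    // An inactive lane would keep inactiveVal instead, breaking the fold.
    assert(predicatedMul(false, -1, i * 1, y) == -1);
  }
}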
- def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_B") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_H") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_S") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (!cast(NAME # "_D") (i64 0), GPR64:$Rm)>; // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). - def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>; } @@ -5267,13 +5368,13 @@ multiclass sve_int_index_ri { def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>; // add(step_vector(step), dup(X)) -> index(X, step). 
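The B/H patterns that follow wrap the step in trunc_imm because step_vector's literal is wider than the narrow element type; the immediate operands must also fit INDEX's signed 5-bit field (simm5, -16..15). A sketch of that legality check (illustrative only):

#include <cassert>
#include <cstdint>

bool isSImm5(int64_t v) { return v >= -16 && v <= 15; }

int main() {
  assert(isSImm5(15) && isSImm5(-16));   // representable as simm5
  assert(!isSImm5(16) && !isSImm5(-17)); // would need the register form
  // Truncating a wide step literal to the element width, as trunc_imm does:
  int64_t wideStep = 7;
  int8_t truncated = static_cast<int8_t>(wideStep);
  assert(truncated == 7);
}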
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_B") GPR32:$Rm, (!cast("trunc_imm") $imm5))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_H") GPR32:$Rm, (!cast("trunc_imm") $imm5))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (!cast(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>; - def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (!cast(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>; } @@ -5301,25 +5402,25 @@ multiclass sve_int_index_rr { def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>; // add(step_vector(step), dup(X)) -> index(X, step). - def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_B") GPR32:$Rn, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_H") GPR32:$Rn, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_S") GPR32:$Rn, (!cast("MOVi32imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast(NAME # "_D") GPR64:$Rn, (!cast("MOVi64imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). 
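A note on the *_oneuse operators in the patterns around here: they restrict the fold to intermediate values with a single consumer, since a mul or step_vector with other users must be materialized anyway, and folding it into INDEX would then save nothing. A toy one-use check over a minimal node type (illustrative, not the SelectionDAG API):

#include <cassert>
#include <vector>

struct Node {
  std::vector<Node *> users;
};

bool hasOneUse(const Node &n) { return n.users.size() == 1; }

int main() {
  Node add1, add2, mul;
  mul.users = {&add1};
  assert(hasOneUse(mul)); // safe to fold mul into INDEX
  mul.users = {&add1, &add2};
  assert(!hasOneUse(mul)); // keep mul; folding would duplicate work
}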
- def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))), (!cast(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),(nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>; } @@ -5972,25 +6073,25 @@ multiclass sve_mem_sst_sv_64_scaled msz, string asm, SDPatternOperator op, RegisterOperand zprext, ValueType vt> { - def _SCALED_REAL : sve_mem_sst_sv2; + def _SCALED : sve_mem_sst_sv2; def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + (!cast(NAME # _SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt), - (!cast(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_sst_sv_64_unscaled msz, string asm, SDPatternOperator op, ValueType vt> { - def _REAL : sve_mem_sst_sv2; + def NAME : sve_mem_sst_sv2; def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + (!cast(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), - (!cast(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_vi opc, string asm, ZPRRegOp zprty, @@ -8433,6 +8534,7 @@ def am_sve_regreg_lsl0 : ComplexPattern", [ def am_sve_regreg_lsl1 : ComplexPattern", []>; def am_sve_regreg_lsl2 : ComplexPattern", []>; def am_sve_regreg_lsl3 : ComplexPattern", []>; +def am_sve_regreg_lsl4 : ComplexPattern", []>; // Predicated pseudo floating point two operand instructions. multiclass sve_fp_bin_pred_hfd { diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 4a24162540a5..ccb34f367338 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -305,8 +305,7 @@ bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) { // ..where the value stored comes from a vector extract.. 
auto *IntrI = dyn_cast(Store->getOperand(0)); - if (!IntrI || - IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract) + if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_extract) return false; // ..that is extracting from index 0.. @@ -365,8 +364,7 @@ bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) { // ..whose operand is a vector_insert.. auto *IntrI = dyn_cast(BitCast->getOperand(0)); - if (!IntrI || - IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert) + if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_insert) return false; // ..that is inserting into index zero of an undef vector.. @@ -451,8 +449,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { continue; switch (F.getIntrinsicID()) { - case Intrinsic::experimental_vector_extract: - case Intrinsic::experimental_vector_insert: + case Intrinsic::vector_extract: + case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: for (User *U : F.users()) Functions.insert(cast(U)->getFunction()); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5906a5d6b50b..71303611265c 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -634,7 +634,8 @@ namespace AArch64SysReg { FeatureBitset FeaturesRequired; bool haveFeatures(FeatureBitset ActiveFeatures) const { - return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + return ActiveFeatures[llvm::AArch64::FeatureAll] || + (FeaturesRequired & ActiveFeatures) == FeaturesRequired; } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 11cc1a01d248..c4680cbedadf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -91,10 +91,6 @@ ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; -ModulePass *createAMDGPUFixFunctionBitcastsPass(); -void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &); -extern char &AMDGPUFixFunctionBitcastsID; - ModulePass *createAMDGPUCtorDtorLoweringPass(); void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &); extern char &AMDGPUCtorDtorLoweringID; @@ -303,6 +299,12 @@ extern char &SIMemoryLegalizerID; void initializeSIModeRegisterPass(PassRegistry&); extern char &SIModeRegisterID; +void initializeAMDGPUReleaseVGPRsPass(PassRegistry &); +extern char &AMDGPUReleaseVGPRsID; + +void initializeAMDGPUInsertDelayAluPass(PassRegistry &); +extern char &AMDGPUInsertDelayAluID; + void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; @@ -335,6 +337,9 @@ extern char &GCNNSAReassignID; void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; +FunctionPass *createAMDGPUSetWavePriorityPass(); +void initializeAMDGPUSetWavePriorityPass(PassRegistry &); + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 806c0b18637a..48b5814cd482 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -86,6 +86,12 @@ def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts" "Have s_scratch_* flat memory instructions" >; +def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", + "EnableFlatScratch", + "true", + "Use scratch_* flat memory instructions to access scratch" +>; + def FeatureAddNoCarryInsts : 
SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -171,6 +177,12 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug", + "UserSGPRInit16Bug", + "true", + "Bug requiring at least 16 user+system SGPRs to be enabled" +>; + def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "LDSMisalignedBug", "true", @@ -307,12 +319,24 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", "Additional instructions for GFX90A+" >; +def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", + "GFX940Insts", + "true", + "Additional instructions for GFX940+" +>; + def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "GFX10Insts", "true", "Additional instructions for GFX10+" >; +def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts", + "GFX11Insts", + "true", + "Additional instructions for GFX11+" +>; + def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", "GFX10_3Insts", "true", @@ -343,6 +367,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts", "Has i16/f16 instructions" >; +def FeatureTrue16BitInsts : SubtargetFeature<"true16", + "HasTrue16BitInsts", + "true", + "True 16-bit operand instructions" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -458,6 +488,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "Support NSA encoding for image instructions" >; +def FeatureImageInsts : SubtargetFeature<"image-insts", + "HasImageInsts", + "true", + "Support image instructions" +>; + def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", "HasExtendedImageInsts", "true", @@ -536,6 +572,13 @@ def FeatureDot7Insts : SubtargetFeature<"dot7-insts", "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" >; +def FeatureDot8Insts : SubtargetFeature<"dot8-insts", + "HasDot8Insts", + "true", + "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16, " + "v_dot4_i32_iu8, v_dot8_i32_iu4 instructions" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -548,11 +591,28 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; -def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", - "HasAtomicFaddInsts", +def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", + "HasAtomicFaddRtnInsts", "true", - "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " - "global_atomic_pk_add_f16 instructions", + "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " + "return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", + "HasAtomicFaddNoRtnInsts", + "true", + "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " + "don't return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicPkFaddNoRtnInsts + : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts", + "HasAtomicPkFaddNoRtnInsts", + "true", + "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " + "don't return original value", [FeatureFlatGlobalInsts] >; @@ -632,6 +692,12 @@ class SubtargetFeatureNSAMaxSize : SubtargetFeature < def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>; def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>; +def FeatureVOPD : SubtargetFeature<"vopd", + "HasVOPDInsts", + "true", + "Has VOPD dual issue wave32 
instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -762,7 +828,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureExtendedImageInsts + FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts ] >; @@ -772,7 +838,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, + FeatureImageInsts ] >; @@ -787,7 +854,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess + FeatureUnalignedBufferAccess, FeatureImageInsts ] >; @@ -824,6 +891,25 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts + ] +>; + +def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", + "gfx11", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, Feature16BitInsts, + FeatureInv2PiInlineImm, FeatureApertureRegs, + FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, + FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, + FeatureNoSdstCMPX, FeatureVscnt, + FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, + FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess ] >; @@ -910,6 +996,7 @@ def FeatureISAVersion9_0_0 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -919,6 +1006,7 @@ def FeatureISAVersion9_0_2 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -927,6 +1015,7 @@ def FeatureISAVersion9_0_4 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureFmaMixInsts, FeatureImageGather4D16Bug]>; @@ -938,6 +1027,7 @@ def FeatureISAVersion9_0_6 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, @@ -953,6 +1043,7 @@ 
def FeatureISAVersion9_0_8 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, @@ -964,7 +1055,8 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeatureDot7Insts, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, FeatureSupportsSRAMECC, FeatureMFMAInlineLiteralBug, FeatureImageGather4D16Bug]>; @@ -975,6 +1067,7 @@ def FeatureISAVersion9_0_9 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -995,7 +1088,10 @@ def FeatureISAVersion9_0_A : FeatureSet< FeaturePackedFP32Ops, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureSupportsSRAMECC, FeaturePackedTID, @@ -1007,9 +1103,36 @@ def FeatureISAVersion9_0_C : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; +def FeatureISAVersion9_4_0 : FeatureSet< + [FeatureGFX9, + FeatureGFX90AInsts, + FeatureGFX940Insts, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureDot7Insts, + Feature64BitDPP, + FeaturePackedFP32Ops, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, + FeatureSupportsSRAMECC, + FeaturePackedTID, + FeatureArchitectedFlatScratch, + FullRate64Ops]>; + // TODO: Organize more features into groups. def FeatureGroup { // Bugs present on gfx10.1. 
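The per-ISA-version FeatureSet defs above (and the GFX11 sets below, built with !listconcat) are essentially named unions of subtarget feature bits. A rough C++ analogue of composing a common list plus version-specific bug bits (names are illustrative, not LLVM's):

#include <bitset>
#include <cassert>

enum Feature { GFX11Insts, Dot8Insts, UserSGPRInit16Bug, NumFeatures };
using FeatureBits = std::bitset<NumFeatures>;

FeatureBits makeCommon() {
  FeatureBits B;
  B.set(GFX11Insts).set(Dot8Insts); // shared across the family
  return B;
}

int main() {
  // Like the !listconcat of FeatureISAVersion11_Common with the bug feature:
  FeatureBits V11_0 = makeCommon();
  V11_0.set(UserSGPRInit16Bug);
  assert(V11_0.test(GFX11Insts) && V11_0.test(UserSGPRInit16Bug));
}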
@@ -1124,6 +1247,33 @@ def FeatureISAVersion10_3_0 : FeatureSet< FeatureWavefrontSize32, FeatureShaderCyclesRegister]>; +def FeatureISAVersion11_Common : FeatureSet< + [FeatureGFX11, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot5Insts, + FeatureDot7Insts, + FeatureDot8Insts, + FeatureNSAEncoding, + FeatureNSAMaxSize5, + FeatureWavefrontSize32, + FeatureShaderCyclesRegister, + FeatureArchitectedFlatScratch, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureImageInsts, + FeaturePackedTID, + FeatureVcmpxPermlaneHazard]>; + +// Features for GFX 11.0.0 and 11.0.1 +def FeatureISAVersion11_0 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureUserSGPRInit16Bug])>; + +def FeatureISAVersion11_0_2 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureUserSGPRInit16Bug])>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -1152,8 +1302,10 @@ def AMDGPUAsmVariants { int SDWA9_ID = 3; string DPP = "DPP"; int DPP_ID = 4; + string VOP3_DPP = "VOP3_DPP"; + int VOP3_DPP_ID = 5; string Disable = "Disable"; - int Disable_ID = 5; + int Disable_ID = 6; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -1176,12 +1328,16 @@ def SDWA9AsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA9; } - def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; } +def VOP3_DPPAsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.VOP3_DPP_ID; + let Name = AMDGPUAsmVariants.VOP3_DPP; +} + def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; @@ -1190,7 +1346,8 @@ def AMDGPU : Target { VOP3AsmParserVariant, SDWAAsmParserVariant, SDWA9AsmParserVariant, - DPPAsmParserVariant]; + DPPAsmParserVariant, + VOP3_DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; let AllowRegisterRenaming = 1; } @@ -1216,6 +1373,12 @@ def isGFX6GFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX11Insts))>; + +def isGFX6GFX7GFX10Plus : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>; def isGFX7Only : @@ -1225,6 +1388,12 @@ def isGFX7Only : def isGFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX11Insts))>; + +def isGFX7GFX10GFX11 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>; def isGFX7GFX8GFX9 : @@ -1248,6 +1417,21 @@ def isGFX6GFX7GFX8GFX9NotGFX90A : " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>; +def isGFX6GFX7GFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == 
AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGFX11Insts))>; + +def isGFX7GFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureCIInsts, (not FeatureGFX11Insts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -1287,18 +1471,37 @@ def isGFX8GFX9NotGFX90A : AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; def isGFX90AOnly : - Predicate<"Subtarget->hasGFX90AInsts()">, - AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>; def isGFX908orGFX90A : - Predicate<"Subtarget->hasMAIInsts()">, - AssemblerPredicate<(all_of FeatureMAIInsts)>; + Predicate<"Subtarget->hasMAIInsts() && !Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureMAIInsts, (not FeatureGFX940Insts))>; + +def isGFX940Plus : + Predicate<"Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureGFX940Insts)>; + +def isGFX940GFX11Plus : + Predicate<"Subtarget->hasGFX940Insts() ||" + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; + +def isGFX8GFX9NotGFX940 : + Predicate<"!Subtarget->hasGFX940Insts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX940Insts))>; def isGFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>; +def isGFX10Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureGFX10Insts, (not FeatureGFX11Insts))>; + def isGFX10Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of FeatureGFX10Insts)>; @@ -1308,6 +1511,25 @@ def isGFX10Before1030 : "!Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(all_of FeatureGFX10Insts,(not FeatureGFX10_3Insts))>; +def isGFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX11Insts))>; + +def isGFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX11Insts))>; + +def isGFX11Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX11Insts)>; + +def isGFX11Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX11Insts)>; + def 
HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; @@ -1321,7 +1543,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, - AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; + AssemblerPredicate<(any_of FeatureGFX10_3Insts, FeatureGFX940Insts)>; +def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, + AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -1354,6 +1578,11 @@ def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<(all_of Feature16BitInsts)>; + +def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; +def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -1385,7 +1614,10 @@ def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">, def HasFmaakFmamkF32Insts : Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, - AssemblerPredicate<(any_of FeatureGFX10Insts)>; + AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>; + +def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, + AssemblerPredicate<(all_of FeatureImageInsts)>; def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; @@ -1454,6 +1686,9 @@ def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, AssemblerPredicate<(all_of FeatureDot7Insts)>; +def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">, + AssemblerPredicate<(all_of FeatureDot8Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; @@ -1478,8 +1713,13 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; -def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; +def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; +def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; +def HasAtomicPkFaddNoRtnInsts + : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>; def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index bebf032b5535..74be0336851c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -14,12 +14,11 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SmallSet.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/Analysis/AliasAnalysis.h" #include 
"llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-annotate-uniform" @@ -33,8 +32,18 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, LegacyDivergenceAnalysis *DA; MemorySSA *MSSA; AliasAnalysis *AA; - DenseMap noClobberClones; bool isEntryFunc; + bool Changed; + + void setUniformMetadata(Instruction *I) { + I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); + Changed = true; + } + + void setNoClobberMetadata(Instruction *I) { + I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); + Changed = true; + } public: static char ID; @@ -54,7 +63,6 @@ public: void visitBranchInst(BranchInst &I); void visitLoadInst(LoadInst &I); - bool isClobberedInFunction(LoadInst * Load); }; } // End anonymous namespace @@ -69,88 +77,6 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, char AMDGPUAnnotateUniformValues::ID = 0; -static void setUniformMetadata(Instruction *I) { - I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); -} -static void setNoClobberMetadata(Instruction *I) { - I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); -} - -bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) { - MemorySSAWalker *Walker = MSSA->getWalker(); - SmallVector WorkList{Walker->getClobberingMemoryAccess(Load)}; - SmallSet Visited; - MemoryLocation Loc(MemoryLocation::get(Load)); - - const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool { - Instruction *DefInst = Def->getMemoryInst(); - LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n'); - - if (isa(DefInst)) - return false; - - if (const IntrinsicInst *II = dyn_cast(DefInst)) { - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_s_barrier: - case Intrinsic::amdgcn_wave_barrier: - return false; - default: - break; - } - } - - // Ignore atomics not aliasing with the original load, any atomic is a - // universal MemoryDef from MSSA's point of view too, just like a fence. - const auto checkNoAlias = [this, Load](auto I) -> bool { - return I && AA->isNoAlias(I->getPointerOperand(), - Load->getPointerOperand()); - }; - - if (checkNoAlias(dyn_cast(DefInst)) || - checkNoAlias(dyn_cast(DefInst))) - return false; - - return true; - }; - - LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); - - // Start with a nearest dominating clobbering access, it will be either - // live on entry (nothing to do, load is not clobbered), MemoryDef, or - // MemoryPhi if several MemoryDefs can define this memory state. In that - // case add all Defs to WorkList and continue going up and checking all - // the definitions of this memory location until the root. When all the - // defs are exhausted and came to the entry state we have no clobber. - // Along the scan ignore barriers and fences which are considered clobbers - // by the MemorySSA, but not really writing anything into the memory. 
- while (!WorkList.empty()) { - MemoryAccess *MA = WorkList.pop_back_val(); - if (!Visited.insert(MA).second) - continue; - - if (MSSA->isLiveOnEntryDef(MA)) - continue; - - if (MemoryDef *Def = dyn_cast(MA)) { - if (isReallyAClobber(Def)) { - LLVM_DEBUG(dbgs() << " -> load is clobbered\n"); - return true; - } - - WorkList.push_back( - Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc)); - continue; - } - - const MemoryPhi *Phi = cast(MA); - for (auto &Use : Phi->incoming_values()) - WorkList.push_back(cast(&Use)); - } - - LLVM_DEBUG(dbgs() << " -> no clobber\n"); - return false; -} - void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { if (DA->isUniform(&I)) setUniformMetadata(&I); @@ -160,46 +86,18 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; + Instruction *PtrI = dyn_cast(Ptr); + if (PtrI) + setUniformMetadata(PtrI); + // We're tracking up to the Function boundaries, and cannot go beyond because // of FunctionPass restrictions. We can ensure that is memory not clobbered // for memory operations that are live in to entry points only. - Instruction *PtrI = dyn_cast(Ptr); - - if (!isEntryFunc) { - if (PtrI) - setUniformMetadata(PtrI); + if (!isEntryFunc) return; - } - - bool NotClobbered = false; bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; - if (PtrI) - NotClobbered = GlobalLoad && !isClobberedInFunction(&I); - else if (isa(Ptr) || isa(Ptr)) { - if (GlobalLoad && !isClobberedInFunction(&I)) { - NotClobbered = true; - // Lookup for the existing GEP - if (noClobberClones.count(Ptr)) { - PtrI = noClobberClones[Ptr]; - } else { - // Create GEP of the Value - Function *F = I.getParent()->getParent(); - Value *Idx = Constant::getIntegerValue( - Type::getInt32Ty(Ptr->getContext()), APInt(64, 0)); - // Insert GEP at the entry to make it dominate all uses - PtrI = GetElementPtrInst::Create(I.getType(), Ptr, - ArrayRef(Idx), Twine(""), - F->getEntryBlock().getFirstNonPHI()); - } - I.replaceUsesOfWith(Ptr, PtrI); - } - } - - if (PtrI) { - setUniformMetadata(PtrI); - if (NotClobbered) - setNoClobberMetadata(PtrI); - } + if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA)) + setNoClobberMetadata(&I); } bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { @@ -215,9 +113,9 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { AA = &getAnalysis().getAAResults(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); + Changed = false; visit(F); - noClobberClones.clear(); - return true; + return Changed; } FunctionPass * diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 6e2984f2a04f..57a4660bc1eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -27,6 +27,8 @@ #include "SIMachineFunctionInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -34,6 +36,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -111,6 +114,12 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void 
AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { + IsTargetStreamerInitialized = false; +} + +void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { + IsTargetStreamerInitialized = true; + // TODO: Which one is called first, emitStartOfAsmFile or // emitFunctionBodyStart? if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) @@ -143,6 +152,10 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { + // Init target streamer if it has not yet happened + if (!IsTargetStreamerInitialized) + initTargetStreamer(M); + // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; @@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { auto &ObjectFileInfo = *Context.getObjectFileInfo(); auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); - Streamer.PushSection(); - Streamer.SwitchSection(&ReadOnlySection); + Streamer.pushSection(); + Streamer.switchSection(&ReadOnlySection); // CP microcode requires the kernel descriptor to be allocated on 64 byte // alignment. @@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); - Streamer.PopSection(); + Streamer.popSection(); } void AMDGPUAsmPrinter::emitFunctionEntryLabel() { @@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); - Align Alignment = GV->getAlign().getValueOr(Align(4)); + Align Alignment = GV->getAlign().value_or(Align(4)); emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); emitLinkage(GV, GVSym); @@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); getTargetStreamer()->EmitCodeEnd(STI); } @@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr()) { + if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + // Init target streamer lazily on the first function so that previous passes + // can set metadata. 
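The lazy-init pattern introduced here, in isolation: do nothing up front, then initialize on the first function or from the end-of-file hook, whichever runs first, so earlier passes can still set module metadata. A standalone sketch (the class is a stand-in, not the real AsmPrinter):

#include <cassert>

struct Printer {
  bool Initialized = false;
  int InitCount = 0;

  void initStreamer() {
    Initialized = true;
    ++InitCount;
  }
  void runOnFunction() {
    if (!Initialized) // first function triggers init
      initStreamer();
  }
  void emitEndOfFile() {
    if (!Initialized) // a module with no functions still gets init
      initStreamer();
  }
};

int main() {
  Printer p;
  p.runOnFunction();
  p.runOnFunction();
  p.emitEndOfFile();
  assert(p.InitCount == 1); // init happens exactly once
}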
+ if (!IsTargetStreamerInitialized) + initTargetStreamer(*MF.getFunction().getParent()); + ResourceUsage = &getAnalysis(); CurrentProgramInfo = SIProgramInfo(); @@ -454,7 +472,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + OutStreamer->switchSection(ConfigSection); } if (MFI->isModuleEntryFunction()) { @@ -491,7 +509,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); + OutStreamer->switchSection(CommentSection); if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); @@ -590,7 +608,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (DumpCodeInstEmitter) { - OutStreamer->SwitchSection( + OutStreamer->switchSection( Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { @@ -677,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; const uint64_t MaxScratchPerWorkitem = - GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize(); + STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ProgInfo.ScratchSize, @@ -857,22 +875,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, LDSAlignShift = 9; } - unsigned LDSSpillSize = - MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize(); - - ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; + ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; + // Scratch is allocated in 64-dword or 256-dword blocks. + unsigned ScratchAlignShift = + STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; // We need to program the hardware with the amount of scratch memory that // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - ProgInfo.ScratchBlocks = - alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1ULL << ScratchAlignShift) >> - ScratchAlignShift; + ProgInfo.ScratchBlocks = divideCeil( + ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift); if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; @@ -886,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, else if (MFI->hasWorkItemIDY()) TIDIGCompCnt = 1; + // The private segment wave byte offset is the last of the system SGPRs. We + // initially assumed it was allocated, and may have used it. It shouldn't harm + // anything to disable it if we know the stack isn't used here. We may still + // have emitted code reading it to initialize scratch, but if that's unused + // reading garbage should be OK. 
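(An aside on the scratch arithmetic above: per-wave scratch bytes are now rounded up with divideCeil, using 256-byte blocks, i.e. 64 dwords, on GFX11 via shift 8, and 1024-byte blocks, i.e. 256 dwords, before it via shift 10. A standalone check of that form, a sketch only:)

#include <cassert>
#include <cstdint>

uint64_t divideCeil(uint64_t n, uint64_t d) { return (n + d - 1) / d; }

int main() {
  const uint64_t scratchPerThread = 100, waveSize = 32;
  const uint64_t waveBytes = scratchPerThread * waveSize;  // 3200 bytes
  assert(divideCeil(waveBytes, 1ULL << 8) == 13);  // GFX11: 256-byte blocks
  assert(divideCeil(waveBytes, 1ULL << 10) == 4);  // pre-GFX11: 1024-byte blocks
}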
+ const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0; ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_SCRATCH_EN(EnablePrivateSegment) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) | @@ -931,6 +951,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &STM = MF.getSubtarget(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -942,7 +963,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); - OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks)); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. @@ -951,14 +975,18 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); - OutStreamer->emitIntValue( - S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); - OutStreamer->emitInt32( - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); OutStreamer->emitInt32(MFI->getPSInputEnable()); OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); @@ -984,6 +1012,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setEntryPoint(CC, MF.getFunction().getName()); MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + + // Only set AGPRs for supported devices + const GCNSubtarget &STM = MF.getSubtarget(); + if (STM.hasMAIInsts()) { + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); + } + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); if (AMDGPU::isCompute(CC)) { @@ -995,12 +1030,14 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, // ScratchSize is in bytes, 16 aligned. MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); MD->setSpiPsInputEna(MFI->getPSInputEnable()); MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } - const GCNSubtarget &STM = MF.getSubtarget(); if (STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } @@ -1067,7 +1104,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr()) + if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index d5c60aa3be7d..ddda2cf107b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -77,6 +77,8 @@ private: const MachineFunction &MF, const SIProgramInfo &PI) const; + void initTargetStreamer(Module &M); + public: explicit AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); @@ -132,6 +134,7 @@ protected: std::vector DisasmLines, HexLines; size_t DisasmLineMaxLen; + bool IsTargetStreamerInitialized; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1e2cf3890d0a..3ccfd9dde269 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -311,6 +311,12 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B, if (ST->isWave32()) return V; + if (ST->hasPermLane64()) { + // Reduce across the upper and lower 32 lanes. + return buildNonAtomicBinOp( + B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V)); + } + // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. Function *ReadLane = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def new file mode 100644 index 000000000000..0a2cf3874245 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def @@ -0,0 +1,31 @@ +//===--- AMDGPUAttributes.def ---------------------------------*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains descriptions of the various function attributes +// that indicate *absence* of the corresponding implicit kernel +// arguments. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
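// (This .def deliberately has no include guard: it is an X-macro, re-included
// under a different AMDGPU_ATTRIBUTE definition by each consumer. A sketch of
// a typical expansion, mirroring the AMDGPUAttributor.cpp use further below:
//
//   #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
//   enum ImplicitArgumentPositions {
//   #include "AMDGPUAttributes.def"   // DISPATCH_PTR_POS, QUEUE_PTR_POS, ...
//     LAST_ARG_POS
//   };
//
// The attribute entries themselves follow.)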
+ +AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr") +AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr") +AMDGPU_ATTRIBUTE(DISPATCH_ID, "amdgpu-no-dispatch-id") +AMDGPU_ATTRIBUTE(IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr") +AMDGPU_ATTRIBUTE(MULTIGRID_SYNC_ARG, "amdgpu-no-multigrid-sync-arg") +AMDGPU_ATTRIBUTE(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr") +AMDGPU_ATTRIBUTE(HEAP_PTR, "amdgpu-no-heap-ptr") +AMDGPU_ATTRIBUTE(WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x") +AMDGPU_ATTRIBUTE(WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y") +AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z") +AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x") +AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y") +AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z") + +#undef AMDGPU_ATTRIBUTE diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index b4ebc7d7d75f..8de0d7e6bff1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -12,6 +12,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -22,37 +23,25 @@ using namespace llvm; +#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS, + +enum ImplicitArgumentPositions { + #include "AMDGPUAttributes.def" + LAST_ARG_POS +}; + +#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, + enum ImplicitArgumentMask { NOT_IMPLICIT_INPUT = 0, - - // SGPRs - DISPATCH_PTR = 1 << 0, - QUEUE_PTR = 1 << 1, - DISPATCH_ID = 1 << 2, - IMPLICIT_ARG_PTR = 1 << 3, - WORKGROUP_ID_X = 1 << 4, - WORKGROUP_ID_Y = 1 << 5, - WORKGROUP_ID_Z = 1 << 6, - - // VGPRS: - WORKITEM_ID_X = 1 << 7, - WORKITEM_ID_Y = 1 << 8, - WORKITEM_ID_Z = 1 << 9, - ALL_ARGUMENT_MASK = (1 << 10) - 1 + #include "AMDGPUAttributes.def" + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 }; +#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, static constexpr std::pair ImplicitAttrs[] = { - {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, - {QUEUE_PTR, "amdgpu-no-queue-ptr"}, - {DISPATCH_ID, "amdgpu-no-dispatch-id"}, - {IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, - {WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, - {WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"}, - {WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"}, - {WORKITEM_ID_X, "amdgpu-no-workitem-id-x"}, - {WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"}, - {WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"} + #include "AMDGPUAttributes.def" }; // We do not need to note the x workitem or workgroup id because they are always @@ -61,7 +50,9 @@ static constexpr std::pair= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR; + NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5. return QUEUE_PTR; default: return NOT_IMPLICIT_INPUT; @@ -114,7 +115,7 @@ static bool isDSAddress(const Constant *C) { /// Returns true if the function requires the implicit argument be passed /// regardless of the function contents. -static bool funcRequiresImplicitArgPtr(const Function &F) { +static bool funcRequiresHostcallPtr(const Function &F) { // Sanitizers require the hostcall buffer passed in the implicit arguments. return F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::SanitizeThread) || @@ -140,6 +141,12 @@ public: return ST.hasApertureRegs(); } + /// Check if the subtarget supports GetDoorbellID.
+ bool supportsGetDoorbellID(Function &F) { + const GCNSubtarget &ST = TM.getSubtarget(F); + return ST.supportsGetDoorbellID(); + } + std::pair getFlatWorkGroupSizes(const Function &F) { const GCNSubtarget &ST = TM.getSubtarget(F); return ST.getFlatWorkGroupSizes(F); @@ -152,7 +159,7 @@ public: } private: - /// Check if the ConstantExpr \p CE requires the queue ptr attribute. + /// Check if the ConstantExpr \p CE requires the queue pointer. static bool visitConstExpr(const ConstantExpr *CE) { if (CE->getOpcode() == Instruction::AddrSpaceCast) { unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); @@ -186,7 +193,7 @@ private: } public: - /// Returns true if \p Fn needs a queue ptr attribute because of \p C. + /// Returns true if \p Fn needs the queue pointer because of \p C. bool needsQueuePtr(const Constant *C, Function &Fn) { bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv()); bool HasAperture = hasApertureRegs(Fn); @@ -205,7 +212,7 @@ public: } private: - /// Used to determine if the Constant needs a queue ptr attribute. + /// Used to determine if the Constant needs the queue pointer. DenseMap ConstantStatus; }; @@ -353,12 +360,15 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // If the function requires the implicit arg pointer due to sanitizers, // assume it's needed even if explicitly marked as not requiring it. - const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F); - if (NeedsImplicit) + const bool NeedsHostcall = funcRequiresHostcallPtr(*F); + if (NeedsHostcall) { removeAssumedBits(IMPLICIT_ARG_PTR); + removeAssumedBits(HOSTCALL_PTR); + } for (auto Attr : ImplicitAttrs) { - if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR) + if (NeedsHostcall && + (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR)) continue; if (F->hasFnAttribute(Attr.second)) @@ -388,9 +398,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return indicatePessimisticFixpoint(); bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); - auto &InfoCache = static_cast(A.getInfoCache()); - bool NeedsQueuePtr = false; + bool NeedsImplicit = false; + auto &InfoCache = static_cast(A.getInfoCache()); + bool HasApertureRegs = InfoCache.hasApertureRegs(*F); + bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F); for (Function *Callee : AAEdges.getOptimisticEdges()) { Intrinsic::ID IID = Callee->getIntrinsicID(); @@ -403,20 +415,87 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { bool NonKernelOnly = false; ImplicitArgumentMask AttrMask = - intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr); + intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, + HasApertureRegs, SupportsGetDoorbellID); if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); } } - // If we found that we need amdgpu-queue-ptr, nothing else to do. - if (NeedsQueuePtr) { + // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base. + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); + + if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) { + // Under V5, we need implicitarg_ptr + offsets to access private_base or + // shared_base. We do not actually need queue_ptr.
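// (In short: on code object V5 the aperture bases sit at fixed offsets from
// implicitarg_ptr, so an aperture query keeps IMPLICIT_ARG_PTR assumed-needed
// rather than QUEUE_PTR; before V5, and without aperture registers, the queue
// pointer itself is still required.)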
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) + removeAssumedBits(IMPLICIT_ARG_PTR); + else + removeAssumedBits(QUEUE_PTR); + } + + if (funcRetrievesMultigridSyncArg(A)) { + assert(!isAssumed(IMPLICIT_ARG_PTR) && + "multigrid_sync_arg needs implicitarg_ptr"); + removeAssumedBits(MULTIGRID_SYNC_ARG); + } + + if (funcRetrievesHostcallPtr(A)) { + assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr"); + removeAssumedBits(HOSTCALL_PTR); + } + + if (funcRetrievesHeapPtr(A)) { + assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr"); + removeAssumedBits(HEAP_PTR); + } + + if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) { + assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr"); removeAssumedBits(QUEUE_PTR); - return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : - ChangeStatus::UNCHANGED; } + return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; + } + + ChangeStatus manifest(Attributor &A) override { + SmallVector AttrList; + LLVMContext &Ctx = getAssociatedFunction()->getContext(); + + for (auto Attr : ImplicitAttrs) { + if (isKnown(Attr.first)) + AttrList.push_back(Attribute::get(Ctx, Attr.second)); + } + + return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, + /* ForceReplace */ true); + } + + const std::string getAsStr() const override { + std::string Str; + raw_string_ostream OS(Str); + OS << "AMDInfo["; + for (auto Attr : ImplicitAttrs) + OS << ' ' << Attr.second; + OS << " ]"; + return OS.str(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + +private: + bool checkForQueuePtr(Attributor &A) { + Function *F = getAssociatedFunction(); + bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); + + auto &InfoCache = static_cast(A.getInfoCache()); + + bool NeedsQueuePtr = false; + auto CheckAddrSpaceCasts = [&](Instruction &I) { unsigned SrcAS = static_cast(I).getSrcAddressSpace(); if (castRequiresQueuePtr(SrcAS)) { @@ -431,7 +510,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // `checkForAllInstructions` is much more cheaper than going through all // instructions, try it first. - // amdgpu-queue-ptr is not needed if aperture regs is present. + // The queue pointer is not needed if aperture regs is present. if (!HasApertureRegs) { bool UsedAssumedInformation = false; A.checkForAllInstructions(CheckAddrSpaceCasts, *this, @@ -439,61 +518,79 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { UsedAssumedInformation); } - // If we found that we need amdgpu-queue-ptr, nothing else to do. - if (NeedsQueuePtr) { - removeAssumedBits(QUEUE_PTR); - return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : - ChangeStatus::UNCHANGED; - } + // If we found that we need the queue pointer, nothing else to do. + if (NeedsQueuePtr) + return true; - if (!IsNonEntryFunc && HasApertureRegs) { - return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : - ChangeStatus::UNCHANGED; - } + if (!IsNonEntryFunc && HasApertureRegs) + return false; for (BasicBlock &BB : *F) { for (Instruction &I : BB) { for (const Use &U : I.operands()) { if (const auto *C = dyn_cast(U)) { - if (InfoCache.needsQueuePtr(C, *F)) { - removeAssumedBits(QUEUE_PTR); - return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : - ChangeStatus::UNCHANGED; - } + if (InfoCache.needsQueuePtr(C, *F)) + return true; } } } } - return getAssumed() != OrigAssumed ? 
ChangeStatus::CHANGED : - ChangeStatus::UNCHANGED; + return false; } - ChangeStatus manifest(Attributor &A) override { - SmallVector AttrList; - LLVMContext &Ctx = getAssociatedFunction()->getContext(); + bool funcRetrievesMultigridSyncArg(Attributor &A) { + auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(); + AAPointerInfo::OffsetAndSize OAS(Pos, 8); + return funcRetrievesImplicitKernelArg(A, OAS); + } - for (auto Attr : ImplicitAttrs) { - if (isKnown(Attr.first)) - AttrList.push_back(Attribute::get(Ctx, Attr.second)); - } + bool funcRetrievesHostcallPtr(Attributor &A) { + auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(); + AAPointerInfo::OffsetAndSize OAS(Pos, 8); + return funcRetrievesImplicitKernelArg(A, OAS); + } - return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, - /* ForceReplace */ true); + bool funcRetrievesHeapPtr(Attributor &A) { + if (AMDGPU::getAmdhsaCodeObjectVersion() != 5) + return false; + AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8); + return funcRetrievesImplicitKernelArg(A, OAS); } - const std::string getAsStr() const override { - std::string Str; - raw_string_ostream OS(Str); - OS << "AMDInfo["; - for (auto Attr : ImplicitAttrs) - OS << ' ' << Attr.second; - OS << " ]"; - return OS.str(); + bool funcRetrievesQueuePtr(Attributor &A) { + if (AMDGPU::getAmdhsaCodeObjectVersion() != 5) + return false; + AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8); + return funcRetrievesImplicitKernelArg(A, OAS); } - /// See AbstractAttribute::trackStatistics() - void trackStatistics() const override {} + bool funcRetrievesImplicitKernelArg(Attributor &A, + AAPointerInfo::OffsetAndSize OAS) { + // Check if this is a call to the implicitarg_ptr builtin and it + // is used to retrieve the hostcall pointer. The implicit arg for + // hostcall is not used only if every use of the implicitarg_ptr + // is a load that clearly does not retrieve any byte of the + // hostcall pointer. We check this by tracing all the uses of the + // initial call to the implicitarg_ptr intrinsic. 
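// (A hedged sketch of the byte-range test this tracing reduces to; ByteRange
// and overlaps() are illustrative helpers, not the AAPointerInfo API:
//
//   struct ByteRange { uint64_t Offset, Size; };
//   static bool overlaps(ByteRange A, ByteRange B) {
//     return A.Offset < B.Offset + B.Size && B.Offset < A.Offset + A.Size;
//   }
//
// An 8-byte load at implicitarg_ptr+Off only implicates a hidden argument at
// position Pos when [Off, Off+8) overlaps [Pos, Pos+8).)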
+ auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) { + auto &Call = cast(I); + if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr) + return true; + + const auto &PointerInfoAA = A.getAAFor( + *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED); + + return PointerInfoAA.forallInterferingAccesses( + OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) { + return Acc.getRemoteInst()->isDroppable(); + }); + }; + + bool UsedAssumedInformation = false; + return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this, + UsedAssumedInformation); + } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, @@ -646,9 +743,14 @@ public: AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM); DenseSet Allowed( {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, - &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID}); + &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID}); + + AttributorConfig AC(CGUpdater); + AC.Allowed = &Allowed; + AC.IsModulePass = true; + AC.DefaultInitializeLiveInternals = false; - Attributor A(Functions, InfoCache, CGUpdater, &Allowed); + Attributor A(Functions, InfoCache, AC); for (Function &F : M) { if (!F.isIntrinsic()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index cd084fd5440a..fd812eb676ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define DEBUG_TYPE "amdgpu-call-lowering" @@ -349,7 +350,6 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, FunctionLoweringInfo &FLI) const { MachineFunction &MF = B.getMF(); - MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo(); MFI->setIfReturnsVoid(!Val); @@ -365,40 +365,15 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, return true; } - auto const &ST = MF.getSubtarget(); - - unsigned ReturnOpc = 0; - if (IsShader) - ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG; - else if (CC == CallingConv::AMDGPU_Gfx) - ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx; - else - ReturnOpc = AMDGPU::S_SETPC_B64_return; - + unsigned ReturnOpc = + IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN; auto Ret = B.buildInstrNoInsert(ReturnOpc); - Register ReturnAddrVReg; - if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { - ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass); - Ret.addUse(ReturnAddrVReg); - } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) { - ReturnAddrVReg = - MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass); - Ret.addUse(ReturnAddrVReg); - } if (!FLI.CanLowerReturn) insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister); else if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; - if (ReturnOpc == AMDGPU::S_SETPC_B64_return || - ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) { - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF), - &AMDGPU::SGPR_64RegClass); - B.buildCopy(ReturnAddrVReg, LiveInReturn); - } - // TODO: Handle CalleeSavedRegsViaCopy. 
B.insertInstr(Ret); @@ -479,7 +454,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -523,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( const SITargetLowering &TLI = *getTLI(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateModuleLDSGlobal(F.getParent()); + Info->allocateModuleLDSGlobal(F); SmallVector ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -543,9 +518,8 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( if (AllocSize == 0) continue; - MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None; - if (!ABIAlign) - ABIAlign = DL.getABITypeAlign(ArgTy); + MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None; + Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; @@ -608,19 +582,11 @@ bool AMDGPUCallLowering::lowerFormalArguments( const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateModuleLDSGlobal(F.getParent()); + Info->allocateModuleLDSGlobal(F); SmallVector ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); - if (!IsEntryFunc) { - Register ReturnAddrReg = TRI->getReturnAddressReg(MF); - Register LiveInReturn = MF.addLiveIn(ReturnAddrReg, - &AMDGPU::SGPR_64RegClass); - MBB.addLiveIn(ReturnAddrReg); - B.buildCopy(LiveInReturn, ReturnAddrReg); - } - if (Info->hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 1682d43ae671..b6c66077675f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -148,53 +148,32 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs< (sequence "VGPR%u", 248, 255)) >; -def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs< +def CSR_AMDGPU_AGPRs : CalleeSavedRegs< (sequence "AGPR%u", 32, 255) >; -def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< - (sequence "SGPR%u", 32, 105) +def CSR_AMDGPU_SGPRs : CalleeSavedRegs< + (sequence "SGPR%u", 30, 105) >; -def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs< - (sequence "SGPR%u", 4, 29) +def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs< + (add (sequence "SGPR%u", 4, 31), (sequence "SGPR%u", 64, 105)) >; -def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs< - (sequence "SGPR%u", 64, 105) +def CSR_AMDGPU : CalleeSavedRegs< + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs) >; -// Just to get the regmask, not for calling convention purposes. -def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< - (sequence "VGPR%u", 0, 255) ->; - -def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs< - (sequence "AGPR%u", 0, 255) ->; -def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs< - (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs) ->; - -// Just to get the regmask, not for calling convention purposes. 
-def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< - (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) ->; - -def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) ->; - -def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs< - (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255) +def CSR_AMDGPU_GFX90AInsts : CalleeSavedRegs< + (add CSR_AMDGPU, CSR_AMDGPU_AGPRs) >; def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105) + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs) >; -def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs< - (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255) +def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs< + (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs) >; def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; @@ -233,3 +212,24 @@ def CC_AMDGPU : CallingConv<[ "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", CCDelegateTo> ]>; + +// Trivial class to denote when a def is used only to get a RegMask, i.e. +// SaveList is ignored and the def is not used as part of any calling +// convention. +class RegMask : CalleeSavedRegs; + +def AMDGPU_AllVGPRs : RegMask< + (sequence "VGPR%u", 0, 255) +>; + +def AMDGPU_AllAGPRs : RegMask< + (sequence "AGPR%u", 0, 255) +>; + +def AMDGPU_AllVectorRegs : RegMask< + (add AMDGPU_AllVGPRs, AMDGPU_AllAGPRs) +>; + +def AMDGPU_AllAllocatableSRegs : RegMask< + (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) +>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 1920684d8f1f..94d7844e8a32 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -877,7 +877,7 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { return getMul64(Builder, LHS, RHS).second; } -/// Figure out how many bits are really needed for this ddivision. \p AtLeast is +/// Figure out how many bits are really needed for this division. \p AtLeast is /// an optimization hint to bypass the second ComputeNumSignBits call if we the /// first one is insufficient. Returns -1 on failure. 
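/// (For example, a 32-bit udiv whose operands each need at most 24 bits can
/// be expanded through the faster 24-bit multiply-based path; the \p AtLeast
/// threshold lets the caller skip the second known-bits query once the first
/// operand has already exceeded it.)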
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index e79ff9b597c9..c16d8ee51a7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -373,7 +373,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, replaceRegWith(MRI, Dst, NegatedMatchInfo); // Recreate non negated value for other uses of old MatchInfoDst - Builder.setInstrAndDebugLoc(MI); + auto NextInst = ++MatchInfo->getIterator(); + Builder.setInstrAndDebugLoc(*NextInst); Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index 04bf623bfa46..8fcf669041b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -50,7 +50,7 @@ public: } bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) { - if (!GV) + if (!GV || !GV->hasInitializer()) return false; ConstantArray *GA = dyn_cast(GV->getInitializer()); if (!GA || GA->getNumOperands() == 0) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp index bed0707f3aa7..8236ff609f85 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp @@ -22,7 +22,7 @@ namespace { class ExportClustering : public ScheduleDAGMutation { public: - ExportClustering() {} + ExportClustering() = default; void apply(ScheduleDAGInstrs *DAG) override; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp deleted file mode 100644 index ea6c6d0fd212..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Promote indirect (bitcast) calls to direct calls when they are statically -/// known to be direct. Required when InstCombine is not run (e.g. at OptNone) -/// because AMDGPU does not support indirect calls. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/CallPromotionUtils.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-fix-function-bitcasts" - -namespace { -class AMDGPUFixFunctionBitcasts final - : public ModulePass, - public InstVisitor { - - bool runOnModule(Module &M) override; - - bool Modified; - -public: - void visitCallBase(CallBase &CB) { - if (CB.getCalledFunction()) - return; - auto *Callee = - dyn_cast(CB.getCalledOperand()->stripPointerCasts()); - if (Callee && isLegalToPromote(CB, Callee)) { - promoteCall(CB, Callee); - Modified = true; - } - } - - static char ID; - AMDGPUFixFunctionBitcasts() : ModulePass(ID) {} -}; -} // End anonymous namespace - -char AMDGPUFixFunctionBitcasts::ID = 0; -char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID; -INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE, - "Fix function bitcasts for AMDGPU", false, false) - -ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() { - return new AMDGPUFixFunctionBitcasts(); -} - -bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) { - Modified = false; - visit(M); - return Modified; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7fd94a977be7..5747fc0ca8e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -47,10 +47,30 @@ def gi_vop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3pmodsdot : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_dotiuvop3pmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmaopselvop3pmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vinterpmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_vinterpmods_hi : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods? def gi_vop3opsel : GIComplexOperandMatcher, @@ -93,6 +113,10 @@ def gi_flat_scratch_saddr : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_flat_scratch_svaddr : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_ds_1addr_1offset : GIComplexOperandMatcher, GIComplexPatternEquiv; @@ -123,7 +147,7 @@ def gi_smrd_buffer_imm32 : // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization -// directly before before selecting a glue-less load, so hide this +// directly before selecting a glue-less load, so hide this // distinction. 
def : GINodeEquiv { @@ -222,6 +246,9 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index cabdc6998011..1bbdc39a7a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPUGlobalISelUtils.h" +#include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/IR/Constants.h" +#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; using namespace MIPatternMatch; @@ -66,3 +68,12 @@ bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef Mask) { return true; return (Mask[0] & 2) == (Mask[1] & 2); } + +bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, + const LLT &Ty) { + if (Ty == LLT::scalar(32)) + return Subtarget.hasAtomicFaddRtnInsts(); + if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64)) + return Subtarget.hasGFX90AInsts(); + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 14d3a3fb7997..5c600d059b7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -16,6 +16,8 @@ namespace llvm { class MachineRegisterInfo; +class GCNSubtarget; +class LLT; namespace AMDGPU { @@ -24,7 +26,7 @@ std::pair getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg); bool isLegalVOP3PShuffleMask(ArrayRef Mask); - +bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index f5018e3a19ac..6fa44ffcbfaa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -400,17 +400,15 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); - // Emit "printf buffer" argument if printf is used, otherwise emit dummy - // "none" argument. if (HiddenArgNumBytes >= 32) { + // We forbid the use of features requiring hostcall when compiling OpenCL + // before code object V5, which makes the mutual exclusion between the + // "printf buffer" and "hostcall buffer" here sound. if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer); - else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { - // The printf runtime binding pass should have ensured that hostcall and - // printf are not used in the same module. - assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); + else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer); - } else + else emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } @@ -427,8 +425,12 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, } // Emit the pointer argument for multi-grid object. 
- if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); + if (HiddenArgNumBytes >= 56) { + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); + else + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); + } } bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { @@ -803,6 +805,8 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, auto &DL = M->getDataLayout(); auto Int64Ty = Type::getInt64Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); + if (HiddenArgNumBytes >= 8) emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); @@ -816,19 +820,17 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); - // Emit "printf buffer" argument if printf is used, emit "hostcall buffer" - // if "hostcall" module flag is set, otherwise emit dummy "none" argument. if (HiddenArgNumBytes >= 32) { + // We forbid the use of features requiring hostcall when compiling OpenCL + // before code object V5, which makes the mutual exclusion between the + // "printf buffer" and "hostcall buffer" here sound. if (M->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, Args); - else if (M->getModuleFlag("amdgpu_hostcall")) { - // The printf runtime binding pass should have ensured that hostcall and - // printf are not used in the same module. - assert(!M->getNamedMetadata("llvm.printf.fmts")); + else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, Args); - } else + else emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } @@ -847,9 +849,14 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, } // Emit the pointer argument for multi-grid object. - if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, - Args); + if (HiddenArgNumBytes >= 56) { + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + Args); + } else { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + } + } } msgpack::MapDocNode @@ -876,6 +883,12 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); + + // Only add AGPR count to metadata for supported devices + if (STM.hasMAIInsts()) { + Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR); + } + Kern[".max_flat_workgroup_size"] = Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); Kern[".sgpr_spill_count"] = @@ -971,13 +984,20 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF, msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); + + // No implicit kernel argument is used. 
+ if (ST.getImplicitArgNumBytes(Func) == 0) + return; + const Module *M = Func.getParent(); auto &DL = M->getDataLayout(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); auto Int64Ty = Type::getInt64Ty(Func.getContext()); auto Int32Ty = Type::getInt32Ty(Func.getContext()); auto Int16Ty = Type::getInt16Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args); @@ -1008,40 +1028,49 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF, if (M->getNamedMetadata("llvm.printf.fmts")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - if (M->getModuleFlag("amdgpu_hostcall")) { + if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, Args); + } else { + Offset += 8; // Skipped. + } - // Ignore temporarily until it is implemented. - // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); - Offset += 8; + if (!Func.hasFnAttribute("amdgpu-no-heap-ptr")) + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); + else + Offset += 8; // Skipped. if (Func.hasFnAttribute("calls-enqueue-kernel")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, Args); emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); - } else + } else { Offset += 16; // Skipped. + } Offset += 72; // Reserved. - // hidden_private_base and hidden_shared_base are only used by GFX8. - if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // hidden_private_base and hidden_shared_base are only used when the subtarget + // does not have ApertureRegs. + if (!ST.hasApertureRegs()) { emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - const SIMachineFunctionInfo &MFI = *MF.getInfo(); if (MFI.hasQueuePtr()) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index bcf7fc449094..9b22d1f4d1b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -42,7 +42,7 @@ namespace HSAMD { class MetadataStreamer { public: - virtual ~MetadataStreamer(){}; + virtual ~MetadataStreamer() = default; virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp new file mode 100644 index 000000000000..5c507ef70a8c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -0,0 +1,439 @@ +//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file This file defines a set of schedule DAG mutations that can be used to +// override default scheduler behavior to enforce specific scheduling patterns. +// They should be used in cases where runtime performance considerations, such +// as inter-wavefront interactions, mean that compile-time heuristics cannot +// predict the optimal instruction ordering, or in kernels where optimum +// instruction scheduling is important enough to warrant manual intervention. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUIGroupLP.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { + +static cl::opt + EnableIGroupLP("amdgpu-igrouplp", + cl::desc("Enable construction of Instruction Groups and " + "their ordering for scheduling"), + cl::init(false)); + +static cl::opt> + VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in VMEM group.")); + +static cl::opt> + MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in MFMA group.")); + +static cl::opt> + LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds read group.")); + +static cl::opt> + LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds write group.")); + +typedef function_ref + CanAddMIFn; + +// Classify instructions into groups to enable fine-tuned control over the +// scheduler. These groups may be more specific than current SchedModel +// instruction classes. +class SchedGroup { +private: + // Function that returns true if a non-bundle MI may be inserted into this + // group. + const CanAddMIFn canAddMI; + + // Maximum number of SUnits that can be added to this group. + Optional MaxSize; + + // Collection of SUnits that are classified as members of this group. + SmallVector Collection; + + ScheduleDAGInstrs *DAG; + + void tryAddEdge(SUnit *A, SUnit *B) { + if (A != B && DAG->canAddEdge(B, A)) { + DAG->addEdge(B, SDep(A, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "Adding edge...\n" + << "from: SU(" << A->NodeNum << ") " << *A->getInstr() + << "to: SU(" << B->NodeNum << ") " << *B->getInstr()); + } + } + +public: + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If + // MakePred is true, SU will be a predecessor of the SUnits in this + // SchedGroup, otherwise SU will be a successor. + void link(SUnit &SU, bool MakePred = false) { + for (auto A : Collection) { + SUnit *B = &SU; + if (MakePred) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use + // the predicate to determine whether SU should be a predecessor (P = true) + // or a successor (P = false) of this SchedGroup.
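  // (Illustrative use of the predicate form, as the SCHED_BARRIER mutation
  // below does with DAG positions:
  //
  //   SG.link(SU, [](const SUnit *A, const SUnit *B) {
  //     return A->NodeNum > B->NodeNum; // swap so the earlier node is the source
  //   });
  //
  // i.e. the predicate decides, per pair, which node becomes the predecessor.)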
+ void link(SUnit &SU, function_ref P) { + for (auto A : Collection) { + SUnit *B = &SU; + if (P(A, B)) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies such that SUnits in this group shall be ordered + // before SUnits in OtherGroup. + void link(SchedGroup &OtherGroup) { + for (auto B : OtherGroup.Collection) + link(*B); + } + + // Returns true if no more instructions may be added to this group. + bool isFull() { return MaxSize && Collection.size() >= *MaxSize; } + + // Returns true if SU can be added to this SchedGroup. + bool canAddSU(SUnit &SU, const SIInstrInfo *TII) { + if (isFull()) + return false; + + MachineInstr &MI = *SU.getInstr(); + if (MI.getOpcode() != TargetOpcode::BUNDLE) + return canAddMI(MI, TII); + + // Special case for bundled MIs. + const MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; + while (E != MBB->end() && E->isBundledWithPred()) + ++E; + + // Return true if all of the bundled MIs can be added to this group. + return std::all_of( + B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); }); + } + + void add(SUnit &SU) { Collection.push_back(&SU); } + + SchedGroup(CanAddMIFn canAddMI, Optional MaxSize, + ScheduleDAGInstrs *DAG) + : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {} +}; + +bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isMFMA(MI); +} + +bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVALU(MI) && !TII->isMFMA(MI); +} + +bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isSALU(MI); +} + +bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); +} + +bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && TII->isDS(MI); +} + +bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && TII->isDS(MI); +} + +class IGroupLPDAGMutation : public ScheduleDAGMutation { +public: + const SIInstrInfo *TII; + ScheduleDAGMI *DAG; + + IGroupLPDAGMutation() = default; + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +// DAG mutation that coordinates with the SCHED_BARRIER instruction and +// corresponding builtin. The mutation adds edges from specific instruction +// classes determined by the SCHED_BARRIER mask so that they cannot be +// scheduled around the SCHED_BARRIER. +class SchedBarrierDAGMutation : public ScheduleDAGMutation { +private: + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + // Components of the mask that determines which instructions may not be + // scheduled across the SCHED_BARRIER. + enum class SchedBarrierMasks { + NONE = 0u, + ALU = 1u << 0, + VALU = 1u << 1, + SALU = 1u << 2, + MFMA = 1u << 3, + VMEM = 1u << 4, + VMEM_READ = 1u << 5, + VMEM_WRITE = 1u << 6, + DS = 1u << 7, + DS_READ = 1u << 8, + DS_WRITE = 1u << 9, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE) + }; + + // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a + // region. 
+ // + std::unique_ptr MFMASchedGroup = nullptr; + std::unique_ptr VALUSchedGroup = nullptr; + std::unique_ptr SALUSchedGroup = nullptr; + std::unique_ptr VMEMReadSchedGroup = nullptr; + std::unique_ptr VMEMWriteSchedGroup = nullptr; + std::unique_ptr DSWriteSchedGroup = nullptr; + std::unique_ptr DSReadSchedGroup = nullptr; + + // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should + // not be reordered across the SCHED_BARRIER. + void getSchedGroupsFromMask(int32_t Mask, + SmallVectorImpl &SchedGroups); + + // Add DAG edges that enforce SCHED_BARRIER ordering. + void addSchedBarrierEdges(SUnit &SU); + + // Classify instructions and add them to the SchedGroup. + void initSchedGroup(SchedGroup *SG); + + // Remove all existing edges from a SCHED_BARRIER. + void resetSchedBarrierEdges(SUnit &SU); + +public: + void apply(ScheduleDAGInstrs *DAGInstrs) override; + + SchedBarrierDAGMutation() = default; +}; + +void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); + TII = ST.getInstrInfo(); + DAG = static_cast(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); + + // The order of InstructionGroups in this vector defines the + // order in which edges will be added. In other words, given the + // present ordering, we will try to make each VMEMRead instruction + // a predecessor of each DSRead instruction, and so on. + SmallVector PipelineOrderGroups = { + SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), + SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), + SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), + SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + + for (SUnit &SU : DAG->SUnits) { + LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); + for (auto &SG : PipelineOrderGroups) + if (SG.canAddSU(SU, TII)) + SG.add(SU); + } + + for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) { + auto &GroupA = PipelineOrderGroups[i]; + for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) { + auto &GroupB = PipelineOrderGroups[j]; + GroupA.link(GroupB); + } + } +} + +void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAGInstrs->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); + + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); + TII = ST.getInstrInfo(); + DAG = static_cast(DAGInstrs); + for (auto &SU : DAG->SUnits) + if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + addSchedBarrierEdges(SU); +} + +void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { + MachineInstr &MI = *SchedBarrier.getInstr(); + assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); + // Remove all existing edges from the SCHED_BARRIER that were added due to the + // instruction having side effects.
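  // (Worked example of the mask handling in getSchedGroupsFromMask() below: a
  // set bit means that class may move across the barrier, so a SchedGroup is
  // built only for classes whose own bit and covering ALU/VMEM/DS bit are both
  // clear. E.g. Mask = ALU | VMEM_READ = 0b100001 exempts the ALU classes and
  // VMEM reads, while VMEM writes, DS reads and DS writes all get edges.)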
+ resetSchedBarrierEdges(SchedBarrier); + SmallVector SchedGroups; + int32_t Mask = MI.getOperand(0).getImm(); + getSchedGroupsFromMask(Mask, SchedGroups); + for (auto SG : SchedGroups) + SG->link( + SchedBarrier, (function_ref)[]( + const SUnit *A, const SUnit *B) { + return A->NodeNum > B->NodeNum; + }); +} + +void SchedBarrierDAGMutation::getSchedGroupsFromMask( + int32_t Mask, SmallVectorImpl &SchedGroups) { + SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; + // See IntrinsicsAMDGPU.td for an explanation of these masks and their + // mappings. + // + if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!VALUSchedGroup) { + VALUSchedGroup = std::make_unique(isVALUSGMember, None, DAG); + initSchedGroup(VALUSchedGroup.get()); + } + + SchedGroups.push_back(VALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!SALUSchedGroup) { + SALUSchedGroup = std::make_unique(isSALUSGMember, None, DAG); + initSchedGroup(SALUSchedGroup.get()); + } + + SchedGroups.push_back(SALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!MFMASchedGroup) { + MFMASchedGroup = std::make_unique(isMFMASGMember, None, DAG); + initSchedGroup(MFMASchedGroup.get()); + } + + SchedGroups.push_back(MFMASchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMReadSchedGroup) { + VMEMReadSchedGroup = + std::make_unique(isVMEMReadSGMember, None, DAG); + initSchedGroup(VMEMReadSchedGroup.get()); + } + + SchedGroups.push_back(VMEMReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMWriteSchedGroup) { + VMEMWriteSchedGroup = + std::make_unique(isVMEMWriteSGMember, None, DAG); + initSchedGroup(VMEMWriteSchedGroup.get()); + } + + SchedGroups.push_back(VMEMWriteSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSReadSchedGroup) { + DSReadSchedGroup = + std::make_unique(isDSReadSGMember, None, DAG); + initSchedGroup(DSReadSchedGroup.get()); + } + + SchedGroups.push_back(DSReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSWriteSchedGroup) { + DSWriteSchedGroup = + std::make_unique(isDSWriteSGMember, None, DAG); + initSchedGroup(DSWriteSchedGroup.get()); + } + + SchedGroups.push_back(DSWriteSchedGroup.get()); + } +} + +void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) { + assert(SG); + for (auto &SU : DAG->SUnits) + if (SG->canAddSU(SU, TII)) + SG->add(SU); +} + +void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) { + assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER); + for (auto &P : SU.Preds) + SU.removePred(P); + + for (auto &S : SU.Succs) { + for (auto &SP : S.getSUnit()->Preds) { + if (SP.getSUnit() == &SU) { + S.getSUnit()->removePred(SP); + } + } + } +} + +} // namespace + +namespace llvm { + +std::unique_ptr createIGroupLPDAGMutation() { + return EnableIGroupLP ? 
std::make_unique() : nullptr; +} + +std::unique_ptr createSchedBarrierDAGMutation() { + return std::make_unique(); +} + +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h new file mode 100644 index 000000000000..aeb1bbad3705 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -0,0 +1,22 @@ +//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include + +namespace llvm { + +std::unique_ptr createIGroupLPDAGMutation(); +std::unique_ptr createSchedBarrierDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8236e6672247..b00df27f5fd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,7 +13,9 @@ #include "AMDGPUISelDAGToDAG.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600RegisterInfo.h" #include "SIMachineFunctionInfo.h" @@ -679,9 +681,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::FMA: SelectFMAD_FMA(N); return; - case AMDGPUISD::ATOMIC_CMP_SWAP: - SelectATOMIC_CMP_SWAP(N); - return; case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_PKNORM_I16_F32: case AMDGPUISD::CVT_PKNORM_U16_F32: @@ -1008,7 +1007,12 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; - unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else + Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), @@ -1021,7 +1025,12 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; - unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else + Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); @@ -1798,6 +1807,82 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, return true; } +// Check whether the flat scratch SVS swizzle bug affects this access. 
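// (The check is deliberately conservative, working from known-bits maxima: a
// carry out of bit 1 cannot be ruled out whenever the low two bits of the
// voffset maximum and of the (soffset + inst_offset) maximum can sum to 4 or
// more. For example, VMax & 3 == 3 and SMax & 3 == 1 gives 3 + 1 >= 4, so the
// access is treated as affected.)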
+bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( + SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + KnownBits VKnown = CurDAG->computeKnownBits(VAddr); + KnownBits SKnown = KnownBits::computeForAddSub( + true, false, CurDAG->computeKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + +bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, + SDValue &VAddr, SDValue &SAddr, + SDValue &Offset) const { + int64_t ImmOffset = 0; + + SDValue LHS, RHS; + if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { + int64_t COffsetVal = cast(RHS)->getSExtValue(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + + if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { + Addr = LHS; + ImmOffset = COffsetVal; + } else if (!LHS->isDivergent() && COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) + = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VAddr = SDValue(VMov, 0); + SAddr = LHS; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) + return false; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } + } + } + + if (Addr.getOpcode() != ISD::ADD) + return false; + + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); + + if (!LHS->isDivergent() && RHS->isDivergent()) { + SAddr = LHS; + VAddr = RHS; + } else if (!RHS->isDivergent() && LHS->isDivergent()) { + SAddr = RHS; + VAddr = LHS; + } else { + return false; + } + + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) + return false; + SAddr = SelectSAddrFI(CurDAG, SAddr); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const { ConstantSDNode *C = dyn_cast(ByteOffsetNode); @@ -2224,70 +2309,6 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { } } -// This is here because there isn't a way to use the generated sub0_sub1 as the -// subreg index to EXTRACT_SUBREG in tablegen. -void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { - MemSDNode *Mem = cast(N); - unsigned AS = Mem->getAddressSpace(); - if (AS == AMDGPUAS::FLAT_ADDRESS) { - SelectCode(N); - return; - } - - MVT VT = N->getSimpleValueType(0); - bool Is32 = (VT == MVT::i32); - SDLoc SL(N); - - MachineSDNode *CmpSwap = nullptr; - if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset; - - if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) { - unsigned Opcode = Is32 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; - SDValue CmpVal = Mem->getOperand(2); - SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); - - // XXX - Do we care about glue operands? - - SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol, - Mem->getChain()}; - - CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); - } - } - - if (!CmpSwap) { - SDValue SRsrc, SOffset, Offset; - if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; - - SDValue CmpVal = Mem->getOperand(2); - SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); - SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()}; - - CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); - } - } - - if (!CmpSwap) { - SelectCode(N); - return; - } - - MachineMemOperand *MMO = Mem->getMemOperand(); - CurDAG->setNodeMemRefs(CmpSwap, {MMO}); - - unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - SDValue Extract - = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); - - ReplaceUses(SDValue(N, 0), Extract); - ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); - CurDAG->RemoveDeadNode(N); -} - void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // The address is assumed to be uniform, so if it ends up in a VGPR, it will // be copied to an SGPR with readfirstlane. @@ -2587,6 +2608,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { return true; } +bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src, + SDValue &SrcMods, + bool OpSel) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (OpSel) + Mods |= SISrcMods::OP_SEL_0; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false); +} + +bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { @@ -2619,7 +2664,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, } bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { + SDValue &SrcMods, bool IsDOT) const { unsigned Mods = 0; Src = In; @@ -2628,7 +2673,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { unsigned VecMods = Mods; SDValue Lo = stripBitcast(Src.getOperand(0)); @@ -2716,6 +2762,40 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVOP3PMods(In, Src, SrcMods, true); +} + +bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. 
+ // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getAPIntValue().getZExtValue(); + if (SrcSign == 1) + Mods ^= SISrcMods::NEG; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, + SDValue &Src) const { + const ConstantSDNode *C = cast(In); + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcVal = C->getAPIntValue().getZExtValue(); + if (SrcVal == 1) + Mods |= SISrcMods::OP_SEL_0; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; @@ -2840,7 +2920,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { } } } - // If "AllUsesAcceptSReg == false" so far we haven't suceeded + // If "AllUsesAcceptSReg == false" so far we haven't succeeded // commuting current user. This means have at least one use // that strictly require VGPR. Thus, we will not attempt to commute // other user instructions. @@ -2854,26 +2934,15 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { auto Ld = cast(N); - return Ld->getAlignment() >= 4 && - ( - ( - ( - Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || - Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT - ) - && - !N->isDivergent() - ) - || - ( - Subtarget->getScalarizeGlobalBehavior() && - Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - Ld->isSimple() && - !N->isDivergent() && - static_cast( - getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) - ) - ); + return Ld->getAlign() >= Align(4) && + (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + !N->isDivergent()) || + (Subtarget->getScalarizeGlobalBehavior() && + Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + Ld->isSimple() && !N->isDivergent() && + static_cast(getTargetLowering()) + ->isMemOpHasNoClobberedMemOperand(N))); } void AMDGPUDAGToDAGISel::PostprocessISelDAG() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index d638d9877a9b..862be9dc5568 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -188,6 +188,10 @@ private: SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; + bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, + uint64_t ImmOffset) const; + bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &SAddr, SDValue &Offset) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -214,10 +218,20 @@ private: bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods, + bool OpSel) const; + bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue 
&SrcMods) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, + bool IsDOT = false) const; + bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -245,7 +259,6 @@ private: bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); void SelectFMAD_FMA(SDNode *N); - void SelectATOMIC_CMP_SWAP(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectInterpP1F16(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b9d0655feef7..ef7929012597 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" @@ -127,49 +128,27 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT, + Expand); for (MVT VT : MVT::integer_valuetypes()) { if (VT == MVT::i64) continue; - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(Op, VT, MVT::i1, Promote); + setLoadExtAction(Op, VT, MVT::i8, Legal); + setLoadExtAction(Op, VT, MVT::i16, Legal); + setLoadExtAction(Op, VT, MVT::i32, Expand); + } } - for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); - 
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + for (auto MemVT : + {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16}) + setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT, + Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); @@ -304,229 +283,125 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal); + setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand); // This is totally unsupported, just custom lower to produce an error. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); // Library functions. These default to Expand, but we have instructions // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS, + ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, + ISD::FMAXNUM}, + MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); - setOperationAction(ISD::FLOG, MVT::f32, Custom); - setOperationAction(ISD::FLOG10, MVT::f32, Custom); - setOperationAction(ISD::FEXP, MVT::f32, Custom); + setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, {MVT::f32, MVT::f64}, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setOperationAction(ISD::FREM, MVT::f16, Custom); - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); + setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); // Expand to fneg + fadd. 
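A note on the pattern running through these AMDGPUISelLowering.cpp hunks (including the FSUB expansion right after this aside): repeated one-op-at-a-time calls are collapsed into the ArrayRef-taking overloads of setOperationAction/setLoadExtAction, which simply loop over the list. A self-contained toy model of the two styles (toy enums and table, not LLVM's real signatures):

    #include <cstdio>
    #include <initializer_list>

    enum Op { SDIV, UDIV, SREM, UREM };
    enum Action { Legal = 0, Expand = 1 };

    static Action Table[4];

    void setOperationAction(Op O, Action A) { Table[O] = A; }
    // The list form is sugar for the per-op loop.
    void setOperationAction(std::initializer_list<Op> Ops, Action A) {
      for (Op O : Ops)
        setOperationAction(O, A);
    }

    int main() {
      setOperationAction({SDIV, UDIV, SREM, UREM}, Expand);
      std::printf("SREM action: %d\n", Table[SREM]); // prints 1 (Expand)
    }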
setOperationAction(ISD::FSUB, MVT::f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, + {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, + MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + Custom); + setOperationAction( + ISD::EXTRACT_SUBVECTOR, + {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, + MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, + MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, + MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16, + MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, + MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, 
+ MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, + Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { // These should use [SU]DIVREM, so set them to expand - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); + setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT, + Expand); // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom); // GPU does not have [S|U]MUL_LOHI functions as a single instruction. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand); // AMDGPU uses ADDC/SUBC/ADDE/SUBE - setOperationAction(ISD::ADDC, VT, Legal); - setOperationAction(ISD::SUBC, VT, Legal); - setOperationAction(ISD::ADDE, VT, Legal); - setOperationAction(ISD::SUBE, VT, Legal); + setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal); } // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); // The hardware supports 32-bit ROTR, but not ROTL. 
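The comment closing the chunk above explains the asymmetry the next hunk encodes: marking ROTL Expand while ROTR stays available lets legalization rewrite a left rotate as a right rotate by the complementary amount. A standalone sketch of that identity (ordinary C++, not the in-tree lowering):

    #include <cassert>
    #include <cstdint>

    uint32_t rotr32(uint32_t X, unsigned N) {
      N &= 31;
      return (X >> N) | (X << ((32 - N) & 31));
    }

    // rotl(x, n) == rotr(x, (32 - n) mod 32)
    uint32_t rotl32(uint32_t X, unsigned N) {
      return rotr32(X, (32 - (N & 31)) & 31);
    }

    int main() {
      assert(rotl32(0x80000001u, 1) == 0x00000003u);
      return 0;
    }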
- setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i16, Expand); - setOperationAction(ISD::MULHS, MVT::i16, Expand); + setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); + setOperationAction( + {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SMIN, MVT::i32, Legal); - setOperationAction(ISD::UMIN, MVT::i32, Legal); - setOperationAction(ISD::SMAX, MVT::i32, Legal); - setOperationAction(ISD::UMAX, MVT::i32, Legal); + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, + Legal); - setOperationAction(ISD::CTTZ, MVT::i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); - setOperationAction(ISD::CTLZ, MVT::i64, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + setOperationAction( + {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, + MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32}; for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. 
- setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::AND, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); - setOperationAction(ISD::MULHS, VT, Expand); - setOperationAction(ISD::OR, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::XOR, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, + ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, + ISD::MULHS, ISD::OR, ISD::SHL, + ISD::SRA, ISD::SRL, ISD::ROTL, + ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, + ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, + ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, + ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, + ISD::XOR, ISD::BSWAP, ISD::CTPOP, + ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, + ISD::SETCC}, + VT, Expand); } static const MVT::SimpleValueType FloatVectorTypes[] = { MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32}; for (MVT VT : FloatVectorTypes) { - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); - setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - 
setOperationAction(ISD::SETCC, VT, Expand); - setOperationAction(ISD::FCANONICALIZE, VT, Expand); + setOperationAction( + {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, + ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2, + ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG, + ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC, + ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, + ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG, + ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, + ISD::SETCC, ISD::FCANONICALIZE}, + VT, Expand); } // This causes using an unrolled select operation rather than expansion with @@ -590,26 +465,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, if (AMDGPUBypassSlowDiv) addBypassSlowDiv(64, 32); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SMUL_LOHI); - setTargetDAGCombine(ISD::UMUL_LOHI); - setTargetDAGCombine(ISD::MULHU); - setTargetDAGCombine(ISD::MULHS); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FNEG); - setTargetDAGCombine(ISD::FABS); - setTargetDAGCombine(ISD::AssertZext); - setTargetDAGCombine(ISD::AssertSext); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine({ISD::BITCAST, ISD::SHL, + ISD::SRA, ISD::SRL, + ISD::TRUNCATE, ISD::MUL, + ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::MULHU, ISD::MULHS, + ISD::SELECT, ISD::SELECT_CC, + ISD::STORE, ISD::FADD, + ISD::FSUB, ISD::FNEG, + ISD::FABS, ISD::AssertZext, + ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { @@ -785,11 +650,11 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, unsigned AS = MN->getAddressSpace(); // Do not shrink an aligned scalar load to sub-dword. // Scalar engine cannot do sub-dword loads. - if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 && + if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) && (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - (isa(N) && - AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) && + (isa(N) && AS == AMDGPUAS::GLOBAL_ADDRESS && + MN->isInvariant())) && AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) return false; @@ -855,6 +720,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; + case AMDGPUISD::SETCC: // ballot-style instruction + return true; } return false; } @@ -1072,10 +939,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const bool IsByRef = Arg.hasByRefAttr(); Type *BaseArgTy = Arg.getType(); Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; - MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; - if (!Alignment) - Alignment = DL.getABITypeAlign(MemArgTy); - MaxAlign = max(Alignment, MaxAlign); + Align Alignment = DL.getValueOrABITypeAlignment( + IsByRef ? 
Arg.getParamAlign() : None, MemArgTy); + MaxAlign = std::max(Alignment, MaxAlign); uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; @@ -1415,6 +1281,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, (Start == 0 || Start == 4)) return Op; + if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) || + (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) && + (Start == 0 || Start == 8)) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); @@ -1589,8 +1460,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); unsigned Size = LoMemVT.getStoreSize(); - unsigned BaseAlign = Load->getAlignment(); - unsigned HiAlign = MinAlign(BaseAlign, Size); + Align BaseAlign = Load->getAlign(); + Align HiAlign = commonAlignment(BaseAlign, Size); SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, @@ -1628,13 +1499,13 @@ SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op, EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); - unsigned BaseAlign = Load->getAlignment(); + Align BaseAlign = Load->getAlign(); unsigned NumElements = MemVT.getVectorNumElements(); // Widen from vec3 to vec4 when the load is at least 8-byte aligned // or 16-byte fully dereferenceable. Otherwise, split the vector load. if (NumElements != 3 || - (BaseAlign < 8 && + (BaseAlign < Align(8) && !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()))) return SplitVectorLoad(Op, DAG); @@ -1681,9 +1552,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); - unsigned BaseAlign = Store->getAlignment(); + Align BaseAlign = Store->getAlign(); unsigned Size = LoMemVT.getStoreSize(); - unsigned HiAlign = MinAlign(BaseAlign, Size); + Align HiAlign = commonAlignment(BaseAlign, Size); SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, @@ -3003,12 +2874,12 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // the bytes again are not eliminated in the case of an unaligned copy. if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { - SDValue Ops[2]; - if (VT.isVector()) - std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG); - else - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); + if (VT.isVector()) + return SplitVectorLoad(SDValue(LN, 0), DAG); + + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); return DAG.getMergeValues(Ops, SDLoc(N)); } @@ -3059,7 +2929,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) - return scalarizeVectorStore(SN, DAG); + return SplitVectorStore(SDValue(SN, 0), DAG); return expandUnalignedStore(SN, DAG); } @@ -3281,8 +3151,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, // this improves the ability to match BFE patterns in isel.
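The getAlign/commonAlignment hunks above all compute the same thing for the high half of a split memory access: the largest power of two that divides both the base alignment and the split offset. A small standalone model of that computation (helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both BaseAlign and Offset: the lowest
    // set bit of (BaseAlign | Offset). Matches MinAlign/commonAlignment
    // semantics for power-of-two alignments.
    uint64_t hiHalfAlign(uint64_t BaseAlign, uint64_t Offset) {
      uint64_t Bits = BaseAlign | Offset;
      return Bits & (~Bits + 1);
    }

    int main() {
      assert(hiHalfAlign(16, 8) == 8); // 16-byte base, hi half at +8
      assert(hiHalfAlign(4, 16) == 4); // never better than the base
      return 0;
    }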
if (LHS.getOpcode() == ISD::AND) { if (auto *Mask = dyn_cast(LHS.getOperand(1))) { - if (Mask->getAPIntValue().isShiftedMask() && - Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) { + unsigned MaskIdx, MaskLen; + if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && + MaskIdx == ShiftAmt) { return DAG.getNode( ISD::AND, SL, VT, DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), @@ -4380,10 +4251,14 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + ExplicitArgOffset; switch (Param) { - case GRID_DIM: + case FIRST_IMPLICIT: return ArgOffset; - case GRID_OFFSET: - return ArgOffset + 4; + case PRIVATE_BASE: + return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; + case SHARED_BASE: + return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; + case QUEUE_PTR: + return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; } llvm_unreachable("unexpected implicit parameter type"); } @@ -4405,7 +4280,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) - NODE_NAME_CASE(RET_GFX_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) NODE_NAME_CASE(DWORDADDR) @@ -4485,6 +4359,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(LDS) + NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) + NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) @@ -4580,6 +4456,19 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } +static unsigned workitemIntrinsicDim(unsigned ID) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + return 0; + case Intrinsic::amdgcn_workitem_id_y: + return 1; + case Intrinsic::amdgcn_workitem_id_z: + return 2; + default: + llvm_unreachable("not a workitem intrinsic"); + } +} + void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { @@ -4716,6 +4605,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); break; } + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: { + unsigned MaxValue = Subtarget->getMaxWorkitemID( + DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); + Known.Zero.setHighBits(countLeadingZeros(MaxValue)); + break; + } default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index b41506157b68..73081483f1c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -320,8 +320,9 @@ public: enum ImplicitParameter { FIRST_IMPLICIT, - GRID_DIM = FIRST_IMPLICIT, - GRID_OFFSET, + PRIVATE_BASE, + SHARED_BASE, + QUEUE_PTR, }; /// Helper function that returns the byte offset of the given @@ -367,9 +368,6 @@ enum NodeType : unsigned { // Return with values from a non-entry function. RET_FLAG, - // Return with values from a non-entry function (AMDGPU_Gfx CC). 
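Among the hunks above, the computeKnownBitsForTargetNode change for the workitem-id intrinsics deserves a worked example: if a dimension can never exceed MaxValue work items, every bit above MaxValue's highest set bit is provably zero in the id. A standalone sketch of that counting step (helper name is hypothetical):

    #include <cassert>
    #include <cstdint>

    // High bits known zero for any id in [0, MaxValue].
    unsigned knownZeroHighBits(uint32_t MaxValue) {
      unsigned N = 0;
      for (uint32_t Bit = 1u << 31; Bit != 0 && (MaxValue & Bit) == 0;
           Bit >>= 1)
        ++N;
      return N;
    }

    int main() {
      assert(knownZeroHighBits(1023) == 22); // ids fit in 10 bits
      assert(knownZeroHighBits(0) == 32);    // only id 0 is possible
      return 0;
    }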
- RET_GFX_FLAG, - DWORDADDR, FRACT, @@ -483,6 +481,9 @@ enum NodeType : unsigned { CONST_DATA_PTR, PC_ADD_REL_OFFSET, LDS, + FPTRUNC_ROUND_UPWARD, + FPTRUNC_ROUND_DOWNWARD, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp new file mode 100644 index 000000000000..c9cdbc89f3a4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -0,0 +1,457 @@ +//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_delay_alu instructions to avoid stalls on GFX11+. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-insert-delay-alu" + +namespace { + +class AMDGPUInsertDelayAlu : public MachineFunctionPass { +public: + static char ID; + + const SIInstrInfo *SII; + const TargetRegisterInfo *TRI; + + TargetSchedModel SchedModel; + + AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + // Return true if MI waits for all outstanding VALU instructions to complete. + static bool instructionWaitsForVALU(const MachineInstr &MI) { + // These instruction types wait for VA_VDST==0 before issuing. + const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | + SIInstrFlags::FLAT | SIInstrFlags::MIMG | + SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; + if (MI.getDesc().TSFlags & VA_VDST_0) + return true; + if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || + MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) + return true; + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + (MI.getOperand(0).getImm() & 0xf000) == 0) + return true; + return false; + } + + // Types of delay that can be encoded in an s_delay_alu instruction. + enum DelayType { VALU, TRANS, SALU, OTHER }; + + // Get the delay type for an instruction with the specified TSFlags. + static DelayType getDelayType(uint64_t TSFlags) { + if (TSFlags & SIInstrFlags::TRANS) + return TRANS; + if (TSFlags & SIInstrFlags::VALU) + return VALU; + if (TSFlags & SIInstrFlags::SALU) + return SALU; + return OTHER; + } + + // Information about the last instruction(s) that wrote to a particular + // regunit. In straight-line code there will only be one such instruction, but + // when control flow converges we merge the delay information from each path + // to represent the union of the worst-case delays of each type. + struct DelayInfo { + // One larger than the maximum number of (non-TRANS) VALU instructions we + // can encode in an s_delay_alu instruction. + static const unsigned VALU_MAX = 5; + + // One larger than the maximum number of TRANS instructions we can encode in + // an s_delay_alu instruction. 
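Before the remaining DelayInfo fields (the TRANS_MAX constant the comment above introduces follows right after this aside), it is worth spelling out the merge rule the struct's comments describe: at a control-flow join, outstanding cycle counts merge with max (assume the slowest producer) and instructions-since-issue counters merge with min (assume the closest producer), giving the union of worst cases. A reduced sketch with two fields instead of six:

    #include <algorithm>
    #include <cassert>

    struct Delay {
      unsigned CyclesLeft; // max-merge: the slower writer wins
      unsigned SinceIssue; // min-merge: the closer writer wins

      void merge(const Delay &R) {
        CyclesLeft = std::max(CyclesLeft, R.CyclesLeft);
        SinceIssue = std::min(SinceIssue, R.SinceIssue);
      }
    };

    int main() {
      Delay A{4, 3}, B{2, 1};
      A.merge(B);
      assert(A.CyclesLeft == 4 && A.SinceIssue == 1);
      return 0;
    }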
+ static const unsigned TRANS_MAX = 4; + + // If it was written by a (non-TRANS) VALU, remember how many clock cycles + // are left until it completes, and how many other (non-TRANS) VALU we have + // seen since it was issued. + uint8_t VALUCycles = 0; + uint8_t VALUNum = VALU_MAX; + + // If it was written by a TRANS, remember how many clock cycles are left + // until it completes, and how many other TRANS we have seen since it was + // issued. + uint8_t TRANSCycles = 0; + uint8_t TRANSNum = TRANS_MAX; + // Also remember how many other (non-TRANS) VALU we have seen since it was + // issued. When an instruction depends on both a prior TRANS and a prior + // non-TRANS VALU, this is used to decide whether to encode a wait for just + // one or both of them. + uint8_t TRANSNumVALU = VALU_MAX; + + // If it was written by an SALU, remember how many clock cycles are left + // until it completes. + uint8_t SALUCycles = 0; + + DelayInfo() = default; + + DelayInfo(DelayType Type, unsigned Cycles) { + switch (Type) { + default: + llvm_unreachable("unexpected type"); + case VALU: + VALUCycles = Cycles; + VALUNum = 0; + break; + case TRANS: + TRANSCycles = Cycles; + TRANSNum = 0; + TRANSNumVALU = 0; + break; + case SALU: + SALUCycles = Cycles; + break; + } + } + + bool operator==(const DelayInfo &RHS) const { + return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && + TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && + TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; + } + + bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } + + // Merge another DelayInfo into this one, to represent the union of the + // worst-case delays of each type. + void merge(const DelayInfo &RHS) { + VALUCycles = std::max(VALUCycles, RHS.VALUCycles); + VALUNum = std::min(VALUNum, RHS.VALUNum); + TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); + TRANSNum = std::min(TRANSNum, RHS.TRANSNum); + TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); + SALUCycles = std::max(SALUCycles, RHS.SALUCycles); + } + + // Update this DelayInfo after issuing an instruction. IsVALU should be 1 + // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing + // a TRANS, else 0. Cycles is the number of cycles it takes to issue the + // instruction. Return true if there is no longer any useful delay info. + bool advance(DelayType Type, unsigned Cycles) { + bool Erase = true; + + VALUNum += (Type == VALU); + if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { + // Forget about the VALU instruction. It was too far back or has + // definitely completed by now. + VALUNum = VALU_MAX; + VALUCycles = 0; + } else { + VALUCycles -= Cycles; + Erase = false; + } + + TRANSNum += (Type == TRANS); + TRANSNumVALU += (Type == VALU); + if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { + // Forget about any TRANS instruction. It was too far back or has + // definitely completed by now. + TRANSNum = TRANS_MAX; + TRANSNumVALU = VALU_MAX; + TRANSCycles = 0; + } else { + TRANSCycles -= Cycles; + Erase = false; + } + + if (SALUCycles <= Cycles) { + // Forget about any SALU instruction. It has definitely completed by + // now. 
+ SALUCycles = 0; + } else { + SALUCycles -= Cycles; + Erase = false; + } + + return Erase; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const { + if (VALUCycles) + dbgs() << " VALUCycles=" << (int)VALUCycles; + if (VALUNum < VALU_MAX) + dbgs() << " VALUNum=" << (int)VALUNum; + if (TRANSCycles) + dbgs() << " TRANSCycles=" << (int)TRANSCycles; + if (TRANSNum < TRANS_MAX) + dbgs() << " TRANSNum=" << (int)TRANSNum; + if (TRANSNumVALU < VALU_MAX) + dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; + if (SALUCycles) + dbgs() << " SALUCycles=" << (int)SALUCycles; + } +#endif + }; + + // A map from regunits to the delay info for that regunit. + struct DelayState : DenseMap<unsigned, DelayInfo> { + // Merge another DelayState into this one by merging the delay info for each + // regunit. + void merge(const DelayState &RHS) { + for (const auto &KV : RHS) { + iterator It; + bool Inserted; + std::tie(It, Inserted) = insert(KV); + if (!Inserted) + It->second.merge(KV.second); + } + } + + // Advance the delay info for each regunit, erasing any that are no longer + // useful. + void advance(DelayType Type, unsigned Cycles) { + iterator Next; + for (auto I = begin(), E = end(); I != E; I = Next) { + Next = std::next(I); + if (I->second.advance(Type, Cycles)) + erase(I); + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(const TargetRegisterInfo *TRI) const { + if (empty()) { + dbgs() << " empty\n"; + return; + } + + // Dump DelayInfo for each RegUnit in numerical order. + SmallVector<const_iterator> Order; + Order.reserve(size()); + for (const_iterator I = begin(), E = end(); I != E; ++I) + Order.push_back(I); + llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { + return A->first < B->first; + }); + for (const_iterator I : Order) { + dbgs() << " " << printRegUnit(I->first, TRI); + I->second.dump(); + dbgs() << "\n"; + } + } +#endif + }; + + // The saved delay state at the end of each basic block. + DenseMap<MachineBasicBlock *, DelayState> BlockState; + + // Emit an s_delay_alu instruction if necessary before MI. + MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, + MachineInstr *LastDelayAlu) { + unsigned Imm = 0; + + // Wait for a TRANS instruction. + if (Delay.TRANSNum < DelayInfo::TRANS_MAX) + Imm |= 4 + Delay.TRANSNum; + + // Wait for a VALU instruction (if it's more recent than any TRANS + // instruction that we're also waiting for). + if (Delay.VALUNum < DelayInfo::VALU_MAX && + Delay.VALUNum <= Delay.TRANSNumVALU) { + if (Imm & 0xf) + Imm |= Delay.VALUNum << 7; + else + Imm |= Delay.VALUNum; + } + + // Wait for an SALU instruction. + if (Delay.SALUCycles) { + if (Imm & 0x780) { + // We have already encoded a VALU and a TRANS delay. There's no room in + // the encoding for an SALU delay as well, so just drop it. + } else if (Imm & 0xf) { + Imm |= (Delay.SALUCycles + 8) << 7; + } else { + Imm |= Delay.SALUCycles + 8; + } + } + + // Don't emit the s_delay_alu instruction if there's nothing to wait for. + if (!Imm) + return LastDelayAlu; + + // If we only need to wait for one instruction, try encoding it in the last + // s_delay_alu that we emitted.
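For reference while reading emitDelayAlu above: the 0xf and 0x780 masks correspond to an immediate with a first dependency code in bits 3:0 (instid0), a skip distance in bits 6:4, and a second dependency code in bits 10:7 (instid1); the field names follow the GFX11 ISA, but the layout itself is exactly what the masks imply. A minimal packing sketch:

    #include <cassert>

    // Pack two dependency codes and a skip count into the layout implied
    // by the 0xf / 0x780 masks: [3:0] instid0, [6:4] skip, [10:7] instid1.
    unsigned packDelayImm(unsigned InstId0, unsigned Skip, unsigned InstId1) {
      return (InstId0 & 0xf) | ((Skip & 0x7) << 4) | ((InstId1 & 0xf) << 7);
    }

    int main() {
      assert(packDelayImm(2, 1, 9) == (2u | (1u << 4) | (9u << 7)));
      return 0;
    }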
+ if (!(Imm & 0x780) && LastDelayAlu) { + unsigned Skip = 0; + for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), + E = MachineBasicBlock::instr_iterator(MI); + ++I != E;) { + if (!I->isBundle() && !I->isMetaInstruction()) + ++Skip; + } + if (Skip < 6) { + MachineOperand &Op = LastDelayAlu->getOperand(0); + unsigned LastImm = Op.getImm(); + assert((LastImm & ~0xf) == 0 && + "Remembered an s_delay_alu with no room for another delay!"); + LastImm |= Imm << 7 | Skip << 4; + Op.setImm(LastImm); + return nullptr; + } + } + + auto &MBB = *MI.getParent(); + MachineInstr *DelayAlu = + BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); + // Remember the s_delay_alu for next time if there is still room in it to + // encode another delay. + return (Imm & 0x780) ? nullptr : DelayAlu; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { + DelayState State; + for (auto *Pred : MBB.predecessors()) + State.merge(BlockState[Pred]); + + LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) + << "\n"; + State.dump(TRI);); + + bool Changed = false; + MachineInstr *LastDelayAlu = nullptr; + + // Iterate over the contents of bundles, but don't emit any instructions + // inside a bundle. + for (auto &MI : MBB.instrs()) { + if (MI.isBundle() || MI.isMetaInstruction()) + continue; + + // Ignore some more instructions that do not generate any code. + switch (MI.getOpcode()) { + case AMDGPU::SI_RETURN_TO_EPILOG: + continue; + } + + DelayType Type = getDelayType(MI.getDesc().TSFlags); + + if (instructionWaitsForVALU(MI)) { + // Forget about all outstanding VALU delays. + State = DelayState(); + } else if (Type != OTHER) { + DelayInfo Delay; + // TODO: Scan implicit uses too? + for (const auto &Op : MI.explicit_uses()) { + if (Op.isReg()) { + // One of the operands of the writelane is also the output operand. + // This creates the insertion of redundant delays. Hence, we have to + // ignore this operand. + if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) + continue; + for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { + auto It = State.find(*UI); + if (It != State.end()) { + Delay.merge(It->second); + State.erase(*UI); + } + } + } + } + if (Emit && !MI.isBundledWithPred()) { + // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or + // just ignore them? + LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); + } + } + + if (Type != OTHER) { + // TODO: Scan implicit defs too? + for (const auto &Op : MI.defs()) { + unsigned Latency = SchedModel.computeOperandLatency( + &MI, MI.getOperandNo(&Op), nullptr, 0); + for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) + State[*UI] = DelayInfo(Type, Latency); + } + } + + // Advance by the number of cycles it takes to issue this instruction. + // TODO: Use a more advanced model that accounts for instructions that + // take multiple cycles to issue on a particular pipeline. + unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); + // TODO: In wave64 mode, double the number of cycles for VALU and VMEM + // instructions on the assumption that they will usually have to be issued + // twice? 
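At the State.advance call that follows this aside, every outstanding record is aged by the cycles just issued; a record dies either because its producer has certainly finished or because too many instructions of its class have gone by to encode the dependency. A reduced model of that aging rule (the real DelayInfo::advance above tracks three pipelines at once):

    #include <cassert>

    struct Delay {
      unsigned CyclesLeft;
      unsigned SinceIssue;
    };

    // Age a record by Cycles; true means it carries no useful info anymore.
    bool age(Delay &D, unsigned Cycles, unsigned MaxTracked) {
      ++D.SinceIssue;
      if (D.SinceIssue >= MaxTracked || D.CyclesLeft <= Cycles)
        return true; // producer too far back, or already complete
      D.CyclesLeft -= Cycles;
      return false;
    }

    int main() {
      Delay D{6, 0};
      assert(!age(D, 2, 5) && D.CyclesLeft == 4);
      assert(age(D, 4, 5)); // completes once 4 <= 4
      return 0;
    }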
+ State.advance(Type, Cycles); + + LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); + } + + if (Emit) { + assert(State == BlockState[&MBB] && + "Basic block state should not have changed on final pass!"); + } else if (State != BlockState[&MBB]) { + BlockState[&MBB] = std::move(State); + Changed = true; + } + return Changed; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() + << "\n"); + + const GCNSubtarget &ST = MF.getSubtarget(); + if (!ST.hasDelayAlu()) + return false; + + SII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + + SchedModel.init(&ST); + + // Calculate the delay state for each basic block, iterating until we reach + // a fixed point. + SetVector WorkList; + for (auto &MBB : reverse(MF)) + WorkList.insert(&MBB); + while (!WorkList.empty()) { + auto &MBB = *WorkList.pop_back_val(); + bool Changed = runOnMachineBasicBlock(MBB, false); + if (Changed) + WorkList.insert(MBB.succ_begin(), MBB.succ_end()); + } + + LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); + + // Make one last pass over all basic blocks to emit s_delay_alu + // instructions. + bool Changed = false; + for (auto &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB, true); + return Changed; + } +}; + +} // namespace + +char AMDGPUInsertDelayAlu::ID = 0; + +char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; + +INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", + false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4f1d700bcd84..695093322a01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -110,33 +110,42 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) { llvm_unreachable("Should never be called!"); } -/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with -/// the modified arguments. +/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with +/// modified arguments (based on OldIntr) and replaces InstToReplace with +/// this newly created intrinsic call. 
static Optional modifyIntrinsicCall( - IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC, + IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, + InstCombiner &IC, std::function &, SmallVectorImpl &)> Func) { SmallVector ArgTys; - if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) + if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys)) return None; - SmallVector Args(II.args()); + SmallVector Args(OldIntr.args()); // Modify arguments and types Func(Args, ArgTys); - Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys); + Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys); CallInst *NewCall = IC.Builder.CreateCall(I, Args); - NewCall->takeName(&II); - NewCall->copyMetadata(II); + NewCall->takeName(&OldIntr); + NewCall->copyMetadata(OldIntr); if (isa(NewCall)) - NewCall->copyFastMathFlags(&II); + NewCall->copyFastMathFlags(&OldIntr); // Erase and replace uses - if (!II.getType()->isVoidTy()) - IC.replaceInstUsesWith(II, NewCall); - return IC.eraseInstFromFunction(II); + if (!InstToReplace.getType()->isVoidTy()) + IC.replaceInstUsesWith(InstToReplace, NewCall); + + bool RemoveOldIntr = &OldIntr != &InstToReplace; + + auto RetValue = IC.eraseInstFromFunction(InstToReplace); + if (RemoveOldIntr) + IC.eraseInstFromFunction(OldIntr); + + return RetValue; } static Optional @@ -153,7 +162,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->LodIndex); }); } @@ -170,7 +179,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->MipIndex); }); } @@ -187,7 +196,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->BiasIndex); ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg); }); @@ -205,13 +214,41 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode( OffsetMappingInfo->NoOffset, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->OffsetIndex); }); } } } + // Try to use D16 + if (ST->hasD16Images()) { + + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); + + if (BaseOpcode->HasD16) { + + // If the only use of image intrinsic is a fptrunc (with conversion to + // half) then both fptrunc and image intrinsic will be replaced with image + // intrinsic with D16 flag. 
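Condensed, the guard implemented by the lines that follow: the D16 rewrite fires only when the image intrinsic has exactly one user and that user is an fptrunc to half (per element for vectors). A standalone restatement using the LLVM C++ API (a sketch; the in-tree code inlines this test rather than calling a helper like this):

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // II is assumed to be an image intrinsic whose base opcode has a D16
    // variant; true if its sole use truncates the result to half.
    static bool onlyUseIsFPTruncToHalf(const IntrinsicInst &II) {
      if (!II.hasOneUse())
        return false;
      const Instruction *User = II.user_back();
      return User->getOpcode() == Instruction::FPTrunc &&
             User->getType()->getScalarType()->isHalfTy();
    }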
+ if (II.hasOneUse()) { + Instruction *User = II.user_back(); + + if (User->getOpcode() == Instruction::FPTrunc && + User->getType()->getScalarType()->isHalfTy()) { + + return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC, + [&](auto &Args, auto &ArgTys) { + // Change return type of image intrinsic. + // Set it to return type of fptrunc. + ArgTys[0] = User->getType(); + }); + } + } + } + } + // Try to use A16 or G16 if (!ST->hasA16() && !ST->hasG16()) return None; @@ -263,7 +300,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, : Type::getInt16Ty(II.getContext()); return modifyIntrinsicCall( - II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { + II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { ArgTys[ImageDimIntr->GradientTyArg] = CoordType; if (!OnlyDerivatives) { ArgTys[ImageDimIntr->CoordTyArg] = CoordType; @@ -584,6 +621,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, RightShift); } case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_row: case Intrinsic::amdgcn_exp_compr: { ConstantInt *En = cast(II.getArgOperand(1)); unsigned EnBits = En->getZExtValue(); @@ -882,6 +920,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); } + case Intrinsic::amdgcn_permlane64: + // A constant value is trivially uniform. + if (Constant *C = dyn_cast(II.getArgOperand(0))) { + return IC.replaceInstUsesWith(II, C); + } + break; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { // A constant value is trivially uniform. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 391dc8428539..23b8fcf75f16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -355,11 +355,7 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] ->; - -def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7d0f0580cda..3f242fdb6d8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -80,8 +81,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, RegClassOrBank.dyn_cast(); if (RC) { const LLT Ty = MRI.getType(Reg); - return RC->hasSuperClassEq(TRI.getBoolRC()) && - Ty.isValid() && Ty.getSizeInBits() == 1; + if (!Ty.isValid() || Ty.getSizeInBits() != 1) + return false; + // G_TRUNC s1 result is never vcc. 
+ return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC && + RC->hasSuperClassEq(TRI.getBoolRC()); } const RegisterBank *RB = RegClassOrBank.get(); @@ -91,7 +95,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const { MI.setDesc(TII.get(NewOpc)); - MI.RemoveOperand(1); // Remove intrinsic ID. + MI.removeOperand(1); // Remove intrinsic ID. MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); MachineOperand &Dst = MI.getOperand(0); @@ -216,7 +220,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } const RegisterBank &RB = *RegClassOrBank.get(); - DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -454,6 +458,24 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( return true; } +bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( + MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 + : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else + Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + I.setDesc(TII.get(Opc)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + I.addImplicitDefUseOperands(*MF); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + // TODO: We should probably legalize these to only using 32-bit results. bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); @@ -481,7 +503,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); if (!SrcRC) return false; unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, @@ -514,7 +536,7 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const unsigned DstSize = DstTy.getSizeInBits(); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); if (!DstRC) return false; @@ -556,7 +578,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; @@ -630,7 +652,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { MI.setDesc(TII.get(AMDGPU::COPY)); - MI.RemoveOperand(2); + MI.removeOperand(2); return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); } @@ -643,6 +665,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( // // (build_vector_trunc 
(lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) // => (S_PACK_HH_B32_B16 $src0, $src1) + // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1) + // => (S_PACK_HL_B32_B16 $src0, $src1) // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) // => (S_PACK_LH_B32_B16 $src0, $src1) // (build_vector_trunc $src0, $src1) @@ -662,14 +686,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { - // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 - auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) - .addReg(ShiftSrc0) - .addImm(16); + } else if (Shift0) { + if (ConstSrc1 && ConstSrc1->Value == 0) { + // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 + auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) + .addReg(ShiftSrc0) + .addImm(16); - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + } + if (STI.hasSPackHL()) { + Opc = AMDGPU::S_PACK_HL_B32_B16; + MI.getOperand(1).setReg(ShiftSrc0); + } } MI.setDesc(TII.get(Opc)); @@ -722,16 +752,16 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); if (!DstRC) return false; const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); const TargetRegisterClass *Src0RC = - TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank); const TargetRegisterClass *Src1RC = - TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank); // Deal with weird cases where the class only partially supports the subreg // index. @@ -970,6 +1000,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return selectGroupStaticSize(I); case Intrinsic::returnaddress: return selectReturnAddress(I); + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + return selectSMFMACIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -1142,7 +1179,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { Optional Arg = getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); - if (Arg.hasValue()) { + if (Arg) { const int64_t Value = Arg.getValue().Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; @@ -1164,8 +1201,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); - const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI); + const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank); if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; @@ -1300,12 +1336,14 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); unsigned Offset0 = OrderedCountIndex << 2; - unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | - (Instruction << 4); + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) Offset1 |= (CountDw - 1) << 6; + if (STI.getGeneration() < AMDGPUSubtarget::GFX11) + Offset1 |= ShaderType << 2; + unsigned Offset = Offset0 | (Offset1 << 8); Register M0Val = MI.getOperand(2).getReg(); @@ -1424,23 +1462,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - - if (STI.needsAlignedVGPRs()) { - // Add implicit aligned super-reg to force alignment on the data operand. - Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); - Register NewVR = - MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); - BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(VSrc, 0, MI.getOperand(1).getSubReg()) - .addImm(AMDGPU::sub0) - .addReg(Undef) - .addImm(AMDGPU::sub1); - MIB.addReg(NewVR, 0, AMDGPU::sub0); - MIB.addReg(NewVR, RegState::Implicit); - } else { - MIB.addReg(VSrc); - } + MIB.addReg(VSrc); if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) return false; @@ -1449,6 +1471,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, MIB.addImm(ImmOffset) .cloneMemRefs(MI); + TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); + MI.eraseFromParent(); return true; } @@ -1523,6 +1547,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; @@ -1627,7 +1652,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } // The legalizer preprocessed the intrinsic arguments. If we aren't using - // NSA, these should have beeen packed into a single value in the first + // NSA, these should have been packed into a single value in the first // address register const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { @@ -1639,13 +1664,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( ++NumVDataDwords; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? 
AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) { + LLVM_DEBUG( + dbgs() + << "requested image instruction is not supported on this GPU\n"); + return false; + } + } + if (Opcode == -1 && + STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -1703,7 +1744,13 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (IsGFX10Plus) MIB.addImm(IsA16 ? -1 : 0); - MIB.addImm(TFE); // tfe + if (!Subtarget->hasGFX90AInsts()) { + MIB.addImm(TFE); // tfe + } else if (TFE) { + LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n"); + return false; + } + MIB.addImm(LWE); // lwe if (!IsGFX10Plus) MIB.addImm(DimInfo->DA ? -1 : 0); @@ -1743,7 +1790,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( @@ -1770,10 +1819,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectSBarrier(I); case Intrinsic::amdgcn_global_atomic_fadd: return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); - default: { - return selectImpl(I, *CoverageInfo); - } + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: + return selectBufferLoadLds(I); + case Intrinsic::amdgcn_global_load_lds: + return selectGlobalLoadLds(I); + case Intrinsic::amdgcn_exp_compr: + if (!STI.hasCompressedExport()) { + Function &F = I.getMF()->getFunction(); + DiagnosticInfoUnsupported NoFpRet( + F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + break; } + return selectImpl(I, *CoverageInfo); } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { @@ -1872,10 +1933,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); - const TargetRegisterClass *SrcRC - = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); - const TargetRegisterClass *DstRC - = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstRB); if (!SrcRC || !DstRC) return false; @@ -2014,10 +2075,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return selectCOPY(I); const TargetRegisterClass *SrcRC = - TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); + TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); Register UndefReg = MRI->createVirtualRegister(SrcRC); BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); @@ -2384,65 +2445,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( return selectImpl(I, 
*CoverageInfo); } -// TODO: No rtn optimization. -bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( - MachineInstr &MI) const { - Register PtrReg = MI.getOperand(1).getReg(); - const LLT PtrTy = MRI->getType(PtrReg); - if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - STI.useFlatForGlobal()) - return selectImpl(MI, *CoverageInfo); - - Register DstReg = MI.getOperand(0).getReg(); - const LLT Ty = MRI->getType(DstReg); - const bool Is64 = Ty.getSizeInBits() == 64; - const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - Register TmpReg = MRI->createVirtualRegister( - Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); - - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock *BB = MI.getParent(); - - Register VAddr, RSrcReg, SOffset; - int64_t Offset = 0; - - unsigned Opcode; - if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { - Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; - } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, - RSrcReg, SOffset, Offset)) { - Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; - } else - return selectImpl(MI, *CoverageInfo); - - auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) - .addReg(MI.getOperand(2).getReg()); - - if (VAddr) - MIB.addReg(VAddr); - - MIB.addReg(RSrcReg); - if (SOffset) - MIB.addReg(SOffset); - else - MIB.addImm(0); - - MIB.addImm(Offset); - MIB.addImm(AMDGPU::CPol::GLC); - MIB.cloneMemRefs(MI); - - BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(TmpReg, RegState::Kill, SubReg); - - MI.eraseFromParent(); - - MRI->setRegClass( - DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { if (Reg.isPhysical()) return false; @@ -2551,7 +2553,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { // Try to avoid emitting a bit operation when we only need to touch half of // the 64-bit pointer. - APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); + APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64); const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); @@ -2571,12 +2573,10 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { const TargetRegisterClass &RegRC = IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; - const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, - *MRI); - const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, - *MRI); + const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); + const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); const TargetRegisterClass *MaskRC = - TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); + TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || @@ -2689,10 +2689,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( if (IdxRB->getID() != AMDGPU::SGPRRegBankID) return false; - const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, - *MRI); - const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, - *MRI); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); + const TargetRegisterClass *DstRC = + TRI.getRegClassForTypeOnBank(DstTy, *DstRB); if (!SrcRC || !DstRC) return false; if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || @@ -2771,10 +2771,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( if (IdxRB->getID() != AMDGPU::SGPRRegBankID) return false; - const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, - *MRI); - const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, - *MRI); + const TargetRegisterClass *VecRC = + TRI.getRegClassForTypeOnBank(VecTy, *VecRB); + const TargetRegisterClass *ValRC = + TRI.getRegClassForTypeOnBank(ValTy, *ValRB); if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || @@ -2867,7 +2867,6 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( return false; assert(ShufMask.size() == 2); - assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2924,17 +2923,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( } } else if (Mask[0] == 0 && Mask[1] == 0) { if (IsVALU) { - // Write low half of the register into the high half. - MachineInstr *MovSDWA = - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) - .addImm(0) // $src0_modifiers - .addReg(SrcVec) // $src0 - .addImm(0) // $clamp - .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel - .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused - .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel - .addReg(SrcVec, RegState::Implicit); - MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + if (STI.hasSDWA()) { + // Write low half of the register into the high half. 
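+        // Assuming the usual SDWA semantics, the V_MOV_B32_sdwa built here
+        // acts roughly as:
+        //   dst.hi16 = src.lo16;  // dst_sel = WORD_1, src0_sel = WORD_0
+        //   dst.lo16 = src.lo16;  // UNUSED_PRESERVE keeps the tied SrcVec half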
+ MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) + .addImm(0xFFFF) + .addReg(SrcVec); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg) + .addReg(TmpReg) + .addImm(16) + .addReg(TmpReg); + } } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) .addReg(SrcVec) @@ -2942,17 +2952,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( } } else if (Mask[0] == 1 && Mask[1] == 1) { if (IsVALU) { - // Write high half of the register into the low half. - MachineInstr *MovSDWA = - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) - .addImm(0) // $src0_modifiers - .addReg(SrcVec) // $src0 - .addImm(0) // $clamp - .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel - .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused - .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel - .addReg(SrcVec, RegState::Implicit); - MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + if (STI.hasSDWA()) { + // Write high half of the register into the low half. + MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .addReg(SrcVec); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg) + .addReg(TmpReg) + .addImm(16) + .addReg(TmpReg); + } } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) .addReg(SrcVec) @@ -2965,13 +2986,19 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( .addReg(SrcVec) .addImm(16); } else { - Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) - .addReg(SrcVec) - .addImm(16); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) - .addReg(TmpReg) - .addReg(SrcVec); + if (STI.hasSPackHL()) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HL_B32_B16), DstReg) + .addReg(SrcVec) + .addReg(SrcVec); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) + .addReg(SrcVec) + .addImm(16); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) + .addReg(TmpReg) + .addReg(SrcVec); + } } } else llvm_unreachable("all shuffle masks should be handled"); @@ -2982,13 +3009,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( MachineInstr &MI) const { - if (STI.hasGFX90AInsts()) + const Register DefReg = MI.getOperand(0).getReg(); + LLT DefTy = MRI->getType(DefReg); + if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy)) return selectImpl(MI, 
*CoverageInfo); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + if (!MRI->use_nodbg_empty(DefReg)) { Function &F = MBB->getParent()->getFunction(); DiagnosticInfoUnsupported NoFpRet(F, "return versions of fp atomics not supported", @@ -3105,9 +3134,236 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } +bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { + unsigned Opc; + unsigned Size = MI.getOperand(3).getImm(); + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == 9; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(4).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(4 + OpOffset).getReg(); + Optional MaybeVOffset = + getIConstantVRegValWithLookThrough(VOffset, *MRI); + const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); + + switch (Size) { + default: + return false; + case 1: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; + break; + case 2: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; + break; + case 4: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; + break; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .add(MI.getOperand(2)); + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); + + if (HasVIndex && HasVOffset) { + Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) + .addReg(VIndex) + .addImm(AMDGPU::sub0) + .addReg(VOffset) + .addImm(AMDGPU::sub1); + + MIB.addReg(IdxReg); + } else if (HasVIndex) { + MIB.addReg(VIndex); + } else if (HasVOffset) { + MIB.addReg(VOffset); + } + + MIB.add(MI.getOperand(1)); // rsrc + MIB.add(MI.getOperand(5 + OpOffset)); // soffset + MIB.add(MI.getOperand(6 + OpOffset)); // imm offset + unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); + MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol + MIB.addImm((Aux >> 3) & 1); // swz + + MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; + StorePtrI.V = nullptr; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + + MachineMemOperand *StoreMMO = + MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), LoadMMO->getBaseAlign()); + + MIB.setMemRefs({LoadMMO, StoreMMO}); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + +/// Match a zero extend from a 32-bit value to 64-bits. 
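+/// Matches either of these MIR shapes (register names illustrative):
+///   %ext:_(s64) = G_ZEXT %src:_(s32)
+///   %ext:_(s64) = G_MERGE_VALUES %src:_(s32), %c0:_(s32)   ; %c0 == 0
+/// and returns %src; anything else returns an invalid Register.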
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { + Register ZExtSrc; + if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) + const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return false; + + if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { + return Def->getOperand(1).getReg(); + } + + return Register(); +} + +bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ + unsigned Opc; + unsigned Size = MI.getOperand(3).getImm(); + + switch (Size) { + default: + return false; + case 1: + Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; + break; + case 2: + Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; + break; + case 4: + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; + break; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .add(MI.getOperand(2)); + + Register Addr = MI.getOperand(1).getReg(); + Register VOffset; + // Try to split SAddr and VOffset. Global and LDS pointers share the same + // immediate offset, so we cannot use a regular SelectGlobalSAddr(). + if (!isSGPR(Addr)) { + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); + if (isSGPR(AddrDef->Reg)) { + Addr = AddrDef->Reg; + } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { + Register SAddr = + getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); + if (SAddr && isSGPR(SAddr)) { + Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + Addr = SAddr; + VOffset = Off; + } + } + } + } + + if (isSGPR(Addr)) { + Opc = AMDGPU::getGlobalSaddrOp(Opc); + if (!VOffset) { + VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) + .addImm(0); + } + } + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) + .addReg(Addr); + + if (isSGPR(Addr)) + MIB.addReg(VOffset); + + MIB.add(MI.getOperand(4)) // offset + .add(MI.getOperand(5)); // cpol + + MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = MI.getOperand(4).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = + MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), Align(4)); + + MIB.setMemRefs({LoadMMO, StoreMMO}); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ MI.setDesc(TII.get(MI.getOperand(1).getImm())); - MI.RemoveOperand(1); + MI.removeOperand(1); + MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + return true; +} + +bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { + unsigned Opc; + switch (MI.getIntrinsicID()) { + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + Opc = 
AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; + break; + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; + break; + default: + llvm_unreachable("unhandled smfmac intrinsic"); + } + + auto VDst_In = MI.getOperand(4); + + MI.setDesc(TII.get(Opc)); + MI.removeOperand(4); // VDst_In + MI.removeOperand(1); // Intrinsic ID + MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); return true; } @@ -3166,6 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_UADDE: case TargetOpcode::G_USUBE: return selectG_UADDO_USUBO_UADDE_USUBE(I); + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: + return selectG_AMDGPU_MAD_64_32(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: case TargetOpcode::G_PTRTOINT: @@ -3226,8 +3485,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_AMDGPU_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_ATOMIC_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - return selectG_AMDGPU_ATOMIC_CMPXCHG(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_TRUNC: @@ -3286,9 +3543,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } -std::pair -AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, - bool AllowAbs) const { +std::pair AMDGPUInstructionSelector::selectVOP3ModsImpl( + MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const { Register Src = Root.getReg(); Register OrigSrc = Src; unsigned Mods = 0; @@ -3305,7 +3561,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, Mods |= SISrcMods::ABS; } - if (Mods != 0 && + if (OpSel) + Mods |= SISrcMods::OP_SEL_0; + + if ((Mods != 0 || ForceVGPR) && RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { MachineInstr *UseMI = Root.getParent(); @@ -3407,7 +3666,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { std::pair AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; MachineInstr *MI = MRI.getVRegDef(Src); @@ -3421,6 +3680,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( } // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. 
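   // For reference, OP_SEL_1 here is the op_sel_hi bit: with op_sel = 0 it
   // routes src.hi16 -> dst.hi16 and src.lo16 -> dst.lo16, i.e. the identity
   // permute that packed (v2x16) operands default to.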
Mods |= SISrcMods::OP_SEL_1; @@ -3443,6 +3703,50 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. + // 1(-1) promotes packed values to signed, 0 treats them as unsigned. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= SISrcMods::NEG; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( + MachineOperand &Root) const { + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() != 0) + Mods |= SISrcMods::OP_SEL_0; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; @@ -3466,6 +3770,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /* AllowAbs */ false, + /* OpSel */ false, + /* ForceVGPR */ true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /* AllowAbs */ false, + /* OpSel */ true, + /* ForceVGPR */ true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector AddrInfo; @@ -3594,24 +3928,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { }}; } -/// Match a zero extend from a 32-bit value to 64-bits. -static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { - Register ZExtSrc; - if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) - return MRI.getType(ZExtSrc) == LLT::scalar(32) ? 
ZExtSrc : Register(); - - // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) - const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); - if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; - - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { - return Def->getOperand(1).getReg(); - } - - return Register(); -} - // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { @@ -3631,9 +3947,6 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { ImmOffset = ConstOffset; } else { auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); - if (!PtrBaseDef) - return None; - if (isSGPR(PtrBaseDef->Reg)) { if (ConstOffset > 0) { // Offset is too large. @@ -3679,11 +3992,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } } - auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); - if (!AddrDef) - return None; - // Match the variable offset. + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { // Look through the SGPR->VGPR copy. Register SAddr = @@ -3749,9 +4059,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { } auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); - if (!AddrDef) - return None; - if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = AddrDef->MI->getOperand(1).getIndex(); return {{ @@ -3768,8 +4075,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); - if (LHSDef && RHSDef && - LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && isSGPR(RHSDef->Reg)) { int FI = LHSDef->MI->getOperand(1).getIndex(); MachineInstr &I = *Root.getParent(); @@ -3792,6 +4098,74 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { }}; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( + Register VAddr, Register SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + auto VKnown = KnownBits->getKnownBits(VAddr); + auto SKnown = KnownBits::computeForAddSub( + true, false, KnownBits->getKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { + Register Addr = Root.getReg(); + Register PtrBase; + int64_t ConstOffset; + int64_t ImmOffset = 0; + + // Match the immediate offset first, which canonically is moved as low as + // possible. 
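+  // For example, matching (names illustrative):
+  //   %addr = G_PTR_ADD (G_PTR_ADD %fi, %voff), G_CONSTANT 16
+  // should yield vaddr = %voff, saddr = %fi, offset = 16 (the SVS form).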
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + + if (ConstOffset != 0 && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + Addr = PtrBase; + ImmOffset = ConstOffset; + } + + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); + if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) + return None; + + Register RHS = AddrDef->MI->getOperand(2).getReg(); + if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) + return None; + + Register LHS = AddrDef->MI->getOperand(1).getReg(); + auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) + return None; + + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { + int FI = LHSDef->MI->getOperand(1).getIndex(); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; + } + + if (!isSGPR(LHS)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); @@ -3856,7 +4230,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MIB.addReg(Info->getScratchRSrcReg()); }, [=](MachineInstrBuilder &MIB) { // vaddr - if (FI.hasValue()) + if (FI) MIB.addFrameIndex(FI.getValue()); else MIB.addReg(VAddr); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 42095332d11a..22672ba59e76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -97,6 +97,7 @@ private: bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const; + bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; @@ -133,7 +134,6 @@ private: void initM0(MachineInstr &I) const; bool selectG_LOAD_STORE_ATOMICRMW(MachineInstr &I) const; - bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const; bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_GLOBAL_VALUE(MachineInstr &I) const; @@ -144,11 +144,15 @@ private: bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, MachineOperand &DataOp) const; + bool selectBufferLoadLds(MachineInstr &MI) const; + bool selectGlobalLoadLds(MachineInstr &MI) const; bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; - std::pair selectVOP3ModsImpl(MachineOperand &Root, - bool AllowAbs = true) const; + std::pair + selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true, + bool OpSel = false, bool ForceVGPR = false) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -173,14 +177,29 @@ private: selectVOP3Mods_nnan(MachineOperand &Root) const; std::pair - selectVOP3PModsImpl(Register Src, const 
MachineRegisterInfo &MRI) const; + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, + bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsDOT(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectDotIUVOP3PMods(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVINTERPMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVINTERPModsHi(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -203,6 +222,10 @@ private: InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; + bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr, + uint64_t ImmOffset) const; + InstructionSelector::ComplexRendererFns + selectScratchSVAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectMUBUFScratchOffen(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7d3dbfd7e851..31012915457b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -40,7 +40,7 @@ class AMDGPUInst SoftFail = 0; + field bits<96> SoftFail = 0; let DecoderNamespace = Namespace; @@ -87,6 +87,17 @@ class PredConcat lst, Predicate pred> { !listconcat([pred], !filter(item, lst, !ne(item, pred))); } +// Add a Register to the list if does not already exist +class RegAppend lst, Register reg> { + list ret = + !listconcat([reg], !filter(item, lst, !ne(item, reg))); +} +// Get the union of two Register lists +class RegListUnion lstA, list lstB> { + list ret = + !foldl(lstA, lstB, temp, item, RegAppend.ret); +} + class PredicateControl { Predicate SubtargetPredicate = TruePredicate; Predicate AssemblerPredicate = TruePredicate; @@ -444,34 +455,28 @@ def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> { let IsNonExtLoad = 1; } -def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { +def extloadi8_#as : PatFrag<(ops node:$ptr), (extloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { +def extloadi16_#as : PatFrag<(ops node:$ptr), (extloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } -def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { +def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { +def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } -def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { +def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { +def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> { @@ -498,17 +503,15 @@ def 
atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { foreach as = [ "global", "flat", "local", "private", "region" ] in { -let AddressSpaces = !cast("StoreAddress_"#as).AddrSpaces in { +let IsStore = 1, AddressSpaces = !cast("StoreAddress_"#as).AddrSpaces in { def store_#as : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr)> { - let IsStore = 1; let IsTruncStore = 0; } // truncstore fragments. def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr)> { - let IsStore = 1; let IsTruncStore = 1; } @@ -517,90 +520,133 @@ def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), // unnecessary check that the memory size is less than the value type // in the generated matcher table. def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i8; -} - + (truncstorei8 node:$val, node:$ptr)>; def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i16; -} + (truncstorei16 node:$val, node:$ptr)>; def store_hi16_#as : StoreHi16 ; def truncstorei8_hi16_#as : StoreHi16; def truncstorei16_hi16_#as : StoreHi16; -defm atomic_store_#as : binary_atomic_op; +} // End let IsStore = 1, AddressSpaces = ... -} // End let AddressSpaces +let IsAtomic = 1, AddressSpaces = !cast("StoreAddress_"#as).AddrSpaces in { +def atomic_store_8_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_8 node:$ptr, node:$val)>; +def atomic_store_16_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_16 node:$ptr, node:$val)>; +def atomic_store_32_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_32 node:$ptr, node:$val)>; +def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_64 node:$ptr, node:$val)>; +} } // End foreach as +// TODO: Add GISelPredicateCode for the ret and noret PatFrags once +// GlobalISelEmitter allows pattern matches where src and dst def count +// mismatch. 
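+// As an illustration, ret_noret_op applied to int_amdgcn_flat_atomic_fadd
+// yields two otherwise identical fragments:
+//   int_amdgcn_flat_atomic_fadd_ret    - matches only when the result is used
+//   int_amdgcn_flat_atomic_fadd_noret  - matches only when the result is dead
+// (for GlobalISel the predicate is a constant, per the TODO above).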
+ +multiclass ret_noret_op { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + def "_ret" : PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>; + } + + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + def "_noret" : PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>; + } +} + +defm int_amdgcn_flat_atomic_fadd : ret_noret_op; +defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op; +defm int_amdgcn_flat_atomic_fmin : ret_noret_op; +defm int_amdgcn_flat_atomic_fmax : ret_noret_op; +defm int_amdgcn_global_atomic_fadd : ret_noret_op; +defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op; +defm int_amdgcn_global_atomic_fmin : ret_noret_op; +defm int_amdgcn_global_atomic_fmax : ret_noret_op; +defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op; multiclass ret_noret_binary_atomic_op { + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + defm "_noret" : binary_atomic_op; + } + + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + defm "_ret" : binary_atomic_op; + } +} + +multiclass ret_noret_ternary_atomic_op { + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + defm "_noret" : ternary_atomic_op; + } + + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + defm "_ret" : ternary_atomic_op; + } +} + +multiclass binary_atomic_op_all_as { foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { defm "_"#as : binary_atomic_op; - - let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in { - defm "_"#as#"_noret" : binary_atomic_op; - } - - let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in { - defm "_"#as#"_ret" : binary_atomic_op; - } + defm "_"#as : ret_noret_binary_atomic_op; } } } -defm atomic_swap : ret_noret_binary_atomic_op; -defm atomic_load_add : ret_noret_binary_atomic_op; -defm atomic_load_and : ret_noret_binary_atomic_op; -defm atomic_load_max : ret_noret_binary_atomic_op; -defm atomic_load_min : ret_noret_binary_atomic_op; -defm atomic_load_or : ret_noret_binary_atomic_op; -defm atomic_load_sub : ret_noret_binary_atomic_op; -defm atomic_load_umax : ret_noret_binary_atomic_op; -defm atomic_load_umin : ret_noret_binary_atomic_op; -defm atomic_load_xor : ret_noret_binary_atomic_op; -defm atomic_load_fadd : ret_noret_binary_atomic_op; +defm atomic_swap : binary_atomic_op_all_as; +defm atomic_load_add : binary_atomic_op_all_as; +defm atomic_load_and : binary_atomic_op_all_as; +defm atomic_load_max : binary_atomic_op_all_as; +defm atomic_load_min : binary_atomic_op_all_as; +defm atomic_load_or : binary_atomic_op_all_as; +defm atomic_load_sub : binary_atomic_op_all_as; +defm atomic_load_umax : binary_atomic_op_all_as; +defm atomic_load_umin : binary_atomic_op_all_as; +defm atomic_load_xor : binary_atomic_op_all_as; +defm atomic_load_fadd : binary_atomic_op_all_as; let MemoryVT = v2f16 in -defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op; -defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; +defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as; +defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as; def load_align8_local : PatFrag<(ops node:$ptr), (load_local 
node:$ptr)>, - Aligned<8> { + Aligned<8> { let IsLoad = 1; - let IsNonExtLoad = 1; } def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, Aligned<16> { let IsLoad = 1; - let IsNonExtLoad = 1; } def store_align8_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<8> { let IsStore = 1; - let IsTruncStore = 0; } def store_align16_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<16> { let IsStore = 1; - let IsTruncStore = 0; } let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op; -defm atomic_cmp_swap_local_m0 : ternary_atomic_op; +defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op; +defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op; } let AddressSpaces = StoreAddress_region.AddrSpaces in { -defm atomic_cmp_swap_region : ternary_atomic_op; -defm atomic_cmp_swap_region_m0 : ternary_atomic_op; +defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op; +defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 645d05aa9238..01a3e78ea48c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" #define DEBUG_TYPE "amdgpu-legalinfo" @@ -134,7 +135,6 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { static LLT getBitcastRegisterType(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); - LLT CoercedTy; if (Size <= 32) { // <2 x s8> -> s16 // <4 x s8> -> s32 @@ -530,13 +530,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16, V2S16}) .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) .widenScalarToNextMultipleOf(0, 32) - .maxScalar(0, S32) - .scalarize(0); + .custom(); + assert(ST.hasMad64_32()); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier @@ -546,13 +555,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .lower(); } else if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16}) .minScalar(0, S16) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16}) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + assert(ST.hasMad64_32()); + // Technically the saturating operations require clamp bit support, but this // was introduced at the same time as 16-bit operations. 
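   // (In selection these map onto the VALU adds with the clamp bit, e.g.
   //  G_UADDSAT x, y -> V_ADD_U32_e64 x, y, clamp, so no explicit compare
   //  and select sequence is needed.)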
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) @@ -569,12 +586,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); } else { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32}) .widenScalarToNextMultipleOf(0, 32) .clampScalar(0, S32, S32) .scalarize(0); + auto &Mul = getActionDefinitionsBuilder(G_MUL) + .legalFor({S32}) + .scalarize(0) + .minScalar(0, S32) + .widenScalarToNextMultipleOf(0, 32); + + if (ST.hasMad64_32()) + Mul.custom(); + else + Mul.maxScalar(0, S32); + if (ST.hasIntClamp()) { getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) .legalFor({S32}) // Clamp modifier. @@ -632,7 +660,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}, {S32, S32}}) .minScalar(0, S32) - // TODO: .scalarize(0) + .scalarize(0) .lower(); getActionDefinitionsBuilder(G_BITCAST) @@ -767,13 +795,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) .scalarize(0); - getActionDefinitionsBuilder(G_FSUB) + auto &FSubActions = getActionDefinitionsBuilder(G_FSUB); + if (ST.has16BitInsts()) { + FSubActions + // Use actual fsub instruction + .legalFor({S32, S16}) + // Must use fadd + fneg + .lowerFor({S64, V2S16}); + } else { + FSubActions // Use actual fsub instruction .legalFor({S32}) // Must use fadd + fneg - .lowerFor({S64, S16, V2S16}) - .scalarize(0) - .clampScalar(0, S32, S64); + .lowerFor({S64, S16, V2S16}); + } + + FSubActions + .scalarize(0) + .clampScalar(0, S32, S64); // Whether this is legal depends on the floating point mode for the function. auto &FMad = getActionDefinitionsBuilder(G_FMAD); @@ -839,6 +878,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); + getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) + .customFor({S16, S32}) + .scalarize(0) + .lower(); + // Lower roundeven into G_FRINT getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) .scalarize(0) @@ -1292,6 +1336,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); + if (ST.hasGFX940Insts()) + Atomic.legalFor({{V2S16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); @@ -1505,7 +1551,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampMaxNumElements(1, S16, 2) // TODO: Make 4? .clampMaxNumElements(0, S16, 64); - // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse + // TODO: Don't fully scalarize v2s16 pieces? Or combine out those // pre-legalize. if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) @@ -1756,9 +1802,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeFFloor(MI, MRI, B); case TargetOpcode::G_BUILD_VECTOR: return legalizeBuildVector(MI, MRI, B); + case TargetOpcode::G_MUL: + return legalizeMul(Helper, MI); case TargetOpcode::G_CTLZ: case TargetOpcode::G_CTTZ: return legalizeCTLZ_CTTZ(MI, MRI, B); + case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: + return legalizeFPTruncRound(MI, B); default: return false; } @@ -1801,6 +1851,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); } + // TODO: can we be smarter about machine pointer info? 
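+  // Illustrative shape of the MIR built below for code object v5:
+  //   %kernarg:_(p4)   = (kernarg segment pointer via loadInputValue)
+  //   %addr:_(p4)      = G_PTR_ADD %kernarg, (implicit-arg offset of the base)
+  //   %aperture:_(s32) = G_LOAD %addr   ; invariant, constant address space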
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + Register LoadAddr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE + : AMDGPUTargetLowering::PRIVATE_BASE; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return Register(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(32), commonAlignment(Align(64), Offset)); + + // Pointer address + B.buildPtrAdd(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + // Load address + return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); + } + Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); @@ -1811,17 +1894,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - Register LoadAddr; - - B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.buildPtrAdd(LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -1872,31 +1952,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - // Truncate. - B.buildExtract(Dst, Src, 0); - MI.eraseFromParent(); - return true; - } - - if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - const SIMachineFunctionInfo *Info = MF.getInfo(); - uint32_t AddrHiVal = Info->get32BitAddressHighBits(); - - // FIXME: This is a bit ugly due to creating a merge of 2 pointers to - // another. Merge operands are required to be the same type, but creating an - // extra ptrtoint would be kind of pointless. - auto HighAddr = B.buildConstant( - LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); - B.buildMerge(Dst, {Src, HighAddr}); - MI.eraseFromParent(); - return true; - } - - if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { - assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || - DestAS == AMDGPUAS::PRIVATE_ADDRESS); - + if (SrcAS == AMDGPUAS::FLAT_ADDRESS && + (DestAS == AMDGPUAS::LOCAL_ADDRESS || + DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { if (isKnownNonNull(Src, MRI, TM, SrcAS)) { // Extract low 32-bits of the pointer. 
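     // A flat pointer into LDS/scratch carries the aperture base in its high
     // 32 bits and the segment offset in the low 32 bits, so for a known
     // non-null source the cast is just:
     //   %dst:_(p3 or p5) = G_EXTRACT %src:_(p0), 0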
B.buildExtract(Dst, Src, 0); @@ -1920,37 +1978,70 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) - return false; + if (DestAS == AMDGPUAS::FLAT_ADDRESS && + (SrcAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { + if (!ST.hasFlatAddressSpace()) + return false; - if (!ST.hasFlatAddressSpace()) - return false; + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; - Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); - if (!ApertureReg.isValid()) - return false; + // Coerce the type of the low half of the result so we can use merge_values. + Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); + + // TODO: Should we allow mismatched types but matching sizes in merges to + // avoid the ptrtoint? + auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + B.buildCopy(Dst, BuildPtr); + MI.eraseFromParent(); + return true; + } + + auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); + auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, + SegmentNull.getReg(0)); - // TODO: Should we allow mismatched types but matching sizes in merges to - // avoid the ptrtoint? - auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); + + MI.eraseFromParent(); + return true; + } - if (isKnownNonNull(Src, MRI, TM, SrcAS)) { - B.buildCopy(Dst, BuildPtr); + if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + SrcTy.getSizeInBits() == 64) { + // Truncate. + B.buildExtract(Dst, Src, 0); MI.eraseFromParent(); return true; } - auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); - auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + DstTy.getSizeInBits() == 64) { + const SIMachineFunctionInfo *Info = MF.getInfo(); + uint32_t AddrHiVal = Info->get32BitAddressHighBits(); - auto CmpRes = - B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); + // FIXME: This is a bit ugly due to creating a merge of 2 pointers to + // another. Merge operands are required to be the same type, but creating an + // extra ptrtoint would be kind of pointless. + auto HighAddr = B.buildConstant( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); + B.buildMerge(Dst, {Src, HighAddr}); + MI.eraseFromParent(); + return true; + } - B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); + LLVMContext &Ctx = MF.getFunction().getContext(); + Ctx.diagnose(InvalidAddrSpaceCast); + B.buildUndef(Dst); MI.eraseFromParent(); return true; } @@ -2811,6 +2902,298 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( return true; } +// Build a big integer multiply or multiply-add using MAD_64_32 instructions. +// +// Source and accumulation registers must all be 32-bits. +// +// TODO: When the multiply is uniform, we should produce a code sequence +// that is better suited to instruction selection on the SALU. 
Instead of +// the outer loop going over parts of the result, the outer loop should go +// over parts of one of the factors. This should result in instruction +// selection that makes full use of S_ADDC_U32 instructions. +void AMDGPULegalizerInfo::buildMultiply( + LegalizerHelper &Helper, MutableArrayRef Accum, + ArrayRef Src0, ArrayRef Src1, + bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { + // Use (possibly empty) vectors of S1 registers to represent the set of + // carries from one pair of positions to the next. + using Carry = SmallVector; + + MachineIRBuilder &B = Helper.MIRBuilder; + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + + Register Zero32; + Register Zero64; + + auto getZero32 = [&]() -> Register { + if (!Zero32) + Zero32 = B.buildConstant(S32, 0).getReg(0); + return Zero32; + }; + auto getZero64 = [&]() -> Register { + if (!Zero64) + Zero64 = B.buildConstant(S64, 0).getReg(0); + return Zero64; + }; + + // Merge the given carries into the 32-bit LocalAccum, which is modified + // in-place. + // + // Returns the carry-out, which is a single S1 register or null. + auto mergeCarry = + [&](Register &LocalAccum, const Carry &CarryIn) -> Register { + if (CarryIn.empty()) + return Register(); + + bool HaveCarryOut = true; + Register CarryAccum; + if (CarryIn.size() == 1) { + if (!LocalAccum) { + LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); + return Register(); + } + + CarryAccum = getZero32(); + } else { + CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); + for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { + CarryAccum = + B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) + .getReg(0); + } + + if (!LocalAccum) { + LocalAccum = getZero32(); + HaveCarryOut = false; + } + } + + auto Add = + B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); + LocalAccum = Add.getReg(0); + return HaveCarryOut ? Add.getReg(1) : Register(); + }; + + // Build a multiply-add chain to compute + // + // LocalAccum + (partial products at DstIndex) + // + (opportunistic subset of CarryIn) + // + // LocalAccum is an array of one or two 32-bit registers that are updated + // in-place. The incoming registers may be null. + // + // In some edge cases, carry-ins can be consumed "for free". In that case, + // the consumed carry bits are removed from CarryIn in-place. + auto buildMadChain = + [&](MutableArrayRef LocalAccum, unsigned DstIndex, Carry &CarryIn) + -> Carry { + assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || + (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); + + Carry CarryOut; + unsigned j0 = 0; + + // Use plain 32-bit multiplication for the most significant part of the + // result by default. + if (LocalAccum.size() == 1 && + (!UsePartialMad64_32 || !CarryIn.empty())) { + do { + unsigned j1 = DstIndex - j0; + auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); + if (!LocalAccum[0]) { + LocalAccum[0] = Mul.getReg(0); + } else { + if (CarryIn.empty()) { + LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); + } else { + LocalAccum[0] = + B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) + .getReg(0); + CarryIn.pop_back(); + } + } + ++j0; + } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); + } + + // Build full 64-bit multiplies. 
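+    // Each G_AMDGPU_MAD_U64_U32 in the chain computes
+    // Src0[j0] * Src1[j1] + Tmp as a full 64-bit value with a 1-bit
+    // carry-out. For example, a 64 x 64 -> 64 multiply split into 32-bit
+    // parts (a1:a0, b1:b0) evaluates
+    //   Accum[1]:Accum[0] = a0*b0 + ((a0*b1 + a1*b0) << 32)  (mod 2^64),
+    // folding each partial product into the accumulator with one MAD.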
+ if (j0 <= DstIndex) { + bool HaveSmallAccum = false; + Register Tmp; + + if (LocalAccum[0]) { + if (LocalAccum.size() == 1) { + Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); + HaveSmallAccum = true; + } else if (LocalAccum[1]) { + Tmp = B.buildMerge(S64, LocalAccum).getReg(0); + HaveSmallAccum = false; + } else { + Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); + HaveSmallAccum = true; + } + } else { + assert(LocalAccum.size() == 1 || !LocalAccum[1]); + Tmp = getZero64(); + HaveSmallAccum = true; + } + + do { + unsigned j1 = DstIndex - j0; + auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, + {Src0[j0], Src1[j1], Tmp}); + Tmp = Mad.getReg(0); + if (!HaveSmallAccum) + CarryOut.push_back(Mad.getReg(1)); + HaveSmallAccum = false; + ++j0; + } while (j0 <= DstIndex); + + auto Unmerge = B.buildUnmerge(S32, Tmp); + LocalAccum[0] = Unmerge.getReg(0); + if (LocalAccum.size() > 1) + LocalAccum[1] = Unmerge.getReg(1); + } + + return CarryOut; + }; + + // Outer multiply loop, iterating over destination parts from least + // significant to most significant parts. + // + // The columns of the following diagram correspond to the destination parts + // affected by one iteration of the outer loop (ignoring boundary + // conditions). + // + // Dest index relative to 2 * i: 1 0 -1 + // ------ + // Carries from previous iteration: e o + // Even-aligned partial product sum: E E . + // Odd-aligned partial product sum: O O + // + // 'o' is OddCarry, 'e' is EvenCarry. + // EE and OO are computed from partial products via buildMadChain and use + // accumulation where possible and appropriate. + // + Register SeparateOddCarry; + Carry EvenCarry; + Carry OddCarry; + + for (unsigned i = 0; i <= Accum.size() / 2; ++i) { + Carry OddCarryIn = std::move(OddCarry); + Carry EvenCarryIn = std::move(EvenCarry); + OddCarry.clear(); + EvenCarry.clear(); + + // Partial products at offset 2 * i. + if (2 * i < Accum.size()) { + auto LocalAccum = Accum.drop_front(2 * i).take_front(2); + EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); + } + + // Partial products at offset 2 * i - 1. + if (i > 0) { + if (!SeparateOddAlignedProducts) { + auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); + OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); + } else { + bool IsHighest = 2 * i >= Accum.size(); + Register SeparateOddOut[2]; + auto LocalAccum = makeMutableArrayRef(SeparateOddOut) + .take_front(IsHighest ? 1 : 2); + OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); + + MachineInstr *Lo; + + if (i == 1) { + if (!IsHighest) + Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); + else + Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); + } else { + Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], + SeparateOddCarry); + } + Accum[2 * i - 1] = Lo->getOperand(0).getReg(); + + if (!IsHighest) { + auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], + Lo->getOperand(1).getReg()); + Accum[2 * i] = Hi.getReg(0); + SeparateOddCarry = Hi.getReg(1); + } + } + } + + // Add in the carries from the previous iteration + if (i > 0) { + if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) + EvenCarryIn.push_back(CarryOut); + + if (2 * i < Accum.size()) { + if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) + OddCarry.push_back(CarryOut); + } + } + } +} + +// Custom narrowing of wide multiplies using wide multiply-add instructions. 
+// +// TODO: If the multiply is followed by an addition, we should attempt to +// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. +bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, + MachineInstr &MI) const { + assert(ST.hasMad64_32()); + assert(MI.getOpcode() == TargetOpcode::G_MUL); + + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + + LLT Ty = MRI.getType(DstReg); + assert(Ty.isScalar()); + + unsigned Size = Ty.getSizeInBits(); + unsigned NumParts = Size / 32; + assert((Size % 32) == 0); + assert(NumParts >= 2); + + // Whether to use MAD_64_32 for partial products whose high half is + // discarded. This avoids some ADD instructions but risks false dependency + // stalls on some subtargets in some cases. + const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; + + // Whether to compute odd-aligned partial products separately. This is + // advisable on subtargets where the accumulator of MAD_64_32 must be placed + // in an even-aligned VGPR. + const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); + + LLT S32 = LLT::scalar(32); + SmallVector Src0Parts, Src1Parts; + for (unsigned i = 0; i < NumParts; ++i) { + Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); + Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); + } + B.buildUnmerge(Src0Parts, Src0); + B.buildUnmerge(Src1Parts, Src1); + + SmallVector AccumRegs(NumParts); + buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, + SeparateOddAlignedProducts); + + B.buildMerge(DstReg, AccumRegs); + MI.eraseFromParent(); + return true; + +} + // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input // case with a single min instruction instead of a compare+select. @@ -2954,6 +3337,89 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( return true; } +static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, + int64_t C) { + B.buildConstant(MI.getOperand(0).getReg(), C); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); + if (MaxID == 0) + return replaceWithConstant(B, MI, 0); + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); + const ArgDescriptor *Arg; + const TargetRegisterClass *ArgRC; + LLT ArgTy; + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + Register DstReg = MI.getOperand(0).getReg(); + if (!Arg) { + // It's undefined behavior if a function marked with the amdgpu-no-* + // attributes uses the corresponding intrinsic. + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; + } + + if (Arg->isMasked()) { + // Don't bother inserting AssertZext for packed IDs since we're emitting the + // masking operations anyway. + // + // TODO: We could assert the top bit is 0 for the source copy. 
+ if (!loadInputValue(DstReg, B, ArgType)) + return false; + } else { + Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + if (!loadInputValue(TmpReg, B, ArgType)) + return false; + B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); + } + + MI.eraseFromParent(); + return true; +} + +Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, + int64_t Offset) const { + LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); + + // TODO: If we passed in the base kernel offset we could have a better + // alignment than 4, but we don't really need it. + if (!loadInputValue(KernArgReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + llvm_unreachable("failed to find kernarg segment ptr"); + + auto COffset = B.buildConstant(LLT::scalar(64), Offset); + // TODO: Should get nuw + return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); +} + +/// Legalize a value that's loaded from kernel arguments. This is only used by +/// legacy intrinsics. +bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, + MachineIRBuilder &B, + uint64_t Offset, + Align Alignment) const { + Register DstReg = MI.getOperand(0).getReg(); + + assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && + "unexpected kernarg parameter type"); + + Register Ptr = getKernargParameterPtr(B, Offset); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -3688,9 +4154,9 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, // The remaining operands were used to set fields in the MemOperand on // construction. for (int I = 6; I > 3; --I) - MI.RemoveOperand(I); + MI.removeOperand(I); - MI.RemoveOperand(1); // Remove the intrinsic ID. + MI.removeOperand(1); // Remove the intrinsic ID. Observer.changedInstr(MI); return true; } @@ -4359,7 +4825,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, /// /// We don't want to directly select image instructions just yet, but also want /// to exposes all register repacking to the legalizer/combiners. We also don't -/// want a selected instrution entering RegBankSelect. In order to avoid +/// want a selected instruction entering RegBankSelect. In order to avoid /// defining a multitude of intermediate image instructions, directly hack on /// the intrinsic's arguments. In cases like a16 addresses, this requires /// padding now unnecessary arguments with $noreg. @@ -4508,6 +4974,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. + // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && CorrectedNumVAddrs <= ST.getNSAMaxSize(); @@ -4607,7 +5077,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return false; // TODO: Make sure the TFE operand bit is set. - MI.RemoveOperand(1); + MI.removeOperand(1); // Handle the easy case that requires no repack instructions. 
if (Ty == S32) { @@ -4737,7 +5207,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); - MI.RemoveOperand(1); // Remove intrinsic ID + MI.removeOperand(1); // Remove intrinsic ID // FIXME: When intrinsic definition is fixed, this should have an MMO already. // TODO: Should this use datalayout alignment? @@ -4797,6 +5267,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm( bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + const LLT S64 = LLT::scalar(64); + + Register SGPR01(AMDGPU::SGPR0_SGPR1); + // For code object version 5, queue_ptr is passed through implicit kernarg. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + AMDGPUTargetLowering::QUEUE_PTR; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return false; + + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(64), commonAlignment(Align(64), Offset)); + + // Pointer address + Register LoadAddr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + B.buildPtrAdd(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + // Load address + Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); + B.buildCopy(SGPR01, Temp); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)) + .addReg(SGPR01, RegState::Implicit); + MI.eraseFromParent(); + return true; + } + // Pass queue pointer to trap handler as input, and insert trap instruction // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi Register LiveIn = @@ -4804,7 +5315,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return false; - Register SGPR01(AMDGPU::SGPR0_SGPR1); B.buildCopy(SGPR01, LiveIn); B.buildInstr(AMDGPU::S_TRAP) .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)) @@ -4848,6 +5358,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI = *B.getMRI(); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); + const LLT V2S16 = LLT::fixed_vector(2, 16); + const LLT V3S32 = LLT::fixed_vector(3, 32); Register DstReg = MI.getOperand(0).getReg(); Register NodePtr = MI.getOperand(2).getReg(); @@ -4865,61 +5377,98 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return false; } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = - ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 
4 : 5) : NumVAddrDwords; + const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; int Opcode; if (UseNSA) { - Opcode = - AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, - NumVDataDwords, NumVAddrDwords); - } else { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); + } else { + Opcode = AMDGPU::getMIMGOpcode( + BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); SmallVector Ops; - if (Is64) { - auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - } else { + if (UseNSA && IsGFX11Plus) { + auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + auto Merged = B.buildMerge( + V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); + Ops.push_back(Merged.getReg(0)); + }; + Ops.push_back(NodePtr); - } - Ops.push_back(RayExtent); + Ops.push_back(RayExtent); + packLanes(RayOrigin); + + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + auto MergedDir = B.buildMerge( + V3S32, + {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0), + UnmergeRayDir.getReg(0)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1), + UnmergeRayDir.getReg(1)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2), + UnmergeRayDir.getReg(2)})) + .getReg(0)}); + Ops.push_back(MergedDir.getReg(0)); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } + } else { + if (Is64) { + auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + } else { + Ops.push_back(NodePtr); + } + Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B](Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - Ops.push_back(Unmerge.getReg(2)); - }; + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + Ops.push_back(Unmerge.getReg(2)); + }; - packLanes(RayOrigin); - if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); - Register R1 = MRI.createGenericVirtualRegister(S32); - Register R2 = MRI.createGenericVirtualRegister(S32); - Register R3 = MRI.createGenericVirtualRegister(S32); - B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); - B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); - B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); - Ops.push_back(R1); - Ops.push_back(R2); - Ops.push_back(R3); - } else { - packLanes(RayDir); - packLanes(RayInvDir); + packLanes(RayOrigin); + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + 
auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + Register R1 = MRI.createGenericVirtualRegister(S32); + Register R2 = MRI.createGenericVirtualRegister(S32); + Register R3 = MRI.createGenericVirtualRegister(S32); + B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); + B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); + B.buildMerge(R3, + {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); + Ops.push_back(R1); + Ops.push_back(R2); + Ops.push_back(R3); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } } if (!UseNSA) { @@ -4946,9 +5495,24 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } -static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { - B.buildConstant(MI.getOperand(0).getReg(), C); +bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, + MachineIRBuilder &B) const { + unsigned Opc; + int RoundMode = MI.getOperand(2).getImm(); + + if (RoundMode == (int)RoundingMode::TowardPositive) + Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; + else if (RoundMode == (int)RoundingMode::TowardNegative) + Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; + else + return false; + + B.buildInstr(Opc) + .addDef(MI.getOperand(0).getReg()) + .addUse(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return true; } @@ -5055,22 +5619,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) - return replaceWithConstant(B, MI, 0); - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_X); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, + AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); @@ -5092,6 +5648,31 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::r600_read_ngroups_x: + // TODO: Emit error for hsa + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_X); + case Intrinsic::r600_read_ngroups_y: + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_Y); + case Intrinsic::r600_read_ngroups_z: + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_Z); + case Intrinsic::r600_read_local_size_x: + // TODO: Could insert G_ASSERT_ZEXT from s16 + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); + case Intrinsic::r600_read_local_size_y: + // TODO: Could insert G_ASSERT_ZEXT from s16 + return legalizeKernargMemParameter(MI, B, 
SI::KernelInputOffsets::LOCAL_SIZE_Y); + // TODO: Could insert G_ASSERT_ZEXT from s16 + case Intrinsic::r600_read_local_size_z: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); + case Intrinsic::r600_read_global_size_x: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); + case Intrinsic::r600_read_global_size_y: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); + case Intrinsic::r600_read_global_size_z: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); case Intrinsic::amdgcn_fdiv_fast: return legalizeFDIVFastIntrin(MI, MRI, B); case Intrinsic::amdgcn_is_shared: @@ -5157,7 +5738,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { Register DstReg = MI.getOperand(0).getReg(); - if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) { + if (!MRI.use_empty(DstReg) && + !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { Function &F = B.getMF().getFunction(); DiagnosticInfoUnsupported NoFpRet( F, "return versions of fp atomics not supported", B.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 964a41d3d740..cee533aa34ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -88,6 +88,12 @@ public: bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + + void buildMultiply(LegalizerHelper &Helper, MutableArrayRef Accum, + ArrayRef Src0, ArrayRef Src1, + bool UsePartialMad64_32, + bool SeparateOddAlignedProducts) const; + bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -96,9 +102,18 @@ public: const TargetRegisterClass *ArgRC, LLT ArgTy) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizePreloadedArgIntrin( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeWorkitemIDIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + + Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const; + bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, + uint64_t Offset, + Align Alignment = Align(4)) const; bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -169,6 +184,8 @@ public: bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index bbbadfdfd444..78e092b2e872 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1593,8 +1593,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { // max vector size is 16, and sincos will generate two results. 
double DVal0[16], DVal1[16]; + int FuncVecSize = getVecSize(FInfo); bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); - if (getVecSize(FInfo) == 1) { + if (FuncVecSize == 1) { if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1, copr2)) { return false; @@ -1603,7 +1604,7 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { ConstantDataVector *CDV0 = dyn_cast_or_null(copr0); ConstantDataVector *CDV1 = dyn_cast_or_null(copr1); ConstantDataVector *CDV2 = dyn_cast_or_null(copr2); - for (int i=0; i < getVecSize(FInfo); ++i) { + for (int i = 0; i < FuncVecSize; ++i) { Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr; @@ -1616,19 +1617,19 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { LLVMContext &context = CI->getParent()->getParent()->getContext(); Constant *nval0, *nval1; - if (getVecSize(FInfo) == 1) { + if (FuncVecSize == 1) { nval0 = ConstantFP::get(CI->getType(), DVal0[0]); if (hasTwoResults) nval1 = ConstantFP::get(CI->getType(), DVal1[0]); } else { if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector FVal0, FVal1; - for (int i=0; i < getVecSize(FInfo); ++i) + for (int i = 0; i < FuncVecSize; ++i) FVal0.push_back((float)DVal0[i]); ArrayRef tmp0(FVal0); nval0 = ConstantDataVector::get(context, tmp0); if (hasTwoResults) { - for (int i=0; i < getVecSize(FInfo); ++i) + for (int i = 0; i < FuncVecSize; ++i) FVal1.push_back((float)DVal1[i]); ArrayRef tmp1(FVal1); nval1 = ConstantDataVector::get(context, tmp1); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index dc0ac72016f3..bf0fda25b2c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -324,8 +324,8 @@ public: class AMDGPULibFuncImpl : public AMDGPULibFuncBase { public: - AMDGPULibFuncImpl() {} - virtual ~AMDGPULibFuncImpl() {} + AMDGPULibFuncImpl() = default; + virtual ~AMDGPULibFuncImpl() = default; /// Get unmangled name for mangled library function and name for unmangled /// library function. 
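The evaluateCall changes above only hoist the loop-invariant getVecSize(FInfo) lookup into FuncVecSize; the folding itself is unchanged. As a rough, self-contained sketch of that per-lane folding, with std::vector standing in for the ConstantDataVector lanes and sincos chosen because it is the one case that yields two results:

    #include <cmath>
    #include <vector>

    // Hypothetical simplification of the per-lane constant folding in
    // evaluateCall: each lane of the constant vector operand is evaluated
    // on the host, and the scalar results are collected so they can be
    // reassembled into constant vectors afterwards.
    struct Folded {
      std::vector<double> Val0; // primary result per lane
      std::vector<double> Val1; // second result, sincos only
    };

    static Folded foldSincosLanes(const std::vector<double> &Lanes) {
      Folded R;
      for (double L : Lanes) {         // one host evaluation per lane,
        R.Val0.push_back(std::sin(L)); // mirroring evaluateScalarMathFunc
        R.Val1.push_back(std::cos(L));
      }
      return R;
    }

Called with the lanes of a constant vector operand, this yields the two per-lane result arrays that the real code then materializes as constant vectors.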
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index b700dd5aa301..93d1eed2cf63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -13,7 +13,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" @@ -156,11 +155,8 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) { Changed = true; break; - case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: - case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: case Intrinsic::r600_read_local_size_x: case Intrinsic::r600_read_local_size_y: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index c34c12ab9fec..2e5c35f1f571 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -73,7 +73,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); Align MaxAlign; - // FIXME: Alignment is broken broken with explicit arg offset.; + // FIXME: Alignment is broken with explicit arg offset.; const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) return false; @@ -92,9 +92,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); - MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None; - if (!ABITypeAlign) - ABITypeAlign = DL.getABITypeAlign(ArgTy); + MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None; + Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); uint64_t Size = DL.getTypeSizeInBits(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 08a1b970648d..f5903b3afb81 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -163,39 +163,29 @@ static bool processUse(CallInst *CI) { if (!GroupSize || !GridSize) continue; + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic() + : (I == 1 ? m_Intrinsic() + : m_Intrinsic()); + for (User *U : GroupSize->users()) { auto *ZextGroupSize = dyn_cast(U); if (!ZextGroupSize) continue; - for (User *ZextUser : ZextGroupSize->users()) { - auto *SI = dyn_cast(ZextUser); - if (!SI) - continue; - - using namespace llvm::PatternMatch; - auto GroupIDIntrin = I == 0 ? - m_Intrinsic() : - (I == 1 ? 
m_Intrinsic() : - m_Intrinsic()); - - auto SubExpr = m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))); - - ICmpInst::Predicate Pred; - if (match(SI, - m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)), - SubExpr, - m_Specific(ZextGroupSize))) && - Pred == ICmpInst::ICMP_ULT) { + for (User *UMin : ZextGroupSize->users()) { + if (match(UMin, + m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { if (HasReqdWorkGroupSize) { ConstantInt *KnownSize = mdconst::extract(MD->getOperand(I)); - SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize, - SI->getType(), - false)); + UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast( + KnownSize, UMin->getType(), false)); } else { - SI->replaceAllUsesWith(ZextGroupSize); + UMin->replaceAllUsesWith(ZextGroupSize); } MadeChange = true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 6e2b5dc471bc..35922341de26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -14,7 +14,7 @@ // known address. AMDGPUMachineFunction allocates the LDS global. // // Local variables with constant annotation or non-undef initializer are passed -// through unchanged for simplication or error diagnostics in later passes. +// through unchanged for simplification or error diagnostics in later passes. // // To reduce the memory overhead variables that are only used by kernels are // excluded from this transform. The analysis to determine whether a variable @@ -28,8 +28,9 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPULDSUtils.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -163,9 +164,10 @@ public: } bool runOnModule(Module &M) override { + CallGraph CG = CallGraph(M); UsedList = getUsedList(M); bool Changed = superAlignLDSGlobals(M); - Changed |= processUsedLDS(M); + Changed |= processUsedLDS(CG, M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -174,7 +176,7 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; - Changed |= processUsedLDS(M, &F); + Changed |= processUsedLDS(CG, M, &F); } UsedList.clear(); @@ -226,7 +228,7 @@ private: return Changed; } - bool processUsedLDS(Module &M, Function *F = nullptr) { + bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -374,7 +376,20 @@ private: IRBuilder<> Builder(Ctx); for (Function &Func : M.functions()) { if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - markUsedByKernel(Builder, &Func, SGV); + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. 
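+          // For example, a leaf kernel, where CG[&Func]->size() == 0, takes
+          // this path and the module LDS struct is never allocated for it.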
+ Func.addFnAttr("amdgpu-elide-module-lds"); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 3fad7e192195..ed6ddbf426fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -120,8 +120,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to // do that with a single pseudo source operation. - if (Opcode == AMDGPU::S_SETPC_B64_return || - Opcode == AMDGPU::S_SETPC_B64_return_gfx) + if (Opcode == AMDGPU::S_SETPC_B64_return) Opcode = AMDGPU::S_SETPC_B64; else if (Opcode == AMDGPU::SI_CALL) { // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the @@ -208,6 +207,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { + if (isVerbose()) { + std::string HexString; + raw_string_ostream HexStream(HexString); + HexStream << format_hex(MI->getOperand(0).getImm(), 10, true); + OutStreamer->emitRawComment(" sched_barrier mask(" + HexString + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); @@ -240,7 +249,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { raw_svector_ostream CodeStream(CodeBytes); std::unique_ptr InstEmitter(createSIMCCodeEmitter( - *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext)); + *STI.getInstrInfo(), OutContext)); InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI); assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h index 0e43b4fe9461..5c656f158e71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -1,4 +1,4 @@ -//===- AMDGPUMCInstLower.h - Lower AMDGPU MachineInstr to an MCInst -------===// +//===- AMDGPUMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
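For reference, the SCHED_BARRIER comment emission above boils down to rendering the mask operand with format_hex and wrapping it in an assembly comment. A standalone sketch, with schedBarrierComment as a hypothetical helper name:

    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    // format_hex(Mask, 10, true) renders a 32-bit mask as e.g. "0x0000000F";
    // the width of 10 counts the "0x" prefix, and 'true' selects uppercase
    // hex digits.
    static std::string schedBarrierComment(int64_t Mask) {
      std::string HexString;
      llvm::raw_string_ostream HexStream(HexString);
      HexStream << llvm::format_hex(Mask, 10, /*Upper=*/true);
      return " sched_barrier mask(" + HexStream.str() + ")";
    }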
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index c3441f81a78e..0712466a0e88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -21,17 +21,18 @@ bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue( StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const AMDGPUTargetMachine &TM = + static_cast(MF.getTarget()); if (Src == "BufferResource") { - PSV = MFI->getBufferPSV(TII); + PSV = MFI->getBufferPSV(TM); return false; } if (Src == "ImageResource") { - PSV = MFI->getImagePSV(TII); + PSV = MFI->getImagePSV(TM); return false; } if (Src == "GWSResource") { - PSV = MFI->getGWSPSV(TII); + PSV = MFI->getGWSPSV(TM); return false; } llvm_unreachable("unknown MIR custom pseudo source value"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 47faa6c72481..753f7edc9385 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -25,7 +25,7 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: - AMDGPUMIRFormatter() {} + AMDGPUMIRFormatter() = default; virtual ~AMDGPUMIRFormatter() = default; /// Implement target specific parsing of target custom pseudo source value. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 4e2f98d2a5db..d837f8cb2f60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1295,7 +1295,7 @@ static void fixRegionTerminator(RegionMRT *Region) { } } -// If a region region is just a sequence of regions (and the exit +// If a region is just a sequence of regions (and the exit // block in the case of the top level region), we can simply skip // linearizing it, because it is already linear bool regionIsSequence(RegionMRT *Region) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 593388a4d819..b461c3c4bfdc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -32,6 +33,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter"); WaveLimiter = WaveLimitAttr.getValueAsBool(); + // FIXME: How is this attribute supposed to interact with statically known + // global sizes? + StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); + + // Assume the attribute allocates before any known GDS globals. 
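+  // For example, "amdgpu-gds-size"="1024" reserves GDS bytes [0, 1024),
+  // and GDS globals allocated in allocateLDSGlobal are placed after it.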
+ StaticGDSSize = GDSSize; + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); @@ -46,25 +56,43 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. - unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); + unsigned Offset; + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); - Entry.first->second = Offset; - StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); - // Update the LDS size considering the padding to align the dynamic shared - // memory. - LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + // Update the LDS size considering the padding to align the dynamic shared + // memory. + LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + } else { + assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS && + "expected region address space"); + Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment); + StaticGDSSize += DL.getTypeAllocSize(GV.getValueType()); + + // FIXME: Apply alignment of dynamic GDS + GDSSize = StaticGDSSize; + } + + Entry.first->second = Offset; return Offset; } -void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) { +// This kernel calls no functions that require the module lds struct +static bool canElideModuleLDS(const Function &F) { + return F.hasFnAttribute("amdgpu-elide-module-lds"); +} + +void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) { + const Module *M = F.getParent(); if (isModuleEntryFunction()) { const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds"); - if (GV) { + if (GV && !canElideModuleLDS(F)) { unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV); (void)Offset; assert(Offset == 0 && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 48cf46b5f871..df62c2314617 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -12,6 +12,10 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Function.h" namespace llvm { @@ -25,11 +29,13 @@ protected: Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. - unsigned LDSSize = 0; + uint32_t LDSSize = 0; + uint32_t GDSSize = 0; /// Number of bytes in the LDS allocated statically. This field is only used /// in the instruction selector and not part of the machine function info. - unsigned StaticLDSSize = 0; + uint32_t StaticLDSSize = 0; + uint32_t StaticGDSSize = 0; /// Align for dynamic shared memory if any. Dynamic shared memory is /// allocated directly after the static one, i.e., LDSSize. 
Need to pad @@ -63,12 +69,16 @@ public: return ExplicitKernArgSize; } - unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); } + Align getMaxKernArgAlign() const { return MaxKernArgAlign; } - unsigned getLDSSize() const { + uint32_t getLDSSize() const { return LDSSize; } + uint32_t getGDSSize() const { + return GDSSize; + } + AMDGPU::SIModeRegisterDefaults getMode() const { return Mode; } @@ -92,7 +102,7 @@ public: } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); - void allocateModuleLDSGlobal(const Module *M); + void allocateModuleLDSGlobal(const Function &F); Align getDynLDSAlign() const { return DynLDSAlign; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 6646cce8186b..2d48be9ea542 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMachineModuleInfo.h" +#include "llvm/MC/MCSymbol.h" namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 5a5a5d213a1a..fb7709d66c76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -34,6 +34,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" @@ -71,7 +72,7 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } -/// Collect direct or indrect callers of \p F and save them +/// Collect direct or indirect callers of \p F and save them /// to \p Callers. 
static void collectCallers(Function *F, DenseSet &Callers) { for (auto U : F->users()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 8ad344816ad2..09dbd2150db6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -116,7 +116,6 @@ private: bool isGlobalAddr(const Value *V) const; bool isLocalAddr(const Value *V) const; - bool isConstantAddr(const Value *V) const; }; static std::pair getMemoryInstrPtrAndType( @@ -153,7 +152,7 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { if (auto LD = dyn_cast(V)) { auto M = LD->getPointerOperand(); - if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) { + if (isGlobalAddr(M)) { LLVM_DEBUG(dbgs() << " is IA\n"); return true; } @@ -267,19 +266,23 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { << " LSMInst cost: " << Info->LSMInstCost << '\n' << " TotalInst cost: " << Info->InstCost << '\n'); + bool Changed = false; + if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); NumMemBound++; F.addFnAttr("amdgpu-memory-bound", "true"); + Changed = true; } if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); NumLimitWave++; F.addFnAttr("amdgpu-wave-limiter", "true"); + Changed = true; } - return true; + return Changed; } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { @@ -332,15 +335,6 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { return MAI; } -bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { - if (auto PT = dyn_cast(V->getType())) { - unsigned As = PT->getAddressSpace(); - return As == AMDGPUAS::CONSTANT_ADDRESS || - As == AMDGPUAS::CONSTANT_ADDRESS_32BIT; - } - return false; -} - bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( MemAccessInfo &Reference) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index c029046ab65f..bfe2e9b66ed4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPULegalizerInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -125,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( LLT::scalar(64)); const LLT S32 = LLT::scalar(32); - B.setMBB(*MI.getParent()); B.setInstrAndDebugLoc(MI); auto Unmerge = B.buildUnmerge(S32, Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index f91f31508ad2..1db7c18e4598 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Dominators.h" @@ -66,7 +67,7 @@ private: Value *simplify(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return SimplifyInstruction(I, {*TD, TLI, DT}); + return simplifyInstruction(I, 
{*TD, TLI, DT}); } const DataLayout *TD; @@ -562,15 +563,6 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) { if (Printfs.empty()) return false; - if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) { - for (auto &U : HostcallFunction->uses()) { - if (auto *CI = dyn_cast(U.getUser())) { - M.getContext().emitError( - CI, "Cannot use both printf and hostcall in the same module"); - } - } - } - TD = &M.getDataLayout(); return lowerPrintfForGpu(M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 99b7ffb33884..5a4426ba8113 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) { ArrayTy->getNumElements()); } -static Value *stripBitcasts(Value *V) { - while (Instruction *I = dyn_cast(V)) { - if (I->getOpcode() != Instruction::BitCast) - break; - V = I->getOperand(0); - } - return V; -} - static Value * calculateVectorIndex(Value *Ptr, const std::map &GEPIdx) { - GetElementPtrInst *GEP = dyn_cast(stripBitcasts(Ptr)); + auto *GEP = dyn_cast(Ptr->stripPointerCasts()); if (!GEP) - return nullptr; + return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); auto I = GEPIdx.find(GEP); - return I == GEPIdx.end() ? nullptr : I->second; + assert(I != GEPIdx.end() && "Must have entry for GEP!"); + return I->second; } -static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { - // FIXME we only support simple cases - if (GEP->getNumOperands() != 3) +static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { + // TODO: Extracting a "multiple of X" from a GEP might be a useful generic + // helper. + unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); + MapVector VarOffsets; + APInt ConstOffset(BW, 0); + if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || + !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) return nullptr; - ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); - if (!I0 || !I0->isZero()) + unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy); + if (VarOffsets.size() > 1) return nullptr; - return GEP->getOperand(2); -} - -// Not an instruction handled below to turn into a vector. -// -// TODO: Check isTriviallyVectorizable for calls and handle other -// instructions. -static bool canVectorizeInst(Instruction *Inst, User *User, - const DataLayout &DL) { - switch (Inst->getOpcode()) { - case Instruction::Load: { - // Currently only handle the case where the Pointer Operand is a GEP. - // Also we could not vectorize volatile or atomic loads. - LoadInst *LI = cast(Inst); - if (isa(User) && - LI->getPointerOperandType() == User->getType() && - isa(LI->getType())) - return true; - - Instruction *PtrInst = dyn_cast(LI->getPointerOperand()); - if (!PtrInst) - return false; - - return (PtrInst->getOpcode() == Instruction::GetElementPtr || - PtrInst->getOpcode() == Instruction::BitCast) && - LI->isSimple(); + if (VarOffsets.size() == 1) { + // Only handle cases where we don't need to insert extra arithmetic + // instructions. + const auto &VarOffset = VarOffsets.front(); + if (!ConstOffset.isZero() || VarOffset.second != VecElemSize) + return nullptr; + return VarOffset.first; } - case Instruction::BitCast: - return true; - case Instruction::Store: { - // Must be the stored pointer operand, not a stored value, plus - // since it should be canonical form, the User should be a GEP. 
- // Also we could not vectorize volatile or atomic stores. - StoreInst *SI = cast<StoreInst>(Inst); - if (isa<AllocaInst>(User) && - SI->getPointerOperandType() == User->getType() && - isa<VectorType>(SI->getValueOperand()->getType())) - return true; - - Instruction *UserInst = dyn_cast<Instruction>(User); - if (!UserInst) - return false; - return (SI->getPointerOperand() == User) && - (UserInst->getOpcode() == Instruction::GetElementPtr || - UserInst->getOpcode() == Instruction::BitCast) && - SI->isSimple(); - } - default: - return false; - } + APInt Quot; + uint64_t Rem; + APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem); + if (Rem != 0) + return nullptr; + + return ConstantInt::get(GEP->getContext(), Quot); } static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, @@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, } std::map<GetElementPtrInst *, Value *> GEPVectorIdx; - std::vector<Value *> WorkList; - SmallVector<User *, 8> Users(Alloca->users()); - SmallVector<User *, 8> UseUsers(Users.size(), Alloca); + SmallVector<Instruction *> WorkList; + SmallVector<Use *, 8> Uses; + for (Use &U : Alloca->uses()) + Uses.push_back(&U); + Type *VecEltTy = VectorTy->getElementType(); - while (!Users.empty()) { - User *AllocaUser = Users.pop_back_val(); - User *UseUser = UseUsers.pop_back_val(); - Instruction *Inst = dyn_cast<Instruction>(AllocaUser); - - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); - if (!GEP) { - if (!canVectorizeInst(Inst, UseUser, DL)) + while (!Uses.empty()) { + Use *U = Uses.pop_back_val(); + Instruction *Inst = dyn_cast<Instruction>(U->getUser()); + + if (Value *Ptr = getLoadStorePointerOperand(Inst)) { + // This is a store of the pointer, not to the pointer. + if (isa<StoreInst>(Inst) && + U->getOperandNo() != StoreInst::getPointerOperandIndex()) return false; - if (Inst->getOpcode() == Instruction::BitCast) { - Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType(); - Type *ToTy = Inst->getType()->getPointerElementType(); - if (FromTy->isAggregateType() || ToTy->isAggregateType() || - DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy)) - continue; - - for (User *CastUser : Inst->users()) { - if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser))) - continue; - Users.push_back(CastUser); - UseUsers.push_back(Inst); - } + Type *AccessTy = getLoadStoreType(Inst); + Ptr = Ptr->stripPointerCasts(); + // Alloca already accessed as vector, leave alone. + if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) == + DL.getTypeStoreSize(AccessTy)) continue; - } - WorkList.push_back(AllocaUser); + // Check that this is a simple access of a vector element. + bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() : cast<StoreInst>(Inst)->isSimple(); + if (!IsSimple || + !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL)) + return false; + + WorkList.push_back(Inst); continue; } - Value *Index = GEPToVectorIndex(GEP); + if (isa<BitCastInst>(Inst)) { + // Look through bitcasts. + for (Use &U : Inst->uses()) + Uses.push_back(&U); + continue; + } - // If we can't compute a vector index from this GEP, then we can't - // promote this alloca to vector. - if (!Index) { - LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP - << '\n'); - return false; + if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector.
+ Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL); + if (!Index) { + LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP + << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (Use &U : Inst->uses()) + Uses.push_back(&U); + continue; } - GEPVectorIdx[GEP] = Index; - Users.append(GEP->user_begin(), GEP->user_end()); - UseUsers.append(GEP->getNumUses(), GEP); + // Ignore assume-like intrinsics and comparisons used in assumes. + if (isAssumeLikeIntrinsic(Inst)) + continue; + + if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { + return isAssumeLikeIntrinsic(cast<Instruction>(U)); + })) + continue; + + // Unknown user. + return false; } LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); - for (Value *V : WorkList) { - Instruction *Inst = cast<Instruction>(V); + for (Instruction *Inst : WorkList) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy()) - break; - Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - if (!Index) - break; - - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); @@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(Inst); - if (SI->getValueOperand()->getType() == AllocaTy || - SI->getValueOperand()->getType()->isVectorTy()) - break; - Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - if (!Index) - break; - - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *Elt = SI->getValueOperand(); @@ -808,10 +778,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { // // FIXME: We should really do something to fix the addresses to a more optimal // value instead - llvm::sort(AllocatedSizes, [](std::pair<GlobalVariable *, Align> LHS, - std::pair<GlobalVariable *, Align> RHS) { - return LHS.second < RHS.second; - }); + llvm::sort(AllocatedSizes, llvm::less_second()); // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; @@ -917,7 +884,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory - // could could end up using more than the maximum due to alignment padding. + // could end up using more than the maximum due to alignment padding.
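+ // Illustrative numbers (not from the source): with CurrentLocalMemUsage =
+ // 700 and Alignment = 256, the alignTo below yields 768, i.e. 68 bytes of
+ // padding counted against the local-memory budget.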
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp index 01d03d17ec47..ed450f59e4b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp @@ -16,7 +16,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" @@ -30,6 +32,8 @@ namespace { class AMDGPUPromoteKernelArguments : public FunctionPass { MemorySSA *MSSA; + AliasAnalysis *AA; + Instruction *ArgCastInsertPt; SmallVector<Value *> Ptrs; @@ -38,16 +42,19 @@ class AMDGPUPromoteKernelArguments : public FunctionPass { bool promotePointer(Value *Ptr); + bool promoteLoad(LoadInst *LI); + public: static char ID; AMDGPUPromoteKernelArguments() : FunctionPass(ID) {} - bool run(Function &F, MemorySSA &MSSA); + bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA); bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemorySSAWrapperPass>(); AU.setPreservesAll(); } @@ -68,17 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { break; case Instruction::Load: { LoadInst *LD = cast<LoadInst>(U); - PointerType *PT = dyn_cast<PointerType>(LD->getType()); - if (!PT || - (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) || - LD->getPointerOperand()->stripInBoundsOffsets() != Ptr) - break; - const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD); - // TODO: This load poprobably can be promoted to constant address space. - if (MSSA->isLiveOnEntryDef(MA)) + if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr && + !AMDGPU::isClobberedInFunction(LD, MSSA, AA)) Ptrs.push_back(LD); + break; } case Instruction::GetElementPtr: @@ -92,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { } bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { - enqueueUsers(Ptr); + bool Changed = false; + + LoadInst *LI = dyn_cast<LoadInst>(Ptr); + if (LI) + Changed |= promoteLoad(LI); + + PointerType *PT = dyn_cast<PointerType>(Ptr->getType()); + if (!PT) + return Changed; + + if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + enqueueUsers(Ptr); - PointerType *PT = cast<PointerType>(Ptr->getType()); if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) - return false; + return Changed; - bool IsArg = isa<Argument>(Ptr); - IRBuilder<> B(IsArg ? ArgCastInsertPt - : &*std::next(cast<Instruction>(Ptr)->getIterator())); + IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator()) : ArgCastInsertPt); // Cast pointer to global address space and back to flat and let // Infer Address Spaces pass to do all necessary rewriting.
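The cast-and-cast-back idiom described in that comment is the heart of this pass, so a sketch may help. This is a minimal illustration only, assuming AMDGPU's address-space numbering (FLAT_ADDRESS = 0, GLOBAL_ADDRESS = 1) and a hypothetical helper name; it is not the pass's exact code:

  // Cast a flat pointer to the global address space and immediately back.
  // The cast pair is a no-op at runtime, but InferAddressSpaces can look
  // through it and rewrite downstream users to global addressing.
  static Value *hintGlobalAddressSpace(IRBuilder<> &B, Value *FlatPtr) {
    auto *FlatTy = cast<PointerType>(FlatPtr->getType());
    auto *GlobalTy = PointerType::getWithSamePointeeType(FlatTy, /*AddrSpace=*/1);
    Value *Global = B.CreateAddrSpaceCast(FlatPtr, GlobalTy);
    return B.CreateAddrSpaceCast(Global, FlatTy);
  }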
@@ -116,6 +127,14 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { return true; } +bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) { + if (!LI->isSimple()) + return false; + + LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {})); + return true; +} + // skip allocas static BasicBlock::iterator getInsertPt(BasicBlock &BB) { BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); @@ -131,7 +150,8 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) { return InsPt; } -bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { +bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA, + AliasAnalysis &AA) { if (skipFunction(F)) return false; @@ -141,6 +161,7 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { ArgCastInsertPt = &*getInsertPt(*F.begin()); this->MSSA = &MSSA; + this->AA = &AA; for (Argument &Arg : F.args()) { if (Arg.use_empty()) @@ -166,11 +187,13 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) { MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); - return run(F, MSSA); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + return run(F, MSSA, AA); } INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE, "AMDGPU Promote Kernel Arguments", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE, "AMDGPU Promote Kernel Arguments", false, false) @@ -185,7 +208,8 @@ PreservedAnalyses AMDGPUPromoteKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - if (AMDGPUPromoteKernelArguments().run(F, MSSA)) { + AliasAnalysis &AA = AM.getResult<AAManager>(F); + if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) { PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); PA.preserve<MemorySSAAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index de2dccef804a..0830cbd919a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -76,10 +76,11 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/RegisterBank.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define GET_TARGET_REGBANK_IMPL @@ -193,9 +194,7 @@ public: } AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) - : AMDGPUGenRegisterBankInfo(), - Subtarget(ST), - TRI(Subtarget.getRegisterInfo()), + : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. @@ -428,11 +427,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } -static bool memOpHasNoClobbered(const MachineMemOperand *MMO) { - const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue()); - return I && I->getMetadata("amdgpu.noclobber"); -} - // FIXME: Returns uniform if there's no source value information. This is // probably wrong. static bool isScalarLoadLegal(const MachineInstr &MI) { @@ -451,7 +445,7 @@ // spaces.
(IsConst || !MMO->isVolatile()) && // Memory must be known constant, or not written before this load. - (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && + (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && AMDGPUInstrInfo::isUniformMMO(MMO); } @@ -684,6 +678,62 @@ static LLT getHalfSizedType(LLT Ty) { return LLT::scalar(Ty.getScalarSizeInBits() / 2); } +// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector +// source value into a scalar register. +Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Src) const { + LLT Ty = MRI.getType(Src); + const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); + + if (Bank == &AMDGPU::SGPRRegBank) + return Src; + + unsigned Bits = Ty.getSizeInBits(); + assert(Bits % 32 == 0); + + if (Bank != &AMDGPU::VGPRRegBank) { + // We need to copy from AGPR to VGPR + Src = B.buildCopy(Ty, Src).getReg(0); + MRI.setRegBank(Src, AMDGPU::VGPRRegBank); + } + + LLT S32 = LLT::scalar(32); + unsigned NumParts = Bits / 32; + SmallVector SrcParts; + SmallVector DstParts; + + if (Bits == 32) { + SrcParts.push_back(Src); + } else { + auto Unmerge = B.buildUnmerge(S32, Src); + for (unsigned i = 0; i < NumParts; ++i) + SrcParts.push_back(Unmerge.getReg(i)); + } + + for (unsigned i = 0; i < NumParts; ++i) { + Register SrcPart = SrcParts[i]; + Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + MRI.setType(DstPart, NumParts == 1 ? Ty : S32); + + const TargetRegisterClass *Constrained = + constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); + (void)Constrained; + assert(Constrained && "Failed to constrain readfirstlane src reg"); + + B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); + + DstParts.push_back(DstPart); + } + + if (Bits == 32) + return DstParts[0]; + + Register Dst = B.buildMerge(Ty, DstParts).getReg(0); + MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); + return Dst; +} + /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If /// any of the required SGPR operands are VGPRs, perform a waterfall loop to /// execute the instruction for each unique combination of values in all lanes @@ -716,8 +766,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineFunction *MF = &B.getMF(); const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); - const unsigned WaveAndOpc = Subtarget.isWave32() ? - AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; const unsigned MovExecOpc = Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const unsigned MovExecTermOpc = @@ -747,16 +795,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF->insert(MBBI, LoopBB); + MF->insert(MBBI, BodyBB); MF->insert(MBBI, RestoreExecBB); MF->insert(MBBI, RemainderBB); - LoopBB->addSuccessor(RestoreExecBB); - LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); // Move the rest of the block into a new block. 
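+ // (CFG sketch for the new structure: MBB -> LoopBB -> BodyBB; BodyBB loops
+ // back to LoopBB while unprocessed lane values remain, otherwise it falls
+ // through to RestoreExecBB, which continues into RemainderBB.)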
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); @@ -768,27 +819,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); B.buildInstr(TargetOpcode::PHI) - .addDef(PhiExec) - .addReg(InitSaveExecReg) - .addMBB(&MBB) - .addReg(NewExec) - .addMBB(LoopBB); + .addDef(PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&MBB) + .addReg(NewExec) + .addMBB(BodyBB); const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); - // Move the instruction into the loop. Note we moved everything after + // Move the instruction into the loop body. Note we moved everything after // Range.end() already into a new block, so Range.end() is no longer valid. - LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); // Figure out the iterator range after splicing the instructions. MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); - auto NewEnd = LoopBB->end(); + auto NewEnd = BodyBB->end(); - MachineBasicBlock::iterator I = Range.begin(); - B.setInsertPt(*LoopBB, I); + B.setMBB(*LoopBB); + LLT S1 = LLT::scalar(1); Register CondReg; assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); @@ -819,164 +870,62 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setMBB(MBB); OpReg = B.buildCopy(OpTy, OpReg).getReg(0); MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); - B.setInstr(*I); + B.setMBB(*LoopBB); } - unsigned OpSize = OpTy.getSizeInBits(); + Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg); - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(OpReg); - - Register NewCondReg = MRI.createVirtualRegister(WaveRC); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(OpReg); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(WaveRC); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(WaveAndOpc) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } + // Build the comparison(s). + unsigned OpSize = OpTy.getSizeInBits(); + bool Is64 = OpSize % 64 == 0; + unsigned PartSize = Is64 ? 64 : 32; + LLT PartTy = LLT::scalar(PartSize); + unsigned NumParts = OpSize / PartSize; + SmallVector OpParts; + SmallVector CurrentLaneParts; + + if (NumParts == 1) { + OpParts.push_back(OpReg); + CurrentLaneParts.push_back(CurrentLaneReg); } else { - LLT S32 = LLT::scalar(32); - SmallVector ReadlanePieces; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - bool Is64 = OpSize % 64 == 0; - - unsigned UnmergeTySize = Is64 ? 64 : 32; - unsigned CmpOp = - Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64; - - // Insert the unmerge before the loop. 
- - B.setMBB(MBB); - unsigned NumPieces = OpSize / UnmergeTySize; - SmallVector UnmergePieces; - if (NumPieces == 1) { - UnmergePieces.push_back(OpReg); - } else { - LLT UnmergeTy = LLT::scalar(UnmergeTySize); - MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg); - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) - UnmergePieces.push_back(Unmerge.getReg(PieceIdx)); + auto UnmergeOp = B.buildUnmerge(PartTy, OpReg); + auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg); + for (unsigned i = 0; i < NumParts; ++i) { + OpParts.push_back(UnmergeOp.getReg(i)); + CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i)); + MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); + MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); } - B.setInstr(*I); - - for (Register UnmergePiece : UnmergePieces) { - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); - - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); - - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); - - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); - } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); - } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + } - Register NewCondReg = MRI.createVirtualRegister(WaveRC); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(WaveRC); - - // If there are multiple operands to consider, and the conditions. 
- B.buildInstr(WaveAndOpc) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } + for (unsigned i = 0; i < NumParts; ++i) { + auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i], + OpParts[i]).getReg(0); + MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); - } else if (ReadlanePieces.size() > 1) { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); + if (!CondReg) { + CondReg = CmpReg; } else { - Op.setReg(ReadlanePieces[0]); + CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0); + MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); } } + Op.setReg(CurrentLaneReg); + // Make sure we don't re-process this register again. WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg())); } } + // The ballot becomes a no-op during instruction selection. + CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, + {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}, + false) + .addReg(CondReg) + .getReg(0); + MRI.setRegClass(CondReg, WaveRC); + // Update EXEC, save the original EXEC value to VCC. B.buildInstr(AndSaveExecOpc) .addDef(NewExec) @@ -984,7 +933,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MRI.setSimpleHint(NewExec, CondReg); - B.setInsertPt(*LoopBB, LoopBB->end()); + B.setInsertPt(*BodyBB, BodyBB->end()); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) @@ -1064,28 +1013,10 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( if (Bank == &AMDGPU::SGPRRegBank) return; - LLT Ty = MRI.getType(Reg); MachineIRBuilder B(MI); - if (Bank != &AMDGPU::VGPRRegBank) { - // We need to copy from AGPR to VGPR - Reg = B.buildCopy(Ty, Reg).getReg(0); - MRI.setRegBank(Reg, AMDGPU::VGPRRegBank); - } - - Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) - .addDef(SGPR) - .addReg(Reg); - - MRI.setType(SGPR, Ty); - - const TargetRegisterClass *Constrained = - constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); - (void)Constrained; - assert(Constrained && "Failed to constrain readfirstlane src reg"); - - MI.getOperand(OpIdx).setReg(SGPR); + Reg = buildReadFirstLane(B, MRI, Reg); + MI.getOperand(OpIdx).setReg(Reg); } /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the @@ -1624,6 +1555,157 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, return true; } +bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( + const OperandsMapper &OpdMapper) const { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + // Insert basic copies. 
+ applyDefaultMapping(OpdMapper); + + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1 = MI.getOperand(3).getReg(); + Register Src2 = MI.getOperand(4).getReg(); + + if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) + return true; + + bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + LLT S1 = LLT::scalar(1); + LLT S32 = LLT::scalar(32); + + bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; + bool Accumulate = true; + + if (!DstOnValu) { + if (mi_match(Src2, MRI, m_ZeroInt())) + Accumulate = false; + } + + // Keep the multiplication on the SALU. + MachineIRBuilder B(MI); + + Register DstHi; + Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); + bool MulHiInVgpr = false; + + MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); + + if (Subtarget.hasSMulHi()) { + DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) + : B.buildSMulH(S32, Src0, Src1).getReg(0); + MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); + } else { + Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); + Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); + + MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); + MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); + + DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) + : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); + MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); + + if (!DstOnValu) { + DstHi = buildReadFirstLane(B, MRI, DstHi); + } else { + MulHiInVgpr = true; + } + } + + // Accumulate and produce the "carry-out" bit. + // + // The "carry-out" is defined as bit 64 of the result when computed as a + // big integer. For unsigned multiply-add, this matches the usual definition + // of carry-out. For signed multiply-add, bit 64 is the sign bit of the + // result, which is determined as: + // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add + LLT CarryType = DstOnValu ? S1 : S32; + const RegisterBank &CarryBank = + DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; + const RegisterBank &DstBank = + DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; + Register Carry; + Register Zero; + + if (!IsUnsigned) { + Zero = B.buildConstant(S32, 0).getReg(0); + MRI.setRegBank(Zero, + MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); + + Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) + .getReg(0); + MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank + : AMDGPU::SGPRRegBank); + + if (DstOnValu && !MulHiInVgpr) { + Carry = B.buildTrunc(S1, Carry).getReg(0); + MRI.setRegBank(Carry, AMDGPU::VCCRegBank); + } + } + + if (Accumulate) { + if (DstOnValu) { + DstLo = B.buildCopy(S32, DstLo).getReg(0); + DstHi = B.buildCopy(S32, DstHi).getReg(0); + MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); + MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); + } + + auto Unmerge = B.buildUnmerge(S32, Src2); + Register Src2Lo = Unmerge.getReg(0); + Register Src2Hi = Unmerge.getReg(1); + MRI.setRegBank(Src2Lo, DstBank); + MRI.setRegBank(Src2Hi, DstBank); + + if (!IsUnsigned) { + auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); + MRI.setRegBank(Src2Sign.getReg(0), CarryBank); + + Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + + auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); + DstLo = AddLo.getReg(0); + Register CarryLo = AddLo.getReg(1); + MRI.setRegBank(DstLo, DstBank); + MRI.setRegBank(CarryLo, CarryBank); + + auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); + DstHi = AddHi.getReg(0); + MRI.setRegBank(DstHi, DstBank); + + Register CarryHi = AddHi.getReg(1); + MRI.setRegBank(CarryHi, CarryBank); + + if (IsUnsigned) { + Carry = CarryHi; + } else { + Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + } else { + if (IsUnsigned) { + Carry = B.buildConstant(CarryType, 0).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + } + + B.buildMerge(Dst0, {DstLo, DstHi}); + + if (DstOnValu) { + B.buildCopy(Dst1, Carry); + } else { + B.buildTrunc(Dst1, Carry); + } + + MI.eraseFromParent(); + return true; +} + // Return a suitable opcode for extending the operands of Opc when widening. static unsigned getExtendOp(unsigned Opc) { switch (Opc) { @@ -1794,7 +1876,7 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, } /// Utility function for pushing dynamic vector indexes with a constant offset -/// into waterwall loops. +/// into waterfall loops. static void reinsertVectorIndexAdd(MachineIRBuilder &B, MachineInstr &IdxUseInstr, unsigned OpIdx, @@ -1857,7 +1939,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( unsigned NumElem = VecTy.getNumElements(); if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - IsDivergentIdx)) + IsDivergentIdx, &Subtarget)) return false; MachineIRBuilder B(MI); @@ -1955,7 +2037,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( unsigned NumElem = VecTy.getNumElements(); if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - IsDivergentIdx)) + IsDivergentIdx, &Subtarget)) return false; MachineIRBuilder B(MI); @@ -2926,7 +3008,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { applyDefaultMapping(OpdMapper); // Readlane for m0 value, which is always the last operand. 
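For context on the readfirstlane plumbing in these hunks: the new buildReadFirstLane helper scalarizes a VGPR value in 32-bit pieces (unmerge, one V_READFIRSTLANE_B32 per piece, merge), and constrainOpWithReadfirstlane above is now a thin wrapper over it. A rough usage sketch, assuming a MachineIRBuilder B and MachineRegisterInfo MRI in scope at a mapping-apply site (illustrative, not a verbatim excerpt):

  // Force one operand of MI into an SGPR by reading it from the first lane.
  Register Vgpr = MI.getOperand(OpIdx).getReg();    // value may live in a VGPR
  Register Sgpr = buildReadFirstLane(B, MRI, Vgpr); // unmerge + readfirstlane + merge
  MI.getOperand(OpIdx).setReg(Sgpr);                // operand is now wave-uniform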
@@ -2934,6 +3017,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index return; } + case Intrinsic::amdgcn_interp_inreg_p10: + case Intrinsic::amdgcn_interp_inreg_p2: + case Intrinsic::amdgcn_interp_inreg_p10_f16: + case Intrinsic::amdgcn_interp_inreg_p2_f16: + applyDefaultMapping(OpdMapper); + return; case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: { // Doing a waterfall loop over these wouldn't make any sense. @@ -3015,6 +3104,35 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, 2); return; } + case Intrinsic::amdgcn_raw_buffer_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(MI, MRI, 5); // soffset + return; + } + case Intrinsic::amdgcn_struct_buffer_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(MI, MRI, 6); // soffset + return; + } + case Intrinsic::amdgcn_global_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 2); + return; + } + case Intrinsic::amdgcn_lds_direct_load: { + applyDefaultMapping(OpdMapper); + // Readlane for m0 value, which is always the last operand. + constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + return; + } + case Intrinsic::amdgcn_exp_row: + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 8); // M0 + return; default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -3143,6 +3261,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_UBFX: applyMappingBFE(OpdMapper, /*Signed*/ false); return; + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: + applyMappingMAD_64_32(OpdMapper); + return; default: break; } @@ -3668,6 +3790,48 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); } + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: { + // Three possible mappings: + // + // - Default SOP + // - Default VOP + // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. + // + // This allows instruction selection to keep the multiplication part of the + // instruction on the SALU. + bool AllSalu = true; + bool MulSalu = true; + for (unsigned i = 0; i < 5; ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { + if (Bank->getID() != AMDGPU::SGPRRegBankID) { + AllSalu = false; + if (i == 2 || i == 3) { + MulSalu = false; + break; + } + } + } + } + + if (AllSalu) + return getDefaultMappingSOP(MI); + + // If the multiply-add is full-rate in VALU, use that even if the + // multiplication part is scalar. Accumulating separately on the VALU would + // take two instructions. + if (!MulSalu || Subtarget.hasFullRate64Ops()) + return getDefaultMappingVOP(MI); + + // Keep the multiplication on the SALU, then accumulate on the VALU. 
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + break; + } case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -3828,10 +3992,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FCMP: { unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); OpdsMapping[1] = nullptr; // Predicate Operand. - OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); break; } @@ -4102,6 +4265,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: + case Intrinsic::amdgcn_fdot2_bf16_bf16: + case Intrinsic::amdgcn_fdot2_f16_f16: + case Intrinsic::amdgcn_fdot2_f32_bf16: + case Intrinsic::amdgcn_sudot4: + case Intrinsic::amdgcn_sudot8: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_ubfe: @@ -4120,6 +4294,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_permlane64: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_kernarg_segment_ptr: case Intrinsic::amdgcn_s_getpc: @@ -4247,24 +4422,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: case Intrinsic::amdgcn_mfma_f64_16x16x4f64: - case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { + case Intrinsic::amdgcn_mfma_f64_4x4x4f64: + case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: + case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: + case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: + case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src // for srcA/srcB? // // vdst, srcA, srcB, srcC + const SIMachineFunctionInfo *Info = MF.getInfo(); + OpdsMapping[0] = + Info->mayNeedAGPRs() + ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = + Info->mayNeedAGPRs() + ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: { + // vdst, srcA, srcB, srcC, idx OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { const int M0Idx = MI.getNumOperands() - 1; Register M0Reg = MI.getOperand(M0Idx).getReg(); unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); @@ -4279,6 +4480,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); break; } + case Intrinsic::amdgcn_interp_inreg_p10: + case Intrinsic::amdgcn_interp_inreg_p2: + case Intrinsic::amdgcn_interp_inreg_p10_f16: + case Intrinsic::amdgcn_interp_inreg_p2_f16: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + break; + } case Intrinsic::amdgcn_ballot: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); @@ -4314,8 +4526,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } else { // NSA form - for (unsigned I = 2; I < N; ++I) - OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + for (unsigned I = 2; I < N; ++I) { + unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } } break; } @@ -4325,7 +4539,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: case Intrinsic::amdgcn_s_memrealtime: - case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { + case Intrinsic::amdgcn_s_get_waveid_in_workgroup: + case Intrinsic::amdgcn_s_sendmsg_rtn: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -4337,6 +4552,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: 
case Intrinsic::amdgcn_ds_ordered_swap: { @@ -4366,6 +4583,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; + case Intrinsic::amdgcn_exp_row: + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); + break; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. @@ -4412,6 +4636,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_raw_buffer_load_lds: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: case Intrinsic::amdgcn_raw_tbuffer_store: { @@ -4430,6 +4661,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_struct_buffer_load_lds: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); @@ -4464,6 +4703,31 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_load_lds: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_lds_direct_load: { + const int M0Idx = MI.getNumOperands() - 1; + Register M0Reg = MI.getOperand(M0Idx).getReg(); + unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // Must be SGPR, but we must take whatever the original bank is and fix it + // later. 
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } + case Intrinsic::amdgcn_ds_add_gs_reg_rtn: + case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); } @@ -4568,6 +4832,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); break; } + case AMDGPU::G_FPTRUNC_ROUND_UPWARD: + case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: + return getDefaultMappingVOP(MI); } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 2b9d0923ab49..c9741c2202e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "AMDGPUGenRegisterBank.inc" @@ -59,6 +59,9 @@ public: SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const; + Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Src) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, @@ -83,6 +86,8 @@ public: bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const; + bool applyMappingMAD_64_32(const OperandsMapper &OpdMapper) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp new file mode 100644 index 000000000000..a86871a4a653 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp @@ -0,0 +1,140 @@ +//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert S_SENDMSG instructions to release vgprs on GFX11+. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineOperand.h" +using namespace llvm; + +#define DEBUG_TYPE "release-vgprs" + +namespace { + +class AMDGPUReleaseVGPRs : public MachineFunctionPass { +public: + static char ID; + + const SIInstrInfo *SII; + const SIRegisterInfo *TRI; + + AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + // Used to cache the result of isLastInstructionVMEMStore for each block + using BlockVMEMStoreType = DenseMap<MachineBasicBlock *, bool>; + BlockVMEMStoreType BlockVMEMStore; + + // Return true if the last instruction referencing a vgpr in this MBB + // is a VMEM store, otherwise return false.
// Visit previous basic blocks to find this last instruction if needed. + // Because this pass is late in the pipeline, it is expected that the + // last vgpr use will likely be one of vmem store, ds, exp. + // Loads and other vgpr operations would have been + // deleted by this point, except for complex control flow involving loops. + // This is why we are just testing the type of instructions rather + // than the operands. + bool isLastVGPRUseVMEMStore(MachineBasicBlock &MBB) { + // Use the cache to break infinite loop and save some time. Initialize to + // false in case we have a cycle. + BlockVMEMStoreType::iterator It; + bool Inserted; + std::tie(It, Inserted) = BlockVMEMStore.insert({&MBB, false}); + bool &CacheEntry = It->second; + if (!Inserted) + return CacheEntry; + + for (auto &MI : reverse(MBB.instrs())) { + // If it's a VMEM store, a vgpr will be used, return true. + if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) && MI.mayStore()) + return CacheEntry = true; + + // If it's referencing a VGPR but is not a VMEM store, return false. + if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) || + SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) || + SIInstrInfo::isVALU(MI)) + return CacheEntry = false; + } + + // Recursive call into parent blocks. Look into predecessors if there is no + // vgpr used in this block. + return CacheEntry = llvm::any_of(MBB.predecessors(), + [this](MachineBasicBlock *Parent) { + return isLastVGPRUseVMEMStore(*Parent); + }); + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB) { + + bool Changed = false; + + for (auto &MI : MBB.terminators()) { + // Look for S_ENDPGM instructions + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + // If the last instruction using a VGPR in the block is a VMEM store, + // release VGPRs.
The VGPR release will be placed just before ending // the program + if (isLastVGPRUseVMEMStore(MBB)) { + BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + Changed = true; + } + } + } + + return Changed; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + Function &F = MF.getFunction(); + if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + // This pass only runs on GFX11+ + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.getGeneration() < AMDGPUSubtarget::GFX11) + return false; + + LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName() + << "\n"); + + SII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + + bool Changed = false; + for (auto &MBB : MF) { + Changed |= runOnMachineBasicBlock(MBB); + } + + BlockVMEMStore.clear(); + + return Changed; + } +}; + +} // namespace + +char AMDGPUReleaseVGPRs::ID = 0; + +char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID; + +INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index 2475b44b42a3..4d7a3f4028e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -83,7 +83,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPULDSUtils.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -442,7 +442,7 @@ class CollectReachableCallees { continue; for (const auto &GI : *CGN) { - auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCB = cast<CallBase>(*GI.first); auto *RCGN = GI.second; if (auto *DCallee = RCGN->getFunction()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index cb511e5e3483..f7f93c75c870 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -27,7 +27,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -87,9 +89,7 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs( int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const { - if (ST.hasGFX90AInsts() && ArgNumAGPR) - return alignTo(ArgNumVGPR, 4) + ArgNumAGPR; - return std::max(ArgNumVGPR, ArgNumAGPR); + return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR); } int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( @@ -97,28 +97,31 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( return getTotalNumVGPRs(ST, NumAGPR, NumVGPR); } -bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) { +bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC) return false; + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); const TargetMachine &TM = TPC->getTM<TargetMachine>(); bool HasIndirectCall = false; - for (CallGraphNode *I : SCC) { -
Function *F = I->getFunction(); + CallGraph CG = CallGraph(M); + auto End = po_end(&CG); + + for (auto IT = po_begin(&CG); IT != End; ++IT) { + Function *F = IT->getFunction(); if (!F || F->isDeclaration()) continue; - MachineModuleInfo &MMI = - getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + MachineFunction *MF = MMI.getMachineFunction(*F); + assert(MF && "function must have been generated already"); auto CI = CallGraphResourceInfo.insert( - std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); + std::make_pair(F, SIFunctionResourceInfo())); SIFunctionResourceInfo &Info = CI.first->second; assert(CI.second && "should only be called once per function"); - Info = analyzeResourceUsage(MF, TM); + Info = analyzeResourceUsage(*MF, TM); HasIndirectCall |= Info.HasIndirectCall; } @@ -246,6 +249,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: case AMDGPU::MODE: continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h index b0a2d3bffc62..df0789e471c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -24,7 +24,7 @@ class GCNSubtarget; class MachineFunction; class TargetMachine; -struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass { +struct AMDGPUResourceUsageAnalysis : public ModulePass { static char ID; public: @@ -50,15 +50,15 @@ public: int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; - AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {} + AMDGPUResourceUsageAnalysis() : ModulePass(ID) {} - bool runOnSCC(CallGraphSCC &SCC) override; - - bool doInitialization(CallGraph &CG) override { + bool doInitialization(Module &M) override { CallGraphResourceInfo.clear(); - return CallGraphSCCPass::doInitialization(CG); + return ModulePass::doInitialization(M); } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineModuleInfoWrapperPass>(); AU.setPreservesAll(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 1c6c63dd5b25..4f8a61a77097 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -83,12 +83,8 @@ private: const DataLayout *DL = nullptr; MemoryDependenceResults *MDA = nullptr; - bool checkArgumentUses(Value &Arg) const; - bool isOutArgumentCandidate(Argument &Arg) const; - -#ifndef NDEBUG - bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const; -#endif + Type *getStoredType(Value &Arg) const; + Type *getOutArgumentType(Argument &Arg) const; public: static char ID; @@ -114,72 +110,61 @@ INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE, char AMDGPURewriteOutArguments::ID = 0; -bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const { +Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const { const int MaxUses = 10; int UseCount = 0; - for (Use &U : Arg.uses()) { - StoreInst *SI = dyn_cast<StoreInst>(U.getUser()); - if (UseCount > MaxUses) - return false; + SmallVector<Use *> Worklist; + for (Use &U : Arg.uses()) + Worklist.push_back(&U); - if (!SI) { - auto *BCI = dyn_cast<BitCastInst>(U.getUser()); - if (!BCI || !BCI->hasOneUse()) - return false; - - // We don't handle multiple stores currently, so stores to aggregate - // pointers aren't worth the trouble since they are
canonically split up. - Type *DestEltTy = BCI->getType()->getPointerElementType(); - if (DestEltTy->isAggregateType()) - return false; - - // We could handle these if we had a convenient way to bitcast between - // them. - Type *SrcEltTy = Arg.getType()->getPointerElementType(); - if (SrcEltTy->isArrayTy()) - return false; - - // Special case handle structs with single members. It is useful to handle - // some casts between structs and non-structs, but we can't bitcast - // directly between them. Blender uses some casts that look like - // { <3 x float> }* to <4 x float>* - if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1))) - return false; - - // Clang emits OpenCL 3-vector type accesses with a bitcast to the - // equivalent 4-element vector and accesses that, and we're looking for - // this pointer cast. - if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy)) - return false; - - return checkArgumentUses(*BCI); + Type *StoredType = nullptr; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + + if (auto *BCI = dyn_cast<BitCastInst>(U->getUser())) { + for (Use &U : BCI->uses()) + Worklist.push_back(&U); + continue; } - if (!SI->isSimple() || - U.getOperandNo() != StoreInst::getPointerOperandIndex()) - return false; + if (auto *SI = dyn_cast<StoreInst>(U->getUser())) { + if (UseCount++ > MaxUses) + return nullptr; + + if (!SI->isSimple() || + U->getOperandNo() != StoreInst::getPointerOperandIndex()) + return nullptr; - ++UseCount; + if (StoredType && StoredType != SI->getValueOperand()->getType()) + return nullptr; // More than one type. + StoredType = SI->getValueOperand()->getType(); + continue; + } + + // Unsupported user. + return nullptr; } - // Skip unused arguments. - return UseCount > 0; + return StoredType; } -bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const { +Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const { const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs; PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType()); // TODO: It might be useful for any out arguments, not just privates.
if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() && !AnyAddressSpace) || - Arg.hasByValAttr() || Arg.hasStructRetAttr() || - DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) { - return false; + Arg.hasByValAttr() || Arg.hasStructRetAttr()) { + return nullptr; } - return checkArgumentUses(Arg); + Type *StoredType = getStoredType(Arg); + if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes) + return nullptr; + + return StoredType; } bool AMDGPURewriteOutArguments::doInitialization(Module &M) { @@ -187,22 +172,6 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) { return false; } -#ifndef NDEBUG -bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const { - auto *VT0 = dyn_cast<FixedVectorType>(Ty0); - auto *VT1 = dyn_cast<FixedVectorType>(Ty1); - if (!VT0 || !VT1) - return false; - - if (VT0->getNumElements() != 3 || - VT1->getNumElements() != 4) - return false; - - return DL->getTypeSizeInBits(VT0->getElementType()) == - DL->getTypeSizeInBits(VT1->getElementType()); -} -#endif - bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -215,7 +184,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); unsigned ReturnNumRegs = 0; - SmallSet<unsigned, 4> OutArgIndexes; + SmallDenseMap<unsigned, Type *, 4> OutArgIndexes; SmallVector<Type *, 4> ReturnTypes; Type *RetTy = F.getReturnType(); if (!RetTy->isVoidTy()) { @@ -227,12 +196,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { ReturnTypes.push_back(RetTy); } - SmallVector<Argument *, 4> OutArgs; + SmallVector<std::pair<Argument *, Type *>, 4> OutArgs; for (Argument &Arg : F.args()) { - if (isOutArgumentCandidate(Arg)) { + if (Type *Ty = getOutArgumentType(Arg)) { LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg << " in function " << F.getName() << '\n'); - OutArgs.push_back(&Arg); + OutArgs.push_back({&Arg, Ty}); } } @@ -264,11 +233,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { // first. On the second iteration we've removed that out clobbering argument // (by effectively moving it into another function) and will find the second // argument is OK to move. - for (Argument *OutArg : OutArgs) { + for (const auto &Pair : OutArgs) { bool ThisReplaceable = true; SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores; - Type *ArgTy = OutArg->getType()->getPointerElementType(); + Argument *OutArg = Pair.first; + Type *ArgTy = Pair.second; // Skip this argument if converting it will push us over the register // count to return limit.
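Taken together, getStoredType and getOutArgumentType decide whether a pointer argument can be turned into an extra return value. A source-level illustration of the rewrite this pass performs, with hypothetical names (the actual transformation happens on LLVM IR and packs the original return value plus each rewritten argument into an aggregate return):

// Before: the callee publishes a result through a private out pointer.
int calleeBefore(int X, float *Out) {
  *Out = X * 2.0f;
  return X + 1;
}

// After, conceptually: the store becomes an extra member of an aggregate
// return, so no scratch pointer has to cross the call boundary.
struct CalleeRet { int R; float Out; };
CalleeRet calleeAfter(int X) {
  return {X + 1, X * 2.0f};
}

The MaxOutArgSizeBytes and return-register checks in the surrounding hunks exist because every rewritten argument consumes part of the limited return-register budget.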
@@ -324,7 +294,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (ThisReplaceable) { ReturnTypes.push_back(ArgTy); - OutArgIndexes.insert(OutArg->getArgNo()); + OutArgIndexes.insert({OutArg->getArgNo(), ArgTy}); ++NumOutArgumentsReplaced; Changing = true; } @@ -376,32 +346,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (RetVal) NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++); - for (std::pair ReturnPoint : Replacement.second) { - Argument *Arg = ReturnPoint.first; - Value *Val = ReturnPoint.second; - Type *EltTy = Arg->getType()->getPointerElementType(); - if (Val->getType() != EltTy) { - Type *EffectiveEltTy = EltTy; - if (StructType *CT = dyn_cast(EltTy)) { - assert(CT->getNumElements() == 1); - EffectiveEltTy = CT->getElementType(0); - } - - if (DL->getTypeSizeInBits(EffectiveEltTy) != - DL->getTypeSizeInBits(Val->getType())) { - assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); - Val = B.CreateShuffleVector(Val, ArrayRef{0, 1, 2}); - } - - Val = B.CreateBitCast(Val, EffectiveEltTy); - - // Re-create single element composite. - if (EltTy != EffectiveEltTy) - Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0); - } - - NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++); - } + for (std::pair ReturnPoint : Replacement.second) + NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++); if (RetVal) RI->setOperand(0, NewRetVal); @@ -433,7 +379,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { PointerType *ArgType = cast(Arg.getType()); - auto *EltTy = ArgType->getPointerElementType(); + Type *EltTy = OutArgIndexes[Arg.getArgNo()]; const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index afe016731395..8297635d7bb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -39,7 +39,8 @@ class GcnBufferFormatBase f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bi } class Gfx9BufferFormat f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase; -class Gfx10PlusBufferFormat f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase; +class Gfx10BufferFormat f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase; +class Gfx11PlusBufferFormat f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase; class GcnBufferFormatTable : GenericTable { let CppTypeName = "GcnBufferFormatInfo"; @@ -51,17 +52,25 @@ def Gfx9BufferFormat : GcnBufferFormatTable { let FilterClass = "Gfx9BufferFormat"; let PrimaryKeyName = "getGfx9BufferFormatInfo"; } -def Gfx10PlusBufferFormat : GcnBufferFormatTable { - let FilterClass = "Gfx10PlusBufferFormat"; - let PrimaryKeyName = "getGfx10PlusBufferFormatInfo"; +def Gfx10BufferFormat : GcnBufferFormatTable { + let FilterClass = "Gfx10BufferFormat"; + let PrimaryKeyName = "getGfx10BufferFormatInfo"; +} +def Gfx11PlusBufferFormat : GcnBufferFormatTable { + let FilterClass = "Gfx11PlusBufferFormat"; + let PrimaryKeyName = "getGfx11PlusBufferFormatInfo"; } def getGfx9BufferFormatInfo : SearchIndex { let Table = Gfx9BufferFormat; let Key = ["Format"]; } -def getGfx10PlusBufferFormatInfo : SearchIndex { - let Table = Gfx10PlusBufferFormat; +def getGfx10BufferFormatInfo : SearchIndex { + let Table = Gfx10BufferFormat; + let Key = ["Format"]; +} +def getGfx11PlusBufferFormatInfo : 
SearchIndex { + let Table = Gfx11PlusBufferFormat; let Key = ["Format"]; } @@ -119,57 +128,87 @@ def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x5E, 32, 4, /*NUM_FORMA def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x7E, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; // Buffer formats with equal component sizes (GFX10 and later) -def : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, 
/*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; +multiclass Gfx10PlusBufferFormat f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> { + def : Gfx10BufferFormat; + def : Gfx11PlusBufferFormat; +} +defm : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, 
/*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>; + +// Buffer formats with equal component sizes (GFX10 only) +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< 
/*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; + +// Buffer formats with equal component sizes (GFX11 and later) +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x2A, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x2B, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x2C, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x2D, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x2E, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x2F, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x30, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x31, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x32, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x33, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x34, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x35, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x36, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x37, 16, 4, /*NUM_FORMAT_UINT*/ 
4, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x38, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x39, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x3A, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x3B, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x3C, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x3D, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x3E, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x3F, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; class SourceOfDivergence { Intrinsic Intr = intr; @@ -191,6 +230,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -205,9 +246,12 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -292,6 +336,16 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp new file mode 100644 index 000000000000..34702ee6623b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -0,0 +1,166 @@ +//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to temporarily raise the wave priority beginning the start of +/// the shader function until its last VMEM instructions to allow younger +/// waves to issue their VMEM instructions as well. 
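The new pass added below decides where to drop the priority again by first marking every block from which a VMEM load is still reachable. A minimal standalone sketch of that backward fixed-point computation, with assumed names (the pass itself runs over MachineBasicBlocks and keeps the flag in an MBBInfoSet):

#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Preds; // predecessor blocks in the CFG
  bool HasVMEMLoad = false;   // does this block itself contain a VMEM load?
};

// Returns the set of blocks from which control may still reach a VMEM load.
std::unordered_set<const Block *>
computeMayReachVMEMLoad(const std::vector<Block *> &Blocks) {
  std::unordered_set<const Block *> MayReach;
  std::vector<const Block *> Worklist;
  // Seed with every block that contains a VMEM load itself.
  for (const Block *B : Blocks)
    if (B->HasVMEMLoad)
      Worklist.push_back(B);
  while (!Worklist.empty()) {
    const Block *B = Worklist.back();
    Worklist.pop_back();
    // insert() returns false for an already-marked block, which is what
    // guarantees termination on cyclic control flow.
    if (!MayReach.insert(B).second)
      continue;
    for (const Block *P : B->Preds)
      Worklist.push_back(P);
  }
  return MayReach;
}

The priority is then raised once at the entry block and lowered exactly on the frontier where control leaves this set, which is what the PriorityLoweringBlocks logic in the pass computes.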
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Allocator.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-set-wave-priority" + +namespace { + +struct MBBInfo { + MBBInfo() = default; + bool MayReachVMEMLoad = false; +}; + +using MBBInfoSet = DenseMap; + +class AMDGPUSetWavePriority : public MachineFunctionPass { +public: + static char ID; + + AMDGPUSetWavePriority() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "Set wave priority"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const; + + const SIInstrInfo *TII; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false, + false) + +char AMDGPUSetWavePriority::ID = 0; + +FunctionPass *llvm::createAMDGPUSetWavePriorityPass() { + return new AMDGPUSetWavePriority(); +} + +MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF, + unsigned priority) const { + return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority); +} + +// Checks that for every predecessor Pred that can reach a VMEM load, +// none of Pred's successors can reach a VMEM load. +static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB, + MBBInfoSet &MBBInfos) { + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + if (!MBBInfos[Pred].MayReachVMEMLoad) + continue; + for (const MachineBasicBlock *Succ : Pred->successors()) { + if (MBBInfos[Succ].MayReachVMEMLoad) + return false; + } + } + return true; +} + +static bool isVMEMLoad(const MachineInstr &MI) { + return SIInstrInfo::isVMEM(MI) && MI.mayLoad(); +} + +bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) { + const unsigned HighPriority = 3; + const unsigned LowPriority = 0; + + Function &F = MF.getFunction(); + if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + + MBBInfoSet MBBInfos; + SmallVector Worklist; + for (MachineBasicBlock &MBB : MF) { + if (any_of(MBB, isVMEMLoad)) + Worklist.push_back(&MBB); + } + + // Mark blocks from which control may reach VMEM loads. + while (!Worklist.empty()) { + const MachineBasicBlock *MBB = Worklist.pop_back_val(); + MBBInfo &Info = MBBInfos[MBB]; + if (!Info.MayReachVMEMLoad) { + Info.MayReachVMEMLoad = true; + Worklist.append(MBB->pred_begin(), MBB->pred_end()); + } + } + + MachineBasicBlock &Entry = MF.front(); + if (!MBBInfos[&Entry].MayReachVMEMLoad) + return false; + + // Raise the priority at the beginning of the shader. + MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end(); + while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator()) + ++I; + Entry.insert(I, BuildSetprioMI(MF, HighPriority)); + + // Lower the priority on edges where control leaves blocks from which + // VMEM loads are reachable. 
+ SmallSet PriorityLoweringBlocks; + for (MachineBasicBlock &MBB : MF) { + if (MBBInfos[&MBB].MayReachVMEMLoad) { + if (MBB.succ_empty()) + PriorityLoweringBlocks.insert(&MBB); + continue; + } + + if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + if (MBBInfos[Pred].MayReachVMEMLoad) + PriorityLoweringBlocks.insert(Pred); + } + continue; + } + + // Where lowering the priority in predecessors is not possible, the + // block receiving control either was not part of a loop in the first + // place or the loop simplification/canonicalization pass should have + // already tried to split the edge and insert a preheader, and if for + // whatever reason it failed to do so, then this leaves us with the + // only option of lowering the priority within the loop. + PriorityLoweringBlocks.insert(&MBB); + } + + for (MachineBasicBlock *MBB : PriorityLoweringBlocks) { + MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin(); + while (I != B) { + if (isVMEMLoad(*--I)) { + ++I; + break; + } + } + MBB->insert(I, BuildSetprioMI(MF, LowPriority)); + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index e82f9232b114..77816a783630 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -50,11 +50,6 @@ static cl::opt EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); -static cl::opt EnableFlatScratch( - "amdgpu-enable-flat-scratch", - cl::desc("Use flat scratch instructions"), - cl::init(false)); - static cl::opt UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true)); @@ -159,26 +154,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, return *this; } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : - TargetTriple(TT), - GCN3Encoding(false), - Has16BitInsts(false), - HasMadMixInsts(false), - HasMadMacF32Insts(false), - HasDsSrc2Insts(false), - HasSDWA(false), - HasVOP3PInsts(false), - HasMulI24(true), - HasMulU24(true), - HasSMulHi(false), - HasInv2PiInlineImm(false), - HasFminFmaxLegacy(true), - EnablePromoteAlloca(false), - HasTrigReducedRange(false), - MaxWavesPerEU(10), - LocalMemorySize(0), - WavefrontSizeLog2(0) - { } +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {} GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM) @@ -187,120 +163,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUSubtarget(TT), TargetTriple(TT), TargetID(*this), - Gen(INVALID), InstrItins(getInstrItineraryForCPU(GPU)), - LDSBankCount(0), - MaxPrivateElementSize(0), - - FastFMAF32(false), - FastDenormalF32(false), - HalfRate64Ops(false), - FullRate64Ops(false), - - FlatForGlobal(false), - AutoWaitcntBeforeBarrier(false), - UnalignedScratchAccess(false), - UnalignedAccessMode(false), - - HasApertureRegs(false), - SupportsXNACK(false), - EnableXNACK(false), - EnableTgSplit(false), - EnableCuMode(false), - TrapHandler(false), - - EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), - EnableSIScheduler(false), - EnableDS128(false), - EnablePRTStrictNull(false), - DumpCode(false), - - FP64(false), - CIInsts(false), - GFX8Insts(false), - GFX9Insts(false), - GFX90AInsts(false), - GFX10Insts(false), - GFX10_3Insts(false), - GFX7GFX8GFX9Insts(false), - SGPRInitBug(false), - NegativeScratchOffsetBug(false), - 
NegativeUnalignedScratchOffsetBug(false), - HasSMemRealTime(false), - HasIntClamp(false), - HasFmaMixInsts(false), - HasMovrel(false), - HasVGPRIndexMode(false), - HasScalarStores(false), - HasScalarAtomics(false), - HasSDWAOmod(false), - HasSDWAScalar(false), - HasSDWASdst(false), - HasSDWAMac(false), - HasSDWAOutModsVOPC(false), - HasDPP(false), - HasDPP8(false), - Has64BitDPP(false), - HasPackedFP32Ops(false), - HasExtendedImageInsts(false), - HasR128A16(false), - HasGFX10A16(false), - HasG16(false), - HasNSAEncoding(false), - NSAMaxSize(0), - GFX10_AEncoding(false), - GFX10_BEncoding(false), - HasDLInsts(false), - HasDot1Insts(false), - HasDot2Insts(false), - HasDot3Insts(false), - HasDot4Insts(false), - HasDot5Insts(false), - HasDot6Insts(false), - HasDot7Insts(false), - HasMAIInsts(false), - HasPkFmacF16Inst(false), - HasAtomicFaddInsts(false), - SupportsSRAMECC(false), - EnableSRAMECC(false), - HasNoSdstCMPX(false), - HasVscnt(false), - HasGetWaveIdInst(false), - HasSMemTimeInst(false), - HasShaderCyclesRegister(false), - HasVOP3Literal(false), - HasNoDataDepHazard(false), - FlatAddressSpace(false), - FlatInstOffsets(false), - FlatGlobalInsts(false), - FlatScratchInsts(false), - ScalarFlatScratchInsts(false), - HasArchitectedFlatScratch(false), - AddNoCarryInsts(false), - HasUnpackedD16VMem(false), - LDSMisalignedBug(false), - HasMFMAInlineLiteralBug(false), - UnalignedBufferAccess(false), - UnalignedDSAccess(false), - HasPackedTID(false), - - ScalarizeGlobal(false), - - HasVcmpxPermlaneHazard(false), - HasVMEMtoScalarWriteHazard(false), - HasSMEMtoVectorWriteHazard(false), - HasInstFwdPrefetchBug(false), - HasVcmpxExecWARHazard(false), - HasLdsBranchVmemWARHazard(false), - HasNSAtoVMEMBug(false), - HasNSAClauseBug(false), - HasOffset3fBug(false), - HasFlatSegmentOffsetBug(false), - HasImageStoreD16Bug(false), - HasImageGather4D16Bug(false), - - FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { @@ -314,11 +177,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, *this, *static_cast(RegBankInfo.get()), TM)); } -bool GCNSubtarget::enableFlatScratch() const { - return flatScratchIsArchitected() || - (EnableFlatScratch && hasFlatScratchInsts()); -} - unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { if (getGeneration() < GFX10) return 1; @@ -326,12 +184,15 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { switch (Opcode) { case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_e64_gfx11: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_e64_gfx11: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: case AMDGPU::V_ASHR_I64_e64: return 1; } @@ -658,7 +519,8 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { return 16; // Assume all implicit inputs are used by default - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); + unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 
256 : 56; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", NBytes); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, @@ -673,13 +535,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, for (const Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); - MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; - if (!Alignment) - Alignment = DL.getABITypeAlign(ArgTy); - + Align Alignment = DL.getValueOrABITypeAlignment( + IsByRef ? Arg.getParamAlign() : None, ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; - MaxAlign = max(MaxAlign, Alignment); + MaxAlign = std::max(MaxAlign, Alignment); } return ExplicitArgBytes; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7f1b94be4ffe..7400c81effd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -38,30 +38,32 @@ public: SEA_ISLANDS = 6, VOLCANIC_ISLANDS = 7, GFX9 = 8, - GFX10 = 9 + GFX10 = 9, + GFX11 = 10 }; private: Triple TargetTriple; protected: - bool GCN3Encoding; - bool Has16BitInsts; - bool HasMadMixInsts; - bool HasMadMacF32Insts; - bool HasDsSrc2Insts; - bool HasSDWA; - bool HasVOP3PInsts; - bool HasMulI24; - bool HasMulU24; - bool HasSMulHi; - bool HasInv2PiInlineImm; - bool HasFminFmaxLegacy; - bool EnablePromoteAlloca; - bool HasTrigReducedRange; - unsigned MaxWavesPerEU; - unsigned LocalMemorySize; - char WavefrontSizeLog2; + bool GCN3Encoding = false; + bool Has16BitInsts = false; + bool HasTrue16BitInsts = false; + bool HasMadMixInsts = false; + bool HasMadMacF32Insts = false; + bool HasDsSrc2Insts = false; + bool HasSDWA = false; + bool HasVOP3PInsts = false; + bool HasMulI24 = true; + bool HasMulU24 = true; + bool HasSMulHi = false; + bool HasInv2PiInlineImm = false; + bool HasFminFmaxLegacy = true; + bool EnablePromoteAlloca = false; + bool HasTrigReducedRange = false; + unsigned MaxWavesPerEU = 10; + unsigned LocalMemorySize = 0; + char WavefrontSizeLog2 = 0; public: AMDGPUSubtarget(const Triple &TT); @@ -145,6 +147,8 @@ public: return Has16BitInsts; } + bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } + bool hasMadMixInsts() const { return HasMadMixInsts; } @@ -267,7 +271,7 @@ public: /// \p WavefrontSize. 
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; - virtual ~AMDGPUSubtarget() {} + virtual ~AMDGPUSubtarget() = default; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a2c61f9da8da..1c6b9d35695a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUExportClustering.h" +#include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -27,6 +28,7 @@ #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -56,6 +58,7 @@ #include "llvm/Transforms/Vectorize.h" using namespace llvm; +using namespace llvm::PatternMatch; namespace { class SGPRRegisterRegAlloc : public RegisterRegAllocBase { @@ -269,12 +272,22 @@ static cl::opt EnableSIModeRegisterPass( cl::init(true), cl::Hidden); +// Enable GFX11+ s_delay_alu insertion +static cl::opt + EnableInsertDelayAlu("amdgpu-enable-delay-alu", + cl::desc("Enable s_delay_alu insertion"), + cl::init(true), cl::Hidden); + // Option is used in lit tests to prevent deadcoding of patterns inspected. static cl::opt EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc")); +static cl::opt EnableSetWavePriority("amdgpu-set-wave-priority", + cl::desc("Adjust wave priority"), + cl::init(false), cl::Hidden); + static cl::opt EnableScalarIRPasses( "amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), @@ -330,7 +343,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); initializeSILoadStoreOptimizerPass(*PR); - initializeAMDGPUFixFunctionBitcastsPass(*PR); initializeAMDGPUCtorDtorLoweringPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAttributorPass(*PR); @@ -357,6 +369,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); + initializeAMDGPUReleaseVGPRsPass(*PR); + initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIModeRegisterPass(*PR); @@ -390,9 +404,14 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -400,9 +419,12 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * 
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget(); auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -413,9 +435,12 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget(); auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); return DAG; } @@ -801,6 +826,23 @@ AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { return std::make_pair(nullptr, -1); } +unsigned +AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { + switch (Kind) { + case PseudoSourceValue::Stack: + case PseudoSourceValue::FixedStack: + return AMDGPUAS::PRIVATE_ADDRESS; + case PseudoSourceValue::ConstantPool: + case PseudoSourceValue::GOT: + case PseudoSourceValue::JumpTable: + case PseudoSourceValue::GlobalValueCallEntry: + case PseudoSourceValue::ExternalSymbolCallEntry: + case PseudoSourceValue::TargetCustom: + return AMDGPUAS::CONSTANT_ADDRESS; + } + return AMDGPUAS::FLAT_ADDRESS; +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -836,7 +878,7 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const { } TargetTransformInfo -GCNTargetMachine::getTargetTransformInfo(const Function &F) { +GCNTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(GCNTTIImpl(this, F)); } @@ -873,7 +915,11 @@ public: ScheduleDAGMI *DAG = createGenericSchedPostRA(C); const GCNSubtarget &ST = C->MF->getSubtarget(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); return DAG; } @@ -953,10 +999,6 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUPrintfRuntimeBinding()); addPass(createAMDGPUCtorDtorLoweringPass()); - // This must occur before inlining, as the inliner will not look through - // bitcast calls. - addPass(createAMDGPUFixFunctionBitcastsPass()); - // A call to propagate attributes pass in the backend in case opt was not run. addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); @@ -967,7 +1009,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAlwaysInlinerLegacyPass()); // We need to add the barrier noop pass, otherwise adding the function // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two + // one function at a time, which means if we have a module with two // functions, then we will generate code for the first function // without ever running any passes on the second. 
addPass(createBarrierNoopPass()); @@ -1079,8 +1121,11 @@ bool AMDGPUPassConfig::addGCPasses() { llvm::ScheduleDAGInstrs * AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { + const GCNSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -1363,6 +1408,8 @@ void GCNPassConfig::addPreEmitPass() { addPass(&SIInsertHardClausesID); addPass(&SILateBranchLoweringPassID); + if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) + addPass(createAMDGPUSetWavePriorityPass()); if (getOptLevel() > CodeGenOpt::None) addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not @@ -1374,6 +1421,13 @@ void GCNPassConfig::addPreEmitPass() { // Here we add a stand-alone hazard recognizer pass which can handle all // cases. addPass(&PostRAHazardRecognizerID); + + if (getOptLevel() > CodeGenOpt::Less) + addPass(&AMDGPUReleaseVGPRsID); + + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) + addPass(&AMDGPUInsertDelayAluID); + addPass(&BranchRelaxationPassID); } @@ -1396,7 +1450,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const { const yaml::SIMachineFunctionInfo &YamlMFI = - reinterpret_cast(MFI_); + static_cast(MFI_); MachineFunction &MF = PFS.MF; SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -1420,6 +1474,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return false; }; + auto parseOptionalRegister = [&](const yaml::StringValue &RegName, + Register &RegVal) { + return !RegName.Value.empty() && parseRegister(RegName, RegVal); + }; + + if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) + return true; + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for a the register string literal. const MemoryBuffer &Buffer = @@ -1452,6 +1514,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); } + for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { + Register ParsedReg; + if (parseRegister(YamlReg, ParsedReg)) + return true; + + MFI->reserveWWMRegister(ParsedReg); + } + auto parseAndCheckArgument = [&](const Optional &A, const TargetRegisterClass &RC, ArgDescriptor &Arg, unsigned UserSGPRs, @@ -1473,7 +1543,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( Arg = ArgDescriptor::createStack(A->StackOffset); // Check and apply the optional mask. if (A->Mask) - Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); + Arg = ArgDescriptor::createArg(Arg, *A->Mask); MFI->NumUserSGPRs += UserSGPRs; MFI->NumSystemSGPRs += SystemSGPRs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index dd3676f3b707..567cc9d610d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. 
// //===----------------------------------------------------------------------===// @@ -64,6 +64,8 @@ public: std::pair getPredicatedAddrSpace(const Value *V) const override; + + unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override; }; //===----------------------------------------------------------------------===// @@ -84,7 +86,7 @@ public: const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool useIPRA() const override { return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a8df7789c8a1..a79cd2e9499e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -288,33 +288,21 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(static_cast(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), - IsGraphics(AMDGPU::isGraphics(F.getCallingConv())), - MaxVGPRs(ST->getMaxNumVGPRs( - std::max(ST->getWavesPerEU(F).first, - ST->getWavesPerEUForWorkGroup( - ST->getFlatWorkGroupSizes(F).second)))) { + IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) { AMDGPU::SIModeRegisterDefaults Mode(F); HasFP32Denormals = Mode.allFP32Denormals(); HasFP64FP16Denormals = Mode.allFP64FP16Denormals(); } -unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { - // The concept of vector registers doesn't really exist. Some packed vector - // operations operate on the normal 32-bit registers. - return MaxVGPRs; -} +unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { + // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector + // registers. See getRegisterClassForType for the implementation. + // In this case vector registers are not vector in terms of + // VGPRs, but those which can hold multiple values. -unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { // This is really the number of registers to fill when vectorizing / // interleaving loops, so we lie to avoid trying to use all registers. - return getHardwareNumberOfRegisters(Vec) >> 3; -} - -unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { - const SIRegisterInfo *TRI = ST->getRegisterInfo(); - const TargetRegisterClass *RC = TRI->getRegClass(RCID); - unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32; - return getHardwareNumberOfRegisters(false) / NumVGPRs; + return 4; } TypeSize @@ -410,11 +398,14 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, // unaligned access is legal? // // FIXME: This could use fine tuning and microbenchmarks. 
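The hunk below threads a new Optional AtomicElementSize parameter through getMemcpyLoopLoweringType: for element-wise atomic memcpy the loop must use an integer type of exactly that many bytes, while the ordinary path keeps choosing a wide type from the smaller of the two alignments. A simplified sketch of that selection rule (illustrative only; the real code returns llvm::Type values such as Type::getIntNTy(Context, *AtomicElementSize * 8)):

#include <cassert>
#include <cstdint>
#include <optional>

enum class CopyTy { I8, I16, I32, V4I32 };

CopyTy pickMemcpyLoopType(std::optional<uint32_t> AtomicElementSize,
                          uint32_t SrcAlign, uint32_t DestAlign) {
  if (AtomicElementSize) {
    // Atomic element copies must match the element width exactly,
    // mirroring Type::getIntNTy(Context, *AtomicElementSize * 8).
    switch (*AtomicElementSize) {
    case 1: return CopyTy::I8;
    case 2: return CopyTy::I16;
    case 4: return CopyTy::I32;
    default: assert(false && "unsupported atomic element size"); return CopyTy::I8;
    }
  }
  // Non-atomic case (simplified): use a 16-byte vector copy only when both
  // sides are at least dword aligned, otherwise fall back to byte copies.
  uint32_t MinAlign = SrcAlign < DestAlign ? SrcAlign : DestAlign;
  return MinAlign >= 4 ? CopyTy::V4I32 : CopyTy::I8;
}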
-Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, - unsigned SrcAddrSpace, - unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const { +Type *GCNTTIImpl::getMemcpyLoopLoweringType( + LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, + unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const { + + if (AtomicElementSize) + return Type::getIntNTy(Context, *AtomicElementSize * 8); + unsigned MinAlign = std::min(SrcAlign, DestAlign); // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the @@ -439,11 +430,17 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, } void GCNTTIImpl::getMemcpyLoopResidualLoweringType( - SmallVectorImpl &OpsOut, LLVMContext &Context, - unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const { + SmallVectorImpl &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const { assert(RemainingBytes < 16); + if (AtomicCpySize) + BaseT::getMemcpyLoopResidualLoweringType( + OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, + DestAlign, AtomicCpySize); + unsigned MinAlign = std::min(SrcAlign, DestAlign); if (MinAlign != 2) { @@ -1042,7 +1039,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef Mask, - int Index, VectorType *SubTp) { + int Index, VectorType *SubTp, + ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVOP3PInsts()) { if (cast(VT)->getNumElements() == 2 && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index e901b5c5747d..f2260c31e678 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -68,7 +68,6 @@ class GCNTTIImpl final : public BasicTTIImplBase { bool IsGraphics; bool HasFP32Denormals; bool HasFP64FP16Denormals; - unsigned MaxVGPRs; static const FeatureBitset InlineFeatureIgnoreList; @@ -113,8 +112,6 @@ public: return TTI::PSK_FastHardware; } - unsigned getHardwareNumberOfRegisters(bool Vector) const; - unsigned getNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(unsigned RCID) const; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; unsigned getMinVectorRegisterBitWidth() const; @@ -135,15 +132,14 @@ public: unsigned AddrSpace) const; Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const; - - void getMemcpyLoopResidualLoweringType(SmallVectorImpl &OpsOut, - LLVMContext &Context, - unsigned RemainingBytes, - unsigned SrcAddrSpace, - unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const; + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicElementSize) const; + + void getMemcpyLoopResidualLoweringType( + SmallVectorImpl &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional AtomicCpySize) const; unsigned getMaxInterleaveFactor(unsigned VF); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; @@ -201,7 +197,8 @@ public: InstructionCost 
getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef Args = None); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp deleted file mode 100644 index 1736c078eb83..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ /dev/null @@ -1,1638 +0,0 @@ -//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//==-----------------------------------------------------------------------===// - -#include "MCTargetDesc/R600MCTargetDesc.h" -#include "R600.h" -#include "R600RegisterInfo.h" -#include "R600Subtarget.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "structcfg" - -#define DEFAULT_VEC_SLOTS 8 - -// TODO: move-begin. - -//===----------------------------------------------------------------------===// -// -// Statistics for CFGStructurizer. -// -//===----------------------------------------------------------------------===// - -STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " - "matched"); -STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " - "matched"); -STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); -STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); - -namespace llvm { - -void initializeAMDGPUCFGStructurizerPass(PassRegistry &); - -} // end namespace llvm - -namespace { - -//===----------------------------------------------------------------------===// -// -// Miscellaneous utility for CFGStructurizer. 
-// -//===----------------------------------------------------------------------===// - -#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n"); - -#define SHOWNEWBLK(b, msg) \ - LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - dbgs() << "\n";); - -#define SHOWBLK_DETAIL(b, msg) \ - LLVM_DEBUG(if (b) { \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - b->print(dbgs()); \ - dbgs() << "\n"; \ - }); - -#define INVALIDSCCNUM -1 - -//===----------------------------------------------------------------------===// -// -// supporting data structure for CFGStructurizer -// -//===----------------------------------------------------------------------===// - -class BlockInformation { -public: - bool IsRetired = false; - int SccNum = INVALIDSCCNUM; - - BlockInformation() = default; -}; - -//===----------------------------------------------------------------------===// -// -// CFGStructurizer -// -//===----------------------------------------------------------------------===// - -class AMDGPUCFGStructurizer : public MachineFunctionPass { -public: - using MBBVector = SmallVector; - using MBBInfoMap = std::map; - using LoopLandInfoMap = std::map; - - enum PathToKind { - Not_SinglePath = 0, - SinglePath_InPath = 1, - SinglePath_NotInPath = 2 - }; - - static char ID; - - AMDGPUCFGStructurizer() : MachineFunctionPass(ID) { - initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { - return "AMDGPU Control Flow Graph structurizer Pass"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - /// Perform the CFG structurization - bool run(); - - /// Perform the CFG preparation - /// This step will remove every unconditionnal/dead jump instructions and make - /// sure all loops have an exit block - bool prepare(); - - bool runOnMachineFunction(MachineFunction &MF) override { - // FIXME: This pass causes verification failures. - MF.getProperties().set( - MachineFunctionProperties::Property::FailsVerification); - - TII = MF.getSubtarget().getInstrInfo(); - TRI = &TII->getRegisterInfo(); - LLVM_DEBUG(MF.dump();); - OrderedBlks.clear(); - Visited.clear(); - FuncRep = &MF; - MLI = &getAnalysis(); - LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis(); - LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); - PDT = &getAnalysis(); - LLVM_DEBUG(PDT->print(dbgs());); - prepare(); - run(); - LLVM_DEBUG(MF.dump();); - return true; - } - -protected: - MachineDominatorTree *MDT; - MachinePostDominatorTree *PDT; - MachineLoopInfo *MLI; - const R600InstrInfo *TII = nullptr; - const R600RegisterInfo *TRI = nullptr; - - // PRINT FUNCTIONS - /// Print the ordered Blocks. 
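The PathToKind values returned by singlePathTo (defined further down) are easiest to read off a small example: in a chain A -> B -> C, singlePathTo(A, C) is SinglePath_InPath; when the single-successor walk ends at a block with no successors without reaching the destination, the result is SinglePath_NotInPath; and it is Not_SinglePath as soon as a multi-successor block interrupts the chain or, with AllowSideEntry false, an intermediate block such as B has a second predecessor.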
- void printOrderedBlocks() const { - size_t i = 0; - for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), - iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { - dbgs() << "BB" << (*iterBlk)->getNumber(); - dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; - if (i != 0 && i % 10 == 0) { - dbgs() << "\n"; - } else { - dbgs() << " "; - } - } - } - - static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (const MachineLoop *L : LoopInfo) - L->print(dbgs()); - } - - // UTILITY FUNCTIONS - int getSCCNum(MachineBasicBlock *MBB) const; - MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; - bool hasBackEdge(MachineBasicBlock *MBB) const; - bool isRetiredBlock(MachineBasicBlock *MBB) const; - bool isActiveLoophead(MachineBasicBlock *MBB) const; - PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry = true) const; - int countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const; - bool needMigrateBlock(MachineBasicBlock *MBB) const; - - // Utility Functions - void reversePredicateSetter(MachineBasicBlock::iterator I, - MachineBasicBlock &MBB); - /// Compute the reversed DFS post order of Blocks - void orderBlocks(MachineFunction *MF); - - // Function originally from CFGStructTraits - void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - const DebugLoc &DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - const DebugLoc &DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); - void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - const DebugLoc &DL); - void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, - int RegNum, const DebugLoc &DL); - - static int getBranchNzeroOpcode(int OldOpcode); - static int getBranchZeroOpcode(int OldOpcode); - static int getContinueNzeroOpcode(int OldOpcode); - static int getContinueZeroOpcode(int OldOpcode); - static MachineBasicBlock *getTrueBranch(MachineInstr *MI); - static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); - static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI); - static bool isCondBranch(MachineInstr *MI); - static bool isUncondBranch(MachineInstr *MI); - static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); - static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); - - /// The correct naming for this is getPossibleLoopendBlockBranchInstr. - /// - /// BB with backward-edge could have move instructions after the branch - /// instruction. Such move instruction "belong to" the loop backward-edge. 
- MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); - - static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); - static bool isReturnBlock(MachineBasicBlock *MBB); - static void cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB); - static MachineBasicBlock *clone(MachineBasicBlock *MBB); - - /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose - /// because the AMDGPU instruction is not recognized as terminator fix this - /// and retire this routine - void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, - MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); - - static void wrapup(MachineBasicBlock *MBB); - - int patternMatch(MachineBasicBlock *MBB); - int patternMatchGroup(MachineBasicBlock *MBB); - int serialPatternMatch(MachineBasicBlock *MBB); - int ifPatternMatch(MachineBasicBlock *MBB); - int loopendPatternMatch(); - int mergeLoop(MachineLoop *LoopRep); - - /// return true iff src1Blk->succ_empty() && src1Blk and src2Blk are in - /// the same loop with LoopLandInfo without explicitly keeping track of - /// loopContBlks and loopBreakBlks, this is a method to get the information. - bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, - MachineBasicBlock *Src2MBB); - int handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr); - void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock *LandMBB, bool Detail = false); - int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); - void mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB); - - void mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); - void mergeLooplandBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *LandMBB); - void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB); - void settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB); - - /// normalizeInfiniteLoopExit change - /// B1: - /// uncond_br LoopHeader - /// - /// to - /// B1: - /// cond_br 1 LoopHeader dummyExit - /// and return the newly added dummy exit block - MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); - void removeUnconditionalBranch(MachineBasicBlock *MBB); - - /// Remove duplicate branches instructions in a block. 
- /// For instance - /// B0: - /// cond_br X B1 B2 - /// cond_br X B1 B2 - /// is transformed to - /// B0: - /// cond_br X B1 B2 - void removeRedundantConditionalBranch(MachineBasicBlock *MBB); - - void addDummyExitBlock(SmallVectorImpl &RetMBB); - void removeSuccessor(MachineBasicBlock *MBB); - MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB); - void migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); - void recordSccnum(MachineBasicBlock *MBB, int SCCNum); - void retireBlock(MachineBasicBlock *MBB); - -private: - MBBInfoMap BlockInfoMap; - LoopLandInfoMap LLInfoMap; - std::map Visited; - MachineFunction *FuncRep; - SmallVector OrderedBlks; -}; - -} // end anonymous namespace - -char AMDGPUCFGStructurizer::ID = 0; - -int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return INVALIDSCCNUM; - return (*It).second->SccNum; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) - const { - LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); - if (It == LLInfoMap.end()) - return nullptr; - return (*It).second; -} - -bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - if (!LoopRep) - return false; - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - return MBB->isSuccessor(LoopHeader); -} - -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return false; - return (*It).second->IsRetired; -} - -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - while (LoopRep && LoopRep->getHeader() == MBB) { - MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); - if(!LoopLand) - return true; - if (!isRetiredBlock(LoopLand)) - return true; - LoopRep = LoopRep->getParentLoop(); - } - return false; -} - -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry) const { - assert(DstMBB); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - while (SrcMBB && SrcMBB->succ_size() == 1) { - SrcMBB = *SrcMBB->succ_begin(); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - if (!AllowSideEntry && SrcMBB->pred_size() > 1) - return Not_SinglePath; - } - if (SrcMBB && SrcMBB->succ_size()==0) - return SinglePath_NotInPath; - return Not_SinglePath; -} - -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const { - int Count = 0; - while (It != E) { - if (!isRetiredBlock(*It)) - ++Count; - ++It; - } - return Count; -} - -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { - unsigned BlockSizeThreshold = 30; - unsigned CloneInstrThreshold = 100; - bool MultiplePreds = MBB && (MBB->pred_size() > 1); - - if(!MultiplePreds) - return false; - unsigned BlkSize = MBB->size(); - return ((BlkSize > BlockSizeThreshold) && - (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); -} - -void AMDGPUCFGStructurizer::reversePredicateSetter( - MachineBasicBlock::iterator I, MachineBasicBlock &MBB) { - assert(I.isValid() && "Expected valid iterator"); - for (;; --I) { - if (I == MBB.end()) - continue; - if (I->getOpcode() == R600::PRED_X) { - switch 
(I->getOperand(2).getImm()) { - case R600::PRED_SETE_INT: - I->getOperand(2).setImm(R600::PRED_SETNE_INT); - return; - case R600::PRED_SETNE_INT: - I->getOperand(2).setImm(R600::PRED_SETE_INT); - return; - case R600::PRED_SETE: - I->getOperand(2).setImm(R600::PRED_SETNE); - return; - case R600::PRED_SETNE: - I->getOperand(2).setImm(R600::PRED_SETE); - return; - default: - llvm_unreachable("PRED_X Opcode invalid!"); - } - } - } -} - -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, const DebugLoc &DL) { - MachineInstr *MI = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->push_back(MI); - //assume the instruction doesn't take any reg operand ... - SHOWNEWINSTR(MI); -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, - const DebugLoc &DL) { - MachineInstr *MI = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); - if (!MBB->empty()) - MBB->insert(MBB->begin(), MI); - else - MBB->push_back(MI); - SHOWNEWINSTR(MI); - return MI; -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( - MachineBasicBlock::iterator I, int NewOpcode) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineInstr *NewMBB = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->insert(I, NewMBB); - //assume the instruction doesn't take any reg operand ... - SHOWNEWINSTR(NewMBB); - return NewMBB; -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->insert(I, NewMI); - MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); - SHOWNEWINSTR(NewMI); - //erase later oldInstr->eraseFromParent(); -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, - int RegNum, const DebugLoc &DL) { - MachineFunction *MF = blk->getParent(); - MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - //insert before - blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case R600::JUMP_COND: - case R600::JUMP: return R600::IF_PREDICATE_SET; - case R600::BRANCH_COND_i32: - case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case R600::JUMP_COND: - case R600::JUMP: return R600::IF_PREDICATE_SET; - case R600::BRANCH_COND_i32: - case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case R600::JUMP_COND: - case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case R600::JUMP_COND: - case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -MachineBasicBlock 
*AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { - return MI->getOperand(0).getMBB(); -} - -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, - MachineBasicBlock *MBB) { - MI->getOperand(0).setMBB(MBB); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI) { - assert(MBB->succ_size() == 2); - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - MachineBasicBlock::succ_iterator It = MBB->succ_begin(); - MachineBasicBlock::succ_iterator Next = It; - ++Next; - return (*It == TrueBranch) ? *Next : *It; -} - -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case R600::JUMP_COND: - case R600::BRANCH_COND_i32: - case R600::BRANCH_COND_f32: return true; - default: - return false; - } - return false; -} - -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case R600::JUMP: - case R600::BRANCH: - return true; - default: - return false; - } - return false; -} - -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { - //get DebugLoc from the first MachineBasicBlock instruction with debug info - DebugLoc DL; - for (MachineInstr &MI : *MBB) - if (MI.getDebugLoc()) - DL = MI.getDebugLoc(); - return DL; -} - -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( - MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - MachineInstr *MI = &*It; - if (MI && (isCondBranch(MI) || isUncondBranch(MI))) - return MI; - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( - MachineBasicBlock *MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); - It != E; ++It) { - // FIXME: Simplify - MachineInstr *MI = &*It; - if (MI) { - if (isCondBranch(MI) || isUncondBranch(MI)) - return MI; - else if (!TII->isMov(MI->getOpcode())) - break; - } - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *instr = &(*It); - if (instr->getOpcode() == R600::RETURN) - return instr; - } - return nullptr; -} - -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { - MachineInstr *MI = getReturnInstr(MBB); - bool IsReturn = MBB->succ_empty(); - if (MI) - assert(IsReturn); - else if (IsReturn) - LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber() - << " is return block without RETURN instr\n";); - return IsReturn; -} - -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - for (MachineBasicBlock *Succ : SrcMBB->successors()) - DstMBB->addSuccessor(Succ); // *iter's predecessor is also taken care of -} - -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { - MachineFunction *Func = MBB->getParent(); - MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); - Func->push_back(NewMBB); //insert to function - for (const MachineInstr &It : *MBB) - NewMBB->push_back(Func->CloneMachineInstr(&It)); - return NewMBB; -} - -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( - MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, - MachineBasicBlock *NewBlk) { - MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); - if (BranchMI && isCondBranch(BranchMI) && - getTrueBranch(BranchMI) == OldMBB) - setTrueBranch(BranchMI, NewBlk); -} - -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { - 
assert((!MBB->getParent()->getJumpTableInfo() - || MBB->getParent()->getJumpTableInfo()->isEmpty()) - && "found a jump table"); - - //collect continue right before endloop - SmallVector ContInstr; - MachineBasicBlock::iterator Pre = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator It = Pre; - while (It != E) { - if (Pre->getOpcode() == R600::CONTINUE - && It->getOpcode() == R600::ENDLOOP) - ContInstr.push_back(&*Pre); - Pre = It; - ++It; - } - - //delete continue right before endloop - for (unsigned i = 0; i < ContInstr.size(); ++i) - ContInstr[i]->eraseFromParent(); - - // TODO to fix up jump table so later phase won't be confused. if - // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but - // there isn't such an interface yet. alternatively, replace all the other - // blocks in the jump table with the entryBlk //} -} - -bool AMDGPUCFGStructurizer::prepare() { - bool Changed = false; - - //FIXME: if not reducible flow graph, make it so ??? - - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); - - orderBlocks(FuncRep); - - SmallVector RetBlks; - - // Add an ExitBlk to loop that don't have one - for (MachineLoop *LoopRep : *MLI) { - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - - if (ExitingMBBs.size() == 0) { - MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); - if (DummyExitBlk) - RetBlks.push_back(DummyExitBlk); - } - } - - // Remove unconditional branch instr. - // Add dummy exit block iff there are multiple returns. - for (MachineBasicBlock *MBB : OrderedBlks) { - removeUnconditionalBranch(MBB); - removeRedundantConditionalBranch(MBB); - if (isReturnBlock(MBB)) { - RetBlks.push_back(MBB); - } - assert(MBB->succ_size() <= 2); - } - - if (RetBlks.size() >= 2) { - addDummyExitBlock(RetBlks); - Changed = true; - } - - return Changed; -} - -bool AMDGPUCFGStructurizer::run() { - //Assume reducible CFG... - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); - -#ifdef STRESSTEST - //Use the worse block ordering to test the algorithm. - ReverseVector(orderedBlks); -#endif - - LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); - int NumIter = 0; - bool Finish = false; - MachineBasicBlock *MBB; - bool MakeProgress = false; - int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), - OrderedBlks.end()); - - do { - ++NumIter; - LLVM_DEBUG(dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n";); - - SmallVectorImpl::const_iterator It = - OrderedBlks.begin(); - SmallVectorImpl::const_iterator E = - OrderedBlks.end(); - - SmallVectorImpl::const_iterator SccBeginIter = - It; - MachineBasicBlock *SccBeginMBB = nullptr; - int SccNumBlk = 0; // The number of active blocks, init to a - // maximum possible number. - int SccNumIter; // Number of iteration in this SCC. - - while (It != E) { - MBB = *It; - - if (!SccBeginMBB) { - SccBeginIter = It; - SccBeginMBB = MBB; - SccNumIter = 0; - SccNumBlk = NumRemainedBlk; // Init to maximum possible number. - LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n";); - } - - if (!isRetiredBlock(MBB)) - patternMatch(MBB); - - ++It; - - bool ContNextScc = true; - if (It == E - || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { - // Just finish one scc. 
- ++SccNumIter; - int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); - if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n";); - ContNextScc = true; - } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { - SccNumBlk = sccRemainedNumBlk; - It = SccBeginIter; - ContNextScc = false; - LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n';); - } else { - // Finish the current scc. - ContNextScc = true; - } - } else { - // Continue on next component in the current scc. - ContNextScc = false; - } - - if (ContNextScc) - SccBeginMBB = nullptr; - } //while, "one iteration" over the function. - - MachineBasicBlock *EntryMBB = - *GraphTraits::nodes_begin(FuncRep); - if (EntryMBB->succ_empty()) { - Finish = true; - LLVM_DEBUG(dbgs() << "Reduce to one block\n";); - } else { - int NewnumRemainedBlk - = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); - // consider cloned blocks ?? - if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { - MakeProgress = true; - NumRemainedBlk = NewnumRemainedBlk; - } else { - MakeProgress = false; - LLVM_DEBUG(dbgs() << "No progress\n";); - } - } - } while (!Finish && MakeProgress); - - // Misc wrap up to maintain the consistency of the Function representation. - wrapup(*GraphTraits::nodes_begin(FuncRep)); - - // Detach retired Block, release memory. - for (auto &It : BlockInfoMap) { - if (It.second && It.second->IsRetired) { - assert((It.first)->getNumber() != -1); - LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); - It.first->eraseFromParent(); // Remove from the parent Function. 
- } - delete It.second; - } - BlockInfoMap.clear(); - LLInfoMap.clear(); - - if (!Finish) { - LLVM_DEBUG(FuncRep->viewCFG()); - report_fatal_error("IRREDUCIBLE_CFG"); - } - - return true; -} - -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { - int SccNum = 0; - for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); - ++It, ++SccNum) { - const std::vector &SccNext = *It; - for (MachineBasicBlock *MBB : SccNext) { - OrderedBlks.push_back(MBB); - recordSccnum(MBB, SccNum); - } - } - - // walk through all the block in func to check for unreachable - for (auto *MBB : nodes(MF)) { - SccNum = getSCCNum(MBB); - if (SccNum == INVALIDSCCNUM) - dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; - } -} - -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { - int NumMatch = 0; - int CurMatch; - - LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";); - - while ((CurMatch = patternMatchGroup(MBB)) > 0) - NumMatch += CurMatch; - - LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n";); - - return NumMatch; -} - -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { - int NumMatch = 0; - NumMatch += loopendPatternMatch(); - NumMatch += serialPatternMatch(MBB); - NumMatch += ifPatternMatch(MBB); - return NumMatch; -} - -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { - if (MBB->succ_size() != 1) - return 0; - - MachineBasicBlock *childBlk = *MBB->succ_begin(); - if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) - return 0; - - mergeSerialBlock(MBB, childBlk); - ++numSerialPatternMatch; - return 1; -} - -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { - //two edges - if (MBB->succ_size() != 2) - return 0; - if (hasBackEdge(MBB)) - return 0; - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - if (!BranchMI) - return 0; - - assert(isCondBranch(BranchMI)); - int NumMatch = 0; - - MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); - NumMatch += serialPatternMatch(TrueMBB); - NumMatch += ifPatternMatch(TrueMBB); - MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); - NumMatch += serialPatternMatch(FalseMBB); - NumMatch += ifPatternMatch(FalseMBB); - MachineBasicBlock *LandBlk; - int Cloned = 0; - - assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); - // TODO: Simplify - if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 - && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { - // Diamond pattern - LandBlk = *TrueMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { - // Triangle pattern, false is empty - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && *FalseMBB->succ_begin() == TrueMBB) { - // Triangle pattern, true is empty - // We reverse the predicate to make a triangle, empty false pattern; - std::swap(TrueMBB, FalseMBB); - reversePredicateSetter(MBB->end(), *MBB); - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { - LandBlk = *FalseMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 - && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { - LandBlk = *TrueMBB->succ_begin(); - } else { - return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); - } - - // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the - // new BB created for landBlk==NULL may introduce new challenge to the - // reduction 
process. - if (LandBlk && - ((TrueMBB && TrueMBB->pred_size() > 1) - || (FalseMBB && FalseMBB->pred_size() > 1))) { - Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); - } - - if (TrueMBB && TrueMBB->pred_size() > 1) { - TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); - ++Cloned; - } - - if (FalseMBB && FalseMBB->pred_size() > 1) { - FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); - ++Cloned; - } - - mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); - - ++numIfPatternMatch; - - numClonedBlock += Cloned; - - return 1 + Cloned + NumMatch; -} - -int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::deque NestedLoops; - for (auto &It: *MLI) - for (MachineLoop *ML : depth_first(It)) - NestedLoops.push_front(ML); - - if (NestedLoops.empty()) - return 0; - - // Process nested loop outside->inside (we did push_front), - // so "continue" to a outside loop won't be mistaken as "break" - // of the current loop. - int Num = 0; - for (MachineLoop *ExaminedLoop : NestedLoops) { - if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) - continue; - LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); - int NumBreak = mergeLoop(ExaminedLoop); - if (NumBreak == -1) - break; - Num += NumBreak; - } - return Num; -} - -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() - << " exiting blocks\n";); - // We assume a single ExitBlk - MBBVector ExitBlks; - LoopRep->getExitBlocks(ExitBlks); - SmallPtrSet ExitBlkSet; - for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) - ExitBlkSet.insert(ExitBlks[i]); - assert(ExitBlkSet.size() == 1); - MachineBasicBlock *ExitBlk = *ExitBlks.begin(); - assert(ExitBlk && "Loop has several exit block"); - MBBVector LatchBlks; - for (auto *LB : inverse_children(LoopHeader)) - if (LoopRep->contains(LB)) - LatchBlks.push_back(LB); - - for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) - mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); - for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) - settleLoopcontBlock(LatchBlks[i], LoopHeader); - int Match = 0; - do { - Match = 0; - Match += serialPatternMatch(LoopHeader); - Match += ifPatternMatch(LoopHeader); - } while (Match > 0); - mergeLooplandBlock(LoopHeader, ExitBlk); - MachineLoop *ParentLoop = LoopRep->getParentLoop(); - if (ParentLoop) - MLI->changeLoopFor(LoopHeader, ParentLoop); - else - MLI->removeBlock(LoopHeader); - Visited[LoopRep] = true; - return 1; -} - -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( - MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { - if (Src1MBB->succ_empty()) { - MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); - if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { - MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; - if (TheEntry) { - LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() << " src2 = BB" - << Src2MBB->getNumber() << "\n";); - return true; - } - } - } - return false; -} - -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); - if (Num == 0) { - LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" - << "\n";); - Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); - } - 
return Num; -} - -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = 0; - MachineBasicBlock *DownBlk; - - //trueBlk could be the common post dominator - DownBlk = TrueMBB; - - LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() << " false = BB" - << FalseMBB->getNumber() << "\n";); - - while (DownBlk) { - LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber();); - - if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - LLVM_DEBUG(dbgs() << " working\n";); - - Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); - Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); - - numClonedBlock += Num; - Num += serialPatternMatch(*HeadMBB->succ_begin()); - Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); - Num += ifPatternMatch(HeadMBB); - assert(Num > 0); - - break; - } - LLVM_DEBUG(dbgs() << " not working\n";); - DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; - } // walk down the postDomTree - - return Num; -} - -#ifndef NDEBUG -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( - MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { - dbgs() << "head = BB" << HeadMBB->getNumber() - << " size = " << HeadMBB->size(); - if (Detail) { - dbgs() << "\n"; - HeadMBB->print(dbgs()); - dbgs() << "\n"; - } - - if (TrueMBB) { - dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " - << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - TrueMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (FalseMBB) { - dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " - << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - FalseMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (LandMBB) { - dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " - << LandMBB->size() << " numPred = " << LandMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - LandMBB->print(dbgs()); - dbgs() << "\n"; - } - } - - dbgs() << "\n"; -} -#endif - -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr) { - bool MigrateTrue = false; - bool MigrateFalse = false; - - MachineBasicBlock *LandBlk = *LandMBBPtr; - - assert((!TrueMBB || TrueMBB->succ_size() <= 1) - && (!FalseMBB || FalseMBB->succ_size() <= 1)); - - if (TrueMBB == FalseMBB) - return 0; - - MigrateTrue = needMigrateBlock(TrueMBB); - MigrateFalse = needMigrateBlock(FalseMBB); - - if (!MigrateTrue && !MigrateFalse) - return 0; - - // If we need to migrate either trueBlk and falseBlk, migrate the rest that - // have more than one predecessors. without doing this, its predecessor - // rather than headBlk will have undefined value in initReg. 
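A hedged C++ analogy for the comment above, with armTrue/armFalse/armOther as hypothetical stand-ins for the three arms: initReg is assigned only on the edges leaving headBlk, so a surviving side entry into trueBlk or falseBlk would reach the landing-block test along a path that never defines it; migrating such blocks, or guarding with the sentinel value 2 as the comments below describe for extra landBlk predecessors, keeps every path defined.

    int armTrue(), armFalse(), armOther(); // hypothetical stand-ins

    int selectArm(bool FromHead, bool Cond) {
      int InitReg = 2;            // sentinel: "entered from elsewhere"
      if (FromHead)
        InitReg = Cond ? 1 : 0;   // defined only on headBlk's two edges
      if (InitReg != 2)           // landBlk's reconstructed if-else
        return InitReg == 1 ? armTrue() : armFalse();
      return armOther();
    }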
- if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) - MigrateTrue = true; - if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) - MigrateFalse = true; - - LLVM_DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); - - // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk - // - // new: headBlk => if () {initReg = 1; org trueBlk branch} else - // {initReg = 0; org falseBlk branch } - // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} - // => org landBlk - // if landBlk->pred_size() > 2, put the about if-else inside - // if (initReg !=2) {...} - // - // add initReg = initVal to headBlk - - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - if (!MigrateTrue || !MigrateFalse) { - // XXX: We have an opportunity here to optimize the "branch into if" case - // here. Branch into if looks like this: - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true - // \ / - // done - // - // The diamond_head block begins the "if" and the diamond_true block - // is the block being "branched into". - // - // If MigrateTrue is true, then TrueBB is the block being "branched into" - // and if MigrateFalse is true, then FalseBB is the block being - // "branched into" - // - // Here is the pseudo code for how I think the optimization should work: - // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. - // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. - // 3. Move the branch instruction from diamond_head into its own basic - // block (new_block). - // 4. Add an unconditional branch from diamond_head to new_block - // 5. Replace the branch instruction in branch_from with an unconditional - // branch to new_block. If branch_from has multiple predecessors, then - // we need to replace the True/False block in the branch - // instruction instead of replacing it. - // 6. Change the condition of the branch instruction in new_block from - // COND to (COND || GPR0) - // - // In order insert these MOV instruction, we will need to use the - // RegisterScavenger. Usually liveness stops being tracked during - // the late machine optimization passes, however if we implement - // bool TargetRegisterInfo::requiresRegisterScavenging( - // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly - // by generic optimization passes. We will also need to make sure that - // all of our target-specific passes that run after regalloc and before - // the CFGStructurizer track liveness and we will need to modify this pass - // to correctly track liveness. - // - // After the above changes, the new CFG should look like this: - // entry - // / | - // diamond_head branch_from - // \ / - // new_block - // / | - // diamond_false diamond_true - // \ / - // done - // - // Without this optimization, we are forced to duplicate the diamond_true - // block and we will end up with a CFG like this: - // - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true diamond_true (duplicate) - // \ / | - // done --------------------| - // - // Duplicating diamond_true can be very costly especially if it has a - // lot of instructions. 
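The six numbered steps compress into a small scalar model (a sketch only; diamondTrue/diamondFalse are hypothetical stand-ins for the two arms): a boolean flag plays the role of GPR0, and the branch that would live in new_block tests the original condition OR'd with that flag, so the branched-into block is reached from both paths without being cloned.

    bool diamondTrue(), diamondFalse(); // hypothetical stand-ins

    bool runDiamond(bool FromBranchFrom, bool Cond) {
      bool Gpr0 = false;        // step 1: MOV GPR0, 0 in diamond_head
      if (FromBranchFrom)
        Gpr0 = true;            // step 2: MOV GPR0, 1 in branch_from
      if (Cond || Gpr0)         // steps 3-6: new_block branches on COND || GPR0
        return diamondTrue();   // single copy of the branched-into block
      return diamondFalse();
    }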
- return 0; - } - - int NumNewBlk = 0; - - bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - - //insert R600::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF); - - if (LandBlkHasOtherPred) { - report_fatal_error("Extra register needed to handle CFG"); - Register CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - report_fatal_error("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, - CmpResReg, DebugLoc()); - } - - // XXX: We are running this after RA, so creating virtual registers will - // cause an assertion failure in the PostRA scheduling pass. - Register InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, - DebugLoc()); - - if (MigrateTrue) { - migrateInstruction(TrueMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 1). - report_fatal_error("Extra register needed to handle CFG"); - } - insertInstrBefore(I, R600::ELSE); - - if (MigrateFalse) { - migrateInstruction(FalseMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 0) - report_fatal_error("Extra register needed to handle CFG"); - } - - if (LandBlkHasOtherPred) { - // add endif - insertInstrBefore(I, R600::ENDIF); - - // put initReg = 2 to other predecessors of landBlk - for (MachineBasicBlock *MBB : LandBlk->predecessors()) - if (MBB != TrueMBB && MBB != FalseMBB) - report_fatal_error("Extra register needed to handle CFG"); - } - LLVM_DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); - - // update landBlk - *LandMBBPtr = LandBlk; - - return NumNewBlk; -} - -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB" - << SrcMBB->getNumber() << "\n";); - DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - - DstMBB->removeSuccessor(SrcMBB, true); - cloneSuccessorList(DstMBB, SrcMBB); - - removeSuccessor(SrcMBB); - MLI->removeBlock(SrcMBB); - retireBlock(SrcMBB); -} - -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { - assert (TrueMBB); - LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ "; - if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs() - << " } else "; - dbgs() << "{ "; if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } dbgs() << " }\n "; - dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else { - dbgs() << "BB" << LandMBB->getNumber(); - } dbgs() << "\n";); - - int OldOpcode = BranchMI->getOpcode(); - DebugLoc BranchDL = BranchMI->getDebugLoc(); - -// transform to -// if cond -// trueBlk -// else -// falseBlk -// endif -// landBlk - - MachineBasicBlock::iterator I = BranchMI; - insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), - BranchDL); - - if (TrueMBB) { - MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB, true); - if (LandMBB && 
TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB, true); - retireBlock(TrueMBB); - MLI->removeBlock(TrueMBB); - } - - if (FalseMBB) { - insertInstrBefore(I, R600::ELSE); - MBB->splice(I, FalseMBB, FalseMBB->begin(), - FalseMBB->end()); - MBB->removeSuccessor(FalseMBB, true); - if (LandMBB && !FalseMBB->succ_empty()) - FalseMBB->removeSuccessor(LandMBB, true); - retireBlock(FalseMBB); - MLI->removeBlock(FalseMBB); - } - insertInstrBefore(I, R600::ENDIF); - - BranchMI->eraseFromParent(); - - if (LandMBB && TrueMBB && FalseMBB) - MBB->addSuccessor(LandMBB); -} - -void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, - MachineBasicBlock *LandMBB) { - LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); - - insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc()); - DstBlk->replaceSuccessor(DstBlk, LandMBB); -} - -void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB) { - LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB" - << ExitingMBB->getNumber() << " land = BB" - << LandMBB->getNumber() << "\n";); - MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); - assert(BranchMI && isCondBranch(BranchMI)); - DebugLoc DL = BranchMI->getDebugLoc(); - MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); - MachineBasicBlock::iterator I = BranchMI; - if (TrueBranch != LandMBB) - reversePredicateSetter(I, *I->getParent()); - insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL); - insertInstrBefore(I, R600::BREAK); - insertInstrBefore(I, R600::ENDIF); - //now branchInst can be erase safely - BranchMI->eraseFromParent(); - //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB, true); -} - -void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB) { - LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB" - << ContingMBB->getNumber() << ", cont = BB" - << ContMBB->getNumber() << "\n";); - - MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); - if (MI) { - assert(isCondBranch(MI)); - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - int OldOpcode = MI->getOpcode(); - DebugLoc DL = MI->getDebugLoc(); - - bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); - - if (!UseContinueLogical) { - int BranchOpcode = - TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : - getBranchZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, R600::CONTINUE, DL); - insertInstrEnd(ContingMBB, R600::ENDIF, DL); - } else { - int BranchOpcode = - TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : - getContinueZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - } - - MI->eraseFromParent(); - } else { - // if we've arrived here then we've already erased the branch instruction - // travel back up the basic block to see the last reference of our debug - // location we've just inserted that reference here so it should be - // representative insertEnd to ensure phi-moves, if exist, go before the - // continue-instr. 
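In source terms, settleLoopcontBlock rewrites the latch's branch-back-to-header as a structured continue, fused into a single CONTINUE_LOGICAL* when the branch is the block's last instruction and wrapped in IF/CONTINUE/ENDIF otherwise. A loose C++ analogy (loopActive/step/predicate are hypothetical):

    bool loopActive(), predicate();
    void step();

    void latchShape() {
      while (loopActive()) {
        step();
        if (predicate())   // TrueBranch == ContMBB: branch-on-nonzero form
          continue;        // CONTINUE, fused or inside IF ... ENDIF
        break;             // fall-through path, handled by the break pattern
      }
    }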
- insertInstrEnd(ContingMBB, R600::CONTINUE, - getLastDebugLocInBB(ContingMBB)); - } -} - -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { - int Cloned = 0; - assert(PreMBB->isSuccessor(SrcMBB)); - while (SrcMBB && SrcMBB != DstMBB) { - assert(SrcMBB->succ_size() == 1); - if (SrcMBB->pred_size() > 1) { - SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); - ++Cloned; - } - - PreMBB = SrcMBB; - SrcMBB = *SrcMBB->succ_begin(); - } - - return Cloned; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB) { - assert(PredMBB->isSuccessor(MBB) && - "succBlk is not a prececessor of curBlk"); - - MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions - replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); - //srcBlk, oldBlk, newBlk - - PredMBB->replaceSuccessor(MBB, CloneMBB); - - // add all successor to cloneBlk - cloneSuccessorList(CloneMBB, MBB); - - numClonedInstr += MBB->size(); - - LLVM_DEBUG(dbgs() << "Cloned block: " - << "BB" << MBB->getNumber() << "size " << MBB->size() - << "\n";); - - SHOWNEWBLK(CloneMBB, "result of Cloned block: "); - - return CloneMBB; -} - -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator SpliceEnd; - //look for the input branchinstr, not the AMDGPU branchinstr - MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); - if (!BranchMI) { - LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";); - SpliceEnd = SrcMBB->end(); - } else { - LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); - SpliceEnd = BranchMI; - } - LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = " - << DstMBB->size() << "srcSize = " << SrcMBB->size() - << "\n";); - - //splice insert before insertPos - DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); - - LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = " - << DstMBB->size() << "srcSize = " << SrcMBB->size() - << '\n';); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); - - if (!LoopHeader || !LoopLatch) - return nullptr; - MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); - // Is LoopRep an infinite loop ? - if (!BranchMI || !isUncondBranch(BranchMI)) - return nullptr; - - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext(); - Ctx.emitError("Extra register needed to handle CFG"); - return nullptr; -} - -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { - MachineInstr *BranchMI; - - // I saw two unconditional branch in one basic block in example - // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. 
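The rewrite documented at normalizeInfiniteLoopExit's declaration gives an infinite loop a never-taken exit edge, so every loop the structurizer sees has an exit block; in source terms (body is a hypothetical stand-in):

    void body();

    void normalized() {
      while (true) {   // was: uncond_br LoopHeader
        body();
        if (false)     // now: cond_br 1, LoopHeader, DummyExitBlk
          break;       // DummyExitBlk, never reached at run time
      }
    }

In this snapshot the R600 path can no longer materialize the constant-1 predicate register, which is apparently why the function above emits an error and returns nullptr instead of performing the rewrite.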
- while ((BranchMI = getLoopendBlockBranchInstr(MBB)) - && isUncondBranch(BranchMI)) { - LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); - BranchMI->eraseFromParent(); - } -} - -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( - MachineBasicBlock *MBB) { - if (MBB->succ_size() != 2) - return; - MachineBasicBlock *MBB1 = *MBB->succ_begin(); - MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); - if (MBB1 != MBB2) - return; - - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - assert(BranchMI && isCondBranch(BranchMI)); - LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); - BranchMI->eraseFromParent(); - SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1, true); -} - -void AMDGPUCFGStructurizer::addDummyExitBlock( - SmallVectorImpl &RetMBB) { - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, R600::RETURN); - - for (MachineBasicBlock *MBB : RetMBB) { - if (MachineInstr *MI = getReturnInstr(MBB)) - MI->eraseFromParent(); - MBB->addSuccessor(DummyExitBlk); - LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n";); - } - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); -} - -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { - while (MBB->succ_size()) - MBB->removeSuccessor(*MBB->succ_begin()); -} - -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, - int SccNum) { - BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; - if (!srcBlkInfo) - srcBlkInfo = new BlockInformation(); - srcBlkInfo->SccNum = SccNum; -} - -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); - - BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; - - if (!SrcBlkInfo) - SrcBlkInfo = new BlockInformation(); - - SrcBlkInfo->IsRetired = true; - assert(MBB->succ_empty() && MBB->pred_empty() && "can't retire block yet"); -} - -INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) - -FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { - return new AMDGPUCFGStructurizer(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h index 654153ea5151..8e5f966b7c6c 100644 --- a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -142,7 +142,7 @@ enum amd_code_property_mask_t { /// is provided to the finalizer when it is invoked and is recorded /// here. The hardware will interleave the memory requests of each /// lane of a wavefront by this element size to ensure each - /// work-item gets a distinct memory memory location. Therefore, the + /// work-item gets a distinct memory location. Therefore, the /// finalizer ensures that all load and store operations done to /// private memory do not exceed this size. 
For example, if the /// element size is 4 (32-bits or dword) and a 64-bit value must be diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ffe626513d47..e12d0ffef35c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -20,10 +20,13 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" @@ -33,6 +36,7 @@ #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetParser.h" using namespace llvm; @@ -120,12 +124,6 @@ public: ImmTyD16, ImmTyClampSI, ImmTyOModSI, - ImmTyDPP8, - ImmTyDppCtrl, - ImmTyDppRowMask, - ImmTyDppBankMask, - ImmTyDppBoundCtrl, - ImmTyDppFi, ImmTySdwaDstSel, ImmTySdwaSrc0Sel, ImmTySdwaSrc1Sel, @@ -151,6 +149,12 @@ public: ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyDPP8, + ImmTyDppCtrl, + ImmTyDppRowMask, + ImmTyDppBankMask, + ImmTyDppBoundCtrl, + ImmTyDppFi, ImmTySwizzle, ImmTyGprIdxMode, ImmTyHigh, @@ -158,6 +162,8 @@ public: ImmTyCBSZ, ImmTyABID, ImmTyEndpgm, + ImmTyWaitVDST, + ImmTyWaitEXP, }; enum ImmKindTy { @@ -262,6 +268,14 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32); } + bool isRegOrInlineImmWithInt16InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i16); + } + + bool isRegOrInlineImmWithInt32InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i32); + } + bool isRegOrImmWithInt64InputMods() const { return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64); } @@ -278,6 +292,15 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isRegOrInlineImmWithFP16InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16); + } + + bool isRegOrInlineImmWithFP32InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32); + } + + bool isVReg() const { return isRegClass(AMDGPU::VGPR_32RegClassID) || isRegClass(AMDGPU::VReg_64RegClassID) || @@ -815,6 +838,8 @@ public: } bool isSWaitCnt() const; + bool isDepCtr() const; + bool isSDelayAlu() const; bool isHwreg() const; bool isSendMsg() const; bool isSwizzle() const; @@ -830,6 +855,8 @@ public: bool isS16Imm() const; bool isU16Imm() const; bool isEndpgm() const; + bool isWaitVDST() const; + bool isWaitEXP() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -1037,6 +1064,8 @@ public: case ImmTyCBSZ: OS << "CBSZ"; break; case ImmTyABID: OS << "ABID"; break; case ImmTyEndpgm: OS << "Endpgm"; break; + case ImmTyWaitVDST: OS << "WaitVDST"; break; + case ImmTyWaitEXP: OS << "WaitEXP"; break; } } @@ -1123,7 +1152,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { class KernelScopeInfo { int SgprIndexUnusedMin = -1; int VgprIndexUnusedMin = -1; + int AgprIndexUnusedMin = -1; MCContext *Ctx = nullptr; + MCSubtargetInfo const *MSTI = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -1142,7 +1173,31 @@ class KernelScopeInfo { if (Ctx) { 
MCSymbol* const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); - Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); + } + } + } + + void usesAgprAt(int i) { + // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction + if (!hasMAIInsts(*MSTI)) + return; + + if (i >= AgprIndexUnusedMin) { + AgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol* const Sym = + Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count")); + Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx)); + + // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a) + MCSymbol* const vSym = + Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); } } } @@ -1152,16 +1207,29 @@ public: void initialize(MCContext &Context) { Ctx = &Context; + MSTI = Ctx->getSubtargetInfo(); + usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); + if (hasMAIInsts(*MSTI)) { + usesAgprAt(AgprIndexUnusedMin = -1); + } } - void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { + void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, + unsigned RegWidth) { switch (RegKind) { - case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; - case IS_AGPR: // fall through - case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; - default: break; + case IS_SGPR: + usesSgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + case IS_AGPR: + usesAgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + case IS_VGPR: + usesVgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + default: + break; } } }; @@ -1353,10 +1421,15 @@ public: return AMDGPU::isGFX9(getSTI()); } + // TODO: isGFX90A is also true for GFX940. We need to clean it. 
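With register widths now tracked in bits rather than dword counts, the last used dword index falls out of divideCeil. A quick check with a hypothetical operand v[4:7], i.e. DwordRegIndex = 4 and RegWidth = 128:

    #include "llvm/Support/MathExtras.h"

    unsigned LastIdx = 4 + llvm::divideCeil(128, 32) - 1; // = 7 -> usesVgprAt(7)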
bool isGFX90A() const { return AMDGPU::isGFX90A(getSTI()); } + bool isGFX940() const { + return AMDGPU::isGFX940(getSTI()); + } + bool isGFX9Plus() const { return AMDGPU::isGFX9Plus(getSTI()); } @@ -1367,6 +1440,14 @@ public: bool isGFX10Plus() const { return AMDGPU::isGFX10Plus(getSTI()); } + bool isGFX11() const { + return AMDGPU::isGFX11(getSTI()); + } + + bool isGFX11Plus() const { + return AMDGPU::isGFX11Plus(getSTI()); + } + bool isGFX10_BEncoding() const { return AMDGPU::isGFX10_BEncoding(getSTI()); } @@ -1496,6 +1577,14 @@ public: bool parseCnt(int64_t &IntVal); OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + + bool parseDepCtr(int64_t &IntVal, unsigned &Mask); + void depCtrError(SMLoc Loc, int ErrorId, StringRef DepCtrName); + OperandMatchResultTy parseDepCtrOps(OperandVector &Operands); + + bool parseDelay(int64_t &Delay); + OperandMatchResultTy parseSDelayAluOps(OperandVector &Operands); + OperandMatchResultTy parseHwreg(OperandVector &Operands); private: @@ -1522,6 +1611,7 @@ private: SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const; + SMLoc getBLGPLoc(const OperandVector &Operands) const; SMLoc getOperandLoc(std::function Test, const OperandVector &Operands) const; @@ -1540,7 +1630,7 @@ private: bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMovrels(const MCInst &Inst, const OperandVector &Operands); - bool validateMIMGDataSize(const MCInst &Inst); + Optional validateMIMGDataSize(const MCInst &Inst); bool validateMIMGAddrSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); bool validateMIMGDim(const MCInst &Inst); @@ -1553,10 +1643,14 @@ private: bool validateMFMA(const MCInst &Inst, const OperandVector &Operands); bool validateAGPRLdSt(const MCInst &Inst) const; bool validateVGPRAlign(const MCInst &Inst) const; + bool validateBLGP(const MCInst &Inst, const OperandVector &Operands); bool validateGWS(const MCInst &Inst, const OperandVector &Operands); bool validateDivScale(const MCInst &Inst); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + bool validateFlatLdsDMA(const MCInst &Inst, const OperandVector &Operands, + const SMLoc &IDLoc); + bool validateExeczVcczOperands(const OperandVector &Operands); Optional validateLdsDirect(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); @@ -1586,7 +1680,7 @@ private: bool parseExpr(int64_t &Imm, StringRef Expected = ""); bool parseExpr(OperandVector &Operands); StringRef getTokenStr() const; - AsmToken peekToken(); + AsmToken peekToken(bool ShouldSkipSpace = true); AsmToken getToken() const; SMLoc getLoc() const; void lex(); @@ -1644,10 +1738,12 @@ public: void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); + void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); void cvtMIMG(MCInst &Inst, const OperandVector &Operands, bool IsAtomic = false); @@ -1668,7 +1764,24 @@ public: AMDGPUOperand::Ptr defaultBoundCtrl() const; AMDGPUOperand::Ptr defaultFI() const; void cvtDPP(MCInst &Inst, const 
OperandVector &Operands, bool IsDPP8 = false); - void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); } + void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtDPP(Inst, Operands, true); + } + void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOPCNoDstDPP(Inst, Operands, true); + } + void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOP3DPP(Inst, Operands, true); + } + void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOPC64NoDstDPP(Inst, Operands, true); + } OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type); @@ -1689,6 +1802,10 @@ public: OperandMatchResultTy parseEndpgmOp(OperandVector &Operands); AMDGPUOperand::Ptr defaultEndpgmImmOperands() const; + + AMDGPUOperand::Ptr defaultWaitVDST() const; + AMDGPUOperand::Ptr defaultWaitEXP() const; + OperandMatchResultTy parseVOPD(OperandVector &Operands); }; struct OptionalOperand { @@ -1897,7 +2014,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also - // require that the literal may be losslesly converted to f16. + // require that the literal may be losslessly converted to f16. MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : (type == MVT::v2i16)? MVT::i16 : (type == MVT::v2f32)? MVT::f32 : type; @@ -2211,52 +2328,86 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::VGPR_32RegClassID; - case 2: return AMDGPU::VReg_64RegClassID; - case 3: return AMDGPU::VReg_96RegClassID; - case 4: return AMDGPU::VReg_128RegClassID; - case 5: return AMDGPU::VReg_160RegClassID; - case 6: return AMDGPU::VReg_192RegClassID; - case 7: return AMDGPU::VReg_224RegClassID; - case 8: return AMDGPU::VReg_256RegClassID; - case 16: return AMDGPU::VReg_512RegClassID; - case 32: return AMDGPU::VReg_1024RegClassID; + case 32: + return AMDGPU::VGPR_32RegClassID; + case 64: + return AMDGPU::VReg_64RegClassID; + case 96: + return AMDGPU::VReg_96RegClassID; + case 128: + return AMDGPU::VReg_128RegClassID; + case 160: + return AMDGPU::VReg_160RegClassID; + case 192: + return AMDGPU::VReg_192RegClassID; + case 224: + return AMDGPU::VReg_224RegClassID; + case 256: + return AMDGPU::VReg_256RegClassID; + case 512: + return AMDGPU::VReg_512RegClassID; + case 1024: + return AMDGPU::VReg_1024RegClassID; } } else if (Is == IS_TTMP) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::TTMP_32RegClassID; - case 2: return AMDGPU::TTMP_64RegClassID; - case 4: return AMDGPU::TTMP_128RegClassID; - case 8: return AMDGPU::TTMP_256RegClassID; - case 16: return AMDGPU::TTMP_512RegClassID; + case 32: + return AMDGPU::TTMP_32RegClassID; + case 64: + return AMDGPU::TTMP_64RegClassID; + case 128: + return AMDGPU::TTMP_128RegClassID; + case 256: + return AMDGPU::TTMP_256RegClassID; + case 512: + return AMDGPU::TTMP_512RegClassID; } } else if (Is == IS_SGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 3: return 
AMDGPU::SGPR_96RegClassID; - case 4: return AMDGPU::SGPR_128RegClassID; - case 5: return AMDGPU::SGPR_160RegClassID; - case 6: return AMDGPU::SGPR_192RegClassID; - case 7: return AMDGPU::SGPR_224RegClassID; - case 8: return AMDGPU::SGPR_256RegClassID; - case 16: return AMDGPU::SGPR_512RegClassID; + case 32: + return AMDGPU::SGPR_32RegClassID; + case 64: + return AMDGPU::SGPR_64RegClassID; + case 96: + return AMDGPU::SGPR_96RegClassID; + case 128: + return AMDGPU::SGPR_128RegClassID; + case 160: + return AMDGPU::SGPR_160RegClassID; + case 192: + return AMDGPU::SGPR_192RegClassID; + case 224: + return AMDGPU::SGPR_224RegClassID; + case 256: + return AMDGPU::SGPR_256RegClassID; + case 512: + return AMDGPU::SGPR_512RegClassID; } } else if (Is == IS_AGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::AGPR_32RegClassID; - case 2: return AMDGPU::AReg_64RegClassID; - case 3: return AMDGPU::AReg_96RegClassID; - case 4: return AMDGPU::AReg_128RegClassID; - case 5: return AMDGPU::AReg_160RegClassID; - case 6: return AMDGPU::AReg_192RegClassID; - case 7: return AMDGPU::AReg_224RegClassID; - case 8: return AMDGPU::AReg_256RegClassID; - case 16: return AMDGPU::AReg_512RegClassID; - case 32: return AMDGPU::AReg_1024RegClassID; + case 32: + return AMDGPU::AGPR_32RegClassID; + case 64: + return AMDGPU::AReg_64RegClassID; + case 96: + return AMDGPU::AReg_96RegClassID; + case 128: + return AMDGPU::AReg_128RegClassID; + case 160: + return AMDGPU::AReg_160RegClassID; + case 192: + return AMDGPU::AReg_192RegClassID; + case 224: + return AMDGPU::AReg_224RegClassID; + case 256: + return AMDGPU::AReg_256RegClassID; + case 512: + return AMDGPU::AReg_512RegClassID; + case 1024: + return AMDGPU::AReg_1024RegClassID; } } return -1; @@ -2343,32 +2494,32 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) { Reg = AMDGPU::XNACK_MASK; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; - RegWidth = 2; + RegWidth = 64; return true; } Error(Loc, "register does not fit in the list"); @@ -2377,11 +2528,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, case IS_SGPR: case IS_AGPR: case IS_TTMP: - if (Reg1 != Reg + RegWidth) { + if (Reg1 != Reg + RegWidth / 32) { Error(Loc, "registers in a list must have consecutive indices"); return false; } - RegWidth++; + RegWidth += 32; return true; default: llvm_unreachable("unexpected register kind"); @@ -2470,7 +2621,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, if (RegKind == IS_SGPR || RegKind == IS_TTMP) { // SGPR and TTMP registers must be aligned. // Max required alignment is 4 dwords. 
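The register-class hunks above change how the assembler tracks register widths: RegWidth is now measured in bits (32, 64, 96, ...) instead of in dword counts (1, 2, 3, ...), and every consumer is rescaled to match (RegWidth += 32, Reg1 != Reg + RegWidth / 32, divideCeil(RegWidth, 32), and so on). A standalone sketch of the invariant, not LLVM code, using a hypothetical dwordsToBits helper:

    #include <cassert>

    // Pre-patch RegWidth counted dwords; post-patch it counts bits.
    static unsigned dwordsToBits(unsigned NumDwords) { return NumDwords * 32; }

    int main() {
      // s[4:7] spans 4 dwords, i.e. 128 bits, matching the new
      // "case 128: return AMDGPU::SGPR_128RegClassID" label above.
      unsigned OldWidth = 4;               // dwords (old unit)
      unsigned NewWidth = dwordsToBits(4); // bits (new unit)
      assert(NewWidth == 128);
      // The consecutive-index check rescales the same way:
      //   old: Reg1 == Reg + OldWidth
      //   new: Reg1 == Reg + NewWidth / 32
      assert(OldWidth == NewWidth / 32);
      return 0;
    }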
- AlignSize = std::min(RegWidth, 4u); + AlignSize = std::min(RegWidth / 32, 4u); } if (RegNum % AlignSize != 0) { @@ -2495,8 +2646,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, return RC.getRegister(RegIdx); } -bool -AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { +bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) { int64_t RegLo, RegHi; if (!skipToken(AsmToken::LBrac, "missing register index")) return false; @@ -2534,7 +2684,7 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { } Num = static_cast<unsigned>(RegLo); - Width = (RegHi - RegLo) + 1; + RegWidth = 32 * ((RegHi - RegLo) + 1); return true; } @@ -2545,7 +2695,7 @@ unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, unsigned Reg = getSpecialRegForName(getTokenStr()); if (Reg) { RegNum = 0; - RegWidth = 1; + RegWidth = 32; RegKind = IS_SPECIAL; Tokens.push_back(getToken()); lex(); // skip register name @@ -2577,7 +2727,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, Error(Loc, "invalid register index"); return AMDGPU::NoRegister; } - RegWidth = 1; + RegWidth = 32; } else { // Range of registers: v[XX:YY]. ":YY" is optional. if (!ParseRegRange(RegNum, RegWidth)) @@ -2603,7 +2753,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, auto Loc = getLoc(); if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) return AMDGPU::NoRegister; - if (RegWidth != 1) { + if (RegWidth != 32) { Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; } @@ -2618,7 +2768,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, Tokens)) { return AMDGPU::NoRegister; } - if (NextRegWidth != 1) { + if (NextRegWidth != 32) { Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; } @@ -2721,7 +2871,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, return true; MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); - int64_t NewMax = DwordRegIndex + RegWidth - 1; + int64_t NewMax = DwordRegIndex + divideCeil(RegWidth, 32) - 1; int64_t OldCount; if (!Sym->isVariable()) @@ -2761,7 +2911,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { // TODO: add syntactic sugar for 1/(2*PI) - assert(!isRegister()); + if (isRegister()) + return MatchOperand_NoMatch; assert(!isModifier()); const auto& Tok = getToken(); @@ -2927,7 +3078,7 @@ AMDGPUAsmParser::isModifier() { // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 // Negative fp literals with preceding "-" are -// handled likewise for unifomtity +// handled likewise for uniformity // bool AMDGPUAsmParser::parseSP3NegModifier() { @@ -3110,7 +3261,8 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { static ArrayRef<unsigned> getAllVariants() { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, + AMDGPUAsmVariants::DPP, AMDGPUAsmVariants::VOP3_DPP }; return makeArrayRef(Variants); @@ -3118,6 +3270,10 @@ static ArrayRef<unsigned> getAllVariants() { // What asm variants we should check ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { + if (isForcedDPP() && isForcedVOP3()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3_DPP}; + return makeArrayRef(Variants); + } if (getForcedEncodingSize() == 32) { static const unsigned
Variants[] = {AMDGPUAsmVariants::DEFAULT}; return makeArrayRef(Variants); @@ -3143,6 +3299,9 @@ ArrayRef AMDGPUAsmParser::getMatchedVariants() const { } StringRef AMDGPUAsmParser::getMatchedVariantName() const { + if (isForcedDPP() && isForcedVOP3()) + return "e64_dpp"; + if (getForcedEncodingSize() == 32) return "e32"; @@ -3231,10 +3390,13 @@ unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { // 64-bit shift instructions can use only one scalar value input case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_e64_gfx11: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_e64_gfx11: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHR_I64_e64: @@ -3305,8 +3467,7 @@ AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst, // flat_scratch_lo, flat_scratch_hi // are theoretically valid but they are disabled anyway. // Note that this code mimics SIInstrInfo::verifyInstruction - if (!SGPRsUsed.count(LastSGPR)) { - SGPRsUsed.insert(LastSGPR); + if (SGPRsUsed.insert(LastSGPR).second) { ++ConstantBusUseCount; } } else { // Expression or a literal @@ -3369,7 +3530,6 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst, assert(DstIdx != -1); const MCOperand &Dst = Inst.getOperand(DstIdx); assert(Dst.isReg()); - const unsigned DstReg = mc2PseudoReg(Dst.getReg()); const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; @@ -3377,8 +3537,8 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst, if (SrcIdx == -1) break; const MCOperand &Src = Inst.getOperand(SrcIdx); if (Src.isReg()) { - const unsigned SrcReg = mc2PseudoReg(Src.getReg()); - if (isRegIntersect(DstReg, SrcReg, TRI)) { + if (TRI->regsOverlap(Dst.getReg(), Src.getReg())) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); Error(getRegLoc(SrcReg, Operands), "destination must be different than all sources"); return false; @@ -3403,13 +3563,13 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { return true; } -bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { +Optional AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) - return true; + return None; int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); @@ -3418,7 +3578,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { assert(VDataIdx != -1); if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray - return true; + return None; unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; @@ -3426,15 +3586,22 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { if (DMask == 0) DMask = 1; + bool isPackedD16 = false; unsigned DataSize = (Desc.TSFlags & SIInstrFlags::Gather4) ? 
4 : countPopulation(DMask); if (hasPackedD16()) { int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); - if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) + isPackedD16 = D16Idx >= 0; + if (isPackedD16 && Inst.getOperand(D16Idx).getImm()) DataSize = (DataSize + 1) / 2; } - return (VDataSize / 4) == DataSize + TFESize; + if ((VDataSize / 4) == DataSize + TFESize) + return None; + + return StringRef(isPackedD16 + ? "image data size does not match dmask, d16 and tfe" + : "image data size does not match dmask and tfe"); } bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { @@ -3607,7 +3774,7 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst, auto Reg = mc2PseudoReg(Src0.getReg()); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (isSGPR(Reg, TRI)) { + if (!isGFX90A() && isSGPR(Reg, TRI)) { Error(getRegLoc(Reg, Operands), "source operand must be either a VGPR or an inline constant"); return false; @@ -3641,7 +3808,7 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128) return true; - if (isRegIntersect(Src2Reg, DstReg, TRI)) { + if (TRI->regsOverlap(Src2Reg, DstReg)) { Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands), "source 2 operand must not partially overlap with dst"); return false; @@ -3861,7 +4028,7 @@ Optional AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { const auto &Src = Inst.getOperand(SrcIdx); if (Src.isReg() && Src.getReg() == LDS_DIRECT) { - if (isGFX90A()) + if (isGFX90A() || isGFX11Plus()) return StringRef("lds_direct is not supported on this GPU"); if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA)) @@ -4009,6 +4176,20 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { if (OpSel & ~3) return false; } + + if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + if (Inst.getOperand(OpSelIdx).getImm() != 0) + return false; + } + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + if (OpSelHiIdx != -1) { + if (Inst.getOperand(OpSelHiIdx).getImm() != -1) + return false; + } + } + return true; } @@ -4179,6 +4360,47 @@ bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { return true; } +SMLoc AMDGPUAsmParser::getBLGPLoc(const OperandVector &Operands) const { + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isBLGP()) + return Op.getStartLoc(); + } + return SMLoc(); +} + +bool AMDGPUAsmParser::validateBLGP(const MCInst &Inst, + const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + int BlgpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp); + if (BlgpIdx == -1) + return true; + SMLoc BLGPLoc = getBLGPLoc(Operands); + if (!BLGPLoc.isValid()) + return true; + bool IsNeg = StringRef(BLGPLoc.getPointer()).startswith("neg:"); + auto FB = getFeatureBits(); + bool UsesNeg = false; + if (FB[AMDGPU::FeatureGFX940Insts]) { + switch (Opc) { + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd: + UsesNeg = true; + } + } + + if (IsNeg == UsesNeg) + return true; + + Error(BLGPLoc, + UsesNeg ? 
"invalid modifier: blgp is not supported" + : "invalid modifier: neg is not supported"); + + return false; +} + // gfx90a has an undocumented limitation: // DS_GWS opcodes must use even aligned registers. bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, @@ -4218,13 +4440,19 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - if ((TSFlags & (SIInstrFlags::SMRD)) && - (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) { - Error(IDLoc, "invalid cache policy for SMRD instruction"); - return false; + if (TSFlags & SIInstrFlags::SMRD) { + if (CPol && (isSI() || isCI())) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + Error(S, "cache policy is not supported for SMRD instructions"); + return false; + } + if (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC)) { + Error(IDLoc, "invalid cache policy for SMEM instruction"); + return false; + } } - if (isGFX90A() && (CPol & CPol::SCC)) { + if (isGFX90A() && !isGFX940() && (CPol & CPol::SCC)) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]); @@ -4237,15 +4465,18 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, if (TSFlags & SIInstrFlags::IsAtomicRet) { if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) { - Error(IDLoc, "instruction must use glc"); + Error(IDLoc, isGFX940() ? "instruction must use sc0" + : "instruction must use glc"); return false; } } else { if (CPol & CPol::GLC) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); - S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]); - Error(S, "instruction must not use glc"); + S = SMLoc::getFromPointer( + &CStr.data()[CStr.find(isGFX940() ? "sc0" : "glc")]); + Error(S, isGFX940() ? "instruction must not use sc0" + : "instruction must not use glc"); return false; } } @@ -4253,6 +4484,47 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateFlatLdsDMA(const MCInst &Inst, + const OperandVector &Operands, + const SMLoc &IDLoc) { + if (isGFX940()) + return true; + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::VALU | SIInstrFlags::FLAT)) != + (SIInstrFlags::VALU | SIInstrFlags::FLAT)) + return true; + // This is FLAT LDS DMA. + + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyLDS, Operands); + StringRef CStr(S.getPointer()); + if (!CStr.startswith("lds")) { + // This is incorrectly selected LDS DMA version of a FLAT load opcode. + // And LDS version should have 'lds' modifier, but it follows optional + // operands so its absense is ignored by the matcher. 
+ Error(IDLoc, "invalid operands for instruction"); + return false; + } + + return true; +} + +bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) { + if (!isGFX11Plus()) + return true; + for (auto &Operand : Operands) { + if (!Operand->isReg()) + continue; + unsigned Reg = Operand->getReg(); + if (Reg == SRC_EXECZ || Reg == SRC_VCCZ) { + Error(getRegLoc(Reg, Operands), + "execz and vccz are not supported on this GPU"); + return false; + } + } + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { @@ -4302,9 +4574,8 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid dim; must be MSAA type"); return false; } - if (!validateMIMGDataSize(Inst)) { - Error(IDLoc, - "image data size does not match dmask and tfe"); + if (auto ErrMsg = validateMIMGDataSize(Inst)) { + Error(IDLoc, *ErrMsg); return false; } if (!validateMIMGAddrSize(Inst)) { @@ -4357,6 +4628,10 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, return false; } + if (!validateBLGP(Inst, Operands)) { + return false; + } + if (!validateDivScale(Inst)) { Error(IDLoc, "ABS not allowed in VOP3B instructions"); return false; @@ -4364,6 +4639,13 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateCoherencyBits(Inst, Operands, IDLoc)) { return false; } + if (!validateExeczVcczOperands(Operands)) { + return false; + } + + if (!validateFlatLdsDMA(Inst, Operands, IDLoc)) { + return false; + } return true; } @@ -4606,6 +4888,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { SMRange VGPRRange; uint64_t NextFreeVGPR = 0; uint64_t AccumOffset = 0; + uint64_t SharedVGPRCount = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; @@ -4630,9 +4913,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (ID == ".end_amdhsa_kernel") break; - if (Seen.find(ID) != Seen.end()) + if (!Seen.insert(ID).second) return TokError(".amdhsa_ directives cannot be repeated"); - Seen.insert(ID); SMLoc ValStart = getLoc(); int64_t IVal; @@ -4833,6 +5115,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx10+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val, ValRange); + } else if (ID == ".amdhsa_shared_vgpr_count") { + if (IVersion.Major < 10) + return Error(IDRange.Start, "directive requires gfx10+", IDRange); + SharedVGPRCount = Val; + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, + COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val, + ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -4922,6 +5211,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { (AccumOffset / 4 - 1)); } + if (IVersion.Major == 10) { + // SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS + if (SharedVGPRCount && EnableWavefrontSize32) { + return TokError("shared_vgpr_count directive not valid on " + "wavefront size 32"); + } + if (SharedVGPRCount * 2 + VGPRBlocks > 63) { + return TokError("shared_vgpr_count*2 + " + "compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot " + "exceed 63\n"); + } + } + getTargetStreamer().EmitAmdhsaKernelDescriptor( getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, ReserveFlatScr); @@ -5253,8 +5555,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { return Error(AlignLoc, "alignment is too large"); } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.amdgpu_lds' directive")) + if (parseEOL()) 
return true; Symbol->redefineIfPossible(); @@ -5313,26 +5614,21 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) { - for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return isGFX9Plus(); - } + if (MRI.regsOverlap(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, RegNo)) + return isGFX9Plus(); - // GFX10 has 2 more SGPRs 104 and 105. - for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return hasSGPR104_SGPR105(); - } + // GFX10+ has 2 more SGPRs 104 and 105. + if (MRI.regsOverlap(AMDGPU::SGPR104_SGPR105, RegNo)) + return hasSGPR104_SGPR105(); switch (RegNo) { case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return isGFX9Plus(); + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return isGFX9Plus() && !isGFX11Plus(); case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -5355,7 +5651,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, if (isSI() || isGFX10Plus()) { // No flat_scr on SI. - // On GFX10 flat scratch is not a valid register operand and can only be + // On GFX10Plus flat scratch is not a valid register operand and can only be // accessed with s_setreg/s_getreg. switch (RegNo) { case AMDGPU::FLAT_SCR: @@ -5369,11 +5665,8 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that // SI/CI have. - for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return hasSGPR102_SGPR103(); - } + if (MRI.regsOverlap(AMDGPU::SGPR102_SGPR103, RegNo)) + return hasSGPR102_SGPR103(); return true; } @@ -5381,8 +5674,13 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, OperandMode Mode) { + OperandMatchResultTy ResTy = parseVOPD(Operands); + if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || + isToken(AsmToken::EndOfStatement)) + return ResTy; + // Try to parse with a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + ResTy = MatchOperandParserImpl(Operands, Mnemonic); // If we successfully parsed the operand or if there as an error parsing, // we are done. @@ -5435,7 +5733,11 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { setForcedDPP(false); setForcedSDWA(false); - if (Name.endswith("_e64")) { + if (Name.endswith("_e64_dpp")) { + setForcedDPP(true); + setForcedEncodingSize(64); + return Name.substr(0, Name.size() - 8); + } else if (Name.endswith("_e64")) { setForcedEncodingSize(64); return Name.substr(0, Name.size() - 4); } else if (Name.endswith("_e32")) { @@ -5451,11 +5753,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { return Name; } +static void applyMnemonicAliases(StringRef &Mnemonic, + const FeatureBitset &Features, + unsigned VariantID); + bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // Add the instruction mnemonic Name = parseMnemonicSuffix(Name); + + // If the target architecture uses MnemonicAlias, call it here to parse + // operands correctly. 
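The parseMnemonicSuffix() hunk above adds a "_e64_dpp" case for the new VOP3-DPP encoding. A minimal standalone sketch of the suffix stripping, not the LLVM API (the real routine also handles "_e32" and "_sdwa"); the "_e64_dpp" test must run before the "_dpp" test, since any name ending in "_e64_dpp" also ends in "_dpp" and only the longer suffix forces both the 64-bit encoding and DPP:

    #include <string>

    struct Forced { bool DPP = false; unsigned EncodingSize = 0; };

    static bool endsWith(const std::string &S, const std::string &Suf) {
      return S.size() >= Suf.size() &&
             S.compare(S.size() - Suf.size(), Suf.size(), Suf) == 0;
    }

    Forced stripMnemonicSuffix(std::string &Name) {
      Forced F;
      if (endsWith(Name, "_e64_dpp")) {        // checked first: superset suffix
        F.DPP = true;
        F.EncodingSize = 64;
        Name.resize(Name.size() - 8);
      } else if (endsWith(Name, "_e64")) {
        F.EncodingSize = 64;
        Name.resize(Name.size() - 4);
      } else if (endsWith(Name, "_dpp")) {
        F.DPP = true;
        Name.resize(Name.size() - 4);
      }
      return F;
    }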
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0); + Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); bool IsMIMG = Name.startswith("image_"); @@ -5603,7 +5914,24 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) { unsigned CPolOff = 0; SMLoc S = getLoc(); - if (trySkipId("glc")) + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); + if (isGFX940() && !Mnemo.startswith("s_")) { + if (trySkipId("sc0")) + CPolOn = AMDGPU::CPol::SC0; + else if (trySkipId("nosc0")) + CPolOff = AMDGPU::CPol::SC0; + else if (trySkipId("nt")) + CPolOn = AMDGPU::CPol::NT; + else if (trySkipId("nont")) + CPolOff = AMDGPU::CPol::NT; + else if (trySkipId("sc1")) + CPolOn = AMDGPU::CPol::SC1; + else if (trySkipId("nosc1")) + CPolOff = AMDGPU::CPol::SC1; + else + return MatchOperand_NoMatch; + } + else if (trySkipId("glc")) CPolOn = AMDGPU::CPol::GLC; else if (trySkipId("noglc")) CPolOff = AMDGPU::CPol::GLC; @@ -5809,7 +6137,7 @@ AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt; if (isGFX10Plus()) { - auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt); + auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt, getSTI()); if (Ufmt == UFMT_UNDEF) { Error(FormatLoc, "unsupported format"); return MatchOperand_ParseFail; @@ -5828,7 +6156,7 @@ AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; - auto Id = getUnifiedFormat(FormatStr); + auto Id = getUnifiedFormat(FormatStr, getSTI()); if (Id == UFMT_UNDEF) return MatchOperand_NoMatch; @@ -5969,6 +6297,7 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, bool IsGdsHardcoded) { OptionalImmIndexMap OptionalIdx; + AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -5986,13 +6315,10 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, // Handle optional arguments OptionalIdx[Op.getImmTy()] = i; - } - AMDGPUOperand::ImmTy OffsetType = - (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 || - Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 || - Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? 
AMDGPUOperand::ImmTySwizzle : - AMDGPUOperand::ImmTyOffset; + if (Op.getImmTy() == AMDGPUOperand::ImmTySwizzle) + OffsetType = AMDGPUOperand::ImmTySwizzle; + } addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); @@ -6034,7 +6360,7 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { continue; } - if (Op.isToken() && Op.getToken() == "done") + if (Op.isToken() && (Op.getToken() == "done" || Op.getToken() == "row_en")) continue; // Handle optional arguments @@ -6157,11 +6483,179 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } +bool AMDGPUAsmParser::parseDelay(int64_t &Delay) { + SMLoc FieldLoc = getLoc(); + StringRef FieldName = getTokenStr(); + if (!skipToken(AsmToken::Identifier, "expected a field name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; + + SMLoc ValueLoc = getLoc(); + StringRef ValueName = getTokenStr(); + if (!skipToken(AsmToken::Identifier, "expected a value name") || + !skipToken(AsmToken::RParen, "expected a right parenthesis")) + return false; + + unsigned Shift; + if (FieldName == "instid0") { + Shift = 0; + } else if (FieldName == "instskip") { + Shift = 4; + } else if (FieldName == "instid1") { + Shift = 7; + } else { + Error(FieldLoc, "invalid field name " + FieldName); + return false; + } + + int Value; + if (Shift == 4) { + // Parse values for instskip. + Value = StringSwitch<int>(ValueName) + .Case("SAME", 0) + .Case("NEXT", 1) + .Case("SKIP_1", 2) + .Case("SKIP_2", 3) + .Case("SKIP_3", 4) + .Case("SKIP_4", 5) + .Default(-1); + } else { + // Parse values for instid0 and instid1. + Value = StringSwitch<int>(ValueName) + .Case("NO_DEP", 0) + .Case("VALU_DEP_1", 1) + .Case("VALU_DEP_2", 2) + .Case("VALU_DEP_3", 3) + .Case("VALU_DEP_4", 4) + .Case("TRANS32_DEP_1", 5) + .Case("TRANS32_DEP_2", 6) + .Case("TRANS32_DEP_3", 7) + .Case("FMA_ACCUM_CYCLE_1", 8) + .Case("SALU_CYCLE_1", 9) + .Case("SALU_CYCLE_2", 10) + .Case("SALU_CYCLE_3", 11) + .Default(-1); + } + if (Value < 0) { + Error(ValueLoc, "invalid value name " + ValueName); + return false; + } + + Delay |= Value << Shift; + return true; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSDelayAluOps(OperandVector &Operands) { + int64_t Delay = 0; + SMLoc S = getLoc(); + + if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + do { + if (!parseDelay(Delay)) + return MatchOperand_ParseFail; + } while (trySkipToken(AsmToken::Pipe)); + } else { + if (!parseExpr(Delay)) + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Delay, S)); + return MatchOperand_Success; +} + bool AMDGPUOperand::isSWaitCnt() const { return isImm(); } +bool AMDGPUOperand::isSDelayAlu() const { return isImm(); } + +//===----------------------------------------------------------------------===// +// DepCtr +//===----------------------------------------------------------------------===// + +void AMDGPUAsmParser::depCtrError(SMLoc Loc, int ErrorId, + StringRef DepCtrName) { + switch (ErrorId) { + case OPR_ID_UNKNOWN: + Error(Loc, Twine("invalid counter name ", DepCtrName)); + return; + case OPR_ID_UNSUPPORTED: + Error(Loc, Twine(DepCtrName, " is not supported on this GPU")); + return; + case OPR_ID_DUPLICATE: + Error(Loc, Twine("duplicate counter name ", DepCtrName)); + return; + case OPR_VAL_INVALID: + Error(Loc, Twine("invalid value for ", DepCtrName)); + return; + default: + assert(false); + } +} + +bool AMDGPUAsmParser::parseDepCtr(int64_t &DepCtr, unsigned &UsedOprMask) { + +
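+  // Illustration (not part of the patch) of the s_delay_alu encoding that
+  // parseDelay() above implements: instid0 occupies bits [3:0], instskip
+  // bits [6:4], and instid1 bits [10:7], with the field values taken from
+  // the StringSwitch tables. For example
+  //   s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+  // packs as (1 << 0) | (1 << 4) | (9 << 7) == 0x491.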
using namespace llvm::AMDGPU::DepCtr; + + SMLoc DepCtrLoc = getLoc(); + StringRef DepCtrName = getTokenStr(); + + if (!skipToken(AsmToken::Identifier, "expected a counter name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; + + int64_t ExprVal; + if (!parseExpr(ExprVal)) + return false; + + unsigned PrevOprMask = UsedOprMask; + int CntVal = encodeDepCtr(DepCtrName, ExprVal, UsedOprMask, getSTI()); + + if (CntVal < 0) { + depCtrError(DepCtrLoc, CntVal, DepCtrName); + return false; + } + + if (!skipToken(AsmToken::RParen, "expected a closing parenthesis")) + return false; + + if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) { + if (isToken(AsmToken::EndOfStatement)) { + Error(getLoc(), "expected a counter name"); + return false; + } + } + + unsigned CntValMask = PrevOprMask ^ UsedOprMask; + DepCtr = (DepCtr & ~CntValMask) | CntVal; + return true; +} + +OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) { + using namespace llvm::AMDGPU::DepCtr; + + int64_t DepCtr = getDefaultDepCtrEncoding(getSTI()); + SMLoc Loc = getLoc(); + + if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + unsigned UsedOprMask = 0; + while (!isToken(AsmToken::EndOfStatement)) { + if (!parseDepCtr(DepCtr, UsedOprMask)) + return MatchOperand_ParseFail; + } + } else { + if (!parseExpr(DepCtr)) + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, DepCtr, Loc)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); } + //===----------------------------------------------------------------------===// // hwreg //===----------------------------------------------------------------------===// @@ -6175,7 +6669,7 @@ AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg, // The register may be specified by name or using a numeric code HwReg.Loc = getLoc(); if (isToken(AsmToken::Identifier) && - (HwReg.Id = getHwregId(getTokenStr())) >= 0) { + (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { HwReg.IsSymbolic = true; lex(); // skip register name } else if (!parseExpr(HwReg.Id, "a register name")) { @@ -6208,15 +6702,18 @@ AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, using namespace llvm::AMDGPU::Hwreg; - if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) { - Error(HwReg.Loc, - "specified hardware register is not supported on this GPU"); - return false; - } - if (!isValidHwreg(HwReg.Id)) { - Error(HwReg.Loc, - "invalid code of hardware register: only 6-bit values are legal"); - return false; + if (HwReg.IsSymbolic) { + if (HwReg.Id == OPR_ID_UNSUPPORTED) { + Error(HwReg.Loc, + "specified hardware register is not supported on this GPU"); + return false; + } + } else { + if (!isValidHwreg(HwReg.Id)) { + Error(HwReg.Loc, + "invalid code of hardware register: only 6-bit values are legal"); + return false; + } } if (!isValidHwregOffset(Offset.Id)) { Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal"); @@ -6238,7 +6735,7 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { SMLoc Loc = getLoc(); if (trySkipId("hwreg", AsmToken::LParen)) { - OperandInfoTy HwReg(ID_UNKNOWN_); + OperandInfoTy HwReg(OPR_ID_UNKNOWN); OperandInfoTy Offset(OFFSET_DEFAULT_); OperandInfoTy Width(WIDTH_DEFAULT_); if (parseHwregBody(HwReg, Offset, Width) && @@ -6275,7 +6772,8 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, using namespace llvm::AMDGPU::SendMsg; Msg.Loc = getLoc(); - if (isToken(AsmToken::Identifier) && 
(Msg.Id = getMsgId(getTokenStr())) >= 0) { + if (isToken(AsmToken::Identifier) && + (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { Msg.IsSymbolic = true; lex(); // skip message name } else if (!parseExpr(Msg.Id, "a message name")) { @@ -6310,15 +6808,22 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, using namespace llvm::AMDGPU::SendMsg; // Validation strictness depends on whether message is specified - // in a symbolc or in a numeric form. In the latter case + // in a symbolic or in a numeric form. In the latter case // only encoding possibility is checked. bool Strict = Msg.IsSymbolic; - if (!isValidMsgId(Msg.Id, getSTI(), Strict)) { - Error(Msg.Loc, "invalid message id"); - return false; + if (Strict) { + if (Msg.Id == OPR_ID_UNSUPPORTED) { + Error(Msg.Loc, "specified message id is not supported on this GPU"); + return false; + } + } else { + if (!isValidMsgId(Msg.Id, getSTI())) { + Error(Msg.Loc, "invalid message id"); + return false; + } } - if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) { + if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) { if (Op.IsDefined) { Error(Op.Loc, "message does not support operations"); } else { @@ -6330,7 +6835,8 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, Error(Op.Loc, "invalid operation id"); return false; } - if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) { + if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) && + Stream.IsDefined) { Error(Stream.Loc, "message operation does not support streams"); return false; } @@ -6349,7 +6855,7 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { SMLoc Loc = getLoc(); if (trySkipId("sendmsg", AsmToken::LParen)) { - OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Msg(OPR_ID_UNKNOWN); OperandInfoTy Op(OP_NONE_); OperandInfoTy Stream(STREAM_ID_NONE_); if (parseSendMsgBody(Msg, Op, Stream) && @@ -6610,9 +7116,10 @@ AMDGPUAsmParser::getToken() const { return Parser.getTok(); } -AsmToken -AMDGPUAsmParser::peekToken() { - return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok(); +AsmToken AMDGPUAsmParser::peekToken(bool ShouldSkipSpace) { + return isToken(AsmToken::EndOfStatement) + ? getToken() + : getLexer().peekTok(ShouldSkipSpace); } void @@ -7078,8 +7585,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsLds) { - bool IsLdsOpcode = IsLds; - bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; unsigned FirstOperandIdx = 1; bool IsAtomicReturn = false; @@ -7123,8 +7628,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } - HasLdsModifier |= Op.isLDS(); - // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. if (Op.isToken()) { @@ -7136,25 +7639,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } - // This is a workaround for an llvm quirk which may result in an - // incorrect instruction selection. Lds and non-lds versions of - // MUBUF instructions are identical except that lds versions - // have mandatory 'lds' modifier. However this modifier follows - // optional modifiers and llvm asm matcher regards this 'lds' - // modifier as an optional one. As a result, an lds version - // of opcode may be selected even if it has no 'lds' modifier. - if (IsLdsOpcode && !HasLdsModifier) { - int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode()); - if (NoLdsOpcode != -1) { // Got lds version - correct it. 
- Inst.setOpcode(NoLdsOpcode); - IsLdsOpcode = false; - } - } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); - if (!IsLdsOpcode) { // tfe is not legal with lds opcodes + if (!IsLds) { // tfe is not legal with lds opcodes addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); @@ -7327,7 +7815,8 @@ bool AMDGPUOperand::isSMRDOffset8() const { } bool AMDGPUOperand::isSMEMOffset() const { - return isImm(); // Offset range is checked later by validator. + return isImmTy(ImmTyNone) || + isImmTy(ImmTyOffset); // Offset range is checked later by validator. } bool AMDGPUOperand::isSMRDLiteralOffset() const { @@ -7415,10 +7904,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"dim", AMDGPUOperand::ImmTyDim, false, nullptr}, - {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, - {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, - {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, - {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, @@ -7429,9 +7914,17 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}, + {"dpp8", AMDGPUOperand::ImmTyDPP8, false, nullptr}, + {"dpp_ctrl", AMDGPUOperand::ImmTyDppCtrl, false, nullptr}, + {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, + {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, + {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr}, {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr}, - {"abid", AMDGPUOperand::ImmTyABID, false, nullptr} + {"abid", AMDGPUOperand::ImmTyABID, false, nullptr}, + {"wait_vdst", AMDGPUOperand::ImmTyWaitVDST, false, nullptr}, + {"wait_exp", AMDGPUOperand::ImmTyWaitEXP, false, nullptr} }; void AMDGPUAsmParser::onBeginOfFile() { @@ -7497,8 +7990,17 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) res = parseDim(Operands); } else if (Op.Type == AMDGPUOperand::ImmTyCPol) { res = parseCPol(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyDPP8) { + res = parseDPP8(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyDppCtrl) { + res = parseDPPCtrl(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + if (Op.Type == AMDGPUOperand::ImmTyBLGP && res == MatchOperand_NoMatch) { + res = parseOperandArrayWithPrefix("neg", Operands, + AMDGPUOperand::ImmTyBLGP, + nullptr); + } } if (res != MatchOperand_NoMatch) { return res; @@ -7596,6 +8098,66 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) } } +void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) +{ + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < 
Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("unhandled operand type"); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyWaitEXP); + + if (OpSelIdx == -1) + return; + + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + const int ModOps[] = { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }; + + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + break; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + if (ModOps[J] == AMDGPU::OpName::src0_modifiers && + (OpSel & (1 << 3)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { unsigned Opc = Inst.getOpcode(); @@ -7652,9 +8214,12 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_MAC_F16_e64_vi || Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F32_e64_vi || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || - Opc == AMDGPU::V_FMAC_F16_e64_gfx10) { + Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || + Opc == AMDGPU::V_FMAC_F16_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F16_e64_gfx11) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -7731,6 +8296,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpIdx == -1) break; + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + if (ModIdx == -1) + continue; + uint32_t ModVal = 0; if ((OpSel & (1 << J)) != 0) @@ -7745,8 +8315,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if ((NegHi & (1 << J)) != 0) ModVal |= SISrcMods::NEG_HI; - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal); } } @@ -7757,6 +8325,118 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands, OptIdx); } +//===----------------------------------------------------------------------===// +// VOPD +//===----------------------------------------------------------------------===// + +OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { + if (!hasVOPD(getSTI())) + return MatchOperand_NoMatch; + + if (isToken(AsmToken::Colon) && peekToken(false).is(AsmToken::Colon)) { + SMLoc S = getLoc(); + lex(); + lex(); + 
Operands.push_back(AMDGPUOperand::CreateToken(this, "::", S)); + const MCExpr *Expr; + if (isToken(AsmToken::Identifier) && !Parser.parseExpression(Expr)) { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + return MatchOperand_Success; + } + Error(S, "invalid VOPD :: usage"); + return MatchOperand_ParseFail; + } + return MatchOperand_NoMatch; +} + +// Create VOPD MCInst operands using parsed assembler operands. +// Parsed VOPD operands are ordered as follows: +// OpXMnemo dstX src0X [vsrc1X|imm vsrc1X|vsrc1X imm] '::' +// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm] +// If both OpX and OpY have an imm, the first imm has a different name: +// OpXMnemo dstX src0X [vsrc1X|immDeferred vsrc1X|vsrc1X immDeferred] '::' +// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm] +// MCInst operands have the following order: +// dstX, dstY, src0X [, other OpX operands], src0Y [, other OpY operands] +void AMDGPUAsmParser::cvtVOPD(MCInst &Inst, const OperandVector &Operands) { + auto addOp = [&](uint16_t i) { // NOLINT:function pointer + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + return; + } + if (Op.isImm()) { + Op.addImmOperands(Inst, 1); + return; + } + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + return; + } + llvm_unreachable("Unhandled operand type in cvtVOPD"); + }; + + // Indices into MCInst.Operands + const auto FmamkOpXImmMCIndex = 3; // dstX, dstY, src0X, imm, ... + const auto FmaakOpXImmMCIndex = 4; // dstX, dstY, src0X, src1X, imm, ... + const auto MinOpYImmMCIndex = 4; // dstX, dstY, src0X, src0Y, imm, ... + + unsigned Opc = Inst.getOpcode(); + bool HasVsrc1X = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1X) != -1; + bool HasImmX = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 || + (HasVsrc1X && (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) == + FmamkOpXImmMCIndex || + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) == + FmaakOpXImmMCIndex)); + + bool HasVsrc1Y = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1Y) != -1; + bool HasImmY = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 || + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) >= + MinOpYImmMCIndex + HasVsrc1X; + + // Indices of parsed operands relative to dst + const auto DstIdx = 0; + const auto Src0Idx = 1; + const auto Vsrc1OrImmIdx = 2; + + const auto OpXOperandsSize = 2 + HasImmX + HasVsrc1X; + const auto BridgeTokensSize = 2; // Special VOPD tokens ('::' and OpYMnemo) + + // Offsets into parsed operands + const auto OpXFirstOperandOffset = 1; + const auto OpYFirstOperandOffset = + OpXFirstOperandOffset + OpXOperandsSize + BridgeTokensSize; + + // Order of addOp calls determines MC operand order + addOp(OpXFirstOperandOffset + DstIdx); // vdstX + addOp(OpYFirstOperandOffset + DstIdx); // vdstY + + addOp(OpXFirstOperandOffset + Src0Idx); // src0X + if (HasImmX) { + // immX then vsrc1X for fmamk, vsrc1X then immX for fmaak + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx); + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx + 1); + } else { + if (HasVsrc1X) // all except v_mov + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1X + } + + addOp(OpYFirstOperandOffset + Src0Idx); // src0Y + if (HasImmY) { + // immY then vsrc1Y for fmamk, vsrc1Y then immY for fmaak + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx); + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx + 1); + } else { 
+ if (HasVsrc1Y) // all except v_mov + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1Y + } +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// @@ -8067,6 +8747,88 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); } +// Add dummy $old operand +void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst, + const OperandVector &Operands, + bool IsDPP8) { + Inst.addOperand(MCOperand::createReg(0)); + cvtVOP3DPP(Inst, Operands, IsDPP8); +} + +void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + bool HasModifiers = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1; + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + int Fi = 0; + for (unsigned E = Operands.size(); I != E; ++I) { + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), + MCOI::TIED_TO); + if (TiedTo != -1) { + assert((unsigned)TiedTo < Inst.getNumOperands()); + // handle tied old or src2 for MAC instructions + Inst.addOperand(Inst.getOperand(TiedTo)); + } + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + // Add the register arguments + if (IsDPP8 && Op.isFI()) { + Fi = Op.getImm(); + } else if (HasModifiers && + isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + } else if (Op.isImm() && + Desc.OpInfo[Inst.getNumOperands()].RegClass != -1) { + assert(!HasModifiers && "Case should be unreachable with modifiers"); + assert(!Op.IsImmKindLiteral() && "Cannot use literal with DPP"); + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("unhandled operand type"); + } + } + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + } + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + } + if (Desc.TSFlags & SIInstrFlags::VOP3P) + cvtVOP3P(Inst, Operands, OptionalIdx); + else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); + } + + if (IsDPP8) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDPP8); + using namespace llvm::AMDGPU::DPP; + Inst.addOperand(MCOperand::createImm(Fi? 
DPP8_FI_1 : DPP8_FI_0)); + } else { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppCtrl, 0xe4); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi); + } + } +} + +// Add dummy $old operand +void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst, + const OperandVector &Operands, + bool IsDPP8) { + Inst.addOperand(MCOperand::createReg(0)); + cvtDPP(Inst, Operands, IsDPP8); +} + void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; @@ -8352,7 +9114,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { #define GET_MNEMONIC_CHECKER #include "AMDGPUGenAsmMatcher.inc" -// This fuction should be defined after auto-generated include so that we have +// This function should be defined after auto-generated include so that we have // MatchClassKind enum defined unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) { @@ -8431,3 +9193,27 @@ OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { } bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } + +//===----------------------------------------------------------------------===// +// LDSDIR +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitVDST() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitVDST); +} + +bool AMDGPUOperand::isWaitVDST() const { + return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm()); +} + +//===----------------------------------------------------------------------===// +// VINTERP +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitEXP() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitEXP); +} + +bool AMDGPUOperand::isWaitEXP() const { + return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm()); +} diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a535c8cc0918..a087323e5de7 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -35,11 +35,6 @@ class MUBUFAddr64Table { string OpName = Name; } -class MUBUFLdsTable { - bit IsLds = is_lds; - string OpName = Name; -} - class MTBUFAddr64Table { bit IsAddr64 = is_addr64; string OpName = Name; @@ -100,8 +95,8 @@ class MTBUF_Pseudo sccb_value = 0; } -class MTBUF_Real : - InstSI { +class MTBUF_Real : + InstSI { let isPseudo = 0; let isCodeGenOnly = 0; @@ -136,7 +131,7 @@ class MTBUF_Real : bits<3> nfmt = format{6-4}; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. + // Bit supersedes tfe. 
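// (Illustrative note, assuming vdata carries at least 10 encoded bits here:
// vdata{9} marks an AGPR data operand, and that flag is emitted in the
// encoding slot that previously held tfe, which is why acc supersedes tfe.)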
bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } @@ -320,7 +315,7 @@ class MUBUF_Pseudo idxen = 0; bits<1> addr64 = 0; bits<1> lds = 0; - bits<1> has_vdata = 1; + bits<1> has_vdata = !not(lds); bits<1> has_vaddr = 1; bits<1> has_glc = 1; bits<1> has_dlc = 1; @@ -337,8 +332,8 @@ class MUBUF_Pseudo IsBufferInv = 0; } -class MUBUF_Real : - InstSI { +class MUBUF_Real : + InstSI { let isPseudo = 0; let isCodeGenOnly = 0; @@ -360,6 +355,8 @@ class MUBUF_Real : let mayStore = ps.mayStore; let IsAtomicRet = ps.IsAtomicRet; let IsAtomicNoRet = ps.IsAtomicNoRet; + let VALU = ps.VALU; + let LGKM_CNT = ps.LGKM_CNT; bits<12> offset; bits<5> cpol; @@ -370,8 +367,8 @@ class MUBUF_Real : bits<8> soffset; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. - bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); + // Bit supersedes tfe. + bits<1> acc = !if(ps.has_vdata, vdata{9}, !if(ps.lds, ?, 0)); } @@ -486,16 +483,17 @@ class MUBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, RegisterClass vdata_rc = getVregSrcForVT.ret, RegisterOperand vdata_op = getLdStRegisterOperand.ret> : MUBUF_Pseudo.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), - " $vdata, " # getMUBUFAsmOps.ret # "$cpol" # + !if(!or(isLds, isLdsOpc), " ", " $vdata, ") # getMUBUFAsmOps.ret # "$cpol" # !if(isLds, " lds", "$tfe") # "$swz", pattern>, MUBUF_SetupAddr { @@ -504,13 +502,16 @@ class MUBUF_Load_Pseudo .ret; + let VALU = isLds; } class MUBUF_Offset_Load_Pat : Pat < @@ -563,6 +564,20 @@ multiclass MUBUF_Pseudo_Loads_Lds { defm _LDS : MUBUF_Pseudo_Loads; } +multiclass MUBUF_Pseudo_Loads_LDSOpc { + + defvar legal_load_vt = !if(!eq(!cast(load_vt), !cast(v3f16)), v4f16, load_vt); + + def _OFFSET : MUBUF_Load_Pseudo ; + def _OFFEN : MUBUF_Load_Pseudo ; + def _IDXEN : MUBUF_Load_Pseudo ; + def _BOTHEN : MUBUF_Load_Pseudo ; +} + class MUBUF_Store_Pseudo (outs), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz), " $srsrc, $soffset$offset lds$cpol$swz"> { - let mayLoad = 0; + let LGKM_CNT = 1; + let mayLoad = 1; let mayStore = 1; let maybeAtomic = 1; @@ -623,6 +639,7 @@ class MUBUF_Pseudo_Store_Lds let has_vaddr = 0; let has_tfe = 0; let lds = 1; + let VALU = 1; let Uses = [EXEC, M0]; let AsmMatchConverter = "cvtMubufLds"; @@ -785,7 +802,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN : + SDPatternOperator atomic = null_frag> : MUBUF_Pseudo_Atomics_NO_RTN, MUBUF_Pseudo_Atomics_RTN; @@ -898,6 +915,29 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < "buffer_load_dwordx4", v4i32 >; +defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_b32", i32 +>; +defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_format_x", f32 +>; +defm BUFFER_LOAD_LDS_I8 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_i8", i32 +>; +defm BUFFER_LOAD_LDS_I16 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_i16", i32 +>; +defm BUFFER_LOAD_LDS_U8 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_u8", i32 +>; +defm BUFFER_LOAD_LDS_U16 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_u16", i32 +>; + +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, 
zextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; @@ -909,21 +949,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; -// This is not described in AMD documentation, -// but 'lds' versions of these opcodes are available -// in at least GFX8+ chips. See Bug 37653. -let SubtargetPredicate = isGFX8GFX9 in { -defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", v2i32, 0, 1 ->; -defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", v3i32, 0, 1 ->; -defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", v4i32, 0, 1 ->; -} - defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < "buffer_store_byte", i32, truncstorei8_global >; @@ -943,82 +968,82 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < "buffer_store_dwordx4", v4i32, store_global >; defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32 + "buffer_atomic_swap", VGPR_32, i32 >; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag + "buffer_atomic_cmpswap", VReg_64, v2i32 >; defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < - "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32 + "buffer_atomic_add", VGPR_32, i32 >; defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32 + "buffer_atomic_sub", VGPR_32, i32 >; defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32 + "buffer_atomic_smin", VGPR_32, i32 >; defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32 + "buffer_atomic_umin", VGPR_32, i32 >; defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32 + "buffer_atomic_smax", VGPR_32, i32 >; defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32 + "buffer_atomic_umax", VGPR_32, i32 >; defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < - "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32 + "buffer_atomic_and", VGPR_32, i32 >; defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < - "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32 + "buffer_atomic_or", VGPR_32, i32 >; defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32 + "buffer_atomic_xor", VGPR_32, i32 >; defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32 + "buffer_atomic_inc", VGPR_32, i32 >; defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32 + "buffer_atomic_dec", VGPR_32, i32 >; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64 + "buffer_atomic_swap_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag + "buffer_atomic_cmpswap_x2", VReg_128, v2i64 >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64 + "buffer_atomic_add_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub_x2", VReg_64, i64, 
atomic_load_sub_global_64 + "buffer_atomic_sub_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64 + "buffer_atomic_smin_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64 + "buffer_atomic_umin_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64 + "buffer_atomic_smax_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64 + "buffer_atomic_umax_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64 + "buffer_atomic_and_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64 + "buffer_atomic_or_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64 + "buffer_atomic_xor_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64 + "buffer_atomic_inc_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 + "buffer_atomic_dec_x2", VReg_64, i64 >; let SubtargetPredicate = HasGFX10_BEncoding in @@ -1040,7 +1065,7 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; } -let SubtargetPredicate = isGFX6GFX7GFX10 in { +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag @@ -1051,6 +1076,11 @@ defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < "buffer_atomic_fmax", VGPR_32, f32, null_frag >; + +} + +let SubtargetPredicate = isGFX6GFX7GFX10 in { + defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag >; @@ -1109,23 +1139,25 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; -let SubtargetPredicate = HasAtomicFaddInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< "buffer_atomic_add_f32", VGPR_32, f32 >; + +let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; -let OtherPredicates = [isGFX90APlus] in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN < +let OtherPredicates = [HasAtomicFaddRtnInsts] in +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 >; + +let OtherPredicates = [isGFX90APlus] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 >; -} -} // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1175,15 +1207,28 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", let SubtargetPredicate = isGFX90APlus in { def 
BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> { + let has_glc = 1; + let has_sccb = 1; + let InOperandList = (ins CPol_0:$cpol); + let AsmOperands = "$cpol"; } def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> { + let SubtargetPredicate = isGFX90AOnly; } - defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; - defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; - defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = isGFX90APlus +def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> { + let SubtargetPredicate = isGFX940Plus; + let has_glc = 1; + let has_sccb = 1; + let InOperandList = (ins CPol_0:$cpol); + let AsmOperands = "$cpol"; +} + let SubtargetPredicate = isGFX10Plus in { def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; @@ -1364,75 +1409,169 @@ defm : MUBUF_StoreIntrinsicPat; // buffer_atomic patterns //===----------------------------------------------------------------------===// -multiclass BufferAtomicPatterns { +multiclass BufferAtomicPat { + foreach RtnMode = ["ret", "noret"] in { + + defvar Op = !cast(OpPrefix # "_" # RtnMode + # !if(isIntr, "", "_" # vt.Size)); + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + + def : GCNPat< + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), + (!cast(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT.ret:$vdata_in, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) + >; + + def : GCNPat< + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + vt:$vdata_in)), + (!cast(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT.ret:$vdata_in, + VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) + >; + + } // end foreach RtnMode +} + +multiclass BufferAtomicIntrPat { + defm : BufferAtomicPat; +} + +multiclass BufferAtomicCmpSwapPat { + foreach RtnMode = ["ret", "noret"] in { + + defvar Op = !cast("AMDGPUatomic_cmp_swap_global_" # RtnMode + # "_" # vt.Size); + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + + defvar OffsetResDag = (!cast(Inst # "_OFFSET" # InstSuffix) + getVregSrcForVT.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset); + def : GCNPat< + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), data_vt:$vdata_in)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT.ret)), + !if(!eq(vt, i32), sub0, sub0_sub1)), + OffsetResDag) + >; + + defvar Addr64ResDag = (!cast(Inst # "_ADDR64" # InstSuffix) + getVregSrcForVT.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset); + def : GCNPat< + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + data_vt:$vdata_in)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT.ret)), + !if(!eq(vt, i32), sub0, sub0_sub1)), + Addr64ResDag) + >; + + } // end foreach RtnMode +} + +foreach Ty = [i32, i64] in { + +defvar Suffix = !if(!eq(Ty, i64), "_X2", ""); + +defm : 
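// A sketch of what one instantiation of the BufferAtomicPat multiclass
// above expands to (paraphrased, not a verbatim dump): the "ret" mode
// matches the _ret_ PatFrag and selects the _RTN opcode, while "noret"
// matches _noret_ and selects the plain one.
//
//   defm : BufferAtomicPat<"atomic_swap_global", i32, "BUFFER_ATOMIC_SWAP">;
//   // -> GCNPat<(atomic_swap_global_ret_32 ...),   BUFFER_ATOMIC_SWAP_OFFSET_RTN ...>
//   // -> GCNPat<(atomic_swap_global_noret_32 ...), BUFFER_ATOMIC_SWAP_OFFSET ...>
//   //    plus the matching _ADDR64/_ADDR64_RTN forms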
BufferAtomicPat<"atomic_swap_global", Ty, "BUFFER_ATOMIC_SWAP" # Suffix>; +defm : BufferAtomicPat<"atomic_load_add_global", Ty, "BUFFER_ATOMIC_ADD" # Suffix>; +defm : BufferAtomicPat<"atomic_load_sub_global", Ty, "BUFFER_ATOMIC_SUB" # Suffix>; +defm : BufferAtomicPat<"atomic_load_min_global", Ty, "BUFFER_ATOMIC_SMIN" # Suffix>; +defm : BufferAtomicPat<"atomic_load_umin_global", Ty, "BUFFER_ATOMIC_UMIN" # Suffix>; +defm : BufferAtomicPat<"atomic_load_max_global", Ty, "BUFFER_ATOMIC_SMAX" # Suffix>; +defm : BufferAtomicPat<"atomic_load_umax_global", Ty, "BUFFER_ATOMIC_UMAX" # Suffix>; +defm : BufferAtomicPat<"atomic_load_and_global", Ty, "BUFFER_ATOMIC_AND" # Suffix>; +defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>; +defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>; +defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>; +defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>; + +} // end foreach Ty + +defm : BufferAtomicCmpSwapPat; +defm : BufferAtomicCmpSwapPat; + +multiclass SIBufferAtomicPat RtnModes = ["ret", "noret"]> { + foreach RtnMode = RtnModes in { + + defvar Op = !cast(!if(!eq(RtnMode, "none"), + OpPrefix, OpPrefix # "_" # RtnMode)); + defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + "_RTN", ""); + defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + (set_glc $cachepolicy), (timm:$cachepolicy)); + def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), - (!cast(opcode # _OFFSET_RTN) + (!cast(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), - (!cast(opcode # _IDXEN_RTN) getVregSrcForVT.ret:$vdata_in, - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (!cast(Inst # "_IDXEN" # InstSuffix) + getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, + SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), - (!cast(opcode # _OFFEN_RTN) getVregSrcForVT.ret:$vdata_in, - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (!cast(Inst # "_OFFEN" # InstSuffix) + getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, + SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), - (!cast(opcode # _BOTHEN_RTN) + (!cast(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; -} - -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : 
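// The set_glc transform used by the returning modes above is presumably an
// SDNodeXForm that ORs the GLC bit into the immediate cache policy - a
// sketch along these lines, not the verbatim upstream definition:
//
//   def set_glc : SDNodeXForm<timm, [{
//     return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC,
//                                      SDLoc(N), MVT::i32);
//   }]>;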
BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; + } // end foreach RtnMode +} + +defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i32, "BUFFER_ATOMIC_SWAP">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", f32, "BUFFER_ATOMIC_SWAP">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i32, "BUFFER_ATOMIC_ADD">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i32, "BUFFER_ATOMIC_SUB">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i32, "BUFFER_ATOMIC_SMIN">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i32, "BUFFER_ATOMIC_UMIN">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i32, "BUFFER_ATOMIC_SMAX">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i32, "BUFFER_ATOMIC_UMAX">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i32, "BUFFER_ATOMIC_AND">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>; +defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i64, "BUFFER_ATOMIC_SMIN_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i64, "BUFFER_ATOMIC_UMIN_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i64, "BUFFER_ATOMIC_SMAX_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i64, "BUFFER_ATOMIC_UMAX_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i64, "BUFFER_ATOMIC_AND_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i64, "BUFFER_ATOMIC_OR_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; + +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; +} let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">; } class NoUseBufferAtomic : PatFrag < @@ -1482,71 +1621,89 @@ multiclass BufferAtomicPatterns_NO_RTN; } -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : BufferAtomicPatterns_NO_RTN; + +let 
SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in defm : BufferAtomicPatterns_NO_RTN; -} -let SubtargetPredicate = isGFX90APlus in { - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; +let SubtargetPredicate = HasAtomicFaddRtnInsts in + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">; - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; - defm : BufferAtomicPatterns; +let SubtargetPredicate = isGFX90APlus in { + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; } // End SubtargetPredicate = isGFX90APlus +foreach RtnMode = ["ret", "noret"] in { + +defvar Op = !cast(SIbuffer_atomic_cmpswap # "_" # RtnMode); +defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); +defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), + (timm:$cachepolicy)); + +defvar OffsetResDag = (!cast("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffsetResDag, VReg_64)), sub0), + OffsetResDag) >; +defvar IdxenResDag = (!cast("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS IdxenResDag, VReg_64)), sub0), + IdxenResDag) >; +defvar OffenResDag = (!cast("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + 
(EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffenResDag, VReg_64)), sub0), + OffenResDag) >; +defvar BothenResDag = (!cast("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS BothenResDag, VReg_64)), sub0), + BothenResDag) >; +} // end foreach RtnMode + class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, @@ -1682,8 +1839,12 @@ multiclass MUBUFStore_Atomic_Pattern ; } let SubtargetPredicate = isGFX6GFX7 in { -defm : MUBUFStore_Atomic_Pattern ; -defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = isGFX6GFX7 @@ -1731,7 +1892,7 @@ defm : MUBUFScratchStorePat ; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -1882,24 +2043,41 @@ let SubtargetPredicate = HasPackedD16VMem in { //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Base ENC_MUBUF for GFX6, GFX7, GFX10. +// Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// -class Base_MUBUF_Real_gfx6_gfx7_gfx10 op, MUBUF_Pseudo ps, int ef> : - MUBUF_Real, Enc64, SIMCInstr { +class Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 : + MUBUF_Real, Enc64, SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{31-26} = 0x38; + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +class MUBUF_Real_gfx11 op, MUBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 { + let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); + let Inst{25-18} = op; + let Inst{53} = !if(ps.has_tfe, tfe, ?); + let Inst{54} = ps.offen; + let Inst{55} = ps.idxen; +} + +class Base_MUBUF_Real_gfx6_gfx7_gfx10 op, MUBUF_Pseudo ps, int ef> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 { let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{16} = ps.lds; let Inst{24-18} = op; - let Inst{31-26} = 0x38; - let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); - let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); - let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } class MUBUF_Real_gfx10 op, MUBUF_Pseudo ps> : @@ -1913,11 +2091,156 @@ class MUBUF_Real_gfx6_gfx7 op, MUBUF_Pseudo ps> : let Inst{15} = ps.addr64; } +//===----------------------------------------------------------------------===// +// MUBUF - GFX11. 
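// Where the shared MUBUF fields land in each generation, per the Inst{}
// assignments above (glc stays at Inst{14} throughout; the gfx6/7/10 lds
// bit at Inst{16} has no gfx11 counterpart, since LDS loads become separate
// BUFFER_LOAD_LDS_* opcodes there):
//
//            gfx6/7/10      gfx11
//   offen    Inst{12}       Inst{54}
//   idxen    Inst{13}       Inst{55}
//   slc      Inst{54}       Inst{12}
//   dlc      (not in base)  Inst{13}
//   op       Inst{24-18}    Inst{25-18}   // widened from 7 to 8 bits
//   tfe      Inst{55}       Inst{53}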
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl op, string real_name> { + def _BOTHEN_gfx11 : + MUBUF_Real_gfx11(NAME#"_BOTHEN"), real_name>, + AtomicNoRet; + def _IDXEN_gfx11 : + MUBUF_Real_gfx11(NAME#"_IDXEN"), real_name>, + AtomicNoRet; + def _OFFEN_gfx11 : + MUBUF_Real_gfx11(NAME#"_OFFEN"), real_name>, + AtomicNoRet; + def _OFFSET_gfx11 : + MUBUF_Real_gfx11(NAME#"_OFFSET"), real_name>, + AtomicNoRet; +} + +multiclass MUBUF_Real_AllAddr_gfx11_Impl op, MUBUF_Pseudo ps> : + MUBUF_Real_AllAddr_gfx11_Renamed_Impl; +multiclass MUBUF_Real_AllAddr_gfx11 op> : + MUBUF_Real_AllAddr_gfx11_Impl(NAME#"_BOTHEN")>; + +class Pre_gfx11_MUBUF_Name : + MnemonicAlias, Requires<[isGFX11Plus]>; +multiclass MUBUF_Real_AllAddr_gfx11_Renamed op, string real_name> : + MUBUF_Real_AllAddr_gfx11_Renamed_Impl { + def : Pre_gfx11_MUBUF_Name(NAME#"_BOTHEN"), real_name>; +} + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MUBUF_Real_Atomics_RTN_gfx11_Renamed op, string real_name> { + def _BOTHEN_RTN_gfx11 : + MUBUF_Real_gfx11(NAME#"_BOTHEN_RTN"), real_name>, + AtomicNoRet; + def _IDXEN_RTN_gfx11 : + MUBUF_Real_gfx11(NAME#"_IDXEN_RTN"), real_name>, + AtomicNoRet; + def _OFFEN_RTN_gfx11 : + MUBUF_Real_gfx11(NAME#"_OFFEN_RTN"), real_name>, + AtomicNoRet; + def _OFFSET_RTN_gfx11 : + MUBUF_Real_gfx11(NAME#"_OFFSET_RTN"), real_name>, + AtomicNoRet; +} + +multiclass MUBUF_Real_Atomics_RTN_gfx11_impl op, MUBUF_Pseudo ps> : + MUBUF_Real_Atomics_RTN_gfx11_Renamed; +multiclass MUBUF_Real_Atomics_RTN_gfx11 op> : + MUBUF_Real_Atomics_RTN_gfx11_impl(NAME#"_BOTHEN")>; + +multiclass MUBUF_Real_Atomics_gfx11 op> : + MUBUF_Real_AllAddr_gfx11, + MUBUF_Real_Atomics_RTN_gfx11; + +multiclass MUBUF_Real_Atomics_gfx11_Renamed op, string real_name> : + MUBUF_Real_AllAddr_gfx11_Renamed, + MUBUF_Real_Atomics_RTN_gfx11_Renamed; + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { +def BUFFER_GL0_INV_gfx11 : MUBUF_Real_gfx11<0x02B, BUFFER_GL0_INV>; +def BUFFER_GL1_INV_gfx11 : MUBUF_Real_gfx11<0x02C, BUFFER_GL1_INV>; +} + +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x014, "buffer_load_b32">; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x015, "buffer_load_b64">; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x016, "buffer_load_b96">; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x017, "buffer_load_b128">; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x020, "buffer_load_d16_b16">; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x008, "buffer_load_d16_format_x">; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x009, "buffer_load_d16_format_xy">; +defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00a, "buffer_load_d16_format_xyz">; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00b, "buffer_load_d16_format_xyzw">; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x023, "buffer_load_d16_hi_b16">; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x026, "buffer_load_d16_hi_format_x">; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x022, "buffer_load_d16_hi_i8">; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x021, "buffer_load_d16_hi_u8">; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01f, 
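// For one renamed opcode, Pre_gfx11_MUBUF_Name above boils down to a plain
// mnemonic alias, so the pre-GFX11 spelling still assembles on GFX11+ (the
// same shape as the explicit buffer_atomic_csub alias below):
//
//   def : MnemonicAlias<"buffer_load_dword", "buffer_load_b32">,
//         Requires<[isGFX11Plus]>;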
"buffer_load_d16_i8">; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01e, "buffer_load_d16_u8">; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x000>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x001>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x002>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x003>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x011, "buffer_load_i8">; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x013, "buffer_load_i16">; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x010, "buffer_load_u8">; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x012, "buffer_load_u16">; +defm BUFFER_LOAD_LDS_B32 : MUBUF_Real_AllAddr_gfx11<0x031>; +defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x032>; +defm BUFFER_LOAD_LDS_I8 : MUBUF_Real_AllAddr_gfx11<0x02e>; +defm BUFFER_LOAD_LDS_I16 : MUBUF_Real_AllAddr_gfx11<0x030>; +defm BUFFER_LOAD_LDS_U8 : MUBUF_Real_AllAddr_gfx11<0x02d>; +defm BUFFER_LOAD_LDS_U16 : MUBUF_Real_AllAddr_gfx11<0x02f>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x018, "buffer_store_b8">; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x019, "buffer_store_b16">; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x01A, "buffer_store_b32">; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01B, "buffer_store_b64">; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01C, "buffer_store_b96">; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01D, "buffer_store_b128">; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x00C, "buffer_store_d16_format_x">; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x00D, "buffer_store_d16_format_xy">; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00E, "buffer_store_d16_format_xyz">; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00F, "buffer_store_d16_format_xyzw">; +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x024, "buffer_store_d16_hi_b8">; +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x025, "buffer_store_d16_hi_b16">; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x027, "buffer_store_d16_hi_format_x">; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x004>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x005>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x006>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x007>; +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomics_gfx11<0x056>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx11_Renamed<0x035, "buffer_atomic_add_u32">; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x043, "buffer_atomic_add_u64">; +defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx11_Renamed<0x03C, "buffer_atomic_and_b32">; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x049, "buffer_atomic_and_b64">; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x034, "buffer_atomic_cmpswap_b32">; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x042, "buffer_atomic_cmpswap_b64">; +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx11_Renamed<0x037, "buffer_atomic_csub_u32">; +def : MnemonicAlias<"buffer_atomic_csub", "buffer_atomic_csub_u32">, Requires<[isGFX11Plus]>; +defm 
BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx11_Renamed<0x040, "buffer_atomic_dec_u32">; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04D, "buffer_atomic_dec_u64">; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx11_Renamed<0x03F, "buffer_atomic_inc_u32">; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04C, "buffer_atomic_inc_u64">; +defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x052, "buffer_atomic_max_f32">; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03A, "buffer_atomic_max_i32">; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x047, "buffer_atomic_max_i64">; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03B, "buffer_atomic_max_u32">; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x048, "buffer_atomic_max_u64">; +defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x051, "buffer_atomic_min_f32">; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x038, "buffer_atomic_min_i32">; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x045, "buffer_atomic_min_i64">; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x039, "buffer_atomic_min_u32">; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x046, "buffer_atomic_min_u64">; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx11_Renamed<0x03D, "buffer_atomic_or_b32">; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04A, "buffer_atomic_or_b64">; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx11_Renamed<0x036, "buffer_atomic_sub_u32">; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x044, "buffer_atomic_sub_u64">; +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x033, "buffer_atomic_swap_b32">; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x041, "buffer_atomic_swap_b64">; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx11_Renamed<0x03E, "buffer_atomic_xor_b32">; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04B, "buffer_atomic_xor_b64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass MUBUF_Real_AllAddr_gfx10 op> { def _BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_BOTHEN")>; @@ -1929,23 +2252,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { MUBUF_Real_gfx10(NAME#"_OFFSET")>; } multiclass MUBUF_Real_AllAddr_Lds_gfx10 op> { - def _OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">; - def _OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">; - def _IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">; - def _BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">; - - def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">; - def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">; - def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">; - def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; + def _OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFSET")>; + def _OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFEN")>; + def _IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_IDXEN")>; + def _BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_BOTHEN")>; + + def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_BOTHEN")>; } multiclass MUBUF_Real_Atomics_RTN_gfx10 op> { def _BOTHEN_RTN_gfx10 : @@ -1976,7 +2291,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { MUBUF_Real_gfx10(NAME#"_OFFSET")>, AtomicNoRet; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>; @@ -2033,27 +2348,17 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>; } multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7 op> { - def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">; - def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_ADDR64")>, - MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">; - def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">; - def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">; - def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">; - - def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">; - def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_ADDR64")>, - MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">; - def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">; - def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">; - def _LDS_BOTHEN_gfx6_gfx7 : 
MUBUF_Real_gfx6_gfx7(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">; + def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>; + def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_ADDR64")>; + def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFEN")>; + def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_IDXEN")>; + def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_BOTHEN")>; + + def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFSET")>; + def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_ADDR64")>; + def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_BOTHEN")>; } multiclass MUBUF_Real_Atomics_gfx6_gfx7 op> { def _ADDR64_gfx6_gfx7 : @@ -2167,25 +2472,88 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; //===----------------------------------------------------------------------===// -// Base ENC_MTBUF for GFX6, GFX7, GFX10. +// Base ENC_MTBUF for GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// -class Base_MTBUF_Real_gfx6_gfx7_gfx10 op, MTBUF_Pseudo ps, int ef> : - MTBUF_Real, Enc64, SIMCInstr { +class Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11 : + MTBUF_Real, Enc64, SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); - let Inst{12} = ps.offen; - let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); - let Inst{18-16} = op; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +class Base_MTBUF_Real_gfx11 op, MTBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11 { + let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); + let Inst{18-15} = op; + let Inst{25-19} = format; + let Inst{53} = !if(ps.has_tfe, tfe, ?); + let Inst{54} = ps.offen; + let Inst{55} = ps.idxen; +} + +class Base_MTBUF_Real_gfx6_gfx7_gfx10 op, MTBUF_Pseudo ps, int ef> : + Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11 { + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{18-16} = op; let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); - let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } +//===----------------------------------------------------------------------===// +// MTBUF - GFX11. 
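// MTBUF opcode/format placement, per the Inst{} assignments above: gfx6/7
// encode a 3-bit op at Inst{18-16}, gfx10 widens it with op{3} at Inst{53}
// (see MTBUF_Real_gfx10 below), and gfx11 instead packs a 4-bit op at
// Inst{18-15} next to a 7-bit format field at Inst{25-19} - i.e. the gfx11
// class head is presumably something like:
//
//   class Base_MTBUF_Real_gfx11 <bits<4> op, MTBUF_Pseudo ps,
//                                string real_name = ps.Mnemonic> : ...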
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MTBUF_Real_AllAddr_gfx11_Renamed_Impl op, string real_name> { + def _BOTHEN_gfx11 : + Base_MTBUF_Real_gfx11(NAME#"_BOTHEN"), real_name>; + def _IDXEN_gfx11 : + Base_MTBUF_Real_gfx11(NAME#"_IDXEN"), real_name>; + def _OFFEN_gfx11 : + Base_MTBUF_Real_gfx11(NAME#"_OFFEN"), real_name>; + def _OFFSET_gfx11 : + Base_MTBUF_Real_gfx11(NAME#"_OFFSET"), real_name>; +} + +multiclass MTBUF_Real_AllAddr_gfx11_Impl op, MTBUF_Pseudo ps> + : MTBUF_Real_AllAddr_gfx11_Renamed_Impl; +multiclass MTBUF_Real_AllAddr_gfx11 op> + : MTBUF_Real_AllAddr_gfx11_Impl(NAME#"_BOTHEN")>; + + +class Pre_gfx11_MTBUF_Name + : MnemonicAlias, Requires<[isGFX11Plus]>; +multiclass MTBUF_Real_AllAddr_gfx11_Renamed op, string real_name> + : MTBUF_Real_AllAddr_gfx11_Renamed_Impl { + def : Pre_gfx11_MTBUF_Name(NAME#"_BOTHEN"), real_name>; +} + +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x008, "tbuffer_load_d16_format_x">; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x009, "tbuffer_load_d16_format_xy">; +defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00a, "tbuffer_load_d16_format_xyz">; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x000>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x001>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x002>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x003>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x00c, "tbuffer_store_d16_format_x">; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x00d, "tbuffer_store_d16_format_xy">; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00e, "tbuffer_store_d16_format_xyz">; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x004>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x005>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x006>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x007>; + //===----------------------------------------------------------------------===// // MTBUF - GFX10. 
//===----------------------------------------------------------------------===// @@ -2197,7 +2565,7 @@ class MTBUF_Real_gfx10 op, MTBUF_Pseudo ps> : let Inst{53} = op{3}; } -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass MTBUF_Real_AllAddr_gfx10 op> { def _BOTHEN_gfx10 : MTBUF_Real_gfx10(NAME#"_BOTHEN")>; @@ -2208,7 +2576,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { def _OFFSET_gfx10 : MTBUF_Real_gfx10(NAME#"_OFFSET")>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>; defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>; @@ -2303,9 +2671,28 @@ class MUBUF_Real_gfx90a op, MUBUF_Pseudo ps, let Inst{55} = acc; } +class MUBUF_Real_gfx940 op, MUBUF_Pseudo ps> : + MUBUF_Real_Base_vi { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX9"; + let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + + let Inst{55} = acc; +} + multiclass MUBUF_Real_vi_gfx90a op, MUBUF_Pseudo ps> { def _vi : MUBUF_Real_vi; - def _gfx90a : MUBUF_Real_gfx90a; + + foreach _ = BoolToList.ret in + def _gfx90a : MUBUF_Real_gfx90a; + + foreach _ = BoolToList.ret in { + def _gfx90a : MUBUF_Real_gfx90a { + let SubtargetPredicate = isGFX90AOnly; + let AssemblerPredicate = isGFX90AOnly; + } + def _gfx940 : MUBUF_Real_gfx940; + } } multiclass MUBUF_Real_AllAddr_vi op> { @@ -2317,41 +2704,25 @@ multiclass MUBUF_Real_AllAddr_vi op> { multiclass MUBUF_Real_AllAddr_Lds_vi op> { - def _OFFSET_vi : MUBUF_Real_vi (NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_vi">; - def _OFFEN_vi : MUBUF_Real_vi (NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_vi">; - def _IDXEN_vi : MUBUF_Real_vi (NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_vi">; - def _BOTHEN_vi : MUBUF_Real_vi (NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_vi">; - - def _LDS_OFFSET_vi : MUBUF_Real_vi (NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_vi">; - def _LDS_OFFEN_vi : MUBUF_Real_vi (NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_vi">; - def _LDS_IDXEN_vi : MUBUF_Real_vi (NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_vi">; - def _LDS_BOTHEN_vi : MUBUF_Real_vi (NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; - - def _OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">; - def _OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">; - def _IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">; - def _BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">; - - def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">; - def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">; - def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">; - def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">; + def _OFFSET_vi : MUBUF_Real_vi (NAME#"_OFFSET")>; + def _OFFEN_vi : MUBUF_Real_vi (NAME#"_OFFEN")>; + def _IDXEN_vi : MUBUF_Real_vi (NAME#"_IDXEN")>; + def _BOTHEN_vi : MUBUF_Real_vi (NAME#"_BOTHEN")>; + + def _LDS_OFFSET_vi : 
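// The foreach-over-BoolToList idiom above is the pre-`if`-statement way of
// emitting a definition conditionally; assuming the usual helper,
//
//   class BoolToList<bit Val> {
//     list<int> ret = !if(Val, [1], []);
//   }
//
// the loop body is instantiated once when the predicate holds and zero
// times otherwise, which is what gates the _gfx90a and _gfx940 variants
// here.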
MUBUF_Real_vi (NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_vi : MUBUF_Real_vi (NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_vi : MUBUF_Real_vi (NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_vi : MUBUF_Real_vi (NAME#"_LDS_BOTHEN")>; + + def _OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFSET")>; + def _OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFEN")>; + def _IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_IDXEN")>; + def _BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_BOTHEN")>; + + def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_BOTHEN")>; } class MUBUF_Real_gfx80 op, MUBUF_Pseudo ps> : @@ -2424,9 +2795,9 @@ defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>; defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>; defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>; defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; @@ -2481,12 +2852,12 @@ def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; } // End AssemblerPredicate = isGFX8GFX9 -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; -} // End SubtargetPredicate = HasAtomicFaddInsts +} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; @@ -2495,9 +2866,17 @@ let SubtargetPredicate = isGFX90APlus in { } // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> { + let AsmString = BUFFER_WBL2.Mnemonic; // drop flags + let AssemblerPredicate = isGFX90AOnly; + let SubtargetPredicate = isGFX90AOnly; } def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>; +let SubtargetPredicate = isGFX940Plus in { +def BUFFER_WBL2_gfx940 : MUBUF_Real_gfx940<0x28, BUFFER_WBL2>; +def BUFFER_INV_gfx940 : MUBUF_Real_gfx940<0x29, BUFFER_INV>; +} + class MTBUF_Real_Base_vi op, MTBUF_Pseudo ps, int Enc> : MTBUF_Real, Enc64, diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index c4043177b618..27b723875aa4 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -52,8 +52,8 @@ class DS_Pseudo patt let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); } -class DS_Real : - InstSI , +class DS_Real : + InstSI , Enc64 { let isPseudo = 0; @@ -72,6 +72,9 @@ class DS_Real : let IsAtomicRet = ps.IsAtomicRet; let IsAtomicNoRet = ps.IsAtomicNoRet; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + // encoding fields bits<10> vdst; bits<1> gds; @@ -172,6 +175,22 @@ multiclass DS_1A2D_Off8_NORET_mc { } } +class 
DS_0A1D_RET_GDS.ret, + RegisterOperand src_op = getLdStRegisterOperand.ret> +: DS_Pseudo { + + let has_addr = 0; + let has_data1 = 0; + let has_gds = 0; + let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; + let hasSideEffects = 1; +} + class DS_1A1D_RET .ret> : DS_Pseudo; } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = isGFX940Plus in { + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; +} // End SubtargetPredicate = isGFX940Plus + +defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; +defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">; +defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>; +defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>; +defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32, "ds_cmpstore_b32">; +defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32, "ds_cmpstore_f32">; +defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64, "ds_cmpstore_b64">; +defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64, "ds_cmpstore_f64">; + defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; @@ -619,6 +654,8 @@ def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; + +let SubtargetPredicate = isNotGFX90APlus in def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; //===----------------------------------------------------------------------===// @@ -667,6 +704,18 @@ let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] i def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; } + +//===----------------------------------------------------------------------===// +// Instruction definitions for GFX11 and newer. +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Plus in { + +def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>; +def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; + +} // let SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // DS Patterns //===----------------------------------------------------------------------===// @@ -777,14 +826,14 @@ foreach vt = Reg32Types.types in { defm : DSWritePat_mc ; } -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat ; def : DSWritePat ; } @@ -870,15 +919,30 @@ defm : DSWritePat_mc ; let SubtargetPredicate = HasUnalignedAccessMode in { -// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice -// for unaligned accesses? 
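// A rough example of what the patterns added below buy: with unaligned
// access mode enabled, an 8-byte LDS load at alignment < 4 now selects a
// single
//
//   ds_read_b64 v[0:1], v0
//
// rather than the ds_read2_b32 pair it would otherwise split into; the
// access is misaligned either way, but there is one of it instead of two.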
+// Select 64 bit loads and stores aligned less than 4 as a single ds_read_b64/ +// ds_write_b64 instruction as this is faster than ds_read2_b32/ds_write2_b32 +// which would be used otherwise. In this case a b32 access would still be +// misaligned, but we will have 2 of them. +foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc ; +defm : DSWritePat_mc ; +} + +// Selection will split most of the unaligned 3 dword accesses due to performance +// reasons when beneficial. Keep these two patterns for the rest of the cases. foreach vt = VReg_96.RegTypes in { defm : DSReadPat_mc ; defm : DSWritePat_mc ; } -// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned -// accesses. +// Select 128 bit loads and stores aligned less than 4 as a single ds_read_b128/ +// ds_write_b128 instruction as this is faster than ds_read2_b64/ds_write2_b64 +// which would be used otherwise. In this case a b64 access would still be +// misaligned, but we will have 2 of them. +foreach vt = VReg_128.RegTypes in { +defm : DSReadPat_mc ; +defm : DSWritePat_mc ; +} } // End SubtargetPredicate = HasUnalignedAccessMode @@ -904,69 +968,143 @@ multiclass DSAtomicRetPat_mc { def : DSAtomicRetPat(frag#"_region_m0_"#vt.Size), 1>; } +multiclass DSAtomicRetNoRetPat_mc { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat(frag#"_local_m0_ret_"#vt.Size)>; + def : DSAtomicRetPat(frag#"_local_m0_noret_"#vt.Size)>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicRetPat(!cast(inst)#"_gfx9"), vt, + !cast(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicRetPat(!cast(noRetInst)#"_gfx9"), vt, + !cast(frag#"_local_noret_"#vt.Size)>; + } + def : DSAtomicRetPat(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicRetPat(frag#"_region_m0_noret_"#vt.Size), 1>; +} -class DSAtomicCmpXChg : GCNPat < + + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. +class DSAtomicCmpXChgSwapped : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), (inst $ptr, getVregSrcForVT.ret:$cmp, getVregSrcForVT.ret:$swap, offset:$offset, (i1 gds)) >; -multiclass DSAtomicCmpXChg_mc { +multiclass DSAtomicCmpXChgSwapped_mc { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped(frag#"_local_m0_ret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped(frag#"_local_m0_noret_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicCmpXChg(!cast(inst)#"_gfx9"), vt, - !cast(frag#"_local_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped(!cast(inst)#"_gfx9"), vt, + !cast(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped(!cast(noRetInst)#"_gfx9"), vt, + !cast(frag#"_local_noret_"#vt.Size)>; } - def : DSAtomicCmpXChg(frag#"_region_m0_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped(frag#"_region_m0_noret_"#vt.Size), 1>; } +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 + +let SubtargetPredicate = isGFX11Plus in { +// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. 
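// The swapped and unswapped classes differ only in the operand order they
// feed the instruction; for a cmpxchg fragment (ptr, cmp, swap):
//
//   gfx10 and earlier: (inst $ptr, $cmp,  $swap, offset, gds)  // DSAtomicCmpXChgSwapped
//   gfx11 onwards:     (inst $ptr, $swap, $cmp,  offset, gds)  // matches BUFFER_ATOMIC_CMPSWAP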
+class DSAtomicCmpXChg : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (inst $ptr, getVregSrcForVT.ret:$swap, getVregSrcForVT.ret:$cmp, offset:$offset, (i1 gds)) +>; +multiclass DSAtomicCmpXChg_mc { + def : DSAtomicCmpXChg(!cast(inst)#"_gfx9"), vt, + !cast(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicCmpXChg(!cast(noRetInst)#"_gfx9"), vt, + !cast(frag#"_local_noret_"#vt.Size)>; + + def : DSAtomicCmpXChg(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicCmpXChg(frag#"_region_m0_noret_"#vt.Size), 1>; +} +} // End SubtargetPredicate = isGFX11Plus // 32-bit atomics. defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicCmpXChg_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +defm : DSAtomicCmpXChgSwapped_mc; +} + +let SubtargetPredicate = isGFX11Plus in { +defm : DSAtomicCmpXChg_mc; +} let SubtargetPredicate = HasLDSFPAtomicAdd in { -defm : DSAtomicRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; } // 64-bit atomics. defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; - -defm : DSAtomicCmpXChg_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; +defm : DSAtomicRetNoRetPat_mc; + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +defm : DSAtomicCmpXChgSwapped_mc; +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 + +let SubtargetPredicate = isGFX11Plus in { +defm : DSAtomicCmpXChg_mc; +} // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX90APlus in { -def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +} + +let SubtargetPredicate = isGFX940Plus in { +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : GCNPat < + (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)), + (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) +>; +def : GCNPat < + (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), + (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) +>; } def : Pat < @@ -974,16 +1112,44 @@ def : Pat < (DS_ORDERED_COUNT $value, (as_i16imm $offset)) >; +def : GCNPat < + (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), + (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) +>; + +def : 
GCNPat < + (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), + (EXTRACT_SUBREG + (i64 (COPY_TO_REGCLASS + (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + VReg_64)), + sub0) +>; + +def : GCNPat < + (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), + (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) +>; + +def : GCNPat < + (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), + (EXTRACT_SUBREG + (i64 (COPY_TO_REGCLASS + (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + VReg_64)), + sub0) +>; + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Base ENC_DS for GFX6, GFX7, GFX10. +// Base ENC_DS for GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// -class Base_DS_Real_gfx6_gfx7_gfx10 op, DS_Pseudo ps, int ef> : - DS_Real, SIMCInstr { +class Base_DS_Real_gfx6_gfx7_gfx10_gfx11 op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic> : + DS_Real, SIMCInstr { let Inst{7-0} = !if(ps.has_offset0, offset0, 0); let Inst{15-8} = !if(ps.has_offset1, offset1, 0); @@ -996,20 +1162,90 @@ class Base_DS_Real_gfx6_gfx7_gfx10 op, DS_Pseudo ps, int ef> : let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); } +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { + multiclass DS_Real_gfx11 op> { + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11(NAME), + SIEncodingFamily.GFX11>; + } + + multiclass DS_Real_Renamed_gfx11 op, DS_Pseudo backing_pseudo, string real_name> { + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + } +} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" + +defm DS_STORE_B32 : DS_Real_Renamed_gfx11<0x00d, DS_WRITE_B32, "ds_store_b32">; +defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">; +defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">; +defm DS_STORE_B8 : DS_Real_Renamed_gfx11<0x01e, DS_WRITE_B8, "ds_store_b8">; +defm DS_STORE_B16 : DS_Real_Renamed_gfx11<0x01f, DS_WRITE_B16, "ds_store_b16">; +defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">; +defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">; +defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">; +defm DS_LOAD_B32 : DS_Real_Renamed_gfx11<0x036, DS_READ_B32, "ds_load_b32">; +defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11<0x037, DS_READ2_B32, "ds_load_2addr_b32">; +defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">; +defm DS_LOAD_I8 : DS_Real_Renamed_gfx11<0x039, DS_READ_I8, "ds_load_i8">; +defm DS_LOAD_U8 : DS_Real_Renamed_gfx11<0x03a, DS_READ_U8, "ds_load_u8">; +defm DS_LOAD_I16 : DS_Real_Renamed_gfx11<0x03b, DS_READ_I16, "ds_load_i16">; +defm DS_LOAD_U16 : DS_Real_Renamed_gfx11<0x03c, DS_READ_U16, "ds_load_u16">; +defm DS_STORE_B64 : DS_Real_Renamed_gfx11<0x04d, DS_WRITE_B64, 
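// Each renamed real in this list keeps the encoding and operand list of its
// backing pseudo but assembles and disassembles under the new GFX11
// store/load mnemonic; the generated MnemonicAlias (guarded by isGFX11Plus)
// lets the legacy ds_write*/ds_read* spelling still be accepted.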
"ds_store_b64">; +defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">; +defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">; +defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">; +defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">; +defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">; +defm DS_LOAD_B64 : DS_Real_Renamed_gfx11<0x076, DS_READ_B64, "ds_load_b64">; +defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11<0x077, DS_READ2_B64, "ds_load_2addr_b64">; +defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">; +defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">; +defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">; +defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">; +defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">; +defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">; +defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">; +defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">; +defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">; +defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">; +defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">; +defm DS_STORE_B96 : DS_Real_Renamed_gfx11<0x0de, DS_WRITE_B96, "ds_store_b96">; +defm DS_STORE_B128 : DS_Real_Renamed_gfx11<0x0df, DS_WRITE_B128, "ds_store_b128">; +defm DS_LOAD_B96 : DS_Real_Renamed_gfx11<0x0fe, DS_READ_B96, "ds_load_b96">; +defm DS_LOAD_B128 : DS_Real_Renamed_gfx11<0x0ff, DS_READ_B128, "ds_load_b128">; + +// DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, but also the data operands (src and cmp) are swapped +// comparing to pre-GFX11. +// Note: the mnemonic alias is not generated to avoid a potential ambiguity due to the semantics change. + +defm DS_CMPSTORE_B32 : DS_Real_gfx11<0x010>; +defm DS_CMPSTORE_F32 : DS_Real_gfx11<0x011>; +defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11<0x030>; +defm DS_CMPSTORE_RTN_F32 : DS_Real_gfx11<0x031>; +defm DS_CMPSTORE_B64 : DS_Real_gfx11<0x050>; +defm DS_CMPSTORE_F64 : DS_Real_gfx11<0x051>; +defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11<0x070>; +defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>; + +defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>; +defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>; +defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>; + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass DS_Real_gfx10 op> { - def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11(NAME), SIEncodingFamily.GFX10>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm DS_ADD_F32 : DS_Real_gfx10<0x015>; defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>; -defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>; defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>; defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>; @@ -1020,95 +1256,118 @@ defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>; defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>; defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>; defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; -defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>; -defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>; //===----------------------------------------------------------------------===// -// GFX7, GFX10. +// GFX10, GFX11. +//===----------------------------------------------------------------------===// + +multiclass DS_Real_gfx10_gfx11 op> : + DS_Real_gfx10, DS_Real_gfx11; + +defm DS_ADD_F32 : DS_Real_gfx10_gfx11<0x015>; +defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; +defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b2>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b3>; + +//===----------------------------------------------------------------------===// +// GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass DS_Real_gfx7 op> { - def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" +multiclass DS_Real_gfx7_gfx10_gfx11 op> : + DS_Real_gfx7, DS_Real_gfx10_gfx11; + multiclass DS_Real_gfx7_gfx10 op> : DS_Real_gfx7, DS_Real_gfx10; // FIXME-GFX7: Add tests when upstreaming this part. -defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>; -defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>; -defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>; +defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>; +defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>; +defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11<0x07e>; defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>; defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>; defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. +// GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass DS_Real_gfx6_gfx7 op> { - def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" +multiclass DS_Real_gfx6_gfx7_gfx10_gfx11 op> : + DS_Real_gfx6_gfx7, DS_Real_gfx10_gfx11; + multiclass DS_Real_gfx6_gfx7_gfx10 op> : DS_Real_gfx6_gfx7, DS_Real_gfx10; -defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>; -defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>; -defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>; -defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>; -defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>; -defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>; -defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>; -defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>; -defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>; -defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>; -defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>; -defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>; -defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>; +defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x000>; +defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x001>; +defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x002>; +defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x005>; +defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x006>; +defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x007>; +defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x008>; +defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x009>; +defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; + defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>; defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>; defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>; defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>; defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; -defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>; -defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>; -defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>; -defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>; -defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>; -defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>; -defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>; -defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>; + +defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>; +defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11<0x014>; +defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>; +defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>; +defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; +defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; +defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; + defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; -defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>; -defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>; -defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>; -defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>; -defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>; -defm DS_MIN_RTN_I32 : 
DS_Real_gfx6_gfx7_gfx10<0x025>; -defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>; -defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>; -defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>; -defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>; -defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>; -defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>; -defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>; + +defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x020>; +defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x021>; +defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x022>; +defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x023>; +defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x024>; +defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x025>; +defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x026>; +defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x027>; +defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x028>; +defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x029>; +defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; +defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; +defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; + defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>; defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>; defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>; defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>; defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; -defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>; -defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>; -defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>; + +defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>; +defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x035>; + defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>; @@ -1116,49 +1375,55 @@ defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>; defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; -defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>; -defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>; -defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>; -defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>; -defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>; -defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>; -defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>; -defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>; -defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>; -defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>; -defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>; -defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>; -defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>; -defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>; -defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>; -defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>; + +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03d>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03e>; +defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>; +defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x040>; +defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x041>; +defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x042>; +defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x043>; +defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x044>; +defm DS_MIN_I64 : 
DS_Real_gfx6_gfx7_gfx10_gfx11<0x045>; +defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x046>; +defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x047>; +defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x048>; +defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x049>; +defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04a>; +defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04b>; +defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04c>; + defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>; defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>; defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>; defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>; defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>; -defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>; -defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>; -defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>; -defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>; -defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>; -defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>; -defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>; -defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>; -defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>; -defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>; -defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>; -defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>; -defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>; -defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>; -defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>; + +defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x052>; +defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x053>; +defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x060>; +defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x061>; +defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x062>; +defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x063>; +defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x064>; +defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x065>; +defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x066>; +defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x067>; +defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x068>; +defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x069>; +defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06a>; +defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06b>; +defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06c>; + defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>; defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>; defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>; defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>; defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>; -defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>; -defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>; + +defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x072>; +defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>; + defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>; defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>; defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>; @@ -1381,3 +1646,10 @@ let SubtargetPredicate = isGFX90APlus in { def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; } // End SubtargetPredicate = isGFX90APlus + +let SubtargetPredicate = isGFX940Plus in { + def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; + def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; + def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, 
DS_PK_ADD_BF16>; + def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>; +} // End SubtargetPredicate = isGFX940Plus diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e2186d4d533e..ccaf646008b1 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -18,15 +18,20 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/DisassemblerTypes.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/MC/TargetRegistry.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" using namespace llvm; @@ -70,7 +75,8 @@ static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, } static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); // Our branches take a simm16, but we need two extra bits to account for the @@ -78,13 +84,13 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, APInt SignedOffset(18, Imm * 4, true); int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue(); - if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2)) + if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0)) return MCDisassembler::Success; return addOperand(Inst, MCOperand::createImm(Imm)); } -static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { +static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); int64_t Offset; if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets. 
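A note on the mechanical change that dominates the disassembler hunks below:
every custom operand-decoder callback migrates from an opaque
`const void *Decoder` parameter to a typed `const MCDisassembler *`, so the
generated decoder tables hand callbacks a usable pointer directly. A minimal
sketch of a callback in the new shape (the operand itself is hypothetical,
for illustration only):

    #include "llvm/MC/MCDisassembler/MCDisassembler.h"
    #include "llvm/MC/MCInst.h"

    using namespace llvm;

    // Hypothetical operand decoder: attaches a raw encoded field as a plain
    // immediate operand. Real AMDGPU callbacks first downcast Decoder to
    // AMDGPUDisassembler to reuse its decode helpers.
    static MCDisassembler::DecodeStatus
    decodeExampleImm(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
                     const MCDisassembler *Decoder) {
      (void)Decoder; // Typed now; no const void * round-trip required.
      Inst.addOperand(MCOperand::createImm(Imm));
      return MCDisassembler::Success;
    }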
@@ -95,20 +101,19 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Offset)); } -static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, - uint64_t Addr, const void *Decoder) { +static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeBoolReg(Val)); } -#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ -static DecodeStatus StaticDecoderName(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ - auto DAsm = static_cast(Decoder); \ - return addOperand(Inst, DAsm->DecoderName(Imm)); \ -} +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ + static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ + uint64_t /*Addr*/, \ + const MCDisassembler *Decoder) { \ + auto DAsm = static_cast(Decoder); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ + } #define DECODE_OPERAND_REG(RegClass) \ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) @@ -144,155 +149,151 @@ DECODE_OPERAND_REG(AReg_512) DECODE_OPERAND_REG(AReg_1024) DECODE_OPERAND_REG(AV_32) DECODE_OPERAND_REG(AV_64) +DECODE_OPERAND_REG(AV_128) +DECODE_OPERAND_REG(AVDst_128) +DECODE_OPERAND_REG(AVDst_512) -static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm)); } -static DecodeStatus decodeOperand_VS_16(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -static DecodeStatus decodeOperand_VS_32(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VS_32(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); } -static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, 
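// The "Imm | 512" in these AReg decoders restores a bank-select bit that is
// not part of the raw field: decodeSrcOp appears to use it to resolve the
// value in the accumulator (AGPR) range rather than the VGPR/SGPR ranges.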
DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); } -static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm)); } -static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm)); } -static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm)); } -static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm)); } -static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm)); } static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } -static DecodeStatus decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { const 
auto *DAsm = static_cast(Decoder); return addOperand( Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true)); } -static DecodeStatus decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); return addOperand( Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true)); } +static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val, + uint64_t Addr, const void *Decoder) { + const auto *DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val)); +} + static bool IsAGPROperand(const MCInst &Inst, int OpIdx, const MCRegisterInfo *MRI) { if (OpIdx < 0) @@ -307,10 +308,9 @@ static bool IsAGPROperand(const MCInst &Inst, int OpIdx, return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; } -static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm, AMDGPUDisassembler::OpWidthTy Opw, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); if (!DAsm->isGFX90A()) { Imm &= 511; @@ -342,54 +342,41 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); } -static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW32, Decoder); } -static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW64, Decoder); } -static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW96, Decoder); } -static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW128, Decoder); } -static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm)); } -static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { - auto DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm)); -} - #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) @@ -410,21 +397,15 @@ template static inline T eatBytes(ArrayRef& Bytes) { return Res; } -DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, - 
MCInst &MI, - uint64_t Inst, - uint64_t Address) const { - assert(MI.getOpcode() == 0); - assert(MI.getNumOperands() == 0); - MCInst TmpInst; - HasLiteral = false; - const auto SavedBytes = Bytes; - if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { - MI = TmpInst; - return MCDisassembler::Success; - } - Bytes = SavedBytes; - return MCDisassembler::Fail; +static inline DecoderUInt128 eat12Bytes(ArrayRef &Bytes) { + assert(Bytes.size() >= 12); + uint64_t Lo = support::endian::read( + Bytes.data()); + Bytes = Bytes.slice(8); + uint64_t Hi = support::endian::read( + Bytes.data()); + Bytes = Bytes.slice(4); + return DecoderUInt128(Lo, Hi); } // The disassembler is greedy, so we need to check FI operand value to @@ -457,6 +438,29 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings + if (isGFX11Plus() && Bytes.size() >= 12 ) { + DecoderUInt128 DecW = eat12Bytes(Bytes); + Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, + Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, + Address); + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); + else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) + convertVOPCDPPInst(MI); + break; + } + Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address); + if (Res) + break; + } + // Reinitialize Bytes + Bytes = Bytes_.slice(0, MaxInstBytesNum); + if (Bytes.size() >= 8) { const uint64_t QW = eatBytes(Bytes); @@ -475,12 +479,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address); + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) + convertVOPCDPPInst(MI); + break; + } + Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) { IsSDWA = true; break; } @@ -535,6 +550,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes(Bytes) << 32) | DW; @@ -554,6 +572,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address); } while (false); if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || @@ -565,8 +590,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx11 || MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || - MI.getOpcode() == 
AMDGPU::V_FMAC_F16_e64_gfx10)) { + MI.getOpcode() == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx11)) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); @@ -625,8 +653,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = MCDisassembler::Fail; } else { for (unsigned i = 0; i < NSAArgs; ++i) { - MI.insert(MI.begin() + VAddr0Idx + 1 + i, - decodeOperand_VGPR_32(Bytes[i])); + const unsigned VAddrIdx = VAddr0Idx + 1 + i; + auto VAddrRCID = MCII->get(MI.getOpcode()).OpInfo[VAddrIdx].RegClass; + MI.insert(MI.begin() + VAddrIdx, + createRegOperand(VAddrRCID, Bytes[i])); } Bytes = Bytes.slice(4 * NSAWords); } @@ -636,6 +666,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = convertMIMGInst(MI); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)) + Res = convertEXPInst(MI); + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)) + Res = convertVINTERPInst(MI); + if (Res && IsSDWA) Res = convertSDWAInst(MI); @@ -667,6 +703,28 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Res; } +DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) { + // The MCInst still has these fields even though they are no longer encoded + // in the GFX11 instruction. + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr); + } + return MCDisassembler::Success; +} + +DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { + if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) { + // The MCInst has this field that is not directly encoded in the + // instruction. + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); + } + return MCDisassembler::Success; +} + DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { @@ -692,18 +750,23 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); - - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { + convertVOP3PDPPInst(MI); + } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || + AMDGPU::isVOPC64DPP(Opc)) { + convertVOPCDPPInst(MI); + } else { + // Insert dummy unused src modifiers. 
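// The decoder only materializes operands that are actually encoded, while
// printers and later passes index operands via the full MCInstrDesc operand
// list, so declared-but-unencoded modifier operands get zero placeholders.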
+ if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + } return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; } @@ -745,7 +808,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { bool IsNSA = false; unsigned AddrSize = Info->VAddrDwords; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + if (isGFX10Plus()) { unsigned DimIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); int A16Idx = @@ -757,7 +820,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI)); - IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA; + IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA || + Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA; if (!IsNSA) { if (AddrSize > 8) AddrSize = 16; @@ -808,9 +872,9 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { } } + // If not using NSA on GFX10+, widen address register to correct size. unsigned NewVAddr0 = AMDGPU::NoRegister; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA && - AddrSize != Info->VAddrDwords) { + if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) { unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg(); unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0); VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0; @@ -844,11 +908,84 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { return MCDisassembler::Success; } +// Opsel and neg bits are used in src_modifiers and standalone operands. Autogen +// decoder only adds to src_modifiers, so manually add the bits to the other +// operands. 
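// Worked example: if src0_modifiers carries OP_SEL_0 | NEG and src1_modifiers
// carries OP_SEL_1, the loop below yields OpSel = 0b001, OpSelHi = 0b010,
// NegLo = 0b001 and NegHi = 0b000, which are then appended as the standalone
// op_sel / op_sel_hi / neg_lo / neg_hi operands.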
+DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in); + + const int ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + unsigned OpSel = 0; + unsigned OpSelHi = 0; + unsigned NegLo = 0; + unsigned NegHi = 0; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + if (OpIdx == -1) + break; + unsigned Val = MI.getOperand(OpIdx).getImm(); + + OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; + OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J; + NegLo |= !!(Val & SISrcMods::NEG) << J; + NegHi |= !!(Val & SISrcMods::NEG_HI) << J; + } + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSel), + AMDGPU::OpName::op_sel); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi), + AMDGPU::OpName::op_sel_hi); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegLo), + AMDGPU::OpName::neg_lo); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegHi), + AMDGPU::OpName::neg_hi); + + return MCDisassembler::Success; +} + +// Create dummy old operand and insert optional operands +DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::old) != -1) + insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + return MCDisassembler::Success; +} + DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const { assert(HasLiteral && "Should have decoded a literal"); const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); unsigned DescNumOps = Desc.getNumOperands(); + insertNamedMCOperand(MI, MCOperand::createImm(Literal), + AMDGPU::OpName::immDeferred); assert(DescNumOps == MI.getNumOperands()); for (unsigned I = 0; I < DescNumOps; ++I) { auto &Op = MI.getOperand(I); @@ -1001,6 +1138,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AVDst_128(unsigned Val) const { + using namespace AMDGPU::EncValues; + assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1. 
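// OR-ing IS_VGPR back in restores the implicit high bit of the operand
// encoding, so decodeSrcOp resolves the value in the vector-register range
// rather than the SGPR/inline-constant range.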
+ return decodeSrcOp(OPW128, Val | IS_VGPR); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AVDst_512(unsigned Val) const { + using namespace AMDGPU::EncValues; + assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1. + return decodeSrcOp(OPW512, Val | IS_VGPR); +} + MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { return createRegOperand(AMDGPU::VReg_64RegClassID, Val); } @@ -1075,6 +1228,9 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { if (HasLiteral) { + assert( + AMDGPU::hasVOPD(STI) && + "Should only decode multiple kimm with VOPD, check VSrc operand types"); if (Literal != Val) return errOperand(Val, "More than one unique literal is illegal"); } @@ -1367,6 +1523,20 @@ MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) c llvm_unreachable("unknown dst register"); } +// Bit 0 of DstY isn't stored in the instruction, because it's always the +// opposite of bit 0 of DstX. +MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst, + unsigned Val) const { + int VDstXInd = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX); + assert(VDstXInd != -1); + assert(Inst.getOperand(VDstXInd).isReg()); + unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg()); + Val |= ~XDstReg & 1; + auto Width = llvm::AMDGPUDisassembler::OPW32; + return createRegOperand(getVgprClassId(Width), Val); +} + MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { using namespace AMDGPU; @@ -1381,8 +1551,10 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { case 109: return createRegOperand(TBA_HI); case 110: return createRegOperand(TMA_LO); case 111: return createRegOperand(TMA_HI); - case 124: return createRegOperand(M0); - case 125: return createRegOperand(SGPR_NULL); + case 124: + return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0); + case 125: + return isGFX11Plus() ? 
createRegOperand(M0) : createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); case 235: return createRegOperand(SRC_SHARED_BASE); @@ -1408,7 +1580,14 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 106: return createRegOperand(VCC); case 108: return createRegOperand(TBA); case 110: return createRegOperand(TMA); - case 125: return createRegOperand(SGPR_NULL); + case 124: + if (isGFX11Plus()) + return createRegOperand(SGPR_NULL); + break; + case 125: + if (!isGFX11Plus()) + return createRegOperand(SGPR_NULL); + break; case 126: return createRegOperand(EXEC); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); @@ -1522,6 +1701,15 @@ bool AMDGPUDisassembler::isGFX10Plus() const { return AMDGPU::isGFX10Plus(STI); } +bool AMDGPUDisassembler::isGFX11() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX11]; +} + +bool AMDGPUDisassembler::isGFX11Plus() const { + return AMDGPU::isGFX11Plus(STI); +} + + bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; } @@ -1888,10 +2076,10 @@ AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, //===----------------------------------------------------------------------===// // Try to find symbol name for specified label -bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, - raw_ostream &/*cStream*/, int64_t Value, - uint64_t /*Address*/, bool IsBranch, - uint64_t /*Offset*/, uint64_t /*InstSize*/) { +bool AMDGPUSymbolizer::tryAddingSymbolicOperand( + MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value, + uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/, + uint64_t /*OpSize*/, uint64_t /*InstSize*/) { if (!IsBranch) { return false; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index eea6074d5281..31869f0917ae 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -15,8 +15,10 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H #define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H +#include "llvm/ADT/APInt.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCInst.h" #include "llvm/Support/DataExtractor.h" #include @@ -27,6 +29,60 @@ class MCOperand; class MCSubtargetInfo; class Twine; +// Exposes an interface expected by autogenerated code in +// FixedLenDecoderEmitter +class DecoderUInt128 { +private: + uint64_t Lo = 0; + uint64_t Hi = 0; + +public: + DecoderUInt128() = default; + DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {} + operator bool() const { return Lo || Hi; } + void insertBits(uint64_t SubBits, unsigned BitPosition, unsigned NumBits) { + assert(NumBits && NumBits <= 64); + assert(SubBits >> 1 >> (NumBits - 1) == 0); + assert(BitPosition < 128); + if (BitPosition < 64) { + Lo |= SubBits << BitPosition; + Hi |= SubBits >> 1 >> (63 - BitPosition); + } else { + Hi |= SubBits << (BitPosition - 64); + } + } + uint64_t extractBitsAsZExtValue(unsigned NumBits, + unsigned BitPosition) const { + assert(NumBits && NumBits <= 64); + assert(BitPosition < 128); + uint64_t Val; + if (BitPosition < 64) + Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition); + else + Val = Hi >> (BitPosition - 64); + return Val & ((uint64_t(2) << (NumBits - 
1)) - 1); + } + DecoderUInt128 operator&(const DecoderUInt128 &RHS) const { + return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi); + } + DecoderUInt128 operator&(const uint64_t &RHS) const { + return *this & DecoderUInt128(RHS); + } + DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); } + bool operator==(const DecoderUInt128 &RHS) { + return Lo == RHS.Lo && Hi == RHS.Hi; + } + bool operator!=(const DecoderUInt128 &RHS) { + return Lo != RHS.Lo || Hi != RHS.Hi; + } + bool operator!=(const int &RHS) { + return *this != DecoderUInt128(RHS); + } + friend raw_ostream &operator<<(raw_ostream &OS, const DecoderUInt128 &RHS) { + return OS << APInt(128, {RHS.Lo, RHS.Hi}); + } +}; + //===----------------------------------------------------------------------===// // AMDGPUDisassembler //===----------------------------------------------------------------------===// @@ -57,8 +113,21 @@ public: MCOperand errOperand(unsigned V, const Twine& ErrMsg) const; - DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst, - uint64_t Address) const; + template + DecodeStatus tryDecodeInst(const uint8_t *Table, MCInst &MI, InsnType Inst, + uint64_t Address) const { + assert(MI.getOpcode() == 0); + assert(MI.getNumOperands() == 0); + MCInst TmpInst; + HasLiteral = false; + const auto SavedBytes = Bytes; + if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { + MI = TmpInst; + return MCDisassembler::Success; + } + Bytes = SavedBytes; + return MCDisassembler::Fail; + } Optional onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef Bytes, @@ -87,10 +156,14 @@ public: DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, raw_string_ostream &KdStream) const; + DecodeStatus convertEXPInst(MCInst &MI) const; + DecodeStatus convertVINTERPInst(MCInst &MI) const; DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const; DecodeStatus convertSDWAInst(MCInst &MI) const; DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; + DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; + DecodeStatus convertVOPCDPPInst(MCInst &MI) const; MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const; @@ -127,6 +200,9 @@ public: MCOperand decodeOperand_AReg_1024(unsigned Val) const; MCOperand decodeOperand_AV_32(unsigned Val) const; MCOperand decodeOperand_AV_64(unsigned Val) const; + MCOperand decodeOperand_AV_128(unsigned Val) const; + MCOperand decodeOperand_AVDst_128(unsigned Val) const; + MCOperand decodeOperand_AVDst_512(unsigned Val) const; enum OpWidthTy { OPW32, @@ -157,6 +233,7 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val, bool MandatoryLiteral = false) const; MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; @@ -177,6 +254,8 @@ public: bool isGFX9Plus() const; bool isGFX10() const; bool isGFX10Plus() const; + bool isGFX11() const; + bool isGFX11Plus() const; bool hasArchitectedFlatScratch() const; }; @@ -196,8 +275,8 @@ public: : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {} bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream, - int64_t Value, uint64_t Address, - bool IsBranch, uint64_t Offset, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t OpSize, uint64_t InstSize) override; void 
tryAddingPcLoadReferenceComment(raw_ostream &cStream, diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td index b3b55ddd2c97..14ba01f0d67c 100644 --- a/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -10,7 +10,7 @@ // EXP classes //===----------------------------------------------------------------------===// -class EXPCommon : InstSI< +class EXPCommon : InstSI< (outs), (ins exp_tgt:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3, @@ -21,21 +21,30 @@ class EXPCommon : InstSI< let mayLoad = done; let mayStore = 1; let UseNamedOperandTable = 1; - let Uses = [EXEC]; + let Uses = !if(row, [EXEC, M0], [EXEC]); let SchedRW = [WriteExport]; let DisableWQM = 1; } -class EXP_Pseudo : EXPCommon, - SIMCInstr { +class EXP_Pseudo + : EXPCommon, SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } -class EXP_Real - : EXPCommon, - SIMCInstr { +// Real instruction with optional asm operands "compr" and "vm". +class EXP_Real_ComprVM + : EXPCommon<0, done, "exp$tgt $src0, $src1, $src2, $src3" + #!if(done, " done", "")#"$compr$vm">, + SIMCInstr { + let AsmMatchConverter = "cvtExp"; +} + +// Real instruction with optional asm operand "row_en". +class EXP_Real_Row + : EXPCommon, + SIMCInstr { let AsmMatchConverter = "cvtExp"; } @@ -43,17 +52,21 @@ class EXP_Real // EXP Instructions //===----------------------------------------------------------------------===// -// Split EXP instruction into EXP and EXP_DONE so we can set -// mayLoad for done=1. -def EXP : EXP_Pseudo<0>; -def EXP_DONE : EXP_Pseudo<1>; +// DONE variants have mayLoad = 1. +// ROW variants have an implicit use of M0. +let SubtargetPredicate = isNotGFX90APlus in { +def EXP : EXP_Pseudo<0, 0>; +def EXP_DONE : EXP_Pseudo<0, 1>; +def EXP_ROW : EXP_Pseudo<1, 0>; +def EXP_ROW_DONE : EXP_Pseudo<1, 1>; +} // let SubtargetPredicate = isNotGFX90APlus //===----------------------------------------------------------------------===// // SI //===----------------------------------------------------------------------===// class EXP_Real_si - : EXP_Real<_done, pseudo, SIEncodingFamily.SI>, EXPe { + : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.SI>, EXPe_ComprVM { let AssemblerPredicate = isGFX6GFX7; let DecoderNamespace = "GFX6GFX7"; let done = _done; @@ -67,8 +80,9 @@ def EXP_DONE_si : EXP_Real_si<1, "EXP_DONE">; //===----------------------------------------------------------------------===// class EXP_Real_vi - : EXP_Real<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi { + : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi { let AssemblerPredicate = isGFX8GFX9; + let SubtargetPredicate = isNotGFX90APlus; let DecoderNamespace = "GFX8"; let done = _done; } @@ -77,12 +91,12 @@ def EXP_vi : EXP_Real_vi<0, "EXP">; def EXP_DONE_vi : EXP_Real_vi<1, "EXP_DONE">; //===----------------------------------------------------------------------===// -// GFX10+ +// GFX10 //===----------------------------------------------------------------------===// class EXP_Real_gfx10 - : EXP_Real<_done, pseudo, SIEncodingFamily.GFX10>, EXPe { - let AssemblerPredicate = isGFX10Plus; + : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.GFX10>, EXPe_ComprVM { + let AssemblerPredicate = isGFX10Only; let DecoderNamespace = "GFX10"; let done = _done; } @@ -90,6 +104,23 @@ class EXP_Real_gfx10 def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">; def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">; +//===----------------------------------------------------------------------===// +// 
GFX11+ +//===----------------------------------------------------------------------===// + +class EXP_Real_gfx11 + : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX11>, EXPe_Row { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + let row = _row; + let done = _done; +} + +def EXP_gfx11 : EXP_Real_gfx11<0, 0, "EXP">; +def EXP_DONE_gfx11 : EXP_Real_gfx11<0, 1, "EXP_DONE">; +def EXP_ROW_gfx11 : EXP_Real_gfx11<1, 0, "EXP_ROW">; +def EXP_ROW_DONE_gfx11 : EXP_Real_gfx11<1, 1, "EXP_ROW_DONE">; + //===----------------------------------------------------------------------===// // EXP Patterns //===----------------------------------------------------------------------===// @@ -103,6 +134,15 @@ class ExpPattern : GCNPat< ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en) >; +class ExpRowPattern : GCNPat< + (int_amdgcn_exp_row timm:$tgt, timm:$en, + (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), + (vt ExpSrc2:$src2), (vt ExpSrc3:$src3), + done_val, M0), + (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, + ExpSrc2:$src2, ExpSrc3:$src3, 0, 0, timm:$en) +>; + class ExpComprPattern : GCNPat< (int_amdgcn_exp_compr timm:$tgt, timm:$en, (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), @@ -119,6 +159,11 @@ def : ExpPattern; def : ExpPattern; def : ExpPattern; +def : ExpRowPattern; +def : ExpRowPattern; +def : ExpRowPattern; +def : ExpRowPattern; + def : ExpComprPattern; def : ExpComprPattern; def : ExpComprPattern; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c530d3cb49f0..cb2822818549 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -12,6 +12,7 @@ def ScratchOffset : ComplexPattern; def ScratchSAddr : ComplexPattern; +def ScratchSVAddr : ComplexPattern; //===----------------------------------------------------------------------===// // FLAT classes @@ -56,6 +57,9 @@ class FLAT_Pseudo dlcValue = 0; bits<1> has_sccb = 1; bits<1> sccbValue = 0; + bits<1> has_sve = 0; // Scratch VGPR Enable + bits<1> lds = 0; + bits<1> sve = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -74,8 +78,8 @@ class FLAT_Pseudo op, FLAT_Pseudo ps> : - InstSI , +class FLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + InstSI , Enc64 { let isPseudo = 0; @@ -96,6 +100,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : let IsAtomicNoRet = ps.IsAtomicNoRet; let VM_CNT = ps.VM_CNT; let LGKM_CNT = ps.LGKM_CNT; + let VALU = ps.VALU; // encoding fields bits<8> vaddr; @@ -106,7 +111,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : bits<5> cpol; // Only valid on gfx9 - bits<1> lds = 0; // XXX - What does this actually do? 
+ bits<1> lds = ps.lds; // LDS DMA for global and scratch // Segment, 00=flat, 01=scratch, 10=global, 11=reserved bits<2> seg = !if(ps.is_flat_global, 0b10, @@ -123,7 +128,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : // Only valid on GFX9+ let Inst{12-0} = offset; - let Inst{13} = lds; + let Inst{13} = !if(ps.has_sve, ps.sve, lds); let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue); @@ -240,6 +245,35 @@ multiclass FLAT_Global_Store_Pseudo { } } +class FLAT_Global_Load_LDS_Pseudo : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 0; + let has_vdst = 0; + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = [M0, EXEC]; + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo { + def "" : FLAT_Global_Load_LDS_Pseudo, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo, + GlobalSaddrTable<1, opName>; +} + class FLAT_Global_Store_AddTid_Pseudo : FLAT_Pseudo< opName, @@ -273,16 +307,19 @@ class FlatScratchInst { class FLAT_Scratch_Load_Pseudo + bit EnableSVE = 0, + bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo< opName, (outs getLdStRegisterOperand.ret:$vdst), !con( - !if(EnableSaddr, - (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), - !if(EnableVaddr, - (ins VGPR_32:$vaddr, flat_offset:$offset), - (ins flat_offset:$offset))), + !if(EnableSVE, + (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), + !if(EnableSaddr, + (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), + !if(EnableVaddr, + (ins VGPR_32:$vaddr, flat_offset:$offset), + (ins flat_offset:$offset)))), !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand.ret:$vdst_in), (ins CPol_0:$cpol))), " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { @@ -291,7 +328,9 @@ class FLAT_Scratch_Load_Pseudo .ret> : FLAT_Pseudo< opName, (outs), - !if(EnableSaddr, - (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), - !if(EnableVaddr, - (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), - (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))), + !if(EnableSVE, + (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), + !if(EnableSaddr, + (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), + !if(EnableVaddr, + (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))), " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let mayLoad = 0; let mayStore = 1; @@ -315,7 +357,9 @@ class FLAT_Scratch_Store_Pseudo , FlatScratchInst; + let SubtargetPredicate = HasFlatScratchSVSMode in + def _SVS : FLAT_Scratch_Load_Pseudo, + FlatScratchInst; + let SubtargetPredicate = HasFlatScratchSTMode in - def _ST : FLAT_Scratch_Load_Pseudo, + def _ST : FLAT_Scratch_Load_Pseudo, FlatScratchInst; } } @@ -339,12 +387,59 @@ multiclass FLAT_Scratch_Store_Pseudo { def _SADDR : FLAT_Scratch_Store_Pseudo, FlatScratchInst; + let SubtargetPredicate = HasFlatScratchSVSMode 
in + def _SVS : FLAT_Scratch_Store_Pseudo, + FlatScratchInst; + let SubtargetPredicate = HasFlatScratchSTMode in - def _ST : FLAT_Scratch_Store_Pseudo, + def _ST : FLAT_Scratch_Store_Pseudo, FlatScratchInst; } } +class FLAT_Scratch_Load_LDS_Pseudo : FLAT_Pseudo< + opName, + (outs ), + !if(EnableSVE, + (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), + !if(EnableSaddr, + (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), + !if(EnableVaddr, + (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol), + (ins flat_offset:$offset, CPol:$cpol)))), + " "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { + + let LGKM_CNT = 1; + let is_flat_scratch = 1; + let lds = 1; + let has_data = 0; + let has_vdst = 0; + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let has_vaddr = EnableVaddr; + let has_sve = EnableSVE; + let sve = EnableVaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"))); + let Uses = [M0, EXEC]; + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Scratch_Load_LDS_Pseudo { + def "" : FLAT_Scratch_Load_LDS_Pseudo, + FlatScratchInst; + def _SADDR : FLAT_Scratch_Load_LDS_Pseudo, + FlatScratchInst; + def _SVS : FLAT_Scratch_Load_LDS_Pseudo, + FlatScratchInst; + def _ST : FLAT_Scratch_Load_LDS_Pseudo, + FlatScratchInst; +} + class FLAT_AtomicNoRet_Pseudo pattern = []> : FLAT_Pseudo { @@ -375,7 +470,6 @@ multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, ValueType vt, - SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, bit isFP = isFloatType.ret, @@ -394,11 +488,9 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_AtomicRet_Pseudo .ret:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), - " $vdst, $vaddr, $vdata$offset$cpol", - [(set vt:$vdst, - (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>, - GlobalSaddrTable<0, opName#"_rtn">, - AtomicNoRet { + " $vdst, $vaddr, $vdata$offset$cpol">, + GlobalSaddrTable<0, opName#"_rtn">, + AtomicNoRet { let FPAtomic = isFP; let AddedComplexity = -1; // Prefer global atomics if available } @@ -441,7 +533,6 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< string opName, RegisterClass vdst_rc, ValueType vt, - SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, bit isFP = isFloatType.ret, @@ -451,11 +542,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo , - GlobalSaddrTable<0, opName#"_rtn">, - AtomicNoRet { + " $vdst, $vaddr, $vdata, off$offset$cpol">, + GlobalSaddrTable<0, opName#"_rtn">, + AtomicNoRet { let has_saddr = 1; let FPAtomic = isFP; } @@ -477,12 +566,11 @@ multiclass FLAT_Global_Atomic_Pseudo< string opName, RegisterClass vdst_rc, ValueType vt, - SDPatternOperator atomic_rtn = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc> { let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN; - defm "" : FLAT_Global_Atomic_Pseudo_RTN; + defm "" : FLAT_Global_Atomic_Pseudo_RTN; } } @@ -519,99 +607,88 @@ def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR } defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", - VGPR_32, i32, AMDGPUatomic_cmp_swap_flat_32, - v2i32, VReg_64>; + VGPR_32, i32, v2i32, VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo 
<"flat_atomic_cmpswap_x2", - VReg_64, i64, AMDGPUatomic_cmp_swap_flat_64, - v2i64, VReg_128>; + VReg_64, i64, v2i64, VReg_128>; defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap", - VGPR_32, i32, atomic_swap_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2", - VReg_64, i64, atomic_swap_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add", - VGPR_32, i32, atomic_load_add_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub", - VGPR_32, i32, atomic_load_sub_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin", - VGPR_32, i32, atomic_load_min_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin", - VGPR_32, i32, atomic_load_umin_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax", - VGPR_32, i32, atomic_load_max_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax", - VGPR_32, i32, atomic_load_umax_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and", - VGPR_32, i32, atomic_load_and_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or", - VGPR_32, i32, atomic_load_or_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor", - VGPR_32, i32, atomic_load_xor_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc", - VGPR_32, i32, atomic_inc_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec", - VGPR_32, i32, atomic_dec_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2", - VReg_64, i64, atomic_load_add_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2", - VReg_64, i64, atomic_load_sub_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2", - VReg_64, i64, atomic_load_min_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2", - VReg_64, i64, atomic_load_umin_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2", - VReg_64, i64, atomic_load_max_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2", - VReg_64, i64, atomic_load_umax_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2", - VReg_64, i64, atomic_load_and_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2", - VReg_64, i64, atomic_load_or_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2", - VReg_64, i64, atomic_load_xor_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", - VReg_64, i64, atomic_inc_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", - VReg_64, i64, atomic_dec_flat_64>; + VReg_64, i64>; // GFX7-, GFX10-only flat instructions. 
let SubtargetPredicate = isGFX7GFX10 in { -defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", - VGPR_32, f32, null_frag, v2f32, VReg_64>; - defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", - VReg_64, f64, null_frag, v2f64, VReg_128>; - -defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin", - VGPR_32, f32>; - -defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", - VGPR_32, f32>; + VReg_64, f64, v2f64, VReg_128>; defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", VReg_64, f64>; @@ -622,14 +699,39 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isGFX7GFX10 let SubtargetPredicate = isGFX90APlus in { - defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>; - defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>; - defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>; - defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; - defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; - defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; + defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = isGFX940Plus in { + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>; +} // End SubtargetPredicate = isGFX940Plus + +// GFX7-, GFX10-, GFX11-only flat instructions. +let SubtargetPredicate = isGFX7GFX10GFX11 in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", + VGPR_32, f32, v2f32, VReg_64>; + +defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin", + VGPR_32, f32>; + +defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", + VGPR_32, f32>; + +} // End SubtargetPredicate = isGFX7GFX10GFX11 + +// GFX940-, GFX11-only flat instructions. 
+let SubtargetPredicate = isGFX940GFX11Plus in { + defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; +} // End SubtargetPredicate = isGFX940GFX11Plus + defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -662,88 +764,93 @@ defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d let is_flat_global = 1 in { defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap", - VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32, - v2i32, VReg_64>; + VGPR_32, i32, v2i32, VReg_64>; defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2", - VReg_64, i64, AMDGPUatomic_cmp_swap_global_64, - v2i64, VReg_128>; + VReg_64, i64, v2i64, VReg_128>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap", - VGPR_32, i32, atomic_swap_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2", - VReg_64, i64, atomic_swap_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add", - VGPR_32, i32, atomic_load_add_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub", - VGPR_32, i32, atomic_load_sub_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin", - VGPR_32, i32, atomic_load_min_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin", - VGPR_32, i32, atomic_load_umin_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax", - VGPR_32, i32, atomic_load_max_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax", - VGPR_32, i32, atomic_load_umax_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and", - VGPR_32, i32, atomic_load_and_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or", - VGPR_32, i32, atomic_load_or_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor", - VGPR_32, i32, atomic_load_xor_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc", - VGPR_32, i32, atomic_inc_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec", - VGPR_32, i32, atomic_dec_global_32>; + VGPR_32, i32>; defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2", - VReg_64, i64, atomic_load_add_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2", - VReg_64, i64, atomic_load_sub_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2", - VReg_64, i64, atomic_load_min_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2", - VReg_64, i64, atomic_load_umin_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2", - VReg_64, i64, atomic_load_max_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2", - VReg_64, i64, atomic_load_umax_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_AND_X2 : 
FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", - VReg_64, i64, atomic_load_and_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", - VReg_64, i64, atomic_load_or_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", - VReg_64, i64, atomic_load_xor_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", - VReg_64, i64, atomic_inc_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", - VReg_64, i64, atomic_dec_global_64>; + VReg_64, i64>; let SubtargetPredicate = HasGFX10_BEncoding in defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", - VGPR_32, i32, int_amdgcn_global_atomic_csub>; + VGPR_32, i32>; + +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; + } // End is_flat_global = 1 @@ -775,41 +882,46 @@ defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ushort">; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; + } // End SubtargetPredicate = HasFlatScratchInsts let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { defm GLOBAL_ATOMIC_FCMPSWAP : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>; defm GLOBAL_ATOMIC_FMIN : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>; defm GLOBAL_ATOMIC_FMAX : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; defm GLOBAL_ATOMIC_FMIN_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; defm GLOBAL_ATOMIC_FMAX_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 let is_flat_global = 1 in { -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = 
[HasAtomicFaddNoRtnInsts] in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", VGPR_32, f32 >; +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; -} // End OtherPredicates = [HasAtomicFaddInsts] - -let OtherPredicates = [isGFX90APlus] in { +let OtherPredicates = [HasAtomicFaddRtnInsts] in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < - "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd + "global_atomic_add_f32", VGPR_32, f32 >; +let OtherPredicates = [isGFX90APlus] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd + "global_atomic_pk_add_f16", VGPR_32, v2f16 >; -} // End OtherPredicates = [isGFX90APlus] } // End is_flat_global = 1 //===----------------------------------------------------------------------===// @@ -896,24 +1008,47 @@ class FlatStoreSignedAtomicPat .ret:$data, $offset) >; -class FlatAtomicPat : GCNPat < - (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst $vaddr, $data, $offset) ->; - class FlatAtomicPatNoRtn : GCNPat < (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data), (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) >; +multiclass FlatAtomicPat { + defvar rtnNode = !cast(node#"_ret_"#vt.Size); + defvar noRtnNode = !cast(node#"_noret_"#vt.Size); + + def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + + def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; +} + +multiclass FlatSignedAtomicPat { + defvar rtnNode = !cast(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + + def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + + def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; +} + +multiclass FlatSignedAtomicIntrPat { + defm : FlatSignedAtomicPat; +} + class FlatSignedAtomicPatNoRtn : GCNPat < (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data), (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) >; -class FlatSignedAtomicPat : GCNPat < +class FlatSignedAtomicPatRtn : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) >; @@ -949,8 +1084,28 @@ class ScratchStoreSaddrPat .ret:$data, $saddr, $offset) >; +class ScratchLoadSVaddrPat : GCNPat < + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset))), + (inst $vaddr, $saddr, $offset, 0) +>; + +class ScratchStoreSVaddrPat : GCNPat < + (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset)), + (inst getVregSrcForVT.ret:$data, $vaddr, $saddr, $offset) +>; + +class ScratchLoadSVaddrPat_D16 : GCNPat < + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset), vt:$in)), + (inst $vaddr, $saddr, $offset, 0, $in) +>; + let OtherPredicates = [HasFlatAddressSpace] in { +def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : 
FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -986,44 +1141,52 @@ def : FlatLoadPat ; def : FlatStorePat ; } -def : FlatStoreAtomicPat ; -def : FlatStoreAtomicPat ; - -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; - -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; + +foreach as = [ "flat", "global" ] in { +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN", "atomic_load_min_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN", "atomic_load_umin_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_OR", "atomic_load_or_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP", "atomic_swap_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_"#as, i32, v2i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_XOR", "atomic_load_xor_"#as, i32>; + +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN_X2", "atomic_load_min_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN_X2", "atomic_load_umin_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; +} // end foreach as def : FlatStorePat ; def : FlatStorePat ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : FlatStorePat ; def : FlatStorePat ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; @@ -1084,9 +1247,9 @@ multiclass GlobalFLATAtomicStorePats { - def : FlatSignedAtomicPat (nortn_inst_name#"_RTN"), node, vt, data_vt> { +multiclass GlobalFLATAtomicPatsRtn { + def : FlatSignedAtomicPatRtn (nortn_inst_name#"_RTN"), node, vt, data_vt> { let AddedComplexity = 10; } @@ -1095,6 +1258,26 @@ multiclass 
GlobalFLATAtomicPats { + defvar rtnNode = !cast(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + + let AddedComplexity = 10 in { + defm : FlatSignedAtomicPat ; + } + + let AddedComplexity = 11 in { + def : GlobalAtomicSaddrPat(inst#"_SADDR"), noRtnNode, vt, data_vt>; + def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; + } +} + +multiclass GlobalFLATAtomicIntrPats { + defm : GlobalFLATAtomicPats; +} + multiclass GlobalFLATNoRtnAtomicPats { def : FlatSignedAtomicPatNoRtn { @@ -1114,6 +1297,11 @@ multiclass ScratchFLATLoadPats(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 26; } + + def : ScratchLoadSVaddrPat(!cast(inst)#"_SVS"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } } multiclass ScratchFLATStorePats(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 26; } + + def : ScratchStoreSVaddrPat(!cast(inst)#"_SVS"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } } multiclass ScratchFLATLoadPats_D16 { @@ -1135,10 +1328,19 @@ multiclass ScratchFLATLoadPats_D16(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 26; } + + def : ScratchLoadSVaddrPat_D16 (!cast(inst)#"_SVS"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } } let OtherPredicates = [HasFlatGlobalInsts] in { +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -1179,10 +1381,12 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; @@ -1198,59 +1402,84 @@ defm : GlobalFLATLoadPats_D16 defm : GlobalFLATLoadPats_D16 ; } -defm : GlobalFLATAtomicStorePats ; -defm : GlobalFLATAtomicStorePats ; - -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", atomic_load_add_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", atomic_load_sub_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", atomic_inc_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", atomic_dec_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", atomic_load_and_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", atomic_load_max_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", atomic_load_umax_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", atomic_load_min_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", atomic_load_umin_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", atomic_load_or_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", atomic_swap_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", AMDGPUatomic_cmp_swap_global_32, i32, v2i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", atomic_load_xor_global_32, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; - -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", atomic_load_add_global_64, i64>; -defm : GlobalFLATAtomicPats 
<"GLOBAL_ATOMIC_SUB_X2", atomic_load_sub_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", atomic_inc_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", atomic_dec_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", atomic_load_and_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", atomic_load_max_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", atomic_load_umax_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", atomic_load_min_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", atomic_load_umin_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", atomic_load_or_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; + +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", "atomic_load_min_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", "atomic_load_umin_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; + +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", "atomic_load_min_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", "atomic_load_umin_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", "atomic_load_or_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; let 
OtherPredicates = [isGFX10Plus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", atomic_load_fmin_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", atomic_load_fmax_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", atomic_load_fmin_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", atomic_load_fmax_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; } -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in defm : GlobalFLATNoRtnAtomicPats ; -} let OtherPredicates = [isGFX90APlus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let OtherPredicates = [isGFX940Plus] in { +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : 
FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 @@ -1291,10 +1520,12 @@ defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; @@ -1405,6 +1636,57 @@ multiclass FLAT_Real_AllAddr_vi op, def _SADDR_vi : FLAT_Real_vi(NAME#"_SADDR"), has_sccb>; } +class FLAT_Real_gfx940 op, FLAT_Pseudo ps> : + FLAT_Real , + SIMCInstr { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX9"; + let Inst{13} = ps.sve; + let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue); +} + +multiclass FLAT_Real_AllAddr_SVE_vi op> { + def _vi : FLAT_Real_vi(NAME)> { + let AssemblerPredicate = isGFX8GFX9NotGFX940; + let OtherPredicates = [isGFX8GFX9NotGFX940]; + } + def _SADDR_vi : FLAT_Real_vi(NAME#"_SADDR")> { + let DecoderNamespace = "GFX9"; + } + let AssemblerPredicate = isGFX940Plus, SubtargetPredicate = isGFX940Plus in { + def _VE_gfx940 : FLAT_Real_gfx940(NAME)>; + def _SVS_gfx940 : FLAT_Real_gfx940(NAME#"_SVS")>; + def _ST_gfx940 : FLAT_Real_gfx940(NAME#"_ST")>; + } +} + +multiclass FLAT_Real_AllAddr_LDS op, bits<7> pre_gfx940_op, + string pre_gfx940_name = !subst("_lds", "", !cast(NAME).PseudoInstr), + bit has_sccb = !cast(NAME).has_sccb> { + + let OtherPredicates = [isGFX8GFX9NotGFX940] in { + def _vi : FLAT_Real_vi(NAME), has_sccb> { + let AsmString = pre_gfx940_name # !cast(NAME).AsmOperands # " lds"; + } + def _SADDR_vi : FLAT_Real_vi(NAME#"_SADDR"), has_sccb> { + let AsmString = pre_gfx940_name # !cast(NAME#"_SADDR").AsmOperands # " lds"; + } + } + + let SubtargetPredicate = isGFX940Plus in { + def _gfx940 : FLAT_Real_gfx940(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940(NAME#"_SADDR")>; + } +} + +multiclass FLAT_Real_AllAddr_SVE_LDS op, bits<7> pre_gfx940_op> { + defm "" : FLAT_Real_AllAddr_LDS; + let SubtargetPredicate = isGFX940Plus in { + def _SVS_gfx940 : FLAT_Real_gfx940(NAME#"_SVS")>; + def _ST_gfx940 : FLAT_Real_gfx940(NAME#"_ST")>; + } +} + def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; @@ -1496,6 +1778,11 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS <0x026, 0x10>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS <0x027, 0x11>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>; +defm 
GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>; @@ -1524,32 +1811,39 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>; -defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>; -defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>; -defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>; -defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>; -defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>; -defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>; -defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>; -defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>; -defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>; -defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>; -defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>; -defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>; -defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>; -defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>; -defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>; -defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>; -defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>; -defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>; -defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; -defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; -defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; -defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; - -let SubtargetPredicate = HasAtomicFaddInsts in { -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_SVE_LDS <0x026, 0x10>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_SVE_LDS <0x027, 0x11>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_AllAddr_SVE_LDS <0x028, 0x12>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_SVE_LDS <0x029, 0x13>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_AllAddr_SVE_LDS <0x02a, 0x14>; + +defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_SVE_vi <0x10>; +defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_SVE_vi <0x11>; +defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_SVE_vi <0x12>; +defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_SVE_vi <0x13>; +defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_SVE_vi <0x14>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x15>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x16>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x17>; +defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_SVE_vi <0x18>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x19>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x20>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x21>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x22>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x23>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_SVE_vi <0x24>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x25>; +defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_SVE_vi <0x1a>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x1b>; +defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_SVE_vi <0x1c>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x1d>; +defm SCRATCH_STORE_DWORDX3 : 
FLAT_Real_AllAddr_SVE_vi <0x1e>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>; + +let SubtargetPredicate = isGFX8GFX9NotGFX940 in { + // These instructions are encoded differently on gfx90* and gfx940. + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } let SubtargetPredicate = isGFX90AOnly in { @@ -1561,13 +1855,46 @@ let SubtargetPredicate = isGFX90AOnly in { defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>; } // End SubtargetPredicate = isGFX90AOnly +multiclass FLAT_Real_AllAddr_gfx940 op> { + def _gfx940 : FLAT_Real_gfx940(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940(NAME#"_SADDR")>; +} + +multiclass FLAT_Real_Atomics_gfx940 op, FLAT_Pseudo ps> { + def _gfx940 : FLAT_Real_gfx940(ps.PseudoInstr)>; + def _RTN_gfx940 : FLAT_Real_gfx940(ps.PseudoInstr # "_RTN")>; +} + +multiclass FLAT_Global_Real_Atomics_gfx940 op> : + FLAT_Real_AllAddr_gfx940 { + def _RTN_gfx940 : FLAT_Real_gfx940 (NAME#"_RTN")>; + def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 (NAME#"_SADDR_RTN")>; +} + +let SubtargetPredicate = isGFX940Plus in { + // These instructions are encoded differently on gfx90* and gfx940. + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>; + + defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>; + defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>; +} // End SubtargetPredicate = isGFX940Plus + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// class FLAT_Real_gfx10 op, FLAT_Pseudo ps> : FLAT_Real, SIMCInstr { - let AssemblerPredicate = isGFX10Plus; + let AssemblerPredicate = isGFX10Only; let DecoderNamespace = "GFX10"; let Inst{11-0} = offset{11-0}; @@ -1627,6 +1954,23 @@ multiclass FLAT_Real_ScratchAllAddr_gfx10 op> : FLAT_Real_SADDR_gfx10, FLAT_Real_ST_gfx10; +multiclass FLAT_Real_AllAddr_LDS_gfx10 op, + string opname = !subst("_lds", "", !cast(NAME).PseudoInstr)> { + let AsmString = opname # !cast(NAME).AsmOperands # " lds" in + defm "" : FLAT_Real_Base_gfx10; + + let AsmString = opname # !cast(NAME#"_SADDR").AsmOperands # " lds" in + defm "" : FLAT_Real_SADDR_gfx10; +} + +multiclass FLAT_Real_ScratchAllAddr_LDS_gfx10 op, + string opname = !subst("_lds", "", !cast(NAME).PseudoInstr)> { + defm "" : FLAT_Real_AllAddr_LDS_gfx10; + + let AsmString = opname # !cast(NAME#"_ST").AsmOperands # " lds" in + defm "" : FLAT_Real_ST_gfx10; +} + // ENC_FLAT. 
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>; @@ -1743,6 +2087,12 @@ defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>; defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x008>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x009>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00a>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00b>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS_gfx10 <0x00c>; + // ENC_FLAT_SCRATCH. defm SCRATCH_LOAD_UBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x008>; defm SCRATCH_LOAD_SBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x009>; @@ -1766,3 +2116,219 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>; + +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x008>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x009>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00a>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00b>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>; + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +class FLAT_Real_gfx11 op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + FLAT_Real , + SIMCInstr { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue); + let Inst{15} = cpol{CPolBit.SLC}; + let Inst{17-16} = seg; + let Inst{55} = ps.sve; +} + +multiclass FLAT_Real_Base_gfx11 op, string ps, string opName, int renamed = false> { + def _gfx11 : FLAT_Real_gfx11(ps), opName> { + let Inst{54-48} = !cast(SGPR_NULL_gfx11plus.HWEncoding); + } + if renamed then + def _renamed_gfx11 : MnemonicAlias(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>; +} + +multiclass FLAT_Real_RTN_gfx11 op, string ps, string opName> { + def _RTN_gfx11 : FLAT_Real_gfx11(ps#"_RTN"), opName> { + let Inst{54-48} = !cast(SGPR_NULL_gfx11plus.HWEncoding); + } +} + +multiclass FLAT_Real_SADDR_gfx11 op, string ps, string opName> { + def _SADDR_gfx11 : FLAT_Real_gfx11(ps#"_SADDR"), opName>; +} + +multiclass FLAT_Real_SADDR_RTN_gfx11 op, string ps, string opName> { + def _SADDR_RTN_gfx11 : FLAT_Real_gfx11(ps#"_SADDR_RTN"), opName>; +} + +multiclass FLAT_Real_ST_gfx11 op, string ps, string opName> { + def _ST_gfx11 : FLAT_Real_gfx11(ps#"_ST"), opName> { + let Inst{54-48} = !cast(SGPR_NULL_gfx11plus.HWEncoding); + let OtherPredicates = [HasFlatScratchSTMode]; + } +} + +multiclass FLAT_Real_SVS_gfx11 op, string ps, string opName> { + def _SVS_gfx11 : FLAT_Real_gfx11(ps#"_SVS"), opName> { + let OtherPredicates = [HasFlatScratchSVSMode]; + } +} + +multiclass FLAT_Real_AllAddr_gfx11 op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11, + FLAT_Real_SADDR_gfx11; + +multiclass FLAT_Real_Atomics_gfx11 op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11, + FLAT_Real_RTN_gfx11; + +multiclass 
FLAT_Real_GlblAtomics_gfx11 op, string ps, string opName, int renamed = false> : + FLAT_Real_AllAddr_gfx11, + FLAT_Real_RTN_gfx11, + FLAT_Real_SADDR_RTN_gfx11; + +multiclass FLAT_Real_GlblAtomics_RTN_gfx11 op, string ps, string opName> : + FLAT_Real_RTN_gfx11, + FLAT_Real_SADDR_RTN_gfx11; + +multiclass FLAT_Real_ScratchAllAddr_gfx11 op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11, + FLAT_Real_SADDR_gfx11, + FLAT_Real_ST_gfx11, + FLAT_Real_SVS_gfx11; + +// ENC_FLAT. +defm FLAT_LOAD_U8 : FLAT_Real_Base_gfx11<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>; +defm FLAT_LOAD_I8 : FLAT_Real_Base_gfx11<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>; +defm FLAT_LOAD_U16 : FLAT_Real_Base_gfx11<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>; +defm FLAT_LOAD_I16 : FLAT_Real_Base_gfx11<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>; +defm FLAT_LOAD_B32 : FLAT_Real_Base_gfx11<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>; +defm FLAT_LOAD_B64 : FLAT_Real_Base_gfx11<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>; +defm FLAT_LOAD_B96 : FLAT_Real_Base_gfx11<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>; +defm FLAT_LOAD_B128 : FLAT_Real_Base_gfx11<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>; +defm FLAT_STORE_B8 : FLAT_Real_Base_gfx11<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>; +defm FLAT_STORE_B16 : FLAT_Real_Base_gfx11<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>; +defm FLAT_STORE_B32 : FLAT_Real_Base_gfx11<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>; +defm FLAT_STORE_B64 : FLAT_Real_Base_gfx11<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>; +defm FLAT_STORE_B96 : FLAT_Real_Base_gfx11<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>; +defm FLAT_STORE_B128 : FLAT_Real_Base_gfx11<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>; +defm FLAT_LOAD_D16_U8 : FLAT_Real_Base_gfx11<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">; +defm FLAT_LOAD_D16_I8 : FLAT_Real_Base_gfx11<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">; +defm FLAT_LOAD_D16_B16 : FLAT_Real_Base_gfx11<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">; +defm FLAT_LOAD_D16_HI_U8 : FLAT_Real_Base_gfx11<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">; +defm FLAT_LOAD_D16_HI_I8 : FLAT_Real_Base_gfx11<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">; +defm FLAT_LOAD_D16_HI_B16 : FLAT_Real_Base_gfx11<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">; +defm FLAT_STORE_D16_HI_B8 : FLAT_Real_Base_gfx11<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">; +defm FLAT_STORE_D16_HI_B16 : FLAT_Real_Base_gfx11<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">; +defm FLAT_ATOMIC_SWAP_B32 : FLAT_Real_Atomics_gfx11<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>; +defm FLAT_ATOMIC_CMPSWAP_B32 : FLAT_Real_Atomics_gfx11<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>; +defm FLAT_ATOMIC_ADD_U32 : FLAT_Real_Atomics_gfx11<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>; +defm FLAT_ATOMIC_SUB_U32 : FLAT_Real_Atomics_gfx11<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>; +defm FLAT_ATOMIC_MIN_I32 : FLAT_Real_Atomics_gfx11<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>; +defm FLAT_ATOMIC_MIN_U32 : FLAT_Real_Atomics_gfx11<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>; +defm FLAT_ATOMIC_MAX_I32 : FLAT_Real_Atomics_gfx11<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>; +defm FLAT_ATOMIC_MAX_U32 : FLAT_Real_Atomics_gfx11<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>; +defm 
FLAT_ATOMIC_AND_B32 : FLAT_Real_Atomics_gfx11<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>; +defm FLAT_ATOMIC_OR_B32 : FLAT_Real_Atomics_gfx11<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>; +defm FLAT_ATOMIC_XOR_B32 : FLAT_Real_Atomics_gfx11<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>; +defm FLAT_ATOMIC_INC_U32 : FLAT_Real_Atomics_gfx11<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>; +defm FLAT_ATOMIC_DEC_U32 : FLAT_Real_Atomics_gfx11<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>; +defm FLAT_ATOMIC_SWAP_B64 : FLAT_Real_Atomics_gfx11<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>; +defm FLAT_ATOMIC_CMPSWAP_B64 : FLAT_Real_Atomics_gfx11<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>; +defm FLAT_ATOMIC_ADD_U64 : FLAT_Real_Atomics_gfx11<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>; +defm FLAT_ATOMIC_SUB_U64 : FLAT_Real_Atomics_gfx11<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>; +defm FLAT_ATOMIC_MIN_I64 : FLAT_Real_Atomics_gfx11<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>; +defm FLAT_ATOMIC_MIN_U64 : FLAT_Real_Atomics_gfx11<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>; +defm FLAT_ATOMIC_MAX_I64 : FLAT_Real_Atomics_gfx11<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>; +defm FLAT_ATOMIC_MAX_U64 : FLAT_Real_Atomics_gfx11<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>; +defm FLAT_ATOMIC_AND_B64 : FLAT_Real_Atomics_gfx11<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>; +defm FLAT_ATOMIC_OR_B64 : FLAT_Real_Atomics_gfx11<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>; +defm FLAT_ATOMIC_XOR_B64 : FLAT_Real_Atomics_gfx11<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>; +defm FLAT_ATOMIC_INC_U64 : FLAT_Real_Atomics_gfx11<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>; +defm FLAT_ATOMIC_DEC_U64 : FLAT_Real_Atomics_gfx11<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>; +defm FLAT_ATOMIC_CMPSWAP_F32 : FLAT_Real_Atomics_gfx11<0x050, "FLAT_ATOMIC_FCMPSWAP", "flat_atomic_cmpswap_f32">; +defm FLAT_ATOMIC_MIN_F32 : FLAT_Real_Atomics_gfx11<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_f32">; +defm FLAT_ATOMIC_MAX_F32 : FLAT_Real_Atomics_gfx11<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_f32">; +defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_gfx11<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; + +// ENC_FLAT_GLBL. 
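+// Editor's note (illustration): entries below that pass `renamed = true` also
+// get a MnemonicAlias under isGFX11Plus, so the assembler keeps accepting the
+// pre-GFX11 spelling, e.g. "global_load_ubyte" for "global_load_u8".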
+defm GLOBAL_LOAD_U8 : FLAT_Real_AllAddr_gfx11<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; +defm GLOBAL_LOAD_I8 : FLAT_Real_AllAddr_gfx11<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>; +defm GLOBAL_LOAD_U16 : FLAT_Real_AllAddr_gfx11<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>; +defm GLOBAL_LOAD_I16 : FLAT_Real_AllAddr_gfx11<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>; +defm GLOBAL_LOAD_B32 : FLAT_Real_AllAddr_gfx11<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>; +defm GLOBAL_LOAD_B64 : FLAT_Real_AllAddr_gfx11<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>; +defm GLOBAL_LOAD_B96 : FLAT_Real_AllAddr_gfx11<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>; +defm GLOBAL_LOAD_B128 : FLAT_Real_AllAddr_gfx11<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>; +defm GLOBAL_STORE_B8 : FLAT_Real_AllAddr_gfx11<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>; +defm GLOBAL_STORE_B16 : FLAT_Real_AllAddr_gfx11<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>; +defm GLOBAL_STORE_B32 : FLAT_Real_AllAddr_gfx11<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>; +defm GLOBAL_STORE_B64 : FLAT_Real_AllAddr_gfx11<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>; +defm GLOBAL_STORE_B96 : FLAT_Real_AllAddr_gfx11<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>; +defm GLOBAL_STORE_B128 : FLAT_Real_AllAddr_gfx11<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>; +defm GLOBAL_LOAD_D16_U8 : FLAT_Real_AllAddr_gfx11<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">; +defm GLOBAL_LOAD_D16_I8 : FLAT_Real_AllAddr_gfx11<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">; +defm GLOBAL_LOAD_D16_B16 : FLAT_Real_AllAddr_gfx11<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">; +defm GLOBAL_LOAD_D16_HI_U8 : FLAT_Real_AllAddr_gfx11<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_D16_HI_I8 : FLAT_Real_AllAddr_gfx11<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">; +defm GLOBAL_STORE_D16_HI_B8 : FLAT_Real_AllAddr_gfx11<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">; +defm GLOBAL_STORE_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">; +defm GLOBAL_STORE_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">; +defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>; +defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>; +defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>; +defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>; +defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>; +defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", 
true>; +defm GLOBAL_ATOMIC_MAX_U32 : FLAT_Real_GlblAtomics_gfx11<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>; +defm GLOBAL_ATOMIC_AND_B32 : FLAT_Real_GlblAtomics_gfx11<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>; +defm GLOBAL_ATOMIC_OR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>; +defm GLOBAL_ATOMIC_XOR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>; +defm GLOBAL_ATOMIC_INC_U32 : FLAT_Real_GlblAtomics_gfx11<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>; +defm GLOBAL_ATOMIC_DEC_U32 : FLAT_Real_GlblAtomics_gfx11<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>; +defm GLOBAL_ATOMIC_SWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>; +defm GLOBAL_ATOMIC_ADD_U64 : FLAT_Real_GlblAtomics_gfx11<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>; +defm GLOBAL_ATOMIC_SUB_U64 : FLAT_Real_GlblAtomics_gfx11<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>; +defm GLOBAL_ATOMIC_MIN_I64 : FLAT_Real_GlblAtomics_gfx11<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>; +defm GLOBAL_ATOMIC_MIN_U64 : FLAT_Real_GlblAtomics_gfx11<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>; +defm GLOBAL_ATOMIC_MAX_I64 : FLAT_Real_GlblAtomics_gfx11<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>; +defm GLOBAL_ATOMIC_MAX_U64 : FLAT_Real_GlblAtomics_gfx11<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>; +defm GLOBAL_ATOMIC_AND_B64 : FLAT_Real_GlblAtomics_gfx11<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>; +defm GLOBAL_ATOMIC_OR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>; +defm GLOBAL_ATOMIC_XOR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; +defm GLOBAL_ATOMIC_INC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; +defm GLOBAL_ATOMIC_DEC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_CMPSWAP_F32 : FLAT_Real_GlblAtomics_gfx11<0x050, "GLOBAL_ATOMIC_FCMPSWAP", "global_atomic_cmpswap_f32">; +defm GLOBAL_ATOMIC_MIN_F32 : FLAT_Real_GlblAtomics_gfx11<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_MAX_F32 : FLAT_Real_GlblAtomics_gfx11<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_GlblAtomics_gfx11<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; + +// ENC_FLAT_SCRATCH. 
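+// Editor's note (illustration): each FLAT_Real_ScratchAllAddr_gfx11 below
+// emits four real encodings per pseudo: the plain VGPR-addressed form, a
+// _SADDR form, an _ST form gated by HasFlatScratchSTMode (no vaddr), and an
+// _SVS form gated by HasFlatScratchSVSMode (scalar base plus vector offset).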
+defm SCRATCH_LOAD_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; +defm SCRATCH_LOAD_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>; +defm SCRATCH_LOAD_U16 : FLAT_Real_ScratchAllAddr_gfx11<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>; +defm SCRATCH_LOAD_I16 : FLAT_Real_ScratchAllAddr_gfx11<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>; +defm SCRATCH_LOAD_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>; +defm SCRATCH_LOAD_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>; +defm SCRATCH_LOAD_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>; +defm SCRATCH_LOAD_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>; +defm SCRATCH_STORE_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>; +defm SCRATCH_STORE_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>; +defm SCRATCH_STORE_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>; +defm SCRATCH_STORE_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>; +defm SCRATCH_STORE_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>; +defm SCRATCH_STORE_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>; +defm SCRATCH_LOAD_D16_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">; +defm SCRATCH_LOAD_D16_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">; +defm SCRATCH_LOAD_D16_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">; +defm SCRATCH_LOAD_D16_HI_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">; +defm SCRATCH_LOAD_D16_HI_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">; +defm SCRATCH_LOAD_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">; +defm SCRATCH_STORE_D16_HI_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">; +defm SCRATCH_STORE_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index a8c85ec4e5ea..1cd880eaa48e 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -167,7 +167,9 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { return nullptr; case AMDGPU::COPY: case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B64_PSEUDO: { + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_MOV_B64_e64: { auto &Op1 = Def->getOperand(1); if (Op1.isImm()) return &Op1; @@ -183,6 +185,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool CombBCZ, bool IsShrinkable) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); auto OrigOp = OrigMI.getOpcode(); @@ -383,6 +386,7 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, 
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); @@ -399,7 +403,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); assert(DppCtrl && DppCtrl->isImm()); if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) { @@ -447,12 +452,6 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) { - LLVM_DEBUG(dbgs() << - " failed: old reg def and mov should be in the same BB\n"); - return false; - } - if (OldOpndValue->getImm() == 0) { if (MaskAllLanes) { assert(!BoundCtrlZero); // by check [1] @@ -616,7 +615,8 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; - } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { if (ST->has64BitDPP() && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c0592f6f3c7a..b6d16009e776 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -13,14 +13,38 @@ #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/TargetParser.h" using namespace llvm; +namespace { + +struct MFMAPaddingRatioParser : public cl::parser { + MFMAPaddingRatioParser(cl::Option &O) : cl::parser(O) {} + + bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { + if (Arg.getAsInteger(0, Value)) + return O.error("'" + Arg + "' value invalid for uint argument!"); + + if (Value > 100) + return O.error("'" + Arg + "' value must be in the range [0, 100]!"); + + return false; + } +}; + +} // end anonymous namespace + +static cl::opt + MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, + cl::desc("Fill a percentage of the latency between " + "neighboring MFMA with s_nops.")); + //===----------------------------------------------------------------------===// -// Hazard Recoginizer Implementation +// Hazard Recognizer Implementation //===----------------------------------------------------------------------===// static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, @@ -92,12 +116,7 @@ static bool isSMovRel(unsigned Opcode) { } static bool isDGEMM(unsigned Opcode) { - return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64; + return AMDGPU::getMAIIsDGEMM(Opcode); } static bool isXDL(const GCNSubtarget &ST, const 
MachineInstr &MI) {
@@ -109,7 +128,10 @@ static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
     return false;
 
-  return true;
+  if (!ST.hasGFX940Insts())
+    return true;
+
+  return AMDGPU::getMAIIsGFX940XDL(Opcode);
 }
 
 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
@@ -144,6 +166,11 @@ static bool isPermlane(const MachineInstr &MI) {
          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
 }
 
+static bool isLdsDma(const MachineInstr &MI) {
+  return SIInstrInfo::isVALU(MI) &&
+         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
+}
+
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
@@ -204,12 +231,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
     return HazardType;
 
-  if (ST.hasReadM0MovRelInterpHazard() &&
-      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
-      checkReadM0Hazards(MI) > 0)
-    return HazardType;
-
-  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
+  if (((ST.hasReadM0MovRelInterpHazard() &&
+        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
+       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
+       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
+       (ST.hasReadM0LdsDirectHazard() &&
+        MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
       checkReadM0Hazards(MI) > 0)
     return HazardType;
 
@@ -237,6 +264,14 @@ static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
   }
 }
 
+unsigned
+GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
+  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
+  assert(TSchedModel.getWriteProcResBegin(SC) !=
+         TSchedModel.getWriteProcResEnd(SC));
+  return TSchedModel.getWriteProcResBegin(SC)->Cycles;
+}
+
 void GCNHazardRecognizer::processBundle() {
   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
@@ -321,11 +356,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (isRFE(MI->getOpcode()))
     return std::max(WaitStates, checkRFEHazards(MI));
 
-  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
-                                           isSMovRel(MI->getOpcode())))
-    return std::max(WaitStates, checkReadM0Hazards(MI));
-
-  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
+  if ((ST.hasReadM0MovRelInterpHazard() &&
+       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
+      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
+      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
+      (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
     return std::max(WaitStates, checkReadM0Hazards(MI));
 
   if (SIInstrInfo::isMAI(*MI))
@@ -389,16 +424,61 @@ void GCNHazardRecognizer::RecedeCycle() {
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
+
 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
+
+// Search for a hazard in a block and its predecessors.
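+//
+// Editor's sketch (illustrative, not part of the patch): unlike
+// getWaitStatesSince below, which accumulates a single wait-state count,
+// hasHazard threads a caller-defined StateT through the backwards walk, so a
+// predicate can track several positions at once. A minimal caller, assuming a
+// trivial state that only counts VALUs:
+//
+//   struct S { int VALUs = 0; };
+//   DenseSet<const MachineBasicBlock *> Visited;
+//   bool Found = hasHazard<S>(
+//       S{},
+//       [](S &St, const MachineInstr &I) {
+//         return St.VALUs > 4 ? HazardExpired : NoHazardFound;
+//       },
+//       [](S &St, const MachineInstr &I) {
+//         if (SIInstrInfo::isVALU(I))
+//           ++St.VALUs;
+//       },
+//       MI->getParent(), std::next(MI->getReverseIterator()), Visited);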
+template <typename StateT>
+static bool
+hasHazard(StateT State,
+          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
+          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
+          const MachineBasicBlock *MBB,
+          MachineBasicBlock::const_reverse_instr_iterator I,
+          DenseSet<const MachineBasicBlock *> &Visited) {
+  for (auto E = MBB->instr_rend(); I != E; ++I) {
+    // No need to look at parent BUNDLE instructions.
+    if (I->isBundle())
+      continue;
+
+    switch (IsHazard(State, *I)) {
+    case HazardFound:
+      return true;
+    case HazardExpired:
+      return false;
+    default:
+      // Continue search
+      break;
+    }
+
+    if (I->isInlineAsm() || I->isMetaInstruction())
+      continue;
+
+    UpdateState(State, *I);
+  }
+
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
+                  Visited))
+      return true;
+  }
+
+  return false;
+}
 
 // Returns a minimum wait states since \p I walking all predecessors.
 // Only scans until \p IsExpired does not return true.
 // Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
-                              const MachineBasicBlock *MBB,
-                              MachineBasicBlock::const_reverse_instr_iterator I,
-                              int WaitStates, IsExpiredFn IsExpired,
-                              DenseSet<const MachineBasicBlock *> &Visited) {
+static int getWaitStatesSince(
+    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
+    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
+    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
+    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
   for (auto E = MBB->instr_rend(); I != E; ++I) {
     // Don't add WaitStates for parent BUNDLE instructions.
     if (I->isBundle())
@@ -410,7 +490,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
     if (I->isInlineAsm())
       continue;
 
-    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+    WaitStates += GetNumWaitStates(*I);
 
     if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
@@ -421,8 +501,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
     if (!Visited.insert(Pred).second)
       continue;
 
-    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
-                               WaitStates, IsExpired, Visited);
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
+                               IsExpired, Visited, GetNumWaitStates);
 
     MinWaitStates = std::min(MinWaitStates, W);
   }
@@ -534,7 +614,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
   // In order to handle these situations correctly we need to make sure that
   // when a clause has more than one instruction, no instruction in the clause
   // writes to a register that is read by another instruction in the clause
-  // (including itself). If we encounter this situaion, we need to break the
+  // (including itself). If we encounter this situation, we need to break the
   // clause by inserting a non SMEM instruction.
 
   for (MachineInstr *MI : EmittedInstrs) {
@@ -764,7 +844,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   // 8 bytes can have there store data over written by the next instruction.
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
-  const int VALUWaitStates = 1;
+  const int VALUWaitStates = ST.hasGFX940Insts() ?
2 : 1; int WaitStatesNeeded = 0; if (!TRI->isVectorRegister(MRI, Def.getReg())) @@ -783,13 +863,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, } int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { + int WaitStatesNeeded = 0; + + if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { + const int TransDefWaitstates = 1; + + auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { + if (!SIInstrInfo::isTRANS(MI)) + return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return true; + } + + return false; + }; + + int WaitStatesNeededForDef = + TransDefWaitstates - + getWaitStatesSince(IsTransDefFn, TransDefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; + + auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + const SIInstrInfo *TII = ST.getInstrInfo(); + if (SIInstrInfo::isSDWA(MI)) { + if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) + if (DstSel->getImm() == AMDGPU::SDWA::DWORD) + return false; + } else { + if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::op_sel) == -1) || + !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) + ->getImm() & + SISrcMods::DST_OP_SEL)) + return false; + } + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + Register Def = Dst->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return true; + } + } + + return false; + }; + + int WaitStatesNeededForDef = + Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + if (ST.hasVDecCoExecHazard()) { + const int VALUWriteSGPRVALUReadWaitstates = 2; + const int VALUWriteEXECRWLane = 4; + const int VALUWriteVGPRReadlaneRead = 1; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Register UseReg; + auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + return MI.modifiesRegister(UseReg, TRI); + }; + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (!Use.isReg()) + continue; + + UseReg = Use.getReg(); + if (TRI->isSGPRReg(MRI, UseReg)) { + int WaitStatesNeededForDef = + VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, + VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + } + + if (VALU->readsRegister(AMDGPU::VCC, TRI)) { + UseReg = AMDGPU::VCC; + int WaitStatesNeededForDef = + VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + switch (VALU->getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READFIRSTLANE_B32: { + MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); + UseReg = Src->getReg(); + int WaitStatesNeededForDef = + VALUWriteVGPRReadlaneRead - + 
getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + LLVM_FALLTHROUGH; + case AMDGPU::V_WRITELANE_B32: { + UseReg = AMDGPU::EXEC; + int WaitStatesNeededForDef = + VALUWriteEXECRWLane - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + break; + } + default: + break; + } + } + // This checks for the hazard where VMEM instructions that store more than // 8 bytes can have there store data over written by the next instruction. if (!ST.has12DWordStoreHazard()) - return 0; + return WaitStatesNeeded; const MachineRegisterInfo &MRI = MF.getRegInfo(); - int WaitStatesNeeded = 0; for (const MachineOperand &Def : VALU->defs()) { WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); @@ -861,10 +1064,10 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); - const int SMovRelWaitStates = 1; + const int ReadM0WaitStates = 1; auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; - return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, - SMovRelWaitStates); + return ReadM0WaitStates - + getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); } void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { @@ -873,6 +1076,13 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixSMEMtoVectorWriteHazards(MI); fixVcmpxExecWARHazard(MI); fixLdsBranchVmemWARHazard(MI); + if (ST.hasLdsDirect()) { + fixLdsDirectVALUHazard(MI); + fixLdsDirectVMEMHazard(MI); + } + fixVALUPartialForwardingHazard(MI); + fixVALUTransUseHazard(MI); + fixWMMAHazards(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -880,7 +1090,12 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; const SIInstrInfo *TII = ST.getInstrInfo(); - auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); }; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { + return (TII->isVOPC(MI) || + ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && + MI.modifiesRegister(AMDGPU::EXEC, TRI); + }; auto IsExpiredFn = [](const MachineInstr &MI, int) { unsigned Opc = MI.getOpcode(); @@ -893,7 +1108,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; // V_NOP will be discarded by SQ. - // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* + // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* // which is always a VGPR and available. 
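   // Illustrative resulting sequence (editor's sketch, registers assumed):
   //   v_cmpx_le_f32 ...         ; compare that writes exec
   //   v_mov_b32 v1, v1          ; inserted self-move of the permlane's src0
   //   v_permlanex16_b32 ...     ; no longer hazards with the exec write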
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); Register Reg = Src0->getReg(); @@ -1157,6 +1372,369 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { + if (!SIInstrInfo::isLDSDIR(*MI)) + return false; + + const int NoHazardWaitStates = 15; + const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); + const Register VDSTReg = VDST->getReg(); + + bool VisitedTrans = false; + auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { + if (!SIInstrInfo::isVALU(I)) + return false; + VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); + // Cover both WAR and WAW + return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); + }; + auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { + if (WaitStates >= NoHazardWaitStates) + return true; + // Instructions which cause va_vdst==0 expire hazard + return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || + SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); + }; + auto GetWaitStatesFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) ? 1 : 0; + }; + + DenseSet Visited; + auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + + // Transcendentals can execute in parallel to other VALUs. + // This makes va_vdst count unusable with a mixture of VALU and TRANS. + if (VisitedTrans) + Count = 0; + + MachineOperand *WaitVdstOp = + TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); + WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); + + return true; +} + +bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { + if (!SIInstrInfo::isLDSDIR(*MI)) + return false; + + const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); + const Register VDSTReg = VDST->getReg(); + + auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { + if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && + !SIInstrInfo::isDS(I)) + return false; + return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); + }; + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || + (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || + (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + I.getOperand(0).getImm() == 0xffe3); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xffe3); + + return true; +} + +bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { + if (!ST.isWave64()) + return false; + if (!ST.hasVALUPartialForwardingHazard()) + return false; + if (!SIInstrInfo::isVALU(*MI)) + return false; + + SmallSetVector SrcVGPRs; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + SrcVGPRs.insert(Use.getReg()); + } + + // Only applies with >= 2 unique VGPR sources + if (SrcVGPRs.size() <= 1) + return false; + + // Look for the following pattern: + // Va <- VALU [PreExecPos] + // intv1 + // Exec <- SALU [ExecPos] + // intv2 + // Vb <- VALU [PostExecPos] + // intv3 + // MI Va, Vb (WaitState = 0) + // + // Where: + // intv1 + intv2 <= 2 VALUs + // intv3 <= 4 VALUs + // + // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
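+  // Editor's illustration (assumed register numbers) of a matching sequence:
+  //   v_add_f32 v0, ...         ; Va written pre-exec-change
+  //   s_mov_b64 exec, s[0:1]    ; SALU exec write
+  //   v_mul_f32 v1, ...         ; Vb written post-exec-change
+  //   v_fma_f32 v2, v0, v1, v2  ; MI reads both v0 and v1 -> hazard, so
+  //                             ; s_waitcnt_depctr 0x0fff is inserted before it.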
+
+  const int Intv1plus2MaxVALUs = 2;
+  const int Intv3MaxVALUs = 4;
+  const int IntvMaxVALUs = 6;
+  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
+
+  struct StateType {
+    SmallDenseMap<Register, int, 4> DefPos;
+    int ExecPos = std::numeric_limits<int>::max();
+    int VALUs = 0;
+  };
+
+  StateType State;
+
+  // This overloads expiry testing with all the hazard detection
+  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
+    // Too many VALU states have passed
+    if (State.VALUs > NoHazardVALUWaitStates)
+      return HazardExpired;
+
+    // Instructions which cause va_vdst==0 expire hazard
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+         I.getOperand(0).getImm() == 0x0fff))
+      return HazardExpired;
+
+    // Track register writes
+    bool Changed = false;
+    if (SIInstrInfo::isVALU(I)) {
+      for (Register Src : SrcVGPRs) {
+        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
+          State.DefPos[Src] = State.VALUs;
+          Changed = true;
+        }
+      }
+    } else if (SIInstrInfo::isSALU(I)) {
+      if (State.ExecPos == std::numeric_limits<int>::max()) {
+        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
+          State.ExecPos = State.VALUs;
+          Changed = true;
+        }
+      }
+    }
+
+    // Early expiration: too many VALUs in intv3
+    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
+      return HazardExpired;
+
+    // Only evaluate state if something changed
+    if (!Changed)
+      return NoHazardFound;
+
+    // Determine positions of VALUs pre/post exec change
+    if (State.ExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    int PreExecPos = std::numeric_limits<int>::max();
+    int PostExecPos = std::numeric_limits<int>::max();
+
+    for (auto Entry : State.DefPos) {
+      int DefVALUs = Entry.second;
+      if (DefVALUs != std::numeric_limits<int>::max()) {
+        if (DefVALUs >= State.ExecPos)
+          PreExecPos = std::min(PreExecPos, DefVALUs);
+        else if (DefVALUs < State.ExecPos)
+          PostExecPos = std::min(PostExecPos, DefVALUs);
+      }
+    }
+
+    // Need a VALU def post exec change
+    if (PostExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    // Too many VALUs in intv3?
+    int Intv3VALUs = PostExecPos;
+    if (Intv3VALUs > Intv3MaxVALUs)
+      return HazardExpired;
+
+    // Too many VALUs in intv2?
+    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
+    if (Intv2VALUs > Intv1plus2MaxVALUs)
+      return HazardExpired;
+
+    // Need a VALU def pre exec change
+    if (PreExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    // Too many VALUs in intv1?
+ int Intv1VALUs = PreExecPos - State.ExecPos; + if (Intv1VALUs > Intv1plus2MaxVALUs) + return HazardExpired; + + // Too many VALUs in intv1 + intv2 + if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) + return HazardExpired; + + return HazardFound; + }; + auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) + State.VALUs += 1; + }; + + DenseSet Visited; + if (!hasHazard(State, IsHazardFn, UpdateStateFn, MI->getParent(), + std::next(MI->getReverseIterator()), Visited)) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0x0fff); + + return true; +} + +bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { + if (!ST.hasVALUTransUseHazard()) + return false; + if (!SIInstrInfo::isVALU(*MI)) + return false; + + SmallSet SrcVGPRs; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + SrcVGPRs.insert(Use.getReg()); + } + + // Look for the following pattern: + // Va <- TRANS VALU + // intv + // MI Va (WaitState = 0) + // + // Where: + // intv <= 5 VALUs / 1 TRANS + // + // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. + + const int IntvMaxVALUs = 5; + const int IntvMaxTRANS = 1; + + struct StateType { + int VALUs = 0; + int TRANS = 0; + }; + + StateType State; + + // This overloads expiry testing with all the hazard detection + auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { + // Too many VALU states have passed + if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) + return HazardExpired; + + // Instructions which cause va_vdst==0 expire hazard + if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || + SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || + (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + I.getOperand(0).getImm() == 0x0fff)) + return HazardExpired; + + // Track registers writes + if (SIInstrInfo::isTRANS(I)) { + for (Register Src : SrcVGPRs) { + if (I.modifiesRegister(Src, &TRI)) { + return HazardFound; + } + } + } + + return NoHazardFound; + }; + auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) + State.VALUs += 1; + if (SIInstrInfo::isTRANS(MI)) + State.TRANS += 1; + }; + + DenseSet Visited; + if (!hasHazard(State, IsHazardFn, UpdateStateFn, MI->getParent(), + std::next(MI->getReverseIterator()), Visited)) + return false; + + // Hazard is observed - insert a wait on va_dst counter to ensure hazard is + // avoided (mask 0x0fff achieves this). + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0x0fff); + + return true; +} + +bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { + if (!SIInstrInfo::isWMMA(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I)) + return false; + + // Src0 or Src1 of the current wmma instruction overlaps with the dest of + // the previous wmma. 
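+    // E.g. (editor's sketch, registers assumed):
+    //   v_wmma_f32_16x16x16_f16 v[0:7], ...          ; previous wmma writes v[0:7]
+    //   v_wmma_f32_16x16x16_f16 v[8:15], v[0:7], ... ; reads v[0:7] as src0,
+    //                                                ; so a V_NOP must separate them.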
+ const Register CurSrc0Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); + const Register CurSrc1Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); + + const Register PrevDstReg = + TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + + if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || + TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { + return true; + } + + // Src2 of the current wmma instruction overlaps with the dest of the + // previous wmma. + const MachineOperand *Src2 = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register(); + + if (CurSrc2Reg != AMDGPU::NoRegister && + TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) { + + const MachineOperand *Src2Mods = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); + const bool NoSrc2Mods = + (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; + // Exception: there is no hazard if the wmma instructions are of the same + // type and there is no input modifier on src2 of the current instruction. + return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) == + TII->pseudoToMCOpcode(MI->getOpcode()))); + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { int NSAtoVMEMWaitStates = 1; @@ -1223,6 +1801,36 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); } +int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { + // Early exit if no padding is requested. 
+ if (MFMAPaddingRatio == 0) + return 0; + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) + return 0; + + int NeighborMFMALatency = 0; + auto IsNeighboringMFMA = [&NeighborMFMALatency, + this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI)) + return false; + + NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); + return true; + }; + + const int MaxMFMAPipelineWaitStates = 16; + int WaitStatesSinceNeighborMFMA = + getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); + + int NeighborMFMAPaddingNeeded = + (NeighborMFMALatency * MFMAPaddingRatio / 100) - + WaitStatesSinceNeighborMFMA; + + return std::max(0, NeighborMFMAPaddingNeeded); +} + int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); @@ -1257,12 +1865,6 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { } } - auto IsMFMAFn = [](const MachineInstr &MI) { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; - }; - for (const MachineOperand &Op : MI->explicit_operands()) { if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) continue; @@ -1282,9 +1884,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { Register Reg = Op.getReg(); unsigned HazardDefLatency = 0; - auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, + auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register DstReg = MI.getOperand(0).getReg(); if (DstReg == Reg) @@ -1361,9 +1963,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { Register DstReg = MI->getOperand(0).getReg(); unsigned HazardDefLatency = 0; - auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, + auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); HazardDefLatency = @@ -1387,6 +1989,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } + // Pad neighboring MFMA with noops for better inter-wave performance. 
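+  // Worked example (editor's illustration): with -amdgpu-mfma-padding-ratio=75,
+  // a neighboring 16-pass MFMA (16 wait states of latency) and 4 wait states
+  // already elapsed, checkMFMAPadding returns 16 * 75 / 100 - 4 = 8, i.e. 8
+  // extra wait states are filled with s_nop.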
+ WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); + return WaitStatesNeeded; } @@ -1394,21 +1999,16 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); - auto IsMFMAFn = [](const MachineInstr &MI) { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + auto IsLegacyVALUFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); }; - auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI); + auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && + !SIInstrInfo::isDOT(MI); }; - auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI); - }; - - if (!IsMFMAFn(*MI)) + if (!SIInstrInfo::isMFMA(*MI)) return WaitStatesNeeded; const int VALUWritesExecWaitStates = 4; @@ -1423,6 +2023,13 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { for (const MachineOperand &Use : MI->explicit_uses()) { const int LegacyVALUNotDotWritesVGPRWaitStates = 2; const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; + const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3; + const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5; + const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4; + const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9; + const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8; + const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17; + const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16; const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; @@ -1433,9 +2040,18 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; + const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4; + const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6; + const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10; + const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18; + const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5; + const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7; + const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11; + const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19; const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; + const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; const int MaxWaitStates = 19; if (!Use.isReg()) @@ -1444,9 +2060,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { bool FullReg; const MachineInstr *MI1; - auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1, + auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register DstReg = 
MI.getOperand(0).getReg(); FullReg = (DstReg == Reg); @@ -1467,7 +2083,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { unsigned Opc1 = MI1->getOpcode(); int NeedWaitStates = 0; if (OpNo == SrcCIdx) { - if (!isDGEMM(Opc) && isDGEMM(Opc1)) { + if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { NeedWaitStates = 0; } else if (FullReg) { if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || @@ -1475,6 +2091,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; + else if (ST.hasGFX940Insts() && + TSchedModel.computeInstrLatency(MI1) == 2) + NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; } else { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: @@ -1490,22 +2109,42 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; break; default: + if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1)) + break; switch (TSchedModel.computeInstrLatency(MI1)) { case 2: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 4: + assert(ST.hasGFX940Insts()); + NeedWaitStates = isXDL(ST, *MI1) + ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates; break; case 8: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; break; case 16: LLVM_FALLTHROUGH; default: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; } } } @@ -1524,14 +2163,32 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { default: switch (TSchedModel.computeInstrLatency(MI1)) { case 2: - NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates + : SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + break; + case 4: + assert(ST.hasGFX940Insts()); + NeedWaitStates = isXDL(ST, *MI1) + ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? 
GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates + : SMFMA16x16WritesVGPROverlappedSrcABWaitStates; break; case 16: LLVM_FALLTHROUGH; default: - NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates + : SMFMA32x32WritesVGPROverlappedSrcABWaitStates; } } } @@ -1599,18 +2256,12 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { if (!ST.hasGFX90AInsts()) return 0; - auto IsMFMAFn = [](const MachineInstr &MI) -> bool { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; - }; - auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { return isDGEMM(MI.getOpcode()); }; // This is checked in checkMAIHazards90A() - if (IsMFMAFn(*MI)) + if (SIInstrInfo::isMFMA(*MI)) return 0; int WaitStatesNeeded = 0; @@ -1623,8 +2274,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const MachineInstr *MFMA = nullptr; unsigned Reg; - auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) + auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI) || + !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) return false; MFMA = &MI; return true; @@ -1646,6 +2298,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; + const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4; + const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6; + const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10; + const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18; + const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5; + const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7; + const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11; + const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19; const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; @@ -1685,16 +2345,30 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { int NeedWaitStates = MaxWaitStates; switch (HazardDefLatency) { case 2: - NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; + NeedWaitStates = + ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates + : SMFMA4x4WriteVgprVALUMemExpReadWaitStates; break; case 4: - assert(isDGEMM(MFMA->getOpcode())); + assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts()); NeedWaitStates = - IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates - : DMFMA4x4WriteVgprVALUReadWaitStates; + isDGEMM(MFMA->getOpcode()) + ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates + : DMFMA4x4WriteVgprVALUReadWaitStates + : isXDL(ST, *MFMA) + ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; + NeedWaitStates = + ST.hasGFX940Insts() + ? 
isXDL(ST, *MFMA) + ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates + : SMFMA16x16WriteVgprVALUMemExpReadWaitStates; break; case 16: LLVM_FALLTHROUGH; default: @@ -1702,7 +2376,11 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { isDGEMM(MFMA->getOpcode()) ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates : DMFMA16x16WriteVgprVALUReadWaitStates - : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; + : ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates + : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; break; } @@ -1732,7 +2410,16 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; + const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4; + const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6; + const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10; + const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18; + const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5; + const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7; + const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11; + const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19; const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; + const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; @@ -1757,19 +2444,35 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { int NeedWaitStates = MaxWaitStates; switch (TSchedModel.computeInstrLatency(MFMA)) { case 2: - NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL2PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates + : SMFMA4x4WriteVgprVALUWawWaitStates; break; case 4: - assert(isDGEMM(MFMA->getOpcode())); - NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; + assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts()); + NeedWaitStates = isDGEMM(MFMA->getOpcode()) + ? DMFMA4x4WriteVgprVALUWriteWaitStates + : isXDL(ST, *MFMA) + ? GFX940_XDL4PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL8PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates + : SMFMA16x16WriteVgprVALUWawWaitStates; break; case 16: LLVM_FALLTHROUGH; default: NeedWaitStates = isDGEMM(MFMA->getOpcode()) ? DMFMA16x16WriteVgprVALUWriteWaitStates + : ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? 
GFX940_XDL16PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates : SMFMA32x32WriteVgprVALUWawWaitStates; break; } @@ -1781,12 +2484,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { break; } - auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA, - this](const MachineInstr &MI) { - if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) || + auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || !MI.readsRegister(Reg, &TRI)) return false; + if (ST.hasGFX940Insts() && !isXDL(ST, MI)) + return false; + const MachineOperand *SrcC = TII.getNamedOperand(MI, AMDGPU::OpName::src2); assert(SrcC); @@ -1808,6 +2513,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { switch (HazardDefLatency) { case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; break; + case 4: assert(ST.hasGFX940Insts()); + NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; + break; case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; break; case 16: LLVM_FALLTHROUGH; @@ -1827,11 +2535,10 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { return false; const MachineInstr *MAI = nullptr; + auto IsMFMAFn = [&MAI](const MachineInstr &MI) { MAI = nullptr; - if (SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) + if (SIInstrInfo::isMFMA(MI)) MAI = &MI; return MAI != nullptr; }; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 716bc027a894..57f5a04c6eda 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -62,6 +62,10 @@ private: void addClauseInst(const MachineInstr &MI); + /// \returns the number of wait states before another MFMA instruction can be + /// issued after \p MI. + unsigned getMFMAPipelineWaitStates(const MachineInstr &MI) const; + // Advance over a MachineInstr bundle. Look for hazards in the bundled // instructions. void processBundle(); @@ -92,10 +96,31 @@ private: bool fixSMEMtoVectorWriteHazards(MachineInstr *MI); bool fixVcmpxExecWARHazard(MachineInstr *MI); bool fixLdsBranchVmemWARHazard(MachineInstr *MI); + bool fixLdsDirectVALUHazard(MachineInstr *MI); + bool fixLdsDirectVMEMHazard(MachineInstr *MI); + bool fixVALUPartialForwardingHazard(MachineInstr *MI); + bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixWMMAHazards(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); int checkMAIHazards90A(MachineInstr *MI); + /// Pad the latency between neighboring MFMA instructions with s_nops. The + /// percentage of wait states to fill with s_nops is specified by the command + /// line option '-amdgpu-mfma-padding-ratio'. + /// + /// For example, with '-amdgpu-mfma-padding-ratio=100': + /// + /// 2 pass MFMA instructions have a latency of 2 wait states. Therefore, a + /// 'S_NOP 1' will be added between sequential MFMA instructions. 
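/// (An S_NOP immediate of N inserts N + 1 wait states, which is why a single
/// 'S_NOP 1' covers the two wait states in this example.)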
+ /// + /// V_MFMA_F32_4X4X1F32 + /// V_MFMA_F32_4X4X1F32 + ///--> + /// V_MFMA_F32_4X4X1F32 + /// S_NOP 1 + /// V_MFMA_F32_4X4X1F32 + int checkMFMAPadding(MachineInstr *MI); int checkMAIVALUHazards(MachineInstr *MI); int checkMAILdStHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 9f98f9ada802..6f82148854c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -1,4 +1,4 @@ -//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// +//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,7 +8,7 @@ // /// \file /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential -/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA +/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA /// with sequential versions where possible. /// //===----------------------------------------------------------------------===// @@ -16,10 +16,12 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -159,15 +161,23 @@ GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { GCNNSAReassign::NSA_Status GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return NSA_Status::NOT_NSA; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + case AMDGPU::MIMGEncGfx11NSA: + break; + default: + return NSA_Status::NOT_NSA; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned VgprBase = 0; bool NSA = false; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) @@ -179,15 +189,16 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { if (!PhysReg) return NSA_Status::FIXED; + // TODO: address the below limitation to handle GFX11 BVH instructions // Bail if address is not a VGPR32. That should be possible to extend the // optimization to work with subregs of a wider register tuples, but the // logic to find free registers will be much more complicated with much // less chances for success. That seems reasonable to assume that in most // cases a tuple is used because a vector variable contains different - // parts of an address and it is either already consequitive or cannot + // parts of an address and it is either already consecutive or cannot // be reassigned if not. If needed it is better to rely on register // coalescer to process such address tuples. 
- if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg()) return NSA_Status::FIXED; // InlineSpiller does not call LRM::assign() after an LI split leaving @@ -278,7 +289,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { SmallVector Intervals; SmallVector OrigRegs; SlotIndex MinInd, MaxInd; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); LiveInterval *LI = &LIS->getInterval(Reg); @@ -331,11 +342,11 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { } if (!Success) { - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) if (VRM->hasPhys(Intervals[I]->reg())) LRM->unassign(*Intervals[I]); - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) LRM->assign(*Intervals[I], OrigRegs[I]); continue; diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 3a68ed1934e1..281474994bca 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -192,6 +192,10 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; +def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_4_0.Features +>; + //===----------------------------------------------------------------------===// // GCN GFX10. //===----------------------------------------------------------------------===// @@ -235,3 +239,27 @@ def : ProcessorModel<"gfx1034", GFX10SpeedModel, def : ProcessorModel<"gfx1035", GFX10SpeedModel, FeatureISAVersion10_3_0.Features >; + +def : ProcessorModel<"gfx1036", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; + +//===----------------------------------------------------------------------===// +// GCN GFX11. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1100", GFX11SpeedModel, + FeatureISAVersion11_0.Features +>; + +def : ProcessorModel<"gfx1101", GFX11SpeedModel, + FeatureISAVersion11_0.Features +>; + +def : ProcessorModel<"gfx1102", GFX11SpeedModel, + FeatureISAVersion11_0_2.Features +>; + +def : ProcessorModel<"gfx1103", GFX11SpeedModel, + FeatureISAVersion11_0_2.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 257561cb8430..c41548d19c8e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -10,7 +10,7 @@ /// This file defines the GCNRegPressure class, which tracks registry pressure /// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It /// also implements a compare function, which compares different register -/// pressures, and declares one with max occupance as winner. +/// pressures, and declares one with max occupancy as winner. 
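Looking back at the checkMAIVALUHazards() tables above: the GFX940 constants are regular enough to summarize with a formula. The following is a sketch derived from the listed values, not the actual implementation, which deliberately spells out every constant:

    // GFX940 wait states between an MFMA VGPR write and a dependent VALU read
    // or write of that VGPR: non-XDL (SMFMA) rows are Passes + 2, XDL rows
    // are Passes + 3 (so a 2-pass XDL needs 5, a 16-pass SMFMA needs 18).
    unsigned gfx940MfmaWriteVgprValuWaitStates(unsigned Passes, bool IsXDL) {
      return Passes + (IsXDL ? 3u : 2u);
    }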
/// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 75855a7a4f9c..100410bb7644 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -13,6 +13,7 @@ #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #define DEBUG_TYPE "machine-scheduler" @@ -362,6 +363,9 @@ void GCNScheduleDAGMILive::schedule() { if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; + RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == MinOccupancy; + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } @@ -378,6 +382,7 @@ void GCNScheduleDAGMILive::schedule() { // occupancy before was higher, or if the current schedule has register // pressure higher than the excess limits which could lead to more spilling. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + // Allow memory bound functions to drop to 4 waves if not limited by an // attribute. if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && @@ -390,6 +395,7 @@ void GCNScheduleDAGMILive::schedule() { if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; MFI.limitOccupancy(MinOccupancy); + RegionsWithMinOcc.reset(); LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << MinOccupancy << ".\n"); } @@ -416,6 +422,8 @@ void GCNScheduleDAGMILive::schedule() { PressureAfter.less(ST, PressureBefore) || !RescheduleRegions[RegionIdx]) { Pressure[RegionIdx] = PressureAfter; + RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == MinOccupancy; if (!RegionsWithClusters[RegionIdx] && (Stage + 1) == UnclusteredReschedule) RescheduleRegions[RegionIdx] = false; @@ -425,13 +433,18 @@ void GCNScheduleDAGMILive::schedule() { } } + RegionsWithMinOcc[RegionIdx] = + PressureBefore.getOccupancy(ST) == MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] || (Stage + 1) != UnclusteredReschedule; RegionEnd = RegionBegin; + int SkippedDebugInstr = 0; for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) + if (MI->isDebugInstr()) { + ++SkippedDebugInstr; continue; + } if (MI->getIterator() != RegionEnd) { BB->remove(MI); @@ -459,10 +472,31 @@ void GCNScheduleDAGMILive::schedule() { ++RegionEnd; LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } + + // After reverting schedule, debug instrs will now be at the end of the block + // and RegionEnd will point to the first debug instr. Increment RegionEnd + // pass debug instrs to the actual end of the scheduling region. + while (SkippedDebugInstr-- > 0) + ++RegionEnd; + + // If Unsched.front() instruction is a debug instruction, this will actually + // shrink the region since we moved all debug instructions to the end of the + // block. Find the first instruction that is not a debug instruction. RegionBegin = Unsched.front()->getIterator(); - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + if (RegionBegin->isDebugInstr()) { + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) + continue; + RegionBegin = MI->getIterator(); + break; + } + } + // Then move the debug instructions back into their correct place and set + // RegionBegin and RegionEnd if needed. 
placeDebugValues(); + + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); } GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { @@ -493,14 +527,14 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { auto I = MBB->begin(); auto LiveInIt = MBBLiveIns.find(MBB); + auto &Rgn = Regions[CurRegion]; + auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); if (LiveInIt != MBBLiveIns.end()) { auto LiveIn = std::move(LiveInIt->second); RPTracker.reset(*MBB->begin(), &LiveIn); MBBLiveIns.erase(LiveInIt); } else { - auto &Rgn = Regions[CurRegion]; I = Rgn.first; - auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); auto LRS = BBLiveInMap.lookup(NonDbgMI); #ifdef EXPENSIVE_CHECKS assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); @@ -511,7 +545,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { for ( ; ; ) { I = RPTracker.getNext(); - if (Regions[CurRegion].first == I) { + if (Regions[CurRegion].first == I || NonDbgMI == I) { LiveIns[CurRegion] = RPTracker.getLiveRegs(); RPTracker.clearMaxPressure(); } @@ -561,9 +595,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { RescheduleRegions.resize(Regions.size()); RegionsWithClusters.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); + RegionsWithMinOcc.resize(Regions.size()); RescheduleRegions.set(); RegionsWithClusters.reset(); RegionsWithHighRP.reset(); + RegionsWithMinOcc.reset(); if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); @@ -600,13 +636,41 @@ void GCNScheduleDAGMILive::finalizeSchedule() { << "Retrying function scheduling with lowest recorded occupancy " << MinOccupancy << ".\n"); } + + if (Stage == PreRARematerialize) { + if (RegionsWithMinOcc.none() || Regions.size() == 1) + break; + + const GCNSubtarget &ST = MF.getSubtarget(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // Check maximum occupancy + if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == + MinOccupancy) + break; + + // FIXME: This pass will invalidate cached MBBLiveIns for regions + // inbetween the defs and region we sinked the def to. Cached pressure + // for regions where a def is sinked from will also be invalidated. Will + // need to be fixed if there is another pass after this pass. + static_assert(LastStage == PreRARematerialize, + "Passes after PreRARematerialize are not supported"); + + collectRematerializableInstructions(); + if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) + break; + + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with improved occupancy of " + << MinOccupancy << " from rematerializing\n"); + } } if (Stage == UnclusteredReschedule) SavedMutations.swap(Mutations); for (auto Region : Regions) { - if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) || + if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) && + !RescheduleRegions[RegionIdx]) || (Stage == ClusteredLowOccupancyReschedule && !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) { @@ -631,6 +695,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() { // Skip empty scheduling regions (0 or 1 schedulable instructions). 
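// (begin() == end() is the zero-instruction case; begin() == std::prev(end())
// means the region holds exactly one schedulable instruction.)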
if (begin() == end() || begin() == std::prev(end())) { exitRegion(); + ++RegionIdx; continue; } @@ -653,3 +718,282 @@ void GCNScheduleDAGMILive::finalizeSchedule() { SavedMutations.swap(Mutations); } while (Stage != LastStage); } + +void GCNScheduleDAGMILive::collectRematerializableInstructions() { + const SIRegisterInfo *SRI = static_cast(TRI); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + // TODO: Handle AGPR and SGPR rematerialization + if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) || + !MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineOperand *Op = MRI.getOneDef(Reg); + MachineInstr *Def = Op->getParent(); + if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA)) + continue; + + MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg); + if (Def->getParent() == UseI->getParent()) + continue; + + // We are only collecting defs that are defined in another block and are + // live-through or used inside regions at MinOccupancy. This means that the + // register must be in the live-in set for the region. + bool AddedToRematList = false; + for (unsigned I = 0, E = Regions.size(); I != E; ++I) { + auto It = LiveIns[I].find(Reg); + if (It != LiveIns[I].end() && !It->second.none()) { + if (RegionsWithMinOcc[I]) { + RematerializableInsts[I][Def] = UseI; + AddedToRematList = true; + } + + // Collect regions with rematerializable reg as live-in to avoid + // searching later when updating RP. + RematDefToLiveInRegions[Def].push_back(I); + } + } + if (!AddedToRematList) + RematDefToLiveInRegions.erase(Def); + } +} + +bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII) { + // Temporary copies of cached variables we will be modifying and replacing if + // sinking succeeds. + SmallVector< + std::pair, 32> + NewRegions; + DenseMap NewLiveIns; + DenseMap NewPressure; + BitVector NewRescheduleRegions; + + NewRegions.resize(Regions.size()); + NewRescheduleRegions.resize(Regions.size()); + + // Collect only regions that has a rematerializable def as a live-in. + SmallSet ImpactedRegions; + for (const auto &It : RematDefToLiveInRegions) + ImpactedRegions.insert(It.second.begin(), It.second.end()); + + // Make copies of register pressure and live-ins cache that will be updated + // as we rematerialize. + for (auto Idx : ImpactedRegions) { + NewPressure[Idx] = Pressure[Idx]; + NewLiveIns[Idx] = LiveIns[Idx]; + } + NewRegions = Regions; + NewRescheduleRegions.reset(); + + DenseMap InsertedMIToOldDef; + bool Improved = false; + for (auto I : ImpactedRegions) { + if (!RegionsWithMinOcc[I]) + continue; + + Improved = false; + int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); + int SGPRUsage = NewPressure[I].getSGPRNum(); + + // TODO: Handle occupancy drop due to AGPR and SGPR. + // Check if cause of occupancy drop is due to VGPR usage and not SGPR. + if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) + break; + + // The occupancy of this region could have been improved by a previous + // iteration's sinking of defs. + if (NewPressure[I].getOccupancy(ST) > MinOccupancy) { + NewRescheduleRegions[I] = true; + Improved = true; + continue; + } + + // First check if we have enough trivially rematerializable instructions to + // improve occupancy. Optimistically assume all instructions we are able to + // sink decreased RP. 
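// (Illustrative bound, assuming every candidate frees all of its covered
// registers: VGPRsAfterSink = VGPRUsage - TotalSinkableRegs, and sinking is
// abandoned early when even ST.getOccupancyWithNumVGPRs(VGPRsAfterSink)
// cannot beat MinOccupancy.)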
+ int TotalSinkableRegs = 0; + for (const auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + Register DefReg = Def->getOperand(0).getReg(); + TotalSinkableRegs += + SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); + } + int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; + unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); + // If in the most optimistic scenario, we cannot improve occupancy, then do + // not attempt to sink any instructions. + if (OptimisticOccupancy <= MinOccupancy) + break; + + unsigned ImproveOccupancy = 0; + SmallVector SinkedDefs; + for (auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + MachineBasicBlock::iterator InsertPos = + MachineBasicBlock::iterator(It.second); + Register Reg = Def->getOperand(0).getReg(); + // Rematerialize MI to its use block. Since we are only rematerializing + // instructions that do not have any virtual reg uses, we do not need to + // call LiveRangeEdit::allUsesAvailableAt() and + // LiveRangeEdit::canRematerializeAt(). + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + Def->getOperand(0).getSubReg(), *Def, *TRI); + MachineInstr *NewMI = &*(--InsertPos); + LIS->InsertMachineInstrInMaps(*NewMI); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + InsertedMIToOldDef[NewMI] = Def; + + // Update region boundaries in scheduling region we sinked from since we + // may sink an instruction that was at the beginning or end of its region + updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, + /*Removing =*/true); + + // Update region boundaries in region we sinked to. + updateRegionBoundaries(NewRegions, InsertPos, NewMI); + + LaneBitmask PrevMask = NewLiveIns[I][Reg]; + // FIXME: Also update cached pressure for where the def was sinked from. + // Update RP for all regions that has this reg as a live-in and remove + // the reg from all regions as a live-in. + for (auto Idx : RematDefToLiveInRegions[Def]) { + NewLiveIns[Idx].erase(Reg); + if (InsertPos->getParent() != Regions[Idx].first->getParent()) { + // Def is live-through and not used in this block. + NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI); + } else { + // Def is used and rematerialized into this block. + GCNDownwardRPTracker RPT(*LIS); + auto *NonDbgMI = &*skipDebugInstructionsForward( + NewRegions[Idx].first, NewRegions[Idx].second); + RPT.reset(*NonDbgMI, &NewLiveIns[Idx]); + RPT.advance(NewRegions[Idx].second); + NewPressure[Idx] = RPT.moveMaxPressure(); + } + } + + SinkedDefs.push_back(Def); + ImproveOccupancy = NewPressure[I].getOccupancy(ST); + if (ImproveOccupancy > MinOccupancy) + break; + } + + // Remove defs we just sinked from all regions' list of sinkable defs + for (auto &Def : SinkedDefs) + for (auto TrackedIdx : RematDefToLiveInRegions[Def]) + RematerializableInsts[TrackedIdx].erase(Def); + + if (ImproveOccupancy <= MinOccupancy) + break; + + NewRescheduleRegions[I] = true; + Improved = true; + } + + if (!Improved) { + // Occupancy was not improved for all regions that were at MinOccupancy. + // Undo sinking and remove newly rematerialized instructions. 
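// (Rollback sketch: InsertedMIToOldDef maps each rematerialized copy back to
// the def it was cloned from, so undoing is: erase the copy, clear any dead
// flag left on the original def's register, and recompute its live interval,
// exactly as the loop below does.)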
+ for (auto &Entry : InsertedMIToOldDef) { + MachineInstr *MI = Entry.first; + MachineInstr *OldMI = Entry.second; + Register Reg = MI->getOperand(0).getReg(); + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + OldMI->clearRegisterDeads(Reg); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + return false; + } + + // Occupancy was improved for all regions. + for (auto &Entry : InsertedMIToOldDef) { + MachineInstr *MI = Entry.first; + MachineInstr *OldMI = Entry.second; + + // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. + BBLiveInMap.erase(OldMI); + + // Remove OldMI and update LIS + Register Reg = MI->getOperand(0).getReg(); + LIS->RemoveMachineInstrFromMaps(*OldMI); + OldMI->eraseFromParent(); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + + // Update live-ins, register pressure, and regions caches. + for (auto Idx : ImpactedRegions) { + LiveIns[Idx] = NewLiveIns[Idx]; + Pressure[Idx] = NewPressure[Idx]; + MBBLiveIns.erase(Regions[Idx].first->getParent()); + } + Regions = NewRegions; + RescheduleRegions = NewRescheduleRegions; + + SIMachineFunctionInfo &MFI = *MF.getInfo(); + MFI.increaseOccupancy(MF, ++MinOccupancy); + + return true; +} + +// Copied from MachineLICM +bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) { + if (!TII->isTriviallyReMaterializable(MI, AA)) + return false; + + for (const MachineOperand &MO : MI.operands()) + if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) + return false; + + return true; +} + +// When removing, we will have to check both beginning and ending of the region. +// When inserting, we will only have to check if we are inserting NewMI in front +// of a scheduling region and do not need to check the ending since we will only +// ever be inserting before an already existing MI. +void GCNScheduleDAGMILive::updateRegionBoundaries( + SmallVectorImpl> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { + unsigned I = 0, E = RegionBoundaries.size(); + // Search for first region of the block where MI is located + while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) + ++I; + + for (; I != E; ++I) { + if (MI->getParent() != RegionBoundaries[I].first->getParent()) + return; + + if (Removing && MI == RegionBoundaries[I].first && + MI == RegionBoundaries[I].second) { + // MI is in a region with size 1, after removing, the region will be + // size 0, set RegionBegin and RegionEnd to pass end of block iterator. 
+ RegionBoundaries[I] = + std::make_pair(MI->getParent()->end(), MI->getParent()->end()); + return; + } + if (MI == RegionBoundaries[I].first) { + if (Removing) + RegionBoundaries[I] = + std::make_pair(std::next(MI), RegionBoundaries[I].second); + else + // Inserted NewMI in front of region, set new RegionBegin to NewMI + RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI), + RegionBoundaries[I].second); + return; + } + if (Removing && MI == RegionBoundaries[I].second) { + RegionBoundaries[I] = + std::make_pair(RegionBoundaries[I].first, std::prev(MI)); + return; + } + } +} diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index a6e42ad3dfca..97f94f69b70e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H #include "GCNRegPressure.h" +#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { @@ -77,7 +78,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { InitialSchedule, UnclusteredReschedule, ClusteredLowOccupancyReschedule, - LastStage = ClusteredLowOccupancyReschedule + PreRARematerialize, + LastStage = PreRARematerialize }; const GCNSubtarget &ST; @@ -110,24 +112,56 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Record regions with high register pressure. BitVector RegionsWithHighRP; + // Regions that have the same occupancy as the latest MinOccupancy. + BitVector RegionsWithMinOcc; + // Region live-in cache. SmallVector LiveIns; // Region pressure cache. SmallVector Pressure; + // Each region at MinOccupancy will have its own list of trivially + // rematerializable instructions we can remat to reduce RP. The list maps an + // instruction to the position we should remat before, usually the MI using + // the rematerializable instruction. + MapVector> + RematerializableInsts; + + // Map a trivially rematerializable def to a list of regions at MinOccupancy + // that have the defined reg as a live-in. + DenseMap> RematDefToLiveInRegions; + // Temporary basic block live-in cache. DenseMap MBBLiveIns; DenseMap BBLiveInMap; DenseMap getBBLiveInMap() const; + // Collect all trivially rematerializable VGPR instructions with a single def + // and single use outside the defining block into RematerializableInsts. + void collectRematerializableInstructions(); + + bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA); + + // TODO: Should also attempt to reduce RP of SGPRs and AGPRs + // Attempt to reduce RP of VGPRs by sinking trivially rematerializable + // instructions. Returns true if we were able to sink instruction(s). + bool sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII); + // Return current region pressure. GCNRegPressure getRealRegPressure() const; // Compute and cache live-ins and pressure for all regions in block. void computeBlockPressure(const MachineBasicBlock *MBB); + // Update region boundaries when removing MI or inserting NewMI before MI. 
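// (Three cases, matching the implementation above: MI is both begin and end
// of a single-instruction region, MI is a region's begin, or MI is a
// region's end. Insertion only ever needs the begin case, since NewMI is
// always placed in front of an existing instruction.)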
+ void updateRegionBoundaries( + SmallVectorImpl> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, + bool Removing = false); public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0cd2cfa2f0e7..d269d0945f3b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -58,133 +58,142 @@ protected: // Basic subtarget description. Triple TargetTriple; AMDGPU::IsaInfo::AMDGPUTargetID TargetID; - unsigned Gen; + unsigned Gen = INVALID; InstrItineraryData InstrItins; - int LDSBankCount; - unsigned MaxPrivateElementSize; + int LDSBankCount = 0; + unsigned MaxPrivateElementSize = 0; // Possibly statically set by tablegen, but may want to be overridden. - bool FastFMAF32; - bool FastDenormalF32; - bool HalfRate64Ops; - bool FullRate64Ops; + bool FastFMAF32 = false; + bool FastDenormalF32 = false; + bool HalfRate64Ops = false; + bool FullRate64Ops = false; // Dynamically set bits that enable features. - bool FlatForGlobal; - bool AutoWaitcntBeforeBarrier; - bool UnalignedScratchAccess; - bool UnalignedAccessMode; - bool HasApertureRegs; - bool SupportsXNACK; + bool FlatForGlobal = false; + bool AutoWaitcntBeforeBarrier = false; + bool UnalignedScratchAccess = false; + bool UnalignedAccessMode = false; + bool HasApertureRegs = false; + bool SupportsXNACK = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for XNACK. - bool EnableXNACK; + bool EnableXNACK = false; - bool EnableTgSplit; - bool EnableCuMode; - bool TrapHandler; + bool EnableTgSplit = false; + bool EnableCuMode = false; + bool TrapHandler = false; // Used as options. - bool EnableLoadStoreOpt; - bool EnableUnsafeDSOffsetFolding; - bool EnableSIScheduler; - bool EnableDS128; - bool EnablePRTStrictNull; - bool DumpCode; + bool EnableLoadStoreOpt = false; + bool EnableUnsafeDSOffsetFolding = false; + bool EnableSIScheduler = false; + bool EnableDS128 = false; + bool EnablePRTStrictNull = false; + bool DumpCode = false; // Subtarget statically properties set by tablegen - bool FP64; - bool FMA; - bool MIMG_R128; - bool CIInsts; - bool GFX8Insts; - bool GFX9Insts; - bool GFX90AInsts; - bool GFX10Insts; - bool GFX10_3Insts; - bool GFX7GFX8GFX9Insts; - bool SGPRInitBug; - bool NegativeScratchOffsetBug; - bool NegativeUnalignedScratchOffsetBug; - bool HasSMemRealTime; - bool HasIntClamp; - bool HasFmaMixInsts; - bool HasMovrel; - bool HasVGPRIndexMode; - bool HasScalarStores; - bool HasScalarAtomics; - bool HasSDWAOmod; - bool HasSDWAScalar; - bool HasSDWASdst; - bool HasSDWAMac; - bool HasSDWAOutModsVOPC; - bool HasDPP; - bool HasDPP8; - bool Has64BitDPP; - bool HasPackedFP32Ops; - bool HasExtendedImageInsts; - bool HasR128A16; - bool HasGFX10A16; - bool HasG16; - bool HasNSAEncoding; - unsigned NSAMaxSize; - bool GFX10_AEncoding; - bool GFX10_BEncoding; - bool HasDLInsts; - bool HasDot1Insts; - bool HasDot2Insts; - bool HasDot3Insts; - bool HasDot4Insts; - bool HasDot5Insts; - bool HasDot6Insts; - bool HasDot7Insts; - bool HasMAIInsts; - bool HasPkFmacF16Inst; - bool HasAtomicFaddInsts; - bool SupportsSRAMECC; + bool FP64 = false; + bool FMA = false; + bool MIMG_R128 = false; + bool CIInsts = false; + bool GFX8Insts = false; + bool GFX9Insts = false; + bool GFX90AInsts = false; + bool GFX940Insts = false; + bool GFX10Insts = false; + bool GFX11Insts = false; + bool GFX10_3Insts = false; + bool GFX7GFX8GFX9Insts = false; + bool SGPRInitBug = false; + bool 
UserSGPRInit16Bug = false; + bool NegativeScratchOffsetBug = false; + bool NegativeUnalignedScratchOffsetBug = false; + bool HasSMemRealTime = false; + bool HasIntClamp = false; + bool HasFmaMixInsts = false; + bool HasMovrel = false; + bool HasVGPRIndexMode = false; + bool HasScalarStores = false; + bool HasScalarAtomics = false; + bool HasSDWAOmod = false; + bool HasSDWAScalar = false; + bool HasSDWASdst = false; + bool HasSDWAMac = false; + bool HasSDWAOutModsVOPC = false; + bool HasDPP = false; + bool HasDPP8 = false; + bool Has64BitDPP = false; + bool HasPackedFP32Ops = false; + bool HasImageInsts = false; + bool HasExtendedImageInsts = false; + bool HasR128A16 = false; + bool HasGFX10A16 = false; + bool HasG16 = false; + bool HasNSAEncoding = false; + unsigned NSAMaxSize = 0; + bool GFX10_AEncoding = false; + bool GFX10_BEncoding = false; + bool HasDLInsts = false; + bool HasDot1Insts = false; + bool HasDot2Insts = false; + bool HasDot3Insts = false; + bool HasDot4Insts = false; + bool HasDot5Insts = false; + bool HasDot6Insts = false; + bool HasDot7Insts = false; + bool HasDot8Insts = false; + bool HasMAIInsts = false; + bool HasPkFmacF16Inst = false; + bool HasAtomicFaddRtnInsts = false; + bool HasAtomicFaddNoRtnInsts = false; + bool HasAtomicPkFaddNoRtnInsts = false; + bool SupportsSRAMECC = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. - bool EnableSRAMECC; - - bool HasNoSdstCMPX; - bool HasVscnt; - bool HasGetWaveIdInst; - bool HasSMemTimeInst; - bool HasShaderCyclesRegister; - bool HasVOP3Literal; - bool HasNoDataDepHazard; - bool FlatAddressSpace; - bool FlatInstOffsets; - bool FlatGlobalInsts; - bool FlatScratchInsts; - bool ScalarFlatScratchInsts; - bool HasArchitectedFlatScratch; - bool AddNoCarryInsts; - bool HasUnpackedD16VMem; - bool LDSMisalignedBug; - bool HasMFMAInlineLiteralBug; - bool UnalignedBufferAccess; - bool UnalignedDSAccess; - bool HasPackedTID; - bool ScalarizeGlobal; - - bool HasVcmpxPermlaneHazard; - bool HasVMEMtoScalarWriteHazard; - bool HasSMEMtoVectorWriteHazard; - bool HasInstFwdPrefetchBug; - bool HasVcmpxExecWARHazard; - bool HasLdsBranchVmemWARHazard; - bool HasNSAtoVMEMBug; - bool HasNSAClauseBug; - bool HasOffset3fBug; - bool HasFlatSegmentOffsetBug; - bool HasImageStoreD16Bug; - bool HasImageGather4D16Bug; + bool EnableSRAMECC = false; + + bool HasNoSdstCMPX = false; + bool HasVscnt = false; + bool HasGetWaveIdInst = false; + bool HasSMemTimeInst = false; + bool HasShaderCyclesRegister = false; + bool HasVOP3Literal = false; + bool HasNoDataDepHazard = false; + bool FlatAddressSpace = false; + bool FlatInstOffsets = false; + bool FlatGlobalInsts = false; + bool FlatScratchInsts = false; + bool ScalarFlatScratchInsts = false; + bool HasArchitectedFlatScratch = false; + bool EnableFlatScratch = false; + bool AddNoCarryInsts = false; + bool HasUnpackedD16VMem = false; + bool LDSMisalignedBug = false; + bool HasMFMAInlineLiteralBug = false; + bool UnalignedBufferAccess = false; + bool UnalignedDSAccess = false; + bool HasPackedTID = false; + bool ScalarizeGlobal = false; + + bool HasVcmpxPermlaneHazard = false; + bool HasVMEMtoScalarWriteHazard = false; + bool HasSMEMtoVectorWriteHazard = false; + bool HasInstFwdPrefetchBug = false; + bool HasVcmpxExecWARHazard = false; + bool HasLdsBranchVmemWARHazard = false; + bool HasNSAtoVMEMBug = false; + bool HasNSAClauseBug = false; + bool HasOffset3fBug = false; + bool HasFlatSegmentOffsetBug = false; + bool HasImageStoreD16Bug = false; + bool 
HasImageGather4D16Bug = false; + bool HasVOPDInsts = false; // Dummy feature to use for assembler in tablegen. - bool FeatureDisable; + bool FeatureDisable = false; SelectionDAGTargetInfo TSInfo; private: @@ -193,9 +202,6 @@ private: SIFrameLowering FrameLowering; public: - // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. - static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); - GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); ~GCNSubtarget() override; @@ -258,9 +264,19 @@ public: return (Generation)Gen; } + unsigned getMaxWaveScratchSize() const { + // See COMPUTE_TMPRING_SIZE.WAVESIZE. + if (getGeneration() < GFX11) { + // 13-bit field in units of 256-dword. + return (256 * 4) * ((1 << 13) - 1); + } + // 15-bit field in units of 64-dword. + return (64 * 4) * ((1 << 15) - 1); + } + /// Return the number of high bits known to be zero for a frame index. unsigned getKnownHighZeroBitsForFrameIndex() const { - return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } int getLDSBankCount() const { @@ -558,13 +574,20 @@ public: // The ST addressing mode means no registers are used, either VGPR or SGPR, // but only immediate offset is swizzled and added to the FLAT scratch base. bool hasFlatScratchSTMode() const { - return hasFlatScratchInsts() && hasGFX10_3Insts(); + return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); } + bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } + bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; } + bool enableFlatScratch() const { + return flatScratchIsArchitected() || + (EnableFlatScratch && hasFlatScratchInsts()); + } + bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; } @@ -690,6 +713,10 @@ public: return HasDot7Insts; } + bool hasDot8Insts() const { + return HasDot8Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } @@ -699,9 +726,15 @@ public: } bool hasAtomicFaddInsts() const { - return HasAtomicFaddInsts; + return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } + bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } + + bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } + + bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } + bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } @@ -765,8 +798,6 @@ public: return true; } - bool enableFlatScratch() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; @@ -805,6 +836,9 @@ public: /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } + /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
+ bool hasPermLane64() const { return getGeneration() >= GFX11; } + bool hasDPP() const { return HasDPP; } @@ -830,7 +864,11 @@ public: } bool hasFmaakFmamkF32Insts() const { - return getGeneration() >= GFX10; + return getGeneration() >= GFX10 || hasGFX940Insts(); + } + + bool hasImageInsts() const { + return HasImageInsts; } bool hasExtendedImageInsts() const { @@ -875,6 +913,10 @@ public: bool hasMadF16() const; + bool hasMovB64() const { return GFX940Insts; } + + bool hasLshlAddB64() const { return GFX940Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -887,6 +929,10 @@ public: return SGPRInitBug; } + bool hasUserSGPRInit16Bug() const { + return UserSGPRInit16Bug; + } + bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } bool hasNegativeUnalignedScratchOffsetBug() const { @@ -915,6 +961,14 @@ public: getGeneration() <= AMDGPUSubtarget::GFX9; } + bool hasReadM0LdsDmaHazard() const { + return getGeneration() == AMDGPUSubtarget::GFX9; + } + + bool hasReadM0LdsDirectHazard() const { + return getGeneration() == AMDGPUSubtarget::GFX9; + } + bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; } @@ -943,6 +997,22 @@ public: return HasLdsBranchVmemWARHazard; } + // Has one cycle hazard on transcendental instruction feeding a + // non transcendental VALU. + bool hasTransForwardingHazard() const { return GFX940Insts; } + + // Has one cycle hazard on a VALU instruction partially writing dst with + // a shift of result bits feeding another VALU instruction. + bool hasDstSelForwardingHazard() const { return GFX940Insts; } + + // Cannot use op_sel with v_dot instructions. + bool hasDOTOpSelHazard() const { return GFX940Insts; } + + // Does not have HW interlocs for VALU writing and then reading SGPRs. + bool hasVDecCoExecHazard() const { + return GFX940Insts; + } + bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; } @@ -953,11 +1023,43 @@ public: bool hasGFX90AInsts() const { return GFX90AInsts; } + bool hasVOP3DPP() const { return getGeneration() >= GFX11; } + + bool hasLdsDirect() const { return getGeneration() >= GFX11; } + + bool hasVALUPartialForwardingHazard() const { + return getGeneration() >= GFX11; + } + + bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts; } + /// Return true if the target has the S_PACK_HL_B32_B16 instruction. + bool hasSPackHL() const { return GFX11Insts; } + + /// Return true if the target's EXP instruction has the COMPR flag, which + /// affects the meaning of the EN (enable) bits. + bool hasCompressedExport() const { return !GFX11Insts; } + + /// Return true if the target's EXP instruction supports the NULL export + /// target. + bool hasNullExportTarget() const { return !GFX11Insts; } + + bool hasVOPDInsts() const { return HasVOPDInsts; } + + bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } + + /// Return true if the target has the S_DELAY_ALU instruction. + bool hasDelayAlu() const { return GFX11Insts; } + bool hasPackedTID() const { return HasPackedTID; } + // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that + // hasGFX90AInsts is also true. 
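// (This lets callers layer feature tests: check hasGFX940Insts() for the
// most specific behavior and fall back to a hasGFX90AInsts() branch without
// re-testing the GFX940 case.)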
+ bool hasGFX940Insts() const { return GFX940Insts; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -989,6 +1091,9 @@ public: return getGeneration() >= GFX9; } + // \returns true if the target supports the pre-NGG legacy geometry path. + bool hasLegacyGeometry() const { return getGeneration() < GFX11; } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1105,6 +1210,10 @@ public: /// unit requirement. unsigned getMaxNumVGPRs(const Function &F) const; + unsigned getMaxNumAGPRs(const Function &F) const { + return getMaxNumVGPRs(F); + } + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. @@ -1165,6 +1274,10 @@ public: void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep) const override; + + // \returns true if it's beneficial on this subtarget for the scheduler to + // cluster stores as well as loads. + bool shouldClusterStores() const { return getGeneration() >= GFX11; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td new file mode 100644 index 000000000000..1f65376890da --- /dev/null +++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td @@ -0,0 +1,116 @@ +//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LDSDIR encoding +//===----------------------------------------------------------------------===// + +class LDSDIRe op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23-22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +//===----------------------------------------------------------------------===// +// LDSDIR Classes +//===----------------------------------------------------------------------===// + +class LDSDIR_getIns { + dag ret = !if(direct, + (ins wait_vdst:$waitvdst), + (ins Attr:$attr, AttrChan:$attrchan, wait_vdst:$waitvdst) + ); +} + +class LDSDIR_Common : InstSI< + (outs VGPR_32:$vdst), + LDSDIR_getIns.ret, + asm> { + let LDSDIR = 1; + let EXP_CNT = 1; + + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + + string Mnemonic = opName; + let UseNamedOperandTable = 1; + + let Uses = [M0, EXEC]; + let DisableWQM = 0; + let SchedRW = [WriteLDS]; + + bit is_direct; + let is_direct = direct; +} + +class LDSDIR_Pseudo : + LDSDIR_Common, + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class LDSDIR_getAsm { + string ret = !if(direct, + " $vdst$waitvdst", + " $vdst, $attr$attrchan$waitvdst" + ); +} + +class LDSDIR_Real op, LDSDIR_Pseudo lds, int subtarget> : + LDSDIR_Common.ret, + lds.is_direct>, + SIMCInstr , + LDSDIRe { + let isPseudo = 0; + let isCodeGenOnly = 0; +} + +//===----------------------------------------------------------------------===// +// LDS Direct Instructions +//===----------------------------------------------------------------------===// + +def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; +def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + +//===----------------------------------------------------------------------===// +// GFX11+ +//===----------------------------------------------------------------------===// + +multiclass LDSDIR_Real_gfx11 op, LDSDIR_Pseudo lds = !cast(NAME)> { + def _gfx11 : LDSDIR_Real { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + } +} + +defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>; +defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>; diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index 912bcc792e4d..24c9cc2d7dd2 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -239,9 +239,9 @@ void AMDGPUCustomBehaviour::generateWaitCntInfo() { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); InstrWaitCntInfo.resize(SrcMgr.size()); - int Index = 0; - for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) { - const std::unique_ptr &Inst = *I; + for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) { + const std::unique_ptr &Inst = EN.value(); + unsigned Index = EN.index(); 
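// (llvm::enumerate yields (index, value) pairs, replacing the manually
// maintained Index counter of the deleted loop header above.)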
unsigned Opcode = Inst->getOpcode(); const MCInstrDesc &MCID = MCII.get(Opcode); if ((MCID.TSFlags & SIInstrFlags::DS) && diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index 56650515bd0a..7a0d454c3578 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -31,7 +31,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() {} + ~AMDGPUInstrPostProcess() = default; void postProcessInstruction(std::unique_ptr &Inst, const MCInst &MCI) override; @@ -86,7 +86,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() {} + ~AMDGPUCustomBehaviour() = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 50318a59225d..bda3c25e956b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -10,13 +10,16 @@ #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -47,7 +50,10 @@ public: bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; + Optional getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; } //End anonymous namespace @@ -134,6 +140,9 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return; + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. 
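The relocation changes in this file follow a standard MC backend pattern: fixup kinds at or above FirstLiteralRelocationKind encode a raw ELF relocation type rather than a target fixup, so applyFixup returns early for them and shouldForceRelocation reports them as always needing a relocation. A rough standalone model of the round trip (the constant's value here is illustrative; MC defines the real one):

    #include <cassert>

    // Toy model, simplified: getFixupKind maps a named ELF relocation's value
    // V to FirstLiteralRelocationKind + V, and the object writer's
    // getRelocType recovers V by subtracting the base back out.
    constexpr unsigned FirstLiteralRelocationKind = 0x10000; // illustrative

    unsigned literalFixupKind(unsigned ElfRelocValue) {
      return FirstLiteralRelocationKind + ElfRelocValue;
    }

    unsigned elfRelocType(unsigned FixupKind) {
      assert(FixupKind >= FirstLiteralRelocationKind && "not a literal kind");
      return FixupKind - FirstLiteralRelocationKind;
    }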
@@ -153,6 +162,15 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, Data[Offset + i] |= static_cast((Value >> (i * 8)) & 0xff); } +Optional AMDGPUAsmBackend::getFixupKind(StringRef Name) const { + return StringSwitch>(Name) +#define ELF_RELOC(Name, Value) \ + .Case(#Name, MCFixupKind(FirstLiteralRelocationKind + Value)) +#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def" +#undef ELF_RELOC + .Default(None); +} + const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( MCFixupKind Kind) const { const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { @@ -160,12 +178,21 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, }; + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); return Infos[Kind - FirstTargetFixupKind]; } +bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &, + const MCFixup &Fixup, + const MCValue &) { + return Fixup.getKind() >= FirstLiteralRelocationKind; +} + unsigned AMDGPUAsmBackend::getMinimumNopSize() const { return 4; } @@ -236,5 +263,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(), - getHsaAbiVersion(&STI).getValueOr(0)); + getHsaAbiVersion(&STI).value_or(0)); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index bb2c298c2850..066b36622a16 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -65,7 +65,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL64; } - switch (Fixup.getKind()) { + MCFixupKind Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; + switch (Kind) { default: break; case FK_PCRel_4: return ELF::R_AMDGPU_REL32; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 76663b563150..bd938d829953 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -120,14 +120,6 @@ void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "addr64"); } -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -152,7 +144,7 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, if (IsFlatSeg) { // Unsigned offset printU16ImmDecOperand(MI, OpNo, O); } else { // Signed offset - if (AMDGPU::isGFX10Plus(STI)) { + if (AMDGPU::isGFX10(STI)) { O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm())); } else { O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); @@ -191,6 +183,13 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm()); } +void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << " 
offset:"; + printSMEMOffset(MI, OpNo, STI, O); +} + void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -206,13 +205,15 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { auto Imm = MI->getOperand(OpNo).getImm(); if (Imm & CPol::GLC) - O << " glc"; + O << ((AMDGPU::isGFX940(STI) && + !(MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SMRD)) ? " sc0" + : " glc"); if (Imm & CPol::SLC) - O << " slc"; + O << (AMDGPU::isGFX940(STI) ? " nt" : " slc"); if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI)) O << " dlc"; if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI)) - O << " scc"; + O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc"); if (Imm & ~CPol::ALL) O << " /* unexpected cache policy bit */"; } @@ -309,8 +310,8 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, if (AMDGPU::isGFX10Plus(STI)) { if (Val == UFMT_DEFAULT) return; - if (isValidUnifiedFormat(Val)) { - O << " format:[" << getUnifiedFormatName(Val) << ']'; + if (isValidUnifiedFormat(Val, STI)) { + O << " format:[" << getUnifiedFormatName(Val, STI) << ']'; } else { O << " format:" << Val; } @@ -362,27 +363,26 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { auto Opcode = MI->getOpcode(); auto Flags = MII.get(Opcode).TSFlags; - if (OpNo == 0) { - if (Flags & SIInstrFlags::VOP3) { + if (Flags & SIInstrFlags::VOP3 && Flags & SIInstrFlags::DPP) + O << "_e64_dpp"; + else if (Flags & SIInstrFlags::VOP3) { if (!getVOP3IsSingle(Opcode)) O << "_e64"; - } else if (Flags & SIInstrFlags::DPP) { + } else if (Flags & SIInstrFlags::DPP) O << "_dpp"; - } else if (Flags & SIInstrFlags::SDWA) { + else if (Flags & SIInstrFlags::SDWA) O << "_sdwa"; - } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) || - ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) { + else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) || + ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) O << "_e32"; - } O << " "; } - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); // Print default vcc/vcc_lo operand. 
switch (Opcode) { @@ -400,7 +400,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: - printDefaultVccOperand(1, STI, O); + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: + printDefaultVccOperand(false, STI, O); break; } } @@ -412,7 +421,7 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, else O << "_e32 "; - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, @@ -533,7 +542,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) O << "0.15915494309189532"; else { - assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882); + assert(isUInt<32>(Imm) || isInt<32>(Imm)); // In rare situations, we will have a 32-bit literal in a 64-bit // operand. This is technically allowed for the encoding of s_mov_b64. @@ -548,6 +557,18 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo, if (!Imm) return; + if (AMDGPU::isGFX940(STI)) { + switch (MI->getOpcode()) { + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd: + O << " neg:[" << (Imm & 1) << ',' << ((Imm >> 1) & 1) << ',' + << ((Imm >> 2) & 1) << ']'; + return; + } + } + O << " blgp:" << Imm; } @@ -571,26 +592,73 @@ void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo, O << " abid:" << Imm; } -void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo, +void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O) { - if (OpNo > 0) + if (!FirstOperand) O << ", "; - printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? - AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI); - if (OpNo == 0) + printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] + ? AMDGPU::VCC + : AMDGPU::VCC_LO, + O, MRI); + if (FirstOperand) O << ", "; } +void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " wait_vdst:"; + printU4ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " wait_exp:"; + printU4ImmDecOperand(MI, OpNo, O); + } +} + +bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc, + unsigned OpNo) const { + return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && + (Desc.TSFlags & SIInstrFlags::VOPC) && + (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || + Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)); +} + +// Print default vcc/vcc_lo operand of VOPC. void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - // Print default vcc/vcc_lo operand of VOPC. 
- const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) && + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + // 0, 1 and 2 are the first printed operands in different cases + // If there are printed modifiers, printOperandAndFPInputMods or + // printOperandAndIntInputMods will be called instead + if ((OpNo == 0 || + (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) || + (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) && + (Desc.TSFlags & SIInstrFlags::VOPC) && (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO))) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(true, STI, O); + + printRegularOperand(MI, OpNo, STI, O); +} + +// Print operands after vcc or modifier handling. +void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; @@ -710,12 +778,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_CNDMASK_B32_e32_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11: + case AMDGPU::V_CNDMASK_B32_dpp_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_CNDMASK_B32_dpp8_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7: case AMDGPU::V_CNDMASK_B32_e32_vi: if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1)) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(OpNo == 0, STI, O); break; } @@ -732,6 +812,10 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (needsImpliedVcc(Desc, OpNo)) + printDefaultVccOperand(true, STI, O); + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); // Use 'neg(...)' instead of '-' to avoid ambiguity. 
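For reference, the FP source-modifier scheme used above prints a negated source as "neg(...)" rather than a bare '-' (which could be misread as part of a literal), and an absolute-value source as '|...|'. A minimal standalone C++ sketch of that wrapping, not taken from the patch; the NEG/ABS constants below are illustrative stand-ins for SISrcMods::NEG and SISrcMods::ABS:

#include <cstdio>

// Illustrative stand-ins for the SISrcMods::NEG / SISrcMods::ABS bits.
constexpr unsigned NEG = 1u << 0;
constexpr unsigned ABS = 1u << 1;

// Wrap an operand the way the printer above does: neg(...) and |...|.
static void printFPMods(unsigned Mods, const char *Op) {
  if (Mods & NEG)
    std::printf("neg(");
  if (Mods & ABS)
    std::printf("|");
  std::printf("%s", Op);
  if (Mods & ABS)
    std::printf("|");
  if (Mods & NEG)
    std::printf(")");
}

int main() {
  printFPMods(NEG | ABS, "v0"); // prints: neg(|v0|)
  return 0;
}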
@@ -754,7 +838,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, if (InputModifiers & SISrcMods::ABS) O << '|'; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::ABS) O << '|'; @@ -767,10 +851,14 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (needsImpliedVcc(Desc, OpNo)) + printDefaultVccOperand(true, STI, O); + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::SEXT) O << "sext("; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::SEXT) O << ')'; @@ -784,7 +872,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1)) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(OpNo == 0, STI, O); break; } } @@ -1203,9 +1291,9 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); O << ", "; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); } void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, @@ -1262,15 +1350,16 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, uint16_t MsgId; uint16_t OpId; uint16_t StreamId; - decodeMsg(Imm16, MsgId, OpId, StreamId); + decodeMsg(Imm16, MsgId, OpId, StreamId, STI); - if (isValidMsgId(MsgId, STI) && - isValidMsgOp(MsgId, OpId, STI) && + StringRef MsgName = getMsgName(MsgId, STI); + + if (!MsgName.empty() && isValidMsgOp(MsgId, OpId, STI) && isValidMsgStream(MsgId, OpId, StreamId, STI)) { - O << "sendmsg(" << getMsgName(MsgId); - if (msgRequiresOp(MsgId)) { - O << ", " << getMsgOpName(MsgId, OpId); - if (msgSupportsStream(MsgId, OpId)) { + O << "sendmsg(" << MsgName; + if (msgRequiresOp(MsgId, STI)) { + O << ", " << getMsgOpName(MsgId, OpId, STI); + if (msgSupportsStream(MsgId, OpId, STI)) { O << ", " << StreamId; } } @@ -1423,6 +1512,76 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::DepCtr; + + uint64_t Imm16 = MI->getOperand(OpNo).getImm() & 0xffff; + + bool HasNonDefaultVal = false; + if (isSymbolicDepCtrEncoding(Imm16, HasNonDefaultVal, STI)) { + int Id = 0; + StringRef Name; + unsigned Val; + bool IsDefault; + bool NeedSpace = false; + while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { + if (!IsDefault || !HasNonDefaultVal) { + if (NeedSpace) + O << ' '; + O << Name << '(' << Val << ')'; + NeedSpace = true; + } + } + } else { + O << formatHex(Imm16); + } +} + +void AMDGPUInstPrinter::printDelayFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const char *BadInstId = "/* invalid instid value */"; + static const std::array<const char *, 12> InstIds = { + "NO_DEP", "VALU_DEP_1", "VALU_DEP_2", + "VALU_DEP_3", "VALU_DEP_4", "TRANS32_DEP_1", + "TRANS32_DEP_2", "TRANS32_DEP_3", "FMA_ACCUM_CYCLE_1", + "SALU_CYCLE_1", "SALU_CYCLE_2", "SALU_CYCLE_3"}; + + const char *BadInstSkip =
"/* invalid instskip value */"; + static const std::array InstSkips = { + "SAME", "NEXT", "SKIP_1", "SKIP_2", "SKIP_3", "SKIP_4"}; + + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const char *Prefix = ""; + + unsigned Value = SImm16 & 0xF; + if (Value) { + const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId; + O << Prefix << "instid0(" << Name << ')'; + Prefix = " | "; + } + + Value = (SImm16 >> 4) & 7; + if (Value) { + const char *Name = + Value < InstSkips.size() ? InstSkips[Value] : BadInstSkip; + O << Prefix << "instskip(" << Name << ')'; + Prefix = " | "; + } + + Value = (SImm16 >> 7) & 0xF; + if (Value) { + const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId; + O << Prefix << "instid1(" << Name << ')'; + Prefix = " | "; + } + + if (!*Prefix) + O << "0"; +} + void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Id; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 71db0beba0b6..202edeee3cb3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -15,6 +15,7 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { +class MCInstrDesc; class AMDGPUInstPrinter : public MCInstPrinter { public: @@ -50,7 +51,6 @@ private: void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -64,6 +64,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -116,6 +118,8 @@ private: raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printRegularOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { printOperand(MI, OpNum, STI, O); @@ -172,8 +176,13 @@ private: raw_ostream &O); void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI, + bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const; + void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, unsigned N); @@ -234,6 +243,10 @@ protected: raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + 
void printDepCtr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printDelayFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 53c724f2211a..02c213f90f89 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -14,8 +14,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#include "llvm/ADT/APInt.h" #include "llvm/MC/MCCodeEmitter.h" -#include namespace llvm { @@ -34,46 +34,34 @@ protected: AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} public: + void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + APInt &Inst, APInt &Scratch, + const MCSubtargetInfo &STI) const; - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + virtual void getMachineOpValue(const MCInst &MI, const MCOperand &MO, + APInt &Op, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + virtual void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + virtual void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + virtual void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + virtual void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } - - virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } - - virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + const MCSubtargetInfo &STI) const = 0; + + virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; protected: FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 1f917cd91b47..11fe3f9ef058 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "R600InstPrinter.h" #include "R600MCTargetDesc.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include
"llvm/MC/LaneBitmask.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCELFStreamer.h" @@ -27,6 +28,7 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index e5cce6045c8c..060d4b660632 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,7 +33,6 @@ enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 }; MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, @@ -51,7 +50,6 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM -#define GET_INSTRINFO_SCHED_ENUM #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 7aa5f1abf65b..078133469549 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -17,12 +17,16 @@ #include "Utils/AMDKernelCodeTUtils.h" #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -102,6 +106,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; @@ -112,6 +117,11 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036: AK = GK_GFX1036; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100: AK = GK_GFX1100; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: AK = GK_GFX1101; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: AK = GK_GFX1102; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -165,6 +175,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case 
GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; + case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; @@ -175,6 +186,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033; case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034; case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035; + case GK_GFX1036: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036; + case GK_GFX1100: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100; + case GK_GFX1101: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; + case GK_GFX1102: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; + case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -285,7 +301,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. @@ -439,6 +455,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, @@ -515,8 +533,8 @@ void AMDGPUTargetELFStreamer::EmitNote( if (STI.getTargetTriple().getOS() == Triple::AMDHSA) NoteFlags = ELF::SHF_ALLOC; - S.PushSection(); - S.SwitchSection( + S.pushSection(); + S.switchSection( Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags)); S.emitInt32(NameSZ); // namesz S.emitValue(DescSZ, 4); // descz @@ -525,7 +543,7 @@ void AMDGPUTargetELFStreamer::EmitNote( S.emitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.emitValueToAlignment(4, 0, 1, 0); // padding 0 - S.PopSection(); + S.popSection(); } unsigned AMDGPUTargetELFStreamer::getEFlags() { @@ -691,7 +709,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, OS.emitBytes(VendorName); OS.emitInt8(0); // NULL terminate VendorName OS.emitBytes(ArchName); - OS.emitInt8(0); // NULL terminte ArchName + OS.emitInt8(0); // NULL terminate ArchName }); } @@ -699,9 +717,9 @@ void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); - OS.PushSection(); + OS.pushSection(); OS.emitBytes(StringRef((const char*)&Header, sizeof(Header))); - OS.PopSection(); + OS.popSection(); } void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, @@ -806,7 +824,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. 
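Both EmitCodeEnd implementations above (asm and ELF streamer) pad the end of the code section out to a whole instruction cache line, and the patch doubles the line size on gfx11. A standalone C++ sketch of just that size computation, not part of the patch:

#include <cstdio>
#include <initializer_list>

int main() {
  for (bool IsGFX11Plus : {false, true}) {
    // gfx11 and later use 128-byte instruction cache lines (log2 = 7);
    // earlier targets use 64-byte lines (log2 = 6), as in the hunks above.
    const unsigned Log2CacheLineSize = IsGFX11Plus ? 7 : 6;
    const unsigned CacheLineSize = 1u << Log2CacheLineSize;
    std::printf("gfx11plus=%d -> %u-byte cache line\n", (int)IsGFX11Plus,
                CacheLineSize);
  }
  return 0;
}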
@@ -818,11 +836,11 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { } MCStreamer &OS = getStreamer(); - OS.PushSection(); + OS.pushSection(); OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4); for (unsigned I = 0; I < FillSize; I += 4) OS.emitInt32(Encoded_pad); - OS.PopSection(); + OS.popSection(); return true; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 6fe192e95e72..78eb304fe84f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/EndianStream.h" @@ -84,9 +85,8 @@ enum FCInstr { }; MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI); + return new R600MCCodeEmitter(MCII, *Ctx.getRegisterInfo()); } void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index fc52cb33824f..605ae851378d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -24,7 +24,6 @@ class MCInstrInfo; class MCRegisterInfo; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCInstrInfo *createR600MCInstrInfo(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 77f219aaa3ab..5e67fb5ec876 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -17,10 +17,15 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -34,9 +39,8 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCSubtargetInfo &STI) const; public: - SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - MCContext &ctx) - : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + SIMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : AMDGPUMCCodeEmitter(mcii), MRI(*ctx.getRegisterInfo()) {} SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; @@ -46,42 +50,45 @@ public: const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. - uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; /// Use a fixup to encode the simm16 field for SOPP branch /// instructions.
- unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; private: uint64_t getImplicitOpSelHiEncoding(int Opcode) const; + void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO, + unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, MRI, Ctx); + return new SIMCCodeEmitter(MCII, Ctx); } // Returns the encoding value to use if the given integer is an integer inline @@ -309,8 +316,9 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, computeAvailableFeatures(STI.getFeatureBits())); int Opcode = MI.getOpcode(); - uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); - const MCInstrDesc &Desc = MCII.get(Opcode); + APInt Encoding, Scratch; + getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); unsigned bytes = Desc.getSize(); // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. @@ -322,7 +330,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } for (unsigned i = 0; i < bytes; i++) { - OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } // NSA encoding.
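The emission loop above changed representation (an APInt instead of a uint64_t) but not byte order: bytes still go out least-significant first. A standalone C++ sketch of that little-endian walk, not part of the patch; the 32-bit example word is made up:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Encoding = 0x7E000280; // made-up 4-byte instruction word
  for (unsigned I = 0; I < 4; ++I)
    // Emit the least-significant byte first, as the loop above does.
    std::printf("%02x ", (unsigned)((Encoding >> (8 * I)) & 0xff));
  std::printf("\n"); // prints: 80 02 00 7e
  return 0;
}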
@@ -335,9 +343,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned NumExtraAddrs = srsrc - vaddr0 - 1; unsigned NumPadding = (-NumExtraAddrs) & 3; - for (unsigned i = 0; i < NumExtraAddrs; ++i) - OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), - Fixups, STI)); + for (unsigned i = 0; i < NumExtraAddrs; ++i) { + getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups, + STI); + OS.write((uint8_t)Encoding.getLimitedValue()); + } for (unsigned i = 0; i < NumPadding; ++i) OS.write(0); } @@ -385,34 +395,36 @@ } } -unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); if (MO.isExpr()) { const MCExpr *Expr = MO.getExpr(); MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - return 0; + Op = APInt::getNullValue(96); + } else { + getMachineOpValue(MI, MO, Op, Fixups, STI); } - - return getMachineOpValue(MI, MO, Fixups, STI); } -unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { auto Offset = MI.getOperand(OpNo).getImm(); // VI only supports 20-bit unsigned offsets. assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset)); - return Offset; + Op = Offset; } -unsigned -SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -426,23 +438,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; } - return RegEnc; + Op = RegEnc; + return; } else { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); if (Enc != ~0U && Enc != 255) { - return Enc | SDWA9EncValues::SRC_SGPR_MASK; + Op = Enc | SDWA9EncValues::SRC_SGPR_MASK; + return; } } llvm_unreachable("Unsupported operand kind"); - return 0; } -unsigned -SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -455,13 +468,13 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; } - return RegEnc; + Op = RegEnc; } -unsigned -SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const
MCSubtargetInfo &STI) const { unsigned Reg = MI.getOperand(OpNo).getReg(); uint64_t Enc = MRI.getEncodingValue(Reg); @@ -476,10 +489,11 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; - return Enc; + Op = Enc; } static bool needsPCRel(const MCExpr *Expr) { @@ -505,12 +519,21 @@ static bool needsPCRel(const MCExpr *Expr) { llvm_unreachable("invalid kind"); } -uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); +void SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()){ + Op = MRI.getEncodingValue(MO.getReg()); + return; + } + unsigned OpNo = &MO - MI.begin(); + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); +} + +void SIMCCodeEmitter::getMachineOpValueCommon( + const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { // FIXME: If this is expression is PCRel or not should not depend on what @@ -533,28 +556,22 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - Fixups.push_back( - MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); - } - - // Figure out the operand number, needed for isSrcOperand check - unsigned OpNo = 0; - for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { - if (&MO == &MI.getOperand(OpNo)) - break; + Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); - if (Enc != ~0U) - return Enc; - - } else if (MO.isImm()) - return MO.getImm(); + if (Enc != ~0U) { + Op = Enc; + return; + } + } else if (MO.isImm()) { + Op = MO.getImm(); + return; + } llvm_unreachable("Encoding of this operand type is not supported yet."); - return 0; } #define ENABLE_INSTR_PREDICATE_VERIFIER diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index cf03fd682143..be1addf35012 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -14,6 +14,8 @@ // - MIMGEncGfx90a: encoding for gfx90a for atomics // - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding // - MIMGEncGfx10NSA: gfx10 NSA encoding +// - MIMGEncGfx11Default: gfx11 default (non-NSA) encoding +// - MIMGEncGfx11NSA: gfx11 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; @@ -21,6 +23,8 @@ def MIMGEncGfx8 : MIMGEncoding; def MIMGEncGfx90a : MIMGEncoding; def MIMGEncGfx10Default : MIMGEncoding; def MIMGEncGfx10NSA : MIMGEncoding; +def MIMGEncGfx11Default : MIMGEncoding; +def MIMGEncGfx11NSA : MIMGEncoding; def MIMGEncoding : GenericEnum { let FilterClass = "MIMGEncoding"; @@ -90,11 +94,13 @@ def MIMG { int NOP = -1; } -class mimgopc { - field bits<8> BASE = base; // Opcode for all but atomics +class mimgopc
{ + field bits<8> GFX11 = gfx11; + field bits<8> GFX10M = gfx10m; // GFX10minus for all but atomics field bits<8> VI = vi; // VI is only used for atomic instructions field bits<8> SI = si; // SI is only used for atomic instructions - bit HAS_BASE = !ne(base, MIMG.NOP); + bit HAS_GFX11 = !ne(gfx11, MIMG.NOP); + bit HAS_GFX10M = !ne(gfx10m, MIMG.NOP); bit HAS_VI = !ne(vi, MIMG.NOP); bit HAS_SI = !ne(si, MIMG.NOP); } @@ -207,12 +213,16 @@ class MIMG MIMGEncoding MIMGEncoding; bits<8> VDataDwords; bits<8> VAddrDwords; + + // If NSA is used this counts number of operands VAddrDwords is split into. + bits<8> VAddrOperands; } def MIMGInfoTable : GenericTable { let FilterClass = "MIMG"; let CppTypeName = "MIMGInfo"; - let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", + "VAddrDwords", "VAddrOperands"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; string TypeOf_MIMGEncoding = "MIMGEncoding"; @@ -227,11 +237,12 @@ def getMIMGInfo : SearchIndex { // This class used to use !foldl to memoize the AddrAsmNames list. // It turned out that that was much slower than using !filter. -class MIMGNSAHelper { +class MIMGNSAHelper addr_types=!listsplat(VGPR_32, num_addrs)> { list AddrAsmNames = !foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], !lt(i, num_addrs)), "vaddr" # i); - dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames); + dag AddrIns = !dag(ins, addr_types, AddrAsmNames); string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; int NSA = !if(!le(num_addrs, 1), ?, @@ -247,6 +258,7 @@ class MIMG_gfx6789 op, dag outs, string dns = ""> let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx6; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); } @@ -257,6 +269,7 @@ class MIMG_gfx90a op, dag outs, string dns = ""> let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); } @@ -264,10 +277,11 @@ class MIMG_gfx90a op, dag outs, string dns = ""> // Base class of all non-NSA gfx10 MIMG instructions. class MIMG_gfx10 : MIMG, MIMGe_gfx10 { - let SubtargetPredicate = isGFX10Plus; - let AssemblerPredicate = isGFX10Plus; + let SubtargetPredicate = isGFX10Only; + let AssemblerPredicate = isGFX10Only; let MIMGEncoding = MIMGEncGfx10Default; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); let nsa = 0; @@ -277,10 +291,11 @@ class MIMG_gfx10 // Note that 1-dword addresses always use non-NSA variants. class MIMG_nsa_gfx10 : MIMG, MIMGe_gfx10 { - let SubtargetPredicate = isGFX10Plus; - let AssemblerPredicate = isGFX10Plus; + let SubtargetPredicate = isGFX10Only; + let AssemblerPredicate = isGFX10Only; let MIMGEncoding = MIMGEncGfx10NSA; + let VAddrOperands = num_addrs; MIMGNSAHelper nsah = MIMGNSAHelper; dag AddrIns = nsah.AddrIns; @@ -290,11 +305,45 @@ class MIMG_nsa_gfx10 let nsa = nsah.NSA; } +// Base class of all non-NSA gfx11 MIMG instructions. +class MIMG_gfx11 + : MIMG, MIMGe_gfx11 { + let SubtargetPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Plus; + + let MIMGEncoding = MIMGEncGfx11Default; + let VAddrOperands = 1; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = 0; +} + +// Base class for all NSA MIMG instructions. +// Note that 1-dword addresses always use non-NSA variants. 
+class MIMG_nsa_gfx11 addr_types=[]> + : MIMG, MIMGe_gfx11 { + let SubtargetPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Plus; + + let MIMGEncoding = MIMGEncGfx11NSA; + let VAddrOperands = num_addrs; + + MIMGNSAHelper nsah = !if(!empty(addr_types), + MIMGNSAHelper, + MIMGNSAHelper); + dag AddrIns = nsah.AddrIns; + string AddrAsm = nsah.AddrAsm; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = nsah.NSA; +} + class MIMG_NoSampler_Helper - : MIMG_gfx6789 { + : MIMG_gfx6789 { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -307,7 +356,7 @@ class MIMG_NoSampler_Helper_gfx90a - : MIMG_gfx90a .ret:$vdata), dns> { + : MIMG_gfx90a .ret:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -319,7 +368,7 @@ class MIMG_NoSampler_Helper_gfx90a - : MIMG_gfx10 { + : MIMG_gfx10 { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -331,7 +380,32 @@ class MIMG_NoSampler_gfx10 - : MIMG_nsa_gfx10 { + : MIMG_nsa_gfx10 { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_gfx11 + : MIMG_gfx11 { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_nsa_gfx11 + : MIMG_nsa_gfx11 { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -347,7 +421,7 @@ multiclass MIMG_NoSampler_Src_Helper { let ssamp = 0 in { let VAddrDwords = 1 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V1 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then @@ -356,30 +430,42 @@ multiclass MIMG_NoSampler_Src_Helper ; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_NoSampler_gfx11; + } } let VAddrDwords = 2 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V2_gfx10 : MIMG_NoSampler_gfx10; def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_NoSampler_gfx11; + def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; + } } let VAddrDwords = 3 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V3_gfx10 : MIMG_NoSampler_gfx10; def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_NoSampler_gfx11; + def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; + } } let VAddrDwords = 4 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a ; @@ -387,6 +473,11 @@ multiclass MIMG_NoSampler_Src_Helper ; 
} + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_NoSampler_gfx11; + def _V4_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; + } } } } @@ -420,7 +511,7 @@ class MIMG_Store_Helper - : MIMG_gfx6789 { + : MIMG_gfx6789 { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -433,7 +524,7 @@ class MIMG_Store_Helper_gfx90a - : MIMG_gfx90a { + : MIMG_gfx90a { let InOperandList = !con((ins getLdStRegisterOperand.ret:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, @@ -446,7 +537,7 @@ class MIMG_Store_Helper_gfx90a - : MIMG_gfx10 { + : MIMG_gfx10 { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -458,7 +549,33 @@ class MIMG_Store_gfx10 - : MIMG_nsa_gfx10 { + : MIMG_nsa_gfx10 { + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_gfx11 + : MIMG_gfx11 { + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_nsa_gfx11 + : MIMG_nsa_gfx11 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, @@ -475,39 +592,57 @@ multiclass MIMG_Store_Addr_Helper ; + let hasPostISelHook = 1 in def _V1_gfx90a : MIMG_Store_Helper_gfx90a ; def _V1_gfx10 : MIMG_Store_gfx10 ; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Store_gfx11 ; + } } let VAddrDwords = 2 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2 : MIMG_Store_Helper ; def _V2_gfx90a : MIMG_Store_Helper_gfx90a ; def _V2_gfx10 : MIMG_Store_gfx10 ; def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Store_gfx11 ; + def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 ; + } } let VAddrDwords = 3 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3 : MIMG_Store_Helper ; def _V3_gfx90a : MIMG_Store_Helper_gfx90a ; def _V3_gfx10 : MIMG_Store_gfx10 ; def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Store_gfx11 ; + def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 ; + } } let VAddrDwords = 4 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4 : MIMG_Store_Helper ; def _V4_gfx90a : MIMG_Store_Helper_gfx90a ; def _V4_gfx10 : MIMG_Store_gfx10 ; def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Store_gfx11 ; + def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 ; + } } } } @@ -582,7 +717,7 @@ class MIMG_Atomic_gfx90a - : MIMG_gfx10(op.BASE), (outs DataRC:$vdst), + : MIMG_gfx10(op.GFX10M), (outs DataRC:$vdst), !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -596,7 +731,37 @@ class MIMG_Atomic_gfx10 - : MIMG_nsa_gfx10(op.BASE), (outs DataRC:$vdst), num_addrs, + : MIMG_nsa_gfx10(op.GFX10M), (outs DataRC:$vdst), 
num_addrs, + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; +} + +class MIMG_Atomic_gfx11 + : MIMG_gfx11(op.GFX11), (outs DataRC:$vdst), + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; +} + +class MIMG_Atomic_nsa_gfx11 + : MIMG_nsa_gfx11(op.GFX11), (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -622,11 +787,15 @@ multiclass MIMG_Atomic_Addr_Helper_m ; + let hasPostISelHook = 1 in def _V1_gfx90a : MIMG_Atomic_gfx90a ; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V1_gfx10 : MIMG_Atomic_gfx10 ; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Atomic_gfx11 ; + } } let VAddrDwords = 2 in { if op.HAS_SI then { @@ -636,10 +805,14 @@ multiclass MIMG_Atomic_Addr_Helper_m ; def _V2_gfx90a : MIMG_Atomic_gfx90a ; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2_gfx10 : MIMG_Atomic_gfx10 ; def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Atomic_gfx11 ; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + } } let VAddrDwords = 3 in { if op.HAS_SI then { @@ -649,10 +822,14 @@ multiclass MIMG_Atomic_Addr_Helper_m ; def _V3_gfx90a : MIMG_Atomic_gfx90a ; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3_gfx10 : MIMG_Atomic_gfx10 ; def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Atomic_gfx11 ; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + } } let VAddrDwords = 4 in { if op.HAS_SI then { @@ -662,10 +839,14 @@ multiclass MIMG_Atomic_Addr_Helper_m ; def _V4_gfx90a : MIMG_Atomic_gfx90a ; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4_gfx10 : MIMG_Atomic_gfx10 ; def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Atomic_gfx11 ; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + } } } } @@ -691,7 +872,7 @@ multiclass MIMG_Atomic class MIMG_Sampler_Helper - : MIMG_gfx6789 { + : MIMG_gfx6789 { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -702,7 +883,7 @@ class MIMG_Sampler_Helper - : MIMG_gfx90a.ret:$vdata), dns> { + : MIMG_gfx90a.ret:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -714,7 +895,7 @@ class MIMG_Sampler_gfx90a - : MIMG_gfx10 { + : MIMG_gfx10 { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -727,7 +908,34 @@ class MIMG_Sampler_gfx10 - : MIMG_nsa_gfx10 { + : MIMG_nsa_gfx10 { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, + Dim:$dim, 
UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" + #"$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_gfx11 + : MIMG_gfx11 { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" + #"$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_nsa_gfx11 + : MIMG_nsa_gfx11 { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -823,7 +1031,7 @@ multiclass MIMG_Sampler_Src_Helper { foreach addr = MIMG_Sampler_AddrSizes.MachineInstrs in { let VAddrDwords = addr.NumWords in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V # addr.NumWords : MIMG_Sampler_Helper ; @@ -835,16 +1043,26 @@ multiclass MIMG_Sampler_Src_Helper ; } + if op.HAS_GFX11 then { + def _V # addr.NumWords # _gfx11 + : MIMG_Sampler_gfx11 ; + } } } foreach addr = MIMG_Sampler_AddrSizes.NSAInstrs in { let VAddrDwords = addr.NumWords in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V # addr.NumWords # _nsa_gfx10 : MIMG_Sampler_nsa_gfx10; } + if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then { + def _V # addr.NumWords # _nsa_gfx11 + : MIMG_Sampler_nsa_gfx11; + } } } } @@ -911,10 +1129,17 @@ class MIMG_IntersectRay_Helper { // when we only need 9, 11 or 12 depending on A16 field and ptr size. RegisterClass RegClass = MIMGAddrSize.RegClass; int VAddrDwords = !srl(RegClass.Size, 5); + + int gfx11_nsa_addrs = !if(A16, 4, 5); + RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); + list gfx11_addr_types = + !if(A16, + [node_ptr_type, VGPR_32, VReg_96, VReg_96], + [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } class MIMG_IntersectRay_gfx10 - : MIMG_gfx10 { + : MIMG_gfx10 { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); @@ -924,7 +1149,27 @@ class MIMG_IntersectRay_gfx10 - : MIMG_nsa_gfx10 { + : MIMG_nsa_gfx10 { + let InOperandList = !con(nsah.AddrIns, + (ins SReg_128:$srsrc), + !if(A16, (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", ""); +} + +class MIMG_IntersectRay_gfx11 + : MIMG_gfx11 { + + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), + !if(A16, (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(A16, "$a16", ""); + + let nsa = 0; +} + +class MIMG_IntersectRay_nsa_gfx11 addr_types> + : MIMG_nsa_gfx11 { let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); @@ -936,9 +1181,7 @@ multiclass MIMG_IntersectRay { def "" : MIMGBaseOpcode { let BVH = 1; } - let SubtargetPredicate = HasGFX10_AEncoding, - AssemblerPredicate = HasGFX10_AEncoding, - AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), + let AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), dmask = 0xf, unorm = 1, d16 = 0, @@ -955,142 +1198,183 @@ multiclass MIMG_IntersectRay { def _sa_gfx10 : MIMG_IntersectRay_gfx10 { let VAddrDwords = info.VAddrDwords; } + def _sa_gfx11 : MIMG_IntersectRay_gfx11 { + let VAddrDwords = info.VAddrDwords; + } def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10 
{ let VAddrDwords = info.num_addrs; } + def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11 { + let VAddrDwords = info.num_addrs; + } + } +} + +multiclass MIMG_MSAA_Load { + def "" : MIMGBaseOpcode { + let HasD16 = 1; + let Gather4 = 1; /* for appropriate dmask handling */ + let MSAA = 1; + } + + let BaseOpcode = !cast(NAME), + Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in { + let VDataDwords = 2 in + defm _V2 : MIMG_NoSampler_Src_Helper; /* packed D16 */ + let VDataDwords = 3 in + defm _V3 : MIMG_NoSampler_Src_Helper; /* packed D16 + tfe */ + let VDataDwords = 4 in + defm _V4 : MIMG_NoSampler_Src_Helper; + let VDataDwords = 5 in + defm _V5 : MIMG_NoSampler_Src_Helper; } } //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -defm IMAGE_LOAD : MIMG_NoSampler , "image_load", 1>; -defm IMAGE_LOAD_MIP : MIMG_NoSampler , "image_load_mip", 1, 1>; -defm IMAGE_LOAD_PCK : MIMG_NoSampler , "image_load_pck", 0>; -defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler , "image_load_pck_sgn", 0>; -defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler , "image_load_mip_pck", 0, 1>; -defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler , "image_load_mip_pck_sgn", 0, 1>; -defm IMAGE_STORE : MIMG_Store , "image_store", 1>; -defm IMAGE_STORE_MIP : MIMG_Store , "image_store_mip", 1, 1>; -defm IMAGE_STORE_PCK : MIMG_Store , "image_store_pck", 0>; -defm IMAGE_STORE_MIP_PCK : MIMG_Store , "image_store_mip_pck", 0, 1>; - -defm IMAGE_GET_RESINFO : MIMG_NoSampler , "image_get_resinfo", 0, 1, 1>; - -defm IMAGE_ATOMIC_SWAP : MIMG_Atomic , "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic , "image_atomic_cmpswap", 1>; -defm IMAGE_ATOMIC_ADD : MIMG_Atomic , "image_atomic_add">; -defm IMAGE_ATOMIC_SUB : MIMG_Atomic , "image_atomic_sub">; -defm IMAGE_ATOMIC_RSUB : MIMG_Atomic , "image_atomic_rsub">; -defm IMAGE_ATOMIC_SMIN : MIMG_Atomic , "image_atomic_smin">; -defm IMAGE_ATOMIC_UMIN : MIMG_Atomic , "image_atomic_umin">; -defm IMAGE_ATOMIC_SMAX : MIMG_Atomic , "image_atomic_smax">; -defm IMAGE_ATOMIC_UMAX : MIMG_Atomic , "image_atomic_umax">; -defm IMAGE_ATOMIC_AND : MIMG_Atomic , "image_atomic_and">; -defm IMAGE_ATOMIC_OR : MIMG_Atomic , "image_atomic_or">; -defm IMAGE_ATOMIC_XOR : MIMG_Atomic , "image_atomic_xor">; -defm IMAGE_ATOMIC_INC : MIMG_Atomic , "image_atomic_inc">; -defm IMAGE_ATOMIC_DEC : MIMG_Atomic , "image_atomic_dec">; -defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic , "image_atomic_fcmpswap", 1, 1>; -defm IMAGE_ATOMIC_FMIN : MIMG_Atomic , "image_atomic_fmin", 0, 1>; -defm IMAGE_ATOMIC_FMAX : MIMG_Atomic , "image_atomic_fmax", 0, 1>; - -defm IMAGE_SAMPLE : MIMG_Sampler_WQM , AMDGPUSample>; +let OtherPredicates = [HasImageInsts] in { + +defm IMAGE_LOAD : MIMG_NoSampler , "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler , "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler , "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler , "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler , "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler , "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store , "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store , "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store , "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store , "image_store_mip_pck", 0, 1>; + +defm IMAGE_GET_RESINFO : MIMG_NoSampler , "image_get_resinfo", 0, 1, 1>; + +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic , 
"image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic , "image_atomic_cmpswap", 1>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic , "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic , "image_atomic_sub">; +defm IMAGE_ATOMIC_RSUB : MIMG_Atomic , "image_atomic_rsub">; +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic , "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic , "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic , "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic , "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic , "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic , "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic , "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic , "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic , "image_atomic_dec">; +defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic , "image_atomic_fcmpswap", 1, 1>; +defm IMAGE_ATOMIC_FMIN : MIMG_Atomic , "image_atomic_fmin", 0, 1>; +defm IMAGE_ATOMIC_FMAX : MIMG_Atomic , "image_atomic_fmax", 0, 1>; + +defm IMAGE_SAMPLE : MIMG_Sampler_WQM , AMDGPUSample>; let OtherPredicates = [HasExtendedImageInsts] in { -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM , AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler , AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler , AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler , AMDGPUSample_d, 0, 1>; -defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler , AMDGPUSample_d_cl, 0, 1>; -defm IMAGE_SAMPLE_L : MIMG_Sampler , AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM , AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM , AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler , AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM , AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM , AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler , AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler , AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler , AMDGPUSample_c_d, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler , AMDGPUSample_c_d_cl, 0, 1>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler , AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM , AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM , AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler , AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM , AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM , AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler , AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler , AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler , AMDGPUSample_d_o, 0, 1>; -defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler , AMDGPUSample_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler , AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM , AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM , AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler , AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM , AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM , AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler , AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler , AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler , AMDGPUSample_c_d_o, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler , AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM , AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O 
: MIMG_Sampler_WQM , AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler , AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM , AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM , AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather , AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM , AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM , AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather , AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM , AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM , AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather , AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM , AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM , AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather , AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM , AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM , AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather , AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM , AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather , AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather , AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM , AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM , AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather , AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM , AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM , AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather , AMDGPUSample_c_lz_o>; -//defm IMAGE_GATHER4H : MIMG_Gather_WQM , ?>; - -defm IMAGE_GET_LOD : MIMG_Sampler , AMDGPUSample, 1, 0, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler , AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler , AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler , AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler , AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler , AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler , AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler , AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler , AMDGPUSample_c_cd_cl_o>; -defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler , AMDGPUSample_cd, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler , AMDGPUSample_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler , AMDGPUSample_c_cd, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler , AMDGPUSample_c_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler , AMDGPUSample_cd_o, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_cd_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM , AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler , AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler , AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_L : MIMG_Sampler , AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM , AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM , AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler , AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM , AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM , AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler , AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler , AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler , 
AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM , AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM , AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler , AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM , AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM , AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler , AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler , AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler , AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM , AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM , AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler , AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM , AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM , AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler , AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler , AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler , AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM , AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM , AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler , AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM , AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM , AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather , AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM , AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM , AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather , AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM , AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM , AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather , AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM , AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM , AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather , AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM , AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM , AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather , AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM , AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather , AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather , AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM , AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM , AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather , AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM , AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM , AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather , AMDGPUSample_c_lz_o>; +//defm IMAGE_GATHER4H : MIMG_Gather_WQM , ?>; + +defm IMAGE_GET_LOD : MIMG_Sampler , AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler , AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler , AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler , AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler , AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler , AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler , AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler , AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler , AMDGPUSample_c_cd_cl_o>; } // End OtherPredicates = [HasExtendedImageInsts] -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ 
<"image_sampler", 0x0000007f>; - -let SubtargetPredicate = HasGFX10_AEncoding in -defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler , "image_msaa_load", 1, 0, 0, 1>; -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1>; +let OtherPredicates = [HasExtendedImageInsts,HasG16] in { +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler , AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler , AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler , AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler , AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler , AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler , AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler , AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler , AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler , AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler , AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler , AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler , AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_cl_o, 0, 1>; +} // End OtherPredicates = [HasExtendedImageInsts,HasG16] + +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>; + +let SubtargetPredicate = isGFX10Only, OtherPredicates = [HasGFX10_AEncoding] in +defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler , "image_msaa_load", 1, 0, 0, 1>; + +let OtherPredicates = [HasGFX10_AEncoding] in +defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load , "image_msaa_load">; + +let OtherPredicates = [HasGFX10_AEncoding] in { +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1>; +} // End OtherPredicates = [HasGFX10_AEncoding] + +} // End let OtherPredicates = [HasImageInsts] /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h index 2b483ae63da9..5dfbf8f1ef95 100644 --- a/llvm/lib/Target/AMDGPU/R600.h +++ b/llvm/lib/Target/AMDGPU/R600.h @@ -26,7 +26,7 @@ FunctionPass *createR600EmitClauseMarkers(); FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); -FunctionPass *createAMDGPUCFGStructurizerPass(); +FunctionPass *createR600MachineCFGStructurizerPass(); FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); ModulePass *createR600OpenCLImageTypeLoweringPass(); diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp index c19e3c41485e..afcb6b4d65f8 100644 --- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -111,7 +111,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { MCContext &Context = getObjFileLowering().getContext(); MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + OutStreamer->switchSection(ConfigSection); EmitProgramInfoR600(MF); @@ -120,7 +120,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); + OutStreamer->switchSection(CommentSection); R600MachineFunctionInfo *MFI = MF.getInfo(); OutStreamer->emitRawComment( diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 715fd69fc7ae..2b85df8ac6cf 100644 --- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative manner. /// This pass is merging consecutive CFAlus where applicable. /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// @@ -15,6 +15,7 @@ #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 8a48a67b829c..4bf38a3c6ceb 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -16,6 +16,7 @@ #include "R600.h" #include "R600MachineFunctionInfo.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index b9ca7f928d56..ef67e5c937dc 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -17,6 +17,7 @@ #include "R600.h" #include "R600Defines.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -327,9 +328,9 @@ char R600EmitClauseMarkers::ID = 0; } // end anonymous namespace INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) FunctionPass *llvm::createR600EmitClauseMarkers() { return new R600EmitClauseMarkers(); diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 194879fef53c..ef2d049f9175 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -17,6 +17,8 @@ #include "R600.h" #include "R600Defines.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" 
+#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index abd4086db62c..fd8cecab90da 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -8,6 +8,7 @@ #include "R600FrameLowering.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bd757e9e3d70..bf52f7830ad7 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -42,39 +42,26 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } + for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(Op, VT, MVT::i1, Promote); + setLoadExtAction(Op, VT, MVT::i8, Custom); + setLoadExtAction(Op, VT, MVT::i16, Custom); + } // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32, + MVT::v2i1, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v4i32, + MVT::v4i1, Expand); - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::STORE, {MVT::i8, MVT::i32, MVT::v2i32, MVT::v4i32}, + Custom); setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); @@ -96,55 +83,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); // Set condition code actions - setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setCondCodeAction(ISD::SETLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - - setCondCodeAction(ISD::SETLE, MVT::i32, Expand); - setCondCodeAction(ISD::SETLT, MVT::i32, Expand); - setCondCodeAction(ISD::SETULE, MVT::i32, Expand); - setCondCodeAction(ISD::SETULT, MVT::i32, Expand); - - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - - setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::SETCC, MVT::v2i32, Expand); - - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setCondCodeAction({ISD::SETO, ISD::SETUO, ISD::SETLT, ISD::SETLE, ISD::SETOLT, + ISD::SETOLE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGE, + ISD::SETUGT, ISD::SETULT, ISD::SETULE}, + MVT::f32, Expand); + + setCondCodeAction({ISD::SETLE, ISD::SETLT, ISD::SETULE, ISD::SETULT}, + MVT::i32, Expand); + + setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom); + + setOperationAction(ISD::SETCC, {MVT::v4i32, MVT::v2i32}, Expand); + + setOperationAction(ISD::BR_CC, {MVT::i32, MVT::f32}, Expand); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Expand); - setOperationAction(ISD::SETCC, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - 
setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SETCC, {MVT::i32, MVT::f32}, Expand); + setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT}, {MVT::i1, MVT::i64}, + Custom); - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + setOperationAction(ISD::SELECT, {MVT::i32, MVT::f32, MVT::v2i32, MVT::v4i32}, + Expand); // ADD, SUB overflow. // TODO: turn these into Legal? @@ -158,56 +124,43 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i1, MVT::v4i1}, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i8, MVT::v4i8}, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v4i16}, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i32, MVT::v4i32}, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, + {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, + {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom); // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. 
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, MVT::i32, + Custom); - if (!Subtarget->hasFMA()) { - setOperationAction(ISD::FMA, MVT::f32, Expand); - setOperationAction(ISD::FMA, MVT::f64, Expand); - } + if (!Subtarget->hasFMA()) + setOperationAction(ISD::FMA, {MVT::f32, MVT::f64}, Expand); // FIXME: May need no denormals check setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) { + if (!Subtarget->hasBFI()) // fcopysign can be done in a single instruction with BFI. - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } + setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); if (!Subtarget->hasBCNT(32)) setOperationAction(ISD::CTPOP, MVT::i32, Expand); @@ -229,30 +182,22 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - } + for (MVT VT : ScalarIntVTs) + setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, + Expand); // LLVM will expand these to atomic_cmp_swap(0) // and atomic_swap, respectively. - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + setOperationAction({ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE}, MVT::i32, Expand); // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN}, MVT::Other, + Custom); setSchedulingPreference(Sched::Source); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT, + ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD}); } static inline bool isEOP(MachineBasicBlock::iterator I) { @@ -995,7 +940,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const /// LLVM generates byte-addressed pointers. For indirect addressing, we need to /// convert these pointers to a register index. Each register holds /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the -/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// \p StackWidth, which tells us how many of the 4 sub-registers will be used /// for indirect addressing. 
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, @@ -1100,7 +1045,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); - // TODO: Contrary to the name of the functiom, + // TODO: Contrary to the name of the function, // it also handles sub i32 non-truncating stores (like i1) SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); @@ -1163,9 +1108,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), - MemVT, StoreNode->getAlignment(), - StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); + NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, + StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), + StoreNode->getAAInfo()); StoreNode = cast(NewStore); } @@ -1417,7 +1362,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); SDValue NewLoad = DAG.getExtLoad( ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, - LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); + LoadNode->getAlign(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); @@ -1610,7 +1555,7 @@ static SDValue CompactSwizzlableVector( if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, - // break false dependencies and additionnaly make assembly easier to read. + // break false dependencies and additionally make assembly easier to read. 
RemapSwizzle[i] = 7; // SEL_MASK_WRITE if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { if (C->isZero()) { @@ -1714,7 +1659,7 @@ SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) return SDValue(); - if (LoadNode->getAlignment() < 4) + if (LoadNode->getAlign() < Align(4)) return SDValue(); int ConstantBlock = ConstantAddressBlock(Block); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index aec8b1ae4837..d04ec6490aae 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -18,6 +18,7 @@ #include "R600Defines.h" #include "R600Subtarget.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; @@ -1469,21 +1470,3 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, FlagOp.setImm(InstFlags); } } - -unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( - unsigned Kind) const { - switch (Kind) { - case PseudoSourceValue::Stack: - case PseudoSourceValue::FixedStack: - return AMDGPUAS::PRIVATE_ADDRESS; - case PseudoSourceValue::ConstantPool: - case PseudoSourceValue::GOT: - case PseudoSourceValue::JumpTable: - case PseudoSourceValue::GlobalValueCallEntry: - case PseudoSourceValue::ExternalSymbolCallEntry: - case PseudoSourceValue::TargetCustom: - return AMDGPUAS::CONSTANT_ADDRESS; - } - - llvm_unreachable("Invalid pseudo source kind"); -} diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index bc8a4786df77..f720e4656348 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -320,9 +320,6 @@ public: bool isRegisterLoad(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD; } - - unsigned getAddressSpaceForPseudoSourceKind( - unsigned Kind) const override; }; namespace R600 { diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp new file mode 100644 index 000000000000..0a96c643d9bd --- /dev/null +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -0,0 +1,1640 @@ +//===- R600MachineCFGStructurizer.cpp - CFG Structurizer ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//==-----------------------------------------------------------------------===// + +#include "MCTargetDesc/R600MCTargetDesc.h" +#include "R600.h" +#include "R600RegisterInfo.h" +#include "R600Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "structcfg" + +#define DEFAULT_VEC_SLOTS 8 + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. 
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
+                                 "matched");
+STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
+                             "matched");
+STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
+
+namespace llvm {
+
+void initializeR600MachineCFGStructurizerPass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n");
+
+#define SHOWNEWBLK(b, msg) \
+  LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+             dbgs() << "\n";);
+
+#define SHOWBLK_DETAIL(b, msg) \
+  LLVM_DEBUG(if (b) { \
+    dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+    b->print(dbgs()); \
+    dbgs() << "\n"; \
+  });
+
+#define INVALIDSCCNUM -1
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+class BlockInformation {
+public:
+  bool IsRetired = false;
+  int SccNum = INVALIDSCCNUM;
+
+  BlockInformation() = default;
+};
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+class R600MachineCFGStructurizer : public MachineFunctionPass {
+public:
+  using MBBVector = SmallVector<MachineBasicBlock *, 32>;
+  using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
+  using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>;
+
+  enum PathToKind {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  };
+
+  static char ID;
+
+  R600MachineCFGStructurizer() : MachineFunctionPass(ID) {
+    initializeR600MachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Perform the CFG structurization
+  bool run();
+
+  /// Perform the CFG preparation
+  /// This step will remove every unconditional/dead jump instruction and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // FIXME: This pass causes verification failures.
+    MF.getProperties().set(
+        MachineFunctionProperties::Property::FailsVerification);
+
+    TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
+    TRI = &TII->getRegisterInfo();
+    LLVM_DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    Visited.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    LLVM_DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    LLVM_DEBUG(MF.dump(););
+    return true;
+  }
+
+protected:
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII = nullptr;
+  const R600RegisterInfo *TRI = nullptr;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+ void printOrderedBlocks() const { + size_t i = 0; + for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), + iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { + dbgs() << "BB" << (*iterBlk)->getNumber(); + dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + dbgs() << "\n"; + } else { + dbgs() << " "; + } + } + } + + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { + for (const MachineLoop *L : LoopInfo) + L->print(dbgs()); + } + + // UTILITY FUNCTIONS + int getSCCNum(MachineBasicBlock *MBB) const; + MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; + bool hasBackEdge(MachineBasicBlock *MBB) const; + bool isRetiredBlock(MachineBasicBlock *MBB) const; + bool isActiveLoophead(MachineBasicBlock *MBB) const; + PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry = true) const; + int countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const; + bool needMigrateBlock(MachineBasicBlock *MBB) const; + + // Utility Functions + void reversePredicateSetter(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB); + /// Compute the reversed DFS post order of Blocks + void orderBlocks(MachineFunction *MF); + + // Function originally from CFGStructTraits + void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, + const DebugLoc &DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, + const DebugLoc &DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); + void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, + const DebugLoc &DL); + void insertCondBranchBefore(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL); + + static int getBranchNzeroOpcode(int OldOpcode); + static int getBranchZeroOpcode(int OldOpcode); + static int getContinueNzeroOpcode(int OldOpcode); + static int getContinueZeroOpcode(int OldOpcode); + static MachineBasicBlock *getTrueBranch(MachineInstr *MI); + static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); + static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI); + static bool isCondBranch(MachineInstr *MI); + static bool isUncondBranch(MachineInstr *MI); + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. + /// + /// BB with backward-edge could have move instructions after the branch + /// instruction. Such move instruction "belong to" the loop backward-edge. 
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+                                 MachineBasicBlock *SrcMBB);
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as a terminator;
+  /// fix this and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+                                  MachineBasicBlock *OldMBB,
+                                  MachineBasicBlock *NewBlk);
+
+  static void wrapup(MachineBasicBlock *MBB);
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+
+  /// Return true iff src1Blk->succ_empty() && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo; without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+                                   MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+                       MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+                          MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+                              MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+                              MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+                                   MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+                                   MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+                         MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+                        MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+                            MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+                            MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+                          MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+                           MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+                           MachineBasicBlock *ContMBB);
+
+  /// normalizeInfiniteLoopExit changes
+  /// B1:
+  ///   uncond_br LoopHeader
+  ///
+  /// to
+  /// B1:
+  ///   cond_br 1 LoopHeader dummyExit
+  /// and returns the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+
+  /// Remove duplicate branch instructions in a block.
+ /// For instance + /// B0: + /// cond_br X B1 B2 + /// cond_br X B1 B2 + /// is transformed to + /// B0: + /// cond_br X B1 B2 + void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + + void addDummyExitBlock(SmallVectorImpl &RetMBB); + void removeSuccessor(MachineBasicBlock *MBB); + MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB); + void migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); + void recordSccnum(MachineBasicBlock *MBB, int SCCNum); + void retireBlock(MachineBasicBlock *MBB); + +private: + MBBInfoMap BlockInfoMap; + LoopLandInfoMap LLInfoMap; + std::map Visited; + MachineFunction *FuncRep; + SmallVector OrderedBlks; +}; + +} // end anonymous namespace + +char R600MachineCFGStructurizer::ID = 0; + +int R600MachineCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return INVALIDSCCNUM; + return (*It).second->SccNum; +} + +MachineBasicBlock *R600MachineCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) + const { + LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); + if (It == LLInfoMap.end()) + return nullptr; + return (*It).second; +} + +bool R600MachineCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + if (!LoopRep) + return false; + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + return MBB->isSuccessor(LoopHeader); +} + +bool R600MachineCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return false; + return (*It).second->IsRetired; +} + +bool R600MachineCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + while (LoopRep && LoopRep->getHeader() == MBB) { + MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); + if(!LoopLand) + return true; + if (!isRetiredBlock(LoopLand)) + return true; + LoopRep = LoopRep->getParentLoop(); + } + return false; +} + +R600MachineCFGStructurizer::PathToKind R600MachineCFGStructurizer::singlePathTo( + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry) const { + assert(DstMBB); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + while (SrcMBB && SrcMBB->succ_size() == 1) { + SrcMBB = *SrcMBB->succ_begin(); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + if (!AllowSideEntry && SrcMBB->pred_size() > 1) + return Not_SinglePath; + } + if (SrcMBB && SrcMBB->succ_size()==0) + return SinglePath_NotInPath; + return Not_SinglePath; +} + +int R600MachineCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const { + int Count = 0; + while (It != E) { + if (!isRetiredBlock(*It)) + ++Count; + ++It; + } + return Count; +} + +bool R600MachineCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { + unsigned BlockSizeThreshold = 30; + unsigned CloneInstrThreshold = 100; + bool MultiplePreds = MBB && (MBB->pred_size() > 1); + + if(!MultiplePreds) + return false; + unsigned BlkSize = MBB->size(); + return ((BlkSize > BlockSizeThreshold) && + (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); +} + +void R600MachineCFGStructurizer::reversePredicateSetter( + MachineBasicBlock::iterator I, MachineBasicBlock &MBB) { + assert(I.isValid() && "Expected valid iterator"); + for (;; --I) { + if (I == MBB.end()) + continue; 
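+    // Scan backwards from I for the PRED_X that computes the branch
+    // predicate, then invert its condition code (SETE <-> SETNE) below.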
+ if (I->getOpcode() == R600::PRED_X) { + switch (I->getOperand(2).getImm()) { + case R600::PRED_SETE_INT: + I->getOperand(2).setImm(R600::PRED_SETNE_INT); + return; + case R600::PRED_SETNE_INT: + I->getOperand(2).setImm(R600::PRED_SETE_INT); + return; + case R600::PRED_SETE: + I->getOperand(2).setImm(R600::PRED_SETNE); + return; + case R600::PRED_SETNE: + I->getOperand(2).setImm(R600::PRED_SETE); + return; + default: + llvm_unreachable("PRED_X Opcode invalid!"); + } + } + } +} + +void R600MachineCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, + int NewOpcode, const DebugLoc &DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->push_back(MI); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(MI); +} + +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, + int NewOpcode, + const DebugLoc &DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + if (!MBB->empty()) + MBB->insert(MBB->begin(), MI); + else + MBB->push_back(MI); + SHOWNEWINSTR(MI); + return MI; +} + +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore( + MachineBasicBlock::iterator I, int NewOpcode) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineInstr *NewMBB = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->insert(I, NewMBB); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(NewMBB); + return NewMBB; +} + +void R600MachineCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->insert(I, NewMI); + MachineInstrBuilder MIB(*MF, NewMI); + MIB.addReg(OldMI->getOperand(1).getReg(), false); + SHOWNEWINSTR(NewMI); + //erase later oldInstr->eraseFromParent(); +} + +void R600MachineCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL) { + MachineFunction *MF = blk->getParent(); + MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + //insert before + blk->insert(I, NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +int R600MachineCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int R600MachineCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int R600MachineCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int R600MachineCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; + default: 
llvm_unreachable("internal error"); + } + return -1; +} + +MachineBasicBlock *R600MachineCFGStructurizer::getTrueBranch(MachineInstr *MI) { + return MI->getOperand(0).getMBB(); +} + +void R600MachineCFGStructurizer::setTrueBranch(MachineInstr *MI, + MachineBasicBlock *MBB) { + MI->getOperand(0).setMBB(MBB); +} + +MachineBasicBlock * +R600MachineCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI) { + assert(MBB->succ_size() == 2); + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + MachineBasicBlock::succ_iterator It = MBB->succ_begin(); + MachineBasicBlock::succ_iterator Next = It; + ++Next; + return (*It == TrueBranch) ? *Next : *It; +} + +bool R600MachineCFGStructurizer::isCondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case R600::JUMP_COND: + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return true; + default: + return false; + } + return false; +} + +bool R600MachineCFGStructurizer::isUncondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case R600::JUMP: + case R600::BRANCH: + return true; + default: + return false; + } + return false; +} + +DebugLoc R600MachineCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineInstr &MI : *MBB) + if (MI.getDebugLoc()) + DL = MI.getDebugLoc(); + return DL; +} + +MachineInstr *R600MachineCFGStructurizer::getNormalBlockBranchInstr( + MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + MachineInstr *MI = &*It; + if (MI && (isCondBranch(MI) || isUncondBranch(MI))) + return MI; + return nullptr; +} + +MachineInstr *R600MachineCFGStructurizer::getLoopendBlockBranchInstr( + MachineBasicBlock *MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); + It != E; ++It) { + // FIXME: Simplify + MachineInstr *MI = &*It; + if (MI) { + if (isCondBranch(MI) || isUncondBranch(MI)) + return MI; + else if (!TII->isMov(MI->getOpcode())) + break; + } + } + return nullptr; +} + +MachineInstr *R600MachineCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *instr = &(*It); + if (instr->getOpcode() == R600::RETURN) + return instr; + } + return nullptr; +} + +bool R600MachineCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { + MachineInstr *MI = getReturnInstr(MBB); + bool IsReturn = MBB->succ_empty(); + if (MI) + assert(IsReturn); + else if (IsReturn) + LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber() + << " is return block without RETURN instr\n";); + return IsReturn; +} + +void R600MachineCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + for (MachineBasicBlock *Succ : SrcMBB->successors()) + DstMBB->addSuccessor(Succ); // *iter's predecessor is also taken care of +} + +MachineBasicBlock *R600MachineCFGStructurizer::clone(MachineBasicBlock *MBB) { + MachineFunction *Func = MBB->getParent(); + MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); + Func->push_back(NewMBB); //insert to function + for (const MachineInstr &It : *MBB) + NewMBB->push_back(Func->CloneMachineInstr(&It)); + return NewMBB; +} + +void R600MachineCFGStructurizer::replaceInstrUseOfBlockWith( + MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, + MachineBasicBlock *NewBlk) { + MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); + if (BranchMI && isCondBranch(BranchMI) && + 
getTrueBranch(BranchMI) == OldMBB)
+    setTrueBranch(BranchMI, NewBlk);
+}
+
+void R600MachineCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+  //collect continue right before endloop
+  SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
+  MachineBasicBlock::iterator Pre = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator It = Pre;
+  while (It != E) {
+    if (Pre->getOpcode() == R600::CONTINUE
+        && It->getOpcode() == R600::ENDLOOP)
+      ContInstr.push_back(&*Pre);
+    Pre = It;
+    ++It;
+  }
+
+  //delete continue right before endloop
+  for (unsigned i = 0; i < ContInstr.size(); ++i)
+    ContInstr[i]->eraseFromParent();
+
+  // TODO to fix up jump table so later phase won't be confused.  if
+  // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+  // there isn't such an interface yet.  alternatively, replace all the other
+  // blocks in the jump table with the entryBlk //}
+}
+
+bool R600MachineCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::prepare\n";);
+
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loops that don't have one
+  for (MachineLoop *LoopRep : *MLI) {
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+  for (MachineBasicBlock *MBB : OrderedBlks) {
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
+  }
+
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool R600MachineCFGStructurizer::run() {
+  //Assume reducible CFG...
+  LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::run\n");
+
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
+
+  do {
+    ++NumIter;
+    LLVM_DEBUG(dbgs() << "numIter = " << NumIter
+                      << ", numRemainedBlk = " << NumRemainedBlk << "\n";);
+    (void)NumIter;
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = nullptr;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int SccNumIter;  // Number of iterations in this SCC.
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+                   dbgs() << "\n";);
+      }
+
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
+
+      ++It;
+
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
+        // Just finish one scc.
+ ++SccNumIter; + int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { + LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n";); + (void)SccNumIter; + ContNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { + SccNumBlk = sccRemainedNumBlk; + It = SccBeginIter; + ContNextScc = false; + LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n';); + } else { + // Finish the current scc. + ContNextScc = true; + } + } else { + // Continue on next component in the current scc. + ContNextScc = false; + } + + if (ContNextScc) + SccBeginMBB = nullptr; + } //while, "one iteration" over the function. + + MachineBasicBlock *EntryMBB = + *GraphTraits::nodes_begin(FuncRep); + if (EntryMBB->succ_empty()) { + Finish = true; + LLVM_DEBUG(dbgs() << "Reduce to one block\n";); + } else { + int NewnumRemainedBlk + = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); + // consider cloned blocks ?? + if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { + MakeProgress = true; + NumRemainedBlk = NewnumRemainedBlk; + } else { + MakeProgress = false; + LLVM_DEBUG(dbgs() << "No progress\n";); + } + } + } while (!Finish && MakeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + wrapup(*GraphTraits::nodes_begin(FuncRep)); + + // Detach retired Block, release memory. + for (auto &It : BlockInfoMap) { + if (It.second && It.second->IsRetired) { + assert((It.first)->getNumber() != -1); + LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); + It.first->eraseFromParent(); // Remove from the parent Function. 
+ } + delete It.second; + } + BlockInfoMap.clear(); + LLInfoMap.clear(); + + if (!Finish) { + LLVM_DEBUG(FuncRep->viewCFG()); + report_fatal_error("IRREDUCIBLE_CFG"); + } + + return true; +} + +void R600MachineCFGStructurizer::orderBlocks(MachineFunction *MF) { + int SccNum = 0; + for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); + ++It, ++SccNum) { + const std::vector &SccNext = *It; + for (MachineBasicBlock *MBB : SccNext) { + OrderedBlks.push_back(MBB); + recordSccnum(MBB, SccNum); + } + } + + // walk through all the block in func to check for unreachable + for (auto *MBB : nodes(MF)) { + SccNum = getSCCNum(MBB); + if (SccNum == INVALIDSCCNUM) + dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; + } +} + +int R600MachineCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { + int NumMatch = 0; + int CurMatch; + + LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";); + + while ((CurMatch = patternMatchGroup(MBB)) > 0) + NumMatch += CurMatch; + + LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n";); + + return NumMatch; +} + +int R600MachineCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { + int NumMatch = 0; + NumMatch += loopendPatternMatch(); + NumMatch += serialPatternMatch(MBB); + NumMatch += ifPatternMatch(MBB); + return NumMatch; +} + +int R600MachineCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { + if (MBB->succ_size() != 1) + return 0; + + MachineBasicBlock *childBlk = *MBB->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) + return 0; + + mergeSerialBlock(MBB, childBlk); + ++numSerialPatternMatch; + return 1; +} + +int R600MachineCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { + //two edges + if (MBB->succ_size() != 2) + return 0; + if (hasBackEdge(MBB)) + return 0; + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + if (!BranchMI) + return 0; + + assert(isCondBranch(BranchMI)); + int NumMatch = 0; + + MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); + NumMatch += serialPatternMatch(TrueMBB); + NumMatch += ifPatternMatch(TrueMBB); + MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); + NumMatch += serialPatternMatch(FalseMBB); + NumMatch += ifPatternMatch(FalseMBB); + MachineBasicBlock *LandBlk; + int Cloned = 0; + + assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); + // TODO: Simplify + if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 + && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { + // Diamond pattern + LandBlk = *TrueMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { + // Triangle pattern, false is empty + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && *FalseMBB->succ_begin() == TrueMBB) { + // Triangle pattern, true is empty + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end(), *MBB); + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { + LandBlk = *FalseMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 + && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { + LandBlk = *TrueMBB->succ_begin(); + } else { + return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new 
+ // reduction process.
+ if (LandBlk &&
+ ((TrueMBB && TrueMBB->pred_size() > 1)
+ || (FalseMBB && FalseMBB->pred_size() > 1))) {
+ Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
+ }
+
+ if (TrueMBB && TrueMBB->pred_size() > 1) {
+ TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+ ++Cloned;
+ }
+
+ if (FalseMBB && FalseMBB->pred_size() > 1) {
+ FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+ ++Cloned;
+ }
+
+ mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
+
+ ++numIfPatternMatch;
+
+ numClonedBlock += Cloned;
+
+ return 1 + Cloned + NumMatch;
+}
+
+int R600MachineCFGStructurizer::loopendPatternMatch() {
+ std::deque<MachineLoop *> NestedLoops;
+ for (auto &It : *MLI)
+ for (MachineLoop *ML : depth_first(It))
+ NestedLoops.push_front(ML);
+
+ if (NestedLoops.empty())
+ return 0;
+
+ // Process nested loops outside->inside (we did push_front),
+ // so a "continue" to an outside loop won't be mistaken as a "break"
+ // of the current loop.
+ int Num = 0;
+ for (MachineLoop *ExaminedLoop : NestedLoops) {
+ if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
+ continue;
+ LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+ int NumBreak = mergeLoop(ExaminedLoop);
+ if (NumBreak == -1)
+ break;
+ Num += NumBreak;
+ }
+ return Num;
+}
+
+int R600MachineCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+ MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+ MBBVector ExitingMBBs;
+ LoopRep->getExitingBlocks(ExitingMBBs);
+ assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+ LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size()
+ << " exiting blocks\n";);
+ // We assume a single ExitBlk.
+ MBBVector ExitBlks;
+ LoopRep->getExitBlocks(ExitBlks);
+ SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+ for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+ ExitBlkSet.insert(ExitBlks[i]);
+ assert(ExitBlkSet.size() == 1);
+ MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+ assert(ExitBlk && "Loop has several exit blocks");
+ MBBVector LatchBlks;
+ for (auto *LB : inverse_children<MachineBasicBlock *>(LoopHeader))
+ if (LoopRep->contains(LB))
+ LatchBlks.push_back(LB);
+
+ for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+ mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+ for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+ settleLoopcontBlock(LatchBlks[i], LoopHeader);
+ int Match = 0;
+ do {
+ Match = 0;
+ Match += serialPatternMatch(LoopHeader);
+ Match += ifPatternMatch(LoopHeader);
+ } while (Match > 0);
+ mergeLooplandBlock(LoopHeader, ExitBlk);
+ MachineLoop *ParentLoop = LoopRep->getParentLoop();
+ if (ParentLoop)
+ MLI->changeLoopFor(LoopHeader, ParentLoop);
+ else
+ MLI->removeBlock(LoopHeader);
+ Visited[LoopRep] = true;
+ return 1;
+}
+
+bool R600MachineCFGStructurizer::isSameloopDetachedContbreak(
+ MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+ if (Src1MBB->succ_empty()) {
+ MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+ if (LoopRep && LoopRep == MLI->getLoopFor(Src2MBB)) {
+ MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+ if (TheEntry) {
+ LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB"
+ << Src1MBB->getNumber() << " src2 = BB"
+ << Src2MBB->getNumber() << "\n";);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+int R600MachineCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+ MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+ int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+ if (Num == 0) {
+ LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk"
+ << "\n";);
+ Num =
handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); + } + return Num; +} + +int R600MachineCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = 0; + MachineBasicBlock *DownBlk; + + //trueBlk could be the common post dominator + DownBlk = TrueMBB; + + LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() << " false = BB" + << FalseMBB->getNumber() << "\n";); + + while (DownBlk) { + LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber();); + + if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { + LLVM_DEBUG(dbgs() << " working\n";); + + Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); + Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); + + numClonedBlock += Num; + Num += serialPatternMatch(*HeadMBB->succ_begin()); + Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); + Num += ifPatternMatch(HeadMBB); + assert(Num > 0); + + break; + } + LLVM_DEBUG(dbgs() << " not working\n";); + DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; + } // walk down the postDomTree + + return Num; +} + +#ifndef NDEBUG +void R600MachineCFGStructurizer::showImproveSimpleJumpintoIf( + MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { + dbgs() << "head = BB" << HeadMBB->getNumber() + << " size = " << HeadMBB->size(); + if (Detail) { + dbgs() << "\n"; + HeadMBB->print(dbgs()); + dbgs() << "\n"; + } + + if (TrueMBB) { + dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " + << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + TrueMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (FalseMBB) { + dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " + << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + FalseMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (LandMBB) { + dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " + << LandMBB->size() << " numPred = " << LandMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + LandMBB->print(dbgs()); + dbgs() << "\n"; + } + } + + dbgs() << "\n"; +} +#endif + +int R600MachineCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr) { + bool MigrateTrue = false; + bool MigrateFalse = false; + + MachineBasicBlock *LandBlk = *LandMBBPtr; + + assert((!TrueMBB || TrueMBB->succ_size() <= 1) + && (!FalseMBB || FalseMBB->succ_size() <= 1)); + + if (TrueMBB == FalseMBB) + return 0; + + MigrateTrue = needMigrateBlock(TrueMBB); + MigrateFalse = needMigrateBlock(FalseMBB); + + if (!MigrateTrue && !MigrateFalse) + return 0; + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. 
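The needMigrateBlock helper used above is not part of this hunk. For illustration only, a check of roughly this shape decides whether cloning a block for its extra predecessors is worth the cost; the thresholds and exact heuristic here are assumptions, not the pass's actual values.

#include "llvm/CodeGen/MachineBasicBlock.h"

// Sketch: migration is only interesting for blocks with several
// predecessors, and only when the duplication cost would be large.
static bool needMigrateBlockSketch(const llvm::MachineBasicBlock *MBB) {
  const unsigned BlockSizeThreshold = 30;   // assumed value
  const unsigned CloneInstrThreshold = 100; // assumed value
  if (!MBB || MBB->pred_size() <= 1)
    return false; // nothing to migrate with at most one predecessor
  unsigned BlkSize = MBB->size();
  // Cloning cost grows with block size times the number of extra preds.
  return BlkSize > BlockSizeThreshold &&
         BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold;
}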
+ if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+ MigrateTrue = true;
+ if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+ MigrateFalse = true;
+
+ LLVM_DEBUG(
+ dbgs() << "before improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
+
+ // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+ //
+ // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+ //                       {initReg = 0; org falseBlk branch }
+ //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+ //      => org landBlk
+ // if landBlk->pred_size() > 2, put the above if-else inside
+ // if (initReg != 2) {...}
+ //
+ // Add initReg = initVal to headBlk.
+
+ const TargetRegisterClass *I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+ if (!MigrateTrue || !MigrateFalse) {
+ // XXX: We have an opportunity here to optimize the "branch into if"
+ // case. Branch into if looks like this:
+ //                      entry
+ //                     /     |
+ //          diamond_head   branch_from
+ //            /      \          |
+ //  diamond_false   diamond_true
+ //            \      /
+ //              done
+ //
+ // The diamond_head block begins the "if" and the diamond_true block
+ // is the block being "branched into".
+ //
+ // If MigrateTrue is true, then TrueBB is the block being "branched into"
+ // and if MigrateFalse is true, then FalseBB is the block being
+ // "branched into".
+ //
+ // Here is the pseudo code for how I think the optimization should work:
+ // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+ // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+ // 3. Move the branch instruction from diamond_head into its own basic
+ //    block (new_block).
+ // 4. Add an unconditional branch from diamond_head to new_block.
+ // 5. Replace the branch instruction in branch_from with an unconditional
+ //    branch to new_block. If branch_from has multiple predecessors, then
+ //    we need to replace the True/False block in the branch
+ //    instruction instead of replacing it.
+ // 6. Change the condition of the branch instruction in new_block from
+ //    COND to (COND || GPR0).
+ //
+ // In order to insert these MOV instructions, we will need to use the
+ // RegisterScavenger. Usually liveness stops being tracked during
+ // the late machine optimization passes, however if we implement
+ // bool TargetRegisterInfo::requiresRegisterScavenging(
+ //     const MachineFunction &MF)
+ // and have it return true, liveness will be tracked correctly
+ // by generic optimization passes. We will also need to make sure that
+ // all of our target-specific passes that run after regalloc and before
+ // the CFGStructurizer track liveness and we will need to modify this pass
+ // to correctly track liveness.
+ //
+ // After the above changes, the new CFG should look like this:
+ //                      entry
+ //                     /     |
+ //          diamond_head   branch_from
+ //                     \    /
+ //                    new_block
+ //                     /    |
+ //  diamond_false   diamond_true
+ //            \      /
+ //              done
+ //
+ // Without this optimization, we are forced to duplicate the diamond_true
+ // block and we will end up with a CFG like this:
+ //
+ //                      entry
+ //                     /     |
+ //          diamond_head   branch_from
+ //            /      \          |
+ //  diamond_false   diamond_true   diamond_true (duplicate)
+ //            \      /                  |
+ //              done -------------------|
+ //
+ // Duplicating diamond_true can be very costly especially if it has a
+ // lot of instructions.
+ return 0;
+ }
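For illustration only, a minimal sketch of steps 1 and 2 of the pseudo code above: materializing the 0/1 flag in the two predecessors, right before each block's terminator. The opcode and flag register are placeholders — a real implementation would take a target MOV opcode and a register obtained from the RegisterScavenger, as the comment explains.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Sketch of steps 1-2: Flag = 0 on the diamond_head path, Flag = 1 on the
// branch_from path, inserted right before each block's branch.
static void emitBranchIntoIfFlags(llvm::MachineBasicBlock &DiamondHead,
                                  llvm::MachineBasicBlock &BranchFrom,
                                  llvm::Register Flag, unsigned MovOpc,
                                  const llvm::TargetInstrInfo *TII) {
  llvm::BuildMI(DiamondHead, DiamondHead.getFirstTerminator(),
                llvm::DebugLoc(), TII->get(MovOpc), Flag)
      .addImm(0); // step 1
  llvm::BuildMI(BranchFrom, BranchFrom.getFirstTerminator(),
                llvm::DebugLoc(), TII->get(MovOpc), Flag)
      .addImm(1); // step 2
}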
+
+ int NumNewBlk = 0;
+
+ bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
+
+ // Insert R600::ENDIF to avoid the special case "input landBlk == NULL".
+ MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF);
+
+ if (LandBlkHasOtherPred) {
+ report_fatal_error("Extra register needed to handle CFG");
+ Register CmpResReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ report_fatal_error("Extra compare instruction needed to handle CFG");
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
+ CmpResReg, DebugLoc());
+ }
+
+ // XXX: We are running this after RA, so creating virtual registers will
+ // cause an assertion failure in the PostRA scheduling pass.
+ Register InitReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
+ DebugLoc());
+
+ if (MigrateTrue) {
+ migrateInstruction(TrueMBB, LandBlk, I);
+ // Need to unconditionally insert the assignment to ensure that a path
+ // from a predecessor other than headBlk has a valid value in initReg if
+ // (initVal != 1).
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+ insertInstrBefore(I, R600::ELSE);
+
+ if (MigrateFalse) {
+ migrateInstruction(FalseMBB, LandBlk, I);
+ // Need to unconditionally insert the assignment to ensure that a path
+ // from a predecessor other than headBlk has a valid value in initReg if
+ // (initVal != 0).
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+
+ if (LandBlkHasOtherPred) {
+ // Add endif.
+ insertInstrBefore(I, R600::ENDIF);
+
+ // Put initReg = 2 in the other predecessors of landBlk.
+ for (MachineBasicBlock *MBB : LandBlk->predecessors())
+ if (MBB != TrueMBB && MBB != FalseMBB)
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+ LLVM_DEBUG(
+ dbgs() << "result from improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
+
+ // Update landBlk.
+ *LandMBBPtr = LandBlk;
+
+ return NumNewBlk;
+}
+
+void R600MachineCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+ MachineBasicBlock *SrcMBB) {
+ LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
+ << SrcMBB->getNumber() << "\n";);
+ DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
+
+ DstMBB->removeSuccessor(SrcMBB, true);
+ cloneSuccessorList(DstMBB, SrcMBB);
+
+ removeSuccessor(SrcMBB);
+ MLI->removeBlock(SrcMBB);
+ retireBlock(SrcMBB);
+}
+
+void R600MachineCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+ MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+ MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+ assert(TrueMBB);
+ LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ ";
+ if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs()
+ << " } else ";
+ dbgs() << "{ "; if (FalseMBB) {
+ dbgs() << "BB" << FalseMBB->getNumber();
+ } dbgs() << " }\n ";
+ dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else {
+ dbgs() << "BB" << LandMBB->getNumber();
+ } dbgs() << "\n";);
+
+ int OldOpcode = BranchMI->getOpcode();
+ DebugLoc BranchDL = BranchMI->getDebugLoc();
+
+// Transform to:
+// if cond
+//   trueBlk
+// else
+//   falseBlk
+// endif
+// landBlk
+
+ MachineBasicBlock::iterator I = BranchMI;
+ insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+ BranchDL);
+
+ if (TrueMBB) {
+ MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+ MBB->removeSuccessor(TrueMBB, true);
+ if (LandMBB && TrueMBB->succ_size() != 0)
+ TrueMBB->removeSuccessor(LandMBB, true);
+ retireBlock(TrueMBB);
+ MLI->removeBlock(TrueMBB);
+ }
+
+ if (FalseMBB) {
+ insertInstrBefore(I, R600::ELSE);
+ MBB->splice(I, FalseMBB, FalseMBB->begin(),
+ FalseMBB->end());
+ MBB->removeSuccessor(FalseMBB, true);
+ if (LandMBB && !FalseMBB->succ_empty())
+ FalseMBB->removeSuccessor(LandMBB, true);
+ retireBlock(FalseMBB);
+ MLI->removeBlock(FalseMBB);
+ }
+ insertInstrBefore(I, R600::ENDIF);
+
+ BranchMI->eraseFromParent();
+
+ if (LandMBB && TrueMBB && FalseMBB)
+ MBB->addSuccessor(LandMBB);
+}
+
+void R600MachineCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+ MachineBasicBlock *LandMBB) {
+ LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
+
+ insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc());
+ insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc());
+ DstBlk->replaceSuccessor(DstBlk, LandMBB);
+}
+
+void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+ MachineBasicBlock *LandMBB) {
+ LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
+ << ExitingMBB->getNumber() << " land = BB"
+ << LandMBB->getNumber() << "\n";);
+ MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+ assert(BranchMI && isCondBranch(BranchMI));
+ DebugLoc DL = BranchMI->getDebugLoc();
+ MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+ MachineBasicBlock::iterator I = BranchMI;
+ if (TrueBranch != LandMBB)
+ reversePredicateSetter(I, *I->getParent());
+ insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL);
+ insertInstrBefore(I, R600::BREAK);
+ insertInstrBefore(I, R600::ENDIF);
+ // Now the branch instruction can be erased safely.
+ BranchMI->eraseFromParent();
+ // Now take care of successors; retire blocks.
+ ExitingMBB->removeSuccessor(LandMBB, true);
+}
+
+void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+ MachineBasicBlock *ContMBB) {
+ LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+ << ContingMBB->getNumber() << ", cont = BB"
+ << ContMBB->getNumber() << "\n";);
+
+ MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+ if (MI) {
+ assert(isCondBranch(MI));
+ MachineBasicBlock::iterator I = MI;
+ MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+ int OldOpcode = MI->getOpcode();
+ DebugLoc DL = MI->getDebugLoc();
+
+ bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+ if (!UseContinueLogical) {
+ int BranchOpcode =
+ TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+ getBranchZeroOpcode(OldOpcode);
+ insertCondBranchBefore(I, BranchOpcode, DL);
+ // insertEnd ensures phi-moves, if they exist, go before the continue-instr.
+ insertInstrEnd(ContingMBB, R600::CONTINUE, DL);
+ insertInstrEnd(ContingMBB, R600::ENDIF, DL);
+ } else {
+ int BranchOpcode =
+ TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+ getContinueZeroOpcode(OldOpcode);
+ insertCondBranchBefore(I, BranchOpcode, DL);
+ }
+
+ MI->eraseFromParent();
+ } else {
+ // If we've arrived here, then we've already erased the branch instruction.
+ // Travel back up the basic block to find the last reference of our debug
+ // location; we've just inserted that reference here, so it should be
+ // representative. insertEnd ensures phi-moves, if they exist, go before
+ // the continue-instr.
+ insertInstrEnd(ContingMBB, R600::CONTINUE,
+ getLastDebugLocInBB(ContingMBB));
+ }
+}
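The getLastDebugLocInBB helper called above is defined elsewhere in this file and not shown in this hunk. A minimal sketch of what such a helper might look like — scan the block and keep the last valid debug location seen:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DebugLoc.h"

static llvm::DebugLoc lastDebugLocSketch(llvm::MachineBasicBlock *MBB) {
  llvm::DebugLoc DL;
  for (llvm::MachineInstr &MI : *MBB)
    if (MI.getDebugLoc())
      DL = MI.getDebugLoc(); // keep the last valid location seen
  return DL;
}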
+
+int R600MachineCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+ MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+ int Cloned = 0;
+ assert(PreMBB->isSuccessor(SrcMBB));
+ while (SrcMBB && SrcMBB != DstMBB) {
+ assert(SrcMBB->succ_size() == 1);
+ if (SrcMBB->pred_size() > 1) {
+ SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+ ++Cloned;
+ }
+
+ PreMBB = SrcMBB;
+ SrcMBB = *SrcMBB->succ_begin();
+ }
+
+ return Cloned;
+}
+
+MachineBasicBlock *
+R600MachineCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+ MachineBasicBlock *PredMBB) {
+ assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk");
+
+ MachineBasicBlock *CloneMBB = clone(MBB); // Clone instructions.
+ replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
+ // (srcBlk, oldBlk, newBlk)
+
+ PredMBB->replaceSuccessor(MBB, CloneMBB);
+
+ // Add all successors to CloneMBB.
+ cloneSuccessorList(CloneMBB, MBB);
+
+ numClonedInstr += MBB->size();
+
+ LLVM_DEBUG(dbgs() << "Cloned block: "
+ << "BB" << MBB->getNumber() << ", size " << MBB->size()
+ << "\n";);
+
+ SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
+
+ return CloneMBB;
+}
+
+void R600MachineCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+ MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator SpliceEnd;
+ // Look for the input branch instruction, not the AMDGPU branch instruction.
+ MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+ if (!BranchMI) {
+ LLVM_DEBUG(dbgs() << "migrateInstruction doesn't see a branch instr\n";);
+ SpliceEnd = SrcMBB->end();
+ } else {
+ LLVM_DEBUG(dbgs() << "migrateInstruction sees branch instr: " << *BranchMI);
+ SpliceEnd = BranchMI;
+ }
+ LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = "
+ << DstMBB->size() << ", srcSize = " << SrcMBB->size()
+ << "\n";);
+
+ // Splice inserts before the insert position I.
+ DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
+
+ LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = "
+ << DstMBB->size() << ", srcSize = " << SrcMBB->size()
+ << '\n';);
+}
+
+MachineBasicBlock *
+R600MachineCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop *LoopRep) {
+ MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+ MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
+
+ if (!LoopHeader || !LoopLatch)
+ return nullptr;
+ MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+ // Is LoopRep an infinite loop?
+ if (!BranchMI || !isUncondBranch(BranchMI))
+ return nullptr;
+
+ MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+ FuncRep->push_back(DummyExitBlk); // Insert into the function.
+ SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+ LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+ LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
+ Ctx.emitError("Extra register needed to handle CFG");
+ return nullptr;
+}
+
+void R600MachineCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+ MachineInstr *BranchMI;
+
+ // We have seen two unconditional branches in one basic block (in the example
+ // test_fc_do_while_or.c); fix this upstream so the loop below can be removed.
+ while ((BranchMI = getLoopendBlockBranchInstr(MBB)) + && isUncondBranch(BranchMI)) { + LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); + BranchMI->eraseFromParent(); + } +} + +void R600MachineCFGStructurizer::removeRedundantConditionalBranch( + MachineBasicBlock *MBB) { + if (MBB->succ_size() != 2) + return; + MachineBasicBlock *MBB1 = *MBB->succ_begin(); + MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); + if (MBB1 != MBB2) + return; + + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + assert(BranchMI && isCondBranch(BranchMI)); + LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); + BranchMI->eraseFromParent(); + SHOWNEWBLK(MBB1, "Removing redundant successor"); + MBB->removeSuccessor(MBB1, true); +} + +void R600MachineCFGStructurizer::addDummyExitBlock( + SmallVectorImpl &RetMBB) { + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + insertInstrEnd(DummyExitBlk, R600::RETURN); + + for (MachineBasicBlock *MBB : RetMBB) { + if (MachineInstr *MI = getReturnInstr(MBB)) + MI->eraseFromParent(); + MBB->addSuccessor(DummyExitBlk); + LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n";); + } + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); +} + +void R600MachineCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { + while (MBB->succ_size()) + MBB->removeSuccessor(*MBB->succ_begin()); +} + +void R600MachineCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, + int SccNum) { + BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; + if (!srcBlkInfo) + srcBlkInfo = new BlockInformation(); + srcBlkInfo->SccNum = SccNum; +} + +void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); + + BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; + + if (!SrcBlkInfo) + SrcBlkInfo = new BlockInformation(); + + SrcBlkInfo->IsRetired = true; + assert(MBB->succ_empty() && MBB->pred_empty() && "can't retire block yet"); +} + +INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) + +FunctionPass *llvm::createR600MachineCFGStructurizerPass() { + return new R600MachineCFGStructurizer(); +} diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index fbe2a1cd9fba..59e274787590 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -207,7 +207,7 @@ public: return !ARDef || !ARUse; } - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // isLegalToPruneDependencies - Is it legal to prune dependency between SUI // and SUJ. 
bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { return false; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp index 20c1ce7266dd..d8f061054904 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp @@ -27,8 +27,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, : R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - FMA(false), CaymanISA(false), CFALUBug(false), HasVertexCache(false), - R600ALUInst(false), FP64(false), TexVTXClauseSize(0), Gen(R600), TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), InstrItins(getInstrItineraryForCPU(GPU)) {} diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 92d559b1f8e6..c3d002f29272 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -31,14 +31,14 @@ class R600Subtarget final : public R600GenSubtargetInfo, private: R600InstrInfo InstrInfo; R600FrameLowering FrameLowering; - bool FMA; - bool CaymanISA; - bool CFALUBug; - bool HasVertexCache; - bool R600ALUInst; - bool FP64; - short TexVTXClauseSize; - Generation Gen; + bool FMA = false; + bool CaymanISA = false; + bool CFALUBug = false; + bool HasVertexCache = false; + bool R600ALUInst = false; + bool FP64 = false; + short TexVTXClauseSize = 0; + Generation Gen = R600; R600TargetLowering TLInfo; InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 39dad45425fc..76bb0f65ef69 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -83,7 +83,7 @@ R600TargetMachine::getSubtargetImpl(const Function &F) const { } TargetTransformInfo -R600TargetMachine::getTargetTransformInfo(const Function &F) { +R600TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(R600TTIImpl(this, F)); } @@ -131,7 +131,7 @@ void R600PassConfig::addPreSched2() { } void R600PassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGStructurizerPass()); + addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h index 0ccbca3c68b1..8d20841292b9 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. 
// //===----------------------------------------------------------------------===// @@ -38,7 +38,7 @@ public: const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool isMachineVerifierClean() const override { return false; } }; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index b81fac36fc95..afd2a38b11ec 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -73,19 +73,19 @@ class SIAnnotateControlFlow : public FunctionPass { bool hasKill(const BasicBlock *BB); - void eraseIfUnused(PHINode *Phi); + bool eraseIfUnused(PHINode *Phi); - void openIf(BranchInst *Term); + bool openIf(BranchInst *Term); - void insertElse(BranchInst *Term); + bool insertElse(BranchInst *Term); Value * handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term); - void handleLoop(BranchInst *Term); + bool handleLoop(BranchInst *Term); - void closeControlFlow(BasicBlock *BB); + bool closeControlFlow(BasicBlock *BB); public: static char ID; @@ -193,31 +193,34 @@ bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) { return false; } -// Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (RecursivelyDeleteDeadPHINode(Phi)) { +// Erase "Phi" if it is not used any more. Return true if any change was made. +bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + bool Changed = RecursivelyDeleteDeadPHINode(Phi); + if (Changed) LLVM_DEBUG(dbgs() << "Erased unused condition phi\n"); - } + return Changed; } /// Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { +bool SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) - return; + return false; Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { +bool SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { - return; + return false; } Value *Ret = CallInst::Create(Else, popSaved(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Recursively handle the condition leading to a loop @@ -255,14 +258,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition( } /// Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { +bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { if (isUniform(Term)) - return; + return false; BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); if (!L) - return; + return false; BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); @@ -286,10 +289,12 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); + + return true; } /// Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { +bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { 
llvm::Loop *L = LI->getLoopFor(BB); assert(Stack.back().first == BB); @@ -322,6 +327,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { } CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } + + return true; } /// Annotate the control flow with intrinsics so the backend can @@ -333,6 +340,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { TargetPassConfig &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); + bool Changed = false; initialize(*F.getParent(), TM.getSubtarget(F)); for (df_iterator I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { @@ -341,32 +349,32 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); if (DT->dominates(Term->getSuccessor(1), BB)) - handleLoop(Term); + Changed |= handleLoop(Term); continue; } if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast(Term->getCondition()); if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) { - insertElse(Term); - eraseIfUnused(Phi); + Changed |= insertElse(Term); + Changed |= eraseIfUnused(Phi); continue; } - closeControlFlow(BB); + Changed |= closeControlFlow(BB); } - openIf(Term); + Changed |= openIf(Term); } if (!Stack.empty()) { @@ -374,7 +382,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { report_fatal_error("failed to annotate CFG"); } - return true; + return Changed; } /// Create the annotation pass diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 107ee5ed5532..85930312352b 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -63,6 +63,12 @@ enum : uint64_t { VGPRSpill = 1 << 24, SGPRSpill = 1 << 25, + // LDSDIR instruction format. + LDSDIR = 1 << 26, + + // VINTERP instruction format. + VINTERP = 1 << 27, + // High bits - other information. VM_CNT = UINT64_C(1) << 32, EXP_CNT = UINT64_C(1) << 33, @@ -120,7 +126,10 @@ enum : uint64_t { IsAtomicNoRet = UINT64_C(1) << 57, // Atomic with return. - IsAtomicRet = UINT64_C(1) << 58 + IsAtomicRet = UINT64_C(1) << 58, + + // Is a WMMA instruction. + IsWMMA = UINT64_C(1) << 59, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -258,9 +267,10 @@ namespace AMDGPUAsmVariants { VOP3 = 1, SDWA = 2, SDWA9 = 3, - DPP = 4 + DPP = 4, + VOP3_DPP = 5 }; -} +} // namespace AMDGPUAsmVariants namespace AMDGPU { namespace EncValues { // Encoding values of enum9/8/7 operands @@ -280,7 +290,8 @@ enum : unsigned { INLINE_FLOATING_C_MAX = 248, LITERAL_CONST = 255, VGPR_MIN = 256, - VGPR_MAX = 511 + VGPR_MAX = 511, + IS_VGPR = 256 // Indicates VGPR or AGPR }; } // namespace EncValues @@ -294,6 +305,9 @@ enum CPol { SLC = 2, DLC = 4, SCC = 16, + SC0 = GLC, + SC1 = SCC, + NT = SLC, ALL = GLC | SLC | DLC | SCC }; @@ -302,24 +316,33 @@ enum CPol { namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. enum Id { // Message ID, width(4) [3:0]. 
- ID_UNKNOWN_ = -1, ID_INTERRUPT = 1, - ID_GS = 2, - ID_GS_DONE = 3, - ID_SAVEWAVE = 4, // added in GFX8 + + ID_GS_PreGFX11 = 2, // replaced in GFX11 + ID_GS_DONE_PreGFX11 = 3, // replaced in GFX11 + + ID_HS_TESSFACTOR_GFX11Plus = 2, // reused in GFX11 + ID_DEALLOC_VGPRS_GFX11Plus = 3, // reused in GFX11 + + ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11 ID_STALL_WAVE_GEN = 5, // added in GFX9 ID_HALT_WAVES = 6, // added in GFX9 ID_ORDERED_PS_DONE = 7, // added in GFX9 ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 - ID_GET_DOORBELL = 10, // added in GFX9 - ID_GET_DDID = 11, // added in GFX10 + ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, - ID_GAPS_LAST_, // Indicate that sequence has gaps. - ID_GAPS_FIRST_ = ID_INTERRUPT, - ID_SHIFT_ = 0, - ID_WIDTH_ = 4, - ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) + + ID_RTN_GET_DOORBELL = 128, + ID_RTN_GET_DDID = 129, + ID_RTN_GET_TMA = 130, + ID_RTN_GET_REALTIME = 131, + ID_RTN_SAVE_WAVE = 132, + ID_RTN_GET_TBA = 133, + + ID_MASK_PreGFX11_ = 0xF, + ID_MASK_GFX11Plus_ = 0xFF }; enum Op { // Both GS and SYS operation IDs. @@ -360,8 +383,6 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8]. namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. enum Id { // HwRegCode, (6) [5:0] - ID_UNKNOWN_ = -1, - ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined. ID_MODE = 1, ID_STATUS = 2, ID_TRAPSTS = 3, @@ -370,12 +391,15 @@ enum Id { // HwRegCode, (6) [5:0] ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_MEM_BASES = 15, - ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES, ID_TBA_LO = 16, - ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO, ID_TBA_HI = 17, ID_TMA_LO = 18, ID_TMA_HI = 19, + ID_XCC_ID = 20, + ID_SQ_PERF_SNAPSHOT_DATA = 21, + ID_SQ_PERF_SNAPSHOT_DATA1 = 22, + ID_SQ_PERF_SNAPSHOT_PC_LO = 23, + ID_SQ_PERF_SNAPSHOT_PC_HI = 24, ID_FLAT_SCR_LO = 20, ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, @@ -383,8 +407,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_SHADER_CYCLES = 29, - ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, - ID_SYMBOLIC_LAST_ = 30, + ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -503,6 +526,15 @@ enum MergedFormat : int64_t { DFMT_NFMT_MAX = DFMT_NFMT_MASK }; +enum UnifiedFormatCommon : int64_t { + UFMT_MAX = 127, + UFMT_UNDEF = -1, + UFMT_DEFAULT = 1 +}; + +} // namespace MTBUFFormat + +namespace UfmtGFX10 { enum UnifiedFormat : int64_t { UFMT_INVALID = 0, @@ -598,14 +630,95 @@ enum UnifiedFormat : int64_t { UFMT_FIRST = UFMT_INVALID, UFMT_LAST = UFMT_32_32_32_32_FLOAT, +}; - UFMT_MAX = 127, +} // namespace UfmtGFX10 - UFMT_UNDEF = -1, - UFMT_DEFAULT = UFMT_8_UNORM +namespace UfmtGFX11 { +enum UnifiedFormat : int64_t { + UFMT_INVALID = 0, + + UFMT_8_UNORM, + UFMT_8_SNORM, + UFMT_8_USCALED, + UFMT_8_SSCALED, + UFMT_8_UINT, + UFMT_8_SINT, + + UFMT_16_UNORM, + UFMT_16_SNORM, + UFMT_16_USCALED, + UFMT_16_SSCALED, + UFMT_16_UINT, + UFMT_16_SINT, + UFMT_16_FLOAT, + + UFMT_8_8_UNORM, + UFMT_8_8_SNORM, + UFMT_8_8_USCALED, + UFMT_8_8_SSCALED, + UFMT_8_8_UINT, + UFMT_8_8_SINT, + + UFMT_32_UINT, + UFMT_32_SINT, + UFMT_32_FLOAT, + + UFMT_16_16_UNORM, + UFMT_16_16_SNORM, + UFMT_16_16_USCALED, + UFMT_16_16_SSCALED, + UFMT_16_16_UINT, + UFMT_16_16_SINT, + UFMT_16_16_FLOAT, + + UFMT_10_11_11_FLOAT, + + UFMT_11_11_10_FLOAT, + + UFMT_10_10_10_2_UNORM, + UFMT_10_10_10_2_SNORM, + UFMT_10_10_10_2_UINT, + UFMT_10_10_10_2_SINT, + + 
UFMT_2_10_10_10_UNORM, + UFMT_2_10_10_10_SNORM, + UFMT_2_10_10_10_USCALED, + UFMT_2_10_10_10_SSCALED, + UFMT_2_10_10_10_UINT, + UFMT_2_10_10_10_SINT, + + UFMT_8_8_8_8_UNORM, + UFMT_8_8_8_8_SNORM, + UFMT_8_8_8_8_USCALED, + UFMT_8_8_8_8_SSCALED, + UFMT_8_8_8_8_UINT, + UFMT_8_8_8_8_SINT, + + UFMT_32_32_UINT, + UFMT_32_32_SINT, + UFMT_32_32_FLOAT, + + UFMT_16_16_16_16_UNORM, + UFMT_16_16_16_16_SNORM, + UFMT_16_16_16_16_USCALED, + UFMT_16_16_16_16_SSCALED, + UFMT_16_16_16_16_UINT, + UFMT_16_16_16_16_SINT, + UFMT_16_16_16_16_FLOAT, + + UFMT_32_32_32_UINT, + UFMT_32_32_32_SINT, + UFMT_32_32_32_FLOAT, + UFMT_32_32_32_32_UINT, + UFMT_32_32_32_32_SINT, + UFMT_32_32_32_32_FLOAT, + + UFMT_FIRST = UFMT_INVALID, + UFMT_LAST = UFMT_32_32_32_32_FLOAT, }; -} // namespace MTBUFFormat +} // namespace UfmtGFX11 namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. @@ -746,20 +859,23 @@ enum Target : unsigned { ET_MRT0 = 0, ET_MRT7 = 7, ET_MRTZ = 8, - ET_NULL = 9, + ET_NULL = 9, // Pre-GFX11 ET_POS0 = 12, ET_POS3 = 15, - ET_POS4 = 16, // GFX10+ - ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget - ET_PRIM = 20, // GFX10+ - ET_PARAM0 = 32, - ET_PARAM31 = 63, + ET_POS4 = 16, // GFX10+ + ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget + ET_PRIM = 20, // GFX10+ + ET_DUAL_SRC_BLEND0 = 21, // GFX11+ + ET_DUAL_SRC_BLEND1 = 22, // GFX11+ + ET_PARAM0 = 32, // Pre-GFX11 + ET_PARAM31 = 63, // Pre-GFX11 ET_NULL_MAX_IDX = 0, ET_MRTZ_MAX_IDX = 0, ET_PRIM_MAX_IDX = 0, ET_MRT_MAX_IDX = 7, ET_POS_MAX_IDX = 4, + ET_DUAL_SRC_BLEND_MAX_IDX = 1, ET_PARAM_MAX_IDX = 31, ET_INVALID = 255, @@ -777,6 +893,18 @@ enum OpSel : uint64_t { } // namespace VOP3PEncoding +namespace ImplicitArg { +// Implicit kernel argument offset for code object version 5. +enum Offset_COV5 : unsigned { + HOSTCALL_PTR_OFFSET = 80, + MULTIGRID_SYNC_ARG_OFFSET = 88, + HEAP_PTR_OFFSET = 96, + PRIVATE_BASE_OFFSET = 192, + SHARED_BASE_OFFSET = 196, + QUEUE_PTR_OFFSET = 200, +}; + +} // namespace ImplicitArg } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 @@ -911,10 +1039,12 @@ enum OpSel : uint64_t { #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 33954e11d6c6..99aa8a60b04f 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -92,7 +92,7 @@ public: bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; - void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; const MachineOperand *isClamp(const MachineInstr &MI) const; bool tryFoldClamp(MachineInstr &MI); @@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::INSTRUCTION_LIST_END; } -// Wrapper around isInlineConstant that understands special cases when -// instruction types are replaced during operand folding. 
-static bool isInlineConstantIfFolded(const SIInstrInfo *TII, - const MachineInstr &UseMI, - unsigned OpNo, - const MachineOperand &OpToFold) { - if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) - return true; - - unsigned Opc = UseMI.getOpcode(); - unsigned NewOpc = macToMad(Opc); - if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { - // Special case for mac. Since this is replaced with mad when folded into - // src2, we need to check the legality for the final instruction. - int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (static_cast(OpNo) == Src2Idx) { - const MCInstrDesc &MadDesc = TII->get(NewOpc); - return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); - } - } - - return false; -} - // TODO: Add heuristic that the frame index might not fit in the addressing mode // immediate offset to avoid materializing in loops. static bool frameIndexMayFold(const SIInstrInfo *TII, @@ -210,6 +186,8 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + (!ST.hasDOTOpSelHazard() || + !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) && AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is @@ -289,7 +267,7 @@ static bool updateOperand(FoldCandidate &Fold, // when looking at a use. Dst0.setReg(NewReg0); for (unsigned I = MI->getNumOperands() - 1; I > 0; --I) - MI->RemoveOperand(I); + MI->removeOperand(I); MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); if (Fold.isCommuted()) @@ -490,6 +468,8 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_MOV_B64_e64: // Do not fold into an indirect mov. 
return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0); } @@ -675,7 +655,9 @@ void SIFoldOperands::foldOperand( if (TII->isFLATScratch(*UseMI) && AMDGPU::getNamedOperandIdx(UseMI->getOpcode(), - AMDGPU::OpName::vaddr) != -1) { + AMDGPU::OpName::vaddr) != -1 && + AMDGPU::getNamedOperandIdx(UseMI->getOpcode(), + AMDGPU::OpName::saddr) == -1) { unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode()); UseMI->setDesc(TII->get(NewOpc)); } @@ -739,7 +721,7 @@ void SIFoldOperands::foldOperand( while (ImpOpI != ImpOpE) { MachineInstr::mop_iterator Tmp = ImpOpI; ImpOpI++; - UseMI->RemoveOperand(UseMI->getOperandNo(Tmp)); + UseMI->removeOperand(UseMI->getOperandNo(Tmp)); } CopiesToReplace.push_back(UseMI); } else { @@ -768,7 +750,7 @@ void SIFoldOperands::foldOperand( UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE)); for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I) - UseMI->RemoveOperand(I); + UseMI->removeOperand(I); MachineInstrBuilder B(*MBB.getParent(), UseMI); DenseMap VGPRCopies; @@ -871,7 +853,7 @@ void SIFoldOperands::foldOperand( UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); else UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex()); - UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) return; } @@ -890,7 +872,7 @@ void SIFoldOperands::foldOperand( UseMI->getOperand(1).setReg(OpToFold.getReg()); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); - UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) return; } } @@ -906,6 +888,22 @@ void SIFoldOperands::foldOperand( } if (!FoldingImmLike) { + if (OpToFold.isReg() && ST->needsAlignedVGPRs()) { + // Don't fold if OpToFold doesn't hold an aligned register. + const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, OpToFold.getReg()); + if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) { + unsigned SubReg = OpToFold.getSubReg(); + const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); + RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg); + if (RC) + RC = SubRC; + } + + if (!RC || !TRI->isProperlyAlignedRC(*RC)) + return; + } + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); // FIXME: We could try to change the instruction from 64-bit to 32-bit @@ -1025,7 +1023,7 @@ static void stripExtraCopyOperands(MachineInstr &MI) { Desc.getNumImplicitDefs(); for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.RemoveOperand(I); + MI.removeOperand(I); } static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { @@ -1093,7 +1091,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, // Be careful to change the right operand, src0 may belong to a different // instruction. 
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1112,11 +1110,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1129,11 +1127,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 - MI->RemoveOperand(Src0Idx); + MI->removeOperand(Src0Idx); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); stripExtraCopyOperands(*MI); } else @@ -1147,7 +1145,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); return true; } @@ -1185,12 +1183,12 @@ bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const { TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false)); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) - MI.RemoveOperand(Src2Idx); - MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); + MI.removeOperand(Src2Idx); + MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); if (Src1ModIdx != -1) - MI.RemoveOperand(Src1ModIdx); + MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) - MI.RemoveOperand(Src0ModIdx); + MI.removeOperand(Src0ModIdx); mutateCopyOp(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; @@ -1217,7 +1215,7 @@ bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { return false; } -void SIFoldOperands::foldInstOperand(MachineInstr &MI, +bool SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer @@ -1225,6 +1223,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector CopiesToReplace; SmallVector FoldList; MachineOperand &Dst = MI.getOperand(0); + bool Changed = false; if (OpToFold.isImm()) { for (auto &UseMI : @@ -1237,66 +1236,25 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // We may also encounter cases where one or both operands are // immediates materialized into a register, which would ordinarily not // be folded due to multiple uses or operand constraints. 
- if (tryConstantFoldOp(*MRI, TII, &UseMI)) + if (tryConstantFoldOp(*MRI, TII, &UseMI)) { LLVM_DEBUG(dbgs() << "Constant folded " << UseMI); - } - } - - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - for (auto &Use : - make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } + Changed = true; } } + } - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); - } - } else { - // Folding register. - SmallVector UsesToProcess; - for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) - UsesToProcess.push_back(&Use); - for (auto U : UsesToProcess) { - MachineInstr *UseMI = U->getParent(); - - foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), - FoldList, CopiesToReplace); - } + SmallVector UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, + CopiesToReplace); } + if (CopiesToReplace.empty() && FoldList.empty()) + return Changed; + MachineFunction *MF = MI.getParent()->getParent(); // Make sure we add EXEC uses to any new v_mov instructions created. 
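As a side note, "adding EXEC uses" here means attaching an implicit use of the wave execution mask to each rewritten copy. The pass presumably does this through MachineInstr::addImplicitDefUseOperands; spelled out by hand, attaching a single implicit use would look roughly like this sketch (ExecReg stands in for AMDGPU::EXEC):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

// Attach an implicit (non-defining) use of a physical register, e.g. the
// exec mask, to an instruction.
static void addImplicitExecUse(llvm::MachineInstr &MI,
                               llvm::MCRegister ExecReg) {
  MI.addOperand(llvm::MachineOperand::CreateReg(ExecReg, /*isDef=*/false,
                                                /*isImp=*/true));
}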
for (MachineInstr *Copy : CopiesToReplace) @@ -1328,6 +1286,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, TII->commuteInstruction(*Fold.UseMI, false); } } + return true; } // Clamp patterns are canonically selected to v_max_* instructions, so only @@ -1593,8 +1552,9 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - if (!TRI->isVectorSuperClass( - TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass))) + const TargetRegisterClass *OpRC = + TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF()); + if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)); @@ -1751,22 +1711,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); + bool Changed = false; for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineOperand *CurrentKnownM0Val = nullptr; for (auto &MI : make_early_inc_range(*MBB)) { - tryFoldCndMask(MI); + Changed |= tryFoldCndMask(MI); - if (tryFoldZeroHighBits(MI)) + if (tryFoldZeroHighBits(MI)) { + Changed = true; continue; + } - if (MI.isRegSequence() && tryFoldRegSequence(MI)) + if (MI.isRegSequence() && tryFoldRegSequence(MI)) { + Changed = true; continue; + } - if (MI.isPHI() && tryFoldLCSSAPhi(MI)) + if (MI.isPHI() && tryFoldLCSSAPhi(MI)) { + Changed = true; continue; + } - if (MI.mayLoad() && tryFoldLoad(MI)) + if (MI.mayLoad() && tryFoldLoad(MI)) { + Changed = true; continue; + } if (!TII->isFoldableCopy(MI)) { // Saw an unknown clobber of m0, so we no longer know what it is. @@ -1777,7 +1746,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // instruction, and not the omod multiply. if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) - tryFoldClamp(MI); + Changed |= tryFoldClamp(MI); continue; } @@ -1788,6 +1757,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &NewM0Val = MI.getOperand(1); if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { MI.eraseFromParent(); + Changed = true; continue; } @@ -1817,7 +1787,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!MI.getOperand(0).getReg().isVirtual()) continue; - foldInstOperand(MI, OpToFold); + Changed |= foldInstOperand(MI, OpToFold); // If we managed to fold all uses of this copy then we might as well // delete it now. @@ -1829,6 +1799,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? 
SrcOp.getReg() : Register(); InstToErase->eraseFromParent(); + Changed = true; InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1837,9 +1808,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { break; } if (InstToErase && InstToErase->isRegSequence() && - MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) + MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { InstToErase->eraseFromParent(); + Changed = true; + } } } - return true; + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 80ee7a00252a..d7ca7f36284b 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -241,7 +241,7 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, } // Check register def/use conflicts, occupancy limits and collect def/use maps. -// Return true if instruction can be bundled with previous. It it cannot +// Return true if instruction can be bundled with previous. If it cannot // def/use maps are not updated. bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6078f4a0577a..a57e81eb4e4a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -749,7 +749,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, return; } - const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -789,19 +789,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, *Reg.FI); } - // VGPRs used for Whole Wave Mode - for (const auto &Reg : FuncInfo->WWMReservedRegs) { - auto VGPR = Reg.first; - auto FI = Reg.second; - if (!FI) - continue; - + for (auto ReservedWWM : FuncInfo->wwmAllocation()) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, - *FI); + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + std::get<0>(ReservedWWM), std::get<1>(ReservedWWM)); } if (ScratchExecCopy) { @@ -813,27 +807,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, LiveRegs.addReg(ScratchExecCopy); } - if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { - const int FramePtrFI = *FPSaveIndex; - assert(!MFI.isDeadObjectIndex(FramePtrFI)); - - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(FramePtrReg); - - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - FramePtrFI); - } - - if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { - const int BasePtrFI = *BPSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); + auto SaveSGPRToMemory = [&](Register Reg, const int FI) { + assert(!MFI.isDeadObjectIndex(FI)); initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); @@ -843,44 +818,31 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, report_fatal_error("failed to find free scratch 
register"); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(BasePtrReg); + .addReg(Reg); buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - BasePtrFI); - } + FI); + }; - // In this case, spill the FP to a reserved VGPR. - if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { - const int FramePtrFI = *FPSaveIndex; - assert(!MFI.isDeadObjectIndex(FramePtrFI)); + auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) { + assert(!MFI.isDeadObjectIndex(FI)); - assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FramePtrFI); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill = + FuncInfo->getSGPRToVGPRSpills(FI); assert(Spill.size() == 1); - // Save FP before setting it up. BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(FramePtrReg) + .addReg(Reg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); - } + }; - // In this case, spill the BP to a reserved VGPR. - if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { - const int BasePtrFI = *BPSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); - - assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(BasePtrFI); - assert(Spill.size() == 1); - - // Save BP before setting it up. - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(BasePtrReg) - .addImm(Spill[0].Lane) - .addReg(Spill[0].VGPR, RegState::Undef); + if (FPSaveIndex) { + if (spilledToMemory(MF, *FPSaveIndex)) + SaveSGPRToMemory(FramePtrReg, *FPSaveIndex); + else + SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex); } // Emit the copy if we need an FP, and are using a free SGPR to save it. @@ -891,6 +853,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + if (BPSaveIndex) { + if (spilledToMemory(MF, *BPSaveIndex)) + SaveSGPRToMemory(BasePtrReg, *BPSaveIndex); + else + SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex); + } + // Emit the copy if we need a BP, and are using a free SGPR to save it. 
if (FuncInfo->SGPRForBPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), @@ -1034,56 +1003,44 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) { + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, + FI); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .addReg(TmpVGPR, RegState::Kill); + }; + + auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) { + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill = + FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + }; + if (FPSaveIndex) { const int FramePtrFI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FramePtrFI)); - if (spilledToMemory(MF, FramePtrFI)) { - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - TmpVGPR, FramePtrFI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) - .addReg(TmpVGPR, RegState::Kill); - } else { - // Reload from VGPR spill. - assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FramePtrFI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); - } + if (spilledToMemory(MF, FramePtrFI)) + RestoreSGPRFromMemory(FramePtrReg, FramePtrFI); + else + RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI); } if (BPSaveIndex) { const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (spilledToMemory(MF, BasePtrFI)) { - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - TmpVGPR, BasePtrFI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) - .addReg(TmpVGPR, RegState::Kill); - } else { - // Reload from VGPR spill. 
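// The prologue and epilogue hunks above fold four near-identical FP/BP spill
// and reload sequences into SaveSGPRTo*/RestoreSGPRFrom* lambdas keyed only
// on (Reg, FI). A minimal standalone sketch of the same deduplication
// pattern; the types below are illustrative stand-ins, not the real
// Register/MachineFunction machinery:

#include <cstdio>

using Reg = unsigned; // stand-in for a physical SGPR number
using FrameIdx = int; // stand-in for a frame index

static void saveToMemory(Reg R, FrameIdx FI) {
  // Models: copy the SGPR through a scratch VGPR, then store to the slot.
  std::printf("save s%u to stack slot %d via scratch VGPR\n", R, FI);
}

static void saveToVGPRLane(Reg R, FrameIdx FI) {
  // Models: v_writelane into the lane reserved for this slot.
  std::printf("save s%u to a reserved VGPR lane for slot %d\n", R, FI);
}

static void saveSGPR(Reg R, FrameIdx FI, bool SpilledToMemory) {
  // One decision point replaces two copies of the same if/else body.
  if (SpilledToMemory)
    saveToMemory(R, FI);
  else
    saveToVGPRLane(R, FI);
}

int main() {
  saveSGPR(/*FramePtr=*/33, /*FI=*/0, /*SpilledToMemory=*/true);
  saveSGPR(/*BasePtr=*/34, /*FI=*/1, /*SpilledToMemory=*/false);
}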
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); - ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(BasePtrFI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); - } + if (spilledToMemory(MF, BasePtrFI)) + RestoreSGPRFromMemory(BasePtrReg, BasePtrFI); + else + RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI); } Register ScratchExecCopy; @@ -1100,18 +1057,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } - for (const auto &Reg : FuncInfo->WWMReservedRegs) { - auto VGPR = Reg.first; - auto FI = Reg.second; - if (!FI) - continue; - + for (auto ReservedWWM : FuncInfo->wwmAllocation()) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, - *FI); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + std::get<0>(ReservedWWM), std::get<1>(ReservedWWM)); } if (ScratchExecCopy) { @@ -1161,6 +1113,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + if (!FuncInfo->isEntryFunction()) { + // Spill VGPRs used for Whole Wave Mode + FuncInfo->allocateWWMReservedSpillSlots(MFI, *TRI); + } + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() && EnableSpillVGPRToAGPR; @@ -1200,7 +1157,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } - // Stack slot coloring may assign different objets to the same stack slot. + // Stack slot coloring may assign different objects to the same stack slot. // If not, then the VGPR to AGPR spill slot is dead. for (unsigned FI : SpillFIs.set_bits()) if (!NonVGPRSpillFIs.test(FI)) @@ -1229,7 +1186,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } - FuncInfo->removeDeadFrameIndices(MFI); + // At this point we've already allocated all spilled SGPRs to VGPRs if we + // can. Any remaining SGPR spills will go to memory, so move them back to the + // default stack. + bool HaveSGPRToVMemSpill = + FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); @@ -1241,6 +1202,39 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // Add an emergency spill slot RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); + + // If we are spilling SGPRs to memory with a large frame, we may need a + // second VGPR emergency frame index. + if (HaveSGPRToVMemSpill && + allocateScavengingFrameIndexesNearIncomingSP(MF)) { + RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false)); + } + } +} + +void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + + if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + // On gfx908, we had initially reserved the highest available VGPR for AGPR + // copy. Now since we are done with RA, check if there exists an unused VGPR + // which is lower than the earlier reserved VGPR before RA. If one exists, + // use it for AGPR copy instead of one reserved before RA.
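// The standalone sketch below models the selection the code that follows
// performs in the new processFunctionBeforeFrameIndicesReplaced() hook: if
// register allocation left a VGPR with a lower hardware index unused, prefer
// it over the pessimistically reserved highest VGPR. Plain indices stand in
// for registers; this is an illustration, not the LLVM API.

#include <cstdio>
#include <vector>

unsigned pickAGPRCopyVGPR(unsigned ReservedIdx,
                          const std::vector<bool> &UsedAfterRA) {
  for (unsigned Idx = 0; Idx < UsedAfterRA.size(); ++Idx)
    if (!UsedAfterRA[Idx] && Idx < ReservedIdx)
      return Idx; // a lower unused VGPR frees the high one for allocation
  return ReservedIdx; // otherwise keep the pre-RA reservation
}

int main() {
  std::vector<bool> Used = {true, true, false, true}; // v2 ended up unused
  // Pre-RA, the highest register (modeled here as index 255) was reserved.
  std::printf("use v%u for AGPR copies\n", pickAGPRCopyVGPR(255, Used));
}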
+ Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy(); + Register UnusedLowVGPR = + TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) < + TRI->getHWRegIndex(VGPRForAGPRCopy))) { + // The call to setVGPRForAGPRCopy() should happen before calling + // freezeReservedRegs() so that getReservedRegs() can reserve this newly + // identified VGPR (for AGPR copy). + FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); + MRI.freezeReservedRegs(MF); + } } @@ -1333,6 +1327,20 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, // FP will be specially managed like SP. if (WillHaveFP || hasFP(MF)) SavedRegs.reset(MFI->getFrameOffsetReg()); + + // Return address use with the return instruction is hidden through the SI_RETURN + // pseudo. Given that, and since IPRA computes actual register usage and + // does not use the CSR list, the clobbering of the return address by function calls + // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register + // usage collection. This ensures that save/restore of the return address happens + // in those scenarios. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Register RetAddrReg = TRI->getReturnAddressReg(MF); + if (!MFI->isEntryFunction() && + (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) { + SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0)); + SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1)); + } } bool SIFrameLowering::assignCalleeSavedSpillSlots( diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 7949dcfa6632..79154d494e91 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -47,6 +47,9 @@ public: MachineFunction &MF, RegScavenger *RS = nullptr) const override; + void processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS = nullptr) const override; + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2f4a0896bc3..094d5cd58673 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -25,6 +26,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -136,6 +138,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -151,27 +155,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrOneBooleanContent); // We need to custom lower vector stores from local memory -
setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v3i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::LOAD, MVT::v5i32, Custom); - setOperationAction(ISD::LOAD, MVT::v6i32, Custom); - setOperationAction(ISD::LOAD, MVT::v7i32, Custom); - setOperationAction(ISD::LOAD, MVT::v8i32, Custom); - setOperationAction(ISD::LOAD, MVT::v16i32, Custom); - setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::LOAD, MVT::v32i32, Custom); - - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v3i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setOperationAction(ISD::STORE, MVT::v5i32, Custom); - setOperationAction(ISD::STORE, MVT::v6i32, Custom); - setOperationAction(ISD::STORE, MVT::v7i32, Custom); - setOperationAction(ISD::STORE, MVT::v8i32, Custom); - setOperationAction(ISD::STORE, MVT::v16i32, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v32i32, Custom); + setOperationAction(ISD::LOAD, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, + MVT::v32i32}, + Custom); + + setOperationAction(ISD::STORE, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, + MVT::v32i32}, + Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); @@ -198,81 +192,57 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, + {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); setOperationAction(ISD::SETCC, MVT::i1, Promote); - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand); AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); - 
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::TRUNCATE, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32}, + Expand); + setOperationAction(ISD::FP_ROUND, + {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, + MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32}, + Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, + {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16, + MVT::v3i16, MVT::v4i16, MVT::Other}, + Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i64, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::BR_CC, + {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); - setOperationAction(ISD::UADDO, MVT::i32, Legal); - setOperationAction(ISD::USUBO, MVT::i32, Legal); + setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); - setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); - setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal); - setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); + setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64, + Expand); #if 0 - setOperationAction(ISD::ADDCARRY, MVT::i64, Legal); - setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); + setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal); #endif // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
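// Most hunks in this file switch from one setOperationAction() call per
// (opcode, type) pair to overloads that take whole lists of opcodes and/or
// types. A minimal sketch of how such batching overloads can be layered over
// a single-entry API; the class and enum here are illustrative, not LLVM's
// actual TargetLoweringBase:

#include <initializer_list>
#include <map>
#include <utility>

enum class Action { Legal, Custom, Expand, Promote };

class LoweringTable {
  std::map<std::pair<unsigned, int>, Action> Table; // (opcode, VT) -> action

public:
  // Classic single-entry form.
  void setOperationAction(unsigned Op, int VT, Action A) {
    Table[{Op, VT}] = A;
  }
  // Batched form: apply the action to the whole cross product, so
  // setOperationAction({LOAD, STORE}, {V2I32, V4I32}, Custom) replaces
  // four separate calls.
  void setOperationAction(std::initializer_list<unsigned> Ops,
                          std::initializer_list<int> VTs, Action A) {
    for (unsigned Op : Ops)
      for (int VT : VTs)
        setOperationAction(Op, VT, A);
  }
};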
- for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, - MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32 }) { + for (MVT VT : + {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, + MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, + MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, + MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -372,94 +342,63 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); } - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, + {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, + Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom); // Avoid stack access for these. // TODO: Generalize to more vector types. - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, + {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, + Custom); // Deal with vec3 vector operations when widened to vec4. - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, + {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom); // Deal with vec5/6/7 vector operations when widened to vec8. 
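// "Widened to vec4" / "widened to vec8" above follows the usual legalization
// rule of rounding an odd element count up to the next power of two. A tiny
// model of that computation, detached from the MVT machinery:

#include <cstdio>

unsigned widenedNumElts(unsigned N) {
  unsigned W = 1;
  while (W < N)
    W *= 2;
  return W; // v3 -> v4, v5/v6/v7 -> v8
}

int main() {
  for (unsigned N : {3u, 5u, 6u, 7u})
    std::printf("v%u widens to v%u\n", N, widenedNumElts(N));
}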
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, + {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom); // We can't return success/failure, only the old value, // let LLVM add the comparison - setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64}, + Expand); - if (Subtarget->hasFlatAddressSpace()) { - setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); - setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); - } + if (Subtarget->hasFlatAddressSpace()) + setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. - setOperationAction(ISD::BSWAP, MVT::i64, Legal); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal); // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Custom); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); + setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom); if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Custom); - setOperationAction(ISD::FEXP, MVT::f16, Custom); - setOperationAction(ISD::FLOG10, MVT::f16, Custom); + setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); + setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); } if (Subtarget->hasMadMacF32Insts()) setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) { + if (!Subtarget->hasBFI()) // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } + setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); if (!Subtarget->hasBCNT(32)) setOperationAction(ISD::CTPOP, MVT::i32, Expand); @@ -467,15 +406,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (!Subtarget->hasBCNT(64)) setOperationAction(ISD::CTPOP, MVT::i64, Expand); - if (Subtarget->hasFFBH()) { - setOperationAction(ISD::CTLZ, MVT::i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - } + if (Subtarget->hasFFBH()) + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); - if (Subtarget->hasFFBL()) { - setOperationAction(ISD::CTTZ, MVT::i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); - } + if (Subtarget->hasFFBL()) + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). // @@ -489,84 +424,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setHasExtractBitsInsn(true); // Clamp modifier on add/sub - if (Subtarget->hasIntClamp()) { - setOperationAction(ISD::UADDSAT, MVT::i32, Legal); - setOperationAction(ISD::USUBSAT, MVT::i32, Legal); - } + if (Subtarget->hasIntClamp()) + setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); - if (Subtarget->hasAddNoCarry()) { - setOperationAction(ISD::SADDSAT, MVT::i16, Legal); - setOperationAction(ISD::SSUBSAT, MVT::i16, Legal); - setOperationAction(ISD::SADDSAT, MVT::i32, Legal); - setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); - } - - setOperationAction(ISD::FMINNUM, MVT::f32, Custom); - setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); - setOperationAction(ISD::FMINNUM, MVT::f64, Custom); - setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); + if (Subtarget->hasAddNoCarry()) + setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, + Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64}, + Custom); // These are really only legal for ieee_mode functions. We should be avoiding // them for functions that don't have ieee_mode enabled, so just say they are // legal. 
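// The "really only legal for ieee_mode functions" remark above is about NaN
// handling: IEEE-754-2008 minNum/maxNum return the non-NaN operand when
// exactly one input is NaN, which is the contract the FMINNUM_IEEE and
// FMAXNUM_IEEE nodes assume the hardware honors. A scalar sketch of that
// contract:

#include <cmath>
#include <cstdio>

float minNumIEEE(float A, float B) {
  if (std::isnan(A))
    return B; // a single NaN input is dropped, not propagated
  if (std::isnan(B))
    return A;
  return A < B ? A : B;
}

int main() {
  std::printf("%g\n", minNumIEEE(NAN, 2.0f)); // prints 2, not nan
}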
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); - + setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, + {MVT::f32, MVT::f64}, Legal); - if (Subtarget->haveRoundOpsF64()) { - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - } else { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } + if (Subtarget->haveRoundOpsF64()) + setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal); + else + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::Constant, MVT::i16, Legal); - - setOperationAction(ISD::SMIN, MVT::i16, Legal); - setOperationAction(ISD::SMAX, MVT::i16, Legal); - - setOperationAction(ISD::UMIN, MVT::i16, Legal); - setOperationAction(ISD::UMAX, MVT::i16, Legal); + setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, + ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, + MVT::i16, Legal); - setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); - setOperationAction(ISD::ROTR, MVT::i16, Expand); - setOperationAction(ISD::ROTL, MVT::i16, Expand); - - setOperationAction(ISD::SDIV, MVT::i16, Promote); - setOperationAction(ISD::UDIV, MVT::i16, Promote); - setOperationAction(ISD::SREM, MVT::i16, Promote); - setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::UADDSAT, MVT::i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::i16, Legal); - - setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); - - setOperationAction(ISD::CTTZ, MVT::i16, Promote); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); - setOperationAction(ISD::CTLZ, MVT::i16, Promote); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); - setOperationAction(ISD::CTPOP, MVT::i16, Promote); + setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC}, + MVT::i16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); - - setOperationAction(ISD::BR_CC, MVT::i16, Expand); + setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, + ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, + ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, + ISD::CTPOP}, + MVT::i16, Promote); setOperationAction(ISD::LOAD, MVT::i16, Custom); @@ -577,8 +476,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); // F16 - Constant Actions. 
setOperationAction(ISD::ConstantFP, MVT::f16, Legal); @@ -590,22 +488,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); // F16 - VOP1 Actions. - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCOS, MVT::f16, Custom); - setOperationAction(ISD::FSIN, MVT::f16, Custom); + setOperationAction( + {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, + MVT::f16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Custom); + setOperationAction( + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + MVT::f16, Promote); // F16 - VOP2 Actions. - setOperationAction(ISD::BR_CC, MVT::f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); setOperationAction(ISD::FDIV, MVT::f16, Custom); @@ -615,7 +509,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16}) { + MVT::v8f16, MVT::v16i16, MVT::v16f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -639,16 +533,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } // v_perm_b32 can handle either of these. - setOperationAction(ISD::BSWAP, MVT::i16, Legal); - setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal); setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); // XXX - Do these do anything? Vector constants turn into build_vector. 
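// "v_perm_b32 can handle either of these" refers to the BSWAP lines above:
// one byte-permute pattern covers both a scalar i16 swap and a packed v2i16
// swap. A toy model of a byte permute over a single 32-bit source, where
// selector byte i picks the source byte that lands in result byte i (byte 0
// is the least significant):

#include <cstdint>
#include <cstdio>

uint32_t permBytes(uint32_t Src, const uint8_t Sel[4]) {
  uint32_t R = 0;
  for (int I = 0; I < 4; ++I)
    R |= ((Src >> (8 * Sel[I])) & 0xffu) << (8 * I);
  return R;
}

int main() {
  const uint8_t SwapI16[4] = {1, 0, 2, 3};   // bswap an i16 in the low half
  const uint8_t SwapV2I16[4] = {1, 0, 3, 2}; // bswap both halves of a v2i16
  std::printf("%08x\n", permBytes(0x11223344u, SwapI16));   // 11224433
  std::printf("%08x\n", permBytes(0x11223344u, SwapV2I16)); // 22114433
}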
- setOperationAction(ISD::Constant, MVT::v2i16, Legal); - setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); - setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); - setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); + setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal); setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); @@ -692,140 +583,98 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v8f16, Promote); AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::LOAD, MVT::v16i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v16f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::STORE, MVT::v16f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v4i32, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v8i32, Expand); - if (!Subtarget->hasVOP3PInsts()) { - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); - } + if (!Subtarget->hasVOP3PInsts()) + setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom); setOperationAction(ISD::FNEG, MVT::v2f16, Legal); // This isn't really legal, but this avoids the legalizer unrolling it (and // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); - setOperationAction(ISD::FMINNUM, MVT::f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); + setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); - setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom); + setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); - setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); - setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); - for 
(MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) { - setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + setOperationAction( + {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, + Vec16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); } } if (Subtarget->hasVOP3PInsts()) { - setOperationAction(ISD::ADD, MVT::v2i16, Legal); - setOperationAction(ISD::SUB, MVT::v2i16, Legal); - setOperationAction(ISD::MUL, MVT::v2i16, Legal); - setOperationAction(ISD::SHL, MVT::v2i16, Legal); - setOperationAction(ISD::SRL, MVT::v2i16, Legal); - setOperationAction(ISD::SRA, MVT::v2i16, Legal); - setOperationAction(ISD::SMIN, MVT::v2i16, Legal); - setOperationAction(ISD::UMIN, MVT::v2i16, Legal); - setOperationAction(ISD::SMAX, MVT::v2i16, Legal); - setOperationAction(ISD::UMAX, MVT::v2i16, Legal); - - setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal); - setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal); - setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal); - - setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FMUL, MVT::v2f16, Legal); - setOperationAction(ISD::FMA, MVT::v2f16, Legal); - - setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); - - setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - - for (MVT VT : { MVT::v4i16, MVT::v8i16 }) { - // Split vector operations. - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::ADD, VT, Custom); - setOperationAction(ISD::SUB, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL, + ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, + ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, + MVT::v2i16, Legal); - setOperationAction(ISD::SMIN, VT, Custom); - setOperationAction(ISD::SMAX, VT, Custom); - setOperationAction(ISD::UMIN, VT, Custom); - setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, + ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, + MVT::v2f16, Legal); - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); - } + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16}, + Custom); + + setOperationAction(ISD::VECTOR_SHUFFLE, + {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, + MVT::v16f16, MVT::v16i16}, + Custom); - for (MVT VT : { MVT::v4f16, MVT::v8f16 }) { + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) // Split vector operations. 
- setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); - } + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, + ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, + ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, + ISD::SSUBSAT}, + VT, Custom); - setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); - setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) + // Split vector operations. + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + VT, Custom); - setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); - setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16}, + Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); - setOperationAction(ISD::SELECT, MVT::v4i16, Custom); - setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom); if (Subtarget->hasPackedFP32Ops()) { - setOperationAction(ISD::FADD, MVT::v2f32, Legal); - setOperationAction(ISD::FMUL, MVT::v2f32, Legal); - setOperationAction(ISD::FMA, MVT::v2f32, Legal); - setOperationAction(ISD::FNEG, MVT::v2f32, Legal); - - for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { - setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - } + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, + MVT::v2f32, Legal); + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA}, + {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, + Custom); } } - setOperationAction(ISD::FNEG, MVT::v4f16, Custom); - setOperationAction(ISD::FABS, MVT::v4f16, Custom); + setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); if (Subtarget->has16BitInsts()) { setOperationAction(ISD::SELECT, MVT::v2i16, Promote); @@ -834,107 +683,88 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { // Legalization hack. 
- setOperationAction(ISD::SELECT, MVT::v2i16, Custom); - setOperationAction(ISD::SELECT, MVT::v2f16, Custom); - - setOperationAction(ISD::FNEG, MVT::v2f16, Custom); - setOperationAction(ISD::FABS, MVT::v2f16, Custom); - } - - for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16 }) { - setOperationAction(ISD::SELECT, VT, Custom); - } - - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::UMULO, MVT::i64, Custom); - - if (Subtarget->hasMad64_32()) { - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); - } - - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); - - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); - - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::ADDCARRY); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::SUBCARRY); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::FMINNUM_IEEE); - setTargetDAGCombine(ISD::FMAXNUM_IEEE); - setTargetDAGCombine(ISD::FMA); - setTargetDAGCombine(ISD::SMIN); - setTargetDAGCombine(ISD::SMAX); - setTargetDAGCombine(ISD::UMIN); - setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::FCANONICALIZE); - setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom); + + 
setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom); + } + + setOperationAction(ISD::SELECT, + {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, + Custom); + + setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); + + if (Subtarget->hasMad64_32()) + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, + {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, + MVT::v2i16, MVT::v2f16}, + Custom); + + setOperationAction(ISD::INTRINSIC_W_CHAIN, + {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, + MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, + MVT::i16, MVT::i8}, + Custom); + + setOperationAction(ISD::INTRINSIC_VOID, + {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, + MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, + MVT::i8}, + Custom); + + setTargetDAGCombine({ISD::ADD, + ISD::ADDCARRY, + ISD::SUB, + ISD::SUBCARRY, + ISD::FADD, + ISD::FSUB, + ISD::FMINNUM, + ISD::FMAXNUM, + ISD::FMINNUM_IEEE, + ISD::FMAXNUM_IEEE, + ISD::FMA, + ISD::SMIN, + ISD::SMAX, + ISD::UMIN, + ISD::UMAX, + ISD::SETCC, + ISD::AND, + ISD::OR, + ISD::XOR, + ISD::SINT_TO_FP, + ISD::UINT_TO_FP, + ISD::FCANONICALIZE, + ISD::SCALAR_TO_VECTOR, + ISD::ZERO_EXTEND, + ISD::SIGN_EXTEND_INREG, + ISD::EXTRACT_VECTOR_ELT, + ISD::INSERT_VECTOR_ELT}); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::ATOMIC_LOAD); - setTargetDAGCombine(ISD::ATOMIC_STORE); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); - setTargetDAGCombine(ISD::ATOMIC_SWAP); - setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); - setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); - setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine({ISD::LOAD, + ISD::STORE, + ISD::ATOMIC_LOAD, + ISD::ATOMIC_STORE, + ISD::ATOMIC_CMP_SWAP, + ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, + ISD::ATOMIC_SWAP, + ISD::ATOMIC_LOAD_ADD, + ISD::ATOMIC_LOAD_SUB, + ISD::ATOMIC_LOAD_AND, + ISD::ATOMIC_LOAD_OR, + ISD::ATOMIC_LOAD_XOR, + ISD::ATOMIC_LOAD_NAND, + ISD::ATOMIC_LOAD_MIN, + ISD::ATOMIC_LOAD_MAX, + ISD::ATOMIC_LOAD_UMIN, + ISD::ATOMIC_LOAD_UMAX, + ISD::ATOMIC_LOAD_FADD, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_W_CHAIN}); // FIXME: In other contexts we pretend this is a per-function property. 
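// "Some folding on the pointer operand is done to help matching the constant
// offsets in the addressing modes": the memory-operation combines registered
// above try to peel a constant off an address so it can live in the
// instruction's immediate-offset field. A toy model; the 12-bit unsigned
// field below is an assumed example width, not a statement about any
// particular AMDGPU addressing mode:

#include <cstdint>
#include <cstdio>

struct Addr {
  uint64_t Base;
  uint16_t ImmOff;
};

Addr foldOffset(uint64_t Base, int64_t Off) {
  if (Off >= 0 && Off < (1 << 12))
    return {Base, static_cast<uint16_t>(Off)}; // fold into the immediate
  return {Base + static_cast<uint64_t>(Off), 0}; // keep it in the base
}

int main() {
  Addr A = foldOffset(0x1000, 16);   // fits the immediate field
  Addr B = foldOffset(0x1000, 8192); // too large, stays in the base
  std::printf("%llx+%u %llx+%u\n", (unsigned long long)A.Base,
              (unsigned)A.ImmOff, (unsigned long long)B.Base,
              (unsigned)B.ImmOff);
}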
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); @@ -1118,6 +948,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + Info.flags = MachineMemOperand::MONone; + if (CI.hasMetadata(LLVMContext::MD_invariant_load)) + Info.flags |= MachineMemOperand::MOInvariant; + if (const AMDGPU::RsrcIntrinsic *RsrcIntr = AMDGPU::lookupRsrcIntrinsic(IntrID)) { AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), @@ -1127,16 +961,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + if (RsrcIntr->IsImage) { - Info.ptrVal = - MFI->getImagePSV(*MF.getSubtarget().getInstrInfo()); + Info.ptrVal = MFI->getImagePSV(TM); Info.align.reset(); } else { - Info.ptrVal = - MFI->getBufferPSV(*MF.getSubtarget().getInstrInfo()); + Info.ptrVal = MFI->getBufferPSV(TM); } - Info.flags = MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MODereferenceable; if (Attr.hasFnAttr(Attribute::ReadOnly)) { unsigned DMaskLanes = 4; @@ -1178,12 +1013,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; // XXX - Should this be volatile without known ordering? Info.flags |= MachineMemOperand::MOVolatile; + + switch (IntrID) { + default: + break; + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: { + unsigned Width = cast(CI.getArgOperand(2))->getZExtValue(); + Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); + return true; + } + } } return true; } @@ -1200,7 +1046,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast(CI.getOperand(4)); if (!Vol->isZero()) @@ -1211,12 +1057,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_buffer_atomic_fadd: { SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.ptrVal = - MFI->getBufferPSV(*MF.getSubtarget().getInstrInfo()); + Info.ptrVal = MFI->getBufferPSV(TM); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast(CI.getOperand(4)); if (!Vol || !Vol->isZero()) @@ -1230,7 +1078,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast(CI.getOperand(1)); if (!Vol->isZero()) @@ -1243,20 +1091,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; return true; } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? - Info.ptrVal = - MFI->getImagePSV(*MF.getSubtarget().getInstrInfo()); + + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + + Info.ptrVal = MFI->getImagePSV(TM); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; return true; } case Intrinsic::amdgcn_global_atomic_fadd: @@ -1264,15 +1115,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: { + case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; return true; } case Intrinsic::amdgcn_ds_gws_init: @@ -1283,18 +1136,29 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_ds_gws_sema_release_all: { Info.opc = ISD::INTRINSIC_VOID; + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + SIMachineFunctionInfo *MFI = MF.getInfo(); - Info.ptrVal = - MFI->getGWSPSV(*MF.getSubtarget().getInstrInfo()); + Info.ptrVal = MFI->getGWSPSV(TM); // This is an abstract access, but we need to specify a type and size. Info.memVT = MVT::i32; Info.size = 4; Info.align = Align(4); - Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) - Info.flags = MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOLoad; + else + Info.flags |= MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_load_lds: { + Info.opc = ISD::INTRINSIC_VOID; + unsigned Width = cast(CI.getArgOperand(2))->getZExtValue(); + Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; return true; } default: @@ -1319,6 +1183,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1506,47 +1372,96 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( AddrSpace == AMDGPUAS::REGION_ADDRESS) { // Check if alignment requirements for ds_read/write instructions are // disabled. 
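// A recurring change in the getTgtMemIntrinsic() hunks above: Info.flags is
// now seeded once (MONone, plus MOInvariant when the call site carries
// invariant-load metadata) and each case ORs its bits in with |= instead of
// overwriting with =. A toy flag set showing why the distinction matters:

#include <cstdio>

enum MemFlags : unsigned {
  MONone = 0,
  MOLoad = 1u << 0,
  MOStore = 1u << 1,
  MOInvariant = 1u << 2,
};

int main() {
  unsigned Flags = MONone;
  bool HasInvariantMD = true; // e.g. !invariant.load on the call
  if (HasInvariantMD)
    Flags |= MOInvariant;

  // Per-intrinsic code adds its own bits later. Plain '=' here would
  // silently drop MOInvariant; '|=' preserves it.
  Flags |= MOLoad | MOStore;

  std::printf("invariant kept: %s\n", (Flags & MOInvariant) ? "yes" : "no");
}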
- if (Subtarget->hasUnalignedDSAccessEnabled() && - !Subtarget->hasLDSMisalignedBug()) { - if (IsFast) - *IsFast = Alignment != Align(2); - return true; - } + if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) + return false; + + Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment. + if (Subtarget->hasLDSMisalignedBug() && Size > 32 && + Alignment < RequiredAlignment) + return false; // Either the alignment requirements are "enabled", or there is an // unaligned LDS access related hardware bug even though alignment requirements // are "disabled". In either case, we need to check for proper alignment // requirements. // - if (Size == 64) { + switch (Size) { + case 64: + // SI has a hardware bug in the LDS / GDS bounds checking: if the base + // address is negative, then the instruction is incorrectly treated as + // out-of-bounds even if base + offset is in bounds. Split vectorized + // loads here to avoid emitting ds_read2_b32. We may re-combine the + // load later in the SILoadStoreOptimizer. + if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) + return false; + // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we // can do a 4 byte aligned, 8 byte access in a single operation using // ds_read2/write2_b32 with adjacent offsets. - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; + RequiredAlignment = Align(4); + + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/ + // ds_write2_b32 depending on the alignment. In either case with either + // alignment there is no faster way of doing this. + if (IsFast) + *IsFast = true; + return true; + } + + break; + case 96: + if (!Subtarget->hasDS96AndDS128()) + return false; - return AlignedBy4; - } - if (Size == 96) { // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on // gfx8 and older. - bool AlignedBy16 = Alignment >= Align(16); - if (IsFast) - *IsFast = AlignedBy16; - return AlignedBy16; - } - if (Size == 128) { + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // Naturally aligned access is fastest. However, also report it as Fast + // if memory is aligned to less than a DWORD. A narrow load or store will + // be equally slow as a single ds_read_b96/ds_write_b96, but there will + // be more of them, so overall we will pay less penalty issuing a single + // instruction. + if (IsFast) + *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4); + return true; + } + + break; + case 128: + if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128()) + return false; + // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a // single operation using ds_read2/write2_b64. - bool AlignedBy8 = Alignment >= Align(8); - if (IsFast) - *IsFast = AlignedBy8; + RequiredAlignment = Align(8); + + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // Naturally aligned access is fastest. However, also report it as Fast + // if memory is aligned to less than a DWORD. A narrow load or store will + // be equally slow as a single ds_read_b128/ds_write_b128, but there + // will be more of them, so overall we will pay less penalty issuing a + // single instruction.
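// The 64/96/128-bit DS cases above reduce to: the required LDS alignment is
// the natural alignment, relaxed to 4 bytes for b64 (ds_read2/write2_b32 can
// do the pair) and to 8 bytes for b128 (ds_read2/write2_b64), with anything
// permitted once unaligned DS access is enabled. A condensed sketch that
// deliberately drops the hasLDSMisalignedBug/hasUsableDSOffset special cases:

#include <cstdio>

bool dsAccessAllowed(unsigned SizeBits, unsigned AlignBytes,
                     bool UnalignedDSEnabled) {
  unsigned Required = SizeBits / 8; // natural alignment
  switch (SizeBits) {
  case 64:
    Required = 4; // 4-byte aligned pair via ds_read2_b32/ds_write2_b32
    break;
  case 96:
    Required = 16; // ds_read_b96/ds_write_b96 want 16 bytes on gfx8 and older
    break;
  case 128:
    Required = 8; // 8-byte aligned pair via ds_read2_b64/ds_write2_b64
    break;
  default:
    break;
  }
  return AlignBytes >= Required || UnalignedDSEnabled;
}

int main() {
  std::printf("b64@4: %d b96@8: %d b128@8: %d\n",
              dsAccessAllowed(64, 4, false), dsAccessAllowed(96, 8, false),
              dsAccessAllowed(128, 8, false)); // 1 0 1
}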
+ if (IsFast) + *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4); + return true; + } - return AlignedBy8; + break; + default: + if (Size > 32) + return false; + + break; + } + + if (IsFast) + *IsFast = Alignment >= RequiredAlignment; + + return Alignment >= RequiredAlignment || + Subtarget->hasUnalignedDSAccessEnabled(); } if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { @@ -1571,14 +1486,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } - if (Subtarget->hasUnalignedBufferAccessEnabled() && - !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || - AddrSpace == AMDGPUAS::REGION_ADDRESS)) { - // If we have an uniform constant load, it still requires using a slow + if (Subtarget->hasUnalignedBufferAccessEnabled()) { + // If we have a uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so - // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. + // 2-byte alignment is worse than 1 unless doing a 2-byte access. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? Alignment >= Align(4) : Alignment != Align(2); @@ -1603,20 +1516,22 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( bool SITargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *IsFast) const { - if (IsFast) - *IsFast = false; - - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; + bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Alignment, Flags, IsFast); + + if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() && + (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS)) { + // Lie that it is fast if +unaligned-access-mode is passed so that DS accesses + // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on + // misaligned data, which is faster than a pair of ds_read_b*/ds_write_b* + // which would be equally misaligned. + // This is only used by the common passes; selection always calls the + // allowsMisalignedMemoryAccessesImpl version. + *IsFast = true; } - return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Alignment, Flags, IsFast); + return Allow; } EVT SITargetLowering::getOptimalMemOpType( @@ -1639,9 +1554,7 @@ EVT SITargetLowering::getOptimalMemOpType( bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast(N); - const Value *Ptr = MemNode->getMemOperand()->getValue(); - const Instruction *I = dyn_cast_or_null(Ptr); - return I && I->getMetadata("amdgpu.noclobber"); + return MemNode->getMemOperand()->getFlags() & MONoClobber; } bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { @@ -1681,6 +1594,15 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // TODO: Add more cases that are cheap.
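// The isExtractSubvectorCheap() hook added here answers "is this extract
// free?": taking the subvector at Index == 0 is just a subregister read of
// the source, while any other offset would need lane moves or shuffles. A
// sketch of the same decision over plain element counts:

#include <cstdio>

bool isExtractSubvectorCheap(unsigned SrcElts, unsigned ResElts,
                             unsigned Index) {
  if (ResElts > SrcElts)
    return false; // not a subvector extraction at all
  return Index == 0; // the low part aliases the source registers
}

int main() {
  std::printf("v8->v4 @0: %d, v8->v4 @4: %d\n",
              isExtractSubvectorCheap(8, 4, 0),
              isExtractSubvectorCheap(8, 4, 4)); // 1, 0
}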
+ return Index == 0; +} + bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { if (Subtarget->has16BitInsts() && VT == MVT::i16) { switch (Op) { @@ -2106,7 +2028,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (Info.hasQueuePtr()) + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2153,7 +2075,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -2190,6 +2112,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const { + if (Subtarget->hasUserSGPRInit16Bug()) { + // Pad up the used user SGPRs with dead inputs. + unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); + + // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to + // rely on it to reach 16 since if we end up having no stack usage, it will + // not really be added. + unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() + + Info.hasWorkGroupIDY() + + Info.hasWorkGroupIDZ() + + Info.hasWorkGroupInfo(); + for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) { + Register Reg = Info.addReservedUserSGPR(); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + } + if (Info.hasWorkGroupIDX()) { Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); @@ -2234,6 +2174,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } + + assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16); } static void reservePrivateMemoryRegs(const TargetMachine &TM, @@ -2388,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } - Info->allocateModuleLDSGlobal(Fn.getParent()); + Info->allocateModuleLDSGlobal(Fn); SmallVector Splits; SmallVector ArgLocs; @@ -2538,7 +2480,13 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); Register Reg = VA.getLocReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = nullptr; + if (AMDGPU::VGPR_32RegClass.contains(Reg)) + RC = &AMDGPU::VGPR_32RegClass; + else if (AMDGPU::SGPR_32RegClass.contains(Reg)) + RC = &AMDGPU::SGPR_32RegClass; + else + llvm_unreachable("Unexpected register class in LowerFormalArguments!"); EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); @@ -2657,24 +2605,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) - // Add return address for callable functions. 
- if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - - SDValue ReturnAddrVirtualReg = - DAG.getRegister(MF.getRegInfo().createVirtualRegister( - CallConv != CallingConv::AMDGPU_Gfx - ? &AMDGPU::CCR_SGPR_64RegClass - : &AMDGPU::Gfx_CCR_SGPR_64RegClass), - MVT::i64); - Chain = - DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(ReturnAddrVirtualReg); - } - // Copy the result values into the output registers. for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; ++I, ++RealRVLocIdx) { @@ -2731,15 +2661,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(Flag); unsigned Opc = AMDGPUISD::ENDPGM; - if (!IsWaveEnd) { - if (IsShader) - Opc = AMDGPUISD::RETURN_TO_EPILOG; - else if (CallConv == CallingConv::AMDGPU_Gfx) - Opc = AMDGPUISD::RET_GFX_FLAG; - else - Opc = AMDGPUISD::RET_FLAG; - } - + if (!IsWaveEnd) + Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -3321,21 +3244,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } - SDValue PhysReturnAddrReg; - if (IsTailCall) { - // Since the return is being combined with the call, we need to pass on the - // return address. - - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - - PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), - MVT::i64); - Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag); - InFlag = Chain.getValue(1); - } - // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be @@ -3365,8 +3273,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); - - Ops.push_back(PhysReturnAddrReg); } // Add argument registers to the end of the list so that they are known live @@ -4104,6 +4010,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + if (IsAdd && ST.hasLshlAddB64()) { + auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), + Dest.getReg()) + .add(Src0) + .addImm(0) + .add(Src1); + TII->legalizeOperands(*Add); + MI.eraseFromParent(); + return BB; + } + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -4112,10 +4033,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register CarryReg = MRI.createVirtualRegister(CarryRC); Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); - MachineOperand &Dest = MI.getOperand(0); - MachineOperand &Src0 = MI.getOperand(1); - MachineOperand &Src1 = MI.getOperand(2); - const TargetRegisterClass *Src0RC = Src0.isReg() ? 
MRI.getRegClass(Src0.getReg()) : &AMDGPU::VReg_64RegClass; @@ -4390,29 +4307,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_INIT: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_BARRIER: - if (Subtarget->needsAlignedVGPRs()) { - // Add implicit aligned super-reg to force alignment on the data operand. - const DebugLoc &DL = MI.getDebugLoc(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - Register DataReg = Op->getReg(); - bool IsAGPR = TRI->isAGPR(MRI, DataReg); - Register Undef = MRI.createVirtualRegister( - IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); - Register NewVR = - MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass - : &AMDGPU::VReg_64_Align2RegClass); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(DataReg, 0, Op->getSubReg()) - .addImm(AMDGPU::sub0) - .addReg(Undef) - .addImm(AMDGPU::sub1); - Op->setReg(NewVR); - Op->setSubReg(AMDGPU::sub0); - MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); - } + TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); LLVM_FALLTHROUGH; case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: @@ -4500,6 +4395,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const { return isTypeLegal(VT.getScalarType()); } +bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const { + switch (Op.getValue(0).getSimpleValueType().SimpleTy) { + case MVT::f32: + return Subtarget->hasAtomicFaddRtnInsts(); + case MVT::v2f16: + case MVT::f64: + return Subtarget->hasGFX90AInsts(); + default: + return false; + } +} + bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // This currently forces unfolding various combinations of fsub into fma with // free fneg'd operands. As long as we have fast FMA (controlled by @@ -4560,7 +4467,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, // Otherwise f32 mad is always full rate and returns the same result as // the separate operations so should be preferred over fma. - // However does not support denomals. + // However does not support denormals. 
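The new hasAtomicFaddRtnForTy hook above is a pure type-to-feature table. A minimal sketch of the mapping it encodes, with plain booleans standing in for the two subtarget queries (illustrative, not the LLVM API):

// Which types may use the value-returning buffer atomic fadd, mirroring the
// switch in hasAtomicFaddRtnForTy: f32 needs the returning-fadd instruction
// set, v2f16 and f64 need the gfx90a instruction set.
enum class FaddTy { F32, V2F16, F64, Other };

bool hasAtomicFaddRtn(FaddTy Ty, bool HasAtomicFaddRtnInsts,
                      bool HasGFX90AInsts) {
  switch (Ty) {
  case FaddTy::F32:
    return HasAtomicFaddRtnInsts;
  case FaddTy::V2F16:
  case FaddTy::F64:
    return HasGFX90AInsts;
  default:
    return false;
  }
}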
if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); @@ -4653,8 +4560,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4676,8 +4584,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || - VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; SDValue Op0 = Op.getOperand(0); @@ -4738,10 +4647,30 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + case ISD::FPTRUNC_ROUND: { + unsigned Opc; + SDLoc DL(Op); + + if (Op.getOperand(0)->getValueType(0) != MVT::f32) + return SDValue(); + + // Get the rounding mode from the last operand + int RoundMode = cast(Op.getOperand(1))->getZExtValue(); + if (RoundMode == (int)RoundingMode::TowardPositive) + Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD; + else if (RoundMode == (int)RoundingMode::TowardNegative) + Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD; + else + return SDValue(); + + return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0)); + } case ISD::TRAP: return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: @@ -5356,7 +5285,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5439,24 +5368,41 @@ SDValue SITargetLowering::lowerTrapEndpgm( return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); } +SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, + const SDLoc &DL, Align Alignment, ImplicitParameter Param) const { + MachineFunction &MF = DAG.getMachineFunction(); + uint64_t Offset = getImplicitParameterOffset(MF, Param); + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); +} + SDValue SITargetLowering::lowerTrapHsaQueuePtr( SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *Info = MF.getInfo(); - Register UserSGPR = Info->getQueuePtrUserSGPR(); - SDValue QueuePtr; - if (UserSGPR == AMDGPU::NoRegister) { - // We probably are in a function incorrectly marked with - // 
amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap, - // so just use a null pointer. - QueuePtr = DAG.getConstant(0, SL, MVT::i64); + // For code object version 5, QueuePtr is passed through implicit kernarg. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + QueuePtr = + loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); } else { - QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); + + if (UserSGPR == AMDGPU::NoRegister) { + // We probably are in a function incorrectly marked with + // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the + // trap, so just use a null pointer. + QueuePtr = DAG.getConstant(0, SL, MVT::i64); + } else { + QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, + MVT::i64); + } } SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); @@ -5532,6 +5478,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); } + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + ImplicitParameter Param = + (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; + return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo(); Register UserSGPR = Info->getQueuePtrUserSGPR(); @@ -5691,14 +5645,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, EVT EltVT = VecVT.getVectorElementType(); unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + SDLoc SL(Op); - - assert(VecSize <= 64); - + // Specially handle the case of v4i16 with static indexing. unsigned NumElts = VecVT.getVectorNumElements(); - SDLoc SL(Op); auto KIdx = dyn_cast(Idx); - if (NumElts == 4 && EltSize == 16 && KIdx) { SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); @@ -5726,35 +5677,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); } + // Static indexing does not lower to stack access, and hence there is no need + // for special custom lowering to avoid stack access. if (isa(Idx)) return SDValue(); - MVT IntVT = MVT::getIntegerVT(VecSize); - - // Avoid stack access for dynamic indexing. + // Avoid stack access for dynamic indexing by custom lowering to // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - // Create a congruent vector with the target value in each element so that - // the required element can be masked and ORed into the target vector. - SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, - DAG.getSplatBuildVector(VecVT, SL, InsVal)); + assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); + + MVT IntVT = MVT::getIntegerVT(VecSize); + // Convert vector index to bit-index and get the required bit mask. assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - - // Convert vector index to bit-index. SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); + // 1. 
Create a congruent vector with the target value in each element.
+  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+                               DAG.getSplatBuildVector(VecVT, SL, InsVal));
+
+  // 2. Mask off all other indices except the required index within (1).
   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+
+  // 3. Mask off the required index within the target vector.
+  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
   SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
                             DAG.getNOT(SL, BFM, IntVT), BCVec);
 
+  // 4. OR (2) and (3) to produce the updated target vector.
   SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+
   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
 }
 
@@ -5778,17 +5735,35 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
     return Combined;
 
-  if (VecSize == 128) {
+  if (VecSize == 128 || VecSize == 256) {
     SDValue Lo, Hi;
     EVT LoVT, HiVT;
-    SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
-    Lo =
-        DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
-                                         V2, DAG.getConstant(0, SL, MVT::i32)));
-    Hi =
-        DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
-                                         V2, DAG.getConstant(1, SL, MVT::i32)));
+
+    if (VecSize == 128) {
+      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+      Lo = DAG.getBitcast(LoVT,
+                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+                                      DAG.getConstant(0, SL, MVT::i32)));
+      Hi = DAG.getBitcast(HiVT,
+                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+                                      DAG.getConstant(1, SL, MVT::i32)));
+    } else {
+      assert(VecSize == 256);
+
+      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
+      SDValue Parts[4];
+      for (unsigned P = 0; P < 4; ++P) {
+        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+                               DAG.getConstant(P, SL, MVT::i32));
+      }
+
+      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+                                            Parts[0], Parts[1]));
+      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+                                            Parts[2], Parts[3]));
+    }
+
     EVT IdxVT = Idx.getValueType();
     unsigned NElem = VecVT.getVectorNumElements();
     assert(isPowerOf2_32(NElem));
@@ -5800,10 +5775,19 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
 
   assert(VecSize <= 64);
 
+  MVT IntVT = MVT::getIntegerVT(VecSize);
+
+  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
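The numbered BFM/BFI steps in lowerINSERT_VECTOR_ELT above are ordinary bit arithmetic once the vector is viewed as one integer. A self-contained check of the merge for a v4i16 held in a uint64_t (a model of what the selected code computes, not the DAG construction itself):

#include <cassert>
#include <cstdint>

// Insert a 16-bit element into a v4i16 (as uint64_t) at a dynamic index,
// following the same four steps as the lowering above.
uint64_t insertElt(uint64_t Vec, uint16_t Val, unsigned Idx) {
  unsigned ScaledIdx = Idx * 16;                // vector index -> bit index
  uint64_t BFM = 0xffffULL << ScaledIdx;        // mask selecting the slot
  uint64_t Splat = 0x0001000100010001ULL * Val; // 1. splat Val to all lanes
  uint64_t LHS = BFM & Splat;                   // 2. keep only the new lane
  uint64_t RHS = ~BFM & Vec;                    // 3. clear that lane in Vec
  return LHS | RHS;                             // 4. merge
}

int main() {
  uint64_t V = 0x4444333322221111ULL; // <0x1111, 0x2222, 0x3333, 0x4444>
  assert(insertElt(V, 0xbeef, 2) == 0x4444beef22221111ULL);
}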
+ SDValue VecBC = peekThroughBitcasts(Vec); + if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { + SDValue Src = VecBC.getOperand(0); + Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src); + Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT); + } + unsigned EltSize = EltVT.getSizeInBits(); assert(isPowerOf2_32(EltSize)); - MVT IntVT = MVT::getIntegerVT(VecSize); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); // Convert vector index to bit-index (* EltSize) @@ -5877,6 +5861,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); } +SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue SVal = Op.getOperand(0); + EVT ResultVT = Op.getValueType(); + EVT SValVT = SVal.getValueType(); + SDValue UndefVal = DAG.getUNDEF(SValVT); + SDLoc SL(Op); + + SmallVector VElts; + VElts.push_back(SVal); + for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) + VElts.push_back(UndefVal); + + return DAG.getBuildVector(ResultVT, SL, VElts); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -5906,6 +5906,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v16i16 || VT == MVT::v16f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 4); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector Parts[4]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { + for (unsigned P = 0; P < 4; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[4]; + for (unsigned P = 0; P < 4; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -6277,6 +6298,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); + bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); SmallVector ResultTypes(Op->values()); SmallVector OrigResultTypes(Op->values()); @@ -6455,6 +6477,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. + // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3 && VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); @@ -6561,7 +6587,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? 
AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, @@ -6685,6 +6716,32 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, return Loads[0]; } +SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, + unsigned Dim, + const ArgDescriptor &Arg) const { + SDLoc SL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); + if (MaxID == 0) + return DAG.getConstant(0, SL, MVT::i32); + + SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), Arg); + + // Don't bother inserting AssertZext for packed IDs since we're emitting the + // masking operations anyway. + // + // TODO: We could assert the top bit is 0 for the source copy. + if (Arg.isMasked()) + return Val; + + // Preserve the known bits after expansion to a copy. + EVT SmallVT = + EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID)); + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, + DAG.getValueType(SmallVT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -6831,26 +6888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDX); + return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDY); + return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDZ); + return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); case Intrinsic::amdgcn_wavefrontsize: return DAG.getConstant(MF.getSubtarget().getWavefrontSize(), SDLoc(Op), MVT::i32); @@ -7157,12 +7199,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); unsigned Offset0 = OrderedCountIndex << 2; - unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | - (Instruction << 4); + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) Offset1 |= (CountDw - 1) << 6; + if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) + Offset1 |= ShaderType << 2; + unsigned Offset = Offset0 | (Offset1 << 8); SDValue Ops[] = { @@ -7441,7 +7485,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics not supported", 
@@ -7609,12 +7653,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return SDValue(); } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; const bool Is64 = NodePtr.getValueType() == MVT::i64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = Subtarget->hasNSAEncoding() && - NumVAddrDwords <= Subtarget->getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; + const bool UseNSA = + Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -7622,12 +7668,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, int Opcode; if (UseNSA) { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, - NumVAddrDwords); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); } else { - Opcode = AMDGPU::getMIMGOpcode( - BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + Opcode = + AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); @@ -7660,15 +7709,36 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } }; - if (Is64) - DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2); - else + if (UseNSA && IsGFX11Plus) { Ops.push_back(NodePtr); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + Ops.push_back(RayOrigin); + if (IsA16) { + SmallVector DirLanes, InvDirLanes, MergedLanes; + DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); + DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); + for (unsigned I = 0; I < 3; ++I) { + MergedLanes.push_back(DAG.getBitcast( + MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, + {DirLanes[I], InvDirLanes[I]}))); + } + Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); + } else { + Ops.push_back(RayDir); + Ops.push_back(RayInvDir); + } + } else { + if (Is64) + DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, + 2); + else + Ops.push_back(NodePtr); - Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); - packLanes(RayOrigin, true); - packLanes(RayDir, true); - packLanes(RayInvDir, false); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + packLanes(RayOrigin, true); + packLanes(RayDir, true); + packLanes(RayInvDir, false); + } if (!UseNSA) { // Build a single vector containing all the operands so far prepared. @@ -7868,6 +7938,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_exp_compr: { + if (!Subtarget->hasCompressedExport()) { + DiagnosticInfoUnsupported BadIntrin( + DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + } SDValue Src0 = Op.getOperand(4); SDValue Src1 = Op.getOperand(5); // Hack around illegal type on SI by directly selecting it. 
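On the GFX11 NSA path above, each pair of f16 ray-direction / inverse-direction lanes is merged into one 32-bit register through a two-element v2f16 build vector, so three dwords carry all six half-precision lanes. In raw bits, assuming the usual lane order with element 0 in the low half of the dword:

#include <cassert>
#include <cstdint>

// Pack one f16 dir lane and the matching inv-dir lane (given as raw half
// bit patterns) into a single dword, as the merged v2f16 build above does.
uint32_t packLanePair(uint16_t DirBits, uint16_t InvDirBits) {
  return uint32_t(DirBits) | (uint32_t(InvDirBits) << 16);
}

int main() {
  // half 1.0 = 0x3c00, half -1.0 = 0xbc00
  assert(packLanePair(0x3c00, 0xbc00) == 0xbc003c00u);
}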
@@ -8110,6 +8186,160 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: { + unsigned Opc; + bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds; + unsigned OpOffset = HasVIndex ? 1 : 0; + SDValue VOffset = Op.getOperand(5 + OpOffset); + auto CVOffset = dyn_cast(VOffset); + bool HasVOffset = !CVOffset || !CVOffset->isZero(); + unsigned Size = Op->getConstantOperandVal(4); + + switch (Size) { + default: + return SDValue(); + case 1: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; + break; + case 2: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; + break; + case 4: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; + break; + } + + SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + + SmallVector Ops; + + if (HasVIndex && HasVOffset) + Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, + { Op.getOperand(5), // VIndex + VOffset })); + else if (HasVIndex) + Ops.push_back(Op.getOperand(5)); + else if (HasVOffset) + Ops.push_back(VOffset); + + Ops.push_back(Op.getOperand(2)); // rsrc + Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset + Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset + unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); + Ops.push_back( + DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol + Ops.push_back( + DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz + Ops.push_back(M0Val.getValue(0)); // Chain + Ops.push_back(M0Val.getValue(1)); // Glue + + auto *M = cast(Op); + MachineMemOperand *LoadMMO = M->getMemOperand(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset); + MachinePointerInfo StorePtrI = LoadPtrI; + StorePtrI.V = nullptr; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + + MachineMemOperand *StoreMMO = + MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), LoadMMO->getBaseAlign()); + + auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); + DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + + return SDValue(Load, 0); + } + case Intrinsic::amdgcn_global_load_lds: { + unsigned Opc; + unsigned Size = Op->getConstantOperandVal(4); + switch (Size) { + default: + return SDValue(); + case 1: + Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; + break; + case 2: + Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; + break; + case 4: + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; + break; + } + + auto *M = cast(Op); + SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + + SmallVector Ops; + + SDValue Addr = Op.getOperand(2); // Global ptr + SDValue VOffset; + // Try to split SAddr and VOffset. 
Global and LDS pointers share the same + // immediate offset, so we cannot use a regular SelectGlobalSAddr(). + if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { + SDValue LHS = Addr.getOperand(0); + SDValue RHS = Addr.getOperand(1); + + if (LHS->isDivergent()) + std::swap(LHS, RHS); + + if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOperand(0).getValueType() == MVT::i32) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + Addr = LHS; + VOffset = RHS.getOperand(0); + } + } + + Ops.push_back(Addr); + if (!Addr->isDivergent()) { + Opc = AMDGPU::getGlobalSaddrOp(Opc); + if (!VOffset) + VOffset = SDValue( + DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, + DAG.getTargetConstant(0, DL, MVT::i32)), 0); + Ops.push_back(VOffset); + } + + Ops.push_back(Op.getOperand(5)); // Offset + Ops.push_back(Op.getOperand(6)); // CPol + Ops.push_back(M0Val.getValue(0)); // Chain + Ops.push_back(M0Val.getValue(1)); // Glue + + MachineMemOperand *LoadMMO = M->getMemOperand(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = Op->getConstantOperandVal(5); + MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = + MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), Align(4)); + + auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + + return SDValue(Load, 0); + } case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); @@ -8271,7 +8501,7 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - if (Ld->getAlignment() < 4 || Ld->isDivergent()) + if (Ld->getAlign() < Align(4) || Ld->isDivergent()) return SDValue(); // FIXME: Constant loads should all be marked invariant. @@ -8296,14 +8526,11 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const // TODO: Drop only high part of range. 
SDValue Ptr = Ld->getBasePtr(); - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - MVT::i32, SL, Ld->getChain(), Ptr, - Ld->getOffset(), - Ld->getPointerInfo(), MVT::i32, - Ld->getAlignment(), - Ld->getMemOperand()->getFlags(), - Ld->getAAInfo(), - nullptr); // Drop ranges + SDValue NewLoad = DAG.getLoad( + ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), + Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), + nullptr); // Drop ranges EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); if (MemVT.isFloatingPoint()) { @@ -8392,17 +8619,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned Alignment = Load->getAlignment(); + Align Alignment = Load->getAlign(); unsigned AS = Load->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && - AS == AMDGPUAS::FLAT_ADDRESS && - Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { + if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { return SplitVectorLoad(Op, DAG); } MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -8413,7 +8639,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); @@ -8429,7 +8655,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) { + Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); @@ -8479,27 +8705,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_read_b128 or ds_read_b96 when possible. - if (Subtarget->hasDS96AndDS128() && - ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) || - MemVT.getStoreSize() == 12) && - allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, - Load->getAlign())) + bool Fast = false; + auto Flags = Load->getMemOperand()->getFlags(); + if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, + Load->getAlign(), Flags, &Fast) && + Fast) return SDValue(); - if (NumElements > 2) + if (MemVT.isVector()) return SplitVectorLoad(Op, DAG); - - // SI has a hardware bug in the LDS / GDS boounds checking: if the base - // address is negative, then the instruction is incorrectly treated as - // out-of-bounds even if base + offsets is in bounds. 
Split vectorized - // loads here to avoid emitting ds_read2_b32. We may re-combine the - // load later in the SILoadStoreOptimizer. - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && - NumElements == 2 && MemVT.getStoreSize() == 8 && - Load->getAlignment() < 8) { - return SplitVectorLoad(Op, DAG); - } } if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), @@ -8514,7 +8728,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); @@ -8946,13 +9160,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && - Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) { + Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { return SplitVectorStore(Op, DAG); } MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -8990,39 +9204,21 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_write_b128 or ds_write_b96 when possible. - if (Subtarget->hasDS96AndDS128() && - ((Subtarget->useDS128() && VT.getStoreSize() == 16) || - (VT.getStoreSize() == 12)) && - allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, - Store->getAlign())) + bool Fast = false; + auto Flags = Store->getMemOperand()->getFlags(); + if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, + Store->getAlign(), Flags, &Fast) && + Fast) return SDValue(); - if (NumElements > 2) + if (VT.isVector()) return SplitVectorStore(Op, DAG); - // SI has a hardware bug in the LDS / GDS boounds checking: if the base - // address is negative, then the instruction is incorrectly treated as - // out-of-bounds even if base + offsets is in bounds. Split vectorized - // stores here to avoid emitting ds_write2_b32. We may re-combine the - // store later in the SILoadStoreOptimizer. - if (!Subtarget->hasUsableDSOffset() && - NumElements == 2 && VT.getStoreSize() == 8 && - Store->getAlignment() < 8) { - return SplitVectorStore(Op, DAG); - } - - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - if (VT.isVector()) - return SplitVectorStore(Op, DAG); - return expandUnalignedStore(Store, DAG); - } - - return SDValue(); - } else { - llvm_unreachable("unhandled address space"); + return expandUnalignedStore(Store, DAG); } + + // Probably an invalid store. If so we'll end up emitting a selection error. 
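With this change both LowerLOAD and LowerSTORE funnel the LDS case through one three-way decision instead of the old per-size special cases: keep the access when the misaligned-access hook reports it both legal and fast, split vectors, expand everything else. Sketched with illustrative names:

enum class Legalization { Keep, SplitVector, Expand };

// LDS path shared by LowerLOAD/LowerSTORE above: one query to the alignment
// hook replaces the former ds_read_b96/b128 and SI-bug special cases.
Legalization legalizeLdsAccess(bool AllowedAndFast, bool IsVector) {
  if (AllowedAndFast)
    return Legalization::Keep;        // select ds_read*/ds_write* directly
  if (IsVector)
    return Legalization::SplitVector; // retry with smaller pieces
  return Legalization::Expand;        // byte-wise unaligned expansion
}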
+ return SDValue(); } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -10041,7 +10237,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( } } - // If one half is undef, and one is constant, perfer a splat vector rather + // If one half is undef, and one is constant, prefer a splat vector rather // than the normal qNaN. If it's a register, prefer 0.0 since that's // cheaper to use and may be free with a packed operation. if (NewElts[0].isUndef()) { @@ -10349,7 +10545,8 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, // expanded into a set of cmp/select instructions. bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, - bool IsDivergentIdx) { + bool IsDivergentIdx, + const GCNSubtarget *Subtarget) { if (UseDivergentRegisterIndexing) return false; @@ -10371,10 +10568,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, // Large vectors would yield too many compares and v_cndmask_b32 instructions. unsigned NumInsts = NumElem /* Number of compares */ + ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; - return NumInsts <= 16; + + // On some architectures (GFX9) movrel is not available and it's better + // to expand. + if (!Subtarget->hasMovrel()) + return NumInsts <= 16; + + // If movrel is available, use it instead of expanding for vector of 8 + // elements. + return NumInsts <= 15; } -static bool shouldExpandVectorDynExt(SDNode *N) { +bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { SDValue Idx = N->getOperand(N->getNumOperands() - 1); if (isa(Idx)) return false; @@ -10385,8 +10590,8 @@ static bool shouldExpandVectorDynExt(SDNode *N) { unsigned EltSize = EltVT.getSizeInBits(); unsigned NumElem = VecVT.getVectorNumElements(); - return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - Idx->isDivergent()); + return SITargetLowering::shouldExpandVectorDynExt( + EltSize, NumElem, Idx->isDivergent(), getSubtarget()); } SDValue SITargetLowering::performExtractVectorEltCombine( @@ -10450,7 +10655,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( unsigned EltSize = EltVT.getSizeInBits(); // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) - if (::shouldExpandVectorDynExt(N)) { + if (shouldExpandVectorDynExt(N)) { SDLoc SL(N); SDValue Idx = N->getOperand(1); SDValue V; @@ -10513,7 +10718,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N, // INSERT_VECTOR_ELT (, var-idx) // => BUILD_VECTOR n x select (e, const-idx) - if (!::shouldExpandVectorDynExt(N)) + if (!shouldExpandVectorDynExt(N)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -10603,39 +10808,145 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } -SDValue SITargetLowering::performAddCombine(SDNode *N, +// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high +// multiplies, if any. +// +// Full 64-bit multiplies that feed into an addition are lowered here instead +// of using the generic expansion. The generic expansion ends up with +// a tree of ADD nodes that prevents us from using the "add" part of the +// MAD instruction. The expansion produced here results in a chain of ADDs +// instead of a tree. 
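The cost check in shouldExpandVectorDynExt above prices the cmp/select expansion at one compare per element plus one v_cndmask_b32 per 32-bit chunk per element, and lowers the budget from 16 to 15 when movrel is available. The arithmetic, with the function's other early-outs (constant index, divergent-indexing option) omitted:

// NumElem compares + ceil(EltSize/32) * NumElem cndmasks, as above.
bool shouldExpandDynExt(unsigned EltSizeBits, unsigned NumElem,
                        bool HasMovrel) {
  unsigned NumInsts = NumElem + ((EltSizeBits + 31) / 32) * NumElem;
  return NumInsts <= (HasMovrel ? 15u : 16u);
}
// e.g. an 8 x 32-bit vector costs 8 + 8 = 16 instructions, so it is only
// expanded when movrel is unavailable.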
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { + assert(N->getOpcode() == ISD::ADD); + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDLoc SL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) - && Subtarget->hasMad64_32() && - !VT.isVector() && VT.getScalarSizeInBits() > 32 && - VT.getScalarSizeInBits() <= 64) { - if (LHS.getOpcode() != ISD::MUL) - std::swap(LHS, RHS); + if (VT.isVector()) + return SDValue(); - SDValue MulLHS = LHS.getOperand(0); - SDValue MulRHS = LHS.getOperand(1); - SDValue AddRHS = RHS; + // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall + // result in scalar registers for uniform values. + if (!N->isDivergent() && Subtarget->hasSMulHi()) + return SDValue(); + + unsigned NumBits = VT.getScalarSizeInBits(); + if (NumBits <= 32 || NumBits > 64) + return SDValue(); + + if (LHS.getOpcode() != ISD::MUL) { + assert(RHS.getOpcode() == ISD::MUL); + std::swap(LHS, RHS); + } + + // Avoid the fold if it would unduly increase the number of multiplies due to + // multiple uses, except on hardware with full-rate multiply-add (which is + // part of full-rate 64-bit ops). + if (!Subtarget->hasFullRate64Ops()) { + unsigned NumUsers = 0; + for (SDNode *Use : LHS->uses()) { + // There is a use that does not feed into addition, so the multiply can't + // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. + if (Use->getOpcode() != ISD::ADD) + return SDValue(); - // TODO: Maybe restrict if SGPR inputs. - if (numBitsUnsigned(MulLHS, DAG) <= 32 && - numBitsUnsigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer + // MUL + 3xADD + 3xADDC over 3xMAD. + ++NumUsers; + if (NumUsers >= 3) + return SDValue(); } + } + + SDValue MulLHS = LHS.getOperand(0); + SDValue MulRHS = LHS.getOperand(1); + SDValue AddRHS = RHS; + + // Always check whether operands are small unsigned values, since that + // knowledge is useful in more cases. Check for small signed values only if + // doing so can unlock a shorter code sequence. + bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; + bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; + + bool MulSignedLo = false; + if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { + MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 && + numBitsSigned(MulRHS, DAG) <= 32; + } - if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + // The operands and final result all have the same number of bits. If + // operands need to be extended, they can be extended with garbage. The + // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is + // truncated away in the end. + if (VT != MVT::i64) { + MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); + MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); + AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); + } + + // The basic code generated is conceptually straightforward. 
Pseudo code: + // + // accum = mad_64_32 lhs.lo, rhs.lo, accum + // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi + // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi + // + // The second and third lines are optional, depending on whether the factors + // are {sign,zero}-extended or not. + // + // The actual DAG is noisier than the pseudo code, but only due to + // instructions that disassemble values into low and high parts, and + // assemble the final result. + SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); + auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS); + SDValue Accum = + getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); + + if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { + auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero); + auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One); + + if (!MulLHSUnsigned32) { + auto MulLHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); + } + + if (!MulRHSUnsigned32) { + auto MulRHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); + } + + Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi}); + Accum = DAG.getBitcast(MVT::i64, Accum); + } + + if (VT != MVT::i64) + Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum); + return Accum; +} + +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; } return SDValue(); @@ -10763,7 +11074,7 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SDValue RHS = N->getOperand(1); // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. + // source modifiers is a pain. // fadd (fadd (a, a), b) -> mad 2.0, a, b if (LHS.getOpcode() == ISD::FADD) { @@ -10860,8 +11171,8 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, return SDValue(); // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, - // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract - // is sufficient to allow generaing fdot2. + // regardless of the denorm mode setting. Therefore, + // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || (N->getFlags().hasAllowContract() && @@ -11562,7 +11873,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const { if (DstSize < InitIdx) return; - // Create a register for the intialization value. + // Create a register for the initialization value. 
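The pseudo code above is the schoolbook decomposition of a 64x64-bit multiply-add: modulo 2^64 the two cross products only affect the high half, and the hi*hi term vanishes entirely. A quick correctness check in plain integer arithmetic:

#include <cassert>
#include <cstdint>

// accum = mad_u64_u32(lo(a), lo(b), c), then fold both cross products into
// the high half with 32-bit multiplies, as the pseudo code describes.
uint64_t mad64_32Chain(uint64_t A, uint64_t B, uint64_t C) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint64_t Accum = uint64_t(ALo) * BLo + C;  // accum = mad_64_32 lo, lo, c
  uint32_t AccumHi = uint32_t(Accum >> 32);
  AccumHi += AHi * BLo;                      // accum.hi += mul lhs.hi, rhs.lo
  AccumHi += ALo * BHi;                      // accum.hi += mul lhs.lo, rhs.hi
  return (uint64_t(AccumHi) << 32) | uint32_t(Accum);
}

int main() {
  uint64_t A = 0x123456789abcdef0, B = 0xfedcba9876543210, C = 42;
  assert(mad64_32Chain(A, B, C) == A * B + C); // equal modulo 2^64
}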
Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); unsigned NewDst = 0; // Final initialized value will be in here @@ -11608,7 +11919,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, TII->legalizeOperandsVOP3(MRI, MI); // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better ballance register + // This saves a chain-copy of registers and better balance register // use between vgpr and agpr as agpr tuples tend to be big. if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); @@ -11633,54 +11944,29 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // so no use checks are needed. MRI.setRegClass(Op.getReg(), NewRC); } - } - return; - } - - // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); - if (NoRetAtomicOp != -1) { - if (!Node->hasAnyUseOfValue(0)) { - int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::cpol); - if (CPolIdx != -1) { - MachineOperand &CPol = MI.getOperand(CPolIdx); - CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC); + // Resolve the rest of AV operands to AGPRs. + if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { + if (Src2->isReg() && Src2->getReg().isVirtual()) { + auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); + if (TRI->isVectorSuperClass(RC)) { + auto *NewRC = TRI->getEquivalentAGPRClass(RC); + MRI.setRegClass(Src2->getReg(), NewRC); + if (Src2->isTied()) + MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); + } + } } - MI.RemoveOperand(0); - MI.setDesc(TII->get(NoRetAtomicOp)); - return; } - // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg - // instruction, because the return type of these instructions is a vec2 of - // the memory type, so it can be tied to the input operand. - // This means these instructions always have a use, so we need to add a - // special case to check if the atomic has only one extract_subreg use, - // which itself has no uses. - if ((Node->hasNUsesOfValue(1, 0) && - Node->use_begin()->isMachineOpcode() && - Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && - !Node->use_begin()->hasAnyUseOfValue(0))) { - Register Def = MI.getOperand(0).getReg(); - - // Change this into a noret atomic. - MI.setDesc(TII->get(NoRetAtomicOp)); - MI.RemoveOperand(0); - - // If we only remove the def operand from the atomic instruction, the - // extract_subreg will be left with a use of a vreg without a def. - // So we need to insert an implicit_def to avoid machine verifier - // errors. 
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Def); - } return; } - if (TII->isMIMG(MI) && !MI.mayStore()) - AddIMGInit(MI); + if (TII->isMIMG(MI)) { + if (!MI.mayStore()) + AddIMGInit(MI); + TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); + } } static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, @@ -12243,13 +12529,17 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { MachineBasicBlock *Exit = ML->getExitBlock(); if (Pre && Exit) { - BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(), - TII->get(AMDGPU::S_INST_PREFETCH)) - .addImm(1); // prefetch 2 lines behind PC + auto PreTerm = Pre->getFirstTerminator(); + if (PreTerm == Pre->begin() || + std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) + BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(1); // prefetch 2 lines behind PC - BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(), - TII->get(AMDGPU::S_INST_PREFETCH)) - .addImm(2); // prefetch 1 line behind PC + auto ExitHead = Exit->getFirstNonDebugInstr(); + if (ExitHead == Exit->end() || + ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) + BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(2); // prefetch 1 line behind PC } return CacheLineAlign; @@ -12390,6 +12680,9 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + unsigned AS = RMW->getPointerAddressSpace(); + if (AS == AMDGPUAS::PRIVATE_ADDRESS) + return AtomicExpansionKind::NotAtomic; auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) { OptimizationRemarkEmitter ORE(RMW->getFunction()); @@ -12421,10 +12714,11 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; - unsigned AS = RMW->getPointerAddressSpace(); - if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && - Subtarget->hasAtomicFaddInsts()) { + Subtarget->hasAtomicFaddNoRtnInsts()) { + if (Subtarget->hasGFX940Insts()) + return AtomicExpansionKind::None; + // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. May generate more efficient code, // but may not respect rounding and denormal modes, and may give incorrect @@ -12453,8 +12747,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { : AtomicExpansionKind::CmpXChg; } - // DS FP atomics do repect the denormal mode, but the rounding mode is fixed - // to round-to-nearest-even. + // DS FP atomics do respect the denormal mode, but the rounding mode is + // fixed to round-to-nearest-even. // The only exception is DS_ADD_F64 which never flushes regardless of mode. if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) { if (!Ty->isDoubleTy()) @@ -12479,6 +12773,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); } +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? 
AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { + return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + const TargetRegisterClass * SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); @@ -12500,7 +12815,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // always uniform. static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, unsigned WaveSize) { - // FIXME: We asssume we never cast the mask results of a control flow + // FIXME: We assume we never cast the mask results of a control flow // intrinsic. // Early exit if the type won't be consistent as a compile time hack. IntegerType *IT = dyn_cast<IntegerType>(V->getType()); @@ -12604,7 +12919,7 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const { if (!N0.hasOneUse()) return false; - // Take care of the oportunity to keep N0 uniform + // Take care of the opportunity to keep N0 uniform if (N0->isDivergent() || !N1->isDivergent()) return true; // Check if we have a good chance to form the memory access pattern with the @@ -12612,3 +12927,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, return (DAG.isBaseWithConstantOffset(N0) && hasMemSDNodeUser(*N0->use_begin())); } + +MachineMemOperand::Flags +SITargetLowering::getTargetMMOFlags(const Instruction &I) const { + // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
+ if (I.getMetadata("amdgpu.noclobber")) + return MONoClobber; + return MachineMemOperand::MONone; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index bf81e082b478..4fbccf0c5850 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -53,6 +53,9 @@ private: uint64_t Offset, Align Alignment, bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, const SDLoc &DL, + Align Alignment, + ImplicitParameter Param) const; SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, @@ -76,6 +79,9 @@ private: SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, + const ArgDescriptor &ArgDesc) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; @@ -145,6 +151,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -191,6 +198,7 @@ private: SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -227,7 +235,10 @@ public: /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (, var-idx) should be /// expanded into a set of cmp/select instructions. 
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, - bool IsDivergentIdx); + bool IsDivergentIdx, + const GCNSubtarget *Subtarget); + + bool shouldExpandVectorDynExt(SDNode *N) const; private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the @@ -310,6 +321,9 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; @@ -380,6 +394,7 @@ public: MachineBasicBlock *BB) const override; bool hasBitPreservingFPLogic(EVT VT) const override; + bool hasAtomicFaddRtnForTy(SDValue &Op) const; bool enableAggressiveFMAFusion(EVT VT) const override; bool enableAggressiveFMAFusion(LLT Ty) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -466,6 +481,10 @@ public: bool SNaN = false, unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + AtomicExpansionKind + shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override; @@ -505,6 +524,9 @@ public: std::pair getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const; + + MachineMemOperand::Flags + getTargetMMOFlags(const Instruction &I) const override; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index 125f006a1d1d..50f8ad4433c6 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -35,6 +35,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -42,11 +43,39 @@ using namespace llvm; namespace { +// A clause length of 64 instructions could be encoded in the s_clause +// instruction, but the hardware documentation (at least for GFX11) says that +// 63 is the maximum allowed. +constexpr unsigned MaxInstructionsInClause = 63; + enum HardClauseType { + // For GFX10: + // Texture, buffer, global or scratch memory instructions. HARDCLAUSE_VMEM, // Flat (not global or scratch) memory instructions. HARDCLAUSE_FLAT, + + // For GFX11: + + // Texture memory instructions. + HARDCLAUSE_MIMG_LOAD, + HARDCLAUSE_MIMG_STORE, + HARDCLAUSE_MIMG_ATOMIC, + HARDCLAUSE_MIMG_SAMPLE, + // Buffer, global or scratch memory instructions. + HARDCLAUSE_VMEM_LOAD, + HARDCLAUSE_VMEM_STORE, + HARDCLAUSE_VMEM_ATOMIC, + // Flat (not global or scratch) memory instructions. + HARDCLAUSE_FLAT_LOAD, + HARDCLAUSE_FLAT_STORE, + HARDCLAUSE_FLAT_ATOMIC, + // BVH instructions. + HARDCLAUSE_BVH, + + // Common: + // Instructions that access LDS. HARDCLAUSE_LDS, // Scalar memory instructions. @@ -78,19 +107,43 @@ public: } HardClauseType getHardClauseType(const MachineInstr &MI) { - - // On current architectures we only get a benefit from clausing loads. 
- if (MI.mayLoad()) { - if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { - if (ST->hasNSAClauseBug()) { + if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { + if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { + if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { + if (ST->hasNSAClauseBug()) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) + return HARDCLAUSE_ILLEGAL; + } + return HARDCLAUSE_VMEM; + } + if (SIInstrInfo::isFLAT(MI)) + return HARDCLAUSE_FLAT; + } else { + assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); + if (SIInstrInfo::isMIMG(MI)) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) - return HARDCLAUSE_ILLEGAL; + const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + if (BaseInfo->BVH) + return HARDCLAUSE_BVH; + if (BaseInfo->Sampler) + return HARDCLAUSE_MIMG_SAMPLE; + return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC + : HARDCLAUSE_MIMG_LOAD + : HARDCLAUSE_MIMG_STORE; + } + if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { + return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC + : HARDCLAUSE_VMEM_LOAD + : HARDCLAUSE_VMEM_STORE; + } + if (SIInstrInfo::isFLAT(MI)) { + return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC + : HARDCLAUSE_FLAT_LOAD + : HARDCLAUSE_FLAT_STORE; } - return HARDCLAUSE_VMEM; } - if (SIInstrInfo::isFLAT(MI)) - return HARDCLAUSE_FLAT; // TODO: LDS if (SIInstrInfo::isSMRD(MI)) return HARDCLAUSE_SMEM; @@ -129,7 +182,7 @@ public: bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { if (CI.First == CI.Last) return false; - assert(CI.Length <= 64 && "Hard clause is too long!"); + assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!"); auto &MBB = *CI.First->getParent(); auto ClauseMI = @@ -170,7 +223,7 @@ public: } } - if (CI.Length == 64 || + if (CI.Length == MaxInstructionsInClause || (CI.Length && Type != HARDCLAUSE_INTERNAL && Type != HARDCLAUSE_IGNORE && (Type != CI.Type || diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f8a10bc8ef6f..349bcbf82195 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Sequence.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" @@ -87,29 +88,29 @@ struct RegisterEncoding { }; enum WaitEventType { - VMEM_ACCESS, // vector-memory read & write - VMEM_READ_ACCESS, // vector-memory read - VMEM_WRITE_ACCESS,// vector-memory write - LDS_ACCESS, // lds read & write - GDS_ACCESS, // gds read & write - SQ_MESSAGE, // send message - SMEM_ACCESS, // scalar-memory read & write - EXP_GPR_LOCK, // export holding on its data src - GDS_GPR_LOCK, // GDS holding on its data and addr src - EXP_POS_ACCESS, // write to export position - EXP_PARAM_ACCESS, // write to export parameter - VMW_GPR_LOCK, // vector-memory write holding on its data src + VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_WRITE_ACCESS, // vector-memory write + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // 
send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + EXP_LDS_ACCESS, // read by ldsdir counting as export NUM_WAIT_EVENTS, }; static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), - (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | - (1 << SQ_MESSAGE), - (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | - (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), - (1 << VMEM_WRITE_ACCESS) -}; + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS)}; // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs @@ -119,10 +120,10 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets. + AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. - EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. + EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes. NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. 
}; @@ -355,6 +356,8 @@ private: DenseSet<MachineInstr *> TrackedWaitcntSet; DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; + DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; + MachineLoopInfo *MLI; MachinePostDominatorTree *PDT; struct BlockInfo { @@ -381,6 +384,9 @@ public: (void)ForceVMCounter; } + bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); + bool isPreheaderToFlush(MachineBasicBlock &MBB, + WaitcntBrackets &ScoreBrackets); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -389,6 +395,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); AU.addRequired<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -431,14 +438,23 @@ public: bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr); + MachineInstr *OldWaitcntInstr, + bool FlushVmCnt); + bool generateWaitcntBlockEnd(MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); + bool generateWaitcnt(AMDGPU::Waitcnt Wait, + MachineBasicBlock::instr_iterator It, + MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, const MachineInstr *MI); + AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It); }; } // end anonymous namespace @@ -496,6 +512,14 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, } } +// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written +// can be accessed. A load from LDS to VMEM does not need a wait. +static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && + (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) && + MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; +} + void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, @@ -588,6 +612,12 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), CurrScore); } + } else if (TII->isLDSDIR(Inst)) { + // LDSDIR instructions attach the score to the destination.
+ setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), + CurrScore); } else { if (TII->isEXP(Inst)) { // For export the destination registers are really temps that @@ -644,7 +674,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(RegNo, T, CurrScore); } } - if (TII->isDS(Inst) && Inst.mayStore()) { + if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) { setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); } } @@ -784,6 +814,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) @@ -796,53 +827,53 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() { return new SIInsertWaitcnts(); } -/// Combine consecutive waitcnt instructions that precede \p MI and follow +/// Combine consecutive waitcnt instructions that precede \p It and follow /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added /// by previous passes. Currently this pass conservatively assumes that these /// preexisting waitcnt are required for correctness. -bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, - MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, - const MachineInstr *MI) { +bool SIInsertWaitcnts::applyPreexistingWaitcnt( + WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) { bool Modified = false; MachineInstr *WaitcntInstr = nullptr; MachineInstr *WaitcntVsCntInstr = nullptr; - for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II); - &*II != MI; II = NextI, ++NextI) { - if (II->isMetaInstruction()) + + for (auto &II : + make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { + if (II.isMetaInstruction()) continue; - if (II->getOpcode() == AMDGPU::S_WAITCNT) { + if (II.getOpcode() == AMDGPU::S_WAITCNT) { // Conservatively update required wait if this waitcnt was added in an // earlier pass. In this case it will not exist in the tracked waitcnt // set. - if (!TrackedWaitcntSet.count(&*II)) { - unsigned IEnc = II->getOperand(0).getImm(); + if (!TrackedWaitcntSet.count(&II)) { + unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); Wait = Wait.combined(OldWait); } // Merge consecutive waitcnt of the same type by erasing multiples. 
if (!WaitcntInstr) { - WaitcntInstr = &*II; + WaitcntInstr = &II; } else { - II->eraseFromParent(); + II.eraseFromParent(); Modified = true; } } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - if (!TrackedWaitcntSet.count(&*II)) { + assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); + if (!TrackedWaitcntSet.count(&II)) { unsigned OldVSCnt = - TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); } if (!WaitcntVsCntInstr) { - WaitcntVsCntInstr = &*II; + WaitcntVsCntInstr = &II; } else { - II->eraseFromParent(); + II.eraseFromParent(); Modified = true; } } @@ -862,9 +893,14 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.LgkmCnt = ~0u; Wait.ExpCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr - << '\n'); + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitcntInstr + << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntInstr << '\n'); + } else { WaitcntInstr->eraseFromParent(); Modified = true; @@ -885,9 +921,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, ScoreBrackets.applyWaitcnt(Wait); Wait.VsCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << *MI - << "New Instr: " << *WaitcntVsCntInstr << '\n'); + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *WaitcntVsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntVsCntInstr << '\n'); } else { WaitcntVsCntInstr->eraseFromParent(); Modified = true; @@ -928,16 +968,18 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -bool SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr) { +/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to +/// flush the vmcnt counter here. +bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr, + bool FlushVmCnt) { setForceEmitWaitcnt(); if (MI.isMetaInstruction()) return false; AMDGPU::Waitcnt Wait; - bool Modified = false; // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to @@ -955,16 +997,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } // Resolve vm waits before gs-done. 
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && - ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == - AMDGPU::SendMsg::ID_GS_DONE)) { + ST->hasLegacyGeometry() && + ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == + AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. @@ -1040,7 +1083,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT - // load). We also need to check WAW depenancy with saved PC. + // load). We also need to check WAW dependency with saved PC. Wait = AMDGPU::Waitcnt(); int CallAddrOpIdx = @@ -1089,7 +1132,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( SLoadAddresses.erase(Ptr); } unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) + continue; + // No need to wait before load from VMEM to LDS. + if (mayWriteLDSThroughDMA(MI)) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. @@ -1123,7 +1169,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } - if (Op.isDef()) { + if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } @@ -1170,47 +1216,93 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (ForceEmitWaitcnt[VS_CNT]) Wait.VsCnt = 0; - if (OldWaitcntInstr) { + if (FlushVmCnt) { + unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); + unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); + if (UB - LB != 0) + Wait.VmCnt = 0; + } + + return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, + OldWaitcntInstr); +} + +// Add a waitcnt to flush the vmcnt counter at the end of the given block if +// needed. +bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { + AMDGPU::Waitcnt Wait; + + unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); + unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); + if (UB - LB == 0) + return false; + + Wait.VmCnt = 0; + + return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, + OldWaitcntInstr); +} + +bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, + MachineBasicBlock::instr_iterator It, + MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + if (OldWaitcntInstr) // Try to merge the required wait with preexisting waitcnt instructions. // Also erase redundant waitcnt. Modified = - applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI); - } else { - // Update waitcnt brackets after determining the required wait. + applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); + else ScoreBrackets.applyWaitcnt(Wait); + + // ExpCnt can be merged into VINTERP. 
+ if (Wait.ExpCnt != ~0u && It != Block.instr_end() && + SIInstrInfo::isVINTERP(*It)) { + MachineOperand *WaitExp = + TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); + if (Wait.ExpCnt < WaitExp->getImm()) { + WaitExp->setImm(Wait.ExpCnt); + Modified = true; + } + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Update Instr: " << *It); } // Build new waitcnt instructions unless no wait is needed or the old waitcnt // instruction was modified to handle the required wait. if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), - MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(Enc); + auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); TrackedWaitcntSet.insert(SWaitInst); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); - auto SWaitInst = - BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.VsCnt); + auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.VsCnt); TrackedWaitcntSet.insert(SWaitInst); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } - return Modified; } @@ -1338,6 +1430,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // May need to way wait for anything. ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); } + } else if (SIInstrInfo::isLDSDIR(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); + } else if (TII->isVINTERP(Inst)) { + int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); + ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); } else if (SIInstrInfo::isEXP(Inst)) { unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) @@ -1349,6 +1446,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSG_RTN_B32: + case AMDGPU::S_SENDMSG_RTN_B64: case AMDGPU::S_SENDMSGHALT: ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); break; @@ -1476,8 +1575,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, continue; } + bool FlushVmCnt = Block.getFirstTerminator() == Inst && + isPreheaderToFlush(Block, ScoreBrackets); + // Generate an s_waitcnt instruction to be placed before Inst, if needed. - Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, + FlushVmCnt); OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. 
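A quick gloss on the merge rule used by applyPreexistingWaitcnt() above: an s_waitcnt immediate of N means "stall until at most N events of that kind are still outstanding", so folding two required waits together takes the minimum of each counter, with ~0u standing for "no wait needed". A minimal sketch of that rule follows; WaitcntSketch and its members are illustrative stand-ins for AMDGPU::Waitcnt, not the exact LLVM type:

  #include <algorithm>

  struct WaitcntSketch {
    // ~0u encodes "no wait required" for a counter, as in the pass above.
    unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;

    // Stricter-of-the-two merge: a smaller bound is a stronger wait.
    WaitcntSketch combined(const WaitcntSketch &Other) const {
      return {std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
              std::min(LgkmCnt, Other.LgkmCnt), std::min(VsCnt, Other.VsCnt)};
    }
  };

The same reasoning is why the pass folds a preexisting s_waitcnt_vscnt with Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt) rather than replacing it outright.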
@@ -1562,9 +1665,101 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; } + if (Block.getFirstTerminator() == Block.end() && + isPreheaderToFlush(Block, ScoreBrackets)) + Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr); + return Modified; } +// Return true if the given machine basic block is a preheader of a loop in +// which we want to flush the vmcnt counter, and false otherwise. +bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, + WaitcntBrackets &ScoreBrackets) { + if (PreheadersToFlush.count(&MBB)) + return PreheadersToFlush[&MBB]; + + auto UpdateCache = [&](bool val) { + PreheadersToFlush[&MBB] = val; + return val; + }; + + MachineBasicBlock *Succ = MBB.getSingleSuccessor(); + if (!Succ) + return UpdateCache(false); + + MachineLoop *Loop = MLI->getLoopFor(Succ); + if (!Loop) + return UpdateCache(false); + + if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets)) + return UpdateCache(true); + + return UpdateCache(false); +} + +// Return true if it is better to flush the vmcnt counter in the preheader of +// the given loop. We currently decide to flush in two situations: +// 1. The loop contains vmem store(s), no vmem load and at least one use of a +// vgpr containing a value that is loaded outside of the loop. (Only on +// targets with no vscnt counter). +// 2. The loop contains vmem load(s), but the loaded values are not used in the +// loop, and at least one use of a vgpr containing a value that is loaded +// outside of the loop. +bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, + WaitcntBrackets &Brackets) { + bool HasVMemLoad = false; + bool HasVMemStore = false; + bool UsesVgprLoadedOutside = false; + DenseSet VgprUse; + DenseSet VgprDef; + + for (MachineBasicBlock *MBB : ML->blocks()) { + for (MachineInstr &MI : *MBB) { + if (SIInstrInfo::isVMEM(MI)) { + if (MI.mayLoad()) + HasVMemLoad = true; + if (MI.mayStore()) + HasVMemStore = true; + } + for (unsigned I = 0; I < MI.getNumOperands(); I++) { + MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) + continue; + RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I); + // Vgpr use + if (Op.isUse()) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + // If we find a register that is loaded inside the loop, 1. and 2. + // are invalidated and we can exit. + if (VgprDef.contains(RegNo)) + return false; + VgprUse.insert(RegNo); + // If at least one of Op's registers is in the score brackets, the + // value is likely loaded outside of the loop. + if (Brackets.getRegScore(RegNo, VM_CNT) > 0) { + UsesVgprLoadedOutside = true; + break; + } + } + } + // VMem load vgpr def + else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef()) + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + // If we find a register that is loaded inside the loop, 1. and 2. + // are invalidated and we can exit. 
+ if (VgprUse.contains(RegNo)) + return false; + VgprDef.insert(RegNo); + } + } + } + } + if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) + return true; + return HasVMemLoad && UsesVgprLoadedOutside; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1572,6 +1767,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo(); + MLI = &getAnalysis(); PDT = &getAnalysis(); ForceEmitZeroWaitcnts = ForceEmitZeroFlag; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index e39f52875f1f..b398e108bf62 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -48,6 +48,12 @@ class InstSI Inst; + int Size = 12; +} + def CPolBit { int GLC = 0; int SLC = 1; @@ -284,7 +303,7 @@ class VINTRPe op> : Enc32 { let Inst{31-26} = 0x32; // encoding } -class MIMGe : Enc64 { +class MIMGe_gfxpre11 : Enc64 { bits<10> vdata; bits<4> dmask; bits<1> unorm; @@ -309,7 +328,7 @@ class MIMGe : Enc64 { let Inst{63} = d16; } -class MIMGe_gfx6789 op> : MIMGe { +class MIMGe_gfx6789 op> : MIMGe_gfxpre11 { bits<8> vaddr; bits<1> da; @@ -321,7 +340,7 @@ class MIMGe_gfx6789 op> : MIMGe { let Inst{39-32} = vaddr; } -class MIMGe_gfx90a op> : MIMGe { +class MIMGe_gfx90a op> : MIMGe_gfxpre11 { bits<8> vaddr; bits<1> da; @@ -333,7 +352,7 @@ class MIMGe_gfx90a op> : MIMGe { let Inst{39-32} = vaddr; } -class MIMGe_gfx10 op> : MIMGe { +class MIMGe_gfx10 op> : MIMGe_gfxpre11 { bits<8> vaddr0; bits<3> dim; bits<2> nsa; @@ -349,12 +368,46 @@ class MIMGe_gfx10 op> : MIMGe { let Inst{62} = a16; } +class MIMGe_gfx11 op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<5> cpol; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<7> srsrc; + bits<7> ssamp; + bit d16; + bits<1> a16; + bits<8> vaddr0; + bits<3> dim; + bits<1> nsa; + + let Inst{0} = nsa; + let Inst{4-2} = dim; + let Inst{7} = unorm; + let Inst{11-8} = dmask; + let Inst{12} = cpol{CPolBit.SLC}; + let Inst{13} = cpol{CPolBit.DLC}; + let Inst{14} = cpol{CPolBit.GLC}; + let Inst{15} = r128; + let Inst{16} = a16; + let Inst{17} = d16; + let Inst{25-18} = op; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr0; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{53} = tfe; + let Inst{54} = lwe; + let Inst{62-58} = ssamp{6-2}; +} + class EXPe : Enc64 { bits<4> en; bits<6> tgt; - bits<1> compr; bits<1> done; - bits<1> vm; bits<8> src0; bits<8> src1; bits<8> src2; @@ -362,9 +415,7 @@ class EXPe : Enc64 { let Inst{3-0} = en; let Inst{9-4} = tgt; - let Inst{10} = compr; let Inst{11} = done; - let Inst{12} = vm; let Inst{31-26} = 0x3e; let Inst{39-32} = src0; let Inst{47-40} = src1; @@ -372,6 +423,22 @@ class EXPe : Enc64 { let Inst{63-56} = src3; } +// Pre-GFX11 encoding has compr and vm bits. +class EXPe_ComprVM : EXPe { + bits<1> compr; + bits<1> vm; + + let Inst{10} = compr; + let Inst{12} = vm; +} + +// GFX11+ encoding has row bit. 
+class EXPe_Row : EXPe { + bits<1> row; + + let Inst{13} = row; +} + let Uses = [EXEC] in { class VINTRPCommon pattern> : diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0a2f9381e71f..814a7c446889 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -16,12 +16,12 @@ #include "AMDGPUInstrInfo.h" #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -130,9 +130,31 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } -static bool readsExecAsData(const MachineInstr &MI) { - if (MI.isCompare()) - return true; +// Returns true if the scalar result of a VALU instruction depends on exec. +static bool resultDependsOnExec(const MachineInstr &MI) { + // Ignore comparisons which are only used masked with exec. + // This allows some hoisting/sinking of VALU comparisons. + if (MI.isCompare()) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + Register DstReg = MI.getOperand(0).getReg(); + if (!DstReg.isVirtual()) + return true; + for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { + switch (Use.getOpcode()) { + case AMDGPU::S_AND_SAVEEXEC_B32: + case AMDGPU::S_AND_SAVEEXEC_B64: + break; + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + if (!Use.readsRegister(AMDGPU::EXEC)) + return true; + break; + default: + return true; + } + } + return false; + } switch (MI.getOpcode()) { default: @@ -147,7 +169,7 @@ static bool readsExecAsData(const MachineInstr &MI) { bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { // Any implicit use of exec by VALU is not a real register read. return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && - isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent()); + isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, @@ -181,7 +203,7 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (Offset0Idx == -1 || Offset1Idx == -1) return false; - // XXX - be careful of datalesss loads + // XXX - be careful of dataless loads // getNamedOperandIdx returns the index for MachineInstrs. Since they // include the output in the operand list, but SDNodes don't, we need to // subtract the index by one. 
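One remark on the resultDependsOnExec() change in the hunks above: a VALU compare writes one result bit per lane, and lanes disabled by exec carry stale bits, so the result normally does depend on exec. The exception the new code recognizes is a compare whose only consumers re-apply the exec mask (the s_and_saveexec idiom) before the bits can be observed. A stripped-down sketch of that user scan, assuming the usual MachineRegisterInfo API; onlyUsedUnderExecMask is a made-up name, and the opcode set is abbreviated (the real function also accepts S_AND_B32/B64 uses that read exec):

  #include "llvm/CodeGen/MachineRegisterInfo.h"

  // Sketch: true if every non-debug user of Reg masks it with exec first,
  // so stale bits from inactive lanes can never be observed.
  static bool onlyUsedUnderExecMask(const llvm::MachineRegisterInfo &MRI,
                                    llvm::Register Reg) {
    for (const llvm::MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) {
      unsigned Opc = Use.getOpcode();
      // AMDGPU::S_AND_SAVEEXEC_B32/B64 come from the generated opcode enum.
      if (Opc != llvm::AMDGPU::S_AND_SAVEEXEC_B32 &&
          Opc != llvm::AMDGPU::S_AND_SAVEEXEC_B64)
        return false; // any other user may read the unmasked lane bits
    }
    return true;
  }

Compares that pass such a check can be hoisted or sunk across exec changes, which is the point of relaxing isIgnorableUse() here.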
@@ -362,6 +384,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); if (DataOpIdx == -1) DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + if (DataOpIdx == -1) // LDS DMA + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -410,6 +434,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); if (DataOpIdx == -1) DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + if (DataOpIdx == -1) // LDS DMA + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -464,7 +490,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1, return false; } - // In order to avoid regester pressure, on an average, the number of DWORDS + // In order to avoid register pressure, on an average, the number of DWORDS // loaded together by all clustered mem ops should not exceed 8. This is an // empirical value based on certain observations and performance related // experiments. @@ -517,8 +543,9 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible -/// to directly copy, so an intermediate VGPR needs to be used. +/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not +/// possible to have a direct copy in these cases on GFX908, so an intermediate +/// VGPR copy is required. static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -527,10 +554,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, RegScavenger &RS, Register ImpDefSuperReg = Register(), Register ImpUseSuperReg = Register()) { - const SIRegisterInfo &RI = TII.getRegisterInfo(); + assert((TII.getSubtarget().hasMAIInsts() && + !TII.getSubtarget().hasGFX90AInsts()) && + "Expected GFX908 subtarget."); - assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || - AMDGPU::AGPR_32RegClass.contains(SrcReg)); + assert((AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)) && + "Source register of the copy should be either an SGPR or an AGPR."); + + assert(AMDGPU::AGPR_32RegClass.contains(DestReg) && + "Destination register of the copy should be an AGPR."); + + const SIRegisterInfo &RI = TII.getRegisterInfo(); // First try to find defining accvgpr_write to avoid temporary registers. for (auto Def = MI, E = MBB.begin(); Def != E; ) { @@ -581,23 +616,21 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, // Registers in the sequence are allocated contiguously so we can just // use register number to pick one of three round-robin temps. - unsigned RegNo = DestReg % 3; - Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp) - report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); - RS.setRegUsed(Tmp); - - if (!TII.getSubtarget().hasGFX90AInsts()) { - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. 
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) - break; - Tmp = Tmp2; - RS.setRegUsed(Tmp); - } + unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3; + Register Tmp = + MBB.getParent()->getInfo()->getVGPRForAGPRCopy(); + assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && + "VGPR used for an intermediate copy should have been reserved."); + + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); } // Insert copy to temporary VGPR. @@ -796,7 +829,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RC == &AMDGPU::AGPR_32RegClass) { - if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { + if (AMDGPU::VGPR_32RegClass.contains(SrcReg) || + (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) { BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -884,6 +918,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { + if (ST.hasMovB64()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } if (ST.hasPackedFP32Ops()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) @@ -906,7 +945,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } - expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); + const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); + expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, + Forward); return; } @@ -915,7 +956,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (RI.isAGPRClass(RC)) { if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) Opcode = AMDGPU::V_ACCVGPR_MOV_B32; - else if (RI.hasVGPRs(SrcRC)) + else if (RI.hasVGPRs(SrcRC) || + (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC))) Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; else Opcode = AMDGPU::INSTRUCTION_LIST_END; @@ -925,7 +967,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, (RI.isProperlyAlignedRC(*RC) && (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. - if (ST.hasPackedFP32Ops()) { + if (ST.hasMovB64()) { + Opcode = AMDGPU::V_MOV_B64_e32; + EltSize = 8; + } else if (ST.hasPackedFP32Ops()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; } @@ -1725,13 +1770,8 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { case AMDGPU::S_NOP: return MI.getOperand(0).getImm() + 1; - - // FIXME: Any other pseudo instruction? // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The // hazard, even if one exist, won't really be visible. Should we handle it? 
- case AMDGPU::SI_MASKED_UNREACHABLE: - case AMDGPU::WAVE_BARRIER: - return 0; } } @@ -1807,6 +1847,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? assert(!SrcOp.isFPImm()); + if (ST.hasMovB64()) { + MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); + if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm())) + break; + } if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); @@ -1887,6 +1932,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + // FIXME: We may possibly optimize the COPY once we find ways to make LLVM + // optimizations (mainly Register Coalescer) aware of WWM register liveness. + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) @@ -1899,11 +1948,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); + .add(MI.getOperand(1)); + expandPostRAPseudo(*Copy); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten + Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); BuildMI(MBB, MI, DL, get(NotOpc), Exec) .addReg(Exec); @@ -2085,6 +2138,23 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } + case AMDGPU::SI_RETURN: { + const MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // Hiding the return address use with SI_RETURN may lead to extra kills in + // the function and missing live-ins. We are fine in practice because callee + // saved register handling ensures the register value is restored before + // RET, but we need the undef flag here to appease the MachineVerifier + // liveness checks. 
+ MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) + .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); + + MIB.copyImplicitOps(MI); + MI.eraseFromParent(); + break; + } } return true; } @@ -2093,6 +2163,13 @@ std::pair SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + if (ST.hasMovB64() && + AMDGPU::isLegal64BitDPPControl( + getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { + MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); + return std::make_pair(&MI, nullptr); + } + MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction *MF = MBB.getParent(); @@ -2789,6 +2866,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_MOV_B64_e64: case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: @@ -2801,35 +2880,15 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { } } -unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( - unsigned Kind) const { - switch(Kind) { - case PseudoSourceValue::Stack: - case PseudoSourceValue::FixedStack: - return AMDGPUAS::PRIVATE_ADDRESS; - case PseudoSourceValue::ConstantPool: - case PseudoSourceValue::GOT: - case PseudoSourceValue::JumpTable: - case PseudoSourceValue::GlobalValueCallEntry: - case PseudoSourceValue::ExternalSymbolCallEntry: - case PseudoSourceValue::TargetCustom: - return AMDGPUAS::CONSTANT_ADDRESS; - } - return AMDGPUAS::FLAT_ADDRESS; -} +static constexpr unsigned ModifierOpNames[] = { + AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, + AMDGPU::OpName::omod}; -static void removeModOperands(MachineInstr &MI) { +void SIInstrInfo::removeModOperands(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); - int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src0_modifiers); - int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src1_modifiers); - int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2_modifiers); - - MI.RemoveOperand(Src2ModIdx); - MI.RemoveOperand(Src1ModIdx); - MI.RemoveOperand(Src0ModIdx); + for (unsigned Name : reverse(ModifierOpNames)) + MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, Name)); } bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, @@ -2841,7 +2900,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, default: return false; case AMDGPU::S_MOV_B64: - // TODO: We could fold 64-bit immediates, but this get compilicated + // TODO: We could fold 64-bit immediates, but this get complicated // when there are sub-registers. return false; @@ -2921,7 +2980,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_{f16, f32}. - // We should only expect these to be on src0 due to canonicalizations. + // We should only expect these to be on src0 due to canonicalization. if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; @@ -2942,12 +3001,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. 
- // Remove these first since they are at the end. - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - Register Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); Src0->setReg(Src1Reg); @@ -2966,7 +3019,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3025,12 +3078,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. - // Remove these first since they are at the end. - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || @@ -3049,7 +3096,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // constant and SGPR are illegal. legalizeOperands(UseMI); - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3192,34 +3239,68 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { + MachineBasicBlock &MBB = *MI.getParent(); unsigned Opc = MI.getOpcode(); - bool IsF16 = false; + + // Handle MFMA. + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + return MIB; + } + + if (SIInstrInfo::isWMMA(MI)) { + unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .setMIFlags(MI.getFlags()); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB->addOperand(MI.getOperand(I)); + + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + + return MIB; + } + + // Handle MAC/FMAC. 
+ bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; - int NewMFMAOpc = -1; + bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; + bool Src0Literal = false; switch (Opc) { default: - NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc == -1) - return nullptr; - break; + return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: - IsF16 = true; - LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F64_e64: break; case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAC_F16_e32: - IsF16 = true; - LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_LEGACY_F32_e32: case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_LEGACY_F32_e32: case AMDGPU::V_FMAC_F64_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -3228,25 +3309,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) - return nullptr; + Src0Literal = true; break; } } MachineInstrBuilder MIB; - MachineBasicBlock &MBB = *MI.getParent(); - - if (NewMFMAOpc != -1) { - MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; - } - const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src0Mods = @@ -3255,10 +3324,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Src1Mods = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Src2Mods = + getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && + if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && + !IsLegacy && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { @@ -3271,11 +3343,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // We cannot just remove the DefMI here, calling pass will crash. DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->RemoveOperand(I); + DefMI->removeOperand(I); }; int64_t Imm; - if (getFoldableImm(Src2, Imm, &DefMI)) { + if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? 
AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3295,7 +3367,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm, &DefMI)) { + if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3309,7 +3381,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } } - if (getFoldableImm(Src0, Imm, &DefMI)) { + if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { + if (Src0Literal) { + Imm = Src0->getImm(); + DefMI = nullptr; + } if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3322,16 +3398,27 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + if (DefMI) + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 - : IsF64 ? AMDGPU::V_FMA_F64_e64 - : AMDGPU::V_FMA_F32_e64) - : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); + // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma + // because VOP3 does not allow a literal operand. + // TODO: Remove this restriction for GFX10. + if (Src0Literal) + return nullptr; + + unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 + : IsF64 ? AMDGPU::V_FMA_F64_e64 + : IsLegacy + ? AMDGPU::V_FMA_LEGACY_F32_e64 + : AMDGPU::V_FMA_F32_e64 + : IsF16 ? AMDGPU::V_MAD_F16_e64 + : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 + : AMDGPU::V_MAD_F32_e64; if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; @@ -3341,7 +3428,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src0) .addImm(Src1Mods ? Src1Mods->getImm() : 0) .add(*Src1) - .addImm(0) // Src mods + .addImm(Src2Mods ? Src2Mods->getImm() : 0) .add(*Src2) .addImm(Clamp ? Clamp->getImm() : 0) .addImm(Omod ? Omod->getImm() : 0); @@ -3383,6 +3470,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) return true; + if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) + return true; + // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. 
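To make the literal folding above concrete: the VOP2 forms it targets each carry exactly one 32-bit literal K at a fixed position in d = a * b + c, so at most one shape can apply. v_madak_f32 d, a, b, K computes d = a * b + K, while v_madmk_f32 d, a, K, c computes d = a * K + c (v_fmaak/v_fmamk are the FMA counterparts). A compact, hypothetical restatement of the selection order used by convertToThreeAddress; pickMadFold is illustrative only, and the real code additionally checks pseudoToMCOpcode and operand legality before committing:

  // Which single-literal VOP2 form fits d = src0 * src1 + src2?
  enum class MadFold { AK, MK, None };

  static MadFold pickMadFold(bool Src0IsImm, bool Src1IsImm, bool Src2IsImm) {
    if (Src2IsImm)
      return MadFold::AK;  // literal addend -> v_madak / v_fmaak
    if (Src1IsImm || Src0IsImm)
      return MadFold::MK;  // literal multiplicand -> v_madmk / v_fmamk
    return MadFold::None;  // fall back to three-address v_mad / v_fma
  }

This also explains the new Src0Literal bail-outs above: once src0 already holds a literal, only the MK form (with src0 taking the literal slot) remains legal, because VOP3 mad/fma cannot encode a literal at all on these targets.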
@@ -3676,11 +3766,8 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, } bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { - return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::clamp) || - hasModifiersSet(MI, AMDGPU::OpName::omod); + return any_of(ModifierOpNames, + [&](unsigned Name) { return hasModifiersSet(MI, Name); }); } bool SIInstrInfo::canShrink(const MachineInstr &MI, @@ -3754,18 +3841,19 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, unsigned Op32) const { - MachineBasicBlock *MBB = MI.getParent();; + MachineBasicBlock *MBB = MI.getParent(); MachineInstrBuilder Inst32 = BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) .setMIFlags(MI.getFlags()); // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. - int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); - if (Op32DstIdx != -1) { + if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) { // dst Inst32.add(MI.getOperand(0)); - } else { + } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) { + // VOPCX instructions won't be writing to an explicit dst, so this should + // not fail for these instructions. assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case"); @@ -3816,7 +3904,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); // Null is free - if (MO.getReg() == AMDGPU::SGPR_NULL) + if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) return false; // SGPRs use the constant bus @@ -3951,6 +4039,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: + case AMDGPU::OPERAND_REG_IMM_V2FP32: break; case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: @@ -4031,9 +4120,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; - - for (int OpIdx: OpIndicies) { + for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { if (OpIdx == -1) continue; const MachineOperand &MO = MI.getOperand(OpIdx); @@ -4150,24 +4237,25 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } // Verify VOP*. Ignore multiple sgpr operands on writelane. - if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 - && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { - // Only look at the true operands. Only a real operand can use the constant - // bus, and we don't want to check pseudo-operands like the source modifier - // flags. 
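The any_of rewrite of hasAnyModifiersSet above replaces five explicit checks with a loop over a name table. A self-contained sketch of the same pattern follows; ModifierOpNames is a stand-in for the real table in SIInstrInfo.cpp, and a plain bitmask stands in for the MachineInstr query.

    #include <algorithm>
    #include <array>
    #include <cstdio>

    enum OpName { src0_modifiers, src1_modifiers, src2_modifiers, clamp, omod };

    constexpr std::array<OpName, 5> ModifierOpNames = {
        src0_modifiers, src1_modifiers, src2_modifiers, clamp, omod};

    // Stand-in for hasModifiersSet(MI, Name): bit N set => modifier N present.
    bool hasModifiersSet(unsigned SetMask, OpName Name) {
      return SetMask & (1u << Name);
    }

    bool hasAnyModifiersSet(unsigned SetMask) {
      return std::any_of(ModifierOpNames.begin(), ModifierOpNames.end(),
                         [&](OpName Name) { return hasModifiersSet(SetMask, Name); });
    }

    int main() {
      printf("%d %d\n", hasAnyModifiersSet(0), hasAnyModifiersSet(1u << clamp)); // 0 1
    }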
- const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; - + if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { unsigned ConstantBusCount = 0; bool UsesLiteral = false; const MachineOperand *LiteralVal = nullptr; - if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) + int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm); + if (ImmIdx != -1) { ++ConstantBusCount; + UsesLiteral = true; + LiteralVal = &MI.getOperand(ImmIdx); + } SmallVector<Register, 2> SGPRsUsed; Register SGPRUsed; - for (int OpIdx : OpIndices) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); @@ -4186,8 +4274,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, UsesLiteral = true; LiteralVal = &MO; } else if (!MO.isIdenticalTo(*LiteralVal)) { - assert(isVOP3(MI)); - ErrInfo = "VOP3 instruction uses more than one literal"; + assert(isVOP2(MI) || isVOP3(MI)); + ErrInfo = "VOP2/VOP3 instruction uses more than one literal"; return false; } } @@ -4196,7 +4284,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) { - // Implicit uses may safely overlap true overands + // Implicit uses may safely overlap true operands if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { return !RI.regsOverlap(SGPRUsed, SGPR); })) { @@ -4225,7 +4313,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, unsigned SGPRCount = 0; Register SGPRUsed = AMDGPU::NoRegister; - for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx}) { if (OpIdx == -1) break; @@ -4272,16 +4360,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (isSOP2(MI) || isSOPC(MI)) { const MachineOperand &Src0 = MI.getOperand(Src0Idx); const MachineOperand &Src1 = MI.getOperand(Src1Idx); - unsigned Immediates = 0; - if (!Src0.isReg() && - !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) - Immediates++; - if (!Src1.isReg() && - !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) - Immediates++; - - if (Immediates > 1) { + if (!Src0.isReg() && !Src1.isReg() && + !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType) && + !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType) && + !Src0.isIdenticalTo(Src1)) { ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; return false; } @@ -4364,10 +4447,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (isSMRD(MI)) { - if (MI.mayStore()) { + if (MI.mayStore() && + ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { // The register offset form of scalar stores may only use m0 as the // soffset register.
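The verifier hunks above enforce two budgets at once: every distinct SGPR or literal consumes the constant bus, and VOP2/VOP3 may encode at most one distinct literal value. A standalone model of that counting, under the simplifying assumption that each source is just an optional SGPR plus an optional literal:

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <set>
    #include <vector>

    struct Src { std::optional<unsigned> SGPR; std::optional<int64_t> Literal; };

    bool verify(const std::vector<Src> &Srcs, int BusLimit) {
      std::set<unsigned> SGPRsUsed;
      std::optional<int64_t> LiteralVal;
      int ConstantBusCount = 0;
      for (const Src &S : Srcs) {
        if (S.SGPR && SGPRsUsed.insert(*S.SGPR).second)
          ++ConstantBusCount;          // each distinct SGPR counts once
        if (S.Literal) {
          if (!LiteralVal) {
            LiteralVal = *S.Literal;   // the first literal also uses the bus
            ++ConstantBusCount;
          } else if (*LiteralVal != *S.Literal) {
            return false;              // more than one distinct literal
          }
        }
      }
      return ConstantBusCount <= BusLimit;
    }

    int main() {
      // One SGPR plus one (repeated) literal: legal with a bus limit of 2,
      // illegal with the pre-GFX10 limit of 1.
      std::vector<Src> Srcs = {{5u, {}}, {{}, 42}, {{}, 42}};
      printf("%d %d\n", verify(Srcs, 2), verify(Srcs, 1)); // 1 0
    }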
- const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); + const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); if (Soff && Soff->getReg() != AMDGPU::M0) { ErrInfo = "scalar stores must use m0 as offset register"; return false; @@ -4477,7 +4561,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && ((DstIdx >= 0 && @@ -4527,24 +4610,45 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (ST.needsAlignedVGPRs() && - (MI.getOpcode() == AMDGPU::DS_GWS_INIT || - MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || - MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { - const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); - Register Reg = Op->getReg(); - bool Aligned = true; - if (Reg.isPhysical()) { - Aligned = !(RI.getHWRegIndex(Reg) & 1); - } else { + if (ST.needsAlignedVGPRs()) { + const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { + const MachineOperand *Op = getNamedOperand(MI, OpName); + if (!Op) + return true; + Register Reg = Op->getReg(); + if (Reg.isPhysical()) + return !(RI.getHWRegIndex(Reg) & 1); const TargetRegisterClass &RC = *MRI.getRegClass(Reg); - Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && - !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && + !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + }; + + if (MI.getOpcode() == AMDGPU::DS_GWS_INIT || + MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) { + + if (!isAlignedReg(AMDGPU::OpName::data0)) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for DS_GWS instructions"; + return false; + } + } + + if (isMIMG(MI)) { + if (!isAlignedReg(AMDGPU::OpName::vaddr)) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for vaddr operand of image instructions"; + return false; + } } + } - if (!Aligned) { - ErrInfo = "Subtarget requires even aligned vector registers " - "for DS_GWS instructions"; + if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + !ST.hasGFX90AInsts()) { + const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { + ErrInfo = "Invalid register class: " + "v_accvgpr_write with an SGPR is not supported on this GPU"; return false; } } @@ -4641,26 +4745,40 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { "Unexpected scalar opcode without corresponding vector one!"); } -static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, - const MachineRegisterInfo &MRI, - const MCInstrDesc &TID, - unsigned RCID, - bool IsAllocatable) { +static const TargetRegisterClass * +adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, + const MachineRegisterInfo &MRI, + const MCInstrDesc &TID, unsigned RCID, + bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && (((TID.mayLoad() || TID.mayStore()) && !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { - case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; - case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; - case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; - case 
AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; - case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; + case AMDGPU::AV_32RegClassID: + RCID = AMDGPU::VGPR_32RegClassID; + break; + case AMDGPU::AV_64RegClassID: + RCID = AMDGPU::VReg_64RegClassID; + break; + case AMDGPU::AV_96RegClassID: + RCID = AMDGPU::VReg_96RegClassID; + break; + case AMDGPU::AV_128RegClassID: + RCID = AMDGPU::VReg_128RegClassID; + break; + case AMDGPU::AV_160RegClassID: + RCID = AMDGPU::VReg_160RegClassID; + break; + case AMDGPU::AV_512RegClassID: + RCID = AMDGPU::VReg_512RegClassID; + break; default: break; } } - return RCID; + + return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); } const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, @@ -4673,7 +4791,7 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, bool IsAllocatable = false; if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { // vdst and vdata should be both VGPR or AGPR, same for the DS instructions - // with two data operands. Request register class constainted to VGPR only + // with two data operands. Request register class constrained to VGPR only // of both operands present as Machine Copy Propagation can not check this // constraint and possibly other passes too. // @@ -4690,9 +4808,8 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, AMDGPU::OpName::data1) != -1; } } - RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, - IsAllocatable); - return RI.getRegClass(RegClass); + return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, + IsAllocatable); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -4709,8 +4826,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, } unsigned RCID = Desc.OpInfo[OpNo].RegClass; - RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); - return RI.getRegClass(RCID); + return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -4797,7 +4913,7 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( void SIInstrInfo::swapOperands(MachineInstr &Inst) const { assert(Inst.getNumExplicitOperands() == 3); MachineOperand Op1 = Inst.getOperand(1); - Inst.RemoveOperand(1); + Inst.removeOperand(1); Inst.addOperand(Op1); } @@ -4851,9 +4967,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, MO = &MI.getOperand(OpIdx); int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); - int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 
1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { - if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) + if (isLiteralConstantLike(*MO, OpInfo) && !LiteralLimit--) return false; SmallDenseSet<unsigned> SGPRsUsed; @@ -4872,12 +4988,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; SGPRsUsed.insert(SGPR); } - } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { - if (--ConstantBusLimit <= 0) - return false; - } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && - isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { - if (!VOP3LiteralLimit--) + } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32 || + (AMDGPU::isSISrcOperand(InstDesc, i) && + isLiteralConstantLike(Op, InstDesc.OpInfo[i]))) { + if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) return false; @@ -4886,7 +5000,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } if (MO->isReg()) { - assert(DefinedRC); + if (!DefinedRC) { + // This operand allows any register. + return true; + } if (!isLegalRegOperand(MRI, OpInfo, *MO)) return false; bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); @@ -4916,7 +5033,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) return false; } - if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && RI.isSGPRReg(MRI, MO->getReg())) return false; @@ -5186,7 +5303,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); SBase->setReg(SGPR); } - MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); + MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); SOff->setReg(SGPR); @@ -5232,16 +5349,16 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { const MCInstrDesc &NewDesc = get(NewOpc); Inst.setDesc(NewDesc); - // Callers expect interator to be valid after this call, so modify the + // Callers expect iterator to be valid after this call, so modify the // instruction in place. if (OldVAddrIdx == NewVAddrIdx) { MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); // Clear use list from the old vaddr holding a zero register. MRI.removeRegOperandFromUseList(&NewVAddr); MRI.moveOperands(&NewVAddr, &SAddr, 1); - Inst.RemoveOperand(OldSAddrIdx); + Inst.removeOperand(OldSAddrIdx); // Update the use list with the pointer we have just moved from vaddr to - // saddr poisition. Otherwise new vaddr will be missing from the use list. + // saddr position. Otherwise new vaddr will be missing from the use list. MRI.removeRegOperandFromUseList(&NewVAddr); MRI.addRegOperandToUseList(&NewVAddr); } else { @@ -5251,14 +5368,14 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); - // RemoveOperand doesn't try to fixup tied operand indexes at it goes, so + // removeOperand doesn't try to fixup tied operand indexes as it goes, so // it asserts. Untie the operands for now and retie them afterwards.
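The LiteralLimit setup above collapses the old VOP3-only budget into a single value: non-VOP3 encodings always get one literal slot, while VOP3 gets one only on subtargets with the VOP3-literal feature. A tiny standalone sketch of the assumed semantics:

    #include <cstdio>

    int literalLimit(bool IsVOP3, bool HasVOP3Literal) {
      return !IsVOP3 || HasVOP3Literal ? 1 : 0;
    }

    int main() {
      printf("%d %d %d\n",
             literalLimit(false, false), // VOP1/VOP2: 1
             literalLimit(true, false),  // VOP3 without the feature: 0
             literalLimit(true, true));  // VOP3 with the feature: 1
    }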
if (NewVDstIn != -1) { int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); Inst.untieRegOperand(OldVDstIn); } - Inst.RemoveOperand(OldVAddrIdx); + Inst.removeOperand(OldVAddrIdx); if (NewVDstIn != -1) { int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); @@ -5340,7 +5457,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, static void emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, - const DebugLoc &DL, MachineOperand &Rsrc) { + MachineBasicBlock &BodyBB, const DebugLoc &DL, + MachineOperand &Rsrc) { MachineFunction &MF = *OrigBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -5398,7 +5516,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, else Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); - // Combine the comparision results with AND. + // Combine the comparison results with AND. if (CondReg == AMDGPU::NoRegister) // First. CondReg = NewCondReg; else { // If not the first, we create an AND. @@ -5433,14 +5551,14 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, .addReg(CondReg, RegState::Kill); // The original instruction is here; we insert the terminators after it. - I = LoopBB.end(); + I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) .addReg(Exec) .addReg(SaveExec); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register @@ -5487,31 +5605,35 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, // To insert the loop we need to split the block. Move everything after this // point to a new block, and insert a new empty block between the two. MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); MF.insert(MBBI, RemainderBB); - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(LoopBB); + BodyBB->addSuccessor(RemainderBB); - // Move Begin to MI to the LoopBB, and the remainder of the block to + // Move Begin to MI to the BodyBB, and the remainder of the block to // RemainderBB. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); - LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); + BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); MBB.addSuccessor(LoopBB); // Update dominators. We know that MBB immediately dominates LoopBB, that - // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately - // dominates all of the successors transferred to it from MBB that MBB used - // to properly dominate. + // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates + // RemainderBB. RemainderBB immediately dominates all of the successors + // transferred to it from MBB that MBB used to properly dominate.
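The control flow built above can be pictured with a toy model (plain structs, not MachineBasicBlock; illustrative only): the loop header re-reads and compares the resource lanes, the body holds the spliced original instruction plus the exec update, and SI_WATERFALL_LOOP branches from the body back to the header until every lane has been serviced.

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Block { std::string Name; std::vector<Block *> Succs; };

    int main() {
      Block MBB{"entry"}, Loop{"loop"}, Body{"body"}, Rem{"remainder"};
      MBB.Succs = {&Loop};        // fall through into the header
      Loop.Succs = {&Body};       // read/compare lanes, then run the body
      Body.Succs = {&Loop, &Rem}; // SI_WATERFALL_LOOP back edge, or done
      for (Block *B : {&MBB, &Loop, &Body, &Rem}) {
        printf("%s ->", B->Name.c_str());
        for (Block *S : B->Succs)
          printf(" %s", S->Name.c_str());
        printf("\n");
      }
    }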
if (MDT) { MDT->addNewBlock(LoopBB, &MBB); - MDT->addNewBlock(RemainderBB, LoopBB); + MDT->addNewBlock(BodyBB, LoopBB); + MDT->addNewBlock(RemainderBB, BodyBB); for (auto &Succ : RemainderBB->successors()) { if (MDT->properlyDominates(&MBB, Succ)) { MDT->changeImmediateDominator(Succ, RemainderBB); @@ -5519,12 +5641,12 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); + emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); - return LoopBB; + return BodyBB; } // Extract pointer from Rsrc and return a zero-value Rsrc replacement. @@ -5762,7 +5884,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), RI.getRegClass(RsrcRC))) { // The operands are legal. - // FIXME: We may need to legalize operands besided srsrc. + // FIXME: We may need to legalize operands besides srsrc. return CreatedBB; } @@ -5836,7 +5958,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); - // Atomics rith return have have an additional tied operand and are + // Atomics with return have an additional tied operand and are // missing some of the special bits. MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; @@ -6050,7 +6172,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) .addReg(EXEC) .addReg(IsSCC ? VCC : CondReg); - Inst.RemoveOperand(1); + Inst.removeOperand(1); } break; @@ -6060,6 +6182,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, case AMDGPU::S_PACK_LL_B32_B16: case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HL_B32_B16: case AMDGPU::S_PACK_HH_B32_B16: movePackToVALU(Worklist, MRI, Inst); Inst.eraseFromParent(); @@ -6217,7 +6340,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); if (Op.isUse()) addSCCDefsToVALUWorklist(Op, Worklist); - Inst.RemoveOperand(i); + Inst.removeOperand(i); } } @@ -6247,7 +6370,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst.RemoveOperand(2); // Remove old immediate. + Inst.removeOperand(2); // Remove old immediate. Inst.addOperand(MachineOperand::CreateImm(Offset)); Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } @@ -6281,7 +6404,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, // these are deleted later, but at -O0 it would leave a suspicious // looking illegal copy of an undef register. 
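The S_BFE handling above splits one packed immediate into separate offset and width operands. A standalone worked example of that field extraction:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Imm = (8u << 16) | 4u;             // width 8, offset 4
      uint32_t Offset = Imm & 0x3f;               // extract bits [5:0]
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // extract bits [22:16]
      printf("offset=%u width=%u\n", Offset, BitWidth); // offset=4 width=8
    }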
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.RemoveOperand(I); + Inst.removeOperand(I); Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); continue; } @@ -6323,7 +6446,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); - Inst.RemoveOperand(3); + Inst.removeOperand(3); Inst.setDesc(get(NewOpc)); Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit @@ -6467,7 +6590,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can // invert either source and then perform the XOR. If either source is a // scalar register, then we can leave the inversion on the scalar unit to - // acheive a better distrubution of scalar and vector instructions. + // achieve a better distribution of scalar and vector instructions. bool Src0IsSGPR = Src0.isReg() && RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); bool Src1IsSGPR = Src1.isReg() && @@ -6689,7 +6812,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, legalizeOperands(*LoHalf, MDT); legalizeOperands(*HiHalf, MDT); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6753,7 +6876,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, Worklist.insert(&LoHalf); Worklist.insert(&HiHalf); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6831,7 +6954,7 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.replaceRegWith(Dest.getReg(), ResultReg); - // We don't need to legalize operands here. src0 for etiher instruction can be + // We don't need to legalize operands here. src0 for either instruction can be // an SGPR, and the second input is unused or determined here. addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } @@ -6973,6 +7096,17 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, .add(Src1); break; } + case AMDGPU::S_PACK_HL_B32_B16: { + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Src0); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) + .add(Src1) + .addImm(16) + .addReg(TmpReg, RegState::Kill); + break; + } case AMDGPU::S_PACK_HH_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -7045,7 +7179,7 @@ void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); MachineInstr *SCCUseInst = Op.getParent(); - // Look for a preceeding instruction that either defines VCC or SCC. If VCC + // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. If SCC then that instruction needs to be // converted to a VALU. @@ -7191,7 +7325,10 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { - return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | + int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
+ AMDGPU::UfmtGFX11::UFMT_32_FLOAT : + AMDGPU::UfmtGFX10::UFMT_32_FLOAT; + return (Format << 44) | (1ULL << 56) | // RESOURCE_LEVEL = 1 (3ULL << 60); // OOB_SELECT = 3 } @@ -7332,7 +7469,9 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return DescSize; bool HasLiteral = false; for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - if (isLiteralConstant(MI, I)) { + const MachineOperand &Op = MI.getOperand(I); + const MCOperandInfo &OpInfo = Desc.OpInfo[I]; + if (isLiteralConstantLike(Op, OpInfo)) { HasLiteral = true; break; } @@ -7513,6 +7652,16 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> +SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { + static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = + { + {MONoClobber, "amdgpu-noclobber"}, + }; + + return makeArrayRef(TargetFlags); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); @@ -7690,6 +7839,7 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, } // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +// and the columns of the getMCOpcodeGen table. enum SIEncodingFamily { SI = 0, VI = 1, @@ -7699,7 +7849,9 @@ enum SIEncodingFamily { GFX9 = 5, GFX10 = 6, SDWA10 = 7, - GFX90A = 8 + GFX90A = 8, + GFX940 = 9, + GFX11 = 10, }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -7714,6 +7866,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { return SIEncodingFamily::VI; case AMDGPUSubtarget::GFX10: return SIEncodingFamily::GFX10; + case AMDGPUSubtarget::GFX11: + return SIEncodingFamily::GFX11; } llvm_unreachable("Unknown subtarget generation!"); } @@ -7779,6 +7933,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { if (ST.hasGFX90AInsts()) { uint16_t NMCOp = (uint16_t)-1; + if (ST.hasGFX940Insts()) + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); + if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); @@ -7925,7 +8082,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, auto &UseInst = *Use.getParent(); // Don't bother searching between blocks, although it is possible this block // doesn't modify exec.
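The S_PACK_HL_B32_B16 lowering added to movePackToVALU above emits V_LSHRREV_B32 followed by V_LSHL_OR_B32. A standalone check of the resulting bit layout (assuming HL means the high half of src0 lands in the low half of the result and the low half of src1 in the high half, which is what the emitted sequence computes):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Src0 = 0xAAAA1111, Src1 = 0x2222BBBB;
      uint32_t Tmp = Src0 >> 16;            // V_LSHRREV_B32 16, src0
      uint32_t Result = (Src1 << 16) | Tmp; // V_LSHL_OR_B32 src1, 16, tmp
      printf("0x%08X\n", Result);           // 0xBBBBAAAA
    }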
- if (UseInst.getParent() != DefBB) + if (UseInst.getParent() != DefBB || UseInst.isPHI()) return true; if (++NumUse > MaxUseScan) @@ -8150,7 +8307,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, - bool IsReversable, bool IsSigned) -> bool { + bool IsReversible, bool IsSigned) -> bool { // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n @@ -8208,7 +8365,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool IsReversedCC = false; if (CmpValue != ExpectedValue) { - if (!IsReversable) + if (!IsReversible) return false; IsReversedCC = CmpValue == (ExpectedValue ^ Mask); if (!IsReversedCC) @@ -8284,3 +8441,37 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; } + +void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, + unsigned OpName) const { + if (!ST.needsAlignedVGPRs()) + return; + + int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + if (OpNo < 0) + return; + MachineOperand &Op = MI.getOperand(OpNo); + if (getOpSize(MI, OpNo) > 4) + return; + + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *BB = MI.getParent(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + Register DataReg = Op.getReg(); + bool IsAGPR = RI.isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op.getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op.setReg(NewVR); + Op.setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e551d6c7223f..311f9f68e675 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #include "AMDGPUMIRFormatter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +36,11 @@ class RegScavenger; class TargetRegisterClass; class ScheduleHazardRecognizer; +/// Mark the MMO of a uniform load if there are no potentially clobbering stores +/// on any path from the start of an entry function to this load. 
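The compare folding in optimizeCompareInstr above rests on a small arithmetic fact: for a single-bit mask, the SCC written by s_and_b32 (result nonzero) always agrees with the SCC of a following s_cmp_eq against the same mask, so the compare is redundant. A standalone model of that equivalence:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t Mask = 1u << 4; // 1 << n with a single bit set
      for (uint32_t Src : {0u, 0x10u, 0xFFu}) {
        uint32_t And = Src & Mask;
        bool SCCFromAnd = And != 0;    // SCC as written by s_and_b32
        bool SCCFromCmp = And == Mask; // SCC as written by s_cmp_eq_u32
        printf("src=0x%02X: %d %d\n", Src, SCCFromAnd, SCCFromCmp); // equal
      }
    }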
+static const MachineMemOperand::Flags MONoClobber = + MachineMemOperand::MOTargetFlag1; + class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; @@ -323,15 +329,14 @@ public: Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override; - unsigned getAddressSpaceForPseudoSourceKind( - unsigned Kind) const override; - bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override; static bool isFoldableCopy(const MachineInstr &MI); + void removeModOperands(MachineInstr &MI) const; + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; @@ -549,6 +554,14 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::EXP; } + static bool isDualSourceBlendEXP(const MachineInstr &MI) { + if (!isEXP(MI)) + return false; + unsigned Target = MI.getOperand(0).getImm(); + return Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND0 || + Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1; + } + bool isEXP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::EXP; } @@ -651,14 +664,43 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsMAI; } + static bool isMFMA(const MachineInstr &MI) { + return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } + static bool isWMMA(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA; + } + + bool isWMMA(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsWMMA; + } + bool isDOT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + static bool isLDSDIR(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR; + } + + bool isLDSDIR(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::LDSDIR; + } + + static bool isVINTERP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VINTERP; + } + + bool isVINTERP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VINTERP; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -1036,6 +1078,9 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> + getSerializableMachineMemOperandTargetFlags() const override; + ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; @@ -1132,6 +1177,11 @@ public: static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + + // Enforce operand's \p OpName even alignment if required by target. + // This is used if an operand is a 32 bit register but needs to be aligned + // regardless. + void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class @@ -1209,9 +1259,6 @@ namespace AMDGPU { LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode); - LLVM_READONLY - int getMUBUFNoLdsInst(uint16_t Opcode); - LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); @@ -1236,6 +1283,11 @@ namespace AMDGPU { LLVM_READONLY int getFlatScratchInstSTfromSS(uint16_t Opcode); + /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode + /// of an SVS (SADDR + VADDR) form.
+ LLVM_READONLY + int getFlatScratchInstSVfromSVS(uint16_t Opcode); + /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SV (VADDR) form. LLVM_READONLY @@ -1250,6 +1302,10 @@ namespace AMDGPU { LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode); + /// \returns v_cmpx version of a v_cmp instruction. + LLVM_READONLY + int getVCMPXOpFromVCMP(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 713a08907e99..29ee9f12b12d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1,4 +1,4 @@ -//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +//===-- SIInstrInfo.td -----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -17,7 +17,8 @@ class GCNPredicateControl : PredicateControl { } // Except for the NONE field, this must be kept in sync with the -// SIEncodingFamily enum in AMDGPUInstrInfo.cpp +// SIEncodingFamily enum in SIInstrInfo.cpp and the columns of the +// getMCOpcodeGen table. def SIEncodingFamily { int NONE = -1; int SI = 0; @@ -29,6 +30,8 @@ def SIEncodingFamily { int GFX10 = 6; int SDWA10 = 7; int GFX90A = 8; + int GFX940 = 9; + int GFX11 = 10; } //===----------------------------------------------------------------------===// @@ -190,6 +193,44 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; +multiclass SDBufferAtomicRetNoRet { + def "_ret" : PatFrag< + (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, + node:$offset, node:$cachepolicy, node:$idxen), + (!cast(NAME) node:$vdata_in, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; + let GISelPredicateCode = [{ return true; }]; + } + + def "_noret" : PatFrag< + (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, + node:$offset, node:$cachepolicy, node:$idxen), + (!cast(NAME) node:$vdata_in, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; + let GISelPredicateCode = [{ return false; }]; + } +} + +defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet; + def 
SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, [SDTCisVT<0, i32>, // dst @@ -205,6 +246,26 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +def SIbuffer_atomic_cmpswap_ret : PatFrag< + (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, + node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), + (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; + let GISelPredicateCode = [{ return true; }]; +} + +def SIbuffer_atomic_cmpswap_noret : PatFrag< + (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, + node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), + (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; + let GISelPredicateCode = [{ return false; }]; +} + class SDGlobalAtomicNoRtn : SDNode , // vaddr @@ -255,35 +316,57 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] >; +def SIfptrunc_round_upward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_UPWARD", + SDTFPRoundOp +>; + +def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD", + SDTFPRoundOp +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// // Returns 1 if the source arguments have modifiers, 0 if they do not. -// XXX - do f16 instructions? class isFloatType { bit ret = !or(!eq(SrcVT.Value, f16.Value), !eq(SrcVT.Value, f32.Value), !eq(SrcVT.Value, f64.Value), !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v8f16.Value), + !eq(SrcVT.Value, v16f16.Value), !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v8f32.Value), !eq(SrcVT.Value, v2f64.Value), !eq(SrcVT.Value, v4f64.Value)); } +// XXX - do v2i16 instructions? class isIntType { bit ret = !or(!eq(SrcVT.Value, i16.Value), !eq(SrcVT.Value, i32.Value), !eq(SrcVT.Value, i64.Value), - !eq(SrcVT.Value, v2i32.Value)); + !eq(SrcVT.Value, v4i16.Value), + !eq(SrcVT.Value, v8i16.Value), + !eq(SrcVT.Value, v16i16.Value), + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v8i32.Value)); } class isPackedType { bit ret = !or(!eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v2f32.Value)); + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v8i32.Value), + !eq(SrcVT.Value, v8f32.Value)); } @@ -291,19 +374,10 @@ class isPackedType { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { -let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { - - -defm atomic_inc_#as : binary_atomic_op; -defm atomic_dec_#as : binary_atomic_op; -defm atomic_load_fmin_#as : binary_atomic_op; -defm atomic_load_fmax_#as : binary_atomic_op; - - -} // End let AddressSpaces = ... 
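The _ret/_noret PatFrag pairs above match the same buffer-atomic node and differ only in a predicate on SDValue(N, 0).use_empty(). A C++ toy model of that selection rule (illustrative only; the real mechanism is the TableGen PredicateCode shown above):

    #include <cstdio>

    struct Node { int NumResultUses; };

    // Stand-in for the PredicateCode: pick the variant by result liveness.
    const char *selectVariant(const Node &N) {
      return N.NumResultUses == 0 ? "_noret" : "_ret";
    }

    int main() {
      Node Dead{0}, Live{2};
      printf("%s %s\n", selectVariant(Dead), selectVariant(Live)); // _noret _ret
    }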
-} // End foreach AddrSpace - +defm atomic_inc : binary_atomic_op_all_as; +defm atomic_dec : binary_atomic_op_all_as; +defm atomic_load_fmin : binary_atomic_op_all_as; +defm atomic_load_fmax : binary_atomic_op_all_as; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. @@ -408,50 +482,36 @@ def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { let IsNonExtLoad = 1; } -let MemoryVT = i8 in { def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>; def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>; def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>; -} -let MemoryVT = i16 in { def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>; def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>; def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; -} +} // End IsLoad = 1, , AddressSpaces = LoadAddress_local.AddrSpaces def load_align8_local_m0 : PatFrag<(ops node:$ptr), - (load_local_m0 node:$ptr)>, Aligned<8> { + (load_local_m0 node:$ptr)> { let IsLoad = 1; - let IsNonExtLoad = 1; + int MinAlignment = 8; } def load_align16_local_m0 : PatFrag<(ops node:$ptr), - (load_local_m0 node:$ptr)>, Aligned<16> { + (load_local_m0 node:$ptr)> { let IsLoad = 1; - let IsNonExtLoad = 1; + int MinAlignment = 16; } -} // End IsLoad = 1 - let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { def atomic_load_8_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_8_glue node:$ptr)> { - let MemoryVT = i8; -} + (atomic_load_8_glue node:$ptr)>; def atomic_load_16_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_16_glue node:$ptr)> { - let MemoryVT = i16; -} + (atomic_load_16_glue node:$ptr)>; def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_32_glue node:$ptr)> { - let MemoryVT = i32; -} + (atomic_load_32_glue node:$ptr)>; def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_64_glue node:$ptr)> { - let MemoryVT = i64; -} - + (atomic_load_64_glue node:$ptr)>; } // End let AddressSpaces = LoadAddress_local.AddrSpaces @@ -485,75 +545,103 @@ def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), (truncstore_glue node:$val, node:$ptr)> { let IsStore = 1; let MemoryVT = i8; + let IsTruncStore = 1; } def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), (truncstore_glue node:$val, node:$ptr)> { let IsStore = 1; let MemoryVT = i16; + let IsTruncStore = 1; } let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (store_glue node:$val, node:$ptr)> { - let IsStore = 1; - let IsTruncStore = 0; -} - + (store_glue node:$val, node:$ptr)>; def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i8; -} - + (truncstorei8_glue node:$val, node:$ptr)>; def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i16; -} + (truncstorei16_glue node:$val, node:$ptr)>; } def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr), (store_local_m0 node:$value, node:$ptr)>, Aligned<8> { let IsStore = 1; - let IsTruncStore = 0; } def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr), (store_local_m0 node:$value, node:$ptr)>, Aligned<16> { let IsStore = 1; +} + +let 
PredicateCode = [{return cast(N)->getAlignment() < 4;}], + GISelPredicateCode = [{return (*MI.memoperands_begin())->getAlign() < 4;}], + AddressSpaces = [ AddrSpaces.Local ] in { +def load_align_less_than_4_local : PatFrag<(ops node:$ptr), + (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} + +def load_align_less_than_4_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} + +def store_align_less_than_4_local : PatFrag <(ops node:$value, node:$ptr), + (store_local node:$value, node:$ptr)> { + let IsStore = 1; let IsTruncStore = 0; } -let AddressSpaces = StoreAddress_local.AddrSpaces in { +def store_align_less_than_4_local_m0 : PatFrag <(ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} +} -def atomic_store_local_8_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { +def atomic_store_8_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i8; } -def atomic_store_local_16_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_16_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i16; } -def atomic_store_local_32_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_32_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i32; } -def atomic_store_local_64_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_64_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i64; } -} // End let AddressSpaces = StoreAddress_local.AddrSpaces + +let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { +def atomic_store_8_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_8_glue node:$ptr, node:$val)>; +def atomic_store_16_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_16_glue node:$ptr, node:$val)>; +def atomic_store_32_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_32_glue node:$ptr, node:$val)>; +def atomic_store_64_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_64_glue node:$ptr, node:$val)>; +} // End let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces def si_setcc_uniform : PatFrag < @@ -686,10 +774,14 @@ multiclass SIAtomicM0Glue2 (NAME#"_glue"), IsInt>; + defm _local_m0 : ret_noret_binary_atomic_op (NAME#"_glue"), + IsInt>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { defm _region_m0 : binary_atomic_op (NAME#"_glue"), IsInt>; + defm _region_m0 : ret_noret_binary_atomic_op (NAME#"_glue"), + IsInt>; } } @@ -954,6 +1046,18 @@ def SWaitMatchClass : AsmOperandClass { let ParserMethod = "parseSWaitCntOps"; } +def DepCtrMatchClass : AsmOperandClass { + let Name = "DepCtr"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseDepCtrOps"; +} + +def SDelayMatchClass : AsmOperandClass { + let Name = "SDelayAlu"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSDelayAluOps"; +} + def VReg32OrOffClass : AsmOperandClass { let Name = "VReg32OrOff"; let ParserMethod = "parseVReg32OrOff"; @@ -979,6 
+1083,16 @@ def WAIT_FLAG : Operand { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; } + +def DepCtrImm : Operand { + let ParserMatchClass = DepCtrMatchClass; + let PrintMethod = "printDepCtr"; +} + +def DELAY_FLAG : Operand { + let ParserMatchClass = SDelayMatchClass; + let PrintMethod = "printDelayFlag"; +} } // End OperandType = "OPERAND_IMMEDIATE" include "SIInstrFormats.td" @@ -1163,14 +1277,6 @@ def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT", 0>>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>; -def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; - -def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; -def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; -def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; -def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; -def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; - def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; @@ -1181,6 +1287,14 @@ def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; +def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; + +def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; +def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; +def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; +def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; + def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; @@ -1191,6 +1305,9 @@ def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { } +def wait_vdst : NamedOperandU8<"WaitVDST", NamedMatchClass<"WaitVDST">>; +def wait_exp : NamedOperandU8<"WaitEXP", NamedMatchClass<"WaitEXP">>; + } // End OperandType = "OPERAND_IMMEDIATE" class KImmMatchClass : AsmOperandClass { @@ -1223,10 +1340,18 @@ class FPInputModsMatchClass : AsmOperandClass { let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } +class FPVCSrcInputModsMatchClass : FPInputModsMatchClass { + let Name = "RegOrInlineImmWithFP"#opSize#"InputMods"; + let PredicateMethod = "isRegOrInlineImmWithFP"#opSize#"InputMods"; +} + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; +def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>; +def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>; + class InputMods : Operand { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_INPUT_MODS"; @@ -1241,19 +1366,28 @@ def FP16InputMods : FPInputMods; def FP32InputMods : FPInputMods; def FP64InputMods : FPInputMods; +def FP16VCSrcInputMods : FPInputMods; +def FP32VCSrcInputMods : FPInputMods; + class IntInputModsMatchClass : AsmOperandClass { let Name = "RegOrImmWithInt"#opSize#"InputMods"; let ParserMethod = 
"parseRegOrImmWithIntInputMods"; let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods"; } +class IntVCSrcInputModsMatchClass : IntInputModsMatchClass { + let Name = "RegOrInlineImmWithInt"#opSize#"InputMods"; + let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods"; +} def Int32InputModsMatchClass : IntInputModsMatchClass<32>; def Int64InputModsMatchClass : IntInputModsMatchClass<64>; +def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>; class IntInputMods : InputMods { let PrintMethod = "printOperandAndIntInputMods"; } def Int32InputMods : IntInputMods; def Int64InputMods : IntInputMods; +def Int32VCSrcInputMods : IntInputMods; class OpSelModsMatchClass : AsmOperandClass { let Name = "OpSelMods"; @@ -1366,12 +1500,19 @@ def VOP3OMods : ComplexPattern; def VOP3PMods : ComplexPattern; +def VOP3PModsDOT : ComplexPattern; +def DotIUVOP3PMods : ComplexPattern; +def WMMAOpSelVOP3PMods : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; def VOP3PMadMixMods : ComplexPattern; +def VINTERPMods : ComplexPattern; +def VINTERPModsHi : ComplexPattern; + //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -1575,6 +1716,19 @@ class getVOP3SrcForVT { ); } +// Src2 of VOP3 DPP instructions cannot be a literal +class getVOP3DPPSrcForVT { + bit isFP = isFloatType.ret; + RegisterOperand ret = + !if (!eq(VT.Value, i1.Value), SSrc_i1, + !if (isFP, + !if (!eq(VT.Value, f16.Value), VCSrc_f16, + !if (!eq(VT.Value, v2f16.Value), VCSrc_v2f16, VCSrc_f32)), + !if (!eq(VT.Value, i16.Value), VCSrc_b16, + !if (!eq(VT.Value, v2i16.Value), VCSrc_v2b16, + VCSrc_b32)))); +} + // Float or packed int class isModifierType { bit ret = !or(!eq(SrcVT.Value, f16.Value), @@ -1583,7 +1737,17 @@ class isModifierType { !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2i32.Value)); + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v4i16.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v8f16.Value), + !eq(SrcVT.Value, v8i16.Value), + !eq(SrcVT.Value, v8f32.Value), + !eq(SrcVT.Value, v8i32.Value), + !eq(SrcVT.Value, v16f16.Value), + !eq(SrcVT.Value, v16i16.Value)); } // Return type of input modifiers operand for specified input operand @@ -1611,6 +1775,17 @@ class getSrcModDPP { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand for specified input operand for DPP +class getSrcModVOP3DPP { + bit isFP = isFloatType.ret; + bit isPacked = isPackedType.ret; + Operand ret = + !if (isFP, + !if (!eq(VT.Value, f16.Value), FP16VCSrcInputMods, + FP32VCSrcInputMods), + !if (EnableF32SrcMods, FP32VCSrcInputMods, Int32VCSrcInputMods)); +} + // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA { Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, @@ -1620,7 +1795,7 @@ class getSrcModSDWA { } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
-class getIns32 { +class getIns32 { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 (ins))); @@ -1715,19 +1890,21 @@ class getInsVOP3Base.ret; dag opsel = (ins op_sel0:$op_sel); - dag vop3pFields = (ins op_sel_hi0:$op_sel_hi, neg_lo0:$neg_lo, neg_hi0:$neg_hi); + dag vop3pOpsel = (ins op_sel_hi0:$op_sel_hi); + dag vop3pFields = !con(!if(HasOpSel, vop3pOpsel, (ins)), (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi)); + dag ret = !con(base, !if(HasOpSel, opsel,(ins)), !if(IsVOP3P, vop3pFields,(ins))); } class getInsVOP3P { dag ret = getInsVOP3Base.ret; + HasOpSel, 1/*IsVOP3P*/>.ret; } class getInsVOP3OpSel { + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1756,6 +1933,7 @@ class getInsDPPBase { - dag ret = !con(getInsDPPBase.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPPBase.ret, (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, - bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); } class getInsDPP16 { - dag ret = !con(getInsDPP.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPP.ret, (ins FI:$fi)); } class getInsDPP8 { - dag ret = !con(getInsDPPBase.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPPBase.ret, (ins dpp8:$dpp8, FI:$fi)); } +class getInsVOP3DPPBase { + dag old = ( ins OldRC:$old ); + dag base = VOP3Base; + dag ret = !con( + !if(!ne(NumSrcArgs, 0), old, (ins)), + base + ); +} + +class getInsVOP3DPP { + dag ret = !con(getInsVOP3DPPBase.ret, + (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); +} + +class getInsVOP3DPP16 { + dag ret = !con(getInsVOP3DPP.ret, + (ins FI:$fi)); +} + +class getInsVOP3DPP8 { + dag ret = !con(getInsVOP3DPPBase.ret, + (ins dpp8:$dpp8, FI:$fi)); +} // Ins for SDWA class getInsSDWA { !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } +class getAsmVOPDPart { + string dst = "$vdst" # XorY; + string src0 = ", $src0" # XorY; + string src1 = ", $vsrc1" # XorY; + string ret = dst # + !if(!ge(NumSrcArgs, 1), src0, "") # + !if(!ge(NumSrcArgs, 2), src1, ""); +} + // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. 
class getAsm64 { + bit HasClamp, bit HasOpSel> { string dst = "$vdst"; string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); string src1 = !if(!eq(NumSrcArgs, 1), "", @@ -1900,10 +2125,11 @@ class getAsmVOP3P { @@ -1955,15 +2181,63 @@ class getAsmDPP16 - : getAsmDPP { + : getAsmDPP{ let ret = dst#args#" $dpp8$fi"; } +class getAsmVOP3DPPBase { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string isrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string fsrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, fsrc0, isrc0); + string src1 = !if(Src1HasMods, fsrc1, isrc1); + string src2 = !if(Src2HasMods, fsrc2, isrc2); + string opsel = !if(HasOpSel, "$op_sel", ""); + string 3PMods = !if(IsVOP3P, + !if(HasOpSel, "$op_sel_hi", "") + #!if(HasModifiers, "$neg_lo$neg_hi", ""), + ""); + string clamp = !if(HasClamp, "$clamp", ""); + string omod = !if(HasOMod, "$omod", ""); + + string ret = dst#", "#src0#src1#src2#opsel#3PMods#clamp#omod; + +} + +class getAsmVOP3DPP { + string ret = base # " $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; +} + +class getAsmVOP3DPP16 { + string ret = getAsmVOP3DPP.ret # "$fi"; +} + +class getAsmVOP3DPP8 { + string ret = base # " $dpp8$fi"; +} + class getAsmSDWA { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), - " vcc", // use vcc token as dst for VOPC instructioins + " vcc", // use vcc token as dst for VOPC instructions "$vdst"), ""); string src0 = "$src0_modifiers"; @@ -2056,6 +2330,12 @@ class getHasDPP { 1); } +class getHasExt32BitDPP { + bit ret = !and(getHasDPP.ret, + !not(getHas64BitOps.ret)); +} + class getHasExt64BitDPP { bit ret = !and(getHasDPP.ret, @@ -2089,6 +2369,24 @@ class BitAnd { bit ret = !if(a, !if(b, 1, 0), 0); } +class getHasVOP3DPP { + bit ret = !if(!eq(DstVT.Size, 64), + 0, // 64-bit dst No DPP for 64-bit operands + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src0 + !if(!eq(Src1VT.Size, 64), + 0, // 64-bit src1 + !if(!eq(Src2VT.Size, 64), + 0, // 64-bit src2 + 1 + ) + ) + ) + ); +} + + def PatGenMode { int NoPattern = 0; int Pattern = 1; @@ -2106,15 +2404,20 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT.ret; + field RegisterOperand DstRC64 = DstRC; field RegisterOperand DstRCDPP = getVALUDstForVT.ret; field RegisterOperand DstRCSDWA = getSDWADstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterClass Src1RC32 = getVregSrcForVT.ret; + field RegisterOperand Src1RC32 = RegisterOperand.ret>; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; field RegisterClass Src0DPP = getVregSrcForVT.ret; field RegisterClass Src1DPP = getVregSrcForVT.ret; + field RegisterClass Src2DPP = getVregSrcForVT.ret; + field RegisterOperand Src0VOP3DPP = VGPRSrc_32; + field RegisterOperand Src1VOP3DPP = VGPRSrc_32; + field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT.ret; field RegisterOperand Src1SDWA = 
getSDWASrcForVT.ret; field Operand Src0Mod = getSrcMod.ret; @@ -2122,6 +2425,8 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, field Operand Src2Mod = getSrcMod.ret; field Operand Src0ModDPP = getSrcModDPP.ret; field Operand Src1ModDPP = getSrcModDPP.ret; + field Operand Src2ModDPP = getSrcModDPP.ret; + field Operand Src2ModVOP3DPP = getSrcModVOP3DPP.ret; field Operand Src0ModSDWA = getSrcModSDWA.ret; field Operand Src1ModSDWA = getSrcModSDWA.ret; @@ -2169,15 +2474,20 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, field bit HasSrc2Mods = !if(HasModifiers, !or(HasSrc2FloatMods, HasSrc2IntMods), 0); field bit HasExt = getHasExt.ret; - field bit HasExtDPP = getHasDPP.ret; + field bit HasExtVOP3DPP = getHasVOP3DPP.ret; + field bit HasExtDPP = !if(!or(getHasDPP.ret, + HasExtVOP3DPP), 1, 0); + field bit HasExt32BitDPP = getHasExt32BitDPP.ret; field bit HasExt64BitDPP = getHasExt64BitDPP.ret; field bit HasExtSDWA = getHasSDWA.ret; field bit HasExtSDWA9 = HasExtSDWA; field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; + field bit IsVOP3P = 0; field bit IsDOT = 0; field bit IsSingle = 0; + field bit IsWMMA = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -2188,9 +2498,11 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, // VOP3b instructions are a special case with a second explicit // output. This is manually overridden for them. field dag Outs32 = Outs; - field dag Outs64 = Outs; + field dag Outs64 = !if(HasDst,(outs DstRC64:$vdst),(outs)); field dag OutsDPP = getOutsDPP.ret; field dag OutsDPP8 = getOutsDPP.ret; + field dag OutsVOP3DPP = OutsDPP; + field dag OutsVOP3DPP8 = OutsDPP8; field dag OutsSDWA = getOutsSDWA.ret; field dag Ins32 = getIns32.ret; @@ -2198,7 +2510,7 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, HasIntClamp, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P.ret; field dag InsVOP3OpSel = getInsVOP3OpSel _ArgVT, bit _EnableF32SrcMods = 0, getOpSelMod.ret, getOpSelMod.ret>.ret; field dag InsDPP = !if(HasExtDPP, - getInsDPP.ret, + getInsDPP.ret, (ins)); - field dag InsDPP16 = getInsDPP16.ret; - field dag InsDPP8 = getInsDPP8.ret; + field dag InsDPP16 = getInsDPP16.ret; + field dag InsDPP8 = getInsDPP8.ret; + field dag InsVOP3Base = getInsVOP3Base.ret; + field dag InsVOP3DPP = getInsVOP3DPP.ret; + field dag InsVOP3DPP16 = getInsVOP3DPP16.ret; + field dag InsVOP3DPP8 = getInsVOP3DPP8.ret; field dag InsSDWA = getInsSDWA.ret; + field dag InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X); + // It is a slight misnomer to use the deferred f32 operand type for non-float + // operands, but this operand type will only be used if the other dual + // component is FMAAK or FMAMK + field dag InsVOPDXDeferred = (ins !if(!eq(Src0VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0X, VGPR_32:$vsrc1X); + field dag InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y); + field dag InsVOPDYDeferred = (ins !if(!eq(Src1VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0Y, VGPR_32:$vsrc1Y); field string Asm32 = getAsm32.ret; field string Asm64 = getAsm64.ret; - field string AsmVOP3P = getAsmVOP3P.ret; + field string AsmVOP3P = getAsmVOP3P.ret; field string AsmVOP3OpSel = getAsmVOP3OpSel _ArgVT, bit _EnableF32SrcMods = 0, // DPP8 encoding has no fields for modifiers, and it is enforced by setting // the asm operand name via this HasModifiers flag field string 
AsmDPP8 = getAsmDPP8.ret; + field string AsmVOP3DPPBase = getAsmVOP3DPPBase.ret; + field string AsmVOP3DPP = getAsmVOP3DPP.ret; + field string AsmVOP3DPP16 = getAsmVOP3DPP16.ret; + field string AsmVOP3DPP8 = getAsmVOP3DPP8.ret; field string AsmSDWA = getAsmSDWA.ret; field string AsmSDWA9 = getAsmSDWA9.ret; - + field string AsmVOPDX = getAsmVOPDPart.ret; + field string AsmVOPDY = getAsmVOPDPart.ret; field string TieRegDPP = "$old"; } -class VOP_NO_EXT : VOPProfile { + class VOP_NO_EXT : VOPProfile { let HasExt = 0; let HasExtDPP = 0; + let HasExtVOP3DPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -2249,10 +2584,10 @@ class VOP_NO_EXT : VOPProfile { class VOP_PAT_GEN : VOPProfile { let NeedPatGen = mode; } - def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; +def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; @@ -2264,6 +2599,7 @@ def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; +def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; @@ -2274,6 +2610,10 @@ def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; +def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>; +def VOP_I16_V2I16_V2I16_I16 : VOPProfile <[i16, v2i16, v2i16, i16]>; +def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>; + def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; @@ -2343,6 +2683,18 @@ def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>; def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>; def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>; +def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>; +def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>; +def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>; +def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>; + +def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>; +def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>; +def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>; +def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>; +def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>; +def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>; + class Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; @@ -2394,10 +2746,11 @@ multiclass VINTRP_m op, dag outs, dag ins, string asm, def _vi : VINTRP_Real_vi ; - let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _gfx10 : VINTRP_Real_si; - } // End AssemblerPredicate 
= isGFX10Plus, DecoderNamespace = "GFX10" + } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" } + //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -2470,6 +2823,7 @@ def getMCOpcodeGen : InstrMapping { let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; let KeyCol = [!cast(SIEncodingFamily.NONE)]; + // These columns must be kept in sync with the SIEncodingFamily enumeration. let ValueCols = [[!cast(SIEncodingFamily.SI)], [!cast(SIEncodingFamily.VI)], [!cast(SIEncodingFamily.SDWA)], @@ -2482,7 +2836,9 @@ def getMCOpcodeGen : InstrMapping { [!cast(SIEncodingFamily.GFX9)], [!cast(SIEncodingFamily.GFX10)], [!cast(SIEncodingFamily.SDWA10)], - [!cast(SIEncodingFamily.GFX90A)]]; + [!cast(SIEncodingFamily.GFX90A)], + [!cast(SIEncodingFamily.GFX940)], + [!cast(SIEncodingFamily.GFX11)]]; } // Get equivalent SOPK instruction. @@ -2510,14 +2866,6 @@ def getIfAddr64Inst : InstrMapping { let ValueCols = [["1"]]; } -def getMUBUFNoLdsInst : InstrMapping { - let FilterClass = "MUBUFLdsTable"; - let RowFields = ["OpName"]; - let ColFields = ["IsLds"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - // Maps an atomic opcode to its returnless version. def getAtomicNoRetOp : InstrMapping { let FilterClass = "AtomicNoRet"; @@ -2580,6 +2928,14 @@ def getFlatScratchInstSSfromSV : InstrMapping { let ValueCols = [["SS"]]; } +def getFlatScratchInstSVfromSVS : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SVS"]; + let ValueCols = [["SV"]]; +} + def getFlatScratchInstSVfromSS : InstrMapping { let FilterClass = "FlatScratchInst"; let RowFields = ["SVOp"]; @@ -2596,6 +2952,15 @@ def getMFMAEarlyClobberOp : InstrMapping { let ValueCols = [["0"]]; } +// Maps an v_cmp instruction to its v_cmpx equivalent. +def getVCMPXOpFromVCMP : InstrMapping { + let FilterClass = "VCMPVCMPXTable"; + let RowFields = ["VCMPOp"]; + let ColFields = ["IsVCMPX"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 7be63ae6964b..829669157893 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -14,12 +14,24 @@ class GCNPat : Pat, GCNPredicateContro } +class UniformSextInreg : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return !N->isDivergent(); }]>; + +class DivergentSextInreg : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return N->isDivergent(); }]>; + include "SOPInstructions.td" include "VOPInstructions.td" include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" include "EXPInstructions.td" +include "LDSDIRInstructions.td" +include "VINTERPInstructions.td" //===----------------------------------------------------------------------===// // VINTRP Instructions @@ -176,19 +188,33 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let mayStore = 0; } +// Pseudo instructions used for @llvm.fptrunc.round upward +// and @llvm.fptrunc.round downward. +// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD +// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to +// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO. +// The final codegen is done in the ModeRegister pass. 
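// Illustrative IR for the pseudos below (the intrinsic's documented form,
// not part of this patch):
//   %h = call half @llvm.fptrunc.round.f16.f32(float %x, metadata !"round.upward")
// is legalized to G_FPTRUNC_ROUND_UPWARD and then selected to
// FPTRUNC_UPWARD_PSEUDO.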
+let Uses = [MODE, EXEC] in { +def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0), + [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>; + +def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0), + [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>; +} // End Uses = [MODE, EXEC] + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. let Defs = [SCC] in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32: $src, VSrc_b32:$inactive), + (ins VSrc_b32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { - let Constraints = "$src = $vdst"; } def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VReg_64: $src, VSrc_b64:$inactive), + (ins VSrc_b64: $src, VSrc_b64:$inactive), [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { - let Constraints = "$src = $vdst"; } } // End Defs = [SCC] @@ -287,6 +313,20 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let isConvergent = 1; let FixedSize = 1; let Size = 0; + let isMeta = 1; +} + +def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask), + [(int_amdgcn_sched_barrier (i32 timm:$mask))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; + let isMeta = 1; } // SI pseudo instructions. These are used by the CFG structurizer pass @@ -424,6 +464,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), let Size = 0; let hasNoSchedulingInfo = 1; let FixedSize = 1; + let isMeta = 1; } // Used as an isel pseudo to directly emit initialization with an @@ -459,11 +500,14 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < let hasNoSchedulingInfo = 1; let DisableWQM = 1; let FixedSize = 1; + + // TODO: Should this be true? + let isMeta = 0; } // Return for returning function calls. 
def SI_RETURN : SPseudoInstSI < - (outs), (ins), [], + (outs), (ins), [(AMDGPUret_flag)], "; return"> { let isTerminator = 1; let isBarrier = 1; @@ -496,6 +540,7 @@ def : GCNPat< def SI_CALL : SPseudoInstSI < (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { let Size = 4; + let FixedSize = 1; let isCall = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; @@ -508,6 +553,7 @@ def SI_TCRETURN : SPseudoInstSI <(outs), (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff), [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; + let FixedSize = 1; let isCall = 1; let isTerminator = 1; let isReturn = 1; @@ -1212,6 +1258,26 @@ def : Pat < (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) >; +def : Pat < + (extract_subvector v16i16:$vec, (i32 0)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16i16:$vec, (i32 8)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 0)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 8)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7)) +>; + foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast(sub#Index) @@ -1371,7 +1437,18 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; - +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 512-bit bitcast def : BitConvert ; @@ -1941,12 +2018,6 @@ def : GCNPat < //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// - -class UniformSextInreg : PatFrag< - (ops node:$src), - (sext_inreg $src, VT), - [{ return !N->isDivergent(); }]>; - def : GCNPat<(i32 (UniformSextInreg i32:$src)), (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 @@ -1981,23 +2052,28 @@ def : GCNPat < (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; - -class DivergentSextInreg : PatFrag< - (ops node:$src), - (sext_inreg $src, VT), - [{ return N->isDivergent(); }]>; - -def : GCNPat<(i32 (DivergentSextInreg i32:$src)), +def : GCNPat< + (i32 (DivergentSextInreg i32:$src)), (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; def : GCNPat < (i16 (DivergentSextInreg i16:$src)), - (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16 + (V_BFE_I32_e64 $src, (i32 0), (i32 1)) >; def : GCNPat < (i16 (DivergentSextInreg i16:$src)), - (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16 + (V_BFE_I32_e64 $src, (i32 0), (i32 8)) +>; + +def : GCNPat< + (i32 (DivergentSextInreg i32:$src)), + (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) +>; + +def : GCNPat < + (i32 (DivergentSextInreg i32:$src)), + (V_BFE_I32_e64 $src, (i32 0), (i32 16)) >; def : GCNPat < @@ -2010,14 +2086,14 @@ def : GCNPat < def : GCNPat < (i64 (DivergentSextInreg i64:$src)), (REG_SEQUENCE VReg_64, - (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) >; def : GCNPat < (i64 (DivergentSextInreg i64:$src)), (REG_SEQUENCE VReg_64, - 
(V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) >; @@ -2053,11 +2129,17 @@ def : ZExt_i64_i1_Pat; // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that // REG_SEQUENCE patterns don't support instructions with multiple outputs. def : GCNPat < - (i64 (sext i32:$src)), + (i64 (UniformUnaryFrag i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; +def : GCNPat < + (i64 (DivergentUnaryFrag i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, + (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1) +>; + def : GCNPat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, @@ -2234,6 +2316,30 @@ def : GCNPat < // certainty what the source behavior is without more context on how // the src is lowered. e.g. fptrunc + fma may be lowered to a // v_fma_mix* instruction which does not zero, or may not. +def : GCNPat< + (i32 (DivergentUnaryFrag i32:$src)), + (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; + +let AddedComplexity = 1 in { +def : GCNPat< + (i32 (DivergentUnaryFrag i32:$src)), + (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ + let SubtargetPredicate = HasAddNoCarryInsts; +} +} // AddedComplexity = 1 + +def : GCNPat< + (i32 (DivergentUnaryFrag i16:$src)), + (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag i16:$src)), + (REG_SEQUENCE VReg_64, + (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src)>; @@ -2269,6 +2375,34 @@ def : GCNPat < (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; +def IMMBitSelConst : SDNodeXFormgetTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Matching separate SRL and TRUNC instructions +// with dependent operands (SRL dest is source of TRUNC) +// generates three instructions. However, by using bit shifts, +// the V_LSHRREV_B32_e64 result can be directly used in the +// operand of the V_AND_B32_e64 instruction: +// (trunc i32 (srl i32 $a, i32 $b)) -> +// v_and_b32_e64 $a, (1 << $b), $a +// v_cmp_ne_u32_e64 $a, 0, $a + +// Handle the VALU case. +def : GCNPat < + (i1 (DivergentUnaryFrag (i32 (srl i32:$a, (i32 imm:$b))))), + (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), + (i32 0)) +>; + +// Handle the scalar case. +def : GCNPat < + (i1 (UniformUnaryFrag (i32 (srl i32:$a, (i32 imm:$b))))), + (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), + (i32 0)) +>; + def : GCNPat < (i1 (DivergentUnaryFrag i64:$a)), (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), @@ -2350,6 +2484,11 @@ def : GCNPat < } +def : GCNPat< + (i64 (DivergentUnaryFrag i64:$a)), + (REG_SEQUENCE VReg_64, + (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, + (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; // Prefer selecting to max when legal, but using mul is always valid. 
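// (For instance, fcanonicalize of an f32 can be selected as v_max_f32 x, x
// when that is legal for the current mode, while v_mul_f32 1.0, x is always
// valid; the AddedComplexity = -5 below deprioritizes the mul patterns so
// the max form wins wherever both match.)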
let AddedComplexity = -5 in { @@ -2508,12 +2647,12 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), + (v2i16 (UniformBinFrag (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; def : GCNPat < - (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))), + (v2i16 (DivergentBinFrag (i16 VGPR_32:$src0), (i16 undef))), (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) >; @@ -2597,6 +2736,15 @@ def : GCNPat < >; } // End SubtargetPredicate = HasVOP3PInsts +// With multiple uses of the shift, this will duplicate the shift and +// increase register pressure. +let SubtargetPredicate = isGFX11Plus in +def : GCNPat < + (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))), + (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; + + def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) >; @@ -2678,18 +2826,18 @@ def : GCNPat < // an inline immediate than -c. // TODO: Also do for 64-bit. def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (UniformBinFrag i32:$src0, (i32 NegSubInlineConst32:$src1)), (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) >; def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (DivergentBinFrag i32:$src0, (i32 NegSubInlineConst32:$src1)), (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { let SubtargetPredicate = HasAddNoCarryInsts; } def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (DivergentBinFrag i32:$src0, (i32 NegSubInlineConst32:$src1)), (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { let SubtargetPredicate = NotHasAddNoCarryInsts; } @@ -2703,20 +2851,21 @@ def : GCNPat< (S_MOV_B32 SReg_32:$src) >; -multiclass BFMPatterns { +multiclass BFMPatterns { def : GCNPat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), (BFM $a, $b) >; def : GCNPat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV (i32 0))) + (vt (ADD (vt (shl 1, vt:$a)), -1)), + (BFM $a, (i32 0)) >; } -defm : BFMPatterns ; -// FIXME: defm : BFMPatterns ; +defm : BFMPatterns , UniformBinFrag, S_BFM_B32>; +// FIXME: defm : BFMPatterns , UniformBinFrag, S_BFM_B64>; +defm : BFMPatterns , DivergentBinFrag, V_BFM_B32_e64>; // Bitfield extract patterns @@ -3007,6 +3156,19 @@ def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { let hasSideEffects = 0; } +// Integer multiply-add: arg0 * arg1 + arg2. +// +// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned), +// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out. +class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst, type1:$carry_out); + let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2); + let hasSideEffects = 0; +} + +def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32; +def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32; + // Atomic cmpxchg. $cmpval and $newval are packed in a single vector // operand. Expects a MachineMemOperand in addition to explicit // operands.
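// Worked example for the G_AMDGPU_MAD_64_32 semantics defined above
// (illustrative values only): with arg0 = 0x80000000, arg1 = 2 and arg2 = 0,
// G_AMDGPU_MAD_U64_U32 zero-extends the 32-bit factors and produces
// dst = 0x100000000 with carry_out = 0, whereas G_AMDGPU_MAD_I64_I32
// sign-extends them and produces dst = 0xFFFFFFFF00000000.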
@@ -3130,3 +3292,15 @@ def G_SI_CALL : AMDGPUGenericInstruction { // TODO: Should really base this on the call target let isConvergent = 1; } + +def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$vdst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} + +def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$vdst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 4fa8ec711134..47095ae22027 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -72,16 +72,22 @@ static void generateEndPgm(MachineBasicBlock &MBB, bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS; // Check if hardware has been configured to expect color or depth exports. - bool HasExports = - AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F); + bool HasColorExports = AMDGPU::getHasColorExport(F); + bool HasDepthExports = AMDGPU::getHasDepthExport(F); + bool HasExports = HasColorExports || HasDepthExports; // Prior to GFX10, hardware always expects at least one export for PS. bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget()); if (IsPS && (HasExports || MustExport)) { // Generate "null export" if hardware is expecting PS to export. + const GCNSubtarget &ST = MBB.getParent()->getSubtarget(); + int Target = + ST.hasNullExportTarget() + ? AMDGPU::Exp::ET_NULL + : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ); BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) - .addImm(AMDGPU::Exp::ET_NULL) + .addImm(Target) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 44bdbe37dec0..6d4e1d2c898b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,13 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD_SADDR, + GLOBAL_STORE_SADDR, + FLAT_LOAD, + FLAT_STORE, + GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of + GLOBAL_STORE // any CombineInfo, they are only ever returned by + // getCommonInstClass. }; struct AddressRegs { @@ -86,6 +93,7 @@ struct AddressRegs { bool SBase = false; bool SRsrc = false; bool SOffset = false; + bool SAddr = false; bool VAddr = false; bool Addr = false; bool SSamp = false; @@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { } void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); + + // Compare by pointer order. + bool operator<(const CombineInfo& Other) const { + return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; + } }; struct BaseRegisters { @@ -185,6 +198,9 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + bool canSwapInstructions(const DenseSet &ARegDefs, + const DenseSet &ARegUses, + const MachineInstr &A, const MachineInstr &B) const; static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); @@ -199,38 +215,43 @@ private: const CombineInfo &Paired); const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; - bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl &InstsToMove); + CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, - CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator + mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -252,6 +273,12 @@ private: MemInfoMap &Visited, SmallPtrSet &AnchorList, std::list> &MergeableInsts) const; + static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired); + + static InstClassEnum getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired); + public: static char ID; @@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case 
AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return FLAT_LOAD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return GLOBAL_LOAD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return FLAT_STORE; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return GLOBAL_STORE_SADDR; } } /// Determines instruction subclass from opcode. Only instructions -/// of the same subclass can be merged together. +/// of the same subclass can be merged together. The merged instruction may have +/// a different subclass but must have the same class. 
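/// For example, per the switch below, GLOBAL_LOAD_DWORDX2 and FLAT_LOAD_DWORD
/// both map to the subclass key FLAT_LOAD_DWORD, so a segment-specific GLOBAL
/// load may merge with a generic FLAT load (the combined access is then FLAT),
/// whereas the _SADDR forms map to their own key and only merge with each
/// other.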
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { switch (Opc) { default: @@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return AMDGPU::FLAT_LOAD_DWORD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return AMDGPU::FLAT_STORE_DWORD; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return AMDGPU::GLOBAL_STORE_DWORD_SADDR; } } +// GLOBAL loads and stores are classified as FLAT initially. If both combined +// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. +// If either or both instructions are non segment specific FLAT the resulting +// combined operation will be FLAT, potentially promoting one of the GLOBAL +// operations to FLAT. +// For other instructions return the original unmodified class. +InstClassEnum +SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired) { + assert(CI.InstClass == Paired.InstClass); + + if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && + SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) + return (CI.InstClass == FLAT_STORE) ? 
GLOBAL_STORE : GLOBAL_LOAD; + + return CI.InstClass; +} + static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { AddressRegs Result; @@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + Result.SAddr = true; + LLVM_FALLTHROUGH; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.SAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); if (Regs.VAddr) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); @@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { return new SILoadStoreOptimizer(); } -static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); - ++I; - for (MachineInstr *MI : InstsToMove) { - MI->removeFromParent(); - MBB->insert(I, MI); - } -} - static void addDefsUsesToList(const MachineInstr &MI, DenseSet &RegDefs, - DenseSet &PhysRegUses) { - for (const MachineOperand &Op : MI.operands()) { - if (Op.isReg()) { - if (Op.isDef()) - RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && Op.getReg().isPhysical()) - PhysRegUses.insert(Op.getReg()); - } - } -} - -static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - AliasAnalysis *AA) { - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); -} - -// Add MI and its defs to the lists if MI reads one of the defs that are -// already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, - DenseSet &PhysRegUses, - SmallVectorImpl &Insts) { - for (MachineOperand &Use : MI.operands()) { - // If one of the defs is read, then there is a use of Def between I and the - // instruction that I will potentially be merged with. We will need to move - // this instruction after the merged instructions. - // - // Similarly, if there is a def which is read by an instruction that is to - // be moved for merging, then we need to move the def-instruction as well. - // This can only happen for physical registers such as M0; virtual - // registers are in SSA form. 
- if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || - (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && Use.getReg().isPhysical() && - PhysRegUses.count(Use.getReg())))) { - Insts.push_back(&MI); - addDefsUsesToList(MI, RegDefs, PhysRegUses); - return true; - } + DenseSet &RegUses) { + for (const auto &Op : MI.operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + if (Op.readsReg()) + RegUses.insert(Op.getReg()); } - - return false; } -static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef InstsToMove, - AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); - - for (MachineInstr *InstToMove : InstsToMove) { - if (!InstToMove->mayLoadOrStore()) +bool SILoadStoreOptimizer::canSwapInstructions( + const DenseSet &ARegDefs, const DenseSet &ARegUses, + const MachineInstr &A, const MachineInstr &B) const { + if (A.mayLoadOrStore() && B.mayLoadOrStore() && + (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) + return false; + for (const auto &BOp : B.operands()) { + if (!BOp.isReg()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) + if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) + return false; + if (BOp.isDef() && ARegUses.contains(BOp.getReg())) return false; } return true; } -// This function assumes that \p A and \p B have are identical except for -// size and offset, and they reference adjacent memory. -static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, - const MachineMemOperand *A, - const MachineMemOperand *B) { - unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); - unsigned Size = A->getSize() + B->getSize(); - // This function adds the offset parameter to the existing offset for A, - // so we pass 0 here as the offset and then manually set it to the correct - // value after the call. - MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); - MMO->setOffset(MinOffset); - return MMO; +// Given that \p CI and \p Paired are adjacent memory operations produce a new +// MMO for the combined operation with a new access size. +MachineMemOperand * +SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired) { + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + unsigned Size = MMOa->getSize() + MMOb->getSize(); + + // A base pointer for the combined operation is the same as the leading + // operation's pointer. + if (Paired < CI) + std::swap(MMOa, MMOb); + + MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); + // If merging FLAT and GLOBAL set address space to FLAT. 
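// (The combined MMO is seeded from MMOa's PointerInfo after the swap above,
// so a leading FLAT access already yields a FLAT result; only a FLAT MMOb
// needs this explicit fixup.)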
+ if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; + + MachineFunction *MF = CI.I->getMF(); + return MF->getMachineMemOperand(MMOa, PtrInfo, Size); } bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, @@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); + CI.CPol == Paired.CPol; } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { return nullptr; } -/// This function assumes that CI comes before Paired in a basic block. -bool SILoadStoreOptimizer::checkAndPrepareMerge( - CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl &InstsToMove) { +/// This function assumes that CI comes before Paired in a basic block. Return +/// an insertion point for the merged instruction or nullptr on failure. +SILoadStoreOptimizer::CombineInfo * +SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, + CombineInfo &Paired) { + // If another instruction has already been merged into CI, it may now be a + // type that we can't do any further merging into. + if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) + return nullptr; + assert(CI.InstClass == Paired.InstClass); + + if (getInstSubclass(CI.I->getOpcode(), *TII) != + getInstSubclass(Paired.I->getOpcode(), *TII)) + return nullptr; // Check both offsets (or masks for MIMG) can be combined and fit in the // reduced range. - if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) - return false; - - if (CI.InstClass != MIMG && - (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) - return false; - - const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc, *TII); - - if (InstClass == UNKNOWN) { - return false; + if (CI.InstClass == MIMG) { + if (!dmasksCanBeCombined(CI, *TII, Paired)) + return nullptr; + } else { + if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) + return nullptr; } - const unsigned InstSubclass = getInstSubclass(Opc, *TII); - - DenseSet RegDefsToMove; - DenseSet PhysRegUsesToMove; - addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - - MachineBasicBlock::iterator E = std::next(Paired.I); - MachineBasicBlock::iterator MBBI = std::next(CI.I); - MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); - for (; MBBI != E; ++MBBI) { - - if (MBBI == MBBE) { - // CombineInfo::Order is a hint on the instruction ordering within the - // basic block. This hint suggests that CI precedes Paired, which is - // true most of the time. However, moveInstsAfter() processing a - // previous list may have changed this order in a situation when it - // moves an instruction which exists in some other merge list. - // In this case it must be dependent. - return false; - } - - if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || - (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { - // This is not a matching instruction, but we can keep looking as - // long as one of these conditions are met: - // 1. It is safe to move I down past MBBI. - // 2. It is safe to move MBBI down past the instruction that I will - // be merged into. 
- - if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); - continue; - } - // When we match I with another DS instruction we will be moving I down - // to the location of the matched instruction any uses of I will need to - // be moved down as well. - addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove); - continue; + DenseSet RegDefs; + DenseSet RegUses; + CombineInfo *Where; + if (CI.I->mayLoad()) { + // Try to hoist Paired up to CI. + addDefsUsesToList(*Paired.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) + return nullptr; } - - // Handle a case like - // DS_WRITE_B32 addr, v, idx0 - // w = DS_READ_B32 addr, idx0 - // DS_WRITE_B32 addr, f(w), idx1 - // where the DS_READ_B32 ends up in InstsToMove and therefore prevents - // merging of the two writes. - if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove)) - continue; - - if (&*MBBI == &*Paired.I) { - // We need to go through the list of instructions that we plan to - // move and make sure they are all safe to move down past the merged - // instruction. - if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { - - // Call offsetsCanBeCombined with modify = true so that the offsets are - // correct for the new instruction. This should return true, because - // this function should only be called on CombineInfo objects that - // have already been confirmed to be mergeable. - if (CI.InstClass != MIMG) - offsetsCanBeCombined(CI, *STM, Paired, true); - return true; - } - return false; + Where = &CI; + } else { + // Try to sink CI down to Paired. + addDefsUsesToList(*CI.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) + return nullptr; } - - // We've found a load/store that we couldn't merge for some reason. - // We could potentially keep looking, but we'd need to make sure that - // it was safe to move I and also all the instruction in InstsToMove - // down past this instruction. - // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) - break; + Where = &Paired; } - return false; + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. 
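// (Only the DS read2/write2 forms encode a pair of offsets that must be
// rewritten at this point; the other classes recompute the merged offset
// inside their merge helpers, e.g. as std::min(CI.Offset, Paired.Offset).)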
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) + offsetsCanBeCombined(CI, *STM, Paired, true); + return Where; } unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { @@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Read2 = - BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) + BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. 
We want to be @@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Write2 = - BuildMI(*MBB, Paired.I, DL, Write2Desc) + BuildMI(*MBB, InsertBefore, DL, Write2Desc) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .add(*Data0) // data0 .add(*Data1) // data1 @@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { if (I == DMaskIdx) MIB.addImm(MergedDMask); @@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - - MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); unsigned SubRegIdx0, SubRegIdx1; std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); @@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = - BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.CPol) // cpol - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.CPol) // cpol + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + std::pair SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); - moveInstsAfter(MIB, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + std::pair SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the new source register. + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); + + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addReg(SrcReg, RegState::Kill); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; - switch (CI.InstClass) { + switch (getCommonInstClass(CI, Paired)) { default: assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); // FIXME: Handle d16 correctly @@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } + case GLOBAL_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; + } + case GLOBAL_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4; + } + case GLOBAL_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; + } + case FLAT_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4; + } + 
case FLAT_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - bool ReverseOrder; - if (CI.InstClass == MIMG) { - assert( - (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && - "No overlaps"); - ReverseOrder = CI.DMask > Paired.DMask; - } else { - ReverseOrder = CI.Offset > Paired.Offset; - } + assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == + CI.Width + Paired.Width)) && + "No overlaps"); unsigned Idx0; unsigned Idx1; @@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, assert(CI.Width >= 1 && CI.Width <= 4); assert(Paired.Width >= 1 && Paired.Width <= 4); - if (ReverseOrder) { + if (Paired < CI) { Idx1 = Idxs[0][Paired.Width - 1]; Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { @@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - - moveInstsAfter(MIB, InstsToMove); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 // as the new-base(anchor) because of the maximum distance which can - // accomodate more intermediate bases presumeably. + // accommodate more intermediate bases presumably. // // Step3: move (&a + 8192) above load1. 
Compute and promote offsets from // (&a + 8192) for load1, load2, load4. @@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( CombineInfo &CI = *First; CombineInfo &Paired = *Second; - SmallVector InstsToMove; - if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + CombineInfo *Where = checkAndPrepareMerge(CI, Paired); + if (!Where) { ++I; continue; } @@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); + MachineBasicBlock::iterator NewMI; switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); break; - case DS_READ: { - MachineBasicBlock::iterator NewMI = - mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_READ: + NewMI = mergeRead2Pair(CI, Paired, Where->I); break; - } - case DS_WRITE: { - MachineBasicBlock::iterator NewMI = - mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_WRITE: + NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; - } - case S_BUFFER_LOAD_IMM: { - MachineBasicBlock::iterator NewMI = - mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 8; + case S_BUFFER_LOAD_IMM: + NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 8; break; - } - case BUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_LOAD: + NewMI = mergeBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case BUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_STORE: + NewMI = mergeBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case MIMG: { - MachineBasicBlock::iterator NewMI = - mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case MIMG: + NewMI = mergeImagePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_LOAD: + NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_STORE: + NewMI = mergeTBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_LOAD: + case GLOBAL_LOAD: + case GLOBAL_LOAD_SADDR: + NewMI = mergeFlatLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_STORE: + case GLOBAL_STORE: + case GLOBAL_STORE_SADDR: + NewMI = mergeFlatStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; } - } - CI.Order = Paired.Order; + CI.setMI(NewMI, *this); + CI.Order = Where->Order; if (I == Second) I = Next; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp 
b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index e1018bdfde46..607383ab8cde 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -509,8 +509,35 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) .addReg(Exec) .add(MI.getOperand(0)); - if (LV) - LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI); + if (LV) { + LV->replaceKillInstruction(DataReg, MI, *NewMI); + + if (SplitBB != &MBB) { + // Track the set of registers defined in the split block so we don't + // accidentally add the original block to AliveBlocks. + DenseSet SplitDefs; + for (MachineInstr &X : *SplitBB) { + for (MachineOperand &Op : X.operands()) { + if (Op.isReg() && Op.isDef() && Op.getReg().isVirtual()) + SplitDefs.insert(Op.getReg()); + } + } + + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + Register Reg = Register::index2VirtReg(i); + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + + if (VI.AliveBlocks.test(MBB.getNumber())) + VI.AliveBlocks.set(SplitBB->getNumber()); + else { + for (MachineInstr *Kill : VI.Kills) { + if (Kill->getParent() == SplitBB && !SplitDefs.contains(Reg)) + VI.AliveBlocks.set(MBB.getNumber()); + } + } + } + } + } LoweredEndCf.insert(NewMI); @@ -540,7 +567,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, return; // Make sure we do not modify exec between def and use. - // A copy with implcitly defined exec inserted earlier is an exclusion, it + // A copy with implicitly defined exec inserted earlier is an exclusion, it // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && @@ -573,14 +600,14 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { else return; Register Reg = MI.getOperand(OpToReplace).getReg(); - MI.RemoveOperand(OpToReplace); + MI.removeOperand(OpToReplace); MI.addOperand(Ops[UniqueOpndIdx]); if (MRI->use_empty(Reg)) MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } void SILowerControlFlow::optimizeEndCf() { - // If the only instruction immediately following this END_CF is an another + // If the only instruction immediately following this END_CF is another // END_CF in the only successor we can avoid emitting exec mask restore here. 
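The LiveVariables update in the emitEndCf hunk above implements a small invariant for a block split: a register live through the original block is also live through the split block, and a register killed in the split block without being redefined there must now be treated as live through the original block. A toy model of that rule, with plain containers instead of LiveVariables::VarInfo (the types and names are illustrative, not the LLVM API):

    #include <set>

    struct VarInfoModel {
      std::set<int> AliveBlocks; // numbers of blocks the register is live through
      std::set<int> KillBlocks;  // numbers of blocks containing a kill of it
    };

    void updateForBlockSplit(VarInfoModel &VI, int OrigBB, int SplitBB,
                             bool DefinedInSplit) {
      if (VI.AliveBlocks.count(OrigBB))
        VI.AliveBlocks.insert(SplitBB); // live-through extends to the new block
      else if (VI.KillBlocks.count(SplitBB) && !DefinedInSplit)
        VI.AliveBlocks.insert(OrigBB);  // killed below the split point
    }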
if (!EnableOptimizeEndCf) return; @@ -865,6 +892,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } + bool Changed = false; MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); BI != MF.end(); BI = NextBB) { @@ -886,6 +914,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: SplitMBB = process(MI); + Changed = true; break; // FIXME: find a better place for this @@ -894,6 +923,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { lowerInitExec(MBB, MI); if (LIS) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + Changed = true; break; default: @@ -913,5 +943,5 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LoweredIf.clear(); KillBlocks.clear(); - return true; + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 672266f0c11e..5fb545b50228 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -79,9 +79,9 @@ public: } private: - void lowerCopiesFromI1(); - void lowerPhis(); - void lowerCopiesToI1(); + bool lowerCopiesFromI1(); + bool lowerPhis(); + bool lowerCopiesToI1(); bool isConstantLaneMask(Register Reg, bool &Val) const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, @@ -473,15 +473,17 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { OrN2Op = AMDGPU::S_ORN2_B64; } - lowerCopiesFromI1(); - lowerPhis(); - lowerCopiesToI1(); + bool Changed = false; + Changed |= lowerCopiesFromI1(); + Changed |= lowerPhis(); + Changed |= lowerCopiesToI1(); + assert(Changed || ConstrainRegs.empty()); for (unsigned Reg : ConstrainRegs) MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); ConstrainRegs.clear(); - return true; + return Changed; } #ifndef NDEBUG @@ -493,7 +495,8 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, } #endif -void SILowerI1Copies::lowerCopiesFromI1() { +bool SILowerI1Copies::lowerCopiesFromI1() { + bool Changed = false; SmallVector DeadCopies; for (MachineBasicBlock &MBB : *MF) { @@ -509,6 +512,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) continue; + Changed = true; + // Copy into a 32-bit vector register. 
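The conversions in these hunks all have the same shape: each lowering helper now reports whether it changed anything, and runOnMachineFunction returns the OR of those results instead of an unconditional true. Schematically, with stub helpers standing in for the real ones:

    static bool lowerCopiesFromI1() { return false; } // stubs for illustration
    static bool lowerPhis() { return false; }
    static bool lowerCopiesToI1() { return false; }

    static bool runOnFunctionModel() {
      bool Changed = false;
      Changed |= lowerCopiesFromI1();
      Changed |= lowerPhis();
      Changed |= lowerCopiesToI1();
      return Changed; // previously the pass returned true unconditionally
    }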
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); @@ -530,9 +535,10 @@ void SILowerI1Copies::lowerCopiesFromI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } -void SILowerI1Copies::lowerPhis() { +bool SILowerI1Copies::lowerPhis() { MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); PhiIncomingAnalysis PIA(*PDT); @@ -550,6 +556,8 @@ void SILowerI1Copies::lowerPhis() { Vreg1Phis.push_back(&MI); } } + if (Vreg1Phis.empty()) + return false; MachineBasicBlock *PrevMBB = nullptr; for (MachineInstr *MI : Vreg1Phis) { @@ -662,9 +670,11 @@ void SILowerI1Copies::lowerPhis() { IncomingRegs.clear(); IncomingUpdated.clear(); } + return true; } -void SILowerI1Copies::lowerCopiesToI1() { +bool SILowerI1Copies::lowerCopiesToI1() { + bool Changed = false; MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); SmallVector DeadCopies; @@ -681,6 +691,8 @@ void SILowerI1Copies::lowerCopiesToI1() { if (!isVreg1(DstReg)) continue; + Changed = true; + if (MRI->use_empty(DstReg)) { DeadCopies.push_back(&MI); continue; @@ -731,6 +743,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 0fbdbef6fcce..dd881ec42d53 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -79,6 +80,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { @@ -89,8 +92,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, MVT::i32); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens @@ -119,7 +122,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); // Restore all registers immediately before the return and any // terminators that precede it. 
MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); @@ -128,8 +132,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { Register Reg = CI.getReg(); - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, MVT::i32); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); assert(I != RestoreBlock.begin() && @@ -321,7 +325,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // free frame index ids by the later pass(es) like "stack slot coloring" // which in turn could mess up the bookkeeping of "frame index to VGPR // lane". - FuncInfo->removeDeadFrameIndices(MFI); + FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); MadeChange = true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index cca8565c9ff9..0504c59ebd9e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -31,6 +31,9 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + BufferPSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), + ImagePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), + GWSResourcePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -48,8 +51,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0), - GDSSize(0) { + HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -74,6 +76,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } + MayNeedAGPRs = ST.hasMAIInsts(); + if (!isEntryFunction()) { if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; @@ -97,6 +101,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitArgPtr = false; MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign); + + if (ST.hasGFX90AInsts() && + ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(MF)) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. } bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); @@ -177,9 +186,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); - S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); - if (!S.empty()) - S.consumeInteger(0, GDSSize); + // On GFX908, in order to guarantee copying between AGPRs, we need a scratch + // VGPR available at all times. For now, reserve the highest available VGPR. + // After RA, shift it to the lowest available unused VGPR if one exists.
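The reservation described by the comment above (and implemented by the hunk that follows) is simple index arithmetic: with MaxNumVGPRs registers VGPR0..VGPR(MaxNumVGPRs-1) available, GFX908 (MAI without the GFX90A instructions) pins the top one as the AGPR-copy scratch register. A one-line sketch, with an assumed register count:

    // Index of the highest addressable VGPR, e.g. 256 VGPRs -> VGPR255.
    constexpr unsigned highestVGPR(unsigned MaxNumVGPRs) {
      return MaxNumVGPRs - 1;
    }
    static_assert(highestVGPR(256) == 255, "GFX908 reserves the top VGPR");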
+ if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + VGPRForAGPRCopy = + AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1); + } +} + +MachineFunctionInfo *SIMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); } void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { @@ -265,7 +285,7 @@ bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI) { - std::vector &SpillLanes = SGPRToVGPRSpills[FI]; + std::vector &SpillLanes = SGPRToVGPRSpills[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -320,7 +340,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI)); - // Add this register as live-in to all blocks to avoid machine verifer + // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. for (MachineBasicBlock &BB : MF) BB.addLiveIn(LaneVGPR); @@ -328,7 +348,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, LaneVGPR = SpillVGPRs.back().VGPR; } - SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); + SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex)); } return true; @@ -402,7 +422,8 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, return Spill.FullyAllocated; } -void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { +bool SIMachineFunctionInfo::removeDeadFrameIndices( + MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { // Remove dead frame indices from function frame, however keep FP & BP since // spills for them haven't been inserted yet. And also make sure to remove the // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could @@ -415,17 +436,42 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { } } - // All other SPGRs must be allocated on the default stack, so reset the stack - // ID. - for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; - ++i) - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) - MFI.setStackID(i, TargetStackID::Default); + bool HaveSGPRToMemory = false; + + if (ResetSGPRSpillStackIDs) { + // All other SPGRs must be allocated on the default stack, so reset the + // stack ID. 
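removeDeadFrameIndices now resets SGPR-spill stack IDs only when the caller asks for it, and additionally reports whether any SGPR spill ended up going to memory. A small model of the loop that follows, using plain vectors and illustrative field names in place of MachineFrameInfo:

    #include <vector>

    enum class StackID { Default, SGPRSpill };

    struct FrameObject { StackID ID; bool IsFPOrBPSave; };

    // Returns true if at least one SGPR spill now has to go to memory.
    bool resetSGPRSpillStackIDs(std::vector<FrameObject> &Objects) {
      bool HaveSGPRToMemory = false;
      for (FrameObject &O : Objects) {
        if (!O.IsFPOrBPSave && O.ID == StackID::SGPRSpill) {
          O.ID = StackID::Default; // allocate on the default stack
          HaveSGPRToMemory = true;
        }
      }
      return HaveSGPRToMemory;
    }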
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; + ++i) { + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) { + if (MFI.getStackID(i) == TargetStackID::SGPRSpill) { + MFI.setStackID(i, TargetStackID::Default); + HaveSGPRToMemory = true; + } + } + } + } for (auto &R : VGPRToAGPRSpills) { if (R.second.IsDead) MFI.RemoveStackObject(R.first); } + + return HaveSGPRToMemory; +} + +void SIMachineFunctionInfo::allocateWWMReservedSpillSlots( + MachineFrameInfo &MFI, const SIRegisterInfo &TRI) { + assert(WWMReservedFrameIndexes.empty()); + + WWMReservedFrameIndexes.resize(WWMReservedRegs.size()); + + int I = 0; + for (Register VGPR : WWMReservedRegs) { + const TargetRegisterClass *RC = TRI.getPhysRegClass(VGPR); + WWMReservedFrameIndexes[I++] = MFI.CreateSpillStackObject( + TRI.getSpillSize(*RC), TRI.getSpillAlign(*RC)); + } } int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, @@ -539,6 +585,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( const llvm::MachineFunction &MF) : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), + GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()), NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), @@ -549,7 +596,14 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), + BytesInStackArgArea(MFI.getBytesInStackArgArea()), + ReturnsVoid(MFI.returnsVoid()), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) { + for (Register Reg : MFI.WWMReservedRegs) + WWMReservedRegs.push_back(regToString(Reg, TRI)); + + if (MFI.getVGPRForAGPRCopy()) + VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); @@ -563,8 +617,9 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; - MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); + MaxKernArgAlign = YamlMFI.MaxKernArgAlign; LDSSize = YamlMFI.LDSSize; + GDSSize = YamlMFI.GDSSize; DynLDSAlign = YamlMFI.DynLDSAlign; HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; @@ -574,6 +629,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + BytesInStackArgArea = YamlMFI.BytesInStackArgArea; + ReturnsVoid = YamlMFI.ReturnsVoid; if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); @@ -595,10 +652,47 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( return false; } +bool SIMachineFunctionInfo::mayUseAGPRs(const MachineFunction &MF) const { + for (const BasicBlock &BB : MF.getFunction()) { + for (const Instruction &I : BB) { + const auto *CB = dyn_cast<CallBase>(&I); + if (!CB) + continue; + + if (CB->isInlineAsm()) { + const InlineAsm *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()); + for (const auto &CI : IA->ParseConstraints()) { + for (StringRef Code : CI.Codes) { + Code.consume_front("{"); + if (Code.startswith("a")) + return true; + } + } + continue; + } + + const Function *Callee = + dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); + if (!Callee) + return true; + + if (Callee->getIntrinsicID() == Intrinsic::not_intrinsic) + return true; + } + } + + return false; +} + bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { if (UsesAGPRs) return *UsesAGPRs; + if (!mayNeedAGPRs()) { + UsesAGPRs = false; + return false; + } + if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) || MF.getFrameInfo().hasCalls()) { UsesAGPRs = true; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 8e821274bb77..bebb13cbf09f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -15,9 +15,10 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" -#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -39,8 +40,8 @@ public: }; protected: - AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII) - : PseudoSourceValue(Kind, TII) {} + AMDGPUPseudoSourceValue(unsigned Kind, const AMDGPUTargetMachine &TM) + : PseudoSourceValue(Kind, TM) {} public: bool isConstant(const MachineFrameInfo *) const override { @@ -60,8 +61,8 @@ public: class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(PSVBuffer, TII) {} + explicit AMDGPUBufferPseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(PSVBuffer, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == PSVBuffer; @@ -73,8 +74,8 @@ public: class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue { public: // TODO: Is the img rsrc useful? - explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(PSVImage, TII) {} + explicit AMDGPUImagePseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(PSVImage, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == PSVImage; @@ -85,8 +86,8 @@ public: class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(GWSResource, TII) {} + explicit AMDGPUGWSResourcePseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(GWSResource, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == GWSResource; @@ -269,8 +270,9 @@ template <> struct MappingTraits<SIMode> { struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint64_t ExplicitKernArgSize = 0; - unsigned MaxKernArgAlign = 0; - unsigned LDSSize = 0; + Align MaxKernArgAlign; + uint32_t LDSSize = 0; + uint32_t GDSSize = 0; Align DynLDSAlign; bool IsEntryFunction = false; bool NoSignedZerosFPMath = false; @@ -283,13 +285,19 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { // TODO: 10 may be a better default since it's the maximum.
unsigned Occupancy = 0; + SmallVector WWMReservedRegs; + StringValue ScratchRSrcReg = "$private_rsrc_reg"; StringValue FrameOffsetReg = "$fp_reg"; StringValue StackPtrOffsetReg = "$sp_reg"; + unsigned BytesInStackArgArea = 0; + bool ReturnsVoid = true; + Optional ArgInfo; SIMode Mode; Optional ScavengeFI; + StringValue VGPRForAGPRCopy; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -304,8 +312,9 @@ template <> struct MappingTraits { static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) { YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize, UINT64_C(0)); - YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u); + YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign); YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u); + YamlIO.mapOptional("gdsSize", MFI.GDSSize, 0u); YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align()); YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); @@ -319,12 +328,17 @@ template <> struct MappingTraits { StringValue("$fp_reg")); YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, StringValue("$sp_reg")); + YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u); + YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); YamlIO.mapOptional("highBitsOf32BitAddress", MFI.HighBitsOf32BitAddress, 0u); YamlIO.mapOptional("occupancy", MFI.Occupancy, 0); + YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs); YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); + YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, + StringValue()); // Don't print out when it's empty. } }; @@ -335,8 +349,6 @@ template <> struct MappingTraits { class SIMachineFunctionInfo final : public AMDGPUMachineFunction { friend class GCNTargetMachine; - Register TIDReg = AMDGPU::NoRegister; - // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; @@ -377,12 +389,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // unit. Minimum - first, maximum - second. std::pair WavesPerEU = {0, 0}; - std::unique_ptr BufferPSV; - std::unique_ptr ImagePSV; - std::unique_ptr GWSResourcePSV; + const AMDGPUBufferPseudoSourceValue BufferPSV; + const AMDGPUImagePseudoSourceValue ImagePSV; + const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV; private: - unsigned LDSWaveSpillSize = 0; unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; @@ -422,13 +433,14 @@ private: // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; + bool MayNeedAGPRs : 1; + // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since // current hardware only allows a 16 bit value. unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; - unsigned GDSSize; // Current recorded maximum possible occupancy. 
unsigned Occupancy; @@ -440,17 +452,6 @@ private: MCPhysReg getNextSystemSGPR() const; public: - struct SpilledReg { - Register VGPR; - int Lane = -1; - - SpilledReg() = default; - SpilledReg(Register R, int L) : VGPR (R), Lane (L) {} - - bool hasLane() { return Lane != -1;} - bool hasReg() { return VGPR != 0;} - }; - struct SGPRSpillVGPR { // VGPR used for SGPR spills Register VGPR; @@ -468,14 +469,28 @@ public: bool IsDead = false; }; - // Map WWM VGPR to a stack slot that is used to save/restore it in the - // prolog/epilog. - MapVector> WWMReservedRegs; + // Track VGPRs reserved for WWM. + SmallSetVector WWMReservedRegs; + + /// Track stack slots used for save/restore of reserved WWM VGPRs in the + /// prolog/epilog. + + /// FIXME: This is temporary state only needed in PrologEpilogInserter, and + /// doesn't really belong here. It does not require serialization + SmallVector WWMReservedFrameIndexes; + + void allocateWWMReservedSpillSlots(MachineFrameInfo &MFI, + const SIRegisterInfo &TRI); + + auto wwmAllocation() const { + assert(WWMReservedRegs.size() == WWMReservedFrameIndexes.size()); + return zip(WWMReservedRegs, WWMReservedFrameIndexes); + } private: // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. - DenseMap> SGPRToVGPRSpills; + DenseMap> SGPRToVGPRSpills; unsigned NumVGPRSpillLanes = 0; SmallVector SpillVGPRs; @@ -491,6 +506,18 @@ private: // frame, so save it here and add it to the RegScavenger later. Optional ScavengeFI; +private: + Register VGPRForAGPRCopy; + +public: + Register getVGPRForAGPRCopy() const { + return VGPRForAGPRCopy; + } + + void setVGPRForAGPRCopy(Register NewVGPRForAGPRCopy) { + VGPRForAGPRCopy = NewVGPRForAGPRCopy; + } + public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. @@ -506,31 +533,32 @@ public: // FIXME public: SIMachineFunctionInfo(const MachineFunction &MF); + SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default; + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange); - void reserveWWMRegister(Register Reg, Optional FI) { - WWMReservedRegs.insert(std::make_pair(Reg, FI)); + void reserveWWMRegister(Register Reg) { + WWMReservedRegs.insert(Reg); } - ArrayRef getSGPRToVGPRSpills(int FrameIndex) const { + ArrayRef + getSGPRToVGPRSpills(int FrameIndex) const { auto I = SGPRToVGPRSpills.find(FrameIndex); - return (I == SGPRToVGPRSpills.end()) ? - ArrayRef() : makeArrayRef(I->second); + return (I == SGPRToVGPRSpills.end()) + ? ArrayRef() + : makeArrayRef(I->second); } ArrayRef getSGPRSpillVGPRs() const { return SpillVGPRs; } - void setSGPRSpillVGPRs(Register NewVGPR, Optional newFI, int Index) { - SpillVGPRs[Index].VGPR = NewVGPR; - SpillVGPRs[Index].FI = newFI; - } - - bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF); - ArrayRef getAGPRSpillVGPRs() const { return SpillAGPR; } @@ -555,15 +583,15 @@ public: unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); - void removeDeadFrameIndices(MachineFrameInfo &MFI); + + /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill + /// to the default stack. 
+ bool removeDeadFrameIndices(MachineFrameInfo &MFI, + bool ResetSGPRSpillStackIDs); int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI); Optional getOptionalScavengeFI() const { return ScavengeFI; } - bool hasCalculatedTID() const { return TIDReg != 0; }; - Register getTIDReg() const { return TIDReg; }; - void setTIDReg(Register Reg) { TIDReg = Reg; } - unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } @@ -581,6 +609,13 @@ public: Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); + /// Increment user SGPRs used for padding the argument list only. + Register addReservedUserSGPR() { + Register Next = getNextUserSGPR(); + ++NumUserSGPRs; + return Next; + } + // Add system SGPRs. Register addWorkGroupIDX() { ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); @@ -722,10 +757,6 @@ public: return HighBitsOf32BitAddress; } - unsigned getGDSSize() const { - return GDSSize; - } - unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -903,31 +934,19 @@ public: llvm_unreachable("unexpected dimension"); } - unsigned getLDSWaveSpillSize() const { - return LDSWaveSpillSize; + const AMDGPUBufferPseudoSourceValue * + getBufferPSV(const AMDGPUTargetMachine &TM) { + return &BufferPSV; } - const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII) { - if (!BufferPSV) - BufferPSV = std::make_unique(TII); - - return BufferPSV.get(); - } - - const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII) { - if (!ImagePSV) - ImagePSV = std::make_unique(TII); - - return ImagePSV.get(); + const AMDGPUImagePseudoSourceValue * + getImagePSV(const AMDGPUTargetMachine &TM) { + return &ImagePSV; } - const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { - if (!GWSResourcePSV) { - GWSResourcePSV = - std::make_unique(TII); - } - - return GWSResourcePSV.get(); + const AMDGPUGWSResourcePseudoSourceValue * + getGWSPSV(const AMDGPUTargetMachine &TM) { + return &GWSResourcePSV; } unsigned getOccupancy() const { @@ -953,6 +972,14 @@ public: limitOccupancy(MF); } + bool mayNeedAGPRs() const { + return MayNeedAGPRs; + } + + // \returns true if a function has a use of AGPRs via inline asm or + // has a call which may use it. + bool mayUseAGPRs(const MachineFunction &MF) const; + // \returns true if a function needs or may need AGPRs. bool usesAGPRs(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 81db66a98ddf..e426e938b856 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -64,7 +64,7 @@ using namespace llvm; // First the instructions are put into blocks. // We want the blocks help control register usage and hide high latencies // later. To help control register usage, we typically want all local -// computations, when for example you create a result that can be comsummed +// computations, when for example you create a result that can be consumed // right away, to be contained in a block. Block inputs and outputs would // typically be important results that are needed in several locations of // the shader. 
Since we do want blocks to help hide high latencies, we want @@ -90,8 +90,8 @@ using namespace llvm; // Increasing the number of active wavefronts helps hide the former, but it // doesn't solve the latter, thus why even if wavefront count is high, we have // to try have as many instructions hiding high latencies as possible. -// The OpenCL doc says for example latency of 400 cycles for a global mem access, -// which is hidden by 10 instructions if the wavefront count is 10. +// The OpenCL doc says for example latency of 400 cycles for a global mem +// access, which is hidden by 10 instructions if the wavefront count is 10. // Some figures taken from AMD docs: // Both texture and constant L1 caches are 4-way associative with 64 bytes @@ -353,7 +353,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // able to correctly handle 5 vs 6, 2 vs 3. // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 - // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 + // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { Register Reg = RegMaskPair.RegUnit; @@ -402,7 +402,7 @@ void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, nodeScheduled(SU); } - // TODO: compute InternalAdditionnalPressure. + // TODO: compute InternalAdditionalPressure. InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size()); // Check everything is right. @@ -696,7 +696,7 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { bool HasSubGraph; std::vector SubGraph; // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary // in the parent graph of SU. #ifndef NDEBUG SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], @@ -1123,36 +1123,26 @@ void SIScheduleBlockCreator::colorExports() { for (unsigned SUNum : DAG->TopDownIndex2SU) { const SUnit &SU = DAG->SUnits[SUNum]; if (SIInstrInfo::isEXP(*SU.getInstr())) { - // Check the EXP can be added to the group safely, - // ie without needing any other instruction. - // The EXP is allowed to depend on other EXP - // (they will be in the same group). - for (unsigned j : ExpGroup) { - bool HasSubGraph; - std::vector SubGraph; - // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary - // in the parent graph of SU. -#ifndef NDEBUG - SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], - HasSubGraph); - assert(!HasSubGraph); -#endif - SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, - HasSubGraph); - if (!HasSubGraph) - continue; // No dependencies between each other - - // SubGraph contains all the instructions required - // between EXP SUnits[j] and EXP SU. - for (unsigned k : SubGraph) { - if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) - // Other instructions than EXP would be required in the group. - // Abort the groupping. - return; + // SU is an export instruction. Check whether one of its successor + // dependencies is a non-export, in which case we skip export grouping. + for (const SDep &SuccDep : SU.Succs) { + const SUnit *SuccSU = SuccDep.getSUnit(); + if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) { + // Ignore these dependencies. 
+ continue; + } + assert(SuccSU->isInstr() && + "SUnit unexpectedly not representing an instruction!"); + + if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) { + // A non-export depends on us. Skip export grouping. + // Note that this is a bit pessimistic: We could still group all other + // exports that are not depended on by non-exports, directly or + // indirectly. Simply skipping this particular export but grouping all + // others would not account for indirect dependencies. + return; } } - ExpGroup.push_back(SUNum); } } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index fff4f6729c99..8a66213931ff 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/TargetParser.h" @@ -63,7 +64,7 @@ enum class SIAtomicScope { }; /// The distinct address spaces supported by the AMDGPU target for -/// atomic memory operation. Can be ORed toether. +/// atomic memory operation. Can be ORed together. enum class SIAtomicAddrSpace { NONE = 0u, GLOBAL = 1u << 0, @@ -459,6 +460,56 @@ public: Position Pos) const override; }; +class SIGfx940CacheControl : public SIGfx90ACacheControl { +protected: + + /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC0); + } + + /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC1); + } + + /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
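Each enable*Bit helper declared here is a thin wrapper over one operation: OR a named cache-policy bit into the instruction's cpol immediate and report whether that changed anything. A sketch over a plain integer operand, with placeholder bit values rather than the real CPol encoding:

    #include <cstdint>

    // Set Bit in the cache-policy word; returns true iff the word changed,
    // mirroring the "returns true if MI is modified" contract above.
    bool enableBit(uint32_t &CPol, uint32_t Bit) {
      if (CPol & Bit)
        return false;
      CPol |= Bit;
      return true;
    }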
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::NT); + } + +public: + + SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; + + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIGfx10CacheControl : public SIGfx7CacheControl { protected: @@ -494,6 +545,20 @@ public: Position Pos) const override; }; +class SIGfx11CacheControl : public SIGfx10CacheControl { +public: + SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -649,7 +714,7 @@ Optional SIMemOpAccess::constructFromMIWithMMO( return None; } - SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); + SSID = *IsSyncScopeInclusion ? 
SSID : MMO->getSyncScopeID(); Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); assert(MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); @@ -668,7 +733,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( return None; } std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = - ScopeOrNone.getValue(); + *ScopeOrNone; if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { @@ -730,7 +795,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = - ScopeOrNone.getValue(); + *ScopeOrNone; if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { @@ -775,13 +840,17 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); + if (ST.hasGFX940Insts()) + return std::make_unique<SIGfx940CacheControl>(ST); if (ST.hasGFX90AInsts()) return std::make_unique<SIGfx90ACacheControl>(ST); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) return std::make_unique<SIGfx7CacheControl>(ST); - return std::make_unique<SIGfx10CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX11) + return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx11CacheControl>(ST); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -943,7 +1012,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: // The LDS keeps all memory operations in order for - // the same wavesfront. + // the same wavefront. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1360,7 +1429,9 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, // to initiate writeback of any dirty cache lines of earlier writes by the // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT // vmcnt(0)" needed by the "BUFFER_WBL2". Changed = true; @@ -1386,6 +1457,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx940CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore we need to bypass the L1, which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+      // bits to indicate work-group scope will do this automatically.
+      Changed |= enableSC0Bit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Leave SC bits unset to indicate wavefront scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+  assert(!MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC bits to indicate system scope.
+      Changed |= enableSC0Bit(MI);
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::AGENT:
+      // Set SC bits to indicate agent scope.
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // Set SC bits to indicate workgroup scope.
+      Changed |= enableSC0Bit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Leave SC bits unset to indicate wavefront scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC1 bit to indicate system scope.
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+      // to indicate system or agent scope. The SC0 bit is used to indicate if
+      // they are return or no-return. Leave SC1 bit unset to indicate agent
+      // scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+  // Only handle load and store, not atomic read-modify-write instructions. The
+  // latter use glc to indicate if the atomic returns a result and so must not
+  // be used for cache control.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so we cannot
+  // sensibly handle them here without pessimizing all atomics. Also they do
+  // not support the nontemporal attribute.
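// [Editor's note: illustrative sketch, not part of the imported patch. The
// GFX940 SC0/SC1 cache-policy encoding applied by the three bypass hooks
// above can be summarized in one table; "scBitsForScope" is a hypothetical
// helper name used only for this sketch.]
//
//   static unsigned scBitsForScope(SIAtomicScope Scope) {
//     switch (Scope) {
//     case SIAtomicScope::SYSTEM:    // bypass caches up to system scope
//       return AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1;
//     case SIAtomicScope::AGENT:     // bypass caches up to agent scope
//       return AMDGPU::CPol::SC1;
//     case SIAtomicScope::WORKGROUP: // bypass the per-CU L1 (TgSplit mode)
//       return AMDGPU::CPol::SC0;
//     default:                       // wavefront/single thread: leave unset
//       return 0;
//     }
//   }
//
// RMW atomics are the exception: they only use SC1 (agent vs. system scope),
// since SC0 on them encodes whether the atomic returns a result.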
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    // Set SC bits to indicate system scope.
+    Changed |= enableSC0Bit(MI);
+    Changed |= enableSC1Bit(MI);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+
+    return Changed;
+  }
+
+  if (IsNonTemporal) {
+    Changed |= enableNTBit(MI);
+    return Changed;
+  }
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                         SIAtomicScope Scope,
+                                         SIAtomicAddrSpace AddrSpace,
+                                         Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Ensures that following loads will not see stale remote VMEM data or
+      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+      // CC will never be stale due to the local memory probes.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+          // Set SC bits to indicate system scope.
+          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+      // hardware does not reorder memory operations by the same wave with
+      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+      // remove any cache lines of earlier writes by the same wave and ensures
+      // later reads by the same wave will refetch the cache lines.
+      Changed = true;
+      break;
+    case SIAtomicScope::AGENT:
+      // Ensures that following loads will not see stale remote data or local
+      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+      // due to the memory probes.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+          // Set SC bits to indicate agent scope.
+          .addImm(AMDGPU::CPol::SC1);
+      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+      // does not reorder memory operations with respect to a preceding buffer
+      // invalidate. The invalidate is guaranteed to remove any cache lines of
+      // earlier writes and ensures later writes will refetch the cache lines.
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In threadgroup split mode the waves of a work-group can be executing on
+      // different CUs. Therefore need to invalidate the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be invalidated.
+      if (ST.isTgSplitEnabled()) {
+        // Ensures L1 is invalidated if in threadgroup split mode. In
+        // non-threadgroup split mode it is a NOP, but there is no point
+        // generating it in that case if we know we are not in that mode.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+            // Set SC bits to indicate work-group scope.
+            .addImm(AMDGPU::CPol::SC0);
+        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+        // does not reorder memory operations with respect to a preceding buffer
+        // invalidate.
The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later writes will refetch the cache lines. + Changed = true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Could generate "BUFFER_INV" but it would do nothing as there are no + // caches to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + +bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by the + // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::SYSTEM, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". + Changed = true; + break; + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::AGENT, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)". + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Do not generate "BUFFER_WBL2" as there are no caches it would + // writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (Pos == Position::AFTER) + --MI; + + // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other + // S_WAITCNT needed. + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos); + + return Changed; +} + bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -1547,7 +1920,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: // The LDS keeps all memory operations in order for - // the same wavesfront. + // the same wavefront. 
       break;
     default:
       llvm_unreachable("Unsupported synchronization scope");
@@ -1655,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx11CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // Set the L0 and L1 cache policies to MISS_EVICT.
+      // Note: there is no L2 cache coherent bypass control at the ISA level.
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+      // CU mode all waves of a work-group are on the same CU, and so the L0
+      // does not need to be bypassed.
+      if (!ST.isCuModeEnabled())
+        Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+
+  // Only handle load and store, not atomic read-modify-write instructions. The
+  // latter use glc to indicate if the atomic returns a result and so must not
+  // be used for cache control.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so we cannot
+  // sensibly handle them here without pessimizing all atomics. Also they do
+  // not support the nontemporal attribute.
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
+    // and MISS_LRU for store instructions.
+    // Note: there is no L2 cache coherent bypass control at the ISA level.
+    if (Op == SIMemOp::LOAD)
+      Changed |= enableGLCBit(MI);
+
+    // Set MALL NOALLOC for load and store instructions.
+    Changed |= enableDLCBit(MI);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+    return Changed;
+  }
+
+  if (IsNonTemporal) {
+    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
+    // and L2 cache policy to STREAM.
+    // For stores setting both GLC and SLC configures L0 and L1 cache policy
+    // to MISS_EVICT and the L2 cache policy to STREAM.
+    if (Op == SIMemOp::STORE)
+      Changed |= enableGLCBit(MI);
+    Changed |= enableSLCBit(MI);
+
+    // Set MALL NOALLOC for load and store instructions.
+    Changed |= enableDLCBit(MI);
+    return Changed;
+  }
+
+  return Changed;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 24a8879b5684..a5816e2e8c73 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include <queue>
 
 #define DEBUG_TYPE "si-mode-register"
@@ -162,7 +163,9 @@ FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
 // double precision setting.
 Status SIModeRegister::getInstructionMode(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
-  if (TII->usesFPDPRounding(MI)) {
+  if (TII->usesFPDPRounding(MI) ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
     switch (MI.getOpcode()) {
     case AMDGPU::V_INTERP_P1LL_F16:
     case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +173,18 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI,
       // f16 interpolation instructions need double precision round to zero
       return Status(FP_ROUND_MODE_DP(3),
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    case AMDGPU::FPTRUNC_UPWARD_PSEUDO: {
+      // Replacing the pseudo by a real instruction
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+    }
+    case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+      // Replacing the pseudo by a real instruction
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
+    }
     default:
       return DefaultStatus;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index b9c839fe28ba..5215397d5936 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
 
@@ -292,6 +293,210 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
   return false;
 }
 
+// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
+// the beginning of the BB is reached or Pred evaluates to true - which can be
+// an arbitrary condition based on the current MachineInstr, for instance a
+// target instruction. Breaks prematurely by returning nullptr if one of the
+// registers given in NonModifiableRegs is modified by the current instruction.
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+                   std::function<bool(MachineInstr *)> Pred,
+                   ArrayRef<MCRegister> NonModifiableRegs,
+                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    if (A->isDebugInstr())
+      continue;
+
+    if (Pred(&*A))
+      return &*A;
+
+    for (MCRegister Reg : NonModifiableRegs) {
+      if (A->modifiesRegister(Reg, TRI))
+        return nullptr;
+    }
+
+    ++CurrentIteration;
+  }
+
+  return nullptr;
+}
+
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..Start].
+// It does so by backwards calculating liveness from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// After liveness is calculated, we can determine if Reg is still in use and not
+// defined in between the instructions.
+static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+                                   MCRegister Reg, const SIRegisterInfo *TRI,
+                                   MachineRegisterInfo &MRI,
+                                   bool useLiveOuts = false,
+                                   bool ignoreStart = false) {
+  LivePhysRegs LR(*TRI);
+  if (useLiveOuts)
+    LR.addLiveOuts(*Stop.getParent());
+
+  MachineBasicBlock::reverse_iterator A(Start);
+  MachineBasicBlock::reverse_iterator E(Stop);
+
+  if (ignoreStart)
+    ++A;
+
+  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
+    LR.stepBackward(*A);
+  }
+
+  return !LR.available(MRI, Reg);
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+                                 const SIRegisterInfo *TRI,
+                                 MachineRegisterInfo &MRI) {
+  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
+                                MRI, true);
+}
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
+// to the v_cmp instruction if it is safe to replace the sequence (see the
+// conditions in the function body). This is after register allocation, so some
+// checks on operand dependencies need to be considered.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in between.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()}, TRI);
+
+  if (!VCmp)
+    return nullptr;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+      SaveExec.modifiesRegister(Src0->getReg(), TRI))
+    return nullptr;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+      SaveExec.modifiesRegister(Src1->getReg(), TRI))
+    return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB Live-outs, meaning it's used in any of its successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return nullptr;
+
+  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+  // s_and_saveexec, skip the optimization.
+  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
+                             false, true) ||
+      isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+    return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs, TRI))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+  if (!SaveExecInstr.uses().empty()) {
+    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+  }
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add dummy src and clamp modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  auto TryAddImmediateValueFromNamedOperand =
+      [&](unsigned OperandName) -> void {
+    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+      Builder.addImm(Mod->getImm());
+  };
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+  Builder.add(*Src0);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+  Builder.add(*Src1);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+  // The kill flags may no longer be correct.
+  if (Src0->isReg())
+    MRI.clearKillFlags(Src0->getReg());
+  if (Src1->isReg())
+    MRI.clearKillFlags(Src1->getReg());
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -299,6 +504,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -312,6 +518,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   //     x = s_<op>_saveexec_b64 y
   //
 
+  bool Changed = false;
   for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
     MachineBasicBlock::reverse_iterator E = MBB.rend();
@@ -351,6 +558,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
         LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
 
         CopyToExecInst->eraseFromParent();
+        Changed = true;
       }
 
       continue;
@@ -456,8 +664,49 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
       OtherInst->substituteRegister(CopyToExec, Exec,
                                     AMDGPU::NoSubRegister, *TRI);
     }
+
+    Changed = true;
   }
 
-  return true;
+  // After all s_op_saveexec instructions are inserted,
+  // replace (on GFX10.3 and later)
+  // v_cmp_* SGPR, IMM, VGPR
+  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+  // v_cmpx_* IMM, VGPR
+  // to reduce pipeline stalls.
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Record relevant v_cmp / s_and_saveexec instruction pairs for
+        // replacement.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+          SaveExecVCmpMapping[&MI] = VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+                                       TRI, *MRI)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+
+        Changed = true;
+      }
    }
+  }
+  return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 5f89f3826683..e5e65a8dbbf1 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -39,7 +39,7 @@ private:
   MCRegister CondReg;
   MCRegister ExecReg;
 
-  Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+  bool optimizeVcndVcmpPair(MachineBasicBlock &MBB);
   bool optimizeElseBranch(MachineBasicBlock &MBB);
 
 public:
@@ -90,8 +90,8 @@ static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
 static bool isDefBetween(const SIRegisterInfo &TRI,
                          LiveIntervals *LIS, Register Reg,
                          const MachineInstr &Sel, const MachineInstr &And) {
-  SlotIndex AndIdx = LIS->getInstructionIndex(And);
-  SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+  SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot();
+  SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot();
   if (Reg.isVirtual())
     return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
 
@@ -119,21 +119,20 @@ static bool isDefBetween(const SIRegisterInfo &TRI,
 // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
 // lanes.
 //
-// Returns %cc register on success.
-Register
-SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
+// Returns true on success.
+bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return Opc == AMDGPU::S_CBRANCH_VCCZ || Opc == AMDGPU::S_CBRANCH_VCCNZ; }); if (I == MBB.terminators().end()) - return Register(); + return false; auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS); if (!And || And->getOpcode() != AndOpc || !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) - return Register(); + return false; MachineOperand *AndCC = &And->getOperand(1); Register CmpReg = AndCC->getReg(); @@ -143,49 +142,49 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { CmpReg = AndCC->getReg(); CmpSubReg = AndCC->getSubReg(); } else if (And->getOperand(2).getReg() != Register(ExecReg)) { - return Register(); + return false; } auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS); if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 || Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) || Cmp->getParent() != And->getParent()) - return Register(); + return false; MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0); MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1); if (Op1->isImm() && Op2->isReg()) std::swap(Op1, Op2); if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) - return Register(); + return false; Register SelReg = Op1->getReg(); auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) - return Register(); + return false; if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) || TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers)) - return Register(); + return false; Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() || Op1->getImm() != 0 || Op2->getImm() != 1) - return Register(); + return false; Register CCReg = CC->getReg(); // If there was a def between the select and the and, we would need to move it // to fold this. if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And)) - return Register(); + return false; + // TODO: Guard against implicit def operands? LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' << *And); - LIS->RemoveMachineInstrFromMaps(*And); MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), And->getOperand(0).getReg()) @@ -196,34 +195,92 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { MachineOperand &Andn2SCC = Andn2->getOperand(3); assert(Andn2SCC.getReg() == AMDGPU::SCC); Andn2SCC.setIsDead(AndSCC.isDead()); + + SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2); And->eraseFromParent(); - LIS->InsertMachineInstrInMaps(*Andn2); LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n'); + SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp); + SlotIndex SelIdx = LIS->getInstructionIndex(*Sel); + + LiveInterval *CmpLI = + CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr; + LiveInterval *SelLI = + SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr; + + // Update live intervals for CCReg before potentially removing CmpReg/SelReg, + // and their associated liveness information. 
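// [Editor's note: context sketch, hedged; the precise MIR shape is spelled
// out in the pass comments above this function in the full source.
// Schematically the fold is:
//   %sel = V_CNDMASK_B32_e64 0, 1, %cc
//   %cmp = V_CMP_NE_U32 1, %sel
//   $vcc = S_AND_B64 $exec, %cmp
// =>
//   $vcc = S_ANDN2_B64 $exec, %cc
// The bookkeeping below keeps the LiveIntervals analysis exact so that %cmp
// and %sel can be erased without a full liveness recompute.]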
+  if (CCReg.isVirtual()) {
+    // Note: this ignores that SelLI might have multiple internal values
+    // or splits and simply extends the live range to cover all cases
+    // where the result of the v_cndmask_b32 was live (e.g. loops).
+    // This could yield worse register allocation in rare edge cases.
+    SlotIndex EndIdx = AndIdx.getRegSlot();
+    if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
+      EndIdx = SelLI->endIndex();
+
+    LiveInterval &CCLI = LIS->getInterval(CCReg);
+    auto CCQ = CCLI.Query(SelIdx.getRegSlot());
+    if (CCQ.valueIn()) {
+      CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
+                                         EndIdx, CCQ.valueIn()));
+    }
+
+    if (CC->getSubReg()) {
+      LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
+      BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+      CCLI.refineSubRanges(
+          Allocator, Mask,
+          [=](LiveInterval::SubRange &SR) {
+            auto CCQS = SR.Query(SelIdx.getRegSlot());
+            if (CCQS.valueIn()) {
+              SR.addSegment(LiveRange::Segment(
+                  SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
+            }
+          },
+          *LIS->getSlotIndexes(), *TRI);
+      CCLI.removeEmptySubRanges();
+
+      SmallVector<LiveInterval *> SplitLIs;
+      LIS->splitSeparateComponents(CCLI, SplitLIs);
+    }
+  } else
+    LIS->removeAllRegUnitsForPhysReg(CCReg);
+
   // Try to remove compare. Cmp value should not used in between of cmp
   // and s_and_b64 if VCC or just unused if any other register.
-  if ((CmpReg.isVirtual() && MRI->use_nodbg_empty(CmpReg)) ||
+  if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
       (CmpReg == Register(CondReg) &&
        std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
                     [&](const MachineInstr &MI) {
                       return MI.readsRegister(CondReg, TRI);
                     }))) {
     LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-
+    if (CmpLI)
+      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
     LIS->RemoveMachineInstrFromMaps(*Cmp);
     Cmp->eraseFromParent();
 
     // Try to remove v_cndmask_b32.
-    if (SelReg.isVirtual() && MRI->use_nodbg_empty(SelReg)) {
-      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+    if (SelLI) {
+      bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+      if (!CanRemoveSel) {
+        // Try to shrink the live interval and check for dead def instead.
+        LIS->shrinkToUses(SelLI, nullptr);
+        CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+      }
+      if (CanRemoveSel) {
+        LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
 
-      LIS->RemoveMachineInstrFromMaps(*Sel);
-      Sel->eraseFromParent();
+        LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+        LIS->RemoveMachineInstrFromMaps(*Sel);
+        Sel->eraseFromParent();
+      }
     }
   }
 
-  return CCReg;
+  return true;
 }
 
 // Optimize sequence
@@ -330,8 +387,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
       Changed = true;
     }
 
-    if (Register Reg = optimizeVcndVcmpPair(MBB)) {
-      RecalcRegs.insert(Reg);
+    if (optimizeVcndVcmpPair(MBB)) {
       RecalcRegs.insert(AMDGPU::VCC_LO);
       RecalcRegs.insert(AMDGPU::VCC_HI);
      RecalcRegs.insert(AMDGPU::SCC);
@@ -402,7 +458,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   }
 
   // If the only user of a logical operation is move to exec, fold it now
-  // to prevent forming of saveexec.
I.e.: // // %0:sreg_64 = COPY $exec // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64 diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index e13e33ed5457..2ae3157bab49 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -112,8 +112,10 @@ public: SmallVectorImpl &CandidateRegs) const; void collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector &CandidateRegs) const; + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector &CandidateRegs, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const; void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl &Uses) const; @@ -131,7 +133,10 @@ public: MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector &ElseBlocks) const; - void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const; + void optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector &LoopBlocks, + SmallVectorImpl &Instructions) const; SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} @@ -323,12 +328,34 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters( /// Collect the registers used in the waterfall loop block that are defined /// before. void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector &CandidateRegs) const { + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector &CandidateRegs, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const { + + // Collect loop instructions, potentially spanning multiple blocks + auto *MBB = LoopHeader; + for (;;) { + Blocks.insert(MBB); + for (auto &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Instructions.push_back(&MI); + } + if (MBB == LoopEnd) + break; - for (auto &MI : Loop->instrs()) { - if (MI.isDebugInstr()) - continue; + if ((MBB != LoopHeader && MBB->pred_size() != 1) || + (MBB == LoopHeader && MBB->pred_size() != 2) || MBB->succ_size() != 1) { + LLVM_DEBUG(dbgs() << "Unexpected edges in CFG, ignoring loop\n"); + return; + } + + MBB = *MBB->succ_begin(); + } + + for (auto *I : Instructions) { + auto &MI = *I; for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) @@ -340,16 +367,17 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( continue; if (MO.readsReg()) { - const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); // Make sure the value is defined before the LOOP block - if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + if (!Blocks.contains(DefMBB) && !CandidateRegs.contains(MOReg)) { // If the variable is used after the loop, the register coalescer will // merge the newly created register and remove the phi node again. // Just do nothing in that case. 
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); bool IsUsed = false; - for (auto *Succ : Loop->successors()) { - if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + for (auto *Succ : LoopEnd->successors()) { + if (!Blocks.contains(Succ) && + OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { IsUsed = true; break; } @@ -513,7 +541,9 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange( } void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( - Register Reg, MachineBasicBlock *Loop) const { + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const { // Insert a new PHI, marking the value from the last loop iteration undef. LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); const auto *RC = MRI->getRegClass(Reg); @@ -525,15 +555,16 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); - // Replace uses in Loop block - if (UseBlock == Loop) + // Replace uses in Loop blocks + if (Blocks.contains(UseBlock)) O.setReg(NewReg); } - MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - for (auto *Pred : Loop->predecessors()) { - if (Pred == Loop) + MachineInstrBuilder PHI = + BuildMI(*LoopHeader, LoopHeader->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : LoopHeader->predecessors()) { + if (Blocks.contains(Pred)) PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); else PHI.addReg(Reg).addMBB(Pred); @@ -542,21 +573,36 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - // collectWaterfallCandidateRegisters only collects registers that are dead - // after the loop. So we know that the old reg is not live throughout the - // whole block anymore. - OldVarInfo.AliveBlocks.reset(Loop->getNumber()); - - // Mark the last use as kill - for (auto &MI : reverse(Loop->instrs())) { - if (MI.readsRegister(NewReg, TRI)) { - MI.addRegisterKilled(NewReg, TRI); - NewVarInfo.Kills.push_back(&MI); + // Find last use and mark as kill + MachineInstr *Kill = nullptr; + for (auto *MI : reverse(Instructions)) { + if (MI->readsRegister(NewReg, TRI)) { + MI->addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(MI); + Kill = MI; break; } } - assert(!NewVarInfo.Kills.empty() && - "Failed to find last usage of register in loop"); + assert(Kill && "Failed to find last usage of register in loop"); + + MachineBasicBlock *KillBlock = Kill->getParent(); + bool PostKillBlock = false; + for (auto *Block : Blocks) { + auto BBNum = Block->getNumber(); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is no longer live throughout + // the waterfall loop. + OldVarInfo.AliveBlocks.reset(BBNum); + + // The new register is live up to (and including) the block that kills it. + PostKillBlock |= (Block == KillBlock); + if (PostKillBlock) { + NewVarInfo.AliveBlocks.reset(BBNum); + } else if (Block != LoopHeader) { + NewVarInfo.AliveBlocks.set(BBNum); + } + } } char SIOptimizeVGPRLiveRange::ID = 0; @@ -601,6 +647,10 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { if (!Endif) continue; + // Skip unexpected control flow. 
+ if (!MDT->dominates(&MBB, IfTarget) || !MDT->dominates(IfTarget, Endif)) + continue; + SmallSetVector ElseBlocks; SmallVector CandidateRegs; @@ -620,15 +670,22 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + auto *LoopHeader = MI.getOperand(0).getMBB(); + auto *LoopEnd = &MBB; + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " - << printMBBReference(MBB) << '\n'); + << printMBBReference(*LoopHeader) << '\n'); SmallSetVector CandidateRegs; - collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + SmallVector Instructions; + SmallSetVector Blocks; + + collectWaterfallCandidateRegisters(LoopHeader, LoopEnd, CandidateRegs, + Blocks, Instructions); MadeChange |= !CandidateRegs.empty(); // Now we are safe to optimize. for (auto Reg : CandidateRegs) - optimizeWaterfallLiveRange(Reg, &MBB); + optimizeWaterfallLiveRange(Reg, LoopHeader, Blocks, Instructions); } } } diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index da41a5e2478a..e768a2f3e1a5 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -316,7 +316,7 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, } if (Abs || Neg) { assert(!Sext && - "Float and integer src modifiers can't be set simulteniously"); + "Float and integer src modifiers can't be set simultaneously"); Mods |= Abs ? SISrcMods::ABS : 0u; Mods ^= Neg ? SISrcMods::NEG : 0u; } else if (Sext) { @@ -1131,16 +1131,16 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, bool Converted = false; for (auto &Operand : SDWAOperands) { LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); - // There should be no intesection between SDWA operands and potential MIs + // There should be no intersection between SDWA operands and potential MIs // e.g.: // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 // v_add_u32 v3, v4, v2 // - // In that example it is possible that we would fold 2nd instruction into 3rd - // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was - // already destroyed). So if SDWAOperand is also a potential MI then do not - // apply it. + // In that example it is possible that we would fold 2nd instruction into + // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that + // was already destroyed). So if SDWAOperand is also a potential MI then do + // not apply it. 
     if (PotentialMatches.count(Operand->getParentInst()) == 0)
       Converted |= Operand->convertToSDWA(*SDWAInst, TII);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index c2e2875ed6bf..4fab13bb44b1 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -18,7 +18,10 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
@@ -85,9 +88,6 @@ FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
 }
 
 bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
-  if (!MO.isReg())
-    return false;
-
   Register Reg = MO.getReg();
   if (Reg.isPhysical())
     return false;
@@ -111,7 +111,6 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
   }
 
   llvm_unreachable("physreg not found for WWM expression");
-  return false;
 }
 
 void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
@@ -142,7 +141,6 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
   }
 
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   for (unsigned Reg : RegsToRewrite) {
     LIS->removeInterval(Reg);
@@ -150,18 +148,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
     const Register PhysReg = VRM->getPhys(Reg);
     assert(PhysReg != 0);
 
-    // Check if PhysReg is already reserved
-    if (!MFI->WWMReservedRegs.count(PhysReg)) {
-      Optional<int> FI;
-      if (!MFI->isEntryFunction()) {
-        // Create a stack object for a possible spill in the function prologue.
-        // Note: Non-CSR VGPR also need this as we may overwrite inactive lanes.
-        const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg);
-        FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC),
-                                              TRI->getSpillAlign(*RC));
-      }
-      MFI->reserveWWMRegister(PhysReg, FI);
-    }
+    MFI->reserveWWMRegister(PhysReg);
   }
 
   RegsToRewrite.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index b0e45dd3e3e3..8d33b8a1fd4b 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -74,6 +74,15 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
   // We end up with this pattern sometimes after basic block placement.
   // It happens while combining a block which assigns -1 or 0 to a saved mask
   // and another block which consumes that saved mask and then a branch.
+  //
+  // While searching this also performs the following substitution:
+  // vcc = V_CMP
+  // vcc = S_AND exec, vcc
+  // S_CBRANCH_VCC[N]Z
+  // =>
+  // vcc = V_CMP
+  // S_CBRANCH_VCC[N]Z
+
   bool Changed = false;
   MachineBasicBlock &MBB = *MI.getParent();
   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@@ -121,19 +130,32 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
     SReg = Op2.getReg();
     auto M = std::next(A);
     bool ReadsSreg = false;
+    bool ModifiesExec = false;
     for (; M != E; ++M) {
       if (M->definesRegister(SReg, TRI))
         break;
       if (M->modifiesRegister(SReg, TRI))
         return Changed;
       ReadsSreg |= M->readsRegister(SReg, TRI);
+      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
+    }
+    if (M == E)
+      return Changed;
+    // If SReg is VCC and the SReg definition is a VALU comparison, the
+    // S_AND with EXEC is not required.
+ // Erase the S_AND and return. + // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS + if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec && + TII->isVOPC(*M)) { + A->eraseFromParent(); + return true; } - if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() || + if (!M->isMoveImmediate() || !M->getOperand(1).isImm() || (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0)) return Changed; MaskValue = M->getOperand(1).getImm(); // First if sreg is only used in the AND instruction fold the immediate - // into into the AND. + // into the AND. if (!ReadsSreg && Op2.isKill()) { A->getOperand(2).ChangeToImmediate(MaskValue); M->eraseFromParent(); @@ -213,7 +235,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ)); } - MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); MI.addImplicitDefUseOperands(*MBB.getParent()); return true; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 21aed4ececb5..ad1455ed20fd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -19,7 +19,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; @@ -182,6 +184,16 @@ struct SGPRSpillBuilder { TmpVGPRLive = true; } + if (TmpVGPRLive) { + // We need to inform the scavenger that this index is already in use until + // we're done with the custom emergency spill. + RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR); + } + + // We may end up recursively calling the scavenger, and don't want to re-use + // the same register. + RS->setRegUsed(TmpVGPR); + // Try to scavenge SGPRs to save exec assert(!SavedExecReg && "Exec is already saved, refuse to save again"); const TargetRegisterClass &RC = @@ -202,6 +214,12 @@ struct SGPRSpillBuilder { // Spill needed lanes TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } else { + // The modify and restore of exec clobber SCC, which we would have to save + // and restore. FIXME: We probably would need to reserve a register for + // this. + if (RS->isRegUsed(AMDGPU::SCC)) + MI->emitError("unhandled SGPR spill to memory"); + // Spill active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, @@ -251,6 +269,12 @@ struct SGPRSpillBuilder { if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); } + + // Inform the scavenger where we're releasing our custom scavenged register. + if (TmpVGPRLive) { + MachineBasicBlock::iterator RestorePt = std::prev(MI); + RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); + } } // Write TmpVGPR to memory or read TmpVGPR from memory. @@ -265,6 +289,12 @@ struct SGPRSpillBuilder { // Spill needed lanes TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); } else { + // The modify and restore of exec clobber SCC, which we would have to save + // and restore. FIXME: We probably would need to reserve a register for + // this. 
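// [Editor's note: illustrative, not part of the imported patch. The exec
// manipulation clobbers SCC because spilling the inactive lanes briefly
// flips exec with a sequence along the lines of:
//   s_not_b64 exec, exec    ; writes SCC (result is compared against zero)
//   buffer_store_dword ...  ; store the previously inactive lanes
//   s_not_b64 exec, exec    ; writes SCC again
// Any SCC value live across that window would be lost, which is what the
// emitError below diagnoses.]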
+ if (RS->isRegUsed(AMDGPU::SCC)) + MI->emitError("unhandled SGPR spill to memory"); + // Spill active lanes TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); @@ -329,7 +359,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) static auto InitializeSubRegFromChannelTableOnce = [this]() { for (auto &Row : SubRegFromChannelTable) Row.fill(AMDGPU::NoSubRegister); - for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { + for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; assert(Width < SubRegFromChannelTableWidthMap.size()); @@ -364,13 +394,11 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - return MF->getSubtarget().hasGFX90AInsts() - ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList - : CSR_AMDGPU_HighRegs_SaveList; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList + : CSR_AMDGPU_SaveList; case CallingConv::AMDGPU_Gfx: - return MF->getSubtarget().hasGFX90AInsts() - ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList - : CSR_AMDGPU_SI_Gfx_SaveList; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList + : CSR_AMDGPU_SI_Gfx_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -390,13 +418,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - return MF.getSubtarget().hasGFX90AInsts() - ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask - : CSR_AMDGPU_HighRegs_RegMask; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask + : CSR_AMDGPU_RegMask; case CallingConv::AMDGPU_Gfx: - return MF.getSubtarget().hasGFX90AInsts() - ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask - : CSR_AMDGPU_SI_Gfx_RegMask; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask + : CSR_AMDGPU_SI_Gfx_RegMask; default: return nullptr; } @@ -413,8 +439,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, // equivalent AV class. If used one, the verifier will crash after // RegBankSelect in the GISel flow. The aligned regclasses are not fully given // until Instruction selection. 
- if (MF.getSubtarget().hasMAIInsts() && - (isVGPRClass(RC) || isAGPRClass(RC))) { + if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) return &AMDGPU::AV_32RegClass; if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) @@ -463,8 +488,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const SIFrameLowering *TFI = - MF.getSubtarget().getFrameLowering(); + const SIFrameLowering *TFI = ST.getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); // During ISel lowering we always reserve the stack pointer in entry // functions, but never actually want to reference it when accessing our own @@ -487,19 +511,19 @@ bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { - return CSR_AMDGPU_AllVGPRs_RegMask; + return AMDGPU_AllVGPRs_RegMask; } const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { - return CSR_AMDGPU_AllAGPRs_RegMask; + return AMDGPU_AllAGPRs_RegMask; } const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { - return CSR_AMDGPU_AllVectorRegs_RegMask; + return AMDGPU_AllVectorRegs_RegMask; } const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { - return CSR_AMDGPU_AllAllocatableSRegs_RegMask; + return AMDGPU_AllAllocatableSRegs_RegMask; } unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, @@ -522,6 +546,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + // Reserve special purpose registers. + // // EXEC_LO and EXEC_HI could be allocated and used as regular register, but // this seems likely to result in bugs, so I'm marking them as reserved. reserveRegisterTuples(Reserved, AMDGPU::EXEC); @@ -563,7 +591,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); // Reserve null register - it shall never be allocated - reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); + reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); // Disallow vcc_hi allocation in wave32. It may be allocated but most likely // will result in bugs. @@ -572,6 +600,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VCC_HI); } + // Reserve SGPRs. + // unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { @@ -579,39 +609,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxNumAGPRs = MaxNumVGPRs; - unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - if (ST.hasGFX90AInsts()) { - // In an entry function without calls and AGPRs used it is possible to use - // the whole register budget for VGPRs. - - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and - // split register file accordingly. 
- if (MFI->usesAGPRs(MF)) { - MaxNumVGPRs /= 2; - MaxNumAGPRs = MaxNumVGPRs; - } else { - if (MaxNumVGPRs > TotalNumVGPRs) { - MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; - MaxNumVGPRs = TotalNumVGPRs; - } else - MaxNumAGPRs = 0; - } - } - - for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { - unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - - for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - for (auto Reg : AMDGPU::SReg_32RegClass) { Reserved.set(getSubReg(Reg, AMDGPU::hi16)); Register Low = getSubReg(Reg, AMDGPU::lo16); @@ -620,22 +617,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(Low); } - for (auto Reg : AMDGPU::AGPR_32RegClass) { - Reserved.set(getSubReg(Reg, AMDGPU::hi16)); - } - - // Reserve all the rest AGPRs if there are no instructions to use it. - if (!ST.hasMAIInsts()) { - for (unsigned i = 0; i < MaxNumVGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - } - Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { - // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need - // to spill. + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we + // need to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. reserveRegisterTuples(Reserved, ScratchRSrcReg); } @@ -644,7 +629,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // which is detected after the function is lowered. If we aren't really going // to need SP, don't bother reserving it. MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); - if (StackPtrReg) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); @@ -662,20 +646,63 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } - for (auto Reg : MFI->WWMReservedRegs) { - reserveRegisterTuples(Reserved, Reg.first); + // Reserve VGPRs/AGPRs. + // + unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxNumAGPRs = MaxNumVGPRs; + unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + // Reserve all the AGPRs if there are no instructions to use it. + if (!ST.hasMAIInsts()) { + for (unsigned i = 0; i < MaxNumAGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } } - // Reserve VGPRs used for SGPR spilling. - // Note we treat freezeReservedRegs unusually because we run register - // allocation in two phases. It's OK to re-freeze with new registers for the - // second run. -#if 0 - for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { - for (auto &SpilledVGPR : SpilledFI.second) - reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); + for (auto Reg : AMDGPU::AGPR_32RegClass) { + Reserved.set(getSubReg(Reg, AMDGPU::hi16)); } -#endif + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. 
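// [Editor's note: worked example of the split implemented just below,
// assuming ST.getMaxNumVGPRs(MF) == 512 and TotalNumVGPRs == 256:
//   AGPRs used:     MaxNumVGPRs = 512 / 2 = 256, MaxNumAGPRs = 256
//   AGPRs not used: MaxNumVGPRs = 256, MaxNumAGPRs = 512 - 256 = 256
// With a tighter occupancy budget of 256 registers:
//   AGPRs used:     128 VGPRs + 128 AGPRs
//   AGPRs not used: 256 VGPRs + 0 AGPRs
// Everything from MaxNumVGPRs/MaxNumAGPRs upwards is then reserved.]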
+ if (ST.hasGFX90AInsts()) { + if (MFI->usesAGPRs(MF)) { + MaxNumVGPRs /= 2; + MaxNumAGPRs = MaxNumVGPRs; + } else { + if (MaxNumVGPRs > TotalNumVGPRs) { + MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; + MaxNumVGPRs = TotalNumVGPRs; + } else + MaxNumAGPRs = 0; + } + } + + for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + + for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + + // On GFX908, in order to guarantee copying between AGPRs, we need a scratch + // VGPR available at all times. + if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy()); + } + + for (Register Reg : MFI->WWMReservedRegs) + reserveRegisterTuples(Reserved, Reg); // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) @@ -690,6 +717,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, + MCRegister PhysReg) const { + return !MF.getRegInfo().isReserved(PhysReg); +} + bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { const SIMachineFunctionInfo *Info = MF.getInfo(); // On entry, the base address is 0, so it can't possibly need any more @@ -1010,6 +1042,8 @@ static int getOffsetMUBUFStore(unsigned Opc) { return AMDGPU::BUFFER_STORE_SHORT_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: @@ -1035,6 +1069,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) { return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: @@ -1054,6 +1090,64 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } +static int getOffenMUBUFStore(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return AMDGPU::BUFFER_STORE_DWORD_OFFEN; + case AMDGPU::BUFFER_STORE_BYTE_OFFSET: + return AMDGPU::BUFFER_STORE_BYTE_OFFEN; + case AMDGPU::BUFFER_STORE_SHORT_OFFSET: + return AMDGPU::BUFFER_STORE_SHORT_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; + case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: + return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; + case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; + default: + return -1; + } +} + +static int getOffenMUBUFLoad(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; + case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: + return 
AMDGPU::BUFFER_LOAD_USHORT_OFFEN; + case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: + return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; + case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; + default: + return -1; + } +} + static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -1139,8 +1233,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize) { bool IsStore = TII->get(LoadStoreOp).mayStore(); + bool HasVAddr = AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) != -1; bool UseST = - AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && + !HasVAddr && AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; switch (EltSize) { @@ -1164,7 +1259,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, llvm_unreachable("Unexpected spill load/store size!"); } - if (UseST) + if (HasVAddr) + LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); + else if (UseST) LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); return LoadStoreOp; @@ -1186,6 +1283,7 @@ void SIRegisterInfo::buildSpillLoadStore( bool IsStore = Desc->mayStore(); bool IsFlat = TII->isFLATScratch(LoadStoreOp); + bool CanClobberSCC = false; bool Scavenged = false; MCRegister SOffset = ScratchOffsetReg; @@ -1202,6 +1300,8 @@ void SIRegisterInfo::buildSpillLoadStore( unsigned RemSize = RegWidth - Size; unsigned NumRemSubRegs = RemSize ? 1 : 0; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); + int64_t MaterializedOffset = Offset; + int64_t MaxOffset = Offset + Size + RemSize - EltSize; int64_t ScratchOffsetRegDelta = 0; @@ -1216,6 +1316,42 @@ void SIRegisterInfo::buildSpillLoadStore( assert((IsFlat || ((Offset % EltSize) == 0)) && "unexpected VGPR spill offset"); + // Track a VGPR to use for a constant offset we need to materialize. + Register TmpOffsetVGPR; + + // Track a VGPR to use as an intermediate value. + Register TmpIntermediateVGPR; + bool UseVGPROffset = false; + + // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate + // combination. + auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR, + int64_t VOffset) { + // We are using a VGPR offset + if (IsFlat && SGPRBase) { + // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free + // SGPR, so perform the add as vector. + // We don't need a base SGPR in the kernel. 
+ + if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR) + .addReg(SGPRBase) + .addImm(VOffset) + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(SGPRBase); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR) + .addImm(VOffset) + .addReg(TmpOffsetVGPR); + } + } else { + assert(TmpOffsetVGPR); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addImm(VOffset); + } + }; + bool IsOffsetLegal = IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch) @@ -1223,17 +1359,17 @@ void SIRegisterInfo::buildSpillLoadStore( if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); - // We currently only support spilling VGPRs to EltSize boundaries, meaning - // we can simplify the adjustment of Offset here to just scale with - // WavefrontSize. - if (!IsFlat) - Offset *= ST.getWavefrontSize(); - // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. + // TODO: Clobbering SCC is not necessary for scratch instructions in the + // entry. if (RS) { SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); + + // Piggy back on the liveness scan we just did see if SCC is dead. + CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC); } else if (LiveRegs) { + CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC); for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { if (LiveRegs->available(MF->getRegInfo(), Reg)) { SOffset = Reg; @@ -1242,7 +1378,26 @@ void SIRegisterInfo::buildSpillLoadStore( } } + if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC) + SOffset = Register(); + if (!SOffset) { + UseVGPROffset = true; + + if (RS) { + TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + } else { + assert(LiveRegs); + for (MCRegister Reg : AMDGPU::VGPR_32RegClass) { + if (LiveRegs->available(MF->getRegInfo(), Reg)) { + TmpOffsetVGPR = Reg; + break; + } + } + } + + assert(TmpOffsetVGPR); + } else if (!SOffset && CanClobberSCC) { // There are no free SGPRs, and since we are in the process of spilling // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true // on SI/CI and on VI it is true until we implement spilling using scalar @@ -1250,6 +1405,9 @@ void SIRegisterInfo::buildSpillLoadStore( // add the offset directly to the ScratchOffset or StackPtrOffset // register, and then subtract the offset after the spill to return the // register to it's original value. + + // TODO: If we don't have to do an emergency stack slot spill, converting + // to use the VGPR offset is fewer instructions. if (!ScratchOffsetReg) ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); SOffset = ScratchOffsetReg; @@ -1258,12 +1416,22 @@ void SIRegisterInfo::buildSpillLoadStore( Scavenged = true; } - if (!SOffset) + // We currently only support spilling VGPRs to EltSize boundaries, meaning + // we can simplify the adjustment of Offset here to just scale with + // WavefrontSize. 
+ if (!IsFlat && !UseVGPROffset) + Offset *= ST.getWavefrontSize(); + + if (!UseVGPROffset && !SOffset) report_fatal_error("could not scavenge SGPR to spill in entry function"); - if (ScratchOffsetReg == AMDGPU::NoRegister) { + if (UseVGPROffset) { + // We are using a VGPR offset + MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); + } else if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { + assert(Offset != 0); auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); @@ -1277,13 +1445,16 @@ void SIRegisterInfo::buildSpillLoadStore( assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && "Unexpected vaddr for flat scratch with a FI operand"); - assert(ST.hasFlatScratchSTMode()); - LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + if (UseVGPROffset) { + LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); + } else { + assert(ST.hasFlatScratchSTMode()); + LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + } + Desc = &TII->get(LoadStoreOp); } - Register TmpReg; - for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; ++i, RegOffset += EltSize) { if (i == NumSubRegs) { @@ -1292,6 +1463,22 @@ void SIRegisterInfo::buildSpillLoadStore( } Desc = &TII->get(LoadStoreOp); + if (!IsFlat && UseVGPROffset) { + int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp) + : getOffenMUBUFLoad(LoadStoreOp); + Desc = &TII->get(NewLoadStoreOp); + } + + if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { + // If we are spilling an AGPR beyond the range of the memory instruction + // offset and need to use a VGPR offset, we ideally have at least 2 + // scratch VGPRs. If we don't have a second free VGPR without spilling, + // recycle the VGPR used for the offset which requires resetting after + // each subregister. + + MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); + } + unsigned NumRegs = EltSize / 4; Register SubReg = e == 1 ? ValueReg @@ -1300,7 +1487,8 @@ void SIRegisterInfo::buildSpillLoadStore( unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); - if (i + 1 == e) { + const bool IsLastSubReg = i + 1 == e; + if (IsLastSubReg) { SOffsetRegState |= getKillRegState(Scavenged); // The last implicit use carries the "Kill" flag. 
     SrcDstRegState |= getKillRegState(IsKill);
@@ -1363,21 +1551,26 @@ void SIRegisterInfo::buildSpillLoadStore(
 
     if (IsAGPR) {
       assert(EltSize == 4);
-      if (!TmpReg) {
-        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
-        // FIXME: change to scavengeRegisterBackwards()
-        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-        RS->setRegUsed(TmpReg);
+      if (!TmpIntermediateVGPR) {
+        TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
+        assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
       }
       if (IsStore) {
         auto AccRead = BuildMI(MBB, MI, DL,
-                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
+                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
+                               TmpIntermediateVGPR)
                            .addReg(SubReg, getKillRegState(IsKill));
         if (NeedSuperRegDef)
           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
       }
-      SubReg = TmpReg;
+      SubReg = TmpIntermediateVGPR;
+    } else if (UseVGPROffset) {
+      // FIXME: change to scavengeRegisterBackwards()
+      if (!TmpOffsetVGPR) {
+        TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        RS->setRegUsed(TmpOffsetVGPR);
+      }
     }
 
     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
@@ -1388,12 +1581,26 @@
     auto MIB = BuildMI(MBB, MI, DL, *Desc)
                    .addReg(SubReg,
                            getDefRegState(!IsStore) | getKillRegState(IsKill));
+
+    if (UseVGPROffset) {
+      // For an AGPR spill, we reuse the same temp VGPR for the offset and the
+      // intermediate accvgpr_write.
+      MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
+    }
+
     if (!IsFlat)
       MIB.addReg(FuncInfo->getScratchRSrcReg());
 
     if (SOffset == AMDGPU::NoRegister) {
-      if (!IsFlat)
-        MIB.addImm(0);
+      if (!IsFlat) {
+        if (UseVGPROffset && ScratchOffsetReg) {
+          assert(!FuncInfo->isEntryFunction());
+          MIB.addReg(ScratchOffsetReg);
+        } else {
+          assert(FuncInfo->isEntryFunction());
+          MIB.addImm(0);
+        }
+      }
     } else {
       MIB.addReg(SOffset, SOffsetRegState);
     }
@@ -1407,10 +1614,10 @@
     if (!IsAGPR && NeedSuperRegDef)
       MIB.addReg(ValueReg, RegState::ImplicitDefine);
 
-    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
+    if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
       MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                     FinalReg)
-                .addReg(TmpReg, RegState::Kill);
+                .addReg(TmpIntermediateVGPR, RegState::Kill);
       MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
     }
 
@@ -1466,8 +1673,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                bool OnlyToVGPR) const {
   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 
-  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
-      SB.MFI.getSGPRToVGPRSpills(Index);
+  ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
@@ -1485,7 +1691,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
         SB.NumSubRegs == 1 ?
             SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 
-    SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+    SpilledReg Spill = VGPRSpills[i];
 
     bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
 
@@ -1586,8 +1792,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                  bool OnlyToVGPR) const {
   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 
-  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
-      SB.MFI.getSGPRToVGPRSpills(Index);
+  ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
@@ -1599,7 +1804,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
               ? SB.SuperReg
               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 
-      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+      SpilledReg Spill = VGPRSpills[i];
       auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
                          SubReg)
                      .addReg(Spill.VGPR)
@@ -1937,18 +2142,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         Offset = 0;
       }
 
-      assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
-             "Unexpected vaddr for flat scratch with a FI operand");
-
-      // On GFX10 we have ST mode to use no registers for an address.
-      // Otherwise we need to materialize 0 into an SGPR.
-      if (!Offset && ST.hasFlatScratchSTMode()) {
+      if (!Offset) {
         unsigned Opc = MI->getOpcode();
-        unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
-        MI->RemoveOperand(
-            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
-        MI->setDesc(TII->get(NewOpc));
-        return;
+        int NewOpc = -1;
+        if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) != -1) {
+          NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
+        } else if (ST.hasFlatScratchSTMode()) {
+          // On GFX10 we have ST mode to use no registers for an address.
+          // Otherwise we need to materialize 0 into an SGPR.
+          NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
+        }
+
+        if (NewOpc != -1) {
+          MI->removeOperand(
+              AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
+          MI->setDesc(TII->get(NewOpc));
+          return;
+        }
       }
     }
 
@@ -2026,57 +2236,78 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     if (!IsMUBUF && !MFI->isEntryFunction()) {
       // Convert to a swizzled stack address by scaling by the wave size.
-      //
       // In an entry function/kernel the offset is already swizzled.
-
-      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
-      Register ResultReg =
-          IsCopy ? MI->getOperand(0).getReg()
-                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+      bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+      bool LiveSCC = RS->isRegUsed(AMDGPU::SCC);
+      const TargetRegisterClass *RC = IsSALU && !LiveSCC
+                                          ? &AMDGPU::SReg_32RegClass
+                                          : &AMDGPU::VGPR_32RegClass;
+      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+      Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
+                                  : RS->scavengeRegister(RC, MI, 0);
 
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       if (Offset == 0) {
+        unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
+                                             : AMDGPU::V_LSHRREV_B32_e64;
         // XXX - This never happens because of emergency scavenging slot at 0?
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) - .addImm(ST.getWavefrontSizeLog2()) - .addReg(FrameReg); + auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(FrameReg); + if (IsSALU && !LiveSCC) + Shift.getInstr()->getOperand(3).setIsDead( + true); // Mark SCC as dead. + if (IsSALU && LiveSCC) { + Register NewDest = + RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + NewDest) + .addReg(ResultReg); + ResultReg = NewDest; + } } else { - if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - // Reuse ResultReg in intermediate step. - Register ScaledReg = ResultReg; - - BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), - ScaledReg) - .addImm(ST.getWavefrontSizeLog2()) - .addReg(FrameReg); - - const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; - - // TODO: Fold if use instruction is another add of a constant. - if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - // FIXME: This can fail - MIB.addImm(Offset); - MIB.addReg(ScaledReg, RegState::Kill); - if (!IsVOP2) + MachineInstrBuilder MIB; + if (!IsSALU) { + if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != + nullptr) { + // Reuse ResultReg in intermediate step. + Register ScaledReg = ResultReg; + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), + ScaledReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(FrameReg); + + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + + // TODO: Fold if use instruction is another add of a constant. + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + // FIXME: This can fail + MIB.addImm(Offset); + MIB.addReg(ScaledReg, RegState::Kill); + if (!IsVOP2) + MIB.addImm(0); // clamp bit + } else { + assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && + "Need to reuse carry out register"); + + // Use scavenged unused carry out as offset register. + Register ConstOffsetReg; + if (!isWave32) + ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); + else + ConstOffsetReg = MIB.getReg(1); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + MIB.addReg(ConstOffsetReg, RegState::Kill); + MIB.addReg(ScaledReg, RegState::Kill); MIB.addImm(0); // clamp bit - } else { - assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && - "Need to reuse carry out register"); - - // Use scavenged unused carry out as offset register. - Register ConstOffsetReg; - if (!isWave32) - ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); - else - ConstOffsetReg = MIB.getReg(1); - - BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) - .addImm(Offset); - MIB.addReg(ConstOffsetReg, RegState::Kill); - MIB.addReg(ScaledReg, RegState::Kill); - MIB.addImm(0); // clamp bit + } } - } else { + } + if (!MIB || IsSALU) { // We have to produce a carry out, and there isn't a free SGPR pair // for it. We can keep the whole computation on the SALU to avoid // clobbering an additional register at the cost of an extra mov. @@ -2084,7 +2315,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // We may have 1 free scratch SGPR even though a carry out is // unavailable. Only one additional mov is needed. 
         Register TmpScaledReg =
-          RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+            RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
         Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2093,14 +2324,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
             .addReg(ScaledReg, RegState::Kill)
             .addImm(Offset);
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-            .addReg(ScaledReg, RegState::Kill);
+        if (!IsSALU)
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+              .addReg(ScaledReg, RegState::Kill);
+        else
+          ResultReg = ScaledReg;
 
         // If there were truly no free SGPRs, we need to undo everything.
         if (!TmpScaledReg.isValid()) {
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-            .addReg(ScaledReg, RegState::Kill)
-            .addImm(-Offset);
+              .addReg(ScaledReg, RegState::Kill)
+              .addImm(-Offset);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
             .addReg(FrameReg)
             .addImm(ST.getWavefrontSizeLog2());
@@ -2665,8 +2899,7 @@ MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const
 
 const TargetRegisterClass *
 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
-                                         const RegisterBank &RB,
-                                         const MachineRegisterInfo &MRI) const {
+                                         const RegisterBank &RB) const {
   switch (RB.getID()) {
   case AMDGPU::VGPRRegBankID:
     return getVGPRClassForBitWidth(std::max(32u, Size));
@@ -2688,7 +2921,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                          const MachineRegisterInfo &MRI) const {
   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
   if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank *>())
-    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
+    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
 
   if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
     return getAllocatableClass(RC);
@@ -2808,9 +3041,29 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
   return true;
 }
 
+const TargetRegisterClass *
+SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
+  if (!RC || !ST.needsAlignedVGPRs())
+    return RC;
+
+  unsigned Size = getRegSizeInBits(*RC);
+  if (Size <= 32)
+    return RC;
+
+  if (isVGPRClass(RC))
+    return getAlignedVGPRClassForBitWidth(Size);
+  if (isAGPRClass(RC))
+    return getAlignedAGPRClassForBitWidth(Size);
+  if (isVectorSuperClass(RC))
+    return getAlignedVectorSuperClassForBitWidth(Size);
+
+  return RC;
+}
+
 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
   switch (PhysReg) {
   case AMDGPU::SGPR_NULL:
+  case AMDGPU::SGPR_NULL64:
   case AMDGPU::SRC_SHARED_BASE:
   case AMDGPU::SRC_PRIVATE_BASE:
   case AMDGPU::SRC_SHARED_LIMIT:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f1fe0a1d9329..9bfbc253410b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -51,6 +51,17 @@ private:
 public:
   SIRegisterInfo(const GCNSubtarget &ST);
 
+  struct SpilledReg {
+    Register VGPR;
+    int Lane = -1;
+
+    SpilledReg() = default;
+    SpilledReg(Register R, int L) : VGPR(R), Lane(L) {}
+
+    bool hasLane() { return Lane != -1; }
+    bool hasReg() { return VGPR != 0; }
+  };
+
   /// \returns the sub reg enum value for the given \p Channel
   /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
 
@@ -64,6 +75,8 @@ public:
   MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
+  bool isAsmClobberable(const MachineFunction &MF,
+                        MCRegister PhysReg) const override;
 
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
   const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
@@ -304,15 +317,11 @@ public:
   MCRegister getReturnAddressReg(const MachineFunction &MF) const;
 
   const TargetRegisterClass *
-  getRegClassForSizeOnBank(unsigned Size,
-                           const RegisterBank &Bank,
-                           const MachineRegisterInfo &MRI) const;
+  getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const;
 
   const TargetRegisterClass *
-  getRegClassForTypeOnBank(LLT Ty,
-                           const RegisterBank &Bank,
-                           const MachineRegisterInfo &MRI) const {
-    return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI);
+  getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const {
+    return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank);
   }
 
   const TargetRegisterClass *
@@ -377,6 +386,11 @@ public:
   // the subtarget.
   bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
 
+  // Given \p RC returns corresponding aligned register class if required
+  // by the subtarget.
+  const TargetRegisterClass *
+  getProperlyAlignedRC(const TargetRegisterClass *RC) const;
+
   /// Return all SGPR128 which satisfy the waves per execution unit requirement
   /// of the subtarget.
   ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index eb9452f4b85e..ffe8dce79816 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -97,7 +97,7 @@ class RegSeqNames {
   dag trunc_rc = (trunc RC,
@@ -189,7 +189,7 @@ def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> {
 
 def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
-  let HWEncoding = 106;
+  let HWEncoding = VCC_LO.HWEncoding;
 }
 
 defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>;
@@ -198,7 +198,7 @@ defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>;
 def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
-  let HWEncoding = 126;
+  let HWEncoding = EXEC_LO.HWEncoding;
 }
 
 // 32-bit real registers, for MC only.
@@ -211,8 +211,23 @@ defm SRC_SCC : SIRegLoHi16<"src_scc", 253>;
 // Should never be emitted.
 def SCC : SIReg<"scc">;
 
-defm M0 : SIRegLoHi16 <"m0", 124>;
-defm SGPR_NULL : SIRegLoHi16 <"null", 125>;
+// Encoding changes between subtarget generations.
+// See also Utils/AMDGPUBaseInfo.cpp MAP_REG2REG.
+defm M0_gfxpre11 : SIRegLoHi16 <"m0", 124>; +defm M0_gfx11plus : SIRegLoHi16 <"m0", 125>; +defm M0 : SIRegLoHi16 <"m0", 0>; + +defm SGPR_NULL_gfxpre11 : SIRegLoHi16 <"null", 125>; +defm SGPR_NULL_gfx11plus : SIRegLoHi16 <"null", 124>; +defm SGPR_NULL : SIRegLoHi16 <"null", 0>; +defm SGPR_NULL_HI : SIRegLoHi16 <"", 0>; + +def SGPR_NULL64 : + RegisterWithSubRegs<"null", [SGPR_NULL, SGPR_NULL_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = SGPR_NULL.HWEncoding; +} defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>; defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>; @@ -237,7 +252,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = XNACK_MASK_LO.HWEncoding; } // Trap handler registers @@ -247,7 +262,7 @@ defm TBA_HI : SIRegLoHi16<"tba_hi", 109>; def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 108; + let HWEncoding = TBA_LO.HWEncoding; } defm TMA_LO : SIRegLoHi16<"tma_lo", 110>; @@ -256,7 +271,7 @@ defm TMA_HI : SIRegLoHi16<"tma_hi", 111>; def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 110; + let HWEncoding = TMA_LO.HWEncoding; } foreach Index = 0...15 in { @@ -635,16 +650,16 @@ let GeneratePressureSet = 0, HasSGPR = 1 in { // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, - SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, + SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, + SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { let AllocationPriority = 10; } def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, - XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16, - TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, + XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, + TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; @@ -701,23 +716,6 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], let HasSGPR = 1; } -// CCR (call clobbered registers) SGPR 64-bit registers -def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add (trunc SGPR_64, 16))> { - let CopyCost = SGPR_64.CopyCost; - let AllocationPriority = SGPR_64.AllocationPriority; - let HasSGPR = 1; -} - -// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC -def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add (trunc (shl SGPR_64, 15), 1), // s[30:31] - (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] - let CopyCost = SGPR_64.CopyCost; - let AllocationPriority = SGPR_64.AllocationPriority; - let HasSGPR = 1; -} - def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, 
v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; @@ -725,7 +723,7 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, } def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; let HasSGPR = 1; @@ -788,7 +786,7 @@ defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128R defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; @@ -829,7 +827,7 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; @@ -856,21 +854,12 @@ defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024) } // End GeneratePressureSet = 0 -// This is not a real register. This is just to have a register to add -// to VReg_1 that does not alias any real register that would -// introduce inferred register classes. -def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> { - let isArtificial = 1; -} - let GeneratePressureSet = 0 in { -// FIXME: Should specify an empty set for this. No register should -// ever be allocated using VReg_1. This is a hack for SelectionDAG -// that should always be lowered by SILowerI1Copies. TableGen crashes -// on an empty register set, but also sorts register classes based on -// the number of registerss in them. Add only one register so this is +// No register should ever be allocated using VReg_1. This is a hack for +// SelectionDAG that should always be lowered by SILowerI1Copies. TableGen +// sorts register classes based on the number of registers in them so this is // sorted to the end and not preferred over VGPR_32. 
-def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
+def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
   let Size = 1;
   let HasVGPR = 1;
 }
@@ -913,11 +902,11 @@ defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>;
 defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>;
 defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>;
 defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
-defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>;
-defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>;
-defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>;
-defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>;
-defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
+defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
+defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
+defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
+defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
+defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
 
 //===----------------------------------------------------------------------===//
 //  Register operands
@@ -1087,6 +1076,27 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> {
   let DecoderMethod = "DecodeVS_32RegisterClass";
 }
 
+def VRegSrc_64 : RegisterOperand<VReg_64> {
+  let DecoderMethod = "decodeOperand_VReg_64";
+}
+
+def VRegSrc_128 : RegisterOperand<VReg_128> {
+  let DecoderMethod = "decodeOperand_VReg_128";
+}
+
+def VRegSrc_256 : RegisterOperand<VReg_256> {
+  let DecoderMethod = "decodeOperand_VReg_256";
+}
+
+//===----------------------------------------------------------------------===//
+//  VGPRSrc_*
+//===----------------------------------------------------------------------===//
+
+// An 8-bit RegisterOperand wrapper for a VGPR
+def VGPRSrc_32 : RegisterOperand<VGPR_32> {
+  let DecoderMethod = "DecodeVGPR_32RegisterClass";
+}
+
 //===----------------------------------------------------------------------===//
 //  ASrc_* Operands with an AccVGPR
 //===----------------------------------------------------------------------===//
@@ -1116,7 +1126,7 @@ defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
 defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
 
 //===----------------------------------------------------------------------===//
-//  AVSrc_* Operands with an AGPR or VGPR
+//  AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
 //===----------------------------------------------------------------------===//
 
 def AVSrc_32 : RegisterOperand<AV_32> {
@@ -1129,6 +1139,21 @@ def AVSrc_64 : RegisterOperand<AV_64> {
   let EncoderMethod = "getAVOperandEncoding";
 }
 
+def AVSrc_128 : RegisterOperand<AV_128> {
+  let DecoderMethod = "DecodeAV_128RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVDst_128 : RegisterOperand<AV_128> {
+  let DecoderMethod = "DecodeAVDst_128RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVDst_512 : RegisterOperand<AV_512> {
+  let DecoderMethod = "DecodeAVDst_512RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
 def AVLdSt_32 : RegisterOperand<AV_32> {
   let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
   let EncoderMethod = "getAVOperandEncoding";
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 18d424a3bc9f..53441b5a4ced 100644
---
a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -59,6 +59,7 @@ def WriteIntMul : SchedWrite; // mAI multipass instructions. def Write2PassMAI : SchedWrite; +def Write4PassMAI : SchedWrite; def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; def Write4PassDGEMM : SchedWrite; @@ -86,7 +87,9 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; +def SIDPGFX940FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; +def GFX11SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? def HWBranch : ProcResource<1> { @@ -156,6 +159,8 @@ multiclass SICommonWriteRes { let ResourceCycles = [2] in def : HWWriteRes; + let ResourceCycles = [4] in + def : HWWriteRes; let ResourceCycles = [8] in def : HWWriteRes; let ResourceCycles = [16] in @@ -244,6 +249,40 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; } // End SchedModel = SIDPFullSpeedModel +let SchedModel = SIDPGFX940FullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>; + +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>; + +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; + +} // End SchedModel = SIDPGFX940FullSpeedModel + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). 
@@ -273,3 +312,29 @@ def : HWWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = GFX10SpeedModel + +let SchedModel = GFX11SpeedModel in { + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX11SpeedModel diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c8f1daf26de9..05d2dd000162 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -26,15 +26,40 @@ using namespace llvm; namespace { class SIShrinkInstructions : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const GCNSubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + public: static char ID; - void shrinkMIMG(MachineInstr &MI); - public: SIShrinkInstructions() : MachineFunctionPass(ID) { } + bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const; + bool isKImmOperand(const MachineOperand &Src) const; + bool isKUImmOperand(const MachineOperand &Src) const; + bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; + bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; + void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; + void shrinkScalarCompare(MachineInstr &MI) const; + void shrinkMIMG(MachineInstr &MI) const; + void shrinkMadFma(MachineInstr &MI) const; + bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool tryReplaceDeadSDST(MachineInstr &MI) const; + bool instAccessReg(iterator_range &&R, + Register Reg, unsigned SubReg) const; + bool instReadsReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + bool instModifiesReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub, + unsigned I) const; + void dropInstructionKeepingImpDefs(MachineInstr &MI) const; + MachineInstr *matchSwap(MachineInstr &MovT) const; + bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI Shrink Instructions"; } @@ -59,8 +84,8 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { /// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 
-static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, - MachineRegisterInfo &MRI, bool TryToCommute = true) { +bool SIShrinkInstructions::foldImmediates(MachineInstr &MI, + bool TryToCommute) const { assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -69,8 +94,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { Register Reg = Src0.getReg(); - if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Reg.isVirtual()) { + MachineInstr *Def = MRI->getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; @@ -91,8 +116,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } if (ConstantFolded) { - assert(MRI.use_empty(Reg)); - Def->eraseFromParent(); + if (MRI->use_nodbg_empty(Reg)) + Def->eraseFromParent(); ++NumLiteralConstantsFolded; return true; } @@ -103,7 +128,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, // We have failed to fold src0, so commute the instruction and try again. if (TryToCommute && MI.isCommutable()) { if (TII->commuteInstruction(MI)) { - if (foldImmediates(MI, TII, MRI, false)) + if (foldImmediates(MI, false)) return true; // Commute back. @@ -114,21 +139,20 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return false; } -static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { return isInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, - const MachineOperand &Src, - bool &IsUnsigned) { +bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, + bool &IsUnsigned) const { if (isInt<16>(Src.getImm())) { IsUnsigned = false; return !TII->isInlineConstant(Src); @@ -144,9 +168,8 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, /// \returns true if the constant in \p Src should be replaced with a bitreverse /// of an inline immediate. -static bool isReverseInlineImm(const SIInstrInfo *TII, - const MachineOperand &Src, - int32_t &ReverseImm) { +bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src, + int32_t &ReverseImm) const { if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) return false; @@ -156,8 +179,9 @@ static bool isReverseInlineImm(const SIInstrInfo *TII, /// Copy implicit register operands from specified instruction to this /// instruction that are not part of the instruction definition. 
-static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, - const MachineInstr &MI) { +void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, + MachineInstr &MI) const { + MachineFunction &MF = *MI.getMF(); for (unsigned i = MI.getDesc().getNumOperands() + MI.getDesc().getNumImplicitUses() + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); @@ -168,7 +192,7 @@ static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, } } -static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { +void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { // cmpk instructions do scc = dst imm16, so commute the instruction to // get constants on the RHS. if (!MI.getOperand(0).isReg()) @@ -191,7 +215,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { // and initially selected to the unsigned versions. if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { bool HasUImm; - if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { + if (isKImmOrKUImmOperand(Src1, HasUImm)) { if (!HasUImm) { SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; @@ -205,22 +229,30 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { const MCInstrDesc &NewDesc = TII->get(SOPKOpc); - if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || - (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { + if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || + (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { MI.setDesc(NewDesc); } } // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. -void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { +void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return; - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + uint8_t NewEncoding; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + NewEncoding = AMDGPU::MIMGEncGfx10Default; + break; + case AMDGPU::MIMGEncGfx11NSA: + NewEncoding = AMDGPU::MIMGEncGfx11Default; + break; + default: + return; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned NewAddrDwords = Info->VAddrDwords; @@ -246,16 +278,23 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } unsigned VgprBase = 0; + unsigned NextVgpr = 0; bool IsUndef = true; bool IsKill = NewAddrDwords == Info->VAddrDwords; - for (unsigned i = 0; i < Info->VAddrDwords; ++i) { - const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); - unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); + for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx); + unsigned Vgpr = TRI->getHWRegIndex(Op.getReg()); + unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32; + assert(Dwords > 0 && "Un-implemented for less than 32 bit regs"); - if (i == 0) { + if (Idx == 0) { VgprBase = Vgpr; - } else if (VgprBase + i != Vgpr) + NextVgpr = Vgpr + Dwords; + } else if (Vgpr == NextVgpr) { + NextVgpr = Vgpr + Dwords; + } else { return; + } if (!Op.isUndef()) IsUndef = false; @@ -288,21 +327,108 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } } - 
unsigned NewOpcode = - AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, - Info->VDataDwords, NewAddrDwords); + unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding, + Info->VDataDwords, NewAddrDwords); MI.setDesc(TII->get(NewOpcode)); MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); MI.getOperand(VAddr0Idx).setIsKill(IsKill); - for (unsigned i = 1; i < Info->VAddrDwords; ++i) - MI.RemoveOperand(VAddr0Idx + 1); + for (int i = 1; i < Info->VAddrOperands; ++i) + MI.removeOperand(VAddr0Idx + 1); if (ToUntie >= 0) { MI.tieOperands( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), - ToUntie - (Info->VAddrDwords - 1)); + ToUntie - (Info->VAddrOperands - 1)); + } +} + +// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. +void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { + if (!ST->hasVOP3Literal()) + return; + + if (TII->hasAnyModifiersSet(MI)) + return; + + const unsigned Opcode = MI.getOpcode(); + MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2); + unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END; + + bool Swap; + + // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form. + if (Src2.isImm() && !TII->isInlineConstant(Src2)) { + if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg())) + Swap = false; + else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADAK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAAK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADAK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + } + } + + // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form. + if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) { + if (Src1.isImm() && !TII->isInlineConstant(Src1)) + Swap = false; + else if (Src0.isImm() && !TII->isInlineConstant(Src0)) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADMK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAMK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADMK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + } + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) + return; + + if (Swap) { + // Swap Src0 and Src1 by building a new instruction. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode), + MI.getOperand(0).getReg()) + .add(Src1) + .add(Src0) + .add(Src2) + .setMIFlags(MI.getFlags()); + MI.eraseFromParent(); + } else { + TII->removeModOperands(MI); + MI.setDesc(TII->get(NewOpcode)); } } @@ -311,10 +437,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). 
/// \returns true if the caller should continue the machine function iterator
-static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
-                                MachineRegisterInfo &MRI,
-                                const SIInstrInfo *TII,
-                                MachineInstr &MI) {
+bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
   const MachineOperand *Dest = &MI.getOperand(0);
   MachineOperand *Src0 = &MI.getOperand(1);
@@ -323,7 +446,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
   MachineOperand *SrcImm = Src1;
 
   if (!SrcImm->isImm() ||
-      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
+      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
     return false;
 
   uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
@@ -333,7 +456,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     if (isPowerOf2_32(~Imm)) {
       NewImm = countTrailingOnes(Imm);
       Opc = AMDGPU::S_BITSET0_B32;
-    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_ANDN2_B32;
     }
@@ -341,12 +464,12 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     if (isPowerOf2_32(Imm)) {
       NewImm = countTrailingZeros(Imm);
       Opc = AMDGPU::S_BITSET1_B32;
-    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_ORN2_B32;
     }
   } else if (Opc == AMDGPU::S_XOR_B32) {
-    if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_XNOR_B32;
     }
@@ -354,16 +477,10 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     llvm_unreachable("unexpected opcode");
   }
 
-  if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
-      SrcImm == Src0) {
-    if (!TII->commuteInstruction(MI, false, 1, 2))
-      NewImm = 0;
-  }
-
   if (NewImm != 0) {
     if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
-      MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
-      MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
       return true;
     }
 
@@ -390,19 +507,19 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
 
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
-static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
-                          Register Reg, unsigned SubReg,
-                          const SIRegisterInfo &TRI) {
+bool SIShrinkInstructions::instAccessReg(
+    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
+    unsigned SubReg) const {
   for (const MachineOperand &MO : R) {
     if (!MO.isReg())
       continue;
 
     if (Reg.isPhysical() && MO.getReg().isPhysical()) {
-      if (TRI.regsOverlap(Reg, MO.getReg()))
+      if (TRI->regsOverlap(Reg, MO.getReg()))
         return true;
     } else if (MO.getReg() == Reg && Reg.isVirtual()) {
-      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
-                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
+      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
+                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
       if (Overlap.any())
         return true;
     }
@@ -410,33 +527,31 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
   return false;
 }
 
-static bool instReadsReg(const MachineInstr *MI,
-                         unsigned Reg, unsigned SubReg,
-                         const SIRegisterInfo &TRI) {
-  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
+                                        unsigned SubReg) const {
+  return instAccessReg(MI->uses(), Reg, SubReg);
 }
 
-static bool instModifiesReg(const MachineInstr *MI,
-                            unsigned Reg, unsigned SubReg,
-                            const SIRegisterInfo &TRI) {
-  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
+                                           unsigned SubReg) const {
+  return instAccessReg(MI->defs(), Reg, SubReg);
 }
 
-static TargetInstrInfo::RegSubRegPair
-getSubRegForIndex(Register Reg, unsigned Sub, unsigned I,
-                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
-  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+TargetInstrInfo::RegSubRegPair
+SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
+                                        unsigned I) const {
+  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
     if (Reg.isPhysical()) {
-      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
     } else {
-      Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
+      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
     }
   }
   return TargetInstrInfo::RegSubRegPair(Reg, Sub);
 }
 
-static void dropInstructionKeepingImpDefs(MachineInstr &MI,
-                                          const SIInstrInfo *TII) {
+void SIShrinkInstructions::dropInstructionKeepingImpDefs(
+    MachineInstr &MI) const {
   for (unsigned i = MI.getDesc().getNumOperands() +
                     MI.getDesc().getNumImplicitUses() +
                     MI.getDesc().getNumImplicitDefs(),
       e = MI.getNumOperands();
@@ -464,14 +579,13 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI,
 // Returns next valid instruction pointer if was able to create v_swap_b32.
 //
 // This shall not be done too early not to prevent possible folding which may
-// remove matched moves, and this should prefereably be done before RA to
+// remove matched moves, and this should preferably be done before RA to
 // release saved registers and also possibly after RA which can insert copies
 // too.
 //
-// This is really just a generic peephole that is not a canocical shrinking,
+// This is really just a generic peephole that is not a canonical shrinking,
 // although requirements match the pass placement and it reduces code size too.
-static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, - const SIInstrInfo *TII) { +MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || MovT.getOpcode() == AMDGPU::COPY); @@ -486,8 +600,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, unsigned Size = TII->getOpSize(MovT, 0) / 4; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - if (!TRI.isVGPR(MRI, X)) + if (!TRI->isVGPR(*MRI, X)) return nullptr; if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0)) @@ -501,7 +614,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { MachineInstr *MovY = &*Iter; - KilledT = MovY->killsRegister(T, &TRI); + KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && MovY->getOpcode() != AMDGPU::COPY) || @@ -514,21 +627,20 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Register Y = MovY->getOperand(0).getReg(); unsigned Ysub = MovY->getOperand(0).getSubReg(); - if (!TRI.isVGPR(MRI, Y)) + if (!TRI->isVGPR(*MRI, Y)) continue; MachineInstr *MovX = nullptr; for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); I != IY; ++I) { - if (instReadsReg(&*I, X, Xsub, TRI) || - instModifiesReg(&*I, Y, Ysub, TRI) || - instModifiesReg(&*I, T, Tsub, TRI) || - (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { + if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) || + instModifiesReg(&*I, T, Tsub) || + (MovX && instModifiesReg(&*I, X, Xsub))) { MovX = nullptr; break; } - if (!instReadsReg(&*I, Y, Ysub, TRI)) { - if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { + if (!instReadsReg(&*I, Y, Ysub)) { + if (!MovX && instModifiesReg(&*I, X, Xsub)) { MovX = nullptr; break; } @@ -559,8 +671,8 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, for (unsigned I = 0; I < Size; ++I) { TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); - Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); MachineBasicBlock &MBB = *MovT.getParent(); auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), TII->get(AMDGPU::V_SWAP_B32)) @@ -570,23 +682,23 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, .addReg(X1.Reg, 0, X1.SubReg).getInstr(); if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { // Drop implicit EXEC. 
- MIB->RemoveOperand(MIB->getNumExplicitOperands()); + MIB->removeOperand(MIB->getNumExplicitOperands()); MIB->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); - dropInstructionKeepingImpDefs(*MovY, TII); + dropInstructionKeepingImpDefs(*MovY); MachineInstr *Next = &*std::next(MovT.getIterator()); - if (T.isVirtual() && MRI.use_nodbg_empty(T)) { - dropInstructionKeepingImpDefs(MovT, TII); + if (T.isVirtual() && MRI->use_nodbg_empty(T)) { + dropInstructionKeepingImpDefs(MovT); } else { Xop.setIsKill(false); for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) { unsigned OpNo = MovT.getNumExplicitOperands() + I; const MachineOperand &Op = MovT.getOperand(OpNo); - if (Op.isKill() && TRI.regsOverlap(X, Op.getReg())) - MovT.RemoveOperand(OpNo); + if (Op.isKill() && TRI->regsOverlap(X, Op.getReg())) + MovT.removeOperand(OpNo); } } @@ -596,14 +708,32 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, return nullptr; } +// If an instruction has dead sdst replace it with NULL register on gfx1030+ +bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const { + if (!ST->hasGFX10_3Insts()) + return false; + + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (!Op) + return false; + Register SDstReg = Op->getReg(); + if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg)) + return false; + + Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64); + return true; +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - MachineRegisterInfo &MRI = MF.getRegInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + MRI = &MF.getRegInfo(); + ST = &MF.getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; std::vector<unsigned> I1Defs; @@ -628,7 +758,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { int32_t ReverseImm; - if (isReverseInlineImm(TII, Src, ReverseImm)) { + if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); Src.setImm(ReverseImm); continue; @@ -636,19 +766,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } } - if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || - MI.getOpcode() == AMDGPU::COPY)) { - if (auto *NextMI = matchSwap(MI, MRI, TII)) { + if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::COPY)) { + if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); continue; } } - // FIXME: We also need to consider movs of constant operands since - // immediate operands are not folded if they have more than one use, and - // the operand folding pass is unaware if the immediate will be free since - // it won't know if the src == dest constraint will end up being - // satisfied. + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); @@ -664,13 +790,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage.
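// An illustrative payoff of the coalescing hint set just below (register
// and immediate assumed): once dst and src0 share an SGPR,
//   s_add_i32 s4, s4, 0x1234    ; SOP2, needs a literal word
// can be rewritten as
//   s_addk_i32 s4, 0x1234       ; SOPK, signed 16-bit inline immediate
// which is the S_ADDK_I32/S_MULK_I32 conversion performed further down.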
if (Dest->getReg().isVirtual() && Src0->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); - MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); + MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); + MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { - if (Src1->isImm() && isKImmOperand(TII, *Src1)) { + if (Src1->isImm() && isKImmOperand(*Src1)) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; @@ -682,7 +808,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { - shrinkScalarCompare(TII, MI); + shrinkScalarCompare(MI); continue; } @@ -693,9 +819,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src.isImm() && Dst.getReg().isPhysical()) { int32_t ReverseImm; - if (isKImmOperand(TII, Src)) + if (isKImmOperand(Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - else if (isReverseInlineImm(TII, Src, ReverseImm)) { + else if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); Src.setImm(ReverseImm); } @@ -708,47 +834,70 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::S_AND_B32 || MI.getOpcode() == AMDGPU::S_OR_B32 || MI.getOpcode() == AMDGPU::S_XOR_B32) { - if (shrinkScalarLogicOp(ST, MRI, TII, MI)) + if (shrinkScalarLogicOp(MI)) continue; } if (TII->isMIMG(MI.getOpcode()) && - ST.getGeneration() >= AMDGPUSubtarget::GFX10 && + ST->getGeneration() >= AMDGPUSubtarget::GFX10 && MF.getProperties().hasProperty( MachineFunctionProperties::Property::NoVRegs)) { shrinkMIMG(MI); continue; } - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + if (!TII->isVOP3(MI)) continue; - if (!TII->canShrink(MI, MRI)) { + if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || + MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_e64) { + shrinkMadFma(MI); + continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) { + // If there is no chance we will shrink it and use VCC as sdst to get + // a 32 bit form try to replace dead sdst with NULL. + tryReplaceDeadSDST(MI); + continue; + } + + if (!TII->canShrink(MI, *MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || - !TII->canShrink(MI, MRI)) + !TII->canShrink(MI, *MRI)) { + tryReplaceDeadSDST(MI); continue; + } } int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { - Register DstReg = MI.getOperand(0).getReg(); - if (DstReg.isVirtual()) { - // VOPC instructions can only write to the VCC register. We can't - // force them to use VCC here, because this is only one register and - // cannot deal with sequences which would require multiple copies of - // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) - // - // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we will run - // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); - continue; + MachineOperand &Op0 = MI.getOperand(0); + if (Op0.isReg()) { + // Exclude VOPCX instructions as these don't explicitly write a + // dst. 
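// Illustrative shape of the VOPC shrink being prepared here (registers
// assumed):
//   v_cmp_lt_f32_e64 s[0:1], v0, v1    ; VOP3 form, any SGPR pair as dst
//   v_cmp_lt_f32_e32 vcc, v0, v1       ; 32-bit form, dst fixed to VCC
// The e32 encoding can only target VCC, hence the allocation hint below
// rather than a hard constraint.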
+ Register DstReg = Op0.getReg(); + if (DstReg.isVirtual()) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) + // + // So, instead of forcing the instruction to write to VCC, we + // provide a hint to the register allocator to use VCC and then we + // will run this pass again after RA and shrink it if it outputs to + // VCC. + MRI->setRegAllocationHint(DstReg, 0, VCCReg); + continue; + } + if (DstReg != VCCReg) + continue; } - if (DstReg != VCCReg) - continue; } if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { @@ -760,7 +909,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; Register SReg = Src2->getReg(); if (SReg.isVirtual()) { - MRI.setRegAllocationHint(SReg, 0, VCCReg); + MRI->setRegAllocationHint(SReg, 0, VCCReg); continue; } if (SReg != VCCReg) @@ -776,7 +925,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (SDst->getReg() != VCCReg) { if (SDst->getReg().isVirtual()) - MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg); Next = true; } @@ -786,7 +935,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) - MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg); Next = true; } @@ -801,14 +950,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { ++NumInstructionsShrunk; // Copy extra operands not present in the instruction definition. - copyExtraImplicitOps(*Inst32, MF, MI); + copyExtraImplicitOps(*Inst32, MI); // Copy deadness from the old explicit vcc def to the new implicit def. if (SDst && SDst->isDead()) Inst32->findRegisterDefOperand(VCCReg)->setIsDead(); MI.eraseFromParent(); - foldImmediates(*Inst32, TII, MRI); + foldImmediates(*Inst32); LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46efb3c605c6..a5798afab595 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -349,8 +349,7 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, const VNInfo *NextValue = nullptr; const VisitKey Key(Value, DefinedLanes); - if (!Visited.count(Key)) { - Visited.insert(Key); + if (Visited.insert(Key).second) { // On first visit to a phi then start processing first predecessor NextPredIdx = 0; } @@ -535,13 +534,36 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateStrictWWM; LowerToMovInstrs.push_back(&MI); continue; - } else if (Opcode == AMDGPU::STRICT_WQM) { + } else if (Opcode == AMDGPU::STRICT_WQM || + TII->isDualSourceBlendEXP(MI)) { // STRICT_WQM is similar to STRICTWWM, but instead of enabling all // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in // quads that have at least one active thread. 
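// A small worked example of the distinction, assuming 4-lane quads: with
// an EXEC mask of 0x10 (one live lane in the second quad), strict WWM
// enables every lane of the wave, while strict WQM enables only quads
// that already have a live lane, i.e. lanes 4-7, giving 0xF0.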
markInstructionUses(MI, StateStrictWQM, Worklist); GlobalFlags |= StateStrictWQM; - LowerToMovInstrs.push_back(&MI); + + if (Opcode == AMDGPU::STRICT_WQM) { + LowerToMovInstrs.push_back(&MI); + } else { + // Dual source blend export acts as implicit strict-wqm, its sources + // need to be shuffled in strict wqm, but the export itself needs to + // run in exact mode. + BBI.Needs |= StateExact; + if (!(BBI.InNeeds & StateExact)) { + BBI.InNeeds |= StateExact; + Worklist.push_back(MBB); + } + GlobalFlags |= StateExact; + III.Disabled = StateWQM | StateStrict; + } + continue; + } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || + Opcode == AMDGPU::LDS_DIRECT_LOAD) { + // Mark these STRICTWQM, but only for the instruction, not its operands. + // This avoid unnecessarily marking M0 as requiring WQM. + InstrInfo &II = Instructions[&MI]; + II.Needs |= StateStrictWQM; + GlobalFlags |= StateStrictWQM; continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { @@ -969,7 +991,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, MachineInstr *WQMMaskMI = nullptr; Register LiveMaskWQM; if (IsDemote) { - // Demotes deactive quads with only helper lanes + // Demote - deactivate quads with only helper lanes LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); @@ -977,7 +999,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, .addReg(Exec) .addReg(LiveMaskWQM); } else { - // Kills deactivate lanes + // Kill - deactivate lanes no longer in live mask if (Op.isImm()) { unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); @@ -1453,7 +1475,7 @@ void SIWholeQuadMode::lowerCopyInstrs() { } int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); while (Index >= 0) { - MI->RemoveOperand(Index); + MI->removeOperand(Index); Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); } MI->setDesc(TII->get(AMDGPU::COPY)); @@ -1468,7 +1490,7 @@ void SIWholeQuadMode::lowerCopyInstrs() { // an undef input so it is being replaced by a simple copy. // There should be a second undef source that we should remove. assert(MI->getOperand(2).isUndef()); - MI->RemoveOperand(2); + MI->removeOperand(2); MI->untieRegOperand(1); } else { assert(MI->getNumExplicitOperands() == 2); @@ -1588,11 +1610,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Physical registers like SCC aren't tracked by default anyway, so just // removing the ranges we computed is the simplest option for maintaining // the analysis results. 
- LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); // If we performed any kills then recompute EXEC if (!KillInstrs.empty()) - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); return true; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 184c871db775..882d13402a19 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -11,13 +11,19 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8", let OperandType = "OPERAND_IMMEDIATE"; } -def smem_offset : NamedOperandU32<"SMEMOffset", - NamedMatchClass<"SMEMOffset">> { +class SMEMOffset : NamedOperandU32<"SMEMOffset", + NamedMatchClass<"SMEMOffset">> { let OperandType = "OPERAND_IMMEDIATE"; let EncoderMethod = "getSMEMOffsetEncoding"; let DecoderMethod = "decodeSMEMOffset"; } +def smem_offset : SMEMOffset; + +def smem_offset_mod : SMEMOffset { + let PrintMethod = "printSMEMOffsetMod"; +} + //===----------------------------------------------------------------------===// // Scalar Memory classes //===----------------------------------------------------------------------===// @@ -43,13 +49,13 @@ class SM_Pseudo patt bits<1> has_sdst = 1; bit has_glc = 0; bit has_dlc = 0; - bits<1> has_offset = 1; - bits<1> offset_is_imm = 0; + bit has_offset = 0; + bit has_soffset = 0; bit is_buffer = 0; } -class SM_Real - : InstSI { +class SM_Real + : InstSI { let isPseudo = 0; let isCodeGenOnly = 0; @@ -77,20 +83,40 @@ class SM_Real bits<7> sbase; bits<7> sdst; bits<32> offset; - bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); + bits<8> soffset; bits<5> cpol; } -class SM_Probe_Pseudo - : SM_Pseudo { +class OffsetMode { + bit HasOffset = hasOffset; + bit HasSOffset = hasSOffset; + string Variant = variant; + dag Ins = ins; + string Asm = asm; +} + +def IMM_Offset : OffsetMode<1, 0, "_IMM", (ins smem_offset:$offset), "$offset">; +def SGPR_Offset : OffsetMode<0, 1, "_SGPR", (ins SReg_32:$soffset), "$soffset">; +def SGPR_IMM_Offset : OffsetMode<1, 1, "_SGPR_IMM", + (ins SReg_32:$soffset, smem_offset_mod:$offset), + "$soffset$offset">; + +class SM_Probe_Pseudo + : SM_Pseudo { let mayLoad = 0; let mayStore = 0; let has_glc = 0; let LGKM_CNT = 0; let ScalarStore = 0; let hasSideEffects = 1; - let offset_is_imm = isImm; - let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); + let has_offset = hasOffset; + let has_soffset = hasSOffset; + let PseudoInstr = opName # variant; } class SM_Load_Pseudo pattern=[]> @@ -102,10 +128,11 @@ class SM_Load_Pseudo let has_dlc = 1; } -class SM_Store_Pseudo pattern = []> - : SM_Pseudo { - RegisterClass BaseClass; - RegisterClass SrcClass; +class SM_Store_Pseudo + : SM_Pseudo { + RegisterClass BaseClass = baseClass; + RegisterClass SrcClass = srcClass; let mayLoad = 0; let mayStore = 1; let has_glc = 1; @@ -113,16 +140,19 @@ class SM_Store_Pseudo pattern let ScalarStore = 1; } -class SM_Discard_Pseudo - : SM_Pseudo { +class SM_Discard_Pseudo + : SM_Pseudo { let mayLoad = 0; let mayStore = 0; let has_glc = 0; let has_sdst = 0; let ScalarStore = 0; let hasSideEffects = 1; - let offset_is_imm = isImm; - let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); + let has_offset = hasOffset; + let has_soffset = hasSOffset; + let PseudoInstr = opName # variant; } multiclass SM_Pseudo_Loads { - let offset_is_imm = 1; + let has_offset = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_IMM"; let 
has_glc = 1; @@ -141,39 +171,63 @@ multiclass SM_Pseudo_Loads { + (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol), + " $sdst, $sbase, $soffset$cpol", []> { + let has_soffset = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_SGPR"; let has_glc = 1; let has_dlc = 1; } + + def _SGPR_IMM : SM_Load_Pseudo { + let has_offset = 1; + let has_soffset = 1; + let BaseClass = baseClass; + let PseudoInstr = opName # "_SGPR_IMM"; + let has_glc = 1; + let has_dlc = 1; + } } multiclass SM_Pseudo_Stores { - def _IMM : SM_Store_Pseudo { - let offset_is_imm = 1; - let BaseClass = baseClass; - let SrcClass = srcClass; + " $sdata, $sbase, $offset$cpol"> { + let has_offset = 1; let PseudoInstr = opName # "_IMM"; } - def _SGPR : SM_Store_Pseudo { - let BaseClass = baseClass; - let SrcClass = srcClass; + def _SGPR : SM_Store_Pseudo { + let has_soffset = 1; let PseudoInstr = opName # "_SGPR"; } + + def _SGPR_IMM : SM_Store_Pseudo { + let has_offset = 1; + let has_soffset = 1; + let PseudoInstr = opName # "_SGPR_IMM"; + } } multiclass SM_Pseudo_Discards { - def _IMM : SM_Discard_Pseudo ; - def _SGPR : SM_Discard_Pseudo ; + def _IMM : SM_Discard_Pseudo ; + def _SGPR : SM_Discard_Pseudo ; + def _SGPR_IMM : SM_Discard_Pseudo ; } class SM_Time_Pseudo : SM_Pseudo< @@ -184,21 +238,24 @@ class SM_Time_Pseudo : SM_Pse let mayStore = 0; let mayLoad = 0; let has_sbase = 0; - let has_offset = 0; } class SM_Inval_Pseudo : SM_Pseudo< opName, (outs), (ins), "", [(node)]> { let hasSideEffects = 1; + let mayLoad = 0; let mayStore = 0; let has_sdst = 0; let has_sbase = 0; - let has_offset = 0; } multiclass SM_Pseudo_Probe { - def _IMM : SM_Probe_Pseudo ; - def _SGPR : SM_Probe_Pseudo ; + def _IMM : SM_Probe_Pseudo ; + def _SGPR : SM_Probe_Pseudo ; + def _SGPR_IMM : SM_Probe_Pseudo ; } class SM_WaveId_Pseudo : SM_Pseudo< @@ -206,9 +263,8 @@ class SM_WaveId_Pseudo : SM_Pseudo< " $sdst", [(set i32:$sdst, (node))]> { let hasSideEffects = 1; let mayStore = 0; - let mayLoad = 1; + let mayLoad = 0; let has_sbase = 0; - let has_offset = 0; } //===----------------------------------------------------------------------===// @@ -225,6 +281,7 @@ class SM_Atomic_Pseudo : SM_Atomic_Pseudo, AtomicNoRet { - let offset_is_imm = isImm; + let has_offset = offsets.HasOffset; + let has_soffset = offsets.HasSOffset; let PseudoInstr = opNameWithSuffix; let Constraints = !if(isRet, "$sdst = $sdata", ""); @@ -264,10 +321,12 @@ class SM_Pseudo_Atomic { - def _IMM : SM_Pseudo_Atomic ; - def _SGPR : SM_Pseudo_Atomic ; - def _IMM_RTN : SM_Pseudo_Atomic ; - def _SGPR_RTN : SM_Pseudo_Atomic ; + def _IMM : SM_Pseudo_Atomic ; + def _SGPR : SM_Pseudo_Atomic ; + def _SGPR_IMM : SM_Pseudo_Atomic ; + def _IMM_RTN : SM_Pseudo_Atomic ; + def _SGPR_RTN : SM_Pseudo_Atomic ; + def _SGPR_IMM_RTN : SM_Pseudo_Atomic ; } //===----------------------------------------------------------------------===// @@ -452,16 +511,14 @@ class SMRD_Real_si op, SM_Pseudo ps> let AssemblerPredicate = isGFX6GFX7; let DecoderNamespace = "GFX6GFX7"; - let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); - let Inst{8} = imm; + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?)); + let Inst{8} = ps.has_offset; let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding } -// FIXME: Assembler should reject trying to use glc on SMRD -// instructions on SI. 
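// The three offset variants correspond roughly to these assembly forms
// (operands assumed; exact printing comes from printSMEMOffsetMod):
//   s_load_dword s0, s[2:3], 0x10              // _IMM
//   s_load_dword s0, s[2:3], s4                // _SGPR
//   s_load_dword s0, s[2:3], s4 offset:0x10    // _SGPR_IMM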
multiclass SM_Real_Loads_si op, string ps, SM_Load_Pseudo immPs = !cast(ps#_IMM), SM_Load_Pseudo sgprPs = !cast(ps#_SGPR)> { @@ -470,10 +527,8 @@ multiclass SM_Real_Loads_si op, string ps, let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol); } - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo def _SGPR_si : SMRD_Real_si { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); } } @@ -494,42 +549,82 @@ def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>; //===----------------------------------------------------------------------===// -// VI +// VI and GFX9. //===----------------------------------------------------------------------===// class SMEM_Real_vi op, SM_Pseudo ps> : SM_Real , SIMCInstr , Enc64 { - let AssemblerPredicate = isGFX8GFX9; + field bit IsGFX9SpecificEncoding = false; + let AssemblerPredicate = !if(IsGFX9SpecificEncoding, isGFX9Only, isGFX8GFX9); let DecoderNamespace = "GFX8"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); + // Note that for GFX9 instructions with immediate offsets, soffset_en + // must be defined, whereas in GFX8 it's undefined in all cases, + // meaning GFX9 is not perfectly backward-compatible with GFX8, despite + // documentation suggesting otherwise. + field bit SOffsetEn = !if(IsGFX9SpecificEncoding, + !if(ps.has_offset, ps.has_soffset, !if(ps.has_soffset, 0, ?)), + ?); + let Inst{14} = SOffsetEn; + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); - let Inst{17} = imm; + + // imm + // TODO: Shall not be defined if the instruction has no offset nor + // soffset. + let Inst{17} = ps.has_offset; + let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed. // Offset value is corrected accordingly when offset is encoded/decoded. - let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?); - let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?); + // TODO: Forbid non-M0 register offsets for GFX8 stores and atomics. + field bits<21> Offset; + let Offset{6-0} = !if(ps.has_offset, offset{6-0}, + !if(ps.has_soffset, soffset{6-0}, ?)); + let Offset{20-7} = !if(ps.has_offset, offset{20-7}, ?); + let Inst{52-32} = Offset; + + // soffset + let Inst{63-57} = !if(!and(IsGFX9SpecificEncoding, ps.has_soffset), + soffset{6-0}, ?); } -multiclass SM_Real_Loads_vi op, string ps, - SM_Load_Pseudo immPs = !cast(ps#_IMM), - SM_Load_Pseudo sgprPs = !cast(ps#_SGPR)> { - def _IMM_vi : SMEM_Real_vi { - let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } - def _SGPR_vi : SMEM_Real_vi { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); - } +class SMEM_Real_Load_vi op, string ps, dag offsets> + : SMEM_Real_vi(ps)> { + RegisterClass BaseClass = !cast(ps).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); } -class SMEM_Real_Store_vi op, SM_Pseudo ps> : SMEM_Real_vi { +// The alternative GFX9 SGPR encoding using soffset to encode the +// offset register. Not available in assembler and goes to the GFX9 +// encoding family to avoid conflicts with the primary SGPR variant. 
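// Sketch of the encoding difference, per the SMEM_Real_vi layout above:
// the primary _SGPR variant carries the offset register in the low
// offset bits, Inst{38-32}, with soffset_en (Inst{14}) clear, while this
// alternative carries it in the soffset field, Inst{63-57}, with
// soffset_en set; both decode to the same operation.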
+class SMEM_Real_SGPR_alt_gfx9 { + bit IsGFX9SpecificEncoding = true; + bit SOffsetEn = 1; + bit Offset = ?; + int Subtarget = SIEncodingFamily.GFX9; + string AsmVariantName = "NonParsable"; +} + +multiclass SM_Real_Loads_vi op, string ps> { + def _IMM_vi : SMEM_Real_Load_vi ; + def _SGPR_vi : SMEM_Real_Load_vi ; + def _SGPR_alt_gfx9 : SMEM_Real_Load_vi , + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi < + op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>; +} + +class SMEM_Real_Store_Base_vi op, SM_Pseudo ps> : SMEM_Real_vi { // encoding bits<7> sdata; @@ -537,23 +632,34 @@ class SMEM_Real_Store_vi op, SM_Pseudo ps> : SMEM_Real_vi { let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); } -multiclass SM_Real_Stores_vi op, string ps, - SM_Store_Pseudo immPs = !cast(ps#_IMM), - SM_Store_Pseudo sgprPs = !cast(ps#_SGPR)> { - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo - def _IMM_vi : SMEM_Real_Store_vi { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } +class SMEM_Real_Store_vi op, string ps, dag offsets> + : SMEM_Real_Store_Base_vi (ps)> { + RegisterClass SrcClass = !cast(ps).SrcClass; + RegisterClass BaseClass = !cast(ps).BaseClass; + let InOperandList = !con((ins SrcClass:$sdata, BaseClass:$sbase), + offsets, (ins CPol:$cpol)); +} - def _SGPR_vi : SMEM_Real_Store_vi { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); - } +multiclass SM_Real_Stores_vi op, string ps> { + def _IMM_vi : SMEM_Real_Store_vi ; + def _SGPR_vi : SMEM_Real_Store_vi ; + def _SGPR_alt_gfx9 : SMEM_Real_Store_vi , + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi < + op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>; } multiclass SM_Real_Probe_vi op, string ps> { - def _IMM_vi : SMEM_Real_Store_vi (ps#_IMM)>; - def _SGPR_vi : SMEM_Real_Store_vi (ps#_SGPR)>; + def _IMM_vi : SMEM_Real_Store_Base_vi (ps#_IMM)>; + def _SGPR_vi : SMEM_Real_Store_Base_vi (ps#_SGPR)>; + def _SGPR_alt_gfx9 + : SMEM_Real_Store_Base_vi (ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 + : SMEM_Real_Store_Base_vi (ps#_SGPR_IMM)>; } defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; @@ -614,8 +720,20 @@ class SMEM_Atomic_Real_vi op, SM_Atomic_Pseudo ps> multiclass SM_Real_Atomics_vi op, string ps> { def _IMM_vi : SMEM_Atomic_Real_vi (ps#_IMM)>; def _SGPR_vi : SMEM_Atomic_Real_vi (ps#_SGPR)>; + def _SGPR_alt_gfx9 + : SMEM_Atomic_Real_vi (ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 + : SMEM_Atomic_Real_vi (ps#_SGPR_IMM)>; def _IMM_RTN_vi : SMEM_Atomic_Real_vi (ps#_IMM_RTN)>; def _SGPR_RTN_vi : SMEM_Atomic_Real_vi (ps#_SGPR_RTN)>; + def _SGPR_RTN_alt_gfx9 + : SMEM_Atomic_Real_vi (ps#_SGPR_RTN)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_RTN_gfx9 + : SMEM_Atomic_Real_vi (ps#_SGPR_IMM_RTN)>; } defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">; @@ -677,6 +795,10 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2"> multiclass SM_Real_Discard_vi op, string ps> { def _IMM_vi : SMEM_Real_vi (ps#_IMM)>; def _SGPR_vi : SMEM_Real_vi (ps#_SGPR)>; + def _SGPR_alt_gfx9 : SMEM_Real_vi (ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + 
def _SGPR_IMM_gfx9 : SMEM_Real_vi (ps#_SGPR_IMM)>; } defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">; @@ -727,8 +849,8 @@ class SMRD_Real_ci op, SM_Pseudo ps> let AssemblerPredicate = isGFX7Only; let DecoderNamespace = "GFX7"; - let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); - let Inst{8} = imm; + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?)); + let Inst{8} = ps.has_offset; let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); let Inst{26-22} = op; @@ -876,20 +998,27 @@ def : GCNPat < // GFX10. //===----------------------------------------------------------------------===// -class SMEM_Real_gfx10 op, SM_Pseudo ps> : - SM_Real, SIMCInstr, Enc64 { - let AssemblerPredicate = isGFX10Plus; - let DecoderNamespace = "GFX10"; - +class SMEM_Real_10Plus_common op, SM_Pseudo ps, string opName, + int subtarget, RegisterWithSubRegs sgpr_null> : + SM_Real, SIMCInstr, Enc64 { let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); - let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?); - let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); let Inst{25-18} = op; let Inst{31-26} = 0x3d; - let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?); - let Inst{63-57} = !if(ps.offset_is_imm, !cast(SGPR_NULL.HWEncoding), - !if(ps.has_offset, offset{6-0}, ?)); + // There are SMEM instructions that do not employ any of the offset + // fields, in which case we need them to remain undefined. + let Inst{52-32} = !if(ps.has_offset, offset{20-0}, !if(ps.has_soffset, 0, ?)); + let Inst{63-57} = !if(ps.has_soffset, soffset{6-0}, + !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?)); +} + +class SMEM_Real_gfx10 op, SM_Pseudo ps> + : SMEM_Real_10Plus_common { + let AssemblerPredicate = isGFX10Only; + let DecoderNamespace = "GFX10"; + let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?); + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); } multiclass SM_Real_Loads_gfx10 op, string ps, @@ -899,7 +1028,11 @@ multiclass SM_Real_Loads_gfx10 op, string ps, let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_gfx10 { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); + } + def _SGPR_IMM_gfx10 : SMEM_Real_gfx10(ps#_SGPR_IMM)> { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, + smem_offset_mod:$offset, CPol:$cpol); } } @@ -913,14 +1046,17 @@ class SMEM_Real_Store_gfx10 op, SM_Pseudo ps> : SMEM_Real_gfx10 multiclass SM_Real_Stores_gfx10 op, string ps, SM_Store_Pseudo immPs = !cast(ps#_IMM), SM_Store_Pseudo sgprPs = !cast(ps#_SGPR)> { - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo def _IMM_gfx10 : SMEM_Real_Store_gfx10 { let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_Store_gfx10 { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); + } + + def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 (ps#_SGPR_IMM)> { + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, + SReg_32:$soffset, smem_offset_mod:$offset, CPol:$cpol); } } @@ -969,6 +1105,8 @@ def S_DCACHE_WB_gfx10 : 
SMEM_Real_gfx10<0x021, S_DCACHE_WB>; multiclass SM_Real_Probe_gfx10 op, string ps> { def _IMM_gfx10 : SMEM_Real_Store_gfx10 (ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_Store_gfx10 (ps#_SGPR)>; + def _SGPR_IMM_gfx10 + : SMEM_Real_Store_gfx10 (ps#_SGPR_IMM)>; } defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; @@ -992,8 +1130,10 @@ class SMEM_Atomic_Real_gfx10 op, SM_Atomic_Pseudo ps> multiclass SM_Real_Atomics_gfx10 op, string ps> { def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_IMM)>; def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR)>; + def _SGPR_IMM_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR_IMM)>; def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_IMM_RTN)>; def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR_RTN)>; + def _SGPR_IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR_IMM_RTN)>; } let SubtargetPredicate = HasScalarAtomics in { @@ -1057,6 +1197,7 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X multiclass SM_Real_Discard_gfx10 op, string ps> { def _IMM_gfx10 : SMEM_Real_gfx10 (ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_gfx10 (ps#_SGPR)>; + def _SGPR_IMM_gfx10 : SMEM_Real_gfx10 (ps#_SGPR_IMM)>; } defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; @@ -1072,3 +1213,64 @@ def SMInfoTable : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getSMEMOpcodeHelper"; } + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx11 op, SM_Pseudo ps, string opName = ps.Mnemonic> : + SMEM_Real_10Plus_common { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0); +} + +class SMEM_Real_Load_gfx11 op, string ps, string opName, dag offsets> : + SMEM_Real_gfx11(ps), opName> { + RegisterClass BaseClass = !cast(ps).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); +} + +multiclass SM_Real_Loads_gfx11 op, string ps, string opName> { + def _IMM_gfx11 : SMEM_Real_Load_gfx11; + def _SGPR_gfx11 : SMEM_Real_Load_gfx11; + def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11< + op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>; + def : MnemonicAlias(ps#"_IMM").Mnemonic, opName>, + Requires<[isGFX11Plus]>; +} + +defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD", "s_load_b32">; +defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2", "s_load_b64">; +defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4", "s_load_b128">; +defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8", "s_load_b256">; +defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16", "s_load_b512">; + +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD", "s_buffer_load_b32">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2", "s_buffer_load_b64">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4", "s_buffer_load_b128">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8", "s_buffer_load_b256">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16", "s_buffer_load_b512">; + +def S_GL1_INV_gfx11 : SMEM_Real_gfx11<0x020, S_GL1_INV>; +def S_DCACHE_INV_gfx11 : SMEM_Real_gfx11<0x021, S_DCACHE_INV>; + +class SMEM_Real_Store_gfx11 op, SM_Pseudo ps> : SMEM_Real_gfx11 { + // encoding + bits<7> sdata; + + 
let sdst = ?; + let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); +} + +multiclass SM_Real_Probe_gfx11 op, string ps> { + def _IMM_gfx11 : SMEM_Real_Store_gfx11 (ps#_IMM)>; + def _SGPR_gfx11 : SMEM_Real_Store_gfx11 (ps#_SGPR)>; + def _SGPR_IMM_gfx11 + : SMEM_Real_Store_gfx11 (ps#_SGPR_IMM)>; +} + +defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23, "S_ATC_PROBE_BUFFER">; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 3f7837f7dbf1..37d20045adb5 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -152,8 +152,8 @@ class SOP1_64_0 pattern=[]> : SOP1_Pseudo < } // 64-bit input, no output -class SOP1_1 pattern=[]> : SOP1_Pseudo < - opName, (outs), (ins rc:$src0), "$src0", pattern> { +class SOP1_1 pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SReg_64:$src0), "$src0", pattern> { let has_sdst = 0; } @@ -235,10 +235,10 @@ def : GCNPat < let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def S_BREV_B32 : SOP1_32 <"s_brev_b32", - [(set i32:$sdst, (bitreverse i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; def S_BREV_B64 : SOP1_64 <"s_brev_b64", - [(set i64:$sdst, (bitreverse i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag i64:$src0))] >; } // End isReMaterializable = 1, isAsCheapAsAMove = 1 @@ -276,10 +276,10 @@ def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", - [(set i32:$sdst, (sext_inreg i32:$src0, i8))] + [(set i32:$sdst, (UniformSextInreg i32:$src0))] >; def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", - [(set i32:$sdst, (sext_inreg i32:$src0, i16))] + [(set i32:$sdst, (UniformSextInreg i32:$src0))] >; } // End isReMaterializable = 1 @@ -300,8 +300,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; let isReturn = 1 in { // Define variant marked as return rather than branch. -def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>; -def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>; +def S_SETPC_B64_return : SOP1_1<"">; } } // End isTerminator = 1, isBarrier = 1 @@ -341,7 +340,7 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32", - [(set i32:$sdst, (abs i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; } // End Defs = [SCC] @@ -385,6 +384,21 @@ let SubtargetPredicate = isGFX10Plus in { } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX11Plus in { + let hasSideEffects = 1 in { + // For s_sendmsg_rtn_* the src0 field encodes the message type directly; it + // is not an SGPR number. 
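// These are selected from the new intrinsic, so a use looks roughly like
// (message id assumed for illustration):
//   %v = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
// with the immediate being the message code itself, per the comment
// above.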
+ def S_SENDMSG_RTN_B32 : SOP1_Pseudo< + "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsgImm:$src0), + "$sdst, $src0", [(set i32:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] + >; + def S_SENDMSG_RTN_B64 : SOP1_Pseudo< + "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsgImm:$src0), + "$sdst, $src0", [(set i64:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] + >; + } +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOP2 Instructions //===----------------------------------------------------------------------===// @@ -690,6 +704,10 @@ let SubtargetPredicate = isGFX9Plus in { } // End isCommutable = 1, isReMaterializable = 1 } // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = isGFX11Plus in { + def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// @@ -855,9 +873,7 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < "$sdst, $simm16" >; -let mayLoad = 1 in { -// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow -// its use in the readcyclecounter selection. +// This is hasSideEffects to allow its use in readcyclecounter selection. // FIXME: Need to truncate immediate to 16-bits. def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", @@ -867,7 +883,6 @@ def S_GETREG_B32 : SOPK_Pseudo < let SOPKZext = 1; let hasSideEffects = 1; } -} // End mayLoad = 1 let Defs = [MODE], Uses = [MODE] in { @@ -1169,12 +1184,12 @@ def S_ENDPGM_SAVED : SOPP_Pseudo<"s_endpgm_saved", (ins)> { let isReturn = 1; } -let SubtargetPredicate = isGFX9Plus in { +let SubtargetPredicate = isGFX9GFX10 in { let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in { def S_ENDPGM_ORDERED_PS_DONE : SOPP_Pseudo<"s_endpgm_ordered_ps_done", (ins)>; } // End isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 -} // End SubtargetPredicate = isGFX9Plus +} // End SubtargetPredicate = isGFX9GFX10 let SubtargetPredicate = isGFX10Plus in { let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in { @@ -1279,15 +1294,21 @@ def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), let hasSideEffects = 1; } -def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; +def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16", + [(int_amdgcn_s_setprio timm:$simm16)]> { + let hasSideEffects = 1; +} let Uses = [EXEC, M0] in { -// FIXME: Should this be mayLoad+mayStore? 
def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16", - [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>; + [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { + let hasSideEffects = 1; +} def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16", - [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>; + [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> { + let hasSideEffects = 1; +} } // End Uses = [EXEC, M0] @@ -1341,7 +1362,7 @@ let SubtargetPredicate = isGFX10Plus in { let fixed_imm = 1; } def S_WAITCNT_DEPCTR : - SOPP_Pseudo <"s_waitcnt_depctr" , (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtrImm:$simm16), "$simm16">; let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in { def S_ROUND_MODE : @@ -1355,6 +1376,13 @@ let SubtargetPredicate = isGFX10Plus in { SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">; } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX11Plus in { + def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16), + "$simm16">; + def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins DELAY_FLAG:$simm16), + "$simm16">; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -1377,7 +1405,7 @@ def : GCNPat < >; def : GCNPat < - (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (i32 (UniformBinFrag i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 SReg_32:$x) >; @@ -1408,7 +1436,7 @@ def : GCNPat < // REG_SEQUENCE patterns don't support instructions with multiple // outputs. def : GCNPat< - (i64 (zext i16:$src)), + (i64 (UniformUnaryFrag i16:$src)), (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, (S_MOV_B32 (i32 0)), sub1) @@ -1421,7 +1449,7 @@ def : GCNPat < >; def : GCNPat< - (i32 (zext i16:$src)), + (i32 (UniformUnaryFrag i16:$src)), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) >; @@ -1448,8 +1476,13 @@ def : ScalarNot2Pat; // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +class Select_gfx11 : SIMCInstr { + Predicate AssemblerPredicate = isGFX11Only; + string DecoderNamespace = "GFX11"; +} + class Select_gfx10 : SIMCInstr { - Predicate AssemblerPredicate = isGFX10Plus; + Predicate AssemblerPredicate = isGFX10Only; string DecoderNamespace = "GFX10"; } @@ -1463,6 +1496,87 @@ class Select_gfx6_gfx7 : SIMCInstr { string DecoderNamespace = "GFX6GFX7"; } +//===----------------------------------------------------------------------===// +// GFX11. 
+//===----------------------------------------------------------------------===// + +multiclass SOP1_Real_gfx11 op> { + def _gfx11 : SOP1_Real(NAME)>, + Select_gfx11(NAME).Mnemonic>; +} + +multiclass SOP1_Real_Renamed_gfx11 op, SOP1_Pseudo backing_pseudo, string real_name> { + def _gfx11 : SOP1_Real, + Select_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; +} + +defm S_MOV_B32 : SOP1_Real_gfx11<0x000>; +defm S_MOV_B64 : SOP1_Real_gfx11<0x001>; +defm S_CMOV_B32 : SOP1_Real_gfx11<0x002>; +defm S_CMOV_B64 : SOP1_Real_gfx11<0x003>; +defm S_BREV_B32 : SOP1_Real_gfx11<0x004>; +defm S_BREV_B64 : SOP1_Real_gfx11<0x005>; +defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">; +defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">; +defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">; +defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">; +defm S_CLS_I32 : SOP1_Real_Renamed_gfx11<0x00c, S_FLBIT_I32, "s_cls_i32">; +defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">; +defm S_SEXT_I32_I8 : SOP1_Real_gfx11<0x00e>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx11<0x00f>; +defm S_BITSET0_B32 : SOP1_Real_gfx11<0x010>; +defm S_BITSET0_B64 : SOP1_Real_gfx11<0x011>; +defm S_BITSET1_B32 : SOP1_Real_gfx11<0x012>; +defm S_BITSET1_B64 : SOP1_Real_gfx11<0x013>; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11<0x014>; +defm S_ABS_I32 : SOP1_Real_gfx11<0x015>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx11<0x016>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx11<0x017>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx11<0x018>; +defm S_BCNT1_I32_B64 : SOP1_Real_gfx11<0x019>; +defm S_QUADMASK_B32 : SOP1_Real_gfx11<0x01a>; +defm S_QUADMASK_B64 : SOP1_Real_gfx11<0x01b>; +defm S_WQM_B32 : SOP1_Real_gfx11<0x01c>; +defm S_WQM_B64 : SOP1_Real_gfx11<0x01d>; +defm S_NOT_B32 : SOP1_Real_gfx11<0x01e>; +defm S_NOT_B64 : SOP1_Real_gfx11<0x01f>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x020>; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x021>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x022>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x023>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x024>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x025>; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x026>; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x027>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x028>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x029>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x02a>; +/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x02b>; //same as older arch, handled there*/ +defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">; +defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">; +defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">; +defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">; +defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">; +defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">; +defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">; +defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">; +defm S_AND_NOT0_WREXEC_B32 : 
SOP1_Real_Renamed_gfx11<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">; +defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">; +defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">; +defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">; +defm S_MOVRELS_B32 : SOP1_Real_gfx11<0x040>; +defm S_MOVRELS_B64 : SOP1_Real_gfx11<0x041>; +defm S_MOVRELD_B32 : SOP1_Real_gfx11<0x042>; +defm S_MOVRELD_B64 : SOP1_Real_gfx11<0x043>; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11<0x044>; +defm S_GETPC_B64 : SOP1_Real_gfx11<0x047>; +defm S_SETPC_B64 : SOP1_Real_gfx11<0x048>; +defm S_SWAPPC_B64 : SOP1_Real_gfx11<0x049>; +defm S_RFE_B64 : SOP1_Real_gfx11<0x04a>; +defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11<0x04c>; +defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11<0x04d>; + //===----------------------------------------------------------------------===// // SOP1 - GFX10. //===----------------------------------------------------------------------===// @@ -1473,6 +1587,9 @@ multiclass SOP1_Real_gfx10 op> { Select_gfx10; } +multiclass SOP1_Real_gfx10_gfx11 op> : + SOP1_Real_gfx10, SOP1_Real_gfx11; + defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; @@ -1493,7 +1610,7 @@ defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; //===----------------------------------------------------------------------===// -// SOP1 - GFX6, GFX7. +// SOP1 - GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// @@ -1506,6 +1623,9 @@ multiclass SOP1_Real_gfx6_gfx7 op> { multiclass SOP1_Real_gfx6_gfx7_gfx10 op> : SOP1_Real_gfx6_gfx7, SOP1_Real_gfx10; +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11 op> : + SOP1_Real_gfx6_gfx7, SOP1_Real_gfx10_gfx11; + defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>; @@ -1547,7 +1667,7 @@ defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; @@ -1556,6 +1676,65 @@ defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>; defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>; defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; +//===----------------------------------------------------------------------===// +// SOP2 - GFX11. 
+//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx11 op> { + def _gfx11 : SOP2_Real(NAME)>, + Select_gfx11(NAME).Mnemonic>; +} + +multiclass SOP2_Real_Renamed_gfx11 op, SOP2_Pseudo backing_pseudo, string real_name> { + def _gfx11 : SOP2_Real, + Select_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; +} + +defm S_ABSDIFF_I32 : SOP2_Real_gfx11<0x006>; +defm S_LSHL_B32 : SOP2_Real_gfx11<0x008>; +defm S_LSHL_B64 : SOP2_Real_gfx11<0x009>; +defm S_LSHR_B32 : SOP2_Real_gfx11<0x00a>; +defm S_LSHR_B64 : SOP2_Real_gfx11<0x00b>; +defm S_ASHR_I32 : SOP2_Real_gfx11<0x00c>; +defm S_ASHR_I64 : SOP2_Real_gfx11<0x00d>; +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11<0x00e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11<0x00f>; +defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11<0x010>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11<0x011>; +defm S_MIN_I32 : SOP2_Real_gfx11<0x012>; +defm S_MIN_U32 : SOP2_Real_gfx11<0x013>; +defm S_MAX_I32 : SOP2_Real_gfx11<0x014>; +defm S_MAX_U32 : SOP2_Real_gfx11<0x015>; +defm S_AND_B32 : SOP2_Real_gfx11<0x016>; +defm S_AND_B64 : SOP2_Real_gfx11<0x017>; +defm S_OR_B32 : SOP2_Real_gfx11<0x018>; +defm S_OR_B64 : SOP2_Real_gfx11<0x019>; +defm S_XOR_B32 : SOP2_Real_gfx11<0x01a>; +defm S_XOR_B64 : SOP2_Real_gfx11<0x01b>; +defm S_NAND_B32 : SOP2_Real_gfx11<0x01c>; +defm S_NAND_B64 : SOP2_Real_gfx11<0x01d>; +defm S_NOR_B32 : SOP2_Real_gfx11<0x01e>; +defm S_NOR_B64 : SOP2_Real_gfx11<0x01f>; +defm S_XNOR_B32 : SOP2_Real_gfx11<0x020>; +defm S_XNOR_B64 : SOP2_Real_gfx11<0x021>; +defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x022, S_ANDN2_B32, "s_and_not1_b32">; +defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x023, S_ANDN2_B64, "s_and_not1_b64">; +defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x024, S_ORN2_B32, "s_or_not1_b32">; +defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x025, S_ORN2_B64, "s_or_not1_b64">; +defm S_BFE_U32 : SOP2_Real_gfx11<0x026>; +defm S_BFE_I32 : SOP2_Real_gfx11<0x027>; +defm S_BFE_U64 : SOP2_Real_gfx11<0x028>; +defm S_BFE_I64 : SOP2_Real_gfx11<0x029>; +defm S_BFM_B32 : SOP2_Real_gfx11<0x02a>; +defm S_BFM_B64 : SOP2_Real_gfx11<0x02b>; +defm S_MUL_I32 : SOP2_Real_gfx11<0x02c>; +defm S_MUL_HI_U32 : SOP2_Real_gfx11<0x02d>; +defm S_MUL_HI_I32 : SOP2_Real_gfx11<0x02e>; +defm S_CSELECT_B32 : SOP2_Real_gfx11<0x030>; +defm S_CSELECT_B64 : SOP2_Real_gfx11<0x031>; +defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>; + //===----------------------------------------------------------------------===// // SOP2 - GFX10. 
//===----------------------------------------------------------------------===// @@ -1566,13 +1745,16 @@ multiclass SOP2_Real_gfx10 op> { Select_gfx10; } +multiclass SOP2_Real_gfx10_gfx11 op> : + SOP2_Real_gfx10, SOP2_Real_gfx11; + defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; -defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>; -defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>; -defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11<0x034>; defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; @@ -1589,14 +1771,17 @@ multiclass SOP2_Real_gfx6_gfx7 op> { multiclass SOP2_Real_gfx6_gfx7_gfx10 op> : SOP2_Real_gfx6_gfx7, SOP2_Real_gfx10; +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11 op> : + SOP2_Real_gfx6_gfx7, SOP2_Real_gfx10_gfx11; + defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; -defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>; -defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>; -defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>; -defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>; -defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>; -defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x000>; +defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x001>; +defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x002>; +defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>; defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>; defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>; defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>; @@ -1634,6 +1819,31 @@ defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>; defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>; defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; +//===----------------------------------------------------------------------===// +// SOPK - GFX11. +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx11 op> { + def _gfx11 : SOPK_Real32(NAME)>, + Select_gfx11(NAME).Mnemonic>; +} + +multiclass SOPK_Real64_gfx11 op> { + def _gfx11 : SOPK_Real64(NAME)>, + Select_gfx11(NAME).Mnemonic>; +} + +defm S_GETREG_B32 : SOPK_Real32_gfx11<0x011>; +defm S_SETREG_B32 : SOPK_Real32_gfx11<0x012>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11<0x013>; +defm S_CALL_B64 : SOPK_Real32_gfx11<0x014>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; + //===----------------------------------------------------------------------===// // SOPK - GFX10. 
//===----------------------------------------------------------------------===// @@ -1650,7 +1860,10 @@ multiclass SOPK_Real64_gfx10 op> { Select_gfx10; } -defm S_VERSION : SOPK_Real32_gfx10<0x001>; +multiclass SOPK_Real32_gfx10_gfx11 op> : + SOPK_Real32_gfx10, SOPK_Real32_gfx11; + +defm S_VERSION : SOPK_Real32_gfx10_gfx11<0x001>; defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; @@ -1681,28 +1894,95 @@ multiclass SOPK_Real32_gfx6_gfx7_gfx10 op> : multiclass SOPK_Real64_gfx6_gfx7_gfx10 op> : SOPK_Real64_gfx6_gfx7, SOPK_Real64_gfx10; +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11 op> : + SOPK_Real32_gfx6_gfx7, SOPK_Real32_gfx10_gfx11; + defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; -defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>; -defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>; -defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>; -defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>; -defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>; -defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>; -defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>; -defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>; -defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>; -defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>; -defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>; -defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>; -defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>; -defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>; -defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>; -defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>; +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x002>; +defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>; +defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>; +defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>; +defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x006>; +defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x007>; +defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x008>; +defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x009>; +defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>; +defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>; +defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00f>; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x010>; defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; +//===----------------------------------------------------------------------===// +// SOPP - GFX11 +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx11 op, string real_name = !cast(NAME).Mnemonic # " "> { + def _gfx11 : SOPP_Real_32(NAME), real_name>, + Select_gfx11(NAME).Mnemonic>, + SOPPRelaxTable<0, !cast(NAME).KeyName, "_gfx11">; +} + +multiclass SOPP_Real_64_gfx11 op, string real_name = !cast(NAME).Mnemonic # " "> { + def _gfx11 : SOPP_Real_64(NAME), real_name>, + Select_gfx11(NAME).Mnemonic>, + SOPPRelaxTable<1, !cast(NAME).KeyName, "_gfx11">; +} + +multiclass SOPP_Real_32_Renamed_gfx11 
+multiclass SOPP_Real_32_Renamed_gfx11<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> {
+  def _gfx11 : SOPP_Real_32<op, backing_pseudo, real_name>,
+    Select_gfx11<backing_pseudo.Mnemonic>,
+    MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+}
+
+multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> {
+  defm "" : SOPP_Real_32_gfx11<op>;
+  defm _pad_s_nop : SOPP_Real_64_gfx11<op>;
+}
+
+defm S_SETKILL : SOPP_Real_32_gfx11<0x001>;
+defm S_SETHALT : SOPP_Real_32_gfx11<0x002>;
+defm S_SLEEP : SOPP_Real_32_gfx11<0x003>;
+defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
+defm S_CLAUSE : SOPP_Real_32_gfx11<0x005>;
+defm S_DELAY_ALU : SOPP_Real_32_gfx11<0x007>;
+defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>;
+defm S_WAITCNT : SOPP_Real_32_gfx11<0x009>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx11<0x00a>;
+defm S_WAIT_EVENT : SOPP_Real_32_gfx11<0x00b>;
+defm S_TRAP : SOPP_Real_32_gfx11<0x010>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx11<0x011>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx11<0x012>;
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11<0x020>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11<0x021>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11<0x022>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11<0x023>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11<0x024>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11<0x025>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11<0x026>;
+defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>;
+defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>;
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>;
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>;
+defm S_ENDPGM : SOPP_Real_32_gfx11<0x030, "s_endpgm">;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11<0x031>;
+defm S_WAKEUP : SOPP_Real_32_gfx11<0x034>;
+defm S_SETPRIO : SOPP_Real_32_gfx11<0x035>;
+defm S_SENDMSG : SOPP_Real_32_gfx11<0x036>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx11<0x037>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx11<0x038>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx11<0x039>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx11<0x03a>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11<0x03b>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx11<0x03c>;
+defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>;
+
 //===----------------------------------------------------------------------===//
 // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10
 //===----------------------------------------------------------------------===//
@@ -1737,6 +2017,12 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
 multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
   SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_32_gfx10<op, real_name>;
 
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+  SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>;
+
+multiclass SOPP_Real_32_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+  SOPP_Real_32_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>;
+
 //64 bit encodings, for Relaxation
 multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
@@ -1768,13 +2054,16 @@ multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
 multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
   SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_64_gfx10<op, real_name>;
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+  SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_64_gfx11<op, real_name>;
+
 //relaxation for insts with no operands not implemented
 multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
   defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
   defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
 }
 
-defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>;
+defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<0x000>;
 defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">;
 defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
 defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
@@ -1794,7 +2083,7 @@ defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>;
 defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
 defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
 defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
-defm S_CODE_END : SOPP_Real_32_gfx10<0x01f>;
+defm S_CODE_END : SOPP_Real_32_gfx10_gfx11<0x01f>;
 defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
 defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>;
 defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>;
@@ -1817,6 +2106,34 @@ defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
 defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
 }
 
+//===----------------------------------------------------------------------===//
+// SOPC - GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass SOPC_Real_gfx11<bits<7> op> {
+  def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+    Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_CMP_EQ_I32 : SOPC_Real_gfx11<0x00>;
+defm S_CMP_LG_I32 : SOPC_Real_gfx11<0x01>;
+defm S_CMP_GT_I32 : SOPC_Real_gfx11<0x02>;
+defm S_CMP_GE_I32 : SOPC_Real_gfx11<0x03>;
+defm S_CMP_LT_I32 : SOPC_Real_gfx11<0x04>;
+defm S_CMP_LE_I32 : SOPC_Real_gfx11<0x05>;
+defm S_CMP_EQ_U32 : SOPC_Real_gfx11<0x06>;
+defm S_CMP_LG_U32 : SOPC_Real_gfx11<0x07>;
+defm S_CMP_GT_U32 : SOPC_Real_gfx11<0x08>;
+defm S_CMP_GE_U32 : SOPC_Real_gfx11<0x09>;
+defm S_CMP_LT_U32 : SOPC_Real_gfx11<0x0a>;
+defm S_CMP_LE_U32 : SOPC_Real_gfx11<0x0b>;
+defm S_BITCMP0_B32 : SOPC_Real_gfx11<0x0c>;
+defm S_BITCMP1_B32 : SOPC_Real_gfx11<0x0d>;
+defm S_BITCMP0_B64 : SOPC_Real_gfx11<0x0e>;
+defm S_BITCMP1_B64 : SOPC_Real_gfx11<0x0f>;
+defm S_CMP_EQ_U64 : SOPC_Real_gfx11<0x10>;
+defm S_CMP_LG_U64 : SOPC_Real_gfx11<0x11>;
+
 //===----------------------------------------------------------------------===//
 // SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 18c348d1cf89..c0fd5bc69325 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -6,33 +6,64 @@
 //
 //===----------------------------------------------------------------------===//
 #include "AMDGPUAsmUtils.h"
+#include "AMDGPUBaseInfo.h"
 #include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
-
 namespace llvm {
 namespace AMDGPU {
+
+namespace DepCtr {
+
+// NOLINTBEGIN
+const CustomOperandVal DepCtrInfo[] = {
+  // Name               max dflt offset width  constraint
+  {{"depctr_hold_cnt"},  1,   1,    7,    1,   isGFX10_BEncoding},
+  {{"depctr_sa_sdst"},   1,   1,    0,    1},
+  {{"depctr_va_vdst"},  15,  15,   12,    4},
+  {{"depctr_va_sdst"},   7,   7,    9,    3},
+  {{"depctr_va_ssrc"},   1,   1,    8,    1},
+  {{"depctr_va_vcc"},    1,   1,    1,    1},
+  {{"depctr_vm_vsrc"},   7,   7,    2,    3},
+};
+// NOLINTEND
+
+const int DEP_CTR_SIZE =
+    static_cast<int>(sizeof(DepCtrInfo) / sizeof(CustomOperandVal));
+
+} // namespace DepCtr
+
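
Each DepCtrInfo row above is a shift/width descriptor for one field of the s_waitcnt_depctr immediate. The decode/encode helpers that consume these rows live on CustomOperandVal in AMDGPUAsmUtils.h (shown later in this patch); they are plain bit-field accessors. A minimal self-contained sketch of the round-trip, hard-coding the depctr_vm_vsrc row (max 7, default 7, offset 2, width 3):

#include <cassert>

// Simplified mirror of CustomOperandVal's bit-field helpers; illustration only.
struct FieldDesc {
  unsigned Max;
  unsigned Default;
  unsigned Shift;
  unsigned Width;
  unsigned Mask = (1u << Width) - 1; // derived from Width at initialization

  unsigned decode(unsigned Code) const { return (Code >> Shift) & Mask; }
  unsigned encode(unsigned Val) const { return (Val & Mask) << Shift; }
};

int main() {
  const FieldDesc VmVsrc{7, 7, 2, 3}; // the depctr_vm_vsrc row above
  unsigned Code = VmVsrc.encode(5);   // (5 & 7) << 2 == 0x14
  assert(VmVsrc.decode(Code) == 5);   // round-trips
  return 0;
}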
 namespace SendMsg {
 
-// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char *const IdSymbolic[ID_GAPS_LAST_] = {
-  nullptr,
-  "MSG_INTERRUPT",
-  "MSG_GS",
-  "MSG_GS_DONE",
-  "MSG_SAVEWAVE",
-  "MSG_STALL_WAVE_GEN",
-  "MSG_HALT_WAVES",
-  "MSG_ORDERED_PS_DONE",
-  "MSG_EARLY_PRIM_DEALLOC",
-  "MSG_GS_ALLOC_REQ",
-  "MSG_GET_DOORBELL",
-  "MSG_GET_DDID",
-  nullptr,
-  nullptr,
-  nullptr,
-  "MSG_SYSMSG"
+// Disable lint checking for this block since it makes the table unreadable.
+// NOLINTBEGIN
+const CustomOperand<const MCSubtargetInfo &> Msg[] = {
+  {{""}},
+  {{"MSG_INTERRUPT"}, ID_INTERRUPT},
+  {{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus},
+  {{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus},
+  {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10},
+  {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus},
+  {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus},
+  {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9Plus},
+  {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
+  {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
+  {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10},
+  {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
+  {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
+  {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
+  {{""}},
+  {{"MSG_SYSMSG"}, ID_SYSMSG},
+  {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
+  {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus},
+  {{"MSG_RTN_GET_TMA"}, ID_RTN_GET_TMA, isGFX11Plus},
+  {{"MSG_RTN_GET_REALTIME"}, ID_RTN_GET_REALTIME, isGFX11Plus},
+  {{"MSG_RTN_SAVE_WAVE"}, ID_RTN_SAVE_WAVE, isGFX11Plus},
+  {{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus},
 };
+// NOLINTEND
+
+const int MSG_SIZE = static_cast<int>(
+    sizeof(Msg) / sizeof(CustomOperand<const MCSubtargetInfo &>));
 
 // These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
 const char *const OpSysSymbolic[OP_SYS_LAST_] = {
@@ -54,39 +85,54 @@ const char *const OpGsSymbolic[OP_GS_LAST_] = {
 
 namespace Hwreg {
 
-// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
-const char* const IdSymbolic[] = {
-  nullptr,
-  "HW_REG_MODE",
-  "HW_REG_STATUS",
-  "HW_REG_TRAPSTS",
-  "HW_REG_HW_ID",
-  "HW_REG_GPR_ALLOC",
-  "HW_REG_LDS_ALLOC",
-  "HW_REG_IB_STS",
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
-  "HW_REG_SH_MEM_BASES",
-  "HW_REG_TBA_LO",
-  "HW_REG_TBA_HI",
-  "HW_REG_TMA_LO",
-  "HW_REG_TMA_HI",
-  "HW_REG_FLAT_SCR_LO",
-  "HW_REG_FLAT_SCR_HI",
-  "HW_REG_XNACK_MASK",
-  "HW_REG_HW_ID1",
-  "HW_REG_HW_ID2",
-  "HW_REG_POPS_PACKER",
-  nullptr,
-  nullptr,
-  nullptr,
-  "HW_REG_SHADER_CYCLES"
+// Disable lint checking for this block since it makes the table unreadable.
+// NOLINTBEGIN +const CustomOperand Opr[] = { + {{""}}, + {{"HW_REG_MODE"}, ID_MODE}, + {{"HW_REG_STATUS"}, ID_STATUS}, + {{"HW_REG_TRAPSTS"}, ID_TRAPSTS}, + {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus}, + {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC}, + {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC}, + {{"HW_REG_IB_STS"}, ID_IB_STS}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9Plus}, + {{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10}, + {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10}, + {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10}, + {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10}, + {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10Plus}, + {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10Plus}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030}, + {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus}, + {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, + {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{""}}, + {{""}}, + {{""}}, + {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding}, + + // GFX940 specific registers + {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, + + // Aliases + {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, }; +// NOLINTEND + +const int OPR_SIZE = static_cast( + sizeof(Opr) / sizeof(CustomOperand)); } // namespace Hwreg @@ -144,7 +190,7 @@ StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9 "BUF_NUM_FORMAT_FLOAT" }; -StringLiteral const UfmtSymbolic[] = { +StringLiteral const UfmtSymbolicGFX10[] = { "BUF_FMT_INVALID", "BUF_FMT_8_UNORM", @@ -238,7 +284,7 @@ StringLiteral const UfmtSymbolic[] = { "BUF_FMT_32_32_32_32_FLOAT" }; -unsigned const DfmtNfmt2UFmt[] = { +unsigned const DfmtNfmt2UFmtGFX10[] = { DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), @@ -332,6 +378,166 @@ unsigned const DfmtNfmt2UFmt[] = { DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) }; +StringLiteral const UfmtSymbolicGFX11[] = { + "BUF_FMT_INVALID", + + "BUF_FMT_8_UNORM", + "BUF_FMT_8_SNORM", + "BUF_FMT_8_USCALED", + "BUF_FMT_8_SSCALED", + "BUF_FMT_8_UINT", + "BUF_FMT_8_SINT", + + "BUF_FMT_16_UNORM", + "BUF_FMT_16_SNORM", + "BUF_FMT_16_USCALED", + "BUF_FMT_16_SSCALED", + "BUF_FMT_16_UINT", + "BUF_FMT_16_SINT", + "BUF_FMT_16_FLOAT", + + "BUF_FMT_8_8_UNORM", + "BUF_FMT_8_8_SNORM", + "BUF_FMT_8_8_USCALED", + "BUF_FMT_8_8_SSCALED", + "BUF_FMT_8_8_UINT", + "BUF_FMT_8_8_SINT", + + "BUF_FMT_32_UINT", + "BUF_FMT_32_SINT", + "BUF_FMT_32_FLOAT", + + "BUF_FMT_16_16_UNORM", + "BUF_FMT_16_16_SNORM", + "BUF_FMT_16_16_USCALED", + "BUF_FMT_16_16_SSCALED", + "BUF_FMT_16_16_UINT", + "BUF_FMT_16_16_SINT", + "BUF_FMT_16_16_FLOAT", + + "BUF_FMT_10_11_11_FLOAT", + + "BUF_FMT_11_11_10_FLOAT", + + "BUF_FMT_10_10_10_2_UNORM", + "BUF_FMT_10_10_10_2_SNORM", + "BUF_FMT_10_10_10_2_UINT", + "BUF_FMT_10_10_10_2_SINT", + + "BUF_FMT_2_10_10_10_UNORM", + "BUF_FMT_2_10_10_10_SNORM", + "BUF_FMT_2_10_10_10_USCALED", + "BUF_FMT_2_10_10_10_SSCALED", + "BUF_FMT_2_10_10_10_UINT", + "BUF_FMT_2_10_10_10_SINT", + + "BUF_FMT_8_8_8_8_UNORM", + "BUF_FMT_8_8_8_8_SNORM", + "BUF_FMT_8_8_8_8_USCALED", + "BUF_FMT_8_8_8_8_SSCALED", + "BUF_FMT_8_8_8_8_UINT", + "BUF_FMT_8_8_8_8_SINT", + + "BUF_FMT_32_32_UINT", + "BUF_FMT_32_32_SINT", + "BUF_FMT_32_32_FLOAT", + + 
"BUF_FMT_16_16_16_16_UNORM", + "BUF_FMT_16_16_16_16_SNORM", + "BUF_FMT_16_16_16_16_USCALED", + "BUF_FMT_16_16_16_16_SSCALED", + "BUF_FMT_16_16_16_16_UINT", + "BUF_FMT_16_16_16_16_SINT", + "BUF_FMT_16_16_16_16_FLOAT", + + "BUF_FMT_32_32_32_UINT", + "BUF_FMT_32_32_32_SINT", + "BUF_FMT_32_32_32_FLOAT", + "BUF_FMT_32_32_32_32_UINT", + "BUF_FMT_32_32_32_32_SINT", + "BUF_FMT_32_32_32_32_FLOAT" +}; + +unsigned const DfmtNfmt2UFmtGFX11[] = { + DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), + + DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_2_10_10_10 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) +}; + } // namespace MTBUFFormat namespace Swizzle { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index d1deb570a938..054e35e90f2f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -11,15 +11,60 @@
 
 #include "SIDefines.h"
 
+#include "llvm/ADT/StringRef.h"
+
 namespace llvm {
 
 class StringLiteral;
+class MCSubtargetInfo;
 
 namespace AMDGPU {
 
+const int OPR_ID_UNKNOWN = -1;
+const int OPR_ID_UNSUPPORTED = -2;
+const int OPR_ID_DUPLICATE = -3;
+const int OPR_VAL_INVALID = -4;
+
+template <class T> struct CustomOperand {
+  StringLiteral Name;
+  int Encoding = 0;
+  bool (*Cond)(T Context) = nullptr;
+};
+
+struct CustomOperandVal {
+  StringLiteral Name;
+  unsigned Max;
+  unsigned Default;
+  unsigned Shift;
+  unsigned Width;
+  bool (*Cond)(const MCSubtargetInfo &STI) = nullptr;
+  unsigned Mask = (1 << Width) - 1;
+
+  unsigned decode(unsigned Code) const { return (Code >> Shift) & Mask; }
+
+  unsigned encode(unsigned Val) const { return (Val & Mask) << Shift; }
+
+  unsigned getMask() const { return Mask << Shift; }
+
+  bool isValid(unsigned Val) const { return Val <= Max; }
+
+  bool isSupported(const MCSubtargetInfo &STI) const {
+    return !Cond || Cond(STI);
+  }
+};
+
+namespace DepCtr {
+
+extern const CustomOperandVal DepCtrInfo[];
+extern const int DEP_CTR_SIZE;
+
+} // namespace DepCtr
+
 namespace SendMsg {
 
 // Symbolic names for the sendmsg(...) syntax.
-extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const CustomOperand<const MCSubtargetInfo &> Msg[];
+extern const int MSG_SIZE;
 
 extern const char *const OpSysSymbolic[OP_SYS_LAST_];
 extern const char *const OpGsSymbolic[OP_GS_LAST_];
@@ -27,7 +72,8 @@ extern const char *const OpGsSymbolic[OP_GS_LAST_];
 namespace Hwreg {
 
 // Symbolic names for the hwreg(...) syntax.
-extern const char* const IdSymbolic[];
+extern const CustomOperand<const MCSubtargetInfo &> Opr[];
+extern const int OPR_SIZE;
 
 } // namespace Hwreg
 
@@ -37,8 +83,10 @@ extern StringLiteral const DfmtSymbolic[];
 extern StringLiteral const NfmtSymbolicGFX10[];
 extern StringLiteral const NfmtSymbolicSICI[];
 extern StringLiteral const NfmtSymbolicVI[];
-extern StringLiteral const UfmtSymbolic[];
-extern unsigned const DfmtNfmt2UFmt[];
+extern StringLiteral const UfmtSymbolicGFX10[];
+extern StringLiteral const UfmtSymbolicGFX11[];
+extern unsigned const DfmtNfmt2UFmtGFX10[];
+extern unsigned const DfmtNfmt2UFmtGFX11[];
 
 } // namespace MTBUFFormat
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 683be871ff82..e4ab72f1095b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -28,10 +28,15 @@
 #define GET_INSTRMAP_INFO
 #include "AMDGPUGenInstrInfo.inc"
 
-static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
-    "amdhsa-code-object-version", llvm::cl::Hidden,
-    llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
-    llvm::cl::ZeroOrMore);
+static llvm::cl::opt<unsigned>
+    AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden,
+                            llvm::cl::desc("AMDHSA Code Object Version"),
+                            llvm::cl::init(4));
+
+// TODO-GFX11: Remove this when full 16-bit codegen is implemented.
+static llvm::cl::opt<bool>
+    LimitTo128VGPRs("amdgpu-limit-to-128-vgprs", llvm::cl::Hidden,
+                    llvm::cl::desc("Never use more than 128 VGPRs"));
 
 namespace {
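
The hunk that follows rewrites packBits into a single mask-and-merge expression. A standalone re-statement of the new form with a worked value (this mirrors the patched helpers under stated assumptions; it is not the LLVM code itself):

#include <cassert>

static unsigned getBitMask(unsigned Shift, unsigned Width) {
  return ((1u << Width) - 1) << Shift;
}

// Insert Src into the Width-bit field of Dst starting at bit Shift.
static unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift,
                         unsigned Width) {
  unsigned Mask = getBitMask(Shift, Width);
  return ((Src << Shift) & Mask) | (Dst & ~Mask);
}

static unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
  return (Src >> Shift) & ((1u << Width) - 1);
}

int main() {
  // Write 5 into bits [6:4] of 0xFFFF: the field 111 becomes 101.
  unsigned Packed = packBits(5, 0xFFFF, 4, 3);
  assert(Packed == 0xFFDF);
  assert(unpackBits(Packed, 4, 3) == 5);
  return 0;
}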
@@ -44,9 +49,8 @@ unsigned getBitMask(unsigned Shift, unsigned Width) {
 ///
 /// \returns Packed \p Dst.
 unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
-  Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width);
-  Dst |= (Src << Shift) & getBitMask(Shift, Width);
-  return Dst;
+  unsigned Mask = getBitMask(Shift, Width);
+  return ((Src << Shift) & Mask) | (Dst & ~Mask);
 }
 
 /// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
@@ -57,30 +61,40 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
 }
 
 /// \returns Vmcnt bit shift (lower bits).
-unsigned getVmcntBitShiftLo() { return 0; }
+unsigned getVmcntBitShiftLo(unsigned VersionMajor) {
+  return VersionMajor >= 11 ? 10 : 0;
+}
 
 /// \returns Vmcnt bit width (lower bits).
-unsigned getVmcntBitWidthLo() { return 4; }
+unsigned getVmcntBitWidthLo(unsigned VersionMajor) {
+  return VersionMajor >= 11 ? 6 : 4;
+}
 
 /// \returns Expcnt bit shift.
-unsigned getExpcntBitShift() { return 4; }
+unsigned getExpcntBitShift(unsigned VersionMajor) {
+  return VersionMajor >= 11 ? 0 : 4;
+}
 
 /// \returns Expcnt bit width.
-unsigned getExpcntBitWidth() { return 3; }
+unsigned getExpcntBitWidth(unsigned VersionMajor) { return 3; }
 
 /// \returns Lgkmcnt bit shift.
-unsigned getLgkmcntBitShift() { return 8; }
+unsigned getLgkmcntBitShift(unsigned VersionMajor) {
+  return VersionMajor >= 11 ? 4 : 8;
+}
 
 /// \returns Lgkmcnt bit width.
 unsigned getLgkmcntBitWidth(unsigned VersionMajor) {
-  return (VersionMajor >= 10) ? 6 : 4;
+  return VersionMajor >= 10 ? 6 : 4;
 }
 
 /// \returns Vmcnt bit shift (higher bits).
-unsigned getVmcntBitShiftHi() { return 14; }
+unsigned getVmcntBitShiftHi(unsigned VersionMajor) { return 14; }
 
 /// \returns Vmcnt bit width (higher bits).
-unsigned getVmcntBitWidthHi() { return 2; }
+unsigned getVmcntBitWidthHi(unsigned VersionMajor) {
+  return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0;
+}
 
 } // end namespace anonymous
 
@@ -136,6 +150,41 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
          isHsaAbiVersion5(STI);
 }
 
+unsigned getAmdhsaCodeObjectVersion() {
+  return AmdhsaCodeObjectVersion;
+}
+
+unsigned getMultigridSyncArgImplicitArgPosition() {
+  switch (AmdhsaCodeObjectVersion) {
+  case 2:
+  case 3:
+  case 4:
+    return 48;
+  case 5:
+    return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET;
+  default:
+    llvm_unreachable("Unexpected code object version");
+    return 0;
+  }
+}
+
+
+// FIXME: All such magic numbers about the ABI should be in a
+// central TD file.
+unsigned getHostcallImplicitArgPosition() { + switch (AmdhsaCodeObjectVersion) { + case 2: + case 3: + case 4: + return 24; + case 5: + return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET; + default: + llvm_unreachable("Unexpected code object version"); + return 0; + } +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL @@ -144,6 +193,7 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) { #define GET_MIMGBiasMappingTable_IMPL #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL +#define GET_MAIInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -223,6 +273,10 @@ struct VOPInfo { bool IsSingle; }; +struct VOPC64DPPInfo { + uint16_t Opcode; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -235,6 +289,14 @@ struct VOPInfo { #define GET_VOP2InfoTable_IMPL #define GET_VOP3InfoTable_DECL #define GET_VOP3InfoTable_IMPL +#define GET_VOPC64DPPTable_DECL +#define GET_VOPC64DPPTable_IMPL +#define GET_VOPC64DPP8Table_DECL +#define GET_VOPC64DPP8Table_IMPL +#define GET_WMMAOpcode2AddrMappingTable_DECL +#define GET_WMMAOpcode2AddrMappingTable_IMPL +#define GET_WMMAOpcode3AddrMappingTable_DECL +#define GET_WMMAOpcode3AddrMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -322,6 +384,30 @@ bool getVOP3IsSingle(unsigned Opc) { return Info ? Info->IsSingle : false; } +bool isVOPC64DPP(unsigned Opc) { + return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc); +} + +bool getMAIIsDGEMM(unsigned Opc) { + const MAIInstInfo *Info = getMAIInstInfoHelper(Opc); + return Info ? Info->is_dgemm : false; +} + +bool getMAIIsGFX940XDL(unsigned Opc) { + const MAIInstInfo *Info = getMAIInstInfoHelper(Opc); + return Info ? Info->is_gfx940_xdl : false; +} + +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); + return Info ? Info->Opcode3Addr : ~0u; +} + +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc); + return Info ? Info->Opcode2Addr : ~0u; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. @@ -740,6 +826,15 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { + if (LimitTo128VGPRs.getNumOccurrences() ? LimitTo128VGPRs + : isGFX11Plus(*STI)) { + // GFX11 changes the encoding of 16-bit operands in VOP1/2/C instructions + // such that values 128..255 no longer mean v128..v255, they mean + // v0.hi..v127.hi instead. Until the compiler understands this, it is not + // safe to use v128..v255. + // TODO-GFX11: Remove this when full 16-bit codegen is implemented. 
+ return 128; + } if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 512; return 256; @@ -904,16 +999,13 @@ std::pair getIntegerPairAttribute(const Function &F, } unsigned getVmcntBitMask(const IsaVersion &Version) { - unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; - if (Version.Major < 9) - return VmcntLo; - - unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo(); - return VmcntLo | VmcntHi; + return (1 << (getVmcntBitWidthLo(Version.Major) + + getVmcntBitWidthHi(Version.Major))) - + 1; } unsigned getExpcntBitMask(const IsaVersion &Version) { - return (1 << getExpcntBitWidth()) - 1; + return (1 << getExpcntBitWidth(Version.Major)) - 1; } unsigned getLgkmcntBitMask(const IsaVersion &Version) { @@ -921,36 +1013,32 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) { } unsigned getWaitcntBitMask(const IsaVersion &Version) { - unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); - unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), + unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + unsigned Expcnt = getBitMask(getExpcntBitShift(Version.Major), + getExpcntBitWidth(Version.Major)); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(Version.Major), getLgkmcntBitWidth(Version.Major)); - unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; - if (Version.Major < 9) - return Waitcnt; - - unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi()); - return Waitcnt | VmcntHi; + unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(Version.Major), + getVmcntBitWidthHi(Version.Major)); + return VmcntLo | Expcnt | Lgkmcnt | VmcntHi; } unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) { - unsigned VmcntLo = - unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); - if (Version.Major < 9) - return VmcntLo; - - unsigned VmcntHi = - unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); - VmcntHi <<= getVmcntBitWidthLo(); - return VmcntLo | VmcntHi; + unsigned VmcntLo = unpackBits(Waitcnt, getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + unsigned VmcntHi = unpackBits(Waitcnt, getVmcntBitShiftHi(Version.Major), + getVmcntBitWidthHi(Version.Major)); + return VmcntLo | VmcntHi << getVmcntBitWidthLo(Version.Major); } unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); + return unpackBits(Waitcnt, getExpcntBitShift(Version.Major), + getExpcntBitWidth(Version.Major)); } unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getLgkmcntBitShift(), + return unpackBits(Waitcnt, getLgkmcntBitShift(Version.Major), getLgkmcntBitWidth(Version.Major)); } @@ -971,24 +1059,23 @@ Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { - Waitcnt = - packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); - if (Version.Major < 9) - return Waitcnt; - - Vmcnt >>= getVmcntBitWidthLo(); - return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); + Waitcnt = packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + return packBits(Vmcnt >> getVmcntBitWidthLo(Version.Major), Waitcnt, + getVmcntBitShiftHi(Version.Major), + getVmcntBitWidthHi(Version.Major)); } unsigned 
encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Expcnt) {
-  return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+  return packBits(Expcnt, Waitcnt, getExpcntBitShift(Version.Major),
+                  getExpcntBitWidth(Version.Major));
 }
 
 unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                        unsigned Lgkmcnt) {
-  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(),
-                  getLgkmcntBitWidth(Version.Major));
+  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(Version.Major),
+                  getLgkmcntBitWidth(Version.Major));
 }
 
 unsigned encodeWaitcnt(const IsaVersion &Version,
@@ -1005,43 +1092,184 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
 }
 
 //===----------------------------------------------------------------------===//
-// hwreg
+// Custom Operands.
+//
+// A table of custom operands shall describe "primary" operand names
+// first followed by aliases if any. It is not required but recommended
+// to arrange operands so that operand encoding match operand position
+// in the table. This will make disassembly a bit more efficient.
+// Unused slots in the table shall have an empty name.
+//
 //===----------------------------------------------------------------------===//
 
-namespace Hwreg {
-
-int64_t getHwregId(const StringRef Name) {
-  for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
-    if (IdSymbolic[Id] && Name == IdSymbolic[Id])
-      return Id;
+template <class T>
+static bool isValidOpr(int Idx, const CustomOperand<T> OpInfo[], int OpInfoSize,
+                       T Context) {
+  return 0 <= Idx && Idx < OpInfoSize && !OpInfo[Idx].Name.empty() &&
+         (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context));
+}
+
+template <class T>
+static int getOprIdx(std::function<bool(const CustomOperand<T> &)> Test,
+                     const CustomOperand<T> OpInfo[], int OpInfoSize,
+                     T Context) {
+  int InvalidIdx = OPR_ID_UNKNOWN;
+  for (int Idx = 0; Idx < OpInfoSize; ++Idx) {
+    if (Test(OpInfo[Idx])) {
+      if (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context))
+        return Idx;
+      InvalidIdx = OPR_ID_UNSUPPORTED;
+    }
   }
-  return ID_UNKNOWN_;
+  return InvalidIdx;
+}
+
+template <class T>
+static int getOprIdx(const StringRef Name, const CustomOperand<T> OpInfo[],
+                     int OpInfoSize, T Context) {
+  auto Test = [=](const CustomOperand<T> &Op) { return Op.Name == Name; };
+  return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
+}
+
+template <class T>
+static int getOprIdx(int Id, const CustomOperand<T> OpInfo[], int OpInfoSize,
+                     T Context, bool QuickCheck = true) {
+  auto Test = [=](const CustomOperand<T> &Op) {
+    return Op.Encoding == Id && !Op.Name.empty();
+  };
+  // This is an optimization that should work in most cases.
+  // As a side effect, it may cause selection of an alias
+  // instead of a primary operand name in case of sparse tables.
+  if (QuickCheck && isValidOpr<T>(Id, OpInfo, OpInfoSize, Context) &&
+      OpInfo[Id].Encoding == Id) {
+    return Id;
+  }
+  return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
+}
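
The getOprIdx overloads above implement the lookup discipline described in the "Custom Operands" comment: a linear scan in which the first supported match wins, OPR_ID_UNSUPPORTED records "the name exists but the subtarget rejects it", and OPR_ID_UNKNOWN means no row matched at all. A self-contained sketch of the same discipline over a toy table (names and the bool context are invented for illustration):

#include <cassert>
#include <string>

const int OPR_ID_UNKNOWN = -1;
const int OPR_ID_UNSUPPORTED = -2;

struct ToyOperand {
  const char *Name;
  int Encoding;
  bool (*Cond)(bool Ctx); // nullptr means "always supported"
};

static bool onlyNewTargets(bool IsNew) { return IsNew; }

// Primary names first; an alias for encoding 1 sits at the end.
static const ToyOperand Table[] = {
    {"reg_a", 0, nullptr},
    {"reg_b", 1, onlyNewTargets},
    {"reg_b_alias", 1, nullptr},
};

static int getOprIdx(const std::string &Name, bool Ctx) {
  int Invalid = OPR_ID_UNKNOWN;
  for (int Idx = 0; Idx < 3; ++Idx) {
    if (Name == Table[Idx].Name) {
      if (!Table[Idx].Cond || Table[Idx].Cond(Ctx))
        return Idx;                 // first supported match wins
      Invalid = OPR_ID_UNSUPPORTED; // known name, wrong subtarget
    }
  }
  return Invalid;
}

int main() {
  assert(getOprIdx("reg_a", false) == 0);
  assert(getOprIdx("reg_b", false) == OPR_ID_UNSUPPORTED);
  assert(getOprIdx("reg_b", true) == 1);
  assert(getOprIdx("missing", true) == OPR_ID_UNKNOWN);
  return 0;
}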
 
-static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
-  if (isSI(STI) || isCI(STI) || isVI(STI))
-    return ID_SYMBOLIC_FIRST_GFX9_;
-  else if (isGFX9(STI))
-    return ID_SYMBOLIC_FIRST_GFX10_;
-  else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
-    return ID_SYMBOLIC_FIRST_GFX1030_;
-  else
-    return ID_SYMBOLIC_LAST_;
+//===----------------------------------------------------------------------===//
+// Custom Operand Values
+//===----------------------------------------------------------------------===//
+
+static unsigned getDefaultCustomOperandEncoding(const CustomOperandVal *Opr,
+                                                int Size,
+                                                const MCSubtargetInfo &STI) {
+  unsigned Enc = 0;
+  for (int Idx = 0; Idx < Size; ++Idx) {
+    const auto &Op = Opr[Idx];
+    if (Op.isSupported(STI))
+      Enc |= Op.encode(Op.Default);
+  }
+  return Enc;
+}
+
+static bool isSymbolicCustomOperandEncoding(const CustomOperandVal *Opr,
+                                            int Size, unsigned Code,
+                                            bool &HasNonDefaultVal,
+                                            const MCSubtargetInfo &STI) {
+  unsigned UsedOprMask = 0;
+  HasNonDefaultVal = false;
+  for (int Idx = 0; Idx < Size; ++Idx) {
+    const auto &Op = Opr[Idx];
+    if (!Op.isSupported(STI))
+      continue;
+    UsedOprMask |= Op.getMask();
+    unsigned Val = Op.decode(Code);
+    if (!Op.isValid(Val))
+      return false;
+    HasNonDefaultVal |= (Val != Op.Default);
+  }
+  return (Code & ~UsedOprMask) == 0;
+}
+
+static bool decodeCustomOperand(const CustomOperandVal *Opr, int Size,
+                                unsigned Code, int &Idx, StringRef &Name,
+                                unsigned &Val, bool &IsDefault,
+                                const MCSubtargetInfo &STI) {
+  while (Idx < Size) {
+    const auto &Op = Opr[Idx++];
+    if (Op.isSupported(STI)) {
+      Name = Op.Name;
+      Val = Op.decode(Code);
+      IsDefault = (Val == Op.Default);
+      return true;
+    }
+  }
+
+  return false;
 }
 
-bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
-  switch (Id) {
-  case ID_HW_ID:
-    return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
-  case ID_HW_ID1:
-  case ID_HW_ID2:
-    return isGFX10Plus(STI);
-  case ID_XNACK_MASK:
-    return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
-  default:
-    return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
-           IdSymbolic[Id];
+static int encodeCustomOperandVal(const CustomOperandVal &Op,
+                                  int64_t InputVal) {
+  if (InputVal < 0 || InputVal > Op.Max)
+    return OPR_VAL_INVALID;
+  return Op.encode(InputVal);
+}
+
+static int encodeCustomOperand(const CustomOperandVal *Opr, int Size,
+                               const StringRef Name, int64_t InputVal,
+                               unsigned &UsedOprMask,
+                               const MCSubtargetInfo &STI) {
+  int InvalidId = OPR_ID_UNKNOWN;
+  for (int Idx = 0; Idx < Size; ++Idx) {
+    const auto &Op = Opr[Idx];
+    if (Op.Name == Name) {
+      if (!Op.isSupported(STI)) {
+        InvalidId = OPR_ID_UNSUPPORTED;
+        continue;
+      }
+      auto OprMask = Op.getMask();
+      if (OprMask & UsedOprMask)
+        return OPR_ID_DUPLICATE;
+      UsedOprMask |= OprMask;
+      return encodeCustomOperandVal(Op, InputVal);
+    }
   }
+  return InvalidId;
+}
+
+//===----------------------------------------------------------------------===//
+// DepCtr
+//===----------------------------------------------------------------------===//
+
+namespace DepCtr {
+
+int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI) {
+  static int Default = -1;
+  if (Default == -1)
+    Default = getDefaultCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, STI);
+  return Default;
+}
+
+bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
+                              const MCSubtargetInfo &STI) {
+ return isSymbolicCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, Code, + HasNonDefaultVal, STI); +} + +bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, + bool &IsDefault, const MCSubtargetInfo &STI) { + return decodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Code, Id, Name, Val, + IsDefault, STI); +} + +int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, + const MCSubtargetInfo &STI) { + return encodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Name, Val, UsedOprMask, + STI); +} + +} // namespace DepCtr + +//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// + +namespace Hwreg { + +int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) { + int Idx = getOprIdx(Name, Opr, OPR_SIZE, STI); + return (Idx < 0) ? Idx : Opr[Idx].Encoding; } bool isValidHwreg(int64_t Id) { @@ -1063,7 +1291,8 @@ uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { } StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { - return isValidHwreg(Id, STI) ? IdSymbolic[Id] : ""; + int Idx = getOprIdx(Id, Opr, OPR_SIZE, STI); + return (Idx < 0) ? "" : Opr[Idx].Name; } void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { @@ -1087,12 +1316,13 @@ struct ExpTgt { }; static constexpr ExpTgt ExpTgtInfo[] = { - {{"null"}, ET_NULL, ET_NULL_MAX_IDX}, - {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX}, - {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX}, - {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX}, - {{"pos"}, ET_POS0, ET_POS_MAX_IDX}, - {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX}, + {{"null"}, ET_NULL, ET_NULL_MAX_IDX}, + {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX}, + {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX}, + {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX}, + {{"pos"}, ET_POS0, ET_POS_MAX_IDX}, + {{"dual_src_blend"}, ET_DUAL_SRC_BLEND0, ET_DUAL_SRC_BLEND_MAX_IDX}, + {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX}, }; bool getTgtName(unsigned Id, StringRef &Name, int &Index) { @@ -1130,7 +1360,20 @@ unsigned getTgtId(const StringRef Name) { } bool isSupportedTgtId(unsigned Id, const MCSubtargetInfo &STI) { - return (Id != ET_POS4 && Id != ET_PRIM) || isGFX10Plus(STI); + switch (Id) { + case ET_NULL: + return !isGFX11Plus(STI); + case ET_POS4: + case ET_PRIM: + return isGFX10Plus(STI); + case ET_DUAL_SRC_BLEND0: + case ET_DUAL_SRC_BLEND1: + return isGFX11Plus(STI); + default: + if (Id >= ET_PARAM0 && Id <= ET_PARAM31) + return !isGFX11Plus(STI); + return true; + } } } // namespace Exp @@ -1196,27 +1439,44 @@ void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) { Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK; } -int64_t getUnifiedFormat(const StringRef Name) { - for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { - if (Name == UfmtSymbolic[Id]) - return Id; +int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI) { + if (isGFX11Plus(STI)) { + for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) { + if (Name == UfmtSymbolicGFX11[Id]) + return Id; + } + } else { + for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) { + if (Name == UfmtSymbolicGFX10[Id]) + return Id; + } } return UFMT_UNDEF; } -StringRef getUnifiedFormatName(unsigned Id) { - return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : ""; +StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI) { + if(isValidUnifiedFormat(Id, STI)) + return isGFX10(STI) ? 
UfmtSymbolicGFX10[Id] : UfmtSymbolicGFX11[Id];
+  return "";
 }
 
-bool isValidUnifiedFormat(unsigned Id) {
-  return Id <= UFMT_LAST;
+bool isValidUnifiedFormat(unsigned Id, const MCSubtargetInfo &STI) {
+  return isGFX10(STI) ? Id <= UfmtGFX10::UFMT_LAST : Id <= UfmtGFX11::UFMT_LAST;
 }
 
-int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) {
+int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt,
+                             const MCSubtargetInfo &STI) {
   int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt);
-  for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) {
-    if (Fmt == DfmtNfmt2UFmt[Id])
-      return Id;
+  if (isGFX11Plus(STI)) {
+    for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) {
+      if (Fmt == DfmtNfmt2UFmtGFX11[Id])
+        return Id;
+    }
+  } else {
+    for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) {
+      if (Fmt == DfmtNfmt2UFmtGFX10[Id])
+        return Id;
+    }
   }
   return UFMT_UNDEF;
 }
@@ -1239,40 +1499,22 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) {
 
 namespace SendMsg {
 
-int64_t getMsgId(const StringRef Name) {
-  for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
-    if (IdSymbolic[i] && Name == IdSymbolic[i])
-      return i;
-  }
-  return ID_UNKNOWN_;
+static uint64_t getMsgIdMask(const MCSubtargetInfo &STI) {
+  return isGFX11Plus(STI) ? ID_MASK_GFX11Plus_ : ID_MASK_PreGFX11_;
 }
 
-bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
-  if (Strict) {
-    switch (MsgId) {
-    case ID_SAVEWAVE:
-      return isVI(STI) || isGFX9Plus(STI);
-    case ID_STALL_WAVE_GEN:
-    case ID_HALT_WAVES:
-    case ID_ORDERED_PS_DONE:
-    case ID_GS_ALLOC_REQ:
-    case ID_GET_DOORBELL:
-      return isGFX9Plus(STI);
-    case ID_EARLY_PRIM_DEALLOC:
-      return isGFX9(STI);
-    case ID_GET_DDID:
-      return isGFX10Plus(STI);
-    default:
-      return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
-    }
-  } else {
-    return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
-  }
+int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI) {
+  int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Msg, MSG_SIZE, STI);
+  return (Idx < 0) ? Idx : Msg[Idx].Encoding;
 }
 
-StringRef getMsgName(int64_t MsgId) {
-  assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
-  return IdSymbolic[MsgId];
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI) {
+  return (MsgId & ~(getMsgIdMask(STI))) == 0;
+}
+
+StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI) {
+  int Idx = getOprIdx<const MCSubtargetInfo &>(MsgId, Msg, MSG_SIZE, STI);
+  return (Idx < 0) ? "" : Msg[Idx].Name;
+}
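
With the table-driven scheme, getMsgId and getMsgName are inverse lookups over Msg[]: by name, returning the matching row's Encoding, and by encoding, returning the row's Name, both filtered through the row's subtarget predicate. Expected behavior, sketched from the table and predicates above (illustrative, not tool output):

//   getMsgId("MSG_GS", STI)          -> ID_GS_PreGFX11 on pre-GFX11 targets,
//                                       OPR_ID_UNSUPPORTED (-2) on GFX11+
//   getMsgId("MSG_RTN_GET_TBA", STI) -> ID_RTN_GET_TBA on GFX11+,
//                                       OPR_ID_UNSUPPORTED (-2) otherwise
//   getMsgName(getMsgId("MSG_SYSMSG", STI), STI) -> "MSG_SYSMSG"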
"" : Msg[Idx].Name; } int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { @@ -1289,26 +1531,27 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, bool Strict) { - assert(isValidMsgId(MsgId, STI, Strict)); + assert(isValidMsgId(MsgId, STI)); if (!Strict) return 0 <= OpId && isUInt(OpId); - switch(MsgId) - { - case ID_GS: - return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; - case ID_GS_DONE: - return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; - case ID_SYSMSG: + if (MsgId == ID_SYSMSG) return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_; - default: - return OpId == OP_NONE_; + if (!isGFX11Plus(STI)) { + switch (MsgId) { + case ID_GS_PreGFX11: + return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; + case ID_GS_DONE_PreGFX11: + return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; + } } + return OpId == OP_NONE_; } -StringRef getMsgOpName(int64_t MsgId, int64_t OpId) { - assert(msgRequiresOp(MsgId)); +StringRef getMsgOpName(int64_t MsgId, int64_t OpId, + const MCSubtargetInfo &STI) { + assert(msgRequiresOp(MsgId, STI)); return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId]; } @@ -1319,42 +1562,48 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, if (!Strict) return 0 <= StreamId && isUInt(StreamId); - switch(MsgId) - { - case ID_GS: - return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; - case ID_GS_DONE: - return (OpId == OP_GS_NOP)? - (StreamId == STREAM_ID_NONE_) : - (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); - default: - return StreamId == STREAM_ID_NONE_; + if (!isGFX11Plus(STI)) { + switch (MsgId) { + case ID_GS_PreGFX11: + return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; + case ID_GS_DONE_PreGFX11: + return (OpId == OP_GS_NOP) ? 
+ (StreamId == STREAM_ID_NONE_) : + (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); + } } + return StreamId == STREAM_ID_NONE_; } -bool msgRequiresOp(int64_t MsgId) { - return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG; +bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI) { + return MsgId == ID_SYSMSG || + (!isGFX11Plus(STI) && + (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11)); } -bool msgSupportsStream(int64_t MsgId, int64_t OpId) { - return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP; +bool msgSupportsStream(int64_t MsgId, int64_t OpId, + const MCSubtargetInfo &STI) { + return !isGFX11Plus(STI) && + (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11) && + OpId != OP_GS_NOP; } -void decodeMsg(unsigned Val, - uint16_t &MsgId, - uint16_t &OpId, - uint16_t &StreamId) { - MsgId = Val & ID_MASK_; - OpId = (Val & OP_MASK_) >> OP_SHIFT_; - StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; +void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId, + uint16_t &StreamId, const MCSubtargetInfo &STI) { + MsgId = Val & getMsgIdMask(STI); + if (isGFX11Plus(STI)) { + OpId = 0; + StreamId = 0; + } else { + OpId = (Val & OP_MASK_) >> OP_SHIFT_; + StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; + } } uint64_t encodeMsg(uint64_t MsgId, uint64_t OpId, uint64_t StreamId) { - return (MsgId << ID_SHIFT_) | - (OpId << OP_SHIFT_) | - (StreamId << STREAM_ID_SHIFT_); + return MsgId | (OpId << OP_SHIFT_) | (StreamId << STREAM_ID_SHIFT_); } } // namespace SendMsg @@ -1427,6 +1676,10 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC) { } } +bool isKernelCC(const Function *Func) { + return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); +} + bool hasXNACK(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; } @@ -1448,7 +1701,8 @@ bool hasG16(const MCSubtargetInfo &STI) { } bool hasPackedD16(const MCSubtargetInfo &STI) { - return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; + return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem] && !isCI(STI) && + !isSI(STI); } bool isSI(const MCSubtargetInfo &STI) { @@ -1467,6 +1721,18 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX9_GFX10(const MCSubtargetInfo &STI) { + return isGFX9(STI) || isGFX10(STI); +} + +bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI) { + return isVI(STI) || isGFX9(STI) || isGFX10(STI); +} + +bool isGFX8Plus(const MCSubtargetInfo &STI) { + return isVI(STI) || isGFX9Plus(STI); +} + bool isGFX9Plus(const MCSubtargetInfo &STI) { return isGFX9(STI) || isGFX10Plus(STI); } @@ -1475,7 +1741,29 @@ bool isGFX10(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; } -bool isGFX10Plus(const MCSubtargetInfo &STI) { return isGFX10(STI); } +bool isGFX10Plus(const MCSubtargetInfo &STI) { + return isGFX10(STI) || isGFX11Plus(STI); +} + +bool isGFX11(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX11]; +} + +bool isGFX11Plus(const MCSubtargetInfo &STI) { + return isGFX11(STI); +} + +bool isNotGFX11Plus(const MCSubtargetInfo &STI) { + return !isGFX11Plus(STI); +} + +bool isNotGFX10Plus(const MCSubtargetInfo &STI) { + return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI); +} + +bool isGFX10Before1030(const MCSubtargetInfo &STI) { + return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI); +} bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; @@ -1497,10 
+1785,29 @@ bool isGFX90A(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; } +bool isGFX940(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]; +} + bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; } +bool hasMAIInsts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts]; +} + +bool hasVOPD(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVOPD]; +} + +int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, + int32_t ArgNumVGPR) { + if (has90AInsts && ArgNumAGPR) + return alignTo(ArgNumVGPR, 4) + ArgNumAGPR; + return std::max(ArgNumVGPR, ArgNumAGPR); +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); @@ -1508,13 +1815,6 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } -bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { - if (*R == Reg1) return true; - } - return false; -} - #define MAP_REG2REG \ using namespace AMDGPU; \ switch(Reg) { \ @@ -1554,6 +1854,9 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_VI_GFX9PLUS(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ CASE_VI_GFX9PLUS(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_GFXPRE11_GFX11PLUS(M0) \ + CASE_GFXPRE11_GFX11PLUS(SGPR_NULL) \ + CASE_GFXPRE11_GFX11PLUS_TO(SGPR_NULL64, SGPR_NULL) \ } #define CASE_CI_VI(node) \ @@ -1563,6 +1866,12 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { #define CASE_VI_GFX9PLUS(node) \ case node: return isGFX9Plus(STI) ? node##_gfx9plus : node##_vi; +#define CASE_GFXPRE11_GFX11PLUS(node) \ + case node: return isGFX11Plus(STI) ? node##_gfx11plus : node##_gfxpre11; + +#define CASE_GFXPRE11_GFX11PLUS_TO(node, result) \ + case node: return isGFX11Plus(STI) ? result##_gfx11plus : result##_gfxpre11; + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) return Reg; @@ -1571,9 +1880,13 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { #undef CASE_CI_VI #undef CASE_VI_GFX9PLUS +#undef CASE_GFXPRE11_GFX11PLUS +#undef CASE_GFXPRE11_GFX11PLUS_TO #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; #define CASE_VI_GFX9PLUS(node) case node##_vi: case node##_gfx9plus: return node; +#define CASE_GFXPRE11_GFX11PLUS(node) case node##_gfx11plus: case node##_gfxpre11: return node; +#define CASE_GFXPRE11_GFX11PLUS_TO(node, result) unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG @@ -1581,6 +1894,8 @@ unsigned mc2PseudoReg(unsigned Reg) { #undef CASE_CI_VI #undef CASE_VI_GFX9PLUS +#undef CASE_GFXPRE11_GFX11PLUS +#undef CASE_GFXPRE11_GFX11PLUS_TO #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -1934,7 +2249,7 @@ Optional getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, } unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST, bool Signed) { - // Address offset is 12-bit signed for GFX10, 13-bit for GFX9. + // Address offset is 12-bit signed for GFX10, 13-bit for GFX9 and GFX11+. 
if (AMDGPU::isGFX10(ST)) return Signed ? 12 : 11; @@ -2029,7 +2344,8 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); #define GET_SourcesOfDivergence_IMPL #define GET_Gfx9BufferFormat_IMPL -#define GET_Gfx10PlusBufferFormat_IMPL +#define GET_Gfx10BufferFormat_IMPL +#define GET_Gfx11PlusBufferFormat_IMPL #include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace @@ -2042,16 +2358,20 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI) { - return isGFX10Plus(STI) - ? getGfx10PlusBufferFormatInfo(BitsPerComp, NumComponents, + return isGFX11Plus(STI) + ? getGfx11PlusBufferFormatInfo(BitsPerComp, NumComponents, NumFormat) - : getGfx9BufferFormatInfo(BitsPerComp, NumComponents, NumFormat); + : isGFX10(STI) ? getGfx10BufferFormatInfo(BitsPerComp, + NumComponents, NumFormat) + : getGfx9BufferFormatInfo(BitsPerComp, + NumComponents, NumFormat); } const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, const MCSubtargetInfo &STI) { - return isGFX10Plus(STI) ? getGfx10PlusBufferFormatInfo(Format) - : getGfx9BufferFormatInfo(Format); + return isGFX11Plus(STI) ? getGfx11PlusBufferFormatInfo(Format) + : isGFX10(STI) ? getGfx10BufferFormatInfo(Format) + : getGfx9BufferFormatInfo(Format); } } // namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 4516b511f3c8..dffeec10a14a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -50,10 +50,19 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 5, /// false otherwise. bool isHsaAbiVersion5(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 3 or 4, +/// \returns True if HSA OS ABI Version identification is 3 and above, /// false otherwise. bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI); +/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr +unsigned getMultigridSyncArgImplicitArgPosition(); + +/// \returns The offset of the hostcall pointer argument from implicitarg_ptr +unsigned getHostcallImplicitArgPosition(); + +/// \returns Code object version. +unsigned getAmdhsaCodeObjectVersion(); + struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; @@ -62,12 +71,19 @@ struct GcnBufferFormatInfo { unsigned DataFormat; }; +struct MAIInstInfo { + uint16_t Opcode; + bool is_dgemm; + bool is_gfx940_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL #define GET_MIMGMIPMapping_DECL #define GET_MIMGBiASMapping_DECL +#define GET_MAIInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -352,6 +368,11 @@ struct MIMGG16MappingInfo { LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); +struct WMMAOpcodeMappingInfo { + unsigned Opcode2Addr; + unsigned Opcode3Addr; +}; + LLVM_READONLY const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); @@ -382,6 +403,7 @@ struct MIMGInfo { uint8_t MIMGEncoding; uint8_t VDataDwords; uint8_t VAddrDwords; + uint8_t VAddrOperands; }; LLVM_READONLY @@ -438,6 +460,16 @@ bool getVOP2IsSingle(unsigned Opc); LLVM_READONLY bool getVOP3IsSingle(unsigned Opc); +LLVM_READONLY +bool isVOPC64DPP(unsigned Opc); + +/// Returns true if MAI operation is a double precision GEMM. 
+LLVM_READONLY +bool getMAIIsDGEMM(unsigned Opc); + +LLVM_READONLY +bool getMAIIsGFX940XDL(unsigned Opc); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -450,6 +482,12 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); +LLVM_READONLY +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); + +LLVM_READONLY +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); @@ -496,7 +534,7 @@ struct Waitcnt { unsigned LgkmCnt = ~0u; unsigned VsCnt = ~0u; - Waitcnt() {} + Waitcnt() = default; Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} @@ -555,11 +593,14 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// \p Lgkmcnt respectively. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) -/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) -/// \p Expcnt = \p Waitcnt[6:4] -/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only) -/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only) +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) +/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) +/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+) +/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) +/// \p Expcnt = \p Waitcnt[2:0] (gfx11+) +/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) +/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) +/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+) void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -581,12 +622,15 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, /// \p Version. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: -/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) -/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) -/// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only) -/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only) -/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) +/// Waitcnt[2:0] = \p Expcnt (gfx11+) +/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9) +/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10) +/// Waitcnt[6:4] = \p Expcnt (pre-gfx11) +/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+) +/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10) +/// Waitcnt[13:8] = \p Lgkmcnt (gfx10) +/// Waitcnt[15:10] = \p Vmcnt (gfx11+) +/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. 
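
As a concrete instance of the gfx11+ layout documented above (shifts and widths hard-coded from the comment purely for illustration; the in-tree helpers derive them from IsaVersion):

#include <cassert>

int main() {
  // gfx11+: Expcnt -> Waitcnt[2:0], Lgkmcnt -> Waitcnt[9:4],
  //         Vmcnt -> Waitcnt[15:10]; bit 3 is unused.
  unsigned Vmcnt = 5, Expcnt = 3, Lgkmcnt = 9;
  unsigned Waitcnt = (Expcnt & 0x7)            // bits [2:0]
                     | ((Lgkmcnt & 0x3F) << 4) // bits [9:4]
                     | ((Vmcnt & 0x3F) << 10); // bits [15:10]
  assert(Waitcnt == ((5u << 10) | (9u << 4) | 3u));
  // Decoding reverses the masks and shifts.
  assert(((Waitcnt >> 10) & 0x3F) == Vmcnt);
  assert(((Waitcnt >> 4) & 0x3F) == Lgkmcnt);
  assert((Waitcnt & 0x7) == Expcnt);
  return 0;
}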
@@ -598,10 +642,7 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); namespace Hwreg { LLVM_READONLY -int64_t getHwregId(const StringRef Name); - -LLVM_READNONE -bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI); +int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); LLVM_READNONE bool isValidHwreg(int64_t Id); @@ -622,6 +663,18 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); } // namespace Hwreg +namespace DepCtr { + +int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI); +int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, + const MCSubtargetInfo &STI); +bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, + const MCSubtargetInfo &STI); +bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, + bool &IsDefault, const MCSubtargetInfo &STI); + +} // namespace DepCtr + namespace Exp { bool getTgtName(unsigned Id, StringRef &Name, int &Index); @@ -653,13 +706,14 @@ bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI); bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI); -int64_t getUnifiedFormat(const StringRef Name); +int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI); -StringRef getUnifiedFormatName(unsigned Id); +StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI); -bool isValidUnifiedFormat(unsigned Val); +bool isValidUnifiedFormat(unsigned Val, const MCSubtargetInfo &STI); -int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt); +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt, + const MCSubtargetInfo &STI); bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI); @@ -670,19 +724,19 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI); namespace SendMsg { LLVM_READONLY -int64_t getMsgId(const StringRef Name); +int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI); LLVM_READONLY int64_t getMsgOpId(int64_t MsgId, const StringRef Name); LLVM_READNONE -StringRef getMsgName(int64_t MsgId); +StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE -StringRef getMsgOpName(int64_t MsgId, int64_t OpId); +StringRef getMsgOpName(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI); LLVM_READNONE -bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true); +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, @@ -693,15 +747,13 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, const MCSubtargetInfo &STI, bool Strict = true); LLVM_READNONE -bool msgRequiresOp(int64_t MsgId); +bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE -bool msgSupportsStream(int64_t MsgId, int64_t OpId); +bool msgSupportsStream(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI); -void decodeMsg(unsigned Val, - uint16_t &MsgId, - uint16_t &OpId, - uint16_t &StreamId); +void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId, + uint16_t &StreamId, const MCSubtargetInfo &STI); LLVM_READNONE uint64_t encodeMsg(uint64_t MsgId, @@ -738,6 +790,8 @@ bool isEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE bool isModuleEntryFunctionCC(CallingConv::ID CC); +bool isKernelCC(const Function *Func); + // FIXME: Remove this when calling conventions cleaned up LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { @@ -761,22 +815,31 @@ bool isSI(const MCSubtargetInfo &STI); bool 
isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX9_GFX10(const MCSubtargetInfo &STI); +bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI); +bool isGFX8Plus(const MCSubtargetInfo &STI); bool isGFX9Plus(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); bool isGFX10Plus(const MCSubtargetInfo &STI); +bool isNotGFX10Plus(const MCSubtargetInfo &STI); +bool isGFX10Before1030(const MCSubtargetInfo &STI); +bool isGFX11(const MCSubtargetInfo &STI); +bool isGFX11Plus(const MCSubtargetInfo &STI); +bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); bool isGFX10_AEncoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI); +bool isGFX940(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); +bool hasMAIInsts(const MCSubtargetInfo &STI); +bool hasVOPD(const MCSubtargetInfo &STI); +int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); -/// Is there any intersection between registers -bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); - /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); @@ -931,7 +994,7 @@ inline bool isLegal64BitDPPControl(unsigned DC) { /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); -// Track defaults for fields in the MODE registser. +// Track defaults for fields in the MODE register. struct SIModeRegisterDefaults { /// Floating point opcodes that support exception flag gathering quiet and /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10 diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp deleted file mode 100644 index a83ff6667956..000000000000 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===- AMDGPULDSUtils.cpp -------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// AMDGPU LDS related helper utility functions. 
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULDSUtils.h"
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/ReplaceConstant.h"
-
-using namespace llvm;
-
-namespace llvm {
-
-namespace AMDGPU {
-
-bool isKernelCC(const Function *Func) {
-  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
-}
-
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
-  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
-                                       GV->getValueType());
-}
-
-static void collectFunctionUses(User *U, const Function *F,
-                                SetVector<Instruction *> &InstUsers) {
-  SmallVector<User *> Stack{U};
-
-  while (!Stack.empty()) {
-    U = Stack.pop_back_val();
-
-    if (auto *I = dyn_cast<Instruction>(U)) {
-      if (I->getFunction() == F)
-        InstUsers.insert(I);
-      continue;
-    }
-
-    if (!isa<ConstantExpr>(U))
-      continue;
-
-    append_range(Stack, U->users());
-  }
-}
-
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
-  SetVector<Instruction *> InstUsers;
-
-  collectFunctionUses(C, F, InstUsers);
-  for (Instruction *I : InstUsers) {
-    convertConstantExprsToInstructions(I, C);
-  }
-}
-
-static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
-                                   const Function *F) {
-  // We are not interested in kernel LDS lowering for module LDS itself.
-  if (F && GV.getName() == "llvm.amdgcn.module.lds")
-    return false;
-
-  bool Ret = false;
-  SmallPtrSet<const User *, 8> Visited;
-  SmallVector<const User *, 16> Stack(GV.users());
-
-  assert(!F || isKernelCC(F));
-
-  while (!Stack.empty()) {
-    const User *V = Stack.pop_back_val();
-    Visited.insert(V);
-
-    if (isa<GlobalValue>(V)) {
-      // This use of the LDS variable is the initializer of a global variable.
-      // This is ill formed. The address of an LDS variable is kernel dependent
-      // and unknown until runtime. It can't be written to a global variable.
-      continue;
-    }
-
-    if (auto *I = dyn_cast<Instruction>(V)) {
-      const Function *UF = I->getFunction();
-      if (UF == F) {
-        // Used from this kernel, we want to put it into the structure.
-        Ret = true;
-      } else if (!F) {
-        // For module LDS lowering, lowering is required if the user instruction
-        // is from non-kernel function.
-        Ret |= !isKernelCC(UF);
-      }
-      continue;
-    }
-
-    // User V should be a constant, recursively visit users of V.
-    assert(isa<Constant>(V) && "Expected a constant.");
-    append_range(Stack, V->users());
-  }
-
-  return Ret;
-}
-
-std::vector<GlobalVariable *> findVariablesToLower(Module &M,
-                                                   const Function *F) {
-  std::vector<GlobalVariable *> LocalVars;
-  for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for LDS address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (!shouldLowerLDSToStruct(GV, F)) {
-      continue;
-    }
-    LocalVars.push_back(&GV);
-  }
-  return LocalVars;
-}
-
-} // end namespace AMDGPU
-
-} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
deleted file mode 100644
index 83ef68cc3f60..000000000000
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// AMDGPU LDS related helper utility functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/Constants.h"
-
-namespace llvm {
-
-class ConstantExpr;
-
-namespace AMDGPU {
-
-bool isKernelCC(const Function *Func);
-
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
-
-std::vector<GlobalVariable *> findVariablesToLower(Module &M,
-                                                   const Function *F = nullptr);
-
-/// Replace all uses of constant \p C with instructions in \p F.
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
-} // end namespace AMDGPU
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
new file mode 100644
index 000000000000..83d7cbdb183c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -0,0 +1,220 @@
+//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+#define DEBUG_TYPE "amdgpu-memory-utils"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                       GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+                                SetVector<Instruction *> &InstUsers) {
+  SmallVector<User *> Stack{U};
+
+  while (!Stack.empty()) {
+    U = Stack.pop_back_val();
+
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      if (I->getFunction() == F)
+        InstUsers.insert(I);
+      continue;
+    }
+
+    if (!isa<ConstantExpr>(U))
+      continue;
+
+    append_range(Stack, U->users());
+  }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+  SetVector<Instruction *> InstUsers;
+
+  collectFunctionUses(C, F, InstUsers);
+  for (Instruction *I : InstUsers) {
+    convertConstantExprsToInstructions(I, C);
+  }
+}
+
+static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+                                   const Function *F) {
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds")
+    return false;
+
+  bool Ret = false;
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV.users());
+
+  assert(!F || isKernelCC(F));
+
+  while (!Stack.empty()) {
+    const User *V = Stack.pop_back_val();
+    Visited.insert(V);
+
+    if (isa<GlobalValue>(V)) {
+      // This use of the LDS variable is the initializer of a global variable.
+      // This is ill-formed. The address of an LDS variable is kernel dependent
+      // and unknown until runtime. It can't be written to a global variable.
+      continue;
+    }
+
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      const Function *UF = I->getFunction();
+      if (UF == F) {
+        // Used from this kernel, we want to put it into the structure.
+        Ret = true;
+      } else if (!F) {
+        // For module LDS lowering, lowering is required if the user instruction
+        // is from a non-kernel function.
+        Ret |= !isKernelCC(UF);
+      }
+      continue;
+    }
+
+    // User V should be a constant, recursively visit users of V.
+    assert(isa<Constant>(V) && "Expected a constant.");
+    append_range(Stack, V->users());
+  }
+
+  return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F) {
+  std::vector<GlobalVariable *> LocalVars;
+  for (auto &GV : M.globals()) {
+    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      continue;
+    }
+    if (!GV.hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__;
+      // the semantics for such a variable appear to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required.
+      continue;
+    }
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Initializers are unimplemented for LDS address space.
+      // Leave such variables in place for consistent error reporting.
+      continue;
+    }
+    if (GV.isConstant()) {
+      // A constant undef variable can't be written to, and any load is
+      // undef, so it should be eliminated by the optimizer. It could be
+      // dropped by the back end if not. This pass skips over it.
+      continue;
+    }
+    if (!shouldLowerLDSToStruct(GV, F)) {
+      continue;
+    }
+    LocalVars.push_back(&GV);
+  }
+  return LocalVars;
+}
+
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
+  Instruction *DefInst = Def->getMemoryInst();
+
+  if (isa<FenceInst>(DefInst))
+    return false;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_wave_barrier:
+    case Intrinsic::amdgcn_sched_barrier:
+      return false;
+    default:
+      break;
+    }
+  }
+
+  // Ignore atomics that do not alias the original load; from MSSA's point of
+  // view any atomic is a universal MemoryDef, just like a fence.
+  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+  };
+
+  if (checkNoAlias(dyn_cast<LoadInst>(DefInst)) ||
+      checkNoAlias(dyn_cast<StoreInst>(DefInst)))
+    return false;
+
+  return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<const MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access; it will be either
+  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
+  // a MemoryPhi if several MemoryDefs can define this memory state. In that
+  // case add all the Defs to the WorkList and continue going up, checking all
+  // the definitions of this memory location until the root. When all the
+  // defs are exhausted and we have come to the entry state, there is no
+  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
+  // considers clobbers but which do not really write anything into memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
+
+      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
+  return false;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
new file mode 100644
index 000000000000..65ed02ca62de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,51 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+#include <vector>
+
+namespace llvm {
+
+struct Align;
+class AAResults;
+class ConstantExpr;
+class DataLayout;
+class Function;
+class GlobalVariable;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Module;
+class Value;
+
+namespace AMDGPU {
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F = nullptr);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+
+/// Given a \p Def clobbering a load from \p Ptr according to the MSSA, check
+/// if this is actually a memory update or an artificial clobber to facilitate
+/// ordering constraints.
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index f6b5975f1934..4ad93f7b0b68 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -209,6 +209,11 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
   getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+  getHwStage(CC)[".agpr_count"] = Val;
+}
+
 // Set the number of used sgprs in the metadata. This is an optional advisory
 // record for logging etc; wave dispatch actually uses the rsrc1 register for
 // the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 7fdd9a8429c1..a45a799e38a9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -69,6 +69,10 @@ public:
   // the shader stage to determine the number of vgprs to allocate.
   void setNumUsedVgprs(unsigned CC, unsigned Val);
 
+  // Set the number of used agprs in the metadata. This is an optional advisory
+  // record for logging etc.
+  void setNumUsedAgprs(unsigned CC, unsigned Val);
+
   // Set the number of used sgprs in the metadata. This is an optional advisory
   // record for logging etc; wave dispatch actually uses the rsrc1 register for
   // the shader stage to determine the number of sgprs to allocate.
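To show how the new AMDGPUMemoryUtils helpers above are meant to be consumed, here is a hypothetical function pass driving AMDGPU::isClobberedInFunction. The pass and its counting logic are invented for illustration; only the helper's signature and the standard MemorySSAAnalysis/AAManager analyses are assumed:

// Sketch of a client of AMDGPU::isClobberedInFunction. The pass itself is
// made up; a real client (e.g. an annotator marking loads that are safe to
// speculate across barriers) would act on the result instead of counting.
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct CountUnclobberedLoads : PassInfoMixin<CountUnclobberedLoads> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM) {
    MemorySSA &MSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
    AAResults &AA = FAM.getResult<AAManager>(F);
    unsigned Unclobbered = 0;
    for (Instruction &I : instructions(F))
      if (auto *LI = dyn_cast<LoadInst>(&I))
        // Loads with no real clobber in the function are candidates for
        // hoisting or speculation; barriers and fences are ignored by the
        // walk, per isReallyAClobber above.
        if (!AMDGPU::isClobberedInFunction(LI, &MSSA, &AA))
          ++Unclobbered;
    (void)Unclobbered; // analysis only; nothing is modified
    return PreservedAnalyses::all();
  }
};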
diff --git a/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
index bd65a495fa72..7393ef6c2a2d 100644
--- a/llvm/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-class EXPe_vi : EXPe {
+class EXPe_vi : EXPe_ComprVM {
   let Inst{31-26} = 0x31; //encoding
 }
 
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
new file mode 100644
index 000000000000..c63fbbc241d9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -0,0 +1,180 @@
+//===-- VINTERPInstructions.td - VINTERP Instruction Definitions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VINTERP encoding
+//===----------------------------------------------------------------------===//
+
+class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
+  bits<8> vdst;
+  bits<4> src0_modifiers;
+  bits<9> src0;
+  bits<3> src1_modifiers;
+  bits<9> src1;
+  bits<3> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+  bits<3> waitexp;
+
+  let Inst{31-26} = 0x33; // VOP3P encoding
+  let Inst{25-24} = 0x1; // VINTERP sub-encoding
+  let Inst{23} = 0; // reserved
+
+  let Inst{7-0} = vdst;
+  let Inst{10-8} = waitexp;
+  let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
+  let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
+  let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+  let Inst{14} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3)
+  let Inst{15} = clamp;
+  let Inst{22-16} = op;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{61} = src0_modifiers{0}; // neg(0)
+  let Inst{62} = src1_modifiers{0}; // neg(1)
+  let Inst{63} = src2_modifiers{0}; // neg(2)
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 VINTERP
+//===----------------------------------------------------------------------===//
+
+class VINTERP_Pseudo <string OpName, VOPProfile P, list<dag> pattern = []> :
+  VOP3_Pseudo<OpName, P, pattern> {
+  let AsmMatchConverter = "cvtVINTERP";
+  let mayRaiseFPException = 0;
+
+  let VOP3_OPSEL = 1;
+  let VINTERP = 1;
+}
+
+class VINTERP_Real <VOP_Pseudo ps, int EncodingFamily> :
+  VOP3_Real <ps, EncodingFamily> {
+  let VINTERP = 1;
+}
+
+def VOP3_VINTERP_F32 : VOPProfile<[f32, f32, f32, f32]> {
+  let HasOpSel = 0;
+  let HasModifiers = 1;
+
+  let Outs64 = (outs VGPR_32:$vdst);
+  let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+                   Src1Mod:$src1_modifiers, VRegSrc_32:$src1,
+                   Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+                   clampmod:$clamp,
+                   wait_exp:$waitexp);
+
+  let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$waitexp";
+}
+
+class VOP3_VINTERP_F16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+  let HasOpSel = 1;
+  let HasModifiers = 1;
+
+  let Outs64 = (outs VGPR_32:$vdst);
+  let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+                   Src1Mod:$src1_modifiers, VRegSrc_32:$src1,
+                   Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+                   clampmod:$clamp, op_sel0:$op_sel,
+                   wait_exp:$waitexp);
+
+  let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$op_sel$waitexp";
+}
+
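As a cross-check of the VINTERPe_gfx11 layout above, here is a small hypothetical encoder that packs a few of the declared fields into the 64-bit instruction word. Only the bit positions are taken from the class; the function itself is not part of the backend:

// Packs vdst (Inst{7-0}), waitexp (Inst{10-8}), opcode (Inst{22-16}),
// the fixed VOP3P/VINTERP tag (Inst{31-24}), and src0 (Inst{40-32}).
// Modifiers, clamp, op_sel, and src1/src2 are omitted for brevity.
#include <cstdint>
#include <cstdio>

static uint64_t encodeVINTERP(uint8_t Vdst, uint8_t WaitExp, uint8_t Op,
                              uint16_t Src0) {
  uint64_t Inst = 0;
  Inst |= (uint64_t)0x33 << 26;             // VOP3P encoding, Inst{31-26}
  Inst |= (uint64_t)0x1 << 24;              // VINTERP sub-encoding, Inst{25-24}
  Inst |= (uint64_t)Vdst;                   // Inst{7-0}
  Inst |= (uint64_t)(WaitExp & 0x7) << 8;   // Inst{10-8}
  Inst |= (uint64_t)(Op & 0x7f) << 16;      // Inst{22-16}
  Inst |= (uint64_t)(Src0 & 0x1ff) << 32;   // Inst{40-32}
  return Inst;
}

int main() {
  // v_interp_p10_f32 is opcode 0x000 per the Real definitions further down.
  printf("%016llx\n", (unsigned long long)encodeVINTERP(1, 7, 0x000, 0x100));
  return 0;
}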
+//===----------------------------------------------------------------------===// +// VINTERP Pseudo Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Plus in { + +let Uses = [M0, EXEC, MODE] in { +def V_INTERP_P10_F32_inreg : VINTERP_Pseudo <"v_interp_p10_f32", VOP3_VINTERP_F32>; +def V_INTERP_P2_F32_inreg : VINTERP_Pseudo <"v_interp_p2_f32", VOP3_VINTERP_F32>; +def V_INTERP_P10_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p10_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>; +def V_INTERP_P2_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p2_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>; +} // Uses = [M0, EXEC, MODE] + +let Uses = [M0, EXEC] in { +def V_INTERP_P10_RTZ_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p10_rtz_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>; +def V_INTERP_P2_RTZ_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p2_rtz_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>; +} // Uses = [M0, EXEC] + +} // SubtargetPredicate = isGFX11Plus + +class VInterpF32Pat : GCNPat < + (f32 (op + (VINTERPMods f32:$src0, i32:$src0_modifiers), + (VINTERPMods f32:$src1, i32:$src1_modifiers), + (VINTERPMods f32:$src2, i32:$src2_modifiers))), + (inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + 0, /* clamp */ + 7) /* wait_exp */ +>; + +def VINTERP_OPSEL { + int LOW = 0; + int HIGH = 0xa; +} + +class VInterpF16Pat pat> : GCNPat < + (dst_type (op + (pat[0] f32:$src0, i32:$src0_modifiers), + (pat[1] f32:$src1, i32:$src1_modifiers), + (pat[2] f32:$src2, i32:$src2_modifiers), + !if(high, (i1 -1), (i1 0)))), + (inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + 0, /* clamp */ + /* op_sel = 0 */ + 7) /* wait_exp */ +>; + +multiclass VInterpF16Pat high_pat> { + def : VInterpF16Pat; + def : VInterpF16Pat; +} + +def : VInterpF32Pat; +def : VInterpF32Pat; +defm : VInterpF16Pat; +defm : VInterpF16Pat; + +//===----------------------------------------------------------------------===// +// VINTERP Real Instructions +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { + multiclass VINTERP_Real_gfx11 op> { + def _gfx11 : + VINTERP_Real(NAME), SIEncodingFamily.GFX11>, + VINTERPe_gfx11(NAME).Pfl>; + } +} + +defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>; +defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>; +defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>; +defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>; +defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>; +defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 48548d8b6722..1d374a9f90ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -59,9 +59,9 @@ class VOP1_Pseudo pattern=[], bit VOP1On let AsmVariantName = AMDGPUAsmVariants.Default; } -class VOP1_Real : +class VOP1_Real : VOP_Real , - InstSI , + InstSI , SIMCInstr { let VALU = 1; @@ -110,13 +110,18 @@ class getVOP1Pat64 : LetDummies { } multiclass VOP1Inst { + SDPatternOperator node = null_frag, int VOPDOp = -1> { // We only want to set this on the basic, non-SDWA or DPP forms. 
- defvar should_mov_imm = !eq(opName, "v_mov_b32"); + defvar should_mov_imm = !or(!eq(opName, "v_mov_b32"), + !eq(opName, "v_mov_b64")); let isMoveImm = should_mov_imm in { - def _e32 : VOP1_Pseudo ; - def _e64 : VOP3_Pseudo .ret>; + if !eq(VOPDOp, -1) then + def _e32 : VOP1_Pseudo ; + else + // Only for V_MOV_B32 + def _e32 : VOP1_Pseudo , VOPD_Component; + def _e64 : VOP3InstBase ; } foreach _ = BoolToList.ret in @@ -125,6 +130,11 @@ multiclass VOP1Inst .ret in def _dpp : VOP1_DPP_Pseudo ; + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList.ret in + def _e64_dpp : VOP3_DPP_Pseudo ; + } // End SubtargetPredicate = isGFX11Plus + def : MnemonicAlias, LetDummies; def : MnemonicAlias, LetDummies; @@ -141,7 +151,9 @@ class VOPProfileI2F : VOPProfile<[dstVt, srcVt, untyped, untyped]> { let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod); + let InsVOP3Base = (ins Src0DPP:$src0, clampmod:$clamp, omod:$omod); let Asm64 = "$vdst, $src0$clamp$omod"; + let AsmVOP3DPPBase = Asm64; let HasModifiers = 0; let HasClamp = 1; @@ -151,6 +163,12 @@ def VOP1_F64_I32 : VOPProfileI2F ; def VOP1_F32_I32 : VOPProfileI2F ; def VOP1_F16_I16 : VOPProfileI2F ; +def VOP_NOP_PROFILE : VOPProfile <[untyped, untyped, untyped, untyped]>{ + let HasExtVOP3DPP = 0; +} + +// OMod clears exceptions when set. OMod was always an operand, but its +// now explicitly set. class VOP_SPECIAL_OMOD_PROF : VOPProfile<[dstVt, srcVt, untyped, untyped]> { @@ -165,11 +183,21 @@ def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF; //===----------------------------------------------------------------------===// let VOPAsmPrefer32Bit = 1 in { -defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; +defm V_NOP : VOP1Inst <"v_nop", VOP_NOP_PROFILE>; +} + +def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> { + let InsVOPDX = (ins Src0RC32:$src0X); + let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X); + let InsVOPDY = (ins Src0RC32:$src0Y); + let InsVOPDYDeferred = (ins VSrc_f32_Deferred:$src0Y); } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; +defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOPProfile_MOV, null_frag, 0x8>; + +let SubtargetPredicate = isGFX940Plus in +defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>; } // End isMoveImm = 1 // FIXME: Specify SchedRW for READFIRSTLANE_B32 @@ -282,7 +310,7 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; +defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; @@ -472,7 +500,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only let SubtargetPredicate = isGFX10Plus in { - defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>; + defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT>; let Uses = [M0] in { defm V_MOVRELSD_2_B32 : @@ -498,6 +526,17 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1 let isAsCheapAsAMove = 1; } +let SubtargetPredicate = isGFX11Plus in { + // Restrict src0 to be VGPR + def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, + getVOP1Pat64.ret, + /*VOP1Only=*/ 1>; + defm V_NOT_B16 : 
VOP1Inst<"v_not_b16", VOP_I16_I16>; + defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>; + defm V_CVT_U32_U16 : VOP1Inst<"v_cvt_u32_u16", VOP_I16_I16>; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// @@ -517,9 +556,9 @@ class VOP1_DPP op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1 let Inst{31-25} = 0x3f; } -class VOP1_DPP16 op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> : +class VOP1_DPP16 op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = ps.Pfl> : VOP1_DPP, - SIMCInstr { + SIMCInstr { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; } @@ -538,11 +577,113 @@ class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : let Inst{31-25} = 0x3f; } +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { + multiclass VOP1Only_Real_gfx11 op> { + let IsSingle = 1 in + def _gfx11 : + VOP1_Real(NAME), SIEncodingFamily.GFX11>, + VOP1e(NAME).Pfl>; + } + multiclass VOP1_Real_e32_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName#"_e32"); + def _e32_gfx11 : + VOP1_Real, + VOP1e; + } + multiclass VOP1_Real_e32_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands in { + defm NAME : VOP1_Real_e32_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + } + } + multiclass VOP1_Real_e64_gfx11 op> { + def _e64_gfx11 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX11>, + VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP1_Real_dpp_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName#"_e32"); + def _dpp_gfx11 : VOP1_DPP16(opName#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP1_Real_dpp_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in { + defm NAME : VOP1_Real_dpp_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + } + } + multiclass VOP1_Real_dpp8_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName#"_e32"); + def _dpp8_gfx11 : VOP1_DPP8 { + let DecoderNamespace = "DPP8GFX11"; + } + } + multiclass VOP1_Real_dpp8_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in { + defm NAME : VOP1_Real_dpp8_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + } + } +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +multiclass VOP1_Realtriple_e64_gfx11 op> { + defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>; +} +multiclass VOP1_Realtriple_e64_with_name_gfx11 op, string opName, + string asmName> { + defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 1, op{6-0}}, opName, + asmName>; +} + +multiclass VOP1_Real_FULL_gfx11 op> : + VOP1_Real_e32_gfx11, VOP1_Realtriple_e64_gfx11, + VOP1_Real_dpp_gfx11, VOP1_Real_dpp8_gfx11; + +multiclass VOP1_Real_NO_VOP3_with_name_gfx11 op, string opName, + string asmName> : + VOP1_Real_e32_with_name_gfx11, + VOP1_Real_dpp_with_name_gfx11, + VOP1_Real_dpp8_with_name_gfx11; + +multiclass 
VOP1_Real_FULL_with_name_gfx11 op, string opName, + string asmName> : + VOP1_Real_NO_VOP3_with_name_gfx11, + VOP1_Realtriple_e64_with_name_gfx11; + +multiclass VOP1_Real_NO_DPP_gfx11 op> : + VOP1_Real_e32_gfx11, VOP1_Real_e64_gfx11; + +defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00c, + "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; +defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00d, + "V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">; +defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11<0x039, + "V_FFBH_U32", "v_clz_i32_u32">; +defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a, + "V_FFBL_B32", "v_ctz_i32_b32">; +defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b, + "V_FFBH_I32", "v_cls_i32">; +defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>; +defm V_NOT_B16 : VOP1_Real_FULL_gfx11<0x069>; +defm V_CVT_I32_I16 : VOP1_Real_FULL_gfx11<0x06a>; +defm V_CVT_U32_U16 : VOP1_Real_FULL_gfx11<0x06b>; + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOP1Only_Real_gfx10 op> { def _gfx10 : VOP1_Real(NAME), SIEncodingFamily.GFX10>, @@ -567,50 +708,59 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP1_Real_dpp_gfx10 op> { - foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP1_DPP16(NAME#"_dpp")> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP1_DPP16(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass VOP1_Real_dpp8_gfx10 op> { - foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" multiclass VOP1_Real_gfx10 op> : VOP1_Real_e32_gfx10, VOP1_Real_e64_gfx10, VOP1_Real_sdwa_gfx10, VOP1_Real_dpp_gfx10, VOP1_Real_dpp8_gfx10; -defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>; -defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>; -defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; -defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; -defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>; -defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>; -defm V_RCP_F16 : VOP1_Real_gfx10<0x054>; -defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>; -defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>; -defm V_LOG_F16 : VOP1_Real_gfx10<0x057>; -defm V_EXP_F16 : VOP1_Real_gfx10<0x058>; -defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>; -defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>; -defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>; -defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>; -defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>; -defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>; -defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>; -defm V_SIN_F16 : VOP1_Real_gfx10<0x060>; -defm V_COS_F16 : VOP1_Real_gfx10<0x061>; -defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; -defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; -defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; - -defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>; -defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>; +multiclass VOP1_Real_gfx10_FULL_gfx11 op> : + VOP1_Real_gfx10, VOP1_Real_FULL_gfx11; + +multiclass VOP1_Real_gfx10_NO_DPP_gfx11 
op> : + VOP1_Real_gfx10, VOP1_Real_NO_DPP_gfx11; + +multiclass VOP1Only_Real_gfx10_gfx11 op> : + VOP1Only_Real_gfx10, VOP1Only_Real_gfx11; + +defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11<0x01b>; +defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11<0x048>; +defm V_CVT_F16_U16 : VOP1_Real_gfx10_FULL_gfx11<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_gfx10_FULL_gfx11<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x052>; +defm V_CVT_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x053>; +defm V_RCP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x054>; +defm V_SQRT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x055>; +defm V_RSQ_F16 : VOP1_Real_gfx10_FULL_gfx11<0x056>; +defm V_LOG_F16 : VOP1_Real_gfx10_FULL_gfx11<0x057>; +defm V_EXP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x058>; +defm V_FREXP_MANT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x059>; +defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05a>; +defm V_FLOOR_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05b>; +defm V_CEIL_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05c>; +defm V_TRUNC_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05e>; +defm V_FRACT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05f>; +defm V_SIN_F16 : VOP1_Real_gfx10_FULL_gfx11<0x060>; +defm V_COS_F16 : VOP1_Real_gfx10_FULL_gfx11<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10_FULL_gfx11<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x064>; + +defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11<0x065>; +defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11<0x068>; //===----------------------------------------------------------------------===// // GFX7, GFX10. @@ -635,16 +785,19 @@ multiclass VOP1_Real_gfx7 op> : multiclass VOP1_Real_gfx7_gfx10 op> : VOP1_Real_gfx7, VOP1_Real_gfx10; +multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11 op> : + VOP1_Real_gfx7_gfx10, VOP1_Real_NO_DPP_gfx11; + defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; -defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>; -defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>; -defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>; -defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>; +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x01a>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. +// GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { @@ -666,65 +819,71 @@ multiclass VOP1_Real_gfx6_gfx7 op> : multiclass VOP1_Real_gfx6_gfx7_gfx10 op> : VOP1_Real_gfx6_gfx7, VOP1_Real_gfx10; -defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; -defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; -defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; -defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; -defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; -defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; -defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; - -defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>; -defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>; -defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>; -defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>; -defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>; -defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>; -defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>; -defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>; -defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>; -defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; +multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11 op> : + VOP1_Real_gfx6_gfx7_gfx10, VOP1_Real_FULL_gfx11; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11 op> : + VOP1_Real_gfx6_gfx7_gfx10, VOP1_Real_NO_DPP_gfx11; + +defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; +defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; +defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; +defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; +defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; +defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; +defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; + +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x001>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x008>; +defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00b>; defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>; -defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>; -defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>; -defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>; -defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>; -defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>; -defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>; -defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>; -defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>; -defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>; -defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>; -defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>; -defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm V_RCP_IFLAG_F32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x02b>; -defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>; -defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>; -defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>; -defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>; -defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>; -defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>; -defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>; -defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>; -defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x033>; +defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x038>; defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>; -defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>; -defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>; -defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x040>; defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; -defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x042>; -defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x043>; -defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x044>; +defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x042>; +defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x043>; +defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x044>; 
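The defm lines above reuse one 7-bit VOP1 opcode space from gfx6 through gfx11, with each _Real_* multiclass controlling which encodings (e32, e64, dpp, dpp8) exist per generation. Below is a toy model of the resulting pseudo-to-MC mapping, in the spirit of the getMCOpcode(uint16_t Opcode, unsigned Gen) helper declared in AMDGPUBaseInfo earlier in this patch; all names and table values here are illustrative, not the generated tables:

// Toy model: one pseudo opcode fans out to per-encoding-family MC opcodes,
// selected at lowering time. -1 would mark "not available in this family".
#include <cstdint>
#include <cstdio>

enum EncodingFamily { SI = 0, GFX10 = 1, GFX11 = 2, NumFamilies = 3 };
enum PseudoOp { V_MOV_B32, V_BFREV_B32, NumPseudos };

static const int16_t MCOpcodeTable[NumPseudos][NumFamilies] = {
    /* V_MOV_B32   */ {0x001, 0x001, 0x001},
    /* V_BFREV_B32 */ {0x038, 0x038, 0x038},
};

static int16_t getMCOpcodeToy(PseudoOp Op, EncodingFamily Fam) {
  return MCOpcodeTable[Op][Fam];
}

int main() {
  printf("v_mov_b32 on gfx11: 0x%03x\n", getMCOpcodeToy(V_MOV_B32, GFX11));
  return 0;
}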
//===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). @@ -949,14 +1108,29 @@ multiclass VOP1_Real_gfx9 op> { defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; +let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in +defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; + //===----------------------------------------------------------------------===// // GFX10 //===----------------------------------------------------------------------===// -let OtherPredicates = [isGFX10Plus] in { +let OtherPredicates = [isGFX10Only] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), (V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) >; -} // End OtherPredicates = [isGFX10Plus] +} // End OtherPredicates = [isGFX10Only] + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +let OtherPredicates = [isGFX11Only] in { +def : GCNPat < + (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), + (V_MOV_B32_dpp8_gfx11 VGPR_32:$src, VGPR_32:$src, + (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) +>; +} // End OtherPredicates = [isGFX11Only] diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index b9ff814a4dc5..1485a1e63129 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -80,9 +80,9 @@ class VOP2_Pseudo pattern=[], string suf let AsmVariantName = AMDGPUAsmVariants.Default; } -class VOP2_Real : +class VOP2_Real : VOP_Real , - InstSI , + InstSI , SIMCInstr { let VALU = 1; @@ -140,15 +140,26 @@ multiclass VOP2Inst_e32; } // End renamedInGFX9 = GFX9Renamed } - +multiclass + VOP2Inst_e32_VOPD VOPDOp, + string VOPDName, SDPatternOperator node = null_frag, + string revOp = opName, bit GFX9Renamed = 0> { + defm NAME : VOP2Inst_e32, + VOPD_Component; +} multiclass VOP2Inst_e64 { let renamedInGFX9 = GFX9Renamed in { - def _e64 : VOP3_Pseudo .ret>, + def _e64 : VOP3InstBase , Commutable_REV; + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList.ret in + def _e64_dpp : VOP3_DPP_Pseudo ; + } // End SubtargetPredicate = isGFX11Plus } // End renamedInGFX9 = GFX9Renamed } @@ -175,6 +186,22 @@ multiclass VOP2Inst VOPDOp, + string VOPDName, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> : + VOP2Inst_e32_VOPD, + VOP2Inst_e64, + VOP2Inst_sdwa { + let renamedInGFX9 = GFX9Renamed in { + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; + } +} + multiclass VOP2bInst .ret in def _dpp : VOP2_DPP_Pseudo ; - } + } // End Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] - def _e64 : VOP3_Pseudo .ret>, + def _e64 : VOP3InstBase , Commutable_REV; + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList.ret in + def _e64_dpp : VOP3_DPP_Pseudo ; + } // End SubtargetPredicate = isGFX11Plus } } } @@ -220,16 +252,19 @@ multiclass VOP2bInstAliases { } } -multiclass VOP2eInst { +multiclass + VOP2eInst_Base VOPDOp, string VOPDName, + SDPatternOperator node, string revOp, bit useSGPRInput> { let SchedRW = [Write32Bit] in { let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { - def _e32 : VOP2_Pseudo , - Commutable_REV; + if !eq(VOPDOp, -1) then + def _e32 : VOP2_Pseudo , + Commutable_REV; + else + def _e32 : VOP2_Pseudo , + Commutable_REV, + VOPD_Component; foreach _ = BoolToList.ret in def _sdwa : VOP2_SDWA_Pseudo { @@ -240,13 +275,29 
@@ multiclass VOP2eInst ; } - def _e64 : VOP3_Pseudo .ret>, + def _e64 : VOP3InstBase , Commutable_REV { let isReMaterializable = 1; } + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList.ret in + def _e64_dpp : VOP3_DPP_Pseudo ; + } // End SubtargetPredicate = isGFX11Plus } } +multiclass + VOP2eInst + : VOP2eInst_Base; + +multiclass + VOP2eInst_VOPD VOPDOp, string VOPDName, + SDPatternOperator node = null_frag, string revOp = opName, + bit useSGPRInput = !eq(P.NumSrcArgs, 3)> + : VOP2eInst_Base; + class VOP2eInstAlias : InstAlias { } } -class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MADK_Base : VOPProfile <[vt, vt, vt, vt]> { + string AsmVOPDXDeferred = ?; +} + +class VOP_MADAK : VOP_MADK_Base { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm), (ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm)); + field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); + // Note that both src0X and imm are deferred + let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immDeferred); + field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, VGPR_32:$vsrc1Y, ImmOpType:$imm); + field string Asm32 = "$vdst, $src0, $src1, $imm"; + field string AsmVOPDX = "$vdstX, $src0X, $vsrc1X, $imm"; + let AsmVOPDXDeferred = "$vdstX, $src0X, $vsrc1X, $immDeferred"; + field string AsmVOPDY = "$vdstY, $src0Y, $vsrc1Y, $imm"; field bit HasExt = 0; let IsSingle = 1; } @@ -280,10 +343,17 @@ class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { def VOP_MADAK_F16 : VOP_MADAK ; def VOP_MADAK_F32 : VOP_MADAK ; -class VOP_MADMK : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MADMK : VOP_MADK_Base { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1); + field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$imm, VGPR_32:$vsrc1X); + let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$immDeferred, VGPR_32:$vsrc1X); + field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, ImmOpType:$imm, VGPR_32:$vsrc1Y); + field string Asm32 = "$vdst, $src0, $imm, $src1"; + field string AsmVOPDX = "$vdstX, $src0X, $imm, $vsrc1X"; + let AsmVOPDXDeferred = "$vdstX, $src0X, $immDeferred, $vsrc1X"; + field string AsmVOPDY = "$vdstY, $src0Y, $imm, $vsrc1Y"; field bit HasExt = 0; let IsSingle = 1; } @@ -308,6 +378,10 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsVOP3Base = getIns64, 3, + 0, HasModifiers, HasModifiers, HasOMod, + Src0Mod, Src1Mod, Src2Mod>.ret; + let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument @@ -330,6 +404,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 0; let TieRegDPP = "$src2"; @@ -337,9 +412,9 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v def VOP_MAC_F16 : VOP_MAC ; def VOP_MAC_F32 : VOP_MAC ; -let HasExtDPP = 0 in +let HasExtDPP = 0, HasExt32BitDPP = 0 in def VOP_MAC_LEGACY_F32 : VOP_MAC ; -let HasExtSDWA = 0, HasExt64BitDPP = 1 in +let HasExtSDWA = 0, HasExt32BitDPP = 0, HasExt64BitDPP = 1 in def VOP_MAC_F64 : VOP_MAC ; class VOP_DOT_ACC : VOP_MAC { @@ -355,6 +430,7 @@ def 
VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC { } def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC { + let HasExtVOP3DPP = 0; let HasSrc0Mods = 1; let HasSrc1Mods = 1; } @@ -368,13 +444,27 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi"; let AsmDPP16 = AsmDPP#"$fi"; + let AsmVOP3DPPBase = Asm64; + let InsDPP = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; } // Write out to vcc or arbitrary SGPR and read in from vcc or // arbitrary SGPR. def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> { + let HasSrc2Mods = 0; let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -384,6 +474,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); + let AsmVOP3DPPBase = Asm64; + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -401,15 +494,20 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } // Read in from vcc or arbitrary SGPR. 
-def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> { +class VOP2e_SGPR ArgVT> : VOPProfile { let Asm32 = "$vdst, $src0, $src1"; let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2"; let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -417,6 +515,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/ let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi"; let AsmDPP16 = AsmDPP#"$fi"; + let AsmVOP3DPPBase = Asm64; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); @@ -437,14 +536,22 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } -def VOP_READLANE : VOPProfile<[i32, i32, i32]> { +def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; +def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; + +def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> { let Outs32 = (outs SReg_32:$vdst); let Outs64 = Outs32; let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1); @@ -454,6 +561,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -471,6 +579,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -480,31 +589,33 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; +let SubtargetPredicate = isGFX11Plus in +defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>; +defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">; let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { let isReMaterializable = 1 in { -defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>; -defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>; -defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; -defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>; -defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>; +defm V_ADD_F32 : VOP2Inst_VOPD <"v_add_f32", VOP_F32_F32_F32, 0x4, "v_add_f32", any_fadd>; +defm V_SUB_F32 : VOP2Inst_VOPD <"v_sub_f32", VOP_F32_F32_F32, 0x5, "v_sub_f32", any_fsub>; +defm V_SUBREV_F32 : VOP2Inst_VOPD <"v_subrev_f32", VOP_F32_F32_F32, 0x6, "v_subrev_f32", null_frag, "v_sub_f32">; +defm V_MUL_LEGACY_F32 : VOP2Inst_VOPD <"v_mul_legacy_f32", VOP_F32_F32_F32, 0x7, "v_mul_dx9_zero_f32", AMDGPUfmul_legacy>; +defm V_MUL_F32 : VOP2Inst_VOPD <"v_mul_f32", VOP_F32_F32_F32, 0x3, "v_mul_f32", any_fmul>; defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, 
AMDGPUmul_i24>; defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>; defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>; defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; -defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; -defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; +defm V_MIN_F32 : VOP2Inst_VOPD <"v_min_f32", VOP_F32_F32_F32, 0xb, "v_min_f32", fminnum_like>; +defm V_MAX_F32 : VOP2Inst_VOPD <"v_max_f32", VOP_F32_F32_F32, 0xa, "v_max_f32", fmaxnum_like>; defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN, smin>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN, smax>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN, umin>; defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN, umax>; defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">; defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">; -defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">; -defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN, and>; +defm V_LSHLREV_B32 : VOP2Inst_VOPD <"v_lshlrev_b32", VOP_I32_I32_I32, 0x11, "v_lshlrev_b32", clshl_rev_32, "v_lshl_b32">; +defm V_AND_B32 : VOP2Inst_VOPD <"v_and_b32", VOP_PAT_GEN, 0x12, "v_and_b32", and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN, xor>; } // End isReMaterializable = 1 @@ -536,7 +647,7 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { -defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>; +defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } @@ -555,20 +666,20 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, } // End isConvergent = 1 let isReMaterializable = 1 in { -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT, add_ctpop>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT, AMDGPUldexp>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT, AMDGPUpknorm_i16_f32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT, AMDGPUpknorm_u16_f32>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } -defm 
V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT, AMDGPUpk_u16_u32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT, AMDGPUpk_i16_i32>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_V2F16_F32_F32, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_V2I16_I32_I32, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_V2I16_I32_I32, AMDGPUpk_i16_i32>; let SubtargetPredicate = isGFX6GFX7 in { @@ -641,8 +752,9 @@ def : divergent_i64_BinOp ; def : divergent_i64_BinOp ; def : divergent_i64_BinOp ; -let SubtargetPredicate = Has16BitInsts in { +let SubtargetPredicate = Has16BitInsts in { +let isReMaterializable = 1 in { let FPDPRounding = 1 in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; @@ -664,9 +776,7 @@ def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; -defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; + defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; @@ -675,12 +785,19 @@ defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; -let Constraints = "$vdst = $src2", DisableEncoding="$src2", - isConvertibleToThreeAddress = 1 in { -defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +let SubtargetPredicate = isGFX8GFX9 in { + defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; + defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; + defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; } } // End isCommutable = 1 +} // End isReMaterializable = 1 +// FIXME: Missing FPDPRounding +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, isCommutable = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +} } // End SubtargetPredicate = Has16BitInsts let SubtargetPredicate = HasDLInsts in { @@ -722,7 +839,7 @@ let Constraints = "$vdst = $src2", DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in -defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; +defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; } // End SubtargetPredicate = HasDLInsts @@ -750,7 +867,7 @@ let Constraints = "$vdst = $src2", isCommutable = 1, IsDOT = 1 in { let SubtargetPredicate = HasDot5Insts in - defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + defm V_DOT2C_F32_F16 : VOP2Inst_VOPD<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16, 0xc, "v_dot2acc_f32_f16">; let SubtargetPredicate = HasDot6Insts in defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; @@ -788,20 +905,20 @@ let AddedComplexity = 30 in { } // End AddedComplexity = 30 let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in { -def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; +def V_FMAMK_F32 : 
VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">; let isCommutable = 1 in -def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">; } let SubtargetPredicate = isGFX10Plus in { -let FPDPRounding = 1 in { +let FPDPRounding = 1, isReMaterializable = 1 in { def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; let isCommutable = 1 in def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; -} // End FPDPRounding = 1 +} // End FPDPRounding = 1, isReMaterializable = 1 let Constraints = "$vdst = $src2", DisableEncoding="$src2", @@ -857,7 +974,7 @@ def : GCNPat < >; } -let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX8GFX9] in { // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. @@ -867,9 +984,6 @@ def : GCNPat< (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) >; - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { - def : GCNPat< (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))), (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) @@ -885,7 +999,10 @@ defm : Arithmetic_i16_0Hi_Pats; defm : Arithmetic_i16_0Hi_Pats; defm : Arithmetic_i16_0Hi_Pats; defm : Arithmetic_i16_0Hi_Pats; -} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + +} // End Predicates = [Has16BitInsts, isGFX8GFX9] + +let Predicates = [Has16BitInsts] in { def : ZExt_i16_i1_Pat; def : ZExt_i16_i1_Pat; @@ -917,8 +1034,16 @@ def : VOPBinOpClampPat; def : VOPBinOpClampPat; } +let SubtargetPredicate = isGFX11Plus in { + let isCommutable = 1 in { + defm V_AND_B16 : VOP2Inst <"v_and_b16", VOP_I16_I16_I16, and>; + defm V_OR_B16 : VOP2Inst <"v_or_b16", VOP_I16_I16_I16, or>; + defm V_XOR_B16 : VOP2Inst <"v_xor_b16", VOP_I16_I16_I16, xor>; + } // End isCommutable = 1 +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// -// Target-specific instruction encodings. +// DPP Encodings //===----------------------------------------------------------------------===// class VOP2_DPP op, VOP2_DPP_Pseudo ps, @@ -947,10 +1072,10 @@ class Base_VOP2_DPP16 op, VOP2_DPP_Pseudo ps, let OtherPredicates = ps.OtherPredicates; } -class VOP2_DPP16 op, VOP2_DPP_Pseudo ps, +class VOP2_DPP16 op, VOP2_DPP_Pseudo ps, int subtarget, string opName = ps.OpName, VOPProfile p = ps.Pfl> : Base_VOP2_DPP16, - SIMCInstr ; + SIMCInstr ; class VOP2_DPP8 op, VOP2_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -972,11 +1097,254 @@ class VOP2_DPP8 op, VOP2_Pseudo ps, let OtherPredicates = ps.OtherPredicates; } +//===----------------------------------------------------------------------===// +// GFX11. 
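The VOPD_Component mixins attached to V_FMAMK_F32/V_FMAAK_F32 above register these FMAs as components of the GFX11 dual-issue VOPD encoding. For the mk/ak naming, a small sketch under the usual madmk/madak operand layout (illustrative helper names, not LLVM API): K is a 32-bit literal carried in the instruction stream, multiplied in by fmamk and added in by fmaak.

#include <cmath>

// v_fmamk_f32: d = s0 * K + s1, with K an inline 32-bit literal.
float fmamkF32(float s0, float k, float s1) { return std::fma(s0, k, s1); }

// v_fmaak_f32: d = s0 * s1 + K.
float fmaakF32(float s0, float s1, float k) { return std::fma(s0, s1, k); }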
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { + //===------------------------------- VOP2 -------------------------------===// + multiclass VOP2Only_Real_MADK_gfx11 op> { + def _gfx11 : + VOP2_Real(NAME), SIEncodingFamily.GFX11>, + VOP2_MADKe(NAME).Pfl>; + } + multiclass VOP2_Real_e32_gfx11 op> { + def _e32_gfx11 : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX11>, + VOP2e(NAME#"_e32").Pfl>; + } + multiclass VOP2Only_Real_e32_gfx11 op> { + let IsSingle = 1 in + defm NAME: VOP2_Real_e32_gfx11; + } + multiclass VOP2_Real_e64_gfx11 op> { + def _e64_gfx11 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX11>, + VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP2_Real_dpp_gfx11 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx11 : VOP2_DPP16(NAME#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP2_Real_dpp8_gfx11 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_gfx11 : VOP2_DPP8(NAME#"_e32")> { + let DecoderNamespace = "DPP8GFX11"; + } + } + + //===------------------------- VOP2 (with name) -------------------------===// + multiclass VOP2_Real_e32_with_name_gfx11 op, string opName, + string asmName, bit single = 0> { + defvar ps = !cast(opName#"_e32"); + def _e32_gfx11 : + VOP2_Real, + VOP2e, + MnemonicAlias, Requires<[isGFX11Plus]> { + let AsmString = asmName # ps.AsmOperands; + let IsSingle = single; + } + } + multiclass VOP2_Real_e64_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e64"); + def _e64_gfx11 : + VOP3_Real, + VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, ps.Pfl>, + MnemonicAlias, Requires<[isGFX11Plus]> { + let AsmString = asmName # ps.AsmOperands; + } + } + + multiclass VOP2_Real_dpp_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e32"); + foreach _ = BoolToList.ret in + def _dpp_gfx11 : VOP2_DPP16(opName#"_dpp"), + SIEncodingFamily.GFX11> { + let AsmString = asmName # ps.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP2_Real_dpp8_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e32"); + foreach _ = BoolToList.ret in + def _dpp8_gfx11 : VOP2_DPP8 { + let AsmString = asmName # ps.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11"; + } + } + + //===------------------------------ VOP2be ------------------------------===// + multiclass VOP2be_Real_e32_gfx11 op, string opName, string asmName> { + defvar ps = !cast(opName#"_e32"); + def _e32_gfx11 : + VOP2_Real, + VOP2e { + let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); + } + } + multiclass VOP2be_Real_dpp_gfx11 op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx11 : + VOP2_DPP16(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); + let DecoderNamespace = "DPPGFX11"; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w32_gfx11 : + Base_VOP2_DPP16(opName#"_dpp"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w64_gfx11 : + Base_VOP2_DPP16(opName#"_dpp"), 
asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp8_gfx11 op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_gfx11 : + VOP2_DPP8(opName#"_e32")> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + let DecoderNamespace = "DPP8GFX11"; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w32_gfx11 : + VOP2_DPP8(opName#"_e32")> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w64_gfx11 : + VOP2_DPP8(opName#"_e32")> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +// We don't want to override separate decoderNamespaces within these +multiclass VOP2_Realtriple_e64_gfx11 op> { + defm NAME : VOP3_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ; +} +multiclass VOP2_Realtriple_e64_with_name_gfx11 op, string opName, + string asmName> { + defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 0, 0, op{5-0}}, opName, asmName> ; +} + +multiclass VOP2be_Real_gfx11 op, string opName, string asmName> : + VOP2be_Real_e32_gfx11, + VOP3be_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>, + VOP2be_Real_dpp_gfx11, + VOP2be_Real_dpp8_gfx11; + +// Only for CNDMASK +multiclass VOP2e_Real_gfx11 op, string opName, string asmName> : + VOP2_Real_e32_gfx11, + VOP2_Realtriple_e64_gfx11, + VOP2be_Real_dpp_gfx11, + VOP2be_Real_dpp8_gfx11; + +multiclass VOP2Only_Real_gfx11 op> : + VOP2Only_Real_e32_gfx11, + VOP2_Real_dpp_gfx11, + VOP2_Real_dpp8_gfx11; + +multiclass VOP2_Real_NO_VOP3_gfx11 op> : + VOP2_Real_e32_gfx11, VOP2_Real_dpp_gfx11, VOP2_Real_dpp8_gfx11; + +multiclass VOP2_Real_FULL_gfx11 op> : + VOP2_Realtriple_e64_gfx11, VOP2_Real_NO_VOP3_gfx11; + +multiclass VOP2_Real_NO_VOP3_with_name_gfx11 op, string opName, + string asmName, bit isSingle = 0> : + VOP2_Real_e32_with_name_gfx11, + VOP2_Real_dpp_with_name_gfx11, + VOP2_Real_dpp8_with_name_gfx11; + +multiclass VOP2_Real_FULL_with_name_gfx11 op, string opName, + string asmName> : + VOP2_Realtriple_e64_with_name_gfx11, + VOP2_Real_NO_VOP3_with_name_gfx11; + +multiclass VOP2_Real_NO_DPP_gfx11 op> : + VOP2_Real_e32_gfx11, VOP2_Real_e64_gfx11; + +multiclass VOP2_Real_NO_DPP_with_name_gfx11 op, string opName, + string asmName> : + VOP2_Real_e32_with_name_gfx11, + VOP2_Real_e64_with_name_gfx11; + +defm V_CNDMASK_B32 : VOP2e_Real_gfx11<0x001, "V_CNDMASK_B32", + "v_cndmask_b32">; +defm V_DOT2ACC_F32_F16 : VOP2_Real_NO_VOP3_with_name_gfx11<0x002, + "V_DOT2C_F32_F16", "v_dot2acc_f32_f16", 1>; +defm V_FMAC_DX9_ZERO_F32 : VOP2_Real_NO_DPP_with_name_gfx11<0x006, + "V_FMAC_LEGACY_F32", "v_fmac_dx9_zero_f32">; +defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11<0x007, + "V_MUL_LEGACY_F32", "v_mul_dx9_zero_f32">; +defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11<0x018>; +defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11<0x019>; +defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11<0x01a>; +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx11<0x020, "V_ADDC_U32", "v_add_co_ci_u32">; +defm 
V_SUB_CO_CI_U32 : + VOP2be_Real_gfx11<0x021, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx11<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11<0x02f, + "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">; +defm V_PK_FMAC_F16 : VOP2Only_Real_gfx11<0x03c>; + +// VOP3 only. +defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>; +defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>; +defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>; +defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>; +defm V_CVT_PKNORM_I16_F32 : VOP3Only_Realtriple_gfx11<0x321>; +defm V_CVT_PKNORM_U16_F32 : VOP3Only_Realtriple_gfx11<0x322>; +defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>; +defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>; +defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>; +defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>; +defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>; + +let SubtargetPredicate = isGFX11Plus in { + defm : VOP2eInstAliases; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx11, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx11, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx11, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { //===------------------------------- VOP2 -------------------------------===// multiclass VOP2Only_Real_MADK_gfx10 op> { def _gfx10 : @@ -1011,13 +1379,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2_Real_dpp_gfx10 op> { - foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP2_DPP16(NAME#"_dpp")> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass VOP2_Real_dpp8_gfx10 op> { - foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -1056,15 +1424,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name op, string opName, string asmName> { - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP2_DPP16(opName#"_dpp")> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16(opName#"_dpp"), SIEncodingFamily.GFX10> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16; } } multiclass VOP2_Real_dpp8_gfx10_with_name op, string opName, string asmName> { - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -1122,14 +1490,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = 
"GFX10" in { } } multiclass VOP2be_Real_dpp_gfx10 op, string opName, string asmName> { - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp_gfx10 : - VOP2_DPP16(opName#"_dpp"), asmName> { + VOP2_DPP16(opName#"_dpp"), SIEncodingFamily.GFX10, asmName> { string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp_w32_gfx10 : Base_VOP2_DPP16(opName#"_dpp"), asmName> { string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; @@ -1137,7 +1505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp_w64_gfx10 : Base_VOP2_DPP16(opName#"_dpp"), asmName> { string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; @@ -1147,14 +1515,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2be_Real_dpp8_gfx10 op, string opName, string asmName> { - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_w32_gfx10 : VOP2_DPP8(opName#"_e32")> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; @@ -1162,7 +1530,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_w64_gfx10 : VOP2_DPP8(opName#"_e32")> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; @@ -1189,7 +1557,10 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let IsSingle = 1; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" + +multiclass VOP2Only_Real_MADK_gfx10_gfx11 op> : + VOP2Only_Real_MADK_gfx10, VOP2Only_Real_MADK_gfx11; multiclass VOP2be_Real_gfx10 op, string opName, string asmName> : VOP2be_Real_e32_gfx10, @@ -1209,7 +1580,10 @@ multiclass VOP2_Real_gfx10 op> : VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, VOP2_Real_sdwa_gfx10, VOP2_Real_dpp_gfx10, VOP2_Real_dpp8_gfx10; -multiclass VOP2_Real_gfx10_with_name op, string opName, +multiclass VOP2_Real_gfx10_gfx11 op> : + VOP2_Real_gfx10, VOP2_Real_FULL_gfx11; + +multiclass VOP2_Real_with_name_gfx10 op, string opName, string asmName> : VOP2_Real_e32_gfx10_with_name, VOP2_Real_e64_gfx10_with_name, @@ -1217,36 +1591,41 @@ multiclass VOP2_Real_gfx10_with_name op, string opName, VOP2_Real_dpp_gfx10_with_name, VOP2_Real_dpp8_gfx10_with_name; +multiclass VOP2_Real_with_name_gfx10_gfx11 op, string opName, + string asmName> : + VOP2_Real_with_name_gfx10, + VOP2_Real_FULL_with_name_gfx11; + // NB: Same opcode as v_mac_legacy_f32 let DecoderNamespace = "GFX10_B" in defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>; -defm V_XNOR_B32 : 
VOP2_Real_gfx10<0x01e>; -defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; -defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; -defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>; -defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; -defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; -defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; -defm V_MUL_F16 : VOP2_Real_gfx10<0x035>; -defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>; -defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>; -defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; -defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; -defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; -defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; +defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02d>; +defm V_ADD_F16 : VOP2_Real_gfx10_gfx11<0x032>; +defm V_SUB_F16 : VOP2_Real_gfx10_gfx11<0x033>; +defm V_SUBREV_F16 : VOP2_Real_gfx10_gfx11<0x034>; +defm V_MUL_F16 : VOP2_Real_gfx10_gfx11<0x035>; +defm V_FMAC_F16 : VOP2_Real_gfx10_gfx11<0x036>; +defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x037>; +defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x038>; +defm V_MAX_F16 : VOP2_Real_gfx10_gfx11<0x039>; +defm V_MIN_F16 : VOP2_Real_gfx10_gfx11<0x03a>; +defm V_LDEXP_F16 : VOP2_Real_gfx10_gfx11<0x03b>; let IsSingle = 1 in { -defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; + defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; } // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : - VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x025, "V_ADD_U32", "v_add_nc_u32">; defm V_SUB_NC_U32 : - VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x026, "V_SUB_U32", "v_sub_nc_u32">; defm V_SUBREV_NC_U32 : - VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; // VOP2 carry-in, carry-out. defm V_ADD_CO_CI_U32 : @@ -1275,7 +1654,7 @@ defm V_ADD_CO_U32 : VOP3beOnly_Real_gfx10<0x30f>; defm V_SUB_CO_U32 : VOP3beOnly_Real_gfx10<0x310>; defm V_SUBREV_CO_U32 : VOP3beOnly_Real_gfx10<0x319>; -let SubtargetPredicate = isGFX10Plus in { +let SubtargetPredicate = isGFX10Only in { defm : VOP2eInstAliases; defm : VOP2bInstAliases< @@ -1284,10 +1663,10 @@ let SubtargetPredicate = isGFX10Plus in { V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">; defm : VOP2bInstAliases< V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">; -} // End SubtargetPredicate = isGFX10Plus +} // End SubtargetPredicate = isGFX10Only //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. 
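One recurring detail in the GFX10/GFX11 realization multiclasses earlier in this section: the "_e64" (VOP3-encoded) form of a VOP2 opcode is built from the bit list {0, 1, 0, 0, op{5-0}}, which relocates the 6-bit VOP2 opcode into the 10-bit VOP3 opcode space at 0x100. A sketch of the arithmetic, with an illustrative helper name:

#include <cstdint>

// {0, 1, 0, 0, op{5-0}} == 0b01'0000'0000 | op, so VOP2 opcodes reappear
// at 0x100..0x13f when promoted to the VOP3 ("_e64") encoding.
constexpr uint16_t promoteVOP2ToVOP3(uint16_t vop2Op) {
  return 0x100 | (vop2Op & 0x3f);
}

// Example: v_add_nc_u32 is VOP2 opcode 0x025 in the list above, 0x125 as VOP3.
static_assert(promoteVOP2ToVOP3(0x025) == 0x125, "promotion example");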
+// GFX6, GFX7, GFX10, GFX11 //===----------------------------------------------------------------------===// class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -1338,6 +1717,9 @@ multiclass VOP2_Real_gfx6_gfx7 op> : multiclass VOP2_Real_gfx6_gfx7_gfx10 op> : VOP2_Real_gfx6_gfx7, VOP2_Real_gfx10; +multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11 op> : + VOP2_Real_gfx6_gfx7_gfx10, VOP2_Real_FULL_gfx11; + multiclass VOP2be_Real_gfx6_gfx7 op> : VOP2_Real_e32_gfx6_gfx7, VOP2be_Real_e64_gfx6_gfx7; @@ -1398,28 +1780,28 @@ let SubtargetPredicate = isGFX6GFX7 in { def : VOP2e64InstAlias; } // End SubtargetPredicate = isGFX6GFX7 -defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>; -defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>; -defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>; defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; -defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>; -defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>; -defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>; -defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>; -defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>; -defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>; -defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>; -defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>; -defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>; -defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>; -defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>; +defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x008>; +defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x009>; +defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00f>; +defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x010>; +defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x011>; +defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x012>; +defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x013>; +defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x014>; defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>; defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>; defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>; -defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>; -defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>; -defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>; +defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; +defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; +defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>; defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>; defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>; @@ -1436,6 +1818,13 @@ multiclass VOP2_Real_MADK_vi op> { VOP2_MADKe(NAME).Pfl>; } +multiclass VOP2_Real_MADK_gfx940 op> { + def _gfx940 : VOP2_Real(NAME), SIEncodingFamily.GFX940>, + VOP2_MADKe(NAME).Pfl> { + let DecoderNamespace = "GFX9"; + } +} + multiclass VOP2_Real_e32_vi op> { def _e32_vi : VOP2_Real(NAME#"_e32"), SIEncodingFamily.VI>, @@ -1736,6 +2125,11 @@ let SubtargetPredicate = isGFX90APlus in { } } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = HasFmaakFmamkF32Insts in { +defm V_FMAMK_F32 : 
VOP2_Real_MADK_gfx940 <0x17>; +defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>; +} + multiclass VOP2_Real_DOT_ACC_gfx9 op> : VOP2_Real_e32_vi { def _dpp_vi : VOP2_DPP(NAME#"_dpp")>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 494e3aeb6d55..dddd0aacc140 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -6,191 +6,25 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// VOP3 Classes -//===----------------------------------------------------------------------===// - -class getVOP3ModPat { - dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); - - list ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT src0), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT src0), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT src0)))]; - - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3PModPat { - dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)); - dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)); - dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)); - dag clamp_dag = (i1 timm:$clamp); - - list ret3 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp.ret src0_dag, src1_dag, src2_dag, clamp_dag), - (DivergentFragOrOp.ret src0_dag, src1_dag, src2_dag)))]; - - list ret2 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp.ret src0_dag, src1_dag, clamp_dag), - (DivergentFragOrOp.ret src0_dag, src1_dag)))]; - - list ret1 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp.ret src0_dag, clamp_dag), - (DivergentFragOrOp.ret src0_dag)))]; - - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3OpSelPat { - list ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))]; - - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3OpSelModPat { - list ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers), - (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, 
i32:$src0_modifiers))), - (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))]; - - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3Pat { - list ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; - list ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret P.Src0VT:$src0, P.Src1VT:$src1))]; - list ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret P.Src0VT:$src0))]; - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3ClampPat { - list ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))]; - list ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))]; - list ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))]; - list ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3MAIPat { - list ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - timm:$cbsz, timm:$abid, timm:$blgp))]; -} - -// Consistently gives instructions a _e64 suffix. -multiclass VOP3Inst_Pseudo_Wrapper pattern = []> { - def _e64 : VOP3_Pseudo; -} - -class VOP3InstBase : - VOP3_Pseudo.ret, - getVOP3OpSelPat.ret), - !if(P.HasModifiers, - getVOP3ModPat.ret, - !if(P.HasIntClamp, - getVOP3ClampPat.ret, - !if (P.IsMAI, - getVOP3MAIPat.ret, - getVOP3Pat.ret)))), - 0, P.HasOpSel> { - - let IntClamp = P.HasIntClamp; - let AsmMatchConverter = - !if(P.HasOpSel, - "cvtVOP3OpSel", - !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp), - "cvtVOP3", - "")); -} - -multiclass VOP3Inst { - def _e64 : VOP3InstBase; -} - // Special case for v_div_fmas_{f32|f64}, since it seems to be the // only VOP instruction that implicitly reads VCC. 
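The comment above singles out v_div_fmas_{f32|f64} for its implicit VCC read, and the profiles that follow disable the DPP extensions for it. For context, a strongly hedged sketch of what that VCC bit does (our reading of the ISA manuals, not something stated in this patch): it requests that the FMA result be scaled by 2^32 (2^64 for f64) as the final step of the scaled division expansion.

#include <cmath>

// Hedged sketch: v_div_fmas_f32 as an FMA whose result is conditionally
// scaled by 2^32 depending on the implicitly-read VCC bit.
float divFmasF32(float s0, float s1, float s2, bool vcc) {
  float r = std::fma(s0, s1, s2);
  return vcc ? r * 0x1p32f : r;
}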
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { let Outs64 = (outs DstRC.RegClass:$vdst); + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { let Outs64 = (outs DstRC.RegClass:$vdst); } } -class VOP3Features<bit Clamp = 0, bit OpSel = 0, bit Packed = 0, bit MAI = 0> { - bit HasClamp = Clamp; - bit HasOpSel = OpSel; - bit IsPacked = Packed; - bit IsMAI = MAI; -} - -def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>; -def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; -def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; -def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; -def VOP3_MAI : VOP3Features<0, 0, 0, 1>; - -class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> { - - let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); - let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); - let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); - let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); - let IsSingle = 1; -} - class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; let IsSingle = 1; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; @@ -198,12 +32,22 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; - let IsSingle = 1; + let IsSingle = 1; let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P>

{ + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + +def DIV_FIXUP_F32_PROF : VOP3_Profile { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + //===----------------------------------------------------------------------===// // VOP3 INTERP //===----------------------------------------------------------------------===// @@ -304,10 +148,10 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_l } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteIntMul] in { -defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile, DivergentBinFrag>; -defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile, mulhu>; -defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile>; -defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile, mulhs>; +defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF, DivergentBinFrag>; +defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF, mulhu>; +defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF>; +defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF, mulhs>; } // End SchedRW = [WriteIntMul] } // End isReMaterializable = 1 @@ -367,7 +211,7 @@ let isCommutable = 1 in { } // End isCommutable = 1 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; -defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>; +defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; @@ -419,9 +263,9 @@ def : GCNPat< >; let isReMaterializable = 1 in { -let SubtargetPredicate = isGFX6GFX7GFX10 in { +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile>; -} // End SubtargetPredicate = isGFX6GFX7GFX10 +} // End SubtargetPredicate = isGFX6GFX7GFX10Plus let SchedRW = [Write32Bit] in { let SubtargetPredicate = isGFX8Plus in { @@ -430,21 +274,30 @@ defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile, AMD } // End SchedRW = [Write32Bit] } // End isReMaterializable = 1 -let SubtargetPredicate = isGFX7Plus in { +def VOPProfileMQSAD : VOP3_Profile { + let HasModifiers = 0; +} +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; -defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile>; +defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] +} // End SubtargetPredicate = isGFX7Plus let isCommutable = 1 in { let SchedRW = [WriteIntMul, WriteSALU] in { +let SubtargetPredicate = isGFX7GFX8GFX9GFX10 in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; +} +let SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" in { +defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32_gfx11", VOP3b_I64_I1_I32_I32_I64>; +defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32_gfx11", VOP3b_I64_I1_I32_I32_I64>; +} // End SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" } // End SchedRW = [WriteIntMul, WriteSALU] } // End isCommutable = 1 -} // End SubtargetPredicate = isGFX7Plus let FPDPRounding = 1 in { let Predicates = [Has16BitInsts, isGFX8Only] in { @@ -557,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9; } // End Predicates = [Has16BitInsts, 
isGFX10Plus] -class ThreeOpFrag : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be @@ -587,7 +440,9 @@ class ThreeOpFrag : PatFrag< return true; }]> { let PredicateCodeUsesOperands = 1; +} +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. @@ -609,6 +464,23 @@ class ThreeOpFrag : PatFrag< }]; } +def shl_0_to_4 : PatFrag< + (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), + [{ + if (auto *C = dyn_cast(N->getOperand(1))) { + return C->getZExtValue() <= 4; + } + return false; + }]> { + let GISelPredicateCode = [{ + int64_t Imm = 0; + if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) && + !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm)))) + return false; + return (uint64_t)Imm <= 4; + }]; +} + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile>; @@ -649,6 +521,10 @@ defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile>; } // End isReMaterializable = 1 +// V_LSHL_ADD_U64: D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64 +// src0 is shifted left by 0-4 (use “0” to get ADD_U64). +let SubtargetPredicate = isGFX940Plus in +defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile>; class ThreeOp_i32_Pats : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. @@ -664,6 +540,12 @@ def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; +let SubtargetPredicate = isGFX940Plus in +def : GCNPat< + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; + def : VOPBinOpClampPat; def : VOPBinOpClampPat; @@ -688,6 +570,33 @@ def : OpSelBinOpClampPat; def : OpSelBinOpClampPat; } // End SubtargetPredicate = isGFX9Plus +// FIXME: GlobalISel in general does not handle instructions with 2 results, +// so it cannot use these patterns. +multiclass IMAD32_Pats { + def : GCNPat < + (ThreeOpFrag i32:$src0, i32:$src1, i32:$src2), + (EXTRACT_SUBREG (inst $src0, $src1, + (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized + $src2, sub0, + (i32 (IMPLICIT_DEF)), sub1), + 0 /* clamp */), + sub0) + >; + // Immediate src2 in the pattern above will not fold because it would be partially + // undef. Hence define specialized pattern for this case. + // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts, + // make it SDAG only. 
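Before the specialized immediate-src2 pattern that follows: the IMAD32_Pats lowering is sound even though REG_SEQUENCE pairs $src2 with an IMPLICIT_DEF high half, because the low 32 bits of the 64-bit v_mad_u64_u32 result do not depend on the high half of the addend. A minimal sketch of that identity, with illustrative names:

#include <cstdint>

// lo32((a * b) + c64) depends only on lo32(c64): any value X in the high
// half contributes X * 2^32, which vanishes mod 2^32. This is why the
// pattern above may take sub0 of the wide mad as the 32-bit mul-add.
uint32_t madLo32(uint32_t a, uint32_t b, uint32_t c, uint32_t undefHi) {
  uint64_t addend = (uint64_t(undefHi) << 32) | c;
  uint64_t wide = uint64_t(a) * uint64_t(b) + addend; // v_mad_u64_u32
  return static_cast<uint32_t>(wide);                 // EXTRACT_SUBREG ... sub0
}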
+ def : GCNPat < + (ThreeOpFragSDAG i32:$src0, i32:$src1, (i32 imm:$src2)), + (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0) + >; +} + +let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow +defm : IMAD32_Pats; +let SubtargetPredicate = isGFX11Only in +defm : IMAD32_Pats; + def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { let Src0RC64 = VRegSrc_32; let Src1RC64 = SCSrc_b32; @@ -697,6 +606,8 @@ def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3 IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2, VGPR_32:$vdst_in, op_sel0:$op_sel); let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } class PermlanePat, V_PERMLANEX16_B32_e64>; + + defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; + defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; + + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + + // Undo sub x, c -> add x, -c canonicalization since c is more likely + // an inline immediate than -c. + def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; + } // End SubtargetPredicate = isGFX10Plus class DivFmasPat : GCNPat< @@ -773,6 +698,36 @@ def : DivFmasPat; def : DivFmasPat; } +class VOP3_DOT_Profile : VOP3_Profile { + // FIXME VOP3 DPP versions are unsupported + let HasExtVOP3DPP = 0; + let HasClamp = 0; + let HasOMod = 0; + let InsVOP3OpSel = getInsVOP3OpSel.ret, FPVRegInputMods, IntOpSelMods), + !if(isFloatType.ret, FPVRegInputMods, IntOpSelMods), + !if(isFloatType.ret, FPVRegInputMods, IntOpSelMods)>.ret; +} + +let SubtargetPredicate = isGFX11Plus in { + defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile>; + defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile>; + defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile>; + defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile>; + defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile>; + defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile>; + defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile>; + defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile>; + defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile>; + defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile>; +} // End SubtargetPredicate = isGFX11Plus + +let SubtargetPredicate = HasDot8Insts in { + defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile, int_amdgcn_fdot2_f16_f16>; + defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile, int_amdgcn_fdot2_bf16_bf16>; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -813,16 +768,137 @@ def : IntClampPat; def : IntClampPat; def : IntClampPat; - //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// GFX11. 
+//===----------------------------------------------------------------------===// + +defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; +defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>; +defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>; +defm V_CUBEID_F32 : VOP3_Realtriple_gfx11<0x20c>; +defm V_CUBESC_F32 : VOP3_Realtriple_gfx11<0x20d>; +defm V_CUBETC_F32 : VOP3_Realtriple_gfx11<0x20e>; +defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11<0x20f>; +defm V_BFE_U32 : VOP3_Realtriple_gfx11<0x210>; +defm V_BFE_I32 : VOP3_Realtriple_gfx11<0x211>; +defm V_BFI_B32 : VOP3_Realtriple_gfx11<0x212>; +defm V_FMA_F32 : VOP3_Realtriple_gfx11<0x213>; +defm V_FMA_F64 : VOP3_Real_Base_gfx11<0x214>; +defm V_LERP_U8 : VOP3_Realtriple_gfx11<0x215>; +defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11<0x216>; +defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11<0x217>; +defm V_MULLIT_F32 : VOP3_Realtriple_gfx11<0x218>; +defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>; +defm V_MIN3_I32 : VOP3_Realtriple_gfx11<0x21a>; +defm V_MIN3_U32 : VOP3_Realtriple_gfx11<0x21b>; +defm V_MAX3_F32 : VOP3_Realtriple_gfx11<0x21c>; +defm V_MAX3_I32 : VOP3_Realtriple_gfx11<0x21d>; +defm V_MAX3_U32 : VOP3_Realtriple_gfx11<0x21e>; +defm V_MED3_F32 : VOP3_Realtriple_gfx11<0x21f>; +defm V_MED3_I32 : VOP3_Realtriple_gfx11<0x220>; +defm V_MED3_U32 : VOP3_Realtriple_gfx11<0x221>; +defm V_SAD_U8 : VOP3_Realtriple_gfx11<0x222>; +defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11<0x223>; +defm V_SAD_U16 : VOP3_Realtriple_gfx11<0x224>; +defm V_SAD_U32 : VOP3_Realtriple_gfx11<0x225>; +defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11<0x226>; +defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11<0x227>; +defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11<0x228>; +defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11<0x237>; +defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11<0x238>; +defm V_MSAD_U8 : VOP3_Realtriple_gfx11<0x239>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23a>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23b>; +defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11<0x23d>; +defm V_XOR3_B32 : VOP3_Realtriple_gfx11<0x240>; +defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11<0x241, "V_MAD_U16_gfx9", "v_mad_u16">; +defm V_PERM_B32 : VOP3_Realtriple_gfx11<0x244>; +defm V_XAD_U32 : VOP3_Realtriple_gfx11<0x245>; +defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11<0x246>; +defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11<0x247>; +defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11<0x248, "V_FMA_F16_gfx9", "v_fma_f16">; +defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>; +defm V_MIN3_I16 : VOP3_Realtriple_gfx11<0x24a>; +defm V_MIN3_U16 : VOP3_Realtriple_gfx11<0x24b>; +defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>; +defm V_MAX3_I16 : VOP3_Realtriple_gfx11<0x24d>; +defm V_MAX3_U16 : VOP3_Realtriple_gfx11<0x24e>; +defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>; +defm V_MED3_I16 : VOP3_Realtriple_gfx11<0x250>; +defm V_MED3_U16 : VOP3_Realtriple_gfx11<0x251>; +defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11<0x253, "V_MAD_I16_gfx9", "v_mad_i16">; +defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD3_U32 : VOP3_Realtriple_gfx11<0x255>; +defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11<0x256>; +defm V_AND_OR_B32 : VOP3_Realtriple_gfx11<0x257>; +defm V_OR3_B32 : VOP3_Realtriple_gfx11<0x258>; +defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11<0x259>; +defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11<0x25a>; +defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11<0x25b>; +defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11<0x25c>; +defm V_MAXMIN_F32 : 
VOP3_Realtriple_gfx11<0x25e>; +defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; +defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; +defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; +defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>; +defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>; +defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>; +defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; +// FIXME VOP3 DPP Dot instructions are unsupported +defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>; +defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>; +defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; +defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; +defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>; +defm V_MUL_LO_U16 : VOP3Only_Realtriple_gfx11<0x305>; +defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11<0x306>; +defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11<0x307>; +defm V_MAX_U16 : VOP3Only_Realtriple_gfx11<0x309>; +defm V_MAX_I16 : VOP3Only_Realtriple_gfx11<0x30a>; +defm V_MIN_U16 : VOP3Only_Realtriple_gfx11<0x30b>; +defm V_MIN_I16 : VOP3Only_Realtriple_gfx11<0x30c>; +defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30d, "V_ADD_I16", "v_add_nc_i16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11<0x311>; +defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; +defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; +defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x325, "V_SUB_I32", "v_sub_nc_i32">; +defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x326, "V_ADD_I32", "v_add_nc_i32">; +defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>; +defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; +defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; +defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; +defm V_LDEXP_F64 : VOP3_Real_Base_gfx11<0x32b>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11<0x32e>; +defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11<0x32f>; +defm V_LSHLREV_B16 : VOP3Only_Realtriple_gfx11<0x338>; +defm V_LSHRREV_B16 : VOP3Only_Realtriple_gfx11<0x339>; +defm V_ASHRREV_I16 : VOP3Only_Realtriple_gfx11<0x33a>; +defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>; +defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11<0x33d>; +defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11<0x33e>; +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11<0x360>; // Pseudo in VOP2 +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11<0x361>; // Pseudo in VOP2 +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) +defm V_AND_B16 : VOP3Only_Realtriple_gfx11<0x362>; +defm V_OR_B16 : VOP3Only_Realtriple_gfx11<0x363>; +defm V_XOR_B16 : VOP3Only_Realtriple_gfx11<0x364>; + //===----------------------------------------------------------------------===// // GFX10. 
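Before the section returns to GFX10 below: the fused three-source min/max ops just realized (v_maxmin_*, v_minmax_*) read naturally as outer(inner(src0, src1), src2). A hedged scalar sketch of the assumed semantics, ignoring the hardware's IEEE min/max NaN rules:

#include <algorithm>

// Assumed semantics for the fused GFX11 three-source min/max ops:
// v_maxmin_f32: min(max(s0, s1), s2); v_minmax_f32: max(min(s0, s1), s2).
float maxminF32(float s0, float s1, float s2) {
  return std::min(std::max(s0, s1), s2);
}
float minmaxF32(float s0, float s1, float s2) {
  return std::max(std::min(s0, s1), s2);
}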
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOP3_Real_gfx10 op> { def _gfx10 : VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, @@ -867,7 +943,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # ps.AsmOperands; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; @@ -935,10 +1011,11 @@ defm V_MAD_I16 : defm V_DIV_FIXUP_F16 : VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>; +defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>; + // FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these // (they do not support SDWA or DPP). -defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">; -defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">; defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">; defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">; @@ -1273,3 +1350,5 @@ defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>; defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; + +defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 707475ceccee..59ce532af59b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -10,19 +10,33 @@ // VOP3P Classes //===----------------------------------------------------------------------===// +class VOP3P_Profile : VOP3_Profile { + let IsVOP3P = 1; + let HasExtVOP3DPP = HasDPP; + // We do not want to print src modifiers for vop3p because the bits are + // overloaded in meaning and the logic in printOperandAndFPInputMods is + // wrong for vop3p + let AsmVOP3DPPBase = AsmVOP3P; +} + // Used for FMA_MIX* and MAD_MIX* insts // Their operands are only sort of f16 operands. Depending on // op_sel_hi, these may be interpreted as f32. The inline immediate // values are really f16 converted to f32, so we treat these as f16 // operands. class VOP3P_Mix_Profile : VOP3_Profile { + bit useTiedOutput = 0> : VOP3P_Profile { bit UseTiedOutput = useTiedOutput; dag srcs = (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + dag dpp_srcs = + (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, + FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); // FIXME: clampmod0 misbehaves with the non-default vdst_in // following it. 
For now workaround this by requiring clamp @@ -35,19 +49,27 @@ class VOP3P_Mix_Profile { + SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo.ret, + getVOP3PModPat.ret, getVOP3Pat.ret)>; + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName #"_dpp"; + } + } // end SubtargetPredicate = isGFX11Plus } - // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. multiclass VOP3_VOP3PInst { @@ -55,37 +77,47 @@ multiclass VOP3_VOP3PInst { let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); } + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName#"_dpp"; + let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); + } + } // end SubtargetPredicate = isGFX11Plus } +let isReMaterializable = 1 in { let isCommutable = 1 in { -defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; -defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; +defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile>; let FPDPRounding = 1 in { -defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, any_fma>; -defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, any_fadd>; -defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, any_fmul>; +defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile, any_fma>; +defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile, any_fadd>; +defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile, any_fmul>; } // End FPDPRounding = 1 -defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; -defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; +defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile, fmaxnum_like>; +defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile, fminnum_like>; -defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; -defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; -defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; +defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile, add>; +defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile>; +defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile, mul>; -defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; -defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile, umin>; -defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile, smax>; -defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; +defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile, smin>; +defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile, umin>; +defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile, smax>; +defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile, umax>; } -defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; -defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; - -defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, clshl_rev_16>; -defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, cashr_rev_16>; -defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, clshr_rev_16>; +defm V_PK_SUB_U16 : 
VOP3PInst<"v_pk_sub_u16", VOP3P_Profile>; +defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile, sub>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile, clshl_rev_16>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile, cashr_rev_16>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile, clshr_rev_16>; +} // End isReMaterializable = 1 let SubtargetPredicate = HasVOP3PInsts in { @@ -178,6 +210,7 @@ let SubtargetPredicate = HasMadMixInsts in { // Size of src arguments (16/32) is controlled by op_sel. // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. let isCommutable = 1, mayRaiseFPException = 0 in { +let isReMaterializable = 1 in defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile>; let FPDPRounding = 1 in { @@ -197,6 +230,8 @@ defm : MadFmaMixPats; // Essentially the same as the mad_mix versions let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { + +let isReMaterializable = 1 in defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile>; let FPDPRounding = 1 in { @@ -297,34 +332,63 @@ let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", - VOP3_Profile, int_amdgcn_sdot2, 1>; + VOP3P_Profile, int_amdgcn_sdot2, 1>; defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", - VOP3_Profile, int_amdgcn_udot2, 1>; + VOP3P_Profile, int_amdgcn_udot2, 1>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot7Insts in { defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3_Profile, + VOP3P_Profile, AMDGPUfdot2, 1/*ExplicitClamp*/>; defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3_Profile, int_amdgcn_udot4, 1>; + VOP3P_Profile, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", - VOP3_Profile, int_amdgcn_udot8, 1>; + VOP3P_Profile, int_amdgcn_udot8, 1>; } // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3_Profile, int_amdgcn_sdot4, 1>; + VOP3P_Profile, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", - VOP3_Profile, int_amdgcn_sdot8, 1>; + VOP3P_Profile, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts + +let SubtargetPredicate = HasDot8Insts in { + +defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", + VOP3P_Profile, + int_amdgcn_fdot2_f32_bf16, 1>; + +} // End SubtargetPredicate = HasDot8Insts + } // End let IsDOT = 1 +multiclass VOP3PDOTIUInst { + let IsDOT = 1 in + defm NAME : VOP3PInst, + null_frag, 1>; + // Dot-iu instructions consider input as signed if imod neg bits are set. Thus + // Dot-iu Intrinsics have extra operands and require separate codegen pattern. 
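(Aside: a minimal C++ model of the mixed-signedness dot product behind v_dot4_i32_iu8 may make the neg-bit convention above concrete. This is an illustrative sketch only — the function name is invented and clamp handling is omitted; the two bool flags stand in for the per-source imod neg bits.)

#include <cstdint>

// Accumulating 4x8-bit dot product; each source is read as signed
// bytes only when its "neg" modifier bit is set, unsigned otherwise.
int32_t Dot4IU8(bool Src0Signed, uint32_t Src0,
                bool Src1Signed, uint32_t Src1, int32_t Src2) {
  int64_t Sum = Src2;
  for (int Lane = 0; Lane < 4; ++Lane) {
    uint8_t A = (Src0 >> (8 * Lane)) & 0xff;
    uint8_t B = (Src1 >> (8 * Lane)) & 0xff;
    int64_t AV = Src0Signed ? (int64_t)(int8_t)A : (int64_t)A;
    int64_t BV = Src1Signed ? (int64_t)(int8_t)B : (int64_t)B;
    Sum += AV * BV;
  }
  return (int32_t)Sum; // clamp handling intentionally omitted
}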
+ def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0, + (DotIUVOP3PMods i32:$src1_mods), i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (!cast(NAME) $src0_mods, i32:$src0, + $src1_mods, i32:$src1, + (i32 8), i32:$src2, i1:$clamp) + >; +} + +let SubtargetPredicate = HasDot8Insts in { +defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>; +defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>; +} // End SubtargetPredicate = HasDot8Insts + def : UDot2Pat; def : SDot2Pat; @@ -365,18 +429,18 @@ def VDst_256 : VOPDstOperand; def VDst_512 : VOPDstOperand; def VDst_1024 : VOPDstOperand; -def VOPProfileAccRead : VOP3_Profile { +def VOPProfileAccRead : VOP3P_Profile { let Src0RC64 = ARegSrc_32; } -def VOPProfileAccWrite : VOP3_Profile { +def VOPProfileAccWrite : VOP3P_Profile { let DstRC = ADst_32; - let Src0RC64 = VISrc_b32; + let Src0RC64 = VCSrc_b32; } class VOPProfileMAI - : VOP3_Profile { + : VOP3P_Profile { let DstRC = _DstRC; let Src0RC64 = SrcABRC; let Src1RC64 = SrcABRC; @@ -387,15 +451,27 @@ class VOPProfileMAI + : VOPProfileMAI { + let Src1RC64 = _SrcBRC; + let Src2VT = DstVT; + let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid"; + let Outs64 = (outs DstRC:$vdst); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, cbsz:$cbsz, abid:$abid, Src2RC64:$src2); +} + def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI; @@ -413,6 +489,10 @@ def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI; def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI; def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI; def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI; @@ -431,12 +511,37 @@ def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI; def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI; + +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC; class MFMATable { bit IsMac = is_mac; string FMAOp = Name; } +class MAIFrag : PatFrag < + (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$abid, node:$blgp), + (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), + pred +>; + +let GISelPredicateCode = [{ return MF.getInfo()->mayNeedAGPRs(); }] in +class AgprMAIFrag : + MAIFraggetInfo()->mayNeedAGPRs(); }]>; + +let GISelPredicateCode = [{ return !MF.getInfo()->mayNeedAGPRs(); }] in +class VgprMAIFrag : + MAIFraggetInfo()->mayNeedAGPRs(); }]>; + let Predicates = [HasMAIInsts] in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -446,47 +551,62 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 +class MAIInst + : VOP3InstBase { + Instruction Opcode = !cast(NAME); + bit is_dgemm = 0; + bit 
is_gfx940_xdl = 0; +} + multiclass MAIInst("VOPProfileMAI_" # P).NoDstOverlap> { let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { - defm "" : VOP3Inst("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>, - MFMATable<0, NAME # "_e64">; + def _e64 : MAIInst("VOPProfileMAI_" # P), + !if(NoDstOverlap, null_frag, AgprMAIFrag)>, + MFMATable<0, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in - defm _vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>, - MFMATable<0, NAME # "_vgprcd_e64">; + def _vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), + !if(NoDstOverlap, null_frag, VgprMAIFrag)>, + MFMATable<0, NAME # "_vgprcd_e64">; } foreach _ = BoolToList.ret in { let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - defm "_mac" : VOP3Inst("VOPProfileMAI_" # P), node>, - MFMATable<1, NAME # "_e64">; + def "_mac_e64" : MAIInst("VOPProfileMAI_" # P), AgprMAIFrag>, + MFMATable<1, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus in - defm _mac_vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>, - MFMATable<1, NAME # "_vgprcd_e64">; + def _mac_vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), + VgprMAIFrag>, + MFMATable<1, NAME # "_vgprcd_e64">; } } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; -defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; -defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; +defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; +defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; + +let is_gfx940_xdl = 1 in { +defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; +defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>; defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>; defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>; -defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; -defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>; defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>; defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>; +} + +let Predicates = [isGFX908orGFX90A] in { defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>; defm 
V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>; defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>; @@ -494,34 +614,314 @@ defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>; defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>; defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>; +} } // End SubtargetPredicate = HasMAIInsts let Predicates = [isGFX90APlus] in { + let is_gfx940_xdl = 1 in { defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>; defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>; defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>; defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>; + } + let is_dgemm = 1 in { defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>; defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>; + } } // End Predicates = [isGFX90APlus] -let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { - defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; - defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; - defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile, any_fadd>; - defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile>; +let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in { + defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>; + defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>; + defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>; + defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>; +} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1 + +multiclass SMFMACInst { + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", + isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in { + def _e64 : MAIInst("VOPProfileSMFMAC_" # P), node>; + } +} + +let SubtargetPredicate = isGFX940Plus in { +defm V_SMFMAC_F32_16X16X32_F16 : SMFMACInst<"v_smfmac_f32_16x16x32_f16", "F32_16X16X32_F16", int_amdgcn_smfmac_f32_16x16x32_f16>; +defm V_SMFMAC_F32_32X32X16_F16 : SMFMACInst<"v_smfmac_f32_32x32x16_f16", "F32_32X32X16_F16", int_amdgcn_smfmac_f32_32x32x16_f16>; +defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", "F32_16X16X32_I16", int_amdgcn_smfmac_f32_16x16x32_bf16>; +defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>; +defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>; +defm 
V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>;
+}
+
+def MAIInstInfoTable : GenericTable {
+  let FilterClass = "MAIInst";
+  let CppTypeName = "MAIInstInfo";
+  let Fields = [
+    "Opcode", "is_dgemm", "is_gfx940_xdl"
+  ];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getMAIInstInfoHelper";
+}
+
+let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in {
+  defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+  defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+  defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+  defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
 } // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1

 def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
 def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

+class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64,
+                     bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P>
{ + let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128); + let Src0RC64 = _Src01RC64; + let Src1RC64 = _Src01RC64; + let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32); + let HasClamp = _HasClamp; + let HasOpSel = _HasOpSel; + let IsPacked = 1; + let IsWMMA = 1; +} + +def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>; +def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>; +def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>; +def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>; +def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>; +def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>; + +def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>; +def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>; +def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>; +def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>; +def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>; +def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>; + + +class WMMAType val> { + bit hasClamp = val{0}; + bit hasOpsel = val{1}; +} + +def WMMARegular : WMMAType<0b00>; +def WMMAUIClamp : WMMAType<0b01>; +def WMMAOpSel : WMMAType<0b10>; + +class WMMARegularPat : + GCNPat < (P.DstVT (node + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, $src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAOpSelPat : + GCNPat < (P.DstVT (node + (P.Src0VT P.Src0VT:$src0), + (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers) + )), + (P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAUIClampPat : + GCNPat < (P.DstVT (node + (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), + (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) +>; + +class WMMAOpcodeMapping { + Instruction Opcode2Addr = TwoAddr; + Instruction Opcode3Addr = ThreeAddr; + Predicate WaveSizePredicate; +} + +def WMMAOpcode : GenericEnum { + let FilterClass = "VOP3P_Pseudo"; +} + +class WMMAMappingTable : GenericTable { + let FilterClass = "WMMAOpcodeMapping"; + let CppTypeName = "WMMAOpcodeMappingInfo"; + let Fields = ["Opcode2Addr", "Opcode3Addr"]; + string TypeOf_Opcode2Addr = "WMMAOpcode"; + string TypeOf_Opcode3Addr = "WMMAOpcode"; +} + +def WMMAOpcode2AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode2Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode"; +} + +def WMMAOpcode3AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode3Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode"; +} + +// The WMMA instruction has extra constraints: +// Matrices A and B cannot overlap with D. C cannot partially overlap with D, +// but it is OK for them to be the same (which is a typical case). 
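+// For example, the usual accumulation form D = A*B + D, where D and C are
+// the same registers, is legal; a D that overlapped A, B, or only part of C
+// would not be.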
+// +// We implement it as follows: +// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2). +// 2) The pass twoaddressinstruction checks if src2 is live and if that is the case +// it converts the default pseudo to the pseudo where src2 is not the same as vdst. +// 3) @earlyclobber on the destination satisfies the constraint during RA. + +multiclass WMMAInst { + + defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2"; + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + + defvar WMMAProfile = VOPProfileWMMA; + if !eq(Suffix, "_w32") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w32 : VOP3P_Pseudo; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w32 : VOP3P_Pseudo; + } + } + def : WMMAOpcodeMapping(NAME # _twoaddr_w32), + !cast(NAME # _threeaddr_w32)>; + } else if !eq(Suffix, "_w64") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w64 : VOP3P_Pseudo; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w64 : VOP3P_Pseudo; + } + } + def : WMMAOpcodeMapping(NAME # _twoaddr_w64), + !cast(NAME # _threeaddr_w64)>; + } + + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } +} + + +let WaveSizePredicate = isWave32 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; +} + +let WaveSizePredicate = isWave64 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", 
"v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; + +} + //===----------------------------------------------------------------------===// // Begin Real Encodings //===----------------------------------------------------------------------===// +class VOP3P_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOP3P_DPP, SIMCInstr { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + +class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3P_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; +} + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, + DecoderNamespace = "GFX11" in { + + multiclass VOP3P_Real_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def _gfx11 : VOP3P_Real(backing_ps_name), + SIEncodingFamily.GFX11, asmName>, + VOP3Pe_gfx11(backing_ps_name).Pfl>; + } + + multiclass VOP3P_Real_dpp_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + defvar ps = !cast(backing_ps_name); + def _dpp_gfx11 + : VOP3P_DPP16(backing_ps_name #"_dpp"), + SIEncodingFamily.GFX11> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11"; + } + } + + multiclass VOP3P_Real_dpp8_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + defvar ps = !cast(backing_ps_name); + def _dpp8_gfx11 : VOP3P_DPP8_Base { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11"; + } + } + + multiclass VOP3P_Realtriple_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> + : VOP3P_Real_gfx11, + VOP3P_Real_dpp_gfx11, + VOP3P_Real_dpp8_gfx11; +} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" + +defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>; +defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>; +defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>; + +multiclass VOP3P_Real_WMMA op> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in { + defm _twoaddr_w32 : VOP3P_Real_gfx11 ; + } + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in { + defm _twoaddr_w64 : VOP3P_Real_gfx11 ; + } +} + +defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>; +defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>; +defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>; +defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>; +defm V_WMMA_I32_16X16X16_IU8 : VOP3P_Real_WMMA <0x044>; +defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>; + //===----------------------------------------------------------------------===// // GFX8 (VI) //===----------------------------------------------------------------------===// @@ -557,15 +957,64 @@ multiclass VOP3P_Real_MFMA_gfx90a op> { VOP3Pe_MAI (NAME # "_vgprcd" # "_e64").Pfl, 0>; } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" } 
+} -multiclass VOP3P_Real_MFMA op> : - VOP3P_Real_MFMA_gfx90a { +multiclass VOP3P_Real_MFMA_gfx940_aliases(Op # "_e64"), + VOP3_Pseudo PS_VCD = !cast(Op # "_vgprcd" # "_e64"), + VOPProfile Pfl_ACD = PS_ACD.Pfl, + VOPProfile Pfl_VCD = PS_VCD.Pfl> { + let Predicates = [isGFX940Plus] in { + foreach _ = BoolToList.ret in { + def : InstAlias (Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst, + Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + def : InstAlias (Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst, + Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + } + } // End Predicates = [isGFX940Plus] +} + +multiclass VOP3P_Real_MFMA_gfx940 op, string Name = !cast(NAME#"_e64").Mnemonic, + VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), + VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { + let SubtargetPredicate = isGFX940Plus, + AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9", + AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { + def _gfx940_acd : VOP3P_Real, + VOP3Pe_MAI ; + + def _gfx940_vcd : VOP3P_Real, + VOP3Pe_MAI ; + } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" + + defm : VOP3P_Real_MFMA_gfx940_aliases; + + foreach _ = BoolToList.ret in + defm : VOP3P_Real_MFMA_gfx940_aliases; +} + +multiclass VOP3P_Real_MFMA op, string GFX940Name = !cast(NAME#"_e64").Mnemonic> : + VOP3P_Real_MFMA_gfx90a , + VOP3P_Real_MFMA_gfx940 { def _vi : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_MAI (NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; + let Constraints = ""; } } + +multiclass VOP3P_Real_SMFMAC op, string alias> { + def _gfx940 : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3Pe_SMFMAC { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX8"; + } + def : MnemonicAlias(NAME#"_e64").Mnemonic>; } defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; @@ -634,19 +1083,21 @@ let SubtargetPredicate = HasMAIInsts in { defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>; defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>; -defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40>; -defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41>; -defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42>; -defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44>; -defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45>; -defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48>; -defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49>; -defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a>; -defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c>; -defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d>; -defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50>; -defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51>; -defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52>; +defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40, "v_mfma_f32_32x32x1_2b_f32">; +defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41, "v_mfma_f32_16x16x1_4b_f32">; +defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42, "v_mfma_f32_4x4x1_16b_f32">; +defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44, "v_mfma_f32_32x32x2_f32">; +defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45, "v_mfma_f32_16x16x4_f32">; +defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48, "v_mfma_f32_32x32x4_2b_f16">; +defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49, "v_mfma_f32_16x16x4_4b_f16">; +defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a, "v_mfma_f32_4x4x4_16b_f16">; +defm 
V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c, "v_mfma_f32_32x32x8_f16">; +defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d, "v_mfma_f32_16x16x16_f16">; +defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8">; +defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">; +defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">; + +let SubtargetPredicate = isGFX908orGFX90A in { defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>; defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>; defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>; @@ -654,6 +1105,7 @@ defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>; defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>; defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>; defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>; +} } // End SubtargetPredicate = HasMAIInsts @@ -665,6 +1117,27 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>; defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; +defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; +defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; +defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">; +defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">; + +defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">; +defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">; +defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">; +defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx940 <0x60, "v_mfma_f32_32x32x8_bf16">; +defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx940 <0x61, "v_mfma_f32_16x16x16_bf16">; + +defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx940 <0x6e, "v_mfma_f64_16x16x4_f64">; +defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx940 <0x6f, "v_mfma_f64_4x4x4_4b_f64">; + +defm V_SMFMAC_F32_16X16X32_F16 : VOP3P_Real_SMFMAC <0x62, "v_smfmac_f32_16x16x32f16">; +defm V_SMFMAC_F32_32X32X16_F16 : VOP3P_Real_SMFMAC <0x64, "v_smfmac_f32_32x32x16f16">; +defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x16x32bf16">; +defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">; +defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">; +defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">; + let SubtargetPredicate = HasPackedFP32Ops in { defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; @@ -676,35 +1149,41 @@ let SubtargetPredicate = HasPackedFP32Ops in { // GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { multiclass VOP3P_Real_gfx10 op> { def _gfx10 : VOP3P_Real(NAME), SIEncodingFamily.GFX10>, VOP3Pe_gfx10 (NAME).Pfl>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 - -defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>; -defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>; -defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>; -defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>; +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 + +multiclass VOP3P_Real_gfx10_gfx11 op> + : VOP3P_Real_gfx10, VOP3P_Real_gfx11; + +multiclass VOP3P_Real_gfx10_gfx11_Triple op> + : VOP3P_Real_gfx10, VOP3P_Realtriple_gfx11; + +defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>; +defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>; +defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>; let SubtargetPredicate = HasDot2Insts in { @@ -715,9 +1194,9 @@ defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; let SubtargetPredicate = HasDot7Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>; } // End SubtargetPredicate = HasDot7Insts diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index c0cc91029d11..eb6c54a45263 100644 --- 
a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -49,12 +49,36 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P>

{ // an explicit $dst. class VOPC_Profile sched, ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + // We want to exclude instructions with 64bit operands + let HasExtDPP = getHasVOP3DPP.ret; let Asm32 = "$src0, $src1"; + + let AsmDPP = !if (HasModifiers, + "$src0_modifiers, $src1_modifiers " + "$dpp_ctrl$row_mask$bank_mask$bound_ctrl", + "$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"); + let AsmDPP8 = "$src0, $src1 $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; + let InsDPP = getInsDPP, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + let InsDPP16 = getInsDPP16, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + let InsDPP8 = getInsDPP8, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + // The destination for 32-bit encoding is implicit. let HasDst32 = 0; // VOPC disallows dst_sel and dst_unused as they have no effect on destination let EmitDstSel = 0; let Outs64 = (outs VOPDstS64orS32:$sdst); + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; + let InsVOP3DPP = getInsVOP3DPP.ret; + let InsVOP3DPP16 = getInsVOP3DPP16.ret; + let InsVOP3DPP8 = getInsVOP3DPP8.ret; list Schedule = sched; } @@ -62,12 +86,15 @@ class VOPC_NoSdst_Profile sched, ValueType vt0, ValueType vt1 = vt0> : VOPC_Profile { let Outs64 = (outs ); + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; let OutsSDWA = (outs ); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm64 = !if(isFloatType.ret, "$src0_modifiers, $src1_modifiers$clamp", "$src0, $src1"); + let AsmVOP3DPPBase = Asm64; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -100,8 +127,8 @@ class VOPC_Pseudo pattern=[], VOPProfile Pfl = P; } -class VOPC_Real : - InstSI , +class VOPC_Real : + InstSI , SIMCInstr { let VALU = 1; @@ -133,8 +160,9 @@ class VOPC_SDWA_Pseudo pattern=[]> : // This class is used only with VOPC instructions. 
Use $sdst for out operand class VOPCInstAlias : - InstAlias , PredicateControl { + string Asm32 = ps.Pfl.Asm32, string real_name = ps.OpName, + VOPProfile p = ps.Pfl> : + InstAlias , PredicateControl { field bit isCompare; field bit isCommutable; @@ -167,27 +195,32 @@ class VOPCInstAlias { - def : VOPCInstAlias (OpName#"_e64"), - !cast(OpName#"_e32_"#Arch)>; +multiclass VOPCInstAliases { + def : VOPCInstAlias (old_name#"_e64"), + !cast(real_name#"_e32_"#Arch), + !cast(old_name#"_e64").Pfl.Asm32, + real_name>; let WaveSizePredicate = isWave32 in { - def : VOPCInstAlias (OpName#"_e64"), - !cast(OpName#"_e32_"#Arch), - "vcc_lo, "#!cast(OpName#"_e64").Pfl.Asm32>; + def : VOPCInstAlias (old_name#"_e64"), + !cast(real_name#"_e32_"#Arch), + "vcc_lo, "#!cast(old_name#"_e64").Pfl.Asm32, + real_name>; } let WaveSizePredicate = isWave64 in { - def : VOPCInstAlias (OpName#"_e64"), - !cast(OpName#"_e32_"#Arch), - "vcc, "#!cast(OpName#"_e64").Pfl.Asm32>; + def : VOPCInstAlias (old_name#"_e64"), + !cast(real_name#"_e32_"#Arch), + "vcc, "#!cast(old_name#"_e64").Pfl.Asm32, + real_name>; } } -multiclass VOPCXInstAliases { - def : VOPCInstAlias (OpName#"_e64"), - !cast(OpName#"_e32_"#Arch)>; +multiclass VOPCXInstAliases { + def : VOPCInstAlias (old_name#"_e64"), + !cast(real_name#"_e32_"#Arch), + !cast(old_name#"_e64").Pfl.Asm32, + real_name>; } - class getVOPCPat64 : LetDummies { list ret = !if(P.HasModifiers, [(set i1:$sdst, @@ -205,6 +238,11 @@ class VCMPXNoSDstTable { string NoSDstOp = Name; } +class VCMPVCMPXTable { + bit IsVCMPX = 0; + string VCMPOp = Name; +} + multiclass VOPC_Pseudos , Commutable_REV, - VCMPXNoSDstTable<1, opName#"_e32"> { + VCMPXNoSDstTable<1, opName#"_e32">, + VCMPVCMPXTable { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; let isConvergent = DefExec; @@ -223,7 +262,8 @@ multiclass VOPC_Pseudos .ret>, Commutable_REV, - VCMPXNoSDstTable<1, opName#"_e64"> { + VCMPXNoSDstTable<1, opName#"_e64">, + VCMPVCMPXTable { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; let isCompare = 1; @@ -237,6 +277,26 @@ multiclass VOPC_Pseudos { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = P.Schedule; + let isCompare = 1; + let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", ""); + } + } // end SubtargetPredicate = isGFX11Plus + } let SubtargetPredicate = HasSdstCMPX in { @@ -248,23 +308,27 @@ multiclass VOPCX_Pseudos , Commutable_REV, - VCMPXNoSDstTable<0, opName#"_e32"> { + VCMPXNoSDstTable<0, opName#"_e32">, + VCMPVCMPXTable { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; let isConvergent = 1; let isCompare = 1; let isCommutable = 1; let SubtargetPredicate = HasNoSdstCMPX; + let IsVCMPX = 1; } def _nosdst_e64 : VOP3_Pseudo, Commutable_REV, - VCMPXNoSDstTable<0, opName#"_e64"> { + VCMPXNoSDstTable<0, opName#"_e64">, + VCMPVCMPXTable { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; let isCompare = 1; let isCommutable = 1; let SubtargetPredicate = HasNoSdstCMPX; + let IsVCMPX = 1; } foreach _ = BoolToList.ret in @@ -275,6 +339,25 @@ multiclass VOPCX_Pseudos { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let isCompare = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _nosdst_e64_dpp : VOP3_DPP_Pseudo { + let Defs = [EXEC]; + let SchedRW = 
P_NoSDst.Schedule; + let isCompare = 1; + let Constraints = ""; + } + } // end SubtargetPredicate = isGFX11Plus } } // End SubtargetPredicate = HasSdstCMPX @@ -626,8 +709,18 @@ defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">; class VOPC_Class_Profile sched, ValueType vt> : VOPC_Profile { + let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP16 = AsmDPP#"$fi"; + let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + // DPP8 forbids modifiers and can inherit from VOPC_Profile + let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); + let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), + (ins))); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let AsmVOP3DPPBase = Asm64; let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, @@ -647,6 +740,7 @@ class VOPC_Class_NoSdst_Profile sched, ValueType vt> : Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm64 = "$src0_modifiers, $src1"; + let AsmVOP3DPPBase = Asm64; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -684,6 +778,24 @@ multiclass VOPC_Class_Pseudos { + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + let VOPC = 1; + let Constraints = ""; + } + if p.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = p.Schedule; + let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", ""); + } + } // end SubtargetPredicate = isGFX11Plus } let SubtargetPredicate = HasSdstCMPX in { @@ -714,6 +826,23 @@ multiclass VOPCX_Class_Pseudos { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _nosdst_e64_dpp : VOP3_DPP_Pseudo { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let Constraints = ""; + } + } // end SubtargetPredicate = isGFX11Plus } } // End SubtargetPredicate = HasSdstCMPX @@ -871,15 +1000,677 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; +//===----------------------------------------------------------------------===// +// DPP Encodings +//===----------------------------------------------------------------------===// + +// VOPC32 + +class VOPC_DPPe_Common op> : Enc64 { + bits<8> src1; + let Inst{16-9} = src1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VOPC_DPP_Base op, string OpName, VOPProfile P> + : VOP_DPP_Base, + VOPC_DPPe_Common { + bits<2> src0_modifiers; + bits<8> src0; + bits<2> src1_modifiers; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; + + let Inst{8-0} = 0xfa; + + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{48-40} = dpp_ctrl; + let Inst{50} = fi; + let Inst{51} = bound_ctrl; + let Inst{52} = !if (P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg + let Inst{53} = !if (P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs + let Inst{54} = !if (P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg + let Inst{55} = !if (P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs + let Inst{59-56} = bank_mask; + 
let Inst{63-60} = row_mask; + + let AsmMatchConverter = "cvtDPP"; + let VOPC = 1; +} + +class VOPC_DPP8_Base op, string OpName, VOPProfile P> + : VOP_DPP8_Base, + VOPC_DPPe_Common { + bits<8> src0; + bits<24> dpp8; + bits<9> fi; + + let Inst{8-0} = fi; + + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{63-40} = dpp8{23-0}; + + let AsmMatchConverter = "cvtDPP8"; + let VOPC = 1; +} + +class VOPC_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOPC_DPP_Base { + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ps.Constraints; + let AsmMatchConverter = "cvtVOPCNoDstDPP"; +} + +class VOPC_DPP16_SIMC op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOPC_DPP16, SIMCInstr; + +class VOPC_DPP8 op, VOPC_Pseudo ps, string opName = ps.OpName> + : VOPC_DPP8_Base { + // Note ps is the non-dpp pseudo + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ""; + let AsmMatchConverter = "cvtVOPCNoDstDPP8"; +} + +// VOPC64 + +class VOPC64_DPP_Base op, string OpName, VOPProfile P> + : VOP3_DPP_Base, VOP3_DPPe_Common { + Instruction Opcode = !cast(NAME); + + bits<8> src0; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = fi; + let Inst{83} = bound_ctrl; + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; + +} + +class VOPC64_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP_Base { + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ps.Constraints; +} + +class VOPC64_DPP16_Dst op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOPC64_DPP16 { + bits<8> sdst; + let Inst{7-0} = sdst; +} + +class VOPC64_DPP16_NoDst op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOPC64_DPP16 { + let Inst{7-0} = ? ; + let AsmMatchConverter = "cvtVOPC64NoDstDPP"; +} + +class VOPC64_DPP8_Base op, string OpName, VOPProfile P> + : VOP3_DPP8_Base, VOP3_DPPe_Common { + Instruction Opcode = !cast(NAME); + + bits<8> src0; + bits<24> dpp8; + bits<9> fi; + + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; + +} + +class VOPC64_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8_Base { + // Note ps is the non-dpp pseudo + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; +} + +class VOPC64_DPP8_Dst op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8 { + bits<8> sdst; + let Inst{7-0} = sdst; + let Constraints = "$old = $sdst"; +} + +class VOPC64_DPP8_NoDst op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8 { + let Inst{7-0} = ? ; + let AsmMatchConverter = "cvtVOPC64NoDstDPP8"; + let Constraints = ""; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only in { + multiclass VOPC_Real_gfx11 op> { + defvar ps32 = !cast(NAME#"_e32"); + defvar ps64 = !cast(NAME#"_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : VOPC_Real, + VOPCe; + def _e64_gfx11 : VOP3_Real, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCInstAliases; + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(NAME #"_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC; + def _e32_dpp_w32_gfx11 : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64_gfx11 : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8; + def _e32_dpp8_w32_gfx11 : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64_gfx11 : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(NAME #"_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr; + def _e64_dpp_w32_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + } + + multiclass VOPC_Real_with_name_gfx11 op, string OpName, + string asm_name> { + defvar ps32 = !cast(OpName#"_e32"); + defvar ps64 = !cast(OpName#"_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. + // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. 
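(Aside: for illustration only — operand printing also depends on wave size — a compare such as V_CMP_LT_F16 is expected to print roughly as follows on a wave64 target:)

  v_cmp_lt_f16_e32 vcc, v1, v2       // implicit VCC; suffix supplied by the AsmString
  v_cmp_lt_f16_e64 s[10:11], v1, v2  // explicit sdst; suffix printed with the operand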
+ VOPC_Real, + VOPCe, + MnemonicAlias, Requires<[isGFX11Plus]>; + def _e64_gfx11 : + VOP3_Real, + VOP3a_gfx11<{0, op}, ps64.Pfl>, + MnemonicAlias, Requires<[isGFX11Plus]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCInstAliases; + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(OpName #"_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC; + def _e32_dpp_w32_gfx11 + : VOPC_DPP16 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64_gfx11 + : VOPC_DPP16 { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8; + def _e32_dpp8_w32_gfx11 + : VOPC_DPP8 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64_gfx11 + : VOPC_DPP8 { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(OpName #"_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr; + def _e64_dpp_w32_gfx11 + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64_gfx11 + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32_gfx11 + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64_gfx11 + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + } + + multiclass VOPCX_Real_gfx11 op> { + defvar ps32 = !cast(NAME#"_nosdst_e32"); + defvar ps64 = !cast(NAME#"_nosdst_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : + VOPC_Real, + VOPCe { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64_gfx11 : + VOP3_Real, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCXInstAliases; + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(NAME #"_nosdst_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 + : VOPC_DPP16_SIMC { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8 { + let AsmString = !subst("_nosdst", 
"", ps32.OpName) # " " # AsmDPP8; + } + } + } + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(NAME #"_nosdst_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; + } + } + } + } + + multiclass VOPCX_Real_with_name_gfx11 op, string OpName, + string asm_name> { + defvar ps32 = !cast(OpName#"_nosdst_e32"); + defvar ps64 = !cast(OpName#"_nosdst_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 + : VOPC_Real, + MnemonicAlias, + Requires<[isGFX11Plus]>, + VOPCe { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64_gfx11 + : VOP3_Real, + MnemonicAlias, + Requires<[isGFX11Plus]>, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? ; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCXInstAliases; + + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(OpName#"_nosdst_e32"#"_dpp"); + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC; + } + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8; + } + } + foreach _ = BoolToList.ret in { + defvar psDPP = !cast(OpName#"_nosdst_e64"#"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; + } + } + } + + } +} // End AssemblerPredicate = isGFX11Only + +defm V_CMP_F_F16 : VOPC_Real_gfx11<0x000>; +defm V_CMP_LT_F16 : VOPC_Real_gfx11<0x001>; +defm V_CMP_EQ_F16 : VOPC_Real_gfx11<0x002>; +defm V_CMP_LE_F16 : VOPC_Real_gfx11<0x003>; +defm V_CMP_GT_F16 : VOPC_Real_gfx11<0x004>; +defm V_CMP_LG_F16 : VOPC_Real_gfx11<0x005>; +defm V_CMP_GE_F16 : VOPC_Real_gfx11<0x006>; +defm V_CMP_O_F16 : VOPC_Real_gfx11<0x007>; +defm V_CMP_U_F16 : VOPC_Real_gfx11<0x008>; +defm V_CMP_NGE_F16 : VOPC_Real_gfx11<0x009>; +defm V_CMP_NLG_F16 : VOPC_Real_gfx11<0x00a>; +defm V_CMP_NGT_F16 : VOPC_Real_gfx11<0x00b>; +defm V_CMP_NLE_F16 : VOPC_Real_gfx11<0x00c>; +defm V_CMP_NEQ_F16 : VOPC_Real_gfx11<0x00d>; +defm V_CMP_NLT_F16 : VOPC_Real_gfx11<0x00e>; +defm V_CMP_T_F16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16", "v_cmp_t_f16">; +defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>; +defm V_CMP_LT_F32 : VOPC_Real_gfx11<0x011>; +defm V_CMP_EQ_F32 : VOPC_Real_gfx11<0x012>; +defm V_CMP_LE_F32 : VOPC_Real_gfx11<0x013>; +defm V_CMP_GT_F32 : VOPC_Real_gfx11<0x014>; +defm V_CMP_LG_F32 : VOPC_Real_gfx11<0x015>; +defm V_CMP_GE_F32 : VOPC_Real_gfx11<0x016>; +defm V_CMP_O_F32 : VOPC_Real_gfx11<0x017>; +defm V_CMP_U_F32 : VOPC_Real_gfx11<0x018>; +defm V_CMP_NGE_F32 : VOPC_Real_gfx11<0x019>; +defm V_CMP_NLG_F32 : VOPC_Real_gfx11<0x01a>; +defm V_CMP_NGT_F32 : VOPC_Real_gfx11<0x01b>; +defm V_CMP_NLE_F32 : VOPC_Real_gfx11<0x01c>; +defm V_CMP_NEQ_F32 : VOPC_Real_gfx11<0x01d>; +defm V_CMP_NLT_F32 : VOPC_Real_gfx11<0x01e>; +defm V_CMP_T_F32 
: VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; +defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; +defm V_CMP_LT_I16 : VOPC_Real_gfx11<0x031>; +defm V_CMP_EQ_I16 : VOPC_Real_gfx11<0x032>; +defm V_CMP_LE_I16 : VOPC_Real_gfx11<0x033>; +defm V_CMP_GT_I16 : VOPC_Real_gfx11<0x034>; +defm V_CMP_NE_I16 : VOPC_Real_gfx11<0x035>; +defm V_CMP_GE_I16 : VOPC_Real_gfx11<0x036>; +defm V_CMP_LT_U16 : VOPC_Real_gfx11<0x039>; +defm V_CMP_EQ_U16 : VOPC_Real_gfx11<0x03a>; +defm V_CMP_LE_U16 : VOPC_Real_gfx11<0x03b>; +defm V_CMP_GT_U16 : VOPC_Real_gfx11<0x03c>; +defm V_CMP_NE_U16 : VOPC_Real_gfx11<0x03d>; +defm V_CMP_GE_U16 : VOPC_Real_gfx11<0x03e>; +defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>; +defm V_CMP_LT_I32 : VOPC_Real_gfx11<0x041>; +defm V_CMP_EQ_I32 : VOPC_Real_gfx11<0x042>; +defm V_CMP_LE_I32 : VOPC_Real_gfx11<0x043>; +defm V_CMP_GT_I32 : VOPC_Real_gfx11<0x044>; +defm V_CMP_NE_I32 : VOPC_Real_gfx11<0x045>; +defm V_CMP_GE_I32 : VOPC_Real_gfx11<0x046>; +defm V_CMP_T_I32 : VOPC_Real_gfx11<0x047>; +defm V_CMP_F_U32 : VOPC_Real_gfx11<0x048>; +defm V_CMP_LT_U32 : VOPC_Real_gfx11<0x049>; +defm V_CMP_EQ_U32 : VOPC_Real_gfx11<0x04a>; +defm V_CMP_LE_U32 : VOPC_Real_gfx11<0x04b>; +defm V_CMP_GT_U32 : VOPC_Real_gfx11<0x04c>; +defm V_CMP_NE_U32 : VOPC_Real_gfx11<0x04d>; +defm V_CMP_GE_U32 : VOPC_Real_gfx11<0x04e>; +defm V_CMP_T_U32 : VOPC_Real_gfx11<0x04f>; + +defm V_CMP_F_I64 : VOPC_Real_gfx11<0x050>; +defm V_CMP_LT_I64 : VOPC_Real_gfx11<0x051>; +defm V_CMP_EQ_I64 : VOPC_Real_gfx11<0x052>; +defm V_CMP_LE_I64 : VOPC_Real_gfx11<0x053>; +defm V_CMP_GT_I64 : VOPC_Real_gfx11<0x054>; +defm V_CMP_NE_I64 : VOPC_Real_gfx11<0x055>; +defm V_CMP_GE_I64 : VOPC_Real_gfx11<0x056>; +defm V_CMP_T_I64 : VOPC_Real_gfx11<0x057>; +defm V_CMP_F_U64 : VOPC_Real_gfx11<0x058>; +defm V_CMP_LT_U64 : VOPC_Real_gfx11<0x059>; +defm V_CMP_EQ_U64 : VOPC_Real_gfx11<0x05a>; +defm V_CMP_LE_U64 : VOPC_Real_gfx11<0x05b>; +defm V_CMP_GT_U64 : VOPC_Real_gfx11<0x05c>; +defm V_CMP_NE_U64 : VOPC_Real_gfx11<0x05d>; +defm V_CMP_GE_U64 : VOPC_Real_gfx11<0x05e>; +defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; + +defm V_CMP_CLASS_F16 : VOPC_Real_gfx11<0x07d>; +defm V_CMP_CLASS_F32 : VOPC_Real_gfx11<0x07e>; +defm V_CMP_CLASS_F64 : VOPC_Real_gfx11<0x07f>; + +defm V_CMPX_F_F16 : VOPCX_Real_gfx11<0x080>; +defm V_CMPX_LT_F16 : VOPCX_Real_gfx11<0x081>; +defm V_CMPX_EQ_F16 : VOPCX_Real_gfx11<0x082>; +defm V_CMPX_LE_F16 : VOPCX_Real_gfx11<0x083>; +defm V_CMPX_GT_F16 : VOPCX_Real_gfx11<0x084>; +defm V_CMPX_LG_F16 : VOPCX_Real_gfx11<0x085>; +defm V_CMPX_GE_F16 : VOPCX_Real_gfx11<0x086>; +defm V_CMPX_O_F16 : VOPCX_Real_gfx11<0x087>; +defm V_CMPX_U_F16 : VOPCX_Real_gfx11<0x088>; +defm V_CMPX_NGE_F16 : VOPCX_Real_gfx11<0x089>; +defm V_CMPX_NLG_F16 : VOPCX_Real_gfx11<0x08a>; +defm V_CMPX_NGT_F16 : VOPCX_Real_gfx11<0x08b>; +defm V_CMPX_NLE_F16 : VOPCX_Real_gfx11<0x08c>; +defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx11<0x08d>; +defm V_CMPX_NLT_F16 : VOPCX_Real_gfx11<0x08e>; +defm V_CMPX_T_F16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16", "v_cmpx_t_f16">; +defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; +defm V_CMPX_LT_F32 : VOPCX_Real_gfx11<0x091>; +defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11<0x092>; +defm V_CMPX_LE_F32 : VOPCX_Real_gfx11<0x093>; +defm V_CMPX_GT_F32 : VOPCX_Real_gfx11<0x094>; +defm V_CMPX_LG_F32 : VOPCX_Real_gfx11<0x095>; +defm V_CMPX_GE_F32 : VOPCX_Real_gfx11<0x096>; +defm V_CMPX_O_F32 : VOPCX_Real_gfx11<0x097>; +defm V_CMPX_U_F32 : VOPCX_Real_gfx11<0x098>; +defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11<0x099>; +defm 
V_CMPX_NLG_F32 : VOPCX_Real_gfx11<0x09a>; +defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11<0x09b>; +defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11<0x09c>; +defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11<0x09d>; +defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11<0x09e>; +defm V_CMPX_T_F32 : VOPCX_Real_with_name_gfx11<0x09f, "V_CMPX_TRU_F32", "v_cmpx_t_f32">; + +defm V_CMPX_F_F64 : VOPCX_Real_gfx11<0x0a0>; +defm V_CMPX_LT_F64 : VOPCX_Real_gfx11<0x0a1>; +defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11<0x0a2>; +defm V_CMPX_LE_F64 : VOPCX_Real_gfx11<0x0a3>; +defm V_CMPX_GT_F64 : VOPCX_Real_gfx11<0x0a4>; +defm V_CMPX_LG_F64 : VOPCX_Real_gfx11<0x0a5>; +defm V_CMPX_GE_F64 : VOPCX_Real_gfx11<0x0a6>; +defm V_CMPX_O_F64 : VOPCX_Real_gfx11<0x0a7>; +defm V_CMPX_U_F64 : VOPCX_Real_gfx11<0x0a8>; +defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11<0x0a9>; +defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11<0x0aa>; +defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11<0x0ab>; +defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11<0x0ac>; +defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11<0x0ad>; +defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11<0x0ae>; +defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; + +defm V_CMPX_LT_I16 : VOPCX_Real_gfx11<0x0b1>; +defm V_CMPX_EQ_I16 : VOPCX_Real_gfx11<0x0b2>; +defm V_CMPX_LE_I16 : VOPCX_Real_gfx11<0x0b3>; +defm V_CMPX_GT_I16 : VOPCX_Real_gfx11<0x0b4>; +defm V_CMPX_NE_I16 : VOPCX_Real_gfx11<0x0b5>; +defm V_CMPX_GE_I16 : VOPCX_Real_gfx11<0x0b6>; +defm V_CMPX_LT_U16 : VOPCX_Real_gfx11<0x0b9>; +defm V_CMPX_EQ_U16 : VOPCX_Real_gfx11<0x0ba>; +defm V_CMPX_LE_U16 : VOPCX_Real_gfx11<0x0bb>; +defm V_CMPX_GT_U16 : VOPCX_Real_gfx11<0x0bc>; +defm V_CMPX_NE_U16 : VOPCX_Real_gfx11<0x0bd>; +defm V_CMPX_GE_U16 : VOPCX_Real_gfx11<0x0be>; +defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>; +defm V_CMPX_LT_I32 : VOPCX_Real_gfx11<0x0c1>; +defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11<0x0c2>; +defm V_CMPX_LE_I32 : VOPCX_Real_gfx11<0x0c3>; +defm V_CMPX_GT_I32 : VOPCX_Real_gfx11<0x0c4>; +defm V_CMPX_NE_I32 : VOPCX_Real_gfx11<0x0c5>; +defm V_CMPX_GE_I32 : VOPCX_Real_gfx11<0x0c6>; +defm V_CMPX_T_I32 : VOPCX_Real_gfx11<0x0c7>; +defm V_CMPX_F_U32 : VOPCX_Real_gfx11<0x0c8>; +defm V_CMPX_LT_U32 : VOPCX_Real_gfx11<0x0c9>; +defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11<0x0ca>; +defm V_CMPX_LE_U32 : VOPCX_Real_gfx11<0x0cb>; +defm V_CMPX_GT_U32 : VOPCX_Real_gfx11<0x0cc>; +defm V_CMPX_NE_U32 : VOPCX_Real_gfx11<0x0cd>; +defm V_CMPX_GE_U32 : VOPCX_Real_gfx11<0x0ce>; +defm V_CMPX_T_U32 : VOPCX_Real_gfx11<0x0cf>; + +defm V_CMPX_F_I64 : VOPCX_Real_gfx11<0x0d0>; +defm V_CMPX_LT_I64 : VOPCX_Real_gfx11<0x0d1>; +defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11<0x0d2>; +defm V_CMPX_LE_I64 : VOPCX_Real_gfx11<0x0d3>; +defm V_CMPX_GT_I64 : VOPCX_Real_gfx11<0x0d4>; +defm V_CMPX_NE_I64 : VOPCX_Real_gfx11<0x0d5>; +defm V_CMPX_GE_I64 : VOPCX_Real_gfx11<0x0d6>; +defm V_CMPX_T_I64 : VOPCX_Real_gfx11<0x0d7>; +defm V_CMPX_F_U64 : VOPCX_Real_gfx11<0x0d8>; +defm V_CMPX_LT_U64 : VOPCX_Real_gfx11<0x0d9>; +defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11<0x0da>; +defm V_CMPX_LE_U64 : VOPCX_Real_gfx11<0x0db>; +defm V_CMPX_GT_U64 : VOPCX_Real_gfx11<0x0dc>; +defm V_CMPX_NE_U64 : VOPCX_Real_gfx11<0x0dd>; +defm V_CMPX_GE_U64 : VOPCX_Real_gfx11<0x0de>; +defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; +defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx11<0x0fd>; +defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11<0x0fe>; +defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11<0x0ff>; + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus in { +let AssemblerPredicate = isGFX10Only in { multiclass VOPC_Real_gfx10 op> { let DecoderNamespace = "GFX10" in { def _e32_gfx10 : @@ -931,7 +1722,7 @@ let AssemblerPredicate = isGFX10Plus in { defm : VOPCXInstAliases; } -} // End AssemblerPredicate = isGFX10Plus +} // End AssemblerPredicate = isGFX10Only defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1025,6 +1816,12 @@ multiclass VOPCX_Real_gfx6_gfx7 op> : multiclass VOPCX_Real_gfx6_gfx7_gfx10 op> : VOPC_Real_gfx6_gfx7, VOPCX_Real_gfx10; +multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11 op> : + VOPC_Real_gfx6_gfx7_gfx10, VOPC_Real_gfx11; + +multiclass VOPCX_Real_gfx6_gfx7_gfx10_gfx11 op> : + VOPCX_Real_gfx6_gfx7_gfx10, VOPCX_Real_gfx11; + defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>; defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>; defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>; @@ -1057,21 +1854,21 @@ defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>; defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>; defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>; defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>; -defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>; -defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>; -defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>; -defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>; -defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>; -defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>; -defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>; -defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>; -defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>; -defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>; -defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>; -defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>; -defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>; -defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>; -defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>; +defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x020>; +defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x021>; +defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x022>; +defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x023>; +defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x024>; +defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x025>; +defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x026>; +defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x027>; +defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x028>; +defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x029>; +defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; +defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; +defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; +defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02d>; +defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02e>; defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>; defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>; defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>; diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td new file mode 100644 index 000000000000..420f18436095 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td @@ -0,0 +1,159 @@ +//===-- VOPDInstructions.td - Vector Instruction Definitions --------------===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Encodings +//===----------------------------------------------------------------------===// + +class VOPDe opX, bits<5> opY> : Enc64 { + bits<9> src0X; + bits<8> vsrc1X; + bits<8> vdstX; + bits<9> src0Y; + bits<8> vsrc1Y; + bits<8> vdstY; + + let Inst{8-0} = src0X; + let Inst{16-9} = vsrc1X; + let Inst{21-17} = opY; + let Inst{25-22} = opX; + let Inst{31-26} = 0x32; // encoding + let Inst{40-32} = src0Y; + let Inst{48-41} = vsrc1Y; + let Inst{55-49} = vdstY{7-1}; + let Inst{63-56} = vdstX; +} + +class VOPD_MADKe opX, bits<5> opY> : Enc96 { + bits<9> src0X; + bits<8> vsrc1X; + bits<8> vdstX; + bits<9> src0Y; + bits<8> vsrc1Y; + bits<8> vdstY; + bits<32> imm; + + let Inst{8-0} = src0X; + let Inst{16-9} = vsrc1X; + let Inst{21-17} = opY; + let Inst{25-22} = opX; + let Inst{31-26} = 0x32; // encoding + let Inst{40-32} = src0Y; + let Inst{48-41} = vsrc1Y; + let Inst{55-49} = vdstY{7-1}; + let Inst{63-56} = vdstX; + let Inst{95-64} = imm; +} + +//===----------------------------------------------------------------------===// +// VOPD classes +//===----------------------------------------------------------------------===// + +class VOPD_Base + : VOPAnyCommon, + VOP, + SIMCInstr { + // Fields for table indexing + Instruction Opcode = !cast(NAME); + bits<5> OpX = XasVC.VOPDOp; + bits<5> OpY = YasVC.VOPDOp; + + let VALU = 1; + + let DecoderNamespace = "GFX11"; + let AssemblerPredicate = isGFX11Plus; + let WaveSizePredicate = isWave32; + let isCodeGenOnly = 0; + let SubtargetPredicate = isGFX11Plus; + let AsmMatchConverter = "cvtVOPD"; + let Size = 8; + let ReadsModeReg = !or(VDX.ReadsModeReg, VDY.ReadsModeReg); + let mayRaiseFPException = ReadsModeReg; + + let Uses = RegListUnion.ret; + let Defs = RegListUnion.ret; + let SchedRW = !listconcat(VDX.SchedRW, VDY.SchedRW); +} + +class VOPD + : VOPD_Base, + VOPDe { + let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); + let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); +} + +class VOPD_MADK + : VOPD_Base, + VOPD_MADKe { + let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); + let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); + let Size = 12; +} + +// V_DUAL_DOT2ACC_F32_BF16 is a legal instruction, but V_DOT2ACC_F32_BF16 is +// not. Since we generate the DUAL form by converting from the normal form we +// will never generate it. 
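To make the VOPDe layout above concrete: a VOPD word packs two VALU opcodes (a 4-bit opX and a 5-bit opY) plus both operand sets into a single 64-bit instruction, and vdstY is stored without its low bit (Inst{55-49} = vdstY{7-1}; the missing bit is implied rather than encoded). Below is a minimal C++ sketch of that packing, mirroring the 'let Inst{...}' assignments; the helper name and signature are invented for illustration and are not part of this patch.

#include <cstdint>

// Hypothetical helper mirroring the VOPDe Enc64 layout above.
uint64_t encodeVOPD(uint8_t opX, uint8_t opY, uint16_t src0X, uint8_t vsrc1X,
                    uint8_t vdstX, uint16_t src0Y, uint8_t vsrc1Y,
                    uint8_t vdstY) {
  uint64_t Inst = 0;
  Inst |= uint64_t(src0X & 0x1ff);        // Inst{8-0}   = src0X
  Inst |= uint64_t(vsrc1X) << 9;          // Inst{16-9}  = vsrc1X
  Inst |= uint64_t(opY & 0x1f) << 17;     // Inst{21-17} = opY
  Inst |= uint64_t(opX & 0xf) << 22;      // Inst{25-22} = opX
  Inst |= uint64_t(0x32) << 26;           // Inst{31-26} = 0x32 (VOPD encoding)
  Inst |= uint64_t(src0Y & 0x1ff) << 32;  // Inst{40-32} = src0Y
  Inst |= uint64_t(vsrc1Y) << 41;         // Inst{48-41} = vsrc1Y
  Inst |= uint64_t(vdstY >> 1) << 49;     // Inst{55-49} = vdstY{7-1}
  Inst |= uint64_t(vdstX) << 56;          // Inst{63-56} = vdstX
  return Inst;
}

VOPD_MADKe extends the same layout to 96 bits by appending the 32-bit literal at Inst{95-64}. The defvar lists that follow enumerate which pseudos may occupy the X and Y halves.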
+defvar VOPDYPseudos = [ + "V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32", + "V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32", + "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32", + "V_DOT2C_F32_F16_e32", "V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_AND_B32_e32" +]; +defvar VOPDXPseudos = VOPDYPseudos[0...VOPDX_Max_Index]; + +def VOPDDstYOperand : RegisterOperand { + let DecoderMethod = "decodeOperandVOPDDstY"; +} + +foreach x = VOPDXPseudos in { + foreach y = VOPDYPseudos in { + defvar xInst = !cast(x); + defvar yInst = !cast(y); + defvar XasVC = !cast(x); + defvar YasVC = !cast(y); + defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"), + !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); + // If X or Y is MADK (have a mandatory immediate), all src operands which + // may contain an optional literal must use the VSrc_*_Deferred operand + // type. Optional literal operands in MADK VOPD components always use this + // operand form. If Both X and Y are MADK, the mandatory literal of X + // additionally must use an alternate operand format which defers to the + // 'real' Y literal + defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32")); + defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); + defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2); + defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY); + if !or(isOpXMADK, isOpYMADK) then { + if !and(isOpXMADK, isOpYMADK) then { + defvar X_MADK_Pfl = !cast(xInst.Pfl); + defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY); + defvar asm = XasVC.VOPDName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; + def OpName : VOPD_MADK; + } else { + defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; + if isOpXMADK then { + assert !not(isOpYMADK), "Expected only OpX as MADK"; + defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred); + def OpName : VOPD_MADK; + } else { + assert !not(isOpXMADK), "Expected only OpY as MADK"; + defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY); + def OpName : VOPD_MADK; + } + } + } else { + defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY); + defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; + def OpName : VOPD; + } + } +} + diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index a8368892c565..8cd3d2fe2c47 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -30,6 +30,16 @@ class VOP { string OpName = opName; } +// First 13 insts from VOPDY are also VOPDX. 
DOT2ACC_F32_BF16 is omitted +defvar VOPDX_Max_Index = 12; + +class VOPD_Component OpIn, string vOPDName> { + Instruction BaseVOP = !cast(NAME); + string VOPDName = "v_dual_" # !substr(vOPDName, 2); + bits<5> VOPDOp = OpIn; + bit CanBeVOPDX = !le(VOPDOp, VOPDX_Max_Index); +} + class VOPAnyCommon pattern> : InstSI { @@ -92,6 +102,7 @@ class VOP3_Pseudo pattern = [], let VOP3_OPSEL = isVop3OpSel; let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; + let IsWMMA = P.IsWMMA; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -144,9 +155,9 @@ class VOP_Real { bit IsSingle = ps.Pfl.IsSingle; } -class VOP3_Real : +class VOP3_Real : VOP_Real , - InstSI , + InstSI , SIMCInstr { let VALU = 1; @@ -155,9 +166,6 @@ class VOP3_Real : let isCodeGenOnly = 0; let UseNamedOperandTable = 1; - let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; - // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; @@ -179,8 +187,12 @@ class VOP3_Real : // XXX - Is there any reason to distinguish this from regular VOP3 // here? -class VOP3P_Real : - VOP3_Real; +class VOP3P_Real : + VOP3_Real { + + // The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction. + let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints); +} class VOP3a : Enc64 { bits<4> src0_modifiers; @@ -217,6 +229,8 @@ class VOP3a_gfx10 op, VOPProfile p> : VOP3a
<p> { let Inst{31-26} = 0x35; } +class VOP3a_gfx11<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>; + class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P>
{ let Inst{25-16} = op; let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -232,6 +246,8 @@ class VOP3e_gfx10 op, VOPProfile p> : VOP3a_gfx10 { let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0); } +class VOP3e_gfx11 op, VOPProfile p> : VOP3e_gfx10; + class VOP3e_vi op, VOPProfile P> : VOP3a_vi { bits<8> vdst; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); @@ -251,6 +267,9 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 { let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); } +class VOP3OpSel_gfx11 op, VOPProfile p> : VOP3OpSel_gfx10; + + // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa class VOP3Interp_vi op, VOPProfile P> : VOP3e_vi { bits<2> attrchan; @@ -285,6 +304,8 @@ class VOP3Interp_gfx10 op, VOPProfile p> : VOP3e_gfx10 { let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0); } +class VOP3Interp_gfx11 op, VOPProfile p> : VOP3Interp_gfx10; + class VOP3be : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -310,7 +331,6 @@ class VOP3be : Enc64 { class VOP3Pe op, VOPProfile P> : Enc64 { bits<8> vdst; - // neg, neg_hi, op_sel put in srcN_modifiers bits<4> src0_modifiers; bits<9> src0; bits<4> src1_modifiers; @@ -372,11 +392,42 @@ class VOP3Pe_MAI op, VOPProfile P, bit acc_cd = 0> : Enc64 { let Inst{63-61} = !if(P.HasSrc1, blgp, 0); } +class VOP3Pe_SMFMAC op> : Enc64 { + bits<10> vdst; // VGPR or AGPR, but not SGPR. vdst{8} is not encoded in the instruction. + bits<10> src0; + bits<10> src1; + bits<9> idx; + bits<3> blgp; + bits<3> cbsz; + bits<4> abid; + + let blgp = 0; + + let Inst{7-0} = vdst{7-0}; + + let Inst{10-8} = cbsz; + let Inst{14-11} = abid; + + let Inst{15} = vdst{9}; // acc(vdst) + + let Inst{22-16} = op; + let Inst{31-23} = 0x1a7; // encoding + let Inst{40-32} = src0{8-0}; + let Inst{49-41} = src1{8-0}; + let Inst{58-50} = idx; + + let Inst{59} = src0{9}; // acc(0) + let Inst{60} = src1{9}; // acc(1) + + let Inst{63-61} = blgp; +} class VOP3Pe_gfx10 op, VOPProfile P> : VOP3Pe { let Inst{31-23} = 0x198; //encoding } +class VOP3Pe_gfx11 op, VOPProfile P> : VOP3Pe_gfx10; + class VOP3be_gfx6_gfx7 op, VOPProfile p> : VOP3be
<p> { let Inst{25-17} = op; } @@ -388,6 +439,8 @@ class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> { let Inst{31-26} = 0x35; } +class VOP3be_gfx11<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>; + class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P>
{ bits<1> clamp; let Inst{25-16} = op; @@ -621,8 +674,89 @@ class VOP_DPPe : Enc64 { let Inst{63-60} = row_mask; } -class VOP_DPP_Pseudo pattern=[]> : - InstSI , +class VOP3_DPPe_Fields_Base { + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; +} +class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base { + bits<8> src0; +} + +// Common refers to common between DPP and DPP8 +class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { + bits<4> src0_modifiers; + bits<3> src1_modifiers; + bits<3> src2_modifiers; + bits<1> clamp; + bits<2> omod; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); + // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. + let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); + let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); + let Inst{15} = !if(P.HasClamp, clamp, 0); + let Inst{25-16} = op; + let Inst{31-26} = 0x35; + + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + +class VOP3_DPPe_Common op, VOPProfile P> : VOP3_DPPe_Common_Base { + bits<8> vdst; + bits<9> src1; + bits<9> src2; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); +} + +class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { + bits<4> src0_modifiers; + bits<4> src1_modifiers; + bits<4> src2_modifiers; + bits<1> clamp; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + let Inst{22-16} = op; + let Inst{31-23} = 0x198; // encoding + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) +} + +class VOP3P_DPPe_Common op, VOPProfile P> : VOP3P_DPPe_Common_Base { + bits<8> vdst; + bits<9> src1; + bits<9> src2; + + let Inst{7-0} = vdst; + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); +} + +class VOP_DPP_Pseudo pattern=[], + dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : + InstSI , VOP , SIMCInstr { @@ -645,7 +779,7 @@ class VOP_DPP_Pseudo pattern=[]> : let isConvergent = 1; string Mnemonic = OpName; - string AsmOperands = P.AsmDPP; + string AsmOperands = asmOps; let AsmMatchConverter = !if(P.HasModifiers, 
"cvtDPP", ""); let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); @@ -659,6 +793,17 @@ class VOP_DPP_Pseudo pattern=[]> : VOPProfile Pfl = P; } +class VOP3_DPP_Pseudo : + VOP_DPP_Pseudo { + let PseudoInstr = OpName#"_e64"#"_dpp"; + let OutOperandList = P.OutsVOP3DPP; + let Size = 12; + let VOP3 = 1; + let AsmMatchConverter = "cvtVOP3DPP"; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); +} + class VOP_DPP_Real : InstSI , SIMCInstr { @@ -679,6 +824,7 @@ class VOP_DPP_Real : let isConvergent = ps.isConvergent; let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = ps.AssemblerPredicate; + let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; @@ -692,11 +838,10 @@ class VOP_DPP_Real : let TRANS = ps.TRANS; } -class VOP_DPP : - InstSI , - VOP_DPPe { +class VOP_DPP_Base : + InstSI { let mayLoad = 0; let mayStore = 0; @@ -717,6 +862,59 @@ class VOP_DPP : + VOP_DPP_Base, VOP_DPPe; + +class VOP3_DPP_Base : + VOP_DPP_Base { + let OutOperandList = P.OutsVOP3DPP; + let AsmMatchConverter = "cvtVOP3DPP"; + let VOP3 = 1; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); + let Size = 12; +} + +class VOP3_DPP op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP3_DPP_Base, VOP3_DPPe_Common, + VOP3_DPPe_Fields { + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + +class VOP3P_DPP op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP3_DPP_Base, VOP3P_DPPe_Common, + VOP3_DPPe_Fields { + + let VOP3P = 1; + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + class VOP_DPP8e : Enc64 { bits<8> src0; bits<24> dpp8; @@ -726,9 +924,14 @@ class VOP_DPP8e : Enc64 { let Inst{63-40} = dpp8{23-0}; } -class VOP_DPP8 : - InstSI, - VOP_DPP8e
<P>
{ +class VOP3_DPP8e_Fields { + bits<8> src0; + bits<24> dpp8; + bits<9> fi; +} + +class VOP_DPP8_Base : + InstSI { let mayLoad = 0; let mayStore = 0; @@ -742,12 +945,44 @@ class VOP_DPP8 : let AsmMatchConverter = "cvtDPP8"; let SubtargetPredicate = HasDPP8; let AssemblerPredicate = HasDPP8; - let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, - AMDGPUAsmVariants.Disable); + let AsmVariantName = AMDGPUAsmVariants.DPP; let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); } +class VOP_DPP8 : + VOP_DPP8_Base, VOP_DPP8e
<P>
; + +class VOP3_DPP8_Base : + VOP_DPP8_Base { + let OutOperandList = P.OutsVOP3DPP8; + let AsmMatchConverter = "cvtVOP3DPP8"; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); + let VOP3 = 1; + let Size = 12; +} + + +class VOP3_DPP8 op, string OpName, VOPProfile P> : + VOP3_DPP8_Base, VOP3_DPPe_Common, + VOP3_DPP8e_Fields { + + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} + +class VOP3P_DPP8 op, string OpName, VOPProfile P> : + VOP3_DPP8_Base, VOP3P_DPPe_Common, + VOP3_DPP8e_Fields { + + let VOP3P = 1; + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} + def DPP8Mode { int FI_0 = 0xE9; int FI_1 = 0xEA; @@ -780,14 +1015,12 @@ class getDivergentFrag { } class VOPPatGen { - PatFrag Operator = getDivergentFrag < Op >.ret; dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator, !subst(P.Src0RC32, P.Src0VT, !subst(P.Src1RC32, P.Src1VT, tmp)))); - dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set, !subst(P.DstRC, P.DstVT, tmp))); @@ -827,12 +1060,379 @@ class VOPBinOpClampPat : DSTCLAMP.ENABLE) >; +//===----------------------------------------------------------------------===// +// VOP3 Classes +//===----------------------------------------------------------------------===// + +class getVOP3ModPat { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + + list ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT src0), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT src0), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT src0)))]; + + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3PModPat { + dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers)); + dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers)); + dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers)); + dag clamp_dag = (i1 timm:$clamp); + + list ret3 = [(set P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp.ret src0_dag, src1_dag, src2_dag, clamp_dag), + (DivergentFragOrOp.ret src0_dag, src1_dag, src2_dag)))]; + + list ret2 = [(set P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp.ret src0_dag, src1_dag, clamp_dag), + (DivergentFragOrOp.ret src0_dag, src1_dag)))]; + + list ret1 = [(set P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp.ret src0_dag, clamp_dag), + (DivergentFragOrOp.ret src0_dag)))]; + + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3OpSelPat { + list ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))]; + + list ret = 
!if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3OpSelModPat { + list ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers), + (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))]; + + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3FromVOP2Pat { + list ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; +} +// In VOP1, we can have clamp and omod even if !HasModifiers +class getVOP3Pat { + dag src0 = + !if(P.HasOMod, + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$omod)), // impossible? + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0)) + ); + list ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0), P.Src1VT:$src1, P.Src2VT:$src2))]; + + list ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0), P.Src1VT:$src1))]; + + list ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0)))]; + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3ClampPat { + list ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))]; + list ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))]; + list ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))]; + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3MAIPat { + list ret = !if(!eq(P.Src0VT, P.Src1VT), + // mfma + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$abid, timm:$blgp))], + // smfmac + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, + timm:$cbsz, timm:$abid))]); +} + +class VOP3Features { + bit HasClamp = Clamp; + bit HasOpSel = OpSel; + bit IsPacked = Packed; + bit IsMAI = MAI; +} + +def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>; +def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; +def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; +def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; +def VOP3_MAI : VOP3Features<0, 0, 0, 1>; + +class VOP3_Profile_Base : VOPProfile { + + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); +} + +class VOP3_Profile : VOP3_Profile_Base { + let IsSingle = 1; + +} + +// consistently gives instructions a _e64 suffix +multiclass VOP3Inst_Pseudo_Wrapper pattern = [], bit VOP3Only = 0> { + def _e64 : VOP3_Pseudo; +} + +class VOP3InstBase : + VOP3_Pseudo.ret, + getVOP3OpSelPat.ret), + !if(P.HasModifiers, + getVOP3ModPat.ret, + !if(IsVOP2, + getVOP3FromVOP2Pat.ret, + !if(P.HasIntClamp, + 
getVOP3ClampPat.ret, + !if (P.IsMAI, + getVOP3MAIPat.ret, + getVOP3Pat.ret))))), + 0, P.HasOpSel> { + + let IntClamp = P.HasIntClamp; + let AsmMatchConverter = + !if(P.HasOpSel, + "cvtVOP3OpSel", + !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp), + "cvtVOP3", + "")); +} + +multiclass VOP3Inst { + def _e64 : VOP3InstBase; + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList.ret in + def _e64_dpp : VOP3_DPP_Pseudo ; + } // end SubtargetPredicate = isGFX11Plus +} + +//===----------------------------------------------------------------------===// +// VOP3 DPP +//===----------------------------------------------------------------------===// + +class Base_VOP3_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + +class VOP3_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : Base_VOP3_DPP16, SIMCInstr; + +class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + let OtherPredicates = ps.OtherPredicates; +} + +class Base_VOP3b_DPP16 op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : Base_VOP3_DPP16 { + bits<7> sdst; + let Inst{14 - 8} = sdst; +} + +class VOP3b_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> + : Base_VOP3_DPP8 { + bits<7> sdst; + let Inst{14 - 8} = sdst; +} + +//===----------------------------------------------------------------------===// +// VOP3 GFX11 +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, + DecoderNamespace = "GFX11" in { + multiclass VOP3_Real_Base_gfx11 op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + foreach _ = BoolToList.ret in + def _e64_gfx11 : + VOP3_Real, + VOP3OpSel_gfx11; + foreach _ = BoolToList.ret in + def _e64_gfx11 : + VOP3_Real, + VOP3e_gfx11; + } + } + multiclass VOP3_Real_with_name_gfx11 op, string opName, + string asmName, bit isSingle = 0> { + defvar ps = !cast(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands, + IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + foreach _ = BoolToList.ret in + def _e64_gfx11 : + VOP3_Real, + VOP3OpSel_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + foreach _ = BoolToList.ret in + def _e64_gfx11 : + VOP3_Real, + VOP3e_gfx11, + MnemonicAlias, Requires<[isGFX11Plus]>; + } + } + // for READLANE/WRITELANE + multiclass VOP3_Real_No_Suffix_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName); + def _e64_gfx11 : + VOP3_Real, + VOP3e_gfx11; + } + multiclass VOP3_Real_dpp_Base_gfx11 op, string opName = NAME> { + def _e64_dpp_gfx11 : VOP3_DPP16(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP3_Real_dpp_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e64"); + let AsmString = asmName # ps.Pfl.AsmVOP3DPP16, DecoderNamespace = "DPPGFX11" in { + defm NAME : VOP3_Real_dpp_Base_gfx11; + } + } + multiclass VOP3_Real_dpp8_Base_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName#"_e64"); + def _e64_dpp8_gfx11 : Base_VOP3_DPP8 { + let DecoderNamespace = "DPP8GFX11"; + } + } + multiclass 
VOP3_Real_dpp8_with_name_gfx11 op, string opName, + string asmName> { + defvar ps = !cast(opName#"_e64"); + let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, DecoderNamespace = "DPP8GFX11" in { + defm NAME : VOP3_Real_dpp8_Base_gfx11; + } + } + multiclass VOP3be_Real_gfx11 op, string opName, string asmName, + bit isSingle = 0> { + defvar ps = !cast(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in + def _e64_gfx11 : + VOP3_Real, + VOP3be_gfx11 ; + } + multiclass VOP3be_Real_dpp_gfx11 op, string opName, string asmName> { + defvar ps = !cast(opName #"_e64"); + defvar dpp_ps = !cast(opName #"_e64" #"_dpp"); + def _e64_dpp_gfx11 : Base_VOP3b_DPP16, + SIMCInstr { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP3be_Real_dpp8_gfx11 op, string opName, string asmName> { + defvar ps = !cast(opName #"_e64"); + def _e64_dpp8_gfx11 : VOP3b_DPP8_Base { + let DecoderNamespace = "DPP8GFX11"; + } + } +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +// VOP1 and VOP2 depend on these triple defs +multiclass VOP3_Realtriple_gfx11 op, + bit isSingle = 0, string opName = NAME> : + VOP3_Real_Base_gfx11, + VOP3_Real_dpp_Base_gfx11, + VOP3_Real_dpp8_Base_gfx11; + +multiclass VOP3Only_Realtriple_gfx11 op> : + VOP3_Realtriple_gfx11; + +multiclass VOP3_Realtriple_with_name_gfx11 op, string opName, + string asmName, bit isSingle = 0> : + VOP3_Real_with_name_gfx11, + VOP3_Real_dpp_with_name_gfx11, + VOP3_Real_dpp8_with_name_gfx11; + +multiclass VOP3Only_Realtriple_with_name_gfx11 op, string opName, + string asmName> : + VOP3_Realtriple_with_name_gfx11; + +multiclass VOP3be_Realtriple_gfx11< + bits<10> op, bit isSingle = 0, string opName = NAME, + string asmName = !cast(opName#"_e64").Mnemonic> : + VOP3be_Real_gfx11, + VOP3be_Real_dpp_gfx11, + VOP3be_Real_dpp8_gfx11; + +multiclass VOP3beOnly_Realtriple_gfx11 op> : + VOP3be_Realtriple_gfx11; include "VOPCInstructions.td" include "VOP1Instructions.td" include "VOP2Instructions.td" include "VOP3Instructions.td" include "VOP3PInstructions.td" +include "VOPDInstructions.td" class VOPInfoTable : GenericTable { @@ -847,3 +1447,15 @@ class VOPInfoTable : GenericTable { def VOP1InfoTable : VOPInfoTable<"VOP1">; def VOP2InfoTable : VOPInfoTable<"VOP2">; def VOP3InfoTable : VOPInfoTable<"VOP3">; + +class VOPC64Table : GenericTable { + let FilterClass = "VOPC64_" # Format # "_Base"; + let CppTypeName = "VOPC64DPPInfo"; + let Fields = ["Opcode"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "isVOPC64" # Format # "OpcodeHelper"; +} + +def VOPC64DPPTable : VOPC64Table<"DPP">; +def VOPC64DPP8Table : VOPC64Table<"DPP8">; diff --git a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.cpp b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.cpp index 9cd9661ae245..733f2f0a0499 100644 --- a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.cpp +++ b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.cpp @@ -11,3 +11,10 @@ using namespace llvm; void ARCFunctionInfo::anchor() {} + +MachineFunctionInfo * +ARCFunctionInfo::clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap + &Src2DstMBB) const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h index 968c6b63f423..454206037498 100644 --- a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h +++ b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h @@ -34,9 +34,13 @@ public: explicit ARCFunctionInfo(MachineFunction &MF) : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), ReturnStackOffset(-1U), 
MaxCallStackReq(0) {} - ~ARCFunctionInfo() {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; } int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp index c956f00b628d..589c58e285bb 100644 --- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp +++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp @@ -36,7 +36,7 @@ using namespace llvm; namespace llvm { static cl::opt ArcKillAddrMode("arc-kill-addr-mode", cl::init(0), - cl::ReallyHidden, cl::ZeroOrMore); + cl::ReallyHidden); #define DUMP_BEFORE() ((ArcKillAddrMode & 0x0001) != 0) #define DUMP_AFTER() ((ArcKillAddrMode & 0x0002) != 0) @@ -459,12 +459,12 @@ void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, Register BaseReg = Ldst.getOperand(BasePos).getReg(); - Ldst.RemoveOperand(OffPos); - Ldst.RemoveOperand(BasePos); + Ldst.removeOperand(OffPos); + Ldst.removeOperand(BasePos); if (IsStore) { Src = Ldst.getOperand(BasePos - 1); - Ldst.RemoveOperand(BasePos - 1); + Ldst.removeOperand(BasePos - 1); } Ldst.setDesc(AST->getInstrInfo()->get(NewOpcode)); diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp index 52f74b729ff7..21757927d873 100644 --- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -21,7 +21,7 @@ using namespace llvm; static Reloc::Model getRelocModel(Optional RM) { - return RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } /// ARCTargetMachine ctor - Create an ILP32 architecture model @@ -84,6 +84,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARCTarget() { } TargetTransformInfo -ARCTargetMachine::getTargetTransformInfo(const Function &F) { +ARCTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(ARCTTIImpl(this, F)); } diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.h b/llvm/lib/Target/ARC/ARCTargetMachine.h index c5e8c3f2936d..81ccfc6d5dd0 100644 --- a/llvm/lib/Target/ARC/ARCTargetMachine.h +++ b/llvm/lib/Target/ARC/ARCTargetMachine.h @@ -39,7 +39,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } diff --git a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp index bb5336931932..618101755904 100644 --- a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp +++ b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp @@ -16,8 +16,8 @@ #include "MCTargetDesc/ARCMCTargetDesc.h" #include "TargetInfo/ARCTargetInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -83,41 +83,43 @@ static bool readInstruction16(ArrayRef Bytes, uint64_t Address, } template -static DecodeStatus DecodeSignedOperand(MCInst &Inst, unsigned InsnS, - uint64_t Address = 0, - const void *Decoder = nullptr); +static DecodeStatus +DecodeSignedOperand(MCInst &Inst, unsigned InsnS, uint64_t 
Address = 0, + const MCDisassembler *Decoder = nullptr); template -static DecodeStatus DecodeFromCyclicRange(MCInst &Inst, unsigned InsnS, - uint64_t Address = 0, - const void *Decoder = nullptr); +static DecodeStatus +DecodeFromCyclicRange(MCInst &Inst, unsigned InsnS, uint64_t Address = 0, + const MCDisassembler *Decoder = nullptr); template static DecodeStatus DecodeBranchTargetS(MCInst &Inst, unsigned InsnS, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeMEMrs9(MCInst &, unsigned, uint64_t, const void *); +static DecodeStatus DecodeMEMrs9(MCInst &, unsigned, uint64_t, + const MCDisassembler *); static DecodeStatus DecodeLdLImmInstruction(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeStLImmInstruction(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeLdRLImmInstruction(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeSOPwithRS12(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeSOPwithRU6(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeCCRU6Instruction(MCInst &, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t, uint64_t, - const void *); + const MCDisassembler *); static const uint16_t GPR32DecoderTable[] = { ARC::R0, ARC::R1, ARC::R2, ARC::R3, ARC::R4, ARC::R5, ARC::R6, @@ -128,7 +130,7 @@ static const uint16_t GPR32DecoderTable[] = { static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) { LLVM_DEBUG(dbgs() << "Not a GPR32 register."); return MCDisassembler::Fail; @@ -140,8 +142,8 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeGBR32ShortRegister(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // Enumerates registers from ranges [r0-r3],[r12-r15]. if (RegNo > 3) RegNo += 8; // 4 for r12, etc... @@ -165,7 +167,7 @@ static unsigned decodeAField(unsigned Insn) { } static DecodeStatus DecodeMEMrs9(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Dec) { + const MCDisassembler *Dec) { // We have the 9-bit immediate in the low bits, 6-bit register in high bits. unsigned S9 = Insn & 0x1ff; unsigned R = (Insn & (0x7fff & ~0x1ff)) >> 9; @@ -175,17 +177,16 @@ static DecodeStatus DecodeMEMrs9(MCInst &Inst, unsigned Insn, uint64_t Address, } static bool DecodeSymbolicOperand(MCInst &Inst, uint64_t Address, - uint64_t Value, const void *Decoder) { + uint64_t Value, + const MCDisassembler *Decoder) { static const uint64_t AtLeast = 2; - // TODO: Try to force emitter to use MCDisassembler* instead of void*. 
- auto Disassembler = static_cast(Decoder); - return (nullptr != Disassembler && - Disassembler->tryAddingSymbolicOperand(Inst, Value, Address, true, 0, - AtLeast)); + return (nullptr != Decoder && Decoder->tryAddingSymbolicOperand( + Inst, Value, Address, true, 0, AtLeast, 0)); } static void DecodeSymbolicOperandOff(MCInst &Inst, uint64_t Address, - uint64_t Offset, const void *Decoder) { + uint64_t Offset, + const MCDisassembler *Decoder) { uint64_t NextAddress = Address + Offset; if (!DecodeSymbolicOperand(Inst, Address, NextAddress, Decoder)) @@ -194,7 +195,8 @@ static void DecodeSymbolicOperandOff(MCInst &Inst, uint64_t Address, template static DecodeStatus DecodeBranchTargetS(MCInst &Inst, unsigned InsnS, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { static_assert(B > 0, "field is empty"); DecodeSymbolicOperandOff(Inst, Address, SignExtend32(InsnS), Decoder); @@ -204,7 +206,7 @@ static DecodeStatus DecodeBranchTargetS(MCInst &Inst, unsigned InsnS, template static DecodeStatus DecodeSignedOperand(MCInst &Inst, unsigned InsnS, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { static_assert(B > 0, "field is empty"); Inst.addOperand(MCOperand::createImm( @@ -215,7 +217,7 @@ static DecodeStatus DecodeSignedOperand(MCInst &Inst, unsigned InsnS, template static DecodeStatus DecodeFromCyclicRange(MCInst &Inst, unsigned InsnS, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { static_assert(B > 0, "field is empty"); const unsigned max = (1u << B) - 1; @@ -226,7 +228,7 @@ static DecodeStatus DecodeFromCyclicRange(MCInst &Inst, unsigned InsnS, static DecodeStatus DecodeStLImmInstruction(MCInst &Inst, uint64_t Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned SrcC, DstB, LImm; DstB = decodeBField(Insn); if (DstB != 62) { @@ -243,7 +245,7 @@ static DecodeStatus DecodeStLImmInstruction(MCInst &Inst, uint64_t Insn, static DecodeStatus DecodeLdLImmInstruction(MCInst &Inst, uint64_t Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned DstA, SrcB, LImm; LLVM_DEBUG(dbgs() << "Decoding LdLImm:\n"); SrcB = decodeBField(Insn); @@ -261,7 +263,7 @@ static DecodeStatus DecodeLdLImmInstruction(MCInst &Inst, uint64_t Insn, static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned DstA, SrcB; LLVM_DEBUG(dbgs() << "Decoding LdRLimm\n"); DstA = decodeAField(Insn); @@ -278,7 +280,7 @@ static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn, static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { LLVM_DEBUG(dbgs() << "Decoding MOV_S h-register\n"); using Field = decltype(Insn); Field H = fieldFromInstruction(Insn, 5, 3) | @@ -304,7 +306,7 @@ static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t Insn, static DecodeStatus DecodeCCRU6Instruction(MCInst &Inst, uint64_t Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned DstB; LLVM_DEBUG(dbgs() << "Decoding CCRU6 instruction:\n"); DstB = decodeBField(Insn); @@ -318,7 +320,8 @@ static DecodeStatus DecodeCCRU6Instruction(MCInst &Inst, uint64_t Insn, } static DecodeStatus DecodeSOPwithRU6(MCInst &Inst, uint64_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const 
MCDisassembler *Decoder) { unsigned DstB = decodeBField(Insn); DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder); using Field = decltype(Insn); @@ -328,7 +331,8 @@ static DecodeStatus DecodeSOPwithRU6(MCInst &Inst, uint64_t Insn, } static DecodeStatus DecodeSOPwithRS12(MCInst &Inst, uint64_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned DstB = decodeBField(Insn); DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder); using Field = decltype(Insn); diff --git a/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/llvm/lib/Target/ARM/A15SDOptimizer.cpp index d0efecad63bc..65da95b0fc8d 100644 --- a/llvm/lib/Target/ARM/A15SDOptimizer.cpp +++ b/llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -361,9 +361,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, MI = Front.pop_back_val(); // If we have already explored this MachineInstr, ignore it. - if (Reached.find(MI) != Reached.end()) + if (!Reached.insert(MI).second) continue; - Reached.insert(MI); if (MI->isPHI()) { for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { Register Reg = MI->getOperand(I).getReg(); diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h index 979371bf7cf6..9990078cfdbb 100644 --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -57,6 +57,7 @@ Pass *createMVEGatherScatterLoweringPass(); FunctionPass *createARMSLSHardeningPass(); FunctionPass *createARMIndirectThunks(); Pass *createMVELaneInterleavingPass(); +FunctionPass *createARMFixCortexA57AES1742098Pass(); void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); @@ -77,6 +78,7 @@ void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); void initializeARMSLSHardeningPass(PassRegistry &); void initializeMVELaneInterleavingPass(PassRegistry &); +void initializeARMFixCortexA57AES1742098Pass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 27edf69b4abf..48559a89a30a 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -19,9 +19,11 @@ include "llvm/Target/Target.td" // ARM Subtarget state. // -def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", +// True if compiling for Thumb, false for ARM. +def ModeThumb : SubtargetFeature<"thumb-mode", "IsThumb", "true", "Thumb mode">; +// True if we're using software floating point features. def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", "true", "Use software floating " "point features.">; @@ -48,14 +50,18 @@ def FeatureFPRegs64 : SubtargetFeature<"fpregs64", "HasFPRegs64", "true", "Enable 64-bit FP registers", [FeatureFPRegs]>; +// True if the floating point unit supports double precision. def FeatureFP64 : SubtargetFeature<"fp64", "HasFP64", "true", "Floating point unit supports " "double precision", [FeatureFPRegs64]>; +// True if subtarget has the full 32 double precision FP registers for VFPv3. def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true", "Extend FP to 32 double registers">; +/// Versions of the VFP flags restricted to single precision, or to +/// 16 d-registers, or both. multiclass VFPver prev, list otherimplies, @@ -100,6 +106,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", [FeatureVFP3]>; +// True if subtarget supports half-precision FP conversions. 
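+// (e.g. the vcvtb/vcvtt conversions between f16 and f32).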
def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision " "floating point">; @@ -110,169 +117,211 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions", defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP", [FeatureVFP4], []>; +// True if subtarget supports half-precision FP operations. def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Enable full half-precision " "floating point", [FeatureFPARMv8_D16_SP, FeatureFPRegs16]>; +// True if subtarget supports half-precision FP fml operations. def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", "Enable full half-precision " "floating point fml instructions", [FeatureFullFP16]>; +// True if subtarget supports [su]div in Thumb mode. def FeatureHWDivThumb : SubtargetFeature<"hwdiv", - "HasHardwareDivideInThumb", "true", + "HasDivideInThumbMode", "true", "Enable divide instructions in Thumb">; +// True if subtarget supports [su]div in ARM mode. def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", - "HasHardwareDivideInARM", "true", + "HasDivideInARMMode", "true", "Enable divide instructions in ARM mode">; // Atomic Support + +// True if the subtarget supports DMB / DSB data barrier instructions. def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", "Has data barrier (dmb/dsb) instructions">; +// True if the subtarget supports CLREX instructions. def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", "Has v7 clrex instruction">; +// True if the subtarget supports DFB data barrier instruction. def FeatureDFB : SubtargetFeature<"dfb", "HasFullDataBarrier", "true", "Has full data barrier (dfb) instruction">; +// True if the subtarget supports v8 atomics (LDA/LDAEX etc) instructions. def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", "Has v8 acquire/release (lda/ldaex " " etc) instructions">; -def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", +// True if floating point compare + branch is slow. +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "IsFPBrccSlow", "true", "FP compare + branch is slow">; +// True if the processor supports the Performance Monitor Extensions. These +// include a generic cycle-counter as well as more fine-grained (often +// implementation-specific) events. def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable support for Performance " "Monitor extensions">; // TrustZone Security Extensions + +// True if processor supports TrustZone security extensions. def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", "Enable support for TrustZone " "security extensions">; +// True if processor supports ARMv8-M Security Extensions. def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", "Enable support for ARMv8-M " "Security Extensions">; +// True if processor supports SHA1 and SHA256. def FeatureSHA2 : SubtargetFeature<"sha2", "HasSHA2", "true", "Enable SHA1 and SHA256 support", [FeatureNEON]>; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES support", [FeatureNEON]>; +// True if processor supports Cryptography extensions. def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable support for " "Cryptography extensions", [FeatureNEON, FeatureSHA2, FeatureAES]>; +// True if processor supports CRC instructions. 
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable support for CRC instructions">; +// True if the ARMv8.2A dot product instructions are supported. def FeatureDotProd : SubtargetFeature<"dotprod", "HasDotProd", "true", "Enable support for dot product instructions", [FeatureNEON]>; -// Not to be confused with FeatureHasRetAddrStack (return address stack) +// True if the processor supports RAS extensions. +// Not to be confused with FeatureHasRetAddrStack (return address stack). def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", "Enable Reliability, Availability " "and Serviceability extensions">; -// Fast computation of non-negative address offsets +// Fast computation of non-negative address offsets. +// True if processor does positive address offset computation faster. def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", "Enable fast computation of " "positive address offsets">; -// Fast execution of AES crypto operations +// Fast execution of AES crypto operations. +// True if processor executes back to back AES instruction pairs faster. def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", "CPU fuses AES crypto operations">; -// Fast execution of bottom and top halves of literal generation +// Fast execution of bottom and top halves of literal generation. +// True if processor executes back to back bottom and top halves of literal generation faster. def FeatureFuseLiterals : SubtargetFeature<"fuse-literals", "HasFuseLiterals", "true", "CPU fuses literal generation operations">; -// The way of reading thread pointer -def FeatureReadTp : SubtargetFeature<"read-tp-hard", "ReadTPHard", "true", +// The way of reading thread pointer. +// True if read thread pointer from coprocessor register. +def FeatureReadTp : SubtargetFeature<"read-tp-hard", "IsReadTPHard", "true", "Reading thread pointer from register">; // Cyclone can zero VFP registers in 0 cycles. +// True if the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are +// particularly effective at zeroing a VFP register. def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; -// Whether it is profitable to unpredicate certain instructions during if-conversion +// Whether it is profitable to unpredicate certain instructions during if-conversion. +// True if if conversion may decide to leave some instructions unpredicated. def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", "IsProfitableToUnpredicate", "true", "Is profitable to unpredicate">; // Some targets (e.g. Swift) have microcoded VGETLNi32. +// True if VMOV will be favored over VGETLNi32. def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", "HasSlowVGETLNi32", "true", "Has slow VGETLNi32 - prefer VMOV">; // Some targets (e.g. Swift) have microcoded VDUP32. +// True if VMOV will be favored over VDUP. def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", "Has slow VDUP32 - prefer VMOV">; // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON // for scalar FP, as this allows more effective execution domain optimization. +// True if VMOVSR will be favored over VMOVDRR. def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", "true", "Prefer VMOVSR">; // Swift has ISHST barriers compatible with Atomic Release semantics but weaker -// than ISH -def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", +// than ISH. 
+// True if ISHST barriers will be used for Release semantics. +def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHSTBarriers", "true", "Prefer ISHST barriers">; // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. +// True if the AGU and NEON/FPU units are multiplexed. def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", "Has muxed AGU and NEON/FPU">; // Whether VLDM/VSTM starting with odd register number need more microops -// than single VLDRS -def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", +// than single VLDRS. +// True if a VLDM/VSTM starting with an odd register number is considered to +// take more microops than single VLDRS/VSTRS. +def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "HasSlowOddRegister", "true", "VLDM/VSTM starting " "with an odd register is slow">; // Some targets have a renaming dependency when loading into D subregisters. +// True if loading into a D subregister will be penalized. def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", - "SlowLoadDSubregister", "true", + "HasSlowLoadDSubregister", "true", "Loading into D subregs is slow">; +// True if use a wider stride when allocating VFP registers. def FeatureUseWideStrideVFP : SubtargetFeature<"wide-stride-vfp", "UseWideStrideVFP", "true", "Use a wide stride when allocating VFP registers">; // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. +// True if VMOVS will never be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; // Some targets (e.g. Cortex-A15) prefer to avoid mixing operations on different // VFP register widths. +// True if splat a register between VFP and NEON instructions. def FeatureSplatVFPToNeon : SubtargetFeature<"splat-vfp-neon", - "SplatVFPToNeon", "true", + "UseSplatVFPToNeon", "true", "Splat register from VFP to NEON", [FeatureDontWidenVMOVS]>; // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. +// True if run the MLx expansion pass. def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", "Expand VFP/NEON MLA/MLS instructions">; // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. +// True if VFP/NEON VMLA/VMLS have special RAW hazards. def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", "true", "Has VMLx hazards">; // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from // VFP to NEON, as an execution domain optimization. +// True if VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON. def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", "true", "Convert VMOVSR, VMOVRS, " @@ -281,18 +330,21 @@ def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. This affects instruction selection and should // only be enabled if the handling of denormals is not important. +// Use the method useNEONForSinglePrecisionFP() to determine if NEON should actually be used. def FeatureNEONForFP : SubtargetFeature<"neonfp", - "UseNEONForSinglePrecisionFP", + "HasNEONForFP", "true", "Use NEON for single precision FP">; // On some processors, VLDn instructions that access unaligned data take one // extra cycle. Take that into account when computing operand latencies. 
-def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", +// True if VLDn instructions take an extra cycle for unaligned accesses. +def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAccessAlignment", "true", "Check for VLDn unaligned access">; // Some processors have a nonpipelined VFP coprocessor. +// True if VFP instructions are not pipelined. def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", "NonpipelinedVFP", "true", "VFP instructions are not pipelined">; @@ -300,20 +352,27 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. +// If the VFP2 / NEON instructions are available, indicates +// whether the FP VML[AS] instructions are slow (if so, don't use them). def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", "Disable VFP / NEON MAC instructions">; -// VFPv4 added VFMA instructions that can similar be fast or slow. +// VFPv4 added VFMA instructions that can similarly be fast or slow. +// If the VFP4 / NEON instructions are available, indicates +// whether the FP VFM[AS] instructions are slow (if so, don't use them). def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true", "Disable VFP / NEON FMA instructions">; // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. +/// True if NEON has special multiplier accumulator +/// forwarding to allow mul + mla being issued back to back. def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", "HasVMLxForwarding", "true", "Has multiplier accumulator forwarding">; // Disable 32-bit to 16-bit narrowing for experimentation. -def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", +// True if codegen would prefer 32-bit Thumb instructions over 16-bit ones. +def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true", "Prefer 32-bit Thumb instrs">; def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", @@ -332,17 +391,22 @@ def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFac /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is /// mapped to a separate physical register. Avoid partial CPSR update for these /// processors. +/// True if codegen would avoid using instructions +/// that partially update CPSR and add false dependency on the previous +/// CPSR setting instruction. def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; /// Disable +1 predication cost for instructions updating CPSR. /// Enabled for Cortex-A57. +/// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57. def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", "CheapPredicableCPSRDef", "true", "Disable +1 predication cost for instructions updating CPSR">; +// True if codegen should avoid using flag setting movs with shifter operand (i.e. asr, lsl, lsr). 
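The neonfp change above renames the field from UseNEONForSinglePrecisionFP to HasNEONForFP so that the raw feature bit and the policy query are distinct; per the new comment, clients are expected to call useNEONForSinglePrecisionFP() rather than read the bit directly. A sketch of the assumed shape of that helper (the exact gating predicate is an assumption, not the verbatim ARMSubtarget code):

    // Sketch: the HasNEONForFP bit only takes effect when NEON is available.
    bool ARMSubtarget::useNEONForSinglePrecisionFP() const {
      return hasNEON() && HasNEONForFP;
    }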
def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", "AvoidMOVsShifterOperand", "true", "Avoid movs instructions with " @@ -357,16 +421,20 @@ def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", // Some processors have no branch predictor, which changes the expected cost of // taking a branch which affects the choice of whether to use predicated // instructions. +// True if the subtarget has a branch predictor. Having +// a branch predictor or not changes the expected cost of taking a branch +// which affects the choice of whether to use predicated instructions. def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", "HasBranchPredictor", "false", "Has no branch predictor">; /// DSP extension. +/// True if the subtarget supports the DSP (saturating arith and such) instructions. def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Supports DSP instructions in " "ARM and/or Thumb2">; -// Multiprocessing extension. +// True if the subtarget supports Multiprocessing extension (ARMv7 only). def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", "Supports Multiprocessing extension">; @@ -378,31 +446,42 @@ def FeatureVirtualization : SubtargetFeature<"virtualization", // Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. // See ARMInstrInfo.td for details. +// True if NaCl TRAP instruction is generated instead of the regular TRAP. def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +// True if the subtarget disallows unaligned memory +// accesses for some types. For details, see +// ARMTargetLowering::allowsMisalignedMemoryAccesses(). def FeatureStrictAlign : SubtargetFeature<"strict-align", "StrictAlign", "true", "Disallow all unaligned memory " "access">; +// Generate calls via indirect call instructions. def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; +// Generate code that does not contain data access to code sections. def FeatureExecuteOnly : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", "Enable the generation of " "execute only code.">; +// True if R9 is not available as a general purpose register. def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", "Reserve R9, making it unavailable" " as GPR">; +// True if MOVT / MOVW pairs are not used for materialization of +// 32-bit imms (including global addresses). def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", "Don't use movt/movw pairs for " "32-bit imms">; +/// Implicitly convert an instruction to a different one if its immediates +/// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1. def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false", @@ -415,28 +494,39 @@ def FeatureNoNegativeImmediates def FeatureUseMISched: SubtargetFeature<"use-misched", "UseMISched", "true", "Use the MachineScheduler">; +// Use the MachinePipeliner for instruction scheduling for the subtarget. +def FeatureUseMIPipeliner: SubtargetFeature<"use-mipipeliner", "UseMIPipeliner", "true", + "Use the MachinePipeliner">; + +// False if scheduling should happen again after register allocation. def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", "DisablePostRAScheduler", "true", "Don't schedule again after register allocation">; // Armv8.5-A extensions +// Has speculation barrier. 
 def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
   "Enable v8.5a Speculation Barrier" >;

 // Armv8.6-A extensions
+
+// True if subtarget supports BFloat16 floating point operations.
 def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true",
   "Enable support for BFloat16 instructions", [FeatureNEON]>;

+// True if subtarget supports 8-bit integer matrix multiply.
 def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", "true",
   "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>;

 // Armv8.1-M extensions

+// True if the processor supports the Low Overhead Branch extension.
 def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true",
                                   "Enable Low Overhead Branch "
                                   "extensions">;

+// Mitigate against the CVE-2021-35465 security vulnerability.
 def FeatureFixCMSE_CVE_2021_35465 : SubtargetFeature<"fix-cmse-cve-2021-35465",
     "FixCMSE_CVE_2021_35465", "true",
     "Mitigate against the cve-2021-35465 "
@@ -446,11 +536,26 @@ def FeaturePACBTI : SubtargetFeature<"pacbti", "HasPACBTI", "true",
     "Enable Pointer Authentication and Branch "
     "Target Identification">;

+/// Don't place a BTI instruction after return-twice constructs (setjmp).
 def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice",
                                                  "NoBTIAtReturnTwice", "true",
                                                  "Don't place a BTI instruction "
                                                  "after a return-twice">;

+def FeatureFixCortexA57AES1742098 : SubtargetFeature<"fix-cortex-a57-aes-1742098",
+  "FixCortexA57AES1742098", "true",
+  "Work around Cortex-A57 Erratum 1742098 / Cortex-A72 Erratum 1655431 (AES)">;
+
+def FeatureAAPCSFrameChain : SubtargetFeature<"aapcs-frame-chain",
+                                              "CreateAAPCSFrameChain", "true",
+                                              "Create an AAPCS compliant frame chain">;
+
+def FeatureAAPCSFrameChainLeaf : SubtargetFeature<"aapcs-frame-chain-leaf",
+                                                  "CreateAAPCSFrameChainLeaf", "true",
+                                                  "Create an AAPCS compliant frame chain "
+                                                  "for leaf functions",
+                                                  [FeatureAAPCSFrameChain]>;
+
 //===----------------------------------------------------------------------===//
 // ARM architecture class
 //
@@ -467,16 +572,18 @@ def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
 def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
                                      "Is microcontroller profile ('M' series)">;

-
+// True if Thumb2 instructions are supported.
 def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
                                      "Enable Thumb2 instructions">;

+// True if subtarget does not support ARM mode execution.
 def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
                                     "Does not support ARM mode execution">;

 //===----------------------------------------------------------------------===//
 // ARM ISAs.
 //

+// Specify whether the target supports specific ARM ISA variants.
 def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
                                  "Support ARM v4T instructions">;

@@ -599,13 +706,16 @@ foreach i = {0-7} in
 // Control codegen mitigation against Straight Line Speculation vulnerability.
 //===----------------------------------------------------------------------===//

+/// Harden against Straight Line Speculation for Returns and Indirect Branches.
 def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr",
   "HardenSlsRetBr", "true",
   "Harden against straight line speculation across RETurn and BranchRegister "
   "instructions">;

+/// Harden against Straight Line Speculation for indirect calls.
 def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
   "HardenSlsBlr", "true",
   "Harden against straight line speculation across indirect calls">;

+/// Generate thunk code for SLS mitigation in the normal text section.
def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat", "HardenSlsNoComdat", "true", "Generate thunk code for SLS mitigation in the normal text section">; @@ -1303,6 +1413,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, ProcM7, FeatureFPARMv8_D16, + FeatureUseMIPipeliner, FeatureUseMISched]>; def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, @@ -1370,13 +1481,15 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, FeatureCRC, FeatureFPAO, FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; + FeatureCheapPredicableCPSR, + FeatureFixCortexA57AES1742098]>; def : ProcessorModel<"cortex-a72", CortexA57Model, [ARMv8a, ProcA72, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeatureFixCortexA57AES1742098]>; def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureHWDivThumb, diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index fa09b2567aa9..4aa28bc5d28d 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -161,10 +161,10 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; - OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass(Scl); - OutStreamer->EmitCOFFSymbolType(Type); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(CurrentFnSym); + OutStreamer->emitCOFFSymbolStorageClass(Scl); + OutStreamer->emitCOFFSymbolType(Type); + OutStreamer->endCOFFSymbolDef(); } // Emit the rest of the function body. @@ -535,27 +535,27 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. - OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + OutStreamer->switchSection(TLOFMacho.getNonLazySymbolPointerSection()); emitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } Stubs = MMIMacho.GetThreadLocalGVStubList(); if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. - OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); + OutStreamer->switchSection(TLOFMacho.getThreadLocalPointerSection()); emitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } // Funny Darwin hack: This flag tells the linker that no global symbols @@ -740,55 +740,53 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format, ARMBuildAttrs::FP16FormatIEEE); - if (MMI) { - if (const Module *SourceModule = MMI->getModule()) { - // ABI_PCS_wchar_t to indicate wchar_t width - // FIXME: There is no way to emit value 0 (wchar_t prohibited). - if (auto WCharWidthValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("wchar_size"))) { - int WCharWidth = WCharWidthValue->getZExtValue(); - assert((WCharWidth == 2 || WCharWidth == 4) && - "wchar_t width must be 2 or 4 bytes"); - ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); - } + if (const Module *SourceModule = MMI->getModule()) { + // ABI_PCS_wchar_t to indicate wchar_t width + // FIXME: There is no way to emit value 0 (wchar_t prohibited). 
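These module-flag reads are the backend half of a frontend/backend contract: emitAttributes() only translates flags that the producer of the module has set. For reference, a sketch of producing the flags this code consumes (the Error merge behaviour is chosen for illustration, not necessarily what clang uses):

    // Sketch: setting the module flags read by ARMAsmPrinter::emitAttributes().
    void tagARMABIInfo(llvm::Module &M) {
      M.addModuleFlag(llvm::Module::Error, "wchar_size", 4);    // ABI_PCS_wchar_t
      M.addModuleFlag(llvm::Module::Error, "min_enum_size", 4); // ABI_enum_size
    }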
+ if (auto WCharWidthValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("wchar_size"))) { + int WCharWidth = WCharWidthValue->getZExtValue(); + assert((WCharWidth == 2 || WCharWidth == 4) && + "wchar_t width must be 2 or 4 bytes"); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); + } - // ABI_enum_size to indicate enum width - // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 - // (all enums contain a value needing 32 bits to encode). - if (auto EnumWidthValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("min_enum_size"))) { - int EnumWidth = EnumWidthValue->getZExtValue(); - assert((EnumWidth == 1 || EnumWidth == 4) && - "Minimum enum width must be 1 or 4 bytes"); - int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; - ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); - } + // ABI_enum_size to indicate enum width + // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 + // (all enums contain a value needing 32 bits to encode). + if (auto EnumWidthValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("min_enum_size"))) { + int EnumWidth = EnumWidthValue->getZExtValue(); + assert((EnumWidth == 1 || EnumWidth == 4) && + "Minimum enum width must be 1 or 4 bytes"); + int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; + ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); + } - auto *PACValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("sign-return-address")); - if (PACValue && PACValue->getZExtValue() == 1) { - // If "+pacbti" is used as an architecture extension, - // Tag_PAC_extension is emitted in - // ARMTargetStreamer::emitTargetAttributes(). - if (!STI.hasPACBTI()) { - ATS.emitAttribute(ARMBuildAttrs::PAC_extension, - ARMBuildAttrs::AllowPACInNOPSpace); - } - ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + auto *PACValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("sign-return-address")); + if (PACValue && PACValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_PAC_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::PAC_extension, + ARMBuildAttrs::AllowPACInNOPSpace); } + ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + } - auto *BTIValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("branch-target-enforcement")); - if (BTIValue && BTIValue->getZExtValue() == 1) { - // If "+pacbti" is used as an architecture extension, - // Tag_BTI_extension is emitted in - // ARMTargetStreamer::emitTargetAttributes(). - if (!STI.hasPACBTI()) { - ATS.emitAttribute(ARMBuildAttrs::BTI_extension, - ARMBuildAttrs::AllowBTIInNOPSpace); - } - ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); + auto *BTIValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("branch-target-enforcement")); + if (BTIValue && BTIValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_BTI_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). 
+ if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::BTI_extension, + ARMBuildAttrs::AllowBTIInNOPSpace); } + ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); } } @@ -2276,6 +2274,47 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInstSB); return; } + + case ARM::SEH_StackAlloc: + ATS.emitARMWinCFIAllocStack(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); + return; + + case ARM::SEH_SaveRegs: + case ARM::SEH_SaveRegs_Ret: + ATS.emitARMWinCFISaveRegMask(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); + return; + + case ARM::SEH_SaveSP: + ATS.emitARMWinCFISaveSP(MI->getOperand(0).getImm()); + return; + + case ARM::SEH_SaveFRegs: + ATS.emitARMWinCFISaveFRegs(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); + return; + + case ARM::SEH_SaveLR: + ATS.emitARMWinCFISaveLR(MI->getOperand(0).getImm()); + return; + + case ARM::SEH_Nop: + case ARM::SEH_Nop_Ret: + ATS.emitARMWinCFINop(MI->getOperand(0).getImm()); + return; + + case ARM::SEH_PrologEnd: + ATS.emitARMWinCFIPrologEnd(/*Fragment=*/false); + return; + + case ARM::SEH_EpilogStart: + ATS.emitARMWinCFIEpilogStart(ARMCC::AL); + return; + + case ARM::SEH_EpilogEnd: + ATS.emitARMWinCFIEpilogEnd(); + return; } MCInst TmpInst; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 5b0bae4d9274..80ba7b5f0d2e 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -343,6 +343,13 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, } // Branch analysis. +// Cond vector output format: +// 0 elements indicates an unconditional branch +// 2 elements indicates a conditional branch; the elements are +// the condition to check and the CPSR. +// 3 elements indicates a hardware loop end; the elements +// are the opcode, the operand value to test, and a dummy +// operand used to pad out to 3 operands. bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, @@ -394,6 +401,17 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } else if (I->isReturn()) { // Returns can't be analyzed, but we should run cleanup. CantAnalyze = true; + } else if (I->getOpcode() == ARM::t2LoopEnd && + MBB.getParent() + ->getSubtarget() + .enableMachinePipeliner()) { + if (!Cond.empty()) + return true; + FBB = TBB; + TBB = I->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(I->getOpcode())); + Cond.push_back(I->getOperand(0)); + Cond.push_back(MachineOperand::CreateImm(0)); } else { // We encountered other unrecognized terminator. Bail out immediately. return true; @@ -457,7 +475,7 @@ unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB, return 0; if (!isUncondBranchOpcode(I->getOpcode()) && - !isCondBranchOpcode(I->getOpcode())) + !isCondBranchOpcode(I->getOpcode()) && I->getOpcode() != ARM::t2LoopEnd) return 0; // Remove the branch. @@ -467,7 +485,7 @@ unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB, if (I == MBB.begin()) return 1; --I; - if (!isCondBranchOpcode(I->getOpcode())) + if (!isCondBranchOpcode(I->getOpcode()) && I->getOpcode() != ARM::t2LoopEnd) return 1; // Remove the branch. @@ -491,8 +509,8 @@ unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB, // Shouldn't be a fall through. 
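The Cond layout documented earlier in this ARMBaseInstrInfo hunk (empty vector = unconditional; two operands = ARMCC condition immediate plus CPSR; three operands = hardware loop end) is the contract that insertBranch and reverseBranchCondition below have to decode again. A sketch of a consumer, with illustrative names:

    // Sketch: interpreting Cond as produced by the updated analyzeBranch.
    static void describeCond(ArrayRef<MachineOperand> Cond) {
      if (Cond.empty()) {
        // Unconditional branch.
      } else if (Cond.size() == 2) {
        // Cond[0]: ARMCC::CondCodes immediate, Cond[1]: CPSR use.
      } else { // Cond.size() == 3
        // Cond[0]: opcode immediate (t2LoopEnd), Cond[1]: counter operand,
        // Cond[2]: dummy immediate padding the vector to three operands.
      }
    }

Note that reverseBranchCondition returns true (meaning "cannot reverse") for the three-operand form, since a hardware loop-end test has no inverted opcode.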
assert(TBB && "insertBranch must not be told to insert a fallthrough"); - assert((Cond.size() == 2 || Cond.size() == 0) && - "ARM branch conditions have two components!"); + assert((Cond.size() == 2 || Cond.size() == 0 || Cond.size() == 3) && + "ARM branch conditions have two or three components!"); // For conditional branches, we use addOperand to preserve CPSR flags. @@ -502,19 +520,24 @@ unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).add(predOps(ARMCC::AL)); else BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); - } else + } else if (Cond.size() == 2) { BuildMI(&MBB, DL, get(BccOpc)) .addMBB(TBB) .addImm(Cond[0].getImm()) .add(Cond[1]); + } else + BuildMI(&MBB, DL, get(Cond[0].getImm())).add(Cond[1]).addMBB(TBB); return 1; } // Two-way conditional branch. - BuildMI(&MBB, DL, get(BccOpc)) - .addMBB(TBB) - .addImm(Cond[0].getImm()) - .add(Cond[1]); + if (Cond.size() == 2) + BuildMI(&MBB, DL, get(BccOpc)) + .addMBB(TBB) + .addImm(Cond[0].getImm()) + .add(Cond[1]); + else if (Cond.size() == 3) + BuildMI(&MBB, DL, get(Cond[0].getImm())).add(Cond[1]).addMBB(TBB); if (isThumb) BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).add(predOps(ARMCC::AL)); else @@ -524,9 +547,12 @@ unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB, bool ARMBaseInstrInfo:: reverseBranchCondition(SmallVectorImpl &Cond) const { - ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); - Cond[0].setImm(ARMCC::getOppositeCondition(CC)); - return false; + if (Cond.size() == 2) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); + Cond[0].setImm(ARMCC::getOppositeCondition(CC)); + return false; + } + return true; } bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const { @@ -556,7 +582,7 @@ std::string ARMBaseInstrInfo::createMIROperandComment( return GenericComment; // If not, check if we have an immediate operand. - if (Op.getType() != MachineOperand::MO_Immediate) + if (!Op.isImm()) return std::string(); // And print its corresponding condition code if the immediate is a @@ -1703,7 +1729,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // or some other super-register. int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD); if (ImpDefIdx != -1) - MI.RemoveOperand(ImpDefIdx); + MI.removeOperand(ImpDefIdx); // Change the opcode and operands. MI.setDesc(get(ARM::VMOVD)); @@ -2045,6 +2071,9 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) return true; + if (isSEHInstruction(MI)) + return true; + // Treat the start of the IT block as a scheduling boundary, but schedule // t2IT along with all instructions following it. // FIXME: This is a big hammer. But the alternative is to add all potential @@ -2598,7 +2627,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // ahead: strip all existing registers off and add them back again // in the right order. for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) - MI->RemoveOperand(i); + MI->removeOperand(i); // Add the complete list back in. MachineInstrBuilder MIB(MF, &*MI); @@ -2626,7 +2655,7 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Turn it into a move. 
MI.setDesc(TII.get(ARM::MOVr)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - MI.RemoveOperand(FrameRegIdx+1); + MI.removeOperand(FrameRegIdx+1); Offset = 0; return true; } else if (Offset < 0) { @@ -5103,7 +5132,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, SrcReg = MI.getOperand(1).getReg(); for (unsigned i = MI.getDesc().getNumOperands(); i; --i) - MI.RemoveOperand(i - 1); + MI.removeOperand(i - 1); // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits) MI.setDesc(get(ARM::VORRd)); @@ -5122,7 +5151,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, SrcReg = MI.getOperand(1).getReg(); for (unsigned i = MI.getDesc().getNumOperands(); i; --i) - MI.RemoveOperand(i - 1); + MI.removeOperand(i - 1); DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane); @@ -5155,7 +5184,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, break; for (unsigned i = MI.getDesc().getNumOperands(); i; --i) - MI.RemoveOperand(i - 1); + MI.removeOperand(i - 1); // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps) // Again DDst may be undefined at the beginning of this instruction. @@ -5190,7 +5219,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, break; for (unsigned i = MI.getDesc().getNumOperands(); i; --i) - MI.RemoveOperand(i - 1); + MI.removeOperand(i - 1); if (DSrc == DDst) { // Destination can be: @@ -5766,26 +5795,25 @@ struct OutlinerCosts { SaveRestoreLROnStack(target.isThumb() ? 8 : 8) {} }; -unsigned -ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { - assert(C.LRUWasSet && "LRU wasn't set?"); +Register +ARMBaseInstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { MachineFunction *MF = C.getMF(); - const ARMBaseRegisterInfo *ARI = static_cast( - MF->getSubtarget().getRegisterInfo()); + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + const ARMBaseRegisterInfo *ARI = + static_cast(&TRI); BitVector regsReserved = ARI->getReservedRegs(*MF); // Check if there is an available register across the sequence that we can // use. - for (unsigned Reg : ARM::rGPRRegClass) { + for (Register Reg : ARM::rGPRRegClass) { if (!(Reg < regsReserved.size() && regsReserved.test(Reg)) && Reg != ARM::LR && // LR is not reserved, but don't use it. Reg != ARM::R12 && // R12 is not guaranteed to be preserved. - C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && + C.isAvailableInsideSeq(Reg, TRI)) return Reg; } - - // No suitable register. Return 0. - return 0u; + return Register(); } // Compute liveness of LR at the point after the interval [I, E), which @@ -5833,9 +5861,8 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( // Compute liveness information for each candidate, and set FlagsSetInAll. const TargetRegisterInfo &TRI = getRegisterInfo(); - std::for_each( - RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), - [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; }); + for (outliner::Candidate &C : RepeatedSequenceLocs) + FlagsSetInAll &= C.Flags; // According to the ARM Procedure Call Standard, the following are // undefined on entry/exit from a function call: @@ -5854,9 +5881,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( // to compute liveness here. 
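The outliner hunks here replace direct inspection of C.LRU and C.UsedInSequence with the Candidate liveness queries, so callers no longer need C.initLRU(). A sketch of the resulting pattern for picking a scratch register (the pool and function name are illustrative):

    // Sketch: a register is usable only if it is free across and beyond the
    // outlined sequence, and unused inside the sequence itself.
    static Register pickScratch(outliner::Candidate &C,
                                const TargetRegisterInfo &TRI,
                                ArrayRef<Register> Pool) {
      for (Register R : Pool)
        if (C.isAvailableAcrossAndOutOfSeq(R, TRI) &&
            C.isAvailableInsideSeq(R, TRI))
          return R;
      return Register(); // invalid register: no scratch available
    }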
if (C.Flags & UnsafeRegsDead) return false; - C.initLRU(TRI); - LiveRegUnits LRU = C.LRU; - return (!LRU.available(ARM::R12) || !LRU.available(ARM::CPSR)); + return C.isAnyUnavailableAcrossOrOutOfSeq({ARM::R12, ARM::CPSR}, TRI); }; // Are there any candidates where those registers are live? @@ -5969,7 +5994,6 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( std::vector CandidatesWithoutStackFixups; for (outliner::Candidate &C : RepeatedSequenceLocs) { - C.initLRU(TRI); // LR liveness is overestimated in return blocks, unless they end with a // tail call. const auto Last = C.getMBB()->rbegin(); @@ -5977,7 +6001,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( C.getMBB()->isReturnBlock() && !Last->isCall() ? isLRAvailable(TRI, Last, (MachineBasicBlock::reverse_iterator)C.front()) - : C.LRU.available(ARM::LR); + : C.isAvailableAcrossAndOutOfSeq(ARM::LR, TRI); if (LRIsAvailable) { FrameID = MachineOutlinerNoLRSave; NumBytesNoStackCalls += Costs.CallNoLRSave; @@ -5996,7 +6020,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( // Is SP used in the sequence at all? If not, we don't have to modify // the stack, so we are guaranteed to get the same frame. - else if (C.UsedInSequence.available(ARM::SP)) { + else if (C.isAvailableInsideSeq(ARM::SP, TRI)) { NumBytesNoStackCalls += Costs.CallDefault; C.setCallInfo(MachineOutlinerDefault, Costs.CallDefault); CandidatesWithoutStackFixups.push_back(C); @@ -6189,8 +6213,8 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, LiveRegUnits LRU(getRegisterInfo()); - std::for_each(MBB.rbegin(), MBB.rend(), - [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); + for (MachineInstr &MI : llvm::reverse(MBB)) + LRU.accumulate(MI); // Check if each of the unsafe registers are available... bool R12AvailableInBlock = LRU.available(ARM::R12); @@ -6635,7 +6659,7 @@ void ARMBaseInstrInfo::buildOutlinedFrame( MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, - MachineFunction &MF, const outliner::Candidate &C) const { + MachineFunction &MF, outliner::Candidate &C) const { MachineInstrBuilder MIB; MachineBasicBlock::iterator CallPt; unsigned Opc; @@ -6726,3 +6750,122 @@ unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) { : ARM::BLX_pred; } +namespace { +class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *EndLoop, *LoopCount; + MachineFunction *MF; + const TargetInstrInfo *TII; + + // Meanings of the various stuff with loop types: + // t2Bcc: + // EndLoop = branch at end of original BB that will become a kernel + // LoopCount = CC setter live into branch + // t2LoopEnd: + // EndLoop = branch at end of original BB + // LoopCount = t2LoopDec +public: + ARMPipelinerLoopInfo(MachineInstr *EndLoop, MachineInstr *LoopCount) + : EndLoop(EndLoop), LoopCount(LoopCount), + MF(EndLoop->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()) {} + + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Only ignore the terminator. 
+ return MI == EndLoop || MI == LoopCount; + } + + Optional createTripCountGreaterCondition( + int TC, MachineBasicBlock &MBB, + SmallVectorImpl &Cond) override { + + if (isCondBranchOpcode(EndLoop->getOpcode())) { + Cond.push_back(EndLoop->getOperand(1)); + Cond.push_back(EndLoop->getOperand(2)); + if (EndLoop->getOperand(0).getMBB() == EndLoop->getParent()) { + TII->reverseBranchCondition(Cond); + } + return {}; + } else if (EndLoop->getOpcode() == ARM::t2LoopEnd) { + // General case just lets the unrolled t2LoopDec do the subtraction and + // therefore just needs to check if zero has been reached. + MachineInstr *LoopDec = nullptr; + for (auto &I : MBB.instrs()) + if (I.getOpcode() == ARM::t2LoopDec) + LoopDec = &I; + assert(LoopDec && "Unable to find copied LoopDec"); + // Check if we're done with the loop. + BuildMI(&MBB, LoopDec->getDebugLoc(), TII->get(ARM::t2CMPri)) + .addReg(LoopDec->getOperand(0).getReg()) + .addImm(0) + .addImm(ARMCC::AL) + .addReg(ARM::NoRegister); + Cond.push_back(MachineOperand::CreateImm(ARMCC::EQ)); + Cond.push_back(MachineOperand::CreateReg(ARM::CPSR, false)); + return {}; + } else + llvm_unreachable("Unknown EndLoop"); + } + + void setPreheader(MachineBasicBlock *NewPreheader) override {} + + void adjustTripCount(int TripCountAdjust) override {} + + void disposed() override {} +}; +} // namespace + +std::unique_ptr +ARMBaseInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + MachineBasicBlock::iterator I = LoopBB->getFirstTerminator(); + MachineBasicBlock *Preheader = *LoopBB->pred_begin(); + if (Preheader == LoopBB) + Preheader = *std::next(LoopBB->pred_begin()); + + if (I != LoopBB->end() && I->getOpcode() == ARM::t2Bcc) { + // If the branch is a Bcc, then the CPSR should be set somewhere within the + // block. We need to determine the reaching definition of CPSR so that + // it can be marked as non-pipelineable, allowing the pipeliner to force + // it into stage 0 or give up if it cannot or will not do so. 
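createTripCountGreaterCondition above follows the PipelinerLoopInfo convention: returning an empty Optional<bool> means the comparison is not statically known, and the caller must branch on the operands left in Cond. A sketch of the expected call site (names such as PLI, TC and TargetBB are illustrative, not the verbatim MachinePipeliner code):

    // Sketch: consuming the hook's result.
    SmallVector<MachineOperand, 4> Cond;
    if (Optional<bool> Known =
            PLI->createTripCountGreaterCondition(TC, MBB, Cond)) {
      // Statically decided: *Known says whether another iteration always runs.
    } else {
      // Dynamic: branch on the condition the hook materialised into Cond.
      TII->insertBranch(MBB, &TargetBB, nullptr, Cond, DebugLoc());
    }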
+    MachineInstr *CCSetter = nullptr;
+    for (auto &L : LoopBB->instrs()) {
+      if (L.isCall())
+        return nullptr;
+      if (isCPSRDefined(L))
+        CCSetter = &L;
+    }
+    if (CCSetter)
+      return std::make_unique<ARMPipelinerLoopInfo>(&*I, CCSetter);
+    else
+      return nullptr; // Unable to find the CC setter, so unable to guarantee
+                      // that pipelining will work
+  }
+
+  // Recognize:
+  //   preheader:
+  //     %1 = t2DoLoopStart %0
+  //   loop:
+  //     %2 = phi %1, <not loop>, %..., %loop
+  //     %3 = t2LoopDec %2, <imm>
+  //     t2LoopEnd %3, %loop
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2LoopEnd) {
+    for (auto &L : LoopBB->instrs())
+      if (L.isCall())
+        return nullptr;
+      else if (isVCTP(&L))
+        return nullptr;
+    Register LoopDecResult = I->getOperand(0).getReg();
+    MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+    MachineInstr *LoopDec = MRI.getUniqueVRegDef(LoopDecResult);
+    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec)
+      return nullptr;
+    MachineInstr *LoopStart = nullptr;
+    for (auto &J : Preheader->instrs())
+      if (J.getOpcode() == ARM::t2DoLoopStart)
+        LoopStart = &J;
+    if (!LoopStart)
+      return nullptr;
+    return std::make_unique<ARMPipelinerLoopInfo>(&*I, LoopDec);
+  }
+  return nullptr;
+}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index defce07dd862..3b8f3403e3c3 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -360,7 +360,7 @@ public:
   MachineBasicBlock::iterator
   insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
-                     const outliner::Candidate &C) const override;
+                     outliner::Candidate &C) const override;

   /// Enable outlining by default at -Oz.
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
@@ -372,10 +372,15 @@ public:
            MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }

+  /// Analyze loop L, which must be a single-basic-block loop, and if the
+  /// conditions can be understood well enough, produce a PipelinerLoopInfo
+  /// object.
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
 private:
   /// Returns an unused general-purpose register which can be used for
   /// constructing an outlined call if one exists. Returns 0 otherwise.
-  unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+  Register findRegisterToSaveLRTo(outliner::Candidate &C) const;

   /// Adds an instruction which saves the link register on top of the stack into
   /// the MachineBasicBlock \p MBB at position \p It. If \p Auth is true,
@@ -752,6 +757,26 @@ static inline bool isValidCoprocessorNumber(unsigned Num,
   return true;
 }

+static inline bool isSEHInstruction(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case ARM::SEH_StackAlloc:
+  case ARM::SEH_SaveRegs:
+  case ARM::SEH_SaveRegs_Ret:
+  case ARM::SEH_SaveSP:
+  case ARM::SEH_SaveFRegs:
+  case ARM::SEH_SaveLR:
+  case ARM::SEH_Nop:
+  case ARM::SEH_Nop_Ret:
+  case ARM::SEH_PrologEnd:
+  case ARM::SEH_EpilogStart:
+  case ARM::SEH_EpilogEnd:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// getInstrPredicate - If instruction is predicated, returns its predicate
 /// condition, otherwise returns AL. It also returns the condition code
 /// register by reference.
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index c543d02ff75a..1d0e743b94db 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -63,28 +63,26 @@ const MCPhysReg* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const ARMSubtarget &STI = MF->getSubtarget(); bool UseSplitPush = STI.splitFramePushPop(*MF); - const MCPhysReg *RegList = - STI.isTargetDarwin() - ? CSR_iOS_SaveList - : (UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList); - const Function &F = MF->getFunction(); + if (F.getCallingConv() == CallingConv::GHC) { // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around return CSR_NoRegs_SaveList; + } else if (STI.splitFramePointerPush(*MF)) { + return CSR_Win_SplitFP_SaveList; } else if (F.getCallingConv() == CallingConv::CFGuard_Check) { return CSR_Win_AAPCS_CFGuard_Check_SaveList; } else if (F.getCallingConv() == CallingConv::SwiftTail) { return STI.isTargetDarwin() ? CSR_iOS_SwiftTail_SaveList - : (UseSplitPush ? CSR_AAPCS_SplitPush_SwiftTail_SaveList + : (UseSplitPush ? CSR_ATPCS_SplitPush_SwiftTail_SaveList : CSR_AAPCS_SwiftTail_SaveList); } else if (F.hasFnAttribute("interrupt")) { if (STI.isMClass()) { // M-class CPUs have hardware which saves the registers needed to allow a // function conforming to the AAPCS to function as a handler. - return UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList; + return UseSplitPush ? CSR_ATPCS_SplitPush_SaveList : CSR_AAPCS_SaveList; } else if (F.getFnAttribute("interrupt").getValueAsString() == "FIQ") { // Fast interrupt mode gives the handler a private copy of R8-R14, so less // need to be saved to restore user-mode state. @@ -101,7 +99,7 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (STI.isTargetDarwin()) return CSR_iOS_SwiftError_SaveList; - return UseSplitPush ? CSR_AAPCS_SplitPush_SwiftError_SaveList : + return UseSplitPush ? CSR_ATPCS_SplitPush_SwiftError_SaveList : CSR_AAPCS_SwiftError_SaveList; } @@ -109,7 +107,15 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return MF->getInfo()->isSplitCSR() ? CSR_iOS_CXX_TLS_PE_SaveList : CSR_iOS_CXX_TLS_SaveList; - return RegList; + + if (STI.isTargetDarwin()) + return CSR_iOS_SaveList; + + if (UseSplitPush) + return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_SaveList + : CSR_ATPCS_SplitPush_SaveList; + + return CSR_AAPCS_SaveList; } const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( @@ -238,7 +244,7 @@ bool ARMBaseRegisterInfo::isInlineAsmReadOnlyReg(const MachineFunction &MF, BitVector Reserved(getNumRegs()); markSuperRegs(Reserved, ARM::PC); - if (TFI->hasFP(MF)) + if (TFI->isFPReserved(MF)) markSuperRegs(Reserved, STI.getFramePointerReg()); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 57d7842c63ca..73ed300ccff4 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -43,7 +43,7 @@ namespace ARMRI { /// isARMArea1Register - Returns true if the register is a low register (r0-r7) /// or a stack/pc register that we should push/pop. 
-static inline bool isARMArea1Register(unsigned Reg, bool isIOS) { +static inline bool isARMArea1Register(unsigned Reg, bool SplitFramePushPop) { using namespace ARM; switch (Reg) { @@ -53,25 +53,52 @@ static inline bool isARMArea1Register(unsigned Reg, bool isIOS) { return true; case R8: case R9: case R10: case R11: case R12: // For iOS we want r7 and lr to be next to each other. - return !isIOS; + return !SplitFramePushPop; default: return false; } } -static inline bool isARMArea2Register(unsigned Reg, bool isIOS) { +static inline bool isARMArea2Register(unsigned Reg, bool SplitFramePushPop) { using namespace ARM; switch (Reg) { case R8: case R9: case R10: case R11: case R12: // iOS has this second area. - return isIOS; + return SplitFramePushPop; default: return false; } } -static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { +static inline bool isSplitFPArea1Register(unsigned Reg, + bool SplitFramePushPop) { + using namespace ARM; + + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + case R8: case R9: case R10: case R12: + case SP: case PC: + return true; + default: + return false; + } +} + +static inline bool isSplitFPArea2Register(unsigned Reg, + bool SplitFramePushPop) { + using namespace ARM; + + switch (Reg) { + case R11: case LR: + return true; + default: + return false; + } +} + +static inline bool isARMArea3Register(unsigned Reg, bool SplitFramePushPop) { using namespace ARM; switch (Reg) { @@ -214,6 +241,8 @@ public: unsigned DefSubReg, const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const override; + + int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } }; } // end namespace llvm diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp index ddbd6702e528..b2d291bbe7ff 100644 --- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp @@ -16,6 +16,7 @@ #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" #include "MVETailPredUtils.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -212,7 +213,7 @@ bool ARMBlockPlacement::processPostOrderLoops(MachineLoop *ML) { bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - const ARMSubtarget &ST = static_cast(MF.getSubtarget()); + const ARMSubtarget &ST = MF.getSubtarget(); if (!ST.hasLOB()) return false; LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n"); diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td index a6dbe563a4ab..d14424c2deca 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -284,19 +284,32 @@ def CSR_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS, R10)>; // The order of callee-saved registers needs to match the order we actually push // them in FrameLowering, because this order is what's used by // PrologEpilogInserter to allocate frame index slots. So when R7 is the frame -// pointer, we use this AAPCS alternative. -def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4, +// pointer, we use this ATPCS alternative. 
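Before the renamed CSR_ATPCS_SplitPush list that this comment introduces, it is worth spelling out how the area predicates above are meant to be consumed: frame lowering classifies each callee-saved register into a push "area", one push instruction per area, and the CSR list order must match the resulting push order so that PrologEpilogInserter allocates matching frame index slots. A sketch under those assumptions (local names illustrative):

    // Sketch: classify CSRs into push areas; list order == push order.
    static void classifyCSRs(ArrayRef<CalleeSavedInfo> CSI, bool SplitPush,
                             SmallVectorImpl<unsigned> &Area1,
                             SmallVectorImpl<unsigned> &Area2) {
      for (const CalleeSavedInfo &Info : CSI) {
        unsigned Reg = Info.getReg();
        if (isARMArea1Register(Reg, SplitPush))
          Area1.push_back(Reg); // low registers and lr: first push
        else if (isARMArea2Register(Reg, SplitPush))
          Area2.push_back(Reg); // r8-r12 when pushes are split
      }
    }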
+def CSR_ATPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4, R11, R10, R9, R8, (sequence "D%u", 15, 8))>; +def CSR_Win_SplitFP : CalleeSavedRegs<(add R10, R9, R8, R7, R6, R5, R4, + (sequence "D%u", 15, 8), + LR, R11)>; + // R8 is used to pass swifterror, remove it from CSR. -def CSR_AAPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush, +def CSR_ATPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, R8)>; // R10 is used to pass swifterror, remove it from CSR. -def CSR_AAPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush, +def CSR_ATPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, R10)>; +// When enforcing an AAPCS compliant frame chain, R11 is used as the frame +// pointer even for Thumb targets, where split pushes are necessary. +// This AAPCS alternative makes sure the frame index slots match the push +// order in that case. +def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R11, + R7, R6, R5, R4, + R10, R9, R8, + (sequence "D%u", 15, 8))>; + // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' // and the pointer return value are both passed in R0 in these cases, this can // be partially modelled by treating R0 as a callee-saved register diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index a2a4f1f3bdfd..d77c3afd05e5 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -396,7 +396,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { << MCP->getConstants().size() << " CP entries, aligned to " << MCP->getConstantPoolAlign().value() << " bytes *****\n"); - STI = &static_cast(MF->getSubtarget()); + STI = &MF->getSubtarget(); TII = STI->getInstrInfo(); isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 2f083561bbd4..613904f702f0 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -2107,6 +2108,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::TCRETURNdi: case ARM::TCRETURNri: { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + if (MBBI->getOpcode() == ARM::SEH_EpilogEnd) + MBBI--; + if (MBBI->getOpcode() == ARM::SEH_Nop_Ret) + MBBI--; assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); unsigned RetOpcode = MBBI->getOpcode(); @@ -2116,13 +2121,21 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Tail call return: adjust the stack pointer and jump to callee. MBBI = MBB.getLastNonDebugInstr(); + if (MBBI->getOpcode() == ARM::SEH_EpilogEnd) + MBBI--; + if (MBBI->getOpcode() == ARM::SEH_Nop_Ret) + MBBI--; MachineOperand &JumpTarget = MBBI->getOperand(0); // Jump to label or value in register. if (RetOpcode == ARM::TCRETURNdi) { + MachineFunction *MF = MBB.getParent(); + bool NeedsWinCFI = MF->getTarget().getMCAsmInfo()->usesWindowsCFI() && + MF->getFunction().needsUnwindTableEntry(); unsigned TCOpcode = STI->isThumb() - ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) + ? ((STI->isTargetMachO() || NeedsWinCFI) ? 
ARM::tTAILJMPd + : ARM::tTAILJMPdND) : ARM::TAILJMPd; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) @@ -3132,7 +3145,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { } bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); AFI = MF.getInfo(); diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 5d94b99d4c5d..a167225e2743 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -122,8 +122,7 @@ class ARMFastISel final : public FastISel { explicit ARMFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo), - Subtarget( - &static_cast(funcInfo.MF->getSubtarget())), + Subtarget(&funcInfo.MF->getSubtarget()), M(const_cast(*funcInfo.Fn->getParent())), TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) { @@ -156,7 +155,7 @@ class ARMFastISel final : public FastISel { const LoadInst *LI) override; bool fastLowerArguments() override; - #include "ARMGenFastISel.inc" +#include "ARMGenFastISel.inc" // Instruction selection routines. @@ -189,10 +188,10 @@ class ARMFastISel final : public FastISel { bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt); bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, - unsigned Alignment = 0, bool isZExt = true, + MaybeAlign Alignment = None, bool isZExt = true, bool allocReg = true); bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, - unsigned Alignment = 0); + MaybeAlign Alignment = None); bool ARMComputeAddress(const Value *Obj, Address &Addr); void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3); bool ARMIsMemCpySmall(uint64_t Len); @@ -602,8 +601,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { } if ((Subtarget->isTargetELF() && Subtarget->isGVInGOT(GV)) || - (Subtarget->isTargetMachO() && IsIndirect) || - Subtarget->genLongCalls()) { + (Subtarget->isTargetMachO() && IsIndirect)) { MachineInstrBuilder MIB; Register NewDestReg = createResultReg(TLI.getRegClassFor(VT)); if (isThumb2) @@ -898,7 +896,8 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, } bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, - unsigned Alignment, bool isZExt, bool allocReg) { + MaybeAlign Alignment, bool isZExt, + bool allocReg) { unsigned Opc; bool useAM3 = false; bool needVMOV = false; @@ -924,7 +923,8 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; break; case MVT::i16: - if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem()) + if (Alignment && *Alignment < Align(2) && + !Subtarget->allowsUnalignedMem()) return false; if (isThumb2) { @@ -939,7 +939,8 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; break; case MVT::i32: - if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem()) + if (Alignment && *Alignment < Align(4) && + !Subtarget->allowsUnalignedMem()) return false; if (isThumb2) { @@ -955,7 +956,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, case MVT::f32: if (!Subtarget->hasVFP2Base()) return false; // Unaligned loads need special handling. 
-      if (Alignment && Alignment < 4) {
+      if (Alignment && *Alignment < Align(4)) {
         needVMOV = true;
         VT = MVT::i32;
         Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
@@ -970,7 +971,7 @@
       if (!Subtarget->hasVFP2Base()) return false;
       // FIXME: Unaligned loads need special handling. Doublewords require
       // word-alignment.
-      if (Alignment && Alignment < 4)
+      if (Alignment && *Alignment < Align(4))
         return false;
 
       Opc = ARM::VLDRD;
@@ -1030,14 +1031,14 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
   if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
 
   Register ResultReg;
-  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlign()))
     return false;
   updateValueMap(I, ResultReg);
   return true;
 }
 
 bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
-                               unsigned Alignment) {
+                               MaybeAlign Alignment) {
   unsigned StrOpc;
   bool useAM3 = false;
   switch (VT.SimpleTy) {
@@ -1065,7 +1066,8 @@
       }
       break;
     case MVT::i16:
-      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+      if (Alignment && *Alignment < Align(2) &&
+          !Subtarget->allowsUnalignedMem())
         return false;
 
       if (isThumb2) {
@@ -1079,7 +1081,8 @@
       }
       break;
     case MVT::i32:
-      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+      if (Alignment && *Alignment < Align(4) &&
+          !Subtarget->allowsUnalignedMem())
         return false;
 
       if (isThumb2) {
@@ -1094,7 +1097,7 @@
     case MVT::f32:
       if (!Subtarget->hasVFP2Base()) return false;
       // Unaligned stores need special handling. Floats require word-alignment.
-      if (Alignment && Alignment < 4) {
+      if (Alignment && *Alignment < Align(4)) {
         Register MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
         AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                 TII.get(ARM::VMOVRS), MoveReg)
@@ -1111,8 +1114,8 @@
       if (!Subtarget->hasVFP2Base()) return false;
       // FIXME: Unaligned stores need special handling. Doublewords require
       // word-alignment.
-      if (Alignment && Alignment < 4)
-        return false;
+      if (Alignment && *Alignment < Align(4))
+        return false;
 
       StrOpc = ARM::VSTRD;
       break;
@@ -1166,7 +1169,7 @@
   if (!ARMComputeAddress(I->getOperand(1), Addr))
     return false;
 
-  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlign()))
     return false;
   return true;
 }
@@ -2939,7 +2942,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
 
   Register ResultReg = MI->getOperand(0).getReg();
-  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
+  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlign(), isZExt, false))
     return false;
   MachineBasicBlock::iterator I(MI);
   removeDeadCode(I, std::next(I));
diff --git a/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp
new file mode 100644
index 000000000000..77c8f7134a55
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp
@@ -0,0 +1,432 @@
+//===-- ARMFixCortexA57AES1742098Pass.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass works around a Cortex Core Fused AES erratum:
+// - Cortex-A57 Erratum 1742098
+// - Cortex-A72 Erratum 1655431
+//
+// The erratum may be triggered if an input vector register to AESE or AESD was
+// last written by an instruction that only updated 32 bits of it. This can
+// occur for either of the input registers.
+//
+// The workaround chosen is to update the input register using `r = VORRq r, r`,
+// as this updates all 128 bits of the register unconditionally, but does not
+// change the values observed in `r`, making the input safe.
+//
+// This pass has to be conservative in a few cases:
+// - an input vector register to the AES instruction is defined outside the
+//   current function, where we have to assume the register was updated in an
+//   unsafe way; and
+// - an input vector register to the AES instruction is updated along multiple
+//   different control-flow paths, where we have to ensure all the register
+//   updating instructions are safe.
+//
+// Both of these cases may apply to an input vector register. In either case,
+// we need to ensure that, when the pass is finished, there exists a safe
+// instruction between every unsafe register updating instruction and the AES
+// instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "Utils/ARMBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-fix-cortex-a57-aes-1742098"
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ARMFixCortexA57AES1742098 : public MachineFunctionPass {
+public:
+  static char ID;
+  explicit ARMFixCortexA57AES1742098() : MachineFunctionPass(ID) {
+    initializeARMFixCortexA57AES1742098Pass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override {
+    return "ARM fix for Cortex-A57 AES Erratum 1742098";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ReachingDefAnalysis>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  // This is the information needed to insert the fixup in the right place.
+  struct AESFixupLocation {
+    MachineBasicBlock *Block;
+    // The fixup instruction will be inserted *before* InsertionPt.
+    MachineInstr *InsertionPt;
+    MachineOperand *MOp;
+  };
+
+  void analyzeMF(MachineFunction &MF, ReachingDefAnalysis &RDA,
+                 const ARMBaseRegisterInfo *TRI,
+                 SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const;
+
+  void insertAESFixup(AESFixupLocation &FixupLoc, const ARMBaseInstrInfo *TII,
+                      const ARMBaseRegisterInfo *TRI) const;
+
+  static bool isFirstAESPairInstr(MachineInstr &MI);
+  static bool isSafeAESInput(MachineInstr &MI);
+};
+char ARMFixCortexA57AES1742098::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(ARMFixCortexA57AES1742098, DEBUG_TYPE,
+                      "ARM fix for Cortex-A57 AES Erratum 1742098", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis);
+INITIALIZE_PASS_END(ARMFixCortexA57AES1742098, DEBUG_TYPE,
+                    "ARM fix for Cortex-A57 AES Erratum 1742098", false, false)
+
+//===----------------------------------------------------------------------===//
+
+bool ARMFixCortexA57AES1742098::isFirstAESPairInstr(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return Opc == ARM::AESD || Opc == ARM::AESE;
+}
+
+bool ARMFixCortexA57AES1742098::isSafeAESInput(MachineInstr &MI) {
+  auto CondCodeIsAL = [](MachineInstr &MI) -> bool {
+    int CCIdx = MI.findFirstPredOperandIdx();
+    if (CCIdx == -1)
+      return false;
+    return MI.getOperand(CCIdx).getImm() == (int64_t)ARMCC::AL;
+  };
+
+  switch (MI.getOpcode()) {
+  // Unknown: Assume not safe.
+  default:
+    return false;
+  // 128-bit wide AES instructions
+  case ARM::AESD:
+  case ARM::AESE:
+  case ARM::AESMC:
+  case ARM::AESIMC:
+    // No CondCode.
+    return true;
+  // 128-bit and 64-bit wide bitwise ops (when condition = al)
+  case ARM::VANDd:
+  case ARM::VANDq:
+  case ARM::VORRd:
+  case ARM::VORRq:
+  case ARM::VEORd:
+  case ARM::VEORq:
+  case ARM::VMVNd:
+  case ARM::VMVNq:
+  // VMOV of 64-bit value between D registers (when condition = al)
+  case ARM::VMOVD:
+  // VMOV of 64 bit value from GPRs (when condition = al)
+  case ARM::VMOVDRR:
+  // VMOV of immediate into D or Q registers (when condition = al)
+  case ARM::VMOVv2i64:
+  case ARM::VMOVv1i64:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv4f32:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv4i32:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv8i16:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv16i8:
+  // Loads (when condition = al)
+  // VLD Dn, [Rn, #imm]
+  case ARM::VLDRD:
+  // VLDM
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDIA:
+  // VLDn to all lanes.
+  case ARM::VLD1d64:
+  case ARM::VLD1q64:
+  case ARM::VLD1d32:
+  case ARM::VLD1q32:
+  case ARM::VLD2b32:
+  case ARM::VLD2d32:
+  case ARM::VLD2q32:
+  case ARM::VLD1d16:
+  case ARM::VLD1q16:
+  case ARM::VLD2d16:
+  case ARM::VLD2q16:
+  case ARM::VLD1d8:
+  case ARM::VLD1q8:
+  case ARM::VLD2b8:
+  case ARM::VLD2d8:
+  case ARM::VLD2q8:
+  case ARM::VLD3d32:
+  case ARM::VLD3q32:
+  case ARM::VLD3d16:
+  case ARM::VLD3q16:
+  case ARM::VLD3d8:
+  case ARM::VLD3q8:
+  case ARM::VLD4d32:
+  case ARM::VLD4q32:
+  case ARM::VLD4d16:
+  case ARM::VLD4q16:
+  case ARM::VLD4d8:
+  case ARM::VLD4q8:
+  // VLD1 (single element to one lane)
+  case ARM::VLD1LNd32:
+  case ARM::VLD1LNd32_UPD:
+  case ARM::VLD1LNd8:
+  case ARM::VLD1LNd8_UPD:
+  case ARM::VLD1LNd16:
+  case ARM::VLD1LNd16_UPD:
+  // VLD1 (single element to all lanes)
+  case ARM::VLD1DUPd32:
+  case ARM::VLD1DUPd32wb_fixed:
+  case ARM::VLD1DUPd32wb_register:
+  case ARM::VLD1DUPd16:
+  case ARM::VLD1DUPd16wb_fixed:
+  case ARM::VLD1DUPd16wb_register:
+  case ARM::VLD1DUPd8:
+  case ARM::VLD1DUPd8wb_fixed:
+  case ARM::VLD1DUPd8wb_register:
+  case ARM::VLD1DUPq32:
+  case ARM::VLD1DUPq32wb_fixed:
+  case ARM::VLD1DUPq32wb_register:
+  case ARM::VLD1DUPq16:
+  case ARM::VLD1DUPq16wb_fixed:
+  case ARM::VLD1DUPq16wb_register:
+  case ARM::VLD1DUPq8:
+  case ARM::VLD1DUPq8wb_fixed:
+  case ARM::VLD1DUPq8wb_register:
+  // VMOV
+  case ARM::VSETLNi32:
+  case ARM::VSETLNi16:
+  case ARM::VSETLNi8:
+    return CondCodeIsAL(MI);
+  };
+
+  return false;
+}
+
+bool ARMFixCortexA57AES1742098::runOnMachineFunction(MachineFunction &F) {
+  LLVM_DEBUG(dbgs() << "***** ARMFixCortexA57AES1742098 *****\n");
+  auto &STI = F.getSubtarget<ARMSubtarget>();
+
+  // Fix not requested or AES instructions not present: skip pass.
+  if (!STI.hasAES() || !STI.fixCortexA57AES1742098())
+    return false;
+
+  const ARMBaseRegisterInfo *TRI = STI.getRegisterInfo();
+  const ARMBaseInstrInfo *TII = STI.getInstrInfo();
+
+  auto &RDA = getAnalysis<ReachingDefAnalysis>();
+
+  // Analyze whole function to find instructions which need fixing up...
+  SmallVector<AESFixupLocation, 8> FixupLocsForFn{};
+  analyzeMF(F, RDA, TRI, FixupLocsForFn);
+
+  // ... and fix the instructions up all at the same time.
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "Inserting " << FixupLocsForFn.size() << " fixup(s)\n");
+  for (AESFixupLocation &FixupLoc : FixupLocsForFn) {
+    insertAESFixup(FixupLoc, TII, TRI);
+    Changed |= true;
+  }
+
+  return Changed;
+}
+
+void ARMFixCortexA57AES1742098::analyzeMF(
+    MachineFunction &MF, ReachingDefAnalysis &RDA,
+    const ARMBaseRegisterInfo *TRI,
+    SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const {
+  unsigned MaxAllowedFixups = 0;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!isFirstAESPairInstr(MI))
+        continue;
+
+      // Found an instruction to check the operands of.
+      LLVM_DEBUG(dbgs() << "Found AES Pair starting: " << MI);
+      assert(MI.getNumExplicitOperands() == 3 && MI.getNumExplicitDefs() == 1 &&
+             "Unknown AES Instruction Format. Expected 1 def, 2 uses.");
+
+      // A maximum of two fixups should be inserted for each AES pair (one per
+      // register use).
+      MaxAllowedFixups += 2;
+
+      // Inspect all operands, choosing whether to insert a fixup.
+      for (MachineOperand &MOp : MI.uses()) {
+        SmallPtrSet<MachineInstr *, 2> AllDefs{};
+        RDA.getGlobalReachingDefs(&MI, MOp.getReg(), AllDefs);
+
+        // Planned Fixup: This should be added to FixupLocsForFn at most once.
+        AESFixupLocation NewLoc{&MBB, &MI, &MOp};
+
+        // In small functions with loops, this operand may be both a live-in and
+        // have definitions within the function itself. These will need a fixup.
+        bool IsLiveIn = MF.front().isLiveIn(MOp.getReg());
+
+        // If the register doesn't have defining instructions, and is not a
+        // live-in, then something is wrong and the fixup must always be
+        // inserted to be safe.
+        if (!IsLiveIn && AllDefs.size() == 0) {
+          LLVM_DEBUG(dbgs()
+                     << "Fixup Planned: No Defining Instrs found, not live-in: "
+                     << printReg(MOp.getReg(), TRI) << "\n");
+          FixupLocsForFn.emplace_back(NewLoc);
+          continue;
+        }
+
+        auto IsUnsafe = [](MachineInstr *MI) -> bool {
+          return !isSafeAESInput(*MI);
+        };
+        size_t UnsafeCount = llvm::count_if(AllDefs, IsUnsafe);
+
+        // If there are no unsafe definitions...
+        if (UnsafeCount == 0) {
+          // ... and the register is not live-in ...
+          if (!IsLiveIn) {
+            // ... then skip the fixup.
+            LLVM_DEBUG(dbgs() << "No Fixup: Defining instrs are all safe: "
+                              << printReg(MOp.getReg(), TRI) << "\n");
+            continue;
+          }
+
+          // Otherwise, the only unsafe "definition" is a live-in, so insert the
+          // fixup at the start of the function.
+          LLVM_DEBUG(dbgs()
+                     << "Fixup Planned: Live-In (with safe defining instrs): "
+                     << printReg(MOp.getReg(), TRI) << "\n");
+          NewLoc.Block = &MF.front();
+          NewLoc.InsertionPt = &*NewLoc.Block->begin();
+          LLVM_DEBUG(dbgs() << "Moving Fixup for Live-In to immediately before "
+                            << *NewLoc.InsertionPt);
+          FixupLocsForFn.emplace_back(NewLoc);
+          continue;
+        }
+
+        // If a fixup is needed in more than one place, then the best place to
+        // insert it is adjacent to the use rather than introducing a fixup
+        // adjacent to each def.
+        //
+        // FIXME: It might be better to hoist this to the start of the BB, if
+        // possible.
+        if (IsLiveIn || UnsafeCount > 1) {
+          LLVM_DEBUG(dbgs() << "Fixup Planned: Multiple unsafe defining instrs "
+                               "(including live-ins): "
+                            << printReg(MOp.getReg(), TRI) << "\n");
+          FixupLocsForFn.emplace_back(NewLoc);
+          continue;
+        }
+
+        assert(UnsafeCount == 1 && !IsLiveIn &&
+               "At this point, there should be one unsafe defining instr "
+               "and the defined register should not be a live-in.");
+        SmallPtrSetIterator<MachineInstr *> It =
+            llvm::find_if(AllDefs, IsUnsafe);
+        assert(It != AllDefs.end() &&
+               "UnsafeCount == 1 but No Unsafe MachineInstr found.");
+        MachineInstr *DefMI = *It;
+
+        LLVM_DEBUG(
+            dbgs() << "Fixup Planned: Found single unsafe defining instr for "
+                   << printReg(MOp.getReg(), TRI) << ": " << *DefMI);
+
+        // There is one unsafe defining instruction, which needs a fixup. It is
+        // generally good to hoist the fixup to be adjacent to the defining
+        // instruction rather than the using instruction, as the using
+        // instruction may be inside a loop when the defining instruction is
+        // not.
+        MachineBasicBlock::iterator DefIt = DefMI;
+        ++DefIt;
+        if (DefIt != DefMI->getParent()->end()) {
+          LLVM_DEBUG(dbgs() << "Moving Fixup to immediately after " << *DefMI
+                            << "And immediately before " << *DefIt);
+          NewLoc.Block = DefIt->getParent();
+          NewLoc.InsertionPt = &*DefIt;
+        }
+
+        FixupLocsForFn.emplace_back(NewLoc);
+      }
+    }
+  }
+
+  assert(FixupLocsForFn.size() <= MaxAllowedFixups &&
+         "Inserted too many fixups for this function.");
+  (void)MaxAllowedFixups;
+}
+
+void ARMFixCortexA57AES1742098::insertAESFixup(
+    AESFixupLocation &FixupLoc, const ARMBaseInstrInfo *TII,
+    const ARMBaseRegisterInfo *TRI) const {
+  MachineOperand *OperandToFixup = FixupLoc.MOp;
+
+  assert(OperandToFixup->isReg() && "OperandToFixup must be a register");
+  Register RegToFixup = OperandToFixup->getReg();
+
+  LLVM_DEBUG(dbgs() << "Inserting VORRq of " << printReg(RegToFixup, TRI)
+                    << " before: " << *FixupLoc.InsertionPt);
+
+  // Insert the new `VORRq qN, qN, qN`. There are a few details here:
+  //
+  // The uses are marked as killed, even if the original use of OperandToFixup
+  // is not killed, as the new instruction is clobbering the register. This is
+  // safe even if there are other uses of `qN`, as the VORRq is value-wise a
+  // no-op (it is inserted for microarchitectural reasons).
+  //
+  // The def and the uses are still marked as Renamable if the original register
+  // was, to avoid having to rummage through all the other uses and defs and
+  // unset their renamable bits.
+  unsigned Renamable = OperandToFixup->isRenamable() ? RegState::Renamable : 0;
+  BuildMI(*FixupLoc.Block, FixupLoc.InsertionPt, DebugLoc(),
+          TII->get(ARM::VORRq))
+      .addReg(RegToFixup, RegState::Define | Renamable)
+      .addReg(RegToFixup, RegState::Kill | Renamable)
+      .addReg(RegToFixup, RegState::Kill | Renamable)
+      .addImm((uint64_t)ARMCC::AL)
+      .addReg(ARM::NoRegister);
+}
+
+// Factory function used by ARMTargetMachine to add the pass to
+// the passmanager.
+FunctionPass *llvm::createARMFixCortexA57AES1742098Pass() {
+  return new ARMFixCortexA57AES1742098();
+}
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 1f2f6f7497e0..48b4d266b41a 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -47,7 +47,8 @@
 // |                                   |
 // |-----------------------------------|
 // |                                   |
-// | prev_fp, prev_lr                  |
+// | prev_lr                           |
+// | prev_fp                           |
 // | (a.k.a. "frame record")           |
"frame record") | // | | // |- - - - - - - - - - - - - - - - - -| <- fp (r7 or r11) @@ -138,6 +139,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrDesc.h" @@ -210,6 +212,12 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { MFI.isFrameAddressTaken()); } +/// isFPReserved - Return true if the frame pointer register should be +/// considered a reserved register on the scope of the specified function. +bool ARMFrameLowering::isFPReserved(const MachineFunction &MF) const { + return hasFP(MF) || MF.getSubtarget().createAAPCSFrameChain(); +} + /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is /// not required, we reserve argument space for call sites in the function /// immediately on entry to the current function. This eliminates the need for @@ -272,6 +280,230 @@ static int getArgumentStackToRestore(MachineFunction &MF, return ArgumentPopSize; } +static bool needsWinCFI(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + F.needsUnwindTableEntry(); +} + +// Given a load or a store instruction, generate an appropriate unwinding SEH +// code on Windows. +static MachineBasicBlock::iterator insertSEH(MachineBasicBlock::iterator MBBI, + const TargetInstrInfo &TII, + unsigned Flags) { + unsigned Opc = MBBI->getOpcode(); + MachineBasicBlock *MBB = MBBI->getParent(); + MachineFunction &MF = *MBB->getParent(); + DebugLoc DL = MBBI->getDebugLoc(); + MachineInstrBuilder MIB; + const ARMSubtarget &Subtarget = MF.getSubtarget(); + const ARMBaseRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + Flags |= MachineInstr::NoMerge; + + switch (Opc) { + default: + report_fatal_error("No SEH Opcode for instruction " + TII.getName(Opc)); + break; + case ARM::t2ADDri: // add.w r11, sp, #xx + case ARM::t2ADDri12: // add.w r11, sp, #xx + case ARM::t2MOVTi16: // movt r4, #xx + case ARM::tBL: // bl __chkstk + // These are harmless if used for just setting up a frame pointer, + // but that frame pointer can't be relied upon for unwinding, unless + // set up with SEH_SaveSP. + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop)) + .addImm(/*Wide=*/1) + .setMIFlags(Flags); + break; + + case ARM::t2MOVi16: { // mov(w) r4, #xx + bool Wide = MBBI->getOperand(1).getImm() >= 256; + if (!Wide) { + MachineInstrBuilder NewInstr = + BuildMI(MF, DL, TII.get(ARM::tMOVi8)).setMIFlags(MBBI->getFlags()); + NewInstr.add(MBBI->getOperand(0)); + NewInstr.add(t1CondCodeOp(/*isDead=*/true)); + for (unsigned i = 1, NumOps = MBBI->getNumOperands(); i != NumOps; ++i) + NewInstr.add(MBBI->getOperand(i)); + MachineBasicBlock::iterator NewMBBI = MBB->insertAfter(MBBI, NewInstr); + MBB->erase(MBBI); + MBBI = NewMBBI; + } + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop)).addImm(Wide).setMIFlags(Flags); + break; + } + + case ARM::tBLXr: // blx r12 (__chkstk) + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop)) + .addImm(/*Wide=*/0) + .setMIFlags(Flags); + break; + + case ARM::t2MOVi32imm: // movw+movt + // This pseudo instruction expands into two mov instructions. If the + // second operand is a symbol reference, this will stay as two wide + // instructions, movw+movt. If they're immediates, the first one can + // end up as a narrow mov though. 
+ // As two SEH instructions are appended here, they won't get interleaved + // between the two final movw/movt instructions, but it doesn't make any + // practical difference. + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop)) + .addImm(/*Wide=*/1) + .setMIFlags(Flags); + MBB->insertAfter(MBBI, MIB); + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop)) + .addImm(/*Wide=*/1) + .setMIFlags(Flags); + break; + + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA_UPD: + case ARM::t2STMDB_UPD: { + unsigned Mask = 0; + bool Wide = false; + for (unsigned i = 4, NumOps = MBBI->getNumOperands(); i != NumOps; ++i) { + const MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + continue; + unsigned Reg = RegInfo->getSEHRegNum(MO.getReg()); + if (Reg == 15) + Reg = 14; + if (Reg >= 8 && Reg <= 13) + Wide = true; + else if (Opc == ARM::t2LDMIA_UPD && Reg == 14) + Wide = true; + Mask |= 1 << Reg; + } + if (!Wide) { + unsigned NewOpc; + switch (Opc) { + case ARM::t2LDMIA_RET: + NewOpc = ARM::tPOP_RET; + break; + case ARM::t2LDMIA_UPD: + NewOpc = ARM::tPOP; + break; + case ARM::t2STMDB_UPD: + NewOpc = ARM::tPUSH; + break; + default: + llvm_unreachable(""); + } + MachineInstrBuilder NewInstr = + BuildMI(MF, DL, TII.get(NewOpc)).setMIFlags(MBBI->getFlags()); + for (unsigned i = 2, NumOps = MBBI->getNumOperands(); i != NumOps; ++i) + NewInstr.add(MBBI->getOperand(i)); + MachineBasicBlock::iterator NewMBBI = MBB->insertAfter(MBBI, NewInstr); + MBB->erase(MBBI); + MBBI = NewMBBI; + } + unsigned SEHOpc = + (Opc == ARM::t2LDMIA_RET) ? ARM::SEH_SaveRegs_Ret : ARM::SEH_SaveRegs; + MIB = BuildMI(MF, DL, TII.get(SEHOpc)) + .addImm(Mask) + .addImm(Wide ? 1 : 0) + .setMIFlags(Flags); + break; + } + case ARM::VSTMDDB_UPD: + case ARM::VLDMDIA_UPD: { + int First = -1, Last = 0; + for (unsigned i = 4, NumOps = MBBI->getNumOperands(); i != NumOps; ++i) { + const MachineOperand &MO = MBBI->getOperand(i); + unsigned Reg = RegInfo->getSEHRegNum(MO.getReg()); + if (First == -1) + First = Reg; + Last = Reg; + } + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_SaveFRegs)) + .addImm(First) + .addImm(Last) + .setMIFlags(Flags); + break; + } + case ARM::tSUBspi: + case ARM::tADDspi: + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_StackAlloc)) + .addImm(MBBI->getOperand(2).getImm() * 4) + .addImm(/*Wide=*/0) + .setMIFlags(Flags); + break; + case ARM::t2SUBspImm: + case ARM::t2SUBspImm12: + case ARM::t2ADDspImm: + case ARM::t2ADDspImm12: + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_StackAlloc)) + .addImm(MBBI->getOperand(2).getImm()) + .addImm(/*Wide=*/1) + .setMIFlags(Flags); + break; + + case ARM::tMOVr: + if (MBBI->getOperand(1).getReg() == ARM::SP && + (Flags & MachineInstr::FrameSetup)) { + unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_SaveSP)) + .addImm(Reg) + .setMIFlags(Flags); + } else if (MBBI->getOperand(0).getReg() == ARM::SP && + (Flags & MachineInstr::FrameDestroy)) { + unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_SaveSP)) + .addImm(Reg) + .setMIFlags(Flags); + } else { + report_fatal_error("No SEH Opcode for MOV"); + } + break; + + case ARM::tBX_RET: + case ARM::TCRETURNri: + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop_Ret)) + .addImm(/*Wide=*/0) + .setMIFlags(Flags); + break; + + case ARM::TCRETURNdi: + MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop_Ret)) + .addImm(/*Wide=*/1) + .setMIFlags(Flags); + break; + } + return MBB->insertAfter(MBBI, MIB); +} + +static MachineBasicBlock::iterator 
+initMBBRange(MachineBasicBlock &MBB, const MachineBasicBlock::iterator &MBBI) {
+  if (MBBI == MBB.begin())
+    return MachineBasicBlock::iterator();
+  return std::prev(MBBI);
+}
+
+static void insertSEHRange(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator Start,
+                           const MachineBasicBlock::iterator &End,
+                           const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (Start.isValid())
+    Start = std::next(Start);
+  else
+    Start = MBB.begin();
+
+  for (auto MI = Start; MI != End;) {
+    auto Next = std::next(MI);
+    // Check if this instruction already has got a SEH opcode added. In that
+    // case, don't do this generic mapping.
+    if (Next != End && isSEHInstruction(*Next)) {
+      MI = std::next(Next);
+      while (MI != End && isSEHInstruction(*MI))
+        ++MI;
+      continue;
+    }
+    insertSEH(MI, TII, MIFlags);
+    MI = Next;
+  }
+}
+
 static void emitRegPlusImmediate(
     bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
     const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -392,8 +624,7 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
                                      const DebugLoc &DL, const unsigned Reg,
                                      const Align Alignment,
                                      const bool MustBeSingleInstruction) {
-  const ARMSubtarget &AST =
-      static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  const ARMSubtarget &AST = MF.getSubtarget<ARMSubtarget>();
   const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
   const unsigned AlignMask = Alignment.value() - 1U;
   const unsigned NrBitsToZero = Log2(Alignment);
@@ -452,15 +683,23 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
 /// Unfortunately we cannot determine this value in determineCalleeSaves() yet
 /// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
 /// this to produce a conservative estimate that we check in an assert() later.
-static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI) {
+static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI,
+                          const MachineFunction &MF) {
   // For Thumb1, push.w isn't available, so the first push will always push
   // r7 and lr onto the stack first.
   if (AFI.isThumb1OnlyFunction())
     return -AFI.getArgRegsSaveSize() - (2 * 4);
   // This is a conservative estimation: Assume the frame pointer being r7 and
   // pc("r15") up to r8 getting spilled before (= 8 registers).
-  int FPCXTSaveSize = (STI.hasV8_1MMainlineOps() && AFI.isCmseNSEntryFunction()) ? 4 : 0;
-  return - FPCXTSaveSize - AFI.getArgRegsSaveSize() - (8 * 4);
+  int MaxRegBytes = 8 * 4;
+  if (STI.splitFramePointerPush(MF)) {
+    // Here, r11 can be stored below all of r4-r15 (3 registers more than
+    // above), plus d8-d15.
+    MaxRegBytes = 11 * 4 + 8 * 8;
+  }
+  int FPCXTSaveSize =
+      (STI.hasV8_1MMainlineOps() && AFI.isCmseNSEntryFunction()) ? 4 : 0;
+  return -FPCXTSaveSize - AFI.getArgRegsSaveSize() - MaxRegBytes;
 }
 
 void ARMFrameLowering::emitPrologue(MachineFunction &MF,
@@ -482,6 +721,7 @@
   unsigned NumBytes = MFI.getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
   int FPCXTSaveSize = 0;
+  bool NeedsWinCFI = needsWinCFI(MF);
 
   // Debug location must be unknown since the first debug location is used
   // to determine the end of the prologue.
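[The insertSEHRange helper above applies a simple generic mapping: every just-emitted prologue or epilogue instruction gets a matching SEH unwind opcode appended, except where a special case in insertSEH has already planted one. A rough standalone C++ sketch of that walk, using invented Instr/withSEH names purely for illustration and not the LLVM API:

#include <iostream>
#include <string>
#include <vector>

// Stand-in for a machine instruction; IsSEH marks unwind pseudo-opcodes.
struct Instr {
  std::string Text;
  bool IsSEH = false;
};

// Generic mapping: append one SEH opcode after each real instruction, but
// leave instructions alone if a special case already annotated them.
static std::vector<Instr> withSEH(const std::vector<Instr> &In) {
  std::vector<Instr> Out;
  for (size_t I = 0; I < In.size(); ++I) {
    Out.push_back(In[I]);
    if (In[I].IsSEH)
      continue; // never annotate an annotation
    if (I + 1 < In.size() && In[I + 1].IsSEH)
      continue; // already carries a hand-written SEH opcode; skip it
    Out.push_back({"SEH(" + In[I].Text + ")", true});
  }
  return Out;
}

int main() {
  std::vector<Instr> Prologue{{"push {r4-r7,lr}"}, {"sub sp, #16"}};
  for (const Instr &I : withSEH(Prologue))
    std::cout << I.Text << "\n";
}

The skip over already-annotated groups mirrors the `isSEHInstruction(*Next)` check in the patch: the special-cased instructions (chkstk calls, SP moves) need more precise opcodes than the generic one-per-instruction mapping would produce.]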
@@ -510,47 +750,92 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes, true); } - DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); + if (!NeedsWinCFI) + DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); + if (NeedsWinCFI && MBBI != MBB.begin()) { + insertSEHRange(MBB, {}, MBBI, TII, MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, dl, TII.get(ARM::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + MF.setHasWinCFI(true); + } return; } // Determine spill area sizes. - for (const CalleeSavedInfo &I : CSI) { - Register Reg = I.getReg(); - int FI = I.getFrameIdx(); - switch (Reg) { - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - case ARM::R12: - if (STI.splitFramePushPop(MF)) { + if (STI.splitFramePointerPush(MF)) { + for (const CalleeSavedInfo &I : CSI) { + Register Reg = I.getReg(); + int FI = I.getFrameIdx(); + switch (Reg) { + case ARM::R11: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; GPRCS2Size += 4; break; + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R12: + GPRCS1Size += 4; + break; + case ARM::FPCXTNS: + FPCXTSaveSize = 4; + break; + default: + // This is a DPR. Exclude the aligned DPRCS2 spills. + if (Reg == ARM::D8) + D8SpillFI = FI; + if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) + DPRCSSize += 8; + } + } + } else { + for (const CalleeSavedInfo &I : CSI) { + Register Reg = I.getReg(); + int FI = I.getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.splitFramePushPop(MF)) { + GPRCS2Size += 4; + break; + } + LLVM_FALLTHROUGH; + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + GPRCS1Size += 4; + break; + case ARM::FPCXTNS: + FPCXTSaveSize = 4; + break; + default: + // This is a DPR. Exclude the aligned DPRCS2 spills. + if (Reg == ARM::D8) + D8SpillFI = FI; + if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) + DPRCSSize += 8; } - LLVM_FALLTHROUGH; - case ARM::R0: - case ARM::R1: - case ARM::R2: - case ARM::R3: - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - GPRCS1Size += 4; - break; - case ARM::FPCXTNS: - FPCXTSaveSize = 4; - break; - default: - // This is a DPR. Exclude the aligned DPRCS2 spills. - if (Reg == ARM::D8) - D8SpillFI = FI; - if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) - DPRCSSize += 8; } } @@ -585,15 +870,23 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, unsigned GPRCS1Offset = FPCXTOffset - GPRCS1Size; unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size; Align DPRAlign = DPRCSSize ? 
std::min(Align(8), Alignment) : Align(4); - unsigned DPRGapSize = - (GPRCS1Size + GPRCS2Size + FPCXTSaveSize + ArgRegsSaveSize) % - DPRAlign.value(); + unsigned DPRGapSize = GPRCS1Size + FPCXTSaveSize + ArgRegsSaveSize; + if (!STI.splitFramePointerPush(MF)) { + DPRGapSize += GPRCS2Size; + } + DPRGapSize %= DPRAlign.value(); - unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; + unsigned DPRCSOffset; + if (STI.splitFramePointerPush(MF)) { + DPRCSOffset = GPRCS1Offset - DPRGapSize - DPRCSSize; + GPRCS2Offset = DPRCSOffset - GPRCS2Size; + } else { + DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; + } int FramePtrOffsetInPush = 0; if (HasFP) { int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); - assert(getMaxFPOffset(STI, *AFI) <= FPOffset && + assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset && "Max FP estimation is wrong"); FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize; AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + @@ -604,7 +897,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); // Move past area 2. - if (GPRCS2Size > 0) { + if (GPRCS2Size > 0 && !STI.splitFramePointerPush(MF)) { GPRCS2Push = LastPush = MBBI++; DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); } @@ -644,18 +937,37 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } else NumBytes = DPRCSOffset; + if (GPRCS2Size > 0 && STI.splitFramePointerPush(MF)) { + GPRCS2Push = LastPush = MBBI++; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + } + + bool NeedsWinCFIStackAlloc = NeedsWinCFI; + if (STI.splitFramePointerPush(MF) && HasFP) + NeedsWinCFIStackAlloc = false; + if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) { uint32_t NumWords = NumBytes >> 2; - if (NumWords < 65536) + if (NumWords < 65536) { BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4) .addImm(NumWords) .setMIFlags(MachineInstr::FrameSetup) .add(predOps(ARMCC::AL)); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4) - .addImm(NumWords) - .setMIFlags(MachineInstr::FrameSetup); + } else { + // Split into two instructions here, instead of using t2MOVi32imm, + // to allow inserting accurate SEH instructions (including accurate + // instruction size for each of them). 
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4) + .addImm(NumWords & 0xffff) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), ARM::R4) + .addReg(ARM::R4) + .addImm(NumWords >> 16) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)); + } switch (TM.getCodeModel()) { case CodeModel::Tiny: @@ -682,12 +994,20 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, break; } - BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP) - .addReg(ARM::SP, RegState::Kill) - .addReg(ARM::R4, RegState::Kill) - .setMIFlags(MachineInstr::FrameSetup) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()); + MachineInstrBuilder Instr, SEH; + Instr = BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP) + .addReg(ARM::SP, RegState::Kill) + .addReg(ARM::R4, RegState::Kill) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + if (NeedsWinCFIStackAlloc) { + SEH = BuildMI(MF, dl, TII.get(ARM::SEH_StackAlloc)) + .addImm(NumBytes) + .addImm(/*Wide=*/1) + .setMIFlags(MachineInstr::FrameSetup); + MBB.insertAfter(Instr, SEH); + } NumBytes = 0; } @@ -720,34 +1040,58 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // into spill area 1, including the FP in R11. In either case, it // is in area one and the adjustment needs to take place just after // that push. + // FIXME: The above is not necessary true when PACBTI is enabled. + // AAPCS requires use of R11, and PACBTI gets in the way of regular pushes, + // so FP ends up on area two. + MachineBasicBlock::iterator AfterPush; if (HasFP) { - MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push); + AfterPush = std::next(GPRCS1Push); unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push); - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, - dl, TII, FramePtr, ARM::SP, - PushSize + FramePtrOffsetInPush, - MachineInstr::FrameSetup); - if (FramePtrOffsetInPush + PushSize != 0) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, MRI->getDwarfRegNum(FramePtr, true), - FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); - BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + int FPOffset = PushSize + FramePtrOffsetInPush; + if (STI.splitFramePointerPush(MF)) { + AfterPush = std::next(GPRCS2Push); + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, 0, MachineInstr::FrameSetup); } else { - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( - nullptr, MRI->getDwarfRegNum(FramePtr, true))); - BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, FPOffset, + MachineInstr::FrameSetup); } + if (!NeedsWinCFI) { + if (FramePtrOffsetInPush + PushSize != 0) { + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, MRI->getDwarfRegNum(FramePtr, true), + FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } else { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( + nullptr, MRI->getDwarfRegNum(FramePtr, true))); + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + 
.addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + } + + // Emit a SEH opcode indicating the prologue end. The rest of the prologue + // instructions below don't need to be replayed to unwind the stack. + if (NeedsWinCFI && MBBI != MBB.begin()) { + MachineBasicBlock::iterator End = MBBI; + if (HasFP && STI.splitFramePointerPush(MF)) + End = AfterPush; + insertSEHRange(MBB, {}, End, TII, MachineInstr::FrameSetup); + BuildMI(MBB, End, dl, TII.get(ARM::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + MF.setHasWinCFI(true); } // Now that the prologue's actual instructions are finalised, we can insert // the necessary DWARF cf instructions to describe the situation. Start by // recording where each register ended up: - if (GPRCS1Size > 0) { + if (GPRCS1Size > 0 && !NeedsWinCFI) { MachineBasicBlock::iterator Pos = std::next(GPRCS1Push); int CFIIndex; for (const auto &Entry : CSI) { @@ -781,7 +1125,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } } - if (GPRCS2Size > 0) { + if (GPRCS2Size > 0 && !NeedsWinCFI) { MachineBasicBlock::iterator Pos = std::next(GPRCS2Push); for (const auto &Entry : CSI) { Register Reg = Entry.getReg(); @@ -807,7 +1151,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } } - if (DPRCSSize > 0) { + if (DPRCSSize > 0 && !NeedsWinCFI) { // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. MachineBasicBlock::iterator Pos = std::next(LastPush); @@ -831,7 +1175,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // throughout the process. If we have a frame pointer, it takes over the job // half-way through, so only the first few .cfi_def_cfa_offset instructions // actually get emitted. - DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); + if (!NeedsWinCFI) + DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); if (STI.isTargetELF() && hasFP(MF)) MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() - @@ -928,7 +1273,14 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + MachineBasicBlock::iterator RangeStart; if (!AFI->hasStackFrame()) { + if (MF.hasWinCFI()) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::SEH_EpilogStart)) + .setMIFlag(MachineInstr::FrameDestroy); + RangeStart = initMBBRange(MBB, MBBI); + } + if (NumBytes + IncomingArgStackToRestore != 0) emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes + IncomingArgStackToRestore, @@ -944,6 +1296,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, ++MBBI; } + if (MF.hasWinCFI()) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::SEH_EpilogStart)) + .setMIFlag(MachineInstr::FrameDestroy); + RangeStart = initMBBRange(MBB, MBBI); + } + // Move SP to start of FP callee save spill area. NumBytes -= (ReservedArgStack + AFI->getFPCXTSaveAreaSize() + @@ -998,6 +1356,9 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); // Increment past our save areas. 
+ if (AFI->getGPRCalleeSavedArea2Size() && STI.splitFramePointerPush(MF)) + MBBI++; + if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) { MBBI++; // Since vpop register list cannot have gaps, there may be multiple vpop @@ -1012,7 +1373,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); } - if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; + if (AFI->getGPRCalleeSavedArea2Size() && !STI.splitFramePointerPush(MF)) + MBBI++; if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; if (ReservedArgStack || IncomingArgStackToRestore) { @@ -1030,6 +1392,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->shouldSignReturnAddress() && !AFI->isCmseNSEntryFunction()) BuildMI(MBB, MBBI, DebugLoc(), STI.getInstrInfo()->get(ARM::t2AUT)); } + + if (MF.hasWinCFI()) { + insertSEHRange(MBB, RangeStart, MBB.end(), TII, MachineInstr::FrameDestroy); + BuildMI(MBB, MBB.end(), dl, TII.get(ARM::SEH_EpilogEnd)) + .setMIFlag(MachineInstr::FrameDestroy); + } } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -1245,7 +1613,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && - STI.hasV5TOps() && MBB.succ_empty() && !hasPAC) { + STI.hasV5TOps() && MBB.succ_empty() && !hasPAC && + !STI.splitFramePointerPush(MF)) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -1609,12 +1978,21 @@ bool ARMFrameLowering::spillCalleeSavedRegisters( .addImm(-4) .add(predOps(ARMCC::AL)); } - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0, - MachineInstr::FrameSetup); - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0, - MachineInstr::FrameSetup); - emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, - NumAlignedDPRCS2Regs, MachineInstr::FrameSetup); + if (STI.splitFramePointerPush(MF)) { + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, + &isSplitFPArea1Register, 0, MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, + NumAlignedDPRCS2Regs, MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, + &isSplitFPArea2Register, 0, MachineInstr::FrameSetup); + } else { + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, + 0, MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, + 0, MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, + NumAlignedDPRCS2Regs, MachineInstr::FrameSetup); + } // The code above does not insert spill code for the aligned DPRCS2 registers. // The stack realignment code will be inserted between the push instructions @@ -1642,14 +2020,24 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters( emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI); unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; - unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM; + unsigned LdrOpc = + AFI->isThumbFunction() ? 
ARM::t2LDR_POST : ARM::LDR_POST_IMM;
   unsigned FltOpc = ARM::VLDMDIA_UPD;
-  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
-              NumAlignedDPRCS2Regs);
-  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
-              &isARMArea2Register, 0);
-  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
-              &isARMArea1Register, 0);
+  if (STI.splitFramePointerPush(MF)) {
+    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+                &isSplitFPArea2Register, 0);
+    emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
+                NumAlignedDPRCS2Regs);
+    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+                &isSplitFPArea1Register, 0);
+  } else {
+    emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
+                NumAlignedDPRCS2Regs);
+    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+                &isARMArea2Register, 0);
+    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+                &isARMArea1Register, 0);
+  }
 
   return true;
 }
@@ -1768,7 +2156,7 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
     return;
 
   // We are planning to use NEON instructions vst1 / vld1.
-  if (!static_cast<const ARMSubtarget &>(MF.getSubtarget()).hasNEON())
+  if (!MF.getSubtarget<ARMSubtarget>().hasNEON())
     return;
 
   // Don't bother if the default stack alignment is sufficiently high.
@@ -1818,6 +2206,34 @@ bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
   return true;
 }
 
+static bool requiresAAPCSFrameRecord(const MachineFunction &MF) {
+  const auto &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  return Subtarget.createAAPCSFrameChainLeaf() ||
+         (Subtarget.createAAPCSFrameChain() && MF.getFrameInfo().hasCalls());
+}
+
+// Thumb1 may require a spill when storing to a frame index through FP, for
+// cases where FP is a high register (R11). This scans the function for cases
+// where this may happen.
+static bool canSpillOnFrameIndexAccess(const MachineFunction &MF,
+                                       const TargetFrameLowering &TFI) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (!AFI->isThumb1OnlyFunction())
+    return false;
+
+  for (const auto &MBB : MF)
+    for (const auto &MI : MBB)
+      if (MI.getOpcode() == ARM::tSTRspi || MI.getOpcode() == ARM::tSTRi)
+        for (const auto &Op : MI.operands())
+          if (Op.isFI()) {
+            Register Reg;
+            TFI.getFrameIndexReference(MF, Op.getIndex(), Reg);
+            if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::SP)
+              return true;
+          }
+  return false;
+}
+
 void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             BitVector &SavedRegs,
                                             RegScavenger *RS) const {
@@ -1826,7 +2242,7 @@
   // to take advantage the eliminateFrameIndex machinery. This also ensures it
   // is spilled in the order specified by getCalleeSavedRegs() to make it easier
   // to combine multiple loads / stores.
-  bool CanEliminateFrame = true;
+  bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF));
   bool CS1Spilled = false;
   bool LRSpilled = false;
   unsigned NumGPRSpills = 0;
@@ -2021,6 +2437,11 @@
     // Functions with VLAs or extremely large call frames are rare, and
     // if a function is allocating more than 1KB of stack, an extra 4-byte
     // slot probably isn't relevant.
+    //
+    // A special case is the scenario where r11 is used as FP, where accesses
+    // to a frame index will require its value to be moved into a low reg.
+    // This is handled later on, once we are able to determine if we have any
+    // fp-relative accesses.
     if (RegInfo->hasBasePointer(MF))
       EstimatedRSStackSizeLimit = (1U << 5) * 4;
     else
@@ -2049,7 +2470,7 @@
   //
   // We could do slightly better on Thumb1; in some cases, an sp-relative
   // offset would be legal even though an fp-relative offset is not.
-  int MaxFPOffset = getMaxFPOffset(STI, *AFI);
+  int MaxFPOffset = getMaxFPOffset(STI, *AFI, MF);
   bool HasLargeArgumentList =
       HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
 
@@ -2067,7 +2488,9 @@
       SavedRegs.set(FramePtr);
     // If the frame pointer is required by the ABI, also spill LR so that we
     // emit a complete frame record.
-    if (MF.getTarget().Options.DisableFramePointerElim(MF) && !LRSpilled) {
+    if ((requiresAAPCSFrameRecord(MF) ||
+         MF.getTarget().Options.DisableFramePointerElim(MF)) &&
+        !LRSpilled) {
       SavedRegs.set(ARM::LR);
       LRSpilled = true;
       NumGPRSpills++;
@@ -2149,7 +2572,7 @@
     }
 
     // r7 can be used if it is not being used as the frame pointer.
-    if (!HasFP) {
+    if (!HasFP || FramePtr != ARM::R7) {
      if (SavedRegs.test(ARM::R7)) {
        --RegDeficit;
        LLVM_DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
@@ -2270,8 +2693,10 @@
     // to materialize a stack offset. If so, either spill one additional
     // callee-saved register or reserve a special spill slot to facilitate
     // register scavenging. Thumb1 needs a spill slot for stack pointer
-    // adjustments also, even when the frame itself is small.
-    if (BigFrameOffsets && !ExtraCSSpill) {
+    // adjustments and for frame index accesses when FP is high register,
+    // even when the frame itself is small.
+    if (!ExtraCSSpill &&
+        (BigFrameOffsets || canSpillOnFrameIndexAccess(MF, *this))) {
       // If any non-reserved CS register isn't spilled, just spill one or two
       // extra. That should take care of it!
       unsigned NumExtras = TargetAlign.value() / 4;
@@ -2488,6 +2913,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
   unsigned CFIIndex;
   const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
   bool Thumb = ST->isThumb();
+  bool Thumb2 = ST->isThumb2();
 
   // Sadly, this currently doesn't support varargs, platforms other than
   // android/linux. Note that thumb1/thumb2 are supported for android/linux.
@@ -2505,19 +2931,10 @@
   ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL;
 
-  uint64_t StackSize = MFI.getStackSize();
-
-  // Do not generate a prologue for leaf functions with a stack of size zero.
-  // For non-leaf functions we have to allow for the possibility that the
-  // call is to a non-split function, as in PR37807. This function could also
-  // take the address of a non-split function. When the linker tries to adjust
-  // its non-existent prologue, it would fail with an error. Mark the object
-  // file so that such failures are not errors. See this Go language bug-report
-  // https://go-review.googlesource.com/c/go/+/148819/
-  if (StackSize == 0 && !MFI.hasTailCall()) {
-    MF.getMMI().setHasNosplitStack(true);
+  if (!MFI.needsSplitStackProlog())
     return;
-  }
+
+  uint64_t StackSize = MFI.getStackSize();
 
   // Use R4 and R5 as scratch registers.
   // We save R4 and R5 before use and restore them before leaving the function.
@@ -2570,8 +2987,9 @@
     // Make sure the LiveIns are still sorted and unique.
MBB->sortUniqueLiveIns(); // Replace the edges to PrologueMBB by edges to the sequences - // we are about to add. - MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + // we are about to add, but only update for immediate predecessors. + if (MBB->isSuccessor(&PrologueMBB)) + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); } // The required stack size that is aligned to ARM constant criterion. @@ -2604,17 +3022,19 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the relevant DWARF information about the change in stack pointer as // well as where to find both r4 and r5 (the callee-save registers) - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8)); - BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4)); - BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8)); - BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } // mov SR1, sp if (Thumb) { @@ -2630,17 +3050,46 @@ void ARMFrameLowering::adjustForSegmentedStacks( // sub SR1, sp, #StackSize if (!CompareStackPointer && Thumb) { - BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1) - .add(condCodeOp()) - .addReg(ScratchReg1) - .addImm(AlignedStackSize) - .add(predOps(ARMCC::AL)); + if (AlignedStackSize < 256) { + BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1) + .add(condCodeOp()) + .addReg(ScratchReg1) + .addImm(AlignedStackSize) + .add(predOps(ARMCC::AL)); + } else { + if (Thumb2) { + BuildMI(McrMBB, DL, TII.get(ARM::t2MOVi32imm), ScratchReg0) + .addImm(AlignedStackSize); + } else { + auto MBBI = McrMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool(*McrMBB, MBBI, DL, ScratchReg0, 0, + AlignedStackSize); + } + BuildMI(McrMBB, DL, TII.get(ARM::tSUBrr), ScratchReg1) + .add(condCodeOp()) + .addReg(ScratchReg1) + .addReg(ScratchReg0) + .add(predOps(ARMCC::AL)); + } } else if (!CompareStackPointer) { - BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1) - .addReg(ARM::SP) - .addImm(AlignedStackSize) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()); + if (AlignedStackSize < 256) { + BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1) + .addReg(ARM::SP) + .addImm(AlignedStackSize) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } else { + auto MBBI = McrMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool(*McrMBB, MBBI, DL, ScratchReg0, 0, + AlignedStackSize); + BuildMI(McrMBB, DL, TII.get(ARM::SUBrr), ScratchReg1) + .addReg(ARM::SP) + .addReg(ScratchReg0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } } if (Thumb && 
ST->isThumb1Only()) { @@ -2707,28 +3156,69 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Pass first argument for the __morestack by Scratch Register #0. // The amount size of stack required if (Thumb) { - BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0) - .add(condCodeOp()) - .addImm(AlignedStackSize) - .add(predOps(ARMCC::AL)); + if (AlignedStackSize < 256) { + BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0) + .add(condCodeOp()) + .addImm(AlignedStackSize) + .add(predOps(ARMCC::AL)); + } else { + if (Thumb2) { + BuildMI(AllocMBB, DL, TII.get(ARM::t2MOVi32imm), ScratchReg0) + .addImm(AlignedStackSize); + } else { + auto MBBI = AllocMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool(*AllocMBB, MBBI, DL, ScratchReg0, 0, + AlignedStackSize); + } + } } else { - BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0) - .addImm(AlignedStackSize) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()); + if (AlignedStackSize < 256) { + BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0) + .addImm(AlignedStackSize) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } else { + auto MBBI = AllocMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool(*AllocMBB, MBBI, DL, ScratchReg0, 0, + AlignedStackSize); + } } + // Pass second argument for the __morestack by Scratch Register #1. // The amount size of stack consumed to save function arguments. if (Thumb) { - BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1) - .add(condCodeOp()) - .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())) - .add(predOps(ARMCC::AL)); + if (ARMFI->getArgumentStackSize() < 256) { + BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1) + .add(condCodeOp()) + .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())) + .add(predOps(ARMCC::AL)); + } else { + if (Thumb2) { + BuildMI(AllocMBB, DL, TII.get(ARM::t2MOVi32imm), ScratchReg1) + .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())); + } else { + auto MBBI = AllocMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool( + *AllocMBB, MBBI, DL, ScratchReg1, 0, + alignToARMConstant(ARMFI->getArgumentStackSize())); + } + } } else { - BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1) - .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()); + if (alignToARMConstant(ARMFI->getArgumentStackSize()) < 256) { + BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1) + .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } else { + auto MBBI = AllocMBB->end(); + auto RegInfo = STI.getRegisterInfo(); + RegInfo->emitLoadConstPool( + *AllocMBB, MBBI, DL, ScratchReg1, 0, + alignToARMConstant(ARMFI->getArgumentStackSize())); + } } // push {lr} - Save return address of this function. 
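[Each of the `< 256` branches above exists because the Thumb1 `movs` encoding (tMOVi8) has only an 8-bit immediate field; larger constants need a Thumb2 movw/movt pair (t2MOVi32imm), or a literal-pool load on Thumb1-only cores. A minimal sketch of that three-way choice, with invented enum names used purely for illustration:

#include <cstdint>
#include <iostream>

// Hypothetical kinds standing in for the opcodes chosen in the code above.
enum class MovKind { ThumbMovImm8, Thumb2MovImm32, ConstPoolLoad };

// Mirror of the decision in the segmented-stack prologue: a Thumb1 mov can
// only encode immediates 0..255; beyond that, Thumb2 can synthesize a 32-bit
// constant inline, while Thumb1 must load it from a constant pool.
static MovKind selectMaterialization(uint32_t Imm, bool IsThumb2) {
  if (Imm < 256)
    return MovKind::ThumbMovImm8;
  return IsThumb2 ? MovKind::Thumb2MovImm32 : MovKind::ConstPoolLoad;
}

int main() {
  std::cout << (selectMaterialization(128, false) == MovKind::ThumbMovImm8)
            << (selectMaterialization(1024, true) == MovKind::Thumb2MovImm32)
            << (selectMaterialization(1024, false) == MovKind::ConstPoolLoad)
            << "\n"; // prints 111
}

This is why the patch threads a `Thumb2` flag through adjustForSegmentedStacks: the old code assumed the aligned stack size and argument size always fit in eight bits.]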
@@ -2746,13 +3236,15 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the DWARF info about the change in stack as well as where to find the // previous link register - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12)); + BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } // Call __morestack(). if (Thumb) { @@ -2808,9 +3300,11 @@ void ARMFrameLowering::adjustForSegmentedStacks( } // Update the CFA offset now that we've popped - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); + BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } // Return from this function. BuildMI(AllocMBB, DL, TII.get(ST->getReturnOpcode())).add(predOps(ARMCC::AL)); @@ -2832,20 +3326,22 @@ void ARMFrameLowering::adjustForSegmentedStacks( } // Update the CFA offset now that we've popped - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Tell debuggers that r4 and r5 are now the same as they were in the - // previous function, that they're the "Same Value". - CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( - nullptr, MRI->getDwarfRegNum(ScratchReg0, true))); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( - nullptr, MRI->getDwarfRegNum(ScratchReg1, true))); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); + BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Tell debuggers that r4 and r5 are now the same as they were in the + // previous function, that they're the "Same Value". 
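Each of the CFI hunks in this function is wrapped in the same predicate: DWARF .cfi_* directives describe frame state that Windows-on-ARM targets express through SEH unwind opcodes instead, so they are suppressed whenever the assembler info reports Windows CFI. A sketch of the guard, assuming any MachineFunction MF (shouldEmitDwarfCFI is a hypothetical helper name):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"

// True when DWARF call-frame instructions should be emitted, i.e. whenever
// the target does not use Windows (SEH) unwind info.
bool shouldEmitDwarfCFI(const llvm::MachineFunction &MF) {
  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}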
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( + nullptr, MRI->getDwarfRegNum(ScratchReg0, true))); + BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( + nullptr, MRI->getDwarfRegNum(ScratchReg1, true))); + BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } // Organizing MBB lists PostStackMBB->addSuccessor(&PrologueMBB); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 9822e2321bb4..16f2ce6bea6f 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -46,6 +46,7 @@ public: bool enableCalleeSaveSkip(const MachineFunction &MF) const override; bool hasFP(const MachineFunction &MF) const override; + bool isFPReserved(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp index 0d201a67af46..9b26aac6c0b7 100644 --- a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -11,6 +11,8 @@ #include "ARMBaseRegisterInfo.h" #include "ARMSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetRegisterInfo.h" diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 98c8133282a2..e0e4ffd90e0e 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1058,15 +1058,15 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) { // This case occurs only for VLD1-lane/dup and VST1-lane instructions. // The maximum alignment is equal to the memory size being referenced. - unsigned MMOAlign = MemN->getAlignment(); + llvm::Align MMOAlign = MemN->getAlign(); unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8; - if (MMOAlign >= MemSize && MemSize > 1) + if (MMOAlign.value() >= MemSize && MemSize > 1) Alignment = MemSize; } else { // All other uses of addrmode6 are for intrinsics. For now just record // the raw alignment value; it will be refined later based on the legal // alignment operands for the intrinsic. - Alignment = MemN->getAlignment(); + Alignment = MemN->getAlign().value(); } Align = CurDAG->getTargetConstant(Alignment, SDLoc(N), MVT::i32); @@ -3464,40 +3464,39 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { return false; } -/// Target-specific DAG combining for ISD::XOR. +/// Target-specific DAG combining for ISD::SUB. /// Target-independent combining lowers SELECT_CC nodes of the form /// select_cc setg[ge] X, 0, X, -X /// select_cc setgt X, -1, X, -X /// select_cc setl[te] X, 0, -X, X /// select_cc setlt X, 1, -X, X /// which represent Integer ABS into: -/// Y = sra (X, size(X)-1); xor (add (X, Y), Y) +/// Y = sra (X, size(X)-1); sub (xor (X, Y), Y) /// ARM instruction selection detects the latter and matches it to /// ARM::ABS or ARM::t2ABS machine node. 
bool ARMDAGToDAGISel::tryABSOp(SDNode *N){ - SDValue XORSrc0 = N->getOperand(0); - SDValue XORSrc1 = N->getOperand(1); + SDValue SUBSrc0 = N->getOperand(0); + SDValue SUBSrc1 = N->getOperand(1); EVT VT = N->getValueType(0); if (Subtarget->isThumb1Only()) return false; - if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) + if (SUBSrc0.getOpcode() != ISD::XOR || SUBSrc1.getOpcode() != ISD::SRA) return false; - SDValue ADDSrc0 = XORSrc0.getOperand(0); - SDValue ADDSrc1 = XORSrc0.getOperand(1); - SDValue SRASrc0 = XORSrc1.getOperand(0); - SDValue SRASrc1 = XORSrc1.getOperand(1); + SDValue XORSrc0 = SUBSrc0.getOperand(0); + SDValue XORSrc1 = SUBSrc0.getOperand(1); + SDValue SRASrc0 = SUBSrc1.getOperand(0); + SDValue SRASrc1 = SUBSrc1.getOperand(1); ConstantSDNode *SRAConstant = dyn_cast(SRASrc1); EVT XType = SRASrc0.getValueType(); unsigned Size = XType.getSizeInBits() - 1; - if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && - XType.isInteger() && SRAConstant != nullptr && - Size == SRAConstant->getZExtValue()) { + if (XORSrc1 == SUBSrc1 && XORSrc0 == SRASrc0 && XType.isInteger() && + SRAConstant != nullptr && Size == SRAConstant->getZExtValue()) { unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS; - CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); + CurDAG->SelectNodeTo(N, Opcode, VT, XORSrc0); return true; } @@ -3673,8 +3672,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { if (tryInlineAsm(N)) return; break; - case ISD::XOR: - // Select special operations if XOR node forms integer ABS pattern + case ISD::SUB: + // Select special operations if SUB node forms integer ABS pattern if (tryABSOp(N)) return; // Other cases are autogenerated. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1b41427a1cab..85e32c08c74c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -273,6 +273,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::AVGFLOORS, VT, Legal); + setOperationAction(ISD::AVGFLOORU, VT, Legal); + setOperationAction(ISD::AVGCEILS, VT, Legal); + setOperationAction(ISD::AVGCEILU, VT, Legal); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); @@ -392,6 +396,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -476,7 +481,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && - !Subtarget->isTargetWatchOS()) { + !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast(LCID), @@ -809,8 +814,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Combine low-overhead loop intrinsics so that we can lower i1 types. 
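The rewritten combine tracks a change in target-independent lowering: integer ABS now arrives as sub(xor(x, y), y) rather than xor(add(x, y), y), with y the sign-extended sign bit. Both identities are easy to verify in plain C++; this standalone example is illustrative only (absViaSub/absViaXor are made-up names):

#include <cassert>
#include <cstdint>

// y = x >> 31 is 0 for non-negative x and -1 (all ones) for negative x,
// assuming the usual arithmetic right shift on signed integers.
int32_t absViaSub(int32_t x) {
  int32_t y = x >> 31;
  return (x ^ y) - y;   // the ISD::SUB form matched above
}
int32_t absViaXor(int32_t x) {
  int32_t y = x >> 31;
  return (x + y) ^ y;   // the older ISD::XOR form
}
int main() {
  for (int32_t x : {-7, -1, 0, 1, 7})
    assert(absViaSub(x) == absViaXor(x) &&
           absViaSub(x) == (x < 0 ? -x : x));
}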
if (Subtarget->hasLOB()) { - setTargetDAGCombine(ISD::BRCOND); - setTargetDAGCombine(ISD::BR_CC); + setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC}); } if (Subtarget->hasNEON()) { @@ -982,13 +986,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::FP_TO_UINT); - setTargetDAGCombine(ISD::FDIV); - setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT, + ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD}); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, @@ -1002,32 +1001,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine(ISD::BUILD_VECTOR); - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::INSERT_SUBVECTOR); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::VECREDUCE_ADD); - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine( + {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR, + ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, + ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN, + ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST}); } if (Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine(ISD::SMIN); - setTargetDAGCombine(ISD::UMIN); - setTargetDAGCombine(ISD::SMAX); - setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, + ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC, + ISD::SETCC}); } if (Subtarget->hasMVEFloatOps()) { setTargetDAGCombine(ISD::FADD); @@ -1364,6 +1348,29 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } + // Compute supported atomic widths. + if (Subtarget->isTargetLinux() || + (!Subtarget->isMClass() && Subtarget->hasV6Ops())) { + // For targets where __sync_* routines are reliably available, we use them + // if necessary. + // + // ARM Linux always supports 64-bit atomics through kernel-assisted atomic + // routines (kernel 3.1 or later). FIXME: Not with compiler-rt? + // + // ARMv6 targets have native instructions in ARM mode. For Thumb mode, + // such targets should provide __sync_* routines, which use the ARM mode + // instructions. (ARMv6 doesn't have dmb, but it has an equivalent + // encoding; see ARMISD::MEMBARRIER_MCR.) + setMaxAtomicSizeInBitsSupported(64); + } else if (Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) { + // Cortex-M (besides Cortex-M0) have 32-bit atomics. + setMaxAtomicSizeInBitsSupported(32); + } else { + // We can't assume anything about other targets; just use libatomic + // routines. 
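setMaxAtomicSizeInBitsSupported tells the IR-level AtomicExpand pass the widest atomic the backend can honor inline or through reliable __sync_* routines; anything wider is routed to libatomic. Condensed, the selection in the comment above and the call that follows amount to this (an illustrative mirror of the hunk, maxARMAtomicWidth is not a real function):

// Returns the maximum supported atomic width in bits.
unsigned maxARMAtomicWidth(bool IsLinux, bool IsMClass, bool HasV6,
                           bool HasV8MBaseline) {
  if (IsLinux || (!IsMClass && HasV6))
    return 64; // kernel helpers, or ldrexd/strexd-capable cores
  if (IsMClass && HasV8MBaseline)
    return 32; // ldrex/strex only
  return 0;    // no assumptions; libatomic handles every size
}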
+ setMaxAtomicSizeInBitsSupported(0); + } + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. @@ -1545,12 +1552,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine( + {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR}); if (Subtarget->hasMVEIntegerOps()) setTargetDAGCombine(ISD::VSELECT); @@ -1559,6 +1562,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) setTargetDAGCombine(ISD::SHL); + // Attempt to lower smin/smax to ssat/usat + if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || + Subtarget->isThumb2()) { + setTargetDAGCombine({ISD::SMIN, ISD::SMAX}); + } setStackPointerRegisterToSaveRestore(ARM::SP); @@ -1901,13 +1909,14 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, - unsigned &PrefAlign) const { + Align &PrefAlign) const { if (!isa(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. - PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); + PrefAlign = + (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4)); return true; } @@ -2326,7 +2335,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Lower 'returns_twice' calls to a pseudo-instruction. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && - !Subtarget->getNoBTIAtReturnTwice()) + !Subtarget->noBTIAtReturnTwice()) GuardWithBTI = AFI->branchTargetEnforcement(); // Determine whether this is a non-secure function call. @@ -2778,25 +2787,23 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
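Several hunks in this file migrate raw unsigned alignments to llvm::Align, which is guaranteed to be a non-zero power of two, so the old "0 means unknown" convention (compare the deleted "if (Alignment == 0)" in TryCombineBaseUpdate later in this diff) disappears at compile time. A small standalone illustration of the conversions these hunks use:

#include "llvm/Support/Alignment.h"
#include <cstdint>
using llvm::Align;

void alignDemo() {
  Align A(8);               // asserts unless the value is a power of two
  uint64_t Raw = A.value(); // back to a plain integer for older APIs
  Align B = llvm::commonAlignment(A, 4); // alignment valid at A + 4: Align(4)
  (void)Raw;
  (void)B;
}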
- if (!isTailCall) { - const uint32_t *Mask; - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - if (isThisReturn) { - // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(MF, CallConv); - if (!Mask) { - // Set isThisReturn to false if the calling convention is not one that - // allows 'returned' to be modeled in this way, so LowerCallResult does - // not try to pass 'this' straight through - isThisReturn = false; - Mask = ARI->getCallPreservedMask(MF, CallConv); - } - } else + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); + } + } else + Mask = ARI->getCallPreservedMask(MF, CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - } + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); @@ -4379,7 +4386,7 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, bool ARMTargetLowering::splitValueIntoRegisterParts( SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, Optional CC) const { - bool IsABIRegCopy = CC.hasValue(); + bool IsABIRegCopy = CC.has_value(); EVT ValueVT = Val.getValueType(); if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) { @@ -4397,7 +4404,7 @@ bool ARMTargetLowering::splitValueIntoRegisterParts( SDValue ARMTargetLowering::joinRegisterPartsIntoValue( SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, Optional CC) const { - bool IsABIRegCopy = CC.hasValue(); + bool IsABIRegCopy = CC.has_value(); if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) { unsigned ValueBits = ValueVT.getSizeInBits(); @@ -5547,7 +5554,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (LoadSDNode *Ld = dyn_cast(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getPointerInfo(), Ld->getAlign(), Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); @@ -5567,14 +5574,14 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), - Ld->getAlignment(), Ld->getMemOperand()->getFlags()); + Ld->getAlign(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); - unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, - Ld->getPointerInfo().getWithOffset(4), NewAlign, + Ld->getPointerInfo().getWithOffset(4), + commonAlignment(Ld->getAlign(), 4), Ld->getMemOperand()->getFlags()); return; } @@ -5801,8 +5808,7 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.UnrollVectorOp(Op.getNode()); } - 
const bool HasFullFP16 = - static_cast(DAG.getSubtarget()).hasFullFP16(); + const bool HasFullFP16 = DAG.getSubtarget().hasFullFP16(); EVT NewTy; const EVT OpTy = Op.getOperand(0).getValueType(); @@ -5912,8 +5918,7 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { Op.getOperand(0).getValueType() == MVT::v8i16) && "Invalid type for custom lowering!"); - const bool HasFullFP16 = - static_cast(DAG.getSubtarget()).hasFullFP16(); + const bool HasFullFP16 = DAG.getSubtarget().hasFullFP16(); EVT DestVecType; if (VT == MVT::v4f32) @@ -9359,15 +9364,15 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), - LD->getBasePtr(), LD->getPointerInfo(), - LD->getAlignment(), LD->getMemOperand()->getFlags()); + LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(), + LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), - LD->getMemoryVT(), LD->getAlignment(), + LD->getMemoryVT(), LD->getAlign(), LD->getMemOperand()->getFlags()); } @@ -9876,7 +9881,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (N->getOpcode() != ISD::SDIV) return SDValue(); - const auto &ST = static_cast(DAG.getSubtarget()); + const auto &ST = DAG.getSubtarget(); const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); @@ -10311,6 +10316,15 @@ SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({Result, Chain}, dl); } +SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + EVT VT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + int FI = MFI.CreateFixedObject(4, 0, false); + return DAG.getFrameIndex(FI, VT); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10424,6 +10438,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); + case ISD::SPONENTRY: + return LowerSPONENTRY(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } @@ -10509,9 +10525,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); - case ISD::ABS: - lowerABS(N, Results, DAG); - return ; case ISD::LOAD: LowerLOAD(N, Results, DAG); break; @@ -12170,7 +12183,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (Subtarget->isThumb1Only()) { for (unsigned c = MCID->getNumOperands() - 4; c--;) { MI.addOperand(MI.getOperand(1)); - MI.RemoveOperand(1); + MI.removeOperand(1); } // Restore the ties @@ -12208,7 +12221,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, definesCPSR = true; if (MO.isDead()) deadCPSR = true; - MI.RemoveOperand(i); + MI.removeOperand(i); break; } } @@ -14775,14 +14788,14 @@ static SDValue 
PerformVMOVRRDCombine(SDNode *N, SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), - LD->getAlignment(), LD->getMemOperand()->getFlags()); + LD->getAlign(), LD->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, LD->getPointerInfo().getWithOffset(4), - std::min(4U, LD->getAlignment()), + commonAlignment(LD->getAlign(), 4), LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); @@ -15352,6 +15365,10 @@ static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { case ISD::MULHU: case ISD::ABDS: case ISD::ABDU: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: break; default: return SDValue(); @@ -15721,7 +15738,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, // Now, create a _UPD node, taking care of not breaking alignment. EVT AlignedVecTy = VecTy; - unsigned Alignment = MemN->getAlignment(); + Align Alignment = MemN->getAlign(); // If this is a less-than-standard-aligned load/store, change the type to // match the standard alignment. @@ -15738,10 +15755,8 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, // memory type to match the explicit alignment. That way, we don't // generate non-standard-aligned ARMISD::VLDx nodes. if (isa(N)) { - if (Alignment == 0) - Alignment = 1; - if (Alignment < VecTy.getScalarSizeInBits() / 8) { - MVT EltTy = MVT::getIntegerVT(Alignment * 8); + if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8); assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); assert(!isLaneOp && "Unexpected generic load/store lane."); unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); @@ -15754,7 +15769,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, // alignment of the memory type. // Intrinsics, however, always get an explicit alignment, set to the // alignment of the MMO. - Alignment = 1; + Alignment = Align(1); } // Create the new updating load/store node. @@ -15787,7 +15802,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, } // For all node types, the alignment operand is always the last one. - Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); + Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32)); // If this is a non-standard-aligned STORE, the penultimate operand is the // stored value. Bitcast it to the aligned type. @@ -15965,10 +15980,10 @@ static SDValue CombineBaseUpdate(SDNode *N, // Try to fold with other users. Non-constant updates are considered // first, and constant updates are sorted to not break a sequence of // strided accesses (if there is any). 
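The switch to std::stable_sort just below is load-bearing for that comment: users with equal ConstInc keys must keep their original relative order so a run of strided accesses is not reshuffled, and std::sort makes no ordering guarantee for equal elements. A standalone illustration (User here is a local toy struct, not the LLVM type):

#include <algorithm>
#include <cassert>
#include <vector>

struct User { int ConstInc; char Tag; };

int main() {
  std::vector<User> Us = {{4, 'a'}, {0, 'b'}, {4, 'c'}};
  std::stable_sort(Us.begin(), Us.end(),
                   [](const User &L, const User &R) {
                     return L.ConstInc < R.ConstInc;
                   });
  // Equal keys keep program order: 'a' is still ahead of 'c'.
  assert(Us[0].Tag == 'b' && Us[1].Tag == 'a' && Us[2].Tag == 'c');
}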
- std::sort(BaseUpdates.begin(), BaseUpdates.end(), - [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) { - return LHS.ConstInc < RHS.ConstInc; - }); + std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(), + [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { + return LHS.ConstInc < RHS.ConstInc; + }); for (BaseUpdateUser &User : BaseUpdates) { if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) return SDValue(); @@ -16258,7 +16273,7 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, if (LD && Op.hasOneUse() && LD->isUnindexed() && LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), - DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)}; + DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)}; SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, @@ -16360,7 +16375,7 @@ static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); + St->getAlign(), St->getMemOperand()->getFlags()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); @@ -16608,7 +16623,7 @@ static SDValue PerformSTORECombine(SDNode *N, DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags(), St->getAAInfo()); } @@ -16690,14 +16705,16 @@ static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); - // The identity element for a fadd is -0.0, which these VMOV's represent. - auto isNegativeZeroSplat = [&](SDValue Op) { + // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set, + // which these VMOV's represent. 
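The widened predicate rests on IEEE-754 signed zeros: x + (-0.0) == x for every x, while x + (+0.0) loses the sign of x == -0.0, so +0.0 only qualifies as an identity once the fadd carries the no-signed-zeros (nsz) flag. A quick standalone check of that claim:

#include <cassert>
#include <cmath>

int main() {
  assert(-5.0 + -0.0 == -5.0);
  // -0.0 + +0.0 rounds to +0.0, so +0.0 is not a universal identity:
  assert(!std::signbit(-0.0 + +0.0));
  // whereas adding -0.0 preserves the sign of a negative zero:
  assert(std::signbit(-0.0 + -0.0));
}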
+ auto isIdentitySplat = [&](SDValue Op, bool NSZ) { if (Op.getOpcode() != ISD::BITCAST || Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) return false; - if (VT == MVT::v4f32 && Op.getOperand(0).getConstantOperandVal(0) == 1664) + uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0); + if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ))) return true; - if (VT == MVT::v8f16 && Op.getOperand(0).getConstantOperandVal(0) == 2688) + if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ))) return true; return false; }; @@ -16705,12 +16722,17 @@ static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) std::swap(Op0, Op1); - if (Op1.getOpcode() != ISD::VSELECT || - !isNegativeZeroSplat(Op1.getOperand(2))) + if (Op1.getOpcode() != ISD::VSELECT) + return SDValue(); + + SDNodeFlags FaddFlags = N->getFlags(); + bool NSZ = FaddFlags.hasNoSignedZeros(); + if (!isIdentitySplat(Op1.getOperand(2), NSZ)) return SDValue(); + SDValue FAdd = - DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), N->getFlags()); - return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0); + DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags); + return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags); } /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) @@ -17060,13 +17082,10 @@ static SDValue PerformVMOVNCombine(SDNode *N, IsTop ? Op1DemandedElts : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) return SDValue(N, 0); - if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -17082,10 +17101,8 @@ static SDValue PerformVQMOVNCombine(SDNode *N, APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) : APInt::getHighBitsSet(2, 1)); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); } @@ -17390,7 +17407,7 @@ static SDValue PerformShiftCombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); - if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) + if (ST->hasMVEIntegerOps()) return SDValue(); int64_t Cnt; @@ -17556,12 +17573,57 @@ static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating +// constant bounds. 
+static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) && + !Subtarget->isThumb2()) + return SDValue(); + + EVT VT = Op.getValueType(); + SDValue Op0 = Op.getOperand(0); + + if (VT != MVT::i32 || + (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) || + !isa(Op.getOperand(1)) || + !isa(Op0.getOperand(1))) + return SDValue(); + + SDValue Min = Op; + SDValue Max = Op0; + SDValue Input = Op0.getOperand(0); + if (Min.getOpcode() == ISD::SMAX) + std::swap(Min, Max); + + APInt MinC = Min.getConstantOperandAPInt(1); + APInt MaxC = Max.getConstantOperandAPInt(1); + + if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX || + !(MinC + 1).isPowerOf2()) + return SDValue(); + + SDLoc DL(Op); + if (MinC == ~MaxC) + return DAG.getNode(ARMISD::SSAT, DL, VT, Input, + DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); + if (MaxC == 0) + return DAG.getNode(ARMISD::USAT, DL, VT, Input, + DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); + + return SDValue(); +} + /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating /// saturates. static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); + + if (VT == MVT::i32) + return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST); + if (!ST->hasMVEIntegerOps()) return SDValue(); @@ -19354,8 +19416,8 @@ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Return false to prevent folding // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine, // if the folding leads to worse code. -bool ARMTargetLowering::isMulAddWithConstProfitable( - const SDValue &AddNode, const SDValue &ConstNode) const { +bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const { // Let the DAGCombiner decide for vector types and large types. const EVT VT = AddNode.getValueType(); if (VT.isVector() || VT.getScalarSizeInBits() > 32) @@ -20537,38 +20599,6 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; } -void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const { - assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); - MVT HalfT = MVT::i32; - SDLoc dl(N); - SDValue Hi, Lo, Tmp; - - if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || - !isOperationLegalOrCustom(ISD::UADDO, HalfT)) - return ; - - unsigned OpTypeBits = HalfT.getScalarSizeInBits(); - SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); - - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(0, dl, HalfT)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(1, dl, HalfT)); - - Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, - DAG.getConstant(OpTypeBits - 1, dl, - getShiftAmountTy(HalfT, DAG.getDataLayout()))); - Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); - Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, - SDValue(Lo.getNode(), 1)); - Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); - Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); - - Results.push_back(Lo); - Results.push_back(Hi); -} - bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. 
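PerformMinMaxToSatCombine above keys on bounds of the shape MinC = 2^k - 1: when MaxC == ~MinC the clamp is exactly ARM's ssat (signed saturation), and when MaxC == 0 it is usat (unsigned saturation of a signed input). Worked numerically for an 8-bit range, illustrative only (clampS8/clampU8 are local lambdas):

#include <algorithm>
#include <cassert>

int main() {
  // smin(smax(x, -128), 127): MinC = 127, MinC + 1 = 128 is a power of two,
  // and MaxC = -128 == ~127, so this matches the ssat form.
  auto clampS8 = [](int x) { return std::min(std::max(x, -128), 127); };
  assert(clampS8(300) == 127 && clampS8(-300) == -128 && clampS8(5) == 5);

  // smin(smax(x, 0), 255): MinC = 255, MaxC = 0, the usat form.
  auto clampU8 = [](int x) { return std::min(std::max(x, 0), 255); };
  assert(clampU8(300) == 255 && clampU8(-4) == 0 && clampU8(5) == 5);
}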
@@ -20787,24 +20817,24 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Type *ValTy = I.getParamElementType(0); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Type *ValTy = I.getParamElementType(1); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -20932,9 +20962,19 @@ Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. -bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLoweringBase::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + bool has64BitAtomicStore; + if (Subtarget->isMClass()) + has64BitAtomicStore = false; + else if (Subtarget->isThumb()) + has64BitAtomicStore = Subtarget->hasV7Ops(); + else + has64BitAtomicStore = Subtarget->hasV6Ops(); + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return (Size == 64) && !Subtarget->isMClass(); + return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } // Loads and stores less than 64-bits are already atomic; ones above that @@ -20946,9 +20986,17 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + bool has64BitAtomicLoad; + if (Subtarget->isMClass()) + has64BitAtomicLoad = false; + else if (Subtarget->isThumb()) + has64BitAtomicLoad = Subtarget->hasV7Ops(); + else + has64BitAtomicLoad = Subtarget->hasV6Ops(); + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly - : AtomicExpansionKind::None; + return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, @@ -20958,19 +21006,25 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (AI->isFloatingPointOperation()) return AtomicExpansionKind::CmpXChg; - // At -O0, fast-regalloc cannot cope with the live vregs necessary to - // implement atomicrmw without spilling. 
If the target address is also on the - // stack and close enough to the spill slot, this can lead to a situation - // where the monitor always gets cleared and the atomic operation can never - // succeed. So at -O0 lower this operation to a CAS loop. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) - return AtomicExpansionKind::CmpXChg; - unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); - return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) - ? AtomicExpansionKind::LLSC - : AtomicExpansionKind::None; + bool hasAtomicRMW; + if (Subtarget->isMClass()) + hasAtomicRMW = Subtarget->hasV8MBaselineOps(); + else if (Subtarget->isThumb()) + hasAtomicRMW = Subtarget->hasV7Ops(); + else + hasAtomicRMW = Subtarget->hasV6Ops(); + if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) { + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement atomicrmw without spilling. If the target address is also on + // the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 lower this operation to a CAS loop. + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + return AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::LLSC; + } + return AtomicExpansionKind::None; } // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 @@ -20983,8 +21037,13 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); - bool HasAtomicCmpXchg = - !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); + bool HasAtomicCmpXchg; + if (Subtarget->isMClass()) + HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps(); + else if (Subtarget->isThumb()) + HasAtomicCmpXchg = Subtarget->hasV7Ops(); + else + HasAtomicCmpXchg = Subtarget->hasV6Ops(); if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U)) return AtomicExpansionKind::LLSC; @@ -21099,8 +21158,11 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); + CallInst *CI = Builder.CreateCall(Ldrex, Addr); - return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy); + CI->addParamAttr( + 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy)); + return Builder.CreateTruncOrBitCast(CI, ValueTy); } void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( @@ -21138,10 +21200,13 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); - return Builder.CreateCall( + CallInst *CI = Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( Val, Strex->getFunctionType()->getParamType(0)), Addr}); + CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType, + Val->getType())); + return CI; } @@ -21273,7 +21338,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( SmallVector Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlignment())); + Ops.push_back(Builder.getInt32(LI->getAlign().value())); return Builder.CreateCall(VldnFunc, Ops, "vldN"); } else { @@ -21443,7 +21508,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); append_range(Ops, Shuffles); - Ops.push_back(Builder.getInt32(SI->getAlignment())); + Ops.push_back(Builder.getInt32(SI->getAlign().value())); Builder.CreateCall(VstNFunc, Ops); } else { assert((Factor == 2 || Factor == 4) && diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 1c5f8389f57c..10f60ab93ae3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -581,7 +581,7 @@ class VectorType; getRegClassFor(MVT VT, bool isDivergent = false) const override; bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, - unsigned &PrefAlign) const override; + Align &PrefAlign) const override; /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. 
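With opaque pointers the accessed type can no longer be read off the pointer operand, which is why these hunks both attach an elementtype attribute when building ldrex/strex calls and read it back with getParamElementType in getTgtMemIntrinsic. The round trip, condensed from the hunks above into one hypothetical helper (Ldrex, Addr, and ValueTy play the same roles as in emitLoadLinked):

#include "llvm/IR/IRBuilder.h"

// Attach the accessed type to the pointer argument, then recover it the way
// getTgtMemIntrinsic does. Sketch only; mirrors the patch, not new API.
llvm::Type *elementTypeRoundTrip(llvm::IRBuilderBase &Builder,
                                 llvm::Function *Ldrex, llvm::Value *Addr,
                                 llvm::Type *ValueTy) {
  llvm::CallInst *CI = Builder.CreateCall(Ldrex, Addr);
  CI->addParamAttr(0, llvm::Attribute::get(Builder.getContext(),
                                           llvm::Attribute::ElementType,
                                           ValueTy));
  return CI->getParamElementType(0); // ValueTy, without any pointee type
}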
@@ -665,7 +665,8 @@ class VectorType; bool shouldInsertFencesForAtomic(const Instruction *I) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind @@ -713,8 +714,8 @@ class VectorType; Align Alignment, const DataLayout &DL) const; - bool isMulAddWithConstProfitable(const SDValue &AddNode, - const SDValue &ConstNode) const override; + bool isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const override; bool alignLoopsWithOptSize() const override; @@ -845,8 +846,7 @@ class VectorType; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const; - void lowerABS(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const; + SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const; void LowerLOAD(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index ff5afd787c82..c9a2d21bec53 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -1589,9 +1589,9 @@ class VFPXI pattern> + string opc, string asm, string cstr, list pattern> : VFPI { + opc, asm, cstr, pattern> { let PostEncoderMethod = "VFPThumb2PostEncoder"; } @@ -1751,8 +1751,8 @@ class AXSI4 opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, - string asm, list pattern> - : VFPAI { + string asm, string cstr, list pattern> + : VFPAI { // Instruction operands. bits<5> Dd; bits<5> Dm; @@ -1804,7 +1804,7 @@ class ADuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, class ADbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { // Instruction operands. bits<5> Dd; bits<5> Dn; @@ -1862,8 +1862,8 @@ class ADbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, // Single precision, unary, predicated class ASuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, - string asm, list pattern> - : VFPAI { + string asm, string cstr, list pattern> + : VFPAI { // Instruction operands. bits<5> Sd; bits<5> Sm; @@ -1916,14 +1916,14 @@ class ASuIn opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : ASuI { + "", pattern> { list Predicates = [HasVFP2,DontUseNEONForFP]; } // Single precision, binary class ASbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { // Instruction operands. bits<5> Sd; bits<5> Sn; @@ -2000,7 +2000,7 @@ class ASbIn opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, class AHuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { list Predicates = [HasFullFP16]; // Instruction operands. 
@@ -2056,7 +2056,7 @@ class AHuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, // Half precision, binary class AHbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { list Predicates = [HasFullFP16]; // Instruction operands. @@ -2116,7 +2116,7 @@ class AHbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, class AVConv1I opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{19-16} = opcod3; @@ -2149,7 +2149,7 @@ class AVConv1In opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, class AVConvXI opcod1, bits<4> opcod2, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list pattern> - : VFPAI { + : VFPAI { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{4} = 1; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 32a3911d3369..88bb74d1fc54 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -5129,6 +5129,7 @@ let hasNoSchedulingInfo = 1 in def TSB : AInoP<(outs), (ins tsb_opt:$opt), MiscFrm, NoItinerary, "tsb", "\t$opt", []>, Requires<[IsARM, HasV8_4a]> { let Inst{31-0} = 0xe320f012; + let DecoderMethod = "DecodeTSBInstruction"; } } @@ -6387,7 +6388,7 @@ def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>; // Pre-v6, 'mov r0, r0' was used as a NOP encoding. -def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>, +def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg), 0>, Requires<[IsARM, NoV6]>; // MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but @@ -6415,8 +6416,7 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", // 'it' blocks in ARM mode just validate the predicates. The IT itself // is discarded. -def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, - ComplexDeprecationPredicate<"IT">; +def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>; let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), @@ -6476,3 +6476,24 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, let AsmString = "@ COMPILER BARRIER"; let hasNoSchedulingInfo = 1; } + +//===----------------------------------------------------------------------===// +// Instructions used for emitting unwind opcodes on Windows. 
+//===----------------------------------------------------------------------===// +let isPseudo = 1 in { + def SEH_StackAlloc : PseudoInst<(outs), (ins i32imm:$size, i32imm:$wide), NoItinerary, []>, Sched<[]>; + def SEH_SaveRegs : PseudoInst<(outs), (ins i32imm:$mask, i32imm:$wide), NoItinerary, []>, Sched<[]>; + let isTerminator = 1 in + def SEH_SaveRegs_Ret : PseudoInst<(outs), (ins i32imm:$mask, i32imm:$wide), NoItinerary, []>, Sched<[]>; + def SEH_SaveSP : PseudoInst<(outs), (ins i32imm:$reg), NoItinerary, []>, Sched<[]>; + def SEH_SaveFRegs : PseudoInst<(outs), (ins i32imm:$first, i32imm:$last), NoItinerary, []>, Sched<[]>; + let isTerminator = 1 in + def SEH_SaveLR : PseudoInst<(outs), (ins i32imm:$offst), NoItinerary, []>, Sched<[]>; + def SEH_Nop : PseudoInst<(outs), (ins i32imm:$wide), NoItinerary, []>, Sched<[]>; + let isTerminator = 1 in + def SEH_Nop_Ret : PseudoInst<(outs), (ins i32imm:$wide), NoItinerary, []>, Sched<[]>; + def SEH_PrologEnd : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + def SEH_EpilogStart : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + let isTerminator = 1 in + def SEH_EpilogEnd : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; +} diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 1ae0354ffc37..15c33014e988 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2192,36 +2192,29 @@ def subnsw : PatFrag<(ops node:$lhs, node:$rhs), return N->getFlags().hasNoSignedWrap(); }]>; -multiclass MVE_VRHADD_m { +multiclass MVE_VRHADD_m { def "" : MVE_VRHADD_Base; defvar Inst = !cast(NAME); + defm : MVE_TwoOpPattern(NAME)>; let Predicates = [HasMVEInt] in { - // Unpredicated rounding add-with-divide-by-two + // Unpredicated rounding add-with-divide-by-two intrinsic def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - - // Predicated add-with-divide-by-two - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg, - (VTI.Vec MQPR:$inactive)))>; } } -multiclass MVE_VRHADD - : MVE_VRHADD_m; +multiclass MVE_VRHADD + : MVE_VRHADD_m; -defm MVE_VRHADDs8 : MVE_VRHADD; -defm MVE_VRHADDs16 : MVE_VRHADD; -defm MVE_VRHADDs32 : MVE_VRHADD; -defm MVE_VRHADDu8 : MVE_VRHADD; -defm MVE_VRHADDu16 : MVE_VRHADD; -defm MVE_VRHADDu32 : MVE_VRHADD; +defm MVE_VRHADDs8 : MVE_VRHADD; +defm MVE_VRHADDs16 : MVE_VRHADD; +defm MVE_VRHADDs32 : MVE_VRHADD; +defm MVE_VRHADDu8 : MVE_VRHADD; +defm MVE_VRHADDu16 : MVE_VRHADD; +defm MVE_VRHADDu32 : MVE_VRHADD; // Rounding Halving Add perform the arithemtic operation with an extra bit of // precision, before performing the shift, to void clipping errors. 
We're not @@ -2303,11 +2296,12 @@ class MVE_VHSUB_ size, list pattern=[]> : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; -multiclass MVE_VHADD_m { def "" : MVE_VHADD_; defvar Inst = !cast(NAME); + defm : MVE_TwoOpPattern(NAME)>; let Predicates = [HasMVEInt] in { // Unpredicated add-and-divide-by-two @@ -2316,30 +2310,23 @@ multiclass MVE_VHADD_m; - - // Predicated add-and-divide-by-two - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg, - (VTI.Vec MQPR:$inactive)))>; } } -multiclass MVE_VHADD - : MVE_VHADD_m + : MVE_VHADD_m; // Halving add/sub perform the arithemtic operation with an extra bit of // precision, before performing the shift, to void clipping errors. We're not // modelling that here with these patterns, but we're using no wrap forms of // add/sub to ensure that the extra bit of information is not needed. -defm MVE_VHADDs8 : MVE_VHADD; -defm MVE_VHADDs16 : MVE_VHADD; -defm MVE_VHADDs32 : MVE_VHADD; -defm MVE_VHADDu8 : MVE_VHADD; -defm MVE_VHADDu16 : MVE_VHADD; -defm MVE_VHADDu32 : MVE_VHADD; +defm MVE_VHADDs8 : MVE_VHADD; +defm MVE_VHADDs16 : MVE_VHADD; +defm MVE_VHADDs32 : MVE_VHADD; +defm MVE_VHADDu8 : MVE_VHADD; +defm MVE_VHADDu16 : MVE_VHADD; +defm MVE_VHADDu32 : MVE_VHADD; multiclass MVE_VHSUB_m { +multiclass MVE_VHADDSUB_qr_m { def "" : MVE_VxADDSUB_qr; + defm : MVE_TwoOpPatternDup(NAME)>; defm : MVE_vec_scalar_int_pat_m(NAME), VTI, unpred_int, pred_int, 1, 1>; defvar Inst = !cast(NAME); @@ -5386,20 +5373,20 @@ multiclass MVE_VHADDSUB_qr_m : - MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, - add_op, shift_op>; +multiclass MVE_VHADD_qr_m : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, Op, int_arm_mve_vhadd, + int_arm_mve_hadd_predicated, add_op, shift_op>; multiclass MVE_VHSUB_qr_m : - MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, - add_op, shift_op>; - -defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, null_frag, int_arm_mve_vhsub, + int_arm_mve_hsub_predicated, add_op, shift_op>; + +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 357aa6d062e9..cdad8e106de6 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -6946,6 +6946,9 @@ def VCVTh2f : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0, v4f32, v4i16, int_arm_neon_vcvthf2fp>, Requires<[HasNEON, HasFP16]>; +def : Pat<(v4f16 (fpround (v4f32 QPR:$src))), (VCVTf2h QPR:$src)>; +def : Pat<(v4f32 (fpextend (v4f16 DPR:$src))), (VCVTh2f DPR:$src)>; + // Vector Reverse. 
// VREV64 : Vector Reverse elements within 64-bit doublewords diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index f80b9a5053f7..20d8a45aaf49 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -3561,6 +3561,7 @@ let hasNoSchedulingInfo = 1 in def t2TSB : T2I<(outs), (ins tsb_opt:$opt), NoItinerary, "tsb", "\t$opt", []>, Requires<[IsThumb, HasV8_4a]> { let Inst{31-0} = 0xf3af8012; + let DecoderMethod = "DecodeTSBInstruction"; } } @@ -3950,6 +3951,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, // Tail calls. The MachO version of thumb tail calls uses a t2 branch, so // it goes here. +// Windows SEH unwinding also needs a strict t2 branch for tail calls. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS version. let Uses = [SP] in @@ -3957,15 +3959,14 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { (ins thumb_br_target:$dst, pred:$p), 4, IIC_Br, [], (t2B thumb_br_target:$dst, pred:$p)>, - Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>; + Requires<[IsThumb2]>, Sched<[WriteBr]>; } // IT block let Defs = [ITSTATE] in def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), AddrModeNone, 2, IIC_iALUx, - "it$mask\t$cc", "", []>, - ComplexDeprecationPredicate<"IT"> { + "it$mask\t$cc", "", []> { // 16-bit instruction. let Inst{31-16} = 0x0000; let Inst{15-8} = 0b10111111; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index dc5f1b92a6c2..b233555d5225 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -584,12 +584,12 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b), let Defs = [FPSCR_NZCV] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), - IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", "", [(arm_cmpfpe DPR:$Dd, (f64 DPR:$Dm))]>; def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$Sd, SPR:$Sm), - IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", "", [(arm_cmpfpe SPR:$Sd, SPR:$Sm)]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -603,12 +603,12 @@ def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0, def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$Dd, DPR:$Dm), - IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", + IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", "", [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$Sd, SPR:$Sm), - IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", + IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", "", [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -627,7 +627,7 @@ def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0, def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), - IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm", + IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm", "", [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>; def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, @@ -647,7 +647,7 @@ def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0, let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), - IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", "", [(arm_cmpfpe0 (f64 DPR:$Dd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -655,7 +655,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$Sd), - IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", "", [(arm_cmpfpe0 SPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -675,7 +675,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$Dd), - IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", + IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", "", [(arm_cmpfp0 (f64 DPR:$Dd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -683,7 +683,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$Sd), - IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", + IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", "", [(arm_cmpfp0 SPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -704,7 +704,7 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), - IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", + IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", [(set DPR:$Dd, (fpextend SPR:$Sm))]>, Sched<[WriteFPCVT]> { // Instruction operands. @@ -723,7 +723,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, // Special case encoding: bits 11-8 is 0b1011. def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, - IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", + IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", [(set SPR:$Sd, (fpround DPR:$Dm))]>, Sched<[WriteFPCVT]> { // Instruction operands. @@ -749,7 +749,7 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // Between half, single and double-precision. 
let hasSideEffects = 0 in def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), - /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", + /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; @@ -760,26 +760,30 @@ def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; let hasSideEffects = 0 in -def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), - /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", +def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; def : FP16Pat<(f16 (fpround SPR:$Sm)), - (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>; + (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), - (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; + (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), - (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), + (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), + SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), - (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), + (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), + SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; let hasSideEffects = 0 in def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), - /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", + /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; @@ -792,22 +796,26 @@ def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (SSubReg_f16_reg imm_odd:$lane)))>; let hasSideEffects = 0 in -def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), - /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", +def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), - (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), + (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), + SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), - (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), + (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), + SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), - NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", + NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", "", [/* Intentionally left blank, see patterns 
below */]>, Requires<[HasFPARMv8, HasDPVFP]>, Sched<[WriteFPCVT]> { @@ -829,8 +837,8 @@ def : FP16Pat<(f64 (f16_to_fp GPR:$a)), Requires<[HasFPARMv8, HasDPVFP]>; def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, - (outs SPR:$Sd), (ins DPR:$Dm), - NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", + (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), + NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFPARMv8, HasDPVFP]> { // Instruction operands. @@ -847,15 +855,15 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, } def : FullFP16Pat<(f16 (fpround DPR:$Dm)), - (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>, + (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), - (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>, + (i32 (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$a), GPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), - NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", + NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", "", []>, Requires<[HasFPARMv8, HasDPVFP]> { // Instruction operands. bits<5> Sm; @@ -868,8 +876,8 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, } def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, - (outs SPR:$Sd), (ins DPR:$Dm), - NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", + (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), + NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", []>, Requires<[HasFPARMv8, HasDPVFP]> { // Instruction operands. bits<5> Sd; @@ -990,7 +998,7 @@ defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), - IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", + IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", "", [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>; def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, @@ -1019,7 +1027,7 @@ multiclass vrint_inst_zrx { def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), - NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm", + NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm", "", [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>, Requires<[HasFPARMv8]> { let Inst{7} = op2; @@ -1027,7 +1035,7 @@ multiclass vrint_inst_zrx { } def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), - NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm", + NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm", "", [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>, Requires<[HasFPARMv8, HasDPVFP]> { let Inst{7} = op2; @@ -1094,13 +1102,13 @@ defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), - IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", + IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), - IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", + IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; @@ -1113,12 +1121,12 @@ let hasSideEffects = 0 in { let isMoveReg = 1 in { def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), - IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>, + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", "", []>, Requires<[HasFPRegs64]>; def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), - IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>, + 
IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", "", []>, Requires<[HasFPRegs]>; } // isMoveReg @@ -1984,7 +1992,7 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, class BF16_VCVT op7_6> : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm), VFPUnaryFrm, NoItinerary, - opc, ".bf16.f32\t$Sd, $Sm", []>, + opc, ".bf16.f32\t$Sd, $Sm", "", []>, RegConstraint<"$dst = $Sd">, Requires<[HasBF16]>, Sched<[]> { @@ -2440,7 +2448,7 @@ def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p), class MovFromVFP opc19_16, dag oops, dag iops, string opc, string asm, list pattern>: - VFPAI { + VFPAI { // Instruction operand. bits<4> Rt; @@ -2525,7 +2533,7 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in { class MovToVFP opc19_16, dag oops, dag iops, string opc, string asm, list pattern>: - VFPAI { + VFPAI { // Instruction operand. bits<4> Rt; @@ -2598,7 +2606,7 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in { let isReMaterializable = 1 in { def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm), VFPMiscFrm, IIC_fpUNA64, - "vmov", ".f64\t$Dd, $imm", + "vmov", ".f64\t$Dd, $imm", "", [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3,HasDPVFP]> { bits<5> Dd; @@ -2617,7 +2625,7 @@ def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm), def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), VFPMiscFrm, IIC_fpUNA32, - "vmov", ".f32\t$Sd, $imm", + "vmov", ".f32\t$Sd, $imm", "", [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> { bits<5> Sd; bits<8> imm; @@ -2635,7 +2643,7 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm), VFPMiscFrm, IIC_fpUNA16, - "vmov", ".f16\t$Sd, $imm", + "vmov", ".f16\t$Sd, $imm", "", [(set (f16 HPR:$Sd), vfp_f16imm:$imm)]>, Requires<[HasFullFP16]> { bits<5> Sd; diff --git a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp index 188b5562cac9..1c44893581f9 100644 --- a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp @@ -624,12 +624,12 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, bool UseMovt = STI.useMovt(); - unsigned Size = TM.getPointerSize(0); + LLT PtrTy = MRI.getType(MIB->getOperand(0).getReg()); const Align Alignment(4); - auto addOpsForConstantPoolLoad = [&MF, Alignment, - Size](MachineInstrBuilder &MIB, - const GlobalValue *GV, bool IsSBREL) { + auto addOpsForConstantPoolLoad = [&MF, Alignment, PtrTy]( + MachineInstrBuilder &MIB, + const GlobalValue *GV, bool IsSBREL) { assert((MIB->getOpcode() == ARM::LDRi12 || MIB->getOpcode() == ARM::t2LDRpci) && "Unsupported instruction"); @@ -644,7 +644,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0) .addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, - Size, Alignment)); + PtrTy, Alignment)); if (MIB->getOpcode() == ARM::LDRi12) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -733,7 +733,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, // Add the offset to the SB register. MIB->setDesc(TII.get(Opcodes.ADDrr)); - MIB->RemoveOperand(1); + MIB->removeOperand(1); MIB.addReg(ARM::R9) // FIXME: don't hardcode R9 .addReg(Offset) .add(predOps(ARMCC::AL)) @@ -748,7 +748,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, } else { // Load the global's address from the constant pool. 
MIB->setDesc(TII.get(Opcodes.ConstPoolLoad)); - MIB->RemoveOperand(1); + MIB->removeOperand(1); addOpsForConstantPoolLoad(MIB, GV, /*IsSBREL*/ false); } } else if (STI.isTargetMachO()) { @@ -997,7 +997,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) { auto CPIndex = ConstPool->getConstantPoolIndex(I.getOperand(1).getFPImm(), Alignment); MIB->setDesc(TII.get(LoadOpcode)); - MIB->RemoveOperand(1); + MIB->removeOperand(1); MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0) .addMemOperand( MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index de88ffab1c28..52b6b6f3bcf7 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -14,6 +14,7 @@ #include "ARMCallLowering.h" #include "ARMSubtarget.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ef5fc12feb54..0a38f5633ae3 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -33,6 +34,7 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -2108,7 +2110,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { return false; MF = &Fn; - STI = &static_cast(Fn.getSubtarget()); + STI = &Fn.getSubtarget(); TL = STI->getTargetLowering(); AFI = Fn.getInfo(); TII = STI->getInstrInfo(); @@ -2199,7 +2201,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { return false; TD = &Fn.getDataLayout(); - STI = &static_cast(Fn.getSubtarget()); + STI = &Fn.getSubtarget(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MRI = &Fn.getRegInfo(); @@ -2894,10 +2896,12 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg " << Base.virtRegIndex() << "\n"); - // Make sure that Increment has no uses before BaseAccess. + // Make sure that Increment has no uses before BaseAccess that are not PHI + // uses. 
for (MachineInstr &Use : MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) { - if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) { + if (&Use == BaseAccess || (Use.getOpcode() != TargetOpcode::PHI && + !DT->dominates(BaseAccess, &Use))) { LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n"); return false; } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index f822672c4477..aa739db44da2 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -59,8 +59,10 @@ #include "MVETailPredUtils.h" #include "Thumb2InstrInfo.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" @@ -1297,7 +1299,7 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { - const ARMSubtarget &ST = static_cast(mf.getSubtarget()); + const ARMSubtarget &ST = mf.getSubtarget(); if (!ST.hasLOB()) return false; diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index 308d5e7889f2..9596e88deb18 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -73,3 +73,10 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(MF.getFunction()); } + +MachineFunctionInfo * +ARMFunctionInfo::clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap + &Src2DstMBB) const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index d8d937055d23..e906fea1a810 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -86,6 +86,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills /// areas. 
unsigned FPCXTSaveSize = 0; + unsigned FRSaveSize = 0; unsigned GPRCS1Size = 0; unsigned GPRCS2Size = 0; unsigned DPRCSAlignGapSize = 0; @@ -158,6 +159,11 @@ public: explicit ARMFunctionInfo(MachineFunction &MF); + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } bool isThumb2Function() const { return isThumb && hasThumb2; } @@ -198,12 +204,14 @@ public: void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; } unsigned getFPCXTSaveAreaSize() const { return FPCXTSaveSize; } + unsigned getFrameRecordSavedAreaSize() const { return FRSaveSize; } unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; } unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; } unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; } unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; } void setFPCXTSaveAreaSize(unsigned s) { FPCXTSaveSize = s; } + void setFrameRecordSavedAreaSize(unsigned s) { FRSaveSize = s; } void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; } void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; } void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; } diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 46baf8930939..6effd84041b5 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -459,6 +459,10 @@ bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { if (ValidLHS && ValidRHS) return true; + // Ensure we don't add the root as the incoming accumulator. + if (R.getRoot() == I) + return false; + return R.InsertAcc(I); } case Instruction::Mul: { @@ -535,6 +539,7 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { InsertParallelMACs(R); Changed = true; AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB); } } diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index 1a7f10a13ed3..527fefbd291e 100644 --- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -13,9 +13,9 @@ #include "ARMRegisterBankInfo.h" #include "ARMInstrInfo.h" // For the register classes #include "ARMSubtarget.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL @@ -129,8 +129,7 @@ static void checkValueMappings() { } // end namespace arm } // end namespace llvm -ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) - : ARMGenRegisterBankInfo() { +ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) { // We have only one set of register banks, whatever the subtarget // is. Therefore, the initialization of the RegBanks table should be // done only once. 
Indeed the table of all register banks diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h index b8aff65a967e..c56134aab38c 100644 --- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h +++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H #define LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "ARMGenRegisterBank.inc" diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp index ff4647dd46fd..d1d30e614fc9 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -15,4 +15,4 @@ using namespace llvm; void ARMRegisterInfo::anchor() { } -ARMRegisterInfo::ARMRegisterInfo() {} +ARMRegisterInfo::ARMRegisterInfo() = default; diff --git a/llvm/lib/Target/ARM/ARMSLSHardening.cpp b/llvm/lib/Target/ARM/ARMSLSHardening.cpp index 332acb453124..fa80b75484e1 100644 --- a/llvm/lib/Target/ARM/ARMSLSHardening.cpp +++ b/llvm/lib/Target/ARM/ARMSLSHardening.cpp @@ -322,8 +322,8 @@ MachineBasicBlock &ARMSLSHardening::ConvertIndirectCallToIndirectJump( assert(ImpSPOpIdx != -1); int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx); int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx); - BL->RemoveOperand(FirstOpIdxToRemove); - BL->RemoveOperand(SecondOpIdxToRemove); + BL->removeOperand(FirstOpIdxToRemove); + BL->removeOperand(SecondOpIdxToRemove); // Now copy over the implicit operands from the original IndirectCall BL->copyImplicitOps(MF, IndirectCall); MF.moveCallSiteInfo(&IndirectCall, BL); diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 12d4ad889897..379521752261 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -296,7 +296,7 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const { const ARMSubtarget &Subtarget = @@ -314,6 +314,9 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( DAG.getZExtOrTrunc(Size, dl, MVT::i32)); } - return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, - Alignment.value(), RTLIB::MEMSET); + if (!AlwaysInline) + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMSET); + + return SDValue(); } diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index 7aa831c09248..ffa8b5049351 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -55,6 +55,7 @@ public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, SDValue Op3, Align Alignment, bool isVolatile, + bool AlwaysInline, MachinePointerInfo DstPtrInfo) const override; SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 32160b109343..79244f634ce3 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/Triple.h" #include 
"llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -52,19 +53,15 @@ UseFusedMulOps("arm-use-mulops", enum ITMode { DefaultIT, - RestrictedIT, - NoRestrictedIT + RestrictedIT }; static cl::opt -IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), - cl::ZeroOrMore, - cl::values(clEnumValN(DefaultIT, "arm-default-it", - "Generate IT block based on arch"), - clEnumValN(RestrictedIT, "arm-restrict-it", - "Disallow deprecated IT based on ARMv8"), - clEnumValN(NoRestrictedIT, "arm-no-restrict-it", - "Allow IT blocks based on ARMv7"))); + IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), + cl::values(clEnumValN(DefaultIT, "arm-default-it", + "Generate any type of IT block"), + clEnumValN(RestrictedIT, "arm-restrict-it", + "Disallow complex IT blocks"))); /// ForceFastISel - Use the fast-isel, even for subtargets where it is not /// currently supported (for testing only). @@ -237,21 +234,18 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { switch (IT) { case DefaultIT: - RestrictIT = hasV8Ops() && !hasMinSize(); + RestrictIT = false; break; case RestrictedIT: RestrictIT = true; break; - case NoRestrictedIT: - RestrictIT = false; - break; } // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default. const FeatureBitset &Bits = getFeatureBits(); if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters (Options.UnsafeFPMath || isTargetDarwin())) - UseNEONForSinglePrecisionFP = true; + HasNEONForFP = true; if (isRWPI()) ReserveR9 = true; @@ -399,6 +393,14 @@ bool ARMSubtarget::enableSubRegLiveness() const { return hasMVEIntegerOps(); } +bool ARMSubtarget::enableMachinePipeliner() const { + // Enable the MachinePipeliner before register allocation for subtargets + // with the use-mipipeliner feature. + return getSchedModel().hasInstrSchedModel() && useMachinePipeliner(); +} + +bool ARMSubtarget::useDFAforSMS() const { return false; } + // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { if (enableMachineScheduler()) @@ -417,8 +419,6 @@ bool ARMSubtarget::enablePostRAMachineScheduler() const { return !isThumb1Only(); } -bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); } - bool ARMSubtarget::useStride4VFPs() const { // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). 
But WatchOS uses a compact unwind @@ -491,3 +491,12 @@ bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF, return isThumb2() && MF.getFunction().hasMinSize() && ARM::GPRRegClass.contains(PhysReg); } + +bool ARMSubtarget::splitFramePointerPush(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || + !F.needsUnwindTableEntry()) + return false; + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF); +} diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 7cbdc014299f..460ec62d5a33 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -25,8 +25,8 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCSchedule.h" @@ -150,6 +150,11 @@ public: }; protected: +// Bool members corresponding to the SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "ARMGenSubtargetInfo.inc" + /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. ARMProcFamilyEnum ARMProcFamily = Others; @@ -159,343 +164,22 @@ protected: /// ARMArch - ARM architecture ARMArchEnum ARMArch = ARMv4t; - /// HasV4TOps, HasV5TOps, HasV5TEOps, - /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops - - /// Specify whether target support specific ARM ISA variants. - bool HasV4TOps = false; - bool HasV5TOps = false; - bool HasV5TEOps = false; - bool HasV6Ops = false; - bool HasV6MOps = false; - bool HasV6KOps = false; - bool HasV6T2Ops = false; - bool HasV7Ops = false; - bool HasV8Ops = false; - bool HasV8_1aOps = false; - bool HasV8_2aOps = false; - bool HasV8_3aOps = false; - bool HasV8_4aOps = false; - bool HasV8_5aOps = false; - bool HasV8_6aOps = false; - bool HasV8_8aOps = false; - bool HasV8_7aOps = false; - bool HasV9_0aOps = false; - bool HasV9_1aOps = false; - bool HasV9_2aOps = false; - bool HasV9_3aOps = false; - bool HasV8MBaselineOps = false; - bool HasV8MMainlineOps = false; - bool HasV8_1MMainlineOps = false; - bool HasMVEIntegerOps = false; - bool HasMVEFloatOps = false; - bool HasCDEOps = false; - - /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what - /// floating point ISAs are supported. - bool HasVFPv2 = false; - bool HasVFPv3 = false; - bool HasVFPv4 = false; - bool HasFPARMv8 = false; - bool HasNEON = false; - bool HasFPRegs = false; - bool HasFPRegs16 = false; - bool HasFPRegs64 = false; - - /// Versions of the VFP flags restricted to single precision, or to - /// 16 d-registers, or both. - bool HasVFPv2SP = false; - bool HasVFPv3SP = false; - bool HasVFPv4SP = false; - bool HasFPARMv8SP = false; - bool HasVFPv3D16 = false; - bool HasVFPv4D16 = false; - bool HasFPARMv8D16 = false; - bool HasVFPv3D16SP = false; - bool HasVFPv4D16SP = false; - bool HasFPARMv8D16SP = false; - - /// HasDotProd - True if the ARMv8.2A dot product instructions are supported. - bool HasDotProd = false; - - /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been - /// specified. 
Use the method useNEONForSinglePrecisionFP() to - /// determine if NEON should actually be used. - bool UseNEONForSinglePrecisionFP = false; - /// UseMulOps - True if non-microcoded fused integer multiply-add and /// multiply-subtract instructions should be used. bool UseMulOps = false; - /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates - /// whether the FP VML[AS] instructions are slow (if so, don't use them). - bool SlowFPVMLx = false; - - /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates - /// whether the FP VFM[AS] instructions are slow (if so, don't use them). - bool SlowFPVFMx = false; - - /// HasVMLxForwarding - If true, NEON has special multiplier accumulator - /// forwarding to allow mul + mla being issued back to back. - bool HasVMLxForwarding = false; - - /// SlowFPBrcc - True if floating point compare + branch is slow. - bool SlowFPBrcc = false; - - /// InThumbMode - True if compiling for Thumb, false for ARM. - bool InThumbMode = false; - - /// UseSoftFloat - True if we're using software floating point features. - bool UseSoftFloat = false; - - /// UseMISched - True if MachineScheduler should be used for this subtarget. - bool UseMISched = false; - - /// DisablePostRAScheduler - False if scheduling should happen again after - /// register allocation. - bool DisablePostRAScheduler = false; - - /// HasThumb2 - True if Thumb2 instructions are supported. - bool HasThumb2 = false; - - /// NoARM - True if subtarget does not support ARM mode execution. - bool NoARM = false; - - /// ReserveR9 - True if R9 is not available as a general purpose register. - bool ReserveR9 = false; - - /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of - /// 32-bit imms (including global addresses). - bool NoMovt = false; - /// SupportsTailCall - True if the OS supports tail call. The dynamic linker /// must be able to synthesize call stubs for interworking between ARM and /// Thumb. bool SupportsTailCall = false; - /// HasFP16 - True if subtarget supports half-precision FP conversions - bool HasFP16 = false; - - /// HasFullFP16 - True if subtarget supports half-precision FP operations - bool HasFullFP16 = false; - - /// HasFP16FML - True if subtarget supports half-precision FP fml operations - bool HasFP16FML = false; - - /// HasBF16 - True if subtarget supports BFloat16 floating point operations - bool HasBF16 = false; - - /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply - bool HasMatMulInt8 = false; - - /// HasD32 - True if subtarget has the full 32 double precision - /// FP registers for VFPv3. - bool HasD32 = false; - - /// HasHardwareDivide - True if subtarget supports [su]div in Thumb mode - bool HasHardwareDivideInThumb = false; - - /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode - bool HasHardwareDivideInARM = false; - - /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier - /// instructions. - bool HasDataBarrier = false; - - /// HasFullDataBarrier - True if the subtarget supports DFB data barrier - /// instruction. - bool HasFullDataBarrier = false; - - /// HasV7Clrex - True if the subtarget supports CLREX instructions - bool HasV7Clrex = false; - - /// HasAcquireRelease - True if the subtarget supports v8 atomics (LDA/LDAEX etc) - /// instructions - bool HasAcquireRelease = false; - - /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions - /// over 16-bit ones. 
- bool Pref32BitThumb = false; - - /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions - /// that partially update CPSR and add false dependency on the previous - /// CPSR setting instruction. - bool AvoidCPSRPartialUpdate = false; - - /// CheapPredicableCPSRDef - If true, disable +1 predication cost - /// for instructions updating CPSR. Enabled for Cortex-A57. - bool CheapPredicableCPSRDef = false; - - /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting - /// movs with shifter operand (i.e. asr, lsl, lsr). - bool AvoidMOVsShifterOperand = false; - - /// HasRetAddrStack - Some processors perform return stack prediction. CodeGen should - /// avoid issue "normal" call instructions to callees which do not return. - bool HasRetAddrStack = false; - - /// HasBranchPredictor - True if the subtarget has a branch predictor. Having - /// a branch predictor or not changes the expected cost of taking a branch - /// which affects the choice of whether to use predicated instructions. - bool HasBranchPredictor = true; - - /// HasMPExtension - True if the subtarget supports Multiprocessing - /// extension (ARMv7 only). - bool HasMPExtension = false; - - /// HasVirtualization - True if the subtarget supports the Virtualization - /// extension. - bool HasVirtualization = false; - - /// HasFP64 - If true, the floating point unit supports double - /// precision. - bool HasFP64 = false; - - /// If true, the processor supports the Performance Monitor Extensions. These - /// include a generic cycle-counter as well as more fine-grained (often - /// implementation-specific) events. - bool HasPerfMon = false; - - /// HasTrustZone - if true, processor supports TrustZone security extensions - bool HasTrustZone = false; - - /// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions - bool Has8MSecExt = false; - - /// HasSHA2 - if true, processor supports SHA1 and SHA256 - bool HasSHA2 = false; - - /// HasAES - if true, processor supports AES - bool HasAES = false; - - /// HasCrypto - if true, processor supports Cryptography extensions - bool HasCrypto = false; - - /// HasCRC - if true, processor supports CRC instructions - bool HasCRC = false; - - /// HasRAS - if true, the processor supports RAS extensions - bool HasRAS = false; - - /// HasLOB - if true, the processor supports the Low Overhead Branch extension - bool HasLOB = false; - - bool HasPACBTI = false; - - /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are - /// particularly effective at zeroing a VFP register. - bool HasZeroCycleZeroing = false; - - /// HasFPAO - if true, processor does positive address offset computation faster - bool HasFPAO = false; - - /// HasFuseAES - if true, processor executes back to back AES instruction - /// pairs faster. - bool HasFuseAES = false; - - /// HasFuseLiterals - if true, processor executes back to back - /// bottom and top halves of literal generation faster. - bool HasFuseLiterals = false; - - /// If true, if conversion may decide to leave some instructions unpredicated. - bool IsProfitableToUnpredicate = false; - - /// If true, VMOV will be favored over VGETLNi32. - bool HasSlowVGETLNi32 = false; - - /// If true, VMOV will be favored over VDUP. - bool HasSlowVDUP32 = false; - - /// If true, VMOVSR will be favored over VMOVDRR. - bool PreferVMOVSR = false; - - /// If true, ISHST barriers will be used for Release semantics. 
- bool PreferISHST = false; - - /// If true, a VLDM/VSTM starting with an odd register number is considered to - /// take more microops than single VLDRS/VSTRS. - bool SlowOddRegister = false; - - /// If true, loading into a D subregister will be penalized. - bool SlowLoadDSubregister = false; - - /// If true, use a wider stride when allocating VFP registers. - bool UseWideStrideVFP = false; - - /// If true, the AGU and NEON/FPU units are multiplexed. - bool HasMuxedUnits = false; - - /// If true, VMOVS will never be widened to VMOVD. - bool DontWidenVMOVS = false; - - /// If true, splat a register between VFP and NEON instructions. - bool SplatVFPToNeon = false; - - /// If true, run the MLx expansion pass. - bool ExpandMLx = false; - - /// If true, VFP/NEON VMLA/VMLS have special RAW hazards. - bool HasVMLxHazards = false; - - // If true, read thread pointer from coprocessor register. - bool ReadTPHard = false; - - /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON. - bool UseNEONForFPMovs = false; - - /// If true, VLDn instructions take an extra cycle for unaligned accesses. - bool CheckVLDnAlign = false; - - /// If true, VFP instructions are not pipelined. - bool NonpipelinedVFP = false; - - /// StrictAlign - If true, the subtarget disallows unaligned memory - /// accesses for some types. For details, see - /// ARMTargetLowering::allowsMisalignedMemoryAccesses(). - bool StrictAlign = false; - - /// RestrictIT - If true, the subtarget disallows generation of deprecated IT - /// blocks to conform to ARMv8 rule. + /// RestrictIT - If true, the subtarget disallows generation of complex IT + /// blocks. bool RestrictIT = false; - /// HasDSP - If true, the subtarget supports the DSP (saturating arith - /// and such) instructions. - bool HasDSP = false; - - /// NaCl TRAP instruction is generated instead of the regular TRAP. - bool UseNaClTrap = false; - - /// Generate calls via indirect call instructions. - bool GenLongCalls = false; - - /// Generate code that does not contain data access to code sections. - bool GenExecuteOnly = false; - - /// Target machine allowed unsafe FP math (such as use of NEON fp) - bool UnsafeFPMath = false; - /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). bool UseSjLjEH = false; - /// Has speculation barrier - bool HasSB = false; - - /// Implicitly convert an instruction to a different one if its immediates - /// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1. - bool NegativeImmediates = true; - - /// Mitigate against the cve-2021-35465 security vulnurability. - bool FixCMSE_CVE_2021_35465 = false; - - /// Harden against Straight Line Speculation for Returns and Indirect - /// Branches. - bool HardenSlsRetBr = false; - - /// Harden against Straight Line Speculation for indirect calls. - bool HardenSlsBlr = false; - - /// Generate thunk code for SLS mitigation in the normal text section. - bool HardenSlsNoComdat = false; - /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. Align stackAlignment = Align(4); @@ -540,10 +224,6 @@ protected: /// Selected instruction itineraries (one entry per itinerary class.) 
InstrItineraryData InstrItins; - /// NoBTIAtReturnTwice - Don't place a BTI instruction after - /// return-twice constructs (setjmp) - bool NoBTIAtReturnTwice = false; - /// Options passed via command line that could influence the target const TargetOptions &Options; @@ -622,38 +302,12 @@ private: std::bitset<8> CoprocCDE = {}; public: - void computeIssueWidth(); +// Getters for SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const { return ATTRIBUTE; } +#include "ARMGenSubtargetInfo.inc" - bool hasV4TOps() const { return HasV4TOps; } - bool hasV5TOps() const { return HasV5TOps; } - bool hasV5TEOps() const { return HasV5TEOps; } - bool hasV6Ops() const { return HasV6Ops; } - bool hasV6MOps() const { return HasV6MOps; } - bool hasV6KOps() const { return HasV6KOps; } - bool hasV6T2Ops() const { return HasV6T2Ops; } - bool hasV7Ops() const { return HasV7Ops; } - bool hasV8Ops() const { return HasV8Ops; } - bool hasV8_1aOps() const { return HasV8_1aOps; } - bool hasV8_2aOps() const { return HasV8_2aOps; } - bool hasV8_3aOps() const { return HasV8_3aOps; } - bool hasV8_4aOps() const { return HasV8_4aOps; } - bool hasV8_5aOps() const { return HasV8_5aOps; } - bool hasV8_6aOps() const { return HasV8_6aOps; } - bool hasV8_7aOps() const { return HasV8_7aOps; } - bool hasV8_8aOps() const { return HasV8_8aOps; } - bool hasV9_0aOps() const { return HasV9_0aOps; } - bool hasV9_1aOps() const { return HasV9_1aOps; } - bool hasV9_2aOps() const { return HasV9_2aOps; } - bool hasV9_3aOps() const { return HasV9_3aOps; } - bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } - bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } - bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; } - bool hasMVEIntegerOps() const { return HasMVEIntegerOps; } - bool hasMVEFloatOps() const { return HasMVEFloatOps; } - bool hasCDEOps() const { return HasCDEOps; } - bool hasFPRegs() const { return HasFPRegs; } - bool hasFPRegs16() const { return HasFPRegs16; } - bool hasFPRegs64() const { return HasFPRegs64; } + void computeIssueWidth(); /// @{ /// These functions are obsolete, please consider adding subtarget features @@ -673,31 +327,14 @@ public: bool hasARMOps() const { return !NoARM; } - bool hasVFP2Base() const { return HasVFPv2SP; } - bool hasVFP3Base() const { return HasVFPv3D16SP; } - bool hasVFP4Base() const { return HasVFPv4D16SP; } - bool hasFPARMv8Base() const { return HasFPARMv8D16SP; } - bool hasNEON() const { return HasNEON; } - bool hasSHA2() const { return HasSHA2; } - bool hasAES() const { return HasAES; } - bool hasCrypto() const { return HasCrypto; } - bool hasDotProd() const { return HasDotProd; } - bool hasCRC() const { return HasCRC; } - bool hasRAS() const { return HasRAS; } - bool hasLOB() const { return HasLOB; } - bool hasPACBTI() const { return HasPACBTI; } - bool hasVirtualization() const { return HasVirtualization; } - bool useNEONForSinglePrecisionFP() const { - return hasNEON() && UseNEONForSinglePrecisionFP; + return hasNEON() && hasNEONForFP(); } - bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; } - bool hasDivideInARMMode() const { return HasHardwareDivideInARM; } - bool hasDataBarrier() const { return HasDataBarrier; } - bool hasFullDataBarrier() const { return HasFullDataBarrier; } - bool hasV7Clrex() const { return HasV7Clrex; } - bool hasAcquireRelease() const { return HasAcquireRelease; } + bool hasVFP2Base() const { return hasVFPv2SP(); } + bool hasVFP3Base() const { 
return hasVFPv3D16SP(); } + bool hasVFP4Base() const { return hasVFPv4D16SP(); } + bool hasFPARMv8Base() const { return hasFPARMv8D16SP(); } bool hasAnyDataBarrier() const { return HasDataBarrier || (hasV6Ops() && !isThumb()); @@ -710,43 +347,7 @@ public: } bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); } bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); } - bool hasVMLxForwarding() const { return HasVMLxForwarding; } - bool isFPBrccSlow() const { return SlowFPBrcc; } - bool hasFP64() const { return HasFP64; } - bool hasPerfMon() const { return HasPerfMon; } - bool hasTrustZone() const { return HasTrustZone; } - bool has8MSecExt() const { return Has8MSecExt; } - bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } - bool hasFPAO() const { return HasFPAO; } - bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; } - bool hasSlowVGETLNi32() const { return HasSlowVGETLNi32; } - bool hasSlowVDUP32() const { return HasSlowVDUP32; } - bool preferVMOVSR() const { return PreferVMOVSR; } - bool preferISHSTBarriers() const { return PreferISHST; } - bool expandMLx() const { return ExpandMLx; } - bool hasVMLxHazards() const { return HasVMLxHazards; } - bool hasSlowOddRegister() const { return SlowOddRegister; } - bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; } - bool useWideStrideVFP() const { return UseWideStrideVFP; } - bool hasMuxedUnits() const { return HasMuxedUnits; } - bool dontWidenVMOVS() const { return DontWidenVMOVS; } - bool useSplatVFPToNeon() const { return SplatVFPToNeon; } - bool useNEONForFPMovs() const { return UseNEONForFPMovs; } - bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; } - bool nonpipelinedVFP() const { return NonpipelinedVFP; } - bool prefers32BitThumb() const { return Pref32BitThumb; } - bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } - bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } - bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } - bool hasRetAddrStack() const { return HasRetAddrStack; } - bool hasBranchPredictor() const { return HasBranchPredictor; } - bool hasMPExtension() const { return HasMPExtension; } - bool hasDSP() const { return HasDSP; } - bool useNaClTrap() const { return UseNaClTrap; } bool useSjLjEH() const { return UseSjLjEH; } - bool hasSB() const { return HasSB; } - bool genLongCalls() const { return GenLongCalls; } - bool genExecuteOnly() const { return GenExecuteOnly; } bool hasBaseDSP() const { if (isThumb()) return hasDSP(); @@ -754,25 +355,16 @@ public: return hasV5TEOps(); } - bool hasFP16() const { return HasFP16; } - bool hasD32() const { return HasD32; } - bool hasFullFP16() const { return HasFullFP16; } - bool hasFP16FML() const { return HasFP16FML; } - bool hasBF16() const { return HasBF16; } - - bool hasFuseAES() const { return HasFuseAES; } - bool hasFuseLiterals() const { return HasFuseLiterals; } /// Return true if the CPU supports any kind of instruction fusion. 
bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); } - bool hasMatMulInt8() const { return HasMatMulInt8; } - const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } + bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } @@ -825,24 +417,21 @@ public: bool isRWPI() const; bool useMachineScheduler() const { return UseMISched; } - bool disablePostRAScheduler() const { return DisablePostRAScheduler; } - bool useSoftFloat() const { return UseSoftFloat; } - bool isThumb() const { return InThumbMode; } + bool useMachinePipeliner() const { return UseMIPipeliner; } bool hasMinSize() const { return OptMinSize; } - bool isThumb1Only() const { return InThumbMode && !HasThumb2; } - bool isThumb2() const { return InThumbMode && HasThumb2; } - bool hasThumb2() const { return HasThumb2; } + bool isThumb1Only() const { return isThumb() && !hasThumb2(); } + bool isThumb2() const { return isThumb() && hasThumb2(); } bool isMClass() const { return ARMProcClass == MClass; } bool isRClass() const { return ARMProcClass == RClass; } bool isAClass() const { return ARMProcClass == AClass; } - bool isReadTPHard() const { return ReadTPHard; } bool isR9Reserved() const { return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9; } MCPhysReg getFramePointerReg() const { - if (isTargetDarwin() || (!isTargetWindows() && isThumb())) + if (isTargetDarwin() || + (!isTargetWindows() && isThumb() && !createAAPCSFrameChain())) return ARM::R7; return ARM::R11; } @@ -859,6 +448,8 @@ public: isThumb1Only(); } + bool splitFramePointerPush(const MachineFunction &MF) const; + bool useStride4VFPs() const; bool useMovt() const; @@ -878,6 +469,10 @@ public: /// Returns true if machine scheduler should be enabled. bool enableMachineScheduler() const override; + /// Returns true if machine pipeliner should be enabled. + bool enableMachinePipeliner() const override; + bool useDFAforSMS() const override; + /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; @@ -891,9 +486,6 @@ public: /// scheduling, DAGCombine, etc.). bool useAA() const override { return true; } - // enableAtomicExpand- True if we need to expand our atomics. - bool enableAtomicExpand() const override; - /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. 
const InstrItineraryData *getInstrItineraryData() const override { @@ -956,14 +548,6 @@ public: bool ignoreCSRForAllocationOrder(const MachineFunction &MF, unsigned PhysReg) const override; unsigned getGPRAllocationOrder(const MachineFunction &MF) const; - - bool fixCMSE_CVE_2021_35465() const { return FixCMSE_CVE_2021_35465; } - - bool hardenSlsRetBr() const { return HardenSlsRetBr; } - bool hardenSlsBlr() const { return HardenSlsBlr; } - bool hardenSlsNoComdat() const { return HardenSlsNoComdat; } - - bool getNoBTIAtReturnTwice() const { return NoBTIAtReturnTwice; } }; } // end namespace llvm diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index c38970f8e341..d95c21d6504b 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDomainFix.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" @@ -30,20 +31,20 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" +#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" @@ -106,6 +107,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeMVEGatherScatterLoweringPass(Registry); initializeARMSLSHardeningPass(Registry); initializeMVELaneInterleavingPass(Registry); + initializeARMFixCortexA57AES1742098Pass(Registry); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -194,7 +196,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional RM) { - if (!RM.hasValue()) + if (!RM) // Default relocation model on Darwin is PIC. return TT.isOSBinFormatMachO() ? Reloc::PIC_ : Reloc::Static; @@ -307,7 +309,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { } TargetTransformInfo -ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) { +ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(ARMTTIImpl(this, F)); } @@ -434,6 +436,9 @@ void ARMPassConfig::addIRPasses() { // Add Control Flow Guard checks. 
if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); + + if (TM->Options.JMCInstrument) + addPass(createJMCInstrumenterPass()); } void ARMPassConfig::addCodeGenPrepare() { @@ -505,6 +510,9 @@ bool ARMPassConfig::addGlobalInstructionSelect() { void ARMPassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(&MachinePipelinerID); + addPass(createMVETPAndVPTOptimisationsPass()); addPass(createMLxExpansionPass()); @@ -573,8 +581,20 @@ void ARMPassConfig::addPreEmitPass() { } void ARMPassConfig::addPreEmitPass2() { + // Inserts fixup instructions before unsafe AES operations. Instructions may + // be inserted at the start of blocks and within blocks, so this pass has to + // come before those below. + addPass(createARMFixCortexA57AES1742098Pass()); + // Inserts BTIs at the start of functions and indirectly-called basic blocks, + // so passes cannot add to the start of basic blocks once this has run. addPass(createARMBranchTargetsPass()); + // Inserts Constant Islands. Block sizes cannot be increased after this point, + // as this may push the branch ranges and load offsets of accessing constant + // pools out of range. addPass(createARMConstantIslandPass()); + // Finalises Low-Overhead Loops. This replaces pseudo instructions with real + // instructions, but the pseudos all have conservative sizes so that block + // sizes will only be decreased by this pass. addPass(createARMLowOverheadLoopsPass()); if (TM->getTargetTriple().isOSWindows()) { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h index 8428092bf179..8d33a038deeb 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -52,7 +52,7 @@ public: const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index d9d563ead260..3a9946ee810b 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1202,7 +1202,8 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, - int Index, VectorType *SubTp) { + int Index, VectorType *SubTp, + ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasNEON()) { if (Kind == TTI::SK_Broadcast) { @@ -1290,7 +1291,8 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (!Mask.empty()) { std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - if (Mask.size() <= LT.second.getVectorNumElements() && + if (LT.second.isVector() && + Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || isVREVMask(Mask, LT.second, 64))) return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first; @@ -1764,6 +1766,48 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return LT.first * ST->getMVEVectorCostFactor(CostKind); break; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (ICA.getArgTypes().empty()) + break; + bool IsSigned =
ICA.getID() == Intrinsic::fptosi_sat; + auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + EVT MTy = TLI->getValueType(DL, ICA.getReturnType()); + // Check for the legal types, with the correct subtarget features. + if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) || + (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) || + (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32)) + return LT.first; + + // Equally for MVE vector types + if (ST->hasMVEFloatOps() && + (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) && + LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()) + return LT.first * ST->getMVEVectorCostFactor(CostKind); + + // Otherwise we use a legal convert followed by a min+max + if (((ST->hasVFP2Base() && LT.second == MVT::f32) || + (ST->hasFP64() && LT.second == MVT::f64) || + (ST->hasFullFP16() && LT.second == MVT::f16) || + (ST->hasMVEFloatOps() && + (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) && + LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { + Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(), + LT.second.getScalarSizeInBits()); + InstructionCost Cost = + LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1; + IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin + : Intrinsic::umin, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax + : Intrinsic::umax, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + return LT.first * Cost; + } + break; + } } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -1771,7 +1815,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, bool ARMTTIImpl::isLoweredToCall(const Function *F) { if (!F->isIntrinsic()) - BaseT::isLoweredToCall(F); + return BaseT::isLoweredToCall(F); // Assume all Arm-specific intrinsics map to an instruction.
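// That is, anything prefixed "llvm.arm" is expected to expand inline rather
// than turn into a libcall, so it is not treated as a call here.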
if (F->getName().startswith("llvm.arm")) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 5bb84899e5ef..d7a2bdb3db15 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -213,7 +213,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef<const Value *> Args = None); bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c7734cc2cf11..b725ea3a84e5 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -453,6 +453,7 @@ class ARMAsmParser : public MCTargetAsmParser { bool AllowRAAC = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); + bool parseImmExpr(int64_t &Out); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType, unsigned &ShiftAmount); @@ -488,6 +489,17 @@ class ARMAsmParser : public MCTargetAsmParser { bool parseDirectiveAlign(SMLoc L); bool parseDirectiveThumbSet(SMLoc L); + bool parseDirectiveSEHAllocStack(SMLoc L, bool Wide); + bool parseDirectiveSEHSaveRegs(SMLoc L, bool Wide); + bool parseDirectiveSEHSaveSP(SMLoc L); + bool parseDirectiveSEHSaveFRegs(SMLoc L); + bool parseDirectiveSEHSaveLR(SMLoc L); + bool parseDirectiveSEHPrologEnd(SMLoc L, bool Fragment); + bool parseDirectiveSEHNop(SMLoc L, bool Wide); + bool parseDirectiveSEHEpilogStart(SMLoc L, bool Condition); + bool parseDirectiveSEHEpilogEnd(SMLoc L); + bool parseDirectiveSEHCustom(SMLoc L); + bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken); StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken, unsigned &PredicationCode, @@ -4528,9 +4540,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, if (Reg == EndReg) continue; // The register must be in the same register class as the first. - if ((Reg == ARM::RA_AUTH_CODE && - RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || - (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg))) + if (!RC->contains(Reg)) return Error(AfterMinusLoc, "invalid register in register list"); // Ranges must go from low to high. if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg)) @@ -6319,6 +6329,18 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { } } +bool ARMAsmParser::parseImmExpr(int64_t &Out) { + const MCExpr *Expr = nullptr; + SMLoc L = getParser().getTok().getLoc(); + if (check(getParser().parseExpression(Expr), L, "expected expression")) + return true; + const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); + if (check(!Value, L, "expected constant expression")) + return true; + Out = Value->getValue(); + return false; +} + // parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e. // :lower16: and :upper16:.
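// For example, a 32-bit value is typically materialised in two halves:
//   movw r0, :lower16:sym
//   movt r0, :upper16:sym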
bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { @@ -6379,7 +6401,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { CurrentFormat = WASM; break; case MCContext::IsGOFF: + case MCContext::IsSPIRV: case MCContext::IsXCOFF: + case MCContext::IsDXContainer: llvm_unreachable("unexpected object format"); break; } @@ -10958,9 +10982,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } - { // processInstruction() updates inITBlock state, we need to save it away - bool wasInITBlock = inITBlock(); - + { // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. E.g., @@ -10969,12 +10991,6 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, LLVM_DEBUG(dbgs() << "Changed to: "; Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode())); dbgs() << "\n"); - - // Only after the instruction is fully processed, we can validate it - if (wasInITBlock && hasV8Ops() && isThumb() && - !isV8EligibleForIT(&Inst) && !getTargetOptions().MCNoDeprecatedWarn) { - Warning(IDLoc, "deprecated instruction in IT block"); - } } // Only move forward at the very end so that everything in validate @@ -11090,6 +11106,39 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveTLSDescSeq(DirectiveID.getLoc()); else return true; + } else if (IsCOFF) { + if (IDVal == ".seh_stackalloc") + parseDirectiveSEHAllocStack(DirectiveID.getLoc(), /*Wide=*/false); + else if (IDVal == ".seh_stackalloc_w") + parseDirectiveSEHAllocStack(DirectiveID.getLoc(), /*Wide=*/true); + else if (IDVal == ".seh_save_regs") + parseDirectiveSEHSaveRegs(DirectiveID.getLoc(), /*Wide=*/false); + else if (IDVal == ".seh_save_regs_w") + parseDirectiveSEHSaveRegs(DirectiveID.getLoc(), /*Wide=*/true); + else if (IDVal == ".seh_save_sp") + parseDirectiveSEHSaveSP(DirectiveID.getLoc()); + else if (IDVal == ".seh_save_fregs") + parseDirectiveSEHSaveFRegs(DirectiveID.getLoc()); + else if (IDVal == ".seh_save_lr") + parseDirectiveSEHSaveLR(DirectiveID.getLoc()); + else if (IDVal == ".seh_endprologue") + parseDirectiveSEHPrologEnd(DirectiveID.getLoc(), /*Fragment=*/false); + else if (IDVal == ".seh_endprologue_fragment") + parseDirectiveSEHPrologEnd(DirectiveID.getLoc(), /*Fragment=*/true); + else if (IDVal == ".seh_nop") + parseDirectiveSEHNop(DirectiveID.getLoc(), /*Wide=*/false); + else if (IDVal == ".seh_nop_w") + parseDirectiveSEHNop(DirectiveID.getLoc(), /*Wide=*/true); + else if (IDVal == ".seh_startepilogue") + parseDirectiveSEHEpilogStart(DirectiveID.getLoc(), /*Condition=*/false); + else if (IDVal == ".seh_startepilogue_cond") + parseDirectiveSEHEpilogStart(DirectiveID.getLoc(), /*Condition=*/true); + else if (IDVal == ".seh_endepilogue") + parseDirectiveSEHEpilogEnd(DirectiveID.getLoc()); + else if (IDVal == ".seh_custom") + parseDirectiveSEHCustom(DirectiveID.getLoc()); + else + return true; } else return true; return false; @@ -11113,8 +11162,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { /// parseDirectiveThumb /// ::= .thumb bool ARMAsmParser::parseDirectiveThumb(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive") || - check(!hasThumb(), L, "target does not support Thumb mode")) + if (parseEOL() || check(!hasThumb(), L, "target does not support Thumb mode")) return true; if (!isThumb()) @@ -11127,8 +11175,7 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc 
L) { /// parseDirectiveARM /// ::= .arm bool ARMAsmParser::parseDirectiveARM(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive") || - check(!hasARM(), L, "target does not support ARM mode")) + if (parseEOL() || check(!hasARM(), L, "target does not support ARM mode")) return true; if (isThumb()) @@ -11167,15 +11214,13 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) { Parser.getTok().getIdentifier()); getParser().getStreamer().emitThumbFunc(Func); Parser.Lex(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.thumb_func' directive")) + if (parseEOL()) return true; return false; } } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.thumb_func' directive")) + if (parseEOL()) return true; // .thumb_func implies .thumb @@ -11204,7 +11249,7 @@ bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) { "'.syntax divided' arm assembly not supported") || check(Mode != "unified" && Mode != "UNIFIED", L, "unrecognized syntax mode in .syntax directive") || - parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + parseEOL()) return true; // TODO tell the MC streamer the mode @@ -11226,7 +11271,7 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) { } Parser.Lex(); - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; if (Val == 16) { @@ -11257,8 +11302,7 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { SMLoc SRegLoc, ERegLoc; if (check(ParseRegister(Reg, SRegLoc, ERegLoc), SRegLoc, "register name expected") || - parseToken(AsmToken::EndOfStatement, - "unexpected input in .req directive.")) + parseEOL()) return true; if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg) @@ -11276,10 +11320,7 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { return Error(L, "unexpected input in .unreq directive."); RegisterReqs.erase(Parser.getTok().getIdentifier().lower()); Parser.Lex(); // Eat the identifier. 
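// Nothing else may follow the alias name on the line.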
- if (parseToken(AsmToken::EndOfStatement, - "unexpected input in '.unreq' directive") - return true; - return false; + return parseEOL(); } // After changing arch/CPU, try to put the ARM/Thumb mode back to what it was @@ -11340,11 +11381,11 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { StringRef Name = Parser.getTok().getIdentifier(); Optional<unsigned> Ret = ELFAttrs::attrTypeFromString( Name, ARMBuildAttrs::getARMAttributeTags()); - if (!Ret.hasValue()) { + if (!Ret) { Error(TagLoc, "attribute name not recognised: " + Name); return false; } - Tag = Ret.getValue(); + Tag = *Ret; Parser.Lex(); } else { const MCExpr *AttrExpr; @@ -11406,8 +11447,7 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { Parser.Lex(); } - if (Parser.parseToken(AsmToken::EndOfStatement, - "unexpected token in '.eabi_attribute' directive")) + if (Parser.parseEOL()) return true; if (IsIntegerValue && IsStringValue) { @@ -11463,8 +11503,7 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { /// parseDirectiveFnStart /// ::= .fnstart bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.fnstart' directive")) + if (parseEOL()) return true; if (UC.hasFnStart()) { @@ -11485,8 +11524,7 @@ /// parseDirectiveFnEnd /// ::= .fnend bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.fnend' directive")) + if (parseEOL()) return true; // Check the ordering of unwind directives if (!UC.hasFnStart()) @@ -11502,8 +11540,7 @@ bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) { /// parseDirectiveCantUnwind /// ::= .cantunwind bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.cantunwind' directive")) + if (parseEOL()) return true; UC.recordCantUnwind(L); @@ -11538,8 +11575,7 @@ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) { StringRef Name(Parser.getTok().getIdentifier()); Parser.Lex(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.personality' directive")) + if (parseEOL()) return true; UC.recordPersonality(L); @@ -11571,8 +11607,7 @@ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) { /// parseDirectiveHandlerData /// ::= .handlerdata bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.handlerdata' directive")) + if (parseEOL()) return true; UC.recordHandlerData(L); @@ -11670,8 +11705,7 @@ bool ARMAsmParser::parseDirectivePad(SMLoc L) { if (!CE) return Error(ExLoc, "pad offset must be an immediate"); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.pad' directive")) + if (parseEOL()) return true; getTargetStreamer().emitPad(CE->getValue()); @@ -11692,8 +11726,7 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; // Parse the register list - if (parseRegisterList(Operands, true, true) || - parseToken(AsmToken::EndOfStatement, "unexpected token in directive") + if (parseRegisterList(Operands, true, true) || parseEOL()) return true; ARMOperand &Op = (ARMOperand &)*Operands[0]; if (!IsVector && !Op.isRegList()) @@ -11776,7 +11809,7 @@ bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { /// parseDirectiveLtorg /// ::= .ltorg | .pool bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true;
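// Flush the current literal pool: any constants deferred so far are emitted
// at this point in the output stream.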
getTargetStreamer().emitCurrentConstantPool(); return false; @@ -11785,7 +11818,7 @@ bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) { bool ARMAsmParser::parseDirectiveEven(SMLoc L) { const MCSection *Section = getStreamer().getCurrentSectionOnly(); - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; if (!Section) { @@ -11794,7 +11827,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) { } assert(Section && "must have section to emit alignment"); - if (Section->UseCodeAlign()) + if (Section->useCodeAlign()) getStreamer().emitCodeAlignment(2, &getSTI()); else getStreamer().emitValueToAlignment(2); @@ -11810,9 +11843,7 @@ bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) { const MCExpr *IndexExpression; SMLoc IndexLoc = Parser.getTok().getLoc(); - if (Parser.parseExpression(IndexExpression) || - parseToken(AsmToken::EndOfStatement, - "unexpected token in '.personalityindex' directive")) { + if (Parser.parseExpression(IndexExpression) || parseEOL()) { return true; } @@ -11913,11 +11944,10 @@ bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) { MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext()); Lex(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.tlsdescseq' directive")) + if (parseEOL()) return true; - getTargetStreamer().AnnotateTLSDescriptorSequence(SRE); + getTargetStreamer().annotateTLSDescriptorSequence(SRE); return false; } @@ -11955,8 +11985,7 @@ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) { Offset = CE->getValue(); } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.movsp' directive")) + if (parseEOL()) return true; getTargetStreamer().emitMovSP(SPReg, Offset); @@ -11996,7 +12025,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) { // '.align' is target specifically handled to mean 2**2 byte alignment. 
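// That is, a bare '.align' behaves like '.p2align 2' (4-byte alignment)
// rather than taking a byte count.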
const MCSection *Section = getStreamer().getCurrentSectionOnly(); assert(Section && "must have section to emit alignment"); - if (Section->UseCodeAlign()) + if (Section->useCodeAlign()) getStreamer().emitCodeAlignment(4, &getSTI(), 0); else getStreamer().emitValueToAlignment(4, 0, 1, 0); @@ -12026,6 +12055,175 @@ bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) { return false; } +/// parseDirectiveSEHAllocStack +/// ::= .seh_stackalloc +/// ::= .seh_stackalloc_w +bool ARMAsmParser::parseDirectiveSEHAllocStack(SMLoc L, bool Wide) { + int64_t Size; + if (parseImmExpr(Size)) + return true; + getTargetStreamer().emitARMWinCFIAllocStack(Size, Wide); + return false; +} + +/// parseDirectiveSEHSaveRegs +/// ::= .seh_save_regs +/// ::= .seh_save_regs_w +bool ARMAsmParser::parseDirectiveSEHSaveRegs(SMLoc L, bool Wide) { + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; + + if (parseRegisterList(Operands) || parseEOL()) + return true; + ARMOperand &Op = (ARMOperand &)*Operands[0]; + if (!Op.isRegList()) + return Error(L, ".seh_save_regs{_w} expects GPR registers"); + const SmallVectorImpl<unsigned> &RegList = Op.getRegList(); + uint32_t Mask = 0; + for (size_t i = 0; i < RegList.size(); ++i) { + unsigned Reg = MRI->getEncodingValue(RegList[i]); + if (Reg == 15) // pc -> lr + Reg = 14; + if (Reg == 13) + return Error(L, ".seh_save_regs{_w} can't include SP"); + assert(Reg < 16U && "Register out of range"); + unsigned Bit = (1u << Reg); + Mask |= Bit; + } + if (!Wide && (Mask & 0x1f00) != 0) + return Error(L, + ".seh_save_regs cannot save R8-R12, needs .seh_save_regs_w"); + getTargetStreamer().emitARMWinCFISaveRegMask(Mask, Wide); + return false; +} + +/// parseDirectiveSEHSaveSP +/// ::= .seh_save_sp +bool ARMAsmParser::parseDirectiveSEHSaveSP(SMLoc L) { + int Reg = tryParseRegister(); + if (Reg == -1 || !MRI->getRegClass(ARM::GPRRegClassID).contains(Reg)) + return Error(L, "expected GPR"); + unsigned Index = MRI->getEncodingValue(Reg); + if (Index > 14 || Index == 13) + return Error(L, "invalid register for .seh_save_sp"); + getTargetStreamer().emitARMWinCFISaveSP(Index); + return false; +} + +/// parseDirectiveSEHSaveFRegs +/// ::= .seh_save_fregs +bool ARMAsmParser::parseDirectiveSEHSaveFRegs(SMLoc L) { + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; + + if (parseRegisterList(Operands) || parseEOL()) + return true; + ARMOperand &Op = (ARMOperand &)*Operands[0]; + if (!Op.isDPRRegList()) + return Error(L, ".seh_save_fregs expects DPR registers"); + const SmallVectorImpl<unsigned> &RegList = Op.getRegList(); + uint32_t Mask = 0; + for (size_t i = 0; i < RegList.size(); ++i) { + unsigned Reg = MRI->getEncodingValue(RegList[i]); + assert(Reg < 32U && "Register out of range"); + unsigned Bit = (1u << Reg); + Mask |= Bit; + } + + if (Mask == 0) + return Error(L, ".seh_save_fregs missing registers"); + + unsigned First = 0; + while ((Mask & 1) == 0) { + First++; + Mask >>= 1; + } + if (((Mask + 1) & Mask) != 0) + return Error(L, + ".seh_save_fregs must take a contiguous range of registers"); + unsigned Last = First; + while ((Mask & 2) != 0) { + Last++; + Mask >>= 1; + } + if (First < 16 && Last >= 16) + return Error(L, ".seh_save_fregs must be all d0-d15 or d16-d31"); + getTargetStreamer().emitARMWinCFISaveFRegs(First, Last); + return false; +} + +/// parseDirectiveSEHSaveLR +/// ::= .seh_save_lr +bool ARMAsmParser::parseDirectiveSEHSaveLR(SMLoc L) { + int64_t Offset; + if (parseImmExpr(Offset)) + return true; + getTargetStreamer().emitARMWinCFISaveLR(Offset); + return false; +} + +/// parseDirectiveSEHPrologEnd +/// ::= .seh_endprologue +/// ::=
.seh_endprologue_fragment +bool ARMAsmParser::parseDirectiveSEHPrologEnd(SMLoc L, bool Fragment) { + getTargetStreamer().emitARMWinCFIPrologEnd(Fragment); + return false; +} + +/// parseDirectiveSEHNop +/// ::= .seh_nop +/// ::= .seh_nop_w +bool ARMAsmParser::parseDirectiveSEHNop(SMLoc L, bool Wide) { + getTargetStreamer().emitARMWinCFINop(Wide); + return false; +} + +/// parseDirectiveSEHEpilogStart +/// ::= .seh_startepilogue +/// ::= .seh_startepilogue_cond +bool ARMAsmParser::parseDirectiveSEHEpilogStart(SMLoc L, bool Condition) { + unsigned CC = ARMCC::AL; + if (Condition) { + MCAsmParser &Parser = getParser(); + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return Error(S, ".seh_startepilogue_cond missing condition"); + CC = ARMCondCodeFromString(Tok.getString()); + if (CC == ~0U) + return Error(S, "invalid condition"); + Parser.Lex(); // Eat the token. + } + + getTargetStreamer().emitARMWinCFIEpilogStart(CC); + return false; +} + +/// parseDirectiveSEHEpilogEnd +/// ::= .seh_endepilogue +bool ARMAsmParser::parseDirectiveSEHEpilogEnd(SMLoc L) { + getTargetStreamer().emitARMWinCFIEpilogEnd(); + return false; +} + +/// parseDirectiveSEHCustom +/// ::= .seh_custom +bool ARMAsmParser::parseDirectiveSEHCustom(SMLoc L) { + unsigned Opcode = 0; + do { + int64_t Byte; + if (parseImmExpr(Byte)) + return true; + if (Byte > 0xff || Byte < 0) + return Error(L, "Invalid byte value in .seh_custom"); + if (Opcode > 0x00ffffff) + return Error(L, "Too many bytes in .seh_custom"); + // Store the bytes as one big-endian number in Opcode. In a multi-byte + // opcode sequence, the first byte can't be zero. + Opcode = (Opcode << 8) | Byte; + } while (parseOptionalToken(AsmToken::Comma)); + getTargetStreamer().emitARMWinCFICustom(Opcode); + return false; +} + /// Force static initialization. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() { RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget()); @@ -12338,8 +12536,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { SMLoc ExtLoc = Parser.getTok().getLoc(); Lex(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.arch_extension' directive")) + if (parseEOL()) return true; if (Name == "nocrypto") { diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index c3df7dc88d79..9acd49292268 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -13,8 +13,8 @@ #include "TargetInfo/ARMTargetInfo.h" #include "Utils/ARMBaseInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -175,408 +175,529 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { // Forward declare these because the autogenerated code will reference them. // Definitions are further down.
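// Every decoder has the same shape: pull a field out of the encoded
// instruction, append the matching operand(s) to the MCInst, and answer
// MCDisassembler::Success, SoftFail or Fail. A minimal sketch of the
// pattern, using a hypothetical 16-entry table MyRegTable (not a decoder
// from this file):
//
//   static DecodeStatus DecodeMyRegClass(MCInst &Inst, unsigned RegNo,
//                                        uint64_t Address,
//                                        const MCDisassembler *Decoder) {
//     if (RegNo > 15)
//       return MCDisassembler::Fail; // encoding cannot name this register
//     Inst.addOperand(MCOperand::createReg(MyRegTable[RegNo]));
//     return MCDisassembler::Success;
//   }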
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPRwithZRnospRegisterClass( - MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler 
*Decoder); static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, - unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus 
DecodeTSBInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst, - unsigned Insn, - uint64_t Adddress, - const void *Decoder); +static DecodeStatus +DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn, + uint64_t Adddress, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val, - 
uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -template + 
uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler 
*Decoder); +static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeNEONComplexLane64Instruction(MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, - 
uint64_t Address, const void* Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder); -static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder); -static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template -static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +template +static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, - uint64_t Address, 
const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); template static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, uint64_t Address, - 
const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); -template + const MCDisassembler *Decoder); +template static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template + uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder); -template -static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +template +static DecodeStatus +DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -template -static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const 
MCDisassembler *Decoder); +template +static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); #include "ARMGenDisassemblerTables.inc" @@ -710,11 +831,12 @@ extern const MCInstrDesc ARMInsts[]; /// operand to the MCInst and false otherwise. static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, bool isBranch, uint64_t InstSize, - MCInst &MI, const void *Decoder) { - const MCDisassembler *Dis = static_cast(Decoder); + MCInst &MI, + const MCDisassembler *Decoder) { // FIXME: Does it make sense for value to be negative? - return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch, - /* Offset */ 0, InstSize); + return Decoder->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, + isBranch, /*Offset=*/0, /*OpSize=*/0, + InstSize); } /// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being @@ -727,7 +849,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, /// a literal 'C' string if the referenced address of the literal pool's entry /// is an address into a section with 'C' string literals. 
static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value, - const void *Decoder) { + const MCDisassembler *Decoder) { const MCDisassembler *Dis = static_cast(Decoder); Dis->tryAddingPcLoadReferenceComment(Value, Address); } @@ -1142,7 +1264,8 @@ static const uint16_t CLRMGPRDecoderTable[] = { }; static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -1153,7 +1276,7 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -1165,9 +1288,9 @@ static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus -DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo == 15) @@ -1180,7 +1303,7 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo == 13) @@ -1192,8 +1315,8 @@ static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo == 15) @@ -1207,8 +1330,8 @@ DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo == 15) @@ -1225,8 +1348,8 @@ DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo == 13) return MCDisassembler::Fail; @@ -1235,7 +1358,8 @@ DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -1247,7 +1371,8 @@ static const uint16_t GPRPairDecoderTable[] = { }; static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; // According to the Arm ARM RegNo = 14 is undefined, but we return fail @@ -1263,8 +1388,9 @@ static DecodeStatus 
DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, return S; } -static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus +DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 13) return MCDisassembler::Fail; @@ -1278,7 +1404,7 @@ static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo != 13) return MCDisassembler::Fail; @@ -1288,7 +1414,8 @@ static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Register = 0; switch (RegNo) { case 0: @@ -1318,7 +1445,8 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; const FeatureBitset &featureBits = @@ -1343,7 +1471,8 @@ static const uint16_t SPRDecoderTable[] = { }; static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1353,7 +1482,8 @@ static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); } @@ -1369,7 +1499,8 @@ static const uint16_t DPRDecoderTable[] = { }; static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { const FeatureBitset &featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); @@ -1384,22 +1515,24 @@ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); } static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus -DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -1413,7 +1546,8 @@ static const uint16_t QPRDecoderTable[] = { }; static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const 
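// Aside: several decoders above consult subtarget features before accepting
// an encoding; DecodeDPRRegisterClass, for instance, only allows D16-D31 when
// the subtarget has the D32 feature (the real code reads the feature bits via
// Decoder->getSubtargetInfo().getFeatureBits()). Modeled with a plain bool:
#include <cstdint>
enum Status { Fail, Success };
static Status decodeDPR(unsigned RegNo, bool HasD32, uint16_t &RegOut) {
  if (RegNo > 31 || (!HasD32 && RegNo > 15)) // upper bank gated on the feature
    return Fail;
  RegOut = (uint16_t)RegNo; // stand-in for DPRDecoderTable[RegNo]
  return Success;
}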
MCDisassembler *Decoder) { if (RegNo > 31 || (RegNo & 1) != 0) return MCDisassembler::Fail; RegNo >>= 1; @@ -1433,7 +1567,8 @@ static const uint16_t DPairDecoderTable[] = { }; static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 30) return MCDisassembler::Fail; @@ -1453,10 +1588,9 @@ static const uint16_t DPairSpacedDecoderTable[] = { ARM::D28_D30, ARM::D29_D31 }; -static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 29) return MCDisassembler::Fail; @@ -1466,7 +1600,8 @@ static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, } static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Val == 0xF) return MCDisassembler::Fail; // AL predicate is not allowed on Thumb1 branches. @@ -1483,7 +1618,8 @@ static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (Val) Inst.addOperand(MCOperand::createReg(ARM::CPSR)); else @@ -1492,7 +1628,8 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rm = fieldFromInstruction(Val, 0, 4); @@ -1529,7 +1666,8 @@ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rm = fieldFromInstruction(Val, 0, 4); @@ -1564,7 +1702,8 @@ static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; bool NeedDisjointWriteback = false; @@ -1611,7 +1750,8 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Vd = fieldFromInstruction(Val, 8, 5); @@ -1635,7 +1775,8 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Vd = fieldFromInstruction(Val, 8, 5); @@ -1660,7 +1801,8 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // This 
operand encodes a mask of contiguous zeros between a specified MSB // and LSB. To decode it, we create the mask of all bits MSB-and-lower, // the mask of all bits LSB-and-lower, and then xor them to create @@ -1687,7 +1829,8 @@ static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction(Insn, 28, 4); @@ -1865,8 +2008,8 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -1971,7 +2114,8 @@ DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 13, 4); @@ -2013,9 +2157,22 @@ static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val, return S; } -static DecodeStatus -DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeTSBInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + if (Inst.getOpcode() != ARM::TSB && Inst.getOpcode() != ARM::t2TSB) + return MCDisassembler::Fail; + + // The "csync" operand is not encoded into the "tsb" instruction (as this is + // the only available operand), but LLVM expects the instruction to have one + // operand, so we need to add the csync when decoding. 
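// Aside: a runnable model of the mask construction that the
// DecodeBitfieldMaskOperand comment above describes — mask(msb-and-lower)
// XOR mask(lsb-and-lower) leaves bits lsb..msb set, and the operand stores
// the complement (the bits BFC/BFI preserve):
#include <cstdint>
static uint32_t bitfieldInverseMask(unsigned msb, unsigned lsb) {
  // assumes lsb <= msb <= 31; the real decoder soft-fails on lsb > msb
  uint32_t msb_mask = (msb == 31) ? 0xFFFFFFFFu : ((1u << (msb + 1)) - 1);
  uint32_t lsb_mask = (1u << lsb) - 1;
  return ~(msb_mask ^ lsb_mask); // e.g. msb=7, lsb=4 -> 0xFFFFFF0F
}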
+ Inst.addOperand(MCOperand::createImm(ARM_TSB::CSYNC)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -2206,7 +2363,8 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -2235,7 +2393,8 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -2257,9 +2416,10 @@ static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst, - unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus +DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -2350,7 +2510,8 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst, // Check for UNPREDICTABLE predicated ESB instruction static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned pred = fieldFromInstruction(Insn, 28, 4); unsigned imm8 = fieldFromInstruction(Insn, 0, 8); const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder); @@ -2372,7 +2533,8 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned imod = fieldFromInstruction(Insn, 18, 2); unsigned M = fieldFromInstruction(Insn, 17, 1); unsigned iflags = fieldFromInstruction(Insn, 6, 3); @@ -2419,7 +2581,8 @@ static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned imod = fieldFromInstruction(Insn, 9, 2); unsigned M = fieldFromInstruction(Insn, 8, 1); unsigned iflags = fieldFromInstruction(Insn, 5, 3); @@ -2460,9 +2623,9 @@ static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { unsigned imm = fieldFromInstruction(Insn, 0, 8); unsigned Opcode = ARM::t2HINT; @@ -2486,7 +2649,8 @@ static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd =
fieldFromInstruction(Insn, 8, 4); @@ -2510,7 +2674,8 @@ static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -2537,7 +2702,8 @@ static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 16, 4); @@ -2565,7 +2731,8 @@ static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Pred = fieldFromInstruction(Insn, 28, 4); @@ -2586,7 +2753,8 @@ static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Imm = fieldFromInstruction(Insn, 9, 1); @@ -2614,7 +2782,8 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned add = fieldFromInstruction(Val, 12, 1); @@ -2634,7 +2803,8 @@ static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 9, 4); @@ -2654,7 +2824,8 @@ static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 9, 4); @@ -2674,13 +2845,14 @@ static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeGPRRegisterClass(Inst, Val, Address, Decoder); } -static DecodeStatus -DecodeT2BInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus Status = MCDisassembler::Success; // Note the J1 and J2 values are from the encoded instruction. 
So here @@ -2705,9 +2877,9 @@ DecodeT2BInstruction(MCInst &Inst, unsigned Insn, return Status; } -static DecodeStatus -DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction(Insn, 28, 4); @@ -2736,7 +2908,8 @@ DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rm = fieldFromInstruction(Val, 0, 4); @@ -2753,7 +2926,8 @@ static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3029,7 +3203,8 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned type = fieldFromInstruction(Insn, 8, 4); unsigned align = fieldFromInstruction(Insn, 4, 2); if (type == 6 && (align & 2)) return MCDisassembler::Fail; @@ -3042,7 +3217,8 @@ static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned size = fieldFromInstruction(Insn, 6, 2); if (size == 3) return MCDisassembler::Fail; @@ -3057,7 +3233,8 @@ static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned size = fieldFromInstruction(Insn, 6, 2); if (size == 3) return MCDisassembler::Fail; @@ -3070,7 +3247,8 @@ static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned size = fieldFromInstruction(Insn, 6, 2); if (size == 3) return MCDisassembler::Fail; @@ -3080,7 +3258,8 @@ static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3350,7 +3529,8 @@ static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3397,7 +3577,8 @@ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, 
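// Aside: a runnable model of the branch-offset reconstruction used by
// DecodeT2BInstruction above. The stored J1/J2 bits become I1/I2 via
// I = NOT(J XOR S), and the 25-bit offset carries one trailing zero because
// Thumb targets are halfword-aligned. Field names follow the ARM ARM; the
// helper itself is mine:
#include <cstdint>
static int32_t t2BranchOffset(unsigned S, unsigned J1, unsigned J2,
                              unsigned imm10, unsigned imm11) {
  unsigned I1 = !(J1 ^ S), I2 = !(J2 ^ S);
  uint32_t raw = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) |
                 (imm11 << 1);     // S:I1:I2:imm10:imm11:'0'
  return (int32_t)(raw << 7) >> 7; // sign-extend from bit 24
}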
const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3445,7 +3626,8 @@ static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3480,7 +3662,8 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3531,9 +3714,9 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus -DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3577,9 +3760,9 @@ DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus -DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) | @@ -3607,7 +3790,8 @@ DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Qd = fieldFromInstruction(Insn, 13, 3); @@ -3632,7 +3816,8 @@ static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3651,31 +3836,36 @@ static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(8 - Val)); return MCDisassembler::Success; } static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(16 - Val)); return MCDisassembler::Success; } static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(32 - Val)); return MCDisassembler::Success; } static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(64 - 
Val)); return MCDisassembler::Success; } static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -3711,7 +3901,8 @@ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned dst = fieldFromInstruction(Insn, 8, 3); @@ -3735,7 +3926,8 @@ static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, } static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4, true, 2, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(SignExtend32<12>(Val << 1))); @@ -3743,7 +3935,8 @@ static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4, true, 4, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(SignExtend32<21>(Val))); @@ -3751,7 +3944,8 @@ static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4, true, 2, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(Val << 1)); @@ -3759,7 +3953,8 @@ static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 0, 3); @@ -3774,7 +3969,8 @@ static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 0, 3); @@ -3788,7 +3984,8 @@ static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned imm = Val << 2; Inst.addOperand(MCOperand::createImm(imm)); @@ -3798,7 +3995,8 @@ static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(ARM::SP)); Inst.addOperand(MCOperand::createImm(Val)); @@ -3806,7 +4004,8 @@ static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler 
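// Aside: the Thumb branch-operand decoders above all compute the symbolic
// target as `Address + SignExtend(Val << 1) + 4`: offsets are stored in
// halfwords (hence the << 1) and are relative to the PC, which in Thumb reads
// as the instruction address plus 4. Runnable model for the 11-bit case:
#include <cstdint>
static uint64_t thumbBranchTarget(uint64_t InstAddr, uint32_t imm11) {
  uint32_t raw = imm11 << 1;                // halfword offset, now 12 bits
  int32_t off = (int32_t)(raw << 20) >> 20; // sign-extend from bit 11
  return InstAddr + off + 4;                // +4: Thumb PC bias
}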
*Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 6, 4); @@ -3835,7 +4034,8 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -3918,7 +4118,8 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4002,7 +4203,8 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4081,8 +4283,8 @@ static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder) { +static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4121,7 +4323,8 @@ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, - uint64_t Address, const void* Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -4173,8 +4376,8 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { if (Val == 0) Inst.addOperand(MCOperand::createImm(INT32_MIN)); else { @@ -4188,7 +4391,7 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Val == 0) Inst.addOperand(MCOperand::createImm(INT32_MIN)); else { @@ -4203,7 +4406,8 @@ static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address, } static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 9, 4); @@ -4219,7 +4423,7 @@ static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 8, 4); @@ -4233,8 +4437,9 @@ static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus 
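// Aside: DecodeT2Imm8S4 above decodes a word-scaled 8-bit offset in which a
// raw value of zero encodes "#-0" (subtract nothing); LLVM keeps that
// distinct from "#0" by using INT32_MIN as a sentinel immediate. Runnable
// model:
#include <cstdint>
#include <climits>
static int32_t decodeImm8s4(unsigned Val) { // Val is U:imm8 (9 bits)
  if (Val == 0)
    return INT32_MIN;             // sentinel for #-0
  int32_t imm = (Val & 0xFF) * 4; // imm8 counts words
  if (!(Val & 0x100))             // U bit clear -> negative offset
    imm = -imm;
  return imm;
}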
DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 8, 4); @@ -4248,8 +4453,8 @@ static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, return S; } -static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { int imm = Val & 0xFF; if (Val == 0) imm = INT32_MIN; @@ -4260,9 +4465,9 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -template<int shift> -static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +template <int shift> +static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { int imm = Val & 0x7F; if (Val == 0) imm = INT32_MIN; @@ -4276,7 +4481,8 @@ static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 9, 4); @@ -4321,10 +4527,10 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, return S; } -template<int shift> +template <int shift> static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 8, 3); @@ -4338,10 +4544,10 @@ static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, return S; } -template<int shift, int WriteBack> +template <int shift, int WriteBack> static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 8, 4); @@ -4358,7 +4564,8 @@ static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -4419,7 +4626,8 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 13, 4); @@ -4445,7 +4653,8 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned imm = fieldFromInstruction(Insn, 0, 7); Inst.addOperand(MCOperand::createReg(ARM::SP)); @@ -4456,7 +4665,8 @@ static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, } static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Inst.getOpcode() == ARM::tADDrSP) { @@ -4481,7 +4691,8 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t
Insn, } static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned imod = fieldFromInstruction(Insn, 4, 1) | 0x2; unsigned flags = fieldFromInstruction(Insn, 0, 3); @@ -4492,7 +4703,8 @@ static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, } static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rm = fieldFromInstruction(Insn, 0, 4); unsigned add = fieldFromInstruction(Insn, 4, 1); @@ -4505,7 +4717,8 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 3, 4); unsigned Qm = fieldFromInstruction(Insn, 0, 3); @@ -4518,9 +4731,10 @@ static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, return S; } -template<int shift> +template <int shift> static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Qm = fieldFromInstruction(Insn, 8, 3); int imm = fieldFromInstruction(Insn, 0, 7); @@ -4542,7 +4756,8 @@ static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // Val is passed in as S:J1:J2:imm10H:imm10L:'0' // Note only one trailing zero not two. Also the J1 and J2 values are from // the encoded instruction. So here change to I1 and I2 values via: @@ -4566,7 +4781,8 @@ static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (Val == 0xA || Val == 0xB) return MCDisassembler::Fail; @@ -4580,9 +4796,9 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus -DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { const FeatureBitset &FeatureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); DecodeStatus S = MCDisassembler::Success; @@ -4598,9 +4814,9 @@ DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus -DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction(Insn, 22, 4); @@ -4641,8 +4857,8 @@ DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, // Decode a shifted immediate operand. These basically consist // of an 8-bit value, and a 4-bit directive that specifies either // a splat operation or a rotation.
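// Aside: a runnable model of the two forms the Thumb2 modified immediate
// (decoded by DecodeT2SOImm just below) can take: a "splat" of the 8-bit
// value into selected bytes, or that value with bit 7 forced set and rotated
// right by a 5-bit amount. Helper name is mine; the bit layout follows the
// hunk below and the ARM ARM:
#include <cstdint>
static uint32_t t2ModifiedImm(uint32_t Val) { // 12-bit encoded field
  uint32_t imm8 = Val & 0xFF;
  if (((Val >> 10) & 0x3) == 0) {               // ctrl == 0: splat directive
    switch ((Val >> 8) & 0x3) {
    case 0:  return imm8;                       // 000000XY
    case 1:  return imm8 | (imm8 << 16);        // 00XY00XY
    case 2:  return (imm8 << 8) | (imm8 << 24); // XY00XY00
    default: return imm8 * 0x01010101u;         // XYXYXYXY
    }
  }
  uint32_t unrot = (Val & 0x7F) | 0x80; // 7-bit value, top bit forced set
  uint32_t rot = (Val >> 7) & 0x1F;     // rotate-right amount (8..31 here)
  return (unrot >> rot) | (unrot << ((32 - rot) & 31));
}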
-static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { unsigned ctrl = fieldFromInstruction(Val, 10, 2); if (ctrl == 0) { unsigned byte = fieldFromInstruction(Val, 8, 2); @@ -4672,9 +4888,9 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus -DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4, true, 2, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(SignExtend32<9>(Val << 1))); @@ -4683,7 +4899,7 @@ DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Val is passed in as S:J1:J2:imm10:imm11 // Note no trailing zero after imm11. Also the J1 and J2 values are from // the encoded instruction. So here change to I1 and I2 values via: @@ -4706,7 +4922,8 @@ static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (Val & ~0xf) return MCDisassembler::Fail; @@ -4715,7 +4932,8 @@ static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (Val & ~0xf) return MCDisassembler::Fail; @@ -4723,8 +4941,8 @@ static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; const FeatureBitset &FeatureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); @@ -4825,7 +5043,8 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned R = fieldFromInstruction(Val, 5, 1); unsigned SysM = fieldFromInstruction(Val, 0, 5); @@ -4840,7 +5059,8 @@ static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -4862,7 +5082,7 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rd = fieldFromInstruction(Insn, 12, 4); @@ -4887,7 +5107,8 @@ static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, } static DecodeStatus 
DecodeLDRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4912,7 +5133,8 @@ static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4939,7 +5161,8 @@ static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4964,7 +5187,8 @@ static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4988,8 +5212,8 @@ static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5055,8 +5279,8 @@ static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5120,8 +5344,8 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5187,8 +5411,8 @@ static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5250,8 +5474,8 @@ static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5320,8 +5544,8 @@ static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVST3LN(MCInst 
&Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5383,8 +5607,8 @@ static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5464,8 +5688,8 @@ static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5536,8 +5760,8 @@ static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction(Insn, 16, 4); @@ -5562,8 +5786,8 @@ static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction(Insn, 16, 4); @@ -5588,8 +5812,8 @@ static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction(Insn, 4, 4); unsigned mask = fieldFromInstruction(Insn, 0, 4); @@ -5617,9 +5841,9 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus -DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -5654,9 +5878,9 @@ DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus -DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 12, 4); @@ -5689,8 +5913,8 @@ DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, uint64_t Address, + const MCDisassembler *Decoder) { unsigned sign1 
= fieldFromInstruction(Insn, 21, 1); unsigned sign2 = fieldFromInstruction(Insn, 23, 1); if (sign1 != sign2) return MCDisassembler::Fail; @@ -5717,7 +5941,7 @@ static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; // Shift of "asr #32" is not allowed in Thumb2 mode. @@ -5726,8 +5950,8 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val, return S; } -static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction(Insn, 0, 4); unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -5753,8 +5977,8 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { const FeatureBitset &featureBits = ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; @@ -5812,8 +6036,8 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { const FeatureBitset &featureBits = ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; @@ -5871,10 +6095,10 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeNEONComplexLane64Instruction(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vn = (fieldFromInstruction(Insn, 16, 4) << 0); @@ -5904,8 +6128,8 @@ static DecodeStatus DecodeNEONComplexLane64Instruction(MCInst &Inst, return S; } -static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 16, 4); @@ -5932,7 +6156,8 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, } static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned CRm = fieldFromInstruction(Val, 0, 4); @@ -5978,7 +6203,7 @@ static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val, static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { const FeatureBitset &featureBits = ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); DecodeStatus S = 
MCDisassembler::Success; @@ -6030,7 +6255,7 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, template <bool isSigned, bool isNeg, bool zeroPermitted, int size> static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Val == 0 && !zeroPermitted) S = MCDisassembler::Fail; @@ -6049,7 +6274,7 @@ static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { uint64_t LocImm = Inst.getOperand(0).getImm(); Val = LocImm + (2 << Val); @@ -6061,7 +6286,7 @@ static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Val >= ARMCC::AL) // also exclude the non-condition NV return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Val)); @@ -6069,7 +6294,7 @@ static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Inst.getOpcode() == ARM::MVE_LCTP) @@ -6132,7 +6357,7 @@ static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Val == 0) @@ -6144,7 +6369,8 @@ static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if ((RegNo) + 1 > 11) return MCDisassembler::Fail; @@ -6154,7 +6380,8 @@ static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if ((RegNo) > 14) return MCDisassembler::Fail; @@ -6165,7 +6392,8 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo == 15) { Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV)); return MCDisassembler::Success; @@ -6181,7 +6409,7 @@ DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createImm(ARMCC::AL)); @@ -6207,8 +6435,8 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; @@ -6224,7 +6452,7 @@ static const uint16_t QQPRDecoderTable[] = { static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void
*Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 6) return MCDisassembler::Fail; @@ -6240,7 +6468,7 @@ static const uint16_t QQQQPRDecoderTable[] = { static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 4) return MCDisassembler::Fail; @@ -6251,7 +6479,7 @@ static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; // Parse VPT mask and encode it in the MCInst as an immediate with the same @@ -6281,7 +6509,8 @@ static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // The vpred_r operand type includes an MQPR register field derived // from the encoding. But we don't actually want to add an operand // to the MCInst at this stage, because AddThumbPredicate will do it @@ -6292,18 +6521,16 @@ static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? ARMCC::EQ : ARMCC::NE)); return MCDisassembler::Success; } -static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { unsigned Code; switch (Val & 0x3) { case 0: @@ -6323,17 +6550,16 @@ static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? 
ARMCC::HS : ARMCC::HI)); return MCDisassembler::Success; } -static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { unsigned Code; switch (Val) { default: @@ -6363,7 +6589,8 @@ static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Va } static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned DecodedVal = 64 - Val; @@ -6404,10 +6631,10 @@ static unsigned FixedRegForVSTRVLDR_SYSREG(unsigned Opcode) { } } -template +template static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { switch (Inst.getOpcode()) { case ARM::VSTR_FPSCR_pre: case ARM::VSTR_FPSCR_NZCVQC_pre: @@ -6448,9 +6675,10 @@ static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val, return S; } -static inline DecodeStatus DecodeMVE_MEM_pre( - MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder, - unsigned Rn, OperandDecoder RnDecoder, OperandDecoder AddrDecoder) { +static inline DecodeStatus +DecodeMVE_MEM_pre(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder, unsigned Rn, + OperandDecoder RnDecoder, OperandDecoder AddrDecoder) { DecodeStatus S = MCDisassembler::Success; unsigned Qd = fieldFromInstruction(Val, 13, 3); @@ -6469,7 +6697,8 @@ static inline DecodeStatus DecodeMVE_MEM_pre( template static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder, fieldFromInstruction(Val, 16, 3), DecodetGPRRegisterClass, @@ -6478,7 +6707,8 @@ static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val, template static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder, fieldFromInstruction(Val, 16, 4), DecoderGPRRegisterClass, @@ -6487,17 +6717,18 @@ static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val, template static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder, fieldFromInstruction(Val, 17, 3), DecodeMQPRRegisterClass, DecodeMveAddrModeQ); } -template +template static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (Val < MinLog || Val > MaxLog) @@ -6507,10 +6738,10 @@ static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, return S; } -template -static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { +template +static DecodeStatus +DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createImm(start + Val)); @@ -6519,7 +6750,8 @@ static DecodeStatus 
DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 0, 4); unsigned Rt2 = fieldFromInstruction(Insn, 16, 4); @@ -6542,7 +6774,8 @@ static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction(Insn, 0, 4); unsigned Rt2 = fieldFromInstruction(Insn, 16, 4); @@ -6566,8 +6799,9 @@ static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeMVEOverlappingLongShift( - MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { +static DecodeStatus +DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned RdaLo = fieldFromInstruction(Insn, 17, 3) << 1; @@ -6645,8 +6879,9 @@ static DecodeStatus DecodeMVEOverlappingLongShift( return S; } -static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) | fieldFromInstruction(Insn, 13, 3)); @@ -6664,9 +6899,9 @@ static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, uint64_t Addr return S; } -template +template static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createReg(ARM::VPR)); unsigned Qn = fieldFromInstruction(Insn, 17, 3); @@ -6703,7 +6938,7 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createReg(ARM::VPR)); unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -6712,8 +6947,9 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createReg(ARM::VPR)); Inst.addOperand(MCOperand::createReg(ARM::VPR)); @@ -6721,7 +6957,8 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address } static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { const unsigned Rd = fieldFromInstruction(Insn, 8, 4); const unsigned Rn = fieldFromInstruction(Insn, 16, 4); const unsigned Imm12 = fieldFromInstruction(Insn, 26, 1) << 11 | diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 
16bc0ca179a7..d74da27fbc4f 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -17,8 +17,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" @@ -98,9 +98,20 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitInst(uint32_t Inst, char Suffix = '\0') override; void finishAttributeSection() override; - void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void annotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; + void emitARMWinCFIAllocStack(unsigned Size, bool Wide) override; + void emitARMWinCFISaveRegMask(unsigned Mask, bool Wide) override; + void emitARMWinCFISaveSP(unsigned Reg) override; + void emitARMWinCFISaveFRegs(unsigned First, unsigned Last) override; + void emitARMWinCFISaveLR(unsigned Offset) override; + void emitARMWinCFIPrologEnd(bool Fragment) override; + void emitARMWinCFINop(bool Wide) override; + void emitARMWinCFIEpilogStart(unsigned Condition) override; + void emitARMWinCFIEpilogEnd() override; + void emitARMWinCFICustom(unsigned Opcode) override; + public: ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter &InstPrinter, bool VerboseAsm); @@ -239,8 +250,8 @@ void ARMTargetAsmStreamer::emitFPU(unsigned FPU) { void ARMTargetAsmStreamer::finishAttributeSection() {} -void -ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { +void ARMTargetAsmStreamer::annotateTLSDescriptorSequence( + const MCSymbolRefExpr *S) { OS << "\t.tlsdescseq\t" << S->getSymbol().getName() << "\n"; } @@ -269,6 +280,101 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, OS << '\n'; } +void ARMTargetAsmStreamer::emitARMWinCFIAllocStack(unsigned Size, bool Wide) { + if (Wide) + OS << "\t.seh_stackalloc_w\t" << Size << "\n"; + else + OS << "\t.seh_stackalloc\t" << Size << "\n"; +} + +static void printRegs(formatted_raw_ostream &OS, ListSeparator &LS, int First, + int Last) { + if (First != Last) + OS << LS << "r" << First << "-r" << Last; + else + OS << LS << "r" << First; +} + +void ARMTargetAsmStreamer::emitARMWinCFISaveRegMask(unsigned Mask, bool Wide) { + if (Wide) + OS << "\t.seh_save_regs_w\t"; + else + OS << "\t.seh_save_regs\t"; + ListSeparator LS; + int First = -1; + OS << "{"; + for (int I = 0; I <= 12; I++) { + if (Mask & (1 << I)) { + if (First < 0) + First = I; + } else { + if (First >= 0) { + printRegs(OS, LS, First, I - 1); + First = -1; + } + } + } + if (First >= 0) + printRegs(OS, LS, First, 12); + if (Mask & (1 << 14)) + OS << LS << "lr"; + OS << "}\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFISaveSP(unsigned Reg) { + OS << "\t.seh_save_sp\tr" << Reg << "\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFISaveFRegs(unsigned First, + unsigned Last) { + if (First != Last) + OS << "\t.seh_save_fregs\t{d" << First << "-d" << Last << "}\n"; + else + OS << "\t.seh_save_fregs\t{d" << First << "}\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFISaveLR(unsigned Offset) { + OS << "\t.seh_save_lr\t" << Offset << "\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFIPrologEnd(bool Fragment) { + if (Fragment) + OS << "\t.seh_endprologue_fragment\n"; + else + OS << "\t.seh_endprologue\n"; +} + +void 
ARMTargetAsmStreamer::emitARMWinCFINop(bool Wide) { + if (Wide) + OS << "\t.seh_nop_w\n"; + else + OS << "\t.seh_nop\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFIEpilogStart(unsigned Condition) { + if (Condition == ARMCC::AL) + OS << "\t.seh_startepilogue\n"; + else + OS << "\t.seh_startepilogue_cond\t" + << ARMCondCodeToString(static_cast(Condition)) << "\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFIEpilogEnd() { + OS << "\t.seh_endepilogue\n"; +} + +void ARMTargetAsmStreamer::emitARMWinCFICustom(unsigned Opcode) { + int I; + for (I = 3; I > 0; I--) + if (Opcode & (0xffu << (8 * I))) + break; + ListSeparator LS; + OS << "\t.seh_custom\t"; + for (; I >= 0; I--) + OS << LS << ((Opcode >> (8 * I)) & 0xff); + OS << "\n"; +} + class ARMTargetELFStreamer : public ARMTargetStreamer { private: StringRef CurrentVendor; @@ -309,7 +415,7 @@ private: void finishAttributeSection() override; void emitLabel(MCSymbol *Symbol) override; - void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void annotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; // Reset state between object emissions @@ -984,8 +1090,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { Streamer.emitThumbFunc(Symbol); } -void -ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { +void ARMTargetELFStreamer::annotateTLSDescriptorSequence( + const MCSymbolRefExpr *S) { getStreamer().EmitFixup(S, FK_Data_4); } @@ -1057,7 +1163,7 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix, assert(EHSection && "Failed to get the required EH section"); // Switch to .ARM.extab or .ARM.exidx section - SwitchSection(EHSection); + switchSection(EHSection); emitValueToAlignment(4, 0, 1, 0); } @@ -1150,7 +1256,7 @@ void ARMELFStreamer::emitFnEnd() { } // Switch to the section containing FnStart - SwitchSection(&FnStart->getSection()); + switchSection(&FnStart->getSection()); // Clean exception handling frame information EHReset(); @@ -1369,12 +1475,8 @@ MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { return new ARMTargetStreamer(S); } -MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, - const MCSubtargetInfo &STI) { - const Triple &TT = STI.getTargetTriple(); - if (TT.isOSBinFormatELF()) - return new ARMTargetELFStreamer(S); - return new ARMTargetStreamer(S); +MCTargetStreamer *createARMObjectTargetELFStreamer(MCStreamer &S) { + return new ARMTargetELFStreamer(S); } MCELFStreamer *createARMELFStreamer(MCContext &Context, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 77c0e3522911..febd8ab8bbc0 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -89,6 +89,7 @@ ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() { AlignmentIsInBytes = false; SupportsDebugInformation = true; ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; PrivateGlobalPrefix = "$M"; PrivateLabelPrefix = "$M"; CommentString = "@"; @@ -110,7 +111,8 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { PrivateLabelPrefix = ".L"; SupportsDebugInformation = true; - ExceptionsType = ExceptionHandling::DwarfCFI; + ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; UseParensForSymbolVariant = true; DwarfRegNumForCFI = false; diff --git 
a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 5ecacdab390f..c33bbfcc7114 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -2006,13 +2006,11 @@ getMVEPairVectorIndexOpValue(const MCInst &MI, unsigned OpIdx, #include "ARMGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, Ctx, true); } MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, Ctx, false); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 17ca1866cf95..3f1379f135d1 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -87,18 +87,6 @@ static bool getMRCDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, return false; } -static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, - std::string &Info) { - if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() && - MI.getOperand(1).getImm() != 8) { - Info = "applying IT instruction to more than one subsequent instruction is " - "deprecated"; - return true; - } - - return false; -} - static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 5c8f9bfdca08..e0c992f4fae2 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -71,13 +71,13 @@ MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S, bool isVerboseAsm); MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI); +MCTargetStreamer *createARMObjectTargetELFStreamer(MCStreamer &S); +MCTargetStreamer *createARMObjectTargetWinCOFFStreamer(MCStreamer &S); MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index ed4000c7e5be..0ea51839824b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMFixupKinds.h" #include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmLayout.h" @@ -21,7 +22,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ScopedPrinter.h" using namespace llvm; @@ -149,7 +149,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, if (FixupOffset & 0xff000000) { Asm.getContext().reportError(Fixup.getLoc(), "can not encode offset '0x" + - to_hexString(FixupOffset) + + utohexstr(FixupOffset) + "' in resulting scattered relocation."); 
return; } @@ -264,7 +264,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset & 0xff000000) { Asm.getContext().reportError(Fixup.getLoc(), "can not encode offset '0x" + - to_hexString(FixupOffset) + + utohexstr(FixupOffset) + "' in resulting scattered relocation."); return; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 02a2d01176fc..16d1ae62053e 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -114,15 +114,28 @@ void ARMTargetStreamer::emitArchExtension(uint64_t ArchExt) {} void ARMTargetStreamer::emitObjectArch(ARM::ArchKind Arch) {} void ARMTargetStreamer::emitFPU(unsigned FPU) {} void ARMTargetStreamer::finishAttributeSection() {} -void -ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {} +void ARMTargetStreamer::annotateTLSDescriptorSequence( + const MCSymbolRefExpr *SRE) {} void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {} +void ARMTargetStreamer::emitARMWinCFIAllocStack(unsigned Size, bool Wide) {} +void ARMTargetStreamer::emitARMWinCFISaveRegMask(unsigned Mask, bool Wide) {} +void ARMTargetStreamer::emitARMWinCFISaveSP(unsigned Reg) {} +void ARMTargetStreamer::emitARMWinCFISaveFRegs(unsigned First, unsigned Last) {} +void ARMTargetStreamer::emitARMWinCFISaveLR(unsigned Offset) {} +void ARMTargetStreamer::emitARMWinCFINop(bool Wide) {} +void ARMTargetStreamer::emitARMWinCFIPrologEnd(bool Fragment) {} +void ARMTargetStreamer::emitARMWinCFIEpilogStart(unsigned Condition) {} +void ARMTargetStreamer::emitARMWinCFIEpilogEnd() {} +void ARMTargetStreamer::emitARMWinCFICustom(unsigned Opcode) {} + static ARMBuildAttrs::CPUArch getArchForCPU(const MCSubtargetInfo &STI) { if (STI.getCPU() == "xscale") return ARMBuildAttrs::v5TEJ; - if (STI.hasFeature(ARM::HasV8Ops)) { + if (STI.hasFeature(ARM::HasV9_0aOps)) + return ARMBuildAttrs::v9_A; + else if (STI.hasFeature(ARM::HasV8Ops)) { if (STI.hasFeature(ARM::FeatureRClass)) return ARMBuildAttrs::v8_R; return ARMBuildAttrs::v8_A; @@ -305,3 +318,13 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { emitAttribute(ARMBuildAttrs::BTI_extension, ARMBuildAttrs::AllowBTI); } } + +MCTargetStreamer * +llvm::createARMObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + const Triple &TT = STI.getTargetTriple(); + if (TT.isOSBinFormatELF()) + return createARMObjectTargetELFStreamer(S); + if (TT.isOSBinFormatCOFF()) + return createARMObjectTargetWinCOFFStreamer(S); + return new ARMTargetStreamer(S); +} diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index e6f649164a29..cdd7f6fb715a 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -8,30 +8,59 @@ #include "ARMMCTargetDesc.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCWin64EH.h" #include "llvm/MC/MCWinCOFFStreamer.h" using namespace llvm; namespace { class ARMWinCOFFStreamer : public MCWinCOFFStreamer { + Win64EH::ARMUnwindEmitter EHStreamer; + public: ARMWinCOFFStreamer(MCContext &C, std::unique_ptr AB, std::unique_ptr CE, std::unique_ptr OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), 
std::move(OW)) {} + void emitWinEHHandlerData(SMLoc Loc) override; + void emitWindowsUnwindTables() override; + void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void emitThumbFunc(MCSymbol *Symbol) override; void finishImpl() override; }; +void ARMWinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo(), + /* HandlerData = */ true); +} + +void ARMWinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) { + EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); +} + +void ARMWinCOFFStreamer::emitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) + return; + EHStreamer.Emit(*this); +} + void ARMWinCOFFStreamer::emitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } void ARMWinCOFFStreamer::finishImpl() { emitFrames(nullptr); + emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); } @@ -48,3 +77,201 @@ MCStreamer *llvm::createARMWinCOFFStreamer( return S; } +namespace { +class ARMTargetWinCOFFStreamer : public llvm::ARMTargetStreamer { +private: + // True if we are processing SEH directives in an epilogue. + bool InEpilogCFI = false; + + // Symbol of the current epilog for which we are processing SEH directives. + MCSymbol *CurrentEpilog = nullptr; + +public: + ARMTargetWinCOFFStreamer(llvm::MCStreamer &S) : ARMTargetStreamer(S) {} + + // The unwind codes on ARM Windows are documented at + // https://docs.microsoft.com/en-us/cpp/build/arm-exception-handling + void emitARMWinCFIAllocStack(unsigned Size, bool Wide) override; + void emitARMWinCFISaveRegMask(unsigned Mask, bool Wide) override; + void emitARMWinCFISaveSP(unsigned Reg) override; + void emitARMWinCFISaveFRegs(unsigned First, unsigned Last) override; + void emitARMWinCFISaveLR(unsigned Offset) override; + void emitARMWinCFIPrologEnd(bool Fragment) override; + void emitARMWinCFINop(bool Wide) override; + void emitARMWinCFIEpilogStart(unsigned Condition) override; + void emitARMWinCFIEpilogEnd() override; + void emitARMWinCFICustom(unsigned Opcode) override; + +private: + void emitARMWinUnwindCode(unsigned UnwindCode, int Reg, int Offset); +}; + +// Helper function to common out unwind code setup for those codes that can +// belong to both prolog and epilog. +void ARMTargetWinCOFFStreamer::emitARMWinUnwindCode(unsigned UnwindCode, + int Reg, int Offset) { + auto &S = getStreamer(); + WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); + if (!CurFrame) + return; + MCSymbol *Label = S.emitCFILabel(); + auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); + if (InEpilogCFI) + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); + else + CurFrame->Instructions.push_back(Inst); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFIAllocStack(unsigned Size, + bool Wide) { + unsigned Op = Win64EH::UOP_AllocSmall; + if (!Wide) { + if (Size / 4 > 0xffff) + Op = Win64EH::UOP_AllocHuge; + else if (Size / 4 > 0x7f) + Op = Win64EH::UOP_AllocLarge; + } else { + Op = Win64EH::UOP_WideAllocMedium; + if (Size / 4 > 0xffff) + Op = Win64EH::UOP_WideAllocHuge; + else if (Size / 4 > 0x3ff) + Op = Win64EH::UOP_WideAllocLarge; + } + emitARMWinUnwindCode(Op, -1, Size); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFISaveRegMask(unsigned Mask, + bool Wide) { + assert(Mask != 0); + int Lr = (Mask & 0x4000) ? 
1 : 0; + Mask &= ~0x4000; + if (Wide) + assert((Mask & ~0x1fff) == 0); + else + assert((Mask & ~0x00ff) == 0); + if (Mask && ((Mask + (1 << 4)) & Mask) == 0) { + if (Wide && (Mask & 0x1000) == 0 && (Mask & 0xff) == 0xf0) { + // One continuous range from r4 to r8-r11 + for (int I = 11; I >= 8; I--) { + if (Mask & (1 << I)) { + emitARMWinUnwindCode(Win64EH::UOP_WideSaveRegsR4R11LR, I, Lr); + return; + } + } + // If it actually was from r4 to r4-r7, continue below. + } else if (!Wide) { + // One continuous range from r4 to r4-r7 + for (int I = 7; I >= 4; I--) { + if (Mask & (1 << I)) { + emitARMWinUnwindCode(Win64EH::UOP_SaveRegsR4R7LR, I, Lr); + return; + } + } + llvm_unreachable("logic error"); + } + } + Mask |= Lr << 14; + if (Wide) + emitARMWinUnwindCode(Win64EH::UOP_WideSaveRegMask, Mask, 0); + else + emitARMWinUnwindCode(Win64EH::UOP_SaveRegMask, Mask, 0); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFISaveSP(unsigned Reg) { + emitARMWinUnwindCode(Win64EH::UOP_SaveSP, Reg, 0); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFISaveFRegs(unsigned First, + unsigned Last) { + assert(First <= Last); + assert(First >= 16 || Last < 16); + assert(First <= 31 && Last <= 31); + if (First == 8) + emitARMWinUnwindCode(Win64EH::UOP_SaveFRegD8D15, Last, 0); + else if (First <= 15) + emitARMWinUnwindCode(Win64EH::UOP_SaveFRegD0D15, First, Last); + else + emitARMWinUnwindCode(Win64EH::UOP_SaveFRegD16D31, First, Last); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFISaveLR(unsigned Offset) { + emitARMWinUnwindCode(Win64EH::UOP_SaveLR, 0, Offset); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFINop(bool Wide) { + if (Wide) + emitARMWinUnwindCode(Win64EH::UOP_WideNop, -1, 0); + else + emitARMWinUnwindCode(Win64EH::UOP_Nop, -1, 0); +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFIPrologEnd(bool Fragment) { + auto &S = getStreamer(); + WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); + if (!CurFrame) + return; + + MCSymbol *Label = S.emitCFILabel(); + CurFrame->PrologEnd = Label; + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); + auto it = CurFrame->Instructions.begin(); + CurFrame->Instructions.insert(it, Inst); + CurFrame->Fragment = Fragment; +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFIEpilogStart(unsigned Condition) { + auto &S = getStreamer(); + WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); + if (!CurFrame) + return; + + InEpilogCFI = true; + CurrentEpilog = S.emitCFILabel(); + CurFrame->EpilogMap[CurrentEpilog].Condition = Condition; +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFIEpilogEnd() { + auto &S = getStreamer(); + WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); + if (!CurFrame) + return; + + if (!CurrentEpilog) { + S.getContext().reportError(SMLoc(), "Stray .seh_endepilogue in " + + CurFrame->Function->getName()); + return; + } + + std::vector &Epilog = + CurFrame->EpilogMap[CurrentEpilog].Instructions; + + unsigned UnwindCode = Win64EH::UOP_End; + if (!Epilog.empty()) { + WinEH::Instruction EndInstr = Epilog.back(); + if (EndInstr.Operation == Win64EH::UOP_Nop) { + UnwindCode = Win64EH::UOP_EndNop; + Epilog.pop_back(); + } else if (EndInstr.Operation == Win64EH::UOP_WideNop) { + UnwindCode = Win64EH::UOP_WideEndNop; + Epilog.pop_back(); + } + } + + InEpilogCFI = false; + WinEH::Instruction Inst = WinEH::Instruction(UnwindCode, nullptr, -1, 0); + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); + MCSymbol *Label = S.emitCFILabel(); + 
CurFrame->EpilogMap[CurrentEpilog].End = Label; + CurrentEpilog = nullptr; +} + +void ARMTargetWinCOFFStreamer::emitARMWinCFICustom(unsigned Opcode) { + emitARMWinUnwindCode(Win64EH::UOP_Custom, 0, Opcode); +} + +} // end anonymous namespace + +MCTargetStreamer *llvm::createARMObjectTargetWinCOFFStreamer(MCStreamer &S) { + return new ARMTargetWinCOFFStreamer(S); +} diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index cfd275bc0621..30785340ef12 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -145,7 +145,8 @@ private: // Optimise the base and offsets of the given address bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); // Try to fold consecutive geps together into one - Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); + Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale, + IRBuilder<> &Builder); // Check whether these offsets could be moved out of the loop they're in bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); // Pushes the given add out of the loop @@ -390,7 +391,7 @@ MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) { return ReturnFalse; // Check that the constant is small enough for an incrementing gather - int64_t Immediate = Const.getValue() << TypeScale; + int64_t Immediate = *Const << TypeScale; if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0) return ReturnFalse; @@ -964,7 +965,7 @@ static bool hasAllGatScatUsers(Instruction *I, const DataLayout &DL) { bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI) { - LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n" + LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize: " << *Offsets << "\n"); // Optimise the addresses of gathers/scatters by moving invariant // calculations out of the loop @@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, return true; } -static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, - IRBuilder<> &Builder) { +static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y, + unsigned ScaleY, IRBuilder<> &Builder) { // Splat the non-vector value to a vector of the given type - if the value is // a constant (and its value isn't too big), we can even use this opportunity // to scale it to the size of the vector elements @@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, ConstantInt *ConstYEl = dyn_cast(ConstY->getAggregateElement(i)); if (!ConstXEl || !ConstYEl || - ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= + ConstXEl->getZExtValue() * ScaleX + + ConstYEl->getZExtValue() * ScaleY >= (unsigned)(1 << (TargetElemSize - 1))) return nullptr; } } - Value *Add = Builder.CreateAdd(X, Y); + Value *XScale = Builder.CreateVectorSplat( + XElType->getNumElements(), + Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX)); + Value *YScale = Builder.CreateVectorSplat( + YElType->getNumElements(), + Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY)); + Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale), + Builder.CreateMul(Y, YScale)); - FixedVectorType *GEPType = cast(GEP->getType()); - if (checkOffsetSize(Add, GEPType->getNumElements())) + if (checkOffsetSize(Add, XElType->getNumElements())) return Add; else return nullptr; } Value 
*MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, - Value *&Offsets, + Value *&Offsets, unsigned &Scale, IRBuilder<> &Builder) { Value *GEPPtr = GEP->getPointerOperand(); Offsets = GEP->getOperand(1); + Scale = DL->getTypeAllocSize(GEP->getSourceElementType()); // We only merge geps with constant offsets, because only for those // we can make sure that we do not cause an overflow - if (!isa(Offsets)) + if (GEP->getNumIndices() != 1 || !isa(Offsets)) return nullptr; - GetElementPtrInst *BaseGEP; - if ((BaseGEP = dyn_cast(GEPPtr))) { + if (GetElementPtrInst *BaseGEP = dyn_cast(GEPPtr)) { // Merge the two geps into one - Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); + Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder); if (!BaseBasePtr) return nullptr; - Offsets = - CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); + Offsets = CheckAndCreateOffsetAdd( + Offsets, Scale, GEP->getOperand(1), + DL->getTypeAllocSize(GEP->getSourceElementType()), Builder); if (Offsets == nullptr) return nullptr; + Scale = 1; // Scale is always an i8 at this point. return BaseBasePtr; } return GEPPtr; @@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, Builder.SetInsertPoint(GEP); Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); Value *Offsets; - Value *Base = foldGEP(GEP, Offsets, Builder); + unsigned Scale; + Value *Base = foldGEP(GEP, Offsets, Scale, Builder); // We only want to merge the geps if there is a real chance that they can be // used by an MVE gather; thus the offset has to have the correct size // (always i32 if it is not of vector type) and the base has to be a // pointer. if (Offsets && Base && Base != GEP) { + assert(Scale == 1 && "Expected to fold GEP to a scale of 1"); + Type *BaseTy = Builder.getInt8PtrTy(); + if (auto *VecTy = dyn_cast(Base->getType())) + BaseTy = FixedVectorType::get(BaseTy, VecTy); GetElementPtrInst *NewAddress = GetElementPtrInst::Create( - GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP); - GEP->replaceAllUsesWith(NewAddress); + Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets, + "gep.merged", GEP); + LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP + << "\n new : " << *NewAddress << "\n"); + GEP->replaceAllUsesWith( + Builder.CreateBitCast(NewAddress, GEP->getType())); GEP = NewAddress; Changed = true; } diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp index 538bd10685b0..3e76efb5133f 100644 --- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp +++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp @@ -45,6 +45,7 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMSubtarget.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -176,9 +177,8 @@ static bool tryInterleave(Instruction *Start, // Truncs case Instruction::Trunc: case Instruction::FPTrunc: - if (Truncs.count(I)) + if (!Truncs.insert(I)) continue; - Truncs.insert(I); Visited.insert(I); break; @@ -235,9 +235,8 @@ static bool tryInterleave(Instruction *Start, case Instruction::FAdd: case Instruction::FMul: case Instruction::Select: - if (Ops.count(I)) + if (!Ops.insert(I)) continue; - Ops.insert(I); for (Use &Op : I->operands()) { if (!isa(Op->getType())) diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 
7e31ea77f4f5..6bad9d61238e 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -404,6 +404,17 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { LoopPhi->getOperand(3).setReg(DecReg); } + SmallVector Cond; // For analyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. + if (!TII->analyzeBranch(*LoopEnd->getParent(), TBB, FBB, Cond) && !FBB) { + // If the LoopEnd falls through, need to insert a t2B to the fall-through + // block so that the non-analyzable t2LoopEndDec doesn't fall through. + MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator(); + BuildMI(LoopEnd->getParent(), DebugLoc(), TII->get(ARM::t2B)) + .addMBB(&*MBBI) + .add(predOps(ARMCC::AL)); + } + // Replace the loop dec and loop end as a single instruction. MachineInstrBuilder MI = BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(), @@ -1041,8 +1052,7 @@ bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) { } bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast(Fn.getSubtarget()); + const ARMSubtarget &STI = Fn.getSubtarget(); if (!STI.isThumb2() || !STI.hasLOB()) return false; diff --git a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp index c7f451cba14f..d6d43b9143d6 100644 --- a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp +++ b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -312,8 +312,7 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { } bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast(Fn.getSubtarget()); + const ARMSubtarget &STI = Fn.getSubtarget(); if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) return false; diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 71a82a1e3271..df64710712cc 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -176,7 +176,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // Determine the sizes of each callee-save spill areas and record which frame // belongs to which callee-save spill areas. 
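For orientation before the size-accounting loop below: a minimal sketch, assuming r11 is the frame pointer, of the prologue shape this hunk teaches the pass to recognize (the same sequence appears in the "Skip Frame Record setup" comment further down):

// Editor's sketch of the Thumb1 AAPCS frame-record prologue (r11 assumed):
//
//   push {lr}        // save the return address
//   mov  lr, r11     // stage the caller's frame pointer in a pushable reg
//   push {lr}        // push it: the stacked {FP, LR} pair is the frame record
//   mov  r11, sp     // the new frame pointer addresses that record
//
// FRSize accounts for these two 4-byte slots so the later offset math can
// treat the frame record as its own spill area.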
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + unsigned FRSize = 0, GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; int FramePtrSpillFI = 0; if (ArgRegsSaveSize) { @@ -205,26 +205,38 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, return; } + bool HasFrameRecordArea = hasFP(MF) && ARM::hGPRRegClass.contains(FramePtr); + for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); int FI = I.getFrameIdx(); + if (Reg == FramePtr) + FramePtrSpillFI = FI; switch (Reg) { + case ARM::R11: + if (HasFrameRecordArea) { + FRSize += 4; + break; + } + LLVM_FALLTHROUGH; case ARM::R8: case ARM::R9: case ARM::R10: - case ARM::R11: if (STI.splitFramePushPop(MF)) { GPRCS2Size += 4; break; } LLVM_FALLTHROUGH; + case ARM::LR: + if (HasFrameRecordArea) { + FRSize += 4; + break; + } + LLVM_FALLTHROUGH; case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; GPRCS1Size += 4; break; default: @@ -232,18 +244,53 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } } + MachineBasicBlock::iterator FRPush, GPRCS1Push, GPRCS2Push; + if (HasFrameRecordArea) { + // Skip Frame Record setup: + // push {lr} + // mov lr, r11 + // push {lr} + std::advance(MBBI, 2); + FRPush = MBBI++; + } + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { + GPRCS1Push = MBBI; ++MBBI; } + // Find last push instruction for GPRCS2 - spilling of high registers + // (r8-r11) could consist of multiple tPUSH and tMOVr instructions. + while (true) { + MachineBasicBlock::iterator OldMBBI = MBBI; + // Skip a run of tMOVr instructions + while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tMOVr && + MBBI->getFlag(MachineInstr::FrameSetup)) + MBBI++; + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH && + MBBI->getFlag(MachineInstr::FrameSetup)) { + GPRCS2Push = MBBI; + MBBI++; + } else { + // We have reached an instruction which is not a push, so the previous + // run of tMOVr instructions (which may have been empty) was not part of + // the prologue. Reset MBBI back to the last PUSH of the prologue. + MBBI = OldMBBI; + break; + } + } + // Determine starting offsets of spill areas. 
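The offset bookkeeping that follows reads more easily as a small pure function. A sketch under assumed names (computeSpillAreaOffsets is hypothetical, the numbers illustrative), mirroring the updated formula that now carves out the frame-record area as well:

struct SpillAreaOffsets {
  unsigned DPRCS, GPRCS2, GPRCS1;
};

// Areas are laid out from the stack bottom up, so each offset is the
// previous one plus that area's size.
static SpillAreaOffsets computeSpillAreaOffsets(unsigned NumBytes,
                                                unsigned ArgRegsSaveSize,
                                                unsigned FRSize,
                                                unsigned GPRCS1Size,
                                                unsigned GPRCS2Size,
                                                unsigned DPRCSSize) {
  SpillAreaOffsets O;
  O.DPRCS = NumBytes - ArgRegsSaveSize -
            (FRSize + GPRCS1Size + GPRCS2Size + DPRCSSize);
  O.GPRCS2 = O.DPRCS + DPRCSSize;
  O.GPRCS1 = O.GPRCS2 + GPRCS2Size;
  return O;
}
// e.g. NumBytes=40, ArgRegsSaveSize=0, FRSize=8, GPRCS1Size=12, GPRCS2Size=4,
// DPRCSSize=0 gives DPRCS@16, GPRCS2@16, GPRCS1@20.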
- unsigned DPRCSOffset = NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned DPRCSOffset = NumBytes - ArgRegsSaveSize - + (FRSize + GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; bool HasFP = hasFP(MF); if (HasFP) AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); + if (HasFrameRecordArea) + AFI->setFrameRecordSavedAreaSize(FRSize); AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); @@ -252,71 +299,45 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, int FramePtrOffsetInBlock = 0; unsigned adjustedGPRCS1Size = GPRCS1Size; if (GPRCS1Size > 0 && GPRCS2Size == 0 && - tryFoldSPUpdateIntoPushPop(STI, MF, &*std::prev(MBBI), NumBytes)) { + tryFoldSPUpdateIntoPushPop(STI, MF, &*(GPRCS1Push), NumBytes)) { FramePtrOffsetInBlock = NumBytes; adjustedGPRCS1Size += NumBytes; NumBytes = 0; } - - if (adjustedGPRCS1Size) { - CFAOffset += adjustedGPRCS1Size; - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } - for (const CalleeSavedInfo &I : CSI) { - Register Reg = I.getReg(); - int FI = I.getFrameIdx(); - switch (Reg) { - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - case ARM::R12: - if (STI.splitFramePushPop(MF)) - break; - LLVM_FALLTHROUGH; - case ARM::R0: - case ARM::R1: - case ARM::R2: - case ARM::R3: - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - break; - } - } + CFAOffset += adjustedGPRCS1Size; // Adjust FP so it point to the stack slot that contains the previous FP. if (HasFP) { - FramePtrOffsetInBlock += - MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize; - BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addReg(ARM::SP) - .addImm(FramePtrOffsetInBlock / 4) - .setMIFlags(MachineInstr::FrameSetup) - .add(predOps(ARMCC::AL)); + MachineBasicBlock::iterator AfterPush = + HasFrameRecordArea ? std::next(FRPush) : std::next(GPRCS1Push); + if (HasFrameRecordArea) { + // We have just finished pushing the previous FP into the stack, + // so simply capture the SP value as the new Frame Pointer. 
+ BuildMI(MBB, AfterPush, dl, TII.get(ARM::tMOVr), FramePtr) + .addReg(ARM::SP) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)); + } else { + FramePtrOffsetInBlock += + MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize; + BuildMI(MBB, AfterPush, dl, TII.get(ARM::tADDrSPi), FramePtr) + .addReg(ARM::SP) + .addImm(FramePtrOffsetInBlock / 4) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)); + } + if(FramePtrOffsetInBlock) { - CFAOffset -= FramePtrOffsetInBlock; unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + nullptr, MRI->getDwarfRegNum(FramePtr, true), (CFAOffset - FramePtrOffsetInBlock))); + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( nullptr, MRI->getDwarfRegNum(FramePtr, true))); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } @@ -326,45 +347,69 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, AFI->setShouldRestoreSPFromFP(true); } - // Skip past the spilling of r8-r11, which could consist of multiple tPUSH - // and tMOVr instructions. We don't need to add any call frame information - // in-between these instructions, because they do not modify the high - // registers. - while (true) { - MachineBasicBlock::iterator OldMBBI = MBBI; - // Skip a run of tMOVr instructions - while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tMOVr) - MBBI++; - if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { - MBBI++; - } else { - // We have reached an instruction which is not a push, so the previous - // run of tMOVr instructions (which may have been empty) was not part of - // the prologue. Reset MBBI back to the last PUSH of the prologue. - MBBI = OldMBBI; - break; + // Emit call frame information for the callee-saved low registers. + if (GPRCS1Size > 0) { + MachineBasicBlock::iterator Pos = std::next(GPRCS1Push); + if (adjustedGPRCS1Size) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + for (const CalleeSavedInfo &I : CSI) { + Register Reg = I.getReg(); + int FI = I.getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.splitFramePushPop(MF)) + break; + LLVM_FALLTHROUGH; + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + break; + } } } // Emit call frame information for the callee-saved high registers. 
- for (auto &I : CSI) { - Register Reg = I.getReg(); - int FI = I.getFrameIdx(); - switch (Reg) { - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - case ARM::R12: { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - break; - } - default: - break; + if (GPRCS2Size > 0) { + MachineBasicBlock::iterator Pos = std::next(GPRCS2Push); + for (auto &I : CSI) { + Register Reg = I.getReg(); + int FI = I.getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: { + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + default: + break; + } } } @@ -453,21 +498,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, MF.getProperties().reset(MachineFunctionProperties::Property::NoVRegs); } -static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) { - if (MI.getOpcode() == ARM::tLDRspi && MI.getOperand(1).isFI() && - isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs)) - return true; - else if (MI.getOpcode() == ARM::tPOP) { - return true; - } else if (MI.getOpcode() == ARM::tMOVr) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) && - ARM::hGPRRegClass.contains(Dst)); - } - return false; -} - void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); @@ -483,26 +513,26 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, int NumBytes = (int)MFI.getStackSize(); assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); Register FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize, ARM::NoRegister, - MachineInstr::NoFlags); + MachineInstr::FrameDestroy); } else { // Unwind MBBI to point to first LDR / VLDRD. if (MBBI != MBB.begin()) { do --MBBI; - while (MBBI != MBB.begin() && isCSRestore(*MBBI, CSRegs)); - if (!isCSRestore(*MBBI, CSRegs)) + while (MBBI != MBB.begin() && MBBI->getFlag(MachineInstr::FrameDestroy)); + if (!MBBI->getFlag(MachineInstr::FrameDestroy)) ++MBBI; } // Move SP to start of FP callee save spill area. 
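As a worked example of the subtraction below (hypothetical helper, illustrative numbers; the frame-record size is the newly added term):

// Editor's sketch: bytes of locals the epilogue must deallocate before SP
// reaches the frame-record/callee-save block.
static unsigned localsToDeallocate(unsigned StackSize, unsigned FRSize,
                                   unsigned GPRCS1Size, unsigned GPRCS2Size,
                                   unsigned DPRCSSize,
                                   unsigned ArgRegsSaveSize) {
  // e.g. 40 - (8 + 12 + 4 + 0 + 0) = 16
  return StackSize -
         (FRSize + GPRCS1Size + GPRCS2Size + DPRCSSize + ArgRegsSaveSize);
}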
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + NumBytes -= (AFI->getFrameRecordSavedAreaSize() + + AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() + AFI->getDPRCalleeSavedAreaSize() + ArgRegsSaveSize); @@ -516,14 +546,16 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, assert(!MFI.getPristineRegs(MF).test(ARM::R4) && "No scratch register to restore SP from FP!"); emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, - TII, *RegInfo); + TII, *RegInfo, MachineInstr::FrameDestroy); BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(ARM::R4) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); } else BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(FramePtr) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); } else { // For a large stack frame, we might need a scratch register to store // the size of the frame. We know all callee-save registers are free @@ -542,10 +574,10 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes)) emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes, - ScratchRegister, MachineInstr::NoFlags); + ScratchRegister, MachineInstr::FrameDestroy); } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes)) emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes, - ScratchRegister, MachineInstr::NoFlags); + ScratchRegister, MachineInstr::FrameDestroy); } } @@ -637,7 +669,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, return true; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); // Copy implicit ops and popped registers, if any. for (auto MO: MBBI->operands()) if (MO.isReg() && (MO.isImplicit() || MO.isDef())) @@ -725,18 +758,20 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Define) .addReg(ARM::SP) .addImm(MBBI->getNumExplicitOperands() - 2) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); // Move from the temporary register to the LR. BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) .addReg(ARM::LR, RegState::Define) .addReg(PopReg, RegState::Kill) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); // Advance past the pop instruction. MBBI++; // Increment the SP. emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize + 4, ARM::NoRegister, - MachineInstr::NoFlags); + MachineInstr::FrameDestroy); return true; } @@ -746,7 +781,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) .addReg(TemporaryReg, RegState::Define) .addReg(PopReg, RegState::Kill) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); } if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) { @@ -754,7 +790,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // perform the opposite conversion: tPOP_RET to tPOP. 
     MachineInstrBuilder MIB =
         BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))
-            .add(predOps(ARMCC::AL));
+            .add(predOps(ARMCC::AL))
+            .setMIFlag(MachineInstr::FrameDestroy);
     bool Popped = false;
     for (auto MO: MBBI->operands())
       if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
@@ -769,90 +806,82 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
     // Erase the old instruction.
     MBB.erase(MBBI);
     MBBI = BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET))
-               .add(predOps(ARMCC::AL));
+               .add(predOps(ARMCC::AL))
+               .setMIFlag(MachineInstr::FrameDestroy);
   }
 
   assert(PopReg && "Do not know how to get LR");
   BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))
       .add(predOps(ARMCC::AL))
-      .addReg(PopReg, RegState::Define);
+      .addReg(PopReg, RegState::Define)
+      .setMIFlag(MachineInstr::FrameDestroy);
 
   emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize,
-                               ARM::NoRegister, MachineInstr::NoFlags);
+                               ARM::NoRegister, MachineInstr::FrameDestroy);
 
   BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
       .addReg(ARM::LR, RegState::Define)
       .addReg(PopReg, RegState::Kill)
-      .add(predOps(ARMCC::AL));
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::FrameDestroy);
 
   if (TemporaryReg)
     BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
         .addReg(PopReg, RegState::Define)
        .addReg(TemporaryReg, RegState::Kill)
-        .add(predOps(ARMCC::AL));
+        .add(predOps(ARMCC::AL))
+        .setMIFlag(MachineInstr::FrameDestroy);
 
   return true;
 }
 
-using ARMRegSet = std::bitset<ARM::NUM_TARGET_REGS>;
-
-// Return the first iteraror after CurrentReg which is present in EnabledRegs,
-// or OrderEnd if no further registers are in that set. This does not advance
-// the iterator fiorst, so returns CurrentReg if it is in EnabledRegs.
-static const unsigned *findNextOrderedReg(const unsigned *CurrentReg,
-                                          const ARMRegSet &EnabledRegs,
-                                          const unsigned *OrderEnd) {
-  while (CurrentReg != OrderEnd && !EnabledRegs[*CurrentReg])
-    ++CurrentReg;
-  return CurrentReg;
-}
-
-bool Thumb1FrameLowering::spillCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
-  if (CSI.empty())
-    return false;
-
-  DebugLoc DL;
-  const TargetInstrInfo &TII = *STI.getInstrInfo();
-  MachineFunction &MF = *MBB.getParent();
-  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
-
-  ARMRegSet LoRegsToSave;  // r0-r7, lr
-  ARMRegSet HiRegsToSave;  // r8-r11
-  ARMRegSet CopyRegs;      // Registers which can be used after pushing
-                           // LoRegs for saving HiRegs.
-
-  for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
-    Register Reg = I.getReg();
-
+static const SmallVector<Register> OrderedLowRegs = {ARM::R4, ARM::R5, ARM::R6,
+                                                     ARM::R7, ARM::LR};
+static const SmallVector<Register> OrderedHighRegs = {ARM::R8, ARM::R9,
+                                                      ARM::R10, ARM::R11};
+static const SmallVector<Register> OrderedCopyRegs = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4,
+    ARM::R5, ARM::R6, ARM::R7, ARM::LR};
+
+static void splitLowAndHighRegs(const std::set<Register> &Regs,
+                                std::set<Register> &LowRegs,
+                                std::set<Register> &HighRegs) {
+  for (Register Reg : Regs) {
     if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
-      LoRegsToSave[Reg] = true;
+      LowRegs.insert(Reg);
     } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) {
-      HiRegsToSave[Reg] = true;
+      HighRegs.insert(Reg);
     } else {
       llvm_unreachable("callee-saved register of unexpected class");
     }
-
-    if ((ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) &&
-        !MF.getRegInfo().isLiveIn(Reg) &&
-        !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF)))
-      CopyRegs[Reg] = true;
   }
+}
 
-  // Unused argument registers can be used for the high register saving.
-  for (unsigned ArgReg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
-    if (!MF.getRegInfo().isLiveIn(ArgReg))
-      CopyRegs[ArgReg] = true;
+template <typename It>
+It getNextOrderedReg(It OrderedStartIt, It OrderedEndIt,
+                     const std::set<Register> &RegSet) {
+  return std::find_if(OrderedStartIt, OrderedEndIt,
+                      [&](Register Reg) { return RegSet.count(Reg); });
+}
 
-  // Push the low registers and lr
+static void pushRegsToStack(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const TargetInstrInfo &TII,
+                            const std::set<Register> &RegsToSave,
+                            const std::set<Register> &CopyRegs) {
+  MachineFunction &MF = *MBB.getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  if (!LoRegsToSave.none()) {
+  DebugLoc DL;
+
+  std::set<Register> LowRegs, HighRegs;
+  splitLowAndHighRegs(RegsToSave, LowRegs, HighRegs);
+
+  // Push low regs first
+  if (!LowRegs.empty()) {
     MachineInstrBuilder MIB =
         BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
-    for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
-      if (LoRegsToSave[Reg]) {
+    for (unsigned Reg : OrderedLowRegs) {
+      if (LowRegs.count(Reg)) {
         bool isKill = !MRI.isLiveIn(Reg);
         if (isKill && !MRI.isReserved(Reg))
           MBB.addLiveIn(Reg);
@@ -863,31 +892,26 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters(
     MIB.setMIFlags(MachineInstr::FrameSetup);
   }
 
-  // Push the high registers. There are no store instructions that can access
-  // these registers directly, so we have to move them to low registers, and
-  // push them. This might take multiple pushes, as it is possible for there to
+  // Now push the high registers
+  // There are no store instructions that can access high registers directly,
+  // so we have to move them to low registers, and push them.
+  // This might take multiple pushes, as it is possible for there to
   // be fewer low registers available than high registers which need saving.
-  // These are in reverse order so that in the case where we need to use
+  // Find the first register to save.
+  // Registers must be processed in reverse order so that in case we need to use
   // multiple PUSH instructions, the order of the registers on the stack still
   // matches the unwind info. They need to be switched back to ascending order
   // before adding to the PUSH instruction.
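To see the reverse-order selection in isolation, a self-contained sketch of the same idiom (toy register numbers and the main driver are illustrative only; the real code walks OrderedHighRegs.rbegin()..rend() exactly like this):

#include <algorithm>
#include <iterator>
#include <set>
#include <vector>

using Register = unsigned;

// Same shape as getNextOrderedReg above: first register at or after First
// that is also in RegSet.
template <typename It>
It getNextOrderedReg(It First, It Last, const std::set<Register> &RegSet) {
  return std::find_if(First, Last,
                      [&](Register Reg) { return RegSet.count(Reg); });
}

int main() {
  const std::vector<Register> OrderedHighRegs = {8, 9, 10, 11}; // r8..r11
  const std::set<Register> HighRegs = {8, 10};                  // to save
  // The reverse walk visits r10 then r8, so even when several PUSH
  // instructions are needed, the saved bytes land on the stack in ascending
  // register order, which is what the unwind info expects.
  for (auto It = getNextOrderedReg(OrderedHighRegs.rbegin(),
                                   OrderedHighRegs.rend(), HighRegs);
       It != OrderedHighRegs.rend();
       It = getNextOrderedReg(std::next(It), OrderedHighRegs.rend(), HighRegs))
    /* emit "mov rCopy, r<*It>" here and collect rCopy for the PUSH */;
}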
- static const unsigned AllCopyRegs[] = {ARM::LR, ARM::R7, ARM::R6, - ARM::R5, ARM::R4, ARM::R3, - ARM::R2, ARM::R1, ARM::R0}; - static const unsigned AllHighRegs[] = {ARM::R11, ARM::R10, ARM::R9, ARM::R8}; + auto HiRegToSave = getNextOrderedReg(OrderedHighRegs.rbegin(), + OrderedHighRegs.rend(), + HighRegs); - const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs); - const unsigned *AllHighRegsEnd = std::end(AllHighRegs); - - // Find the first register to save. - const unsigned *HiRegToSave = findNextOrderedReg( - std::begin(AllHighRegs), HiRegsToSave, AllHighRegsEnd); - - while (HiRegToSave != AllHighRegsEnd) { + while (HiRegToSave != OrderedHighRegs.rend()) { // Find the first low register to use. - const unsigned *CopyReg = - findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd); + auto CopyRegIt = getNextOrderedReg(OrderedCopyRegs.rbegin(), + OrderedCopyRegs.rend(), + CopyRegs); // Create the PUSH, but don't insert it yet (the MOVs need to come first). MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH)) @@ -895,25 +919,29 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters( .setMIFlags(MachineInstr::FrameSetup); SmallVector<unsigned, 4> RegsToPush; - while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { - if (HiRegsToSave[*HiRegToSave]) { + while (HiRegToSave != OrderedHighRegs.rend() && + CopyRegIt != OrderedCopyRegs.rend()) { + if (HighRegs.count(*HiRegToSave)) { bool isKill = !MRI.isLiveIn(*HiRegToSave); if (isKill && !MRI.isReserved(*HiRegToSave)) MBB.addLiveIn(*HiRegToSave); // Emit a MOV from the high reg to the low reg. BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr)) - .addReg(*CopyReg, RegState::Define) + .addReg(*CopyRegIt, RegState::Define) .addReg(*HiRegToSave, getKillRegState(isKill)) .add(predOps(ARMCC::AL)) .setMIFlags(MachineInstr::FrameSetup); // Record the register that must be added to the PUSH. - RegsToPush.push_back(*CopyReg); - - CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd); - HiRegToSave = - findNextOrderedReg(++HiRegToSave, HiRegsToSave, AllHighRegsEnd); + RegsToPush.push_back(*CopyRegIt); + + CopyRegIt = getNextOrderedReg(std::next(CopyRegIt), + OrderedCopyRegs.rend(), + CopyRegs); + HiRegToSave = getNextOrderedReg(std::next(HiRegToSave), + OrderedHighRegs.rend(), + HighRegs); } } @@ -924,84 +952,63 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters( // Insert the PUSH instruction after the MOVs. MBB.insert(MI, PushMIB); } - - return true; } -bool Thumb1FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; +static void popRegsFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MI, + const TargetInstrInfo &TII, + const std::set<Register> &RegsToRestore, + const std::set<Register> &AvailableCopyRegs, + bool IsVarArg, bool HasV5Ops) { + if (RegsToRestore.empty()) + return; MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - - bool isVarArg = AFI->getArgRegsSaveSize() > 0; DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); - ARMRegSet LoRegsToRestore; - ARMRegSet HiRegsToRestore; - // Low registers (r0-r7) which can be used to restore the high registers.
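The reverse iteration above is the subtle part: high registers are consumed from r11 down, and each batch is flipped back to ascending order before being added to its PUSH, so the combined stack image matches one big push. A toy simulation of that invariant (an invented model, not LLVM code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // High registers to save (ascending) and the low regs free for copies.
  const std::vector<int> High = {8, 9, 10, 11};
  const std::vector<int> Copies = {2, 3}; // only two copy regs -> two PUSHes

  // Walk the high regs in reverse; each batch is at most Copies.size() wide,
  // and is reversed back to ascending order before "emitting" the PUSH.
  std::vector<std::vector<int>> Pushes;
  for (auto It = High.rbegin(); It != High.rend();) {
    std::vector<int> Batch;
    for (size_t C = 0; C < Copies.size() && It != High.rend(); ++C, ++It)
      Batch.push_back(*It);
    std::reverse(Batch.begin(), Batch.end());
    Pushes.push_back(Batch);
  }

  // Prints "push {r10, r11}" then "push {r8, r9}"; r11 lands at the highest
  // address, exactly where a single push {r8-r11} would have put it.
  for (const auto &Batch : Pushes) {
    std::printf("push {");
    for (size_t I = 0; I < Batch.size(); ++I)
      std::printf("%sr%d", I ? ", " : "", Batch[I]);
    std::printf("}\n");
  }
  return 0;
}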
- ARMRegSet CopyRegs; + std::set<Register> LowRegs, HighRegs; + splitLowAndHighRegs(RegsToRestore, LowRegs, HighRegs); - for (CalleeSavedInfo I : CSI) { - Register Reg = I.getReg(); - - if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) { - LoRegsToRestore[Reg] = true; - } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) { - HiRegsToRestore[Reg] = true; - } else { - llvm_unreachable("callee-saved register of unexpected class"); - } - - // If this is a low register not used as the frame pointer, we may want to - // use it for restoring the high registers. - if ((ARM::tGPRRegClass.contains(Reg)) && - !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF))) - CopyRegs[Reg] = true; - } - - // If this is a return block, we may be able to use some unused return value - // registers for restoring the high regs. - auto Terminator = MBB.getFirstTerminator(); - if (Terminator != MBB.end() && Terminator->getOpcode() == ARM::tBX_RET) { - CopyRegs[ARM::R0] = true; - CopyRegs[ARM::R1] = true; - CopyRegs[ARM::R2] = true; - CopyRegs[ARM::R3] = true; - for (auto Op : Terminator->implicit_operands()) { - if (Op.isReg()) - CopyRegs[Op.getReg()] = false; - } - } - - static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3, - ARM::R4, ARM::R5, ARM::R6, ARM::R7}; - static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11}; - - const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs); - const unsigned *AllHighRegsEnd = std::end(AllHighRegs); + // Pop the high registers first + // There are no load instructions that can access high registers directly, + // so we have to pop into low registers and then move to the high registers. + // This might take multiple pops, as it is possible for there to + // be fewer low registers available than high registers which need restoring. // Find the first register to restore. - auto HiRegToRestore = findNextOrderedReg(std::begin(AllHighRegs), - HiRegsToRestore, AllHighRegsEnd); + auto HiRegToRestore = getNextOrderedReg(OrderedHighRegs.begin(), + OrderedHighRegs.end(), + HighRegs); + + std::set<Register> CopyRegs = AvailableCopyRegs; + Register LowScratchReg; + if (!HighRegs.empty() && CopyRegs.empty()) { + // No copy regs are available to pop high regs. Let's make use of a return + // register and the scratch register (IP/R12) to copy things around. + LowScratchReg = ARM::R0; + BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr)) + .addReg(ARM::R12, RegState::Define) + .addReg(LowScratchReg, RegState::Kill) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); + CopyRegs.insert(LowScratchReg); + } - while (HiRegToRestore != AllHighRegsEnd) { - assert(!CopyRegs.none()); + while (HiRegToRestore != OrderedHighRegs.end()) { + assert(!CopyRegs.empty()); // Find the first low register to use. - auto CopyReg = - findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd); + auto CopyReg = getNextOrderedReg(OrderedCopyRegs.begin(), + OrderedCopyRegs.end(), + CopyRegs); // Create the POP instruction. - MachineInstrBuilder PopMIB = - BuildMI(MBB, MI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); + MachineInstrBuilder PopMIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPOP)) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); - while (HiRegToRestore != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { + while (HiRegToRestore != OrderedHighRegs.end() && + CopyReg != OrderedCopyRegs.end()) { // Add the low register to the POP.
PopMIB.addReg(*CopyReg, RegState::Define); @@ -1009,64 +1016,189 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr)) .addReg(*HiRegToRestore, RegState::Define) .addReg(*CopyReg, RegState::Kill) - .add(predOps(ARMCC::AL)); - - CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd); - HiRegToRestore = - findNextOrderedReg(++HiRegToRestore, HiRegsToRestore, AllHighRegsEnd); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); + + CopyReg = getNextOrderedReg(std::next(CopyReg), + OrderedCopyRegs.end(), + CopyRegs); + HiRegToRestore = getNextOrderedReg(std::next(HiRegToRestore), + OrderedHighRegs.end(), + HighRegs); } } - MachineInstrBuilder MIB = - BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); - - bool NeedsPop = false; - for (CalleeSavedInfo &Info : llvm::reverse(CSI)) { - Register Reg = Info.getReg(); - - // High registers (excluding lr) have already been dealt with - if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR)) - continue; - - if (Reg == ARM::LR) { - Info.setRestored(false); - if (!MBB.succ_empty() || - MI->getOpcode() == ARM::TCRETURNdi || - MI->getOpcode() == ARM::TCRETURNri) - // LR may only be popped into PC, as part of return sequence. - // If this isn't the return sequence, we'll need emitPopSpecialFixUp - // to restore LR the hard way. - // FIXME: if we don't pass any stack arguments it would be actually - // advantageous *and* correct to do the conversion to an ordinary call - // instruction here. - continue; - // Special epilogue for vararg functions. See emitEpilogue - if (isVarArg) - continue; - // ARMv4T requires BX, see emitEpilogue - if (!STI.hasV5TOps()) - continue; + // Restore low register used as scratch if necessary + if (LowScratchReg.isValid()) { + BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr)) + .addReg(LowScratchReg, RegState::Define) + .addReg(ARM::R12, RegState::Kill) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); + } - // CMSE entry functions must return via BXNS, see emitEpilogue. - if (AFI->isCmseNSEntryFunction()) + // Now pop the low registers + if (!LowRegs.empty()) { + MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); + + bool NeedsPop = false; + for (Register Reg : OrderedLowRegs) { + if (!LowRegs.count(Reg)) continue; - // Pop LR into PC. - Reg = ARM::PC; - (*MIB).setDesc(TII.get(ARM::tPOP_RET)); - if (MI != MBB.end()) - MIB.copyImplicitOps(*MI); - MI = MBB.erase(MI); + if (Reg == ARM::LR) { + if (!MBB.succ_empty() || + MI->getOpcode() == ARM::TCRETURNdi || + MI->getOpcode() == ARM::TCRETURNri) + // LR may only be popped into PC, as part of return sequence. + // If this isn't the return sequence, we'll need emitPopSpecialFixUp + // to restore LR the hard way. + // FIXME: if we don't pass any stack arguments it would be actually + // advantageous *and* correct to do the conversion to an ordinary call + // instruction here. + continue; + // Special epilogue for vararg functions. See emitEpilogue + if (IsVarArg) + continue; + // ARMv4T requires BX, see emitEpilogue + if (!HasV5Ops) + continue; + + // CMSE entry functions must return via BXNS, see emitEpilogue. + if (AFI->isCmseNSEntryFunction()) + continue; + + // Pop LR into PC. 
+ Reg = ARM::PC; + (*MIB).setDesc(TII.get(ARM::tPOP_RET)); + if (MI != MBB.end()) + MIB.copyImplicitOps(*MI); + MI = MBB.erase(MI); + } + MIB.addReg(Reg, getDefRegState(true)); + NeedsPop = true; } - MIB.addReg(Reg, getDefRegState(true)); - NeedsPop = true; + + // It's illegal to emit pop instruction without operands. + if (NeedsPop) + MBB.insert(MI, &*MIB); + else + MF.deleteMachineInstr(MIB); + } +} + +bool Thumb1FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + const TargetInstrInfo &TII = *STI.getInstrInfo(); + MachineFunction &MF = *MBB.getParent(); + const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + Register FPReg = RegInfo->getFrameRegister(MF); + + // In case FP is a high reg, we need a separate push sequence to generate + // a correct Frame Record + bool NeedsFrameRecordPush = hasFP(MF) && ARM::hGPRRegClass.contains(FPReg); + + std::set<Register> FrameRecord; + std::set<Register> SpilledGPRs; + for (const CalleeSavedInfo &I : CSI) { + Register Reg = I.getReg(); + if (NeedsFrameRecordPush && (Reg == FPReg || Reg == ARM::LR)) + FrameRecord.insert(Reg); + else + SpilledGPRs.insert(Reg); } - // It's illegal to emit pop instruction without operands. - if (NeedsPop) - MBB.insert(MI, &*MIB); - else - MF.deleteMachineInstr(MIB); + pushRegsToStack(MBB, MI, TII, FrameRecord, {ARM::LR}); + + // Determine intermediate registers which can be used for pushing high regs: + // - Spilled low regs + // - Unused argument registers + std::set<Register> CopyRegs; + for (Register Reg : SpilledGPRs) + if ((ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) && + !MF.getRegInfo().isLiveIn(Reg) && !(hasFP(MF) && Reg == FPReg)) + CopyRegs.insert(Reg); + for (unsigned ArgReg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) + if (!MF.getRegInfo().isLiveIn(ArgReg)) + CopyRegs.insert(ArgReg); + + pushRegsToStack(MBB, MI, TII, SpilledGPRs, CopyRegs); + + return true; +} + +bool Thumb1FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + bool IsVarArg = AFI->getArgRegsSaveSize() > 0; + Register FPReg = RegInfo->getFrameRegister(MF); + + // In case FP is a high reg, we need a separate pop sequence to generate + // a correct Frame Record + bool NeedsFrameRecordPop = hasFP(MF) && ARM::hGPRRegClass.contains(FPReg); + + std::set<Register> FrameRecord; + std::set<Register> SpilledGPRs; + for (CalleeSavedInfo &I : CSI) { + Register Reg = I.getReg(); + if (NeedsFrameRecordPop && (Reg == FPReg || Reg == ARM::LR)) + FrameRecord.insert(Reg); + else + SpilledGPRs.insert(Reg); + + if (Reg == ARM::LR) + I.setRestored(false); + } + + // Determine intermediate registers which can be used for popping high regs: + // - Spilled low regs + // - Unused return registers + std::set<Register> CopyRegs; + std::set<Register> UnusedReturnRegs; + for (Register Reg : SpilledGPRs) + if ((ARM::tGPRRegClass.contains(Reg)) && !(hasFP(MF) && Reg == FPReg)) + CopyRegs.insert(Reg); + auto Terminator = MBB.getFirstTerminator(); + if (Terminator != MBB.end() && Terminator->getOpcode() == ARM::tBX_RET) { + UnusedReturnRegs.insert(ARM::R0); + UnusedReturnRegs.insert(ARM::R1); +
UnusedReturnRegs.insert(ARM::R2); + UnusedReturnRegs.insert(ARM::R3); + for (auto Op : Terminator->implicit_operands()) { + if (Op.isReg()) + UnusedReturnRegs.erase(Op.getReg()); + } + } + CopyRegs.insert(UnusedReturnRegs.begin(), UnusedReturnRegs.end()); + + // First pop regular spilled regs. + popRegsFromStack(MBB, MI, TII, SpilledGPRs, CopyRegs, IsVarArg, + STI.hasV5TOps()); + + // LR may only be popped into pc, as part of a return sequence. + // Check that no other pop instructions are inserted after that. + assert((!SpilledGPRs.count(ARM::LR) || FrameRecord.empty()) && + "Can't insert pop after return sequence"); + + // Now pop Frame Record regs. + // Only unused return registers can be used as copy regs at this point. + popRegsFromStack(MBB, MI, TII, FrameRecord, UnusedReturnRegs, IsVarArg, + STI.hasV5TOps()); return true; } diff --git a/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 5cdaa7f02201..155555152ced 100644 --- a/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -226,9 +226,10 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) { ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); unsigned Mask = 0, Pos = 3; - // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it + // IT blocks are limited to one conditional op if -arm-restrict-it // is set: skip the loop if (!restrictIT) { + LLVM_DEBUG(dbgs() << "Allowing complex IT block\n";); // Branches, including tricky ones like LDM_RET, need to end an IT // block so check the instruction we just put in the block. for (; MBBI != E && Pos && @@ -283,8 +284,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) { } bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); if (!STI.isThumb2()) return false; AFI = Fn.getInfo<ARMFunctionInfo>(); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index ebd139af2219..60dbc7b92013 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -555,7 +555,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII.get(ARM::tMOVr)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); // Remove offset and remaining explicit predicate operands. - do MI.RemoveOperand(FrameRegIdx+1); + do MI.removeOperand(FrameRegIdx+1); while (MI.getNumOperands() > FrameRegIdx+1); MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI); MIB.add(predOps(ARMCC::AL)); @@ -592,7 +592,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); // Remove the cc_out operand.
if (HasCCOut) - MI.RemoveOperand(MI.getNumOperands()-1); + MI.removeOperand(MI.getNumOperands()-1); Offset = 0; return true; } @@ -626,7 +626,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return Offset == 0; } - MI.RemoveOperand(FrameRegIdx+1); + MI.removeOperand(FrameRegIdx+1); MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0); NewOpc = immediateOffsetOpcode(Opcode); AddrMode = ARMII::AddrModeT2_i12; diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index 1cc5422523f1..7ae4b19afb60 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" @@ -205,11 +206,11 @@ namespace { bool IsSelfLoop); /// ReduceMI - Attempt to reduce MI, return true on success. - bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, - bool LiveCPSR, bool IsSelfLoop); + bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, bool LiveCPSR, + bool IsSelfLoop, bool SkipPrologueEpilogue); /// ReduceMBB - Reduce width of instructions in the specified basic block. - bool ReduceMBB(MachineBasicBlock &MBB); + bool ReduceMBB(MachineBasicBlock &MBB, bool SkipPrologueEpilogue); bool OptimizeSize; bool MinimizeSize; @@ -620,7 +621,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Transfer MI flags. MIB.setMIFlags(MI->getFlags()); - LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI + LLVM_DEBUG(dbgs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase_instr(MI); @@ -668,7 +669,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, // Transfer MI flags. MIB.setMIFlags(MI->getFlags()); - LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI + LLVM_DEBUG(dbgs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase_instr(MI); @@ -848,7 +849,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // Transfer MI flags. MIB.setMIFlags(MI->getFlags()); - LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI + LLVM_DEBUG(dbgs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase_instr(MI); @@ -971,7 +972,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, // Transfer MI flags. MIB.setMIFlags(MI->getFlags()); - LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI + LLVM_DEBUG(dbgs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase_instr(MI); @@ -1012,11 +1013,15 @@ static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { } bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, - bool LiveCPSR, bool IsSelfLoop) { + bool LiveCPSR, bool IsSelfLoop, + bool SkipPrologueEpilogue) { unsigned Opcode = MI->getOpcode(); DenseMap::iterator OPI = ReduceOpcodeMap.find(Opcode); if (OPI == ReduceOpcodeMap.end()) return false; + if (SkipPrologueEpilogue && (MI->getFlag(MachineInstr::FrameSetup) || + MI->getFlag(MachineInstr::FrameDestroy))) + return false; const ReduceEntry &Entry = ReduceTable[OPI->second]; // Don't attempt normal reductions on "special" cases for now. 
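The early-out added to ReduceMI above consults the same FrameSetup/FrameDestroy flags that the ARM frame lowering now sets, so instructions described by Windows CFI keep their 32-bit encodings. A self-contained sketch of the predicate under assumed toy types (the real ones are MachineInstr and MachineInstr::MIFlag):

#include <cassert>

enum MIFlag { FrameSetup = 1 << 0, FrameDestroy = 1 << 1 };
struct Insn {
  unsigned Flags = 0;
  bool getFlag(MIFlag F) const { return Flags & F; }
};

// Mirrors the early-out added to ReduceMI: never narrow prologue/epilogue
// instructions when the unwind info must describe their exact encodings.
static bool mayReduce(const Insn &MI, bool SkipPrologueEpilogue) {
  return !(SkipPrologueEpilogue &&
           (MI.getFlag(FrameSetup) || MI.getFlag(FrameDestroy)));
}

int main() {
  Insn Prologue{FrameSetup}, Middle{};
  assert(!mayReduce(Prologue, /*SkipPrologueEpilogue=*/true));
  assert(mayReduce(Middle, /*SkipPrologueEpilogue=*/true));
  assert(mayReduce(Prologue, /*SkipPrologueEpilogue=*/false));
  return 0;
}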
@@ -1036,7 +1041,8 @@ bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, return false; } -bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { +bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB, + bool SkipPrologueEpilogue) { bool Modified = false; // Yes, CPSR could be livein. @@ -1080,7 +1086,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Does NextMII belong to the same bundle as MI? bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred(); - if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) { + if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop, SkipPrologueEpilogue)) { Modified = true; MachineBasicBlock::instr_iterator I = std::prev(NextMII); MI = &*I; @@ -1130,7 +1136,7 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { if (PredicateFtor && !PredicateFtor(MF.getFunction())) return false; - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); if (STI->isThumb1Only() || STI->prefers32BitThumb()) return false; @@ -1147,8 +1153,10 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { // predecessors. ReversePostOrderTraversal RPOT(&MF); bool Modified = false; + bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + MF.getFunction().needsUnwindTableEntry(); for (MachineBasicBlock *MBB : RPOT) - Modified |= ReduceMBB(*MBB); + Modified |= ReduceMBB(*MBB, /*SkipPrologueEpilogue=*/NeedsWinCFI); return Modified; } diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp index 5d2bc4ebe191..2a3fa3b31512 100644 --- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -37,7 +37,7 @@ extern cl::opt ReuseFrameIndexVals; using namespace llvm; -ThumbRegisterInfo::ThumbRegisterInfo() {} +ThumbRegisterInfo::ThumbRegisterInfo() = default; const TargetRegisterClass * ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, @@ -338,7 +338,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, static void removeOperands(MachineInstr &MI, unsigned i) { unsigned Op = i; for (unsigned e = MI.getNumOperands(); i != e; ++i) - MI.RemoveOperand(Op); + MI.removeOperand(Op); } /// convertToNonSPOpcode - Change the opcode to the non-SP version, because @@ -361,6 +361,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, const ARMBaseInstrInfo &TII) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); assert(MBB.getParent()->getSubtarget().isThumb1Only() && "This isn't needed for thumb2!"); DebugLoc dl = MI.getDebugLoc(); @@ -396,7 +397,18 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, if ((unsigned)Offset <= Mask * Scale) { // Replace the FrameIndex with the frame register (e.g., sp). - MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + Register DestReg = FrameReg; + + // In case FrameReg is a high register, move it to a low reg to ensure it + // can be used as an operand. 
+ if (ARM::hGPRRegClass.contains(FrameReg) && FrameReg != ARM::SP) { + DestReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); + BuildMI(MBB, II, dl, TII.get(ARM::tMOVr), DestReg) + .addReg(FrameReg) + .add(predOps(ARMCC::AL)); + } + + MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false); ImmOp.ChangeToImmediate(ImmedOffset); // If we're using a register where sp was stored, convert the instruction @@ -517,7 +529,16 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset, false, TII, *this); else { emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset); - UseRR = true; + if (!ARM::hGPRRegClass.contains(FrameReg)) { + UseRR = true; + } else { + // If FrameReg is a high register, add the reg values in a separate + // instruction as the load won't be able to access it. + BuildMI(MBB, II, dl, TII.get(ARM::tADDhirr), TmpReg) + .addReg(TmpReg) + .addReg(FrameReg) + .add(predOps(ARMCC::AL)); + } } } else { emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII, @@ -526,11 +547,14 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi)); MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true); - if (UseRR) + if (UseRR) { + assert(!ARM::hGPRRegClass.contains(FrameReg) && + "Thumb1 loads can't use high register"); // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame // register. The offset is already handled in the vreg value. MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false, false); + } } else if (MI.mayStore()) { VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); bool UseRR = false; @@ -541,18 +565,30 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset, false, TII, *this); else { emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); - UseRR = true; + if (!ARM::hGPRRegClass.contains(FrameReg)) { + UseRR = true; + } else { + // If FrameReg is a high register, add the reg values in a separate + // instruction as the load won't be able to access it. + BuildMI(MBB, II, dl, TII.get(ARM::tADDhirr), VReg) + .addReg(VReg) + .addReg(FrameReg) + .add(predOps(ARMCC::AL)); + } } } else emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII, *this); MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi)); MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true); - if (UseRR) + if (UseRR) { + assert(!ARM::hGPRRegClass.contains(FrameReg) && + "Thumb1 stores can't use high register"); // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame // register. The offset is already handled in the vreg value. 
MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false, false); + } } else { llvm_unreachable("Unexpected opcode!"); } diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h index 0b512172ba10..d29dc5f70e72 100644 --- a/llvm/lib/Target/AVR/AVR.h +++ b/llvm/lib/Target/AVR/AVR.h @@ -15,6 +15,8 @@ #define LLVM_AVR_H #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -27,12 +29,10 @@ FunctionPass *createAVRISelDag(AVRTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createAVRExpandPseudoPass(); FunctionPass *createAVRFrameAnalyzerPass(); -FunctionPass *createAVRRelaxMemPass(); FunctionPass *createAVRBranchSelectionPass(); void initializeAVRShiftExpandPass(PassRegistry &); void initializeAVRExpandPseudoPass(PassRegistry &); -void initializeAVRRelaxMemPass(PassRegistry &); /// Contains the AVR backend. namespace AVR { diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index 259ab1bc7aec..0001e520b1fb 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -14,6 +14,7 @@ #include "AVR.h" #include "AVRMCInstLower.h" #include "AVRSubtarget.h" +#include "AVRTargetMachine.h" #include "MCTargetDesc/AVRInstPrinter.h" #include "MCTargetDesc/AVRMCExpr.h" #include "TargetInfo/AVRTargetInfo.h" @@ -21,6 +22,7 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Mangler.h" @@ -60,6 +62,8 @@ public: bool doFinalization(Module &M) override; + void emitStartOfAsmFile(Module &M) override; + private: const MCRegisterInfo &MRI; bool EmittedStructorSymbolAttrs = false; @@ -236,6 +240,45 @@ bool AVRAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +void AVRAsmPrinter::emitStartOfAsmFile(Module &M) { + const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget(); + const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl(); + if (!SubTM) + return; + + // Emit __tmp_reg__. + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__tmp_reg__")), + MCConstantExpr::create(SubTM->getRegTmpIndex(), MMI->getContext())); + // Emit __zero_reg__. + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__zero_reg__")), + MCConstantExpr::create(SubTM->getRegZeroIndex(), MMI->getContext())); + // Emit __SREG__. + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__SREG__")), + MCConstantExpr::create(SubTM->getIORegSREG(), MMI->getContext())); + // Emit __SP_H__ if available. + if (!SubTM->hasSmallStack()) + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__SP_H__")), + MCConstantExpr::create(SubTM->getIORegSPH(), MMI->getContext())); + // Emit __SP_L__. + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__SP_L__")), + MCConstantExpr::create(SubTM->getIORegSPL(), MMI->getContext())); + // Emit __EIND__ if available. + if (SubTM->hasEIJMPCALL()) + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__EIND__")), + MCConstantExpr::create(SubTM->getIORegEIND(), MMI->getContext())); + // Emit __RAMPZ__ if available. 
+ if (SubTM->hasELPM()) + OutStreamer->emitAssignment( + MMI->getContext().getOrCreateSymbol(StringRef("__RAMPZ__")), + MCConstantExpr::create(SubTM->getIORegRAMPZ(), MMI->getContext())); +} + } // end of namespace llvm extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() { diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td index b4bc35e191c0..314d59bc2a59 100644 --- a/llvm/lib/Target/AVR/AVRCallingConv.td +++ b/llvm/lib/Target/AVR/AVRCallingConv.td @@ -27,6 +27,8 @@ def RetCC_AVR_BUILTIN : CallingConv<[ // Calling convention for variadic functions. def ArgCC_AVR_Vararg : CallingConv<[ + // i8 are always passed through the stack with a byte slot and byte alignment. + CCIfType<[i8], CCAssignToStack<1, 1>>, // i16 are always passed through the stack with an alignment of 1. CCAssignToStack<2, 1> ]>; @@ -36,4 +38,6 @@ def ArgCC_AVR_Vararg : CallingConv<[ //===----------------------------------------------------------------------===// def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>; +def CSR_NormalTiny : CalleeSavedRegs<(add R29, R28, R19, R18)>; def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 2))>; +def CSR_InterruptsTiny : CalleeSavedRegs<(add(sequence "R%u", 31, 18))>; diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td index 7ad0fe904a81..3eb5a16204e7 100644 --- a/llvm/lib/Target/AVR/AVRDevices.td +++ b/llvm/lib/Target/AVR/AVRDevices.td @@ -174,15 +174,13 @@ def FamilyAVR35 : Family<"avr35", [FamilyAVR3, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureBREAK]>; -def FamilyAVR4 : Family<"avr4", [ - FamilyAVR2, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, - FeatureBREAK -]>; +def FamilyAVR4 : Family<"avr4", + [FamilyAVR2, FeatureMultiplication, FeatureMOVW, + FeatureLPMX, FeatureSPM, FeatureBREAK]>; -def FamilyAVR5 : Family<"avr5", [ - FamilyAVR3, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, - FeatureBREAK -]>; +def FamilyAVR5 : Family<"avr5", + [FamilyAVR3, FeatureMultiplication, FeatureMOVW, + FeatureLPMX, FeatureSPM, FeatureBREAK]>; def FamilyAVR51 : Family<"avr51", [FamilyAVR5, FeatureELPM, FeatureELPMX]>; @@ -190,14 +188,21 @@ def FamilyAVR6 : Family<"avr6", [FamilyAVR51]>; def FamilyTiny : Family<"avrtiny", - [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding]>; - -def FamilyXMEGA : Family<"xmega", [ - FamilyAVR0, FeatureLPM, FeatureIJMPCALL, FeatureADDSUBIW, FeatureSRAM, - FeatureJMPCALL, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, - FeatureBREAK, FeatureEIJMPCALL, FeatureSPMX, FeatureDES, FeatureELPM, - FeatureELPMX -]>; + [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding, + FeatureSmallStack]>; + +def FamilyXMEGA3 : Family<"xmega3", + [FamilyAVR0, FeatureLPM, FeatureIJMPCALL, + FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL, + FeatureMultiplication, FeatureMOVW, FeatureLPMX, + FeatureBREAK]>; + +def FamilyXMEGA : Family<"xmega", + [FamilyAVR0, FeatureLPM, FeatureIJMPCALL, + FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL, + FeatureMultiplication, FeatureMOVW, FeatureLPMX, + FeatureSPM, FeatureBREAK, FeatureEIJMPCALL, + FeatureSPMX, FeatureDES, FeatureELPM, FeatureELPMX]>; def FamilyXMEGAU : Family<"xmegau", [FamilyXMEGA, FeatureRMW]>; @@ -237,7 +242,7 @@ def : Device<"avr51", FamilyAVR51, ELFArchAVR51>; def : Device<"avr6", FamilyAVR6, ELFArchAVR6>; def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>; def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>; -def : Device<"avrxmega3", FamilyXMEGA, 
ELFArchXMEGA3>; +def : Device<"avrxmega3", FamilyXMEGA3, ELFArchXMEGA3>; def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>; def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>; def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>; @@ -245,41 +250,44 @@ def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>; def : Device<"avrtiny", FamilyTiny, ELFArchTiny>; // Specific MCUs -def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>; -def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>; -def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>; -def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>; -def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>; -def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>; -def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>; -def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>; +// NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0. +def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1, [FeatureSmallStack]>; +def : Device<"attiny11", FamilyAVR1, ELFArchAVR1, [FeatureSmallStack]>; +def : Device<"attiny12", FamilyAVR1, ELFArchAVR1, [FeatureSmallStack]>; +def : Device<"attiny15", FamilyAVR1, ELFArchAVR1, [FeatureSmallStack]>; +def : Device<"attiny28", FamilyAVR1, ELFArchAVR1, [FeatureSmallStack]>; +def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"attiny22", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, + [FeatureLPMX, FeatureSmallStack]>; def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, [FeatureMOVW, FeatureLPMX]>; -def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; +def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>; def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>; def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>; def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>; +def : Device<"ata6616c", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny24", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>; 
def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny25", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; +def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>; @@ -299,6 +307,8 @@ def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>; def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>; def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>; def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>; +def : Device<"ata6617c", FamilyAVR35, ELFArchAVR35>; +def : Device<"ata664251", FamilyAVR35, ELFArchAVR35>; def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>; def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>; def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>; @@ -310,6 +320,7 @@ def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4, [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>; def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6612c", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>; @@ -331,8 +342,17 @@ def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>; def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>; def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>; def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata5702m322", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5782", FamilyAVR5, ELFArchAVR5>; def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5790n", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5791", FamilyAVR5, ELFArchAVR5>; def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5831", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata6613c", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata6614q", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata8210", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata8510", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega161", FamilyAVR3, ELFArchAVR5, @@ -411,6 +431,7 @@ def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>; def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64hve2", FamilyAVR5, ELFArchAVR5>; def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>; def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>; def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>; @@ -452,12 +473,13 @@ def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>; def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>; def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>; def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32c3", FamilyXMEGAU, ELFArchXMEGA2>; def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32d3", FamilyXMEGA, ELFArchXMEGA2>; 
def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>; def : Device<"atxmega32e5", FamilyXMEGAU, ELFArchXMEGA2>; def : Device<"atxmega16e5", FamilyXMEGAU, ELFArchXMEGA2>; def : Device<"atxmega8e5", FamilyXMEGAU, ELFArchXMEGA2>; -def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>; def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>; def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>; def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>; @@ -498,28 +520,39 @@ def : Device<"attiny20", FamilyTiny, ELFArchTiny>; def : Device<"attiny40", FamilyTiny, ELFArchTiny>; def : Device<"attiny102", FamilyTiny, ELFArchTiny>; def : Device<"attiny104", FamilyTiny, ELFArchTiny>; -def : Device<"attiny202", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny402", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny204", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny404", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny804", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1604", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny406", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny806", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1606", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny807", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1607", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny212", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny412", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny214", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny414", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny814", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1614", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny416", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny816", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1616", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny3216", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny417", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny817", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny1617", FamilyXMEGA, ELFArchXMEGA3>; -def : Device<"attiny3217", FamilyXMEGA, ELFArchXMEGA3>; +def : Device<"attiny202", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny402", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny204", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny404", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny804", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1604", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny406", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny806", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1606", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny807", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1607", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny212", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny412", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny214", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny414", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny814", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1614", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny416", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny816", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1616", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny3216", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny417", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny817", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1617", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>; +def : 
Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega1609", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>; +def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>; diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 144ae2b320f9..a9dc9af819e6 100644 --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -54,8 +54,6 @@ private: const Register SCRATCH_REGISTER = AVR::R0; /// The register that will always contain zero. const Register ZERO_REGISTER = AVR::R1; - /// The IO address of the status register. - const unsigned SREG_ADDR = 0x3f; bool expandMBB(Block &MBB); bool expandMI(Block &MBB, BlockIt MBBI); @@ -86,21 +84,23 @@ private: bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI); - bool expandAtomicArithmeticOp(unsigned MemOpcode, unsigned ArithOpcode, - Block &MBB, BlockIt MBBI); - - /// Specific shift implementation. + /// Specific shift implementation for int8. bool expandLSLB7Rd(Block &MBB, BlockIt MBBI); bool expandLSRB7Rd(Block &MBB, BlockIt MBBI); bool expandASRB6Rd(Block &MBB, BlockIt MBBI); bool expandASRB7Rd(Block &MBB, BlockIt MBBI); + + /// Specific shift implementation for int16. bool expandLSLW4Rd(Block &MBB, BlockIt MBBI); bool expandLSRW4Rd(Block &MBB, BlockIt MBBI); + bool expandASRW7Rd(Block &MBB, BlockIt MBBI); bool expandLSLW8Rd(Block &MBB, BlockIt MBBI); bool expandLSRW8Rd(Block &MBB, BlockIt MBBI); bool expandASRW8Rd(Block &MBB, BlockIt MBBI); bool expandLSLW12Rd(Block &MBB, BlockIt MBBI); bool expandLSRW12Rd(Block &MBB, BlockIt MBBI); + bool expandASRW14Rd(Block &MBB, BlockIt MBBI); + bool expandASRW15Rd(Block &MBB, BlockIt MBBI); // Common implementation of LPMWRdZ and ELPMWRdZ. bool expandLPMWELPMW(Block &MBB, BlockIt MBBI, bool IsExt); @@ -141,6 +141,7 @@ bool AVRExpandPseudo::runOnMachineFunction(MachineFunction &MF) { // Continue expanding the block until all pseudos are expanded. do { assert(ExpandCount < 10 && "pseudo expand limit reached"); + (void)ExpandCount; bool BlockModified = expandMBB(MBB); Modified |= BlockModified; @@ -453,7 +454,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { auto MIBHI = buildMI(MBB, MBBI, AVR::NEGRd) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)); + .addReg(DstHiReg, RegState::Kill); // SREG is always implicitly dead MIBHI->getOperand(2).setIsDead(); @@ -917,13 +918,13 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { template bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) { - // Remove the pseudo instruction. MachineInstr &MI = *MBBI; + const AVRSubtarget &STI = MBB.getParent()->getSubtarget(); // Store the SREG. buildMI(MBB, MBBI, AVR::INRdA) .addReg(SCRATCH_REGISTER, RegState::Define) - .addImm(SREG_ADDR); + .addImm(STI.getIORegSREG()); // Disable exceptions. buildMI(MBB, MBBI, AVR::BCLRs).addImm(7); // CLI @@ -931,7 +932,9 @@ bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) { f(MI); // Restore the status reg. 
- buildMI(MBB, MBBI, AVR::OUTARr).addImm(SREG_ADDR).addReg(SCRATCH_REGISTER); + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSREG()) + .addReg(SCRATCH_REGISTER); MI.eraseFromParent(); return true; @@ -955,31 +958,6 @@ bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode, Block &MBB, return expandAtomicBinaryOp(Opcode, MBB, MBBI, [](MachineInstr &MI) {}); } -bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width, - unsigned ArithOpcode, Block &MBB, - BlockIt MBBI) { - return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) { - auto DstReg = MI.getOperand(0).getReg(); - auto PtrOp = MI.getOperand(1); - auto SrcReg = MI.getOperand(2).getReg(); - - unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr; - unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr; - - // FIXME: this returns the new value (after the operation), not the old - // value as the atomicrmw instruction is supposed to do! - - // Create the load - buildMI(MBB, MBBI, LoadOpcode, DstReg).addReg(PtrOp.getReg()); - - // Create the arithmetic op - buildMI(MBB, MBBI, ArithOpcode, DstReg).addReg(DstReg).addReg(SrcReg); - - // Create the store - buildMI(MBB, MBBI, StoreOpcode).add(PtrOp).addReg(DstReg); - }); -} - Register AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); RegScavenger RS; @@ -1025,56 +1003,6 @@ bool AVRExpandPseudo::expand<AVR::AtomicStore16>(Block &MBB, BlockIt MBBI) { return expandAtomicBinaryOp(AVR::STWPtrRr, MBB, MBBI); } -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd8>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(8, AVR::ADDRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd16>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(16, AVR::ADDWRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadSub8>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(8, AVR::SUBRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadSub16>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(16, AVR::SUBWRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd8>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(8, AVR::ANDRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd16>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(16, AVR::ANDWRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadOr8>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(8, AVR::ORRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadOr16>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(16, AVR::ORWRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadXor8>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(8, AVR::EORRdRr, MBB, MBBI); -} - -template <> -bool AVRExpandPseudo::expand<AVR::AtomicLoadXor16>(Block &MBB, BlockIt MBBI) { - return expandAtomicArithmeticOp(16, AVR::EORWRdRr, MBB, MBBI); -} - template <> bool AVRExpandPseudo::expand<AVR::AtomicFence>(Block &MBB, BlockIt MBBI) { // On AVR, there is only one core and so atomic fences do nothing.
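For reference, the surviving expandAtomic wrapper implements atomics on single-core AVR by masking interrupts around the operation: IN to save SREG, CLI, the operation itself, then OUT to restore SREG. A rough user-level equivalent in AVR-flavored C++ (illustrative only; 0x3f is the classic SREG I/O address that this patch replaces with the subtarget's getIORegSREG()):

#include <stdint.h>

// Compile with avr-g++. Mirrors the expander's sequence: save SREG, cli,
// do the memory operation, restore SREG (which also restores the I flag).
static inline uint8_t atomic_load_u8(const volatile uint8_t *Ptr) {
  uint8_t SavedSREG;
  asm volatile("in %0, 0x3f" : "=r"(SavedSREG)); // save status register
  asm volatile("cli" ::: "memory");              // mask interrupts
  uint8_t Value = *Ptr;                          // the guarded operation
  asm volatile("out 0x3f, %0" : : "r"(SavedSREG) : "memory"); // restore SREG
  return Value;
}

int main() {
  volatile uint8_t Cell = 42;
  return atomic_load_u8(&Cell) == 42 ? 0 : 1;
}

Restoring the whole saved SREG, rather than issuing SEI, is what makes the pattern safe inside code that already runs with interrupts disabled.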
@@ -1230,37 +1158,94 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { template <> bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; - Register SrcLoReg, SrcHiReg; + Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(2).getReg(); - unsigned Imm = MI.getOperand(1).getImm(); bool DstIsKill = MI.getOperand(0).isKill(); + unsigned Imm = MI.getOperand(1).getImm(); + Register SrcReg = MI.getOperand(2).getReg(); bool SrcIsKill = MI.getOperand(2).isKill(); - unsigned OpLo = AVR::STDPtrQRr; - unsigned OpHi = AVR::STDPtrQRr; - TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg); - // Since we add 1 to the Imm value for the high byte below, and 63 is the - // highest Imm value allowed for the instruction, 62 is the limit here. - assert(Imm <= 62 && "Offset is out of range"); + // STD's maximum displacement is 63, so larger stores have to be split into a + // set of operations + if (Imm >= 63) { + if (!DstIsKill) { + buildMI(MBB, MBBI, AVR::PUSHWRr).addReg(DstReg); + } - auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstReg) - .addImm(Imm) - .addReg(SrcLoReg, getKillRegState(SrcIsKill)); + buildMI(MBB, MBBI, AVR::SUBIWRdK) + .addReg(DstReg, RegState::Define) + .addReg(DstReg, RegState::Kill) + .addImm(-Imm); - auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addImm(Imm + 1) - .addReg(SrcHiReg, getKillRegState(SrcIsKill)); + buildMI(MBB, MBBI, AVR::STWPtrRr) + .addReg(DstReg, RegState::Kill) + .addReg(SrcReg, getKillRegState(SrcIsKill)); - MIBLO.setMemRefs(MI.memoperands()); - MIBHI.setMemRefs(MI.memoperands()); + if (!DstIsKill) { + buildMI(MBB, MBBI, AVR::POPWRd).addDef(DstReg, RegState::Define); + } + } else { + unsigned OpLo = AVR::STDPtrQRr; + unsigned OpHi = AVR::STDPtrQRr; + Register SrcLoReg, SrcHiReg; + TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg); + + auto MIBLO = buildMI(MBB, MBBI, OpLo) + .addReg(DstReg) + .addImm(Imm) + .addReg(SrcLoReg, getKillRegState(SrcIsKill)); + + auto MIBHI = buildMI(MBB, MBBI, OpHi) + .addReg(DstReg, getKillRegState(DstIsKill)) + .addImm(Imm + 1) + .addReg(SrcHiReg, getKillRegState(SrcIsKill)); + + MIBLO.setMemRefs(MI.memoperands()); + MIBHI.setMemRefs(MI.memoperands()); + } MI.eraseFromParent(); return true; } +template <> +bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + const MachineFunction &MF = *MBB.getParent(); + const AVRSubtarget &STI = MF.getSubtarget(); + + assert(MI.getOperand(0).getReg() == AVR::SP && + "SP is expected as base pointer"); + + assert(STI.getFrameLowering()->hasReservedCallFrame(MF) && + "unexpected STDSPQRr pseudo instruction"); + (void)STI; + + MI.setDesc(TII->get(AVR::STDPtrQRr)); + MI.getOperand(0).setReg(AVR::R29R28); + + return true; +} + +template <> +bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + const MachineFunction &MF = *MBB.getParent(); + const AVRSubtarget &STI = MF.getSubtarget(); + + assert(MI.getOperand(0).getReg() == AVR::SP && + "SP is expected as base pointer"); + + assert(STI.getFrameLowering()->hasReservedCallFrame(MF) && + "unexpected STDWSPQRr pseudo instruction"); + (void)STI; + + MI.setDesc(TII->get(AVR::STDWPtrQRr)); + MI.getOperand(0).setReg(AVR::R29R28); + + return true; +} + template <> bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; @@ -1378,6 +1363,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { unsigned OpShift, OpCarry; Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = 
MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); OpShift = AVR::ADDRdRr; OpCarry = AVR::ADCRdRr; @@ -1387,13 +1373,13 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // Shift part buildMI(MBB, MBBI, OpShift) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addReg(DstReg); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); // Add the carry bit auto MIB = buildMI(MBB, MBBI, OpCarry) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) + .addReg(DstReg, getKillRegState(DstIsKill)) .addReg(ZERO_REGISTER); // SREG is always implicitly killed @@ -1446,13 +1432,13 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // Low part buildMI(MBB, MBBI, OpLo) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg) + .addReg(DstLoReg, getKillRegState(DstIsKill)) .addReg(DstLoReg, getKillRegState(DstIsKill)); auto MIBHI = buildMI(MBB, MBBI, OpHi) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg) + .addReg(DstHiReg, getKillRegState(DstIsKill)) .addReg(DstHiReg, getKillRegState(DstIsKill)); if (ImpIsDead) @@ -1478,7 +1464,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // add hireg, hireg <==> lsl hireg auto MILSL = buildMI(MBB, MBBI, AVR::ADDRdRr) - .addReg(DstHiReg, RegState::Define, getDeadRegState(DstIsDead)) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstHiReg, getKillRegState(DstIsKill)) .addReg(DstHiReg, getKillRegState(DstIsKill)); @@ -1502,16 +1488,16 @@ bool AVRExpandPseudo::expandLSLW4Rd(Block &MBB, BlockIt MBBI) { // swap Rl buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)); + .addReg(DstHiReg, RegState::Kill); buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg, getKillRegState(DstIsKill)); + .addReg(DstLoReg, RegState::Kill); // andi Rh, 0xf0 auto MI0 = buildMI(MBB, MBBI, AVR::ANDIRdK) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)) + .addReg(DstHiReg, RegState::Kill) .addImm(0xf0); // SREG is implicitly dead. MI0->getOperand(3).setIsDead(); @@ -1520,7 +1506,7 @@ bool AVRExpandPseudo::expandLSLW4Rd(Block &MBB, BlockIt MBBI) { auto MI1 = buildMI(MBB, MBBI, AVR::EORRdRr) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)) + .addReg(DstHiReg, RegState::Kill) .addReg(DstLoReg); // SREG is implicitly dead. 
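expandLSLW4Rd, touched above, computes a 16-bit shift left by four without four single-bit shifts: two nibble SWAPs, ANDIs and EORs. A host-side model of the full sequence (including the trailing andi/eor pair that falls outside the hunk context), checked exhaustively against the plain shift:

#include <cassert>
#include <cstdint>

// Swap the two nibbles of a byte, as AVR's SWAP instruction does.
static uint8_t swapNibbles(uint8_t B) { return uint8_t((B << 4) | (B >> 4)); }

// Model of the LSLW4Rd expansion:
//   swap Rh; swap Rl; andi Rh, 0xf0; eor Rh, Rl; andi Rl, 0xf0; eor Rh, Rl
static uint16_t lslw4(uint16_t V) {
  uint8_t Lo = uint8_t(V), Hi = uint8_t(V >> 8);
  Hi = swapNibbles(Hi); // swap Rh
  Lo = swapNibbles(Lo); // swap Rl
  Hi &= 0xf0;           // andi Rh, 0xf0: keep Hi's old low nibble, moved up
  Hi ^= Lo;             // eor Rh, Rl: mix in Lo's old high nibble...
  Lo &= 0xf0;           // andi Rl, 0xf0: Lo's old low nibble, moved up
  Hi ^= Lo;             // eor Rh, Rl: ...and cancel the stray nibble again
  return uint16_t((unsigned(Hi) << 8) | Lo);
}

int main() {
  for (uint32_t V = 0; V <= 0xffff; ++V)
    assert(lslw4(uint16_t(V)) == uint16_t(V << 4));
  return 0;
}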
MI1->getOperand(3).setIsDead(); @@ -1591,7 +1577,7 @@ bool AVRExpandPseudo::expandLSLW12Rd(Block &MBB, BlockIt MBBI) { // swap Rh buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)); + .addReg(DstHiReg, RegState::Kill); // andi Rh, 0xf0 auto MI0 = @@ -1700,16 +1686,16 @@ bool AVRExpandPseudo::expandLSRW4Rd(Block &MBB, BlockIt MBBI) { // swap Rl buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)); + .addReg(DstHiReg, RegState::Kill); buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg, getKillRegState(DstIsKill)); + .addReg(DstLoReg, RegState::Kill); // andi Rl, 0xf auto MI0 = buildMI(MBB, MBBI, AVR::ANDIRdK) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg, getKillRegState(DstIsKill)) + .addReg(DstLoReg, RegState::Kill) .addImm(0xf); // SREG is implicitly dead. MI0->getOperand(3).setIsDead(); @@ -1718,7 +1704,7 @@ bool AVRExpandPseudo::expandLSRW4Rd(Block &MBB, BlockIt MBBI) { auto MI1 = buildMI(MBB, MBBI, AVR::EORRdRr) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg, getKillRegState(DstIsKill)) + .addReg(DstLoReg, RegState::Kill) .addReg(DstHiReg); // SREG is implicitly dead. MI1->getOperand(3).setIsDead(); @@ -1789,7 +1775,7 @@ bool AVRExpandPseudo::expandLSRW12Rd(Block &MBB, BlockIt MBBI) { // swap Rl buildMI(MBB, MBBI, AVR::SWAPRd) .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstLoReg, getKillRegState(DstIsKill)); + .addReg(DstLoReg, RegState::Kill); // andi Rl, 0xf auto MI0 = @@ -1897,6 +1883,53 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { return true; } +bool AVRExpandPseudo::expandASRW7Rd(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + bool ImpIsDead = MI.getOperand(3).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // lsl r24 + // mov r24,r25 + // rol r24 + // sbc r25,r25 + + // lsl r24 <=> add r24, r24 + buildMI(MBB, MBBI, AVR::ADDRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, RegState::Kill) + .addReg(DstLoReg, RegState::Kill); + + // mov r24, r25 + buildMI(MBB, MBBI, AVR::MOVRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg); + + // rol r24 <=> adc r24, r24 + buildMI(MBB, MBBI, AVR::ADCRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, getKillRegState(DstIsKill)) + .addReg(DstLoReg, getKillRegState(DstIsKill)); + + // sbc r25, r25 + auto MISBC = + buildMI(MBB, MBBI, AVR::SBCRdRr) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg, getKillRegState(DstIsKill)) + .addReg(DstHiReg, getKillRegState(DstIsKill)); + + if (ImpIsDead) + MISBC->getOperand(3).setIsDead(); + // SREG is always implicitly killed + MISBC->getOperand(4).setIsKill(); + + MI.eraseFromParent(); + return true; +} + bool AVRExpandPseudo::expandASRW8Rd(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; @@ -1913,9 +1946,9 @@ bool AVRExpandPseudo::expandASRW8Rd(Block &MBB, BlockIt MBBI) { // Move the sign bit to the C flag. 
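The new expandASRW7Rd above gets a 16-bit arithmetic shift right by seven out of just four instructions: lsl Rl, mov Rl,Rh, rol Rl, sbc Rh,Rh. A host-side model, verified against an ordinary arithmetic shift (assuming the usual sign-propagating >> on signed int):

#include <cassert>
#include <cstdint>

// Model of the ASRW7Rd expansion: lsl Rl; mov Rl, Rh; rol Rl; sbc Rh, Rh.
static uint16_t asrw7(uint16_t V) {
  uint8_t Lo = uint8_t(V), Hi = uint8_t(V >> 8);
  bool Carry = (Lo & 0x80) != 0;   // lsl Rl: carry takes Lo's top bit
  bool Sign = (Hi & 0x80) != 0;    // rol Rl's carry-out is the old sign bit
  Lo = uint8_t((Hi << 1) | Carry); // mov Rl, Rh then rol Rl
  Hi = Sign ? 0xff : 0x00;         // sbc Rh, Rh: 0 - 0 - carry sign-fills
  return uint16_t((unsigned(Hi) << 8) | Lo);
}

int main() {
  for (int V = -32768; V <= 32767; ++V)
    assert(int16_t(asrw7(uint16_t(V))) == int16_t(V >> 7));
  return 0;
}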
buildMI(MBB, MBBI, AVR::ADDRdRr) - .addReg(DstHiReg, RegState::Define, getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill) | getDeadRegState(DstIsDead)) - .addReg(DstHiReg, getKillRegState(DstIsKill)); + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg, RegState::Kill) + .addReg(DstHiReg, RegState::Kill); // Set upper byte to 0 or -1. auto MIBHI = @@ -1923,8 +1956,102 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstHiReg, getKillRegState(DstIsKill)) .addReg(DstHiReg, getKillRegState(DstIsKill)); + if (ImpIsDead) MIBHI->getOperand(3).setIsDead(); + // SREG is always implicitly killed + MIBHI->getOperand(4).setIsKill(); + + MI.eraseFromParent(); + return true; +} +bool AVRExpandPseudo::expandASRW14Rd(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + bool ImpIsDead = MI.getOperand(3).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // lsl r25 + // sbc r24, r24 + // lsl r25 + // mov r25, r24 + // rol r24 + + // lsl r25 <=> add r25, r25 + buildMI(MBB, MBBI, AVR::ADDRdRr) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg, RegState::Kill) + .addReg(DstHiReg, RegState::Kill); + + // sbc r24, r24 + buildMI(MBB, MBBI, AVR::SBCRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, RegState::Kill) + .addReg(DstLoReg, RegState::Kill); + + // lsl r25 <=> add r25, r25 + buildMI(MBB, MBBI, AVR::ADDRdRr) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg, RegState::Kill) + .addReg(DstHiReg, RegState::Kill); + + // mov r25, r24 + buildMI(MBB, MBBI, AVR::MOVRdRr) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg); + + // rol r24 <=> adc r24, r24 + auto MIROL = + buildMI(MBB, MBBI, AVR::ADCRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, getKillRegState(DstIsKill)) + .addReg(DstLoReg, getKillRegState(DstIsKill)); + + if (ImpIsDead) + MIROL->getOperand(3).setIsDead(); + // SREG is always implicitly killed + MIROL->getOperand(4).setIsKill(); + + MI.eraseFromParent(); + return true; +} + +bool AVRExpandPseudo::expandASRW15Rd(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool ImpIsDead = MI.getOperand(3).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // lsl r25 + // sbc r25, r25 + // mov r24, r25 + + // lsl r25 <=> add r25, r25 + buildMI(MBB, MBBI, AVR::ADDRdRr) + .addReg(DstHiReg, RegState::Define) + .addReg(DstHiReg, RegState::Kill) + .addReg(DstHiReg, RegState::Kill); + + // sbc r25, r25 + auto MISBC = + buildMI(MBB, MBBI, AVR::SBCRdRr) + .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg, RegState::Kill) + .addReg(DstHiReg, RegState::Kill); + if (ImpIsDead) + MISBC->getOperand(3).setIsDead(); + // SREG is always implicitly killed + MISBC->getOperand(4).setIsKill(); + + // mov r24, r25 + buildMI(MBB, MBBI, AVR::MOVRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstHiReg); MI.eraseFromParent(); return true; @@ -1935,8 +2062,14 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt
MBBI) { MachineInstr &MI = *MBBI; unsigned Imm = MI.getOperand(2).getImm(); switch (Imm) { + case 7: + return expandASRW7Rd(MBB, MBBI); case 8: return expandASRW8Rd(MBB, MBBI); + case 14: + return expandASRW14Rd(MBB, MBBI); + case 15: + return expandASRW15Rd(MBB, MBBI); default: llvm_unreachable("unimplemented asrwn"); return false; @@ -1956,14 +2089,14 @@ bool AVRExpandPseudo::expandLSLB7Rd(Block &MBB, BlockIt MBBI) { buildMI(MBB, MBBI, AVR::RORRd) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg, RegState::Kill) ->getOperand(3) .setIsUndef(true); buildMI(MBB, MBBI, AVR::EORRdRr) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); auto MIRRC = buildMI(MBB, MBBI, AVR::RORRd) @@ -2006,15 +2139,15 @@ bool AVRExpandPseudo::expandLSRB7Rd(Block &MBB, BlockIt MBBI) { buildMI(MBB, MBBI, AVR::ADCRdRr) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill) ->getOperand(4) .setIsUndef(true); buildMI(MBB, MBBI, AVR::EORRdRr) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); auto MIRRC = buildMI(MBB, MBBI, AVR::ADCRdRr) @@ -2064,13 +2197,13 @@ bool AVRExpandPseudo::expandASRB6Rd(Block &MBB, BlockIt MBBI) { buildMI(MBB, MBBI, AVR::ADDRdRr) // LSL Rd <==> ADD Rd, Rd .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); buildMI(MBB, MBBI, AVR::SBCRdRr) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); buildMI(MBB, MBBI, AVR::BLD) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -2095,8 +2228,8 @@ bool AVRExpandPseudo::expandASRB7Rd(Block &MBB, BlockIt MBBI) { buildMI(MBB, MBBI, AVR::ADDRdRr) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg, getKillRegState(DstIsKill)) - .addReg(DstReg, getKillRegState(DstIsKill)); + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, RegState::Kill); auto MIRRC = buildMI(MBB, MBBI, AVR::SBCRdRr) @@ -2152,26 +2285,22 @@ template <> bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { bool ImpIsDead = MI.getOperand(2).isDead(); TRI->splitReg(DstReg, DstLoReg, DstHiReg); - if (SrcReg != DstLoReg) { - auto MOV = - buildMI(MBB, MBBI, AVR::MOVRdRr) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg); - - if (SrcReg == DstHiReg) { - MOV->getOperand(1).setIsKill(); - } - } + if (SrcReg != DstLoReg) + buildMI(MBB, MBBI, AVR::MOVRdRr) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(SrcReg); if (SrcReg != DstHiReg) { - buildMI(MBB, MBBI, AVR::MOVRdRr) - .addReg(DstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)); + auto MOV = buildMI(MBB, MBBI, AVR::MOVRdRr) + .addReg(DstHiReg, RegState::Define) + .addReg(SrcReg); + if (SrcReg != 
DstLoReg && SrcIsKill) + MOV->getOperand(1).setIsKill(); } buildMI(MBB, MBBI, AVR::ADDRdRr) // LSL Rd <==> ADD Rd, Rr .addReg(DstHiReg, RegState::Define) - .addReg(DstHiReg) + .addReg(DstHiReg, RegState::Kill) .addReg(DstHiReg, RegState::Kill); auto SBC = @@ -2256,6 +2385,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { template <> bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { + const AVRSubtarget &STI = MBB.getParent()->getSubtarget(); MachineInstr &MI = *MBBI; Register SrcLoReg, SrcHiReg; Register SrcReg = MI.getOperand(1).getReg(); @@ -2265,7 +2395,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { buildMI(MBB, MBBI, AVR::INRdA) .addReg(AVR::R0, RegState::Define) - .addImm(SREG_ADDR) + .addImm(STI.getIORegSREG()) .setMIFlags(Flags); buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags); @@ -2276,7 +2406,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { .setMIFlags(Flags); buildMI(MBB, MBBI, AVR::OUTARr) - .addImm(SREG_ADDR) + .addImm(STI.getIORegSREG()) .addReg(AVR::R0, RegState::Kill) .setMIFlags(Flags); @@ -2330,22 +2460,14 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) { EXPAND(AVR::AtomicLoad16); EXPAND(AVR::AtomicStore8); EXPAND(AVR::AtomicStore16); - EXPAND(AVR::AtomicLoadAdd8); - EXPAND(AVR::AtomicLoadAdd16); - EXPAND(AVR::AtomicLoadSub8); - EXPAND(AVR::AtomicLoadSub16); - EXPAND(AVR::AtomicLoadAnd8); - EXPAND(AVR::AtomicLoadAnd16); - EXPAND(AVR::AtomicLoadOr8); - EXPAND(AVR::AtomicLoadOr16); - EXPAND(AVR::AtomicLoadXor8); - EXPAND(AVR::AtomicLoadXor16); EXPAND(AVR::AtomicFence); EXPAND(AVR::STSWKRr); EXPAND(AVR::STWPtrRr); EXPAND(AVR::STWPtrPiRr); EXPAND(AVR::STWPtrPdRr); EXPAND(AVR::STDWPtrQRr); + EXPAND(AVR::STDSPQRr); + EXPAND(AVR::STDWSPQRr); EXPAND(AVR::INWRdA); EXPAND(AVR::OUTWARr); EXPAND(AVR::PUSHWRr); diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index b3bc9ede205e..ec8b74e435ce 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -73,7 +73,7 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII.get(AVR::INRdA), AVR::R0) - .addImm(0x3f) + .addImm(STI.getIORegSREG()) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) .addReg(AVR::R0, RegState::Kill) @@ -144,7 +144,7 @@ static void restoreStatusRegister(MachineFunction &MF, MachineBasicBlock &MBB) { if (AFI->isInterruptOrSignalHandler()) { BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr)) - .addImm(0x3f) + .addImm(STI.getIORegSREG()) .addReg(AVR::R0, RegState::Kill); BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0); } @@ -201,8 +201,8 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF, // Restore the frame pointer by doing FP += . MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28) - .addReg(AVR::R29R28, RegState::Kill) - .addImm(FrameSize); + .addReg(AVR::R29R28, RegState::Kill) + .addImm(FrameSize); // The SREG implicit def is dead. MI->getOperand(3).setIsDead(); } @@ -298,11 +298,11 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( /// Replace pseudo store instructions that pass arguments through the stack with /// real instructions. 
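// For illustration (operand shapes assumed from the STD*SPQRr pseudo
// definitions): a pseudo such as "STDWSPQRr SP, 4, r25:r24" is rewritten in
// place to "STDWPtrQRr r31r30, 4, r25:r24", i.e. roughly "std Z+4 / std
// Z+5", which is valid because Z has just been loaded with a copy of SP via
// SPREAD.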
static void fixStackStores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const TargetInstrInfo &TII, Register FP) { + MachineBasicBlock::iterator StartMI, + const TargetInstrInfo &TII) { // Iterate through the BB until we hit a call instruction or we reach the end. for (MachineInstr &MI : - llvm::make_early_inc_range(llvm::make_range(MI, MBB.end()))) { + llvm::make_early_inc_range(llvm::make_range(StartMI, MBB.end()))) { if (MI.isCall()) break; @@ -313,7 +313,7 @@ static void fixStackStores(MachineBasicBlock &MBB, continue; assert(MI.getOperand(0).getReg() == AVR::SP && - "Invalid register, should be SP!"); + "SP is expected as base pointer"); // Replace this instruction with a regular store. Use Z as the base // pointer since it is guaranteed to contain a copy of SP. unsigned STOpc = (Opcode == AVR::STDWSPQRr) ? AVR::STDWPtrQRr : AVR::STDPtrQRr; MI.setDesc(TII.get(STOpc)); - MI.getOperand(0).setReg(FP); + MI.getOperand(0).setReg(AVR::R31R30); } } @@ -331,11 +331,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>(); const AVRInstrInfo &TII = *STI.getInstrInfo(); - // There is nothing to insert when the call frame memory is allocated during - // function entry. Delete the call frame pseudo and replace all pseudo stores - // with real store instructions. if (hasReservedCallFrame(MF)) { - fixStackStores(MBB, MI, TII, AVR::R29R28); return MBB.erase(MI); } @@ -343,57 +339,58 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( unsigned int Opcode = MI->getOpcode(); int Amount = TII.getFrameSize(*MI); - // ADJCALLSTACKUP and ADJCALLSTACKDOWN are converted to adiw/subi - // instructions to read and write the stack pointer in I/O space. - if (Amount != 0) { - assert(getStackAlign() == Align(1) && "Unsupported stack alignment"); - - if (Opcode == TII.getCallFrameSetupOpcode()) { - // Update the stack pointer. - // In many cases this can be done far more efficiently by pushing the - // relevant values directly to the stack. However, doing that correctly - // (in the right order, possibly skipping some empty space for undef - // values, etc) is tricky and thus left to be optimized in the future. - BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP); - - MachineInstr *New = - BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30) - .addReg(AVR::R31R30, RegState::Kill) - .addImm(Amount); - New->getOperand(3).setIsDead(); - - BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP).addReg(AVR::R31R30); - - // Make sure the remaining stack stores are converted to real store - // instructions. - fixStackStores(MBB, MI, TII, AVR::R31R30); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - - // Note that small stack changes could be implemented more efficiently - // with a few pop instructions instead of the 8-9 instructions now - // required. - - // Select the best opcode to adjust SP based on the offset size. - unsigned addOpcode; - if (isUInt<6>(Amount)) { - addOpcode = AVR::ADIWRdK; - } else { - addOpcode = AVR::SUBIWRdK; - Amount = -Amount; - } + if (Amount == 0) { + return MBB.erase(MI); + } + + assert(getStackAlign() == Align(1) && "Unsupported stack alignment"); + + if (Opcode == TII.getCallFrameSetupOpcode()) { + // Update the stack pointer. + // In many cases this can be done far more efficiently by pushing the + // relevant values directly to the stack.
However, doing that correctly + // (in the right order, possibly skipping some empty space for undef + // values, etc) is tricky and thus left to be optimized in the future. + BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP); + + MachineInstr *New = + BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30) + .addReg(AVR::R31R30, RegState::Kill) + .addImm(Amount); + New->getOperand(3).setIsDead(); - // Build the instruction sequence. - BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP); + BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP).addReg(AVR::R31R30); - MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(addOpcode), AVR::R31R30) - .addReg(AVR::R31R30, RegState::Kill) - .addImm(Amount); - New->getOperand(3).setIsDead(); + // Make sure the remaining stack stores are converted to real store + // instructions. + fixStackStores(MBB, MI, TII); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); - BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP) - .addReg(AVR::R31R30, RegState::Kill); + // Note that small stack changes could be implemented more efficiently + // with a few pop instructions instead of the 8-9 instructions now + // required. + + // Select the best opcode to adjust SP based on the offset size. + unsigned AddOpcode; + + if (isUInt<6>(Amount)) { + AddOpcode = AVR::ADIWRdK; + } else { + AddOpcode = AVR::SUBIWRdK; + Amount = -Amount; } + + // Build the instruction sequence. + BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP); + + MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(AddOpcode), AVR::R31R30) + .addReg(AVR::R31R30, RegState::Kill) + .addImm(Amount); + New->getOperand(3).setIsDead(); + + BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP) + .addReg(AVR::R31R30, RegState::Kill); } return MBB.erase(MI); @@ -420,7 +417,7 @@ struct AVRFrameAnalyzer : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override { const MachineFrameInfo &MFI = MF.getFrameInfo(); - AVRMachineFunctionInfo *FuncInfo = MF.getInfo(); + AVRMachineFunctionInfo *AFI = MF.getInfo(); // If there are no fixed frame indexes during this stage it means there // are allocas present in the function. @@ -431,7 +428,7 @@ struct AVRFrameAnalyzer : public MachineFunctionPass { for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { // Variable sized objects have size 0. if (MFI.getObjectSize(i)) { - FuncInfo->setHasAllocas(true); + AFI->setHasAllocas(true); break; } } @@ -460,7 +457,7 @@ struct AVRFrameAnalyzer : public MachineFunctionPass { } if (MFI.isFixedObjectIndex(MO.getIndex())) { - FuncInfo->setHasStackArgs(true); + AFI->setHasStackArgs(true); return false; } } diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index a58fedf6cd36..7a1e7b1535a7 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -13,6 +13,7 @@ #include "AVRISelLowering.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" @@ -269,8 +270,6 @@ EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, } SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { - //: TODO: this function has to be completely rewritten to produce optimal - // code, for now it's producing very long but correct code. 
unsigned Opc8; const SDNode *N = Op.getNode(); EVT VT = Op.getValueType(); @@ -371,6 +370,27 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { ShiftAmount = 0; } } else if (VT.getSizeInBits() == 16) { + if (Op.getOpcode() == ISD::SRA) + // Special optimization for int16 arithmetic right shift. + switch (ShiftAmount) { + case 15: + Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim, + DAG.getConstant(15, dl, VT)); + ShiftAmount = 0; + break; + case 14: + Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim, + DAG.getConstant(14, dl, VT)); + ShiftAmount = 0; + break; + case 7: + Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim, + DAG.getConstant(7, dl, VT)); + ShiftAmount = 0; + break; + default: + break; + } if (4 <= ShiftAmount && ShiftAmount < 8) switch (Op.getOpcode()) { case ISD::SHL: @@ -1023,17 +1043,24 @@ bool AVRTargetLowering::isOffsetFoldingLegal( /// Registers for calling conventions, ordered in reverse as required by ABI. /// Both arrays must be of the same length. -static const MCPhysReg RegList8[] = { +static const MCPhysReg RegList8AVR[] = { AVR::R25, AVR::R24, AVR::R23, AVR::R22, AVR::R21, AVR::R20, AVR::R19, AVR::R18, AVR::R17, AVR::R16, AVR::R15, AVR::R14, AVR::R13, AVR::R12, AVR::R11, AVR::R10, AVR::R9, AVR::R8}; -static const MCPhysReg RegList16[] = { +static const MCPhysReg RegList8Tiny[] = {AVR::R25, AVR::R24, AVR::R23, + AVR::R22, AVR::R21, AVR::R20}; +static const MCPhysReg RegList16AVR[] = { AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22, AVR::R22R21, AVR::R21R20, AVR::R20R19, AVR::R19R18, AVR::R18R17, AVR::R17R16, AVR::R16R15, AVR::R15R14, AVR::R14R13, AVR::R13R12, AVR::R12R11, AVR::R11R10, AVR::R10R9, AVR::R9R8}; +static const MCPhysReg RegList16Tiny[] = {AVR::R26R25, AVR::R25R24, + AVR::R24R23, AVR::R23R22, + AVR::R22R21, AVR::R21R20}; -static_assert(array_lengthof(RegList8) == array_lengthof(RegList16), +static_assert(array_lengthof(RegList8AVR) == array_lengthof(RegList16AVR), + "8-bit and 16-bit register arrays must be of equal length"); +static_assert(array_lengthof(RegList8Tiny) == array_lengthof(RegList16Tiny), "8-bit and 16-bit register arrays must be of equal length"); /// Analyze incoming and outgoing function arguments. We need custom C++ code @@ -1041,10 +1068,22 @@ static_assert(array_lengthof(RegList8) == array_lengthof(RegList16), /// In addition, all pieces of a certain argument have to be passed either /// using registers or the stack but never mixing both. template -static void -analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F, - const DataLayout *TD, const SmallVectorImpl &Args, - SmallVectorImpl &ArgLocs, CCState &CCInfo) { +static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI, + const Function *F, const DataLayout *TD, + const SmallVectorImpl &Args, + SmallVectorImpl &ArgLocs, + CCState &CCInfo, bool Tiny) { + // Choose the proper register list for argument passing according to the ABI. + ArrayRef RegList8; + ArrayRef RegList16; + if (Tiny) { + RegList8 = makeArrayRef(RegList8Tiny, array_lengthof(RegList8Tiny)); + RegList16 = makeArrayRef(RegList16Tiny, array_lengthof(RegList16Tiny)); + } else { + RegList8 = makeArrayRef(RegList8AVR, array_lengthof(RegList8AVR)); + RegList16 = makeArrayRef(RegList16AVR, array_lengthof(RegList16AVR)); + } + unsigned NumArgs = Args.size(); // This is the index of the last used register, in RegList*. // -1 means R26 (R26 is never actually used in CC). 
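// For illustration: with the reverse-ordered lists above, the first i32
// argument has TotalBytes = 4 and occupies list indices 3..0, i.e. bytes
// R22..R25, matching avr-gcc's convention of passing the first 4-byte
// argument in r22..r25. On avrtiny the list has only six entries
// (R25..R20), so a second i32 would need index 7 and is passed on the
// stack instead.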
@@ -1074,7 +1113,7 @@ analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F, unsigned RegIdx = RegLastIdx + TotalBytes; RegLastIdx = RegIdx; // If there are not enough registers, use the stack - if (RegIdx >= array_lengthof(RegList8)) { + if (RegIdx >= RegList8.size()) { UseStack = true; } for (; i != j; ++i) { @@ -1123,13 +1162,24 @@ getTotalArgumentsSizeInBytes(const SmallVectorImpl &Args) { /// one value, possibly an aggregate, and it is limited to 8 bytes. template static void analyzeReturnValues(const SmallVectorImpl &Args, - CCState &CCInfo) { + CCState &CCInfo, bool Tiny) { unsigned NumArgs = Args.size(); unsigned TotalBytes = getTotalArgumentsSizeInBytes(Args); // CanLowerReturn() guarantees this assertion. assert(TotalBytes <= 8 && "return values greater than 8 bytes cannot be lowered"); + // Choose the proper register list for argument passing according to the ABI. + ArrayRef RegList8; + ArrayRef RegList16; + if (Tiny) { + RegList8 = makeArrayRef(RegList8Tiny, array_lengthof(RegList8Tiny)); + RegList16 = makeArrayRef(RegList16Tiny, array_lengthof(RegList16Tiny)); + } else { + RegList8 = makeArrayRef(RegList8AVR, array_lengthof(RegList8AVR)); + RegList16 = makeArrayRef(RegList16AVR, array_lengthof(RegList16AVR)); + } + // GCC-ABI says that the size is rounded up to the next even number, // but actually once it is more than 4 it will always round up to 8. if (TotalBytes > 4) { @@ -1174,7 +1224,8 @@ SDValue AVRTargetLowering::LowerFormalArguments( if (isVarArg) { CCInfo.AnalyzeFormalArguments(Ins, ArgCC_AVR_Vararg); } else { - analyzeArguments(nullptr, &MF.getFunction(), &DL, Ins, ArgLocs, CCInfo); + analyzeArguments(nullptr, &MF.getFunction(), &DL, Ins, ArgLocs, CCInfo, + Subtarget.hasTinyEncoding()); } SDValue ArgValue; @@ -1285,8 +1336,8 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const Function *F = nullptr; if (const GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); - - F = cast(GV); + if (isa(GV)) + F = cast(GV); Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(DAG.getDataLayout())); } else if (const ExternalSymbolSDNode *ES = @@ -1299,7 +1350,8 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (isVarArg) { CCInfo.AnalyzeCallOperands(Outs, ArgCC_AVR_Vararg); } else { - analyzeArguments(&CLI, F, &DAG.getDataLayout(), Outs, ArgLocs, CCInfo); + analyzeArguments(&CLI, F, &DAG.getDataLayout(), Outs, ArgLocs, CCInfo, + Subtarget.hasTinyEncoding()); } // Get a count of how many bytes are to be pushed on the stack. @@ -1444,7 +1496,7 @@ SDValue AVRTargetLowering::LowerCallResult( if (CallConv == CallingConv::AVR_BUILTIN) { CCInfo.AnalyzeCallResult(Ins, RetCC_AVR_BUILTIN); } else { - analyzeReturnValues(Ins, CCInfo); + analyzeReturnValues(Ins, CCInfo, Subtarget.hasTinyEncoding()); } // Copy all of the result registers out of their specified physreg. @@ -1495,7 +1547,7 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (CallConv == CallingConv::AVR_BUILTIN) { CCInfo.AnalyzeReturn(Outs, RetCC_AVR_BUILTIN); } else { - analyzeReturnValues(Outs, CCInfo); + analyzeReturnValues(Outs, CCInfo, Subtarget.hasTinyEncoding()); } SDValue Flag; @@ -1707,6 +1759,60 @@ AVRTargetLowering::insertCopyR1(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +// Lower atomicrmw operation to disable interrupts, do operation, and restore +// interrupts. This works because all AVR microcontrollers are single core. 
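// For illustration, a source construct that reaches this hook (assumed
// front-end behavior):
//   _Atomic unsigned char counter;
//   void bump(void) { counter += 5; }   // selected as AtomicLoadAdd8
// The emitted wrapper is interrupt-safe: SREG (which holds the global
// interrupt enable bit) is saved into r0, "cli" clears it, and writing r0
// back to SREG re-enables interrupts only if they were enabled on entry.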
+MachineBasicBlock *AVRTargetLowering::insertAtomicArithmeticOp( + MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, int Width) const { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + MachineBasicBlock::iterator I(MI); + const Register SCRATCH_REGISTER = AVR::R0; + DebugLoc dl = MI.getDebugLoc(); + + // Example instruction sequence, for an atomic 8-bit add: + // ldi r25, 5 + // in r0, SREG + // cli + // ld r24, X + // add r25, r24 + // st X, r25 + // out SREG, r0 + + const TargetRegisterClass *RC = + (Width == 8) ? &AVR::GPR8RegClass : &AVR::DREGSRegClass; + unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr; + unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr; + + // Disable interrupts. + BuildMI(*BB, I, dl, TII.get(AVR::INRdA), SCRATCH_REGISTER) + .addImm(Subtarget.getIORegSREG()); + BuildMI(*BB, I, dl, TII.get(AVR::BCLRs)).addImm(7); + + // Load the original value. + BuildMI(*BB, I, dl, TII.get(LoadOpcode), MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); + + // Do the arithmetic operation. + Register Result = MRI.createVirtualRegister(RC); + BuildMI(*BB, I, dl, TII.get(Opcode), Result) + .addReg(MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); + + // Store the result. + BuildMI(*BB, I, dl, TII.get(StoreOpcode)) + .add(MI.getOperand(1)) + .addReg(Result); + + // Restore interrupts. + BuildMI(*BB, I, dl, TII.get(AVR::OUTARr)) + .addImm(Subtarget.getIORegSREG()) + .addReg(SCRATCH_REGISTER); + + // Remove the pseudo instruction. + MI.eraseFromParent(); + return BB; +} + MachineBasicBlock * AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -1731,6 +1837,26 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return insertMul(MI, MBB); case AVR::CopyR1: return insertCopyR1(MI, MBB); + case AVR::AtomicLoadAdd8: + return insertAtomicArithmeticOp(MI, MBB, AVR::ADDRdRr, 8); + case AVR::AtomicLoadAdd16: + return insertAtomicArithmeticOp(MI, MBB, AVR::ADDWRdRr, 16); + case AVR::AtomicLoadSub8: + return insertAtomicArithmeticOp(MI, MBB, AVR::SUBRdRr, 8); + case AVR::AtomicLoadSub16: + return insertAtomicArithmeticOp(MI, MBB, AVR::SUBWRdRr, 16); + case AVR::AtomicLoadAnd8: + return insertAtomicArithmeticOp(MI, MBB, AVR::ANDRdRr, 8); + case AVR::AtomicLoadAnd16: + return insertAtomicArithmeticOp(MI, MBB, AVR::ANDWRdRr, 16); + case AVR::AtomicLoadOr8: + return insertAtomicArithmeticOp(MI, MBB, AVR::ORRdRr, 8); + case AVR::AtomicLoadOr16: + return insertAtomicArithmeticOp(MI, MBB, AVR::ORWRdRr, 16); + case AVR::AtomicLoadXor8: + return insertAtomicArithmeticOp(MI, MBB, AVR::EORRdRr, 8); + case AVR::AtomicLoadXor16: + return insertAtomicArithmeticOp(MI, MBB, AVR::EORWRdRr, 16); } assert((Opc == AVR::Select16 || Opc == AVR::Select8) && diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h index 116417b61566..c5c937c983ed 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.h +++ b/llvm/lib/Target/AVR/AVRISelLowering.h @@ -189,6 +189,9 @@ private: MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *insertCopyR1(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *insertAtomicArithmeticOp(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned Opcode, int Width) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td index 2bcbcdfbf925..83c32c80dfb9 100644 --- 
a/llvm/lib/Target/AVR/AVRInstrFormats.td +++ b/llvm/lib/Target/AVR/AVRInstrFormats.td @@ -179,7 +179,8 @@ class FSTDLDD pattern> // r = src/dst register // // Note that the bit labelled 'i' above does not follow a simple pattern, -// so there exists a post encoder method to set it manually. +// so there exists a post encoder method to set it manually. Also a specified +// decoder method is needed. //===---------------------------------------------------------------------===// class FSTLD mode, dag outs, dag ins, string asmstr, list pattern> : AVRInst16 { @@ -200,6 +201,7 @@ class FSTLD mode, dag outs, dag ins, string asmstr, let Inst{3 - 2} = ptrreg{1 - 0}; let Inst{1 - 0} = mode{1 - 0}; + let DecoderMethod = "decodeLoadStore"; let PostEncoderMethod = "loadStorePostEncoder"; } diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ac52c47f93d5..510000f231fa 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -46,8 +46,9 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const AVRRegisterInfo &TRI = *STI.getRegisterInfo(); unsigned Opc; - // Not all AVR devices support the 16-bit `MOVW` instruction. if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) { + // If our AVR has `movw`, let's emit that; otherwise let's emit two separate + // `mov`s. if (STI.hasMOVW() && AVR::DREGSMOVWRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -57,11 +58,17 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, TRI.splitReg(DestReg, DestLo, DestHi); TRI.splitReg(SrcReg, SrcLo, SrcHi); - // Copy each individual register with the `MOV` instruction. - BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestLo) - .addReg(SrcLo, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestHi) - .addReg(SrcHi, getKillRegState(KillSrc)); + if (DestLo == SrcHi) { + BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestHi) + .addReg(SrcHi, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestLo) + .addReg(SrcLo, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestLo) + .addReg(SrcLo, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestHi) + .addReg(SrcHi, getKillRegState(KillSrc)); + } } } else { if (AVR::GPR8RegClass.contains(DestReg, SrcReg)) { @@ -299,9 +306,7 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a JMP, delete them. 
- while (std::next(I) != MBB.end()) { - std::next(I)->eraseFromParent(); - } + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 2b96dc0b833a..f20ba5edf208 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -177,12 +177,16 @@ def memri : Operand { let PrintMethod = "printMemri"; let EncoderMethod = "encodeMemri"; + let DecoderMethod = "decodeMemri"; let ParserMatchClass = MemriAsmOperand; } // Address operand for `SP+imm` used by STD{W}SPQRr -def memspi : Operand { let MIOperandInfo = (ops GPRSP, i16imm); } +def memspi : Operand { + let MIOperandInfo = (ops GPRSP, i16imm); + let PrintMethod = "printMemspi"; +} def relbrtarget_7 : Operand { let PrintMethod = "printPCRelImm"; @@ -194,6 +198,11 @@ def brtarget_13 : Operand { let EncoderMethod = "encodeRelCondBrTarget"; } +def rcalltarget_13 : Operand { + let PrintMethod = "printPCRelImm"; + let EncoderMethod = "encodeRelCondBrTarget"; +} + // The target of a 22 or 16-bit call/jmp instruction. def call_target : Operand { let EncoderMethod = "encodeCallTarget"; @@ -965,10 +974,8 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in { let isCall = 1 in { // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. - let Uses = [SP] in def RCALLk : FBRk<1, (outs), - (ins brtarget_13 - : $target), - "rcall\t$target", []>; + let Uses = [SP] in def RCALLk : FBRk<1, (outs), (ins rcalltarget_13:$k), + "rcall\t$k", [(AVRcall imm:$k)]>; // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. @@ -985,13 +992,10 @@ let isCall = 1 in { // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. // - //: TODO: the imm field can be either 16 or 22 bits in devices with more + // TODO: the imm field can be either 16 or 22 bits in devices with more // than 64k of ROM, fix it once we support the largest devices. - let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), - (ins call_target - : $k), - "call\t$k", [(AVRcall imm - : $k)]>, + let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), (ins call_target:$k), + "call\t$k", [(AVRcall imm:$k)]>, Requires<[HasJMPCALL]>; } @@ -1446,27 +1450,14 @@ class AtomicStore : $rd, DRC : $rr)]>; -let Constraints = - "@earlyclobber $rd" in class AtomicLoadOp - : Pseudo<(outs DRC - : $rd), - (ins PTRRC - : $rr, DRC - : $operand), - "atomic_op", [(set DRC - : $rd, (Op i16 - : $rr, DRC - : $operand))]>; - -// FIXME: I think 16-bit atomic binary ops need to mark -// r0 as clobbered. +class AtomicLoadOp + : Pseudo<(outs DRC:$rd), + (ins PTRRC:$rr, DRC:$operand), + "atomic_op", [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>; // Atomic instructions // =================== // -// These are all expanded by AVRExpandPseudoInsts -// // 8-bit operations can use any pointer register because // they are expanded directly into an LD/ST instruction. 
// @@ -1482,16 +1473,18 @@ def AtomicStore16 : AtomicStore; class AtomicLoadOp8 : AtomicLoadOp; class AtomicLoadOp16 : AtomicLoadOp; -def AtomicLoadAdd8 : AtomicLoadOp8; -def AtomicLoadAdd16 : AtomicLoadOp16; -def AtomicLoadSub8 : AtomicLoadOp8; -def AtomicLoadSub16 : AtomicLoadOp16; -def AtomicLoadAnd8 : AtomicLoadOp8; -def AtomicLoadAnd16 : AtomicLoadOp16; -def AtomicLoadOr8 : AtomicLoadOp8; -def AtomicLoadOr16 : AtomicLoadOp16; -def AtomicLoadXor8 : AtomicLoadOp8; -def AtomicLoadXor16 : AtomicLoadOp16; +let usesCustomInserter=1 in { + def AtomicLoadAdd8 : AtomicLoadOp8; + def AtomicLoadAdd16 : AtomicLoadOp16; + def AtomicLoadSub8 : AtomicLoadOp8; + def AtomicLoadSub16 : AtomicLoadOp16; + def AtomicLoadAnd8 : AtomicLoadOp8; + def AtomicLoadAnd16 : AtomicLoadOp16; + def AtomicLoadOr8 : AtomicLoadOp8; + def AtomicLoadOr16 : AtomicLoadOp16; + def AtomicLoadXor8 : AtomicLoadOp8; + def AtomicLoadXor16 : AtomicLoadOp16; +} def AtomicFence : Pseudo<(outs), (ins), "atomic_fence", [(atomic_fence timm, timm)]>; @@ -1954,7 +1947,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src)), (implicit SREG)]>; - def ASRWNRd : Pseudo<(outs DLDREGS + def ASRWNRd : Pseudo<(outs DREGS : $rd), (ins DREGS : $src, imm16 @@ -2122,15 +2115,17 @@ def ROL : InstAlias<"rol\t$rd", (ADCRdRr GPR8 : $rd, GPR8 : $rd)>; // Sets all bits in a register. def : InstAlias<"ser\t$rd", (LDIRdK LD8 : $rd, 0xff), 0>; -let Defs = [SREG] in def BSETs : FS<0, (outs), - (ins i8imm - : $s), - "bset\t$s", []>; +let hasSideEffects=1 in { + let Defs = [SREG] in def BSETs : FS<0, + (outs), + (ins i8imm:$s), + "bset\t$s", []>; -let Defs = [SREG] in def BCLRs : FS<1, (outs), - (ins i8imm - : $s), - "bclr\t$s", []>; + let Defs = [SREG] in def BCLRs : FS<1, + (outs), + (ins i8imm:$s), + "bclr\t$s", []>; +} // Set/clear aliases for the carry (C) status flag (bit 0). def : InstAlias<"sec", (BSETs 0)>; @@ -2457,8 +2452,12 @@ def : Pat<(adde i8 : $src2))>; // Calls. -def : Pat<(AVRcall(i16 tglobaladdr : $dst)), (CALLk tglobaladdr : $dst)>; -def : Pat<(AVRcall(i16 texternalsym : $dst)), (CALLk texternalsym : $dst)>; +let Predicates = [HasJMPCALL] in { + def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (CALLk tglobaladdr:$dst)>; + def : Pat<(AVRcall(i16 texternalsym:$dst)), (CALLk texternalsym:$dst)>; +} +def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (RCALLk tglobaladdr:$dst)>; +def : Pat<(AVRcall(i16 texternalsym:$dst)), (RCALLk texternalsym:$dst)>; // `anyext` def : Pat<(i16(anyext i8 diff --git a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h index 8b1c247eb6a7..da4c48559d9e 100644 --- a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h +++ b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h @@ -61,6 +61,13 @@ public: MF.getFunction().hasFnAttribute("signal"); } + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override { + return DestMF.cloneInfo(*this); + } + bool getHasSpills() const { return HasSpills; } void setHasSpills(bool B) { HasSpills = B; } diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp index 5dd7f5c55695..87e6558c12c2 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp @@ -36,15 +36,20 @@ AVRRegisterInfo::AVRRegisterInfo() : AVRGenRegisterInfo(0) {} const uint16_t * AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const AVRMachineFunctionInfo *AFI = MF->getInfo(); - - return AFI->isInterruptOrSignalHandler() ? 
CSR_Interrupts_SaveList - : CSR_Normal_SaveList; + const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>(); + if (STI.hasTinyEncoding()) + return AFI->isInterruptOrSignalHandler() ? CSR_InterruptsTiny_SaveList + : CSR_NormalTiny_SaveList; + else + return AFI->isInterruptOrSignalHandler() ? CSR_Interrupts_SaveList + : CSR_Normal_SaveList; } const uint32_t * AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { - return CSR_Normal_RegMask; + const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>(); + return STI.hasTinyEncoding() ? CSR_NormalTiny_RegMask : CSR_Normal_RegMask; } BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -52,15 +57,26 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve the intermediate result registers r0 and r1 // The result of instructions like 'mul' is always stored here. + // R0/R1/R1R0 are always reserved on both avr and avrtiny. Reserved.set(AVR::R0); Reserved.set(AVR::R1); Reserved.set(AVR::R1R0); - // Reserve the stack pointer. + // Reserve the stack pointer. Reserved.set(AVR::SPL); Reserved.set(AVR::SPH); Reserved.set(AVR::SP); + // Reserve R2~R17 only on avrtiny. + if (MF.getSubtarget<AVRSubtarget>().hasTinyEncoding()) { + // Reserve 8-bit registers R2~R15, Rtmp(R16) and Zero(R17). + for (unsigned Reg = AVR::R2; Reg <= AVR::R17; Reg++) + Reserved.set(Reg); + // Reserve 16-bit registers R3R2~R18R17. + for (unsigned Reg = AVR::R3R2; Reg <= AVR::R18R17; Reg++) + Reserved.set(Reg); + } + // We tentatively reserve the frame pointer register r29:r28 because the // function may require one, but we cannot tell until register allocation // is complete, which can be too late. @@ -137,6 +153,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering(); + const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); int Offset = MFI.getObjectOffset(FrameIndex); @@ -151,7 +168,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.getOpcode() == AVR::FRMIDX) { MI.setDesc(TII.get(AVR::MOVWRdRr)); MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false); - MI.RemoveOperand(2); + MI.removeOperand(2); assert(Offset > 0 && "Invalid offset"); @@ -219,7 +236,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // a compare and branch, invalidating the contents of SREG set by the // compare instruction because of the add/sub pairs. Conservatively save and // restore SREG before and after each add/sub pair. - BuildMI(MBB, II, dl, TII.get(AVR::INRdA), AVR::R0).addImm(0x3f); + BuildMI(MBB, II, dl, TII.get(AVR::INRdA), AVR::R0) + .addImm(STI.getIORegSREG()); MachineInstr *New = BuildMI(MBB, II, dl, TII.get(AddOpc), AVR::R29R28) .addReg(AVR::R29R28, RegState::Kill) @@ -228,7 +246,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Restore SREG.
BuildMI(MBB, std::next(II), dl, TII.get(AVR::OUTARr)) - .addImm(0x3f) + .addImm(STI.getIORegSREG()) .addReg(AVR::R0, RegState::Kill); // No need to set SREG as dead here otherwise if the next instruction is a diff --git a/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp b/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp deleted file mode 100644 index 76f29eb9f369..000000000000 --- a/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===-- AVRRelaxMemOperations.cpp - Relax out of range loads/stores -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass which relaxes out of range memory operations into -// equivalent operations which handle bigger addresses. -// -//===----------------------------------------------------------------------===// - -#include "AVR.h" -#include "AVRInstrInfo.h" -#include "AVRTargetMachine.h" -#include "MCTargetDesc/AVRMCTargetDesc.h" - -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" - -using namespace llvm; - -#define AVR_RELAX_MEM_OPS_NAME "AVR memory operation relaxation pass" - -namespace { - -class AVRRelaxMem : public MachineFunctionPass { -public: - static char ID; - - AVRRelaxMem() : MachineFunctionPass(ID) { - initializeAVRRelaxMemPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return AVR_RELAX_MEM_OPS_NAME; } - -private: - typedef MachineBasicBlock Block; - typedef Block::iterator BlockIt; - - const TargetInstrInfo *TII; - - template bool relax(Block &MBB, BlockIt MBBI); - - bool runOnBasicBlock(Block &MBB); - bool runOnInstruction(Block &MBB, BlockIt MBBI); - - MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode) { - return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode)); - } -}; - -char AVRRelaxMem::ID = 0; - -bool AVRRelaxMem::runOnMachineFunction(MachineFunction &MF) { - bool Modified = false; - - const AVRSubtarget &STI = MF.getSubtarget(); - TII = STI.getInstrInfo(); - - for (Block &MBB : MF) { - bool BlockModified = runOnBasicBlock(MBB); - Modified |= BlockModified; - } - - return Modified; -} - -bool AVRRelaxMem::runOnBasicBlock(Block &MBB) { - bool Modified = false; - - BlockIt MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - BlockIt NMBBI = std::next(MBBI); - Modified |= runOnInstruction(MBB, MBBI); - MBBI = NMBBI; - } - - return Modified; -} - -template <> bool AVRRelaxMem::relax(Block &MBB, BlockIt MBBI) { - MachineInstr &MI = *MBBI; - - MachineOperand &Ptr = MI.getOperand(0); - MachineOperand &Src = MI.getOperand(2); - int64_t Imm = MI.getOperand(1).getImm(); - - // We can definitely optimise this better. - if (Imm > 63) { - // Push the previous state of the pointer register. - // This instruction must preserve the value. - buildMI(MBB, MBBI, AVR::PUSHWRr).addReg(Ptr.getReg()); - - // Add the immediate to the pointer register. - buildMI(MBB, MBBI, AVR::SBCIWRdK) - .addReg(Ptr.getReg(), RegState::Define) - .addReg(Ptr.getReg()) - .addImm(-Imm); - - // Store the value in the source register to the address - // pointed to by the pointer register. 
- buildMI(MBB, MBBI, AVR::STWPtrRr) - .addReg(Ptr.getReg()) - .addReg(Src.getReg(), getKillRegState(Src.isKill())); - - // Pop the original state of the pointer register. - buildMI(MBB, MBBI, AVR::POPWRd) - .addDef(Ptr.getReg(), getKillRegState(Ptr.isKill())); - - MI.removeFromParent(); - } - - return false; -} - -bool AVRRelaxMem::runOnInstruction(Block &MBB, BlockIt MBBI) { - MachineInstr &MI = *MBBI; - int Opcode = MBBI->getOpcode(); - -#define RELAX(Op) \ - case Op: \ - return relax(MBB, MI) - - switch (Opcode) { RELAX(AVR::STDWPtrQRr); } -#undef RELAX - return false; -} - -} // end of anonymous namespace - -INITIALIZE_PASS(AVRRelaxMem, "avr-relax-mem", AVR_RELAX_MEM_OPS_NAME, false, - false) - -namespace llvm { - -FunctionPass *createAVRRelaxMemPass() { return new AVRRelaxMem(); } - -} // end of namespace llvm diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h index f8ca191b1868..2325193bac0a 100644 --- a/llvm/lib/Target/AVR/AVRSubtarget.h +++ b/llvm/lib/Target/AVR/AVRSubtarget.h @@ -91,8 +91,16 @@ public: return ELFArch; } - /// Get I/O register address. - int getIORegRAMPZ(void) const { return 0x3b; } + /// Get I/O register addresses. + int getIORegRAMPZ(void) const { return hasELPM() ? 0x3b : -1; } + int getIORegEIND(void) const { return hasEIJMPCALL() ? 0x3c : -1; } + int getIORegSPL(void) const { return 0x3d; } + int getIORegSPH(void) const { return hasSmallStack() ? -1 : 0x3e; } + int getIORegSREG(void) const { return 0x3f; } + + /// Get GPR aliases. + int getRegTmpIndex(void) const { return hasTinyEncoding() ? 16 : 0; } + int getRegZeroIndex(void) const { return hasTinyEncoding() ? 17 : 1; } private: /// The ELF e_flags architecture. diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index 22b9ba3ece07..b9d77e0d1a51 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -38,7 +38,7 @@ static StringRef getCPU(StringRef CPU) { } static Reloc::Model getEffectiveRelocModel(Optional RM) { - return RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, @@ -92,7 +92,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() { auto &PR = *PassRegistry::getPassRegistry(); initializeAVRExpandPseudoPass(PR); - initializeAVRRelaxMemPass(PR); initializeAVRShiftExpandPass(PR); } @@ -118,7 +117,6 @@ bool AVRPassConfig::addInstSelector() { } void AVRPassConfig::addPreSched2() { - addPass(createAVRRelaxMemPass()); addPass(createAVRExpandPseudoPass()); } diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index f19e7840eb31..9e1c7b781f0f 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -43,6 +43,10 @@ class AVRAsmParser : public MCTargetAsmParser { const MCRegisterInfo *MRI; const std::string GENERATE_STUBS = "gs"; + enum AVRMatchResultTy { + Match_InvalidRegisterOnTiny = FIRST_TARGET_MATCH_RESULT_TY + 1, + }; + #define GET_ASSEMBLER_HEADER #include "AVRGenAsmMatcher.inc" @@ -332,6 +336,8 @@ bool AVRAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, return invalidOperand(Loc, Operands, ErrorInfo); case Match_MnemonicFail: return Error(Loc, "invalid instruction"); + case Match_InvalidRegisterOnTiny: + return Error(Loc, "invalid register on avrtiny"); default: return true; } @@ -399,6 +405,11 @@ bool 
AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) { if (RegNo == AVR::NoRegister) return true; + // Reject R0~R15 on avrtiny. + if (AVR::R0 <= RegNo && RegNo <= AVR::R15 && + STI.hasFeature(AVR::FeatureTinyEncoding)) + return Error(Parser.getTok().getLoc(), "invalid register on avrtiny"); + AsmToken const &T = Parser.getTok(); Operands.push_back(AVROperand::CreateReg(RegNo, T.getLoc(), T.getEndLoc())); Parser.Lex(); // Eat register token. @@ -726,6 +737,12 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, if (Op.isImm()) { if (MCConstantExpr const *Const = dyn_cast(Op.getImm())) { int64_t RegNum = Const->getValue(); + + // Reject R0~R15 on avrtiny. + if (0 <= RegNum && RegNum <= 15 && + STI.hasFeature(AVR::FeatureTinyEncoding)) + return Match_InvalidRegisterOnTiny; + std::ostringstream RegName; RegName << "r" << RegNum; RegNum = MatchRegisterName(RegName.str()); diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 9dcd370b9f1e..ee0ae08e192f 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -18,8 +18,8 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" @@ -36,7 +36,7 @@ class AVRDisassembler : public MCDisassembler { public: AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~AVRDisassembler() {} + virtual ~AVRDisassembler() = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, @@ -66,7 +66,7 @@ static const uint16_t GPRDecoderTable[] = { static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -77,7 +77,7 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -86,48 +86,51 @@ static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodePTRREGSRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - // Note: this function must be defined but does not seem to be called. 
- assert(false && "unimplemented: PTRREGS register class"); - return MCDisassembler::Success; -} - static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); + +static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); + +static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder); #include "AVRGenDisassemblerTables.inc" static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned addr = 0; addr |= fieldFromInstruction(Insn, 0, 4); addr |= fieldFromInstruction(Insn, 9, 2) << 4; @@ -140,7 +143,7 @@ static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned addr = 0; addr |= fieldFromInstruction(Insn, 0, 4); addr |= fieldFromInstruction(Insn, 9, 2) << 4; @@ -153,7 +156,7 @@ static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned addr = fieldFromInstruction(Insn, 3, 5); unsigned b = fieldFromInstruction(Insn, 0, 3); Inst.addOperand(MCOperand::createImm(addr)); @@ -162,7 +165,8 @@ static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // Call targets need to be shifted left by one so this needs a custom // decoder. 
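// For illustration: AVR call/jmp targets are word addresses into program
// memory, so a decoded field value of 0x100 denotes byte address 0x200
// after the shift below.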
Inst.addOperand(MCOperand::createImm(Field << 1)); return MCDisassembler::Success; } static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned d = fieldFromInstruction(Insn, 4, 5); if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail) @@ -179,7 +183,7 @@ static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(AVR::R31R30)); @@ -187,7 +191,8 @@ static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned d = fieldFromInstruction(Insn, 4, 3) + 16; unsigned r = fieldFromInstruction(Insn, 0, 3) + 16; if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == @@ -200,7 +205,8 @@ static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn, } static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned r = fieldFromInstruction(Insn, 4, 4) * 2; unsigned d = fieldFromInstruction(Insn, 0, 4) * 2; if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == @@ -213,7 +219,7 @@ static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn, } static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25 unsigned k = 0; k |= fieldFromInstruction(Insn, 0, 4); @@ -229,7 +235,8 @@ static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address, } static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16; unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16; if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) == @@ -241,6 +248,128 @@ static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } +static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { + // As in the EncoderMethod `AVRMCCodeEmitter::encodeMemri`, the memory + // address is encoded in 7 bits, in which bits 0-5 are the immediate offset + // and bit 6 is the pointer register bit (Z=0, Y=1). + if (Insn > 127) + return MCDisassembler::Fail; + + // Append the base register operand. + Inst.addOperand( + MCOperand::createReg((Insn & 0x40) ? AVR::R29R28 : AVR::R31R30)); + // Append the immediate offset operand. + Inst.addOperand(MCOperand::createImm(Insn & 0x3f)); + + return MCDisassembler::Success; +} + +static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + // Get the register that will be loaded or stored. + unsigned RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; + + // Decode LDD/STD with offset less than 8. + if ((Insn & 0xf000) == 0x8000) { + unsigned RegBase = (Insn & 0x8) ?
AVR::R29R28 : AVR::R31R30; + unsigned Offset = Insn & 7; // We need not consider offset > 7. + if ((Insn & 0x200) == 0) { // Decode LDD. + Inst.setOpcode(AVR::LDDRdPtrQ); + Inst.addOperand(MCOperand::createReg(RegVal)); + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createImm(Offset)); + } else { // Decode STD. + Inst.setOpcode(AVR::STDPtrQRr); + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createImm(Offset)); + Inst.addOperand(MCOperand::createReg(RegVal)); + } + return MCDisassembler::Success; + } + + // Decode the following 14 instructions. Bit 9 indicates load(0) or store(1), + // bits 8~4 indicate the value register, bits 3-2 indicate the base address + // register (11-X, 10-Y, 00-Z), bits 1~0 indicate the mode (00-basic, + // 01-postinc, 10-predec). + // ST X, Rr : 1001 001r rrrr 1100 + // ST X+, Rr : 1001 001r rrrr 1101 + // ST -X, Rr : 1001 001r rrrr 1110 + // ST Y+, Rr : 1001 001r rrrr 1001 + // ST -Y, Rr : 1001 001r rrrr 1010 + // ST Z+, Rr : 1001 001r rrrr 0001 + // ST -Z, Rr : 1001 001r rrrr 0010 + // LD Rd, X : 1001 000d dddd 1100 + // LD Rd, X+ : 1001 000d dddd 1101 + // LD Rd, -X : 1001 000d dddd 1110 + // LD Rd, Y+ : 1001 000d dddd 1001 + // LD Rd, -Y : 1001 000d dddd 1010 + // LD Rd, Z+ : 1001 000d dddd 0001 + // LD Rd, -Z : 1001 000d dddd 0010 + if ((Insn & 0xfc00) != 0x9000 || (Insn & 0xf) == 0) + return MCDisassembler::Fail; + + // Get the base address register. + unsigned RegBase; + switch (Insn & 0xc) { + case 0xc: + RegBase = AVR::R27R26; + break; + case 0x8: + RegBase = AVR::R29R28; + break; + case 0x0: + RegBase = AVR::R31R30; + break; + default: + return MCDisassembler::Fail; + } + + // Set the opcode. + switch (Insn & 0x203) { + case 0x200: + Inst.setOpcode(AVR::STPtrRr); + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createReg(RegVal)); + return MCDisassembler::Success; + case 0x201: + Inst.setOpcode(AVR::STPtrPiRr); + break; + case 0x202: + Inst.setOpcode(AVR::STPtrPdRr); + break; + case 0: + Inst.setOpcode(AVR::LDRdPtr); + Inst.addOperand(MCOperand::createReg(RegVal)); + Inst.addOperand(MCOperand::createReg(RegBase)); + return MCDisassembler::Success; + case 1: + Inst.setOpcode(AVR::LDRdPtrPi); + break; + case 2: + Inst.setOpcode(AVR::LDRdPtrPd); + break; + default: + return MCDisassembler::Fail; + } + + // Build postinc/predec machine instructions. + if ((Insn & 0x200) == 0) { // This is a load instruction. + Inst.addOperand(MCOperand::createReg(RegVal)); + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createReg(RegBase)); + } else { // This is a store instruction. + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createReg(RegBase)); + Inst.addOperand(MCOperand::createReg(RegVal)); + // STPtrPiRr and STPtrPdRr have an extra immediate operand. + Inst.addOperand(MCOperand::createImm(1)); + } + + return MCDisassembler::Success; +} + static DecodeStatus readInstruction16(ArrayRef Bytes, uint64_t Address, uint64_t &Size, uint32_t &Insn) { if (Bytes.size() < 2) { @@ -299,7 +428,12 @@ DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, // Try to auto-decode a 16-bit instruction. Result = decodeInstruction(getDecoderTable(Size), Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + // Try to decode to a load/store instruction. ST/LD need a specified + // DecoderMethod, as they already have a specified PostEncoderMethod. 
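// Worked example for decodeLoadStore (illustrative): "st X+, r24" encodes
// as 0x938d = 1001 0011 1000 1101. (Insn & 0xfc00) == 0x9000 admits it,
// (Insn >> 4) & 0x1f = 24 selects R24, (Insn & 0xc) == 0xc selects the X
// pair (R27R26), and (Insn & 0x203) == 0x201 picks STPtrPiRr, the
// post-increment store form.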
+ Result = decodeLoadStore(Instr, Insn, Address, this); if (Result != MCDisassembler::Fail) return Result; } @@ -323,4 +457,4 @@ DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, } typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index b90e103794da..850ddf0d9458 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -25,7 +25,7 @@ class AVRELFObjectWriter : public MCELFObjectTargetWriter { public: AVRELFObjectWriter(uint8_t OSABI); - virtual ~AVRELFObjectWriter() {} + virtual ~AVRELFObjectWriter() = default; unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp index 85933d6b9bb9..ade5df18c3b9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp @@ -1,6 +1,7 @@ #include "AVRELFStreamer.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h index 11f55f6d253b..54dad3098385 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h @@ -43,6 +43,9 @@ private: printPCRelImm(MI, OpNo, O); } void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemspi(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemri(MI, OpNo, O); + } // Autogenerated by TableGen. std::pair getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 9754ff7f1146..c8bb410e4882 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -295,7 +295,6 @@ void AVRMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new AVRMCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h index 68589763f29a..5bf6c1a581e3 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h @@ -84,7 +84,7 @@ private: private: explicit AVRMCExpr(VariantKind Kind, const MCExpr *Expr, bool Negated) : Kind(Kind), SubExpr(Expr), Negated(Negated) {} - ~AVRMCExpr() {} + ~AVRMCExpr() = default; }; } // end namespace llvm diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h index ef116793d326..aaf236d82016 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h @@ -33,7 +33,6 @@ MCInstrInfo *createAVRMCInstrInfo(); /// Creates a machine code emitter for AVR. MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); /// Creates an assembly backend for AVR. 
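The memri operand that decodeMemri unpacks above is compact enough to check in
isolation. Below is a minimal standalone sketch of the 7-bit layout it
describes (bit 6 selects the pointer register, bits 0-5 the displacement); the
struct and function names are illustrative, not part of the vendored sources:

    #include <cassert>
    #include <cstdint>

    // Bit 6: pointer register (0 = Z = R31:R30, 1 = Y = R29:R28).
    // Bits 0-5: unsigned displacement, 0..63.
    struct MemriBits {
      bool UsesY;
      unsigned Offset;
    };

    static MemriBits unpackMemri(uint8_t Field) {
      assert(Field <= 127 && "memri operands are only 7 bits wide");
      return {(Field & 0x40) != 0, Field & 0x3fu};
    }

    static uint8_t packMemri(const MemriBits &M) {
      assert(M.Offset <= 63 && "the displacement is only 6 bits wide");
      return (M.UsesY ? 0x40 : 0x00) | M.Offset;
    }

    int main() {
      // Round trip: "Y+5" packs to 0x45 and unpacks back to (Y, 5).
      MemriBits M = unpackMemri(packMemri({true, 5}));
      assert(M.UsesY && M.Offset == 5);
      return 0;
    }

This mirrors the guard in decodeMemri itself, which fails the decode outright
for any field value above 127.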
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 697deb117bcb..4c064d65d919 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index 89990f7e15c2..3de761bf6601 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -11,6 +11,8 @@ #include "MCTargetDesc/BPFMCTargetDesc.h" #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 46141e69d9d4..349cdd92ae62 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -77,6 +77,7 @@ #include "BPF.h" #include "BPFCORE.h" #include "BPFTargetMachine.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" @@ -123,7 +124,7 @@ public: struct CallInfo { uint32_t Kind; uint32_t AccessIndex; - Align RecordAlignment; + MaybeAlign RecordAlignment; MDNode *Metadata; Value *Base; }; @@ -142,9 +143,9 @@ private: Module *M = nullptr; static std::map GEPGlobals; - // A map to link preserve_*_access_index instrinsic calls. + // A map to link preserve_*_access_index intrinsic calls. std::map> AIChain; - // A map to hold all the base preserve_*_access_index instrinsic calls. + // A map to hold all the base preserve_*_access_index intrinsic calls. // The base call is not an input of any other preserve_* // intrinsics. std::map BaseAICalls; @@ -169,7 +170,7 @@ private: uint32_t &StartBitOffset, uint32_t &EndBitOffset); uint32_t GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy, uint32_t AccessIndex, uint32_t PatchImm, - Align RecordAlignment); + MaybeAlign RecordAlignment); Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, std::string &AccessKey, MDNode *&BaseMeta); @@ -270,7 +271,7 @@ static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { static Type *getBaseElementType(const CallInst *Call) { // Element type is stored in an elementtype() attribute on the first param. - return Call->getAttributes().getParamElementType(0); + return Call->getParamElementType(0); } /// Check whether a call is a preserve_*_access_index intrinsic call or not. 
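The IsPreserveDIAccessIndexCall hunks that follow only adjust what gets
recorded in CallInfo; the recognition itself is name-based on the called
intrinsic. A reduced sketch of that idiom (illustrative; the intrinsic list is
reconstructed from the names appearing in this patch plus
llvm.preserve.array.access.index and llvm.bpf.preserve.enum.value from the
same family):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Reduced, illustrative form of the name-based recognition in
    // BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall.
    static bool looksLikePreserveAccessIndex(const CallInst *Call) {
      const auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
      if (!GV)
        return false;
      StringRef Name = GV->getName();
      return Name.startswith("llvm.preserve.array.access.index") ||
             Name.startswith("llvm.preserve.union.access.index") ||
             Name.startswith("llvm.preserve.struct.access.index") ||
             Name.startswith("llvm.bpf.preserve.field.info") ||
             Name.startswith("llvm.bpf.preserve.type.info") ||
             Name.startswith("llvm.bpf.preserve.enum.value");
    }

The real function additionally fills in CallInfo (kind, access index, base
pointer, record alignment) per intrinsic, which is exactly what the hunks
below change.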
@@ -299,8 +300,6 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); CInfo.Base = Call->getArgOperand(0); - CInfo.RecordAlignment = - DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType()); return true; } if (GV->getName().startswith("llvm.preserve.struct.access.index")) { @@ -333,6 +332,8 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, report_fatal_error("Incorrect flag for llvm.bpf.preserve.type.info intrinsic"); if (Flag == BPFCoreSharedInfo::PRESERVE_TYPE_INFO_EXISTENCE) CInfo.AccessIndex = BPFCoreSharedInfo::TYPE_EXISTENCE; + else if (Flag == BPFCoreSharedInfo::PRESERVE_TYPE_INFO_MATCH) + CInfo.AccessIndex = BPFCoreSharedInfo::TYPE_MATCH; else CInfo.AccessIndex = BPFCoreSharedInfo::TYPE_SIZE; return true; @@ -592,10 +593,20 @@ void BPFAbstractMemberAccess::GetStorageBitRange(DIDerivedType *MemberTy, uint32_t &EndBitOffset) { uint32_t MemberBitSize = MemberTy->getSizeInBits(); uint32_t MemberBitOffset = MemberTy->getOffsetInBits(); + + if (RecordAlignment > 8) { + // If the Bits are within an aligned 8-byte, set the RecordAlignment + // to 8, other report the fatal error. + if (MemberBitOffset / 64 != (MemberBitOffset + MemberBitSize) / 64) + report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info, " + "requiring too big alignment"); + RecordAlignment = Align(8); + } + uint32_t AlignBits = RecordAlignment.value() * 8; - if (RecordAlignment > 8 || MemberBitSize > AlignBits) + if (MemberBitSize > AlignBits) report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info, " - "requiring too big alignment"); + "bitfield size greater than record alignment"); StartBitOffset = MemberBitOffset & ~(AlignBits - 1); if ((StartBitOffset + AlignBits) < (MemberBitOffset + MemberBitSize)) @@ -608,7 +619,7 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy, uint32_t AccessIndex, uint32_t PatchImm, - Align RecordAlignment) { + MaybeAlign RecordAlignment) { if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE) return 1; @@ -624,7 +635,7 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, PatchImm += MemberTy->getOffsetInBits() >> 3; } else { unsigned SBitOffset, NextSBitOffset; - GetStorageBitRange(MemberTy, RecordAlignment, SBitOffset, + GetStorageBitRange(MemberTy, *RecordAlignment, SBitOffset, NextSBitOffset); PatchImm += SBitOffset >> 3; } @@ -643,7 +654,8 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, return SizeInBits >> 3; unsigned SBitOffset, NextSBitOffset; - GetStorageBitRange(MemberTy, RecordAlignment, SBitOffset, NextSBitOffset); + GetStorageBitRange(MemberTy, *RecordAlignment, SBitOffset, + NextSBitOffset); SizeInBits = NextSBitOffset - SBitOffset; if (SizeInBits & (SizeInBits - 1)) report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info"); @@ -703,7 +715,7 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, } unsigned SBitOffset, NextSBitOffset; - GetStorageBitRange(MemberTy, RecordAlignment, SBitOffset, NextSBitOffset); + GetStorageBitRange(MemberTy, *RecordAlignment, SBitOffset, NextSBitOffset); if (NextSBitOffset - SBitOffset > 64) report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); @@ -734,7 +746,7 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, } unsigned 
SBitOffset, NextSBitOffset; - GetStorageBitRange(MemberTy, RecordAlignment, SBitOffset, NextSBitOffset); + GetStorageBitRange(MemberTy, *RecordAlignment, SBitOffset, NextSBitOffset); if (NextSBitOffset - SBitOffset > 64) report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); @@ -923,7 +935,8 @@ MDNode *BPFAbstractMemberAccess::computeAccessKey(CallInst *Call, int64_t PatchImm; std::string AccessStr("0"); - if (CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_EXISTENCE) { + if (CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_EXISTENCE || + CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_MATCH) { PatchImm = 1; } else if (CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_SIZE) { // typedef debuginfo type has size 0, get the eventual base type. @@ -933,8 +946,11 @@ MDNode *BPFAbstractMemberAccess::computeAccessKey(CallInst *Call, // ENUM_VALUE_EXISTENCE and ENUM_VALUE IsInt32Ret = false; - const auto *CE = cast(Call->getArgOperand(1)); - const GlobalVariable *GV = cast(CE->getOperand(0)); + // The argument could be a global variable or a getelementptr with base to + // a global variable depending on whether the clang option `opaque-options` + // is set or not. + const GlobalVariable *GV = + cast(Call->getArgOperand(1)->stripPointerCasts()); assert(GV->hasInitializer()); const ConstantDataArray *DA = cast(GV->getInitializer()); assert(DA->isString()); diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 69d0bca0bd77..98f8d59fbe01 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -259,10 +259,16 @@ bool BPFAdjustOptImpl::serializeICMPCrossBB(BasicBlock &BB) { return false; if (Cond1Op == ICmpInst::ICMP_SGT || Cond1Op == ICmpInst::ICMP_SGE) { - if (Cond2Op != ICmpInst::ICMP_SLT && Cond1Op != ICmpInst::ICMP_SLE) + if (Cond2Op != ICmpInst::ICMP_SLT && Cond2Op != ICmpInst::ICMP_SLE) return false; } else if (Cond1Op == ICmpInst::ICMP_SLT || Cond1Op == ICmpInst::ICMP_SLE) { - if (Cond2Op != ICmpInst::ICMP_SGT && Cond1Op != ICmpInst::ICMP_SGE) + if (Cond2Op != ICmpInst::ICMP_SGT && Cond2Op != ICmpInst::ICMP_SGE) + return false; + } else if (Cond1Op == ICmpInst::ICMP_ULT || Cond1Op == ICmpInst::ICMP_ULE) { + if (Cond2Op != ICmpInst::ICMP_UGT && Cond2Op != ICmpInst::ICMP_UGE) + return false; + } else if (Cond1Op == ICmpInst::ICMP_UGT || Cond1Op == ICmpInst::ICMP_UGE) { + if (Cond2Op != ICmpInst::ICMP_ULT && Cond2Op != ICmpInst::ICMP_ULE) return false; } else { return false; diff --git a/llvm/lib/Target/BPF/BPFCORE.h b/llvm/lib/Target/BPF/BPFCORE.h index 0c504412480d..c9aa135232c1 100644 --- a/llvm/lib/Target/BPF/BPFCORE.h +++ b/llvm/lib/Target/BPF/BPFCORE.h @@ -32,6 +32,7 @@ public: TYPE_SIZE, ENUM_VALUE_EXISTENCE, ENUM_VALUE, + TYPE_MATCH, MAX_FIELD_RELOC_KIND, }; @@ -46,6 +47,7 @@ public: enum PreserveTypeInfo : uint32_t { PRESERVE_TYPE_INFO_EXISTENCE = 0, PRESERVE_TYPE_INFO_SIZE, + PRESERVE_TYPE_INFO_MATCH, MAX_PRESERVE_TYPE_INFO_FLAG, }; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 0587cb0e16e3..16876e74c4a1 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -103,7 +103,6 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); 
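// Note (editorial, not part of the vendored sources): the UREM expansion
// deleted above pairs with the native modulo support this patch adds -- the
// BPF_MOD ALU opcode (0x9) in BPFInstrFormats.td and the "defm MOD" pattern
// in BPFInstrInfo.td below -- so unsigned remainder is now selected directly
// instead of being expanded.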
setOperationAction(ISD::UMUL_LOHI, VT, Expand); @@ -168,6 +167,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0; + MaxLoadsPerMemcmp = 0; } else { // inline memcpy() for kernel to see explicit copy unsigned CommonMaxStores = @@ -176,6 +176,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores; + MaxLoadsPerMemcmp = MaxLoadsPerMemcmpOptSize = CommonMaxStores; } // CPU/Feature control diff --git a/llvm/lib/Target/BPF/BPFInstrFormats.td b/llvm/lib/Target/BPF/BPFInstrFormats.td index a809065014e5..27db0be080ae 100644 --- a/llvm/lib/Target/BPF/BPFInstrFormats.td +++ b/llvm/lib/Target/BPF/BPFInstrFormats.td @@ -39,6 +39,7 @@ def BPF_AND : BPFArithOp<0x5>; def BPF_LSH : BPFArithOp<0x6>; def BPF_RSH : BPFArithOp<0x7>; def BPF_NEG : BPFArithOp<0x8>; +def BPF_MOD : BPFArithOp<0x9>; def BPF_XOR : BPFArithOp<0xa>; def BPF_MOV : BPFArithOp<0xb>; def BPF_ARSH : BPFArithOp<0xc>; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 54360a89782b..e61e32b62d83 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -192,8 +192,7 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a J, delete them. - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 082e1f4a92c2..6cac478561b2 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -298,6 +298,7 @@ let isAsCheapAsAMove = 1 in { } defm MUL : ALU; + defm MOD : ALUisPHI()) { - if (PhiInsns.find(PhiDef) != PhiInsns.end()) + if (!PhiInsns.insert(PhiDef).second) return false; - PhiInsns.insert(PhiDef); if (!isPhiFrom32Def(PhiDef)) return false; } @@ -143,9 +143,8 @@ bool BPFMIPeephole::isInsnFrom32Def(MachineInstr *DefInsn) return false; if (DefInsn->isPHI()) { - if (PhiInsns.find(DefInsn) != PhiInsns.end()) + if (!PhiInsns.insert(DefInsn).second) return false; - PhiInsns.insert(DefInsn); if (!isPhiFrom32Def(DefInsn)) return false; } else if (DefInsn->getOpcode() == BPF::COPY) { diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp index b4232875383c..088195994edd 100644 --- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -31,9 +31,11 @@ #include "BPFCORE.h" #include "BPFInstrInfo.h" #include "BPFTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include using namespace llvm; @@ -52,9 +54,12 @@ struct BPFMISimplifyPatchable : public MachineFunctionPass { } private: + std::set SkipInsts; + // Initialize class variables. 
void initialize(MachineFunction &MFParm); + bool isLoadInst(unsigned Opcode); bool removeLD(); void processCandidate(MachineRegisterInfo *MRI, MachineBasicBlock &MBB, MachineInstr &MI, Register &SrcReg, Register &DstReg, @@ -88,6 +93,12 @@ void BPFMISimplifyPatchable::initialize(MachineFunction &MFParm) { LLVM_DEBUG(dbgs() << "*** BPF simplify patchable insts pass ***\n\n"); } +bool BPFMISimplifyPatchable::isLoadInst(unsigned Opcode) { + return Opcode == BPF::LDD || Opcode == BPF::LDW || Opcode == BPF::LDH || + Opcode == BPF::LDB || Opcode == BPF::LDW32 || Opcode == BPF::LDH32 || + Opcode == BPF::LDB32; +} + void BPFMISimplifyPatchable::checkADDrr(MachineRegisterInfo *MRI, MachineOperand *RelocOp, const GlobalValue *GVal) { const MachineInstr *Inst = RelocOp->getParent(); @@ -229,6 +240,11 @@ void BPFMISimplifyPatchable::processDstReg(MachineRegisterInfo *MRI, void BPFMISimplifyPatchable::processInst(MachineRegisterInfo *MRI, MachineInstr *Inst, MachineOperand *RelocOp, const GlobalValue *GVal) { unsigned Opcode = Inst->getOpcode(); + if (isLoadInst(Opcode)) { + SkipInsts.insert(Inst); + return; + } + if (Opcode == BPF::ADD_rr) checkADDrr(MRI, RelocOp, GVal); else if (Opcode == BPF::SLL_rr) @@ -253,10 +269,10 @@ bool BPFMISimplifyPatchable::removeLD() { } // Ensure the register format is LOAD , , 0 - if (MI.getOpcode() != BPF::LDD && MI.getOpcode() != BPF::LDW && - MI.getOpcode() != BPF::LDH && MI.getOpcode() != BPF::LDB && - MI.getOpcode() != BPF::LDW32 && MI.getOpcode() != BPF::LDH32 && - MI.getOpcode() != BPF::LDB32) + if (!isLoadInst(MI.getOpcode())) + continue; + + if (SkipInsts.find(&MI) != SkipInsts.end()) continue; if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg()) diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index 6dfb7dc39922..8c58aae5b618 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -12,6 +12,7 @@ #include "BPF.h" #include "BPFCORE.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 2fb76ab5c440..97d9ed3cad47 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -59,7 +59,7 @@ static std::string computeDataLayout(const Triple &TT) { } static Reloc::Model getEffectiveRelocModel(Optional RM) { - return RM.getValueOr(Reloc::PIC_); + return RM.value_or(Reloc::PIC_); } BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, @@ -149,7 +149,7 @@ void BPFPassConfig::addIRPasses() { } TargetTransformInfo -BPFTargetMachine::getTargetTransformInfo(const Function &F) { +BPFTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(BPFTTIImpl(this, F)); } diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h index 98f64ccc3793..fede52089725 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -34,7 +34,7 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h 
b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h index 6b86bf6e6cc1..0c8f9604b665 100644 --- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h +++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h @@ -71,6 +71,15 @@ public: Opd2Info, Opd1PropInfo, Opd2PropInfo); } + + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.LoadSizes = {8, 4, 2, 1}; + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + return Options; + } + }; } // end namespace llvm diff --git a/llvm/lib/Target/BPF/BTF.def b/llvm/lib/Target/BPF/BTF.def index 0ae4194bc512..1de0e51b4757 100644 --- a/llvm/lib/Target/BPF/BTF.def +++ b/llvm/lib/Target/BPF/BTF.def @@ -33,5 +33,6 @@ HANDLE_BTF_KIND(15, DATASEC) HANDLE_BTF_KIND(16, FLOAT) HANDLE_BTF_KIND(17, DECL_TAG) HANDLE_BTF_KIND(18, TYPE_TAG) +HANDLE_BTF_KIND(19, ENUM64) #undef HANDLE_BTF_KIND diff --git a/llvm/lib/Target/BPF/BTF.h b/llvm/lib/Target/BPF/BTF.h index e54b97cd49a9..4540054aaf34 100644 --- a/llvm/lib/Target/BPF/BTF.h +++ b/llvm/lib/Target/BPF/BTF.h @@ -60,6 +60,7 @@ enum { CommonTypeSize = 12, BTFArraySize = 12, BTFEnumSize = 8, + BTFEnum64Size = 12, BTFMemberSize = 12, BTFParamSize = 8, BTFDataSecVarSize = 12, @@ -145,6 +146,15 @@ struct BTFEnum { int32_t Val; ///< Enum member value }; +/// BTF_KIND_ENUM64 is followed by multiple "struct BTFEnum64". +/// The exact number of BTFEnum64 is stored in the vlen (of the +/// info in "struct CommonType"). +struct BTFEnum64 { + uint32_t NameOff; ///< Enum name offset in the string table + uint32_t Val_Lo32; ///< Enum member lo32 value + uint32_t Val_Hi32; ///< Enum member hi32 value +}; + /// BTF_KIND_ARRAY is followed by one "struct BTFArray". struct BTFArray { uint32_t ElemType; ///< Element type diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index d536aed1d211..a949e925eb60 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; @@ -161,9 +162,10 @@ void BTFTypeInt::emitType(MCStreamer &OS) { OS.emitInt32(IntVal); } -BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) { +BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen, + bool IsSigned) : ETy(ETy) { Kind = BTF::BTF_KIND_ENUM; - BTFType.Info = Kind << 24 | VLen; + BTFType.Info = IsSigned << 31 | Kind << 24 | VLen; BTFType.Size = roundupToBytes(ETy->getSizeInBits()); } @@ -199,6 +201,48 @@ void BTFTypeEnum::emitType(MCStreamer &OS) { } } +BTFTypeEnum64::BTFTypeEnum64(const DICompositeType *ETy, uint32_t VLen, + bool IsSigned) : ETy(ETy) { + Kind = BTF::BTF_KIND_ENUM64; + BTFType.Info = IsSigned << 31 | Kind << 24 | VLen; + BTFType.Size = roundupToBytes(ETy->getSizeInBits()); +} + +void BTFTypeEnum64::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + + BTFType.NameOff = BDebug.addString(ETy->getName()); + + DINodeArray Elements = ETy->getElements(); + for (const auto Element : Elements) { + const auto *Enum = cast(Element); + + struct BTF::BTFEnum64 BTFEnum; + BTFEnum.NameOff = BDebug.addString(Enum->getName()); + uint64_t Value; + if (Enum->isUnsigned()) + Value = static_cast(Enum->getValue().getZExtValue()); + else + Value = static_cast(Enum->getValue().getSExtValue()); + BTFEnum.Val_Lo32 = Value; + 
BTFEnum.Val_Hi32 = Value >> 32; + EnumValues.push_back(BTFEnum); + } +} + +void BTFTypeEnum64::emitType(MCStreamer &OS) { + BTFTypeBase::emitType(OS); + for (const auto &Enum : EnumValues) { + OS.emitInt32(Enum.NameOff); + OS.AddComment("0x" + Twine::utohexstr(Enum.Val_Lo32)); + OS.emitInt32(Enum.Val_Lo32); + OS.AddComment("0x" + Twine::utohexstr(Enum.Val_Hi32)); + OS.emitInt32(Enum.Val_Hi32); + } +} + BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems) { Kind = BTF::BTF_KIND_ARRAY; BTFType.NameOff = 0; @@ -552,6 +596,46 @@ void BTFDebug::processDeclAnnotations(DINodeArray Annotations, } } +/// Generate btf_type_tag chains. +int BTFDebug::genBTFTypeTags(const DIDerivedType *DTy, int BaseTypeId) { + SmallVector MDStrs; + DINodeArray Annots = DTy->getAnnotations(); + if (Annots) { + // For type with "int __tag1 __tag2 *p", the MDStrs will have + // content: [__tag1, __tag2]. + for (const Metadata *Annotations : Annots->operands()) { + const MDNode *MD = cast(Annotations); + const MDString *Name = cast(MD->getOperand(0)); + if (!Name->getString().equals("btf_type_tag")) + continue; + MDStrs.push_back(cast(MD->getOperand(1))); + } + } + + if (MDStrs.size() == 0) + return -1; + + // With MDStrs [__tag1, __tag2], the output type chain looks like + // PTR -> __tag2 -> __tag1 -> BaseType + // In the below, we construct BTF types with the order of __tag1, __tag2 + // and PTR. + unsigned TmpTypeId; + std::unique_ptr TypeEntry; + if (BaseTypeId >= 0) + TypeEntry = + std::make_unique(BaseTypeId, MDStrs[0]->getString()); + else + TypeEntry = std::make_unique(DTy, MDStrs[0]->getString()); + TmpTypeId = addType(std::move(TypeEntry)); + + for (unsigned I = 1; I < MDStrs.size(); I++) { + const MDString *Value = MDStrs[I]; + TypeEntry = std::make_unique(TmpTypeId, Value->getString()); + TmpTypeId = addType(std::move(TypeEntry)); + } + return TmpTypeId; +} + /// Handle structure/union types. void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, uint32_t &TypeId) { @@ -633,8 +717,25 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { if (VLen > BTF::MAX_VLEN) return; - auto TypeEntry = std::make_unique(CTy, VLen); - TypeId = addType(std::move(TypeEntry), CTy); + bool IsSigned = false; + unsigned NumBits = 32; + // No BaseType implies forward declaration in which case a + // BTFTypeEnum with Vlen = 0 is emitted. + if (CTy->getBaseType() != nullptr) { + const auto *BTy = cast(CTy->getBaseType()); + IsSigned = BTy->getEncoding() == dwarf::DW_ATE_signed || + BTy->getEncoding() == dwarf::DW_ATE_signed_char; + NumBits = BTy->getSizeInBits(); + } + + if (NumBits <= 32) { + auto TypeEntry = std::make_unique(CTy, VLen, IsSigned); + TypeId = addType(std::move(TypeEntry), CTy); + } else { + assert(NumBits == 64); + auto TypeEntry = std::make_unique(CTy, VLen, IsSigned); + TypeId = addType(std::move(TypeEntry), CTy); + } // No need to visit base type as BTF does not encode it. } @@ -684,9 +785,8 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, /// pointee type will be replaced with either a real type or /// a forward declaration. 
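// (Editorial summary of how that replacement happens, per the endModule()
// hunk later in this patch: FixupDerivedTypes now maps each composite type
// to the (DIDerivedType, BTFTypeDerived *) pairs that point at it. At
// endModule() time the struct/union is looked up by name among the emitted
// struct types; if it was never defined, a BTF forward declaration is
// emitted in its place. Each recorded pointer is then fixed to either that
// type id directly, or -- when the pointee carries btf_type_tag
// annotations -- to the head of the tag chain built by
// genBTFTypeTags(DTy, StructTypeId).)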
auto TypeEntry = std::make_unique(DTy, Tag, true); - auto &Fixup = FixupDerivedTypes[CTy->getName()]; - Fixup.first = CTag == dwarf::DW_TAG_union_type; - Fixup.second.push_back(TypeEntry.get()); + auto &Fixup = FixupDerivedTypes[CTy]; + Fixup.push_back(std::make_pair(DTy, TypeEntry.get())); TypeId = addType(std::move(TypeEntry), DTy); return; } @@ -695,34 +795,8 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, } if (Tag == dwarf::DW_TAG_pointer_type) { - SmallVector MDStrs; - DINodeArray Annots = DTy->getAnnotations(); - if (Annots) { - // For type with "int __tag1 __tag2 *p", the MDStrs will have - // content: [__tag1, __tag2]. - for (const Metadata *Annotations : Annots->operands()) { - const MDNode *MD = cast(Annotations); - const MDString *Name = cast(MD->getOperand(0)); - if (!Name->getString().equals("btf_type_tag")) - continue; - MDStrs.push_back(cast(MD->getOperand(1))); - } - } - - if (MDStrs.size() > 0) { - // With MDStrs [__tag1, __tag2], the output type chain looks like - // PTR -> __tag2 -> __tag1 -> BaseType - // In the below, we construct BTF types with the order of __tag1, __tag2 - // and PTR. - auto TypeEntry = - std::make_unique(DTy, MDStrs[0]->getString()); - unsigned TmpTypeId = addType(std::move(TypeEntry)); - for (unsigned I = 1; I < MDStrs.size(); I++) { - const MDString *Value = MDStrs[I]; - TypeEntry = - std::make_unique(TmpTypeId, Value->getString()); - TmpTypeId = addType(std::move(TypeEntry)); - } + int TmpTypeId = genBTFTypeTags(DTy, -1); + if (TmpTypeId >= 0) { auto TypeDEntry = std::make_unique(TmpTypeId, Tag, DTy->getName()); TypeId = addType(std::move(TypeDEntry), DTy); @@ -773,15 +847,31 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId, // already defined, we should keep moving to eventually // bring in types for "struct t". Otherwise, the "struct s2" // definition won't be correct. + // + // In the above, we have following debuginfo: + // {ptr, struct_member} -> typedef -> struct + // and BTF type for 'typedef' is generated while 'struct' may + // be in FixUp. But let us generalize the above to handle + // {different types} -> [various derived types]+ -> another type. + // For example, + // {func_param, struct_member} -> const -> ptr -> volatile -> struct + // We will traverse const/ptr/volatile which already have corresponding + // BTF types and generate type for 'struct' which might be in Fixup + // state. if (Ty && (!CheckPointer || !SeenPointer)) { if (const auto *DTy = dyn_cast(Ty)) { - unsigned Tag = DTy->getTag(); - if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || - Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_restrict_type) { - uint32_t TmpTypeId; - visitTypeEntry(DTy->getBaseType(), TmpTypeId, CheckPointer, - SeenPointer); + while (DTy) { + const DIType *BaseTy = DTy->getBaseType(); + if (!BaseTy) + break; + + if (DIToIdMap.find(BaseTy) != DIToIdMap.end()) { + DTy = dyn_cast(BaseTy); + } else { + uint32_t TmpTypeId; + visitTypeEntry(BaseTy, TmpTypeId, CheckPointer, SeenPointer); + break; + } } } } @@ -908,7 +998,7 @@ void BTFDebug::emitBTFSection() { MCContext &Ctx = OS.getContext(); MCSectionELF *Sec = Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0); Sec->setAlignment(Align(4)); - OS.SwitchSection(Sec); + OS.switchSection(Sec); // Emit header. 
emitCommonHeader(); @@ -948,7 +1038,7 @@ void BTFDebug::emitBTFExtSection() { MCContext &Ctx = OS.getContext(); MCSectionELF *Sec = Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0); Sec->setAlignment(Align(4)); - OS.SwitchSection(Sec); + OS.switchSection(Sec); // Emit header. emitCommonHeader(); @@ -1436,9 +1526,8 @@ void BTFDebug::processFuncPrototypes(const Function *F) { return; // Do not emit again if already emitted. - if (ProtoFunctions.find(F) != ProtoFunctions.end()) + if (!ProtoFunctions.insert(F).second) return; - ProtoFunctions.insert(F); uint32_t ProtoTypeId; const std::unordered_map FuncArgNames; @@ -1480,8 +1569,9 @@ void BTFDebug::endModule() { // Fixups for (auto &Fixup : FixupDerivedTypes) { - StringRef TypeName = Fixup.first; - bool IsUnion = Fixup.second.first; + const DICompositeType *CTy = Fixup.first; + StringRef TypeName = CTy->getName(); + bool IsUnion = CTy->getTag() == dwarf::DW_TAG_union_type; // Search through struct types uint32_t StructTypeId = 0; @@ -1497,8 +1587,15 @@ void BTFDebug::endModule() { StructTypeId = addType(std::move(FwdTypeEntry)); } - for (auto &DType : Fixup.second.second) { - DType->setPointeeType(StructTypeId); + for (auto &TypeInfo : Fixup.second) { + const DIDerivedType *DTy = TypeInfo.first; + BTFTypeDerived *BDType = TypeInfo.second; + + int TmpTypeId = genBTFTypeTags(DTy, StructTypeId); + if (TmpTypeId >= 0) + BDType->setPointeeType(TmpTypeId); + else + BDType->setPointeeType(StructTypeId); } } diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h index 7c30675c553c..1ad8ec5d918c 100644 --- a/llvm/lib/Target/BPF/BTFDebug.h +++ b/llvm/lib/Target/BPF/BTFDebug.h @@ -103,7 +103,7 @@ class BTFTypeEnum : public BTFTypeBase { std::vector EnumValues; public: - BTFTypeEnum(const DICompositeType *ETy, uint32_t NumValues); + BTFTypeEnum(const DICompositeType *ETy, uint32_t NumValues, bool IsSigned); uint32_t getSize() override { return BTFTypeBase::getSize() + EnumValues.size() * BTF::BTFEnumSize; } @@ -218,6 +218,20 @@ public: void emitType(MCStreamer &OS) override; }; +/// Handle 64-bit enumerate type. +class BTFTypeEnum64 : public BTFTypeBase { + const DICompositeType *ETy; + std::vector EnumValues; + +public: + BTFTypeEnum64(const DICompositeType *ETy, uint32_t NumValues, bool IsSigned); + uint32_t getSize() override { + return BTFTypeBase::getSize() + EnumValues.size() * BTF::BTFEnum64Size; + } + void completeType(BTFDebug &BDebug) override; + void emitType(MCStreamer &OS) override; +}; + class BTFTypeTypeTag : public BTFTypeBase { const DIDerivedType *DTy; StringRef Tag; @@ -289,7 +303,8 @@ class BTFDebug : public DebugHandlerBase { std::map> DataSecEntries; std::vector StructTypes; std::map> PatchImms; - std::map>> + std::map>> FixupDerivedTypes; std::setProtoFunctions; @@ -341,6 +356,13 @@ class BTFDebug : public DebugHandlerBase { void processDeclAnnotations(DINodeArray Annotations, uint32_t BaseTypeId, int ComponentId); + /// Generate BTF type_tag's. If BaseTypeId is nonnegative, the last + /// BTF type_tag in the chain points to BaseTypeId. Otherwise, it points to + /// the base type of DTy. Return the type id of the first BTF type_tag + /// in the chain. If no type_tag's are generated, a negative value + /// is returned. + int genBTFTypeTags(const DIDerivedType *DTy, int BaseTypeId); + /// Generate one field relocation record. 
void generatePatchImmReloc(const MCSymbol *ORSym, uint32_t RootId, const GlobalVariable *, bool IsAma); diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index 3f643d47f934..aa408f8b65f7 100644 --- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -15,9 +15,10 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/SubtargetFeature.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/MathExtras.h" #include @@ -99,7 +100,7 @@ static const unsigned GPRDecoderTable[] = { static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { if (RegNo > 11) return MCDisassembler::Fail; @@ -112,9 +113,9 @@ static const unsigned GPR32DecoderTable[] = { BPF::W0, BPF::W1, BPF::W2, BPF::W3, BPF::W4, BPF::W5, BPF::W6, BPF::W7, BPF::W8, BPF::W9, BPF::W10, BPF::W11}; -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void * /*Decoder*/) { +static DecodeStatus +DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, + const MCDisassembler * /*Decoder*/) { if (RegNo > 11) return MCDisassembler::Fail; @@ -124,7 +125,8 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Register = (Insn >> 16) & 0xf; if (Register > 11) return MCDisassembler::Fail; @@ -220,4 +222,4 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, } typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index bacd00360f82..56fdd6766132 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -87,6 +87,11 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, } } else { assert(Fixup.getKind() == FK_PCRel_2); + + int64_t ByteOff = (int64_t)Value - 8; + if (ByteOff > INT16_MAX * 8 || ByteOff < INT16_MIN * 8) + report_fatal_error("Branch target out of insn range"); + Value = (uint16_t)((Value - 8) / 8); support::endian::write(&Data[Fixup.getOffset() + 2], Value, Endian); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp index 200c72a07ed6..6f041584a955 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" using namespace llvm; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index 3292c3e5ebb5..14f6b367b8c7 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ 
b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -41,8 +41,6 @@ public: // section will be parsable, but with odd offsets and // line numbers, etc. CodePointerSize = 8; - - UseIntegratedAssembler = false; } void setDwarfUsesRelocationsAcrossSections(bool enable) { diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index 12af92e0d198..a98d001097bc 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -73,15 +73,13 @@ private: } // end anonymous namespace MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new BPFMCCodeEmitter(MCII, MRI, true); + return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), true); } MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new BPFMCCodeEmitter(MCII, MRI, false); + return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), false); } unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI, diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index a426a132cf47..fc190504581c 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H #include "llvm/Config/config.h" +#include "llvm/MC/MCContext.h" #include "llvm/Support/DataTypes.h" #include @@ -30,10 +31,8 @@ class MCTargetOptions; class Target; MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createBPFAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index a62bd111cba9..63a60473d664 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -9,14 +9,17 @@ #include "MCTargetDesc/CSKYInstPrinter.h" #include "MCTargetDesc/CSKYMCExpr.h" #include "MCTargetDesc/CSKYMCTargetDesc.h" +#include "MCTargetDesc/CSKYTargetStreamer.h" #include "TargetInfo/CSKYTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Register.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" @@ -25,6 +28,8 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CSKYAttributes.h" +#include "llvm/Support/CSKYTargetParser.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -52,6 +57,9 @@ class CSKYAsmParser : public MCTargetAsmParser { const MCRegisterInfo *MRI; + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper, Twine Msg); @@ -78,6 +86,16 @@ class CSKYAsmParser : public MCTargetAsmParser 
{ bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out); + bool processLRW(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + bool processJSRI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + bool processJMPI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + + CSKYTargetStreamer &getTargetStreamer() { + assert(getParser().getStreamer().getTargetStreamer() && + "do not have a target streamer"); + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast(TS); + } // Auto-generated instruction matching functions #define GET_ASSEMBLER_HEADER @@ -95,6 +113,8 @@ class CSKYAsmParser : public MCTargetAsmParser { bool parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool parseDirectiveAttribute(); + public: enum CSKYMatchResultTy { Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, @@ -108,7 +128,14 @@ public: CSKYAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI, MII) { + + MCAsmParserExtension::Initialize(Parser); + + // Cache the MCRegisterInfo. + MRI = getContext().getRegisterInfo(); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + getTargetStreamer().emitTargetAttributes(STI); } }; @@ -612,6 +639,11 @@ public: #define GET_MNEMONIC_SPELL_CHECKER #include "CSKYGenAsmMatcher.inc" +static MCRegister convertFPR32ToFPR64(MCRegister Reg) { + assert(Reg >= CSKY::F0_32 && Reg <= CSKY::F31_32 && "Invalid register"); + return Reg - CSKY::F0_32 + CSKY::F0_64; +} + static std::string CSKYMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); @@ -788,6 +820,96 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Unknown match type detected!"); } +bool CSKYAsmParser::processLRW(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out) { + Inst.setLoc(IDLoc); + + unsigned Opcode; + MCOperand Op; + if (Inst.getOpcode() == CSKY::PseudoLRW16) + Opcode = CSKY::LRW16; + else + Opcode = CSKY::LRW32; + + if (Inst.getOperand(1).isImm()) { + if (isUInt<8>(Inst.getOperand(1).getImm()) && + Inst.getOperand(0).getReg() <= CSKY::R7) { + Opcode = CSKY::MOVI16; + } else if (getSTI().getFeatureBits()[CSKY::HasE2] && + isUInt<16>(Inst.getOperand(1).getImm())) { + Opcode = CSKY::MOVI32; + } else { + auto *Expr = getTargetStreamer().addConstantPoolEntry( + MCConstantExpr::create(Inst.getOperand(1).getImm(), getContext()), + Inst.getLoc()); + Inst.erase(std::prev(Inst.end())); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } else { + const MCExpr *AdjustExpr = nullptr; + if (const CSKYMCExpr *CSKYExpr = + dyn_cast(Inst.getOperand(1).getExpr())) { + if (CSKYExpr->getKind() == CSKYMCExpr::VK_CSKY_TLSGD || + CSKYExpr->getKind() == CSKYMCExpr::VK_CSKY_TLSIE || + CSKYExpr->getKind() == CSKYMCExpr::VK_CSKY_TLSLDM) { + MCSymbol *Dot = getContext().createNamedTempSymbol(); + Out.emitLabel(Dot); + AdjustExpr = MCSymbolRefExpr::create(Dot, getContext()); + } + } + auto *Expr = getTargetStreamer().addConstantPoolEntry( + Inst.getOperand(1).getExpr(), Inst.getLoc(), AdjustExpr); + Inst.erase(std::prev(Inst.end())); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + Inst.setOpcode(Opcode); + + Out.emitInstruction(Inst, getSTI()); + return false; +} + +bool CSKYAsmParser::processJSRI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out) { + Inst.setLoc(IDLoc); + + if (Inst.getOperand(0).isImm()) { + const MCExpr *Expr = getTargetStreamer().addConstantPoolEntry( + 
MCConstantExpr::create(Inst.getOperand(0).getImm(), getContext()), + Inst.getLoc()); + Inst.setOpcode(CSKY::JSRI32); + Inst.erase(std::prev(Inst.end())); + Inst.addOperand(MCOperand::createExpr(Expr)); + } else { + const MCExpr *Expr = getTargetStreamer().addConstantPoolEntry( + Inst.getOperand(0).getExpr(), Inst.getLoc()); + Inst.setOpcode(CSKY::JBSR32); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + Out.emitInstruction(Inst, getSTI()); + return false; +} + +bool CSKYAsmParser::processJMPI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out) { + Inst.setLoc(IDLoc); + + if (Inst.getOperand(0).isImm()) { + const MCExpr *Expr = getTargetStreamer().addConstantPoolEntry( + MCConstantExpr::create(Inst.getOperand(0).getImm(), getContext()), + Inst.getLoc()); + Inst.setOpcode(CSKY::JMPI32); + Inst.erase(std::prev(Inst.end())); + Inst.addOperand(MCOperand::createExpr(Expr)); + } else { + const MCExpr *Expr = getTargetStreamer().addConstantPoolEntry( + Inst.getOperand(0).getExpr(), Inst.getLoc()); + Inst.setOpcode(CSKY::JBR32); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + Out.emitInstruction(Inst, getSTI()); + return false; +} + bool CSKYAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out) { @@ -845,6 +967,28 @@ bool CSKYAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, Inst.erase(std::next(Inst.begin())); Inst.insert(Inst.end(), MCOperand::createReg(CSKY::C)); break; + case CSKY::PseudoLRW16: + case CSKY::PseudoLRW32: + return processLRW(Inst, IDLoc, Out); + case CSKY::PseudoJSRI32: + return processJSRI(Inst, IDLoc, Out); + case CSKY::PseudoJMPI32: + return processJMPI(Inst, IDLoc, Out); + case CSKY::JBSR32: + case CSKY::JBR16: + case CSKY::JBT16: + case CSKY::JBF16: + case CSKY::JBR32: + case CSKY::JBT32: + case CSKY::JBF32: + unsigned Num = Inst.getNumOperands() - 1; + assert(Inst.getOperand(Num).isExpr()); + + const MCExpr *Expr = getTargetStreamer().addConstantPoolEntry( + Inst.getOperand(Num).getExpr(), Inst.getLoc()); + + Inst.addOperand(MCOperand::createExpr(Expr)); + break; } emitToStreamer(Out, Inst); @@ -1471,7 +1615,132 @@ OperandMatchResultTy CSKYAsmParser::tryParseRegister(unsigned &RegNo, return MatchOperand_Success; } -bool CSKYAsmParser::ParseDirective(AsmToken DirectiveID) { return true; } +bool CSKYAsmParser::ParseDirective(AsmToken DirectiveID) { + // This returns false if this function recognizes the directive + // regardless of whether it is successfully handles or reports an + // error. Otherwise it returns true to give the generic parser a + // chance at recognizing it. 
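// (Editorial clarification: this is the standard MCTargetAsmParser
// contract -- returning true does not signal an error, it signals "not my
// directive", after which AsmParser consults its own directive table.
// Reporting a diagnostic and returning false is how a recognized but
// malformed directive is consumed without being re-parsed.)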
+ StringRef IDVal = DirectiveID.getString(); + + if (IDVal == ".csky_attribute") + return parseDirectiveAttribute(); + + return true; +} + +/// parseDirectiveAttribute +/// ::= .attribute expression ',' ( expression | "string" ) +bool CSKYAsmParser::parseDirectiveAttribute() { + MCAsmParser &Parser = getParser(); + int64_t Tag; + SMLoc TagLoc; + TagLoc = Parser.getTok().getLoc(); + if (Parser.getTok().is(AsmToken::Identifier)) { + StringRef Name = Parser.getTok().getIdentifier(); + Optional Ret = + ELFAttrs::attrTypeFromString(Name, CSKYAttrs::getCSKYAttributeTags()); + if (!Ret.hasValue()) { + Error(TagLoc, "attribute name not recognised: " + Name); + return false; + } + Tag = Ret.getValue(); + Parser.Lex(); + } else { + const MCExpr *AttrExpr; + + TagLoc = Parser.getTok().getLoc(); + if (Parser.parseExpression(AttrExpr)) + return true; + + const MCConstantExpr *CE = dyn_cast(AttrExpr); + if (check(!CE, TagLoc, "expected numeric constant")) + return true; + + Tag = CE->getValue(); + } + + if (Parser.parseToken(AsmToken::Comma, "comma expected")) + return true; + + StringRef StringValue; + int64_t IntegerValue = 0; + bool IsIntegerValue = ((Tag != CSKYAttrs::CSKY_ARCH_NAME) && + (Tag != CSKYAttrs::CSKY_CPU_NAME) && + (Tag != CSKYAttrs::CSKY_FPU_NUMBER_MODULE)); + + SMLoc ValueExprLoc = Parser.getTok().getLoc(); + if (IsIntegerValue) { + const MCExpr *ValueExpr; + if (Parser.parseExpression(ValueExpr)) + return true; + + const MCConstantExpr *CE = dyn_cast(ValueExpr); + if (!CE) + return Error(ValueExprLoc, "expected numeric constant"); + IntegerValue = CE->getValue(); + } else { + if (Parser.getTok().isNot(AsmToken::String)) + return Error(Parser.getTok().getLoc(), "expected string constant"); + + StringValue = Parser.getTok().getStringContents(); + Parser.Lex(); + } + + if (Parser.parseEOL()) + return true; + + if (IsIntegerValue) + getTargetStreamer().emitAttribute(Tag, IntegerValue); + else if (Tag != CSKYAttrs::CSKY_ARCH_NAME && Tag != CSKYAttrs::CSKY_CPU_NAME) + getTargetStreamer().emitTextAttribute(Tag, StringValue); + else { + CSKY::ArchKind ID = (Tag == CSKYAttrs::CSKY_ARCH_NAME) + ? CSKY::parseArch(StringValue) + : CSKY::parseCPUArch(StringValue); + if (ID == CSKY::ArchKind::INVALID) + return Error(ValueExprLoc, (Tag == CSKYAttrs::CSKY_ARCH_NAME) + ? "unknown arch name" + : "unknown cpu name"); + + getTargetStreamer().emitTextAttribute(Tag, StringValue); + } + + return false; +} + +unsigned CSKYAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + CSKYOperand &Op = static_cast(AsmOp); + + if (!Op.isReg()) + return Match_InvalidOperand; + + MCRegister Reg = Op.getReg(); + + if (CSKYMCRegisterClasses[CSKY::FPR32RegClassID].contains(Reg)) { + // As the parser couldn't differentiate an FPR64 from an FPR32, coerce the + // register from FPR32 to FPR64 if necessary. 
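// (Editorial note: the coercion below is pure index arithmetic --
// convertFPR32ToFPR64() defined earlier computes
//   Reg - CSKY::F0_32 + CSKY::F0_64
// so F5_32 maps to F5_64, relying on TableGen laying each FPR class out
// contiguously. The range checks that follow then reject anything outside
// F0_64..F15_64 for sFPR64 and F0_64..F31_64 for FPR64.)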
+ if (Kind == MCK_FPR64 || Kind == MCK_sFPR64) { + Op.Reg.RegNum = convertFPR32ToFPR64(Reg); + if (Kind == MCK_sFPR64 && + (Op.Reg.RegNum < CSKY::F0_64 || Op.Reg.RegNum > CSKY::F15_64)) + return Match_InvalidRegOutOfRange; + if (Kind == MCK_FPR64 && + (Op.Reg.RegNum < CSKY::F0_64 || Op.Reg.RegNum > CSKY::F31_64)) + return Match_InvalidRegOutOfRange; + return Match_Success; + } + } + + if (CSKYMCRegisterClasses[CSKY::GPRRegClassID].contains(Reg)) { + if (Kind == MCK_GPRPair) { + Op.Reg.RegNum = MRI->getEncodingValue(Reg) + CSKY::R0_R1; + return Match_Success; + } + } + + return Match_InvalidOperand; +} void CSKYAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { MCInst CInst; diff --git a/llvm/lib/Target/CSKY/CSKY.h b/llvm/lib/Target/CSKY/CSKY.h index 401d6fa1a0a5..27a6c6d2f250 100644 --- a/llvm/lib/Target/CSKY/CSKY.h +++ b/llvm/lib/Target/CSKY/CSKY.h @@ -14,11 +14,13 @@ #ifndef LLVM_LIB_TARGET_CSKY_CSKY_H #define LLVM_LIB_TARGET_CSKY_CSKY_H +#include "llvm/PassRegistry.h" #include "llvm/Target/TargetMachine.h" namespace llvm { class CSKYTargetMachine; class FunctionPass; +class PassRegistry; FunctionPass *createCSKYISelDag(CSKYTargetMachine &TM); FunctionPass *createCSKYConstantIslandPass(); diff --git a/llvm/lib/Target/CSKY/CSKY.td b/llvm/lib/Target/CSKY/CSKY.td index ddb7fe93706e..a8db9151e127 100644 --- a/llvm/lib/Target/CSKY/CSKY.td +++ b/llvm/lib/Target/CSKY/CSKY.td @@ -32,6 +32,26 @@ def HasFPUv2_DF : Predicate<"Subtarget->hasFPUv2DoubleFloat()">, AssemblerPredicate<(all_of FeatureFPUV2_DF), "Enable FPUv2 double float instructions">; +def FeatureFdivdu : SubtargetFeature<"fdivdu", "HasFdivdu", "true", + "Enable float divide instructions">; +def HasFdivdu : Predicate<"Subtarget->hasFdivdu()">, + AssemblerPredicate<(all_of FeatureFdivdu), + "Enable float divide instructions">; + +def FeatureFPUV3_HI + : SubtargetFeature<"fpuv3_hi", "HasFPUv3HalfWord", "true", + "Enable FPUv3 harf word converting instructions">; +def HasFPUv3_HI : Predicate<"Subtarget->hasFPUv3HalfWord()">, + AssemblerPredicate<(all_of FeatureFPUV3_HI), + "Enable FPUv3 harf word converting instructions">; + +def FeatureFPUV3_HF + : SubtargetFeature<"fpuv3_hf", "HasFPUv3HalfFloat", "true", + "Enable FPUv3 harf precision operate instructions">; +def HasFPUv3_HF : Predicate<"Subtarget->hasFPUv3HalfFloat()">, + AssemblerPredicate<(all_of FeatureFPUV3_HF), + "Enable FPUv3 harf precision operate instructions">; + def FeatureFPUV3_SF : SubtargetFeature<"fpuv3_sf", "HasFPUv3SingleFloat", "true", "Enable FPUv3 single float instructions">; @@ -46,6 +66,85 @@ def HasFPUv3_DF : Predicate<"Subtarget->hasFPUv3DoubleFloat()">, AssemblerPredicate<(all_of FeatureFPUV3_DF), "Enable FPUv3 double float instructions">; +def HasFLOATE1 + : SubtargetFeature<"floate1", "HasFLOATE1", "true", "Support CSKY floate1 instructions">; +def iHasFLOATE1 : Predicate<"Subtarget->hasFLOATE1()">, + AssemblerPredicate<(all_of HasFLOATE1), + "Support CSKY floate1 instructions">; + +def HasFLOAT1E2 + : SubtargetFeature<"float1e2", "HasFLOAT1E2", "true", "Support CSKY float1e2 instructions">; +def iHasFLOAT1E2 : Predicate<"Subtarget->hasFLOAT1E2()">, + AssemblerPredicate<(all_of HasFLOAT1E2), + "Support CSKY float1e2 instructions">; + +def HasFLOAT1E3 + : SubtargetFeature<"float1e3", "HasFLOAT1E3", "true", "Support CSKY float1e3 instructions">; +def iHasFLOAT1E3 : Predicate<"Subtarget->hasFLOAT1E3()">, + AssemblerPredicate<(all_of HasFLOAT1E3), + "Support CSKY float1e3 instructions">; + +def HasFLOAT3E4 + : SubtargetFeature<"float3e4", 
"HasFLOAT3E4", "true", "Support CSKY float3e4 instructions">; +def iHasFLOAT3E4 : Predicate<"Subtarget->hasFLOAT3E4()">, + AssemblerPredicate<(all_of HasFLOAT3E4), + "Support CSKY float3e4 instructions">; + +def HasFLOAT7E60 + : SubtargetFeature<"float7e60", "HasFLOAT7E60", "true", "Support CSKY float7e60 instructions">; +def iHasFLOAT7E60 : Predicate<"Subtarget->hasFLOAT7E60()">, + AssemblerPredicate<(all_of HasFLOAT7E60), + "Support CSKY float7e60 instructions">; + +def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", + "Enable divide instrutions">; +def HasHWDiv : Predicate<"Subtarget->hasHardwareDivide()">, + AssemblerPredicate<(all_of FeatureHWDiv), + "Enable divide instrutions">; + +def FeatureSTM : SubtargetFeature<"multiple_stld", "HasSTM", "true", + "Enable multiple load/store instrutions">; +def HasSTM : Predicate<"Subtarget->hasSTM()">, + AssemblerPredicate<(all_of FeatureSTM), + "Enable multiple load/store instrutions">; + +def FeaturePushPop : SubtargetFeature<"pushpop", "HasPushPop", "true", + "Enable push/pop instrutions">; +def HasPushPop : Predicate<"Subtarget->hasPushPop()">, + AssemblerPredicate<(all_of FeaturePushPop), + "Enable push/pop instrutions">; + +def FeatureDSP + : SubtargetFeature<"edsp", "HasDSP", "true", "Enable DSP instrutions">; +def HasDSP : Predicate<"Subtarget->hasDSP()">, + AssemblerPredicate<(all_of FeatureDSP), + "Enable DSP instrutions">; + +def HasDSP1E2 + : SubtargetFeature<"dsp1e2", "HasDSP1E2", "true", "Support CSKY dsp1e2 instructions">; +def iHasDSP1E2 : Predicate<"Subtarget->hasDSP1E2()">, + AssemblerPredicate<(all_of HasDSP1E2), + "Support CSKY dsp1e2 instructions">; + +def HasDSPE60 + : SubtargetFeature<"dspe60", "HasDSPE60", "true", "Support CSKY dspe60 instructions">; +def iHasDSPE60 : Predicate<"Subtarget->hasDSPE60()">, + AssemblerPredicate<(all_of HasDSPE60), + "Support CSKY dspe60 instructions">; + +def FeatureDSPV2 : SubtargetFeature<"dspv2", "HasDSPV2", "true", + "Enable DSP V2.0 instrutions">; +def HasDSPV2 : Predicate<"Subtarget->hasDSPV2()">, + AssemblerPredicate<(all_of FeatureDSPV2), + "Enable DSP V2.0 instrutions">; + +def FeatureDSP_Silan : SubtargetFeature<"dsp_silan", "HasDSP_Silan", "true", + "Enable DSP Silan instrutions">; +def HasDSP_Silan : Predicate<"Subtarget->hasDSP_Silan()">, + AssemblerPredicate<(all_of FeatureDSP_Silan), + "Enable DSP Silan instrutions">; + +// Atomic Support def FeatureBTST16 : SubtargetFeature<"btst16", "HasBTST16", "true", "Use the 16-bit btsti instruction">; def HasBTST16 : Predicate<"Subtarget->hasBTST16()">, @@ -59,18 +158,110 @@ def HasExtendLrw : Predicate<"Subtarget->hasExtendLrw()">, AssemblerPredicate<(all_of FeatureExtendLrw), "Use the extend LRW instruction">; +def FeatureTrust : SubtargetFeature<"trust", "HasTrust", "true", + "Enable trust instructions">; +def HasTrust : Predicate<"Subtarget->hasTrust()">, + AssemblerPredicate<(all_of FeatureTrust), + "Enable trust instructions">; + def FeatureJAVA : SubtargetFeature<"java", "HasJAVA", "true", "Enable java instructions">; def HasJAVA : Predicate<"Subtarget->hasJAVA()">, AssemblerPredicate<(all_of FeatureJAVA), "Enable java instructions">; +def FeatureCache + : SubtargetFeature<"cache", "HasCache", "true", "Enable cache">; +def HasCache : Predicate<"Subtarget->hasCache()">, + AssemblerPredicate<(all_of FeatureCache), + "Enable cache">; + +def FeatureNVIC + : SubtargetFeature<"nvic", "HasNVIC", "true", "Enable NVIC">; +def HasNVIC : Predicate<"Subtarget->hasNVIC()">, + AssemblerPredicate<(all_of FeatureNVIC), + 
"Enable NVIC">; + def FeatureDoloop : SubtargetFeature<"doloop", "HasDoloop", "true", "Enable doloop instructions">; def HasDoloop : Predicate<"Subtarget->hasDoloop()">, AssemblerPredicate<(all_of FeatureDoloop), "Enable doloop instructions">; +// Other features than instructions +def FeatureHighreg : SubtargetFeature<"high-registers", "HasHighRegisters", + "true", "Enable r16-r31 registers">; +def HasHighRegisters : Predicate<"Subtarget->hasHighRegisters()">, + AssemblerPredicate<(all_of FeatureHighreg), + "Enable r16-r31 registers">; + +def FeatureSmart : SubtargetFeature<"smart", "SmartMode", "true", + "Let CPU work in Smart Mode">; +def SmartMode : Predicate<"Subtarget->smartMode()">, + AssemblerPredicate<(all_of FeatureSmart), + "Let CPU work in Smart Mode">; + +def FeatureVDSPV2 : SubtargetFeature<"vdspv2", "HasVDSPV2", "true", + "Enable vdsp-v2 instructions">; +def HasVDSPV2 : Predicate<"Subtarget->hasVDSPV2()">, + AssemblerPredicate<(all_of FeatureVDSPV2), + "Enable vdsp-v2 instructions">; + +def HasVDSPV2_FLOAT : Predicate<"Subtarget->hasVDSPV2_FLOAT()">; +def HasVDSPV2_HALF: Predicate<"Subtarget->hasVDSPV2_HALF()">; + +def HasVDSP2E3 + : SubtargetFeature<"vdsp2e3", "HasVDSP2E3", "true", "Support CSKY vdsp2e3 instructions">; +def iHasVDSP2E3 : Predicate<"Subtarget->hasVDSP2E3()">, + AssemblerPredicate<(all_of HasVDSP2E3), + "Support CSKY vdsp2e3 instructions">; + +def HasVDSP2E60F + : SubtargetFeature<"vdsp2e60f", "HasVDSP2E60F", "true", "Support CSKY vdsp2e60f instructions">; +def iHasVDSP2E60F : Predicate<"Subtarget->hasVDSP2E60F()">, + AssemblerPredicate<(all_of HasVDSP2E60F), + "Support CSKY vdsp2e60f instructions">; + +def FeatureHardTP : SubtargetFeature<"hard-tp", "ReadTPHard", "true", + "Enable TLS Pointer register">; +def ReadTPHard : Predicate<"Subtarget->readTPHard()">, + AssemblerPredicate<(all_of FeatureHardTP), + "Enable TLS Pointer register">; + +def FeatureSoftTP : SubtargetFeature<"soft-tp", "ReadTPHard", "false", + "Disable TLS Pointer register">; + +def FeatureIstack : SubtargetFeature<"istack", "EnableInterruptAttribute", + "true", "Enable interrput attribute">; +def EnableInterruptAttribute + : Predicate<"Subtarget->enableInterruptAttribute()">, + AssemblerPredicate<(all_of FeatureIstack), + "Enable interrput attribute">; + +def FeatureConstPool : SubtargetFeature<"constpool", "DumpConstPool", "true", + "Dump the constant pool by compiler">; +def DumpConstPool : Predicate<"Subtarget->dumpConstPool()">, + AssemblerPredicate<(all_of FeatureConstPool), + "Dump the constant pool by compiler">; + +def FeatureStackSize : SubtargetFeature<"stack-size", "EnableStackSize", "true", + "Output stack size information">; +def EnableStackSize : Predicate<"Subtarget->enableStackSize()">, + AssemblerPredicate<(all_of FeatureStackSize), + "Output stack size information">; + +def FeatureCCRT + : SubtargetFeature<"ccrt", "UseCCRT", "true", "Use CSKY compiler runtime">; +def UseCCRT : Predicate<"Subtarget->useCCRT()">, + AssemblerPredicate<(all_of FeatureCCRT), + "Use CSKY compiler runtime">; + +def FeatureVDSPV1_128 : SubtargetFeature<"vdspv1", "HasVDSPV1_128", "true", + "Enable 128bit vdsp-v1 instructions">; +def HasVDSPV1_128 : Predicate<"Subtarget->hasVDSPV1_128()">, + AssemblerPredicate<(all_of FeatureVDSPV1_128), + "Enable 128bit vdsp-v1 instructions">; + def HasE1 : SubtargetFeature<"e1", "HasE1", "true", "Support CSKY e1 instructions", [FeatureExtendLrw]>; @@ -91,12 +282,25 @@ def iHas2E3 : Predicate<"Subtarget->has2E3()">, AssemblerPredicate<(all_of Has2E3), "Support CSKY 
2e3 instructions">; +def HasMP : SubtargetFeature<"mp", "HasMP", "true", + "Support CSKY mp instructions", [Has2E3]>; +def iHasMP : Predicate<"Subtarget->hasMP()">, + AssemblerPredicate<(all_of HasMP), + "Support CSKY mp instructions">; + def Has3E3r1 : SubtargetFeature<"3e3r1", "Has3E3r1", "true", "Support CSKY 3e3r1 instructions">; def iHas3E3r1 : Predicate<"Subtarget->has3E3r1()">, AssemblerPredicate<(all_of Has3E3r1), "Support CSKY 3e3r1 instructions">; +def Has3r1E3r2 : SubtargetFeature<"3e3r2", "Has3r1E3r2", "true", + "Support CSKY 3e3r2 instructions", + [Has3E3r1, FeatureDoloop]>; +def iHas3r1E3r2 : Predicate<"Subtarget->has3r1E3r2()">, + AssemblerPredicate<(all_of Has3r1E3r2), + "Support CSKY 3e3r2 instructions">; + def Has3r2E3r3 : SubtargetFeature<"3e3r3", "Has3r2E3r3", "true", "Support CSKY 3e3r3 instructions", [FeatureDoloop]>; @@ -128,6 +332,35 @@ def iHas10E60 : Predicate<"Subtarget->has10E60()">, AssemblerPredicate<(all_of Has10E60), "Support CSKY 10e60 instructions">; +//===----------------------------------------------------------------------===// +// CSKY Processor subtarget features. +//===----------------------------------------------------------------------===// + +def ProcCK801 : SubtargetFeature<"ck801", "CSKYProcFamily", "CK801", + "CSKY ck801 processors", []>; +def isCK801 : Predicate<"Subtarget->isCK801()">, + AssemblerPredicate<(all_of ProcCK801)>; +def ProcCK802 : SubtargetFeature<"ck802", "CSKYProcFamily", "CK802", + "CSKY ck802 processors", []>; +def ProcCK803 : SubtargetFeature<"ck803", "CSKYProcFamily", "CK803", + "CSKY ck803 processors", []>; +def ProcCK803S : SubtargetFeature<"ck803s", "CSKYProcFamily", "CK803S", + "CSKY ck803s processors", []>; +def ProcCK804 : SubtargetFeature<"ck804", "CSKYProcFamily", "CK804", + "CSKY ck804 processors", []>; +def ProcCK805 : SubtargetFeature<"ck805", "CSKYProcFamily", "CK805", + "CSKY ck805 processors", []>; +def ProcCK807 : SubtargetFeature<"ck807", "CSKYProcFamily", "CK807", + "CSKY ck807 processors", []>; +def ProcCK810 : SubtargetFeature<"ck810", "CSKYProcFamily", "CK810", + "CSKY ck810 processors", []>; +def ProcCK810V : SubtargetFeature<"ck810v", "CSKYProcFamily", "CK810V", + "CSKY ck810v processors", []>; +def ProcCK860 : SubtargetFeature<"ck860", "CSKYProcFamily", "CK860", + "CSKY ck860 processors", []>; +def ProcCK860V : SubtargetFeature<"ck860v", "CSKYProcFamily", "CK860V", + "CSKY ck860v processors", []>; + //===----------------------------------------------------------------------===// // Registers, calling conventions, instruction descriptions. 
//===----------------------------------------------------------------------===// @@ -142,6 +375,296 @@ include "CSKYInstrInfo.td" def : ProcessorModel<"generic", NoSchedModel, []>; +// CK801 series +class CK801 f, + list tunef = []> + : ProcessorModel; + +def : CK801<"ck801", NoSchedModel, []>; +def : CK801<"ck801t", NoSchedModel, []>; +def : CK801<"e801", NoSchedModel, []>; + +// CK802 series +class CK802 f, + list tunef = []> + : ProcessorModel; + +def : CK802<"ck802", NoSchedModel, []>; +def : CK802<"ck802t", NoSchedModel, []>; +def : CK802<"ck802j", NoSchedModel, [FeatureJAVA]>; +def : CK802<"e802", NoSchedModel, []>; +def : CK802<"e802t", NoSchedModel, []>; +def : CK802<"s802", NoSchedModel, []>; +def : CK802<"s802t", NoSchedModel, []>; + +// CK803 series +class CK803 f, + list tunef = []> + : ProcessorModel; + +def : CK803<"ck803", NoSchedModel, []>; +def : CK803<"ck803h", NoSchedModel, []>; +def : CK803<"ck803t", NoSchedModel, []>; +def : CK803<"ck803ht", NoSchedModel, []>; +def : CK803<"ck803f", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803fh", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803e", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK803<"ck803eh", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK803<"ck803et", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK803<"ck803eht", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK803<"ck803ef", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803efh", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803ft", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803eft", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803efht", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803r1", NoSchedModel, [Has3E3r1, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803hr1", NoSchedModel, [Has3E3r1, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803tr1", NoSchedModel, [Has3E3r1, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803htr1", NoSchedModel, [Has3E3r1, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803fr1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureDSPV2]>; +def : CK803<"ck803fhr1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureDSPV2]>; +def : CK803<"ck803er1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803etr1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehr1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehtr1", NoSchedModel, + [Has3E3r1, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803efr1", NoSchedModel, + [Has3E3r1, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhr1", NoSchedModel, + [Has3E3r1, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803ftr1", NoSchedModel, [Has3E3r1, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureDSPV2]>; +def : 
CK803<"ck803eftr1", NoSchedModel, + [Has3E3r1, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhtr1", NoSchedModel, + [Has3E3r1, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803r2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803hr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803tr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803htr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803fr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803fhr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803er2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803etr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehtr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803efr2", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhr2", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803ftr2", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803eftr2", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhtr2", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803r3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803hr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803tr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803htr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2]>; +def : CK803<"ck803fr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803fhr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803er3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803etr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803ehtr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureHighreg]>; +def : CK803<"ck803efr3", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhr3", NoSchedModel, + [Has3r1E3r2, 
Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803ftr3", NoSchedModel, [Has3r1E3r2, Has3r2E3r3, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803<"ck803eftr3", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"ck803efhtr3", NoSchedModel, + [Has3r1E3r2, Has3r2E3r3, FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK803<"s803", NoSchedModel, [Has3r1E3r2, Has3r2E3r3]>; +def : CK803<"s803t", NoSchedModel, [Has3r1E3r2, Has3r2E3r3]>; +def : CK803<"e803", NoSchedModel, [Has3r1E3r2, Has3r2E3r3]>; +def : CK803<"e803t", NoSchedModel, [Has3r1E3r2, Has3r2E3r3]>; + +// CK803S series +class CK803S f, +list tunef = []> : CK803; + +def : CK803S<"ck803s", NoSchedModel, []>; +def : CK803S<"ck803sn", NoSchedModel, [FeatureDSP_Silan]>; +def : CK803S<"ck803st", NoSchedModel, []>; +def : CK803S<"ck803snt", NoSchedModel, [FeatureDSP_Silan]>; +def : CK803S<"ck803sf", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803S<"ck803sfn", NoSchedModel, [FeatureFPUV2_SF, FeatureDSP_Silan, HasFLOATE1, HasFLOAT1E3]>; +def : CK803S<"ck803se", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK803S<"ck803sen", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureDSP_Silan]>; +def : CK803S<"ck803sef", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803S<"ck803sefn", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, FeatureDSP_Silan, + HasFLOATE1, HasFLOAT1E3]>; +def : CK803S<"ck803seft", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK803S<"ck803sefnt", NoSchedModel, + [FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, FeatureDSP_Silan, + HasFLOATE1, HasFLOAT1E3]>; + +// CK804 series +class CK804 f, + list tunef = []> + : CK803; + +def : CK804<"ck804", NoSchedModel, []>; +def : CK804<"ck804h", NoSchedModel, []>; +def : CK804<"ck804t", NoSchedModel, []>; +def : CK804<"ck804ht", NoSchedModel, []>; +def : CK804<"ck804f", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK804<"ck804fh", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK804<"ck804e", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"ck804et", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"ck804eh", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"ck804eht", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"ck804ef", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK804<"ck804efh", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK804<"ck804ft", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK804<"ck804eft", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK804<"ck804efht", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK804<"e804d", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"e804dt", NoSchedModel, [FeatureDSPV2, FeatureHighreg]>; +def : CK804<"e804f", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK804<"e804ft", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK804<"e804df", 
NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; +def : CK804<"e804dft", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3, FeatureHighreg]>; + +// CK805 series +class CK805 f, + list tunef = []> + : CK803; + +def : CK805<"ck805", NoSchedModel, []>; +def : CK805<"i805", NoSchedModel, []>; +def : CK805<"ck805t", NoSchedModel, []>; +def : CK805<"i805f", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK805<"ck805f", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK805<"ck805e", NoSchedModel, [FeatureDSPV2]>; +def : CK805<"ck805ef", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK805<"ck805et", NoSchedModel, [FeatureDSPV2]>; +def : CK805<"ck805ft", NoSchedModel, [FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; +def : CK805<"ck805eft", NoSchedModel, [FeatureDSPV2, FeatureFPUV2_SF, HasFLOATE1, HasFLOAT1E3]>; + +// CK807 series +class CK807 f, + list tunef = []> + : ProcessorModel; + +def : CK807<"ck807", NoSchedModel, []>; +def : CK807<"c807", NoSchedModel, []>; +def : CK807<"r807", NoSchedModel, []>; +def : CK807<"ck807e", NoSchedModel, [FeatureDSP, HasDSP1E2, HasDSPE60]>; +def : CK807<"ck807f", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2, HasFLOAT1E3, HasFLOAT3E4]>; +def : CK807<"c807f", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2, HasFLOAT1E3, HasFLOAT3E4]>; +def : CK807<"r807f", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2, HasFLOAT1E3, HasFLOAT3E4]>; +def : CK807<"ck807ef", NoSchedModel, [ + FeatureDSP, HasDSP1E2, HasDSPE60, FeatureFPUV2_SF, FeatureFPUV2_DF, + FeatureFdivdu, HasFLOATE1, HasFLOAT1E2, HasFLOAT1E3, HasFLOAT3E4]>; + +// CK810 series +class CK810 f, + list tunef = []> + : ProcessorModel; + +def : CK810<"ck810", NoSchedModel, []>; +def : CK810<"ck810e", NoSchedModel, []>; +def : CK810<"ck810t", NoSchedModel, []>; +def : CK810<"ck810et", NoSchedModel, []>; +def : CK810<"c810", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; +def : CK810<"ck810f", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; +def : CK810<"ck810ef", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; +def : CK810<"ck810ft", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; +def : CK810<"ck810eft", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; +def : CK810<"c810t", NoSchedModel, + [FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2]>; + +class CK810V f, + list tunef = []> + : CK810; + +def : CK810V<"ck810v", NoSchedModel, []>; +def : CK810V<"ck810ev", NoSchedModel, []>; +def : CK810V<"ck810tv", NoSchedModel, []>; +def : CK810V<"ck810etv", NoSchedModel, []>; +def : CK810V<"ck810fv", NoSchedModel, [ + FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2 +]>; +def : CK810V<"ck810efv", NoSchedModel, [ + FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2 +]>; +def : CK810V<"c810v", NoSchedModel, [ + FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2 +]>; +def : CK810V<"ck810ftv", NoSchedModel, [ + FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu, + HasFLOATE1, HasFLOAT1E2 +]>; +def : CK810V<"ck810eftv", NoSchedModel, [ + 
FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu,
+  HasFLOATE1, HasFLOAT1E2
+]>;
+def : CK810V<"c810tv", NoSchedModel, [
+  FeatureFPUV2_SF, FeatureFPUV2_DF, FeatureFdivdu,
+  HasFLOATE1, HasFLOAT1E2
+]>;
+
+// CK860 series
+class CK860 f,
+            list tunef = []>
+    : ProcessorModel;
+
+class CK860V f,
+             list tunef = []>
+    : CK860;
+
+def : CK860<"ck860", NoSchedModel, []>;
+def : CK860<"ck860f", NoSchedModel,
+            [FeatureFPUV3_HI, FeatureFPUV3_HF, FeatureFPUV3_SF, FeatureFPUV3_DF, HasFLOAT7E60]>;
+def : CK860<"c860", NoSchedModel,
+            [FeatureFPUV3_HI, FeatureFPUV3_HF, FeatureFPUV3_SF, FeatureFPUV3_DF, HasFLOAT7E60]>;
+def : CK860V<"c860v", NoSchedModel,
+             [FeatureFPUV3_HI, FeatureFPUV3_HF, FeatureFPUV3_SF, FeatureFPUV3_DF, HasFLOAT7E60]>;
+def : CK860V<"ck860v", NoSchedModel, []>;
+def : CK860V<"ck860fv", NoSchedModel,
+             [FeatureFPUV3_HI, FeatureFPUV3_HF, FeatureFPUV3_SF, FeatureFPUV3_DF, HasFLOAT7E60]>;
+
 //===----------------------------------------------------------------------===//
 // Define the CSKY target.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
index c8269eeacfdb..0236b22ad379 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -16,10 +16,12 @@
 #include "CSKYTargetMachine.h"
 #include "MCTargetDesc/CSKYInstPrinter.h"
 #include "MCTargetDesc/CSKYMCExpr.h"
+#include "MCTargetDesc/CSKYTargetStreamer.h"
 #include "TargetInfo/CSKYTargetInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -40,7 +42,15 @@ CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM,
 bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   MCP = MF.getConstantPool();
-  Subtarget = &MF.getSubtarget<CSKYSubtarget>();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  // Set the current MCSubtargetInfo to a copy which has the correct
+  // feature bits for the current MachineFunction.
+  MCSubtargetInfo &NewSTI =
+      OutStreamer->getContext().getSubtargetCopy(*TM.getMCSubtargetInfo());
+  NewSTI.setFeatureBits(MF.getSubtarget().getFeatureBits());
+  Subtarget = &NewSTI;
+
   return AsmPrinter::runOnMachineFunction(MF);
 }
@@ -59,8 +69,6 @@ void CSKYAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
 #include "CSKYGenMCPseudoLowering.inc"
 void CSKYAsmPrinter::expandTLSLA(const MachineInstr *MI) {
-  const CSKYInstrInfo *TII = Subtarget->getInstrInfo();
-
   DebugLoc DL = MI->getDebugLoc();
   MCSymbol *PCLabel = OutContext.getOrCreateSymbol(
@@ -119,6 +127,19 @@ void CSKYAsmPrinter::emitFunctionBodyEnd() {
   InConstantPool = false;
 }
+void CSKYAsmPrinter::emitStartOfAsmFile(Module &M) {
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    emitAttributes();
+}
+
+void CSKYAsmPrinter::emitEndOfAsmFile(Module &M) {
+  CSKYTargetStreamer &CTS =
+      static_cast<CSKYTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    CTS.finishAttributeSection();
+}
+
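The per-function MCSubtargetInfo copy in runOnMachineFunction above matters because one module can mix functions built with different target features; streaming against the module-level STI would use stale feature bits. A minimal standalone sketch of the idiom (the *Demo types are invented for illustration and are not LLVM API):

    #include <cstdint>

    struct MCSubtargetInfoDemo {
      uint64_t FeatureBits = 0;
      void setFeatureBits(uint64_t B) { FeatureBits = B; }
    };

    struct MachineFunctionDemo {
      uint64_t FeatureBits; // per-function features, e.g. from attributes
    };

    struct AsmPrinterDemo {
      MCSubtargetInfoDemo ModuleSTI;      // module-level default
      MCSubtargetInfoDemo PerFunctionSTI; // scratch copy, re-seated per MF
      const MCSubtargetInfoDemo *Subtarget = nullptr;

      void runOnMachineFunction(const MachineFunctionDemo &MF) {
        PerFunctionSTI = ModuleSTI;                    // copy the defaults
        PerFunctionSTI.setFeatureBits(MF.FeatureBits); // overwrite features
        Subtarget = &PerFunctionSTI;                   // emit against the copy
      }
    };

    int main() {
      AsmPrinterDemo AP;
      MachineFunctionDemo F{0x5}; // pretend feature bits
      AP.runOnMachineFunction(F);
      return AP.Subtarget->FeatureBits == 0x5 ? 0 : 1;
    }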
void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
  // Do any auto-generated pseudo lowerings.
  if (emitPseudoExpansionLowering(*OutStreamer, MI))
@@ -218,6 +239,84 @@ void CSKYAsmPrinter::emitMachineConstantPoolValue(
   OutStreamer->emitValue(Expr, Size);
 }
+void CSKYAsmPrinter::emitAttributes() {
+  CSKYTargetStreamer &CTS =
+      static_cast<CSKYTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+  const Triple &TT = TM.getTargetTriple();
+  StringRef CPU = TM.getTargetCPU();
+  StringRef FS = TM.getTargetFeatureString();
+  const CSKYTargetMachine &CTM = static_cast<const CSKYTargetMachine &>(TM);
+  /* TuneCPU doesn't impact emission of ELF attributes; ELF attributes only
+     care about arch-related features, so we can set TuneCPU as CPU. */
+  const CSKYSubtarget STI(TT, CPU, /*TuneCPU=*/CPU, FS, CTM);
+
+  CTS.emitTargetAttributes(STI);
+}
+
+bool CSKYAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                     const char *ExtraCode, raw_ostream &OS) {
+  // First try the generic code, which knows about modifiers like 'c' and 'n'.
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS))
+    return false;
+
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      return true; // Unknown modifier.
+    case 'R':
+      if (MO.getType() == MachineOperand::MO_Register) {
+        OS << CSKYInstPrinter::getRegisterName(MO.getReg() + 1);
+        return false;
+      }
+    }
+  }
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate:
+    OS << MO.getImm();
+    return false;
+  case MachineOperand::MO_Register:
+    if (MO.getReg() == CSKY::C)
+      return false;
+    OS << CSKYInstPrinter::getRegisterName(MO.getReg());
+    return false;
+  case MachineOperand::MO_GlobalAddress:
+    PrintSymbolOperand(MO, OS);
+    return false;
+  case MachineOperand::MO_BlockAddress: {
+    MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
+    Sym->print(OS, MAI);
+    return false;
+  }
+  default:
+    break;
+  }
+
+  return true;
+}
+
+bool CSKYAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                           unsigned OpNo, const char *ExtraCode,
+                                           raw_ostream &OS) {
+  if (!ExtraCode) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    // For now, we only support memory operands that are plain registers and
+    // assume there is no addend.
+    if (!MO.isReg())
+      return true;
+
+    OS << "(" << CSKYInstPrinter::getRegisterName(MO.getReg()) << ", 0)";
+    return false;
+  }
+
+  return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS);
+}
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmPrinter() {
   RegisterAsmPrinter<CSKYAsmPrinter> X(getTheCSKYTarget());
 }
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
index 04a253d349c8..5e87594e4fdf 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
@@ -18,7 +18,8 @@ namespace llvm {
 class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter {
   CSKYMCInstLower MCInstLowering;
-  const CSKYSubtarget *Subtarget;
+  const MCSubtargetInfo *Subtarget;
+  const TargetInstrInfo *TII;
   bool InConstantPool = false;
@@ -28,6 +29,7 @@ class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter {
   void expandTLSLA(const MachineInstr *MI);
   void emitCustomConstantPool(const MachineInstr *MI);
+  void emitAttributes();
 public:
   explicit CSKYAsmPrinter(TargetMachine &TM,
@@ -46,12 +48,22 @@ public:
   void emitFunctionBodyEnd() override;
+  void emitStartOfAsmFile(Module &M) override;
+
+  void emitEndOfAsmFile(Module &M) override;
+
   void emitInstruction(const MachineInstr *MI) override;
   bool runOnMachineFunction(MachineFunction &MF) override;
   // we emit constant pools ourselves!
  void emitConstantPool() override{};
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       const char *ExtraCode, raw_ostream &OS) override;
+
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             const char *ExtraCode, raw_ostream &OS) override;
 };
 } // end namespace llvm
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
index 3ac335e2ad9d..5d7241258543 100644
--- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -286,7 +287,7 @@ LLVM_DUMP_METHOD void CSKYConstantIslands::dumpBBs() {
 bool CSKYConstantIslands::runOnMachineFunction(MachineFunction &Mf) {
   MF = &Mf;
   MCP = Mf.getConstantPool();
-  STI = &static_cast<const CSKYSubtarget &>(Mf.getSubtarget());
+  STI = &Mf.getSubtarget<CSKYSubtarget>();
   LLVM_DEBUG(dbgs() << "***** CSKYConstantIslands: "
                     << MCP->getConstants().size() << " CP entries, aligned to "
@@ -904,8 +905,7 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
     Scale = 2;
     break;
   default:
-    assert(0);
-    break;
+    llvm_unreachable("");
   }
   unsigned MaxOffs = ((1 << (Bits - 1)) - 1) * Scale;
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
index 3bf001c2cee7..9907f39b3f90 100644
--- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
@@ -13,6 +13,7 @@
 #include "CSKYFrameLowering.h"
 #include "CSKYMachineFunctionInfo.h"
 #include "CSKYSubtarget.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -270,6 +271,17 @@ void CSKYFrameLowering::emitEpilogue(MachineFunction &MF,
                 MachineInstr::FrameDestroy);
 }
+static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
+                                            const CSKYInstrInfo &TII) {
+  unsigned FnSize = 0;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB)
+      FnSize += TII.getInstSizeInBytes(MI);
+  }
+  FnSize += MF.getConstantPool()->getConstants().size() * 4;
+  return FnSize;
+}
+
 static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
                                          const CSKYSubtarget &STI) {
   unsigned Limit = (1 << 12) - 1;
@@ -349,6 +361,7 @@ void CSKYFrameLowering::determineCalleeSaves(MachineFunction &MF,
   CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const CSKYInstrInfo *TII = STI.getInstrInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -411,8 +424,6 @@ void CSKYFrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
   }
-  CFI->setLRIsSpilled(SavedRegs.test(CSKY::R15));
-
   unsigned CSStackSize = 0;
   for (unsigned Reg : SavedRegs.set_bits()) {
     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
@@ -432,6 +443,14 @@
     RS->addScavengingFrameIndex(MFI.CreateStackObject(size, align, false));
   }
+
+  unsigned FnSize = EstimateFunctionSizeInBytes(MF, *TII);
+  // Force R15 to be spilled if the estimated function size is >= 65536 bytes.
+  // This enables use of BSR to implement far jump.
+  if (FnSize >= ((1 << (16 - 1)) * 2))
+    SavedRegs.set(CSKY::R15);
+
+  CFI->setLRIsSpilled(SavedRegs.test(CSKY::R15));
 }
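For reference, the numbers behind the new spill heuristic: EstimateFunctionSizeInBytes sums getInstSizeInBytes over every instruction and adds 4 bytes per constant-pool entry, and the spill kicks in at (1 << (16 - 1)) * 2 = 65536 bytes, which suggests a 16-bit signed displacement counted in 2-byte units (an inference from the arithmetic, not checked against the CSKY encoding manual). A standalone sketch of the same computation:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<unsigned> InstSizes = {4, 2, 4, 4}; // bytes per instruction
      unsigned NumConstPoolEntries = 3;

      unsigned FnSize = 0;
      for (unsigned S : InstSizes)
        FnSize += S;                     // sum of getInstSizeInBytes(MI)
      FnSize += NumConstPoolEntries * 4; // each pool entry counted as 4 bytes

      const unsigned Limit = (1u << 15) * 2; // 65536
      std::printf("FnSize=%u, spill R15 for far jump: %s\n", FnSize,
                  FnSize >= Limit ? "yes" : "no");
    }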
 // Do not preserve stack space within prologue for outgoing variables when the
diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
index d58f9095aa0d..b893487f1f0f 100644
--- a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
 #include "CSKYSubtarget.h"
 #include "CSKYTargetMachine.h"
 #include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
@@ -42,6 +43,13 @@ public:
   void Select(SDNode *N) override;
   bool selectAddCarry(SDNode *N);
   bool selectSubCarry(SDNode *N);
+  bool selectBITCAST_TO_LOHI(SDNode *N);
+  bool selectInlineAsm(SDNode *N);
+
+  SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
 #include "CSKYGenDAGISel.inc"
 };
@@ -86,6 +94,13 @@ void CSKYDAGToDAGISel::Select(SDNode *N) {
     IsSelected = true;
     break;
   }
+  case CSKYISD::BITCAST_TO_LOHI:
+    IsSelected = selectBITCAST_TO_LOHI(N);
+    break;
+  case ISD::INLINEASM:
+  case ISD::INLINEASM_BR:
+    IsSelected = selectInlineAsm(N);
+    break;
   }
   if (IsSelected)
@@ -95,6 +110,185 @@
   SelectCode(N);
 }
+bool CSKYDAGToDAGISel::selectInlineAsm(SDNode *N) {
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  // Normally, i64 data is bound to two arbitrary GPRs for the "r" constraint.
+  // However, some instructions (e.g. mula.s32) require a GPR pair.
+  // Since there is no constraint to explicitly specify a
+  // reg pair, we use the GPRPair reg class for "r" on 64-bit data.
+
+  SDLoc dl(N);
+  SDValue Glue =
+      N->getGluedNode() ? N->getOperand(NumOps - 1) : SDValue(nullptr, 0);
+
+  SmallVector<bool, 8> OpChanged;
+  // The glue node will be appended last.
+  for (unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e;
+       ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    } else
+      continue;
+
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
+    // Memory operands to inline asm in the SelectionDAG are modeled with two
+    // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+    // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+    // it doesn't get misinterpreted), and continue.
We do this here because + // it's important to update the OpChanged array correctly before moving on. + if (Kind == InlineAsm::Kind_Mem) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef && + Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if ((!IsTiedToChangedOp && (!HasRC || RC != CSKY::GPRRegClassID)) || + NumRegs != 2) + continue; + + assert((i + 2 < NumOps) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i + 1); + SDValue V1 = N->getOperand(i + 2); + unsigned Reg0 = cast(V0)->getReg(); + unsigned Reg1 = cast(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + Register GPVR = MRI.createVirtualRegister(&CSKY::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::i64); + SDValue Chain = SDValue(N, 0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = + CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::i64, Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = + CurDAG->getTargetExtractSubreg(CSKY::sub32_0, dl, MVT::i32, RegCopy); + SDValue Sub1 = + CurDAG->getTargetExtractSubreg(CSKY::sub32_32, dl, MVT::i32, RegCopy); + SDValue T0 = + CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector Ops(GU->op_begin(), GU->op_end() - 1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, Ops); + } else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. + SDValue T0 = + CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, Chain.getValue(1)); + SDValue T1 = + CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, T0.getValue(1)); + SDValue Pair = SDValue(createGPRPairNode(MVT::i64, T0, T1), 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + Register GPVR = MRI.createVirtualRegister(&CSKY::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::i64); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if (PairedReg.getNode()) { + OpChanged[OpChanged.size() - 1] = true; + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, CSKY::GPRPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() - 1] = + CurDAG->getTargetConstant(Flag, dl, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. 
+ i += 2; + } + } + + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); + if (!Changed) + return false; + + SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N), + CurDAG->getVTList(MVT::Other, MVT::Glue), + AsmNodeOperands); + New->setNodeId(-1); + ReplaceNode(N, New.getNode()); + return true; +} + +bool CSKYDAGToDAGISel::selectBITCAST_TO_LOHI(SDNode *N) { + SDLoc Dl(N); + auto VT = N->getValueType(0); + auto V = N->getOperand(0); + + if (!Subtarget->hasFPUv2DoubleFloat()) + return false; + + SDValue V1 = SDValue(CurDAG->getMachineNode(CSKY::FMFVRL_D, Dl, VT, V), 0); + SDValue V2 = SDValue(CurDAG->getMachineNode(CSKY::FMFVRH_D, Dl, VT, V), 0); + + ReplaceUses(SDValue(N, 0), V1); + ReplaceUses(SDValue(N, 1), V2); + CurDAG->RemoveDeadNode(N); + + return true; +} + bool CSKYDAGToDAGISel::selectAddCarry(SDNode *N) { MachineSDNode *NewNode = nullptr; auto Type0 = N->getValueType(0); @@ -175,6 +369,31 @@ bool CSKYDAGToDAGISel::selectSubCarry(SDNode *N) { return true; } +SDNode *CSKYDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(CSKY::GPRPairRegClassID, dl, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(CSKY::sub32_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(CSKY::sub32_32, dl, MVT::i32); + const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1}; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +bool CSKYDAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { + switch (ConstraintID) { + case InlineAsm::Constraint_m: + // We just support simple memory operands that have a single address + // operand and need no special handling. + OutOps.push_back(Op); + return false; + default: + break; + } + + return true; +} + FunctionPass *llvm::createCSKYISelDag(CSKYTargetMachine &TM) { return new CSKYDAGToDAGISel(TM); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index 0b589e3d3e4f..012de34c9809 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -19,6 +19,7 @@ #include "CSKYSubtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/Support/Debug.h" @@ -103,9 +104,7 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i32, Expand); } - if (!Subtarget.has3r2E3r3()) { - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); - } + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); // Float @@ -784,6 +783,175 @@ SDValue CSKYTargetLowering::getTargetConstantPoolValue(GlobalAddressSDNode *N, return DAG.getTargetConstantPool(CPV, Ty); } +CSKYTargetLowering::ConstraintType +CSKYTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'a': + case 'b': + case 'v': + case 'w': + case 'y': + return C_RegisterClass; + case 'c': + case 'l': + case 'h': + case 'z': + return C_Register; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair +CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + return std::make_pair(0U, &CSKY::GPRRegClass); + case 'a': + 
return std::make_pair(0U, &CSKY::mGPRRegClass); + case 'b': + return std::make_pair(0U, &CSKY::sGPRRegClass); + case 'z': + return std::make_pair(CSKY::R14, &CSKY::GPRRegClass); + case 'c': + return std::make_pair(CSKY::C, &CSKY::CARRYRegClass); + case 'w': + if ((Subtarget.hasFPUv2SingleFloat() || + Subtarget.hasFPUv3SingleFloat()) && + VT == MVT::f32) + return std::make_pair(0U, &CSKY::sFPR32RegClass); + if ((Subtarget.hasFPUv2DoubleFloat() || + Subtarget.hasFPUv3DoubleFloat()) && + VT == MVT::f64) + return std::make_pair(0U, &CSKY::sFPR64RegClass); + break; + case 'v': + if (Subtarget.hasFPUv2SingleFloat() && VT == MVT::f32) + return std::make_pair(0U, &CSKY::sFPR32RegClass); + if (Subtarget.hasFPUv3SingleFloat() && VT == MVT::f32) + return std::make_pair(0U, &CSKY::FPR32RegClass); + if (Subtarget.hasFPUv2DoubleFloat() && VT == MVT::f64) + return std::make_pair(0U, &CSKY::sFPR64RegClass); + if (Subtarget.hasFPUv3DoubleFloat() && VT == MVT::f64) + return std::make_pair(0U, &CSKY::FPR64RegClass); + break; + default: + break; + } + } + + if (Constraint == "{c}") + return std::make_pair(CSKY::C, &CSKY::CARRYRegClass); + + // Clang will correctly decode the usage of register name aliases into their + // official names. However, other frontends like `rustc` do not. This allows + // users of these frontends to use the ABI names for registers in LLVM-style + // register constraints. + unsigned XRegFromAlias = StringSwitch(Constraint.lower()) + .Case("{a0}", CSKY::R0) + .Case("{a1}", CSKY::R1) + .Case("{a2}", CSKY::R2) + .Case("{a3}", CSKY::R3) + .Case("{l0}", CSKY::R4) + .Case("{l1}", CSKY::R5) + .Case("{l2}", CSKY::R6) + .Case("{l3}", CSKY::R7) + .Case("{l4}", CSKY::R8) + .Case("{l5}", CSKY::R9) + .Case("{l6}", CSKY::R10) + .Case("{l7}", CSKY::R11) + .Case("{t0}", CSKY::R12) + .Case("{t1}", CSKY::R13) + .Case("{sp}", CSKY::R14) + .Case("{lr}", CSKY::R15) + .Case("{l8}", CSKY::R16) + .Case("{l9}", CSKY::R17) + .Case("{t2}", CSKY::R18) + .Case("{t3}", CSKY::R19) + .Case("{t4}", CSKY::R20) + .Case("{t5}", CSKY::R21) + .Case("{t6}", CSKY::R22) + .Cases("{t7}", "{fp}", CSKY::R23) + .Cases("{t8}", "{top}", CSKY::R24) + .Cases("{t9}", "{bsp}", CSKY::R25) + .Case("{r26}", CSKY::R26) + .Case("{r27}", CSKY::R27) + .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28) + .Cases("{tb}", "{rtb}", CSKY::R29) + .Case("{svbr}", CSKY::R30) + .Case("{tls}", CSKY::R31) + .Default(CSKY::NoRegister); + + if (XRegFromAlias != CSKY::NoRegister) + return std::make_pair(XRegFromAlias, &CSKY::GPRRegClass); + + // Since TargetLowering::getRegForInlineAsmConstraint uses the name of the + // TableGen record rather than the AsmName to choose registers for InlineAsm + // constraints, plus we want to match those names to the widest floating point + // register type available, manually select floating point registers here. + // + // The second case is the ABI name of the register, so that frontends can also + // use the ABI names in register constraint lists. 
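A hedged usage sketch, from the C++ side, of the constraints wired up in the alias table above and the hard-float name table that follows (it assumes a CSKY toolchain with Clang/LLVM-style {reg} constraints; the mnemonics are illustrative and not checked against an assembler). The i64 "r" case is what the selectInlineAsm change earlier rewrites onto a GPRPair:

    // 64-bit "r" operand: lowered onto a GPRPair so instructions such as
    // mula.s32, which need adjacent registers, get a legal register pair.
    unsigned long long mula_demo(unsigned a, unsigned b,
                                 unsigned long long acc) {
      asm("mula.s32 %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
      return acc;
    }

    // ABI register name: "{a0}" resolves through the alias table to CSKY::R0,
    // so frontends that don't canonicalize names (e.g. rustc) still work.
    unsigned alias_demo(unsigned x) {
      unsigned r;
      asm("mov %0, %1" : "={a0}"(r) : "{a0}"(x));
      return r;
    }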
+ if (Subtarget.useHardFloat()) { + unsigned FReg = StringSwitch(Constraint.lower()) + .Cases("{fr0}", "{vr0}", CSKY::F0_32) + .Cases("{fr1}", "{vr1}", CSKY::F1_32) + .Cases("{fr2}", "{vr2}", CSKY::F2_32) + .Cases("{fr3}", "{vr3}", CSKY::F3_32) + .Cases("{fr4}", "{vr4}", CSKY::F4_32) + .Cases("{fr5}", "{vr5}", CSKY::F5_32) + .Cases("{fr6}", "{vr6}", CSKY::F6_32) + .Cases("{fr7}", "{vr7}", CSKY::F7_32) + .Cases("{fr8}", "{vr8}", CSKY::F8_32) + .Cases("{fr9}", "{vr9}", CSKY::F9_32) + .Cases("{fr10}", "{vr10}", CSKY::F10_32) + .Cases("{fr11}", "{vr11}", CSKY::F11_32) + .Cases("{fr12}", "{vr12}", CSKY::F12_32) + .Cases("{fr13}", "{vr13}", CSKY::F13_32) + .Cases("{fr14}", "{vr14}", CSKY::F14_32) + .Cases("{fr15}", "{vr15}", CSKY::F15_32) + .Cases("{fr16}", "{vr16}", CSKY::F16_32) + .Cases("{fr17}", "{vr17}", CSKY::F17_32) + .Cases("{fr18}", "{vr18}", CSKY::F18_32) + .Cases("{fr19}", "{vr19}", CSKY::F19_32) + .Cases("{fr20}", "{vr20}", CSKY::F20_32) + .Cases("{fr21}", "{vr21}", CSKY::F21_32) + .Cases("{fr22}", "{vr22}", CSKY::F22_32) + .Cases("{fr23}", "{vr23}", CSKY::F23_32) + .Cases("{fr24}", "{vr24}", CSKY::F24_32) + .Cases("{fr25}", "{vr25}", CSKY::F25_32) + .Cases("{fr26}", "{vr26}", CSKY::F26_32) + .Cases("{fr27}", "{vr27}", CSKY::F27_32) + .Cases("{fr28}", "{vr28}", CSKY::F28_32) + .Cases("{fr29}", "{vr29}", CSKY::F29_32) + .Cases("{fr30}", "{vr30}", CSKY::F30_32) + .Cases("{fr31}", "{vr31}", CSKY::F31_32) + .Default(CSKY::NoRegister); + if (FReg != CSKY::NoRegister) { + assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg"); + unsigned RegNo = FReg - CSKY::F0_32; + unsigned DReg = CSKY::F0_64 + RegNo; + + if (Subtarget.hasFPUv2DoubleFloat()) + return std::make_pair(DReg, &CSKY::sFPR64RegClass); + else if (Subtarget.hasFPUv3DoubleFloat()) + return std::make_pair(DReg, &CSKY::FPR64RegClass); + else if (Subtarget.hasFPUv2SingleFloat()) + return std::make_pair(FReg, &CSKY::sFPR32RegClass); + else if (Subtarget.hasFPUv3SingleFloat()) + return std::make_pair(FReg, &CSKY::FPR32RegClass); + } + } + + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + static MachineBasicBlock * emitSelectPseudo(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) { @@ -853,6 +1021,12 @@ CSKYTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); + case CSKY::FSELS: + case CSKY::FSELD: + if (Subtarget.hasE2()) + return emitSelectPseudo(MI, BB, CSKY::BT32); + else + return emitSelectPseudo(MI, BB, CSKY::BT16); case CSKY::ISEL32: return emitSelectPseudo(MI, BB, CSKY::BT32); case CSKY::ISEL16: diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.h b/llvm/lib/Target/CSKY/CSKYISelLowering.h index e1744d5ce220..1cd0f99b17bc 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.h +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.h @@ -88,6 +88,12 @@ private: return (Kind != ScalarCondVectorVal); } + ConstraintType getConstraintType(StringRef Constraint) const override; + + std::pair + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/llvm/lib/Target/CSKY/CSKYInstrAlias.td b/llvm/lib/Target/CSKY/CSKYInstrAlias.td new file mode 100644 index 000000000000..e3c0538e752e --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYInstrAlias.td @@ -0,0 +1,38 @@ +//===-- CSKYInstrAlias.td - Target Description for CSKY ----*- tablegen 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the CSKY instruction aliases.
+//
+//===----------------------------------------------------------------------===//
+
+def : InstAlias<"nop", (MOV16 R0, R0)>;
+def : InstAlias<"nop", (MOV32 R0, R0)>, Requires<[iHasE2]>;
+
+def : InstAlias<"bgeni16 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
+def : InstAlias<"bgeni32 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
+
+def : InstAlias<"bsr $dst", (BSR32 call_symbol:$dst)>;
+
+def : InstAlias<"grs\t$rz, $offset", (GRS32 GPR:$rz, bare_symbol:$offset)>;
+
+def : InstAlias<"jbsr\t$src1", (JBSR32 call_symbol:$src1)>;
+
+def : InstAlias<"jbr $dst", (JBR16 br_symbol_16bit:$dst)>;
+def : InstAlias<"jbt $dst", (JBT16 C, br_symbol_16bit:$dst)>;
+def : InstAlias<"jbf $dst", (JBF16 C, br_symbol_16bit:$dst)>;
+
+def : InstAlias<"lrw $rz, $src", (PseudoLRW16 mGPR:$rz, bare_symbol:$src)>;
+def : InstAlias<"lrw $rz, $src", (LRW16 mGPR:$rz, constpool_symbol_16bit:$src)>;
+def : InstAlias<"lrw $rz, $src", (PseudoLRW32 GPR:$rz, bare_symbol:$src)>;
+def : InstAlias<"lrw $rz, $src", (LRW32 GPR:$rz, constpool_symbol:$src)>;
+
+def : InstAlias<"jsri $dst", (PseudoJSRI32 call_symbol:$dst)>;
+def : InstAlias<"jsri $dst", (JSRI32 constpool_symbol:$dst)>;
+
+def : InstAlias<"jmpi $dst", (PseudoJMPI32 br_symbol:$dst)>;
+def : InstAlias<"jmpi $dst", (JMPI32 constpool_symbol:$dst)>;
\ No newline at end of file
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
index 9b6ef9ca23db..8144a501b3d2 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -655,7 +655,7 @@ class R_Z_1<bits<6> sop, bits<5> pcode, string op>
 // Format< OP[6] | RZ[5] | 00000[5] | SOP[6] | PCODE[5] | 00000[5] >
 // Instructions:(2) clrf32, clrt32
-class R_Z_2<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
+class R_Z_2<bits<6> sop, bits<5> pcode, string op>
   : CSKY32Inst {
   bits<5> rz;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index c57ccb9d6eea..d490b385ac16 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "CSKYConstantPoolValue.h"
 #include "CSKYMachineFunctionInfo.h"
 #include "CSKYTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/MC/MCContext.h"
 #define DEBUG_TYPE "csky-instr-info"
@@ -222,9 +223,10 @@ bool CSKYInstrInfo::reverseBranchCondition(
 Register CSKYInstrInfo::movImm(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
-                               const DebugLoc &DL, int64_t Val,
+                               const DebugLoc &DL, uint64_t Val,
                                MachineInstr::MIFlag Flag) const {
-  assert(isUInt<32>(Val) && "should be uint32");
+  if (!isInt<32>(Val))
+    report_fatal_error("Should only materialize 32-bit constants.");
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -475,9 +477,6 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 const DebugLoc &DL, MCRegister DestReg,
                                 MCRegister SrcReg, bool KillSrc) const {
-
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
   if (CSKY::GPRRegClass.contains(SrcReg) &&
       CSKY::CARRYRegClass.contains(DestReg)) {
     if (STI.hasE2()) {
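One subtlety of the movImm signature change above (int64_t to uint64_t with an isInt<32> guard): a negative immediate that arrives sign-extended to 64 bits still passes, while a zero-extended 32-bit value with the top bit set now trips the fatal error. A standalone re-implementation of the check (llvm::isInt is assumed to behave the same way):

    #include <cstdint>
    #include <cstdio>

    template <unsigned N> bool isIntN(uint64_t V) {
      int64_t S = static_cast<int64_t>(V);
      return S >= -(INT64_C(1) << (N - 1)) && S < (INT64_C(1) << (N - 1));
    }

    int main() {
      std::printf("%d\n", isIntN<32>(~UINT64_C(0)));         // -1: passes (1)
      std::printf("%d\n", isIntN<32>(UINT64_C(0x7FFFFFFF))); // passes (1)
      std::printf("%d\n", isIntN<32>(UINT64_C(0xFFFFFFFF))); // rejected (0)
    }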
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
index 1a1bbbf9154f..a979b0bf4b0d 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
@@ -79,7 +79,7 @@ public:
   // Materializes the given integer Val into DstReg.
   Register movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                  const DebugLoc &DL, int64_t Val,
+                  const DebugLoc &DL, uint64_t Val,
                   MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
 };
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index a782efe7f4f4..300ecceae906 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -413,6 +413,19 @@ def psrflag : Operand, ImmLeaf(Imm);"> {
   let PrintMethod = "printPSRFlag";
 }
+multiclass uimm8SRLXForm {
+  def _0: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 0) & 0xFF, SDLoc(N), MVT::i32);}]>;
+  def _8: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 8) & 0xFF, SDLoc(N), MVT::i32);}]>;
+  def _16: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 16) & 0xFF, SDLoc(N), MVT::i32);}]>;
+  def _24: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 24) & 0xFF, SDLoc(N), MVT::i32);}]>;
+}
+
+defm uimm8SRL : uimm8SRLXForm;
+
 //===----------------------------------------------------------------------===//
 // Instruction Formats
 //===----------------------------------------------------------------------===//
@@ -709,8 +722,6 @@ let Predicates= [iHasE2] in {
   def MOVI32 : I_16_MOV<0x10, "movi32", uimm16>;
   let Size = 4, isCodeGenOnly = 0 in
   def BGENI : CSKYPseudo<(outs GPR:$dst), (ins uimm5:$imm), "bgeni\t$dst, $imm", []>;
-  def : InstAlias<"bgeni16 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
-  def : InstAlias<"bgeni32 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
   def MOVIH32 : I_16_MOV<0x11, "movih32", uimm16_16_xform>;
   def MVC32 : R_Z_1<0x1, 0x8, "mvc32">;
   let isCodeGenOnly = 1 in
@@ -723,8 +734,8 @@ let Predicates= [iHasE2] in {
 let Predicates = [iHas2E3] in {
   def MVCV32 : R_Z_1<0x1, 0x10, "mvcv32">;
-  def CLRF32 : R_Z_2<0xB, 0x1, "clrf32", []>;
-  def CLRT32 : R_Z_2<0xB, 0x2, "clrt32", []>;
+  def CLRF32 : R_Z_2<0xB, 0x1, "clrf32">;
+  def CLRT32 : R_Z_2<0xB, 0x2, "clrt32">;
 }
 //===----------------------------------------------------------------------===//
@@ -779,8 +790,6 @@ def BNEZAD32 : CSKY32Inst;
-def : InstAlias<"bsr $dst", (BSR32 call_symbol:$dst)>;
-
 def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{
   let isCodeGenOnly = 1;
   let isBranch = 1;
@@ -804,7 +813,6 @@ let Predicates = [iHas2E3] in {
 def GRS32 : I_18_Z_L<0x3, "grs32\t$rz, $offset",
                      (outs GPR:$rz), (ins bare_symbol:$offset), []>;
-def : InstAlias<"grs\t$rz, $offset", (GRS32 GPR:$rz, bare_symbol:$offset)>;
 let Uses = [R28] in {
   def LRS32B : I_18_Z_L<0x0, "lrs32.b\t$rz, $offset",
@@ -1291,8 +1299,6 @@ let Predicates = [iHasE2] in {
 let isCall = 1, Defs = [ R15 ], mayLoad = 1, Size = 4, isCodeGenOnly = 0 in
 def JBSR32 : CSKYPseudo<(outs), (ins call_symbol:$src1), "jbsr32\t$src1", []>;
-def : InstAlias<"jbsr\t$src1", (JBSR32 call_symbol:$src1)>;
-
 def JBR32 : CSKYPseudo<(outs), (ins br_symbol:$src1), "jbr32\t$src1", []> {
   let isBranch = 1;
   let isTerminator = 1;
@@ -1338,18 +1344,13 @@ let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in
 def PseudoLRW32 : CSKYPseudo<(outs GPR:$rz), (ins bare_symbol:$src), "lrw32 $rz, $src", []>;
-def : InstAlias<"lrw $rz, $src", (PseudoLRW32 GPR:$rz, bare_symbol:$src)>;
-def : InstAlias<"lrw $rz, $src", (LRW32 GPR:$rz, constpool_symbol:$src)>;
+
 let mayLoad = 1, Size = 4, isCodeGenOnly = 0 in
 def
PseudoJSRI32 : CSKYPseudo<(outs), (ins call_symbol:$src), "jsri32 $src", []>; -def : InstAlias<"jsri $dst", (PseudoJSRI32 call_symbol:$dst)>; -def : InstAlias<"jsri $dst", (JSRI32 constpool_symbol:$dst)>; let mayLoad = 1, Size = 4, isCodeGenOnly = 0 in def PseudoJMPI32 : CSKYPseudo<(outs), (ins br_symbol:$src), "jmpi32 $src", []>; -def : InstAlias<"jmpi $dst", (PseudoJMPI32 br_symbol:$dst)>; -def : InstAlias<"jmpi $dst", (JMPI32 constpool_symbol:$dst)>; let isNotDuplicable = 1, mayLoad = 1, mayStore = 0, Size = 8 in def PseudoTLSLA32 : CSKYPseudo<(outs GPR:$dst1, GPR:$dst2), @@ -1362,3 +1363,4 @@ def CONSTPOOL_ENTRY : CSKYPseudo<(outs), include "CSKYInstrInfo16Instr.td" include "CSKYInstrInfoF1.td" include "CSKYInstrInfoF2.td" +include "CSKYInstrAlias.td" diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index 6a9dd03dfa1d..3be1ca8b7998 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -441,6 +441,137 @@ let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in def PseudoLRW16 : CSKYPseudo<(outs mGPR:$rz), (ins bare_symbol:$src), "lrw16 $rz, $src", []>; +//===----------------------------------------------------------------------===// +// Instruction Patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(sext_inreg mGPR:$src, i1), (ASRI16 (LSLI16 mGPR:$src, 7), 7)>; +def : Pat<(sext_inreg sGPR:$src, i8), (SEXTB16 sGPR:$src)>; +def : Pat<(sext_inreg sGPR:$src, i16), (SEXTH16 sGPR:$src)>; + +// Load & Store Patterns + +defm : LdPat; +defm : LdPat; + +defm : LdPat; +defm : LdPat; + +defm : LdPat; + + +defm : StPat; +defm : StPat; +defm : StPat; + +def : Pat<(CSKY_CALLReg sGPR:$src), (JSR16 sGPR:$src)>; +def : Pat<(CSKY_TAILReg sGPR:$src), (JMP16 sGPR:$src)>; + +// Symbol address Patterns +def : Pat<(CSKY_LOAD_ADDR tglobaladdr, tconstpool:$src2), (LRW16 tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR tblockaddress, tconstpool:$src2), (LRW16 tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR tjumptable:$src1, tconstpool:$src2), (LRW16_Gen tjumptable:$src1, tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR texternalsym, tconstpool:$src2), (LRW16 tconstpool:$src2)>; + +def : Pat<(i32 (load constpool:$src)), (LRW16 (to_tconstpool tconstpool:$src))>; + +// Branch Patterns. 
+ +def : Pat<(brcond CARRY:$ca, bb:$offset), + (BT16 CARRY:$ca, bb:$offset)>; + +def : Pat<(br bb:$offset), (BR16 bb:$offset)>; + +def : Pat<(brcond (i32 (setne mGPR:$rs1, uimm5:$rs2)), bb:$offset), + (BT16 (CMPNEI16 mGPR:$rs1, uimm5:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (seteq mGPR:$rs1, uimm5:$rs2)), bb:$offset), + (BF16 (CMPNEI16 mGPR:$rs1, uimm5:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setuge mGPR:$rs1, oimm5:$rs2)), bb:$offset), + (BT16 (CMPHSI16 mGPR:$rs1, oimm5:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setult mGPR:$rs1, oimm5:$rs2)), bb:$offset), + (BF16 (CMPHSI16 mGPR:$rs1, oimm5:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setlt mGPR:$rs1, oimm5:$rs2)), bb:$offset), + (BT16 (CMPLTI16 mGPR:$rs1, oimm5:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setge mGPR:$rs1, oimm5:$rs2)), bb:$offset), + (BF16 (CMPLTI16 mGPR:$rs1, oimm5:$rs2), bb:$offset)>; + +def : Pat<(brcond (i32 (setne sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BT16 (CMPNE16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (seteq sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BF16 (CMPNE16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setuge sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BT16 (CMPHS16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setule sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BT16 (CMPHS16 sGPR:$rs2, sGPR:$rs1), bb:$offset)>; +def : Pat<(brcond (i32 (setult sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BF16 (CMPHS16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setugt sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BF16 (CMPHS16 sGPR:$rs2, sGPR:$rs1), bb:$offset)>; +def : Pat<(brcond (i32 (setlt sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BT16 (CMPLT16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setgt sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BT16 (CMPLT16 sGPR:$rs2, sGPR:$rs1), bb:$offset)>; +def : Pat<(brcond (i32 (setge sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BF16 (CMPLT16 sGPR:$rs1, sGPR:$rs2), bb:$offset)>; +def : Pat<(brcond (i32 (setle sGPR:$rs1, sGPR:$rs2)), bb:$offset), + (BF16 (CMPLT16 sGPR:$rs2, sGPR:$rs1), bb:$offset)>; + +// Compare Patterns. 
+def : Pat<(setne sGPR:$rs1, sGPR:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPNE16 sGPR:$rs1, sGPR:$rs2)))>; +def : Pat<(seteq sGPR:$rs1, sGPR:$rs2), + (MVCV16 (CMPNE16 sGPR:$rs1, sGPR:$rs2))>; +def : Pat<(setuge sGPR:$rs1, sGPR:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPHS16 sGPR:$rs1, sGPR:$rs2)))>; +def : Pat<(setule sGPR:$rs1, sGPR:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPHS16 sGPR:$rs2, sGPR:$rs1)))>; +def : Pat<(setult sGPR:$rs1, sGPR:$rs2), + (MVCV16 (CMPHS16 sGPR:$rs1, sGPR:$rs2))>; +def : Pat<(setugt sGPR:$rs1, sGPR:$rs2), + (MVCV16 (CMPHS16 sGPR:$rs2, sGPR:$rs1))>; +def : Pat<(setlt sGPR:$rs1, sGPR:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPLT16 sGPR:$rs1, sGPR:$rs2)))>; +def : Pat<(setgt sGPR:$rs1, sGPR:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPLT16 sGPR:$rs2, sGPR:$rs1)))>; +def : Pat<(setge sGPR:$rs1, sGPR:$rs2), + (MVCV16 (CMPLT16 sGPR:$rs1, sGPR:$rs2))>; +def : Pat<(setle sGPR:$rs1, sGPR:$rs2), + (MVCV16 (CMPLT16 sGPR:$rs2, sGPR:$rs1))>; + + +def : Pat<(setne mGPR:$rs1, uimm5:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPNEI16 mGPR:$rs1, uimm5:$rs2)))>; +def : Pat<(seteq mGPR:$rs1, uimm5:$rs2), + (MVCV16 (CMPNEI16 mGPR:$rs1, uimm5:$rs2))>; +def : Pat<(setuge mGPR:$rs1, oimm5:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPHSI16 mGPR:$rs1, oimm5:$rs2)))>; +def : Pat<(setult mGPR:$rs1, oimm5:$rs2), + (MVCV16 (CMPHSI16 mGPR:$rs1, oimm5:$rs2))>; +def : Pat<(setlt mGPR:$rs1, oimm5:$rs2), + (SUBU16XZ (MOVI16 1), (MVCV16 (CMPLTI16 mGPR:$rs1, oimm5:$rs2)))>; +def : Pat<(setge mGPR:$rs1, oimm5:$rs2), + (MVCV16 (CMPLTI16 mGPR:$rs1, oimm5:$rs2))>; + +def : Pat<(select CARRY:$ca, sGPR:$rx, sGPR:$false), + (ISEL16 CARRY:$ca, sGPR:$rx, sGPR:$false)>; +def : Pat<(select (and CARRY:$ca, 1), sGPR:$rx, sGPR:$false), + (ISEL16 CARRY:$ca, sGPR:$rx, sGPR:$false)>; + +def : Pat<(rotl sGPR:$rs1, sGPR:$rs2), + (ROTL16 sGPR:$rs1, (AND16 sGPR:$rs2, (MOVI16 0x1f)))>; + + +// FIXME: This is a temporary treatment for the e801. +def : Pat<(i32 imm:$imm), + (OR16 (MOVI16 (uimm8SRL_0 imm:$imm)), + (OR16 (LSLI16 (MOVI16 (uimm8SRL_8 imm:$imm)), 8), + (OR16 (LSLI16 (MOVI16 (uimm8SRL_16 imm:$imm)), 16), + (LSLI16 (MOVI16 (uimm8SRL_24 imm:$imm)), 24))))>; + +// Other operations. +let Predicates = [iHasE2] in { + def : Pat<(bswap sGPR:$rx), (REVB16 sGPR:$rx)>; +} //===----------------------------------------------------------------------===// // Compress Instruction tablegen backend. 
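The 16-bit-only immediate pattern above (the e801 FIXME) assembles a 32-bit constant byte by byte through MOVI16/LSLI16/OR16, using the uimm8SRL_{0,8,16,24} transforms defined earlier. A quick standalone model showing the recombination is exact:

    #include <cstdint>
    #include <cstdio>

    static uint32_t materialize(uint32_t imm) {
      uint32_t b0 = (imm >> 0) & 0xFF;   // uimm8SRL_0
      uint32_t b8 = (imm >> 8) & 0xFF;   // uimm8SRL_8
      uint32_t b16 = (imm >> 16) & 0xFF; // uimm8SRL_16
      uint32_t b24 = (imm >> 24) & 0xFF; // uimm8SRL_24
      // The OR16/LSLI16 chain from the pattern:
      return b0 | (b8 << 8) | (b16 << 16) | (b24 << 24);
    }

    int main() {
      uint32_t v = 0xDEADBEEF;
      std::printf("roundtrip ok: %d\n", materialize(v) == v); // prints 1
    }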
diff --git a/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h b/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h
index b6e303f8ccfb..57e0d62481ad 100644
--- a/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h
+++ b/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h
@@ -18,8 +18,6 @@ namespace llvm {
 class CSKYMachineFunctionInfo : public MachineFunctionInfo {
-  MachineFunction &MF;
-
   Register GlobalBaseReg = 0;
   bool SpillsCR = false;
@@ -33,7 +31,14 @@ class CSKYMachineFunctionInfo : public MachineFunctionInfo {
   unsigned PICLabelUId = 0;
 
 public:
-  CSKYMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+  CSKYMachineFunctionInfo(MachineFunction &) {}
+
+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override {
+    return DestMF.cloneInfo<CSKYMachineFunctionInfo>(*this);
+  }
 
   Register getGlobalBaseReg() const { return GlobalBaseReg; }
   void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp
index 57b6ae3c27b5..4f7811d22868 100644
--- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp
@@ -13,6 +13,7 @@
 #include "CSKYRegisterInfo.h"
 #include "CSKY.h"
 #include "CSKYSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCContext.h"
@@ -29,6 +30,10 @@ const uint32_t *
 CSKYRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID Id) const {
   const CSKYSubtarget &STI = MF.getSubtarget<CSKYSubtarget>();
+  if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat())
+    return CSR_GPR_FPR64_RegMask;
+  if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat())
+    return CSR_GPR_FPR32_RegMask;
   return CSR_I32_RegMask;
 }
@@ -82,9 +87,21 @@ const MCPhysReg *
 CSKYRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   const CSKYSubtarget &STI = MF->getSubtarget<CSKYSubtarget>();
   if (MF->getFunction().hasFnAttribute("interrupt")) {
+    if (STI.hasFPUv3DoubleFloat())
+      return CSR_GPR_FPR64v3_ISR_SaveList;
+    if (STI.hasFPUv3SingleFloat())
+      return CSR_GPR_FPR32v3_ISR_SaveList;
+    if (STI.hasFPUv2DoubleFloat())
+      return CSR_GPR_FPR64_ISR_SaveList;
+    if (STI.hasFPUv2SingleFloat())
+      return CSR_GPR_FPR32_ISR_SaveList;
 
     return CSR_GPR_ISR_SaveList;
   }
 
+  if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat())
+    return CSR_GPR_FPR64_SaveList;
+  if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat())
+    return CSR_GPR_FPR32_SaveList;
   return CSR_I32_SaveList;
 }
@@ -248,7 +265,6 @@ void CSKYRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     assert(isInt<32>(Offset) && "Int32 expected");
     // The offset won't fit in an immediate, so use a scratch register instead
     // Modify Offset and FrameReg appropriately
-    assert(Offset >= 0);
     Register ScratchReg = TII->movImm(MBB, NewII, DL, Offset);
     BuildMI(MBB, NewII, DL,
             TII->get(STI.hasE2() ?
CSKY::ADDU32 : CSKY::ADDU16XZ), ScratchReg) @@ -265,7 +281,7 @@ void CSKYRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI->setDesc(TII->get(TargetOpcode::COPY)); MI->getOperand(FIOperandNum) .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); - MI->RemoveOperand(FIOperandNum + 1); + MI->removeOperand(FIOperandNum + 1); } else { MI->getOperand(FIOperandNum) .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td index b7f4fc17166b..d12532a3c5c1 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td @@ -81,17 +81,21 @@ let RegAltNameIndices = [ABIRegAltName] in { def R29 : CSKYReg<29, "r29", ["rtb"]>, DwarfRegNum<[29]>; def R30 : CSKYReg<30, "r30", ["svbr"]>, DwarfRegNum<[30]>; def R31 : CSKYReg<31, "r31", ["tls"]>, DwarfRegNum<[31]>; - def C : CSKYReg<32, "cr0", ["psr"]>; + + // Faked for GPRTuple + def R32 : CSKYReg<32, "r32", ["r32"]>, DwarfRegNum<[32]>; + + def C : CSKYReg<33, "cr0", ["psr"]>; } def GPRTuple : RegisterTuples< [sub32_0, sub32_32], - [(add (sequence "R%u", 0, 30)), (add (sequence "R%u", 1, 31))], + [(add (sequence "R%u", 0, 31)), (add (sequence "R%u", 1, 32))], [ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", - "r24", "r25", "r26", "r27", "r28", "r29", "r30" + "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ]>; // Floating point registers @@ -189,9 +193,9 @@ def FPR32 : RegisterClass<"CSKY", [f32], 32, def sFPR32 : RegisterClass<"CSKY", [f32], 32, (add (sequence "F%u_32", 0, 15))>; -def FPR64 : RegisterClass<"CSKY", [f64], 64, +def FPR64 : RegisterClass<"CSKY", [f64], 32, (add (sequence "F%u_64", 0, 31))>; -def sFPR64 : RegisterClass<"CSKY", [f64], 64, +def sFPR64 : RegisterClass<"CSKY", [f64], 32, (add (sequence "F%u_64", 0, 15))>; def sFPR64_V : RegisterClass<"CSKY", [v2f32], 32, (add sFPR64)>; diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp index 963c2ede9c44..251dbed82708 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CSKYSubtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; @@ -33,14 +34,42 @@ CSKYSubtarget &CSKYSubtarget::initializeSubtargetDependencies( UseHardFloatABI = false; HasFPUv2SingleFloat = false; HasFPUv2DoubleFloat = false; + HasFPUv3HalfWord = false; + HasFPUv3HalfFloat = false; HasFPUv3SingleFloat = false; HasFPUv3DoubleFloat = false; - + HasFdivdu = false; + HasFLOATE1 = false; + HasFLOAT1E2 = false; + HasFLOAT1E3 = false; + HasFLOAT3E4 = false; + HasFLOAT7E60 = false; + HasExtendLrw = false; HasBTST16 = false; + HasTrust = false; HasJAVA = false; - HasExtendLrw = false; + HasCache = false; + HasNVIC = false; + HasDSP = false; + HasDSP1E2 = false; + HasDSPE60 = false; + HasDSPV2 = false; + HasDSP_Silan = false; HasDoloop = false; + HasHardwareDivide = false; HasHighRegisters = false; + HasVDSPV2 = false; + HasVDSP2E3 = false; + HasVDSP2E60F = false; + ReadTPHard = false; + HasVDSPV1_128 = false; + UseCCRT = false; + DumpConstPool = false; + EnableInterruptAttribute = false; + HasPushPop = false; + HasSTM = false; + SmartMode = false; + EnableStackSize = false; HasE1 = false; HasE2 = false; diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.h 
b/llvm/lib/Target/CSKY/CSKYSubtarget.h index 4cd590e8e76e..9e7ad00c0a50 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.h +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.h @@ -36,18 +36,65 @@ class CSKYSubtarget : public CSKYGenSubtargetInfo { CSKYTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; + enum CSKYProcFamilyEnum { + Others, + + CK801, + CK802, + CK803, + CK803S, + CK804, + CK805, + CK807, + CK810, + CK810V, + CK860, + CK860V + }; + + /// CSKYProcFamily - CSKY processor family: CK801, CK802, and others. + CSKYProcFamilyEnum CSKYProcFamily = Others; + bool UseHardFloat; bool UseHardFloatABI; bool HasFPUv2SingleFloat; bool HasFPUv2DoubleFloat; + bool HasFPUv3HalfWord; + bool HasFPUv3HalfFloat; bool HasFPUv3SingleFloat; bool HasFPUv3DoubleFloat; - + bool HasFdivdu; + bool HasFLOATE1; + bool HasFLOAT1E2; + bool HasFLOAT1E3; + bool HasFLOAT3E4; + bool HasFLOAT7E60; bool HasBTST16; - bool HasJAVA; bool HasExtendLrw; + bool HasTrust; + bool HasJAVA; + bool HasCache; + bool HasNVIC; + bool HasDSP; + bool HasDSP1E2; + bool HasDSPE60; + bool HasDSPV2; + bool HasDSP_Silan; bool HasDoloop; + bool HasHardwareDivide; bool HasHighRegisters; + bool HasVDSPV2; + bool HasVDSP2E3; + bool HasVDSP2E60F; + bool ReadTPHard; + bool HasVDSPV1_128; + bool UseCCRT; + bool DumpConstPool; + bool EnableInterruptAttribute; + bool HasPushPop; + bool HasSTM; + bool SmartMode; + bool EnableStackSize; bool HasE1; bool HasE2; @@ -92,16 +139,49 @@ public: bool hasFPUv2SingleFloat() const { return HasFPUv2SingleFloat; } bool hasFPUv2DoubleFloat() const { return HasFPUv2DoubleFloat; } bool hasFPUv2() const { return HasFPUv2SingleFloat || HasFPUv2DoubleFloat; } + bool hasFPUv3HalfWord() const { return HasFPUv3HalfWord; } + bool hasFPUv3HalfFloat() const { return HasFPUv3HalfFloat; } bool hasFPUv3SingleFloat() const { return HasFPUv3SingleFloat; } bool hasFPUv3DoubleFloat() const { return HasFPUv3DoubleFloat; } - bool hasFPUv3() const { return HasFPUv3SingleFloat || HasFPUv3DoubleFloat; } + bool hasFPUv3() const { + return HasFPUv3HalfFloat || HasFPUv3SingleFloat || HasFPUv3DoubleFloat; + } bool hasAnyFloatExt() const { return hasFPUv2() || hasFPUv3(); }; - + bool hasFdivdu() const { return HasFdivdu; } + bool hasFLOATE1() const { return HasFLOATE1; } + bool hasFLOAT1E2() const { return HasFLOAT1E2; } + bool hasFLOAT1E3() const { return HasFLOAT1E3; } + bool hasFLOAT3E4() const { return HasFLOAT3E4; } + bool hasFLOAT7E60() const { return HasFLOAT7E60; } + bool hasExtendLrw() const { return HasExtendLrw; } bool hasBTST16() const { return HasBTST16; } + bool hasTrust() const { return HasTrust; } bool hasJAVA() const { return HasJAVA; } - bool hasExtendLrw() const { return HasExtendLrw; } + bool hasCache() const { return HasCache; } + bool hasNVIC() const { return HasNVIC; } + bool hasDSP() const { return HasDSP; } + bool hasDSP1E2() const { return HasDSP1E2; } + bool hasDSPE60() const { return HasDSPE60; } + bool hasDSPV2() const { return HasDSPV2; } + bool hasDSP_Silan() const { return HasDSP_Silan; } bool hasDoloop() const { return HasDoloop; } bool hasHighRegisters() const { return HasHighRegisters; } + bool hasVDSPV2() const { return HasVDSPV2; } + bool hasVDSPV2_FLOAT() const { return HasVDSPV2 && UseHardFloat; } + bool hasVDSPV2_HALF() const { + return HasVDSPV2 && UseHardFloat && HasFPUv3HalfFloat; + } + bool hasVDSP2E3() const { return HasVDSP2E3; } + bool hasVDSP2E60F() const { return HasVDSP2E60F; } + bool readTPHard() const { return ReadTPHard; } + bool hasVDSPV1_128() const { return HasVDSPV1_128; } + bool 
useCCRT() const { return UseCCRT; }
+  bool dumpConstPool() const { return DumpConstPool; }
+  bool enableInterruptAttribute() const { return EnableInterruptAttribute; }
+  bool hasPushPop() const { return HasPushPop; }
+  bool hasSTM() const { return HasSTM; }
+  bool smartMode() const { return SmartMode; }
+  bool enableStackSize() const { return EnableStackSize; }
 
   bool hasE1() const { return HasE1; }
   bool hasE2() const { return HasE2; }
@@ -114,6 +194,18 @@ public:
   bool hasMP1E2() const { return HasMP1E2; }
   bool has7E10() const { return Has7E10; }
   bool has10E60() const { return Has10E60; }
+
+  bool isCK801() const { return CSKYProcFamily == CK801; }
+  bool isCK802() const { return CSKYProcFamily == CK802; }
+  bool isCK803() const { return CSKYProcFamily == CK803; }
+  bool isCK803S() const { return CSKYProcFamily == CK803S; }
+  bool isCK804() const { return CSKYProcFamily == CK804; }
+  bool isCK805() const { return CSKYProcFamily == CK805; }
+  bool isCK807() const { return CSKYProcFamily == CK807; }
+  bool isCK810() const { return CSKYProcFamily == CK810; }
+  bool isCK810V() const { return CSKYProcFamily == CK810V; }
+  bool isCK860() const { return CSKYProcFamily == CK860; }
+  bool isCK860V() const { return CSKYProcFamily == CK860V; }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
index 94b24044c27d..d19f28fddd53 100644
--- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
+++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
@@ -13,7 +13,9 @@
 #include "CSKYTargetMachine.h"
 #include "CSKY.h"
 #include "CSKYSubtarget.h"
+#include "CSKYTargetObjectFile.h"
 #include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -50,9 +52,9 @@ CSKYTargetMachine::CSKYTargetMachine(const Target &T, const Triple &TT,
                                      Optional<CodeModel::Model> CM,
                                      CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
-                        RM.getValueOr(Reloc::Static),
+                        RM.value_or(Reloc::Static),
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
-      TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
+      TLOF(std::make_unique<CSKYELFTargetObjectFile>()) {
   initAsmInfo();
 }
 
@@ -94,6 +96,7 @@ public:
     return getTM<CSKYTargetMachine>();
   }
 
+  void addIRPasses() override;
   bool addInstSelector() override;
   void addPreEmitPass() override;
 };
@@ -104,6 +107,11 @@ TargetPassConfig *CSKYTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new CSKYPassConfig(*this, PM);
 }
 
+void CSKYPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass());
+  TargetPassConfig::addIRPasses();
+}
+
 bool CSKYPassConfig::addInstSelector() {
   addPass(createCSKYISelDag(getCSKYTargetMachine()));
diff --git a/llvm/lib/Target/CSKY/CSKYTargetObjectFile.cpp b/llvm/lib/Target/CSKY/CSKYTargetObjectFile.cpp
new file mode 100644
index 000000000000..b5592d34ca54
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYTargetObjectFile.cpp
@@ -0,0 +1,25 @@
+//===-- CSKYTargetObjectFile.h - CSKY Object Info -*- C++ ---------------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSKYTargetObjectFile.h" +#include "CSKYTargetMachine.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/MachineFrameInfo.h" + +using namespace llvm; + +void CSKYELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + PersonalityEncoding = + dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + TTypeEncoding = + dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; +} diff --git a/llvm/lib/Target/CSKY/CSKYTargetObjectFile.h b/llvm/lib/Target/CSKY/CSKYTargetObjectFile.h new file mode 100644 index 000000000000..a82f2681c12a --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYTargetObjectFile.h @@ -0,0 +1,24 @@ +//===-- CSKYTargetObjectFile.h - CSKY Object Info -*- C++ ---------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_CSKY_CSKYTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_CSKY_CSKYTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { + +class CSKYELFTargetObjectFile : public TargetLoweringObjectFileELF { +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_CSKY_CSKYTARGETOBJECTFILE_H diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp new file mode 100644 index 000000000000..9b4d8ea8dc56 --- /dev/null +++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp @@ -0,0 +1,553 @@ +//===-- CSKYDisassembler.cpp - Disassembler for CSKY ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the CSKYDisassembler class. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/CSKYBaseInfo.h"
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDecoderOps.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class CSKYDisassembler : public MCDisassembler {
+  std::unique_ptr<MCInstrInfo const> const MCII;
+  mutable StringRef symbolName;
+
+  DecodeStatus handleCROperand(MCInst &Instr) const;
+
+public:
+  CSKYDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                   MCInstrInfo const *MCII);
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+CSKYDisassembler::CSKYDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                                   MCInstrInfo const *MCII)
+    : MCDisassembler(STI, Ctx), MCII(MCII) {}
+
+static MCDisassembler *createCSKYDisassembler(const Target &T,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  return new CSKYDisassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheCSKYTarget(),
+                                         createCSKYDisassembler);
+}
+
+static const uint16_t GPRDecoderTable[] = {
+    CSKY::R0,  CSKY::R1,  CSKY::R2,  CSKY::R3,  CSKY::R4,  CSKY::R5,  CSKY::R6,
+    CSKY::R7,  CSKY::R8,  CSKY::R9,  CSKY::R10, CSKY::R11, CSKY::R12, CSKY::R13,
+    CSKY::R14, CSKY::R15, CSKY::R16, CSKY::R17, CSKY::R18, CSKY::R19, CSKY::R20,
+    CSKY::R21, CSKY::R22, CSKY::R23, CSKY::R24, CSKY::R25, CSKY::R26, CSKY::R27,
+    CSKY::R28, CSKY::R29, CSKY::R30, CSKY::R31};
+
+static const uint16_t GPRPairDecoderTable[] = {
+    CSKY::R0_R1,   CSKY::R1_R2,   CSKY::R2_R3,   CSKY::R3_R4,   CSKY::R4_R5,
+    CSKY::R5_R6,   CSKY::R6_R7,   CSKY::R7_R8,   CSKY::R8_R9,   CSKY::R9_R10,
+    CSKY::R10_R11, CSKY::R11_R12, CSKY::R12_R13, CSKY::R13_R14, CSKY::R14_R15,
+    CSKY::R15_R16, CSKY::R16_R17, CSKY::R17_R18, CSKY::R18_R19, CSKY::R19_R20,
+    CSKY::R20_R21, CSKY::R21_R22, CSKY::R22_R23, CSKY::R23_R24, CSKY::R24_R25,
+    CSKY::R25_R26, CSKY::R26_R27, CSKY::R27_R28, CSKY::R28_R29, CSKY::R29_R30,
+    CSKY::R30_R31, CSKY::R31_R32};
+
+static const uint16_t FPR32DecoderTable[] = {
+    CSKY::F0_32,  CSKY::F1_32,  CSKY::F2_32,  CSKY::F3_32,  CSKY::F4_32,
+    CSKY::F5_32,  CSKY::F6_32,  CSKY::F7_32,  CSKY::F8_32,  CSKY::F9_32,
+    CSKY::F10_32, CSKY::F11_32, CSKY::F12_32, CSKY::F13_32, CSKY::F14_32,
+    CSKY::F15_32, CSKY::F16_32, CSKY::F17_32, CSKY::F18_32, CSKY::F19_32,
+    CSKY::F20_32, CSKY::F21_32, CSKY::F22_32, CSKY::F23_32, CSKY::F24_32,
+    CSKY::F25_32, CSKY::F26_32, CSKY::F27_32, CSKY::F28_32, CSKY::F29_32,
+    CSKY::F30_32, CSKY::F31_32};
+
+static const uint16_t FPR64DecoderTable[] = {
+    CSKY::F0_64,  CSKY::F1_64,  CSKY::F2_64,  CSKY::F3_64,  CSKY::F4_64,
+    CSKY::F5_64,  CSKY::F6_64,  CSKY::F7_64,  CSKY::F8_64,  CSKY::F9_64,
+    CSKY::F10_64, CSKY::F11_64, CSKY::F12_64, CSKY::F13_64, CSKY::F14_64,
+    CSKY::F15_64, CSKY::F16_64, CSKY::F17_64, CSKY::F18_64, CSKY::F19_64,
+    CSKY::F20_64, CSKY::F21_64, CSKY::F22_64, CSKY::F23_64, CSKY::F24_64,
+    CSKY::F25_64, CSKY::F26_64, CSKY::F27_64, CSKY::F28_64,
CSKY::F29_64, + CSKY::F30_64, CSKY::F31_64}; + +static const uint16_t FPR128DecoderTable[] = { + CSKY::F0_128, CSKY::F1_128, CSKY::F2_128, CSKY::F3_128, CSKY::F4_128, + CSKY::F5_128, CSKY::F6_128, CSKY::F7_128, CSKY::F8_128, CSKY::F9_128, + CSKY::F10_128, CSKY::F11_128, CSKY::F12_128, CSKY::F13_128, CSKY::F14_128, + CSKY::F15_128, CSKY::F16_128, CSKY::F17_128, CSKY::F18_128, CSKY::F19_128, + CSKY::F20_128, CSKY::F21_128, CSKY::F22_128, CSKY::F23_128, CSKY::F24_128, + CSKY::F25_128, CSKY::F26_128, CSKY::F27_128, CSKY::F28_128, CSKY::F29_128, + CSKY::F30_128, CSKY::F31_128}; + +static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR32DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodesFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 16) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR32DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodesFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 16) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodesFPR64_VRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 16) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +// TODO +LLVM_ATTRIBUTE_UNUSED +static DecodeStatus DecodesFPR128RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 16) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(FPR128DecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodesGPRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 16) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodemGPRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 8) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +// TODO +LLVM_ATTRIBUTE_UNUSED +static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo != 14) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus 
DecodeGPRPairRegisterClass(MCInst &Inst, uint64_t RegNo,
+                           uint64_t Address,
+                           const MCDisassembler *Decoder) {
+  const FeatureBitset &FeatureBits =
+      Decoder->getSubtargetInfo().getFeatureBits();
+  bool hasHighReg = FeatureBits[CSKY::FeatureHighreg];
+
+  if (RegNo >= 32 || (!hasHighReg && RegNo >= 16))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+template <unsigned N, unsigned S = 0>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address,
+                                      const MCDisassembler *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(Imm << S));
+  return MCDisassembler::Success;
+}
+
+template <unsigned N>
+static DecodeStatus decodeOImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address,
+                                      const MCDisassembler *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(Imm + 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeLRW16Imm8(MCInst &Inst, uint64_t Imm, int64_t Address,
+                                    const MCDisassembler *Decoder) {
+  assert(isUInt<8>(Imm) && "Invalid immediate");
+  if ((Imm >> 7) & 0x1) {
+    Inst.addOperand(MCOperand::createImm((Imm & 0x7F) << 2));
+  } else {
+    uint64_t V = ((Imm ^ 0xFFFFFFFF) & 0xFF);
+    Inst.addOperand(MCOperand::createImm(V << 2));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeJMPIXImmOperand(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  assert(isUInt<2>(Imm) && "Invalid immediate");
+
+  if (Imm == 0)
+    Inst.addOperand(MCOperand::createImm(16));
+  else if (Imm == 1)
+    Inst.addOperand(MCOperand::createImm(24));
+  else if (Imm == 2)
+    Inst.addOperand(MCOperand::createImm(32));
+  else if (Imm == 3)
+    Inst.addOperand(MCOperand::createImm(40));
+  else
+    return MCDisassembler::Fail;
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegSeqOperand(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address,
+                                        const MCDisassembler *Decoder) {
+  assert(isUInt<10>(Imm) && "Invalid immediate");
+
+  auto Imm5 = Imm & 0x1f;
+  auto Ry = (Imm >> 5) & 0x1f;
+
+  if (DecodeGPRRegisterClass(Inst, Ry, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Ry + Imm5]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegSeqOperandF1(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  assert(isUInt<10>(Imm) && "Invalid immediate");
+
+  auto Imm5 = Imm & 0x1f;
+  auto Ry = (Imm >> 5) & 0x1f;
+
+  if (DecodesFPR32RegisterClass(Inst, Ry, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPR32DecoderTable[Ry + Imm5]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegSeqOperandD1(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  assert(isUInt<10>(Imm) && "Invalid immediate");
+
+  auto Imm5 = Imm & 0x1f;
+  auto Ry = (Imm >> 5) & 0x1f;
+
+  if (DecodesFPR64RegisterClass(Inst, Ry, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[Ry + Imm5]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegSeqOperandF2(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  assert(isUInt<10>(Imm) && "Invalid immediate");
+
+  auto Imm5 = Imm & 0x1f;
+  auto Ry = (Imm >> 5) & 0x1f;
+
+  if
(DecodeFPR32RegisterClass(Inst, Ry, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPR32DecoderTable[Ry + Imm5]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegSeqOperandD2(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  assert(isUInt<10>(Imm) && "Invalid immediate");
+
+  auto Imm5 = Imm & 0x1f;
+  auto Ry = (Imm >> 5) & 0x1f;
+
+  if (DecodeFPR64RegisterClass(Inst, Ry, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[Ry + Imm5]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeImmShiftOpValue(MCInst &Inst, uint64_t Imm,
+                                          int64_t Address,
+                                          const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createImm(Log2(Imm)));
+  return MCDisassembler::Success;
+}
+
+template <unsigned N, unsigned S = 0>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address,
+                                      const MCDisassembler *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  // Sign-extend the number in the bottom N bits of Imm
+  Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm) << S));
+  return MCDisassembler::Success;
+}
+
+#include "CSKYGenDisassemblerTables.inc"
+
+DecodeStatus CSKYDisassembler::handleCROperand(MCInst &MI) const {
+
+  // FIXME: To query instruction info from td file or a table inc file
+  switch (MI.getOpcode()) {
+  default:
+    return MCDisassembler::Success;
+  case CSKY::LD16WSP:
+  case CSKY::ST16WSP:
+  case CSKY::ADDI16ZSP:
+    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::R14));
+    return MCDisassembler::Success;
+  case CSKY::ADDI16SPSP:
+  case CSKY::SUBI16SPSP:
+    MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
+    MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
+    return MCDisassembler::Success;
+  case CSKY::FCMPHS_S:
+  case CSKY::FCMPHS_D:
+  case CSKY::FCMPLT_S:
+  case CSKY::FCMPLT_D:
+  case CSKY::FCMPNE_S:
+  case CSKY::FCMPNE_D:
+  case CSKY::FCMPUO_S:
+  case CSKY::FCMPUO_D:
+  case CSKY::FCMPZHS_S:
+  case CSKY::FCMPZHS_D:
+  case CSKY::FCMPZLS_S:
+  case CSKY::FCMPZLS_D:
+  case CSKY::FCMPZNE_S:
+  case CSKY::FCMPZNE_D:
+  case CSKY::FCMPZUO_S:
+  case CSKY::FCMPZUO_D:
+  case CSKY::f2FCMPHS_S:
+  case CSKY::f2FCMPHS_D:
+  case CSKY::f2FCMPLT_S:
+  case CSKY::f2FCMPLT_D:
+  case CSKY::f2FCMPNE_S:
+  case CSKY::f2FCMPNE_D:
+  case CSKY::f2FCMPUO_S:
+  case CSKY::f2FCMPUO_D:
+  case CSKY::f2FCMPHSZ_S:
+  case CSKY::f2FCMPHSZ_D:
+  case CSKY::f2FCMPHZ_S:
+  case CSKY::f2FCMPHZ_D:
+  case CSKY::f2FCMPLSZ_S:
+  case CSKY::f2FCMPLSZ_D:
+  case CSKY::f2FCMPLTZ_S:
+  case CSKY::f2FCMPLTZ_D:
+  case CSKY::f2FCMPNEZ_S:
+  case CSKY::f2FCMPNEZ_D:
+  case CSKY::f2FCMPUOZ_S:
+  case CSKY::f2FCMPUOZ_D:
+
+  case CSKY::BT32:
+  case CSKY::BF32:
+  case CSKY::BT16:
+  case CSKY::BF16:
+  case CSKY::CMPNEI32:
+  case CSKY::CMPNEI16:
+  case CSKY::CMPNE32:
+  case CSKY::CMPNE16:
+  case CSKY::CMPHSI32:
+  case CSKY::CMPHSI16:
+  case CSKY::CMPHS32:
+  case CSKY::CMPHS16:
+  case CSKY::CMPLTI32:
+  case CSKY::CMPLTI16:
+  case CSKY::CMPLT32:
+  case CSKY::CMPLT16:
+  case CSKY::BTSTI32:
+  case CSKY::BTSTI16:
+  case CSKY::TSTNBZ32:
+  case CSKY::TSTNBZ16:
+  case CSKY::TST32:
+  case CSKY::TST16:
+    MI.insert(MI.begin(), MCOperand::createReg(CSKY::C));
+    return MCDisassembler::Success;
+  case CSKY::LSLC32:
+  case CSKY::LSRC32:
+  case CSKY::ASRC32:
+    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
+    return MCDisassembler::Success;
+  case CSKY::MOVF32:
+  case CSKY::MOVT32:
+  case CSKY::MVC32:
+  case CSKY::MVCV32:
+  case CSKY::MVCV16:
+  case CSKY::INCT32:
+  case CSKY::INCF32:
+  case CSKY::DECT32:
+  case CSKY::DECF32:
+  case CSKY::DECGT32:
+  case CSKY::DECLT32:
+  case CSKY::DECNE32:
+  case CSKY::CLRF32:
+  case CSKY::CLRT32:
+  case CSKY::f2FSEL_S:
+  case CSKY::f2FSEL_D:
+    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
+    return MCDisassembler::Success;
+  case CSKY::ADDC32:
+  case CSKY::ADDC16:
+  case CSKY::SUBC32:
+  case CSKY::SUBC16:
+  case CSKY::XSR32:
+    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
+    MI.insert(MI.end(), MCOperand::createReg(CSKY::C));
+    return MCDisassembler::Success;
+  case CSKY::INS32:
+    MI.getOperand(3).setImm(MI.getOperand(3).getImm() +
+                            MI.getOperand(4).getImm());
+    return MCDisassembler::Success;
+  }
+}
+
+static bool decodeFPUV3Instruction(MCInst &MI, uint32_t insn, uint64_t Address,
+                                   const MCDisassembler *DisAsm,
+                                   const MCSubtargetInfo &STI) {
+  LLVM_DEBUG(dbgs() << "Trying CSKY 32-bit fpuv3 table :\n");
+  if (!STI.getFeatureBits()[CSKY::FeatureFPUV3_HF] &&
+      !STI.getFeatureBits()[CSKY::FeatureFPUV3_SF] &&
+      !STI.getFeatureBits()[CSKY::FeatureFPUV3_DF])
+    return false;
+
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableFPUV332, MI, insn, Address, DisAsm, STI);
+
+  if (Result == MCDisassembler::Fail) {
+    MI.clear();
+    return false;
+  }
+
+  return true;
+}
+
+DecodeStatus CSKYDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &CS) const {
+
+  uint32_t Insn;
+  DecodeStatus Result = MCDisassembler::Fail;
+
+  Insn = support::endian::read16le(Bytes.data());
+
+  if ((Insn >> 14) == 0x3) {
+    if (Bytes.size() < 4) {
+      Size = 0;
+      return MCDisassembler::Fail;
+    }
+    Insn = (Insn << 16) | support::endian::read16le(&Bytes[2]);
+
+    if (decodeFPUV3Instruction(MI, Insn, Address, this, STI))
+      Result = MCDisassembler::Success;
+    else {
+      LLVM_DEBUG(dbgs() << "Trying CSKY 32-bit table :\n");
+      Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
+    }
+
+    Size = 4;
+  } else {
+    if (Bytes.size() < 2) {
+      Size = 0;
+      return MCDisassembler::Fail;
+    }
+    LLVM_DEBUG(dbgs() << "Trying CSKY 16-bit table :\n");
+    Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI);
+    Size = 2;
+  }
+
+  handleCROperand(MI);
+
+  return Result;
+}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index daa655416c47..b5dfdfa0b42b 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -88,6 +88,13 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   switch (Fixup.getTargetKind()) {
   default:
     llvm_unreachable("Unknown fixup kind!");
+  case CSKY::fixup_csky_got32:
+  case CSKY::fixup_csky_got_imm18_scale4:
+  case CSKY::fixup_csky_gotoff:
+  case CSKY::fixup_csky_gotpc:
+  case CSKY::fixup_csky_plt32:
+  case CSKY::fixup_csky_plt_imm18_scale4:
+    llvm_unreachable("Relocation should be unconditionally forced\n");
   case FK_Data_1:
   case FK_Data_2:
   case FK_Data_4:
@@ -123,6 +130,71 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
       Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned.");
 
     return (Value >> 1) & 0x3ffff;
+  case CSKY::fixup_csky_pcrel_uimm8_scale4: {
+    if (!isUIntN(10, Value))
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 4-byte aligned.");
+
+    unsigned IMM4L =
(Value >> 2) & 0xf;
+    unsigned IMM4H = (Value >> 6) & 0xf;
+
+    Value = (IMM4H << 21) | (IMM4L << 4);
+    return Value;
+  }
+  case CSKY::fixup_csky_pcrel_imm10_scale2:
+    if (!isIntN(11, Value))
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned.");
+
+    return (Value >> 1) & 0x3ff;
+  case CSKY::fixup_csky_pcrel_uimm7_scale4:
+    if (!isUIntN(9, Value))
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value.");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 4-byte aligned.");
+
+    if ((Value & 0xff) <= 0b111111100) {
+      unsigned IMM5L = (Value >> 2) & 0x1f;
+      unsigned IMM2H = (Value >> 7) & 0x3;
+
+      Value = (1 << 12) | (IMM2H << 8) | IMM5L;
+    } else {
+      unsigned IMM5L = (~Value >> 2) & 0x1f;
+      unsigned IMM2H = (~Value >> 7) & 0x3;
+
+      Value = (IMM2H << 8) | IMM5L;
+    }
+
+    return Value & 0xffff;
+  }
+}
+
+bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+                                                  bool Resolved, uint64_t Value,
+                                                  const MCRelaxableFragment *DF,
+                                                  const MCAsmLayout &Layout,
+                                                  const bool WasForced) const {
+  // Return true if the symbol is actually unresolved.
+  // Resolved could be always false when shouldForceRelocation return true.
+  // We use !WasForced to indicate that the symbol is unresolved and not forced
+  // by shouldForceRelocation.
+  if (!Resolved && !WasForced)
+    return true;
+
+  int64_t Offset = int64_t(Value);
+  switch (Fixup.getTargetKind()) {
+  default:
+    return false;
+  case CSKY::fixup_csky_pcrel_imm10_scale2:
+    return !isShiftedInt<10, 1>(Offset);
+  case CSKY::fixup_csky_pcrel_imm16_scale2:
+    return !isShiftedInt<16, 1>(Offset);
+  case CSKY::fixup_csky_pcrel_imm26_scale2:
+    return !isShiftedInt<26, 1>(Offset);
+  case CSKY::fixup_csky_pcrel_uimm7_scale4:
+    return !isShiftedUInt<8, 2>(Offset);
  }
}
@@ -152,8 +224,9 @@ void CSKYAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
   // For each byte of the fragment that the fixup touches, mask in the
   // bits from the fixup value.
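+  // Illustration (assumed encoding, not part of the upstream change): CSKY
+  // 32-bit instructions are stored as two little-endian halfwords with the
+  // most-significant halfword first, so an instruction fixup with
+  // Value = 0xAABBCCDD is masked in below in byte order BB AA DD CC, while
+  // plain data fixups keep ordinary little-endian byte order.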
bool IsLittleEndian = (Endian == support::little); + bool IsInstFixup = (Kind >= FirstTargetFixupKind); - if (IsLittleEndian && (NumBytes == 4)) { + if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) { Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff); Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff); Data[Offset + 2] |= uint8_t(Value & 0xff); @@ -166,6 +239,50 @@ void CSKYAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, } } +bool CSKYAsmBackend::mayNeedRelaxation(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + switch (Inst.getOpcode()) { + default: + return false; + case CSKY::JBR32: + case CSKY::JBT32: + case CSKY::JBF32: + case CSKY::JBSR32: + if (!STI.getFeatureBits()[CSKY::Has2E3]) + return false; + return true; + case CSKY::JBR16: + case CSKY::JBT16: + case CSKY::JBF16: + case CSKY::LRW16: + case CSKY::BR16: + return true; + } +} + +bool CSKYAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return true; + switch (Fixup.getTargetKind()) { + default: + break; + case CSKY::fixup_csky_got32: + case CSKY::fixup_csky_got_imm18_scale4: + case CSKY::fixup_csky_gotoff: + case CSKY::fixup_csky_gotpc: + case CSKY::fixup_csky_plt32: + case CSKY::fixup_csky_plt_imm18_scale4: + case CSKY::fixup_csky_doffset_imm18: + case CSKY::fixup_csky_doffset_imm18_scale2: + case CSKY::fixup_csky_doffset_imm18_scale4: + return true; + } + + return false; +} + bool CSKYAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const { @@ -174,23 +291,62 @@ bool CSKYAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, void CSKYAsmBackend::relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const { - llvm_unreachable("CSKYAsmBackend::relaxInstruction() unimplemented"); -} + MCInst Res; -bool CSKYAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, - const MCSubtargetInfo *STI) const { - if (Count % 2) - return false; + switch (Inst.getOpcode()) { + default: + LLVM_DEBUG(Inst.dump()); + llvm_unreachable("Opcode not expected!"); + case CSKY::LRW16: + Res.setOpcode(CSKY::LRW32); + Res.addOperand(Inst.getOperand(0)); + Res.addOperand(Inst.getOperand(1)); + break; + case CSKY::BR16: + Res.setOpcode(CSKY::BR32); + Res.addOperand(Inst.getOperand(0)); + break; + case CSKY::JBSR32: + Res.setOpcode(CSKY::JSRI32); + Res.addOperand(Inst.getOperand(1)); + break; + case CSKY::JBR32: + Res.setOpcode(CSKY::JMPI32); + Res.addOperand(Inst.getOperand(1)); + break; + case CSKY::JBT32: + case CSKY::JBF32: + Res.setOpcode(Inst.getOpcode() == CSKY::JBT32 ? CSKY::JBT_E : CSKY::JBF_E); + Res.addOperand(Inst.getOperand(0)); + Res.addOperand(Inst.getOperand(1)); + Res.addOperand(Inst.getOperand(2)); + break; + case CSKY::JBR16: + Res.setOpcode(CSKY::JBR32); + Res.addOperand(Inst.getOperand(0)); + Res.addOperand(Inst.getOperand(1)); + break; + case CSKY::JBT16: + case CSKY::JBF16: + // ck801 + unsigned opcode; + if (STI.getFeatureBits()[CSKY::HasE2]) + opcode = Inst.getOpcode() == CSKY::JBT16 ? CSKY::JBT32 : CSKY::JBF32; + else + opcode = Inst.getOpcode() == CSKY::JBT16 ? 
CSKY::JBT_E : CSKY::JBF_E;
-  // MOV32 r0, r0
-  while (Count >= 4) {
-    OS.write("\xc4\x00\x48\x20", 4);
-    Count -= 4;
+    Res.setOpcode(opcode);
+    Res.addOperand(Inst.getOperand(0));
+    Res.addOperand(Inst.getOperand(1));
+    Res.addOperand(Inst.getOperand(2));
+    break;
   }
-  // MOV16 r0, r0
-  if (Count)
-    OS.write("\x6c\x03", 2);
+  Inst = std::move(Res);
+}
 
+bool CSKYAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+                                  const MCSubtargetInfo *STI) const {
+  OS.write_zeros(Count);
   return true;
 }
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index e710954e9df8..09b3ce6cc82b 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -11,6 +11,7 @@
 
 #include "MCTargetDesc/CSKYFixupKinds.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCTargetOptions.h"
 
 namespace llvm {
@@ -39,9 +40,21 @@ public:
   void relaxInstruction(MCInst &Inst,
                         const MCSubtargetInfo &STI) const override;
 
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override;
+
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout,
+                                    const bool WasForced) const override;
+
   bool writeNopData(raw_ostream &OS, uint64_t Count,
                     const MCSubtargetInfo *STI) const override;
 
+  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+                             const MCValue &Target) override;
+
   std::unique_ptr<MCObjectTargetWriter>
   createObjectTargetWriter() const override;
 };
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
index 163632632290..d7cc4c8525ee 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CSKYFixupKinds.h"
+#include "CSKYMCExpr.h"
 #include "CSKYMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -33,10 +35,112 @@ unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx,
                                            const MCValue &Target,
                                            const MCFixup &Fixup,
                                            bool IsPCRel) const {
-  // Determine the type of the relocation.
-  switch ((unsigned)Fixup.getKind()) {
+  const MCExpr *Expr = Fixup.getValue();
+  // Determine the type of the relocation
+  unsigned Kind = Fixup.getTargetKind();
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+
+  if (IsPCRel) {
+    switch (Kind) {
+    default:
+      LLVM_DEBUG(dbgs() << "Unknown Kind1  = " << Kind);
+      Ctx.reportError(Fixup.getLoc(), "Unsupported relocation type");
+      return ELF::R_CKCORE_NONE;
+    case FK_Data_4:
+    case FK_PCRel_4:
+      return ELF::R_CKCORE_PCREL32;
+    case CSKY::fixup_csky_pcrel_uimm16_scale4:
+      return ELF::R_CKCORE_PCREL_IMM16_4;
+    case CSKY::fixup_csky_pcrel_uimm8_scale4:
+      return ELF::R_CKCORE_PCREL_IMM8_4;
+    case CSKY::fixup_csky_pcrel_imm26_scale2:
+      return ELF::R_CKCORE_PCREL_IMM26_2;
+    case CSKY::fixup_csky_pcrel_imm18_scale2:
+      return ELF::R_CKCORE_PCREL_IMM18_2;
+    case CSKY::fixup_csky_pcrel_imm16_scale2:
+      return ELF::R_CKCORE_PCREL_IMM16_2;
+    case CSKY::fixup_csky_pcrel_imm10_scale2:
+      return ELF::R_CKCORE_PCREL_IMM10_2;
+    case CSKY::fixup_csky_pcrel_uimm7_scale4:
+      return ELF::R_CKCORE_PCREL_IMM7_4;
+    }
+  }
+
+  switch (Kind) {
   default:
-    llvm_unreachable("invalid fixup kind!");
+    LLVM_DEBUG(dbgs() << "Unknown Kind2 = " << Kind);
+    Ctx.reportError(Fixup.getLoc(), "Unsupported relocation type");
+    return ELF::R_CKCORE_NONE;
+  case FK_Data_1:
+    Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+    return ELF::R_CKCORE_NONE;
+  case FK_Data_2:
+    Ctx.reportError(Fixup.getLoc(), "2-byte data relocations not supported");
+    return ELF::R_CKCORE_NONE;
+  case FK_Data_4:
+    if (Expr->getKind() == MCExpr::Target) {
+      auto TK = cast<CSKYMCExpr>(Expr)->getKind();
+      if (TK == CSKYMCExpr::VK_CSKY_ADDR)
+        return ELF::R_CKCORE_ADDR32;
+      if (TK == CSKYMCExpr::VK_CSKY_GOT)
+        return ELF::R_CKCORE_GOT32;
+      if (TK == CSKYMCExpr::VK_CSKY_GOTOFF)
+        return ELF::R_CKCORE_GOTOFF;
+      if (TK == CSKYMCExpr::VK_CSKY_PLT)
+        return ELF::R_CKCORE_PLT32;
+      if (TK == CSKYMCExpr::VK_CSKY_TLSIE)
+        return ELF::R_CKCORE_TLS_IE32;
+      if (TK == CSKYMCExpr::VK_CSKY_TLSLE)
+        return ELF::R_CKCORE_TLS_LE32;
+      if (TK == CSKYMCExpr::VK_CSKY_TLSGD)
+        return ELF::R_CKCORE_TLS_GD32;
+      if (TK == CSKYMCExpr::VK_CSKY_TLSLDM)
+        return ELF::R_CKCORE_TLS_LDM32;
+      if (TK == CSKYMCExpr::VK_CSKY_TLSLDO)
+        return ELF::R_CKCORE_TLS_LDO32;
+      if (TK == CSKYMCExpr::VK_CSKY_GOTPC)
+        return ELF::R_CKCORE_GOTPC;
+      if (TK == CSKYMCExpr::VK_CSKY_None)
+        return ELF::R_CKCORE_ADDR32;
+
+      LLVM_DEBUG(dbgs() << "Unknown FK_Data_4 TK = " << TK);
+      Ctx.reportError(Fixup.getLoc(), "unknown target FK_Data_4");
+    } else {
+      switch (Modifier) {
+      default:
+        Ctx.reportError(Fixup.getLoc(),
+                        "invalid fixup for 4-byte data relocation");
+        return ELF::R_CKCORE_NONE;
+      case MCSymbolRefExpr::VK_GOT:
+        return ELF::R_CKCORE_GOT32;
+      case MCSymbolRefExpr::VK_GOTOFF:
+        return ELF::R_CKCORE_GOTOFF;
+      case MCSymbolRefExpr::VK_PLT:
+        return ELF::R_CKCORE_PLT32;
+      case MCSymbolRefExpr::VK_None:
+        return ELF::R_CKCORE_ADDR32;
+      }
+    }
+    return ELF::R_CKCORE_NONE;
+  case FK_Data_8:
+    Ctx.reportError(Fixup.getLoc(), "8-byte data relocations not supported");
+    return ELF::R_CKCORE_NONE;
+  case CSKY::fixup_csky_addr32:
+    return ELF::R_CKCORE_ADDR32;
+  case CSKY::fixup_csky_addr_hi16:
+    return ELF::R_CKCORE_ADDR_HI16;
+  case CSKY::fixup_csky_addr_lo16:
+    return ELF::R_CKCORE_ADDR_LO16;
+  case CSKY::fixup_csky_doffset_imm18:
+    return ELF::R_CKCORE_DOFFSET_IMM18;
+  case CSKY::fixup_csky_doffset_imm18_scale2:
+    return ELF::R_CKCORE_DOFFSET_IMM18_2;
+  case CSKY::fixup_csky_doffset_imm18_scale4:
+    return ELF::R_CKCORE_DOFFSET_IMM18_4;
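+  // Illustration (assumed assembly syntax): a GOT-relative literal load such
+  // as "lrw r0, foo@GOT" carries fixup_csky_got_imm18_scale4 and must reach
+  // the linker as R_CKCORE_GOT_IMM18_4; shouldForceRelocation() in
+  // CSKYAsmBackend.cpp keeps these GOT/PLT fixups from being resolved at
+  // assembly time.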
+  case CSKY::fixup_csky_got_imm18_scale4:
+    return ELF::R_CKCORE_GOT_IMM18_4;
+  case CSKY::fixup_csky_plt_imm18_scale4:
+    return ELF::R_CKCORE_PLT_IMM18_4;
   }
 }
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
new file mode 100644
index 000000000000..90775c1b70f2
--- /dev/null
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
@@ -0,0 +1,335 @@
+//===-- CSKYELFStreamer.cpp - CSKY ELF Target Streamer Methods ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides CSKY specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYELFStreamer.h"
+#include "CSKYMCTargetDesc.h"
+#include "MCTargetDesc/CSKYAsmBackend.h"
+#include "MCTargetDesc/CSKYBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/CSKYAttributes.h"
+#include "llvm/Support/CSKYTargetParser.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+// This part is for ELF object output.
+CSKYTargetELFStreamer::CSKYTargetELFStreamer(MCStreamer &S,
+                                             const MCSubtargetInfo &STI)
+    : CSKYTargetStreamer(S), CurrentVendor("csky") {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  const FeatureBitset &Features = STI.getFeatureBits();
+
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+
+  EFlags |= ELF::EF_CSKY_ABIV2;
+
+  if (Features[CSKY::ProcCK801])
+    EFlags |= ELF::EF_CSKY_801;
+  else if (Features[CSKY::ProcCK802])
+    EFlags |= ELF::EF_CSKY_802;
+  else if (Features[CSKY::ProcCK803])
+    EFlags |= ELF::EF_CSKY_803;
+  else if (Features[CSKY::ProcCK804])
+    EFlags |= ELF::EF_CSKY_803;
+  else if (Features[CSKY::ProcCK805])
+    EFlags |= ELF::EF_CSKY_805;
+  else if (Features[CSKY::ProcCK807])
+    EFlags |= ELF::EF_CSKY_807;
+  else if (Features[CSKY::ProcCK810])
+    EFlags |= ELF::EF_CSKY_810;
+  else if (Features[CSKY::ProcCK860])
+    EFlags |= ELF::EF_CSKY_860;
+  else
+    EFlags |= ELF::EF_CSKY_810;
+
+  if (Features[CSKY::FeatureFPUV2_SF] || Features[CSKY::FeatureFPUV3_SF])
+    EFlags |= ELF::EF_CSKY_FLOAT;
+
+  EFlags |= ELF::EF_CSKY_EFV1;
+
+  MCA.setELFHeaderEFlags(EFlags);
+}
+
+MCELFStreamer &CSKYTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void CSKYTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  setAttributeItem(Attribute, Value, /*OverwriteExisting=*/true);
+}
+
+void CSKYTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+                                              StringRef String) {
+  setAttributeItem(Attribute, String, /*OverwriteExisting=*/true);
+}
+
+void CSKYTargetELFStreamer::finishAttributeSection() {
+  if (Contents.empty())
+    return;
+
+  if (AttributeSection) {
+    Streamer.switchSection(AttributeSection);
+  } else {
+    MCAssembler &MCA = getStreamer().getAssembler();
+    AttributeSection = MCA.getContext().getELFSection(
+        ".csky.attributes", ELF::SHT_CSKY_ATTRIBUTES, 0);
+    Streamer.switchSection(AttributeSection);
+    Streamer.emitInt8(ELFAttrs::Format_Version);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() +
1;
+
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize);
+  Streamer.emitBytes(CurrentVendor);
+  Streamer.emitInt8(0); // '\0'
+
+  Streamer.emitInt8(ELFAttrs::File);
+  Streamer.emitInt32(TagHeaderSize + ContentsSize);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String).
+  for (AttributeItem item : Contents) {
+    Streamer.emitULEB128IntValue(item.Tag);
+    switch (item.Type) {
+    default:
+      llvm_unreachable("Invalid attribute type");
+    case AttributeType::Numeric:
+      Streamer.emitULEB128IntValue(item.IntValue);
+      break;
+    case AttributeType::Text:
+      Streamer.emitBytes(item.StringValue);
+      Streamer.emitInt8(0); // '\0'
+      break;
+    case AttributeType::NumericAndText:
+      Streamer.emitULEB128IntValue(item.IntValue);
+      Streamer.emitBytes(item.StringValue);
+      Streamer.emitInt8(0); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+}
+
+size_t CSKYTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (AttributeItem item : Contents) {
+    switch (item.Type) {
+    case AttributeType::Hidden:
+      break;
+    case AttributeType::Numeric:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      break;
+    case AttributeType::Text:
+      Result += getULEB128Size(item.Tag);
+      Result += item.StringValue.size() + 1; // string + '\0'
+      break;
+    case AttributeType::NumericAndText:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      Result += item.StringValue.size() + 1; // string + '\0';
+      break;
+    }
+  }
+  return Result;
+}
+
+void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) {
+  if (Name == "$d" && State == EMS_Data)
+    return;
+  if (Name == "$t" && State == EMS_Text)
+    return;
+  if (Name == "$t" && State == EMS_None) {
+    State = EMS_Text;
+    return;
+  }
+
+  State = (Name == "$t" ? EMS_Text : EMS_Data);
+
+  auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+      Name + "."
+ Twine(MappingSymbolCounter++))); + emitLabel(Symbol); + + Symbol->setType(ELF::STT_NOTYPE); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void CSKYTargetELFStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { + StringRef CPU = STI.getCPU(); + CSKY::ArchKind ArchID = CSKY::parseCPUArch(CPU); + + if (ArchID == CSKY::ArchKind::CK804) + ArchID = CSKY::ArchKind::CK803; + + StringRef CPU_ARCH = CSKY::getArchName(ArchID); + + if (ArchID == CSKY::ArchKind::INVALID) { + CPU = "ck810"; + CPU_ARCH = "ck810"; + } + emitTextAttribute(CSKYAttrs::CSKY_ARCH_NAME, CPU_ARCH); + emitTextAttribute(CSKYAttrs::CSKY_CPU_NAME, CPU); + + unsigned ISAFlag = 0; + if (STI.hasFeature(CSKY::HasE1)) + ISAFlag |= CSKYAttrs::V2_ISA_E1; + + if (STI.hasFeature(CSKY::HasE2)) + ISAFlag |= CSKYAttrs::V2_ISA_1E2; + + if (STI.hasFeature(CSKY::Has2E3)) + ISAFlag |= CSKYAttrs::V2_ISA_2E3; + + if (STI.hasFeature(CSKY::HasMP)) + ISAFlag |= CSKYAttrs::ISA_MP; + + if (STI.hasFeature(CSKY::Has3E3r1)) + ISAFlag |= CSKYAttrs::V2_ISA_3E3R1; + + if (STI.hasFeature(CSKY::Has3r1E3r2)) + ISAFlag |= CSKYAttrs::V2_ISA_3E3R2; + + if (STI.hasFeature(CSKY::Has3r2E3r3)) + ISAFlag |= CSKYAttrs::V2_ISA_3E3R3; + + if (STI.hasFeature(CSKY::Has3E7)) + ISAFlag |= CSKYAttrs::V2_ISA_3E7; + + if (STI.hasFeature(CSKY::HasMP1E2)) + ISAFlag |= CSKYAttrs::ISA_MP_1E2; + + if (STI.hasFeature(CSKY::Has7E10)) + ISAFlag |= CSKYAttrs::V2_ISA_7E10; + + if (STI.hasFeature(CSKY::Has10E60)) + ISAFlag |= CSKYAttrs::V2_ISA_10E60; + + if (STI.hasFeature(CSKY::FeatureTrust)) + ISAFlag |= CSKYAttrs::ISA_TRUST; + + if (STI.hasFeature(CSKY::FeatureJAVA)) + ISAFlag |= CSKYAttrs::ISA_JAVA; + + if (STI.hasFeature(CSKY::FeatureCache)) + ISAFlag |= CSKYAttrs::ISA_CACHE; + + if (STI.hasFeature(CSKY::FeatureNVIC)) + ISAFlag |= CSKYAttrs::ISA_NVIC; + + if (STI.hasFeature(CSKY::FeatureDSP)) + ISAFlag |= CSKYAttrs::ISA_DSP; + + if (STI.hasFeature(CSKY::HasDSP1E2)) + ISAFlag |= CSKYAttrs::ISA_DSP_1E2; + + if (STI.hasFeature(CSKY::HasDSPE60)) + ISAFlag |= CSKYAttrs::V2_ISA_DSPE60; + + if (STI.hasFeature(CSKY::FeatureDSPV2)) + ISAFlag |= CSKYAttrs::ISA_DSP_ENHANCE; + + if (STI.hasFeature(CSKY::FeatureDSP_Silan)) + ISAFlag |= CSKYAttrs::ISA_DSP_SILAN; + + if (STI.hasFeature(CSKY::FeatureVDSPV1_128)) + ISAFlag |= CSKYAttrs::ISA_VDSP; + + if (STI.hasFeature(CSKY::FeatureVDSPV2)) + ISAFlag |= CSKYAttrs::ISA_VDSP_2; + + if (STI.hasFeature(CSKY::HasVDSP2E3)) + ISAFlag |= CSKYAttrs::ISA_VDSP_2E3; + + if (STI.hasFeature(CSKY::HasVDSP2E60F)) + ISAFlag |= CSKYAttrs::ISA_VDSP_2E60F; + + emitAttribute(CSKYAttrs::CSKY_ISA_FLAGS, ISAFlag); + + unsigned ISAExtFlag = 0; + if (STI.hasFeature(CSKY::HasFLOATE1)) + ISAExtFlag |= CSKYAttrs::ISA_FLOAT_E1; + + if (STI.hasFeature(CSKY::HasFLOAT1E2)) + ISAExtFlag |= CSKYAttrs::ISA_FLOAT_1E2; + + if (STI.hasFeature(CSKY::HasFLOAT1E3)) + ISAExtFlag |= CSKYAttrs::ISA_FLOAT_1E3; + + if (STI.hasFeature(CSKY::HasFLOAT3E4)) + ISAExtFlag |= CSKYAttrs::ISA_FLOAT_3E4; + + if (STI.hasFeature(CSKY::HasFLOAT7E60)) + ISAExtFlag |= CSKYAttrs::ISA_FLOAT_7E60; + + emitAttribute(CSKYAttrs::CSKY_ISA_EXT_FLAGS, ISAExtFlag); + + if (STI.hasFeature(CSKY::FeatureDSP)) + emitAttribute(CSKYAttrs::CSKY_DSP_VERSION, + CSKYAttrs::DSP_VERSION_EXTENSION); + if (STI.hasFeature(CSKY::FeatureDSPV2)) + emitAttribute(CSKYAttrs::CSKY_DSP_VERSION, CSKYAttrs::DSP_VERSION_2); + + if (STI.hasFeature(CSKY::FeatureVDSPV2)) + emitAttribute(CSKYAttrs::CSKY_VDSP_VERSION, CSKYAttrs::VDSP_VERSION_2); + + if (STI.hasFeature(CSKY::FeatureFPUV2_SF) || + STI.hasFeature(CSKY::FeatureFPUV2_DF)) + 
emitAttribute(CSKYAttrs::CSKY_FPU_VERSION, CSKYAttrs::FPU_VERSION_2);
+  else if (STI.hasFeature(CSKY::FeatureFPUV3_HF) ||
+           STI.hasFeature(CSKY::FeatureFPUV3_SF) ||
+           STI.hasFeature(CSKY::FeatureFPUV3_DF))
+    emitAttribute(CSKYAttrs::CSKY_FPU_VERSION, CSKYAttrs::FPU_VERSION_3);
+
+  bool hasAnyFloatExt = STI.hasFeature(CSKY::FeatureFPUV2_SF) ||
+                        STI.hasFeature(CSKY::FeatureFPUV2_DF) ||
+                        STI.hasFeature(CSKY::FeatureFPUV3_HF) ||
+                        STI.hasFeature(CSKY::FeatureFPUV3_SF) ||
+                        STI.hasFeature(CSKY::FeatureFPUV3_DF);
+
+  if (hasAnyFloatExt && STI.hasFeature(CSKY::ModeHardFloat) &&
+      STI.hasFeature(CSKY::ModeHardFloatABI))
+    emitAttribute(CSKYAttrs::CSKY_FPU_ABI, CSKYAttrs::FPU_ABI_HARD);
+  else if (hasAnyFloatExt && STI.hasFeature(CSKY::ModeHardFloat))
+    emitAttribute(CSKYAttrs::CSKY_FPU_ABI, CSKYAttrs::FPU_ABI_SOFTFP);
+  else
+    emitAttribute(CSKYAttrs::CSKY_FPU_ABI, CSKYAttrs::FPU_ABI_SOFT);
+
+  unsigned HardFPFlag = 0;
+  if (STI.hasFeature(CSKY::FeatureFPUV3_HF))
+    HardFPFlag |= CSKYAttrs::FPU_HARDFP_HALF;
+  if (STI.hasFeature(CSKY::FeatureFPUV2_SF) ||
+      STI.hasFeature(CSKY::FeatureFPUV3_SF))
+    HardFPFlag |= CSKYAttrs::FPU_HARDFP_SINGLE;
+  if (STI.hasFeature(CSKY::FeatureFPUV2_DF) ||
+      STI.hasFeature(CSKY::FeatureFPUV3_DF))
+    HardFPFlag |= CSKYAttrs::FPU_HARDFP_DOUBLE;
+
+  if (HardFPFlag != 0) {
+    emitAttribute(CSKYAttrs::CSKY_FPU_DENORMAL, CSKYAttrs::NEEDED);
+    emitAttribute(CSKYAttrs::CSKY_FPU_EXCEPTION, CSKYAttrs::NEEDED);
+    emitTextAttribute(CSKYAttrs::CSKY_FPU_NUMBER_MODULE, "IEEE 754");
+    emitAttribute(CSKYAttrs::CSKY_FPU_HARDFP, HardFPFlag);
+  }
+}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
new file mode 100644
index 000000000000..b7931e922279
--- /dev/null
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
@@ -0,0 +1,148 @@
+//===-- CSKYELFStreamer.h - CSKY ELF Target Streamer -----------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYELFSTREAMER_H
+#define LLVM_LIB_TARGET_CSKY_CSKYELFSTREAMER_H
+
+#include "CSKYTargetStreamer.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+namespace llvm {
+
+class CSKYTargetELFStreamer : public CSKYTargetStreamer {
+private:
+  enum class AttributeType { Hidden, Numeric, Text, NumericAndText };
+
+  struct AttributeItem {
+    AttributeType Type;
+    unsigned Tag;
+    unsigned IntValue;
+    std::string StringValue;
+  };
+
+  StringRef CurrentVendor;
+  SmallVector<AttributeItem, 64> Contents;
+
+  MCSection *AttributeSection = nullptr;
+
+  AttributeItem *getAttributeItem(unsigned Attribute) {
+    for (size_t i = 0; i < Contents.size(); ++i)
+      if (Contents[i].Tag == Attribute)
+        return &Contents[i];
+    return nullptr;
+  }
+
+  void setAttributeItem(unsigned Attribute, unsigned Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item.
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeType::Numeric;
+      Item->IntValue = Value;
+      return;
+    }
+
+    // Create new attribute item.
+    Contents.push_back({AttributeType::Numeric, Attribute, Value, ""});
+  }
+
+  void setAttributeItem(unsigned Attribute, StringRef Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item.
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeType::Text;
+      Item->StringValue = std::string(Value);
+      return;
+    }
+
+    // Create new attribute item.
+    Contents.push_back({AttributeType::Text, Attribute, 0, std::string(Value)});
+  }
+
+  void setAttributeItems(unsigned Attribute, unsigned IntValue,
+                         StringRef StringValue, bool OverwriteExisting) {
+    // Look for existing attribute item.
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeType::NumericAndText;
+      Item->IntValue = IntValue;
+      Item->StringValue = std::string(StringValue);
+      return;
+    }
+
+    // Create new attribute item.
+    Contents.push_back({AttributeType::NumericAndText, Attribute, IntValue,
+                        std::string(StringValue)});
+  }
+
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void finishAttributeSection() override;
+  size_t calculateContentSize() const;
+
+  void emitTargetAttributes(const MCSubtargetInfo &STI) override;
+
+public:
+  MCELFStreamer &getStreamer();
+  CSKYTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+};
+
+class CSKYELFStreamer : public MCELFStreamer {
+  int64_t MappingSymbolCounter = 0;
+
+  void EmitMappingSymbol(StringRef Name);
+
+public:
+  friend class CSKYTargetELFStreamer;
+
+  enum ElfMappingSymbol { EMS_None, EMS_Text, EMS_Data };
+
+  ElfMappingSymbol State;
+
+  CSKYELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
+                  std::unique_ptr<MCObjectWriter> OW,
+                  std::unique_ptr<MCCodeEmitter> Emitter)
+      : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+                      std::move(Emitter)),
+        State(EMS_None) {}
+
+  ~CSKYELFStreamer() override = default;
+
+  void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+                SMLoc Loc) override {
+    EmitMappingSymbol("$d");
+    MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
+  }
+  void emitBytes(StringRef Data) override {
+    EmitMappingSymbol("$d");
+    MCELFStreamer::emitBytes(Data);
+  }
+  void emitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
+    EmitMappingSymbol("$t");
+    MCELFStreamer::emitInstruction(Inst, STI);
+  }
+  void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+    EmitMappingSymbol("$d");
+    MCELFStreamer::emitValueImpl(Value, Size, Loc);
+  }
+  void reset() override {
+    MappingSymbolCounter = 0;
+    State = EMS_None;
+    MCELFStreamer::reset();
+  }
+};
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
index 07757f03c258..3a0017d11e23 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
@@ -9,16 +9,21 @@
 // This class prints an CSKY MCInst to a .s file.
 //
 //===----------------------------------------------------------------------===//
-
 #include "CSKYInstPrinter.h"
+#include "MCTargetDesc/CSKYBaseInfo.h"
+#include "MCTargetDesc/CSKYMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -55,6 +60,14 @@ bool CSKYInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
     ArchRegNames = true;
     return true;
   }
+  if (Opt == "debug") {
+    DebugFlag = true;
+    return true;
+  }
+  if (Opt == "abi-names") {
+    ABIRegNames = true;
+    return true;
+  }
   return false;
 }
@@ -70,7 +83,11 @@ void CSKYInstPrinter::printInst(const MCInst *MI, uint64_t Address,
 }
 
 void CSKYInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
-  O << getRegisterName(RegNo);
+  if (PrintBranchImmAsAddress)
+    O << getRegisterName(RegNo, ABIRegNames ? CSKY::ABIRegAltName
+                                            : CSKY::NoRegAltName);
+  else
+    O << getRegisterName(RegNo);
 }
 
 void CSKYInstPrinter::printFPRRegName(raw_ostream &O, unsigned RegNo) const {
@@ -87,15 +104,38 @@
   const MCOperand &MO = MI->getOperand(OpNo);
 
   if (MO.isReg()) {
-    if (MO.getReg() == CSKY::C)
-      O << "";
+    unsigned Reg = MO.getReg();
+    bool useABIName = false;
+    if (PrintBranchImmAsAddress)
+      useABIName = ABIRegNames;
     else
-      printRegName(O, MO.getReg());
+      useABIName = !ArchRegNames;
+
+    if (Reg == CSKY::C)
+      O << "";
+    else if (STI.getFeatureBits()[CSKY::FeatureJAVA]) {
+      if (Reg == CSKY::R23)
+        O << (useABIName ? "fp" : "r23");
+      else if (Reg == CSKY::R24)
+        O << (useABIName ? "top" : "r24");
+      else if (Reg == CSKY::R25)
"bsp" : "r25"); + else + printRegName(O, Reg); + } else + printRegName(O, Reg); + return; } if (MO.isImm()) { - O << formatImm(MO.getImm()); + uint64_t TSFlags = MII.get(MI->getOpcode()).TSFlags; + + if (((TSFlags & CSKYII::AddrModeMask) != CSKYII::AddrModeNone) && + PrintBranchImmAsAddress) + O << formatHex(MO.getImm()); + else + O << MO.getImm(); return; } @@ -157,6 +197,22 @@ void CSKYInstPrinter::printCSKYSymbolOperand(const MCInst *MI, uint64_t Address, } } +void CSKYInstPrinter::printPSRFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + auto V = MI->getOperand(OpNo).getImm(); + + ListSeparator LS; + + if ((V >> 3) & 0x1) + O << LS << "ee"; + if ((V >> 2) & 0x1) + O << LS << "ie"; + if ((V >> 1) & 0x1) + O << LS << "fe"; + if ((V >> 0) & 0x1) + O << LS << "af"; +} + void CSKYInstPrinter::printRegisterSeq(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp index 1d220b749cb1..540f901fd479 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp @@ -16,6 +16,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" using namespace llvm; @@ -64,15 +67,170 @@ static void writeData(uint32_t Bin, unsigned Size, raw_ostream &OS) { support::endian::write(OS, LO16, support::little); } +void CSKYMCCodeEmitter::expandJBTF(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + MCInst TmpInst; + + uint32_t Binary; + + TmpInst = + MCInstBuilder(MI.getOpcode() == CSKY::JBT_E ? CSKY::BF16 : CSKY::BT16) + .addOperand(MI.getOperand(0)) + .addImm(6); + Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); + writeData(Binary, 2, OS); + + if (!STI.getFeatureBits()[CSKY::Has2E3]) + TmpInst = MCInstBuilder(CSKY::BR32) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)); + else + TmpInst = MCInstBuilder(CSKY::JMPI32).addOperand(MI.getOperand(2)); + Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); + Fixups[Fixups.size() - 1].setOffset(2); + writeData(Binary, 4, OS); +} + +void CSKYMCCodeEmitter::expandNEG(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + MCInst TmpInst; + uint32_t Binary; + unsigned Size = MI.getOpcode() == CSKY::NEG32 ? 4 : 2; + + TmpInst = MCInstBuilder(Size == 4 ? CSKY::NOT32 : CSKY::NOT16) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)); + Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); + writeData(Binary, Size, OS); + + TmpInst = MCInstBuilder(Size == 4 ? CSKY::ADDI32 : CSKY::ADDI16) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(0)) + .addImm(1); + Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); + writeData(Binary, Size, OS); +} + +void CSKYMCCodeEmitter::expandRSUBI(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + MCInst TmpInst; + uint32_t Binary; + unsigned Size = MI.getOpcode() == CSKY::RSUBI32 ? 4 : 2; + + TmpInst = MCInstBuilder(Size == 4 ? 
+  TmpInst = MCInstBuilder(Size == 4 ? CSKY::NOT32 : CSKY::NOT16)
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1));
+  Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+  writeData(Binary, Size, OS);
+
+  TmpInst = MCInstBuilder(Size == 4 ? CSKY::ADDI32 : CSKY::ADDI16)
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(0))
+                .addImm(MI.getOperand(2).getImm() + 1);
+  Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+  writeData(Binary, Size, OS);
+}
+
 void CSKYMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                           SmallVectorImpl<MCFixup> &Fixups,
                                           const MCSubtargetInfo &STI) const {
   const MCInstrDesc &Desc = MII.get(MI.getOpcode());
   unsigned Size = Desc.getSize();
+
+  MCInst TmpInst;
+
+  switch (MI.getOpcode()) {
+  default:
+    TmpInst = MI;
+    break;
+  case CSKY::JBT_E:
+  case CSKY::JBF_E:
+    expandJBTF(MI, OS, Fixups, STI);
+    MCNumEmitted += 2;
+    return;
+  case CSKY::NEG32:
+  case CSKY::NEG16:
+    expandNEG(MI, OS, Fixups, STI);
+    MCNumEmitted += 2;
+    return;
+  case CSKY::RSUBI32:
+  case CSKY::RSUBI16:
+    expandRSUBI(MI, OS, Fixups, STI);
+    MCNumEmitted += 2;
+    return;
+  case CSKY::JBSR32:
+    TmpInst = MCInstBuilder(CSKY::BSR32).addOperand(MI.getOperand(0));
+    break;
+  case CSKY::JBR16:
+    TmpInst = MCInstBuilder(CSKY::BR16).addOperand(MI.getOperand(0));
+    break;
+  case CSKY::JBR32:
+    TmpInst = MCInstBuilder(CSKY::BR32).addOperand(MI.getOperand(0));
+    break;
+  case CSKY::JBT16:
+    TmpInst = MCInstBuilder(CSKY::BT16)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+    break;
+  case CSKY::JBT32:
+    TmpInst = MCInstBuilder(CSKY::BT32)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+    break;
+  case CSKY::JBF16:
+    TmpInst = MCInstBuilder(CSKY::BF16)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+    break;
+  case CSKY::JBF32:
+    TmpInst = MCInstBuilder(CSKY::BF32)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+    break;
+  case CSKY::LRW32_Gen:
+    TmpInst = MCInstBuilder(CSKY::LRW32)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(2));
+    break;
+  case CSKY::LRW16_Gen:
+    TmpInst = MCInstBuilder(CSKY::LRW16)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(2));
+    break;
+  case CSKY::CMPLEI32:
+    TmpInst = MCInstBuilder(CSKY::CMPLTI32)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addImm(MI.getOperand(2).getImm() + 1);
+    break;
+  case CSKY::CMPLEI16:
+    TmpInst = MCInstBuilder(CSKY::CMPLTI16)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addImm(MI.getOperand(2).getImm() + 1);
+    break;
+  case CSKY::ROTRI32:
+    TmpInst = MCInstBuilder(CSKY::ROTLI32)
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addImm(32 - MI.getOperand(2).getImm());
+    break;
+  case CSKY::BGENI:
+    auto V = 1 << MI.getOperand(1).getImm();
+    TmpInst =
+        MCInstBuilder(CSKY::MOVI32).addOperand(MI.getOperand(0)).addImm(V);
+    break;
+  }
+
   ++MCNumEmitted;
 
-  uint32_t Bin = getBinaryCodeForInstr(MI, Fixups, STI);
+  uint32_t Bin = getBinaryCodeForInstr(TmpInst, Fixups, STI);
 
   uint16_t LO16 = static_cast<uint16_t>(Bin);
   uint16_t HI16 = static_cast<uint16_t>(Bin >> 16);
@@ -170,7 +328,6 @@ MCFixupKind CSKYMCCodeEmitter::getTargetFixup(const MCExpr *Expr) const {
 }
 
 MCCodeEmitter *llvm::createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
-                                             const MCRegisterInfo &MRI,
                                              MCContext &Ctx) {
   return new CSKYMCCodeEmitter(Ctx, MCII);
 }
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
index bfba07bcb32a..128430197cc5 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
@@ -20,6 +20,8 @@
 
 namespace llvm {
 
+class MCInstrInfo;
+
 class CSKYMCCodeEmitter : public MCCodeEmitter {
   MCContext &Ctx;
   const MCInstrInfo &MII;
@@ -169,6 +171,16 @@ public:
     Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
     return 0;
   }
+
+  void expandJBTF(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const;
+  void expandNEG(const MCInst &MI, raw_ostream &OS,
+                 SmallVectorImpl<MCFixup> &Fixups,
+                 const MCSubtargetInfo &STI) const;
+  void expandRSUBI(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
index 7987613b0608..b9989822dc36 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
@@ -8,10 +8,12 @@
 
 #include "CSKYMCExpr.h"
 #include "CSKYFixupKinds.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index 0901c0993607..1a69dc8acde0 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -12,10 +12,14 @@
 
 #include "CSKYMCTargetDesc.h"
 #include "CSKYAsmBackend.h"
+#include "CSKYELFStreamer.h"
 #include "CSKYInstPrinter.h"
 #include "CSKYMCAsmInfo.h"
 #include "CSKYMCCodeEmitter.h"
+#include "CSKYTargetStreamer.h"
 #include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -72,6 +76,81 @@ static MCSubtargetInfo *createCSKYMCSubtargetInfo(const Triple &TT,
   return createCSKYMCSubtargetInfoImpl(TT, CPUName, /*TuneCPU=*/CPUName, FS);
 }
 
+static MCTargetStreamer *
+createCSKYObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new CSKYTargetELFStreamer(S, STI);
+  return nullptr;
+}
+
+static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
+                                     std::unique_ptr<MCAsmBackend> &&MAB,
+                                     std::unique_ptr<MCObjectWriter> &&OW,
+                                     std::unique_ptr<MCCodeEmitter> &&Emitter,
+                                     bool RelaxAll) {
+  CSKYELFStreamer *S = new CSKYELFStreamer(Ctx, std::move(MAB), std::move(OW),
+                                           std::move(Emitter));
+
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  return S;
+}
+
+static MCTargetStreamer *createCSKYAsmTargetStreamer(MCStreamer &S,
+                                                     formatted_raw_ostream &OS,
+                                                     MCInstPrinter *InstPrinter,
+                                                     bool isVerboseAsm) {
+  return new CSKYTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer *createCSKYNullTargetStreamer(MCStreamer &S) {
+  return new CSKYTargetStreamer(S);
+}
+
+namespace {
+
+class CSKYMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  explicit CSKYMCInstrAnalysis(const MCInstrInfo *Info)
+      : MCInstrAnalysis(Info) {}
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override {
+    if (isConditionalBranch(Inst) || isUnconditionalBranch(Inst)) {
+      int64_t Imm;
+      Imm = Inst.getOperand(Inst.getNumOperands() - 1).getImm();
+      Target = Addr + Imm;
+      return true;
+    }
+
+    if (Inst.getOpcode() == CSKY::BSR32) {
+      Target = Addr + Inst.getOperand(0).getImm();
+      return true;
+    }
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case CSKY::LRW16:
+    case CSKY::LRW32:
+    case CSKY::JSRI32:
+    case CSKY::JMPI32:
+      int64_t Imm = Inst.getOperand(Inst.getNumOperands() - 1).getImm();
+      Target = ((Addr + Imm) & 0xFFFFFFFC);
+      return true;
+    }
+
+    return false;
+  }
+};
+
+} // end anonymous namespace
+
+static MCInstrAnalysis *createCSKYInstrAnalysis(const MCInstrInfo *Info) {
+  return new CSKYMCInstrAnalysis(Info);
+}
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTargetMC() {
   auto &CSKYTarget = getTheCSKYTarget();
   TargetRegistry::RegisterMCAsmBackend(CSKYTarget, createCSKYAsmBackend);
@@ -82,4 +161,13 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTargetMC() {
   TargetRegistry::RegisterMCInstPrinter(CSKYTarget, createCSKYMCInstPrinter);
   TargetRegistry::RegisterMCSubtargetInfo(CSKYTarget,
                                           createCSKYMCSubtargetInfo);
+  TargetRegistry::RegisterELFStreamer(CSKYTarget, createELFStreamer);
+  TargetRegistry::RegisterObjectTargetStreamer(CSKYTarget,
+                                               createCSKYObjectTargetStreamer);
+  TargetRegistry::RegisterAsmTargetStreamer(CSKYTarget,
+                                            createCSKYAsmTargetStreamer);
+  // Register the null target streamer.
+  TargetRegistry::RegisterNullTargetStreamer(CSKYTarget,
+                                             createCSKYNullTargetStreamer);
+  TargetRegistry::RegisterMCInstrAnalysis(CSKYTarget, createCSKYInstrAnalysis);
 }
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
index 25bbd635fc58..4b8c45e95b74 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
@@ -34,9 +34,7 @@ MCAsmBackend *createCSKYAsmBackend(const Target &T, const MCSubtargetInfo &STI,
                                    const MCRegisterInfo &MRI,
                                    const MCTargetOptions &Options);
 
-MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
-                                       const MCRegisterInfo &MRI,
-                                       MCContext &Ctx);
+MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx);
 
 } // namespace llvm
 
 #define GET_REGINFO_ENUM
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.cpp
new file mode 100644
index 000000000000..dd7053d60aa1
--- /dev/null
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.cpp
@@ -0,0 +1,143 @@
+//===-- CSKYTargetStreamer.cpp - CSKY Target Streamer ---------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYTargetStreamer.h"
+#include "CSKYSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+//
+// ConstantPool implementation
+//
+// Emit the contents of the constant pool using the provided streamer.
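+//
+// For illustration (editorial sketch, not part of the upstream change), a
+// target streamer would typically drive the pool roughly as follows; the
+// variable names here are hypothetical:
+//
+//   const MCExpr *Ref =
+//       Pool.addEntry(Streamer, Expr, /*Size=*/4, Loc, /*AdjustExpr=*/nullptr);
+//   ... emit an instruction whose operand references Ref ...
+//   Pool.emitAll(Streamer); // flush at the end of the section
+//
+// addConstantPoolEntry() below is the entry point this backend actually uses.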
+void CSKYConstantPool::emitAll(MCStreamer &Streamer) {
+  if (Entries.empty())
+    return;
+
+  if (CurrentSection != nullptr)
+    Streamer.switchSection(CurrentSection);
+
+  Streamer.emitDataRegion(MCDR_DataRegion);
+  for (const ConstantPoolEntry &Entry : Entries) {
+    Streamer.emitCodeAlignment(
+        Entry.Size,
+        Streamer.getContext().getSubtargetInfo()); // align naturally
+    Streamer.emitLabel(Entry.Label);
+    Streamer.emitValue(Entry.Value, Entry.Size, Entry.Loc);
+  }
+  Streamer.emitDataRegion(MCDR_DataRegionEnd);
+  Entries.clear();
+}
+
+const MCExpr *CSKYConstantPool::addEntry(MCStreamer &Streamer,
+                                         const MCExpr *Value, unsigned Size,
+                                         SMLoc Loc, const MCExpr *AdjustExpr) {
+  if (CurrentSection == nullptr)
+    CurrentSection = Streamer.getCurrentSectionOnly();
+
+  auto &Context = Streamer.getContext();
+
+  const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Value);
+
+  // Check if there is an existing entry for the same constant. If so, reuse it.
+  auto Itr = C ? CachedEntries.find(C->getValue()) : CachedEntries.end();
+  if (Itr != CachedEntries.end())
+    return Itr->second;
+
+  MCSymbol *CPEntryLabel = Context.createTempSymbol();
+  const auto SymRef = MCSymbolRefExpr::create(CPEntryLabel, Context);
+
+  if (AdjustExpr) {
+    const CSKYMCExpr *CSKYExpr = cast<CSKYMCExpr>(Value);
+
+    Value = MCBinaryExpr::createSub(AdjustExpr, SymRef, Context);
+    Value = MCBinaryExpr::createSub(CSKYExpr->getSubExpr(), Value, Context);
+    Value = CSKYMCExpr::create(Value, CSKYExpr->getKind(), Context);
+  }
+
+  Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size, Loc));
+
+  if (C)
+    CachedEntries[C->getValue()] = SymRef;
+  return SymRef;
+}
+
+bool CSKYConstantPool::empty() { return Entries.empty(); }
+
+void CSKYConstantPool::clearCache() {
+  CurrentSection = nullptr;
+  CachedEntries.clear();
+}
+
+CSKYTargetStreamer::CSKYTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPool(new CSKYConstantPool()) {}
+
+const MCExpr *
+CSKYTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc,
+                                         const MCExpr *AdjustExpr) {
+  auto ELFRefKind = CSKYMCExpr::VK_CSKY_Invalid;
+  ConstantCounter++;
+
+  const MCExpr *OrigExpr = Expr;
+
+  if (const CSKYMCExpr *CE = dyn_cast<CSKYMCExpr>(Expr)) {
+    Expr = CE->getSubExpr();
+    ELFRefKind = CE->getKind();
+  }
+
+  if (const MCSymbolRefExpr *SymExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+    const MCSymbol *Sym = &SymExpr->getSymbol();
+
+    SymbolIndex Index = {Sym, ELFRefKind};
+
+    if (ConstantMap.find(Index) == ConstantMap.end()) {
+      ConstantMap[Index] =
+          ConstantPool->addEntry(getStreamer(), OrigExpr, 4, Loc, AdjustExpr);
+    }
+    return ConstantMap[Index];
+  }
+
+  return ConstantPool->addEntry(getStreamer(), Expr, 4, Loc, AdjustExpr);
+}
+
+void CSKYTargetStreamer::emitCurrentConstantPool() {
+  ConstantPool->emitAll(Streamer);
+  ConstantPool->clearCache();
+}
+
+// finish() - write out any non-empty assembler constant pools.
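+// It also finishes the attribute section: only the ELF target streamer
+// overrides finishAttributeSection() to actually write attributes out, while
+// the asm and null streamers leave it empty.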
+void CSKYTargetStreamer::finish() {
+  if (ConstantCounter != 0) {
+    ConstantPool->emitAll(Streamer);
+  }
+
+  finishAttributeSection();
+}
+
+void CSKYTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {}
+
+void CSKYTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void CSKYTargetStreamer::emitTextAttribute(unsigned Attribute,
+                                           StringRef String) {}
+void CSKYTargetStreamer::finishAttributeSection() {}
+
+void CSKYTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  OS << "\t.csky_attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+}
+
+void CSKYTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                              StringRef String) {
+  OS << "\t.csky_attribute\t" << Attribute << ", \"" << String << "\"\n";
+}
+
+void CSKYTargetAsmStreamer::finishAttributeSection() {}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.h
new file mode 100644
index 000000000000..270d48d5939c
--- /dev/null
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYTargetStreamer.h
@@ -0,0 +1,110 @@
+//===-- CSKYTargetStreamer.h - CSKY Target Streamer -----------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_CSKY_CSKYTARGETSTREAMER_H
+
+#include "MCTargetDesc/CSKYMCExpr.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class CSKYConstantPool {
+  using EntryVecTy = SmallVector<ConstantPoolEntry, 4>;
+  EntryVecTy Entries;
+  std::map<int64_t, const MCExpr *> CachedEntries;
+
+  MCSection *CurrentSection = nullptr;
+
+public:
+  // Initialize a new empty constant pool
+  CSKYConstantPool() = default;
+
+  // Add a new entry to the constant pool in the next slot.
+  // \param Value is the new entry to put in the constant pool.
+  // \param Size is the size in bytes of the entry
+  //
+  // \returns a MCExpr that references the newly inserted value
+  const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Value,
+                         unsigned Size, SMLoc Loc, const MCExpr *AdjustExpr);
+
+  void emitAll(MCStreamer &Streamer);
+
+  // Return true if the constant pool is empty
+  bool empty();
+
+  void clearCache();
+};
+
+class CSKYTargetStreamer : public MCTargetStreamer {
+public:
+  typedef struct {
+    const MCSymbol *sym;
+    CSKYMCExpr::VariantKind kind;
+  } SymbolIndex;
+
+protected:
+  std::unique_ptr<CSKYConstantPool> ConstantPool;
+
+  DenseMap<SymbolIndex, const MCExpr *> ConstantMap;
+
+  unsigned ConstantCounter = 0;
+
+public:
+  CSKYTargetStreamer(MCStreamer &S);
+
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+  virtual void emitAttribute(unsigned Attribute, unsigned Value);
+  virtual void finishAttributeSection();
+
+  virtual void emitTargetAttributes(const MCSubtargetInfo &STI);
+  /// Add a new entry to the constant pool for the current section and return
+  /// an MCExpr that can be used to refer to the constant pool location.
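+  /// Entries for symbolic expressions are de-duplicated per (symbol, variant
+  /// kind) pair via ConstantMap, so repeated references to the same symbol
+  /// share a single pool slot.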
+  const MCExpr *addConstantPoolEntry(const MCExpr *, SMLoc Loc,
+                                     const MCExpr *AdjustExpr = nullptr);
+
+  void emitCurrentConstantPool();
+
+  void finish() override;
+};
+
+template <> struct DenseMapInfo<CSKYTargetStreamer::SymbolIndex> {
+  static inline CSKYTargetStreamer::SymbolIndex getEmptyKey() {
+    return {nullptr, CSKYMCExpr::VK_CSKY_Invalid};
+  }
+  static inline CSKYTargetStreamer::SymbolIndex getTombstoneKey() {
+    return {nullptr, CSKYMCExpr::VK_CSKY_Invalid};
+  }
+  static unsigned getHashValue(const CSKYTargetStreamer::SymbolIndex &V) {
+    return hash_combine(DenseMapInfo<const MCSymbol *>::getHashValue(V.sym),
+                        DenseMapInfo<int>::getHashValue(V.kind));
+  }
+  static bool isEqual(const CSKYTargetStreamer::SymbolIndex &A,
+                      const CSKYTargetStreamer::SymbolIndex &B) {
+    return A.sym == B.sym && A.kind == B.kind;
+  }
+};
+
+class formatted_raw_ostream;
+
+class CSKYTargetAsmStreamer : public CSKYTargetStreamer {
+  formatted_raw_ostream &OS;
+
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void finishAttributeSection() override;
+
+public:
+  CSKYTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+      : CSKYTargetStreamer(S), OS(OS) {}
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYTARGETSTREAMER_H
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
new file mode 100644
index 000000000000..4d6e1a9d3166
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -0,0 +1,144 @@
+//- DXIL.td - Describe DXIL operation -------------------------*- tablegen -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is a target description file for DXIL operations.
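+///
+/// Each operation is a dxil_op record that carries the DXIL opcode, an opcode
+/// class, the allowed overload types, and a dxil_param list describing its
+/// operands; see the Sin and UMax records below for representative examples.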
+///
+//===----------------------------------------------------------------------===//
+
+include "llvm/IR/Intrinsics.td"
+
+class dxil_class<string _name> {
+  string name = _name;
+}
+class dxil_category<string _name> {
+  string name = _name;
+}
+
+def Unary : dxil_class<"Unary">;
+def Binary : dxil_class<"Binary">;
+def FlattenedThreadIdInGroupClass : dxil_class<"FlattenedThreadIdInGroup">;
+def ThreadIdInGroupClass : dxil_class<"ThreadIdInGroup">;
+def ThreadIdClass : dxil_class<"ThreadId">;
+def GroupIdClass : dxil_class<"GroupId">;
+
+def binary_uint : dxil_category<"Binary uint">;
+def unary_float : dxil_category<"Unary float">;
+def ComputeID : dxil_category<"Compute/Mesh/Amplification shader">;
+
+
+// The parameter description for a DXIL instruction
+class dxil_param<int _pos, string type, string _name, string _doc,
+                 bit _is_const = 0, string _enum_name = "",
+                 int _max_value = 0> {
+  int pos = _pos;          // position in parameter list
+  string llvm_type = type; // llvm type name, $o for overload, $r for resource
+                           // type, $cb for legacy cbuffer, $u4 for u4 struct
+  string name = _name;     // short, unique name
+  string doc = _doc;       // the documentation description of this parameter
+  bit is_const =
+      _is_const; // whether this argument requires a constant value in the IR
+  string enum_name = _enum_name; // the name of the enum type if applicable
+  int max_value =
+      _max_value; // the maximum value for this parameter if applicable
+}
+
+// A representation for a DXIL instruction
+class dxil_inst<string _name> {
+  string name = _name; // short, unique name
+
+  string dxil_op = "";    // name of DXIL operation
+  int dxil_opid = 0;      // ID of DXIL operation
+  dxil_class op_class;    // name of the opcode class
+  dxil_category category; // classification for this instruction
+  string doc = "";        // the documentation description of this instruction
+  list<dxil_param> ops = []; // the operands that this instruction takes
+  string oload_types = "";   // overload types if applicable
+  string fn_attr = "";       // attribute shorthands: rn=does not access
+                             // memory, ro=only reads from memory
+  bit is_deriv = 0;          // whether this is some kind of derivative
+  bit is_gradient = 0;       // whether this requires a gradient calculation
+  bit is_feedback = 0;       // whether this is a sampler feedback op
+  bit is_wave = 0; // whether this requires in-wave, cross-lane functionality
+  bit requires_uniform_inputs = 0; // whether this operation requires that all
+                                   // of its inputs are uniform across the wave
+  // Group dxil operation for stats.
+  // Like how many atomic/float/uint/int/... instructions are used in the
+  // program.
+  list<string> stats_group = [];
+}
+
+class dxil_op<string name, int code_id, dxil_class code_class,
+              dxil_category op_category, string _doc, string _oload_types,
+              string _fn_attr, list<dxil_param> op_params,
+              list<string> _stats_group = []>
+    : dxil_inst<name> {
+  let dxil_op = name;
+  let dxil_opid = code_id;
+  let doc = _doc;
+  let ops = op_params;
+  let op_class = code_class;
+  let category = op_category;
+  let oload_types = _oload_types;
+  let fn_attr = _fn_attr;
+  let stats_group = _stats_group;
+}
+
+// The intrinsic which maps directly to this dxil op.
+class dxil_map_intrinsic<Intrinsic llvm_intrinsic_> {
+  Intrinsic llvm_intrinsic = llvm_intrinsic_;
+}
+
+def Sin : dxil_op<"Sin", 13, Unary, unary_float,
+                  "returns sine(theta) for theta in radians.", "half;float;",
+                  "rn",
+                  [
+                    dxil_param<0, "$o", "", "operation result">,
+                    dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                    dxil_param<2, "$o", "value", "input value">
+                  ],
+                  ["floats"]>,
+          dxil_map_intrinsic<int_sin>;
+
a : b", + "i16;i32;i64;", "rn", + [ + dxil_param<0, "$o", "", "operation result">, + dxil_param<1, "i32", "opcode", "DXIL opcode">, + dxil_param<2, "$o", "a", "input value">, + dxil_param<3, "$o", "b", "input value"> + ], + ["uints"]>, + dxil_map_intrinsic; + +def ThreadId :dxil_op< "ThreadId", 93, ThreadIdClass, ComputeID, "reads the thread ID", "i32;", "rn", + [ + dxil_param<0, "i32", "", "thread ID component">, + dxil_param<1, "i32", "opcode", "DXIL opcode">, + dxil_param<2, "i32", "component", "component to read (x,y,z)"> + ]>, + dxil_map_intrinsic; + +def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group ID (SV_GroupID)", "i32;", "rn", + [ + dxil_param<0, "i32", "", "group ID component">, + dxil_param<1, "i32", "opcode", "DXIL opcode">, + dxil_param<2, "i32", "component", "component to read"> + ]>, + dxil_map_intrinsic; + +def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeID, + "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn", + [ + dxil_param<0, "i32", "", "thread ID in group component">, + dxil_param<1, "i32", "opcode", "DXIL opcode">, + dxil_param<2, "i32", "component", "component to read (x,y,z)"> + ]>, + dxil_map_intrinsic; + +def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeID, + "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn", + [ + dxil_param<0, "i32", "", "result">, + dxil_param<1, "i32", "opcode", "DXIL opcode"> + ]>, + dxil_map_intrinsic; diff --git a/llvm/lib/Target/DirectX/DXILConstants.h b/llvm/lib/Target/DirectX/DXILConstants.h new file mode 100644 index 000000000000..e8e7b5396a46 --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILConstants.h @@ -0,0 +1,25 @@ +//===- DXILConstants.h - Essential DXIL constants -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains essential DXIL constants. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_DIRECTX_DXILCONSTANTS_H +#define LLVM_LIB_TARGET_DIRECTX_DXILCONSTANTS_H + +namespace llvm { +namespace DXIL { + +#define DXIL_OP_ENUM +#include "DXILOperation.inc" +#undef DXIL_OP_ENUM + +} // namespace DXIL +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp new file mode 100644 index 000000000000..11b89e4ec890 --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -0,0 +1,265 @@ +//===- DXILOpLower.cpp - Lowering LLVM intrinsic to DIXLOp function -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains passes and utilities to lower llvm intrinsic call +/// to DXILOp function call. 
+//===----------------------------------------------------------------------===//
+
+#include "DXILConstants.h"
+#include "DirectX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "dxil-op-lower"
+
+using namespace llvm;
+using namespace llvm::DXIL;
+
+constexpr StringLiteral DXILOpNamePrefix = "dx.op.";
+
+enum OverloadKind : uint16_t {
+  VOID = 1,
+  HALF = 1 << 1,
+  FLOAT = 1 << 2,
+  DOUBLE = 1 << 3,
+  I1 = 1 << 4,
+  I8 = 1 << 5,
+  I16 = 1 << 6,
+  I32 = 1 << 7,
+  I64 = 1 << 8,
+  UserDefineType = 1 << 9,
+  ObjectType = 1 << 10,
+};
+
+static const char *getOverloadTypeName(OverloadKind Kind) {
+  switch (Kind) {
+  case OverloadKind::HALF:
+    return "f16";
+  case OverloadKind::FLOAT:
+    return "f32";
+  case OverloadKind::DOUBLE:
+    return "f64";
+  case OverloadKind::I1:
+    return "i1";
+  case OverloadKind::I8:
+    return "i8";
+  case OverloadKind::I16:
+    return "i16";
+  case OverloadKind::I32:
+    return "i32";
+  case OverloadKind::I64:
+    return "i64";
+  case OverloadKind::VOID:
+  case OverloadKind::ObjectType:
+  case OverloadKind::UserDefineType:
+    break;
+  }
+  llvm_unreachable("invalid overload type for name");
+  return "void";
+}
+
+static OverloadKind getOverloadKind(Type *Ty) {
+  Type::TypeID T = Ty->getTypeID();
+  switch (T) {
+  case Type::VoidTyID:
+    return OverloadKind::VOID;
+  case Type::HalfTyID:
+    return OverloadKind::HALF;
+  case Type::FloatTyID:
+    return OverloadKind::FLOAT;
+  case Type::DoubleTyID:
+    return OverloadKind::DOUBLE;
+  case Type::IntegerTyID: {
+    IntegerType *ITy = cast<IntegerType>(Ty);
+    unsigned Bits = ITy->getBitWidth();
+    switch (Bits) {
+    case 1:
+      return OverloadKind::I1;
+    case 8:
+      return OverloadKind::I8;
+    case 16:
+      return OverloadKind::I16;
+    case 32:
+      return OverloadKind::I32;
+    case 64:
+      return OverloadKind::I64;
+    default:
+      llvm_unreachable("invalid overload type");
+      return OverloadKind::VOID;
+    }
+  }
+  case Type::PointerTyID:
+    return OverloadKind::UserDefineType;
+  case Type::StructTyID:
+    return OverloadKind::ObjectType;
+  default:
+    llvm_unreachable("invalid overload type");
+    return OverloadKind::VOID;
+  }
+}
+
+static std::string getTypeName(OverloadKind Kind, Type *Ty) {
+  if (Kind < OverloadKind::UserDefineType) {
+    return getOverloadTypeName(Kind);
+  } else if (Kind == OverloadKind::UserDefineType) {
+    StructType *ST = cast<StructType>(Ty);
+    return ST->getStructName().str();
+  } else if (Kind == OverloadKind::ObjectType) {
+    StructType *ST = cast<StructType>(Ty);
+    return ST->getStructName().str();
+  } else {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    Ty->print(OS);
+    return OS.str();
+  }
+}
+
+// Static properties.
+struct OpCodeProperty {
+  DXIL::OpCode OpCode;
+  // Offset in DXILOpCodeNameTable.
+  unsigned OpCodeNameOffset;
+  DXIL::OpCodeClass OpCodeClass;
+  // Offset in DXILOpCodeClassNameTable.
+  unsigned OpCodeClassNameOffset;
+  uint16_t OverloadTys;
+  llvm::Attribute::AttrKind FuncAttr;
+};
+
+// Include getOpCodeClassName, getOpCodeProperty and getOpCodeName, which are
+// generated by TableGen.
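+//
+// A rough sketch of the shapes this file relies on (signatures inferred from
+// the call sites below, not copied from the generated .inc):
+//
+//   const OpCodeProperty *getOpCodeProperty(DXIL::OpCode Op);
+//   const char *getOpCodeClassName(const OpCodeProperty &Prop);
+//   const char *getOpCodeName(DXIL::OpCode Op);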
+#define DXIL_OP_OPERATION_TABLE
+#include "DXILOperation.inc"
+#undef DXIL_OP_OPERATION_TABLE
+
+static std::string constructOverloadName(OverloadKind Kind, Type *Ty,
+                                         const OpCodeProperty &Prop) {
+  if (Kind == OverloadKind::VOID) {
+    return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop)).str();
+  }
+  return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop) + "." +
+          getTypeName(Kind, Ty))
+      .str();
+}
+
+static FunctionCallee createDXILOpFunction(DXIL::OpCode DXILOp, Function &F,
+                                           Module &M) {
+  const OpCodeProperty *Prop = getOpCodeProperty(DXILOp);
+
+  // Get return type as overload type for DXILOp.
+  // Only simple mapping case here, so return type is good enough.
+  Type *OverloadTy = F.getReturnType();
+
+  OverloadKind Kind = getOverloadKind(OverloadTy);
+  // FIXME: find the issue and report error in clang instead of check it in
+  // backend.
+  if ((Prop->OverloadTys & (uint16_t)Kind) == 0) {
+    llvm_unreachable("invalid overload");
+  }
+
+  std::string FnName = constructOverloadName(Kind, OverloadTy, *Prop);
+  assert(!M.getFunction(FnName) && "Function already exists");
+
+  auto &Ctx = M.getContext();
+  Type *OpCodeTy = Type::getInt32Ty(Ctx);
+
+  SmallVector<Type *> ArgTypes;
+  // DXIL has i32 opcode as first arg.
+  ArgTypes.emplace_back(OpCodeTy);
+  FunctionType *FT = F.getFunctionType();
+  ArgTypes.append(FT->param_begin(), FT->param_end());
+  FunctionType *DXILOpFT = FunctionType::get(OverloadTy, ArgTypes, false);
+  return M.getOrInsertFunction(FnName, DXILOpFT);
+}
+
+static void lowerIntrinsic(DXIL::OpCode DXILOp, Function &F, Module &M) {
+  auto DXILOpFn = createDXILOpFunction(DXILOp, F, M);
+  IRBuilder<> B(M.getContext());
+  Value *DXILOpArg = B.getInt32(static_cast<unsigned>(DXILOp));
+  for (User *U : make_early_inc_range(F.users())) {
+    CallInst *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      continue;
+
+    SmallVector<Value *> Args;
+    Args.emplace_back(DXILOpArg);
+    Args.append(CI->arg_begin(), CI->arg_end());
+    B.SetInsertPoint(CI);
+    CallInst *DXILCI = B.CreateCall(DXILOpFn, Args);
+    LLVM_DEBUG(DXILCI->setName(getOpCodeName(DXILOp)));
+    CI->replaceAllUsesWith(DXILCI);
+    CI->eraseFromParent();
+  }
+  if (F.user_empty())
+    F.eraseFromParent();
+}
+
+static bool lowerIntrinsics(Module &M) {
+  bool Updated = false;
+
+#define DXIL_OP_INTRINSIC_MAP
+#include "DXILOperation.inc"
+#undef DXIL_OP_INTRINSIC_MAP
+
+  for (Function &F : make_early_inc_range(M.functions())) {
+    if (!F.isDeclaration())
+      continue;
+    Intrinsic::ID ID = F.getIntrinsicID();
+    if (ID == Intrinsic::not_intrinsic)
+      continue;
+    auto LowerIt = LowerMap.find(ID);
+    if (LowerIt == LowerMap.end())
+      continue;
+    lowerIntrinsic(LowerIt->second, F, M);
+    Updated = true;
+  }
+  return Updated;
+}
+
+namespace {
+/// A pass that lowers LLVM intrinsic calls to DXIL op function calls.
+class DXILOpLowering : public PassInfoMixin<DXILOpLowering> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
+    if (lowerIntrinsics(M))
+      return PreservedAnalyses::none();
+    return PreservedAnalyses::all();
+  }
+};
+} // namespace
+
+namespace {
+class DXILOpLoweringLegacy : public ModulePass {
+public:
+  bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
+  StringRef getPassName() const override { return "DXIL Op Lowering"; }
+  DXILOpLoweringLegacy() : ModulePass(ID) {}
+
+  static char ID; // Pass identification.
+};
+char DXILOpLoweringLegacy::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering",
+                      false, false)
+INITIALIZE_PASS_END(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering", false,
+                    false)
+
+ModulePass *llvm::createDXILOpLoweringLegacyPass() {
+  return new DXILOpLoweringLegacy();
+}
diff --git a/llvm/lib/Target/DirectX/DXILPointerType.cpp b/llvm/lib/Target/DirectX/DXILPointerType.cpp
new file mode 100644
index 000000000000..1e67f1a30ec4
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILPointerType.cpp
@@ -0,0 +1,66 @@
+//===- Target/DirectX/DXILPointerType.cpp - DXIL Typed Pointer Type -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "DXILPointerType.h"
+#include "llvm/ADT/Any.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+using namespace llvm::dxil;
+
+class TypedPointerTracking {
+public:
+  TypedPointerTracking() {}
+  DenseMap<Type *, std::unique_ptr<TypedPointerType>> PointerTypes;
+  DenseMap<std::pair<Type *, unsigned>, std::unique_ptr<TypedPointerType>>
+      ASPointerTypes;
+};
+
+TypedPointerType *TypedPointerType::get(Type *EltTy, unsigned AddressSpace) {
+  assert(EltTy && "Can't get a pointer to <null> type!");
+  assert(isValidElementType(EltTy) && "Invalid type for pointer element!");
+
+  llvm::Any &TargetData = EltTy->getContext().getTargetData();
+  if (!TargetData.hasValue())
+    TargetData = Any{std::make_shared<TypedPointerTracking>()};
+
+  assert(any_isa<std::shared_ptr<TypedPointerTracking>>(TargetData) &&
+         "Unexpected target data type");
+
+  std::shared_ptr<TypedPointerTracking> Tracking =
+      any_cast<std::shared_ptr<TypedPointerTracking>>(TargetData);
+
+  // Since AddressSpace #0 is the common case, we special case it.
+  std::unique_ptr<TypedPointerType> &Entry =
+      AddressSpace == 0
+          ? Tracking->PointerTypes[EltTy]
+          : Tracking->ASPointerTypes[std::make_pair(EltTy, AddressSpace)];
+
+  if (!Entry)
+    Entry = std::unique_ptr<TypedPointerType>(
+        new TypedPointerType(EltTy, AddressSpace));
+  return Entry.get();
+}
+
+TypedPointerType::TypedPointerType(Type *E, unsigned AddrSpace)
+    : Type(E->getContext(), DXILPointerTyID), PointeeTy(E) {
+  ContainedTys = &PointeeTy;
+  NumContainedTys = 1;
+  setSubclassData(AddrSpace);
+}
+
+bool TypedPointerType::isValidElementType(Type *ElemTy) {
+  return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
+         !ElemTy->isMetadataTy() && !ElemTy->isTokenTy() &&
+         !ElemTy->isX86_AMXTy();
+}
diff --git a/llvm/lib/Target/DirectX/DXILPointerType.h b/llvm/lib/Target/DirectX/DXILPointerType.h
new file mode 100644
index 000000000000..52cf2dbc40b0
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILPointerType.h
@@ -0,0 +1,52 @@
+//===- Target/DirectX/DXILPointerType.h - DXIL Typed Pointer Type ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_DIRECTX_DXILPOINTERTYPE_H
+#define LLVM_TARGET_DIRECTX_DXILPOINTERTYPE_H
+
+#include "llvm/IR/Type.h"
+
+namespace llvm {
+namespace dxil {
+
+// DXIL has typed pointers. This pointer type abstraction is used for tracking
+// in PointerTypeAnalysis and for the bitcode ValueEnumerator.
+class TypedPointerType : public Type {
+  explicit TypedPointerType(Type *ElType, unsigned AddrSpace);
+
+  Type *PointeeTy;
+
+public:
+  TypedPointerType(const TypedPointerType &) = delete;
+  TypedPointerType &operator=(const TypedPointerType &) = delete;
+
+  /// This constructs a pointer to an object of the specified type in a
+  /// numbered address space.
+  static TypedPointerType *get(Type *ElementType, unsigned AddressSpace);
+
+  /// Return true if the specified type is valid as an element type.
+  static bool isValidElementType(Type *ElemTy);
+
+  /// Return the address space of the Pointer type.
+  unsigned getAddressSpace() const { return getSubclassData(); }
+
+  Type *getElementType() const { return PointeeTy; }
+
+  /// Implement support type inquiry through isa, cast, and dyn_cast.
+  static bool classof(const Type *T) {
+    return T->getTypeID() == DXILPointerTyID;
+  }
+};
+
+} // namespace dxil
+} // namespace llvm
+
+#endif // LLVM_TARGET_DIRECTX_DXILPOINTERTYPE_H
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
new file mode 100644
index 000000000000..14d970e6b69a
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -0,0 +1,184 @@
+//===- DXILPrepare.cpp - Prepare LLVM Module for DXIL encoding ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file contains passes and utilities to convert a modern LLVM
+/// module into a module compatible with the LLVM 3.7-based DirectX
+/// Intermediate Language (DXIL).
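+///
+/// For example (editorial note), `fneg` did not exist in LLVM 3.7, so the
+/// pass below rewrites `%y = fneg float %x` into
+/// `%y = fsub float -0.0, %x` before encoding.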
+//===----------------------------------------------------------------------===//
+
+#include "DirectX.h"
+#include "PointerTypeAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+
+#define DEBUG_TYPE "dxil-prepare"
+
+using namespace llvm;
+using namespace llvm::dxil;
+
+namespace {
+
+constexpr bool isValidForDXIL(Attribute::AttrKind Attr) {
+  return is_contained({Attribute::Alignment,
+                       Attribute::AlwaysInline,
+                       Attribute::Builtin,
+                       Attribute::ByVal,
+                       Attribute::InAlloca,
+                       Attribute::Cold,
+                       Attribute::Convergent,
+                       Attribute::InlineHint,
+                       Attribute::InReg,
+                       Attribute::JumpTable,
+                       Attribute::MinSize,
+                       Attribute::Naked,
+                       Attribute::Nest,
+                       Attribute::NoAlias,
+                       Attribute::NoBuiltin,
+                       Attribute::NoCapture,
+                       Attribute::NoDuplicate,
+                       Attribute::NoImplicitFloat,
+                       Attribute::NoInline,
+                       Attribute::NonLazyBind,
+                       Attribute::NonNull,
+                       Attribute::Dereferenceable,
+                       Attribute::DereferenceableOrNull,
+                       Attribute::NoRedZone,
+                       Attribute::NoReturn,
+                       Attribute::NoUnwind,
+                       Attribute::OptimizeForSize,
+                       Attribute::OptimizeNone,
+                       Attribute::ReadNone,
+                       Attribute::ReadOnly,
+                       Attribute::ArgMemOnly,
+                       Attribute::Returned,
+                       Attribute::ReturnsTwice,
+                       Attribute::SExt,
+                       Attribute::StackAlignment,
+                       Attribute::StackProtect,
+                       Attribute::StackProtectReq,
+                       Attribute::StackProtectStrong,
+                       Attribute::SafeStack,
+                       Attribute::StructRet,
+                       Attribute::SanitizeAddress,
+                       Attribute::SanitizeThread,
+                       Attribute::SanitizeMemory,
+                       Attribute::UWTable,
+                       Attribute::ZExt},
+                      Attr);
+}
+
+class DXILPrepareModule : public ModulePass {
+
+  static Value *maybeGenerateBitcast(IRBuilder<> &Builder,
+                                     PointerTypeMap &PointerTypes,
+                                     Instruction &Inst, Value *Operand,
+                                     Type *Ty) {
+    // Omit bitcasts if the incoming value matches the instruction type.
+    auto It = PointerTypes.find(Operand);
+    if (It != PointerTypes.end())
+      if (cast<TypedPointerType>(It->second)->getElementType() == Ty)
+        return nullptr;
+    // Insert bitcasts where we are removing the instruction.
+    Builder.SetInsertPoint(&Inst);
+    // This code only gets hit in opaque-pointer mode, so the type of the
+    // pointer doesn't matter.
+    PointerType *PtrTy = cast<PointerType>(Operand->getType());
+    return Builder.Insert(
+        CastInst::Create(Instruction::BitCast, Operand,
+                         Builder.getInt8PtrTy(PtrTy->getAddressSpace())));
+  }
+
+public:
+  bool runOnModule(Module &M) override {
+    PointerTypeMap PointerTypes = PointerTypeAnalysis::run(M);
+    AttributeMask AttrMask;
+    for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
+         I = Attribute::AttrKind(I + 1)) {
+      if (!isValidForDXIL(I))
+        AttrMask.addAttribute(I);
+    }
+    for (auto &F : M.functions()) {
+      F.removeFnAttrs(AttrMask);
+      F.removeRetAttrs(AttrMask);
+      for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
+        F.removeParamAttrs(Idx, AttrMask);
+
+      for (auto &BB : F) {
+        IRBuilder<> Builder(&BB);
+        for (auto &I : make_early_inc_range(BB)) {
+          if (I.getOpcode() == Instruction::FNeg) {
+            Builder.SetInsertPoint(&I);
+            Value *In = I.getOperand(0);
+            Value *Zero = ConstantFP::get(In->getType(), -0.0);
+            I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
+            I.eraseFromParent();
+            continue;
+          }
+          // Only insert bitcasts if the IR is using opaque pointers.
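+          // (With typed pointers the pointee type is already explicit, so the
+          // no-op bitcasts below are unnecessary and are skipped.)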
+          if (M.getContext().supportsTypedPointers())
+            continue;
+
+          // Emitting NoOp bitcast instructions allows the ValueEnumerator to
+          // be unmodified as it reserves instruction IDs during construction.
+          if (auto LI = dyn_cast<LoadInst>(&I)) {
+            if (Value *NoOpBitcast = maybeGenerateBitcast(
+                    Builder, PointerTypes, I, LI->getPointerOperand(),
+                    LI->getType())) {
+              LI->replaceAllUsesWith(
+                  Builder.CreateLoad(LI->getType(), NoOpBitcast));
+              LI->eraseFromParent();
+            }
+            continue;
+          }
+          if (auto SI = dyn_cast<StoreInst>(&I)) {
+            if (Value *NoOpBitcast = maybeGenerateBitcast(
+                    Builder, PointerTypes, I, SI->getPointerOperand(),
+                    SI->getValueOperand()->getType())) {
+
+              SI->replaceAllUsesWith(
+                  Builder.CreateStore(SI->getValueOperand(), NoOpBitcast));
+              SI->eraseFromParent();
+            }
+            continue;
+          }
+          if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
+            if (Value *NoOpBitcast = maybeGenerateBitcast(
+                    Builder, PointerTypes, I, GEP->getPointerOperand(),
+                    GEP->getResultElementType()))
+              GEP->setOperand(0, NoOpBitcast);
+            continue;
+          }
+        }
+      }
+    }
+    return true;
+  }
+
+  DXILPrepareModule() : ModulePass(ID) {}
+
+  static char ID; // Pass identification.
+};
+char DXILPrepareModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module",
+                      false, false)
+INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false,
+                    false)
+
+ModulePass *llvm::createDXILPrepareModulePass() {
+  return new DXILPrepareModule();
+}
diff --git a/llvm/lib/Target/DirectX/DXILStubs.td b/llvm/lib/Target/DirectX/DXILStubs.td
new file mode 100644
index 000000000000..ce4327f93bc1
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILStubs.td
@@ -0,0 +1,18 @@
+// DXIL doesn't actually use registers, but this gets the boilerplate code
+// generated through tablegen.
+let Namespace = "DXIL" in {
+def DXIL : Register<"DXIL">;
+def DXILClass : RegisterClass<"DXIL", [i32], 32, (add DXIL)>;
+}
+
+class DXILInst : Instruction {
+  let Namespace = "DXIL";
+  let DecoderNamespace = "DXIL";
+
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins);
+  let AsmString = "dummy";
+  let Pattern = [];
+}
+
+def DummyInst : DXILInst;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
new file mode 100644
index 000000000000..634ead98a6ae
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -0,0 +1,121 @@
+//===- DXILTranslateMetadata.cpp - Pass to emit DXIL metadata ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+
+#include "DirectX.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static uint32_t ConstMDToUint32(const MDOperand &MDO) {
+  ConstantInt *pConst = mdconst::extract<ConstantInt>(MDO);
+  return (uint32_t)pConst->getZExtValue();
+}
+
+static ConstantAsMetadata *Uint32ToConstMD(unsigned v, LLVMContext &Ctx) {
+  return ConstantAsMetadata::get(
+      Constant::getIntegerValue(IntegerType::get(Ctx, 32), APInt(32, v)));
+}
+
+constexpr StringLiteral ValVerKey = "dx.valver";
+constexpr unsigned DXILVersionNumFields = 2;
+
+static void emitDXILValidatorVersion(Module &M, VersionTuple &ValidatorVer) {
+  NamedMDNode *DXILValidatorVersionMD = M.getNamedMetadata(ValVerKey);
+
+  // Allow re-writing the validator version, since this can be changed at
+  // later points.
+  if (DXILValidatorVersionMD)
+    M.eraseNamedMetadata(DXILValidatorVersionMD);
+
+  DXILValidatorVersionMD = M.getOrInsertNamedMetadata(ValVerKey);
+
+  auto &Ctx = M.getContext();
+  Metadata *MDVals[DXILVersionNumFields];
+  MDVals[0] = Uint32ToConstMD(ValidatorVer.getMajor(), Ctx);
+  MDVals[1] = Uint32ToConstMD(ValidatorVer.getMinor().value_or(0), Ctx);
+
+  DXILValidatorVersionMD->addOperand(MDNode::get(Ctx, MDVals));
+}
+
+static VersionTuple loadDXILValidatorVersion(MDNode *ValVerMD) {
+  if (ValVerMD->getNumOperands() != DXILVersionNumFields)
+    return VersionTuple();
+
+  unsigned Major = ConstMDToUint32(ValVerMD->getOperand(0));
+  unsigned Minor = ConstMDToUint32(ValVerMD->getOperand(1));
+  return VersionTuple(Major, Minor);
+}
+
+static void cleanModuleFlags(Module &M) {
+  constexpr StringLiteral DeadKeys[] = {ValVerKey};
+  // Collect DeadKeys in ModuleFlags.
+  StringSet<> DeadKeySet;
+  for (auto &Key : DeadKeys) {
+    if (M.getModuleFlag(Key))
+      DeadKeySet.insert(Key);
+  }
+  if (DeadKeySet.empty())
+    return;
+
+  SmallVector<Module::ModuleFlagEntry> ModuleFlags;
+  M.getModuleFlagsMetadata(ModuleFlags);
+  NamedMDNode *MDFlags = M.getModuleFlagsMetadata();
+  MDFlags->eraseFromParent();
+  // Re-add the module flags that are not dead.
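+  // E.g. (editorial example) with flags {"Dwarf Version", "dx.valver"}, only
+  // "Dwarf Version" survives here; "dx.valver" is re-emitted separately as
+  // the !dx.valver named metadata by emitDXILValidatorVersion().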
+  for (auto &Flag : ModuleFlags) {
+    StringRef Key = Flag.Key->getString();
+    if (DeadKeySet.contains(Key))
+      continue;
+    M.addModuleFlag(Flag.Behavior, Key, Flag.Val);
+  }
+}
+
+static void cleanModule(Module &M) { cleanModuleFlags(M); }
+
+namespace {
+class DXILTranslateMetadata : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DXILTranslateMetadata() : ModulePass(ID), ValidatorVer(1, 0) {}
+
+  StringRef getPassName() const override { return "DXIL Metadata Emit"; }
+
+  bool runOnModule(Module &M) override;
+
+private:
+  VersionTuple ValidatorVer;
+};
+
+} // namespace
+
+bool DXILTranslateMetadata::runOnModule(Module &M) {
+  if (MDNode *ValVerMD = cast_or_null<MDNode>(M.getModuleFlag(ValVerKey))) {
+    auto ValVer = loadDXILValidatorVersion(ValVerMD);
+    if (!ValVer.empty())
+      ValidatorVer = ValVer;
+  }
+  emitDXILValidatorVersion(M, ValidatorVer);
+  cleanModule(M);
+  return false;
+}
+
+char DXILTranslateMetadata::ID = 0;
+
+ModulePass *llvm::createDXILTranslateMetadataPass() {
+  return new DXILTranslateMetadata();
+}
+
+INITIALIZE_PASS(DXILTranslateMetadata, "dxil-metadata-emit",
+                "DXIL Metadata Emit", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
new file mode 100644
index 000000000000..494a71e51a89
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -0,0 +1,2963 @@
+//===- Bitcode/Writer/DXILBitcodeWriter.cpp - DXIL Bitcode Writer ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bitcode writer implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DXILBitcodeWriter.h"
+#include "DXILValueEnumerator.h"
+#include "PointerTypeAnalysis.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/BitcodeCommon.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitstream/BitCodes.h"
+#include "llvm/Bitstream/BitstreamWriter.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/UseListOrder.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Object/IRSymtab.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SHA1.h"
+
+namespace llvm {
+namespace dxil {
+
+// Generates an enum to use as an index in the Abbrev array of Metadata record.
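+// For instance, HANDLE_MDNODE_LEAF(DILocation) from Metadata.def expands to
+// an enumerator named DILocationAbbrevID.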
+enum MetadataAbbrev : unsigned {
+#define HANDLE_MDNODE_LEAF(CLASS) CLASS##AbbrevID,
+#include "llvm/IR/Metadata.def"
+  LastPlusOne
+};
+
+class DXILBitcodeWriter {
+
+  /// These are manifest constants used by the bitcode writer. They do not need
+  /// to be kept in sync with the reader, but need to be consistent within this
+  /// file.
+  enum {
+    // VALUE_SYMTAB_BLOCK abbrev id's.
+    VST_ENTRY_8_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+    VST_ENTRY_7_ABBREV,
+    VST_ENTRY_6_ABBREV,
+    VST_BBENTRY_6_ABBREV,
+
+    // CONSTANTS_BLOCK abbrev id's.
+    CONSTANTS_SETTYPE_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+    CONSTANTS_INTEGER_ABBREV,
+    CONSTANTS_CE_CAST_Abbrev,
+    CONSTANTS_NULL_Abbrev,
+
+    // FUNCTION_BLOCK abbrev id's.
+    FUNCTION_INST_LOAD_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+    FUNCTION_INST_BINOP_ABBREV,
+    FUNCTION_INST_BINOP_FLAGS_ABBREV,
+    FUNCTION_INST_CAST_ABBREV,
+    FUNCTION_INST_RET_VOID_ABBREV,
+    FUNCTION_INST_RET_VAL_ABBREV,
+    FUNCTION_INST_UNREACHABLE_ABBREV,
+    FUNCTION_INST_GEP_ABBREV,
+  };
+
+  // Cache some types
+  Type *I8Ty;
+  Type *I8PtrTy;
+
+  /// The stream created and owned by the client.
+  BitstreamWriter &Stream;
+
+  StringTableBuilder &StrtabBuilder;
+
+  /// The Module to write to bitcode.
+  const Module &M;
+
+  /// Enumerates ids for all values in the module.
+  ValueEnumerator VE;
+
+  /// Map that holds the correspondence between GUIDs in the summary index,
+  /// that came from indirect call profiles, and a value id generated by this
+  /// class to use in the VST and summary block records.
+  std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
+
+  /// Tracks the last value id recorded in the GUIDToValueMap.
+  unsigned GlobalValueId;
+
+  /// Saves the offset of the VSTOffset record that must eventually be
+  /// backpatched with the offset of the actual VST.
+  uint64_t VSTOffsetPlaceholder = 0;
+
+  /// Pointer to the buffer allocated by caller for bitcode writing.
+  const SmallVectorImpl<char> &Buffer;
+
+  /// The start bit of the identification block.
+  uint64_t BitcodeStartBit;
+
+  /// This maps values to their typed pointers
+  PointerTypeMap PointerMap;
+
+public:
+  /// Constructs a DXILBitcodeWriter object for the given Module,
+  /// writing to the provided \p Buffer.
+  DXILBitcodeWriter(const Module &M, SmallVectorImpl<char> &Buffer,
+                    StringTableBuilder &StrtabBuilder, BitstreamWriter &Stream)
+      : I8Ty(Type::getInt8Ty(M.getContext())),
+        I8PtrTy(TypedPointerType::get(I8Ty, 0)), Stream(Stream),
+        StrtabBuilder(StrtabBuilder), M(M), VE(M, I8PtrTy), Buffer(Buffer),
+        BitcodeStartBit(Stream.GetCurrentBitNo()),
+        PointerMap(PointerTypeAnalysis::run(M)) {
+    GlobalValueId = VE.getValues().size();
+    // Enumerate the typed pointers
+    for (auto El : PointerMap)
+      VE.EnumerateType(El.second);
+  }
+
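(A sketch of what the constructor's PointerMap loop is compensating for; the value name below is hypothetical. LLVM 15 IR may carry opaque ptr values, while DXIL needs the older typed-pointer form, so PointerTypeAnalysis recovers an element type per pointer-valued Value and the loop registers each synthesized TypedPointerType with the enumerator before any record can refer to it.)

    ; %buf has opaque type "ptr" in the incoming IR
    %buf = alloca i32
    ; PointerTypeAnalysis::run(M) would map:
    ;   %buf -> TypedPointerType::get(i32 /*pointee*/, 0 /*addrspace*/)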
+  /// Emit the current module to the bitstream.
+  void write();
+
+  static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind);
+  static void writeStringRecord(BitstreamWriter &Stream, unsigned Code,
+                                StringRef Str, unsigned AbbrevToUse);
+  static void writeIdentificationBlock(BitstreamWriter &Stream);
+  static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V);
+  static void emitWideAPInt(SmallVectorImpl<uint64_t> &Vals, const APInt &A);
+
+  static unsigned getEncodedComdatSelectionKind(const Comdat &C);
+  static unsigned getEncodedLinkage(const GlobalValue::LinkageTypes Linkage);
+  static unsigned getEncodedLinkage(const GlobalValue &GV);
+  static unsigned getEncodedVisibility(const GlobalValue &GV);
+  static unsigned getEncodedThreadLocalMode(const GlobalValue &GV);
+  static unsigned getEncodedDLLStorageClass(const GlobalValue &GV);
+  static unsigned getEncodedCastOpcode(unsigned Opcode);
+  static unsigned getEncodedUnaryOpcode(unsigned Opcode);
+  static unsigned getEncodedBinaryOpcode(unsigned Opcode);
+  static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op);
+  static unsigned getEncodedOrdering(AtomicOrdering Ordering);
+  static uint64_t getOptimizationFlags(const Value *V);
+
+private:
+  void writeModuleVersion();
+  void writePerModuleGlobalValueSummary();
+
+  void writePerModuleFunctionSummaryRecord(SmallVector<uint64_t, 64> &NameVals,
+                                           GlobalValueSummary *Summary,
+                                           unsigned ValueID,
+                                           unsigned FSCallsAbbrev,
+                                           unsigned FSCallsProfileAbbrev,
+                                           const Function &F);
+  void writeModuleLevelReferences(const GlobalVariable &V,
+                                  SmallVector<uint64_t, 64> &NameVals,
+                                  unsigned FSModRefsAbbrev,
+                                  unsigned FSModVTableRefsAbbrev);
+
+  void assignValueId(GlobalValue::GUID ValGUID) {
+    GUIDToValueIdMap[ValGUID] = ++GlobalValueId;
+  }
+
+  unsigned getValueId(GlobalValue::GUID ValGUID) {
+    const auto &VMI = GUIDToValueIdMap.find(ValGUID);
+    // Expect that any GUID value had a value Id assigned by an
+    // earlier call to assignValueId.
+    assert(VMI != GUIDToValueIdMap.end() &&
+           "GUID does not have assigned value Id");
+    return VMI->second;
+  }
+
+  // Helper to get the valueId for the type of value recorded in VI.
+  unsigned getValueId(ValueInfo VI) {
+    if (!VI.haveGVs() || !VI.getValue())
+      return getValueId(VI.getGUID());
+    return VE.getValueID(VI.getValue());
+  }
+
+  std::map<GlobalValue::GUID, unsigned> &valueIds() {
+    return GUIDToValueIdMap;
+  }
+
+  uint64_t bitcodeStartBit() { return BitcodeStartBit; }
+
+  size_t addToStrtab(StringRef Str);
+
+  unsigned createDILocationAbbrev();
+  unsigned createGenericDINodeAbbrev();
+
+  void writeAttributeGroupTable();
+  void writeAttributeTable();
+  void writeTypeTable();
+  void writeComdats();
+  void writeValueSymbolTableForwardDecl();
+  void writeModuleInfo();
+  void writeValueAsMetadata(const ValueAsMetadata *MD,
+                            SmallVectorImpl<uint64_t> &Record);
+  void writeMDTuple(const MDTuple *N, SmallVectorImpl<uint64_t> &Record,
+                    unsigned Abbrev);
+  void writeDILocation(const DILocation *N, SmallVectorImpl<uint64_t> &Record,
+                       unsigned &Abbrev);
+  void writeGenericDINode(const GenericDINode *N,
+                          SmallVectorImpl<uint64_t> &Record,
+                          unsigned &Abbrev) {
+    llvm_unreachable("DXIL cannot contain GenericDI Nodes");
+  }
+  void writeDISubrange(const DISubrange *N, SmallVectorImpl<uint64_t> &Record,
+                       unsigned Abbrev);
+  void writeDIGenericSubrange(const DIGenericSubrange *N,
+                              SmallVectorImpl<uint64_t> &Record,
+                              unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DIGenericSubrange Nodes");
+  }
+  void writeDIEnumerator(const DIEnumerator *N,
+                         SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDIBasicType(const DIBasicType *N, SmallVectorImpl<uint64_t> &Record,
+                        unsigned Abbrev);
+  void writeDIStringType(const DIStringType *N,
+                         SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DIStringType Nodes");
+  }
+  void writeDIDerivedType(const DIDerivedType *N,
+                          SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDICompositeType(const DICompositeType *N,
+                            SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDISubroutineType(const DISubroutineType *N,
+                             SmallVectorImpl<uint64_t> &Record,
+                             unsigned Abbrev);
+  void writeDIFile(const DIFile *N, SmallVectorImpl<uint64_t> &Record,
+                   unsigned Abbrev);
+  void writeDICompileUnit(const DICompileUnit *N,
+                          SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDISubprogram(const DISubprogram *N,
+                         SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDILexicalBlock(const DILexicalBlock *N,
+                           SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDILexicalBlockFile(const DILexicalBlockFile *N,
+                               SmallVectorImpl<uint64_t> &Record,
+                               unsigned Abbrev);
+  void writeDICommonBlock(const DICommonBlock *N,
+                          SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DICommonBlock Nodes");
+  }
+  void writeDINamespace(const DINamespace *N, SmallVectorImpl<uint64_t> &Record,
+                        unsigned Abbrev);
+  void writeDIMacro(const DIMacro *N, SmallVectorImpl<uint64_t> &Record,
+                    unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DIMacro Nodes");
+  }
+  void writeDIMacroFile(const DIMacroFile *N, SmallVectorImpl<uint64_t> &Record,
+                        unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DIMacroFile Nodes");
+  }
+  void writeDIArgList(const DIArgList *N, SmallVectorImpl<uint64_t> &Record,
+                      unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DIArgList Nodes");
+  }
+  void writeDIModule(const DIModule *N, SmallVectorImpl<uint64_t> &Record,
+                     unsigned Abbrev);
+  void writeDITemplateTypeParameter(const DITemplateTypeParameter *N,
+                                    SmallVectorImpl<uint64_t> &Record,
+                                    unsigned Abbrev);
+  void writeDITemplateValueParameter(const DITemplateValueParameter *N,
+                                     SmallVectorImpl<uint64_t> &Record,
+                                     unsigned Abbrev);
+  void writeDIGlobalVariable(const DIGlobalVariable *N,
+                             SmallVectorImpl<uint64_t> &Record,
+                             unsigned Abbrev);
+  void writeDILocalVariable(const DILocalVariable *N,
+                            SmallVectorImpl<uint64_t> &Record,
+                            unsigned Abbrev);
+  void writeDILabel(const DILabel *N, SmallVectorImpl<uint64_t> &Record,
+                    unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain DILabel Nodes");
+  }
+  void writeDIExpression(const DIExpression *N,
+                         SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDIGlobalVariableExpression(const DIGlobalVariableExpression *N,
+                                       SmallVectorImpl<uint64_t> &Record,
+                                       unsigned Abbrev) {
+    llvm_unreachable("DXIL cannot contain GlobalVariableExpression Nodes");
+  }
+  void writeDIObjCProperty(const DIObjCProperty *N,
+                           SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDIImportedEntity(const DIImportedEntity *N,
+                             SmallVectorImpl<uint64_t> &Record,
+                             unsigned Abbrev);
+  unsigned createNamedMetadataAbbrev();
+  void writeNamedMetadata(SmallVectorImpl<uint64_t> &Record);
+  unsigned createMetadataStringsAbbrev();
+  void writeMetadataStrings(ArrayRef<const Metadata *> Strings,
+                            SmallVectorImpl<uint64_t> &Record);
+  void writeMetadataRecords(ArrayRef<const Metadata *> MDs,
+                            SmallVectorImpl<uint64_t> &Record,
+                            std::vector<unsigned> *MDAbbrevs = nullptr,
+                            std::vector<uint64_t> *IndexPos = nullptr);
+  void writeModuleMetadata();
+  void writeFunctionMetadata(const Function &F);
+  void writeFunctionMetadataAttachment(const Function &F);
+  void pushGlobalMetadataAttachment(SmallVectorImpl<uint64_t> &Record,
+                                    const GlobalObject &GO);
+  void writeModuleMetadataKinds();
+  void writeOperandBundleTags();
+  void writeSyncScopeNames();
+  void writeConstants(unsigned FirstVal, unsigned LastVal, bool isGlobal);
+  void writeModuleConstants();
+  bool pushValueAndType(const Value *V, unsigned InstID,
+                        SmallVectorImpl<unsigned> &Vals);
+  void writeOperandBundles(const CallBase &CB, unsigned InstID);
+  void pushValue(const Value *V, unsigned InstID,
+                 SmallVectorImpl<unsigned> &Vals);
+  void pushValueSigned(const Value *V, unsigned InstID,
+                       SmallVectorImpl<uint64_t> &Vals);
+  void writeInstruction(const Instruction &I, unsigned InstID,
+                        SmallVectorImpl<unsigned> &Vals);
+  void writeFunctionLevelValueSymbolTable(const ValueSymbolTable &VST);
+  void writeGlobalValueSymbolTable(
+      DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
+  void writeUseList(UseListOrder &&Order);
+  void writeUseListBlock(const Function *F);
+  void writeFunction(const Function &F);
+  void writeBlockInfo();
+
+  unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); }
+
+  unsigned getEncodedAlign(MaybeAlign Alignment) { return encode(Alignment); }
+
+  unsigned getTypeID(Type *T, const Value *V = nullptr);
+  unsigned getTypeID(Type *T, const Function *F);
+};
+
+} // namespace dxil
+} // namespace llvm
+
+using namespace llvm;
+using namespace llvm::dxil;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Begin dxil::BitcodeWriter Implementation
+////////////////////////////////////////////////////////////////////////////////
+
+dxil::BitcodeWriter::BitcodeWriter(SmallVectorImpl<char> &Buffer,
+                                   raw_fd_stream *FS)
+    : Buffer(Buffer), Stream(new BitstreamWriter(Buffer, FS, 512)) {
+  // Emit the file header.
+  Stream->Emit((unsigned)'B', 8);
+  Stream->Emit((unsigned)'C', 8);
+  Stream->Emit(0x0, 4);
+  Stream->Emit(0xC, 4);
+  Stream->Emit(0xE, 4);
+  Stream->Emit(0xD, 4);
+}
+
+dxil::BitcodeWriter::~BitcodeWriter() { assert(WroteStrtab); }
+
+/// Write the specified module to the specified output stream.
+void dxil::WriteDXILToFile(const Module &M, raw_ostream &Out) {
+  SmallVector<char, 0> Buffer;
+  Buffer.reserve(256 * 1024);
+
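(An aside on the header emitted by the constructor above: the two byte-wide Emit calls write 'B' and 'C', and the four nibbles pack little-endian into the bytes 0xC0 and 0xDE, so every stream opens with the standard bitcode magic. A hexdump of a fresh output would begin roughly:)

    00000000  42 43 c0 de ...                                   |BC..|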
+  // If this is darwin or another generic macho target, reserve space for the
+  // header.
+  Triple TT(M.getTargetTriple());
+  if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
+    Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
+
+  BitcodeWriter Writer(Buffer, dyn_cast<raw_fd_stream>(&Out));
+  Writer.writeModule(M);
+  Writer.writeSymtab();
+  Writer.writeStrtab();
+
+  // Write the generated bitstream to "Out".
+  if (!Buffer.empty())
+    Out.write((char *)&Buffer.front(), Buffer.size());
+}
+
+void BitcodeWriter::writeBlob(unsigned Block, unsigned Record, StringRef Blob) {
+  Stream->EnterSubblock(Block, 3);
+
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(Record));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+  auto AbbrevNo = Stream->EmitAbbrev(std::move(Abbv));
+
+  Stream->EmitRecordWithBlob(AbbrevNo, ArrayRef<unsigned>{Record}, Blob);
+
+  Stream->ExitBlock();
+}
+
+void BitcodeWriter::writeSymtab() {
+  assert(!WroteStrtab && !WroteSymtab);
+
+  // If any module has module-level inline asm, we will require a registered
+  // asm parser for the target so that we can create an accurate symbol table
+  // for the module.
+  for (Module *M : Mods) {
+    if (M->getModuleInlineAsm().empty())
+      continue;
+  }
+
+  WroteSymtab = true;
+  SmallVector<char, 0> Symtab;
+  // The irsymtab::build function may be unable to create a symbol table if the
+  // module is malformed (e.g. it contains an invalid alias). Writing a symbol
+  // table is not required for correctness, but we still want to be able to
+  // write malformed modules to bitcode files, so swallow the error.
+  if (Error E = irsymtab::build(Mods, Symtab, StrtabBuilder, Alloc)) {
+    consumeError(std::move(E));
+    return;
+  }
+
+  writeBlob(bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB,
+            {Symtab.data(), Symtab.size()});
+}
+
+void BitcodeWriter::writeStrtab() {
+  assert(!WroteStrtab);
+
+  std::vector<char> Strtab;
+  StrtabBuilder.finalizeInOrder();
+  Strtab.resize(StrtabBuilder.getSize());
+  StrtabBuilder.write((uint8_t *)Strtab.data());
+
+  writeBlob(bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB,
+            {Strtab.data(), Strtab.size()});
+
+  WroteStrtab = true;
+}
+
+void BitcodeWriter::copyStrtab(StringRef Strtab) {
+  writeBlob(bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB, Strtab);
+  WroteStrtab = true;
+}
+
+void BitcodeWriter::writeModule(const Module &M) {
+  assert(!WroteStrtab);
+
+  // The Mods vector is used by irsymtab::build, which requires non-const
+  // Modules in case it needs to materialize metadata. But the bitcode writer
+  // requires that the module is materialized, so we can cast to non-const
+  // here, after checking that it is in fact materialized.
+  assert(M.isMaterialized());
+  Mods.push_back(const_cast<Module *>(&M));
+
+  DXILBitcodeWriter ModuleWriter(M, Buffer, StrtabBuilder, *Stream);
+  ModuleWriter.write();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Begin dxil::BitcodeWriterBase Implementation
+////////////////////////////////////////////////////////////////////////////////
+
+unsigned DXILBitcodeWriter::getEncodedCastOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Unknown cast instruction!");
+  case Instruction::Trunc:
+    return bitc::CAST_TRUNC;
+  case Instruction::ZExt:
+    return bitc::CAST_ZEXT;
+  case Instruction::SExt:
+    return bitc::CAST_SEXT;
+  case Instruction::FPToUI:
+    return bitc::CAST_FPTOUI;
+  case Instruction::FPToSI:
+    return bitc::CAST_FPTOSI;
+  case Instruction::UIToFP:
+    return bitc::CAST_UITOFP;
+  case Instruction::SIToFP:
+    return bitc::CAST_SITOFP;
+  case Instruction::FPTrunc:
+    return bitc::CAST_FPTRUNC;
+  case Instruction::FPExt:
+    return bitc::CAST_FPEXT;
+  case Instruction::PtrToInt:
+    return bitc::CAST_PTRTOINT;
+  case Instruction::IntToPtr:
+    return bitc::CAST_INTTOPTR;
+  case Instruction::BitCast:
+    return bitc::CAST_BITCAST;
+  case Instruction::AddrSpaceCast:
+    return bitc::CAST_ADDRSPACECAST;
+  }
+}
+
+unsigned DXILBitcodeWriter::getEncodedUnaryOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Unknown unary instruction!");
+  case Instruction::FNeg:
+    return bitc::UNOP_FNEG;
+  }
+}
+
+unsigned DXILBitcodeWriter::getEncodedBinaryOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Unknown binary instruction!");
+  case Instruction::Add:
+  case Instruction::FAdd:
+    return bitc::BINOP_ADD;
+  case Instruction::Sub:
+  case Instruction::FSub:
+    return bitc::BINOP_SUB;
+  case Instruction::Mul:
+  case Instruction::FMul:
+    return bitc::BINOP_MUL;
+  case Instruction::UDiv:
+    return bitc::BINOP_UDIV;
+  case Instruction::FDiv:
+  case Instruction::SDiv:
+    return bitc::BINOP_SDIV;
+  case Instruction::URem:
+    return bitc::BINOP_UREM;
+  case Instruction::FRem:
+  case Instruction::SRem:
+    return bitc::BINOP_SREM;
+  case Instruction::Shl:
+    return bitc::BINOP_SHL;
+  case Instruction::LShr:
+    return bitc::BINOP_LSHR;
+  case Instruction::AShr:
+    return bitc::BINOP_ASHR;
+  case Instruction::And:
+    return bitc::BINOP_AND;
+  case Instruction::Or:
+    return bitc::BINOP_OR;
+  case Instruction::Xor:
+    return bitc::BINOP_XOR;
+  }
+}
+
+unsigned DXILBitcodeWriter::getTypeID(Type *T, const Value *V) {
+  if (!T->isOpaquePointerTy())
+    return VE.getTypeID(T);
+  auto It = PointerMap.find(V);
+  if (It != PointerMap.end())
+    return VE.getTypeID(It->second);
+  return VE.getTypeID(I8PtrTy);
+}
+
+unsigned DXILBitcodeWriter::getTypeID(Type *T, const Function *F) {
+  auto It = PointerMap.find(F);
+  if (It != PointerMap.end())
+    return VE.getTypeID(It->second);
+  return VE.getTypeID(T);
+}
+
+unsigned DXILBitcodeWriter::getEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default:
+    llvm_unreachable("Unknown RMW operation!");
+  case AtomicRMWInst::Xchg:
+    return bitc::RMW_XCHG;
+  case AtomicRMWInst::Add:
+    return bitc::RMW_ADD;
+  case AtomicRMWInst::Sub:
+    return bitc::RMW_SUB;
+  case AtomicRMWInst::And:
+    return bitc::RMW_AND;
+  case AtomicRMWInst::Nand:
+    return bitc::RMW_NAND;
+  case AtomicRMWInst::Or:
+    return bitc::RMW_OR;
+  case AtomicRMWInst::Xor:
+    return bitc::RMW_XOR;
+  case AtomicRMWInst::Max:
+    return bitc::RMW_MAX;
+  case AtomicRMWInst::Min:
+    return bitc::RMW_MIN;
+  case AtomicRMWInst::UMax:
+    return bitc::RMW_UMAX;
+  case AtomicRMWInst::UMin:
+    return bitc::RMW_UMIN;
+  case AtomicRMWInst::FAdd:
+    return bitc::RMW_FADD;
+  case AtomicRMWInst::FSub:
+    return bitc::RMW_FSUB;
+  }
+}
+
+unsigned DXILBitcodeWriter::getEncodedOrdering(AtomicOrdering Ordering) {
+  switch (Ordering) {
+  case AtomicOrdering::NotAtomic:
+    return bitc::ORDERING_NOTATOMIC;
+  case AtomicOrdering::Unordered:
+    return bitc::ORDERING_UNORDERED;
+  case AtomicOrdering::Monotonic:
+    return bitc::ORDERING_MONOTONIC;
+  case AtomicOrdering::Acquire:
+    return bitc::ORDERING_ACQUIRE;
+  case AtomicOrdering::Release:
+    return bitc::ORDERING_RELEASE;
+  case AtomicOrdering::AcquireRelease:
+    return bitc::ORDERING_ACQREL;
+  case AtomicOrdering::SequentiallyConsistent:
+    return bitc::ORDERING_SEQCST;
+  }
+  llvm_unreachable("Invalid ordering");
+}
+
+void DXILBitcodeWriter::writeStringRecord(BitstreamWriter &Stream,
+                                          unsigned Code, StringRef Str,
+                                          unsigned AbbrevToUse) {
+  SmallVector<unsigned, 64> Vals;
+
+  // Code: [strchar x N]
+  for (char C : Str) {
+    if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(C))
+      AbbrevToUse = 0;
+    Vals.push_back(C);
+  }
+
+  // Emit the finished record.
+  Stream.EmitRecord(Code, Vals, AbbrevToUse);
+}
+
+uint64_t DXILBitcodeWriter::getAttrKindEncoding(Attribute::AttrKind Kind) {
+  switch (Kind) {
+  case Attribute::Alignment:
+    return bitc::ATTR_KIND_ALIGNMENT;
+  case Attribute::AlwaysInline:
+    return bitc::ATTR_KIND_ALWAYS_INLINE;
+  case Attribute::ArgMemOnly:
+    return bitc::ATTR_KIND_ARGMEMONLY;
+  case Attribute::Builtin:
+    return bitc::ATTR_KIND_BUILTIN;
+  case Attribute::ByVal:
+    return bitc::ATTR_KIND_BY_VAL;
+  case Attribute::Convergent:
+    return bitc::ATTR_KIND_CONVERGENT;
+  case Attribute::InAlloca:
+    return bitc::ATTR_KIND_IN_ALLOCA;
+  case Attribute::Cold:
+    return bitc::ATTR_KIND_COLD;
+  case Attribute::InlineHint:
+    return bitc::ATTR_KIND_INLINE_HINT;
+  case Attribute::InReg:
+    return bitc::ATTR_KIND_IN_REG;
+  case Attribute::JumpTable:
+    return bitc::ATTR_KIND_JUMP_TABLE;
+  case Attribute::MinSize:
+    return bitc::ATTR_KIND_MIN_SIZE;
+  case Attribute::Naked:
+    return bitc::ATTR_KIND_NAKED;
+  case Attribute::Nest:
+    return bitc::ATTR_KIND_NEST;
+  case Attribute::NoAlias:
+    return bitc::ATTR_KIND_NO_ALIAS;
+  case Attribute::NoBuiltin:
+    return bitc::ATTR_KIND_NO_BUILTIN;
+  case Attribute::NoCapture:
+    return bitc::ATTR_KIND_NO_CAPTURE;
+  case Attribute::NoDuplicate:
+    return bitc::ATTR_KIND_NO_DUPLICATE;
+  case Attribute::NoImplicitFloat:
+    return bitc::ATTR_KIND_NO_IMPLICIT_FLOAT;
+  case Attribute::NoInline:
+    return bitc::ATTR_KIND_NO_INLINE;
+  case Attribute::NonLazyBind:
+    return bitc::ATTR_KIND_NON_LAZY_BIND;
+  case Attribute::NonNull:
+    return bitc::ATTR_KIND_NON_NULL;
+  case Attribute::Dereferenceable:
+    return bitc::ATTR_KIND_DEREFERENCEABLE;
+  case Attribute::DereferenceableOrNull:
+    return bitc::ATTR_KIND_DEREFERENCEABLE_OR_NULL;
+  case Attribute::NoRedZone:
+    return bitc::ATTR_KIND_NO_RED_ZONE;
+  case Attribute::NoReturn:
+    return bitc::ATTR_KIND_NO_RETURN;
+  case Attribute::NoUnwind:
+    return bitc::ATTR_KIND_NO_UNWIND;
+  case Attribute::OptimizeForSize:
+    return bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE;
+  case Attribute::OptimizeNone:
+    return bitc::ATTR_KIND_OPTIMIZE_NONE;
+  case Attribute::ReadNone:
+    return bitc::ATTR_KIND_READ_NONE;
+  case Attribute::ReadOnly:
+    return bitc::ATTR_KIND_READ_ONLY;
+  case Attribute::Returned:
+    return bitc::ATTR_KIND_RETURNED;
+  case Attribute::ReturnsTwice:
+    return bitc::ATTR_KIND_RETURNS_TWICE;
+  case Attribute::SExt:
+    return bitc::ATTR_KIND_S_EXT;
+  case Attribute::StackAlignment:
+    return bitc::ATTR_KIND_STACK_ALIGNMENT;
+  case Attribute::StackProtect:
+    return bitc::ATTR_KIND_STACK_PROTECT;
+  case Attribute::StackProtectReq:
+    return bitc::ATTR_KIND_STACK_PROTECT_REQ;
+  case Attribute::StackProtectStrong:
+    return bitc::ATTR_KIND_STACK_PROTECT_STRONG;
+  case Attribute::SafeStack:
+    return bitc::ATTR_KIND_SAFESTACK;
+  case Attribute::StructRet:
+    return bitc::ATTR_KIND_STRUCT_RET;
+  case Attribute::SanitizeAddress:
+    return bitc::ATTR_KIND_SANITIZE_ADDRESS;
+  case Attribute::SanitizeThread:
+    return bitc::ATTR_KIND_SANITIZE_THREAD;
+  case Attribute::SanitizeMemory:
+    return bitc::ATTR_KIND_SANITIZE_MEMORY;
+  case Attribute::UWTable:
+    return bitc::ATTR_KIND_UW_TABLE;
+  case Attribute::ZExt:
+    return bitc::ATTR_KIND_Z_EXT;
+  case Attribute::EndAttrKinds:
+    llvm_unreachable("Can not encode end-attribute kinds marker.");
+  case Attribute::None:
+    llvm_unreachable("Can not encode none-attribute.");
+  case Attribute::EmptyKey:
+  case Attribute::TombstoneKey:
+    llvm_unreachable("Trying to encode EmptyKey/TombstoneKey");
+  default:
+    llvm_unreachable("Trying to encode attribute not supported by DXIL. These "
+                     "should be stripped in DXILPrepare");
+  }
+
+  llvm_unreachable("Trying to encode unknown attribute");
+}
+
+void DXILBitcodeWriter::emitSignedInt64(SmallVectorImpl<uint64_t> &Vals,
+                                        uint64_t V) {
+  if ((int64_t)V >= 0)
+    Vals.push_back(V << 1);
+  else
+    Vals.push_back((-V << 1) | 1);
+}
+
+void DXILBitcodeWriter::emitWideAPInt(SmallVectorImpl<uint64_t> &Vals,
+                                      const APInt &A) {
+  // We have an arbitrary precision integer value to write whose
+  // bit width is > 64. However, in canonical unsigned integer
+  // format it is likely that the high bits are going to be zero.
+  // So, we only write the number of active words.
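(Worked through on small values, chosen for illustration: emitSignedInt64 rotates the sign into bit 0 so that small magnitudes stay small under VBR encoding, so 3 becomes 6, i.e. 3 << 1, while -3 becomes 7, i.e. (3 << 1) | 1. emitWideAPInt below simply applies that rotation to each active 64-bit word, least significant first: an APInt whose raw words are {0xFF, 0x1} is emitted as {0x1FE, 0x2}.)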
+  unsigned NumWords = A.getActiveWords();
+  const uint64_t *RawData = A.getRawData();
+  for (unsigned i = 0; i < NumWords; i++)
+    emitSignedInt64(Vals, RawData[i]);
+}
+
+uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) {
+  uint64_t Flags = 0;
+
+  if (const auto *OBO = dyn_cast<OverflowingBinaryOperator>(V)) {
+    if (OBO->hasNoSignedWrap())
+      Flags |= 1 << bitc::OBO_NO_SIGNED_WRAP;
+    if (OBO->hasNoUnsignedWrap())
+      Flags |= 1 << bitc::OBO_NO_UNSIGNED_WRAP;
+  } else if (const auto *PEO = dyn_cast<PossiblyExactOperator>(V)) {
+    if (PEO->isExact())
+      Flags |= 1 << bitc::PEO_EXACT;
+  } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
+    if (FPMO->hasAllowReassoc())
+      Flags |= bitc::AllowReassoc;
+    if (FPMO->hasNoNaNs())
+      Flags |= bitc::NoNaNs;
+    if (FPMO->hasNoInfs())
+      Flags |= bitc::NoInfs;
+    if (FPMO->hasNoSignedZeros())
+      Flags |= bitc::NoSignedZeros;
+    if (FPMO->hasAllowReciprocal())
+      Flags |= bitc::AllowReciprocal;
+    if (FPMO->hasAllowContract())
+      Flags |= bitc::AllowContract;
+    if (FPMO->hasApproxFunc())
+      Flags |= bitc::ApproxFunc;
+  }
+
+  return Flags;
+}
+
+unsigned
+DXILBitcodeWriter::getEncodedLinkage(const GlobalValue::LinkageTypes Linkage) {
+  switch (Linkage) {
+  case GlobalValue::ExternalLinkage:
+    return 0;
+  case GlobalValue::WeakAnyLinkage:
+    return 16;
+  case GlobalValue::AppendingLinkage:
+    return 2;
+  case GlobalValue::InternalLinkage:
+    return 3;
+  case GlobalValue::LinkOnceAnyLinkage:
+    return 18;
+  case GlobalValue::ExternalWeakLinkage:
+    return 7;
+  case GlobalValue::CommonLinkage:
+    return 8;
+  case GlobalValue::PrivateLinkage:
+    return 9;
+  case GlobalValue::WeakODRLinkage:
+    return 17;
+  case GlobalValue::LinkOnceODRLinkage:
+    return 19;
+  case GlobalValue::AvailableExternallyLinkage:
+    return 12;
+  }
+  llvm_unreachable("Invalid linkage");
+}
+
+unsigned DXILBitcodeWriter::getEncodedLinkage(const GlobalValue &GV) {
+  return getEncodedLinkage(GV.getLinkage());
+}
+
+unsigned DXILBitcodeWriter::getEncodedVisibility(const GlobalValue &GV) {
+  switch (GV.getVisibility()) {
+  case GlobalValue::DefaultVisibility:
+    return 0;
+  case GlobalValue::HiddenVisibility:
+    return 1;
+  case GlobalValue::ProtectedVisibility:
+    return 2;
+  }
+  llvm_unreachable("Invalid visibility");
+}
+
+unsigned DXILBitcodeWriter::getEncodedDLLStorageClass(const GlobalValue &GV) {
+  switch (GV.getDLLStorageClass()) {
+  case GlobalValue::DefaultStorageClass:
+    return 0;
+  case GlobalValue::DLLImportStorageClass:
+    return 1;
+  case GlobalValue::DLLExportStorageClass:
+    return 2;
+  }
+  llvm_unreachable("Invalid DLL storage class");
+}
+
+unsigned DXILBitcodeWriter::getEncodedThreadLocalMode(const GlobalValue &GV) {
+  switch (GV.getThreadLocalMode()) {
+  case GlobalVariable::NotThreadLocal:
+    return 0;
+  case GlobalVariable::GeneralDynamicTLSModel:
+    return 1;
+  case GlobalVariable::LocalDynamicTLSModel:
+    return 2;
+  case GlobalVariable::InitialExecTLSModel:
+    return 3;
+  case GlobalVariable::LocalExecTLSModel:
+    return 4;
+  }
+  llvm_unreachable("Invalid TLS model");
+}
+
+unsigned DXILBitcodeWriter::getEncodedComdatSelectionKind(const Comdat &C) {
+  switch (C.getSelectionKind()) {
+  case Comdat::Any:
+    return bitc::COMDAT_SELECTION_KIND_ANY;
+  case Comdat::ExactMatch:
+    return bitc::COMDAT_SELECTION_KIND_EXACT_MATCH;
+  case Comdat::Largest:
+    return bitc::COMDAT_SELECTION_KIND_LARGEST;
+  case Comdat::NoDeduplicate:
+    return bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES;
+  case Comdat::SameSize:
+    return bitc::COMDAT_SELECTION_KIND_SAME_SIZE;
+  }
+  llvm_unreachable("Invalid selection kind");
+}
+
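A minimal usage sketch for the entry point defined earlier; the function name, output path, and error handling here are illustrative assumptions, not part of the patch:

    #include "DXILBitcodeWriter.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"

    // Serialize a materialized module as DXIL-flavored bitcode.
    void emitDXILModule(llvm::Module &M) {
      std::error_code EC;
      llvm::raw_fd_ostream Out("shader.bc", EC); // hypothetical output path
      if (!EC)
        llvm::dxil::WriteDXILToFile(M, Out);
    }

Note that WriteDXILToFile only streams incrementally when handed a raw_fd_stream; any other raw_ostream, as above, takes the fully buffered path and is written out in one final Out.write call.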
+////////////////////////////////////////////////////////////////////////////////
+/// Begin DXILBitcodeWriter Implementation
+////////////////////////////////////////////////////////////////////////////////
+
+void DXILBitcodeWriter::writeAttributeGroupTable() {
+  const std::vector<ValueEnumerator::IndexAndAttrSet> &AttrGrps =
+      VE.getAttributeGroups();
+  if (AttrGrps.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+  for (ValueEnumerator::IndexAndAttrSet Pair : AttrGrps) {
+    unsigned AttrListIndex = Pair.first;
+    AttributeSet AS = Pair.second;
+    Record.push_back(VE.getAttributeGroupID(Pair));
+    Record.push_back(AttrListIndex);
+
+    for (Attribute Attr : AS) {
+      if (Attr.isEnumAttribute()) {
+        uint64_t Val = getAttrKindEncoding(Attr.getKindAsEnum());
+        assert(Val <= bitc::ATTR_KIND_ARGMEMONLY &&
+               "DXIL does not support attributes above ATTR_KIND_ARGMEMONLY");
+        Record.push_back(0);
+        Record.push_back(Val);
+      } else if (Attr.isIntAttribute()) {
+        uint64_t Val = getAttrKindEncoding(Attr.getKindAsEnum());
+        assert(Val <= bitc::ATTR_KIND_ARGMEMONLY &&
+               "DXIL does not support attributes above ATTR_KIND_ARGMEMONLY");
+        Record.push_back(1);
+        Record.push_back(Val);
+        Record.push_back(Attr.getValueAsInt());
+      } else {
+        StringRef Kind = Attr.getKindAsString();
+        StringRef Val = Attr.getValueAsString();
+
+        Record.push_back(Val.empty() ? 3 : 4);
+        Record.append(Kind.begin(), Kind.end());
+        Record.push_back(0);
+        if (!Val.empty()) {
+          Record.append(Val.begin(), Val.end());
+          Record.push_back(0);
+        }
+      }
+    }
+
+    Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeAttributeTable() {
+  const std::vector<AttributeList> &Attrs = VE.getAttributeLists();
+  if (Attrs.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+    AttributeList AL = Attrs[i];
+    for (unsigned i : AL.indexes()) {
+      AttributeSet AS = AL.getAttributes(i);
+      if (AS.hasAttributes())
+        Record.push_back(VE.getAttributeGroupID({i, AS}));
+    }
+
+    Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+/// WriteTypeTable - Write out the type table for a module.
+void DXILBitcodeWriter::writeTypeTable() {
+  const ValueEnumerator::TypeList &TypeList = VE.getTypes();
+
+  Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */);
+  SmallVector<uint64_t, 64> TypeVals;
+
+  uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies();
+
+  // Abbrev for TYPE_CODE_POINTER.
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0
+  unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  // Abbrev for TYPE_CODE_FUNCTION.
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
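(How to read these abbreviation definitions; these are general bitstream facts, not specific to this patch: a literal operand such as BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION) costs no bits per record; Fixed(N) stores a field in exactly N bits; VBR(N) uses N-bit variable-width chunks; Array means a VBR6 element count followed by repetitions of the operand that follows it; Char6 is the dense [a-zA-Z0-9._] text encoding. The TYPE_CODE_FUNCTION abbrev above therefore describes records of the shape [isvararg:1 bit, count, return/param type indices of NumBits each].)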
+  // Abbrev for TYPE_CODE_STRUCT_ANON.
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  // Abbrev for TYPE_CODE_STRUCT_NAME.
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+  unsigned StructNameAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  // Abbrev for TYPE_CODE_STRUCT_NAMED.
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  // Abbrev for TYPE_CODE_ARRAY.
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  // Emit an entry count so the reader can reserve space.
+  TypeVals.push_back(TypeList.size());
+  Stream.EmitRecord(bitc::TYPE_CODE_NUMENTRY, TypeVals);
+  TypeVals.clear();
+
+  // Loop over all of the types, emitting each in turn.
+  for (Type *T : TypeList) {
+    int AbbrevToUse = 0;
+    unsigned Code = 0;
+
+    switch (T->getTypeID()) {
+    case Type::BFloatTyID:
+    case Type::X86_AMXTyID:
+    case Type::TokenTyID:
+      llvm_unreachable("These should never be used!!!");
+      break;
+    case Type::VoidTyID:
+      Code = bitc::TYPE_CODE_VOID;
+      break;
+    case Type::HalfTyID:
+      Code = bitc::TYPE_CODE_HALF;
+      break;
+    case Type::FloatTyID:
+      Code = bitc::TYPE_CODE_FLOAT;
+      break;
+    case Type::DoubleTyID:
+      Code = bitc::TYPE_CODE_DOUBLE;
+      break;
+    case Type::X86_FP80TyID:
+      Code = bitc::TYPE_CODE_X86_FP80;
+      break;
+    case Type::FP128TyID:
+      Code = bitc::TYPE_CODE_FP128;
+      break;
+    case Type::PPC_FP128TyID:
+      Code = bitc::TYPE_CODE_PPC_FP128;
+      break;
+    case Type::LabelTyID:
+      Code = bitc::TYPE_CODE_LABEL;
+      break;
+    case Type::MetadataTyID:
+      Code = bitc::TYPE_CODE_METADATA;
+      break;
+    case Type::X86_MMXTyID:
+      Code = bitc::TYPE_CODE_X86_MMX;
+      break;
+    case Type::IntegerTyID:
+      // INTEGER: [width]
+      Code = bitc::TYPE_CODE_INTEGER;
+      TypeVals.push_back(cast<IntegerType>(T)->getBitWidth());
+      break;
+    case Type::DXILPointerTyID: {
+      TypedPointerType *PTy = cast<TypedPointerType>(T);
+      // POINTER: [pointee type, address space]
+      Code = bitc::TYPE_CODE_POINTER;
+      TypeVals.push_back(getTypeID(PTy->getElementType()));
+      unsigned AddressSpace = PTy->getAddressSpace();
+      TypeVals.push_back(AddressSpace);
+      if (AddressSpace == 0)
+        AbbrevToUse = PtrAbbrev;
+      break;
+    }
+    case Type::PointerTyID: {
+      PointerType *PTy = cast<PointerType>(T);
+      // POINTER: [pointee type, address space]
+      Code = bitc::TYPE_CODE_POINTER;
+      // Emitting an empty struct type for the opaque pointer's type allows
+      // this to be order-independent. Non-struct types must be emitted in
+      // bitcode before they can be referenced.
+      if (PTy->isOpaquePointerTy()) {
+        TypeVals.push_back(false);
+        Code = bitc::TYPE_CODE_OPAQUE;
+        writeStringRecord(Stream, bitc::TYPE_CODE_STRUCT_NAME,
+                          "dxilOpaquePtrReservedName", StructNameAbbrev);
+      } else {
+        TypeVals.push_back(getTypeID(PTy->getNonOpaquePointerElementType()));
+        unsigned AddressSpace = PTy->getAddressSpace();
+        TypeVals.push_back(AddressSpace);
+        if (AddressSpace == 0)
+          AbbrevToUse = PtrAbbrev;
+      }
+      break;
+    }
+    case Type::FunctionTyID: {
+      FunctionType *FT = cast<FunctionType>(T);
+      // FUNCTION: [isvararg, retty, paramty x N]
+      Code = bitc::TYPE_CODE_FUNCTION;
+      TypeVals.push_back(FT->isVarArg());
+      TypeVals.push_back(getTypeID(FT->getReturnType()));
+      for (Type *PTy : FT->params())
+        TypeVals.push_back(getTypeID(PTy));
+      AbbrevToUse = FunctionAbbrev;
+      break;
+    }
+    case Type::StructTyID: {
+      StructType *ST = cast<StructType>(T);
+      // STRUCT: [ispacked, eltty x N]
+      TypeVals.push_back(ST->isPacked());
+      // Output all of the element types.
+      for (Type *ElTy : ST->elements())
+        TypeVals.push_back(getTypeID(ElTy));
+
+      if (ST->isLiteral()) {
+        Code = bitc::TYPE_CODE_STRUCT_ANON;
+        AbbrevToUse = StructAnonAbbrev;
+      } else {
+        if (ST->isOpaque()) {
+          Code = bitc::TYPE_CODE_OPAQUE;
+        } else {
+          Code = bitc::TYPE_CODE_STRUCT_NAMED;
+          AbbrevToUse = StructNamedAbbrev;
+        }
+
+        // Emit the name if it is present.
+        if (!ST->getName().empty())
+          writeStringRecord(Stream, bitc::TYPE_CODE_STRUCT_NAME, ST->getName(),
+                            StructNameAbbrev);
+      }
+      break;
+    }
+    case Type::ArrayTyID: {
+      ArrayType *AT = cast<ArrayType>(T);
+      // ARRAY: [numelts, eltty]
+      Code = bitc::TYPE_CODE_ARRAY;
+      TypeVals.push_back(AT->getNumElements());
+      TypeVals.push_back(getTypeID(AT->getElementType()));
+      AbbrevToUse = ArrayAbbrev;
+      break;
+    }
+    case Type::FixedVectorTyID:
+    case Type::ScalableVectorTyID: {
+      VectorType *VT = cast<VectorType>(T);
+      // VECTOR [numelts, eltty]
+      Code = bitc::TYPE_CODE_VECTOR;
+      TypeVals.push_back(VT->getElementCount().getKnownMinValue());
+      TypeVals.push_back(getTypeID(VT->getElementType()));
+      break;
+    }
+    }
+
+    // Emit the finished record.
+    Stream.EmitRecord(Code, TypeVals, AbbrevToUse);
+    TypeVals.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeComdats() {
+  SmallVector<unsigned, 64> Vals;
+  for (const Comdat *C : VE.getComdats()) {
+    // COMDAT: [selection_kind, name]
+    Vals.push_back(getEncodedComdatSelectionKind(*C));
+    size_t Size = C->getName().size();
+    assert(isUInt<16>(Size));
+    Vals.push_back(Size);
+    for (char Chr : C->getName())
+      Vals.push_back((unsigned char)Chr);
+    Stream.EmitRecord(bitc::MODULE_CODE_COMDAT, Vals, /*AbbrevToUse=*/0);
+    Vals.clear();
+  }
+}
+
+void DXILBitcodeWriter::writeValueSymbolTableForwardDecl() {}
+
+/// Emit top-level description of module, including target triple, inline asm,
+/// descriptors for global variables, and function prototype info.
+/// Returns the bit offset to backpatch with the location of the real VST.
+void DXILBitcodeWriter::writeModuleInfo() {
+  // Emit various pieces of data attached to a module.
+  if (!M.getTargetTriple().empty())
+    writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, M.getTargetTriple(),
+                      0 /*TODO*/);
+  const std::string &DL = M.getDataLayoutStr();
+  if (!DL.empty())
+    writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
+  if (!M.getModuleInlineAsm().empty())
+    writeStringRecord(Stream, bitc::MODULE_CODE_ASM, M.getModuleInlineAsm(),
+                      0 /*TODO*/);
+
+  // Emit information about sections and GC, computing how many there are.
+  // Also compute the maximum alignment value.
+  std::map<std::string, unsigned> SectionMap;
+  std::map<std::string, unsigned> GCMap;
+  MaybeAlign MaxAlignment;
+  unsigned MaxGlobalType = 0;
+  const auto UpdateMaxAlignment = [&MaxAlignment](const MaybeAlign A) {
+    if (A)
+      MaxAlignment = !MaxAlignment ? *A : std::max(*MaxAlignment, *A);
+  };
+  for (const GlobalVariable &GV : M.globals()) {
+    UpdateMaxAlignment(GV.getAlign());
+    MaxGlobalType = std::max(MaxGlobalType, getTypeID(GV.getValueType(), &GV));
+    if (GV.hasSection()) {
+      // Give section names unique ID's.
+      unsigned &Entry = SectionMap[std::string(GV.getSection())];
+      if (!Entry) {
+        writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME,
+                          GV.getSection(), 0 /*TODO*/);
+        Entry = SectionMap.size();
+      }
+    }
+  }
+  for (const Function &F : M) {
+    UpdateMaxAlignment(F.getAlign());
+    if (F.hasSection()) {
+      // Give section names unique ID's.
+      unsigned &Entry = SectionMap[std::string(F.getSection())];
+      if (!Entry) {
+        writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME,
+                          F.getSection(), 0 /*TODO*/);
+        Entry = SectionMap.size();
+      }
+    }
+    if (F.hasGC()) {
+      // Same for GC names.
+      unsigned &Entry = GCMap[F.getGC()];
+      if (!Entry) {
+        writeStringRecord(Stream, bitc::MODULE_CODE_GCNAME, F.getGC(),
+                          0 /*TODO*/);
+        Entry = GCMap.size();
+      }
+    }
+  }
+
+  // Emit abbrev for globals, now that we know # sections and max alignment.
+  unsigned SimpleGVarAbbrev = 0;
+  if (!M.global_empty()) {
+    // Add an abbrev for common globals with no visibility or thread
+    // localness.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                              Log2_32_Ceil(MaxGlobalType + 1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // AddrSpace << 2
+                                                         //| explicitType << 1
+                                                         //| constant
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // Initializer.
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); // Linkage.
+    if (!MaxAlignment) // Alignment.
+      Abbv->Add(BitCodeAbbrevOp(0));
+    else {
+      unsigned MaxEncAlignment = getEncodedAlign(MaxAlignment);
+      Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                                Log2_32_Ceil(MaxEncAlignment + 1)));
+    }
+    if (SectionMap.empty()) // Section.
+      Abbv->Add(BitCodeAbbrevOp(0));
+    else
+      Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                                Log2_32_Ceil(SectionMap.size() + 1)));
+    // Don't bother emitting vis + thread local.
+    SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+  }
+
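(To make the record shape concrete; this is a sketch and the IDs are invented: for a global like @g = internal constant i32 0, the operand list assembled below would start roughly as [typeid(i32), (0 << 2) | 2 | 1, initid + 1, /*internal*/ 3, encoded align, section id], and the trailing visibility/TLS/comdat operands are dropped because the simple abbrev applies.)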
+  // Emit the global variable information.
+  SmallVector<unsigned, 64> Vals;
+  for (const GlobalVariable &GV : M.globals()) {
+    unsigned AbbrevToUse = 0;
+
+    // GLOBALVAR: [type, isconst, initid,
+    //             linkage, alignment, section, visibility, threadlocal,
+    //             unnamed_addr, externally_initialized, dllstorageclass,
+    //             comdat]
+    Vals.push_back(getTypeID(GV.getValueType(), &GV));
+    Vals.push_back(
+        GV.getType()->getAddressSpace() << 2 | 2 |
+        (GV.isConstant() ? 1 : 0)); // HLSL Change - bitwise | was used with
+                                    // unsigned int and bool
+    Vals.push_back(
+        GV.isDeclaration() ? 0 : (VE.getValueID(GV.getInitializer()) + 1));
+    Vals.push_back(getEncodedLinkage(GV));
+    Vals.push_back(getEncodedAlign(GV.getAlign()));
+    Vals.push_back(GV.hasSection() ? SectionMap[std::string(GV.getSection())]
+                                   : 0);
+    if (GV.isThreadLocal() ||
+        GV.getVisibility() != GlobalValue::DefaultVisibility ||
+        GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
+        GV.isExternallyInitialized() ||
+        GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
+        GV.hasComdat()) {
+      Vals.push_back(getEncodedVisibility(GV));
+      Vals.push_back(getEncodedThreadLocalMode(GV));
+      Vals.push_back(GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None);
+      Vals.push_back(GV.isExternallyInitialized());
+      Vals.push_back(getEncodedDLLStorageClass(GV));
+      Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
+    } else {
+      AbbrevToUse = SimpleGVarAbbrev;
+    }
+
+    Stream.EmitRecord(bitc::MODULE_CODE_GLOBALVAR, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+
+  // Emit the function proto information.
+  for (const Function &F : M) {
+    // FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment,
+    //            section, visibility, gc, unnamed_addr, prologuedata,
+    //            dllstorageclass, comdat, prefixdata, personalityfn]
+    Vals.push_back(getTypeID(F.getFunctionType(), &F));
+    Vals.push_back(F.getCallingConv());
+    Vals.push_back(F.isDeclaration());
+    Vals.push_back(getEncodedLinkage(F));
+    Vals.push_back(VE.getAttributeListID(F.getAttributes()));
+    Vals.push_back(getEncodedAlign(F.getAlign()));
+    Vals.push_back(F.hasSection() ? SectionMap[std::string(F.getSection())]
+                                  : 0);
+    Vals.push_back(getEncodedVisibility(F));
+    Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0);
+    Vals.push_back(F.getUnnamedAddr() != GlobalValue::UnnamedAddr::None);
+    Vals.push_back(
+        F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1) : 0);
+    Vals.push_back(getEncodedDLLStorageClass(F));
+    Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0);
+    Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1)
+                                     : 0);
+    Vals.push_back(
+        F.hasPersonalityFn() ? (VE.getValueID(F.getPersonalityFn()) + 1) : 0);
+
+    unsigned AbbrevToUse = 0;
+    Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+
+  // Emit the alias information.
+  for (const GlobalAlias &A : M.aliases()) {
+    // ALIAS: [alias type, aliasee val#, linkage, visibility]
+    Vals.push_back(getTypeID(A.getValueType(), &A));
+    Vals.push_back(VE.getValueID(A.getAliasee()));
+    Vals.push_back(getEncodedLinkage(A));
+    Vals.push_back(getEncodedVisibility(A));
+    Vals.push_back(getEncodedDLLStorageClass(A));
+    Vals.push_back(getEncodedThreadLocalMode(A));
+    Vals.push_back(A.getUnnamedAddr() != GlobalValue::UnnamedAddr::None);
+    unsigned AbbrevToUse = 0;
+    Stream.EmitRecord(bitc::MODULE_CODE_ALIAS_OLD, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+}
+
+void DXILBitcodeWriter::writeValueAsMetadata(
+    const ValueAsMetadata *MD, SmallVectorImpl<uint64_t> &Record) {
+  // Mimic an MDNode with a value as one operand.
+  Value *V = MD->getValue();
+  Type *Ty = V->getType();
+  if (Function *F = dyn_cast<Function>(V))
+    Ty = TypedPointerType::get(F->getFunctionType(), F->getAddressSpace());
+  else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    Ty = TypedPointerType::get(GV->getValueType(), GV->getAddressSpace());
+  Record.push_back(getTypeID(Ty));
+  Record.push_back(VE.getValueID(V));
+  Stream.EmitRecord(bitc::METADATA_VALUE, Record, 0);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeMDTuple(const MDTuple *N,
+                                     SmallVectorImpl<uint64_t> &Record,
+                                     unsigned Abbrev) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    Metadata *MD = N->getOperand(i);
+    assert(!(MD && isa<LocalAsMetadata>(MD)) &&
+           "Unexpected function-local metadata");
+    Record.push_back(VE.getMetadataOrNullID(MD));
+  }
+  Stream.EmitRecord(N->isDistinct() ? bitc::METADATA_DISTINCT_NODE
+                                    : bitc::METADATA_NODE,
+                    Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDILocation(const DILocation *N,
+                                        SmallVectorImpl<uint64_t> &Record,
+                                        unsigned &Abbrev) {
+  if (!Abbrev)
+    Abbrev = createDILocationAbbrev();
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getLine());
+  Record.push_back(N->getColumn());
+  Record.push_back(VE.getMetadataID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getInlinedAt()));
+
+  Stream.EmitRecord(bitc::METADATA_LOCATION, Record, Abbrev);
+  Record.clear();
+}
+
+static uint64_t rotateSign(APInt Val) {
+  int64_t I = Val.getSExtValue();
+  uint64_t U = I;
+  return I < 0 ? ~(U << 1) : U << 1;
+}
+
+static uint64_t rotateSign(DISubrange::BoundType Val) {
+  return rotateSign(Val.get<ConstantInt *>()->getValue());
+}
+
+void DXILBitcodeWriter::writeDISubrange(const DISubrange *N,
+                                        SmallVectorImpl<uint64_t> &Record,
+                                        unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(
+      N->getCount().get<ConstantInt *>()->getValue().getSExtValue());
+  Record.push_back(rotateSign(N->getLowerBound()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBRANGE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIEnumerator(const DIEnumerator *N,
+                                          SmallVectorImpl<uint64_t> &Record,
+                                          unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(rotateSign(N->getValue()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+
+  Stream.EmitRecord(bitc::METADATA_ENUMERATOR, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIBasicType(const DIBasicType *N,
+                                         SmallVectorImpl<uint64_t> &Record,
+                                         unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getEncoding());
+
+  Stream.EmitRecord(bitc::METADATA_BASIC_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
+                                           SmallVectorImpl<uint64_t> &Record,
+                                           unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getBaseType()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getOffsetInBits());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getExtraData()));
+
+  Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+void
+DXILBitcodeWriter::writeDICompositeType(const DICompositeType *N,
+                                        SmallVectorImpl<uint64_t> &Record,
+                                        unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getBaseType()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getOffsetInBits());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getElements().get()));
+  Record.push_back(N->getRuntimeLang());
+  Record.push_back(VE.getMetadataOrNullID(N->getVTableHolder()));
+  Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawIdentifier()));
+
+  Stream.EmitRecord(bitc::METADATA_COMPOSITE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDISubroutineType(const DISubroutineType *N,
+                                              SmallVectorImpl<uint64_t> &Record,
+                                              unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getTypeArray().get()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBROUTINE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIFile(const DIFile *N,
+                                    SmallVectorImpl<uint64_t> &Record,
+                                    unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawFilename()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawDirectory()));
+
+  Stream.EmitRecord(bitc::METADATA_FILE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
+                                           SmallVectorImpl<uint64_t> &Record,
+                                           unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getSourceLanguage());
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawProducer()));
+  Record.push_back(N->isOptimized());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawFlags()));
+  Record.push_back(N->getRuntimeVersion());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawSplitDebugFilename()));
+  Record.push_back(N->getEmissionKind());
+  Record.push_back(VE.getMetadataOrNullID(N->getEnumTypes().get()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRetainedTypes().get()));
+  Record.push_back(/* subprograms */ 0);
+  Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables().get()));
+  Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities().get()));
+  Record.push_back(N->getDWOId());
+
+  Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDISubprogram(const DISubprogram *N,
+                                          SmallVectorImpl<uint64_t> &Record,
+                                          unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->isLocalToUnit());
+  Record.push_back(N->isDefinition());
+  Record.push_back(N->getScopeLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getContainingType()));
+  Record.push_back(N->getVirtuality());
+  Record.push_back(N->getVirtualIndex());
+  Record.push_back(N->getFlags());
+  Record.push_back(N->isOptimized());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawUnit()));
+  Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
+  Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRetainedNodes().get()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDILexicalBlock(const DILexicalBlock *N,
+                                            SmallVectorImpl<uint64_t> &Record,
+                                            unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(N->getColumn());
+
+  Stream.EmitRecord(bitc::METADATA_LEXICAL_BLOCK, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDILexicalBlockFile(
+    const DILexicalBlockFile *N, SmallVectorImpl<uint64_t> &Record,
+    unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getDiscriminator());
+
+  Stream.EmitRecord(bitc::METADATA_LEXICAL_BLOCK_FILE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDINamespace(const DINamespace *N,
+                                         SmallVectorImpl<uint64_t> &Record,
+                                         unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(/* line number */ 0);
+
+  Stream.EmitRecord(bitc::METADATA_NAMESPACE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIModule(const DIModule *N,
+                                      SmallVectorImpl<uint64_t> &Record,
+                                      unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  for (auto &I : N->operands())
+    Record.push_back(VE.getMetadataOrNullID(I));
+
+  Stream.EmitRecord(bitc::METADATA_MODULE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDITemplateTypeParameter(
+    const DITemplateTypeParameter *N, SmallVectorImpl<uint64_t> &Record,
+    unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+
+  Stream.EmitRecord(bitc::METADATA_TEMPLATE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDITemplateValueParameter(
+    const DITemplateValueParameter *N, SmallVectorImpl<uint64_t> &Record,
+    unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(VE.getMetadataOrNullID(N->getValue()));
+
+  Stream.EmitRecord(bitc::METADATA_TEMPLATE_VALUE, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIGlobalVariable(const DIGlobalVariable *N,
+                                              SmallVectorImpl<uint64_t> &Record,
+                                              unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->isLocalToUnit());
+  Record.push_back(N->isDefinition());
+  Record.push_back(/* N->getRawVariable() */ 0);
+  Record.push_back(
+      VE.getMetadataOrNullID(N->getStaticDataMemberDeclaration()));
+
+  Stream.EmitRecord(bitc::METADATA_GLOBAL_VAR, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDILocalVariable(const DILocalVariable *N,
+                                             SmallVectorImpl<uint64_t> &Record,
+                                             unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->getArg());
+  Record.push_back(N->getFlags());
+
+  Stream.EmitRecord(bitc::METADATA_LOCAL_VAR, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIExpression(const DIExpression *N,
+                                          SmallVectorImpl<uint64_t> &Record,
+                                          unsigned Abbrev) {
+  Record.reserve(N->getElements().size() + 1);
+
+  Record.push_back(N->isDistinct());
+  Record.append(N->elements_begin(), N->elements_end());
+
+  Stream.EmitRecord(bitc::METADATA_EXPRESSION, Record, Abbrev);
+  Record.clear();
+}
+
+void DXILBitcodeWriter::writeDIObjCProperty(const DIObjCProperty *N,
+                                            SmallVectorImpl<uint64_t> &Record,
+                                            unsigned Abbrev) {
+  llvm_unreachable("DXIL does not support objc!!!");
+}
+
+void DXILBitcodeWriter::writeDIImportedEntity(const DIImportedEntity *N,
+                                              SmallVectorImpl<uint64_t> &Record,
+                                              unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getEntity()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+
+  Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev);
+  Record.clear();
+}
+
+unsigned DXILBitcodeWriter::createDILocationAbbrev() {
+  // Abbrev for METADATA_LOCATION.
+  //
+  // Assume the column is usually under 128, and always output the inlined-at
+  // location (it's never more expensive than building an array size 1).
+  std::shared_ptr<BitCodeAbbrev> Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  return Stream.EmitAbbrev(std::move(Abbv));
+}
+
+unsigned DXILBitcodeWriter::createGenericDINodeAbbrev() {
+  // Abbrev for METADATA_GENERIC_DEBUG.
+  //
+  // Assume the column is usually under 128, and always output the inlined-at
+  // location (it's never more expensive than building an array size 1).
+  std::shared_ptr<BitCodeAbbrev> Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  return Stream.EmitAbbrev(std::move(Abbv));
+}
+
+void DXILBitcodeWriter::writeMetadataRecords(
+    ArrayRef<const Metadata *> MDs, SmallVectorImpl<uint64_t> &Record,
+    std::vector<unsigned> *MDAbbrevs, std::vector<uint64_t> *IndexPos) {
+  if (MDs.empty())
+    return;
+
+  // Initialize MDNode abbreviations.
+#define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
+#include "llvm/IR/Metadata.def"
+
+  for (const Metadata *MD : MDs) {
+    if (IndexPos)
+      IndexPos->push_back(Stream.GetCurrentBitNo());
+    if (const MDNode *N = dyn_cast<MDNode>(MD)) {
+      assert(N->isResolved() && "Expected forward references to be resolved");
+
+      switch (N->getMetadataID()) {
+      default:
+        llvm_unreachable("Invalid MDNode subclass");
+#define HANDLE_MDNODE_LEAF(CLASS)                                             \
+  case Metadata::CLASS##Kind:                                                 \
+    if (MDAbbrevs)                                                            \
+      write##CLASS(cast<CLASS>(N), Record,                                    \
+                   (*MDAbbrevs)[MetadataAbbrev::CLASS##AbbrevID]);            \
+    else                                                                      \
+      write##CLASS(cast<CLASS>(N), Record, CLASS##Abbrev);                    \
+    continue;
+#include "llvm/IR/Metadata.def"
+      }
+    }
+    writeValueAsMetadata(cast<ValueAsMetadata>(MD), Record);
+  }
+}
+
+unsigned DXILBitcodeWriter::createMetadataStringsAbbrev() {
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING_OLD));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+  return Stream.EmitAbbrev(std::move(Abbv));
+}
+
+void DXILBitcodeWriter::writeMetadataStrings(
+    ArrayRef<const Metadata *> Strings, SmallVectorImpl<uint64_t> &Record) {
+  for (const Metadata *MD : Strings) {
+    const MDString *MDS = cast<MDString>(MD);
+    // Code: [strchar x N]
+    Record.append(MDS->bytes_begin(), MDS->bytes_end());
+
+    // Emit the finished record.
+    Stream.EmitRecord(bitc::METADATA_STRING_OLD, Record,
+                      createMetadataStringsAbbrev());
+    Record.clear();
+  }
+}
+
+void DXILBitcodeWriter::writeModuleMetadata() {
+  if (!VE.hasMDs() && M.named_metadata_empty())
+    return;
+
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 5);
+
+  // Emit all abbrevs upfront, so that the reader can jump in the middle of the
+  // block and load any metadata.
+  std::vector<unsigned> MDAbbrevs;
+
+  MDAbbrevs.resize(MetadataAbbrev::LastPlusOne);
+  MDAbbrevs[MetadataAbbrev::DILocationAbbrevID] = createDILocationAbbrev();
+  MDAbbrevs[MetadataAbbrev::GenericDINodeAbbrevID] =
+      createGenericDINodeAbbrev();
+
+  unsigned NameAbbrev = 0;
+  if (!M.named_metadata_empty()) {
+    // Abbrev for METADATA_NAME.
+    std::shared_ptr<BitCodeAbbrev> Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    NameAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+  }
+
+  SmallVector<uint64_t, 64> Record;
+  writeMetadataStrings(VE.getMDStrings(), Record);
+
+  std::vector<uint64_t> IndexPos;
+  IndexPos.reserve(VE.getNonMDStrings().size());
+  writeMetadataRecords(VE.getNonMDStrings(), Record, &MDAbbrevs, &IndexPos);
+
+  // Write named metadata.
+  for (const NamedMDNode &NMD : M.named_metadata()) {
+    // Write name.
+    StringRef Str = NMD.getName();
+    Record.append(Str.bytes_begin(), Str.bytes_end());
+    Stream.EmitRecord(bitc::METADATA_NAME, Record, NameAbbrev);
+    Record.clear();
+
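(For orientation, with illustrative metadata and invented node numbering: a named node such as

    !dx.valver = !{!0}

is serialized as a METADATA_NAME record carrying the characters of "dx.valver", immediately followed by the METADATA_NAMED_NODE record emitted below, whose operands are the metadata IDs of !0 and any sibling operands.)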
+    for (const MDNode *N : NMD.operands())
+      Record.push_back(VE.getMetadataID(N));
+    Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeFunctionMetadata(const Function &F) {
+  if (!VE.hasMDs())
+    return;
+
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 4);
+  SmallVector<uint64_t, 64> Record;
+  writeMetadataStrings(VE.getMDStrings(), Record);
+  writeMetadataRecords(VE.getNonMDStrings(), Record);
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeFunctionMetadataAttachment(const Function &F) {
+  Stream.EnterSubblock(bitc::METADATA_ATTACHMENT_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Write metadata attachments.
+  // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]]
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  F.getAllMetadata(MDs);
+  if (!MDs.empty()) {
+    for (const auto &I : MDs) {
+      Record.push_back(I.first);
+      Record.push_back(VE.getMetadataID(I.second));
+    }
+    Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0);
+    Record.clear();
+  }
+
+  for (const BasicBlock &BB : F)
+    for (const Instruction &I : BB) {
+      MDs.clear();
+      I.getAllMetadataOtherThanDebugLoc(MDs);
+
+      // If no metadata, ignore instruction.
+      if (MDs.empty())
+        continue;
+
+      Record.push_back(VE.getInstructionID(&I));
+
+      for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
+        Record.push_back(MDs[i].first);
+        Record.push_back(VE.getMetadataID(MDs[i].second));
+      }
+      Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0);
+      Record.clear();
+    }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeModuleMetadataKinds() {
+  SmallVector<uint64_t, 64> Record;
+
+  // Write metadata kinds.
+  // METADATA_KIND - [n x [id, name]]
+  SmallVector<StringRef, 8> Names;
+  M.getMDKindNames(Names);
+
+  if (Names.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+
+  for (unsigned MDKindID = 0, e = Names.size(); MDKindID != e; ++MDKindID) {
+    Record.push_back(MDKindID);
+    StringRef KName = Names[MDKindID];
+    Record.append(KName.begin(), KName.end());
+
+    Stream.EmitRecord(bitc::METADATA_KIND, Record, 0);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
+                                       bool isGlobal) {
+  if (FirstVal == LastVal)
+    return;
+
+  Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4);
+
+  unsigned AggregateAbbrev = 0;
+  unsigned String8Abbrev = 0;
+  unsigned CString7Abbrev = 0;
+  unsigned CString6Abbrev = 0;
+  // If this is a constant pool for the module, emit module-specific abbrevs.
+  if (isGlobal) {
+    // Abbrev for CST_CODE_AGGREGATE.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(
+        BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal + 1)));
+    AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+    // Abbrev for CST_CODE_STRING.
+    Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    String8Abbrev = Stream.EmitAbbrev(std::move(Abbv));
+    // Abbrev for CST_CODE_CSTRING.
+    Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+    CString7Abbrev = Stream.EmitAbbrev(std::move(Abbv));
+    // Abbrev for CST_CODE_CSTRING.
+    Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    CString6Abbrev = Stream.EmitAbbrev(std::move(Abbv));
+  }
+
+  SmallVector<uint64_t, 64> Record;
+
+  const ValueEnumerator::ValueList &Vals = VE.getValues();
+  Type *LastTy = nullptr;
+  for (unsigned i = FirstVal; i != LastVal; ++i) {
+    const Value *V = Vals[i].first;
+    // If we need to switch types, do so now.
+    if (V->getType() != LastTy) {
+      LastTy = V->getType();
+      Record.push_back(getTypeID(LastTy));
+      Stream.EmitRecord(bitc::CST_CODE_SETTYPE, Record,
+                        CONSTANTS_SETTYPE_ABBREV);
+      Record.clear();
+    }
+
+    if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+      Record.push_back(unsigned(IA->hasSideEffects()) |
+                       unsigned(IA->isAlignStack()) << 1 |
+                       unsigned(IA->getDialect() & 1) << 2);
+
+      // Add the asm string.
+      const std::string &AsmStr = IA->getAsmString();
+      Record.push_back(AsmStr.size());
+      Record.append(AsmStr.begin(), AsmStr.end());
+
+      // Add the constraint string.
+      const std::string &ConstraintStr = IA->getConstraintString();
+      Record.push_back(ConstraintStr.size());
+      Record.append(ConstraintStr.begin(), ConstraintStr.end());
+      Stream.EmitRecord(bitc::CST_CODE_INLINEASM, Record);
+      Record.clear();
+      continue;
+    }
+    const Constant *C = cast<Constant>(V);
+    unsigned Code = -1U;
+    unsigned AbbrevToUse = 0;
+    if (C->isNullValue()) {
+      Code = bitc::CST_CODE_NULL;
+    } else if (isa<UndefValue>(C)) {
+      Code = bitc::CST_CODE_UNDEF;
+    } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
+      if (IV->getBitWidth() <= 64) {
+        uint64_t V = IV->getSExtValue();
+        emitSignedInt64(Record, V);
+        Code = bitc::CST_CODE_INTEGER;
+        AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+      } else { // Wide integers, > 64 bits in size.
+        // We have an arbitrary precision integer value to write whose
+        // bit width is > 64. However, in canonical unsigned integer
+        // format it is likely that the high bits are going to be zero.
+        // So, we only write the number of active words.
+        unsigned NWords = IV->getValue().getActiveWords();
+        const uint64_t *RawWords = IV->getValue().getRawData();
+        for (unsigned i = 0; i != NWords; ++i) {
+          emitSignedInt64(Record, RawWords[i]);
+        }
+        Code = bitc::CST_CODE_WIDE_INTEGER;
+      }
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+      Code = bitc::CST_CODE_FLOAT;
+      Type *Ty = CFP->getType();
+      if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) {
+        Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      } else if (Ty->isX86_FP80Ty()) {
+        // api needed to prevent premature destruction.
+        // Bits are not in the same order as a normal i80 APInt; compensate.
+        APInt api = CFP->getValueAPF().bitcastToAPInt();
+        const uint64_t *p = api.getRawData();
+        Record.push_back((p[1] << 48) | (p[0] >> 16));
+        Record.push_back(p[0] & 0xffffLL);
+      } else if (Ty->isFP128Ty() || Ty->isPPC_FP128Ty()) {
+        APInt api = CFP->getValueAPF().bitcastToAPInt();
+        const uint64_t *p = api.getRawData();
+        Record.push_back(p[0]);
+        Record.push_back(p[1]);
+      } else {
+        assert(0 && "Unknown FP type!");
+      }
+    } else if (isa<ConstantDataSequential>(C) &&
+               cast<ConstantDataSequential>(C)->isString()) {
+      const ConstantDataSequential *Str = cast<ConstantDataSequential>(C);
+      // Emit constant strings specially.
+      unsigned NumElts = Str->getNumElements();
+      // If this is a null-terminated string, use the denser CSTRING encoding.
+      if (Str->isCString()) {
+        Code = bitc::CST_CODE_CSTRING;
+        --NumElts; // Don't encode the null, which isn't allowed by char6.
+      } else {
+        Code = bitc::CST_CODE_STRING;
+        AbbrevToUse = String8Abbrev;
+      }
+      bool isCStr7 = Code == bitc::CST_CODE_CSTRING;
+      bool isCStrChar6 = Code == bitc::CST_CODE_CSTRING;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        unsigned char V = Str->getElementAsInteger(i);
+        Record.push_back(V);
+        isCStr7 &= (V & 128) == 0;
+        if (isCStrChar6)
+          isCStrChar6 = BitCodeAbbrevOp::isChar6(V);
+      }
+
+      if (isCStrChar6)
+        AbbrevToUse = CString6Abbrev;
+      else if (isCStr7)
+        AbbrevToUse = CString7Abbrev;
+    } else if (const ConstantDataSequential *CDS =
+                   dyn_cast<ConstantDataSequential>(C)) {
+      Code = bitc::CST_CODE_DATA;
+      Type *EltTy = CDS->getType()->getArrayElementType();
+      if (isa<IntegerType>(EltTy)) {
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
+          Record.push_back(CDS->getElementAsInteger(i));
+      } else if (EltTy->isFloatTy()) {
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+          union {
+            float F;
+            uint32_t I;
+          };
+          F = CDS->getElementAsFloat(i);
+          Record.push_back(I);
+        }
+      } else {
+        assert(EltTy->isDoubleTy() && "Unknown ConstantData element type");
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+          union {
+            double F;
+            uint64_t I;
+          };
+          F = CDS->getElementAsDouble(i);
+          Record.push_back(I);
+        }
+      }
+    } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
+               isa<ConstantVector>(C)) {
+      Code = bitc::CST_CODE_AGGREGATE;
+      for (const Value *Op : C->operands())
+        Record.push_back(VE.getValueID(Op));
+      AbbrevToUse = AggregateAbbrev;
+    } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+      switch (CE->getOpcode()) {
+      default:
+        if (Instruction::isCast(CE->getOpcode())) {
+          Code = bitc::CST_CODE_CE_CAST;
+          Record.push_back(getEncodedCastOpcode(CE->getOpcode()));
+          Record.push_back(getTypeID(C->getOperand(0)->getType()));
+          Record.push_back(VE.getValueID(C->getOperand(0)));
+          AbbrevToUse = CONSTANTS_CE_CAST_Abbrev;
+        } else {
+          assert(CE->getNumOperands() == 2 && "Unknown constant expr!");
+          Code = bitc::CST_CODE_CE_BINOP;
+          Record.push_back(getEncodedBinaryOpcode(CE->getOpcode()));
+          Record.push_back(VE.getValueID(C->getOperand(0)));
+          Record.push_back(VE.getValueID(C->getOperand(1)));
+          uint64_t Flags = getOptimizationFlags(CE);
+          if (Flags != 0)
+            Record.push_back(Flags);
+        }
+        break;
+      case Instruction::GetElementPtr: {
+        Code = bitc::CST_CODE_CE_GEP;
+        const auto *GO = cast<GEPOperator>(C);
+        if (GO->isInBounds())
+          Code = bitc::CST_CODE_CE_INBOUNDS_GEP;
+        Record.push_back(getTypeID(GO->getSourceElementType()));
+        for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) {
+          Record.push_back(getTypeID(C->getOperand(i)->getType()));
+          Record.push_back(VE.getValueID(C->getOperand(i)));
+        }
+        break;
+      }
+      case Instruction::Select:
+        Code = bitc::CST_CODE_CE_SELECT;
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ExtractElement:
+        Code = bitc::CST_CODE_CE_EXTRACTELT;
+        Record.push_back(getTypeID(C->getOperand(0)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(getTypeID(C->getOperand(1)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        break;
+      case Instruction::InsertElement:
+        Code = bitc::CST_CODE_CE_INSERTELT;
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(getTypeID(C->getOperand(2)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ShuffleVector:
+        // If the return type and argument types are the same, this is a
+        // standard shufflevector instruction. If the types are different,
+        // then the shuffle is widening or truncating the input vectors, and
+        // the argument type must also be encoded.
+        if (C->getType() == C->getOperand(0)->getType()) {
+          Code = bitc::CST_CODE_CE_SHUFFLEVEC;
+        } else {
+          Code = bitc::CST_CODE_CE_SHUFVEC_EX;
+          Record.push_back(getTypeID(C->getOperand(0)->getType()));
+        }
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        Code = bitc::CST_CODE_CE_CMP;
+        Record.push_back(getTypeID(C->getOperand(0)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(CE->getPredicate());
+        break;
+      }
+    } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
+      Code = bitc::CST_CODE_BLOCKADDRESS;
+      Record.push_back(getTypeID(BA->getFunction()->getType()));
+      Record.push_back(VE.getValueID(BA->getFunction()));
+      Record.push_back(VE.getGlobalBasicBlockID(BA->getBasicBlock()));
+    } else {
+#ifndef NDEBUG
+      C->dump();
+#endif
+      llvm_unreachable("Unknown constant!");
+    }
+    Stream.EmitRecord(Code, Record, AbbrevToUse);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
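[Editor's aside, not part of the patch: the CSTRING narrowing in the string path above degrades from char6 to 7-bit to 8-bit as soon as a character falls outside the smaller alphabet. For reference, a standalone restatement of the char6 test (mirroring what BitCodeAbbrevOp::isChar6 accepts):]

// The char6 alphabet is [a-zA-Z0-9._]: exactly 64 symbols, so each
// character packs into six bits when the whole string qualifies.
static bool isChar6(char C) {
  return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9') || C == '.' || C == '_';
}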
+
+void DXILBitcodeWriter::writeModuleConstants() {
+  const ValueEnumerator::ValueList &Vals = VE.getValues();
+
+  // Find the first constant to emit, which is the first non-globalvalue value.
+  // We know globalvalues have been emitted by WriteModuleInfo.
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+    if (!isa<GlobalValue>(Vals[i].first)) {
+      writeConstants(i, Vals.size(), true);
+      return;
+    }
+  }
+}
+
+/// pushValueAndType - The file has to encode both the value and type id for
+/// many values, because we need to know what type to create for forward
+/// references. However, most operands are not forward references, so this type
+/// field is not needed.
+///
+/// This function adds V's value ID to Vals. If the value ID is higher than the
+/// instruction ID, then it is a forward reference, and it also includes the
+/// type ID. The value ID that is written is encoded relative to the InstID.
+bool DXILBitcodeWriter::pushValueAndType(const Value *V, unsigned InstID,
+                                         SmallVectorImpl<unsigned> &Vals) {
+  unsigned ValID = VE.getValueID(V);
+  // Make encoding relative to the InstID.
+  Vals.push_back(InstID - ValID);
+  if (ValID >= InstID) {
+    Vals.push_back(getTypeID(V->getType(), V));
+    return true;
+  }
+  return false;
+}
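[Editor's aside, not part of the patch: the relative scheme is easiest to see with made-up numbers. With InstID = 10, an operand with ValID = 7 emits 10 - 7 = 3 and needs no type; an operand with ValID = 12 is a forward reference, the unsigned subtraction wraps, and the type ID follows so the reader can materialize a placeholder. A sketch of just the arithmetic:]

#include <cstdint>
#include <vector>

// Sketch of the relative operand encoding (the arithmetic only; the
// enumerator and type IDs are elided). Returns true when the operand is
// a forward reference and a type ID must follow.
static bool pushRelative(unsigned InstID, unsigned ValID,
                         std::vector<uint64_t> &Vals) {
  Vals.push_back(InstID - ValID); // wraps mod 2^32 for forward references
  return ValID >= InstID;
}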
+
+/// pushValue - Like pushValueAndType, but where the type of the value is
+/// omitted (perhaps it was already encoded in an earlier operand).
+void DXILBitcodeWriter::pushValue(const Value *V, unsigned InstID,
+                                  SmallVectorImpl<unsigned> &Vals) {
+  unsigned ValID = VE.getValueID(V);
+  Vals.push_back(InstID - ValID);
+}
+
+void DXILBitcodeWriter::pushValueSigned(const Value *V, unsigned InstID,
+                                        SmallVectorImpl<uint64_t> &Vals) {
+  unsigned ValID = VE.getValueID(V);
+  int64_t diff = ((int32_t)InstID - (int32_t)ValID);
+  emitSignedInt64(Vals, diff);
+}
+
+/// WriteInstruction - Emit an instruction.
+void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID,
+                                         SmallVectorImpl<unsigned> &Vals) {
+  unsigned Code = 0;
+  unsigned AbbrevToUse = 0;
+  VE.setInstructionID(&I);
+  switch (I.getOpcode()) {
+  default:
+    if (Instruction::isCast(I.getOpcode())) {
+      Code = bitc::FUNC_CODE_INST_CAST;
+      if (!pushValueAndType(I.getOperand(0), InstID, Vals))
+        AbbrevToUse = (unsigned)FUNCTION_INST_CAST_ABBREV;
+      Vals.push_back(getTypeID(I.getType(), &I));
+      Vals.push_back(getEncodedCastOpcode(I.getOpcode()));
+    } else {
+      assert(isa<BinaryOperator>(I) && "Unknown instruction!");
+      Code = bitc::FUNC_CODE_INST_BINOP;
+      if (!pushValueAndType(I.getOperand(0), InstID, Vals))
+        AbbrevToUse = (unsigned)FUNCTION_INST_BINOP_ABBREV;
+      pushValue(I.getOperand(1), InstID, Vals);
+      Vals.push_back(getEncodedBinaryOpcode(I.getOpcode()));
+      uint64_t Flags = getOptimizationFlags(&I);
+      if (Flags != 0) {
+        if (AbbrevToUse == (unsigned)FUNCTION_INST_BINOP_ABBREV)
+          AbbrevToUse = (unsigned)FUNCTION_INST_BINOP_FLAGS_ABBREV;
+        Vals.push_back(Flags);
+      }
+    }
+    break;
+
+  case Instruction::GetElementPtr: {
+    Code = bitc::FUNC_CODE_INST_GEP;
+    AbbrevToUse = (unsigned)FUNCTION_INST_GEP_ABBREV;
+    auto &GEPInst = cast<GetElementPtrInst>(I);
+    Vals.push_back(GEPInst.isInBounds());
+    Vals.push_back(getTypeID(GEPInst.getSourceElementType()));
+    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+      pushValueAndType(I.getOperand(i), InstID, Vals);
+    break;
+  }
+  case Instruction::ExtractValue: {
+    Code = bitc::FUNC_CODE_INST_EXTRACTVAL;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    const ExtractValueInst *EVI = cast<ExtractValueInst>(&I);
+    Vals.append(EVI->idx_begin(), EVI->idx_end());
+    break;
+  }
+  case Instruction::InsertValue: {
+    Code = bitc::FUNC_CODE_INST_INSERTVAL;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    pushValueAndType(I.getOperand(1), InstID, Vals);
+    const InsertValueInst *IVI = cast<InsertValueInst>(&I);
+    Vals.append(IVI->idx_begin(), IVI->idx_end());
+    break;
+  }
+  case Instruction::Select:
+    Code = bitc::FUNC_CODE_INST_VSELECT;
+    pushValueAndType(I.getOperand(1), InstID, Vals);
+    pushValue(I.getOperand(2), InstID, Vals);
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    break;
+  case Instruction::ExtractElement:
+    Code = bitc::FUNC_CODE_INST_EXTRACTELT;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    pushValueAndType(I.getOperand(1), InstID, Vals);
+    break;
+  case Instruction::InsertElement:
+    Code = bitc::FUNC_CODE_INST_INSERTELT;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    pushValue(I.getOperand(1), InstID, Vals);
+    pushValueAndType(I.getOperand(2), InstID, Vals);
+    break;
+  case Instruction::ShuffleVector:
+    Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    pushValue(I.getOperand(1), InstID, Vals);
+    pushValue(I.getOperand(2), InstID, Vals);
+    break;
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // compare returning Int1Ty or vector of Int1Ty
+    Code = bitc::FUNC_CODE_INST_CMP2;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    pushValue(I.getOperand(1), InstID, Vals);
+    Vals.push_back(cast<CmpInst>(I).getPredicate());
+    uint64_t Flags = getOptimizationFlags(&I);
+    if (Flags != 0)
+      Vals.push_back(Flags);
+    break;
+  }
+
+  case Instruction::Ret: {
+    Code = bitc::FUNC_CODE_INST_RET;
+    unsigned NumOperands = I.getNumOperands();
+    if (NumOperands == 0)
+      AbbrevToUse = (unsigned)FUNCTION_INST_RET_VOID_ABBREV;
+    else if (NumOperands == 1) {
+      if (!pushValueAndType(I.getOperand(0), InstID, Vals))
+        AbbrevToUse = (unsigned)FUNCTION_INST_RET_VAL_ABBREV;
+    } else {
+      for (unsigned i = 0, e = NumOperands; i != e; ++i)
+        pushValueAndType(I.getOperand(i), InstID, Vals);
+    }
+  } break;
+  case Instruction::Br: {
+    Code = bitc::FUNC_CODE_INST_BR;
+    const BranchInst &II = cast<BranchInst>(I);
+    Vals.push_back(VE.getValueID(II.getSuccessor(0)));
+    if (II.isConditional()) {
+      Vals.push_back(VE.getValueID(II.getSuccessor(1)));
+      pushValue(II.getCondition(), InstID, Vals);
+    }
+  } break;
+  case Instruction::Switch: {
+    Code = bitc::FUNC_CODE_INST_SWITCH;
+    const SwitchInst &SI = cast<SwitchInst>(I);
+    Vals.push_back(getTypeID(SI.getCondition()->getType()));
+    pushValue(SI.getCondition(), InstID, Vals);
+    Vals.push_back(VE.getValueID(SI.getDefaultDest()));
+    for (auto Case : SI.cases()) {
+      Vals.push_back(VE.getValueID(Case.getCaseValue()));
+      Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
+    }
+  } break;
+  case Instruction::IndirectBr:
+    Code = bitc::FUNC_CODE_INST_INDIRECTBR;
+    Vals.push_back(getTypeID(I.getOperand(0)->getType()));
+    // Encode the address operand as relative, but not the basic blocks.
+    pushValue(I.getOperand(0), InstID, Vals);
+    for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i)
+      Vals.push_back(VE.getValueID(I.getOperand(i)));
+    break;
+
+  case Instruction::Invoke: {
+    const InvokeInst *II = cast<InvokeInst>(&I);
+    const Value *Callee = II->getCalledOperand();
+    FunctionType *FTy = II->getFunctionType();
+    Code = bitc::FUNC_CODE_INST_INVOKE;
+
+    Vals.push_back(VE.getAttributeListID(II->getAttributes()));
+    Vals.push_back(II->getCallingConv() | 1 << 13);
+    Vals.push_back(VE.getValueID(II->getNormalDest()));
+    Vals.push_back(VE.getValueID(II->getUnwindDest()));
+    Vals.push_back(getTypeID(FTy));
+    pushValueAndType(Callee, InstID, Vals);
+
+    // Emit value #'s for the fixed parameters.
+    for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+      pushValue(I.getOperand(i), InstID, Vals); // fixed param.
+
+    // Emit type/value pairs for varargs params.
+    if (FTy->isVarArg()) {
+      for (unsigned i = FTy->getNumParams(), e = I.getNumOperands() - 3;
+           i != e; ++i)
+        pushValueAndType(I.getOperand(i), InstID, Vals); // vararg
+    }
+    break;
+  }
+  case Instruction::Resume:
+    Code = bitc::FUNC_CODE_INST_RESUME;
+    pushValueAndType(I.getOperand(0), InstID, Vals);
+    break;
+  case Instruction::Unreachable:
+    Code = bitc::FUNC_CODE_INST_UNREACHABLE;
+    AbbrevToUse = (unsigned)FUNCTION_INST_UNREACHABLE_ABBREV;
+    break;
+
+  case Instruction::PHI: {
+    const PHINode &PN = cast<PHINode>(I);
+    Code = bitc::FUNC_CODE_INST_PHI;
+    // With the newer instruction encoding, forward references could give
+    // negative valued IDs. This is most common for PHIs, so we use
+    // signed VBRs (see the sketch following this function).
+    SmallVector<uint64_t, 128> Vals64;
+    Vals64.push_back(getTypeID(PN.getType()));
+    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+      pushValueSigned(PN.getIncomingValue(i), InstID, Vals64);
+      Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i)));
+    }
+    // Emit a Vals64 vector and exit.
+    Stream.EmitRecord(Code, Vals64, AbbrevToUse);
+    Vals64.clear();
+    return;
+  }
+
+  case Instruction::LandingPad: {
+    const LandingPadInst &LP = cast<LandingPadInst>(I);
+    Code = bitc::FUNC_CODE_INST_LANDINGPAD;
+    Vals.push_back(getTypeID(LP.getType()));
+    Vals.push_back(LP.isCleanup());
+    Vals.push_back(LP.getNumClauses());
+    for (unsigned I = 0, E = LP.getNumClauses(); I != E; ++I) {
+      if (LP.isCatch(I))
+        Vals.push_back(LandingPadInst::Catch);
+      else
+        Vals.push_back(LandingPadInst::Filter);
+      pushValueAndType(LP.getClause(I), InstID, Vals);
+    }
+    break;
+  }
+
+  case Instruction::Alloca: {
+    Code = bitc::FUNC_CODE_INST_ALLOCA;
+    const AllocaInst &AI = cast<AllocaInst>(I);
+    Vals.push_back(getTypeID(AI.getAllocatedType()));
+    Vals.push_back(getTypeID(I.getOperand(0)->getType()));
+    Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
+    using APV = AllocaPackedValues;
+    unsigned Record = 0;
+    unsigned EncodedAlign = getEncodedAlign(AI.getAlign());
+    Bitfield::set<APV::AlignLower>(
+        Record, EncodedAlign & ((1 << APV::AlignLower::Bits) - 1));
+    Bitfield::set<APV::AlignUpper>(Record,
+                                   EncodedAlign >> APV::AlignLower::Bits);
+    Bitfield::set<APV::UsedWithInAlloca>(Record, AI.isUsedWithInAlloca());
+    Vals.push_back(Record);
+    break;
+  }
+
+  case Instruction::Load:
+    if (cast<LoadInst>(I).isAtomic()) {
+      Code = bitc::FUNC_CODE_INST_LOADATOMIC;
+      pushValueAndType(I.getOperand(0), InstID, Vals);
+    } else {
+      Code = bitc::FUNC_CODE_INST_LOAD;
+      if (!pushValueAndType(I.getOperand(0), InstID, Vals)) // ptr
+        AbbrevToUse = (unsigned)FUNCTION_INST_LOAD_ABBREV;
+    }
+    Vals.push_back(getTypeID(I.getType()));
+    Vals.push_back(Log2(cast<LoadInst>(I).getAlign()) + 1);
+    Vals.push_back(cast<LoadInst>(I).isVolatile());
+    if (cast<LoadInst>(I).isAtomic()) {
+      Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering()));
+      Vals.push_back(
+          getEncodedSyncScopeID(cast<LoadInst>(I).getSyncScopeID()));
+    }
+    break;
+  case Instruction::Store:
+    if (cast<StoreInst>(I).isAtomic())
+      Code = bitc::FUNC_CODE_INST_STOREATOMIC;
+    else
+      Code = bitc::FUNC_CODE_INST_STORE;
+    pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr
+    pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val
+    Vals.push_back(Log2(cast<StoreInst>(I).getAlign()) + 1);
+    Vals.push_back(cast<StoreInst>(I).isVolatile());
+    if (cast<StoreInst>(I).isAtomic()) {
+      Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering()));
+      Vals.push_back(
+          getEncodedSyncScopeID(cast<StoreInst>(I).getSyncScopeID()));
+    }
+    break;
+  case Instruction::AtomicCmpXchg:
+    Code = bitc::FUNC_CODE_INST_CMPXCHG;
+    pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr
+    pushValueAndType(I.getOperand(1), InstID, Vals); // cmp.
+    pushValue(I.getOperand(2), InstID, Vals);        // newval.
+    Vals.push_back(cast<AtomicCmpXchgInst>(I).isVolatile());
+    Vals.push_back(
+        getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getSuccessOrdering()));
+    Vals.push_back(
+        getEncodedSyncScopeID(cast<AtomicCmpXchgInst>(I).getSyncScopeID()));
+    Vals.push_back(
+        getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getFailureOrdering()));
+    Vals.push_back(cast<AtomicCmpXchgInst>(I).isWeak());
+    break;
+  case Instruction::AtomicRMW:
+    Code = bitc::FUNC_CODE_INST_ATOMICRMW;
+    pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr
+    pushValue(I.getOperand(1), InstID, Vals);        // val.
+    Vals.push_back(
+        getEncodedRMWOperation(cast<AtomicRMWInst>(I).getOperation()));
+    Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
+    Vals.push_back(getEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
+    Vals.push_back(
+        getEncodedSyncScopeID(cast<AtomicRMWInst>(I).getSyncScopeID()));
+    break;
+  case Instruction::Fence:
+    Code = bitc::FUNC_CODE_INST_FENCE;
+    Vals.push_back(getEncodedOrdering(cast<FenceInst>(I).getOrdering()));
+    Vals.push_back(getEncodedSyncScopeID(cast<FenceInst>(I).getSyncScopeID()));
+    break;
+  case Instruction::Call: {
+    const CallInst &CI = cast<CallInst>(I);
+    FunctionType *FTy = CI.getFunctionType();
+
+    Code = bitc::FUNC_CODE_INST_CALL;
+
+    Vals.push_back(VE.getAttributeListID(CI.getAttributes()));
+    Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall()) |
+                   unsigned(CI.isMustTailCall()) << 14 | 1 << 15);
+    Vals.push_back(getTypeID(FTy, CI.getCalledFunction()));
+    pushValueAndType(CI.getCalledOperand(), InstID, Vals); // Callee
+
+    // Emit value #'s for the fixed parameters.
+    for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+      // Check for labels (can happen with asm labels).
+      if (FTy->getParamType(i)->isLabelTy())
+        Vals.push_back(VE.getValueID(CI.getArgOperand(i)));
+      else
+        pushValue(CI.getArgOperand(i), InstID, Vals); // fixed param.
+    }
+
+    // Emit type/value pairs for varargs params.
+    if (FTy->isVarArg()) {
+      for (unsigned i = FTy->getNumParams(), e = CI.arg_size(); i != e; ++i)
+        pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs
+    }
+    break;
+  }
+  case Instruction::VAArg:
+    Code = bitc::FUNC_CODE_INST_VAARG;
+    Vals.push_back(getTypeID(I.getOperand(0)->getType())); // valistty
+    pushValue(I.getOperand(0), InstID, Vals);               // valist.
+    Vals.push_back(getTypeID(I.getType()));                 // restype.
+    break;
+  }
+
+  Stream.EmitRecord(Code, Vals, AbbrevToUse);
+  Vals.clear();
+}
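[Editor's aside, not part of the patch: the signed-VBR packing that pushValueSigned relies on folds the sign into the low bit, so the small negative deltas produced by PHI forward references stay cheap to encode. A standalone sketch mirroring the emitSignedInt64 helper used above:]

#include <cstdint>
#include <vector>

// Sketch: signed values are stored as (magnitude << 1) | sign, so the
// reader can recover small negative deltas from a compact VBR.
static void emitSigned(uint64_t V, std::vector<uint64_t> &Vals) {
  if ((int64_t)V >= 0)
    Vals.push_back(V << 1);          // non-negative: sign bit clear
  else
    Vals.push_back((-V << 1) | 1);   // negative: store magnitude, set sign
}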
+ } + } + + unsigned AbbrevToUse = VST_ENTRY_8_ABBREV; + + // VST_ENTRY: [valueid, namechar x N] + // VST_BBENTRY: [bbid, namechar x N] + unsigned Code; + if (isa(SI->getValue())) { + Code = bitc::VST_CODE_BBENTRY; + if (isChar6) + AbbrevToUse = VST_BBENTRY_6_ABBREV; + } else { + Code = bitc::VST_CODE_ENTRY; + if (isChar6) + AbbrevToUse = VST_ENTRY_6_ABBREV; + else if (is7Bit) + AbbrevToUse = VST_ENTRY_7_ABBREV; + } + + NameVals.push_back(VE.getValueID(SI->getValue())); + for (const char *P = Name.getKeyData(), + *E = Name.getKeyData() + Name.getKeyLength(); + P != E; ++P) + NameVals.push_back((unsigned char)*P); + + // Emit the finished record. + Stream.EmitRecord(Code, NameVals, AbbrevToUse); + NameVals.clear(); + } + Stream.ExitBlock(); +} + +void DXILBitcodeWriter::writeUseList(UseListOrder &&Order) { + assert(Order.Shuffle.size() >= 2 && "Shuffle too small"); + unsigned Code; + if (isa(Order.V)) + Code = bitc::USELIST_CODE_BB; + else + Code = bitc::USELIST_CODE_DEFAULT; + + SmallVector Record(Order.Shuffle.begin(), Order.Shuffle.end()); + Record.push_back(VE.getValueID(Order.V)); + Stream.EmitRecord(Code, Record); +} + +void DXILBitcodeWriter::writeUseListBlock(const Function *F) { + auto hasMore = [&]() { + return !VE.UseListOrders.empty() && VE.UseListOrders.back().F == F; + }; + if (!hasMore()) + // Nothing to do. + return; + + Stream.EnterSubblock(bitc::USELIST_BLOCK_ID, 3); + while (hasMore()) { + writeUseList(std::move(VE.UseListOrders.back())); + VE.UseListOrders.pop_back(); + } + Stream.ExitBlock(); +} + +/// Emit a function body to the module stream. +void DXILBitcodeWriter::writeFunction(const Function &F) { + Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4); + VE.incorporateFunction(F); + + SmallVector Vals; + + // Emit the number of basic blocks, so the reader can create them ahead of + // time. + Vals.push_back(VE.getBasicBlocks().size()); + Stream.EmitRecord(bitc::FUNC_CODE_DECLAREBLOCKS, Vals); + Vals.clear(); + + // If there are function-local constants, emit them now. + unsigned CstStart, CstEnd; + VE.getFunctionConstantRange(CstStart, CstEnd); + writeConstants(CstStart, CstEnd, false); + + // If there is function-local metadata, emit it now. + writeFunctionMetadata(F); + + // Keep a running idea of what the instruction ID is. + unsigned InstID = CstEnd; + + bool NeedsMetadataAttachment = F.hasMetadata(); + + DILocation *LastDL = nullptr; + + // Finally, emit all the instructions, in order. + for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + writeInstruction(*I, InstID, Vals); + + if (!I->getType()->isVoidTy()) + ++InstID; + + // If the instruction has metadata, write a metadata attachment later. + NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc(); + + // If the instruction has a debug location, emit it. + DILocation *DL = I->getDebugLoc(); + if (!DL) + continue; + + if (DL == LastDL) { + // Just repeat the same debug loc as last time. + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals); + continue; + } + + Vals.push_back(DL->getLine()); + Vals.push_back(DL->getColumn()); + Vals.push_back(VE.getMetadataOrNullID(DL->getScope())); + Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt())); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals); + Vals.clear(); + + LastDL = DL; + } + + // Emit names for all the instructions etc. 
+
+/// Emit a function body to the module stream.
+void DXILBitcodeWriter::writeFunction(const Function &F) {
+  Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4);
+  VE.incorporateFunction(F);
+
+  SmallVector<unsigned, 128> Vals;
+
+  // Emit the number of basic blocks, so the reader can create them ahead of
+  // time.
+  Vals.push_back(VE.getBasicBlocks().size());
+  Stream.EmitRecord(bitc::FUNC_CODE_DECLAREBLOCKS, Vals);
+  Vals.clear();
+
+  // If there are function-local constants, emit them now.
+  unsigned CstStart, CstEnd;
+  VE.getFunctionConstantRange(CstStart, CstEnd);
+  writeConstants(CstStart, CstEnd, false);
+
+  // If there is function-local metadata, emit it now.
+  writeFunctionMetadata(F);
+
+  // Keep a running idea of what the instruction ID is.
+  unsigned InstID = CstEnd;
+
+  bool NeedsMetadataAttachment = F.hasMetadata();
+
+  DILocation *LastDL = nullptr;
+
+  // Finally, emit all the instructions, in order.
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      writeInstruction(*I, InstID, Vals);
+
+      if (!I->getType()->isVoidTy())
+        ++InstID;
+
+      // If the instruction has metadata, write a metadata attachment later.
+      NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc();
+
+      // If the instruction has a debug location, emit it.
+      DILocation *DL = I->getDebugLoc();
+      if (!DL)
+        continue;
+
+      if (DL == LastDL) {
+        // Just repeat the same debug loc as last time.
+        Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
+        continue;
+      }
+
+      Vals.push_back(DL->getLine());
+      Vals.push_back(DL->getColumn());
+      Vals.push_back(VE.getMetadataOrNullID(DL->getScope()));
+      Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt()));
+      Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
+      Vals.clear();
+
+      LastDL = DL;
+    }
+
+  // Emit names for all the instructions etc.
+  if (auto *Symtab = F.getValueSymbolTable())
+    writeFunctionLevelValueSymbolTable(*Symtab);
+
+  if (NeedsMetadataAttachment)
+    writeFunctionMetadataAttachment(F);
+
+  writeUseListBlock(&F);
+  VE.purgeFunction();
+  Stream.ExitBlock();
+}
+
+// Emit blockinfo, which defines the standard abbreviations etc.
+void DXILBitcodeWriter::writeBlockInfo() {
+  // We only want to emit block info records for blocks that have multiple
+  // instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK.
+  // Other blocks can define their abbrevs inline.
+  Stream.EnterBlockInfoBlock();
+
+  { // 8-bit fixed-width VST_ENTRY/VST_BBENTRY strings.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   std::move(Abbv)) != VST_ENTRY_8_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  { // 7-bit fixed width VST_ENTRY strings.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   std::move(Abbv)) != VST_ENTRY_7_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // 6-bit char6 VST_ENTRY strings.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   std::move(Abbv)) != VST_ENTRY_6_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // 6-bit char6 VST_BBENTRY strings.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   std::move(Abbv)) != VST_BBENTRY_6_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  { // SETTYPE abbrev for CONSTANTS_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                              VE.computeBitsRequiredForTypeIndicies()));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   std::move(Abbv)) != CONSTANTS_SETTYPE_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  { // INTEGER abbrev for CONSTANTS_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   std::move(Abbv)) != CONSTANTS_INTEGER_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  { // CE_CAST abbrev for CONSTANTS_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,      // typeid
+                              VE.computeBitsRequiredForTypeIndicies()));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // value id
+
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   std::move(Abbv)) != CONSTANTS_CE_CAST_Abbrev)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // NULL abbrev for CONSTANTS_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   std::move(Abbv)) != CONSTANTS_NULL_Abbrev)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  // FIXME: This should only use space for first class types!
+
+  { // INST_LOAD abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
+                              VE.computeBitsRequiredForTypeIndicies()));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // Align
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_LOAD_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // INST_BINOP abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_BINOP_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); // flags
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_BINOP_FLAGS_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // INST_CAST abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
+                              VE.computeBitsRequiredForTypeIndicies()));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_CAST_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  { // INST_RET abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_RET_VOID_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // INST_RET abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_RET_VAL_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_UNREACHABLE_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+  {
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_GEP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty
+                              Log2_32_Ceil(VE.getTypes().size() + 1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) !=
+        (unsigned)FUNCTION_INST_GEP_ABBREV)
+      assert(false && "Unexpected abbrev ordering!");
+  }
+
+  Stream.ExitBlock();
+}
+
+void DXILBitcodeWriter::writeModuleVersion() {
+  // VERSION: [version#]
+  Stream.EmitRecord(bitc::MODULE_CODE_VERSION, ArrayRef<uint64_t>{1});
+}
+
+/// WriteModule - Emit the specified module to the bitstream.
+void DXILBitcodeWriter::write() {
+  // The identification block is new since llvm-3.7, but the old bitcode reader
+  // will skip it.
+  // writeIdentificationBlock(Stream);
+
+  Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
+
+  // It is redundant to fully-specify this here, but nice to make it explicit
+  // so that it is clear the DXIL module version is different.
+  DXILBitcodeWriter::writeModuleVersion();
+
+  // Emit blockinfo, which defines the standard abbreviations etc.
+  writeBlockInfo();
+
+  // Emit information about attribute groups.
+  writeAttributeGroupTable();
+
+  // Emit information about parameter attributes.
+  writeAttributeTable();
+
+  // Emit information describing all of the types in the module.
+  writeTypeTable();
+
+  writeComdats();
+
+  // Emit top-level description of module, including target triple, inline asm,
+  // descriptors for global variables, and function prototype info.
+  writeModuleInfo();
+
+  // Emit constants.
+  writeModuleConstants();
+
+  // Emit metadata kind names.
+  writeModuleMetadataKinds();
+
+  // Emit metadata.
+  writeModuleMetadata();
+
+  // Emit names for globals/functions etc.
+  // DXIL uses the same format for the module-level value symbol table as for
+  // the function-level table.
+  writeFunctionLevelValueSymbolTable(M.getValueSymbolTable());
+
+  // Emit module-level use-lists.
+  writeUseListBlock(nullptr);
+
+  // Emit function bodies.
+  for (const Function &F : M)
+    if (!F.isDeclaration())
+      writeFunction(F);
+
+  Stream.ExitBlock();
+}
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
new file mode 100644
index 000000000000..289f692f0f82
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
@@ -0,0 +1,82 @@
+//===- Bitcode/Writer/DXILBitcodeWriter.cpp - DXIL Bitcode Writer ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bitcode writer implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+class BitstreamWriter;
+class Module;
+class raw_ostream;
+
+namespace dxil {
+
+class BitcodeWriter {
+  SmallVectorImpl<char> &Buffer;
+  std::unique_ptr<BitstreamWriter> Stream;
+
+  StringTableBuilder StrtabBuilder{StringTableBuilder::RAW};
+
+  // Owns any strings created by the irsymtab writer until we create the
+  // string table.
+  BumpPtrAllocator Alloc;
+
+  bool WroteStrtab = false, WroteSymtab = false;
+
+  void writeBlob(unsigned Block, unsigned Record, StringRef Blob);
+
+  std::vector<Module *> Mods;
+
+public:
+  /// Create a BitcodeWriter that writes to Buffer.
+  BitcodeWriter(SmallVectorImpl<char> &Buffer, raw_fd_stream *FS = nullptr);
+
+  ~BitcodeWriter();
+
+  /// Attempt to write a symbol table to the bitcode file. This must be called
+  /// at most once after all modules have been written.
+  ///
+  /// A reader does not require a symbol table to interpret a bitcode file;
+  /// the symbol table is needed only to improve link-time performance. So
+  /// this function may decide not to write a symbol table. It may so decide
+  /// if, for example, the target is unregistered or the IR is malformed.
+  void writeSymtab();
+
+  /// Write the bitcode file's string table. This must be called exactly once
+  /// after all modules and the optional symbol table have been written.
+  void writeStrtab();
+
+  /// Copy the string table for another module into this bitcode file. This
+  /// should be called after copying the module itself into the bitcode file.
+  void copyStrtab(StringRef Strtab);
+
+  /// Write the specified module to the buffer specified at construction time.
+  void writeModule(const Module &M);
+};
+
+/// Write the specified module to the specified raw output stream.
+///
+/// For streams where it matters, the given stream should be in "binary"
+/// mode.
+void WriteDXILToFile(const Module &M, raw_ostream &Out);
+
+} // namespace dxil
+
+} // namespace llvm
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
new file mode 100644
index 000000000000..08944ee3f1fe
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
@@ -0,0 +1,1147 @@
+//===- ValueEnumerator.cpp - Number values and types for bitcode writer ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ValueEnumerator class.
+// Forked from lib/Bitcode/Writer
+//
+//===----------------------------------------------------------------------===//
+
+#include "DXILValueEnumerator.h"
+#include "DXILPointerType.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <tuple>
+
+using namespace llvm;
+using namespace llvm::dxil;
+
+namespace {
+
+struct OrderMap {
+  DenseMap<const Value *, std::pair<unsigned, bool>> IDs;
+  unsigned LastGlobalConstantID = 0;
+  unsigned LastGlobalValueID = 0;
+
+  OrderMap() = default;
+
+  bool isGlobalConstant(unsigned ID) const {
+    return ID <= LastGlobalConstantID;
+  }
+
+  bool isGlobalValue(unsigned ID) const {
+    return ID <= LastGlobalValueID && !isGlobalConstant(ID);
+  }
+
+  unsigned size() const { return IDs.size(); }
+  std::pair<unsigned, bool> &operator[](const Value *V) { return IDs[V]; }
+
+  std::pair<unsigned, bool> lookup(const Value *V) const {
+    return IDs.lookup(V);
+  }
+
+  void index(const Value *V) {
+    // Explicitly sequence get-size and insert-value operations to avoid UB.
+    unsigned ID = IDs.size() + 1;
+    IDs[V].first = ID;
+  }
+};
+
+} // end anonymous namespace
+
+static void orderValue(const Value *V, OrderMap &OM) {
+  if (OM.lookup(V).first)
+    return;
+
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    if (C->getNumOperands() && !isa<GlobalValue>(C)) {
+      for (const Value *Op : C->operands())
+        if (!isa<BasicBlock>(Op) && !isa<GlobalValue>(Op))
+          orderValue(Op, OM);
+      if (auto *CE = dyn_cast<ConstantExpr>(C))
+        if (CE->getOpcode() == Instruction::ShuffleVector)
+          orderValue(CE->getShuffleMaskForBitcode(), OM);
+    }
+  }
+
+  // Note: we cannot cache this lookup above, since inserting into the map
+  // changes the map's size, and thus affects the other IDs.
+  OM.index(V);
+}
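[Editor's aside, not part of the patch: the two-step body of OrderMap::index above is deliberate. Before C++17, the map subscript (which may insert) and the size() read in a single fused expression were unsequenced relative to each other, so the assigned ID could be off by one depending on evaluation order. An illustrative comparison using std::map, under that reading of the comment:]

#include <map>

// Fragile pre-C++17 (evaluation order of the two sides is unspecified):
//   IDs[V] = IDs.size() + 1;
// Safe: read the size first, then insert.
void indexValue(std::map<const void *, unsigned> &IDs, const void *V) {
  unsigned ID = IDs.size() + 1; // 1) compute the next ID
  IDs[V] = ID;                  // 2) then create the mapping
}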
+
+static OrderMap orderModule(const Module &M) {
+  // This needs to match the order used by ValueEnumerator::ValueEnumerator()
+  // and ValueEnumerator::incorporateFunction().
+  OrderMap OM;
+
+  // In the reader, initializers of GlobalValues are set *after* all the
+  // globals have been read. Rather than awkwardly modeling this behaviour
+  // directly in predictValueUseListOrderImpl(), just assign IDs to
+  // initializers of GlobalValues before GlobalValues themselves to model this
+  // implicitly.
+  for (const GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      if (!isa<GlobalValue>(G.getInitializer()))
+        orderValue(G.getInitializer(), OM);
+  for (const GlobalAlias &A : M.aliases())
+    if (!isa<GlobalValue>(A.getAliasee()))
+      orderValue(A.getAliasee(), OM);
+  for (const GlobalIFunc &I : M.ifuncs())
+    if (!isa<GlobalValue>(I.getResolver()))
+      orderValue(I.getResolver(), OM);
+  for (const Function &F : M) {
+    for (const Use &U : F.operands())
+      if (!isa<GlobalValue>(U.get()))
+        orderValue(U.get(), OM);
+  }
+
+  // As constants used in metadata operands are emitted as module-level
+  // constants, we must order them before other operands. Also, we must order
+  // these before global values, as these will be read before setting the
+  // global values' initializers. The latter matters for constants which have
+  // uses towards other constants that are used as initializers.
+  auto orderConstantValue = [&OM](const Value *V) {
+    if ((isa<Constant>(V) && !isa<GlobalValue>(V)) || isa<InlineAsm>(V))
+      orderValue(V, OM);
+  };
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        for (const Value *V : I.operands()) {
+          if (const auto *MAV = dyn_cast<MetadataAsValue>(V)) {
+            if (const auto *VAM =
+                    dyn_cast<ValueAsMetadata>(MAV->getMetadata())) {
+              orderConstantValue(VAM->getValue());
+            } else if (const auto *AL =
+                           dyn_cast<DIArgList>(MAV->getMetadata())) {
+              for (const auto *VAM : AL->getArgs())
+                orderConstantValue(VAM->getValue());
+            }
+          }
+        }
+  }
+  OM.LastGlobalConstantID = OM.size();
+
+  // Initializers of GlobalValues are processed in
+  // BitcodeReader::ResolveGlobalAndAliasInits(). Match the order there rather
+  // than ValueEnumerator, and match the code in predictValueUseListOrderImpl()
+  // by giving IDs in reverse order.
+  //
+  // Since GlobalValues never reference each other directly (just through
+  // initializers), their relative IDs only matter for determining order of
+  // uses in their initializers.
+  for (const Function &F : M)
+    orderValue(&F, OM);
+  for (const GlobalAlias &A : M.aliases())
+    orderValue(&A, OM);
+  for (const GlobalIFunc &I : M.ifuncs())
+    orderValue(&I, OM);
+  for (const GlobalVariable &G : M.globals())
+    orderValue(&G, OM);
+  OM.LastGlobalValueID = OM.size();
+
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    // Here we need to match the union of
+    // ValueEnumerator::incorporateFunction() and WriteFunction(). Basic
+    // blocks are implicitly declared before anything else (by declaring their
+    // size).
+    for (const BasicBlock &BB : F)
+      orderValue(&BB, OM);
+    for (const Argument &A : F.args())
+      orderValue(&A, OM);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB) {
+        for (const Value *Op : I.operands())
+          if ((isa<Constant>(*Op) && !isa<GlobalValue>(*Op)) ||
+              isa<InlineAsm>(*Op))
+            orderValue(Op, OM);
+        if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+          orderValue(SVI->getShuffleMaskForBitcode(), OM);
+      }
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        orderValue(&I, OM);
+  }
+  return OM;
+}
+
+static void predictValueUseListOrderImpl(const Value *V, const Function *F,
+                                         unsigned ID, const OrderMap &OM,
+                                         UseListOrderStack &Stack) {
+  // Predict use-list order for this one.
+  using Entry = std::pair<const Use *, unsigned>;
+  SmallVector<Entry, 64> List;
+  for (const Use &U : V->uses())
+    // Check if this user will be serialized.
+    if (OM.lookup(U.getUser()).first)
+      List.push_back(std::make_pair(&U, List.size()));
+
+  if (List.size() < 2)
+    // We may have lost some users.
+    return;
+
+  bool IsGlobalValue = OM.isGlobalValue(ID);
+  llvm::sort(List, [&](const Entry &L, const Entry &R) {
+    const Use *LU = L.first;
+    const Use *RU = R.first;
+    if (LU == RU)
+      return false;
+
+    auto LID = OM.lookup(LU->getUser()).first;
+    auto RID = OM.lookup(RU->getUser()).first;
+
+    // Global values are processed in reverse order.
+    //
+    // Moreover, initializers of GlobalValues are set *after* all the globals
+    // have been read (despite having earlier IDs). Rather than awkwardly
+    // modeling this behaviour here, orderModule() has assigned IDs to
+    // initializers of GlobalValues before GlobalValues themselves.
+    if (OM.isGlobalValue(LID) && OM.isGlobalValue(RID)) {
+      if (LID == RID)
+        return LU->getOperandNo() > RU->getOperandNo();
+      return LID < RID;
+    }
+
+    // If ID is 4, then expect: 7 6 5 1 2 3.
+    if (LID < RID) {
+      if (RID <= ID)
+        if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+          return true;
+      return false;
+    }
+    if (RID < LID) {
+      if (LID <= ID)
+        if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+          return false;
+      return true;
+    }
+
+    // LID and RID are equal, so we have different operands of the same user.
+    // Assume operands are added in order for all instructions.
+    if (LID <= ID)
+      if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+        return LU->getOperandNo() < RU->getOperandNo();
+    return LU->getOperandNo() > RU->getOperandNo();
+  });
+
+  if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) {
+        return L.second < R.second;
+      }))
+    // Order is already correct.
+    return;
+
+  // Store the shuffle.
+  Stack.emplace_back(V, F, List.size());
+  assert(List.size() == Stack.back().Shuffle.size() && "Wrong size");
+  for (size_t I = 0, E = List.size(); I != E; ++I)
+    Stack.back().Shuffle[I] = List[I].second;
+}
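[Editor's aside, not part of the patch: the "7 6 5 1 2 3" expectation above can be checked in isolation. For a non-GlobalValue with ID 4, users not yet emitted (IDs above 4) sort first in descending order, followed by already-emitted users ascending. A standalone check with made-up IDs and the same ordering rule:]

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  const unsigned ID = 4; // the value whose use-list is being predicted
  std::vector<unsigned> Users = {1, 2, 3, 5, 6, 7};
  std::sort(Users.begin(), Users.end(), [&](unsigned L, unsigned R) {
    if (L < R)
      return R <= ID; // both already emitted: keep ascending order
    if (R < L)
      return L > ID;  // both still pending: reverse (descending) order
    return false;
  });
  assert((Users == std::vector<unsigned>{7, 6, 5, 1, 2, 3}));
  return 0;
}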
+
+static void predictValueUseListOrder(const Value *V, const Function *F,
+                                     OrderMap &OM, UseListOrderStack &Stack) {
+  auto &IDPair = OM[V];
+  assert(IDPair.first && "Unmapped value");
+  if (IDPair.second)
+    // Already predicted.
+    return;
+
+  // Do the actual prediction.
+  IDPair.second = true;
+  if (!V->use_empty() && std::next(V->use_begin()) != V->use_end())
+    predictValueUseListOrderImpl(V, F, IDPair.first, OM, Stack);
+
+  // Recursive descent into constants.
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    if (C->getNumOperands()) { // Visit GlobalValues.
+      for (const Value *Op : C->operands())
+        if (isa<Constant>(Op)) // Visit GlobalValues.
+          predictValueUseListOrder(Op, F, OM, Stack);
+      if (auto *CE = dyn_cast<ConstantExpr>(C))
+        if (CE->getOpcode() == Instruction::ShuffleVector)
+          predictValueUseListOrder(CE->getShuffleMaskForBitcode(), F, OM,
+                                   Stack);
+    }
+  }
+}
+
+static UseListOrderStack predictUseListOrder(const Module &M) {
+  OrderMap OM = orderModule(M);
+
+  // Use-list orders need to be serialized after all the users have been added
+  // to a value, or else the shuffles will be incomplete. Store them per
+  // function in a stack.
+  //
+  // Aside from function order, the order of values doesn't matter much here.
+  UseListOrderStack Stack;
+
+  // We want to visit the functions backward now so we can list function-local
+  // constants in the last Function they're used in. Module-level constants
+  // have already been visited above.
+  for (const Function &F : llvm::reverse(M)) {
+    if (F.isDeclaration())
+      continue;
+    for (const BasicBlock &BB : F)
+      predictValueUseListOrder(&BB, &F, OM, Stack);
+    for (const Argument &A : F.args())
+      predictValueUseListOrder(&A, &F, OM, Stack);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB) {
+        for (const Value *Op : I.operands())
+          if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
+            predictValueUseListOrder(Op, &F, OM, Stack);
+        if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+          predictValueUseListOrder(SVI->getShuffleMaskForBitcode(), &F, OM,
+                                   Stack);
+      }
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        predictValueUseListOrder(&I, &F, OM, Stack);
+  }
+
+  // Visit globals last, since the module-level use-list block will be seen
+  // before the function bodies are processed.
+  for (const GlobalVariable &G : M.globals())
+    predictValueUseListOrder(&G, nullptr, OM, Stack);
+  for (const Function &F : M)
+    predictValueUseListOrder(&F, nullptr, OM, Stack);
+  for (const GlobalAlias &A : M.aliases())
+    predictValueUseListOrder(&A, nullptr, OM, Stack);
+  for (const GlobalIFunc &I : M.ifuncs())
+    predictValueUseListOrder(&I, nullptr, OM, Stack);
+  for (const GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
+  for (const GlobalAlias &A : M.aliases())
+    predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
+  for (const GlobalIFunc &I : M.ifuncs())
+    predictValueUseListOrder(I.getResolver(), nullptr, OM, Stack);
+  for (const Function &F : M) {
+    for (const Use &U : F.operands())
+      predictValueUseListOrder(U.get(), nullptr, OM, Stack);
+  }
+
+  return Stack;
+}
+
+ValueEnumerator::ValueEnumerator(const Module &M, Type *PrefixType) {
+  EnumerateType(PrefixType);
+
+  UseListOrders = predictUseListOrder(M);
+
+  // Enumerate the global variables.
+  for (const GlobalVariable &GV : M.globals()) {
+    EnumerateValue(&GV);
+    EnumerateType(GV.getValueType());
+  }
+
+  // Enumerate the functions.
+  for (const Function &F : M) {
+    EnumerateValue(&F);
+    EnumerateType(F.getValueType());
+    EnumerateType(
+        dxil::TypedPointerType::get(F.getFunctionType(), F.getAddressSpace()));
+    EnumerateAttributes(F.getAttributes());
+  }
+
+  // Enumerate the aliases.
+  for (const GlobalAlias &GA : M.aliases()) {
+    EnumerateValue(&GA);
+    EnumerateType(GA.getValueType());
+  }
+
+  // Enumerate the ifuncs.
+  for (const GlobalIFunc &GIF : M.ifuncs()) {
+    EnumerateValue(&GIF);
+    EnumerateType(GIF.getValueType());
+  }
+
+  // Enumerate the global variable initializers and attributes.
+  for (const GlobalVariable &GV : M.globals()) {
+    if (GV.hasInitializer())
+      EnumerateValue(GV.getInitializer());
+    EnumerateType(
+        dxil::TypedPointerType::get(GV.getValueType(), GV.getAddressSpace()));
+    if (GV.hasAttributes())
+      EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex));
+  }
+
+  // Enumerate the aliasees.
+  for (const GlobalAlias &GA : M.aliases())
+    EnumerateValue(GA.getAliasee());
+
+  // Enumerate the ifunc resolvers.
+  for (const GlobalIFunc &GIF : M.ifuncs())
+    EnumerateValue(GIF.getResolver());
+
+  // Enumerate any optional Function data.
+  for (const Function &F : M)
+    for (const Use &U : F.operands())
+      EnumerateValue(U.get());
+
+  // Enumerate the metadata type.
+  //
+  // TODO: Move this to ValueEnumerator::EnumerateOperandType() once bitcode
+  // only encodes the metadata type when it's used as a value.
+  EnumerateType(Type::getMetadataTy(M.getContext()));
+
+  // Insert constants and metadata that are named at module level into the slot
+  // pool so that the module symbol table can refer to them...
+  EnumerateValueSymbolTable(M.getValueSymbolTable());
+  EnumerateNamedMetadata(M);
+
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+  for (const GlobalVariable &GV : M.globals()) {
+    MDs.clear();
+    GV.getAllMetadata(MDs);
+    for (const auto &I : MDs)
+      // FIXME: Pass GV to EnumerateMetadata and arrange for the bitcode writer
+      // to write metadata to the global variable's own metadata block
+      // (PR28134).
+      EnumerateMetadata(nullptr, I.second);
+  }
+
+  // Enumerate types used by function bodies and argument lists.
+  for (const Function &F : M) {
+    for (const Argument &A : F.args())
+      EnumerateType(A.getType());
+
+    // Enumerate metadata attached to this function.
+    MDs.clear();
+    F.getAllMetadata(MDs);
+    for (const auto &I : MDs)
+      EnumerateMetadata(F.isDeclaration() ? nullptr : &F, I.second);
+
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB) {
+        for (const Use &Op : I.operands()) {
+          auto *MD = dyn_cast<MetadataAsValue>(&Op);
+          if (!MD) {
+            EnumerateOperandType(Op);
+            continue;
+          }
+
+          // Local metadata is enumerated during function-incorporation, but
+          // any ConstantAsMetadata arguments in a DIArgList should be
+          // examined now.
+          if (isa<LocalAsMetadata>(MD->getMetadata()))
+            continue;
+          if (auto *AL = dyn_cast<DIArgList>(MD->getMetadata())) {
+            for (auto *VAM : AL->getArgs())
+              if (isa<ConstantAsMetadata>(VAM))
+                EnumerateMetadata(&F, VAM);
+            continue;
+          }
+
+          EnumerateMetadata(&F, MD->getMetadata());
+        }
+        if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+          EnumerateType(SVI->getShuffleMaskForBitcode()->getType());
+        if (auto *GEP = dyn_cast<GetElementPtrInst>(&I))
+          EnumerateType(GEP->getSourceElementType());
+        if (auto *AI = dyn_cast<AllocaInst>(&I))
+          EnumerateType(AI->getAllocatedType());
+        EnumerateType(I.getType());
+        if (const auto *Call = dyn_cast<CallBase>(&I)) {
+          EnumerateAttributes(Call->getAttributes());
+          EnumerateType(Call->getFunctionType());
+        }
+
+        // Enumerate metadata attached with this instruction.
+        MDs.clear();
+        I.getAllMetadataOtherThanDebugLoc(MDs);
+        for (unsigned i = 0, e = MDs.size(); i != e; ++i)
+          EnumerateMetadata(&F, MDs[i].second);
+
+        // Don't enumerate the location directly -- it has a special record
+        // type -- but enumerate its operands.
+        if (DILocation *L = I.getDebugLoc())
+          for (const Metadata *Op : L->operands())
+            EnumerateMetadata(&F, Op);
+      }
+  }
+
+  // Organize metadata ordering.
+  organizeMetadata();
+}
+
+unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const {
+  InstructionMapType::const_iterator I = InstructionMap.find(Inst);
+  assert(I != InstructionMap.end() && "Instruction is not mapped!");
+  return I->second;
+}
+
+unsigned ValueEnumerator::getComdatID(const Comdat *C) const {
+  unsigned ComdatID = Comdats.idFor(C);
+  assert(ComdatID && "Comdat not found!");
+  return ComdatID;
+}
+
+void ValueEnumerator::setInstructionID(const Instruction *I) {
+  InstructionMap[I] = InstructionCount++;
+}
+
+unsigned ValueEnumerator::getValueID(const Value *V) const {
+  if (auto *MD = dyn_cast<MetadataAsValue>(V))
+    return getMetadataID(MD->getMetadata());
+
+  ValueMapType::const_iterator I = ValueMap.find(V);
+  assert(I != ValueMap.end() && "Value not in slotcalculator!");
+  return I->second - 1;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ValueEnumerator::dump() const {
+  print(dbgs(), ValueMap, "Default");
+  dbgs() << '\n';
+  print(dbgs(), MetadataMap, "MetaData");
+  dbgs() << '\n';
+}
+#endif
+
+void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
+                            const char *Name) const {
+  OS << "Map Name: " << Name << "\n";
+  OS << "Size: " << Map.size() << "\n";
+  for (const auto &I : Map) {
+    const Value *V = I.first;
+    if (V->hasName())
+      OS << "Value: " << V->getName();
+    else
+      OS << "Value: [null]\n";
+    V->print(errs());
+    errs() << '\n';
+
+    OS << " Uses(" << V->getNumUses() << "):";
+    for (const Use &U : V->uses()) {
+      if (&U != &*V->use_begin())
+        OS << ",";
+      if (U->hasName())
+        OS << " " << U->getName();
+      else
+        OS << " [null]";
+    }
+    OS << "\n\n";
+  }
+}
+
+void ValueEnumerator::print(raw_ostream &OS, const MetadataMapType &Map,
+                            const char *Name) const {
+  OS << "Map Name: " << Name << "\n";
+  OS << "Size: " << Map.size() << "\n";
+  for (const auto &I : Map) {
+    const Metadata *MD = I.first;
+    OS << "Metadata: slot = " << I.second.ID << "\n";
+    OS << "Metadata: function = " << I.second.F << "\n";
+    MD->print(OS);
+    OS << "\n";
+  }
+}
+
+/// EnumerateValueSymbolTable - Insert all of the values in the specified symbol
+/// table into the values table.
+void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) {
+  for (ValueSymbolTable::const_iterator VI = VST.begin(), VE = VST.end();
+       VI != VE; ++VI)
+    EnumerateValue(VI->getValue());
+}
+
+/// Insert all of the values referenced by named metadata in the specified
+/// module.
+void ValueEnumerator::EnumerateNamedMetadata(const Module &M) {
+  for (const auto &I : M.named_metadata())
+    EnumerateNamedMDNode(&I);
+}
+
+void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) {
+  for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i)
+    EnumerateMetadata(nullptr, MD->getOperand(i));
+}
+
+unsigned ValueEnumerator::getMetadataFunctionID(const Function *F) const {
+  return F ? getValueID(F) + 1 : 0;
+}
+
+void ValueEnumerator::EnumerateMetadata(const Function *F, const Metadata *MD) {
+  EnumerateMetadata(getMetadataFunctionID(F), MD);
+}
+
+void ValueEnumerator::EnumerateFunctionLocalMetadata(
+    const Function &F, const LocalAsMetadata *Local) {
+  EnumerateFunctionLocalMetadata(getMetadataFunctionID(&F), Local);
+}
+
+void ValueEnumerator::EnumerateFunctionLocalListMetadata(
+    const Function &F, const DIArgList *ArgList) {
+  EnumerateFunctionLocalListMetadata(getMetadataFunctionID(&F), ArgList);
+}
+
+void ValueEnumerator::dropFunctionFromMetadata(
+    MetadataMapType::value_type &FirstMD) {
+  SmallVector<const MDNode *, 64> Worklist;
+  auto push = [&Worklist](MetadataMapType::value_type &MD) {
+    auto &Entry = MD.second;
+
+    // Nothing to do if this metadata isn't tagged.
+    if (!Entry.F)
+      return;
+
+    // Drop the function tag.
+    Entry.F = 0;
+
+    // If this has an ID and is an MDNode, then its operands have entries as
+    // well. We need to drop the function from them too.
+    if (Entry.ID)
+      if (auto *N = dyn_cast<MDNode>(MD.first))
+        Worklist.push_back(N);
+  };
+  push(FirstMD);
+  while (!Worklist.empty())
+    for (const Metadata *Op : Worklist.pop_back_val()->operands()) {
+      if (!Op)
+        continue;
+      auto MD = MetadataMap.find(Op);
+      if (MD != MetadataMap.end())
+        push(*MD);
+    }
+}
+
+void ValueEnumerator::EnumerateMetadata(unsigned F, const Metadata *MD) {
+  // It's vital for reader efficiency that uniqued subgraphs are done in
+  // post-order; it's expensive when their operands have forward references.
+  // If a distinct node is referenced from a uniqued node, it'll be delayed
+  // until the uniqued subgraph has been completely traversed.
+  SmallVector<const MDNode *, 32> DelayedDistinctNodes;
+
+  // Start by enumerating MD, and then work through its transitive operands in
+  // post-order. This requires a depth-first search.
+  SmallVector<std::pair<const MDNode *, MDNode::op_iterator>, 32> Worklist;
+  if (const MDNode *N = enumerateMetadataImpl(F, MD))
+    Worklist.push_back(std::make_pair(N, N->op_begin()));
+
+  while (!Worklist.empty()) {
+    const MDNode *N = Worklist.back().first;
+
+    // Enumerate operands until we hit a new node. We need to traverse these
+    // nodes' operands before visiting the rest of N's operands.
+    MDNode::op_iterator I = std::find_if(
+        Worklist.back().second, N->op_end(),
+        [&](const Metadata *MD) { return enumerateMetadataImpl(F, MD); });
+    if (I != N->op_end()) {
+      auto *Op = cast<MDNode>(*I);
+      Worklist.back().second = ++I;
+
+      // Delay traversing Op if it's a distinct node and N is uniqued.
+      if (Op->isDistinct() && !N->isDistinct())
+        DelayedDistinctNodes.push_back(Op);
+      else
+        Worklist.push_back(std::make_pair(Op, Op->op_begin()));
+      continue;
+    }
+
+    // All the operands have been visited. Now assign an ID.
+    Worklist.pop_back();
+    MDs.push_back(N);
+    MetadataMap[N].ID = MDs.size();
+
+    // Flush out any delayed distinct nodes; these are all the distinct nodes
+    // that are leaves in the last uniqued subgraph.
+    if (Worklist.empty() || Worklist.back().first->isDistinct()) {
+      for (const MDNode *N : DelayedDistinctNodes)
+        Worklist.push_back(std::make_pair(N, N->op_begin()));
+      DelayedDistinctNodes.clear();
+    }
+  }
+}
+
+const MDNode *ValueEnumerator::enumerateMetadataImpl(unsigned F,
+                                                     const Metadata *MD) {
+  if (!MD)
+    return nullptr;
+
+  assert(
+      (isa<MDNode>(MD) || isa<MDString>(MD) || isa<ConstantAsMetadata>(MD)) &&
+      "Invalid metadata kind");
+
+  auto Insertion = MetadataMap.insert(std::make_pair(MD, MDIndex(F)));
+  MDIndex &Entry = Insertion.first->second;
+  if (!Insertion.second) {
+    // Already mapped. If F doesn't match the function tag, drop it.
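+    // (A node that turns out to be reachable from more than one function
+    // cannot be emitted in a single function's metadata block, so its tag is
+    // cleared and it is treated as module-level from here on.)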
+    if (Entry.hasDifferentFunction(F))
+      dropFunctionFromMetadata(*Insertion.first);
+    return nullptr;
+  }
+
+  // Don't assign IDs to metadata nodes.
+  if (auto *N = dyn_cast<MDNode>(MD))
+    return N;
+
+  // Save the metadata.
+  MDs.push_back(MD);
+  Entry.ID = MDs.size();
+
+  // Enumerate the constant, if any.
+  if (auto *C = dyn_cast<ConstantAsMetadata>(MD))
+    EnumerateValue(C->getValue());
+
+  return nullptr;
+}
+
+/// EnumerateFunctionLocalMetadata - Incorporate function-local metadata
+/// information reachable from the metadata.
+void ValueEnumerator::EnumerateFunctionLocalMetadata(
+    unsigned F, const LocalAsMetadata *Local) {
+  assert(F && "Expected a function");
+
+  // Check to see if it's already in!
+  MDIndex &Index = MetadataMap[Local];
+  if (Index.ID) {
+    assert(Index.F == F && "Expected the same function");
+    return;
+  }
+
+  MDs.push_back(Local);
+  Index.F = F;
+  Index.ID = MDs.size();
+
+  EnumerateValue(Local->getValue());
+}
+
+/// EnumerateFunctionLocalListMetadata - Incorporate function-local metadata
+/// information reachable from the metadata.
+void ValueEnumerator::EnumerateFunctionLocalListMetadata(
+    unsigned F, const DIArgList *ArgList) {
+  assert(F && "Expected a function");
+
+  // Check to see if it's already in!
+  MDIndex &Index = MetadataMap[ArgList];
+  if (Index.ID) {
+    assert(Index.F == F && "Expected the same function");
+    return;
+  }
+
+  for (ValueAsMetadata *VAM : ArgList->getArgs()) {
+    if (isa<LocalAsMetadata>(VAM)) {
+      assert(MetadataMap.count(VAM) &&
+             "LocalAsMetadata should be enumerated before DIArgList");
+      assert(MetadataMap[VAM].F == F &&
+             "Expected LocalAsMetadata in the same function");
+    } else {
+      assert(isa<ConstantAsMetadata>(VAM) &&
+             "Expected LocalAsMetadata or ConstantAsMetadata");
+      assert(ValueMap.count(VAM->getValue()) &&
+             "Constant should be enumerated before DIArgList");
+      EnumerateMetadata(F, VAM);
+    }
+  }
+
+  MDs.push_back(ArgList);
+  Index.F = F;
+  Index.ID = MDs.size();
+}
+
+static unsigned getMetadataTypeOrder(const Metadata *MD) {
+  // Strings are emitted in bulk and must come first.
+  if (isa<MDString>(MD))
+    return 0;
+
+  // ConstantAsMetadata doesn't reference anything. We may as well shuffle it
+  // to the front since we can detect it.
+  auto *N = dyn_cast<MDNode>(MD);
+  if (!N)
+    return 1;
+
+  // The reader is fast at forward references for distinct node operands, but
+  // slow when uniqued operands are unresolved.
+  return N->isDistinct() ? 2 : 3;
+}
+
+void ValueEnumerator::organizeMetadata() {
+  assert(MetadataMap.size() == MDs.size() &&
+         "Metadata map and vector out of sync");
+
+  if (MDs.empty())
+    return;
+
+  // Copy out the index information from MetadataMap in order to choose a new
+  // order.
+  SmallVector<MDIndex, 64> Order;
+  Order.reserve(MetadataMap.size());
+  for (const Metadata *MD : MDs)
+    Order.push_back(MetadataMap.lookup(MD));
+
+  // Partition:
+  //   - by function, then
+  //   - by isa<MDString>
+  // and then sort by the original/current ID. Since the IDs are guaranteed to
+  // be unique, the result of std::sort will be deterministic. There's no need
+  // for std::stable_sort.
+  llvm::sort(Order, [this](MDIndex LHS, MDIndex RHS) {
+    return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
+           std::make_tuple(RHS.F, getMetadataTypeOrder(RHS.get(MDs)), RHS.ID);
+  });
+
+  // Rebuild MDs, index the metadata ranges for each function in FunctionMDs,
+  // and fix up MetadataMap.
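+  // (After the sort above, entries with F == 0 form a prefix of Order; that
+  // prefix repopulates the module-level MDs below, and the per-function tail
+  // is carved into FunctionMDInfo ranges.)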
+  std::vector<const Metadata *> OldMDs;
+  MDs.swap(OldMDs);
+  MDs.reserve(OldMDs.size());
+  for (unsigned I = 0, E = Order.size(); I != E && !Order[I].F; ++I) {
+    auto *MD = Order[I].get(OldMDs);
+    MDs.push_back(MD);
+    MetadataMap[MD].ID = I + 1;
+    if (isa<MDString>(MD))
+      ++NumMDStrings;
+  }
+
+  // Return early if there's nothing for the functions.
+  if (MDs.size() == Order.size())
+    return;
+
+  // Build the function metadata ranges.
+  MDRange R;
+  FunctionMDs.reserve(OldMDs.size());
+  unsigned PrevF = 0;
+  for (unsigned I = MDs.size(), E = Order.size(), ID = MDs.size(); I != E;
+       ++I) {
+    unsigned F = Order[I].F;
+    if (!PrevF) {
+      PrevF = F;
+    } else if (PrevF != F) {
+      R.Last = FunctionMDs.size();
+      std::swap(R, FunctionMDInfo[PrevF]);
+      R.First = FunctionMDs.size();
+
+      ID = MDs.size();
+      PrevF = F;
+    }
+
+    auto *MD = Order[I].get(OldMDs);
+    FunctionMDs.push_back(MD);
+    MetadataMap[MD].ID = ++ID;
+    if (isa<MDString>(MD))
+      ++R.NumStrings;
+  }
+  R.Last = FunctionMDs.size();
+  FunctionMDInfo[PrevF] = R;
+}
+
+void ValueEnumerator::incorporateFunctionMetadata(const Function &F) {
+  NumModuleMDs = MDs.size();
+
+  auto R = FunctionMDInfo.lookup(getValueID(&F) + 1);
+  NumMDStrings = R.NumStrings;
+  MDs.insert(MDs.end(), FunctionMDs.begin() + R.First,
+             FunctionMDs.begin() + R.Last);
+}
+
+void ValueEnumerator::EnumerateValue(const Value *V) {
+  assert(!V->getType()->isVoidTy() && "Can't insert void values!");
+  assert(!isa<MetadataAsValue>(V) && "EnumerateValue doesn't handle Metadata!");
+
+  // Check to see if it's already in!
+  unsigned &ValueID = ValueMap[V];
+  if (ValueID) {
+    // Increment use count.
+    Values[ValueID - 1].second++;
+    return;
+  }
+
+  if (auto *GO = dyn_cast<GlobalObject>(V))
+    if (const Comdat *C = GO->getComdat())
+      Comdats.insert(C);
+
+  // Enumerate the type of this value.
+  EnumerateType(V->getType());
+
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    if (isa<GlobalValue>(C)) {
+      // Initializers for globals are handled explicitly elsewhere.
+    } else if (C->getNumOperands()) {
+      // If a constant has operands, enumerate them. This makes sure that if a
+      // constant has uses (for example an array of const ints), that they are
+      // inserted also.
+
+      // We prefer to enumerate them with values before we enumerate the user
+      // itself. This makes it more likely that we can avoid forward references
+      // in the reader. We know that there can be no cycles in the constants
+      // graph that don't go through a global variable.
+      for (User::const_op_iterator I = C->op_begin(), E = C->op_end(); I != E;
+           ++I)
+        if (!isa<BasicBlock>(*I)) // Don't enumerate BB operand to BlockAddress.
+          EnumerateValue(*I);
+      if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+        if (CE->getOpcode() == Instruction::ShuffleVector)
+          EnumerateValue(CE->getShuffleMaskForBitcode());
+        if (auto *GEP = dyn_cast<GEPOperator>(CE))
+          EnumerateType(GEP->getSourceElementType());
+      }
+
+      // Finally, add the value. Doing this could make the ValueID reference be
+      // dangling, don't reuse it.
+      Values.push_back(std::make_pair(V, 1U));
+      ValueMap[V] = Values.size();
+      return;
+    }
+  }
+
+  // Add the value.
+  Values.push_back(std::make_pair(V, 1U));
+  ValueID = Values.size();
+}
+
+void ValueEnumerator::EnumerateType(Type *Ty) {
+  unsigned *TypeID = &TypeMap[Ty];
+
+  // We've already seen this type.
+  if (*TypeID)
+    return;
+
+  // If it is a non-anonymous struct, mark the type as being visited so that we
+  // don't recursively visit it. This is safe because we allow forward
+  // references of these in the bitcode reader.
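+  // (For example, a named recursive struct such as
+  //    %node = type { i32, %node* }
+  // would otherwise recurse forever: enumerating %node's subtypes reaches
+  // %node again. The ~0U placeholder below breaks that cycle.)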
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (!STy->isLiteral())
+      *TypeID = ~0U;
+
+  // Enumerate all of the subtypes before we enumerate this type. This ensures
+  // that the type will be enumerated in an order that can be directly built.
+  for (Type *SubTy : Ty->subtypes())
+    EnumerateType(SubTy);
+
+  // Refresh the TypeID pointer in case the table rehashed.
+  TypeID = &TypeMap[Ty];
+
+  // Check to see if we got the pointer another way. This can happen when
+  // enumerating recursive types that hit the base case deeper than they start.
+  //
+  // If this is actually a struct that we are treating as forward ref'able,
+  // then emit the definition now that all of its contents are available.
+  if (*TypeID && *TypeID != ~0U)
+    return;
+
+  // Add this type now that its contents are all happily enumerated.
+  Types.push_back(Ty);
+
+  *TypeID = Types.size();
+}
+
+// Enumerate the types for the specified value. If the value is a constant,
+// walk through it, enumerating the types of the constant.
+void ValueEnumerator::EnumerateOperandType(const Value *V) {
+  EnumerateType(V->getType());
+
+  assert(!isa<MetadataAsValue>(V) && "Unexpected metadata operand");
+
+  const Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return;
+
+  // If this constant is already enumerated, ignore it, we know its type must
+  // be enumerated.
+  if (ValueMap.count(C))
+    return;
+
+  // This constant may have operands, make sure to enumerate the types in
+  // them.
+  for (const Value *Op : C->operands()) {
+    // Don't enumerate basic blocks here, this happens as operands to
+    // blockaddress.
+    if (isa<BasicBlock>(Op))
+      continue;
+
+    EnumerateOperandType(Op);
+  }
+  if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+    if (CE->getOpcode() == Instruction::ShuffleVector)
+      EnumerateOperandType(CE->getShuffleMaskForBitcode());
+    if (CE->getOpcode() == Instruction::GetElementPtr)
+      EnumerateType(cast<GEPOperator>(CE)->getSourceElementType());
+  }
+}
+
+void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
+  if (PAL.isEmpty())
+    return; // null is always 0.
+
+  // Do a lookup.
+  unsigned &Entry = AttributeListMap[PAL];
+  if (Entry == 0) {
+    // Never saw this before, add it.
+    AttributeLists.push_back(PAL);
+    Entry = AttributeLists.size();
+  }
+
+  // Do lookups for all attribute groups.
+  for (unsigned i : PAL.indexes()) {
+    AttributeSet AS = PAL.getAttributes(i);
+    if (!AS.hasAttributes())
+      continue;
+    IndexAndAttrSet Pair = {i, AS};
+    unsigned &Entry = AttributeGroupMap[Pair];
+    if (Entry == 0) {
+      AttributeGroups.push_back(Pair);
+      Entry = AttributeGroups.size();
+
+      for (Attribute Attr : AS) {
+        if (Attr.isTypeAttribute())
+          EnumerateType(Attr.getValueAsType());
+      }
+    }
+  }
+}
+
+void ValueEnumerator::incorporateFunction(const Function &F) {
+  InstructionCount = 0;
+  NumModuleValues = Values.size();
+
+  // Add global metadata to the function block. This doesn't include
+  // LocalAsMetadata.
+  incorporateFunctionMetadata(F);
+
+  // Adding function arguments to the value table.
+  for (const auto &I : F.args()) {
+    EnumerateValue(&I);
+    if (I.hasAttribute(Attribute::ByVal))
+      EnumerateType(I.getParamByValType());
+    else if (I.hasAttribute(Attribute::StructRet))
+      EnumerateType(I.getParamStructRetType());
+    else if (I.hasAttribute(Attribute::ByRef))
+      EnumerateType(I.getParamByRefType());
+  }
+  FirstFuncConstantID = Values.size();
+
+  // Add all function-level constants to the value table.
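+  // ("Function-level constants" are constant operands other than GlobalValues,
+  // whose IDs are module-level, plus inline asm; see the filter below.)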
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      for (const Use &OI : I.operands()) {
+        if ((isa<Constant>(OI) && !isa<GlobalValue>(OI)) || isa<InlineAsm>(OI))
+          EnumerateValue(OI);
+      }
+      if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+        EnumerateValue(SVI->getShuffleMaskForBitcode());
+    }
+    BasicBlocks.push_back(&BB);
+    ValueMap[&BB] = BasicBlocks.size();
+  }
+
+  // Add the function's parameter attributes so they are available for use in
+  // the function's instructions.
+  EnumerateAttributes(F.getAttributes());
+
+  FirstInstID = Values.size();
+
+  SmallVector<const LocalAsMetadata *, 8> FnLocalMDVector;
+  SmallVector<const DIArgList *, 4> ArgListMDVector;
+  // Add all of the instructions.
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      for (const Use &OI : I.operands()) {
+        if (auto *MD = dyn_cast<MetadataAsValue>(&OI)) {
+          if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata())) {
+            // Enumerate metadata after the instructions they might refer to.
+            FnLocalMDVector.push_back(Local);
+          } else if (auto *ArgList = dyn_cast<DIArgList>(MD->getMetadata())) {
+            ArgListMDVector.push_back(ArgList);
+            for (ValueAsMetadata *VMD : ArgList->getArgs()) {
+              if (auto *Local = dyn_cast<LocalAsMetadata>(VMD)) {
+                // Enumerate metadata after the instructions they might refer
+                // to.
+                FnLocalMDVector.push_back(Local);
+              }
+            }
+          }
+        }
+      }
+
+      if (!I.getType()->isVoidTy())
+        EnumerateValue(&I);
+    }
+  }
+
+  // Add all of the function-local metadata.
+  for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i) {
+    // At this point, all local values have been incorporated, so we shouldn't
+    // have a metadata operand that references a value that hasn't been seen.
+    assert(ValueMap.count(FnLocalMDVector[i]->getValue()) &&
+           "Missing value for metadata operand");
+    EnumerateFunctionLocalMetadata(F, FnLocalMDVector[i]);
+  }
+  // DIArgList entries must come after function-local metadata, as it is not
+  // possible to forward-reference them.
+  for (const DIArgList *ArgList : ArgListMDVector)
+    EnumerateFunctionLocalListMetadata(F, ArgList);
+}
+
+void ValueEnumerator::purgeFunction() {
+  /// Remove purged values from the ValueMap.
+  for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i)
+    ValueMap.erase(Values[i].first);
+  for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i)
+    MetadataMap.erase(MDs[i]);
+  for (const BasicBlock *BB : BasicBlocks)
+    ValueMap.erase(BB);
+
+  Values.resize(NumModuleValues);
+  MDs.resize(NumModuleMDs);
+  BasicBlocks.clear();
+  NumMDStrings = 0;
+}
+
+static void IncorporateFunctionInfoGlobalBBIDs(
+    const Function *F, DenseMap<const BasicBlock *, unsigned> &IDMap) {
+  unsigned Counter = 0;
+  for (const BasicBlock &BB : *F)
+    IDMap[&BB] = ++Counter;
+}
+
+/// getGlobalBasicBlockID - This returns the function-specific ID for the
+/// specified basic block. This is relatively expensive information, so it
+/// should only be used by rare constructs such as address-of-label.
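+/// The map is filled lazily: the first query for a block in a given function
+/// numbers every block of that function via IncorporateFunctionInfoGlobalBBIDs
+/// above, and later queries are served from the cache.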
+unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const {
+  unsigned &Idx = GlobalBasicBlockIDs[BB];
+  if (Idx != 0)
+    return Idx - 1;
+
+  IncorporateFunctionInfoGlobalBBIDs(BB->getParent(), GlobalBasicBlockIDs);
+  return getGlobalBasicBlockID(BB);
+}
+
+uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const {
+  return Log2_32_Ceil(getTypes().size() + 1);
+}
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h
new file mode 100644
index 000000000000..6cf339b7a5cd
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h
@@ -0,0 +1,308 @@
+//===- DirectX/DXILWriter/ValueEnumerator.h - Number values -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class gives values and types Unique IDs.
+// Forked from lib/Bitcode/Writer
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DXILWRITER_VALUEENUMERATOR_H
+#define LLVM_DXILWRITER_VALUEENUMERATOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/UseListOrder.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock;
+class Comdat;
+class DIArgList;
+class Function;
+class Instruction;
+class LocalAsMetadata;
+class MDNode;
+class Metadata;
+class Module;
+class NamedMDNode;
+class raw_ostream;
+class Type;
+class Value;
+class ValueSymbolTable;
+
+namespace dxil {
+
+class ValueEnumerator {
+public:
+  using TypeList = std::vector<Type *>;
+
+  // For each value, we remember its Value* and occurrence frequency.
+  using ValueList = std::vector<std::pair<const Value *, unsigned>>;
+
+  /// Attribute groups as encoded in bitcode are almost AttributeSets, but they
+  /// include the AttributeList index, so we have to track that in our map.
+  using IndexAndAttrSet = std::pair<unsigned, AttributeSet>;
+
+  UseListOrderStack UseListOrders;
+
+private:
+  using TypeMapType = DenseMap<Type *, unsigned>;
+  TypeMapType TypeMap;
+  TypeList Types;
+
+  using ValueMapType = DenseMap<const Value *, unsigned>;
+  ValueMapType ValueMap;
+  ValueList Values;
+
+  using ComdatSetType = UniqueVector<const Comdat *>;
+  ComdatSetType Comdats;
+
+  std::vector<const Metadata *> MDs;
+  std::vector<const Metadata *> FunctionMDs;
+
+  /// Index of information about a piece of metadata.
+  struct MDIndex {
+    unsigned F = 0;  ///< The ID of the function for this metadata, if any.
+    unsigned ID = 0; ///< The implicit ID of this metadata in bitcode.
+
+    MDIndex() = default;
+    explicit MDIndex(unsigned F) : F(F) {}
+
+    /// Check if this has a function tag, and it's different from NewF.
+    bool hasDifferentFunction(unsigned NewF) const { return F && F != NewF; }
+
+    /// Fetch the MD this references out of the given metadata array.
+    const Metadata *get(ArrayRef<const Metadata *> MDs) const {
+      assert(ID && "Expected non-zero ID");
+      assert(ID <= MDs.size() && "Expected valid ID");
+      return MDs[ID - 1];
+    }
+  };
+
+  using MetadataMapType = DenseMap<const Metadata *, MDIndex>;
+  MetadataMapType MetadataMap;
+
+  /// Range of metadata IDs, as a half-open range.
+  struct MDRange {
+    unsigned First = 0;
+    unsigned Last = 0;
+
+    /// Number of strings in the prefix of the metadata range.
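+    /// (organizeMetadata() sorts MDStrings to the front of each range, so a
+    /// count of the prefix is enough to locate them.)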
+    unsigned NumStrings = 0;
+
+    MDRange() = default;
+    explicit MDRange(unsigned First) : First(First) {}
+  };
+  SmallDenseMap<unsigned, MDRange, 1> FunctionMDInfo;
+
+  using AttributeGroupMapType = DenseMap<IndexAndAttrSet, unsigned>;
+  AttributeGroupMapType AttributeGroupMap;
+  std::vector<IndexAndAttrSet> AttributeGroups;
+
+  using AttributeListMapType = DenseMap<AttributeList, unsigned>;
+  AttributeListMapType AttributeListMap;
+  std::vector<AttributeList> AttributeLists;
+
+  /// GlobalBasicBlockIDs - This map memoizes the basic block IDs referenced by
+  /// the "getGlobalBasicBlockID" method.
+  mutable DenseMap<const BasicBlock *, unsigned> GlobalBasicBlockIDs;
+
+  using InstructionMapType = DenseMap<const Instruction *, unsigned>;
+  InstructionMapType InstructionMap;
+  unsigned InstructionCount;
+
+  /// BasicBlocks - This contains all the basic blocks for the currently
+  /// incorporated function. Their reverse mapping is stored in ValueMap.
+  std::vector<const BasicBlock *> BasicBlocks;
+
+  /// When a function is incorporated, this is the size of the Values list
+  /// before incorporation.
+  unsigned NumModuleValues;
+
+  /// When a function is incorporated, this is the size of the Metadatas list
+  /// before incorporation.
+  unsigned NumModuleMDs = 0;
+  unsigned NumMDStrings = 0;
+
+  unsigned FirstFuncConstantID;
+  unsigned FirstInstID;
+
+public:
+  ValueEnumerator(const Module &M, Type *PrefixType);
+  ValueEnumerator(const ValueEnumerator &) = delete;
+  ValueEnumerator &operator=(const ValueEnumerator &) = delete;
+
+  void dump() const;
+  void print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const;
+  void print(raw_ostream &OS, const MetadataMapType &Map,
+             const char *Name) const;
+
+  unsigned getValueID(const Value *V) const;
+
+  unsigned getMetadataID(const Metadata *MD) const {
+    auto ID = getMetadataOrNullID(MD);
+    assert(ID != 0 && "Metadata not in slotcalculator!");
+    return ID - 1;
+  }
+
+  unsigned getMetadataOrNullID(const Metadata *MD) const {
+    return MetadataMap.lookup(MD).ID;
+  }
+
+  unsigned numMDs() const { return MDs.size(); }
+
+  unsigned getTypeID(Type *T) const {
+    TypeMapType::const_iterator I = TypeMap.find(T);
+    assert(I != TypeMap.end() && "Type not in ValueEnumerator!");
+    return I->second - 1;
+  }
+
+  unsigned getInstructionID(const Instruction *I) const;
+  void setInstructionID(const Instruction *I);
+
+  unsigned getAttributeListID(AttributeList PAL) const {
+    if (PAL.isEmpty())
+      return 0; // Null maps to zero.
+    AttributeListMapType::const_iterator I = AttributeListMap.find(PAL);
+    assert(I != AttributeListMap.end() && "Attribute not in ValueEnumerator!");
+    return I->second;
+  }
+
+  unsigned getAttributeGroupID(IndexAndAttrSet Group) const {
+    if (!Group.second.hasAttributes())
+      return 0; // Null maps to zero.
+    AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(Group);
+    assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!");
+    return I->second;
+  }
+
+  /// getFunctionConstantRange - Return the range of values that corresponds to
+  /// function-local constants.
+  void getFunctionConstantRange(unsigned &Start, unsigned &End) const {
+    Start = FirstFuncConstantID;
+    End = FirstInstID;
+  }
+
+  const ValueList &getValues() const { return Values; }
+
+  /// Check whether the current block has any metadata to emit.
+  bool hasMDs() const { return NumModuleMDs < MDs.size(); }
+
+  /// Get the MDString metadata for this block.
+  ArrayRef<const Metadata *> getMDStrings() const {
+    return makeArrayRef(MDs).slice(NumModuleMDs, NumMDStrings);
+  }
+
+  /// Get the non-MDString metadata for this block.
+  ArrayRef<const Metadata *> getNonMDStrings() const {
+    return makeArrayRef(MDs).slice(NumModuleMDs).slice(NumMDStrings);
+  }
+
+  const TypeList &getTypes() const { return Types; }
+
+  const std::vector<const BasicBlock *> &getBasicBlocks() const {
+    return BasicBlocks;
+  }
+
+  const std::vector<AttributeList> &getAttributeLists() const {
+    return AttributeLists;
+  }
+
+  const std::vector<IndexAndAttrSet> &getAttributeGroups() const {
+    return AttributeGroups;
+  }
+
+  const ComdatSetType &getComdats() const { return Comdats; }
+  unsigned getComdatID(const Comdat *C) const;
+
+  /// getGlobalBasicBlockID - This returns the function-specific ID for the
+  /// specified basic block. This is relatively expensive information, so it
+  /// should only be used by rare constructs such as address-of-label.
+  unsigned getGlobalBasicBlockID(const BasicBlock *BB) const;
+
+  /// incorporateFunction/purgeFunction - If you'd like to deal with a function,
+  /// use these two methods to get its data into the ValueEnumerator!
+  void incorporateFunction(const Function &F);
+
+  void purgeFunction();
+  uint64_t computeBitsRequiredForTypeIndicies() const;
+
+  void EnumerateType(Type *T);
+
+private:
+
+  /// Reorder the reachable metadata.
+  ///
+  /// This is not just an optimization, but is mandatory for emitting MDString
+  /// correctly.
+  void organizeMetadata();
+
+  /// Drop the function tag from the transitive operands of the given node.
+  void dropFunctionFromMetadata(MetadataMapType::value_type &FirstMD);
+
+  /// Incorporate the function metadata.
+  ///
+  /// This should be called before enumerating LocalAsMetadata for the
+  /// function.
+  void incorporateFunctionMetadata(const Function &F);
+
+  /// Enumerate a single instance of metadata with the given function tag.
+  ///
+  /// If \c MD has already been enumerated, check that \c F matches its
+  /// function tag. If not, call \a dropFunctionFromMetadata().
+  ///
+  /// Otherwise, mark \c MD as visited. Assign it an ID, or just return it if
+  /// it's an \a MDNode.
+  const MDNode *enumerateMetadataImpl(unsigned F, const Metadata *MD);
+
+  unsigned getMetadataFunctionID(const Function *F) const;
+
+  /// Enumerate reachable metadata in (almost) post-order.
+  ///
+  /// Enumerate all the metadata reachable from MD. We want to minimize the
+  /// cost of reading bitcode records, and so the primary consideration is that
+  /// operands of uniqued nodes are resolved before the nodes are read. This
+  /// avoids re-uniquing them on the context and factors away RAUW support.
+  ///
+  /// This algorithm guarantees that subgraphs of uniqued nodes are in
+  /// post-order. Distinct subgraphs reachable only from a single uniqued node
+  /// will be in post-order.
+  ///
+  /// \note The relative order of a distinct and uniqued node is irrelevant.
+  /// \a organizeMetadata() will later partition distinct nodes ahead of
+  /// uniqued ones.
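+  ///
+  /// For example, for a uniqued node !0 with operands !1 and !2, the subgraph
+  /// under !1 is completed and assigned IDs before !2 is begun, and !0 gets
+  /// its ID only after both, so the reader never needs a forward reference
+  /// into a uniqued subgraph.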
+ ///{ + void EnumerateMetadata(const Function *F, const Metadata *MD); + void EnumerateMetadata(unsigned F, const Metadata *MD); + ///} + + void EnumerateFunctionLocalMetadata(const Function &F, + const LocalAsMetadata *Local); + void EnumerateFunctionLocalMetadata(unsigned F, const LocalAsMetadata *Local); + void EnumerateFunctionLocalListMetadata(const Function &F, + const DIArgList *ArgList); + void EnumerateFunctionLocalListMetadata(unsigned F, const DIArgList *Arglist); + void EnumerateNamedMDNode(const NamedMDNode *NMD); + void EnumerateValue(const Value *V); + void EnumerateOperandType(const Value *V); + void EnumerateAttributes(AttributeList PAL); + + void EnumerateValueSymbolTable(const ValueSymbolTable &ST); + void EnumerateNamedMetadata(const Module &M); +}; + +} // end namespace dxil +} // end namespace llvm + +#endif // LLVM_DXILWRITER_VALUEENUMERATOR_H diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp new file mode 100644 index 000000000000..c1f9f4aec672 --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -0,0 +1,100 @@ +//===- DXILWriterPass.cpp - Bitcode writing pass --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// DXILWriterPass implementation. +// +//===----------------------------------------------------------------------===// + +#include "DXILWriterPass.h" +#include "DXILBitcodeWriter.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; +using namespace llvm::dxil; + +namespace { +class WriteDXILPass : public llvm::ModulePass { + raw_ostream &OS; // raw_ostream to print on + +public: + static char ID; // Pass identification, replacement for typeid + WriteDXILPass() : ModulePass(ID), OS(dbgs()) { + initializeWriteDXILPassPass(*PassRegistry::getPassRegistry()); + } + + explicit WriteDXILPass(raw_ostream &o) : ModulePass(ID), OS(o) { + initializeWriteDXILPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Bitcode Writer"; } + + bool runOnModule(Module &M) override { + WriteDXILToFile(M, OS); + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +class EmbedDXILPass : public llvm::ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + EmbedDXILPass() : ModulePass(ID) { + initializeEmbedDXILPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "DXIL Embedder"; } + + bool runOnModule(Module &M) override { + std::string Data; + llvm::raw_string_ostream OS(Data); + WriteDXILToFile(M, OS); + + Constant *ModuleConstant = + ConstantDataArray::get(M.getContext(), arrayRefFromStringRef(Data)); + auto *GV = new llvm::GlobalVariable(M, ModuleConstant->getType(), true, + GlobalValue::PrivateLinkage, + ModuleConstant, "dx.dxil"); + GV->setSection("DXIL"); + 
    GV->setAlignment(Align(4));
+    appendToCompilerUsed(M, {GV});
+    return true;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+};
+} // namespace
+
+char WriteDXILPass::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteDXILPass, "write-bitcode", "Write Bitcode", false,
+                      true)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_END(WriteDXILPass, "write-bitcode", "Write Bitcode", false,
+                    true)
+
+ModulePass *llvm::createDXILWriterPass(raw_ostream &Str) {
+  return new WriteDXILPass(Str);
+}
+
+char EmbedDXILPass::ID = 0;
+INITIALIZE_PASS(EmbedDXILPass, "dxil-embed", "Embed DXIL", false, true)
+
+ModulePass *llvm::createDXILEmbedderPass() { return new EmbedDXILPass(); }
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.h b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.h
new file mode 100644
index 000000000000..2c9c12178677
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.h
@@ -0,0 +1,37 @@
+//===-- DXILWriterPass.h - Bitcode writing pass --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides a bitcode writing pass.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BITCODE_DXILWriterPass_H
+#define LLVM_BITCODE_DXILWriterPass_H
+
+#include "DirectX.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Module;
+class raw_ostream;
+
+/// Create and return a pass that writes the module to the specified
+/// ostream. Note that this pass is designed for use with the legacy pass
+/// manager.
+ModulePass *createDXILWriterPass(raw_ostream &Str);
+
+/// Create and return a pass that writes the module to a global variable in the
+/// module for later emission in the MCStreamer. Note that this pass is designed
+/// for use with the legacy pass manager because it is run in CodeGen only.
+ModulePass *createDXILEmbedderPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
new file mode 100644
index 000000000000..3883e4ba4621
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -0,0 +1,43 @@
+//===- DirectX.h - DirectX Target Implementation ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_DIRECTX_DIRECTX_H
+#define LLVM_LIB_TARGET_DIRECTX_DIRECTX_H
+
+namespace llvm {
+class ModulePass;
+class PassRegistry;
+
+/// Initializer for dxil writer pass
+void initializeWriteDXILPassPass(PassRegistry &);
+
+/// Initializer for dxil embedder pass
+void initializeEmbedDXILPassPass(PassRegistry &);
+
+/// Initializer for DXIL-prepare
+void initializeDXILPrepareModulePass(PassRegistry &);
+
+/// Pass to convert modules into DXIL-compatible modules
+ModulePass *createDXILPrepareModulePass();
+
+/// Initializer for DXILOpLowering
+void initializeDXILOpLoweringLegacyPass(PassRegistry &);
+
+/// Pass to lower LLVM intrinsic calls to DXIL op function calls.
+ModulePass *createDXILOpLoweringLegacyPass();
+
+/// Initializer for DXILTranslateMetadata.
+void initializeDXILTranslateMetadataPass(PassRegistry &);
+
+/// Pass to emit metadata for DXIL.
+ModulePass *createDXILTranslateMetadataPass();
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_DIRECTX_DIRECTX_H
diff --git a/llvm/lib/Target/DirectX/DirectX.td b/llvm/lib/Target/DirectX/DirectX.td
new file mode 100644
index 000000000000..4d1d45b84a68
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectX.td
@@ -0,0 +1,54 @@
+//- DirectX.td - Describe the DirectX Target Machine ----------*- tablegen -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is a target description file for the DirectX target.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+include "DXILStubs.td"
+
+//===----------------------------------------------------------------------===//
+// DirectX Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def DirectXInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// DirectX Processors supported.
+//===----------------------------------------------------------------------===// + +def : ProcessorModel<"generic", NoSchedModel, []>; + + +//===----------------------------------------------------------------------===// +// Target Declaration +//===----------------------------------------------------------------------===// + +def DirectXAsmParser : AsmParser { + // The physical register names are not in the binary format or asm text + let ShouldEmitMatchRegisterName = 0; +} + +def DirectXAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int PassSubtarget = 0; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def DirectX : Target { + let InstructionSet = DirectXInstrInfo; + let AssemblyParsers = [DirectXAsmParser]; + let AssemblyWriters = [DirectXAsmWriter]; +} diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp new file mode 100644 index 000000000000..cea3283f6756 --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp @@ -0,0 +1,57 @@ +//===-- DirectXAsmPrinter.cpp - DirectX assembly writer --------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains AsmPrinters for the DirectX backend. +// +//===----------------------------------------------------------------------===// + +#include "TargetInfo/DirectXTargetInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/SectionKind.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +// The DXILAsmPrinter is mostly a stub because DXIL is just LLVM bitcode which +// gets embedded into a DXContainer file. 
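+// Only emitGlobalVariable() below does any real work; runOnMachineFunction()
+// returns false without printing anything, so no machine instructions are
+// ever emitted.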
+class DXILAsmPrinter : public AsmPrinter {
+public:
+  explicit DXILAsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
+
+  StringRef getPassName() const override { return "DXIL Assembly Printer"; }
+  void emitGlobalVariable(const GlobalVariable *GV) override;
+  bool runOnMachineFunction(MachineFunction &MF) override { return false; }
+};
+} // namespace
+
+void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
+  // If there is no initializer or the section is implicit, do nothing
+  if (!GV->hasInitializer() || GV->hasImplicitSection())
+    return;
+  // Skip the LLVM metadata
+  if (GV->getSection() == "llvm.metadata")
+    return;
+  SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
+  MCSection *TheSection = getObjFileLowering().SectionForGlobal(GV, GVKind, TM);
+  OutStreamer->switchSection(TheSection);
+  emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() {
+  RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget());
+}
diff --git a/llvm/lib/Target/DirectX/DirectXFrameLowering.h b/llvm/lib/Target/DirectX/DirectXFrameLowering.h
new file mode 100644
index 000000000000..76a1450054be
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectXFrameLowering.h
@@ -0,0 +1,35 @@
+//===-- DirectXFrameLowering.h - Frame lowering for DirectX --*- C++ ---*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements DirectX-specific bits of TargetFrameLowering class.
+// This is just a stub because the current DXIL backend does not actually lower
+// through the MC layer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DIRECTX_DIRECTXFRAMELOWERING_H
+#define LLVM_DIRECTX_DIRECTXFRAMELOWERING_H
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/Alignment.h"
+
+namespace llvm {
+class DirectXSubtarget;
+
+class DirectXFrameLowering : public TargetFrameLowering {
+public:
+  explicit DirectXFrameLowering(const DirectXSubtarget &STI)
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0) {}
+
+  void emitPrologue(MachineFunction &, MachineBasicBlock &) const override {}
+  void emitEpilogue(MachineFunction &, MachineBasicBlock &) const override {}
+
+  bool hasFP(const MachineFunction &) const override { return false; }
+};
+} // namespace llvm
+#endif // LLVM_DIRECTX_DIRECTXFRAMELOWERING_H
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
new file mode 100644
index 000000000000..07b68648f16c
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
@@ -0,0 +1,20 @@
+//===-- DirectXInstrInfo.cpp - InstrInfo for DirectX -*- C++ ------------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DirectX specific subclass of TargetInstrInfo.
+// +//===----------------------------------------------------------------------===// + +#include "DirectXInstrInfo.h" + +#define GET_INSTRINFO_CTOR_DTOR +#include "DirectXGenInstrInfo.inc" + +using namespace llvm; + +DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.h b/llvm/lib/Target/DirectX/DirectXInstrInfo.h new file mode 100644 index 000000000000..4fe79ee547fe --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.h @@ -0,0 +1,30 @@ +//===-- DirectXInstrInfo.h - Define InstrInfo for DirectX -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DirectX specific subclass of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXINSTRINFO_H +#define LLVM_DIRECTX_DIRECTXINSTRINFO_H + +#include "DirectXRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "DirectXGenInstrInfo.inc" + +namespace llvm { +struct DirectXInstrInfo : public DirectXGenInstrInfo { + explicit DirectXInstrInfo() : DirectXGenInstrInfo() {} + + ~DirectXInstrInfo() override; +}; +} // namespace llvm + +#endif // LLVM_DIRECTX_DIRECTXINSTRINFO_H diff --git a/llvm/lib/Target/DirectX/DirectXRegisterInfo.cpp b/llvm/lib/Target/DirectX/DirectXRegisterInfo.cpp new file mode 100644 index 000000000000..c54b494f3730 --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXRegisterInfo.cpp @@ -0,0 +1,24 @@ +//===-- DirectXRegisterInfo.cpp - RegisterInfo for DirectX -*- C++ ------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the DirectX specific subclass of TargetRegisterInfo. +// +//===----------------------------------------------------------------------===// + +#include "DirectXRegisterInfo.h" +#include "DirectXFrameLowering.h" +#include "MCTargetDesc/DirectXMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" + +#define GET_REGINFO_TARGET_DESC +#include "DirectXGenRegisterInfo.inc" + +using namespace llvm; + +DirectXRegisterInfo::~DirectXRegisterInfo() {} diff --git a/llvm/lib/Target/DirectX/DirectXRegisterInfo.h b/llvm/lib/Target/DirectX/DirectXRegisterInfo.h new file mode 100644 index 000000000000..023c5c3ef337 --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXRegisterInfo.h @@ -0,0 +1,28 @@ +//===-- DirectXRegisterInfo.h - Define RegisterInfo for DirectX -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DirectX specific subclass of TargetRegisterInfo. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DXILREGISTERINFO_H +#define LLVM_DIRECTX_DXILREGISTERINFO_H + +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "DirectXGenRegisterInfo.inc" + +namespace llvm { +struct DirectXRegisterInfo : public DirectXGenRegisterInfo { + DirectXRegisterInfo() : DirectXGenRegisterInfo(0) {} + ~DirectXRegisterInfo(); +}; +} // namespace llvm + +#endif // LLVM_DIRECTX_DXILREGISTERINFO_H diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp new file mode 100644 index 000000000000..526b7d29fb13 --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp @@ -0,0 +1,29 @@ +//===-- DirectXSubtarget.cpp - DirectX Subtarget Information --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the DirectX-specific subclass of TargetSubtarget. +/// +//===----------------------------------------------------------------------===// + +#include "DirectXSubtarget.h" +#include "DirectXTargetLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "directx-subtarget" + +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "DirectXGenSubtargetInfo.inc" + +DirectXSubtarget::DirectXSubtarget(const Triple &TT, StringRef CPU, + StringRef FS, const DirectXTargetMachine &TM) + : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), FL(*this), TL(TM, *this) {} + +void DirectXSubtarget::anchor() {} diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.h b/llvm/lib/Target/DirectX/DirectXSubtarget.h new file mode 100644 index 000000000000..464d05a0e1ff --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXSubtarget.h @@ -0,0 +1,56 @@ +//===-- DirectXSubtarget.h - Define Subtarget for DirectX -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DirectX specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXSUBTARGET_H +#define LLVM_DIRECTX_DIRECTXSUBTARGET_H + +#include "DirectXFrameLowering.h" +#include "DirectXInstrInfo.h" +#include "DirectXTargetLowering.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +#define GET_SUBTARGETINFO_HEADER +#include "DirectXGenSubtargetInfo.inc" + +namespace llvm { + +class DirectXTargetMachine; + +class DirectXSubtarget : public DirectXGenSubtargetInfo { + DirectXFrameLowering FL; + DirectXTargetLowering TL; + DirectXInstrInfo InstrInfo; + + virtual void anchor(); // virtual anchor method + +public: + DirectXSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + const DirectXTargetMachine &TM); + + /// Parses a subtarget feature string, setting appropriate options. + /// \note Definition of function is auto generated by `tblgen`. 
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + + const DirectXTargetLowering *getTargetLowering() const override { + return &TL; + } + + const DirectXFrameLowering *getFrameLowering() const override { return &FL; } + + const DirectXInstrInfo *getInstrInfo() const override { return &InstrInfo; } +}; + +} // end namespace llvm + +#endif // LLVM_DIRECTX_DIRECTXSUBTARGET_H diff --git a/llvm/lib/Target/DirectX/DirectXTargetLowering.h b/llvm/lib/Target/DirectX/DirectXTargetLowering.h new file mode 100644 index 000000000000..dc19894ab165 --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXTargetLowering.h @@ -0,0 +1,31 @@ +//===-- DirectXTargetLowering.h - Define DX TargetLowering -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DirectX specific subclass of TargetLowering. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXTARGETLOWERING_H +#define LLVM_DIRECTX_DIRECTXTARGETLOWERING_H + +#include "llvm/CodeGen/TargetLowering.h" + +namespace llvm { + +class DirectXSubtarget; +class DirectXTargetMachine; + +class DirectXTargetLowering : public TargetLowering { +public: + explicit DirectXTargetLowering(const DirectXTargetMachine &TM, + const DirectXSubtarget &STI); +}; + +} // end namespace llvm + +#endif // LLVM_DIRECTX_DIRECTXTARGETLOWERING_H diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp new file mode 100644 index 000000000000..44bef80ea6fb --- /dev/null +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -0,0 +1,144 @@ +//===- DirectXTargetMachine.cpp - DirectX Target Implementation -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains DirectX target initializer. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "DirectXTargetMachine.h"
+#include "DXILWriter/DXILWriterPass.h"
+#include "DirectX.h"
+#include "DirectXSubtarget.h"
+#include "DirectXTargetTransformInfo.h"
+#include "TargetInfo/DirectXTargetInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCSectionDXContainer.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
+  RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
+  auto *PR = PassRegistry::getPassRegistry();
+  initializeDXILPrepareModulePass(*PR);
+  initializeEmbedDXILPassPass(*PR);
+  initializeDXILOpLoweringLegacyPass(*PR);
+  initializeDXILTranslateMetadataPass(*PR);
+}
+
+class DXILTargetObjectFile : public TargetLoweringObjectFile {
+public:
+  DXILTargetObjectFile() = default;
+
+  MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override {
+    return getContext().getDXContainerSection(GO->getSection(), Kind);
+  }
+
+protected:
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override {
+    llvm_unreachable("Not supported!");
+  }
+};
+
+class DirectXPassConfig : public TargetPassConfig {
+public:
+  DirectXPassConfig(DirectXTargetMachine &TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  DirectXTargetMachine &getDirectXTargetMachine() const {
+    return getTM<DirectXTargetMachine>();
+  }
+
+  FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
+};
+
+DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           Optional<CodeModel::Model> CM,
+                                           CodeGenOpt::Level OL, bool JIT)
+    : LLVMTargetMachine(T,
+                        "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-"
+                        "f32:32-f64:64-n8:16:32:64",
+                        TT, CPU, FS, Options, Reloc::Static, CodeModel::Small,
+                        OL),
+      TLOF(std::make_unique<DXILTargetObjectFile>()),
+      Subtarget(std::make_unique<DirectXSubtarget>(TT, CPU, FS, *this)) {
+  initAsmInfo();
+}
+
+DirectXTargetMachine::~DirectXTargetMachine() {}
+
+bool DirectXTargetMachine::addPassesToEmitFile(
+    PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, bool DisableVerify,
+    MachineModuleInfoWrapperPass *MMIWP) {
+  PM.add(createDXILOpLoweringLegacyPass());
+  PM.add(createDXILPrepareModulePass());
+  PM.add(createDXILTranslateMetadataPass());
+  if (TargetPassConfig::willCompleteCodeGenPipeline()) {
+    PM.add(createDXILEmbedderPass());
+  }
+  switch (FileType) {
+  case CGFT_AssemblyFile:
+    PM.add(createPrintModulePass(Out, "", true));
+    break;
+  case CGFT_ObjectFile:
+    if (TargetPassConfig::willCompleteCodeGenPipeline()) {
+      if (!MMIWP)
+        MMIWP = new MachineModuleInfoWrapperPass(this);
+      PM.add(MMIWP);
+      if (addAsmPrinter(PM, Out, DwoOut, FileType,
+                        MMIWP->getMMI().getContext()))
+        return true;
+    } else
+      PM.add(createDXILWriterPass(Out));
+    break;
+  case CGFT_Null:
+    break;
+  }
+  return false;
+}
+
+bool DirectXTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
+                                             MCContext *&Ctx,
+                                             raw_pwrite_stream &Out,
+                                             bool DisableVerify) {
+  return true;
+}
+
+TargetPassConfig *DirectXTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new DirectXPassConfig(*this, PM);
+}
+
+const DirectXSubtarget *
+DirectXTargetMachine::getSubtargetImpl(const Function &) const {
+  return Subtarget.get();
+}
+
+TargetTransformInfo
+DirectXTargetMachine::getTargetTransformInfo(const Function &F) const {
+  return TargetTransformInfo(DirectXTTIImpl(this, F));
+}
+
+DirectXTargetLowering::DirectXTargetLowering(const DirectXTargetMachine &TM,
+                                             const DirectXSubtarget &STI)
+    : TargetLowering(TM) {}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
new file mode 100644
index 000000000000..ae41638b6acf
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
@@ -0,0 +1,51 @@
+//===- DirectXTargetMachine.h - DirectX Target Implementation ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DIRECTX_DIRECTXTARGETMACHINE_H
+#define LLVM_DIRECTX_DIRECTXTARGETMACHINE_H
+
+#include "DirectXSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class Function;
+class DirectXTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  std::unique_ptr<DirectXSubtarget> Subtarget;
+
+public:
+  DirectXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                       CodeGenOpt::Level OL, bool JIT);
+
+  ~DirectXTargetMachine() override;
+
+  bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
+                           raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
+                           bool DisableVerify,
+                           MachineModuleInfoWrapperPass *MMIWP) override;
+
+  bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
+                         raw_pwrite_stream &Out, bool DisableVerify) override;
+
+  const DirectXSubtarget *getSubtargetImpl(const Function &) const override;
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
+};
+} // namespace llvm
+
+#endif // LLVM_DIRECTX_DIRECTXTARGETMACHINE_H
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
new file mode 100644
index 000000000000..90beb386fa44
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
@@ -0,0 +1,39 @@
+//===- DirectXTargetTransformInfo.h - DirectX TTI ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXTARGETTRANSFORMINFO_H +#define LLVM_DIRECTX_DIRECTXTARGETTRANSFORMINFO_H + +#include "DirectXSubtarget.h" +#include "DirectXTargetMachine.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Function.h" + +namespace llvm { +class DirectXTTIImpl : public BasicTTIImplBase<DirectXTTIImpl> { + using BaseT = BasicTTIImplBase<DirectXTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const DirectXSubtarget *ST; + const DirectXTargetLowering *TLI; + + const DirectXSubtarget *getST() const { return ST; } + const DirectXTargetLowering *getTLI() const { return TLI; } + +public: + explicit DirectXTTIImpl(const DirectXTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} +}; +} // namespace llvm + +#endif // LLVM_DIRECTX_DIRECTXTARGETTRANSFORMINFO_H diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.cpp new file mode 100644 index 000000000000..78ccbc444bce --- /dev/null +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.cpp @@ -0,0 +1,28 @@ +//===-- DirectXContainerObjectWriter.cpp - DX object writer ----*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains DXContainer object writers for the DirectX backend. +// +//===----------------------------------------------------------------------===// + +#include "DirectXContainerObjectWriter.h" +#include "llvm/MC/MCDXContainerWriter.h" + +using namespace llvm; + +namespace { +class DirectXContainerObjectWriter : public MCDXContainerTargetWriter { +public: + DirectXContainerObjectWriter() : MCDXContainerTargetWriter() {} +}; +} // namespace + +std::unique_ptr<MCObjectTargetWriter> +llvm::createDXContainerTargetObjectWriter() { + return std::make_unique<DirectXContainerObjectWriter>(); +} diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.h b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.h new file mode 100644 index 000000000000..a6fbdc865f7d --- /dev/null +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXContainerObjectWriter.h @@ -0,0 +1,24 @@ +//===-- DirectXContainerObjectWriter.h - DX object writer ------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains DXContainer object writers for the DirectX backend.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXCONTAINEROBJECTWRITER_H +#define LLVM_DIRECTX_DIRECTXCONTAINEROBJECTWRITER_H + +#include "llvm/MC/MCObjectWriter.h" + +namespace llvm { + +std::unique_ptr<MCObjectTargetWriter> createDXContainerTargetObjectWriter(); + +} + +#endif // LLVM_DIRECTX_DIRECTXCONTAINEROBJECTWRITER_H diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp new file mode 100644 index 000000000000..0c97ab62a37b --- /dev/null +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -0,0 +1,152 @@ +//===- DirectXMCTargetDesc.cpp - DirectX Target Implementation --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains DirectX target initializer. +/// +//===----------------------------------------------------------------------===// + +#include "DirectXMCTargetDesc.h" +#include "DirectXContainerObjectWriter.h" +#include "TargetInfo/DirectXTargetInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCDXContainerWriter.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" +#include <memory> + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#define GET_INSTRINFO_MC_HELPERS +#include "DirectXGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "DirectXGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "DirectXGenRegisterInfo.inc" + +namespace { + +// DXILInstPrinter is a null stub because DXIL instructions aren't printed.
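// (The code emitter and asm backend further below follow the same stub pattern: DXILMCCodeEmitter::encodeInstruction and DXILAsmBackend::applyFixup are deliberate no-ops, present only so the MC registration hooks at the end of this file have concrete classes to instantiate.)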
+class DXILInstPrinter : public MCInstPrinter { +public: + DXILInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &O) override {} + + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override { + return std::make_pair("", 0ull); + } + +private: +}; + +class DXILMCCodeEmitter : public MCCodeEmitter { +public: + DXILMCCodeEmitter() {} + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override {} +}; + +class DXILAsmBackend : public MCAsmBackend { + +public: + DXILAsmBackend(const MCSubtargetInfo &STI) : MCAsmBackend(support::little) {} + ~DXILAsmBackend() override = default; + + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const override {} + + std::unique_ptr<MCObjectTargetWriter> + createObjectTargetWriter() const override { + return createDXContainerTargetObjectWriter(); + } + + unsigned getNumFixupKinds() const override { return 0; } + + bool writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const override { + return true; + } + + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return true; + } +}; + +class DirectXMCAsmInfo : public MCAsmInfo { +public: + explicit DirectXMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) + : MCAsmInfo() {} +}; + +} // namespace + +static MCInstPrinter *createDXILMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + if (SyntaxVariant == 0) + return new DXILInstPrinter(MAI, MII, MRI); + return nullptr; +} + +MCCodeEmitter *createDXILMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx) { + return new DXILMCCodeEmitter(); +} + +MCAsmBackend *createDXILMCAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, + const MCTargetOptions &Options) { + return new DXILAsmBackend(STI); +} + +static MCSubtargetInfo * +createDirectXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + return createDirectXMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); +} + +static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) { + return new MCRegisterInfo(); +} + +static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); } + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() { + Target &T = getTheDirectXTarget(); + RegisterMCAsmInfo<DirectXMCAsmInfo> X(T); + TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo); + TargetRegistry::RegisterMCInstPrinter(T, createDXILMCInstPrinter); + TargetRegistry::RegisterMCRegInfo(T, createDirectXMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(T, createDirectXMCSubtargetInfo); + TargetRegistry::RegisterMCCodeEmitter(T, createDXILMCCodeEmitter); + TargetRegistry::RegisterMCAsmBackend(T, createDXILMCAsmBackend); +} diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.h b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.h new file mode 100644 index 000000000000..0c3873a24417 --- /dev/null +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.h @@ -0,0 +1,29 @@ +//===- DirectXMCTargetDesc.h - DirectX Target Interface ---------*- C++ -*-===//
+// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains DirectX target interface. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_DIRECTXMCTARGETDESC_H +#define LLVM_DIRECTX_DIRECTXMCTARGETDESC_H + +// Include DirectX stub register info +#define GET_REGINFO_ENUM +#include "DirectXGenRegisterInfo.inc" + +// Include DirectX stub instruction info +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS +#include "DirectXGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "DirectXGenSubtargetInfo.inc" + +#endif // LLVM_DIRECTX_DIRECTXMCTARGETDESC_H diff --git a/llvm/lib/Target/DirectX/PointerTypeAnalysis.cpp b/llvm/lib/Target/DirectX/PointerTypeAnalysis.cpp new file mode 100644 index 000000000000..1d536bbd0011 --- /dev/null +++ b/llvm/lib/Target/DirectX/PointerTypeAnalysis.cpp @@ -0,0 +1,119 @@ +//===- Target/DirectX/PointerTypeAnalysis.cpp - PointerType analysis ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Analysis pass to assign types to opaque pointers. +// +//===----------------------------------------------------------------------===// + +#include "PointerTypeAnalysis.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +using namespace llvm::dxil; + +namespace { + +// Classifies the type of the value passed in by walking the value's users to +// find a typed instruction to materialize a type from. +TypedPointerType *classifyPointerType(const Value *V) { + assert(V->getType()->isOpaquePointerTy() && + "classifyPointerType called with non-opaque pointer"); + Type *PointeeTy = nullptr; + if (auto *Inst = dyn_cast<GetElementPtrInst>(V)) { + if (!Inst->getResultElementType()->isOpaquePointerTy()) + PointeeTy = Inst->getResultElementType(); + } else if (auto *Inst = dyn_cast<AllocaInst>(V)) { + PointeeTy = Inst->getAllocatedType(); + } + for (const auto *User : V->users()) { + Type *NewPointeeTy = nullptr; + if (const auto *Inst = dyn_cast<LoadInst>(User)) { + NewPointeeTy = Inst->getType(); + } else if (const auto *Inst = dyn_cast<StoreInst>(User)) { + NewPointeeTy = Inst->getValueOperand()->getType(); + } else if (const auto *Inst = dyn_cast<GetElementPtrInst>(User)) { + NewPointeeTy = Inst->getSourceElementType(); + } + if (NewPointeeTy) { + // HLSL doesn't support pointers, so it is unlikely to get more than one + // or two levels of indirection in the IR. Because of this, recursion is + // pretty safe. + if (NewPointeeTy->isOpaquePointerTy()) + return TypedPointerType::get(classifyPointerType(User), + V->getType()->getPointerAddressSpace()); + if (!PointeeTy) + PointeeTy = NewPointeeTy; + else if (PointeeTy != NewPointeeTy) + PointeeTy = Type::getInt8Ty(V->getContext()); + } + } + // If we were unable to determine the pointee type, set to i8 + if (!PointeeTy) + PointeeTy = Type::getInt8Ty(V->getContext()); + return TypedPointerType::get(PointeeTy, + V->getType()->getPointerAddressSpace()); +} + +// This function constructs a function type accepting typed pointers.
It only +// handles function arguments and return types, and assigns the function type to +// the function's value in the type map. +void classifyFunctionType(const Function &F, PointerTypeMap &Map) { + SmallVector<Type *, 8> NewArgs; + bool HasOpaqueTy = false; + Type *RetTy = F.getReturnType(); + if (RetTy->isOpaquePointerTy()) { + RetTy = nullptr; + for (const auto &B : F) { + for (const auto &I : B) { + if (const auto *RetInst = dyn_cast_or_null<ReturnInst>(&I)) { + Type *NewRetTy = classifyPointerType(RetInst->getReturnValue()); + if (!RetTy) + RetTy = NewRetTy; + else if (RetTy != NewRetTy) + RetTy = TypedPointerType::get( + Type::getInt8Ty(I.getContext()), + F.getReturnType()->getPointerAddressSpace()); + } + } + } + } + for (auto &A : F.args()) { + Type *ArgTy = A.getType(); + if (ArgTy->isOpaquePointerTy()) { + TypedPointerType *NewTy = classifyPointerType(&A); + Map[&A] = NewTy; + ArgTy = NewTy; + HasOpaqueTy = true; + } + NewArgs.push_back(ArgTy); + } + if (!HasOpaqueTy) + return; + Map[&F] = FunctionType::get(RetTy, NewArgs, false); +} +} // anonymous namespace + +PointerTypeMap PointerTypeAnalysis::run(const Module &M) { + PointerTypeMap Map; + for (auto &G : M.globals()) { + if (G.getType()->isOpaquePointerTy()) + Map[&G] = classifyPointerType(&G); + } + for (auto &F : M) { + classifyFunctionType(F, Map); + + for (const auto &B : F) { + for (const auto &I : B) { + if (I.getType()->isOpaquePointerTy()) + Map[&I] = classifyPointerType(&I); + } + } + } + + return Map; +} diff --git a/llvm/lib/Target/DirectX/PointerTypeAnalysis.h b/llvm/lib/Target/DirectX/PointerTypeAnalysis.h new file mode 100644 index 000000000000..c4164b6bf359 --- /dev/null +++ b/llvm/lib/Target/DirectX/PointerTypeAnalysis.h @@ -0,0 +1,43 @@ +//===- Target/DirectX/PointerTypeAnalysis.h - PointerType analysis --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Analysis pass to assign types to opaque pointers. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_DIRECTX_POINTERTYPEANALYSIS_H +#define LLVM_TARGET_DIRECTX_POINTERTYPEANALYSIS_H + +#include "DXILPointerType.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +namespace dxil { + +// Store the underlying type and the number of pointer indirections +using PointerTypeMap = DenseMap<const Value *, Type *>; + +/// An analysis to compute the \c PointerTypes for pointers in a \c Module. +/// Since this analysis is only run during codegen and the new pass manager +/// doesn't support codegen passes, this is written as a function in a namespace. +/// It is very simple to transform it into a proper analysis pass. +/// This code relies on typed pointers existing as LLVM types, but could be +/// migrated to a custom Type if PointerType loses typed support. +namespace PointerTypeAnalysis { + +/// Compute the \c PointerTypeMap for the module \c M.
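/// A rough usage sketch (not part of the vendored change; `M` is assumed to be
/// an llvm::Module and `SomeGlobal` one of its global variables):
/// \code
///   PointerTypeMap Map = PointerTypeAnalysis::run(M);
///   Type *Ty = Map.lookup(&SomeGlobal); // reconstructed typed pointer, or null
/// \endcode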
+PointerTypeMap run(const Module &M); +} // namespace PointerTypeAnalysis + +} // namespace dxil + +} // namespace llvm + +#endif // LLVM_TARGET_DIRECTX_POINTERTYPEANALYSIS_H diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp new file mode 100644 index 000000000000..54c577debc34 --- /dev/null +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp @@ -0,0 +1,30 @@ +//===- DirectXTargetInfo.cpp - DirectX Target Implementation ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains DirectX target initializer. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +Target &getTheDirectXTarget() { + static Target TheDirectXTarget; + return TheDirectXTarget; +} +} // namespace llvm + +using namespace llvm; + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() { + RegisterTarget<Triple::dxil, /*HasJIT=*/false> X( + getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL"); +} diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.h b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.h new file mode 100644 index 000000000000..a860c430f81a --- /dev/null +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.h @@ -0,0 +1,18 @@ +//===-- DirectXTargetInfo.h - DirectX Target Implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIRECTX_TARGETINFO_DIRECTXTARGETINFO_H +#define LLVM_DIRECTX_TARGETINFO_DIRECTXTARGETINFO_H + +namespace llvm { +class Target; + +Target &getTheDirectXTarget(); +} // namespace llvm + +#endif // LLVM_DIRECTX_TARGETINFO_DIRECTXTARGETINFO_H diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 15eba89eeb55..4553f2fd9228 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -681,7 +681,7 @@ bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) { Subsection = HexagonMCExpr::create( MCConstantExpr::create(8192 + Res, getContext()), getContext()); - getStreamer().SubSection(Subsection); + getStreamer().subSection(Subsection); return false; } @@ -1450,7 +1450,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, MCOperand &MO_0 = Inst.getOperand(0); // push section onto section stack - MES->PushSection(); + MES->pushSection(); std::string myCharStr; MCSectionELF *mySection; @@ -1485,7 +1485,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, } else llvm_unreachable("unexpected type of machine operand!"); - MES->SwitchSection(mySection); + MES->switchSection(mySection); unsigned byteSize = is32bit ?
4 : 8; getStreamer().emitCodeAlignment(byteSize, &getSTI(), byteSize); @@ -1526,7 +1526,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, } else llvm_unreachable("unexpected type of machine operand!"); - MES->PopSection(); + MES->popSection(); if (Sym) { MCInst TmpInst; diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp index 17adf32750db..4d5789a3c5fe 100644 --- a/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -1056,9 +1056,8 @@ void BT::runEdgeQueue(BitVector &BlockScanned) { CFGEdge Edge = FlowQ.front(); FlowQ.pop(); - if (EdgeExec.count(Edge)) + if (!EdgeExec.insert(Edge).second) return; - EdgeExec.insert(Edge); ReachedBB.insert(Edge.second); const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second); diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 3c742c98077b..58d5df4c1f71 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -14,9 +14,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -78,11 +78,12 @@ static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI, uint64_t Operand = Upper26 | Lower6; return Operand; } -static HexagonDisassembler const &disassembler(void const *Decoder) { +static HexagonDisassembler const &disassembler(const MCDisassembler *Decoder) { return *static_cast(Decoder); } template -static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) { +static void signedDecoder(MCInst &MI, unsigned tmp, + const MCDisassembler *Decoder) { HexagonDisassembler const &Disassembler = disassembler(Decoder); int64_t FullValue = fullValue(Disassembler, MI, SignExtend64(tmp)); int64_t Extended = SignExtend64<32>(FullValue); @@ -95,65 +96,66 @@ static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) { static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGeneralSubRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeHvxVRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler 
*Decoder); static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, - unsigned RegNo, + const MCDisassembler *Decoder); +static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeHvxQRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSysRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); - + const MCDisassembler *Decoder); static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t /*Address*/, const void *Decoder); + uint64_t /*Address*/, + const MCDisassembler *Decoder); static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); #include "HexagonDepDecoders.inc" #include "HexagonGenDisassemblerTables.inc" @@ -542,15 +544,15 @@ static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Fail; } -static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder); } static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { static const MCPhysReg IntRegDecoderTable[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, @@ -563,10 +565,10 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); } -static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus 
+DecodeGeneralSubRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { static const MCPhysReg GeneralSubRegDecoderTable[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7, @@ -579,7 +581,7 @@ static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst, static DecodeStatus DecodeHvxVRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { static const MCPhysReg HvxVRDecoderTable[] = { Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, @@ -592,9 +594,10 @@ static DecodeStatus DecodeHvxVRRegisterClass(MCInst &Inst, unsigned RegNo, return DecodeRegisterClass(Inst, RegNo, HvxVRDecoderTable); } -static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +static DecodeStatus +DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler *Decoder) { static const MCPhysReg DoubleRegDecoderTable[] = { Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, @@ -604,8 +607,10 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable); } -static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass( - MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { +static DecodeStatus +DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler *Decoder) { static const MCPhysReg GeneralDoubleLow8RegDecoderTable[] = { Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11}; @@ -615,7 +620,7 @@ static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass( static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { static const MCPhysReg HvxWRDecoderTable[] = { Hexagon::W0, Hexagon::WR0, Hexagon::W1, Hexagon::WR1, Hexagon::W2, Hexagon::WR2, Hexagon::W3, Hexagon::WR3, Hexagon::W4, Hexagon::WR4, @@ -629,11 +634,11 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable); } -LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. -static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. 
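// (Presumably the generated decoder tables do not reference DecodeHvxVQRRegisterClass yet, so without the attribute this unreferenced static function would trigger -Wunused-function.)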
+ static DecodeStatus + DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler *Decoder) { static const MCPhysReg HvxVQRDecoderTable[] = { Hexagon::VQ0, Hexagon::VQ1, Hexagon::VQ2, Hexagon::VQ3, Hexagon::VQ4, Hexagon::VQ5, Hexagon::VQ6, Hexagon::VQ7}; @@ -643,7 +648,7 @@ static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, Hexagon::P2, Hexagon::P3}; @@ -652,7 +657,7 @@ static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeHvxQRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { static const MCPhysReg HvxQRDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, Hexagon::Q2, Hexagon::Q3}; @@ -661,7 +666,7 @@ static DecodeStatus DecodeHvxQRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { using namespace Hexagon; static const MCPhysReg CtrlRegDecoderTable[] = { @@ -687,9 +692,9 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +static DecodeStatus +DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, + const MCDisassembler *Decoder) { using namespace Hexagon; static const MCPhysReg CtrlReg64DecoderTable[] = { @@ -717,7 +722,7 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Register = 0; switch (RegNo) { case 0: @@ -735,7 +740,7 @@ static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { HexagonDisassembler const &Disassembler = disassembler(Decoder); int64_t FullValue = fullValue(Disassembler, MI, tmp); assert(FullValue >= 0 && "Negative in unsigned decoder"); @@ -744,7 +749,8 @@ static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, } static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t /*Address*/, const void *Decoder) { + uint64_t /*Address*/, + const MCDisassembler *Decoder) { HexagonDisassembler const &Disassembler = disassembler(Decoder); unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); tmp = SignExtend64(tmp, Bits); @@ -754,7 +760,7 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, // custom decoder for various jump/call immediates static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { HexagonDisassembler const &Disassembler = disassembler(Decoder); unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); // r13_2 is not extendable, so if there are no extent bits, it's r13_2 @@ -762,7 +768,8 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, Bits = 15; uint64_t 
FullValue = fullValue(Disassembler, MI, SignExtend64(tmp, Bits)); uint32_t Extended = FullValue + Address; - if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, 0, 4)) + if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, 0, 0, + 4)) HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext()); return MCDisassembler::Success; } @@ -799,7 +806,7 @@ static const uint16_t SysRegDecoderTable[] = { static DecodeStatus DecodeSysRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= sizeof(SysRegDecoderTable) / sizeof(SysRegDecoderTable[0])) return MCDisassembler::Fail; @@ -824,9 +831,9 @@ static const uint16_t SysReg64DecoderTable[] = { Hexagon::S73_72, Hexagon::S75_74, Hexagon::S77_76, Hexagon::S79_78, }; -static DecodeStatus DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +static DecodeStatus +DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, + const MCDisassembler *Decoder) { RegNo = RegNo >> 1; if (RegNo >= sizeof(SysReg64DecoderTable) / sizeof(SysReg64DecoderTable[0])) return MCDisassembler::Fail; @@ -839,9 +846,9 @@ static DecodeStatus DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +static DecodeStatus +DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, + const MCDisassembler *Decoder) { using namespace Hexagon; static const MCPhysReg GuestRegDecoderTable[] = { @@ -865,9 +872,10 @@ static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { +static DecodeStatus +DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler *Decoder) { using namespace Hexagon; static const MCPhysReg GuestReg64DecoderTable[] = { diff --git a/llvm/lib/Target/Hexagon/HexagonArch.h b/llvm/lib/Target/Hexagon/HexagonArch.h deleted file mode 100644 index 4a42ec98feb1..000000000000 --- a/llvm/lib/Target/Hexagon/HexagonArch.h +++ /dev/null @@ -1,31 +0,0 @@ -//===- HexagonArch.h ------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H -#define LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringRef.h" -#include "HexagonDepArch.h" -#include - -namespace llvm { -namespace Hexagon { - -template -llvm::Optional GetCpu(ArchCont const &ArchList, Val CPUString) { - llvm::Optional Res; - auto Entry = ArchList.find(CPUString); - if (Entry != ArchList.end()) - Res = Entry->second; - return Res; -} -} // namespace Hexagon -} // namespace llvm -#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 411078052e0f..48d339234e9e 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -202,7 +202,7 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, MCSectionELF *Section = OutStreamer.getContext().getELFSection( sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); - OutStreamer.SwitchSection(Section); + OutStreamer.switchSection(Section); Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName)); if (Sym->isUndefined()) { @@ -231,7 +231,7 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, MCSectionELF *Section = OutStreamer.getContext().getELFSection( ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); - OutStreamer.SwitchSection(Section); + OutStreamer.switchSection(Section); Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName)); if (Sym->isUndefined()) { OutStreamer.emitLabel(Sym); @@ -331,7 +331,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8, getSubtargetInfo()); - OutStreamer->SwitchSection(Current.first, Current.second); + OutStreamer->switchSection(Current.first, Current.second); MCInst TmpInst; MCOperand &Reg = MappedInst.getOperand(0); TmpInst.setOpcode(Hexagon::L2_loadrdgp); @@ -348,7 +348,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, MCSectionSubPair Current = OutStreamer->getCurrentSection(); MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4, getSubtargetInfo()); - OutStreamer->SwitchSection(Current.first, Current.second); + OutStreamer->switchSection(Current.first, Current.second); MCInst TmpInst; MCOperand &Reg = MappedInst.getOperand(0); TmpInst.setOpcode(Hexagon::L2_loadrigp); diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index b2a842233bb8..673b397ef3c5 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,9 @@ static cl::opt MaxBitSplit("hexbit-max-bitsplit", cl::Hidden, cl::init(std::numeric_limits::max())); static unsigned CountBitSplit = 0; +static cl::opt RegisterSetLimit("hexbit-registerset-limit", + cl::Hidden, cl::init(1000)); + namespace llvm { void initializeHexagonBitSimplifyPass(PassRegistry& Registry); @@ -72,23 +76,29 @@ namespace llvm { namespace { // Set of virtual registers, based on BitVector. 
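// The rewrite below replaces the private BitVector inheritance with a wrapper that bounds the set's footprint: insert() appends each newly-set index to the LRU deque, and once the deque grows past RegisterSetLimit the oldest index is evicted from both the deque and the backing BitVector.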
- struct RegisterSet : private BitVector { + struct RegisterSet { RegisterSet() = default; - explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + explicit RegisterSet(unsigned s, bool t = false) : Bits(s, t) {} RegisterSet(const RegisterSet &RS) = default; - using BitVector::clear; - using BitVector::count; + void clear() { + Bits.clear(); + LRU.clear(); + } + + unsigned count() const { + return Bits.count(); + } unsigned find_first() const { - int First = BitVector::find_first(); + int First = Bits.find_first(); if (First < 0) return 0; return x2v(First); } unsigned find_next(unsigned Prev) const { - int Next = BitVector::find_next(v2x(Prev)); + int Next = Bits.find_next(v2x(Prev)); if (Next < 0) return 0; return x2v(Next); @@ -97,54 +107,72 @@ namespace { RegisterSet &insert(unsigned R) { unsigned Idx = v2x(R); ensure(Idx); - return static_cast(BitVector::set(Idx)); + bool Exists = Bits.test(Idx); + Bits.set(Idx); + if (!Exists) { + LRU.push_back(Idx); + if (LRU.size() > RegisterSetLimit) { + unsigned T = LRU.front(); + Bits.reset(T); + LRU.pop_front(); + } + } + return *this; } RegisterSet &remove(unsigned R) { unsigned Idx = v2x(R); - if (Idx >= size()) - return *this; - return static_cast(BitVector::reset(Idx)); + if (Idx < Bits.size()) { + bool Exists = Bits.test(Idx); + Bits.reset(Idx); + if (Exists) { + auto F = llvm::find(LRU, Idx); + assert(F != LRU.end()); + LRU.erase(F); + } + } + return *this; } RegisterSet &insert(const RegisterSet &Rs) { - return static_cast(BitVector::operator|=(Rs)); + for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) + insert(R); + return *this; } RegisterSet &remove(const RegisterSet &Rs) { - return static_cast(BitVector::reset(Rs)); + for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) + remove(R); + return *this; } - reference operator[](unsigned R) { - unsigned Idx = v2x(R); - ensure(Idx); - return BitVector::operator[](Idx); - } bool operator[](unsigned R) const { unsigned Idx = v2x(R); - assert(Idx < size()); - return BitVector::operator[](Idx); + return Idx < Bits.size() ? Bits[Idx] : false; } bool has(unsigned R) const { unsigned Idx = v2x(R); - if (Idx >= size()) + if (Idx >= Bits.size()) return false; - return BitVector::test(Idx); + return Bits.test(Idx); } bool empty() const { - return !BitVector::any(); + return !Bits.any(); } bool includes(const RegisterSet &Rs) const { - // A.BitVector::test(B) <=> A-B != {} - return !Rs.BitVector::test(*this); + // A.test(B) <=> A-B != {} + return !Rs.Bits.test(Bits); } bool intersects(const RegisterSet &Rs) const { - return BitVector::anyCommon(Rs); + return Bits.anyCommon(Rs.Bits); } private: + BitVector Bits; + std::deque LRU; + void ensure(unsigned Idx) { - if (size() <= Idx) - resize(std::max(Idx+1, 32U)); + if (Bits.size() <= Idx) + Bits.resize(std::max(Idx+1, 32U)); } static inline unsigned v2x(unsigned v) { @@ -1997,7 +2025,7 @@ bool BitSimplification::genStoreImmediate(MachineInstr *MI) { if (!isInt<8>(V)) return false; - MI->RemoveOperand(2); + MI->removeOperand(2); switch (Opc) { case Hexagon::S2_storerb_io: MI->setDesc(HII.get(Hexagon::S4_storeirb_io)); diff --git a/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp index faa48211cd82..ca7fddb0ebe5 100644 --- a/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp @@ -33,8 +33,9 @@ using namespace llvm; // Since we have no exact knowledge of code layout, allow some safety buffer // for jump target. 
This is measured in bytes. -static cl::opt BranchRelaxSafetyBuffer("branch-relax-safety-buffer", - cl::init(200), cl::Hidden, cl::ZeroOrMore, cl::desc("safety buffer size")); +static cl::opt + BranchRelaxSafetyBuffer("branch-relax-safety-buffer", cl::init(200), + cl::Hidden, cl::desc("safety buffer size")); namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index fc5e05d8c9a0..2fe2e032714a 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -52,13 +52,12 @@ using namespace llvm; static cl::opt OptSpeculate("commgep-speculate", cl::init(true), - cl::Hidden, cl::ZeroOrMore); + cl::Hidden); -static cl::opt OptEnableInv("commgep-inv", cl::init(true), cl::Hidden, - cl::ZeroOrMore); +static cl::opt OptEnableInv("commgep-inv", cl::init(true), cl::Hidden); static cl::opt OptEnableConst("commgep-const", cl::init(true), - cl::Hidden, cl::ZeroOrMore); + cl::Hidden); namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index d8af35cbf3a8..56fb50cdb09e 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -9,6 +9,7 @@ #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,12 +29,13 @@ using namespace llvm; -static cl::opt CountThreshold("hexagon-cext-threshold", - cl::init(3), cl::Hidden, cl::ZeroOrMore, - cl::desc("Minimum number of extenders to trigger replacement")); +static cl::opt CountThreshold( + "hexagon-cext-threshold", cl::init(3), cl::Hidden, + cl::desc("Minimum number of extenders to trigger replacement")); -static cl::opt ReplaceLimit("hexagon-cext-limit", cl::init(0), - cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum number of replacements")); +static cl::opt + ReplaceLimit("hexagon-cext-limit", cl::init(0), cl::Hidden, + cl::desc("Maximum number of replacements")); namespace llvm { void initializeHexagonConstExtendersPass(PassRegistry&); diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index 105bf2811a20..8029dcff8052 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -868,8 +868,8 @@ void MachineConstPropagator::removeCFGEdge(MachineBasicBlock *From, int N = PN.getNumOperands() - 2; while (N > 0) { if (PN.getOperand(N + 1).getMBB() == From) { - PN.RemoveOperand(N + 1); - PN.RemoveOperand(N); + PN.removeOperand(N + 1); + PN.removeOperand(N); } N -= 2; } @@ -1217,8 +1217,8 @@ bool MachineConstEvaluator::evaluateCMPii(uint32_t Cmp, const APInt &A1, unsigned W2 = A2.getBitWidth(); unsigned MaxW = (W1 >= W2) ? W1 : W2; if (Cmp & Comparison::U) { - const APInt Zx1 = A1.zextOrSelf(MaxW); - const APInt Zx2 = A2.zextOrSelf(MaxW); + APInt Zx1 = A1.zext(MaxW); + APInt Zx2 = A2.zext(MaxW); if (Cmp & Comparison::L) Result = Zx1.ult(Zx2); else if (Cmp & Comparison::G) @@ -1227,8 +1227,8 @@ bool MachineConstEvaluator::evaluateCMPii(uint32_t Cmp, const APInt &A1, } // Signed comparison. 
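// Context for the zextOrSelf/sextOrSelf replacements in this hunk: APInt::zext/sext now accept a target width equal to the current width and act as the identity in that case, so the *OrSelf variants are redundant here, where MaxW is at least as wide as either operand.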
- const APInt Sx1 = A1.sextOrSelf(MaxW); - const APInt Sx2 = A2.sextOrSelf(MaxW); + APInt Sx1 = A1.sext(MaxW); + APInt Sx2 = A2.sext(MaxW); if (Cmp & Comparison::L) Result = Sx1.slt(Sx2); else if (Cmp & Comparison::G) @@ -1813,7 +1813,7 @@ bool MachineConstEvaluator::evaluateSplati(const APInt &A1, unsigned Bits, unsigned Count, APInt &Result) { assert(Count > 0); unsigned BW = A1.getBitWidth(), SW = Count*Bits; - APInt LoBits = (Bits < BW) ? A1.trunc(Bits) : A1.zextOrSelf(Bits); + APInt LoBits = (Bits < BW) ? A1.trunc(Bits) : A1.zext(Bits); if (Count > 1) LoBits = LoBits.zext(SW); @@ -2510,7 +2510,7 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX, void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) { MI.setDesc(HII.get(Hexagon::A2_nop)); while (MI.getNumOperands() > 0) - MI.RemoveOperand(0); + MI.removeOperand(0); } bool HexagonConstEvaluator::evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg RH, @@ -2538,9 +2538,9 @@ bool HexagonConstEvaluator::evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg } for (unsigned i = 0; i < HiVs.size(); ++i) { - APInt HV = HiVs[i].zextOrSelf(64) << 32; + APInt HV = HiVs[i].zext(64) << 32; for (unsigned j = 0; j < LoVs.size(); ++j) { - APInt LV = LoVs[j].zextOrSelf(64); + APInt LV = LoVs[j].zext(64); const Constant *C = intToConst(HV | LV); Result.add(C); if (Result.isBottom()) @@ -3165,7 +3165,7 @@ bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI, .addMBB(TargetB); BrI.setDesc(JD); while (BrI.getNumOperands() > 0) - BrI.RemoveOperand(0); + BrI.removeOperand(0); // This ensures that all implicit operands (e.g. implicit-def %r31, etc) // are present in the rewritten branch. for (auto &Op : NI->operands()) diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 2ee7f1325df9..dc5b674424c8 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -33,16 +33,14 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-copy-combine" -static -cl::opt IsCombinesDisabled("disable-merge-into-combines", - cl::Hidden, cl::ZeroOrMore, - cl::init(false), - cl::desc("Disable merging into combines")); -static -cl::opt IsConst64Disabled("disable-const64", - cl::Hidden, cl::ZeroOrMore, - cl::init(false), - cl::desc("Disable generation of const64")); +static cl::opt + IsCombinesDisabled("disable-merge-into-combines", cl::Hidden, + + cl::desc("Disable merging into combines")); +static cl::opt + IsConst64Disabled("disable-const64", cl::Hidden, + + cl::desc("Disable generation of const64")); static cl::opt MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store", diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 56174dc7e136..41ce5c465d41 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -12,82 +12,28 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/ELF.h" - -#include -#include +#include "llvm/ADT/StringSwitch.h" namespace llvm { namespace Hexagon { enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68, V69 }; -static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68, 69}; -static constexpr ArrayRef ArchValsNum(ArchValsNumArray); - -static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", 
"v60", "v62", "v65", "v66", "v67", "v68", "v69" }; -static constexpr ArrayRef ArchValsText(ArchValsTextArray); - -static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68", "hexagonv69" }; -static constexpr ArrayRef CpuValsText(CpuValsTextArray); - -static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68", "v69" }; -static constexpr ArrayRef CpuNickText(CpuNickTextArray); - -static const std::map CpuTable{ - {"generic", Hexagon::ArchEnum::V5}, - {"hexagonv5", Hexagon::ArchEnum::V5}, - {"hexagonv55", Hexagon::ArchEnum::V55}, - {"hexagonv60", Hexagon::ArchEnum::V60}, - {"hexagonv62", Hexagon::ArchEnum::V62}, - {"hexagonv65", Hexagon::ArchEnum::V65}, - {"hexagonv66", Hexagon::ArchEnum::V66}, - {"hexagonv67", Hexagon::ArchEnum::V67}, - {"hexagonv67t", Hexagon::ArchEnum::V67}, - {"hexagonv68", Hexagon::ArchEnum::V68}, - {"hexagonv69", Hexagon::ArchEnum::V69}, -}; - -static const std::map ElfFlagsByCpuStr = { - {"generic", llvm::ELF::EF_HEXAGON_MACH_V5}, - {"hexagonv5", llvm::ELF::EF_HEXAGON_MACH_V5}, - {"hexagonv55", llvm::ELF::EF_HEXAGON_MACH_V55}, - {"hexagonv60", llvm::ELF::EF_HEXAGON_MACH_V60}, - {"hexagonv62", llvm::ELF::EF_HEXAGON_MACH_V62}, - {"hexagonv65", llvm::ELF::EF_HEXAGON_MACH_V65}, - {"hexagonv66", llvm::ELF::EF_HEXAGON_MACH_V66}, - {"hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67}, - {"hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T}, - {"hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68}, - {"hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69}, -}; -static const std::map ElfArchByMachFlags = { - {llvm::ELF::EF_HEXAGON_MACH_V5, "V5"}, - {llvm::ELF::EF_HEXAGON_MACH_V55, "V55"}, - {llvm::ELF::EF_HEXAGON_MACH_V60, "V60"}, - {llvm::ELF::EF_HEXAGON_MACH_V62, "V62"}, - {llvm::ELF::EF_HEXAGON_MACH_V65, "V65"}, - {llvm::ELF::EF_HEXAGON_MACH_V66, "V66"}, - {llvm::ELF::EF_HEXAGON_MACH_V67, "V67"}, - {llvm::ELF::EF_HEXAGON_MACH_V67T, "V67T"}, - {llvm::ELF::EF_HEXAGON_MACH_V68, "V68"}, - {llvm::ELF::EF_HEXAGON_MACH_V69, "V69"}, -}; -static const std::map ElfCpuByMachFlags = { - {llvm::ELF::EF_HEXAGON_MACH_V5, "hexagonv5"}, - {llvm::ELF::EF_HEXAGON_MACH_V55, "hexagonv55"}, - {llvm::ELF::EF_HEXAGON_MACH_V60, "hexagonv60"}, - {llvm::ELF::EF_HEXAGON_MACH_V62, "hexagonv62"}, - {llvm::ELF::EF_HEXAGON_MACH_V65, "hexagonv65"}, - {llvm::ELF::EF_HEXAGON_MACH_V66, "hexagonv66"}, - {llvm::ELF::EF_HEXAGON_MACH_V67, "hexagonv67"}, - {llvm::ELF::EF_HEXAGON_MACH_V67T, "hexagonv67t"}, - {llvm::ELF::EF_HEXAGON_MACH_V68, "hexagonv68"}, - {llvm::ELF::EF_HEXAGON_MACH_V69, "hexagonv69"}, -}; - +inline Optional getCpu(StringRef CPU) { + return StringSwitch>(CPU) + .Case("generic", Hexagon::ArchEnum::V5) + .Case("hexagonv5", Hexagon::ArchEnum::V5) + .Case("hexagonv55", Hexagon::ArchEnum::V55) + .Case("hexagonv60", Hexagon::ArchEnum::V60) + .Case("hexagonv62", Hexagon::ArchEnum::V62) + .Case("hexagonv65", Hexagon::ArchEnum::V65) + .Case("hexagonv66", Hexagon::ArchEnum::V66) + .Case("hexagonv67", Hexagon::ArchEnum::V67) + .Case("hexagonv67t", Hexagon::ArchEnum::V67) + .Case("hexagonv68", Hexagon::ArchEnum::V68) + .Case("hexagonv69", Hexagon::ArchEnum::V69) + .Default(None); +} } // namespace Hexagon -} // namespace llvm; +} // namespace llvm #endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H diff --git a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc index 7164af3ad5c6..e979cfe6e325 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc +++ b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc @@ -14,58 +14,58 @@ #pragma clang diagnostic ignored "-Wunused-function" #endif -static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<12>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<13>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<14>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<3>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<4>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<5>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<7>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<9>(MI, tmp, Decoder); return MCDisassembler::Success; } -static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { +static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, + const MCDisassembler *Decoder) { signedDecoder<8>(MI, tmp, Decoder); return MCDisassembler::Success; } diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 2207925ceeba..f7227dca3b60 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -826,8 +826,8 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, 
FR = RO.getReg(), FSR = RO.getSubReg(); else continue; - PN->RemoveOperand(i+1); - PN->RemoveOperand(i); + PN->removeOperand(i+1); + PN->removeOperand(i); } if (TR == 0) TR = SR, TSR = SSR; diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 2693940bb1e9..853553f57ba4 100644 --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -696,7 +696,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI, MI.setDesc(HII->get(TargetOpcode::COPY)); unsigned S = getRegState(ST); while (MI.getNumOperands() > 1) - MI.RemoveOperand(MI.getNumOperands()-1); + MI.removeOperand(MI.getNumOperands()-1); MachineFunction &MF = *MI.getParent()->getParent(); MachineInstrBuilder(MF, MI).addReg(RT.Reg, S, RT.Sub); return true; diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 989a98571434..0b4a95bc9ce5 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -152,33 +152,38 @@ using namespace llvm; static cl::opt DisableDeallocRet("disable-hexagon-dealloc-ret", cl::Hidden, cl::desc("Disable Dealloc Return for Hexagon target")); -static cl::opt NumberScavengerSlots("number-scavenger-slots", - cl::Hidden, cl::desc("Set the number of scavenger slots"), cl::init(2), - cl::ZeroOrMore); - -static cl::opt SpillFuncThreshold("spill-func-threshold", - cl::Hidden, cl::desc("Specify O2(not Os) spill func threshold"), - cl::init(6), cl::ZeroOrMore); - -static cl::opt SpillFuncThresholdOs("spill-func-threshold-Os", - cl::Hidden, cl::desc("Specify Os spill func threshold"), - cl::init(1), cl::ZeroOrMore); - -static cl::opt EnableStackOVFSanitizer("enable-stackovf-sanitizer", - cl::Hidden, cl::desc("Enable runtime checks for stack overflow."), - cl::init(false), cl::ZeroOrMore); - -static cl::opt EnableShrinkWrapping("hexagon-shrink-frame", - cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Enable stack frame shrink wrapping")); - -static cl::opt ShrinkLimit("shrink-frame-limit", - cl::init(std::numeric_limits::max()), cl::Hidden, cl::ZeroOrMore, - cl::desc("Max count of stack frame shrink-wraps")); - -static cl::opt EnableSaveRestoreLong("enable-save-restore-long", - cl::Hidden, cl::desc("Enable long calls for save-restore stubs."), - cl::init(false), cl::ZeroOrMore); +static cl::opt + NumberScavengerSlots("number-scavenger-slots", cl::Hidden, + cl::desc("Set the number of scavenger slots"), + cl::init(2)); + +static cl::opt + SpillFuncThreshold("spill-func-threshold", cl::Hidden, + cl::desc("Specify O2(not Os) spill func threshold"), + cl::init(6)); + +static cl::opt + SpillFuncThresholdOs("spill-func-threshold-Os", cl::Hidden, + cl::desc("Specify Os spill func threshold"), + cl::init(1)); + +static cl::opt EnableStackOVFSanitizer( + "enable-stackovf-sanitizer", cl::Hidden, + cl::desc("Enable runtime checks for stack overflow."), cl::init(false)); + +static cl::opt + EnableShrinkWrapping("hexagon-shrink-frame", cl::init(true), cl::Hidden, + cl::desc("Enable stack frame shrink wrapping")); + +static cl::opt + ShrinkLimit("shrink-frame-limit", + cl::init(std::numeric_limits::max()), cl::Hidden, + cl::desc("Max count of stack frame shrink-wraps")); + +static cl::opt + EnableSaveRestoreLong("enable-save-restore-long", cl::Hidden, + cl::desc("Enable long calls for save-restore stubs."), + cl::init(false)); static cl::opt EliminateFramePointer("hexagon-fp-elim", 
cl::init(true), cl::Hidden, cl::desc("Refrain from using FP whenever possible")); @@ -1018,7 +1023,7 @@ findCFILocation(MachineBasicBlock &B) { void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { for (auto &B : MF) { auto At = findCFILocation(B); - if (At.hasValue()) + if (At) insertCFIInstructionsAt(B, At.getValue()); } } diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 0bb1658e7698..44f21dbacd3c 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -47,34 +47,36 @@ using namespace llvm; -static cl::opt VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), - cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation.")); +static cl::opt + VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), cl::Hidden, + cl::desc("Vreg# cutoff for insert generation.")); // The distance cutoff is selected based on the precheckin-perf results: // cutoffs 20, 25, 35, and 40 are worse than 30. -static cl::opt VRegDistCutoff("insert-dist-cutoff", cl::init(30U), - cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert " - "generation.")); +static cl::opt + VRegDistCutoff("insert-dist-cutoff", cl::init(30U), cl::Hidden, + cl::desc("Vreg distance cutoff for insert " + "generation.")); // Limit the container sizes for extreme cases where we run out of memory. -static cl::opt MaxORLSize("insert-max-orl", cl::init(4096), - cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of OrderedRegisterList")); +static cl::opt + MaxORLSize("insert-max-orl", cl::init(4096), cl::Hidden, + cl::desc("Maximum size of OrderedRegisterList")); static cl::opt MaxIFMSize("insert-max-ifmap", cl::init(1024), - cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of IFMap")); - -static cl::opt OptTiming("insert-timing", cl::init(false), cl::Hidden, - cl::ZeroOrMore, cl::desc("Enable timing of insert generation")); -static cl::opt OptTimingDetail("insert-timing-detail", cl::init(false), - cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert " - "generation")); - -static cl::opt OptSelectAll0("insert-all0", cl::init(false), cl::Hidden, - cl::ZeroOrMore); -static cl::opt OptSelectHas0("insert-has0", cl::init(false), cl::Hidden, - cl::ZeroOrMore); + cl::Hidden, + cl::desc("Maximum size of IFMap")); + +static cl::opt OptTiming("insert-timing", cl::Hidden, + cl::desc("Enable timing of insert generation")); +static cl::opt + OptTimingDetail("insert-timing-detail", cl::Hidden, + cl::desc("Enable detailed timing of insert " + "generation")); + +static cl::opt OptSelectAll0("insert-all0", cl::init(false), cl::Hidden); +static cl::opt OptSelectHas0("insert-has0", cl::init(false), cl::Hidden); // Whether to construct constant values via "insert". Could eliminate constant // extenders, but often not practical. -static cl::opt OptConst("insert-const", cl::init(false), cl::Hidden, - cl::ZeroOrMore); +static cl::opt OptConst("insert-const", cl::init(false), cl::Hidden); // The preprocessor gets confused when the DEBUG macro is passed larger // chunks of code. Use this function to detect debugging. 
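The cl::opt rewrites in this file, and throughout the rest of the patch, all delete `cl::ZeroOrMore`: in this LLVM cycle, command-line options stopped erroring on repeated occurrences, so the flag became a no-op for `cl::opt`. Several hunks also drop `cl::init(false)` for bool options, whose default-constructed value is already false. A sketch of the resulting minimal declaration (name and values hypothetical):

    static cl::opt<unsigned>
        ExampleCutoff("example-cutoff", cl::Hidden, cl::init(30),
                      cl::desc("Hypothetical option; repeated occurrences "
                               "are now accepted by default"));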
@@ -92,11 +94,8 @@ namespace { struct RegisterSet : private BitVector { RegisterSet() = default; explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} - RegisterSet(const RegisterSet &RS) : BitVector(RS) {} - RegisterSet &operator=(const RegisterSet &RS) { - BitVector::operator=(RS); - return *this; - } + RegisterSet(const RegisterSet &RS) = default; + RegisterSet &operator=(const RegisterSet &RS) = default; using BitVector::clear; diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 43afae441457..acc0bb8941c1 100644 --- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -81,9 +81,9 @@ static cl::opt HWCreatePreheader("hexagon-hwloop-preheader", // Turn it off by default. If a preheader block is not created here, the // software pipeliner may be unable to find a block suitable to serve as // a preheader. In that case SWP will not run. -static cl::opt SpecPreheader("hwloop-spec-preheader", cl::init(false), - cl::Hidden, cl::ZeroOrMore, cl::desc("Allow speculation of preheader " - "instructions")); +static cl::opt SpecPreheader("hwloop-spec-preheader", cl::Hidden, + cl::desc("Allow speculation of preheader " + "instructions")); STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); @@ -1911,8 +1911,8 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( for (int i = PN->getNumOperands()-2; i > 0; i -= 2) { MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB(); if (PredB != Latch) { - PN->RemoveOperand(i+1); - PN->RemoveOperand(i); + PN->removeOperand(i+1); + PN->removeOperand(i); } } PN->addOperand(MachineOperand::CreateReg(NewPR, false)); diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp index e2215c9900d0..577eccd25c19 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp @@ -106,7 +106,7 @@ bool HexagonHazardRecognizer::isNewStore(MachineInstr &MI) { if (!TII->mayBeNewStore(MI)) return false; MachineOperand &MO = MI.getOperand(MI.getNumOperands() - 1); - return (MO.isReg() && RegDefs.count(MO.getReg()) != 0); + return MO.isReg() && RegDefs.contains(MO.getReg()); } void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 161768b8dc22..b4979c953516 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1345,7 +1345,8 @@ inline bool HexagonDAGToDAGISel::SelectAnyInt(SDValue &N, SDValue &R) { EVT T = N.getValueType(); if (!T.isInteger() || T.getSizeInBits() != 32 || !isa(N)) return false; - R = N; + int32_t V = cast(N)->getZExtValue(); + R = CurDAG->getTargetConstant(V, SDLoc(N), N.getValueType()); return true; } @@ -1540,7 +1541,7 @@ bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, break; case ISD::AND: { // Check if this is an AND with NumBits of lower bits set to 1. - uint64_t Mask = (1 << NumBits) - 1; + uint64_t Mask = (1ULL << NumBits) - 1; if (ConstantSDNode *C = dyn_cast(Val.getOperand(0))) { if (C->getZExtValue() == Mask) { Src = Val.getOperand(1); @@ -1558,7 +1559,7 @@ bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, case ISD::OR: case ISD::XOR: { // OR/XOR with the lower NumBits bits set to 0. 
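Both `keepsLowBits` fixes here address the same bug: with a plain `1`, the shift is performed in 32-bit `int`, which is undefined behavior once `NumBits` reaches 32. Widening the shifted operand first makes the mask correct for any width up to 63:

    unsigned NumBits = 40;                   // any value >= 32 triggers the bug
    // uint64_t Mask = (1 << NumBits) - 1;   // UB: shifts a 32-bit int
    uint64_t Mask = (1ULL << NumBits) - 1;   // well-defined for NumBits <= 63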
- uint64_t Mask = (1 << NumBits) - 1; + uint64_t Mask = (1ULL << NumBits) - 1; if (ConstantSDNode *C = dyn_cast(Val.getOperand(0))) { if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(1); @@ -1580,7 +1581,7 @@ bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, } bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const { - return N->getAlignment() >= N->getMemoryVT().getStoreSize(); + return N->getAlign().value() >= N->getMemoryVT().getStoreSize(); } bool HexagonDAGToDAGISel::isSmallStackStore(const StoreSDNode *N) const { @@ -1655,7 +1656,7 @@ struct WeightedLeaf { int Weight; int InsertionOrder; - WeightedLeaf() : Value(SDValue()) { } + WeightedLeaf() {} WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) : Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) { diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 0a6dd727eb82..0848d30e7403 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -801,7 +801,7 @@ static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) { return static_cast(G.getTargetLoweringInfo()); } static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) { - return static_cast(G.getSubtarget()); + return G.getSubtarget(); } namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index d7ca934a23e6..94411b2e4f98 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -72,41 +72,41 @@ static cl::opt EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, cl::desc("Control jump table emission on Hexagon target")); -static cl::opt EnableHexSDNodeSched("enable-hexagon-sdnode-sched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enable Hexagon SDNode scheduling")); +static cl::opt + EnableHexSDNodeSched("enable-hexagon-sdnode-sched", cl::Hidden, + cl::desc("Enable Hexagon SDNode scheduling")); -static cl::opt EnableFastMath("ffast-math", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enable Fast Math processing")); +static cl::opt EnableFastMath("ffast-math", cl::Hidden, + cl::desc("Enable Fast Math processing")); -static cl::opt MinimumJumpTables("minimum-jump-tables", - cl::Hidden, cl::ZeroOrMore, cl::init(5), - cl::desc("Set minimum jump tables")); +static cl::opt MinimumJumpTables("minimum-jump-tables", cl::Hidden, + cl::init(5), + cl::desc("Set minimum jump tables")); -static cl::opt MaxStoresPerMemcpyCL("max-store-memcpy", - cl::Hidden, cl::ZeroOrMore, cl::init(6), - cl::desc("Max #stores to inline memcpy")); +static cl::opt + MaxStoresPerMemcpyCL("max-store-memcpy", cl::Hidden, cl::init(6), + cl::desc("Max #stores to inline memcpy")); -static cl::opt MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os", - cl::Hidden, cl::ZeroOrMore, cl::init(4), - cl::desc("Max #stores to inline memcpy")); +static cl::opt + MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os", cl::Hidden, cl::init(4), + cl::desc("Max #stores to inline memcpy")); -static cl::opt MaxStoresPerMemmoveCL("max-store-memmove", - cl::Hidden, cl::ZeroOrMore, cl::init(6), - cl::desc("Max #stores to inline memmove")); +static cl::opt + MaxStoresPerMemmoveCL("max-store-memmove", cl::Hidden, cl::init(6), + cl::desc("Max #stores to inline memmove")); -static cl::opt MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os", - 
cl::Hidden, cl::ZeroOrMore, cl::init(4), - cl::desc("Max #stores to inline memmove")); +static cl::opt + MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os", cl::Hidden, + cl::init(4), + cl::desc("Max #stores to inline memmove")); -static cl::opt MaxStoresPerMemsetCL("max-store-memset", - cl::Hidden, cl::ZeroOrMore, cl::init(8), - cl::desc("Max #stores to inline memset")); +static cl::opt + MaxStoresPerMemsetCL("max-store-memset", cl::Hidden, cl::init(8), + cl::desc("Max #stores to inline memset")); -static cl::opt MaxStoresPerMemsetOptSizeCL("max-store-memset-Os", - cl::Hidden, cl::ZeroOrMore, cl::init(4), - cl::desc("Max #stores to inline memset")); +static cl::opt + MaxStoresPerMemsetOptSizeCL("max-store-memset-Os", cl::Hidden, cl::init(4), + cl::desc("Max #stores to inline memset")); static cl::opt AlignLoads("hexagon-align-loads", cl::Hidden, cl::init(false), @@ -1396,10 +1396,9 @@ HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag); InFlag = Chain.getValue(1); - unsigned Flags = - static_cast(DAG.getSubtarget()).useLongCalls() - ? HexagonII::MO_GDPLT | HexagonII::HMOTF_ConstExtended - : HexagonII::MO_GDPLT; + unsigned Flags = DAG.getSubtarget().useLongCalls() + ? HexagonII::MO_GDPLT | HexagonII::HMOTF_ConstExtended + : HexagonII::MO_GDPLT; return GetDynamicTLSAddr(DAG, Chain, GA, InFlag, PtrVT, Hexagon::R0, Flags); @@ -2164,6 +2163,11 @@ HexagonTargetLowering::getPreferredVectorAction(MVT VT) const { // Always widen (remaining) vectors of i1. if (ElemTy == MVT::i1) return TargetLoweringBase::TypeWidenVector; + // Widen non-power-of-2 vectors. Such types cannot be split right now, + // and computeRegisterProperties will override "split" with "widen", + // which can cause other issues. + if (!isPowerOf2_32(VecLen)) + return TargetLoweringBase::TypeWidenVector; return TargetLoweringBase::TypeSplitVector; } @@ -2423,16 +2427,25 @@ HexagonTargetLowering::buildVector32(ArrayRef Elem, const SDLoc &dl, llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) return getZero(dl, VecTy, DAG); - if (ElemTy == MVT::i16) { + if (ElemTy == MVT::i16 || ElemTy == MVT::f16) { assert(Elem.size() == 2); if (AllConst) { + // The 'Consts' array will have all values as integers regardless + // of the vector element type. uint32_t V = (Consts[0]->getZExtValue() & 0xFFFF) | Consts[1]->getZExtValue() << 16; - return DAG.getBitcast(MVT::v2i16, DAG.getConstant(V, dl, MVT::i32)); + return DAG.getBitcast(VecTy, DAG.getConstant(V, dl, MVT::i32)); + } + SDValue E0, E1; + if (ElemTy == MVT::f16) { + E0 = DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Elem[0]), dl, MVT::i32); + E1 = DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Elem[1]), dl, MVT::i32); + } else { + E0 = Elem[0]; + E1 = Elem[1]; } - SDValue N = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, - {Elem[1], Elem[0]}, DAG); - return DAG.getBitcast(MVT::v2i16, N); + SDValue N = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, {E1, E0}, DAG); + return DAG.getBitcast(VecTy, N); } if (ElemTy == MVT::i8) { @@ -2506,7 +2519,7 @@ HexagonTargetLowering::buildVector64(ArrayRef Elem, const SDLoc &dl, return getZero(dl, VecTy, DAG); // First try splat if possible. 
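The `buildVector32` hunk above extends the two-element combine path to `v2f16` by moving each `half` element through the integer pipeline, since `A2_combine_ll` combines two i32 low halfwords. The essential lines, restated with the `SDValue`/`MVT` spellings written out:

    // Bitcast f16 -> i16, widen to i32, then combine the two low halves.
    SDValue E0 = DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Elem[0]), dl, MVT::i32);
    SDValue E1 = DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Elem[1]), dl, MVT::i32);
    SDValue N  = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, {E1, E0}, DAG);
    return DAG.getBitcast(VecTy, N);   // VecTy is v2f16 on this path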
- if (ElemTy == MVT::i16) { + if (ElemTy == MVT::i16 || ElemTy == MVT::f16) { bool IsSplat = true; for (unsigned i = First+1; i != Num; ++i) { if (Elem[i] == Elem[First] || isUndef(Elem[i])) @@ -2516,7 +2529,9 @@ HexagonTargetLowering::buildVector64(ArrayRef Elem, const SDLoc &dl, } if (IsSplat) { // Legalize the operand of SPLAT_VECTOR - SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32); + SDValue S = ElemTy == MVT::f16 ? DAG.getBitcast(MVT::i16, Elem[First]) + : Elem[First]; + SDValue Ext = DAG.getZExtOrTrunc(S, dl, MVT::i32); return DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Ext); } } @@ -2525,8 +2540,7 @@ HexagonTargetLowering::buildVector64(ArrayRef Elem, const SDLoc &dl, if (AllConst) { uint64_t Val = 0; unsigned W = ElemTy.getSizeInBits(); - uint64_t Mask = (ElemTy == MVT::i8) ? 0xFFull - : (ElemTy == MVT::i16) ? 0xFFFFull : 0xFFFFFFFFull; + uint64_t Mask = (1ull << W) - 1; for (unsigned i = 0; i != Num; ++i) Val = (Val << W) | (Consts[Num-1-i]->getZExtValue() & Mask); SDValue V0 = DAG.getConstant(Val, dl, MVT::i64); @@ -3656,9 +3670,12 @@ HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { : AtomicExpansionKind::None; } -bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLowering::AtomicExpansionKind +HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Do not expand loads and stores that don't exceed 64 bits. - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64 + ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index f9ce7a9407aa..9561dfe8a35d 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -328,7 +328,7 @@ public: Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 0ba75a544c04..da6ad3ca2c93 100755 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -24,7 +24,6 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 }; - void HexagonTargetLowering::initializeHVXLowering() { if (Subtarget.useHVX64BOps()) { @@ -79,80 +78,85 @@ HexagonTargetLowering::initializeHVXLowering() { // Handle bitcasts of vector predicates to scalars (e.g. v32i1 to i32). // Note: v16i1 -> i16 is handled in type legalization instead of op // legalization. 
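Among the hunks above, `shouldExpandAtomicStoreInIR` moves from returning `bool` to the `AtomicExpansionKind` enum already used by the load hook, so both report an expansion strategy rather than a yes/no. The shape of the updated override, restated from the hunk:

    TargetLowering::AtomicExpansionKind
    shouldExpandAtomicStoreInIR(StoreInst *SI) const override {
      // Only stores wider than 64 bits need expansion on Hexagon.
      return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64
                 ? AtomicExpansionKind::Expand
                 : AtomicExpansionKind::None;
    }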
- setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::i32, Custom); - setOperationAction(ISD::BITCAST, MVT::i64, Custom); + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::i32, Custom); + setOperationAction(ISD::BITCAST, MVT::i64, Custom); setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); - setOperationAction(ISD::BITCAST, MVT::v128i1, Custom); - setOperationAction(ISD::BITCAST, MVT::i128, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); - setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); + setOperationAction(ISD::BITCAST, MVT::v128i1, Custom); + setOperationAction(ISD::BITCAST, MVT::i128, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) { - setOperationAction(ISD::FMINNUM, MVT::v64f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v64f16, Legal); - setOperationAction(ISD::FADD, MVT::v64f16, Legal); - setOperationAction(ISD::FSUB, MVT::v64f16, Legal); - setOperationAction(ISD::FMUL, MVT::v64f16, Legal); - setOperationAction(ISD::FADD, MVT::v32f32, Legal); - setOperationAction(ISD::FSUB, MVT::v32f32, Legal); - setOperationAction(ISD::FMUL, MVT::v32f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::v32f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v32f32, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64f16, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); - - // Handle ISD::BUILD_VECTOR for v32f32 in a custom way to generate vsplat - setOperationAction(ISD::BUILD_VECTOR, MVT::v32f32, Custom); + + static const MVT FloatV[] = { MVT::v64f16, MVT::v32f32 }; + static const MVT FloatW[] = { MVT::v128f16, MVT::v64f32 }; + + for (MVT T : FloatV) { + setOperationAction(ISD::FADD, T, Legal); + setOperationAction(ISD::FSUB, T, Legal); + setOperationAction(ISD::FMUL, T, Legal); + setOperationAction(ISD::FMINNUM, T, Legal); + setOperationAction(ISD::FMAXNUM, T, Legal); + + setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom); + + setOperationAction(ISD::SPLAT_VECTOR, T, Legal); + setOperationAction(ISD::SPLAT_VECTOR, T, Legal); + + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); + // Custom-lower BUILD_VECTOR. The standard (target-independent) + // handling of it would convert it to a load, which is not always + // the optimal choice. + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + } + // BUILD_VECTOR with f16 operands cannot be promoted without // promoting the result, so lower the node to vsplat or constant pool - setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::f16, Custom); - setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom); - setOperationAction(ISD::SPLAT_VECTOR, MVT::v64f16, Legal); - setOperationAction(ISD::SPLAT_VECTOR, MVT::v32f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::f16, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom); + // Vector shuffle is always promoted to ByteV and a bitcast to f16 is // generated. 
- setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f16, ByteV); - setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f32, ByteW); - setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v32f32, ByteV); - - // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- - // independent) handling of it would convert it to a load, which is - // not always the optimal choice. - setOperationAction(ISD::BUILD_VECTOR, MVT::v64f32, Custom); - // Make concat-vectors custom to handle concats of more than 2 vectors. - setOperationAction(ISD::CONCAT_VECTORS, MVT::v128f16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64f32, Custom); - - setOperationAction(ISD::LOAD, MVT::v64f32, Custom); - setOperationAction(ISD::STORE, MVT::v64f32, Custom); - setOperationAction(ISD::FADD, MVT::v64f32, Custom); - setOperationAction(ISD::FSUB, MVT::v64f32, Custom); - setOperationAction(ISD::FMUL, MVT::v64f32, Custom); - setOperationAction(ISD::FMINNUM, MVT::v64f32, Custom); - setOperationAction(ISD::FMAXNUM, MVT::v64f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v64f32, Custom); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v128f16, ByteW); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f16, ByteV); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f32, ByteW); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v32f32, ByteV); + + for (MVT P : FloatW) { + setOperationAction(ISD::LOAD, P, Custom); + setOperationAction(ISD::STORE, P, Custom); + setOperationAction(ISD::FADD, P, Custom); + setOperationAction(ISD::FSUB, P, Custom); + setOperationAction(ISD::FMUL, P, Custom); + setOperationAction(ISD::FMINNUM, P, Custom); + setOperationAction(ISD::FMAXNUM, P, Custom); + setOperationAction(ISD::VSELECT, P, Custom); + + // Custom-lower BUILD_VECTOR. The standard (target-independent) + // handling of it would convert it to a load, which is not always + // the optimal choice. + setOperationAction(ISD::BUILD_VECTOR, P, Custom); + // Make concat-vectors custom to handle concats of more than 2 vectors. + setOperationAction(ISD::CONCAT_VECTORS, P, Custom); + + setOperationAction(ISD::MLOAD, P, Custom); + setOperationAction(ISD::MSTORE, P, Custom); + } if (Subtarget.useHVXQFloatOps()) { setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); } else if (Subtarget.useHVXIEEEFPOps()) { setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Legal); - setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); } - - setOperationAction(ISD::MLOAD, MVT::v32f32, Custom); - setOperationAction(ISD::MSTORE, MVT::v32f32, Custom); - setOperationAction(ISD::MLOAD, MVT::v64f16, Custom); - setOperationAction(ISD::MSTORE, MVT::v64f16, Custom); - setOperationAction(ISD::MLOAD, MVT::v64f32, Custom); - setOperationAction(ISD::MSTORE, MVT::v64f32, Custom); } for (MVT T : LegalV) { @@ -382,8 +386,7 @@ HexagonTargetLowering::initializeHVXLowering() { } } - setTargetDAGCombine(ISD::SPLAT_VECTOR); - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine({ISD::SPLAT_VECTOR, ISD::VSELECT}); } unsigned @@ -780,7 +783,6 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef Values, SDValue N = HalfV0; SDValue M = HalfV1; for (unsigned i = 0; i != NumWords/2; ++i) { - // Rotate by element count since last insertion. 
if (Words[i] != Words[n] || VecHist[n] <= 1) { Sn = DAG.getConstant(Rn, dl, MVT::i32); @@ -1411,6 +1413,17 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) for (unsigned i = 0; i != Size; ++i) Ops.push_back(Op.getOperand(i)); + // First, split the BUILD_VECTOR for vector pairs. We could generate + // some pairs directly (via splat), but splats should be generated + // by the combiner prior to getting here. + if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { + ArrayRef A(Ops); + MVT SingleTy = typeSplit(VecTy).first; + SDValue V0 = buildHvxVectorReg(A.take_front(Size/2), dl, SingleTy, DAG); + SDValue V1 = buildHvxVectorReg(A.drop_front(Size/2), dl, SingleTy, DAG); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1); + } + if (VecTy.getVectorElementType() == MVT::i1) return buildHvxVectorPred(Ops, dl, VecTy, DAG); @@ -1427,14 +1440,6 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); } - if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { - ArrayRef A(Ops); - MVT SingleTy = typeSplit(VecTy).first; - SDValue V0 = buildHvxVectorReg(A.take_front(Size/2), dl, SingleTy, DAG); - SDValue V1 = buildHvxVectorReg(A.drop_front(Size/2), dl, SingleTy, DAG); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1); - } - return buildHvxVectorReg(Ops, dl, VecTy, DAG); } diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 9b4e92a16663..c8e6276aa4de 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -77,9 +77,9 @@ cl::opt ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden, static cl::opt EnableBranchPrediction("hexagon-enable-branch-prediction", cl::Hidden, cl::init(true), cl::desc("Enable branch prediction")); -static cl::opt DisableNVSchedule("disable-hexagon-nv-schedule", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable schedule adjustment for new value stores.")); +static cl::opt DisableNVSchedule( + "disable-hexagon-nv-schedule", cl::Hidden, + cl::desc("Disable schedule adjustment for new value stores.")); static cl::opt EnableTimingClassLatency( "enable-timing-class-latency", cl::Hidden, cl::init(false), @@ -94,11 +94,12 @@ static cl::opt EnableACCForwarding( cl::desc("Enable vec acc forwarding")); static cl::opt BranchRelaxAsmLarge("branch-relax-asm-large", - cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm")); + cl::init(true), cl::Hidden, + cl::desc("branch relax asm")); -static cl::opt UseDFAHazardRec("dfa-hazard-rec", - cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Use the DFA based hazard recognizer.")); +static cl::opt + UseDFAHazardRec("dfa-hazard-rec", cl::init(true), cl::Hidden, + cl::desc("Use the DFA based hazard recognizer.")); /// Constants for Hexagon instructions. const int Hexagon_MEMW_OFFSET_MAX = 4095; @@ -158,7 +159,7 @@ bool HexagonInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { auto Op = MI.getOperand(1); // If the instruction has a global address as operand, it is not cheap // since the operand will be constant extended. - if (Op.getType() == MachineOperand::MO_GlobalAddress) + if (Op.isGlobal()) return false; // If the instruction has an operand of size > 16bits, its will be // const-extended and hence, it is not cheap. 
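In the `LowerHvxBuildVector` hunk above, the vector-pair split now runs before the i1 and f16 special cases, so pair-sized builds are halved first and each half lowered independently. Restated with the `ArrayRef` element type spelled out:

    ArrayRef<SDValue> A(Ops);
    MVT SingleTy = typeSplit(VecTy).first;
    SDValue V0 = buildHvxVectorReg(A.take_front(Size / 2), dl, SingleTy, DAG);
    SDValue V1 = buildHvxVectorReg(A.drop_front(Size / 2), dl, SingleTy, DAG);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);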
@@ -1072,6 +1073,43 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   };
 
   switch (Opc) {
+  case Hexagon::PS_call_instrprof_custom: {
+    auto Op0 = MI.getOperand(0);
+    assert(Op0.isGlobal() &&
+           "First operand must be a global containing handler name.");
+    const GlobalValue *NameVar = Op0.getGlobal();
+    const GlobalVariable *GV = dyn_cast<GlobalVariable>(NameVar);
+    auto *Arr = cast<ConstantDataArray>(GV->getInitializer());
+    StringRef NameStr = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
+
+    MachineOperand &Op1 = MI.getOperand(1);
+    // Set R0 with the imm value to be passed to the custom profiling handler.
+    BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrsi), Hexagon::R0)
+        .addImm(Op1.getImm());
+    // The call to the custom handler is being treated as a special one as the
+    // callee is responsible for saving and restoring all the registers
+    // (including caller saved registers) it needs to modify. This is
+    // done to reduce the impact of instrumentation on the code being
+    // instrumented/profiled.
+    // NOTE: R14, R15 and R28 are reserved for PLT handling. These registers
+    // are in the Def list of the Hexagon::PS_call_instrprof_custom and
+    // therefore will be handled appropriately during register allocation.
+
+    // TODO: It may be a good idea to add a separate pseudo instruction for
+    // static relocation which doesn't need to reserve r14, r15 and r28.
+
+    auto MIB = BuildMI(MBB, MI, DL, get(Hexagon::J2_call))
+                   .addUse(Hexagon::R0, RegState::Implicit|RegState::InternalRead)
+                   .addDef(Hexagon::R29, RegState::ImplicitDefine)
+                   .addDef(Hexagon::R30, RegState::ImplicitDefine)
+                   .addDef(Hexagon::R14, RegState::ImplicitDefine)
+                   .addDef(Hexagon::R15, RegState::ImplicitDefine)
+                   .addDef(Hexagon::R28, RegState::ImplicitDefine);
+    const char *cstr = MF.createExternalSymbolName(NameStr);
+    MIB.addExternalSymbol(cstr);
+    MBB.erase(MI);
+    return true;
+  }
   case TargetOpcode::COPY: {
     MachineOperand &MD = MI.getOperand(0);
     MachineOperand &MS = MI.getOperand(1);
@@ -1392,8 +1430,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   // Generate a misaligned load that is guaranteed to cause a crash.
   class CrashPseudoSourceValue : public PseudoSourceValue {
   public:
-    CrashPseudoSourceValue(const TargetInstrInfo &TII)
-      : PseudoSourceValue(TargetCustom, TII) {}
+    CrashPseudoSourceValue(const TargetMachine &TM)
+      : PseudoSourceValue(TargetCustom, TM) {}
 
     bool isConstant(const MachineFrameInfo *) const override {
       return false;
@@ -1409,7 +1447,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     }
   };
 
-  static const CrashPseudoSourceValue CrashPSV(*this);
+  static const CrashPseudoSourceValue CrashPSV(MF.getTarget());
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MachinePointerInfo(&CrashPSV),
       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 8,
@@ -1662,7 +1700,7 @@ bool HexagonInstrInfo::PredicateInstruction(
   MI.setDesc(get(PredOpc));
 
   while (unsigned n = MI.getNumOperands())
-    MI.RemoveOperand(n-1);
+    MI.removeOperand(n-1);
   for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
     MI.addOperand(T->getOperand(i));
@@ -4464,6 +4502,9 @@ unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr &MI) const {
   unsigned Size = getMemAccessSizeInBytes(MemAccessSize(S));
   if (Size != 0)
     return Size;
+  // Y2_dcfetchbo is special
+  if (MI.getOpcode() == Hexagon::Y2_dcfetchbo)
+    return HexagonII::DoubleWordAccess;
 
   // Handle vector access sizes.
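The `CrashPseudoSourceValue` change above follows an LLVM-wide constructor change: `PseudoSourceValue` is now built from the `TargetMachine` rather than the `TargetInstrInfo`. A minimal sketch of a custom PSV under the new signature (class name hypothetical; the overrides mirror the virtual hooks the hunk touches):

    class ExamplePSV : public PseudoSourceValue {
    public:
      ExamplePSV(const TargetMachine &TM)
          : PseudoSourceValue(TargetCustom, TM) {}   // was: const TargetInstrInfo &
      bool isConstant(const MachineFrameInfo *) const override { return false; }
      bool isAliased(const MachineFrameInfo *) const override { return false; }
      bool mayAlias(const MachineFrameInfo *) const override { return false; }
    };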
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index ccaf1aac1ce0..2d49fa369642 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -192,10 +192,8 @@ private: void push_back(Value *V) { // Do not push back duplicates. - if (!S.count(V)) { + if (S.insert(V).second) Q.push_back(V); - S.insert(V); - } } Value *pop_front_val() { @@ -1152,9 +1150,8 @@ bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In, if (IsPhi && HadPhi) return false; HadPhi |= IsPhi; - if (Cycle.count(I)) + if (!Cycle.insert(I)) return false; - Cycle.insert(I); if (findCycle(I, In, Cycle)) break; Cycle.remove(I); @@ -1487,7 +1484,7 @@ bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB, void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) { for (auto &I : *LoopB) - if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT})) + if (Value *SV = simplifyInstruction(&I, {DL, &TLI, &DT})) I.replaceAllUsesWith(SV); for (Instruction &I : llvm::make_early_inc_range(*LoopB)) @@ -2169,7 +2166,7 @@ CleanupAndExit: SCEV::FlagNUW); Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt); if (Instruction *In = dyn_cast(NumBytes)) - if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT})) + if (Value *Simp = simplifyInstruction(In, {*DL, TLI, DT})) NumBytes = Simp; CallInst *NewCall; @@ -2279,7 +2276,7 @@ CleanupAndExit: Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty, MemmoveB->getTerminator()); if (Instruction *In = dyn_cast(NumWords)) - if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT})) + if (Value *Simp = simplifyInstruction(In, {*DL, TLI, DT})) NumWords = Simp; Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy) diff --git a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp index aabae009d7c3..539db8f55005 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp @@ -13,3 +13,9 @@ using namespace llvm; // pin vtable to this file void HexagonMachineFunctionInfo::anchor() {} +MachineFunctionInfo *HexagonMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h index 89ef5c2a891d..a02de24b176a 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h @@ -42,6 +42,10 @@ public: HexagonMachineFunctionInfo() = default; HexagonMachineFunctionInfo(MachineFunction &MF) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 8edcb745d654..f539717e42d5 100644 --- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -61,8 +61,7 @@ static cl::opt DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, "New Value Jump")); static cl::opt 
DisableNewValueJumps("disable-nvjump", cl::Hidden, - cl::ZeroOrMore, cl::init(false), - cl::desc("Disable New Value Jumps")); + cl::desc("Disable New Value Jumps")); namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 3abbd896c519..80fbf33d83b7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3273,3 +3273,9 @@ let AddedComplexity = 100 in { def: Pat<(i1 (seteq (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)), (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>; } + +def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I), + (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; + +def: Pat<(int_hexagon_instrprof_custom (HexagonCONST32 tglobaladdr:$addr), u32_0ImmPred:$I), + (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 0a3dff057ccd..6fb1313667a9 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -37,7 +37,7 @@ def SDTHexagonVINSERTW0: SDTypeProfile<1, 2, def HexagonVINSERTW0: SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>; def HwLen2: SDNodeXForm(CurDAG->getSubtarget()); + const auto &ST = CurDAG->getSubtarget(); return CurDAG->getTargetConstant(ST.getVectorLength()/2, SDLoc(N), MVT::i32); }]>; @@ -92,19 +92,19 @@ def IsVecOff : PatLeaf<(i32 imm), [{ def alignedload: PatFrag<(ops node:$a), (load $a), [{ - return isAlignedMemNode(dyn_cast(N)); + return isAlignedMemNode(cast(N)); }]>; def unalignedload: PatFrag<(ops node:$a), (load $a), [{ - return !isAlignedMemNode(dyn_cast(N)); + return !isAlignedMemNode(cast(N)); }]>; def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{ - return isAlignedMemNode(dyn_cast(N)); + return isAlignedMemNode(cast(N)); }]>; def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{ - return !isAlignedMemNode(dyn_cast(N)); + return !isAlignedMemNode(cast(N)); }]>; @@ -738,9 +738,14 @@ let Predicates = [UseHVX] in { def V2Q: OutPatFrag<(ops node:$Vs), (V6_vandvrt $Vs, (A2_tfrsi -1))>; -let Predicates = [UseHVX] in - def: Pat<(select I1:$Pu, VecI1:$Qs, VecI1:$Qt), +let Predicates = [UseHVX] in { + def: Pat<(select I1:$Pu, VecQ8:$Qs, VecQ8:$Qt), + (V2Q (PS_vselect $Pu, (Q2V $Qs), (Q2V $Qt)))>; + def: Pat<(select I1:$Pu, VecQ16:$Qs, VecQ16:$Qt), (V2Q (PS_vselect $Pu, (Q2V $Qs), (Q2V $Qt)))>; + def: Pat<(select I1:$Pu, VecQ32:$Qs, VecQ32:$Qt), + (V2Q (PS_vselect $Pu, (Q2V $Qs), (Q2V $Qt)))>; +} let Predicates = [UseHVX] in { def: Pat<(VecQ8 (qtrue)), (PS_qtrue)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index 1ff248200572..ccd90f814813 100644 --- a/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -56,21 +56,21 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-peephole" -static cl::opt DisableHexagonPeephole("disable-hexagon-peephole", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Peephole Optimization")); +static cl::opt + DisableHexagonPeephole("disable-hexagon-peephole", cl::Hidden, + cl::desc("Disable Peephole Optimization")); -static cl::opt DisablePNotP("disable-hexagon-pnotp", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Optimization of PNotP")); +static cl::opt DisablePNotP("disable-hexagon-pnotp", cl::Hidden, + cl::desc("Disable 
Optimization of PNotP"));
 
-static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
-    cl::Hidden, cl::ZeroOrMore, cl::init(true),
-    cl::desc("Disable Optimization of Sign/Zero Extends"));
+static cl::opt<bool>
+    DisableOptSZExt("disable-hexagon-optszext", cl::Hidden, cl::init(true),
+                    cl::desc("Disable Optimization of Sign/Zero Extends"));
 
-static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
-    cl::Hidden, cl::ZeroOrMore, cl::init(true),
-    cl::desc("Disable Optimization of extensions to i64."));
+static cl::opt<bool>
+    DisableOptExtTo64("disable-hexagon-opt-ext-to-64", cl::Hidden,
+                      cl::init(true),
+                      cl::desc("Disable Optimization of extensions to i64."));
 
 namespace llvm {
   FunctionPass *createHexagonPeephole();
@@ -208,14 +208,14 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
         // Try to find in the map.
         if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
           // Change the 1st operand.
-          MI.RemoveOperand(1);
+          MI.removeOperand(1);
           MI.addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
         } else {
           DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
             PeepholeDoubleRegsMap.find(SrcReg);
           if (DI != PeepholeDoubleRegsMap.end()) {
             std::pair<unsigned, unsigned> PeepholeSrc = DI->second;
-            MI.RemoveOperand(1);
+            MI.removeOperand(1);
             MI.addOperand(MachineOperand::CreateReg(
                 PeepholeSrc.first, false /*isDef*/, false /*isImp*/,
                 false /*isKill*/, false /*isDead*/, false /*isUndef*/,
diff --git a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td
index afd63d6d4aa7..7c45568f7734 100644
--- a/llvm/lib/Target/Hexagon/HexagonPseudo.td
+++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -182,6 +182,28 @@ let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
     Defs = [PC, R31, R6, R7, P0] in
 def PS_call_stk : T_Call<"">;
 
+// This pseudo instruction is used to replace int_hexagon_instrprof_custom intrinsic
+// with a call to custom handler passed as the first argument to the intrinsic.
+
+// Please Note:
+// 1) The call to the custom handler is being treated as a special one as the
+//    callee is responsible for saving and restoring all the registers it needs
+//    to modify. This includes caller saved registers as well as r0-r5 argument
+//    registers. This is done to reduce the impact of instrumentation on the
+//    code being instrumented/profiled.
+// 2) R14, R15 and R28 are reserved for PLT handling and therefore are
+//    part of the def list.
+// 3) R0 is used to pass the unique id associated with an instrumentation site
+//    to the handler.
+// 4) All the other registers (R29, R30, R31, PC) get modified by the call
+//    instruction.
+
+// TODO: It may be a good idea to add a separate pseudo instruction for
+// static relocation which doesn't need to reserve r14, r15 and r28.
+
+let hasSideEffects = 1, isCall = 1,
+    Defs = [R0, R14, R15, R28, R29, R30, R31, PC] in
+def PS_call_instrprof_custom : Pseudo<(outs), (ins s32_0Imm:$dst, u32_0Imm:$Ii), "">;
+
 // Call, no return.
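The HexagonLoopIdiomRecognition hunks earlier in the patch lean on `insert()` reporting whether the element was new, collapsing the previous lookup-then-insert pair into a single operation (`SetVector::insert` returns a plain bool, while the `SmallPtrSet`-style containers return an iterator/bool pair). A minimal sketch of the dedup idiom, with assumed container types:

    llvm::SmallPtrSet<Value *, 16> S;   // assumed; stands in for the seen-set
    std::deque<Value *> Q;
    auto pushUnique = [&](Value *V) {
      if (S.insert(V).second)           // true only on first insertion
        Q.push_back(V);
    };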
let isCall = 1, hasSideEffects = 1, cofMax1 = 1, isCodeGenOnly = 1 in def PS_callr_nr: InstHexagon<(outs), (ins IntRegs:$Rs), diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index f26e23befde2..fb6918949cce 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -201,7 +201,7 @@ void HexagonDCE::removeOperand(NodeAddr IA, unsigned OpNum) { for (NodeAddr RA : Refs) OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp()))); - MI->RemoveOperand(OpNum); + MI->removeOperand(OpNum); for (NodeAddr RA : Refs) { unsigned N = OpMap[RA.Id]; diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 6e55bc6b5c2c..f0e56d74fcd1 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -228,7 +228,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case Hexagon::PS_fia: MI.setDesc(HII.get(Hexagon::A2_addi)); MI.getOperand(FIOp).ChangeToImmediate(RealOffset); - MI.RemoveOperand(FIOp+1); + MI.removeOperand(FIOp+1); return; case Hexagon::PS_fi: // Set up the instruction for updating below. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index bdd2a2cfc5fa..2283d1b7f9c6 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -39,45 +39,46 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #include "HexagonGenSubtargetInfo.inc" -static cl::opt EnableBSBSched("enable-bsb-sched", - cl::Hidden, cl::ZeroOrMore, cl::init(true)); +static cl::opt EnableBSBSched("enable-bsb-sched", cl::Hidden, + cl::init(true)); -static cl::opt EnableTCLatencySched("enable-tc-latency-sched", - cl::Hidden, cl::ZeroOrMore, cl::init(false)); +static cl::opt EnableTCLatencySched("enable-tc-latency-sched", cl::Hidden, + cl::init(false)); -static cl::opt EnableDotCurSched("enable-cur-sched", - cl::Hidden, cl::ZeroOrMore, cl::init(true), - cl::desc("Enable the scheduler to generate .cur")); +static cl::opt + EnableDotCurSched("enable-cur-sched", cl::Hidden, cl::init(true), + cl::desc("Enable the scheduler to generate .cur")); -static cl::opt DisableHexagonMISched("disable-hexagon-misched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon MI Scheduling")); +static cl::opt + DisableHexagonMISched("disable-hexagon-misched", cl::Hidden, + cl::desc("Disable Hexagon MI Scheduling")); -static cl::opt EnableSubregLiveness("hexagon-subreg-liveness", - cl::Hidden, cl::ZeroOrMore, cl::init(true), - cl::desc("Enable subregister liveness tracking for Hexagon")); +static cl::opt EnableSubregLiveness( + "hexagon-subreg-liveness", cl::Hidden, cl::init(true), + cl::desc("Enable subregister liveness tracking for Hexagon")); -static cl::opt OverrideLongCalls("hexagon-long-calls", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("If present, forces/disables the use of long calls")); +static cl::opt OverrideLongCalls( + "hexagon-long-calls", cl::Hidden, + cl::desc("If present, forces/disables the use of long calls")); -static cl::opt EnablePredicatedCalls("hexagon-pred-calls", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Consider calls to be predicable")); +static cl::opt + EnablePredicatedCalls("hexagon-pred-calls", cl::Hidden, + cl::desc("Consider calls to be predicable")); -static cl::opt SchedPredsCloser("sched-preds-closer", - 
cl::Hidden, cl::ZeroOrMore, cl::init(true)); +static cl::opt SchedPredsCloser("sched-preds-closer", cl::Hidden, + cl::init(true)); static cl::opt SchedRetvalOptimization("sched-retval-optimization", - cl::Hidden, cl::ZeroOrMore, cl::init(true)); + cl::Hidden, cl::init(true)); -static cl::opt EnableCheckBankConflict("hexagon-check-bank-conflict", - cl::Hidden, cl::ZeroOrMore, cl::init(true), - cl::desc("Enable checking for cache bank conflicts")); +static cl::opt EnableCheckBankConflict( + "hexagon-check-bank-conflict", cl::Hidden, cl::init(true), + cl::desc("Enable checking for cache bank conflicts")); static cl::opt EnableV68FloatCodeGen( - "force-hvx-float", cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enable the code-generation for vector float instructions on v68.")); + "force-hvx-float", cl::Hidden, + cl::desc( + "Enable the code-generation for vector float instructions on v68.")); HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) @@ -95,8 +96,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, HexagonSubtarget & HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - Optional ArchVer = - Hexagon::GetCpu(Hexagon::CpuTable, CPUString); + Optional ArchVer = Hexagon::getCpu(CPUString); if (ArchVer) HexagonArchVersion = *ArchVer; else diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index db682676cf12..f6c70928c2f6 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H -#include "HexagonArch.h" +#include "HexagonDepArch.h" #include "HexagonFrameLowering.h" #include "HexagonISelLowering.h" #include "HexagonInstrInfo.h" diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index c6703bb8a62a..4e04939e6690 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -32,41 +32,44 @@ using namespace llvm; -static cl::opt EnableCExtOpt("hexagon-cext", cl::Hidden, cl::ZeroOrMore, - cl::init(true), cl::desc("Enable Hexagon constant-extender optimization")); +static cl::opt + EnableCExtOpt("hexagon-cext", cl::Hidden, cl::init(true), + cl::desc("Enable Hexagon constant-extender optimization")); -static cl::opt EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore, - cl::init(true), cl::desc("Enable RDF-based optimizations")); +static cl::opt EnableRDFOpt("rdf-opt", cl::Hidden, cl::init(true), + cl::desc("Enable RDF-based optimizations")); static cl::opt DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); -static cl::opt DisableAModeOpt("disable-hexagon-amodeopt", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon Addressing Mode Optimization")); +static cl::opt + DisableAModeOpt("disable-hexagon-amodeopt", cl::Hidden, + cl::desc("Disable Hexagon Addressing Mode Optimization")); -static cl::opt DisableHexagonCFGOpt("disable-hexagon-cfgopt", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon CFG Optimization")); +static cl::opt + DisableHexagonCFGOpt("disable-hexagon-cfgopt", cl::Hidden, + cl::desc("Disable Hexagon CFG Optimization")); -static cl::opt DisableHCP("disable-hcp", cl::init(false), cl::Hidden, - cl::ZeroOrMore, cl::desc("Disable 
Hexagon constant propagation")); +static cl::opt + DisableHCP("disable-hcp", cl::Hidden, + cl::desc("Disable Hexagon constant propagation")); static cl::opt DisableStoreWidening("disable-store-widen", cl::Hidden, cl::init(false), cl::desc("Disable store widening")); static cl::opt EnableExpandCondsets("hexagon-expand-condsets", - cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Early expansion of MUX")); + cl::init(true), cl::Hidden, + cl::desc("Early expansion of MUX")); static cl::opt EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, - cl::ZeroOrMore, cl::desc("Enable early if-conversion")); + cl::desc("Enable early if-conversion")); static cl::opt EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden, cl::desc("Generate \"insert\" instructions")); -static cl::opt EnableCommGEP("hexagon-commgep", cl::init(true), - cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions")); +static cl::opt + EnableCommGEP("hexagon-commgep", cl::init(true), cl::Hidden, + cl::desc("Enable commoning of GEP instructions")); static cl::opt EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden, cl::desc("Generate \"extract\" instructions")); @@ -78,9 +81,9 @@ static cl::opt EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " "predicate instructions")); -static cl::opt EnableLoopPrefetch("hexagon-loop-prefetch", - cl::init(false), cl::Hidden, cl::ZeroOrMore, - cl::desc("Enable loop data prefetch on Hexagon")); +static cl::opt + EnableLoopPrefetch("hexagon-loop-prefetch", cl::Hidden, + cl::desc("Enable loop data prefetch on Hexagon")); static cl::opt DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, cl::desc("Disable splitting double registers")); @@ -94,22 +97,24 @@ static cl::opt EnableLoopResched("hexagon-loop-resched", cl::init(true), static cl::opt HexagonNoOpt("hexagon-noopt", cl::init(false), cl::Hidden, cl::desc("Disable backend optimizations")); -static cl::opt EnableVectorPrint("enable-hexagon-vector-print", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enable Hexagon Vector print instr pass")); +static cl::opt + EnableVectorPrint("enable-hexagon-vector-print", cl::Hidden, + cl::desc("Enable Hexagon Vector print instr pass")); -static cl::opt EnableVExtractOpt("hexagon-opt-vextract", cl::Hidden, - cl::ZeroOrMore, cl::init(true), cl::desc("Enable vextract optimization")); +static cl::opt + EnableVExtractOpt("hexagon-opt-vextract", cl::Hidden, cl::init(true), + cl::desc("Enable vextract optimization")); -static cl::opt EnableVectorCombine("hexagon-vector-combine", cl::Hidden, - cl::ZeroOrMore, cl::init(true), cl::desc("Enable HVX vector combining")); +static cl::opt + EnableVectorCombine("hexagon-vector-combine", cl::Hidden, cl::init(true), + cl::desc("Enable HVX vector combining")); -static cl::opt EnableInitialCFGCleanup("hexagon-initial-cfg-cleanup", - cl::Hidden, cl::ZeroOrMore, cl::init(true), - cl::desc("Simplify the CFG after atomic expansion pass")); +static cl::opt EnableInitialCFGCleanup( + "hexagon-initial-cfg-cleanup", cl::Hidden, cl::init(true), + cl::desc("Simplify the CFG after atomic expansion pass")); static cl::opt EnableInstSimplify("hexagon-instsimplify", cl::Hidden, - cl::ZeroOrMore, cl::init(true), + cl::init(true), cl::desc("Enable instsimplify")); /// HexagonTargetMachineModule - Note that this is used on hosts that @@ -189,7 +194,7 @@ namespace llvm { } // end namespace llvm; static Reloc::Model getEffectiveRelocModel(Optional RM) { - return 
RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { @@ -293,12 +298,11 @@ void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { } TargetTransformInfo -HexagonTargetMachine::getTargetTransformInfo(const Function &F) { +HexagonTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(HexagonTTIImpl(this, F)); } - -HexagonTargetMachine::~HexagonTargetMachine() {} +HexagonTargetMachine::~HexagonTargetMachine() = default; namespace { /// Hexagon Code Generator Pass Configuration Options. @@ -345,6 +349,7 @@ void HexagonPassConfig::addIRPasses() { if (EnableInitialCFGCleanup) addPass(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 66679df93bd3..947df7574ab3 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -39,7 +39,7 @@ public: void adjustPassManager(PassManagerBuilder &PMB) override; void registerPassBuilderCallbacks(PassBuilder &PB) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; HexagonTargetObjectFile *getObjFileLowering() const override { return static_cast(TLOF.get()); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 7df32e4072e3..c83ed16f0272 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -41,9 +41,9 @@ static cl::opt SmallDataThreshold("hexagon-small-data-threshold", static cl::opt NoSmallDataSorting("mno-sort-sda", cl::init(false), cl::Hidden, cl::desc("Disable small data sections sorting")); -static cl::opt StaticsInSData("hexagon-statics-in-small-data", - cl::init(false), cl::Hidden, cl::ZeroOrMore, - cl::desc("Allow static variables in .sdata")); +static cl::opt + StaticsInSData("hexagon-statics-in-small-data", cl::Hidden, + cl::desc("Allow static variables in .sdata")); static cl::opt TraceGVPlacement("trace-gv-placement", cl::Hidden, cl::init(false), @@ -332,6 +332,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty, case Type::X86_MMXTyID: case Type::X86_AMXTyID: case Type::TokenTyID: + case Type::DXILPointerTyID: return 0; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 1bdd8c3c513a..bb0aaa3150fb 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -223,7 +223,8 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, int Index, - Type *SubTp) { + Type *SubTp, + ArrayRef Args) { return 1; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 9e637dfc3e16..7bbaf7ae9cb2 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -86,12 +86,11 @@ 
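Two `Optional<T>` modernizations recur in these hunks: `getEffectiveRelocModel` switches `getValueOr` to the `std::optional` spelling `value_or`, and earlier hunks (e.g. `findCFILocation`) use the contextual bool conversion instead of `hasValue()`. A minimal sketch of both, with hypothetical values:

    Optional<Reloc::Model> RM;                    // assume unset here
    if (RM)                                       // replaces RM.hasValue()
      consume(*RM);                               // hypothetical consumer
    Reloc::Model M = RM.value_or(Reloc::Static);  // replaces RM.getValueOr(...)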
public: unsigned getMinVectorRegisterBitWidth() const; ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; - bool shouldMaximizeVectorBandwidth() const { + bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { return true; } - bool supportsEfficientVectorElementLoadStore() { - return false; - } + bool supportsEfficientVectorElementLoadStore() { return false; } bool hasBranchDivergence() { return false; } @@ -125,7 +124,8 @@ public: Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp); + ArrayRef Mask, int Index, Type *SubTp, + ArrayRef Args = None); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, diff --git a/llvm/lib/Target/Hexagon/HexagonVExtract.cpp b/llvm/lib/Target/Hexagon/HexagonVExtract.cpp index b5f06ebd3189..845fa1e49578 100644 --- a/llvm/lib/Target/Hexagon/HexagonVExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVExtract.cpp @@ -27,9 +27,9 @@ using namespace llvm; -static cl::opt VExtractThreshold("hexagon-vextract-threshold", - cl::Hidden, cl::ZeroOrMore, cl::init(1), - cl::desc("Threshold for triggering vextract replacement")); +static cl::opt VExtractThreshold( + "hexagon-vextract-threshold", cl::Hidden, cl::init(1), + cl::desc("Threshold for triggering vextract replacement")); namespace llvm { void initializeHexagonVExtractPass(PassRegistry& Registry); @@ -106,8 +106,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { MachineFrameInfo &MFI = MF.getFrameInfo(); Register AR = MF.getInfo()->getStackAlignBaseVReg(); - std::map> VExtractMap; - MaybeAlign MaxAlign; + std::map> VExtractMap; bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -131,6 +130,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { return AddrR; }; + MaybeAlign MaxAlign; for (auto &P : VExtractMap) { unsigned VecR = P.first; if (P.second.size() <= VExtractThreshold) @@ -138,7 +138,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { const auto &VecRC = *MRI.getRegClass(VecR); Align Alignment = HRI.getSpillAlign(VecRC); - MaxAlign = max(MaxAlign, Alignment); + MaxAlign = std::max(MaxAlign.valueOrOne(), Alignment); // Make sure this is not a spill slot: spill slots cannot be aligned // if there are variable-sized objects on the stack. 
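The HexagonVExtract hunk above replaces a `max()` over `MaybeAlign` with an explicit normalization: `valueOrOne()` converts the possibly-unset alignment to a definite `Align` before `std::max`, matching how the result is consumed. A sketch with an assumed spill alignment:

    MaybeAlign MaxAlign;                             // starts out unset
    Align SpillAlign(64);                            // assumed value for illustration
    MaxAlign = std::max(MaxAlign.valueOrOne(), SpillAlign);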
They must be // accessible via FP (which is not aligned), because SP is unknown, diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index e9b658d18175..54d33a4113e7 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -55,24 +55,25 @@ using namespace llvm; #define DEBUG_TYPE "packets" -static cl::opt DisablePacketizer("disable-packetizer", cl::Hidden, - cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon packetizer pass")); +static cl::opt + DisablePacketizer("disable-packetizer", cl::Hidden, + cl::desc("Disable Hexagon packetizer pass")); static cl::opt Slot1Store("slot1-store-slot0-load", cl::Hidden, - cl::ZeroOrMore, cl::init(true), + cl::init(true), cl::desc("Allow slot1 store and slot0 load")); -static cl::opt PacketizeVolatiles("hexagon-packetize-volatiles", - cl::ZeroOrMore, cl::Hidden, cl::init(true), - cl::desc("Allow non-solo packetization of volatile memory references")); +static cl::opt PacketizeVolatiles( + "hexagon-packetize-volatiles", cl::Hidden, cl::init(true), + cl::desc("Allow non-solo packetization of volatile memory references")); -static cl::opt EnableGenAllInsnClass("enable-gen-insn", cl::init(false), - cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC")); +static cl::opt + EnableGenAllInsnClass("enable-gen-insn", cl::Hidden, + cl::desc("Generate all instruction with TC")); -static cl::opt DisableVecDblNVStores("disable-vecdbl-nv-stores", - cl::init(false), cl::Hidden, cl::ZeroOrMore, - cl::desc("Disable vector double new-value-stores")); +static cl::opt + DisableVecDblNVStores("disable-vecdbl-nv-stores", cl::Hidden, + cl::desc("Disable vector double new-value-stores")); extern cl::opt ScheduleInlineAsm; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 6aca8d807872..abd84a188cfa 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -1310,7 +1310,7 @@ auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0, auto Simplify = [&](Value *V) { if (auto *I = dyn_cast(V)) { SimplifyQuery Q(DL, &TLI, &DT, &AC, I); - if (Value *S = SimplifyInstruction(I, Q)) + if (Value *S = simplifyInstruction(I, Q)) return S; } return V; @@ -1404,7 +1404,7 @@ auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In, if (isa(In) || (To != Block.end() && isa(*To))) return false; - if (!mayBeMemoryDependent(In)) + if (!mayHaveNonDefUseDependency(In)) return true; bool MayWrite = In.mayWriteToMemory(); auto MaybeLoc = getLocOrNone(In); diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp index 94b878e21f4d..2b004a9c5ad4 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp @@ -53,10 +53,10 @@ using namespace llvm; STATISTIC(HexagonNumVectorLoopCarriedReuse, "Number of values that were reused from a previous iteration."); -static cl::opt HexagonVLCRIterationLim("hexagon-vlcr-iteration-lim", - cl::Hidden, +static cl::opt HexagonVLCRIterationLim( + "hexagon-vlcr-iteration-lim", cl::Hidden, cl::desc("Maximum distance of loop carried dependences that are handled"), - cl::init(2), cl::ZeroOrMore); + cl::init(2)); namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h 
b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h index f1e0c5804ace..f826b2eb568f 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h +++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h @@ -127,7 +127,7 @@ class Loop; /// Hexagon Vector Loop Carried Reuse Pass struct HexagonVectorLoopCarriedReusePass : public PassInfoMixin { - HexagonVectorLoopCarriedReusePass() {} + HexagonVectorLoopCarriedReusePass() = default; /// Run pass over the Loop. PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM, diff --git a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp index fbc5e5c344ed..b09a393f7dd5 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp @@ -36,9 +36,9 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-vector-print" -static cl::opt TraceHexVectorStoresOnly("trace-hex-vector-stores-only", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Enables tracing of vector stores")); +static cl::opt + TraceHexVectorStoresOnly("trace-hex-vector-stores-only", cl::Hidden, + cl::desc("Enables tracing of vector stores")); namespace llvm { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 5e5a26fea076..37866a73ed0f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Debug.h" #include "llvm/Support/EndianStream.h" diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp index e5e5d08937ef..f3da67562320 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -34,5 +34,4 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) { UsesELFSectionDirectiveForBSS = true; ExceptionsType = ExceptionHandling::DwarfCFI; UseLogicalShr = false; - UseIntegratedAssembler = false; } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 8a866cfe9161..18ff901d6441 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include @@ -29,8 +30,8 @@ using namespace llvm; static cl::opt - RelaxNVChecks("relax-nv-checks", cl::init(false), cl::ZeroOrMore, - cl::Hidden, cl::desc("Relax checks of new-value validity")); + RelaxNVChecks("relax-nv-checks", cl::Hidden, + cl::desc("Relax checks of new-value validity")); const HexagonMCChecker::PredSense HexagonMCChecker::Unconditional(Hexagon::NoRegister, false); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index f8ac35aed7c0..ed2856eb1fe9 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -789,7 +789,6 @@ 
HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, } MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII, - MCRegisterInfo const &MRI, MCContext &MCT) { return new HexagonMCCodeEmitter(MII, MCT); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 0624214d284b..49725801f046 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -108,7 +108,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, MCSection &Section = *getAssembler().getContext().getELFSection( SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); MCSectionSubPair P = getCurrentSection(); - SwitchSection(&Section); + switchSection(&Section); if (ELFSymbol->isUndefined()) { emitValueToAlignment(ByteAlignment, 0, 1, 0); @@ -120,7 +120,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, if (Align(ByteAlignment) > Section.getAlignment()) Section.setAlignment(Align(ByteAlignment)); - SwitchSection(P.first, P.second); + switchSection(P.first, P.second); } else { if (ELFSymbol->declareCommon(Size, ByteAlignment)) report_fatal_error("Symbol: " + Symbol->getName() + diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp index 1e708ba1bcd3..ab5e9eb4eca6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 6a08d7503bac..d068baf05998 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/HexagonMCTargetDesc.h" -#include "HexagonArch.h" +#include "HexagonDepArch.h" #include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonInstPrinter.h" #include "MCTargetDesc/HexagonMCAsmInfo.h" @@ -22,6 +22,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" @@ -409,8 +410,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { } } -static bool isCPUValid(const std::string &CPU) { - return Hexagon::CpuTable.find(CPU) != Hexagon::CpuTable.cend(); +static bool isCPUValid(StringRef CPU) { + return Hexagon::getCpu(CPU).has_value(); } namespace { @@ -559,12 +560,18 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, } unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { - using llvm::Hexagon::ElfFlagsByCpuStr; - - const std::string CPU(STI.getCPU().str()); - auto F = ElfFlagsByCpuStr.find(CPU); - assert(F != ElfFlagsByCpuStr.end() && "Unrecognized Architecture"); - return F->second; + return StringSwitch(STI.getCPU()) + .Case("generic", llvm::ELF::EF_HEXAGON_MACH_V5) + .Case("hexagonv5", llvm::ELF::EF_HEXAGON_MACH_V5) + .Case("hexagonv55", llvm::ELF::EF_HEXAGON_MACH_V55) 
+ .Case("hexagonv60", llvm::ELF::EF_HEXAGON_MACH_V60) + .Case("hexagonv62", llvm::ELF::EF_HEXAGON_MACH_V62) + .Case("hexagonv65", llvm::ELF::EF_HEXAGON_MACH_V65) + .Case("hexagonv66", llvm::ELF::EF_HEXAGON_MACH_V66) + .Case("hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67) + .Case("hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T) + .Case("hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68) + .Case("hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69); } llvm::ArrayRef Hexagon_MC::GetVectRegRev() { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 5bf7c9a1a908..d717e710f3c0 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -85,7 +85,6 @@ namespace Hexagon_MC { } MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &MCT); MCAsmBackend *createHexagonAsmBackend(const Target &T, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index d82731e153fe..c8805296017d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -295,7 +295,7 @@ void HexagonShuffler::restrictBranchOrder(HexagonPacketSummary const &Summary) { Summary.branchInsts[0]->Core.setUnits(jumpSlot.first); Summary.branchInsts[1]->Core.setUnits(jumpSlot.second); - const bool HasShuffledPacket = tryAuction(Summary).hasValue(); + const bool HasShuffledPacket = tryAuction(Summary).has_value(); if (HasShuffledPacket) return; @@ -599,7 +599,7 @@ void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary, // and then pin it to slot #3 const unsigned saveUnits = PrefSlot3Inst->Core.getUnits(); PrefSlot3Inst->Core.setUnits(saveUnits & Slot3Mask); - const bool HasShuffledPacket = tryAuction(Summary).hasValue(); + const bool HasShuffledPacket = tryAuction(Summary).has_value(); if (HasShuffledPacket) return; diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 660215ca7435..d715ba901a2b 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -704,14 +704,14 @@ LanaiAsmParser::parseRegister(bool RestoreOnFailure) { if (Lexer.getKind() == AsmToken::Identifier) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) { - if (PercentTok.hasValue() && RestoreOnFailure) + if (PercentTok && RestoreOnFailure) Lexer.UnLex(PercentTok.getValue()); return nullptr; } Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } - if (PercentTok.hasValue() && RestoreOnFailure) + if (PercentTok && RestoreOnFailure) Lexer.UnLex(PercentTok.getValue()); return nullptr; } diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index 57343784237d..e9fecef4ac5b 100644 --- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -16,7 +16,7 @@ #include "LanaiCondCode.h" #include "LanaiInstrInfo.h" #include "TargetInfo/LanaiTargetInfo.h" -#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -45,26 +45,30 @@ 
LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) // Definition is further down. static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); #include "LanaiGenDisassemblerTables.inc" @@ -158,7 +162,7 @@ static const unsigned GPRDecoderTable[] = { DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { if (RegNo > 31) return MCDisassembler::Fail; @@ -168,7 +172,8 @@ DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // RI memory values encoded using 23 bits: // 5 bit register, 16 bit constant unsigned Register = (Insn >> 18) & 0x1f; @@ -180,7 +185,8 @@ static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn, } static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // RR memory values encoded using 20 bits: // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ unsigned Register = (Insn >> 15) & 0x1f; @@ -192,7 +198,8 @@ static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn, } static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // RI memory values encoded using 17 bits: // 5 bit register, 10 bit constant unsigned Register = (Insn >> 12) & 0x1f; @@ -206,14 +213,13 @@ static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn, static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch, uint64_t Address, uint64_t Offset, uint64_t Width, MCInst &MI, - const void *Decoder) { - const MCDisassembler *Dis = static_cast(Decoder); - return Dis->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset, - Width); + const MCDisassembler *Decoder) { + return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset, + Width, /*InstSize=*/0); } static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI, Decoder)) MI.addOperand(MCOperand::createImm(Insn)); @@ -221,7 +227,8 @@ static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t 
Address, } static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Offset = (Insn & 0xffff); Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset))); @@ -230,7 +237,7 @@ static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn, static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Val >= LPCC::UNKNOWN) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Val)); diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 010ff80ad42a..832cafb3dabe 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -138,11 +138,7 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); } - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine({ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR}); // Function alignments setMinFunctionAlignment(Align(4)); diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 4217b8509676..bef2458fd126 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -592,9 +592,7 @@ bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a branch, delete them. - while (std::next(Instruction) != MBB.end()) { - std::next(Instruction)->eraseFromParent(); - } + MBB.erase(std::next(Instruction), MBB.end()); Condition.clear(); FalseBlock = nullptr; diff --git a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp index eeef1d919925..fe8ce1093bd8 100644 --- a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp @@ -11,3 +11,10 @@ using namespace llvm; void LanaiMachineFunctionInfo::anchor() {} + +MachineFunctionInfo *LanaiMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h index de712637b5a4..edf5f2ee087e 100644 --- a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h +++ b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h @@ -40,6 +40,10 @@ class LanaiMachineFunctionInfo : public MachineFunctionInfo { public: explicit LanaiMachineFunctionInfo(MachineFunction &MF) : VarArgsFrameIndex(0) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; Register getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; } diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 70b6fd2c185d..8af40d18d106 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -48,7 +48,7 @@ static std::string computeDataLayout() { } static Reloc::Model getEffectiveRelocModel(Optional RM) { - return RM.getValueOr(Reloc::PIC_); + return RM.value_or(Reloc::PIC_); } 
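// Illustrative sketch by the editor, not part of the vendored patch: this
// hunk is one instance of a patch-wide rename of llvm::Optional's accessors
// to the std::optional spelling (getValueOr -> value_or, hasValue ->
// has_value); the parameter above is presumably Optional<Reloc::Model>, with
// the template argument lost in formatting. The defaulting behaves like this
// plain std::optional version (RelocModel is a reduced stand-in):
#include <optional>

enum class RelocModel { Static, PIC };

// With no explicit -relocation-model, Lanai falls back to PIC; the earlier
// Hexagon hunk falls back to Static the same way.
inline RelocModel effectiveRelocModel(std::optional<RelocModel> RM) {
  return RM.value_or(RelocModel::PIC);
}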
LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT, @@ -68,7 +68,7 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT, } TargetTransformInfo -LanaiTargetMachine::getTargetTransformInfo(const Function &F) { +LanaiTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(LanaiTTIImpl(this, F)); } diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/llvm/lib/Target/Lanai/LanaiTargetMachine.h index 00922f44f33a..258e58c86253 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.h +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.h @@ -38,7 +38,7 @@ public: return &Subtarget; } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override; diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h index f0d287c858d8..08cc54b858ce 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h @@ -13,10 +13,10 @@ #ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H #define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { +class StringRef; class LanaiInstPrinter : public MCInstPrinter { public: @@ -36,7 +36,6 @@ public: void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); - void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp index df4ee297155f..ec573a189a70 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp @@ -304,7 +304,6 @@ unsigned LanaiMCCodeEmitter::getBranchTargetOpValue( llvm::MCCodeEmitter * llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo, - const MCRegisterInfo & /*MRI*/, MCContext &context) { return new LanaiMCCodeEmitter(InstrInfo, context); } diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h index 651ed36cdc24..e8da1bc88142 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h @@ -27,7 +27,6 @@ class MCSubtargetInfo; class Target; MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp new file mode 100644 index 000000000000..d11f5a9080a0 --- /dev/null +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -0,0 +1,556 @@ +// LoongArchAsmParser.cpp - Parse LoongArch assembly to MCInst instructions -=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/LoongArchInstPrinter.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" +#include "TargetInfo/LoongArchTargetInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Casting.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-asm-parser" + +namespace { +class LoongArchAsmParser : public MCTargetAsmParser { + SMLoc getLoc() const { return getParser().getTok().getLoc(); } + + /// Parse a register as used in CFI directives. + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + bool ParseDirective(AsmToken DirectiveID) override { return true; } + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + + bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo, + int64_t Lower, int64_t Upper, Twine Msg); + + /// Helper for processing MC instructions that have been successfully matched + /// by MatchAndEmitInstruction. + bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, + MCStreamer &Out); + +// Auto-generated instruction matching functions. +#define GET_ASSEMBLER_HEADER +#include "LoongArchGenAsmMatcher.inc" + + OperandMatchResultTy parseRegister(OperandVector &Operands); + OperandMatchResultTy parseImmediate(OperandVector &Operands); + + bool parseOperand(OperandVector &Operands, StringRef Mnemonic); + +public: + enum LoongArchMatchResultTy { + Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, + Match_RequiresMsbNotLessThanLsb, + Match_RequiresOpnd2NotR0R1, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "LoongArchGenAsmMatcher.inc" +#undef GET_OPERAND_DIAGNOSTIC_TYPES + }; + + LoongArchAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI, MII) { + Parser.addAliasForDirective(".half", ".2byte"); + Parser.addAliasForDirective(".hword", ".2byte"); + Parser.addAliasForDirective(".word", ".4byte"); + Parser.addAliasForDirective(".dword", ".8byte"); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } +}; + +// Instances of this class represent a parsed LoongArch machine instruction. 
+class LoongArchOperand : public MCParsedAsmOperand { + enum class KindTy { + Token, + Register, + Immediate, + } Kind; + + struct RegOp { + MCRegister RegNum; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + SMLoc StartLoc, EndLoc; + union { + StringRef Tok; + struct RegOp Reg; + struct ImmOp Imm; + }; + +public: + LoongArchOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + bool isToken() const override { return Kind == KindTy::Token; } + bool isReg() const override { return Kind == KindTy::Register; } + bool isImm() const override { return Kind == KindTy::Immediate; } + bool isMem() const override { return false; } + void setReg(MCRegister PhysReg) { Reg.RegNum = PhysReg; } + + static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm) { + if (auto CE = dyn_cast(Expr)) { + Imm = CE->getValue(); + return true; + } + + return false; + } + + template bool isUImm() const { + if (!isImm()) + return false; + + int64_t Imm; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm); + return IsConstantImm && isUInt(Imm - P); + } + + template bool isSImm() const { + if (!isImm()) + return false; + + int64_t Imm; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm); + return IsConstantImm && isShiftedInt(Imm); + } + + bool isUImm2() const { return isUImm<2>(); } + bool isUImm2plus1() const { return isUImm<2, 1>(); } + bool isUImm3() const { return isUImm<3>(); } + bool isUImm5() const { return isUImm<5>(); } + bool isUImm6() const { return isUImm<6>(); } + bool isUImm8() const { return isUImm<8>(); } + bool isUImm12() const { return isUImm<12>(); } + bool isUImm14() const { return isUImm<14>(); } + bool isUImm15() const { return isUImm<15>(); } + bool isSImm12() const { return isSImm<12>(); } + bool isSImm14lsl2() const { return isSImm<14, 2>(); } + bool isSImm16() const { return isSImm<16>(); } + bool isSImm16lsl2() const { return isSImm<16, 2>(); } + bool isSImm20() const { return isSImm<20>(); } + bool isSImm21lsl2() const { return isSImm<21, 2>(); } + bool isSImm26lsl2() const { return isSImm<26, 2>(); } + + /// Gets location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// Gets location of the last token of this operand. 
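// Illustrative sketch by the editor, not part of the vendored patch. The
// template parameter lists on isUImm/isSImm above appear to have been dropped
// in formatting; judging from the llvm::isUInt / llvm::isShiftedInt helpers
// they call, the upstream forms are template <unsigned N, unsigned P = 0> and
// template <unsigned N, unsigned S = 0>. Spelled out, the checks and the
// ranges they induce (reported later by generateImmOutOfRangeError) are
// (isUImmNP, isSImmNS and signedImmRange are illustrative names):
#include <cstdint>
#include <utility>

// True iff Imm is an unsigned N-bit value biased by P; uimm2plus1 is
// isUImm<2, 1>, i.e. it accepts 1..4.
template <unsigned N, unsigned P = 0> bool isUImmNP(int64_t Imm) {
  return Imm - P >= 0 && Imm - P < (INT64_C(1) << N);
}

// True iff Imm is a signed N-bit value scaled by 1 << S (low S bits zero);
// simm14lsl2 is isSImm<14, 2>.
template <unsigned N, unsigned S = 0> bool isSImmNS(int64_t Imm) {
  return Imm % (INT64_C(1) << S) == 0 &&
         Imm >> S >= -(INT64_C(1) << (N - 1)) &&
         Imm >> S < (INT64_C(1) << (N - 1));
}

// The matching diagnostic bounds: e.g. isSImm<14, 2> accepts exactly
// [-(1 << 15), (1 << 15) - 4] in steps of 4.
template <unsigned N, unsigned S = 0>
std::pair<int64_t, int64_t> signedImmRange() {
  return {-(INT64_C(1) << (N + S - 1)),
          (INT64_C(1) << (N + S - 1)) - (INT64_C(1) << S)};
}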
+ SMLoc getEndLoc() const override { return EndLoc; } + + unsigned getReg() const override { + assert(Kind == KindTy::Register && "Invalid type access!"); + return Reg.RegNum.id(); + } + + const MCExpr *getImm() const { + assert(Kind == KindTy::Immediate && "Invalid type access!"); + return Imm.Val; + } + + StringRef getToken() const { + assert(Kind == KindTy::Token && "Invalid type access!"); + return Tok; + } + + void print(raw_ostream &OS) const override { + auto RegName = [](unsigned Reg) { + if (Reg) + return LoongArchInstPrinter::getRegisterName(Reg); + else + return "noreg"; + }; + + switch (Kind) { + case KindTy::Immediate: + OS << *getImm(); + break; + case KindTy::Register: + OS << "<register " << RegName(getReg()) << ">"; + break; + case KindTy::Token: + OS << "'" << getToken() << "'"; + break; + } + } + + static std::unique_ptr<LoongArchOperand> createToken(StringRef Str, SMLoc S) { + auto Op = std::make_unique<LoongArchOperand>(KindTy::Token); + Op->Tok = Str; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<LoongArchOperand> createReg(unsigned RegNo, SMLoc S, + SMLoc E) { + auto Op = std::make_unique<LoongArchOperand>(KindTy::Register); + Op->Reg.RegNum = RegNo; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<LoongArchOperand> createImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = std::make_unique<LoongArchOperand>(KindTy::Immediate); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + if (auto CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + // Used by the TableGen Code. + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } +}; +} // end anonymous namespace + +#define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME +#define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER +#include "LoongArchGenAsmMatcher.inc" + +static MCRegister convertFPR32ToFPR64(MCRegister Reg) { + assert(Reg >= LoongArch::F0 && Reg <= LoongArch::F31 && "Invalid register"); + return Reg - LoongArch::F0 + LoongArch::F0_64; +} + +// Attempts to match Name as a register (either using the default name or +// alternative ABI names), setting RegNo to the matching register. Upon +// failure, returns true and sets RegNo to 0. +static bool matchRegisterNameHelper(MCRegister &RegNo, StringRef Name) { + RegNo = MatchRegisterName(Name); + // The 32-bit and 64-bit FPRs have the same asm name. Check that the initial + // match always matches the 32-bit variant, and not the 64-bit one. + assert(!(RegNo >= LoongArch::F0_64 && RegNo <= LoongArch::F31_64)); + // The default FPR register class is based on the tablegen enum ordering.
+ static_assert(LoongArch::F0 < LoongArch::F0_64, + "FPR matching must be updated"); + if (RegNo == LoongArch::NoRegister) + RegNo = MatchRegisterAltName(Name); + + return RegNo == LoongArch::NoRegister; +} + +bool LoongArchAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + return Error(getLoc(), "invalid register number"); +} + +OperandMatchResultTy LoongArchAsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { + llvm_unreachable("Unimplemented function."); +} + +OperandMatchResultTy +LoongArchAsmParser::parseRegister(OperandVector &Operands) { + if (getLexer().getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + + // Eat the $ prefix. + getLexer().Lex(); + if (getLexer().getKind() != AsmToken::Identifier) + return MatchOperand_NoMatch; + + StringRef Name = getLexer().getTok().getIdentifier(); + MCRegister RegNo; + matchRegisterNameHelper(RegNo, Name); + if (RegNo == LoongArch::NoRegister) + return MatchOperand_NoMatch; + + SMLoc S = getLoc(); + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + getLexer().Lex(); + Operands.push_back(LoongArchOperand::createReg(RegNo, S, E)); + + return MatchOperand_Success; +} + +OperandMatchResultTy +LoongArchAsmParser::parseImmediate(OperandVector &Operands) { + SMLoc S = getLoc(); + SMLoc E; + const MCExpr *Res; + + if (getParser().parseExpression(Res, E)) + return MatchOperand_ParseFail; + + Operands.push_back(LoongArchOperand::createImm(Res, S, E)); + return MatchOperand_Success; +} + +/// Looks at a token type and creates the relevant operand from this +/// information, adding to Operands. Return true upon an error. +bool LoongArchAsmParser::parseOperand(OperandVector &Operands, + StringRef Mnemonic) { + if (parseRegister(Operands) == MatchOperand_Success || + parseImmediate(Operands) == MatchOperand_Success) + return false; + + // Finally we have exhausted all options and must declare defeat. + Error(getLoc(), "unknown operand"); + return true; +} + +bool LoongArchAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + // First operand in MCInst is instruction mnemonic. + Operands.push_back(LoongArchOperand::createToken(Name, NameLoc)); + + // If there are no more operands, then finish. + if (parseOptionalToken(AsmToken::EndOfStatement)) + return false; + + // Parse first operand. + if (parseOperand(Operands, Name)) + return true; + + // Parse until end of statement, consuming commas between operands. + while (parseOptionalToken(AsmToken::Comma)) + if (parseOperand(Operands, Name)) + return true; + + // Parse end of statement and return successfully. 
+ if (parseOptionalToken(AsmToken::EndOfStatement)) + return false; + + SMLoc Loc = getLexer().getLoc(); + getParser().eatToEndOfStatement(); + return Error(Loc, "unexpected token"); +} + +bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + OperandVector &Operands, + MCStreamer &Out) { + Inst.setLoc(IDLoc); + Out.emitInstruction(Inst, getSTI()); + return false; +} + +unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + switch (Inst.getOpcode()) { + default: + break; + case LoongArch::CSRXCHG: { + unsigned Rj = Inst.getOperand(2).getReg(); + if (Rj == LoongArch::R0 || Rj == LoongArch::R1) + return Match_RequiresOpnd2NotR0R1; + return Match_Success; + } + case LoongArch::BSTRINS_W: + case LoongArch::BSTRINS_D: + case LoongArch::BSTRPICK_W: + case LoongArch::BSTRPICK_D: { + unsigned Opc = Inst.getOpcode(); + const signed Msb = + (Opc == LoongArch::BSTRINS_W || Opc == LoongArch::BSTRINS_D) + ? Inst.getOperand(3).getImm() + : Inst.getOperand(2).getImm(); + const signed Lsb = + (Opc == LoongArch::BSTRINS_W || Opc == LoongArch::BSTRINS_D) + ? Inst.getOperand(4).getImm() + : Inst.getOperand(3).getImm(); + if (Msb < Lsb) + return Match_RequiresMsbNotLessThanLsb; + return Match_Success; + } + } + + return Match_Success; +} + +unsigned +LoongArchAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + LoongArchOperand &Op = static_cast(AsmOp); + if (!Op.isReg()) + return Match_InvalidOperand; + + MCRegister Reg = Op.getReg(); + // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the + // register from FPR32 to FPR64 if necessary. + if (LoongArchMCRegisterClasses[LoongArch::FPR32RegClassID].contains(Reg) && + Kind == MCK_FPR64) { + Op.setReg(convertFPR32ToFPR64(Reg)); + return Match_Success; + } + + return Match_InvalidOperand; +} + +bool LoongArchAsmParser::generateImmOutOfRangeError( + OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper, + Twine Msg = "immediate must be an integer in the range") { + SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]"); +} + +bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + FeatureBitset MissingFeatures; + + auto Result = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures, + MatchingInlineAsm); + switch (Result) { + default: + break; + case Match_Success: + return processInstruction(Inst, IDLoc, Operands, Out); + case Match_MissingFeature: { + assert(MissingFeatures.any() && "Unknown missing features!"); + bool FirstFeature = true; + std::string Msg = "instruction requires the following:"; + for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) { + if (MissingFeatures[i]) { + Msg += FirstFeature ? 
" " : ", "; + Msg += getSubtargetFeatureName(i); + FirstFeature = false; + } + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: { + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = LoongArchMnemonicSpellCheck( + ((LoongArchOperand &)*Operands[0]).getToken(), FBS, 0); + return Error(IDLoc, "unrecognized instruction mnemonic" + Suggestion); + } + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(ErrorLoc, "too few operands for instruction"); + + ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + } + + // Handle the case when the error message is of specific type + // other than the generic Match_InvalidOperand, and the + // corresponding operand is missing. + if (Result > FIRST_TARGET_MATCH_RESULT_TY) { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL && ErrorInfo >= Operands.size()) + return Error(ErrorLoc, "too few operands for instruction"); + } + + switch (Result) { + default: + break; + case Match_RequiresMsbNotLessThanLsb: { + SMLoc ErrorStart = Operands[3]->getStartLoc(); + return Error(ErrorStart, "msb is less than lsb", + SMRange(ErrorStart, Operands[4]->getEndLoc())); + } + case Match_RequiresOpnd2NotR0R1: + return Error(Operands[2]->getStartLoc(), "must not be $r0 or $r1"); + case Match_InvalidUImm2: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 2) - 1); + case Match_InvalidUImm2plus1: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/1, + /*Upper=*/(1 << 2)); + case Match_InvalidUImm3: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 3) - 1); + case Match_InvalidUImm5: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 5) - 1); + case Match_InvalidUImm6: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 6) - 1); + case Match_InvalidUImm12: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 12) - 1); + case Match_InvalidUImm15: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0, + /*Upper=*/(1 << 15) - 1); + case Match_InvalidSImm12: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/-(1 << 11), + /*Upper=*/(1 << 11) - 1); + case Match_InvalidSImm14lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 15), /*Upper=*/(1 << 15) - 4, + "immediate must be a multiple of 4 in the range"); + case Match_InvalidSImm16: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/-(1 << 15), + /*Upper=*/(1 << 15) - 1); + case Match_InvalidSImm16lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 17), /*Upper=*/(1 << 17) - 4, + "immediate must be a multiple of 4 in the range"); + case Match_InvalidSImm20: + return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/-(1 << 19), + /*Upper=*/(1 << 19) - 1); + case Match_InvalidSImm21lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, + "immediate must be a multiple of 4 in the range"); + case Match_InvalidSImm26lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 27), /*Upper=*/(1 << 27) - 4, + "immediate must be a multiple of 4 in the range"); + } + 
llvm_unreachable("Unknown match type detected!"); +} + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmParser() { + RegisterMCAsmParser<LoongArchAsmParser> X(getTheLoongArch32Target()); + RegisterMCAsmParser<LoongArchAsmParser> Y(getTheLoongArch64Target()); +} diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp new file mode 100644 index 000000000000..215d061f11f2 --- /dev/null +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -0,0 +1,145 @@ +//===-- LoongArchDisassembler.cpp - Disassembler for LoongArch ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the LoongArchDisassembler class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/LoongArchBaseInfo.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" +#include "TargetInfo/LoongArchTargetInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-disassembler" + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { +class LoongArchDisassembler : public MCDisassembler { +public: + LoongArchDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &CStream) const override; +}; +} // end anonymous namespace + +static MCDisassembler *createLoongArchDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new LoongArchDisassembler(STI, Ctx); +} + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchDisassembler() { + // Register the disassembler for each target.
+ TargetRegistry::RegisterMCDisassembler(getTheLoongArch32Target(), + createLoongArchDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheLoongArch64Target(), + createLoongArchDisassembler); +} + +static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(LoongArch::R0 + RegNo)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(LoongArch::F0 + RegNo)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(LoongArch::F0_64 + RegNo)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCFRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 8) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(LoongArch::FCC0 + RegNo)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFCSRRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 4) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(LoongArch::FCSR0 + RegNo)); + return MCDisassembler::Success; +} + +template +static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::createImm(Imm + P)); + return MCDisassembler::Success; +} + +template +static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt(Imm) && "Invalid immediate"); + // Sign-extend the number in the bottom bits of Imm, then shift left + // bits. + Inst.addOperand(MCOperand::createImm(SignExtend64(Imm) << S)); + return MCDisassembler::Success; +} + +#include "LoongArchGenDisassemblerTables.inc" + +DecodeStatus LoongArchDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CS) const { + uint32_t Insn; + DecodeStatus Result; + + // We want to read exactly 4 bytes of data because all LoongArch instructions + // are fixed 32 bits. + if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + Insn = support::endian::read32le(Bytes.data()); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI); + Size = 4; + + return Result; +} diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h new file mode 100644 index 000000000000..caa7bd31e28b --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArch.h @@ -0,0 +1,38 @@ +//===-- LoongArch.h - Top-level interface for LoongArch ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// LoongArch back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCH_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCH_H + +#include "MCTargetDesc/LoongArchBaseInfo.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class LoongArchTargetMachine; +class AsmPrinter; +class FunctionPass; +class MCInst; +class MCOperand; +class MachineInstr; +class MachineOperand; + +bool lowerLoongArchMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AsmPrinter &AP); +bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &MCOp, + const AsmPrinter &AP); + +FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM); +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCH_H diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td new file mode 100644 index 000000000000..bf465c27ef99 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -0,0 +1,139 @@ +//===-- LoongArch.td - Describe the LoongArch Target -------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// LoongArch subtarget features and instruction predicates. +//===----------------------------------------------------------------------===// + +// LoongArch is divided into two versions, the 32-bit version (LA32) and the +// 64-bit version (LA64). 
+def Feature64Bit + : SubtargetFeature<"64bit", "HasLA64", "true", + "LA64 Basic Integer and Privilege Instruction Set">; +def IsLA64 + : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<(all_of Feature64Bit), + "LA64 Basic Integer and Privilege Instruction Set">; +def IsLA32 + : Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<(all_of(not Feature64Bit)), + "LA32 Basic Integer and Privilege Instruction Set">; + +defvar LA32 = DefaultMode; +def LA64 : HwMode<"+64bit">; + +// Single Precision floating point +def FeatureBasicF + : SubtargetFeature<"f", "HasBasicF", "true", + "'F' (Single-Precision Floating-Point)">; +def HasBasicF + : Predicate<"Subtarget->hasBasicF()">, + AssemblerPredicate<(all_of FeatureBasicF), + "'F' (Single-Precision Floating-Point)">; + +// Double Precision floating point +def FeatureBasicD + : SubtargetFeature<"d", "HasBasicD", "true", + "'D' (Double-Precision Floating-Point)", + [FeatureBasicF]>; +def HasBasicD + : Predicate<"Subtarget->hasBasicD()">, + AssemblerPredicate<(all_of FeatureBasicD), + "'D' (Double-Precision Floating-Point)">; + +// Loongson SIMD eXtension (LSX) +def FeatureExtLSX + : SubtargetFeature<"lsx", "HasExtLSX", "true", + "'LSX' (Loongson SIMD Extension)", [FeatureBasicD]>; +def HasExtLSX + : Predicate<"Subtarget->hasExtLSX()">, + AssemblerPredicate<(all_of FeatureExtLSX), + "'LSX' (Loongson SIMD Extension)">; + +// Loongson Advanced SIMD eXtension (LASX) +def FeatureExtLASX + : SubtargetFeature<"lasx", "HasExtLASX", "true", + "'LASX' (Loongson Advanced SIMD Extension)", + [FeatureExtLSX]>; +def HasExtLASX + : Predicate<"Subtarget->hasExtLASX()">, + AssemblerPredicate<(all_of FeatureExtLASX), + "'LASX' (Loongson Advanced SIMD Extension)">; + +// Loongson VirtualiZation (LVZ) +def FeatureExtLVZ + : SubtargetFeature<"lvz", "HasExtLVZ", "true", + "'LVZ' (Loongson Virtualization Extension)">; +def HasExtLVZ + : Predicate<"Subtarget->hasExtLVZ()">, + AssemblerPredicate<(all_of FeatureExtLVZ), + "'LVZ' (Loongson Virtualization Extension)">; + +// Loongson Binary Translation (LBT) +def FeatureExtLBT + : SubtargetFeature<"lbt", "HasExtLBT", "true", + "'LBT' (Loongson Binary Translation Extension)">; +def HasExtLBT + : Predicate<"Subtarget->hasExtLBT()">, + AssemblerPredicate<(all_of FeatureExtLBT), + "'LBT' (Loongson Binary Translation Extension)">; + +//===----------------------------------------------------------------------===// +// Registers, instruction descriptions ... +//===----------------------------------------------------------------------===// + +include "LoongArchRegisterInfo.td" +include "LoongArchCallingConv.td" +include "LoongArchInstrInfo.td" + +//===----------------------------------------------------------------------===// +// LoongArch processors supported. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"generic-la32", NoSchedModel, []>; +def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit]>; + +def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit, + FeatureExtLASX, + FeatureExtLVZ, + FeatureExtLBT]>; + +//===----------------------------------------------------------------------===// +// Define the LoongArch target. +//===----------------------------------------------------------------------===// + +def LoongArchInstrInfo : InstrInfo { + // guess mayLoad, mayStore, and hasSideEffects + // This option is a temporary migration help. It will go away. 
+ let guessInstructionProperties = 1; +} + +def LoongArchAsmParser : AsmParser { + let ShouldEmitMatchRegisterAltName = 1; + let AllowDuplicateRegisterNames = 1; +} + +def LoongArchAsmParserVariant : AsmParserVariant { + int Variant = 0; + // Recognize hard coded registers. + string RegisterPrefix = "$"; +} + +def LoongArchAsmWriter : AsmWriter { + int PassSubtarget = 1; +} + +def LoongArch : Target { + let InstructionSet = LoongArchInstrInfo; + let AssemblyParsers = [LoongArchAsmParser]; + let AssemblyParserVariants = [LoongArchAsmParserVariant]; + let AssemblyWriters = [LoongArchAsmWriter]; + let AllowRegisterRenaming = 1; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp new file mode 100644 index 000000000000..dd61bb2df077 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -0,0 +1,48 @@ +//===- LoongArchAsmPrinter.cpp - LoongArch LLVM Assembly Printer -*- C++ -*--=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format LoongArch assembly language. +// +//===----------------------------------------------------------------------===// + +#include "LoongArchAsmPrinter.h" +#include "LoongArch.h" +#include "LoongArchTargetMachine.h" +#include "TargetInfo/LoongArchTargetInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-asm-printer" + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. +#include "LoongArchGenMCPseudoLowering.inc" + +void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) { + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(*OutStreamer, MI)) + return; + + MCInst TmpInst; + if (!lowerLoongArchMachineInstrToMCInst(MI, TmpInst, *this)) + EmitToStreamer(*OutStreamer, TmpInst); +} + +bool LoongArchAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + AsmPrinter::runOnMachineFunction(MF); + return true; +} + +// Force static initialization. +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmPrinter() { + RegisterAsmPrinter<LoongArchAsmPrinter> X(getTheLoongArch32Target()); + RegisterAsmPrinter<LoongArchAsmPrinter> Y(getTheLoongArch64Target()); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h new file mode 100644 index 000000000000..7e5aa49f227c --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h @@ -0,0 +1,46 @@ +//===- LoongArchAsmPrinter.h - LoongArch LLVM Assembly Printer -*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// LoongArch Assembly printer class.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHASMPRINTER_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHASMPRINTER_H + +#include "LoongArchSubtarget.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +class LLVM_LIBRARY_VISIBILITY LoongArchAsmPrinter : public AsmPrinter { + const MCSubtargetInfo *STI; + +public: + explicit LoongArchAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {} + + StringRef getPassName() const override { + return "LoongArch Assembly Printer"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void emitInstruction(const MachineInstr *MI) override; + + // tblgen'erated function. + bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHASMPRINTER_H diff --git a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td new file mode 100644 index 000000000000..9844163163a5 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td @@ -0,0 +1,23 @@ +//=- LoongArchCallingConv.td - Calling Conventions LoongArch -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the LoongArch architecture. +// +//===----------------------------------------------------------------------===// + +def CSR_ILP32S_LP64S + : CalleeSavedRegs<(add R1, (sequence "R%u", 22, 31))>; + +def CSR_ILP32F_LP64F + : CalleeSavedRegs<(add CSR_ILP32S_LP64S, (sequence "F%u", 24, 31))>; + +def CSR_ILP32D_LP64D + : CalleeSavedRegs<(add CSR_ILP32S_LP64S, (sequence "F%u_64", 24, 31))>; + +// Needed for implementation of LoongArchRegisterInfo::getNoPreservedMask() +def CSR_NoRegs : CalleeSavedRegs<(add)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td new file mode 100644 index 000000000000..5b117d40e0a9 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -0,0 +1,177 @@ +//=-- LoongArchInstrInfoF.td - Single-Precision Float instr --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the basic single-precision floating-point instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBasicF] in {
+
+// Arithmetic Operation Instructions
+def FADD_S : FP_ALU_3R<0b00000001000000001, "fadd.s", FPR32>;
+def FSUB_S : FP_ALU_3R<0b00000001000000101, "fsub.s", FPR32>;
+def FMUL_S : FP_ALU_3R<0b00000001000001001, "fmul.s", FPR32>;
+def FDIV_S : FP_ALU_3R<0b00000001000001101, "fdiv.s", FPR32>;
+def FMADD_S : FP_ALU_4R<0b000010000001, "fmadd.s", FPR32>;
+def FMSUB_S : FP_ALU_4R<0b000010000101, "fmsub.s", FPR32>;
+def FNMADD_S : FP_ALU_4R<0b000010001001, "fnmadd.s", FPR32>;
+def FNMSUB_S : FP_ALU_4R<0b000010001101, "fnmsub.s", FPR32>;
+def FMAX_S : FP_ALU_3R<0b00000001000010001, "fmax.s", FPR32>;
+def FMIN_S : FP_ALU_3R<0b00000001000010101, "fmin.s", FPR32>;
+def FMAXA_S : FP_ALU_3R<0b00000001000011001, "fmaxa.s", FPR32>;
+def FMINA_S : FP_ALU_3R<0b00000001000011101, "fmina.s", FPR32>;
+def FABS_S : FP_ALU_2R<0b0000000100010100000001, "fabs.s", FPR32>;
+def FNEG_S : FP_ALU_2R<0b0000000100010100000101, "fneg.s", FPR32>;
+def FSQRT_S : FP_ALU_2R<0b0000000100010100010001, "fsqrt.s", FPR32>;
+def FRECIP_S : FP_ALU_2R<0b0000000100010100010101, "frecip.s", FPR32>;
+def FRSQRT_S : FP_ALU_2R<0b0000000100010100011001, "frsqrt.s", FPR32>;
+def FSCALEB_S : FP_ALU_3R<0b00000001000100001, "fscaleb.s", FPR32>;
+def FLOGB_S : FP_ALU_2R<0b0000000100010100001001, "flogb.s", FPR32>;
+def FCOPYSIGN_S : FP_ALU_3R<0b00000001000100101, "fcopysign.s", FPR32>;
+def FCLASS_S : FP_ALU_2R<0b0000000100010100001101, "fclass.s", FPR32>;
+
+// Comparison Instructions
+def FCMP_CAF_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CAF, "fcmp.caf.s", FPR32>;
+def FCMP_CUN_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CUN, "fcmp.cun.s", FPR32>;
+def FCMP_CEQ_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CEQ, "fcmp.ceq.s", FPR32>;
+def FCMP_CUEQ_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CUEQ, "fcmp.cueq.s", FPR32>;
+def FCMP_CLT_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CLT, "fcmp.clt.s", FPR32>;
+def FCMP_CULT_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CULT, "fcmp.cult.s", FPR32>;
+def FCMP_CLE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CLE, "fcmp.cle.s", FPR32>;
+def FCMP_CULE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CULE, "fcmp.cule.s", FPR32>;
+def FCMP_CNE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CNE, "fcmp.cne.s", FPR32>;
+def FCMP_COR_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_COR, "fcmp.cor.s", FPR32>;
+def FCMP_CUNE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_CUNE, "fcmp.cune.s", FPR32>;
+def FCMP_SAF_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SAF, "fcmp.saf.s", FPR32>;
+def FCMP_SUN_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SUN, "fcmp.sun.s", FPR32>;
+def FCMP_SEQ_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SEQ, "fcmp.seq.s", FPR32>;
+def FCMP_SUEQ_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SUEQ, "fcmp.sueq.s", FPR32>;
+def FCMP_SLT_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SLT, "fcmp.slt.s", FPR32>;
+def FCMP_SULT_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SULT, "fcmp.sult.s", FPR32>;
+def FCMP_SLE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SLE, "fcmp.sle.s", FPR32>;
+def FCMP_SULE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SULE, "fcmp.sule.s", FPR32>;
+def FCMP_SNE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SNE, "fcmp.sne.s", FPR32>;
+def FCMP_SOR_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SOR, "fcmp.sor.s", FPR32>;
+def FCMP_SUNE_S : FP_CMP<FPCMP_OPC_S, FPCMP_COND_SUNE, "fcmp.sune.s", FPR32>;
+
+// Conversion Instructions
+def FFINT_S_W : FP_CONV<0b0000000100011101000100, "ffint.s.w", FPR32, FPR32>;
+def FTINT_W_S : FP_CONV<0b0000000100011011000001, "ftint.w.s", FPR32, FPR32>;
+def FTINTRM_W_S : FP_CONV<0b0000000100011010000001, "ftintrm.w.s", FPR32,
+                          FPR32>;
+def FTINTRP_W_S : FP_CONV<0b0000000100011010010001, "ftintrp.w.s", FPR32,
+                          FPR32>;
+def FTINTRZ_W_S : FP_CONV<0b0000000100011010100001, "ftintrz.w.s", FPR32,
+                          FPR32>;
+def FTINTRNE_W_S : FP_CONV<0b0000000100011010110001, "ftintrne.w.s", FPR32,
+                           FPR32>;
+def FRINT_S : FP_CONV<0b0000000100011110010001, "frint.s", FPR32, FPR32>;
+
+// Move Instructions
+def FSEL_S : FP_SEL<0b00001101000000, "fsel", FPR32>;
+def FMOV_S : FP_MOV<0b0000000100010100100101, "fmov.s", FPR32, FPR32>;
+def MOVGR2FR_W : FP_MOV<0b0000000100010100101001, "movgr2fr.w", FPR32, GPR>;
+def MOVFR2GR_S : FP_MOV<0b0000000100010100101101, "movfr2gr.s", GPR, FPR32>;
+def MOVGR2FCSR : FP_MOV<0b0000000100010100110000, "movgr2fcsr", FCSR, GPR>;
+def MOVFCSR2GR : FP_MOV<0b0000000100010100110010, "movfcsr2gr", GPR, FCSR>;
+def MOVFR2CF_S : FP_MOV<0b0000000100010100110100, "movfr2cf", CFR, FPR32>;
+def MOVCF2FR_S : FP_MOV<0b0000000100010100110101, "movcf2fr", FPR32, CFR>;
+def MOVGR2CF : FP_MOV<0b0000000100010100110110, "movgr2cf", CFR, GPR>;
+def MOVCF2GR : FP_MOV<0b0000000100010100110111, "movcf2gr", GPR, CFR>;
+
+// Branch Instructions
+def BCEQZ : FP_BRANCH<0b01001000, "bceqz">;
+def BCNEZ : FP_BRANCH<0b01001001, "bcnez">;
+
+// Common Memory Access Instructions
+def FLD_S : FP_LOAD_2RI12<0b0010101100, "fld.s", FPR32>;
+def FST_S : FP_STORE_2RI12<0b0010101101, "fst.s", FPR32>;
+def FLDX_S : FP_LOAD_3R<0b00111000001100000, "fldx.s", FPR32>;
+def FSTX_S : FP_STORE_3R<0b00111000001110000, "fstx.s", FPR32>;
+
+// Bound Check Memory Access Instructions
+def FLDGT_S : FP_LOAD_3R<0b00111000011101000, "fldgt.s", FPR32>;
+def FLDLE_S : FP_LOAD_3R<0b00111000011101010, "fldle.s", FPR32>;
+def FSTGT_S : FP_STORE_3R<0b00111000011101100, "fstgt.s", FPR32>;
+def FSTLE_S : FP_STORE_3R<0b00111000011101110, "fstle.s", FPR32>;
+
+} // Predicates = [HasBasicF]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+/// Generic pattern classes
+
+class PatFpr<SDPatternOperator OpNode, LAInst Inst, RegisterClass RegTy>
+    : Pat<(OpNode RegTy:$fj), (Inst $fj)>;
+class PatFprFpr<SDPatternOperator OpNode, LAInst Inst, RegisterClass RegTy>
+    : Pat<(OpNode RegTy:$fj, RegTy:$fk), (Inst $fj, $fk)>;
+
+let Predicates = [HasBasicF] in {
+
+/// Float arithmetic operations
+
+def : PatFprFpr<fadd, FADD_S, FPR32>;
+def : PatFprFpr<fsub, FSUB_S, FPR32>;
+def : PatFprFpr<fmul, FMUL_S, FPR32>;
+def : PatFprFpr<fdiv, FDIV_S, FPR32>;
+def : PatFpr<fneg, FNEG_S, FPR32>;
+
+/// Setcc
+
+// Match non-signaling comparison
+
+// TODO: change setcc to any_fsetcc after call is supported because
+// we need to call llvm.experimental.constrained.fcmp.f32 in testcase.
+// See RISCV float-fcmp-strict.ll for reference.
+class PatFPSetcc<CondCode cc, LAInst CmpInst, RegisterClass RegTy>
+    : Pat<(setcc RegTy:$fj, RegTy:$fk, cc),
+          (MOVCF2GR (CmpInst RegTy:$fj, RegTy:$fk))>;
+// SETOGT/SETOGE/SETUGT/SETUGE will expand into SETOLT/SETOLE/SETULT/SETULE.
+def : PatFPSetcc<SETOEQ, FCMP_CEQ_S, FPR32>;
+def : PatFPSetcc<SETOLT, FCMP_CLT_S, FPR32>;
+def : PatFPSetcc<SETOLE, FCMP_CLE_S, FPR32>;
+def : PatFPSetcc<SETONE, FCMP_CNE_S, FPR32>;
+def : PatFPSetcc<SETO, FCMP_COR_S, FPR32>;
+def : PatFPSetcc<SETUEQ, FCMP_CUEQ_S, FPR32>;
+def : PatFPSetcc<SETULT, FCMP_CULT_S, FPR32>;
+def : PatFPSetcc<SETULE, FCMP_CULE_S, FPR32>;
+def : PatFPSetcc<SETUNE, FCMP_CUNE_S, FPR32>;
+def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>;
+
+// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_S instructions.
+
+/// Select
+
+def : Pat<(select GPR:$cc, FPR32:$fk, FPR32:$fj),
+          (FSEL_S FPR32:$fj, FPR32:$fk, (MOVGR2CF GPR:$cc))>;
+
+/// Selectcc
+
+class PatFPSelectcc<CondCode cc, LAInst CmpInst, LAInst SelInst,
+                    RegisterClass RegTy>
+    : Pat<(select (GRLenVT (setcc RegTy:$a, RegTy:$b, cc)), RegTy:$t, RegTy:$f),
+          (SelInst RegTy:$f, RegTy:$t, (CmpInst RegTy:$a, RegTy:$b))>;
+def : PatFPSelectcc<SETOEQ, FCMP_CEQ_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETOLT, FCMP_CLT_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETOLE, FCMP_CLE_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETONE, FCMP_CNE_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETO, FCMP_COR_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETUEQ, FCMP_CUEQ_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETULT, FCMP_CULT_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETULE, FCMP_CULE_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETUNE, FCMP_CUNE_S, FSEL_S, FPR32>;
+def : PatFPSelectcc<SETUO, FCMP_CUN_S, FSEL_S, FPR32>;
+
+} // Predicates = [HasBasicF]
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
new file mode 100644
index 000000000000..07fa61f4c361
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -0,0 +1,188 @@
+//=-- LoongArchInstrInfoD.td - Double-Precision Float instr -*- tablegen -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the basic double-precision floating-point instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBasicD] in {
+
+// Arithmetic Operation Instructions
+def FADD_D : FP_ALU_3R<0b00000001000000010, "fadd.d", FPR64>;
+def FSUB_D : FP_ALU_3R<0b00000001000000110, "fsub.d", FPR64>;
+def FMUL_D : FP_ALU_3R<0b00000001000001010, "fmul.d", FPR64>;
+def FDIV_D : FP_ALU_3R<0b00000001000001110, "fdiv.d", FPR64>;
+def FMADD_D : FP_ALU_4R<0b000010000010, "fmadd.d", FPR64>;
+def FMSUB_D : FP_ALU_4R<0b000010000110, "fmsub.d", FPR64>;
+def FNMADD_D : FP_ALU_4R<0b000010001010, "fnmadd.d", FPR64>;
+def FNMSUB_D : FP_ALU_4R<0b000010001110, "fnmsub.d", FPR64>;
+def FMAX_D : FP_ALU_3R<0b00000001000010010, "fmax.d", FPR64>;
+def FMIN_D : FP_ALU_3R<0b00000001000010110, "fmin.d", FPR64>;
+def FMAXA_D : FP_ALU_3R<0b00000001000011010, "fmaxa.d", FPR64>;
+def FMINA_D : FP_ALU_3R<0b00000001000011110, "fmina.d", FPR64>;
+def FABS_D : FP_ALU_2R<0b0000000100010100000010, "fabs.d", FPR64>;
+def FNEG_D : FP_ALU_2R<0b0000000100010100000110, "fneg.d", FPR64>;
+def FSQRT_D : FP_ALU_2R<0b0000000100010100010010, "fsqrt.d", FPR64>;
+def FRECIP_D : FP_ALU_2R<0b0000000100010100010110, "frecip.d", FPR64>;
+def FRSQRT_D : FP_ALU_2R<0b0000000100010100011010, "frsqrt.d", FPR64>;
+def FSCALEB_D : FP_ALU_3R<0b00000001000100010, "fscaleb.d", FPR64>;
+def FLOGB_D : FP_ALU_2R<0b0000000100010100001010, "flogb.d", FPR64>;
+def FCOPYSIGN_D : FP_ALU_3R<0b00000001000100110, "fcopysign.d", FPR64>;
+def FCLASS_D : FP_ALU_2R<0b0000000100010100001110, "fclass.d", FPR64>;
+
+// Comparison Instructions
+def FCMP_CAF_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CAF, "fcmp.caf.d", FPR64>;
+def FCMP_CUN_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CUN, "fcmp.cun.d", FPR64>;
+def FCMP_CEQ_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CEQ, "fcmp.ceq.d", FPR64>;
+def FCMP_CUEQ_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CUEQ, "fcmp.cueq.d", FPR64>;
+def FCMP_CLT_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CLT, "fcmp.clt.d", FPR64>;
+def FCMP_CULT_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CULT, "fcmp.cult.d", FPR64>;
+def FCMP_CLE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CLE, "fcmp.cle.d", FPR64>;
+def FCMP_CULE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CULE, "fcmp.cule.d", FPR64>;
+def FCMP_CNE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CNE, "fcmp.cne.d", FPR64>;
+def FCMP_COR_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_COR, "fcmp.cor.d", FPR64>;
+def FCMP_CUNE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_CUNE, "fcmp.cune.d", FPR64>;
+def FCMP_SAF_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SAF, "fcmp.saf.d", FPR64>;
+def FCMP_SUN_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SUN, "fcmp.sun.d", FPR64>;
+def FCMP_SEQ_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SEQ, "fcmp.seq.d", FPR64>;
+def FCMP_SUEQ_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SUEQ, "fcmp.sueq.d", FPR64>;
+def FCMP_SLT_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SLT, "fcmp.slt.d", FPR64>;
+def FCMP_SULT_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SULT, "fcmp.sult.d", FPR64>;
+def FCMP_SLE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SLE, "fcmp.sle.d", FPR64>;
+def FCMP_SULE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SULE, "fcmp.sule.d", FPR64>;
+def FCMP_SNE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SNE, "fcmp.sne.d", FPR64>;
+def FCMP_SOR_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SOR, "fcmp.sor.d", FPR64>;
+def FCMP_SUNE_D : FP_CMP<FPCMP_OPC_D, FPCMP_COND_SUNE, "fcmp.sune.d", FPR64>;
+
+// Conversion Instructions
+def FFINT_S_L : FP_CONV<0b0000000100011101000110, "ffint.s.l", FPR32, FPR64>;
+def FTINT_L_S : FP_CONV<0b0000000100011011001001, "ftint.l.s", FPR64, FPR32>;
+def FTINTRM_L_S : FP_CONV<0b0000000100011010001001, "ftintrm.l.s", FPR64,
+                          FPR32>;
+def FTINTRP_L_S : FP_CONV<0b0000000100011010011001, "ftintrp.l.s", FPR64,
+                          FPR32>;
+def FTINTRZ_L_S : FP_CONV<0b0000000100011010101001, "ftintrz.l.s", FPR64,
+                          FPR32>;
+def FTINTRNE_L_S : FP_CONV<0b0000000100011010111001, "ftintrne.l.s", FPR64,
+                           FPR32>;
+def FCVT_S_D : FP_CONV<0b0000000100011001000110, "fcvt.s.d", FPR32, FPR64>;
+def FCVT_D_S : FP_CONV<0b0000000100011001001001, "fcvt.d.s", FPR64, FPR32>;
+def FFINT_D_W : FP_CONV<0b0000000100011101001000, "ffint.d.w", FPR64, FPR32>;
+def FFINT_D_L : FP_CONV<0b0000000100011101001010, "ffint.d.l", FPR64, FPR64>;
+def FTINT_W_D : FP_CONV<0b0000000100011011000010, "ftint.w.d", FPR32, FPR64>;
+def FTINT_L_D : FP_CONV<0b0000000100011011001010, "ftint.l.d", FPR64, FPR64>;
+def FTINTRM_W_D : FP_CONV<0b0000000100011010000010, "ftintrm.w.d", FPR32,
+                          FPR64>;
+def FTINTRM_L_D : FP_CONV<0b0000000100011010001010, "ftintrm.l.d", FPR64,
+                          FPR64>;
+def FTINTRP_W_D : FP_CONV<0b0000000100011010010010, "ftintrp.w.d", FPR32,
+                          FPR64>;
+def FTINTRP_L_D : FP_CONV<0b0000000100011010011010, "ftintrp.l.d", FPR64,
+                          FPR64>;
+def FTINTRZ_W_D : FP_CONV<0b0000000100011010100010, "ftintrz.w.d", FPR32,
+                          FPR64>;
+def FTINTRZ_L_D : FP_CONV<0b0000000100011010101010, "ftintrz.l.d", FPR64,
+                          FPR64>;
+def FTINTRNE_W_D : FP_CONV<0b0000000100011010110010, "ftintrne.w.d", FPR32,
+                           FPR64>;
+def FTINTRNE_L_D : FP_CONV<0b0000000100011010111010, "ftintrne.l.d", FPR64,
+                           FPR64>;
+def FRINT_D : FP_CONV<0b0000000100011110010010, "frint.d", FPR64, FPR64>;
+
+// Move Instructions
+def FMOV_D : FP_MOV<0b0000000100010100100110, "fmov.d", FPR64, FPR64>;
+def MOVFRH2GR_S : FP_MOV<0b0000000100010100101111, "movfrh2gr.s", GPR, FPR64>;
+let isCodeGenOnly = 1 in {
+def MOVFR2GR_S_64 : FP_MOV<0b0000000100010100101101, "movfr2gr.s", GPR, FPR64>;
+def FSEL_D : FP_SEL<0b00001101000000, "fsel", FPR64>;
+} // isCodeGenOnly = 1
+let Constraints = "$dst = $out" in {
+def MOVGR2FRH_W : FPFmtMOV<0b0000000100010100101011, (outs FPR64:$out),
+                           (ins FPR64:$dst, GPR:$src), "movgr2frh.w",
+                           "$dst, $src">;
+} // Constraints = "$dst = $out"
+
+// Common Memory Access Instructions
+def FLD_D : FP_LOAD_2RI12<0b0010101110, "fld.d", FPR64>;
+def FST_D : FP_STORE_2RI12<0b0010101111, "fst.d", FPR64>;
+def FLDX_D : FP_LOAD_3R<0b00111000001101000, "fldx.d", FPR64>;
+def FSTX_D : FP_STORE_3R<0b00111000001111000, "fstx.d", FPR64>;
+
+// Bound Check Memory Access Instructions
+def FLDGT_D : FP_LOAD_3R<0b00111000011101001, "fldgt.d", FPR64>;
+def FLDLE_D : FP_LOAD_3R<0b00111000011101011, "fldle.d", FPR64>;
+def FSTGT_D : FP_STORE_3R<0b00111000011101101, "fstgt.d", FPR64>;
+def FSTLE_D : FP_STORE_3R<0b00111000011101111, "fstle.d", FPR64>;
+
+} // Predicates = [HasBasicD]
+
+// Instructions only available on LA64
+let Predicates = [HasBasicD, IsLA64] in {
+def MOVGR2FR_D : FP_MOV<0b0000000100010100101010, "movgr2fr.d", FPR64, GPR>;
+def MOVFR2GR_D : FP_MOV<0b0000000100010100101110, "movfr2gr.d", GPR, FPR64>;
+} // Predicates = [HasBasicD, IsLA64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBasicD] in {
+
+/// Float arithmetic operations
+
+def : PatFprFpr<fadd, FADD_D, FPR64>;
+def : PatFprFpr<fsub, FSUB_D, FPR64>;
+def : PatFprFpr<fmul, FMUL_D, FPR64>;
+def : PatFprFpr<fdiv, FDIV_D, FPR64>;
+def : PatFpr<fneg, FNEG_D, FPR64>;
+
+/// Setcc
+
+// Match non-signaling comparison
+
+// TODO: Change setcc to any_fsetcc after call is supported because
+// we need to call llvm.experimental.constrained.fcmp.f64 in testcase.
+// See RISCV float-fcmp-strict.ll for reference.
+
+// SETOGT/SETOGE/SETUGT/SETUGE will expand into SETOLT/SETOLE/SETULT/SETULE.
+def : PatFPSetcc<SETOEQ, FCMP_CEQ_D, FPR64>;
+def : PatFPSetcc<SETOLT, FCMP_CLT_D, FPR64>;
+def : PatFPSetcc<SETOLE, FCMP_CLE_D, FPR64>;
+def : PatFPSetcc<SETONE, FCMP_CNE_D, FPR64>;
+def : PatFPSetcc<SETO, FCMP_COR_D, FPR64>;
+def : PatFPSetcc<SETUEQ, FCMP_CUEQ_D, FPR64>;
+def : PatFPSetcc<SETULT, FCMP_CULT_D, FPR64>;
+def : PatFPSetcc<SETULE, FCMP_CULE_D, FPR64>;
+def : PatFPSetcc<SETUNE, FCMP_CUNE_D, FPR64>;
+def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>;
+
+// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_D instructions.
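+
+// Note: FSEL writes $fj to the destination when the condition flag is 0 and
+// $fk when it is 1, which is why the patterns below pass the DAG select's
+// false value as the first register operand.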
+
+/// Select
+
+def : Pat<(select GPR:$cc, FPR64:$fk, FPR64:$fj),
+          (FSEL_D FPR64:$fj, FPR64:$fk, (MOVGR2CF GPR:$cc))>;
+
+/// Selectcc
+
+def : PatFPSelectcc<SETOEQ, FCMP_CEQ_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETOLT, FCMP_CLT_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETOLE, FCMP_CLE_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETONE, FCMP_CNE_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETO, FCMP_COR_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETUEQ, FCMP_CUEQ_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETULT, FCMP_CULT_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETULE, FCMP_CULE_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETUNE, FCMP_CUNE_D, FSEL_D, FPR64>;
+def : PatFPSelectcc<SETUO, FCMP_CUN_D, FSEL_D, FPR64>;
+
+} // Predicates = [HasBasicD]
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloatInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchFloatInstrFormats.td
new file mode 100644
index 000000000000..d2ba1fdfffe4
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchFloatInstrFormats.td
@@ -0,0 +1,241 @@
+//==- LoongArchInstrFormatsF.td - LoongArch FP Instr Formats -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe LoongArch floating-point instructions format
+//
+// opcode - operation code.
+// fd - destination register operand.
+// {c/f}{j/k/a} - source register operand.
+// immN - immediate data operand.
+//
+//===----------------------------------------------------------------------===//
+
+// 2R-type
+// <opcode | fj | fd>
+class FPFmt2R<bits<22> op, dag outs, dag ins, string opcstr, string opnstr,
+              list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> fj;
+  bits<5> fd;
+
+  let Inst{31-10} = op;
+  let Inst{9-5} = fj;
+  let Inst{4-0} = fd;
+}
+
+// 3R-type
+// <opcode | fk | fj | fd>
+class FPFmt3R<bits<17> op, dag outs, dag ins, string opcstr, string opnstr,
+              list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> fk;
+  bits<5> fj;
+  bits<5> fd;
+
+  let Inst{31-15} = op;
+  let Inst{14-10} = fk;
+  let Inst{9-5} = fj;
+  let Inst{4-0} = fd;
+}
+
+// 4R-type
+// <opcode | fa | fk | fj | fd>
+class FPFmt4R<bits<12> op, dag outs, dag ins, string opcstr, string opnstr,
+              list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> fa;
+  bits<5> fk;
+  bits<5> fj;
+  bits<5> fd;
+
+  let Inst{31-20} = op;
+  let Inst{19-15} = fa;
+  let Inst{14-10} = fk;
+  let Inst{9-5} = fj;
+  let Inst{4-0} = fd;
+}
+
+// 2RI12-type
+// <opcode | I12 | rj | fd>
+class FPFmt2RI12<bits<10> op, dag outs, dag ins, string opcstr, string opnstr,
+                 list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<12> imm12;
+  bits<5> rj;
+  bits<5> fd;
+
+  let Inst{31-22} = op;
+  let Inst{21-10} = imm12;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = fd;
+}
+
+// FmtFCMP
+// <opcode | cond | fk | fj | 0b00 | cd>
+class FPFmtFCMP<bits<12> op, bits<5> cond, dag outs, dag ins, string opcstr,
+                string opnstr, list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> fk;
+  bits<5> fj;
+  bits<3> cd;
+
+  let Inst{31-20} = op;
+  let Inst{19-15} = cond;
+  let Inst{14-10} = fk;
+  let Inst{9-5} = fj;
+  let Inst{4-3} = 0b00;
+  let Inst{2-0} = cd;
+}
+
+// FPFmtBR
+// <opcode[7:2] | I21[15:0] | opcode[1:0] | cj | I21[20:16]>
+class FPFmtBR<bits<8> opcode, dag outs, dag ins, string opcstr,
+              string opnstr, list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<21> imm21;
+  bits<3> cj;
+
+  let Inst{31-26} = opcode{7-2};
+  let Inst{25-10} = imm21{15-0};
+  let Inst{9-8} = opcode{1-0};
+  let Inst{7-5} = cj;
+  let Inst{4-0} = imm21{20-16};
+}
+
+// FmtFSEL
+// <opcode | ca | fk | fj | fd>
+class FPFmtFSEL<bits<14> op, dag outs, dag ins, string opcstr, string opnstr,
+                list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<3> ca;
+  bits<5> fk;
+  bits<5> fj;
+  bits<5> fd;
+
+  let Inst{31-18} = op;
+  let Inst{17-15} = ca;
+  let Inst{14-10} = fk;
+  let Inst{9-5} = fj;
+  let Inst{4-0} = fd;
+}
+
+// FPFmtMOV
+// <opcode | src | dst>
+class FPFmtMOV<bits<22> op, dag outs, dag ins, string opcstr, string opnstr,
+               list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> src;
+  bits<5> dst;
+
+  let Inst{31-10} = op;
+  let Inst{9-5} = src;
+  let Inst{4-0} = dst;
+}
+
+// FPFmtMEM
+// <opcode | rk | rj | fd>
+class FPFmtMEM<bits<17> op, dag outs, dag ins, string opcstr, string opnstr,
+               list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> rk;
+  bits<5> rj;
+  bits<5> fd;
+
+  let Inst{31-15} = op;
+  let Inst{14-10} = rk;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = fd;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+
+class FP_ALU_2R<bits<22> op, string opstr, RegisterClass rc>
+    : FPFmt2R<op, (outs rc:$fd), (ins rc:$fj), opstr, "$fd, $fj">;
+
+class FP_ALU_3R<bits<17> op, string opstr, RegisterClass rc>
+    : FPFmt3R<op, (outs rc:$fd), (ins rc:$fj, rc:$fk), opstr, "$fd, $fj, $fk">;
+
+class FP_ALU_4R<bits<12> op, string opstr, RegisterClass rc>
+    : FPFmt4R<op, (outs rc:$fd), (ins rc:$fj, rc:$fk, rc:$fa), opstr,
+              "$fd, $fj, $fk, $fa">;
+
+class FPCMPOpc<bits<12> value> {
+  bits<12> val = value;
+}
+
+class FPCMPCond<bits<5> value> {
+  bits<5> val = value;
+}
+
+class FP_CMP<FPCMPOpc op, FPCMPCond cond, string opstr, RegisterClass rc>
+    : FPFmtFCMP<op.val, cond.val, (outs CFR:$cd), (ins rc:$fj, rc:$fk), opstr,
+                "$cd, $fj, $fk">;
+
+class FP_CONV<bits<22> op, string opstr, RegisterClass rcd, RegisterClass rcs>
+    : FPFmt2R<op, (outs rcd:$fd), (ins rcs:$fj), opstr, "$fd, $fj">;
+
+class FP_MOV<bits<22> op, string opstr, RegisterClass rcd, RegisterClass rcs>
+    : FPFmtMOV<op, (outs rcd:$dst), (ins rcs:$src), opstr, "$dst, $src">;
+
+class FP_SEL<bits<14> op, string opstr, RegisterClass rc>
+    : FPFmtFSEL<op, (outs rc:$fd), (ins rc:$fj, rc:$fk, CFR:$ca), opstr,
+                "$fd, $fj, $fk, $ca">;
+
+class FP_BRANCH<bits<8> opcode, string opstr>
+    : FPFmtBR<opcode, (outs), (ins CFR:$cj, simm21_lsl2:$imm21), opstr,
+              "$cj, $imm21"> {
+  let isBranch = 1;
+  let isTerminator = 1;
+}
+
+let mayLoad = 1 in {
+class FP_LOAD_3R<bits<17> op, string opstr, RegisterClass rc>
+    : FPFmtMEM<op, (outs rc:$fd), (ins GPR:$rj, GPR:$rk), opstr,
+               "$fd, $rj, $rk">;
+class FP_LOAD_2RI12<bits<10> op, string opstr, RegisterClass rc>
+    : FPFmt2RI12<op, (outs rc:$fd), (ins GPR:$rj, simm12:$imm12), opstr,
+                 "$fd, $rj, $imm12">;
+} // mayLoad = 1
+
+let mayStore = 1 in {
+class FP_STORE_3R<bits<17> op, string opstr, RegisterClass rc>
+    : FPFmtMEM<op, (outs), (ins rc:$fd, GPR:$rj, GPR:$rk), opstr,
+               "$fd, $rj, $rk">;
+class FP_STORE_2RI12<bits<10> op, string opstr, RegisterClass rc>
+    : FPFmt2RI12<op, (outs), (ins rc:$fd, GPR:$rj, simm12:$imm12), opstr,
+                 "$fd, $rj, $imm12">;
+} // mayStore = 1
+
+def FPCMP_OPC_S : FPCMPOpc<0b000011000001>;
+def FPCMP_OPC_D : FPCMPOpc<0b000011000010>;
+
+def FPCMP_COND_CAF : FPCMPCond<0x0>;
+def FPCMP_COND_CUN : FPCMPCond<0x8>;
+def FPCMP_COND_CEQ : FPCMPCond<0x4>;
+def FPCMP_COND_CUEQ : FPCMPCond<0xC>;
+def FPCMP_COND_CLT : FPCMPCond<0x2>;
+def FPCMP_COND_CULT : FPCMPCond<0xA>;
+def FPCMP_COND_CLE : FPCMPCond<0x6>;
+def FPCMP_COND_CULE : FPCMPCond<0xE>;
+def FPCMP_COND_CNE : FPCMPCond<0x10>;
+def FPCMP_COND_COR : FPCMPCond<0x14>;
+def FPCMP_COND_CUNE : FPCMPCond<0x18>;
+def FPCMP_COND_SAF : FPCMPCond<0x1>;
+def FPCMP_COND_SUN : FPCMPCond<0x9>;
+def FPCMP_COND_SEQ : FPCMPCond<0x5>;
+def FPCMP_COND_SUEQ : FPCMPCond<0xD>;
+def FPCMP_COND_SLT : FPCMPCond<0x3>;
+def FPCMP_COND_SULT : FPCMPCond<0xB>;
+def FPCMP_COND_SLE : FPCMPCond<0x7>;
+def FPCMP_COND_SULE : FPCMPCond<0xF>;
+def FPCMP_COND_SNE : FPCMPCond<0x11>;
+def FPCMP_COND_SOR : FPCMPCond<0x15>;
+def FPCMP_COND_SUNE : FPCMPCond<0x19>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
new file mode 100644
index 000000000000..7182d55ca3cf
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -0,0 +1,55 @@
+//===-- LoongArchFrameLowering.cpp - LoongArch Frame Information -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the LoongArch implementation of TargetFrameLowering class.
+// +//===----------------------------------------------------------------------===// + +#include "LoongArchFrameLowering.h" +#include "LoongArchSubtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/MC/MCDwarf.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-frame-lowering" + +// Return true if the specified function should have a dedicated frame +// pointer register. This is true if frame pointer elimination is +// disabled, if it needs dynamic stack realignment, if the function has +// variable sized allocas, or if the frame address is taken. +bool LoongArchFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MF.getTarget().Options.DisableFramePointerElim(MF) || + RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() || + MFI.isFrameAddressTaken(); +} + +bool LoongArchFrameLowering::hasBP(const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + + return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF); +} + +void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + // TODO: Implement this when we have function calls +} + +void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + // TODO: Implement this when we have function calls +} diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h new file mode 100644 index 000000000000..25c53efc10f1 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -0,0 +1,38 @@ +//=- LoongArchFrameLowering.h - TargetFrameLowering for LoongArch -*- C++ -*--// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class implements LoongArch-specific bits of TargetFrameLowering class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H + +#include "llvm/CodeGen/TargetFrameLowering.h" + +namespace llvm { +class LoongArchSubtarget; + +class LoongArchFrameLowering : public TargetFrameLowering { + const LoongArchSubtarget &STI; + +public: + explicit LoongArchFrameLowering(const LoongArchSubtarget &STI) + : TargetFrameLowering(StackGrowsDown, + /*StackAlignment=*/Align(16), + /*LocalAreaOffset=*/0), + STI(STI) {} + + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool hasBP(const MachineFunction &MF) const; +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp new file mode 100644 index 000000000000..cc9ea0255d98 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -0,0 +1,132 @@ +//=- LoongArchISelDAGToDAG.cpp - A dag to dag inst selector for LoongArch -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the LoongArch target. +// +//===----------------------------------------------------------------------===// + +#include "LoongArchISelDAGToDAG.h" +#include "LoongArchISelLowering.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" +#include "MCTargetDesc/LoongArchMatInt.h" +#include "llvm/Support/KnownBits.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-isel" + +void LoongArchDAGToDAGISel::Select(SDNode *Node) { + // If we have a custom node, we have already selected. + if (Node->isMachineOpcode()) { + LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + Node->setNodeId(-1); + return; + } + + // Instruction Selection not handled by the auto-generated tablegen selection + // should be handled here. + unsigned Opcode = Node->getOpcode(); + MVT GRLenVT = Subtarget->getGRLenVT(); + SDLoc DL(Node); + + switch (Opcode) { + default: + break; + case ISD::Constant: { + int64_t Imm = cast(Node)->getSExtValue(); + if (Imm == 0 && Node->getSimpleValueType(0) == GRLenVT) { + SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + LoongArch::R0, GRLenVT); + ReplaceNode(Node, New.getNode()); + return; + } + SDNode *Result = nullptr; + SDValue SrcReg = CurDAG->getRegister(LoongArch::R0, GRLenVT); + // The instructions in the sequence are handled here. + for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) { + SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, GRLenVT); + if (Inst.Opc == LoongArch::LU12I_W) + Result = CurDAG->getMachineNode(LoongArch::LU12I_W, DL, GRLenVT, SDImm); + else + Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SrcReg, SDImm); + SrcReg = SDValue(Result, 0); + } + + ReplaceNode(Node, Result); + return; + } + // TODO: Add selection nodes needed later. + } + + // Select the default instruction. 
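+  // SelectCode runs the matcher table that TableGen generates from the .td
+  // patterns (LoongArchGenDAGISel.inc, included by LoongArchISelDAGToDAG.h).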
+ SelectCode(Node); +} + +bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, + SDValue &ShAmt) { + // Shift instructions on LoongArch only read the lower 5 or 6 bits of the + // shift amount. If there is an AND on the shift amount, we can bypass it if + // it doesn't affect any of those bits. + if (N.getOpcode() == ISD::AND && isa(N.getOperand(1))) { + const APInt &AndMask = N->getConstantOperandAPInt(1); + + // Since the max shift amount is a power of 2 we can subtract 1 to make a + // mask that covers the bits needed to represent all shift amounts. + assert(isPowerOf2_32(ShiftWidth) && "Unexpected max shift amount!"); + APInt ShMask(AndMask.getBitWidth(), ShiftWidth - 1); + + if (ShMask.isSubsetOf(AndMask)) { + ShAmt = N.getOperand(0); + return true; + } + + // SimplifyDemandedBits may have optimized the mask so try restoring any + // bits that are known zero. + KnownBits Known = CurDAG->computeKnownBits(N->getOperand(0)); + if (ShMask.isSubsetOf(AndMask | Known.Zero)) { + ShAmt = N.getOperand(0); + return true; + } + } else if (N.getOpcode() == LoongArchISD::BSTRPICK) { + // Similar to the above AND, if there is a BSTRPICK on the shift amount, we + // can bypass it. + assert(isPowerOf2_32(ShiftWidth) && "Unexpected max shift amount!"); + assert(isa(N.getOperand(1)) && "Illegal msb operand!"); + assert(isa(N.getOperand(2)) && "Illegal lsb operand!"); + uint64_t msb = N.getConstantOperandVal(1), lsb = N.getConstantOperandVal(2); + if (lsb == 0 && Log2_32(ShiftWidth) <= msb + 1) { + ShAmt = N.getOperand(0); + return true; + } + } else if (N.getOpcode() == ISD::SUB && + isa(N.getOperand(0))) { + uint64_t Imm = N.getConstantOperandVal(0); + // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to + // generate a NEG instead of a SUB of a constant. + if (Imm != 0 && Imm % ShiftWidth == 0) { + SDLoc DL(N); + EVT VT = N.getValueType(); + SDValue Zero = + CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, LoongArch::R0, VT); + unsigned NegOpc = VT == MVT::i64 ? LoongArch::SUB_D : LoongArch::SUB_W; + MachineSDNode *Neg = + CurDAG->getMachineNode(NegOpc, DL, VT, Zero, N.getOperand(1)); + ShAmt = SDValue(Neg, 0); + return true; + } + } + + ShAmt = N; + return true; +} + +// This pass converts a legalized DAG into a LoongArch-specific DAG, ready +// for instruction scheduling. +FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) { + return new LoongArchDAGToDAGISel(TM); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h new file mode 100644 index 000000000000..f477129d933c --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -0,0 +1,55 @@ +//=- LoongArchISelDAGToDAG.h - A dag to dag inst selector for LoongArch ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the LoongArch target. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELDAGTODAG_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELDAGTODAG_H + +#include "LoongArch.h" +#include "LoongArchTargetMachine.h" +#include "llvm/CodeGen/SelectionDAGISel.h" + +// LoongArch-specific code to select LoongArch machine instructions for +// SelectionDAG operations. +namespace llvm { +class LoongArchDAGToDAGISel : public SelectionDAGISel { + const LoongArchSubtarget *Subtarget = nullptr; + +public: + explicit LoongArchDAGToDAGISel(LoongArchTargetMachine &TM) + : SelectionDAGISel(TM) {} + + StringRef getPassName() const override { + return "LoongArch DAG->DAG Pattern Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + Subtarget = &MF.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + + void Select(SDNode *Node) override; + + bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); + bool selectShiftMaskGRLen(SDValue N, SDValue &ShAmt) { + return selectShiftMask(N, Subtarget->getGRLen(), ShAmt); + } + bool selectShiftMask32(SDValue N, SDValue &ShAmt) { + return selectShiftMask(N, 32, ShAmt); + } + +// Include the pieces autogenerated from the target description. +#include "LoongArchGenDAGISel.inc" +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELDAGTODAG_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp new file mode 100644 index 000000000000..d5a469216859 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -0,0 +1,531 @@ +//=- LoongArchISelLowering.cpp - LoongArch DAG Lowering Implementation ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that LoongArch uses to lower LLVM code into +// a selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "LoongArchISelLowering.h" +#include "LoongArch.h" +#include "LoongArchMachineFunctionInfo.h" +#include "LoongArchRegisterInfo.h" +#include "LoongArchSubtarget.h" +#include "LoongArchTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-isel-lowering" + +LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + const LoongArchSubtarget &STI) + : TargetLowering(TM), Subtarget(STI) { + + MVT GRLenVT = Subtarget.getGRLenVT(); + // Set up the register classes. + addRegisterClass(GRLenVT, &LoongArch::GPRRegClass); + if (Subtarget.hasBasicF()) + addRegisterClass(MVT::f32, &LoongArch::FPR32RegClass); + if (Subtarget.hasBasicD()) + addRegisterClass(MVT::f64, &LoongArch::FPR64RegClass); + + // TODO: add necessary setOperationAction calls later. 
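+  // "Custom" routes a node to LowerOperation() (or ReplaceNodeResults() for
+  // illegal result types); "Expand" lets generic legalization rewrite it.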
+ setOperationAction(ISD::SHL_PARTS, GRLenVT, Custom); + setOperationAction(ISD::SRA_PARTS, GRLenVT, Custom); + setOperationAction(ISD::SRL_PARTS, GRLenVT, Custom); + + if (Subtarget.is64Bit()) { + setOperationAction(ISD::SHL, MVT::i32, Custom); + setOperationAction(ISD::SRA, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i32, Custom); + } + + static const ISD::CondCode FPCCToExpand[] = {ISD::SETOGT, ISD::SETOGE, + ISD::SETUGT, ISD::SETUGE}; + + if (Subtarget.hasBasicF()) { + setCondCodeAction(FPCCToExpand, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + } + if (Subtarget.hasBasicD()) { + setCondCodeAction(FPCCToExpand, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + } + + setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // Compute derived properties from the register classes. + computeRegisterProperties(STI.getRegisterInfo()); + + setStackPointerRegisterToSaveRestore(LoongArch::R3); + + setBooleanContents(ZeroOrOneBooleanContent); + + // Function alignments. + const Align FunctionAlignment(4); + setMinFunctionAlignment(FunctionAlignment); + + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::SRL); +} + +SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + report_fatal_error("unimplemented operand"); + case ISD::SHL_PARTS: + return lowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + return lowerShiftRightParts(Op, DAG, true); + case ISD::SRL_PARTS: + return lowerShiftRightParts(Op, DAG, false); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + // This can be called for an i32 shift amount that needs to be promoted. + assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + return SDValue(); + } +} + +SDValue LoongArchTargetLowering::lowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // if Shamt-GRLen < 0: // Shamt < GRLen + // Lo = Lo << Shamt + // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (GRLen-1 ^ Shamt)) + // else: + // Lo = 0 + // Hi = Lo << (Shamt-GRLen) + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue MinusGRLen = DAG.getConstant(-(int)Subtarget.getGRLen(), DL, VT); + SDValue GRLenMinus1 = DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT); + SDValue ShamtMinusGRLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusGRLen); + SDValue GRLenMinus1Shamt = DAG.getNode(ISD::XOR, DL, VT, Shamt, GRLenMinus1); + + SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); + SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One); + SDValue ShiftRightLo = + DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, GRLenMinus1Shamt); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt); + SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo); + SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusGRLen); + + SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusGRLen, Zero, ISD::SETLT); + + Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero); + Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); + + SDValue Parts[2] = {Lo, Hi}; + return DAG.getMergeValues(Parts, DL); +} + +SDValue LoongArchTargetLowering::lowerShiftRightParts(SDValue Op, + SelectionDAG &DAG, + bool IsSRA) const 
{ + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // SRA expansion: + // if Shamt-GRLen < 0: // Shamt < GRLen + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (ShAmt ^ GRLen-1)) + // Hi = Hi >>s Shamt + // else: + // Lo = Hi >>s (Shamt-GRLen); + // Hi = Hi >>s (GRLen-1) + // + // SRL expansion: + // if Shamt-GRLen < 0: // Shamt < GRLen + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (ShAmt ^ GRLen-1)) + // Hi = Hi >>u Shamt + // else: + // Lo = Hi >>u (Shamt-GRLen); + // Hi = 0; + + unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL; + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue MinusGRLen = DAG.getConstant(-(int)Subtarget.getGRLen(), DL, VT); + SDValue GRLenMinus1 = DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT); + SDValue ShamtMinusGRLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusGRLen); + SDValue GRLenMinus1Shamt = DAG.getNode(ISD::XOR, DL, VT, Shamt, GRLenMinus1); + + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt); + SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One); + SDValue ShiftLeftHi = + DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, GRLenMinus1Shamt); + SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi); + SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt); + SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusGRLen); + SDValue HiFalse = + IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, GRLenMinus1) : Zero; + + SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusGRLen, Zero, ISD::SETLT); + + Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse); + Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); + + SDValue Parts[2] = {Lo, Hi}; + return DAG.getMergeValues(Parts, DL); +} + +// Returns the opcode of the target-specific SDNode that implements the 32-bit +// form of the given Opcode. +static LoongArchISD::NodeType getLoongArchWOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::SHL: + return LoongArchISD::SLL_W; + case ISD::SRA: + return LoongArchISD::SRA_W; + case ISD::SRL: + return LoongArchISD::SRL_W; + } +} + +// Converts the given i8/i16/i32 operation to a target-specific SelectionDAG +// node. Because i8/i16/i32 isn't a legal type for LA64, these operations would +// otherwise be promoted to i64, making it difficult to select the +// SLL_W/.../*W later one because the fact the operation was originally of +// type i8/i16/i32 is lost. +static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG, + unsigned ExtOpc = ISD::ANY_EXTEND) { + SDLoc DL(N); + LoongArchISD::NodeType WOpcode = getLoongArchWOpcode(N->getOpcode()); + SDValue NewOp0 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(0)); + SDValue NewOp1 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(1)); + SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1); + // ReplaceNodeResults requires we maintain the same type for the return value. 
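+  // For example, (i32 (shl x, y)) becomes
+  // (i32 (truncate (SLL_W (i64 (any_extend x)), (i64 (any_extend y))))).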
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes); +} + +void LoongArchTargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + SDLoc DL(N); + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to legalize this operation"); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + if (N->getOperand(1).getOpcode() != ISD::Constant) { + Results.push_back(customLegalizeToWOp(N, DAG)); + break; + } + break; + } +} + +static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue FirstOperand = N->getOperand(0); + SDValue SecondOperand = N->getOperand(1); + unsigned FirstOperandOpc = FirstOperand.getOpcode(); + EVT ValTy = N->getValueType(0); + SDLoc DL(N); + uint64_t lsb, msb; + unsigned SMIdx, SMLen; + ConstantSDNode *CN; + SDValue NewOperand; + MVT GRLenVT = Subtarget.getGRLenVT(); + + // Op's second operand must be a shifted mask. + if (!(CN = dyn_cast(SecondOperand)) || + !isShiftedMask_64(CN->getZExtValue(), SMIdx, SMLen)) + return SDValue(); + + if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) { + // Pattern match BSTRPICK. + // $dst = and ((sra or srl) $src , lsb), (2**len - 1) + // => BSTRPICK $dst, $src, msb, lsb + // where msb = lsb + len - 1 + + // The second operand of the shift must be an immediate. + if (!(CN = dyn_cast(FirstOperand.getOperand(1)))) + return SDValue(); + + lsb = CN->getZExtValue(); + + // Return if the shifted mask does not start at bit 0 or the sum of its + // length and lsb exceeds the word's size. + if (SMIdx != 0 || lsb + SMLen > ValTy.getSizeInBits()) + return SDValue(); + + NewOperand = FirstOperand.getOperand(0); + } else { + // Pattern match BSTRPICK. + // $dst = and $src, (2**len- 1) , if len > 12 + // => BSTRPICK $dst, $src, msb, lsb + // where lsb = 0 and msb = len - 1 + + // If the mask is <= 0xfff, andi can be used instead. + if (CN->getZExtValue() <= 0xfff) + return SDValue(); + + // Return if the mask doesn't start at position 0. + if (SMIdx) + return SDValue(); + + lsb = 0; + NewOperand = FirstOperand; + } + msb = lsb + SMLen - 1; + return DAG.getNode(LoongArchISD::BSTRPICK, DL, ValTy, NewOperand, + DAG.getConstant(msb, DL, GRLenVT), + DAG.getConstant(lsb, DL, GRLenVT)); +} + +static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // $dst = srl (and $src, Mask), Shamt + // => + // BSTRPICK $dst, $src, MaskIdx+MaskLen-1, Shamt + // when Mask is a shifted mask, and MaskIdx <= Shamt <= MaskIdx+MaskLen-1 + // + + SDValue FirstOperand = N->getOperand(0); + ConstantSDNode *CN; + EVT ValTy = N->getValueType(0); + SDLoc DL(N); + MVT GRLenVT = Subtarget.getGRLenVT(); + unsigned MaskIdx, MaskLen; + uint64_t Shamt; + + // The first operand must be an AND and the second operand of the AND must be + // a shifted mask. + if (FirstOperand.getOpcode() != ISD::AND || + !(CN = dyn_cast(FirstOperand.getOperand(1))) || + !isShiftedMask_64(CN->getZExtValue(), MaskIdx, MaskLen)) + return SDValue(); + + // The second operand (shift amount) must be an immediate. 
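+  // For example, (srl (and $src, 0xff0), 4) reads bits [11:4] of $src
+  // (MaskIdx = 4, MaskLen = 8, Shamt = 4) and so becomes
+  // BSTRPICK $dst, $src, 11, 4.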
+ if (!(CN = dyn_cast(N->getOperand(1)))) + return SDValue(); + + Shamt = CN->getZExtValue(); + if (MaskIdx <= Shamt && Shamt <= MaskIdx + MaskLen - 1) + return DAG.getNode(LoongArchISD::BSTRPICK, DL, ValTy, + FirstOperand->getOperand(0), + DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(Shamt, DL, GRLenVT)); + + return SDValue(); +} + +SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::AND: + return performANDCombine(N, DAG, DCI, Subtarget); + case ISD::SRL: + return performSRLCombine(N, DAG, DCI, Subtarget); + } + return SDValue(); +} + +const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((LoongArchISD::NodeType)Opcode) { + case LoongArchISD::FIRST_NUMBER: + break; + +#define NODE_NAME_CASE(node) \ + case LoongArchISD::node: \ + return "LoongArchISD::" #node; + + // TODO: Add more target-dependent nodes later. + NODE_NAME_CASE(RET) + NODE_NAME_CASE(SLL_W) + NODE_NAME_CASE(SRA_W) + NODE_NAME_CASE(SRL_W) + NODE_NAME_CASE(BSTRPICK) + } +#undef NODE_NAME_CASE + return nullptr; +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// +// FIXME: Now, we only support CallingConv::C with fixed arguments which are +// passed with integer or floating-point registers. +const MCPhysReg ArgGPRs[] = {LoongArch::R4, LoongArch::R5, LoongArch::R6, + LoongArch::R7, LoongArch::R8, LoongArch::R9, + LoongArch::R10, LoongArch::R11}; +const MCPhysReg ArgFPR32s[] = {LoongArch::F0, LoongArch::F1, LoongArch::F2, + LoongArch::F3, LoongArch::F4, LoongArch::F5, + LoongArch::F6, LoongArch::F7}; +const MCPhysReg ArgFPR64s[] = { + LoongArch::F0_64, LoongArch::F1_64, LoongArch::F2_64, LoongArch::F3_64, + LoongArch::F4_64, LoongArch::F5_64, LoongArch::F6_64, LoongArch::F7_64}; + +// Implements the LoongArch calling convention. Returns true upon failure. +static bool CC_LoongArch(unsigned ValNo, MVT ValVT, + CCValAssign::LocInfo LocInfo, CCState &State) { + // Allocate to a register if possible. + Register Reg; + + if (ValVT == MVT::f32) + Reg = State.AllocateReg(ArgFPR32s); + else if (ValVT == MVT::f64) + Reg = State.AllocateReg(ArgFPR64s); + else + Reg = State.AllocateReg(ArgGPRs); + if (Reg) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, ValVT, LocInfo)); + return false; + } + + // TODO: Handle arguments passed without register. 
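+  // Returning true reports the assignment failure to CCState;
+  // analyzeInputArgs() and analyzeOutputArgs() currently treat such a
+  // failure as unreachable.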
+ return true; +} + +void LoongArchTargetLowering::analyzeInputArgs( + CCState &CCInfo, const SmallVectorImpl &Ins, + LoongArchCCAssignFn Fn) const { + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + MVT ArgVT = Ins[i].VT; + + if (Fn(i, ArgVT, CCValAssign::Full, CCInfo)) { + LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " + << EVT(ArgVT).getEVTString() << '\n'); + llvm_unreachable(""); + } + } +} + +void LoongArchTargetLowering::analyzeOutputArgs( + CCState &CCInfo, const SmallVectorImpl &Outs, + LoongArchCCAssignFn Fn) const { + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + MVT ArgVT = Outs[i].VT; + + if (Fn(i, ArgVT, CCValAssign::Full, CCInfo)) { + LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " + << EVT(ArgVT).getEVTString() << "\n"); + llvm_unreachable(""); + } + } +} + +static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, + const CCValAssign &VA, const SDLoc &DL, + const LoongArchTargetLowering &TLI) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + EVT LocVT = VA.getLocVT(); + const TargetRegisterClass *RC = TLI.getRegClassFor(LocVT.getSimpleVT()); + Register VReg = RegInfo.createVirtualRegister(RC); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + + return DAG.getCopyFromReg(Chain, DL, VReg, LocVT); +} + +// Transform physical registers into virtual registers. +SDValue LoongArchTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + + MachineFunction &MF = DAG.getMachineFunction(); + + switch (CallConv) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + break; + } + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + analyzeInputArgs(CCInfo, Ins, CC_LoongArch); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + InVals.push_back(unpackFromRegLoc(DAG, Chain, ArgLocs[i], DL, *this)); + + return Chain; +} + +bool LoongArchTargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + // Any return value split in to more than two values can't be returned + // directly. + return Outs.size() <= 2; +} + +SDValue LoongArchTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const { + // Stores the assignment of the return value to a location. + SmallVector RVLocs; + + // Info about the registers and stack slot. + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + analyzeOutputArgs(CCInfo, Outs, CC_LoongArch); + + SDValue Glue; + SmallVector RetOps(1, Chain); + + // Copy the result values into the output registers. + for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + // Handle a 'normal' return. + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Glue); + + // Guarantee that all emitted copies are stuck together. + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + RetOps[0] = Chain; // Update chain. + + // Add the glue node if we have it. 
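+  // The glue operand keeps the CopyToReg nodes attached to the RET node so
+  // the scheduler cannot separate them from the return.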
+ if (Glue.getNode()) + RetOps.push_back(Glue); + + return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h new file mode 100644 index 000000000000..c852577a3744 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -0,0 +1,95 @@ +//=- LoongArchISelLowering.h - LoongArch DAG Lowering Interface -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that LoongArch uses to lower LLVM code into +// a selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELLOWERING_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELLOWERING_H + +#include "LoongArch.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" + +namespace llvm { +class LoongArchSubtarget; +struct LoongArchRegisterInfo; +namespace LoongArchISD { +enum NodeType : unsigned { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + // TODO: add more LoongArchISDs + RET, + // 32-bit shifts, directly matching the semantics of the named LoongArch + // instructions. + SLL_W, + SRA_W, + SRL_W, + + BSTRPICK, + +}; +} // namespace LoongArchISD + +class LoongArchTargetLowering : public TargetLowering { + const LoongArchSubtarget &Subtarget; + +public: + explicit LoongArchTargetLowering(const TargetMachine &TM, + const LoongArchSubtarget &STI); + + const LoongArchSubtarget &getSubtarget() const { return Subtarget; } + + // Provide custom lowering hooks for some operations. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + // This method returns the name of a target specific DAG node. + const char *getTargetNodeName(unsigned Opcode) const override; + + // Lower incoming arguments, copy physregs into vregs. + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, + const SmallVectorImpl &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool IsVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const override; + +private: + /// Target-specific function used to lower LoongArch calling conventions. 
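+  /// Such a function returns true if the argument could not be assigned to
+  /// a location, matching the convention of CC_LoongArch() above.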
+ typedef bool LoongArchCCAssignFn(unsigned ValNo, MVT ValVT, + CCValAssign::LocInfo LocInfo, + CCState &State); + + void analyzeInputArgs(CCState &CCInfo, + const SmallVectorImpl &Ins, + LoongArchCCAssignFn Fn) const; + void analyzeOutputArgs(CCState &CCInfo, + const SmallVectorImpl &Outs, + LoongArchCCAssignFn Fn) const; + + SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELLOWERING_H diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td new file mode 100644 index 000000000000..bebc83a861ae --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td @@ -0,0 +1,404 @@ +//===- LoongArchInstrFormats.td - LoongArch Instr. Formats -*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe LoongArch instructions format +// +// opcode - operation code. +// rd - destination register operand. +// r{j/k} - source register operand. +// immN - immediate data operand. +// +//===----------------------------------------------------------------------===// + +class LAInst pattern = []> + : Instruction { + field bits<32> Inst; + // SoftFail is a field the disassembler can use to provide a way for + // instructions to not match without killing the whole decode process. It is + // mainly used for ARM, but Tablegen expects this field to exist or it fails + // to build the decode table. 
+ field bits<32> SoftFail = 0; + + let Namespace = "LoongArch"; + let Size = 4; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = opcstr # "\t" # opnstr; + let Pattern = pattern; +} + +// Pseudo instructions +class Pseudo pattern = [], string opcstr = "", + string opnstr = ""> + : LAInst { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +// 2R-type +// +class Fmt2R op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<5> rj; + bits<5> rd; + + let Inst{31-10} = op; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 3R-type +// +class Fmt3R op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<5> rk; + bits<5> rj; + bits<5> rd; + + let Inst{31-15} = op; + let Inst{14-10} = rk; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 3RI2-type +// +class Fmt3RI2 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<2> imm2; + bits<5> rk; + bits<5> rj; + bits<5> rd; + + let Inst{31-17} = op; + let Inst{16-15} = imm2; + let Inst{14-10} = rk; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 3RI3-type +// +class Fmt3RI3 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<3> imm3; + bits<5> rk; + bits<5> rj; + bits<5> rd; + + let Inst{31-18} = op; + let Inst{17-15} = imm3; + let Inst{14-10} = rk; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI5-type +// +class Fmt2RI5 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<5> imm5; + bits<5> rj; + bits<5> rd; + + let Inst{31-15} = op; + let Inst{14-10} = imm5; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI6-type +// +class Fmt2RI6 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<6> imm6; + bits<5> rj; + bits<5> rd; + + let Inst{31-16} = op; + let Inst{15-10} = imm6; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI8-type +// +class Fmt2RI8 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<8> imm8; + bits<5> rj; + bits<5> rd; + + let Inst{31-18} = op; + let Inst{17-10} = imm8; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI12-type +// +class Fmt2RI12 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<12> imm12; + bits<5> rj; + bits<5> rd; + + let Inst{31-22} = op; + let Inst{21-10} = imm12; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI14-type +// +class Fmt2RI14 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<14> imm14; + bits<5> rj; + bits<5> rd; + + let Inst{31-24} = op; + let Inst{23-10} = imm14; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 2RI16-type +// +class Fmt2RI16 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<16> imm16; + bits<5> rj; + bits<5> rd; + + let Inst{31-26} = op; + let Inst{25-10} = imm16; + let Inst{9-5} = rj; + let Inst{4-0} = rd; +} + +// 1RI20-type +// +class Fmt1RI20 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<20> imm20; + bits<5> rd; + + let Inst{31-25} = op; + let Inst{24-5} = imm20; + let Inst{4-0} = rd; +} + +// 1RI21-type +// +class Fmt1RI21 op, dag outs, dag ins, string opcstr, string opnstr, + list pattern = []> + : LAInst { + bits<21> imm21; + bits<5> rj; + + let Inst{31-26} = op; + let Inst{25-10} = imm21{15-0}; + let Inst{9-5} = rj; + let Inst{4-0} = imm21{20-16}; +} 
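+
+// As an illustration of how these formats are instantiated (the actual
+// instruction definitions live in LoongArchInstrInfo.td), a 2RI12-type
+// instruction such as addi.w would be defined roughly as
+//   def ADDI_W : Fmt2RI12<0b0000001010, (outs GPR:$rd),
+//                         (ins GPR:$rj, simm12:$imm12), "addi.w",
+//                         "$rd, $rj, $imm12">;
+// placing the opcode in Inst{31-22}, imm12 in Inst{21-10}, rj in Inst{9-5}
+// and rd in Inst{4-0}.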
+
+// I15-type
+// <opcode | I15>
+class FmtI15<bits<17> op, dag outs, dag ins, string opcstr, string opnstr,
+             list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<15> imm15;
+
+  let Inst{31-15} = op;
+  let Inst{14-0} = imm15;
+}
+
+// I26-type
+// <opcode | I26[15:0] | I26[25:16]>
+class FmtI26<bits<6> op, dag outs, dag ins, string opcstr, string opnstr,
+             list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<26> imm26;
+
+  let Inst{31-26} = op;
+  let Inst{25-10} = imm26{15-0};
+  let Inst{9-0} = imm26{25-16};
+}
+
+// FmtBSTR_W
+// <opcode[11:1] | msbw | opcode[0] | lsbw | rj | rd>
+class FmtBSTR_W<bits<12> op, dag outs, dag ins, string opcstr, string opnstr,
+                list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> msbw;
+  bits<5> lsbw;
+  bits<5> rj;
+  bits<5> rd;
+
+  let Inst{31-21} = op{11-1};
+  let Inst{20-16} = msbw;
+  let Inst{15} = op{0};
+  let Inst{14-10} = lsbw;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = rd;
+}
+
+// FmtBSTR_D
+// <opcode | msbd | lsbd | rj | rd>
+class FmtBSTR_D<bits<10> op, dag outs, dag ins, string opcstr, string opnstr,
+                list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<6> msbd;
+  bits<6> lsbd;
+  bits<5> rj;
+  bits<5> rd;
+
+  let Inst{31-22} = op;
+  let Inst{21-16} = msbd;
+  let Inst{15-10} = lsbd;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = rd;
+}
+
+// FmtASRT
+// <opcode | rk | rj | 0x0>
+class FmtASRT<bits<17> op, dag outs, dag ins, string opcstr, string opnstr,
+              list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> rk;
+  bits<5> rj;
+
+  let Inst{31-15} = op;
+  let Inst{14-10} = rk;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = 0x0;
+}
+
+// FmtPRELD
+// < 0b0010101011 | I12 | rj | I5>
+class FmtPRELD<dag outs, dag ins, string opcstr, string opnstr,
+               list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<12> imm12;
+  bits<5> rj;
+  bits<5> imm5;
+
+  let Inst{31-22} = 0b0010101011;
+  let Inst{21-10} = imm12;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = imm5;
+}
+
+// FmtPRELDX
+// < 0b00111000001011000 | rk | rj | I5>
+class FmtPRELDX<dag outs, dag ins, string opcstr, string opnstr,
+                list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> rk;
+  bits<5> rj;
+  bits<5> imm5;
+
+  let Inst{31-15} = 0b00111000001011000;
+  let Inst{14-10} = rk;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = imm5;
+}
+
+// FmtCSR
+// <opcode[12:5] | csr_num | opcode[4:0] | rd>
+class FmtCSR<bits<13> op, dag outs, dag ins, string opcstr, string opnstr,
+             list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<14> csr_num;
+  bits<5> rd;
+
+  let Inst{31-24} = op{12-5};
+  let Inst{23-10} = csr_num;
+  let Inst{9-5} = op{4-0};
+  let Inst{4-0} = rd;
+}
+
+// FmtCSRXCHG
+// <opcode | csr_num | rj | rd>
+class FmtCSRXCHG<bits<8> op, dag outs, dag ins, string opcstr, string opnstr,
+                 list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<14> csr_num;
+  bits<5> rj;
+  bits<5> rd;
+
+  let Inst{31-24} = op;
+  let Inst{23-10} = csr_num;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = rd;
+}
+
+// FmtCACOP
+// <0b0000011000 | I12 | rj | I5>
+class FmtCACOP<dag outs, dag ins, string opcstr, string opnstr,
+               list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<12> imm12;
+  bits<5> rj;
+  bits<5> op;
+
+  let Inst{31-22} = 0b0000011000;
+  let Inst{21-10} = imm12;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = op;
+}
+
+// FmtIMM32
+// <I32>
+class FmtI32<bits<32> op, string opstr, list<dag> pattern = []>
+    : LAInst<(outs), (ins), opstr, "", pattern> {
+  let Inst{31-0} = op;
+}
+
+// FmtINVTLB
+// <0b00000110010010011 | rk | rj | I5>
+class FmtINVTLB<dag outs, dag ins, string opcstr, string opnstr,
+                list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<5> rk;
+  bits<5> rj;
+  bits<5> op;
+
+  let Inst{31-15} = 0b00000110010010011;
+  let Inst{14-10} = rk;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = op;
+}
+
+// FmtLDPTE
+// <0b00000110010001 | seq | rj | 00000>
+class FmtLDPTE<dag outs, dag ins, string opcstr, string opnstr,
+               list<dag> pattern = []>
+    : LAInst<outs, ins, opcstr, opnstr, pattern> {
+  bits<8> seq;
+  bits<5> rj;
+
+  let Inst{31-18} = 0b00000110010001;
+  let Inst{17-10} = seq;
+  let Inst{9-5} = rj;
+  let Inst{4-0} = 0b00000;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
new file mode 100644
index 000000000000..146ef53befd5
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -0,0 +1,49 @@
+//=- LoongArchInstrInfo.cpp
- LoongArch Instruction Information -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the LoongArch implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "LoongArchInstrInfo.h" +#include "LoongArch.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "LoongArchGenInstrInfo.inc" + +LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) + // FIXME: add CFSetup and CFDestroy Inst when we implement function call. + : LoongArchGenInstrInfo() {} + +void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, MCRegister DstReg, + MCRegister SrcReg, bool KillSrc) const { + if (LoongArch::GPRRegClass.contains(DstReg, SrcReg)) { + BuildMI(MBB, MBBI, DL, get(LoongArch::OR), DstReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(LoongArch::R0); + return; + } + + // FPR->FPR copies. + unsigned Opc; + if (LoongArch::FPR32RegClass.contains(DstReg, SrcReg)) { + Opc = LoongArch::FMOV_S; + } else if (LoongArch::FPR64RegClass.contains(DstReg, SrcReg)) { + Opc = LoongArch::FMOV_D; + } else { + // TODO: support other copies. + llvm_unreachable("Impossible reg-to-reg copy"); + } + + BuildMI(MBB, MBBI, DL, get(Opc), DstReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h new file mode 100644 index 000000000000..f31943b85a51 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -0,0 +1,36 @@ +//=- LoongArchInstrInfo.h - LoongArch Instruction Information ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the LoongArch implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHINSTRINFO_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHINSTRINFO_H + +#include "LoongArchRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "LoongArchGenInstrInfo.inc" + +namespace llvm { + +class LoongArchSubtarget; + +class LoongArchInstrInfo : public LoongArchGenInstrInfo { +public: + explicit LoongArchInstrInfo(LoongArchSubtarget &STI); + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, + bool KillSrc) const override; +}; + +} // end namespace llvm +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHINSTRINFO_H diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td new file mode 100644 index 000000000000..6b8ee9e43f94 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -0,0 +1,730 @@ +//== LoongArchInstrInfo.td - Target Description for LoongArch -*- tablegen -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the LoongArch instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LoongArch specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Target-dependent type requirements.
+def SDT_LoongArchIntBinOpW : SDTypeProfile<1, 2, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>
+]>;
+
+def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [
+  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3>
+]>;
+
+// TODO: Add LoongArch specific DAG Nodes
+// Target-dependent nodes.
+def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
+def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
+def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
+def loongarch_bstrpick
+    : SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>;
+
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+class ImmAsmOperand<string prefix, int width, string suffix>
+    : AsmOperandClass {
+  let Name = prefix # "Imm" # width # suffix;
+  let DiagnosticType = !strconcat("Invalid", Name);
+  let RenderMethod = "addImmOperands";
+}
+
+class SImmAsmOperand<int width, string suffix = "">
+    : ImmAsmOperand<"S", width, suffix> {
+}
+
+class UImmAsmOperand<int width, string suffix = "">
+    : ImmAsmOperand<"U", width, suffix> {
+}
+
+def uimm2 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<2>;
+}
+
+def uimm2_plus1 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<2, "plus1">;
+  let EncoderMethod = "getImmOpValueSub1";
+  let DecoderMethod = "decodeUImmOperand<2, 1>";
+}
+
+def uimm3 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<3>;
+}
+
+def uimm5 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isUInt<5>(Imm);}]> {
+  let ParserMatchClass = UImmAsmOperand<5>;
+}
+
+def uimm6 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isUInt<6>(Imm);}]> {
+  let ParserMatchClass = UImmAsmOperand<6>;
+}
+
+def uimm8 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<8>;
+}
+
+def uimm12 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isUInt<12>(Imm);}]> {
+  let ParserMatchClass = UImmAsmOperand<12>;
+}
+
+def uimm14 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<14>;
+}
+
+def uimm15 : Operand<GRLenVT> {
+  let ParserMatchClass = UImmAsmOperand<15>;
+}
+
+def simm12 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isInt<12>(Imm);}]> {
+  let ParserMatchClass = SImmAsmOperand<12>;
+  let DecoderMethod = "decodeSImmOperand<12>";
+}
+
+def simm14_lsl2 : Operand<GRLenVT> {
+  let ParserMatchClass = SImmAsmOperand<14, "lsl2">;
+  let EncoderMethod = "getImmOpValueAsr2";
+  let DecoderMethod = "decodeSImmOperand<14, 2>";
+}
+
+def simm16 : Operand<GRLenVT> {
+  let ParserMatchClass = SImmAsmOperand<16>;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+
+def simm16_lsl2 : Operand<GRLenVT> {
+  let ParserMatchClass = SImmAsmOperand<16, "lsl2">;
+  let EncoderMethod = "getImmOpValueAsr2";
+  let DecoderMethod = "decodeSImmOperand<16, 2>";
+}
+
+def simm20 : Operand<GRLenVT> {
+  let ParserMatchClass = SImmAsmOperand<20>;
+  let DecoderMethod = "decodeSImmOperand<20>";
+}
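The operand classes above pair an assembler match class with an optional ImmLeaf predicate, and the *_lsl2 variants store a 4-byte-scaled offset (encoded as imm >> 2 by getImmOpValueAsr2). A standalone C++ sketch of the resulting value ranges; the isIntN/isUIntN helpers below are local stand-ins for llvm::isInt/llvm::isUInt, written out so the example runs on its own:

#include <cstdint>
#include <cstdio>

// Range checks equivalent to the isInt<N>/isUInt<N> predicates used above.
static bool isIntN(unsigned N, int64_t Imm) {
  return Imm >= -(INT64_C(1) << (N - 1)) && Imm < (INT64_C(1) << (N - 1));
}
static bool isUIntN(unsigned N, int64_t Imm) {
  return Imm >= 0 && Imm < (INT64_C(1) << N);
}

int main() {
  // simm12 accepts [-2048, 2047]; uimm12 accepts [0, 4095].
  std::printf("%d %d\n", isIntN(12, -2048), isIntN(12, 2048));  // 1 0
  std::printf("%d %d\n", isUIntN(12, 4095), isUIntN(12, 4096)); // 1 0
  // simm16_lsl2 holds an 18-bit, 4-byte-aligned branch offset as offset >> 2.
  int64_t Offset = 0x1fffc; // largest forward offset of a 2RI16 branch
  std::printf("%d\n", (Offset & 3) == 0 && isIntN(16, Offset >> 2)); // 1
  return 0;
}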
"getImmOpValueAsr2"; + let DecoderMethod = "decodeSImmOperand<21, 2>"; +} + +def simm26_lsl2 : Operand { + let ParserMatchClass = SImmAsmOperand<26, "lsl2">; + let EncoderMethod = "getImmOpValueAsr2"; + let DecoderMethod = "decodeSImmOperand<26, 2>"; +} + +// Standalone (codegen-only) immleaf patterns. + +// A 12-bit signed immediate plus one where the imm range will be [-2047, 2048]. +def simm12_plus1 : ImmLeaf(Imm) && Imm != -2048) || Imm == 2048;}]>; + +// Return the negation of an immediate value. +def NegImm : SDNodeXFormgetTargetConstant(-N->getSExtValue(), SDLoc(N), + N->getValueType(0)); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction Formats +//===----------------------------------------------------------------------===// + +include "LoongArchInstrFormats.td" +include "LoongArchFloatInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Instruction Class Templates +//===----------------------------------------------------------------------===// + +class ALU_3R op, string opstr> + : Fmt3R; +class ALU_2R op, string opstr> + : Fmt2R; + +class ALU_3RI2 op, string opstr, Operand ImmOpnd> + : Fmt3RI2; +class ALU_3RI3 op, string opstr, Operand ImmOpnd> + : Fmt3RI3; +class ALU_2RI5 op, string opstr, Operand ImmOpnd> + : Fmt2RI5; +class ALU_2RI6 op, string opstr, Operand ImmOpnd> + : Fmt2RI6; +class ALU_2RI12 op, string opstr, Operand ImmOpnd> + : Fmt2RI12; +class ALU_2RI16 op, string opstr, Operand ImmOpnd> + : Fmt2RI16; +class ALU_1RI20 op, string opstr, Operand ImmOpnd> + : Fmt1RI20; + +class MISC_I15 op, string opstr> + : FmtI15; + +class RDTIME_2R op, string opstr> + : Fmt2R; + +class BrCC_2RI16 op, string opstr> + : Fmt2RI16 { + let isBranch = 1; + let isTerminator = 1; +} +class BrCCZ_1RI21 op, string opstr> + : Fmt1RI21 { + let isBranch = 1; + let isTerminator = 1; +} +class Br_I26 op, string opstr> + : FmtI26 { + let isBranch = 1; + let isTerminator = 1; +} + +let mayLoad = 1 in { +class LOAD_3R op, string opstr> + : Fmt3R; +class LOAD_2RI12 op, string opstr> + : Fmt2RI12; +class LOAD_2RI14 op, string opstr> + : Fmt2RI14; +} // mayLoad = 1 + +let mayStore = 1 in { +class STORE_3R op, string opstr> + : Fmt3R; +class STORE_2RI12 op, string opstr> + : Fmt2RI12; +class STORE_2RI14 op, string opstr> + : Fmt2RI14; +} // mayStore = 1 + +let mayLoad = 1, mayStore = 1 in +class AM_3R op, string opstr> + : Fmt3R; + +let mayLoad = 1 in +class LLBase op, string opstr> + : Fmt2RI14; + +let mayStore = 1, Constraints = "$rd = $dst" in +class SCBase op, string opstr> + : Fmt2RI14; + +class IOCSRRD op, string opstr> + : Fmt2R; + +class IOCSRWR op, string opstr> + : Fmt2R; + +//===----------------------------------------------------------------------===// +// Basic Integer Instructions +//===----------------------------------------------------------------------===// + +// Arithmetic Operation Instructions +def ADD_W : ALU_3R<0b00000000000100000, "add.w">; +def SUB_W : ALU_3R<0b00000000000100010, "sub.w">; +def ADDI_W : ALU_2RI12<0b0000001010, "addi.w", simm12>; +def ALSL_W : ALU_3RI2<0b000000000000010, "alsl.w", uimm2_plus1>; +def LU12I_W : ALU_1RI20<0b0001010, "lu12i.w", simm20>; +def SLT : ALU_3R<0b00000000000100100, "slt">; +def SLTU : ALU_3R<0b00000000000100101, "sltu">; +def SLTI : ALU_2RI12<0b0000001000, "slti", simm12>; +def SLTUI : ALU_2RI12<0b0000001001, "sltui", simm12>; +def PCADDI : ALU_1RI20<0b0001100, "pcaddi", simm20>; +def PCADDU12I : ALU_1RI20<0b0001110, "pcaddu12i", simm20>; 
+def PCALAU12I : ALU_1RI20<0b0001101, "pcalau12i", simm20>; +def AND : ALU_3R<0b00000000000101001, "and">; +def OR : ALU_3R<0b00000000000101010, "or">; +def NOR : ALU_3R<0b00000000000101000, "nor">; +def XOR : ALU_3R<0b00000000000101011, "xor">; +def ANDN : ALU_3R<0b00000000000101101, "andn">; +def ORN : ALU_3R<0b00000000000101100, "orn">; +def ANDI : ALU_2RI12<0b0000001101, "andi", uimm12>; +def ORI : ALU_2RI12<0b0000001110, "ori", uimm12>; +def XORI : ALU_2RI12<0b0000001111, "xori", uimm12>; +def MUL_W : ALU_3R<0b00000000000111000, "mul.w">; +def MULH_W : ALU_3R<0b00000000000111001, "mulh.w">; +def MULH_WU : ALU_3R<0b00000000000111010, "mulh.wu">; +def DIV_W : ALU_3R<0b00000000001000000, "div.w">; +def MOD_W : ALU_3R<0b00000000001000001, "mod.w">; +def DIV_WU : ALU_3R<0b00000000001000010, "div.wu">; +def MOD_WU : ALU_3R<0b00000000001000011, "mod.wu">; + +// Bit-shift Instructions +def SLL_W : ALU_3R<0b00000000000101110, "sll.w">; +def SRL_W : ALU_3R<0b00000000000101111, "srl.w">; +def SRA_W : ALU_3R<0b00000000000110000, "sra.w">; +def ROTR_W : ALU_3R<0b00000000000110110, "rotr.w">; + +def SLLI_W : ALU_2RI5<0b00000000010000001, "slli.w", uimm5>; +def SRLI_W : ALU_2RI5<0b00000000010001001, "srli.w", uimm5>; +def SRAI_W : ALU_2RI5<0b00000000010010001, "srai.w", uimm5>; +def ROTRI_W : ALU_2RI5<0b00000000010011001, "rotri.w", uimm5>; + +// Bit-manipulation Instructions +def EXT_W_B : ALU_2R<0b0000000000000000010111, "ext.w.b">; +def EXT_W_H : ALU_2R<0b0000000000000000010110, "ext.w.h">; +def CLO_W : ALU_2R<0b0000000000000000000100, "clo.w">; +def CLZ_W : ALU_2R<0b0000000000000000000101, "clz.w">; +def CTO_W : ALU_2R<0b0000000000000000000110, "cto.w">; +def CTZ_W : ALU_2R<0b0000000000000000000111, "ctz.w">; +def BYTEPICK_W : ALU_3RI2<0b000000000000100, "bytepick.w", uimm2>; +def REVB_2H : ALU_2R<0b0000000000000000001100, "revb.2h">; +def BITREV_4B : ALU_2R<0b0000000000000000010010, "bitrev.4b">; +def BITREV_W : ALU_2R<0b0000000000000000010100, "bitrev.w">; +let Constraints = "$rd = $dst" in { +def BSTRINS_W : FmtBSTR_W<0b000000000110, (outs GPR:$dst), + (ins GPR:$rd, GPR:$rj, uimm5:$msbw, uimm5:$lsbw), + "bstrins.w", "$rd, $rj, $msbw, $lsbw">; +} +def BSTRPICK_W : FmtBSTR_W<0b000000000111, (outs GPR:$rd), + (ins GPR:$rj, uimm5:$msbw, uimm5:$lsbw), + "bstrpick.w", "$rd, $rj, $msbw, $lsbw">; +def MASKEQZ : ALU_3R<0b00000000000100110, "maskeqz">; +def MASKNEZ : ALU_3R<0b00000000000100111, "masknez">; + +// Branch Instructions +def BEQ : BrCC_2RI16<0b010110, "beq">; +def BNE : BrCC_2RI16<0b010111, "bne">; +def BLT : BrCC_2RI16<0b011000, "blt">; +def BGE : BrCC_2RI16<0b011001, "bge">; +def BLTU : BrCC_2RI16<0b011010, "bltu">; +def BGEU : BrCC_2RI16<0b011011, "bgeu">; +def BEQZ : BrCCZ_1RI21<0b010000, "beqz">; +def BNEZ : BrCCZ_1RI21<0b010001, "bnez">; +def B : Br_I26<0b010100, "b">; + +let isCall = 1 in +def BL : FmtI26<0b010101, (outs), (ins simm26_lsl2:$imm26), "bl", "$imm26">; +def JIRL : Fmt2RI16<0b010011, (outs GPR:$rd), + (ins GPR:$rj, simm16_lsl2:$imm16), "jirl", + "$rd, $rj, $imm16">; + +// Common Memory Access Instructions +def LD_B : LOAD_2RI12<0b0010100000, "ld.b">; +def LD_H : LOAD_2RI12<0b0010100001, "ld.h">; +def LD_W : LOAD_2RI12<0b0010100010, "ld.w">; +def LD_BU : LOAD_2RI12<0b0010101000, "ld.bu">; +def LD_HU : LOAD_2RI12<0b0010101001, "ld.hu">; +def ST_B : STORE_2RI12<0b0010100100, "st.b">; +def ST_H : STORE_2RI12<0b0010100101, "st.h">; +def ST_W : STORE_2RI12<0b0010100110, "st.w">; +def PRELD : FmtPRELD<(outs), (ins uimm5:$imm5, GPR:$rj, simm12:$imm12), "preld", + "$imm5, $rj, 
$imm12">; + +// Atomic Memory Access Instructions +def LL_W : LLBase<0b00100000, "ll.w">; +def SC_W : SCBase<0b00100001, "sc.w">; + +// Barrier Instructions +def DBAR : MISC_I15<0b00111000011100100, "dbar">; +def IBAR : MISC_I15<0b00111000011100101, "ibar">; + +// Other Miscellaneous Instructions +def SYSCALL : MISC_I15<0b00000000001010110, "syscall">; +def BREAK : MISC_I15<0b00000000001010100, "break">; +def RDTIMEL_W : RDTIME_2R<0b0000000000000000011000, "rdtimel.w">; +def RDTIMEH_W : RDTIME_2R<0b0000000000000000011001, "rdtimeh.w">; +def CPUCFG : ALU_2R<0b0000000000000000011011, "cpucfg">; + +/// LA64 instructions + +let Predicates = [IsLA64] in { + +// Arithmetic Operation Instructions for 64-bits +def ADD_D : ALU_3R<0b00000000000100001, "add.d">; +def SUB_D : ALU_3R<0b00000000000100011, "sub.d">; +def ADDI_D : ALU_2RI12<0b0000001011, "addi.d", simm12>; +def ADDU16I_D : ALU_2RI16<0b000100, "addu16i.d", simm16>; +def ALSL_WU : ALU_3RI2<0b000000000000011, "alsl.wu", uimm2_plus1>; +def ALSL_D : ALU_3RI2<0b000000000010110, "alsl.d", uimm2_plus1>; +let Constraints = "$rd = $dst" in { +def LU32I_D : Fmt1RI20<0b0001011, (outs GPR:$dst), + (ins GPR:$rd, simm20:$imm20), "lu32i.d", + "$rd, $imm20">; +} +def LU52I_D : ALU_2RI12<0b0000001100, "lu52i.d", simm12>; +def PCADDU18I : ALU_1RI20<0b0001111, "pcaddu18i", simm20>; +def MUL_D : ALU_3R<0b00000000000111011, "mul.d">; +def MULH_D : ALU_3R<0b00000000000111100, "mulh.d">; +def MULH_DU : ALU_3R<0b00000000000111101, "mulh.du">; +def MULW_D_W : ALU_3R<0b00000000000111110, "mulw.d.w">; +def MULW_D_WU : ALU_3R<0b00000000000111111, "mulw.d.wu">; +def DIV_D : ALU_3R<0b00000000001000100, "div.d">; +def MOD_D : ALU_3R<0b00000000001000101, "mod.d">; +def DIV_DU : ALU_3R<0b00000000001000110, "div.du">; +def MOD_DU : ALU_3R<0b00000000001000111, "mod.du">; + +// Bit-shift Instructions for 64-bits +def SLL_D : ALU_3R<0b00000000000110001, "sll.d">; +def SRL_D : ALU_3R<0b00000000000110010, "srl.d">; +def SRA_D : ALU_3R<0b00000000000110011, "sra.d">; +def ROTR_D : ALU_3R<0b00000000000110111, "rotr.d">; +def SLLI_D : ALU_2RI6<0b0000000001000001, "slli.d", uimm6>; +def SRLI_D : ALU_2RI6<0b0000000001000101, "srli.d", uimm6>; +def SRAI_D : ALU_2RI6<0b0000000001001001, "srai.d", uimm6>; +def ROTRI_D : ALU_2RI6<0b0000000001001101, "rotri.d", uimm6>; + +// Bit-manipulation Instructions for 64-bits +def CLO_D : ALU_2R<0b0000000000000000001000, "clo.d">; +def CLZ_D : ALU_2R<0b0000000000000000001001, "clz.d">; +def CTO_D : ALU_2R<0b0000000000000000001010, "cto.d">; +def CTZ_D : ALU_2R<0b0000000000000000001011, "ctz.d">; +def BYTEPICK_D : ALU_3RI3<0b00000000000011, "bytepick.d", uimm3>; +def REVB_4H : ALU_2R<0b0000000000000000001101, "revb.4h">; +def REVB_2W : ALU_2R<0b0000000000000000001110, "revb.2w">; +def REVB_D : ALU_2R<0b0000000000000000001111, "revb.d">; +def REVH_2W : ALU_2R<0b0000000000000000010000, "revh.2w">; +def REVH_D : ALU_2R<0b0000000000000000010001, "revh.d">; +def BITREV_8B : ALU_2R<0b0000000000000000010011, "bitrev.8b">; +def BITREV_D : ALU_2R<0b0000000000000000010101, "bitrev.d">; +let Constraints = "$rd = $dst" in { +def BSTRINS_D : FmtBSTR_D<0b0000000010, (outs GPR:$dst), + (ins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + "bstrins.d", "$rd, $rj, $msbd, $lsbd">; +} +def BSTRPICK_D : FmtBSTR_D<0b0000000011, (outs GPR:$rd), + (ins GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + "bstrpick.d", "$rd, $rj, $msbd, $lsbd">; + +// Common Memory Access Instructions for 64-bits +def LD_WU : LOAD_2RI12<0b0010101010, "ld.wu">; +def LD_D : LOAD_2RI12<0b0010100011, "ld.d">; 
+def ST_D : STORE_2RI12<0b0010100111, "st.d">; +def LDX_B : LOAD_3R<0b00111000000000000, "ldx.b">; +def LDX_H : LOAD_3R<0b00111000000001000, "ldx.h">; +def LDX_W : LOAD_3R<0b00111000000010000, "ldx.w">; +def LDX_D : LOAD_3R<0b00111000000011000, "ldx.d">; +def LDX_BU : LOAD_3R<0b00111000001000000, "ldx.bu">; +def LDX_HU : LOAD_3R<0b00111000001001000, "ldx.hu">; +def LDX_WU : LOAD_3R<0b00111000001010000, "ldx.wu">; +def STX_B : STORE_3R<0b00111000000100000, "stx.b">; +def STX_H : STORE_3R<0b00111000000101000, "stx.h">; +def STX_W : STORE_3R<0b00111000000110000, "stx.w">; +def STX_D : STORE_3R<0b00111000000111000, "stx.d">; +def LDPTR_W : LOAD_2RI14<0b00100100, "ldptr.w">; +def LDPTR_D : LOAD_2RI14<0b00100110, "ldptr.d">; +def STPTR_W : STORE_2RI14<0b00100101, "stptr.w">; +def STPTR_D : STORE_2RI14<0b00100111, "stptr.d">; +def PRELDX : FmtPRELDX<(outs), (ins uimm5:$imm5, GPR:$rj, GPR:$rk), "preldx", + "$imm5, $rj, $rk">; + +// Bound Check Memory Access Instructions +def LDGT_B : LOAD_3R<0b00111000011110000, "ldgt.b">; +def LDGT_H : LOAD_3R<0b00111000011110001, "ldgt.h">; +def LDGT_W : LOAD_3R<0b00111000011110010, "ldgt.w">; +def LDGT_D : LOAD_3R<0b00111000011110011, "ldgt.d">; +def LDLE_B : LOAD_3R<0b00111000011110100, "ldle.b">; +def LDLE_H : LOAD_3R<0b00111000011110101, "ldle.h">; +def LDLE_W : LOAD_3R<0b00111000011110110, "ldle.w">; +def LDLE_D : LOAD_3R<0b00111000011110111, "ldle.d">; +def STGT_B : STORE_3R<0b00111000011111000, "stgt.b">; +def STGT_H : STORE_3R<0b00111000011111001, "stgt.h">; +def STGT_W : STORE_3R<0b00111000011111010, "stgt.w">; +def STGT_D : STORE_3R<0b00111000011111011, "stgt.d">; +def STLE_B : STORE_3R<0b00111000011111100, "stle.b">; +def STLE_H : STORE_3R<0b00111000011111101, "stle.h">; +def STLE_W : STORE_3R<0b00111000011111110, "stle.w">; +def STLE_D : STORE_3R<0b00111000011111111, "stle.d">; + +// Atomic Memory Access Instructions for 64-bits +def AMSWAP_W : AM_3R<0b00111000011000000, "amswap.w">; +def AMSWAP_D : AM_3R<0b00111000011000001, "amswap.d">; +def AMADD_W : AM_3R<0b00111000011000010, "amadd.w">; +def AMADD_D : AM_3R<0b00111000011000011, "amadd.d">; +def AMAND_W : AM_3R<0b00111000011000100, "amand.w">; +def AMAND_D : AM_3R<0b00111000011000101, "amand.d">; +def AMOR_W : AM_3R<0b00111000011000110, "amor.w">; +def AMOR_D : AM_3R<0b00111000011000111, "amor.d">; +def AMXOR_W : AM_3R<0b00111000011001000, "amxor.w">; +def AMXOR_D : AM_3R<0b00111000011001001, "amxor.d">; +def AMMAX_W : AM_3R<0b00111000011001010, "ammax.w">; +def AMMAX_D : AM_3R<0b00111000011001011, "ammax.d">; +def AMMIN_W : AM_3R<0b00111000011001100, "ammin.w">; +def AMMIN_D : AM_3R<0b00111000011001101, "ammin.d">; +def AMMAX_WU : AM_3R<0b00111000011001110, "ammax.wu">; +def AMMAX_DU : AM_3R<0b00111000011001111, "ammax.du">; +def AMMIN_WU : AM_3R<0b00111000011010000, "ammin.wu">; +def AMMIN_DU : AM_3R<0b00111000011010001, "ammin.du">; +def AMSWAP_DB_W : AM_3R<0b00111000011010010, "amswap_db.w">; +def AMSWAP_DB_D : AM_3R<0b00111000011010011, "amswap_db.d">; +def AMADD_DB_W : AM_3R<0b00111000011010100, "amadd_db.w">; +def AMADD_DB_D : AM_3R<0b00111000011010101, "amadd_db.d">; +def AMAND_DB_W : AM_3R<0b00111000011010110, "amand_db.w">; +def AMAND_DB_D : AM_3R<0b00111000011010111, "amand_db.d">; +def AMOR_DB_W : AM_3R<0b00111000011011000, "amor_db.w">; +def AMOR_DB_D : AM_3R<0b00111000011011001, "amor_db.d">; +def AMXOR_DB_W : AM_3R<0b00111000011011010, "amxor_db.w">; +def AMXOR_DB_D : AM_3R<0b00111000011011011, "amxor_db.d">; +def AMMAX_DB_W : AM_3R<0b00111000011011100, "ammax_db.w">; +def 
AMMAX_DB_D : AM_3R<0b00111000011011101, "ammax_db.d">;
+def AMMIN_DB_W : AM_3R<0b00111000011011110, "ammin_db.w">;
+def AMMIN_DB_D : AM_3R<0b00111000011011111, "ammin_db.d">;
+def AMMAX_DB_WU : AM_3R<0b00111000011100000, "ammax_db.wu">;
+def AMMAX_DB_DU : AM_3R<0b00111000011100001, "ammax_db.du">;
+def AMMIN_DB_WU : AM_3R<0b00111000011100010, "ammin_db.wu">;
+def AMMIN_DB_DU : AM_3R<0b00111000011100011, "ammin_db.du">;
+def LL_D : LLBase<0b00100010, "ll.d">;
+def SC_D : SCBase<0b00100011, "sc.d">;
+
+// CRC Check Instructions
+def CRC_W_B_W : ALU_3R<0b00000000001001000, "crc.w.b.w">;
+def CRC_W_H_W : ALU_3R<0b00000000001001001, "crc.w.h.w">;
+def CRC_W_W_W : ALU_3R<0b00000000001001010, "crc.w.w.w">;
+def CRC_W_D_W : ALU_3R<0b00000000001001011, "crc.w.d.w">;
+def CRCC_W_B_W : ALU_3R<0b00000000001001100, "crcc.w.b.w">;
+def CRCC_W_H_W : ALU_3R<0b00000000001001101, "crcc.w.h.w">;
+def CRCC_W_W_W : ALU_3R<0b00000000001001110, "crcc.w.w.w">;
+def CRCC_W_D_W : ALU_3R<0b00000000001001111, "crcc.w.d.w">;
+
+// Other Miscellaneous Instructions for 64-bits
+def ASRTLE_D : FmtASRT<0b00000000000000010, (outs), (ins GPR:$rj, GPR:$rk),
+                       "asrtle.d", "$rj, $rk">;
+def ASRTGT_D : FmtASRT<0b00000000000000011, (outs), (ins GPR:$rj, GPR:$rk),
+                       "asrtgt.d", "$rj, $rk">;
+def RDTIME_D : RDTIME_2R<0b0000000000000000011010, "rdtime.d">;
+} // Predicates = [IsLA64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//
+// Naming convention: For 'generic' pattern classes, we use the naming
+// convention PatTy1Ty2.
+//===----------------------------------------------------------------------===//
+
+/// Generic pattern classes
+
+class PatGprGpr<SDPatternOperator OpNode, LAInst Inst>
+    : Pat<(OpNode GPR:$rj, GPR:$rk), (Inst GPR:$rj, GPR:$rk)>;
+class PatGprGpr_32<SDPatternOperator OpNode, LAInst Inst>
+    : Pat<(sext_inreg (OpNode GPR:$rj, GPR:$rk), i32), (Inst GPR:$rj, GPR:$rk)>;
+
+class PatGprImm<SDPatternOperator OpNode, LAInst Inst, Operand ImmOpnd>
+    : Pat<(OpNode GPR:$rj, ImmOpnd:$imm),
+          (Inst GPR:$rj, ImmOpnd:$imm)>;
+class PatGprImm_32<SDPatternOperator OpNode, LAInst Inst, Operand ImmOpnd>
+    : Pat<(sext_inreg (OpNode GPR:$rj, ImmOpnd:$imm), i32),
+          (Inst GPR:$rj, ImmOpnd:$imm)>;
+
+/// Simple arithmetic operations
+
+// Match both a plain shift and one where the shift amount is masked (this is
+// typically introduced when the legalizer promotes the shift amount and
+// zero-extends it). For LoongArch, the mask is unnecessary as shifts in the
+// base ISA only read the least significant 5 bits (LA32) or 6 bits (LA64).
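A standalone C++ illustration of the comment above, assuming LA64 sll.d semantics where hardware reads only the low 6 bits of the shift amount; this is why the shiftMaskGRLen/shiftMask32 ComplexPatterns defined next can match a shift whether or not an explicit mask is present:

#include <cstdint>
#include <cstdio>

// What sll.d amounts to on LA64: only the low 6 bits of the amount
// participate, so pre-masking the amount with 63 changes nothing.
static uint64_t sll_d(uint64_t v, uint64_t amt) { return v << (amt & 63); }

int main() {
  uint64_t v = 0x1, amt = 70; // 70 & 63 == 6
  std::printf("%d\n", sll_d(v, amt) == sll_d(v, amt & 63)); // always 1
  return 0;
}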
+def shiftMaskGRLen
+    : ComplexPattern<GRLenVT, 1, "selectShiftMaskGRLen", [], [], 0>;
+def shiftMask32 : ComplexPattern<i64, 1, "selectShiftMask32", [], [], 0>;
+
+class shiftop<SDPatternOperator operator>
+    : PatFrag<(ops node:$val, node:$count),
+              (operator node:$val, (GRLenVT (shiftMaskGRLen node:$count)))>;
+class shiftopw<SDPatternOperator operator>
+    : PatFrag<(ops node:$val, node:$count),
+              (operator node:$val, (i64 (shiftMask32 node:$count)))>;
+
+let Predicates = [IsLA32] in {
+def : PatGprGpr<add, ADD_W>;
+def : PatGprImm<add, ADDI_W, simm12>;
+def : PatGprGpr<sub, SUB_W>;
+} // Predicates = [IsLA32]
+
+let Predicates = [IsLA64] in {
+def : PatGprGpr<add, ADD_D>;
+def : PatGprGpr_32<add, ADD_W>;
+def : PatGprImm<add, ADDI_D, simm12>;
+def : PatGprImm_32<add, ADDI_W, simm12>;
+def : PatGprGpr<sub, SUB_D>;
+def : PatGprGpr_32<sub, SUB_W>;
+} // Predicates = [IsLA64]
+
+def : PatGprGpr<and, AND>;
+def : PatGprImm<and, ANDI, uimm12>;
+def : PatGprGpr<or, OR>;
+def : PatGprImm<or, ORI, uimm12>;
+def : PatGprGpr<xor, XOR>;
+def : PatGprImm<xor, XORI, uimm12>;
+
+/// Shift
+
+let Predicates = [IsLA32] in {
+def : PatGprGpr<shiftop<shl>, SLL_W>;
+def : PatGprGpr<shiftop<sra>, SRA_W>;
+def : PatGprGpr<shiftop<srl>, SRL_W>;
+def : PatGprImm<shl, SLLI_W, uimm5>;
+def : PatGprImm<sra, SRAI_W, uimm5>;
+def : PatGprImm<srl, SRLI_W, uimm5>;
+} // Predicates = [IsLA32]
+
+let Predicates = [IsLA64] in {
+def : PatGprGpr<shiftopw<loongarch_sll_w>, SLL_W>;
+def : PatGprGpr<shiftopw<loongarch_sra_w>, SRA_W>;
+def : PatGprGpr<shiftopw<loongarch_srl_w>, SRL_W>;
+def : PatGprGpr<shiftop<shl>, SLL_D>;
+def : PatGprGpr<shiftop<sra>, SRA_D>;
+def : PatGprGpr<shiftop<srl>, SRL_D>;
+def : PatGprImm<shl, SLLI_D, uimm6>;
+def : PatGprImm<sra, SRAI_D, uimm6>;
+def : PatGprImm<srl, SRLI_D, uimm6>;
+} // Predicates = [IsLA64]
+
+/// sext and zext
+
+def : Pat<(sext_inreg GPR:$rj, i8), (EXT_W_B GPR:$rj)>;
+def : Pat<(sext_inreg GPR:$rj, i16), (EXT_W_H GPR:$rj)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(sext_inreg GPR:$rj, i32), (ADDI_W GPR:$rj, 0)>;
+} // Predicates = [IsLA64]
+
+/// Setcc
+
+def : PatGprGpr<setlt, SLT>;
+def : PatGprImm<setlt, SLTI, simm12>;
+def : PatGprGpr<setult, SLTU>;
+def : PatGprImm<setult, SLTUI, simm12>;
+
+// Define pattern expansions for setcc operations that aren't directly
+// handled by a LoongArch instruction.
+def : Pat<(seteq GPR:$rj, 0), (SLTUI GPR:$rj, 1)>;
+def : Pat<(seteq GPR:$rj, GPR:$rk), (SLTUI (XOR GPR:$rj, GPR:$rk), 1)>;
+let Predicates = [IsLA32] in {
+def : Pat<(seteq GPR:$rj, simm12_plus1:$imm12),
+          (SLTUI (ADDI_W GPR:$rj, (NegImm simm12_plus1:$imm12)), 1)>;
+} // Predicates = [IsLA32]
+let Predicates = [IsLA64] in {
+def : Pat<(seteq GPR:$rj, simm12_plus1:$imm12),
+          (SLTUI (ADDI_D GPR:$rj, (NegImm simm12_plus1:$imm12)), 1)>;
+} // Predicates = [IsLA64]
+def : Pat<(setne GPR:$rj, 0), (SLTU R0, GPR:$rj)>;
+def : Pat<(setne GPR:$rj, GPR:$rk), (SLTU R0, (XOR GPR:$rj, GPR:$rk))>;
+let Predicates = [IsLA32] in {
+def : Pat<(setne GPR:$rj, simm12_plus1:$imm12),
+          (SLTU R0, (ADDI_W GPR:$rj, (NegImm simm12_plus1:$imm12)))>;
+} // Predicates = [IsLA32]
+let Predicates = [IsLA64] in {
+def : Pat<(setne GPR:$rj, simm12_plus1:$imm12),
+          (SLTU R0, (ADDI_D GPR:$rj, (NegImm simm12_plus1:$imm12)))>;
+} // Predicates = [IsLA64]
+def : Pat<(setugt GPR:$rj, GPR:$rk), (SLTU GPR:$rk, GPR:$rj)>;
+def : Pat<(setuge GPR:$rj, GPR:$rk), (XORI (SLTU GPR:$rj, GPR:$rk), 1)>;
+def : Pat<(setule GPR:$rj, GPR:$rk), (XORI (SLTU GPR:$rk, GPR:$rj), 1)>;
+def : Pat<(setgt GPR:$rj, GPR:$rk), (SLT GPR:$rk, GPR:$rj)>;
+def : Pat<(setge GPR:$rj, GPR:$rk), (XORI (SLT GPR:$rj, GPR:$rk), 1)>;
+def : Pat<(setle GPR:$rj, GPR:$rk), (XORI (SLT GPR:$rk, GPR:$rj), 1)>;
+
+/// Select
+
+def : Pat<(select GPR:$cond, GPR:$t, GPR:$f),
+          (OR (MASKEQZ GPR:$t, GPR:$cond), (MASKNEZ GPR:$f, GPR:$cond))>;
+
+/// Branches and jumps
+
+let isBarrier = 1, isReturn = 1, isTerminator = 1 in
+def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
+                PseudoInstExpansion<(JIRL R0, R1, 0)>;
+
+/// BSTRPICK
+
+let Predicates = [IsLA32] in
+def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
+          (BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
+
+let Predicates = [IsLA64] in
+def : Pat<(loongarch_bstrpick
GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + (BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>; + +//===----------------------------------------------------------------------===// +// Assembler Pseudo Instructions +//===----------------------------------------------------------------------===// + +def : InstAlias<"nop", (ANDI R0, R0, 0)>; +def : InstAlias<"move $dst, $src", (OR GPR:$dst, GPR:$src, R0)>; + +//===----------------------------------------------------------------------===// +// Basic Floating-Point Instructions +//===----------------------------------------------------------------------===// + +include "LoongArchFloat32InstrInfo.td" +include "LoongArchFloat64InstrInfo.td" + +//===----------------------------------------------------------------------===// +// Privilege Instructions +//===----------------------------------------------------------------------===// + +// CSR Access Instructions +def CSRRD : FmtCSR<0b0000010000000, (outs GPR:$rd), (ins uimm14:$csr_num), + "csrrd", "$rd, $csr_num">; +let Constraints = "$rd = $dst" in { +def CSRWR : FmtCSR<0b0000010000001, (outs GPR:$dst), + (ins GPR:$rd, uimm14:$csr_num), "csrwr", "$rd, $csr_num">; +def CSRXCHG : FmtCSRXCHG<0b00000100, (outs GPR:$dst), + (ins GPR:$rd, GPR:$rj, uimm14:$csr_num), + "csrxchg", "$rd, $rj, $csr_num">; +} // Constraints = "$rd = $dst" + +// IOCSR Access Instructions +def IOCSRRD_B : IOCSRRD<0b0000011001001000000000, "iocsrrd.b">; +def IOCSRRD_H : IOCSRRD<0b0000011001001000000001, "iocsrrd.h">; +def IOCSRRD_W : IOCSRRD<0b0000011001001000000010, "iocsrrd.w">; +def IOCSRWR_B : IOCSRWR<0b0000011001001000000100, "iocsrwr.b">; +def IOCSRWR_H : IOCSRWR<0b0000011001001000000101, "iocsrwr.h">; +def IOCSRWR_W : IOCSRWR<0b0000011001001000000110, "iocsrwr.w">; +let Predicates = [IsLA64] in { +def IOCSRRD_D : IOCSRRD<0b0000011001001000000011, "iocsrrd.d">; +def IOCSRWR_D : IOCSRWR<0b0000011001001000000111, "iocsrwr.d">; +} // Predicates = [IsLA64] + +// Cache Maintenance Instructions +def CACOP : FmtCACOP<(outs), (ins uimm5:$op, GPR:$rj, simm12:$imm12), "cacop", + "$op, $rj, $imm12">; + +// TLB Maintenance Instructions +def TLBSRCH : FmtI32<0b00000110010010000010100000000000, "tlbsrch">; +def TLBRD : FmtI32<0b00000110010010000010110000000000, "tlbrd">; +def TLBWR : FmtI32<0b00000110010010000011000000000000, "tlbwr">; +def TLBFILL : FmtI32<0b00000110010010000011010000000000, "tlbfill">; +def TLBCLR : FmtI32<0b00000110010010000010000000000000, "tlbclr">; +def TLBFLUSH : FmtI32<0b00000110010010000010010000000000, "tlbflush">; +def INVTLB : FmtINVTLB<(outs), (ins GPR:$rk, GPR:$rj, uimm5:$op), "invtlb", + "$op, $rj, $rk">; + +// Software Page Walking Instructions +def LDDIR : Fmt2RI8<0b00000110010000, (outs GPR:$rd), + (ins GPR:$rj, uimm8:$imm8), "lddir", "$rd, $rj, $imm8">; +def LDPTE : FmtLDPTE<(outs), (ins GPR:$rj, uimm8:$seq), "ldpte", "$rj, $seq">; + + +// Other Miscellaneous Instructions +def ERTN : FmtI32<0b00000110010010000011100000000000, "ertn">; +def DBCL : MISC_I15<0b00000000001010101, "dbcl">; +def IDLE : MISC_I15<0b00000110010010001, "idle">; diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp new file mode 100644 index 000000000000..7416c93b4d05 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -0,0 +1,66 @@ +//=- LoongArchMCInstLower.cpp - Convert LoongArch MachineInstr to an MCInst -=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower LoongArch MachineInstrs to their +// corresponding MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "LoongArch.h" +#include "LoongArchSubtarget.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &MCOp, + const AsmPrinter &AP) { + switch (MO.getType()) { + default: + report_fatal_error( + "lowerLoongArchMachineOperandToMCOperand: unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) + return false; + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs. + return false; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + // TODO: lower special operands + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_JumpTableIndex: + break; + } + return true; +} + +bool llvm::lowerLoongArchMachineInstrToMCInst(const MachineInstr *MI, + MCInst &OutMI, AsmPrinter &AP) { + OutMI.setOpcode(MI->getOpcode()); + + for (const MachineOperand &MO : MI->operands()) { + MCOperand MCOp; + if (lowerLoongArchMachineOperandToMCOperand(MO, MCOp, AP)) + OutMI.addOperand(MCOp); + } + return false; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h new file mode 100644 index 000000000000..d4a6c884bc9d --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -0,0 +1,57 @@ +//=- LoongArchMachineFunctionInfo.h - LoongArch machine function info -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares LoongArch-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H + +#include "LoongArchSubtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// LoongArchMachineFunctionInfo - This class is derived from +/// MachineFunctionInfo and contains private LoongArch-specific information for +/// each MachineFunction. 
+class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// FrameIndex for start of varargs area
+  int VarArgsFrameIndex = 0;
+  /// Size of the save area used for varargs
+  int VarArgsSaveSize = 0;
+
+  /// Size of stack frame to save callee saved registers
+  unsigned CalleeSavedStackSize = 0;
+
+public:
+  LoongArchMachineFunctionInfo(const MachineFunction &MF) {}
+
+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override {
+    return DestMF.cloneInfo<LoongArchMachineFunctionInfo>(*this);
+  }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  unsigned getVarArgsSaveSize() const { return VarArgsSaveSize; }
+  void setVarArgsSaveSize(int Size) { VarArgsSaveSize = Size; }
+
+  unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+  void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H
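A hypothetical usage sketch (an editor's illustration, not from this patch, and only compilable inside the LLVM tree): frame lowering code would pull this object off the MachineFunction and query the saved sizes. The helper name frameBytesForCSRs is invented for illustration.

#include "LoongArchMachineFunctionInfo.h"
using namespace llvm;

// Sum the callee-saved and varargs save areas recorded on the function.
static unsigned frameBytesForCSRs(const MachineFunction &MF) {
  auto *LAFI = MF.getInfo<LoongArchMachineFunctionInfo>();
  return LAFI->getCalleeSavedStackSize() + LAFI->getVarArgsSaveSize();
}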
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
new file mode 100644
index 000000000000..b9bae8e56304
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
@@ -0,0 +1,115 @@
+//===- LoongArchRegisterInfo.cpp - LoongArch Register Information -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the LoongArch implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchRegisterInfo.h"
+#include "LoongArch.h"
+#include "LoongArchSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "LoongArchGenRegisterInfo.inc"
+
+LoongArchRegisterInfo::LoongArchRegisterInfo(unsigned HwMode)
+    : LoongArchGenRegisterInfo(LoongArch::R1, /*DwarfFlavour*/ 0,
+                               /*EHFlavor*/ 0,
+                               /*PC*/ 0, HwMode) {}
+
+const MCPhysReg *
+LoongArchRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  auto &Subtarget = MF->getSubtarget<LoongArchSubtarget>();
+
+  switch (Subtarget.getTargetABI()) {
+  default:
+    llvm_unreachable("Unrecognized ABI");
+  case LoongArchABI::ABI_ILP32S:
+  case LoongArchABI::ABI_LP64S:
+    return CSR_ILP32S_LP64S_SaveList;
+  case LoongArchABI::ABI_ILP32F:
+  case LoongArchABI::ABI_LP64F:
+    return CSR_ILP32F_LP64F_SaveList;
+  case LoongArchABI::ABI_ILP32D:
+  case LoongArchABI::ABI_LP64D:
+    return CSR_ILP32D_LP64D_SaveList;
+  }
+}
+
+const uint32_t *
+LoongArchRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                            CallingConv::ID CC) const {
+  auto &Subtarget = MF.getSubtarget<LoongArchSubtarget>();
+
+  switch (Subtarget.getTargetABI()) {
+  default:
+    llvm_unreachable("Unrecognized ABI");
+  case LoongArchABI::ABI_ILP32S:
+  case LoongArchABI::ABI_LP64S:
+    return CSR_ILP32S_LP64S_RegMask;
+  case LoongArchABI::ABI_ILP32F:
+  case LoongArchABI::ABI_LP64F:
+    return CSR_ILP32F_LP64F_RegMask;
+  case LoongArchABI::ABI_ILP32D:
+  case LoongArchABI::ABI_LP64D:
+    return CSR_ILP32D_LP64D_RegMask;
+  }
+}
+
+const uint32_t *LoongArchRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector
+LoongArchRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const LoongArchFrameLowering *TFI = getFrameLowering(MF);
+  BitVector Reserved(getNumRegs());
+
+  // Use markSuperRegs to ensure any register aliases are also reserved
+  markSuperRegs(Reserved, LoongArch::R0);  // zero
+  markSuperRegs(Reserved, LoongArch::R2);  // tp
+  markSuperRegs(Reserved, LoongArch::R3);  // sp
+  markSuperRegs(Reserved, LoongArch::R21); // non-allocatable
+  if (TFI->hasFP(MF))
+    markSuperRegs(Reserved, LoongArch::R22); // fp
+  // Reserve the base register if we need to realign the stack and allocate
+  // variable-sized objects at runtime.
+  if (TFI->hasBP(MF))
+    markSuperRegs(Reserved, LoongArchABI::getBPReg()); // bp
+
+  assert(checkAllSuperRegsMarked(Reserved));
+  return Reserved;
+}
+
+bool LoongArchRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+  return PhysReg == LoongArch::R0;
+}
+
+Register
+LoongArchRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ?
LoongArch::R22 : LoongArch::R3; +} + +void LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); + // TODO: Implement this when we have function calls +} diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h new file mode 100644 index 000000000000..02c9156e2b87 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -0,0 +1,50 @@ +//= LoongArchRegisterInfo.h - LoongArch Register Information Impl -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the LoongArch implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHREGISTERINFO_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHREGISTERINFO_H + +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "LoongArchGenRegisterInfo.inc" + +namespace llvm { + +struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo { + + LoongArchRegisterInfo(unsigned HwMode); + + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + const uint32_t *getNoPreservedMask() const override; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isConstantPhysReg(MCRegister PhysReg) const override; + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override { + return &LoongArch::GPRRegClass; + } + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + Register getFrameRegister(const MachineFunction &MF) const override; +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHREGISTERINFO_H diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td new file mode 100644 index 000000000000..2d5ad99f6156 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td @@ -0,0 +1,161 @@ +//===-- LoongArchRegisterInfo.td - LoongArch Register defs -*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the LoongArch register files
+//===----------------------------------------------------------------------===//
+
+let Namespace = "LoongArch" in {
+class LoongArchReg<bits<16> Enc, string n, list<string> alt = []>
+    : Register<n> {
+  let HWEncoding = Enc;
+  let AltNames = alt;
+}
+
+class LoongArchReg32<bits<16> Enc, string n, list<string> alt = []>
+    : Register<n> {
+  let HWEncoding = Enc;
+  let AltNames = alt;
+}
+
+def sub_32 : SubRegIndex<32>;
+class LoongArchReg64<LoongArchReg32 subreg>
+    : Register<""> {
+  let HWEncoding = subreg.HWEncoding;
+  let SubRegs = [subreg];
+  let SubRegIndices = [sub_32];
+  let AsmName = subreg.AsmName;
+  let AltNames = subreg.AltNames;
+}
+
+let FallbackRegAltNameIndex = NoRegAltName in
+def RegAliasName : RegAltNameIndex;
+} // Namespace = "LoongArch"
+
+// Integer registers
+
+let RegAltNameIndices = [RegAliasName] in {
+  def R0 : LoongArchReg<0, "r0", ["zero"]>, DwarfRegNum<[0]>;
+  def R1 : LoongArchReg<1, "r1", ["ra"]>, DwarfRegNum<[1]>;
+  def R2 : LoongArchReg<2, "r2", ["tp"]>, DwarfRegNum<[2]>;
+  def R3 : LoongArchReg<3, "r3", ["sp"]>, DwarfRegNum<[3]>;
+  def R4 : LoongArchReg<4, "r4", ["a0"]>, DwarfRegNum<[4]>;
+  def R5 : LoongArchReg<5, "r5", ["a1"]>, DwarfRegNum<[5]>;
+  def R6 : LoongArchReg<6, "r6", ["a2"]>, DwarfRegNum<[6]>;
+  def R7 : LoongArchReg<7, "r7", ["a3"]>, DwarfRegNum<[7]>;
+  def R8 : LoongArchReg<8, "r8", ["a4"]>, DwarfRegNum<[8]>;
+  def R9 : LoongArchReg<9, "r9", ["a5"]>, DwarfRegNum<[9]>;
+  def R10 : LoongArchReg<10, "r10", ["a6"]>, DwarfRegNum<[10]>;
+  def R11 : LoongArchReg<11, "r11", ["a7"]>, DwarfRegNum<[11]>;
+  def R12 : LoongArchReg<12, "r12", ["t0"]>, DwarfRegNum<[12]>;
+  def R13 : LoongArchReg<13, "r13", ["t1"]>, DwarfRegNum<[13]>;
+  def R14 : LoongArchReg<14, "r14", ["t2"]>, DwarfRegNum<[14]>;
+  def R15 : LoongArchReg<15, "r15", ["t3"]>, DwarfRegNum<[15]>;
+  def R16 : LoongArchReg<16, "r16", ["t4"]>, DwarfRegNum<[16]>;
+  def R17 : LoongArchReg<17, "r17", ["t5"]>, DwarfRegNum<[17]>;
+  def R18 : LoongArchReg<18, "r18", ["t6"]>, DwarfRegNum<[18]>;
+  def R19 : LoongArchReg<19, "r19", ["t7"]>, DwarfRegNum<[19]>;
+  def R20 : LoongArchReg<20, "r20", ["t8"]>, DwarfRegNum<[20]>;
+  def R21 : LoongArchReg<21, "r21", [""]>, DwarfRegNum<[21]>;
+  def R22 : LoongArchReg<22, "r22", ["fp", "s9"]>, DwarfRegNum<[22]>;
+  def R23 : LoongArchReg<23, "r23", ["s0"]>, DwarfRegNum<[23]>;
+  def R24 : LoongArchReg<24, "r24", ["s1"]>, DwarfRegNum<[24]>;
+  def R25 : LoongArchReg<25, "r25", ["s2"]>, DwarfRegNum<[25]>;
+  def R26 : LoongArchReg<26, "r26", ["s3"]>, DwarfRegNum<[26]>;
+  def R27 : LoongArchReg<27, "r27", ["s4"]>, DwarfRegNum<[27]>;
+  def R28 : LoongArchReg<28, "r28", ["s5"]>, DwarfRegNum<[28]>;
+  def R29 : LoongArchReg<29, "r29", ["s6"]>, DwarfRegNum<[29]>;
+  def R30 : LoongArchReg<30, "r30", ["s7"]>, DwarfRegNum<[30]>;
+  def R31 : LoongArchReg<31, "r31", ["s8"]>, DwarfRegNum<[31]>;
+} // RegAltNameIndices = [RegAliasName]
+
+def GRLenVT : ValueTypeByHwMode<[LA32, LA64],
+                                [i32, i64]>;
+def GRLenRI : RegInfoByHwMode<
+    [LA32,              LA64],
+    [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
+
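GRLenVT and GRLenRI make the value type and register width a function of the HwMode, so a single GPR class below serves both LA32 and LA64. Roughly, in C++ terms (an illustrative sketch only, not an LLVM API):

#include <cstdio>

// What RegInfoByHwMode selects per hardware mode: size, spill size and
// spill alignment in bits.
struct RegInfo { unsigned Size, SpillSize, SpillAlign; };
static RegInfo grInfoFor(bool IsLA64) {
  return IsLA64 ? RegInfo{64, 64, 64} : RegInfo{32, 32, 32};
}

int main() {
  std::printf("LA64 GPR spill size: %u bits\n", grInfoFor(true).SpillSize);
  return 0;
}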
+// The order of registers represents the preferred allocation sequence.
+// Registers are listed in the order caller-save, callee-save, specials.
+def GPR : RegisterClass<"LoongArch", [GRLenVT], 32, (add
+    // Argument registers (a0...a7)
+    (sequence "R%u", 4, 11),
+    // Temporary registers (t0...t8)
+    (sequence "R%u", 12, 20),
+    // Static register (s9/fp, s0...s8)
+    (sequence "R%u", 22, 31),
+    // Specials (r0, ra, tp, sp)
+    (sequence "R%u", 0, 3),
+    // Reserved (Non-allocatable)
+    R21
+  )> {
+  let RegInfos = GRLenRI;
+}
+
+// Floating point registers
+
+let RegAltNameIndices = [RegAliasName] in {
+  def F0 : LoongArchReg32<0, "f0", ["fa0"]>, DwarfRegNum<[32]>;
+  def F1 : LoongArchReg32<1, "f1", ["fa1"]>, DwarfRegNum<[33]>;
+  def F2 : LoongArchReg32<2, "f2", ["fa2"]>, DwarfRegNum<[34]>;
+  def F3 : LoongArchReg32<3, "f3", ["fa3"]>, DwarfRegNum<[35]>;
+  def F4 : LoongArchReg32<4, "f4", ["fa4"]>, DwarfRegNum<[36]>;
+  def F5 : LoongArchReg32<5, "f5", ["fa5"]>, DwarfRegNum<[37]>;
+  def F6 : LoongArchReg32<6, "f6", ["fa6"]>, DwarfRegNum<[38]>;
+  def F7 : LoongArchReg32<7, "f7", ["fa7"]>, DwarfRegNum<[39]>;
+  def F8 : LoongArchReg32<8, "f8", ["ft0"]>, DwarfRegNum<[40]>;
+  def F9 : LoongArchReg32<9, "f9", ["ft1"]>, DwarfRegNum<[41]>;
+  def F10 : LoongArchReg32<10,"f10", ["ft2"]>, DwarfRegNum<[42]>;
+  def F11 : LoongArchReg32<11,"f11", ["ft3"]>, DwarfRegNum<[43]>;
+  def F12 : LoongArchReg32<12,"f12", ["ft4"]>, DwarfRegNum<[44]>;
+  def F13 : LoongArchReg32<13,"f13", ["ft5"]>, DwarfRegNum<[45]>;
+  def F14 : LoongArchReg32<14,"f14", ["ft6"]>, DwarfRegNum<[46]>;
+  def F15 : LoongArchReg32<15,"f15", ["ft7"]>, DwarfRegNum<[47]>;
+  def F16 : LoongArchReg32<16,"f16", ["ft8"]>, DwarfRegNum<[48]>;
+  def F17 : LoongArchReg32<17,"f17", ["ft9"]>, DwarfRegNum<[49]>;
+  def F18 : LoongArchReg32<18,"f18", ["ft10"]>, DwarfRegNum<[50]>;
+  def F19 : LoongArchReg32<19,"f19", ["ft11"]>, DwarfRegNum<[51]>;
+  def F20 : LoongArchReg32<20,"f20", ["ft12"]>, DwarfRegNum<[52]>;
+  def F21 : LoongArchReg32<21,"f21", ["ft13"]>, DwarfRegNum<[53]>;
+  def F22 : LoongArchReg32<22,"f22", ["ft14"]>, DwarfRegNum<[54]>;
+  def F23 : LoongArchReg32<23,"f23", ["ft15"]>, DwarfRegNum<[55]>;
+  def F24 : LoongArchReg32<24,"f24", ["fs0"]>, DwarfRegNum<[56]>;
+  def F25 : LoongArchReg32<25,"f25", ["fs1"]>, DwarfRegNum<[57]>;
+  def F26 : LoongArchReg32<26,"f26", ["fs2"]>, DwarfRegNum<[58]>;
+  def F27 : LoongArchReg32<27,"f27", ["fs3"]>, DwarfRegNum<[59]>;
+  def F28 : LoongArchReg32<28,"f28", ["fs4"]>, DwarfRegNum<[60]>;
+  def F29 : LoongArchReg32<29,"f29", ["fs5"]>, DwarfRegNum<[61]>;
+  def F30 : LoongArchReg32<30,"f30", ["fs6"]>, DwarfRegNum<[62]>;
+  def F31 : LoongArchReg32<31,"f31", ["fs7"]>, DwarfRegNum<[63]>;
+
+  foreach I = 0-31 in {
+    def F#I#_64 : LoongArchReg64<!cast<LoongArchReg32>("F"#I)>,
+                  DwarfRegNum<[!add(I, 32)]>;
+  }
+}
+
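The foreach above pairs each 32-bit F register with a 64-bit wrapper that shares its encoding through the sub_32 subregister index, and both views share one DWARF number (32 + i). A tiny C++ sketch of that numbering, for illustration only:

#include <cstdio>

int main() {
  // GPR r<i> -> DWARF i; FPR f<i> (and its 64-bit view) -> DWARF 32 + i.
  for (int I = 0; I < 4; ++I)
    std::printf("F%d and F%d_64 -> DWARF %d\n", I, I, 32 + I);
  return 0;
}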
+// The order of registers represents the preferred allocation sequence.
+def FPR32 : RegisterClass<"LoongArch", [f32], 32, (sequence "F%u", 0, 31)>;
+def FPR64 : RegisterClass<"LoongArch", [f64], 64, (sequence "F%u_64", 0, 31)>;
+
+// Condition flag registers
+
+foreach I = 0-7 in
+def FCC#I : LoongArchReg<I, "fcc"#I>;
+
+def CFR : RegisterClass<"LoongArch", [GRLenVT], 32, (sequence "FCC%u", 0, 7)> {
+  let RegInfos = GRLenRI;
+}
+
+// Control and status registers
+
+foreach I = 0-3 in
+def FCSR#I : LoongArchReg<I, "fcsr"#I>;
+
+let isAllocatable = false in
+def FCSR : RegisterClass<"LoongArch", [i32], 32, (sequence "FCSR%u", 0, 3)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
new file mode 100644
index 000000000000..ff84e7c8cc1f
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
@@ -0,0 +1,54 @@
+//===-- LoongArchSubtarget.cpp - LoongArch Subtarget Information -*- C++ -*--=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LoongArch specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchSubtarget.h"
+#include "LoongArchFrameLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loongarch-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "LoongArchGenSubtargetInfo.inc"
+
+void LoongArchSubtarget::anchor() {}
+
+LoongArchSubtarget &LoongArchSubtarget::initializeSubtargetDependencies(
+    const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
+    StringRef ABIName) {
+  bool Is64Bit = TT.isArch64Bit();
+  if (CPU.empty())
+    CPU = Is64Bit ? "generic-la64" : "generic-la32";
+
+  if (TuneCPU.empty())
+    TuneCPU = CPU;
+
+  ParseSubtargetFeatures(CPU, TuneCPU, FS);
+  if (Is64Bit) {
+    GRLenVT = MVT::i64;
+    GRLen = 64;
+  }
+
+  // TODO: ILP32{S,F} LP64{S,F}
+  TargetABI = Is64Bit ? LoongArchABI::ABI_LP64D : LoongArchABI::ABI_ILP32D;
+  return *this;
+}
+
+LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU,
+                                       StringRef TuneCPU, StringRef FS,
+                                       StringRef ABIName,
+                                       const TargetMachine &TM)
+    : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS),
+      FrameLowering(
+          initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)),
+      InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
new file mode 100644
index 000000000000..95c2c676cc3c
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -0,0 +1,89 @@
+//===- LoongArchSubtarget.h - Define Subtarget for the LoongArch -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LoongArch specific subclass of TargetSubtargetInfo.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHSUBTARGET_H +#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHSUBTARGET_H + +#include "LoongArchFrameLowering.h" +#include "LoongArchISelLowering.h" +#include "LoongArchInstrInfo.h" +#include "LoongArchRegisterInfo.h" +#include "MCTargetDesc/LoongArchBaseInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +#define GET_SUBTARGETINFO_HEADER +#include "LoongArchGenSubtargetInfo.inc" + +namespace llvm { +class StringRef; + +class LoongArchSubtarget : public LoongArchGenSubtargetInfo { + virtual void anchor(); + bool HasLA64 = false; + bool HasBasicF = false; + bool HasBasicD = false; + bool HasExtLSX = false; + bool HasExtLASX = false; + bool HasExtLVZ = false; + bool HasExtLBT = false; + unsigned GRLen = 32; + MVT GRLenVT = MVT::i32; + LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; + LoongArchFrameLowering FrameLowering; + LoongArchInstrInfo InstrInfo; + LoongArchRegisterInfo RegInfo; + LoongArchTargetLowering TLInfo; + + /// Initializes using the passed in CPU and feature strings so that we can + /// use initializer lists for subtarget initialization. + LoongArchSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef CPU, + StringRef TuneCPU, + StringRef FS, + StringRef ABIName); + +public: + // Initializes the data members to match that of the specified triple. + LoongArchSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, + StringRef FS, StringRef ABIName, const TargetMachine &TM); + + // Parses features string setting specified subtarget options. The + // definition of this function is auto-generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + + const LoongArchFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const LoongArchRegisterInfo *getRegisterInfo() const override { + return &RegInfo; + } + const LoongArchTargetLowering *getTargetLowering() const override { + return &TLInfo; + } + bool is64Bit() const { return HasLA64; } + bool hasBasicF() const { return HasBasicF; } + bool hasBasicD() const { return HasBasicD; } + bool hasExtLSX() const { return HasExtLSX; } + bool hasExtLASX() const { return HasExtLASX; } + bool hasExtLVZ() const { return HasExtLVZ; } + bool hasExtLBT() const { return HasExtLBT; } + MVT getGRLenVT() const { return GRLenVT; } + unsigned getGRLen() const { return GRLen; } + LoongArchABI::ABI getTargetABI() const { return TargetABI; } +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHSUBTARGET_H diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp new file mode 100644 index 000000000000..3a1a46a9e624 --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -0,0 +1,118 @@ +//===-- LoongArchTargetMachine.cpp - Define TargetMachine for LoongArch ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements the info about LoongArch target spec. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchTargetMachine.h"
+#include "LoongArch.h"
+#include "MCTargetDesc/LoongArchBaseInfo.h"
+#include "TargetInfo/LoongArchTargetInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loongarch"
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() {
+  // Register the target.
+  RegisterTargetMachine<LoongArchTargetMachine> X(getTheLoongArch32Target());
+  RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target());
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+  if (TT.isArch64Bit())
+    return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
+  assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported");
+  return "e-m:e-p:32:32-i64:64-n32-S128";
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+LoongArchTargetMachine::LoongArchTargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool JIT)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(TT, RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
+      TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
+  initAsmInfo();
+}
+
+LoongArchTargetMachine::~LoongArchTargetMachine() = default;
+
+const LoongArchSubtarget *
+LoongArchTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute TuneAttr = F.getFnAttribute("tune-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU =
+      CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+  std::string TuneCPU =
+      TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
+  std::string FS =
+      FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+
+  std::string Key = CPU + TuneCPU + FS;
+  auto &I = SubtargetMap[Key];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
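+    // The cache key concatenates the CPU, tune CPU and feature strings, so
+    // each distinct combination gets exactly one LoongArchSubtarget.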
+    resetTargetOptions(F);
+    auto ABIName = Options.MCOptions.getABIName();
+    if (const MDString *ModuleTargetABI = dyn_cast_or_null<MDString>(
+            F.getParent()->getModuleFlag("target-abi"))) {
+      auto TargetABI = LoongArchABI::getTargetABI(ABIName);
+      if (TargetABI != LoongArchABI::ABI_Unknown &&
+          ModuleTargetABI->getString() != ABIName) {
+        report_fatal_error("-target-abi option != target-abi module flag");
+      }
+      ABIName = ModuleTargetABI->getString();
+    }
+    I = std::make_unique<LoongArchSubtarget>(TargetTriple, CPU, TuneCPU, FS,
+                                             ABIName, *this);
+  }
+  return I.get();
+}
+
+namespace {
+class LoongArchPassConfig : public TargetPassConfig {
+public:
+  LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  LoongArchTargetMachine &getLoongArchTargetMachine() const {
+    return getTM<LoongArchTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+};
+} // namespace
+
+TargetPassConfig *
+LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new LoongArchPassConfig(*this, PM);
+}
+
+bool LoongArchPassConfig::addInstSelector() {
+  addPass(createLoongArchISelDag(getLoongArchTargetMachine()));
+
+  return false;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h
new file mode 100644
index 000000000000..cbd872031a32
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h
@@ -0,0 +1,46 @@
+//=- LoongArchTargetMachine.h - Define TargetMachine for LoongArch -*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LoongArch specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LOONGARCH_LOONGARCHTARGETMACHINE_H
+#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHTARGETMACHINE_H
+
+#include "LoongArchSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class LoongArchTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<LoongArchSubtarget>> SubtargetMap;
+
+public:
+  LoongArchTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
+                         Optional<Reloc::Model> RM,
+                         Optional<CodeModel::Model> CM, CodeGenOpt::Level OL,
+                         bool JIT);
+  ~LoongArchTargetMachine() override;
+
+  const LoongArchSubtarget *getSubtargetImpl(const Function &F) const override;
+  const LoongArchSubtarget *getSubtargetImpl() const = delete;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHTARGETMACHINE_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
new file mode 100644
index 000000000000..94a068897f8c
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -0,0 +1,68 @@
+//===-- LoongArchAsmBackend.cpp - LoongArch Assembler Backend -*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LoongArchAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "loongarch-asmbackend"
+
+using namespace llvm;
+
+void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm,
+                                     const MCFixup &Fixup,
+                                     const MCValue &Target,
+                                     MutableArrayRef<char> Data, uint64_t Value,
+                                     bool IsResolved,
+                                     const MCSubtargetInfo *STI) const {
+  // TODO: Apply the Value for given Fixup into the provided data fragment.
+  return;
+}
+
+bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+                                                const MCFixup &Fixup,
+                                                const MCValue &Target) {
+  // TODO: Determine which relocations require special processing at link
+  // time.
+  return false;
+}
+
+bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+                                       const MCSubtargetInfo *STI) const {
+  // Check for byte count not multiple of instruction word size.
+  if (Count % 4 != 0)
+    return false;
+
+  // The nop on LoongArch is andi r0, r0, 0.
+  for (; Count >= 4; Count -= 4)
+    support::endian::write<uint32_t>(OS, 0x03400000, support::little);
+
+  return true;
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+LoongArchAsmBackend::createObjectTargetWriter() const {
+  return createLoongArchELFObjectWriter(OSABI, Is64Bit);
+}
+
+MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T,
+                                              const MCSubtargetInfo &STI,
+                                              const MCRegisterInfo &MRI,
+                                              const MCTargetOptions &Options) {
+  const Triple &TT = STI.getTargetTriple();
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  return new LoongArchAsmBackend(STI, OSABI, TT.isArch64Bit());
+}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
new file mode 100644
index 000000000000..77bbfb095747
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -0,0 +1,63 @@
+//===-- LoongArchAsmBackend.h - LoongArch Assembler Backend ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LoongArchAsmBackend class.
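+// Note that only nop padding is fully implemented at this stage; fixup
+// application and instruction relaxation are still stubs.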
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHASMBACKEND_H
+#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHASMBACKEND_H
+
+#include "MCTargetDesc/LoongArchBaseInfo.h"
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class LoongArchAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+  bool Is64Bit;
+
+public:
+  LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
+      : MCAsmBackend(support::little), OSABI(OSABI), Is64Bit(Is64Bit) {}
+  ~LoongArchAsmBackend() override {}
+
+  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                  const MCValue &Target, MutableArrayRef<char> Data,
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override;
+
+  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+                             const MCValue &Target) override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    // FIXME: Implement this when we define fixup kinds.
+    return 0;
+  }
+
+  void relaxInstruction(MCInst &Inst,
+                        const MCSubtargetInfo &STI) const override {}
+
+  bool writeNopData(raw_ostream &OS, uint64_t Count,
+                    const MCSubtargetInfo *STI) const override;
+
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHASMBACKEND_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
new file mode 100644
index 000000000000..f0c985883125
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -0,0 +1,40 @@
+//= LoongArchBaseInfo.cpp - Top level definitions for LoongArch MC -*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper functions for the LoongArch target useful for
+// the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+namespace LoongArchABI {
+
+ABI getTargetABI(StringRef ABIName) {
+  auto TargetABI = StringSwitch<ABI>(ABIName)
+                       .Case("ilp32s", ABI_ILP32S)
+                       .Case("ilp32f", ABI_ILP32F)
+                       .Case("ilp32d", ABI_ILP32D)
+                       .Case("lp64s", ABI_LP64S)
+                       .Case("lp64f", ABI_LP64F)
+                       .Case("lp64d", ABI_LP64D)
+                       .Default(ABI_Unknown);
+  return TargetABI;
+}
+
+// FIXME: other register?
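+// R31 ($s8) is what LoongArch code generation currently uses as the base
+// pointer when the stack has to be realigned.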
+MCRegister getBPReg() { return LoongArch::R31; }
+
+} // namespace LoongArchABI
+
+} // namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
new file mode 100644
index 000000000000..e26f22de0cbc
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -0,0 +1,44 @@
+//=- LoongArchBaseInfo.h - Top level definitions for LoongArch MC -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone enum definitions and helper function
+// definitions for the LoongArch target useful for the compiler back-end and
+// the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHBASEINFO_H
+#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHBASEINFO_H
+
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/SubtargetFeature.h"
+
+namespace llvm {
+
+namespace LoongArchABI {
+enum ABI {
+  ABI_ILP32S,
+  ABI_ILP32F,
+  ABI_ILP32D,
+  ABI_LP64S,
+  ABI_LP64F,
+  ABI_LP64D,
+  ABI_Unknown
+};
+
+ABI getTargetABI(StringRef ABIName);
+
+// Returns the register used to hold the stack pointer after realignment.
+MCRegister getBPReg();
+} // namespace LoongArchABI
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHBASEINFO_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
new file mode 100644
index 000000000000..95e1314f363a
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -0,0 +1,64 @@
+//===-- LoongArchELFObjectWriter.cpp - LoongArch ELF Writer ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class LoongArchELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit);
+
+  ~LoongArchELFObjectWriter() override;
+
+  // Return true if the given relocation must be with a symbol rather than
+  // section plus offset.
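+  // For now this conservatively answers "yes" for every relocation; see the
+  // unconditional "return true" below.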
+  bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                               unsigned Type) const override {
+    return true;
+  }
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // namespace
+
+LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit)
+    : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH,
+                              /*HasRelocationAddend*/ true) {}
+
+LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {}
+
+unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx,
+                                                const MCValue &Target,
+                                                const MCFixup &Fixup,
+                                                bool IsPCRel) const {
+  // Determine the type of the relocation.
+  unsigned Kind = Fixup.getTargetKind();
+
+  if (Kind >= FirstLiteralRelocationKind)
+    return Kind - FirstLiteralRelocationKind;
+
+  switch (Kind) {
+  // TODO: Implement this when we define fixup kinds.
+  default:
+    return ELF::R_LARCH_NONE;
+  }
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) {
+  return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit);
+}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
new file mode 100644
index 000000000000..66183868f468
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
@@ -0,0 +1,63 @@
+//===- LoongArchInstPrinter.cpp - Convert LoongArch MCInst to asm syntax --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a LoongArch MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchInstPrinter.h"
+#include "LoongArchBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loongarch-asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
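+// Defining PRINT_ALIAS_INSTR additionally pulls in the tblgen'erated
+// printAliasInstr() and printCustomAliasOperand() used below.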
+#define PRINT_ALIAS_INSTR +#include "LoongArchGenAsmWriter.inc" + +void LoongArchInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (!printAliasInstr(MI, Address, STI, O)) + printInstruction(MI, Address, STI, O); + printAnnotation(O, Annot); +} + +void LoongArchInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { + O << '$' << getRegisterName(RegNo); +} + +void LoongArchInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNo); + + if (MO.isReg()) { + printRegName(O, MO.getReg()); + return; + } + + if (MO.isImm()) { + O << MO.getImm(); + return; + } + + assert(MO.isExpr() && "Unknown operand kind in printOperand"); + MO.getExpr()->print(O, &MAI); +} + +const char *LoongArchInstPrinter::getRegisterName(unsigned RegNo) { + // Default print reg alias name + return getRegisterName(RegNo, LoongArch::RegAliasName); +} diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h new file mode 100644 index 000000000000..727fc6a3e1f3 --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h @@ -0,0 +1,49 @@ +//===-- LoongArchInstPrinter.h - Convert LoongArch MCInst to asm syntax ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a LoongArch MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHINSTPRINTER_H +#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHINSTPRINTER_H + +#include "MCTargetDesc/LoongArchMCTargetDesc.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class LoongArchInstPrinter : public MCInstPrinter { +public: + LoongArchInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &O) override; + void printRegName(raw_ostream &O, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
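+  // printAliasInstr() returns false when no alias pattern matches, in which
+  // case printInst() falls back to the plain printInstruction() form.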
+ std::pair getMnemonic(const MCInst *MI) override; + void printInstruction(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + static const char *getRegisterName(unsigned RegNo, unsigned AltIdx); + +private: + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHINSTPRINTER_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp new file mode 100644 index 000000000000..bc946db2f449 --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp @@ -0,0 +1,34 @@ +//===-- LoongArchMCAsmInfo.cpp - LoongArch Asm properties ------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the LoongArchMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "LoongArchMCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; + +void LoongArchMCAsmInfo::anchor() {} + +LoongArchMCAsmInfo::LoongArchMCAsmInfo(const Triple &TT) { + CodePointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4; + AlignmentIsInBytes = false; + Data8bitsDirective = "\t.byte\t"; + Data16bitsDirective = "\t.half\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.dword\t"; + ZeroDirective = "\t.space\t"; + CommentString = "#"; + SupportsDebugInformation = true; + DwarfRegNumForCFI = true; + ExceptionsType = ExceptionHandling::DwarfCFI; +} diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h new file mode 100644 index 000000000000..1cf8a2fdf8aa --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- LoongArchMCAsmInfo.h - LoongArch Asm Info --------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the LoongArchMCAsmInfo class. 
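+// (It sets the assembly-dialect defaults: data directives, the comment
+// string and the DWARF/CFI options used on ELF.)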
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H
+#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class LoongArchMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit LoongArchMCAsmInfo(const Triple &TargetTriple);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
new file mode 100644
index 000000000000..9c6a4f39b9ea
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -0,0 +1,127 @@
+//=- LoongArchMCCodeEmitter.cpp - Convert LoongArch code to machine code --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LoongArchMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/LoongArchBaseInfo.h"
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class LoongArchMCCodeEmitter : public MCCodeEmitter {
+  LoongArchMCCodeEmitter(const LoongArchMCCodeEmitter &) = delete;
+  void operator=(const LoongArchMCCodeEmitter &) = delete;
+  MCContext &Ctx;
+  MCInstrInfo const &MCII;
+
+public:
+  LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII)
+      : Ctx(ctx), MCII(MCII) {}
+
+  ~LoongArchMCCodeEmitter() override {}
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// TableGen'erated function for getting the binary encoding for an
+  /// instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of operand. If the machine operand requires
+  /// relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of an immediate operand specified by OpNo.
+  /// The value returned is the value of the immediate minus 1.
+  /// Note that this function is dedicated to specific immediate types,
+  /// e.g. uimm2_plus1.
+  unsigned getImmOpValueSub1(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of an immediate operand specified by OpNo.
+  /// The value returned is the value of the immediate shifted right
+  /// arithmetically by 2.
+  /// Note that this function is dedicated to specific immediate types,
+  /// e.g. simm14_lsl2, simm16_lsl2, simm21_lsl2 and simm26_lsl2.
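+  /// For example, a simm16_lsl2 operand holding a byte offset of 28 is
+  /// encoded as 28 >> 2 == 7.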
+ unsigned getImmOpValueAsr2(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; +}; +} // end anonymous namespace + +unsigned +LoongArchMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + if (MO.isReg()) + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); + + if (MO.isImm()) + return static_cast(MO.getImm()); + + llvm_unreachable("Unhandled expression!"); +} + +unsigned +LoongArchMCCodeEmitter::getImmOpValueSub1(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return MI.getOperand(OpNo).getImm() - 1; +} + +unsigned +LoongArchMCCodeEmitter::getImmOpValueAsr2(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Res = MI.getOperand(OpNo).getImm(); + assert((Res & 3) == 0 && "lowest 2 bits are non-zero"); + return Res >> 2; +} + +void LoongArchMCCodeEmitter::encodeInstruction( + const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + // Get byte count of instruction. + unsigned Size = Desc.getSize(); + + switch (Size) { + default: + llvm_unreachable("Unhandled encodeInstruction length!"); + case 4: { + uint32_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); + support::endian::write(OS, Bits, support::little); + break; + } + } +} + +MCCodeEmitter *llvm::createLoongArchMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx) { + return new LoongArchMCCodeEmitter(Ctx, MCII); +} + +#include "LoongArchGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp new file mode 100644 index 000000000000..c733c194e6a2 --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -0,0 +1,114 @@ +//===-- LoongArchMCTargetDesc.cpp - LoongArch Target Descriptions ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides LoongArch specific target descriptions. 
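+// That is, it registers the MC-layer pieces (register, instruction and
+// subtarget descriptions, asm info, code emitter, asm backend, instruction
+// printer and instruction analysis) with the target registry.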
+// +//===----------------------------------------------------------------------===// + +#include "LoongArchMCTargetDesc.h" +#include "LoongArchBaseInfo.h" +#include "LoongArchInstPrinter.h" +#include "LoongArchMCAsmInfo.h" +#include "TargetInfo/LoongArchTargetInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" + +#define GET_INSTRINFO_MC_DESC +#include "LoongArchGenInstrInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "LoongArchGenRegisterInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "LoongArchGenSubtargetInfo.inc" + +using namespace llvm; + +static MCRegisterInfo *createLoongArchMCRegisterInfo(const Triple &TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitLoongArchMCRegisterInfo(X, LoongArch::R1); + return X; +} + +static MCInstrInfo *createLoongArchMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitLoongArchMCInstrInfo(X); + return X; +} + +static MCSubtargetInfo * +createLoongArchMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + if (CPU.empty()) + CPU = TT.isArch64Bit() ? "la464" : "generic-la32"; + return createLoongArchMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); +} + +static MCAsmInfo *createLoongArchMCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TT, + const MCTargetOptions &Options) { + MCAsmInfo *MAI = new LoongArchMCAsmInfo(TT); + + // Initial state of the frame pointer is sp(r3). + MCRegister SP = MRI.getDwarfRegNum(LoongArch::R3, true); + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCInstPrinter *createLoongArchMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new LoongArchInstPrinter(MAI, MII, MRI); +} + +namespace { + +class LoongArchMCInstrAnalysis : public MCInstrAnalysis { +public: + explicit LoongArchMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + unsigned NumOps = Inst.getNumOperands(); + if (isBranch(Inst) || Inst.getOpcode() == LoongArch::BL) { + Target = Addr + Inst.getOperand(NumOps - 1).getImm(); + return true; + } + + return false; + } +}; + +} // end anonymous namespace + +static MCInstrAnalysis *createLoongArchInstrAnalysis(const MCInstrInfo *Info) { + return new LoongArchMCInstrAnalysis(Info); +} + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetMC() { + for (Target *T : {&getTheLoongArch32Target(), &getTheLoongArch64Target()}) { + TargetRegistry::RegisterMCRegInfo(*T, createLoongArchMCRegisterInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createLoongArchMCInstrInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createLoongArchMCSubtargetInfo); + TargetRegistry::RegisterMCAsmInfo(*T, createLoongArchMCAsmInfo); + TargetRegistry::RegisterMCCodeEmitter(*T, createLoongArchMCCodeEmitter); + TargetRegistry::RegisterMCAsmBackend(*T, createLoongArchAsmBackend); + TargetRegistry::RegisterMCInstPrinter(*T, createLoongArchMCInstPrinter); + TargetRegistry::RegisterMCInstrAnalysis(*T, createLoongArchInstrAnalysis); + } +} diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h 
b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h new file mode 100644 index 000000000000..e576b9a49cd6 --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -0,0 +1,54 @@ +//===- LoongArchMCTargetDesc.h - LoongArch Target Descriptions --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides LoongArch specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCTARGETDESC_H +#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCTARGETDESC_H + +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/DataTypes.h" +#include + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectTargetWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; + +MCCodeEmitter *createLoongArchMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx); + +MCAsmBackend *createLoongArchAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, + const MCTargetOptions &Options); + +std::unique_ptr +createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); + +} // namespace llvm + +// Defines symbolic names for LoongArch registers. +#define GET_REGINFO_ENUM +#include "LoongArchGenRegisterInfo.inc" + +// Defines symbolic names for LoongArch instructions. +#define GET_INSTRINFO_ENUM +#include "LoongArchGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "LoongArchGenSubtargetInfo.inc" + +#endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCTARGETDESC_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp new file mode 100644 index 000000000000..1509c436c810 --- /dev/null +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -0,0 +1,51 @@ +//===- LoongArchMatInt.cpp - Immediate materialisation ---------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoongArchMatInt.h"
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
+  // Val:
+  // |           hi32              |             lo32            |
+  // +-----------+-----------------+------------------+----------+
+  // | Highest12 |     Higher20    |       Hi20       |   Lo12   |
+  // +-----------+-----------------+------------------+----------+
+  // 63        52 51             32 31              12 11       0
+  //
+  const int64_t Highest12 = Val >> 52 & 0xFFF;
+  const int64_t Higher20 = Val >> 32 & 0xFFFFF;
+  const int64_t Hi20 = Val >> 12 & 0xFFFFF;
+  const int64_t Lo12 = Val & 0xFFF;
+  InstSeq Insts;
+
+  // A value whose low 52 bits are all zero needs just a single lu52i.d.
+  if (Highest12 != 0 && SignExtend64<52>(Val) == 0) {
+    Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
+    return Insts;
+  }
+
+  // Build the low 32 bits first, then patch in Higher20/Highest12 only when
+  // they differ from the sign extension of the bits below them. For example,
+  // Val == 0x12345678 takes just two instructions:
+  //   lu12i.w $rd, 0x12345   (bits 31..12)
+  //   ori $rd, $rd, 0x678    (bits 11..0)
+  if (Hi20 == 0)
+    Insts.push_back(Inst(LoongArch::ORI, Lo12));
+  else if (SignExtend32<1>(Lo12 >> 11) == SignExtend32<20>(Hi20))
+    Insts.push_back(Inst(LoongArch::ADDI_W, SignExtend64<12>(Lo12)));
+  else {
+    Insts.push_back(Inst(LoongArch::LU12I_W, SignExtend64<20>(Hi20)));
+    if (Lo12 != 0)
+      Insts.push_back(Inst(LoongArch::ORI, Lo12));
+  }
+
+  if (SignExtend32<1>(Hi20 >> 19) != SignExtend32<20>(Higher20))
+    Insts.push_back(Inst(LoongArch::LU32I_D, SignExtend64<20>(Higher20)));
+
+  if (SignExtend32<1>(Higher20 >> 19) != SignExtend32<12>(Highest12))
+    Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
+
+  return Insts;
+}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
new file mode 100644
index 000000000000..945aa91e40c0
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
@@ -0,0 +1,30 @@
+//===- LoongArchMatInt.h - Immediate materialisation ----------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_MATINT_H
+#define LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_MATINT_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
+
+namespace llvm {
+namespace LoongArchMatInt {
+struct Inst {
+  unsigned Opc;
+  int64_t Imm;
+  Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {}
+};
+using InstSeq = SmallVector<Inst>;
+
+// Helper to generate an instruction sequence that will materialise the given
+// immediate value into a register.
+InstSeq generateInstSeq(int64_t Val);
+} // namespace LoongArchMatInt
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
new file mode 100644
index 000000000000..10654510032f
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
@@ -0,0 +1,30 @@
+//===-- LoongArchTargetInfo.cpp - LoongArch Target Implementation ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TargetInfo/LoongArchTargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +using namespace llvm; + +Target &llvm::getTheLoongArch32Target() { + static Target TheLoongArch32Target; + return TheLoongArch32Target; +} + +Target &llvm::getTheLoongArch64Target() { + static Target TheLoongArch64Target; + return TheLoongArch64Target; +} + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetInfo() { + RegisterTarget X( + getTheLoongArch32Target(), "loongarch32", "32-bit LoongArch", + "LoongArch"); + RegisterTarget Y( + getTheLoongArch64Target(), "loongarch64", "64-bit LoongArch", + "LoongArch"); +} diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h new file mode 100644 index 000000000000..6fc13d52c065 --- /dev/null +++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h @@ -0,0 +1,21 @@ +//===-- LoongArchTargetInfo.h - LoongArch Target Implementation -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LOONGARCH_TARGETINFO_LOONGARCHTARGETINFO_H +#define LLVM_LIB_TARGET_LOONGARCH_TARGETINFO_LOONGARCHTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheLoongArch32Target(); +Target &getTheLoongArch64Target(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LOONGARCH_TARGETINFO_LOONGARCHTARGETINFO_H diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index dcd581875f60..0a3d09552535 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -11,6 +11,7 @@ #include "TargetInfo/M68kTargetInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp index a565ff4e004d..31b59c17c0ca 100644 --- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp +++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp @@ -20,8 +20,11 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -29,581 +32,112 @@ using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; -namespace { -constexpr unsigned MaxInstructionWords = 11; - -class M68kInstructionBuffer { - typedef SmallVector BufferType; - BufferType Buffer; - -public: - M68kInstructionBuffer() {} - - template - M68kInstructionBuffer(TIt Start, TIt End) : Buffer(Start, End) {} - - unsigned size() const { return Buffer.size(); } - - BufferType::const_iterator begin() const { return Buffer.begin(); } - BufferType::const_iterator end() const { return Buffer.end(); } - - uint16_t operator[](unsigned Index) const { - assert((Index < Buffer.size()) && 
"tried to read out of bounds word"); - return Buffer[Index]; - } - - void truncate(unsigned NewLength) { - assert((NewLength <= Buffer.size()) && - "instruction buffer too short to truncate"); - Buffer.resize(NewLength); - } - - void dump() const; - - static M68kInstructionBuffer fill(ArrayRef Bytes); -}; - -class M68kInstructionReader { - M68kInstructionBuffer Buffer; - unsigned NumRead; - -public: - M68kInstructionReader(M68kInstructionBuffer Buf) : Buffer(Buf), NumRead(0) {} - - unsigned size() const { return (Buffer.size() * 16) - NumRead; } - - uint64_t readBits(unsigned NumBits); -}; - -struct M68kInstructionLookup { - unsigned OpCode; - M68kInstructionBuffer Mask; - M68kInstructionBuffer Value; - - unsigned size() const { return Mask.size(); } - - // Check whether this instruction could possibly match the given bytes. - bool matches(const M68kInstructionBuffer &Test) const; - void dump() const; -}; - -class M68kInstructionLookupBuilder { - std::array Mask; - std::array Value; - unsigned NumWritten; - -public: - M68kInstructionLookupBuilder() : NumWritten(0) { - Mask.fill(0); - Value.fill(0); - } - - unsigned numWords() const { - assert(!(NumWritten & 0xf) && "instructions must be whole words"); - return NumWritten >> 4; - } - - bool isValid() const; - M68kInstructionLookup build(unsigned OpCode); - void addBits(unsigned N, uint64_t Bits); - void skipBits(unsigned N); -}; - -/// A disassembler class for M68k. -class M68kDisassembler : public MCDisassembler { - MCInstrInfo *MCII; - std::vector Lookups; - -public: - M68kDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, - MCInstrInfo *MCII) - : MCDisassembler(STI, Ctx), MCII(MCII) { - buildBeadTable(); - } - virtual ~M68kDisassembler() {} - - void buildBeadTable(); - DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, - ArrayRef Bytes, uint64_t Address, - raw_ostream &CStream) const override; - void decodeReg(MCInst &Instr, unsigned int Bead, - M68kInstructionReader &Reader, unsigned &Scratch) const; - void decodeImm(MCInst &Instr, unsigned int Bead, - M68kInstructionReader &Reader, unsigned &Scratch) const; - unsigned int getRegOperandIndex(MCInst &Instr, unsigned int Bead) const; - unsigned int getImmOperandIndex(MCInst &Instr, unsigned int Bead) const; -}; -} // namespace - -static unsigned RegisterDecode[] = { - M68k::A0, M68k::A1, M68k::A2, M68k::A3, M68k::A4, M68k::A5, - M68k::A6, M68k::SP, M68k::D0, M68k::D1, M68k::D2, M68k::D3, - M68k::D4, M68k::D5, M68k::D6, M68k::D7, +static const unsigned RegisterDecode[] = { + M68k::D0, M68k::D1, M68k::D2, M68k::D3, M68k::D4, M68k::D5, + M68k::D6, M68k::D7, M68k::A0, M68k::A1, M68k::A2, M68k::A3, + M68k::A4, M68k::A5, M68k::A6, M68k::SP, }; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD -void M68kInstructionBuffer::dump() const { - for (auto Word : Buffer) { - for (unsigned B = 0; B < 16; ++B) { - uint16_t Bit = (1 << (16 - B - 1)); - unsigned IsClear = !(Word & Bit); - - if (B == 8) - dbgs() << " "; - - char Ch = IsClear ? 
'0' : '1'; - dbgs() << Ch; - } - - dbgs() << " "; - } - - dbgs() << "\n"; +static DecodeStatus DecodeRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo >= 16) + return DecodeStatus::Fail; + Inst.addOperand(MCOperand::createReg(RegisterDecode[RegNo])); + return DecodeStatus::Success; } -#endif - -M68kInstructionBuffer M68kInstructionBuffer::fill(ArrayRef Bytes) { - SmallVector Buffer; - Buffer.resize(std::min(Bytes.size() / 2, Buffer.max_size())); - - for (unsigned I = 0, E = Buffer.size(); I < E; ++I) { - unsigned Offset = I * 2; - uint64_t Hi = Bytes[Offset]; - uint64_t Lo = Bytes[Offset + 1]; - uint64_t Word = (Hi << 8) | Lo; - Buffer[I] = Word; - - LLVM_DEBUG( - errs() << format("Read word %x (%d)\n", (unsigned)Word, Buffer.size())); - } - - return M68kInstructionBuffer(Buffer.begin(), Buffer.end()); -} - -uint64_t M68kInstructionReader::readBits(unsigned NumBits) { - assert((size() >= NumBits) && "not enough bits to read"); - - // We have to read the bits in 16-bit chunks because we read them as - // 16-bit words but they're actually written in big-endian. If a read - // crosses a word boundary we have to be careful. - - uint64_t Value = 0; - unsigned BitsRead = 0; - - while (BitsRead < NumBits) { - unsigned AvailableThisWord = 16 - (NumRead & 0xf); - unsigned ToRead = std::min(NumBits, AvailableThisWord); - - unsigned WordIndex = NumRead >> 4; - uint64_t ThisWord = Buffer[WordIndex] >> (NumRead & 0xf); - uint64_t Mask = (1 << ToRead) - 1; - Value |= (ThisWord & Mask) << BitsRead; - NumRead += ToRead; - BitsRead += ToRead; - } - return Value; -} - -bool M68kInstructionLookup::matches(const M68kInstructionBuffer &Test) const { - if (Test.size() < Value.size()) - return false; - - for (unsigned I = 0, E = Value.size(); I < E; ++I) { - uint16_t Have = Test[I]; - uint16_t Need = Value[I]; - uint16_t WordMask = Mask[I]; - - if ((Have & WordMask) != Need) - return false; - } - - return true; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD -void M68kInstructionLookup::dump() const { - dbgs() << "M68kInstructionLookup " << OpCode << " "; - - for (unsigned I = 0, E = Mask.size(); I < E; ++I) { - uint16_t WordMask = Mask[I]; - uint16_t WordValue = Value[I]; - - for (unsigned B = 0; B < 16; ++B) { - uint16_t Bit = (1 << (15 - B)); - unsigned IsMasked = !(WordMask & Bit); - unsigned IsClear = !(WordValue & Bit); - - if (B == 8) - dbgs() << " "; - - char Ch = IsMasked ? '?' : (IsClear ? 
'0' : '1'); - dbgs() << Ch; - } - - dbgs() << " "; - } - dbgs() << "\n"; +static DecodeStatus DecodeDR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } -#endif -bool M68kInstructionLookupBuilder::isValid() const { - for (unsigned I = 0, E = numWords(); I < E; ++I) - if (Mask[I]) - return true; - - return false; +static DecodeStatus DecodeDR16RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } -M68kInstructionLookup M68kInstructionLookupBuilder::build(unsigned OpCode) { - unsigned NumWords = numWords(); - M68kInstructionBuffer MaskBuffer(Mask.begin(), Mask.begin() + NumWords); - M68kInstructionBuffer ValueBuffer(Value.begin(), Value.begin() + NumWords); - M68kInstructionLookup Ret; - Ret.OpCode = OpCode; - Ret.Mask = MaskBuffer; - Ret.Value = ValueBuffer; - return Ret; +static DecodeStatus DecodeDR8RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } -void M68kInstructionLookupBuilder::addBits(unsigned N, uint64_t Bits) { - while (N > 0) { - unsigned WordIndex = NumWritten >> 4; - unsigned WordOffset = NumWritten & 0xf; - unsigned AvailableThisWord = 16 - WordOffset; - unsigned ToWrite = std::min(AvailableThisWord, N); - - uint16_t WordMask = (1 << ToWrite) - 1; - uint16_t BitsToWrite = Bits & WordMask; - - Value[WordIndex] |= (BitsToWrite << WordOffset); - Mask[WordIndex] |= (WordMask << WordOffset); - - Bits >>= ToWrite; - N -= ToWrite; - NumWritten += ToWrite; - } +static DecodeStatus DecodeAR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo | 8ULL, Address, Decoder); } -void M68kInstructionLookupBuilder::skipBits(unsigned N) { NumWritten += N; } - -// This is a bit of a hack: we can't generate this table at table-gen time -// because some of the definitions are in our platform. -void M68kDisassembler::buildBeadTable() { - const unsigned NumInstr = M68k::INSTRUCTION_LIST_END; - Lookups.reserve(NumInstr); - - for (unsigned I = 0; I < NumInstr; ++I) { - M68kInstructionLookupBuilder Builder; - - for (const uint8_t *PartPtr = M68k::getMCInstrBeads(I); *PartPtr; - ++PartPtr) { - uint8_t Bead = *PartPtr; - unsigned Ext = Bead >> 4; - unsigned Op = Bead & 0xf; - - switch (Op) { - case M68kBeads::Ctrl: - // Term will have already been skipped by the loop. - assert((Ext == M68kBeads::Ignore) && "unexpected command bead"); - break; - - case M68kBeads::Bits1: - Builder.addBits(1, Ext); - break; - - case M68kBeads::Bits2: - Builder.addBits(2, Ext); - break; - - case M68kBeads::Bits3: - Builder.addBits(3, Ext); - break; - - case M68kBeads::Bits4: - Builder.addBits(4, Ext); - break; - - case M68kBeads::DAReg: - case M68kBeads::DA: - case M68kBeads::DReg: - case M68kBeads::Reg: - if (Op != M68kBeads::DA) - Builder.skipBits(3); - - if (Op != M68kBeads::Reg && Op != M68kBeads::DReg) - Builder.skipBits(1); - - break; - - case M68kBeads::Disp8: - Builder.skipBits(8); - break; - - case M68kBeads::Imm8: - case M68kBeads::Imm16: - Builder.skipBits(16); - break; - - case M68kBeads::Imm32: - Builder.skipBits(32); - break; - - case M68kBeads::Imm3: - Builder.skipBits(3); - break; - - default: - llvm_unreachable("unhandled bead type"); - } - } - - // Ignore instructions which are unmatchable (usually pseudo instructions). 
- if (!Builder.isValid()) - continue; - - Lookups.push_back(Builder.build(I)); - } +static DecodeStatus DecodeAR16RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo | 8ULL, Address, Decoder); } -unsigned M68kDisassembler::getRegOperandIndex(MCInst &Instr, - unsigned Bead) const { - unsigned Ext = Bead >> 4; - - const MCInstrDesc &Desc = MCII->get(Instr.getOpcode()); - auto MIOpIdx = M68k::getLogicalOperandIdx(Instr.getOpcode(), Ext & 7); - - if (M68kII::hasMultiMIOperands(Instr.getOpcode(), Ext & 7)) { - bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL; - if (IsPCRel) - MIOpIdx += M68k::PCRelIndex; - else if (Ext & 8) - MIOpIdx += M68k::MemIndex; - else - MIOpIdx += M68k::MemBase; - } - - return MIOpIdx; +static DecodeStatus DecodeXR32RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } -unsigned M68kDisassembler::getImmOperandIndex(MCInst &Instr, - unsigned Bead) const { - unsigned Ext = Bead >> 4; - - const MCInstrDesc &Desc = MCII->get(Instr.getOpcode()); - auto MIOpIdx = M68k::getLogicalOperandIdx(Instr.getOpcode(), Ext & 7); - - if (M68kII::hasMultiMIOperands(Instr.getOpcode(), Ext & 7)) { - bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL; - if (IsPCRel) - MIOpIdx += M68k::PCRelDisp; - else if (Ext & 8) - MIOpIdx += M68k::MemOuter; - else - MIOpIdx += M68k::MemDisp; - } - - return MIOpIdx; +static DecodeStatus DecodeXR16RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } -void M68kDisassembler::decodeReg(MCInst &Instr, unsigned Bead, - M68kInstructionReader &Reader, - unsigned &Scratch) const { - unsigned Op = Bead & 0xf; - LLVM_DEBUG(errs() << format("decodeReg %x\n", Bead)); - - if (Op != M68kBeads::DA) - Scratch = (Scratch & ~7) | Reader.readBits(3); - - if (Op != M68kBeads::Reg) { - bool DA = (Op != M68kBeads::DReg) && Reader.readBits(1); - if (!DA) - Scratch |= 8; - else - Scratch &= ~8; - } +static DecodeStatus DecodeCCRCRegisterClass(MCInst &Inst, APInt &Insn, + uint64_t Address, + const void *Decoder) { + llvm_unreachable("unimplemented"); } -void M68kDisassembler::decodeImm(MCInst &Instr, unsigned Bead, - M68kInstructionReader &Reader, - unsigned &Scratch) const { - unsigned Op = Bead & 0xf; - LLVM_DEBUG(errs() << format("decodeImm %x\n", Bead)); +#include "M68kGenDisassemblerTable.inc" - unsigned NumToRead; - switch (Op) { - case M68kBeads::Disp8: - NumToRead = 8; - break; - case M68kBeads::Imm8: - case M68kBeads::Imm16: - NumToRead = 16; - break; - case M68kBeads::Imm32: - NumToRead = 32; - break; - case M68kBeads::Imm3: - NumToRead = 3; - break; - default: - llvm_unreachable("invalid imm"); - } +/// A disassembler class for M68k. +struct M68kDisassembler : public MCDisassembler { + M68kDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + virtual ~M68kDisassembler() {} - Scratch = (NumToRead < 32) ? (Scratch << NumToRead) : 0; - Scratch |= Reader.readBits(NumToRead); -} + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &CStream) const override; +}; DecodeStatus M68kDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const { - // Read and shift the input (fetch as much as we can for now). 
- auto Buffer = M68kInstructionBuffer::fill(Bytes); - if (Buffer.size() == 0) - return Fail; - - // Check through our lookup table. - bool Found = false; - for (unsigned I = 0, E = Lookups.size(); I < E; ++I) { - const M68kInstructionLookup &Lookup = Lookups[I]; - if (!Lookup.matches(Buffer)) - continue; - - Found = true; - Size = Lookup.size() * 2; - Buffer.truncate(Lookup.size()); - Instr.setOpcode(Lookup.OpCode); - LLVM_DEBUG(errs() << "decoding instruction " << MCII->getName(Lookup.OpCode) - << "\n"); - break; - } - - if (!Found) - return Fail; - - M68kInstructionReader Reader(Buffer); - const MCInstrDesc &Desc = MCII->get(Instr.getOpcode()); - unsigned NumOperands = Desc.NumOperands; - - // Now use the beads to decode the operands. - enum class OperandType { - Invalid, - Reg, - Imm, - }; - - SmallVector OpType(NumOperands, OperandType::Invalid); - SmallVector Scratch(NumOperands, 0); - for (const uint8_t *PartPtr = M68k::getMCInstrBeads(Instr.getOpcode()); - *PartPtr; ++PartPtr) { - uint8_t Bead = *PartPtr; - unsigned Ext = Bead >> 4; - unsigned Op = Bead & 0xf; - unsigned MIOpIdx; - - switch (Op) { - case M68kBeads::Ctrl: - // Term will have already been skipped by the loop. - assert((Ext == M68kBeads::Ignore) && "unexpected command bead"); - break; - - // These bits are constant - if we're here we've already matched them. - case M68kBeads::Bits1: - Reader.readBits(1); - break; - case M68kBeads::Bits2: - Reader.readBits(2); - break; - case M68kBeads::Bits3: - Reader.readBits(3); - break; - case M68kBeads::Bits4: - Reader.readBits(4); - break; - - case M68kBeads::DAReg: - case M68kBeads::DA: - case M68kBeads::DReg: - case M68kBeads::Reg: - MIOpIdx = getRegOperandIndex(Instr, Bead); - assert(((OpType[MIOpIdx] == OperandType::Invalid) || - (OpType[MIOpIdx] == OperandType::Reg)) && - "operands cannot change type"); - OpType[MIOpIdx] = OperandType::Reg; - decodeReg(Instr, Bead, Reader, Scratch[MIOpIdx]); - break; - - case M68kBeads::Disp8: - case M68kBeads::Imm8: - case M68kBeads::Imm16: - case M68kBeads::Imm32: - case M68kBeads::Imm3: - MIOpIdx = getImmOperandIndex(Instr, Bead); - assert(((OpType[MIOpIdx] == OperandType::Invalid) || - (OpType[MIOpIdx] == OperandType::Imm)) && - "operands cannot change type"); - OpType[MIOpIdx] = OperandType::Imm; - decodeImm(Instr, Bead, Reader, Scratch[MIOpIdx]); - break; - - default: - llvm_unreachable("unhandled bead type"); - } - } - - // Copy constrained operands. 
- for (unsigned DstMIOpIdx = 0; DstMIOpIdx < NumOperands; ++DstMIOpIdx) { - int TiedTo = Desc.getOperandConstraint(DstMIOpIdx, MCOI::TIED_TO); - if (TiedTo < 0) - continue; - - unsigned SrcMIOpIdx = TiedTo; - - unsigned OpCount = 0; - for (unsigned I = 0;; ++I) { - unsigned Offset = M68k::getLogicalOperandIdx(Instr.getOpcode(), I); - assert(Offset <= SrcMIOpIdx && "missing logical operand"); - if (Offset == SrcMIOpIdx) { - OpCount = M68k::getLogicalOperandSize(Instr.getOpcode(), I); - break; - } + DecodeStatus Result; + auto MakeUp = [&](APInt &Insn, unsigned InstrBits) { + unsigned Idx = Insn.getBitWidth() >> 3; + unsigned RoundUp = alignTo(InstrBits, Align(16)); + if (RoundUp > Insn.getBitWidth()) + Insn = Insn.zext(RoundUp); + RoundUp = RoundUp >> 3; + for (; Idx < RoundUp; Idx += 2) { + Insn.insertBits(support::endian::read16be(&Bytes[Idx]), Idx * 8, 16); } - assert(OpCount != 0 && "operand count not found"); - - for (unsigned I = 0; I < OpCount; ++I) { - assert(OpType[DstMIOpIdx + I] == OperandType::Invalid && - "tried to stomp over operand whilst applying constraints"); - OpType[DstMIOpIdx + I] = OpType[SrcMIOpIdx + I]; - Scratch[DstMIOpIdx + I] = Scratch[SrcMIOpIdx + I]; - } - } - - // Create the operands from our scratch space. - for (unsigned O = 0; O < NumOperands; ++O) { - switch (OpType[O]) { - case OperandType::Invalid: - assert(false && "operand not parsed"); - - case OperandType::Imm: - Instr.addOperand(MCOperand::createImm(Scratch[O])); - break; - - case OperandType::Reg: - Instr.addOperand(MCOperand::createReg(RegisterDecode[Scratch[O]])); - break; - } - } - - assert((Reader.size() == 0) && "wrong number of bits consumed"); - return Success; + }; + APInt Insn(16, support::endian::read16be(Bytes.data())); + // 2 bytes of data are consumed, so set Size to 2 + // If we don't do this, disassembler may generate result even + // the encoding is invalid. We need to let it fail correctly. 
+ Size = 2; + Result = decodeInstruction(DecoderTable80, Instr, Insn, Address, this, STI, + MakeUp); + if (Result == DecodeStatus::Success) + Size = InstrLenTable[Instr.getOpcode()] >> 3; + return Result; } static MCDisassembler *createM68kDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new M68kDisassembler(STI, Ctx, T.createMCInstrInfo()); + return new M68kDisassembler(STI, Ctx); } extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kDisassembler() { diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp index b3d17184f1fe..e0aaa9d51cc3 100644 --- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/TargetCallingConv.h" using namespace llvm; @@ -27,10 +28,12 @@ using namespace llvm; M68kCallLowering::M68kCallLowering(const M68kTargetLowering &TLI) : CallLowering(&TLI) {} -struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { - OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB) - : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB) {} +struct M68kOutgoingArgHandler : public CallLowering::OutgoingValueHandler { + M68kOutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB) + : OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB), + DL(MIRBuilder.getMF().getDataLayout()), + STI(MIRBuilder.getMF().getSubtarget()) {} void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign VA) override { @@ -41,16 +44,29 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, MachinePointerInfo &MPO, CCValAssign &VA) override { - llvm_unreachable("unimplemented"); + MachineFunction &MF = MIRBuilder.getMF(); + Register ExtReg = extendRegister(ValVReg, VA); + + auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, MemTy, + inferAlignFromPtrInfo(MF, MPO)); + MIRBuilder.buildStore(ExtReg, Addr, *MMO); } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO, ISD::ArgFlagsTy Flags) override { - llvm_unreachable("unimplemented"); + LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); + LLT SType = LLT::scalar(DL.getPointerSizeInBits(0)); + Register StackReg = STI.getRegisterInfo()->getStackRegister(); + auto SPReg = MIRBuilder.buildCopy(p0, StackReg).getReg(0); + auto OffsetReg = MIRBuilder.buildConstant(SType, Offset); + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); + MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); + return AddrReg.getReg(0); } - MachineInstrBuilder MIB; + const DataLayout &DL; + const M68kSubtarget &STI; }; bool M68kCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, ArrayRef VRegs, @@ -72,7 +88,7 @@ bool M68kCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv()); OutgoingValueAssigner ArgAssigner(AssignFn); - OutgoingArgHandler ArgHandler(MIRBuilder, MRI, MIB); + M68kOutgoingArgHandler ArgHandler(MIRBuilder, MRI, MIB); Success = determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgs, MIRBuilder, 
F.getCallingConv(), F.isVarArg()); @@ -144,9 +160,73 @@ Register M68kIncomingValueHandler::getStackAddress(uint64_t Size, return AddrReg.getReg(0); } +void CallReturnHandler::assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign VA) { + MIB.addDef(PhysReg, RegState::Implicit); + MIRBuilder.buildCopy(ValVReg, PhysReg); +} + bool M68kCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { - return false; + MachineFunction &MF = MIRBuilder.getMF(); + Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + const M68kTargetLowering &TLI = *getTLI(); + const M68kSubtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const M68kRegisterInfo *TRI = STI.getRegisterInfo(); + + SmallVector OutArgs; + for (auto &OrigArg : Info.OrigArgs) + splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); + + SmallVector InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) + splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv); + + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + auto CallSeqStart = MIRBuilder.buildInstr(AdjStackDown); + + unsigned Opc = TLI.getTargetMachine().isPositionIndependent() ? M68k::CALLq + : Info.Callee.isReg() ? M68k::CALLj + : M68k::CALLb; + + auto MIB = MIRBuilder.buildInstrNoInsert(Opc) + .add(Info.Callee) + .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); + + CCAssignFn *AssignFn = TLI.getCCAssignFn(Info.CallConv, false, Info.IsVarArg); + OutgoingValueAssigner Assigner(AssignFn); + M68kOutgoingArgHandler Handler(MIRBuilder, MRI, MIB); + if (!determineAndHandleAssignments(Handler, Assigner, OutArgs, MIRBuilder, + Info.CallConv, Info.IsVarArg)) + return false; + + if (Info.Callee.isReg()) + constrainOperandRegClass(MF, *TRI, MRI, *STI.getInstrInfo(), + *STI.getRegBankInfo(), *MIB, MIB->getDesc(), + Info.Callee, 0); + + MIRBuilder.insertInstr(MIB); + + if (!Info.OrigRet.Ty->isVoidTy()) { + CCAssignFn *RetAssignFn = + TLI.getCCAssignFn(Info.CallConv, true, Info.IsVarArg); + + OutgoingValueAssigner Assigner(RetAssignFn, RetAssignFn); + CallReturnHandler Handler(MIRBuilder, MRI, MIB); + if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder, + Info.CallConv, Info.IsVarArg)) + return false; + } + + CallSeqStart.addImm(Assigner.StackOffset).addImm(0); + + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + MIRBuilder.buildInstr(AdjStackUp).addImm(Assigner.StackOffset).addImm(0); + + return true; } bool M68kCallLowering::enableBigEndian() const { return true; } diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h index 24212e6dd9c6..a1589e96aa3d 100644 --- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h +++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h @@ -22,6 +22,7 @@ namespace llvm { class M68kTargetLowering; +class MachineInstrBuilder; class M68kCallLowering : public CallLowering { // TODO: We are only supporting return instruction with no value at this time @@ -67,6 +68,17 @@ struct FormalArgHandler : public M68kIncomingValueHandler { : M68kIncomingValueHandler(MIRBuilder, MRI) {} }; +struct CallReturnHandler : public M68kIncomingValueHandler { + CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder &MIB) + : M68kIncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {} + +private: + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign VA) override; + + MachineInstrBuilder &MIB; +}; } // end 
namespace llvm #endif // LLVM_LIB_TARGET_M68K_GLSEL_M68KCALLLOWERING_H diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp index b6ed6ab28a5d..f833eb2d19d4 100644 --- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp @@ -13,9 +13,9 @@ #include "M68kRegisterBankInfo.h" #include "M68kInstrInfo.h" // For the register classes #include "M68kSubtarget.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h index 6c0b8ca7ba5a..493c139f018c 100644 --- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h +++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H #define LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "M68kGenRegisterBank.inc" diff --git a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp index 7f0c0dd92dbb..cbd69f24666e 100644 --- a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp +++ b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp @@ -231,7 +231,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - STI = &static_cast<const M68kSubtarget &>(MF.getSubtarget()); + STI = &MF.getSubtarget<M68kSubtarget>(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MFI = MF.getInfo<M68kMachineFunctionInfo>(); diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp index acfa30f28c2b..51a148f5aa04 100644 --- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp +++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp @@ -302,7 +302,7 @@ bool M68kExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { } bool M68kExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - STI = &static_cast<const M68kSubtarget &>(MF.getSubtarget()); + STI = &MF.getSubtarget<M68kSubtarget>(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MFI = MF.getInfo<M68kMachineFunctionInfo>(); diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp index 9ef97b96ea9a..f9459e284aef 100644 --- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp +++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp @@ -181,6 +181,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override; + bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; private: /// Keep a pointer to the M68kSubtarget around so that we can @@ -311,8 +312,35 @@ private: }; } // namespace +bool M68kDAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, + SDNode *Root) const { + if (OptLevel == CodeGenOpt::None) + return false; + + if (U == Root) { + switch (U->getOpcode()) { + default: + return true; + case M68kISD::SUB: + case ISD::SUB: + // Prefer NEG instruction when zero subtracts a value. + // e.g. + // move.l #0, %d0 + // sub.l (4,%sp), %d0 + // vs.
+ move.l (4,%sp), %d0 + neg.l %d0 + if (llvm::isNullConstant(U->getOperand(0))) + return false; + break; + } + } + + return true; +} + bool M68kDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast<const M68kSubtarget &>(MF.getSubtarget()); + Subtarget = &MF.getSubtarget<M68kSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index dba190a2ebc0..250519efd14a 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -101,6 +101,9 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM, setOperationAction(OP, MVT::i32, Expand); } + for (auto OP : {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}) + setOperationAction(OP, MVT::i32, Custom); + // Add/Sub overflow ops with MVT::Glues are lowered to CCR dependences. for (auto VT : {MVT::i8, MVT::i16, MVT::i32}) { setOperationAction(ISD::ADDC, VT, Custom); @@ -170,7 +173,7 @@ MVT M68kTargetLowering::getScalarShiftAmountTy(const DataLayout &DL, if (Ty.isSimple()) { return Ty.getSimpleVT(); } - return MVT::getIntegerVT(8 * DL.getPointerSize(0)); + return MVT::getIntegerVT(DL.getPointerSizeInBits(0)); } #include "M68kGenCallingConv.inc" @@ -1354,6 +1357,12 @@ SDValue M68kTargetLowering::LowerOperation(SDValue Op, return LowerVASTART(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG, true); + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG, false); } } @@ -3239,6 +3248,102 @@ SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue M68kTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // if Shamt - register size < 0: // Shamt < register size + // Lo = Lo << Shamt + // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (register size - 1 ^ Shamt)) + // else: + // Lo = 0 + // Hi = Lo << (Shamt - register size) + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue MinusRegisterSize = DAG.getConstant(-32, DL, VT); + SDValue RegisterSizeMinus1 = DAG.getConstant(32 - 1, DL, VT); + SDValue ShamtMinusRegisterSize = + DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusRegisterSize); + SDValue RegisterSizeMinus1Shamt = + DAG.getNode(ISD::XOR, DL, VT, RegisterSizeMinus1, Shamt); + + SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); + SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One); + SDValue ShiftRightLo = + DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, RegisterSizeMinus1Shamt); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt); + SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo); + SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusRegisterSize); + + SDValue CC = + DAG.getSetCC(DL, MVT::i8, ShamtMinusRegisterSize, Zero, ISD::SETLT); + + Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero); + Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); + + return DAG.getMergeValues({Lo, Hi}, DL); +} + +SDValue M68kTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG, + bool IsSRA) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt =
Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // SRA expansion: + // if Shamt - register size < 0: // Shamt < register size + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (register size - 1 ^ Shamt)) + // Hi = Hi >>s Shamt + // else: + // Lo = Hi >>s (Shamt - register size); + // Hi = Hi >>s (register size - 1) + // + // SRL expansion: + // if Shamt - register size < 0: // Shamt < register size + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (register size - 1 ^ Shamt)) + // Hi = Hi >>u Shamt + // else: + // Lo = Hi >>u (Shamt - register size); + // Hi = 0; + + unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL; + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue MinusRegisterSize = DAG.getConstant(-32, DL, VT); + SDValue RegisterSizeMinus1 = DAG.getConstant(32 - 1, DL, VT); + SDValue ShamtMinusRegisterSize = + DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusRegisterSize); + SDValue RegisterSizeMinus1Shamt = + DAG.getNode(ISD::XOR, DL, VT, RegisterSizeMinus1, Shamt); + + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt); + SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One); + SDValue ShiftLeftHi = + DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, RegisterSizeMinus1Shamt); + SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi); + SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt); + SDValue LoFalse = + DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusRegisterSize); + SDValue HiFalse = + IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, RegisterSizeMinus1) : Zero; + + SDValue CC = + DAG.getSetCC(DL, MVT::i8, ShamtMinusRegisterSize, Zero, ISD::SETLT); + + Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse); + Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); + + return DAG.getMergeValues({Lo, Hi}, DL); +} + //===----------------------------------------------------------------------===// // DAG Combine //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/M68k/M68kISelLowering.h b/llvm/lib/Target/M68k/M68kISelLowering.h index 9375a99962eb..f759a7d939c8 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.h +++ b/llvm/lib/Target/M68k/M68kISelLowering.h @@ -220,6 +220,8 @@ private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index ef50de576641..2339e3caa517 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -27,10 +27,35 @@ /// //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// OPMODE Encoding +//===----------------------------------------------------------------------===// +class MxOpModeEncoding encoding> { + bits<3> Value = encoding; +} + +// op EA, Dn +def MxOpMode8_d_EA : MxOpModeEncoding<0b000>; +def MxOpMode16_d_EA : MxOpModeEncoding<0b001>; +def MxOpMode32_d_EA : MxOpModeEncoding<0b010>; + +// op Dn, EA +def MxOpMode8_EA_d : MxOpModeEncoding<0b100>; +def MxOpMode16_EA_d : 
MxOpModeEncoding<0b101>; +def MxOpMode32_EA_d : MxOpModeEncoding<0b110>; + +// op EA, An +def MxOpMode16_a_EA : MxOpModeEncoding<0b011>; +def MxOpMode32_a_EA : MxOpModeEncoding<0b111>; + + //===----------------------------------------------------------------------===// // Encoding //===----------------------------------------------------------------------===// +let Defs = [CCR] in { +let Constraints = "$src = $dst" in { + /// Encoding for Normal forms /// ---------------------------------------------------- /// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0 /// | | | EFFECTIVE ADDRESS /// x x x x | REG | OP MODE | MODE | REG /// ---------------------------------------------------- -class MxArithEncoding - : MxEncoding; -/// Encoding for Extended forms -/// ------------------------------------------------------ -/// F E D C | B A 9 | 8 | 7 6 | 5 4 | 3 | 2 1 0 -/// ------------------------------------------------------ -/// x x x x | REG Rx | 1 | SIZE | 0 0 | M | REG Ry -/// ------------------------------------------------------ -/// Rx - destination -/// Ry - source -/// M - address mode switch -class MxArithXEncoding - : MxEncoding, SIZE, MxBead1Bit<0b1>, DST, CMD>; +// $reg, $ccr <- $reg op $reg +class MxBiArOp_R_RR_xEA<string MN, SDNode NODE, MxType DST_TYPE, MxType SRC_TYPE, bits<4> CMD> + : MxInst<(outs DST_TYPE.ROp:$dst), (ins DST_TYPE.ROp:$src, SRC_TYPE.ROp:$opd), + MN#"."#DST_TYPE.Prefix#"\t$opd, $dst", + [(set DST_TYPE.VT:$dst, CCR, (NODE DST_TYPE.VT:$src, SRC_TYPE.VT:$opd))]> { + let Inst = (descend + CMD, (operand "$dst", 3), + !cast<MxOpModeEncoding>("MxOpMode"#DST_TYPE.Size#"_"#DST_TYPE.RLet#"_EA").Value, + !cond( + !eq(SRC_TYPE.RLet, "r") : (descend 0b00, (operand "$opd", 4)), + !eq(SRC_TYPE.RLet, "d") : (descend 0b000, (operand "$opd", 3)) + ) + ); +} + +/// This Op is similar to the one above, except that it uses the reversed +/// opmode; some commands (e.g. eor) do not support the dEA or rEA modes and +/// require EAd for register-only operations. +/// NOTE: when using dd commands it is seemingly irrelevant which opmode to use, +/// but some opcodes support address registers and some do not, which creates +/// this mess.
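
To make the new (descend ...) encoders easier to read: descend concatenates its fields most-significant-first into the 16-bit word sketched in the table above (CMD | REG | OP MODE | EA MODE | EA REG), and the reversed EA_d opmodes simply swap which operand lives in the REG field. Below is a minimal standalone C++ sketch of that packing, not part of the patch; the helper name packArithWord is invented here, and the expected words are the standard M68000 encodings of add.l and eor.l.

#include <cassert>
#include <cstdint>

// Pack the "normal form" arithmetic word: 4-bit opcode, 3-bit register,
// 3-bit opmode, then the 6-bit effective address (3-bit mode, 3-bit register).
static uint16_t packArithWord(unsigned Cmd, unsigned Reg, unsigned OpMode,
                              unsigned EAMode, unsigned EAReg) {
  return (Cmd << 12) | (Reg << 9) | (OpMode << 6) | (EAMode << 3) | EAReg;
}

int main() {
  // add.l %d1, %d0: the REG field holds the destination %d0, opmode 0b010
  // (MxOpMode32_d_EA), and the source %d1 sits in the EA field.
  assert(packArithWord(0xD, 0, 0b010, 0b000, 1) == 0xD081);
  // eor.l %d1, %d0: reversed opmode 0b110 (MxOpMode32_EA_d), so the REG
  // field holds the source %d1 and the destination %d0 sits in the EA field.
  assert(packArithWord(0xB, 1, 0b110, 0b000, 0) == 0xB380);
  return 0;
}

The reversed form is exactly what the next class, MxBiArOp_R_RR_EAd, emits.
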
+class MxBiArOp_R_RR_EAd CMD> + : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd), + MN#"."#TYPE.Prefix#"\t$opd, $dst", + [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))]> { + let Inst = (descend + CMD, (operand "$opd", 3), + !cast("MxOpMode"#TYPE.Size#"_EA_"#TYPE.RLet).Value, + /*Destination can only be a data register*/ + /*MODE*/0b000, + /*REGISTER*/(operand "$dst", 3)); +} + +let mayLoad = 1 in +class MxBiArOp_R_RM CMD, MxEncMemOp SRC_ENC> + : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, OPD:$opd), + MN#"."#TYPE.Prefix#"\t$opd, $dst", + [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, (TYPE.Load PAT:$opd)))]> { + let Inst = (ascend + (descend CMD, (operand "$dst", 3), + !cast("MxOpMode"#TYPE.Size#"_"#TYPE.RLet#"_EA").Value, + SRC_ENC.EA), + SRC_ENC.Supplement + ); +} /// Encoding for Immediate forms /// --------------------------------------------------- @@ -69,211 +125,154 @@ class MxArithXEncoding - : MxEncoding, - // Source - SRC_EXT.Imm, SRC_EXT.B8, SRC_EXT.Scale, - SRC_EXT.WL, SRC_EXT.DAReg, - // Destination - DST_EXT.Imm, DST_EXT.B8, DST_EXT.Scale, - DST_EXT.WL, DST_EXT.DAReg>; - - -//===----------------------------------------------------------------------===// -// Add/Sub -//===----------------------------------------------------------------------===// - -let Defs = [CCR] in { -let Constraints = "$src = $dst" in { - -// $reg, $ccr <- $reg op $reg -class MxBiArOp_RFRR_xEA CMD, MxBead REG> - : MxInst<(outs DST_TYPE.ROp:$dst), (ins DST_TYPE.ROp:$src, SRC_TYPE.ROp:$opd), - MN#"."#DST_TYPE.Prefix#"\t$opd, $dst", - [(set DST_TYPE.VT:$dst, CCR, (NODE DST_TYPE.VT:$src, SRC_TYPE.VT:$opd))], - MxArithEncoding, - !cast("MxOpMode"#DST_TYPE.Size#DST_TYPE.RLet#"EA"), - REG, - !cast("MxEncEA"#SRC_TYPE.RLet#"_2"), - MxExtEmpty>>; - -/// This Op is similar to the one above except it uses reversed opmode, some -/// commands(e.g. eor) do not support dEA or rEA modes and require EAd for -/// register only operations. -/// NOTE when using dd commands it is irrelevant which opmode to use(as it seems) -/// but some opcodes support address register and some do not which creates this -/// mess. -class MxBiArOp_RFRR_EAd CMD> - : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd), - MN#"."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))], - MxArithEncoding, - !cast("MxOpMode"#TYPE.Size#"EAd"), - MxBeadDReg<2>, MxEncEAd_0, MxExtEmpty>>; // $reg <- $reg op $imm -class MxBiArOp_RFRI_xEA CMD> +class MxBiArOp_R_RI_xEA CMD> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.IOp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))], - MxArithEncoding, - !cast("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"), - MxBeadDReg<0>, MxEncEAi, - !cast("MxExtI"#TYPE.Size#"_2")>>; + [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))]> { + let Inst = (ascend + (descend CMD, (operand "$dst", 3), + !cast("MxOpMode"#TYPE.Size#"_"#TYPE.RLet#"_EA").Value, + MxEncAddrMode_i<"opd", TYPE.Size>.EA), + MxEncAddrMode_i<"opd", TYPE.Size>.Supplement + ); +} // Again, there are two ways to write an immediate to Dn register either dEA -// opmode or using *I encoding, and again some instrucitons also support address +// opmode or using *I encoding, and again some instructions also support address // registers some do not. 
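
Concretely, the two immediate forms that the comment above contrasts produce different first words for the same operation. A small illustrative C++ sketch follows; the words are derived from the M68000 encoding tables, not from this patch, and the variable names are made up.

#include <cassert>
#include <cstdint>

int main() {
  // Form 1: plain "add" with the dEA opmode and an immediate effective
  // address (mode 0b111, register 0b100).
  // add.l #imm, %d0 -> first word 0xD0BC, followed by the 32-bit immediate.
  uint16_t AddForm = (0xD << 12) | (0 << 9) | (0b010 << 6) | (0b111 << 3) | 0b100;
  assert(AddForm == 0xD0BC);
  // Form 2: the dedicated "addi" encoding: fixed prefix 0b00000110, a 2-bit
  // size field (0b10 = long), then the data-register EA of %d0.
  // addi.l #imm, %d0 -> first word 0x0680, followed by the 32-bit immediate.
  uint16_t AddiForm = (0b00000110 << 8) | (0b10 << 6) | (0b000 << 3) | 0b000;
  assert(AddiForm == 0x0680);
  return 0;
}

The class below (MxBiArOp_R_RI) emits the dedicated *I form, while MxBiArOp_R_RI_xEA above emits the opmode form.
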
-class MxBiArOp_RFRI CMD> +class MxBiArOp_R_RI CMD> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.IOp:$opd), MN#"i."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))], - MxArithImmEncoding, !cast("MxEncSize"#TYPE.Size), - !cast("MxEncEA"#TYPE.RLet#"_0"), MxExtEmpty, - !cast("MxExtI"#TYPE.Size#"_2")>>; - -let mayLoad = 1 in -class MxBiArOp_RFRM CMD, MxEncEA EA, MxEncExt EXT> - : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, OPD:$opd), - MN#"."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, (TYPE.Load PAT:$opd)))], - MxArithEncoding, - !cast("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"), - MxBeadDReg<0>, EA, EXT>>; - + [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))]> { + let Inst = (ascend + (descend 0b0000, CMD, + !cast("MxNewEncSize"#TYPE.Size).Value, + // The destination cannot be address register, so it's always + // the MODE for data register direct mode. + /*MODE*/0b000, + /*REGISTER*/(operand "$dst", 3)), + // Source (i.e. immediate value) encoding + MxEncAddrMode_i<"opd", TYPE.Size>.Supplement + ); +} } // Constraints let mayLoad = 1, mayStore = 1 in { // FIXME MxBiArOp_FMR/FMI cannot consume CCR from MxAdd/MxSub which leads for // MxAdd to survive the match and subsequent mismatch. -class MxBiArOp_FMR CMD, MxEncEA EA, MxEncExt EXT> +class MxBiArOp_MR CMD, MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$opd), - MN#"."#TYPE.Prefix#"\t$opd, $dst", - [], - MxArithEncoding, - !cast("MxOpMode"#TYPE.Size#"EA"#TYPE.RLet), - MxBeadDReg<1>, EA, EXT>>; + MN#"."#TYPE.Prefix#"\t$opd, $dst", []> { + let Inst = (ascend + (descend CMD, (operand "$opd", 3), + !cast("MxOpMode"#TYPE.Size#"_EA_"#TYPE.RLet).Value, + DST_ENC.EA), + DST_ENC.Supplement + ); +} -class MxBiArOp_FMI CMD, MxEncEA MEMEA, MxEncExt MEMExt> +class MxBiArOp_MI CMD, MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$opd), - MN#"."#TYPE.Prefix#"\t$opd, $dst", - [], - MxArithImmEncoding, - !cast("MxEncSize"#TYPE.Size), - MEMEA, MEMExt, - !cast("MxExtI"#TYPE.Size#"_1")>>; + MN#"."#TYPE.Prefix#"\t$opd, $dst", []> { + let Inst = (ascend + (descend 0b0000, CMD, + !cast("MxNewEncSize"#TYPE.Size).Value, + DST_ENC.EA), + // Source (i.e. 
immediate value) encoding + MxEncAddrMode_i<"opd", TYPE.Size>.Supplement, + // Destination encoding + DST_ENC.Supplement + ); +} } // mayLoad, mayStore } // Defs = [CCR] multiclass MxBiArOp_DF CMD, bits<4> CMDI> { - // op $mem, $reg - def NAME#"8dk" : MxBiArOp_RFRM; - def NAME#"16dk" : MxBiArOp_RFRM; - def NAME#"32dk" : MxBiArOp_RFRM; - - def NAME#"8dq" : MxBiArOp_RFRM; - def NAME#"16dq" : MxBiArOp_RFRM; - def NAME#"32dq" : MxBiArOp_RFRM; - - def NAME#"8dp" : MxBiArOp_RFRM; - def NAME#"16dp" : MxBiArOp_RFRM; - def NAME#"32dp" : MxBiArOp_RFRM; - - def NAME#"8df" : MxBiArOp_RFRM; - def NAME#"16df" : MxBiArOp_RFRM; - def NAME#"32df" : MxBiArOp_RFRM; - - def NAME#"8dj" : MxBiArOp_RFRM; - def NAME#"16dj" : MxBiArOp_RFRM; - def NAME#"32dj" : MxBiArOp_RFRM; - - // op $imm, $reg - def NAME#"8di" : MxBiArOp_RFRI_xEA; - def NAME#"16di" : MxBiArOp_RFRI_xEA; - def NAME#"32di" : MxBiArOp_RFRI_xEA; - - // op $reg, $mem - def NAME#"8pd" : MxBiArOp_FMR; - def NAME#"16pd" : MxBiArOp_FMR; - def NAME#"32pd" : MxBiArOp_FMR; - - def NAME#"8fd" : MxBiArOp_FMR; - def NAME#"16fd" : MxBiArOp_FMR; - def NAME#"32fd" : MxBiArOp_FMR; - - def NAME#"8jd" : MxBiArOp_FMR; - def NAME#"16jd" : MxBiArOp_FMR; - def NAME#"32jd" : MxBiArOp_FMR; - - // op $imm, $mem - def NAME#"8pi" : MxBiArOp_FMI; - def NAME#"16pi" : MxBiArOp_FMI; - def NAME#"32pi" : MxBiArOp_FMI; - - def NAME#"8fi" : MxBiArOp_FMI; - def NAME#"16fi" : MxBiArOp_FMI; - def NAME#"32fi" : MxBiArOp_FMI; - - def NAME#"8ji" : MxBiArOp_FMI; - def NAME#"16ji" : MxBiArOp_FMI; - def NAME#"32ji" : MxBiArOp_FMI; - - def NAME#"16dr" : MxBiArOp_RFRR_xEA>; - def NAME#"32dr" : MxBiArOp_RFRR_xEA>; - - let isCommutable = isComm in { - - def NAME#"8dd" : MxBiArOp_RFRR_xEA>; - def NAME#"16dd" : MxBiArOp_RFRR_xEA>; - def NAME#"32dd" : MxBiArOp_RFRR_xEA>; - - } // isComm + foreach SZ = [8, 16, 32] in { + // op $mem, $reg + def NAME#SZ#"dk" : MxBiArOp_R_RM("MxType"#SZ#"d"), + !cast("MxType"#SZ).KOp, + !cast("MxType"#SZ).KPat, + CMD, MxEncAddrMode_k<"opd">>; + + def NAME#SZ#"dq" : MxBiArOp_R_RM("MxType"#SZ#"d"), + !cast("MxType"#SZ).QOp, + !cast("MxType"#SZ).QPat, + CMD, MxEncAddrMode_q<"opd">>; + + def NAME#SZ#"dp" : MxBiArOp_R_RM("MxType"#SZ#"d"), + !cast("MxType"#SZ).POp, + !cast("MxType"#SZ).PPat, + CMD, MxEncAddrMode_p<"opd">>; + + def NAME#SZ#"df" : MxBiArOp_R_RM("MxType"#SZ#"d"), + !cast("MxType"#SZ).FOp, + !cast("MxType"#SZ).FPat, + CMD, MxEncAddrMode_f<"opd">>; + + def NAME#SZ#"dj" : MxBiArOp_R_RM("MxType"#SZ#"d"), + !cast("MxType"#SZ).JOp, + !cast("MxType"#SZ).JPat, + CMD, MxEncAddrMode_j<"opd">>; + // op $imm, $reg + def NAME#SZ#"di" : MxBiArOp_R_RI_xEA("MxType"#SZ#"d"), + CMD>; + // op $reg, $mem + def NAME#SZ#"pd" : MxBiArOp_MR("MxType"#SZ#"d"), + !cast("MxType"#SZ).POp, + CMD, MxEncAddrMode_p<"dst">>; + + def NAME#SZ#"fd" : MxBiArOp_MR("MxType"#SZ#"d"), + !cast("MxType"#SZ).FOp, + CMD, MxEncAddrMode_f<"dst">>; + + def NAME#SZ#"jd" : MxBiArOp_MR("MxType"#SZ#"d"), + !cast("MxType"#SZ).JOp, + CMD, MxEncAddrMode_j<"dst">>; + // op $imm, $mem + def NAME#SZ#"pi" : MxBiArOp_MI("MxType"#SZ), + !cast("MxType"#SZ).POp, + CMDI, MxEncAddrMode_p<"dst">>; + + def NAME#SZ#"fi" : MxBiArOp_MI("MxType"#SZ), + !cast("MxType"#SZ).FOp, + CMDI, MxEncAddrMode_f<"dst">>; + + def NAME#SZ#"ji" : MxBiArOp_MI("MxType"#SZ), + !cast("MxType"#SZ).JOp, + CMDI, MxEncAddrMode_j<"dst">>; + // op $reg, $reg + let isCommutable = isComm in + def NAME#SZ#"dd" : MxBiArOp_R_RR_xEA("MxType"#SZ#"d"), + !cast("MxType"#SZ#"d"), + CMD>; + } // foreach SZ + + foreach SZ = [16, 32] in + def NAME#SZ#"dr" : 
MxBiArOp_R_RR_xEA("MxType"#SZ#"d"), + !cast("MxType"#SZ#"r"), + CMD>; } // MxBiArOp_DF @@ -284,25 +283,28 @@ multiclass MxBiArOp_DF CMD> { - def NAME#"32ak" : MxBiArOp_RFRM; - def NAME#"32aq" : MxBiArOp_RFRM; - def NAME#"32af" : MxBiArOp_RFRM; - def NAME#"32ap" : MxBiArOp_RFRM; - def NAME#"32aj" : MxBiArOp_RFRM; - def NAME#"32ai" : MxBiArOp_RFRI_xEA; - - def NAME#"32ar" : MxBiArOp_RFRR_xEA>; + def NAME#"32ak" : MxBiArOp_R_RM>; + def NAME#"32aq" : MxBiArOp_R_RM>; + def NAME#"32af" : MxBiArOp_R_RM>; + def NAME#"32ap" : MxBiArOp_R_RM>; + def NAME#"32aj" : MxBiArOp_R_RM>; + def NAME#"32ai" : MxBiArOp_R_RI_xEA; + + def NAME#"32ar" : MxBiArOp_R_RR_xEA; } // MxBiArOp_AF // NOTE These naturally produce CCR +//===----------------------------------------------------------------------===// +// Add/Sub +//===----------------------------------------------------------------------===// + defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>; defm ADD : MxBiArOp_AF<"adda", MxAdd, 0xD>; defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>; @@ -312,26 +314,42 @@ defm SUB : MxBiArOp_AF<"suba", MxSub, 0x9>; let Uses = [CCR], Defs = [CCR] in { let Constraints = "$src = $dst" in { +/// Encoding for Extended forms +/// ------------------------------------------------------ +/// F E D C | B A 9 | 8 | 7 6 | 5 4 | 3 | 2 1 0 +/// ------------------------------------------------------ +/// x x x x | REG Rx | 1 | SIZE | 0 0 | M | REG Ry +/// ------------------------------------------------------ +/// Rx - destination +/// Ry - source +/// M - address mode switch + // $reg, ccr <- $reg op $reg op ccr -class MxBiArOp_RFRRF CMD> +class MxBiArOp_R_RRX CMD> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd, CCR))], - MxArithXEncoding, - !cast("MxEncSize"#TYPE.Size), - MxBead1Bit<0>, MxBeadDReg<2>, MxBeadDReg<0>>>; - + [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd, CCR))]> { + let Inst = (descend CMD, + // Destination register + (operand "$dst", 3), + 0b1, + // SIZE + !cond(!eq(TYPE.Size, 8): 0b00, + !eq(TYPE.Size, 16): 0b01, + !eq(TYPE.Size, 32): 0b10), + 0b00, /*R/M*/0b0, + // Source register + (operand "$opd", 3) + ); +} } // Constraints } // Uses, Defs multiclass MxBiArOp_RFF CMD> { let isCommutable = isComm in { - - def NAME#"8dd" : MxBiArOp_RFRRF; - def NAME#"16dd" : MxBiArOp_RFRRF; - def NAME#"32dd" : MxBiArOp_RFRRF; - + foreach SZ = [8, 16, 32] in + def NAME#SZ#"dd" : MxBiArOp_R_RRX("MxType"#SZ#"d"), CMD>; } // isComm } // MxBiArOp_RFF @@ -349,19 +367,16 @@ defm AND : MxBiArOp_DF<"and", MxAnd, 1, 0xC, 0x2>; defm OR : MxBiArOp_DF<"or", MxOr, 1, 0x8, 0x0>; multiclass MxBiArOp_DF_EAd CMD, bits<4> CMDI> { - - let isCommutable = 1 in { - - def NAME#"8dd" : MxBiArOp_RFRR_EAd; - def NAME#"16dd" : MxBiArOp_RFRR_EAd; - def NAME#"32dd" : MxBiArOp_RFRR_EAd; - - } // isCommutable = 1 - - def NAME#"8di" : MxBiArOp_RFRI; - def NAME#"16di" : MxBiArOp_RFRI; - def NAME#"32di" : MxBiArOp_RFRI; - + foreach SZ = [8, 16, 32] in { + let isCommutable = 1 in + def NAME#SZ#"dd" : MxBiArOp_R_RR_EAd("MxType"#SZ#"d"), + CMD>; + + def NAME#SZ#"di" : MxBiArOp_R_RI("MxType"#SZ#"d"), + CMDI>; + } // foreach SZ } // MxBiArOp_DF_EAd defm XOR : MxBiArOp_DF_EAd<"eor", MxXor, 0xB, 0xA>; @@ -372,84 +387,112 @@ defm XOR : MxBiArOp_DF_EAd<"eor", MxXor, 0xB, 0xA>; //===----------------------------------------------------------------------===// let Defs = [CCR] in { -class MxCmp_RR> +class MxCmp_RR : MxInst<(outs), (ins LHS_TYPE.ROp:$lhs, 
RHS_TYPE.ROp:$rhs), "cmp."#RHS_TYPE.Prefix#"\t$lhs, $rhs", - [(set CCR, (MxCmp LHS_TYPE.VT:$lhs, RHS_TYPE.VT:$rhs))], - MxArithEncoding, - !cast("MxOpMode"#RHS_TYPE.Size#RHS_TYPE.RLet#"EA"), - REG, - !cast("MxEncEA"#LHS_TYPE.RLet#"_0"), - MxExtEmpty>>; + [(set CCR, (MxCmp LHS_TYPE.VT:$lhs, RHS_TYPE.VT:$rhs))]> { + let Inst = (descend 0b1011, + // REGISTER + (operand "$rhs", 3), + // OPMODE + !cast("MxOpMode"#RHS_TYPE.Size#"_"#RHS_TYPE.RLet#"_EA").Value, + // MODE without last bit + 0b00, + // REGISTER prefixed by D/A bit + (operand "$lhs", 4) + ); +} class MxCmp_RI : MxInst<(outs), (ins TYPE.IOp:$imm, TYPE.ROp:$reg), "cmpi."#TYPE.Prefix#"\t$imm, $reg", - [(set CCR, (MxCmp TYPE.IPat:$imm, TYPE.VT:$reg))], - MxArithImmEncoding, - !cast("MxEncSize"#TYPE.Size), - MxEncEAd_1, MxExtEmpty, - !cast("MxExtI"#TYPE.Size#"_0")>>; + [(set CCR, (MxCmp TYPE.IPat:$imm, TYPE.VT:$reg))]> { + let Inst = (ascend + (descend 0b00001100, + !cast("MxNewEncSize"#TYPE.Size).Value, + // The destination cannot be address register, so it's always + // the MODE for data register direct mode. + /*MODE*/0b000, + /*REGISTER*/(operand "$reg", 3)), + // Source (i.e. immediate value) encoding + MxEncAddrMode_i<"imm", TYPE.Size>.Supplement + ); +} let mayLoad = 1 in { class MxCmp_MI + MxEncMemOp MEM_ENC> : MxInst<(outs), (ins TYPE.IOp:$imm, MEMOpd:$mem), "cmpi."#TYPE.Prefix#"\t$imm, $mem", - [(set CCR, (MxCmp TYPE.IPat:$imm, (load MEMPat:$mem)))], - MxArithImmEncoding, - !cast("MxEncSize"#TYPE.Size), - EA, EXT, - !cast("MxExtI"#TYPE.Size#"_0")>>; + [(set CCR, (MxCmp TYPE.IPat:$imm, (load MEMPat:$mem)))]> { + let Inst = (ascend + (descend 0b00001100, + !cast("MxNewEncSize"#TYPE.Size).Value, + MEM_ENC.EA), + // Source (i.e. immediate value) encoding + MxEncAddrMode_i<"imm", TYPE.Size>.Supplement, + // Destination (i.e. memory operand) encoding + MEM_ENC.Supplement + ); +} +// FIXME: What about abs.W? class MxCmp_BI : MxInst<(outs), (ins TYPE.IOp:$imm, MxAL32:$abs), "cmpi."#TYPE.Prefix#"\t$imm, $abs", [(set CCR, (MxCmp TYPE.IPat:$imm, - (load (i32 (MxWrapper tglobaladdr:$abs)))))], - MxArithImmEncoding, - !cast("MxEncSize"#TYPE.Size), - MxEncEAb, MxExtI32_1, - !cast("MxExtI"#TYPE.Size#"_0")>>; + (load (i32 (MxWrapper tglobaladdr:$abs)))))]> { + defvar AbsEncoding = MxEncAddrMode_abs<"abs", true>; + let Inst = (ascend + (descend 0b00001100, + !cast("MxNewEncSize"#TYPE.Size).Value, + AbsEncoding.EA), + // Source (i.e. immediate value) encoding + MxEncAddrMode_i<"imm", TYPE.Size>.Supplement, + // Destination (i.e. 
memory operand) encoding + AbsEncoding.Supplement + ); +} class MxCmp_RM + MxEncMemOp MEM_ENC> : MxInst<(outs), (ins TYPE.ROp:$reg, MEMOpd:$mem), "cmp."#TYPE.Prefix#"\t$mem, $reg", - [(set CCR, (MxCmp (load MEMPat:$mem), TYPE.ROp:$reg))], - MxArithEncoding, - !cast("MxOpMode"#TYPE.Size#"dEA"), - MxBeadDReg<0>, EA, EXT>>; + [(set CCR, (MxCmp (load MEMPat:$mem), TYPE.ROp:$reg))]> { + let Inst = (ascend + (descend 0b1011, + // REGISTER + (operand "$reg", 3), + // OPMODE + !cast("MxOpMode"#TYPE.Size#"_d_EA").Value, + MEM_ENC.EA), + MEM_ENC.Supplement + ); +} } // let mayLoad = 1 } // let Defs = [CCR] multiclass MMxCmp_RM { - def NAME#TYPE.KOp.Letter : MxCmp_RM; - def NAME#TYPE.QOp.Letter : MxCmp_RM; - def NAME#TYPE.POp.Letter : MxCmp_RM; - def NAME#TYPE.FOp.Letter : MxCmp_RM; - def NAME#TYPE.JOp.Letter : MxCmp_RM; + def NAME#TYPE.KOp.Letter : MxCmp_RM>; + def NAME#TYPE.QOp.Letter : MxCmp_RM>; + def NAME#TYPE.POp.Letter : MxCmp_RM>; + def NAME#TYPE.FOp.Letter : MxCmp_RM>; + def NAME#TYPE.JOp.Letter : MxCmp_RM>; } multiclass MMxCmp_MI { - def NAME#TYPE.KOp.Letter#"i" : MxCmp_MI; - def NAME#TYPE.QOp.Letter#"i" : MxCmp_MI; - def NAME#TYPE.POp.Letter#"i" : MxCmp_MI; - def NAME#TYPE.FOp.Letter#"i" : MxCmp_MI; - def NAME#TYPE.JOp.Letter#"i" : MxCmp_MI; + def NAME#TYPE.KOp.Letter#"i" : MxCmp_MI>; + def NAME#TYPE.QOp.Letter#"i" : MxCmp_MI>; + def NAME#TYPE.POp.Letter#"i" : MxCmp_MI>; + def NAME#TYPE.FOp.Letter#"i" : MxCmp_MI>; + def NAME#TYPE.JOp.Letter#"i" : MxCmp_MI>; } foreach S = [8, 16, 32] in { @@ -478,25 +521,31 @@ defm CMP32 : MMxCmp_MI; // EXT //===----------------------------------------------------------------------===// -def MxExtOpmode_wb : MxBead3Bits<0b010>; -def MxExtOpmode_lw : MxBead3Bits<0b011>; -def MxExtOpmode_lb : MxBead3Bits<0b111>; - /// --------------------------------------------------- /// F E D C B A 9 | 8 7 6 | 5 4 3 | 2 1 0 /// --------------------------------------------------- /// 0 1 0 0 1 0 0 | OPMODE | 0 0 0 | REG /// --------------------------------------------------- -class MxExtEncoding - : MxEncoding, MxBead3Bits<0b000>, OPMODE, - MxBead3Bits<0b100>, MxBead4Bits<0b0100>>; - let Defs = [CCR] in let Constraints = "$src = $dst" in class MxExt : MxInst<(outs TO.ROp:$dst), (ins TO.ROp:$src), - "ext."#TO.Prefix#"\t$src", [], - MxExtEncoding("MxExtOpmode_"#TO.Prefix#FROM.Prefix)>>; + "ext."#TO.Prefix#"\t$src", []> { + let Inst = (descend 0b0100100, + // OPMODE + !cond( + // byte -> word + !and(!eq(FROM.Size, 8), !eq(TO.Size, 16)): 0b010, + // word -> long + !and(!eq(FROM.Size, 16), !eq(TO.Size, 32)): 0b011, + // byte -> long + !and(!eq(FROM.Size, 8), !eq(TO.Size, 32)): 0b111 + ), + 0b000, + // REGISTER + (operand "$src", 3) + ); +} def EXT16 : MxExt; def EXT32 : MxExt; @@ -511,9 +560,6 @@ def : Pat<(sext_inreg i32:$src, i8), // DIV/MUL //===----------------------------------------------------------------------===// -def MxSDiMuOpmode : MxBead3Bits<0b111>; -def MxUDiMuOpmode : MxBead3Bits<0b011>; - /// Word operation: /// ---------------------------------------------------- /// F E D C | B A 9 | 8 7 6 | 5 4 3 | 2 1 0 @@ -521,40 +567,45 @@ def MxUDiMuOpmode : MxBead3Bits<0b011>; /// | | | EFFECTIVE ADDRESS /// x x x x | REG | OP MODE | MODE | REG /// ---------------------------------------------------- -class MxDiMuEncoding - : MxEncoding, CMD, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; - let Defs = [CCR] in { let Constraints = "$src = $dst" in { -// $reg <- $reg op $reg -class MxDiMuOp_DD CMD, MxBead3Bits OPMODE, +// $dreg <- $dreg op $dreg +class MxDiMuOp_DD CMD, 
bit SIGNED = false, MxOperand DST, MxOperand OPD> - : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", [], - MxDiMuEncoding, OPMODE, MxEncEAd_2, MxExtEmpty>>; + : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", []> { + let Inst = (descend CMD, + // REGISTER + (operand "$dst", 3), + !if(SIGNED, 0b111, 0b011), + /*MODE*/0b000, /*REGISTER*/(operand "$opd", 3) + ); +} // $reg <- $reg op $imm -class MxDiMuOp_DI CMD, MxBead3Bits OPMODE, +class MxDiMuOp_DI CMD, bit SIGNED = false, MxOperand DST, MxOperand OPD> - : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", [], - MxDiMuEncoding, OPMODE, MxEncEAi, MxExtI16_2>>; + : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", []> { + // FIXME: Support immediates with different widths. + defvar ImmEnc = MxEncAddrMode_i<"opd", 16>; + let Inst = (ascend + (descend CMD, + // REGISTER + (operand "$dst", 3), + !if(SIGNED, 0b111, 0b011), ImmEnc.EA), + ImmEnc.Supplement + ); +} } // let Constraints } // Defs = [CCR] multiclass MxDiMuOp CMD, bit isComm = 0> { - let isCommutable = isComm in { - def "S"#NAME#"d32d16" : MxDiMuOp_DD; - def "U"#NAME#"d32d16" : MxDiMuOp_DD; + def "S"#NAME#"d32d16" : MxDiMuOp_DD; + def "U"#NAME#"d32d16" : MxDiMuOp_DD; } - def "S"#NAME#"d32i16" : MxDiMuOp_DI; - def "U"#NAME#"d32i16" : MxDiMuOp_DI; - + def "S"#NAME#"d32i16" : MxDiMuOp_DI; + def "U"#NAME#"d32i16" : MxDiMuOp_DI; } defm DIV : MxDiMuOp<"div", 0x8>; @@ -697,29 +748,35 @@ def : Pat<(mulhu i16:$dst, MximmSExt16:$opd), /// | | | EFFECTIVE ADDRESS /// 0 1 0 0 | x x x x | SIZE | MODE | REG /// ------------+------------+------+---------+--------- -class MxNEGEncoding - : MxEncoding, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; - let Defs = [CCR] in { let Constraints = "$src = $dst" in { class MxNeg_D : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src), "neg."#TYPE.Prefix#"\t$dst", - [(set TYPE.VT:$dst, (ineg TYPE.VT:$src))], - MxNEGEncoding, - !cast("MxEncSize"#TYPE.Size), - MxEncEAd_0, MxExtEmpty>>; + [(set TYPE.VT:$dst, (ineg TYPE.VT:$src))]> { + let Inst = (descend 0b01000100, + /*SIZE*/!cast("MxNewEncSize"#TYPE.Size).Value, + //MODE without last bit + 0b00, + //REGISTER prefixed by D/A bit + (operand "$dst", 4) + ); +} let Uses = [CCR] in { class MxNegX_D : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src), "negx."#TYPE.Prefix#"\t$dst", - [(set TYPE.VT:$dst, (MxSubX 0, TYPE.VT:$src, CCR))], - MxNEGEncoding, - !cast("MxEncSize"#TYPE.Size), - MxEncEAd_0, MxExtEmpty>>; + [(set TYPE.VT:$dst, (MxSubX 0, TYPE.VT:$src, CCR))]> { + let Inst = (descend 0b01000000, + /*SIZE*/!cast("MxNewEncSize"#TYPE.Size).Value, + //MODE without last bit + 0b00, + //REGISTER prefixed by D/A bit + (operand "$dst", 4) + ); +} } } // let Constraints diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td index 0d1278102378..abd2ab3cf012 100644 --- a/llvm/lib/Target/M68k/M68kInstrBits.td +++ b/llvm/lib/Target/M68k/M68kInstrBits.td @@ -32,9 +32,15 @@ /// ------------+---------+---------+---------+--------- /// 0 0 0 0 | REG | 1 0 0 | MODE | REG /// ------------+---------+---------+---------+--------- -class MxBTSTEnc_R - : MxEncoding, REG, MxBead4Bits<0b0000>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; +class MxBTSTEnc_R { + dag Value = (ascend + (descend 0b0000, + (operand "$"#bitno_name, 3), + 0b100, dst_enc.EA + ), + dst_enc.Supplement + ); +} /// -------------------------------+---------+--------- /// F E D C B A 9 8 . 
7 6 | 5 4 3 | 2 1 0 @@ -43,33 +49,40 @@ class MxBTSTEnc_R /// ------------------------+------+---------+--------- /// 0 0 0 0 0 0 0 0 | BIT NUMBER /// ------------------------+-------------------------- -class MxBTSTEnc_I - : MxEncoding, - MxBead4Bits<0b1000>, MxBead4Bits<0b0000>, IMM, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; +class MxBTSTEnc_I { + dag Value = (ascend + (descend 0b0000100000, dst_enc.EA), + (descend 0b00000000, (operand "$"#bitno_name, 8)), + dst_enc.Supplement + ); +} let Defs = [CCR] in { class MxBTST_RR : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))], - MxBTSTEnc_R, MxEncEAd_0, MxExtEmpty>>; + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))]> { + let Inst = MxBTSTEnc_R, "bitno">.Value; +} class MxBTST_RI : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))], - MxBTSTEnc_I, MxEncEAd_0, MxExtEmpty>>; + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))]> { + let Inst = MxBTSTEnc_I, "bitno">.Value; +} class MxBTST_MR + MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))], - MxBTSTEnc_R, EA, EXT>>; + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))]> { + let Inst = MxBTSTEnc_R.Value; +} class MxBTST_MI + MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))], - MxBTSTEnc_I, EA, EXT>>; + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))]> { + let Inst = MxBTSTEnc_I.Value; +} } // Defs = [CCR] // Register BTST limited to 32 bits only @@ -78,31 +91,31 @@ def BTST32di : MxBTST_RI; // Memory BTST limited to 8 bits only def BTST8jd : MxBTST_MR; + MxEncAddrMode_j<"dst">>; def BTST8od : MxBTST_MR; + MxEncAddrMode_o<"dst">>; def BTST8ed : MxBTST_MR; + MxEncAddrMode_e<"dst">>; def BTST8pd : MxBTST_MR; + MxEncAddrMode_p<"dst">>; def BTST8fd : MxBTST_MR; + MxEncAddrMode_f<"dst">>; def BTST8qd : MxBTST_MR; + MxEncAddrMode_q<"dst">>; def BTST8kd : MxBTST_MR; + MxEncAddrMode_k<"dst">>; def BTST8ji : MxBTST_MI; + MxEncAddrMode_j<"dst">>; def BTST8oi : MxBTST_MI; + MxEncAddrMode_o<"dst">>; def BTST8ei : MxBTST_MI; + MxEncAddrMode_e<"dst">>; def BTST8pi : MxBTST_MI; + MxEncAddrMode_p<"dst">>; def BTST8fi : MxBTST_MI; + MxEncAddrMode_f<"dst">>; def BTST8qi : MxBTST_MI; + MxEncAddrMode_q<"dst">>; def BTST8ki : MxBTST_MI; + MxEncAddrMode_k<"dst">>; diff --git a/llvm/lib/Target/M68k/M68kInstrControl.td b/llvm/lib/Target/M68k/M68kInstrControl.td index be9045b6e0d2..d15283c769f6 100644 --- a/llvm/lib/Target/M68k/M68kInstrControl.td +++ b/llvm/lib/Target/M68k/M68kInstrControl.td @@ -12,10 +12,10 @@ /// /// Machine: /// -/// BRA [x] BSR [ ] Bcc [ ] DBcc [ ] FBcc [ ] +/// BRA [x] BSR [ ] Bcc [~] DBcc [ ] FBcc [ ] /// FDBcc [ ] FNOP [ ] FPn [ ] FScc [ ] FTST [ ] /// JMP [~] JSR [x] NOP [x] RTD [!] 
RTR [ ] -/// RTS [x] Scc [x] TST [ ] +/// RTS [x] Scc [~] TST [ ] /// /// Pseudo: /// @@ -43,7 +43,9 @@ //===----------------------------------------------------------------------===// let hasSideEffects = 0 in { - def NOP : MxInst<(outs), (ins), "nop", [], MxEncFixed<0x4E71>>; + def NOP : MxInst<(outs), (ins), "nop", []> { + let Inst = (descend 0b0100, 0b1110, 0b0111, 0b0001); + } } @@ -61,51 +63,60 @@ let hasSideEffects = 0 in { /// NE—Not equal VS—Overflow set /// /// *Not applicable to the Bcc instructions. -def MxCCt : MxBead4Bits<0b0000>; -def MxCCf : MxBead4Bits<0b0001>; -def MxCChi : MxBead4Bits<0b0010>; -def MxCCls : MxBead4Bits<0b0011>; -def MxCCcc : MxBead4Bits<0b0100>; -def MxCCcs : MxBead4Bits<0b0101>; -def MxCCne : MxBead4Bits<0b0110>; -def MxCCeq : MxBead4Bits<0b0111>; -def MxCCvc : MxBead4Bits<0b1000>; -def MxCCvs : MxBead4Bits<0b1001>; -def MxCCpl : MxBead4Bits<0b1010>; -def MxCCmi : MxBead4Bits<0b1011>; -def MxCCge : MxBead4Bits<0b1100>; -def MxCClt : MxBead4Bits<0b1101>; -def MxCCgt : MxBead4Bits<0b1110>; -def MxCCle : MxBead4Bits<0b1111>; +class MxEncCondOp cond> { + dag Value = (descend cond); +} + +def MxCCt : MxEncCondOp<0b0000>; +def MxCCf : MxEncCondOp<0b0001>; +def MxCChi : MxEncCondOp<0b0010>; +def MxCCls : MxEncCondOp<0b0011>; +def MxCCcc : MxEncCondOp<0b0100>; +def MxCCcs : MxEncCondOp<0b0101>; +def MxCCne : MxEncCondOp<0b0110>; +def MxCCeq : MxEncCondOp<0b0111>; +def MxCCvc : MxEncCondOp<0b1000>; +def MxCCvs : MxEncCondOp<0b1001>; +def MxCCpl : MxEncCondOp<0b1010>; +def MxCCmi : MxEncCondOp<0b1011>; +def MxCCge : MxEncCondOp<0b1100>; +def MxCClt : MxEncCondOp<0b1101>; +def MxCCgt : MxEncCondOp<0b1110>; +def MxCCle : MxEncCondOp<0b1111>; + + /// --------------------------------+---------+--------- /// F E D C | B A 9 8 | 7 6 | 5 4 3 | 2 1 0 /// --------------------------------+---------+--------- /// 0 1 0 1 | CONDITION | 1 1 | MODE | REG /// ---------------------------------------------------- -class MxSccEncoding - : MxEncoding, CC, MxBead4Bits<0b0101>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; let Uses = [CCR] in { class MxSccR : MxInst<(outs MxDRD8:$dst), (ins), "s"#CC#"\t$dst", - [(set i8:$dst, (MxSetCC !cast("MxCOND"#CC), CCR))], - MxSccEncoding("MxCC"#CC)>>; + [(set i8:$dst, (MxSetCC !cast("MxCOND"#CC), CCR))]> { + let Inst = (descend 0b0101, !cast("MxCC"#CC).Value, 0b11, + /*MODE without last bit*/0b00, + /*REGISTER prefixed with D/A bit*/(operand "$dst", 4)); +} -class MxSccM +class MxSccM : MxInst<(outs), (ins MEMOpd:$dst), "s"#CC#"\t$dst", - [(store (MxSetCC !cast("MxCOND"#CC), CCR), MEMPat:$dst)], - MxSccEncoding("MxCC"#CC)>>; + [(store (MxSetCC !cast("MxCOND"#CC), CCR), MEMPat:$dst)]> { + let Inst = + (ascend + (descend 0b0101, !cast("MxCC"#CC).Value, 0b11, DST_ENC.EA), + DST_ENC.Supplement + ); +} } foreach cc = [ "cc", "ls", "lt", "eq", "mi", "f", "ne", "ge", "cs", "pl", "gt", "t", "hi", "vc", "le", "vs"] in { def SET#"d8"#cc : MxSccR; -def SET#"j8"#cc : MxSccM; -def SET#"p8"#cc : MxSccM; +def SET#"j8"#cc : MxSccM>; +def SET#"p8"#cc : MxSccM>; } //===----------------------------------------------------------------------===// @@ -118,13 +129,16 @@ def SET#"p8"#cc : MxSccM; /// 0 1 0 0 1 1 1 0 1 1 | MODE | REG ///------------------------------+---------+--------- let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in -class MxJMP - : MxInst<(outs), (ins LOCOp:$dst), "jmp\t$dst", [(brind iPTR:$dst)], - MxEncoding, - MxBead4Bits<0b1110>, MxBead4Bits<0b0100>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; +class MxJMP + 
: MxInst<(outs), (ins LOCOp:$dst), "jmp\t$dst", [(brind iPTR:$dst)]> { + let Inst = + (ascend + (descend 0b0100, 0b1110, 0b11, DST_ENC.EA), + DST_ENC.Supplement + ); +} -def JMP32j : MxJMP; +def JMP32j : MxJMP>; // FIXME Support 16 bit indirect jump. @@ -147,20 +161,35 @@ def JMP32j : MxJMP; /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// -------------------------------------------------- let isBranch = 1, isTerminator = 1, Uses = [CCR] in -class MxBcc - : MxInst<(outs), (ins TARGET:$dst), "b"#cc#"\t$dst", [], ENC>; +class MxBcc + : MxInst<(outs), (ins TARGET:$dst), "b"#cc#"\t$dst", []> { + // FIXME: If we want to avoid supplying disp_16_32 with empty + // (ascend) for 16/32 bits variants, we can use conditional + // bang operator like this: + // ``` + // class MxBcc + // ... + // let Inst = !cond( + // !eq(SIZE, 8): /* encoding for Bcc8 */ + // !eq(SIZE, 16): /* encoding for Bcc16 */ + // !eq(SIZE, 32): /* encoding for Bcc32 */ + // ); + let Inst = + (ascend + (descend 0b0110, !cast("MxCC"#cc).Value, disp_8), + disp_16_32 + ); +} foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge", "cs", "pl", "gt", "hi", "vc", "le", "vs"] in { def B#cc#"8" : MxBcc, - !cast("MxCC"#cc), MxBead4Bits<0x6>>>; + (operand "$dst", 8, (encoder "encodePCRelImm<8>")), (ascend)>; + def B#cc#"16" - : MxBcc, - MxBead4Bits<0x0>, !cast("MxCC"#cc), - MxBead4Bits<0x6>, MxBead16Imm<0>>>; + : MxBcc"))>; } foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge", @@ -178,17 +207,21 @@ def : Pat<(MxBrCond bb:$target, !cast("MxCOND"#cc), CCR), /// ------------------------------------------------- /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// ------------------------------------------------- -let isBranch = 1, isTerminator = 1, isBarrier=1 in -class MxBra - : MxInst<(outs), (ins TARGET:$dst), "bra\t$dst", [], ENC>; +let isBranch = 1, isTerminator = 1, isBarrier = 1 in +class MxBra + : MxInst<(outs), (ins TARGET:$dst), "bra\t$dst", []> { + let Inst = + (ascend + (descend 0b0110, 0b0000, disp_8), + disp_16_32 + ); +} def BRA8 : MxBra, MxBead4Bits<0x0>, - MxBead4Bits<0x6>>>; -def BRA16 : MxBra, MxBead4Bits<0x0>, - MxBead4Bits<0x0>, MxBead4Bits<0x6>, - MxBead16Imm<0>>>; + (operand "$dst", 8, (encoder "encodePCRelImm<8>")), (ascend)>; + +def BRA16 : MxBra"))>; def : Pat<(br bb:$target), (BRA8 MxBrTarget8:$target)>; @@ -208,16 +241,19 @@ let isCall = 1 in ///------------------------------+---------+--------- /// 0 1 0 0 1 1 1 0 1 0 | MODE | REG ///------------------------------+---------+--------- -class MxCall - : MxInst<(outs), (ins LOCOp:$dst), "jsr\t$dst", [], - MxEncoding, - MxBead4Bits<0b1110>, MxBead4Bits<0b0100>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; +class MxCall + : MxInst<(outs), (ins LOCOp:$dst), "jsr\t$dst", []> { + let Inst = + (ascend + (descend 0b0100, 0b1110, 0b10, DST_ENC.EA), + DST_ENC.Supplement + ); +} -def CALLk : MxCall; -def CALLq : MxCall; -def CALLb : MxCall; -def CALLj : MxCall; +def CALLk : MxCall>; +def CALLq : MxCall>; +def CALLb : MxCall>; +def CALLj : MxCall>; multiclass CallPat { let Predicates = [pred] in { @@ -261,7 +297,9 @@ def TAILJMPj : MxPseudo<(outs), (ins MxARI32_TC:$dst)>; let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { -def RTS : MxInst<(outs), (ins), "rts", [], MxEncFixed<0x4E75>>; +def RTS : MxInst<(outs), (ins), "rts", []> { + let Inst = (descend 0b0100, 0b1110, 0b0111, 0b0101); +} let isCodeGenOnly = 1 in def RET : MxPseudo<(outs), (ins i32imm:$adj, variable_ops), diff --git a/llvm/lib/Target/M68k/M68kInstrData.td 
b/llvm/lib/Target/M68k/M68kInstrData.td index 3dd5d9f8c7ac..863432b94005 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -42,290 +42,192 @@ /// ----------------------------------------------------- /// /// NOTE Move requires EA X version for direct register destination(0) -class MxMoveEncoding - : MxEncoding, - srcExt.Imm, srcExt.B8, srcExt.Scale, srcExt.WL, srcExt.DAReg, - dstExt.Imm, dstExt.B8, dstExt.Scale, dstExt.WL, dstExt.DAReg>; - -/// MOVE has alternate size encoding -class MxMoveSize value> : MxBead2Bits; + +// MOVE has a different size encoding. +class MxMoveSize value> { + bits<2> Value = value; +} def MxMoveSize8 : MxMoveSize<0b01>; def MxMoveSize16 : MxMoveSize<0b11>; def MxMoveSize32 : MxMoveSize<0b10>; -let Defs = [CCR] in -class MxMove pattern, MxEncoding enc> - : MxInst; - -class MxMove_RR - : MxMove; - -let mayStore = 1 in { -class MxMove_MR - : MxMove; - -class MxMove_MI - : MxMove; -} // let mayStore = 1 - -class MxMove_RI - : MxMove; - - -let mayLoad = 1 in -class MxMove_RM - : MxMove>; - -multiclass MMxMove_RM { - - // REG <- (An)+ - def NAME#REG.OOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- -(An) - def NAME#REG.EOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (i,PC,Xn) - def NAME#REG.KOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (i,PC) - def NAME#REG.QOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (i,An,Xn) - def NAME#REG.FOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (i,An) - def NAME#REG.POp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (ABS) - def NAME#REG.BOp.Letter#REG.Postfix : MxMove_RM; - - // REG <- (An) - def NAME#REG.JOp.Letter#REG.Postfix : MxMove_RM; +class MxMoveEncoding { + dag Value = (ascend + (descend 0b00, size.Value, + !cond( + !eq(!getdagop(dst_enc.EA), descend): !setdagop(dst_enc.EA, ascend), + !eq(!getdagop(dst_enc.EA), ascend): !setdagop(dst_enc.EA, descend)), + src_enc.EA), + // Source extension + src_enc.Supplement, + // Destination extension + dst_enc.Supplement + ); } -let mayLoad = 1, mayStore = 1 in { -class MxMove_MM - : MxMove>; -} // let mayLoad = 1, mayStore = 1 - -multiclass MMxMove_MM { - - // MEM <- (An)+ - def NAME#TYPE.OOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- -(An) - def NAME#TYPE.EOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (i,An) - def NAME#TYPE.POp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (i,An,Xn) - def NAME#TYPE.FOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (i,PC,Xn) - def NAME#TYPE.KOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (i,PC) - def NAME#TYPE.QOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (ABS) - def NAME#TYPE.BOp.Letter#TYPE.Postfix - : MxMove_MM; - - // MEM <- (An) - def NAME#TYPE.JOp.Letter#TYPE.Postfix - : MxMove_MM; +// Special encoding for Xn +class MxMoveEncAddrMode_r : MxEncMemOp { + let EA = (descend (descend 0b00, (slice "$"#reg_opnd, 3, 3)), + (operand "$"#reg_opnd, 3)); } -def MOV8dd - : MxMove_RR>; +// TODO: Generalize and adopt this utility in other .td files as well. 
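
For readers decoding MxMoveEncoding above: MOVE is irregular in two ways. The size field is 0b01 for byte, 0b11 for word, and 0b10 for long, and the destination EA is emitted register-first rather than mode-first, which is what the !getdagop/!setdagop dance accomplishes declaratively by reversing the field order of the destination's EA dag. A minimal C++ sketch, not part of the patch (helper name invented; the expected word is the standard encoding of move.l %d1, (%a0)):

#include <cassert>
#include <cstdint>

// MOVE word: 00 | size(2) | dst REGISTER(3) | dst MODE(3) |
//            src MODE(3) | src REGISTER(3).
static uint16_t packMoveWord(unsigned Size, unsigned DstReg, unsigned DstMode,
                             unsigned SrcMode, unsigned SrcReg) {
  return (Size << 12) | (DstReg << 9) | (DstMode << 6) | (SrcMode << 3) | SrcReg;
}

int main() {
  // move.l %d1, (%a0): size 0b10 (long), destination (%a0) is mode 0b010,
  // register 0; source %d1 is mode 0b000, register 1.
  assert(packMoveWord(0b10, 0, 0b010, 0b000, 1) == 0x2081);
  return 0;
}
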
+multiclass MxMoveOperandEncodings { + // Dn + def MxMove#NAME#OpEnc_d : MxEncAddrMode_d; + // An + def MxMove#NAME#OpEnc_a : MxEncAddrMode_a; + // Xn + def MxMove#NAME#OpEnc_r : MxMoveEncAddrMode_r; + // (An)+ + def MxMove#NAME#OpEnc_o : MxEncAddrMode_o; + // -(An) + def MxMove#NAME#OpEnc_e : MxEncAddrMode_e; + // (i,PC,Xn) + def MxMove#NAME#OpEnc_k : MxEncAddrMode_k; + // (i,PC) + def MxMove#NAME#OpEnc_q : MxEncAddrMode_q; + // (i,An,Xn) + def MxMove#NAME#OpEnc_f : MxEncAddrMode_f; + // (i,An) + def MxMove#NAME#OpEnc_p : MxEncAddrMode_p; + // (ABS).L + def MxMove#NAME#OpEnc_b : MxEncAddrMode_abs; + // (An) + def MxMove#NAME#OpEnc_j : MxEncAddrMode_j; +} -// M <- R -def MOV8fd : MxMove_MR>; +defm Src : MxMoveOperandEncodings<"src">; +defm Dst : MxMoveOperandEncodings<"dst">; -def MOV8pd : MxMove_MR>; +defvar MxMoveSupportedAMs = ["o", "e", "k", "q", "f", "p", "b", "j"]; -def MOV8ed : MxMove_MR>; +let Defs = [CCR] in +class MxMove pattern, MxMoveEncoding enc> + : MxInst { + let Inst = enc.Value; +} -def MOV8od : MxMove_MR>; +// R <- R +class MxMove_RR("MxOp"#TYPE.Size#"AddrMode_"#DST_REG), + MxOpBundle SRC = !cast("MxOp"#TYPE.Size#"AddrMode_"#SRC_REG)> + : MxMove; -def MOV8bd : MxMove_MR>; +foreach DST_REG = ["r", "a"] in { + foreach SRC_REG = ["r", "a"] in + foreach TYPE = [MxType16, MxType32] in + def MOV # TYPE.Size # DST_REG # SRC_REG # TYPE.Postfix + : MxMove_RR("MxMoveSize"#TYPE.Size), + !cast("MxMoveDstOpEnc_"#DST_REG), + !cast("MxMoveSrcOpEnc_"#SRC_REG)>>; +} // foreach DST_REG +foreach TYPE = [MxType8, MxType16, MxType32] in +def MOV # TYPE.Size # dd # TYPE.Postfix + : MxMove_RR("MxMoveSize"#TYPE.Size), + MxMoveDstOpEnc_d, MxMoveSrcOpEnc_d>>; -def MOV8jd : MxMove_MR>; +// M <- R +let mayStore = 1 in { +class MxMove_MR("MxOp"#TYPE.Size#"AddrMode_"#SRC_REG)> + : MxMove; + +class MxMove_MI("MxOp"#TYPE.Size#"AddrMode_i")> + : MxMove; +} // let mayStore = 1 +foreach REG = ["r", "a", "d"] in +foreach AM = MxMoveSupportedAMs in { + foreach TYPE = !if(!eq(REG, "d"), [MxType8, MxType16, MxType32], [MxType16, MxType32]) in + def MOV # TYPE.Size # AM # REG # TYPE.Postfix + : MxMove_MR("MxOp"#TYPE.Size#"AddrMode_"#AM), REG, + MxMoveEncoding("MxMoveSize"#TYPE.Size), + !cast("MxMoveDstOpEnc_"#AM), + !cast("MxMoveSrcOpEnc_"#REG)>>; +} // foreach AM + +foreach AM = MxMoveSupportedAMs in { + foreach TYPE = [MxType8, MxType16, MxType32] in + def MOV # TYPE.Size # AM # i # TYPE.Postfix + : MxMove_MI("MxOp"#TYPE.Size#"AddrMode_"#AM), + MxMoveEncoding("MxMoveSize"#TYPE.Size), + !cast("MxMoveDstOpEnc_"#AM), + MxEncAddrMode_i<"src", TYPE.Size>>>; +} // foreach AM // R <- I -def MOV8di : MxMove_RI>; - -foreach S = [16, 32] in { - foreach D = [ "r", "a" ] in { - - foreach O = [ "r", "a" ] in { - def MOV#S#D#O : MxMove_RR< - !cast("MxType"#S#D), - !cast("MxType"#S#O), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - !cast("MxEncEA"#D#"_0_reflected"), MxExtEmpty>>; - } - - // M <- R - def MOV#S#"f"#D : MxMove_MR< - !cast("MxType"#S).FOp, - !cast("MxType"#S).FPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAf_0, MxExtBrief_0>>; - - def MOV#S#"p"#D : MxMove_MR< - !cast("MxType"#S).POp, - !cast("MxType"#S).PPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAp_0, MxExtI16_0>>; - - def MOV#S#"e"#D : MxMove_MR< - !cast("MxType"#S).EOp, - !cast("MxType"#S).EPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAe_0, 
MxExtEmpty>>; - - def MOV#S#"o"#D : MxMove_MR< - !cast("MxType"#S).OOp, - !cast("MxType"#S).OPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAo_0, MxExtEmpty>>; - - def MOV#S#"b"#D : MxMove_MR< - !cast("MxType"#S).BOp, - !cast("MxType"#S).BPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAb, MxExtI32_0>>; - - def MOV#S#"j"#D : MxMove_MR< - !cast("MxType"#S).JOp, - !cast("MxType"#S).JPat, - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - !cast("MxEncEA"#D#"_1"), MxExtEmpty, - MxEncEAj_0, MxExtEmpty>>; - - - // R <- I - def MOV#S#D#"i" : MxMove_RI< - !cast("MxType"#S#D), - MxMoveEncoding("MxMoveSize"#S), - MxEncEAi, !cast("MxExtI"#S#"_1"), - !cast("MxEncEA"#D#"_0_reflected"), MxExtEmpty>>; - } -} +class MxMove_RI("MxOp"#TYPE.Size#"AddrMode_i"), + MxOpBundle DST = !cast("MxOp"#TYPE.Size#"AddrMode_"#DST_REG)> + : MxMove; + +foreach REG = ["r", "a", "d"] in { + foreach TYPE = !if(!eq(REG, "d"), [MxType8, MxType16, MxType32], [MxType16, MxType32]) in + def MOV # TYPE.Size # REG # i # TYPE.Postfix + : MxMove_RI("MxMoveSize"#TYPE.Size), + !cast("MxMoveDstOpEnc_"#REG), + MxEncAddrMode_i<"src", TYPE.Size>>>; +} // foreach REG -// M <- I -foreach S = [8, 16, 32] in { - def MOV#S#"f"#"i" : MxMove_MI< - !cast("MxType"#S).FOp, - !cast("MxType"#S).FPat, - !cast("MxType"#S), - MxMoveEncoding("MxMoveSize"#S), - MxEncEAi, !cast("MxExtI"#S#"_1"), - MxEncEAf_0, MxExtBrief_0>>; - - def MOV#S#"p"#"i" : MxMove_MI< - !cast("MxType"#S).POp, - !cast("MxType"#S).PPat, - !cast("MxType"#S), - MxMoveEncoding("MxMoveSize"#S), - MxEncEAi, !cast("MxExtI"#S#"_1"), - MxEncEAp_0, MxExtI16_0>>; - - def MOV#S#"b"#"i" : MxMove_MI< - !cast("MxType"#S).BOp, - !cast("MxType"#S).BPat, - !cast("MxType"#S), - MxMoveEncoding("MxMoveSize"#S), - MxEncEAi, !cast("MxExtI"#S#"_1"), - MxEncEAb, MxExtI32_0>>; - - def MOV#S#"j"#"i" : MxMove_MI< - !cast("MxType"#S).JOp, - !cast("MxType"#S).JPat, - !cast("MxType"#S), - MxMoveEncoding("MxMoveSize"#S), - MxEncEAi, !cast("MxExtI"#S#"_1"), - MxEncEAj_0, MxExtEmpty>>; -} +// R <- M +let mayLoad = 1 in +class MxMove_RM("MxMoveSize"#TYPE.Size), + MxOpBundle DST = !cast("MxOp"#TYPE.Size#"AddrMode_"#DST_REG), + MxEncMemOp DST_ENC = !cast("MxMoveDstOpEnc_"#DST_REG)> + : MxMove>; + +foreach REG = ["r", "a", "d"] in +foreach AM = MxMoveSupportedAMs in { + foreach TYPE = !if(!eq(REG, "d"), [MxType8, MxType16, MxType32], [MxType16, MxType32]) in + def MOV # TYPE.Size # REG # AM # TYPE.Postfix + : MxMove_RM("MxOp"#TYPE.Size#"AddrMode_"#AM), + !cast("MxMoveSrcOpEnc_"#AM)>; +} // foreach AM + +// Tail call version +let Pattern = [(null_frag)] in { + foreach REG = ["r", "a"] in + foreach AM = MxMoveSupportedAMs in { + foreach TYPE = [MxType16, MxType32] in + def MOV # TYPE.Size # REG # AM # _TC + : MxMove_RM("MxOp"#TYPE.Size#"AddrMode_"#AM), + !cast("MxMoveSrcOpEnc_"#AM)> { + let isCodeGenOnly = true; + } + } // foreach AM +} // let Pattern + +let mayLoad = 1, mayStore = 1 in +class MxMove_MM + : MxMove("MxMoveSize"#TYPE.Size), + DST_ENC, SRC_ENC>>; + +foreach DST_AM = MxMoveSupportedAMs in +foreach SRC_AM = MxMoveSupportedAMs in { + foreach TYPE = [MxType8, MxType16, MxType32] in + def MOV # TYPE.Size # DST_AM # SRC_AM # TYPE.Postfix + : MxMove_MM("MxOp"#TYPE.Size#"AddrMode_"#DST_AM), + !cast("MxOp"#TYPE.Size#"AddrMode_"#SRC_AM), + !cast("MxMoveDstOpEnc_"#DST_AM), + !cast("MxMoveSrcOpEnc_"#SRC_AM)>; +} // foreach SRC_AM // Store ABS(basically pointer) as Immdiate to Mem def : 
Pat<(store MxType32.BPat :$src, MxType32.PPat :$dst), @@ -340,66 +242,6 @@ def : Pat<(store MxType32.BPat :$src, MxType32.BPat :$dst), def : Pat<(store MxType32.BPat :$src, MxType32.JPat :$dst), (MOV32ji MxType32.JOp :$dst, MxType32.IOp :$src)>; -// R <- M -defm MOV8d : MMxMove_RM; - -defm MOV16r : MMxMove_RM; -defm MOV16a : MMxMove_RM; - -defm MOV32r : MMxMove_RM; -defm MOV32a : MMxMove_RM; - -let Pattern = [(null_frag)] in { -defm MOV16r : MMxMove_RM; -defm MOV16a : MMxMove_RM; - -defm MOV32r : MMxMove_RM; -defm MOV32a : MMxMove_RM; -} // Pattern - -// M <- M -defm MOV8p : MMxMove_MM; -defm MOV16p : MMxMove_MM; -defm MOV32p : MMxMove_MM; - -defm MOV8f : MMxMove_MM; -defm MOV16f : MMxMove_MM; -defm MOV32f : MMxMove_MM; - -defm MOV8b : MMxMove_MM; -defm MOV16b : MMxMove_MM; -defm MOV32b : MMxMove_MM; - -defm MOV8e : MMxMove_MM; -defm MOV16e : MMxMove_MM; -defm MOV32e : MMxMove_MM; - -defm MOV8o : MMxMove_MM; -defm MOV16o : MMxMove_MM; -defm MOV32o : MMxMove_MM; - -defm MOV8j : MMxMove_MM; -defm MOV16j : MMxMove_MM; -defm MOV32j : MMxMove_MM; - //===----------------------------------------------------------------------===// // MOVEM // @@ -407,12 +249,12 @@ defm MOV32j : MMxMove_MM; -def MxMOVEM_RM : MxBead1Bit<1>; +defvar MxMOVEM_MR = false; +defvar MxMOVEM_RM = true; // Size -def MxMOVEM_W : MxBead1Bit<0>; -def MxMOVEM_L : MxBead1Bit<1>; +defvar MxMOVEM_W = false; +defvar MxMOVEM_L = true; /// ---------------+-------------+-------------+--------- /// F E D C B | A | 9 8 7 | 6 | 5 4 3 | 2 1 0 @@ -423,31 +265,47 @@ def MxMOVEM_L : MxBead1Bit<1>; /// ----------------------------------------------------- /// D - direction(RM,MR) /// S - size(W,L) -class MxMOVEMEncoding - : MxEncoding, DIR, - MxBead1Bit<1>, MxBead4Bits<0b0100>, IMM, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; +class MxMOVEMEncoding { + dag Value = (ascend + (descend 0b01001, direction, 0b001, size, opnd_enc.EA), + // Mask + (operand "$"#mask_op_name, 16), + opnd_enc.Supplement + ); +} let mayStore = 1 in -class MxMOVEM_MR +class MxMOVEM_MR : MxInst<(outs), (ins MEMOp:$dst, MxMoveMask:$mask), - "movem."#TYPE.Prefix#"\t$mask, $dst", [], - MxMOVEMEncoding>>; + "movem."#TYPE.Prefix#"\t$mask, $dst", []> { + let Inst = MxMOVEMEncoding.Value; +} + +foreach AM = MxMoveSupportedAMs in { + foreach TYPE = [MxType16, MxType32] in + def MOVM # TYPE.Size # AM # m # TYPE.Postfix + : MxMOVEM_MR("MxOp"#TYPE.Size#"AddrMode_"#AM).Op, + !cast("MxMoveDstOpEnc_"#AM)>; +} // foreach AM let mayLoad = 1 in -class MxMOVEM_RM +class MxMOVEM_RM : MxInst<(outs), (ins MxMoveMask:$mask, MEMOp:$src), - "movem."#TYPE.Prefix#"\t$src, $mask", [], - MxMOVEMEncoding>>; - -def MOVM32jm : MxMOVEM_MR; -def MOVM32pm : MxMOVEM_MR; + "movem."#TYPE.Prefix#"\t$src, $mask", []> { + let Inst = MxMOVEMEncoding.Value; +} -def MOVM32mj : MxMOVEM_RM; -def MOVM32mp : MxMOVEM_RM; +foreach AM = MxMoveSupportedAMs in { + foreach TYPE = [MxType16, MxType32] in + def MOVM # TYPE.Size # m # AM # TYPE.Postfix + : MxMOVEM_RM("MxOp"#TYPE.Size#"AddrMode_"#AM).Op, + !cast("MxMoveSrcOpEnc_"#AM)>; +} // foreach AM // Pseudo versions. These a required by virtual register spill/restore since // the mask requires real register to encode. 
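// (The mask word is one bit per register: in control and postincrement modes
// bit 0 selects D0 through bit 7 = D7 and bits 8-15 select A0-A7, while for
// the predecrement mode -(An) the hardware reads the mask in reverse order.
// Because each bit position *is* the register number, only a physical
// register can be encoded, which is why virtual-register spills go through
// pseudos.)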
These instruction will be expanded @@ -495,21 +353,27 @@ def MOVM32mp_P : MxMOVEM_RM_Pseudo; /// 0 1 0 0 0 1 0 0 1 1 | MODE | REG /// -------------------------------------------------- let Defs = [CCR] in -class MxMoveToCCR - : MxInst<(outs CCRC:$dst), INS, "move.w\t$src, $dst", [], - MxEncoding, MxBead4Bits<0b0001>, MxBead2Bits<0b01>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; +class MxMoveToCCR + : MxInst<(outs CCRC:$dst), (ins MEMOp:$src), "move.w\t$src, $dst", []> { + let Inst = (ascend + (descend 0b0100010011, SRC_ENC.EA), + SRC_ENC.Supplement + ); +} -class MxMoveToCCRPseudo : MxPseudo<(outs CCRC:$dst), INS>; +class MxMoveToCCRPseudo + : MxPseudo<(outs CCRC:$dst), (ins MEMOp:$src)>; -let mayLoad = 1 in { -def MOV16cp : MxMoveToCCR<(ins MxType16d.POp:$src), MxEncEAp_1, MxExtI16_1>; -def MOV8cp : MxMoveToCCRPseudo<(ins MxType8d.POp:$src)>; -} // let mayLoad = 1 +let mayLoad = 1 in +foreach AM = MxMoveSupportedAMs in { + def MOV16c # AM : MxMoveToCCR("MxOp16AddrMode_"#AM).Op, + !cast("MxMoveSrcOpEnc_"#AM)>; + def MOV8c # AM : MxMoveToCCRPseudo("MxOp8AddrMode_"#AM).Op>; +} // foreach AM -def MOV16cd : MxMoveToCCR<(ins MxType16d.ROp:$src), MxEncEAd_1, MxExtEmpty>; -def MOV8cd : MxMoveToCCRPseudo<(ins MxType8d.ROp:$src)>; +// Only data register is allowed. +def MOV16cd : MxMoveToCCR; +def MOV8cd : MxMoveToCCRPseudo; /// Move from CCR /// -------------------------------------------------- @@ -518,27 +382,38 @@ def MOV8cd : MxMoveToCCRPseudo<(ins MxType8d.ROp:$src)>; /// | EFFECTIVE ADDRESS /// 0 1 0 0 0 0 1 0 1 1 | MODE | REG /// -------------------------------------------------- -let Uses = [CCR] in -class MxMoveFromCCR - : MxInst, MxBead4Bits<0b0000>, MxBead2Bits<0b01>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>, - Requires<[ IsM68010 ]>; - -class MxMoveFromCCRPseudo : MxPseudo<(outs), INS>; - -let mayStore = 1 in { -def MOV16pc - : MxMoveFromCCR<(outs), (ins MxType16d.POp:$dst, CCRC:$src), MxEncEAp_0, MxExtI16_0>; -def MOV8pc : MxMoveFromCCRPseudo<(ins MxType8d.POp:$dst, CCRC:$src)>; -} // let mayStore = 1 +let Uses = [CCR] in { +class MxMoveFromCCR_R + : MxInst<(outs MxDRD16:$dst), (ins CCRC:$src), "move.w\t$src, $dst", []>, + Requires<[ IsM68010 ]> { + let Inst = (descend 0b0100001011, MxEncAddrMode_d<"dst">.EA); +} -def MOV16dc - : MxMoveFromCCR<(outs MxType16d.ROp:$dst), (ins CCRC:$src), MxEncEAd_0, MxExtEmpty>; +class MxMoveFromCCR_M + : MxInst<(outs), (ins MEMOp:$dst, CCRC:$src), "move.w\t$src, $dst", []>, + Requires<[ IsM68010 ]> { + let Inst = (ascend + (descend 0b0100001011, DST_ENC.EA), + DST_ENC.Supplement + ); +} -def MOV8dc : MxMoveFromCCRPseudo<(ins MxType8d.ROp:$dst, CCRC:$src)>; +class MxMoveFromCCRPseudo + : MxPseudo<(outs), (ins MEMOp:$dst, CCRC:$src)>; +} // let Uses = [CCR] +let mayStore = 1 in +foreach AM = MxMoveSupportedAMs in { + def MOV16 # AM # c + : MxMoveFromCCR_M("MxOp16AddrMode_"#AM).Op, + !cast("MxMoveDstOpEnc_"#AM)>; + def MOV8 # AM # c + : MxMoveFromCCRPseudo("MxOp8AddrMode_"#AM).Op>; +} // foreach AM + +// Only data register is allowed. 
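// (Like the memory forms above, this register form is gated on IsM68010:
// MOVE from CCR did not exist on the original 68000. The 8-bit variants stay
// pseudos because the architectural transfer is always a word operation.)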
+def MOV16dc : MxMoveFromCCR_R; +def MOV8dc : MxMoveFromCCRPseudo; //===----------------------------------------------------------------------===// // LEA @@ -549,18 +424,18 @@ def MOV8dc : MxMoveFromCCRPseudo<(ins MxType8d.ROp:$dst, CCRC:$src)>; /// ---------------------------------------------------- /// 0 1 0 0 | DST REG | 1 1 1 | MODE | REG /// ---------------------------------------------------- -class MxLEA - : MxInst<(outs MxARD32:$dst), (ins SRCOpd:$src), - "lea\t$src, $dst", [(set i32:$dst, SRCPat:$src)], - MxEncoding, MxBeadReg<0>, MxBead4Bits<0x4>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; - -def LEA32p : MxLEA; -def LEA32f : MxLEA; -def LEA32q : MxLEA; -def LEA32b : MxLEA; +class MxLEA + : MxInst<(outs MxARD32:$dst), (ins SRC.Op:$src), + "lea\t$src, $dst", [(set i32:$dst, SRC.Pat:$src)]> { + let Inst = (ascend + (descend 0b0100, (operand "$dst", 3), 0b111, SRC_ENC.EA), + SRC_ENC.Supplement + ); +} +foreach AM = ["p", "f", "b", "q", "k"] in +def LEA32 # AM : MxLEA("MxOp32AddrMode_"#AM), + !cast("MxMoveSrcOpEnc_"#AM)>; //===----------------------------------------------------------------------===// // Pseudos diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td index 7e0c96a5b1f6..78aed521f13a 100644 --- a/llvm/lib/Target/M68k/M68kInstrFormats.td +++ b/llvm/lib/Target/M68k/M68kInstrFormats.td @@ -200,6 +200,11 @@ class MxEncEA { MxBead DA = da; } +class MxEncMemOp { + dag EA = (ascend); + dag Supplement = (ascend); +} + // FIXME: Is there a way to factorize the addressing mode suffix (i.e. // 'r', 'd', 'a' etc.) and use something like multiclass to replace? def MxEncEAr_0: MxEncEA, MxBead2Bits<0b00>>; @@ -237,6 +242,126 @@ def MxEncEAq : MxEncEA, MxBead2Bits<0b11>, MxBead1Bit<1>>; def MxEncEAk : MxEncEA, MxBead2Bits<0b11>, MxBead1Bit<1>>; def MxEncEAi : MxEncEA, MxBead2Bits<0b11>, MxBead1Bit<1>>; +class MxEncBriefExt { + dag Value = (descend + // D/A + REGISTER + (operand "$"#reg_opnd, 4), + // W/L + size_w_l, + // SCALE + !cond( + !eq(scale, 1) : 0b00, + !eq(scale, 2) : 0b01, + !eq(scale, 4) : 0b10, + !eq(scale, 8) : 0b11 + ), + 0b0, + // Displacement + (operand "$"#disp_opnd, 8, (encoder disp_encoder)) + ); +} + +class MxEncAddrMode_d : MxEncMemOp { + let EA = (descend /*MODE*/0b000, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + +class MxEncAddrMode_a : MxEncMemOp { + let EA = (descend /*MODE*/0b001, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + +class MxEncAddrMode_r : MxEncMemOp { + let EA = (descend /*MODE without the last bit*/0b00, + /*REGISTER with D/A bit*/(operand "$"#reg_opnd, 4)); +} + +class MxEncAddrMode_k : MxEncMemOp { + let EA = (descend /*MODE*/0b111, + /*REGISTER*/0b011); + + let Supplement = MxEncBriefExt">.Value; +} + +class MxEncAddrMode_q : MxEncMemOp { + let EA = (descend /*MODE*/0b111, + /*REGISTER*/0b010); + + // 16-bit Displacement + let Supplement = (operand "$"#opnd_name, 16, + (encoder "encodePCRelImm<16>")); +} + +class MxEncAddrMode_p : MxEncMemOp { + let EA = (descend /*MODE*/0b101, + /*REGISTER*/(operand "$"#opnd_name#".reg", 3)); + + // 16-bit Displacement + let Supplement = (operand "$"#opnd_name#".disp", 16, + (encoder "encodeRelocImm<16>")); +} + +class MxEncAddrMode_f : MxEncMemOp { + let EA = (descend /*MODE*/0b110, + /*REGISTER*/(operand "$"#opnd_name#".reg", 3)); + + let Supplement = MxEncBriefExt">.Value; +} + +class MxEncAddrMode_j : MxEncMemOp { + let EA = (descend /*MODE*/0b010, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + +class MxEncAddrMode_i : MxEncMemOp { + let EA = 
(descend /*MODE*/0b111, + /*REGISTER*/0b100); + + // Immediate + let Supplement = + !cond( + !eq(size, 8) : (descend 0b00000000, (operand "$"#opnd_name, 8)), + !eq(size, 16) : (operand "$"#opnd_name, 16), + !eq(size, 32) : (ascend (slice "$"#opnd_name, 31, 16), + (slice "$"#opnd_name, 15, 0)) + ); +} + +// abs.W -> size_w_l = false +// abs.L -> size_w_l = true +class MxEncAddrMode_abs : MxEncMemOp { + let EA = (descend /*MODE*/0b111, + // Wrap the REGISTER part in another dag to make sure + // the dag assigned to EA only has two arguments. Such + // that it's easier for MOV instructions to reverse + // on its destination part. + /*REGISTER*/(descend 0b00, size_w_l)); + + // Absolute address + let Supplement = !if(size_w_l, + // abs.L + (operand "$"#opnd_name, 32, (encoder "encodeRelocImm<32>")), + // abs.W + (operand "$"#opnd_name, 16, (encoder "encodeRelocImm<16>")) + ); +} + +class MxEncAddrMode_o : MxEncMemOp { + let EA = (descend /*MODE*/0b011, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + +class MxEncAddrMode_e : MxEncMemOp { + let EA = (descend /*MODE*/0b100, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + // Allows you to specify each bit of opcode class MxEncOpMode { MxBead B0 = b0; @@ -332,6 +457,16 @@ def MxEncSize16 : MxEncSize<0b01>; def MxEncSize32 : MxEncSize<0b10>; def MxEncSize64 : MxEncSize<0b11>; +// TODO: Remove "New" in the name after the codebead-based +// representation is deprecated. +class MxNewEncSize value> { + bits<2> Value = value; +} +def MxNewEncSize8 : MxNewEncSize<0b00>; +def MxNewEncSize16 : MxNewEncSize<0b01>; +def MxNewEncSize32 : MxNewEncSize<0b10>; +def MxNewEncSize64 : MxNewEncSize<0b11>; + // M68k INSTRUCTION. Most instructions specify the location of an operand by // using the effective address field in the operation word. The effective address // is composed of two 3-bit fields: the mode field and the register field. The @@ -357,6 +492,7 @@ class MxInst Beads = beads.Value; + dag Inst = (ascend); // Number of bytes let Size = 0; diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index 105c816f9885..b33469529ca5 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Regex.h" #include @@ -601,40 +602,26 @@ bool M68kInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool M68kInstrInfo::isPCRelRegisterOperandLegal( const MachineOperand &MO) const { assert(MO.isReg()); - const auto *MI = MO.getParent(); - const uint8_t *Beads = M68k::getMCInstrBeads(MI->getOpcode()); - assert(*Beads); - - // Only addressing mode k has (non-pc) register with PCRel - // So we're looking for EA Beads equal to - // `3Bits<011>_1Bit<1>_2Bits<11>` - // FIXME: There is an important caveat and two assumptions - // here: The caveat is that EA encoding always sit on the LSB. - // Where the assumptions are that if there are more than one - // operands, the EA encoding for the source operand always sit - // on the LSB. At the same time, k addressing mode can not be used - // on destination operand. 
- // The last assumption is kinda dirty so we need to find a way around - // it - const uint8_t EncEAk[3] = {0b011, 0b1, 0b11}; - for (const uint8_t Pat : EncEAk) { - uint8_t Bead = *(Beads++); - if (!Bead) - return false; - switch (Bead & 0xF) { - default: - return false; - case M68kBeads::Bits1: - case M68kBeads::Bits2: - case M68kBeads::Bits3: { - uint8_t Val = (Bead & 0xF0) >> 4; - if (Val != Pat) - return false; - } - } - } - return true; + // Check whether this MO belongs to an instruction with addressing mode 'k', + // Refer to TargetInstrInfo.h for more information about this function. + + const MachineInstr *MI = MO.getParent(); + const unsigned NameIndices = M68kInstrNameIndices[MI->getOpcode()]; + StringRef InstrName(&M68kInstrNameData[NameIndices]); + const unsigned OperandNo = MI->getOperandNo(&MO); + + // If this machine operand is the 2nd operand, then check + // whether the instruction has destination addressing mode 'k'. + if (OperandNo == 1) + return Regex("[A-Z]+(8|16|32)k[a-z](_TC)?$").match(InstrName); + + // If this machine operand is the last one, then check + // whether the instruction has source addressing mode 'k'. + if (OperandNo == MI->getNumExplicitOperands() - 1) + return Regex("[A-Z]+(8|16|32)[a-z]k(_TC)?$").match(InstrName); + + return false; } void M68kInstrInfo::copyPhysReg(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td index c581dd91eaaa..67500af6bfb2 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.td +++ b/llvm/lib/Target/M68k/M68kInstrInfo.td @@ -291,13 +291,13 @@ def MxARIPD32_TC : MxMemOp<(ops AR32_TC), MxSize32, "e", "printARIPD32Mem", MxA // extension word. The reference is classified as a data reference with the // exception of the jump and jump-to-subroutine instructions. def MxARID : MxOpClass<"ARID">; -def MxARID8 : MxMemOp<(ops i16imm, AR32), MxSize8, "p", "printARID8Mem", MxARID>; -def MxARID16 : MxMemOp<(ops i16imm, AR32), MxSize16, "p", "printARID16Mem", MxARID>; -def MxARID32 : MxMemOp<(ops i16imm, AR32), MxSize32, "p", "printARID32Mem", MxARID>; +def MxARID8 : MxMemOp<(ops i16imm:$disp, AR32:$reg), MxSize8, "p", "printARID8Mem", MxARID>; +def MxARID16 : MxMemOp<(ops i16imm:$disp, AR32:$reg), MxSize16, "p", "printARID16Mem", MxARID>; +def MxARID32 : MxMemOp<(ops i16imm:$disp, AR32:$reg), MxSize32, "p", "printARID32Mem", MxARID>; -def MxARID8_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize8, "p", "printARID8Mem", MxARID>; -def MxARID16_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize16, "p", "printARID16Mem", MxARID>; -def MxARID32_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize32, "p", "printARID32Mem", MxARID>; +def MxARID8_TC : MxMemOp<(ops i16imm:$disp, AR32_TC:$reg), MxSize8, "p", "printARID8Mem", MxARID>; +def MxARID16_TC : MxMemOp<(ops i16imm:$disp, AR32_TC:$reg), MxSize16, "p", "printARID16Mem", MxARID>; +def MxARID32_TC : MxMemOp<(ops i16imm:$disp, AR32_TC:$reg), MxSize32, "p", "printARID32Mem", MxARID>; // ADDRESS REGISTER INDIRECT WITH INDEX. This addressing mode requires one word // of extension. 
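The name-based test in the M68kInstrInfo.cpp hunk above is worth seeing in isolation: generated MOVE names follow MOV<size><dst-AM><src-AM>, so a 'k' letter right after the size means the destination uses (i,PC,Xn), while a trailing 'k' (optionally followed by _TC) means the source does. A self-contained sketch using std::regex (the patch itself uses llvm::Regex, and the instruction names below are illustrative):

    #include <cassert>
    #include <regex>
    #include <string>

    // 'k' in destination position, e.g. MOV32kr.
    static bool dstIsModeK(const std::string &Name) {
      return std::regex_search(Name, std::regex("[A-Z]+(8|16|32)k[a-z](_TC)?$"));
    }

    // 'k' in source position, e.g. MOV16rk or MOV16rk_TC.
    static bool srcIsModeK(const std::string &Name) {
      return std::regex_search(Name, std::regex("[A-Z]+(8|16|32)[a-z]k(_TC)?$"));
    }

    int main() {
      assert(dstIsModeK("MOV32kr"));
      assert(srcIsModeK("MOV16rk_TC"));
      assert(!dstIsModeK("MOV32rr") && !srcIsModeK("MOV32rr"));
      return 0;
    }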
The address of the operand is the sum of the address in the @@ -306,13 +306,19 @@ def MxARID32_TC : MxMemOp<(ops i16imm, AR32_TC), MxSize32, "p", "printARID32Me // The reference is classified as a data reference with the exception of the // jump and jump-to-subroutine instructions def MxARII : MxOpClass<"ARII">; -def MxARII8 : MxMemOp<(ops i8imm, AR32, XR32), MxSize8, "f", "printARII8Mem", MxARII>; -def MxARII16 : MxMemOp<(ops i8imm, AR32, XR32), MxSize16, "f", "printARII16Mem", MxARII>; -def MxARII32 : MxMemOp<(ops i8imm, AR32, XR32), MxSize32, "f", "printARII32Mem", MxARII>; - -def MxARII8_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize8, "f", "printARII8Mem", MxARII>; -def MxARII16_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize16, "f", "printARII16Mem", MxARII>; -def MxARII32_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize32, "f", "printARII32Mem", MxARII>; +def MxARII8 : MxMemOp<(ops i8imm:$disp, AR32:$reg, XR32:$index), + MxSize8, "f", "printARII8Mem", MxARII>; +def MxARII16 : MxMemOp<(ops i8imm:$disp, AR32:$reg, XR32:$index), + MxSize16, "f", "printARII16Mem", MxARII>; +def MxARII32 : MxMemOp<(ops i8imm:$disp, AR32:$reg, XR32:$index), + MxSize32, "f", "printARII32Mem", MxARII>; + +def MxARII8_TC : MxMemOp<(ops i8imm:$disp, AR32_TC:$reg, XR32_TC:$index), + MxSize8, "f", "printARII8Mem", MxARII>; +def MxARII16_TC : MxMemOp<(ops i8imm:$disp, AR32_TC:$reg, XR32_TC:$index), + MxSize16, "f", "printARII16Mem", MxARII>; +def MxARII32_TC : MxMemOp<(ops i8imm:$disp, AR32_TC:$reg, XR32_TC:$index), + MxSize32, "f", "printARII32Mem", MxARII>; // ABSOLUTE SHORT ADDRESS. This addressing mode requires one word of extension. // The address of the operand is the extension word. The 16-bit address is sign @@ -360,9 +366,9 @@ def MxPCD32 : MxMemOp<(ops i16imm), MxSize32, "q", "printPCD32Mem", MxPCD>; // word, and the contents of the index register. The value in the program // counter is the address of the extension word. This reference is classified as // a program reference. -def MxPCI8 : MxMemOp<(ops i8imm, XR32), MxSize8, "k", "printPCI8Mem", MxPCI>; -def MxPCI16 : MxMemOp<(ops i8imm, XR32), MxSize16, "k", "printPCI16Mem", MxPCI>; -def MxPCI32 : MxMemOp<(ops i8imm, XR32), MxSize32, "k", "printPCI32Mem", MxPCI>; +def MxPCI8 : MxMemOp<(ops i8imm:$disp, XR32:$index), MxSize8, "k", "printPCI8Mem", MxPCI>; +def MxPCI16 : MxMemOp<(ops i8imm:$disp, XR32:$index), MxSize16, "k", "printPCI16Mem", MxPCI>; +def MxPCI32 : MxMemOp<(ops i8imm:$disp, XR32:$index), MxSize32, "k", "printPCI32Mem", MxPCI>; } // OPERAND_PCREL def MxImm : AsmOperandClass { @@ -633,6 +639,74 @@ class MxType { + int Size = size; + MxOperand Op = op; + ComplexPattern Pat = pat; +} + +class MxImmOpBundle + : MxOpBundle { + PatFrag ImmPat = pat; +} + +// TODO: We can use MxOpAddrMode_ in more places to +// replace MxType-based operand factoring. 
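The MxOp#size#AddrMode_* bundles defined in the foreach just below share one suffix-letter vocabulary with the MxEncAddrMode_* encoders earlier in the patch. Each letter names one M68k effective-address mode, encoded as a 3-bit mode field plus a 3-bit register field; a compact reference (illustrative C++, not part of the patch, with mode values taken from the /*MODE*/ constants in this diff):

    #include <cstdint>

    // The 6-bit EA field is mode<<3 | reg.
    enum class AM : uint8_t {
      DataReg  = 0b000, // Dn          suffix 'd'
      AddrReg  = 0b001, // An          suffix 'a'
      Indirect = 0b010, // (An)        suffix 'j'
      PostInc  = 0b011, // (An)+       suffix 'o'
      PreDec   = 0b100, // -(An)       suffix 'e'
      Disp16   = 0b101, // (i,An)      suffix 'p'
      Index    = 0b110, // (i,An,Xn)   suffix 'f'
      Special  = 0b111, // reg field picks (ABS) ('b'), (i,PC) ('q'),
                        // (i,PC,Xn) ('k') or #imm ('i')
    };

    constexpr uint8_t eaField(AM Mode, uint8_t Reg) {
      return uint8_t(uint8_t(Mode) << 3 | (Reg & 0b111));
    }

    static_assert(eaField(AM::PostInc, 3) == 0b011011,
                  "(A3)+ encodes as mode 3, reg 3");

The 'r' (any Dn/An) form instead folds the D/A selector into a 4-bit register field, which is also why getMachineOpValue in the MC emitter hunk below ORs 0b1000 into the encoding for address registers.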
+foreach size = [8, 16, 32] in { + // Dn + def MxOp#size#AddrMode_d + : MxOpBundle("MxDRD"#size), ?>; + + // (An) + def MxOp#size#AddrMode_j + : MxOpBundle("MxARI"#size), MxCP_ARI>; + + // (An)+ + def MxOp#size#AddrMode_o + : MxOpBundle("MxARIPI"#size), MxCP_ARIPI>; + + // -(An) + def MxOp#size#AddrMode_e + : MxOpBundle("MxARIPD"#size), MxCP_ARIPD>; + + // (i,An) + def MxOp#size#AddrMode_p + : MxOpBundle("MxARID"#size), MxCP_ARID>; + + // (i,An,Xn) + def MxOp#size#AddrMode_f + : MxOpBundle("MxARII"#size), MxCP_ARII>; + + // (ABS).L + def MxOp#size#AddrMode_b + : MxOpBundle("MxAL"#size), MxCP_AL>; + + // (i,PC) + def MxOp#size#AddrMode_q + : MxOpBundle("MxPCD"#size), MxCP_PCD>; + + // (i,PC,Xn) + def MxOp#size#AddrMode_k + : MxOpBundle("MxPCI"#size), MxCP_PCI>; + + // #imm + def MxOp#size#AddrMode_i + : MxImmOpBundle("Mxi"#size#"imm"), + !cast("MximmSExt"#size)>; +} // foreach size = [8, 16, 32] + +foreach size = [16, 32] in { + // An + def MxOp#size#AddrMode_a + : MxOpBundle("MxARD"#size), ?>; + + // Xn + def MxOp#size#AddrMode_r + : MxOpBundle("MxXRD"#size), ?>; +} // foreach size = [16, 32] + class MxType8Class : MxType; -def MxRODI_L : MxBead1Bit<1>; +defvar MxROKind_R = true; +defvar MxROKind_I = false; -def MxROOP_AS : MxBead2Bits<0b00>; -def MxROOP_LS : MxBead2Bits<0b01>; -def MxROOP_ROX : MxBead2Bits<0b10>; -def MxROOP_RO : MxBead2Bits<0b11>; +defvar MxRODI_R = false; +defvar MxRODI_L = true; + +defvar MxROOP_AS = 0b00; +defvar MxROOP_LS = 0b01; +defvar MxROOP_ROX = 0b10; +defvar MxROOP_RO = 0b11; /// ------------+---------+---+------+---+------+--------- /// F E D C | B A 9 | 8 | 7 6 | 5 | 4 3 | 2 1 0 /// ------------+---------+---+------+---+------+--------- /// 1 1 1 0 | REG/IMM | D | SIZE |R/I| OP | REG /// ------------+---------+---+------+---+------+--------- -class MxSREncoding_R - : MxEncoding, ROOP, MxBead1Bit<1>, SIZE, DIRECTION, - MxBeadDReg<2>, MxBead4Bits<0b1110>>; - -class MxSREncoding_I - : MxEncoding, ROOP, MxBead1Bit<0>, SIZE, DIRECTION, - MxBead3Imm<2, 1>, MxBead4Bits<0b1110>>; +class MxSREncoding ro_op, MxNewEncSize size> { + dag Value = (descend 0b1110, + // REG/IMM + (operand "$"#src_opnd, 3), + direction, size.Value, kind, ro_op, + // REG + (operand "$"#dst_opnd, 3) + ); +} // $reg <- $reg op $reg -class MxSR_DD +class MxSR_DD ROOP> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", - [(set TYPE.VT:$dst, (NODE TYPE.VT:$src, TYPE.VT:$opd))], - MxSREncoding_R("MxEncSize"#TYPE.Size)>>; + [(set TYPE.VT:$dst, (NODE TYPE.VT:$src, TYPE.VT:$opd))]> { + let Inst = MxSREncoding("MxNewEncSize"#TYPE.Size)>.Value; +} // $reg <- $reg op $imm -class MxSR_DI +class MxSR_DI ROOP> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, !cast("Mxi"#TYPE.Size#"imm"):$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", [(set TYPE.VT:$dst, (NODE TYPE.VT:$src, - !cast("Mximm"#TYPE.Size#"_1to8"):$opd))], - MxSREncoding_I("MxEncSize"#TYPE.Size)>>; + !cast("Mximm"#TYPE.Size#"_1to8"):$opd))]> { + let Inst = MxSREncoding("MxNewEncSize"#TYPE.Size)>.Value; +} -multiclass MxSROp { +multiclass MxSROp ROOP> { let Defs = [CCR] in { let Constraints = "$src = $dst" in { diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.cpp b/llvm/lib/Target/M68k/M68kMachineFunction.cpp index b1e7369116d7..ccc8f87db502 100644 --- a/llvm/lib/Target/M68k/M68kMachineFunction.cpp +++ b/llvm/lib/Target/M68k/M68kMachineFunction.cpp @@ -18,3 +18,10 @@ using namespace llvm; void M68kMachineFunctionInfo::anchor() {} + +MachineFunctionInfo *M68kMachineFunctionInfo::clone( + 
BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.h b/llvm/lib/Target/M68k/M68kMachineFunction.h index 93c5255199d4..6ddf53d7d693 100644 --- a/llvm/lib/Target/M68k/M68kMachineFunction.h +++ b/llvm/lib/Target/M68k/M68kMachineFunction.h @@ -21,8 +21,6 @@ namespace llvm { class M68kMachineFunctionInfo : public MachineFunctionInfo { - MachineFunction &MF; - /// Non-zero if the function has base pointer and makes call to /// llvm.eh.sjlj.setjmp. When non-zero, the value is a displacement from the /// frame pointer to a slot where the base pointer is stashed. @@ -68,7 +66,12 @@ class M68kMachineFunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackSize = 0; public: - explicit M68kMachineFunctionInfo(MachineFunction &MF) : MF(MF) {} + explicit M68kMachineFunctionInfo(const MachineFunction &MF) {} + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; bool getRestoreBasePointer() const { return RestoreBasePointerOffset != 0; } void setRestoreBasePointer(const MachineFunction *MF); diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp index 0cae7ac4e312..5b632299fa4c 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/M68kMCTargetDesc.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.h b/llvm/lib/Target/M68k/M68kRegisterInfo.h index 7f822e1cb34f..fc55e19a958b 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.h +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.h @@ -97,6 +97,14 @@ public: bool canRealignStack(const MachineFunction &MF) const override; Register getFrameRegister(const MachineFunction &MF) const override; + + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override { + if (RC == &M68k::CCRCRegClass) + return &M68k::DR32RegClass; + return RC; + } + unsigned getStackRegister() const { return StackPtr; } unsigned getBaseRegister() const { return BasePtr; } unsigned getGlobalBaseRegister() const { return GlobalBasePtr; } diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index 9bf2984983a1..9dd52095959e 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -22,7 +22,7 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp index 9227bd6c3a78..6b093623a106 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; @@ -39,31 +40,30 @@ class M68kMCCodeEmitter : public MCCodeEmitter { const 
MCInstrInfo &MCII; MCContext &Ctx; -public: - M68kMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl &Fixups, + APInt &Inst, APInt &Scratch, + const MCSubtargetInfo &STI) const; - ~M68kMCCodeEmitter() override {} + void getMachineOpValue(const MCInst &MI, const MCOperand &Op, + unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - // TableGen'erated function - const uint8_t *getGenInstrBeads(const MCInst &MI) const { - return M68k::getMCInstrBeads(MI.getOpcode()); - } + template + void encodeRelocImm(const MCInst &MI, unsigned OpIdx, unsigned InsertPos, + APInt &Value, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned encodeBits(unsigned ThisByte, uint8_t Bead, const MCInst &MI, - const MCInstrDesc &Desc, uint64_t &Buffer, - unsigned Offset, SmallVectorImpl &Fixups, + template + void encodePCRelImm(const MCInst &MI, unsigned OpIdx, unsigned InsertPos, + APInt &Value, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - unsigned encodeReg(unsigned ThisByte, uint8_t Bead, const MCInst &MI, - const MCInstrDesc &Desc, uint64_t &Buffer, unsigned Offset, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; +public: + M68kMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) {} - unsigned encodeImm(unsigned ThisByte, uint8_t Bead, const MCInst &MI, - const MCInstrDesc &Desc, uint64_t &Buffer, unsigned Offset, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + ~M68kMCCodeEmitter() override {} void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, @@ -72,316 +72,176 @@ public: } // end anonymous namespace -unsigned M68kMCCodeEmitter::encodeBits(unsigned ThisByte, uint8_t Bead, - const MCInst &MI, - const MCInstrDesc &Desc, - uint64_t &Buffer, unsigned Offset, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned Num = 0; - switch (Bead & 0xF) { - case M68kBeads::Bits1: - Num = 1; - break; - case M68kBeads::Bits2: - Num = 2; - break; - case M68kBeads::Bits3: - Num = 3; - break; - case M68kBeads::Bits4: - Num = 4; - break; - } - unsigned char Val = (Bead & 0xF0) >> 4; - - LLVM_DEBUG(dbgs() << "\tEncodeBits" - << " Num: " << Num << " Val: 0x"); - LLVM_DEBUG(dbgs().write_hex(Val) << "\n"); +#include "M68kGenMCCodeEmitter.inc" - Buffer |= (Val << Offset); - - return Num; -} +// Select the proper unsigned integer type from a bit size. 
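// For example, select_uint_t<8>::type is uint8_t and select_uint_t<16>::type
// is uint16_t, with larger sizes falling through to uint32_t/uint64_t. With
// C++14's std::conditional_t the same trait can be written more compactly
// (equivalent sketch, not part of the patch):
//
//   template <unsigned Size>
//   using select_uint = std::conditional_t<
//       Size == 8, uint8_t,
//       std::conditional_t<Size == 16, uint16_t,
//           std::conditional_t<Size == 32, uint32_t, uint64_t>>>;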
+template struct select_uint_t { + using type = typename std::conditional< + Size == 8, uint8_t, + typename std::conditional< + Size == 16, uint16_t, + typename std::conditional::type>::type>::type; +}; -unsigned M68kMCCodeEmitter::encodeReg(unsigned ThisByte, uint8_t Bead, - const MCInst &MI, const MCInstrDesc &Desc, - uint64_t &Buffer, unsigned Offset, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - bool DA, Reg; - switch (Bead & 0xF) { - default: - llvm_unreachable("Unrecognized Bead code for register type"); - case M68kBeads::DAReg: - Reg = true; - DA = true; - break; - case M68kBeads::DA: - Reg = false; - DA = true; - break; - case M68kBeads::DReg: - case M68kBeads::Reg: - Reg = true; - DA = false; - break; +// On a LE host: +// MSB LSB MSB LSB +// | 0x12 0x34 | 0xAB 0xCD | -> | 0xAB 0xCD | 0x12 0x34 | +// (On a BE host nothing changes) +template static value_t swapWord(value_t Val) { + const unsigned NumWords = sizeof(Val) / 2; + if (NumWords <= 1) + return Val; + Val = support::endian::byte_swap(Val, support::big); + value_t NewVal = 0; + for (unsigned i = 0U; i != NumWords; ++i) { + uint16_t Part = (Val >> (i * 16)) & 0xFFFF; + Part = support::endian::byte_swap(Part, support::big); + NewVal |= (Part << (i * 16)); } + return NewVal; +} - unsigned Op = (Bead & 0x70) >> 4; - bool Alt = (Bead & 0x80); - LLVM_DEBUG(dbgs() << "\tEncodeReg" - << " Op: " << Op << ", DA: " << DA << ", Reg: " << Reg - << ", Alt: " << Alt << "\n"); - - auto MIOpIdx = M68k::getLogicalOperandIdx(MI.getOpcode(), Op); - bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL; - - MCOperand MCO; - if (M68kII::hasMultiMIOperands(MI.getOpcode(), Op)) { - if (IsPCRel) { - assert(Alt && - "PCRel addresses use Alt bead register encoding by default"); - MCO = MI.getOperand(MIOpIdx + M68k::PCRelIndex); - } else { - MCO = MI.getOperand(MIOpIdx + (Alt ? M68k::MemIndex : M68k::MemBase)); - } +// Figure out which byte we're at in big endian mode. +template static unsigned getBytePosition(unsigned BitPos) { + if (Size % 16) { + return static_cast(BitPos / 8 + ((BitPos & 0b1111) < 8 ? 
1 : -1)); } else { - assert(!Alt && "You cannot use Alt register with a simple operand"); - MCO = MI.getOperand(MIOpIdx); + assert(!(BitPos & 0b1111) && "Not aligned to word boundary?"); + return BitPos / 8; } - - unsigned RegNum = MCO.getReg(); - auto RI = Ctx.getRegisterInfo(); - - unsigned Written = 0; - if (Reg) { - uint32_t Val = RI->getEncodingValue(RegNum); - Buffer |= (Val & 7) << Offset; - Offset += 3; - Written += 3; - } - - if (DA) { - Buffer |= (uint64_t)M68kII::isAddressRegister(RegNum) << Offset; - Written++; - } - - return Written; -} - -static unsigned EmitConstant(uint64_t Val, unsigned Size, unsigned Pad, - uint64_t &Buffer, unsigned Offset) { - assert(Size + Offset <= 64 && isUIntN(Size, Val) && "Value does not fit"); - - // Writing Value in host's endianness - Buffer |= (Val & ((1ULL << Size) - 1)) << Offset; - return Size + Pad; } -unsigned M68kMCCodeEmitter::encodeImm(unsigned ThisByte, uint8_t Bead, - const MCInst &MI, const MCInstrDesc &Desc, - uint64_t &Buffer, unsigned Offset, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned ThisWord = ThisByte / 2; - unsigned Size = 0; - unsigned Pad = 0; - unsigned FixOffset = 0; - int64_t Addendum = 0; - bool NoExpr = false; - - unsigned Type = Bead & 0xF; - unsigned Op = (Bead & 0x70) >> 4; - bool Alt = (Bead & 0x80); - - auto MIOpIdx = M68k::getLogicalOperandIdx(MI.getOpcode(), Op); - bool IsPCRel = Desc.OpInfo[MIOpIdx].OperandType == MCOI::OPERAND_PCREL; - - // The PC value upon instruction reading of a short jump will point to the - // next instruction, thus we need to compensate 2 bytes, which is the diff - // between the patch point and the PC. - if (IsPCRel && ThisWord == 0) - Addendum -= 2; - - switch (Type) { - // ??? what happens if it is not byte aligned - // ??? is it even possible - case M68kBeads::Disp8: - Size = 8; - Pad = 0; - FixOffset = ThisByte + 1; - Addendum += 1; - break; - case M68kBeads::Imm8: - Size = 8; - Pad = 8; - FixOffset = ThisByte; - break; - case M68kBeads::Imm16: - Size = 16; - Pad = 0; - FixOffset = ThisByte; - break; - case M68kBeads::Imm32: - Size = 32; - Pad = 0; - FixOffset = ThisByte; - break; - case M68kBeads::Imm3: - Size = 3; - Pad = 0; - NoExpr = true; - break; - } - - LLVM_DEBUG(dbgs() << "\tEncodeImm" - << " Op: " << Op << ", Size: " << Size << ", Alt: " << Alt - << "\n"); - - MCOperand MCO; - if (M68kII::hasMultiMIOperands(MI.getOpcode(), Op)) { - - if (IsPCRel) { - assert(!Alt && "You cannot use ALT operand with PCRel"); - MCO = MI.getOperand(MIOpIdx + M68k::PCRelDisp); - } else { - MCO = MI.getOperand(MIOpIdx + (Alt ? M68k::MemOuter : M68k::MemDisp)); +// We need special handlings for relocatable & pc-relative operands that are +// larger than a word. +// A M68k instruction is aligned by word (16 bits). That means, 32-bit +// (& 64-bit) immediate values are separated into hi & lo words and placed +// at lower & higher addresses, respectively. For immediate values that can +// be easily expressed in TG, we explicitly rotate the word ordering like +// this: +// ``` +// (ascend (slice "$imm", 31, 16), (slice "$imm", 15, 0)) +// ``` +// For operands that call into encoder functions, we need to use the `swapWord` +// function to assure the correct word ordering on LE host. Note that +// M68kMCCodeEmitter does massage _byte_ ordering of the final encoded +// instruction but it assumes everything aligns on word boundaries. So things +// will go wrong if we don't take care of the _word_ ordering here. 
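The rotation described above can be seen in isolation: swapping the 16-bit halves of a 32-bit immediate puts the high word at the low end of the encoding, which the emitter writes out first. A minimal sketch of that effect for 32-bit values (this mirrors what swapWord achieves; it is not the patch's code):

    #include <cassert>
    #include <cstdint>

    // 0x1234ABCD must reach the instruction stream as the word sequence
    // 0x1234, 0xABCD; rotating the halves places 0x1234 in the low word,
    // which is emitted first.
    constexpr uint32_t rotateWords(uint32_t V) { return (V << 16) | (V >> 16); }

    int main() {
      static_assert(rotateWords(0x1234ABCDu) == 0xABCD1234u, "halves swapped");
      assert(rotateWords(rotateWords(0xDEADBEEFu)) == 0xDEADBEEFu); // involution
      return 0;
    }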
+template +void M68kMCCodeEmitter::encodeRelocImm(const MCInst &MI, unsigned OpIdx, + unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + using value_t = typename select_uint_t::type; + const MCOperand &MCO = MI.getOperand(OpIdx); + if (MCO.isImm()) { + Value |= swapWord(static_cast(MCO.getImm())); + } else if (MCO.isExpr()) { + const MCExpr *Expr = MCO.getExpr(); + + // Absolute address + int64_t Addr; + if (Expr->evaluateAsAbsolute(Addr)) { + Value |= swapWord(static_cast(Addr)); + return; } - if (MCO.isExpr()) { - assert(!NoExpr && "Cannot use expression here"); - const MCExpr *Expr = MCO.getExpr(); + // Relocatable address + unsigned InsertByte = getBytePosition(InsertPos); + Fixups.push_back(MCFixup::create(InsertByte, Expr, + getFixupForSize(Size, /*IsPCRel=*/false), + MI.getLoc())); + } +} - // This only makes sense for PCRel instructions since PC points to the - // extension word and Disp8 for example is right justified and requires - // correction. E.g. R_68K_PC32 is calculated as S + A - P, P for Disp8 - // will be EXTENSION_WORD + 1 thus we need to have A equal to 1 to - // compensate. - // TODO count extension words - if (IsPCRel && Addendum != 0) { +template +void M68kMCCodeEmitter::encodePCRelImm(const MCInst &MI, unsigned OpIdx, + unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MCO = MI.getOperand(OpIdx); + if (MCO.isImm()) { + using value_t = typename select_uint_t::type; + Value |= swapWord(static_cast(MCO.getImm())); + } else if (MCO.isExpr()) { + const MCExpr *Expr = MCO.getExpr(); + unsigned InsertByte = getBytePosition(InsertPos); + + // Special handlings for sizes smaller than a word. + if (Size < 16) { + int LabelOffset = 0; + if (InsertPos < 16) + // If the patch point is at the first word, PC is pointing at the + // next word. + LabelOffset = InsertByte - 2; + else if (InsertByte % 2) + // Otherwise the PC is pointing at the first byte of this word. + // So we need to consider the offset between PC and the fixup byte. 
+ LabelOffset = 1; + + if (LabelOffset) Expr = MCBinaryExpr::createAdd( - Expr, MCConstantExpr::create(Addendum, Ctx), Ctx); - } - - Fixups.push_back(MCFixup::create( - FixOffset, Expr, getFixupForSize(Size, IsPCRel), MI.getLoc())); - // Write zeros - return EmitConstant(0, Size, Pad, Buffer, Offset); + Expr, MCConstantExpr::create(LabelOffset, Ctx), Ctx); } - } else { - MCO = MI.getOperand(MIOpIdx); - if (MCO.isExpr()) { - assert(!NoExpr && "Cannot use expression here"); - const MCExpr *Expr = MCO.getExpr(); - - if (Addendum != 0) { - Expr = MCBinaryExpr::createAdd( - Expr, MCConstantExpr::create(Addendum, Ctx), Ctx); - } - - Fixups.push_back(MCFixup::create( - FixOffset, Expr, getFixupForSize(Size, IsPCRel), MI.getLoc())); - // Write zeros - return EmitConstant(0, Size, Pad, Buffer, Offset); - } + Fixups.push_back(MCFixup::create(InsertByte, Expr, + getFixupForSize(Size, /*IsPCRel=*/true), + MI.getLoc())); } +} - int64_t I = MCO.getImm(); - - // Store 8 as 0, thus making range 1-8 - if (Type == M68kBeads::Imm3 && Alt) { - assert(I && "Cannot encode Alt Imm3 zero value"); - I %= 8; +void M68kMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &Op, + unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // Register + if (Op.isReg()) { + unsigned RegNum = Op.getReg(); + const auto *RI = Ctx.getRegisterInfo(); + Value |= RI->getEncodingValue(RegNum); + // Setup the D/A bit + if (M68kII::isAddressRegister(RegNum)) + Value |= 0b1000; + } else if (Op.isImm()) { + // Immediate + Value |= static_cast(Op.getImm()); + } else if (Op.isExpr()) { + // Absolute address + int64_t Addr; + if (!Op.getExpr()->evaluateAsAbsolute(Addr)) + report_fatal_error("Unsupported asm expression. Only absolute address " + "can be placed here."); + Value |= static_cast(Addr); } else { - assert(isIntN(Size, I)); + llvm_unreachable("Unsupported operand type"); } - - uint64_t Imm = I; - - // 32 bit Imm requires HI16 first then LO16 - if (Size == 32) { - Offset += EmitConstant((Imm >> 16) & 0xFFFF, 16, Pad, Buffer, Offset); - EmitConstant(Imm & 0xFFFF, 16, Pad, Buffer, Offset); - return Size; - } - - return EmitConstant(Imm & ((1ULL << Size) - 1), Size, Pad, Buffer, Offset); } -#include "M68kGenMCCodeBeads.inc" - void M68kMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { unsigned Opcode = MI.getOpcode(); - const MCInstrDesc &Desc = MCII.get(Opcode); LLVM_DEBUG(dbgs() << "EncodeInstruction: " << MCII.getName(Opcode) << "(" << Opcode << ")\n"); - const uint8_t *Beads = getGenInstrBeads(MI); - if (!Beads || !*Beads) { - llvm_unreachable("*** Instruction does not have Beads defined"); - } - - uint64_t Buffer = 0; - unsigned Offset = 0; - unsigned ThisByte = 0; - - for (uint8_t Bead = *Beads; Bead; Bead = *++Beads) { - // Check for control beads - if (!(Bead & 0xF)) { - switch (Bead >> 4) { - case M68kBeads::Ignore: - continue; - } - } - - switch (Bead & 0xF) { - default: - llvm_unreachable("Unknown Bead code"); - break; - case M68kBeads::Bits1: - case M68kBeads::Bits2: - case M68kBeads::Bits3: - case M68kBeads::Bits4: - Offset += - encodeBits(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI); - break; - case M68kBeads::DAReg: - case M68kBeads::DA: - case M68kBeads::DReg: - case M68kBeads::Reg: - Offset += - encodeReg(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI); - break; - case M68kBeads::Disp8: - case M68kBeads::Imm8: - case M68kBeads::Imm16: - case M68kBeads::Imm32: - case 
M68kBeads::Imm3: - Offset += - encodeImm(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI); - break; - } - - // Since M68k is Big Endian we need to rotate each instruction word - while (Offset / 16) { - support::endian::write(OS, Buffer, support::big); - Buffer >>= 16; - Offset -= 16; - ThisByte += 2; + // Try using the new method first. + APInt EncodedInst(16, 0U); + APInt Scratch(16, 0U); + getBinaryCodeForInstr(MI, Fixups, EncodedInst, Scratch, STI); + + ArrayRef Data(EncodedInst.getRawData(), EncodedInst.getNumWords()); + int64_t InstSize = EncodedInst.getBitWidth(); + for (uint64_t Word : Data) { + for (int i = 0; i < 4 && InstSize > 0; ++i, InstSize -= 16) { + support::endian::write(OS, static_cast(Word), + support::big); + Word >>= 16; } } - - assert(Offset == 0 && "M68k Instructions are % 2 bytes"); - assert((ThisByte && !(ThisByte % 2)) && "M68k Instructions are % 2 bytes"); } MCCodeEmitter *llvm::createM68kMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new M68kMCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h index aa53e13af4fc..0dc601ad876b 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h @@ -38,7 +38,6 @@ MCAsmBackend *createM68kAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCTargetOptions &Options); MCCodeEmitter *createM68kMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); /// Construct an M68k ELF object writer. diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 13cba8b079a9..196e492046b9 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp index 9bbb2938ab75..a4d63a62f6aa 100644 --- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp +++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp @@ -14,8 +14,8 @@ #include "MSP430.h" #include "TargetInfo/MSP430TargetInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -72,7 +72,7 @@ static const unsigned GR8DecoderTable[] = { static DecodeStatus DecodeGR8RegisterClass(MCInst &MI, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -90,7 +90,7 @@ static const unsigned GR16DecoderTable[] = { static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -100,16 +100,16 @@ static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo, } static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address, - const void *Decoder); + const 
MCDisassembler *Decoder); static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); #include "MSP430GenDisassemblerTables.inc" static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int64_t Imm; switch (Bits) { default: @@ -127,7 +127,7 @@ static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address, static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Reg = Bits & 15; unsigned Imm = Bits >> 4; diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index 953916776c57..23af7d1149ed 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -35,7 +35,7 @@ class MSP430AsmBackend : public MCAsmBackend { public: MSP430AsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI) : MCAsmBackend(support::little), OSABI(OSABI) {} - ~MSP430AsmBackend() override {} + ~MSP430AsmBackend() override = default; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef Data, diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index bb5351af6523..aa097ccb9de6 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -24,7 +24,7 @@ public: : MCELFObjectTargetWriter(false, OSABI, ELF::EM_MSP430, /*HasRelocationAddend*/ true) {} - ~MSP430ELFObjectWriter() override {} + ~MSP430ELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp index 087045ccb1df..0cdb3a595f71 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp @@ -12,6 +12,7 @@ #include "MSP430MCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCSectionELF.h" @@ -42,7 +43,7 @@ MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S, // MSP430 EABI (slaa534.pdf, part 13). MCSection *AttributeSection = getStreamer().getContext().getELFSection( ".MSP430.attributes", ELF::SHT_MSP430_ATTRIBUTES, 0); - Streamer.SwitchSection(AttributeSection); + Streamer.switchSection(AttributeSection); // Format version. 
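// (0x41 is ASCII 'A', the conventional first byte of an ELF attributes
// section; ARM and RISC-V build attributes begin with the same magic.)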
Streamer.emitInt8(0x41); diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp index cf57e87a073d..2b16c6234a51 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp @@ -167,7 +167,7 @@ unsigned MSP430MCCodeEmitter::getCGImmOpValue(const MCInst &MI, unsigned Op, const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(Op); assert(MO.isImm() && "Expr operand expected"); - + int64_t Imm = MO.getImm(); switch (Imm) { default: @@ -200,7 +200,6 @@ unsigned MSP430MCCodeEmitter::getCCOpValue(const MCInst &MI, unsigned Op, } MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new MSP430MCCodeEmitter(Ctx, MCII); } diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index 02bfbe40c6bf..24b0b3298592 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -31,7 +31,6 @@ class MCTargetStreamer; /// Creates a machine code emitter for MSP430. MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createMSP430MCAsmBackend(const Target &T, diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp index 8eb3fbd58328..85c59d5b14b5 100644 --- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -166,11 +166,11 @@ void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { MCSection *IV = OutStreamer->getContext().getELFSection( "__interrupt_vector_" + IVIdx, ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR); - OutStreamer->SwitchSection(IV); + OutStreamer->switchSection(IV); const MCSymbol *FunctionSymbol = getSymbol(F); OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); - OutStreamer->SwitchSection(Cur); + OutStreamer->switchSection(Cur); } bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index abd48dfd5139..b623730e1574 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -18,7 +18,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" @@ -255,7 +254,7 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N, Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ? CurDAG->getTargetFrameIndex( AM.Base.FrameIndex, - getTargetLowering()->getPointerTy(CurDAG->getDataLayout())) + N.getValueType()) : AM.Base.Reg; if (AM.GV) diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index aebfc6b0ae2e..73ab3b52e907 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -670,16 +670,17 @@ SDValue MSP430TargetLowering::LowerCCCArguments( InVals.push_back(ArgValue); } } else { - // Only arguments passed on the stack should make it here. + // Only arguments passed on the stack should make it here. 
assert(VA.isMemLoc()); SDValue InVal; ISD::ArgFlagsTy Flags = Ins[i].Flags; if (Flags.isByVal()) { + MVT PtrVT = VA.getLocVT(); int FI = MFI.CreateFixedObject(Flags.getByValSize(), VA.getLocMemOffset(), true); - InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + InVal = DAG.getFrameIndex(FI, PtrVT); } else { // Load the argument to a virtual register unsigned ObjSize = VA.getLocVT().getSizeInBits()/8; @@ -777,13 +778,14 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (!Reg) llvm_unreachable("sret virtual register not created in entry block"); + MVT PtrVT = getFrameIndexTy(DAG.getDataLayout()); SDValue Val = - DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy(DAG.getDataLayout())); + DAG.getCopyFromReg(Chain, dl, Reg, PtrVT); unsigned R12 = MSP430::R12; Chain = DAG.getCopyToReg(Chain, dl, R12, Val, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(R12, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(DAG.getRegister(R12, PtrVT)); } unsigned Opc = (CallConv == CallingConv::MSP430_INTR ? @@ -814,7 +816,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo( // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrVT = getFrameIndexTy(DAG.getDataLayout()); Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); @@ -1010,7 +1012,7 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + EVT PtrVT = Op.getValueType(); // Create the TargetGlobalAddress node, folding in the constant offset. SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), PtrVT, Offset); @@ -1021,7 +1023,7 @@ SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); const char *Sym = cast(Op)->getSymbol(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + EVT PtrVT = Op.getValueType(); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT); return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); @@ -1030,8 +1032,8 @@ SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op, SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - auto PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); + EVT PtrVT = Op.getValueType(); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT); return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); @@ -1248,11 +1250,11 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); - auto PtrVT = getPointerTy(MF.getDataLayout()); + MVT PtrVT = getFrameIndexTy(MF.getDataLayout()); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. 
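// The slot is one pointer wide and sits immediately below the frame base
// (offset -SlotSize); deriving the width from PtrVT.getStoreSize() keeps it
// in sync with the frame-index type instead of the raw DataLayout pointer
// size.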
- uint64_t SlotSize = MF.getDataLayout().getPointerSize(); + uint64_t SlotSize = PtrVT.getStoreSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -SlotSize, true); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -1271,12 +1273,12 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + EVT PtrVT = Op.getValueType(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16); + DAG.getConstant(PtrVT.getStoreSize(), dl, MVT::i16); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); @@ -1308,7 +1310,9 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + + SDValue Ptr = Op.getOperand(1); + EVT PtrVT = Ptr.getValueType(); // Frame index of first vararg argument SDValue FrameIndex = @@ -1316,14 +1320,14 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, const Value *SV = cast(Op.getOperand(2))->getValue(); // Create a store of the frame index to the location operand - return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex, Op.getOperand(1), + return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex, Ptr, MachinePointerInfo(SV)); } SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + EVT PtrVT = Op.getValueType(); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Result); } diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index e9e26e295fd5..0646d6faebed 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -197,8 +197,7 @@ bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a JMP, delete them. 
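The analyzeBranch hunk below collapses an erase-one-at-a-time loop into a single range erase, which is clearer and avoids repeated iterator fix-ups. The same transformation on a plain std::list (illustrative sketch, not LLVM code):

    #include <cassert>
    #include <iterator>
    #include <list>

    int main() {
      std::list<int> Block = {10, 20, 30, 40}; // pretend 20 is the JMP
      auto I = std::next(Block.begin());       // iterator at the JMP
      // Old shape: while (std::next(I) != Block.end()) Block.erase(std::next(I));
      Block.erase(std::next(I), Block.end());  // new shape: one range erase
      assert(Block.size() == 2 && Block.back() == 20);
      return 0;
    }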
-    while (std::next(I) != MBB.end())
-      std::next(I)->eraseFromParent();
+    MBB.erase(std::next(I), MBB.end());

     Cond.clear();
     FBB = nullptr;
diff --git a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
index 1d3a6d118bd6..93b37b523a71 100644
--- a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -11,3 +11,10 @@
 using namespace llvm;

 void MSP430MachineFunctionInfo::anchor() { }
+
+MachineFunctionInfo *MSP430MachineFunctionInfo::clone(
+    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+    const {
+  return DestMF.cloneInfo<MSP430MachineFunctionInfo>(*this);
+}
diff --git a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 261db9e288f5..93b388255877 100644
--- a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -43,6 +43,11 @@ public:
   explicit MSP430MachineFunctionInfo(MachineFunction &MF)
     : CalleeSavedFrameSize(0), ReturnAddrIndex(0), SRetReturnReg(0) {}

+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override;
+
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index a33146ce2239..6bba224aab8b 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -27,9 +27,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
 }

 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
-  if (!RM.hasValue())
-    return Reloc::Static;
-  return *RM;
+  return RM.value_or(Reloc::Static);
 }

 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -51,7 +49,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
   initAsmInfo();
 }

-MSP430TargetMachine::~MSP430TargetMachine() {}
+MSP430TargetMachine::~MSP430TargetMachine() = default;

 namespace {
 /// MSP430 Code Generator Pass Configuration Options.
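Two modernizations recur throughout this import and show up in the MSP430 hunks above: llvm::Optional's value_or collapses the hasValue()/dereference pattern into one call, and a ranged container erase replaces element-at-a-time deletion loops. A minimal standalone sketch of both idioms, using std::optional and std::list as stand-ins for the LLVM types so it compiles on its own:

// Sketch only: std::optional and std::list stand in for llvm::Optional and
// the MachineBasicBlock instruction list; RelocModel is an invented enum.
#include <cassert>
#include <iterator>
#include <list>
#include <optional>

enum class RelocModel { Static, PIC_ };

// Before: if (!RM.has_value()) return RelocModel::Static; return *RM;
static RelocModel getEffectiveRelocModel(std::optional<RelocModel> RM) {
  return RM.value_or(RelocModel::Static); // one call, same behavior
}

int main() {
  assert(getEffectiveRelocModel(std::nullopt) == RelocModel::Static);
  assert(getEffectiveRelocModel(RelocModel::PIC_) == RelocModel::PIC_);

  // Before: while (std::next(I) != L.end()) erase the element after I.
  // After: erase the whole tail in a single ranged call.
  std::list<int> L{10, 20, 30, 40};
  auto I = L.begin();              // analogous to the trailing JMP
  L.erase(std::next(I), L.end());  // drop everything after it
  assert(L.size() == 1 && L.front() == 10);
  return 0;
}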
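The clone() hook added to MSP430MachineFunctionInfo above follows the shape this import gives every target's function info: a virtual override that forwards to a templated helper, which copy-constructs the concrete type into the destination function's allocator. A hedged, LLVM-free sketch of that shape; Arena, FuncInfoBase, and cloneHelper are invented stand-ins for BumpPtrAllocator, MachineFunctionInfo, and MachineFunction::cloneInfo:

// Sketch under the assumptions named above; not the LLVM implementation.
#include <cstddef>
#include <new>

struct Arena {
  // Toy arena: plain operator new keeps the sketch self-contained.
  void *allocate(std::size_t Size) { return ::operator new(Size); }
};

struct FuncInfoBase {
  virtual ~FuncInfoBase() = default;
  // Lets a function be deep-copied without knowing the concrete info type.
  virtual FuncInfoBase *clone(Arena &A) const = 0;
};

// What a cloneInfo<T>-style helper does: placement-copy the derived object.
template <typename T> static T *cloneHelper(Arena &A, const T &Src) {
  return new (A.allocate(sizeof(T))) T(Src);
}

struct MSP430Info final : FuncInfoBase {
  unsigned CalleeSavedFrameSize = 0;
  FuncInfoBase *clone(Arena &A) const override {
    return cloneHelper(A, *this); // mirrors DestMF.cloneInfo<...>(*this)
  }
};

int main() {
  Arena A;
  MSP430Info Src;
  Src.CalleeSavedFrameSize = 4;
  FuncInfoBase *Copy = Src.clone(A);
  bool Ok = static_cast<MSP430Info *>(Copy)->CalleeSavedFrameSize == 4;
  Copy->~FuncInfoBase(); // toy arena: destroy explicitly, bytes are leaked
  return Ok ? 0 : 1;
}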
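The Mips changes that follow are dominated by one mechanical migration: every Decode* callback in the disassembler drops its untyped const void *Decoder parameter in favor of const MCDisassembler *, so helpers such as getReg no longer have to static_cast an opaque pointer. A reduced sketch of the before/after shape; Disassembler, decodeRegOld, and decodeRegNew are illustrative names, not the LLVM ones:

// Sketch only: Disassembler stands in for llvm::MCDisassembler.
#include <cassert>

struct Disassembler {
  unsigned RegBase = 100;
};

// Before: an opaque pointer that every callback had to cast blindly.
static unsigned decodeRegOld(unsigned RegNo, const void *Decoder) {
  const auto *Dis = static_cast<const Disassembler *>(Decoder); // unchecked
  return Dis->RegBase + RegNo;
}

// After: the parameter carries its real type; no cast, and passing the
// wrong pointer now fails to compile instead of misbehaving at run time.
static unsigned decodeRegNew(unsigned RegNo, const Disassembler *Decoder) {
  return Decoder->RegBase + RegNo;
}

int main() {
  Disassembler D;
  assert(decodeRegOld(3, &D) == decodeRegNew(3, &D));
  return 0;
}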
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 736c41f8ac03..b5817d9ae700 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
@@ -3412,10 +3413,10 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
   const MipsMCExpr *LoExpr =
       MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());

-  getStreamer().SwitchSection(ReadOnlySection);
+  getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
   getStreamer().emitInt32(ImmOp32);
-  getStreamer().SwitchSection(CS);
+  getStreamer().switchSection(CS);

   if (emitPartialAddress(TOut, IDLoc, Sym))
     return true;
@@ -3464,11 +3465,11 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   const MipsMCExpr *LoExpr =
       MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());

-  getStreamer().SwitchSection(ReadOnlySection);
+  getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
   getStreamer().emitValueToAlignment(8);
   getStreamer().emitIntValue(ImmOp64, 8);
-  getStreamer().SwitchSection(CS);
+  getStreamer().switchSection(CS);

   unsigned TmpReg = getATReg(IDLoc);
   if (!TmpReg)
@@ -3547,11 +3548,11 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
   const MipsMCExpr *LoExpr =
       MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());

-  getStreamer().SwitchSection(ReadOnlySection);
+  getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
   getStreamer().emitValueToAlignment(8);
   getStreamer().emitIntValue(ImmOp64, 8);
-  getStreamer().SwitchSection(CS);
+  getStreamer().switchSection(CS);

   if (emitPartialAddress(TOut, IDLoc, Sym))
     return true;
@@ -8179,7 +8180,7 @@ bool MipsAsmParser::parseRSectionDirective(StringRef Section) {
   MCSection *ELFSection = getContext().getELFSection(
       Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
-  getParser().getStreamer().SwitchSection(ELFSection);
+  getParser().getStreamer().switchSection(ELFSection);

   getParser().Lex(); // Eat EndOfStatement token.
   return false;
@@ -8197,7 +8198,7 @@ bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
   MCSection *ELFSection = getContext().getELFSection(
       Section, Type, ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
-  getParser().getStreamer().SwitchSection(ELFSection);
+  getParser().getStreamer().switchSection(ELFSection);

   getParser().Lex(); // Eat EndOfStatement token.
   return false;
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 9a66dd77c0d3..4e40a84ecfd0 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -15,8 +15,8 @@
 #include "TargetInfo/MipsTargetInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDecoderOps.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -79,338 +79,279 @@ public:

 // Forward declare these because the autogenerated code will reference them.
// Definitions are further down. -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); +static DecodeStatus +DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); +static DecodeStatus +DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, - unsigned RegNo, 
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchTarget(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeJumpTarget(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchTarget21(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchTarget26(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset, uint64_t Address, - const void 
*Decoder); + const MCDisassembler *Decoder); // DecodeBranchTarget7MM - Decode microMIPS branch offset, which is // shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); // DecodeBranchTarget10MM - Decode microMIPS branch offset, which is // shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); // DecodeBranchTargetMM - Decode microMIPS branch offset, which is // shifted left by 1 bit. -static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); // DecodeBranchTarget26MM - Decode microMIPS branch offset, which is // shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. -static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); // DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target, // which is shifted left by 2 bit. -static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMem(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemEVA(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeLoadByte15(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodePrefeOpMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeSyncI(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeSyncI(MCInst &Inst, 
unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeSyncI_MM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeSynciR6(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMImm4(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMImm9(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMImm12(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeMemMMImm16(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); 
-static DecodeStatus DecodeLi16Imm(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodeLi16Imm(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeUImmWithOffsetAndScale(Inst, Value, Address, Decoder); } @@ -418,128 +359,132 @@ static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, template static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeInsSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't /// handle. 
template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); template static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template -static DecodeStatus -DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, + uint64_t Address, + const MCDisassembler *Decoder); template static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t 
Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static MCDisassembler *createMipsDisassembler( const Target &T, @@ -569,16 +514,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsDisassembler() { #include "MipsGenDisassemblerTables.inc" -static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { - const MipsDisassembler *Dis = static_cast(D); - const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); +static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { + const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); return *(RegInfo->getRegClass(RC).begin() + RegNo); } template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { - using DecodeFN = DecodeStatus (*)(MCInst &, unsigned, uint64_t, const void *); + const MCDisassembler *Decoder) { + using DecodeFN = + DecodeStatus (*)(MCInst &, unsigned, uint64_t, const MCDisassembler *); // The size of the n field depends on the element size // The register class also depends on this. @@ -624,7 +569,8 @@ static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, template static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { InsnType Rs = fieldFromInstruction(insn, 16, 5); InsnType Imm = fieldFromInstruction(insn, 0, 16); MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID, @@ -638,7 +584,7 @@ static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { InsnType Rs = fieldFromInstruction(insn, 21, 5); InsnType Imm = fieldFromInstruction(insn, 0, 16); MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID, @@ -653,7 +599,7 @@ static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address, template static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the ADDI instruction from the earlier // ISA's instead). 
@@ -692,7 +638,7 @@ static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, template static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { InsnType Rt = fieldFromInstruction(insn, 21, 5); InsnType Rs = fieldFromInstruction(insn, 16, 5); int64_t Imm = 0; @@ -726,7 +672,7 @@ static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the ADDI instruction from the earlier // ISA's instead). @@ -765,7 +711,7 @@ static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, template static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { InsnType Rt = fieldFromInstruction(insn, 21, 5); InsnType Rs = fieldFromInstruction(insn, 16, 5); int64_t Imm = 0; @@ -799,7 +745,7 @@ static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // We have: // 0b110101 ttttt sssss iiiiiiiiiiiiiiii // Invalid if rt == 0 @@ -838,7 +784,7 @@ static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // We have: // 0b111101 ttttt sssss iiiiiiiiiiiiiiii // Invalid if rt == 0 @@ -877,7 +823,7 @@ static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the BLEZL instruction from the earlier // ISA's instead). @@ -920,7 +866,7 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, template static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the BGTZL instruction from the earlier // ISA's instead). @@ -964,7 +910,7 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, template static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the BGTZ instruction from the earlier // ISA's instead). @@ -1012,8 +958,8 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, template static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled // (otherwise we would have matched the BLEZL instruction from the earlier // ISA's instead). 
@@ -1056,7 +1002,7 @@ static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, // for feature / behaviour parity with binutils. template static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Msbd = fieldFromInstruction(Insn, 11, 5); unsigned Lsb = fieldFromInstruction(Insn, 6, 5); unsigned Size = 0; @@ -1098,7 +1044,7 @@ static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address, // for feature / behaviour parity with binutils. template static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Msbd = fieldFromInstruction(Insn, 11, 5); unsigned Lsb = fieldFromInstruction(Insn, 6, 5); unsigned Size = 0; @@ -1140,7 +1086,7 @@ static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address, // Auto-generated decoder wouldn't add the third operand for CRC32*. template static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { InsnType Rs = fieldFromInstruction(Insn, 21, 5); InsnType Rt = fieldFromInstruction(Insn, 16, 5); MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, @@ -1384,17 +1330,15 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return MCDisassembler::Fail; } -static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { return MCDisassembler::Fail; } -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1403,10 +1347,9 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo); @@ -1414,10 +1357,9 @@ static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo); @@ -1425,10 +1367,9 @@ static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo); @@ -1436,10 +1377,9 @@ static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst, return 
MCDisassembler::Success; } -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); @@ -1447,27 +1387,24 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (static_cast(Decoder)->isGP64()) return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder); return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1476,10 +1413,9 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1488,10 +1424,9 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo); @@ -1499,10 +1434,9 @@ static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo); @@ -1512,7 +1446,7 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1521,10 +1455,8 @@ static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeMem(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1543,10 +1475,8 @@ static 
DecodeStatus DecodeMem(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemEVA(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1564,10 +1494,9 @@ static DecodeStatus DecodeMemEVA(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLoadByte15(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Base = fieldFromInstruction(Insn, 16, 5); unsigned Reg = fieldFromInstruction(Insn, 21, 5); @@ -1582,10 +1511,8 @@ static DecodeStatus DecodeLoadByte15(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheOp(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Hint = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1599,10 +1526,9 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0xfff); unsigned Base = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); @@ -1616,10 +1542,9 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodePrefeOpMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); unsigned Base = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); @@ -1633,10 +1558,9 @@ static DecodeStatus DecodePrefeOpMM(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1650,10 +1574,8 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeSyncI(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1666,7 +1588,8 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, } static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Base = fieldFromInstruction(Insn, 
16, 5); @@ -1678,10 +1601,8 @@ static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeSynciR6(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Immediate = SignExtend32<16>(Insn & 0xffff); unsigned Base = fieldFromInstruction(Insn, 16, 5); @@ -1694,7 +1615,8 @@ static DecodeStatus DecodeSynciR6(MCInst &Inst, } static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); unsigned Reg = fieldFromInstruction(Insn, 6, 5); unsigned Base = fieldFromInstruction(Insn, 11, 5); @@ -1739,10 +1661,9 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMImm4(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Offset = Insn & 0xf; unsigned Reg = fieldFromInstruction(Insn, 7, 3); unsigned Base = fieldFromInstruction(Insn, 4, 3); @@ -1797,10 +1718,9 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x1F; unsigned Reg = fieldFromInstruction(Insn, 5, 5); @@ -1813,10 +1733,9 @@ static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x7F; unsigned Reg = fieldFromInstruction(Insn, 7, 3); @@ -1829,10 +1748,9 @@ static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset; switch (Inst.getOpcode()) { case Mips::LWM16_MMR6: @@ -1854,10 +1772,9 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMImm9(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); unsigned Reg = fieldFromInstruction(Insn, 21, 5); unsigned Base = fieldFromInstruction(Insn, 16, 5); @@ -1875,10 +1792,9 @@ static DecodeStatus DecodeMemMMImm9(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMImm12(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0x0fff); unsigned Reg = fieldFromInstruction(Insn, 21, 5); unsigned Base = fieldFromInstruction(Insn, 16, 5); @@ -1910,10 +1826,9 @@ static DecodeStatus 
DecodeMemMMImm12(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMemMMImm16(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Reg = fieldFromInstruction(Insn, 21, 5); unsigned Base = fieldFromInstruction(Insn, 16, 5); @@ -1928,10 +1843,8 @@ static DecodeStatus DecodeMemMMImm16(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFMem(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1947,7 +1860,8 @@ static DecodeStatus DecodeFMem(MCInst &Inst, } static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // This function is the same as DecodeFMem but with the Reg and Base fields // swapped according to microMIPS spec. int Offset = SignExtend32<16>(Insn & 0xffff); @@ -1964,10 +1878,8 @@ static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeFMem2(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1982,10 +1894,8 @@ static DecodeStatus DecodeFMem2(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFMem3(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -2000,10 +1910,9 @@ static DecodeStatus DecodeFMem3(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); unsigned Reg = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 11, 5); @@ -2019,7 +1928,8 @@ static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, } static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); unsigned Reg = fieldFromInstruction(Insn, 21, 5); unsigned Base = fieldFromInstruction(Insn, 16, 5); @@ -2034,10 +1944,9 @@ static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff); unsigned Rt = 
fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -2056,10 +1965,9 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Currently only hardware register 29 is supported. if (RegNo != 29) return MCDisassembler::Fail; @@ -2067,10 +1975,9 @@ static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 30 || RegNo %2) return MCDisassembler::Fail; @@ -2079,10 +1986,9 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 4) return MCDisassembler::Fail; @@ -2091,10 +1997,9 @@ static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 4) return MCDisassembler::Fail; @@ -2103,10 +2008,9 @@ static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 4) return MCDisassembler::Fail; @@ -2115,10 +2019,9 @@ static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2127,10 +2030,9 @@ static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2139,10 +2041,9 @@ static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2151,10 +2052,9 @@ static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus 
DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2163,10 +2063,9 @@ static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; @@ -2175,10 +2074,9 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2187,10 +2085,9 @@ static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -2199,122 +2096,109 @@ static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = (SignExtend32<16>(Offset) * 4) + 4; Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = (SignExtend32<16>(Offset) * 2); Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeJumpTarget(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2; Inst.addOperand(MCOperand::createImm(JumpOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget21(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<21>(Offset) * 4 + 4; Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<21>(Offset) * 4 + 4; Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget26(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) 
{ int32_t BranchOffset = SignExtend32<26>(Offset) * 4 + 4; Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<8>(Offset << 1); Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<11>(Offset << 1); Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, - unsigned Offset, +static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<16>(Offset) * 2 + 4; Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, - unsigned Offset, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset, + uint64_t Address, + const MCDisassembler *Decoder) { int32_t BranchOffset = SignExtend32<27>(Offset << 1); Inst.addOperand(MCOperand::createImm(BranchOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1; Inst.addOperand(MCOperand::createImm(JumpOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2; Inst.addOperand(MCOperand::createImm(JumpOffset)); return MCDisassembler::Success; } -static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Value == 0) Inst.addOperand(MCOperand::createImm(1)); else if (Value == 0x7) @@ -2324,10 +2208,9 @@ static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLi16Imm(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodeLi16Imm(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Value == 0x7F) Inst.addOperand(MCOperand::createImm(-1)); else @@ -2335,18 +2218,17 @@ static DecodeStatus DecodeLi16Imm(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, - unsigned Value, +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, unsigned Value, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 
8 : Value)); return MCDisassembler::Success; } template -static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, + const MCDisassembler *Decoder) { Value &= ((1 << Bits) - 1); Value *= Scale; Inst.addOperand(MCOperand::createImm(Value + Offset)); @@ -2354,18 +2236,16 @@ static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, } template -static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, + const MCDisassembler *Decoder) { int32_t Imm = SignExtend32(Value) * ScaleBy; Inst.addOperand(MCOperand::createImm(Imm + Offset)); return MCDisassembler::Success; } -static DecodeStatus DecodeInsSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { // First we need to grab the pos(lsb) from MCInst. // This function only handles the 32 bit variants of ins, as dins // variants are handled differently. @@ -2376,19 +2256,21 @@ static DecodeStatus DecodeInsSize(MCInst &Inst, } static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4)); return MCDisassembler::Success; } static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<18>(Insn) * 8)); return MCDisassembler::Success; } -static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder) { int32_t DecodedValue; switch (Insn) { case 0: DecodedValue = 256; break; @@ -2402,7 +2284,8 @@ static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // Insn must be >= 0, since it is unsigned that condition is always true. 
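  // Insn is a 4-bit field here, so it selects one of the 16 immediates in the table below.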
assert(Insn < 16); int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, @@ -2411,10 +2294,9 @@ static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeRegListOperand(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, Mips::FP}; unsigned RegNum; @@ -2442,7 +2324,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; unsigned RegLst; switch(Inst.getOpcode()) { @@ -2465,8 +2347,8 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned RegPair = fieldFromInstruction(Insn, 7, 3); if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) == MCDisassembler::Fail) @@ -2491,7 +2373,8 @@ static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, } static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { switch (RegPair) { default: return MCDisassembler::Fail; @@ -2533,15 +2416,16 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, } static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<25>(Insn << 2))); return MCDisassembler::Success; } template static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // We have: // 0b000111 ttttt sssss iiiiiiiiiiiiiiii // Invalid if rt == 0 @@ -2589,8 +2473,8 @@ static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, template static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { // We have: // 0b000110 ttttt sssss iiiiiiiiiiiiiiii // Invalid if rt == 0 diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h index 6091ee24b04d..1a5bb64863ee 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H -#include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MipsABIFlags.h" #include @@ -17,6 +16,7 @@ namespace llvm { class MCStreamer; +class StringRef; struct MipsABIFlagsSection { // Internal representation of the fp_abi related values used in .module. 
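A note on the branch-target decoders above: each one sign-extends the raw immediate field, scales it to a byte offset, and, for delay-slot branches, adds 4. A minimal standalone sketch of DecodeBranchTarget's arithmetic, assuming the offset is taken relative to the branch instruction itself; the helper names are illustrative and not part of this patch:

    #include <cstdint>

    // Mirrors SignExtend32<16>(Field): sign-extend a 16-bit field to 32 bits.
    // Relies on two's-complement conversion and arithmetic right shift,
    // well-defined since C++20 and universal in practice before that.
    static int32_t signExtend16(uint32_t Field) {
      return static_cast<int32_t>(Field << 16) >> 16;
    }

    // Mirrors DecodeBranchTarget: words to bytes, biased past the branch.
    int32_t branchTargetOffset(uint32_t Imm16) {
      return signExtend16(Imm16) * 4 + 4;
    }

    // Example: Imm16 = 0xFFFF (-1) yields offset 0, a branch to itself,
    // which is how "b ." is encoded.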
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 3315a8ba18d6..227947d2766e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -9,8 +9,10 @@ #include "MipsABIInfo.h" #include "MipsRegisterInfo.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index a3dbe6f84a1e..8050f9b8cae0 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -301,6 +301,15 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, } Optional MipsAsmBackend::getFixupKind(StringRef Name) const { + unsigned Type = llvm::StringSwitch(Name) + .Case("BFD_RELOC_NONE", ELF::R_MIPS_NONE) + .Case("BFD_RELOC_16", ELF::R_MIPS_16) + .Case("BFD_RELOC_32", ELF::R_MIPS_32) + .Case("BFD_RELOC_64", ELF::R_MIPS_64) + .Default(-1u); + if (Type != -1u) + return static_cast(FirstLiteralRelocationKind + Type); + return StringSwitch>(Name) .Case("R_MIPS_NONE", FK_NONE) .Case("R_MIPS_32", FK_Data_4) @@ -502,6 +511,8 @@ getFixupKindInfo(MCFixupKind Kind) const { static_assert(array_lengthof(BigEndianInfos) == Mips::NumTargetFixupKinds, "Not all MIPS big endian fixup kinds added!"); + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -534,6 +545,8 @@ bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return true; const unsigned FixupKind = Fixup.getKind(); switch (FixupKind) { default: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 9c317e3f8840..4990696fcfe0 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -220,6 +220,8 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, bool IsPCRel) const { // Determine the type of the relocation. 
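  // Fixups built from "BFD_RELOC_*" names (see getFixupKind above) carry the raw ELF relocation type biased by FirstLiteralRelocationKind, so the new early return below strips the bias and emits that type as-is, bypassing the switch.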
unsigned Kind = Fixup.getTargetKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; switch (Kind) { case FK_NONE: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index e6e32ec7f27c..9843b6144343 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -90,9 +90,9 @@ void MipsELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { Labels.push_back(Symbol); } -void MipsELFStreamer::SwitchSection(MCSection *Section, +void MipsELFStreamer::switchSection(MCSection *Section, const MCExpr *Subsection) { - MCELFStreamer::SwitchSection(Section, Subsection); + MCELFStreamer::switchSection(Section, Subsection); Labels.clear(); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index f6a2c039c0c3..ac70e40d4dfe 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -50,7 +50,7 @@ public: /// Overriding this function allows us to dismiss all labels that are /// candidates for marking as microMIPS when .section directive is processed. - void SwitchSection(MCSection *Section, + void switchSection(MCSection *Section, const MCExpr *Subsection = nullptr) override; /// Overriding these functions allows us to dismiss all labels that are diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp index 3700d6309e1a..632192103d38 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp @@ -88,29 +88,30 @@ void MipsInstPrinter::printInst(const MCInst *MI, uint64_t Address, break; case Mips::Save16: O << "\tsave\t"; - printSaveRestore(MI, O); + printSaveRestore(MI, STI, O); O << " # 16 bit inst\n"; return; case Mips::SaveX16: O << "\tsave\t"; - printSaveRestore(MI, O); + printSaveRestore(MI, STI, O); O << "\n"; return; case Mips::Restore16: O << "\trestore\t"; - printSaveRestore(MI, O); + printSaveRestore(MI, STI, O); O << " # 16 bit inst\n"; return; case Mips::RestoreX16: O << "\trestore\t"; - printSaveRestore(MI, O); + printSaveRestore(MI, STI, O); O << "\n"; return; } // Try to print any aliases first. 
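  // Both alias paths now receive Address and STI so that PC-relative branch aliases can print resolved targets instead of raw immediates (see printBranchOperand below).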
- if (!printAliasInstr(MI, Address, O) && !printAlias(*MI, O)) - printInstruction(MI, Address, O); + if (!printAliasInstr(MI, Address, STI, O) && + !printAlias(*MI, Address, STI, O)) + printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); switch (MI->getOpcode()) { @@ -123,7 +124,7 @@ void MipsInstPrinter::printInst(const MCInst *MI, uint64_t Address, } void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { printRegName(O, Op.getReg()); @@ -139,8 +140,42 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, Op.getExpr()->print(O, &MAI, true); } +void MipsInstPrinter::printJumpOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (!Op.isImm()) + return printOperand(MI, OpNo, STI, O); + + if (PrintBranchImmAsAddress) + O << formatHex(Op.getImm()); + else + O << formatImm(Op.getImm()); +} + +void MipsInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address, + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (!Op.isImm()) + return printOperand(MI, OpNo, STI, O); + + if (PrintBranchImmAsAddress) { + uint64_t Target = Address + Op.getImm(); + if (STI.hasFeature(Mips::FeatureMips32)) + Target &= 0xffffffff; + else if (STI.hasFeature(Mips::FeatureMips16)) + Target &= 0xffff; + O << formatHex(Target); + } else { + O << formatImm(Op.getImm()); + } +} + template -void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) { +void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &MO = MI->getOperand(opNum); if (MO.isImm()) { uint64_t Imm = MO.getImm(); @@ -151,11 +186,12 @@ void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) { return; } - printOperand(MI, opNum, O); + printOperand(MI, opNum, STI, O); } -void MipsInstPrinter:: -printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { +void MipsInstPrinter::printMemOperand(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { // Load/Store memory operands -- imm($reg) // If PIC target the target is loaded as the // pattern lw $25,%call16($28) @@ -175,24 +211,26 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { break; } - printOperand(MI, opNum+1, O); + printOperand(MI, opNum + 1, STI, O); O << "("; - printOperand(MI, opNum, O); + printOperand(MI, opNum, STI, O); O << ")"; } -void MipsInstPrinter:: -printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) { +void MipsInstPrinter::printMemOperandEA(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { // when using stack locations for not load/store instructions // print the same way as all normal 3 operand instructions. 
- printOperand(MI, opNum, O); + printOperand(MI, opNum, STI, O); O << ", "; - printOperand(MI, opNum+1, O); + printOperand(MI, opNum + 1, STI, O); } -void MipsInstPrinter:: -printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) { - const MCOperand& MO = MI->getOperand(opNum); +void MipsInstPrinter::printFCCOperand(const MCInst *MI, int opNum, + const MCSubtargetInfo & /* STI */, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(opNum); O << MipsFCCToString((Mips::CondCode)MO.getImm()); } @@ -202,82 +240,116 @@ printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) { } bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, - unsigned OpNo, raw_ostream &OS) { + uint64_t Address, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &OS, + bool IsBranch) { OS << "\t" << Str << "\t"; - printOperand(&MI, OpNo, OS); + if (IsBranch) + printBranchOperand(&MI, Address, OpNo, STI, OS); + else + printOperand(&MI, OpNo, STI, OS); return true; } bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, - unsigned OpNo0, unsigned OpNo1, - raw_ostream &OS) { - printAlias(Str, MI, OpNo0, OS); + uint64_t Address, unsigned OpNo0, + unsigned OpNo1, const MCSubtargetInfo &STI, + raw_ostream &OS, bool IsBranch) { + printAlias(Str, MI, Address, OpNo0, STI, OS, IsBranch); OS << ", "; - printOperand(&MI, OpNo1, OS); + if (IsBranch) + printBranchOperand(&MI, Address, OpNo1, STI, OS); + else + printOperand(&MI, OpNo1, STI, OS); return true; } -bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) { +bool MipsInstPrinter::printAlias(const MCInst &MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &OS) { switch (MI.getOpcode()) { case Mips::BEQ: case Mips::BEQ_MM: // beq $zero, $zero, $L2 => b $L2 // beq $r0, $zero, $L2 => beqz $r0, $L2 return (isReg(MI, 0) && isReg(MI, 1) && - printAlias("b", MI, 2, OS)) || - (isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS)); + printAlias("b", MI, Address, 2, STI, OS, true)) || + (isReg(MI, 1) && + printAlias("beqz", MI, Address, 0, 2, STI, OS, true)); case Mips::BEQ64: // beq $r0, $zero, $L2 => beqz $r0, $L2 - return isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS); + return isReg(MI, 1) && + printAlias("beqz", MI, Address, 0, 2, STI, OS, true); case Mips::BNE: case Mips::BNE_MM: // bne $r0, $zero, $L2 => bnez $r0, $L2 - return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); + return isReg(MI, 1) && + printAlias("bnez", MI, Address, 0, 2, STI, OS, true); case Mips::BNE64: // bne $r0, $zero, $L2 => bnez $r0, $L2 - return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); + return isReg(MI, 1) && + printAlias("bnez", MI, Address, 0, 2, STI, OS, true); case Mips::BGEZAL: // bgezal $zero, $L1 => bal $L1 - return isReg(MI, 0) && printAlias("bal", MI, 1, OS); + return isReg(MI, 0) && + printAlias("bal", MI, Address, 1, STI, OS, true); case Mips::BC1T: // bc1t $fcc0, $L1 => bc1t $L1 - return isReg(MI, 0) && printAlias("bc1t", MI, 1, OS); + return isReg(MI, 0) && + printAlias("bc1t", MI, Address, 1, STI, OS, true); case Mips::BC1F: // bc1f $fcc0, $L1 => bc1f $L1 - return isReg(MI, 0) && printAlias("bc1f", MI, 1, OS); + return isReg(MI, 0) && + printAlias("bc1f", MI, Address, 1, STI, OS, true); case Mips::JALR: + // jalr $zero, $r1 => jr $r1 // jalr $ra, $r1 => jalr $r1 - return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); + return (isReg(MI, 0) && + printAlias("jr", MI, Address, 1, STI, OS)) || + (isReg(MI, 0) && + printAlias("jalr", MI, Address, 1, STI, OS)); case Mips::JALR64: + // jalr $zero, $r1 => jr $r1 // 
jalr $ra, $r1 => jalr $r1 - return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); + return (isReg(MI, 0) && + printAlias("jr", MI, Address, 1, STI, OS)) || + (isReg(MI, 0) && + printAlias("jalr", MI, Address, 1, STI, OS)); case Mips::NOR: case Mips::NOR_MM: case Mips::NOR_MMR6: // nor $r0, $r1, $zero => not $r0, $r1 - return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); + return isReg(MI, 2) && + printAlias("not", MI, Address, 0, 1, STI, OS); case Mips::NOR64: // nor $r0, $r1, $zero => not $r0, $r1 - return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); + return isReg(MI, 2) && + printAlias("not", MI, Address, 0, 1, STI, OS); case Mips::OR: + case Mips::ADDu: // or $r0, $r1, $zero => move $r0, $r1 - return isReg(MI, 2) && printAlias("move", MI, 0, 1, OS); - default: return false; + // addu $r0, $r1, $zero => move $r0, $r1 + return isReg(MI, 2) && + printAlias("move", MI, Address, 0, 1, STI, OS); + default: + return false; } } -void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) { +void MipsInstPrinter::printSaveRestore(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (i != 0) O << ", "; if (MI->getOperand(i).isReg()) printRegName(O, MI->getOperand(i).getReg()); else - printUImm<16>(MI, i, O); + printUImm<16>(MI, i, STI, O); } } -void MipsInstPrinter:: -printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) { +void MipsInstPrinter::printRegisterList(const MCInst *MI, int opNum, + const MCSubtargetInfo & /* STI */, + raw_ostream &O) { // - 2 because register List is always first operand of instruction and it is // always followed by memory operand (base + offset). for (int i = opNum, e = MI->getNumOperands() - 2; i != e; ++i) { diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h index 68b13bf1fcc3..d91612b15a1a 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h @@ -80,38 +80,50 @@ public: // Autogenerated by tblgen. 
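  // The MCSubtargetInfo parameters on these tablegen'd entry points come from the new PassSubtarget = 1 setting on MipsAsmWriter in Mips.td.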
std::pair getMnemonic(const MCInst *MI) override; - void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); + void printInstruction(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &OS); void printCustomAliasOperand(const MCInst *MI, uint64_t Address, unsigned OpIdx, unsigned PrintMethodIdx, - raw_ostream &O); + const MCSubtargetInfo &STI, raw_ostream &O); private: - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum, - raw_ostream &O) { - printOperand(MI, OpNum, O); - } + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printJumpOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printBranchOperand(const MCInst *MI, uint64_t Address, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); template - void printUImm(const MCInst *MI, int opNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); - void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O); - void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O); + void printUImm(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &O); + void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &O); + void printMemOperandEA(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &O); void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O); - bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo, - raw_ostream &OS); - bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0, - unsigned OpNo1, raw_ostream &OS); - bool printAlias(const MCInst &MI, raw_ostream &OS); - void printSaveRestore(const MCInst *MI, raw_ostream &O); - void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O); + bool printAlias(const char *Str, const MCInst &MI, uint64_t Address, + unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &OS, + bool IsBranch = false); + bool printAlias(const char *Str, const MCInst &MI, uint64_t Address, + unsigned OpNo0, unsigned OpNo1, const MCSubtargetInfo &STI, + raw_ostream &OS, bool IsBranch = false); + bool printAlias(const MCInst &MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &OS); + void printSaveRestore(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + void printRegisterList(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, raw_ostream &O); }; } // end namespace llvm diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index b81ebedfb9c7..cf311337d5eb 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -42,13 +42,11 @@ using namespace llvm; namespace llvm { MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, - 
const MCRegisterInfo &MRI, MCContext &Ctx) { return new MipsMCCodeEmitter(MCII, Ctx, false); } MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new MipsMCCodeEmitter(MCII, Ctx, true); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index b7ecb0fdca5e..8531177ee924 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -31,10 +31,8 @@ class Target; class Triple; MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index befa883d5877..f1aa90d24023 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -24,7 +24,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { MipsTargetStreamer *MTS = static_cast(Streamer->getTargetStreamer()); - Streamer->PushSection(); + Streamer->pushSection(); // We need to distinguish between N64 and the rest because at the moment // we don't emit .Mips.options for other ELFs other than N64. @@ -38,7 +38,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1); MCA.registerSection(*Sec); Sec->setAlignment(Align(8)); - Streamer->SwitchSection(Sec); + Streamer->switchSection(Sec); Streamer->emitInt8(ELF::ODK_REGINFO); // kind Streamer->emitInt8(40); // size @@ -56,7 +56,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { ELF::SHF_ALLOC, 24); MCA.registerSection(*Sec); Sec->setAlignment(MTS->getABI().IsN32() ? 
Align(8) : Align(4)); - Streamer->SwitchSection(Sec); + Streamer->switchSection(Sec); Streamer->emitInt32(ri_gprmask); Streamer->emitInt32(ri_cprmask[0]); @@ -67,7 +67,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { Streamer->emitInt32(ri_gp_value); } - Streamer->PopSection(); + Streamer->popSection(); } void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 57cd016da4dc..caae5890fae1 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -11,17 +11,19 @@ //===----------------------------------------------------------------------===// #include "MipsTargetStreamer.h" -#include "MipsInstPrinter.h" #include "MCTargetDesc/MipsABIInfo.h" #include "MipsELFStreamer.h" +#include "MipsInstPrinter.h" #include "MipsMCExpr.h" #include "MipsMCTargetDesc.h" #include "MipsTargetObjectFile.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" @@ -38,6 +40,10 @@ static bool isMicroMips(const MCSubtargetInfo *STI) { return STI->getFeatureBits()[Mips::FeatureMicroMips]; } +static bool isMips32r6(const MCSubtargetInfo *STI) { + return STI->getFeatureBits()[Mips::FeatureMips32r6]; +} + MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; @@ -277,10 +283,18 @@ void MipsTargetStreamer::emitDSLL(unsigned DstReg, unsigned SrcReg, void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc, const MCSubtargetInfo *STI) { - if (hasShortDelaySlot) - emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI); - else - emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); + // The default case of `nop` is `sll $zero, $zero, 0`. + unsigned Opc = Mips::SLL; + if (isMicroMips(STI) && hasShortDelaySlot) { + Opc = isMips32r6(STI) ? Mips::MOVE16_MMR6 : Mips::MOVE16_MM; + emitRR(Opc, Mips::ZERO, Mips::ZERO, IDLoc, STI); + return; + } + + if (isMicroMips(STI)) + Opc = isMips32r6(STI) ? Mips::SLL_MMR6 : Mips::SLL_MM; + + emitRRI(Opc, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); } void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -900,8 +914,8 @@ void MipsTargetELFStreamer::finish() { unsigned Alignment = Section.getAlignment(); if (Alignment) { - OS.SwitchSection(&Section); - if (Section.UseCodeAlign()) + OS.switchSection(&Section); + if (Section.useCodeAlign()) OS.emitCodeAlignment(Alignment, &STI, Alignment); else OS.emitValueToAlignment(Alignment, 0, 1, Alignment); @@ -1012,9 +1026,9 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCA.registerSection(*Sec); Sec->setAlignment(Align(4)); - OS.PushSection(); + OS.pushSection(); - OS.SwitchSection(Sec); + OS.switchSection(Sec); OS.emitValueImpl(ExprRef, 4); @@ -1032,7 +1046,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { // the information gathered up until this point. GPRInfoSet = FPRInfoSet = FrameInfoSet = false; - OS.PopSection(); + OS.popSection(); // .end also implicitly sets the size. 
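  // A temporary symbol is emitted at the current PC so the size (current PC minus the function symbol) can be attached via emitELFSize.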
MCSymbol *CurPCSym = Context.createTempSymbol(); @@ -1312,7 +1326,7 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() { ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24); MCA.registerSection(*Sec); Sec->setAlignment(Align(8)); - OS.SwitchSection(Sec); + OS.switchSection(Sec); OS << ABIFlagsSection; } diff --git a/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td index b1a05388884b..26cc6ac4dd38 100644 --- a/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -15,6 +15,7 @@ def brtarget21_mm : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget21MM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget26_mm : Operand { @@ -22,6 +23,7 @@ def brtarget26_mm : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget26MM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtargetr6 : Operand { @@ -29,6 +31,7 @@ def brtargetr6 : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTargetMM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget_lsl2_mm : Operand { @@ -38,6 +41,7 @@ def brtarget_lsl2_mm : Operand { // set with DecodeDisambiguates let DecoderMethod = ""; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/llvm/lib/Target/Mips/MicroMipsInstrFPU.td index eea4d7746fa6..d5fc30cef695 100644 --- a/llvm/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/llvm/lib/Target/Mips/MicroMipsInstrFPU.td @@ -278,18 +278,32 @@ let DecoderNamespace = "MicroMips" in { } let DecoderNamespace = "MicroMips", DecoderMethod = "DecodeFMemMMR2" in { - def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, mem_mm_16, II_LDC1, load>, - LW_FM_MM<0x2f>, ISA_MICROMIPS, FGR_32 { + def LDC1_MM_D32 : MMRel, LW_FT<"ldc1", AFGR64Opnd, mem_mm_16, II_LDC1, load>, + LW_FM_MM<0x2f>, ISA_MICROMIPS, FGR_32 { let BaseOpcode = "LDC132"; } - def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_mm_16, II_SDC1, store>, - LW_FM_MM<0x2e>, ISA_MICROMIPS, FGR_32; + def SDC1_MM_D32 : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_mm_16, II_SDC1, store>, + LW_FM_MM<0x2e>, ISA_MICROMIPS, FGR_32 { + let BaseOpcode = "SDC164"; + } def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_mm_16, II_LWC1, load>, LW_FM_MM<0x27>, ISA_MICROMIPS; def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, mem_mm_16, II_SWC1, store>, LW_FM_MM<0x26>, ISA_MICROMIPS; } +let DecoderNamespace = "Mips64", DecoderMethod = "DecodeFMemMMR2" in { + def LDC1_MM_D64 : MMRel, LW_FT<"ldc1", FGR64Opnd, mem_mm_16, II_LDC1, load>, + LW_FM_MM<0x2f>, ISA_MICROMIPS, FGR_64 { + let BaseOpcode = "LDC164"; + } + def SDC1_MM_D64 : MMRel, SW_FT<"sdc1", FGR64Opnd, mem_mm_16, II_SDC1, store>, + LW_FM_MM<0x2e>, ISA_MICROMIPS, FGR_64 { + let BaseOpcode = "SDC164"; + } +} + + multiclass C_COND_MM fmt, InstrItinClass itin> { def C_F_#NAME#_MM : MMRel, C_COND_FT<"f", TypeStr, RC, itin>, @@ -400,8 +414,10 @@ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4, // Patterns for loads/stores with a reg+imm operand. 
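// FGR_32 selects the FP32 register mode (AFGR64, paired 32-bit FPU registers); FGR_64 selects FP64 mode (native 64-bit FGR64 registers), hence the separate D32/D64 ldc1/sdc1 definitions above and the per-mode patterns below.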
let AddedComplexity = 40 in { - def : LoadRegImmPat, ISA_MICROMIPS, FGR_32; - def : StoreRegImmPat, ISA_MICROMIPS, FGR_32; + def : LoadRegImmPat, ISA_MICROMIPS, FGR_32; + def : StoreRegImmPat, ISA_MICROMIPS, FGR_32; + def : LoadRegImmPat, ISA_MICROMIPS, FGR_64; + def : StoreRegImmPat, ISA_MICROMIPS, FGR_64; def : LoadRegImmPat, ISA_MICROMIPS; def : StoreRegImmPat, ISA_MICROMIPS; } diff --git a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td index 5f6354e19ebc..43b8eb7faf0e 100644 --- a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td @@ -163,10 +163,12 @@ def mem_mm_4sp : Operand { def jmptarget_mm : Operand { let EncoderMethod = "getJumpTargetOpValueMM"; + let PrintMethod = "printJumpOperand"; } def calltarget_mm : Operand { let EncoderMethod = "getJumpTargetOpValueMM"; + let PrintMethod = "printJumpOperand"; } def brtarget7_mm : Operand { @@ -174,6 +176,7 @@ def brtarget7_mm : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget7MM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget10_mm : Operand { @@ -181,6 +184,7 @@ def brtarget10_mm : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget10MM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget_mm : Operand { @@ -188,6 +192,7 @@ def brtarget_mm : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTargetMM"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def simm23_lsl2 : Operand { diff --git a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp index 55d3c59cbf03..b0de8dacf691 100644 --- a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -774,7 +774,7 @@ bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI, bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); + Subtarget = &MF.getSubtarget(); // TODO: Add support for the subtarget microMIPS32R6. 
if (!Subtarget->inMicroMipsMode() || !Subtarget->hasMips32r2() || diff --git a/llvm/lib/Target/Mips/Mips.h b/llvm/lib/Target/Mips/Mips.h index faf58545db62..12dc29bbfe85 100644 --- a/llvm/lib/Target/Mips/Mips.h +++ b/llvm/lib/Target/Mips/Mips.h @@ -38,6 +38,7 @@ namespace llvm { FunctionPass *createMicroMipsSizeReducePass(); FunctionPass *createMipsExpandPseudoPass(); FunctionPass *createMipsPreLegalizeCombiner(); + FunctionPass *createMipsPostLegalizeCombiner(bool IsOptNone); FunctionPass *createMipsMulMulBugPass(); InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &, @@ -48,6 +49,7 @@ namespace llvm { void initializeMipsBranchExpansionPass(PassRegistry &); void initializeMicroMipsSizeReducePass(PassRegistry &); void initializeMipsPreLegalizerCombinerPass(PassRegistry&); + void initializeMipsPostLegalizerCombinerPass(PassRegistry &); void initializeMipsMulMulBugFixPass(PassRegistry&); } // end namespace llvm; diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td index 792960332bcc..398c38e678ba 100644 --- a/llvm/lib/Target/Mips/Mips.td +++ b/llvm/lib/Target/Mips/Mips.td @@ -217,6 +217,7 @@ include "MipsSchedule.td" include "MipsInstrInfo.td" include "MipsCallingConv.td" include "MipsRegisterBanks.td" +include "MipsCombine.td" // Avoid forward declaration issues. include "MipsScheduleP5600.td" @@ -267,8 +268,13 @@ def MipsAsmParserVariant : AsmParserVariant { string RegisterPrefix = "$"; } +def MipsAsmWriter : AsmWriter { + int PassSubtarget = 1; +} + def Mips : Target { let InstructionSet = MipsInstrInfo; + let AssemblyWriters = [MipsAsmWriter]; let AssemblyParsers = [MipsAsmParser]; let AssemblyParserVariants = [MipsAsmParserVariant]; let AllowRegisterRenaming = 1; diff --git a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 50147c019bfd..ce04124a7b00 100644 --- a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -35,7 +35,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-isel" bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); + Subtarget = &MF.getSubtarget(); if (!Subtarget->inMips16Mode()) return false; return MipsDAGToDAGISel::runOnMachineFunction(MF); diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp index 563118dfe627..b7b1d74e66ed 100644 --- a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -37,7 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-registerinfo" -Mips16RegisterInfo::Mips16RegisterInfo() {} +Mips16RegisterInfo::Mips16RegisterInfo() = default; bool Mips16RegisterInfo::requiresRegisterScavenging (const MachineFunction &MF) const { diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index 192d0013d89c..0ae946160477 100644 --- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -39,6 +39,7 @@ def brtarget21 : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget21"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget26 : Operand { @@ -46,6 +47,7 @@ def brtarget26 : Operand { let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget26"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def jmpoffset16 : Operand { diff --git 
a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 4bd8845e9cb9..9330a791a7cc 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -522,27 +522,27 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // See if this is a generic print operand return AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O); case 'X': // hex const int - if ((MO.getType()) != MachineOperand::MO_Immediate) + if (!MO.isImm()) return true; O << "0x" << Twine::utohexstr(MO.getImm()); return false; case 'x': // hex const int (low 16 bits) - if ((MO.getType()) != MachineOperand::MO_Immediate) + if (!MO.isImm()) return true; O << "0x" << Twine::utohexstr(MO.getImm() & 0xffff); return false; case 'd': // decimal const int - if ((MO.getType()) != MachineOperand::MO_Immediate) + if (!MO.isImm()) return true; O << MO.getImm(); return false; case 'm': // decimal const int minus 1 - if ((MO.getType()) != MachineOperand::MO_Immediate) + if (!MO.isImm()) return true; O << MO.getImm() - 1; return false; case 'y': // exact log2 - if ((MO.getType()) != MachineOperand::MO_Immediate) + if (!MO.isImm()) return true; if (!isPowerOf2_64(MO.getImm())) return true; @@ -550,7 +550,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return false; case 'z': // $0 if zero, regular printing otherwise - if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) { + if (MO.isImm() && MO.getImm() == 0) { O << "$0"; return false; } @@ -798,7 +798,7 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { // Tell the assembler which ABI we are using std::string SectionName = std::string(".mdebug.") + getCurrentABIString(); - OutStreamer->SwitchSection( + OutStreamer->switchSection( OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0)); // NaN: At the moment we only support: @@ -825,7 +825,7 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { TS.emitDirectiveModuleOddSPReg(); // Switch to the .text section. - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); } void MipsAsmPrinter::emitInlineAsmStart() const { @@ -841,12 +841,12 @@ void MipsAsmPrinter::emitInlineAsmStart() const { TS.emitDirectiveSetAt(); TS.emitDirectiveSetMacro(); TS.emitDirectiveSetReorder(); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } void MipsAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, const MCSubtargetInfo *EndInfo) const { - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); getTargetStreamer().emitDirectiveSetPop(); } @@ -1038,14 +1038,14 @@ void MipsAsmPrinter::EmitFPCallStub( // // probably not necessary but we save and restore the current section state // - OutStreamer->PushSection(); + OutStreamer->pushSection(); // // .section mips16.call.fpxxxx,"ax",@progbits // MCSectionELF *M = OutContext.getELFSection( ".mips16.call.fp." 
+ std::string(Symbol), ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR); - OutStreamer->SwitchSection(M, nullptr); + OutStreamer->switchSection(M, nullptr); // // .align 2 // @@ -1114,7 +1114,7 @@ void MipsAsmPrinter::EmitFPCallStub( const MCExpr *T_min_E = MCBinaryExpr::createSub(T, E, OutContext); OutStreamer->emitELFSize(Stub, T_min_E); TS.emitDirectiveEnd(x); - OutStreamer->PopSection(); + OutStreamer->popSection(); } void MipsAsmPrinter::emitEndOfAsmFile(Module &M) { @@ -1130,7 +1130,7 @@ void MipsAsmPrinter::emitEndOfAsmFile(Module &M) { EmitFPCallStub(Symbol, Signature); } // return to the text section - OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection()); + OutStreamer->switchSection(OutContext.getObjectFileInfo()->getTextSection()); } void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index 4e9a23d077da..a4fa0792a998 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -36,7 +36,8 @@ /// /// Regarding compact branch hazard prevention: /// -/// Hazards handled: forbidden slots for MIPSR6, FPU slots for MIPS3 and below. +/// Hazards handled: forbidden slots for MIPSR6, FPU slots for MIPS3 and below, +/// load delay slots for MIPS1. /// /// A forbidden slot hazard occurs when a compact branch instruction is executed /// and the adjacent instruction in memory is a control transfer instruction @@ -164,6 +165,7 @@ private: bool handleSlot(Pred Predicate, Safe SafeInSlot); bool handleForbiddenSlot(); bool handleFPUDelaySlot(); + bool handleLoadDelaySlot(); bool handlePossibleLongBranch(); const MipsSubtarget *STI; @@ -532,7 +534,7 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) { } if (hasDelaySlot) { if (STI->isTargetNaCl()) { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP)); + TII->insertNop(*BalTgtMBB, Pos, DL); } else { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) .addReg(Mips::SP) @@ -675,9 +677,8 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) { // nop // $fallthrough: // - MIBundleBuilder(*LongBrMBB, Pos) - .append(BuildMI(*MFp, DL, TII->get(Mips::J)).addMBB(TgtMBB)) - .append(BuildMI(*MFp, DL, TII->get(Mips::NOP))); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB); + TII->insertNop(*LongBrMBB, Pos, DL)->bundleWithPred(); } else { // At this point, offset where we need to branch does not fit into // immediate field of the branch instruction and is not in the same @@ -722,7 +723,7 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) { if (I.Br->isUnconditionalBranch()) { // Change branch destination. assert(I.Br->getDesc().getNumOperands() == 1); - I.Br->RemoveOperand(0); + I.Br->removeOperand(0); I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); } else // Change branch destination and reverse condition. 
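A usage note for the PrintAsmOperand modifiers rewritten in the MipsAsmPrinter.cpp hunk above. A hedged sketch, assuming a clang or gcc toolchain targeting mips; the function name is illustrative:

    // 'X' prints an immediate in hex, 'x' prints only its low 16 bits,
    // and 'z' substitutes $0 when the operand is the immediate zero.
    void modifierDemo() {
      asm volatile("# full: %X0  low16: %x0  zero: %z1"
                   :
                   : "i"(0x12345678), "i"(0));
    }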
@@ -762,13 +763,12 @@ bool MipsBranchExpansion::handleSlot(Pred Predicate, Safe SafeInSlot) { } if (LastInstInFunction || !SafeInSlot(*IInSlot, *I)) { - MachineBasicBlock::instr_iterator Iit = I->getIterator(); if (std::next(Iit) == FI->end() || std::next(Iit)->getOpcode() != Mips::NOP) { Changed = true; - MIBundleBuilder(&*I).append( - BuildMI(*MFp, I->getDebugLoc(), TII->get(Mips::NOP))); + TII->insertNop(*(I->getParent()), std::next(I), I->getDebugLoc()) + ->bundleWithPred(); NumInsertedNops++; } } @@ -801,6 +801,18 @@ bool MipsBranchExpansion::handleFPUDelaySlot() { }); } +bool MipsBranchExpansion::handleLoadDelaySlot() { + // Load delay slot hazards are only for MIPS1. + if (STI->hasMips2()) + return false; + + return handleSlot( + [this](auto &I) -> bool { return TII->HasLoadDelaySlot(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeInLoadDelaySlot(IInSlot, I); + }); +} + bool MipsBranchExpansion::handlePossibleLongBranch() { if (STI->inMips16Mode() || !STI->enableLongBranchPass()) return false; @@ -867,7 +879,7 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); IsPIC = TM.isPositionIndependent(); ABI = static_cast(TM).getABI(); - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); TII = static_cast(STI->getInstrInfo()); if (IsPIC && ABI.IsO32() && @@ -877,19 +889,21 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { MFp = &MF; ForceLongBranchFirstPass = ForceLongBranch; - // Run these two at least once + // Run these at least once. bool longBranchChanged = handlePossibleLongBranch(); bool forbiddenSlotChanged = handleForbiddenSlot(); bool fpuDelaySlotChanged = handleFPUDelaySlot(); + bool loadDelaySlotChanged = handleLoadDelaySlot(); - bool Changed = - longBranchChanged || forbiddenSlotChanged || fpuDelaySlotChanged; + bool Changed = longBranchChanged || forbiddenSlotChanged || + fpuDelaySlotChanged || loadDelaySlotChanged; - // Then run them alternatively while there are changes + // Then run them alternatively while there are changes. 
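  // Each handler can grow the code (extra nops or long-branch sequences) and push other offsets out of range again, so the handlers are re-run until a fixed point is reached.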
while (forbiddenSlotChanged) { longBranchChanged = handlePossibleLongBranch(); fpuDelaySlotChanged = handleFPUDelaySlot(); - if (!longBranchChanged && !fpuDelaySlotChanged) + loadDelaySlotChanged = handleLoadDelaySlot(); + if (!longBranchChanged && !fpuDelaySlotChanged && !loadDelaySlotChanged) break; forbiddenSlotChanged = handleForbiddenSlot(); } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index f6ec34c7f403..3c1c2bcd7a1b 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -18,6 +18,7 @@ #include "MipsTargetMachine.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; @@ -540,8 +541,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, } MIRBuilder.insertInstr(MIB); if (MIB->getOpcode() == Mips::JALRPseudo) { - const MipsSubtarget &STI = - static_cast(MIRBuilder.getMF().getSubtarget()); + const MipsSubtarget &STI = MIRBuilder.getMF().getSubtarget(); MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), *STI.getRegBankInfo()); } diff --git a/llvm/lib/Target/Mips/MipsCombine.td b/llvm/lib/Target/Mips/MipsCombine.td new file mode 100644 index 000000000000..29550a15d38d --- /dev/null +++ b/llvm/lib/Target/Mips/MipsCombine.td @@ -0,0 +1,15 @@ +//=- MipsCombine.td - Define Mips Combine Rules --------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def MipsPostLegalizerCombinerHelper: GICombinerHelper< + "MipsGenPostLegalizerCombinerHelper", []> { + let DisableRuleOption = "mipspostlegalizercombiner-disable-rule"; +} + diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 1efbf5570287..0341af0caac4 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -436,7 +436,7 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) { // FIXME: MF = &mf; MCP = mf.getConstantPool(); - STI = &static_cast(mf.getSubtarget()); + STI = &mf.getSubtarget(); LLVM_DEBUG(dbgs() << "constant island machine function " << "\n"); if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) { @@ -1653,8 +1653,8 @@ void MipsConstantIslands::prescanForConstants() { I->getOperand(2).ChangeToImmediate(index); LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n"); I->setDesc(TII->get(Mips::LwRxPcTcp16)); - I->RemoveOperand(1); - I->RemoveOperand(1); + I->removeOperand(1); + I->removeOperand(1); I->addOperand(MachineOperand::CreateCPI(index, 0)); I->addOperand(MachineOperand::CreateImm(4)); } diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index cf6cec22308c..94053fa2eb7a 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -677,7 +677,7 @@ bool MipsDelaySlotFiller::runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Bundle the NOP to the instruction with the delay slot. 
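  // Fallback path: nothing safe could fill the delay slot, so a nop is inserted via TII->insertNop() (no longer hardcoding Mips::NOP) and bundled with the branch.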
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": could not fill delay slot for "; I->dump()); - BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP)); + TII->insertNop(MBB, std::next(I), I->getDebugLoc()); MIBundleBuilder(MBB, I, std::next(I, 2)); ++FilledSlots; Changed = true; diff --git a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp index 31180d5a23ef..d242083f958b 100644 --- a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp @@ -892,7 +892,7 @@ bool MipsExpandPseudo::expandMBB(MachineBasicBlock &MBB) { } bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); TII = STI->getInstrInfo(); bool Modified = false; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 6ddfec5d0f79..c1b8af70d8b0 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -178,12 +178,8 @@ private: // Emit helper routines. bool emitCmp(unsigned DestReg, const CmpInst *CI); - bool emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, - unsigned Alignment = 0); - bool emitStore(MVT VT, unsigned SrcReg, Address Addr, - MachineMemOperand *MMO = nullptr); - bool emitStore(MVT VT, unsigned SrcReg, Address &Addr, - unsigned Alignment = 0); + bool emitLoad(MVT VT, unsigned &ResultReg, Address &Addr); + bool emitStore(MVT VT, unsigned SrcReg, Address &Addr); unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); bool emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg, @@ -753,8 +749,7 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) { return true; } -bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, - unsigned Alignment) { +bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr) { // // more cases will be handled here in following patches. // @@ -808,8 +803,7 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, return false; } -bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr, - unsigned Alignment) { +bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr) { // // more cases will be handled here in following patches. 
// @@ -902,7 +896,7 @@ bool MipsFastISel::selectLoad(const Instruction *I) { return false; unsigned ResultReg; - if (!emitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) + if (!emitLoad(VT, ResultReg, Addr)) return false; updateValueMap(I, ResultReg); return true; @@ -931,7 +925,7 @@ bool MipsFastISel::selectStore(const Instruction *I) { if (!computeAddress(I->getOperand(1), Addr)) return false; - if (!emitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment())) + if (!emitStore(VT, SrcReg, Addr)) return false; return true; } diff --git a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp index d88696525e9e..c4bb3d90b4d5 100644 --- a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -54,7 +54,7 @@ void MipsDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { } bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget()); + Subtarget = &MF.getSubtarget<MipsSubtarget>(); bool Ret = SelectionDAGISel::runOnMachineFunction(MF); processFunctionAfterISel(MF); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 0c2e129b8f1f..b98be4ae4b75 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -94,18 +94,6 @@ static const MCPhysReg Mips64DPRegs[8] = { Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64 }; -// If I is a shifted mask, set the size (Size) and the first bit of the -// mask (Pos), and return true. -// For example, if I is 0x003ff800, (Pos, Size) = (11, 11). -static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) { - if (!isShiftedMask_64(I)) - return false; - - Size = countPopulation(I); - Pos = countTrailingZeros(I); - return true; -} - // The MIPS MSA ABI passes vector arguments in the integer register set. // The number of integer registers used is dependent on the ABI used.
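// For reference: O32 provides four integer argument registers ($4-$7), while N32/N64 provide eight ($4-$11).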
MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, @@ -192,6 +180,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Ret: return "MipsISD::Ret"; case MipsISD::ERet: return "MipsISD::ERet"; case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN"; + case MipsISD::FAbs: return "MipsISD::FAbs"; case MipsISD::FMS: return "MipsISD::FMS"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; case MipsISD::FPCmp: return "MipsISD::FPCmp"; @@ -353,15 +342,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); + setOperationAction(ISD::FABS, MVT::f32, Custom); + setOperationAction(ISD::FABS, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - if (!(TM.Options.NoNaNsFPMath || Subtarget.inAbs2008Mode())) { - setOperationAction(ISD::FABS, MVT::f32, Custom); - setOperationAction(ISD::FABS, MVT::f64, Custom); - } - if (Subtarget.isGP64bit()) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); @@ -494,15 +480,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); - setTargetDAGCombine(ISD::SDIVREM); - setTargetDAGCombine(ISD::UDIVREM); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::AssertZext); - setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND, + ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL}); if (ABI.IsO32()) { // These libcalls are not available in 32-bit. @@ -794,14 +773,15 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, EVT ValTy = N->getValueType(0); SDLoc DL(N); - uint64_t Pos = 0, SMPos, SMSize; + uint64_t Pos = 0; + unsigned SMPos, SMSize; ConstantSDNode *CN; SDValue NewOperand; unsigned Opc; // Op's second operand must be a shifted mask. if (!(CN = dyn_cast(Mask)) || - !isShiftedMask(CN->getZExtValue(), SMPos, SMSize)) + !isShiftedMask_64(CN->getZExtValue(), SMPos, SMSize)) return SDValue(); if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) { @@ -875,7 +855,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue And0 = N->getOperand(0), And1 = N->getOperand(1); - uint64_t SMPos0, SMSize0, SMPos1, SMSize1; + unsigned SMPos0, SMSize0, SMPos1, SMSize1; ConstantSDNode *CN, *CN1; // See if Op's first operand matches (and $src1 , mask0). @@ -883,7 +863,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); if (!(CN = dyn_cast(And0.getOperand(1))) || - !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0)) + !isShiftedMask_64(~CN->getSExtValue(), SMPos0, SMSize0)) return SDValue(); // See if Op's second operand matches (and (shl $src, pos), mask1). @@ -891,7 +871,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, And1.getOperand(0).getOpcode() == ISD::SHL) { if (!(CN = dyn_cast(And1.getOperand(1))) || - !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) + !isShiftedMask_64(CN->getZExtValue(), SMPos1, SMSize1)) return SDValue(); // The shift masks must have the same position and size. 
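The local isShiftedMask() helper deleted above is replaced throughout by the two-out-parameter isShiftedMask_64() overload from MathExtras.h, which reports the mask's start bit and length as unsigned values. A self-contained sketch of the same computation, illustrative only and not the LLVM implementation:

    #include <bit>      // std::countr_zero, std::popcount (C++20)
    #include <cstdint>

    // True if I is a single contiguous run of ones; Pos gets the index of
    // the lowest set bit and Size the run length.
    static bool isShiftedMaskSketch(uint64_t I, unsigned &Pos, unsigned &Size) {
      if (I == 0)
        return false;
      unsigned TZ = std::countr_zero(I);
      uint64_t Run = I >> TZ;
      if ((Run & (Run + 1)) != 0) // the ones are not contiguous
        return false;
      Pos = TZ;
      Size = std::popcount(I);
      return true;
    }

    // From the deleted comment: I = 0x003ff800 yields Pos = 11, Size = 11.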
@@ -970,6 +950,14 @@ static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, ROOTNode->getOperand(1).getOpcode() != ISD::MUL) return SDValue(); + // In the case where we have a multiplication as the left operand + // of a subtraction, we can't combine into a MipsISD::MSub node as + // the instruction definition of msub(u) places the multiplication + // on the right. + if (ROOTNode->getOpcode() == ISD::SUB && + ROOTNode->getOperand(0).getOpcode() == ISD::MUL) + return SDValue(); + // We don't handle vector types here. if (ROOTNode->getValueType(0).isVector()) return SDValue(); @@ -1118,7 +1106,8 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG, EVT ValTy = N->getValueType(0); SDLoc DL(N); - uint64_t Pos = 0, SMPos, SMSize; + uint64_t Pos = 0; + unsigned SMPos, SMSize; ConstantSDNode *CN; SDValue NewOperand; @@ -1136,7 +1125,7 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG, // AND's second operand must be a shifted mask. if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))) || - !isShiftedMask(CN->getZExtValue(), SMPos, SMSize)) + !isShiftedMask_64(CN->getZExtValue(), SMPos, SMSize)) return SDValue(); // Return if the shifted mask does not start at bit 0 or the sum of its size @@ -1191,6 +1180,16 @@ bool MipsTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasMips32(); } +bool MipsTargetLowering::hasBitTest(SDValue X, SDValue Y) const { + // We can use ANDI+SLTIU as a bit test. Y contains the bit position. + // For MIPSR2 or later, we may be able to use the `ext` instruction or its + // double-word variants. + if (auto *C = dyn_cast<ConstantSDNode>(Y)) + return C->getAPIntValue().ule(15); + + return false; +} + bool MipsTargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { if (N->getOperand(0).getValueType().isVector()) @@ -2421,11 +2420,14 @@ MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert()); } -static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, - bool HasExtractInsert) { +SDValue MipsTargetLowering::lowerFABS32(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) const { SDLoc DL(Op); SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32); + if (DAG.getTarget().Options.NoNaNsFPMath || Subtarget.inAbs2008Mode()) + return DAG.getNode(MipsISD::FAbs, DL, Op.getValueType(), Op.getOperand(0)); + // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it // to i32. SDValue X = (Op.getValueType() == MVT::f32) @@ -2458,11 +2460,14 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res); } -static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, - bool HasExtractInsert) { +SDValue MipsTargetLowering::lowerFABS64(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) const { SDLoc DL(Op); SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32); + if (DAG.getTarget().Options.NoNaNsFPMath || Subtarget.inAbs2008Mode()) + return DAG.getNode(MipsISD::FAbs, DL, Op.getValueType(), Op.getOperand(0)); + // Bitcast to integer node. SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0)); @@ -2673,7 +2678,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { return Op; // Return if load is aligned or if MemVT is neither i32 nor i64.
- if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) || + if ((LD->getAlign().value() >= (MemVT.getSizeInBits() / 8)) || ((MemVT != MVT::i32) && (MemVT != MVT::i64))) return SDValue(); @@ -2787,7 +2792,7 @@ static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG, SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy, Val.getOperand(0)); return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(), - SD->getPointerInfo(), SD->getAlignment(), + SD->getPointerInfo(), SD->getAlign(), SD->getMemOperand()->getFlags()); } @@ -2797,7 +2802,7 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lower unaligned integer stores. if (!Subtarget.systemSupportsUnalignedAccess() && - (SD->getAlignment() < MemVT.getSizeInBits() / 8) && + (SD->getAlign().value() < (MemVT.getSizeInBits() / 8)) && ((MemVT == MVT::i32) || (MemVT == MVT::i64))) return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle()); @@ -4732,18 +4737,19 @@ MipsTargetLowering::emitPseudoD_SELECT(MachineInstr &MI, Register MipsTargetLowering::getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const { - // Named registers is expected to be fairly rare. For now, just support $28 - // since the linux kernel uses it. + // The Linux kernel uses $28 and sp. if (Subtarget.isGP64bit()) { Register Reg = StringSwitch<Register>(RegName) - .Case("$28", Mips::GP_64) - .Default(Register()); + .Case("$28", Mips::GP_64) + .Case("sp", Mips::SP_64) + .Default(Register()); if (Reg) return Reg; } else { Register Reg = StringSwitch<Register>(RegName) - .Case("$28", Mips::GP) - .Default(Register()); + .Case("$28", Mips::GP) + .Case("sp", Mips::SP) + .Default(Register()); if (Reg) return Reg; } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 3905a18895de..1f921fbe9491 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -99,6 +99,9 @@ class TargetRegisterClass; // Floating Point Compare FPCmp, + // Floating point Abs + FAbs, + // Floating point select FSELECT, @@ -157,7 +160,7 @@ class TargetRegisterClass; Ins, CIns, - // EXTR.W instrinsic nodes. + // EXTR.W intrinsic nodes. EXTP, EXTPDP, EXTR_S_H, @@ -282,6 +285,7 @@ class TargetRegisterClass; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool hasBitTest(SDValue X, SDValue Y) const override; bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; @@ -540,6 +544,10 @@ class TargetRegisterClass; SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) const; + SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) const; SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index 2bf8562895d7..5cb7a0a1804d 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -54,7 +54,6 @@ bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const { /// insertNoop - If data hazard condition is found insert the target nop /// instruction. -// FIXME: This appears to be dead code.
void MipsInstrInfo:: insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { @@ -62,6 +61,19 @@ insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const BuildMI(MBB, MI, DL, get(Mips::NOP)); } +MachineInstrBuilder MipsInstrInfo::insertNop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + DebugLoc DL) const { + assert(!Subtarget.inMips16Mode() && + "insertNop does not support MIPS16e mode at this time"); + const unsigned MMOpc = + Subtarget.hasMips32r6() ? Mips::SLL_MMR6 : Mips::SLL_MM; + const unsigned Opc = Subtarget.inMicroMipsMode() ? MMOpc : Mips::SLL; + return BuildMI(MBB, MI, DL, get(Opc), Mips::ZERO) + .addReg(Mips::ZERO) + .addImm(0); +} + MachineMemOperand * MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI, MachineMemOperand::Flags Flags) const { @@ -598,6 +610,18 @@ bool MipsInstrInfo::SafeInFPUDelaySlot(const MachineInstr &MIInSlot, return true; } +/// Predicate for distinguishing instructions that are hazardous in a load delay +/// slot. Consider inline assembly as unsafe as well. +bool MipsInstrInfo::SafeInLoadDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &LoadMI) const { + if (MIInSlot.isInlineAsm()) + return false; + + return !llvm::any_of(LoadMI.defs(), [&](const MachineOperand &Op) { + return Op.isReg() && MIInSlot.readsRegister(Op.getReg()); + }); +} + /// Predicate for distinguishing instructions that have forbidden slots. bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0; @@ -622,6 +646,22 @@ bool MipsInstrInfo::HasFPUDelaySlot(const MachineInstr &MI) const { } } +/// Predicate for distinguishing instructions that have load delay slots. +bool MipsInstrInfo::HasLoadDelaySlot(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case Mips::LB: + case Mips::LBu: + case Mips::LH: + case Mips::LHu: + case Mips::LW: + case Mips::LWR: + case Mips::LWL: + return true; + default: + return false; + } +} + /// Return the number of bytes of code the specified instruction may be. unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { switch (MI.getOpcode()) { @@ -695,7 +735,7 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc, NewOpc == Mips::JIALC64) { if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64) - MIB->RemoveOperand(0); + MIB->removeOperand(0); for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) { MIB.add(I->getOperand(J)); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 46c1b73d512f..8b98ad3dceea 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -96,16 +96,29 @@ public: bool SafeInFPUDelaySlot(const MachineInstr &MIInSlot, const MachineInstr &FPUMI) const; + /// Predicate to determine if an instruction can go in a load delay slot. + bool SafeInLoadDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &LoadMI) const; + /// Predicate to determine if an instruction has a forbidden slot. bool HasForbiddenSlot(const MachineInstr &MI) const; /// Predicate to determine if an instruction has an FPU delay slot. bool HasFPUDelaySlot(const MachineInstr &MI) const; + /// Predicate to determine if an instruction has a load delay slot. + bool HasLoadDelaySlot(const MachineInstr &MI) const; + /// Insert nop instruction when hazard condition is found void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + /// Insert an ISA appropriate `nop`.
+ // FIXME: Add support for MIPS16e. + MachineInstrBuilder insertNop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + DebugLoc DL) const; + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index 089fed9ec0bf..973f40a21dee 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -833,22 +833,26 @@ def MipsJumpTargetAsmOperand : AsmOperandClass { def jmptarget : Operand<OtherVT> { let EncoderMethod = "getJumpTargetOpValue"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printJumpOperand"; } def brtarget : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue"; let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def brtarget1SImm16 : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue1SImm16"; let OperandType = "OPERAND_PCREL"; let DecoderMethod = "DecodeBranchTarget1SImm16"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printBranchOperand"; } def calltarget : Operand<iPTR> { let EncoderMethod = "getJumpTargetOpValue"; let ParserMatchClass = MipsJumpTargetAsmOperand; + let PrintMethod = "printJumpOperand"; } def imm64: Operand<i64>; diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index 588b7e85c94c..35b0fe218d8f 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -13,6 +13,7 @@ #include "MipsLegalizerInfo.h" #include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/IntrinsicsMips.h" using namespace llvm; @@ -502,8 +503,7 @@ static bool MSA2OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode, bool MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - const MipsSubtarget &ST = - static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget()); + const MipsSubtarget &ST = MI.getMF()->getSubtarget<MipsSubtarget>(); const MipsInstrInfo &TII = *ST.getInstrInfo(); const MipsRegisterInfo &TRI = *ST.getRegisterInfo(); const RegisterBankInfo &RBI = *ST.getRegBankInfo(); diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/llvm/lib/Target/Mips/MipsMachineFunction.cpp index 411a26e42713..7d9824aaf8ec 100644 --- a/llvm/lib/Target/Mips/MipsMachineFunction.cpp +++ b/llvm/lib/Target/Mips/MipsMachineFunction.cpp @@ -22,6 +22,13 @@ static cl::opt<bool> FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true), cl::desc("Always use $gp as the global base register.")); +MachineFunctionInfo * +MipsFunctionInfo::clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> + &Src2DstMBB) const { + return DestMF.cloneInfo<MipsFunctionInfo>(*this); +} + MipsFunctionInfo::~MipsFunctionInfo() = default; bool MipsFunctionInfo::globalBaseRegSet() const { @@ -29,7 +36,7 @@ bool MipsFunctionInfo::globalBaseRegSet() const { } static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) { - auto &STI = static_cast<const MipsSubtarget &>(MF.getSubtarget()); + auto &STI = MF.getSubtarget<MipsSubtarget>(); auto &TM = static_cast<const MipsTargetMachine &>(MF.getTarget()); if (STI.inMips16Mode()) diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.h
b/llvm/lib/Target/Mips/MipsMachineFunction.h index 786d210e2aaa..7b17fd3ed0cd 100644 --- a/llvm/lib/Target/Mips/MipsMachineFunction.h +++ b/llvm/lib/Target/Mips/MipsMachineFunction.h @@ -26,6 +26,11 @@ class MipsFunctionInfo : public MachineFunctionInfo { public: MipsFunctionInfo(MachineFunction &MF) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + ~MipsFunctionInfo() override; unsigned getSRetReturnReg() const { return SRetReturnReg; } diff --git a/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index a2b55e8bddcd..2c23d3b72dc6 100644 --- a/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -10,8 +10,9 @@ #include "Mips.h" #include "MipsTargetMachine.h" -#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp index 2823d300dc6e..204c42ae5e5f 100644 --- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -170,7 +170,7 @@ static void eraseGPOpnd(MachineInstr &MI) { for (unsigned I = 0; I < MI.getNumOperands(); ++I) { MachineOperand &MO = MI.getOperand(I); if (MO.isReg() && MO.getReg() == Reg) { - MI.RemoveOperand(I); + MI.removeOperand(I); return; } } @@ -194,7 +194,7 @@ void MBBInfo::postVisit() { // OptimizePICCall methods. bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) { - if (static_cast(F.getSubtarget()).inMips16Mode()) + if (F.getSubtarget().inMips16Mode()) return false; // Do a pre-order traversal of the dominator tree. diff --git a/llvm/lib/Target/Mips/MipsOs16.cpp b/llvm/lib/Target/Mips/MipsOs16.cpp index ac4e55f8a1f5..f6346a8bbc8b 100644 --- a/llvm/lib/Target/Mips/MipsOs16.cpp +++ b/llvm/lib/Target/Mips/MipsOs16.cpp @@ -13,6 +13,7 @@ #include "Mips.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp new file mode 100644 index 000000000000..7723a10af2d7 --- /dev/null +++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp @@ -0,0 +1,148 @@ +//=== lib/CodeGen/GlobalISel/MipsPostLegalizerCombiner.cpp ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// after the legalizer. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/MipsMCTargetDesc.h" +#include "Mips.h" +#include "MipsLegalizerInfo.h" +#include "MipsSubtarget.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "mips-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +#define MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "MipsGenPostLegalizeGICombiner.inc" +#undef MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "MipsGenPostLegalizeGICombiner.inc" +#undef MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class MipsPostLegalizerCombinerInfo final : public CombinerInfo { + GISelKnownBits *KB; + +public: + MipsGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + MipsPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, const MipsLegalizerInfo *LI) + : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), + KB(KB) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool MipsPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + + CombinerHelper Helper(Observer, B, KB, + /*DominatorTree*/ nullptr, LInfo); + MipsGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper); + return Generated.tryCombineAll(Observer, MI, B, Helper); +} + +#define MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "MipsGenPostLegalizeGICombiner.inc" +#undef MIPSPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class MipsPostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + MipsPostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "MipsPostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +MipsPostLegalizerCombiner::MipsPostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeMipsPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None &&
!skipFunction(F); + + const MipsSubtarget &ST = MF.getSubtarget(); + const MipsLegalizerInfo *LI = + static_cast(ST.getLegalizerInfo()); + + GISelKnownBits *KB = &getAnalysis().get(MF); + MipsPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, LI); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char MipsPostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(MipsPostLegalizerCombiner, DEBUG_TYPE, + "Combine Mips machine instrs after legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(MipsPostLegalizerCombiner, DEBUG_TYPE, + "Combine Mips machine instrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createMipsPostLegalizeCombiner(bool IsOptNone) { + return new MipsPostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index 2ad9ffe4eb77..cb6d53ec0a12 100644 --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" @@ -50,8 +51,7 @@ bool MipsPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, // Don't attempt to combine non power of 2 loads or unaligned loads when // subtarget doesn't support them. auto MMO = *MI.memoperands_begin(); - const MipsSubtarget &STI = - static_cast(MI.getMF()->getSubtarget()); + const MipsSubtarget &STI = MI.getMF()->getSubtarget(); if (!isPowerOf2_64(MMO->getSize())) return false; bool isUnaligned = MMO->getAlign() < MMO->getSize(); diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 04b69c66bc0d..2544d9d9b76d 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -73,8 +73,7 @@ RegisterBankInfo::ValueMapping ValueMappings[] = { using namespace llvm; -MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI) - : MipsGenRegisterBankInfo() {} +MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI) {} const RegisterBank & MipsRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, @@ -154,8 +153,7 @@ static bool isGprbTwoInstrUnalignedLoadOrStore(const MachineInstr *MI) { if (MI->getOpcode() == TargetOpcode::G_LOAD || MI->getOpcode() == TargetOpcode::G_STORE) { auto MMO = *MI->memoperands_begin(); - const MipsSubtarget &STI = - static_cast(MI->getMF()->getSubtarget()); + const MipsSubtarget &STI = MI->getMF()->getSubtarget(); if (MMO->getSize() == 4 && (!STI.systemSupportsUnalignedAccess() && MMO->getAlign() < MMO->getSize())) return true; @@ -399,7 +397,7 @@ void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction( static const MipsRegisterBankInfo::ValueMapping * getMSAMapping(const MachineFunction &MF) { - assert(static_cast(MF.getSubtarget()).hasMSA() && + assert(MF.getSubtarget().hasMSA() && "MSA mapping not available on target without MSA."); return &Mips::ValueMappings[Mips::MSAIdx]; } diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h index 
df51606e1e8a..9eca4fdab3d6 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H #define LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "MipsGenRegisterBank.inc" diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 7ee2ddf3605f..7729d9cf92da 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -97,7 +97,7 @@ private: ExpandPseudo::ExpandPseudo(MachineFunction &MF_) : MF(MF_), MRI(MF.getRegInfo()), - Subtarget(static_cast(MF.getSubtarget())), + Subtarget(MF.getSubtarget()), TII(*static_cast(Subtarget.getInstrInfo())), RegInfo(*Subtarget.getRegisterInfo()) {} diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 03a545605fe1..1124111c1a6e 100644 --- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -38,7 +38,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-isel" bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); + Subtarget = &MF.getSubtarget(); if (Subtarget->inMips16Mode()) return false; return MipsDAGToDAGISel::runOnMachineFunction(MF); @@ -282,7 +282,7 @@ bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset( SDValue Addr, SDValue &Base, SDValue &Offset, unsigned OffsetBits, unsigned ShiftAmount = 0) const { if (CurDAG->isBaseWithConstantOffset(Addr)) { - ConstantSDNode *CN = dyn_cast(Addr.getOperand(1)); + auto *CN = cast(Addr.getOperand(1)); if (isIntN(OffsetBits + ShiftAmount, CN->getSExtValue())) { EVT ValTy = Addr.getValueType(); @@ -956,6 +956,38 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { break; } + case MipsISD::FAbs: { + MVT ResTy = Node->getSimpleValueType(0); + assert((ResTy == MVT::f64 || ResTy == MVT::f32) && + "Unsupported float type!"); + unsigned Opc = 0; + if (ResTy == MVT::f64) + Opc = (Subtarget->isFP64bit() ? Mips::FABS_D64 : Mips::FABS_D32); + else + Opc = Mips::FABS_S; + + if (Subtarget->inMicroMipsMode()) { + switch (Opc) { + case Mips::FABS_D64: + Opc = Mips::FABS_D64_MM; + break; + case Mips::FABS_D32: + Opc = Mips::FABS_D32_MM; + break; + case Mips::FABS_S: + Opc = Mips::FABS_S_MM; + break; + default: + llvm_unreachable("Unknown opcode for MIPS floating point abs!"); + } + } + + ReplaceNode(Node, + CurDAG->getMachineNode(Opc, DL, ResTy, Node->getOperand(0))); + + return true; + } + // Manually match MipsISD::Ins nodes to get the correct instruction. 
It has // to be done in this fashion so that we respect the differences between // dins and dinsm, as the difference is that the size operand has the range diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 346ebe9664fc..f8bde3816fde 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -99,11 +99,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::BITCAST, VecTy, Legal); } - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine( + {ISD::SHL, ISD::SRA, ISD::SRL, ISD::SETCC, ISD::VSELECT}); if (Subtarget.hasMips32r2()) { setOperationAction(ISD::ADDC, MVT::i32, Legal); @@ -161,11 +158,7 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine({ISD::AND, ISD::OR, ISD::SRA, ISD::VSELECT, ISD::XOR}); } if (!Subtarget.useSoftFloat()) { @@ -1184,13 +1177,13 @@ SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { // i32 load from lower address. SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo(), - Nd.getAlignment(), Nd.getMemOperand()->getFlags()); + Nd.getAlign(), Nd.getMemOperand()->getFlags()); // i32 load from higher address. Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT)); SDValue Hi = DAG.getLoad( MVT::i32, DL, Lo.getValue(1), Ptr, MachinePointerInfo(), - std::min(Nd.getAlignment(), 4U), Nd.getMemOperand()->getFlags()); + commonAlignment(Nd.getAlign(), 4), Nd.getMemOperand()->getFlags()); if (!Subtarget.isLittle()) std::swap(Lo, Hi); @@ -1219,14 +1212,13 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { std::swap(Lo, Hi); // i32 store to lower address. - Chain = - DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(), Nd.getAlignment(), - Nd.getMemOperand()->getFlags(), Nd.getAAInfo()); + Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(), Nd.getAlign(), + Nd.getMemOperand()->getFlags(), Nd.getAAInfo()); // i32 store to higher address. 
Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT)); return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(), - std::min(Nd.getAlignment(), 4U), + commonAlignment(Nd.getAlign(), 4), Nd.getMemOperand()->getFlags(), Nd.getAAInfo()); } diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index d6481793ef49..c86666cc40b6 100644 --- a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -38,7 +38,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-reg-info" -MipsSERegisterInfo::MipsSERegisterInfo() {} +MipsSERegisterInfo::MipsSERegisterInfo() = default; bool MipsSERegisterInfo:: requiresRegisterScavenging(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/llvm/lib/Target/Mips/MipsScheduleGeneric.td index f076f2f9cf10..931412cb261e 100644 --- a/llvm/lib/Target/Mips/MipsScheduleGeneric.td +++ b/llvm/lib/Target/Mips/MipsScheduleGeneric.td @@ -957,13 +957,13 @@ def : InstRW<[GenericWriteFPURcpS], (instrs RECIP_S_MM, RSQRT_S_MM)>; def : InstRW<[GenericWriteFPURcpD], (instrs RECIP_D32_MM, RECIP_D64_MM, RSQRT_D32_MM, RSQRT_D64_MM)>; -def : InstRW<[GenericWriteFPUStore], (instrs SDC1_MM, SWC1_MM, SUXC1_MM, - SWXC1_MM)>; +def : InstRW<[GenericWriteFPUStore], (instrs SDC1_MM_D32, SDC1_MM_D64, SWC1_MM, + SUXC1_MM, SWXC1_MM)>; def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs CFC1_MM, CTC1_MM)>; -def : InstRW<[GenericWriteFPULoad], (instrs LDC1_MM, LUXC1_MM, LWC1_MM, - LWXC1_MM)>; +def : InstRW<[GenericWriteFPULoad], (instrs LDC1_MM_D32, LDC1_MM_D64, LUXC1_MM, + LWC1_MM, LWXC1_MM)>; // microMIPS32r6 // ============= diff --git a/llvm/lib/Target/Mips/MipsSubtarget.cpp b/llvm/lib/Target/Mips/MipsSubtarget.cpp index c285385a19dd..10530cdafeed 100644 --- a/llvm/lib/Target/Mips/MipsSubtarget.cpp +++ b/llvm/lib/Target/Mips/MipsSubtarget.cpp @@ -64,6 +64,7 @@ bool MipsSubtarget::MSAWarningPrinted = false; bool MipsSubtarget::VirtWarningPrinted = false; bool MipsSubtarget::CRCWarningPrinted = false; bool MipsSubtarget::GINVWarningPrinted = false; +bool MipsSubtarget::MIPS1WarningPrinted = false; void MipsSubtarget::anchor() {} @@ -91,10 +92,14 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, if (MipsArchVersion == MipsDefault) MipsArchVersion = Mips32; - // Don't even attempt to generate code for MIPS-I and MIPS-V. They have not - // been tested and currently exist for the integrated assembler only. - if (MipsArchVersion == Mips1) - report_fatal_error("Code generation for MIPS-I is not implemented", false); + // MIPS-I has not been tested. + if (MipsArchVersion == Mips1 && !MIPS1WarningPrinted) { + errs() << "warning: MIPS-I support is experimental\n"; + MIPS1WarningPrinted = true; + } + + // Don't even attempt to generate code for MIPS-V. It has not + // been tested and currently exists for the integrated assembler only. if (MipsArchVersion == Mips5) report_fatal_error("Code generation for MIPS-V is not implemented", false); @@ -111,7 +116,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, if (isFP64bit() && !hasMips64() && hasMips32() && !hasMips32r2()) report_fatal_error( "FPU with 64-bit registers is not available on MIPS32 pre revision 2. 
" - "Use -mcpu=mips32r2 or greater."); + "Use -mcpu=mips32r2 or greater.", false); if (!isABI_O32() && !useOddSPReg()) report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false); diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h index 2b4c2b19a95d..ec8ca64c8ce8 100644 --- a/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/llvm/lib/Target/Mips/MipsSubtarget.h @@ -17,12 +17,12 @@ #include "MipsFrameLowering.h" #include "MipsISelLowering.h" #include "MipsInstrInfo.h" -#include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" @@ -59,6 +59,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // Used to avoid printing ginv warnings multiple times. static bool GINVWarningPrinted; + // Used to avoid printing Mips1 warnings multiple times. + static bool MIPS1WarningPrinted; + // Used to avoid printing virt warnings multiple times. static bool VirtWarningPrinted; diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index f9f662a00117..fb0aa397d393 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -18,12 +18,14 @@ #include "MipsSEISelDAGToDAG.h" #include "MipsSubtarget.h" #include "MipsTargetObjectFile.h" +#include "MipsTargetTransformInfo.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -62,6 +64,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { initializeMipsBranchExpansionPass(*PR); initializeMicroMipsSizeReducePass(*PR); initializeMipsPreLegalizerCombinerPass(*PR); + initializeMipsPostLegalizerCombinerPass(*PR); initializeMipsMulMulBugFixPass(*PR); } @@ -103,7 +106,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, static Reloc::Model getEffectiveRelocModel(bool JIT, Optional RM) { - if (!RM.hasValue() || JIT) + if (!RM || JIT) return Reloc::Static; return *RM; } @@ -238,6 +241,7 @@ public: bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; @@ -276,7 +280,7 @@ void MipsPassConfig::addPreRegAlloc() { } TargetTransformInfo -MipsTargetMachine::getTargetTransformInfo(const Function &F) { +MipsTargetMachine::getTargetTransformInfo(const Function &F) const { if (Subtarget->allowMixed16_32()) { LLVM_DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. 
@@ -284,7 +288,7 @@ MipsTargetMachine::getTargetTransformInfo(const Function &F) { } LLVM_DEBUG(errs() << "Target Transform Info Pass Added\n"); - return TargetTransformInfo(BasicTTIImpl(this, F)); + return TargetTransformInfo(MipsTTIImpl(this, F)); } // Implemented by targets that want to run passes immediately before @@ -333,6 +337,11 @@ bool MipsPassConfig::addLegalizeMachineIR() { return false; } +void MipsPassConfig::addPreRegBankSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createMipsPostLegalizeCombiner(IsOptNone)); +} + bool MipsPassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.h b/llvm/lib/Target/Mips/MipsTargetMachine.h index e0de924be4fd..46ffc11738df 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.h +++ b/llvm/lib/Target/Mips/MipsTargetMachine.h @@ -43,7 +43,7 @@ public: CodeGenOpt::Level OL, bool JIT, bool isLittle); ~MipsTargetMachine() override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; const MipsSubtarget *getSubtargetImpl() const { if (Subtarget) diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h index 44615b987e3c..2f4b6eb37aa1 100644 --- a/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -178,7 +178,7 @@ public: MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } const MipsABIInfo &getABI() const { - assert(ABI.hasValue() && "ABI hasn't been set!"); + assert(ABI && "ABI hasn't been set!"); return *ABI; } diff --git a/llvm/lib/Target/Mips/MipsTargetTransformInfo.cpp b/llvm/lib/Target/Mips/MipsTargetTransformInfo.cpp new file mode 100644 index 000000000000..bd88a0af0ecf --- /dev/null +++ b/llvm/lib/Target/Mips/MipsTargetTransformInfo.cpp @@ -0,0 +1,17 @@ +//===-- MipsTargetTransformInfo.cpp - Mips specific TTI ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MipsTargetTransformInfo.h" + +using namespace llvm; + +bool MipsTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { + EVT VT = TLI->getValueType(DL, DataType); + return TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, + VT); +} diff --git a/llvm/lib/Target/Mips/MipsTargetTransformInfo.h b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h new file mode 100644 index 000000000000..6f52eaa2f833 --- /dev/null +++ b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h @@ -0,0 +1,40 @@ +//===-- MipsTargetTransformInfo.h - Mips specific TTI -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_MIPS_MIPSTARGETTRANSFORMINFO_H + +#include "MipsTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" + +namespace llvm { + +class MipsTTIImpl : public BasicTTIImplBase<MipsTTIImpl> { + using BaseT = BasicTTIImplBase<MipsTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const MipsSubtarget *ST; + const MipsTargetLowering *TLI; + + const MipsSubtarget *getST() const { return ST; } + const MipsTargetLowering *getTLI() const { return TLI; } + +public: + explicit MipsTTIImpl(const MipsTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + bool hasDivRemOp(Type *DataType, bool IsSigned); +}; + +} // end namespace llvm + +#endif diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index f275011018a3..85ace96eeeaf 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -49,9 +49,20 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple, SupportsExtendedDwarfLocDirective = false; SupportsSignedData = false; + PrivateGlobalPrefix = "$L__"; + PrivateLabelPrefix = PrivateGlobalPrefix; + // @TODO: Can we just disable this? WeakDirective = "\t// .weak\t"; GlobalDirective = "\t// .globl\t"; UseIntegratedAssembler = false; + + // Avoid using parens for identifiers starting with $ - ptxas does + // not expect them. + UseParensForDollarSignNames = false; + + // ptxas does not support DWARF `.file fileno directory filename' + // syntax as of v11.X. + EnableDwarfFileDirectoryDefault = false; } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp index 1cbd650bdf06..b72cea5d03f1 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp @@ -93,7 +93,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection, // Emit DWARF .file directives in the outermost scope. outputDwarfFileDirectives(); OS << "\t.section"; - Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(), + Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(), getStreamer().getContext().getTargetTriple(), OS, SubSection); // DWARF sections are enclosed into braces - emit the open one.
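Stepping back to the MipsTTIImpl::hasDivRemOp hook introduced a few hunks above: it advertises that MIPS computes quotient and remainder with a single divide (the div instruction writes both LO and HI), so generic transforms can keep a div/rem pair on the same operands together instead of expanding each separately. A source-level sketch of the pattern that benefits, in plain C++ rather than LLVM IR:

#include <cassert>

// On a divrem-capable target, the two operations below can compile to one
// hardware divide: MIPS `div` leaves the quotient in LO and the remainder
// in HI, and mflo/mfhi read them back.
static void divrem(int Num, int Den, int &Quot, int &Rem) {
  Quot = Num / Den;
  Rem = Num % Den; // reuses the divide above rather than issuing a second one
}

int main() {
  int Q = 0, R = 0;
  divrem(22, 7, Q, R);
  assert(Q == 3 && R == 1);
}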
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 3a59306c4998..b1d842122060 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -45,7 +45,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" @@ -329,7 +328,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); Type *Ty = F->getReturnType(); @@ -363,7 +362,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!getAlign(*F, 0, retAlignment)) - retAlignment = DL.getABITypeAlignment(Ty); + retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value(); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else @@ -513,7 +512,7 @@ void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { OutStreamer->AddComment(Twine("implicit-def: ") + STI.getRegisterInfo()->getName(RegNo)); } - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, @@ -818,9 +817,13 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { "Missed a global variable"); assert(GVVisiting.size() == 0 && "Did not fully process a global variable"); + const NVPTXTargetMachine &NTM = static_cast(TM); + const NVPTXSubtarget &STI = + *static_cast(NTM.getSubtargetImpl()); + // Print out module-level global variables in proper order for (unsigned i = 0, e = Globals.size(); i != e; ++i) - printModuleLevelGV(Globals[i], OS2); + printModuleLevelGV(Globals[i], OS2, /*processDemoted=*/false, STI); OS2 << '\n'; @@ -888,17 +891,18 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { clearAnnotationCache(&M); - // Close the last emitted section - if (HasDebugInfo) { - static_cast(OutStreamer->getTargetStreamer()) - ->closeLastSection(); - // Emit empty .debug_loc section for better support of the empty files. - OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}"); - } + if (auto *TS = static_cast( + OutStreamer->getTargetStreamer())) { + // Close the last emitted section + if (HasDebugInfo) { + TS->closeLastSection(); + // Emit empty .debug_loc section for better support of the empty files. + OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}"); + } - // Output last DWARF .file directives, if any. - static_cast(OutStreamer->getTargetStreamer()) - ->outputDwarfFileDirectives(); + // Output last DWARF .file directives, if any. 
+ TS->outputDwarfFileDirectives(); + } return ret; @@ -957,8 +961,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, } void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, - raw_ostream &O, - bool processDemoted) { + raw_ostream &O, bool processDemoted, + const NVPTXSubtarget &STI) { // Skip meta data if (GVar->hasSection()) { if (GVar->getSection() == "llvm.metadata") @@ -1001,7 +1005,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // (extern) declarations, no definition or initializer // Currently the only known declaration is for an automatic __local // (.shared) promoted to global. - emitPTXGlobalVariable(GVar, O); + emitPTXGlobalVariable(GVar, O, STI); O << ";\n"; return; } @@ -1095,6 +1099,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, emitPTXAddressSpace(PTy->getAddressSpace(), O); if (isManaged(*GVar)) { + if (STI.getPTXVersion() < 40 || STI.getSmVersion() < 30) { + report_fatal_error( + ".attribute(.managed) requires PTX version >= 4.0 and sm_30"); + } O << " .attribute(.managed)"; } @@ -1214,9 +1222,13 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { std::vector &gvars = localDecls[f]; + const NVPTXTargetMachine &NTM = static_cast(TM); + const NVPTXSubtarget &STI = + *static_cast(NTM.getSubtargetImpl()); + for (const GlobalVariable *GV : gvars) { O << "\t// demoted variable\n\t"; - printModuleLevelGV(GV, O, true); + printModuleLevelGV(GV, O, /*processDemoted=*/true, STI); } } @@ -1282,7 +1294,8 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { } void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, - raw_ostream &O) { + raw_ostream &O, + const NVPTXSubtarget &STI) { const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. @@ -1290,6 +1303,13 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O); + if (isManaged(*GVar)) { + if (STI.getPTXVersion() < 40 || STI.getSmVersion() < 30) { + report_fatal_error( + ".attribute(.managed) requires PTX version >= 4.0 and sm_30"); + } + O << " .attribute(.managed)"; + } if (MaybeAlign A = GVar->getAlign()) O << " .align " << A->value(); else @@ -1335,34 +1355,6 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, } } -static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { - if (Ty->isSingleValueType()) - return DL.getPrefTypeAlignment(Ty); - - auto *ATy = dyn_cast(Ty); - if (ATy) - return getOpenCLAlignment(DL, ATy->getElementType()); - - auto *STy = dyn_cast(Ty); - if (STy) { - unsigned int alignStruct = 1; - // Go through each element of the struct and find the - // largest alignment. 
- for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) { - Type *ETy = STy->getElementType(i); - unsigned int align = getOpenCLAlignment(DL, ETy); - if (align > alignStruct) - alignStruct = align; - } - return alignStruct; - } - - auto *FTy = dyn_cast(Ty); - if (FTy) - return DL.getPointerPrefAlignment().value(); - return DL.getPrefTypeAlignment(Ty); -} - void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O) { getSymbol(I->getParent())->print(O, MAI); @@ -1373,7 +1365,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const AttributeList &PAL = F->getAttributes(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); + Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; @@ -1430,18 +1423,24 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } } + auto getOptimalAlignForParam = [TLI, &DL, &PAL, F, + paramIndex](Type *Ty) -> Align { + Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL); + MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex); + return std::max(TypeAlign, ParamAlign.valueOrOne()); + }; + if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - const Align align = DL.getValueOrABITypeAlignment( - PAL.getParamAlignment(paramIndex), Ty); + Align OptimalAlign = getOptimalAlignForParam(Ty); - unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); - O << "[" << sz << "]"; + O << "[" << DL.getTypeAllocSize(Ty) << "]"; continue; } @@ -1454,7 +1453,6 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (static_cast(TM).getDrvInterface() != NVPTX::CUDA) { - Type *ETy = PTy->getPointerElementType(); int addrSpace = PTy->getAddressSpace(); switch (addrSpace) { default: @@ -1470,7 +1468,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << ".ptr .global "; break; } - O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " "; + Align ParamAlign = I->getParamAlign().valueOrOne(); + O << ".align " << ParamAlign.value() << " "; } printParamName(I, paramIndex, O); continue; @@ -1511,17 +1510,17 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { continue; } - // param has byVal attribute. So should be a pointer - auto *PTy = dyn_cast(Ty); - assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getPointerElementType(); + // param has byVal attribute. 
+ Type *ETy = PAL.getParamByValType(paramIndex); + assert(ETy && "Param should have byval type"); if (isABI || isKernelFunc) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - Align align = - DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); + Align OptimalAlign = getOptimalAlignForParam(ETy); + // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1533,10 +1532,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < Align(4)) - align = Align(4); + if (!isKernelFunc && OptimalAlign < Align(4)) + OptimalAlign = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 2a3a38d7b2f1..cd61e99a103a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -218,7 +218,7 @@ private: void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = nullptr); void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, - bool = false); + bool processDemoted, const NVPTXSubtarget &STI); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); void emitGlobals(const Module &M); @@ -258,7 +258,8 @@ private: // List of variables demoted to a function scope. std::map> localDecls; - void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); + void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O, + const NVPTXSubtarget &STI); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const; void printScalarConstant(const Constant *CPV, raw_ostream &O); diff --git a/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp b/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp index 10bf56fd9a91..9661dffd3dae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp @@ -17,7 +17,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/Transforms/Scalar/LowerAtomic.h" +#include "llvm/Transforms/Utils/LowerAtomic.h" #include "MCTargetDesc/NVPTXBaseInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 888fc8ffac2c..2201eb19c80f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -83,6 +83,7 @@ bool GenericToNVVM::runOnModule(Module &M) { GV.hasInitializer() ? 
GV.getInitializer() : nullptr, "", &GV, GV.getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL); NewGV->copyAttributesFrom(&GV); + NewGV->copyMetadata(&GV, /*Offset=*/0); GVMap[&GV] = NewGV; } } @@ -269,24 +270,16 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, // ShuffleVector return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1], NewOperands[2]); - case Instruction::ExtractValue: - // ExtractValueConstantExpr - return Builder.CreateExtractValue(NewOperands[0], C->getIndices()); case Instruction::InsertValue: // InsertValueConstantExpr return Builder.CreateInsertValue(NewOperands[0], NewOperands[1], C->getIndices()); case Instruction::GetElementPtr: // GetElementPtrConstantExpr - return cast(C)->isInBounds() - ? Builder.CreateGEP( - cast(C)->getSourceElementType(), - NewOperands[0], - makeArrayRef(&NewOperands[1], NumOperands - 1)) - : Builder.CreateInBoundsGEP( - cast(C)->getSourceElementType(), - NewOperands[0], - makeArrayRef(&NewOperands[1], NumOperands - 1)); + return Builder.CreateGEP(cast(C)->getSourceElementType(), + NewOperands[0], + makeArrayRef(&NewOperands[1], NumOperands - 1), "", + cast(C)->isInBounds()); case Instruction::Select: // SelectConstantExpr return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index dd4290a605a9..48fa387e563a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -42,7 +42,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, } bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); + Subtarget = &MF.getSubtarget(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -923,8 +923,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Addr, Chain }; - NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, - MVT::Other, Ops); + NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, @@ -936,8 +935,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; - NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, - MVT::Other, Ops); + NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? 
SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) @@ -955,8 +953,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; - NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, - MVT::Other, Ops); + NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else { if (PointerSize == 64) Opcode = pickOpcodeForVT( @@ -974,8 +971,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), N1, Chain }; - NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, - MVT::Other, Ops); + NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } if (!NVPTXLD) @@ -1092,7 +1088,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { @@ -1119,7 +1115,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { @@ -1169,7 +1165,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -1217,7 +1213,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } MachineMemOperand *MemRef = cast(N)->getMemOperand(); @@ -1361,7 +1357,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = { Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); } else if (TM.is64Bit() ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (TM.is64Bit()) { @@ -1508,7 +1504,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Base, Offset, Chain}; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); } else { if (TM.is64Bit()) { switch (N->getOpcode()) { @@ -1654,7 +1650,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = { Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); + LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); } MachineMemOperand *MemRef = Mem->getMemOperand(); @@ -1787,7 +1783,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { getI32Imm(toTypeWidth, dl), Addr, Chain}; - NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); + NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1806,7 +1802,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { Base, Offset, Chain}; - NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); + NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1832,7 +1828,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { Base, Offset, Chain}; - NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); + NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); } else { if (PointerSize == 64) Opcode = @@ -1855,7 +1851,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { getI32Imm(toTypeWidth, dl), BasePtr, Chain}; - NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); + NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); } if (!NVPTXST) @@ -2082,7 +2078,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { StOps.push_back(Chain); - ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps); + ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(ST), {MemRef}); @@ -2164,7 +2160,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { Ops.push_back(Chain); Ops.push_back(Flag); - ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops)); + ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); return true; } @@ -2230,7 +2226,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { if (!Opcode) return false; - SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops); + SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(Ret), {MemRef}); @@ -2333,8 +2329,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { } SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = - CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops); + SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(Ret), {MemRef}); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7b5248906b56..746f652bfa36 
100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/FPEnv.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" @@ -48,7 +49,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -71,14 +71,14 @@ static cl::opt sched4reg( "nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); -static cl::opt -FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); +static cl::opt FMAContractLevelOpt( + "nvptx-fma-level", cl::Hidden, + cl::desc("NVPTX Specific: FMA contraction (0: don't do it" + " 1: do it 2: do it aggressively"), + cl::init(2)); static cl::opt UsePrecDivF32( - "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, + "nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2)); @@ -487,6 +487,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CTLZ, Ty, Legal); } + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + if (STI.getPTXVersion() >= 43) { + setOperationAction(ISD::ADDC, MVT::i64, Legal); + setOperationAction(ISD::ADDE, MVT::i64, Legal); + setOperationAction(ISD::SUBC, MVT::i64, Legal); + setOperationAction(ISD::SUBE, MVT::i64, Legal); + } + setOperationAction(ISD::CTTZ, MVT::i16, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); @@ -499,13 +510,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); // We have some custom DAG combine patterns for these nodes - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SREM); - setTargetDAGCombine(ISD::UREM); + setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL, + ISD::SREM, ISD::UREM}); // setcc for f16x2 needs special handling to prevent legalizer's // attempt to scalarize it due to v2i1 not being legal. 
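The hunk above makes the carry-chain nodes ISD::ADDC/ADDE/SUBC/SUBE legal for i32 unconditionally and for i64 when PTX >= 4.3; they select to PTX's add.cc/addc.cc (and sub.cc/subc.cc) through the ADD_SUB_INT_CARRY multiclass later in this patch. As a minimal sketch of the arithmetic those nodes implement (illustrative only, not from the patch; add128 is a hypothetical helper):

#include <cstdint>
#include <utility>

// Two-word addition via an explicit carry chain (illustrative sketch, not
// LLVM code): the low-word add corresponds to PTX add.cc, which sets the
// carry flag, and the high-word add to addc.cc, which consumes it.
static std::pair<uint64_t, uint64_t> add128(uint64_t ALo, uint64_t AHi,
                                            uint64_t BLo, uint64_t BHi) {
  uint64_t Lo = ALo + BLo;             // add.cc: produces a carry-out
  uint64_t Carry = (Lo < ALo) ? 1 : 0; // carry out of the low word
  uint64_t Hi = AHi + BHi + Carry;     // addc.cc: consumes the carry-in
  return {Lo, Hi};
}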
@@ -583,6 +589,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Now deduce the information based on the above mentioned // actions computeRegisterProperties(STI.getRegisterInfo()); + + setMinCmpXchgSizeInBits(32); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1302,8 +1310,8 @@ std::string NVPTXTargetLowering::getPrototype( bool first = true; - unsigned OIdx = 0; - for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + const Function *F = CB.getFunction(); + for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) { Type *Ty = Args[i].Ty; if (!first) { O << ", "; @@ -1312,15 +1320,14 @@ std::string NVPTXTargetLowering::getPrototype( if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - unsigned align = 0; + unsigned ParamAlign = 0; const CallInst *CallI = cast(&CB); // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, align)) - align = DL.getABITypeAlignment(Ty); - unsigned sz = DL.getTypeAllocSize(Ty); - O << ".param .align " << align << " .b8 "; + if (!getAlign(*CallI, i + 1, ParamAlign)) + ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); + O << ".param .align " << ParamAlign << " .b8 "; O << "_"; - O << "[" << sz << "]"; + O << "[" << DL.getTypeAllocSize(Ty) << "]"; // update the index for Outs SmallVector vtparts; ComputeValueVTs(*this, DL, Ty, vtparts); @@ -1351,15 +1358,18 @@ std::string NVPTXTargetLowering::getPrototype( O << "_"; continue; } - auto *PTy = dyn_cast(Ty); - assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getPointerElementType(); - Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); - unsigned sz = DL.getTypeAllocSize(ETy); - O << ".param .align " << align.value() << " .b8 "; + Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment. This code matches logic in LowerCall when + // alignment increase is performed to increase vectorization options. + Type *ETy = Args[i].IndirectType; + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); + + O << ".param .align " << ParamByValAlign.value() << " .b8 "; O << "_"; - O << "[" << sz << "]"; + O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; } O << ");"; return O.str(); @@ -1406,12 +1416,15 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function - if (DirectCallee) + if (DirectCallee) { if (getAlign(*DirectCallee, Idx, Alignment)) return Align(Alignment); + // If alignment information is not available, fall back to the + // default function param optimized type alignment + return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); + } - // Call is indirect or alignment information is not available, fall back to - // the ABI type alignment + // Call is indirect, fall back to the ABI type alignment return DL.getABITypeAlign(Ty); } @@ -1436,11 +1449,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Chain; unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); - SDValue tempChain = Chain; + SDValue TempChain = Chain; Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); SDValue InFlag = Chain.getValue(1); - unsigned paramCount = 0; + unsigned ParamCount = 0; // Args.size() and Outs.size() need not match. 
// Outs.size() will be larger // * if there is an aggregate argument with multiple fields (each field // @@ -1456,173 +1469,155 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { EVT VT = Outs[OIdx].VT; Type *Ty = Args[i].Ty; + bool IsByVal = Outs[OIdx].Flags.isByVal(); - if (!Outs[OIdx].Flags.isByVal()) { - SmallVector VTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); - Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL); - unsigned AllocSize = DL.getTypeAllocSize(Ty); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - bool NeedAlign; // Does argument declaration specify alignment? - if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - // declare .param .align .b8 .param[]; - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - NeedAlign = true; - } else { - // declare .param .b .param; - if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - AllocSize = 4; - } - SDValue DeclareScalarParamOps[] = { - Chain, DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize * 8, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareScalarParamOps); - NeedAlign = false; - } - InFlag = Chain.getValue(1); + SmallVector VTs; + SmallVector Offsets; - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter - // than 32-bits are sign extended or zero extended, depending on - // whether they are signed or unsigned types. This case applies - // only to scalar parameters and not to aggregate values. - bool ExtendIntegerParam = - Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; - - auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); - SmallVector StoreOperands; - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); - } + assert((!IsByVal || Args[i].IndirectType) && + "byval arg must have indirect type"); + Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); + ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets); + + Align ArgAlign; + if (IsByVal) { + // The ByValAlign in the Outs[OIdx].Flags is always set at this point, + // so we don't need to worry whether it's naturally aligned or not. + // See TargetLowering::LowerCallTo(). + ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment to enhance vectorization options. + ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign( + CB->getCalledFunction(), ETy, DL)); + + // Enforce minimum alignment of 4 to work around ptxas miscompile + // for sm_50+. See corresponding alignment adjustment in + // emitFunctionParamList() for details. 
+ ArgAlign = std::max(ArgAlign, Align(4)); + } else { + ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); + } - EVT EltVT = VTs[j]; - SDValue StVal = OutVals[OIdx]; - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, - dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } + unsigned TypeSize = + (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - // Record the value to store. - StoreOperands.push_back(StVal); - - if (VectorInfo[j] & PVF_LAST) { - unsigned NumElts = StoreOperands.size() - 3; - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + bool NeedAlign; // Does argument declaration specify alignment? + if (IsByVal || + (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128))) { + // declare .param .align .b8 .param[]; + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), + DAG.getConstant(ParamCount, dl, MVT::i32), + DAG.getConstant(TypeSize, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps); + NeedAlign = true; + } else { + // declare .param .b .param; + if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) { + // PTX ABI requires integral types to be at least 32 bits in + // size. FP16 is loaded/stored using i16, so it's handled + // here as well. + TypeSize = 4; + } + SDValue DeclareScalarParamOps[] = { + Chain, DAG.getConstant(ParamCount, dl, MVT::i32), + DAG.getConstant(TypeSize * 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareScalarParamOps); + NeedAlign = false; + } + InFlag = Chain.getValue(1); - StoreOperands.push_back(InFlag); + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + bool ExtendIntegerParam = + Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; - MaybeAlign EltAlign; - if (NeedAlign) - EltAlign = commonAlignment(ArgAlign, Offsets[j]); + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector StoreOperands; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + EVT EltVT = VTs[j]; + int CurOffset = Offsets[j]; + MaybeAlign PartAlign; + if (NeedAlign) + PartAlign = commonAlignment(ArgAlign, CurOffset); + + // New store. 
+ if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(ParamCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(CurOffset, dl, MVT::i32)); + } - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), EltAlign, - MachineMemOperand::MOStore); - InFlag = Chain.getValue(1); + SDValue StVal = OutVals[OIdx]; + if (IsByVal) { + auto PtrVT = getPointerTy(DL); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, + DAG.getConstant(CurOffset, dl, PtrVT)); + StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), + PartAlign); + } else if (ExtendIntegerParam) { + assert(VTs.size() == 1 && "Scalar can't have multiple parts."); + // zext/sext to i32 + StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, StVal); + } - // Cleanup. - StoreOperands.clear(); - } - ++OIdx; + if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } - assert(StoreOperands.empty() && "Unfinished parameter store."); - if (VTs.size() > 0) - --OIdx; - ++paramCount; - continue; - } - // ByVal arguments - SmallVector VTs; - SmallVector Offsets; - auto *PTy = dyn_cast(Args[i].Ty); - assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets, - 0); + // Record the value to store. + StoreOperands.push_back(StVal); - // declare .param .align .b8 .param[]; - unsigned sz = Outs[OIdx].Flags.getByValSize(); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); - // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, - // so we don't need to worry about natural alignment or not. - // See TargetLowering::LowerCallTo(). - - // Enforce minumum alignment of 4 to work around ptxas miscompile - // for sm_50+. See corresponding alignment adjustment in - // emitFunctionParamList() for details. 
- if (ArgAlign < Align(4)) - ArgAlign = Align(4); - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - EVT elemtype = VTs[j]; - int curOffset = Offsets[j]; - unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); - auto PtrVT = getPointerTy(DL); - SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], - DAG.getConstant(curOffset, dl, PtrVT)); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), PartAlign); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(curOffset, dl, MVT::i32), - theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } - InFlag = Chain.getValue(1); + StoreOperands.push_back(InFlag); + + // Adjust type of the store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, + TheStoreType, MachinePointerInfo(), PartAlign, + MachineMemOperand::MOStore); + InFlag = Chain.getValue(1); + + // Cleanup. 
+ StoreOperands.clear(); + } + if (!IsByVal) + ++OIdx; } - ++paramCount; + assert(StoreOperands.empty() && "Unfinished parameter store."); + if (!IsByVal && VTs.size() > 0) + --OIdx; + ++ParamCount; } GlobalAddressSDNode *Func = dyn_cast(Callee.getNode()); @@ -1729,7 +1724,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallArgBeginOps); InFlag = Chain.getValue(1); - for (unsigned i = 0, e = paramCount; i != e; ++i) { + for (unsigned i = 0, e = ParamCount; i != e; ++i) { unsigned opcode; if (i == (e - 1)) opcode = NVPTXISD::LastCallArg; @@ -1865,7 +1860,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = Ret.getValue(1); InFlag = Ret.getValue(2); - if (ProxyRegTruncates[i].hasValue()) { + if (ProxyRegTruncates[i]) { Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); } @@ -2249,7 +2244,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->getAlignment(), + LD->getPointerInfo(), LD->getAlign(), LD->getMemOperand()->getFlags()); SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); // The legalizer (the caller) is expecting two values from the legalized @@ -2414,7 +2409,7 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, - ST->getAlignment(), ST->getMemOperand()->getFlags()); + ST->getAlign(), ST->getMemOperand()->getFlags()); return Result; } @@ -2431,29 +2426,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); } -// Check to see if the kernel argument is image*_t or sampler_t - -static bool isImageOrSamplerVal(const Value *arg, const Module *context) { - static const char *const specialTypes[] = { "struct._image2d_t", - "struct._image3d_t", - "struct._sampler_t" }; - - Type *Ty = arg->getType(); - auto *PTy = dyn_cast(Ty); - - if (!PTy) - return false; - - if (!context) - return false; - - auto *STy = dyn_cast(PTy->getPointerElementType()); - if (!STy || STy->isLiteral()) - return false; - - return llvm::is_contained(specialTypes, STy->getName()); -} - SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, @@ -2495,19 +2467,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { Type *Ty = argTypes[i]; - // If the kernel argument is image*_t or sampler_t, convert it to - // a i32 constant holding the parameter position. This can later - // matched in the AsmPrinter to output the correct mangled name. - if (isImageOrSamplerVal( - theArgs[i], - (theArgs[i]->getParent() ? 
theArgs[i]->getParent()->getParent() : nullptr))) { - assert(isKernelFunction(*F) && - "Only kernels can have image/sampler params"); - InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); - continue; - } - if (theArgs[i]->use_empty()) { // argument is dead if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { @@ -2658,7 +2617,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); + const MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); Type *RetTy = MF.getFunction().getReturnType(); bool isABI = (STI.getSmVersion() >= 20); @@ -2673,7 +2633,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); + VTs, Offsets, + RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) + : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -4293,6 +4255,26 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return false; } +/// getFunctionParamOptimizedAlign - since function arguments are passed via +/// .param space, we may want to increase their alignment in a way that +/// ensures that we can effectively vectorize their loads & stores. We can +/// increase alignment only if the function has internal or private +/// linkage, as for other linkage types callers may already rely on default +/// alignment. To allow using 128-bit vectorized loads/stores, this function +/// ensures that alignment is 16 or greater. +Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( + const Function *F, Type *ArgTy, const DataLayout &DL) const { + const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); + + // If a function has linkage different from internal or private, we + // must use default ABI alignment as external users rely on it. + if (!F->hasLocalLinkage()) + return Align(ABITypeAlign); + + assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); + return Align(std::max(uint64_t(16), ABITypeAlign)); +} + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// Used to guide target specific optimizations, like loop strength reduction @@ -4516,6 +4498,17 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } +static SDValue PerformStoreRetvalCombine(SDNode *N) { + // Operands from the 2nd to the last one are the values to be stored + for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) + if (!N->getOperand(I).isUndef()) + return SDValue(); + + // Operand 0 is the previous value in the chain. Cannot return EntryToken + // as the previous value will become unused and eliminated later. + return N->getOperand(0); +} + /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
/// static SDValue PerformADDCombine(SDNode *N, @@ -4844,6 +4837,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformREMCombine(N, DCI, OptLevel); case ISD::SETCC: return PerformSETCCCombine(N, DCI); + case NVPTXISD::StoreRetval: + case NVPTXISD::StoreRetvalV2: + case NVPTXISD::StoreRetvalV4: + return PerformStoreRetvalCombine(N); } return SDValue(); } @@ -5130,8 +5127,69 @@ void NVPTXTargetLowering::ReplaceNodeResults( } } +NVPTXTargetLowering::AtomicExpansionKind +NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + Type *Ty = AI->getValOperand()->getType(); + + if (AI->isFloatingPointOperation()) { + if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { + if (Ty->isFloatTy()) + return AtomicExpansionKind::None; + if (Ty->isDoubleTy() && STI.hasAtomAddF64()) + return AtomicExpansionKind::None; + } + return AtomicExpansionKind::CmpXChg; + } + + assert(Ty->isIntegerTy() && "Ty should be integer at this point"); + auto ITy = cast(Ty); + + switch (AI->getOperation()) { + default: + return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::BinOp::And: + case AtomicRMWInst::BinOp::Or: + case AtomicRMWInst::BinOp::Xor: + case AtomicRMWInst::BinOp::Xchg: + switch (ITy->getBitWidth()) { + case 8: + case 16: + return AtomicExpansionKind::CmpXChg; + case 32: + return AtomicExpansionKind::None; + case 64: + if (STI.hasAtomBitwise64()) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + default: + llvm_unreachable("unsupported width encountered"); + } + case AtomicRMWInst::BinOp::Add: + case AtomicRMWInst::BinOp::Sub: + case AtomicRMWInst::BinOp::Max: + case AtomicRMWInst::BinOp::Min: + case AtomicRMWInst::BinOp::UMax: + case AtomicRMWInst::BinOp::UMin: + switch (ITy->getBitWidth()) { + case 8: + case 16: + return AtomicExpansionKind::CmpXChg; + case 32: + return AtomicExpansionKind::None; + case 64: + if (STI.hasAtomMinMax64()) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + default: + llvm_unreachable("unsupported width encountered"); + } + } + + return AtomicExpansionKind::CmpXChg; +} + // Pin NVPTXTargetObjectFile's vtables to this file. -NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} +NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 13829b924d4b..fb09f99a019d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -451,6 +451,16 @@ public: MachineFunction &MF, unsigned Intrinsic) const override; + /// getFunctionParamOptimizedAlign - since function arguments are passed via + /// .param space, we may want to increase their alignment in a way that + /// ensures that we can effectively vectorize their loads & stores. We can + /// increase alignment only if the function has internal or private + /// linkage, as for other linkage types callers may already rely on default + /// alignment. To allow using 128-bit vectorized loads/stores, this function + /// ensures that alignment is 16 or greater. 
+ Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, + const DataLayout &DL) const; + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength @@ -551,6 +561,17 @@ public: // instruction, so we say that ctlz is cheap to speculate. bool isCheapToSpeculateCtlz() const override { return true; } + AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override { + return AtomicExpansionKind::None; + } + + AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override { + return AtomicExpansionKind::None; + } + + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + private: const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 953d95e55f65..8df6f13aa68e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; // Pin the vtable to this file. void NVPTXInstrInfo::anchor() {} -NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {} +NVPTXInstrInfo::NVPTXInstrInfo() : RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 22084cddc092..6f9c40feb10e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -145,6 +145,8 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def True : Predicate<"true">; def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; +def hasPTX42 : Predicate<"Subtarget->getPTXVersion() >= 42">; +def hasPTX43 : Predicate<"Subtarget->getPTXVersion() >= 43">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; @@ -152,12 +154,16 @@ def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">; def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">; def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">; +def hasPTX72 : Predicate<"Subtarget->getPTXVersion() >= 72">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; +def hasSM32 : Predicate<"Subtarget->getSmVersion() >= 32">; +def hasSM53 : Predicate<"Subtarget->getSmVersion() >= 53">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">; +def hasSM86 : Predicate<"Subtarget->getSmVersion() >= 86">; // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" @@ -199,17 +205,29 @@ multiclass I3 { [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; } -// Template for instructions which take 3 int32 args. The instructions are +// Template for instructions which take 3 int args. The instructions are // named ".s32" (e.g. "addc.cc.s32"). 
-multiclass ADD_SUB_INT_32 { - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; +multiclass ADD_SUB_INT_CARRY { + let hasSideEffects = 1 in { + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>, + Requires<[hasPTX43]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>, + Requires<[hasPTX43]>; + } } // Template for instructions which take three fp64 or fp32 args. The @@ -579,14 +597,13 @@ defm SUB_i1 : ADD_SUB_i1; defm ADD : I3<"add.s", add>; defm SUB : I3<"sub.s", sub>; -// int32 addition and subtraction with carry-out. -// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?). -defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; -defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; +// int32 and int64 addition and subtraction with carry-out. +defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; +defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; -// int32 addition and subtraction with carry-in and carry-out. -defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; -defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; +// int32 and int64 addition and subtraction with carry-in and carry-out. +defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; +defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; defm MULT : I3<"mul.lo.s", mul>; @@ -2653,6 +2670,8 @@ def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>; def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>; +def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", Float32Regs, Float16x2Regs>; +def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", Float16x2Regs, Float32Regs>; // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where // we cannot specify floating-point literals in isel patterns. Therefore, we diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ec069a0a02ae..1192cc078408 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -182,7 +182,7 @@ foreach sync = [false, true] in { foreach threadmask_imm = THREADMASK_INFO.ret in { def : SHFL_INSTR, - Requires; + Requires; } } } @@ -223,21 +223,21 @@ defm VOTE_SYNC_BALLOT : VOTE_SYNC { - def ii : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, ImmOp:$value), + def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value), "match.any.sync." 
# ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp imm:$mask, imm:$value))]>, + [(set Int32Regs:$dest, (IntOp imm:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ir : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, ImmOp:$value), + def ir : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, ImmOp:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp Int32Regs:$mask, imm:$value))]>, + [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ri : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, regclass:$value), + def ri : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, regclass:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp imm:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, (IntOp imm:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; - def rr : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, regclass:$value), + def rr : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, regclass:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; } @@ -248,25 +248,25 @@ defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC { - def ii : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ii : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask, ImmOp:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ir : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ir : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask, ImmOp:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ri : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ri : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask, regclass:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; - def rr : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def rr : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask, regclass:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; } defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC + NVPTXRegClass src_regclass, Intrinsic IntOP, list Preds = []> : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0), OpcStr, - [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>; + [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>, + Requires; // We need a full string for OpcStr here because we need to deal with the case // like INT_PTX_NATIVE_POWR_F. 
class F_MATH_2 + NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP, + list Preds = []> : NVPTXInst<(outs t_regclass:$dst), (ins s0_regclass:$src0, s1_regclass:$src1), OpcStr, - [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>; + [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>, + Requires; class F_MATH_3 + NVPTXRegClass s2_regclass, Intrinsic IntOP, list Preds = []> : NVPTXInst<(outs t_regclass:$dst), (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2), OpcStr, [(set t_regclass:$dst, - (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>; + (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>, + Requires; // // MISC @@ -587,17 +591,145 @@ def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_f>; def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>; +def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMIN_XORSIGN_ABS_F : + F_MATH_2<"min.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_FTZ_XORSIGN_ABS_F : + F_MATH_2<"min.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_NAN_XORSIGN_ABS_F : + F_MATH_2<"min.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_FTZ_NAN_XORSIGN_ABS_F : + F_MATH_2<"min.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_f>; def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>; +def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMAX_XORSIGN_ABS_F : + F_MATH_2<"max.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_FTZ_XORSIGN_ABS_F : + F_MATH_2<"max.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_NAN_XORSIGN_ABS_F : + F_MATH_2<"max.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_FTZ_NAN_XORSIGN_ABS_F : + F_MATH_2<"max.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, 
$src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmin_d>; def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmax_d>; +// +// Min Max f16, f16x2, bf16, bf16x2 +// + +class MIN_MAX_TUPLE Preds = [hasPTX70, hasSM80]> { + string Variant = V; + Intrinsic Intr = I; + NVPTXRegClass RegClass = RC; + list Predicates = Preds; +} + +multiclass MIN_MAX { + foreach P = [ + MIN_MAX_TUPLE<"_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_f16, + int_nvvm_fmax_f16), Float16Regs>, + MIN_MAX_TUPLE<"_ftz_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_ftz_f16, + int_nvvm_fmax_ftz_f16), Float16Regs>, + MIN_MAX_TUPLE<"_NaN_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_f16, + int_nvvm_fmax_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_f16, int_nvvm_fmax_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_xorsign_abs_f16, int_nvvm_fmax_ftz_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_f16, int_nvvm_fmax_nan_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_xorsign_abs_f16, + int_nvvm_fmax_ftz_nan_xorsign_abs_f16), Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_f16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_f16x2, + int_nvvm_fmax_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_f16x2, int_nvvm_fmax_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_xorsign_abs_f16x2, int_nvvm_fmax_ftz_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_f16x2, int_nvvm_fmax_nan_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_xorsign_abs_f16x2, + int_nvvm_fmax_ftz_nan_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_bf16, int_nvvm_fmax_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_NaN_bf16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_bf16, + int_nvvm_fmax_nan_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_bf16, int_nvvm_fmax_xorsign_abs_bf16), + Int16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_bf16, int_nvvm_fmax_nan_xorsign_abs_bf16), + Int16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_bf16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_bf16x2, + int_nvvm_fmax_bf16x2), Int32Regs>, + MIN_MAX_TUPLE<"_NaN_bf16x2", !if(!eq(IntName, "min"), 
+ int_nvvm_fmin_nan_bf16x2, int_nvvm_fmax_nan_bf16x2), Int32Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_bf16x2, int_nvvm_fmax_xorsign_abs_bf16x2), + Int32Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_bf16x2, + int_nvvm_fmax_nan_xorsign_abs_bf16x2), + Int32Regs, [hasPTX72, hasSM86]>] in { + def P.Variant : F_MATH_2; + } +} + +defm INT_NVVM_FMIN : MIN_MAX<"min">; +defm INT_NVVM_FMAN : MIN_MAX<"max">; // // Multiplication @@ -719,6 +851,19 @@ def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs, def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_fabs_d>; +// +// Abs, Neg bf16, bf16x2 +// + +def INT_NVVM_ABS_BF16 : F_MATH_1<"abs.bf16 \t$dst, $src0;", Int16Regs, + Int16Regs, int_nvvm_abs_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_ABS_BF16X2 : F_MATH_1<"abs.bf16x2 \t$dst, $src0;", Int32Regs, + Int32Regs, int_nvvm_abs_bf16x2, [hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs, + Int16Regs, int_nvvm_neg_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs, + Int32Regs, int_nvvm_neg_bf16x2, [hasPTX70, hasSM80]>; + // // Round // @@ -762,6 +907,10 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; +def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;", + Float16Regs, Float16Regs, int_nvvm_ex2_approx_f16, [hasPTX70, hasSM75]>; +def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;", + Float16x2Regs, Float16x2Regs, int_nvvm_ex2_approx_f16x2, [hasPTX70, hasSM75]>; def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; @@ -788,35 +937,72 @@ def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", // Fma // -def INT_NVVM_FMA_RN_FTZ_F - : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, - Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>; -def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;", - Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>; -def INT_NVVM_FMA_RZ_FTZ_F - : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, - Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>; -def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;", - Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>; -def INT_NVVM_FMA_RM_FTZ_F - : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, - Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>; -def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;", - Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>; -def INT_NVVM_FMA_RP_FTZ_F - : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, - Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>; -def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;", - Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>; - -def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;", - Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>; -def 
INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;", - Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>; -def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;", - Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>; -def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;", - Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>; +class FMA_TUPLE Preds = []> { + string Variant = V; + Intrinsic Intr = I; + NVPTXRegClass RegClass = RC; + list Predicates = Preds; +} + +multiclass FMA_INST { + foreach P = [ + FMA_TUPLE<"_rn_f64", int_nvvm_fma_rn_d, Float64Regs>, + FMA_TUPLE<"_rz_f64", int_nvvm_fma_rz_d, Float64Regs>, + FMA_TUPLE<"_rm_f64", int_nvvm_fma_rm_d, Float64Regs>, + FMA_TUPLE<"_rp_f64", int_nvvm_fma_rp_d, Float64Regs>, + + FMA_TUPLE<"_rn_ftz_f32", int_nvvm_fma_rn_ftz_f, Float32Regs>, + FMA_TUPLE<"_rn_f32", int_nvvm_fma_rn_f, Float32Regs>, + FMA_TUPLE<"_rz_ftz_f32", int_nvvm_fma_rz_ftz_f, Float32Regs>, + FMA_TUPLE<"_rz_f32", int_nvvm_fma_rz_f, Float32Regs>, + FMA_TUPLE<"_rm_f32", int_nvvm_fma_rm_f, Float32Regs>, + FMA_TUPLE<"_rm_ftz_f32", int_nvvm_fma_rm_ftz_f, Float32Regs>, + FMA_TUPLE<"_rp_f32", int_nvvm_fma_rp_f, Float32Regs>, + FMA_TUPLE<"_rp_ftz_f32", int_nvvm_fma_rp_ftz_f, Float32Regs>, + + FMA_TUPLE<"_rn_f16", int_nvvm_fma_rn_f16, Float16Regs, [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_ftz_f16", int_nvvm_fma_rn_ftz_f16, Float16Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_sat_f16", int_nvvm_fma_rn_sat_f16, Float16Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_ftz_sat_f16", int_nvvm_fma_rn_ftz_sat_f16, Float16Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_relu_f16", int_nvvm_fma_rn_relu_f16, Float16Regs, + [hasPTX70, hasSM80]>, + FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs, + [hasPTX70, hasSM80]>, + + FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Float16x2Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_ftz_f16x2", int_nvvm_fma_rn_ftz_f16x2, Float16x2Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_sat_f16x2", int_nvvm_fma_rn_sat_f16x2, Float16x2Regs, + [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_ftz_sat_f16x2", int_nvvm_fma_rn_ftz_sat_f16x2, + Float16x2Regs, [hasPTX42, hasSM53]>, + FMA_TUPLE<"_rn_relu_f16x2", int_nvvm_fma_rn_relu_f16x2, Float16x2Regs, + [hasPTX70, hasSM80]>, + FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2, + Float16x2Regs, [hasPTX70, hasSM80]>, + + FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, Int16Regs, [hasPTX70, hasSM80]>, + FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, Int16Regs, + [hasPTX70, hasSM80]>, + + FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, Int32Regs, + [hasPTX70, hasSM80]>, + FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, Int32Regs, + [hasPTX70, hasSM80]> + ] in { + def P.Variant : + F_MATH_3; + } +} + +defm INT_NVVM_FMA : FMA_INST; // // Rcp @@ -848,6 +1034,8 @@ def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs, def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_rcp_rp_d>; +def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_approx_ftz_f>; def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>; @@ -1472,13 +1660,13 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2; + ".max", atomic_load_max_64_g, i64imm, imm, [hasSM32]>; defm 
INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2; + ".max", atomic_load_max_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2; + atomic_load_max_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2; + ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2; + ".max", atomic_load_umax_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2; + ".max", atomic_load_umax_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2; + atomic_load_umax_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2; + ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, [hasSM32]>; // atom_min @@ -1532,13 +1720,13 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2; + ".min", atomic_load_min_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2; + ".min", atomic_load_min_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2; + atomic_load_min_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2; + ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2; defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2; + ".min", atomic_load_umin_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2; + ".min", atomic_load_umin_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2; + atomic_load_umin_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2; + ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, [hasSM32]>; // atom_inc atom_dec @@ -1612,13 +1800,13 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2; + atomic_load_and_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2; + atomic_load_and_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2; + atomic_load_and_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2; + ".and", atomic_load_and_64_gen, i64imm, imm, [hasSM32]>; // atom_or @@ -1644,13 +1832,13 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2; defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2; + atomic_load_or_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2; + atomic_load_or_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2; + ".or", atomic_load_or_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2; + atomic_load_or_64_s, i64imm, imm, [hasSM32]>; // atom_xor @@ -1676,13 +1864,13 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2; defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2; + atomic_load_xor_64_g, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2; + atomic_load_xor_64_s, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2; + atomic_load_xor_64_gen, i64imm, imm, [hasSM32]>; defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2; + ".xor", atomic_load_xor_64_gen, i64imm, imm, [hasSM32]>; // atom_cas @@ -1788,7 +1976,7 @@ multiclass ATOM3P_impl; } -// Constructs instrinsic name and instruction asm strings. +// Constructs intrinsic name and instruction asm strings. 
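The recurring change in the atomics hunks above is that each 64-bit min/max/and/or/xor variant now carries an explicit [hasSM32] predicate list, so those encodings are only selectable when the subtarget reports SM 3.2 or newer; the ATOM2N_impl helper that follows builds on the same Requires machinery. As a minimal illustration of predicate gating outside TableGen (the names Feature, AtomicPattern, and selectAtomic are invented for this sketch, not LLVM API):

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Stand-ins for subtarget feature bits such as hasSM32.
enum Feature : uint32_t { HasSM32 = 1u << 0, HasSM53 = 1u << 1 };

struct AtomicPattern {
  std::string Mnemonic;      // e.g. "atom.global.max.s64"
  uint32_t RequiredFeatures; // every bit must be set on the subtarget
};

// Returns the first pattern whose predicates are all satisfied, mimicking
// how a Requires<[hasSM32]> list filters candidates during selection.
std::optional<std::string>
selectAtomic(const std::vector<AtomicPattern> &Table, uint32_t FeatureBits) {
  for (const AtomicPattern &P : Table)
    if ((FeatureBits & P.RequiredFeatures) == P.RequiredFeatures)
      return P.Mnemonic;
  return std::nullopt; // no legal encoding on this subtarget
}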
multiclass ATOM2N_impl, + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>, Requires<[noHWROT32]>; def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index f655f25602bc..f57c2920449b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -115,7 +115,8 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { /* SrcAlign */ LI->getAlign(), /* DestAlign */ SI->getAlign(), /* SrcIsVolatile */ LI->isVolatile(), - /* DstIsVolatile */ SI->isVolatile(), TTI); + /* DstIsVolatile */ SI->isVolatile(), + /* CanOverlap */ true, TTI); SI->eraseFromParent(); LI->eraseFromParent(); diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 67aa49132016..53812d7552a9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -88,16 +88,17 @@ // cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" -#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" +#include #define DEBUG_TYPE "nvptx-lower-args" @@ -206,10 +207,8 @@ static void convertToParamAS(Value *OldUser, Value *Param) { // We've created a new instruction. Queue users of the old instruction to // be converted and the instruction itself to be deleted. We can't delete // the old instruction yet, because it's still in use by a load somewhere. - llvm::for_each( - I.OldInstruction->users(), [NewInst, &ItemsToConvert](Value *V) { - ItemsToConvert.push_back({cast(V), NewInst}); - }); + for (Value *V : I.OldInstruction->users()) + ItemsToConvert.push_back({cast(V), NewInst}); InstructionsToDelete.push_back(I.OldInstruction); } @@ -222,18 +221,99 @@ static void convertToParamAS(Value *OldUser, Value *Param) { // E.g if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will // have {GEP,BitCast}. GEP can't be deleted first, because it's still used by // the BitCast. - llvm::for_each(reverse(InstructionsToDelete), - [](Instruction *I) { I->eraseFromParent(); }); + for (Instruction *I : llvm::reverse(InstructionsToDelete)) + I->eraseFromParent(); } -void NVPTXLowerArgs::handleByValParam(Argument *Arg) { +// Adjust alignment of arguments passed byval in .param address space. We can +// increase alignment of such arguments in a way that ensures that we can +// effectively vectorize their loads. We should also traverse all loads from +// byval pointer and adjust their alignment, if those were using known offset. +// Such alignment changes must be conformed with parameter store and load in +// NVPTXTargetLowering::LowerCall. 
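Before the implementation that follows, the alignment rule in the comment above is worth making concrete: if the byval argument as a whole can be given alignment A, then a load at constant byte offset O within it can be given alignment gcd(A, O), with gcd(A, 0) = A. A self-contained sketch of the computation (plain C++, standing in for LLVM's greatestCommonDivisor helper):

#include <cassert>
#include <cstdint>
#include <iostream>

// gcd(A, 0) == A, so a load at offset 0 inherits the full argument alignment.
static uint64_t gcd(uint64_t A, uint64_t B) {
  while (B != 0) {
    uint64_t T = B;
    B = A % B;
    A = T;
  }
  return A;
}

// Alignment provable for a load at ByteOffset inside an object aligned to
// ArgAlign; mirrors greatestCommonDivisor(NewArgAlign, CurLoad.Offset).
static uint64_t loadAlignAt(uint64_t ArgAlign, uint64_t ByteOffset) {
  assert((ArgAlign & (ArgAlign - 1)) == 0 && "alignment must be a power of 2");
  return gcd(ArgAlign, ByteOffset);
}

int main() {
  std::cout << loadAlignAt(16, 0) << '\n'; // 16: vectorizable as one v4i32
  std::cout << loadAlignAt(16, 4) << '\n'; // 4: only scalar-aligned
  std::cout << loadAlignAt(16, 8) << '\n'; // 8: v2i32 at best
}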
+static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, + const NVPTXTargetLowering *TLI) { Function *Func = Arg->getParent(); - Instruction *FirstInst = &(Func->getEntryBlock().front()); - PointerType *PType = dyn_cast(Arg->getType()); + Type *StructType = Arg->getParamByValType(); + const DataLayout DL(Func->getParent()); + + uint64_t NewArgAlign = + TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value(); + uint64_t CurArgAlign = + Arg->getAttribute(Attribute::Alignment).getValueAsInt(); + + if (CurArgAlign >= NewArgAlign) + return; + + LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of " + << CurArgAlign << " for " << *Arg << '\n'); + + auto NewAlignAttr = + Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign); + Arg->removeAttr(Attribute::Alignment); + Arg->addAttr(NewAlignAttr); + + struct Load { + LoadInst *Inst; + uint64_t Offset; + }; + + struct LoadContext { + Value *InitialVal; + uint64_t Offset; + }; + + SmallVector Loads; + std::queue Worklist; + Worklist.push({ArgInParamAS, 0}); + + while (!Worklist.empty()) { + LoadContext Ctx = Worklist.front(); + Worklist.pop(); + + for (User *CurUser : Ctx.InitialVal->users()) { + if (auto *I = dyn_cast(CurUser)) { + Loads.push_back({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + Worklist.push({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + APInt OffsetAccumulated = + APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM)); + + if (!I->accumulateConstantOffset(DL, OffsetAccumulated)) + continue; + + uint64_t OffsetLimit = -1; + uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit); + assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX"); + + Worklist.push({I, Ctx.Offset + Offset}); + continue; + } + + llvm_unreachable("All users must be one of: load, " + "bitcast, getelementptr."); + } + } - assert(PType && "Expecting pointer type in handleByValParam"); + for (Load &CurLoad : Loads) { + Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset)); + Align CurLoadAlign(CurLoad.Inst->getAlign()); + CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign)); + } +} - Type *StructType = PType->getPointerElementType(); +void NVPTXLowerArgs::handleByValParam(Argument *Arg) { + Function *Func = Arg->getParent(); + Instruction *FirstInst = &(Func->getEntryBlock().front()); + Type *StructType = Arg->getParamByValType(); + assert(StructType && "Missing byval type"); auto IsALoadChain = [&](Value *Start) { SmallVector ValuesToCheck = {Start}; @@ -269,10 +349,19 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Value *ArgInParamAS = new AddrSpaceCastInst( Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), FirstInst); - llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) { + for (Value *V : UsersToUpdate) convertToParamAS(V, ArgInParamAS); - }); LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n"); + + // Further optimizations require target lowering info. + if (!TM) + return; + + const auto *TLI = + cast(TM->getSubtargetImpl()->getTargetLowering()); + + adjustByValArgAlignment(Arg, ArgInParamAS, TLI); + return; } @@ -284,7 +373,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. 
AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) - .getValueOr(DL.getPrefTypeAlign(StructType))); + .value_or(DL.getPrefTypeAlign(StructType))); Arg->replaceAllUsesWith(AllocA); Value *ArgInParam = new AddrSpaceCastInst( diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h index cf63fc33e621..0a7b9cf468a6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h @@ -26,6 +26,13 @@ private: public: NVPTXMachineFunctionInfo(MachineFunction &MF) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override { + return DestMF.cloneInfo(*this); + } + /// Returns the index for the symbol \p Symbol. If the symbol was previously /// added, the same index is returned. Otherwise, the symbol is added and the /// new index is returned. diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index f4934f0bc20b..4bd820e98f05 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -64,8 +64,12 @@ bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) { // This is needed in debug mode when code cleanup passes are not executed, // but we need the handle access to be eliminated because they are not // valid instructions when image handles are disabled. - for (MachineInstr *MI : InstrsToRemove) - MI->eraseFromParent(); + for (MachineInstr *MI : InstrsToRemove) { + unsigned DefReg = MI->getOperand(0).getReg(); + // Only those that are not used can be removed. + if (MF.getRegInfo().use_nodbg_empty(DefReg)) + MI->eraseFromParent(); + } return Changed; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 5a6440c91fca..a03492a92bac 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -23,7 +23,7 @@ using namespace llvm; #include "NVPTXGenSubtargetInfo.inc" static cl::opt - NoF16Math("nvptx-no-f16-math", cl::ZeroOrMore, cl::Hidden, + NoF16Math("nvptx-no-f16-math", cl::Hidden, cl::desc("NVPTX Specific: Disable generation of f16 math ops."), cl::init(false)); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 0a1c61a35795..597b8af176a2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -237,7 +237,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { } TargetTransformInfo -NVPTXTargetMachine::getTargetTransformInfo(const Function &F) { +NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(NVPTXTTIImpl(this, F)); } @@ -330,6 +330,8 @@ void NVPTXPassConfig::addIRPasses() { addStraightLineScalarOptimizationPasses(); } + addPass(createAtomicExpandPass()); + // === LSR and other generic IR passes === TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces.
For diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index 7a69197abcff..491e721479d3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -65,7 +65,7 @@ public: void adjustPassManager(PassManagerBuilder &) override; void registerPassBuilderCallbacks(PassBuilder &PB) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool isMachineVerifierClean() const override { return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 4645671a0cd8..37b0a44243cb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -17,7 +17,7 @@ namespace llvm { class NVPTXTargetObjectFile : public TargetLoweringObjectFile { public: - NVPTXTargetObjectFile() {} + NVPTXTargetObjectFile() = default; ~NVPTXTargetObjectFile() override; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 466aa7130216..fc4bc6b3cbf7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -96,7 +96,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { // Instructions that read threadIdx are obviously divergent. if (readsThreadIndex(II) || readsLaneId(II)) return true; - // Handle the NVPTX atomic instrinsics that cannot be represented as an + // Handle the NVPTX atomic intrinsics that cannot be represented as an // atomic IR instruction. if (isNVVMAtomic(II)) return true; @@ -145,11 +145,15 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { Optional Special; FtzRequirementTy FtzRequirement = FTZ_Any; + // Denormal handling is guarded by different attributes depending on the + // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs. + bool IsHalfTy = false; SimplifyAction() = default; - SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) - : IID(IID), FtzRequirement(FtzReq) {} + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq, + bool IsHalfTy = false) + : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {} // Cast operations don't have anything to do with FTZ, so we skip that // argument. 
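The new IsHalfTy flag matters because the function-level denormal controls are split by type: f32 behaviour is governed by the "denormal-fp-math-f32" attribute, while half (and other) types fall under the generic "denormal-fp-math". A reduced sketch of the lookup the patch performs further down (the real code parses the attribute with parseDenormalFPAttribute; the string handling here is a simplification):

#include <string>

// Pick the attribute that governs denormal flushing for the type being
// simplified: f32 has its own control, f16/f16x2 use the generic one.
inline const char *denormalAttrName(bool IsHalfTy) {
  return IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32";
}

// Attribute values look like "<output>,<input>"; FTZ corresponds to an
// output mode other than "ieee" ("preserve-sign" and "positive-zero" both
// flush). An absent attribute is treated as IEEE here.
inline bool ftzEnabled(const std::string &AttrValue) {
  std::string Output = AttrValue.substr(0, AttrValue.find(','));
  return !Output.empty() && Output != "ieee";
}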
@@ -191,18 +195,66 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { return {Intrinsic::fma, FTZ_MustBeOff}; case Intrinsic::nvvm_fma_rn_ftz_f: return {Intrinsic::fma, FTZ_MustBeOn}; + case Intrinsic::nvvm_fma_rn_f16: + return {Intrinsic::fma, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fma_rn_ftz_f16: + return {Intrinsic::fma, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fma_rn_f16x2: + return {Intrinsic::fma, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fma_rn_ftz_f16x2: + return {Intrinsic::fma, FTZ_MustBeOn, true}; case Intrinsic::nvvm_fmax_d: return {Intrinsic::maxnum, FTZ_Any}; case Intrinsic::nvvm_fmax_f: return {Intrinsic::maxnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmax_ftz_f: return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_f16: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_fmin_d: return {Intrinsic::minnum, FTZ_Any}; case Intrinsic::nvvm_fmin_f: return {Intrinsic::minnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmin_ftz_f: return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_f16: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16x2: + return {Intrinsic::minimum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16x2: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_round_d: return {Intrinsic::round, FTZ_Any}; case Intrinsic::nvvm_round_f: @@ -316,9 +368,10 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // intrinsic, we don't have to look up any module metadata, as // FtzRequirementTy will be FTZ_Any.) if (Action.FtzRequirement != FTZ_Any) { - StringRef Attr = II->getFunction() - ->getFnAttribute("denormal-fp-math-f32") - .getValueAsString(); + const char *AttrName = + Action.IsHalfTy ? 
"denormal-fp-math" : "denormal-fp-math-f32"; + StringRef Attr = + II->getFunction()->getFnAttribute(AttrName).getValueAsString(); DenormalMode Mode = parseDenormalFPAttribute(Attr); bool FtzEnabled = Mode.Output != DenormalMode::IEEE; diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 339f51d21087..3f3c4967609a 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -133,15 +133,13 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { // FIXME: Add assertions about ConvCall. Str = ConvCall->getArgOperand(0); } - assert(isa(Str) && - "Format of __nvvm__reflect function not recognized"); - const ConstantExpr *GEP = cast(Str); - - const Value *Sym = GEP->getOperand(0); - assert(isa(Sym) && + // Pre opaque pointers we have a constant expression wrapping the constant + // string. + Str = Str->stripPointerCasts(); + assert(isa(Str) && "Format of __nvvm_reflect function not recognized"); - const Value *Operand = cast(Sym)->getOperand(0); + const Value *Operand = cast(Str)->getOperand(0); if (const GlobalVariable *GV = dyn_cast(Operand)) { // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's // initializer. diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 715cff72dcab..7113fe33b5d7 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -341,31 +341,11 @@ public: bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); } bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); } - bool isU16Imm() const { - switch (Kind) { - case Expression: - return true; - case Immediate: - case ContextImmediate: - return isUInt<16>(getImmU16Context()); - default: - return false; - } - } - bool isS16Imm() const { - switch (Kind) { - case Expression: - return true; - case Immediate: - case ContextImmediate: - return isInt<16>(getImmS16Context()); - default: - return false; - } - } - bool isS16ImmX4() const { return Kind == Expression || - (Kind == Immediate && isInt<16>(getImm()) && - (getImm() & 3) == 0); } + bool isU16Imm() const { return isExtImm<16>(/*Signed*/ false, 1); } + bool isS16Imm() const { return isExtImm<16>(/*Signed*/ true, 1); } + bool isS16ImmX4() const { return isExtImm<16>(/*Signed*/ true, 4); } + bool isS16ImmX16() const { return isExtImm<16>(/*Signed*/ true, 16); } + bool isS17Imm() const { return isExtImm<17>(/*Signed*/ true, 1); } bool isHashImmX8() const { // The Hash Imm form is used for instructions that check or store a hash. 
@@ -375,9 +355,6 @@ public: (getImm() & 7) == 0); } - bool isS16ImmX16() const { return Kind == Expression || - (Kind == Immediate && isInt<16>(getImm()) && - (getImm() & 15) == 0); } bool isS34ImmX16() const { return Kind == Expression || (Kind == Immediate && isInt<34>(getImm()) && (getImm() & 15) == 0); @@ -388,17 +365,6 @@ public: return Kind == Expression || (Kind == Immediate && isInt<34>(getImm())); } - bool isS17Imm() const { - switch (Kind) { - case Expression: - return true; - case Immediate: - case ContextImmediate: - return isInt<17>(getImmS16Context()); - default: - return false; - } - } bool isTLSReg() const { return Kind == TLSRegister; } bool isDirectBr() const { if (Kind == Expression) @@ -712,6 +678,25 @@ public: return CreateExpr(Val, S, E, IsPPC64); } + +private: + template + bool isExtImm(bool Signed, unsigned Multiple) const { + switch (Kind) { + default: + return false; + case Expression: + return true; + case Immediate: + case ContextImmediate: + if (Signed) + return isInt(getImmS16Context()) && + (getImmS16Context() & (Multiple - 1)) == 0; + else + return isUInt(getImmU16Context()) && + (getImmU16Context() & (Multiple - 1)) == 0; + } + } }; } // end anonymous namespace. diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 5a12c3f22dee..d3d720054f16 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -8,8 +8,8 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "TargetInfo/PowerPCTargetInfo.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -64,14 +64,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() { static DecodeStatus decodeCondBrTarget(MCInst &Inst, unsigned Imm, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { Inst.addOperand(MCOperand::createImm(SignExtend32<14>(Imm))); return MCDisassembler::Success; } static DecodeStatus decodeDirectBrTarget(MCInst &Inst, unsigned Imm, uint64_t /*Address*/, - const void * /*Decoder*/) { + const MCDisassembler * /*Decoder*/) { int32_t Offset = SignExtend32<24>(Imm); Inst.addOperand(MCOperand::createImm(Offset)); return MCDisassembler::Success; @@ -90,85 +90,85 @@ static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, CRRegs); } static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, CRBITRegs); } static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, FRegs); } static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, FRegs); } static DecodeStatus DecodeVFRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { 
return decodeRegisterClass(Inst, RegNo, VFRegs); } static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, VRegs); } static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, VSRegs); } static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, VSFRegs); } static DecodeStatus DecodeVSSRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, VSSRegs); } static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, RRegs); } -static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, RRegsNoR0); } static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, XRegs); } static DecodeStatus DecodeG8pRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, XRegs); } -static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, XRegsNoX0); } @@ -176,44 +176,47 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SPERegs); } static DecodeStatus DecodeACCRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, ACCRegs); } static DecodeStatus DecodeVSRpRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, VSRpRegs); } #define DecodeQSRCRegisterClass DecodeQFRCRegisterClass #define DecodeQBRCRegisterClass DecodeQFRCRegisterClass -template +template static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { assert(isUInt(Imm) && "Invalid immediate"); Inst.addOperand(MCOperand::createImm(Imm)); return MCDisassembler::Success; } -template +template static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { 
assert(isUInt(Imm) && "Invalid immediate"); Inst.addOperand(MCOperand::createImm(SignExtend64(Imm))); return MCDisassembler::Success; } static DecodeStatus decodeImmZeroOperand(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { if (Imm != 0) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Imm)); @@ -222,7 +225,7 @@ static DecodeStatus decodeImmZeroOperand(MCInst &Inst, uint64_t Imm, static DecodeStatus decodeVSRpEvenOperands(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo & 1) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(VSRpRegs[RegNo >> 1])); @@ -230,7 +233,8 @@ static DecodeStatus decodeVSRpEvenOperands(MCInst &Inst, uint64_t RegNo, } static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the memri field (imm, reg), which has the low 16-bits as the // displacement and the next 5 bits as the register #. @@ -265,7 +269,8 @@ static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the memrix field (imm, reg), which has the low 14-bits as the // displacement and the next 5 bits as the register #. @@ -287,7 +292,7 @@ static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm, static DecodeStatus decodeMemRIHashOperands(MCInst &Inst, uint64_t Imm, int64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Decode the memrix field for a hash store or hash check operation. // The field is composed of a register and an immediate value that is 6 bits // and covers the range -8 to -512. The immediate is always negative and 2s @@ -303,7 +308,8 @@ static DecodeStatus decodeMemRIHashOperands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the memrix16 field (imm, reg), which has the low 12-bits as the // displacement with 16-byte aligned, and the next 5 bits as the register #. @@ -319,7 +325,7 @@ static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm, static DecodeStatus decodeMemRI34PCRelOperands(MCInst &Inst, uint64_t Imm, int64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Decode the memri34_pcrel field (imm, reg), which has the low 34-bits as the // displacement, and the next 5 bits as an immediate 0. uint64_t Base = Imm >> 34; @@ -333,7 +339,7 @@ static DecodeStatus decodeMemRI34PCRelOperands(MCInst &Inst, uint64_t Imm, static DecodeStatus decodeMemRI34Operands(MCInst &Inst, uint64_t Imm, int64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Decode the memri34 field (imm, reg), which has the low 34-bits as the // displacement, and the next 5 bits as the register #. 
uint64_t Base = Imm >> 34; @@ -347,7 +353,8 @@ static DecodeStatus decodeMemRI34Operands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeSPE8Operands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the spe8disp field (imm, reg), which has the low 5-bits as the // displacement with 8-byte aligned, and the next 5 bits as the register #. @@ -362,7 +369,8 @@ static DecodeStatus decodeSPE8Operands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeSPE4Operands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the spe4disp field (imm, reg), which has the low 5-bits as the // displacement with 4-byte aligned, and the next 5 bits as the register #. @@ -377,7 +385,8 @@ static DecodeStatus decodeSPE4Operands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeSPE2Operands(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // Decode the spe2disp field (imm, reg), which has the low 5-bits as the // displacement with 2-byte aligned, and the next 5 bits as the register #. @@ -392,7 +401,8 @@ static DecodeStatus decodeSPE2Operands(MCInst &Inst, uint64_t Imm, } static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm, - int64_t Address, const void *Decoder) { + int64_t Address, + const MCDisassembler *Decoder) { // The cr bit encoding is 0x80 >> cr_reg_num. unsigned Zeros = countTrailingZeros(Imm); diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp index 6b16af293244..b71d59ed79ed 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp +++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp index 6af79324919c..58165fcaac03 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp @@ -23,5 +23,4 @@ using namespace llvm; -PPCRegisterBankInfo::PPCRegisterBankInfo(const TargetRegisterInfo &TRI) - : PPCGenRegisterBankInfo() {} +PPCRegisterBankInfo::PPCRegisterBankInfo(const TargetRegisterInfo &TRI) {} diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h index 358d5ed3cf14..31a4c528751f 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h @@ -14,8 +14,8 @@ #ifndef LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H #define LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGBANK_DECLARATIONS diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 9df94edc8cdf..2e678ffd58c2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ 
b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -44,6 +44,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16: return Value & 0xffff; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: return Value & 0xfffc; case PPC::fixup_ppc_pcrel34: case PPC::fixup_ppc_imm34: @@ -60,6 +61,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_2: case PPC::fixup_ppc_half16: case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: return 2; case FK_Data_4: case PPC::fixup_ppc_brcond14: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 94ef7b45434f..1e58039582c2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -125,6 +125,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: Target.print(errs()); errs() << '\n'; report_fatal_error("Invalid PC-relative half16ds relocation"); @@ -349,6 +350,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); case MCSymbolRefExpr::VK_None: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index b92b0fc342ec..b020635f4209 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -77,7 +77,7 @@ void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst, // label to the top of the fragment containing the aligned instruction that // was just added. if (InstLine == LabelLine) { - AssignFragment(LastLabel, InstructionFragment); + assignFragment(LastLabel, InstructionFragment); LastLabel->setOffset(0); } } @@ -98,7 +98,7 @@ void PPCELFStreamer::emitInstruction(const MCInst &Inst, // For example, the load that will get the relocation as follows: // .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8) // lwa 3, 4(3) - if (IsPartOfGOTToPCRelPair.hasValue() && !IsPartOfGOTToPCRelPair.getValue()) + if (IsPartOfGOTToPCRelPair && !*IsPartOfGOTToPCRelPair) emitGOTToPCRelReloc(Inst); // Special handling is only for prefixed instructions. @@ -113,7 +113,7 @@ void PPCELFStreamer::emitInstruction(const MCInst &Inst, // follows: // pld 3, vec@got@pcrel(0), 1 // .Lpcrel1: - if (IsPartOfGOTToPCRelPair.hasValue() && IsPartOfGOTToPCRelPair.getValue()) + if (IsPartOfGOTToPCRelPair && *IsPartOfGOTToPCRelPair) emitGOTToPCRelLabel(Inst); } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 73292f7b7938..df0c666f5b11 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -51,6 +51,10 @@ enum Fixups { /// register number. fixup_ppc_nofixup, + /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for + /// instrs like 'lxv'. Produces the same relocation as fixup_ppc_half16ds. 
+ fixup_ppc_half16dq, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 4dfa7d5e600c..46bbc44e1681 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -34,7 +34,6 @@ using namespace llvm; STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new PPCMCCodeEmitter(MCII, Ctx); } @@ -47,10 +46,12 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo, if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + const PPCInstrInfo *InstrInfo = static_cast(&MCII); + unsigned Opcode = MI.getOpcode(); // Add a fixup for the branch target. Fixups.push_back(MCFixup::create(0, MO.getExpr(), - ((MI.getOpcode() == PPC::BL8_NOTOC || - MI.getOpcode() == PPC::BL8_NOTOC_TLS) + (InstrInfo->isNoTOCCallInstr(Opcode) ? (MCFixupKind)PPC::fixup_ppc_br24_notoc : (MCFixupKind)PPC::fixup_ppc_br24))); return 0; @@ -198,8 +199,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, } // Otherwise add a fixup for the displacement field. - Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_half16ds)); + Fixups.push_back(MCFixup::create(IsLittleEndian ? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16dq)); return RegBits; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index abff44449131..6cd04ee018fd 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -110,9 +110,18 @@ PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res, if (Value.isAbsolute()) { int64_t Result = evaluateAsInt64(Value.getConstant()); - if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) && - (Result >= 0x8000)) + bool IsHalf16 = Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16; + bool IsHalf16DS = + Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16ds; + bool IsHalf16DQ = + Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16dq; + bool IsHalf = IsHalf16 || IsHalf16DS || IsHalf16DQ; + + if (!IsHalf && Result >= 0x8000) return false; + if ((IsHalf16DS && (Result & 0x3)) || (IsHalf16DQ && (Result & 0xf))) + return false; + Res = MCValue::get(Result); } else { if (!Layout) diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 03b316341717..acb860e16518 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -34,7 +34,6 @@ class MCTargetOptions; class Target; MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 64e11dbc1efc..729cb35cbebc 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -71,6 +71,19 @@ std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( 
return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16}; } } break; + case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: { + if (IsPCRel) + report_fatal_error("Invalid PC-relative relocation."); + switch (Modifier) { + default: + llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + return {XCOFF::RelocationType::R_TOC, 15}; + case MCSymbolRefExpr::VK_PPC_L: + return {XCOFF::RelocationType::R_TOCL, 15}; + } + } break; case PPC::fixup_ppc_br24: // Branches are 4 byte aligned, so the 24 bits we encode in // the instruction actually represents a 26 bit offset. @@ -78,15 +91,19 @@ std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; case FK_Data_4: + case FK_Data_8: + const uint8_t SignAndSizeForFKData = + EncodedSignednessIndicator | + ((unsigned)Fixup.getKind() == FK_Data_4 ? 31 : 63); switch (Modifier) { default: report_fatal_error("Unsupported modifier"); case MCSymbolRefExpr::VK_PPC_AIX_TLSGD: - return {XCOFF::RelocationType::R_TLS, EncodedSignednessIndicator | 31}; + return {XCOFF::RelocationType::R_TLS, SignAndSizeForFKData}; case MCSymbolRefExpr::VK_PPC_AIX_TLSGDM: - return {XCOFF::RelocationType::R_TLSM, EncodedSignednessIndicator | 31}; + return {XCOFF::RelocationType::R_TLSM, SignAndSizeForFKData}; case MCSymbolRefExpr::VK_None: - return {XCOFF::RelocationType::R_POS, EncodedSignednessIndicator | 31}; + return {XCOFF::RelocationType::R_POS, SignAndSizeForFKData}; } } } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp index 79db03b0331b..f8b1914bd520 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp @@ -21,6 +21,7 @@ #include "PPCMCCodeEmitter.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCObjectWriter.h" diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index edd3b42d47e1..a6ba5adda839 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -956,7 +956,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read], WAIT, XSABSDP, XSABSQP, - XSNABSDP, + XSNABSDP, XSNABSDPs, XSNABSQP, XSNEGDP, XSNEGQP, @@ -1372,7 +1372,7 @@ def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DISP_ANY, P10LD_Read, P10LD_Read] LDCIX, LHZCIX, LWZCIX, - MTSPR, MTSPR8, MTSR, MTVRSAVE, MTVRSAVEv + MTSPR, MTSPR8, MTSR, MTUDSCR, MTVRSAVE, MTVRSAVEv )>; // Expand instructions @@ -1469,7 +1469,7 @@ def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_SX_3C], // 13 Cycles Unknown operations, 1 input operands def : InstRW<[P10W_MFL_13C, P10W_DISP_EVEN, P10W_DISP_ANY], (instrs - MFSPR, MFSPR8, MFSR, MFTB8, MFVRSAVE, MFVRSAVEv + MFSPR, MFSPR8, MFSR, MFTB8, MFUDSCR, MFVRSAVE, MFVRSAVEv )>; // 10 Cycles SIMD Matrix Multiply Engine operations, 0 input operands @@ -1625,6 +1625,7 @@ def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read], (instrs LVSL, LVSR, + LXVKQ, MFVSRLD, MTVSRWS, VCLZLSBB, @@ -1979,7 +1980,6 @@ def : InstRW<[P10W_SX, P10W_DISP_ANY], ICBTLS, ICCCI, LA, LA8, - LDMX, MFDCR, MFPMR, MFSRIN, diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index c088d7847ce4..2bbab64ce0da 100644 --- 
a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -156,6 +156,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], MCRF, MCRXRX, XSNABSDP, + XSNABSDPs, XSXEXPDP, XSABSDP, XSNEGDP, @@ -807,14 +808,6 @@ def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, (instregex "ST(B|H|W|D)CX$") )>; -// Cracked Load Instruction. -// Two consecutive load operations for a total of 8 cycles. -def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C, - DISP_1C, DISP_1C], - (instrs - LDMX -)>; - // Cracked Load instruction. // Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU // operations cannot be done at the same time and so their latencies are added. @@ -940,6 +933,7 @@ def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], (instregex "M(T|F)TB(8)?$"), (instregex "MF(SPR|CTR|LR)(8)?$"), (instregex "M(T|F)MSR(D)?$"), + (instregex "M(T|F)(U)?DSCR$"), (instregex "MTSPR(8)?$") )>; diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 7235a878e38b..4eceb3afc70f 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -33,7 +33,6 @@ class MCInst; class MCOperand; class ModulePass; -FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -53,12 +52,12 @@ FunctionPass *createPPCCTRLoops(); FunctionPass *createPPCExpandISELPass(); FunctionPass *createPPCPreEmitPeepholePass(); FunctionPass *createPPCExpandAtomicPseudoPass(); + FunctionPass *createPPCCTRLoopsPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP); - void initializePPCCTRLoopsPass(PassRegistry&); #ifndef NDEBUG void initializePPCCTRLoopsVerifyPass(PassRegistry&); #endif @@ -77,6 +76,7 @@ FunctionPass *createPPCCTRLoops(); void initializePPCTLSDynamicCallPass(PassRegistry &); void initializePPCMIPeepholePass(PassRegistry&); void initializePPCExpandAtomicPseudoPass(PassRegistry &); + void initializePPCCTRLoopsPass(PassRegistry &); extern char &PPCVSXFMAMutateID; @@ -84,6 +84,10 @@ FunctionPass *createPPCCTRLoops(); void initializePPCLowerMASSVEntriesPass(PassRegistry &); extern char &PPCLowerMASSVEntriesID; + ModulePass *createPPCGenScalarMASSEntriesPass(); + void initializePPCGenScalarMASSEntriesPass(PassRegistry &); + extern char &PPCGenScalarMASSEntriesID; + InstructionSelector * createPPCInstructionSelector(const PPCTargetMachine &, const PPCSubtarget &, const PPCRegisterBankInfo &); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index bbd5f5fd1941..310bf8125f1c 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -263,6 +263,10 @@ def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1", "true", "Enable instructions in ISA 3.1.", [FeatureISA3_0]>; +def FeatureISAFuture : SubtargetFeature<"isa-future-instructions", + "IsISAFuture", "true", + "Enable instructions for Future ISA.", + [FeatureISA3_1]>; def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true", "Enable POWER9 Altivec instructions", [FeatureISA3_0, FeatureP8Altivec]>; @@ -376,7 +380,8 @@ def ProcessorFeatures { FeaturePartwordAtomic, FeatureQuadwordAtomic, FeaturePredictableSelectIsExpensive, - FeatureISA2_07 + FeatureISA2_07, + FeatureCRBits ]; list P8SpecificFeatures = [FeatureAddiLoadFusion, @@ -429,7 +434,7 @@ def ProcessorFeatures { // Future // 
For future CPU we assume that all of the existing features from Power10 // still exist with the exception of those we know are Power10 specific. - list FutureAdditionalFeatures = []; + list FutureAdditionalFeatures = [FeatureISAFuture]; list FutureSpecificFeatures = []; list FutureInheritableFeatures = !listconcat(P10InheritableFeatures, FutureAdditionalFeatures); @@ -591,7 +596,8 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; + Feature64Bit /*, Feature64BitRegs */, FeatureMFTB, + FeatureISA2_06]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 780981806996..22f35c8fa8d3 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -34,6 +34,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" @@ -254,6 +255,8 @@ public: void emitFunctionBodyEnd() override; + void emitPGORefs(); + void emitEndOfAsmFile(Module &) override; void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override; @@ -879,7 +882,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // Print MO for better readability if (isVerbose()) - OutStreamer->GetCommentOS() << MO << '\n'; + OutStreamer->getCommentOS() << MO << '\n'; EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -950,7 +953,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // Print MO for better readability if (isVerbose() && IsAIX) - OutStreamer->GetCommentOS() << MO << '\n'; + OutStreamer->getCommentOS() << MO << '\n'; EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -1582,7 +1585,7 @@ void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) { if (M.getPICLevel() == PICLevel::SmallPIC) return AsmPrinter::emitStartOfAsmFile(M); - OutStreamer->SwitchSection(OutContext.getELFSection( + OutStreamer->switchSection(OutContext.getELFSection( ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC)); MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC")); @@ -1599,7 +1602,7 @@ void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) { OutStreamer->emitAssignment(TOCSym, tocExpr); - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); } void PPCLinuxAsmPrinter::emitFunctionEntryLabel() { @@ -1657,7 +1660,7 @@ void PPCLinuxAsmPrinter::emitFunctionEntryLabel() { MCSectionSubPair Current = OutStreamer->getCurrentSection(); MCSectionELF *Section = OutStreamer->getContext().getELFSection( ".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); - OutStreamer->SwitchSection(Section); + OutStreamer->switchSection(Section); OutStreamer->emitLabel(CurrentFnSym); OutStreamer->emitValueToAlignment(8); MCSymbol *Symbol1 = CurrentFnSymForSize; @@ -1672,7 +1675,7 @@ void PPCLinuxAsmPrinter::emitFunctionEntryLabel() { 8/*size*/); // Emit a null environment pointer. 
OutStreamer->emitIntValue(0, 8 /* size */); - OutStreamer->SwitchSection(Current.first, Current.second); + OutStreamer->switchSection(Current.first, Current.second); } void PPCLinuxAsmPrinter::emitEndOfAsmFile(Module &M) { @@ -1689,7 +1692,7 @@ void PPCLinuxAsmPrinter::emitEndOfAsmFile(Module &M) { const char *Name = isPPC64 ? ".toc" : ".got2"; MCSectionELF *Section = OutContext.getELFSection( Name, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); - OutStreamer->SwitchSection(Section); + OutStreamer->switchSection(Section); if (!isPPC64) OutStreamer->emitValueToAlignment(4); @@ -1895,10 +1898,15 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbolAttr VisibilityAttr = MCSA_Invalid; if (!TM.getIgnoreXCOFFVisibility()) { + if (GV->hasDLLExportStorageClass() && !GV->hasDefaultVisibility()) + report_fatal_error( + "Cannot not be both dllexport and non-default visibility"); switch (GV->getVisibility()) { - // TODO: "exported" and "internal" Visibility needs to go here. + // TODO: "internal" Visibility needs to go here. case GlobalValue::DefaultVisibility: + if (GV->hasDLLExportStorageClass()) + VisibilityAttr = MAI->getExportedVisibilityAttr(); break; case GlobalValue::HiddenVisibility: VisibilityAttr = MAI->getHiddenVisibilityAttr(); @@ -1956,7 +1964,7 @@ void PPCAIXAsmPrinter::emitFunctionBodyEnd() { if (!TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF) && (getNumberOfVRSaved() > 0)) { // Emit dummy EH Info Table. - OutStreamer->SwitchSection(getObjFileLowering().getCompactUnwindSection()); + OutStreamer->switchSection(getObjFileLowering().getCompactUnwindSection()); MCSymbol *EHInfoLabel = TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF); OutStreamer->emitLabel(EHInfoLabel); @@ -1971,7 +1979,7 @@ void PPCAIXAsmPrinter::emitFunctionBodyEnd() { OutStreamer->emitIntValue(0, PointerSize); OutStreamer->emitIntValue(0, PointerSize); - OutStreamer->SwitchSection(MF->getSection()); + OutStreamer->switchSection(MF->getSection()); } } @@ -2382,9 +2390,9 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { // Print GV in verbose mode if (isVerbose()) { if (GV->hasInitializer()) { - GV->printAsOperand(OutStreamer->GetCommentOS(), + GV->printAsOperand(OutStreamer->getCommentOS(), /*PrintType=*/false, GV->getParent()); - OutStreamer->GetCommentOS() << '\n'; + OutStreamer->getCommentOS() << '\n'; } } @@ -2392,14 +2400,14 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); // Switch to the containing csect. - OutStreamer->SwitchSection(Csect); + OutStreamer->switchSection(Csect); const DataLayout &DL = GV->getParent()->getDataLayout(); // Handle common and zero-initialized local symbols. if (GV->hasCommonLinkage() || GVKind.isBSSLocal() || GVKind.isThreadBSSLocal()) { - Align Alignment = GV->getAlign().getValueOr(DL.getPreferredAlign(GV)); + Align Alignment = GV->getAlign().value_or(DL.getPreferredAlign(GV)); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); GVSym->setStorageClass( TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); @@ -2424,9 +2432,8 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { } // Emit aliasing label for global variable. 
- llvm::for_each(GOAliasMap[GV], [this](const GlobalAlias *Alias) { + for (const GlobalAlias *Alias : GOAliasMap[GV]) OutStreamer->emitLabel(getSymbol(Alias)); - }); emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); } @@ -2437,14 +2444,12 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { MCSectionSubPair Current = OutStreamer->getCurrentSection(); // Emit function descriptor. - OutStreamer->SwitchSection( + OutStreamer->switchSection( cast(CurrentFnDescSym)->getRepresentedCsect()); // Emit aliasing label for function descriptor csect. - llvm::for_each(GOAliasMap[&MF->getFunction()], - [this](const GlobalAlias *Alias) { - OutStreamer->emitLabel(getSymbol(Alias)); - }); + for (const GlobalAlias *Alias : GOAliasMap[&MF->getFunction()]) + OutStreamer->emitLabel(getSymbol(Alias)); // Emit function entry point address. OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), @@ -2458,7 +2463,7 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { // Emit a null environment pointer. OutStreamer->emitIntValue(0, PointerSize); - OutStreamer->SwitchSection(Current.first, Current.second); + OutStreamer->switchSection(Current.first, Current.second); } void PPCAIXAsmPrinter::emitFunctionEntryLabel() { @@ -2468,11 +2473,34 @@ void PPCAIXAsmPrinter::emitFunctionEntryLabel() { PPCAsmPrinter::emitFunctionEntryLabel(); // Emit aliasing label for function entry point label. - llvm::for_each( - GOAliasMap[&MF->getFunction()], [this](const GlobalAlias *Alias) { - OutStreamer->emitLabel( - getObjFileLowering().getFunctionEntryPointSymbol(Alias, TM)); - }); + for (const GlobalAlias *Alias : GOAliasMap[&MF->getFunction()]) + OutStreamer->emitLabel( + getObjFileLowering().getFunctionEntryPointSymbol(Alias, TM)); +} + +void PPCAIXAsmPrinter::emitPGORefs() { + if (OutContext.hasXCOFFSection( + "__llvm_prf_cnts", + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) { + MCSection *CntsSection = OutContext.getXCOFFSection( + "__llvm_prf_cnts", SectionKind::getData(), + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD), + /*MultiSymbolsAllowed*/ true); + + OutStreamer->switchSection(CntsSection); + if (OutContext.hasXCOFFSection( + "__llvm_prf_data", + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) + OutStreamer->emitXCOFFRefDirective("__llvm_prf_data[RW]"); + if (OutContext.hasXCOFFSection( + "__llvm_prf_names", + XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD))) + OutStreamer->emitXCOFFRefDirective("__llvm_prf_names[RO]"); + if (OutContext.hasXCOFFSection( + "__llvm_prf_vnds", + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) + OutStreamer->emitXCOFFRefDirective("__llvm_prf_vnds[RW]"); + } } void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { @@ -2481,8 +2509,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { if (M.empty() && TOCDataGlobalVars.empty()) return; + emitPGORefs(); + // Switch to section to emit TOC base. 
-  OutStreamer->SwitchSection(getObjFileLowering().getTOCBaseSection());
+  OutStreamer->switchSection(getObjFileLowering().getTOCBaseSection());
   PPCTargetStreamer *TS =
       static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
@@ -2504,7 +2534,7 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
       TCEntry = cast<MCSectionXCOFF>(
           getObjFileLowering().getSectionForTOCEntry(I.first.first, TM));
     }
-    OutStreamer->SwitchSection(TCEntry);
+    OutStreamer->switchSection(TCEntry);
     OutStreamer->emitLabel(I.second);
     if (TS != nullptr)
diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
index 38ed5f2e78e3..f1eecfea5a5e 100644
--- a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
@@ -434,6 +434,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     XSMINDP,
     XSMINJDP,
     XSNABSDP,
+    XSNABSDPs,
     XSNABSQP,
     XSNEGDP,
     XSNEGQP,
@@ -978,6 +979,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     XSMINDP,
     XSMINJDP,
     XSNABSDP,
+    XSNABSDPs,
     XSNABSQP,
     XSNEGDP,
     XSNEGQP,
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index b1f5bdd885cd..48167c3dc9ca 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -1,4 +1,4 @@
-//===-- PPCCTRLoops.cpp - Verify CTR loops -----------------===//
+//===-- PPCCTRLoops.cpp - Generate CTR loops ------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,31 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass verifies that all bdnz/bdz instructions are dominated by a loop
-// mtctr before any other instructions that might clobber the ctr register.
+// This pass generates machine instructions for the CTR-loop-related pseudos:
+// 1: MTCTRPseudo/DecreaseCTRPseudo
+// 2: MTCTR8Pseudo/DecreaseCTR8Pseudo
+//
+// If a CTR loop can be generated:
+// 1: MTCTRPseudo/MTCTR8Pseudo will be converted to "mtctr"
+// 2: DecreaseCTRPseudo/DecreaseCTR8Pseudo will be converted to "bdnz/bdz" and
+//    its user branch instruction can be deleted.
+//
+// If a CTR loop cannot be generated due to a clobber of the CTR register:
+// 1: MTCTRPseudo/MTCTR8Pseudo can be deleted.
+// 2: DecreaseCTRPseudo/DecreaseCTR8Pseudo will be converted to "addi -1" and
+//    a "cmplwi/cmpldi".
+//
+// This pass runs just before register allocation: if a CTR loop can be
+// generated, we don't want the register allocator to allocate a register for
+// DecreaseCTRPseudo; and if a CTR loop cannot be generated, we don't yet have
+// any condition register for the newly added "cmplwi/cmpldi".
 //
 //===----------------------------------------------------------------------===//
 
-// CTR loops are produced by the HardwareLoops pass and this pass is simply a
-// verification that no invalid CTR loops are produced. As such, it isn't
-// something that needs to be run (or even defined) for Release builds so the
-// entire file is guarded by NDEBUG.
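// Editor's sketch, not part of the vendored patch: a scalar model of the two
// expansions the rewritten file header above describes. Both run the loop
// body exactly TripCount times (hardware loops assume TripCount >= 1); the
// first models mtctr + bdnz, the second models the PHI + "addi -1" +
// "cmplwi/cmpldi" fallback. Plain C++, no PPC types involved.
#include <cassert>
#include <cstdint>

static uint64_t runCTRForm(uint64_t TripCount) {
  uint64_t Executed = 0;
  uint64_t CTR = TripCount;  // mtctr TripCount in the preheader
  do {
    ++Executed;              // loop body
  } while (--CTR != 0);      // bdnz: decrement CTR, branch while nonzero
  return Executed;
}

static uint64_t runNormalForm(uint64_t TripCount) {
  uint64_t Executed = 0;
  uint64_t IV = TripCount;   // PHI seeded from the preheader
  do {
    ++Executed;              // loop body
    IV = IV - 1;             // "addi -1" in the exiting block
  } while (IV != 0);         // "cmplwi/cmpldi" + conditional branch
  return Executed;
}

int main() {
  for (uint64_t N : {1u, 2u, 100u})
    assert(runCTRForm(N) == N && runNormalForm(N) == N);
}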
-#ifndef NDEBUG -#include - -#include "MCTargetDesc/PPCMCTargetDesc.h" #include "PPC.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/ilist_iterator.h" +#include "PPCInstrInfo.h" +#include "PPCSubtarget.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBundleIterator.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -38,148 +45,314 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GenericDomTreeConstruction.h" -#include "llvm/Support/Printable.h" -#include "llvm/Support/raw_ostream.h" +#include using namespace llvm; -#define DEBUG_TYPE "ppc-ctrloops-verify" +#define DEBUG_TYPE "ppc-ctrloops" + +STATISTIC(NumCTRLoops, "Number of CTR loops generated"); +STATISTIC(NumNormalLoops, "Number of normal compare + branch loops generated"); namespace { +class PPCCTRLoops : public MachineFunctionPass { +public: + static char ID; - struct PPCCTRLoopsVerify : public MachineFunctionPass { - public: - static char ID; + PPCCTRLoops() : MachineFunctionPass(ID) { + initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); + } - PPCCTRLoopsVerify() : MachineFunctionPass(ID) { - initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry()); - } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool runOnMachineFunction(MachineFunction &MF) override; - bool runOnMachineFunction(MachineFunction &MF) override; +private: + const PPCInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; - private: - MachineDominatorTree *MDT; - }; + bool processLoop(MachineLoop *ML); + bool isCTRClobber(MachineInstr *MI, bool CheckReads) const; + void expandNormalLoops(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec); + void expandCTRLoops(MachineLoop *ML, MachineInstr *Start, MachineInstr *Dec); +}; +} // namespace + +char PPCCTRLoops::ID = 0; + +INITIALIZE_PASS_BEGIN(PPCCTRLoops, DEBUG_TYPE, "PowerPC CTR loops generation", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(PPCCTRLoops, DEBUG_TYPE, "PowerPC CTR loops generation", + false, false) - char PPCCTRLoopsVerify::ID = 0; -} // end anonymous namespace +FunctionPass *llvm::createPPCCTRLoopsPass() { return new PPCCTRLoops(); } -INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", - "PowerPC CTR Loops Verify", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", - "PowerPC CTR Loops Verify", false, false) +bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; -FunctionPass *llvm::createPPCCTRLoopsVerify() { - return new PPCCTRLoopsVerify(); + auto &MLI = getAnalysis(); + TII = static_cast(MF.getSubtarget().getInstrInfo()); + MRI = &MF.getRegInfo(); + + for (auto ML : MLI) { + if (ML->isOutermost()) + Changed |= processLoop(ML); + } + 
+  return Changed;
 }
 
-static bool clobbersCTR(const MachineInstr &MI) {
-  for (const MachineOperand &MO : MI.operands()) {
-    if (MO.isReg()) {
-      if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
-        return true;
-    } else if (MO.isRegMask()) {
-      if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
-        return true;
-    }
-  }
+bool PPCCTRLoops::isCTRClobber(MachineInstr *MI, bool CheckReads) const {
+  if (!CheckReads) {
+    // If we are only checking for defs, that is, we are going to look for
+    // definitions before MTCTRloop. In this case, a CTR definition inside the
+    // callee of a call instruction will not impact the definition of
+    // MTCTRloop, so we can use definesRegister() for the check; there is no
+    // need to check the regmask.
+    return (MI->definesRegister(PPC::CTR) &&
+            !MI->registerDefIsDead(PPC::CTR)) ||
+           (MI->definesRegister(PPC::CTR8) &&
+            !MI->registerDefIsDead(PPC::CTR8));
+  }
 
+  if ((MI->modifiesRegister(PPC::CTR) && !MI->registerDefIsDead(PPC::CTR)) ||
+      (MI->modifiesRegister(PPC::CTR8) && !MI->registerDefIsDead(PPC::CTR8)))
+    return true;
+
+  if (MI->getDesc().isCall())
+    return true;
+
+  // We define the CTR in the loop preheader, so if there is any CTR reader in
+  // the loop, we also cannot use the CTR loop form.
+  if (MI->readsRegister(PPC::CTR) || MI->readsRegister(PPC::CTR8))
+    return true;
+
   return false;
 }
 
-static bool verifyCTRBranch(MachineBasicBlock *MBB,
-                            MachineBasicBlock::iterator I) {
-  MachineBasicBlock::iterator BI = I;
-  SmallSet Visited;
-  SmallVector Preds;
-  bool CheckPreds;
-
-  if (I == MBB->begin()) {
-    Visited.insert(MBB);
-    goto queue_preds;
-  } else
-    --I;
-
-check_block:
-  Visited.insert(MBB);
-  if (I == MBB->end())
-    goto queue_preds;
-
-  CheckPreds = true;
-  for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
-    unsigned Opc = I->getOpcode();
-    if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
-      CheckPreds = false;
+bool PPCCTRLoops::processLoop(MachineLoop *ML) {
+  bool Changed = false;
+
+  // Align with the HardwareLoops pass: process inner loops first.
+  for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
+    Changed |= processLoop(*I);
+
+  // If any inner loop is changed, the outer loop must be without hardware
+  // loop intrinsics.
+  if (Changed)
+    return true;
+
+  auto IsLoopStart = [](MachineInstr &MI) {
+    return MI.getOpcode() == PPC::MTCTRPseudo ||
+           MI.getOpcode() == PPC::MTCTR8Pseudo;
+  };
+
+  auto SearchForStart =
+      [&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr * {
+    for (auto &MI : *MBB) {
+      if (IsLoopStart(MI))
+        return &MI;
+    }
+    return nullptr;
+  };
+
+  MachineInstr *Start = nullptr;
+  MachineInstr *Dec = nullptr;
+  bool InvalidCTRLoop = false;
+
+  MachineBasicBlock *Preheader = ML->getLoopPreheader();
+  // If there is no preheader for this loop, there must be no MTCTRPseudo
+  // either.
+  if (!Preheader)
+    return false;
+
+  Start = SearchForStart(Preheader);
+  // This is not a CTR loop candidate.
+  if (!Start)
+    return false;
+
+  // If CTR is live into the preheader, we cannot redefine the CTR register.
+  if (Preheader->isLiveIn(PPC::CTR) || Preheader->isLiveIn(PPC::CTR8))
+    InvalidCTRLoop = true;
+
+  // Make sure there is also no CTR clobber in the preheader between its
+  // beginning and the MTCTR.
+  for (MachineBasicBlock::reverse_instr_iterator I =
+           std::next(Start->getReverseIterator());
+       I != Preheader->instr_rend(); ++I)
+    // Only check the definitions of CTR. If there is a non-dead definition
+    // of the CTR, we conservatively don't generate a CTR loop.
+ if (isCTRClobber(&*I, /* CheckReads */ false)) { + InvalidCTRLoop = true; break; } - if (I != BI && clobbersCTR(*I)) { - LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName() - << ") instruction " << *I - << " clobbers CTR, invalidating " - << printMBBReference(*BI->getParent()) << " (" - << BI->getParent()->getFullName() << ") instruction " - << *BI << "\n"); - return false; + // Make sure there is also no CTR clobber/user in the block preheader between + // MTCTR and the end. + for (MachineBasicBlock::instr_iterator I = std::next(Start->getIterator()); + I != Preheader->instr_end(); ++I) + if (isCTRClobber(&*I, /* CheckReads */ true)) { + InvalidCTRLoop = true; + break; } - if (I == IE) + // Find the CTR loop components and decide whether or not to fall back to a + // normal loop. + for (auto *MBB : reverse(ML->getBlocks())) { + for (auto &MI : *MBB) { + if (MI.getOpcode() == PPC::DecreaseCTRPseudo || + MI.getOpcode() == PPC::DecreaseCTR8Pseudo) + Dec = &MI; + else if (!InvalidCTRLoop) + // If any instruction clobber CTR, then we can not generate a CTR loop. + InvalidCTRLoop |= isCTRClobber(&MI, /* CheckReads */ true); + } + if (Dec && InvalidCTRLoop) break; } - if (!CheckPreds && Preds.empty()) - return true; - - if (CheckPreds) { -queue_preds: - if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { - LLVM_DEBUG(dbgs() << "Unable to find a MTCTR instruction for " - << printMBBReference(*BI->getParent()) << " (" - << BI->getParent()->getFullName() << ") instruction " - << *BI << "\n"); - return false; - } + assert(Dec && "CTR loop is not complete!"); - append_range(Preds, MBB->predecessors()); + if (InvalidCTRLoop) { + expandNormalLoops(ML, Start, Dec); + ++NumNormalLoops; } + else { + expandCTRLoops(ML, Start, Dec); + ++NumCTRLoops; + } + return true; +} + +void PPCCTRLoops::expandNormalLoops(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec) { + bool Is64Bit = + Start->getParent()->getParent()->getSubtarget().isPPC64(); + + MachineBasicBlock *Preheader = Start->getParent(); + MachineBasicBlock *Exiting = Dec->getParent(); + assert((Preheader && Exiting) && + "Preheader and exiting should exist for CTR loop!"); + + assert(Dec->getOperand(1).getImm() == 1 && + "Loop decrement stride must be 1"); + + unsigned ADDIOpcode = Is64Bit ? PPC::ADDI8 : PPC::ADDI; + unsigned CMPOpcode = Is64Bit ? PPC::CMPLDI : PPC::CMPLWI; + + Register PHIDef = + MRI->createVirtualRegister(Is64Bit ? &PPC::G8RC_and_G8RC_NOX0RegClass + : &PPC::GPRC_and_GPRC_NOR0RegClass); - do { - MBB = Preds.pop_back_val(); - if (!Visited.count(MBB)) { - I = MBB->getLastNonDebugInstr(); - goto check_block; + Start->getParent()->getParent()->getProperties().reset( + MachineFunctionProperties::Property::NoPHIs); + + // Generate "PHI" in the header block. + auto PHIMIB = BuildMI(*ML->getHeader(), ML->getHeader()->getFirstNonPHI(), + DebugLoc(), TII->get(TargetOpcode::PHI), PHIDef); + PHIMIB.addReg(Start->getOperand(0).getReg()).addMBB(Preheader); + + Register ADDIDef = + MRI->createVirtualRegister(Is64Bit ? &PPC::G8RC_and_G8RC_NOX0RegClass + : &PPC::GPRC_and_GPRC_NOR0RegClass); + // Generate "addi -1" in the exiting block. + BuildMI(*Exiting, Dec, Dec->getDebugLoc(), TII->get(ADDIOpcode), ADDIDef) + .addReg(PHIDef) + .addImm(-1); + + // Add other inputs for the PHI node. + if (ML->isLoopLatch(Exiting)) { + // There must be only two predecessors for the loop header, one is the + // Preheader and the other one is loop latch Exiting. 
In hardware loop + // insertion pass, the block containing DecreaseCTRloop must dominate all + // loop latches. So there must be only one latch. + assert(ML->getHeader()->pred_size() == 2 && + "Loop header predecessor is not right!"); + PHIMIB.addReg(ADDIDef).addMBB(Exiting); + } else { + // If the block containing DecreaseCTRloop is not a loop latch, we can use + // ADDIDef as the value for all other blocks for the PHI. In hardware loop + // insertion pass, the block containing DecreaseCTRloop must dominate all + // loop latches. + for (MachineBasicBlock *P : ML->getHeader()->predecessors()) { + if (ML->contains(P)) { + assert(ML->isLoopLatch(P) && + "Loop's header in-loop predecessor is not loop latch!"); + PHIMIB.addReg(ADDIDef).addMBB(P); + } else + assert(P == Preheader && + "CTR loop should not be generated for irreducible loop!"); } - } while (!Preds.empty()); + } - return true; + // Generate the compare in the exiting block. + Register CMPDef = MRI->createVirtualRegister(&PPC::CRRCRegClass); + auto CMPMIB = + BuildMI(*Exiting, Dec, Dec->getDebugLoc(), TII->get(CMPOpcode), CMPDef) + .addReg(ADDIDef) + .addImm(0); + + BuildMI(*Exiting, Dec, Dec->getDebugLoc(), TII->get(TargetOpcode::COPY), + Dec->getOperand(0).getReg()) + .addReg(CMPMIB->getOperand(0).getReg(), 0, PPC::sub_gt); + + // Remove the pseudo instructions. + Start->eraseFromParent(); + Dec->eraseFromParent(); } -bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { - MDT = &getAnalysis(); - - // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before - // any other instructions that might clobber the ctr register. - for (MachineBasicBlock &MBB : MF) { - if (!MDT->isReachableFromEntry(&MBB)) - continue; - - for (MachineBasicBlock::iterator MII = MBB.getFirstTerminator(), - MIIE = MBB.end(); MII != MIIE; ++MII) { - unsigned Opc = MII->getOpcode(); - if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || - Opc == PPC::BDZ8 || Opc == PPC::BDZ) - if (!verifyCTRBranch(&MBB, MII)) - llvm_unreachable("Invalid PPC CTR loop!"); - } +void PPCCTRLoops::expandCTRLoops(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec) { + bool Is64Bit = + Start->getParent()->getParent()->getSubtarget().isPPC64(); + + MachineBasicBlock *Preheader = Start->getParent(); + MachineBasicBlock *Exiting = Dec->getParent(); + assert((Preheader && Exiting) && + "Preheader and exiting should exist for CTR loop!"); + + assert(Dec->getOperand(1).getImm() == 1 && "Loop decrement must be 1!"); + + unsigned BDNZOpcode = Is64Bit ? PPC::BDNZ8 : PPC::BDNZ; + unsigned BDZOpcode = Is64Bit ? PPC::BDZ8 : PPC::BDZ; + auto BrInstr = MRI->use_instr_begin(Dec->getOperand(0).getReg()); + assert(MRI->hasOneUse(Dec->getOperand(0).getReg()) && + "There should be only one user for loop decrement pseudo!"); + + unsigned Opcode = 0; + switch (BrInstr->getOpcode()) { + case PPC::BC: + Opcode = BDNZOpcode; + (void) ML; + assert(ML->contains(BrInstr->getOperand(1).getMBB()) && + "Invalid ctr loop!"); + break; + case PPC::BCn: + Opcode = BDZOpcode; + assert(!ML->contains(BrInstr->getOperand(1).getMBB()) && + "Invalid ctr loop!"); + break; + default: + llvm_unreachable("Unhandled branch user for DecreaseCTRloop."); } - return false; + unsigned MTCTROpcode = Is64Bit ? PPC::MTCTR8 : PPC::MTCTR; + + // Generate "mtctr" in the loop preheader. + BuildMI(*Preheader, Start, Start->getDebugLoc(), TII->get(MTCTROpcode)) + .addReg(Start->getOperand(0).getReg()); + + // Generate "bdnz/bdz" in the exiting block just before the terminator. 
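// Editor's sketch, not part of the vendored patch: the polarity logic
// expandCTRLoops uses when replacing the user branch of the decrement pseudo.
// BC branches while the decremented counter is still nonzero, so its target
// must stay inside the loop (bdnz); BCn branches once the counter reaches
// zero, so its target must be the loop exit (bdz). The enums below are
// illustrative stand-ins, not the real PPC opcode values.
#include <cassert>

enum class UserBranch { BC, BCn };  // branch on CR bit set / clear
enum class CTRBranch { BDNZ, BDZ }; // branch if CTR != 0 / CTR == 0

static CTRBranch selectCTRBranch(UserBranch B, bool TargetInsideLoop) {
  switch (B) {
  case UserBranch::BC: // taken while the counter is nonzero
    assert(TargetInsideLoop && "BC user of the decrement must stay in-loop");
    return CTRBranch::BDNZ;
  case UserBranch::BCn: // taken once the counter hits zero
    assert(!TargetInsideLoop && "BCn user of the decrement must exit the loop");
    return CTRBranch::BDZ;
  }
  assert(false && "unhandled branch user");
  return CTRBranch::BDZ;
}

int main() {
  assert(selectCTRBranch(UserBranch::BC, true) == CTRBranch::BDNZ);
  assert(selectCTRBranch(UserBranch::BCn, false) == CTRBranch::BDZ);
}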
+ BuildMI(*Exiting, &*BrInstr, BrInstr->getDebugLoc(), TII->get(Opcode)) + .addMBB(BrInstr->getOperand(1).getMBB()); + + // Remove the pseudo instructions. + Start->eraseFromParent(); + BrInstr->eraseFromParent(); + Dec->eraseFromParent(); } -#endif // NDEBUG diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp new file mode 100644 index 000000000000..b1f5bdd885cd --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -0,0 +1,185 @@ +//===-- PPCCTRLoops.cpp - Verify CTR loops -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass verifies that all bdnz/bdz instructions are dominated by a loop +// mtctr before any other instructions that might clobber the ctr register. +// +//===----------------------------------------------------------------------===// + +// CTR loops are produced by the HardwareLoops pass and this pass is simply a +// verification that no invalid CTR loops are produced. As such, it isn't +// something that needs to be run (or even defined) for Release builds so the +// entire file is guarded by NDEBUG. +#ifndef NDEBUG +#include + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "PPC.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundleIterator.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GenericDomTreeConstruction.h" +#include "llvm/Support/Printable.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-ctrloops-verify" + +namespace { + + struct PPCCTRLoopsVerify : public MachineFunctionPass { + public: + static char ID; + + PPCCTRLoopsVerify() : MachineFunctionPass(ID) { + initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + MachineDominatorTree *MDT; + }; + + char PPCCTRLoopsVerify::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) + +FunctionPass *llvm::createPPCCTRLoopsVerify() { + return new PPCCTRLoopsVerify(); +} + +static bool clobbersCTR(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg()) { + if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) + return true; + } else if (MO.isRegMask()) { + if (MO.clobbersPhysReg(PPC::CTR) || 
MO.clobbersPhysReg(PPC::CTR8)) + return true; + } + } + + return false; +} + +static bool verifyCTRBranch(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator BI = I; + SmallSet Visited; + SmallVector Preds; + bool CheckPreds; + + if (I == MBB->begin()) { + Visited.insert(MBB); + goto queue_preds; + } else + --I; + +check_block: + Visited.insert(MBB); + if (I == MBB->end()) + goto queue_preds; + + CheckPreds = true; + for (MachineBasicBlock::iterator IE = MBB->begin();; --I) { + unsigned Opc = I->getOpcode(); + if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) { + CheckPreds = false; + break; + } + + if (I != BI && clobbersCTR(*I)) { + LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName() + << ") instruction " << *I + << " clobbers CTR, invalidating " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " + << *BI << "\n"); + return false; + } + + if (I == IE) + break; + } + + if (!CheckPreds && Preds.empty()) + return true; + + if (CheckPreds) { +queue_preds: + if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { + LLVM_DEBUG(dbgs() << "Unable to find a MTCTR instruction for " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " + << *BI << "\n"); + return false; + } + + append_range(Preds, MBB->predecessors()); + } + + do { + MBB = Preds.pop_back_val(); + if (!Visited.count(MBB)) { + I = MBB->getLastNonDebugInstr(); + goto check_block; + } + } while (!Preds.empty()); + + return true; +} + +bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { + MDT = &getAnalysis(); + + // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before + // any other instructions that might clobber the ctr register. 
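// Editor's sketch, not part of the vendored patch: the verification walk of
// the relocated PPCCTRLoopsVerify pass in miniature. Starting from a
// bdnz/bdz, it scans backwards through the block and then through its
// predecessors, succeeding only if every path reaches an mtctr before any
// CTR clobber. Simplified graph model with hypothetical names.
#include <cassert>
#include <set>
#include <vector>

struct Block {
  std::vector<int> Insts; // +1 = mtctr, -1 = CTR clobber, 0 = anything else
  std::vector<const Block *> Preds;
};

static bool definesCTROnAllPaths(const Block *B,
                                 std::set<const Block *> &Visited) {
  if (!Visited.insert(B).second)
    return true; // block already examined on another path
  for (auto I = B->Insts.rbegin(); I != B->Insts.rend(); ++I) {
    if (*I == 1)
      return true;  // found the dominating mtctr
    if (*I == -1)
      return false; // a clobber before any mtctr invalidates the loop
  }
  if (B->Preds.empty())
    return false;   // reached the entry block without seeing an mtctr
  for (const Block *P : B->Preds)
    if (!definesCTROnAllPaths(P, Visited))
      return false;
  return true;
}

int main() {
  Block Entry{{1}, {}}; // mtctr in the preheader
  Block Latch{{0}, {}};
  Latch.Preds = {&Entry, &Latch}; // latch branches back to itself
  std::set<const Block *> Visited;
  assert(definesCTROnAllPaths(&Latch, Visited));
}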
+ for (MachineBasicBlock &MBB : MF) { + if (!MDT->isReachableFromEntry(&MBB)) + continue; + + for (MachineBasicBlock::iterator MII = MBB.getFirstTerminator(), + MIIE = MBB.end(); MII != MIIE; ++MII) { + unsigned Opc = MII->getOpcode(); + if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || + Opc == PPC::BDZ8 || Opc == PPC::BDZ) + if (!verifyCTRBranch(&MBB, MII)) + llvm_unreachable("Invalid PPC CTR loop!"); + } + } + + return false; +} +#endif // NDEBUG diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 1e81276f1de3..1901e8d1ebf1 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -363,3 +363,25 @@ def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec, def CSR_64_AllRegs_AIX_Dflt_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec, (sequence "VSL%u", 0, 19))>; + +def CSR_ALL_VSRP : CalleeSavedRegs<(sequence "VSRp%u", 0, 31)>; + +def CSR_VSRP : + CalleeSavedRegs<(add VSRp26, VSRp27, VSRp28, VSRp29, VSRp30, VSRp31)>; + +def CSR_SVR432_VSRP : CalleeSavedRegs<(add CSR_SVR432_Altivec, CSR_VSRP)>; + +def CSR_SVR464_VSRP : CalleeSavedRegs<(add CSR_PPC64_Altivec, CSR_VSRP)>; + +def CSR_SVR464_R2_VSRP : CalleeSavedRegs<(add CSR_SVR464_VSRP, X2)>; + +def CSR_SVR32_ColdCC_VSRP : CalleeSavedRegs<(add CSR_SVR32_ColdCC_Altivec, + (sub CSR_ALL_VSRP, VSRp17))>; + +def CSR_SVR64_ColdCC_VSRP : CalleeSavedRegs<(add CSR_SVR64_ColdCC, + (sub CSR_ALL_VSRP, VSRp17))>; + +def CSR_SVR64_ColdCC_R2_VSRP : CalleeSavedRegs<(add CSR_SVR64_ColdCC_VSRP, X2)>; + +def CSR_64_AllRegs_VSRP : + CalleeSavedRegs<(add CSR_64_AllRegs_VSX, CSR_ALL_VSRP)>; diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index e7cd107c5046..5c7f0619161c 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -775,7 +775,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { if (!OptPPCPred) return false; - PPC::Predicate PPCPred = OptPPCPred.getValue(); + PPC::Predicate PPCPred = *OptPPCPred; // Take advantage of fall-through opportunities. if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 65c969c196e1..0f70ec576af1 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -10,14 +10,15 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/PPCPredicates.h" #include "PPCFrameLowering.h" +#include "MCTargetDesc/PPCPredicates.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -625,7 +626,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // Work out frame sizes. uint64_t FrameSize = determineFrameLayoutAndUpdate(MF); int64_t NegFrameSize = -FrameSize; - if (!isInt<32>(FrameSize) || !isInt<32>(NegFrameSize)) + if (!isPPC64 && (!isInt<32>(FrameSize) || !isInt<32>(NegFrameSize))) llvm_unreachable("Unhandled stack size!"); if (MFI.isFrameAddressTaken()) @@ -660,10 +661,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, : PPC::STWU ); const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? 
PPC::STDUX : PPC::STWUX); - const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8 - : PPC::LIS ); - const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8 - : PPC::ORI ); const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR ); const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8 @@ -934,11 +931,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addImm(NegFrameSize); } else { assert(!SingleScratchReg && "Only a single scratch reg available"); - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) - .addReg(TempReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); + TII.materializeImmPostRA(MBB, MBBI, dl, TempReg, NegFrameSize); BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) .addReg(ScratchReg, RegState::Kill) .addReg(TempReg, RegState::Kill); @@ -957,11 +950,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addReg(SPReg); } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); + TII.materializeImmPostRA(MBB, MBBI, dl, ScratchReg, NegFrameSize); BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) .addReg(SPReg, RegState::Kill) .addReg(SPReg) @@ -1668,7 +1657,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // values from the stack, and set SPAdd to the value that needs to be added // to the SP at the end. The default values are as if red zone was present. unsigned RBReg = SPReg; - unsigned SPAdd = 0; + uint64_t SPAdd = 0; // Check if we can move the stack update instruction up the epilogue // past the callee saves. This will allow the move to LR instruction @@ -1726,11 +1715,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, AddImmInst, RBReg) .addReg(FPReg).addImm(FrameSize); } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) - .addImm(FrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addImm(FrameSize & 0xFFFF); + TII.materializeImmPostRA(MBB, MBBI, dl, ScratchReg, FrameSize); BuildMI(MBB, MBBI, dl, AddInst) .addReg(RBReg) .addReg(FPReg) @@ -1974,6 +1959,15 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + // Do not explicitly save the callee saved VSRp registers. + // The individual VSR subregisters will be saved instead. + SavedRegs.reset(PPC::VSRp26); + SavedRegs.reset(PPC::VSRp27); + SavedRegs.reset(PPC::VSRp28); + SavedRegs.reset(PPC::VSRp29); + SavedRegs.reset(PPC::VSRp30); + SavedRegs.reset(PPC::VSRp31); + // Save and clear the LR state. PPCFunctionInfo *FI = MF.getInfo(); unsigned LR = RegInfo->getRARegister(); @@ -2383,7 +2377,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // Map each VSR to GPRs to be spilled with into it. Single VSR can contain one // or two GPRs, so we need table to record information for later save/restore. 
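// Editor's sketch, not part of the vendored patch: the prologue/epilogue
// hunks above replace the open-coded LIS/ORI pair with
// PPCInstrInfo::materializeImmPostRA. The removed pattern built a 32-bit
// immediate from (Imm >> 16) via lis and (Imm & 0xFFFF) via ori; this
// standalone check shows that decomposition is value-preserving for 32-bit
// frame sizes (we model only the low 32 bits of the register here).
#include <cassert>
#include <cstdint>

static uint32_t materializeViaLisOri(int32_t Imm) {
  uint32_t U = static_cast<uint32_t>(Imm);
  uint32_t Reg = (U >> 16) << 16; // lis Reg, Imm >> 16 (high half, low half 0)
  Reg |= (U & 0xFFFF);            // ori Reg, Reg, Imm & 0xFFFF
  return Reg;
}

int main() {
  for (int32_t FrameSize : {0x12345678, 0x7FFFFFFF, 65536, 32752}) {
    int32_t NegFrameSize = -FrameSize;
    assert(materializeViaLisOri(NegFrameSize) ==
           static_cast<uint32_t>(NegFrameSize));
  }
}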
-  llvm::for_each(CSI, [&](const CalleeSavedInfo &Info) {
+  for (const CalleeSavedInfo &Info : CSI) {
     if (Info.isSpilledToReg()) {
       auto &SpilledVSR =
           VSRContainingGPRs.FindAndConstruct(Info.getDstReg()).second;
@@ -2394,7 +2388,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
       else
         SpilledVSR.second = Info.getReg();
     }
-  });
+  }
 
   for (const CalleeSavedInfo &I : CSI) {
     Register Reg = I.getReg();
diff --git a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
new file mode 100644
index 000000000000..00931b1f63b2
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
@@ -0,0 +1,149 @@
+//===-- PPCGenScalarMASSEntries.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation converts standard math functions into their
+// corresponding MASS (scalar) entries for PowerPC targets.
+// The following is an example of such a conversion:
+//     tanh ---> __xl_tanh_finite
+// Such lowering is legal under the fast-math option.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "ppc-gen-scalar-mass"
+
+using namespace llvm;
+
+namespace {
+
+class PPCGenScalarMASSEntries : public ModulePass {
+public:
+  static char ID;
+
+  PPCGenScalarMASSEntries() : ModulePass(ID) {
+    ScalarMASSFuncs = {
+#define TLI_DEFINE_SCALAR_MASS_FUNCS
+#include "llvm/Analysis/ScalarFuncs.def"
+    };
+  }
+
+  bool runOnModule(Module &M) override;
+
+  StringRef getPassName() const override {
+    return "PPC Generate Scalar MASS Entries";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired();
+  }
+
+private:
+  std::map<StringRef, StringRef> ScalarMASSFuncs;
+  bool isCandidateSafeToLower(const CallInst &CI) const;
+  bool isFiniteCallSafe(const CallInst &CI) const;
+  bool createScalarMASSCall(StringRef MASSEntry, CallInst &CI,
+                            Function &Func) const;
+};
+
+} // namespace
+
+// Returns true if the 'afn' flag exists on the call instruction of the math
+// function
+bool PPCGenScalarMASSEntries::isCandidateSafeToLower(const CallInst &CI) const {
+  // skip functions with no scalar or vector FP type (like cosisin)
+  if (!isa<FPMathOperator>(CI))
+    return false;
+
+  return CI.hasApproxFunc();
+}
+
+// Returns true if the 'nnan', 'ninf' and 'nsz' flags exist on the call
+// instruction of the math function
+bool PPCGenScalarMASSEntries::isFiniteCallSafe(const CallInst &CI) const {
+  // skip functions with no scalar or vector FP type (like cosisin)
+  if (!isa<FPMathOperator>(CI))
+    return false;
+
+  // FIXME: no-errno and trapping-math need to be set for the MASS conversion,
+  // but they don't have an IR representation.
+  return CI.hasNoNaNs() && CI.hasNoInfs() && CI.hasNoSignedZeros();
+}
+
+/// Lowers scalar math functions to scalar MASS functions.
+/// e.g.: tanh --> __xl_tanh_finite or __xl_tanh
+/// Both the function prototype and its call site are updated during lowering.
+bool PPCGenScalarMASSEntries::createScalarMASSCall(StringRef MASSEntry, + CallInst &CI, + Function &Func) const { + if (CI.use_empty()) + return false; + + Module *M = Func.getParent(); + assert(M && "Expecting a valid Module"); + + std::string MASSEntryStr = MASSEntry.str(); + if (isFiniteCallSafe(CI)) + MASSEntryStr += "_finite"; + + FunctionCallee FCache = M->getOrInsertFunction( + MASSEntryStr, Func.getFunctionType(), Func.getAttributes()); + + CI.setCalledFunction(FCache); + + return true; +} + +bool PPCGenScalarMASSEntries::runOnModule(Module &M) { + bool Changed = false; + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC || skipModule(M)) + return false; + + for (Function &Func : M) { + if (!Func.isDeclaration()) + continue; + + auto Iter = ScalarMASSFuncs.find(Func.getName()); + if (Iter == ScalarMASSFuncs.end()) + continue; + + // The call to createScalarMASSCall() invalidates the iterator over users + // upon replacing the users. Precomputing the current list of users allows + // us to replace all the call sites. + SmallVector TheUsers; + for (auto *User : Func.users()) + TheUsers.push_back(User); + + for (auto *User : TheUsers) + if (auto *CI = dyn_cast_or_null(User)) { + if (isCandidateSafeToLower(*CI)) + Changed |= createScalarMASSCall(Iter->second, *CI, Func); + } + } + + return Changed; +} + +char PPCGenScalarMASSEntries::ID = 0; + +char &llvm::PPCGenScalarMASSEntriesID = PPCGenScalarMASSEntries::ID; + +INITIALIZE_PASS(PPCGenScalarMASSEntries, DEBUG_TYPE, + "Generate Scalar MASS entries", false, false) + +ModulePass *llvm::createPPCGenScalarMASSEntriesPass() { + return new PPCGenScalarMASSEntries(); +} diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index fdcf6e7e80f2..4247cf557c2a 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -188,7 +189,7 @@ namespace { } /// getSmallIPtrImm - Return a target constant of pointer type. - inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) { + inline SDValue getSmallIPtrImm(uint64_t Imm, const SDLoc &dl) { return CurDAG->getTargetConstant( Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout())); } @@ -202,7 +203,7 @@ namespace { /// base register. Return the virtual register that holds this value. SDNode *getGlobalBaseReg(); - void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0); + void selectFrameIndex(SDNode *SN, SDNode *N, uint64_t Offset = 0); // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. 
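// Editor's sketch, not part of the vendored patch: the decision logic of
// PPCGenScalarMASSEntries in miniature. A declared math function is looked
// up in the scalar MASS table; when the call carries nnan+ninf+nsz the
// "_finite" variant is chosen, otherwise the plain "__xl_" entry. The table
// contents below are illustrative, not the full ScalarFuncs.def list.
#include <cassert>
#include <map>
#include <string>

static const std::map<std::string, std::string> ScalarMASSFuncs = {
    {"tanh", "__xl_tanh"}, {"sin", "__xl_sin"}, {"pow", "__xl_pow"}};

static std::string selectMASSEntry(const std::string &Callee,
                                   bool HasApproxFunc, bool IsFiniteSafe) {
  auto It = ScalarMASSFuncs.find(Callee);
  if (It == ScalarMASSFuncs.end() || !HasApproxFunc)
    return Callee; // not a candidate; keep the original call
  return IsFiniteSafe ? It->second + "_finite" : It->second;
}

int main() {
  assert(selectMASSEntry("tanh", true, true) == "__xl_tanh_finite");
  assert(selectMASSEntry("tanh", true, false) == "__xl_tanh");
  assert(selectMASSEntry("tanh", false, true) == "tanh"); // no 'afn' flag
  assert(selectMASSEntry("cbrt", true, true) == "cbrt");  // not in the table
}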
@@ -639,7 +640,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { && isInt32Immediate(N->getOperand(1).getNode(), Imm); } -void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) { +void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, uint64_t Offset) { SDLoc dl(SN); int FI = cast(N)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0)); @@ -4645,7 +4646,8 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, static bool isSWTestOp(SDValue N) { if (N.getOpcode() == PPCISD::FTSQRT) return true; - if (N.getNumOperands() < 1 || !isa(N.getOperand(0))) + if (N.getNumOperands() < 1 || !isa(N.getOperand(0)) || + N.getOpcode() != ISD::INTRINSIC_WO_CHAIN) return false; switch (N.getConstantOperandVal(0)) { case Intrinsic::ppc_vsx_xvtdivdp: @@ -5377,7 +5379,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // If this is equivalent to an add, then we can fold it with the // FrameIndex calculation. if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) { - selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); + selectFrameIndex(N, N->getOperand(0).getNode(), (int64_t)Imm); return; } } @@ -5435,7 +5437,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { - selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); + selectFrameIndex(N, N->getOperand(0).getNode(), (int64_t)Imm); return; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index cbeae0ab03b8..5b9d1e66b04e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -126,6 +126,16 @@ static cl::opt EnableQuadwordAtomics( cl::desc("enable quadword lock-free atomic operations"), cl::init(false), cl::Hidden); +static cl::opt + DisablePerfectShuffle("ppc-disable-perfect-shuffle", + cl::desc("disable vector permute decomposition"), + cl::init(true), cl::Hidden); + +cl::opt DisableAutoPairedVecSt( + "disable-auto-paired-vec-st", + cl::desc("disable automatically generated 32byte paired vector stores"), + cl::init(true), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -379,6 +389,25 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); + + // MASS transformation for LLVM intrinsics with replicating fast-math flag + // to be consistent to PPCGenScalarMASSEntries pass + if (TM.getOptLevel() == CodeGenOpt::Aggressive && + TM.Options.PPCGenScalarMASSEntries) { + setOperationAction(ISD::FSIN , MVT::f64, Custom); + setOperationAction(ISD::FCOS , MVT::f64, Custom); + setOperationAction(ISD::FPOW , MVT::f64, Custom); + setOperationAction(ISD::FLOG, MVT::f64, Custom); + setOperationAction(ISD::FLOG10, MVT::f64, Custom); + setOperationAction(ISD::FEXP, MVT::f64, Custom); + setOperationAction(ISD::FSIN , MVT::f32, Custom); + setOperationAction(ISD::FCOS , MVT::f32, Custom); + setOperationAction(ISD::FPOW , MVT::f32, Custom); + setOperationAction(ISD::FLOG, MVT::f32, Custom); + setOperationAction(ISD::FLOG10, MVT::f32, Custom); + setOperationAction(ISD::FEXP, MVT::f32, Custom); + } + if (Subtarget.hasSPE()) { 
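// Editor's sketch, not part of the vendored patch: why selectFrameIndex's
// Offset parameter widened from 'unsigned' to 'uint64_t' with (int64_t)
// casts at the call sites in the hunks above. A negative 16-bit displacement
// must arrive sign-extended; funnelling it through a 32-bit unsigned
// parameter drops the high 32 bits of the sign extension.
#include <cassert>
#include <cstdint>

static uint64_t throughUnsigned(int16_t Imm) {
  unsigned Offset = (int)Imm; // old signature: value wraps into 32 bits
  return Offset;              // zero-extends 0xFFFFFFFC into 64 bits
}

static uint64_t throughUint64(int16_t Imm) {
  uint64_t Offset = (int64_t)Imm; // new signature: proper sign extension
  return Offset;
}

int main() {
  int16_t Imm = -4;
  assert(throughUnsigned(Imm) == 0xFFFFFFFCu);         // wrong for addressing
  assert(throughUint64(Imm) == 0xFFFFFFFFFFFFFFFCull); // correct
}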
setOperationAction(ISD::FMA , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f32, Expand); @@ -603,6 +632,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom); // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); @@ -1000,7 +1031,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::STORE, MVT::v2f64, Legal); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); if (Subtarget.hasP8Vector()) addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); @@ -1048,7 +1079,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2i64, Promote); AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal); @@ -1264,6 +1295,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); } + + if (Subtarget.hasP10Vector()) { + setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); + } } if (Subtarget.pairedVectorMemops()) { @@ -1291,8 +1326,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); } - if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) { - setMaxAtomicSizeInBitsSupported(128); + if (shouldInlineQuadwordAtomics()) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom); @@ -1305,57 +1339,46 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); } + setLibcallName(RTLIB::MULO_I128, nullptr); if (!isPPC64) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); setLibcallName(RTLIB::MULO_I64, nullptr); } if (!isPPC64) setMaxAtomicSizeInBitsSupported(32); + else if (shouldInlineQuadwordAtomics()) + setMaxAtomicSizeInBitsSupported(128); + else + setMaxAtomicSizeInBitsSupported(64); setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::FMA); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine({ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL, ISD::MUL, + ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR}); if (Subtarget.hasFPCVT()) setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::BR_CC); + setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC}); if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); - setTargetDAGCombine(ISD::BSWAP); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN, + ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID}); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND}); + setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE}); if (Subtarget.useCRBits()) { - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC}); } if (Subtarget.hasP9Altivec()) { - setTargetDAGCombine(ISD::ABS); - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine({ISD::ABS, ISD::VSELECT}); } setLibcallName(RTLIB::LOG_F128, "logf128"); @@ -1586,8 +1609,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; case PPCISD::FSEL: return "PPCISD::FSEL"; - case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP"; - case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP"; + case PPCISD::XSMAXC: return "PPCISD::XSMAXC"; + case PPCISD::XSMINC: return "PPCISD::XSMINC"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; @@ -1865,8 +1888,7 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - const PPCSubtarget& Subtarget = - static_cast(DAG.getSubtarget()); + const PPCSubtarget &Subtarget = DAG.getSubtarget(); if (!Subtarget.hasP8Vector()) return false; @@ -2120,7 +2142,11 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, /// specifies a splat of a single element that is suitable for input to /// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.). 
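// Editor's sketch, not part of the vendored patch: the two-element splat
// handling added to isSplatShuffleMask and getSplatIdxForPPCMnemonics below.
// A v2i64/v2f64 shuffle is a splat when both mask slots name the same source
// element, and on little-endian the mnemonic index is mirrored (1 - elt).
// Plain C++ over an int mask, with hypothetical helper names.
#include <array>
#include <cassert>

static bool isTwoEltSplat(const std::array<int, 2> &Mask) {
  return Mask[0] == Mask[1];
}

static int splatIdxForMnemonic(const std::array<int, 2> &Mask,
                               bool IsLittleEndian) {
  assert(isTwoEltSplat(Mask));
  return IsLittleEndian ? 1 - Mask[0] : Mask[0];
}

int main() {
  std::array<int, 2> SplatLo = {0, 0}, SplatHi = {1, 1};
  assert(splatIdxForMnemonic(SplatLo, /*LE=*/true) == 1);  // mirrored on LE
  assert(splatIdxForMnemonic(SplatLo, /*LE=*/false) == 0); // as-is on BE
  assert(splatIdxForMnemonic(SplatHi, /*LE=*/true) == 0);
  assert(!isTwoEltSplat({0, 1})); // identity shuffle, not a splat
}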
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1); + + assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) && EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two @@ -2421,6 +2447,12 @@ unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); + EVT VT = SVOp->getValueType(0); + + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0) + : SVOp->getMaskElt(0); + if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else @@ -2957,15 +2989,15 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, bool isLoad = true; SDValue Ptr; EVT VT; - unsigned Alignment; + Align Alignment; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Alignment = LD->getAlignment(); + Alignment = LD->getAlign(); } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Alignment = ST->getAlignment(); + Alignment = ST->getAlign(); isLoad = false; } else return false; @@ -3009,7 +3041,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } else { // LDU/STU need an address with at least 4-byte alignment. - if (Alignment < 4) + if (Alignment < Align(4)) return false; if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4))) @@ -4416,8 +4448,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, - MachinePointerInfo(&*FuncArg, j)); + unsigned StoreSizeInBits = std::min(PtrByteSize, (ObjSize - j)) * 8; + EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), StoreSizeInBits); + SDValue Store = + DAG.getTruncStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(&*FuncArg, j), ObjType); MemOps.push_back(Store); ++GPR_idx; } @@ -6254,8 +6289,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); if (GPR_idx != NumGPRs) { - SDValue Load = - DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); + unsigned LoadSizeInBits = std::min(PtrByteSize, (Size - j)) * 8; + EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), LoadSizeInBits); + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, AddArg, + MachinePointerInfo(), ObjType); + MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; @@ -6888,8 +6926,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( if (useSoftFloat()) report_fatal_error("Soft float support is unimplemented on AIX."); - const PPCSubtarget &Subtarget = - static_cast(DAG.getSubtarget()); + const PPCSubtarget &Subtarget = DAG.getSubtarget(); const bool IsPPC64 = Subtarget.isPPC64(); const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; @@ -7194,8 +7231,7 @@ SDValue PPCTargetLowering::LowerCall_AIX( if (CFlags.IsPatchPoint) report_fatal_error("This call type is unimplemented on AIX."); - const PPCSubtarget& Subtarget = - static_cast(DAG.getSubtarget()); + const PPCSubtarget &Subtarget = DAG.getSubtarget(); MachineFunction &MF = DAG.getMachineFunction(); SmallVector ArgLocs; @@ -7879,7 +7915,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDNodeFlags Flags = Op.getNode()->getFlags(); - // We have xsmaxcdp/xsmincdp which are OK to emit even in the + // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the // presence of infinities. if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) { switch (CC) { @@ -7887,10 +7923,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { break; case ISD::SETOGT: case ISD::SETGT: - return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS); + return DAG.getNode(PPCISD::XSMAXC, dl, Op.getValueType(), LHS, RHS); case ISD::SETOLT: case ISD::SETLT: - return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS); + return DAG.getNode(PPCISD::XSMINC, dl, Op.getValueType(), LHS, RHS); } } @@ -9037,7 +9073,7 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; - if (InputLoad->getOpcode() == ISD::BITCAST) + while (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { @@ -9801,7 +9837,7 @@ SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { SDValue N1 = peekThroughBitcasts(Op.getOperand(1)); unsigned SHLAmt = N1.getConstantOperandVal(0); if (SHLAmt % 8 == 0) { - SmallVector Mask(16, 0); + std::array Mask; std::iota(Mask.begin(), Mask.end(), 0); std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end()); if (SDValue Shuffle = @@ -9903,6 +9939,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return LdSplt; } } + + // All v2i64 and v2f64 shuffles are legal + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return Op; + if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -10048,56 +10089,59 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); - unsigned PFIndexes[4]; - bool isFourElementShuffle = true; - for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number - unsigned EltNo = 8; // Start out undef. - for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. - if (PermMask[i*4+j] < 0) - continue; // Undef, ignore it. - - unsigned ByteSource = PermMask[i*4+j]; - if ((ByteSource & 3) != j) { - isFourElementShuffle = false; - break; - } + if (!DisablePerfectShuffle && !isLittleEndian) { + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; + ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i * 4 + j] < 0) + continue; // Undef, ignore it. 
+ + unsigned ByteSource = PermMask[i * 4 + j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } - if (EltNo == 8) { - EltNo = ByteSource/4; - } else if (EltNo != ByteSource/4) { - isFourElementShuffle = false; - break; + if (EltNo == 8) { + EltNo = ByteSource / 4; + } else if (EltNo != ByteSource / 4) { + isFourElementShuffle = false; + break; + } } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be + // computed. For example, if the perm mask can be hoisted out of a loop or + // is already used (perhaps because there are multiple permutes with the + // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the + // permute mask out of the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can + // be generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - PFIndexes[i] = EltNo; - } - - // If this shuffle can be expressed as a shuffle of 4-byte elements, use the - // perfect shuffle vector to determine if it is cost effective to do this as - // discrete instructions, or whether we should use a vperm. - // For now, we skip this for little endian until such time as we have a - // little-endian perfect shuffle table. - if (isFourElementShuffle && !isLittleEndian) { - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = - PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // Determining when to avoid vperm is tricky. Many things affect the cost - // of vperm, particularly how many times the perm mask needs to be computed. - // For example, if the perm mask can be hoisted out of a loop or is already - // used (perhaps because there are multiple permutes with the same shuffle - // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of - // the loop requires an extra register. - // - // As a compromise, we only emit discrete instructions if the shuffle can be - // generated in 3 or fewer operations. When we have loop information - // available, if this block is within a loop, we should avoid using vperm - // for 3-operation perms and use a constant pool load instead. 
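// Editor's sketch, not part of the vendored patch: the perfect-shuffle
// lookup the restructured block above performs. Each of the four 4-byte
// lanes maps to a source element 0-7, or 8 for undef, so the four lane
// indices address a base-9 table; the stored entry carries its cost in the
// top two bits. The entry value below is made up for illustration.
#include <cassert>
#include <cstdint>

static unsigned pfTableIndex(const unsigned (&PFIndexes)[4]) {
  return PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 +
         PFIndexes[3];
}

int main() {
  unsigned Identity[4] = {0, 1, 2, 3};
  assert(pfTableIndex(Identity) == 0 * 729 + 1 * 81 + 2 * 9 + 3);

  uint32_t PFEntry = 0x80000000u; // hypothetical entry, cost in bits 31:30
  unsigned Cost = PFEntry >> 30;  // here: 2
  bool UseDiscreteOps = Cost < 3; // else fall back to vperm + constant pool
  assert(Cost == 2 && UseDiscreteOps);
}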
- if (Cost < 3) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant @@ -10518,6 +10562,16 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}), 0); } + case Intrinsic::ppc_fnmsub: { + EVT VT = Op.getOperand(1).getValueType(); + if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128)) + return DAG.getNode( + ISD::FNEG, dl, VT, + DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2), + DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3)))); + return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } case Intrinsic::ppc_convert_f128_to_ppcf128: case Intrinsic::ppc_convert_ppcf128_to_f128: { RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128 @@ -10529,6 +10583,31 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, dl, SDValue()); return Result.first; } + case Intrinsic::ppc_maxfe: + case Intrinsic::ppc_maxfl: + case Intrinsic::ppc_maxfs: + case Intrinsic::ppc_minfe: + case Intrinsic::ppc_minfl: + case Intrinsic::ppc_minfs: { + EVT VT = Op.getValueType(); + assert( + all_of(Op->ops().drop_front(4), + [VT](const SDUse &Use) { return Use.getValueType() == VT; }) && + "ppc_[max|min]f[e|l|s] must have uniform type arguments"); + (void)VT; + ISD::CondCode CC = ISD::SETGT; + if (IntrinsicID == Intrinsic::ppc_minfe || + IntrinsicID == Intrinsic::ppc_minfl || + IntrinsicID == Intrinsic::ppc_minfs) + CC = ISD::SETLT; + unsigned I = Op.getNumOperands() - 2, Cnt = I; + SDValue Res = Op.getOperand(I); + for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) { + Res = + DAG.getSelectCC(dl, Res, Op.getOperand(I), Res, Op.getOperand(I), CC); + } + return Res; + } } // If this is a lowered altivec predicate compare, CompareOpc is set to the @@ -11055,6 +11134,12 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::FPOW: return lowerPow(Op, DAG); + case ISD::FSIN: return lowerSin(Op, DAG); + case ISD::FCOS: return lowerCos(Op, DAG); + case ISD::FLOG: return lowerLog(Op, DAG); + case ISD::FLOG10: return lowerLog10(Op, DAG); + case ISD::FEXP: return lowerExp(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -11183,6 +11268,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, N->getOperand(2), N->getOperand(1))); break; + case Intrinsic::ppc_maxfe: + case Intrinsic::ppc_minfe: + case Intrinsic::ppc_fnmsub: case Intrinsic::ppc_convert_f128_to_ppcf128: Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)); break; @@ -14075,13 +14163,13 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { assert(LD1 && "Input needs to be a LoadSDNode."); return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), - LD1->getAlignment()); + LD1->getAlign()); } if (InputsAreReverseConsecutive) { assert(LDL && "Input needs to be a LoadSDNode."); - SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), - LDL->getBasePtr(), LDL->getPointerInfo(), - 
LDL->getAlignment());
+    SDValue Load =
+        DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), LDL->getBasePtr(),
+                    LDL->getPointerInfo(), LDL->getAlign());
     SmallVector<int, 16> Ops;
     for (int i = N->getNumOperands() - 1; i >= 0; i--)
       Ops.push_back(i);
@@ -14469,6 +14557,11 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
 // builtins) into loads with swaps.
 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
+  // Delay VSX load for LE combine until after LegalizeOps to prioritize other
+  // load combines.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   SDValue Chain;
@@ -14503,13 +14596,6 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
 
   MVT VecTy = N->getValueType(0).getSimpleVT();
 
-  // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
-  // aligned and the type is a vector with elements up to 4 bytes
-  if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
-      VecTy.getScalarSizeInBits() <= 32) {
-    return SDValue();
-  }
-
   SDValue LoadOps[] = { Chain, Base };
   SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
                                          DAG.getVTList(MVT::v2f64, MVT::Other),
@@ -14537,6 +14623,11 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
 // builtins) into stores with swaps.
 SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
+  // Delay VSX store for LE combine until after LegalizeOps to prioritize other
+  // store combines.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   SDValue Chain;
@@ -14574,13 +14665,6 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
   SDValue Src = N->getOperand(SrcOpnd);
   MVT VecTy = Src.getValueType().getSimpleVT();
 
-  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is
-  // aligned and the type is a vector with elements up to 4 bytes
-  if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
-      VecTy.getScalarSizeInBits() <= 32) {
-    return SDValue();
-  }
-
   // All stores are done as v2f64 and possible bit cast.
   if (VecTy != MVT::v2f64) {
     Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
@@ -14806,6 +14890,17 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
   SDValue SToVLHS = isScalarToVec(LHS);
   SDValue SToVRHS = isScalarToVec(RHS);
   if (SToVLHS || SToVRHS) {
+    // FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the
+    // same type and have differing element sizes, then do not perform
+    // the following transformation. The current transformation for
+    // SCALAR_TO_VECTOR assumes that both input vectors have the same
+    // element size. This will be updated in the future to account for
+    // differing sizes of the LHS and RHS.
+    if (SToVLHS && SToVRHS &&
+        (SToVLHS.getValueType().getScalarSizeInBits() !=
+         SToVRHS.getValueType().getScalarSizeInBits()))
+      return Res;
+
     int NumEltsIn = SToVLHS ?
SToVLHS.getValueType().getVectorNumElements() : SToVRHS.getValueType().getVectorNumElements(); int NumEltsOut = ShuffV.size(); @@ -14889,24 +14984,36 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // Example (even elements from first vector): // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, if (Mask[0] < NumElts) - for (int i = 1, e = Mask.size(); i < e; i += 2) + for (int i = 1, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; ShuffV[i] = (ShuffV[i - 1] + NumElts); + } // Example (odd elements from first vector): // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, else - for (int i = 0, e = Mask.size(); i < e; i += 2) + for (int i = 0, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; ShuffV[i] = (ShuffV[i + 1] + NumElts); + } } else { // Example (even elements from first vector): // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> , t1 if (Mask[0] < NumElts) - for (int i = 0, e = Mask.size(); i < e; i += 2) + for (int i = 0, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; ShuffV[i] = ShuffV[i + 1] - NumElts; + } // Example (odd elements from first vector): // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> , t1 else - for (int i = 1, e = Mask.size(); i < e; i += 2) + for (int i = 1, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; ShuffV[i] = ShuffV[i - 1] - NumElts; + } } // If the RHS has undefs, we need to remove them since we may have created @@ -15223,7 +15330,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, auto MMOFlags = LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, - LD->getPointerInfo(), LD->getAlignment(), + LD->getPointerInfo(), LD->getAlign(), MMOFlags, LD->getAAInfo()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), @@ -15231,7 +15338,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue FloatLoad2 = DAG.getLoad( MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, LD->getPointerInfo().getWithOffset(4), - MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); + commonAlignment(LD->getAlign(), 4), MMOFlags, LD->getAAInfo()); if (LD->isIndexed()) { // Note that DAGCombine should re-form any pre-increment load(s) from @@ -15544,7 +15651,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); SDValue BasePtr = LD->getBasePtr(); SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, - LD->getPointerInfo(), LD->getAlignment()); + LD->getPointerInfo(), LD->getAlign()); Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(4, dl)); @@ -17718,6 +17825,114 @@ bool PPCTargetLowering::splitValueIntoRegisterParts( return false; } +SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, + SelectionDAG &DAG) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + TargetLowering::CallLoweringInfo CLI(DAG); + EVT RetVT = Op.getValueType(); + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + SDValue Callee = + DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout())); + bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (const SDValue &N : Op->op_values()) { + EVT ArgVT = N.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 
+ Entry.Node = N; + Entry.Ty = ArgTy; + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend); + Entry.IsZExt = !Entry.IsSExt; + Args.push_back(Entry); + } + + SDValue InChain = DAG.getEntryNode(); + SDValue TCChain = InChain; + const Function &F = DAG.getMachineFunction().getFunction(); + bool isTailCall = + TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && + (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy()); + if (isTailCall) + InChain = TCChain; + CLI.setDebugLoc(SDLoc(Op)) + .setChain(InChain) + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) + .setTailCall(isTailCall) + .setSExtResult(SignExtend) + .setZExtResult(!SignExtend) + .setIsPostTypeLegalization(true); + return TLI.LowerCallTo(CLI).first; +} + +SDValue PPCTargetLowering::lowerLibCallBasedOnType( + const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::f32) + return lowerToLibCall(LibCallFloatName, Op, DAG); + + if (Op.getValueType() == MVT::f64) + return lowerToLibCall(LibCallDoubleName, Op, DAG); + + return SDValue(); +} + +bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const { + SDNodeFlags Flags = Op.getNode()->getFlags(); + return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() && + Flags.hasNoNaNs() && Flags.hasNoInfs(); +} + +bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const { + return Op.getNode()->getFlags().hasApproximateFuncs(); +} + +SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName, + const char *LibCallFloatName, + const char *LibCallDoubleNameFinite, + const char *LibCallFloatNameFinite, + SDValue Op, + SelectionDAG &DAG) const { + if (!isLowringToMASSSafe(Op)) + return SDValue(); + + if (!isLowringToMASSFiniteSafe(Op)) + return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op, + DAG); + + return lowerLibCallBasedOnType(LibCallFloatNameFinite, + LibCallDoubleNameFinite, Op, DAG); +} + +SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_pow", "__xl_powf", "__xl_pow_finite", + "__xl_powf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_sin", "__xl_sinf", "__xl_sin_finite", + "__xl_sinf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_cos", "__xl_cosf", "__xl_cos_finite", + "__xl_cosf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_log", "__xl_logf", "__xl_log_finite", + "__xl_logf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_log10", "__xl_log10f", "__xl_log10_finite", + "__xl_log10f_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_exp", "__xl_expf", "__xl_exp_finite", + "__xl_expf_finite", Op, DAG); +} + // If we happen to match to an aligned D-Form, check if the Frame Index is // adequately aligned. If it is not, reset the mode to match to X-Form. 
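To restate the gating that lowerLibCallBase implements above: without the approximate-functions fast-math flag nothing is lowered, and the __xl_*_finite entry points are only used when no-signed-zeros, no-NaNs and no-infinities are all set on the node. A compact sketch of that decision for pow (the struct and helper names here are illustrative stand-ins, not LLVM API):

struct Flags {
  bool ApproxFuncs, NoSignedZeros, NoNaNs, NoInfs;
};

// Returns the MASS callee to use, or nullptr to keep the generic FPOW node.
const char *selectMASSPowCallee(Flags F, bool IsF32) {
  if (!F.ApproxFuncs)
    return nullptr; // corresponds to isLowringToMASSSafe() failing
  if (F.NoSignedZeros && F.NoNaNs && F.NoInfs) // isLowringToMASSFiniteSafe()
    return IsF32 ? "__xl_powf_finite" : "__xl_pow_finite";
  return IsF32 ? "__xl_powf" : "__xl_pow";
}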
static void setXFormForUnalignedFI(SDValue N, unsigned Flags, @@ -17878,10 +18093,18 @@ CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC, } } +bool PPCTargetLowering::shouldInlineQuadwordAtomics() const { + // TODO: 16-byte atomic type support for AIX is in progress; we should be able + // to inline 16-byte atomic ops on AIX too in the future. + return Subtarget.isPPC64() && + (EnableQuadwordAtomics || !Subtarget.getTargetTriple().isOSAIX()) && + Subtarget.hasQuadwordAtomics(); +} + TargetLowering::AtomicExpansionKind PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + if (shouldInlineQuadwordAtomics() && Size == 128) return AtomicExpansionKind::MaskedIntrinsic; return TargetLowering::shouldExpandAtomicRMWInIR(AI); } @@ -17889,7 +18112,7 @@ PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { TargetLowering::AtomicExpansionKind PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits(); - if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + if (shouldInlineQuadwordAtomics() && Size == 128) return AtomicExpansionKind::MaskedIntrinsic; return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI); } @@ -17919,10 +18142,9 @@ getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) { Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const { - assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && - "Only support quadword now"); + assert(shouldInlineQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = AlignedAddr->getType()->getPointerElementType(); + Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *RMW = Intrinsic::getDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); @@ -17944,10 +18166,9 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { - assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && - "Only support quadword now"); + assert(shouldInlineQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = AlignedAddr->getType()->getPointerElementType(); + Type *ValTy = CmpVal->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index eb52e4aa6273..f92a117fe27f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -51,9 +51,9 @@ namespace llvm { /// FSEL, - /// XSMAXCDP, XSMINCDP - C-type min/max instructions. - XSMAXCDP, - XSMINCDP, + /// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. 
+ XSMAXC, + XSMINC, /// FCFID - The FCFID instruction, taking an f64 operand and producing /// and f64 value containing the FP representation of the integer that @@ -77,7 +77,7 @@ namespace llvm { FCTIDUZ, FCTIWUZ, - /// Floating-point-to-interger conversion instructions + /// Floating-point-to-integer conversion instructions FP_TO_UINT_IN_VSR, FP_TO_SINT_IN_VSR, @@ -765,8 +765,19 @@ namespace llvm { /// then the VPERM for the shuffle. All in all a very slow sequence. TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override { - if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - VT.getScalarSizeInBits() % 8 == 0) + // Default handling for scalable and single-element vectors. + if (VT.isScalableVector() || VT.getVectorNumElements() == 1) + return TargetLoweringBase::getPreferredVectorAction(VT); + + // Split and promote vNi1 vectors so we don't produce v256i1/v512i1 + // types as those are only for MMA instructions. + if (VT.getScalarSizeInBits() == 1 && VT.getSizeInBits() > 16) + return TypeSplitVector; + if (VT.getScalarSizeInBits() == 1) + return TypePromoteInteger; + + // Widen vectors that have reasonably sized elements. + if (VT.getScalarSizeInBits() % 8 == 0) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -899,6 +910,8 @@ namespace llvm { Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; + bool shouldInlineQuadwordAtomics() const; + TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; @@ -1273,6 +1286,24 @@ namespace llvm { SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerToLibCall(const char *LibCallName, SDValue Op, + SelectionDAG &DAG) const; + SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, + const char *LibCallDoubleName, SDValue Op, + SelectionDAG &DAG) const; + bool isLowringToMASSFiniteSafe(SDValue Op) const; + bool isLowringToMASSSafe(SDValue Op) const; + SDValue lowerLibCallBase(const char *LibCallDoubleName, + const char *LibCallFloatName, + const char *LibCallDoubleNameFinite, + const char *LibCallFloatNameFinite, SDValue Op, + SelectionDAG &DAG) const; + SDValue lowerPow(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSin(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCos(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLog(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLog10(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerExp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index eae8e36e475e..dbe7a7805c61 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -580,6 +580,14 @@ def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), PPC970_DGroup_First, PPC970_Unit_FXU; } + +let hasSideEffects = 1, Defs = [CTR8] in +def MTCTR8Pseudo : PPCEmitTimePseudo<(outs), (ins g8rc:$rS), "#MTCTR8Pseudo", []>; + +let hasSideEffects = 1, Uses = [CTR8], Defs = [CTR8] in +def DecreaseCTR8Pseudo : PPCEmitTimePseudo<(outs crbitrc:$rT), (ins i64imm:$stride), + 
"#DecreaseCTR8Pseudo", []>; + let Pattern = [(set i64:$rT, readcyclecounter)] in def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins), "mfspr $rT, 268", IIC_SprMFTB>, @@ -1014,8 +1022,6 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; } -def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins u2imm:$L), - "darn $RT, $L", IIC_LdStLD>, isPPC64; def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D), "addpcis $RT, $D", IIC_BrB, []>, isPPC64; def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), @@ -1040,6 +1046,11 @@ def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>; } +let hasSideEffects = 1 in { +def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins u2imm:$L), + "darn $RT, $L", IIC_LdStLD>, isPPC64; +} + let hasSideEffects = 0 in { defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA), (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE), @@ -1396,10 +1407,6 @@ def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), "ldux $rD, $addr", IIC_LdStLDUX, []>, RegConstraint<"$addr.ptrreg = $ea_result">, NoEncode<"$ea_result">, isPPC64; - -def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), - "ldmx $rD, $src", IIC_LdStLD, []>, isPPC64, - Requires<[IsISA3_0]>; } let mayLoad = 1, hasNoSchedulingInfo = 1 in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index eada872c2a7d..59486c323567 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2218,7 +2218,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, .addReg(Pred[1].getReg(), RegState::ImplicitDefine); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); - MI.RemoveOperand(0); + MI.removeOperand(0); MI.setDesc(get(PPC::BC)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) @@ -2226,7 +2226,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, .addMBB(MBB); } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); - MI.RemoveOperand(0); + MI.removeOperand(0); MI.setDesc(get(PPC::BCn)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) @@ -2234,7 +2234,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, .addMBB(MBB); } else { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); - MI.RemoveOperand(0); + MI.removeOperand(0); MI.setDesc(get(PPC::BCC)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) @@ -2714,8 +2714,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, } // If we've set the mask, we can transform. 
if (Mask != ~0LLU) { - MI->RemoveOperand(4); - MI->RemoveOperand(3); + MI->removeOperand(4); + MI->removeOperand(3); MI->getOperand(2).setImm(Mask); NumRcRotatesConvertedToRcAnd++; } @@ -2724,7 +2724,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (MB >= 48) { uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; NewOpC = PPC::ANDI8_rec; - MI->RemoveOperand(3); + MI->removeOperand(3); MI->getOperand(2).setImm(Mask); NumRcRotatesConvertedToRcAnd++; } @@ -3026,8 +3026,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case PPC::KILL_PAIR: { MI.setDesc(get(PPC::UNENCODED_NOP)); - MI.RemoveOperand(1); - MI.RemoveOperand(0); + MI.removeOperand(1); + MI.removeOperand(0); return true; } case TargetOpcode::LOAD_STACK_GUARD: { @@ -3122,7 +3122,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(PPC::CR7) .addImm(1); MI.setDesc(get(PPC::ISYNC)); - MI.RemoveOperand(0); + MI.removeOperand(0); return true; } } @@ -3188,7 +3188,7 @@ void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI, // - implicit reg uses // Therefore, removing the implicit operand won't change the explicit // operands layout. - MI.RemoveOperand(UseOpIdx); + MI.removeOperand(UseOpIdx); } } @@ -3199,7 +3199,7 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI, // Remove existing operands. int OperandToKeep = LII.SetCR ? 1 : 0; for (int i = MI.getNumOperands() - 1; i > OperandToKeep; i--) - MI.RemoveOperand(i); + MI.removeOperand(i); // Replace the instruction. if (LII.SetCR) { @@ -3234,6 +3234,47 @@ MachineInstr *PPCInstrInfo::getDefMIPostRA(unsigned Reg, MachineInstr &MI, return nullptr; } +void PPCInstrInfo::materializeImmPostRA(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + int64_t Imm) const { + assert(!MBB.getParent()->getRegInfo().isSSA() && + "Register should be in non-SSA form after RA"); + bool isPPC64 = Subtarget.isPPC64(); + // FIXME: Materialization here is not optimal. + // For some special bit patterns we can use less instructions. + // See `selectI64ImmDirect` in PPCISelDAGToDAG.cpp. + if (isInt<16>(Imm)) { + BuildMI(MBB, MBBI, DL, get(isPPC64 ? PPC::LI8 : PPC::LI), Reg).addImm(Imm); + } else if (isInt<32>(Imm)) { + BuildMI(MBB, MBBI, DL, get(isPPC64 ? PPC::LIS8 : PPC::LIS), Reg) + .addImm(Imm >> 16); + if (Imm & 0xFFFF) + BuildMI(MBB, MBBI, DL, get(isPPC64 ? 
PPC::ORI8 : PPC::ORI), Reg) + .addReg(Reg, RegState::Kill) + .addImm(Imm & 0xFFFF); + } else { + assert(isPPC64 && "Materializing 64-bit immediate to single register is " + "only supported in PPC64"); + BuildMI(MBB, MBBI, DL, get(PPC::LIS8), Reg).addImm(Imm >> 48); + if ((Imm >> 32) & 0xFFFF) + BuildMI(MBB, MBBI, DL, get(PPC::ORI8), Reg) + .addReg(Reg, RegState::Kill) + .addImm((Imm >> 32) & 0xFFFF); + BuildMI(MBB, MBBI, DL, get(PPC::RLDICR), Reg) + .addReg(Reg, RegState::Kill) + .addImm(32) + .addImm(31); + BuildMI(MBB, MBBI, DL, get(PPC::ORIS8), Reg) + .addReg(Reg, RegState::Kill) + .addImm((Imm >> 16) & 0xFFFF); + if (Imm & 0xFFFF) + BuildMI(MBB, MBBI, DL, get(PPC::ORI8), Reg) + .addReg(Reg, RegState::Kill) + .addImm(Imm & 0xFFFF); + } +} + MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineInstr &MI, unsigned &OpNoForForwarding, @@ -3790,15 +3831,15 @@ bool PPCInstrInfo::combineRLWINM(MachineInstr &MI, if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { // Replace MI with "LI 0" - MI.RemoveOperand(4); - MI.RemoveOperand(3); - MI.RemoveOperand(2); + MI.removeOperand(4); + MI.removeOperand(3); + MI.removeOperand(2); MI.getOperand(1).ChangeToImmediate(0); MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI)); } else { // Replace MI with "ANDI_rec reg, 0" - MI.RemoveOperand(4); - MI.RemoveOperand(3); + MI.removeOperand(4); + MI.removeOperand(3); MI.getOperand(2).setImm(0); MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); @@ -4282,8 +4323,8 @@ static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) { unsigned MinOp = std::min(Op1, Op2); MachineOperand MOp1 = MI.getOperand(MinOp); MachineOperand MOp2 = MI.getOperand(MaxOp); - MI.RemoveOperand(std::max(Op1, Op2)); - MI.RemoveOperand(std::min(Op1, Op2)); + MI.removeOperand(std::max(Op1, Op2)); + MI.removeOperand(std::min(Op1, Op2)); // If the operands we are swapping are the two at the end (the common case) // we can just remove both and add them in the opposite order. @@ -4297,7 +4338,7 @@ static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) { unsigned TotalOps = MI.getNumOperands() + 2; // We've already removed 2 ops. for (unsigned i = MI.getNumOperands() - 1; i >= MinOp; i--) { MOps.push_back(MI.getOperand(i)); - MI.RemoveOperand(i); + MI.removeOperand(i); } // MOp2 needs to be added next. MI.addOperand(MOp2); @@ -4532,8 +4573,8 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) { CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI)); replaceInstrOperandWithImm(CompareUseMI, 1, 0); - CompareUseMI.RemoveOperand(3); - CompareUseMI.RemoveOperand(2); + CompareUseMI.removeOperand(3); + CompareUseMI.removeOperand(2); continue; } LLVM_DEBUG( @@ -4542,8 +4583,8 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, LLVM_DEBUG(dbgs() << "Is converted to:\n"); // Convert to copy and remove unneeded operands. CompareUseMI.setDesc(get(PPC::COPY)); - CompareUseMI.RemoveOperand(3); - CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1); + CompareUseMI.removeOperand(3); + CompareUseMI.removeOperand(RegToCopy == TrueReg ? 
2 : 1);
       CmpIselsConverted++;
       Changed = true;
       LLVM_DEBUG(CompareUseMI.dump());
@@ -4887,7 +4928,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(
   SmallVector<MachineOperand, 2> MOps;
   for (unsigned i = MI.getNumOperands() - 1; i >= III.ZeroIsSpecialOrig; i--) {
     MOps.push_back(MI.getOperand(i));
-    MI.RemoveOperand(i);
+    MI.removeOperand(i);
   }
 
   // Remove the last MO in the list, which is ZERO operand in fact.
@@ -5010,7 +5051,7 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
   // just convert this to a COPY. Can't do this post-RA since we've already
   // cleaned up the copies.
   else if (!SetCR && ShAmt == 0 && !PostRA) {
-    MI.RemoveOperand(2);
+    MI.removeOperand(2);
     MI.setDesc(get(PPC::COPY));
   } else {
     // The 32 bit and 64 bit instructions are quite different.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index c16e146da247..e22b0086bde8 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -295,6 +295,99 @@ public:
     return get(Opcode).TSFlags & PPCII::Prefixed;
   }
 
+  /// Check if Opcode corresponds to a call instruction that should be marked
+  /// with the NOTOC relocation.
+  bool isNoTOCCallInstr(unsigned Opcode) const {
+    if (!get(Opcode).isCall())
+      return false;
+
+    switch (Opcode) {
+    default:
+#ifndef NDEBUG
+      llvm_unreachable("Unknown call opcode");
+#endif
+      return false;
+    case PPC::BL8_NOTOC:
+    case PPC::BL8_NOTOC_TLS:
+    case PPC::BL8_NOTOC_RM:
+      return true;
+#ifndef NDEBUG
+    case PPC::BL8:
+    case PPC::BL:
+    case PPC::BL8_TLS:
+    case PPC::BL_TLS:
+    case PPC::BLA8:
+    case PPC::BLA:
+    case PPC::BCCL:
+    case PPC::BCCLA:
+    case PPC::BCL:
+    case PPC::BCLn:
+    case PPC::BL8_NOP:
+    case PPC::BL_NOP:
+    case PPC::BL8_NOP_TLS:
+    case PPC::BLA8_NOP:
+    case PPC::BCTRL8:
+    case PPC::BCTRL:
+    case PPC::BCCCTRL8:
+    case PPC::BCCCTRL:
+    case PPC::BCCTRL8:
+    case PPC::BCCTRL:
+    case PPC::BCCTRL8n:
+    case PPC::BCCTRLn:
+    case PPC::BL8_RM:
+    case PPC::BLA8_RM:
+    case PPC::BL8_NOP_RM:
+    case PPC::BLA8_NOP_RM:
+    case PPC::BCTRL8_RM:
+    case PPC::BCTRL8_LDinto_toc:
+    case PPC::BCTRL8_LDinto_toc_RM:
+    case PPC::BL8_TLS_:
+    case PPC::TCRETURNdi8:
+    case PPC::TCRETURNai8:
+    case PPC::TCRETURNri8:
+    case PPC::TAILBCTR8:
+    case PPC::TAILB8:
+    case PPC::TAILBA8:
+    case PPC::BCLalways:
+    case PPC::BLRL:
+    case PPC::BCCLRL:
+    case PPC::BCLRL:
+    case PPC::BCLRLn:
+    case PPC::BDZL:
+    case PPC::BDNZL:
+    case PPC::BDZLA:
+    case PPC::BDNZLA:
+    case PPC::BDZLp:
+    case PPC::BDNZLp:
+    case PPC::BDZLAp:
+    case PPC::BDNZLAp:
+    case PPC::BDZLm:
+    case PPC::BDNZLm:
+    case PPC::BDZLAm:
+    case PPC::BDNZLAm:
+    case PPC::BDZLRL:
+    case PPC::BDNZLRL:
+    case PPC::BDZLRLp:
+    case PPC::BDNZLRLp:
+    case PPC::BDZLRLm:
+    case PPC::BDNZLRLm:
+    case PPC::BL_RM:
+    case PPC::BLA_RM:
+    case PPC::BL_NOP_RM:
+    case PPC::BCTRL_RM:
+    case PPC::TCRETURNdi:
+    case PPC::TCRETURNai:
+    case PPC::TCRETURNri:
+    case PPC::BCTRL_LWZinto_toc:
+    case PPC::BCTRL_LWZinto_toc_RM:
+    case PPC::TAILBCTR:
+    case PPC::TAILB:
+    case PPC::TAILBA:
+      return false;
+#endif
+    }
+  }
+
   static bool isSameClassPhysRegCopy(unsigned Opcode) {
     unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf,
@@ -653,6 +746,12 @@ public:
   MachineInstr *getDefMIPostRA(unsigned Reg, MachineInstr &MI,
                                bool &SeenIntermediateUse) const;
 
+  // Materialize immediate after RA.
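The 64-bit path of materializeImmPostRA above composes the value with LIS8, ORI8, RLDICR (shift by 32), ORIS8 and a final ORI8. A rough scalar model of that bit arithmetic, purely as an illustration of why the sequence reproduces Imm (it does not model the MachineInstr building):

#include <cstdint>

uint64_t materialize64(int64_t Imm) {
  // LIS8: halfword (Imm >> 48) placed at bits 16-31, sign-extended.
  int64_t R = int64_t(int16_t(Imm >> 48)) << 16;
  if ((Imm >> 32) & 0xFFFF)
    R |= (Imm >> 32) & 0xFFFF;                 // ORI8
  // RLDICR Reg, 32, 31: keep the low 32 bits, shifted into the high word
  // (this also discards the sign-extension bits left over from LIS8).
  uint64_t U = uint64_t(R) << 32;
  U |= uint64_t((Imm >> 16) & 0xFFFF) << 16;   // ORIS8
  if (Imm & 0xFFFF)
    U |= uint64_t(Imm) & 0xFFFF;               // ORI8
  return U; // equals uint64_t(Imm)
}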
+ void materializeImmPostRA(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + int64_t Imm) const; + /// getRegNumForOperand - some operands use different numbering schemes /// for the same registers. For example, a VSX instruction may have any of /// vs0-vs63 allocated whereas an Altivec instruction could only have diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c26b4f6ceb7d..f651b51d2684 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -198,8 +198,8 @@ def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; -def PPCxsmaxc : SDNode<"PPCISD::XSMAXCDP", SDT_PPCFPMinMax, []>; -def PPCxsminc : SDNode<"PPCISD::XSMINCDP", SDT_PPCFPMinMax, []>; +def PPCxsmaxc : SDNode<"PPCISD::XSMAXC", SDT_PPCFPMinMax, []>; +def PPCxsminc : SDNode<"PPCISD::XSMINC", SDT_PPCFPMinMax, []>; def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, @@ -633,514 +633,6 @@ class NoEncode { } -//===----------------------------------------------------------------------===// -// PowerPC Operand Definitions. - -// In the default PowerPC assembler syntax, registers are specified simply -// by number, so they cannot be distinguished from immediate values (without -// looking at the opcode). This means that the default operand matching logic -// for the asm parser does not work, and we need to specify custom matchers. -// Since those can only be specified with RegisterOperand classes and not -// directly on the RegisterClass, all instructions patterns used by the asm -// parser need to use a RegisterOperand (instead of a RegisterClass) for -// all their register operands. -// For this purpose, we define one RegisterOperand for each RegisterClass, -// using the same name as the class, just in lower case. 
-
-def PPCRegGPRCAsmOperand : AsmOperandClass {
-  let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
-}
-def gprc : RegisterOperand<GPRC> {
-  let ParserMatchClass = PPCRegGPRCAsmOperand;
-}
-def PPCRegG8RCAsmOperand : AsmOperandClass {
-  let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
-}
-def g8rc : RegisterOperand<G8RC> {
-  let ParserMatchClass = PPCRegG8RCAsmOperand;
-}
-def PPCRegG8pRCAsmOperand : AsmOperandClass {
-  let Name = "RegG8pRC"; let PredicateMethod = "isEvenRegNumber";
-}
-def g8prc : RegisterOperand<G8pRC> {
-  let ParserMatchClass = PPCRegG8pRCAsmOperand;
-}
-def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
-  let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
-}
-def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
-  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
-}
-def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
-  let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
-}
-def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
-  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
-}
-def PPCRegF8RCAsmOperand : AsmOperandClass {
-  let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
-}
-def f8rc : RegisterOperand<F8RC> {
-  let ParserMatchClass = PPCRegF8RCAsmOperand;
-}
-def PPCRegF4RCAsmOperand : AsmOperandClass {
-  let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
-}
-def f4rc : RegisterOperand<F4RC> {
-  let ParserMatchClass = PPCRegF4RCAsmOperand;
-}
-def PPCRegVRRCAsmOperand : AsmOperandClass {
-  let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
-}
-def vrrc : RegisterOperand<VRRC> {
-  let ParserMatchClass = PPCRegVRRCAsmOperand;
-}
-def PPCRegVFRCAsmOperand : AsmOperandClass {
-  let Name = "RegVFRC"; let PredicateMethod = "isRegNumber";
-}
-def vfrc : RegisterOperand<VFRC> {
-  let ParserMatchClass = PPCRegVFRCAsmOperand;
-}
-def PPCRegCRBITRCAsmOperand : AsmOperandClass {
-  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
-}
-def crbitrc : RegisterOperand<CRBITRC> {
-  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
-}
-def PPCRegCRRCAsmOperand : AsmOperandClass {
-  let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
-}
-def crrc : RegisterOperand<CRRC> {
-  let ParserMatchClass = PPCRegCRRCAsmOperand;
-}
-def PPCRegSPERCAsmOperand : AsmOperandClass {
-  let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
-}
-def sperc : RegisterOperand<SPERC> {
-  let ParserMatchClass = PPCRegSPERCAsmOperand;
-}
-def PPCRegSPE4RCAsmOperand : AsmOperandClass {
-  let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
-}
-def spe4rc : RegisterOperand<GPRC> {
-  let ParserMatchClass = PPCRegSPE4RCAsmOperand;
-}
-
-def PPCU1ImmAsmOperand : AsmOperandClass {
-  let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u1imm : Operand<i32> {
-  let PrintMethod = "printU1ImmOperand";
-  let ParserMatchClass = PPCU1ImmAsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def PPCU2ImmAsmOperand : AsmOperandClass {
-  let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u2imm : Operand<i32> {
-  let PrintMethod = "printU2ImmOperand";
-  let ParserMatchClass = PPCU2ImmAsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def PPCATBitsAsHintAsmOperand : AsmOperandClass {
-  let Name = "ATBitsAsHint"; let PredicateMethod = "isATBitsAsHint";
-  let RenderMethod = "addImmOperands"; // Irrelevant, predicate always fails.
-}
-def atimm : Operand<i32> {
-  let PrintMethod = "printATBitsAsHint";
-  let ParserMatchClass = PPCATBitsAsHintAsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def PPCU3ImmAsmOperand : AsmOperandClass {
-  let Name = "U3Imm"; let PredicateMethod = "isU3Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u3imm : Operand<i32> {
-  let PrintMethod = "printU3ImmOperand";
-  let ParserMatchClass = PPCU3ImmAsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def PPCU4ImmAsmOperand : AsmOperandClass {
-  let Name = "U4Imm"; let PredicateMethod = "isU4Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u4imm : Operand<i32> {
-  let PrintMethod = "printU4ImmOperand";
-  let ParserMatchClass = PPCU4ImmAsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCS5ImmAsmOperand : AsmOperandClass {
-  let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
-  let RenderMethod = "addImmOperands";
-}
-def s5imm : Operand<i32> {
-  let PrintMethod = "printS5ImmOperand";
-  let ParserMatchClass = PPCS5ImmAsmOperand;
-  let DecoderMethod = "decodeSImmOperand<5>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU5ImmAsmOperand : AsmOperandClass {
-  let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u5imm : Operand<i32> {
-  let PrintMethod = "printU5ImmOperand";
-  let ParserMatchClass = PPCU5ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<5>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU6ImmAsmOperand : AsmOperandClass {
-  let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u6imm : Operand<i32> {
-  let PrintMethod = "printU6ImmOperand";
-  let ParserMatchClass = PPCU6ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<6>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU7ImmAsmOperand : AsmOperandClass {
-  let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u7imm : Operand<i32> {
-  let PrintMethod = "printU7ImmOperand";
-  let ParserMatchClass = PPCU7ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<7>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU8ImmAsmOperand : AsmOperandClass {
-  let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u8imm : Operand<i32> {
-  let PrintMethod = "printU8ImmOperand";
-  let ParserMatchClass = PPCU8ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<8>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU10ImmAsmOperand : AsmOperandClass {
-  let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u10imm : Operand<i32> {
-  let PrintMethod = "printU10ImmOperand";
-  let ParserMatchClass = PPCU10ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<10>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU12ImmAsmOperand : AsmOperandClass {
-  let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
-  let RenderMethod = "addImmOperands";
-}
-def u12imm : Operand<i32> {
-  let PrintMethod = "printU12ImmOperand";
-  let ParserMatchClass = PPCU12ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<12>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCS16ImmAsmOperand : AsmOperandClass {
-  let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
-  let RenderMethod = "addS16ImmOperands";
-}
-def s16imm : Operand<i32> {
-  let PrintMethod = "printS16ImmOperand";
-  let EncoderMethod = "getImm16Encoding";
-  let ParserMatchClass = PPCS16ImmAsmOperand;
-  let DecoderMethod = "decodeSImmOperand<16>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCU16ImmAsmOperand : AsmOperandClass {
-  let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
-  let RenderMethod = "addU16ImmOperands";
-}
-def u16imm : Operand<i32> {
-  let PrintMethod = "printU16ImmOperand";
-  let EncoderMethod = "getImm16Encoding";
-  let ParserMatchClass = PPCU16ImmAsmOperand;
-  let DecoderMethod = "decodeUImmOperand<16>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCS17ImmAsmOperand : AsmOperandClass {
-  let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
-  let RenderMethod = "addS16ImmOperands";
-}
-def s17imm : Operand<i32> {
-  // This operand type is used for addis/lis to allow the assembler parser
-  // to accept immediates in the range -65536..65535 for compatibility with
-  // the GNU assembler. The operand is treated as 16-bit otherwise.
-  let PrintMethod = "printS16ImmOperand";
-  let EncoderMethod = "getImm16Encoding";
-  let ParserMatchClass = PPCS17ImmAsmOperand;
-  let DecoderMethod = "decodeSImmOperand<16>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCS34ImmAsmOperand : AsmOperandClass {
-  let Name = "S34Imm";
-  let PredicateMethod = "isS34Imm";
-  let RenderMethod = "addImmOperands";
-}
-def s34imm : Operand<i64> {
-  let PrintMethod = "printS34ImmOperand";
-  let EncoderMethod = "getImm34EncodingNoPCRel";
-  let ParserMatchClass = PPCS34ImmAsmOperand;
-  let DecoderMethod = "decodeSImmOperand<34>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def s34imm_pcrel : Operand<i64> {
-  let PrintMethod = "printS34ImmOperand";
-  let EncoderMethod = "getImm34EncodingPCRel";
-  let ParserMatchClass = PPCS34ImmAsmOperand;
-  let DecoderMethod = "decodeSImmOperand<34>";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-def PPCImmZeroAsmOperand : AsmOperandClass {
-  let Name = "ImmZero";
-  let PredicateMethod = "isImmZero";
-  let RenderMethod = "addImmOperands";
-}
-def immZero : Operand<i32> {
-  let PrintMethod = "printImmZeroOperand";
-  let ParserMatchClass = PPCImmZeroAsmOperand;
-  let DecoderMethod = "decodeImmZeroOperand";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
-
-def PPCDirectBrAsmOperand : AsmOperandClass {
-  let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
-  let RenderMethod = "addBranchTargetOperands";
-}
-def directbrtarget : Operand<OtherVT> {
-  let PrintMethod = "printBranchOperand";
-  let EncoderMethod = "getDirectBrEncoding";
-  let DecoderMethod = "decodeDirectBrTarget";
-  let ParserMatchClass = PPCDirectBrAsmOperand;
-  let OperandType = "OPERAND_PCREL";
-}
-def absdirectbrtarget : Operand<OtherVT> {
-  let PrintMethod = "printAbsBranchOperand";
-  let EncoderMethod = "getAbsDirectBrEncoding";
-  let ParserMatchClass = PPCDirectBrAsmOperand;
-}
-def PPCCondBrAsmOperand : AsmOperandClass {
-  let Name = "CondBr"; let PredicateMethod = "isCondBr";
-  let RenderMethod = "addBranchTargetOperands";
-}
-def condbrtarget : Operand<OtherVT> {
-  let PrintMethod = "printBranchOperand";
-  let EncoderMethod = "getCondBrEncoding";
-  let DecoderMethod = "decodeCondBrTarget";
-  let ParserMatchClass = PPCCondBrAsmOperand;
-  let OperandType = "OPERAND_PCREL";
-}
-def abscondbrtarget : Operand<OtherVT> {
-  let PrintMethod = "printAbsBranchOperand";
-  let EncoderMethod = "getAbsCondBrEncoding";
-  let ParserMatchClass = PPCCondBrAsmOperand;
-}
-def calltarget : Operand<iPTR> {
-  let PrintMethod = "printBranchOperand";
-  let EncoderMethod = "getDirectBrEncoding";
-  let DecoderMethod = "decodeDirectBrTarget";
-  let ParserMatchClass = PPCDirectBrAsmOperand;
-  let OperandType = "OPERAND_PCREL";
-}
-def abscalltarget : Operand<iPTR> {
-  let PrintMethod = "printAbsBranchOperand";
-  let EncoderMethod = "getAbsDirectBrEncoding";
-  let ParserMatchClass = PPCDirectBrAsmOperand;
-}
-def PPCCRBitMaskOperand : AsmOperandClass {
-  let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
-}
-def crbitm: Operand<i8> {
-  let PrintMethod = "printcrbitm";
-  let EncoderMethod = "get_crbitm_encoding";
-  let DecoderMethod = "decodeCRBitMOperand";
-  let ParserMatchClass = PPCCRBitMaskOperand;
-}
-// Address operands
-// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
-def PPCRegGxRCNoR0Operand : AsmOperandClass {
-  let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
-}
-def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
-  let ParserMatchClass = PPCRegGxRCNoR0Operand;
-}
-
-// New addressing modes with 34 bit immediates.
-def PPCDispRI34Operand : AsmOperandClass {
-  let Name = "DispRI34"; let PredicateMethod = "isS34Imm";
-  let RenderMethod = "addImmOperands";
-}
-def dispRI34 : Operand<iPTR> {
-  let ParserMatchClass = PPCDispRI34Operand;
-}
-def memri34 : Operand<iPTR> { // memri, imm is a 34-bit value.
-  let PrintMethod = "printMemRegImm34";
-  let MIOperandInfo = (ops dispRI34:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getMemRI34Encoding";
-  let DecoderMethod = "decodeMemRI34Operands";
-}
-// memri, imm is a 34-bit value for pc-relative instructions where
-// base register is set to zero.
-def memri34_pcrel : Operand<iPTR> { // memri, imm is a 34-bit value.
-  let PrintMethod = "printMemRegImm34PCRel";
-  let MIOperandInfo = (ops dispRI34:$imm, immZero:$reg);
-  let EncoderMethod = "getMemRI34PCRelEncoding";
-  let DecoderMethod = "decodeMemRI34PCRelOperands";
-}
-
-// A version of ptr_rc usable with the asm parser.
-def PPCRegGxRCOperand : AsmOperandClass {
-  let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
-}
-def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
-  let ParserMatchClass = PPCRegGxRCOperand;
-}
-
-def PPCDispRIOperand : AsmOperandClass {
-  let Name = "DispRI"; let PredicateMethod = "isS16Imm";
-  let RenderMethod = "addS16ImmOperands";
-}
-def dispRI : Operand<iPTR> {
-  let ParserMatchClass = PPCDispRIOperand;
-}
-def PPCDispRIXOperand : AsmOperandClass {
-  let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
-  let RenderMethod = "addImmOperands";
-}
-def dispRIX : Operand<iPTR> {
-  let ParserMatchClass = PPCDispRIXOperand;
-}
-def PPCDispRIHashOperand : AsmOperandClass {
-  let Name = "DispRIHash"; let PredicateMethod = "isHashImmX8";
-  let RenderMethod = "addImmOperands";
-}
-def dispRIHash : Operand<iPTR> {
-  let ParserMatchClass = PPCDispRIHashOperand;
-}
-def PPCDispRIX16Operand : AsmOperandClass {
-  let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
-  let RenderMethod = "addImmOperands";
-}
-def dispRIX16 : Operand<iPTR> {
-  let ParserMatchClass = PPCDispRIX16Operand;
-}
-def PPCDispSPE8Operand : AsmOperandClass {
-  let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
-  let RenderMethod = "addImmOperands";
-}
-def dispSPE8 : Operand<iPTR> {
-  let ParserMatchClass = PPCDispSPE8Operand;
-}
-def PPCDispSPE4Operand : AsmOperandClass {
-  let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4";
-  let RenderMethod = "addImmOperands";
-}
-def dispSPE4 : Operand<iPTR> {
-  let ParserMatchClass = PPCDispSPE4Operand;
-}
-def PPCDispSPE2Operand : AsmOperandClass {
-  let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2";
-  let RenderMethod = "addImmOperands";
-}
-def dispSPE2 : Operand<iPTR> {
-  let ParserMatchClass = PPCDispSPE2Operand;
-}
-
-def memri : Operand<iPTR> {
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getMemRIEncoding";
-  let DecoderMethod = "decodeMemRIOperands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def memrr : Operand<iPTR> {
-  let PrintMethod = "printMemRegReg";
-  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
-  let OperandType = "OPERAND_MEMORY";
-}
-def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getMemRIXEncoding";
-  let DecoderMethod = "decodeMemRIXOperands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def memrihash : Operand<iPTR> {
-  // memrihash 8-aligned for ROP Protection Instructions.
-  let PrintMethod = "printMemRegImmHash";
-  let MIOperandInfo = (ops dispRIHash:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getMemRIHashEncoding";
-  let DecoderMethod = "decodeMemRIHashOperands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getMemRIX16Encoding";
-  let DecoderMethod = "decodeMemRIX16Operands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getSPE8DisEncoding";
-  let DecoderMethod = "decodeSPE8Operands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getSPE4DisEncoding";
-  let DecoderMethod = "decodeSPE4Operands";
-  let OperandType = "OPERAND_MEMORY";
-}
-def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
-  let PrintMethod = "printMemRegImm";
-  let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
-  let EncoderMethod = "getSPE2DisEncoding";
-  let DecoderMethod = "decodeSPE2Operands";
-  let OperandType = "OPERAND_MEMORY";
-}
-
-// A single-register address. This is used with the SjLj
-// pseudo-instructions which translates to LD/LWZ. These instructions requires
-// G8RC_NOX0 registers.
-def memr : Operand<iPTR> {
-  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
-  let OperandType = "OPERAND_MEMORY";
-}
-def PPCTLSRegOperand : AsmOperandClass {
-  let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
-  let RenderMethod = "addTLSRegOperands";
-}
-def tlsreg32 : Operand<i32> {
-  let EncoderMethod = "getTLSRegEncoding";
-  let ParserMatchClass = PPCTLSRegOperand;
-}
-def tlsgd32 : Operand<i32> {}
-def tlscall32 : Operand<i32> {
-  let PrintMethod = "printTLSCall";
-  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
-  let EncoderMethod = "getTLSCallEncoding";
-}
-
-// PowerPC Predicate operand.
-def pred : Operand<OtherVT> {
-  let PrintMethod = "printPredicateOperand";
-  let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
-}
-
 // Define PowerPC specific addressing mode.
 
 // d-form
@@ -1212,6 +704,7 @@ def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">,
       AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>;
 def IsAIX : Predicate<"Subtarget->isAIXABI()">;
 def NotAIX : Predicate<"!Subtarget->isAIXABI()">;
+def IsISAFuture : Predicate<"Subtarget->isISAFuture()">;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Multiclass Definitions.
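For reference, the DispRI34/S34Imm operands removed from this file above describe the signed 34-bit displacements of prefixed D-forms; a hypothetical standalone version of the range check an isS34Imm-style predicate has to make:

#include <cstdint>

// Signed 34-bit range: [-2^33, 2^33).
bool fitsSigned34(int64_t Disp) {
  return Disp >= -(INT64_C(1) << 33) && Disp < (INT64_C(1) << 33);
}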
@@ -3056,6 +2549,13 @@ def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                       PPC970_DGroup_First, PPC970_Unit_FXU;
 }
+let hasSideEffects = 1, Defs = [CTR] in
+def MTCTRPseudo : PPCEmitTimePseudo<(outs), (ins gprc:$rS), "#MTCTRPseudo", []>;
+
+let hasSideEffects = 1, Uses = [CTR], Defs = [CTR] in
+def DecreaseCTRPseudo : PPCEmitTimePseudo<(outs crbitrc:$rT), (ins i32imm:$stride),
+                                          "#DecreaseCTRPseudo", []>;
+
 let hasSideEffects = 0 in {
 let Defs = [LR] in {
 def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
@@ -3069,6 +2569,22 @@ def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
 }
 }
 
+let hasSideEffects = 1 in {
+  def MTUDSCR : XFXForm_7_ext<31, 467, 3, (outs), (ins gprc:$rX),
+                              "mtspr 3, $rX", IIC_SprMTSPR>,
+                PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFUDSCR : XFXForm_1_ext<31, 339, 3, (outs gprc:$rX), (ins),
+                              "mfspr $rX, 3", IIC_SprMFSPR>,
+                PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Disable these aliases on AIX since they are not supported.
+let Predicates = [ModernAs] in {
+// Aliases for moving to/from the DSCR via mtspr/mfspr.
+def : InstAlias<"mtudscr $Rx", (MTUDSCR gprc:$Rx)>;
+def : InstAlias<"mfudscr $Rx", (MFUDSCR gprc:$Rx)>;
+}
+
 let isCodeGenOnly = 1 in {
   // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
   // like a GPR on the PPC970. As such, copies in and out have the same
@@ -3728,12 +3244,12 @@ def : Pat<(fcopysign f32:$frB, f64:$frA),
 // XL Compat intrinsics.
 def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (FMSUB $A, $B, $C)>;
 def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (FMSUBS $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (FNMSUB $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (FNMSUBS $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (FNMADD $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (FNMADDS $A, $B, $C)>;
 def : Pat<(int_ppc_fre f64:$A), (FRE $A)>;
 def : Pat<(int_ppc_fres f32:$A), (FRES $A)>;
+def : Pat<(int_ppc_fnabs f64:$A), (FNABSD $A)>;
+def : Pat<(int_ppc_fnabss f32:$A), (FNABSS $A)>;
 
 include "PPCInstrAltivec.td"
 include "PPCInstrSPE.td"
@@ -3748,7 +3264,8 @@ def : Pat<(not i1:$in),
 
 // Prefixed instructions may require access to the above defs at a later
 // time so we include this after the def.
-include "PPCInstrPrefix.td"
+include "PPCInstrP10.td"
+include "PPCInstrMMA.td"
 
 // Patterns for arithmetic i1 operations.
 def : Pat<(add i1:$a, i1:$b),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
new file mode 100644
index 000000000000..a7e85cda781f
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -0,0 +1,628 @@
+
+// Mask immediates for MMA instructions (2, 4 and 8 bits).
+def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
+def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
+def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
+
+def MMA : Predicate<"Subtarget->hasMMA()">;
+
+
+// Multiclass definitions for MMA accumulator instructions.
+// ----------------------------------------------------------------------------
+
+// Defines 2 unmasked instructions where the xo field for acc/non-acc version
+// is even/odd.
+multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                       string asmstr> {
+  let Predicates = [MMA] in {
+    def NAME :
+      XX3Form_AT3_XAB6<opcode, !or(xo, 0x01), (outs acc:$AT), IOL,
+                       !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PP :
+      XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL),
+                       !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 4 instructions, masked/unmasked with masks 8, 4, 4 bits.
+// The XO field for acc/non-acc version is even/odd.
+multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_XY4P8_XAB6<
+          opcode, !or(xo, 0x01), (outs acc:$AT),
+          !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)),
+          !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_XY4P8_XAB6<
+          opcode, xo, (outs acc:$AT),
+          !con((ins acc:$ATi),
+               !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
+          !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 4 instructions, masked/unmasked with masks 4, 4, 4 bits.
+// The XO field for acc/non-acc version is even/odd.
+multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_XYP4_XAB6<
+          opcode, !or(xo, 0x01), (outs acc:$AT),
+          !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
+          !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_XYP4_XAB6<
+          opcode, xo, (outs acc:$AT),
+          !con((ins acc:$ATi),
+               !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
+          !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits.
+// The XO field for acc/non-acc version is even/odd.
+multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, !or(xo, 0x01), (outs acc:$AT),
+          !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+          !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, xo, (outs acc:$AT),
+          !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+          !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits.
+// Upper nibble of XO field for acc/non-acc version is 0x4/0x6.
+multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  let Predicates = [MMA] in {
+    def NAME :
+      XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), IOL,
+                       !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PP :
+      XX3Form_AT3_XAB6<
+          opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+          !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, xo, (outs acc:$AT),
+          !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+          !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, !or(xo, 0x20), (outs acc:$AT),
+          !con((ins acc:$ATi),
+               !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+          !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 10 instructions, operand negating, unmasked, masked with 2, 4, 4
+// bits. Upper nibble is masked with 0x8, 0x4, 0xC for negating operands.
+multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+                                  string asmbase, string asmstr> {
+  defm NAME : ACC_UM_M244_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA] in {
+    def PN : XX3Form_AT3_XAB6<
+        opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+        !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+        RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NP : XX3Form_AT3_XAB6<
+        opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+        !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+        RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NN : XX3Form_AT3_XAB6<
+        opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+        !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+        RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME#PN :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, !or(xo, 0x80), (outs acc:$AT),
+          !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+          !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NP :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, !or(xo, 0x40), (outs acc:$AT),
+          !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+          !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NN :
+      MMIRR_XX3Form_XY4P2_XAB6<
+          opcode, !or(xo, 0xC0), (outs acc:$AT),
+          !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+          !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+          IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 5 instructions, unmasked, operand negating.
+// Upper nibble is masked with 0x8, 0x4, 0xC for negating operands.
+multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+                             string asmbase, string asmstr> {
+  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA] in {
+    def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT),
+                              !con((ins acc:$ATi), IOL),
+                              !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT),
+                              !con((ins acc:$ATi), IOL),
+                              !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT),
+                              !con((ins acc:$ATi), IOL),
+                              !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 10 instructions, operand negating, unmasked, masked with 4, 4 bits.
+// Upper nibble is masked with 0x8, 0x4, 0xC for negating operands.
+multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+                                 string asmbase, string asmstr> {
+  defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_XY4_XAB6<
+        opcode, !or(xo, 0x01), (outs acc:$AT),
+        !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)),
+        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_XY4_XAB6<
+        opcode, xo, (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#PN :
+      MMIRR_XX3Form_XY4_XAB6<
+        opcode, !or(xo, 0x80), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+        !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NP :
+      MMIRR_XX3Form_XY4_XAB6<
+        opcode, !or(xo, 0x40), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+        !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NN :
+      MMIRR_XX3Form_XY4_XAB6<
+        opcode, !or(xo, 0xC0), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+        !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// Defines 10 instructions, operand negating, unmasked, masked with 4, 2 bits.
+// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands.
+multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+                                 string asmbase, string asmstr> {
+  defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs] in {
+    def PM#NAME :
+      MMIRR_XX3Form_X4Y2_XAB6<
+        opcode, !or(xo, 0x01), (outs acc:$AT),
+        !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)),
+        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP :
+      MMIRR_XX3Form_X4Y2_XAB6<
+        opcode, xo, (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#PN :
+      MMIRR_XX3Form_X4Y2_XAB6<
+        opcode, !or(xo, 0x80), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+        !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NP :
+      MMIRR_XX3Form_X4Y2_XAB6<
+        opcode, !or(xo, 0x40), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+        !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#NN :
+      MMIRR_XX3Form_X4Y2_XAB6<
+        opcode, !or(xo, 0xC0), (outs acc:$AT),
+        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+        !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
+        IIC_VecFP, []>,
+      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+// End of class definitions.
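+
+// Usage illustration (exposition only, not an additional definition): a
+// single defm such as
+//   defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB),
+//                                    "xvi4ger8", "$AT, $XA, $XB">;
+// (see the instantiations below) expands to four records: XVI4GER8
+// (non-accumulating, odd XO = 35), XVI4GER8PP (accumulating, even XO = 34),
+// and the prefixed masked forms PMXVI4GER8 / PMXVI4GER8PP, which take the
+// additional u4imm:$XMSK, u4imm:$YMSK and u8imm:$PMSK operands.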
+//----------------------------------------------------------------------------- + +let Predicates = [MMA] in { + def XXMFACC : + XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS", + IIC_VecGeneral, + [(set v512i1:$ASo, (int_ppc_mma_xxmfacc v512i1:$AS))]>, + RegConstraint<"$ASo = $AS">, NoEncode<"$ASo">; + def XXMTACC : + XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT", + IIC_VecGeneral, + [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp), + "#KILL_PAIR", []>, + RegConstraint<"$XTp = $XSp">; + def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS), + "#BUILD_UACC $AT, $AS", []>; + // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in + // the backend. We avoid CSE here because it generates a copy of the acc + // register and this copy is more expensive than calling the intrinsic again. + let isAsCheapAsAMove = 1, isReMaterializable = 1 in { + def XXSETACCZ : + XForm_AT3<31, 3, 177, (outs acc:$AT), (ins), "xxsetaccz $AT", IIC_VecGeneral, + [(set v512i1:$AT, (int_ppc_mma_xxsetaccz))]>; + } + def XVI8GER4SPP : + XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB), + "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + let mayStore = 1 in { + def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst), + "#SPILL_ACC", []>; + def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst), + "#SPILL_UACC", []>; + } + let mayLoad = 1, hasSideEffects = 0 in { + def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src), + "#RESTORE_ACC", []>; + def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src), + "#RESTORE_UACC", []>; + } +} + +let Predicates = [MMA, PrefixInstrs] in { + def PMXVI8GER4SPP : + MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT), + (ins acc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK, + u4imm:$YMSK, u4imm:$PMSK), + "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", + IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; +} + +// MMA accumulating/non-accumulating instructions. 
+//------------------------------------------------------------------------------
+
+// XVBF16GER2, XVBF16GER2PP, XVBF16GER2PN, XVBF16GER2NP, XVBF16GER2NN
+// PMXVBF16GER2, PMXVBF16GER2PP, PMXVBF16GER2PN, PMXVBF16GER2NP, PMXVBF16GER2NN
+defm XVBF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 50, (ins vsrc:$XA, vsrc:$XB),
+                                         "xvbf16ger2", "$AT, $XA, $XB">;
+
+// XVI4GER8, XVI4GER8PP, PMXVI4GER8, PMXVI4GER8PP
+defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB),
+                                 "xvi4ger8", "$AT, $XA, $XB">;
+
+// XVI8GER4, XVI8GER4PP, PMXVI8GER4, PMXVI8GER4PP
+defm XVI8GER4 : ACC_UM_M444_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB),
+                                 "xvi8ger4", "$AT, $XA, $XB">;
+
+// XVI16GER2, XVI16GER2PP, PMXVI16GER2, PMXVI16GER2PP
+defm XVI16GER2 : ACC_UM_M244_XO46<59, 75, (ins vsrc:$XA, vsrc:$XB),
+                                  "xvi16ger2", "$AT, $XA, $XB">;
+
+// XVI16GER2S, XVI16GER2SPP, PMXVI16GER2S, PMXVI16GER2SPP
+defm XVI16GER2S : ACC_UM_M244_XOEO<59, 42, (ins vsrc:$XA, vsrc:$XB),
+                                   "xvi16ger2s", "$AT, $XA, $XB">;
+
+// XVF16GER2, XVF16GER2PP, XVF16GER2PN, XVF16GER2NP, XVF16GER2NN
+// PMXVF16GER2, PMXVF16GER2PP, PMXVF16GER2PN, PMXVF16GER2NP, PMXVF16GER2NN
+defm XVF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 18, (ins vsrc:$XA, vsrc:$XB),
+                                        "xvf16ger2", "$AT, $XA, $XB">;
+
+// XVF32GER, XVF32GERPP, XVF32GERPN, XVF32GERNP, XVF32GERNN
+// PMXVF32GER, PMXVF32GERPP, PMXVF32GERPN, PMXVF32GERNP, PMXVF32GERNN
+defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, (ins vsrc:$XA, vsrc:$XB),
+                                      "xvf32ger", "$AT, $XA, $XB">;
+
+// XVF64GER, XVF64GERPP, XVF64GERPN, XVF64GERNP, XVF64GERNN
+// PMXVF64GER, PMXVF64GERPP, PMXVF64GERPN, PMXVF64GERNP, PMXVF64GERNN
+defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB),
+                                      "xvf64ger", "$AT, $XA, $XB">;
+//------------------------------------------------------------------------------
+
+// MMA Intrinsics
+let Predicates = [MMA] in {
+  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)),
+            (XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)),
+            (XVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)),
+            (XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)),
+            (XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+  def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)),
+            (XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+            (XVF32GERPN
$ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)), + (XVF64GER $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERPP $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERPN $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), + (XVF64GERNN $ATi, $XA, RCCp.BToVSRC)>; + + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)), + (XVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)), + (XVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; + def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), + (XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; +} + +// MMA Intrinsics +let Predicates = [MMA, PrefixInstrs] in { + def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)), + (PMXVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk8Imm:$PMSK)), + (PMXVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)), + (PMXVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk4Imm:$PMSK)), + (PMXVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 
(int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)), + (PMXVF32GER RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK)), + (PMXVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)), + (PMXVF64GER $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk2Imm:$YMSK)), + (PMXVF64GERNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk2Imm:$YMSK)>; + + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np 
v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMXVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, + Msk4Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)), + (PMXVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; +} + +def ConcatsMMA { + dag VecsToVecPair0 = + (v256i1 (INSERT_SUBREG + (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1), + $vs1, sub_vsx0)); + dag VecsToVecPair1 = + (v256i1 (INSERT_SUBREG + (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1), + $vs3, sub_vsx0)); + dag VecsToVecQuad = + (BUILD_UACC (INSERT_SUBREG + (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)), + (KILL_PAIR VecsToVecPair0), sub_pair0), + (KILL_PAIR VecsToVecPair1), sub_pair1)); +} + +def Extracts { + dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0)); + dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1)); + dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0)); + dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1)); + dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0)); + dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1)); +} + +let Predicates = [MMA] in { + def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)), + (XXMTACC ConcatsMMA.VecsToVecQuad)>; + def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0, + v16i8:$vs3, v16i8:$vs2)), + (XXMTACC ConcatsMMA.VecsToVecQuad)>; + def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>; + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 0)), + Extracts.Vec0>; + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 1)), + Extracts.Vec1>; + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 2)), + Extracts.Vec2>; + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 3)), + Extracts.Vec3>; +} + + diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td new file mode 100644 index 000000000000..6cf3f1d3341e --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -0,0 +1,2315 @@ +//===-- PPCInstrP10.td - Power10 Instruction Set -----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions introduced for the Power10 CPU. 
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Naming convention for future instruction formats
+//
+// <INST_FORM>{_<OP_TYPE><OP_LEN>}+
+//
+// Where:
+//   <INST_FORM> - name of instruction format as per the ISA
+//                 (X-Form, VX-Form, etc.)
+//   <OP_TYPE> - operand type
+//     * FRT/RT/VT/XT/BT - target register
+//       (FPR, GPR, VR, VSR, CR-bit respectively)
+//       In some situations, the 'T' is replaced by
+//       'D' when describing the target register.
+//     * [FR|R|V|X|B][A-Z] - register source (i.e. FRA, RA, XB, etc.)
+//     * IMM - immediate (where signedness matters,
+//       this is SI/UI for signed/unsigned)
+//     * [R|X|FR]Tp - register pair target (i.e. FRTp, RTp)
+//     * R - PC-Relative bit
+//       (denotes that the address is computed pc-relative)
+//     * VRM - Masked Registers
+//     * AT - target accumulator
+//     * N - the Nth bit in a VSR
+//     * Additional 1-bit operands may be required for certain
+//       instruction formats such as: MC, P, MP
+//     * X / Y / P - mask values. In the instruction encoding, this is
+//       represented as XMSK, YMSK and PMSK.
+//     * MEM - indicates if the instruction format requires any memory
+//       accesses. This does not have <OP_LEN> attached to it.
+//   <OP_LEN> - the length of each operand in bits.
+//     For operands that are 1 bit, the '1' is omitted from the name.
+//
+// Example: 8RR_XX4Form_IMM8_XTAB6
+//   8RR_XX4Form is the instruction format.
+//   The operand is an 8-bit immediate (IMM), the destination (XT)
+//   and sources (XA, XB) that are all 6-bits. The destination and
+//   source registers are combined if they are of the same length.
+//   Moreover, the order of operands reflects the order of operands
+//   in the encoding.
+
+//-------------------------- Predicate definitions ---------------------------//
+def IsPPC32 : Predicate<"!Subtarget->isPPC64()">;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC ISA 3.1 specific type constraints.
+//
+
+def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
+  SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
+]>;
+def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
+  SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
+]>;
+def SDT_PPCPairBuild : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>
+]>;
+def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ISA 3.1 specific PPCISD nodes.
+//
+
+def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
+def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>;
+def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
+                              []>;
+def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
+                               []>;
+def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
+
+//===----------------------------------------------------------------------===//
+
+// PC Relative flag (for instructions that use the address of the prefix for
+// address computations).
+class isPCRel { bit PCRel = 1; }
+
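+// Note (illustrative, not an additional definition): records that mix in
+// isPCRel get PCRel = 1, which the prefixed-instruction classes below wire
+// straight into the prefix encoding (their "let Inst{11} = PCRel;" bit).
+// The *_p multiclasses further down use this to emit paired definitions
+// along the lines of:
+//   def PADDI8   : ...;          // R = 0, absolute addressing
+//   def PADDI8pc : ..., isPCRel; // R = 1, pc-relative addressing
+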
+// PowerPC specific type constraints.
+def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+
+// PPC Specific DAG Nodes.
+def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX,
+                       [SDNPHasChain, SDNPMayLoad]>;
+
+// Top-level class for prefixed instructions.
+class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
+         InstrItinClass itin> : Instruction {
+  field bits<64> Inst;
+  field bits<64> SoftFail = 0;
+  bit PCRel = 0; // Default value, set by isPCRel.
+  let Size = 8;
+
+  let Namespace = "PPC";
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+  let Inst{0-5} = pref;
+  let Inst{32-37} = opcode;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
+  /// these must be reflected there! See comments there for what these are.
+  let TSFlags{0} = PPC970_First;
+  let TSFlags{1} = PPC970_Single;
+  let TSFlags{2} = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+
+  bits<1> Prefixed = 1; // This is a prefixed instruction.
+  let TSFlags{7} = Prefixed;
+
+  // For cases where multiple instruction definitions really represent the
+  // same underlying instruction but with one definition for 64-bit arguments
+  // and one for 32-bit arguments, this bit breaks the degeneracy between
+  // the two forms and allows TableGen to generate mapping tables.
+  bit Interpretation64Bit = 0;
+
+  // Fields used for relation models.
+  string BaseName = "";
+}
+
+// VX-Form: [ PO VT R VB RC XO ]
+class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
+                     InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VT;
+  bits<5> VB;
+  bit RC = 0;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = VT;
+  let Inst{11-15} = R;
+  let Inst{16-20} = VB;
+  let Inst{21} = RC;
+  let Inst{22-31} = xo;
+}
+
+// Multiclass definition to account for record and non-record form
+// instructions of VXRForm.
+multiclass VXForm_VTB5_RCr<bits<10> xo, bits<5> R, dag OOL, dag IOL,
+                           string asmbase, string asmstr,
+                           InstrItinClass itin, list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : VXForm_VTB5_RC<xo, R, OOL, IOL,
+                              !strconcat(asmbase, !strconcat(" ", asmstr)),
+                              itin, pattern>, RecFormRel;
+    let Defs = [CR6] in
+    def _rec : VXForm_VTB5_RC<xo, R, OOL, IOL,
+                              !strconcat(asmbase, !strconcat(". ", asmstr)),
+                              itin, []>, isRecordForm, RecFormRel;
+  }
+}
+
+class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                                InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRS;
+  bits<39> D_RA;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 2;
+  let Inst{8-10} = 0;
+  let Inst{11} = PCRel;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = D_RA{33-16}; // d0
+
+  // The instruction.
+  let Inst{38-42} = FRS{4-0};
+  let Inst{43-47} = D_RA{38-34}; // RA
+  let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                            InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<34> SI;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 2;
+  let Inst{8-10} = 0;
+  let Inst{11} = PCRel;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = SI{33-16};
+
+  // The instruction.
+  let Inst{38-42} = RT;
+  let Inst{43-47} = RA;
+  let Inst{48-63} = SI{15-0};
+}
+
+class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                         InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<34> SI;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 2;
+  let Inst{8-10} = 0;
+  let Inst{11} = 0;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = SI{33-16};
+
+  // The instruction.
+  let Inst{38-42} = RT;
+  let Inst{43-47} = 0;
+  let Inst{48-63} = SI{15-0};
+}
+
+multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
+                                   dag PCRel_IOL, string asmstr,
+                                   InstrItinClass itin> {
+  def NAME : MLS_DForm_R_SI34_RTA5<opcode, OOL, IOL,
+                                   !strconcat(asmstr, ", 0"), itin, []>;
+  def pc : MLS_DForm_R_SI34_RTA5<opcode, OOL, PCRel_IOL,
+                                 !strconcat(asmstr, ", 1"), itin, []>, isPCRel;
+}
+
+class 8LS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                                InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<39> D_RA;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-10} = 0;
+  let Inst{11} = PCRel;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = D_RA{33-16}; // d0
+
+  // The instruction.
+  let Inst{38-42} = RT{4-0};
+  let Inst{43-47} = D_RA{38-34}; // RA
+  let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+// 8LS:D-Form: [ 1 0 0 // R // d0
+//               PO TX T RA d1 ]
+class 8LS_DForm_R_SI34_XT6_RA5_MEM<bits<5> opcode, dag OOL, dag IOL,
+                                   string asmstr, InstrItinClass itin,
+                                   list<dag> pattern>
+  : PI<1, { opcode, ? }, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<39> D_RA;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 0;
+  let Inst{8} = 0;
+  let Inst{9-10} = 0; // reserved
+  let Inst{11} = PCRel;
+  let Inst{12-13} = 0; // reserved
+  let Inst{14-31} = D_RA{33-16}; // d0
+
+  // The instruction.
+  let Inst{37} = XT{5};
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-47} = D_RA{38-34}; // RA
+  let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+// X-Form: [PO T IMM VRB XO TX]
+class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                         string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<5> VRB;
+  bits<5> IMM;
+
+  let Pattern = pattern;
+  let Inst{6-10} = XT{4-0};
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = VRB;
+  let Inst{21-30} = xo;
+  let Inst{31} = XT{5};
+}
+
+class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo,
+                             dag OOL, dag IOL, string asmstr,
+                             InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<6> XC;
+  bits<8> IMM;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 1;
+  let Inst{8} = 0;
+  let Inst{9-11} = 0;
+  let Inst{12-13} = 0;
+  let Inst{14-23} = 0;
+  let Inst{24-31} = IMM;
+
+  // The instruction.
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-57} = XC{4-0};
+  let Inst{58-59} = xo;
+  let Inst{60} = XC{5};
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = XT{5};
+}
+
+class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                        InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RD;
+  bits<5> VB;
+  bits<3> N;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = RD;
+  let Inst{11-12} = 0;
+  let Inst{13-15} = N;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+
+// VX-Form: [PO VRT RA VRB XO].
+// Destructive (insert) forms are suffixed with _ins.
+class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, vrrc:$vB),
+             !strconcat(opc, " $vD, $rA, $vB"), IIC_VecGeneral, pattern>,
+    RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
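+// Usage illustration (exposition only): VINSBVLX further down instantiates
+// this helper as
+//   def VINSBVLX : VXForm_VTB5_RA5_ins<15, "vinsbvlx", [...]>;
+// yielding a destructive insert whose $vDi input is tied to the $vD output
+// (RegConstraint) and omitted from the encoding (NoEncode).
+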
+// VX-Form: [PO VRT RA RB XO].
+// Destructive (insert) forms are suffixed with _ins.
+class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, gprc:$rB),
+             !strconcat(opc, " $vD, $rA, $rB"), IIC_VecGeneral, pattern>,
+    RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
+// VX-Form: [ PO BF // VRA VRB XO ]
+class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                      InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> VA;
+  bits<5> VB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8} = BF;
+  let Inst{9-10} = 0;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// VN-Form: [PO VRT VRA VRB PS SD XO]
+// SD is "Shift Direction"
+class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
+                       InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VRT;
+  bits<5> VRA;
+  bits<5> VRB;
+  bits<3> SD;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = VRT;
+  let Inst{11-15} = VRA;
+  let Inst{16-20} = VRB;
+  let Inst{21-22} = ps;
+  let Inst{23-25} = SD;
+  let Inst{26-31} = xo;
+}
+
+class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
+                        string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RD;
+  bits<5> VB;
+  bit MP;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = RD;
+  let Inst{11-14} = eo;
+  let Inst{15} = MP;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// 8RR:D-Form: [ 1 1 0 // // imm0
+//               PO T XO TX imm1 ].
+class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+                          string asmstr, InstrItinClass itin,
+                          list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<32> IMM32;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 1;
+  let Inst{8-11} = 0;
+  let Inst{12-13} = 0; // reserved
+  let Inst{14-15} = 0; // reserved
+  let Inst{16-31} = IMM32{31-16};
+
+  // The instruction.
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-46} = xo;
+  let Inst{47} = XT{5};
+  let Inst{48-63} = IMM32{15-0};
+}
+
+// 8RR:D-Form: [ 1 1 0 // // imm0
+//               PO T XO IX TX imm1 ].
+class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+                             string asmstr, InstrItinClass itin,
+                             list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bit IX;
+  bits<32> IMM32;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 1;
+  let Inst{8-11} = 0;
+  let Inst{12-13} = 0; // reserved
+  let Inst{14-15} = 0; // reserved
+  let Inst{16-31} = IMM32{31-16};
+
+  // The instruction.
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-45} = xo;
+  let Inst{46} = IX;
+  let Inst{47} = XT{5};
+  let Inst{48-63} = IMM32{15-0};
+}
+
+class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
+                         string asmstr, InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<6> XC;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 1;
+  let Inst{8-11} = 0;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = 0;
+
+  // The instruction.
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-57} = XC{4-0};
+  let Inst{58-59} = xo;
+  let Inst{60} = XC{5};
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = XT{5};
+}
+
+class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
+                              string asmstr, InstrItinClass itin,
+                              list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<6> XC;
+  bits<3> IMM;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 1;
+  let Inst{8-11} = 0;
+  let Inst{12-13} = 0;
+  let Inst{14-28} = 0;
+  let Inst{29-31} = IMM;
+
+  // The instruction.
+  let Inst{38-42} = XT{4-0};
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-57} = XC{4-0};
+  let Inst{58-59} = xo;
+  let Inst{60} = XC{5};
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = XT{5};
+}
+
+// [PO BF / XO2 B XO BX /]
+class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
+                          dag IOL, string asmstr, InstrItinClass itin,
+                          list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8} = BF;
+  let Inst{9-10} = 0;
+  let Inst{11-15} = xo2;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30} = XB{5};
+  let Inst{31} = 0;
+}
+
+// X-Form: [ PO RT BI /// XO / ]
+class XForm_XT5_BI5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let B = 0;
+}
+
+multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
+                                       dag PCRel_IOL, string asmstr,
+                                       InstrItinClass itin> {
+  def NAME : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, IOL,
+                                       !strconcat(asmstr, ", 0"), itin, []>;
+  def pc : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, PCRel_IOL,
+                                     !strconcat(asmstr, ", 1"), itin, []>,
+           isPCRel;
+}
+
+multiclass 8LS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
+                                       dag PCRel_IOL, string asmstr,
+                                       InstrItinClass itin> {
+  def NAME : 8LS_DForm_R_SI34_RTA5_MEM<opcode, OOL, IOL,
+                                       !strconcat(asmstr, ", 0"), itin, []>;
+  def pc : 8LS_DForm_R_SI34_RTA5_MEM<opcode, OOL, PCRel_IOL,
+                                     !strconcat(asmstr, ", 1"), itin, []>,
+           isPCRel;
+}
+
+multiclass 8LS_DForm_R_SI34_XT6_RA5_MEM_p<bits<5> opcode, dag OOL, dag IOL,
+                                          dag PCRel_IOL, string asmstr,
+                                          InstrItinClass itin> {
+  def NAME : 8LS_DForm_R_SI34_XT6_RA5_MEM<opcode, OOL, IOL,
+                                          !strconcat(asmstr, ", 0"), itin, []>;
+  def pc : 8LS_DForm_R_SI34_XT6_RA5_MEM<opcode, OOL, PCRel_IOL,
+                                        !strconcat(asmstr, ", 1"), itin, []>,
+           isPCRel;
+}
+
+def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">;
+def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">;
+def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">;
+def RCCp {
+  dag AToVSRC = (COPY_TO_REGCLASS $XA, VSRC);
+  dag BToVSRC = (COPY_TO_REGCLASS $XB, VSRC);
+}
+
+let Predicates = [PrefixInstrs] in {
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+    defm PADDI8 :
+      MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI),
+                              (ins immZero:$RA, s34imm_pcrel:$SI),
+                              "paddi $RT, $RA, $SI", IIC_LdStLFD>;
+    let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+      def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT),
+                                    (ins s34imm:$SI),
+                                    "pli $RT, $SI", IIC_IntSimple, []>;
+    }
+  }
+  defm PADDI :
+    MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI),
+                            (ins immZero:$RA, s34imm_pcrel:$SI),
+                            "paddi $RT, $RA, $SI", IIC_LdStLFD>;
+  let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+    def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT),
+                                 (ins s34imm:$SI),
+                                 "pli $RT, $SI", IIC_IntSimple, []>;
+  }
+
+  let mayLoad = 1, mayStore = 0 in {
+    defm PLXV :
+      8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XT), (ins memri34:$D_RA),
+                                     (ins memri34_pcrel:$D_RA),
+                                     "plxv $XT, $D_RA", IIC_LdStLFD>;
+    defm PLFS :
+      MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$FRT), (ins memri34:$D_RA),
+                                  (ins memri34_pcrel:$D_RA), "plfs $FRT, $D_RA",
+                                  IIC_LdStLFD>;
+    defm PLFD :
+      MLS_DForm_R_SI34_RTA5_MEM_p<50, (outs f8rc:$FRT), (ins memri34:$D_RA),
+                                  (ins memri34_pcrel:$D_RA), "plfd $FRT, $D_RA",
+                                  IIC_LdStLFD>;
+    defm PLXSSP :
+      8LS_DForm_R_SI34_RTA5_MEM_p<43, (outs vfrc:$VRT), (ins memri34:$D_RA),
+                                  (ins memri34_pcrel:$D_RA),
+                                  "plxssp $VRT, $D_RA", IIC_LdStLFD>;
+    defm PLXSD :
+      8LS_DForm_R_SI34_RTA5_MEM_p<42, (outs vfrc:$VRT), (ins memri34:$D_RA),
+                                  (ins memri34_pcrel:$D_RA),
+                                  "plxsd $VRT, $D_RA", IIC_LdStLFD>;
+    let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+      defm PLBZ8 :
+        MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RT), (ins memri34:$D_RA),
+                                    (ins
memri34_pcrel:$D_RA), "plbz $RT, $D_RA", + IIC_LdStLFD>; + defm PLHZ8 : + MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs g8rc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA", + IIC_LdStLFD>; + defm PLHA8 : + MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs g8rc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA", + IIC_LdStLFD>; + defm PLWA8 : + 8LS_DForm_R_SI34_RTA5_MEM_p<41, (outs g8rc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), + "plwa $RT, $D_RA", IIC_LdStLFD>; + defm PLWZ8 : + MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs g8rc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA", + IIC_LdStLFD>; + } + defm PLBZ : + MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs gprc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plbz $RT, $D_RA", + IIC_LdStLFD>; + defm PLHZ : + MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs gprc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA", + IIC_LdStLFD>; + defm PLHA : + MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs gprc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA", + IIC_LdStLFD>; + defm PLWZ : + MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs gprc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA", + IIC_LdStLFD>; + defm PLWA : + 8LS_DForm_R_SI34_RTA5_MEM_p<41, (outs gprc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plwa $RT, $D_RA", + IIC_LdStLFD>; + defm PLD : + 8LS_DForm_R_SI34_RTA5_MEM_p<57, (outs g8rc:$RT), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "pld $RT, $D_RA", + IIC_LdStLFD>; + } + + let mayStore = 1, mayLoad = 0 in { + defm PSTXV : + 8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XS, memri34:$D_RA), + (ins vsrc:$XS, memri34_pcrel:$D_RA), + "pstxv $XS, $D_RA", IIC_LdStLFD>; + defm PSTFS : + MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$FRS, memri34:$D_RA), + (ins f4rc:$FRS, memri34_pcrel:$D_RA), + "pstfs $FRS, $D_RA", IIC_LdStLFD>; + defm PSTFD : + MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$FRS, memri34:$D_RA), + (ins f8rc:$FRS, memri34_pcrel:$D_RA), + "pstfd $FRS, $D_RA", IIC_LdStLFD>; + defm PSTXSSP : + 8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$VRS, memri34:$D_RA), + (ins vfrc:$VRS, memri34_pcrel:$D_RA), + "pstxssp $VRS, $D_RA", IIC_LdStLFD>; + defm PSTXSD : + 8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$VRS, memri34:$D_RA), + (ins vfrc:$VRS, memri34_pcrel:$D_RA), + "pstxsd $VRS, $D_RA", IIC_LdStLFD>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + defm PSTB8 : + MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RS, memri34:$D_RA), + (ins g8rc:$RS, memri34_pcrel:$D_RA), + "pstb $RS, $D_RA", IIC_LdStLFD>; + defm PSTH8 : + MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins g8rc:$RS, memri34:$D_RA), + (ins g8rc:$RS, memri34_pcrel:$D_RA), + "psth $RS, $D_RA", IIC_LdStLFD>; + defm PSTW8 : + MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins g8rc:$RS, memri34:$D_RA), + (ins g8rc:$RS, memri34_pcrel:$D_RA), + "pstw $RS, $D_RA", IIC_LdStLFD>; + } + defm PSTB : + MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins gprc:$RS, memri34:$D_RA), + (ins gprc:$RS, memri34_pcrel:$D_RA), + "pstb $RS, $D_RA", IIC_LdStLFD>; + defm PSTH : + MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins gprc:$RS, memri34:$D_RA), + (ins gprc:$RS, memri34_pcrel:$D_RA), + "psth $RS, $D_RA", IIC_LdStLFD>; + defm PSTW : + MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins gprc:$RS, memri34:$D_RA), + (ins gprc:$RS, memri34_pcrel:$D_RA), + "pstw $RS, $D_RA", IIC_LdStLFD>; + defm PSTD : + 8LS_DForm_R_SI34_RTA5_MEM_p<61, (outs), (ins g8rc:$RS, 
memri34:$D_RA),
+                                (ins g8rc:$RS, memri34_pcrel:$D_RA),
+                                "pstd $RS, $D_RA", IIC_LdStLFD>;
+  }
+}
+
+class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+                           string asmstr, InstrItinClass itin,
+                           list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> XTp;
+  bits<17> DQ_RA;
+  let Pattern = pattern;
+
+  let Inst{6-9} = XTp{3-0};
+  let Inst{10} = XTp{4};
+  let Inst{11-15} = DQ_RA{16-12}; // Register #
+  let Inst{16-27} = DQ_RA{11-0}; // Displacement.
+  let Inst{28-31} = xo;
+}
+
+class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                      string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp {
+  bits<5> XTp;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+  let Inst{6-9} = XTp{3-0};
+  let Inst{10} = XTp{4};
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31} = 0;
+}
+
+class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                                InstrItinClass itin, list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<5> XTp;
+  bits<39> D_RA;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-10} = 0;
+  let Inst{11} = PCRel;
+  let Inst{12-13} = 0;
+  let Inst{14-31} = D_RA{33-16}; // Imm18
+
+  // The instruction.
+  let Inst{38-41} = XTp{3-0};
+  let Inst{42} = XTp{4};
+  let Inst{43-47} = D_RA{38-34}; // Register #
+  let Inst{48-63} = D_RA{15-0}; // D
+}
+
+multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL,
+                                       dag IOL, dag PCRel_IOL,
+                                       string asmstr, InstrItinClass itin> {
+  def NAME : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, IOL,
+                                       !strconcat(asmstr, ", 0"), itin, []>;
+  def pc : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, PCRel_IOL,
+                                     !strconcat(asmstr, ", 1"), itin, []>,
+           isPCRel;
+}
+
+
+
+// [PO AS XO2 XO]
+class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
+                string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+
+  let Pattern = pattern;
+
+  let Inst{6-8} = AT;
+  let Inst{9-10} = 0;
+  let Inst{11-15} = xo2;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31} = 0;
+}
+
+// X-Form: [ PO T EO UIM XO TX ]
+class XForm_XT6_IMM5<bits<6> opcode, bits<5> eo, bits<10> xo, dag OOL, dag IOL,
+                     string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<5> UIM;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = XT{4-0};
+  let Inst{11-15} = eo;
+  let Inst{16-20} = UIM;
+  let Inst{21-30} = xo;
+  let Inst{31} = XT{5};
+}
+
+class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                       string asmstr, InstrItinClass itin,
+                       list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8} = AT;
+  let Inst{9-10} = 0;
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-28} = xo;
+  let Inst{29} = XA{5};
+  let Inst{30} = XB{5};
+  let Inst{31} = 0;
+}
+
+class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                               string asmstr, InstrItinClass itin,
+                               list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<4> XMSK;
+  bits<4> YMSK;
+  bits<2> PMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-15} = 0;
+  let Inst{16-17} = PMSK;
+  let Inst{18-23} = 0;
+  let Inst{24-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                             string asmstr, InstrItinClass itin,
+                             list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<4> XMSK;
+  bits<4> YMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-23} = 0;
+  let Inst{24-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                              string asmstr, InstrItinClass itin,
+                              list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<4> XMSK;
+  bits<2> YMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-23} = 0;
+  let Inst{24-27} = XMSK;
+  let Inst{28-29} = YMSK;
+  let Inst{30-31} = 0;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                               string asmstr, InstrItinClass itin,
+                               list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<4> XMSK;
+  bits<4> YMSK;
+  bits<8> PMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-15} = 0;
+  let Inst{16-23} = PMSK;
+  let Inst{24-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-47} = XA{4-0};
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XA{5};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                              string asmstr, InstrItinClass itin,
+                              list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<4> XMSK;
+  bits<4> YMSK;
+  bits<4> PMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-15} = 0;
+  let Inst{16-19} = PMSK;
+  let Inst{20-23} = 0;
+  let Inst{24-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+ let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + + + +def Concats { + dag VecsToVecPair0 = + (v256i1 (INSERT_SUBREG + (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1), + $vs1, sub_vsx0)); + dag VecsToVecPair1 = + (v256i1 (INSERT_SUBREG + (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1), + $vs3, sub_vsx0)); +} + +let Predicates = [PairedVectorMemops] in { + def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)), + Concats.VecsToVecPair0>; + def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)), + Concats.VecsToVecPair0>; + def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 0)), + (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>; + def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 1)), + (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>; +} + +let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in { + def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp), + (ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA", + IIC_LdStLFD, []>; + def LXVPX : XForm_XTp5_XAB5<31, 333, (outs vsrprc:$XTp), (ins memrr:$src), + "lxvpx $XTp, $src", IIC_LdStLFD, + []>; +} + +let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in { + def STXVP : DQForm_XTp5_RA17_MEM<6, 1, (outs), (ins vsrprc:$XTp, + memrix16:$DQ_RA), "stxvp $XTp, $DQ_RA", + IIC_LdStLFD, []>; + def STXVPX : XForm_XTp5_XAB5<31, 461, (outs), (ins vsrprc:$XTp, memrr:$dst), + "stxvpx $XTp, $dst", IIC_LdStLFD, + []>; +} + +let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in { + defm PLXVP : + 8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins memri34:$D_RA), + (ins memri34_pcrel:$D_RA), "plxvp $XTp, $D_RA", + IIC_LdStLFD>; +} + +let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in { + defm PSTXVP : + 8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, memri34:$D_RA), + (ins vsrprc:$XTp, memri34_pcrel:$D_RA), + "pstxvp $XTp, $D_RA", IIC_LdStLFD>; +} + +let Predicates = [PairedVectorMemops] in { + // Intrinsics for Paired Vector Loads. + def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>; + def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>; + } + // Intrinsics for Paired Vector Stores. 
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, DQForm:$dst), + (STXVP $XSp, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst), + (STXVPX $XSp, XForm:$dst)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst), + (PSTXVP $XSp, memri34:$dst)>; + } +} + +let Predicates = [PCRelativeMemops] in { + // Load i32 + def : Pat<(i32 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZpc $ga, 0)>; + def : Pat<(i32 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZpc $ga, 0)>; + def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZpc $ga, 0)>; + def : Pat<(i32 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZpc $ga, 0)>; + def : Pat<(i32 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHApc $ga, 0)>; + def : Pat<(i32 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHZpc $ga, 0)>; + def : Pat<(i32 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHZpc $ga, 0)>; + def : Pat<(i32 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLWZpc $ga, 0)>; + + // Store i32 + def : Pat<(truncstorei8 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTBpc $RS, $ga, 0)>; + def : Pat<(truncstorei16 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTHpc $RS, $ga, 0)>; + def : Pat<(store i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTWpc $RS, $ga, 0)>; + + // Load i64 + def : Pat<(i64 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZ8pc $ga, 0)>; + def : Pat<(i64 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZ8pc $ga, 0)>; + def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZ8pc $ga, 0)>; + def : Pat<(i64 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))), + (PLBZ8pc $ga, 0)>; + def : Pat<(i64 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHA8pc $ga, 0)>; + def : Pat<(i64 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHZ8pc $ga, 0)>; + def : Pat<(i64 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))), + (PLHZ8pc $ga, 0)>; + def : Pat<(i64 (zextloadi32 (PPCmatpcreladdr PCRelForm:$ga))), + (PLWZ8pc $ga, 0)>; + def : Pat<(i64 (sextloadi32 (PPCmatpcreladdr PCRelForm:$ga))), + (PLWA8pc $ga, 0)>; + def : Pat<(i64 (extloadi32 (PPCmatpcreladdr PCRelForm:$ga))), + (PLWZ8pc $ga, 0)>; + def : Pat<(i64 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLDpc $ga, 0)>; + + // Store i64 + def : Pat<(truncstorei8 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTB8pc $RS, $ga, 0)>; + def : Pat<(truncstorei16 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTH8pc $RS, $ga, 0)>; + def : Pat<(truncstorei32 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTW8pc $RS, $ga, 0)>; + def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTDpc $RS, $ga, 0)>; + + // Load f32 + def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>; + + // Store f32 + def : Pat<(store f32:$FRS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTFSpc $FRS, $ga, 0)>; + + // Load f64 + def : Pat<(f64 (extloadf32 (PPCmatpcreladdr PCRelForm:$addr))), + (COPY_TO_REGCLASS (PLFSpc $addr, 0), VSFRC)>; + def : Pat<(f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFDpc $addr, 0)>; + + // Store f64 + def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTFDpc $FRS, $ga, 0)>; + + // Load f128 + def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))), + (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>; + + // Store f128 + def : Pat<(store f128:$XS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTXVpc (COPY_TO_REGCLASS $XS, VSRC), $ga, 0)>; + + // Load v4i32 + def : Pat<(v4i32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 
0)>; + + // Store v4i32 + def : Pat<(store v4i32:$XS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTXVpc $XS, $ga, 0)>; + + // Load v2i64 + def : Pat<(v2i64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; + + // Store v2i64 + def : Pat<(store v2i64:$XS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTXVpc $XS, $ga, 0)>; + + // Load v4f32 + def : Pat<(v4f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; + + // Store v4f32 + def : Pat<(store v4f32:$XS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTXVpc $XS, $ga, 0)>; + + // Load v2f64 + def : Pat<(v2f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; + + // Store v2f64 + def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)), + (PSTXVpc $XS, $ga, 0)>; + + // Atomic Load + def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)), + (PLBZpc $ga, 0)>; + def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)), + (PLHZpc $ga, 0)>; + def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)), + (PLWZpc $ga, 0)>; + def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)), + (PLDpc $ga, 0)>; + + // Atomic Store + def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), + (PSTBpc $RS, $ga, 0)>; + def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), + (PSTHpc $RS, $ga, 0)>; + def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), + (PSTWpc $RS, $ga, 0)>; + def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), + (PSTB8pc $RS, $ga, 0)>; + def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), + (PSTH8pc $RS, $ga, 0)>; + def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), + (PSTW8pc $RS, $ga, 0)>; + def : Pat<(atomic_store_64 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), + (PSTDpc $RS, $ga, 0)>; + + // Special Cases For PPCstore_scal_int_from_vsr + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), + (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc (XSCVDPSXDS f64:$src), $dst, 0)>; + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), + (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), $dst, 0)>; + + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), + (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc (XSCVDPUXDS f64:$src), $dst, 0)>; + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), + (PPCmatpcreladdr PCRelForm:$dst), 8), + (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), $dst, 0)>; + + def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))), + (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>; + + // If the PPCmatpcreladdr node is not caught by any other pattern it should be + // caught here and turned into a paddi instruction to materialize the address. + def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>; + // PPCtlsdynamatpcreladdr node is used for TLS dynamic models to materialize + // tls global address with paddi instruction. + def : Pat<(PPCtlsdynamatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>; + // PPCtlslocalexecmataddr node is used for TLS local exec models to + // materialize tls global address with paddi instruction. 
+ def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)), + (PADDI8 $in, $addr)>; +} + +let Predicates = [PrefixInstrs] in { + def XXPERMX : + 8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC, u3imm:$UIM), + "xxpermx $XT, $XA, $XB, $XC, $UIM", + IIC_VecPerm, []>; + def XXBLENDVB : + 8RR_XX4Form_XTABC6<33, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC), "xxblendvb $XT, $XA, $XB, $XC", + IIC_VecGeneral, []>; + def XXBLENDVH : + 8RR_XX4Form_XTABC6<33, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC), "xxblendvh $XT, $XA, $XB, $XC", + IIC_VecGeneral, []>; + def XXBLENDVW : + 8RR_XX4Form_XTABC6<33, 2, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC), "xxblendvw $XT, $XA, $XB, $XC", + IIC_VecGeneral, []>; + def XXBLENDVD : + 8RR_XX4Form_XTABC6<33, 3, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC), "xxblendvd $XT, $XA, $XB, $XC", + IIC_VecGeneral, []>; +} + +// XXSPLTIW/DP/32DX need extra flags to make sure the compiler does not attempt +// to spill part of the instruction when the values are similar. +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [PrefixInstrs] in { + def XXSPLTIW : 8RR_DForm_IMM32_XT6<32, 3, (outs vsrc:$XT), + (ins i32imm:$IMM32), + "xxspltiw $XT, $IMM32", IIC_VecGeneral, + []>; + def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT), + (ins i32imm:$IMM32), + "xxspltidp $XT, $IMM32", IIC_VecGeneral, + [(set v2f64:$XT, + (PPCxxspltidp i32:$IMM32))]>; + def XXSPLTI32DX : + 8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT), + (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32), + "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, + [(set v2i64:$XT, + (PPCxxsplti32dx v2i64:$XTi, i32:$IX, + i32:$IMM32))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; +} + +let Predicates = [IsISA3_1] in { + def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR : XForm_XT5_BI5<31, 416, (outs gprc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC : XForm_XT5_BI5<31, 448, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR : XForm_XT5_BI5<31, 480, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def SETBC8 : XForm_XT5_BI5<31, 384, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR8 : XForm_XT5_BI5<31, 416, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC8 : XForm_XT5_BI5<31, 448, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR8 : XForm_XT5_BI5<31, 480, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + } + + def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT), + (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), + "vsldbi $VRT, $VRA, $VRB, $SH", + IIC_VecGeneral, + [(set v16i8:$VRT, + (int_ppc_altivec_vsldbi v16i8:$VRA, + v16i8:$VRB, + timm:$SH))]>; + def VSRDBI : VNForm_VTAB5_SD3<22, 1, (outs vrrc:$VRT), + (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), + "vsrdbi $VRT, $VRA, $VRB, $SH", + IIC_VecGeneral, + [(set v16i8:$VRT, + (int_ppc_altivec_vsrdbi v16i8:$VRA, + v16i8:$VRB, + timm:$SH))]>; + defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB), + "vstribr", "$vT, $vB", IIC_VecGeneral, + [(set v16i8:$vT, + (int_ppc_altivec_vstribr v16i8:$vB))]>; + defm VSTRIBL : VXForm_VTB5_RCr<13, 0, (outs vrrc:$vT), (ins vrrc:$vB), 
+ "vstribl", "$vT, $vB", IIC_VecGeneral, + [(set v16i8:$vT, + (int_ppc_altivec_vstribl v16i8:$vB))]>; + defm VSTRIHR : VXForm_VTB5_RCr<13, 3, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihr", "$vT, $vB", IIC_VecGeneral, + [(set v8i16:$vT, + (int_ppc_altivec_vstrihr v8i16:$vB))]>; + defm VSTRIHL : VXForm_VTB5_RCr<13, 2, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihl", "$vT, $vB", IIC_VecGeneral, + [(set v8i16:$vT, + (int_ppc_altivec_vstrihl v8i16:$vB))]>; + def VINSW : + VXForm_1<207, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, gprc:$rB), + "vinsw $vD, $rB, $UIM", IIC_VecGeneral, + [(set v4i32:$vD, + (int_ppc_altivec_vinsw v4i32:$vDi, i32:$rB, timm:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VINSD : + VXForm_1<463, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB), + "vinsd $vD, $rB, $UIM", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB, timm:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VINSBVLX : + VXForm_VTB5_RA5_ins<15, "vinsbvlx", + [(set v16i8:$vD, + (int_ppc_altivec_vinsbvlx v16i8:$vDi, i32:$rA, + v16i8:$vB))]>; + def VINSBVRX : + VXForm_VTB5_RA5_ins<271, "vinsbvrx", + [(set v16i8:$vD, + (int_ppc_altivec_vinsbvrx v16i8:$vDi, i32:$rA, + v16i8:$vB))]>; + def VINSHVLX : + VXForm_VTB5_RA5_ins<79, "vinshvlx", + [(set v8i16:$vD, + (int_ppc_altivec_vinshvlx v8i16:$vDi, i32:$rA, + v8i16:$vB))]>; + def VINSHVRX : + VXForm_VTB5_RA5_ins<335, "vinshvrx", + [(set v8i16:$vD, + (int_ppc_altivec_vinshvrx v8i16:$vDi, i32:$rA, + v8i16:$vB))]>; + def VINSWVLX : + VXForm_VTB5_RA5_ins<143, "vinswvlx", + [(set v4i32:$vD, + (int_ppc_altivec_vinswvlx v4i32:$vDi, i32:$rA, + v4i32:$vB))]>; + def VINSWVRX : + VXForm_VTB5_RA5_ins<399, "vinswvrx", + [(set v4i32:$vD, + (int_ppc_altivec_vinswvrx v4i32:$vDi, i32:$rA, + v4i32:$vB))]>; + def VINSBLX : + VXForm_VRT5_RAB5_ins<527, "vinsblx", + [(set v16i8:$vD, + (int_ppc_altivec_vinsblx v16i8:$vDi, i32:$rA, + i32:$rB))]>; + def VINSBRX : + VXForm_VRT5_RAB5_ins<783, "vinsbrx", + [(set v16i8:$vD, + (int_ppc_altivec_vinsbrx v16i8:$vDi, i32:$rA, + i32:$rB))]>; + def VINSHLX : + VXForm_VRT5_RAB5_ins<591, "vinshlx", + [(set v8i16:$vD, + (int_ppc_altivec_vinshlx v8i16:$vDi, i32:$rA, + i32:$rB))]>; + def VINSHRX : + VXForm_VRT5_RAB5_ins<847, "vinshrx", + [(set v8i16:$vD, + (int_ppc_altivec_vinshrx v8i16:$vDi, i32:$rA, + i32:$rB))]>; + def VINSWLX : + VXForm_VRT5_RAB5_ins<655, "vinswlx", + [(set v4i32:$vD, + (int_ppc_altivec_vinswlx v4i32:$vDi, i32:$rA, + i32:$rB))]>; + def VINSWRX : + VXForm_VRT5_RAB5_ins<911, "vinswrx", + [(set v4i32:$vD, + (int_ppc_altivec_vinswrx v4i32:$vDi, i32:$rA, + i32:$rB))]>; + def VINSDLX : + VXForm_1<719, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), + "vinsdlx $vD, $rA, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA, i64:$rB))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VINSDRX : + VXForm_1<975, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), + "vinsdrx $vD, $rA, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$rD), (ins vrrc:$vB), + "vextractbm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractbm v16i8:$vB))]>; + def VEXTRACTHM : VXForm_RD5_XO5_RS5<1602, 9, (outs gprc:$rD), (ins vrrc:$vB), + "vextracthm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextracthm v8i16:$vB))]>; + def VEXTRACTWM : 
VXForm_RD5_XO5_RS5<1602, 10, (outs gprc:$rD), (ins vrrc:$vB), + "vextractwm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractwm v4i32:$vB))]>; + def VEXTRACTDM : VXForm_RD5_XO5_RS5<1602, 11, (outs gprc:$rD), (ins vrrc:$vB), + "vextractdm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractdm v2i64:$vB))]>; + def VEXTRACTQM : VXForm_RD5_XO5_RS5<1602, 12, (outs gprc:$rD), (ins vrrc:$vB), + "vextractqm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractqm v1i128:$vB))]>; + def VEXPANDBM : VXForm_RD5_XO5_RS5<1602, 0, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandbm $vD, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (int_ppc_altivec_vexpandbm + v16i8:$vB))]>; + def VEXPANDHM : VXForm_RD5_XO5_RS5<1602, 1, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandhm $vD, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (int_ppc_altivec_vexpandhm + v8i16:$vB))]>; + def VEXPANDWM : VXForm_RD5_XO5_RS5<1602, 2, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandwm $vD, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vexpandwm + v4i32:$vB))]>; + def VEXPANDDM : VXForm_RD5_XO5_RS5<1602, 3, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpanddm $vD, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vexpanddm + v2i64:$vB))]>; + def VEXPANDQM : VXForm_RD5_XO5_RS5<1602, 4, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandqm $vD, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vexpandqm + v1i128:$vB))]>; + def MTVSRBM : VXForm_RD5_XO5_RS5<1602, 16, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrbm $vD, $rB", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_mtvsrbm i64:$rB))]>; + def MTVSRHM : VXForm_RD5_XO5_RS5<1602, 17, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrhm $vD, $rB", IIC_VecGeneral, + [(set v8i16:$vD, + (int_ppc_altivec_mtvsrhm i64:$rB))]>; + def MTVSRWM : VXForm_RD5_XO5_RS5<1602, 18, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrwm $vD, $rB", IIC_VecGeneral, + [(set v4i32:$vD, + (int_ppc_altivec_mtvsrwm i64:$rB))]>; + def MTVSRDM : VXForm_RD5_XO5_RS5<1602, 19, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrdm $vD, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_mtvsrdm i64:$rB))]>; + def MTVSRQM : VXForm_RD5_XO5_RS5<1602, 20, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrqm $vD, $rB", IIC_VecGeneral, + [(set v1i128:$vD, + (int_ppc_altivec_mtvsrqm i64:$rB))]>; + def MTVSRBMI : DXForm<4, 10, (outs vrrc:$vD), (ins u16imm64:$D), + "mtvsrbmi $vD, $D", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_mtvsrbm imm:$D))]>; + def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbb $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbb + v16i8:$vB, timm:$MP))]>; + def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbh $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbh + v8i16:$vB, timm:$MP))]>; + def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbw $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbw + v4i32:$vB, timm:$MP))]>; + def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbd $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbd + v2i64:$vB, timm:$MP))]>; + def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextdubvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextdubvlx v16i8:$vA, + v16i8:$vB, + i32:$rC))]>; + def VEXTDUBVRX : VAForm_1a<25, 
(outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextdubvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextdubvrx v16i8:$vA, + v16i8:$vB, + i32:$rC))]>; + def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduhvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduhvlx v8i16:$vA, + v8i16:$vB, + i32:$rC))]>; + def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduhvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduhvrx v8i16:$vA, + v8i16:$vB, + i32:$rC))]>; + def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduwvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduwvlx v4i32:$vA, + v4i32:$vB, + i32:$rC))]>; + def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduwvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduwvrx v4i32:$vA, + v4i32:$vB, + i32:$rC))]>; + def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextddvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextddvlx v2i64:$vA, + v2i64:$vB, + i32:$rC))]>; + def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextddvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextddvrx v2i64:$vA, + v2i64:$vB, + i32:$rC))]>; + def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpdepd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vpdepd v2i64:$vA, v2i64:$vB))]>; + def VPEXTD : VXForm_1<1421, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpextd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vpextd v2i64:$vA, v2i64:$vB))]>; + def PDEPD : XForm_6<31, 156, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "pdepd $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, (int_ppc_pdepd i64:$rS, i64:$rB))]>; + def PEXTD : XForm_6<31, 188, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "pextd $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, (int_ppc_pextd i64:$rS, i64:$rB))]>; + def VCFUGED : VXForm_1<1357, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vcfuged $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vcfuged v2i64:$vA, v2i64:$vB))]>; + def VGNB : VXForm_RD5_N3_VB5<1228, (outs g8rc:$rD), (ins vrrc:$vB, u3imm:$N), + "vgnb $rD, $vB, $N", IIC_VecGeneral, + [(set i64:$rD, + (int_ppc_altivec_vgnb v1i128:$vB, timm:$N))]>; + def CFUGED : XForm_6<31, 220, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "cfuged $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, (int_ppc_cfuged i64:$rS, i64:$rB))]>; + def XXEVAL : + 8RR_XX4Form_IMM8_XTAB6<34, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, + vsrc:$XC, u8imm:$IMM), + "xxeval $XT, $XA, $XB, $XC, $IMM", IIC_VecGeneral, + [(set v2i64:$XT, (int_ppc_vsx_xxeval v2i64:$XA, + v2i64:$XB, v2i64:$XC, timm:$IMM))]>; + def VCLZDM : VXForm_1<1924, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vclzdm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vclzdm v2i64:$vA, v2i64:$vB))]>; + def VCTZDM : VXForm_1<1988, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vctzdm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vctzdm v2i64:$vA, v2i64:$vB))]>; + def CNTLZDM : XForm_6<31, 59, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "cntlzdm $rA, $rS, $rB", 
IIC_IntGeneral, + [(set i64:$rA, + (int_ppc_cntlzdm i64:$rS, i64:$rB))]>; + def CNTTZDM : XForm_6<31, 571, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "cnttzdm $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, + (int_ppc_cnttzdm i64:$rS, i64:$rB))]>; + def XXGENPCVBM : + XForm_XT6_IMM5_VB5<60, 916, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), + "xxgenpcvbm $XT, $VRB, $IMM", IIC_VecGeneral, []>; + def XXGENPCVHM : + XForm_XT6_IMM5_VB5<60, 917, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), + "xxgenpcvhm $XT, $VRB, $IMM", IIC_VecGeneral, []>; + def XXGENPCVWM : + XForm_XT6_IMM5_VB5<60, 948, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), + "xxgenpcvwm $XT, $VRB, $IMM", IIC_VecGeneral, []>; + def XXGENPCVDM : + XForm_XT6_IMM5_VB5<60, 949, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), + "xxgenpcvdm $XT, $VRB, $IMM", IIC_VecGeneral, []>; + def VCLRLB : VXForm_1<397, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB), + "vclrlb $vD, $vA, $rB", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_vclrlb v16i8:$vA, i32:$rB))]>; + def VCLRRB : VXForm_1<461, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB), + "vclrrb $vD, $vA, $rB", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>; + def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulld $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>; + def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>; + def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>; + def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>; + def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>; + def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>; + def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmoduw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>; + def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>; + def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>; + def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>; + def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>; + def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>; + def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>; + def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivesw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdivesw v4i32:$vA, + 
v4i32:$vB))]>; + def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdiveuw v4i32:$vA, + v4i32:$vB))]>; + def VDIVESD : VXForm_1<971, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivesd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdivesd v2i64:$vA, + v2i64:$vB))]>; + def VDIVEUD : VXForm_1<715, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdiveud v2i64:$vA, + v2i64:$vB))]>; + def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB), + "xvtlsbb $BF, $XB", IIC_VecGeneral, []>; + + // The XFormMemOp flag for the following 8 instructions is set on + // the instruction format. + let mayLoad = 1, mayStore = 0 in { + def LXVRBX : X_XT6_RA5_RB5<31, 13, "lxvrbx", vsrc, []>; + def LXVRHX : X_XT6_RA5_RB5<31, 45, "lxvrhx", vsrc, []>; + def LXVRWX : X_XT6_RA5_RB5<31, 77, "lxvrwx", vsrc, []>; + def LXVRDX : X_XT6_RA5_RB5<31, 109, "lxvrdx", vsrc, []>; + } + + let mayLoad = 0, mayStore = 1 in { + def STXVRBX : X_XS6_RA5_RB5<31, 141, "stxvrbx", vsrc, []>; + def STXVRHX : X_XS6_RA5_RB5<31, 173, "stxvrhx", vsrc, []>; + def STXVRWX : X_XS6_RA5_RB5<31, 205, "stxvrwx", vsrc, []>; + def STXVRDX : X_XS6_RA5_RB5<31, 237, "stxvrdx", vsrc, []>; + } + + def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulesd $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA, + v2i64:$vB))]>; + def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuleud $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA, + v2i64:$vB))]>; + def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulosd $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA, + v2i64:$vB))]>; + def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuloud $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA, + v2i64:$vB))]>; + def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmsumcud + v2i64:$vA, v2i64:$vB, v1i128:$vC))]>; + def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (sdiv v1i128:$vA, v1i128:$vB))]>; + def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivuq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (udiv v1i128:$vA, v1i128:$vB))]>; + def VDIVESQ : VXForm_1<779, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivesq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vdivesq v1i128:$vA, + v1i128:$vB))]>; + def VDIVEUQ : VXForm_1<523, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveuq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vdiveuq v1i128:$vA, + v1i128:$vB))]>; + def VCMPEQUQ : VCMP <455, "vcmpequq $vD, $vA, $vB" , v1i128>; + def VCMPGTSQ : VCMP <903, "vcmpgtsq $vD, $vA, $vB" , v1i128>; + def VCMPGTUQ : VCMP <647, "vcmpgtuq $vD, $vA, $vB" , v1i128>; + def VCMPEQUQ_rec : VCMP_rec <455, "vcmpequq. $vD, $vA, $vB" , v1i128>; + def VCMPGTSQ_rec : VCMP_rec <903, "vcmpgtsq. $vD, $vA, $vB" , v1i128>; + def VCMPGTUQ_rec : VCMP_rec <647, "vcmpgtuq. 
$vD, $vA, $vB" , v1i128>; + def VMODSQ : VXForm_1<1803, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (srem v1i128:$vA, v1i128:$vB))]>; + def VMODUQ : VXForm_1<1547, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmoduq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (urem v1i128:$vA, v1i128:$vB))]>; + def VEXTSD2Q : VXForm_RD5_XO5_RS5<1538, 27, (outs vrrc:$vD), (ins vrrc:$vB), + "vextsd2q $vD, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vextsd2q v2i64:$vB))]>; + def VCMPUQ : VXForm_BF3_VAB5<257, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), + "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>; + def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), + "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>; + def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", + [(set v1i128:$vD, + (int_ppc_altivec_vrlqnm v1i128:$vA, + v1i128:$vB))]>; + def VRLQMI : VXForm_1<69, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi), + "vrlqmi $vD, $vA, $vB", IIC_VecFP, + [(set v1i128:$vD, + (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB, + v1i128:$vDi))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>; + def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>; + def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>; + def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>; + def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>; + def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>; + def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>; + def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>; + def LXVKQ : XForm_XT6_IMM5<60, 31, 360, (outs vsrc:$XT), (ins u5imm:$UIM), + "lxvkq $XT, $UIM", IIC_VecGeneral, []>; +} + +let Predicates = [IsISA3_1, HasVSX] in { + def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>; + def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>; + def XSMAXCQP : X_VT5_VA5_VB5<63, 676, "xsmaxcqp", + [(set f128:$vT, (PPCxsmaxc f128:$vA, f128:$vB))]>; + def XSMINCQP : X_VT5_VA5_VB5<63, 740, "xsmincqp", + [(set f128:$vT, (PPCxsminc f128:$vA, f128:$vB))]>; +} + +// Multiclass defining patterns for Set Boolean Extension Reverse Instructions. +// This is analogous to the CRNotPat multiclass but specifically for Power10 +// and newer subtargets since the extended forms use Set Boolean instructions. +// The first two anonymous patterns defined are actually a duplicate of those +// in CRNotPat, but it is preferable to define both multiclasses as complete +// ones rather than pulling that small common section out. 
+multiclass P10ReverseSetBool<dag pattern, dag result> {
+  def : Pat<pattern, (crnot result)>;
+  def : Pat<(not pattern), result>;
+
+  def : Pat<(i32 (zext pattern)),
+            (SETBCR result)>;
+  def : Pat<(i64 (zext pattern)),
+            (SETBCR8 result)>;
+
+  def : Pat<(i32 (sext pattern)),
+            (SETNBCR result)>;
+  def : Pat<(i64 (sext pattern)),
+            (SETNBCR8 result)>;
+
+  def : Pat<(i32 (anyext pattern)),
+            (SETBCR result)>;
+  def : Pat<(i64 (anyext pattern)),
+            (SETBCR8 result)>;
+}
+
+multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, ImmLeaf ZExtTy,
+                               ImmLeaf SExtTy, I Cmp, I Cmpl, I Cmpi,
+                               I Cmpli> {
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
+                           (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
+                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
+                           (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
+                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
+                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_eq)>;
+
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETUGE)),
+                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETGE)),
+                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETULE)),
+                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETLE)),
+                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETNE)),
+                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_eq)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETNE)),
+                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_eq)>;
+}
+
+multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, I FCmp> {
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)),
+                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
+}
+
+let Predicates = [IsISA3_1] in {
+  def : Pat<(i32 (zext i1:$in)),
+            (SETBC $in)>;
+  def : Pat<(i64 (zext i1:$in)),
+            (SETBC8 $in)>;
+  def : Pat<(i32 (sext i1:$in)),
+            (SETNBC $in)>;
+  def : Pat<(i64 (sext i1:$in)),
+            (SETNBC8 $in)>;
+  def : Pat<(i32 (anyext i1:$in)),
+            (SETBC $in)>;
+  def : Pat<(i64 (anyext i1:$in)),
+            (SETBC8 $in)>;
+
+  // Instantiation of the set boolean reverse patterns for 32-bit integers.
+  defm : IntSetP10RevSetBool<setcc, i32, immZExt16, imm32SExt16,
+                             CMPW, CMPLW, CMPWI, CMPLWI>;
+  defm : P10ReverseSetBool<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+                           (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                                   (LO16 imm:$imm)), sub_eq)>;
+
+  // Instantiation of the set boolean reverse patterns for 64-bit integers.
+  defm : IntSetP10RevSetBool<setcc, i64, immZExt16, imm64SExt16,
+                             CMPD, CMPLD, CMPDI, CMPLDI>;
+  defm : P10ReverseSetBool<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+                           (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                                   (LO16 imm:$imm)), sub_eq)>;
+}
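// A sketch of the intended selection (illustrative only, assuming the
// condition already lives in a CR bit, i.e. the crbitrc input of SETBC):
//   zext i1 %c to i32  ->  setbc  rT, bi    ; rT = CR[bi] ? 1 : 0
//   sext i1 %c to i32  ->  setnbc rT, bi    ; rT = CR[bi] ? -1 : 0
// The reverse forms (setbcr/setnbcr) test the complement of the bit, which
// is what lets P10ReverseSetBool fold away the crnot in negated patterns.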
+
+// Instantiation of the set boolean reverse patterns for f32, f64, f128.
+let Predicates = [IsISA3_1, HasFPU] in {
+  defm : FSetP10RevSetBool<setcc, f32, FCMPUS>;
+  defm : FSetP10RevSetBool<setcc, f64, FCMPUD>;
+  defm : FSetP10RevSetBool<setcc, f128, XSCMPUQP>;
+}
+
+//---------------------------- Anonymous Patterns ----------------------------//
+let Predicates = [IsISA3_1] in {
+  // Exploit the vector multiply high instructions using intrinsics.
+  def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
+            (v4i32 (VMULHSW $vA, $vB))>;
+  def : Pat<(v4i32 (int_ppc_altivec_vmulhuw v4i32:$vA, v4i32:$vB)),
+            (v4i32 (VMULHUW $vA, $vB))>;
+  def : Pat<(v2i64 (int_ppc_altivec_vmulhsd v2i64:$vA, v2i64:$vB)),
+            (v2i64 (VMULHSD $vA, $vB))>;
+  def : Pat<(v2i64 (int_ppc_altivec_vmulhud v2i64:$vA, v2i64:$vB)),
+            (v2i64 (VMULHUD $vA, $vB))>;
+  def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)),
+            (v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>;
+  def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)),
+            (v8i16 (COPY_TO_REGCLASS (XXGENPCVHM $VRB, imm:$IMM), VRRC))>;
+  def : Pat<(v4i32 (int_ppc_vsx_xxgenpcvwm v4i32:$VRB, imm:$IMM)),
+            (v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>;
+  def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
+            (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
+  def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 1)),
+            (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_lt)>;
+  def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)),
+            (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>;
+
+  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 8)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 16)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 32)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRWX ForceXForm:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 64)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRDX ForceXForm:$src), VRRC))>;
+
+  def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
+            (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
+
+  def : Pat <(v2i64 (PPCxxsplti32dx v2i64:$XT, i32:$XI, i32:$IMM32)),
+             (v2i64 (XXSPLTI32DX v2i64:$XT, i32:$XI, i32:$IMM32))>;
+}
+
+let Predicates = [IsISA3_1, HasVSX] in {
+  def : Pat<(v16i8 (int_ppc_vsx_xvcvspbf16 v16i8:$XA)),
+            (COPY_TO_REGCLASS (XVCVSPBF16 RCCp.AToVSRC), VRRC)>;
+  def : Pat<(v16i8 (int_ppc_vsx_xvcvbf16spn v16i8:$XA)),
+            (COPY_TO_REGCLASS (XVCVBF16SPN RCCp.AToVSRC), VRRC)>;
+}
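// A sketch of why the little-endian patterns below are profitable: on LE
// subtargets element 0 of a vector sits in the rightmost bytes of the VSR,
// so a store of element 0 such as
//   store (trunc (extractelt v16i8 %v, 0)), i8* %p
// can be selected as a single "stxvrbx %v, 0, %p" (store VSX vector
// rightmost byte indexed) instead of an extract to a GPR followed by stb;
// the lxvr*x loads are the converse when building a vector from memory.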
+
+let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
+  // Store element 0 of a VSX register to memory
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), ForceXForm:$dst),
+            (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), ForceXForm:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), ForceXForm:$dst),
+            (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), ForceXForm:$dst)>;
+  def : Pat<(store (i32 (extractelt v4i32:$src, 0)), ForceXForm:$dst),
+            (STXVRWX $src, ForceXForm:$dst)>;
+  def : Pat<(store (f32 (extractelt v4f32:$src, 0)), ForceXForm:$dst),
+            (STXVRWX $src, ForceXForm:$dst)>;
+  def : Pat<(store (i64 (extractelt v2i64:$src, 0)), ForceXForm:$dst),
+            (STXVRDX $src, ForceXForm:$dst)>;
+  def : Pat<(store (f64 (extractelt v2f64:$src, 0)), ForceXForm:$dst),
+            (STXVRDX $src, ForceXForm:$dst)>;
+  // Load into element 0 of a VSX register from memory
+  def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 ForceXForm:$src)))),
+            (v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
+  def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 ForceXForm:$src)))),
+            (v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>;
+}
+
+// FIXME: The swap is overkill when the shift amount is a constant.
+// We should just fix the constant in the DAG.
+let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
+  def : Pat<(v1i128 (shl v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSLQ v1i128:$VRA,
+                          (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                    (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+  def : Pat<(v1i128 (PPCshl v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSLQ v1i128:$VRA,
+                          (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                    (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+  def : Pat<(v1i128 (srl v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSRQ v1i128:$VRA,
+                          (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                    (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+  def : Pat<(v1i128 (PPCsrl v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSRQ v1i128:$VRA,
+                          (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                    (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+  def : Pat<(v1i128 (sra v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSRAQ v1i128:$VRA,
+                           (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                     (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+  def : Pat<(v1i128 (PPCsra v1i128:$VRA, v1i128:$VRB)),
+            (v1i128 (VSRAQ v1i128:$VRA,
+                           (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+                                     (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+}
+
+class xxevalPattern<dag pattern, bits<8> imm> :
+  Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+
+let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
+  def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
+                                 i32immNonAllOneNonZero:$A,
+                                 i32immNonAllOneNonZero:$A,
+                                 i32immNonAllOneNonZero:$A)),
+            (v4i32 (XXSPLTIW imm:$A))>;
+  def : Pat<(f32 nzFPImmAsi32:$A),
+            (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
+                              VSFRC)>;
+  def : Pat<(f64 nzFPImmAsi32:$A),
+            (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
+                              VSFRC)>;
+
+// To replace constant pool with XXSPLTI32DX for scalars.
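// For instance (a sketch, not taken from a test): materializing the f64
// constant 1.0, whose bit pattern 0x3FF0000000000000 has no 32-bit splat
// form, would use the two chained XXSPLTI32DX operations matched below as
//   xxsplti32dx vsX, 0, 0x3FF00000   ; IX = 0 writes the high words
//   xxsplti32dx vsX, 1, 0x00000000   ; IX = 1 writes the low words
// rather than a load from the constant pool.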
+def : Pat<(f32 nzFPImmAsi64:$A), + (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX(IMPLICIT_DEF), 0, + (getFPAs64BitIntHi $A)), + 1, (getFPAs64BitIntLo $A)), + VSSRC)>; + +def : Pat<(f64 nzFPImmAsi64:$A), + (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX (IMPLICIT_DEF), 0, + (getFPAs64BitIntHi $A)), + 1, (getFPAs64BitIntLo $A)), + VSFRC)>; + + // Anonymous patterns for XXEVAL + // AND + // and(A, B, C) + def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; + // and(A, xor(B, C)) + def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; + // and(A, or(B, C)) + def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; + // and(A, nor(B, C)) + def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; + // and(A, eqv(B, C)) + def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; + // and(A, nand(B, C)) + def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; + + // NAND + // nand(A, B, C) + def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), + !sub(255, 1)>; + // nand(A, xor(B, C)) + def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), + !sub(255, 6)>; + // nand(A, or(B, C)) + def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), + !sub(255, 7)>; + // nand(A, nor(B, C)) + def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), + !sub(255, 8)>; + // nand(A, eqv(B, C)) + def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), + !sub(255, 9)>; + // nand(A, nand(B, C)) + def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), + !sub(255, 14)>; + + // Anonymous patterns to select prefixed VSX loads and stores. + // Load / Store f128 + def : Pat<(f128 (load PDForm:$src)), + (COPY_TO_REGCLASS (PLXV memri34:$src), VRRC)>; + def : Pat<(store f128:$XS, PDForm:$dst), + (PSTXV (COPY_TO_REGCLASS $XS, VSRC), memri34:$dst)>; + + // Load / Store v4i32 + def : Pat<(v4i32 (load PDForm:$src)), (PLXV memri34:$src)>; + def : Pat<(store v4i32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; + + // Load / Store v2i64 + def : Pat<(v2i64 (load PDForm:$src)), (PLXV memri34:$src)>; + def : Pat<(store v2i64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; + + // Load / Store v4f32 + def : Pat<(v4f32 (load PDForm:$src)), (PLXV memri34:$src)>; + def : Pat<(store v4f32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; + + // Load / Store v2f64 + def : Pat<(v2f64 (load PDForm:$src)), (PLXV memri34:$src)>; + def : Pat<(store v2f64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; + + // Cases For PPCstore_scal_int_from_vsr + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), PDForm:$dst, 8), + (PSTXSD (XSCVDPUXDS f64:$src), PDForm:$dst)>; + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), PDForm:$dst, 8), + (PSTXSD (XSCVDPSXDS f64:$src), PDForm:$dst)>; + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), PDForm:$dst, 8), + (PSTXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), + PDForm:$dst)>; + def : Pat<(PPCstore_scal_int_from_vsr + (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), PDForm:$dst, 8), + (PSTXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), + PDForm:$dst)>; +} + +let Predicates = [PrefixInstrs] in { + def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>; + def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>; + def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)), + (COPY_TO_REGCLASS (XXPERMX 
(COPY_TO_REGCLASS $A, VSRC), + (COPY_TO_REGCLASS $B, VSRC), + (COPY_TO_REGCLASS $C, VSRC), $D), VSRC)>; + def : Pat<(v16i8 (int_ppc_vsx_xxblendvb v16i8:$A, v16i8:$B, v16i8:$C)), + (COPY_TO_REGCLASS + (XXBLENDVB (COPY_TO_REGCLASS $A, VSRC), + (COPY_TO_REGCLASS $B, VSRC), + (COPY_TO_REGCLASS $C, VSRC)), VSRC)>; + def : Pat<(v8i16 (int_ppc_vsx_xxblendvh v8i16:$A, v8i16:$B, v8i16:$C)), + (COPY_TO_REGCLASS + (XXBLENDVH (COPY_TO_REGCLASS $A, VSRC), + (COPY_TO_REGCLASS $B, VSRC), + (COPY_TO_REGCLASS $C, VSRC)), VSRC)>; + def : Pat<(int_ppc_vsx_xxblendvw v4i32:$A, v4i32:$B, v4i32:$C), + (XXBLENDVW $A, $B, $C)>; + def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C), + (XXBLENDVD $A, $B, $C)>; + + // Anonymous patterns to select prefixed loads and stores. + // Load i32 + def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>; + def : Pat<(i32 (zextloadi1 PDForm:$src)), (PLBZ memri34:$src)>; + def : Pat<(i32 (extloadi8 PDForm:$src)), (PLBZ memri34:$src)>; + def : Pat<(i32 (zextloadi8 PDForm:$src)), (PLBZ memri34:$src)>; + def : Pat<(i32 (extloadi16 PDForm:$src)), (PLHZ memri34:$src)>; + def : Pat<(i32 (zextloadi16 PDForm:$src)), (PLHZ memri34:$src)>; + def : Pat<(i32 (sextloadi16 PDForm:$src)), (PLHA memri34:$src)>; + def : Pat<(i32 (load PDForm:$src)), (PLWZ memri34:$src)>; + + // Store i32 + def : Pat<(truncstorei8 i32:$rS, PDForm:$dst), (PSTB gprc:$rS, memri34:$dst)>; + def : Pat<(truncstorei16 i32:$rS, PDForm:$dst), (PSTH gprc:$rS, memri34:$dst)>; + def : Pat<(store i32:$rS, PDForm:$dst), (PSTW gprc:$rS, memri34:$dst)>; + + // Load i64 + def : Pat<(i64 (extloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>; + def : Pat<(i64 (zextloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>; + def : Pat<(i64 (extloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>; + def : Pat<(i64 (zextloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>; + def : Pat<(i64 (extloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>; + def : Pat<(i64 (zextloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>; + def : Pat<(i64 (sextloadi16 PDForm:$src)), (PLHA8 memri34:$src)>; + def : Pat<(i64 (extloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>; + def : Pat<(i64 (zextloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>; + def : Pat<(i64 (sextloadi32 PDForm:$src)), (PLWA8 memri34:$src)>; + def : Pat<(i64 (load PDForm:$src)), (PLD memri34:$src)>; + + // Store i64 + def : Pat<(truncstorei8 i64:$rS, PDForm:$dst), (PSTB8 g8rc:$rS, memri34:$dst)>; + def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>; + def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>; + def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>; + + // Load / Store f32 + def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>; + def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>; + + // Load / Store f64 + def : Pat<(f64 (extloadf32 PDForm:$src)), + (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>; + def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>; + def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>; + + // Atomic Load + def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>; + def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>; + def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>; + def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>; + + // Atomic Store + def : Pat<(atomic_store_8 PDForm:$dst, i32:$RS), (PSTB $RS, memri34:$dst)>; + def : Pat<(atomic_store_16 PDForm:$dst, i32:$RS), (PSTH $RS, memri34:$dst)>; + def : Pat<(atomic_store_32 PDForm:$dst, i32:$RS), 
(PSTW $RS, memri34:$dst)>; + def : Pat<(atomic_store_64 PDForm:$dst, i64:$RS), (PSTD $RS, memri34:$dst)>; + + // Prefixed fpext to v2f64 + def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)), + (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>; +} + +def InsertEltShift { + dag Sub32 = (i32 (EXTRACT_SUBREG $rB, sub_32)); + dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30); + dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29); + dag Left1 = (RLWINM $rB, 1, 0, 30); + dag Left2 = (RLWINM $rB, 2, 0, 29); + dag Left3 = (RLWINM8 $rB, 3, 0, 28); +} + +let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in { + // Indexed vector insert element + def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)), + (VINSBRX $vDi, InsertEltShift.Sub32, $rA)>; + def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)), + (VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>; + def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>; + def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, $rA)>; + + def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), + (VINSWVRX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; + + def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; + let AddedComplexity = 400 in { + // Immediate vector insert element + foreach Idx = [0, 1, 2, 3] in { + def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, Idx)), + (VINSW $vDi, !mul(!sub(3, Idx), 4), $rA)>; + } + foreach i = [0, 1] in + def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, (i64 i))), + (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>; + } +} + +let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in { + // Indexed vector insert element + def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i32:$rB)), + (VINSBLX $vDi, $rB, $rA)>; + def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i32:$rB)), + (VINSHLX $vDi, InsertEltShift.Left1, $rA)>; + def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i32:$rB)), + (VINSWLX $vDi, InsertEltShift.Left2, $rA)>; + + def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i32:$rB)), + (VINSWVLX $vDi, InsertEltShift.Left2, (XSCVDPSPN $rA))>; +} + +let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in { + // Indexed vector insert element + def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)), + (VINSBLX $vDi, InsertEltShift.Sub32, $rA)>; + def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)), + (VINSHLX $vDi, InsertEltShift.Sub32Left1, $rA)>; + def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>; + def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, $rA)>; + + def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), + (VINSWVLX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; + + def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), + (VINSDLX 
$vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
+  def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)),
+            (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
+  def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)),
+            (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+}
+
+let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX, IsBigEndian] in {
+  // Immediate vector insert element
+  foreach Ty = [i32, i64] in {
+    foreach Idx = [0, 1, 2, 3] in {
+      def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, (Ty Idx))),
+                (VINSW $vDi, !mul(Idx, 4), $rA)>;
+    }
+  }
+
+  foreach Idx = [0, 1] in
+    def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, Idx)),
+              (VINSD $vDi, !mul(Idx, 8), $rA)>;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
deleted file mode 100644
index ff43426dd1ef..000000000000
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ /dev/null
@@ -1,2889 +0,0 @@
-//-------------------------- Predicate definitions ---------------------------//
-def IsPPC32 : Predicate<"!Subtarget->isPPC64()">;
-
-// Mask immediates for MMA instructions (2, 4 and 8 bits).
-def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
-def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
-def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
-
-//===----------------------------------------------------------------------===//
-// PowerPC ISA 3.1 specific type constraints.
-//
-
-def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
-  SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
-]>;
-def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
-  SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
-  SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
-]>;
-def SDT_PPCPairBuild : SDTypeProfile<1, 2, [
-  SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>
-]>;
-def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
-  SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisPtrTy<2>
-]>;
-def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
-  SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisPtrTy<2>
-]>;
-def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
-  SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
-]>;
-
-//===----------------------------------------------------------------------===//
-// ISA 3.1 specific PPCISD nodes.
-//
-
-def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
-def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
-def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>;
-def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
-                              []>;
-def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
-                               []>;
-def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
-
-//===----------------------------------------------------------------------===//
-
-// PC Relative flag (for instructions that use the address of the prefix for
-// address computations).
-class isPCRel { bit PCRel = 1; }
-
-// PowerPC specific type constraints.
-def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [
-  SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
-]>;
-
-// PPC Specific DAG Nodes.
-def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX,
-                       [SDNPHasChain, SDNPMayLoad]>;
-
-// Top-level class for prefixed instructions.
-class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
-         InstrItinClass itin> : Instruction {
-  field bits<64> Inst;
-  field bits<64> SoftFail = 0;
-  bit PCRel = 0; // Default value, set by isPCRel.
-  let Size = 8;
-
-  let Namespace = "PPC";
-  let OutOperandList = OOL;
-  let InOperandList = IOL;
-  let AsmString = asmstr;
-  let Itinerary = itin;
-  let Inst{0-5} = pref;
-  let Inst{32-37} = opcode;
-
-  bits<1> PPC970_First = 0;
-  bits<1> PPC970_Single = 0;
-  bits<1> PPC970_Cracked = 0;
-  bits<3> PPC970_Unit = 0;
-
-  /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
-  /// these must be reflected there! See comments there for what these are.
-  let TSFlags{0} = PPC970_First;
-  let TSFlags{1} = PPC970_Single;
-  let TSFlags{2} = PPC970_Cracked;
-  let TSFlags{5-3} = PPC970_Unit;
-
-  bits<1> Prefixed = 1; // This is a prefixed instruction.
-  let TSFlags{7} = Prefixed;
-
-  // For cases where multiple instruction definitions really represent the
-  // same underlying instruction but with one definition for 64-bit arguments
-  // and one for 32-bit arguments, this bit breaks the degeneracy between
-  // the two forms and allows TableGen to generate mapping tables.
-  bit Interpretation64Bit = 0;
-
-  // Fields used for relation models.
-  string BaseName = "";
-}
-
-// VX-Form: [ PO VT R VB RC XO ]
-class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
-                     InstrItinClass itin, list<dag> pattern>
-  : I<4, OOL, IOL, asmstr, itin> {
-  bits<5> VT;
-  bits<5> VB;
-  bit RC = 0;
-
-  let Pattern = pattern;
-
-  let Inst{6-10} = VT;
-  let Inst{11-15} = R;
-  let Inst{16-20} = VB;
-  let Inst{21} = RC;
-  let Inst{22-31} = xo;
-}
-
-// Multiclass definition to account for record and non-record form
-// instructions of VXRForm.
-multiclass VXForm_VTB5_RCr<bits<10> xo, bits<5> R, dag OOL, dag IOL,
-                           string asmbase, string asmstr,
-                           InstrItinClass itin, list<dag> pattern> {
-  let BaseName = asmbase in {
-    def NAME : VXForm_VTB5_RC<xo, R, OOL, IOL,
-                              !strconcat(asmbase, !strconcat(" ", asmstr)),
-                              itin, pattern>, RecFormRel;
-    let Defs = [CR6] in
-    def _rec : VXForm_VTB5_RC<xo, R, OOL, IOL,
-                              !strconcat(asmbase, !strconcat(". ", asmstr)),
-                              itin, []>, isRecordForm, RecFormRel;
-  }
-}
-
-class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
-                                InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<5> FRS;
-  bits<39> D_RA;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
-  let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D_RA{33-16}; // d0
-
-  // The instruction.
-  let Inst{38-42} = FRS{4-0};
-  let Inst{43-47} = D_RA{38-34}; // RA
-  let Inst{48-63} = D_RA{15-0}; // d1
-}
-
-class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
-                            InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<5> RT;
-  bits<5> RA;
-  bits<34> SI;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
-  let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = SI{33-16};
-
-  // The instruction.
-  let Inst{38-42} = RT;
-  let Inst{43-47} = RA;
-  let Inst{48-63} = SI{15-0};
-}
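// A sketch of the resulting 64-bit encoding under the field layout above
// (illustrative values only): for "pli r3, SI" the 34-bit immediate is
// split so that SI{33-16} lands in bits 14-31 of the prefix word and
// SI{15-0} in bits 48-63 of the suffix word, roughly
//   prefix = 0x06000000 | ((SI >> 16) & 0x3FFFF);  // pref = 1, Inst{6-7} = 2
//   suffix = 0x38600000 | (SI & 0xFFFF);           // opcode 14, RT = r3, RA = 0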
-
-class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
-                         InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<5> RT;
-  bits<34> SI;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
-  let Inst{11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = SI{33-16};
-
-  // The instruction.
-  let Inst{38-42} = RT;
-  let Inst{43-47} = 0;
-  let Inst{48-63} = SI{15-0};
-}
-
-multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
-                                   dag PCRel_IOL, string asmstr,
-                                   InstrItinClass itin> {
-  def NAME : MLS_DForm_R_SI34_RTA5<opcode, OOL, IOL,
-                                   !strconcat(asmstr, ", 0"), itin, []>;
-  def pc : MLS_DForm_R_SI34_RTA5<opcode, OOL, PCRel_IOL,
-                                 !strconcat(asmstr, ", 1"), itin, []>, isPCRel;
-}
-
-class 8LS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
-                            InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<5> RT;
-  bits<39> D_RA;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-10} = 0;
-  let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D_RA{33-16}; // d0
-
-  // The instruction.
-  let Inst{38-42} = RT{4-0};
-  let Inst{43-47} = D_RA{38-34}; // RA
-  let Inst{48-63} = D_RA{15-0}; // d1
-}
-
-// 8LS:D-Form: [ 1 0 0 // R // d0
-//               PO TX T RA d1 ]
-class 8LS_DForm_R_SI34_XT6_RA5<bits<5> opcode, dag OOL, dag IOL, string asmstr,
-                               InstrItinClass itin, list<dag> pattern>
-  : PI<1, { opcode, ? }, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<39> D_RA;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 0;
-  let Inst{8} = 0;
-  let Inst{9-10} = 0; // reserved
-  let Inst{11} = PCRel;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-31} = D_RA{33-16}; // d0
-
-  // The instruction.
-  let Inst{37} = XT{5};
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = D_RA{38-34}; // RA
-  let Inst{48-63} = D_RA{15-0}; // d1
-}
-
-// X-Form: [PO T IMM VRB XO TX]
-class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
-                         string asmstr, InstrItinClass itin,
-                         list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<5> VRB;
-  bits<5> IMM;
-
-  let Pattern = pattern;
-  let Inst{6-10} = XT{4-0};
-  let Inst{11-15} = IMM;
-  let Inst{16-20} = VRB;
-  let Inst{21-30} = xo;
-  let Inst{31} = XT{5};
-}
-
-class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo,
-                             dag OOL, dag IOL, string asmstr,
-                             InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<6> XA;
-  bits<6> XB;
-  bits<6> XC;
-  bits<8> IMM;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8} = 0;
-  let Inst{9-11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-23} = 0;
-  let Inst{24-31} = IMM;
-
-  // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-57} = XC{4-0};
-  let Inst{58-59} = xo;
-  let Inst{60} = XC{5};
-  let Inst{61} = XA{5};
-  let Inst{62} = XB{5};
-  let Inst{63} = XT{5};
-}
-
-class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
-                        InstrItinClass itin, list<dag> pattern>
-  : I<4, OOL, IOL, asmstr, itin> {
-  bits<5> RD;
-  bits<5> VB;
-  bits<3> N;
-
-  let Pattern = pattern;
-
-  let Inst{6-10} = RD;
-  let Inst{11-12} = 0;
-  let Inst{13-15} = N;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
-}
-
-
-// VX-Form: [PO VRT RA VRB XO].
-// Destructive (insert) forms are suffixed with _ins.
-class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
-  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, vrrc:$vB),
-             !strconcat(opc, " $vD, $rA, $vB"), IIC_VecGeneral, pattern>,
-    RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
-
-// VX-Form: [PO VRT RA RB XO].
-// Destructive (insert) forms are suffixed with _ins.
-class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
-  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, gprc:$rB),
-             !strconcat(opc, " $vD, $rA, $rB"), IIC_VecGeneral, pattern>,
-    RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
-
-// VX-Form: [ PO BF // VRA VRB XO ]
-class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
-                      InstrItinClass itin, list<dag> pattern>
-  : I<4, OOL, IOL, asmstr, itin> {
-  bits<3> BF;
-  bits<5> VA;
-  bits<5> VB;
-
-  let Pattern = pattern;
-
-  let Inst{6-8} = BF;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
-}
-
-// VN-Form: [PO VRT VRA VRB PS SD XO]
-// SD is "Shift Direction"
-class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
-                       InstrItinClass itin, list<dag> pattern>
-  : I<4, OOL, IOL, asmstr, itin> {
-  bits<5> VRT;
-  bits<5> VRA;
-  bits<5> VRB;
-  bits<3> SD;
-
-  let Pattern = pattern;
-
-  let Inst{6-10} = VRT;
-  let Inst{11-15} = VRA;
-  let Inst{16-20} = VRB;
-  let Inst{21-22} = ps;
-  let Inst{23-25} = SD;
-  let Inst{26-31} = xo;
-}
-
-class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
-                        string asmstr, InstrItinClass itin, list<dag> pattern>
-  : I<4, OOL, IOL, asmstr, itin> {
-  bits<5> RD;
-  bits<5> VB;
-  bit MP;
-
-  let Pattern = pattern;
-
-  let Inst{6-10} = RD;
-  let Inst{11-14} = eo;
-  let Inst{15} = MP;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
-}
-
-// 8RR:D-Form: [ 1 1 0 // // imm0
-//               PO T XO TX imm1 ].
-class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
-                          string asmstr, InstrItinClass itin,
-                          list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<32> IMM32;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-15} = 0; // reserved
-  let Inst{16-31} = IMM32{31-16};
-
-  // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-46} = xo;
-  let Inst{47} = XT{5};
-  let Inst{48-63} = IMM32{15-0};
-}
-
-// 8RR:D-Form: [ 1 1 0 // // imm0
-//               PO T XO IX TX imm1 ].
-class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
-                             string asmstr, InstrItinClass itin,
-                             list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bit IX;
-  bits<32> IMM32;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-15} = 0; // reserved
-  let Inst{16-31} = IMM32{31-16};
-
-  // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-45} = xo;
-  let Inst{46} = IX;
-  let Inst{47} = XT{5};
-  let Inst{48-63} = IMM32{15-0};
-}
-
-class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
-                         string asmstr, InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<6> XA;
-  bits<6> XB;
-  bits<6> XC;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = 0;
-
-  // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-57} = XC{4-0};
-  let Inst{58-59} = xo;
-  let Inst{60} = XC{5};
-  let Inst{61} = XA{5};
-  let Inst{62} = XB{5};
-  let Inst{63} = XT{5};
-}
-
-class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
-                              string asmstr, InstrItinClass itin,
-                              list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<6> XT;
-  bits<6> XA;
-  bits<6> XB;
-  bits<6> XC;
-  bits<3> IMM;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-28} = 0;
-  let Inst{29-31} = IMM;
-
-  // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-57} = XC{4-0};
-  let Inst{58-59} = xo;
-  let Inst{60} = XC{5};
-  let Inst{61} = XA{5};
-  let Inst{62} = XB{5};
-  let Inst{63} = XT{5};
-}
-
-// [PO BF / XO2 B XO BX /]
-class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
-                          dag IOL, string asmstr, InstrItinClass itin,
-                          list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<3> BF;
-  bits<6> XB;
-
-  let Pattern = pattern;
-
-  let Inst{6-8} = BF;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
-  let Inst{30} = XB{5};
-  let Inst{31} = 0;
-}
-
-// X-Form: [ PO RT BI /// XO / ]
-class XForm_XT5_BI5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
-                    string asmstr, InstrItinClass itin, list<dag> pattern>
-  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
-  let B = 0;
-}
-
-multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
-                                       dag PCRel_IOL, string asmstr,
-                                       InstrItinClass itin> {
-  def NAME : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, IOL,
-                                       !strconcat(asmstr, ", 0"), itin, []>;
-  def pc : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, PCRel_IOL,
-                                     !strconcat(asmstr, ", 1"), itin, []>,
-           isPCRel;
-}
-
-multiclass 8LS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
-                                   dag PCRel_IOL, string asmstr,
-                                   InstrItinClass itin> {
-  def NAME : 8LS_DForm_R_SI34_RTA5<opcode, OOL, IOL,
-                                   !strconcat(asmstr, ", 0"), itin, []>;
-  def pc : 8LS_DForm_R_SI34_RTA5<opcode, OOL, PCRel_IOL,
-                                 !strconcat(asmstr, ", 1"), itin, []>, isPCRel;
-}
-
-multiclass 8LS_DForm_R_SI34_XT6_RA5_p<bits<5> opcode, dag OOL, dag IOL,
-                                      dag PCRel_IOL, string asmstr,
-                                      InstrItinClass itin> {
-  def NAME : 8LS_DForm_R_SI34_XT6_RA5<opcode, OOL, IOL,
-                                      !strconcat(asmstr, ", 0"), itin, []>;
-  def pc : 8LS_DForm_R_SI34_XT6_RA5<opcode, OOL, PCRel_IOL,
-                                    !strconcat(asmstr, ", 1"), itin, []>,
-           isPCRel;
-}
-
-def PPCRegVSRpRCAsmOperand : AsmOperandClass {
-  let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber";
-}
-
-def vsrprc : RegisterOperand<VSRpRC> {
-  let ParserMatchClass = PPCRegVSRpRCAsmOperand;
-}
-
-def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
-  let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
-}
-
-def vsrpevenrc : RegisterOperand<VSRpRC> {
-  let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
-  let EncoderMethod = "getVSRpEvenEncoding";
-  let DecoderMethod = "decodeVSRpEvenOperands";
-}
-
-class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
-                           string asmstr, InstrItinClass itin,
-                           list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<5> XTp;
-  bits<17> DQ_RA;
-  let Pattern = pattern;
-
-  let Inst{6-9} = XTp{3-0};
-  let Inst{10} = XTp{4};
-  let Inst{11-15} = DQ_RA{16-12}; // Register #
-  let Inst{16-27} = DQ_RA{11-0}; // Displacement.
-  let Inst{28-31} = xo;
-}
-
-class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
-                      string asmstr, InstrItinClass itin, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp {
-  bits<5> XTp;
-  bits<5> A;
-  bits<5> B;
-
-  let Pattern = pattern;
-  let Inst{6-9} = XTp{3-0};
-  let Inst{10} = XTp{4};
-  let Inst{11-15} = A;
-  let Inst{16-20} = B;
-  let Inst{21-30} = xo;
-  let Inst{31} = 0;
-}
-
-class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
-                                InstrItinClass itin, list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<5> XTp;
-  bits<39> D_RA;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-10} = 0;
-  let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D_RA{33-16}; // Imm18
-
-  // The instruction.
-  let Inst{38-41} = XTp{3-0};
-  let Inst{42} = XTp{4};
-  let Inst{43-47} = D_RA{38-34}; // Register #
-  let Inst{48-63} = D_RA{15-0}; // D
-}
-
-multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL,
-                                       dag IOL, dag PCRel_IOL,
-                                       string asmstr, InstrItinClass itin> {
-  def NAME : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, IOL,
-                                       !strconcat(asmstr, ", 0"), itin, []>;
-  def pc : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, PCRel_IOL,
-                                     !strconcat(asmstr, ", 1"), itin, []>,
-           isPCRel;
-}
-
-def PPCRegACCRCAsmOperand : AsmOperandClass {
-  let Name = "RegACCRC"; let PredicateMethod = "isACCRegNumber";
-}
-
-def acc : RegisterOperand<ACCRC> {
-  let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-def uacc : RegisterOperand<UACCRC> {
-  let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-// [PO AS XO2 XO]
-class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
-                string asmstr, InstrItinClass itin, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<3> AT;
-
-  let Pattern = pattern;
-
-  let Inst{6-8} = AT;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
-  let Inst{31} = 0;
-}
-
-class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
-                       string asmstr, InstrItinClass itin,
-                       list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<3> AT;
-  bits<6> XA;
-  bits<6> XB;
-
-  let Pattern = pattern;
-
-  let Inst{6-8} = AT;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-28} = xo;
-  let Inst{29} = XA{5};
-  let Inst{30} = XB{5};
-  let Inst{31} = 0;
-}
-
-class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
-                               string asmstr, InstrItinClass itin,
-                               list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<3> AT;
-  bits<6> XA;
-  bits<6> XB;
-  bits<4> XMSK;
-  bits<4> YMSK;
-  bits<2> PMSK;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-17} = PMSK;
-  let Inst{18-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
-
-  // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
-  let Inst{61} = XA{5};
-  let Inst{62} = XB{5};
-  let Inst{63} = 0;
-}
-
-class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
-                             string asmstr, InstrItinClass itin,
-                             list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
-  bits<3> AT;
-  bits<6> XA;
-  bits<6> XB;
-  bits<4> XMSK;
-  bits<4> YMSK;
-
-  let Pattern = pattern;
-
-  // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
-
-  // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
-  let Inst{61} = XA{5};
-  let Inst{62} = XB{5};
-  let Inst{63} = 0;
-}
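// A rough summary of the MMIRR mask operands encoded above, for context:
// in the prefixed ("pm") MMA forms, XMSK and YMSK select which rows and
// columns of the target accumulator participate in the rank-k update, and
// PMSK (2, 4 or 8 bits, matching Msk2Imm/Msk4Imm/Msk8Imm) masks the packed
// input lanes for the narrower element types.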
- let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; - let Inst{61} = XA{5}; - let Inst{62} = XB{5}; - let Inst{63} = 0; -} - -class MMIRR_XX3Form_XY4P8_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, - string asmstr, InstrItinClass itin, - list pattern> - : PI<1, opcode, OOL, IOL, asmstr, itin> { - bits<3> AT; - bits<6> XA; - bits<6> XB; - bits<4> XMSK; - bits<4> YMSK; - bits<8> PMSK; - - let Pattern = pattern; - - // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-23} = PMSK; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; - - // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; - let Inst{61} = XA{5}; - let Inst{62} = XB{5}; - let Inst{63} = 0; -} - -class MMIRR_XX3Form_XYP4_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, - string asmstr, InstrItinClass itin, - list pattern> - : PI<1, opcode, OOL, IOL, asmstr, itin> { - bits<3> AT; - bits<6> XA; - bits<6> XB; - bits<4> XMSK; - bits<4> YMSK; - bits<4> PMSK; - - let Pattern = pattern; - - // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-19} = PMSK; - let Inst{20-23} = 0; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; - - // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; - let Inst{61} = XA{5}; - let Inst{62} = XB{5}; - let Inst{63} = 0; -} - -def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; -def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; -def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">; -def MMA : Predicate<"Subtarget->hasMMA()">; - -def RCCp { - dag AToVSRC = (COPY_TO_REGCLASS $XA, VSRC); - dag BToVSRC = (COPY_TO_REGCLASS $XB, VSRC); -} - -let Predicates = [PrefixInstrs] in { - let Interpretation64Bit = 1, isCodeGenOnly = 1 in { - defm PADDI8 : - MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm_pcrel:$SI), - "paddi $RT, $RA, $SI", IIC_LdStLFD>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { - def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT), - (ins s34imm:$SI), - "pli $RT, $SI", IIC_IntSimple, []>; - } - } - defm PADDI : - MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm_pcrel:$SI), - "paddi $RT, $RA, $SI", IIC_LdStLFD>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { - def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT), - (ins s34imm:$SI), - "pli $RT, $SI", IIC_IntSimple, []>; - } - - let mayLoad = 1, mayStore = 0 in { - defm PLXV : - 8LS_DForm_R_SI34_XT6_RA5_p<25, (outs vsrc:$XT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plxv $XT, $D_RA", - IIC_LdStLFD>; - defm PLFS : - MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$FRT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plfs $FRT, $D_RA", - IIC_LdStLFD>; - defm PLFD : - MLS_DForm_R_SI34_RTA5_MEM_p<50, (outs f8rc:$FRT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plfd $FRT, $D_RA", - IIC_LdStLFD>; - defm PLXSSP : - 8LS_DForm_R_SI34_RTA5_p<43, (outs vfrc:$VRT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plxssp $VRT, $D_RA", - IIC_LdStLFD>; - defm PLXSD : - 8LS_DForm_R_SI34_RTA5_p<42, (outs vfrc:$VRT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plxsd $VRT, $D_RA", - IIC_LdStLFD>; - let Interpretation64Bit = 1, 
isCodeGenOnly = 1 in { - defm PLBZ8 : - MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plbz $RT, $D_RA", - IIC_LdStLFD>; - defm PLHZ8 : - MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA", - IIC_LdStLFD>; - defm PLHA8 : - MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA", - IIC_LdStLFD>; - defm PLWA8 : - 8LS_DForm_R_SI34_RTA5_p<41, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plwa $RT, $D_RA", - IIC_LdStLFD>; - defm PLWZ8 : - MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA", - IIC_LdStLFD>; - } - defm PLBZ : - MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs gprc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plbz $RT, $D_RA", - IIC_LdStLFD>; - defm PLHZ : - MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs gprc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA", - IIC_LdStLFD>; - defm PLHA : - MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs gprc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA", - IIC_LdStLFD>; - defm PLWZ : - MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs gprc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA", - IIC_LdStLFD>; - defm PLWA : - 8LS_DForm_R_SI34_RTA5_p<41, (outs gprc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plwa $RT, $D_RA", - IIC_LdStLFD>; - defm PLD : - 8LS_DForm_R_SI34_RTA5_p<57, (outs g8rc:$RT), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "pld $RT, $D_RA", - IIC_LdStLFD>; - } - - let mayStore = 1, mayLoad = 0 in { - defm PSTXV : - 8LS_DForm_R_SI34_XT6_RA5_p<27, (outs), (ins vsrc:$XS, memri34:$D_RA), - (ins vsrc:$XS, memri34_pcrel:$D_RA), - "pstxv $XS, $D_RA", IIC_LdStLFD>; - defm PSTFS : - MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$FRS, memri34:$D_RA), - (ins f4rc:$FRS, memri34_pcrel:$D_RA), - "pstfs $FRS, $D_RA", IIC_LdStLFD>; - defm PSTFD : - MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$FRS, memri34:$D_RA), - (ins f8rc:$FRS, memri34_pcrel:$D_RA), - "pstfd $FRS, $D_RA", IIC_LdStLFD>; - defm PSTXSSP : - 8LS_DForm_R_SI34_RTA5_p<47, (outs), (ins vfrc:$VRS, memri34:$D_RA), - (ins vfrc:$VRS, memri34_pcrel:$D_RA), - "pstxssp $VRS, $D_RA", IIC_LdStLFD>; - defm PSTXSD : - 8LS_DForm_R_SI34_RTA5_p<46, (outs), (ins vfrc:$VRS, memri34:$D_RA), - (ins vfrc:$VRS, memri34_pcrel:$D_RA), - "pstxsd $VRS, $D_RA", IIC_LdStLFD>; - let Interpretation64Bit = 1, isCodeGenOnly = 1 in { - defm PSTB8 : - MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RS, memri34:$D_RA), - (ins g8rc:$RS, memri34_pcrel:$D_RA), - "pstb $RS, $D_RA", IIC_LdStLFD>; - defm PSTH8 : - MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins g8rc:$RS, memri34:$D_RA), - (ins g8rc:$RS, memri34_pcrel:$D_RA), - "psth $RS, $D_RA", IIC_LdStLFD>; - defm PSTW8 : - MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins g8rc:$RS, memri34:$D_RA), - (ins g8rc:$RS, memri34_pcrel:$D_RA), - "pstw $RS, $D_RA", IIC_LdStLFD>; - } - defm PSTB : - MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins gprc:$RS, memri34:$D_RA), - (ins gprc:$RS, memri34_pcrel:$D_RA), - "pstb $RS, $D_RA", IIC_LdStLFD>; - defm PSTH : - MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins gprc:$RS, memri34:$D_RA), - (ins gprc:$RS, memri34_pcrel:$D_RA), - "psth $RS, $D_RA", IIC_LdStLFD>; - defm PSTW : - MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins gprc:$RS, memri34:$D_RA), - (ins gprc:$RS, memri34_pcrel:$D_RA), - "pstw $RS, $D_RA", 
IIC_LdStLFD>;
-    defm PSTD :
-      8LS_DForm_R_SI34_RTA5_p<61, (outs), (ins g8rc:$RS, memri34:$D_RA),
-                              (ins g8rc:$RS, memri34_pcrel:$D_RA),
-                              "pstd $RS, $D_RA", IIC_LdStLFD>;
-  }
-}
-
-// Multiclass definitions for MMA accumulator instructions.
-// ----------------------------------------------------------------------------
-
-// Defines 2 unmasked instructions where the xo field for acc/non-acc version
-// is even/odd.
-multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
-                       string asmstr> {
-  let Predicates = [MMA] in {
-    def NAME :
-      XX3Form_AT3_XAB6<opcode, !or(xo, 0x01), (outs acc:$AT), IOL,
-                       !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
-      RegConstraint<"@earlyclobber $AT">;
-    def PP :
-      XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL),
-                       !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  }
-}
-
-// Defines 4 instructions, masked/unmasked with masks 8, 4, 4 bits.
-// The XO field for acc/non-acc version is even/odd.
-multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
-                            string asmbase, string asmstr> {
-  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
-  let Predicates = [MMA, PrefixInstrs] in {
-    def PM#NAME :
-      MMIRR_XX3Form_XY4P8_XAB6<
-        opcode, !or(xo, 0x01), (outs acc:$AT),
-        !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)),
-        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"@earlyclobber $AT">;
-    def PM#NAME#PP :
-      MMIRR_XX3Form_XY4P8_XAB6<
-        opcode, xo, (outs acc:$AT),
-        !con((ins acc:$ATi),
-             !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
-        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  }
-}
-
-// Defines 4 instructions, masked/unmasked with masks 4, 4, 4 bits.
-// The XO field for acc/non-acc version is even/odd.
-multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
-                            string asmbase, string asmstr> {
-  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
-  let Predicates = [MMA, PrefixInstrs] in {
-    def PM#NAME :
-      MMIRR_XX3Form_XYP4_XAB6<
-        opcode, !or(xo, 0x01), (outs acc:$AT),
-        !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
-        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"@earlyclobber $AT">;
-    def PM#NAME#PP :
-      MMIRR_XX3Form_XYP4_XAB6<
-        opcode, xo, (outs acc:$AT),
-        !con((ins acc:$ATi),
-             !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
-        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  }
-}
-
-// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits.
-// The XO field for acc/non-acc version is even/odd.
-multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
-                            string asmbase, string asmstr> {
-  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
-  let Predicates = [MMA, PrefixInstrs] in {
-    def PM#NAME :
-      MMIRR_XX3Form_XY4P2_XAB6<
-        opcode, !or(xo, 0x01), (outs acc:$AT),
-        !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
-        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"@earlyclobber $AT">;
-    def PM#NAME#PP :
-      MMIRR_XX3Form_XY4P2_XAB6<
-        opcode, xo, (outs acc:$AT),
-        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-        IIC_VecFP, []>,
-      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  }
-}
-
-// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits.
-// Upper nibble of the XO field is 0x4 for the non-accumulating version and
-// 0x6 for the accumulating version.
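-// For example, the one use below, defm XVI16GER2 : ACC_UM_M244_XO46<59, 75,
-// ...>, expands to XVI16GER2 (xo = 75 = 0x4B), XVI16GER2PP (xo|0x20 = 0x6B),
-// and the prefixed PMXVI16GER2/PMXVI16GER2PP, which take the extra 4-, 4- and
-// 2-bit $XMSK/$YMSK/$PMSK mask operands.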
-multiclass ACC_UM_M244_XO46 opcode, bits<8> xo, dag IOL, string asmbase, - string asmstr> { - let Predicates = [MMA] in { - def NAME : - XX3Form_AT3_XAB6, - RegConstraint<"@earlyclobber $AT">; - def PP : - XX3Form_AT3_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL), - !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } - let Predicates = [MMA, PrefixInstrs] in { - def PM#NAME : - MMIRR_XX3Form_XY4P2_XAB6< - opcode, xo, (outs acc:$AT), - !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), - !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), - !con((ins acc:$ATi), - !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } -} - -// Defines 10 instructions, operand negating, unmasked, masked with 2, 4, 4 -// bits. Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. -multiclass ACC_NEG_UM_M244_XOM84C opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { - defm NAME : ACC_UM_M244_XOEO; - let Predicates = [MMA] in { - def PN : XX3Form_AT3_XAB6< - opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL), - !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NP : XX3Form_AT3_XAB6< - opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL), - !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NN : XX3Form_AT3_XAB6< - opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL), - !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } - let Predicates = [MMA, PrefixInstrs] in { - def PM#NAME#PN : - MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x80), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NP : - MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x40), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NN : - MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0xC0), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } -} - -// Defines 5 instructions, unmasked, operand negating. -// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. -multiclass ACC_NEG_UM_XOM84C opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { - defm NAME : ACC_UM_XOEO; - let Predicates = [MMA] in { - def PN : XX3Form_AT3_XAB6, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NP : XX3Form_AT3_XAB6, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NN : XX3Form_AT3_XAB6, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } -} - -// Defines 10 instructions, operand negating, unmasked, masked with 4, 4 bits. -// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. 
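-// For example, defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, ...> below
-// expands to XVF32GER (xo|0x01 = 0x1B), XVF32GERPP (xo = 0x1A), XVF32GERPN
-// (xo|0x80 = 0x9A), XVF32GERNP (xo|0x40 = 0x5A) and XVF32GERNN (xo|0xC0 =
-// 0xDA), plus the five pm* forms with the extra 4-bit $XMSK/$YMSK operands.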
-multiclass ACC_NEG_UM_M44_XOM84C opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { - defm NAME : ACC_NEG_UM_XOM84C; - let Predicates = [MMA, PrefixInstrs] in { - def PM#NAME : - MMIRR_XX3Form_XY4_XAB6< - opcode, !or(xo, 0x01), (outs acc:$AT), - !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)), - !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_XY4_XAB6< - opcode, xo, (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#PN : - MMIRR_XX3Form_XY4_XAB6< - opcode, !or(xo, 0x80), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), - !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NP : - MMIRR_XX3Form_XY4_XAB6< - opcode, !or(xo, 0x40), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), - !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NN : - MMIRR_XX3Form_XY4_XAB6< - opcode, !or(xo, 0xC0), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), - !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } -} - -// Defines 10 instructions, operand negating, unmasked, masked with 4, 2 bits. -// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. -multiclass ACC_NEG_UM_M42_XOM84C opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { - defm NAME : ACC_NEG_UM_XOM84C; - let Predicates = [MMA, PrefixInstrs] in { - def PM#NAME : - MMIRR_XX3Form_X4Y2_XAB6< - opcode, !or(xo, 0x01), (outs acc:$AT), - !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)), - !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_X4Y2_XAB6< - opcode, xo, (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#PN : - MMIRR_XX3Form_X4Y2_XAB6< - opcode, !or(xo, 0x80), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), - !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NP : - MMIRR_XX3Form_X4Y2_XAB6< - opcode, !or(xo, 0x40), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), - !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NN : - MMIRR_XX3Form_X4Y2_XAB6< - opcode, !or(xo, 0xC0), (outs acc:$AT), - !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), - !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - } -} - -// End of class definitions. 
-//----------------------------------------------------------------------------- - -let Predicates = [MMA] in { - def XXMFACC : - XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS", - IIC_VecGeneral, - [(set v512i1:$ASo, (int_ppc_mma_xxmfacc v512i1:$AS))]>, - RegConstraint<"$ASo = $AS">, NoEncode<"$ASo">; - def XXMTACC : - XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT", - IIC_VecGeneral, - [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp), - "#KILL_PAIR", []>, - RegConstraint<"$XTp = $XSp">; - def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS), - "#BUILD_UACC $AT, $AS", []>; - // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in - // the backend. We avoid CSE here because it generates a copy of the acc - // register and this copy is more expensive than calling the intrinsic again. - let isAsCheapAsAMove = 1, isReMaterializable = 1 in { - def XXSETACCZ : - XForm_AT3<31, 3, 177, (outs acc:$AT), (ins), "xxsetaccz $AT", IIC_VecGeneral, - [(set v512i1:$AT, (int_ppc_mma_xxsetaccz))]>; - } - def XVI8GER4SPP : - XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB), - "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - let mayStore = 1 in { - def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst), - "#SPILL_ACC", []>; - def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst), - "#SPILL_UACC", []>; - } - let mayLoad = 1, hasSideEffects = 0 in { - def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src), - "#RESTORE_ACC", []>; - def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src), - "#RESTORE_UACC", []>; - } -} - -let Predicates = [MMA, PrefixInstrs] in { - def PMXVI8GER4SPP : - MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT), - (ins acc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK, - u4imm:$YMSK, u4imm:$PMSK), - "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", - IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; -} - -// MMA accumulating/non-accumulating instructions. 
-//------------------------------------------------------------------------------
-
-// XVBF16GER2, XVBF16GER2PP, XVBF16GER2PN, XVBF16GER2NP, XVBF16GER2NN
-// PMXVBF16GER2, PMXVBF16GER2PP, PMXVBF16GER2PN, PMXVBF16GER2NP, PMXVBF16GER2NN
-defm XVBF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 50, (ins vsrc:$XA, vsrc:$XB),
-                                         "xvbf16ger2", "$AT, $XA, $XB">;
-
-// XVI4GER8, XVI4GER8PP, PMXVI4GER8, PMXVI4GER8PP
-defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB),
-                                 "xvi4ger8", "$AT, $XA, $XB">;
-
-// XVI8GER4, XVI8GER4PP, PMXVI8GER4, PMXVI8GER4PP
-defm XVI8GER4 : ACC_UM_M444_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB),
-                                 "xvi8ger4", "$AT, $XA, $XB">;
-
-// XVI16GER2, XVI16GER2PP, PMXVI16GER2, PMXVI16GER2PP
-defm XVI16GER2 : ACC_UM_M244_XO46<59, 75, (ins vsrc:$XA, vsrc:$XB),
-                                  "xvi16ger2", "$AT, $XA, $XB">;
-
-// XVI16GER2S, XVI16GER2SPP, PMXVI16GER2S, PMXVI16GER2SPP
-defm XVI16GER2S : ACC_UM_M244_XOEO<59, 42, (ins vsrc:$XA, vsrc:$XB),
-                                   "xvi16ger2s", "$AT, $XA, $XB">;
-
-// XVF16GER2, XVF16GER2PP, XVF16GER2PN, XVF16GER2NP, XVF16GER2NN
-// PMXVF16GER2, PMXVF16GER2PP, PMXVF16GER2PN, PMXVF16GER2NP, PMXVF16GER2NN
-defm XVF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 18, (ins vsrc:$XA, vsrc:$XB),
-                                        "xvf16ger2", "$AT, $XA, $XB">;
-
-// XVF32GER, XVF32GERPP, XVF32GERPN, XVF32GERNP, XVF32GERNN
-// PMXVF32GER, PMXVF32GERPP, PMXVF32GERPN, PMXVF32GERNP, PMXVF32GERNN
-defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, (ins vsrc:$XA, vsrc:$XB),
-                                      "xvf32ger", "$AT, $XA, $XB">;
-
-// XVF64GER, XVF64GERPP, XVF64GERPN, XVF64GERNP, XVF64GERNN
-// PMXVF64GER, PMXVF64GERPP, PMXVF64GERPN, PMXVF64GERNP, PMXVF64GERNN
-defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB),
-                                      "xvf64ger", "$AT, $XA, $XB">;
-//------------------------------------------------------------------------------
-
-// MMA Intrinsics
-let Predicates = [MMA] in {
-  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)),
-            (XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)),
-            (XVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)),
-            (XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)),
-            (XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)),
-            (XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
-  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
-            (XVF32GERPN
$ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)), - (XVF64GER $XA, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERPP $ATi, $XA, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERPN $ATi, $XA, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNN $ATi, $XA, RCCp.BToVSRC)>; - - def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)), - (XVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)), - (XVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; - def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)), - (XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>; -} - -// MMA Intrinsics -let Predicates = [MMA, PrefixInstrs] in { - def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk8Imm:$PMSK)), - (PMXVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk8Imm:$PMSK)), - (PMXVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk8Imm:$PMSK)>; - - def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)), - (PMXVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk4Imm:$PMSK)), - (PMXVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; - - def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)), - (PMXVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)), - (PMXVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 
(int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk2Imm:$PMSK)),
-            (PMXVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
-            (PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
-            (PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
-            (PMXVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
-                                            Msk4Imm:$YMSK)),
-            (PMXVF32GER RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                        Msk4Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
-            (PMXVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk4Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
-            (PMXVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk4Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
-            (PMXVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk4Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
-            (PMXVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk4Imm:$YMSK)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK,
-                                            Msk2Imm:$YMSK)),
-            (PMXVF64GER $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
-            (PMXVF64GERPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk2Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
-            (PMXVF64GERPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk2Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
-            (PMXVF64GERNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk2Imm:$YMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
-                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
-            (PMXVF64GERNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk2Imm:$YMSK)>;
-
-  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
-                                              Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
-            (PMXVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp
v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)), - (PMXVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB, - Msk4Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), - (PMXVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; -} - -def Concats { - dag VecsToVecPair0 = - (v256i1 (INSERT_SUBREG - (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1), - $vs1, sub_vsx0)); - dag VecsToVecPair1 = - (v256i1 (INSERT_SUBREG - (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1), - $vs3, sub_vsx0)); - dag VecsToVecQuad = - (BUILD_UACC (INSERT_SUBREG - (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)), - (KILL_PAIR VecsToVecPair0), sub_pair0), - (KILL_PAIR VecsToVecPair1), sub_pair1)); -} - -def Extracts { - dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0)); - dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1)); - dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0)); - dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1)); - dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0)); - dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1)); -} - -let Predicates = [MMA] in { - def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)), - (XXMTACC Concats.VecsToVecQuad)>; - def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0, - v16i8:$vs3, v16i8:$vs2)), - (XXMTACC Concats.VecsToVecQuad)>; - def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 0)), - Extracts.Vec0>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 1)), - Extracts.Vec1>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 2)), - Extracts.Vec2>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 3)), - Extracts.Vec3>; -} - -let Predicates = [PairedVectorMemops] in { - def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)), - Concats.VecsToVecPair0>; - def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)), - Concats.VecsToVecPair0>; - def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 0)), - (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>; - def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 1)), - (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>; -} - -let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in { - def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp), - (ins 
memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA", - IIC_LdStLFD, []>; - def LXVPX : XForm_XTp5_XAB5<31, 333, (outs vsrprc:$XTp), (ins memrr:$src), - "lxvpx $XTp, $src", IIC_LdStLFD, - []>; -} - -let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in { - def STXVP : DQForm_XTp5_RA17_MEM<6, 1, (outs), (ins vsrprc:$XTp, - memrix16:$DQ_RA), "stxvp $XTp, $DQ_RA", - IIC_LdStLFD, []>; - def STXVPX : XForm_XTp5_XAB5<31, 461, (outs), (ins vsrprc:$XTp, memrr:$dst), - "stxvpx $XTp, $dst", IIC_LdStLFD, - []>; -} - -let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in { - defm PLXVP : - 8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins memri34:$D_RA), - (ins memri34_pcrel:$D_RA), "plxvp $XTp, $D_RA", - IIC_LdStLFD>; -} - -let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in { - defm PSTXVP : - 8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, memri34:$D_RA), - (ins vsrprc:$XTp, memri34_pcrel:$D_RA), - "pstxvp $XTp, $D_RA", IIC_LdStLFD>; -} - -let Predicates = [PairedVectorMemops] in { - // Intrinsics for Paired Vector Loads. - def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>; - def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>; - let Predicates = [PairedVectorMemops, PrefixInstrs] in { - def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>; - } - // Intrinsics for Paired Vector Stores. - def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, DQForm:$dst), - (STXVP $XSp, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst), - (STXVPX $XSp, XForm:$dst)>; - let Predicates = [PairedVectorMemops, PrefixInstrs] in { - def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst), - (PSTXVP $XSp, memri34:$dst)>; - } -} - -let Predicates = [PCRelativeMemops] in { - // Load i32 - def : Pat<(i32 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZpc $ga, 0)>; - def : Pat<(i32 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZpc $ga, 0)>; - def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZpc $ga, 0)>; - def : Pat<(i32 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZpc $ga, 0)>; - def : Pat<(i32 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHApc $ga, 0)>; - def : Pat<(i32 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHZpc $ga, 0)>; - def : Pat<(i32 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHZpc $ga, 0)>; - def : Pat<(i32 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLWZpc $ga, 0)>; - - // Store i32 - def : Pat<(truncstorei8 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTBpc $RS, $ga, 0)>; - def : Pat<(truncstorei16 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTHpc $RS, $ga, 0)>; - def : Pat<(store i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTWpc $RS, $ga, 0)>; - - // Load i64 - def : Pat<(i64 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZ8pc $ga, 0)>; - def : Pat<(i64 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZ8pc $ga, 0)>; - def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZ8pc $ga, 0)>; - def : Pat<(i64 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))), - (PLBZ8pc $ga, 0)>; - def : Pat<(i64 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHA8pc $ga, 0)>; - def : Pat<(i64 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHZ8pc $ga, 0)>; - def : Pat<(i64 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))), - (PLHZ8pc $ga, 0)>; - def : Pat<(i64 (zextloadi32 (PPCmatpcreladdr PCRelForm:$ga))), - (PLWZ8pc $ga, 0)>; - def : Pat<(i64 (sextloadi32 (PPCmatpcreladdr PCRelForm:$ga))), - (PLWA8pc 
$ga, 0)>; - def : Pat<(i64 (extloadi32 (PPCmatpcreladdr PCRelForm:$ga))), - (PLWZ8pc $ga, 0)>; - def : Pat<(i64 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLDpc $ga, 0)>; - - // Store i64 - def : Pat<(truncstorei8 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTB8pc $RS, $ga, 0)>; - def : Pat<(truncstorei16 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTH8pc $RS, $ga, 0)>; - def : Pat<(truncstorei32 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTW8pc $RS, $ga, 0)>; - def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTDpc $RS, $ga, 0)>; - - // Load f32 - def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>; - - // Store f32 - def : Pat<(store f32:$FRS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTFSpc $FRS, $ga, 0)>; - - // Load f64 - def : Pat<(f64 (extloadf32 (PPCmatpcreladdr PCRelForm:$addr))), - (COPY_TO_REGCLASS (PLFSpc $addr, 0), VSFRC)>; - def : Pat<(f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFDpc $addr, 0)>; - - // Store f64 - def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTFDpc $FRS, $ga, 0)>; - - // Load f128 - def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))), - (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>; - - // Store f128 - def : Pat<(store f128:$XS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTXVpc (COPY_TO_REGCLASS $XS, VSRC), $ga, 0)>; - - // Load v4i32 - def : Pat<(v4i32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; - - // Store v4i32 - def : Pat<(store v4i32:$XS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTXVpc $XS, $ga, 0)>; - - // Load v2i64 - def : Pat<(v2i64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; - - // Store v2i64 - def : Pat<(store v2i64:$XS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTXVpc $XS, $ga, 0)>; - - // Load v4f32 - def : Pat<(v4f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; - - // Store v4f32 - def : Pat<(store v4f32:$XS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTXVpc $XS, $ga, 0)>; - - // Load v2f64 - def : Pat<(v2f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>; - - // Store v2f64 - def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)), - (PSTXVpc $XS, $ga, 0)>; - - // Atomic Load - def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)), - (PLBZpc $ga, 0)>; - def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)), - (PLHZpc $ga, 0)>; - def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)), - (PLWZpc $ga, 0)>; - def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)), - (PLDpc $ga, 0)>; - - // Atomic Store - def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), - (PSTBpc $RS, $ga, 0)>; - def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), - (PSTHpc $RS, $ga, 0)>; - def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS), - (PSTWpc $RS, $ga, 0)>; - def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), - (PSTB8pc $RS, $ga, 0)>; - def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), - (PSTH8pc $RS, $ga, 0)>; - def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), - (PSTW8pc $RS, $ga, 0)>; - def : Pat<(atomic_store_64 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS), - (PSTDpc $RS, $ga, 0)>; - - // Special Cases For PPCstore_scal_int_from_vsr - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), - (PPCmatpcreladdr PCRelForm:$dst), 8), - (PSTXSDpc (XSCVDPSXDS f64:$src), $dst, 0)>; - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), - (PPCmatpcreladdr 
PCRelForm:$dst), 8), - (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), $dst, 0)>; - - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), - (PPCmatpcreladdr PCRelForm:$dst), 8), - (PSTXSDpc (XSCVDPUXDS f64:$src), $dst, 0)>; - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), - (PPCmatpcreladdr PCRelForm:$dst), 8), - (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), $dst, 0)>; - - def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))), - (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>; - - // If the PPCmatpcreladdr node is not caught by any other pattern it should be - // caught here and turned into a paddi instruction to materialize the address. - def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>; - // PPCtlsdynamatpcreladdr node is used for TLS dynamic models to materialize - // tls global address with paddi instruction. - def : Pat<(PPCtlsdynamatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>; - // PPCtlslocalexecmataddr node is used for TLS local exec models to - // materialize tls global address with paddi instruction. - def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)), - (PADDI8 $in, $addr)>; -} - -let Predicates = [PrefixInstrs] in { - def XXPERMX : - 8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC, u3imm:$UIM), - "xxpermx $XT, $XA, $XB, $XC, $UIM", - IIC_VecPerm, []>; - def XXBLENDVB : - 8RR_XX4Form_XTABC6<33, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC), "xxblendvb $XT, $XA, $XB, $XC", - IIC_VecGeneral, []>; - def XXBLENDVH : - 8RR_XX4Form_XTABC6<33, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC), "xxblendvh $XT, $XA, $XB, $XC", - IIC_VecGeneral, []>; - def XXBLENDVW : - 8RR_XX4Form_XTABC6<33, 2, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC), "xxblendvw $XT, $XA, $XB, $XC", - IIC_VecGeneral, []>; - def XXBLENDVD : - 8RR_XX4Form_XTABC6<33, 3, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC), "xxblendvd $XT, $XA, $XB, $XC", - IIC_VecGeneral, []>; -} - -// XXSPLTIW/DP/32DX need extra flags to make sure the compiler does not attempt -// to spill part of the instruction when the values are similar. 
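-// For reference (ISA 3.1): xxspltiw splats the 32-bit immediate into all four
-// word elements of VSR[XT], while xxspltidp interprets it as a single-precision
-// value, converts it to double precision, and splats the result into both
-// doubleword elements (hence the v2f64 PPCxxspltidp pattern below).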
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [PrefixInstrs] in { - def XXSPLTIW : 8RR_DForm_IMM32_XT6<32, 3, (outs vsrc:$XT), - (ins i32imm:$IMM32), - "xxspltiw $XT, $IMM32", IIC_VecGeneral, - []>; - def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT), - (ins i32imm:$IMM32), - "xxspltidp $XT, $IMM32", IIC_VecGeneral, - [(set v2f64:$XT, - (PPCxxspltidp i32:$IMM32))]>; - def XXSPLTI32DX : - 8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT), - (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32), - "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, - [(set v2i64:$XT, - (PPCxxsplti32dx v2i64:$XTi, i32:$IX, - i32:$IMM32))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; -} - -let Predicates = [IsISA3_1] in { - def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI), - "setbc $RT, $BI", IIC_IntCompare, []>; - def SETBCR : XForm_XT5_BI5<31, 416, (outs gprc:$RT), (ins crbitrc:$BI), - "setbcr $RT, $BI", IIC_IntCompare, []>; - def SETNBC : XForm_XT5_BI5<31, 448, (outs gprc:$RT), (ins crbitrc:$BI), - "setnbc $RT, $BI", IIC_IntCompare, []>; - def SETNBCR : XForm_XT5_BI5<31, 480, (outs gprc:$RT), (ins crbitrc:$BI), - "setnbcr $RT, $BI", IIC_IntCompare, []>; - - let Interpretation64Bit = 1, isCodeGenOnly = 1 in { - def SETBC8 : XForm_XT5_BI5<31, 384, (outs g8rc:$RT), (ins crbitrc:$BI), - "setbc $RT, $BI", IIC_IntCompare, []>; - def SETBCR8 : XForm_XT5_BI5<31, 416, (outs g8rc:$RT), (ins crbitrc:$BI), - "setbcr $RT, $BI", IIC_IntCompare, []>; - def SETNBC8 : XForm_XT5_BI5<31, 448, (outs g8rc:$RT), (ins crbitrc:$BI), - "setnbc $RT, $BI", IIC_IntCompare, []>; - def SETNBCR8 : XForm_XT5_BI5<31, 480, (outs g8rc:$RT), (ins crbitrc:$BI), - "setnbcr $RT, $BI", IIC_IntCompare, []>; - } - - def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT), - (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), - "vsldbi $VRT, $VRA, $VRB, $SH", - IIC_VecGeneral, - [(set v16i8:$VRT, - (int_ppc_altivec_vsldbi v16i8:$VRA, - v16i8:$VRB, - timm:$SH))]>; - def VSRDBI : VNForm_VTAB5_SD3<22, 1, (outs vrrc:$VRT), - (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), - "vsrdbi $VRT, $VRA, $VRB, $SH", - IIC_VecGeneral, - [(set v16i8:$VRT, - (int_ppc_altivec_vsrdbi v16i8:$VRA, - v16i8:$VRB, - timm:$SH))]>; - defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB), - "vstribr", "$vT, $vB", IIC_VecGeneral, - [(set v16i8:$vT, - (int_ppc_altivec_vstribr v16i8:$vB))]>; - defm VSTRIBL : VXForm_VTB5_RCr<13, 0, (outs vrrc:$vT), (ins vrrc:$vB), - "vstribl", "$vT, $vB", IIC_VecGeneral, - [(set v16i8:$vT, - (int_ppc_altivec_vstribl v16i8:$vB))]>; - defm VSTRIHR : VXForm_VTB5_RCr<13, 3, (outs vrrc:$vT), (ins vrrc:$vB), - "vstrihr", "$vT, $vB", IIC_VecGeneral, - [(set v8i16:$vT, - (int_ppc_altivec_vstrihr v8i16:$vB))]>; - defm VSTRIHL : VXForm_VTB5_RCr<13, 2, (outs vrrc:$vT), (ins vrrc:$vB), - "vstrihl", "$vT, $vB", IIC_VecGeneral, - [(set v8i16:$vT, - (int_ppc_altivec_vstrihl v8i16:$vB))]>; - def VINSW : - VXForm_1<207, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, gprc:$rB), - "vinsw $vD, $rB, $UIM", IIC_VecGeneral, - [(set v4i32:$vD, - (int_ppc_altivec_vinsw v4i32:$vDi, i32:$rB, timm:$UIM))]>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - def VINSD : - VXForm_1<463, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB), - "vinsd $vD, $rB, $UIM", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB, timm:$UIM))]>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - def VINSBVLX : - VXForm_VTB5_RA5_ins<15, "vinsbvlx", - [(set v16i8:$vD, - (int_ppc_altivec_vinsbvlx v16i8:$vDi, i32:$rA, - 
v16i8:$vB))]>; - def VINSBVRX : - VXForm_VTB5_RA5_ins<271, "vinsbvrx", - [(set v16i8:$vD, - (int_ppc_altivec_vinsbvrx v16i8:$vDi, i32:$rA, - v16i8:$vB))]>; - def VINSHVLX : - VXForm_VTB5_RA5_ins<79, "vinshvlx", - [(set v8i16:$vD, - (int_ppc_altivec_vinshvlx v8i16:$vDi, i32:$rA, - v8i16:$vB))]>; - def VINSHVRX : - VXForm_VTB5_RA5_ins<335, "vinshvrx", - [(set v8i16:$vD, - (int_ppc_altivec_vinshvrx v8i16:$vDi, i32:$rA, - v8i16:$vB))]>; - def VINSWVLX : - VXForm_VTB5_RA5_ins<143, "vinswvlx", - [(set v4i32:$vD, - (int_ppc_altivec_vinswvlx v4i32:$vDi, i32:$rA, - v4i32:$vB))]>; - def VINSWVRX : - VXForm_VTB5_RA5_ins<399, "vinswvrx", - [(set v4i32:$vD, - (int_ppc_altivec_vinswvrx v4i32:$vDi, i32:$rA, - v4i32:$vB))]>; - def VINSBLX : - VXForm_VRT5_RAB5_ins<527, "vinsblx", - [(set v16i8:$vD, - (int_ppc_altivec_vinsblx v16i8:$vDi, i32:$rA, - i32:$rB))]>; - def VINSBRX : - VXForm_VRT5_RAB5_ins<783, "vinsbrx", - [(set v16i8:$vD, - (int_ppc_altivec_vinsbrx v16i8:$vDi, i32:$rA, - i32:$rB))]>; - def VINSHLX : - VXForm_VRT5_RAB5_ins<591, "vinshlx", - [(set v8i16:$vD, - (int_ppc_altivec_vinshlx v8i16:$vDi, i32:$rA, - i32:$rB))]>; - def VINSHRX : - VXForm_VRT5_RAB5_ins<847, "vinshrx", - [(set v8i16:$vD, - (int_ppc_altivec_vinshrx v8i16:$vDi, i32:$rA, - i32:$rB))]>; - def VINSWLX : - VXForm_VRT5_RAB5_ins<655, "vinswlx", - [(set v4i32:$vD, - (int_ppc_altivec_vinswlx v4i32:$vDi, i32:$rA, - i32:$rB))]>; - def VINSWRX : - VXForm_VRT5_RAB5_ins<911, "vinswrx", - [(set v4i32:$vD, - (int_ppc_altivec_vinswrx v4i32:$vDi, i32:$rA, - i32:$rB))]>; - def VINSDLX : - VXForm_1<719, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), - "vinsdlx $vD, $rA, $rB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA, i64:$rB))]>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - def VINSDRX : - VXForm_1<975, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), - "vinsdrx $vD, $rA, $rB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$rD), (ins vrrc:$vB), - "vextractbm $rD, $vB", IIC_VecGeneral, - [(set i32:$rD, - (int_ppc_altivec_vextractbm v16i8:$vB))]>; - def VEXTRACTHM : VXForm_RD5_XO5_RS5<1602, 9, (outs gprc:$rD), (ins vrrc:$vB), - "vextracthm $rD, $vB", IIC_VecGeneral, - [(set i32:$rD, - (int_ppc_altivec_vextracthm v8i16:$vB))]>; - def VEXTRACTWM : VXForm_RD5_XO5_RS5<1602, 10, (outs gprc:$rD), (ins vrrc:$vB), - "vextractwm $rD, $vB", IIC_VecGeneral, - [(set i32:$rD, - (int_ppc_altivec_vextractwm v4i32:$vB))]>; - def VEXTRACTDM : VXForm_RD5_XO5_RS5<1602, 11, (outs gprc:$rD), (ins vrrc:$vB), - "vextractdm $rD, $vB", IIC_VecGeneral, - [(set i32:$rD, - (int_ppc_altivec_vextractdm v2i64:$vB))]>; - def VEXTRACTQM : VXForm_RD5_XO5_RS5<1602, 12, (outs gprc:$rD), (ins vrrc:$vB), - "vextractqm $rD, $vB", IIC_VecGeneral, - [(set i32:$rD, - (int_ppc_altivec_vextractqm v1i128:$vB))]>; - def VEXPANDBM : VXForm_RD5_XO5_RS5<1602, 0, (outs vrrc:$vD), (ins vrrc:$vB), - "vexpandbm $vD, $vB", IIC_VecGeneral, - [(set v16i8:$vD, (int_ppc_altivec_vexpandbm - v16i8:$vB))]>; - def VEXPANDHM : VXForm_RD5_XO5_RS5<1602, 1, (outs vrrc:$vD), (ins vrrc:$vB), - "vexpandhm $vD, $vB", IIC_VecGeneral, - [(set v8i16:$vD, (int_ppc_altivec_vexpandhm - v8i16:$vB))]>; - def VEXPANDWM : VXForm_RD5_XO5_RS5<1602, 2, (outs vrrc:$vD), (ins vrrc:$vB), - "vexpandwm $vD, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (int_ppc_altivec_vexpandwm - v4i32:$vB))]>; - def VEXPANDDM : 
VXForm_RD5_XO5_RS5<1602, 3, (outs vrrc:$vD), (ins vrrc:$vB), - "vexpanddm $vD, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (int_ppc_altivec_vexpanddm - v2i64:$vB))]>; - def VEXPANDQM : VXForm_RD5_XO5_RS5<1602, 4, (outs vrrc:$vD), (ins vrrc:$vB), - "vexpandqm $vD, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vexpandqm - v1i128:$vB))]>; - def MTVSRBM : VXForm_RD5_XO5_RS5<1602, 16, (outs vrrc:$vD), (ins g8rc:$rB), - "mtvsrbm $vD, $rB", IIC_VecGeneral, - [(set v16i8:$vD, - (int_ppc_altivec_mtvsrbm i64:$rB))]>; - def MTVSRHM : VXForm_RD5_XO5_RS5<1602, 17, (outs vrrc:$vD), (ins g8rc:$rB), - "mtvsrhm $vD, $rB", IIC_VecGeneral, - [(set v8i16:$vD, - (int_ppc_altivec_mtvsrhm i64:$rB))]>; - def MTVSRWM : VXForm_RD5_XO5_RS5<1602, 18, (outs vrrc:$vD), (ins g8rc:$rB), - "mtvsrwm $vD, $rB", IIC_VecGeneral, - [(set v4i32:$vD, - (int_ppc_altivec_mtvsrwm i64:$rB))]>; - def MTVSRDM : VXForm_RD5_XO5_RS5<1602, 19, (outs vrrc:$vD), (ins g8rc:$rB), - "mtvsrdm $vD, $rB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_mtvsrdm i64:$rB))]>; - def MTVSRQM : VXForm_RD5_XO5_RS5<1602, 20, (outs vrrc:$vD), (ins g8rc:$rB), - "mtvsrqm $vD, $rB", IIC_VecGeneral, - [(set v1i128:$vD, - (int_ppc_altivec_mtvsrqm i64:$rB))]>; - def MTVSRBMI : DXForm<4, 10, (outs vrrc:$vD), (ins u16imm64:$D), - "mtvsrbmi $vD, $D", IIC_VecGeneral, - [(set v16i8:$vD, - (int_ppc_altivec_mtvsrbm imm:$D))]>; - def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD), - (ins vrrc:$vB, u1imm:$MP), - "vcntmbb $rD, $vB, $MP", IIC_VecGeneral, - [(set i64:$rD, (int_ppc_altivec_vcntmbb - v16i8:$vB, timm:$MP))]>; - def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD), - (ins vrrc:$vB, u1imm:$MP), - "vcntmbh $rD, $vB, $MP", IIC_VecGeneral, - [(set i64:$rD, (int_ppc_altivec_vcntmbh - v8i16:$vB, timm:$MP))]>; - def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD), - (ins vrrc:$vB, u1imm:$MP), - "vcntmbw $rD, $vB, $MP", IIC_VecGeneral, - [(set i64:$rD, (int_ppc_altivec_vcntmbw - v4i32:$vB, timm:$MP))]>; - def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD), - (ins vrrc:$vB, u1imm:$MP), - "vcntmbd $rD, $vB, $MP", IIC_VecGeneral, - [(set i64:$rD, (int_ppc_altivec_vcntmbd - v2i64:$vB, timm:$MP))]>; - def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextdubvlx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextdubvlx v16i8:$vA, - v16i8:$vB, - i32:$rC))]>; - def VEXTDUBVRX : VAForm_1a<25, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextdubvrx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextdubvrx v16i8:$vA, - v16i8:$vB, - i32:$rC))]>; - def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextduhvlx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextduhvlx v8i16:$vA, - v8i16:$vB, - i32:$rC))]>; - def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextduhvrx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextduhvrx v8i16:$vA, - v8i16:$vB, - i32:$rC))]>; - def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextduwvlx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextduwvlx v4i32:$vA, - v4i32:$vB, - i32:$rC))]>; - def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextduwvrx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextduwvrx v4i32:$vA, - v4i32:$vB, 
- i32:$rC))]>; - def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextddvlx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextddvlx v2i64:$vA, - v2i64:$vB, - i32:$rC))]>; - def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, gprc:$rC), - "vextddvrx $vD, $vA, $vB, $rC", - IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vextddvrx v2i64:$vA, - v2i64:$vB, - i32:$rC))]>; - def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vpdepd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vpdepd v2i64:$vA, v2i64:$vB))]>; - def VPEXTD : VXForm_1<1421, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vpextd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vpextd v2i64:$vA, v2i64:$vB))]>; - def PDEPD : XForm_6<31, 156, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), - "pdepd $rA, $rS, $rB", IIC_IntGeneral, - [(set i64:$rA, (int_ppc_pdepd i64:$rS, i64:$rB))]>; - def PEXTD : XForm_6<31, 188, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), - "pextd $rA, $rS, $rB", IIC_IntGeneral, - [(set i64:$rA, (int_ppc_pextd i64:$rS, i64:$rB))]>; - def VCFUGED : VXForm_1<1357, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vcfuged $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vcfuged v2i64:$vA, v2i64:$vB))]>; - def VGNB : VXForm_RD5_N3_VB5<1228, (outs g8rc:$rD), (ins vrrc:$vB, u3imm:$N), - "vgnb $rD, $vB, $N", IIC_VecGeneral, - [(set i64:$rD, - (int_ppc_altivec_vgnb v1i128:$vB, timm:$N))]>; - def CFUGED : XForm_6<31, 220, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), - "cfuged $rA, $rS, $rB", IIC_IntGeneral, - [(set i64:$rA, (int_ppc_cfuged i64:$rS, i64:$rB))]>; - def XXEVAL : - 8RR_XX4Form_IMM8_XTAB6<34, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, - vsrc:$XC, u8imm:$IMM), - "xxeval $XT, $XA, $XB, $XC, $IMM", IIC_VecGeneral, - [(set v2i64:$XT, (int_ppc_vsx_xxeval v2i64:$XA, - v2i64:$XB, v2i64:$XC, timm:$IMM))]>; - def VCLZDM : VXForm_1<1924, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vclzdm $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vclzdm v2i64:$vA, v2i64:$vB))]>; - def VCTZDM : VXForm_1<1988, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vctzdm $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, - (int_ppc_altivec_vctzdm v2i64:$vA, v2i64:$vB))]>; - def CNTLZDM : XForm_6<31, 59, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), - "cntlzdm $rA, $rS, $rB", IIC_IntGeneral, - [(set i64:$rA, - (int_ppc_cntlzdm i64:$rS, i64:$rB))]>; - def CNTTZDM : XForm_6<31, 571, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), - "cnttzdm $rA, $rS, $rB", IIC_IntGeneral, - [(set i64:$rA, - (int_ppc_cnttzdm i64:$rS, i64:$rB))]>; - def XXGENPCVBM : - XForm_XT6_IMM5_VB5<60, 916, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), - "xxgenpcvbm $XT, $VRB, $IMM", IIC_VecGeneral, []>; - def XXGENPCVHM : - XForm_XT6_IMM5_VB5<60, 917, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), - "xxgenpcvhm $XT, $VRB, $IMM", IIC_VecGeneral, []>; - def XXGENPCVWM : - XForm_XT6_IMM5_VB5<60, 948, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), - "xxgenpcvwm $XT, $VRB, $IMM", IIC_VecGeneral, []>; - def XXGENPCVDM : - XForm_XT6_IMM5_VB5<60, 949, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM), - "xxgenpcvdm $XT, $VRB, $IMM", IIC_VecGeneral, []>; - def VCLRLB : VXForm_1<397, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB), - "vclrlb $vD, $vA, $rB", IIC_VecGeneral, - [(set v16i8:$vD, - (int_ppc_altivec_vclrlb v16i8:$vA, i32:$rB))]>; - def VCLRRB : VXForm_1<461, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB), - 
"vclrrb $vD, $vA, $rB", IIC_VecGeneral, - [(set v16i8:$vD, - (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>; - def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulld $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>; - def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>; - def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>; - def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>; - def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhud $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>; - def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmodsw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>; - def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmoduw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>; - def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmodsd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>; - def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmodud $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>; - def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivsw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>; - def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivuw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>; - def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivsd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>; - def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivud $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>; - def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivesw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (int_ppc_altivec_vdivesw v4i32:$vA, - v4i32:$vB))]>; - def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdiveuw $vD, $vA, $vB", IIC_VecGeneral, - [(set v4i32:$vD, (int_ppc_altivec_vdiveuw v4i32:$vA, - v4i32:$vB))]>; - def VDIVESD : VXForm_1<971, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivesd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (int_ppc_altivec_vdivesd v2i64:$vA, - v2i64:$vB))]>; - def VDIVEUD : VXForm_1<715, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdiveud $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (int_ppc_altivec_vdiveud v2i64:$vA, - v2i64:$vB))]>; - def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB), - "xvtlsbb $BF, $XB", IIC_VecGeneral, []>; - - // The XFormMemOp flag for the following 8 instructions is set on - // the instruction format. 
- let mayLoad = 1, mayStore = 0 in { - def LXVRBX : X_XT6_RA5_RB5<31, 13, "lxvrbx", vsrc, []>; - def LXVRHX : X_XT6_RA5_RB5<31, 45, "lxvrhx", vsrc, []>; - def LXVRWX : X_XT6_RA5_RB5<31, 77, "lxvrwx", vsrc, []>; - def LXVRDX : X_XT6_RA5_RB5<31, 109, "lxvrdx", vsrc, []>; - } - - let mayLoad = 0, mayStore = 1 in { - def STXVRBX : X_XS6_RA5_RB5<31, 141, "stxvrbx", vsrc, []>; - def STXVRHX : X_XS6_RA5_RB5<31, 173, "stxvrhx", vsrc, []>; - def STXVRWX : X_XS6_RA5_RB5<31, 205, "stxvrwx", vsrc, []>; - def STXVRDX : X_XS6_RA5_RB5<31, 237, "stxvrdx", vsrc, []>; - } - - def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulesd $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA, - v2i64:$vB))]>; - def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmuleud $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA, - v2i64:$vB))]>; - def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulosd $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA, - v2i64:$vB))]>; - def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmuloud $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA, - v2i64:$vB))]>; - def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), - "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vmsumcud - v2i64:$vA, v2i64:$vB, v1i128:$vC))]>; - def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivsq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (sdiv v1i128:$vA, v1i128:$vB))]>; - def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivuq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (udiv v1i128:$vA, v1i128:$vB))]>; - def VDIVESQ : VXForm_1<779, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivesq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vdivesq v1i128:$vA, - v1i128:$vB))]>; - def VDIVEUQ : VXForm_1<523, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdiveuq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vdiveuq v1i128:$vA, - v1i128:$vB))]>; - def VCMPEQUQ : VCMP <455, "vcmpequq $vD, $vA, $vB" , v1i128>; - def VCMPGTSQ : VCMP <903, "vcmpgtsq $vD, $vA, $vB" , v1i128>; - def VCMPGTUQ : VCMP <647, "vcmpgtuq $vD, $vA, $vB" , v1i128>; - def VCMPEQUQ_rec : VCMP_rec <455, "vcmpequq. $vD, $vA, $vB" , v1i128>; - def VCMPGTSQ_rec : VCMP_rec <903, "vcmpgtsq. $vD, $vA, $vB" , v1i128>; - def VCMPGTUQ_rec : VCMP_rec <647, "vcmpgtuq. 
$vD, $vA, $vB" , v1i128>; - def VMODSQ : VXForm_1<1803, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmodsq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (srem v1i128:$vA, v1i128:$vB))]>; - def VMODUQ : VXForm_1<1547, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmoduq $vD, $vA, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (urem v1i128:$vA, v1i128:$vB))]>; - def VEXTSD2Q : VXForm_RD5_XO5_RS5<1538, 27, (outs vrrc:$vD), (ins vrrc:$vB), - "vextsd2q $vD, $vB", IIC_VecGeneral, - [(set v1i128:$vD, (int_ppc_altivec_vextsd2q v2i64:$vB))]>; - def VCMPUQ : VXForm_BF3_VAB5<257, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), - "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>; - def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), - "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>; - def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", - [(set v1i128:$vD, - (int_ppc_altivec_vrlqnm v1i128:$vA, - v1i128:$vB))]>; - def VRLQMI : VXForm_1<69, (outs vrrc:$vD), - (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi), - "vrlqmi $vD, $vA, $vB", IIC_VecFP, - [(set v1i128:$vD, - (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB, - v1i128:$vDi))]>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>; - def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>; - def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>; - def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>; - def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>; - def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>; - def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>; - def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>; -} - -let Predicates = [IsISA3_1, HasVSX] in { - def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>; - def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>; - def XSMAXCQP : X_VT5_VA5_VB5<63, 676, "xsmaxcqp", []>; - def XSMINCQP : X_VT5_VA5_VB5<63, 740, "xsmincqp", []>; -} - -// Multiclass defining patterns for Set Boolean Extension Reverse Instructions. -// This is analogous to the CRNotPat multiclass but specifically for Power10 -// and newer subtargets since the extended forms use Set Boolean instructions. -// The first two anonymous patterns defined are actually a duplicate of those -// in CRNotPat, but it is preferable to define both multiclasses as complete -// ones rather than pulling that small common section out. 
-multiclass P10ReverseSetBool<dag pattern, dag result> {
-  def : Pat<pattern, (crnot result)>;
-  def : Pat<(not pattern), result>;
-
-  def : Pat<(i32 (zext pattern)),
-            (SETBCR result)>;
-  def : Pat<(i64 (zext pattern)),
-            (SETBCR8 result)>;
-
-  def : Pat<(i32 (sext pattern)),
-            (SETNBCR result)>;
-  def : Pat<(i64 (sext pattern)),
-            (SETNBCR8 result)>;
-
-  def : Pat<(i32 (anyext pattern)),
-            (SETBCR result)>;
-  def : Pat<(i64 (anyext pattern)),
-            (SETBCR8 result)>;
-}
-
-multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, ImmLeaf ZExtTy,
-                               ImmLeaf SExtTy, I Cmpi, I Cmpli, I Cmp,
-                               I Cmpl> {
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
-                           (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
-                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
-                           (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
-                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
-                           (EXTRACT_SUBREG (Cmp $s1, $s2), sub_eq)>;
-
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETUGE)),
-                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETGE)),
-                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETULE)),
-                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETLE)),
-                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETNE)),
-                           (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_eq)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETNE)),
-                           (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_eq)>;
-}
-
-multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, I FCmp> {
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
-  defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)),
-                           (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
-}
-
-let Predicates = [IsISA3_1] in {
-  def : Pat<(i32 (zext i1:$in)),
-            (SETBC $in)>;
-  def : Pat<(i64 (zext i1:$in)),
-            (SETBC8 $in)>;
-  def : Pat<(i32 (sext i1:$in)),
-            (SETNBC $in)>;
-  def : Pat<(i64 (sext i1:$in)),
-            (SETNBC8 $in)>;
-  def : Pat<(i32 (anyext i1:$in)),
-            (SETBC $in)>;
-  def : Pat<(i64 (anyext i1:$in)),
-            (SETBC8 $in)>;
-
-  // Instantiation of the set boolean reverse patterns for 32-bit integers.
-  defm : IntSetP10RevSetBool<setcc, i32, immZExt16, imm32SExt16,
-                             CMPWI, CMPLWI, CMPW, CMPLW>;
-  defm : P10ReverseSetBool<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
-                           (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
-                                                   (LO16 imm:$imm)), sub_eq)>;
-
-  // Instantiation of the set boolean reverse patterns for 64-bit integers.
-  defm : IntSetP10RevSetBool<setcc, i64, immZExt16, imm64SExt16,
-                             CMPDI, CMPLDI, CMPD, CMPLD>;
-  defm : P10ReverseSetBool<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
-                           (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
-                                                   (LO16 imm:$imm)), sub_eq)>;
-}
-
-// Instantiation of the set boolean reverse patterns for f32, f64, f128.
-let Predicates = [IsISA3_1, HasFPU] in {
-  defm : FSetP10RevSetBool<setcc, f32, FCMPUS>;
-  defm : FSetP10RevSetBool<setcc, f64, FCMPUD>;
-  defm : FSetP10RevSetBool<setcc, f128, XSCMPUQP>;
-}
-
-//---------------------------- Anonymous Patterns ----------------------------//
-let Predicates = [IsISA3_1] in {
-  // Exploit the vector multiply high instructions using intrinsics.
-  def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
-            (v4i32 (VMULHSW $vA, $vB))>;
-  def : Pat<(v4i32 (int_ppc_altivec_vmulhuw v4i32:$vA, v4i32:$vB)),
-            (v4i32 (VMULHUW $vA, $vB))>;
-  def : Pat<(v2i64 (int_ppc_altivec_vmulhsd v2i64:$vA, v2i64:$vB)),
-            (v2i64 (VMULHSD $vA, $vB))>;
-  def : Pat<(v2i64 (int_ppc_altivec_vmulhud v2i64:$vA, v2i64:$vB)),
-            (v2i64 (VMULHUD $vA, $vB))>;
-  def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)),
-            (v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>;
-  def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)),
-            (v8i16 (COPY_TO_REGCLASS (XXGENPCVHM $VRB, imm:$IMM), VRRC))>;
-  def : Pat<(v4i32 (int_ppc_vsx_xxgenpcvwm v4i32:$VRB, imm:$IMM)),
-            (v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>;
-  def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
-            (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
-  def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 1)),
-            (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_lt)>;
-  def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)),
-            (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>;
-
-  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 8)),
-             (v1i128 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VRRC))>;
-  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 16)),
-             (v1i128 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VRRC))>;
-  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 32)),
-             (v1i128 (COPY_TO_REGCLASS (LXVRWX ForceXForm:$src), VRRC))>;
-  def : Pat <(v1i128 (PPClxvrzx ForceXForm:$src, 64)),
-             (v1i128 (COPY_TO_REGCLASS (LXVRDX ForceXForm:$src), VRRC))>;
-
-  def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
-            (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
-
-  def : Pat <(v2i64 (PPCxxsplti32dx v2i64:$XT, i32:$XI, i32:$IMM32)),
-             (v2i64 (XXSPLTI32DX v2i64:$XT, i32:$XI, i32:$IMM32))>;
-}
-
-let Predicates = [IsISA3_1, HasVSX] in {
-  def : Pat<(v16i8 (int_ppc_vsx_xvcvspbf16 v16i8:$XA)),
-            (COPY_TO_REGCLASS (XVCVSPBF16 RCCp.AToVSRC), VRRC)>;
-  def : Pat<(v16i8 (int_ppc_vsx_xvcvbf16spn v16i8:$XA)),
-            (COPY_TO_REGCLASS (XVCVBF16SPN RCCp.AToVSRC), VRRC)>;
-}
-
-let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
-  // Store element 0 of a VSX register to memory
-  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), ForceXForm:$dst),
-            (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), ForceXForm:$dst)>;
-  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), ForceXForm:$dst),
-            (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), ForceXForm:$dst)>;
-  def : Pat<(store (i32 (extractelt v4i32:$src, 0)), ForceXForm:$dst),
-            (STXVRWX $src, ForceXForm:$dst)>;
-  def : Pat<(store (f32 (extractelt v4f32:$src, 0)), ForceXForm:$dst),
-            (STXVRWX $src, ForceXForm:$dst)>;
-  def : Pat<(store (i64 (extractelt v2i64:$src, 0)), ForceXForm:$dst),
-            (STXVRDX $src, ForceXForm:$dst)>;
-  def : Pat<(store (f64 (extractelt v2f64:$src, 0)), ForceXForm:$dst),
-            (STXVRDX $src, ForceXForm:$dst)>;
-  // Load element 0 of a VSX register from memory
-  def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 ForceXForm:$src)))),
-            (v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
-  def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8
ForceXForm:$src)))), - (v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>; - } - -// FIXME: The swap is overkill when the shift amount is a constant. -// We should just fix the constant in the DAG. -let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { - def : Pat<(v1i128 (shl v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSLQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; - def : Pat<(v1i128 (PPCshl v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSLQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; - def : Pat<(v1i128 (srl v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSRQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; - def : Pat<(v1i128 (PPCsrl v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSRQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; - def : Pat<(v1i128 (sra v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSRAQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; - def : Pat<(v1i128 (PPCsra v1i128:$VRA, v1i128:$VRB)), - (v1i128 (VSRAQ v1i128:$VRA, - (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), - (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; -} - -class xxevalPattern imm> : - Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} - -let AddedComplexity = 400, Predicates = [PrefixInstrs] in { - def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, - i32immNonAllOneNonZero:$A, - i32immNonAllOneNonZero:$A, - i32immNonAllOneNonZero:$A)), - (v4i32 (XXSPLTIW imm:$A))>; - def : Pat<(f32 nzFPImmAsi32:$A), - (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)), - VSFRC)>; - def : Pat<(f64 nzFPImmAsi32:$A), - (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)), - VSFRC)>; - -// To replace constant pool with XXSPLTI32DX for scalars. 
-def : Pat<(f32 nzFPImmAsi64:$A), - (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX(IMPLICIT_DEF), 0, - (getFPAs64BitIntHi $A)), - 1, (getFPAs64BitIntLo $A)), - VSSRC)>; - -def : Pat<(f64 nzFPImmAsi64:$A), - (COPY_TO_REGCLASS (XXSPLTI32DX (XXSPLTI32DX (IMPLICIT_DEF), 0, - (getFPAs64BitIntHi $A)), - 1, (getFPAs64BitIntLo $A)), - VSFRC)>; - - // Anonymous patterns for XXEVAL - // AND - // and(A, B, C) - def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; - // and(A, xor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; - // and(A, or(B, C)) - def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; - // and(A, nor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; - // and(A, eqv(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; - // and(A, nand(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; - - // NAND - // nand(A, B, C) - def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), - !sub(255, 1)>; - // nand(A, xor(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), - !sub(255, 6)>; - // nand(A, or(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), - !sub(255, 7)>; - // nand(A, nor(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), - !sub(255, 8)>; - // nand(A, eqv(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), - !sub(255, 9)>; - // nand(A, nand(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), - !sub(255, 14)>; - - // Anonymous patterns to select prefixed VSX loads and stores. - // Load / Store f128 - def : Pat<(f128 (load PDForm:$src)), - (COPY_TO_REGCLASS (PLXV memri34:$src), VRRC)>; - def : Pat<(store f128:$XS, PDForm:$dst), - (PSTXV (COPY_TO_REGCLASS $XS, VSRC), memri34:$dst)>; - - // Load / Store v4i32 - def : Pat<(v4i32 (load PDForm:$src)), (PLXV memri34:$src)>; - def : Pat<(store v4i32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; - - // Load / Store v2i64 - def : Pat<(v2i64 (load PDForm:$src)), (PLXV memri34:$src)>; - def : Pat<(store v2i64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; - - // Load / Store v4f32 - def : Pat<(v4f32 (load PDForm:$src)), (PLXV memri34:$src)>; - def : Pat<(store v4f32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; - - // Load / Store v2f64 - def : Pat<(v2f64 (load PDForm:$src)), (PLXV memri34:$src)>; - def : Pat<(store v2f64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>; - - // Cases For PPCstore_scal_int_from_vsr - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), PDForm:$dst, 8), - (PSTXSD (XSCVDPUXDS f64:$src), PDForm:$dst)>; - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), PDForm:$dst, 8), - (PSTXSD (XSCVDPSXDS f64:$src), PDForm:$dst)>; - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), PDForm:$dst, 8), - (PSTXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), - PDForm:$dst)>; - def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), PDForm:$dst, 8), - (PSTXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), - PDForm:$dst)>; -} - -let Predicates = [PrefixInstrs] in { - def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>; - def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>; - def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)), - (COPY_TO_REGCLASS (XXPERMX 
(COPY_TO_REGCLASS $A, VSRC), - (COPY_TO_REGCLASS $B, VSRC), - (COPY_TO_REGCLASS $C, VSRC), $D), VSRC)>; - def : Pat<(v16i8 (int_ppc_vsx_xxblendvb v16i8:$A, v16i8:$B, v16i8:$C)), - (COPY_TO_REGCLASS - (XXBLENDVB (COPY_TO_REGCLASS $A, VSRC), - (COPY_TO_REGCLASS $B, VSRC), - (COPY_TO_REGCLASS $C, VSRC)), VSRC)>; - def : Pat<(v8i16 (int_ppc_vsx_xxblendvh v8i16:$A, v8i16:$B, v8i16:$C)), - (COPY_TO_REGCLASS - (XXBLENDVH (COPY_TO_REGCLASS $A, VSRC), - (COPY_TO_REGCLASS $B, VSRC), - (COPY_TO_REGCLASS $C, VSRC)), VSRC)>; - def : Pat<(int_ppc_vsx_xxblendvw v4i32:$A, v4i32:$B, v4i32:$C), - (XXBLENDVW $A, $B, $C)>; - def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C), - (XXBLENDVD $A, $B, $C)>; - - // Anonymous patterns to select prefixed loads and stores. - // Load i32 - def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>; - def : Pat<(i32 (zextloadi1 PDForm:$src)), (PLBZ memri34:$src)>; - def : Pat<(i32 (extloadi8 PDForm:$src)), (PLBZ memri34:$src)>; - def : Pat<(i32 (zextloadi8 PDForm:$src)), (PLBZ memri34:$src)>; - def : Pat<(i32 (extloadi16 PDForm:$src)), (PLHZ memri34:$src)>; - def : Pat<(i32 (zextloadi16 PDForm:$src)), (PLHZ memri34:$src)>; - def : Pat<(i32 (sextloadi16 PDForm:$src)), (PLHA memri34:$src)>; - def : Pat<(i32 (load PDForm:$src)), (PLWZ memri34:$src)>; - - // Store i32 - def : Pat<(truncstorei8 i32:$rS, PDForm:$dst), (PSTB gprc:$rS, memri34:$dst)>; - def : Pat<(truncstorei16 i32:$rS, PDForm:$dst), (PSTH gprc:$rS, memri34:$dst)>; - def : Pat<(store i32:$rS, PDForm:$dst), (PSTW gprc:$rS, memri34:$dst)>; - - // Load i64 - def : Pat<(i64 (extloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>; - def : Pat<(i64 (zextloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>; - def : Pat<(i64 (extloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>; - def : Pat<(i64 (zextloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>; - def : Pat<(i64 (extloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>; - def : Pat<(i64 (zextloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>; - def : Pat<(i64 (sextloadi16 PDForm:$src)), (PLHA8 memri34:$src)>; - def : Pat<(i64 (extloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>; - def : Pat<(i64 (zextloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>; - def : Pat<(i64 (sextloadi32 PDForm:$src)), (PLWA8 memri34:$src)>; - def : Pat<(i64 (load PDForm:$src)), (PLD memri34:$src)>; - - // Store i64 - def : Pat<(truncstorei8 i64:$rS, PDForm:$dst), (PSTB8 g8rc:$rS, memri34:$dst)>; - def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>; - def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>; - def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>; - - // Load / Store f32 - def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>; - def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>; - - // Load / Store f64 - def : Pat<(f64 (extloadf32 PDForm:$src)), - (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>; - def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>; - def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>; - - // Atomic Load - def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>; - def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>; - def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>; - def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>; - - // Atomic Store - def : Pat<(atomic_store_8 PDForm:$dst, i32:$RS), (PSTB $RS, memri34:$dst)>; - def : Pat<(atomic_store_16 PDForm:$dst, i32:$RS), (PSTH $RS, memri34:$dst)>; - def : Pat<(atomic_store_32 PDForm:$dst, i32:$RS), 
(PSTW $RS, memri34:$dst)>; - def : Pat<(atomic_store_64 PDForm:$dst, i64:$RS), (PSTD $RS, memri34:$dst)>; - - // Prefixed fpext to v2f64 - def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)), - (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>; -} - -def InsertEltShift { - dag Sub32 = (i32 (EXTRACT_SUBREG $rB, sub_32)); - dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30); - dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29); - dag Left1 = (RLWINM $rB, 1, 0, 30); - dag Left2 = (RLWINM $rB, 2, 0, 29); - dag Left3 = (RLWINM8 $rB, 3, 0, 28); -} - -let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in { - // Indexed vector insert element - def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)), - (VINSBRX $vDi, InsertEltShift.Sub32, $rA)>; - def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)), - (VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>; - def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)), - (VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>; - def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)), - (VINSDRX $vDi, InsertEltShift.Left3, $rA)>; - - def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), - (VINSWVRX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; - - def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), - (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), - (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)), - (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)), - (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; - let AddedComplexity = 400 in { - // Immediate vector insert element - foreach Idx = [0, 1, 2, 3] in { - def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, Idx)), - (VINSW $vDi, !mul(!sub(3, Idx), 4), $rA)>; - } - foreach i = [0, 1] in - def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, (i64 i))), - (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>; - } -} - -let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in { - // Indexed vector insert element - def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i32:$rB)), - (VINSBLX $vDi, $rB, $rA)>; - def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i32:$rB)), - (VINSHLX $vDi, InsertEltShift.Left1, $rA)>; - def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i32:$rB)), - (VINSWLX $vDi, InsertEltShift.Left2, $rA)>; - - def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i32:$rB)), - (VINSWVLX $vDi, InsertEltShift.Left2, (XSCVDPSPN $rA))>; -} - -let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in { - // Indexed vector insert element - def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)), - (VINSBLX $vDi, InsertEltShift.Sub32, $rA)>; - def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)), - (VINSHLX $vDi, InsertEltShift.Sub32Left1, $rA)>; - def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)), - (VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>; - def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)), - (VINSDLX $vDi, InsertEltShift.Left3, $rA)>; - - def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), - (VINSWVLX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; - - def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), - (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), - (VINSDLX 
$vDi, InsertEltShift.Left3, (LD memrix:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)), - (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)), - (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; -} - -let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX, IsBigEndian] in { - // Immediate vector insert element - foreach Ty = [i32, i64] in { - foreach Idx = [0, 1, 2, 3] in { - def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, (Ty Idx))), - (VINSW $vDi, !mul(Idx, 4), $rA)>; - } - } - - foreach Idx = [0, 1] in - def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, Idx)), - (VINSD $vDi, !mul(Idx, 8), $rA)>; -} diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 110f7d79fbc5..6e562498dcf9 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -51,35 +51,6 @@ // ** printing (for example: xxswapd for xxpermdi with 0x2 as the imm). ** // **************************************************************************** -def PPCRegVSRCAsmOperand : AsmOperandClass { - let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber"; -} -def vsrc : RegisterOperand { - let ParserMatchClass = PPCRegVSRCAsmOperand; -} - -def PPCRegVSFRCAsmOperand : AsmOperandClass { - let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber"; -} -def vsfrc : RegisterOperand { - let ParserMatchClass = PPCRegVSFRCAsmOperand; -} - -def PPCRegVSSRCAsmOperand : AsmOperandClass { - let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber"; -} -def vssrc : RegisterOperand { - let ParserMatchClass = PPCRegVSSRCAsmOperand; -} - -def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass { - let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber"; -} - -def spilltovsrrc : RegisterOperand { - let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; -} - def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [ SDTCisVT<0, v4f32>, SDTCisPtrTy<1> ]>; @@ -732,6 +703,11 @@ let hasSideEffects = 0 in { (outs vsfrc:$XT), (ins vsfrc:$XB), "xsnabsdp $XT, $XB", IIC_VecFP, [(set f64:$XT, (fneg (fabs f64:$XB)))]>; + let isCodeGenOnly = 1 in + def XSNABSDPs : XX2Form<60, 361, + (outs vssrc:$XT), (ins vssrc:$XB), + "xsnabsdp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (fneg (fabs f32:$XB)))]>; def XSNEGDP : XX2Form<60, 377, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsnegdp $XT, $XB", IIC_VecFP, @@ -2897,10 +2873,32 @@ def : Pat<(v2i64 (PPCvcmp_rec v2i64:$vA, v2i64:$vB, 199)), // XL Compat builtins. def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (XSMSUBMDP $A, $B, $C)>; -def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (XSNMSUBMDP $A, $B, $C)>; def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (XSNMADDMDP $A, $B, $C)>; def : Pat<(int_ppc_fre f64:$A), (XSREDP $A)>; def : Pat<(int_ppc_frsqrte vsfrc:$XB), (XSRSQRTEDP $XB)>; +def : Pat<(int_ppc_fnabs f64:$A), (XSNABSDP $A)>; +def : Pat<(int_ppc_fnabss f32:$A), (XSNABSDPs $A)>; + +// XXMRG[LH]W is a direct replacement for VMRG[LH]W respectively. +// Prefer the VSX form for greater register range. 
+def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef), + (COPY_TO_REGCLASS (XXMRGLW (COPY_TO_REGCLASS $vA, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; +def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef), + (COPY_TO_REGCLASS (XXMRGHW (COPY_TO_REGCLASS $vA, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; +def:Pat<(vmrglw_shuffle v16i8:$vA, v16i8:$vB), + (COPY_TO_REGCLASS (XXMRGLW (COPY_TO_REGCLASS $vA, VSRC), + (COPY_TO_REGCLASS $vB, VSRC)), VRRC)>; +def:Pat<(vmrghw_shuffle v16i8:$vA, v16i8:$vB), + (COPY_TO_REGCLASS (XXMRGHW (COPY_TO_REGCLASS $vA, VSRC), + (COPY_TO_REGCLASS $vB, VSRC)), VRRC)>; +def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (COPY_TO_REGCLASS (XXMRGLW (COPY_TO_REGCLASS $vB, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; +def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (COPY_TO_REGCLASS (XXMRGHW (COPY_TO_REGCLASS $vB, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; } // HasVSX // Any big endian VSX subtarget. @@ -3311,7 +3309,6 @@ def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))), // XL Compat builtins. def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (XSMSUBMSP $A, $B, $C)>; -def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (XSNMSUBMSP $A, $B, $C)>; def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (XSNMADDMSP $A, $B, $C)>; def : Pat<(int_ppc_fres f32:$A), (XSRESP $A)>; def : Pat<(i32 (int_ppc_extract_exp f64:$A)), @@ -3370,6 +3367,15 @@ def : Pat<(f32 (vector_extract v4f32:$S, i32:$Idx)), def : Pat<(f64 (vector_extract v2f64:$S, i32:$Idx)), (f64 VectorExtractions.BE_32B_VARIABLE_DOUBLE)>; + +defm : ScalToVecWPermute< + v4i32, (i32 (load ForceXForm:$src)), + (XXSLDWIs (LIWZX ForceXForm:$src), 1), + (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>; +defm : ScalToVecWPermute< + v4f32, (f32 (load ForceXForm:$src)), + (XXSLDWIs (LIWZX ForceXForm:$src), 1), + (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>; } // HasVSX, HasP8Vector, IsBigEndian // Big endian Power8 64Bit VSX subtarget. 
@@ -3384,14 +3390,6 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 ForceXForm:$src)))), (v2i64 (SUBREG_TO_REG (i64 1), (LIWAX ForceXForm:$src), sub_64))>; def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 ForceXForm:$src)))), (v2i64 (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64))>; -defm : ScalToVecWPermute< - v4i32, (i32 (load ForceXForm:$src)), - (XXSLDWIs (LIWZX ForceXForm:$src), 1), - (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>; -defm : ScalToVecWPermute< - v4f32, (f32 (load ForceXForm:$src)), - (XXSLDWIs (LIWZX ForceXForm:$src), 1), - (SUBREG_TO_REG (i64 1), (LIWZX ForceXForm:$src), sub_64)>; def : Pat MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24), - cl::ZeroOrMore, cl::desc("Potential common base number threshold per function " "for PPC loop prep")); diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 22c5b6c11289..976effb96adc 100644 --- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -107,7 +107,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, if (Subtarget->isUsingPCRelativeCalls()) { if (MIOpcode == PPC::TAILB || MIOpcode == PPC::TAILB8 || MIOpcode == PPC::TCRETURNdi || MIOpcode == PPC::TCRETURNdi8 || - MIOpcode == PPC::BL8_NOTOC) { + MIOpcode == PPC::BL8_NOTOC || MIOpcode == PPC::BL8_NOTOC_RM) { RefKind = MCSymbolRefExpr::VK_PPC_NOTOC; } if (MO.getTargetFlags() == PPCII::MO_PCREL_OPT_FLAG) diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index e5fa02bc8ccf..67d91d23962c 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -985,7 +986,7 @@ bool PPCMIPeephole::simplifyCode() { LiMI->getOpcode() == PPC::LI8) && "Invalid Opcode!"); auto LiImm = LiMI->getOperand(1).getImm(); // save the imm of LI - LiMI->RemoveOperand(1); // remove the imm of LI + LiMI->removeOperand(1); // remove the imm of LI LiMI->setDesc(TII->get(LiMI->getOpcode() == PPC::LI ? 
                                   PPC::ADDI : PPC::ADDI8));
       MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI)
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 782d41f93ae5..9d6dfd16ff9d 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -23,6 +23,13 @@ void PPCFunctionInfo::anchor() {}
 PPCFunctionInfo::PPCFunctionInfo(const MachineFunction &MF)
     : DisableNonVolatileCR(PPCDisableNonVolatileCR) {}
 
+MachineFunctionInfo *
+PPCFunctionInfo::clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+                       const DenseMap<MachineBasicBlock *, MachineBasicBlock *>
+                           &Src2DstMBB) const {
+  return DestMF.cloneInfo<PPCFunctionInfo>(*this);
+}
+
 MCSymbol *PPCFunctionInfo::getPICOffsetSymbol(MachineFunction &MF) const {
   const DataLayout &DL = MF.getDataLayout();
   return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 07c503d47e98..b918e723de00 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -153,6 +153,11 @@ private:
 public:
   explicit PPCFunctionInfo(const MachineFunction &MF);
 
+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override;
+
   int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
   void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
index 9d5206f8fd43..58b74c6b8c7a 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -15,6 +15,7 @@
 #include "PPCSubtarget.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
 using namespace llvm;
 
 namespace {
@@ -266,13 +267,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
       continue;
 
     auto DepOpIdx = Feature.depOpIdx();
-    if (DepOpIdx.hasValue()) {
+    if (DepOpIdx) {
       // Checking if the result of the FirstMI is the desired operand of the
       // SecondMI if the DepOpIdx is set. Otherwise, ignore it.
       if (!matchingRegOps(*FirstMI, 0, SecondMI, *DepOpIdx))
         return false;
     }
-
+
     // Checking more on the instruction operands.
     if (checkOpConstraints(Feature.getKind(), *FirstMI, SecondMI))
       return true;
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index a8853609a7c8..82c150b988ab 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -46,6 +47,10 @@ static cl::opt<bool>
 RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true),
                    cl::desc("Run pre-emit peephole optimizations."));
 
+static cl::opt<uint32_t>
+DSCRValue("ppc-set-dscr", cl::Hidden,
+          cl::desc("Set the Data Stream Control Register."));
+
 namespace {
 
 static bool hasPCRelativeForm(MachineInstr &Use) {
@@ -407,6 +412,38 @@
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override {
+    // If the user wants to set the DSCR using command-line options,
+    // load in the specified value at the start of main.
+    if (DSCRValue.getNumOccurrences() > 0 && MF.getName().equals("main") &&
+        MF.getFunction().hasExternalLinkage()) {
+      DSCRValue = (uint32_t)(DSCRValue & 0x01FFFFFF); // 25-bit DSCR mask
+      RegScavenger RS;
+      MachineBasicBlock &MBB = MF.front();
+      // Find an unused GPR according to register liveness
+      RS.enterBasicBlock(MBB);
+      unsigned InDSCR = RS.FindUnusedReg(&PPC::GPRCRegClass);
+      if (InDSCR) {
+        const PPCInstrInfo *TII =
+            MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+        DebugLoc dl;
+        MachineBasicBlock::iterator IP = MBB.begin(); // Insert Point
+        // Copy the 32-bit DSCRValue integer into the GPR InDSCR using LIS and
+        // ORI, then move to DSCR. If the requested DSCR value is contained
+        // in a 16-bit signed number, we can emit a single `LI`, but the
+        // impact of saving one instruction in one function does not warrant
+        // any additional complexity in the logic here.
+        BuildMI(MBB, IP, dl, TII->get(PPC::LIS), InDSCR)
+            .addImm(DSCRValue >> 16);
+        BuildMI(MBB, IP, dl, TII->get(PPC::ORI), InDSCR)
+            .addReg(InDSCR)
+            .addImm(DSCRValue & 0xFFFF);
+        BuildMI(MBB, IP, dl, TII->get(PPC::MTUDSCR))
+            .addReg(InDSCR, RegState::Kill);
+      } else
+        errs() << "Warning: Ran out of registers - Unable to set DSCR as "
+                  "requested";
+    }
+
     if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) {
       // Remove UNENCODED_NOP even when this pass is disabled.
// This needs to be done unconditionally so we don't emit zeros diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 76b016c0ee79..7349eb8addc9 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -90,6 +91,8 @@ ReportAccMoves("ppc-report-acc-moves", cl::Hidden, cl::init(false)); #endif +extern cl::opt DisableAutoPairedVecSt; + static unsigned offsetMinAlignForOpcode(unsigned OpC); PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) @@ -113,6 +116,8 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8; ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX; ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; + ImmToIdxMap[PPC::LQ] = PPC::LQX_PSEUDO; + ImmToIdxMap[PPC::STQ] = PPC::STQX_PSEUDO; // VSX ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX; @@ -183,6 +188,8 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (!TM.isPPC64() && Subtarget.isAIXABI()) report_fatal_error("AnyReg unimplemented on 32-bit AIX."); if (Subtarget.hasVSX()) { + if (Subtarget.pairedVectorMemops()) + return CSR_64_AllRegs_VSRP_SaveList; if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) return CSR_64_AllRegs_AIX_Dflt_VSX_SaveList; return CSR_64_AllRegs_VSX_SaveList; @@ -210,6 +217,9 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isAIXABI()) report_fatal_error("Cold calling unimplemented on AIX."); if (TM.isPPC64()) { + if (Subtarget.pairedVectorMemops()) + return SaveR2 ? CSR_SVR64_ColdCC_R2_VSRP_SaveList + : CSR_SVR64_ColdCC_VSRP_SaveList; if (Subtarget.hasAltivec()) return SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList : CSR_SVR64_ColdCC_Altivec_SaveList; @@ -217,7 +227,9 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { : CSR_SVR64_ColdCC_SaveList; } // 32-bit targets. - if (Subtarget.hasAltivec()) + if (Subtarget.pairedVectorMemops()) + return CSR_SVR32_ColdCC_VSRP_SaveList; + else if (Subtarget.hasAltivec()) return CSR_SVR32_ColdCC_Altivec_SaveList; else if (Subtarget.hasSPE()) return CSR_SVR32_ColdCC_SPE_SaveList; @@ -225,6 +237,8 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } // Standard calling convention CSRs. if (TM.isPPC64()) { + if (Subtarget.pairedVectorMemops()) + return SaveR2 ? CSR_SVR464_R2_VSRP_SaveList : CSR_SVR464_VSRP_SaveList; if (Subtarget.hasAltivec() && (!Subtarget.isAIXABI() || TM.getAIXExtendedAltivecABI())) { return SaveR2 ? 
CSR_PPC64_R2_Altivec_SaveList @@ -239,6 +253,8 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { : CSR_AIX32_SaveList; return CSR_AIX32_SaveList; } + if (Subtarget.pairedVectorMemops()) + return CSR_SVR432_VSRP_SaveList; if (Subtarget.hasAltivec()) return CSR_SVR432_Altivec_SaveList; else if (Subtarget.hasSPE()) @@ -252,6 +268,8 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF, const PPCSubtarget &Subtarget = MF.getSubtarget(); if (CC == CallingConv::AnyReg) { if (Subtarget.hasVSX()) { + if (Subtarget.pairedVectorMemops()) + return CSR_64_AllRegs_VSRP_RegMask; if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) return CSR_64_AllRegs_AIX_Dflt_VSX_RegMask; return CSR_64_AllRegs_VSX_RegMask; @@ -275,20 +293,32 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } if (CC == CallingConv::Cold) { - return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask - : CSR_SVR64_ColdCC_RegMask) - : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask - : (Subtarget.hasSPE() - ? CSR_SVR32_ColdCC_SPE_RegMask - : CSR_SVR32_ColdCC_RegMask)); + if (TM.isPPC64()) + return Subtarget.pairedVectorMemops() + ? CSR_SVR64_ColdCC_VSRP_RegMask + : (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask + : CSR_SVR64_ColdCC_RegMask); + else + return Subtarget.pairedVectorMemops() + ? CSR_SVR32_ColdCC_VSRP_RegMask + : (Subtarget.hasAltivec() + ? CSR_SVR32_ColdCC_Altivec_RegMask + : (Subtarget.hasSPE() ? CSR_SVR32_ColdCC_SPE_RegMask + : CSR_SVR32_ColdCC_RegMask)); } - return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask - : CSR_PPC64_RegMask) - : (Subtarget.hasAltivec() - ? CSR_SVR432_Altivec_RegMask - : (Subtarget.hasSPE() ? CSR_SVR432_SPE_RegMask - : CSR_SVR432_RegMask)); + if (TM.isPPC64()) + return Subtarget.pairedVectorMemops() + ? CSR_SVR464_VSRP_RegMask + : (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask + : CSR_PPC64_RegMask); + else + return Subtarget.pairedVectorMemops() + ? CSR_SVR432_VSRP_RegMask + : (Subtarget.hasAltivec() + ? CSR_SVR432_Altivec_RegMask + : (Subtarget.hasSPE() ? CSR_SVR432_SPE_RegMask + : CSR_SVR432_RegMask)); } const uint32_t* @@ -463,6 +493,14 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co LLVM_DEBUG(dbgs() << "TRUE - Memory operand is X-Form.\n"); return true; } + + // This is a spill/restore of a quadword. + if ((Opcode == PPC::RESTORE_QUADWORD) || (Opcode == PPC::SPILL_QUADWORD)) { + LLVM_DEBUG(dbgs() << "Memory Operand: " << InstrInfo->getName(Opcode) + << " for register " << printReg(Reg, this) << ".\n"); + LLVM_DEBUG(dbgs() << "TRUE - Memory operand is a quadword.\n"); + return true; + } } LLVM_DEBUG(dbgs() << "FALSE - Scavenging is not required.\n"); return false; @@ -1082,7 +1120,7 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, MBB.erase(II); if (SpillsKnownBit && KillsCRBit && !SeenUse) { Ins->setDesc(TII.get(PPC::UNENCODED_NOP)); - Ins->RemoveOperand(0); + Ins->removeOperand(0); } } @@ -1163,6 +1201,59 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed, #endif } +static void spillRegPairs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator II, DebugLoc DL, + const TargetInstrInfo &TII, Register SrcReg, + unsigned FrameIndex, bool IsLittleEndian, + bool IsKilled, bool TwoPairs) { + unsigned Offset = 0; + if (TwoPairs) + Offset = IsLittleEndian ? 48 : 0; + else + Offset = IsLittleEndian ? 16 : 0; + Register Reg = (SrcReg > PPC::VSRp15) ? 
PPC::V0 + (SrcReg - PPC::VSRp16) * 2 + : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2; + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) + .addReg(Reg, getKillRegState(IsKilled)), + FrameIndex, Offset); + Offset += IsLittleEndian ? -16 : 16; + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) + .addReg(Reg + 1, getKillRegState(IsKilled)), + FrameIndex, Offset); + if (TwoPairs) { + Offset += IsLittleEndian ? -16 : 16; + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) + .addReg(Reg + 2, getKillRegState(IsKilled)), + FrameIndex, Offset); + Offset += IsLittleEndian ? -16 : 16; + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) + .addReg(Reg + 3, getKillRegState(IsKilled)), + FrameIndex, Offset); + } +} + +/// Remove any STXVP[X] instructions and split them out into a pair of +/// STXV[X] instructions if --disable-auto-paired-vec-st is specified on +/// the command line. +void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + assert(DisableAutoPairedVecSt && + "Expecting to do this only if paired vector stores are disabled."); + MachineInstr &MI = *II; // STXVP , + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + Register SrcReg = MI.getOperand(0).getReg(); + bool IsLittleEndian = Subtarget.isLittleEndian(); + bool IsKilled = MI.getOperand(0).isKill(); + spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled, + /* TwoPairs */ false); + // Discard the original instruction. + MBB.erase(II); +} + /// lowerACCSpilling - Generate the code for spilling the accumulator register. /// Similarly to other spills/reloads that use pseudo-ops, we do not actually /// eliminate the FrameIndex here nor compute the stack offset. We simply @@ -1192,12 +1283,17 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II, // adjust the offset of the store that is within the 64-byte stack slot. if (IsPrimed) BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg); - addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) - .addReg(Reg, getKillRegState(IsKilled)), - FrameIndex, IsLittleEndian ? 32 : 0); - addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) - .addReg(Reg + 1, getKillRegState(IsKilled)), - FrameIndex, IsLittleEndian ? 0 : 32); + if (DisableAutoPairedVecSt) + spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled, + /* TwoPairs */ true); + else { + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) + .addReg(Reg, getKillRegState(IsKilled)), + FrameIndex, IsLittleEndian ? 32 : 0); + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) + .addReg(Reg + 1, getKillRegState(IsKilled)), + FrameIndex, IsLittleEndian ? 
0 : 32); + } if (IsPrimed && !IsKilled) BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg); @@ -1433,6 +1529,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) { lowerACCRestore(II, FrameIndex); return; + } else if (OpC == PPC::STXVP && DisableAutoPairedVecSt) { + lowerOctWordSpilling(II, FrameIndex); + return; } else if (OpC == PPC::SPILL_QUADWORD) { lowerQuadwordSpilling(II, FrameIndex); return; @@ -1451,7 +1550,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC); // Now add the frame object offset to the offset from r1. - int Offset = MFI.getObjectOffset(FrameIndex); + int64_t Offset = MFI.getObjectOffset(FrameIndex); Offset += MI.getOperand(OffsetOperandNo).getImm(); // If we're not using a Frame Pointer that has been set to the value of the @@ -1507,17 +1606,21 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC; Register SRegHi = MF.getRegInfo().createVirtualRegister(RC), SReg = MF.getRegInfo().createVirtualRegister(RC); + unsigned NewOpcode = 0u; // Insert a set of rA with the full offset value before the ld, st, or add if (isInt<16>(Offset)) BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), SReg) - .addImm(Offset); - else { + .addImm(Offset); + else if (isInt<32>(Offset)) { BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi) - .addImm(Offset >> 16); + .addImm(Offset >> 16); BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg) - .addReg(SRegHi, RegState::Kill) - .addImm(Offset); + .addReg(SRegHi, RegState::Kill) + .addImm(Offset); + } else { + assert(is64Bit && "Huge stack is only supported on PPC64"); + TII.materializeImmPostRA(MBB, II, dl, SReg, Offset); } // Convert into indexed form of the instruction: @@ -1532,7 +1635,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, OpC != TargetOpcode::INLINEASM_BR) { assert(ImmToIdxMap.count(OpC) && "No indexed form of load or store available!"); - unsigned NewOpcode = ImmToIdxMap.find(OpC)->second; + NewOpcode = ImmToIdxMap.find(OpC)->second; MI.setDesc(TII.get(NewOpcode)); OperandBase = 1; } else { @@ -1542,6 +1645,20 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register StackReg = MI.getOperand(FIOperandNum).getReg(); MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); + + // Since these are not real X-Form instructions, we must + // add the registers and access 0(NewReg) rather than + // emitting the X-Form pseudo. + if (NewOpcode == PPC::LQX_PSEUDO || NewOpcode == PPC::STQX_PSEUDO) { + assert(is64Bit && "Quadword loads/stores only supported in 64-bit mode"); + Register NewReg = MF.getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); + BuildMI(MBB, II, dl, TII.get(PPC::ADD8), NewReg) + .addReg(SReg, RegState::Kill) + .addReg(StackReg); + MI.setDesc(TII.get(NewOpcode == PPC::LQX_PSEUDO ? 
PPC::LQ : PPC::STQ)); + MI.getOperand(OperandBase + 1).ChangeToRegister(NewReg, false); + MI.getOperand(OperandBase).ChangeToImmediate(0); + } } Register PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 114f6d0f4c66..aaa841fffa1b 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -130,6 +130,8 @@ public: void lowerCRBitRestore(MachineBasicBlock::iterator II, unsigned FrameIndex) const; + void lowerOctWordSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; void lowerACCSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerACCRestore(MachineBasicBlock::iterator II, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 044035e0ef29..7892b0d12d01 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -18,8 +18,6 @@ def sub_32 : SubRegIndex<32>; def sub_64 : SubRegIndex<64>; def sub_vsx0 : SubRegIndex<128>; def sub_vsx1 : SubRegIndex<128, 128>; -def sub_pair0 : SubRegIndex<256>; -def sub_pair1 : SubRegIndex<256, 256>; def sub_gp8_x0 : SubRegIndex<64>; def sub_gp8_x1 : SubRegIndex<64, 64>; } @@ -100,21 +98,6 @@ class CRBIT num, string n> : PPCReg { let HWEncoding{4-0} = num; } -// ACC - One of the 8 512-bit VSX accumulators. -class ACC num, string n, list subregs> : PPCReg { - let HWEncoding{2-0} = num; - let SubRegs = subregs; -} - -// UACC - One of the 8 512-bit VSX accumulators prior to being primed. -// Without using this register class, the register allocator has no way to -// differentiate a primed accumulator from an unprimed accumulator. -// This may result in invalid copies between primed and unprimed accumulators. -class UACC num, string n, list subregs> : PPCReg { - let HWEncoding{2-0} = num; - let SubRegs = subregs; -} - // VSR Pairs - One of the 32 paired even-odd consecutive VSRs. class VSRPair num, string n, list subregs> : PPCReg { let HWEncoding{4-0} = num; @@ -272,9 +255,6 @@ def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>; def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>; // SPE extra registers -// SPE Accumulator for multiply-accumulate SPE operations. Never directly -// accessed, so there's no real encoding for it. -def SPEACC: DwarfRegNum<[99, 111]>; def SPEFSCR: SPR<512, "spefscr">, DwarfRegNum<[612, 112]>; def XER: SPR<1, "xer">, DwarfRegNum<[76]>; @@ -448,72 +428,6 @@ def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; } -let SubRegIndices = [sub_pair0, sub_pair1] in { - def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>; - def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>; - def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>; - def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>; - def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>; - def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>; - def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>; - def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>; -} -def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3, - ACC4, ACC5, ACC6, ACC7)> { - // The AllocationPriority is in the range [0, 63]. Assigned the ACC registers - // the highest possible priority in this range to force the register allocator - // to assign these registers first. 
This is done because the ACC registers - // must represent 4 advacent vector registers. For example ACC1 must be - // VS4 - VS7. The value here must be at least 32 as we want to allocate - // these registers even before we allocate global ranges. - let AllocationPriority = 63; - let Size = 512; -} - -let SubRegIndices = [sub_pair0, sub_pair1] in { - def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>; - def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>; - def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>; - def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>; - def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>; - def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>; - def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>; - def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>; -} -def UACCRC : RegisterClass<"PPC", [v512i1], 128, - (add UACC0, UACC1, UACC2, UACC3, - UACC4, UACC5, UACC6, UACC7)> { - // The AllocationPriority for the UACC registers is still high and must be at - // least 32 as we want to allocate these registers before we allocate other - // global ranges. The value must be less than the AllocationPriority of the - // ACC registers. - let AllocationPriority = 36; - let Size = 512; -} - -// FIXME: This allocation order may increase stack frame size when allocating -// non-volatile registers. -// -// Placing Altivec registers first and allocate the rest as underlying VSX -// ones, to reduce interference with accumulator registers (lower 32 VSRs). -// This reduces copies when loading for accumulators, which is common use for -// paired VSX registers. -def VSRpRC : - RegisterClass<"PPC", [v256i1], 128, - (add VSRp17, VSRp18, VSRp16, VSRp19, VSRp20, VSRp21, - VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30, - VSRp29, VSRp28, VSRp27, VSRp26, - (sequence "VSRp%u", 0, 6), - (sequence "VSRp%u", 15, 7))> { - // Give the VSRp registers a non-zero AllocationPriority. The value is less - // than 32 as these registers should not always be allocated before global - // ranges and the value should be less than the AllocationPriority - 32 for - // the UACC registers. Even global VSRp registers should be allocated after - // the UACC registers have been chosen. - let AllocationPriority = 2; - let Size = 256; -} - // Make AllocationOrder as similar as G8RC's to avoid potential spilling. // Similarly, we have an AltOrder for 64-bit ELF ABI which r2 is allocated // at last. @@ -528,3 +442,572 @@ def G8pRC : }]; let Size = 128; } + +include "PPCRegisterInfoMMA.td" + +//===----------------------------------------------------------------------===// +// PowerPC Operand Definitions. + +// In the default PowerPC assembler syntax, registers are specified simply +// by number, so they cannot be distinguished from immediate values (without +// looking at the opcode). This means that the default operand matching logic +// for the asm parser does not work, and we need to specify custom matchers. +// Since those can only be specified with RegisterOperand classes and not +// directly on the RegisterClass, all instructions patterns used by the asm +// parser need to use a RegisterOperand (instead of a RegisterClass) for +// all their register operands. +// For this purpose, we define one RegisterOperand for each RegisterClass, +// using the same name as the class, just in lower case. 
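In practice the PredicateMethod hooks named in the operand classes below reduce to range checks on the parsed number; it is the operand class attached to each instruction, not the token itself, that decides register versus immediate. A toy C++ sketch of what those predicates accept (the real implementations live on the asm parser's operand class; illustrative only):

    #include <cstdint>

    bool isRegNumber(int64_t N) { return N >= 0 && N <= 31; }   // r0..r31 (GPR/FPR/VR files)
    bool isEvenRegNumber(int64_t N) { return isRegNumber(N) && (N & 1) == 0; } // register pairs
    bool isCCRegNumber(int64_t N) { return N >= 0 && N <= 7; }  // cr0..cr7
    bool isCRBitNumber(int64_t N) { return N >= 0 && N <= 31; } // 8 CR fields x 4 bits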
+
+def PPCRegGPRCAsmOperand : AsmOperandClass {
+  let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
+}
+def gprc : RegisterOperand<GPRC> {
+  let ParserMatchClass = PPCRegGPRCAsmOperand;
+}
+def PPCRegG8RCAsmOperand : AsmOperandClass {
+  let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
+}
+def g8rc : RegisterOperand<G8RC> {
+  let ParserMatchClass = PPCRegG8RCAsmOperand;
+}
+def PPCRegG8pRCAsmOperand : AsmOperandClass {
+  let Name = "RegG8pRC"; let PredicateMethod = "isEvenRegNumber";
+}
+def g8prc : RegisterOperand<G8pRC> {
+  let ParserMatchClass = PPCRegG8pRCAsmOperand;
+}
+def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
+  let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
+  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
+}
+def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
+  let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
+}
+def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
+  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
+}
+def PPCRegF8RCAsmOperand : AsmOperandClass {
+  let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
+}
+def f8rc : RegisterOperand<F8RC> {
+  let ParserMatchClass = PPCRegF8RCAsmOperand;
+}
+def PPCRegF4RCAsmOperand : AsmOperandClass {
+  let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
+}
+def f4rc : RegisterOperand<F4RC> {
+  let ParserMatchClass = PPCRegF4RCAsmOperand;
+}
+def PPCRegVRRCAsmOperand : AsmOperandClass {
+  let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
+}
+def vrrc : RegisterOperand<VRRC> {
+  let ParserMatchClass = PPCRegVRRCAsmOperand;
+}
+def PPCRegVFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVFRC"; let PredicateMethod = "isRegNumber";
+}
+def vfrc : RegisterOperand<VFRC> {
+  let ParserMatchClass = PPCRegVFRCAsmOperand;
+}
+def PPCRegCRBITRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
+}
+def crbitrc : RegisterOperand<CRBITRC> {
+  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
+}
+def PPCRegCRRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
+}
+def crrc : RegisterOperand<CRRC> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+def PPCRegSPERCAsmOperand : AsmOperandClass {
+  let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
+}
+def sperc : RegisterOperand<SPERC> {
+  let ParserMatchClass = PPCRegSPERCAsmOperand;
+}
+def PPCRegSPE4RCAsmOperand : AsmOperandClass {
+  let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
+}
+def spe4rc : RegisterOperand<GPRC> {
+  let ParserMatchClass = PPCRegSPE4RCAsmOperand;
+}
+
+def PPCU1ImmAsmOperand : AsmOperandClass {
+  let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u1imm : Operand<i32> {
+  let PrintMethod = "printU1ImmOperand";
+  let ParserMatchClass = PPCU1ImmAsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def PPCU2ImmAsmOperand : AsmOperandClass {
+  let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u2imm : Operand<i32> {
+  let PrintMethod = "printU2ImmOperand";
+  let ParserMatchClass = PPCU2ImmAsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def PPCATBitsAsHintAsmOperand : AsmOperandClass {
+  let Name = "ATBitsAsHint"; let PredicateMethod = "isATBitsAsHint";
+  let RenderMethod = "addImmOperands"; // Irrelevant, predicate always fails.
+}
+def atimm : Operand<i32> {
+  let PrintMethod = "printATBitsAsHint";
+  let ParserMatchClass = PPCATBitsAsHintAsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def PPCU3ImmAsmOperand : AsmOperandClass {
+  let Name = "U3Imm"; let PredicateMethod = "isU3Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u3imm : Operand<i32> {
+  let PrintMethod = "printU3ImmOperand";
+  let ParserMatchClass = PPCU3ImmAsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def PPCU4ImmAsmOperand : AsmOperandClass {
+  let Name = "U4Imm"; let PredicateMethod = "isU4Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u4imm : Operand<i32> {
+  let PrintMethod = "printU4ImmOperand";
+  let ParserMatchClass = PPCU4ImmAsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCS5ImmAsmOperand : AsmOperandClass {
+  let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s5imm : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+  let ParserMatchClass = PPCS5ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<5>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU5ImmAsmOperand : AsmOperandClass {
+  let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u5imm : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+  let ParserMatchClass = PPCU5ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<5>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU6ImmAsmOperand : AsmOperandClass {
+  let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u6imm : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+  let ParserMatchClass = PPCU6ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<6>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU7ImmAsmOperand : AsmOperandClass {
+  let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u7imm : Operand<i32> {
+  let PrintMethod = "printU7ImmOperand";
+  let ParserMatchClass = PPCU7ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<7>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU8ImmAsmOperand : AsmOperandClass {
+  let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u8imm : Operand<i32> {
+  let PrintMethod = "printU8ImmOperand";
+  let ParserMatchClass = PPCU8ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<8>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU10ImmAsmOperand : AsmOperandClass {
+  let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u10imm : Operand<i32> {
+  let PrintMethod = "printU10ImmOperand";
+  let ParserMatchClass = PPCU10ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<10>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU12ImmAsmOperand : AsmOperandClass {
+  let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u12imm : Operand<i32> {
+  let PrintMethod = "printU12ImmOperand";
+  let ParserMatchClass = PPCU12ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<12>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCS16ImmAsmOperand : AsmOperandClass {
+  let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def s16imm : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCU16ImmAsmOperand : AsmOperandClass {
+  let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
+  let RenderMethod = "addU16ImmOperands";
+}
+def u16imm : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCS17ImmAsmOperand : AsmOperandClass {
+  let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def s17imm : Operand<i32> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler. The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCS34ImmAsmOperand : AsmOperandClass {
+  let Name = "S34Imm";
+  let PredicateMethod = "isS34Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s34imm : Operand<i64> {
+  let PrintMethod = "printS34ImmOperand";
+  let EncoderMethod = "getImm34EncodingNoPCRel";
+  let ParserMatchClass = PPCS34ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<34>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def s34imm_pcrel : Operand<i64> {
+  let PrintMethod = "printS34ImmOperand";
+  let EncoderMethod = "getImm34EncodingPCRel";
+  let ParserMatchClass = PPCS34ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<34>";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+def PPCImmZeroAsmOperand : AsmOperandClass {
+  let Name = "ImmZero";
+  let PredicateMethod = "isImmZero";
+  let RenderMethod = "addImmOperands";
+}
+def immZero : Operand<i32> {
+  let PrintMethod = "printImmZeroOperand";
+  let ParserMatchClass = PPCImmZeroAsmOperand;
+  let DecoderMethod = "decodeImmZeroOperand";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+def PPCDirectBrAsmOperand : AsmOperandClass {
+  let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def directbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let DecoderMethod = "decodeDirectBrTarget";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+def absdirectbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCondBrAsmOperand : AsmOperandClass {
+  let Name = "CondBr"; let PredicateMethod = "isCondBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def condbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getCondBrEncoding";
+  let DecoderMethod = "decodeCondBrTarget";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+def abscondbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def calltarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let DecoderMethod = "decodeDirectBrTarget";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+def abscalltarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCRBitMaskOperand : AsmOperandClass {
+  let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+  let EncoderMethod = "get_crbitm_encoding";
+  let DecoderMethod = "decodeCRBitMOperand";
+  let ParserMatchClass = PPCCRBitMaskOperand;
+}
+// Address operands
+// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
+def PPCRegGxRCNoR0Operand : AsmOperandClass {
+  let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
+  let ParserMatchClass = PPCRegGxRCNoR0Operand;
+}
+
+// New addressing modes with 34 bit immediates.
+def PPCDispRI34Operand : AsmOperandClass {
+  let Name = "DispRI34"; let PredicateMethod = "isS34Imm";
+  let RenderMethod = "addImmOperands";
+}
+def dispRI34 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRI34Operand;
+}
+def memri34 : Operand<iPTR> { // memri, imm is a 34-bit value.
+  let PrintMethod = "printMemRegImm34";
+  let MIOperandInfo = (ops dispRI34:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRI34Encoding";
+  let DecoderMethod = "decodeMemRI34Operands";
+}
+// memri, imm is a 34-bit value for pc-relative instructions where
+// the base register is set to zero.
+def memri34_pcrel : Operand<iPTR> { // memri, imm is a 34-bit value.
+  let PrintMethod = "printMemRegImm34PCRel";
+  let MIOperandInfo = (ops dispRI34:$imm, immZero:$reg);
+  let EncoderMethod = "getMemRI34PCRelEncoding";
+  let DecoderMethod = "decodeMemRI34PCRelOperands";
+}
+
+// A version of ptr_rc usable with the asm parser.
+def PPCRegGxRCOperand : AsmOperandClass {
+  let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
+  let ParserMatchClass = PPCRegGxRCOperand;
+}
+
+def PPCDispRIOperand : AsmOperandClass {
+  let Name = "DispRI"; let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def dispRI : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIOperand;
+}
+def PPCDispRIXOperand : AsmOperandClass {
+  let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
+  let RenderMethod = "addS16ImmOperands";
+}
+def dispRIX : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIXOperand;
+}
+def PPCDispRIHashOperand : AsmOperandClass {
+  let Name = "DispRIHash"; let PredicateMethod = "isHashImmX8";
+  let RenderMethod = "addImmOperands";
+}
+def dispRIHash : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIHashOperand;
+}
+def PPCDispRIX16Operand : AsmOperandClass {
+  let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
+  let RenderMethod = "addS16ImmOperands";
+}
+def dispRIX16 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIX16Operand;
+}
+def PPCDispSPE8Operand : AsmOperandClass {
+  let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE8 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE8Operand;
+}
+def PPCDispSPE4Operand : AsmOperandClass {
+  let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE4 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE4Operand;
+}
+def PPCDispSPE2Operand : AsmOperandClass {
+  let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE2 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE2Operand;
+}
+
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIEncoding";
+  let DecoderMethod = "decodeMemRIOperands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
+  let OperandType = "OPERAND_MEMORY";
+}
+def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIXEncoding";
+  let DecoderMethod = "decodeMemRIXOperands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def memrihash : Operand<iPTR> {
+  // memrihash 8-aligned for ROP Protection Instructions.
+  let PrintMethod = "printMemRegImmHash";
+  let MIOperandInfo = (ops dispRIHash:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIHashEncoding";
+  let DecoderMethod = "decodeMemRIHashOperands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIX16Encoding";
+  let DecoderMethod = "decodeMemRIX16Operands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE8DisEncoding";
+  let DecoderMethod = "decodeSPE8Operands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE4DisEncoding";
+  let DecoderMethod = "decodeSPE4Operands";
+  let OperandType = "OPERAND_MEMORY";
+}
+def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE2DisEncoding";
+  let DecoderMethod = "decodeSPE2Operands";
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// A single-register address. This is used with the SjLj
+// pseudo-instructions which translate to LD/LWZ. These instructions require
+// G8RC_NOX0 registers.
+def memr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
+  let OperandType = "OPERAND_MEMORY";
+}
+def PPCTLSRegOperand : AsmOperandClass {
+  let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
+  let RenderMethod = "addTLSRegOperands";
+}
+def tlsreg32 : Operand<i32> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd32 : Operand<i32> {}
+def tlscall32 : Operand<i32> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+// PowerPC Predicate operand.
+def pred : Operand<OtherVT> {
+  let PrintMethod = "printPredicateOperand";
+  let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
+}
+
+def PPCRegVSRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsrc : RegisterOperand<VSRC> {
+  let ParserMatchClass = PPCRegVSRCAsmOperand;
+}
+
+def PPCRegVSFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsfrc : RegisterOperand<VSFRC> {
+  let ParserMatchClass = PPCRegVSFRCAsmOperand;
+}
+
+def PPCRegVSSRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vssrc : RegisterOperand<VSSRC> {
+  let ParserMatchClass = PPCRegVSSRCAsmOperand;
+}
+
+def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
+  let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber";
+}
+
+def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
+  let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
+}
+
+def PPCRegVSRpRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrprc : RegisterOperand<VSRpRC> {
+  let ParserMatchClass = PPCRegVSRpRCAsmOperand;
+}
+
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+  let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+  let EncoderMethod = "getVSRpEvenEncoding";
+  let DecoderMethod = "decodeVSRpEvenOperands";
+}
+
+def PPCRegACCRCAsmOperand : AsmOperandClass {
+  let Name = "RegACCRC"; let PredicateMethod = "isACCRegNumber";
+}
+
+def acc : RegisterOperand<ACCRC> {
+  let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
+def uacc : RegisterOperand<UACCRC> {
+  let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td b/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td
new file mode 100644
index 000000000000..0b6305f95a0a
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td
@@ -0,0 +1,106 @@
+//===-- PPCRegisterInfoMMA.td - The PowerPC Register File --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Register info for registers related to MMA. These are the ACC and UACC
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "PPC" in {
+def sub_pair0 : SubRegIndex<256>;
+def sub_pair1 : SubRegIndex<256, 256>;
+}
+
+// ACC - One of the 8 512-bit VSX accumulators.
+class ACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// UACC - One of the 8 512-bit VSX accumulators prior to being primed.
+// Without using this register class, the register allocator has no way to
+// differentiate a primed accumulator from an unprimed accumulator.
+// This may result in invalid copies between primed and unprimed accumulators.
+class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// SPE Accumulator for multiply-accumulate SPE operations. Never directly
+// accessed, so there's no real encoding for it.
+def SPEACC: DwarfRegNum<[99, 111]>;
+
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+  def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>;
+  def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>;
+  def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>;
+  def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>;
+  def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>;
+  def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>;
+  def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
+  def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
+}
+def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
+                                                     ACC4, ACC5, ACC6, ACC7)> {
+  // The AllocationPriority is in the range [0, 63]. Assign the ACC registers
+  // the highest possible priority in this range to force the register allocator
+  // to assign these registers first. This is done because the ACC registers
+  // must represent 4 adjacent vector registers. For example ACC1 must be
+  // VS4 - VS7. The value here must be at least 32 as we want to allocate
+  // these registers even before we allocate global ranges.
+  let AllocationPriority = 63;
+  let Size = 512;
+}
+
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+  def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>;
+  def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>;
+  def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>;
+  def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>;
+  def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>;
+  def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>;
+  def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
+  def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
+}
+def UACCRC : RegisterClass<"PPC", [v512i1], 128,
+                           (add UACC0, UACC1, UACC2, UACC3,
+                                UACC4, UACC5, UACC6, UACC7)> {
+  // The AllocationPriority for the UACC registers is still high and must be at
+  // least 32 as we want to allocate these registers before we allocate other
+  // global ranges. The value must be less than the AllocationPriority of the
+  // ACC registers.
+  let AllocationPriority = 36;
+  let Size = 512;
+}
+
+// FIXME: This allocation order may increase stack frame size when allocating
+// non-volatile registers.
+//
+// Place Altivec registers first and allocate the rest as underlying VSX
+// ones, to reduce interference with accumulator registers (lower 32 VSRs).
+// This reduces copies when loading for accumulators, which is a common use
+// for paired VSX registers.
+def VSRpRC :
+  RegisterClass<"PPC", [v256i1], 128,
+                (add VSRp17, VSRp18, VSRp16, VSRp19, VSRp20, VSRp21,
+                     VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30,
+                     VSRp29, VSRp28, VSRp27, VSRp26,
+                     (sequence "VSRp%u", 0, 6),
+                     (sequence "VSRp%u", 15, 7))> {
+  // Give the VSRp registers a non-zero AllocationPriority. The value is less
+  // than 32 as these registers should not always be allocated before global
+  // ranges and the value should be less than the AllocationPriority - 32 for
+  // the UACC registers. Even global VSRp registers should be allocated after
+  // the UACC registers have been chosen.
+ let AllocationPriority = 2; + let Size = 256; +} + + + + diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP10.td b/llvm/lib/Target/PowerPC/PPCScheduleP10.td index bf56491f373a..f89ef735a367 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP10.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP10.td @@ -36,7 +36,7 @@ def P10Model : SchedMachineModel { let CompleteModel = 1; // Do not support SPE (Signal Procesing Engine) on Power 10. - let UnsupportedFeatures = [HasSPE, IsE500, IsBookE]; + let UnsupportedFeatures = [HasSPE, IsE500, IsBookE, IsISAFuture]; } let SchedModel = P10Model in { diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 3dc069ecad8a..d35011171715 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -42,7 +42,7 @@ def P9Model : SchedMachineModel { // Power 9, paired vector mem ops, MMA, PC relative mem ops, or instructions // introduced in ISA 3.1. let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, MMA, - PCRelativeMemops, IsISA3_1]; + PCRelativeMemops, IsISA3_1, IsISAFuture]; } let SchedModel = P9Model in { @@ -404,7 +404,6 @@ let SchedModel = P9Model in { def P9_LoadAndALU2Op_7C : WriteSequence<[P9_LS_4C, P9_ALU_3C]>; def P9_LoadAndALU2Op_8C : WriteSequence<[P9_LS_5C, P9_ALU_3C]>; def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>; - def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>; def P9_IntDivAndALUOp_18C_8 : WriteSequence<[P9_DIV_16C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index f11b4e14073e..98424234a592 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -18,6 +18,7 @@ #include "PPCRegisterInfo.h" #include "PPCTargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/Attributes.h" @@ -140,6 +141,7 @@ void PPCSubtarget::initializeEnvironment() { IsISA2_07 = false; IsISA3_0 = false; IsISA3_1 = false; + IsISAFuture = false; UseLongCalls = false; SecurePlt = false; VectorsUseTwoUnits = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 1300b62b623a..3281816eab4a 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -19,7 +19,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -160,6 +160,7 @@ protected: bool IsISA2_07; bool IsISA3_0; bool IsISA3_1; + bool IsISAFuture; bool UseLongCalls; bool SecurePlt; bool VectorsUseTwoUnits; @@ -336,6 +337,7 @@ public: bool isISA2_07() const { return IsISA2_07; } bool isISA3_0() const { return IsISA3_0; } bool isISA3_1() const { return IsISA3_1; } + bool isISAFuture() const { return IsISAFuture; } bool useLongCalls() const { return UseLongCalls; } bool hasFusion() const { return HasFusion; } bool hasStoreFusion() const { return HasStoreFusion; } diff --git 
a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 3eff00fc3c05..fe396cbfc011 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/Localizer.h"
 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
@@ -97,6 +98,13 @@ static cl::opt<bool>
 ReduceCRLogical("ppc-reduce-cr-logicals",
                 cl::desc("Expand eligible cr-logical binary ops to branches"),
                 cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnablePPCGenScalarMASSEntries(
+    "enable-ppc-gen-scalar-mass", cl::init(false),
+    cl::desc("Enable lowering math functions to their corresponding MASS "
+             "(scalar) entries"),
+    cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
@@ -123,8 +131,10 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
   initializePPCTLSDynamicCallPass(PR);
   initializePPCMIPeepholePass(PR);
   initializePPCLowerMASSVEntriesPass(PR);
+  initializePPCGenScalarMASSEntriesPass(PR);
   initializePPCExpandAtomicPseudoPass(PR);
   initializeGlobalISel(PR);
+  initializePPCCTRLoopsPass(PR);
 }
 
 static bool isLittleEndianTriple(const Triple &T) {
@@ -236,10 +246,10 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
 
 static Reloc::Model getEffectiveRelocModel(const Triple &TT,
                                            Optional<Reloc::Model> RM) {
-  assert((!TT.isOSAIX() || !RM.hasValue() || *RM == Reloc::PIC_) &&
+  assert((!TT.isOSAIX() || !RM || *RM == Reloc::PIC_) &&
          "Invalid relocation model for AIX.");
 
-  if (RM.hasValue())
+  if (RM)
     return *RM;
 
   // Big Endian PPC and AIX default to PIC.
@@ -429,6 +439,14 @@ void PPCPassConfig::addIRPasses() {
   // Lower generic MASSV routines to PowerPC subtarget-specific entries.
   addPass(createPPCLowerMASSVEntriesPass());
 
+  // Generate PowerPC target-specific entries for scalar math functions
+  // that are available in the IBM MASS (scalar) library.
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive &&
+      EnablePPCGenScalarMASSEntries) {
+    TM->Options.PPCGenScalarMASSEntries = EnablePPCGenScalarMASSEntries;
+    addPass(createPPCGenScalarMASSEntriesPass());
+  }
+
   // If explicitly requested, add explicit data prefetch intrinsics.
   if (EnablePrefetch.getNumOccurrences() > 0)
     addPass(createLoopDataPrefetchPass());
@@ -522,6 +540,16 @@ void PPCPassConfig::addPreRegAlloc() {
   if (EnableExtraTOCRegDeps)
     addPass(createPPCTOCRegDepsPass());
 
+  // Run the CTR loops pass before the MachinePipeliner pass.
+  // MachinePipeliner will pipeline all instructions before the terminator, but
+  // we don't want DecreaseCTRPseudo to be pipelined.
+  // Note we may lose some MachinePipeliner opportunities if we run the CTR
+  // loops generation pass before MachinePipeliner and the loop is converted
+  // back to a normal loop. We can revisit this later, running PPCCTRLoops
+  // after MachinePipeliner and handling DecreaseCTRPseudo in the
+  // MachinePipeliner pass.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoopsPass());
+
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&MachinePipelinerID);
 }
@@ -549,7 +577,7 @@ void PPCPassConfig::addPreEmitPass2() {
 }
 
 TargetTransformInfo
-PPCTargetMachine::getTargetTransformInfo(const Function &F) {
+PPCTargetMachine::getTargetTransformInfo(const Function &F) const {
   return TargetTransformInfo(PPCTTIImpl(this, F));
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/llvm/lib/Target/PowerPC/PPCTargetMachine.h
index d3fe5362ccdc..bafb79c84942 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -51,7 +51,7 @@ public:
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
 
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
diff --git a/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
index 82fcd9e1c2bc..e3fc6285494c 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -10,6 +10,7 @@
 #define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
 
 namespace llvm {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index cc5738a5d7b6..cf728933c08d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -28,11 +28,6 @@ using namespace llvm;
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
 
-// This is currently only used for the data prefetch pass
-static cl::opt<unsigned>
-CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
-              cl::desc("The loop prefetch cache line size"));
-
 static cl::opt<bool>
 EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                 cl::desc("Enable using coldcc calling conv for cold "
@@ -491,15 +486,13 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
       case Intrinsic::experimental_constrained_sin:
       case Intrinsic::experimental_constrained_cos:
         return true;
-      // There is no corresponding FMA instruction for PPC double double.
-      // Thus, we need to disable CTR loop generation for this type.
-      case Intrinsic::fmuladd:
       case Intrinsic::copysign:
         if (CI->getArgOperand(0)->getType()->getScalarType()->
             isPPC_FP128Ty())
           return true;
         else
           continue; // ISD::FCOPYSIGN is never a library call.
+      case Intrinsic::fmuladd:
       case Intrinsic::fma:                Opcode = ISD::FMA;        break;
       case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
       case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
@@ -903,10 +896,6 @@ PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
 }
 
 unsigned PPCTTIImpl::getCacheLineSize() const {
-  // Check first if the user specified a custom line size.
-  if (CacheLineSize.getNumOccurrences() > 0)
-    return CacheLineSize;
-
   // Starting with P7 we have a cache line size of 128.
   unsigned Directive = ST->getCPUDirective();
   // Assume that Future CPU has the same cache line size as the others.
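With the -ppc-loop-prefetch-cache-line override removed, the line size depends only on the CPU directive. A minimal sketch of the remaining selection, assuming the rest of getTargetTransformInfo's getCacheLineSize (which this hunk does not show) keeps the behavior the surviving comments describe:

    // Sketch only: pre-P7 cores fall back to the generic 64-byte line;
    // P7 and later -- including the "Future" CPU noted above -- use 128.
    static unsigned cacheLineSizeSketch(bool IsPwr7OrLater) {
      return IsPwr7OrLater ? 128u : 64u;
    }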
@@ -1015,7 +1004,8 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, int Index, - Type *SubTp) { + Type *SubTp, + ArrayRef Args) { InstructionCost CostFactor = vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr); @@ -1319,8 +1309,8 @@ bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, return true; } -bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) { +bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) { // PowerPC default behaviour here is "instruction number 1st priority". // If LsrNoInsnsCost is set, call default implementation. if (!LsrNoInsnsCost) diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 0af6f2a308d9..790eb0b42afa 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -76,8 +76,8 @@ public: OptimizationRemarkEmitter *ORE); void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2); + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2); bool isNumRegsMajorCostOfLSR(); bool shouldBuildRelLookupTables() const; /// @} @@ -111,7 +111,8 @@ public: ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp); + ArrayRef Mask, int Index, Type *SubTp, + ArrayRef Args = None); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index ff251f55afff..04fc7667257e 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -519,6 +519,8 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::XXSLDWI: case PPC::XSCVDPSPN: case PPC::XSCVSPDPN: + case PPC::MTVSCR: + case PPC::MFVSCR: break; } } diff --git a/llvm/lib/Target/PowerPC/README_P9.txt b/llvm/lib/Target/PowerPC/README_P9.txt index c9984b7604bd..ee1ea735acad 100644 --- a/llvm/lib/Target/PowerPC/README_P9.txt +++ b/llvm/lib/Target/PowerPC/README_P9.txt @@ -310,7 +310,7 @@ VSX: . I checked existing instruction "XSCMPUDP". They are different in target register. "XSCMPUDP" write to CR field, xscmp*dp write to VSX register - . Use instrinsic: + . Use intrinsic: (set i128:$XT, (int_ppc_vsx_xscmpeqdp f64:$XA, f64:$XB)) (set i128:$XT, (int_ppc_vsx_xscmpgedp f64:$XA, f64:$XB)) (set i128:$XT, (int_ppc_vsx_xscmpgtdp f64:$XA, f64:$XB)) @@ -322,7 +322,7 @@ VSX: "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; - . So we should use "XX3Form_Rcr" to implement instrinsic + . So we should use "XX3Form_Rcr" to implement intrinsic - Convert DP -> QP: xscvdpqp . Similar to XSCVDPSP: @@ -579,11 +579,6 @@ Atomic operations (l[dw]at, st[dw]at): - Provide builtins since not all FC's necessarily have an existing LLVM atomic operation -Load Doubleword Monitored (ldmx): -- Investigate whether there are any uses for this. 
It seems to be related to - Garbage Collection so it isn't likely to be all that useful for most - languages we deal with. - Move to CR from XER Extended (mcrxrx): - Is there a use for this in LLVM? diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 01f36e6dcdd2..69fb9d2844d3 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" @@ -161,7 +162,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseRegister(OperandVector &Operands, bool AllowParens = false); OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands); - OperandMatchResultTy parseAtomicMemOp(OperandVector &Operands); + OperandMatchResultTy parseZeroOffsetMemOp(OperandVector &Operands); OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands); OperandMatchResultTy parseBareSymbol(OperandVector &Operands); OperandMatchResultTy parseCallSymbol(OperandVector &Operands); @@ -170,6 +171,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseVTypeI(OperandVector &Operands); OperandMatchResultTy parseMaskReg(OperandVector &Operands); OperandMatchResultTy parseInsnDirectiveOpcode(OperandVector &Operands); + OperandMatchResultTy parseGPRAsFPR(OperandVector &Operands); bool parseOperand(OperandVector &Operands, StringRef Mnemonic); @@ -254,6 +256,11 @@ public: "target-abi)\n"; } + // Use computeTargetABI to check if ABIName is valid. If invalid, output + // error message. + RISCVABI::computeTargetABI(STI.getTargetTriple(), STI.getFeatureBits(), + ABIName); + const MCObjectFileInfo *MOFI = Parser.getContext().getObjectFileInfo(); ParserOptions.IsPicEnabled = MOFI->isPositionIndependent(); } @@ -273,6 +280,8 @@ struct RISCVOperand : public MCParsedAsmOperand { bool IsRV64; + bool IsGPRAsFPR; + struct RegOp { MCRegister RegNum; }; @@ -343,6 +352,14 @@ public: RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); } + bool isGPRAsFPR() const { return isGPR() && IsGPRAsFPR; } + + bool isGPRF64AsFPR() const { return isGPR() && IsGPRAsFPR && IsRV64; } + + bool isGPRPF64AsFPR() const { + return isGPR() && IsGPRAsFPR && !IsRV64 && !((Reg.RegNum - RISCV::X0) & 1); + } + static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { if (auto *RE = dyn_cast(Expr)) { @@ -447,8 +464,16 @@ public: bool isFenceArg() const { if (!isImm()) return false; - const MCExpr *Val = getImm(); - auto *SVal = dyn_cast(Val); + + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + if (evaluateConstantImm(getImm(), Imm, VK)) { + // Only accept 0 as a constant immediate. 
+      return VK == RISCVMCExpr::VK_RISCV_None && Imm == 0;
+    }
+
+    auto *SVal = dyn_cast<MCSymbolRefExpr>(getImm());
+
     if (!SVal || SVal->getKind() != MCSymbolRefExpr::VK_None)
       return false;
 
@@ -530,41 +555,19 @@ public:
     return (isRV64() && isUInt<5>(Imm)) || isUInt<4>(Imm);
   }
 
-  bool isUImm2() const {
+  template <unsigned N> bool IsUImm() const {
     int64_t Imm;
     RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
     if (!isImm())
       return false;
     bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    return IsConstantImm && isUInt<2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+    return IsConstantImm && isUInt<N>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
   }
 
-  bool isUImm3() const {
-    int64_t Imm;
-    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
-    if (!isImm())
-      return false;
-    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    return IsConstantImm && isUInt<3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
-  }
-
-  bool isUImm5() const {
-    int64_t Imm;
-    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
-    if (!isImm())
-      return false;
-    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    return IsConstantImm && isUInt<5>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
-  }
-
-  bool isUImm7() const {
-    int64_t Imm;
-    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
-    if (!isImm())
-      return false;
-    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    return IsConstantImm && isUInt<7>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
-  }
+  bool isUImm2() { return IsUImm<2>(); }
+  bool isUImm3() { return IsUImm<3>(); }
+  bool isUImm5() { return IsUImm<5>(); }
+  bool isUImm7() { return IsUImm<7>(); }
 
   bool isRnumArg() const {
     int64_t Imm;
@@ -686,6 +689,16 @@ public:
 
   bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); }
 
+  bool isSImm12Lsb00000() const {
+    if (!isImm())
+      return false;
+    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+    int64_t Imm;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    return IsConstantImm && isShiftedInt<7, 5>(Imm) &&
+           VK == RISCVMCExpr::VK_RISCV_None;
+  }
+
   bool isSImm13Lsb0() const { return isBareSimmNLsb0<13>(); }
 
   bool isSImm10Lsb0000NonZero() const {
@@ -831,12 +844,14 @@ public:
   }
 
   static std::unique_ptr<RISCVOperand> createReg(unsigned RegNo, SMLoc S,
-                                                 SMLoc E, bool IsRV64) {
+                                                 SMLoc E, bool IsRV64,
+                                                 bool IsGPRAsFPR = false) {
     auto Op = std::make_unique<RISCVOperand>(KindTy::Register);
     Op->Reg.RegNum = RegNo;
     Op->StartLoc = S;
     Op->EndLoc = E;
     Op->IsRV64 = IsRV64;
+    Op->IsGPRAsFPR = IsGPRAsFPR;
     return Op;
   }
 
@@ -897,6 +912,17 @@ public:
 
   void addFenceArgOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+
+    int64_t Constant = 0;
+    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+    if (evaluateConstantImm(getImm(), Constant, VK)) {
+      if (Constant == 0) {
+        Inst.addOperand(MCOperand::createImm(Constant));
+        return;
+      }
+      llvm_unreachable("FenceArg must contain only [iorw] or be 0");
+    }
+
     // isFenceArg has validated the operand, meaning this cast is safe
     auto SE = cast<MCSymbolRefExpr>(getImm());
 
@@ -904,7 +930,7 @@ public:
     for (char c : SE->getSymbol().getName()) {
       switch (c) {
       default:
-        llvm_unreachable("FenceArg must contain only [iorw]");
+        llvm_unreachable("FenceArg must contain only [iorw] or be 0");
       case 'i':
         Imm |= RISCVFenceField::I;
         break;
@@ -1182,6 +1208,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, -(1 << 11), (1 << 11) - 2,
         "immediate must be a multiple of 2 bytes in the range");
+  case Match_InvalidSImm12Lsb00000:
return generateImmOutOfRangeError( + Operands, ErrorInfo, -(1 << 11), (1 << 11) - 32, + "immediate must be a multiple of 32 bytes in the range"); case Match_InvalidSImm13Lsb0: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2, @@ -1208,9 +1238,8 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } case Match_InvalidFenceArg: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error( - ErrorLoc, - "operand must be formed of letters selected in-order from 'iorw'"); + return Error(ErrorLoc, "operand must be formed of letters selected " + "in-order from 'iorw' or be 0"); } case Match_InvalidFRMArg: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); @@ -1594,9 +1623,11 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { return MatchOperand_Success; case AsmToken::Plus: Opcode = MCBinaryExpr::Add; + getLexer().Lex(); break; case AsmToken::Minus: Opcode = MCBinaryExpr::Sub; + getLexer().Lex(); break; } @@ -1737,9 +1768,7 @@ OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) { else goto MatchFail; - unsigned LmulLog2 = Log2_32(Lmul); - RISCVII::VLMUL VLMUL = - static_cast(Fractional ? 8 - LmulLog2 : LmulLog2); + RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional); unsigned VTypeI = RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic); @@ -1780,6 +1809,26 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy RISCVAsmParser::parseGPRAsFPR(OperandVector &Operands) { + switch (getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::Identifier: + StringRef Name = getLexer().getTok().getIdentifier(); + MCRegister RegNo; + matchRegisterNameHelper(isRV32E(), RegNo, Name); + + if (RegNo == RISCV::NoRegister) + return MatchOperand_NoMatch; + SMLoc S = getLoc(); + SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + getLexer().Lex(); + Operands.push_back(RISCVOperand::createReg( + RegNo, S, E, isRV64(), !getSTI().hasFeature(RISCV::FeatureStdExtF))); + } + return MatchOperand_Success; +} + OperandMatchResultTy RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) { if (getLexer().isNot(AsmToken::LParen)) { @@ -1806,7 +1855,8 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) { return MatchOperand_Success; } -OperandMatchResultTy RISCVAsmParser::parseAtomicMemOp(OperandVector &Operands) { +OperandMatchResultTy +RISCVAsmParser::parseZeroOffsetMemOp(OperandVector &Operands) { // Atomic operations such as lr.w, sc.w, and amo*.w accept a "memory operand" // as one of their register operands, such as `(a0)`. This just denotes that // the register (in this case `a0`) contains a memory address. @@ -1822,9 +1872,9 @@ OperandMatchResultTy RISCVAsmParser::parseAtomicMemOp(OperandVector &Operands) { // offset if it is zero; require (and discard) parentheses; and add only the // parsed register operand to `Operands`. // - // These operands are printed with RISCVInstPrinter::printAtomicMemOp, which - // will only print the register surrounded by parentheses (which GNU as also - // uses as its canonical representation for these operands). + // These operands are printed with RISCVInstPrinter::printZeroOffsetMemOp, + // which will only print the register surrounded by parentheses (which GNU as + // also uses as its canonical representation for these operands). 
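+  // For example, `lr.w a1, (a0)` and `lr.w a1, 0(a0)` parse to the same
+  // single register operand, while a nonzero offset such as `lr.w a1, 4(a0)`
+  // is diagnosed as an error.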
std::unique_ptr OptionalImmOp; if (getLexer().isNot(AsmToken::LParen)) { @@ -1935,7 +1985,6 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info, return true; // Parse until end of statement, consuming commas between operands - unsigned OperandIdx = 1; while (getLexer().is(AsmToken::Comma)) { // Consume comma token getLexer().Lex(); @@ -1943,8 +1992,6 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info, // Parse next operand if (parseOperand(Operands, Name)) return true; - - ++OperandIdx; } if (getLexer().isNot(AsmToken::EndOfStatement)) { @@ -2120,11 +2167,11 @@ bool RISCVAsmParser::parseDirectiveAttribute() { StringRef Name = Parser.getTok().getIdentifier(); Optional Ret = ELFAttrs::attrTypeFromString(Name, RISCVAttrs::getRISCVAttributeTags()); - if (!Ret.hasValue()) { + if (!Ret) { Error(TagLoc, "attribute name not recognised: " + Name); return false; } - Tag = Ret.getValue(); + Tag = *Ret; Parser.Lex(); } else { const MCExpr *AttrExpr; @@ -2170,8 +2217,7 @@ bool RISCVAsmParser::parseDirectiveAttribute() { Parser.Lex(); } - if (Parser.parseToken(AsmToken::EndOfStatement, - "unexpected token in '.attribute' directive")) + if (Parser.parseEOL()) return true; if (IsIntegerValue) @@ -2263,23 +2309,26 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value, MCRegister SrcReg = RISCV::X0; for (RISCVMatInt::Inst &Inst : Seq) { - if (Inst.Opc == RISCV::LUI) { + switch (Inst.getOpndKind()) { + case RISCVMatInt::Imm: + emitToStreamer(Out, + MCInstBuilder(Inst.Opc).addReg(DestReg).addImm(Inst.Imm)); + break; + case RISCVMatInt::RegX0: emitToStreamer( - Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm)); - } else if (Inst.Opc == RISCV::ADD_UW) { - emitToStreamer(Out, MCInstBuilder(RISCV::ADD_UW) - .addReg(DestReg) - .addReg(SrcReg) - .addReg(RISCV::X0)); - } else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD || - Inst.Opc == RISCV::SH3ADD) { + Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addReg( + RISCV::X0)); + break; + case RISCVMatInt::RegReg: emitToStreamer( Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addReg( SrcReg)); - } else { + break; + case RISCVMatInt::RegImm: emitToStreamer( Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm( Inst.Imm)); + break; } // Only the first instruction has X0 as its source. 
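The rewritten emitLoadImm loop above dispatches on each materialization step's operand kind rather than on specific opcodes. A compact C++ sketch of that shape, with illustrative stand-in types (this is not the real RISCVMatInt API):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    enum class OpndKind { Imm, RegImm, RegReg, RegX0 };
    struct Step { const char *Mnemonic; OpndKind Kind; int64_t Imm; };

    // Emit a load-immediate sequence into a0; only the first step reads x0.
    static void emitSequenceSketch(const std::vector<Step> &Seq) {
      const char *Src = "x0";
      for (const Step &S : Seq) {
        switch (S.Kind) {
        case OpndKind::Imm:     // e.g. lui: destination and immediate only
          std::printf("%s a0, %lld\n", S.Mnemonic, (long long)S.Imm);
          break;
        case OpndKind::RegImm:  // e.g. addiw/slli: register plus immediate
          std::printf("%s a0, %s, %lld\n", S.Mnemonic, Src, (long long)S.Imm);
          break;
        case OpndKind::RegReg:  // e.g. sh1add: same register twice
          std::printf("%s a0, %s, %s\n", S.Mnemonic, Src, Src);
          break;
        case OpndKind::RegX0:   // e.g. add.uw: second source pinned to x0
          std::printf("%s a0, %s, x0\n", S.Mnemonic, Src);
          break;
        }
        Src = "a0"; // every later step reads the partial result
      }
    }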
@@ -2541,8 +2590,7 @@ bool RISCVAsmParser::validateInstruction(MCInst &Inst, } const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); - RISCVII::VConstraintType Constraints = - RISCVII::getConstraint(MCID.TSFlags); + RISCVII::VConstraintType Constraints = RISCVII::getConstraint(MCID.TSFlags); if (Constraints == RISCVII::NoConstraint) return false; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index ff96b2b254ca..1c732a15de2f 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -14,8 +14,8 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -60,11 +60,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() { static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { const FeatureBitset &FeatureBits = - static_cast(Decoder) - ->getSubtargetInfo() - .getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool IsRV32E = FeatureBits[RISCV::FeatureRV32E]; if (RegNo >= 32 || (IsRV32E && RegNo >= 16)) @@ -77,7 +75,7 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -88,7 +86,7 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -99,7 +97,7 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 8) { return MCDisassembler::Fail; } @@ -110,7 +108,7 @@ static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -121,7 +119,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 8) { return MCDisassembler::Fail; } @@ -132,7 +130,7 @@ static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo == 0) { return MCDisassembler::Fail; } @@ -140,9 +138,9 @@ static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint64_t RegNo, return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { 
+static DecodeStatus +DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo == 2) { return MCDisassembler::Fail; } @@ -152,7 +150,7 @@ static DecodeStatus DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 8) return MCDisassembler::Fail; @@ -161,9 +159,20 @@ static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeGPRPF64RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32 || RegNo & 1) + return MCDisassembler::Fail; + + MCRegister Reg = RISCV::X0 + RegNo; + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeVRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -174,7 +183,7 @@ static DecodeStatus DecodeVRRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeVRM2RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -194,7 +203,7 @@ static DecodeStatus DecodeVRM2RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeVRM4RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -214,7 +223,7 @@ static DecodeStatus DecodeVRM4RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo >= 32) return MCDisassembler::Fail; @@ -233,7 +242,8 @@ static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint64_t RegNo, } static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { MCRegister Reg = RISCV::NoRegister; switch (RegNo) { default: @@ -250,7 +260,8 @@ static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo, // Add implied SP operand for instructions *SP compressed instructions. The SP // operand isn't explicitly encoded in the instruction. 
-static void addImplySP(MCInst &Inst, int64_t Address, const void *Decoder) {
+static void addImplySP(MCInst &Inst, int64_t Address,
+                       const MCDisassembler *Decoder) {
   if (Inst.getOpcode() == RISCV::C_LWSP || Inst.getOpcode() == RISCV::C_SWSP ||
       Inst.getOpcode() == RISCV::C_LDSP || Inst.getOpcode() == RISCV::C_SDSP ||
       Inst.getOpcode() == RISCV::C_FLWSP ||
@@ -268,7 +279,8 @@ static void addImplySP(MCInst &Inst, int64_t Address, const void *Decoder) {
 
 template <unsigned N>
 static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
-                                      int64_t Address, const void *Decoder) {
+                                      int64_t Address,
+                                      const MCDisassembler *Decoder) {
   assert(isUInt<N>(Imm) && "Invalid immediate");
   addImplySP(Inst, Address, Decoder);
   Inst.addOperand(MCOperand::createImm(Imm));
@@ -278,7 +290,7 @@ static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
 template <unsigned N>
 static DecodeStatus decodeUImmNonZeroOperand(MCInst &Inst, uint64_t Imm,
                                              int64_t Address,
-                                             const void *Decoder) {
+                                             const MCDisassembler *Decoder) {
   if (Imm == 0)
     return MCDisassembler::Fail;
   return decodeUImmOperand<N>(Inst, Imm, Address, Decoder);
@@ -286,7 +298,8 @@ static DecodeStatus decodeUImmNonZeroOperand(MCInst &Inst, uint64_t Imm,
 
 template <unsigned N>
 static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
-                                      int64_t Address, const void *Decoder) {
+                                      int64_t Address,
+                                      const MCDisassembler *Decoder) {
   assert(isUInt<N>(Imm) && "Invalid immediate");
   addImplySP(Inst, Address, Decoder);
   // Sign-extend the number in the bottom N bits of Imm
@@ -297,7 +310,7 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
 template <unsigned N>
 static DecodeStatus decodeSImmNonZeroOperand(MCInst &Inst, uint64_t Imm,
                                              int64_t Address,
-                                             const void *Decoder) {
+                                             const MCDisassembler *Decoder) {
   if (Imm == 0)
     return MCDisassembler::Fail;
   return decodeSImmOperand<N>(Inst, Imm, Address, Decoder);
@@ -306,7 +319,7 @@ static DecodeStatus decodeSImmNonZeroOperand(MCInst &Inst, uint64_t Imm,
 
 template <unsigned N>
 static DecodeStatus decodeSImmOperandAndLsl1(MCInst &Inst, uint64_t Imm,
                                              int64_t Address,
-                                             const void *Decoder) {
+                                             const MCDisassembler *Decoder) {
   assert(isUInt<N>(Imm) && "Invalid immediate");
   // Sign-extend the number in the bottom N bits of Imm after accounting for
   // the fact that the N bit immediate is stored in N-1 bits (the LSB is
@@ -317,7 +330,7 @@ static DecodeStatus decodeSImmOperandAndLsl1(MCInst &Inst, uint64_t Imm,
 
 static DecodeStatus decodeCLUIImmOperand(MCInst &Inst, uint64_t Imm,
                                          int64_t Address,
-                                         const void *Decoder) {
+                                         const MCDisassembler *Decoder) {
   assert(isUInt<6>(Imm) && "Invalid immediate");
   if (Imm > 31) {
     Imm = (SignExtend64<6>(Imm) & 0xfffff);
@@ -326,9 +339,8 @@ static DecodeStatus decodeCLUIImmOperand(MCInst &Inst, uint64_t Imm,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm,
-                                 int64_t Address,
-                                 const void *Decoder) {
+static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm, int64_t Address,
+                                 const MCDisassembler *Decoder) {
   assert(isUInt<3>(Imm) && "Invalid immediate");
   if (!llvm::RISCVFPRndMode::isValidRoundingMode(Imm))
     return MCDisassembler::Fail;
@@ -338,26 +350,30 @@ static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm,
 }
 
 static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn,
-                                       uint64_t Address, const void *Decoder);
+                                       uint64_t Address,
+                                       const MCDisassembler *Decoder);
 
 static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address, const void *Decoder);
+                                         uint64_t Address,
+                                         const MCDisassembler *Decoder);
 
 static DecodeStatus
decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); #include "RISCVGenDisassemblerTables.inc" static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { uint64_t SImm6 = fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); @@ -368,7 +384,7 @@ static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeGPRRegisterClass(Inst, 0, Address, Decoder); uint64_t SImm6 = fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); @@ -380,7 +396,7 @@ static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { DecodeGPRRegisterClass(Inst, 0, Address, Decoder); Inst.addOperand(Inst.getOperand(0)); uint64_t UImm6 = @@ -392,7 +408,8 @@ static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, } static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(Insn, 7, 5); unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); @@ -402,7 +419,7 @@ static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(Insn, 7, 5); unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); @@ -427,6 +444,27 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } Insn = support::endian::read32le(Bytes.data()); + if (STI.getFeatureBits()[RISCV::FeatureStdExtZdinx] && + !STI.getFeatureBits()[RISCV::Feature64Bit]) { + LLVM_DEBUG(dbgs() << "Trying RV32Zdinx table (Double in Integer and" + "rv32)\n"); + Result = decodeInstruction(DecoderTableRV32Zdinx32, MI, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (STI.getFeatureBits()[RISCV::FeatureStdExtZfinx]) { + LLVM_DEBUG(dbgs() << "Trying RVZfinx table (Float in Integer):\n"); + Result = decodeInstruction(DecoderTableRVZfinx32, MI, Insn, Address, this, + STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } LLVM_DEBUG(dbgs() << "Trying RISCV32 table :\n"); Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI); Size = 4; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 514789b3f645..a494adf8e210 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -583,16 +583,17 @@ void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign( const MCAlignFragment &AF, unsigned &Size) { // Calculate Nops Size only when linker relaxation enabled. - if (!STI.getFeatureBits()[RISCV::FeatureRelax]) + const MCSubtargetInfo *STI = AF.getSubtargetInfo(); + if (!STI->getFeatureBits()[RISCV::FeatureRelax]) return false; - bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC]; + bool HasStdExtC = STI->getFeatureBits()[RISCV::FeatureStdExtC]; unsigned MinNopLen = HasStdExtC ? 2 : 4; if (AF.getAlignment() <= MinNopLen) { return false; } else { - Size = AF.getAlignment() - MinNopLen; + Size = AF.getAlignment().value() - MinNopLen; return true; } } @@ -606,7 +607,8 @@ bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, const MCAsmLayout &Layout, MCAlignFragment &AF) { // Insert the fixup only when linker relaxation enabled. - if (!STI.getFeatureBits()[RISCV::FeatureRelax]) + const MCSubtargetInfo *STI = AF.getSubtargetInfo(); + if (!STI->getFeatureBits()[RISCV::FeatureRelax]) return false; // Calculate total Nops we need to insert. If there are none to insert diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index f04d2912f09d..5d62c3a8b0df 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -27,18 +27,15 @@ class RISCVAsmBackend : public MCAsmBackend { bool Is64Bit; bool ForceRelocs = false; const MCTargetOptions &TargetOptions; - RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; public: RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, const MCTargetOptions &Options) : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), Is64Bit(Is64Bit), TargetOptions(Options) { - TargetABI = RISCVABI::computeTargetABI( - STI.getTargetTriple(), STI.getFeatureBits(), Options.getABIName()); RISCVFeatures::validate(STI.getTargetTriple(), STI.getFeatureBits()); } - ~RISCVAsmBackend() override {} + ~RISCVAsmBackend() override = default; void setForceRelocs() { ForceRelocs = true; } @@ -103,7 +100,6 @@ public: const MCSubtargetInfo *STI) const override; const MCTargetOptions &getTargetOptions() const { return TargetOptions; } - RISCVABI::ABI getTargetABI() const { return TargetABI; } }; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 144e761f002d..9b69170d1c4a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/RISCVISAInfo.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -61,15 +62,11 @@ ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits, if (TargetABI != ABI_Unknown) return TargetABI; - // For now, default to the ilp32/ilp32e/lp64 ABI if no explicit ABI is given - // or an invalid/unrecognised string is given. In the future, it might be - // worth changing this to default to ilp32f/lp64f and ilp32d/lp64d when - // hardware support for floating point is present. - if (IsRV32E) - return ABI_ILP32E; - if (IsRV64) - return ABI_LP64; - return ABI_ILP32; + // If no explicit ABI is given, try to compute the default ABI. 
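For the alignment change above: with linker relaxation enabled, the backend reserves worst-case nop padding and lets the linker delete what it does not need. A minimal model of the size computation, assuming only that the smallest nop is 2 bytes with the C extension and 4 bytes without it:

#include <cassert>

// Sketch of shouldInsertExtraNopBytesForCodeAlign's size computation. The
// fragment's own subtarget now decides whether 2-byte compressed nops exist.
static unsigned maxNopPadding(unsigned AlignValue, bool HasStdExtC) {
  unsigned MinNopLen = HasStdExtC ? 2 : 4;
  return AlignValue <= MinNopLen ? 0 : AlignValue - MinNopLen;
}

int main() {
  assert(maxNopPadding(16, /*HasStdExtC=*/true) == 14);
  assert(maxNopPadding(16, /*HasStdExtC=*/false) == 12);
  assert(maxNopPadding(2, /*HasStdExtC=*/true) == 0); // already satisfied
  return 0;
}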
+ auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits); + if (!ISAInfo) + report_fatal_error(ISAInfo.takeError()); + return getTargetABI((*ISAInfo)->computeDefaultABI()); } ABI getTargetABI(StringRef ABIName) { @@ -106,13 +103,17 @@ void validate(const Triple &TT, const FeatureBitset &FeatureBits) { report_fatal_error("RV32E can't be enabled for an RV64 target"); } -void toFeatureVector(std::vector<std::string> &FeatureVector, - const FeatureBitset &FeatureBits) { +llvm::Expected<std::unique_ptr<RISCVISAInfo>> +parseFeatureBits(bool IsRV64, const FeatureBitset &FeatureBits) { + unsigned XLen = IsRV64 ? 64 : 32; + std::vector<std::string> FeatureVector; + // Convert FeatureBitset to FeatureVector. for (auto Feature : RISCVFeatureKV) { if (FeatureBits[Feature.Value] && llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key)) FeatureVector.push_back(std::string("+") + Feature.Key); } + return llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector); } } // namespace RISCVFeatures @@ -130,7 +131,7 @@ unsigned RISCVVType::encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW, bool TailAgnostic, bool MaskAgnostic) { assert(isValidSEW(SEW) && "Invalid SEW"); unsigned VLMULBits = static_cast<unsigned>(VLMUL); - unsigned VSEWBits = Log2_32(SEW) - 3; + unsigned VSEWBits = encodeSEW(SEW); unsigned VTypeI = (VSEWBits << 3) | (VLMULBits & 0x7); if (TailAgnostic) VTypeI |= 0x40; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 01c6bd90ea58..fa408f7fc5d7 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/RISCVISAInfo.h" namespace llvm { @@ -87,9 +88,16 @@ enum { // Pseudos. IsRVVWideningReductionShift = HasVecPolicyOpShift + 1, IsRVVWideningReductionMask = 1 << IsRVVWideningReductionShift, + + // Does this instruction care about mask policy? If not, the mask policy + // could be either agnostic or undisturbed. For example, the results of + // unmasked, store, and reduction operations are not affected by mask policy, + // so the compiler is free to select either one. + UsesMaskPolicyShift = IsRVVWideningReductionShift + 1, + UsesMaskPolicyMask = 1 << UsesMaskPolicyShift, }; -// Match with the definitions in RISCVInstrFormatsV.td +// Match with the definitions in RISCVInstrFormats.td enum VConstraintType { NoConstraint = 0, VS2Constraint = 0b001, @@ -109,8 +117,8 @@ enum VLMUL : uint8_t { }; enum { - TAIL_UNDISTURBED = 0, TAIL_AGNOSTIC = 1, + MASK_AGNOSTIC = 2, }; // Helper functions to read TSFlags. @@ -120,8 +128,8 @@ static inline unsigned getFormat(uint64_t TSFlags) { } /// \returns the constraint for the instruction. static inline VConstraintType getConstraint(uint64_t TSFlags) { - return static_cast<VConstraintType> ((TSFlags & ConstraintMask) >> ConstraintShift); + return static_cast<VConstraintType>((TSFlags & ConstraintMask) >> + ConstraintShift); } /// \returns the LMUL for the instruction. static inline VLMUL getLMul(uint64_t TSFlags) { @@ -155,6 +163,30 @@ static inline bool hasVecPolicyOp(uint64_t TSFlags) { static inline bool isRVVWideningReduction(uint64_t TSFlags) { return TSFlags & IsRVVWideningReductionMask; } +/// \returns true if mask policy is valid for the instruction.
+static inline bool usesMaskPolicy(uint64_t TSFlags) { + return TSFlags & UsesMaskPolicyMask; +} + +static inline unsigned getVLOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + // This method is only called if we expect to have a VL operand, and all + // instructions with VL also have SEW. + assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags)); + unsigned Offset = 2; + if (hasVecPolicyOp(TSFlags)) + Offset = 3; + return Desc.getNumOperands() - Offset; +} + +static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + assert(hasSEWOp(TSFlags)); + unsigned Offset = 1; + if (hasVecPolicyOp(TSFlags)) + Offset = 2; + return Desc.getNumOperands() - Offset; +} // RISC-V Specific Machine Operand Flags enum { @@ -189,6 +221,7 @@ enum OperandType : unsigned { OPERAND_UIMM7, OPERAND_UIMM12, OPERAND_SIMM12, + OPERAND_SIMM12_LSB00000, OPERAND_UIMM20, OPERAND_UIMMLOG2XLEN, OPERAND_RVKRNUM, @@ -344,9 +377,8 @@ namespace RISCVFeatures { // triple. Exits with report_fatal_error if not. void validate(const Triple &TT, const FeatureBitset &FeatureBits); -// Convert FeatureBitset to FeatureVector. -void toFeatureVector(std::vector &FeatureVector, - const FeatureBitset &FeatureBits); +llvm::Expected> +parseFeatureBits(bool IsRV64, const FeatureBitset &FeatureBits); } // namespace RISCVFeatures @@ -372,11 +404,22 @@ inline static RISCVII::VLMUL getVLMUL(unsigned VType) { // Decode VLMUL into 1,2,4,8 and fractional indicator. std::pair decodeVLMUL(RISCVII::VLMUL VLMUL); +inline static RISCVII::VLMUL encodeLMUL(unsigned LMUL, bool Fractional) { + assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL"); + unsigned LmulLog2 = Log2_32(LMUL); + return static_cast(Fractional ? 8 - LmulLog2 : LmulLog2); +} + inline static unsigned decodeVSEW(unsigned VSEW) { assert(VSEW < 8 && "Unexpected VSEW value"); return 1 << (VSEW + 3); } +inline static unsigned encodeSEW(unsigned SEW) { + assert(isValidSEW(SEW) && "Unexpected SEW value"); + return Log2_32(SEW) - 3; +} + inline static unsigned getSEW(unsigned VType) { unsigned VSEW = (VType >> 3) & 0x7; return decodeVSEW(VSEW); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index fb1ce19d73bc..0c362c57e5c0 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -43,7 +43,7 @@ RISCVELFObjectWriter::RISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_RISCV, /*HasRelocationAddend*/ true) {} -RISCVELFObjectWriter::~RISCVELFObjectWriter() {} +RISCVELFObjectWriter::~RISCVELFObjectWriter() = default; unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index d88ba9e4ac72..c5f8a42bab6a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -16,6 +16,7 @@ #include "RISCVMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectWriter.h" @@ -30,38 +31,12 @@ using namespace llvm; // This part is for ELF object output. 
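The vtype helpers above are all small bit manipulations: encodeSEW maps SEW to log2(SEW) - 3, encodeLMUL folds fractional LMULs into the high bit, and encodeVTYPE packs them together with the policy bits. A self-contained sketch of the packing, mirroring the shifts and masks shown above rather than calling the LLVM helpers:

#include <cassert>

// vtype layout per the code above: bits [2:0] vlmul, bits [5:3] vsew,
// bit 6 tail-agnostic (ta), bit 7 mask-agnostic (ma).
static unsigned encodeVTypeSketch(unsigned SEW, unsigned VLMULBits,
                                  bool TailAgnostic, bool MaskAgnostic) {
  unsigned VSEWBits = 0;
  while ((1u << (VSEWBits + 3)) < SEW) // Log2_32(SEW) - 3, without LLVM deps
    ++VSEWBits;
  unsigned VType = (VSEWBits << 3) | (VLMULBits & 0x7);
  if (TailAgnostic)
    VType |= 0x40;
  if (MaskAgnostic)
    VType |= 0x80;
  return VType;
}

int main() {
  // SEW=32 (vsew=2), LMUL=1 (vlmul=0), ta and ma set: 0x80|0x40|0x10 == 0xd0.
  assert(encodeVTypeSketch(32, 0, true, true) == 0xd0);
  return 0;
}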
RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) - : RISCVTargetStreamer(S), CurrentVendor("riscv") { + : RISCVTargetStreamer(S), CurrentVendor("riscv"), STI(STI) { MCAssembler &MCA = getStreamer().getAssembler(); const FeatureBitset &Features = STI.getFeatureBits(); auto &MAB = static_cast(MCA.getBackend()); - RISCVABI::ABI ABI = MAB.getTargetABI(); - assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); - - unsigned EFlags = MCA.getELFHeaderEFlags(); - - if (Features[RISCV::FeatureStdExtC]) - EFlags |= ELF::EF_RISCV_RVC; - - switch (ABI) { - case RISCVABI::ABI_ILP32: - case RISCVABI::ABI_LP64: - break; - case RISCVABI::ABI_ILP32F: - case RISCVABI::ABI_LP64F: - EFlags |= ELF::EF_RISCV_FLOAT_ABI_SINGLE; - break; - case RISCVABI::ABI_ILP32D: - case RISCVABI::ABI_LP64D: - EFlags |= ELF::EF_RISCV_FLOAT_ABI_DOUBLE; - break; - case RISCVABI::ABI_ILP32E: - EFlags |= ELF::EF_RISCV_RVE; - break; - case RISCVABI::ABI_Unknown: - llvm_unreachable("Improperly initialised target ABI"); - } - - MCA.setELFHeaderEFlags(EFlags); + setTargetABI(RISCVABI::computeTargetABI(STI.getTargetTriple(), Features, + MAB.getTargetOptions().getABIName())); } MCELFStreamer &RISCVTargetELFStreamer::getStreamer() { @@ -98,12 +73,12 @@ void RISCVTargetELFStreamer::finishAttributeSection() { return; if (AttributeSection) { - Streamer.SwitchSection(AttributeSection); + Streamer.switchSection(AttributeSection); } else { MCAssembler &MCA = getStreamer().getAssembler(); AttributeSection = MCA.getContext().getELFSection( ".riscv.attributes", ELF::SHT_RISCV_ATTRIBUTES, 0); - Streamer.SwitchSection(AttributeSection); + Streamer.switchSection(AttributeSection); Streamer.emitInt8(ELFAttrs::Format_Version); } @@ -172,6 +147,44 @@ size_t RISCVTargetELFStreamer::calculateContentSize() const { return Result; } +void RISCVTargetELFStreamer::finish() { + RISCVTargetStreamer::finish(); + MCAssembler &MCA = getStreamer().getAssembler(); + const FeatureBitset &Features = STI.getFeatureBits(); + RISCVABI::ABI ABI = getTargetABI(); + + unsigned EFlags = MCA.getELFHeaderEFlags(); + + if (Features[RISCV::FeatureStdExtC]) + EFlags |= ELF::EF_RISCV_RVC; + + switch (ABI) { + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_LP64: + break; + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_LP64F: + EFlags |= ELF::EF_RISCV_FLOAT_ABI_SINGLE; + break; + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64D: + EFlags |= ELF::EF_RISCV_FLOAT_ABI_DOUBLE; + break; + case RISCVABI::ABI_ILP32E: + EFlags |= ELF::EF_RISCV_RVE; + break; + case RISCVABI::ABI_Unknown: + llvm_unreachable("Improperly initialised target ABI"); + } + + MCA.setELFHeaderEFlags(EFlags); +} + +void RISCVTargetELFStreamer::reset() { + AttributeSection = nullptr; + Contents.clear(); +} + namespace { class RISCVELFStreamer : public MCELFStreamer { static std::pair getRelocPairForSize(unsigned Size) { @@ -194,6 +207,14 @@ class RISCVELFStreamer : public MCELFStreamer { static bool requiresFixups(MCContext &C, const MCExpr *Value, const MCExpr *&LHS, const MCExpr *&RHS) { + auto IsMetadataOrEHFrameSection = [](const MCSection &S) -> bool { + // Additionally check .apple_names/.apple_types. They are fixed-size and + // do not need fixups. llvm-dwarfdump --apple-names does not process + // R_RISCV_{ADD,SUB}32 in them. 
+ return S.getKind().isMetadata() || S.getName() == ".eh_frame" || + S.getName() == ".apple_names" || S.getName() == ".apple_types"; + }; + const auto *MBE = dyn_cast(Value); if (MBE == nullptr) return false; @@ -212,10 +233,20 @@ class RISCVELFStreamer : public MCELFStreamer { MCConstantExpr::create(E.getConstant(), C), C); RHS = E.getSymB(); - return (A.isInSection() ? A.getSection().hasInstructions() - : !A.getName().empty()) || - (B.isInSection() ? B.getSection().hasInstructions() - : !B.getName().empty()); + // TODO: when available, R_RISCV_n_PCREL should be preferred. + + // Avoid pairwise relocations for symbolic difference in debug and .eh_frame + if (A.isInSection()) + return !IsMetadataOrEHFrameSection(A.getSection()); + if (B.isInSection()) + return !IsMetadataOrEHFrameSection(B.getSection()); + // as well as for absolute symbols. + return !A.getName().empty() || !B.getName().empty(); + } + + void reset() override { + static_cast(getTargetStreamer())->reset(); + MCELFStreamer::reset(); } public: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index 7ce7dafb8ca1..7ca2f5ab5623 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -29,6 +29,7 @@ private: SmallVector Contents; MCSection *AttributeSection = nullptr; + const MCSubtargetInfo &STI; AttributeItem *getAttributeItem(unsigned Attribute) { for (size_t i = 0; i < Contents.size(); ++i) @@ -91,6 +92,8 @@ private: void finishAttributeSection() override; size_t calculateContentSize() const; + void reset() override; + public: MCELFStreamer &getStreamer(); RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); @@ -103,6 +106,8 @@ public: void emitDirectiveOptionNoRVC() override; void emitDirectiveOptionRelax() override; void emitDirectiveOptionNoRelax() override; + + void finish() override; }; MCELFStreamer *createRISCVELFStreamer(MCContext &C, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 3268740849f0..7f88589374dd 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -146,7 +146,7 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo, if ((FenceArg & RISCVFenceField::W) != 0) O << 'w'; if (FenceArg == 0) - O << "unknown"; + O << "0"; } void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo, @@ -156,12 +156,12 @@ void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo, O << RISCVFPRndMode::roundingModeToString(FRMArg); } -void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void RISCVInstPrinter::printZeroOffsetMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNo); - assert(MO.isReg() && "printAtomicMemOp can only print register operands"); + assert(MO.isReg() && "printZeroOffsetMemOp can only print register operands"); O << "("; printRegName(O, MO.getReg()); O << ")"; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h index d078ead2c8ad..763ce9c95d73 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h @@ -40,8 +40,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void 
printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printAtomicMemOp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + void printZeroOffsetMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printVMaskReg(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 1078403a3fd2..7c062387fecd 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" @@ -46,7 +47,7 @@ public: RISCVMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII) : Ctx(ctx), MCII(MCII) {} - ~RISCVMCCodeEmitter() override {} + ~RISCVMCCodeEmitter() override = default; void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, @@ -93,7 +94,6 @@ private: } // end anonymous namespace MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new RISCVMCCodeEmitter(Ctx, MCII); } @@ -132,9 +132,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS, const MCExpr *CallExpr = Func.getExpr(); // Emit AUIPC Ra, Func with R_RISCV_CALL relocation type. - TmpInst = MCInstBuilder(RISCV::AUIPC) - .addReg(Ra) - .addOperand(MCOperand::createExpr(CallExpr)); + TmpInst = MCInstBuilder(RISCV::AUIPC).addReg(Ra).addExpr(CallExpr); Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); support::endian::write(OS, Binary, support::little); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index 65714b914c60..336289cf107b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp index 9c9d9221578c..554711e87521 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp @@ -13,6 +13,7 @@ #include "RISCVMCObjectFileInfo.h" #include "RISCVMCTargetDesc.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index 07c2be624932..917d93479f18 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -77,11 +77,9 @@ createRISCVMCObjectFileInfo(MCContext &Ctx, bool PIC, static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - if (CPU.empty()) + if (CPU.empty() || CPU == "generic") CPU = TT.isArch64Bit() ? 
"generic-rv64" : "generic-rv32"; - if (CPU == "generic") - report_fatal_error(Twine("CPU 'generic' is not supported. Use ") + - (TT.isArch64Bit() ? "generic-rv64" : "generic-rv32")); + return createRISCVMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index 5216a689715a..276fc9efb6c0 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -29,7 +29,6 @@ class MCSubtargetInfo; class Target; MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index e935179e5f9b..d19da6bd3664 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -18,10 +18,9 @@ static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) { int Cost = 0; for (auto Instr : Res) { - bool Compressed; + // Assume instructions that aren't listed aren't compressible. + bool Compressed = false; switch (Instr.Opc) { - default: - llvm_unreachable("Unexpected opcode"); case RISCV::SLLI: case RISCV::SRLI: Compressed = true; @@ -31,9 +30,6 @@ static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) { case RISCV::LUI: Compressed = isInt<6>(Instr.Imm); break; - case RISCV::ADD_UW: - Compressed = false; - break; } // Two RVC instructions take the same space as one RVI instruction, but // can take longer to execute than the single RVI instruction. Thus, we @@ -77,6 +73,12 @@ static void generateInstSeqImpl(int64_t Val, assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target"); + // Use BSETI for a single bit. + if (ActiveFeatures[RISCV::FeatureStdExtZbs] && isPowerOf2_64(Val)) { + Res.push_back(RISCVMatInt::Inst(RISCV::BSETI, Log2_64(Val))); + return; + } + // In the worst case, for a full 64-bit constant, a sequence of 8 instructions // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits @@ -101,43 +103,53 @@ static void generateInstSeqImpl(int64_t Val, // performed when the recursion returns. int64_t Lo12 = SignExtend64<12>(Val); - int64_t Hi52 = ((uint64_t)Val + 0x800ull) >> 12; - int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52); - Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount); + Val = (uint64_t)Val - (uint64_t)Lo12; - // If the remaining bits don't fit in 12 bits, we might be able to reduce the - // shift amount in order to use LUI which will zero the lower 12 bits. + int ShiftAmount = 0; bool Unsigned = false; - if (ShiftAmount > 12 && !isInt<12>(Hi52)) { - if (isInt<32>((uint64_t)Hi52 << 12)) { - // Reduce the shift amount and add zeros to the LSBs so it will match LUI. - ShiftAmount -= 12; - Hi52 = (uint64_t)Hi52 << 12; - } else if (isUInt<32>((uint64_t)Hi52 << 12) && - ActiveFeatures[RISCV::FeatureStdExtZba]) { - // Reduce the shift amount and add zeros to the LSBs so it will match - // LUI, then shift left with SLLI.UW to clear the upper 32 set bits. - ShiftAmount -= 12; - Hi52 = ((uint64_t)Hi52 << 12) | (0xffffffffull << 32); + + // Val might now be valid for LUI without needing a shift. 
+ if (!isInt<32>(Val)) { + ShiftAmount = findFirstSet((uint64_t)Val); + Val >>= ShiftAmount; + + // If the remaining bits don't fit in 12 bits, we might be able to reduce the + // shift amount in order to use LUI which will zero the lower 12 bits. + if (ShiftAmount > 12 && !isInt<12>(Val)) { + if (isInt<32>((uint64_t)Val << 12)) { + // Reduce the shift amount and add zeros to the LSBs so it will match LUI. + ShiftAmount -= 12; + Val = (uint64_t)Val << 12; + } else if (isUInt<32>((uint64_t)Val << 12) && + ActiveFeatures[RISCV::FeatureStdExtZba]) { + // Reduce the shift amount and add zeros to the LSBs so it will match + // LUI, then shift left with SLLI.UW to clear the upper 32 set bits. + ShiftAmount -= 12; + Val = ((uint64_t)Val << 12) | (0xffffffffull << 32); + Unsigned = true; + } + } + + // Try to use SLLI_UW for Val when it is uint32 but not int32. + if (isUInt<32>((uint64_t)Val) && !isInt<32>((uint64_t)Val) && + ActiveFeatures[RISCV::FeatureStdExtZba]) { + // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with + // SLLI_UW. + Val = ((uint64_t)Val) | (0xffffffffull << 32); Unsigned = true; } } - // Try to use SLLI_UW for Hi52 when it is uint32 but not int32. - if (isUInt<32>((uint64_t)Hi52) && !isInt<32>((uint64_t)Hi52) && - ActiveFeatures[RISCV::FeatureStdExtZba]) { - // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with - // SLLI_UW. - Hi52 = ((uint64_t)Hi52) | (0xffffffffull << 32); - Unsigned = true; - } + generateInstSeqImpl(Val, ActiveFeatures, Res); - generateInstSeqImpl(Hi52, ActiveFeatures, Res); + // Skip shift if we were able to use LUI directly. + if (ShiftAmount) { + if (Unsigned) + Res.push_back(RISCVMatInt::Inst(RISCV::SLLI_UW, ShiftAmount)); + else + Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount)); + } - if (Unsigned) - Res.push_back(RISCVMatInt::Inst(RISCV::SLLI_UW, ShiftAmount)); - else - Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount)); if (Lo12) Res.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12)); } @@ -166,6 +178,24 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) { RISCVMatInt::InstSeq Res; generateInstSeqImpl(Val, ActiveFeatures, Res); + // If there are trailing zeros, try generating a sign extended constant with + // no trailing zeros and use a final SLLI to restore them. + if ((Val & 1) == 0 && Res.size() > 2) { + unsigned TrailingZeros = countTrailingZeros((uint64_t)Val); + int64_t ShiftedVal = Val >> TrailingZeros; + RISCVMatInt::InstSeq TmpSeq; + generateInstSeqImpl(ShiftedVal, ActiveFeatures, TmpSeq); + TmpSeq.push_back(RISCVMatInt::Inst(RISCV::SLLI, TrailingZeros)); + + // Keep the new sequence if it is an improvement. + if (TmpSeq.size() < Res.size()) { + Res = TmpSeq; + // A 2 instruction sequence is the best we can do. + if (Res.size() <= 2) + return Res; + } + } + // If the constant is positive we might be able to generate a shifted constant // with no leading zeros and use a final SRLI to restore them. if (Val > 0 && Res.size() > 2) { @@ -302,32 +332,34 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) { TmpSeq.push_back(RISCVMatInt::Inst(Opc, 0)); if (TmpSeq.size() < Res.size()) Res = TmpSeq; - } - // Try to use LUI+SH*ADD+ADDI. 
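The rewritten materialisation logic above peels the low 12 bits off first because ADDI sign-extends its immediate; the remainder is then a multiple of 4096 and, once shifted, may fit LUI directly. A worked sketch of that base split for a 32-bit constant (plain C++, not the RISCVMatInt API):

#include <cassert>
#include <cstdint>

// Split a 32-bit constant into LUI (upper 20 bits) + ADDI (signed low 12),
// as generateInstSeqImpl does before recursing: Lo12 = SignExtend64<12>(Val),
// after which Val - Lo12 is 0 mod 4096 and feeds LUI.
static void splitLuiAddi(int32_t Val, uint32_t &Hi20, int32_t &Lo12) {
  Lo12 = (int32_t)((uint32_t)Val << 20) >> 20; // sign-extend bottom 12 bits
  Hi20 = ((uint32_t)Val - (uint32_t)Lo12) >> 12;
}

int main() {
  uint32_t Hi20;
  int32_t Lo12;
  splitLuiAddi(0x12345678, Hi20, Lo12);
  assert(Hi20 == 0x12345 && Lo12 == 0x678); // lui; addi
  splitLuiAddi(0x12345FFF, Hi20, Lo12);
  assert(Hi20 == 0x12346 && Lo12 == -1);    // lui rounds up, addi subtracts
  return 0;
}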
- int64_t Hi52 = ((uint64_t)Val + 0x800ull) & ~0xfffull; - int64_t Lo12 = SignExtend64<12>(Val); - Div = 0; - if (isInt<32>(Hi52 / 3) && (Hi52 % 3) == 0) { - Div = 3; - Opc = RISCV::SH1ADD; - } else if (isInt<32>(Hi52 / 5) && (Hi52 % 5) == 0) { - Div = 5; - Opc = RISCV::SH2ADD; - } else if (isInt<32>(Hi52 / 9) && (Hi52 % 9) == 0) { - Div = 9; - Opc = RISCV::SH3ADD; - } - // Build the new instruction sequence. - if (Div > 0) { - // For Val that has zero Lo12 (implies Val equals to Hi52) should has - // already been processed to LUI+SH*ADD by previous optimization. - assert(Lo12 != 0 && - "unexpected instruction sequence for immediate materialisation"); - generateInstSeqImpl(Hi52 / Div, ActiveFeatures, TmpSeq); - TmpSeq.push_back(RISCVMatInt::Inst(Opc, 0)); - TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12)); - if (TmpSeq.size() < Res.size()) - Res = TmpSeq; + } else { + // Try to use LUI+SH*ADD+ADDI. + int64_t Hi52 = ((uint64_t)Val + 0x800ull) & ~0xfffull; + int64_t Lo12 = SignExtend64<12>(Val); + Div = 0; + if (isInt<32>(Hi52 / 3) && (Hi52 % 3) == 0) { + Div = 3; + Opc = RISCV::SH1ADD; + } else if (isInt<32>(Hi52 / 5) && (Hi52 % 5) == 0) { + Div = 5; + Opc = RISCV::SH2ADD; + } else if (isInt<32>(Hi52 / 9) && (Hi52 % 9) == 0) { + Div = 9; + Opc = RISCV::SH3ADD; + } + // Build the new instruction sequence. + if (Div > 0) { + // For Val that has zero Lo12 (implies Val equals to Hi52) should has + // already been processed to LUI+SH*ADD by previous optimization. + assert(Lo12 != 0 && + "unexpected instruction sequence for immediate materialisation"); + assert(TmpSeq.empty() && "Expected empty TmpSeq"); + generateInstSeqImpl(Hi52 / Div, ActiveFeatures, TmpSeq); + TmpSeq.push_back(RISCVMatInt::Inst(Opc, 0)); + TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12)); + if (TmpSeq.size() < Res.size()) + Res = TmpSeq; + } } } @@ -362,5 +394,30 @@ int getIntMatCost(const APInt &Val, unsigned Size, } return std::max(1, Cost); } + +OpndKind Inst::getOpndKind() const { + switch (Opc) { + default: + llvm_unreachable("Unexpected opcode!"); + case RISCV::LUI: + return RISCVMatInt::Imm; + case RISCV::ADD_UW: + return RISCVMatInt::RegX0; + case RISCV::SH1ADD: + case RISCV::SH2ADD: + case RISCV::SH3ADD: + return RISCVMatInt::RegReg; + case RISCV::ADDI: + case RISCV::ADDIW: + case RISCV::SLLI: + case RISCV::SRLI: + case RISCV::SLLI_UW: + case RISCV::RORI: + case RISCV::BSETI: + case RISCV::BCLRI: + return RISCVMatInt::RegImm; + } +} + } // namespace RISCVMatInt } // namespace llvm diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index 6a8e0c640001..90c29f01c43d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -17,11 +17,21 @@ namespace llvm { class APInt; namespace RISCVMatInt { + +enum OpndKind { + RegImm, // ADDI/ADDIW/SLLI/SRLI/BSETI/BCLRI + Imm, // LUI + RegReg, // SH1ADD/SH2ADD/SH3ADD + RegX0, // ADD_UW +}; + struct Inst { unsigned Opc; int64_t Imm; Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {} + + OpndKind getOpndKind() const; }; using InstSeq = SmallVector; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 2f016374e6a2..5f9ed77d07cf 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -22,6 +22,7 @@ using namespace llvm; RISCVTargetStreamer::RISCVTargetStreamer(MCStreamer &S) : 
MCTargetStreamer(S) {} void RISCVTargetStreamer::finish() { finishAttributeSection(); } +void RISCVTargetStreamer::reset() {} void RISCVTargetStreamer::emitDirectiveOptionPush() {} void RISCVTargetStreamer::emitDirectiveOptionPop() {} @@ -38,6 +39,10 @@ void RISCVTargetStreamer::emitTextAttribute(unsigned Attribute, void RISCVTargetStreamer::emitIntTextAttribute(unsigned Attribute, unsigned IntValue, StringRef StringValue) {} +void RISCVTargetStreamer::setTargetABI(RISCVABI::ABI ABI) { + assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialized target ABI"); + TargetABI = ABI; +} void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { if (STI.hasFeature(RISCV::FeatureRV32E)) @@ -45,15 +50,10 @@ void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { else emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_16); - unsigned XLen = STI.hasFeature(RISCV::Feature64Bit) ? 64 : 32; - std::vector FeatureVector; - RISCVFeatures::toFeatureVector(FeatureVector, STI.getFeatureBits()); - - auto ParseResult = llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector); + auto ParseResult = RISCVFeatures::parseFeatureBits( + STI.hasFeature(RISCV::Feature64Bit), STI.getFeatureBits()); if (!ParseResult) { - /* Assume any error about features should handled earlier. */ - consumeError(ParseResult.takeError()); - llvm_unreachable("Parsing feature error when emitTargetAttributes?"); + report_fatal_error(ParseResult.takeError()); } else { auto &ISAInfo = *ParseResult; emitTextAttribute(RISCVAttrs::ARCH, ISAInfo->toString()); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 171780d94ce7..0d35d0b698a9 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H +#include "RISCV.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -17,9 +18,12 @@ namespace llvm { class formatted_raw_ostream; class RISCVTargetStreamer : public MCTargetStreamer { + RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; + public: RISCVTargetStreamer(MCStreamer &S); void finish() override; + virtual void reset(); virtual void emitDirectiveOptionPush(); virtual void emitDirectiveOptionPop(); @@ -36,6 +40,8 @@ public: StringRef StringValue); void emitTargetAttributes(const MCSubtargetInfo &STI); + void setTargetABI(RISCVABI::ABI ABI); + RISCVABI::ABI getTargetABI() const { return TargetABI; } }; // This part is for ascii assembly output diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 03462240fd93..917837a307ad 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -32,10 +32,14 @@ class PassRegistry; bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); -bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP); -FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM); +FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM, + CodeGenOpt::Level OptLevel); + +FunctionPass *createRISCVMakeCompressibleOptPass(); +void initializeRISCVMakeCompressibleOptPass(PassRegistry &); FunctionPass *createRISCVGatherScatterLoweringPass(); void 
initializeRISCVGatherScatterLoweringPass(PassRegistry &); @@ -55,6 +59,9 @@ void initializeRISCVExpandAtomicPseudoPass(PassRegistry &); FunctionPass *createRISCVInsertVSETVLIPass(); void initializeRISCVInsertVSETVLIPass(PassRegistry &); +FunctionPass *createRISCVRedundantCopyEliminationPass(); +void initializeRISCVRedundantCopyEliminationPass(PassRegistry &); + InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &, RISCVSubtarget &, RISCVRegisterBankInfo &); diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index e32a8fb010de..e783ef38b448 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -41,6 +41,13 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, AssemblerPredicate<(all_of FeatureStdExtD), "'D' (Double-Precision Floating-Point)">; +def FeatureStdExtZihintpause + : SubtargetFeature<"zihintpause", "HasStdExtZihintpause", "true", + "'zihintpause' (Pause Hint)">; +def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">, + AssemblerPredicate<(all_of FeatureStdExtZihintpause), + "'Zihintpause' (Pause Hint)">; + def FeatureStdExtZfhmin : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true", "'Zfhmin' (Half-Precision Floating-Point Minimal)", @@ -63,6 +70,43 @@ def HasStdExtZfhOrZfhmin "'Zfh' (Half-Precision Floating-Point) or " "'Zfhmin' (Half-Precision Floating-Point Minimal)">; +def FeatureStdExtZfinx + : SubtargetFeature<"zfinx", "HasStdExtZfinx", "true", + "'Zfinx' (Float in Integer)">; +def HasStdExtZfinx : Predicate<"Subtarget->hasStdExtZfinx()">, + AssemblerPredicate<(all_of FeatureStdExtZfinx), + "'Zfinx' (Float in Integer)">; + +def FeatureStdExtZdinx + : SubtargetFeature<"zdinx", "HasStdExtZdinx", "true", + "'Zdinx' (Double in Integer)", + [FeatureStdExtZfinx]>; +def HasStdExtZdinx : Predicate<"Subtarget->hasStdExtZdinx()">, + AssemblerPredicate<(all_of FeatureStdExtZdinx), + "'Zdinx' (Double in Integer)">; + +def FeatureStdExtZhinxmin + : SubtargetFeature<"zhinxmin", "HasStdExtZhinxmin", "true", + "'Zhinxmin' (Half Float in Integer Minimal)", + [FeatureStdExtZfinx]>; +def HasStdExtZhinxmin : Predicate<"Subtarget->hasStdExtZhinxmin()">, + AssemblerPredicate<(all_of FeatureStdExtZhinxmin), + "'Zhinxmin' (Half Float in Integer Minimal)">; + +def FeatureStdExtZhinx + : SubtargetFeature<"zhinx", "HasStdExtZhinx", "true", + "'Zhinx' (Half Float in Integer)", + [FeatureStdExtZfinx]>; +def HasStdExtZhinx : Predicate<"Subtarget->hasStdExtZhinx()">, + AssemblerPredicate<(all_of FeatureStdExtZhinx), + "'Zhinx' (Half Float in Integer)">; + +def HasStdExtZhinxOrZhinxmin + : Predicate<"Subtarget->hasStdExtZhinx() || Subtarget->hasStdExtZhinxmin()">, + AssemblerPredicate<(any_of FeatureStdExtZhinx, FeatureStdExtZhinxmin), + "'Zhinx' (Half Float in Integer) or " + "'Zhinxmin' (Half Float in Integer Minimal)">; + def FeatureStdExtC : SubtargetFeature<"c", "HasStdExtC", "true", "'C' (Compressed Instructions)">; @@ -290,13 +334,13 @@ def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, AssemblerPredicate<(all_of(not FeatureNoRVCHints)), "RVC Hint Instructions">; -def FeatureStdExtZvl32b : SubtargetFeature<"zvl32b", "ZvlLen", "ExtZvl::Zvl32b", +def FeatureStdExtZvl32b : SubtargetFeature<"zvl32b", "ZvlLen", "32", "'Zvl' (Minimum Vector Length) 32">; foreach i = { 6-15 } in { defvar I = !shl(1, i); def FeatureStdExtZvl#I#b : - SubtargetFeature<"zvl"#I#"b", "ZvlLen", "ExtZvl::Zvl"#I#"b", + SubtargetFeature<"zvl"#I#"b", "ZvlLen", !cast(I), "'Zvl' (Minimum Vector Length) "#I, 
[!cast("FeatureStdExtZvl"#!srl(I, 1)#"b")]>; } @@ -333,24 +377,50 @@ def FeatureStdExtZve64d def FeatureStdExtV : SubtargetFeature<"v", "HasStdExtV", "true", "'V' (Vector Extension for Application Processors)", - [FeatureStdExtZvl128b, FeatureStdExtF, FeatureStdExtD]>; + [FeatureStdExtZvl128b, FeatureStdExtZve64d, FeatureStdExtF, FeatureStdExtD]>; def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">, AssemblerPredicate< - (any_of FeatureStdExtZve32x, FeatureStdExtV), + (any_of FeatureStdExtZve32x), "'V' (Vector Extension for Application Processors), 'Zve32x' or " "'Zve64x' (Vector Extensions for Embedded Processors)">; def HasVInstructionsI64 : Predicate<"Subtarget->hasVInstructionsI64()">, AssemblerPredicate< - (any_of FeatureStdExtZve64x, FeatureStdExtV), + (any_of FeatureStdExtZve64x), "'V' (Vector Extension for Application Processors) or 'Zve64x' " "(Vector Extensions for Embedded Processors)">; def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">, AssemblerPredicate< - (any_of FeatureStdExtZve32f, FeatureStdExtV), + (any_of FeatureStdExtZve32f), "'V' (Vector Extension for Application Processors), 'Zve32f', " "'Zve64f' or 'Zve64d' (Vector Extensions for Embedded Processors)">; +def FeatureStdExtZvfh + : SubtargetFeature<"experimental-zvfh", "HasStdExtZvfh", "true", + "'Zvfh' (Vector Half-Precision Floating-Point)", + [FeatureStdExtZve32f]>; + +def FeatureStdExtZicbom + : SubtargetFeature<"zicbom", "HasStdExtZicbom", "true", + "'Zicbom' (Cache-Block Management Instructions)">; +def HasStdExtZicbom : Predicate<"Subtarget->hasStdExtZicbom()">, + AssemblerPredicate<(all_of FeatureStdExtZicbom), + "'Zicbom' (Cache-Block Management Instructions)">; + +def FeatureStdExtZicboz + : SubtargetFeature<"zicboz", "HasStdExtZicboz", "true", + "'Zicboz' (Cache-Block Zero Instructions)">; +def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">, + AssemblerPredicate<(all_of FeatureStdExtZicboz), + "'Zicboz' (Cache-Block Zero Instructions)">; + +def FeatureStdExtZicbop + : SubtargetFeature<"zicbop", "HasStdExtZicbop", "true", + "'Zicbop' (Cache-Block Prefetch Instructions)">; +def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">, + AssemblerPredicate<(all_of FeatureStdExtZicbop), + "'Zicbop' (Cache-Block Prefetch Instructions)">; + def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; def IsRV64 : Predicate<"Subtarget->is64Bit()">, @@ -381,6 +451,19 @@ foreach i = {1-31} in def FeatureSaveRestore : SubtargetFeature<"save-restore", "EnableSaveRestore", "true", "Enable save/restore.">; +def FeatureUnalignedScalarMem + : SubtargetFeature<"unaligned-scalar-mem", "EnableUnalignedScalarMem", + "true", "Has reasonably performant unaligned scalar " + "loads and stores">; + +def TuneLUIADDIFusion + : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion", + "true", "Enable LUI+ADDI macrofusion">; + +def TuneNoDefaultUnroll + : SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false", + "Disable default unroll preference.">; + def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", "SiFive 7-Series processors">; @@ -408,14 +491,17 @@ include "RISCVSchedSiFive7.td" def : ProcessorModel<"generic-rv32", NoSchedModel, []>; def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>; +// Support generic for compatibility with other targets. The triple will be used +// to change to the appropriate rv32/rv64 version. 
+def : ProcessorModel<"generic", NoSchedModel, []>; def : ProcessorModel<"rocket-rv32", RocketModel, []>; def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>; def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [], - [TuneSiFive7]>; + [TuneSiFive7, TuneNoDefaultUnroll]>; def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit], - [TuneSiFive7]>; + [TuneSiFive7, TuneNoDefaultUnroll]>; def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM, FeatureStdExtC]>; @@ -442,7 +528,7 @@ def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM, FeatureStdExtA, FeatureStdExtF, FeatureStdExtC], - [TuneSiFive7]>; + [TuneSiFive7, TuneNoDefaultUnroll]>; def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -467,7 +553,7 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC], - [TuneSiFive7]>; + [TuneSiFive7, TuneNoDefaultUnroll]>; def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -482,7 +568,7 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC], - [TuneSiFive7]>; + [TuneSiFive7, TuneNoDefaultUnroll]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 9fed6e7baadc..5b2a247ebda0 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -38,12 +38,13 @@ STATISTIC(RISCVNumInstrsCompressed, namespace { class RISCVAsmPrinter : public AsmPrinter { - const MCSubtargetInfo *STI; + const MCSubtargetInfo *MCSTI; + const RISCVSubtarget *STI; public: explicit RISCVAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {} + : AsmPrinter(TM, std::move(Streamer)), MCSTI(TM.getMCSubtargetInfo()) {} StringRef getPassName() const override { return "RISCV Assembly Printer"; } @@ -62,12 +63,14 @@ public: // Wrapper needed for tblgenned pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); + return lowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); } void emitStartOfAsmFile(Module &M) override; void emitEndOfAsmFile(Module &M) override; + void emitFunctionEntryLabel() override; + private: void emitAttributes(); }; @@ -170,7 +173,8 @@ bool RISCVAsmPrinter::runOnMachineFunction(MachineFunction &MF) { MCSubtargetInfo &NewSTI = OutStreamer->getContext().getSubtargetCopy(*TM.getMCSubtargetInfo()); NewSTI.setFeatureBits(MF.getSubtarget().getFeatureBits()); - STI = &NewSTI; + MCSTI = &NewSTI; + STI = &MF.getSubtarget(); SetupMachineFunction(MF); emitFunctionBody(); @@ -193,7 +197,14 @@ void RISCVAsmPrinter::emitEndOfAsmFile(Module &M) { void RISCVAsmPrinter::emitAttributes() { RISCVTargetStreamer &RTS = static_cast(*OutStreamer->getTargetStreamer()); - RTS.emitTargetAttributes(*STI); + RTS.emitTargetAttributes(*MCSTI); +} + +void RISCVAsmPrinter::emitFunctionEntryLabel() { + AsmPrinter::emitFunctionEntryLabel(); + RISCVTargetStreamer &RTS = + static_cast(*OutStreamer->getTargetStreamer()); + RTS.setTargetABI(STI->getTargetABI()); } // Force static initialization. 
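With the ABI now recorded on the target streamer per function, the ELF header flags can be derived once at finish() rather than at streamer construction. The mapping itself, seen in RISCVTargetELFStreamer::finish earlier in this patch, is a small table; a standalone sketch with flag values copied from llvm/include/llvm/BinaryFormat/ELF.h (hypothetical function name):

#include <cstdint>

// Sketch of the ABI -> e_flags mapping in RISCVTargetELFStreamer::finish().
enum : uint32_t {
  EF_RISCV_RVC = 0x1,
  EF_RISCV_FLOAT_ABI_SINGLE = 0x2,
  EF_RISCV_FLOAT_ABI_DOUBLE = 0x4,
  EF_RISCV_RVE = 0x8,
};

enum class Abi { ILP32, ILP32F, ILP32D, ILP32E, LP64, LP64F, LP64D };

static uint32_t riscvEFlags(Abi ABI, bool HasStdExtC) {
  uint32_t EFlags = HasStdExtC ? EF_RISCV_RVC : 0;
  switch (ABI) {
  case Abi::ILP32F:
  case Abi::LP64F:
    EFlags |= EF_RISCV_FLOAT_ABI_SINGLE;
    break;
  case Abi::ILP32D:
  case Abi::LP64D:
    EFlags |= EF_RISCV_FLOAT_ABI_DOUBLE;
    break;
  case Abi::ILP32E:
    EFlags |= EF_RISCV_RVE;
    break;
  default: // soft-float ABIs contribute no float-ABI bits
    break;
  }
  return EFlags;
}
// e.g. riscvEFlags(Abi::LP64D, /*HasStdExtC=*/true) == 0x5 for rv64gc/lp64d.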
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 0c5c13db7112..e4e01d9f6f2f 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -290,7 +290,7 @@ bool RISCVExpandPseudo::expandVSPILL(MachineBasicBlock &MBB, Register SrcReg = MBBI->getOperand(0).getReg(); Register Base = MBBI->getOperand(1).getReg(); Register VL = MBBI->getOperand(2).getReg(); - auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode()); + auto ZvlssegInfo = RISCV::isRVVSpillForZvlsseg(MBBI->getOpcode()); if (!ZvlssegInfo) return false; unsigned NF = ZvlssegInfo->first; @@ -314,10 +314,15 @@ bool RISCVExpandPseudo::expandVSPILL(MachineBasicBlock &MBB, assert(LMUL == 1 && "LMUL must be 1, 2, or 4."); for (unsigned I = 0; I < NF; ++I) { + // Add an implicit use of the super register to describe that we are using + // only part of it; this prevents the machine verifier from complaining + // when part of the subreg is undef. See the comment in + // MachineVerifier::checkLiveness for more detail. BuildMI(MBB, MBBI, DL, TII->get(Opcode)) .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I)) .addReg(Base) - .addMemOperand(*(MBBI->memoperands_begin())); + .addMemOperand(*(MBBI->memoperands_begin())) + .addReg(SrcReg, RegState::Implicit); if (I != NF - 1) BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), Base) .addReg(Base) @@ -335,7 +340,7 @@ bool RISCVExpandPseudo::expandVRELOAD(MachineBasicBlock &MBB, Register DestReg = MBBI->getOperand(0).getReg(); Register Base = MBBI->getOperand(1).getReg(); Register VL = MBBI->getOperand(2).getReg(); - auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode()); + auto ZvlssegInfo = RISCV::isRVVSpillForZvlsseg(MBBI->getOpcode()); if (!ZvlssegInfo) return false; unsigned NF = ZvlssegInfo->first; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index ad003404d793..57d8ba6f0161 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCDwarf.h" +#include <algorithm> + using namespace llvm; // For now we use x18, a.k.a s2, as pointer to shadow call stack. @@ -250,6 +252,7 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const { // Determines the size of the frame and maximum call frame size. void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); // Get the number of bytes to allocate from the FrameInfo. uint64_t FrameSize = MFI.getStackSize(); @@ -262,6 +265,28 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { // Update frame info. MFI.setStackSize(FrameSize); + + // When using SP or BP to access stack objects, we may require extra padding + // to ensure the bottom of the RVV stack is correctly aligned within the main + // stack. We calculate this as the amount required to align the scalar local + // variable section up to the RVV alignment.
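The padding amount mentioned in that comment is simply the distance from the scalar-locals size up to the next RVV-alignment boundary. A minimal model, assuming offsetToAlignment(Value, Align) behaves as in llvm/Support/Alignment.h:

#include <cassert>
#include <cstdint>

// Rough equivalent of offsetToAlignment() as used above for RVV padding.
static uint64_t offsetToAlignmentSketch(uint64_t Value, uint64_t Align) {
  return (Align - (Value % Align)) % Align;
}

int main() {
  // 40 bytes of scalar locals below a 16-byte-aligned RVV area need 8 bytes
  // of padding; an already-aligned 64-byte section needs none.
  assert(offsetToAlignmentSketch(40, 16) == 8);
  assert(offsetToAlignmentSketch(64, 16) == 0);
  return 0;
}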
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + if (RVFI->getRVVStackSize() && (!hasFP(MF) || TRI->hasStackRealignment(MF))) { + int ScalarLocalVarSize = FrameSize - RVFI->getCalleeSavedStackSize() - + RVFI->getVarArgsSaveSize(); + if (auto RVVPadding = + offsetToAlignment(ScalarLocalVarSize, RVFI->getRVVStackAlign())) + RVFI->setRVVPadding(RVVPadding); + } +} + +// Returns the stack size including RVV padding (when required), rounded back +// up to the required stack alignment. +uint64_t RISCVFrameLowering::getStackSizeWithRVVPadding( + const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *RVFI = MF.getInfo(); + return alignTo(MFI.getStackSize() + RVFI->getRVVPadding(), getStackAlign()); } void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, @@ -280,21 +305,43 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, .addReg(SrcReg) .addImm(Val) .setMIFlag(Flag); - } else { - unsigned Opc = RISCV::ADD; - bool isSub = Val < 0; - if (isSub) { - Val = -Val; - Opc = RISCV::SUB; - } + return; + } - Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); - TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + // Try to split the offset across two ADDIs. We need to keep the stack pointer + // aligned after each ADDI. We need to determine the maximum value we can put + // in each ADDI. In the negative direction, we can use -2048 which is always + // sufficiently aligned. In the positive direction, we need to find the + // largest 12-bit immediate that is aligned. Exclude -4096 since it can be + // created with LUI. + assert(getStackAlign().value() < 2048 && "Stack alignment too large"); + int64_t MaxPosAdjStep = 2048 - getStackAlign().value(); + if (Val > -4096 && Val <= (2 * MaxPosAdjStep)) { + int64_t FirstAdj = Val < 0 ? -2048 : MaxPosAdjStep; + Val -= FirstAdj; + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg) .addReg(SrcReg) - .addReg(ScratchReg, RegState::Kill) + .addImm(FirstAdj) + .setMIFlag(Flag); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg) + .addReg(DestReg, RegState::Kill) + .addImm(Val) .setMIFlag(Flag); + return; + } + + unsigned Opc = RISCV::ADD; + if (Val < 0) { + Val = -Val; + Opc = RISCV::SUB; } + + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addReg(ScratchReg, RegState::Kill) + .setMIFlag(Flag); } // Returns the register used to hold the frame pointer. @@ -401,7 +448,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // FIXME (note copied from Lanai): This appears to be overallocating. Needs // investigation. Get the number of bytes to allocate from the FrameInfo. - uint64_t StackSize = MFI.getStackSize() + RVFI->getRVVPadding(); + uint64_t StackSize = getStackSizeWithRVVPadding(MF); uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize(); uint64_t RVVStackSize = RVFI->getRVVStackSize(); @@ -482,7 +529,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Emit the second SP adjustment after saving callee saved registers. 
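For the adjustReg change above, the split is chosen so each intermediate stack pointer value stays aligned: the negative step is always -2048, while the positive step is the largest aligned 12-bit immediate, 2048 - StackAlign. A hedged sketch of the arithmetic (hypothetical helper, valid only inside the two-ADDI window the code above handles):

#include <cassert>
#include <cstdint>
#include <utility>

// Model of the two-ADDI offset split in RISCVFrameLowering::adjustReg,
// assuming Val > -4096 and Val <= 2 * (2048 - StackAlign).
static std::pair<int64_t, int64_t> splitSpAdjust(int64_t Val,
                                                 int64_t StackAlign) {
  int64_t MaxPosAdjStep = 2048 - StackAlign;
  int64_t FirstAdj = Val < 0 ? -2048 : MaxPosAdjStep;
  return {FirstAdj, Val - FirstAdj};
}

int main() {
  auto [A, B] = splitSpAdjust(4000, 16);  // addi sp,sp,2032; addi sp,sp,1968
  assert(A == 2032 && B == 1968);
  auto [C, D] = splitSpAdjust(-3000, 16); // addi sp,sp,-2048; addi sp,sp,-952
  assert(C == -2048 && D == -952);
  return 0;
}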
if (FirstSPAdjustAmount) { - uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + uint64_t SecondSPAdjustAmount = + getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, @@ -492,8 +540,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // don't emit an sp-based .cfi_def_cfa_offset if (!hasFP(MF)) { // Emit ".cfi_def_cfa_offset StackSize" - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset( + nullptr, getStackSizeWithRVVPadding(MF))); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); @@ -561,15 +609,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.end(); DebugLoc DL; if (!MBB.empty()) { - MBBI = MBB.getFirstTerminator(); - if (MBBI == MBB.end()) - MBBI = MBB.getLastNonDebugInstr(); - DL = MBBI->getDebugLoc(); + MBBI = MBB.getLastNonDebugInstr(); + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); - // If this is not a terminator, the actual insert location should be after the - // last instruction. - if (!MBBI->isTerminator()) - MBBI = std::next(MBBI); + MBBI = MBB.getFirstTerminator(); // If callee-saved registers are saved via libcall, place stack adjustment // before this call. @@ -587,7 +631,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, if (!CSI.empty()) LastFrameDestroy = std::prev(MBBI, CSI.size()); - uint64_t StackSize = MFI.getStackSize() + RVFI->getRVVPadding(); + uint64_t StackSize = getStackSizeWithRVVPadding(MF); uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize(); uint64_t FPOffset = RealStackSize - RVFI->getVarArgsSaveSize(); uint64_t RVVStackSize = RVFI->getRVVStackSize(); // Restore the stack pointer using the value of the frame pointer. Only // necessary if the stack pointer was modified, meaning the stack size is // unknown. - if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects()) { + // + // In order to make sure the stack pointer is correct through the EH region, + // we also need to restore the stack pointer from the frame pointer if we + // don't preserve stack space within the prologue/epilogue for outgoing + // variables. Normally, checking whether a variable-sized object is present + // is enough, but we also don't preserve that space in the prologue/epilogue + // when we have vector objects on the stack.
+ if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() || + !hasReservedCallFrame(MF)) { assert(hasFP(MF) && "frame pointer should not have been eliminated"); adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset, MachineInstr::FrameDestroy); @@ -607,7 +659,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); if (FirstSPAdjustAmount) { - uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + uint64_t SecondSPAdjustAmount = + getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); @@ -665,134 +718,138 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, if (FirstSPAdjustAmount) Offset += StackOffset::getFixed(FirstSPAdjustAmount); else - Offset += - StackOffset::getFixed(MFI.getStackSize() + RVFI->getRVVPadding()); - } else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) { + Offset += StackOffset::getFixed(getStackSizeWithRVVPadding(MF)); + return Offset; + } + + if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) { // If the stack was realigned, the frame pointer is set in order to allow // SP to be restored, so we need another base register to record the stack // after realignment. + // |--------------------------| -- <-- FP + // | callee-allocated save | | <----| + // | area for register varargs| | | + // |--------------------------| | | + // | callee-saved registers | | | + // |--------------------------| -- | + // | realignment (the size of | | | + // | this area is not counted | | | + // | in MFI.getStackSize()) | | | + // |--------------------------| -- |-- MFI.getStackSize() + // | RVV alignment padding | | | + // | (not counted in | | | + // | MFI.getStackSize() but | | | + // | counted in | | | + // | RVFI.getRVVStackSize()) | | | + // |--------------------------| -- | + // | RVV objects | | | + // | (not counted in | | | + // | MFI.getStackSize()) | | | + // |--------------------------| -- | + // | padding before RVV | | | + // | (not counted in | | | + // | MFI.getStackSize() or in | | | + // | RVFI.getRVVStackSize()) | | | + // |--------------------------| -- | + // | scalar local variables | | <----' + // |--------------------------| -- <-- BP (if var sized objects present) + // | VarSize objects | | + // |--------------------------| -- <-- SP if (hasBP(MF)) { FrameReg = RISCVABI::getBPReg(); - // |--------------------------| -- <-- FP - // | callee-saved registers | | <----. - // |--------------------------| -- | - // | realignment (the size of | | | - // | this area is not counted | | | - // | in MFI.getStackSize()) | | | - // |--------------------------| -- | - // | Padding after RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- |-- MFI.getStackSize() - // | RVV objects | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | Padding before RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | scalar local variables | | <----' - // |--------------------------| -- <-- BP - // | VarSize objects | | - // |--------------------------| -- <-- SP } else { + // VarSize objects must be empty in this case! + assert(!MFI.hasVarSizedObjects()); FrameReg = RISCV::X2; - // |--------------------------| -- <-- FP - // | callee-saved registers | | <----. 
- // |--------------------------| -- | - // | realignment (the size of | | | - // | this area is not counted | | | - // | in MFI.getStackSize()) | | | - // |--------------------------| -- | - // | Padding after RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- |-- MFI.getStackSize() - // | RVV objects | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | Padding before RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | scalar local variables | | <----' - // |--------------------------| -- <-- SP - } - // The total amount of padding surrounding RVV objects is described by - // RVV->getRVVPadding() and it can be zero. It allows us to align the RVV - // objects to 8 bytes. - if (MFI.getStackID(FI) == TargetStackID::Default) { - Offset += StackOffset::getFixed(MFI.getStackSize()); - if (FI < 0) - Offset += StackOffset::getFixed(RVFI->getLibCallStackSize()); - } else if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { - Offset += StackOffset::get( - alignTo(MFI.getStackSize() - RVFI->getCalleeSavedStackSize(), 8), - RVFI->getRVVStackSize()); } } else { FrameReg = RI->getFrameRegister(MF); - if (hasFP(MF)) { - Offset += StackOffset::getFixed(RVFI->getVarArgsSaveSize()); - if (FI >= 0) - Offset -= StackOffset::getFixed(RVFI->getLibCallStackSize()); - // When using FP to access scalable vector objects, we need to minus - // the frame size. - // - // |--------------------------| -- <-- FP - // | callee-saved registers | | - // |--------------------------| | MFI.getStackSize() - // | scalar local variables | | - // |--------------------------| -- (Offset of RVV objects is from here.) - // | RVV objects | - // |--------------------------| - // | VarSize objects | - // |--------------------------| <-- SP - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) - Offset -= StackOffset::getFixed(MFI.getStackSize()); - } else { - // When using SP to access frame objects, we need to add RVV stack size. - // - // |--------------------------| -- <-- FP - // | callee-saved registers | | <----. - // |--------------------------| -- | - // | Padding after RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | RVV objects | | |-- MFI.getStackSize() - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | Padding before RVV | | | - // | (not counted in | | | - // | MFI.getStackSize()) | | | - // |--------------------------| -- | - // | scalar local variables | | <----' - // |--------------------------| -- <-- SP - // - // The total amount of padding surrounding RVV objects is described by - // RVV->getRVVPadding() and it can be zero. It allows us to align the RVV - // objects to 8 bytes. 
-    if (MFI.getStackID(FI) == TargetStackID::Default) {
-      if (MFI.isFixedObjectIndex(FI)) {
-        Offset +=
-            StackOffset::get(MFI.getStackSize() + RVFI->getRVVPadding() +
-                             RVFI->getLibCallStackSize(),
-                             RVFI->getRVVStackSize());
-      } else {
-        Offset += StackOffset::getFixed(MFI.getStackSize());
-      }
-    } else if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
-      Offset += StackOffset::get(
-          alignTo(MFI.getStackSize() - RVFI->getCalleeSavedStackSize(), 8),
-          RVFI->getRVVStackSize());
-    }
   }
+
+  if (FrameReg == getFPReg(STI)) {
+    Offset += StackOffset::getFixed(RVFI->getVarArgsSaveSize());
+    if (FI >= 0)
+      Offset -= StackOffset::getFixed(RVFI->getLibCallStackSize());
+    // When using FP to access scalable vector objects, we need to subtract
+    // the frame size.
+    //
+    // |--------------------------| -- <-- FP
+    // | callee-allocated save    | |
+    // | area for register varargs| |
+    // |--------------------------| |
+    // | callee-saved registers   | |
+    // |--------------------------| | MFI.getStackSize()
+    // | scalar local variables   | |
+    // |--------------------------| -- (Offset of RVV objects is from here.)
+    // | RVV objects              |
+    // |--------------------------|
+    // | VarSize objects          |
+    // |--------------------------| <-- SP
+    if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+      assert(!RI->hasStackRealignment(MF) &&
+             "Can't index across variable sized realign");
+      // We don't expect any extra RVV alignment padding, as the stack size
+      // and the RVV object sections should each be correctly aligned in
+      // their own right.
+      assert(MFI.getStackSize() == getStackSizeWithRVVPadding(MF) &&
+             "Inconsistent stack layout");
+      Offset -= StackOffset::getFixed(MFI.getStackSize());
+    }
+    return Offset;
+  }
+
+  // This case handles indexing off both SP and BP.
+  // If indexing off SP, there must not be any var sized objects.
+  assert(FrameReg == RISCVABI::getBPReg() || !MFI.hasVarSizedObjects());
+
+  // When using SP to access frame objects, we need to add RVV stack size.
+  //
+  // |--------------------------| -- <-- FP
+  // | callee-allocated save    | | <----|
+  // | area for register varargs| |      |
+  // |--------------------------| |      |
+  // | callee-saved registers   | |      |
+  // |--------------------------| --     |
+  // | RVV alignment padding    | |      |
+  // | (not counted in          | |      |
+  // | MFI.getStackSize() but   | |      |
+  // | counted in               | |      |
+  // | RVFI.getRVVStackSize())  | |      |
+  // |--------------------------| --     |
+  // | RVV objects              | |      |-- MFI.getStackSize()
+  // | (not counted in          | |      |
+  // | MFI.getStackSize())      | |      |
+  // |--------------------------| --     |
+  // | padding before RVV       | |      |
+  // | (not counted in          | |      |
+  // | MFI.getStackSize())      | |      |
+  // |--------------------------| --     |
+  // | scalar local variables   | | <----'
+  // |--------------------------| -- <-- BP (if var sized objects present)
+  // | VarSize objects          | |
+  // |--------------------------| -- <-- SP
+  //
+  // The total amount of padding surrounding RVV objects is described by
+  // RVV->getRVVPadding() and it can be zero. It allows us to align the RVV
+  // objects to the required alignment.
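// [Editorial sketch, not part of the patch] getFrameIndexReference above
// composes a StackOffset from a fixed byte part and a scalable part. A
// minimal model of that arithmetic, with illustrative names; in LLVM the
// scalable part is scaled by the runtime vscale:
#include <cstdint>

struct SketchStackOffset {
  int64_t Fixed = 0;    // bytes, known at compile time
  int64_t Scalable = 0; // bytes per unit of vscale (RVV objects live here)
};

// Resolve to a concrete byte offset once vscale is known.
constexpr int64_t resolve(SketchStackOffset O, int64_t VScale) {
  return O.Fixed + O.Scalable * VScale;
}
static_assert(resolve({16, 8}, 2) == 32,
              "16 fixed bytes plus one 8-byte-per-vscale block at vscale 2");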
+  if (MFI.getStackID(FI) == TargetStackID::Default) {
+    if (MFI.isFixedObjectIndex(FI)) {
+      assert(!RI->hasStackRealignment(MF) &&
+             "Can't index across variable sized realign");
+      Offset += StackOffset::get(getStackSizeWithRVVPadding(MF) +
+                                     RVFI->getLibCallStackSize(),
+                                 RVFI->getRVVStackSize());
+    } else {
+      Offset += StackOffset::getFixed(MFI.getStackSize());
+    }
+  } else if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+    // Ensure the base of the RVV stack is correctly aligned: add on the
+    // alignment padding.
+    int ScalarLocalVarSize =
+        MFI.getStackSize() - RVFI->getCalleeSavedStackSize() -
+        RVFI->getVarArgsSaveSize() + RVFI->getRVVPadding();
+    Offset += StackOffset::get(ScalarLocalVarSize, RVFI->getRVVStackSize());
+  }
   return Offset;
 }
@@ -841,9 +898,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
   }
-int64_t
+std::pair<int64_t, Align>
 RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
-  int64_t Offset = 0;
   // Create a buffer of RVV objects to allocate.
   SmallVector<int, 8> ObjectsToAllocate;
   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
@@ -857,29 +913,78 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
   }
   // Allocate all RVV locals and spills
+  int64_t Offset = 0;
+  // The minimum alignment is 16 bytes.
+  Align RVVStackAlign(16);
   for (int FI : ObjectsToAllocate) {
     // ObjectSize in bytes.
     int64_t ObjectSize = MFI.getObjectSize(FI);
+    auto ObjectAlign = std::max(Align(8), MFI.getObjectAlign(FI));
     // If the data type is the fractional vector type, reserve one vector
     // register for it.
     if (ObjectSize < 8)
       ObjectSize = 8;
-    // Currently, all scalable vector types are aligned to 8 bytes.
-    Offset = alignTo(Offset + ObjectSize, 8);
+    Offset = alignTo(Offset + ObjectSize, ObjectAlign);
     MFI.setObjectOffset(FI, -Offset);
+    // Update the maximum alignment of the RVV stack section.
+    RVVStackAlign = std::max(RVVStackAlign, ObjectAlign);
   }
-  return Offset;
+  // Ensure the alignment of the RVV stack. Since we want the most-aligned
+  // object right at the bottom (i.e., any padding at the top of the frame),
+  // readjust all RVV objects down by the alignment padding.
+  uint64_t StackSize = Offset;
+  if (auto AlignmentPadding = offsetToAlignment(StackSize, RVVStackAlign)) {
+    StackSize += AlignmentPadding;
+    for (int FI : ObjectsToAllocate)
+      MFI.setObjectOffset(FI, MFI.getObjectOffset(FI) - AlignmentPadding);
+  }
+
+  return std::make_pair(StackSize, RVVStackAlign);
 }

-static bool hasRVVSpillWithFIs(MachineFunction &MF, const RISCVInstrInfo &TII) {
+static unsigned getScavSlotsNumForRVV(MachineFunction &MF) {
+  // For RVV spills, computing scalable stack offsets requires up to two
+  // scratch registers.
+  static constexpr unsigned ScavSlotsNumRVVSpillScalableObject = 2;
+
+  // For RVV spills, computing non-scalable stack offsets requires up to one
+  // scratch register.
+  static constexpr unsigned ScavSlotsNumRVVSpillNonScalableObject = 1;
+
+  // An ADDI instruction's destination register can be used for computing
+  // offsets, so scalable stack offsets require up to one scratch register.
+ static constexpr unsigned ScavSlotsADDIScalableObject = 1; + + static constexpr unsigned MaxScavSlotsNumKnown = + std::max({ScavSlotsADDIScalableObject, ScavSlotsNumRVVSpillScalableObject, + ScavSlotsNumRVVSpillNonScalableObject}); + + unsigned MaxScavSlotsNum = 0; if (!MF.getSubtarget().hasVInstructions()) return false; - return any_of(MF, [&TII](const MachineBasicBlock &MBB) { - return any_of(MBB, [&TII](const MachineInstr &MI) { - return TII.isRVVSpill(MI, /*CheckFIs*/ true); - }); - }); + for (const MachineBasicBlock &MBB : MF) + for (const MachineInstr &MI : MBB) { + bool IsRVVSpill = RISCV::isRVVSpill(MI); + for (auto &MO : MI.operands()) { + if (!MO.isFI()) + continue; + bool IsScalableVectorID = MF.getFrameInfo().getStackID(MO.getIndex()) == + TargetStackID::ScalableVector; + if (IsRVVSpill) { + MaxScavSlotsNum = std::max( + MaxScavSlotsNum, IsScalableVectorID + ? ScavSlotsNumRVVSpillScalableObject + : ScavSlotsNumRVVSpillNonScalableObject); + } else if (MI.getOpcode() == RISCV::ADDI && IsScalableVectorID) { + MaxScavSlotsNum = + std::max(MaxScavSlotsNum, ScavSlotsADDIScalableObject); + } + } + if (MaxScavSlotsNum == MaxScavSlotsNumKnown) + return MaxScavSlotsNumKnown; + } + return MaxScavSlotsNum; } void RISCVFrameLowering::processFunctionBeforeFrameFinalized( @@ -890,9 +995,17 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( const TargetRegisterClass *RC = &RISCV::GPRRegClass; auto *RVFI = MF.getInfo(); - int64_t RVVStackSize = assignRVVStackObjectOffsets(MFI); + int64_t RVVStackSize; + Align RVVStackAlign; + std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MFI); + RVFI->setRVVStackSize(RVVStackSize); - const RISCVInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + RVFI->setRVVStackAlign(RVVStackAlign); + + // Ensure the entire stack is aligned to at least the RVV requirement: some + // scalable-vector object alignments are not considered by the + // target-independent code. + MFI.ensureMaxAlignment(RVVStackAlign); // estimateStackSize has been observed to under-estimate the final stack // size, so give ourselves wiggle-room by checking for stack size @@ -903,17 +1016,14 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( // RVV loads & stores have no capacity to hold the immediate address offsets // so we must always reserve an emergency spill slot if the MachineFunction // contains any RVV spills. - if (!isInt<11>(MFI.estimateStackSize(MF)) || hasRVVSpillWithFIs(MF, TII)) { - int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC), - RegInfo->getSpillAlign(*RC), false); - RS->addScavengingFrameIndex(RegScavFI); - // For RVV, scalable stack offsets require up to two scratch registers to - // compute the final offset. Reserve an additional emergency spill slot. 
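// [Editorial sketch, not part of the patch] The scan in getScavSlotsNumForRVV
// above caps the answer at a statically known maximum so it can stop early.
// The same pattern in miniature, with illustrative names and std::vector
// standing in for the MachineFunction walk:
#include <algorithm>
#include <vector>

unsigned maxNeededScavSlots(const std::vector<unsigned> &PerInstNeeds) {
  constexpr unsigned MaxKnown = 2; // largest demand any instruction can have
  unsigned Max = 0;
  for (unsigned N : PerInstNeeds) {
    Max = std::max(Max, N);
    if (Max == MaxKnown)
      return MaxKnown; // no later instruction can raise the maximum
  }
  return Max;
}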
- if (RVVStackSize != 0) { - int RVVRegScavFI = MFI.CreateStackObject( - RegInfo->getSpillSize(*RC), RegInfo->getSpillAlign(*RC), false); - RS->addScavengingFrameIndex(RVVRegScavFI); - } + unsigned ScavSlotsNum = 0; + if (!isInt<11>(MFI.estimateStackSize(MF))) + ScavSlotsNum = 1; + + ScavSlotsNum = std::max(ScavSlotsNum, getScavSlotsNumForRVV(MF)); + for (unsigned i = 0; i < ScavSlotsNum; i++) { + RS->addScavengingFrameIndex(MFI.CreateStackObject( + RegInfo->getSpillSize(*RC), RegInfo->getSpillAlign(*RC), false)); } if (MFI.getCalleeSavedInfo().empty() || RVFI->useSaveRestoreLibCalls(MF)) { @@ -930,14 +1040,6 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( Size += MFI.getObjectSize(FrameIdx); } RVFI->setCalleeSavedStackSize(Size); - - // Padding required to keep the RVV stack aligned to 8 bytes - // within the main stack. We only need this when not using FP. - if (RVVStackSize && !hasFP(MF) && Size % 8 != 0) { - // Because we add the padding to the size of the stack, adding - // getStackAlign() will keep it aligned. - RVFI->setRVVPadding(getStackAlign().value()); - } } static bool hasRVVFrameObject(const MachineFunction &MF) { @@ -1012,23 +1114,23 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { const auto *RVFI = MF.getInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const std::vector &CSI = MFI.getCalleeSavedInfo(); - uint64_t StackSize = MFI.getStackSize(); + uint64_t StackSize = getStackSizeWithRVVPadding(MF); - // Disable SplitSPAdjust if save-restore libcall used. The callee saved + // Disable SplitSPAdjust if save-restore libcall is used. The callee-saved // registers will be pushed by the save-restore libcalls, so we don't have to // split the SP adjustment in this case. if (RVFI->getLibCallStackSize()) return 0; - // Return the FirstSPAdjustAmount if the StackSize can not fit in signed - // 12-bit and there exists a callee saved register need to be pushed. + // Return the FirstSPAdjustAmount if the StackSize can not fit in a signed + // 12-bit and there exists a callee-saved register needing to be pushed. if (!isInt<12>(StackSize) && (CSI.size() > 0)) { - // FirstSPAdjustAmount is choosed as (2048 - StackAlign) - // because 2048 will cause sp = sp + 2048 in epilogue split into - // multi-instructions. The offset smaller than 2048 can fit in signle - // load/store instruction and we have to stick with the stack alignment. - // 2048 is 16-byte alignment. The stack alignment for RV32 and RV64 is 16, - // for RV32E is 4. So (2048 - StackAlign) will satisfy the stack alignment. + // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 will + // cause sp = sp + 2048 in the epilogue to be split into multiple + // instructions. Offsets smaller than 2048 can fit in a single load/store + // instruction, and we have to stick with the stack alignment. 2048 has + // 16-byte alignment. The stack alignment for RV32 and RV64 is 16 and for + // RV32E it is 4. So (2048 - StackAlign) will satisfy the stack alignment. 
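// [Editorial worked example, not part of the patch] With the usual 16-byte
// stack alignment the first SP adjustment is 2048 - 16 = 2032: it fits in a
// simm12, keeps sp 16-byte aligned, and leaves the remainder for the second
// adjustment. A compile-time check of that reasoning:
static_assert(2048 - 16 == 2032 && 2032 % 16 == 0 && 2032 <= 2047,
              "first SP adjustment fits simm12 and preserves alignment");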
     return 2048 - getStackAlign().value();
   }
   return 0;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 1e94e34acf2f..466cd059b749 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -30,6 +30,8 @@ public:
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  uint64_t getStackSizeWithRVVPadding(const MachineFunction &MF) const;
+
   StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
                                      Register &FrameReg) const override;
@@ -81,7 +83,8 @@ private:
   void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                          int64_t Amount, MachineInstr::MIFlag Flag) const;
-  int64_t assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const;
+  std::pair<int64_t, Align>
+  assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const;
 };
 }
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index ba91b16661a4..2410cc1f8859 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -37,6 +37,11 @@ class RISCVGatherScatterLowering : public FunctionPass {
   SmallVector MaybeDeadPHIs;
+  // Cache of the BasePtr and Stride determined from a GEP. When a GEP is
+  // used by multiple gathers/scatters, this allows us to reuse the scalar
+  // instructions we created for the first gather/scatter for the others.
+  DenseMap<GetElementPtrInst *, std::pair<Value *, Value *>> StridedAddrs;
+
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -323,15 +328,19 @@
 std::pair<Value *, Value *>
 RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP,
                                                    IRBuilder<> &Builder) {
+  auto I = StridedAddrs.find(GEP);
+  if (I != StridedAddrs.end())
+    return I->second;
+
   SmallVector Ops(GEP->operands());
   // Base pointer needs to be a scalar.
   if (Ops[0]->getType()->isVectorTy())
     return std::make_pair(nullptr, nullptr);
-  // Make sure we're in a loop and it is in loop simplify form.
+  // Make sure we're in a loop that has a pre-header and a single latch.
   Loop *L = LI->getLoopFor(GEP->getParent());
-  if (!L || !L->isLoopSimplifyForm())
+  if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
     return std::make_pair(nullptr, nullptr);
   Optional<unsigned> VecOperand;
@@ -387,13 +396,6 @@ RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP,
   Value *BasePtr =
       Builder.CreateGEP(SourceTy, Ops[0], makeArrayRef(Ops).drop_front());
-  // Cast the GEP to an i8*.
-  LLVMContext &Ctx = GEP->getContext();
-  Type *I8PtrTy =
-      Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
-  if (BasePtr->getType() != I8PtrTy)
-    BasePtr = Builder.CreatePointerCast(BasePtr, I8PtrTy);
-
   // Final adjustments to stride should go in the start block.
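// [Editorial sketch, not part of the patch] The StridedAddrs map above is a
// plain memoization cache: determineBaseAndStride computes (base, stride)
// once per GEP and later callers reuse the cached scalar instructions. In
// miniature, with std::map standing in for llvm::DenseMap and an opaque
// Value type:
#include <map>
#include <utility>

struct Value; // opaque, illustrative
using BaseAndStride = std::pair<Value *, Value *>;

BaseAndStride memoized(const void *GEP,
                       std::map<const void *, BaseAndStride> &Cache,
                       BaseAndStride (*Compute)(const void *)) {
  auto It = Cache.find(GEP);
  if (It != Cache.end())
    return It->second;            // reuse instructions built for an earlier use
  BaseAndStride R = Compute(GEP); // expensive path runs once per GEP
  Cache.emplace(GEP, R);
  return R;
}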
Builder.SetInsertPoint( BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator()); @@ -406,7 +408,9 @@ RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP, if (TypeScale != 1) Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale)); - return std::make_pair(BasePtr, Stride); + auto P = std::make_pair(BasePtr, Stride); + StridedAddrs[GEP] = P; + return P; } bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II, @@ -468,6 +472,8 @@ bool RISCVGatherScatterLowering::runOnFunction(Function &F) { DL = &F.getParent()->getDataLayout(); LI = &getAnalysis().getLoopInfo(); + StridedAddrs.clear(); + SmallVector Gathers; SmallVector Scatters; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 6f77428ae721..cfaafc7b53d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -37,6 +37,7 @@ namespace RISCV { #define GET_RISCVVSETable_IMPL #define GET_RISCVVLXTable_IMPL #define GET_RISCVVSXTable_IMPL +#define GET_RISCVMaskedPseudosTable_IMPL #include "RISCVGenSearchableTables.inc" } // namespace RISCV } // namespace llvm @@ -47,17 +48,36 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { I != E;) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. + // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point + // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. + if (N->getOpcode() == ISD::SPLAT_VECTOR) { + MVT VT = N->getSimpleValueType(0); + unsigned Opc = + VT.isInteger() ? RISCVISD::VMV_V_X_VL : RISCVISD::VFMV_V_F_VL; + SDLoc DL(N); + SDValue VL = CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()); + SDValue Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), + N->getOperand(0), VL); + + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + ++I; + CurDAG->DeleteNode(N); + continue; + } + // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector // load. Done after lowering and combining so that we have a chance to // optimize this to VMV_V_X_VL when the upper bits aren't needed. 
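// [Editorial worked example, not part of the patch] On RV32, the splat value
// for SPLAT_VECTOR_SPLIT_I64_VL is assembled in memory from the two 32-bit
// halves and then broadcast by a stride-0 vector load, which reads the same
// address for every element:
#include <cstdint>
static_assert((static_cast<uint64_t>(0x11223344u) << 32 | 0xAABBCCDDu) ==
                  0x11223344AABBCCDDull,
              "hi/lo halves assemble the 64-bit splat value");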
if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) continue; - assert(N->getNumOperands() == 3 && "Unexpected number of operands"); + assert(N->getNumOperands() == 4 && "Unexpected number of operands"); MVT VT = N->getSimpleValueType(0); - SDValue Lo = N->getOperand(0); - SDValue Hi = N->getOperand(1); - SDValue VL = N->getOperand(2); + SDValue Passthru = N->getOperand(0); + SDValue Lo = N->getOperand(1); + SDValue Hi = N->getOperand(2); + SDValue VL = N->getOperand(3); assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && "Unexpected VTs!"); @@ -88,7 +108,7 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); SDValue Ops[] = {Chain, IntID, - CurDAG->getUNDEF(VT), + Passthru, StackSlot, CurDAG->getRegister(RISCV::X0, MVT::i64), VL}; @@ -112,6 +132,7 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { } void RISCVDAGToDAGISel::PostprocessISelDAG() { + HandleSDNode Dummy(CurDAG->getRoot()); SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); bool MadeChange = false; @@ -123,57 +144,70 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { MadeChange |= doPeepholeSExtW(N); MadeChange |= doPeepholeLoadStoreADDI(N); + MadeChange |= doPeepholeMaskedRVV(N); } + CurDAG->setRoot(Dummy.getValue()); + if (MadeChange) CurDAG->RemoveDeadNodes(); } -static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL, - const MVT VT, int64_t Imm, - const RISCVSubtarget &Subtarget) { - assert(VT == MVT::i64 && "Expecting MVT::i64"); - const RISCVTargetLowering *TLI = Subtarget.getTargetLowering(); - ConstantPoolSDNode *CP = cast(CurDAG->getConstantPool( - ConstantInt::get(EVT(VT).getTypeForEVT(*CurDAG->getContext()), Imm), VT)); - SDValue Addr = TLI->getAddr(CP, *CurDAG); - SDValue Offset = CurDAG->getTargetConstant(0, DL, VT); - // Since there is no data race, the chain can be the entry node. - SDNode *Load = CurDAG->getMachineNode(RISCV::LD, DL, VT, Addr, Offset, - CurDAG->getEntryNode()); - MachineFunction &MF = CurDAG->getMachineFunction(); - MachineMemOperand *MemOp = MF.getMachineMemOperand( - MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, - LLT(VT), CP->getAlign()); - CurDAG->setNodeMemRefs(cast(Load), {MemOp}); - return Load; -} - -static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, - int64_t Imm, const RISCVSubtarget &Subtarget) { - MVT XLenVT = Subtarget.getXLenVT(); - RISCVMatInt::InstSeq Seq = - RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits()); +// Returns true if N is a MachineSDNode that has a reg and simm12 memory +// operand. The indices of the base pointer and offset are returned in BaseOpIdx +// and OffsetOpIdx. +static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx, + unsigned &OffsetOpIdx) { + switch (N->getMachineOpcode()) { + case RISCV::LB: + case RISCV::LH: + case RISCV::LW: + case RISCV::LBU: + case RISCV::LHU: + case RISCV::LWU: + case RISCV::LD: + case RISCV::FLH: + case RISCV::FLW: + case RISCV::FLD: + BaseOpIdx = 0; + OffsetOpIdx = 1; + return true; + case RISCV::SB: + case RISCV::SH: + case RISCV::SW: + case RISCV::SD: + case RISCV::FSH: + case RISCV::FSW: + case RISCV::FSD: + BaseOpIdx = 1; + OffsetOpIdx = 2; + return true; + } - // If Imm is expensive to build, then we put it into constant pool. 
- if (Subtarget.useConstantPoolForLargeInts() && - Seq.size() > Subtarget.getMaxBuildIntsCost()) - return selectImmWithConstantPool(CurDAG, DL, VT, Imm, Subtarget); + return false; +} +static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, + RISCVMatInt::InstSeq &Seq) { SDNode *Result = nullptr; - SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); + SDValue SrcReg = CurDAG->getRegister(RISCV::X0, VT); for (RISCVMatInt::Inst &Inst : Seq) { - SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); - if (Inst.Opc == RISCV::LUI) - Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm); - else if (Inst.Opc == RISCV::ADD_UW) - Result = CurDAG->getMachineNode(RISCV::ADD_UW, DL, XLenVT, SrcReg, - CurDAG->getRegister(RISCV::X0, XLenVT)); - else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD || - Inst.Opc == RISCV::SH3ADD) - Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SrcReg); - else - Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm); + SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, VT); + switch (Inst.getOpndKind()) { + case RISCVMatInt::Imm: + Result = CurDAG->getMachineNode(Inst.Opc, DL, VT, SDImm); + break; + case RISCVMatInt::RegX0: + Result = CurDAG->getMachineNode(Inst.Opc, DL, VT, SrcReg, + CurDAG->getRegister(RISCV::X0, VT)); + break; + case RISCVMatInt::RegReg: + Result = CurDAG->getMachineNode(Inst.Opc, DL, VT, SrcReg, SrcReg); + break; + case RISCVMatInt::RegImm: + Result = CurDAG->getMachineNode(Inst.Opc, DL, VT, SrcReg, SDImm); + break; + } // Only the first instruction has X0 as its source. SrcReg = SDValue(Result, 0); @@ -182,51 +216,28 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, return Result; } -static SDValue createTupleImpl(SelectionDAG &CurDAG, ArrayRef Regs, - unsigned RegClassID, unsigned SubReg0) { - assert(Regs.size() >= 2 && Regs.size() <= 8); - - SDLoc DL(Regs[0]); - SmallVector Ops; - - Ops.push_back(CurDAG.getTargetConstant(RegClassID, DL, MVT::i32)); +static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, + int64_t Imm, const RISCVSubtarget &Subtarget) { + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits()); - for (unsigned I = 0; I < Regs.size(); ++I) { - Ops.push_back(Regs[I]); - Ops.push_back(CurDAG.getTargetConstant(SubReg0 + I, DL, MVT::i32)); - } - SDNode *N = - CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); - return SDValue(N, 0); + return selectImmSeq(CurDAG, DL, VT, Seq); } -static SDValue createM1Tuple(SelectionDAG &CurDAG, ArrayRef Regs, - unsigned NF) { - static const unsigned RegClassIDs[] = { +static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef Regs, + unsigned NF, RISCVII::VLMUL LMUL) { + static const unsigned M1TupleRegClassIDs[] = { RISCV::VRN2M1RegClassID, RISCV::VRN3M1RegClassID, RISCV::VRN4M1RegClassID, RISCV::VRN5M1RegClassID, RISCV::VRN6M1RegClassID, RISCV::VRN7M1RegClassID, RISCV::VRN8M1RegClassID}; + static const unsigned M2TupleRegClassIDs[] = {RISCV::VRN2M2RegClassID, + RISCV::VRN3M2RegClassID, + RISCV::VRN4M2RegClassID}; - return createTupleImpl(CurDAG, Regs, RegClassIDs[NF - 2], RISCV::sub_vrm1_0); -} - -static SDValue createM2Tuple(SelectionDAG &CurDAG, ArrayRef Regs, - unsigned NF) { - static const unsigned RegClassIDs[] = {RISCV::VRN2M2RegClassID, - RISCV::VRN3M2RegClassID, - RISCV::VRN4M2RegClassID}; - - return createTupleImpl(CurDAG, Regs, RegClassIDs[NF - 2], RISCV::sub_vrm2_0); -} - -static SDValue 
createM4Tuple(SelectionDAG &CurDAG, ArrayRef Regs, - unsigned NF) { - return createTupleImpl(CurDAG, Regs, RISCV::VRN2M4RegClassID, - RISCV::sub_vrm4_0); -} + assert(Regs.size() >= 2 && Regs.size() <= 8); -static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef Regs, - unsigned NF, RISCVII::VLMUL LMUL) { + unsigned RegClassID; + unsigned SubReg0; switch (LMUL) { default: llvm_unreachable("Invalid LMUL."); @@ -234,12 +245,37 @@ static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef Regs, case RISCVII::VLMUL::LMUL_F4: case RISCVII::VLMUL::LMUL_F2: case RISCVII::VLMUL::LMUL_1: - return createM1Tuple(CurDAG, Regs, NF); + static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7, + "Unexpected subreg numbering"); + SubReg0 = RISCV::sub_vrm1_0; + RegClassID = M1TupleRegClassIDs[NF - 2]; + break; case RISCVII::VLMUL::LMUL_2: - return createM2Tuple(CurDAG, Regs, NF); + static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3, + "Unexpected subreg numbering"); + SubReg0 = RISCV::sub_vrm2_0; + RegClassID = M2TupleRegClassIDs[NF - 2]; + break; case RISCVII::VLMUL::LMUL_4: - return createM4Tuple(CurDAG, Regs, NF); + static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1, + "Unexpected subreg numbering"); + SubReg0 = RISCV::sub_vrm4_0; + RegClassID = RISCV::VRN2M4RegClassID; + break; + } + + SDLoc DL(Regs[0]); + SmallVector Ops; + + Ops.push_back(CurDAG.getTargetConstant(RegClassID, DL, MVT::i32)); + + for (unsigned I = 0; I < Regs.size(); ++I) { + Ops.push_back(Regs[I]); + Ops.push_back(CurDAG.getTargetConstant(SubReg0 + I, DL, MVT::i32)); } + SDNode *N = + CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); } void RISCVDAGToDAGISel::addVectorLoadStoreOperands( @@ -287,6 +323,10 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( Operands.push_back(Glue); } +static bool isAllUndef(ArrayRef Values) { + return llvm::all_of(Values, [](SDValue V) { return V->isUndef(); }); +} + void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, bool IsMasked, bool IsStrided) { SDLoc DL(Node); @@ -297,19 +337,21 @@ void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, bool IsMasked, unsigned CurOp = 2; SmallVector Operands; - if (IsMasked) { - SmallVector Regs(Node->op_begin() + CurOp, - Node->op_begin() + CurOp + NF); - SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL); - Operands.push_back(MaskedOff); - CurOp += NF; + + SmallVector Regs(Node->op_begin() + CurOp, + Node->op_begin() + CurOp + NF); + bool IsTU = IsMasked || !isAllUndef(Regs); + if (IsTU) { + SDValue Merge = createTuple(*CurDAG, Regs, NF, LMUL); + Operands.push_back(Merge); } + CurOp += NF; addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided, Operands, /*IsLoad=*/true); const RISCV::VLSEGPseudo *P = - RISCV::getVLSEGPseudo(NF, IsMasked, IsStrided, /*FF*/ false, Log2SEW, + RISCV::getVLSEGPseudo(NF, IsMasked, IsTU, IsStrided, /*FF*/ false, Log2SEW, static_cast(LMUL)); MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands); @@ -338,25 +380,25 @@ void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node, bool IsMasked) { unsigned CurOp = 2; SmallVector Operands; - if (IsMasked) { - SmallVector Regs(Node->op_begin() + CurOp, - Node->op_begin() + CurOp + NF); + + SmallVector Regs(Node->op_begin() + CurOp, + Node->op_begin() + CurOp + NF); + bool IsTU = IsMasked || !isAllUndef(Regs); + if (IsTU) { SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL); Operands.push_back(MaskedOff); - CurOp += NF; } + CurOp += NF; addVectorLoadStoreOperands(Node, 
Log2SEW, DL, CurOp, IsMasked, /*IsStridedOrIndexed*/ false, Operands, /*IsLoad=*/true); const RISCV::VLSEGPseudo *P = - RISCV::getVLSEGPseudo(NF, IsMasked, /*Strided*/ false, /*FF*/ true, + RISCV::getVLSEGPseudo(NF, IsMasked, IsTU, /*Strided*/ false, /*FF*/ true, Log2SEW, static_cast(LMUL)); MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, - MVT::Other, MVT::Glue, Operands); - SDNode *ReadVL = CurDAG->getMachineNode(RISCV::PseudoReadVL, DL, XLenVT, - /*Glue*/ SDValue(Load, 2)); + XLenVT, MVT::Other, Operands); if (auto *MemOp = dyn_cast(Node)) CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()}); @@ -368,8 +410,8 @@ void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node, bool IsMasked) { CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, SuperReg)); } - ReplaceUses(SDValue(Node, NF), SDValue(ReadVL, 0)); // VL - ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 1)); // Chain + ReplaceUses(SDValue(Node, NF), SDValue(Load, 1)); // VL + ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 2)); // Chain CurDAG->RemoveDeadNode(Node); } @@ -383,13 +425,15 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked, unsigned CurOp = 2; SmallVector Operands; - if (IsMasked) { - SmallVector Regs(Node->op_begin() + CurOp, - Node->op_begin() + CurOp + NF); + + SmallVector Regs(Node->op_begin() + CurOp, + Node->op_begin() + CurOp + NF); + bool IsTU = IsMasked || !isAllUndef(Regs); + if (IsTU) { SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL); Operands.push_back(MaskedOff); - CurOp += NF; } + CurOp += NF; MVT IndexVT; addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, @@ -406,7 +450,7 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked, "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( - NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast(LMUL), + NF, IsMasked, IsTU, IsOrdered, IndexLog2EEW, static_cast(LMUL), static_cast(IndexLMUL)); MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands); @@ -596,32 +640,125 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { int64_t Imm = ConstNode->getSExtValue(); // If the upper XLen-16 bits are not used, try to convert this to a simm12 // by sign extending bit 15. - if (isUInt<16>(Imm) && isInt<12>(SignExtend64(Imm, 16)) && + if (isUInt<16>(Imm) && isInt<12>(SignExtend64<16>(Imm)) && hasAllHUsers(Node)) - Imm = SignExtend64(Imm, 16); + Imm = SignExtend64<16>(Imm); // If the upper 32-bits are not used try to convert this into a simm32 by // sign extending bit 32. if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) - Imm = SignExtend64(Imm, 32); + Imm = SignExtend64<32>(Imm); ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget)); return; } - case ISD::FrameIndex: { - SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT); - int FI = cast(Node)->getIndex(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); - ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm)); + case ISD::ADD: { + // Try to select ADD + immediate used as memory addresses to + // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by + // doPeepholeLoadStoreADDI. + + // LHS should be an immediate. + auto *N1C = dyn_cast(Node->getOperand(1)); + if (!N1C) + break; + + int64_t Offset = N1C->getSExtValue(); + int64_t Lo12 = SignExtend64<12>(Offset); + + // Don't do this if the lower 12 bits are 0 or we could use ADDI directly. 
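// [Editorial worked example, not part of the patch] SignExtend64<12> treats
// the low 12 bits of the offset as a signed value, so Hi = Offset - Lo12
// always has its low 12 bits clear and needs no trailing ADDI. A standalone
// model (two's complement conversions, guaranteed since C++20):
#include <cstdint>
constexpr int64_t signExtend12(uint64_t X) {
  return static_cast<int64_t>(X << 52) >> 52; // keep low 12 bits, sign-extend
}
static_assert(signExtend12(0x1801) == -2047, "0x801 sign-extends to -2047");
static_assert(0x1801 - signExtend12(0x1801) == 0x2000,
              "the Hi part has its low 12 bits clear");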
+ if (Lo12 == 0 || isInt<12>(Offset)) + break; + + // Don't do this if we can use a pair of ADDIs. + if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2)) + break; + + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(Offset, Subtarget->getFeatureBits()); + + Offset -= Lo12; + // Restore sign bits for RV32. + if (!Subtarget->is64Bit()) + Offset = SignExtend64<32>(Offset); + + // We can fold if the last operation is an ADDI or its an ADDIW that could + // be treated as an ADDI. + if (Seq.back().Opc != RISCV::ADDI && + !(Seq.back().Opc == RISCV::ADDIW && isInt<32>(Offset))) + break; + assert(Seq.back().Imm == Lo12 && "Expected immediate to match Lo12"); + // Drop the last operation. + Seq.pop_back(); + assert(!Seq.empty() && "Expected more instructions in sequence"); + + bool AllPointerUses = true; + for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + + // Is this user a memory instruction that uses a register and immediate + // that has this ADD as its pointer. + unsigned BaseOpIdx, OffsetOpIdx; + if (!User->isMachineOpcode() || + !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) || + UI.getOperandNo() != BaseOpIdx) { + AllPointerUses = false; + break; + } + + // If the memory instruction already has an offset, make sure the combined + // offset is foldable. + int64_t MemOffs = + cast(User->getOperand(OffsetOpIdx))->getSExtValue(); + MemOffs += Lo12; + if (!isInt<12>(MemOffs)) { + AllPointerUses = false; + break; + } + } + + if (!AllPointerUses) + break; + + // Emit (ADDI (ADD X, Hi), Lo) + SDNode *Imm = selectImmSeq(CurDAG, DL, VT, Seq); + SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT, + Node->getOperand(0), SDValue(Imm, 0)); + SDNode *ADDI = + CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0), + CurDAG->getTargetConstant(Lo12, DL, VT)); + ReplaceNode(Node, ADDI); return; } + case ISD::SHL: { + auto *N1C = dyn_cast(Node->getOperand(1)); + if (!N1C) + break; + SDValue N0 = Node->getOperand(0); + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() || + !isa(N0.getOperand(1))) + break; + unsigned ShAmt = N1C->getZExtValue(); + uint64_t Mask = N0.getConstantOperandVal(1); + + // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) where C2 has + // 32 leading zeros and C3 trailing zeros. + if (ShAmt <= 32 && isShiftedMask_64(Mask)) { + unsigned XLen = Subtarget->getXLen(); + unsigned LeadingZeros = XLen - (64 - countLeadingZeros(Mask)); + unsigned TrailingZeros = countTrailingZeros(Mask); + if (TrailingZeros > 0 && LeadingZeros == 32) { + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(TrailingZeros, DL, VT)); + SDNode *SLLI = CurDAG->getMachineNode( + RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), + CurDAG->getTargetConstant(TrailingZeros + ShAmt, DL, VT)); + ReplaceNode(Node, SLLI); + return; + } + } + break; + } case ISD::SRL: { - // Optimize (srl (and X, C2), C) -> - // (srli (slli X, (XLen-C3), (XLen-C3) + C) - // Where C2 is a mask with C3 trailing ones. - // Taking into account that the C2 may have had lower bits unset by - // SimplifyDemandedBits. This avoids materializing the C2 immediate. - // This pattern occurs when type legalizing right shifts for types with - // less than XLen bits. 
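// [Editorial worked example, not part of the patch] For the SHL transform
// above, take C2 = 0xFFFFFF00 (32 leading zeros, C3 = 8 trailing zeros) and
// C = 4: (slli (srliw X, 8), 12) computes the same value as
// (shl (and X, C2), 4), and because C3 > 0 the srliw result has its sign bit
// clear, so the implicit sign extension is harmless:
static_assert(((0xAABBCCDDull & 0xFFFFFF00ull) << 4) ==
                  ((0xAABBCCDDull >> 8) << 12),
              "shl of masked value equals srliw+slli");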
auto *N1C = dyn_cast(Node->getOperand(1)); if (!N1C) break; @@ -631,6 +768,32 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; unsigned ShAmt = N1C->getZExtValue(); uint64_t Mask = N0.getConstantOperandVal(1); + + // Optimize (srl (and X, C2), C) -> (slli (srliw X, C3), C3-C) where C2 has + // 32 leading zeros and C3 trailing zeros. + if (isShiftedMask_64(Mask)) { + unsigned XLen = Subtarget->getXLen(); + unsigned LeadingZeros = XLen - (64 - countLeadingZeros(Mask)); + unsigned TrailingZeros = countTrailingZeros(Mask); + if (LeadingZeros == 32 && TrailingZeros > ShAmt) { + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(TrailingZeros, DL, VT)); + SDNode *SLLI = CurDAG->getMachineNode( + RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), + CurDAG->getTargetConstant(TrailingZeros - ShAmt, DL, VT)); + ReplaceNode(Node, SLLI); + return; + } + } + + // Optimize (srl (and X, C2), C) -> + // (srli (slli X, (XLen-C3), (XLen-C3) + C) + // Where C2 is a mask with C3 trailing ones. + // Taking into account that the C2 may have had lower bits unset by + // SimplifyDemandedBits. This avoids materializing the C2 immediate. + // This pattern occurs when type legalizing right shifts for types with + // less than XLen bits. Mask |= maskTrailingOnes(ShAmt); if (!isMask_64(Mask)) break; @@ -700,13 +863,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { uint64_t C1 = N1C->getZExtValue(); - // Keep track of whether this is a andi, zext.h, or zext.w. - bool ZExtOrANDI = isInt<12>(N1C->getSExtValue()); - if (C1 == UINT64_C(0xFFFF) && - (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) - ZExtOrANDI = true; - if (C1 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()) - ZExtOrANDI = true; + // Keep track of whether this is a c.andi. If we can't use c.andi, the + // shift pair might offer more compression opportunities. + // TODO: We could check for C extension here, but we don't have many lit + // tests with the C extension enabled so not checking gets better coverage. + // TODO: What if ANDI faster than shift? + bool IsCANDI = isInt<6>(N1C->getSExtValue()); // Clear irrelevant bits in the mask. if (LeftShift) @@ -727,9 +889,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (C2 < C3) { // If the number of leading zeros is C2+32 this can be SRLIW. if (C2 + 32 == C3) { - SDNode *SRLIW = - CurDAG->getMachineNode(RISCV::SRLIW, DL, XLenVT, X, - CurDAG->getTargetConstant(C2, DL, XLenVT)); + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SRLIW); return; } @@ -739,27 +900,33 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // // This pattern occurs when (i32 (srl (sra 31), c3 - 32)) is type // legalized and goes through DAG combine. - SDValue Y; if (C2 >= 32 && (C3 - C2) == 1 && N0.hasOneUse() && - selectSExti32(X, Y)) { + X.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast(X.getOperand(1))->getVT() == MVT::i32) { SDNode *SRAIW = - CurDAG->getMachineNode(RISCV::SRAIW, DL, XLenVT, Y, - CurDAG->getTargetConstant(31, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SRAIW, DL, VT, X.getOperand(0), + CurDAG->getTargetConstant(31, DL, VT)); SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, XLenVT, SDValue(SRAIW, 0), - CurDAG->getTargetConstant(C3 - 32, DL, XLenVT)); + RISCV::SRLIW, DL, VT, SDValue(SRAIW, 0), + CurDAG->getTargetConstant(C3 - 32, DL, VT)); ReplaceNode(Node, SRLIW); return; } // (srli (slli x, c3-c2), c3). 
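// [Editorial worked example, not part of the patch] For the relocated SRL
// comment above: with C2 = 0xFFFF (C3 = 16 trailing ones), C = 4 and
// XLen = 64, (srli (slli X, 48), 52) equals (srl (and X, 0xFFFF), 4); the
// slli discards exactly the bits the mask would have cleared:
static_assert(((0x12345678ull & 0xFFFFull) >> 4) ==
                  ((0x12345678ull << 48) >> 52),
              "and+srl equals slli+srli");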
- if (OneUseOrZExtW && !ZExtOrANDI) { + // Skip if we could use (zext.w (sraiw X, C2)). + bool Skip = Subtarget->hasStdExtZba() && C3 == 32 && + X.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast(X.getOperand(1))->getVT() == MVT::i32; + // Also Skip if we can use bexti. + Skip |= Subtarget->hasStdExtZbs() && C3 == XLen - 1; + if (OneUseOrZExtW && !Skip) { SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, XLenVT, X, - CurDAG->getTargetConstant(C3 - C2, DL, XLenVT)); + RISCV::SLLI, DL, VT, X, + CurDAG->getTargetConstant(C3 - C2, DL, VT)); SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, XLenVT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -775,21 +942,20 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { C1 == (maskTrailingOnes(XLen - (C2 + C3)) << C2)) { // Use slli.uw when possible. if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) { - SDNode *SLLI_UW = - CurDAG->getMachineNode(RISCV::SLLI_UW, DL, XLenVT, X, - CurDAG->getTargetConstant(C2, DL, XLenVT)); + SDNode *SLLI_UW = CurDAG->getMachineNode( + RISCV::SLLI_UW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SLLI_UW); return; } // (srli (slli c2+c3), c3) - if (OneUseOrZExtW && !ZExtOrANDI) { + if (OneUseOrZExtW && !IsCANDI) { SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, XLenVT, X, - CurDAG->getTargetConstant(C2 + C3, DL, XLenVT)); + RISCV::SLLI, DL, VT, X, + CurDAG->getTargetConstant(C2 + C3, DL, VT)); SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, XLenVT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -801,25 +967,31 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!LeftShift && isShiftedMask_64(C1)) { uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); uint64_t C3 = countTrailingZeros(C1); - if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !ZExtOrANDI) { + if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !IsCANDI) { + unsigned SrliOpc = RISCV::SRLI; + // If the input is zexti32 we should use SRLIW. + if (X.getOpcode() == ISD::AND && isa(X.getOperand(1)) && + X.getConstantOperandVal(1) == UINT64_C(0xFFFFFFFF)) { + SrliOpc = RISCV::SRLIW; + X = X.getOperand(0); + } SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, XLenVT, X, - CurDAG->getTargetConstant(C2 + C3, DL, XLenVT)); + SrliOpc, DL, VT, X, CurDAG->getTargetConstant(C2 + C3, DL, VT)); SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SLLI); return; } // If the leading zero count is C2+32, we can use SRLIW instead of SRLI. 
if (Leading > 32 && (Leading - 32) == C2 && C2 + C3 < 32 && - OneUseOrZExtW && !ZExtOrANDI) { - SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, XLenVT, X, - CurDAG->getTargetConstant(C2 + C3, DL, XLenVT)); + OneUseOrZExtW && !IsCANDI) { + SDNode *SRLIW = + CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(C2 + C3, DL, VT)); SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -830,24 +1002,23 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (LeftShift && isShiftedMask_64(C1)) { uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); uint64_t C3 = countTrailingZeros(C1); - if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !ZExtOrANDI) { + if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !IsCANDI) { SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, XLenVT, X, - CurDAG->getTargetConstant(C3 - C2, DL, XLenVT)); + RISCV::SRLI, DL, VT, X, CurDAG->getTargetConstant(C3 - C2, DL, VT)); SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SLLI); return; } // If we have (32-C2) leading zeros, we can use SRLIW instead of SRLI. - if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !ZExtOrANDI) { - SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, XLenVT, X, - CurDAG->getTargetConstant(C3 - C2, DL, XLenVT)); + if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) { + SDNode *SRLIW = + CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(C3 - C2, DL, VT)); SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, XLenVT)); + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), + CurDAG->getTargetConstant(C3, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -908,7 +1079,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { uint64_t ShiftedC1 = C1 << ConstantShift; // If this RV32, we need to sign extend the constant. if (XLen == 32) - ShiftedC1 = SignExtend64(ShiftedC1, 32); + ShiftedC1 = SignExtend64<32>(ShiftedC1); // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))). SDNode *Imm = selectImm(CurDAG, DL, VT, ShiftedC1, *Subtarget); @@ -1005,45 +1176,44 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { } MVT Src1VT = Src1.getSimpleValueType(); unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode, - VMSetOpcode, VMANDOpcode; + VMOROpcode; switch (RISCVTargetLowering::getLMUL(Src1VT)) { default: llvm_unreachable("Unexpected LMUL!"); -#define CASE_VMSLT_VMSET_OPCODES(lmulenum, suffix, suffix_b) \ +#define CASE_VMSLT_OPCODES(lmulenum, suffix, suffix_b) \ case RISCVII::VLMUL::lmulenum: \ VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \ : RISCV::PseudoVMSLT_VX_##suffix; \ VMSLTMaskOpcode = IsUnsigned ? 
RISCV::PseudoVMSLTU_VX_##suffix##_MASK \ : RISCV::PseudoVMSLT_VX_##suffix##_MASK; \ - VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b; \ break; - CASE_VMSLT_VMSET_OPCODES(LMUL_F8, MF8, B1) - CASE_VMSLT_VMSET_OPCODES(LMUL_F4, MF4, B2) - CASE_VMSLT_VMSET_OPCODES(LMUL_F2, MF2, B4) - CASE_VMSLT_VMSET_OPCODES(LMUL_1, M1, B8) - CASE_VMSLT_VMSET_OPCODES(LMUL_2, M2, B16) - CASE_VMSLT_VMSET_OPCODES(LMUL_4, M4, B32) - CASE_VMSLT_VMSET_OPCODES(LMUL_8, M8, B64) -#undef CASE_VMSLT_VMSET_OPCODES + CASE_VMSLT_OPCODES(LMUL_F8, MF8, B1) + CASE_VMSLT_OPCODES(LMUL_F4, MF4, B2) + CASE_VMSLT_OPCODES(LMUL_F2, MF2, B4) + CASE_VMSLT_OPCODES(LMUL_1, M1, B8) + CASE_VMSLT_OPCODES(LMUL_2, M2, B16) + CASE_VMSLT_OPCODES(LMUL_4, M4, B32) + CASE_VMSLT_OPCODES(LMUL_8, M8, B64) +#undef CASE_VMSLT_OPCODES } // Mask operations use the LMUL from the mask type. switch (RISCVTargetLowering::getLMUL(VT)) { default: llvm_unreachable("Unexpected LMUL!"); -#define CASE_VMXOR_VMANDN_VMAND_OPCODES(lmulenum, suffix) \ +#define CASE_VMXOR_VMANDN_VMOR_OPCODES(lmulenum, suffix) \ case RISCVII::VLMUL::lmulenum: \ VMXOROpcode = RISCV::PseudoVMXOR_MM_##suffix; \ VMANDNOpcode = RISCV::PseudoVMANDN_MM_##suffix; \ - VMANDOpcode = RISCV::PseudoVMAND_MM_##suffix; \ + VMOROpcode = RISCV::PseudoVMOR_MM_##suffix; \ break; - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F8, MF8) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F4, MF4) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F2, MF2) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_1, M1) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_2, M2) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_4, M4) - CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_8, M8) -#undef CASE_VMXOR_VMANDN_VMAND_OPCODES + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F8, MF8) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F4, MF4) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F2, MF2) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_1, M1) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_2, M2) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_4, M4) + CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_8, M8) +#undef CASE_VMXOR_VMANDN_VMOR_OPCODES } SDValue SEW = CurDAG->getTargetConstant( Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT); @@ -1053,12 +1223,17 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDValue MaskedOff = Node->getOperand(1); SDValue Mask = Node->getOperand(4); - // If vmsgeu_mask with 0 immediate, expand it to {vmset, vmand}. + // If vmsgeu_mask with 0 immediate, expand it to vmor mask, maskedoff. if (IsCmpUnsignedZero) { - SDValue VMSet = - SDValue(CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW), 0); - ReplaceNode(Node, CurDAG->getMachineNode(VMANDOpcode, DL, VT, - {Mask, VMSet, VL, MaskSEW})); + // We don't need vmor if the MaskedOff and the Mask are the same + // value. + if (Mask == MaskedOff) { + ReplaceUses(Node, Mask.getNode()); + return; + } + ReplaceNode(Node, + CurDAG->getMachineNode(VMOROpcode, DL, VT, + {Mask, MaskedOff, VL, MaskSEW})); return; } @@ -1082,10 +1257,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Otherwise use // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0 + // The result is mask undisturbed. + // We use the same instructions to emulate mask agnostic behavior, because + // the agnostic result can be either undisturbed or all 1. SDValue Cmp = SDValue( CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}), 0); + // vmxor.mm vd, vd, v0 is used to update active value. 
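// [Editorial note, not part of the patch] The emulation relies on the
// per-lane identity (a >= b) == !(a < b); the final vmxor against the
// original mask complements exactly the active lanes that vmslt(u) wrote:
static_assert(((5 >= 3) == !(5 < 3)) && ((2 >= 3) == !(2 < 3)),
              "vmsge is the complement of vmslt on active lanes");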
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT, {Cmp, Mask, VL, MaskSEW})); return; @@ -1215,7 +1394,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned CurOp = 2; // Masked intrinsic only have TU version pseduo instructions. - bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef()); + bool IsTU = IsMasked || !Node->getOperand(CurOp).isUndef(); SmallVector Operands; if (IsTU) Operands.push_back(Node->getOperand(CurOp++)); @@ -1267,9 +1446,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // The riscv_vlm intrinsic are always tail agnostic and no passthru operand. bool HasPassthruOperand = IntNo != Intrinsic::riscv_vlm; // Masked intrinsic only have TU version pseduo instructions. - bool IsTU = - HasPassthruOperand && - ((!IsMasked && !Node->getOperand(CurOp).isUndef()) || IsMasked); + bool IsTU = HasPassthruOperand && + (IsMasked || !Node->getOperand(CurOp).isUndef()); SmallVector Operands; if (IsTU) Operands.push_back(Node->getOperand(CurOp++)); @@ -1302,7 +1480,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned CurOp = 2; // Masked intrinsic only have TU version pseduo instructions. - bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef()); + bool IsTU = IsMasked || !Node->getOperand(CurOp).isUndef(); SmallVector Operands; if (IsTU) Operands.push_back(Node->getOperand(CurOp++)); @@ -1318,19 +1496,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(IsMasked, IsTU, /*Strided*/ false, /*FF*/ true, Log2SEW, static_cast(LMUL)); - MachineSDNode *Load = - CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), - MVT::Other, MVT::Glue, Operands); - SDNode *ReadVL = CurDAG->getMachineNode(RISCV::PseudoReadVL, DL, XLenVT, - /*Glue*/ SDValue(Load, 2)); - + MachineSDNode *Load = CurDAG->getMachineNode( + P->Pseudo, DL, Node->getVTList(), Operands); if (auto *MemOp = dyn_cast(Node)) CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()}); - ReplaceUses(SDValue(Node, 0), SDValue(Load, 0)); - ReplaceUses(SDValue(Node, 1), SDValue(ReadVL, 0)); // VL - ReplaceUses(SDValue(Node, 2), SDValue(Load, 1)); // Chain - CurDAG->RemoveDeadNode(Node); + ReplaceNode(Node, Load); return; } } @@ -1610,9 +1781,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Try to match splat of a scalar load to a strided load with stride of x0. bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL || Node->getOpcode() == RISCVISD::VFMV_S_F_VL; - if (IsScalarMove && !Node->getOperand(0).isUndef()) + bool HasPassthruOperand = Node->getOpcode() != ISD::SPLAT_VECTOR; + if (HasPassthruOperand && !Node->getOperand(0).isUndef()) break; - SDValue Src = IsScalarMove ? Node->getOperand(1) : Node->getOperand(0); + SDValue Src = HasPassthruOperand ? 
Node->getOperand(1) : Node->getOperand(0); auto *Ld = dyn_cast(Src); if (!Ld) break; @@ -1634,7 +1806,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; selectVLOp(Node->getOperand(2), VL); } else - selectVLOp(Node->getOperand(1), VL); + selectVLOp(Node->getOperand(2), VL); unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); SDValue SEW = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT); @@ -1650,8 +1822,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands); - if (auto *MemOp = dyn_cast(Node)) - CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()}); + CurDAG->setNodeMemRefs(Load, {Ld->getMemOperand()}); ReplaceNode(Node, Load); return; @@ -1680,11 +1851,37 @@ bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand( return true; } -bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) { +bool RISCVDAGToDAGISel::SelectAddrFrameIndex(SDValue Addr, SDValue &Base, + SDValue &Offset) { if (auto *FIN = dyn_cast(Addr)) { Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT()); + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), Subtarget->getXLenVT()); return true; } + + return false; +} + +// Select a frame index and an optional immediate offset from an ADD or OR. +bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, + SDValue &Offset) { + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; + + if (!CurDAG->isBaseWithConstantOffset(Addr)) + return false; + + if (auto *FIN = dyn_cast(Addr.getOperand(0))) { + int64_t CVal = cast(Addr.getOperand(1))->getSExtValue(); + if (isInt<12>(CVal)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), + Subtarget->getXLenVT()); + Offset = CurDAG->getTargetConstant(CVal, SDLoc(Addr), + Subtarget->getXLenVT()); + return true; + } + } + return false; } @@ -1698,6 +1895,76 @@ bool RISCVDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) { return true; } +bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, + SDValue &Offset) { + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; + + SDLoc DL(Addr); + MVT VT = Addr.getSimpleValueType(); + + if (Addr.getOpcode() == RISCVISD::ADD_LO) { + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + return true; + } + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + int64_t CVal = cast(Addr.getOperand(1))->getSExtValue(); + if (isInt<12>(CVal)) { + Base = Addr.getOperand(0); + if (Base.getOpcode() == RISCVISD::ADD_LO) { + SDValue LoOperand = Base.getOperand(1); + if (auto *GA = dyn_cast(LoOperand)) { + // If the Lo in (ADD_LO hi, lo) is a global variable's address + // (its low part, really), then we can rely on the alignment of that + // variable to provide a margin of safety before low part can overflow + // the 12 bits of the load/store offset. Check if CVal falls within + // that margin; if so (low part + CVal) can't overflow. 
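// [Editorial worked example, not part of the patch] Why alignment gives a
// safety margin: if the global is 256-byte aligned, %lo(sym) is a multiple
// of 256 within [-2048, 2047], so it is at most 1792, and adding any
// CVal < 256 still fits the signed 12-bit immediate:
static_assert(1792 + 255 <= 2047,
              "lo part plus in-margin offset cannot overflow simm12");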
+          const DataLayout &DL = CurDAG->getDataLayout();
+          Align Alignment = commonAlignment(
+              GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
+          if (CVal == 0 || Alignment > CVal) {
+            int64_t CombinedOffset = CVal + GA->getOffset();
+            Base = Base.getOperand(0);
+            Offset = CurDAG->getTargetGlobalAddress(
+                GA->getGlobal(), SDLoc(LoOperand), LoOperand.getValueType(),
+                CombinedOffset, GA->getTargetFlags());
+            return true;
+          }
+        }
+      }
+
+      if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
+      Offset = CurDAG->getTargetConstant(CVal, DL, VT);
+      return true;
+    }
+  }
+
+  // Handle ADD with large immediates.
+  if (Addr.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Addr.getOperand(1))) {
+    int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    assert(!isInt<12>(CVal) && "simm12 not already handled?");
+
+    if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) {
+      // We can use an ADDI for part of the offset and fold the rest into the
+      // load/store. This mirrors the AddiPair PatFrag in RISCVInstrInfo.td.
+      int64_t Adj = CVal < 0 ? -2048 : 2047;
+      Base = SDValue(
+          CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
+                                 CurDAG->getTargetConstant(Adj, DL, VT)),
+          0);
+      Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT);
+      return true;
+    }
+  }
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, VT);
+  return true;
+}
+
 bool RISCVDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
                                         SDValue &ShAmt) {
   // Shift instructions on RISCV only read the lower 5 or 6 bits of the shift
@@ -1723,6 +1990,21 @@ bool RISCVDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
       ShAmt = N.getOperand(0);
       return true;
     }
+  } else if (N.getOpcode() == ISD::SUB &&
+             isa<ConstantSDNode>(N.getOperand(0))) {
+    uint64_t Imm = N.getConstantOperandVal(0);
+    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
+    // generate a NEG instead of a SUB of a constant.
+    if (Imm != 0 && Imm % ShiftWidth == 0) {
+      SDLoc DL(N);
+      EVT VT = N.getValueType();
+      SDValue Zero = CurDAG->getRegister(RISCV::X0, VT);
+      unsigned NegOpc = VT == MVT::i64 ? RISCV::SUBW : RISCV::SUB;
+      MachineSDNode *Neg = CurDAG->getMachineNode(NegOpc, DL, VT, Zero,
+                                                  N.getOperand(1));
+      ShAmt = SDValue(Neg, 0);
+      return true;
+    }
   }
 
   ShAmt = N;
@@ -1778,6 +2060,8 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
           Node->getOpcode() == ISD::MUL || Node->getOpcode() == ISD::SHL ||
           Node->getOpcode() == ISD::SRL ||
           Node->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+          Node->getOpcode() == RISCVISD::GREV ||
+          Node->getOpcode() == RISCVISD::GORC ||
           isa<ConstantSDNode>(Node)) &&
          "Unexpected opcode");
 
@@ -1812,6 +2096,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
     case RISCV::CTZW:
     case RISCV::CPOPW:
     case RISCV::SLLI_UW:
+    case RISCV::FMV_W_X:
     case RISCV::FCVT_H_W:
     case RISCV::FCVT_H_WU:
     case RISCV::FCVT_S_W:
@@ -1835,6 +2120,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
         return false;
       break;
     case RISCV::SEXT_H:
+    case RISCV::FMV_H_X:
     case RISCV::ZEXT_H_RV32:
     case RISCV::ZEXT_H_RV64:
       if (Bits < 16)
@@ -1871,22 +2157,32 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
 // allows us to choose between VSETIVLI or VSETVLI later.
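// Editor's sketch (illustrative, not part of the patch): the shift-amount
// rewrite in selectShiftMask above is sound because shifts only read the low
// log2(width) bits of the amount, so when Imm % width == 0 the amount
// (Imm - X) is congruent to -X modulo the width and a NEG can replace the SUB.
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t Width = 32; // a 32-bit RISC-V shift reads 5 amount bits
  for (uint32_t Imm : {32u, 64u, 96u}) // any multiple of the width
    for (uint32_t X = 0; X < 256; ++X)
      assert(((Imm - X) & (Width - 1)) == ((0u - X) & (Width - 1)));
  return 0;
}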
 bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
   auto *C = dyn_cast<ConstantSDNode>(N);
-  if (C && (isUInt<5>(C->getZExtValue()) ||
-            C->getSExtValue() == RISCV::VLMaxSentinel))
+  if (C && isUInt<5>(C->getZExtValue())) {
     VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
                                    N->getValueType(0));
-  else
+  } else if (C && C->isAllOnesValue()) {
+    // Treat all ones as VLMax.
+    VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, SDLoc(N),
+                                   N->getValueType(0));
+  } else if (isa<RegisterSDNode>(N) &&
+             cast<RegisterSDNode>(N)->getReg() == RISCV::X0) {
+    // All our VL operands use an operand that allows GPRNoX0 or an immediate
+    // as the register class. Convert X0 to a special immediate to pass the
+    // MachineVerifier. This is recognized specially by the vsetvli insertion
+    // pass.
+    VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, SDLoc(N),
+                                   N->getValueType(0));
+  } else {
     VL = N;
+  }
 
   return true;
 }
 
 bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) {
-  if (N.getOpcode() != ISD::SPLAT_VECTOR &&
-      N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
-      N.getOpcode() != RISCVISD::VMV_V_X_VL)
+  if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef())
     return false;
-  SplatVal = N.getOperand(0);
+  SplatVal = N.getOperand(1);
   return true;
 }
 
@@ -1896,23 +2192,22 @@ static bool selectVSplatSimmHelper(SDValue N, SDValue &SplatVal,
                                    SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget,
                                    ValidateFn ValidateImm) {
-  if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
-       N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
-       N.getOpcode() != RISCVISD::VMV_V_X_VL) ||
-      !isa<ConstantSDNode>(N.getOperand(0)))
+  if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef() ||
+      !isa<ConstantSDNode>(N.getOperand(1)))
     return false;
 
-  int64_t SplatImm = cast<ConstantSDNode>(N.getOperand(0))->getSExtValue();
+  int64_t SplatImm =
+      cast<ConstantSDNode>(N.getOperand(1))->getSExtValue();
 
-  // ISD::SPLAT_VECTOR, RISCVISD::SPLAT_VECTOR_I64 and RISCVISD::VMV_V_X_VL
-  // share semantics when the operand type is wider than the resulting vector
-  // element type: an implicit truncation first takes place. Therefore, perform
-  // a manual truncation/sign-extension in order to ignore any truncated bits
-  // and catch any zero-extended immediate.
+  // The semantics of RISCVISD::VMV_V_X_VL are that when the operand
+  // type is wider than the resulting vector element type, an implicit
+  // truncation first takes place. Therefore, perform a manual
+  // truncation/sign-extension in order to ignore any truncated bits and catch
+  // any zero-extended immediate.
   // For example, we wish to match (i8 -1) -> (XLenVT 255) as a simm5 by first
   // sign-extending to (XLenVT -1).
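// Editor's sketch (illustrative, not part of the patch) of the manual
// truncation/sign-extension described above: a zero-extended immediate such
// as (i8 -1), seen as the XLenVT constant 255, is truncated to the element
// width and sign-extended back, after which it matches the canonical -1.
// (Arithmetic right shift of a negative value is assumed here; C++20
// guarantees it, and mainstream compilers have always behaved this way.)
#include <cassert>
#include <cstdint>
static int64_t truncSext(int64_t Imm, unsigned EltBits) {
  return static_cast<int64_t>(static_cast<uint64_t>(Imm) << (64 - EltBits)) >>
         (64 - EltBits);
}
int main() {
  assert(truncSext(255, 8) == -1);   // (i8 -1) arriving as XLenVT 255
  assert(truncSext(-1, 8) == -1);    // the sign-extended form is stable
  assert(truncSext(0x7F, 8) == 127); // positive in-range values are unchanged
  return 0;
}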
   MVT XLenVT = Subtarget.getXLenVT();
-  assert(XLenVT == N.getOperand(0).getSimpleValueType() &&
+  assert(XLenVT == N.getOperand(1).getSimpleValueType() &&
          "Unexpected splat operand type");
   MVT EltVT = N.getSimpleValueType().getVectorElementType();
   if (EltVT.bitsLT(XLenVT))
@@ -1945,13 +2240,12 @@ bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NonZero(SDValue N,
 }
 
 bool RISCVDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &SplatVal) {
-  if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
-       N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64 &&
-       N.getOpcode() != RISCVISD::VMV_V_X_VL) ||
-      !isa<ConstantSDNode>(N.getOperand(0)))
+  if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef() ||
+      !isa<ConstantSDNode>(N.getOperand(1)))
     return false;
 
-  int64_t SplatImm = cast<ConstantSDNode>(N.getOperand(0))->getSExtValue();
+  int64_t SplatImm =
+      cast<ConstantSDNode>(N.getOperand(1))->getSExtValue();
 
   if (!isUInt<5>(SplatImm))
     return false;
@@ -1980,49 +2274,42 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
 // Merge an ADDI into the offset of a load/store instruction where possible.
 // (load (addi base, off1), off2) -> (load base, off1+off2)
 // (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
+// (load (add base, (addi src, off1)), off2)
+//    -> (load (add base, src), off1+off2)
+// (store val, (add base, (addi src, off1)), off2)
+//    -> (store val, (add base, src), off1+off2)
 // This is possible when off1+off2 fits a 12-bit immediate.
 bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
-  int OffsetOpIdx;
-  int BaseOpIdx;
-
-  // Only attempt this optimisation for I-type loads and S-type stores.
-  switch (N->getMachineOpcode()) {
-  default:
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
     return false;
-  case RISCV::LB:
-  case RISCV::LH:
-  case RISCV::LW:
-  case RISCV::LBU:
-  case RISCV::LHU:
-  case RISCV::LWU:
-  case RISCV::LD:
-  case RISCV::FLH:
-  case RISCV::FLW:
-  case RISCV::FLD:
-    BaseOpIdx = 0;
-    OffsetOpIdx = 1;
-    break;
-  case RISCV::SB:
-  case RISCV::SH:
-  case RISCV::SW:
-  case RISCV::SD:
-  case RISCV::FSH:
-  case RISCV::FSW:
-  case RISCV::FSD:
-    BaseOpIdx = 1;
-    OffsetOpIdx = 2;
-    break;
-  }
 
   if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
     return false;
 
   SDValue Base = N->getOperand(BaseOpIdx);
 
-  // If the base is an ADDI, we can merge it into the load/store.
-  if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
+  if (!Base.isMachineOpcode())
     return false;
 
+  if (Base.getMachineOpcode() == RISCV::ADDI) {
+    // If the base is an ADDI, we can merge it into the load/store.
+  } else if (Base.getMachineOpcode() == RISCV::ADDIW &&
+             isa<ConstantSDNode>(Base.getOperand(1)) &&
+             Base.getOperand(0).isMachineOpcode() &&
+             Base.getOperand(0).getMachineOpcode() == RISCV::LUI &&
+             isa<ConstantSDNode>(Base.getOperand(0).getOperand(0))) {
+    // ADDIW can be merged if it's part of LUI+ADDIW constant materialization
+    // and LUI+ADDI would have produced the same result. This is true for all
+    // simm32 values except 0x7ffff800-0x7fffffff.
+    int64_t Offset =
+        SignExtend64<32>(Base.getOperand(0).getConstantOperandVal(0) << 12);
+    Offset += cast<ConstantSDNode>(Base.getOperand(1))->getSExtValue();
+    if (!isInt<32>(Offset))
+      return false;
+  } else
+    return false;
+
   SDValue ImmOperand = Base.getOperand(1);
   uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
 
@@ -2039,7 +2326,8 @@ bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
   // to provide a margin of safety before off1 can overflow the 12 bits.
   // Check if off2 falls within that margin; if so off1+off2 can't overflow.
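// Editor's sketch (illustrative, not part of the patch) of the LUI+ADDIW vs
// LUI+ADDI equivalence used by the ADDIW branch above: ADDIW truncates the
// sum to 32 bits and sign-extends, while ADDI adds in 64 bits; the two agree
// exactly when the 64-bit sum already fits in int32, which is what the
// peephole's isInt<32>(Offset) check verifies.
#include <cassert>
#include <cstdint>
static int64_t sext32(uint64_t V) {
  return static_cast<int32_t>(static_cast<uint32_t>(V));
}
int main() {
  for (uint64_t Hi20 = 0; Hi20 < (1u << 20); Hi20 += 257) { // sampled LUI imms
    int64_t Lui = sext32(Hi20 << 12);
    for (int64_t Imm = -2048; Imm <= 2047; Imm += 129) {
      int64_t Addi = Lui + Imm;                            // 64-bit add
      int64_t Addiw = sext32(static_cast<uint64_t>(Addi)); // 32-bit add + sext
      assert((Addi == Addiw) == (Addi >= INT32_MIN && Addi <= INT32_MAX));
    }
  }
  return 0;
}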
const DataLayout &DL = CurDAG->getDataLayout(); - Align Alignment = GA->getGlobal()->getPointerAlignment(DL); + Align Alignment = commonAlignment(GA->getGlobal()->getPointerAlignment(DL), + GA->getOffset()); if (Offset2 != 0 && Alignment <= Offset2) return false; int64_t Offset1 = GA->getOffset(); @@ -2049,7 +2337,7 @@ bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) { CombinedOffset, GA->getTargetFlags()); } else if (auto *CP = dyn_cast(ImmOperand)) { // Ditto. - Align Alignment = CP->getAlign(); + Align Alignment = commonAlignment(CP->getAlign(), CP->getOffset()); if (Offset2 != 0 && Alignment <= Offset2) return false; int64_t Offset1 = CP->getOffset(); @@ -2068,12 +2356,13 @@ bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) { LLVM_DEBUG(dbgs() << "\n"); // Modify the offset operand of the load/store. - if (BaseOpIdx == 0) // Load - CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand, - N->getOperand(2)); - else // Store - CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0), - ImmOperand, N->getOperand(3)); + if (BaseOpIdx == 0) { // Load + N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand, + N->getOperand(2)); + } else { // Store + N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0), + ImmOperand, N->getOperand(3)); + } return true; } @@ -2130,6 +2419,8 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) { case RISCV::SUBW: case RISCV::MULW: case RISCV::SLLIW: + case RISCV::GREVIW: + case RISCV::GORCIW: // Result is already sign extended just remove the sext.w. // NOTE: We only handle the nodes that are selected with hasAllWUsers. ReplaceUses(N, N0.getNode()); @@ -2139,8 +2430,113 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) { return false; } +// Optimize masked RVV pseudo instructions with a known all-ones mask to their +// corresponding "unmasked" pseudo versions. The mask we're interested in will +// take the form of a V0 physical register operand, with a glued +// register-setting instruction. +bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) { + const RISCV::RISCVMaskedPseudoInfo *I = + RISCV::getMaskedPseudoInfo(N->getMachineOpcode()); + if (!I) + return false; + + unsigned MaskOpIdx = I->MaskOpIdx; + + // Check that we're using V0 as a mask register. + if (!isa(N->getOperand(MaskOpIdx)) || + cast(N->getOperand(MaskOpIdx))->getReg() != RISCV::V0) + return false; + + // The glued user defines V0. + const auto *Glued = N->getGluedNode(); + + if (!Glued || Glued->getOpcode() != ISD::CopyToReg) + return false; + + // Check that we're defining V0 as a mask register. + if (!isa(Glued->getOperand(1)) || + cast(Glued->getOperand(1))->getReg() != RISCV::V0) + return false; + + // Check the instruction defining V0; it needs to be a VMSET pseudo. + SDValue MaskSetter = Glued->getOperand(2); + + const auto IsVMSet = [](unsigned Opc) { + return Opc == RISCV::PseudoVMSET_M_B1 || Opc == RISCV::PseudoVMSET_M_B16 || + Opc == RISCV::PseudoVMSET_M_B2 || Opc == RISCV::PseudoVMSET_M_B32 || + Opc == RISCV::PseudoVMSET_M_B4 || Opc == RISCV::PseudoVMSET_M_B64 || + Opc == RISCV::PseudoVMSET_M_B8; + }; + + // TODO: Check that the VMSET is the expected bitwidth? The pseudo has + // undefined behaviour if it's the wrong bitwidth, so we could choose to + // assume that it's all-ones? Same applies to its VL. + if (!MaskSetter->isMachineOpcode() || !IsVMSet(MaskSetter.getMachineOpcode())) + return false; + + // Retrieve the tail policy operand index, if any. 
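// Editor's sketch (illustrative, not part of the patch; toy types, not the
// LLVM API) of the index walk-back that follows: the policy operand is the
// last operand unless a Chain and/or Glue operand trails it, so the index
// steps back over those.
#include <cassert>
#include <vector>
enum OpKind { Value, Chain, Glue };
static unsigned policyOpIdx(const std::vector<OpKind> &Ops) {
  unsigned I = Ops.size() - 1;
  if (Ops[I] == Glue)
    --I; // a glued input, if present, is last
  if (Ops[I] == Chain)
    --I; // then a chain, if present
  return I;
}
int main() {
  assert(policyOpIdx({Value, Value}) == 1);
  assert(policyOpIdx({Value, Value, Chain}) == 1);
  assert(policyOpIdx({Value, Value, Chain, Glue}) == 1);
  return 0;
}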
+  Optional<unsigned> TailPolicyOpIdx;
+  const RISCVInstrInfo &TII = *Subtarget->getInstrInfo();
+  const MCInstrDesc &MaskedMCID = TII.get(N->getMachineOpcode());
+
+  bool IsTA = true;
+  if (RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags)) {
+    // The last operand of the pseudo is the policy op, but we might have a
+    // Glue operand last. We might also have a chain.
+    TailPolicyOpIdx = N->getNumOperands() - 1;
+    if (N->getOperand(*TailPolicyOpIdx).getValueType() == MVT::Glue)
+      (*TailPolicyOpIdx)--;
+    if (N->getOperand(*TailPolicyOpIdx).getValueType() == MVT::Other)
+      (*TailPolicyOpIdx)--;
+
+    if (!(N->getConstantOperandVal(*TailPolicyOpIdx) &
+          RISCVII::TAIL_AGNOSTIC)) {
+      // Keep the true-masked instruction when there is no unmasked TU
+      // instruction.
+      if (I->UnmaskedTUPseudo == I->MaskedPseudo && !N->getOperand(0).isUndef())
+        return false;
+      // We can't use TA if the tie-operand is not IMPLICIT_DEF.
+      if (!N->getOperand(0).isUndef())
+        IsTA = false;
+    }
+  }
+
+  unsigned Opc = IsTA ? I->UnmaskedPseudo : I->UnmaskedTUPseudo;
+
+  // Check that we're dropping the mask operand and any policy operand
+  // when we transform to this unmasked pseudo. Additionally, if this
+  // instruction is tail agnostic, the unmasked instruction should not have a
+  // merge op.
+  uint64_t TSFlags = TII.get(Opc).TSFlags;
+  assert((IsTA != RISCVII::hasMergeOp(TSFlags)) &&
+         RISCVII::hasDummyMaskOp(TSFlags) &&
+         !RISCVII::hasVecPolicyOp(TSFlags) &&
+         "Unexpected pseudo to transform to");
+  (void)TSFlags;
+
+  SmallVector<SDValue, 8> Ops;
+  // Skip the merge operand at index 0 if IsTA.
+  for (unsigned I = IsTA, E = N->getNumOperands(); I != E; I++) {
+    // Skip the mask, the policy, and the Glue.
+    SDValue Op = N->getOperand(I);
+    if (I == MaskOpIdx || I == TailPolicyOpIdx ||
+        Op.getValueType() == MVT::Glue)
+      continue;
+    Ops.push_back(Op);
+  }
+
+  // Transitively apply any node glued to our new node.
+  if (auto *TGlued = Glued->getGluedNode())
+    Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1));
+
+  SDNode *Result = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+  ReplaceUses(N, Result);
+
+  return true;
+}
+
 // This pass converts a legalized DAG into a RISCV-specific DAG, ready
 // for instruction scheduling.
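// Editor's sketch (illustrative, not part of the patch; toy strings, not the
// LLVM API) of the operand rewrite in doPeepholeMaskedRVV above: the unmasked
// pseudo's operand list is the masked one minus the merge operand (when tail
// agnostic), the V0 mask, the policy immediate, and any trailing glue.
#include <cassert>
#include <string>
#include <vector>
static std::vector<std::string>
unmaskedOperands(const std::vector<std::string> &Ops, bool IsTA,
                 unsigned MaskOpIdx, unsigned PolicyOpIdx) {
  std::vector<std::string> Out;
  for (unsigned I = IsTA ? 1 : 0, E = Ops.size(); I != E; ++I) {
    if (I == MaskOpIdx || I == PolicyOpIdx || Ops[I] == "glue")
      continue; // dropped when converting to the unmasked form
    Out.push_back(Ops[I]);
  }
  return Out;
}
int main() {
  // {merge, src, v0, vl, sew, policy} -> tail-agnostic drops merge/v0/policy.
  std::vector<std::string> In{"merge", "src", "v0", "vl", "sew", "policy"};
  std::vector<std::string> Want{"src", "vl", "sew"};
  assert(unmaskedOperands(In, /*IsTA=*/true, 2, 5) == Want);
  return 0;
}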
-FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) { - return new RISCVDAGToDAGISel(TM); +FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new RISCVDAGToDAGISel(TM, OptLevel); } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index c429a9298739..b50927cfcca5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -24,8 +24,9 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { const RISCVSubtarget *Subtarget = nullptr; public: - explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine) - : SelectionDAGISel(TargetMachine) {} + explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(TargetMachine, OptLevel) {} StringRef getPassName() const override { return "RISCV DAG->DAG Pattern Instruction Selection"; @@ -44,8 +45,10 @@ public: bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; - bool SelectAddrFI(SDValue Addr, SDValue &Base); + bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectBaseAddr(SDValue Addr, SDValue &Base); + bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); bool selectShiftMaskXLen(SDValue N, SDValue &ShAmt) { @@ -117,12 +120,14 @@ public: private: bool doPeepholeLoadStoreADDI(SDNode *Node); bool doPeepholeSExtW(SDNode *Node); + bool doPeepholeMaskedRVV(SDNode *Node); }; namespace RISCV { struct VLSEGPseudo { uint16_t NF : 4; uint16_t Masked : 1; + uint16_t IsTU : 1; uint16_t Strided : 1; uint16_t FF : 1; uint16_t Log2SEW : 3; @@ -133,6 +138,7 @@ struct VLSEGPseudo { struct VLXSEGPseudo { uint16_t NF : 4; uint16_t Masked : 1; + uint16_t IsTU : 1; uint16_t Ordered : 1; uint16_t Log2SEW : 3; uint16_t LMUL : 3; @@ -187,6 +193,13 @@ struct VLX_VSXPseudo { uint16_t Pseudo; }; +struct RISCVMaskedPseudoInfo { + uint16_t MaskedPseudo; + uint16_t UnmaskedPseudo; + uint16_t UnmaskedTUPseudo; + uint8_t MaskOpIdx; +}; + #define GET_RISCVVSSEGTable_DECL #define GET_RISCVVLSEGTable_DECL #define GET_RISCVVLXSEGTable_DECL @@ -195,6 +208,7 @@ struct VLX_VSXPseudo { #define GET_RISCVVSETable_DECL #define GET_RISCVVLXTable_DECL #define GET_RISCVVSXTable_DECL +#define GET_RISCVMaskedPseudosTable_DECL #include "RISCVGenSearchableTables.inc" } // namespace RISCV diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 97d24c8e9c0b..ff645dea4e7a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -112,17 +112,24 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasVInstructions()) { auto addRegClassForRVV = [this](MVT VT) { + // Disable the smallest fractional LMUL types if ELEN is less than + // RVVBitsPerBlock. 
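// Editor's sketch (illustrative, not part of the patch) of the ELEN gate
// described above plus the size-to-register-class mapping that follows, using
// RVVBitsPerBlock = 64 (upstream's value; treated as an assumption here).
#include <cassert>
enum RegClass { None, VR, VRM2, VRM4, VRM8 };
static RegClass classFor(unsigned MinNumElts, unsigned EltBits, unsigned ELEN) {
  const unsigned Block = 64; // RVVBitsPerBlock
  if (MinNumElts < Block / ELEN)
    return None;                        // smallest fractional LMUL disabled
  unsigned Size = MinNumElts * EltBits; // known-minimum size in bits
  if (Size <= Block) return VR;         // LMUL <= 1 (incl. fractional)
  if (Size == 2 * Block) return VRM2;   // LMUL == 2
  if (Size == 4 * Block) return VRM4;   // LMUL == 4
  return VRM8;                          // LMUL == 8, the only remaining size
}
int main() {
  assert(classFor(1, 8, 64) == VR);   // nxv1i8 (LMUL=1/8) legal at ELEN=64
  assert(classFor(1, 8, 32) == None); // ...but disabled when ELEN=32
  assert(classFor(16, 8, 64) == VRM2); // nxv16i8 -> 128 bits -> VRM2
  assert(classFor(8, 64, 64) == VRM8); // nxv8i64 -> 512 bits -> VRM8
  return 0;
}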
+ unsigned MinElts = RISCV::RVVBitsPerBlock / Subtarget.getELEN(); + if (VT.getVectorMinNumElements() < MinElts) + return; + unsigned Size = VT.getSizeInBits().getKnownMinValue(); - assert(Size <= 512 && isPowerOf2_32(Size)); const TargetRegisterClass *RC; - if (Size <= 64) + if (Size <= RISCV::RVVBitsPerBlock) RC = &RISCV::VRRegClass; - else if (Size == 128) + else if (Size == 2 * RISCV::RVVBitsPerBlock) RC = &RISCV::VRM2RegClass; - else if (Size == 256) + else if (Size == 4 * RISCV::RVVBitsPerBlock) RC = &RISCV::VRM4RegClass; - else + else if (Size == 8 * RISCV::RVVBitsPerBlock) RC = &RISCV::VRM8RegClass; + else + llvm_unreachable("Unexpected size"); addRegisterClass(VT, RC); }; @@ -170,8 +177,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setStackPointerRegisterToSaveRestore(RISCV::X2); - for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) - setLoadExtAction(N, XLenVT, MVT::i1, Promote); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, XLenVT, + MVT::i1, Promote); // TODO: add all necessary setOperationAction calls. setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand); @@ -181,100 +188,75 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::SELECT_CC, XLenVT, Expand); - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction({ISD::STACKSAVE, ISD::STACKRESTORE}, MVT::Other, Expand); setOperationAction(ISD::VASTART, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::Other, Expand); - setOperationAction(ISD::VACOPY, MVT::Other, Expand); - setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction({ISD::VAARG, ISD::VACOPY, ISD::VAEND}, MVT::Other, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!Subtarget.hasStdExtZbb()) { - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - } + + setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); + + if (!Subtarget.hasStdExtZbb()) + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); if (Subtarget.is64Bit()) { - setOperationAction(ISD::ADD, MVT::i32, Custom); - setOperationAction(ISD::SUB, MVT::i32, Custom); - setOperationAction(ISD::SHL, MVT::i32, Custom); - setOperationAction(ISD::SRA, MVT::i32, Custom); - setOperationAction(ISD::SRL, MVT::i32, Custom); - - setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::UADDSAT, MVT::i32, Custom); - setOperationAction(ISD::USUBSAT, MVT::i32, Custom); + setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); + + setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL, ISD::SRA, ISD::SRL}, + MVT::i32, Custom); + + setOperationAction({ISD::UADDO, ISD::USUBO, ISD::UADDSAT, ISD::USUBSAT}, + MVT::i32, Custom); } else { - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName( + {RTLIB::SHL_I128, RTLIB::SRL_I128, RTLIB::SRA_I128, RTLIB::MUL_I128}, + nullptr); setLibcallName(RTLIB::MULO_I64, nullptr); } if (!Subtarget.hasStdExtM()) { - setOperationAction(ISD::MUL, XLenVT, Expand); - setOperationAction(ISD::MULHS, XLenVT, Expand); - setOperationAction(ISD::MULHU, XLenVT, Expand); - setOperationAction(ISD::SDIV, XLenVT, Expand); - 
setOperationAction(ISD::UDIV, XLenVT, Expand); - setOperationAction(ISD::SREM, XLenVT, Expand); - setOperationAction(ISD::UREM, XLenVT, Expand); + setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM}, + XLenVT, Expand); } else { if (Subtarget.is64Bit()) { - setOperationAction(ISD::MUL, MVT::i32, Custom); - setOperationAction(ISD::MUL, MVT::i128, Custom); - - setOperationAction(ISD::SDIV, MVT::i8, Custom); - setOperationAction(ISD::UDIV, MVT::i8, Custom); - setOperationAction(ISD::UREM, MVT::i8, Custom); - setOperationAction(ISD::SDIV, MVT::i16, Custom); - setOperationAction(ISD::UDIV, MVT::i16, Custom); - setOperationAction(ISD::UREM, MVT::i16, Custom); - setOperationAction(ISD::SDIV, MVT::i32, Custom); - setOperationAction(ISD::UDIV, MVT::i32, Custom); - setOperationAction(ISD::UREM, MVT::i32, Custom); + setOperationAction(ISD::MUL, {MVT::i32, MVT::i128}, Custom); + + setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM}, + {MVT::i8, MVT::i16, MVT::i32}, Custom); } else { setOperationAction(ISD::MUL, MVT::i64, Custom); } } - setOperationAction(ISD::SDIVREM, XLenVT, Expand); - setOperationAction(ISD::UDIVREM, XLenVT, Expand); - setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand); - setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand); + setOperationAction( + {ISD::SDIVREM, ISD::UDIVREM, ISD::SMUL_LOHI, ISD::UMUL_LOHI}, XLenVT, + Expand); - setOperationAction(ISD::SHL_PARTS, XLenVT, Custom); - setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); - setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); + setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, XLenVT, + Custom); if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp() || Subtarget.hasStdExtZbkb()) { - if (Subtarget.is64Bit()) { - setOperationAction(ISD::ROTL, MVT::i32, Custom); - setOperationAction(ISD::ROTR, MVT::i32, Custom); - } + if (Subtarget.is64Bit()) + setOperationAction({ISD::ROTL, ISD::ROTR}, MVT::i32, Custom); } else { - setOperationAction(ISD::ROTL, XLenVT, Expand); - setOperationAction(ISD::ROTR, XLenVT, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Expand); } if (Subtarget.hasStdExtZbp()) { // Custom lower bswap/bitreverse so we can convert them to GREVI to enable // more combining. - setOperationAction(ISD::BITREVERSE, XLenVT, Custom); - setOperationAction(ISD::BSWAP, XLenVT, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); + setOperationAction({ISD::BITREVERSE, ISD::BSWAP}, XLenVT, Custom); + // BSWAP i8 doesn't exist. - setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); - setOperationAction(ISD::BSWAP, MVT::i16, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); - if (Subtarget.is64Bit()) { - setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); - setOperationAction(ISD::BSWAP, MVT::i32, Custom); - } + setOperationAction({ISD::BITREVERSE, ISD::BSWAP}, MVT::i16, Custom); + + if (Subtarget.is64Bit()) + setOperationAction({ISD::BITREVERSE, ISD::BSWAP}, MVT::i32, Custom); } else { // With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll // pattern match it directly in isel. 
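// Editor's sketch (illustrative, not part of the patch) of the GREV
// (generalized reverse) operation that the Zbp comments above say bswap and
// bitreverse are converted to: each set bit in the control swaps adjacent bit
// groups of that size (reference semantics from the draft Zbp specification).
#include <cassert>
#include <cstdint>
static uint32_t grev32(uint32_t X, unsigned K) {
  if (K & 1)  X = ((X & 0x55555555u) << 1)  | ((X & 0xAAAAAAAAu) >> 1);
  if (K & 2)  X = ((X & 0x33333333u) << 2)  | ((X & 0xCCCCCCCCu) >> 2);
  if (K & 4)  X = ((X & 0x0F0F0F0Fu) << 4)  | ((X & 0xF0F0F0F0u) >> 4);
  if (K & 8)  X = ((X & 0x00FF00FFu) << 8)  | ((X & 0xFF00FF00u) >> 8);
  if (K & 16) X = ((X & 0x0000FFFFu) << 16) | ((X & 0xFFFF0000u) >> 16);
  return X;
}
int main() {
  assert(grev32(0x12345678u, 24) == 0x78563412u); // grevi x, 24 == bswap
  assert(grev32(0x00000001u, 31) == 0x80000000u); // grevi x, 31 == bitreverse
  return 0;
}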
@@ -288,36 +270,38 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.hasStdExtZbb()) { - setOperationAction(ISD::SMIN, XLenVT, Legal); - setOperationAction(ISD::SMAX, XLenVT, Legal); - setOperationAction(ISD::UMIN, XLenVT, Legal); - setOperationAction(ISD::UMAX, XLenVT, Legal); + setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT, + Legal); - if (Subtarget.is64Bit()) { - setOperationAction(ISD::CTTZ, MVT::i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); - setOperationAction(ISD::CTLZ, MVT::i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - } + if (Subtarget.is64Bit()) + setOperationAction( + {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, + MVT::i32, Custom); } else { - setOperationAction(ISD::CTTZ, XLenVT, Expand); - setOperationAction(ISD::CTLZ, XLenVT, Expand); - setOperationAction(ISD::CTPOP, XLenVT, Expand); + setOperationAction({ISD::CTTZ, ISD::CTLZ, ISD::CTPOP}, XLenVT, Expand); + + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS, MVT::i32, Custom); } if (Subtarget.hasStdExtZbt()) { - setOperationAction(ISD::FSHL, XLenVT, Custom); - setOperationAction(ISD::FSHR, XLenVT, Custom); + setOperationAction({ISD::FSHL, ISD::FSHR}, XLenVT, Custom); setOperationAction(ISD::SELECT, XLenVT, Legal); - if (Subtarget.is64Bit()) { - setOperationAction(ISD::FSHL, MVT::i32, Custom); - setOperationAction(ISD::FSHR, MVT::i32, Custom); - } + if (Subtarget.is64Bit()) + setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Custom); } else { setOperationAction(ISD::SELECT, XLenVT, Custom); } + static constexpr ISD::NodeType FPLegalNodeTypes[] = { + ISD::FMINNUM, ISD::FMAXNUM, ISD::LRINT, + ISD::LLRINT, ISD::LROUND, ISD::LLROUND, + ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND, + ISD::STRICT_LLROUND, ISD::STRICT_FMA, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, + ISD::STRICT_FSQRT, ISD::STRICT_FSETCC, ISD::STRICT_FSETCCS}; + static const ISD::CondCode FPCCToExpand[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, @@ -331,50 +315,21 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); if (Subtarget.hasStdExtZfh()) { - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::LRINT, MVT::f16, Legal); - setOperationAction(ISD::LLRINT, MVT::f16, Legal); - setOperationAction(ISD::LROUND, MVT::f16, Legal); - setOperationAction(ISD::LLROUND, MVT::f16, Legal); - setOperationAction(ISD::STRICT_LRINT, MVT::f16, Legal); - setOperationAction(ISD::STRICT_LLRINT, MVT::f16, Legal); - setOperationAction(ISD::STRICT_LROUND, MVT::f16, Legal); - setOperationAction(ISD::STRICT_LLROUND, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::f16, Legal); + for (auto NT : FPLegalNodeTypes) + setOperationAction(NT, MVT::f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FSETCCS, 
MVT::f16, Legal); - for (auto CC : FPCCToExpand) - setCondCodeAction(CC, MVT::f16, Expand); + setCondCodeAction(FPCCToExpand, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Expand); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction({ISD::FREM, ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, + ISD::FPOW, ISD::FPOWI, ISD::FCOS, ISD::FSIN, + ISD::FSINCOS, ISD::FEXP, ISD::FEXP2, ISD::FLOG, + ISD::FLOG2, ISD::FLOG10}, + MVT::f16, Promote); // FIXME: Need to promote f16 STRICT_* to f32 libcalls, but we don't have // complete support for all operations in LegalizeDAG. @@ -385,26 +340,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.hasStdExtF()) { - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - setOperationAction(ISD::LRINT, MVT::f32, Legal); - setOperationAction(ISD::LLRINT, MVT::f32, Legal); - setOperationAction(ISD::LROUND, MVT::f32, Legal); - setOperationAction(ISD::LLROUND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_LRINT, MVT::f32, Legal); - setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Legal); - setOperationAction(ISD::STRICT_LROUND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal); - for (auto CC : FPCCToExpand) - setCondCodeAction(CC, MVT::f32, Expand); + for (auto NT : FPLegalNodeTypes) + setOperationAction(NT, MVT::f32, Legal); + setCondCodeAction(FPCCToExpand, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Expand); @@ -418,28 +356,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Custom); if (Subtarget.hasStdExtD()) { - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - setOperationAction(ISD::LRINT, MVT::f64, Legal); - setOperationAction(ISD::LLRINT, MVT::f64, Legal); - 
setOperationAction(ISD::LROUND, MVT::f64, Legal); - setOperationAction(ISD::LLROUND, MVT::f64, Legal); - setOperationAction(ISD::STRICT_LRINT, MVT::f64, Legal); - setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Legal); - setOperationAction(ISD::STRICT_LROUND, MVT::f64, Legal); - setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + for (auto NT : FPLegalNodeTypes) + setOperationAction(NT, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal); - for (auto CC : FPCCToExpand) - setCondCodeAction(CC, MVT::f64, Expand); + setCondCodeAction(FPCCToExpand, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -451,40 +372,38 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f64, MVT::f16, Expand); } - if (Subtarget.is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); - } + if (Subtarget.is64Bit()) + setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT, + ISD::STRICT_FP_TO_UINT, ISD::STRICT_FP_TO_SINT}, + MVT::i32, Custom); if (Subtarget.hasStdExtF()) { - setOperationAction(ISD::FP_TO_UINT_SAT, XLenVT, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, XLenVT, Custom); + setOperationAction({ISD::FP_TO_UINT_SAT, ISD::FP_TO_SINT_SAT}, XLenVT, + Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, XLenVT, Legal); - setOperationAction(ISD::STRICT_FP_TO_SINT, XLenVT, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, XLenVT, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, XLenVT, Legal); + setOperationAction({ISD::STRICT_FP_TO_UINT, ISD::STRICT_FP_TO_SINT, + ISD::STRICT_UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, + XLenVT, Legal); setOperationAction(ISD::FLT_ROUNDS_, XLenVT, Custom); setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); } - setOperationAction(ISD::GlobalAddress, XLenVT, Custom); - setOperationAction(ISD::BlockAddress, XLenVT, Custom); - setOperationAction(ISD::ConstantPool, XLenVT, Custom); - setOperationAction(ISD::JumpTable, XLenVT, Custom); + setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, + ISD::JumpTable}, + XLenVT, Custom); setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::Constant, MVT::i64, Custom); + // TODO: On M-mode only targets, the cycle[h] CSR may not be present. // Unfortunately this can't be determined just from the ISA naming string. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Subtarget.is64Bit() ? 
Legal : Custom); - setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom); @@ -505,19 +424,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // RVV intrinsics may have illegal operands. // We also need to custom legalize vmv.x.s. - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - if (Subtarget.is64Bit()) { + setOperationAction({ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN}, + {MVT::i8, MVT::i16}, Custom); + if (Subtarget.is64Bit()) setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom); - } else { - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); - } + else + setOperationAction({ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN}, + MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction({ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID}, + MVT::Other, Custom); static const unsigned IntegerVPOps[] = { ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, @@ -527,191 +443,175 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX, ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN, - ISD::VP_MERGE, ISD::VP_SELECT}; + ISD::VP_MERGE, ISD::VP_SELECT, ISD::VP_FPTOSI, + ISD::VP_FPTOUI, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, + ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE}; static const unsigned FloatingPointVPOps[] = { - ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, - ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_MERGE, - ISD::VP_SELECT}; + ISD::VP_FADD, ISD::VP_FSUB, + ISD::VP_FMUL, ISD::VP_FDIV, + ISD::VP_FNEG, ISD::VP_FMA, + ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, + ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, + ISD::VP_MERGE, ISD::VP_SELECT, + ISD::VP_SITOFP, ISD::VP_UITOFP, + ISD::VP_SETCC, ISD::VP_FP_ROUND, + ISD::VP_FP_EXTEND}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector // element type being illegal. 
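// Editor's sketch (illustrative, not part of the patch): on RV32 an i64
// vector element is not a legal scalar, so values cross from GPRs as (lo, hi)
// 32-bit halves -- the shape behind the custom vXi64 handling above and the
// SPLAT_VECTOR_PARTS node used elsewhere in this file.
#include <cassert>
#include <cstdint>
static uint64_t fromParts(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
int main() {
  assert(fromParts(0x89ABCDEFu, 0x01234567u) == 0x0123456789ABCDEFull);
  assert(fromParts(0xFFFFFFFFu, 0xFFFFFFFFu) == ~0ULL); // splat of all-ones
  return 0;
}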
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::i64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::i64, Custom); - - setOperationAction(ISD::VECREDUCE_ADD, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_AND, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_OR, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_XOR, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, MVT::i64, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, MVT::i64, Custom); - - setOperationAction(ISD::VP_REDUCE_ADD, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_AND, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_OR, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_XOR, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_SMAX, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_SMIN, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_UMAX, MVT::i64, Custom); - setOperationAction(ISD::VP_REDUCE_UMIN, MVT::i64, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, + MVT::i64, Custom); + + setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, + ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, + ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, + ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}, + MVT::i64, Custom); + + setOperationAction({ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, + ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, + ISD::VP_REDUCE_SMAX, ISD::VP_REDUCE_SMIN, + ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN}, + MVT::i64, Custom); } for (MVT VT : BoolVecVTs) { + if (!isTypeLegal(VT)) + continue; + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); // Mask VTs are custom-expanded into a series of standard nodes - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction({ISD::TRUNCATE, ISD::CONCAT_VECTORS, + ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, + VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, + Custom); setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::VP_MERGE, VT, Expand); - setOperationAction(ISD::VP_SELECT, VT, Expand); + setOperationAction( + {ISD::SELECT_CC, ISD::VSELECT, ISD::VP_MERGE, ISD::VP_SELECT}, VT, + Expand); - setOperationAction(ISD::VP_AND, VT, Custom); - setOperationAction(ISD::VP_OR, VT, Custom); - setOperationAction(ISD::VP_XOR, VT, Custom); + setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR}, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction( + {ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR}, VT, + Custom); - setOperationAction(ISD::VP_REDUCE_AND, VT, Custom); - setOperationAction(ISD::VP_REDUCE_OR, VT, Custom); - setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom); + setOperationAction( + {ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR}, VT, + Custom); // RVV has native int->float & float->int conversions where the // element type sizes are within one power-of-two of each 
other. Any // wider distances between type sizes have to be lowered as sequences // which progressively narrow the gap in stages. - setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, VT, Custom); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction( + {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + VT, Custom); // Expand all extending loads to types larger than this, and truncating // stores from types larger than this. for (MVT OtherVT : MVT::integer_scalable_vector_valuetypes()) { setTruncStoreAction(OtherVT, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, OtherVT, + VT, Expand); } + + setOperationAction( + {ISD::VP_FPTOSI, ISD::VP_FPTOUI, ISD::VP_TRUNCATE, ISD::VP_SETCC}, VT, + Custom); + setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); } for (MVT VT : IntVecVTs) { - if (VT.getVectorElementType() == MVT::i64 && - !Subtarget.hasVInstructionsI64()) + if (!isTypeLegal(VT)) continue; setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom); // Vectors implement MULHS/MULHU. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand); // nxvXi64 MULHS/MULHU requires the V extension instead of Zve64*. - if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) { - setOperationAction(ISD::MULHU, VT, Expand); - setOperationAction(ISD::MULHS, VT, Expand); - } + if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) + setOperationAction({ISD::MULHU, ISD::MULHS}, VT, Expand); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, VT, + Legal); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction({ISD::CTTZ, ISD::CTLZ, ISD::CTPOP, ISD::BSWAP}, VT, + Expand); setOperationAction(ISD::BSWAP, VT, Expand); // Custom-lower extensions and truncations from/to mask types. - setOperationAction(ISD::ANY_EXTEND, VT, Custom); - setOperationAction(ISD::SIGN_EXTEND, VT, Custom); - setOperationAction(ISD::ZERO_EXTEND, VT, Custom); + setOperationAction({ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND}, + VT, Custom); // RVV has native int->float & float->int conversions where the // element type sizes are within one power-of-two of each other. Any // wider distances between type sizes have to be lowered as sequences // which progressively narrow the gap in stages. 
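// Editor's sketch (illustrative, not part of the patch) of the staged
// conversion rule stated above: a conversion is a single native step only
// when the element sizes are within one power of two of each other; a wider
// gap is closed one doubling or halving at a time.
#include <cassert>
static int conversionStages(unsigned FromBits, unsigned ToBits) {
  int Stages = 0;
  while (FromBits != ToBits) {
    FromBits = FromBits < ToBits ? FromBits * 2 : FromBits / 2;
    ++Stages;
  }
  return Stages;
}
int main() {
  assert(conversionStages(32, 64) == 1); // e.g. i32 -> f64: native
  assert(conversionStages(16, 64) == 2); // i16 -> f64: widen via 32 bits first
  assert(conversionStages(8, 64) == 3);  // i8 -> f64: three stages
  return 0;
}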
- setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, VT, Custom); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction( + {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction( + {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL" // nodes which truncate by one power of two at a time. setOperationAction(ISD::TRUNCATE, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, + Custom); // Custom-lower reduction operations to set up the corresponding custom // nodes' operands. - setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - - for (unsigned VPOpc : IntegerVPOps) - setOperationAction(VPOpc, VT, Custom); - - setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); - - setOperationAction(ISD::MLOAD, VT, Custom); - setOperationAction(ISD::MSTORE, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - - setOperationAction(ISD::VP_LOAD, VT, Custom); - setOperationAction(ISD::VP_STORE, VT, Custom); - setOperationAction(ISD::VP_GATHER, VT, Custom); - setOperationAction(ISD::VP_SCATTER, VT, Custom); - - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, + ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, + ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, + ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}, + VT, Custom); + + setOperationAction(IntegerVPOps, VT, Custom); + + setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + + setOperationAction({ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, + VT, Custom); + + setOperationAction( + {ISD::VP_LOAD, ISD::VP_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, + Custom); + + setOperationAction( + {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, + VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::STEP_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); + setOperationAction({ISD::STEP_VECTOR, ISD::VECTOR_REVERSE}, VT, Custom); for (MVT OtherVT : MVT::integer_scalable_vector_valuetypes()) { setTruncStoreAction(VT, OtherVT, Expand); - setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, OtherVT, + 
VT, Expand); } + // Splice + setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); + // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point // type that can represent the value exactly. if (VT.getVectorElementType() != MVT::i64) { @@ -719,8 +619,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32; EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount()); if (isTypeLegal(FloatVT)) { - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + Custom); } } } @@ -745,21 +645,35 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // sizes are within one power-of-two of each other. Therefore conversions // between vXf16 and vXf64 must be lowered as sequences which convert via // vXf32. - setOperationAction(ISD::FP_ROUND, VT, Custom); - setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, + Custom); // Expand various condition codes (explained above). - for (auto CC : VFPCCToExpand) - setCondCodeAction(CC, VT, Expand); - - setOperationAction(ISD::FMINNUM, VT, Legal); - setOperationAction(ISD::FMAXNUM, VT, Legal); - - setOperationAction(ISD::FTRUNC, VT, Custom); - setOperationAction(ISD::FCEIL, VT, Custom); - setOperationAction(ISD::FFLOOR, VT, Custom); + setCondCodeAction(VFPCCToExpand, VT, Expand); + + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, VT, Legal); + + setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND}, + VT, Custom); + + setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, + ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX}, + VT, Custom); + + // Expand FP operations that need libcalls. 
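// Editor's sketch (illustrative, not part of the patch): operations like FREM
// have no RVV instruction and no vector libcall, so the Expand actions set
// below fall back to scalar libm-style math, conceptually one call per lane
// as modeled here.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>
static std::vector<float> vfrem(const std::vector<float> &A,
                                const std::vector<float> &B) {
  std::vector<float> R(A.size());
  for (std::size_t I = 0; I < A.size(); ++I)
    R[I] = std::fmod(A[I], B[I]); // one scalar fmod call per lane
  return R;
}
int main() {
  std::vector<float> R = vfrem({5.5f, -7.0f}, {2.0f, 3.0f});
  assert(R[0] == 1.5f && R[1] == -1.0f);
  return 0;
}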
+ setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); @@ -768,30 +682,25 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Legal); - setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); + setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - setOperationAction(ISD::MLOAD, VT, Custom); - setOperationAction(ISD::MSTORE, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); + setOperationAction({ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, + VT, Custom); - setOperationAction(ISD::VP_LOAD, VT, Custom); - setOperationAction(ISD::VP_STORE, VT, Custom); - setOperationAction(ISD::VP_GATHER, VT, Custom); - setOperationAction(ISD::VP_SCATTER, VT, Custom); + setOperationAction( + {ISD::VP_LOAD, ISD::VP_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, + Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction( + {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, + VT, Custom); - setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); + setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); - for (unsigned VPOpc : FloatingPointVPOps) - setOperationAction(VPOpc, VT, Custom); + setOperationAction(FloatingPointVPOps, VT, Custom); }; // Sets common extload/truncstore actions on RVV floating-point vector @@ -804,21 +713,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } }; - if (Subtarget.hasVInstructionsF16()) - for (MVT VT : F16VecVTs) + if (Subtarget.hasVInstructionsF16()) { + for (MVT VT : F16VecVTs) { + if (!isTypeLegal(VT)) + continue; SetCommonVFPActions(VT); + } + } - for (MVT VT : F32VecVTs) { - if (Subtarget.hasVInstructionsF32()) + if (Subtarget.hasVInstructionsF32()) { + for (MVT VT : F32VecVTs) { + if (!isTypeLegal(VT)) + continue; SetCommonVFPActions(VT); - SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); + SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); + } } - for (MVT VT : F64VecVTs) { - if (Subtarget.hasVInstructionsF64()) + if (Subtarget.hasVInstructionsF64()) { + for (MVT VT : F64VecVTs) { + if (!isTypeLegal(VT)) + continue; SetCommonVFPActions(VT); - SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); - SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs); + SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); + SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs); + } } if (Subtarget.useRVVForFixedLengthVectors()) { @@ -831,23 +750,21 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(Op, VT, Expand); for (MVT OtherVT : MVT::integer_fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, OtherVT, 
                             Expand);
-        setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
-        setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
-        setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
+        setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD},
+                         OtherVT, VT, Expand);
       }
 
       // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
-        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
-        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+        setOperationAction({ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT,
+                           Custom);
 
-        setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-        setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+        setOperationAction({ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS}, VT,
+                           Custom);
 
-        setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
-        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+        setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT},
+                           VT, Custom);
 
-        setOperationAction(ISD::LOAD, VT, Custom);
-        setOperationAction(ISD::STORE, VT, Custom);
+        setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
 
       setOperationAction(ISD::SETCC, VT, Custom);
 
@@ -857,100 +774,80 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::BITCAST, VT, Custom);
 
-        setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+        setOperationAction(
+            {ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR}, VT,
+            Custom);
 
-        setOperationAction(ISD::VP_REDUCE_AND, VT, Custom);
-        setOperationAction(ISD::VP_REDUCE_OR, VT, Custom);
-        setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom);
+        setOperationAction(
+            {ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR}, VT,
+            Custom);
 
-        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
-        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
-        setOperationAction(ISD::FP_TO_SINT, VT, Custom);
-        setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+        setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
+                            ISD::FP_TO_UINT},
+                           VT, Custom);
 
       // Operations below are different between masks and other vectors.
       if (VT.getVectorElementType() == MVT::i1) {
-          setOperationAction(ISD::VP_AND, VT, Custom);
-          setOperationAction(ISD::VP_OR, VT, Custom);
-          setOperationAction(ISD::VP_XOR, VT, Custom);
-          setOperationAction(ISD::AND, VT, Custom);
-          setOperationAction(ISD::OR, VT, Custom);
-          setOperationAction(ISD::XOR, VT, Custom);
+          setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR, ISD::AND,
+                              ISD::OR, ISD::XOR},
+                             VT, Custom);
+
+          setOperationAction(
+              {ISD::VP_FPTOSI, ISD::VP_FPTOUI, ISD::VP_SETCC, ISD::VP_TRUNCATE},
+              VT, Custom);
         continue;
       }
 
-        // Use SPLAT_VECTOR to prevent type legalization from destroying the
-        // splats when type legalizing i64 scalar on RV32.
+        // Make SPLAT_VECTOR Legal so DAGCombine will convert splat vectors to
+        // it before type legalization for i64 vectors on RV32. It will then be
+        // type legalized to SPLAT_VECTOR_PARTS which we need to Custom handle.
       // FIXME: Use SPLAT_VECTOR for all types? DAGCombine probably needs
      // improvements first.
         if (!Subtarget.is64Bit() && VT.getVectorElementType() == MVT::i64) {
-          setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+          setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
           setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);
         }
 
         setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
         setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 
-        setOperationAction(ISD::MLOAD, VT, Custom);
-        setOperationAction(ISD::MSTORE, VT, Custom);
-        setOperationAction(ISD::MGATHER, VT, Custom);
-        setOperationAction(ISD::MSCATTER, VT, Custom);
-
-        setOperationAction(ISD::VP_LOAD, VT, Custom);
-        setOperationAction(ISD::VP_STORE, VT, Custom);
-        setOperationAction(ISD::VP_GATHER, VT, Custom);
-        setOperationAction(ISD::VP_SCATTER, VT, Custom);
-
-        setOperationAction(ISD::ADD, VT, Custom);
-        setOperationAction(ISD::MUL, VT, Custom);
-        setOperationAction(ISD::SUB, VT, Custom);
-        setOperationAction(ISD::AND, VT, Custom);
-        setOperationAction(ISD::OR, VT, Custom);
-        setOperationAction(ISD::XOR, VT, Custom);
-        setOperationAction(ISD::SDIV, VT, Custom);
-        setOperationAction(ISD::SREM, VT, Custom);
-        setOperationAction(ISD::UDIV, VT, Custom);
-        setOperationAction(ISD::UREM, VT, Custom);
-        setOperationAction(ISD::SHL, VT, Custom);
-        setOperationAction(ISD::SRA, VT, Custom);
-        setOperationAction(ISD::SRL, VT, Custom);
-
-        setOperationAction(ISD::SMIN, VT, Custom);
-        setOperationAction(ISD::SMAX, VT, Custom);
-        setOperationAction(ISD::UMIN, VT, Custom);
-        setOperationAction(ISD::UMAX, VT, Custom);
-        setOperationAction(ISD::ABS, VT, Custom);
+        setOperationAction(
+            {ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom);
+
+        setOperationAction(
+            {ISD::VP_LOAD, ISD::VP_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT,
+            Custom);
+
+        setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR,
+                            ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV,
+                            ISD::UREM, ISD::SHL, ISD::SRA, ISD::SRL},
+                           VT, Custom);
+
+        setOperationAction(
+            {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::ABS}, VT, Custom);
 
         // vXi64 MULHS/MULHU requires the V extension instead of Zve64*.
-        if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) {
-          setOperationAction(ISD::MULHS, VT, Custom);
-          setOperationAction(ISD::MULHU, VT, Custom);
-        }
+        if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV())
+          setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom);
 
-        setOperationAction(ISD::SADDSAT, VT, Custom);
-        setOperationAction(ISD::UADDSAT, VT, Custom);
-        setOperationAction(ISD::SSUBSAT, VT, Custom);
-        setOperationAction(ISD::USUBSAT, VT, Custom);
+        setOperationAction(
+            {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT,
+            Custom);
 
         setOperationAction(ISD::VSELECT, VT, Custom);
         setOperationAction(ISD::SELECT_CC, VT, Expand);
 
-        setOperationAction(ISD::ANY_EXTEND, VT, Custom);
-        setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
-        setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+        setOperationAction(
+            {ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND}, VT, Custom);
 
         // Custom-lower reduction operations to set up the corresponding custom
         // nodes' operands.
-        setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+        setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_SMAX,
+                            ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX,
+                            ISD::VECREDUCE_UMIN},
+                           VT, Custom);
 
-        for (unsigned VPOpc : IntegerVPOps)
-          setOperationAction(VPOpc, VT, Custom);
+        setOperationAction(IntegerVPOps, VT, Custom);
 
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
         // type that can represent the value exactly.
@@ -959,10 +856,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
               VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
           EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
-          if (isTypeLegal(FloatVT)) {
-            setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
-            setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
-          }
+          if (isTypeLegal(FloatVT))
+            setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+                               Custom);
         }
       }
 
@@ -979,69 +875,50 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
        }
 
        // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
-        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
-        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+        setOperationAction({ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT,
+                           Custom);
 
-        setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-        setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
-        setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
-        setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
-        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-
-        setOperationAction(ISD::LOAD, VT, Custom);
-        setOperationAction(ISD::STORE, VT, Custom);
-        setOperationAction(ISD::MLOAD, VT, Custom);
-        setOperationAction(ISD::MSTORE, VT, Custom);
-        setOperationAction(ISD::MGATHER, VT, Custom);
-        setOperationAction(ISD::MSCATTER, VT, Custom);
-
-        setOperationAction(ISD::VP_LOAD, VT, Custom);
-        setOperationAction(ISD::VP_STORE, VT, Custom);
-        setOperationAction(ISD::VP_GATHER, VT, Custom);
-        setOperationAction(ISD::VP_SCATTER, VT, Custom);
-
-        setOperationAction(ISD::FADD, VT, Custom);
-        setOperationAction(ISD::FSUB, VT, Custom);
-        setOperationAction(ISD::FMUL, VT, Custom);
-        setOperationAction(ISD::FDIV, VT, Custom);
-        setOperationAction(ISD::FNEG, VT, Custom);
-        setOperationAction(ISD::FABS, VT, Custom);
-        setOperationAction(ISD::FCOPYSIGN, VT, Custom);
-        setOperationAction(ISD::FSQRT, VT, Custom);
-        setOperationAction(ISD::FMA, VT, Custom);
-        setOperationAction(ISD::FMINNUM, VT, Custom);
-        setOperationAction(ISD::FMAXNUM, VT, Custom);
-
-        setOperationAction(ISD::FP_ROUND, VT, Custom);
-        setOperationAction(ISD::FP_EXTEND, VT, Custom);
-
-        setOperationAction(ISD::FTRUNC, VT, Custom);
-        setOperationAction(ISD::FCEIL, VT, Custom);
-        setOperationAction(ISD::FFLOOR, VT, Custom);
+        setOperationAction({ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
+                            ISD::VECTOR_SHUFFLE, ISD::INSERT_VECTOR_ELT,
+                            ISD::EXTRACT_VECTOR_ELT},
+                           VT, Custom);
+
+        setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE,
+                            ISD::MGATHER, ISD::MSCATTER},
+                           VT, Custom);
+
+        setOperationAction(
+            {ISD::VP_LOAD, ISD::VP_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT,
+            Custom);
+
+        setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
+                            ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, ISD::FSQRT,
+                            ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM},
+                           VT, Custom);
 
+        setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+
+        setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND},
+                           VT, Custom);
 
         for (auto CC : VFPCCToExpand)
           setCondCodeAction(CC, VT, Expand);
 
-        setOperationAction(ISD::VSELECT, VT, Custom);
-        setOperationAction(ISD::SELECT, VT, Custom);
+        setOperationAction({ISD::VSELECT, ISD::SELECT}, VT, Custom);
         setOperationAction(ISD::SELECT_CC, VT, Expand);
 
         setOperationAction(ISD::BITCAST, VT, Custom);
 
-        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
-        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+        setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD,
+                            ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX},
+                           VT, Custom);
 
-        for (unsigned VPOpc : FloatingPointVPOps)
-          setOperationAction(VPOpc, VT, Custom);
+        setOperationAction(FloatingPointVPOps, VT, Custom);
       }
 
       // Custom-legalize bitcasts from fixed-length vectors to scalar types.
-      setOperationAction(ISD::BITCAST, MVT::i8, Custom);
-      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
-      setOperationAction(ISD::BITCAST, MVT::i32, Custom);
-      setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+      setOperationAction(ISD::BITCAST, {MVT::i8, MVT::i16, MVT::i32, MVT::i64},
+                         Custom);
       if (Subtarget.hasStdExtZfh())
         setOperationAction(ISD::BITCAST, MVT::f16, Custom);
       if (Subtarget.hasStdExtF())
@@ -1061,30 +938,33 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   // Jumps are expensive, compared to logic
   setJumpIsExpensive();
 
-  setTargetDAGCombine(ISD::ADD);
-  setTargetDAGCombine(ISD::SUB);
-  setTargetDAGCombine(ISD::AND);
-  setTargetDAGCombine(ISD::OR);
-  setTargetDAGCombine(ISD::XOR);
-  setTargetDAGCombine(ISD::ANY_EXTEND);
-  if (Subtarget.hasStdExtF()) {
-    setTargetDAGCombine(ISD::ZERO_EXTEND);
-    setTargetDAGCombine(ISD::FP_TO_SINT);
-    setTargetDAGCombine(ISD::FP_TO_UINT);
-    setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
-    setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
-  }
-  if (Subtarget.hasVInstructions()) {
-    setTargetDAGCombine(ISD::FCOPYSIGN);
-    setTargetDAGCombine(ISD::MGATHER);
-    setTargetDAGCombine(ISD::MSCATTER);
-    setTargetDAGCombine(ISD::VP_GATHER);
-    setTargetDAGCombine(ISD::VP_SCATTER);
+  setTargetDAGCombine({ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
+                       ISD::OR, ISD::XOR});
+  if (Subtarget.is64Bit())
     setTargetDAGCombine(ISD::SRA);
-    setTargetDAGCombine(ISD::SRL);
-    setTargetDAGCombine(ISD::SHL);
-    setTargetDAGCombine(ISD::STORE);
-  }
+
+  if (Subtarget.hasStdExtF())
+    setTargetDAGCombine({ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM});
+
+  if (Subtarget.hasStdExtZbp())
+    setTargetDAGCombine({ISD::ROTL, ISD::ROTR});
+
+  if (Subtarget.hasStdExtZbb())
+    setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
+
+  if (Subtarget.hasStdExtZbkb())
+    setTargetDAGCombine(ISD::BITREVERSE);
+  if (Subtarget.hasStdExtZfh() || Subtarget.hasStdExtZbb())
+    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  if (Subtarget.hasStdExtF())
+    setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+                         ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
+  if (Subtarget.hasVInstructions())
+    setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
+                         ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
+                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR});
+  if (Subtarget.useRVVForFixedLengthVectors())
+    setTargetDAGCombine(ISD::BITCAST);
 
   setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
   setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
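Aside (not part of the patch): most of the churn above comes from new list-taking overloads of setOperationAction/setTargetDAGCombine, so one call can register many opcodes. A minimal standalone C++ sketch of that pattern, with illustrative names rather than LLVM's exact signatures:

#include <cstdio>
#include <initializer_list>
#include <map>

// Toy model: the single-opcode setter remains the primitive; the list
// overload simply fans out to it.
static std::map<unsigned, int> Actions;
static void setAction(unsigned Op, int Action) { Actions[Op] = Action; }
static void setAction(std::initializer_list<unsigned> Ops, int Action) {
  for (unsigned Op : Ops)
    setAction(Op, Action);
}

int main() {
  enum { LOAD = 1, STORE = 2, Custom = 7 };
  setAction({LOAD, STORE}, Custom); // replaces two single-opcode calls
  std::printf("%d %d\n", Actions[LOAD], Actions[STORE]); // prints: 7 7
}

The behavior is unchanged; the overload only removes the repetition visible in the deleted lines.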
@@ -1149,6 +1029,24 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.size = MemoryLocation::UnknownSize;
     Info.flags |= MachineMemOperand::MOStore;
     return true;
+  case Intrinsic::riscv_seg2_load:
+  case Intrinsic::riscv_seg3_load:
+  case Intrinsic::riscv_seg4_load:
+  case Intrinsic::riscv_seg5_load:
+  case Intrinsic::riscv_seg6_load:
+  case Intrinsic::riscv_seg7_load:
+  case Intrinsic::riscv_seg8_load:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.memVT =
+        getValueType(DL, I.getType()->getStructElementType(0)->getScalarType());
+    Info.align =
+        Align(DL.getTypeSizeInBits(
+                  I.getType()->getStructElementType(0)->getScalarType()) /
+              8);
+    Info.size = MemoryLocation::UnknownSize;
+    Info.flags |= MachineMemOperand::MOLoad;
+    return true;
   }
 }
 
@@ -1160,6 +1058,10 @@ bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
   if (AM.BaseGV)
     return false;
 
+  // RVV instructions only support register addressing.
+  if (Subtarget.hasVInstructions() && isa<VectorType>(Ty))
+    return AM.HasBaseReg && AM.Scale == 0 && !AM.BaseOffs;
+
   // Require a 12-bit signed offset.
   if (!isInt<12>(AM.BaseOffs))
     return false;
@@ -1225,6 +1127,10 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
   return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
 }
 
+bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const {
+  return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
+}
+
 bool RISCVTargetLowering::isCheapToSpeculateCttz() const {
   return Subtarget.hasStdExtZbb();
 }
@@ -1245,6 +1151,36 @@ bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const {
          !isa<ConstantSDNode>(Y);
 }
 
+bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
+  // We can use ANDI+SEQZ/SNEZ as a bit test. Y contains the bit position.
+  auto *C = dyn_cast<ConstantSDNode>(Y);
+  return C && C->getAPIntValue().ule(10);
+}
+
+bool RISCVTargetLowering::
+    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+        SelectionDAG &DAG) const {
+  // One interesting pattern that we'd want to form is 'bit extract':
+  //   ((1 >> Y) & 1) ==/!= 0
+  // But we also need to be careful not to try to reverse that fold.
+
+  // Is this '((1 >> Y) & 1)'?
+  if (XC && OldShiftOpcode == ISD::SRL && XC->isOne())
+    return false; // Keep the 'bit extract' pattern.
+
+  // Will this be '((1 >> Y) & 1)' after the transform?
+  if (NewShiftOpcode == ISD::SRL && CC->isOne())
+    return true; // Do form the 'bit extract' pattern.
+
+  // If 'X' is a constant and we transform, then we will immediately try to
+  // undo the fold, causing an endless combine loop. So only do the transform
+  // if X is not a constant. This matches the default implementation of this
+  // function.
+  return !XC;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// splats of scalars can fold into vector instructions.
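Aside (not part of the patch): the `ule(10)` bound in hasBitTest follows from ANDI's 12-bit signed immediate. The largest single-bit mask ANDI can encode is 1 << 10 = 1024, since 1 << 11 = 2048 exceeds the simm12 maximum of 2047. A standalone check of that arithmetic:

#include <cassert>
#include <cstdint>

// A bit test (X >> Pos) & 1 can be selected as ANDI+SEQZ/SNEZ only while the
// mask 1 << Pos still fits in ANDI's 12-bit signed immediate.
static bool fitsInAndiBitTest(unsigned Pos) {
  return (int64_t(1) << Pos) <= 2047; // simm12 upper bound
}

int main() {
  assert(fitsInAndiBitTest(10));  // e.g. andi a1, a0, 1024; snez a0, a1
  assert(!fitsInAndiBitTest(11)); // 2048 no longer fits
}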
@@ -1282,6 +1218,7 @@ bool RISCVTargetLowering::shouldSinkOperands(
   if (auto *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::fma:
+    case Intrinsic::vp_fma:
       return Operand == 0 || Operand == 1;
     // FIXME: Our patterns can only match vx/vf instructions when the splat
     // is on the RHS, because TableGen doesn't recognize our VP operations
@@ -1345,6 +1282,15 @@ bool RISCVTargetLowering::shouldSinkOperands(
   return true;
 }
 
+bool RISCVTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  // In order to maximise the opportunity for common subexpression elimination,
+  // keep a separate ADD node for the global address offset instead of folding
+  // it into the global address node. Later peephole optimisations may choose
+  // to fold it back in when profitable.
+  return false;
+}
+
 bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
   // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin.
@@ -1583,7 +1529,7 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
   if (VT.getFixedSizeInBits() > 1024 * 8)
     return false;
 
-  unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+  unsigned MinVLen = Subtarget.getRealMinVLen();
 
   MVT EltVT = VT.getVectorElementType();
 
@@ -1621,7 +1567,7 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
   }
 
   // Reject elements larger than ELEN.
-  if (EltVT.getSizeInBits() > Subtarget.getMaxELENForFixedLengthVectors())
+  if (EltVT.getSizeInBits() > Subtarget.getELEN())
     return false;
 
   unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
@@ -1649,8 +1595,8 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
           useRVVForFixedLengthVectorVT(VT, Subtarget)) &&
          "Expected legal fixed length vector!");
 
-  unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
-  unsigned MaxELen = Subtarget.getMaxELENForFixedLengthVectors();
+  unsigned MinVLen = Subtarget.getRealMinVLen();
+  unsigned MaxELen = Subtarget.getELEN();
 
   MVT EltVT = VT.getVectorElementType();
   switch (EltVT.SimpleTy) {
@@ -1710,6 +1656,23 @@ static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
 }
 
+/// Return the mask type suitable for masking the provided vector type. This
+/// is simply an i1 element type vector of the same (possibly scalable) length.
+static MVT getMaskTypeFor(EVT VecVT) {
+  assert(VecVT.isVector());
+  ElementCount EC = VecVT.getVectorElementCount();
+  return MVT::getVectorVT(MVT::i1, EC);
+}
+
+/// Creates an all ones mask suitable for masking a vector of type VecTy with
+/// vector length VL.
+static SDValue getAllOnesMask(MVT VecVT, SDValue VL, SDLoc DL,
+                              SelectionDAG &DAG) {
+  MVT MaskVT = getMaskTypeFor(VecVT);
+  return DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+}
+
 // Gets the two common "VL" operands: an all-ones mask and the vector length.
 // VecVT is a vector type, either fixed-length or scalable, and ContainerVT is
 // the vector type that it is contained in.
@@ -1720,9 +1683,8 @@ getDefaultVLOps(MVT VecVT, MVT ContainerVT, SDLoc DL, SelectionDAG &DAG,
   MVT XLenVT = Subtarget.getXLenVT();
   SDValue VL = VecVT.isFixedLengthVector()
                   ? DAG.getConstant(VecVT.getVectorNumElements(), DL, XLenVT)
-                   : DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT);
-  MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
-  SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+                   : DAG.getRegister(RISCV::X0, XLenVT);
+  SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG);
   return {Mask, VL};
 }
 
@@ -1747,14 +1709,6 @@ bool RISCVTargetLowering::shouldExpandBuildVectorWithShuffles(
   return false;
 }
 
-bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
-  // Only splats are currently supported.
-  if (ShuffleVectorSDNode::isSplatMask(M.data(), VT))
-    return true;
-
-  return false;
-}
-
 static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
                                   const RISCVSubtarget &Subtarget) {
   // RISCV FP-to-int conversions saturate to the destination register size, but
@@ -1796,7 +1750,7 @@ static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
 
   // Freeze the source since we are increasing the number of uses.
-  SDValue Src = DAG.getNode(ISD::FREEZE, DL, VT, Op.getOperand(0));
+  SDValue Src = DAG.getFreeze(Op.getOperand(0));
 
   // Truncate to integer and convert back to FP.
   MVT IntVT = VT.changeVectorElementTypeToInteger();
@@ -1844,21 +1798,56 @@ static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG) {
   return DAG.getSelect(DL, VT, Setcc, Truncated, Src);
 }
 
-static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG,
-                                 const RISCVSubtarget &Subtarget) {
+// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+// This mode isn't supported in vector hardware on RISCV. But as long as we
+// aren't compiling with trapping math, we can emulate this with
+// floor(X + copysign(nextafter(0.5, 0.0), X)).
+// FIXME: Could be shorter by changing rounding mode, but we don't have FRM
+// dependencies modeled yet.
+// FIXME: Use masked operations to avoid final merge.
+static SDValue lowerFROUND(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  assert(VT.isFixedLengthVector() && "Unexpected vector!");
-
-  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  assert(VT.isVector() && "Unexpected type");
 
   SDLoc DL(Op);
 
-  SDValue Mask, VL;
-  std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-
-  unsigned Opc =
-      VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
-  SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, Op.getOperand(0), VL);
-  return convertFromScalableVector(VT, Splat, DAG, Subtarget);
+  // Freeze the source since we are increasing the number of uses.
+  SDValue Src = DAG.getFreeze(Op.getOperand(0));
+
+  // We do the conversion on the absolute value and fix the sign at the end.
+  SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, Src);
+
+  const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+  bool Ignored;
+  APFloat Point5Pred = APFloat(0.5f);
+  Point5Pred.convert(FltSem, APFloat::rmNearestTiesToEven, &Ignored);
+  Point5Pred.next(/*nextDown*/ true);
+
+  // Add the adjustment.
+  SDValue Adjust = DAG.getNode(ISD::FADD, DL, VT, Abs,
+                               DAG.getConstantFP(Point5Pred, DL, VT));
+
+  // Truncate to integer and convert back to fp.
+  MVT IntVT = VT.changeVectorElementTypeToInteger();
+  SDValue Truncated = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, Adjust);
+  Truncated = DAG.getNode(ISD::SINT_TO_FP, DL, VT, Truncated);
+
+  // Restore the original sign.
+  Truncated = DAG.getNode(ISD::FCOPYSIGN, DL, VT, Truncated, Src);
+
+  // Determine the largest integer that can be represented exactly. This and
+  // values larger than it don't have any fractional bits so don't need to
+  // be converted.
+  unsigned Precision = APFloat::semanticsPrecision(FltSem);
+  APFloat MaxVal = APFloat(FltSem);
+  MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
+                          /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
+  SDValue MaxValNode = DAG.getConstantFP(MaxVal, DL, VT);
+
+  // If abs(Src) was larger than MaxVal or nan, keep it.
+  MVT SetccVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+  SDValue Setcc = DAG.getSetCC(DL, SetccVT, Abs, MaxValNode, ISD::SETOLT);
+  return DAG.getSelect(DL, VT, Setcc, Truncated, Src);
 }
 
 struct VIDSequence {
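Aside (not part of the patch): a scalar C++ model of the FROUND emulation above, showing why the adjustment is nextafter(0.5, 0.0) rather than 0.5: adding exactly 0.5 to the double just below 0.5 would round up to 1.0 before truncation.

#include <cassert>
#include <cmath>

static double emulatedRound(double X) {
  // Adjustment with the same sign as X, slightly below 0.5 in magnitude.
  double Adjust = std::copysign(std::nextafter(0.5, 0.0), X);
  // Mirrors the FP_TO_SINT/SINT_TO_FP round trip in the vector lowering.
  double Truncated = std::trunc(X + Adjust);
  // Values of magnitude >= 2^52 (or NaN) have no fractional bits; keep them.
  return std::fabs(X) < 0x1p52 ? Truncated : X;
}

int main() {
  assert(emulatedRound(2.5) == 3.0);   // ties round away from zero
  assert(emulatedRound(-2.5) == -3.0);
  assert(emulatedRound(2.4999) == 2.0);
  assert(emulatedRound(0.49999999999999994) == 0.0); // would give 1.0 with +0.5
}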
@@ -1908,37 +1897,27 @@ static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
       // A zero-value value difference means that we're somewhere in the middle
       // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
       // step change before evaluating the sequence.
-      if (ValDiff != 0) {
-        int64_t Remainder = ValDiff % IdxDiff;
-        // Normalize the step if it's greater than 1.
-        if (Remainder != ValDiff) {
-          // The difference must cleanly divide the element span.
-          if (Remainder != 0)
-            return None;
-          ValDiff /= IdxDiff;
-          IdxDiff = 1;
-        }
-
-        if (!SeqStepNum)
-          SeqStepNum = ValDiff;
-        else if (ValDiff != SeqStepNum)
-          return None;
+      if (ValDiff == 0)
+        continue;
 
-        if (!SeqStepDenom)
-          SeqStepDenom = IdxDiff;
-        else if (IdxDiff != *SeqStepDenom)
+      int64_t Remainder = ValDiff % IdxDiff;
+      // Normalize the step if it's greater than 1.
+      if (Remainder != ValDiff) {
+        // The difference must cleanly divide the element span.
+        if (Remainder != 0)
           return None;
+        ValDiff /= IdxDiff;
+        IdxDiff = 1;
       }
-    }
 
-    // Record and/or check any addend.
-    if (SeqStepNum && SeqStepDenom) {
-      uint64_t ExpectedVal =
-          (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
-      int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
-      if (!SeqAddend)
-        SeqAddend = Addend;
-      else if (SeqAddend != Addend)
+      if (!SeqStepNum)
+        SeqStepNum = ValDiff;
+      else if (ValDiff != SeqStepNum)
+        return None;
+
+      if (!SeqStepDenom)
+        SeqStepDenom = IdxDiff;
+      else if (IdxDiff != *SeqStepDenom)
         return None;
     }
 
@@ -1946,14 +1925,68 @@ static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
     if (!PrevElt || PrevElt->first != Val)
       PrevElt = std::make_pair(Val, Idx);
   }
-  // We need to have logged both a step and an addend for this to count as
-  // a legal index sequence.
-  if (!SeqStepNum || !SeqStepDenom || !SeqAddend)
+
+  // We need to have logged a step for this to count as a legal index sequence.
+  if (!SeqStepNum || !SeqStepDenom)
     return None;
 
+  // Loop back through the sequence and validate elements we might have skipped
+  // while waiting for a valid step. While doing this, log any sequence addend.
+  for (unsigned Idx = 0; Idx < NumElts; Idx++) {
+    if (Op.getOperand(Idx).isUndef())
+      continue;
+    uint64_t Val = Op.getConstantOperandVal(Idx) &
+                   maskTrailingOnes<uint64_t>(EltSizeInBits);
+    uint64_t ExpectedVal =
+        (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
+    int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
+    if (!SeqAddend)
+      SeqAddend = Addend;
+    else if (Addend != SeqAddend)
+      return None;
+  }
+
+  assert(SeqAddend && "Must have an addend if we have a step");
+
+  return VIDSequence{*SeqStepNum, *SeqStepDenom, *SeqAddend};
 }
 
+// Match a splatted value (SPLAT_VECTOR/BUILD_VECTOR) of an EXTRACT_VECTOR_ELT
+// and lower it as a VRGATHER_VX_VL from the source vector.
+static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
+                                  SelectionDAG &DAG,
+                                  const RISCVSubtarget &Subtarget) {
+  if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = SplatVal.getOperand(0);
+  // Only perform this optimization on vectors of the same size for simplicity.
+  // Don't perform this optimization for i1 vectors.
+  // FIXME: Support i1 vectors, maybe by promoting to i8?
+  if (Vec.getValueType() != VT || VT.getVectorElementType() == MVT::i1)
+    return SDValue();
+  SDValue Idx = SplatVal.getOperand(1);
+  // The index must be a legal type.
+  if (Idx.getValueType() != Subtarget.getXLenVT())
+    return SDValue();
+
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector()) {
+    ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+    Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+  }
+
+  SDValue Mask, VL;
+  std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, Vec,
+                               Idx, Mask, DAG.getUNDEF(ContainerVT), VL);
+
+  if (!VT.isFixedLengthVector())
+    return Gather;
+
+  return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+}
+
 static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
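Aside (not part of the patch): isSimpleVIDSequence accepts exactly the sequences satisfying V[i] == (i * Num) / Denom + Addend. A tiny standalone checker (illustrative, not the LLVM code) makes the accepted shapes concrete:

#include <cstdint>
#include <cstdio>
#include <vector>

static bool isVID(const std::vector<int64_t> &V, int64_t Num, int64_t Denom,
                  int64_t Addend) {
  for (size_t I = 0; I < V.size(); ++I)
    if (V[I] != (int64_t(I) * Num) / Denom + Addend)
      return false;
  return true;
}

int main() {
  // <1,3,5,7> is VID scaled by 2 plus 1: Num=2, Denom=1, Addend=1.
  std::printf("%d\n", isVID({1, 3, 5, 7}, 2, 1, 1)); // prints: 1
  // <0,0,1,1> is the fractional step the comments mention: Num=1, Denom=2.
  std::printf("%d\n", isVID({0, 0, 1, 1}, 1, 2, 0)); // prints: 1
}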
@@ -1989,8 +2022,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     // codegen across RV32 and RV64.
     unsigned NumViaIntegerBits =
         std::min(std::max(NumElts, 8u), Subtarget.getXLen());
-    NumViaIntegerBits = std::min(NumViaIntegerBits,
-                                 Subtarget.getMaxELENForFixedLengthVectors());
+    NumViaIntegerBits = std::min(NumViaIntegerBits, Subtarget.getELEN());
     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
       // If we have to use more than one INSERT_VECTOR_ELT then this
       // optimization is likely to increase code size; avoid performing it in
@@ -2012,7 +2044,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
         // our vector and clear our accumulated data.
         if (I != 0 && I % NumViaIntegerBits == 0) {
           if (NumViaIntegerBits <= 32)
-            Bits = SignExtend64(Bits, 32);
+            Bits = SignExtend64<32>(Bits);
           SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
           Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec,
                             Elt, DAG.getConstant(IntegerEltIdx, DL, XLenVT));
@@ -2028,7 +2060,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       // Insert the (remaining) scalar value into position in our integer
       // vector type.
       if (NumViaIntegerBits <= 32)
-        Bits = SignExtend64(Bits, 32);
+        Bits = SignExtend64<32>(Bits);
       SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec, Elt,
                         DAG.getConstant(IntegerEltIdx, DL, XLenVT));
@@ -2077,9 +2109,12 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   }
 
   if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
+    if (auto Gather = matchSplatAsGather(Splat, VT, DL, DAG, Subtarget))
+      return Gather;
     unsigned Opc =
         VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
-    Splat = DAG.getNode(Opc, DL, ContainerVT, Splat, VL);
+    Splat =
+        DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Splat, VL);
     return convertFromScalableVector(VT, Splat, DAG, Subtarget);
   }
 
@@ -2109,7 +2144,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     // a single addi instruction.
     if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
          (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
-        isPowerOf2_32(StepDenominator) && isInt<5>(Addend)) {
+        isPowerOf2_32(StepDenominator) &&
+        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
       SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL);
       // Convert right out of the scalable type so we can use standard ISD
       // nodes for the rest of the computation. If we used scalable types with
@@ -2118,18 +2154,18 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       VID = convertFromScalableVector(VT, VID, DAG, Subtarget);
       if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
           (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
-        SDValue SplatStep = DAG.getSplatVector(
+        SDValue SplatStep = DAG.getSplatBuildVector(
             VT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT));
         VID = DAG.getNode(StepOpcode, DL, VT, VID, SplatStep);
       }
       if (StepDenominator != 1) {
-        SDValue SplatStep = DAG.getSplatVector(
+        SDValue SplatStep = DAG.getSplatBuildVector(
            VT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT));
        VID = DAG.getNode(ISD::SRL, DL, VT, VID, SplatStep);
       }
       if (Addend != 0 || Negate) {
-        SDValue SplatAddend =
-            DAG.getSplatVector(VT, DL, DAG.getConstant(Addend, DL, XLenVT));
+        SDValue SplatAddend = DAG.getSplatBuildVector(
+            VT, DL, DAG.getConstant(Addend, DL, XLenVT));
         VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VT, SplatAddend,
                           VID);
       }
       return VID;
@@ -2172,7 +2208,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   // On RV64, sign-extend from 32 to 64 bits where possible in order to
   // achieve better constant materialization.
   if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
-    SplatValue = SignExtend64(SplatValue, 32);
+    SplatValue = SignExtend64<32>(SplatValue);
 
   // Since we can't introduce illegal i64 types at this stage, we can only
   // perform an i64 splat on RV32 if it is its own sign-extended value. That
@@ -2187,6 +2223,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
         getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget);
     SDValue Splat =
         DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT,
+                    DAG.getUNDEF(ViaContainerVT),
                     DAG.getConstant(SplatValue, DL, XLenVT), ViaVL);
     Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget);
     return DAG.getBitcast(VT, Splat);
@@ -2274,57 +2311,66 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo,
-                                   SDValue Hi, SDValue VL, SelectionDAG &DAG) {
+static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
+                                   SDValue Lo, SDValue Hi, SDValue VL,
+                                   SelectionDAG &DAG) {
+  if (!Passthru)
+    Passthru = DAG.getUNDEF(VT);
   if (isa<ConstantSDNode>(Lo) && isa<ConstantSDNode>(Hi)) {
     int32_t LoC = cast<ConstantSDNode>(Lo)->getSExtValue();
     int32_t HiC = cast<ConstantSDNode>(Hi)->getSExtValue();
 
     // If Hi constant is all the same sign bit as Lo, lower this as a custom
     // node in order to try and match RVV vector/scalar instructions.
     if ((LoC >> 31) == HiC)
-      return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+      return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL);
 
-    // If vl is equal to VLMax and Hi constant is equal to Lo, we could use
+    // If vl is equal to XLEN_MAX and Hi constant is equal to Lo, we could use
     // vmv.v.x whose EEW = 32 to lower it.
     auto *Const = dyn_cast<ConstantSDNode>(VL);
-    if (LoC == HiC && Const && Const->getSExtValue() == RISCV::VLMaxSentinel) {
+    if (LoC == HiC && Const && Const->isAllOnesValue()) {
       MVT InterVT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
       // TODO: if vl <= min(VLMAX), we can also do this. But we could not
       // access the subtarget here now.
-      auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT, Lo, VL);
+      auto InterVec = DAG.getNode(
+          RISCVISD::VMV_V_X_VL, DL, InterVT, DAG.getUNDEF(InterVT), Lo,
+          DAG.getRegister(RISCV::X0, MVT::i32));
       return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
     }
   }
 
   // Fall back to a stack store and stride x0 vector load.
-  return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Lo, Hi, VL);
+  return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Passthru, Lo,
+                     Hi, VL);
 }
 
 // Called by type legalization to handle splat of i64 on RV32.
 // FIXME: We can optimize this when the type has sign or zero bits in one
 // of the halves.
-static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
-                                   SDValue VL, SelectionDAG &DAG) {
+static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
+                                   SDValue Scalar, SDValue VL,
+                                   SelectionDAG &DAG) {
   assert(Scalar.getValueType() == MVT::i64 && "Unexpected VT!");
   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
                            DAG.getConstant(0, DL, MVT::i32));
   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
                            DAG.getConstant(1, DL, MVT::i32));
-  return splatPartsI64WithVL(DL, VT, Lo, Hi, VL, DAG);
+  return splatPartsI64WithVL(DL, VT, Passthru, Lo, Hi, VL, DAG);
 }
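Aside (not part of the patch): the EEW=32 special case in splatPartsI64WithVL relies on a simple bit-level fact, sketched standalone below: when the two 32-bit halves of the splat value are equal, splatting the half across twice as many i32 lanes and bitcasting back reproduces the i64 splat.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Lo == Hi, as required by the LoC == HiC check in the lowering above.
  uint32_t Halves[2] = {1, 1};
  uint64_t Lane;
  std::memcpy(&Lane, Halves, sizeof(Lane)); // two i32 lanes viewed as one i64
  assert(Lane == 0x0000000100000001ULL);    // little-endian lane layout
}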
 
 // This function lowers a splat of a scalar operand Splat with the vector
 // length VL. It ensures the final sequence is type legal, which is useful when
 // lowering a splat after type legalization.
-static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
-                                SelectionDAG &DAG,
+static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL,
+                                MVT VT, SDLoc DL, SelectionDAG &DAG,
                                 const RISCVSubtarget &Subtarget) {
+  bool HasPassthru = Passthru && !Passthru.isUndef();
+  if (!HasPassthru && !Passthru)
+    Passthru = DAG.getUNDEF(VT);
   if (VT.isFloatingPoint()) {
     // If VL is 1, we could use vfmv.s.f.
     if (isOneConstant(VL))
-      return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, DAG.getUNDEF(VT),
-                         Scalar, VL);
-    return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+      return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, Passthru, Scalar, VL);
+    return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Passthru, Scalar, VL);
   }
 
   MVT XLenVT = Subtarget.getXLenVT();
@@ -2343,55 +2389,25 @@ static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
     // use vmv.s.x.
     if (isOneConstant(VL) &&
         (!Const || isNullConstant(Scalar) || !isInt<5>(Const->getSExtValue())))
-      return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), Scalar,
-                         VL);
-    return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
+      return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, Passthru, Scalar, VL);
+    return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL);
   }
 
   assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
          "Unexpected scalar for splat lowering!");
 
   if (isOneConstant(VL) && isNullConstant(Scalar))
-    return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT),
+    return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, Passthru,
                        DAG.getConstant(0, DL, XLenVT), VL);
 
   // Otherwise use the more complicated splatting algorithm.
-  return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
-}
-
-// Is the mask a slidedown that shifts in undefs.
-static int matchShuffleAsSlideDown(ArrayRef<int> Mask) {
-  int Size = Mask.size();
-
-  // Elements shifted in should be undef.
-  auto CheckUndefs = [&](int Shift) {
-    for (int i = Size - Shift; i != Size; ++i)
-      if (Mask[i] >= 0)
-        return false;
-    return true;
-  };
-
-  // Elements should be shifted or undef.
-  auto MatchShift = [&](int Shift) {
-    for (int i = 0; i != Size - Shift; ++i)
-      if (Mask[i] >= 0 && Mask[i] != Shift + i)
-        return false;
-    return true;
-  };
-
-  // Try all possible shifts.
-  for (int Shift = 1; Shift != Size; ++Shift)
-    if (CheckUndefs(Shift) && MatchShift(Shift))
-      return Shift;
-
-  // No match.
-  return -1;
+  return splatSplitI64WithVL(DL, VT, Passthru, Scalar, VL, DAG);
 }
 
 static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
                                 const RISCVSubtarget &Subtarget) {
   // We need to be able to widen elements to the next larger integer type.
-  if (VT.getScalarSizeInBits() >= Subtarget.getMaxELENForFixedLengthVectors())
+  if (VT.getScalarSizeInBits() >= Subtarget.getELEN())
     return false;
 
   int Size = Mask.size();
@@ -2430,6 +2446,79 @@ static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
   return true;
 }
 
+/// Match shuffles that concatenate two vectors, rotate the concatenation,
+/// and then extract the original number of elements from the rotated result.
+/// This is equivalent to vector.splice or X86's PALIGNR instruction. The
+/// returned rotation amount is for a rotate right, where elements move from
+/// higher elements to lower elements. \p LoSrc indicates the first source
+/// vector of the rotate or -1 for undef. \p HiSrc indicates the second vector
+/// of the rotate or -1 for undef. At least one of \p LoSrc and \p HiSrc
+/// will be 0 or 1 if a rotation is found.
+///
+/// NOTE: We talk about rotate to the right which matches how bit shift and
+/// rotate instructions are described where LSBs are on the right, but LLVM IR
+/// and the table below write vectors with the lowest elements on the left.
+static int isElementRotate(int &LoSrc, int &HiSrc, ArrayRef<int> Mask) {
+  int Size = Mask.size();
+
+  // We need to detect various ways of spelling a rotation:
+  //   [11, 12, 13, 14, 15,  0,  1,  2]
+  //   [-1, 12, 13, 14, -1, -1,  1, -1]
+  //   [-1, -1, -1, -1, -1, -1,  1,  2]
+  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
+  //   [-1,  4,  5,  6, -1, -1,  9, -1]
+  //   [-1,  4,  5,  6, -1, -1, -1, -1]
+  int Rotation = 0;
+  LoSrc = -1;
+  HiSrc = -1;
+  for (int i = 0; i != Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Determine where a rotated vector would have started.
+    int StartIdx = i - (M % Size);
+    // The identity rotation isn't interesting, stop.
+    if (StartIdx == 0)
+      return -1;
+
+    // If we found the tail of a vector the rotation must be the missing
+    // front. If we found the head of a vector, it must be how much of the
+    // head.
+    int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+
+    if (Rotation == 0)
+      Rotation = CandidateRotation;
+    else if (Rotation != CandidateRotation)
+      // The rotations don't match, so we can't match this mask.
+      return -1;
+
+    // Compute which value this mask is pointing at.
+    int MaskSrc = M < Size ? 0 : 1;
+
+    // Compute which of the two target values this index should be assigned to.
+    // This reflects whether the high elements are remaining or the low
+    // elements are remaining.
+    int &TargetSrc = StartIdx < 0 ? HiSrc : LoSrc;
+
+    // Either set up this value if we've not encountered it before, or check
+    // that it remains consistent.
+    if (TargetSrc < 0)
+      TargetSrc = MaskSrc;
+    else if (TargetSrc != MaskSrc)
+      // This may be a rotation, but it pulls from the inputs in some
+      // unsupported interleaving.
+      return -1;
+  }
+
+  // Check that we successfully analyzed the mask, and normalize the results.
+  assert(Rotation != 0 && "Failed to locate a viable rotation!");
+  assert((LoSrc >= 0 || HiSrc >= 0) &&
+         "Failed to find a rotated input vector!");
+
+  return Rotation;
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -2506,33 +2595,59 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       unsigned Opc =
           VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
-      SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
+      SDValue Splat =
+          DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), V, VL);
       return convertFromScalableVector(VT, Splat, DAG, Subtarget);
     }
 
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
     assert(Lane < (int)NumElts && "Unexpected lane!");
-    SDValue Gather =
-        DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,
-                    DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);
+    SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT,
                                 V1, DAG.getConstant(Lane, DL, XLenVT),
+                                 TrueMask, DAG.getUNDEF(ContainerVT), VL);
     return convertFromScalableVector(VT, Gather, DAG, Subtarget);
   }
   }
 
   ArrayRef<int> Mask = SVN->getMask();
 
-  // Try to match as a slidedown.
-  int SlideAmt = matchShuffleAsSlideDown(Mask);
-  if (SlideAmt >= 0) {
-    // TODO: Should we reduce the VL to account for the upper undef elements?
-    // Requires additional vsetvlis, but might be faster to execute.
-    V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    SDValue SlideDown =
-        DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
-                    DAG.getUNDEF(ContainerVT), V1,
-                    DAG.getConstant(SlideAmt, DL, XLenVT),
-                    TrueMask, VL);
-    return convertFromScalableVector(VT, SlideDown, DAG, Subtarget);
+  // Lower rotations to a SLIDEDOWN and a SLIDEUP. One of the source vectors may
+  // be undef which can be handled with a single SLIDEDOWN/UP.
+  int LoSrc, HiSrc;
+  int Rotation = isElementRotate(LoSrc, HiSrc, Mask);
+  if (Rotation > 0) {
+    SDValue LoV, HiV;
+    if (LoSrc >= 0) {
+      LoV = LoSrc == 0 ? V1 : V2;
+      LoV = convertToScalableVector(ContainerVT, LoV, DAG, Subtarget);
+    }
+    if (HiSrc >= 0) {
+      HiV = HiSrc == 0 ? V1 : V2;
+      HiV = convertToScalableVector(ContainerVT, HiV, DAG, Subtarget);
+    }
+
+    // We found a rotation. We need to slide HiV down by Rotation. Then we need
+    // to slide LoV up by (NumElts - Rotation).
+    unsigned InvRotate = NumElts - Rotation;
+
+    SDValue Res = DAG.getUNDEF(ContainerVT);
+    if (HiV) {
+      // If we are doing a SLIDEDOWN+SLIDEUP, reduce the VL for the SLIDEDOWN.
+      // FIXME: If we are only doing a SLIDEDOWN, don't reduce the VL as it
+      // causes multiple vsetvlis in some test cases such as lowering
+      // reduce.mul
+      SDValue DownVL = VL;
+      if (LoV)
+        DownVL = DAG.getConstant(InvRotate, DL, XLenVT);
+      Res =
+          DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, Res, HiV,
+                      DAG.getConstant(Rotation, DL, XLenVT), TrueMask, DownVL);
+    }
+    if (LoV)
+      Res = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Res, LoV,
+                        DAG.getConstant(InvRotate, DL, XLenVT), TrueMask, VL);
+
+    return convertFromScalableVector(VT, Res, DAG, Subtarget);
   }
 
   // Detect an interleave shuffle and lower to
@@ -2576,18 +2691,17 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
 
     // Freeze V2 since we use it twice and we need to be sure that the add and
     // multiply see the same value.
-    V2 = DAG.getNode(ISD::FREEZE, DL, IntHalfVT, V2);
+    V2 = DAG.getFreeze(V2);
 
     // Recreate TrueMask using the widened type's element count.
-    MVT MaskVT =
-        MVT::getVectorVT(MVT::i1, HalfContainerVT.getVectorElementCount());
-    TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+    TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG);
 
     // Widen V1 and V2 with 0s and add one copy of V2 to V1.
     SDValue Add = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1,
                               V2, TrueMask, VL);
     // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
     SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
+                                     DAG.getUNDEF(IntHalfVT),
                                      DAG.getAllOnesConstant(DL, XLenVT));
     SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT,
                                    V2, Multiplier, TrueMask, VL);
@@ -2691,7 +2805,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   // TODO: This doesn't trigger for i64 vectors on RV32, since there we
   // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
   if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
-    Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
+    Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
+                              Subtarget);
   } else {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
     // If only one index is used, we can use a "splat" vrgather.
@@ -2699,16 +2814,16 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     // that's beneficial.
     if (LHSIndexCounts.size() == 1) {
       int SplatIndex = LHSIndexCounts.begin()->getFirst();
-      Gather =
-          DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
-                      DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+                           DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask,
+                           DAG.getUNDEF(ContainerVT), VL);
     } else {
       SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
       LHSIndices =
           convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
 
       Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                           TrueMask, VL);
+                           TrueMask, DAG.getUNDEF(ContainerVT), VL);
     }
   }
 
@@ -2716,45 +2831,46 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   // additional vrgather.
   if (!V2.isUndef()) {
     V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+
+    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+    SelectMask =
+        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
     // If only one index is used, we can use a "splat" vrgather.
     // TODO: We can splat the most-common index and fix-up any stragglers, if
     // that's beneficial.
     if (RHSIndexCounts.size() == 1) {
       int SplatIndex = RHSIndexCounts.begin()->getFirst();
-      V2 = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                       DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
                           DAG.getConstant(SplatIndex, DL, XLenVT), SelectMask,
+                           Gather, VL);
    } else {
      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
      RHSIndices =
          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-      V2 = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, TrueMask,
-                       VL);
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices,
+                           SelectMask, Gather, VL);
    }
-
-    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-    SelectMask =
-        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
-    Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
-                         Gather, VL);
   }
 
   return convertFromScalableVector(VT, Gather, DAG, Subtarget);
 }
 
-static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
-                                     SDLoc DL, SelectionDAG &DAG,
-                                     const RISCVSubtarget &Subtarget) {
-  if (VT.isScalableVector())
-    return DAG.getFPExtendOrRound(Op, DL, VT);
-  assert(VT.isFixedLengthVector() &&
-         "Unexpected value type for RVV FP extend/round lowering");
-  SDValue Mask, VL;
-  std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-  unsigned RVVOpc = ContainerVT.bitsGT(Op.getSimpleValueType())
-                        ? RISCVISD::FP_EXTEND_VL
-                        : RISCVISD::FP_ROUND_VL;
-  return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
+bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+  // Support splats for any type. These should type legalize well.
+  if (ShuffleVectorSDNode::isSplatMask(M.data(), VT))
+    return true;
+
+  // Only support legal VTs for other shuffles for now.
+  if (!isTypeLegal(VT))
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+
+  bool SwapSources;
+  int LoSrc, HiSrc;
+  return (isElementRotate(LoSrc, HiSrc, M) > 0) ||
+         isInterleaveShuffle(M, SVT, SwapSources, Subtarget);
 }
 
 // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
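Aside (not part of the patch): the slide-based lowering of a rotation can be sanity-checked with a scalar simulation. HiV slides down by Rotation to fill the low lanes; LoV slides up by NumElts - Rotation to fill the high lanes:

#include <cassert>
#include <vector>

static std::vector<int> rotateViaSlides(const std::vector<int> &LoV,
                                        const std::vector<int> &HiV, int Rot) {
  int N = static_cast<int>(LoV.size()), Inv = N - Rot;
  std::vector<int> Res(N);
  for (int I = 0; I < Inv; ++I)
    Res[I] = HiV[I + Rot]; // vslidedown.vx Res, HiV, Rot
  for (int I = Inv; I < N; ++I)
    Res[I] = LoV[I - Inv]; // vslideup.vx Res, LoV, N - Rot
  return Res;
}

int main() {
  // The mask [11,12,13,14,15,0,1,2] over (V1:V2) is a rotate right by 3,
  // with HiSrc = V2 and LoSrc = V1, per isElementRotate's convention.
  std::vector<int> V1 = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> V2 = {8, 9, 10, 11, 12, 13, 14, 15};
  std::vector<int> Want = {11, 12, 13, 14, 15, 0, 1, 2};
  assert(rotateViaSlides(V1, V2, 3) == Want);
}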
@@ -2868,13 +2984,39 @@ SDValue RISCVTargetLowering::expandUnalignedRVVStore(SDValue Op,
                       Store->getMemOperand()->getFlags());
 }
 
-SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  switch (Op.getOpcode()) {
-  default:
-    report_fatal_error("unimplemented operand");
-  case ISD::GlobalAddress:
-    return lowerGlobalAddress(Op, DAG);
+static SDValue lowerConstant(SDValue Op, SelectionDAG &DAG,
+                             const RISCVSubtarget &Subtarget) {
+  assert(Op.getValueType() == MVT::i64 && "Unexpected VT");
+
+  int64_t Imm = cast<ConstantSDNode>(Op)->getSExtValue();
+
+  // All simm32 constants should be handled by isel.
+  // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making
+  // this check redundant, but small immediates are common so this check
+  // should have better compile time.
+  if (isInt<32>(Imm))
+    return Op;
+
+  // We only need to cost the immediate if constant pool lowering is enabled.
+  if (!Subtarget.useConstantPoolForLargeInts())
+    return Op;
+
+  RISCVMatInt::InstSeq Seq =
+      RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits());
+  if (Seq.size() <= Subtarget.getMaxBuildIntsCost())
+    return Op;
+
+  // Expand to a constant pool using the default expansion code.
+  return SDValue();
+}
+
+SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    report_fatal_error("unimplemented operand");
+  case ISD::GlobalAddress:
+    return lowerGlobalAddress(Op, DAG);
   case ISD::BlockAddress:
     return lowerBlockAddress(Op, DAG);
   case ISD::ConstantPool:
@@ -2883,6 +3025,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerJumpTable(Op, DAG);
   case ISD::GlobalTLSAddress:
     return lowerGlobalTLSAddress(Op, DAG);
+  case ISD::Constant:
+    return lowerConstant(Op, DAG, Subtarget);
   case ISD::SELECT:
     return lowerSELECT(Op, DAG);
   case ISD::BRCOND:
@@ -2905,6 +3049,30 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     SDValue Op0 = Op.getOperand(0);
     EVT Op0VT = Op0.getValueType();
     MVT XLenVT = Subtarget.getXLenVT();
+    if (VT == MVT::f16 && Op0VT == MVT::i16 && Subtarget.hasStdExtZfh()) {
+      SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Op0);
+      SDValue FPConv = DAG.getNode(RISCVISD::FMV_H_X, DL, MVT::f16, NewOp0);
+      return FPConv;
+    }
+    if (VT == MVT::f32 && Op0VT == MVT::i32 && Subtarget.is64Bit() &&
+        Subtarget.hasStdExtF()) {
+      SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+      SDValue FPConv =
+          DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
+      return FPConv;
+    }
+
+    // Consider other scalar<->scalar casts as legal if the types are legal.
+    // Otherwise expand them.
+    if (!VT.isVector() && !Op0VT.isVector()) {
+      if (isTypeLegal(VT) && isTypeLegal(Op0VT))
+        return Op;
+      return SDValue();
+    }
+
+    assert(!VT.isScalableVector() && !Op0VT.isScalableVector() &&
+           "Unexpected types");
+
     if (VT.isFixedLengthVector()) {
       // We can handle fixed length vector bitcasts with a simple replacement
       // in isel.
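Aside (not part of the patch): a toy model, with illustrative thresholds, of the lowerConstant policy introduced above: keep the immediate when it is simm32 or cheap to materialize, otherwise return nothing so the default expansion emits a constant-pool load.

#include <cstdint>
#include <cstdio>

static bool keepAsImmediate(int64_t Imm, unsigned BuildCost,
                            unsigned MaxBuildIntsCost) {
  if (Imm >= INT32_MIN && Imm <= INT32_MAX)
    return true; // all simm32 values are matched directly by isel
  // Cheap LUI/ADDI/SLLI-style sequences stay inline; long ones go to memory.
  return BuildCost <= MaxBuildIntsCost;
}

int main() {
  std::printf("%d\n", keepAsImmediate(0x12345678, 2, 2));         // 1: simm32
  std::printf("%d\n", keepAsImmediate(0x123456789ABCDEF0, 5, 2)); // 0: use pool
}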
@@ -2934,18 +3102,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
                          DAG.getConstant(0, DL, XLenVT));
     }
-    if (VT == MVT::f16 && Op0VT == MVT::i16 && Subtarget.hasStdExtZfh()) {
-      SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Op0);
-      SDValue FPConv = DAG.getNode(RISCVISD::FMV_H_X, DL, MVT::f16, NewOp0);
-      return FPConv;
-    }
-    if (VT == MVT::f32 && Op0VT == MVT::i32 && Subtarget.is64Bit() &&
-        Subtarget.hasStdExtF()) {
-      SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
-      SDValue FPConv =
-          DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
-      return FPConv;
-    }
     return SDValue();
   }
   case ISD::INTRINSIC_WO_CHAIN:
@@ -3002,55 +3158,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     }
     return DAG.getNode(Opc, DL, VT, Op0, Op1, ShAmt);
   }
-  case ISD::TRUNCATE: {
-    SDLoc DL(Op);
-    MVT VT = Op.getSimpleValueType();
+  case ISD::TRUNCATE:
     // Only custom-lower vector truncates
-    if (!VT.isVector())
+    if (!Op.getSimpleValueType().isVector())
       return Op;
-
-    // Truncates to mask types are handled differently
-    if (VT.getVectorElementType() == MVT::i1)
-      return lowerVectorMaskTrunc(Op, DAG);
-
-    // RVV only has truncates which operate from SEW*2->SEW, so lower arbitrary
-    // truncates as a series of "RISCVISD::TRUNCATE_VECTOR_VL" nodes which
-    // truncate by one power of two at a time.
-    MVT DstEltVT = VT.getVectorElementType();
-
-    SDValue Src = Op.getOperand(0);
-    MVT SrcVT = Src.getSimpleValueType();
-    MVT SrcEltVT = SrcVT.getVectorElementType();
-
-    assert(DstEltVT.bitsLT(SrcEltVT) &&
-           isPowerOf2_64(DstEltVT.getSizeInBits()) &&
-           isPowerOf2_64(SrcEltVT.getSizeInBits()) &&
-           "Unexpected vector truncate lowering");
-
-    MVT ContainerVT = SrcVT;
-    if (SrcVT.isFixedLengthVector()) {
-      ContainerVT = getContainerForFixedLengthVector(SrcVT);
-      Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
-    }
-
-    SDValue Result = Src;
-    SDValue Mask, VL;
-    std::tie(Mask, VL) =
-        getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget);
-    LLVMContext &Context = *DAG.getContext();
-    const ElementCount Count = ContainerVT.getVectorElementCount();
-    do {
-      SrcEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
-      EVT ResultVT = EVT::getVectorVT(Context, SrcEltVT, Count);
-      Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, ResultVT, Result,
-                           Mask, VL);
-    } while (SrcEltVT != DstEltVT);
-
-    if (SrcVT.isFixedLengthVector())
-      Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
-
-    return Result;
-  }
+    return lowerVectorTruncLike(Op, DAG);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:
     if (Op.getOperand(0).getValueType().isVector() &&
@@ -3076,28 +3188,26 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // minimum size. e.g. <vscale x 2 x i64>. VLENB is in bytes so we calculate
     // vscale as VLENB / 8.
     static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!");
-    if (Subtarget.getMinVLen() < RISCV::RVVBitsPerBlock)
+    if (Subtarget.getRealMinVLen() < RISCV::RVVBitsPerBlock)
       report_fatal_error("Support for VLEN==32 is incomplete.");
-    if (isa<ConstantSDNode>(Op.getOperand(0))) {
-      // We assume VLENB is a multiple of 8. We manually choose the best shift
-      // here because SimplifyDemandedBits isn't always able to simplify it.
-      uint64_t Val = Op.getConstantOperandVal(0);
-      if (isPowerOf2_64(Val)) {
-        uint64_t Log2 = Log2_64(Val);
-        if (Log2 < 3)
-          return DAG.getNode(ISD::SRL, DL, VT, VLENB,
-                             DAG.getConstant(3 - Log2, DL, VT));
-        if (Log2 > 3)
-          return DAG.getNode(ISD::SHL, DL, VT, VLENB,
-                             DAG.getConstant(Log2 - 3, DL, VT));
-        return VLENB;
-      }
-      // If the multiplier is a multiple of 8, scale it down to avoid needing
-      // to shift the VLENB value.
-      if ((Val % 8) == 0)
-        return DAG.getNode(ISD::MUL, DL, VT, VLENB,
-                           DAG.getConstant(Val / 8, DL, VT));
-    }
+    // We assume VLENB is a multiple of 8. We manually choose the best shift
+    // here because SimplifyDemandedBits isn't always able to simplify it.
+    uint64_t Val = Op.getConstantOperandVal(0);
+    if (isPowerOf2_64(Val)) {
+      uint64_t Log2 = Log2_64(Val);
+      if (Log2 < 3)
+        return DAG.getNode(ISD::SRL, DL, VT, VLENB,
+                           DAG.getConstant(3 - Log2, DL, VT));
+      if (Log2 > 3)
+        return DAG.getNode(ISD::SHL, DL, VT, VLENB,
+                           DAG.getConstant(Log2 - 3, DL, VT));
+      return VLENB;
+    }
+    // If the multiplier is a multiple of 8, scale it down to avoid needing
+    // to shift the VLENB value.
+    if ((Val % 8) == 0)
+      return DAG.getNode(ISD::MUL, DL, VT, VLENB,
+                         DAG.getConstant(Val / 8, DL, VT));
 
     SDValue VScale = DAG.getNode(ISD::SRL, DL, VT, VLENB,
                                  DAG.getConstant(3, DL, VT));
@@ -3117,88 +3227,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     }
     return SDValue();
   }
-  case ISD::FP_EXTEND: {
-    // RVV can only do fp_extend to types double the size as the source. We
-    // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going
-    // via f32.
-    SDLoc DL(Op);
-    MVT VT = Op.getSimpleValueType();
-    SDValue Src = Op.getOperand(0);
-    MVT SrcVT = Src.getSimpleValueType();
-
-    // Prepare any fixed-length vector operands.
-    MVT ContainerVT = VT;
-    if (SrcVT.isFixedLengthVector()) {
-      ContainerVT = getContainerForFixedLengthVector(VT);
-      MVT SrcContainerVT =
-          ContainerVT.changeVectorElementType(SrcVT.getVectorElementType());
-      Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
-    }
-
-    if (!VT.isVector() || VT.getVectorElementType() != MVT::f64 ||
-        SrcVT.getVectorElementType() != MVT::f16) {
-      // For scalable vectors, we only need to close the gap between
-      // vXf16->vXf64.
-      if (!VT.isFixedLengthVector())
-        return Op;
-      // For fixed-length vectors, lower the FP_EXTEND to a custom "VL" version.
-      Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
-      return convertFromScalableVector(VT, Src, DAG, Subtarget);
-    }
-
-    MVT InterVT = VT.changeVectorElementType(MVT::f32);
-    MVT InterContainerVT = ContainerVT.changeVectorElementType(MVT::f32);
-    SDValue IntermediateExtend = getRVVFPExtendOrRound(
-        Src, InterVT, InterContainerVT, DL, DAG, Subtarget);
-
-    SDValue Extend = getRVVFPExtendOrRound(IntermediateExtend, VT, ContainerVT,
-                                           DL, DAG, Subtarget);
-    if (VT.isFixedLengthVector())
-      return convertFromScalableVector(VT, Extend, DAG, Subtarget);
-    return Extend;
-  }
-  case ISD::FP_ROUND: {
-    // RVV can only do fp_round to types half the size as the source. We
-    // custom-lower f64->f16 rounds via RVV's round-to-odd float
-    // conversion instruction.
-    SDLoc DL(Op);
-    MVT VT = Op.getSimpleValueType();
-    SDValue Src = Op.getOperand(0);
-    MVT SrcVT = Src.getSimpleValueType();
-
-    // Prepare any fixed-length vector operands.
-    MVT ContainerVT = VT;
-    if (VT.isFixedLengthVector()) {
-      MVT SrcContainerVT = getContainerForFixedLengthVector(SrcVT);
-      ContainerVT =
-          SrcContainerVT.changeVectorElementType(VT.getVectorElementType());
-      Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
-    }
-
-    if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
-        SrcVT.getVectorElementType() != MVT::f64) {
-      // For scalable vectors, we only need to close the gap between
-      // vXf64<->vXf16.
-      if (!VT.isFixedLengthVector())
-        return Op;
-      // For fixed-length vectors, lower the FP_ROUND to a custom "VL" version.
-      Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
-      return convertFromScalableVector(VT, Src, DAG, Subtarget);
-    }
-
-    SDValue Mask, VL;
-    std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-
-    MVT InterVT = ContainerVT.changeVectorElementType(MVT::f32);
-    SDValue IntermediateRound =
-        DAG.getNode(RISCVISD::VFNCVT_ROD_VL, DL, InterVT, Src, Mask, VL);
-    SDValue Round = getRVVFPExtendOrRound(IntermediateRound, VT, ContainerVT,
-                                          DL, DAG, Subtarget);
-
-    if (VT.isFixedLengthVector())
-      return convertFromScalableVector(VT, Round, DAG, Subtarget);
-    return Round;
-  }
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND:
+    if (!Op.getValueType().isVector())
+      return Op;
+    return lowerVectorFPExtendOrRoundLike(Op, DAG);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::SINT_TO_FP:
@@ -3221,10 +3254,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     bool IsInt2FP = SrcEltVT.isInteger();
     // Widening conversions
-    if (EltSize > SrcEltSize && (EltSize / SrcEltSize >= 4)) {
+    if (EltSize > (2 * SrcEltSize)) {
       if (IsInt2FP) {
         // Do a regular integer sign/zero extension then convert to float.
-        MVT IVecVT = MVT::getVectorVT(MVT::getIntegerVT(EltVT.getSizeInBits()),
+        MVT IVecVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize),
                                       VT.getVectorElementCount());
         unsigned ExtOpcode = Op.getOpcode() == ISD::UINT_TO_FP
                                  ? ISD::ZERO_EXTEND
@@ -3242,7 +3275,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     }
 
     // Narrowing conversions
-    if (SrcEltSize > EltSize && (SrcEltSize / EltSize >= 4)) {
+    if (SrcEltSize > (2 * EltSize)) {
      if (IsInt2FP) {
        // One narrowing int_to_fp, then an fp_round.
        assert(EltVT == MVT::f16 && "Unexpected [US]_TO_FP lowering");
@@ -3253,9 +3286,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       // FP2Int
       // One narrowing fp_to_int, then truncate the integer. If the float isn't
       // representable by the integer, the result is poison.
- MVT IVecVT = - MVT::getVectorVT(MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2), - VT.getVectorElementCount()); + MVT IVecVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize / 2), + VT.getVectorElementCount()); SDValue FP2Int = DAG.getNode(Op.getOpcode(), DL, IVecVT, Src); return DAG.getNode(ISD::TRUNCATE, DL, VT, FP2Int); } @@ -3309,6 +3341,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FCEIL: case ISD::FFLOOR: return lowerFTRUNC_FCEIL_FFLOOR(Op, DAG); + case ISD::FROUND: + return lowerFROUND(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_SMAX: @@ -3350,12 +3384,14 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSTEP_VECTOR(Op, DAG); case ISD::VECTOR_REVERSE: return lowerVECTOR_REVERSE(Op, DAG); + case ISD::VECTOR_SPLICE: + return lowerVECTOR_SPLICE(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::SPLAT_VECTOR: if (Op.getValueType().getVectorElementType() == MVT::i1) return lowerVectorMaskSplat(Op, DAG); - return lowerSPLAT_VECTOR(Op, DAG, Subtarget); + return SDValue(); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG, Subtarget); case ISD::CONCAT_VECTORS: { @@ -3455,7 +3491,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FSQRT: return lowerToScalableOp(Op, DAG, RISCVISD::FSQRT_VL); case ISD::FMA: - return lowerToScalableOp(Op, DAG, RISCVISD::FMA_VL); + return lowerToScalableOp(Op, DAG, RISCVISD::VFMADD_VL); case ISD::SMIN: return lowerToScalableOp(Op, DAG, RISCVISD::SMIN_VL); case ISD::SMAX: @@ -3487,6 +3523,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: return lowerSET_ROUNDING(Op, DAG); + case ISD::EH_DWARF_CFA: + return lowerEH_DWARF_CFA(Op, DAG); case ISD::VP_SELECT: return lowerVPOp(Op, DAG, RISCVISD::VSELECT_VL); case ISD::VP_MERGE: @@ -3525,6 +3563,35 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerVPOp(Op, DAG, RISCVISD::FMUL_VL); case ISD::VP_FDIV: return lowerVPOp(Op, DAG, RISCVISD::FDIV_VL); + case ISD::VP_FNEG: + return lowerVPOp(Op, DAG, RISCVISD::FNEG_VL); + case ISD::VP_FMA: + return lowerVPOp(Op, DAG, RISCVISD::VFMADD_VL); + case ISD::VP_SIGN_EXTEND: + case ISD::VP_ZERO_EXTEND: + if (Op.getOperand(0).getSimpleValueType().getVectorElementType() == MVT::i1) + return lowerVPExtMaskOp(Op, DAG); + return lowerVPOp(Op, DAG, + Op.getOpcode() == ISD::VP_SIGN_EXTEND + ? RISCVISD::VSEXT_VL + : RISCVISD::VZEXT_VL); + case ISD::VP_TRUNCATE: + return lowerVectorTruncLike(Op, DAG); + case ISD::VP_FP_EXTEND: + case ISD::VP_FP_ROUND: + return lowerVectorFPExtendOrRoundLike(Op, DAG); + case ISD::VP_FPTOSI: + return lowerVPFPIntConvOp(Op, DAG, RISCVISD::FP_TO_SINT_VL); + case ISD::VP_FPTOUI: + return lowerVPFPIntConvOp(Op, DAG, RISCVISD::FP_TO_UINT_VL); + case ISD::VP_SITOFP: + return lowerVPFPIntConvOp(Op, DAG, RISCVISD::SINT_TO_FP_VL); + case ISD::VP_UITOFP: + return lowerVPFPIntConvOp(Op, DAG, RISCVISD::UINT_TO_FP_VL); + case ISD::VP_SETCC: + if (Op.getOperand(0).getSimpleValueType().getVectorElementType() == MVT::i1) + return lowerVPSetCCMaskOp(Op, DAG); + return lowerVPOp(Op, DAG, RISCVISD::SETCC_VL); } } @@ -3562,12 +3629,21 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, // Use PC-relative addressing to access the symbol. This generates the // pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym)) // %pcrel_lo(auipc)). 
- return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0); + return DAG.getNode(RISCVISD::LLA, DL, Ty, Addr); // Use PC-relative addressing to access the GOT for this symbol, then load // the address from the GOT. This generates the pattern (PseudoLA sym), // which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))). - return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MemOp = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8)); + SDValue Load = + DAG.getMemIntrinsicNode(RISCVISD::LA, DL, DAG.getVTList(Ty, MVT::Other), + {DAG.getEntryNode(), Addr}, Ty, MemOp); + return Load; } switch (getTargetMachine().getCodeModel()) { @@ -3578,15 +3654,15 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)). SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI); SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO); - SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0); - return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0); + SDValue MNHi = DAG.getNode(RISCVISD::HI, DL, Ty, AddrHi); + return DAG.getNode(RISCVISD::ADD_LO, DL, Ty, MNHi, AddrLo); } case CodeModel::Medium: { // Generate a sequence for accessing addresses within any 2GiB range within // the address space. This generates the pattern (PseudoLLA sym), which // expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)). SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); - return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0); + return DAG.getNode(RISCVISD::LLA, DL, Ty, Addr); } } } @@ -3594,23 +3670,12 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT Ty = Op.getValueType(); GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); - int64_t Offset = N->getOffset(); - MVT XLenVT = Subtarget.getXLenVT(); + assert(N->getOffset() == 0 && "unexpected offset in global node"); const GlobalValue *GV = N->getGlobal(); bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); - SDValue Addr = getAddr(N, DAG, IsLocal); - - // In order to maximise the opportunity for common subexpression elimination, - // emit a separate ADD node for the global address offset instead of folding - // it in the global address node. Later peephole optimisations may choose to - // fold it back in when profitable. - if (Offset != 0) - return DAG.getNode(ISD::ADD, DL, Ty, Addr, - DAG.getConstant(Offset, DL, XLenVT)); - return Addr; + return getAddr(N, DAG, IsLocal); } SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op, @@ -3648,8 +3713,15 @@ SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, // the pattern (PseudoLA_TLS_IE sym), which expands to // (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)). 
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); - SDValue Load = - SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MemOp = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8)); + SDValue Load = DAG.getMemIntrinsicNode( + RISCVISD::LA_TLS_IE, DL, DAG.getVTList(Ty, MVT::Other), + {DAG.getEntryNode(), Addr}, Ty, MemOp); // Add the thread pointer. SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); @@ -3667,12 +3739,11 @@ SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, SDValue AddrLo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO); - SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0); + SDValue MNHi = DAG.getNode(RISCVISD::HI, DL, Ty, AddrHi); SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); - SDValue MNAdd = SDValue( - DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd), - 0); - return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0); + SDValue MNAdd = + DAG.getNode(RISCVISD::ADD_TPREL, DL, Ty, MNHi, TPReg, AddrAdd); + return DAG.getNode(RISCVISD::ADD_LO, DL, Ty, MNAdd, AddrLo); } SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, @@ -3686,8 +3757,7 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // This generates the pattern (PseudoLA_TLS_GD sym), which expands to // (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)). SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); - SDValue Load = - SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0); + SDValue Load = DAG.getNode(RISCVISD::LA_TLS_GD, DL, Ty, Addr); // Prepare argument list to generate call. ArgListTy Args; @@ -3710,10 +3780,8 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT Ty = Op.getValueType(); GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); - int64_t Offset = N->getOffset(); - MVT XLenVT = Subtarget.getXLenVT(); + assert(N->getOffset() == 0 && "unexpected offset in global node"); TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal()); @@ -3735,13 +3803,6 @@ SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op, break; } - // In order to maximise the opportunity for common subexpression elimination, - // emit a separate ADD node for the global address offset instead of folding - // it in the global address node. Later peephole optimisations may choose to - // fold it back in when profitable. 
- if (Offset != 0) - return DAG.getNode(ISD::ADD, DL, Ty, Addr, - DAG.getConstant(Offset, DL, XLenVT)); return Addr; } @@ -3911,7 +3972,7 @@ SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op, // if Shamt-XLEN < 0: // Shamt < XLEN // Lo = Lo << Shamt - // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt)) + // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 ^ Shamt)) // else: // Lo = 0 // Hi = Lo << (Shamt-XLEN) @@ -3921,7 +3982,7 @@ SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op, SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT); SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT); SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen); - SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt); + SDValue XLenMinus1Shamt = DAG.getNode(ISD::XOR, DL, VT, Shamt, XLenMinus1); SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One); @@ -3950,7 +4011,7 @@ SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, // SRA expansion: // if Shamt-XLEN < 0: // Shamt < XLEN - // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (ShAmt ^ XLEN-1)) // Hi = Hi >>s Shamt // else: // Lo = Hi >>s (Shamt-XLEN); // // SRL expansion: // if Shamt-XLEN < 0: // Shamt < XLEN - // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (ShAmt ^ XLEN-1)) // Hi = Hi >>u Shamt // else: // Lo = Hi >>u (Shamt-XLEN); @@ -3971,7 +4032,7 @@ SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT); SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT); SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen); - SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt); + SDValue XLenMinus1Shamt = DAG.getNode(ISD::XOR, DL, VT, Shamt, XLenMinus1); SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt); SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One); @@ -4022,7 +4083,7 @@ SDValue RISCVTargetLowering::lowerVectorMaskSplat(SDValue Op, // Custom-lower a SPLAT_VECTOR_PARTS where XLEN<SEW. ... if ((LoC >> 31) == HiC) - return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + Lo, DAG.getRegister(RISCV::X0, MVT::i32)); } // Detect cases where Hi is (SRA Lo, 31) which means Hi is Lo sign extended. if (Hi.getOpcode() == ISD::SRA && Hi.getOperand(0) == Lo && isa<ConstantSDNode>(Hi.getOperand(1)) && Hi.getConstantOperandVal(1) == 31) - return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), Lo, + DAG.getRegister(RISCV::X0, MVT::i32)); // Fall back to use a stack store and stride x0 vector load. Use X0 as VL. 
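// Illustrative sketch, not part of the vendored diff: the shift-parts hunks
// above replace (XLEN-1) - Shamt with Shamt ^ (XLEN-1). For 0 <= Shamt < XLEN
// with XLEN a power of two, the two expressions are equal, because XLEN-1 is
// an all-ones mask over the shift-amount bits. A standalone check:
#include <cassert>
int main() {
  const unsigned XLen = 32; // the same identity holds for XLen == 64
  for (unsigned Shamt = 0; Shamt < XLen; ++Shamt)
    assert((XLen - 1) - Shamt == (Shamt ^ (XLen - 1)));
  return 0;
}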
- return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, Lo, Hi, - DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, MVT::i64)); + return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, + DAG.getUNDEF(VecVT), Lo, Hi, + DAG.getRegister(RISCV::X0, MVT::i32)); } // Custom-lower extensions from mask vectors by using a vselect either with 1 @@ -4078,27 +4143,9 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, assert(Src.getValueType().isVector() && Src.getValueType().getVectorElementType() == MVT::i1); - MVT XLenVT = Subtarget.getXLenVT(); - SDValue SplatZero = DAG.getConstant(0, DL, XLenVT); - SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, XLenVT); - if (VecVT.isScalableVector()) { - // Be careful not to introduce illegal scalar types at this stage, and be - // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is - // illegal and must be expanded. Since we know that the constants are - // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly. - bool IsRV32E64 = - !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64; - - if (!IsRV32E64) { - SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero); - SplatTrueVal = DAG.getSplatVector(VecVT, DL, SplatTrueVal); - } else { - SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero); - SplatTrueVal = - DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatTrueVal); - } - + SDValue SplatZero = DAG.getConstant(0, DL, VecVT); + SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, VecVT); return DAG.getNode(ISD::VSELECT, DL, VecVT, Src, SplatTrueVal, SplatZero); } @@ -4111,9 +4158,14 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero, VL); - SplatTrueVal = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatTrueVal, VL); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue SplatZero = DAG.getConstant(0, DL, XLenVT); + SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, XLenVT); + + SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatZero, VL); + SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatTrueVal, VL); SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, SplatTrueVal, SplatZero, VL); @@ -4151,8 +4203,9 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorExtendToRVV( // Custom-lower truncations from vectors to mask vectors by using a mask and a // setcc operation: // (vXi1 = trunc vXiN vec) -> (vXi1 = setcc (and vec, 1), 0, ne) -SDValue RISCVTargetLowering::lowerVectorMaskTrunc(SDValue Op, - SelectionDAG &DAG) const { +SDValue RISCVTargetLowering::lowerVectorMaskTruncLike(SDValue Op, + SelectionDAG &DAG) const { + bool IsVPTrunc = Op.getOpcode() == ISD::VP_TRUNCATE; SDLoc DL(Op); EVT MaskVT = Op.getValueType(); // Only expect to custom-lower truncations to mask types @@ -4160,34 +4213,176 @@ SDValue RISCVTargetLowering::lowerVectorMaskTrunc(SDValue Op, "Unexpected type for vector mask lowering"); SDValue Src = Op.getOperand(0); MVT VecVT = Src.getSimpleValueType(); - + SDValue Mask, VL; + if (IsVPTrunc) { + Mask = Op.getOperand(1); + VL = Op.getOperand(2); + } // If this is a fixed vector, we need to convert it to a scalable vector. 
MVT ContainerVT = VecVT; + if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget); + if (IsVPTrunc) { + MVT MaskContainerVT = + getContainerForFixedLengthVector(Mask.getSimpleValueType()); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + } + + if (!IsVPTrunc) { + std::tie(Mask, VL) = + getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); } SDValue SplatOne = DAG.getConstant(1, DL, Subtarget.getXLenVT()); SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT()); - SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatOne); - SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero); - - if (VecVT.isScalableVector()) { - SDValue Trunc = DAG.getNode(ISD::AND, DL, VecVT, Src, SplatOne); - return DAG.getSetCC(DL, MaskVT, Trunc, SplatZero, ISD::SETNE); - } - - SDValue Mask, VL; - std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); + SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatOne, VL); + SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatZero, VL); MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); SDValue Trunc = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Src, SplatOne, Mask, VL); Trunc = DAG.getNode(RISCVISD::SETCC_VL, DL, MaskContainerVT, Trunc, SplatZero, DAG.getCondCode(ISD::SETNE), Mask, VL); - return convertFromScalableVector(MaskVT, Trunc, DAG, Subtarget); + if (MaskVT.isFixedLengthVector()) + Trunc = convertFromScalableVector(MaskVT, Trunc, DAG, Subtarget); + return Trunc; +} + +SDValue RISCVTargetLowering::lowerVectorTruncLike(SDValue Op, + SelectionDAG &DAG) const { + bool IsVPTrunc = Op.getOpcode() == ISD::VP_TRUNCATE; + SDLoc DL(Op); + + MVT VT = Op.getSimpleValueType(); + // Only custom-lower vector truncates + assert(VT.isVector() && "Unexpected type for vector truncate lowering"); + + // Truncates to mask types are handled differently + if (VT.getVectorElementType() == MVT::i1) + return lowerVectorMaskTruncLike(Op, DAG); + + // RVV only has truncates which operate from SEW*2->SEW, so lower arbitrary + // truncates as a series of "RISCVISD::TRUNCATE_VECTOR_VL" nodes which + // truncate by one power of two at a time. 
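// Illustrative sketch, not part of the vendored diff: as the comment above
// says, each RISCVISD::TRUNCATE_VECTOR_VL step halves the element width, so
// an i64 -> i8 truncate takes three steps (i64 -> i32 -> i16 -> i8). A
// standalone model of that iteration, assuming power-of-two element widths
// exactly as the asserts in the lowering require:
#include <cstdio>
int main() {
  unsigned SrcBits = 64, DstBits = 8; // e.g. truncating nxv2i64 to nxv2i8
  do {
    SrcBits /= 2; // one TRUNCATE_VECTOR_VL node per halving
    std::printf("TRUNCATE_VECTOR_VL to i%u\n", SrcBits);
  } while (SrcBits != DstBits);
  return 0;
}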
+ MVT DstEltVT = VT.getVectorElementType(); + + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + MVT SrcEltVT = SrcVT.getVectorElementType(); + + assert(DstEltVT.bitsLT(SrcEltVT) && isPowerOf2_64(DstEltVT.getSizeInBits()) && + isPowerOf2_64(SrcEltVT.getSizeInBits()) && + "Unexpected vector truncate lowering"); + + MVT ContainerVT = SrcVT; + SDValue Mask, VL; + if (IsVPTrunc) { + Mask = Op.getOperand(1); + VL = Op.getOperand(2); + } + if (SrcVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(SrcVT); + Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget); + if (IsVPTrunc) { + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + } + + SDValue Result = Src; + if (!IsVPTrunc) { + std::tie(Mask, VL) = + getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget); + } + + LLVMContext &Context = *DAG.getContext(); + const ElementCount Count = ContainerVT.getVectorElementCount(); + do { + SrcEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2); + EVT ResultVT = EVT::getVectorVT(Context, SrcEltVT, Count); + Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, ResultVT, Result, + Mask, VL); + } while (SrcEltVT != DstEltVT); + + if (SrcVT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + + return Result; +} + +SDValue +RISCVTargetLowering::lowerVectorFPExtendOrRoundLike(SDValue Op, + SelectionDAG &DAG) const { + bool IsVP = + Op.getOpcode() == ISD::VP_FP_ROUND || Op.getOpcode() == ISD::VP_FP_EXTEND; + bool IsExtend = + Op.getOpcode() == ISD::VP_FP_EXTEND || Op.getOpcode() == ISD::FP_EXTEND; + // RVV can only do truncate fp to types half the size as the source. We + // custom-lower f64->f16 rounds via RVV's round-to-odd float + // conversion instruction. + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + assert(VT.isVector() && "Unexpected type for vector truncate lowering"); + + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + bool IsDirectExtend = IsExtend && (VT.getVectorElementType() != MVT::f64 || + SrcVT.getVectorElementType() != MVT::f16); + bool IsDirectTrunc = !IsExtend && (VT.getVectorElementType() != MVT::f16 || + SrcVT.getVectorElementType() != MVT::f64); + + bool IsDirectConv = IsDirectExtend || IsDirectTrunc; + + // Prepare any fixed-length vector operands. + MVT ContainerVT = VT; + SDValue Mask, VL; + if (IsVP) { + Mask = Op.getOperand(1); + VL = Op.getOperand(2); + } + if (VT.isFixedLengthVector()) { + MVT SrcContainerVT = getContainerForFixedLengthVector(SrcVT); + ContainerVT = + SrcContainerVT.changeVectorElementType(VT.getVectorElementType()); + Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget); + if (IsVP) { + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + } + + if (!IsVP) + std::tie(Mask, VL) = + getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget); + + unsigned ConvOpc = IsExtend ? RISCVISD::FP_EXTEND_VL : RISCVISD::FP_ROUND_VL; + + if (IsDirectConv) { + Src = DAG.getNode(ConvOpc, DL, ContainerVT, Src, Mask, VL); + if (VT.isFixedLengthVector()) + Src = convertFromScalableVector(VT, Src, DAG, Subtarget); + return Src; + } + + unsigned InterConvOpc = + IsExtend ? 
RISCVISD::FP_EXTEND_VL : RISCVISD::VFNCVT_ROD_VL; + + MVT InterVT = ContainerVT.changeVectorElementType(MVT::f32); + SDValue IntermediateConv = + DAG.getNode(InterConvOpc, DL, InterVT, Src, Mask, VL); + SDValue Result = + DAG.getNode(ConvOpc, DL, ContainerVT, IntermediateConv, Mask, VL); + if (VT.isFixedLengthVector()) + return convertFromScalableVector(VT, Result, DAG, Subtarget); + return Result; } // Custom-legalize INSERT_VECTOR_ELT so that the value is inserted into the @@ -4268,13 +4463,15 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SDValue InsertI64VL = DAG.getConstant(2, DL, XLenVT); // Note: We can't pass a UNDEF to the first VSLIDE1UP_VL since an untied // undef doesn't obey the earlyclobber constraint. Just splat a zero value. - ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, Zero, - InsertI64VL); + ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, + DAG.getUNDEF(I32ContainerVT), Zero, InsertI64VL); // First slide in the hi value, then the lo in underneath it. - ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec, - ValHi, I32Mask, InsertI64VL); - ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec, - ValLo, I32Mask, InsertI64VL); + ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, + DAG.getUNDEF(I32ContainerVT), ValInVec, ValHi, + I32Mask, InsertI64VL); + ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, + DAG.getUNDEF(I32ContainerVT), ValInVec, ValLo, + I32Mask, InsertI64VL); // Bitcast back to the right container type. ValInVec = DAG.getBitcast(ContainerVT, ValInVec); } @@ -4310,7 +4507,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, unsigned WidenVecLen; SDValue ExtractElementIdx; SDValue ExtractBitIdx; - unsigned MaxEEW = Subtarget.getMaxELENForFixedLengthVectors(); + unsigned MaxEEW = Subtarget.getELEN(); MVT LargestEltVT = MVT::getIntegerVT( std::min(MaxEEW, unsigned(XLenVT.getSizeInBits()))); if (NumElts <= LargestEltVT.getSizeInBits()) { @@ -4360,8 +4557,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (!isNullConstant(Idx)) { // Use a VL of 1 to avoid processing more elements than we need. SDValue VL = DAG.getConstant(1, DL, XLenVT); - MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG); Vec = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Idx, Mask, VL); } @@ -4378,8 +4574,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // Some RVV intrinsics may claim that they want an integer operand to be // promoted or expanded. 
-static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { +static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { assert((Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) && "Unexpected opcode"); @@ -4393,10 +4589,10 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo); - if (!II || !II->hasSplatOperand()) + if (!II || !II->hasScalarOperand()) return SDValue(); - unsigned SplatOp = II->SplatOperand + 1 + HasChain; + unsigned SplatOp = II->ScalarOperand + 1 + HasChain; assert(SplatOp < Op.getNumOperands()); SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end()); @@ -4426,28 +4622,141 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, // that a widening operation never uses SEW=64. // NOTE: If this fails the below assert, we can probably just find the // element count from any operand or result and use it to construct the VT. - assert(II->SplatOperand > 0 && "Unexpected splat operand!"); + assert(II->ScalarOperand > 0 && "Unexpected splat operand!"); MVT VT = Op.getOperand(SplatOp - 1).getSimpleValueType(); // The more complex case is when the scalar is larger than XLenVT. assert(XLenVT == MVT::i32 && OpVT == MVT::i64 && VT.getVectorElementType() == MVT::i64 && "Unexpected VTs!"); - // If this is a sign-extended 32-bit constant, we can truncate it and rely - // on the instruction to sign-extend since SEW>XLEN. - if (auto *CVal = dyn_cast<ConstantSDNode>(ScalarOp)) { - if (isInt<32>(CVal->getSExtValue())) { - ScalarOp = DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32); - return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands); + // If this is a sign-extended 32-bit value, we can truncate it and rely on the + // instruction to sign-extend since SEW>XLEN. + if (DAG.ComputeNumSignBits(ScalarOp) > 32) { + ScalarOp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ScalarOp); + return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands); + } + + switch (IntNo) { + case Intrinsic::riscv_vslide1up: + case Intrinsic::riscv_vslide1down: + case Intrinsic::riscv_vslide1up_mask: + case Intrinsic::riscv_vslide1down_mask: { + // We need to special case these when the scalar is larger than XLen. + unsigned NumOps = Op.getNumOperands(); + bool IsMasked = NumOps == 7; + + // Convert the vector source to the equivalent nxvXi32 vector. + MVT I32VT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2); + SDValue Vec = DAG.getBitcast(I32VT, Operands[2]); + + SDValue ScalarLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, ScalarOp, + DAG.getConstant(0, DL, XLenVT)); + SDValue ScalarHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, ScalarOp, + DAG.getConstant(1, DL, XLenVT)); + + // Double the VL since we halved SEW. 
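// Illustrative sketch, not part of the vendored diff: on RV32, an i64 scalar
// for vslide1up/vslide1down is split into two i32 halves (as the
// EXTRACT_ELEMENT nodes above do), the vector is reinterpreted with SEW
// halved, and the VL is doubled. Assuming the usual lo/hi split:
#include <cstdint>
#include <cstdio>
int main() {
  uint64_t Scalar = 0x1122334455667788ULL;
  uint32_t Lo = static_cast<uint32_t>(Scalar);       // EXTRACT_ELEMENT index 0
  uint32_t Hi = static_cast<uint32_t>(Scalar >> 32); // EXTRACT_ELEMENT index 1
  unsigned AVL = 4;         // requested VL counted in i64 elements
  unsigned I32VL = 2 * AVL; // VL after halving SEW to 32 bits
  std::printf("lo=0x%08x hi=0x%08x i32 vl=%u\n", Lo, Hi, I32VL);
  return 0;
}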
+ SDValue AVL = getVLOperand(Op); + SDValue I32VL; + + // Optimize for constant AVL + if (isa<ConstantSDNode>(AVL)) { + unsigned EltSize = VT.getScalarSizeInBits(); + unsigned MinSize = VT.getSizeInBits().getKnownMinValue(); + + unsigned VectorBitsMax = Subtarget.getRealMaxVLen(); + unsigned MaxVLMAX = + RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize); + + unsigned VectorBitsMin = Subtarget.getRealMinVLen(); + unsigned MinVLMAX = + RISCVTargetLowering::computeVLMAX(VectorBitsMin, EltSize, MinSize); + + uint64_t AVLInt = cast<ConstantSDNode>(AVL)->getZExtValue(); + if (AVLInt <= MinVLMAX) { + I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT); + } else if (AVLInt >= 2 * MaxVLMAX) { + // Just set vl to VLMAX in this situation + RISCVII::VLMUL Lmul = RISCVTargetLowering::getLMUL(I32VT); + SDValue LMUL = DAG.getConstant(Lmul, DL, XLenVT); + unsigned Sew = RISCVVType::encodeSEW(I32VT.getScalarSizeInBits()); + SDValue SEW = DAG.getConstant(Sew, DL, XLenVT); + SDValue SETVLMAX = DAG.getTargetConstant( + Intrinsic::riscv_vsetvlimax_opt, DL, MVT::i32); + I32VL = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XLenVT, SETVLMAX, SEW, + LMUL); + } else { + // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl + // is related to the hardware implementation. + // So let the following code handle it. + } + } + if (!I32VL) { + RISCVII::VLMUL Lmul = RISCVTargetLowering::getLMUL(VT); + SDValue LMUL = DAG.getConstant(Lmul, DL, XLenVT); + unsigned Sew = RISCVVType::encodeSEW(VT.getScalarSizeInBits()); + SDValue SEW = DAG.getConstant(Sew, DL, XLenVT); + SDValue SETVL = + DAG.getTargetConstant(Intrinsic::riscv_vsetvli_opt, DL, MVT::i32); + // Use a vsetvli instruction to get the actually-used length, which is + // related to the hardware implementation. + SDValue VL = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XLenVT, SETVL, AVL, + SEW, LMUL); + I32VL = + DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT)); + } + + SDValue I32Mask = getAllOnesMask(I32VT, I32VL, DL, DAG); + + // Shift the two scalar parts in using SEW=32 slide1up/slide1down + // instructions. + SDValue Passthru; + if (IsMasked) + Passthru = DAG.getUNDEF(I32VT); + else + Passthru = DAG.getBitcast(I32VT, Operands[1]); + + if (IntNo == Intrinsic::riscv_vslide1up || + IntNo == Intrinsic::riscv_vslide1up_mask) { + Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Passthru, Vec, + ScalarHi, I32Mask, I32VL); + Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Passthru, Vec, + ScalarLo, I32Mask, I32VL); + } else { + Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Passthru, Vec, + ScalarLo, I32Mask, I32VL); + Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Passthru, Vec, + ScalarHi, I32Mask, I32VL); } + + // Convert back to nxvXi64. + Vec = DAG.getBitcast(VT, Vec); + + if (!IsMasked) + return Vec; + // Apply mask after the operation. + SDValue Mask = Operands[NumOps - 3]; + SDValue MaskedOff = Operands[1]; + // Assume Policy operand is the last operand. + uint64_t Policy = + cast<ConstantSDNode>(Operands[NumOps - 1])->getZExtValue(); + // We don't need to select maskedoff if it's undef. + if (MaskedOff.isUndef()) + return Vec; + // TAMU + if (Policy == RISCVII::TAIL_AGNOSTIC) + return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, + AVL); + // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma. + // It's fine because vmerge does not care about mask policy. + return DAG.getNode(RISCVISD::VP_MERGE_VL, DL, VT, Mask, Vec, MaskedOff, + AVL); + } } // We need to convert the scalar to a splat vector. 
- // FIXME: Can we implicitly truncate the scalar if it is known to - // be sign extended? SDValue VL = getVLOperand(Op); assert(VL.getValueType() == XLenVT); - ScalarOp = splatSplitI64WithVL(DL, VT, ScalarOp, VL, DAG); + ScalarOp = splatSplitI64WithVL(DL, VT, SDValue(), ScalarOp, VL, DAG); return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands); } @@ -4481,7 +4790,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::riscv_zip: case Intrinsic::riscv_unzip: { // Lower to the SHFLI encoding for zip or the UNSHFLI encoding for unzip. - // For i32 the immdiate is 15. For i64 the immediate is 31. + // For i32 the immediate is 15. For i64 the immediate is 31. unsigned Opc = IntNo == Intrinsic::riscv_zip ? RISCVISD::SHFL : RISCVISD::UNSHFL; unsigned BitWidth = Op.getValueSizeInBits(); @@ -4516,10 +4825,11 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1)); case Intrinsic::riscv_vmv_v_x: return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2), - Op.getSimpleValueType(), DL, DAG, Subtarget); + Op.getOperand(3), Op.getSimpleValueType(), DL, DAG, + Subtarget); case Intrinsic::riscv_vfmv_v_f: return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::riscv_vmv_s_x: { SDValue Scalar = Op.getOperand(2); @@ -4533,7 +4843,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // This is an i64 value that lives in two scalar registers. We have to // insert this in a convoluted way. First we build vXi64 splat containing - // the/ two values that we assemble using some bit math. Next we'll use + // the two values that we assemble using some bit math. Next we'll use // vid.v and vmseq to build a mask with bit 0 set. Then we'll use that mask // to merge element 0 from our splat into the source vector. // FIXME: This is probably not the best way to do this, but it is @@ -4550,12 +4860,15 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Vec = Op.getOperand(1); SDValue VL = getVLOperand(Op); - SDValue SplattedVal = splatSplitI64WithVL(DL, VT, Scalar, VL, DAG); - SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, - DAG.getConstant(0, DL, MVT::i32), VL); + SDValue SplattedVal = splatSplitI64WithVL(DL, VT, SDValue(), Scalar, VL, DAG); + if (Op.getOperand(1).isUndef()) + return SplattedVal; + SDValue SplattedIdx = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + DAG.getConstant(0, DL, MVT::i32), VL); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + MVT MaskVT = getMaskTypeFor(VT); + SDValue Mask = getAllOnesMask(VT, VL, DL, DAG); SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VT, Mask, VL); SDValue SelectCond = DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, VID, SplattedIdx, @@ -4563,73 +4876,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal, Vec, VL); } - case Intrinsic::riscv_vslide1up: - case Intrinsic::riscv_vslide1down: - case Intrinsic::riscv_vslide1up_mask: - case Intrinsic::riscv_vslide1down_mask: { - // We need to special case these when the scalar is larger than XLen. - unsigned NumOps = Op.getNumOperands(); - bool IsMasked = NumOps == 7; - unsigned OpOffset = IsMasked ? 
1 : 0; - SDValue Scalar = Op.getOperand(2 + OpOffset); - if (Scalar.getValueType().bitsLE(XLenVT)) - break; - - // Splatting a sign extended constant is fine. - if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar)) - if (isInt<32>(CVal->getSExtValue())) - break; - - MVT VT = Op.getSimpleValueType(); - assert(VT.getVectorElementType() == MVT::i64 && - Scalar.getValueType() == MVT::i64 && "Unexpected VTs"); - - // Convert the vector source to the equivalent nxvXi32 vector. - MVT I32VT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2); - SDValue Vec = DAG.getBitcast(I32VT, Op.getOperand(1 + OpOffset)); - - SDValue ScalarLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, - DAG.getConstant(0, DL, XLenVT)); - SDValue ScalarHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, - DAG.getConstant(1, DL, XLenVT)); - - // Double the VL since we halved SEW. - SDValue VL = getVLOperand(Op); - SDValue I32VL = - DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT)); - - MVT I32MaskVT = MVT::getVectorVT(MVT::i1, I32VT.getVectorElementCount()); - SDValue I32Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, I32MaskVT, VL); - - // Shift the two scalar parts in using SEW=32 slide1up/slide1down - // instructions. - if (IntNo == Intrinsic::riscv_vslide1up || - IntNo == Intrinsic::riscv_vslide1up_mask) { - Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Vec, ScalarHi, - I32Mask, I32VL); - Vec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32VT, Vec, ScalarLo, - I32Mask, I32VL); - } else { - Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Vec, ScalarLo, - I32Mask, I32VL); - Vec = DAG.getNode(RISCVISD::VSLIDE1DOWN_VL, DL, I32VT, Vec, ScalarHi, - I32Mask, I32VL); - } - - // Convert back to nxvXi64. - Vec = DAG.getBitcast(VT, Vec); - - if (!IsMasked) - return Vec; - - // Apply mask after the operation. 
- SDValue Mask = Op.getOperand(NumOps - 3); - SDValue MaskedOff = Op.getOperand(1); - return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, VL); - } } - return lowerVectorIntrinsicSplats(Op, DAG, Subtarget); + return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, @@ -4652,8 +4901,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue PassThru = Op.getOperand(2); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); } @@ -4680,17 +4928,56 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Ops.push_back(Policy); } - SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); + SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + Load->getMemoryVT(), Load->getMemOperand()); + SDValue Chain = Result.getValue(1); + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + return DAG.getMergeValues({Result, Chain}, DL); + } + case Intrinsic::riscv_seg2_load: + case Intrinsic::riscv_seg3_load: + case Intrinsic::riscv_seg4_load: + case Intrinsic::riscv_seg5_load: + case Intrinsic::riscv_seg6_load: + case Intrinsic::riscv_seg7_load: + case Intrinsic::riscv_seg8_load: { + SDLoc DL(Op); + static const Intrinsic::ID VlsegInts[7] = { + Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, + Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, + Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, + Intrinsic::riscv_vlseg8}; + unsigned NF = Op->getNumValues() - 1; + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getSimpleValueType(0); + MVT ContainerVT = getContainerForFixedLengthVector(VT); + + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT); + auto *Load = cast<MemIntrinsicSDNode>(Op); + SmallVector<EVT, 9> ContainerVTs(NF, ContainerVT); + ContainerVTs.push_back(MVT::Other); + SDVTList VTs = DAG.getVTList(ContainerVTs); + SmallVector<SDValue, 12> Ops = {Load->getChain(), IntID}; + Ops.insert(Ops.end(), NF, DAG.getUNDEF(ContainerVT)); + Ops.push_back(Op.getOperand(2)); + Ops.push_back(VL); SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, Load->getMemoryVT(), Load->getMemOperand()); - SDValue Chain = Result.getValue(1); - Result = convertFromScalableVector(VT, Result, DAG, Subtarget); - return DAG.getMergeValues({Result, Chain}, DL); + SmallVector<SDValue, 9> Results; + for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) + Results.push_back(convertFromScalableVector(VT, Result.getValue(RetIdx), + DAG, Subtarget)); + Results.push_back(Result.getValue(NF)); + return DAG.getMergeValues(Results, DL); } } - return lowerVectorIntrinsicSplats(Op, DAG, Subtarget); + return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -4714,8 +5001,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); } @@ -4898,8 +5184,9 @@ 
SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags()); - SDValue IdentitySplat = lowerScalarSplat( - NeutralElem, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); + SDValue IdentitySplat = + lowerScalarSplat(SDValue(), NeutralElem, DAG.getConstant(1, DL, XLenVT), + M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec, IdentitySplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, @@ -4960,8 +5247,9 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op, SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - SDValue ScalarSplat = lowerScalarSplat( - ScalarVal, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); + SDValue ScalarSplat = + lowerScalarSplat(SDValue(), ScalarVal, DAG.getConstant(1, DL, XLenVT), + M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), VectorVal, ScalarSplat, Mask, VL); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, @@ -5027,9 +5315,9 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, MVT XLenVT = Subtarget.getXLenVT(); MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT; - SDValue StartSplat = - lowerScalarSplat(Op.getOperand(0), DAG.getConstant(1, DL, XLenVT), M1VT, - DL, DAG, Subtarget); + SDValue StartSplat = lowerScalarSplat(SDValue(), Op.getOperand(0), + DAG.getConstant(1, DL, XLenVT), M1VT, + DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, @@ -5331,13 +5619,13 @@ SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, if (StepValImm != 1) { if (isPowerOf2_64(StepValImm)) { SDValue StepVal = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), DAG.getConstant(Log2_64(StepValImm), DL, XLenVT)); StepVec = DAG.getNode(ISD::SHL, DL, VT, StepVec, StepVal); } else { SDValue StepVal = lowerScalarSplat( - DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), VL, VT, - DL, DAG, Subtarget); + SDValue(), DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), + VL, VT, DL, DAG, Subtarget); StepVec = DAG.getNode(ISD::MUL, DL, VT, StepVec, StepVal); } } @@ -5353,22 +5641,26 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VecVT = Op.getSimpleValueType(); + if (VecVT.getVectorElementType() == MVT::i1) { + MVT WidenVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount()); + SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenVT, Op.getOperand(0)); + SDValue Op2 = DAG.getNode(ISD::VECTOR_REVERSE, DL, WidenVT, Op1); + return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Op2); + } unsigned EltSize = VecVT.getScalarSizeInBits(); unsigned MinSize = VecVT.getSizeInBits().getKnownMinValue(); - - unsigned MaxVLMAX = 0; - unsigned VectorBitsMax = Subtarget.getMaxRVVVectorSizeInBits(); - if (VectorBitsMax != 0) - MaxVLMAX = ((VectorBitsMax / EltSize) * MinSize) / RISCV::RVVBitsPerBlock; + unsigned VectorBitsMax = Subtarget.getRealMaxVLen(); + unsigned MaxVLMAX = + RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize); unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL; MVT IntVT = VecVT.changeVectorElementTypeToInteger(); - // If this is SEW=8 and VLMAX is unknown or more than 256, 
we need + // If this is SEW=8 and VLMAX is potentially more than 256, we need // to use vrgatherei16.vv. // TODO: It's also possible to use vrgatherei16.vv for other types to // decrease register width for the index calculation. - if ((MaxVLMAX == 0 || MaxVLMAX > 256) && EltSize == 8) { + if (MaxVLMAX > 256 && EltSize == 8) { // If this is LMUL=8, we have to split before we can use vrgatherei16.vv. // Reverse each half, then reassemble them in reverse order. // NOTE: It's also possible that after splitting that VLMAX no longer @@ -5413,13 +5705,51 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, if (!IsRV32E64) SplatVL = DAG.getSplatVector(IntVT, DL, VLMinus1); else - SplatVL = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, IntVT, VLMinus1); + SplatVL = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, DAG.getUNDEF(IntVT), + VLMinus1, DAG.getRegister(RISCV::X0, XLenVT)); SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, IntVT, Mask, VL); SDValue Indices = DAG.getNode(RISCVISD::SUB_VL, DL, IntVT, SplatVL, VID, Mask, VL); - return DAG.getNode(GatherOpc, DL, VecVT, Op.getOperand(0), Indices, Mask, VL); + return DAG.getNode(GatherOpc, DL, VecVT, Op.getOperand(0), Indices, Mask, + DAG.getUNDEF(VecVT), VL); +} + +SDValue RISCVTargetLowering::lowerVECTOR_SPLICE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VecVT = Op.getSimpleValueType(); + + unsigned MinElts = VecVT.getVectorMinNumElements(); + SDValue VLMax = DAG.getNode(ISD::VSCALE, DL, XLenVT, + DAG.getConstant(MinElts, DL, XLenVT)); + + int64_t ImmValue = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + SDValue DownOffset, UpOffset; + if (ImmValue >= 0) { + // The operand is a TargetConstant, we need to rebuild it as a regular + // constant. + DownOffset = DAG.getConstant(ImmValue, DL, XLenVT); + UpOffset = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, DownOffset); + } else { + // The operand is a TargetConstant, we need to rebuild it as a regular + // constant rather than negating the original operand. + UpOffset = DAG.getConstant(-ImmValue, DL, XLenVT); + DownOffset = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, UpOffset); + } + + SDValue TrueMask = getAllOnesMask(VecVT, VLMax, DL, DAG); + + SDValue SlideDown = + DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VecVT, DAG.getUNDEF(VecVT), V1, + DownOffset, TrueMask, UpOffset); + return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VecVT, SlideDown, V2, UpOffset, + TrueMask, + DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT)); } SDValue @@ -5434,18 +5764,26 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op, "Expecting a correctly-aligned load"); MVT VT = Op.getSimpleValueType(); + MVT XLenVT = Subtarget.getXLenVT(); MVT ContainerVT = getContainerForFixedLengthVector(VT); - SDValue VL = - DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT()); + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + bool IsMaskOp = VT.getVectorElementType() == MVT::i1; + SDValue IntID = DAG.getTargetConstant( + IsMaskOp ? 
Intrinsic::riscv_vlm : Intrinsic::riscv_vle, DL, XLenVT); + SmallVector<SDValue, 4> Ops{Load->getChain(), IntID}; + if (!IsMaskOp) + Ops.push_back(DAG.getUNDEF(ContainerVT)); + Ops.push_back(Load->getBasePtr()); + Ops.push_back(VL); SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); - SDValue NewLoad = DAG.getMemIntrinsicNode( - RISCVISD::VLE_VL, DL, VTs, {Load->getChain(), Load->getBasePtr(), VL}, - Load->getMemoryVT(), Load->getMemOperand()); + SDValue NewLoad = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + Load->getMemoryVT(), Load->getMemOperand()); SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); - return DAG.getMergeValues({Result, Load->getChain()}, DL); + return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); } SDValue @@ -5461,6 +5799,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, SDValue StoreVal = Store->getValue(); MVT VT = StoreVal.getSimpleValueType(); + MVT XLenVT = Subtarget.getXLenVT(); // If the size is less than a byte, we need to pad with zeros to make a byte. if (VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() < 8) { @@ -5472,14 +5811,17 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, MVT ContainerVT = getContainerForFixedLengthVector(VT); - SDValue VL = - DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT()); + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); SDValue NewValue = convertToScalableVector(ContainerVT, StoreVal, DAG, Subtarget); + + bool IsMaskOp = VT.getVectorElementType() == MVT::i1; + SDValue IntID = DAG.getTargetConstant( + IsMaskOp ? Intrinsic::riscv_vsm : Intrinsic::riscv_vse, DL, XLenVT); return DAG.getMemIntrinsicNode( - RISCVISD::VSE_VL, DL, DAG.getVTList(MVT::Other), - {Store->getChain(), NewValue, Store->getBasePtr(), VL}, + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), + {Store->getChain(), IntID, NewValue, Store->getBasePtr(), VL}, Store->getMemoryVT(), Store->getMemOperand()); } @@ -5514,8 +5856,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, ContainerVT = getContainerForFixedLengthVector(VT); PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); } } @@ -5581,8 +5922,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op, Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); } } @@ -5620,8 +5960,8 @@ RISCVTargetLowering::lowerFixedLengthVectorSetccToRVV(SDValue Op, SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT()); - MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + MVT MaskVT = getMaskTypeFor(ContainerVT); + SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG); SDValue Cmp = DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, Op1, Op2, Op.getOperand(2), Mask, VL); @@ -5667,9 +6007,9 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - SDValue SplatZero = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, 
ContainerVT, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + SDValue SplatZero = DAG.getNode( + RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), + DAG.getConstant(0, DL, Subtarget.getXLenVT())); SDValue NegX = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, SplatZero, X, Mask, VL); SDValue Max = @@ -5787,15 +6127,260 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG, } if (!VT.isFixedLengthVector()) - return DAG.getNode(RISCVISDOpc, DL, VT, Ops); + return DAG.getNode(RISCVISDOpc, DL, VT, Ops, Op->getFlags()); MVT ContainerVT = getContainerForFixedLengthVector(VT); - SDValue VPOp = DAG.getNode(RISCVISDOpc, DL, ContainerVT, Ops); + SDValue VPOp = DAG.getNode(RISCVISDOpc, DL, ContainerVT, Ops, Op->getFlags()); return convertFromScalableVector(VT, VPOp, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerVPExtMaskOp(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + SDValue Src = Op.getOperand(0); + // NOTE: Mask is dropped. + SDValue VL = Op.getOperand(2); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT SrcVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + Src = convertToScalableVector(SrcVT, Src, DAG, Subtarget); + } + + MVT XLenVT = Subtarget.getXLenVT(); + SDValue Zero = DAG.getConstant(0, DL, XLenVT); + SDValue ZeroSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Zero, VL); + + SDValue SplatValue = DAG.getConstant( + Op.getOpcode() == ISD::VP_ZERO_EXTEND ? 1 : -1, DL, XLenVT); + SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatValue, VL); + + SDValue Result = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Src, + Splat, ZeroSplat, VL); + if (!VT.isFixedLengthVector()) + return Result; + return convertFromScalableVector(VT, Result, DAG, Subtarget); +} + +SDValue RISCVTargetLowering::lowerVPSetCCMaskOp(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + SDValue Op1 = Op.getOperand(0); + SDValue Op2 = Op.getOperand(1); + ISD::CondCode Condition = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // NOTE: Mask is dropped. 
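// Illustrative sketch, not part of the vendored diff: the switch below
// rewrites i1-vector comparisons into vmxor/vmand/vmor identities. A quick
// standalone check of one of them, X >s Y == ~X & Y (for i1, signed reads
// 1 as -1, so X >s Y holds exactly when X == 0 and Y == 1):
#include <cassert>
int main() {
  for (int X = 0; X <= 1; ++X)
    for (int Y = 0; Y <= 1; ++Y) {
      bool Sgt = (X ? -1 : 0) > (Y ? -1 : 0); // signed i1 comparison
      bool Rewritten = !X && Y;               // the ~X & Y mask identity
      assert(Sgt == Rewritten);
    }
  return 0;
}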
+ SDValue VL = Op.getOperand(4); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget); + Op2 = convertToScalableVector(ContainerVT, Op2, DAG, Subtarget); + } + + SDValue Result; + SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL); + + switch (Condition) { + default: + break; + // X != Y --> (X^Y) + case ISD::SETNE: + Result = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op1, Op2, VL); + break; + // X == Y --> ~(X^Y) + case ISD::SETEQ: { + SDValue Temp = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op1, Op2, VL); + Result = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Temp, AllOneMask, VL); + break; + } + // X >s Y --> X == 0 & Y == 1 --> ~X & Y + // X <u Y --> X == 0 & Y == 1 --> ~X & Y + case ISD::SETGT: + case ISD::SETULT: { + SDValue Temp = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op1, AllOneMask, VL); + Result = DAG.getNode(RISCVISD::VMAND_VL, DL, ContainerVT, Temp, Op2, VL); + break; + } + // X <s Y --> X == 1 & Y == 0 --> ~Y & X + // X >u Y --> X == 1 & Y == 0 --> ~Y & X + case ISD::SETLT: + case ISD::SETUGT: { + SDValue Temp = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op2, AllOneMask, VL); + Result = DAG.getNode(RISCVISD::VMAND_VL, DL, ContainerVT, Op1, Temp, VL); + break; + } + // X >=s Y --> X == 0 | Y == 1 --> ~X | Y + // X <=u Y --> X == 0 | Y == 1 --> ~X | Y + case ISD::SETGE: + case ISD::SETULE: { + SDValue Temp = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op1, AllOneMask, VL); + Result = DAG.getNode(RISCVISD::VMOR_VL, DL, ContainerVT, Temp, Op2, VL); + break; + } + // X <=s Y --> X == 1 | Y == 0 --> ~Y | X + // X >=u Y --> X == 1 | Y == 0 --> ~Y | X + case ISD::SETLE: + case ISD::SETUGE: { + SDValue Temp = + DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Op2, AllOneMask, VL); + Result = DAG.getNode(RISCVISD::VMOR_VL, DL, ContainerVT, Temp, Op1, VL); + break; + } + } + + if (!VT.isFixedLengthVector()) + return Result; + return convertFromScalableVector(VT, Result, DAG, Subtarget); +} + +// Lower Floating-Point/Integer Type-Convert VP SDNodes +SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG, + unsigned RISCVISDOpc) const { + SDLoc DL(Op); + + SDValue Src = Op.getOperand(0); + SDValue Mask = Op.getOperand(1); + SDValue VL = Op.getOperand(2); + + MVT DstVT = Op.getSimpleValueType(); + MVT SrcVT = Src.getSimpleValueType(); + if (DstVT.isFixedLengthVector()) { + DstVT = getContainerForFixedLengthVector(DstVT); + SrcVT = getContainerForFixedLengthVector(SrcVT); + Src = convertToScalableVector(SrcVT, Src, DAG, Subtarget); + MVT MaskVT = getMaskTypeFor(DstVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + unsigned RISCVISDExtOpc = (RISCVISDOpc == RISCVISD::SINT_TO_FP_VL || + RISCVISDOpc == RISCVISD::FP_TO_SINT_VL) + ? RISCVISD::VSEXT_VL + : RISCVISD::VZEXT_VL; + + unsigned DstEltSize = DstVT.getScalarSizeInBits(); + unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); + + SDValue Result; + if (DstEltSize >= SrcEltSize) { // Single-width and widening conversion. + if (SrcVT.isInteger()) { + assert(DstVT.isFloatingPoint() && "Wrong input/output vector types"); + + // Do we need to do any pre-widening before converting? 
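// Illustrative sketch, not part of the vendored diff: for an i1 source the
// branch below first selects the integers 0 / 1 (zero-extend) or 0 / -1
// (sign-extend) and only then converts to floating point, matching scalar
// int-to-fp semantics:
#include <cassert>
int main() {
  bool B = true;
  double FromZext = static_cast<double>(static_cast<unsigned>(B)); // 1.0
  double FromSext = static_cast<double>(-static_cast<int>(B));     // -1.0
  assert(FromZext == 1.0 && FromSext == -1.0);
  return 0;
}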
+ if (SrcEltSize == 1) { + MVT IntVT = DstVT.changeVectorElementTypeToInteger(); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue Zero = DAG.getConstant(0, DL, XLenVT); + SDValue ZeroSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, + DAG.getUNDEF(IntVT), Zero, VL); + SDValue One = DAG.getConstant( + RISCVISDExtOpc == RISCVISD::VZEXT_VL ? 1 : -1, DL, XLenVT); + SDValue OneSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, + DAG.getUNDEF(IntVT), One, VL); + Src = DAG.getNode(RISCVISD::VSELECT_VL, DL, IntVT, Src, OneSplat, + ZeroSplat, VL); + } else if (DstEltSize > (2 * SrcEltSize)) { + // Widen before converting. + MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(DstEltSize / 2), + DstVT.getVectorElementCount()); + Src = DAG.getNode(RISCVISDExtOpc, DL, IntVT, Src, Mask, VL); + } + + Result = DAG.getNode(RISCVISDOpc, DL, DstVT, Src, Mask, VL); + } else { + assert(SrcVT.isFloatingPoint() && DstVT.isInteger() && + "Wrong input/output vector types"); + + // Convert f16 to f32 then convert f32 to i64. + if (DstEltSize > (2 * SrcEltSize)) { + assert(SrcVT.getVectorElementType() == MVT::f16 && "Unexpected type!"); + MVT InterimFVT = + MVT::getVectorVT(MVT::f32, DstVT.getVectorElementCount()); + Src = + DAG.getNode(RISCVISD::FP_EXTEND_VL, DL, InterimFVT, Src, Mask, VL); + } + + Result = DAG.getNode(RISCVISDOpc, DL, DstVT, Src, Mask, VL); + } + } else { // Narrowing + Conversion + if (SrcVT.isInteger()) { + assert(DstVT.isFloatingPoint() && "Wrong input/output vector types"); + // First do a narrowing convert to an FP type half the size, then round + // the FP type to a small FP type if needed. + + MVT InterimFVT = DstVT; + if (SrcEltSize > (2 * DstEltSize)) { + assert(SrcEltSize == (4 * DstEltSize) && "Unexpected types!"); + assert(DstVT.getVectorElementType() == MVT::f16 && "Unexpected type!"); + InterimFVT = MVT::getVectorVT(MVT::f32, DstVT.getVectorElementCount()); + } + + Result = DAG.getNode(RISCVISDOpc, DL, InterimFVT, Src, Mask, VL); + + if (InterimFVT != DstVT) { + Src = Result; + Result = DAG.getNode(RISCVISD::FP_ROUND_VL, DL, DstVT, Src, Mask, VL); + } + } else { + assert(SrcVT.isFloatingPoint() && DstVT.isInteger() && + "Wrong input/output vector types"); + // First do a narrowing conversion to an integer half the size, then + // truncate if needed. + + if (DstEltSize == 1) { + // First convert to the same size integer, then convert to mask using + // setcc. + assert(SrcEltSize >= 16 && "Unexpected FP type!"); + MVT InterimIVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize), + DstVT.getVectorElementCount()); + Result = DAG.getNode(RISCVISDOpc, DL, InterimIVT, Src, Mask, VL); + + // Compare the integer result to 0. The integer should be 0 or 1/-1, + // otherwise the conversion was undefined. 
+ MVT XLenVT = Subtarget.getXLenVT(); + SDValue SplatZero = DAG.getConstant(0, DL, XLenVT); + SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterimIVT, + DAG.getUNDEF(InterimIVT), SplatZero); + Result = DAG.getNode(RISCVISD::SETCC_VL, DL, DstVT, Result, SplatZero, + DAG.getCondCode(ISD::SETNE), Mask, VL); + } else { + MVT InterimIVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize / 2), + DstVT.getVectorElementCount()); + + Result = DAG.getNode(RISCVISDOpc, DL, InterimIVT, Src, Mask, VL); + + while (InterimIVT != DstVT) { + SrcEltSize /= 2; + Src = Result; + InterimIVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize / 2), + DstVT.getVectorElementCount()); + Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, InterimIVT, + Src, Mask, VL); + } + } + } + } + + MVT VT = Op.getSimpleValueType(); + if (!VT.isFixedLengthVector()) + return Result; + return convertFromScalableVector(VT, Result, DAG, Subtarget); +} + SDValue RISCVTargetLowering::lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, unsigned VecOpc) const { @@ -5876,23 +6461,14 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op, MVT ContainerVT = VT; if (VT.isFixedLengthVector()) { - // We need to use the larger of the result and index type to determine the - // scalable type to use so we don't increase LMUL for any operand/result. - if (VT.bitsGE(IndexVT)) { - ContainerVT = getContainerForFixedLengthVector(VT); - IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), - ContainerVT.getVectorElementCount()); - } else { - IndexVT = getContainerForFixedLengthVector(IndexVT); - ContainerVT = MVT::getVectorVT(ContainerVT.getVectorElementType(), - IndexVT.getVectorElementCount()); - } + ContainerVT = getContainerForFixedLengthVector(VT); + IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), + ContainerVT.getVectorElementCount()); Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); } @@ -5987,24 +6563,15 @@ SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op, MVT ContainerVT = VT; if (VT.isFixedLengthVector()) { - // We need to use the larger of the value and index type to determine the - // scalable type to use so we don't increase LMUL for any operand/result. 
- if (VT.bitsGE(IndexVT)) { - ContainerVT = getContainerForFixedLengthVector(VT); - IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), - ContainerVT.getVectorElementCount()); - } else { - IndexVT = getContainerForFixedLengthVector(IndexVT); - ContainerVT = MVT::getVectorVT(VT.getVectorElementType(), - IndexVT.getVectorElementCount()); - } + ContainerVT = getContainerForFixedLengthVector(VT); + IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), + ContainerVT.getVectorElementCount()); Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget); Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); if (!IsUnmasked) { - MVT MaskVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(ContainerVT); Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); } } @@ -6095,14 +6662,21 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op, RMValue); } +SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + + bool isRISCV64 = Subtarget.is64Bit(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + int FI = MF.getFrameInfo().CreateFixedObject(isRISCV64 ? 8 : 4, 0, false); + return DAG.getFrameIndex(FI, PtrVT); +} + static RISCVISD::NodeType getRISCVWOpcodeByIntr(unsigned IntNo) { switch (IntNo) { default: llvm_unreachable("Unexpected Intrinsic"); - case Intrinsic::riscv_grev: - return RISCVISD::GREVW; - case Intrinsic::riscv_gorc: - return RISCVISD::GORCW; case Intrinsic::riscv_bcompress: return RISCVISD::BCOMPRESSW; case Intrinsic::riscv_bdecompress: @@ -6121,9 +6695,12 @@ static SDValue customLegalizeToWOpByIntr(SDNode *N, SelectionDAG &DAG, unsigned IntNo) { SDLoc DL(N); RISCVISD::NodeType WOpcode = getRISCVWOpcodeByIntr(IntNo); - SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - SDValue NewOp2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); - SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp1, NewOp2); + // Deal with the Instruction Operands + SmallVector NewOps; + for (SDValue Op : drop_begin(N->ops())) + // Promote the operand to i64 type + NewOps.push_back(DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op)); + SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOps); // ReplaceNodeResults requires we maintain the same type for the return value. return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes); } @@ -6150,10 +6727,6 @@ static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) { return RISCVISD::ROLW; case ISD::ROTR: return RISCVISD::RORW; - case RISCVISD::GREV: - return RISCVISD::GREVW; - case RISCVISD::GORC: - return RISCVISD::GORCW; } } @@ -6309,6 +6882,10 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); if (N->getOperand(1).getOpcode() != ISD::Constant) { + // If we can use a BSET instruction, allow default promotion to apply. + if (N->getOpcode() == ISD::SHL && Subtarget.hasStdExtZbs() && + isOneConstant(N->getOperand(0))) + break; Results.push_back(customLegalizeToWOp(N, DAG)); break; } @@ -6388,12 +6965,23 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Res, DAG.getValueType(MVT::i32)); - // Sign extend the LHS and perform an unsigned compare with the ADDW result. - // Since the inputs are sign extended from i32, this is equivalent to - // comparing the lower 32 bits. 
- LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0)); - SDValue Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, LHS, - IsAdd ? ISD::SETULT : ISD::SETUGT); + SDValue Overflow; + if (IsAdd && isOneConstant(RHS)) { + // Special case uaddo X, 1 overflowed if the addition result is 0. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C, especially when the setcc result is used by branch. We have + // no compare with constant and branch instructions. + Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, + DAG.getConstant(0, DL, MVT::i64), ISD::SETEQ); + } else { + // Sign extend the LHS and perform an unsigned compare with the ADDW + // result. Since the inputs are sign extended from i32, this is equivalent + // to comparing the lower 32 bits. + LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0)); + Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, LHS, + IsAdd ? ISD::SETULT : ISD::SETUGT); + } Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); Results.push_back(Overflow); @@ -6421,6 +7009,33 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(expandAddSubSat(N, DAG)); return; } + case ISD::ABS: { + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + + // Expand abs to Y = (sraiw X, 31); subw(xor(X, Y), Y) + + SDValue Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + + // Freeze the source so we can increase it's use count. + Src = DAG.getFreeze(Src); + + // Copy sign bit to all bits using the sraiw pattern. + SDValue SignFill = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Src, + DAG.getValueType(MVT::i32)); + SignFill = DAG.getNode(ISD::SRA, DL, MVT::i64, SignFill, + DAG.getConstant(31, DL, MVT::i64)); + + SDValue NewRes = DAG.getNode(ISD::XOR, DL, MVT::i64, Src, SignFill); + NewRes = DAG.getNode(ISD::SUB, DL, MVT::i64, NewRes, SignFill); + + // NOTE: The result is only required to be anyextended, but sext is + // consistent with type legalization of sub. + NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewRes, + DAG.getValueType(MVT::i32)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes)); + return; + } case ISD::BITCAST: { EVT VT = N->getValueType(0); assert(VT.isInteger() && !VT.isVector() && "Unexpected VT!"); @@ -6451,37 +7066,24 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, break; } case RISCVISD::GREV: - case RISCVISD::GORC: { - assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && - "Unexpected custom legalisation"); - assert(isa(N->getOperand(1)) && "Expected constant"); - // This is similar to customLegalizeToWOp, except that we pass the second - // operand (a TargetConstant) straight through: it is already of type - // XLenVT. - RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode()); - SDValue NewOp0 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); - SDValue NewOp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1); - // ReplaceNodeResults requires we maintain the same type for the return - // value. - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes)); - break; - } + case RISCVISD::GORC: case RISCVISD::SHFL: { - // There is no SHFLIW instruction, but we can just promote the operation. 
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + MVT VT = N->getSimpleValueType(0); + MVT XLenVT = Subtarget.getXLenVT(); + assert((VT == MVT::i16 || (VT == MVT::i32 && Subtarget.is64Bit())) && "Unexpected custom legalisation"); assert(isa(N->getOperand(1)) && "Expected constant"); - SDValue NewOp0 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + assert((Subtarget.hasStdExtZbp() || + (Subtarget.hasStdExtZbkb() && N->getOpcode() == RISCVISD::GREV && + N->getConstantOperandVal(1) == 7)) && + "Unexpected extension"); + SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, N->getOperand(0)); SDValue NewOp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - SDValue NewRes = DAG.getNode(RISCVISD::SHFL, DL, MVT::i64, NewOp0, NewOp1); + DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, N->getOperand(1)); + SDValue NewRes = DAG.getNode(N->getOpcode(), DL, XLenVT, NewOp0, NewOp1); // ReplaceNodeResults requires we maintain the same type for the return // value. - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } case ISD::BSWAP: @@ -6496,9 +7098,8 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits. if (N->getOpcode() == ISD::BSWAP) Imm &= ~0x7U; - unsigned Opc = Subtarget.is64Bit() ? RISCVISD::GREVW : RISCVISD::GREV; - SDValue GREVI = - DAG.getNode(Opc, DL, XLenVT, NewOp0, DAG.getConstant(Imm, DL, XLenVT)); + SDValue GREVI = DAG.getNode(RISCVISD::GREV, DL, XLenVT, NewOp0, + DAG.getConstant(Imm, DL, XLenVT)); // ReplaceNodeResults requires we maintain the same type for the return // value. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, GREVI)); @@ -6564,9 +7165,8 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, MVT XLenVT = Subtarget.getXLenVT(); // Use a VL of 1 to avoid processing more elements than we need. - MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); SDValue VL = DAG.getConstant(1, DL, XLenVT); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG); // Unless the index is known to be 0, we must slide the vector down to get // the desired element into index 0. @@ -6581,6 +7181,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // To extract the upper XLEN bits of the vector element, shift the first // element right by 32 bits and re-extract the lower XLEN bits. SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), DAG.getConstant(32, DL, XLenVT), VL); SDValue LShr32 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Vec, ThirtyTwoV, Mask, VL); @@ -6597,38 +7198,42 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, llvm_unreachable( "Don't know how to custom type legalize this intrinsic!"); case Intrinsic::riscv_grev: - case Intrinsic::riscv_gorc: - case Intrinsic::riscv_bcompress: - case Intrinsic::riscv_bdecompress: - case Intrinsic::riscv_bfp: { + case Intrinsic::riscv_gorc: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); - Results.push_back(customLegalizeToWOpByIntr(N, DAG, IntNo)); + SDValue NewOp1 = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); + SDValue NewOp2 = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); + unsigned Opc = + IntNo == Intrinsic::riscv_grev ? 
RISCVISD::GREVW : RISCVISD::GORCW; + // If the control is a constant, promote the node by clearing any extra + // bits bits in the control. isel will form greviw/gorciw if the result is + // sign extended. + if (isa(NewOp2)) { + NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2, + DAG.getConstant(0x1f, DL, MVT::i64)); + Opc = IntNo == Intrinsic::riscv_grev ? RISCVISD::GREV : RISCVISD::GORC; + } + SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); break; } + case Intrinsic::riscv_bcompress: + case Intrinsic::riscv_bdecompress: + case Intrinsic::riscv_bfp: case Intrinsic::riscv_fsl: case Intrinsic::riscv_fsr: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); - SDValue NewOp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - SDValue NewOp2 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); - SDValue NewOp3 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(3)); - unsigned Opc = getRISCVWOpcodeByIntr(IntNo); - SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2, NewOp3); - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); + Results.push_back(customLegalizeToWOpByIntr(N, DAG, IntNo)); break; } case Intrinsic::riscv_orc_b: { // Lower to the GORCI encoding for orc.b with the operand extended. SDValue NewOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - // If Zbp is enabled, use GORCIW which will sign extend the result. - unsigned Opc = - Subtarget.hasStdExtZbp() ? RISCVISD::GORCW : RISCVISD::GORC; - SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp, + SDValue Res = DAG.getNode(RISCVISD::GORC, DL, MVT::i64, NewOp, DAG.getConstant(7, DL, MVT::i64)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; @@ -6681,10 +7286,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // To extract the upper XLEN bits of the vector element, shift the first // element right by 32 bits and re-extract the lower XLEN bits. SDValue VL = DAG.getConstant(1, DL, XLenVT); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount()); - SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); - SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, - DAG.getConstant(32, DL, XLenVT), VL); + SDValue Mask = getAllOnesMask(VecVT, VL, DL, DAG); + + SDValue ThirtyTwoV = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + DAG.getConstant(32, DL, XLenVT), VL); SDValue LShr32 = DAG.getNode(RISCVISD::SRL_VL, DL, VecVT, Vec, ThirtyTwoV, Mask, VL); SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32); @@ -6840,6 +7446,110 @@ static Optional matchGREVIPat(SDValue Op) { return matchRISCVBitmanipPat(Op, BitmanipMasks); } +// Try to fold ( x, (reduction. 
vec, start)) +static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG) { + auto BinOpToRVVReduce = [](unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Unhandled binary to transfrom reduction"); + case ISD::ADD: + return RISCVISD::VECREDUCE_ADD_VL; + case ISD::UMAX: + return RISCVISD::VECREDUCE_UMAX_VL; + case ISD::SMAX: + return RISCVISD::VECREDUCE_SMAX_VL; + case ISD::UMIN: + return RISCVISD::VECREDUCE_UMIN_VL; + case ISD::SMIN: + return RISCVISD::VECREDUCE_SMIN_VL; + case ISD::AND: + return RISCVISD::VECREDUCE_AND_VL; + case ISD::OR: + return RISCVISD::VECREDUCE_OR_VL; + case ISD::XOR: + return RISCVISD::VECREDUCE_XOR_VL; + case ISD::FADD: + return RISCVISD::VECREDUCE_FADD_VL; + case ISD::FMAXNUM: + return RISCVISD::VECREDUCE_FMAX_VL; + case ISD::FMINNUM: + return RISCVISD::VECREDUCE_FMIN_VL; + } + }; + + auto IsReduction = [&BinOpToRVVReduce](SDValue V, unsigned Opc) { + return V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isNullConstant(V.getOperand(1)) && + V.getOperand(0).getOpcode() == BinOpToRVVReduce(Opc); + }; + + unsigned Opc = N->getOpcode(); + unsigned ReduceIdx; + if (IsReduction(N->getOperand(0), Opc)) + ReduceIdx = 0; + else if (IsReduction(N->getOperand(1), Opc)) + ReduceIdx = 1; + else + return SDValue(); + + // Skip if FADD disallows reassociation but the combiner needs. + if (Opc == ISD::FADD && !N->getFlags().hasAllowReassociation()) + return SDValue(); + + SDValue Extract = N->getOperand(ReduceIdx); + SDValue Reduce = Extract.getOperand(0); + if (!Reduce.hasOneUse()) + return SDValue(); + + SDValue ScalarV = Reduce.getOperand(2); + + // Make sure that ScalarV is a splat with VL=1. + if (ScalarV.getOpcode() != RISCVISD::VFMV_S_F_VL && + ScalarV.getOpcode() != RISCVISD::VMV_S_X_VL && + ScalarV.getOpcode() != RISCVISD::VMV_V_X_VL) + return SDValue(); + + if (!isOneConstant(ScalarV.getOperand(2))) + return SDValue(); + + // TODO: Deal with value other than neutral element. 
+ auto IsRVVNeutralElement = [Opc, &DAG](SDNode *N, SDValue V) { + if (Opc == ISD::FADD && N->getFlags().hasNoSignedZeros() && + isNullFPConstant(V)) + return true; + return DAG.getNeutralElement(Opc, SDLoc(V), V.getSimpleValueType(), + N->getFlags()) == V; + }; + + // Check the scalar of ScalarV is neutral element + if (!IsRVVNeutralElement(N, ScalarV.getOperand(1))) + return SDValue(); + + if (!ScalarV.hasOneUse()) + return SDValue(); + + EVT SplatVT = ScalarV.getValueType(); + SDValue NewStart = N->getOperand(1 - ReduceIdx); + unsigned SplatOpc = RISCVISD::VFMV_S_F_VL; + if (SplatVT.isInteger()) { + auto *C = dyn_cast(NewStart.getNode()); + if (!C || C->isZero() || !isInt<5>(C->getSExtValue())) + SplatOpc = RISCVISD::VMV_S_X_VL; + else + SplatOpc = RISCVISD::VMV_V_X_VL; + } + + SDValue NewScalarV = + DAG.getNode(SplatOpc, SDLoc(N), SplatVT, ScalarV.getOperand(0), NewStart, + ScalarV.getOperand(2)); + SDValue NewReduce = + DAG.getNode(Reduce.getOpcode(), SDLoc(Reduce), Reduce.getValueType(), + Reduce.getOperand(0), Reduce.getOperand(1), NewScalarV, + Reduce.getOperand(3), Reduce.getOperand(4)); + return DAG.getNode(Extract.getOpcode(), SDLoc(Extract), + Extract.getValueType(), NewReduce, Extract.getOperand(1)); +} + // Match the following pattern as a GREVI(W) operation // (or (BITMANIP_SHL x), (BITMANIP_SRL x)) static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG, @@ -7066,11 +7776,70 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SHL, DL, VT, NA1, DAG.getConstant(Bits, DL, VT)); } +// Combine +// ROTR ((GREVI x, 24), 16) -> (GREVI x, 8) for RV32 +// ROTL ((GREVI x, 24), 16) -> (GREVI x, 8) for RV32 +// ROTR ((GREVI x, 56), 32) -> (GREVI x, 24) for RV64 +// ROTL ((GREVI x, 56), 32) -> (GREVI x, 24) for RV64 +// RORW ((GREVI x, 24), 16) -> (GREVIW x, 8) for RV64 +// ROLW ((GREVI x, 24), 16) -> (GREVIW x, 8) for RV64 +// The grev patterns represents BSWAP. +// FIXME: This can be generalized to any GREV. We just need to toggle the MSB +// off the grev. +static SDValue combineROTR_ROTL_RORW_ROLW(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + bool IsWInstruction = + N->getOpcode() == RISCVISD::RORW || N->getOpcode() == RISCVISD::ROLW; + assert((N->getOpcode() == ISD::ROTR || N->getOpcode() == ISD::ROTL || + IsWInstruction) && + "Unexpected opcode!"); + SDValue Src = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (!Subtarget.hasStdExtZbp() || Src.getOpcode() != RISCVISD::GREV) + return SDValue(); + + if (!isa(N->getOperand(1)) || + !isa(Src.getOperand(1))) + return SDValue(); + + unsigned BitWidth = IsWInstruction ? 32 : VT.getSizeInBits(); + assert(isPowerOf2_32(BitWidth) && "Expected a power of 2"); + + // Needs to be a rotate by half the bitwidth for ROTR/ROTL or by 16 for + // RORW/ROLW. And the grev should be the encoding for bswap for this width. + unsigned ShAmt1 = N->getConstantOperandVal(1); + unsigned ShAmt2 = Src.getConstantOperandVal(1); + if (BitWidth < 32 || ShAmt1 != (BitWidth / 2) || ShAmt2 != (BitWidth - 8)) + return SDValue(); + + Src = Src.getOperand(0); + + // Toggle bit the MSB of the shift. + unsigned CombinedShAmt = ShAmt1 ^ ShAmt2; + if (CombinedShAmt == 0) + return Src; + + SDValue Res = DAG.getNode( + RISCVISD::GREV, DL, VT, Src, + DAG.getConstant(CombinedShAmt, DL, N->getOperand(1).getValueType())); + if (!IsWInstruction) + return Res; + + // Sign extend the result to match the behavior of the rotate. This will be + // selected to GREVIW in isel. 
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Res, + DAG.getValueType(MVT::i32)); +} + // Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is // non-zero, and to x when it is. Any repeated GREVI stage undoes itself. // Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does // not undo itself, but they are redundant. static SDValue combineGREVI_GORCI(SDNode *N, SelectionDAG &DAG) { + bool IsGORC = N->getOpcode() == RISCVISD::GORC; + assert((IsGORC || N->getOpcode() == RISCVISD::GREV) && "Unexpected opcode"); SDValue Src = N->getOperand(0); if (Src.getOpcode() != N->getOpcode()) @@ -7085,7 +7854,7 @@ static SDValue combineGREVI_GORCI(SDNode *N, SelectionDAG &DAG) { Src = Src.getOperand(0); unsigned CombinedShAmt; - if (N->getOpcode() == RISCVISD::GORC || N->getOpcode() == RISCVISD::GORCW) + if (IsGORC) CombinedShAmt = ShAmt1 | ShAmt2; else CombinedShAmt = ShAmt1 ^ ShAmt2; @@ -7203,6 +7972,11 @@ static SDValue transformAddImmMulImm(SDNode *N, SelectionDAG &DAG, auto *N1C = dyn_cast(N->getOperand(1)); if (!N0C || !N1C) return SDValue(); + // If N0C has multiple uses it's possible one of the cases in + // DAGCombiner::isMulAddWithConstProfitable will be true, which would result + // in an infinite loop. + if (!N0C->hasOneUse()) + return SDValue(); int64_t C0 = N0C->getSExtValue(); int64_t C1 = N1C->getSExtValue(); int64_t CA, CB; @@ -7238,6 +8012,8 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, return V; if (SDValue V = transformAddShlImm(N, DAG, Subtarget)) return V; + if (SDValue V = combineBinOpToReduce(N, DAG)) + return V; // fold (add (select lhs, rhs, cc, 0, y), x) -> // (select lhs, rhs, cc, x, (add x, y)) return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false); @@ -7251,7 +8027,30 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG) { return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false); } -static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + // Pre-promote (i32 (and (srl X, Y), 1)) on RV64 with Zbs without zero + // extending X. This is safe since we only need the LSB after the shift and + // shift amounts larger than 31 would produce poison. If we wait until + // type legalization, we'll create RISCVISD::SRLW and we can't recover it + // to use a BEXT instruction. 
+ if (Subtarget.is64Bit() && Subtarget.hasStdExtZbs() && + N->getValueType(0) == MVT::i32 && isOneConstant(N->getOperand(1)) && + N0.getOpcode() == ISD::SRL && !isa(N0.getOperand(1)) && + N0.hasOneUse()) { + SDLoc DL(N); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N0.getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N0.getOperand(1)); + SDValue Srl = DAG.getNode(ISD::SRL, DL, MVT::i64, Op0, Op1); + SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Srl, + DAG.getConstant(1, DL, MVT::i64)); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And); + } + + if (SDValue V = combineBinOpToReduce(N, DAG)) + return V; + // fold (and (select lhs, rhs, cc, -1, y), x) -> // (select lhs, rhs, cc, x, (and x, y)) return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ true); @@ -7268,99 +8067,197 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SHFL; } + if (SDValue V = combineBinOpToReduce(N, DAG)) + return V; // fold (or (select cond, 0, y), x) -> // (select cond, x, (or x, y)) return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false); } static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // fold (xor (sllw 1, x), -1) -> (rolw ~1, x) + // NOTE: Assumes ROL being legal means ROLW is legal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (N0.getOpcode() == RISCVISD::SLLW && + isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0)) && + TLI.isOperationLegal(ISD::ROTL, MVT::i64)) { + SDLoc DL(N); + return DAG.getNode(RISCVISD::ROLW, DL, MVT::i64, + DAG.getConstant(~1, DL, MVT::i64), N0.getOperand(1)); + } + + if (SDValue V = combineBinOpToReduce(N, DAG)) + return V; // fold (xor (select cond, 0, y), x) -> // (select cond, x, (xor x, y)) return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false); } -// Attempt to turn ANY_EXTEND into SIGN_EXTEND if the input to the ANY_EXTEND -// has users that require SIGN_EXTEND and the SIGN_EXTEND can be done for free -// by an instruction like ADDW/SUBW/MULW. Without this the ANY_EXTEND would be -// removed during type legalization leaving an ADD/SUB/MUL use that won't use -// ADDW/SUBW/MULW. -static SDValue performANY_EXTENDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const RISCVSubtarget &Subtarget) { - if (!Subtarget.is64Bit()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - +static SDValue +performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { SDValue Src = N->getOperand(0); EVT VT = N->getValueType(0); - if (VT != MVT::i64 || Src.getValueType() != MVT::i32) - return SDValue(); - // The opcode must be one that can implicitly sign_extend. - // FIXME: Additional opcodes. - switch (Src.getOpcode()) { - default: - return SDValue(); - case ISD::MUL: - if (!Subtarget.hasStdExtM()) - return SDValue(); - LLVM_FALLTHROUGH; - case ISD::ADD: - case ISD::SUB: - break; + // Fold (sext_inreg (fmv_x_anyexth X), i16) -> (fmv_x_signexth X) + if (Src.getOpcode() == RISCVISD::FMV_X_ANYEXTH && + cast(N->getOperand(1))->getVT().bitsGE(MVT::i16)) + return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, SDLoc(N), VT, + Src.getOperand(0)); + + // Fold (i64 (sext_inreg (abs X), i32)) -> + // (i64 (smax (sext_inreg (neg X), i32), X)) if X has more than 32 sign bits. + // The (sext_inreg (neg X), i32) will be selected to negw by isel. 
This + // pattern occurs after type legalization of (i32 (abs X)) on RV64 if the user + // of the (i32 (abs X)) is a sext or setcc or something else that causes type + // legalization to add a sext_inreg after the abs. The (i32 (abs X)) will have + // been type legalized to (i64 (abs (sext_inreg X, i32))), but the sext_inreg + // may get combined into an earlier operation so we need to use + // ComputeNumSignBits. + // NOTE: (i64 (sext_inreg (abs X), i32)) can also be created for + // (i64 (ashr (shl (abs X), 32), 32)) without any type legalization so + // we can't assume that X has 33 sign bits. We must check. + if (Subtarget.hasStdExtZbb() && Subtarget.is64Bit() && + Src.getOpcode() == ISD::ABS && Src.hasOneUse() && VT == MVT::i64 && + cast(N->getOperand(1))->getVT() == MVT::i32 && + DAG.ComputeNumSignBits(Src.getOperand(0)) > 32) { + SDLoc DL(N); + SDValue Freeze = DAG.getFreeze(Src.getOperand(0)); + SDValue Neg = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, MVT::i64), Freeze); + Neg = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Neg, + DAG.getValueType(MVT::i32)); + return DAG.getNode(ISD::SMAX, DL, MVT::i64, Freeze, Neg); } - // Only handle cases where the result is used by a CopyToReg. That likely - // means the value is a liveout of the basic block. This helps prevent - // infinite combine loops like PR51206. - if (none_of(N->uses(), - [](SDNode *User) { return User->getOpcode() == ISD::CopyToReg; })) - return SDValue(); + return SDValue(); +} - SmallVector SetCCs; - for (SDNode::use_iterator UI = Src.getNode()->use_begin(), - UE = Src.getNode()->use_end(); - UI != UE; ++UI) { - SDNode *User = *UI; - if (User == N) - continue; - if (UI.getUse().getResNo() != Src.getResNo()) - continue; - // All i32 setccs are legalized by sign extending operands. - if (User->getOpcode() == ISD::SETCC) { - SetCCs.push_back(User); - continue; - } - // We don't know if we can extend this user. - break; +// Try to form vwadd(u).wv/wx or vwsub(u).wv/wx. It might later be optimized to +// vwadd(u).vv/vx or vwsub(u).vv/vx. +static SDValue combineADDSUB_VLToVWADDSUB_VL(SDNode *N, SelectionDAG &DAG, + bool Commute = false) { + assert((N->getOpcode() == RISCVISD::ADD_VL || + N->getOpcode() == RISCVISD::SUB_VL) && + "Unexpected opcode"); + bool IsAdd = N->getOpcode() == RISCVISD::ADD_VL; + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Commute) + std::swap(Op0, Op1); + + MVT VT = N->getSimpleValueType(0); + + // Determine the narrow size for a widening add/sub. + unsigned NarrowSize = VT.getScalarSizeInBits() / 2; + MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize), + VT.getVectorElementCount()); + + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + + SDLoc DL(N); + + // If the RHS is a sext or zext, we can form a widening op. + if ((Op1.getOpcode() == RISCVISD::VZEXT_VL || + Op1.getOpcode() == RISCVISD::VSEXT_VL) && + Op1.hasOneUse() && Op1.getOperand(1) == Mask && Op1.getOperand(2) == VL) { + unsigned ExtOpc = Op1.getOpcode(); + Op1 = Op1.getOperand(0); + // Re-introduce narrower extends if needed. + if (Op1.getValueType() != NarrowVT) + Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL); + + unsigned WOpc; + if (ExtOpc == RISCVISD::VSEXT_VL) + WOpc = IsAdd ? RISCVISD::VWADD_W_VL : RISCVISD::VWSUB_W_VL; + else + WOpc = IsAdd ? RISCVISD::VWADDU_W_VL : RISCVISD::VWSUBU_W_VL; + + return DAG.getNode(WOpc, DL, VT, Op0, Op1, Mask, VL); } - // If we don't have any SetCCs, this isn't worthwhile. 
- if (SetCCs.empty()) - return SDValue(); + // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar + // sext/zext? + + return SDValue(); +} + +// Try to convert vwadd(u).wv/wx or vwsub(u).wv/wx to vwadd(u).vv/vx or +// vwsub(u).vv/vx. +static SDValue combineVWADD_W_VL_VWSUB_W_VL(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + + MVT VT = N->getSimpleValueType(0); + MVT NarrowVT = Op1.getSimpleValueType(); + unsigned NarrowSize = NarrowVT.getScalarSizeInBits(); + + unsigned VOpc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case RISCVISD::VWADD_W_VL: VOpc = RISCVISD::VWADD_VL; break; + case RISCVISD::VWSUB_W_VL: VOpc = RISCVISD::VWSUB_VL; break; + case RISCVISD::VWADDU_W_VL: VOpc = RISCVISD::VWADDU_VL; break; + case RISCVISD::VWSUBU_W_VL: VOpc = RISCVISD::VWSUBU_VL; break; + } + + bool IsSigned = N->getOpcode() == RISCVISD::VWADD_W_VL || + N->getOpcode() == RISCVISD::VWSUB_W_VL; SDLoc DL(N); - SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Src); - DCI.CombineTo(N, SExt); - // Promote all the setccs. - for (SDNode *SetCC : SetCCs) { - SmallVector Ops; + // If the LHS is a sext or zext, we can narrow this op to the same size as + // the RHS. + if (((Op0.getOpcode() == RISCVISD::VZEXT_VL && !IsSigned) || + (Op0.getOpcode() == RISCVISD::VSEXT_VL && IsSigned)) && + Op0.hasOneUse() && Op0.getOperand(1) == Mask && Op0.getOperand(2) == VL) { + unsigned ExtOpc = Op0.getOpcode(); + Op0 = Op0.getOperand(0); + // Re-introduce narrower extends if needed. + if (Op0.getValueType() != NarrowVT) + Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL); + return DAG.getNode(VOpc, DL, VT, Op0, Op1, Mask, VL); + } - for (unsigned j = 0; j != 2; ++j) { - SDValue SOp = SetCC->getOperand(j); - if (SOp == Src) - Ops.push_back(SExt); - else - Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, SOp)); + bool IsAdd = N->getOpcode() == RISCVISD::VWADD_W_VL || + N->getOpcode() == RISCVISD::VWADDU_W_VL; + + // Look for splats on the left hand side of a vwadd(u).wv. We might be able + // to commute and use a vwadd(u).vx instead. + if (IsAdd && Op0.getOpcode() == RISCVISD::VMV_V_X_VL && + Op0.getOperand(0).isUndef() && Op0.getOperand(2) == VL) { + Op0 = Op0.getOperand(1); + + // See if have enough sign bits or zero bits in the scalar to use a + // widening add/sub by splatting to smaller element size. + unsigned EltBits = VT.getScalarSizeInBits(); + unsigned ScalarBits = Op0.getValueSizeInBits(); + // Make sure we're getting all element bits from the scalar register. + // FIXME: Support implicit sign extension of vmv.v.x? + if (ScalarBits < EltBits) + return SDValue(); + + if (IsSigned) { + if (DAG.ComputeNumSignBits(Op0) <= (ScalarBits - NarrowSize)) + return SDValue(); + } else { + APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize); + if (!DAG.MaskedValueIsZero(Op0, Mask)) + return SDValue(); } - Ops.push_back(SetCC->getOperand(2)); - DCI.CombineTo(SetCC, - DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); + Op0 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, + DAG.getUNDEF(NarrowVT), Op0, VL); + return DAG.getNode(VOpc, DL, VT, Op1, Op0, Mask, VL); } - return SDValue(N, 0); + + return SDValue(); } // Try to form VWMUL, VWMULU or VWMULSU. 
@@ -7408,12 +8305,15 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) { // The operand is a splat of a scalar. + // The pasthru must be undef for tail agnostic + if (!Op1.getOperand(0).isUndef()) + return SDValue(); // The VL must be the same. - if (Op1.getOperand(1) != VL) + if (Op1.getOperand(2) != VL) return SDValue(); // Get the scalar value. - Op1 = Op1.getOperand(0); + Op1 = Op1.getOperand(1); // See if have enough sign bits or zero bits in the scalar to use a // widening multiply by splatting to smaller element size. @@ -7424,16 +8324,20 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, if (ScalarBits < EltBits) return SDValue(); - if (IsSignExt) { - if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize)) - return SDValue(); + // If the LHS is a sign extend, try to use vwmul. + if (IsSignExt && DAG.ComputeNumSignBits(Op1) > (ScalarBits - NarrowSize)) { + // Can use vwmul. } else { + // Otherwise try to use vwmulu or vwmulsu. APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize); - if (!DAG.MaskedValueIsZero(Op1, Mask)) + if (DAG.MaskedValueIsZero(Op1, Mask)) + IsVWMULSU = IsSignExt; + else return SDValue(); } - Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL); + Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, + DAG.getUNDEF(NarrowVT), Op1, VL); } else return SDValue(); @@ -7443,6 +8347,8 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL; if (Op0.getValueType() != NarrowVT) Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL); + // vwmulsu requires second operand to be zero extended. + ExtOpc = IsVWMULSU ? RISCVISD::VZEXT_VL : ExtOpc; if (Op1.getValueType() != NarrowVT) Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL); @@ -7569,6 +8475,133 @@ static SDValue performFP_TO_INT_SATCombine(SDNode *N, return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); } +// Combine (bitreverse (bswap X)) to the BREV8 GREVI encoding if the type is +// smaller than XLenVT. +static SDValue performBITREVERSECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + assert(Subtarget.hasStdExtZbkb() && "Unexpected extension"); + + SDValue Src = N->getOperand(0); + if (Src.getOpcode() != ISD::BSWAP) + return SDValue(); + + EVT VT = N->getValueType(0); + if (!VT.isScalarInteger() || VT.getSizeInBits() >= Subtarget.getXLen() || + !isPowerOf2_32(VT.getSizeInBits())) + return SDValue(); + + SDLoc DL(N); + return DAG.getNode(RISCVISD::GREV, DL, VT, Src.getOperand(0), + DAG.getConstant(7, DL, VT)); +} + +// Convert from one FMA opcode to another based on whether we are negating the +// multiply result and/or the accumulator. +// NOTE: Only supports RVV operations with VL. +static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { + assert((NegMul || NegAcc) && "Not negating anything?"); + + // Negating the multiply result changes ADD<->SUB and toggles 'N'. + if (NegMul) { + // clang-format off + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case RISCVISD::VFMADD_VL: Opcode = RISCVISD::VFNMSUB_VL; break; + case RISCVISD::VFNMSUB_VL: Opcode = RISCVISD::VFMADD_VL; break; + case RISCVISD::VFNMADD_VL: Opcode = RISCVISD::VFMSUB_VL; break; + case RISCVISD::VFMSUB_VL: Opcode = RISCVISD::VFNMADD_VL; break; + } + // clang-format on + } + + // Negating the accumulator changes ADD<->SUB. 
+ if (NegAcc) { + // clang-format off + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case RISCVISD::VFMADD_VL: Opcode = RISCVISD::VFMSUB_VL; break; + case RISCVISD::VFMSUB_VL: Opcode = RISCVISD::VFMADD_VL; break; + case RISCVISD::VFNMADD_VL: Opcode = RISCVISD::VFNMSUB_VL; break; + case RISCVISD::VFNMSUB_VL: Opcode = RISCVISD::VFNMADD_VL; break; + } + // clang-format on + } + + return Opcode; +} + +// Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C) +// FIXME: Should this be a generic combine? There's a similar combine on X86. +// +// Also try these folds where an add or sub is in the middle. +// (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C) +// (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C) +static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + assert(N->getOpcode() == ISD::SRA && "Unexpected opcode"); + + if (N->getValueType(0) != MVT::i64 || !Subtarget.is64Bit()) + return SDValue(); + + auto *ShAmtC = dyn_cast(N->getOperand(1)); + if (!ShAmtC || ShAmtC->getZExtValue() > 32) + return SDValue(); + + SDValue N0 = N->getOperand(0); + + SDValue Shl; + ConstantSDNode *AddC = nullptr; + + // We might have an ADD or SUB between the SRA and SHL. + bool IsAdd = N0.getOpcode() == ISD::ADD; + if ((IsAdd || N0.getOpcode() == ISD::SUB)) { + if (!N0.hasOneUse()) + return SDValue(); + // Other operand needs to be a constant we can modify. + AddC = dyn_cast(N0.getOperand(IsAdd ? 1 : 0)); + if (!AddC) + return SDValue(); + + // AddC needs to have at least 32 trailing zeros. + if (AddC->getAPIntValue().countTrailingZeros() < 32) + return SDValue(); + + Shl = N0.getOperand(IsAdd ? 0 : 1); + } else { + // Not an ADD or SUB. + Shl = N0; + } + + // Look for a shift left by 32. + if (Shl.getOpcode() != ISD::SHL || !Shl.hasOneUse() || + !isa(Shl.getOperand(1)) || + Shl.getConstantOperandVal(1) != 32) + return SDValue(); + + SDLoc DL(N); + SDValue In = Shl.getOperand(0); + + // If we looked through an ADD or SUB, we need to rebuild it with the shifted + // constant. + if (AddC) { + SDValue ShiftedAddC = + DAG.getConstant(AddC->getAPIntValue().lshr(32), DL, MVT::i64); + if (IsAdd) + In = DAG.getNode(ISD::ADD, DL, MVT::i64, In, ShiftedAddC); + else + In = DAG.getNode(ISD::SUB, DL, MVT::i64, ShiftedAddC, In); + } + + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, In, + DAG.getValueType(MVT::i32)); + if (ShAmtC->getZExtValue() == 32) + return SExt; + + return DAG.getNode( + ISD::SHL, DL, MVT::i64, SExt, + DAG.getConstant(32 - ShAmtC->getZExtValue(), DL, MVT::i64)); +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -7597,6 +8630,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (Op0->getOpcode() == RISCVISD::BuildPairF64) return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1)); + if (Op0->isUndef()) { + SDValue Lo = DAG.getUNDEF(MVT::i32); + SDValue Hi = DAG.getUNDEF(MVT::i32); + return DCI.CombineTo(N, Lo, Hi); + } + SDLoc DL(N); // It's cheaper to materialise two 32-bit integers than to load a double @@ -7634,15 +8673,27 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } case RISCVISD::SLLW: case RISCVISD::SRAW: - case RISCVISD::SRLW: - case RISCVISD::ROLW: - case RISCVISD::RORW: { + case RISCVISD::SRLW: { // Only the lower 32 bits of LHS and lower 5 bits of RHS are read. 
if (SimplifyDemandedLowBitsHelper(0, 32) || SimplifyDemandedLowBitsHelper(1, 5)) return SDValue(N, 0); + break; } + case ISD::ROTR: + case ISD::ROTL: + case RISCVISD::RORW: + case RISCVISD::ROLW: { + if (N->getOpcode() == RISCVISD::RORW || N->getOpcode() == RISCVISD::ROLW) { + // Only the lower 32 bits of LHS and lower 5 bits of RHS are read. + if (SimplifyDemandedLowBitsHelper(0, 32) || + SimplifyDemandedLowBitsHelper(1, 5)) + return SDValue(N, 0); + } + + return combineROTR_ROTL_RORW_ROLW(N, DAG, Subtarget); + } case RISCVISD::CLZW: case RISCVISD::CTZW: { // Only the lower 32 bits of the first operand are read @@ -7667,7 +8718,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SimplifyDemandedLowBitsHelper(1, 5)) return SDValue(N, 0); - return combineGREVI_GORCI(N, DAG); + break; } case RISCVISD::SHFL: case RISCVISD::UNSHFL: { @@ -7682,10 +8733,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::SHFLW: case RISCVISD::UNSHFLW: { // Only the lower 32 bits of LHS and lower 4 bits of RHS are read. - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32); - APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 4); if (SimplifyDemandedLowBitsHelper(0, 32) || SimplifyDemandedLowBitsHelper(1, 4)) return SDValue(N, 0); @@ -7701,6 +8748,21 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } + case RISCVISD::FSR: + case RISCVISD::FSL: + case RISCVISD::FSRW: + case RISCVISD::FSLW: { + bool IsWInstruction = + N->getOpcode() == RISCVISD::FSRW || N->getOpcode() == RISCVISD::FSLW; + unsigned BitWidth = + IsWInstruction ? 32 : N->getSimpleValueType(0).getSizeInBits(); + assert(isPowerOf2_32(BitWidth) && "Unexpected bit width"); + // Only the lower log2(Bitwidth)+1 bits of the the shift amount are read. + if (SimplifyDemandedLowBitsHelper(1, Log2_32(BitWidth) + 1)) + return SDValue(N, 0); + + break; + } case RISCVISD::FMV_X_ANYEXTH: case RISCVISD::FMV_X_ANYEXTW_RV64: { SDLoc DL(N); @@ -7727,7 +8789,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; SDValue NewFMV = DAG.getNode(N->getOpcode(), DL, VT, Op0.getOperand(0)); unsigned FPBits = N->getOpcode() == RISCVISD::FMV_X_ANYEXTW_RV64 ? 32 : 16; - APInt SignBit = APInt::getSignMask(FPBits).sextOrSelf(VT.getSizeInBits()); + APInt SignBit = APInt::getSignMask(FPBits).sext(VT.getSizeInBits()); if (Op0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, NewFMV, DAG.getConstant(SignBit, DL, VT)); @@ -7741,13 +8803,21 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return performSUBCombine(N, DAG); case ISD::AND: - return performANDCombine(N, DAG); + return performANDCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DAG, Subtarget); case ISD::XOR: return performXORCombine(N, DAG); - case ISD::ANY_EXTEND: - return performANY_EXTENDCombine(N, DCI, Subtarget); + case ISD::FADD: + case ISD::UMAX: + case ISD::UMIN: + case ISD::SMAX: + case ISD::SMIN: + case ISD::FMAXNUM: + case ISD::FMINNUM: + return combineBinOpToReduce(N, DAG); + case ISD::SIGN_EXTEND_INREG: + return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::ZERO_EXTEND: // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during // type legalization. 
This is safe because fp_to_uint produces poison if @@ -7879,6 +8949,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case ISD::BITREVERSE: + return performBITREVERSECombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return performFP_TO_INTCombine(N, DCI, Subtarget); @@ -7952,40 +9024,41 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DL, IndexVT, Index); } - unsigned Scale = cast(ScaleOp)->getZExtValue(); - if (IsIndexScaled && Scale != 1) { - // Manually scale the indices by the element size. + if (IsIndexScaled) { + // Manually scale the indices. // TODO: Sanitize the scale operand here? // TODO: For VP nodes, should we use VP_SHL here? + unsigned Scale = cast(ScaleOp)->getZExtValue(); assert(isPowerOf2_32(Scale) && "Expecting power-of-two types"); SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT); Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale); + ScaleOp = DAG.getTargetConstant(1, DL, ScaleOp.getValueType()); } - ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED; + ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_SCALED; if (const auto *VPGN = dyn_cast(N)) return DAG.getGatherVP(N->getVTList(), VPGN->getMemoryVT(), DL, {VPGN->getChain(), VPGN->getBasePtr(), Index, - VPGN->getScale(), VPGN->getMask(), + ScaleOp, VPGN->getMask(), VPGN->getVectorLength()}, VPGN->getMemOperand(), NewIndexTy); if (const auto *VPSN = dyn_cast(N)) return DAG.getScatterVP(N->getVTList(), VPSN->getMemoryVT(), DL, {VPSN->getChain(), VPSN->getValue(), - VPSN->getBasePtr(), Index, VPSN->getScale(), + VPSN->getBasePtr(), Index, ScaleOp, VPSN->getMask(), VPSN->getVectorLength()}, VPSN->getMemOperand(), NewIndexTy); if (const auto *MGN = dyn_cast(N)) return DAG.getMaskedGather( N->getVTList(), MGN->getMemoryVT(), DL, {MGN->getChain(), MGN->getPassThru(), MGN->getMask(), - MGN->getBasePtr(), Index, MGN->getScale()}, + MGN->getBasePtr(), Index, ScaleOp}, MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType()); const auto *MSN = cast(N); return DAG.getMaskedScatter( N->getVTList(), MSN->getMemoryVT(), DL, {MSN->getChain(), MSN->getValue(), MSN->getMask(), MSN->getBasePtr(), - Index, MSN->getScale()}, + Index, ScaleOp}, MSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore()); } case RISCVISD::SRA_VL: @@ -7997,14 +9070,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDLoc DL(N); SDValue VL = N->getOperand(3); EVT VT = N->getValueType(0); - ShAmt = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, ShAmt.getOperand(0), VL); + ShAmt = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + ShAmt.getOperand(1), VL); return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt, N->getOperand(2), N->getOperand(3)); } break; } case ISD::SRA: + if (SDValue V = performSRACombine(N, DAG, Subtarget)) + return V; + LLVM_FALLTHROUGH; case ISD::SRL: case ISD::SHL: { SDValue ShAmt = N->getOperand(1); @@ -8012,17 +9088,63 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // We don't need the upper 32 bits of a 64-bit element for a shift amount. 
SDLoc DL(N); EVT VT = N->getValueType(0); - ShAmt = - DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VT, ShAmt.getOperand(0)); + ShAmt = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + ShAmt.getOperand(1), + DAG.getRegister(RISCV::X0, Subtarget.getXLenVT())); return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt); } break; } + case RISCVISD::ADD_VL: + if (SDValue V = combineADDSUB_VLToVWADDSUB_VL(N, DAG, /*Commute*/ false)) + return V; + return combineADDSUB_VLToVWADDSUB_VL(N, DAG, /*Commute*/ true); + case RISCVISD::SUB_VL: + return combineADDSUB_VLToVWADDSUB_VL(N, DAG); + case RISCVISD::VWADD_W_VL: + case RISCVISD::VWADDU_W_VL: + case RISCVISD::VWSUB_W_VL: + case RISCVISD::VWSUBU_W_VL: + return combineVWADD_W_VL_VWSUB_W_VL(N, DAG); case RISCVISD::MUL_VL: if (SDValue V = combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ false)) return V; // Mul is commutative. return combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ true); + case RISCVISD::VFMADD_VL: + case RISCVISD::VFNMADD_VL: + case RISCVISD::VFMSUB_VL: + case RISCVISD::VFNMSUB_VL: { + // Fold FNEG_VL into FMA opcodes. + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + SDValue C = N->getOperand(2); + SDValue Mask = N->getOperand(3); + SDValue VL = N->getOperand(4); + + auto invertIfNegative = [&Mask, &VL](SDValue &V) { + if (V.getOpcode() == RISCVISD::FNEG_VL && V.getOperand(1) == Mask && + V.getOperand(2) == VL) { + // Return the negated input. + V = V.getOperand(0); + return true; + } + + return false; + }; + + bool NegA = invertIfNegative(A); + bool NegB = invertIfNegative(B); + bool NegC = invertIfNegative(C); + + // If no operands are negated, we're done. + if (!NegA && !NegB && !NegC) + return SDValue(); + + unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); + return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), A, B, C, Mask, + VL); + } case ISD::STORE: { auto *Store = cast(N); SDValue Val = Store->getValue(); @@ -8035,7 +9157,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // The memory VT and the element type must match. if (VecVT.getVectorElementType() == MemVT) { SDLoc DL(N); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount()); + MVT MaskVT = getMaskTypeFor(VecVT); return DAG.getStoreVP( Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(), DAG.getConstant(1, DL, MaskVT), @@ -8047,6 +9169,73 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::SPLAT_VECTOR: { + EVT VT = N->getValueType(0); + // Only perform this combine on legal MVT types. + if (!isTypeLegal(VT)) + break; + if (auto Gather = matchSplatAsGather(N->getOperand(0), VT.getSimpleVT(), N, + DAG, Subtarget)) + return Gather; + break; + } + case RISCVISD::VMV_V_X_VL: { + // Tail agnostic VMV.V.X only demands the vector element bitwidth from the + // scalar input. + unsigned ScalarSize = N->getOperand(1).getValueSizeInBits(); + unsigned EltWidth = N->getValueType(0).getScalarSizeInBits(); + if (ScalarSize > EltWidth && N->getOperand(0).isUndef()) + if (SimplifyDemandedLowBitsHelper(1, EltWidth)) + return SDValue(N, 0); + + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = N->getConstantOperandVal(0); + switch (IntNo) { + // By default we do not combine any intrinsic. 
+ default: + return SDValue(); + case Intrinsic::riscv_vcpop: + case Intrinsic::riscv_vcpop_mask: + case Intrinsic::riscv_vfirst: + case Intrinsic::riscv_vfirst_mask: { + SDValue VL = N->getOperand(2); + if (IntNo == Intrinsic::riscv_vcpop_mask || + IntNo == Intrinsic::riscv_vfirst_mask) + VL = N->getOperand(3); + if (!isNullConstant(VL)) + return SDValue(); + // If VL is 0, vcpop -> li 0, vfirst -> li -1. + SDLoc DL(N); + EVT VT = N->getValueType(0); + if (IntNo == Intrinsic::riscv_vfirst || + IntNo == Intrinsic::riscv_vfirst_mask) + return DAG.getConstant(-1, DL, VT); + return DAG.getConstant(0, DL, VT); + } + } + } + case ISD::BITCAST: { + assert(Subtarget.useRVVForFixedLengthVectors()); + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + // If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer + // type, widen both sides to avoid a trip through memory. + if ((SrcVT == MVT::v1i1 || SrcVT == MVT::v2i1 || SrcVT == MVT::v4i1) && + VT.isScalarInteger()) { + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); + SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); + Ops[0] = N0; + SDLoc DL(N); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i1, Ops); + N0 = DAG.getBitcast(MVT::i8, N0); + return DAG.getNode(ISD::TRUNCATE, DL, VT, N0); + } + + return SDValue(); + } } return SDValue(); @@ -8182,22 +9371,23 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant( return UseMask(NewMask); } -static void computeGREV(APInt &Src, unsigned ShAmt) { - ShAmt &= Src.getBitWidth() - 1; - uint64_t x = Src.getZExtValue(); - if (ShAmt & 1) - x = ((x & 0x5555555555555555LL) << 1) | ((x & 0xAAAAAAAAAAAAAAAALL) >> 1); - if (ShAmt & 2) - x = ((x & 0x3333333333333333LL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2); - if (ShAmt & 4) - x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4); - if (ShAmt & 8) - x = ((x & 0x00FF00FF00FF00FFLL) << 8) | ((x & 0xFF00FF00FF00FF00LL) >> 8); - if (ShAmt & 16) - x = ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16); - if (ShAmt & 32) - x = ((x & 0x00000000FFFFFFFFLL) << 32) | ((x & 0xFFFFFFFF00000000LL) >> 32); - Src = x; +static uint64_t computeGREVOrGORC(uint64_t x, unsigned ShAmt, bool IsGORC) { + static const uint64_t GREVMasks[] = { + 0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; + + for (unsigned Stage = 0; Stage != 6; ++Stage) { + unsigned Shift = 1 << Stage; + if (ShAmt & Shift) { + uint64_t Mask = GREVMasks[Stage]; + uint64_t Res = ((x & Mask) << Shift) | ((x >> Shift) & Mask); + if (IsGORC) + Res |= x; + x = Res; + } + } + + return x; } void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, @@ -8263,28 +9453,28 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case RISCVISD::GREV: - case RISCVISD::GREVW: { + case RISCVISD::GORC: { if (auto *C = dyn_cast(Op.getOperand(1))) { Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); - if (Opc == RISCVISD::GREVW) - Known = Known.trunc(32); - unsigned ShAmt = C->getZExtValue(); - computeGREV(Known.Zero, ShAmt); - computeGREV(Known.One, ShAmt); - if (Opc == RISCVISD::GREVW) - Known = Known.sext(BitWidth); + unsigned ShAmt = C->getZExtValue() & (Known.getBitWidth() - 1); + bool IsGORC = Op.getOpcode() == RISCVISD::GORC; + // To compute zeros, we need to invert the value and invert it back after. 
+ Known.Zero = + ~computeGREVOrGORC(~Known.Zero.getZExtValue(), ShAmt, IsGORC); + Known.One = computeGREVOrGORC(Known.One.getZExtValue(), ShAmt, IsGORC); } break; } case RISCVISD::READ_VLENB: { - // If we know the minimum VLen from Zvl extensions, we can use that to - // determine the trailing zeros of VLENB. - // FIXME: Limit to 128 bit vectors until we have more testing. - unsigned MinVLenB = std::min(128U, Subtarget.getMinVLen()) / 8; - if (MinVLenB > 0) - Known.Zero.setLowBits(Log2_32(MinVLenB)); - // We assume VLENB is no more than 65536 / 8 bytes. - Known.Zero.setBitsFrom(14); + // We can use the minimum and maximum VLEN values to bound VLENB. We + // know VLEN must be a power of two. + const unsigned MinVLenB = Subtarget.getRealMinVLen() / 8; + const unsigned MaxVLenB = Subtarget.getRealMaxVLen() / 8; + assert(MinVLenB > 0 && "READ_VLENB without vector extension enabled?"); + Known.Zero.setLowBits(Log2_32(MinVLenB)); + Known.Zero.setBitsFrom(Log2_32(MaxVLenB)+1); + if (MaxVLenB == MinVLenB) + Known.One.setBit(Log2_32(MinVLenB)); break; } case ISD::INTRINSIC_W_CHAIN: @@ -8381,6 +9571,51 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( return 1; } +const Constant * +RISCVTargetLowering::getTargetConstantFromLoad(LoadSDNode *Ld) const { + assert(Ld && "Unexpected null LoadSDNode"); + if (!ISD::isNormalLoad(Ld)) + return nullptr; + + SDValue Ptr = Ld->getBasePtr(); + + // Only constant pools with no offset are supported. + auto GetSupportedConstantPool = [](SDValue Ptr) -> ConstantPoolSDNode * { + auto *CNode = dyn_cast(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry() || + CNode->getOffset() != 0) + return nullptr; + + return CNode; + }; + + // Simple case, LLA. + if (Ptr.getOpcode() == RISCVISD::LLA) { + auto *CNode = GetSupportedConstantPool(Ptr); + if (!CNode || CNode->getTargetFlags() != 0) + return nullptr; + + return CNode->getConstVal(); + } + + // Look for a HI and ADD_LO pair. + if (Ptr.getOpcode() != RISCVISD::ADD_LO || + Ptr.getOperand(0).getOpcode() != RISCVISD::HI) + return nullptr; + + auto *CNodeLo = GetSupportedConstantPool(Ptr.getOperand(1)); + auto *CNodeHi = GetSupportedConstantPool(Ptr.getOperand(0).getOperand(0)); + + if (!CNodeLo || CNodeLo->getTargetFlags() != RISCVII::MO_LO || + !CNodeHi || CNodeHi->getTargetFlags() != RISCVII::MO_HI) + return nullptr; + + if (CNodeLo->getConstVal() != CNodeHi->getConstVal()) + return nullptr; + + return CNodeLo->getConstVal(); +} + static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); @@ -8559,6 +9794,109 @@ static MachineBasicBlock *emitQuietFCMP(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock * +EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, + MachineBasicBlock *ThisMBB, + const RISCVSubtarget &Subtarget) { + // Select_FPRX_ (rs1, rs2, imm, rs4, (Select_FPRX_ rs1, rs2, imm, rs4, rs5) + // Without this, custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both Select_FPRX_ in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... 
+  // D: empty
+  // E: PHI [X, A], [X, C], [Y, D]
+
+  const RISCVInstrInfo &TII = *Subtarget.getInstrInfo();
+  const DebugLoc &DL = First.getDebugLoc();
+  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+  MachineFunction *F = ThisMBB->getParent();
+  MachineBasicBlock *FirstMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *SecondMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = ++ThisMBB->getIterator();
+  F->insert(It, FirstMBB);
+  F->insert(It, SecondMBB);
+  F->insert(It, SinkMBB);
+
+  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
+  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+                  std::next(MachineBasicBlock::iterator(First)),
+                  ThisMBB->end());
+  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
+
+  // Fallthrough block for ThisMBB.
+  ThisMBB->addSuccessor(FirstMBB);
+  // Fallthrough block for FirstMBB.
+  FirstMBB->addSuccessor(SecondMBB);
+  ThisMBB->addSuccessor(SinkMBB);
+  FirstMBB->addSuccessor(SinkMBB);
+  // This is fallthrough.
+  SecondMBB->addSuccessor(SinkMBB);
+
+  auto FirstCC = static_cast<RISCVCC::CondCode>(First.getOperand(3).getImm());
+  Register FLHS = First.getOperand(1).getReg();
+  Register FRHS = First.getOperand(2).getReg();
+  // Insert appropriate branch.
+  BuildMI(ThisMBB, DL, TII.getBrCond(FirstCC))
+      .addReg(FLHS)
+      .addReg(FRHS)
+      .addMBB(SinkMBB);
+
+  Register SLHS = Second.getOperand(1).getReg();
+  Register SRHS = Second.getOperand(2).getReg();
+  Register Op1Reg4 = First.getOperand(4).getReg();
+  Register Op1Reg5 = First.getOperand(5).getReg();
+
+  auto SecondCC = static_cast<RISCVCC::CondCode>(Second.getOperand(3).getImm());
+  // Insert appropriate branch.
+  BuildMI(FirstMBB, DL, TII.getBrCond(SecondCC))
+      .addReg(SLHS)
+      .addReg(SRHS)
+      .addMBB(SinkMBB);
+
+  Register DestReg = Second.getOperand(0).getReg();
+  Register Op2Reg4 = Second.getOperand(4).getReg();
+  BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII.get(RISCV::PHI), DestReg)
+      .addReg(Op1Reg4)
+      .addMBB(ThisMBB)
+      .addReg(Op2Reg4)
+      .addMBB(FirstMBB)
+      .addReg(Op1Reg5)
+      .addMBB(SecondMBB);
+
+  // Now remove the Select_FPRX_s.
+  First.eraseFromParent();
+  Second.eraseFromParent();
+  return SinkMBB;
+}
+
 static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
                                            MachineBasicBlock *BB,
                                            const RISCVSubtarget &Subtarget) {
@@ -8586,6 +9924,10 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   // previous selects in the sequence.
   // These conditions could be further relaxed. See the X86 target for a
   // related approach and more information.
+  //
+  // Select_FPRX_ (rs1, rs2, imm, rs4, (Select_FPRX_ rs1, rs2, imm, rs4, rs5))
+  // is checked here and handled by a separate function -
+  // EmitLoweredCascadedSelect.
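+  //
+  // Roughly, source like the following (illustrative only) produces the
+  // shape this check looks for:
+  //   f = (a < b) ? x : ((c < d) ? x : y);
+  // where the inner select defines the false operand (rs5) of the outer one.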
Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); auto CC = static_cast(MI.getOperand(3).getImm()); @@ -8595,12 +9937,19 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, SelectDests.insert(MI.getOperand(0).getReg()); MachineInstr *LastSelectPseudo = &MI; + auto Next = next_nodbg(MI.getIterator(), BB->instr_end()); + if (MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR && Next != BB->end() && + Next->getOpcode() == MI.getOpcode() && + Next->getOperand(5).getReg() == MI.getOperand(0).getReg() && + Next->getOperand(5).isKill()) { + return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget); + } for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI); SequenceMBBI != E; ++SequenceMBBI) { if (SequenceMBBI->isDebugInstr()) continue; - else if (isSelectPseudo(*SequenceMBBI)) { + if (isSelectPseudo(*SequenceMBBI)) { if (SequenceMBBI->getOperand(1).getReg() != LHS || SequenceMBBI->getOperand(2).getReg() != RHS || SequenceMBBI->getOperand(3).getImm() != CC || @@ -8831,7 +10180,7 @@ static unsigned allocateRVVReg(MVT ValVT, unsigned ValNo, // Assign the first mask argument to V0. // This is an interim calling convention and it may be changed in the // future. - if (FirstMaskArgument.hasValue() && ValNo == FirstMaskArgument.getValue()) + if (FirstMaskArgument && ValNo == *FirstMaskArgument) return State.AllocateReg(RISCV::V0); return State.AllocateReg(ArgVRs); } @@ -10112,6 +11461,13 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BuildPairF64) NODE_NAME_CASE(SplitF64) NODE_NAME_CASE(TAIL) + NODE_NAME_CASE(ADD_LO) + NODE_NAME_CASE(HI) + NODE_NAME_CASE(LLA) + NODE_NAME_CASE(ADD_TPREL) + NODE_NAME_CASE(LA) + NODE_NAME_CASE(LA_TLS_IE) + NODE_NAME_CASE(LA_TLS_GD) NODE_NAME_CASE(MULHSU) NODE_NAME_CASE(SLLW) NODE_NAME_CASE(SRAW) @@ -10129,6 +11485,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FSR) NODE_NAME_CASE(FMV_H_X) NODE_NAME_CASE(FMV_X_ANYEXTH) + NODE_NAME_CASE(FMV_X_SIGNEXTH) NODE_NAME_CASE(FMV_W_X_RV64) NODE_NAME_CASE(FMV_X_ANYEXTW_RV64) NODE_NAME_CASE(FCVT_X) @@ -10157,7 +11514,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VMV_X_S) NODE_NAME_CASE(VMV_S_X_VL) NODE_NAME_CASE(VFMV_S_F_VL) - NODE_NAME_CASE(SPLAT_VECTOR_I64) NODE_NAME_CASE(SPLAT_VECTOR_SPLIT_I64_VL) NODE_NAME_CASE(READ_VLENB) NODE_NAME_CASE(TRUNCATE_VECTOR_VL) @@ -10203,7 +11559,10 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FNEG_VL) NODE_NAME_CASE(FABS_VL) NODE_NAME_CASE(FSQRT_VL) - NODE_NAME_CASE(FMA_VL) + NODE_NAME_CASE(VFMADD_VL) + NODE_NAME_CASE(VFNMADD_VL) + NODE_NAME_CASE(VFMSUB_VL) + NODE_NAME_CASE(VFNMSUB_VL) NODE_NAME_CASE(FCOPYSIGN_VL) NODE_NAME_CASE(SMIN_VL) NODE_NAME_CASE(SMAX_VL) @@ -10222,7 +11581,14 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VWMUL_VL) NODE_NAME_CASE(VWMULU_VL) NODE_NAME_CASE(VWMULSU_VL) + NODE_NAME_CASE(VWADD_VL) NODE_NAME_CASE(VWADDU_VL) + NODE_NAME_CASE(VWSUB_VL) + NODE_NAME_CASE(VWSUBU_VL) + NODE_NAME_CASE(VWADD_W_VL) + NODE_NAME_CASE(VWADDU_W_VL) + NODE_NAME_CASE(VWSUB_W_VL) + NODE_NAME_CASE(VWSUBU_W_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VP_MERGE_VL) @@ -10237,8 +11603,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VSEXT_VL) NODE_NAME_CASE(VZEXT_VL) NODE_NAME_CASE(VCPOP_VL) - NODE_NAME_CASE(VLE_VL) - 
NODE_NAME_CASE(VSE_VL) NODE_NAME_CASE(READ_CSR) NODE_NAME_CASE(WRITE_CSR) NODE_NAME_CASE(SWAP_CSR) @@ -10459,7 +11823,18 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + std::pair Res = + TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + + // If we picked one of the Zfinx register classes, remap it to the GPR class. + // FIXME: When Zfinx is supported in CodeGen this will need to take the + // Subtarget into account. + if (Res.second == &RISCV::GPRF16RegClass || + Res.second == &RISCV::GPRF32RegClass || + Res.second == &RISCV::GPRF64RegClass) + return std::make_pair(Res.first, &RISCV::GPRRegClass); + + return Res; } unsigned @@ -10681,7 +12056,8 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( return Result; } -bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { +bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT, + EVT DataVT) const { return false; } @@ -10797,7 +12173,7 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, APInt ImmS = Imm.ashr(Imm.countTrailingZeros()); if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() || (1 - ImmS).isPowerOf2()) - return true; + return true; } } } @@ -10805,8 +12181,8 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, return false; } -bool RISCVTargetLowering::isMulAddWithConstProfitable( - const SDValue &AddNode, const SDValue &ConstNode) const { +bool RISCVTargetLowering::isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const { // Let the DAGCombiner decide for vectors. EVT VT = AddNode.getValueType(); if (VT.isVector()) @@ -10831,9 +12207,13 @@ bool RISCVTargetLowering::isMulAddWithConstProfitable( bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { - if (!VT.isVector()) - return false; + if (!VT.isVector()) { + if (Fast) + *Fast = false; + return Subtarget.enableUnalignedScalarMem(); + } + // All vector implementations must support element alignment EVT ElemVT = VT.getVectorElementType(); if (Alignment >= ElemVT.getStoreSize()) { if (Fast) @@ -10847,7 +12227,7 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( bool RISCVTargetLowering::splitValueIntoRegisterParts( SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, Optional CC) const { - bool IsABIRegCopy = CC.hasValue(); + bool IsABIRegCopy = CC.has_value(); EVT ValueVT = Val.getValueType(); if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { // Cast the f16 to i16, extend to i32, pad with ones to make a float nan, @@ -10901,7 +12281,7 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, Optional CC) const { - bool IsABIRegCopy = CC.hasValue(); + bool IsABIRegCopy = CC.has_value(); if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { SDValue Val = Parts[0]; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 840a821870a7..eb013d4b6682 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -41,6 +41,21 @@ enum NodeType : unsigned { BuildPairF64, SplitF64, TAIL, + + // Add the Lo 12 bits from 
an address. Selected to ADDI.
+  ADD_LO,
+  // Get the Hi 20 bits from an address. Selected to LUI.
+  HI,
+
+  // Represents an AUIPC+ADDI pair. Selected to PseudoLLA.
+  LLA,
+
+  // Selected as PseudoAddTPRel. Used to emit a TP-relative relocation.
+  ADD_TPREL,
+
+  // Load address.
+  LA_TLS_GD,
+
   // Multiply high for signed x unsigned.
   MULHSU,
   // RV64I shifts, directly matching the semantics of the named RISC-V
@@ -75,6 +90,7 @@ enum NodeType : unsigned {
   //
   // FMV_H_X matches the semantics of the FMV.H.X.
   // FMV_X_ANYEXTH is similar to FMV.X.H but has an any-extended result.
+  // FMV_X_SIGNEXTH is similar to FMV.X.H but has a sign-extended result.
   // FMV_W_X_RV64 matches the semantics of the FMV.W.X.
   // FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
   //
@@ -82,6 +98,7 @@ enum NodeType : unsigned {
   // unnecessary GPR->FPR->GPR moves.
   FMV_H_X,
   FMV_X_ANYEXTH,
+  FMV_X_SIGNEXTH,
   FMV_W_X_RV64,
   FMV_X_ANYEXTW_RV64,
   // FP to XLen int conversions. Corresponds to fcvt.l(u).s/d/h on RV64 and
@@ -129,10 +146,12 @@ enum NodeType : unsigned {
   BFPW,
   // Vector Extension
   // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
-  // for the VL value to be used for the operation.
+  // for the VL value to be used for the operation. The first operand is the
+  // passthru operand.
   VMV_V_X_VL,
   // VFMV_V_F_VL matches the semantics of vfmv.v.f but includes an extra operand
-  // for the VL value to be used for the operation.
+  // for the VL value to be used for the operation. The first operand is the
+  // passthru operand.
   VFMV_V_F_VL,
   // VMV_X_S matches the semantics of vmv.x.s. The result is always XLenVT sign
   // extended from the vector element size.
@@ -141,11 +160,9 @@ enum NodeType : unsigned {
   VMV_S_X_VL,
   // VFMV_S_F_VL matches the semantics of vfmv.s.f. It carries a VL operand.
   VFMV_S_F_VL,
-  // Splats an i64 scalar to a vector type (with element type i64) where the
-  // scalar is a sign-extended i32.
-  SPLAT_VECTOR_I64,
   // Splats a 64-bit value that has been split into two i32 parts. This is
   // expanded late to two scalar stores and a stride 0 vector load.
+  // The first operand is the passthru operand.
   SPLAT_VECTOR_SPLIT_I64_VL,
   // Read VLENB CSR
   READ_VLENB,
@@ -158,9 +175,9 @@ enum NodeType : unsigned {
   // and the fifth the VL.
   VSLIDEUP_VL,
   VSLIDEDOWN_VL,
-  // Matches the semantics of vslide1up/slide1down. The first operand is the
-  // source vector, the second is the XLenVT scalar value. The third and fourth
-  // operands are the mask and VL operands.
+  // Matches the semantics of vslide1up/slide1down. The first operand is the
+  // passthru operand, the second is the source vector, and the third is the
+  // XLenVT scalar value. The fourth and fifth operands are the mask and VL operands.
   VSLIDE1UP_VL,
   VSLIDE1DOWN_VL,
   // Matches the semantics of the vid.v instruction, with a mask and VL
@@ -225,7 +242,10 @@ enum NodeType : unsigned {
   FNEG_VL,
   FABS_VL,
   FSQRT_VL,
-  FMA_VL,
+  VFMADD_VL,
+  VFNMADD_VL,
+  VFMSUB_VL,
+  VFNMSUB_VL,
   FCOPYSIGN_VL,
   SMIN_VL,
   SMAX_VL,
@@ -246,7 +266,14 @@ enum NodeType : unsigned {
   VWMUL_VL,
   VWMULU_VL,
   VWMULSU_VL,
+  VWADD_VL,
   VWADDU_VL,
+  VWSUB_VL,
+  VWSUBU_VL,
+  VWADD_W_VL,
+  VWADDU_W_VL,
+  VWSUB_W_VL,
+  VWSUBU_W_VL,
 
   // Vector compare producing a mask. Fourth operand is input mask. Fifth
   // operand is VL.
@@ -268,8 +295,8 @@ enum NodeType : unsigned {
   VMCLR_VL,
   VMSET_VL,
 
-  // Matches the semantics of vrgather.vx and vrgather.vv with an extra operand
-  // for VL.
+  // Matches the semantics of vrgather.vx and vrgather.vv with extra operands
+  // for passthru and VL.
Operands are (src, index, mask, passthru, vl). VRGATHER_VX_VL, VRGATHER_VV_VL, VRGATHEREI16_VV_VL, @@ -302,16 +329,21 @@ enum NodeType : unsigned { STRICT_FCVT_W_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCVT_WU_RV64, - // Memory opcodes start here. - VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE, - VSE_VL, - // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops! + + // Load address. + LA = ISD::FIRST_TARGET_MEMORY_OPCODE, + LA_TLS_IE, }; } // namespace RISCVISD +namespace RISCV { +// We use 64 bits as the known part in the scalable vector types. +static constexpr unsigned RVVBitsPerBlock = 64; +} // namespace RISCV + class RISCVTargetLowering : public TargetLowering { const RISCVSubtarget &Subtarget; @@ -333,11 +365,18 @@ public: bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; + bool signExtendConstant(const ConstantInt *CI) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; bool hasAndNotCompare(SDValue Y) const override; + bool hasBitTest(SDValue X, SDValue Y) const override; + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; @@ -384,6 +423,8 @@ public: const SelectionDAG &DAG, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; + // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; @@ -477,8 +518,6 @@ public: SelectionDAG &DAG) const override; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; - template - SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override { @@ -490,8 +529,8 @@ public: bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override; - bool isMulAddWithConstProfitable(const SDValue &AddNode, - const SDValue &ConstNode) const override; + bool isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const override; TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; @@ -526,6 +565,15 @@ public: Optional CC) const override; static RISCVII::VLMUL getLMUL(MVT VT); + inline static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, + unsigned MinSize) { + // Original equation: + // VLMAX = (VectorBits / EltSize) * LMUL + // where LMUL = MinSize / RISCV::RVVBitsPerBlock + // The following equations have been reordered to prevent loss of precision + // when calculating fractional LMUL. 
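+    // For example (illustrative), VectorBits=128, EltSize=32, MinSize=32
+    // (LMUL=1/2):
+    //   naive:     (128/32) * (32/64) = 4 * 0 = 0 in integer arithmetic,
+    //   reordered: ((128/32) * 32) / 64 = 2, the correct VLMAX.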
+ return ((VectorBits / EltSize) * MinSize) / RISCV::RVVBitsPerBlock; + }; static unsigned getRegClassIDForLMUL(RISCVII::VLMUL LMul); static unsigned getSubregIndexByMVT(MVT VT, unsigned Index); static unsigned getRegClassIDForVecVT(MVT VT); @@ -535,7 +583,7 @@ public: const RISCVRegisterInfo *TRI); MVT getContainerForFixedLengthVector(MVT VT) const; - bool shouldRemoveExtendFromGSIndex(EVT VT) const override; + bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override; bool isLegalElementTypeForRVV(Type *ScalarTy) const; @@ -571,6 +619,8 @@ private: bool IsRet, CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const; + template + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, bool UseGOT) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; @@ -591,7 +641,9 @@ private: SDValue lowerVectorMaskSplat(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, int64_t ExtTrueVal) const; - SDValue lowerVectorMaskTrunc(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorMaskTruncLike(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorTruncLike(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorFPExtendOrRoundLike(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -606,6 +658,7 @@ private: SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const; @@ -627,11 +680,17 @@ private: SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG, unsigned RISCVISDOpc) const; SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, unsigned VecOpc) const; + SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG, + unsigned RISCVISDOpc) const; SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; + SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const; SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const; @@ -665,21 +724,15 @@ private: return false; }; }; - -namespace RISCV { -// We use 64 bits as the known part in the scalable vector types. -static constexpr unsigned RVVBitsPerBlock = 64; -} // namespace RISCV - namespace RISCVVIntrinsicsTable { struct RISCVVIntrinsicInfo { unsigned IntrinsicID; - uint8_t SplatOperand; + uint8_t ScalarOperand; uint8_t VLOperand; - bool hasSplatOperand() const { - // 0xF is not valid. See NoSplatOperand in IntrinsicsRISCV.td. - return SplatOperand != 0xF; + bool hasScalarOperand() const { + // 0xF is not valid. See NoScalarOperand in IntrinsicsRISCV.td. 
+    return ScalarOperand != 0xF;
   }
   bool hasVLOperand() const {
     // 0x1F is not valid. See NoVLOperand in IntrinsicsRISCV.td.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 649eb57b325b..fc0a983f6542 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a function pass that inserts VSETVLI instructions where
-// needed.
+// needed and expands the vl outputs of VLEFF/VLSEGFF to PseudoReadVL
+// instructions.
 //
 // This pass consists of 3 phases:
 //
@@ -37,8 +38,371 @@ static cl::opt<bool> DisableInsertVSETVLPHIOpt(
     "riscv-disable-insert-vsetvl-phi-opt", cl::init(false), cl::Hidden,
     cl::desc("Disable looking through phis when inserting vsetvlis."));
 
+static cl::opt<bool> UseStrictAsserts(
+    "riscv-insert-vsetvl-strict-asserts", cl::init(true), cl::Hidden,
+    cl::desc("Enable strict assertion checking for the dataflow algorithm"));
+
 namespace {
 
+static unsigned getVLOpNum(const MachineInstr &MI) {
+  return RISCVII::getVLOpNum(MI.getDesc());
+}
+
+static unsigned getSEWOpNum(const MachineInstr &MI) {
+  return RISCVII::getSEWOpNum(MI.getDesc());
+}
+
+static bool isScalarMoveInstr(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case RISCV::PseudoVMV_S_X_M1:
+  case RISCV::PseudoVMV_S_X_M2:
+  case RISCV::PseudoVMV_S_X_M4:
+  case RISCV::PseudoVMV_S_X_M8:
+  case RISCV::PseudoVMV_S_X_MF2:
+  case RISCV::PseudoVMV_S_X_MF4:
+  case RISCV::PseudoVMV_S_X_MF8:
+  case RISCV::PseudoVFMV_S_F16_M1:
+  case RISCV::PseudoVFMV_S_F16_M2:
+  case RISCV::PseudoVFMV_S_F16_M4:
+  case RISCV::PseudoVFMV_S_F16_M8:
+  case RISCV::PseudoVFMV_S_F16_MF2:
+  case RISCV::PseudoVFMV_S_F16_MF4:
+  case RISCV::PseudoVFMV_S_F32_M1:
+  case RISCV::PseudoVFMV_S_F32_M2:
+  case RISCV::PseudoVFMV_S_F32_M4:
+  case RISCV::PseudoVFMV_S_F32_M8:
+  case RISCV::PseudoVFMV_S_F32_MF2:
+  case RISCV::PseudoVFMV_S_F64_M1:
+  case RISCV::PseudoVFMV_S_F64_M2:
+  case RISCV::PseudoVFMV_S_F64_M4:
+  case RISCV::PseudoVFMV_S_F64_M8:
+    return true;
+  }
+}
+
+/// Get the EEW for a load or store instruction. Return None if MI is not
+/// a load or store which ignores SEW.
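+/// For example, PseudoVLE32_V_M1 always has EEW 32 regardless of the current
+/// SEW, while an arithmetic op such as PseudoVADD_VV_M1 has no fixed EEW and
+/// yields None.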
+static Optional getEEWForLoadStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return None; + case RISCV::PseudoVLE8_V_M1: + case RISCV::PseudoVLE8_V_M1_MASK: + case RISCV::PseudoVLE8_V_M2: + case RISCV::PseudoVLE8_V_M2_MASK: + case RISCV::PseudoVLE8_V_M4: + case RISCV::PseudoVLE8_V_M4_MASK: + case RISCV::PseudoVLE8_V_M8: + case RISCV::PseudoVLE8_V_M8_MASK: + case RISCV::PseudoVLE8_V_MF2: + case RISCV::PseudoVLE8_V_MF2_MASK: + case RISCV::PseudoVLE8_V_MF4: + case RISCV::PseudoVLE8_V_MF4_MASK: + case RISCV::PseudoVLE8_V_MF8: + case RISCV::PseudoVLE8_V_MF8_MASK: + case RISCV::PseudoVLSE8_V_M1: + case RISCV::PseudoVLSE8_V_M1_MASK: + case RISCV::PseudoVLSE8_V_M2: + case RISCV::PseudoVLSE8_V_M2_MASK: + case RISCV::PseudoVLSE8_V_M4: + case RISCV::PseudoVLSE8_V_M4_MASK: + case RISCV::PseudoVLSE8_V_M8: + case RISCV::PseudoVLSE8_V_M8_MASK: + case RISCV::PseudoVLSE8_V_MF2: + case RISCV::PseudoVLSE8_V_MF2_MASK: + case RISCV::PseudoVLSE8_V_MF4: + case RISCV::PseudoVLSE8_V_MF4_MASK: + case RISCV::PseudoVLSE8_V_MF8: + case RISCV::PseudoVLSE8_V_MF8_MASK: + case RISCV::PseudoVSE8_V_M1: + case RISCV::PseudoVSE8_V_M1_MASK: + case RISCV::PseudoVSE8_V_M2: + case RISCV::PseudoVSE8_V_M2_MASK: + case RISCV::PseudoVSE8_V_M4: + case RISCV::PseudoVSE8_V_M4_MASK: + case RISCV::PseudoVSE8_V_M8: + case RISCV::PseudoVSE8_V_M8_MASK: + case RISCV::PseudoVSE8_V_MF2: + case RISCV::PseudoVSE8_V_MF2_MASK: + case RISCV::PseudoVSE8_V_MF4: + case RISCV::PseudoVSE8_V_MF4_MASK: + case RISCV::PseudoVSE8_V_MF8: + case RISCV::PseudoVSE8_V_MF8_MASK: + case RISCV::PseudoVSSE8_V_M1: + case RISCV::PseudoVSSE8_V_M1_MASK: + case RISCV::PseudoVSSE8_V_M2: + case RISCV::PseudoVSSE8_V_M2_MASK: + case RISCV::PseudoVSSE8_V_M4: + case RISCV::PseudoVSSE8_V_M4_MASK: + case RISCV::PseudoVSSE8_V_M8: + case RISCV::PseudoVSSE8_V_M8_MASK: + case RISCV::PseudoVSSE8_V_MF2: + case RISCV::PseudoVSSE8_V_MF2_MASK: + case RISCV::PseudoVSSE8_V_MF4: + case RISCV::PseudoVSSE8_V_MF4_MASK: + case RISCV::PseudoVSSE8_V_MF8: + case RISCV::PseudoVSSE8_V_MF8_MASK: + return 8; + case RISCV::PseudoVLE16_V_M1: + case RISCV::PseudoVLE16_V_M1_MASK: + case RISCV::PseudoVLE16_V_M2: + case RISCV::PseudoVLE16_V_M2_MASK: + case RISCV::PseudoVLE16_V_M4: + case RISCV::PseudoVLE16_V_M4_MASK: + case RISCV::PseudoVLE16_V_M8: + case RISCV::PseudoVLE16_V_M8_MASK: + case RISCV::PseudoVLE16_V_MF2: + case RISCV::PseudoVLE16_V_MF2_MASK: + case RISCV::PseudoVLE16_V_MF4: + case RISCV::PseudoVLE16_V_MF4_MASK: + case RISCV::PseudoVLSE16_V_M1: + case RISCV::PseudoVLSE16_V_M1_MASK: + case RISCV::PseudoVLSE16_V_M2: + case RISCV::PseudoVLSE16_V_M2_MASK: + case RISCV::PseudoVLSE16_V_M4: + case RISCV::PseudoVLSE16_V_M4_MASK: + case RISCV::PseudoVLSE16_V_M8: + case RISCV::PseudoVLSE16_V_M8_MASK: + case RISCV::PseudoVLSE16_V_MF2: + case RISCV::PseudoVLSE16_V_MF2_MASK: + case RISCV::PseudoVLSE16_V_MF4: + case RISCV::PseudoVLSE16_V_MF4_MASK: + case RISCV::PseudoVSE16_V_M1: + case RISCV::PseudoVSE16_V_M1_MASK: + case RISCV::PseudoVSE16_V_M2: + case RISCV::PseudoVSE16_V_M2_MASK: + case RISCV::PseudoVSE16_V_M4: + case RISCV::PseudoVSE16_V_M4_MASK: + case RISCV::PseudoVSE16_V_M8: + case RISCV::PseudoVSE16_V_M8_MASK: + case RISCV::PseudoVSE16_V_MF2: + case RISCV::PseudoVSE16_V_MF2_MASK: + case RISCV::PseudoVSE16_V_MF4: + case RISCV::PseudoVSE16_V_MF4_MASK: + case RISCV::PseudoVSSE16_V_M1: + case RISCV::PseudoVSSE16_V_M1_MASK: + case RISCV::PseudoVSSE16_V_M2: + case RISCV::PseudoVSSE16_V_M2_MASK: + case RISCV::PseudoVSSE16_V_M4: + case RISCV::PseudoVSSE16_V_M4_MASK: + case 
RISCV::PseudoVSSE16_V_M8: + case RISCV::PseudoVSSE16_V_M8_MASK: + case RISCV::PseudoVSSE16_V_MF2: + case RISCV::PseudoVSSE16_V_MF2_MASK: + case RISCV::PseudoVSSE16_V_MF4: + case RISCV::PseudoVSSE16_V_MF4_MASK: + return 16; + case RISCV::PseudoVLE32_V_M1: + case RISCV::PseudoVLE32_V_M1_MASK: + case RISCV::PseudoVLE32_V_M2: + case RISCV::PseudoVLE32_V_M2_MASK: + case RISCV::PseudoVLE32_V_M4: + case RISCV::PseudoVLE32_V_M4_MASK: + case RISCV::PseudoVLE32_V_M8: + case RISCV::PseudoVLE32_V_M8_MASK: + case RISCV::PseudoVLE32_V_MF2: + case RISCV::PseudoVLE32_V_MF2_MASK: + case RISCV::PseudoVLSE32_V_M1: + case RISCV::PseudoVLSE32_V_M1_MASK: + case RISCV::PseudoVLSE32_V_M2: + case RISCV::PseudoVLSE32_V_M2_MASK: + case RISCV::PseudoVLSE32_V_M4: + case RISCV::PseudoVLSE32_V_M4_MASK: + case RISCV::PseudoVLSE32_V_M8: + case RISCV::PseudoVLSE32_V_M8_MASK: + case RISCV::PseudoVLSE32_V_MF2: + case RISCV::PseudoVLSE32_V_MF2_MASK: + case RISCV::PseudoVSE32_V_M1: + case RISCV::PseudoVSE32_V_M1_MASK: + case RISCV::PseudoVSE32_V_M2: + case RISCV::PseudoVSE32_V_M2_MASK: + case RISCV::PseudoVSE32_V_M4: + case RISCV::PseudoVSE32_V_M4_MASK: + case RISCV::PseudoVSE32_V_M8: + case RISCV::PseudoVSE32_V_M8_MASK: + case RISCV::PseudoVSE32_V_MF2: + case RISCV::PseudoVSE32_V_MF2_MASK: + case RISCV::PseudoVSSE32_V_M1: + case RISCV::PseudoVSSE32_V_M1_MASK: + case RISCV::PseudoVSSE32_V_M2: + case RISCV::PseudoVSSE32_V_M2_MASK: + case RISCV::PseudoVSSE32_V_M4: + case RISCV::PseudoVSSE32_V_M4_MASK: + case RISCV::PseudoVSSE32_V_M8: + case RISCV::PseudoVSSE32_V_M8_MASK: + case RISCV::PseudoVSSE32_V_MF2: + case RISCV::PseudoVSSE32_V_MF2_MASK: + return 32; + case RISCV::PseudoVLE64_V_M1: + case RISCV::PseudoVLE64_V_M1_MASK: + case RISCV::PseudoVLE64_V_M2: + case RISCV::PseudoVLE64_V_M2_MASK: + case RISCV::PseudoVLE64_V_M4: + case RISCV::PseudoVLE64_V_M4_MASK: + case RISCV::PseudoVLE64_V_M8: + case RISCV::PseudoVLE64_V_M8_MASK: + case RISCV::PseudoVLSE64_V_M1: + case RISCV::PseudoVLSE64_V_M1_MASK: + case RISCV::PseudoVLSE64_V_M2: + case RISCV::PseudoVLSE64_V_M2_MASK: + case RISCV::PseudoVLSE64_V_M4: + case RISCV::PseudoVLSE64_V_M4_MASK: + case RISCV::PseudoVLSE64_V_M8: + case RISCV::PseudoVLSE64_V_M8_MASK: + case RISCV::PseudoVSE64_V_M1: + case RISCV::PseudoVSE64_V_M1_MASK: + case RISCV::PseudoVSE64_V_M2: + case RISCV::PseudoVSE64_V_M2_MASK: + case RISCV::PseudoVSE64_V_M4: + case RISCV::PseudoVSE64_V_M4_MASK: + case RISCV::PseudoVSE64_V_M8: + case RISCV::PseudoVSE64_V_M8_MASK: + case RISCV::PseudoVSSE64_V_M1: + case RISCV::PseudoVSSE64_V_M1_MASK: + case RISCV::PseudoVSSE64_V_M2: + case RISCV::PseudoVSSE64_V_M2_MASK: + case RISCV::PseudoVSSE64_V_M4: + case RISCV::PseudoVSSE64_V_M4_MASK: + case RISCV::PseudoVSSE64_V_M8: + case RISCV::PseudoVSSE64_V_M8_MASK: + return 64; + } +} + +/// Return true if this is an operation on mask registers. Note that +/// this includes both arithmetic/logical ops and load/store (vlm/vsm). +static bool isMaskRegOp(const MachineInstr &MI) { + if (RISCVII::hasSEWOp(MI.getDesc().TSFlags)) { + const unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm(); + // A Log2SEW of 0 is an operation on mask registers only. + return Log2SEW == 0; + } + return false; +} + +static unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) { + unsigned LMul; + bool Fractional; + std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(VLMul); + + // Convert LMul to a fixed point value with 3 fractional bits. + LMul = Fractional ? 
(8 / LMul) : (LMul * 8);
+
+  assert(SEW >= 8 && "Unexpected SEW value");
+  return (SEW * 8) / LMul;
+}
+
+/// Which subfields of VL or VTYPE have values we need to preserve?
+struct DemandedFields {
+  bool VL = false;
+  bool SEW = false;
+  bool LMUL = false;
+  bool SEWLMULRatio = false;
+  bool TailPolicy = false;
+  bool MaskPolicy = false;
+
+  // Return true if any part of VTYPE was used
+  bool usedVTYPE() {
+    return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy;
+  }
+
+  // Mark all VTYPE subfields and properties as demanded
+  void demandVTYPE() {
+    SEW = true;
+    LMUL = true;
+    SEWLMULRatio = true;
+    TailPolicy = true;
+    MaskPolicy = true;
+  }
+};
+
+/// Return true if the two values of the VTYPE register provided are
+/// indistinguishable from the perspective of an instruction (or set of
+/// instructions) which use only the Used subfields and properties.
+static bool areCompatibleVTYPEs(uint64_t VType1,
+                                uint64_t VType2,
+                                const DemandedFields &Used) {
+  if (Used.SEW &&
+      RISCVVType::getSEW(VType1) != RISCVVType::getSEW(VType2))
+    return false;
+
+  if (Used.LMUL &&
+      RISCVVType::getVLMUL(VType1) != RISCVVType::getVLMUL(VType2))
+    return false;
+
+  if (Used.SEWLMULRatio) {
+    auto Ratio1 = getSEWLMULRatio(RISCVVType::getSEW(VType1),
+                                  RISCVVType::getVLMUL(VType1));
+    auto Ratio2 = getSEWLMULRatio(RISCVVType::getSEW(VType2),
+                                  RISCVVType::getVLMUL(VType2));
+    if (Ratio1 != Ratio2)
+      return false;
+  }
+
+  if (Used.TailPolicy &&
+      RISCVVType::isTailAgnostic(VType1) != RISCVVType::isTailAgnostic(VType2))
+    return false;
+  if (Used.MaskPolicy &&
+      RISCVVType::isMaskAgnostic(VType1) != RISCVVType::isMaskAgnostic(VType2))
+    return false;
+  return true;
+}
+
+/// Return the fields and properties demanded by the provided instruction.
+static DemandedFields getDemanded(const MachineInstr &MI) {
+  // Warning: This function has to work on both the lowered (i.e. post
+  // emitVSETVLIs) and pre-lowering forms. The main implication of this is
+  // that it can't use the value of a SEW, VL, or Policy operand as they might
+  // be stale after lowering.
+
+  // Most instructions don't use any of these subfields.
+  DemandedFields Res;
+  // Start conservative if registers are used
+  if (MI.isCall() || MI.isInlineAsm() || MI.readsRegister(RISCV::VL))
+    Res.VL = true;
+  if (MI.isCall() || MI.isInlineAsm() || MI.readsRegister(RISCV::VTYPE))
+    Res.demandVTYPE();
+  // Start conservative on the unlowered form too
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  if (RISCVII::hasSEWOp(TSFlags)) {
+    Res.demandVTYPE();
+    if (RISCVII::hasVLOp(TSFlags))
+      Res.VL = true;
+  }
+
+  // Loads and stores with implicit EEW do not demand SEW or LMUL directly.
+  // They instead demand the ratio of the two which is used in computing
+  // EMUL, but which allows us the flexibility to change SEW and LMUL
+  // provided we don't change the ratio.
+  // Note: We assume that the instruction's initial SEW is the EEW encoded
+  // in the opcode. This is asserted when constructing the VSETVLIInfo.
+  if (getEEWForLoadStore(MI)) {
+    Res.SEW = false;
+    Res.LMUL = false;
+  }
+
+  // Store instructions don't use the policy fields.
+  if (RISCVII::hasSEWOp(TSFlags) && MI.getNumExplicitDefs() == 0) {
+    Res.TailPolicy = false;
+    Res.MaskPolicy = false;
+  }
+
+  // If this is a mask reg operation, it only cares about VLMAX.
+  // TODO: Possible extensions to this logic
+  // * Probably ok if available VLMax is larger than demanded
+  // * The policy bits can probably be ignored.
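+  //
+  // For example, vmand.mm behaves identically under SEW=8/LMUL=1 and
+  // SEW=32/LMUL=4: both give VLMAX = VLEN/8, which is all a mask register
+  // operation can observe.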
+ if (isMaskRegOp(MI)) { + Res.SEW = false; + Res.LMUL = false; + } + + return Res; +} + +/// Defines the abstract state with which the forward dataflow models the +/// values of the VL and VTYPE registers after insertion. class VSETVLIInfo { union { Register AVLReg; @@ -57,15 +421,12 @@ class VSETVLIInfo { uint8_t SEW = 0; uint8_t TailAgnostic : 1; uint8_t MaskAgnostic : 1; - uint8_t MaskRegOp : 1; - uint8_t StoreOp : 1; - uint8_t ScalarMovOp : 1; uint8_t SEWLMULRatioOnly : 1; public: VSETVLIInfo() - : AVLImm(0), TailAgnostic(false), MaskAgnostic(false), MaskRegOp(false), - StoreOp(false), ScalarMovOp(false), SEWLMULRatioOnly(false) {} + : AVLImm(0), TailAgnostic(false), MaskAgnostic(false), + SEWLMULRatioOnly(false) {} static VSETVLIInfo getUnknown() { VSETVLIInfo Info; @@ -97,11 +458,10 @@ public: assert(hasAVLImm()); return AVLImm; } - bool hasZeroAVL() const { - if (hasAVLImm()) - return getAVLImm() == 0; - return false; - } + + unsigned getSEW() const { return SEW; } + RISCVII::VLMUL getVLMUL() const { return VLMul; } + bool hasNonZeroAVL() const { if (hasAVLImm()) return getAVLImm() > 0; @@ -132,17 +492,13 @@ public: TailAgnostic = RISCVVType::isTailAgnostic(VType); MaskAgnostic = RISCVVType::isMaskAgnostic(VType); } - void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO, - bool IsStore, bool IsScalarMovOp) { + void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); VLMul = L; SEW = S; TailAgnostic = TA; MaskAgnostic = MA; - MaskRegOp = MRO; - StoreOp = IsStore; - ScalarMovOp = IsScalarMovOp; } unsigned encodeVTYPE() const { @@ -175,25 +531,16 @@ public: Other.MaskAgnostic); } - static unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) { - unsigned LMul; - bool Fractional; - std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(VLMul); - - // Convert LMul to a fixed point value with 3 fractional bits. - LMul = Fractional ? (8 / LMul) : (LMul * 8); - - assert(SEW >= 8 && "Unexpected SEW value"); - return (SEW * 8) / LMul; - } - unsigned getSEWLMULRatio() const { assert(isValid() && !isUnknown() && "Can't use VTYPE for uninitialized or unknown"); - return getSEWLMULRatio(SEW, VLMul); + return ::getSEWLMULRatio(SEW, VLMul); } // Check if the VTYPE for these two VSETVLIInfos produce the same VLMAX. + // Note that having the same VLMAX ensures that both share the same + // function from AVL to VL; that is, they must produce the same VL value + // for any given AVL value. bool hasSameVLMAX(const VSETVLIInfo &Other) const { assert(isValid() && Other.isValid() && "Can't compare invalid VSETVLIInfos"); @@ -211,36 +558,22 @@ public: MaskAgnostic == Other.MaskAgnostic; } - bool hasCompatibleVTYPE(const VSETVLIInfo &InstrInfo, bool Strict) const { - // Simple case, see if full VTYPE matches. - if (hasSameVTYPE(InstrInfo)) - return true; - - if (Strict) - return false; - - // If this is a mask reg operation, it only cares about VLMAX. - // FIXME: Mask reg operations are probably ok if "this" VLMAX is larger - // than "InstrInfo". - // FIXME: The policy bits can probably be ignored for mask reg operations. 
- if (InstrInfo.MaskRegOp && hasSameVLMAX(InstrInfo) && - TailAgnostic == InstrInfo.TailAgnostic && - MaskAgnostic == InstrInfo.MaskAgnostic) - return true; - - return false; + bool hasCompatibleVTYPE(const MachineInstr &MI, + const VSETVLIInfo &Require) const { + const DemandedFields Used = getDemanded(MI); + return areCompatibleVTYPEs(encodeVTYPE(), Require.encodeVTYPE(), Used); } // Determine whether the vector instructions requirements represented by - // InstrInfo are compatible with the previous vsetvli instruction represented - // by this. - bool isCompatible(const VSETVLIInfo &InstrInfo, bool Strict) const { - assert(isValid() && InstrInfo.isValid() && + // Require are compatible with the previous vsetvli instruction represented + // by this. MI is the instruction whose requirements we're considering. + bool isCompatible(const MachineInstr &MI, const VSETVLIInfo &Require) const { + assert(isValid() && Require.isValid() && "Can't compare invalid VSETVLIInfos"); - assert(!InstrInfo.SEWLMULRatioOnly && + assert(!Require.SEWLMULRatioOnly && "Expected a valid VTYPE for instruction!"); // Nothing is compatible with Unknown. - if (isUnknown() || InstrInfo.isUnknown()) + if (isUnknown() || Require.isUnknown()) return false; // If only our VLMAX ratio is valid, then this isn't compatible. @@ -249,61 +582,11 @@ public: // If the instruction doesn't need an AVLReg and the SEW matches, consider // it compatible. - if (!Strict && InstrInfo.hasAVLReg() && - InstrInfo.AVLReg == RISCV::NoRegister) { - if (SEW == InstrInfo.SEW) + if (Require.hasAVLReg() && Require.AVLReg == RISCV::NoRegister) + if (SEW == Require.SEW) return true; - } - - // For vmv.s.x and vfmv.s.f, there is only two behaviors, VL = 0 and VL > 0. - // So it's compatible when we could make sure that both VL be the same - // situation. - if (!Strict && InstrInfo.ScalarMovOp && InstrInfo.hasAVLImm() && - ((hasNonZeroAVL() && InstrInfo.hasNonZeroAVL()) || - (hasZeroAVL() && InstrInfo.hasZeroAVL())) && - hasSameSEW(InstrInfo) && hasSamePolicy(InstrInfo)) - return true; - - // The AVL must match. - if (!hasSameAVL(InstrInfo)) - return false; - - if (hasCompatibleVTYPE(InstrInfo, Strict)) - return true; - - // Strict matches must ensure a full VTYPE match. - if (Strict) - return false; - - // Store instructions don't use the policy fields. - // TODO: Move into hasCompatibleVTYPE? - if (InstrInfo.StoreOp && VLMul == InstrInfo.VLMul && SEW == InstrInfo.SEW) - return true; - - // Anything else is not compatible. - return false; - } - bool isCompatibleWithLoadStoreEEW(unsigned EEW, - const VSETVLIInfo &InstrInfo) const { - assert(isValid() && InstrInfo.isValid() && - "Can't compare invalid VSETVLIInfos"); - assert(!InstrInfo.SEWLMULRatioOnly && - "Expected a valid VTYPE for instruction!"); - assert(EEW == InstrInfo.SEW && "Mismatched EEW/SEW for store"); - - if (isUnknown() || hasSEWLMULRatioOnly()) - return false; - - if (!hasSameAVL(InstrInfo)) - return false; - - // Stores can ignore the tail and mask policies. - if (!InstrInfo.StoreOp && (TailAgnostic != InstrInfo.TailAgnostic || - MaskAgnostic != InstrInfo.MaskAgnostic)) - return false; - - return getSEWLMULRatio() == getSEWLMULRatio(EEW, InstrInfo.VLMul); + return hasSameAVL(Require) && hasCompatibleVTYPE(MI, Require); } bool operator==(const VSETVLIInfo &Other) const { @@ -322,16 +605,20 @@ public: if (!hasSameAVL(Other)) return false; + // If the SEWLMULRatioOnly bits are different, then they aren't equal. 
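+    // (SEWLMULRatioOnly means only the SEW/LMUL ratio of the VTYPE is known,
+    // e.g. after intersecting predecessor states that agree on the ratio but
+    // not on the full VTYPE.)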
+ if (SEWLMULRatioOnly != Other.SEWLMULRatioOnly) + return false; + // If only the VLMAX is valid, check that it is the same. - if (SEWLMULRatioOnly && Other.SEWLMULRatioOnly) + if (SEWLMULRatioOnly) return hasSameVLMAX(Other); // If the full VTYPE is valid, check that it is the same. - if (!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly) - return hasSameVTYPE(Other); + return hasSameVTYPE(Other); + } - // If the SEWLMULRatioOnly bits are different, then they aren't equal. - return false; + bool operator!=(const VSETVLIInfo &Other) const { + return !(*this == Other); } // Calculate the VSETVLIInfo visible to a block assuming this and Other are @@ -365,25 +652,43 @@ public: return VSETVLIInfo::getUnknown(); } - // Calculate the VSETVLIInfo visible at the end of the block assuming this - // is the predecessor value, and Other is change for this block. - VSETVLIInfo merge(const VSETVLIInfo &Other) const { - assert(isValid() && "Can only merge with a valid VSETVLInfo"); - - // Nothing changed from the predecessor, keep it. - if (!Other.isValid()) - return *this; - - // If the change is compatible with the input, we won't create a VSETVLI - // and should keep the predecessor. - if (isCompatible(Other, /*Strict*/ true)) - return *this; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Support for debugging, callable in GDB: V->dump() + LLVM_DUMP_METHOD void dump() const { + print(dbgs()); + dbgs() << "\n"; + } - // Otherwise just use whatever is in this block. - return Other; + /// Implement operator<<. + /// @{ + void print(raw_ostream &OS) const { + OS << "{"; + if (!isValid()) + OS << "Uninitialized"; + if (isUnknown()) + OS << "unknown"; + if (hasAVLReg()) + OS << "AVLReg=" << (unsigned)AVLReg; + if (hasAVLImm()) + OS << "AVLImm=" << (unsigned)AVLImm; + OS << ", " + << "VLMul=" << (unsigned)VLMul << ", " + << "SEW=" << (unsigned)SEW << ", " + << "TailAgnostic=" << (bool)TailAgnostic << ", " + << "MaskAgnostic=" << (bool)MaskAgnostic << ", " + << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}"; } +#endif }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_ATTRIBUTE_USED +inline raw_ostream &operator<<(raw_ostream &OS, const VSETVLIInfo &V) { + V.print(OS); + return OS; +} +#endif + struct BlockData { // The VSETVLIInfo that represents the net changes to the VL/VTYPE registers // made by this block. Calculated in Phase 1. @@ -400,7 +705,7 @@ struct BlockData { // Keeps track of whether the block is already in the queue. 
bool InQueue = false; - BlockData() {} + BlockData() = default; }; class RISCVInsertVSETVLI : public MachineFunctionPass { @@ -426,14 +731,24 @@ public: StringRef getPassName() const override { return RISCV_INSERT_VSETVLI_NAME; } private: - bool needVSETVLI(const VSETVLIInfo &Require, const VSETVLIInfo &CurInfo); - bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB); + bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require, + const VSETVLIInfo &CurInfo) const; + bool needVSETVLIPHI(const VSETVLIInfo &Require, + const MachineBasicBlock &MBB) const; void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI, const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo); + void insertVSETVLI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc DL, + const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo); + void transferBefore(VSETVLIInfo &Info, const MachineInstr &MI); + void transferAfter(VSETVLIInfo &Info, const MachineInstr &MI); bool computeVLVTYPEChanges(const MachineBasicBlock &MBB); void computeIncomingVLVTYPE(const MachineBasicBlock &MBB); void emitVSETVLIs(MachineBasicBlock &MBB); + void doLocalPostpass(MachineBasicBlock &MBB); + void doPRE(MachineBasicBlock &MBB); + void insertReadVL(MachineBasicBlock &MBB); }; } // end anonymous namespace @@ -443,474 +758,349 @@ char RISCVInsertVSETVLI::ID = 0; INITIALIZE_PASS(RISCVInsertVSETVLI, DEBUG_TYPE, RISCV_INSERT_VSETVLI_NAME, false, false) -static MachineInstr *elideCopies(MachineInstr *MI, - const MachineRegisterInfo *MRI) { - while (true) { - if (!MI->isFullCopy()) - return MI; - if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) - return nullptr; - MI = MRI->getVRegDef(MI->getOperand(1).getReg()); - if (!MI) - return nullptr; - } -} - -static bool isScalarMoveInstr(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case RISCV::PseudoVMV_S_X_M1: - case RISCV::PseudoVMV_S_X_M2: - case RISCV::PseudoVMV_S_X_M4: - case RISCV::PseudoVMV_S_X_M8: - case RISCV::PseudoVMV_S_X_MF2: - case RISCV::PseudoVMV_S_X_MF4: - case RISCV::PseudoVMV_S_X_MF8: - case RISCV::PseudoVFMV_S_F16_M1: - case RISCV::PseudoVFMV_S_F16_M2: - case RISCV::PseudoVFMV_S_F16_M4: - case RISCV::PseudoVFMV_S_F16_M8: - case RISCV::PseudoVFMV_S_F16_MF2: - case RISCV::PseudoVFMV_S_F16_MF4: - case RISCV::PseudoVFMV_S_F32_M1: - case RISCV::PseudoVFMV_S_F32_M2: - case RISCV::PseudoVFMV_S_F32_M4: - case RISCV::PseudoVFMV_S_F32_M8: - case RISCV::PseudoVFMV_S_F32_MF2: - case RISCV::PseudoVFMV_S_F64_M1: - case RISCV::PseudoVFMV_S_F64_M2: - case RISCV::PseudoVFMV_S_F64_M4: - case RISCV::PseudoVFMV_S_F64_M8: - return true; - } -} - -static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, - const MachineRegisterInfo *MRI) { - VSETVLIInfo InstrInfo; - unsigned NumOperands = MI.getNumExplicitOperands(); - bool HasPolicy = RISCVII::hasVecPolicyOp(TSFlags); - - // Default to tail agnostic unless the destination is tied to a source. - // Unless the source is undef. In that case the user would have some control - // over the tail values. Some pseudo instructions force a tail agnostic policy - // despite having a tied def. - bool ForceTailAgnostic = RISCVII::doesForceTailAgnostic(TSFlags); - bool TailAgnostic = true; - // If the instruction has policy argument, use the argument. 
- if (HasPolicy) { - const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); - TailAgnostic = Op.getImm() & 0x1; - } - - unsigned UseOpIdx; - if (!(ForceTailAgnostic || (HasPolicy && TailAgnostic)) && - MI.isRegTiedToUseOperand(0, &UseOpIdx)) { - TailAgnostic = false; - // If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic. - const MachineOperand &UseMO = MI.getOperand(UseOpIdx); - MachineInstr *UseMI = MRI->getVRegDef(UseMO.getReg()); - if (UseMI) { - UseMI = elideCopies(UseMI, MRI); - if (UseMI && UseMI->isImplicitDef()) - TailAgnostic = true; - } - } - - // Remove the tail policy so we can find the SEW and VL. - if (HasPolicy) - --NumOperands; - - RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags); - - unsigned Log2SEW = MI.getOperand(NumOperands - 1).getImm(); - // A Log2SEW of 0 is an operation on mask registers only. - bool MaskRegOp = Log2SEW == 0; - unsigned SEW = Log2SEW ? 1 << Log2SEW : 8; - assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); - - // If there are no explicit defs, this is a store instruction which can - // ignore the tail and mask policies. - bool StoreOp = MI.getNumExplicitDefs() == 0; - bool ScalarMovOp = isScalarMoveInstr(MI); - - if (RISCVII::hasVLOp(TSFlags)) { - const MachineOperand &VLOp = MI.getOperand(NumOperands - 2); - if (VLOp.isImm()) { - int64_t Imm = VLOp.getImm(); - // Conver the VLMax sentintel to X0 register. - if (Imm == RISCV::VLMaxSentinel) - InstrInfo.setAVLReg(RISCV::X0); - else - InstrInfo.setAVLImm(Imm); - } else { - InstrInfo.setAVLReg(VLOp.getReg()); - } - } else - InstrInfo.setAVLReg(RISCV::NoRegister); - InstrInfo.setVTYPE(VLMul, SEW, /*TailAgnostic*/ TailAgnostic, - /*MaskAgnostic*/ false, MaskRegOp, StoreOp, ScalarMovOp); - - return InstrInfo; -} - -void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI, - const VSETVLIInfo &Info, - const VSETVLIInfo &PrevInfo) { - DebugLoc DL = MI.getDebugLoc(); - - // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same - // VLMAX. - if (PrevInfo.isValid() && !PrevInfo.isUnknown() && - Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) { - BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLIX0)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addReg(RISCV::X0, RegState::Kill) - .addImm(Info.encodeVTYPE()) - .addReg(RISCV::VL, RegState::Implicit); - return; - } - - if (Info.hasAVLImm()) { - BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETIVLI)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addImm(Info.getAVLImm()) - .addImm(Info.encodeVTYPE()); - return; - } - - Register AVLReg = Info.getAVLReg(); - if (AVLReg == RISCV::NoRegister) { - // We can only use x0, x0 if there's no chance of the vtype change causing - // the previous vl to become invalid. - if (PrevInfo.isValid() && !PrevInfo.isUnknown() && - Info.hasSameVLMAX(PrevInfo)) { - BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLIX0)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addReg(RISCV::X0, RegState::Kill) - .addImm(Info.encodeVTYPE()) - .addReg(RISCV::VL, RegState::Implicit); - return; - } - // Otherwise use an AVL of 0 to avoid depending on previous vl. - BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETIVLI)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addImm(0) - .addImm(Info.encodeVTYPE()); - return; - } - - if (AVLReg.isVirtual()) - MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass); - - // Use X0 as the DestReg unless AVLReg is X0. 
We also need to change the - // opcode if the AVLReg is X0 as they have different register classes for - // the AVL operand. - Register DestReg = RISCV::X0; - unsigned Opcode = RISCV::PseudoVSETVLI; - if (AVLReg == RISCV::X0) { - DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass); - Opcode = RISCV::PseudoVSETVLIX0; - } - BuildMI(MBB, MI, DL, TII->get(Opcode)) - .addReg(DestReg, RegState::Define | RegState::Dead) - .addReg(AVLReg) - .addImm(Info.encodeVTYPE()); -} - -// Return a VSETVLIInfo representing the changes made by this VSETVLI or -// VSETIVLI instruction. -static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) { - VSETVLIInfo NewInfo; - if (MI.getOpcode() == RISCV::PseudoVSETIVLI) { - NewInfo.setAVLImm(MI.getOperand(1).getImm()); - } else { - assert(MI.getOpcode() == RISCV::PseudoVSETVLI || - MI.getOpcode() == RISCV::PseudoVSETVLIX0); - Register AVLReg = MI.getOperand(1).getReg(); - assert((AVLReg != RISCV::X0 || MI.getOperand(0).getReg() != RISCV::X0) && - "Can't handle X0, X0 vsetvli yet"); - NewInfo.setAVLReg(AVLReg); - } - NewInfo.setVTYPE(MI.getOperand(2).getImm()); - - return NewInfo; +static bool isVectorConfigInstr(const MachineInstr &MI) { + return MI.getOpcode() == RISCV::PseudoVSETVLI || + MI.getOpcode() == RISCV::PseudoVSETVLIX0 || + MI.getOpcode() == RISCV::PseudoVSETIVLI; } -bool RISCVInsertVSETVLI::needVSETVLI(const VSETVLIInfo &Require, - const VSETVLIInfo &CurInfo) { - if (CurInfo.isCompatible(Require, /*Strict*/ false)) +/// Return true if this is 'vsetvli x0, x0, vtype' which preserves +/// VL and only sets VTYPE. +static bool isVLPreservingConfig(const MachineInstr &MI) { + if (MI.getOpcode() != RISCV::PseudoVSETVLIX0) return false; - - // We didn't find a compatible value. If our AVL is a virtual register, - // it might be defined by a VSET(I)VLI. If it has the same VTYPE we need - // and the last VL/VTYPE we observed is the same, we don't need a - // VSETVLI here. 
- if (!CurInfo.isUnknown() && Require.hasAVLReg() && - Require.getAVLReg().isVirtual() && !CurInfo.hasSEWLMULRatioOnly() && - CurInfo.hasCompatibleVTYPE(Require, /*Strict*/ false)) { - if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) { - if (DefMI->getOpcode() == RISCV::PseudoVSETVLI || - DefMI->getOpcode() == RISCV::PseudoVSETVLIX0 || - DefMI->getOpcode() == RISCV::PseudoVSETIVLI) { - VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI); - if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVTYPE(CurInfo)) - return false; - } - } - } - - return true; -} - -bool canSkipVSETVLIForLoadStore(const MachineInstr &MI, - const VSETVLIInfo &Require, - const VSETVLIInfo &CurInfo) { - unsigned EEW; - switch (MI.getOpcode()) { - default: - return false; - case RISCV::PseudoVLE8_V_M1: - case RISCV::PseudoVLE8_V_M1_MASK: - case RISCV::PseudoVLE8_V_M2: - case RISCV::PseudoVLE8_V_M2_MASK: - case RISCV::PseudoVLE8_V_M4: - case RISCV::PseudoVLE8_V_M4_MASK: - case RISCV::PseudoVLE8_V_M8: - case RISCV::PseudoVLE8_V_M8_MASK: - case RISCV::PseudoVLE8_V_MF2: - case RISCV::PseudoVLE8_V_MF2_MASK: - case RISCV::PseudoVLE8_V_MF4: - case RISCV::PseudoVLE8_V_MF4_MASK: - case RISCV::PseudoVLE8_V_MF8: - case RISCV::PseudoVLE8_V_MF8_MASK: - case RISCV::PseudoVLSE8_V_M1: - case RISCV::PseudoVLSE8_V_M1_MASK: - case RISCV::PseudoVLSE8_V_M2: - case RISCV::PseudoVLSE8_V_M2_MASK: - case RISCV::PseudoVLSE8_V_M4: - case RISCV::PseudoVLSE8_V_M4_MASK: - case RISCV::PseudoVLSE8_V_M8: - case RISCV::PseudoVLSE8_V_M8_MASK: - case RISCV::PseudoVLSE8_V_MF2: - case RISCV::PseudoVLSE8_V_MF2_MASK: - case RISCV::PseudoVLSE8_V_MF4: - case RISCV::PseudoVLSE8_V_MF4_MASK: - case RISCV::PseudoVLSE8_V_MF8: - case RISCV::PseudoVLSE8_V_MF8_MASK: - case RISCV::PseudoVSE8_V_M1: - case RISCV::PseudoVSE8_V_M1_MASK: - case RISCV::PseudoVSE8_V_M2: - case RISCV::PseudoVSE8_V_M2_MASK: - case RISCV::PseudoVSE8_V_M4: - case RISCV::PseudoVSE8_V_M4_MASK: - case RISCV::PseudoVSE8_V_M8: - case RISCV::PseudoVSE8_V_M8_MASK: - case RISCV::PseudoVSE8_V_MF2: - case RISCV::PseudoVSE8_V_MF2_MASK: - case RISCV::PseudoVSE8_V_MF4: - case RISCV::PseudoVSE8_V_MF4_MASK: - case RISCV::PseudoVSE8_V_MF8: - case RISCV::PseudoVSE8_V_MF8_MASK: - case RISCV::PseudoVSSE8_V_M1: - case RISCV::PseudoVSSE8_V_M1_MASK: - case RISCV::PseudoVSSE8_V_M2: - case RISCV::PseudoVSSE8_V_M2_MASK: - case RISCV::PseudoVSSE8_V_M4: - case RISCV::PseudoVSSE8_V_M4_MASK: - case RISCV::PseudoVSSE8_V_M8: - case RISCV::PseudoVSSE8_V_M8_MASK: - case RISCV::PseudoVSSE8_V_MF2: - case RISCV::PseudoVSSE8_V_MF2_MASK: - case RISCV::PseudoVSSE8_V_MF4: - case RISCV::PseudoVSSE8_V_MF4_MASK: - case RISCV::PseudoVSSE8_V_MF8: - case RISCV::PseudoVSSE8_V_MF8_MASK: - EEW = 8; - break; - case RISCV::PseudoVLE16_V_M1: - case RISCV::PseudoVLE16_V_M1_MASK: - case RISCV::PseudoVLE16_V_M2: - case RISCV::PseudoVLE16_V_M2_MASK: - case RISCV::PseudoVLE16_V_M4: - case RISCV::PseudoVLE16_V_M4_MASK: - case RISCV::PseudoVLE16_V_M8: - case RISCV::PseudoVLE16_V_M8_MASK: - case RISCV::PseudoVLE16_V_MF2: - case RISCV::PseudoVLE16_V_MF2_MASK: - case RISCV::PseudoVLE16_V_MF4: - case RISCV::PseudoVLE16_V_MF4_MASK: - case RISCV::PseudoVLSE16_V_M1: - case RISCV::PseudoVLSE16_V_M1_MASK: - case RISCV::PseudoVLSE16_V_M2: - case RISCV::PseudoVLSE16_V_M2_MASK: - case RISCV::PseudoVLSE16_V_M4: - case RISCV::PseudoVLSE16_V_M4_MASK: - case RISCV::PseudoVLSE16_V_M8: - case RISCV::PseudoVLSE16_V_M8_MASK: - case RISCV::PseudoVLSE16_V_MF2: - case RISCV::PseudoVLSE16_V_MF2_MASK: - case RISCV::PseudoVLSE16_V_MF4: - case 
RISCV::PseudoVLSE16_V_MF4_MASK: - case RISCV::PseudoVSE16_V_M1: - case RISCV::PseudoVSE16_V_M1_MASK: - case RISCV::PseudoVSE16_V_M2: - case RISCV::PseudoVSE16_V_M2_MASK: - case RISCV::PseudoVSE16_V_M4: - case RISCV::PseudoVSE16_V_M4_MASK: - case RISCV::PseudoVSE16_V_M8: - case RISCV::PseudoVSE16_V_M8_MASK: - case RISCV::PseudoVSE16_V_MF2: - case RISCV::PseudoVSE16_V_MF2_MASK: - case RISCV::PseudoVSE16_V_MF4: - case RISCV::PseudoVSE16_V_MF4_MASK: - case RISCV::PseudoVSSE16_V_M1: - case RISCV::PseudoVSSE16_V_M1_MASK: - case RISCV::PseudoVSSE16_V_M2: - case RISCV::PseudoVSSE16_V_M2_MASK: - case RISCV::PseudoVSSE16_V_M4: - case RISCV::PseudoVSSE16_V_M4_MASK: - case RISCV::PseudoVSSE16_V_M8: - case RISCV::PseudoVSSE16_V_M8_MASK: - case RISCV::PseudoVSSE16_V_MF2: - case RISCV::PseudoVSSE16_V_MF2_MASK: - case RISCV::PseudoVSSE16_V_MF4: - case RISCV::PseudoVSSE16_V_MF4_MASK: - EEW = 16; - break; - case RISCV::PseudoVLE32_V_M1: - case RISCV::PseudoVLE32_V_M1_MASK: - case RISCV::PseudoVLE32_V_M2: - case RISCV::PseudoVLE32_V_M2_MASK: - case RISCV::PseudoVLE32_V_M4: - case RISCV::PseudoVLE32_V_M4_MASK: - case RISCV::PseudoVLE32_V_M8: - case RISCV::PseudoVLE32_V_M8_MASK: - case RISCV::PseudoVLE32_V_MF2: - case RISCV::PseudoVLE32_V_MF2_MASK: - case RISCV::PseudoVLSE32_V_M1: - case RISCV::PseudoVLSE32_V_M1_MASK: - case RISCV::PseudoVLSE32_V_M2: - case RISCV::PseudoVLSE32_V_M2_MASK: - case RISCV::PseudoVLSE32_V_M4: - case RISCV::PseudoVLSE32_V_M4_MASK: - case RISCV::PseudoVLSE32_V_M8: - case RISCV::PseudoVLSE32_V_M8_MASK: - case RISCV::PseudoVLSE32_V_MF2: - case RISCV::PseudoVLSE32_V_MF2_MASK: - case RISCV::PseudoVSE32_V_M1: - case RISCV::PseudoVSE32_V_M1_MASK: - case RISCV::PseudoVSE32_V_M2: - case RISCV::PseudoVSE32_V_M2_MASK: - case RISCV::PseudoVSE32_V_M4: - case RISCV::PseudoVSE32_V_M4_MASK: - case RISCV::PseudoVSE32_V_M8: - case RISCV::PseudoVSE32_V_M8_MASK: - case RISCV::PseudoVSE32_V_MF2: - case RISCV::PseudoVSE32_V_MF2_MASK: - case RISCV::PseudoVSSE32_V_M1: - case RISCV::PseudoVSSE32_V_M1_MASK: - case RISCV::PseudoVSSE32_V_M2: - case RISCV::PseudoVSSE32_V_M2_MASK: - case RISCV::PseudoVSSE32_V_M4: - case RISCV::PseudoVSSE32_V_M4_MASK: - case RISCV::PseudoVSSE32_V_M8: - case RISCV::PseudoVSSE32_V_M8_MASK: - case RISCV::PseudoVSSE32_V_MF2: - case RISCV::PseudoVSSE32_V_MF2_MASK: - EEW = 32; - break; - case RISCV::PseudoVLE64_V_M1: - case RISCV::PseudoVLE64_V_M1_MASK: - case RISCV::PseudoVLE64_V_M2: - case RISCV::PseudoVLE64_V_M2_MASK: - case RISCV::PseudoVLE64_V_M4: - case RISCV::PseudoVLE64_V_M4_MASK: - case RISCV::PseudoVLE64_V_M8: - case RISCV::PseudoVLE64_V_M8_MASK: - case RISCV::PseudoVLSE64_V_M1: - case RISCV::PseudoVLSE64_V_M1_MASK: - case RISCV::PseudoVLSE64_V_M2: - case RISCV::PseudoVLSE64_V_M2_MASK: - case RISCV::PseudoVLSE64_V_M4: - case RISCV::PseudoVLSE64_V_M4_MASK: - case RISCV::PseudoVLSE64_V_M8: - case RISCV::PseudoVLSE64_V_M8_MASK: - case RISCV::PseudoVSE64_V_M1: - case RISCV::PseudoVSE64_V_M1_MASK: - case RISCV::PseudoVSE64_V_M2: - case RISCV::PseudoVSE64_V_M2_MASK: - case RISCV::PseudoVSE64_V_M4: - case RISCV::PseudoVSE64_V_M4_MASK: - case RISCV::PseudoVSE64_V_M8: - case RISCV::PseudoVSE64_V_M8_MASK: - case RISCV::PseudoVSSE64_V_M1: - case RISCV::PseudoVSSE64_V_M1_MASK: - case RISCV::PseudoVSSE64_V_M2: - case RISCV::PseudoVSSE64_V_M2_MASK: - case RISCV::PseudoVSSE64_V_M4: - case RISCV::PseudoVSSE64_V_M4_MASK: - case RISCV::PseudoVSSE64_V_M8: - case RISCV::PseudoVSSE64_V_M8_MASK: - EEW = 64; - break; + assert(RISCV::X0 == MI.getOperand(1).getReg()); + return RISCV::X0 == 
MI.getOperand(0).getReg(); +} + +static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, + const MachineRegisterInfo *MRI) { + VSETVLIInfo InstrInfo; + + // If the instruction has a policy argument, use the argument. + // If there is no policy argument, default to tail agnostic unless the + // destination is tied to a source, unless that source is undef, in which + // case the user retains some control over the policy values. + bool TailAgnostic = true; + bool UsesMaskPolicy = RISCVII::usesMaskPolicy(TSFlags); + // FIXME: Could we look at the instructions above or below to choose a + // matching mask policy and reduce the number of vsetvli instructions? The + // default mask policy is agnostic if the instruction uses a mask policy, + // otherwise undisturbed. Because most mask operations are mask-undisturbed, + // we could possibly reduce the vsetvli instructions between masked and + // unmasked instruction sequences. + bool MaskAgnostic = UsesMaskPolicy; + unsigned UseOpIdx; + if (RISCVII::hasVecPolicyOp(TSFlags)) { + const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); + uint64_t Policy = Op.getImm(); + assert(Policy <= (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC) && + "Invalid Policy Value"); + // Although in some cases a mismatched passthru/maskedoff and policy value + // does not make sense (e.g. a tied operand that is IMPLICIT_DEF with a + // non-TAMA policy, or a tied operand that is not IMPLICIT_DEF with a TAMA + // policy), the user has set the policy value explicitly, so the compiler + // will not fix it. + TailAgnostic = Policy & RISCVII::TAIL_AGNOSTIC; + MaskAgnostic = Policy & RISCVII::MASK_AGNOSTIC; + } else if (MI.isRegTiedToUseOperand(0, &UseOpIdx)) { + TailAgnostic = false; + if (UsesMaskPolicy) + MaskAgnostic = false; + // If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic. + const MachineOperand &UseMO = MI.getOperand(UseOpIdx); + MachineInstr *UseMI = MRI->getVRegDef(UseMO.getReg()); + if (UseMI && UseMI->isImplicitDef()) { + TailAgnostic = true; + if (UsesMaskPolicy) + MaskAgnostic = true; + } + // Some pseudo instructions force a tail agnostic policy despite having a + // tied def. + if (RISCVII::doesForceTailAgnostic(TSFlags)) + TailAgnostic = true; + } + + RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags); + + unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm(); + // A Log2SEW of 0 is an operation on mask registers only. + unsigned SEW = Log2SEW ? 1 << Log2SEW : 8; + assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); + + if (RISCVII::hasVLOp(TSFlags)) { + const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); + if (VLOp.isImm()) { + int64_t Imm = VLOp.getImm(); + // Convert the VLMax sentinel to the X0 register.
+ if (Imm == RISCV::VLMaxSentinel) + InstrInfo.setAVLReg(RISCV::X0); + else + InstrInfo.setAVLImm(Imm); + } else { + InstrInfo.setAVLReg(VLOp.getReg()); + } + } else { + InstrInfo.setAVLReg(RISCV::NoRegister); } +#ifndef NDEBUG + if (Optional<unsigned> EEW = getEEWForLoadStore(MI)) { + assert(SEW == EEW && "Initial SEW doesn't match expected EEW"); + } +#endif + InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); - return CurInfo.isCompatibleWithLoadStoreEEW(EEW, Require); + return InstrInfo; } -bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) { - bool HadVectorOp = false; +void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI, + const VSETVLIInfo &Info, + const VSETVLIInfo &PrevInfo) { + DebugLoc DL = MI.getDebugLoc(); + insertVSETVLI(MBB, MachineBasicBlock::iterator(&MI), DL, Info, PrevInfo); +} - BlockData &BBInfo = BlockInfo[MBB.getNumber()]; - for (const MachineInstr &MI : MBB) { - // If this is an explicit VSETVLI or VSETIVLI, update our state. - if (MI.getOpcode() == RISCV::PseudoVSETVLI || - MI.getOpcode() == RISCV::PseudoVSETVLIX0 || - MI.getOpcode() == RISCV::PseudoVSETIVLI) { - HadVectorOp = true; - BBInfo.Change = getInfoForVSETVLI(MI); - continue; +void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc DL, + const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) { + + // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same + // VLMAX. + if (PrevInfo.isValid() && !PrevInfo.isUnknown() && + Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) { + BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(RISCV::X0, RegState::Kill) + .addImm(Info.encodeVTYPE()) + .addReg(RISCV::VL, RegState::Implicit); + return; + } + + if (Info.hasAVLImm()) { + BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addImm(Info.getAVLImm()) + .addImm(Info.encodeVTYPE()); + return; + } + + Register AVLReg = Info.getAVLReg(); + if (AVLReg == RISCV::NoRegister) { + // We can only use x0, x0 if there's no chance of the vtype change causing + // the previous vl to become invalid. + if (PrevInfo.isValid() && !PrevInfo.isUnknown() && + Info.hasSameVLMAX(PrevInfo)) { + BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(RISCV::X0, RegState::Kill) + .addImm(Info.encodeVTYPE()) + .addReg(RISCV::VL, RegState::Implicit); + return; } + // Otherwise use an AVL of 0 to avoid depending on previous vl. + BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addImm(0) + .addImm(Info.encodeVTYPE()); + return; + } - uint64_t TSFlags = MI.getDesc().TSFlags; - if (RISCVII::hasSEWOp(TSFlags)) { - HadVectorOp = true; + if (AVLReg.isVirtual()) + MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass); + + // Use X0 as the DestReg unless AVLReg is X0. We also need to change the + // opcode if the AVLReg is X0 as they have different register classes for + // the AVL operand.
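For reference, the vtype immediate that encodeVTYPE() produces follows the V-spec layout: vlmul in bits [2:0], vsew in bits [5:3], vta in bit 6, vma in bit 7. A minimal standalone sketch of that packing (encodeVType is my own illustrative helper, not the RISCVVType API):

#include <cassert>

// Pack (vlmul, SEW, ta, ma) into a vtype immediate per the V spec.
static unsigned encodeVType(unsigned VLMul, unsigned SEW, bool TA, bool MA) {
  assert(SEW >= 8 && SEW <= 64 && (SEW & (SEW - 1)) == 0 && "invalid SEW");
  unsigned VSEW = 0;
  for (unsigned S = SEW; S > 8; S >>= 1) // VSEW = log2(SEW) - 3
    ++VSEW;
  unsigned VType = (VLMul & 0x7) | (VSEW << 3);
  if (TA) VType |= 0x40;
  if (MA) VType |= 0x80;
  return VType;
}
// e.g. encodeVType(/*m1*/ 0, /*e32*/ 32, true, true) == 0xD0,
// which disassemblers print as "e32, m1, ta, ma".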
+ Register DestReg = RISCV::X0; + unsigned Opcode = RISCV::PseudoVSETVLI; + if (AVLReg == RISCV::X0) { + DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass); + Opcode = RISCV::PseudoVSETVLIX0; + } + BuildMI(MBB, InsertPt, DL, TII->get(Opcode)) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addReg(AVLReg) + .addImm(Info.encodeVTYPE()); +} + +// Return a VSETVLIInfo representing the changes made by this VSETVLI or +// VSETIVLI instruction. +static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) { + VSETVLIInfo NewInfo; + if (MI.getOpcode() == RISCV::PseudoVSETIVLI) { + NewInfo.setAVLImm(MI.getOperand(1).getImm()); + } else { + assert(MI.getOpcode() == RISCV::PseudoVSETVLI || + MI.getOpcode() == RISCV::PseudoVSETVLIX0); + Register AVLReg = MI.getOperand(1).getReg(); + assert((AVLReg != RISCV::X0 || MI.getOperand(0).getReg() != RISCV::X0) && + "Can't handle X0, X0 vsetvli yet"); + NewInfo.setAVLReg(AVLReg); + } + NewInfo.setVTYPE(MI.getOperand(2).getImm()); + + return NewInfo; +} + +/// Return true if a VSETVLI is required to transition from CurInfo to Require +/// before MI. +bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI, + const VSETVLIInfo &Require, + const VSETVLIInfo &CurInfo) const { + assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, MRI)); + + if (CurInfo.isCompatible(MI, Require)) + return false; + + if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly()) + return true; + + // For vmv.s.x and vfmv.s.f, there are only two behaviors, VL = 0 and VL > 0. + // VL=0 is uninteresting (as it should have been deleted already), so it is + // compatible if we can prove both are non-zero. Additionally, if writing + // to an implicit_def operand, we don't need to preserve any other bits and + // are thus compatible with any larger etype, and can disregard policy bits. + if (isScalarMoveInstr(MI) && + CurInfo.hasNonZeroAVL() && Require.hasNonZeroAVL()) { + auto *VRegDef = MRI->getVRegDef(MI.getOperand(1).getReg()); + if (VRegDef && VRegDef->isImplicitDef() && + CurInfo.getSEW() >= Require.getSEW()) + return false; + if (CurInfo.hasSameSEW(Require) && CurInfo.hasSamePolicy(Require)) + return false; + } - // We didn't find a compatible value. If our AVL is a virtual register, + // it might be defined by a VSET(I)VLI. If it has the same VLMAX we need + // and the last VL/VTYPE we observed is the same, we don't need a + // VSETVLI here.
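The hasSameVLMAX() checks used throughout this logic compare the SEW/LMUL ratio rather than VLMAX itself, since VLMAX depends on the runtime VLEN. A small sketch of why the ratio is sufficient (VConfig and the helpers are illustrative stand-ins, not the pass's types):

#include <cstdint>

// VLMAX = VLEN * LMUL / SEW. LMUL is modeled as a rational Num/Den
// (mf8 == 1/8 ... m8 == 8/1).
struct VConfig { unsigned SEW, LMulNum, LMulDen; };

static uint64_t vlmax(uint64_t VLEN, VConfig C) {
  return VLEN * C.LMulNum / (C.LMulDen * C.SEW);
}

// Two configs give the same VLMAX for *every* VLEN iff SEW/LMUL matches,
// cross-multiplied here to stay in integers.
static bool sameVLMAXForAllVLEN(VConfig A, VConfig B) {
  return (uint64_t)A.SEW * A.LMulDen * B.LMulNum ==
         (uint64_t)B.SEW * B.LMulDen * A.LMulNum;
}
// e.g. {e32, m2} and {e16, m1} both have ratio 16: vlmax(128, ...) is 8
// for both, and likewise for any other VLEN.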
+ if (Require.hasAVLReg() && Require.getAVLReg().isVirtual() && + CurInfo.hasCompatibleVTYPE(MI, Require)) { + if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) { + if (isVectorConfigInstr(*DefMI)) { + VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI); + if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo)) + return false; } } + } - // If this is something that updates VL/VTYPE that we don't know about, set - // the state to unknown. - if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || - MI.modifiesRegister(RISCV::VTYPE)) { - BBInfo.Change = VSETVLIInfo::getUnknown(); - } + return true; +} + +// Given an incoming state reaching MI, modifies that state so that it is minimally +// compatible with MI. The resulting state is guaranteed to be semantically legal +// for MI, but may not be the state requested by MI. +void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, const MachineInstr &MI) { + uint64_t TSFlags = MI.getDesc().TSFlags; + if (!RISCVII::hasSEWOp(TSFlags)) + return; + + const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI); + if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info)) + return; + + const VSETVLIInfo PrevInfo = Info; + Info = NewInfo; + + if (!RISCVII::hasVLOp(TSFlags)) + return; + + // For vmv.s.x and vfmv.s.f, there are only two behaviors, VL = 0 and + // VL > 0. We can discard the user-requested AVL and just use the previous + // one if we can prove both are non-zero. This removes a vsetvli entirely + // if the types match, or allows use of the cheaper AVL-preserving variant + // if VLMAX doesn't change. If VLMAX might change, we couldn't use + // the 'vsetvli x0, x0, vtype' variant, so we avoid the transform to + // prevent extending the live range of an AVL register operand. + // TODO: We can probably relax this for immediates. + if (isScalarMoveInstr(MI) && PrevInfo.isValid() && + PrevInfo.hasNonZeroAVL() && Info.hasNonZeroAVL() && + Info.hasSameVLMAX(PrevInfo)) { + if (PrevInfo.hasAVLImm()) + Info.setAVLImm(PrevInfo.getAVLImm()); + else + Info.setAVLReg(PrevInfo.getAVLReg()); + return; + } + + // Two cases involving an AVL resulting from a previous vsetvli. + // 1) If the AVL is the result of a previous vsetvli which has the + // same AVL and VLMAX as our current state, we can reuse the AVL + // from the current state for the new one. This allows us to + // generate 'vsetvli x0, x0, vtype' or possibly skip the transition + // entirely. + // 2) If AVL is defined by a vsetvli with the same VLMAX, we can + // replace the AVL operand with the AVL of the defining vsetvli. + // We avoid general register AVLs to avoid extending live ranges + // without being sure we can kill the original source reg entirely.
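A toy model of the case-2 forwarding decision just described; ToyInfo and forwardAVL are illustrative stand-ins, not the pass's VSETVLIInfo API:

#include <optional>

struct ToyInfo {
  bool AVLIsImm = false;
  long long AVLImm = 0;   // valid if AVLIsImm
  int AVLReg = -1;        // otherwise; 0 stands in for x0 (i.e. VLMAX)
  unsigned SEWLMULRatio = 0;
};

static bool sameVLMAX(const ToyInfo &A, const ToyInfo &B) {
  return A.SEWLMULRatio == B.SEWLMULRatio;
}

// Adopt the defining vsetvli's AVL only when VLMAX provably matches and the
// AVL is an immediate or x0; a general register would extend a live range.
static std::optional<ToyInfo> forwardAVL(ToyInfo Cur, const ToyInfo &Def) {
  if (!sameVLMAX(Cur, Def))
    return std::nullopt;
  if (!Def.AVLIsImm && Def.AVLReg != 0)
    return std::nullopt;
  Cur.AVLIsImm = Def.AVLIsImm;
  Cur.AVLImm = Def.AVLImm;
  Cur.AVLReg = Def.AVLReg;
  return Cur;
}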
+ if (!Info.hasAVLReg() || !Info.getAVLReg().isVirtual()) + return; + MachineInstr *DefMI = MRI->getVRegDef(Info.getAVLReg()); + if (!DefMI || !isVectorConfigInstr(*DefMI)) + return; + + VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI); + // case 1 + if (PrevInfo.isValid() && !PrevInfo.isUnknown() && + DefInfo.hasSameAVL(PrevInfo) && + DefInfo.hasSameVLMAX(PrevInfo)) { + if (PrevInfo.hasAVLImm()) + Info.setAVLImm(PrevInfo.getAVLImm()); + else + Info.setAVLReg(PrevInfo.getAVLReg()); + return; } + // case 2 + if (DefInfo.hasSameVLMAX(Info) && + (DefInfo.hasAVLImm() || DefInfo.getAVLReg() == RISCV::X0)) { + if (DefInfo.hasAVLImm()) + Info.setAVLImm(DefInfo.getAVLImm()); + else + Info.setAVLReg(DefInfo.getAVLReg()); + return; + } +} + +// Given a state with which we evaluated MI (see transferBefore above for why +// this might be different than the state MI requested), modify the state to +// reflect the changes MI might make. +void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info, const MachineInstr &MI) { + if (isVectorConfigInstr(MI)) { + Info = getInfoForVSETVLI(MI); + return; + } + + if (RISCV::isFaultFirstLoad(MI)) { + // Update AVL to vl-output of the fault first load. + Info.setAVLReg(MI.getOperand(1).getReg()); + return; + } + + // If this is something that updates VL/VTYPE that we don't know about, set + // the state to unknown. + if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || + MI.modifiesRegister(RISCV::VTYPE)) + Info = VSETVLIInfo::getUnknown(); +} + +bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) { + bool HadVectorOp = false; + + BlockData &BBInfo = BlockInfo[MBB.getNumber()]; + BBInfo.Change = BBInfo.Pred; + for (const MachineInstr &MI : MBB) { + transferBefore(BBInfo.Change, MI); + + if (isVectorConfigInstr(MI) || RISCVII::hasSEWOp(MI.getDesc().TSFlags)) + HadVectorOp = true; - // Initial exit state is whatever change we found in the block. - BBInfo.Exit = BBInfo.Change; + transferAfter(BBInfo.Change, MI); + } return HadVectorOp; } void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) { + BlockData &BBInfo = BlockInfo[MBB.getNumber()]; BBInfo.InQueue = false; @@ -928,9 +1118,20 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) { if (!InInfo.isValid()) return; + // If no change, no need to rerun block + if (InInfo == BBInfo.Pred) + return; + BBInfo.Pred = InInfo; + LLVM_DEBUG(dbgs() << "Entry state of " << printMBBReference(MBB) + << " changed to " << BBInfo.Pred << "\n"); - VSETVLIInfo TmpStatus = BBInfo.Pred.merge(BBInfo.Change); + // Note: It's tempting to cache the state changes here, but due to the + // compatibility checks performed, a block's output state can change based on + // its input state. To cache, we'd have to add logic for finding + // never-compatible state changes. + computeVLVTYPEChanges(MBB); + VSETVLIInfo TmpStatus = BBInfo.Change; // If the new exit value matches the old exit value, we don't need to revisit // any blocks. @@ -938,6 +1139,8 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) { return; BBInfo.Exit = TmpStatus; + LLVM_DEBUG(dbgs() << "Exit state of " << printMBBReference(MBB) + << " changed to " << BBInfo.Exit << "\n"); // Add the successors to the work list so we can propagate the changed exit // status.
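The phase-2 code around computeIncomingVLVTYPE() is a standard forward dataflow fixpoint. A generic sketch of its shape, where State, merge, and transfer stand in for VSETVLIInfo, its intersect, and computeVLVTYPEChanges:

#include <queue>
#include <vector>

template <typename State, typename Merge, typename Transfer>
void fixpoint(const std::vector<std::vector<int>> &Preds,
              const std::vector<std::vector<int>> &Succs,
              std::vector<State> &In, std::vector<State> &Out,
              Merge merge, Transfer transfer) {
  std::queue<int> Work;
  for (int B = 0, E = (int)In.size(); B != E; ++B)
    Work.push(B); // In[] starts at the optimistic "top" value
  while (!Work.empty()) {
    int B = Work.front();
    Work.pop();
    State NewIn = In[B];
    for (int P : Preds[B])           // merge predecessor exit states
      NewIn = merge(NewIn, Out[P]);
    In[B] = NewIn;
    State NewOut = transfer(B, NewIn); // recompute this block's exit
    if (NewOut == Out[B])
      continue;                        // exit unchanged: nothing to propagate
    Out[B] = NewOut;
    for (int S : Succs[B])             // revisit successors whose input moved
      Work.push(S);
  }
}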
@@ -947,10 +1150,10 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) { } // If we weren't able to prove a vsetvli was directly unneeded, it might still -// be/ unneeded if the AVL is a phi node where all incoming values are VL +// be unneeded if the AVL is a phi node where all incoming values are VL // outputs from the last VSETVLI in their respective basic blocks. bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require, - const MachineBasicBlock &MBB) { + const MachineBasicBlock &MBB) const { if (DisableInsertVSETVLPHIOpt) return true; @@ -973,15 +1176,12 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require, const BlockData &PBBInfo = BlockInfo[PBB->getNumber()]; // If the exit from the predecessor has the VTYPE we are looking for // we might be able to avoid a VSETVLI. - if (PBBInfo.Exit.isUnknown() || - !PBBInfo.Exit.hasCompatibleVTYPE(Require, /*Strict*/ false)) + if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require)) return true; // We need the PHI input to the be the output of a VSET(I)VLI. MachineInstr *DefMI = MRI->getVRegDef(InReg); - if (!DefMI || (DefMI->getOpcode() != RISCV::PseudoVSETVLI && - DefMI->getOpcode() != RISCV::PseudoVSETVLIX0 && - DefMI->getOpcode() != RISCV::PseudoVSETIVLI)) + if (!DefMI || !isVectorConfigInstr(*DefMI)) return true; // We found a VSET(I)VLI make sure it matches the output of the @@ -998,42 +1198,42 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require, } void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { - VSETVLIInfo CurInfo; - // BBLocalInfo tracks the VL/VTYPE state the same way BBInfo.Change was - // calculated in computeIncomingVLVTYPE. We need this to apply - // canSkipVSETVLIForLoadStore the same way computeIncomingVLVTYPE did. We - // can't include predecessor information in that decision to avoid disagreeing - // with the global analysis. - VSETVLIInfo BBLocalInfo; - // Only be set if current VSETVLIInfo is from an explicit VSET(I)VLI. - MachineInstr *PrevVSETVLIMI = nullptr; - + VSETVLIInfo CurInfo = BlockInfo[MBB.getNumber()].Pred; + // Track whether the prefix of the block we've scanned is transparent + // (meaning has not yet changed the abstract state). + bool PrefixTransparent = true; for (MachineInstr &MI : MBB) { + const VSETVLIInfo PrevInfo = CurInfo; + transferBefore(CurInfo, MI); + // If this is an explicit VSETVLI or VSETIVLI, update our state. - if (MI.getOpcode() == RISCV::PseudoVSETVLI || - MI.getOpcode() == RISCV::PseudoVSETVLIX0 || - MI.getOpcode() == RISCV::PseudoVSETIVLI) { + if (isVectorConfigInstr(MI)) { // Conservatively, mark the VL and VTYPE as live. assert(MI.getOperand(3).getReg() == RISCV::VL && MI.getOperand(4).getReg() == RISCV::VTYPE && "Unexpected operands where VL and VTYPE should be"); MI.getOperand(3).setIsDead(false); MI.getOperand(4).setIsDead(false); - CurInfo = getInfoForVSETVLI(MI); - BBLocalInfo = getInfoForVSETVLI(MI); - PrevVSETVLIMI = &MI; - continue; + PrefixTransparent = false; } uint64_t TSFlags = MI.getDesc().TSFlags; if (RISCVII::hasSEWOp(TSFlags)) { - VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI); + if (PrevInfo != CurInfo) { + // If this is the first implicit state change, and the state change + // requested can be proven to produce the same register contents, we + // can skip emitting the actual state change and continue as if we + // had since we know the GPR result of the implicit state change + // wouldn't be used and VL/VTYPE registers are correct. 
Note that + // we *do* need to model the state as if it changed as while the + // register contents are unchanged, the abstract model can change. + if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB)) + insertVSETVLI(MBB, MI, CurInfo, PrevInfo); + PrefixTransparent = false; + } + if (RISCVII::hasVLOp(TSFlags)) { - unsigned Offset = 2; - if (RISCVII::hasVecPolicyOp(TSFlags)) - Offset = 3; - MachineOperand &VLOp = - MI.getOperand(MI.getNumExplicitOperands() - Offset); + MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); if (VLOp.isReg()) { // Erase the AVL operand from the instruction. VLOp.setReg(RISCV::NoRegister); @@ -1044,76 +1244,217 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { } MI.addOperand(MachineOperand::CreateReg(RISCV::VTYPE, /*isDef*/ false, /*isImp*/ true)); + } - if (!CurInfo.isValid()) { - // We haven't found any vector instructions or VL/VTYPE changes yet, - // use the predecessor information. - assert(BlockInfo[MBB.getNumber()].Pred.isValid() && - "Expected a valid predecessor state."); - // Don't use predecessor information if there was an earlier instruction - // in this block that allowed a vsetvli to be skipped for load/store. - if (!(BBLocalInfo.isValid() && - canSkipVSETVLIForLoadStore(MI, NewInfo, BBLocalInfo)) && - needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) && - needVSETVLIPHI(NewInfo, MBB)) { - insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred); - CurInfo = NewInfo; - BBLocalInfo = NewInfo; - } + if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || + MI.modifiesRegister(RISCV::VTYPE)) + PrefixTransparent = false; - // We must update BBLocalInfo for every vector instruction. - if (!BBLocalInfo.isValid()) - BBLocalInfo = NewInfo; - } else { - assert(BBLocalInfo.isValid()); - // If this instruction isn't compatible with the previous VL/VTYPE - // we need to insert a VSETVLI. - // If this is a unit-stride or strided load/store, we may be able to use - // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype. - // NOTE: We can't use predecessor information for the store. We must - // treat it the same as the first phase so that we produce the correct - // vl/vtype for succesor blocks. - if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) && - needVSETVLI(NewInfo, CurInfo)) { - // If the previous VL/VTYPE is set by VSETVLI and do not use, Merge it - // with current VL/VTYPE. - bool NeedInsertVSETVLI = true; - if (PrevVSETVLIMI) { - bool HasSameAVL = - CurInfo.hasSameAVL(NewInfo) || - (NewInfo.hasAVLReg() && NewInfo.getAVLReg().isVirtual() && - NewInfo.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg()); - // If these two VSETVLI have the same AVL and the same VLMAX, - // we could merge these two VSETVLI. - if (HasSameAVL && - CurInfo.getSEWLMULRatio() == NewInfo.getSEWLMULRatio()) { - PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE()); - NeedInsertVSETVLI = false; - } - if (isScalarMoveInstr(MI) && - ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) || - (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) && - NewInfo.hasSameVLMAX(CurInfo)) { - PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE()); - NeedInsertVSETVLI = false; - } - } - if (NeedInsertVSETVLI) - insertVSETVLI(MBB, MI, NewInfo, CurInfo); - CurInfo = NewInfo; - BBLocalInfo = NewInfo; - } + transferAfter(CurInfo, MI); + } + + // If we reach the end of the block and our current info doesn't match the + // expected info, insert a vsetvli to correct. 
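Schematically, the phase-3 walk brackets every instruction with the same transfer functions phase 1 used, and only materializes a vsetvli when the abstract state actually moves. A sketch with placeholder types (Instr, State, and the callbacks are not the pass's real signatures):

#include <functional>
#include <vector>

template <typename Instr, typename State>
void emitChanges(std::vector<Instr> &Block, State Cur,
                 std::function<void(State &, const Instr &)> transferBefore,
                 std::function<void(State &, const Instr &)> transferAfter,
                 std::function<void(const Instr &, const State &)> emit) {
  for (Instr &MI : Block) {
    State Prev = Cur;
    transferBefore(Cur, MI);   // state MI needs, possibly minimized
    if (!(Prev == Cur))
      emit(MI, Cur);           // insert the vsetvli realizing the new state
    transferAfter(Cur, MI);    // effects MI itself has on VL/VTYPE
  }
}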
+ if (!UseStrictAsserts) { + const VSETVLIInfo &ExitInfo = BlockInfo[MBB.getNumber()].Exit; + if (CurInfo.isValid() && ExitInfo.isValid() && !ExitInfo.isUnknown() && + CurInfo != ExitInfo) { + // Note there's an implicit assumption here that terminators never use + // or modify VL or VTYPE. Also, fallthrough will return end(). + auto InsertPt = MBB.getFirstInstrTerminator(); + insertVSETVLI(MBB, InsertPt, MBB.findDebugLoc(InsertPt), ExitInfo, + CurInfo); + CurInfo = ExitInfo; + } + } + + if (UseStrictAsserts && CurInfo.isValid()) { + const auto &Info = BlockInfo[MBB.getNumber()]; + if (CurInfo != Info.Exit) { + LLVM_DEBUG(dbgs() << "in block " << printMBBReference(MBB) << "\n"); + LLVM_DEBUG(dbgs() << " begin state: " << Info.Pred << "\n"); + LLVM_DEBUG(dbgs() << " expected end state: " << Info.Exit << "\n"); + LLVM_DEBUG(dbgs() << " actual end state: " << CurInfo << "\n"); + } + assert(CurInfo == Info.Exit && + "InsertVSETVLI dataflow invariant violated"); + } +} + +/// Return true if the VL value configured must be equal to the requested one. +static bool hasFixedResult(const VSETVLIInfo &Info, const RISCVSubtarget &ST) { + if (!Info.hasAVLImm()) + // VLMAX is always the same value. + // TODO: Could extend to other registers by looking at the associated vreg + // def placement. + return RISCV::X0 == Info.getAVLReg(); + + unsigned AVL = Info.getAVLImm(); + unsigned SEW = Info.getSEW(); + unsigned AVLInBits = AVL * SEW; + + unsigned LMul; + bool Fractional; + std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(Info.getVLMUL()); + + if (Fractional) + return ST.getRealMinVLen() / LMul >= AVLInBits; + return ST.getRealMinVLen() * LMul >= AVLInBits; +} + +/// Perform simple partial redundancy elimination of the VSETVLI instructions +/// we're about to insert by looking for cases where we can PRE from the +/// beginning of one block to the end of one of its predecessors. Specifically, +/// this is geared to catch the common case of a fixed-length vsetvli in a +/// single-block loop when it could execute once in the preheader instead. +void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>(); + + if (!BlockInfo[MBB.getNumber()].Pred.isUnknown()) + return; + + MachineBasicBlock *UnavailablePred = nullptr; + VSETVLIInfo AvailableInfo; + for (MachineBasicBlock *P : MBB.predecessors()) { + const VSETVLIInfo &PredInfo = BlockInfo[P->getNumber()].Exit; + if (PredInfo.isUnknown()) { + if (UnavailablePred) + return; + UnavailablePred = P; + } else if (!AvailableInfo.isValid()) { + AvailableInfo = PredInfo; + } else if (AvailableInfo != PredInfo) { + return; + } + } + + // Unreachable, single pred, or full redundancy. Note that FRE is handled by + // phase 3. + if (!UnavailablePred || !AvailableInfo.isValid()) + return; + + // Critical edge - TODO: consider splitting? + if (UnavailablePred->succ_size() != 1) + return; + + // If VL can be less than AVL, then we can't reduce the frequency of + // execution. + if (!hasFixedResult(AvailableInfo, ST)) + return; + + // Does it actually let us remove an implicit transition in MBB? + bool Found = false; + for (auto &MI : MBB) { + if (isVectorConfigInstr(MI)) + return; + + const uint64_t TSFlags = MI.getDesc().TSFlags; + if (RISCVII::hasSEWOp(TSFlags)) { + if (AvailableInfo != computeInfoForInstr(MI, TSFlags, MRI)) + return; + Found = true; + break; + } + } + if (!Found) + return; + + // Finally, update both data flow state and insert the actual vsetvli.
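hasFixedResult() above reduces to a single inequality: with an immediate AVL, vl == AVL exactly when AVL*SEW fits in RealMinVLen*LMUL, since vl = min(AVL, VLMAX). A sketch with LMUL as a rational Num/Den (fixedResult is an illustrative helper):

#include <cstdint>

// vl is provably AVL iff AVL * SEW <= RealMinVLen * LMUL, checked with
// cross-multiplication so fractional LMUL stays integral.
static bool fixedResult(unsigned AVL, unsigned SEW, unsigned LMulNum,
                        unsigned LMulDen, unsigned RealMinVLen) {
  return (uint64_t)RealMinVLen * LMulNum >= (uint64_t)AVL * SEW * LMulDen;
}
// e.g. fixedResult(4, 32, 1, 1, 128) is true (VLMAX is exactly 4 at
// VLEN=128, e32, m1), while fixedResult(5, 32, 1, 1, 128) is false:
// a VLEN=128 part would clamp vl to 4, so the hoist would change behavior.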
+ // Doing both keeps the code in sync with the dataflow results, which + // is critical for correctness of phase 3. + auto OldInfo = BlockInfo[UnavailablePred->getNumber()].Exit; + LLVM_DEBUG(dbgs() << "PRE VSETVLI from " << MBB.getName() << " to " + << UnavailablePred->getName() << " with state " + << AvailableInfo << "\n"); + BlockInfo[UnavailablePred->getNumber()].Exit = AvailableInfo; + BlockInfo[MBB.getNumber()].Pred = AvailableInfo; + + // Note there's an implicit assumption here that terminators never use + // or modify VL or VTYPE. Also, fallthrough will return end(). + auto InsertPt = UnavailablePred->getFirstInstrTerminator(); + insertVSETVLI(*UnavailablePred, InsertPt, + UnavailablePred->findDebugLoc(InsertPt), + AvailableInfo, OldInfo); +} + +static void doUnion(DemandedFields &A, DemandedFields B) { + A.VL |= B.VL; + A.SEW |= B.SEW; + A.LMUL |= B.LMUL; + A.SEWLMULRatio |= B.SEWLMULRatio; + A.TailPolicy |= B.TailPolicy; + A.MaskPolicy |= B.MaskPolicy; +} + +// Return true if we can mutate PrevMI's VTYPE to match MI's +// without changing any of the fields which have been used. +// TODO: Restructure code to allow code reuse between this and isCompatible +// above. +static bool canMutatePriorConfig(const MachineInstr &PrevMI, + const MachineInstr &MI, + const DemandedFields &Used) { + // TODO: Extend this to handle cases where VL does change, but VL + // has not been used. (e.g. over a vmv.x.s) + if (!isVLPreservingConfig(MI)) + // Note: `vsetvli x0, x0, vtype' is the canonical instruction + // for this case. If you find yourself wanting to add other forms + // to this "unused VTYPE" case, we're probably missing a + // canonicalization earlier. + return false; + + if (!PrevMI.getOperand(2).isImm() || !MI.getOperand(2).isImm()) + return false; + + auto PriorVType = PrevMI.getOperand(2).getImm(); + auto VType = MI.getOperand(2).getImm(); + return areCompatibleVTYPEs(PriorVType, VType, Used); +} + +void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { + MachineInstr *PrevMI = nullptr; + DemandedFields Used; + SmallVector<MachineInstr *> ToDelete; + for (MachineInstr &MI : MBB) { + // Note: Must be *before* vsetvli handling to account for config cases + // which only change some subfields. + doUnion(Used, getDemanded(MI)); + + if (!isVectorConfigInstr(MI)) + continue; + + if (PrevMI) { + if (!Used.VL && !Used.usedVTYPE()) { + ToDelete.push_back(PrevMI); + // fallthrough + } else if (canMutatePriorConfig(*PrevMI, MI, Used)) { + PrevMI->getOperand(2).setImm(MI.getOperand(2).getImm()); + ToDelete.push_back(&MI); + // Leave PrevMI unchanged + continue; } } + PrevMI = &MI; + Used = getDemanded(MI); + Register VRegDef = MI.getOperand(0).getReg(); + if (VRegDef != RISCV::X0 && + !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef))) + Used.VL = true; + } - // If this is something updates VL/VTYPE that we don't know about, set - // the state to unknown.
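The doLocalPostpass() walk can be pictured on a toy DemandedFields: union what each instruction demands since the last vsetvli, and the prior config is removable when nothing was demanded in between. Illustrative types only, mirroring the field names above:

struct Demanded {
  bool VL = false, SEW = false, LMUL = false, SEWLMULRatio = false,
       TailPolicy = false, MaskPolicy = false;
  bool usedVTYPE() const {
    return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy;
  }
};

// The same field-wise OR as doUnion() above.
static void unionInto(Demanded &A, const Demanded &B) {
  A.VL |= B.VL;
  A.SEW |= B.SEW;
  A.LMUL |= B.LMUL;
  A.SEWLMULRatio |= B.SEWLMULRatio;
  A.TailPolicy |= B.TailPolicy;
  A.MaskPolicy |= B.MaskPolicy;
}

// A vsetvli whose VL and VTYPE were never consumed before the next config
// instruction is dead and can be deleted outright.
static bool priorConfigIsDead(const Demanded &UsedSincePrior) {
  return !UsedSincePrior.VL && !UsedSincePrior.usedVTYPE();
}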
- if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || - MI.modifiesRegister(RISCV::VTYPE)) { - CurInfo = VSETVLIInfo::getUnknown(); - BBLocalInfo = VSETVLIInfo::getUnknown(); - PrevVSETVLIMI = nullptr; + for (auto *MI : ToDelete) + MI->eraseFromParent(); +} + +void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) { + for (auto I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I++; + if (RISCV::isFaultFirstLoad(MI)) { + Register VLOutput = MI.getOperand(1).getReg(); + if (!MRI->use_nodbg_empty(VLOutput)) + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(RISCV::PseudoReadVL), + VLOutput); + // We don't use the vl output of the VLEFF/VLSEGFF anymore. + MI.getOperand(1).setReg(RISCV::X0); } } } @@ -1124,6 +1465,8 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { if (!ST.hasVInstructions()) return false; + LLVM_DEBUG(dbgs() << "Entering InsertVSETVLI for " << MF.getName() << "\n"); + TII = ST.getInstrInfo(); MRI = &MF.getRegInfo(); @@ -1133,34 +1476,77 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { bool HaveVectorOp = false; // Phase 1 - determine how VL/VTYPE are affected by the each block. - for (const MachineBasicBlock &MBB : MF) + for (const MachineBasicBlock &MBB : MF) { HaveVectorOp |= computeVLVTYPEChanges(MBB); + // Initial exit state is whatever change we found in the block. + BlockData &BBInfo = BlockInfo[MBB.getNumber()]; + BBInfo.Exit = BBInfo.Change; + LLVM_DEBUG(dbgs() << "Initial exit state of " << printMBBReference(MBB) + << " is " << BBInfo.Exit << "\n"); + + } // If we didn't find any instructions that need VSETVLI, we're done. - if (HaveVectorOp) { - // Phase 2 - determine the exit VL/VTYPE from each block. We add all - // blocks to the list here, but will also add any that need to be revisited - // during Phase 2 processing. - for (const MachineBasicBlock &MBB : MF) { - WorkList.push(&MBB); - BlockInfo[MBB.getNumber()].InQueue = true; - } - while (!WorkList.empty()) { - const MachineBasicBlock &MBB = *WorkList.front(); - WorkList.pop(); - computeIncomingVLVTYPE(MBB); - } + if (!HaveVectorOp) { + BlockInfo.clear(); + return false; + } - // Phase 3 - add any vsetvli instructions needed in the block. Use the - // Phase 2 information to avoid adding vsetvlis before the first vector - // instruction in the block if the VL/VTYPE is satisfied by its - // predecessors. - for (MachineBasicBlock &MBB : MF) - emitVSETVLIs(MBB); + // Phase 2 - determine the exit VL/VTYPE from each block. We add all + // blocks to the list here, but will also add any that need to be revisited + // during Phase 2 processing. + for (const MachineBasicBlock &MBB : MF) { + WorkList.push(&MBB); + BlockInfo[MBB.getNumber()].InQueue = true; + } + while (!WorkList.empty()) { + const MachineBasicBlock &MBB = *WorkList.front(); + WorkList.pop(); + computeIncomingVLVTYPE(MBB); } - BlockInfo.clear(); + // Perform partial redundancy elimination of vsetvli transitions. + for (MachineBasicBlock &MBB : MF) + doPRE(MBB); + + // Phase 3 - add any vsetvli instructions needed in the block. Use the + // Phase 2 information to avoid adding vsetvlis before the first vector + // instruction in the block if the VL/VTYPE is satisfied by its + // predecessors. + for (MachineBasicBlock &MBB : MF) + emitVSETVLIs(MBB); + + // Now that all vsetvlis are explicit, go through and do block local + // DSE and peephole based demanded fields based transforms. 
Note that + // this *must* be done outside the main dataflow so long as we allow + // any cross block analysis within the dataflow. We can't have both + // demanded fields based mutation and non-local analysis in the + // dataflow at the same time without introducing inconsistencies. + for (MachineBasicBlock &MBB : MF) + doLocalPostpass(MBB); + + // Once we're fully done rewriting all the instructions, do a final pass + // through to check for VSETVLIs which write to an unused destination. + // For the non X0, X0 variant, we can replace the destination register + // with X0 to reduce register pressure. This is really a generic + // optimization which can be applied to any dead def (TODO: generalize). + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == RISCV::PseudoVSETVLI || + MI.getOpcode() == RISCV::PseudoVSETIVLI) { + Register VRegDef = MI.getOperand(0).getReg(); + if (VRegDef != RISCV::X0 && MRI->use_nodbg_empty(VRegDef)) + MI.getOperand(0).setReg(RISCV::X0); + } + } + } + // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output + // of VLEFF/VLSEGFF. + for (MachineBasicBlock &MBB : MF) + insertReadVL(MBB); + + BlockInfo.clear(); return HaveVectorOp; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index f99d0f56c406..18b31f85bfdb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -196,7 +196,10 @@ class RVInstEncoding) + .addReg(RISCV::X0); + return; + } + // FPR->FPR copies and VR->VR copies. unsigned Opc; bool IsScalableVector = true; @@ -631,11 +641,7 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register DstReg, uint64_t Val, MachineInstr::MIFlag Flag) const { - MachineFunction *MF = MBB.getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register SrcReg = RISCV::X0; - Register Result = MRI.createVirtualRegister(&RISCV::GPRRegClass); - unsigned Num = 0; if (!STI.is64Bit() && !isInt<32>(Val)) report_fatal_error("Should only materialize 32-bit constants for RV32"); @@ -645,34 +651,34 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, assert(!Seq.empty()); for (RISCVMatInt::Inst &Inst : Seq) { - // Write the final result to DstReg if it's the last instruction in the Seq. - // Otherwise, write the result to the temp register. - if (++Num == Seq.size()) - Result = DstReg; - - if (Inst.Opc == RISCV::LUI) { - BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result) + switch (Inst.getOpndKind()) { + case RISCVMatInt::Imm: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) .addImm(Inst.Imm) .setMIFlag(Flag); - } else if (Inst.Opc == RISCV::ADD_UW) { - BuildMI(MBB, MBBI, DL, get(RISCV::ADD_UW), Result) + break; + case RISCVMatInt::RegX0: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) .addReg(SrcReg, RegState::Kill) .addReg(RISCV::X0) .setMIFlag(Flag); - } else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD || - Inst.Opc == RISCV::SH3ADD) { - BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result) + break; + case RISCVMatInt::RegReg: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) .addReg(SrcReg, RegState::Kill) .addReg(SrcReg, RegState::Kill) .setMIFlag(Flag); - } else { - BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result) + break; + case RISCVMatInt::RegImm: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) .addReg(SrcReg, RegState::Kill) .addImm(Inst.Imm) .setMIFlag(Flag); + break; } + // Only the first instruction has X0 as its source. 
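The RISCVMatInt sequences consumed by movImm() build constants from a LUI plus a sign-extended 12-bit add; the +0x800 rounding trick is the key step. A self-contained sketch for the 32-bit case (splitImm32 is my own helper name):

#include <cassert>
#include <cstdint>

// Split Val into a 20-bit upper immediate and a signed 12-bit remainder so
// that LUI(Hi20) + ADDI(W)(Lo12) reconstructs Val. Adding 0x800 first rounds
// Hi20 up whenever the sign-extending Lo12 will be negative.
static void splitImm32(int32_t Val, int32_t &Hi20, int32_t &Lo12) {
  Hi20 = (int32_t)(((uint32_t)Val + 0x800u) >> 12);   // LUI operand
  Lo12 = (int32_t)((uint32_t)Val << 20) >> 20;        // sign-extended ADDI(W)
  assert((int32_t)(((uint32_t)Hi20 << 12) + (uint32_t)Lo12) == Val);
}
// e.g. Val = 0x12345678 -> Hi20 = 0x12345, Lo12 = 0x678;
//      Val = 0x12345FFF -> Hi20 = 0x12346, Lo12 = -1.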
- SrcReg = Result; + SrcReg = DstReg; } } @@ -1052,29 +1058,25 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, switch (OpType) { default: llvm_unreachable("Unexpected operand type"); - case RISCVOp::OPERAND_UIMM2: - Ok = isUInt<2>(Imm); - break; - case RISCVOp::OPERAND_UIMM3: - Ok = isUInt<3>(Imm); - break; - case RISCVOp::OPERAND_UIMM4: - Ok = isUInt<4>(Imm); - break; - case RISCVOp::OPERAND_UIMM5: - Ok = isUInt<5>(Imm); - break; - case RISCVOp::OPERAND_UIMM7: - Ok = isUInt<7>(Imm); - break; - case RISCVOp::OPERAND_UIMM12: - Ok = isUInt<12>(Imm); - break; + + // clang-format off +#define CASE_OPERAND_UIMM(NUM) \ + case RISCVOp::OPERAND_UIMM##NUM: \ + Ok = isUInt<NUM>(Imm); \ + break; + CASE_OPERAND_UIMM(2) + CASE_OPERAND_UIMM(3) + CASE_OPERAND_UIMM(4) + CASE_OPERAND_UIMM(5) + CASE_OPERAND_UIMM(7) + CASE_OPERAND_UIMM(12) + CASE_OPERAND_UIMM(20) + // clang-format on case RISCVOp::OPERAND_SIMM12: Ok = isInt<12>(Imm); break; - case RISCVOp::OPERAND_UIMM20: - Ok = isUInt<20>(Imm); + case RISCVOp::OPERAND_SIMM12_LSB00000: + Ok = isShiftedInt<7, 5>(Imm); break; case RISCVOp::OPERAND_UIMMLOG2XLEN: if (STI.getTargetTriple().isArch64Bit()) @@ -1205,6 +1207,11 @@ enum MachineOutlinerConstructionID { MachineOutlinerDefault }; +bool RISCVInstrInfo::shouldOutlineFromFunctionByDefault( + MachineFunction &MF) const { + return MF.getFunction().hasMinSize(); +} + outliner::OutlinedFunction RISCVInstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { @@ -1212,10 +1219,7 @@ outliner::OutlinedFunction RISCVInstrInfo::getOutliningCandidateInfo( // be used to setup the function call. auto CannotInsertCall = [](outliner::Candidate &C) { const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo(); - - C.initLRU(*TRI); - LiveRegUnits LRU = C.LRU; - return !LRU.available(RISCV::X5); + return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI); }; llvm::erase_if(RepeatedSequenceLocs, CannotInsertCall); @@ -1258,7 +1262,12 @@ RISCVInstrInfo::getOutliningType(MachineBasicBlock::iterator &MBBI, if (MI.isPosition()) { // We can manually strip out CFI instructions later. if (MI.isCFIInstruction()) - return outliner::InstrType::Invisible; + // If the current function has exception-handling code, we can't outline + // and strip these CFI instructions, since that may break the .eh_frame + // section needed for unwinding. + return MI.getMF()->getFunction().needsUnwindTableEntry() + ? outliner::InstrType::Illegal + : outliner::InstrType::Invisible; return outliner::InstrType::Illegal; } @@ -1325,7 +1334,7 @@ void RISCVInstrInfo::buildOutlinedFrame( MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, - MachineFunction &MF, const outliner::Candidate &C) const { + MachineFunction &MF, outliner::Candidate &C) const { // Add in a call instruction to the outlined function at the given location. It = MBB.insert(It, @@ -1335,6 +1344,53 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall( return It; } +// MIR printer helper function to annotate Operands with a comment. +std::string RISCVInstrInfo::createMIROperandComment( + const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, + const TargetRegisterInfo *TRI) const { + // Print a generic comment for this operand if there is one. + std::string GenericComment = + TargetInstrInfo::createMIROperandComment(MI, Op, OpIdx, TRI); + if (!GenericComment.empty()) + return GenericComment; + + // If not, we must have an immediate operand.
+ if (!Op.isImm()) + return std::string(); + + std::string Comment; + raw_string_ostream OS(Comment); + + uint64_t TSFlags = MI.getDesc().TSFlags; + + // Print the full VType operand of vsetvli/vsetivli instructions, and the SEW + // operand of vector codegen pseudos. + if ((MI.getOpcode() == RISCV::VSETVLI || MI.getOpcode() == RISCV::VSETIVLI || + MI.getOpcode() == RISCV::PseudoVSETVLI || + MI.getOpcode() == RISCV::PseudoVSETIVLI || + MI.getOpcode() == RISCV::PseudoVSETVLIX0) && + OpIdx == 2) { + unsigned Imm = MI.getOperand(OpIdx).getImm(); + RISCVVType::printVType(Imm, OS); + } else if (RISCVII::hasSEWOp(TSFlags)) { + unsigned NumOperands = MI.getNumExplicitOperands(); + bool HasPolicy = RISCVII::hasVecPolicyOp(TSFlags); + + // The SEW operand is before any policy operand. + if (OpIdx != NumOperands - HasPolicy - 1) + return std::string(); + + unsigned Log2SEW = MI.getOperand(OpIdx).getImm(); + unsigned SEW = Log2SEW ? 1 << Log2SEW : 8; + assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); + + OS << "e" << SEW; + } + + OS.flush(); + return Comment; +} + // clang-format off #define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL) \ RISCV::PseudoV##OP##_##TYPE##_##LMUL @@ -1653,6 +1709,12 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, case CASE_WIDEOP_OPCODE_LMULS(WADDU_WV): case CASE_WIDEOP_OPCODE_LMULS(WSUB_WV): case CASE_WIDEOP_OPCODE_LMULS(WSUBU_WV): { + // If the tail policy is undisturbed we can't convert. + assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) && + MI.getNumExplicitOperands() == 6); + if ((MI.getOperand(5).getImm() & 1) == 0) + return nullptr; + // clang-format off unsigned NewOpc; switch (MI.getOpcode()) { @@ -1722,11 +1784,10 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, "Reserve the stack by the multiple of one vector size."); MachineRegisterInfo &MRI = MF.getRegInfo(); - const RISCVInstrInfo *TII = MF.getSubtarget().getInstrInfo(); int64_t NumOfVReg = Amount / 8; Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL) + BuildMI(MBB, II, DL, get(RISCV::PseudoReadVLENB), VL) .setMIFlag(Flag); assert(isInt<32>(NumOfVReg) && "Expect the number of vector registers within 32-bits."); @@ -1734,47 +1795,55 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, uint32_t ShiftAmount = Log2_32(NumOfVReg); if (ShiftAmount == 0) return VL; - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL) + BuildMI(MBB, II, DL, get(RISCV::SLLI), VL) .addReg(VL, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); + } else if ((NumOfVReg == 3 || NumOfVReg == 5 || NumOfVReg == 9) && + STI.hasStdExtZba()) { + // We can use Zba SHXADD instructions for multiply in some cases. + // TODO: Generalize to SHXADD+SLLI. 
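The strength-reduction choices in getVLENFactoredAmount() above, modeled on plain integers (scaleByConst is illustrative; the real code emits SLLI/SHxADD/ADD/SUB/MUL on machine registers):

#include <cstdint>

// Scale X by a constant N without a multiply where possible: slli for powers
// of two, Zba shNadd for 3/5/9, slli+add / slli+sub for 2^k +/- 1, mul last.
static uint64_t scaleByConst(uint64_t X, unsigned N) {
  auto isPow2 = [](unsigned V) { return V && !(V & (V - 1)); };
  auto log2u = [](unsigned V) { unsigned L = 0; while (V >>= 1) ++L; return L; };
  if (isPow2(N))
    return X << log2u(N);            // slli
  if (N == 3 || N == 5 || N == 9)
    return (X << log2u(N - 1)) + X;  // sh1add / sh2add / sh3add (one Zba op)
  if (isPow2(N - 1))
    return (X << log2u(N - 1)) + X;  // slli + add
  if (isPow2(N + 1))
    return (X << log2u(N + 1)) - X;  // slli + sub
  return X * N;                      // materialize N, then mul (needs M ext)
}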
+ unsigned Opc; + switch (NumOfVReg) { + default: llvm_unreachable("Unexpected number of vregs"); + case 3: Opc = RISCV::SH1ADD; break; + case 5: Opc = RISCV::SH2ADD; break; + case 9: Opc = RISCV::SH3ADD; break; + } + BuildMI(MBB, II, DL, get(Opc), VL) + .addReg(VL, RegState::Kill) + .addReg(VL) + .setMIFlag(Flag); } else if (isPowerOf2_32(NumOfVReg - 1)) { Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass); uint32_t ShiftAmount = Log2_32(NumOfVReg - 1); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister) + BuildMI(MBB, II, DL, get(RISCV::SLLI), ScaledRegister) .addReg(VL) .addImm(ShiftAmount) .setMIFlag(Flag); - BuildMI(MBB, II, DL, TII->get(RISCV::ADD), VL) + BuildMI(MBB, II, DL, get(RISCV::ADD), VL) .addReg(ScaledRegister, RegState::Kill) .addReg(VL, RegState::Kill) .setMIFlag(Flag); } else if (isPowerOf2_32(NumOfVReg + 1)) { Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass); uint32_t ShiftAmount = Log2_32(NumOfVReg + 1); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister) + BuildMI(MBB, II, DL, get(RISCV::SLLI), ScaledRegister) .addReg(VL) .addImm(ShiftAmount) .setMIFlag(Flag); - BuildMI(MBB, II, DL, TII->get(RISCV::SUB), VL) + BuildMI(MBB, II, DL, get(RISCV::SUB), VL) .addReg(ScaledRegister, RegState::Kill) .addReg(VL, RegState::Kill) .setMIFlag(Flag); } else { Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass); - if (!isInt<12>(NumOfVReg)) - movImm(MBB, II, DL, N, NumOfVReg); - else { - BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), N) - .addReg(RISCV::X0) - .addImm(NumOfVReg) - .setMIFlag(Flag); - } - if (!MF.getSubtarget().hasStdExtM()) + movImm(MBB, II, DL, N, NumOfVReg, Flag); + if (!STI.hasStdExtM()) MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ MF.getFunction(), "M-extension must be enabled to calculate the vscaled size/offset."}); - BuildMI(MBB, II, DL, TII->get(RISCV::MUL), VL) + BuildMI(MBB, II, DL, get(RISCV::MUL), VL) .addReg(VL, RegState::Kill) .addReg(N, RegState::Kill) .setMIFlag(Flag); @@ -1811,20 +1880,18 @@ static bool isRVVWholeLoadStore(unsigned Opcode) { } } -bool RISCVInstrInfo::isRVVSpill(const MachineInstr &MI, bool CheckFIs) const { +bool RISCV::isRVVSpill(const MachineInstr &MI) { // RVV lacks any support for immediate addressing for stack addresses, so be // conservative. unsigned Opcode = MI.getOpcode(); if (!RISCVVPseudosTable::getPseudoInfo(Opcode) && !isRVVWholeLoadStore(Opcode) && !isRVVSpillForZvlsseg(Opcode)) return false; - return !CheckFIs || any_of(MI.operands(), [](const MachineOperand &MO) { - return MO.isFI(); - }); + return true; } Optional> -RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const { +RISCV::isRVVSpillForZvlsseg(unsigned Opcode) { switch (Opcode) { default: return None; @@ -1863,3 +1930,8 @@ RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const { return std::make_pair(8u, 1u); } } + +bool RISCV::isFaultFirstLoad(const MachineInstr &MI) { + return MI.getNumExplicitDefs() == 2 && MI.modifiesRegister(RISCV::VL) && + !MI.isInlineAsm(); +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index da0877c4299a..5368437618bd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -135,6 +135,8 @@ public: virtual bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const override; + bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + // Calculate target-specific information for a set of outlining candidates. 
outliner::OutlinedFunction getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; @@ -153,7 +155,7 @@ public: virtual MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const override; + outliner::Candidate &C) const override; bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; @@ -164,25 +166,31 @@ public: MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override; + // MIR printer helper function to annotate Operands with a comment. + std::string + createMIROperandComment(const MachineInstr &MI, const MachineOperand &Op, + unsigned OpIdx, + const TargetRegisterInfo *TRI) const override; + Register getVLENFactoredAmount( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator II, const DebugLoc &DL, int64_t Amount, MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; - // Returns true if the given MI is an RVV instruction opcode for which we may - // expect to see a FrameIndex operand. When CheckFIs is true, the instruction - // must contain at least one FrameIndex operand. - bool isRVVSpill(const MachineInstr &MI, bool CheckFIs) const; - - Optional<std::pair<unsigned, unsigned>> - isRVVSpillForZvlsseg(unsigned Opcode) const; - protected: const RISCVSubtarget &STI; }; namespace RISCV { +// Returns true if the given MI is an RVV instruction opcode for which we may +// expect to see a FrameIndex operand. +bool isRVVSpill(const MachineInstr &MI); + +Optional<std::pair<unsigned, unsigned>> isRVVSpillForZvlsseg(unsigned Opcode); + +bool isFaultFirstLoad(const MachineInstr &MI); + // Implemented in RISCVGenInstrInfo.inc int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ee6a74b7f14f..ee4c026af8f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -83,6 +83,21 @@ def riscv_read_cycle_wide : SDNode<"RISCVISD::READ_CYCLE_WIDE", SDT_RISCVReadCycleWide, [SDNPHasChain, SDNPSideEffect]>; +def riscv_add_lo : SDNode<"RISCVISD::ADD_LO", SDTIntBinOp>; +def riscv_hi : SDNode<"RISCVISD::HI", SDTIntUnaryOp>; +def riscv_lla : SDNode<"RISCVISD::LLA", SDTIntUnaryOp>; +def riscv_add_tprel : SDNode<"RISCVISD::ADD_TPREL", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>]>>; + +def riscv_la : SDNode<"RISCVISD::LA", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def riscv_la_tls_ie : SDNode<"RISCVISD::LA_TLS_IE", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def riscv_la_tls_gd : SDNode<"RISCVISD::LA_TLS_GD", SDTIntUnaryOp>; + //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// @@ -105,6 +120,19 @@ def ImmZeroAsmOperand : AsmOperandClass { let DiagnosticType = !strconcat("Invalid", Name); } +// A parse method for (${gpr}) or 0(${gpr}), where the 0 is silently ignored.
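A toy acceptor for the two operand spellings described in that comment; the real logic lives in the RISCVAsmParser parse method named below, so this only illustrates the accepted shapes:

#include <string>

// Accept "(reg)" or "0(reg)", dropping the literal zero offset.
static bool parseZeroOffsetMem(const std::string &Tok, std::string &Reg) {
  if (Tok.empty())
    return false;
  size_t I = 0;
  if (Tok[I] == '0')           // optional literal zero offset
    ++I;
  if (I >= Tok.size() || Tok[I] != '(' || Tok.back() != ')')
    return false;
  Reg = Tok.substr(I + 1, Tok.size() - I - 2);
  return !Reg.empty();
}
// parseZeroOffsetMem("0(a0)", R) and parseZeroOffsetMem("(a0)", R) both
// succeed with R == "a0"; "4(a0)" is rejected.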
+def ZeroOffsetMemOpOperand : AsmOperandClass { + let Name = "ZeroOffsetMemOpOperand"; + let RenderMethod = "addRegOperands"; + let PredicateMethod = "isGPR"; + let ParserMethod = "parseZeroOffsetMemOp"; +} + +def GPRMemZeroOffset : RegisterOperand<GPR> { + let ParserMatchClass = ZeroOffsetMemOpOperand; + let PrintMethod = "printZeroOffsetMemOp"; +} + class SImmAsmOperand<int width, string suffix = ""> : ImmAsmOperand<"S", width, suffix> { } @@ -334,10 +362,19 @@ def ixlenimm_li : Operand<XLenVT> { // Standalone (codegen-only) immleaf patterns. -// A 12-bit signed immediate plus one where the imm range will be -2047~2048. +// A 12-bit signed immediate plus one where the imm range will be [-2047, 2048]. def simm12_plus1 : ImmLeaf<XLenVT, [{return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]>; +// A 12-bit signed immediate minus one, excluding zero +def simm12_minus1_nonzero : PatLeaf<(imm), [{ + if (!N->hasOneUse()) + return false; + // The immediate operand must be in range [-2049, 0) or (0, 2046]. + int64_t Imm = N->getSExtValue(); + return (Imm >= -2049 && Imm < 0) || (Imm > 0 && Imm <= 2046); +}]>; + // A 6-bit constant greater than 32. def uimm6gt32 : ImmLeaf<XLenVT, [{ return isUInt<6>(Imm) && Imm > 32; @@ -345,8 +382,10 @@ def uimm6gt32 : ImmLeaf<XLenVT, [{ }]>; def FrameAddrRegImm : ComplexPattern<iPTR, 2, "SelectFrameAddrRegImm", [frameindex, or, add]>; def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm">; // Return the negation of an immediate value. def NegImm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N), N->getValueType(0)); }]>; -// Return an immediate value plus 32. -def ImmPlus32 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue() + 32, SDLoc(N), +// Return an immediate value plus 1. +def ImmPlus1 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue() + 1, SDLoc(N), N->getValueType(0)); }]>; @@ -380,7 +419,9 @@ def ImmSubFrom32 : SDNodeXForm<imm, [{ }]>; // Check if (add r, imm) can be optimized to (ADDI (ADDI r, imm0), imm1), -// in which imm = imm0 + imm1 and both imm0 and imm1 are simm12. +// in which imm = imm0 + imm1 and both imm0 and imm1 are simm12. We make imm0 +// as large as possible and imm1 as small as possible so that we might be able +// to use c.addi for the small immediate. def AddiPair : PatLeaf<(imm), [{ if (!N->hasOneUse()) return false; @@ -389,19 +430,27 @@ def AddiPair : PatLeaf<(imm), [{ return (-4096 <= Imm && Imm <= -2049) || (2048 <= Imm && Imm <= 4094); }]>; -// Return imm/2. -def AddiPairImmA : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), +// Return imm - (imm < 0 ? -2048 : 2047). +def AddiPairImmSmall : SDNodeXForm<imm, [{ int64_t Imm = N->getSExtValue(); + int64_t Adj = N->getSExtValue() < 0 ? -2048 : 2047; + return CurDAG->getTargetConstant(Imm - Adj, SDLoc(N), N->getValueType(0)); }]>; -// Return imm - imm/2. -def AddiPairImmB : SDNodeXForm<imm, [{ int64_t Imm = N->getSExtValue(); - return CurDAG->getTargetConstant(Imm - Imm / 2, SDLoc(N), +// Return -2048 if immediate is negative or 2047 if positive. These are the +// largest simm12 values. +def AddiPairImmLarge : SDNodeXForm<imm, [{ int64_t Imm = N->getSExtValue() < 0 ? -2048 : 2047; + return CurDAG->getTargetConstant(Imm, SDLoc(N), N->getValueType(0)); }]>; +def TrailingZeros : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(), + SDLoc(N), N->getValueType(0)); +}]>; + def XLenSubTrailingOnes : SDNodeXForm<imm, [{ uint64_t XLen = Subtarget->getXLen(); uint64_t TrailingOnes = N->getAPIntValue().countTrailingOnes(); @@ -410,7 +459,13 @@ def XLenSubTrailingOnes : SDNodeXForm<imm, [{ }]>; // Checks if this mask is a non-empty sequence of ones starting at the -// least significant bit with the remainder zero and exceeds simm12. +// most/least significant bit with the remainder zero and exceeds simm32/simm12.
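The LeadingOnesMask/TrailingOnesMask patterns that follow trade an AND against a large mask constant for a pair of shifts, so the constant never has to be materialized. The equivalent integer identities (assuming a non-empty mask, so every shift amount stays below the word width):

#include <cstdint>

// mask = ~0 << TZ (ones at the top): AND clears the low TZ bits.
static uint64_t andLeadingOnes(uint64_t X, unsigned TZ) {
  return (X >> TZ) << TZ;   // matches the SLLI (SRLI ...) pattern
}

// mask = ~0 >> LZ (ones at the bottom): AND clears the high LZ bits.
static uint64_t andTrailingOnes(uint64_t X, unsigned LZ) {
  return (X << LZ) >> LZ;   // matches the SRLI (SLLI ...) pattern
}
// e.g. andLeadingOnes(X, 32) == (X & 0xFFFFFFFF00000000ULL).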
+def LeadingOnesMask : PatLeaf<(imm), [{ + if (!N->hasOneUse()) + return false; + return !isInt<32>(N->getSExtValue()) && isMask_64(~N->getSExtValue()); +}], TrailingZeros>; + def TrailingOnesMask : PatLeaf<(imm), [{ if (!N->hasOneUse()) return false; @@ -437,20 +492,35 @@ class BranchCC_rri funct3, string opcodestr> let isTerminator = 1; } -let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { class Load_ri funct3, string opcodestr> : RVInstI; +class HLoad_r funct7, bits<5> funct5, string opcodestr> + : RVInstR { + let rs2 = funct5; +} +} + // Operands for stores are in the order srcreg, base, offset rather than // reflecting the order these fields are specified in the instruction // encoding. -let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { class Store_rri funct3, string opcodestr> : RVInstS; +class HStore_rr funct7, string opcodestr> + : RVInstR { + let rd = 0; +} +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class ALU_ri funct3, string opcodestr> : RVInstI imm11_7, bits<3> funct3, string opcodestr> Sched<[WriteShiftImm, ReadShiftImm]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class ALU_rr funct7, bits<3> funct3, string opcodestr> +class ALU_rr funct7, bits<3> funct3, string opcodestr, + bit Commutable = 0> : RVInstR; + opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} let hasNoSchedulingInfo = 1, hasSideEffects = 1, mayLoad = 0, mayStore = 0 in @@ -490,15 +563,25 @@ class ShiftW_ri imm11_5, bits<3> funct3, string opcodestr> Sched<[WriteShiftImm32, ReadShiftImm32]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class ALUW_rr funct7, bits<3> funct3, string opcodestr> +class ALUW_rr funct7, bits<3> funct3, string opcodestr, + bit Commutable = 0> : RVInstR; + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in class Priv funct7> : RVInstR; +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class Priv_rr funct7> + : RVInstR { + let rd = 0; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -558,16 +641,26 @@ def SLLI : Shift_ri<0b00000, 0b001, "slli">; def SRLI : Shift_ri<0b00000, 0b101, "srli">; def SRAI : Shift_ri<0b01000, 0b101, "srai">; -def ADD : ALU_rr<0b0000000, 0b000, "add">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SUB : ALU_rr<0b0100000, 0b000, "sub">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; -def SLT : ALU_rr<0b0000000, 0b010, "slt">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def XOR : ALU_rr<0b0000000, 0b100, "xor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; -def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; -def OR : ALU_rr<0b0000000, 0b110, "or">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def AND : ALU_rr<0b0000000, 0b111, "and">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def ADD : ALU_rr<0b0000000, 0b000, "add", /*Commutable*/1>, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SUB : ALU_rr<0b0100000, 0b000, "sub">, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SLL : 
ALU_rr<0b0000000, 0b001, "sll">, + Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; +def SLT : ALU_rr<0b0000000, 0b010, "slt">, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def XOR : ALU_rr<0b0000000, 0b100, "xor", /*Commutable*/1>, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SRL : ALU_rr<0b0000000, 0b101, "srl">, + Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; +def SRA : ALU_rr<0b0100000, 0b101, "sra">, + Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; +def OR : ALU_rr<0b0000000, 0b110, "or", /*Commutable*/1>, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def AND : ALU_rr<0b0000000, 0b111, "and", /*Commutable*/1>, + Sched<[WriteIALU, ReadIALU, ReadIALU]>; let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs), @@ -642,7 +735,7 @@ def SLLIW : ShiftW_ri<0b0000000, 0b001, "slliw">; def SRLIW : ShiftW_ri<0b0000000, 0b101, "srliw">; def SRAIW : ShiftW_ri<0b0100000, 0b101, "sraiw">; -def ADDW : ALUW_rr<0b0000000, 0b000, "addw">, +def ADDW : ALUW_rr<0b0000000, 0b000, "addw", /*Commutable*/1>, Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; def SUBW : ALUW_rr<0b0100000, 0b000, "subw">, Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; @@ -684,11 +777,40 @@ def WFI : Priv<"wfi", 0b0001000>, Sched<[]> { let rs2 = 0b00101; } -let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in -def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs), - (ins GPR:$rs1, GPR:$rs2), - "sfence.vma", "$rs1, $rs2">, Sched<[]> { +def SFENCE_W_INVAL : Priv<"sfence.w.inval", 0b0001100>, Sched<[]> { let rd = 0; + let rs1 = 0; + let rs2 = 0; +} + +def SFENCE_INVAL_IR : Priv<"sfence.inval.ir", 0b0001100>, Sched<[]> { + let rd = 0; + let rs1 = 0; + let rs2 = 0b00001; +} + +def SFENCE_VMA : Priv_rr<"sfence.vma", 0b0001001>, Sched<[]>; +def SINVAL_VMA : Priv_rr<"sinval.vma", 0b0001011>, Sched<[]>; +def HFENCE_VVMA : Priv_rr<"hfence.vvma", 0b0010001>, Sched<[]>; +def HFENCE_GVMA : Priv_rr<"hfence.gvma", 0b0110001>, Sched<[]>; +def HINVAL_VVMA : Priv_rr<"hinval.vvma", 0b0010011>, Sched<[]>; +def HINVAL_GVMA : Priv_rr<"hinval.gvma", 0b0110011>, Sched<[]>; + +def HLV_B : HLoad_r<0b0110000, 0b00000, "hlv.b">, Sched<[]>; +def HLV_BU : HLoad_r<0b0110000, 0b00001, "hlv.bu">, Sched<[]>; +def HLV_H : HLoad_r<0b0110010, 0b00000, "hlv.h">, Sched<[]>; +def HLV_HU : HLoad_r<0b0110010, 0b00001, "hlv.hu">, Sched<[]>; +def HLVX_HU : HLoad_r<0b0110010, 0b00011, "hlvx.hu">, Sched<[]>; +def HLV_W : HLoad_r<0b0110100, 0b00000, "hlv.w">, Sched<[]>; +def HLVX_WU : HLoad_r<0b0110100, 0b00011, "hlvx.wu">, Sched<[]>; +def HSV_B : HStore_rr<0b0110001, "hsv.b">, Sched<[]>; +def HSV_H : HStore_rr<0b0110011, "hsv.h">, Sched<[]>; +def HSV_W : HStore_rr<0b0110101, "hsv.w">, Sched<[]>; + +let Predicates = [IsRV64] in { +def HLV_WU : HLoad_r<0b0110100, 0b00001, "hlv.wu">, Sched<[]>; +def HLV_D : HLoad_r<0b0110110, 0b00000, "hlv.d">, Sched<[]>; +def HSV_D : HStore_rr<0b0110111, "hsv.d">, Sched<[]>; } //===----------------------------------------------------------------------===// @@ -799,6 +921,9 @@ def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset def : InstAlias<"fence", (FENCE 0xF, 0xF)>; // 0xF == iorw +let Predicates = [HasStdExtZihintpause] in +def : InstAlias<"pause", (FENCE 0x1, 0x0)>; // 0x1 == w + def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, INSTRET.Encoding, X0)>; def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, CYCLE.Encoding, X0)>; def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, 
TIME.Encoding, X0)>; @@ -831,6 +956,12 @@ def : InstAlias<"csrrc $rd, $csr, $imm", (CSRRCI GPR:$rd, csr_sysreg:$csr, uimm5 def : InstAlias<"sfence.vma", (SFENCE_VMA X0, X0)>; def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>; +def : InstAlias<"hfence.gvma", (HFENCE_GVMA X0, X0)>; +def : InstAlias<"hfence.gvma $rs", (HFENCE_GVMA GPR:$rs, X0)>; + +def : InstAlias<"hfence.vvma", (HFENCE_VVMA X0, X0)>; +def : InstAlias<"hfence.vvma $rs", (HFENCE_VVMA GPR:$rs, X0)>; + let EmitPriority = 0 in { def : InstAlias<"lb $rd, (${rs1})", (LB GPR:$rd, GPR:$rs1, 0)>; @@ -1006,9 +1137,6 @@ class PatGprUimmLog2XLen /// Predicates -def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{ - return isOrEquivalentToAdd(N); -}]>; def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{ return cast(N->getOperand(1))->getVT().bitsLE(MVT::i32); }]>; @@ -1018,13 +1146,14 @@ def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{ }]>; def zexti32 : ComplexPattern; -def add_oneuse : PatFrag<(ops node:$A, node:$B), (add node:$A, node:$B), [{ +class binop_oneuse + : PatFrag<(ops node:$A, node:$B), + (operator node:$A, node:$B), [{ return N->hasOneUse(); }]>; -def mul_oneuse : PatFrag<(ops node:$A, node:$B), (mul node:$A, node:$B), [{ - return N->hasOneUse(); -}]>; +def add_oneuse : binop_oneuse; +def mul_oneuse : binop_oneuse; def mul_const_oneuse : PatFrag<(ops node:$A, node:$B), (mul node:$A, node:$B), [{ @@ -1034,22 +1163,16 @@ def mul_const_oneuse : PatFrag<(ops node:$A, node:$B), return false; }]>; -def sext_oneuse : PatFrag<(ops node:$A), (sext node:$A), [{ - return N->hasOneUse(); -}]>; - -def zext_oneuse : PatFrag<(ops node:$A), (zext node:$A), [{ +class unop_oneuse + : PatFrag<(ops node:$A), + (operator node:$A), [{ return N->hasOneUse(); }]>; -def anyext_oneuse : PatFrag<(ops node:$A), (anyext node:$A), [{ - return N->hasOneUse(); -}]>; - -def fpext_oneuse : PatFrag<(ops node:$A), - (any_fpextend node:$A), [{ - return N->hasOneUse(); -}]>; +def sext_oneuse : unop_oneuse; +def zext_oneuse : unop_oneuse; +def anyext_oneuse : unop_oneuse; +def fpext_oneuse : unop_oneuse; /// Simple arithmetic operations @@ -1066,7 +1189,9 @@ def : PatGprUimmLog2XLen; def : PatGprUimmLog2XLen; def : PatGprUimmLog2XLen; -// AND with trailing ones mask exceeding simm12. +// AND with leading/trailing ones mask exceeding simm32/simm12. +def : Pat<(i64 (and GPR:$rs, LeadingOnesMask:$mask)), + (SLLI (SRLI $rs, LeadingOnesMask:$mask), LeadingOnesMask:$mask)>; def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)), (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>; @@ -1099,10 +1224,32 @@ def PseudoAddTPRel : Pseudo<(outs GPR:$rd), /// FrameIndex calculations -def : Pat<(add (XLenVT AddrFI:$Rs), simm12:$imm12), - (ADDI (XLenVT AddrFI:$Rs), simm12:$imm12)>; -def : Pat<(IsOrAdd (XLenVT AddrFI:$Rs), simm12:$imm12), - (ADDI (XLenVT AddrFI:$Rs), simm12:$imm12)>; +def : Pat<(FrameAddrRegImm GPR:$rs1, simm12:$imm12), + (ADDI GPR:$rs1, simm12:$imm12)>; + +/// HI and ADD_LO address nodes. 
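The add_oneuse/mul_oneuse and sext/zext/anyext/fpext fragments above are collapsed into the parameterized binop_oneuse/unop_oneuse classes, so the hasOneUse predicate is written once instead of per node. A minimal self-contained TableGen sketch of that refactor, checkable with llvm-tblgen (SDNodeStub is an illustrative stand-in for the real SDNode records from TargetSelectionDAG.td):

    // oneuse.td -- check with: llvm-tblgen oneuse.td
    class SDNodeStub<string opcode> { string Opcode = opcode; }
    def add : SDNodeStub<"add">;
    def mul : SDNodeStub<"mul">;

    // One class parameterized over the operator replaces a copy-pasted
    // fragment per node; the one-use predicate body is shared verbatim.
    class binop_oneuse<SDNodeStub op> {
      string Opcode = op.Opcode;
      code Predicate = [{ return N->hasOneUse(); }];
    }
    def add_oneuse : binop_oneuse<add>;
    def mul_oneuse : binop_oneuse<mul>;

For the riscv_hi/riscv_add_lo patterns that follow, the LUI/ADDI pair relies on the usual RISC-V split of an absolute address A into a 20-bit high part and a sign-extended 12-bit low part; because the low part can be negative, the high part must pre-compensate for the borrow. The decomposition (applied when the %hi/%lo relocations are resolved, not by these patterns themselves) is

    \mathrm{lo} = \operatorname{sext}_{12}(A \bmod 2^{12}), \qquad
    \mathrm{hi} = \Bigl\lfloor \frac{A + 2^{11}}{2^{12}} \Bigr\rfloor, \qquad
    (\mathrm{hi} \ll 12) + \mathrm{lo} = A .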
+ +def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>; +def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>; +def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>; +def : Pat<(riscv_hi tconstpool:$in), (LUI tconstpool:$in)>; + +def : Pat<(riscv_add_lo GPR:$hi, tglobaladdr:$lo), + (ADDI GPR:$hi, tglobaladdr:$lo)>; +def : Pat<(riscv_add_lo GPR:$hi, tblockaddress:$lo), + (ADDI GPR:$hi, tblockaddress:$lo)>; +def : Pat<(riscv_add_lo GPR:$hi, tjumptable:$lo), + (ADDI GPR:$hi, tjumptable:$lo)>; +def : Pat<(riscv_add_lo GPR:$hi, tconstpool:$lo), + (ADDI GPR:$hi, tconstpool:$lo)>; + +/// TLS address nodes. + +def : Pat<(riscv_hi tglobaltlsaddr:$in), (LUI tglobaltlsaddr:$in)>; +def : Pat<(riscv_add_tprel GPR:$rs1, GPR:$rs2, tglobaltlsaddr:$src), + (PseudoAddTPRel GPR:$rs1, GPR:$rs2, tglobaltlsaddr:$src)>; +def : Pat<(riscv_add_lo GPR:$src, tglobaltlsaddr:$lo), + (ADDI GPR:$src, tglobaltlsaddr:$lo)>; /// Setcc @@ -1127,6 +1274,10 @@ def : Pat<(setule GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs2, GPR:$rs1), 1)>; def : Pat<(setgt GPR:$rs1, GPR:$rs2), (SLT GPR:$rs2, GPR:$rs1)>; def : Pat<(setge GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>; def : Pat<(setle GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs2, GPR:$rs1), 1)>; +def : Pat<(setgt GPR:$rs1, simm12_minus1_nonzero:$imm), + (XORI (SLTI GPR:$rs1, (ImmPlus1 simm12_minus1_nonzero:$imm)), 1)>; +def : Pat<(setugt GPR:$rs1, simm12_minus1_nonzero:$imm), + (XORI (SLTIU GPR:$rs1, (ImmPlus1 simm12_minus1_nonzero:$imm)), 1)>; def IntCCtoRISCVCC : SDNodeXForm(N->getOperand(2))->get(); @@ -1185,7 +1336,8 @@ def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)), // Define isCodeGenOnly = 0 to support parsing assembly "call" instruction. let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in -def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> { +def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []>, + Sched<[WriteIALU, WriteJalr, ReadJalr]> { let AsmString = "call\t$rd, $func"; } @@ -1196,7 +1348,8 @@ def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> { // Define AsmString to print "call" when compile with -S flag. // Define isCodeGenOnly = 0 to support parsing assembly "call" instruction. let isCall = 1, Defs = [X1], isCodeGenOnly = 0, Size = 8 in -def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> { +def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []>, + Sched<[WriteIALU, WriteJalr, ReadJalr]> { let AsmString = "call\t$func"; } @@ -1221,7 +1374,8 @@ def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>, // Define AsmString to print "tail" when compile with -S flag. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2], Size = 8, isCodeGenOnly = 0 in -def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> { +def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []>, + Sched<[WriteIALU, WriteJalr, ReadJalr]> { let AsmString = "tail\t$dst"; } @@ -1231,13 +1385,14 @@ def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>; def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)), - (PseudoTAIL texternalsym:$dst)>; + (PseudoTAIL tglobaladdr:$dst)>; def : Pat<(riscv_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1, Size = 8, isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in -def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> { +def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []>, + Sched<[WriteIALU, WriteJalr, ReadJalr]> { let AsmString = "jump\t$target, $rd"; } @@ -1246,21 +1401,33 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 0, def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "lla", "$dst, $src">; +def : Pat<(riscv_lla tglobaladdr:$in), (PseudoLLA tglobaladdr:$in)>; +def : Pat<(riscv_lla tblockaddress:$in), (PseudoLLA tblockaddress:$in)>; +def : Pat<(riscv_lla tjumptable:$in), (PseudoLLA tjumptable:$in)>; +def : Pat<(riscv_lla tconstpool:$in), (PseudoLLA tconstpool:$in)>; + let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la", "$dst, $src">; +def : Pat<(riscv_la tglobaladdr:$in), (PseudoLA tglobaladdr:$in)>; + let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.ie", "$dst, $src">; -let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, +def : Pat<(riscv_la_tls_ie tglobaltlsaddr:$in), + (PseudoLA_TLS_IE tglobaltlsaddr:$in)>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.gd", "$dst, $src">; +def : Pat<(riscv_la_tls_gd tglobaltlsaddr:$in), + (PseudoLA_TLS_GD tglobaltlsaddr:$in)>; /// Sign/Zero Extends @@ -1283,11 +1450,8 @@ def PseudoZEXT_W : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "zext.w", "$rd, $rs /// Loads multiclass LdPat { - def : Pat<(vt (LoadOp BaseAddr:$rs1)), (Inst BaseAddr:$rs1, 0)>; - def : Pat<(vt (LoadOp (add BaseAddr:$rs1, simm12:$imm12))), - (Inst BaseAddr:$rs1, simm12:$imm12)>; - def : Pat<(vt (LoadOp (IsOrAdd AddrFI:$rs1, simm12:$imm12))), - (Inst AddrFI:$rs1, simm12:$imm12)>; + def : Pat<(vt (LoadOp (AddrRegImm GPR:$rs1, simm12:$imm12))), + (Inst GPR:$rs1, simm12:$imm12)>; } defm : LdPat; @@ -1302,12 +1466,8 @@ defm : LdPat; multiclass StPat { - def : Pat<(StoreOp (vt StTy:$rs2), BaseAddr:$rs1), - (Inst StTy:$rs2, BaseAddr:$rs1, 0)>; - def : Pat<(StoreOp (vt StTy:$rs2), (add BaseAddr:$rs1, simm12:$imm12)), - (Inst StTy:$rs2, BaseAddr:$rs1, simm12:$imm12)>; - def : Pat<(StoreOp (vt StTy:$rs2), (IsOrAdd AddrFI:$rs1, simm12:$imm12)), - (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>; + def : Pat<(StoreOp (vt StTy:$rs2), (AddrRegImm GPR:$rs1, simm12:$imm12)), + (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>; } defm : StPat; @@ -1415,7 +1575,7 @@ def : Pat<(i64 (shl (and 
GPR:$rs1, 0xffffffff), uimm5:$shamt)), // if only the lower 32 bits of their result is used. class binop_allwusers : PatFrag<(ops node:$lhs, node:$rhs), - (operator node:$lhs, node:$rhs), [{ + (i64 (operator node:$lhs, node:$rhs)), [{ return hasAllWUsers(Node); }]>; @@ -1496,14 +1656,14 @@ def : Pat<(debugtrap), (EBREAK)>; /// Simple optimization def : Pat<(add GPR:$rs1, (AddiPair:$rs2)), - (ADDI (ADDI GPR:$rs1, (AddiPairImmB AddiPair:$rs2)), - (AddiPairImmA GPR:$rs2))>; + (ADDI (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (AddiPairImmSmall GPR:$rs2))>; let Predicates = [IsRV64] in { // Select W instructions if only the lower 32-bits of the result are used. def : Pat<(binop_allwusers GPR:$rs1, (AddiPair:$rs2)), - (ADDIW (ADDIW GPR:$rs1, (AddiPairImmB AddiPair:$rs2)), - (AddiPairImmA AddiPair:$rs2))>; + (ADDIW (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (AddiPairImmSmall AddiPair:$rs2))>; } //===----------------------------------------------------------------------===// @@ -1519,3 +1679,4 @@ include "RISCVInstrInfoZb.td" include "RISCVInstrInfoZk.td" include "RISCVInstrInfoV.td" include "RISCVInstrInfoZfh.td" +include "RISCVInstrInfoZicbo.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 7d23dafb0346..dd4b174d7e62 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -11,24 +11,6 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Operand and SDNode transformation definitions. -//===----------------------------------------------------------------------===// - -// A parse method for (${gpr}) or 0(${gpr}), where the 0 is be silently ignored. -// Used for GNU as Compatibility. 
-def AtomicMemOpOperand : AsmOperandClass { - let Name = "AtomicMemOpOperand"; - let RenderMethod = "addRegOperands"; - let PredicateMethod = "isGPR"; - let ParserMethod = "parseAtomicMemOp"; -} - -def GPRMemAtomic : RegisterOperand { - let ParserMatchClass = AtomicMemOpOperand; - let PrintMethod = "printAtomicMemOp"; -} - //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -36,7 +18,7 @@ def GPRMemAtomic : RegisterOperand { let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class LR_r funct3, string opcodestr> : RVInstRAtomic<0b00010, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemAtomic:$rs1), + (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs1"> { let rs2 = 0; } @@ -51,7 +33,7 @@ multiclass LR_r_aq_rl funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic; multiclass AMO_rr_aq_rl funct5, bits<3> funct3, string opcodestr> { @@ -63,12 +45,8 @@ multiclass AMO_rr_aq_rl funct5, bits<3> funct3, string opcodestr> { multiclass AtomicStPat { - def : Pat<(StoreOp BaseAddr:$rs1, (vt StTy:$rs2)), - (Inst StTy:$rs2, BaseAddr:$rs1, 0)>; - def : Pat<(StoreOp (add BaseAddr:$rs1, simm12:$imm12), (vt StTy:$rs2)), - (Inst StTy:$rs2, BaseAddr:$rs1, simm12:$imm12)>; - def : Pat<(StoreOp (IsOrAdd AddrFI:$rs1, simm12:$imm12), (vt StTy:$rs2)), - (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>; + def : Pat<(StoreOp (AddrRegImm GPR:$rs1, simm12:$imm12), (vt StTy:$rs2)), + (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 2837b92da81f..6fb9e36d7666 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -25,6 +25,69 @@ def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +// Zdinx + +def GPRPF64AsFPR : AsmOperandClass { + let Name = "GPRPF64AsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def GPRF64AsFPR : AsmOperandClass { + let Name = "GPRF64AsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def FPR64INX : RegisterOperand { + let ParserMatchClass = GPRF64AsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +def FPR64IN32X : RegisterOperand { + let ParserMatchClass = GPRPF64AsFPR; +} + +def DExt : ExtInfo<0, [HasStdExtD]>; +def D64Ext : ExtInfo<0, [HasStdExtD, IsRV64]>; +def ZdinxExt : ExtInfo<1, [HasStdExtZdinx, IsRV64]>; +def Zdinx32Ext : ExtInfo<2, [HasStdExtZdinx, IsRV32]>; + +def D : ExtInfo_r; +def D_INX : ExtInfo_r; +def D_IN32X : ExtInfo_r; + +def DD : ExtInfo_rr; +def DD_INX : ExtInfo_rr; +def DD_IN32X : ExtInfo_rr; +def DF : ExtInfo_rr; +def DF_INX : ExtInfo_rr; +def DF_IN32X : ExtInfo_rr; +def DX : ExtInfo_rr; +def DX_INX : ExtInfo_rr; +def DX_IN32X : ExtInfo_rr; +def DX_64 : ExtInfo_rr; +def FD : ExtInfo_rr; +def FD_INX : ExtInfo_rr; +def FD_IN32X : ExtInfo_rr; +def XD : ExtInfo_rr; +def XD_INX : ExtInfo_rr; +def XD_IN32X : ExtInfo_rr; +def XD_64 : ExtInfo_rr; + +defvar DINX = [D, D_INX, D_IN32X]; +defvar DDINX = [DD, DD_INX, DD_IN32X]; +defvar DXINX = [DX, DX_INX, DX_IN32X]; +defvar DFINX = [DF, DF_INX, DF_IN32X]; +defvar FDINX = [FD, FD_INX, FD_IN32X]; +defvar XDINX = [XD, XD_INX, XD_IN32X]; +defvar DXIN64X = [DX_64, DX_INX]; +defvar XDIN64X = [XD_64, XD_INX]; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -36,106 +99,104 @@ def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // reflecting the order these fields are specified in the instruction // encoding. 
def FSD : FPStore_r<0b011, "fsd", FPR64, WriteFST64>; +} // Predicates = [HasStdExtD] let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { -def FMADD_D : FPFMA_rrr_frm; -def FMSUB_D : FPFMA_rrr_frm; -def FNMSUB_D : FPFMA_rrr_frm; -def FNMADD_D : FPFMA_rrr_frm; +defm FMADD_D : FPFMA_rrr_frm_m; +defm FMSUB_D : FPFMA_rrr_frm_m; +defm FNMSUB_D : FPFMA_rrr_frm_m; +defm FNMADD_D : FPFMA_rrr_frm_m; } -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_D : FPALU_rr_frm<0b0000001, "fadd.d", FPR64>, - Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def FSUB_D : FPALU_rr_frm<0b0000101, "fsub.d", FPR64>, - Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def FMUL_D : FPALU_rr_frm<0b0001001, "fmul.d", FPR64>, - Sched<[WriteFMul64, ReadFMul64, ReadFMul64]>; -def FDIV_D : FPALU_rr_frm<0b0001101, "fdiv.d", FPR64>, - Sched<[WriteFDiv64, ReadFDiv64, ReadFDiv64]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_D : FPUnaryOp_r_frm<0b0101101, 0b00000, FPR64, FPR64, "fsqrt.d">, - Sched<[WriteFSqrt64, ReadFSqrt64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU64, ReadFALU64, ReadFALU64] in { +defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX, /*Commutable*/1>; +defm FSUB_D : FPALU_rr_frm_m<0b0000101, "fsub.d", DINX>; +} +let SchedRW = [WriteFMul64, ReadFMul64, ReadFMul64] in +defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX, /*Commutable*/1>; + +let SchedRW = [WriteFDiv64, ReadFDiv64, ReadFDiv64] in +defm FDIV_D : FPALU_rr_frm_m<0b0001101, "fdiv.d", DINX>; + +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_D : FPUnaryOp_r_frm_m<0b0101101, 0b00000, DDINX, "fsqrt.d">, + Sched<[WriteFSqrt64, ReadFSqrt64]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64], mayRaiseFPException = 0 in { -def FSGNJ_D : FPALU_rr<0b0010001, 0b000, "fsgnj.d", FPR64>; -def FSGNJN_D : FPALU_rr<0b0010001, 0b001, "fsgnjn.d", FPR64>; -def FSGNJX_D : FPALU_rr<0b0010001, 0b010, "fsgnjx.d", FPR64>; +defm FSGNJ_D : FPALU_rr_m<0b0010001, 0b000, "fsgnj.d", DINX>; +defm FSGNJN_D : FPALU_rr_m<0b0010001, 0b001, "fsgnjn.d", DINX>; +defm FSGNJX_D : FPALU_rr_m<0b0010001, 0b010, "fsgnjx.d", DINX>; } let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { -def FMIN_D : FPALU_rr<0b0010101, 0b000, "fmin.d", FPR64>; -def FMAX_D : FPALU_rr<0b0010101, 0b001, "fmax.d", FPR64>; +defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX, /*Commutable*/1>; +defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX, /*Commutable*/1>; } -def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, 0b00001, FPR32, FPR64, "fcvt.s.d">, - Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_S_D : FPUnaryOp_r_frm_m<0b0100000, 0b00001, FDINX, "fcvt.s.d">, + Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b00000, 0b000, FPR64, FPR32, "fcvt.d.s">, - Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; +defm FCVT_D_S : FPUnaryOp_r_m<0b0100001, 0b00000, 0b000, DFINX, "fcvt.d.s">, + Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { -def FEQ_D : FPCmp_rr<0b1010001, 0b010, "feq.d", FPR64>; -def FLT_D : FPCmp_rr<0b1010001, 0b001, "flt.d", FPR64>; -def FLE_D : 
FPCmp_rr<0b1010001, 0b000, "fle.d", FPR64>; +defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX, /*Commutable*/1>; +defm FLT_D : FPCmp_rr_m<0b1010001, 0b001, "flt.d", DINX>; +defm FLE_D : FPCmp_rr_m<0b1010001, 0b000, "fle.d", DINX>; } -let mayRaiseFPException = 0 in -def FCLASS_D : FPUnaryOp_r<0b1110001, 0b00000, 0b001, GPR, FPR64, "fclass.d">, - Sched<[WriteFClass64, ReadFClass64]>; +defm FCLASS_D : FPUnaryOp_r_m<0b1110001, 0b00000, 0b001, XDINX, "fclass.d">, + Sched<[WriteFClass64, ReadFClass64]>; -def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, 0b00000, GPR, FPR64, "fcvt.w.d">, +defm FCVT_W_D : FPUnaryOp_r_frm_m<0b1100001, 0b00000, XDINX, "fcvt.w.d">, Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, 0b00001, GPR, FPR64, "fcvt.wu.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_WU_D : FPUnaryOp_r_frm_m<0b1100001, 0b00001, XDINX, "fcvt.wu.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b00000, 0b000, FPR64, GPR, "fcvt.d.w">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; - -def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b00001, 0b000, FPR64, GPR, "fcvt.d.wu">, +defm FCVT_D_W : FPUnaryOp_r_m<0b1101001, 0b00000, 0b000, DXINX, "fcvt.d.w">, Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -} // Predicates = [HasStdExtD] -let Predicates = [HasStdExtD, IsRV64] in { -def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, 0b00010, GPR, FPR64, "fcvt.l.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_D_WU : FPUnaryOp_r_m<0b1101001, 0b00001, 0b000, DXINX, "fcvt.d.wu">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, 0b00011, GPR, FPR64, "fcvt.lu.d">, +defm FCVT_L_D : FPUnaryOp_r_frm_m<0b1100001, 0b00010, XDIN64X, "fcvt.l.d">, Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -let mayRaiseFPException = 0 in +defm FCVT_LU_D : FPUnaryOp_r_frm_m<0b1100001, 0b00011, XDIN64X, "fcvt.lu.d">, + Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; + +let Predicates = [HasStdExtD, IsRV64], mayRaiseFPException = 0 in def FMV_X_D : FPUnaryOp_r<0b1110001, 0b00000, 0b000, GPR, FPR64, "fmv.x.d">, Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]>; -def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, 0b00010, FPR64, GPR, "fcvt.d.l">, - Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, 0b00011, FPR64, GPR, "fcvt.d.lu">, +defm FCVT_D_L : FPUnaryOp_r_frm_m<0b1101001, 0b00010, DXIN64X, "fcvt.d.l">, Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -let mayRaiseFPException = 0 in +defm FCVT_D_LU : FPUnaryOp_r_frm_m<0b1101001, 0b00011, DXIN64X, "fcvt.d.lu">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; +defm : FPUnaryOpDynFrmAlias_m; + +let Predicates = [HasStdExtD, IsRV64], mayRaiseFPException = 0 in def FMV_D_X : FPUnaryOp_r<0b1111001, 0b00000, 0b000, FPR64, GPR, "fmv.d.x">, Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]>; -} // Predicates = [HasStdExtD, IsRV64] //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -164,16 +225,30 @@ def PseudoQuietFLT_D : PseudoQuietFCMP; } } // Predicates = [HasStdExtD] +let Predicates = [HasStdExtZdinx, 
IsRV64] in { +def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D_INX FPR64INX:$rd, FPR64INX:$rs, FPR64INX:$rs)>; +def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D_INX FPR64INX:$rd, FPR64INX:$rs, FPR64INX:$rs)>; + +def : InstAlias<"fgt.d $rd, $rs, $rt", + (FLT_D_INX GPR:$rd, FPR64INX:$rt, FPR64INX:$rs), 0>; +def : InstAlias<"fge.d $rd, $rs, $rt", + (FLE_D_INX GPR:$rd, FPR64INX:$rt, FPR64INX:$rs), 0>; +} // Predicates = [HasStdExtZdinx, IsRV64] + +let Predicates = [HasStdExtZdinx, IsRV32] in { +def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D_IN32X FPR64IN32X:$rd, FPR64IN32X:$rs, FPR64IN32X:$rs)>; +def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D_IN32X FPR64IN32X:$rd, FPR64IN32X:$rs, FPR64IN32X:$rs)>; + +def : InstAlias<"fgt.d $rd, $rs, $rt", + (FLT_D_IN32X GPR:$rd, FPR64IN32X:$rt, FPR64IN32X:$rs), 0>; +def : InstAlias<"fge.d $rd, $rs, $rt", + (FLE_D_IN32X GPR:$rd, FPR64IN32X:$rt, FPR64IN32X:$rs), 0>; +} // Predicates = [HasStdExtZdinx, IsRV32] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// -class PatFpr64Fpr64 - : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2)>; - -class PatFpr64Fpr64DynFrm - : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2, 0b111)>; - let Predicates = [HasStdExtD] in { /// Float conversion operations @@ -187,17 +262,17 @@ def : Pat<(any_fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>; /// Float arithmetic operations -def : PatFpr64Fpr64DynFrm; -def : PatFpr64Fpr64DynFrm; -def : PatFpr64Fpr64DynFrm; -def : PatFpr64Fpr64DynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; def : Pat<(any_fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>; def : Pat<(fneg FPR64:$rs1), (FSGNJN_D $rs1, $rs1)>; def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>; -def : PatFpr64Fpr64; +def : PatFprFpr; def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>; def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2))>; def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, @@ -219,11 +294,15 @@ def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; +// fnmadd: -(rs1 * rs2 + rs3) (the nsz flag on the FMA) +def : Pat<(fneg (any_fma_nsz FPR64:$rs1, FPR64:$rs2, FPR64:$rs3)), + (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; + // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches // LLVM's fminnum and fmaxnum. // . -def : PatFpr64Fpr64; -def : PatFpr64Fpr64; +def : PatFprFpr; +def : PatFprFpr; /// Setcc // FIXME: SETEQ/SETLT/SETLE imply nonans, can we pick better instructions for diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index a8ac06ba8da3..a71d5b4737c3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -53,10 +53,81 @@ def riscv_any_fcvt_wu_rv64 : PatFrags<(ops node:$src, node:$frm), [(riscv_strict_fcvt_wu_rv64 node:$src, node:$frm), (riscv_fcvt_wu_rv64 node:$src, node:$frm)]>; +def any_fma_nsz : PatFrag<(ops node:$rs1, node:$rs2, node:$rs3), + (any_fma node:$rs1, node:$rs2, node:$rs3), [{ + return N->getFlags().hasNoSignedZeros(); +}]>; //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// +// Zfinx + +def GPRAsFPR : AsmOperandClass { + let Name = "GPRAsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def FPR32INX : RegisterOperand { + let ParserMatchClass = GPRAsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +// inx = 0 : f, d, zfh, zfhmin +// = 1 : zfinx, zdinx, zhinx, zhinxmin +// = 2 : zdinx_rv32 +class ExtInfo inx, list pres> { + string Suffix = !cond(!eq(inx, 0): "", + !eq(inx, 1): "_INX", + !eq(inx, 2): "_IN32X"); + list Predicates = pres; + string Space = !cond(!eq(inx, 0): "", + !eq(inx, 1): "RVZfinx", + !eq(inx, 2): "RV32Zdinx"); +} + +class ExtInfo_r { + string Suffix = ext.Suffix; + list Predicates = ext.Predicates; + string Space = ext.Space; + DAGOperand Reg = reg; +} + +class ExtInfo_rr { + string Suffix = ext.Suffix; + list Predicates = ext.Predicates; + string Space = ext.Space; + DAGOperand RdTy = rdty; + DAGOperand Rs1Ty = rs1ty; +} + +def FExt : ExtInfo<0, [HasStdExtF]>; +def F64Ext : ExtInfo<0, [HasStdExtF, IsRV64]>; +def ZfinxExt : ExtInfo<1, [HasStdExtZfinx]>; +def Zfinx64Ext : ExtInfo<1, [HasStdExtZfinx, IsRV64]>; + +def F : ExtInfo_r; +def F_INX : ExtInfo_r; + +def FF : ExtInfo_rr; +def FF_INX : ExtInfo_rr; +def FX : ExtInfo_rr; +def FX_INX : ExtInfo_rr; +def FX_64 : ExtInfo_rr; +def FX_INX_64 : ExtInfo_rr; +def XF : ExtInfo_rr; +def XF_64 : ExtInfo_rr; +def XF_INX : ExtInfo_rr; +def XF_INX_64 : ExtInfo_rr; + +defvar FINX = [F, F_INX]; +defvar FFINX = [FF, FF_INX]; +defvar FXINX = [FX, FX_INX]; +defvar XFINX = [XF, XF_INX]; +defvar XFIN64X = [XF_64, XF_INX_64]; +defvar FXIN64X = [FX_64, FX_INX_64]; + // Floating-point rounding mode def FRMArg : AsmOperandClass { @@ -92,64 +163,131 @@ class FPStore_r funct3, string opcodestr, RegisterClass rty, Sched<[sw, ReadStoreData, ReadFMemBase]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, - UseNamedOperandTable = 1, hasPostISelHook = 1 in + UseNamedOperandTable = 1, hasPostISelHook = 1, isCommutable = 1 in class FPFMA_rrr_frm funct2, string opcodestr, - RegisterClass rty> + DAGOperand rty> : RVInstR4Frm; +multiclass FPFMA_rrr_frm_m funct2, + string opcodestr, list Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPFMA_rrr_frm; +} + class FPFMADynFrmAlias + DAGOperand rty> : InstAlias; +multiclass FPFMADynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPFMADynFrmAlias(Inst#Ext.Suffix), OpcodeStr, + Ext.Reg>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPALU_rr funct7, bits<3> funct3, string opcodestr, - RegisterClass rty> + DAGOperand rty, bit Commutable> : RVInstR; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} +multiclass FPALU_rr_m funct7, bits<3> funct3, string opcodestr, + list Exts, bit Commutable = 0> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPALU_rr; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPALU_rr_frm funct7, string opcodestr, RegisterClass rty> +class FPALU_rr_frm funct7, string opcodestr, DAGOperand rty, + bit Commutable> : RVInstRFrm; + "$rd, $rs1, $rs2, $frm"> { + let isCommutable = Commutable; +} +multiclass FPALU_rr_frm_m funct7, string opcodestr, + list Exts, bit Commutable 
= 0> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPALU_rr_frm; +} class FPALUDynFrmAlias + DAGOperand rty> : InstAlias; +multiclass FPALUDynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPALUDynFrmAlias(Inst#Ext.Suffix), OpcodeStr, + Ext.Reg>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPUnaryOp_r funct7, bits<5> rs2val, bits<3> funct3, - RegisterClass rdty, RegisterClass rs1ty, string opcodestr> + DAGOperand rdty, DAGOperand rs1ty, string opcodestr> : RVInstR { let rs2 = rs2val; } +multiclass FPUnaryOp_r_m funct7, bits<5> rs2val, bits<3> funct3, + list Exts, string opcodestr> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPUnaryOp_r; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPUnaryOp_r_frm funct7, bits<5> rs2val, RegisterClass rdty, - RegisterClass rs1ty, string opcodestr> +class FPUnaryOp_r_frm funct7, bits<5> rs2val, DAGOperand rdty, + DAGOperand rs1ty, string opcodestr> : RVInstRFrm { let rs2 = rs2val; } +multiclass FPUnaryOp_r_frm_m funct7, bits<5> rs2val, + list Exts, string opcodestr> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPUnaryOp_r_frm; +} class FPUnaryOpDynFrmAlias + DAGOperand rdty, DAGOperand rs1ty> : InstAlias; +multiclass FPUnaryOpDynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPUnaryOpDynFrmAlias(Inst#Ext.Suffix), + OpcodeStr, Ext.RdTy, Ext.Rs1Ty>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPCmp_rr funct7, bits<3> funct3, string opcodestr, - RegisterClass rty> + DAGOperand rty, bit Commutable> : RVInstR; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} +multiclass FPCmp_rr_m funct7, bits<3> funct3, string opcodestr, + list Exts, bit Commutable = 0> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPCmp_rr; +} //===----------------------------------------------------------------------===// // Instructions @@ -162,101 +300,100 @@ def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // reflecting the order these fields are specified in the instruction // encoding. 
def FSW : FPStore_r<0b010, "fsw", FPR32, WriteFST32>; +} // Predicates = [HasStdExtF] let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { -def FMADD_S : FPFMA_rrr_frm; -def FMSUB_S : FPFMA_rrr_frm; -def FNMSUB_S : FPFMA_rrr_frm; -def FNMADD_S : FPFMA_rrr_frm; +defm FMADD_S : FPFMA_rrr_frm_m; +defm FMSUB_S : FPFMA_rrr_frm_m; +defm FNMSUB_S : FPFMA_rrr_frm_m; +defm FNMADD_S : FPFMA_rrr_frm_m; +} + +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU32, ReadFALU32, ReadFALU32] in { +defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX, /*Commutable*/1>; +defm FSUB_S : FPALU_rr_frm_m<0b0000100, "fsub.s", FINX>; } +let SchedRW = [WriteFMul32, ReadFMul32, ReadFMul32] in +defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX, /*Commutable*/1>; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_S : FPALU_rr_frm<0b0000000, "fadd.s", FPR32>, - Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def FSUB_S : FPALU_rr_frm<0b0000100, "fsub.s", FPR32>, - Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def FMUL_S : FPALU_rr_frm<0b0001000, "fmul.s", FPR32>, - Sched<[WriteFMul32, ReadFMul32, ReadFMul32]>; -def FDIV_S : FPALU_rr_frm<0b0001100, "fdiv.s", FPR32>, - Sched<[WriteFDiv32, ReadFDiv32, ReadFDiv32]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_S : FPUnaryOp_r_frm<0b0101100, 0b00000, FPR32, FPR32, "fsqrt.s">, - Sched<[WriteFSqrt32, ReadFSqrt32]>; -def : FPUnaryOpDynFrmAlias; +let SchedRW = [WriteFDiv32, ReadFDiv32, ReadFDiv32] in +defm FDIV_S : FPALU_rr_frm_m<0b0001100, "fdiv.s", FINX>; + +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_S : FPUnaryOp_r_frm_m<0b0101100, 0b00000, FFINX, "fsqrt.s">, + Sched<[WriteFSqrt32, ReadFSqrt32]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32], mayRaiseFPException = 0 in { -def FSGNJ_S : FPALU_rr<0b0010000, 0b000, "fsgnj.s", FPR32>; -def FSGNJN_S : FPALU_rr<0b0010000, 0b001, "fsgnjn.s", FPR32>; -def FSGNJX_S : FPALU_rr<0b0010000, 0b010, "fsgnjx.s", FPR32>; +defm FSGNJ_S : FPALU_rr_m<0b0010000, 0b000, "fsgnj.s", FINX>; +defm FSGNJN_S : FPALU_rr_m<0b0010000, 0b001, "fsgnjn.s", FINX>; +defm FSGNJX_S : FPALU_rr_m<0b0010000, 0b010, "fsgnjx.s", FINX>; } let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { -def FMIN_S : FPALU_rr<0b0010100, 0b000, "fmin.s", FPR32>; -def FMAX_S : FPALU_rr<0b0010100, 0b001, "fmax.s", FPR32>; +defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX, /*Commutable*/1>; +defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX, /*Commutable*/1>; } -def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, 0b00000, GPR, FPR32, "fcvt.w.s">, - Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, 0b00001, GPR, FPR32, "fcvt.wu.s">, +defm FCVT_W_S : FPUnaryOp_r_frm_m<0b1100000, 0b00000, XFINX, "fcvt.w.s">, Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_WU_S : FPUnaryOp_r_frm_m<0b1100000, 0b00001, XFINX, "fcvt.wu.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; let mayRaiseFPException = 0 in def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; let SchedRW = 
[WriteFCmp32, ReadFCmp32, ReadFCmp32] in { -def FEQ_S : FPCmp_rr<0b1010000, 0b010, "feq.s", FPR32>; -def FLT_S : FPCmp_rr<0b1010000, 0b001, "flt.s", FPR32>; -def FLE_S : FPCmp_rr<0b1010000, 0b000, "fle.s", FPR32>; +defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX, /*Commutable*/1>; +defm FLT_S : FPCmp_rr_m<0b1010000, 0b001, "flt.s", FINX>; +defm FLE_S : FPCmp_rr_m<0b1010000, 0b000, "fle.s", FINX>; } let mayRaiseFPException = 0 in -def FCLASS_S : FPUnaryOp_r<0b1110000, 0b00000, 0b001, GPR, FPR32, "fclass.s">, - Sched<[WriteFClass32, ReadFClass32]>; - -def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, 0b00000, FPR32, GPR, "fcvt.s.w">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCLASS_S : FPUnaryOp_r_m<0b1110000, 0b00000, 0b001, XFINX, "fclass.s">, + Sched<[WriteFClass32, ReadFClass32]>; -def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, 0b00001, FPR32, GPR, "fcvt.s.wu">, +defm FCVT_S_W : FPUnaryOp_r_frm_m<0b1101000, 0b00000, FXINX, "fcvt.s.w">, Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_WU : FPUnaryOp_r_frm_m<0b1101000, 0b00001, FXINX, "fcvt.s.wu">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; let mayRaiseFPException = 0 in def FMV_W_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR32, GPR, "fmv.w.x">, Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; -} // Predicates = [HasStdExtF] - -let Predicates = [HasStdExtF, IsRV64] in { -def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, 0b00010, GPR, FPR32, "fcvt.l.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; -def : FPUnaryOpDynFrmAlias; -def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, 0b00011, GPR, FPR32, "fcvt.lu.s">, +defm FCVT_L_S : FPUnaryOp_r_frm_m<0b1100000, 0b00010, XFIN64X, "fcvt.l.s">, Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, 0b00010, FPR32, GPR, "fcvt.s.l">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_LU_S : FPUnaryOp_r_frm_m<0b1100000, 0b00011, XFIN64X, "fcvt.lu.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, 0b00011, FPR32, GPR, "fcvt.s.lu">, +defm FCVT_S_L : FPUnaryOp_r_frm_m<0b1101000, 0b00010, FXIN64X, "fcvt.s.l">, Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtF, IsRV64] +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_LU : FPUnaryOp_r_frm_m<0b1101000, 0b00011, FXIN64X, "fcvt.s.lu">, + Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -315,6 +452,16 @@ def PseudoQuietFLT_S : PseudoQuietFCMP; } } // Predicates = [HasStdExtF] +let Predicates = [HasStdExtZfinx] in { +def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S_INX FPR32INX:$rd, FPR32INX:$rs, FPR32INX:$rs)>; +def : InstAlias<"fneg.s $rd, $rs", (FSGNJN_S_INX FPR32INX:$rd, FPR32INX:$rs, FPR32INX:$rs)>; + +def : InstAlias<"fgt.s $rd, $rs, $rt", + (FLT_S_INX GPR:$rd, FPR32INX:$rt, FPR32INX:$rs), 0>; +def : InstAlias<"fge.s $rd, $rs, $rt", + (FLE_S_INX GPR:$rd, FPR32INX:$rt, FPR32INX:$rs), 0>; +} // Predicates = [HasStdExtZfinx] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns 
//===----------------------------------------------------------------------===// @@ -327,11 +474,13 @@ def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>; class PatSetCC : Pat<(OpNode Ty:$rs1, Ty:$rs2, Cond), (Inst $rs1, $rs2)>; -class PatFpr32Fpr32 - : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>; +class PatFprFpr + : Pat<(OpNode RegTy:$rs1, RegTy:$rs2), (Inst $rs1, $rs2)>; -class PatFpr32Fpr32DynFrm - : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2, 0b111)>; +class PatFprFprDynFrm + : Pat<(OpNode RegTy:$rs1, RegTy:$rs2), (Inst $rs1, $rs2, 0b111)>; let Predicates = [HasStdExtF] in { @@ -346,17 +495,17 @@ def : Pat<(f32 (fpimmneg0)), (FSGNJN_S (FMV_W_X X0), (FMV_W_X X0))>; /// Float arithmetic operations -def : PatFpr32Fpr32DynFrm; -def : PatFpr32Fpr32DynFrm; -def : PatFpr32Fpr32DynFrm; -def : PatFpr32Fpr32DynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; def : Pat<(any_fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>; def : Pat<(fneg FPR32:$rs1), (FSGNJN_S $rs1, $rs1)>; def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>; -def : PatFpr32Fpr32; +def : PatFprFpr; def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; // fmadd: rs1 * rs2 + rs3 @@ -375,11 +524,15 @@ def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; +// fnmadd: -(rs1 * rs2 + rs3) (the nsz flag on the FMA) +def : Pat<(fneg (any_fma_nsz FPR32:$rs1, FPR32:$rs2, FPR32:$rs3)), + (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches // LLVM's fminnum and fmaxnum // . 
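The no-signed-zeros guard on the fneg (any_fma_nsz ...) -> FNMADD patterns above is load-bearing: fnmadd computes -(rs1 x rs2) - rs3 with a single rounding, while the unfused form negates the already-rounded FMA result, and the two can disagree on the sign of a zero. A concrete case, exact under IEEE 754 round-to-nearest:

    a = 1,\ b = 1,\ c = -1:\quad
    \operatorname{fneg}(\operatorname{fma}(a,b,c)) = -(1 \cdot 1 - 1) = -(+0.0) = -0.0,
    \quad\text{but}\quad
    -(a \cdot b) - c = -1 + 1 = +0.0 .

Both computations are exact, yet the zero signs differ, which is why the fold is only attempted when the node carries the nsz flag -- precisely what the any_fma_nsz PatFrag checks via N->getFlags().hasNoSignedZeros().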
-def : PatFpr32Fpr32; -def : PatFpr32Fpr32; +def : PatFprFpr; +def : PatFprFpr; /// Setcc // FIXME: SETEQ/SETLT/SETLE imply nonans, can we pick better instructions for diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index b62e23d3b0fa..72ba8460116f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -25,13 +25,13 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtM] in { -def MUL : ALU_rr<0b0000001, 0b000, "mul">, +def MUL : ALU_rr<0b0000001, 0b000, "mul", /*Commutable*/1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; -def MULH : ALU_rr<0b0000001, 0b001, "mulh">, +def MULH : ALU_rr<0b0000001, 0b001, "mulh", /*Commutable*/1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">, Sched<[WriteIMul, ReadIMul, ReadIMul]>; -def MULHU : ALU_rr<0b0000001, 0b011, "mulhu">, +def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", /*Commutable*/1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def DIV : ALU_rr<0b0000001, 0b100, "div">, Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>; @@ -44,7 +44,7 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">, } // Predicates = [HasStdExtM] let Predicates = [HasStdExtM, IsRV64] in { -def MULW : ALUW_rr<0b0000001, 0b000, "mulw">, +def MULW : ALUW_rr<0b0000001, 0b000, "mulw", /*Commutable*/1>, Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>; def DIVW : ALUW_rr<0b0000001, 0b100, "divw">, Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 306024a3e4fd..f8bc241039f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// /// /// This file describes the RISC-V instructions from the standard 'V' Vector -/// extension, version 0.10. -/// This version is still experimental as the 'V' extension hasn't been -/// ratified yet. +/// extension, version 1.0. 
/// //===----------------------------------------------------------------------===// @@ -895,6 +893,7 @@ defm VSUB_V : VALU_IV_V_X<"vsub", 0b000010>; defm VRSUB_V : VALU_IV_X_I<"vrsub", 0b000011>; def : InstAlias<"vneg.v $vd, $vs$vm", (VRSUB_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; +def : InstAlias<"vneg.v $vd, $vs", (VRSUB_VX VR:$vd, VR:$vs, X0, zero_reg)>; // Vector Widening Integer Add/Subtract // Refer to 11.2 Widening Vector Arithmetic Instructions @@ -922,8 +921,12 @@ defm VWSUB_W : VALU_MV_V_X<"vwsub", 0b110111, "w">; def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm", (VWADD_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; +def : InstAlias<"vwcvt.x.x.v $vd, $vs", + (VWADD_VX VR:$vd, VR:$vs, X0, zero_reg)>; def : InstAlias<"vwcvtu.x.x.v $vd, $vs$vm", (VWADDU_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; +def : InstAlias<"vwcvtu.x.x.v $vd, $vs", + (VWADDU_VX VR:$vd, VR:$vs, X0, zero_reg)>; // Vector Integer Extension defm VZEXT_VF8 : VALU_MV_VS2<"vzext.vf8", 0b010010, 0b00010>; @@ -952,6 +955,8 @@ defm VXOR_V : VALU_IV_V_X_I<"vxor", 0b001011>; def : InstAlias<"vnot.v $vd, $vs$vm", (VXOR_VI VR:$vd, VR:$vs, -1, VMaskOp:$vm)>; +def : InstAlias<"vnot.v $vd, $vs", + (VXOR_VI VR:$vd, VR:$vs, -1, zero_reg)>; // Vector Single-Width Bit Shift Instructions defm VSLL_V : VSHT_IV_V_X_I<"vsll", 0b100101, uimm5>; @@ -970,6 +975,8 @@ defm VNSRA_W : VNSHT_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">; def : InstAlias<"vncvt.x.x.w $vd, $vs$vm", (VNSRL_WX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; +def : InstAlias<"vncvt.x.x.w $vd, $vs", + (VNSRL_WX VR:$vd, VR:$vs, X0, zero_reg)>; // Vector Integer Comparison Instructions let RVVConstraint = NoConstraint in { @@ -1124,12 +1131,16 @@ defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">; let Predicates = [HasVInstructionsAnyF] in { // Vector Single-Width Floating-Point Add/Subtract Instructions +let Uses = [FRM], mayRaiseFPException = true in { defm VFADD_V : VALU_FV_V_F<"vfadd", 0b000000>; defm VFSUB_V : VALU_FV_V_F<"vfsub", 0b000010>; defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>; +} // Vector Widening Floating-Point Add/Subtract Instructions -let Constraints = "@earlyclobber $vd" in { +let Constraints = "@earlyclobber $vd", + Uses = [FRM], + mayRaiseFPException = true in { let RVVConstraint = WidenV in { defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000>; defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010>; @@ -1142,19 +1153,23 @@ let RVVConstraint = WidenW in { defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">; defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">; } // RVVConstraint = WidenW -} // Constraints = "@earlyclobber $vd" +} // Constraints = "@earlyclobber $vd", Uses = [FRM], mayRaiseFPException = true // Vector Single-Width Floating-Point Multiply/Divide Instructions +let Uses = [FRM], mayRaiseFPException = true in { defm VFMUL_V : VMUL_FV_V_F<"vfmul", 0b100100>; defm VFDIV_V : VDIV_FV_V_F<"vfdiv", 0b100000>; defm VFRDIV_V : VRDIV_FV_F<"vfrdiv", 0b100001>; +} // Vector Widening Floating-Point Multiply -let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { +let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + Uses = [FRM], mayRaiseFPException = true in { defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true // Vector Single-Width Floating-Point Fused Multiply-Add Instructions +let Uses = [FRM], mayRaiseFPException = true in { defm VFMACC_V : VMAC_FV_V_F<"vfmacc", 0b101100>; defm 
VFNMACC_V : VMAC_FV_V_F<"vfnmacc", 0b101101>; defm VFMSAC_V : VMAC_FV_V_F<"vfmsac", 0b101110>; @@ -1163,23 +1178,31 @@ defm VFMADD_V : VMAC_FV_V_F<"vfmadd", 0b101000>; defm VFNMADD_V : VMAC_FV_V_F<"vfnmadd", 0b101001>; defm VFMSUB_V : VMAC_FV_V_F<"vfmsub", 0b101010>; defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>; +} // Vector Widening Floating-Point Fused Multiply-Add Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { +let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + Uses = [FRM], mayRaiseFPException = true in { defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>; defm VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 0b111101>; defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>; defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true // Vector Floating-Point Square-Root Instruction +let Uses = [FRM], mayRaiseFPException = true in { defm VFSQRT_V : VSQR_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>; -defm VFRSQRT7_V : VRCP_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>; defm VFREC7_V : VRCP_FV_VS2<"vfrec7.v", 0b010011, 0b00101>; +} + +let mayRaiseFPException = true in +defm VFRSQRT7_V : VRCP_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>; // Vector Floating-Point MIN/MAX Instructions +let mayRaiseFPException = true in { defm VFMIN_V : VCMP_FV_V_F<"vfmin", 0b000100>; defm VFMAX_V : VCMP_FV_V_F<"vfmax", 0b000110>; +} // Vector Floating-Point Sign-Injection Instructions defm VFSGNJ_V : VSGNJ_FV_V_F<"vfsgnj", 0b001000>; @@ -1188,18 +1211,22 @@ defm VFSGNJX_V : VSGNJ_FV_V_F<"vfsgnjx", 0b001010>; def : InstAlias<"vfneg.v $vd, $vs$vm", (VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>; +def : InstAlias<"vfneg.v $vd, $vs", + (VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, zero_reg)>; def : InstAlias<"vfabs.v $vd, $vs$vm", (VFSGNJX_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>; +def : InstAlias<"vfabs.v $vd, $vs", + (VFSGNJX_VV VR:$vd, VR:$vs, VR:$vs, zero_reg)>; // Vector Floating-Point Compare Instructions -let RVVConstraint = NoConstraint in { +let RVVConstraint = NoConstraint, mayRaiseFPException = true in { defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>; defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>; defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>; defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>; defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>; defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>; -} // RVVConstraint = NoConstraint +} // RVVConstraint = NoConstraint, mayRaiseFPException = true def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm", (VMFLT_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; @@ -1288,10 +1315,14 @@ defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>; let Predicates = [HasVInstructionsAnyF] in { // Vector Single-Width Floating-Point Reduction Instructions let RVVConstraint = NoConstraint in { +let Uses = [FRM], mayRaiseFPException = true in { defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>; defm VFREDUSUM : VRED_FV_V<"vfredusum", 0b000001>; +} +let mayRaiseFPException = true in { defm VFREDMAX : VRED_FV_V<"vfredmax", 0b000111>; defm VFREDMIN : VRED_FV_V<"vfredmin", 0b000101>; +} } // RVVConstraint = NoConstraint def : InstAlias<"vfredsum.vs $vd, $vs2, $vs1$vm", @@ -1303,8 +1334,10 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) 
operand. +let Uses = [FRM], mayRaiseFPException = true in { defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>; defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>; +} } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 9087ed50f9fc..fbe396d278b4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -7,8 +7,7 @@ //===----------------------------------------------------------------------===// /// /// This file contains the required infrastructure to support code generation -/// for the standard 'V' (Vector) extension, version 0.10. This version is still -/// experimental as the 'V' extension hasn't been ratified yet. +/// for the standard 'V' (Vector) extension, version 1.0. /// /// This file is included from RISCVInstrInfoV.td /// @@ -40,13 +39,37 @@ def DecImm : SDNodeXFormgetValueType(0)); }]>; -defvar TAIL_UNDISTURBED = 0; +defvar TAIL_UNDISTURBED_MASK_UNDISTURBED = 0; defvar TAIL_AGNOSTIC = 1; //===----------------------------------------------------------------------===// // Utilities. //===----------------------------------------------------------------------===// +class PseudoToVInst { + string VInst = !subst("_M8", "", + !subst("_M4", "", + !subst("_M2", "", + !subst("_M1", "", + !subst("_MF2", "", + !subst("_MF4", "", + !subst("_MF8", "", + !subst("_B1", "", + !subst("_B2", "", + !subst("_B4", "", + !subst("_B8", "", + !subst("_B16", "", + !subst("_B32", "", + !subst("_B64", "", + !subst("_MASK", "", + !subst("_TIED", "", + !subst("_TU", "", + !subst("F16", "F", + !subst("F32", "F", + !subst("F64", "F", + !subst("Pseudo", "", PseudoInst))))))))))))))))))))); +} + // This class describes information associated to the LMUL. class LMULInfo { @@ -403,7 +426,7 @@ class CONST8b val> { def InvalidIndex : CONST8b<0x80>; class RISCVVPseudo { Pseudo Pseudo = !cast(NAME); // Used as a key. - Instruction BaseInstr; + Instruction BaseInstr = !cast(PseudoToVInst.VInst); } // The actual table. 
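The PseudoToVInst helper above derives each vector pseudo's base instruction name purely by string surgery, and wiring it into the BaseInstr default of RISCVVPseudo is what lets the many per-class "let BaseInstr = !cast(...)" lines disappear in the hunks below. A trimmed, self-contained illustration of the !subst chain, checkable with llvm-tblgen (only three of the twenty-odd substitutions are kept here):

    // name_demo.td -- check with: llvm-tblgen name_demo.td
    class PseudoToVInst<string PseudoInst> {
      string VInst = !subst("_M8", "",
                     !subst("_MASK", "",
                     !subst("Pseudo", "", PseudoInst)));
    }
    // The record dump shows:  string VInst = "VADD_VV";
    def demo : PseudoToVInst<"PseudoVADD_VV_M8_MASK">;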
@@ -419,11 +442,26 @@ def RISCVVPseudosTable : GenericTable { def RISCVVIntrinsicsTable : GenericTable { let FilterClass = "RISCVVIntrinsic"; let CppTypeName = "RISCVVIntrinsicInfo"; - let Fields = ["IntrinsicID", "SplatOperand", "VLOperand"]; + let Fields = ["IntrinsicID", "ScalarOperand", "VLOperand"]; let PrimaryKey = ["IntrinsicID"]; let PrimaryKeyName = "getRISCVVIntrinsicInfo"; } +class RISCVMaskedPseudo MaskIdx, bit HasTU = true> { + Pseudo MaskedPseudo = !cast(NAME); + Pseudo UnmaskedPseudo = !cast(!subst("_MASK", "", NAME)); + Pseudo UnmaskedTUPseudo = !if(HasTU, !cast(!subst("_MASK", "", NAME # "_TU")), MaskedPseudo); + bits<4> MaskOpIdx = MaskIdx; +} + +def RISCVMaskedPseudosTable : GenericTable { + let FilterClass = "RISCVMaskedPseudo"; + let CppTypeName = "RISCVMaskedPseudoInfo"; + let Fields = ["MaskedPseudo", "UnmaskedPseudo", "UnmaskedTUPseudo", "MaskOpIdx"]; + let PrimaryKey = ["MaskedPseudo"]; + let PrimaryKeyName = "getMaskedPseudoInfo"; +} + class RISCVVLE S, bits<3> L> { bits<1> Masked = M; bits<1> IsTU = TU; @@ -489,9 +527,10 @@ def RISCVVSXTable : RISCVVLX_VSXTable { let PrimaryKeyName = "getVSXPseudo"; } -class RISCVVLSEG N, bit M, bit Str, bit F, bits<3> S, bits<3> L> { +class RISCVVLSEG N, bit M, bit TU, bit Str, bit F, bits<3> S, bits<3> L> { bits<4> NF = N; bits<1> Masked = M; + bits<1> IsTU = TU; bits<1> Strided = Str; bits<1> FF = F; bits<3> Log2SEW = S; @@ -502,14 +541,15 @@ class RISCVVLSEG N, bit M, bit Str, bit F, bits<3> S, bits<3> L> { def RISCVVLSEGTable : GenericTable { let FilterClass = "RISCVVLSEG"; let CppTypeName = "VLSEGPseudo"; - let Fields = ["NF", "Masked", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"]; - let PrimaryKey = ["NF", "Masked", "Strided", "FF", "Log2SEW", "LMUL"]; + let Fields = ["NF", "Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"]; + let PrimaryKey = ["NF", "Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL"]; let PrimaryKeyName = "getVLSEGPseudo"; } -class RISCVVLXSEG N, bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> { +class RISCVVLXSEG N, bit M, bit TU, bit O, bits<3> S, bits<3> L, bits<3> IL> { bits<4> NF = N; bits<1> Masked = M; + bits<1> IsTU = TU; bits<1> Ordered = O; bits<3> Log2SEW = S; bits<3> LMUL = L; @@ -520,8 +560,8 @@ class RISCVVLXSEG N, bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> { def RISCVVLXSEGTable : GenericTable { let FilterClass = "RISCVVLXSEG"; let CppTypeName = "VLXSEGPseudo"; - let Fields = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"]; - let PrimaryKey = ["NF", "Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"]; + let Fields = ["NF", "Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"]; + let PrimaryKey = ["NF", "Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"]; let PrimaryKeyName = "getVLXSEGPseudo"; } @@ -564,30 +604,6 @@ def RISCVVSXSEGTable : GenericTable { // Helpers to define the different pseudo instructions. 
//===----------------------------------------------------------------------===// -class PseudoToVInst { - string VInst = !subst("_M8", "", - !subst("_M4", "", - !subst("_M2", "", - !subst("_M1", "", - !subst("_MF2", "", - !subst("_MF4", "", - !subst("_MF8", "", - !subst("_B1", "", - !subst("_B2", "", - !subst("_B4", "", - !subst("_B8", "", - !subst("_B16", "", - !subst("_B32", "", - !subst("_B64", "", - !subst("_MASK", "", - !subst("_TIED", "", - !subst("_TU", "", - !subst("F16", "F", - !subst("F32", "F", - !subst("F64", "F", - !subst("Pseudo", "", PseudoInst))))))))))))))))))))); -} - // The destination vector register group for a masked vector instruction cannot // overlap the source mask register (v0), unless the destination vector register // is being written with a mask value (e.g., comparisons) or the scalar result @@ -627,25 +643,24 @@ class VPseudo : let VLMul = m.value; } -class VPseudoUSLoadNoMask : +class VPseudoUSLoadNoMask : Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLE.val, VLMul> { + RISCVVLE.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let HasDummyMask = DummyMask; } -class VPseudoUSLoadNoMaskTU : +class VPseudoUSLoadNoMaskTU : Pseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLE.val, VLMul> { + RISCVVLE.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -654,16 +669,15 @@ class VPseudoUSLoadNoMaskTU : let HasDummyMask = 1; let HasMergeOp = 1; let Constraints = "$rd = $dest"; - let BaseInstr = !cast(PseudoToVInst.VInst); } -class VPseudoUSLoadMask : +class VPseudoUSLoadMask : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, GPR:$rs1, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLE.val, VLMul> { + RISCVVLE.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -672,7 +686,53 @@ class VPseudoUSLoadMask : let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; +} + +class VPseudoUSLoadFFNoMask : + Pseudo<(outs RetClass:$rd, GPR:$vl), + (ins GPR:$rs1, AVL:$avl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLE.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = DummyMask; +} + +class VPseudoUSLoadFFNoMaskTU : + Pseudo<(outs RetClass:$rd, GPR:$vl), + (ins RetClass:$dest, GPR:$rs1, AVL:$avl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLE.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; +} + +class VPseudoUSLoadFFMask : + Pseudo<(outs GetVRegNoV0.R:$rd, GPR:$vl), + (ins GetVRegNoV0.R:$merge, + GPR:$rs1, + VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy),[]>, + RISCVVPseudo, + RISCVVLE.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = "$rd = $merge"; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasMergeOp = 1; + let HasVecPolicyOp = 1; + let UsesMaskPolicy = 1; } class VPseudoSLoadNoMask: @@ -686,7 +746,6 @@ class VPseudoSLoadNoMask: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSLoadNoMaskTU: @@ -702,7 +761,6 @@ 
class VPseudoSLoadNoMaskTU: let HasDummyMask = 1; let HasMergeOp = 1; let Constraints = "$rd = $dest"; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSLoadMask: @@ -720,7 +778,7 @@ class VPseudoSLoadMask: let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoILoadNoMask LMUL, @@ -737,7 +795,6 @@ class VPseudoILoadNoMask LMUL, let HasSEWOp = 1; let HasDummyMask = 1; let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd", ""); - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoILoadNoMaskTU LMUL, @@ -755,7 +812,6 @@ class VPseudoILoadNoMaskTU LMUL, let HasDummyMask = 1; let HasMergeOp = 1; let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $dest", "$rd = $dest"); - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoILoadMask LMUL, @@ -774,10 +830,10 @@ class VPseudoILoadMask LMUL, let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } -class VPseudoUSStoreNoMask: +class VPseudoUSStoreNoMask: Pseudo<(outs), (ins StClass:$rd, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, @@ -787,8 +843,7 @@ class VPseudoUSStoreNoMask: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let HasDummyMask = DummyMask; } class VPseudoUSStoreMask: @@ -801,7 +856,6 @@ class VPseudoUSStoreMask: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSStoreNoMask: @@ -815,7 +869,6 @@ class VPseudoSStoreNoMask: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSStoreMask: @@ -828,7 +881,6 @@ class VPseudoSStoreMask: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } // Unary instruction that is never masked so HasDummyMask=0. @@ -842,7 +894,20 @@ class VPseudoUnaryNoDummyMask(PseudoToVInst.VInst); +} + +class VPseudoUnaryNoDummyMaskTU : + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; } class VPseudoNullaryNoMask: @@ -855,13 +920,26 @@ class VPseudoNullaryNoMask: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); +} + +class VPseudoNullaryNoMaskTU: + Pseudo<(outs RegClass:$rd), + (ins RegClass:$merge, AVL:$vl, ixlenimm:$sew), + []>, RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = "$rd = $merge"; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; } class VPseudoNullaryMask: Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, VMaskOp:$vm, AVL:$vl, - ixlenimm:$sew), []>, RISCVVPseudo { + ixlenimm:$sew, ixlenimm:$policy), []>, RISCVVPseudo { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -869,7 +947,8 @@ class VPseudoNullaryMask: let HasVLOp = 1; let HasSEWOp = 1; let HasMergeOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; + let HasVecPolicyOp = 1; } // Nullary for pseudo instructions. 
They are expanded in @@ -899,7 +978,21 @@ class VPseudoUnaryNoMask(PseudoToVInst.VInst); +} + +// RetClass could be GPR or VReg. +class VPseudoUnaryNoMaskTU : + Pseudo<(outs RetClass:$rd), + (ins RetClass:$merge, OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; } class VPseudoUnaryMask : @@ -914,7 +1007,7 @@ class VPseudoUnaryMask : let HasVLOp = 1; let HasSEWOp = 1; let HasMergeOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoUnaryMaskTA : @@ -930,7 +1023,7 @@ class VPseudoUnaryMaskTA : let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } // mask unary operation without maskedoff @@ -943,7 +1036,6 @@ class VPseudoMaskUnarySOutMask: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } // Mask can be V0~V31 @@ -962,13 +1054,13 @@ class VPseudoUnaryAnyMask(PseudoToVInst.VInst); } class VPseudoBinaryNoMask : + string Constraint, + int DummyMask = 1> : Pseudo<(outs RetClass:$rd), (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, RISCVVPseudo { @@ -978,8 +1070,24 @@ class VPseudoBinaryNoMask : + Pseudo<(outs RetClass:$rd), + (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret; + let HasVLOp = 1; + let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let HasMergeOp = 1; } // Special version of VPseudoBinaryNoMask where we pretend the first source is @@ -989,7 +1097,8 @@ class VPseudoTiedBinaryNoMask : Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, + (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew, + ixlenimm:$policy), []>, RISCVVPseudo { let mayLoad = 0; let mayStore = 0; @@ -998,9 +1107,8 @@ class VPseudoTiedBinaryNoMask(PseudoToVInst.VInst); } class VPseudoIStoreNoMask LMUL, @@ -1015,7 +1123,6 @@ class VPseudoIStoreNoMask LMUL, let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoIStoreMask LMUL, @@ -1029,7 +1136,6 @@ class VPseudoIStoreMask LMUL, let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoBinaryMask(PseudoToVInst.VInst); } -class VPseudoBinaryMaskTA : +class VPseudoBinaryMaskPolicy : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, Op1Class:$rs2, Op2Class:$rs1, @@ -1068,7 +1173,7 @@ class VPseudoBinaryMaskTA(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } // Like VPseudoBinaryMask, but output can be V0. 
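// Illustrative note, not part of the vendored patch: the new "_TU"
// (tail-undisturbed) pseudo classes above add a $merge/$dest source tied to
// the destination ("$rd = $merge") so that elements past VL are taken from
// the merge operand rather than being left unspecified. For example, with
// VLMAX = 8 and VL = 4:
//
//   result[0..3] = op(src[0..3])   // body elements, both variants
//   result[4..7] = merge[4..7]     // _TU keeps the tail; the tail-agnostic
//                                  // forms may leave each tail element
//                                  // undisturbed or overwrite it with ones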
@@ -1088,7 +1193,7 @@ class VPseudoBinaryMOutMask(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } // Special version of VPseudoBinaryMask where we pretend the first source is @@ -1110,7 +1215,7 @@ class VPseudoTiedBinaryMask(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoBinaryCarryIn(PseudoToVInst.VInst); let VLMul = MInfo.value; } @@ -1156,7 +1260,6 @@ class VPseudoTiedBinaryCarryIn(PseudoToVInst.VInst); let VLMul = MInfo.value; } @@ -1177,7 +1280,6 @@ class VPseudoTernaryNoMask(PseudoToVInst.VInst); } class VPseudoTernaryNoMaskWithPolicy(PseudoToVInst.VInst); } -class VPseudoUSSegLoadNoMask NF, bit isFF>: +class VPseudoUSSegLoadNoMask NF>: Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLSEG.val, VLMul> { + RISCVVLSEG.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } -class VPseudoUSSegLoadMask NF, bit isFF>: +class VPseudoUSSegLoadNoMaskTU NF>: + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLSEG.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; +} + +class VPseudoUSSegLoadMask NF>: Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, GPR:$rs1, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLSEG.val, VLMul> { + RISCVVLSEG.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = "$rd = $merge"; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasMergeOp = 1; + let HasVecPolicyOp = 1; + let UsesMaskPolicy = 1; +} + +class VPseudoUSSegLoadFFNoMask NF>: + Pseudo<(outs RetClass:$rd, GPR:$vl), + (ins GPR:$rs1, AVL:$avl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLSEG.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; +} + +class VPseudoUSSegLoadFFNoMaskTU NF>: + Pseudo<(outs RetClass:$rd, GPR:$vl), + (ins RetClass:$dest, GPR:$rs1, AVL:$avl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLSEG.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; +} + +class VPseudoUSSegLoadFFMask NF>: + Pseudo<(outs GetVRegNoV0.R:$rd, GPR:$vl), + (ins GetVRegNoV0.R:$merge, GPR:$rs1, + VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy),[]>, + RISCVVPseudo, + RISCVVLSEG.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -1229,14 +1389,14 @@ class VPseudoUSSegLoadMask NF, bit isFF>: let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoSSegLoadNoMask NF>: Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, GPR:$offset, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLSEG.val, VLMul> { + RISCVVLSEG.val, VLMul> { let mayLoad = 1; let mayLoad = 1; let mayStore = 0; @@ -1244,7 +1404,22 @@ class VPseudoSSegLoadNoMask NF>: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); +} + +class VPseudoSSegLoadNoMaskTU NF>: + Pseudo<(outs RetClass:$rd), + (ins RetClass:$merge, GPR:$rs1, GPR:$offset, AVL:$vl, ixlenimm:$sew),[]>, + RISCVVPseudo, + 
RISCVVLSEG.val, VLMul> { + let mayLoad = 1; + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $merge"; } class VPseudoSSegLoadMask NF>: @@ -1253,7 +1428,7 @@ class VPseudoSSegLoadMask NF>: GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLSEG.val, VLMul> { + RISCVVLSEG.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -1262,7 +1437,7 @@ class VPseudoSSegLoadMask NF>: let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoISegLoadNoMask LMUL, @@ -1270,7 +1445,7 @@ class VPseudoISegLoadNoMask LMUL, Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, IdxClass:$offset, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLXSEG.val, VLMul, LMUL> { + RISCVVLXSEG.val, VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -1280,7 +1455,24 @@ class VPseudoISegLoadNoMask LMUL, let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); +} + +class VPseudoISegLoadNoMaskTU LMUL, + bits<4> NF, bit Ordered>: + Pseudo<(outs RetClass:$rd), + (ins RetClass:$merge, GPR:$rs1, IdxClass:$offset, AVL:$vl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLXSEG.val, VLMul, LMUL> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + // For vector indexed segment loads, the destination vector register groups + // cannot overlap the source vector register group + let Constraints = "@earlyclobber $rd, $rd = $merge"; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; } class VPseudoISegLoadMask LMUL, @@ -1290,7 +1482,7 @@ class VPseudoISegLoadMask LMUL, IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLXSEG.val, VLMul, LMUL> { + RISCVVLXSEG.val, VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -1301,7 +1493,7 @@ class VPseudoISegLoadMask LMUL, let HasSEWOp = 1; let HasMergeOp = 1; let HasVecPolicyOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); + let UsesMaskPolicy = 1; } class VPseudoUSSegStoreNoMask NF>: @@ -1315,7 +1507,6 @@ class VPseudoUSSegStoreNoMask NF>: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoUSSegStoreMask NF>: @@ -1329,7 +1520,6 @@ class VPseudoUSSegStoreMask NF>: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSSegStoreNoMask NF>: @@ -1343,7 +1533,6 @@ class VPseudoSSegStoreNoMask NF>: let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoSSegStoreMask NF>: @@ -1357,7 +1546,6 @@ class VPseudoSSegStoreMask NF>: let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoISegStoreNoMask LMUL, @@ -1373,7 +1561,6 @@ class VPseudoISegStoreNoMask LMUL let HasVLOp = 1; let HasSEWOp = 1; let HasDummyMask = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } class VPseudoISegStoreMask LMUL, @@ -1388,7 +1575,6 @@ class VPseudoISegStoreMask LMUL, let hasSideEffects = 0; let HasVLOp = 1; let HasSEWOp = 1; - let BaseInstr = !cast(PseudoToVInst.VInst); } multiclass VPseudoUSLoad { @@ -1398,13 +1584,13 @@ multiclass VPseudoUSLoad { defvar vreg = lmul.vrclass; let VLMul = lmul.value in { 
def "E" # eew # "_V_" # LInfo : - VPseudoUSLoadNoMask, + VPseudoUSLoadNoMask, VLESched; def "E" # eew # "_V_" # LInfo # "_TU": - VPseudoUSLoadNoMaskTU, + VPseudoUSLoadNoMaskTU, VLESched; def "E" # eew # "_V_" # LInfo # "_MASK" : - VPseudoUSLoadMask, + VPseudoUSLoadMask, VLESched; } } @@ -1417,14 +1603,14 @@ multiclass VPseudoFFLoad { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "FF_V_" # LInfo : - VPseudoUSLoadNoMask, + def "E" # eew # "FF_V_" # LInfo: + VPseudoUSLoadFFNoMask, VLFSched; def "E" # eew # "FF_V_" # LInfo # "_TU": - VPseudoUSLoadNoMaskTU, + VPseudoUSLoadFFNoMaskTU, VLFSched; - def "E" # eew # "FF_V_" # LInfo # "_MASK" : - VPseudoUSLoadMask, + def "E" # eew # "FF_V_" # LInfo # "_MASK": + VPseudoUSLoadFFMask, VLFSched; } } @@ -1434,7 +1620,7 @@ multiclass VPseudoFFLoad { multiclass VPseudoLoadMask { foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_V_" # mti.BX : VPseudoUSLoadNoMask; + def "_V_" # mti.BX : VPseudoUSLoadNoMask; } } } @@ -1506,7 +1692,7 @@ multiclass VPseudoUSStore { multiclass VPseudoStoreMask { foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_V_" # mti.BX : VPseudoUSStoreNoMask; + def "_V_" # mti.BX : VPseudoUSStoreNoMask; } } } @@ -1596,6 +1782,8 @@ multiclass VPseudoVID_V { let VLMul = m.value in { def "_V_" # m.MX : VPseudoNullaryNoMask, Sched<[WriteVMIdxV, ReadVMask]>; + def "_V_" # m.MX # "_TU": VPseudoNullaryNoMaskTU, + Sched<[WriteVMIdxV, ReadVMask]>; def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask, Sched<[WriteVMIdxV, ReadVMask]>; } @@ -1616,7 +1804,9 @@ multiclass VPseudoVIOT_M { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMask, + def "_" # m.MX # "_TU" : VPseudoUnaryNoMaskTU, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; + def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; } } @@ -1638,8 +1828,11 @@ multiclass VPseudoBinary; - def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMaskTA; + def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMaskPolicy, + RISCVMaskedPseudo; } } @@ -1653,7 +1846,8 @@ multiclass VPseudoBinaryM; let ForceTailAgnostic = true in def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask; + Op2Class, Constraint>, + RISCVMaskedPseudo; } } @@ -1666,8 +1860,11 @@ multiclass VPseudoBinaryEmul; - def "_" # lmul.MX # "_" # emul.MX # "_MASK" : VPseudoBinaryMaskTA; + def "_" # lmul.MX # "_" # emul.MX # "_TU": VPseudoBinaryNoMaskTU; + def "_" # lmul.MX # "_" # emul.MX # "_MASK" : VPseudoBinaryMaskPolicy, + RISCVMaskedPseudo; } } @@ -1744,7 +1941,7 @@ multiclass VPseudoBinaryV_VI { multiclass VPseudoVALU_MM { foreach m = MxList in let VLMul = m.value in { - def "_MM_" # m.MX : VPseudoBinaryNoMask, + def "_MM_" # m.MX : VPseudoBinaryNoMask, Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; } } @@ -1907,6 +2104,12 @@ multiclass VPseudoUnaryVMV_V_X_I { Sched<[WriteVIMovX, ReadVIMovX]>; def "_I_" # m.MX : VPseudoUnaryNoDummyMask, Sched<[WriteVIMovI]>; + def "_V_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovV, ReadVIMovV]>; + def "_X_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovX, ReadVIMovX]>; + def "_I_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovI]>; } } } @@ -1918,6 +2121,9 @@ multiclass VPseudoVMV_F { def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask, Sched<[WriteVFMovV, ReadVFMovF]>; + def "_" # f.FX # "_" # m.MX # "_TU": + VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVFMovV, 
ReadVFMovF]>; } } } @@ -1928,7 +2134,9 @@ multiclass VPseudoVCLS_V { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; - def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask, + def "_V_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; } } @@ -1939,6 +2147,8 @@ multiclass VPseudoVSQR_V { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; + def "_V_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; } @@ -1950,6 +2160,8 @@ multiclass VPseudoVRCP_V { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; + def "_V_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; } @@ -1963,8 +2175,11 @@ multiclass PseudoVEXT_VF2 { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, + RISCVMaskedPseudo, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } @@ -1977,8 +2192,11 @@ multiclass PseudoVEXT_VF4 { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, + RISCVMaskedPseudo, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } @@ -1991,8 +2209,11 @@ multiclass PseudoVEXT_VF8 { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_TU": VPseudoUnaryNoMaskTU, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA, + RISCVMaskedPseudo, Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } @@ -2248,6 +2469,13 @@ multiclass VPseudoVCALU_VM_XM_IM { Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; defm "" : VPseudoBinaryV_IM, Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; + // Tied versions to allow codegen control over the tail elements + defm "" : VPseudoTiedBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_IM, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } multiclass VPseudoVCALU_VM_XM { @@ -2255,6 +2483,11 @@ multiclass VPseudoVCALU_VM_XM { Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; defm "" : VPseudoBinaryV_XM, Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + // Tied versions to allow codegen control over the tail elements + defm "" : VPseudoTiedBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } multiclass VPseudoVCALUM_VM_XM_IM { @@ -2318,6 +2551,19 @@ multiclass VPseudoTernary { + let VLMul = MInfo.value in { + def "_" # MInfo.MX : VPseudoTernaryNoMask; + def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMaskPolicy; + + } +} + multiclass VPseudoTernaryWithPolicy; - 
def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMask; + def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMaskPolicy; } } @@ -2339,9 +2585,9 @@ multiclass VPseudoTernaryV_VV_AAXA { +multiclass VPseudoVSLDV_VX { foreach m = MxList in - defm _VX : VPseudoTernary; + defm _VX : VPseudoTernaryWithPolicy; } multiclass VPseudoTernaryV_VX_AAXA { @@ -2380,9 +2626,9 @@ multiclass VPseudoTernaryW_VF { m.vrclass, m, constraint>; } -multiclass VPseudoTernaryV_VI { +multiclass VPseudoVSLDV_VI { foreach m = MxList in - defm _VI : VPseudoTernary; + defm _VI : VPseudoTernaryWithPolicy; } multiclass VPseudoVMAC_VV_VX_AAXA { @@ -2400,9 +2646,9 @@ multiclass VPseudoVMAC_VV_VF_AAXA { } multiclass VPseudoVSLD_VX_VI { - defm "" : VPseudoTernaryV_VX, + defm "" : VPseudoVSLDV_VX, Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideV, ReadVISlideX, ReadVMask]>; - defm "" : VPseudoTernaryV_VI, + defm "" : VPseudoVSLDV_VI, Sched<[WriteVISlideI, ReadVISlideV, ReadVISlideV, ReadVMask]>; } @@ -2501,8 +2747,10 @@ multiclass VPseudoConversion { let VLMul = MInfo.value in { def "_" # MInfo.MX : VPseudoUnaryNoMask; + def "_" # MInfo.MX # "_TU": VPseudoUnaryNoMaskTU; def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMaskTA; + Constraint>, + RISCVMaskedPseudo; } } @@ -2566,18 +2814,38 @@ multiclass VPseudoVNCVTD_W { Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>; } -multiclass VPseudoUSSegLoad { +multiclass VPseudoUSSegLoad { foreach eew = EEWList in { foreach lmul = MxSet.m in { defvar LInfo = lmul.MX; let VLMul = lmul.value in { foreach nf = NFSet.L in { defvar vreg = SegRegClass.RC; - defvar FFStr = !if(isFF, "FF", ""); - def nf # "E" # eew # FFStr # "_V_" # LInfo : - VPseudoUSSegLoadNoMask; - def nf # "E" # eew # FFStr # "_V_" # LInfo # "_MASK" : - VPseudoUSSegLoadMask; + def nf # "E" # eew # "_V_" # LInfo : + VPseudoUSSegLoadNoMask; + def nf # "E" # eew # "_V_" # LInfo # "_TU" : + VPseudoUSSegLoadNoMaskTU; + def nf # "E" # eew # "_V_" # LInfo # "_MASK" : + VPseudoUSSegLoadMask; + } + } + } + } +} + +multiclass VPseudoUSSegLoadFF { + foreach eew = EEWList in { + foreach lmul = MxSet.m in { + defvar LInfo = lmul.MX; + let VLMul = lmul.value in { + foreach nf = NFSet.L in { + defvar vreg = SegRegClass.RC; + def nf # "E" # eew # "FF_V_" # LInfo : + VPseudoUSSegLoadFFNoMask; + def nf # "E" # eew # "FF_V_" # LInfo # "_TU" : + VPseudoUSSegLoadFFNoMaskTU; + def nf # "E" # eew # "FF_V_" # LInfo # "_MASK" : + VPseudoUSSegLoadFFMask; } } } @@ -2592,6 +2860,7 @@ multiclass VPseudoSSegLoad { foreach nf = NFSet.L in { defvar vreg = SegRegClass.RC; def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegLoadNoMask; + def nf # "E" # eew # "_V_" # LInfo # "_TU" : VPseudoSSegLoadNoMaskTU; def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegLoadMask; } } @@ -2618,6 +2887,9 @@ multiclass VPseudoISegLoad { def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo : VPseudoISegLoadNoMask; + def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_TU" : + VPseudoISegLoadNoMaskTU; def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" : VPseudoISegLoadMask; @@ -2702,12 +2974,31 @@ class VPatUnaryNoMask : Pat<(result_type (!cast(intrinsic_name) + (result_type undef), (op2_type op2_reg_class:$rs2), VLOpFrag)), (!cast(inst#"_"#kind#"_"#vlmul.MX) (op2_type op2_reg_class:$rs2), GPR:$vl, sew)>; +class VPatUnaryNoMaskTU : + Pat<(result_type (!cast(intrinsic_name) + (result_type result_reg_class:$merge), + (op2_type op2_reg_class:$rs2), + VLOpFrag)), + (!cast(inst#"_"#kind#"_"#vlmul.MX#"_TU") + (result_type result_reg_class:$merge), + (op2_type 
op2_reg_class:$rs2), + GPR:$vl, sew)>; + class VPatUnaryMask; -class VPatBinaryNoMask : +class VPatBinaryM : + Pat<(result_type (!cast(intrinsic_name) + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + VLOpFrag)), + (!cast(inst) + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + GPR:$vl, sew)>; + +class VPatBinaryNoMaskTA : Pat<(result_type (!cast(intrinsic_name) + (result_type (undef)), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), VLOpFrag)), @@ -2809,6 +3118,26 @@ class VPatBinaryNoMask; +class VPatBinaryNoMaskTU : + Pat<(result_type (!cast(intrinsic_name) + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + VLOpFrag)), + (!cast(inst#"_TU") + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + GPR:$vl, sew)>; + // Same as above but source operands are swapped. class VPatBinaryNoMaskSwapped : Pat<(result_type (!cast(intrinsic_name) + (result_type (undef)), (result_type result_reg_class:$rs1), (op2_type op2_kind:$rs2), VLOpFrag)), (!cast(inst#"_TIED") (result_type result_reg_class:$rs1), (op2_type op2_kind:$rs2), - GPR:$vl, sew)>; + GPR:$vl, sew, TAIL_AGNOSTIC)>; + +class VPatTiedBinaryNoMaskTU : + Pat<(result_type (!cast(intrinsic_name) + (result_type result_reg_class:$merge), + (result_type result_reg_class:$merge), + (op2_type op2_kind:$rs2), + VLOpFrag)), + (!cast(inst#"_TIED") + (result_type result_reg_class:$merge), + (op2_type op2_kind:$rs2), + GPR:$vl, sew, TAIL_UNDISTURBED_MASK_UNDISTURBED)>; class VPatTiedBinaryMask(inst#"_"#kind#"_"#vlmul.MX) result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - GPR:$vl, sew, TAIL_UNDISTURBED)>; + GPR:$vl, sew, (XLenVT timm:$policy))>; class VPatTernaryMask; +class VPatTernaryMaskPolicy : + Pat<(result_type (!cast(intrinsic#"_mask") + (result_type result_reg_class:$rs3), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), + VLOpFrag, (XLenVT timm:$policy))), + (!cast(inst#"_"#kind#"_"#vlmul.MX # "_MASK") + result_reg_class:$rs3, + (op1_type op1_reg_class:$rs1), + op2_kind:$rs2, + (mask_type V0), + GPR:$vl, sew, (XLenVT timm:$policy))>; + multiclass VPatUnaryS_M { @@ -3037,8 +3409,10 @@ multiclass VPatUnaryV_M foreach vti = AllIntegerVectors in { def : VPatUnaryNoMask; - def : VPatUnaryMask; + def : VPatUnaryNoMaskTU; + def : VPatUnaryMaskTA; } } @@ -3052,6 +3426,9 @@ multiclass VPatUnaryV_VF; + def : VPatUnaryNoMaskTU; def : VPatUnaryMaskTA; @@ -3064,6 +3441,9 @@ multiclass VPatUnaryV_V; + def : VPatUnaryNoMaskTU; def : VPatUnaryMaskTA; @@ -3074,27 +3454,33 @@ multiclass VPatNullaryV { foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (!cast(intrinsic) + (vti.Vector undef), VLOpFrag)), (!cast(instruction#"_V_" # vti.LMul.MX) GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (!cast(intrinsic) + (vti.Vector vti.RegClass:$merge), + VLOpFrag)), + (!cast(instruction#"_V_" # vti.LMul.MX # "_TU") + vti.RegClass:$merge, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (!cast(intrinsic # "_mask") (vti.Vector vti.RegClass:$merge), - (vti.Mask V0), VLOpFrag)), + (vti.Mask V0), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction#"_V_" # vti.LMul.MX # "_MASK") vti.RegClass:$merge, (vti.Mask V0), - GPR:$vl, vti.Log2SEW)>; + GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } multiclass VPatNullaryM { foreach mti = AllMasks in def : Pat<(mti.Mask (!cast(intrinsic) - (XLenVT (VLOp (XLenVT (XLenVT GPR:$vl)))))), + VLOpFrag)), (!cast(inst#"_M_"#mti.BX) GPR:$vl, mti.Log2SEW)>; } 
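// Illustrative sketch, not part of the vendored patch: with the extra
// passthru operand now carried by the unmasked intrinsics, the same intrinsic
// call selects either the plain (tail-agnostic) pseudo or its "_TU" variant
// depending on whether the passthru is undef. The intrinsic mangling below is
// an assumption for illustration:
//
//   %x = call <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32(
//          <vscale x 2 x i32> undef, ...)        ; -> PseudoVADD_VV_M1
//   %y = call <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32(
//          <vscale x 2 x i32> %passthru, ...)    ; -> PseudoVADD_VV_M1_TU
//
// This is what the paired VPatBinaryNoMaskTA/VPatBinaryNoMaskTU classes above
// implement: the TA pattern matches (result_type (undef)) while the TU
// pattern matches a live $merge value.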
-multiclass VPatBinary { - def : VPatBinaryNoMask; + def : VPatBinaryM; def : VPatBinaryMask; @@ -3123,8 +3509,10 @@ multiclass VPatBinaryTA { - def : VPatBinaryNoMask; + def : VPatBinaryNoMaskTA; + def : VPatBinaryNoMaskTU; def : VPatBinaryMaskTA; @@ -3148,6 +3536,42 @@ multiclass VPatBinarySwapped; } +multiclass VPatBinaryCarryInTAIL +{ + def : Pat<(result_type (!cast(intrinsic) + (result_type undef), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), + VLOpFrag)), + (!cast(inst#"_"#kind#"_"#vlmul.MX) + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), GPR:$vl, sew)>; + def : Pat<(result_type (!cast(intrinsic) + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), + VLOpFrag)), + (!cast(inst#"_"#kind#"_"#vlmul.MX#"_TU") + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), GPR:$vl, sew)>; +} + multiclass VPatBinaryCarryIn; } -multiclass VPatConversion -{ - def : VPatUnaryNoMask; - def : VPatUnaryMask; -} - multiclass VPatConversionTA; + def : VPatUnaryNoMaskTU; def : VPatUnaryMaskTA; } @@ -3296,9 +3705,9 @@ multiclass VPatBinaryV_VI { foreach mti = AllMasks in - def : VPatBinaryNoMask; + def : VPatBinaryM; } multiclass VPatBinaryW_VV; - let AddedComplexity = 1 in + def : VPatBinaryNoMaskTU; + let AddedComplexity = 1 in { + def : VPatTiedBinaryNoMaskTU; def : VPatTiedBinaryMask; + } def : VPatBinaryMaskTA; } +multiclass VPatBinaryV_VM_TAIL vtilist = AllIntegerVectors> { + foreach vti = vtilist in + defm : VPatBinaryCarryInTAIL; +} + +multiclass VPatBinaryV_XM_TAIL vtilist = AllIntegerVectors> { + foreach vti = vtilist in + defm : VPatBinaryCarryInTAIL; +} + +multiclass VPatBinaryV_IM_TAIL { + foreach vti = AllIntegerVectors in + defm : VPatBinaryCarryInTAIL; +} + multiclass VPatBinaryV_V { foreach vti = AllIntegerVectors in defm : VPatBinaryMaskOut { multiclass VPatBinaryM_VV vtilist> { foreach vti = vtilist in - defm : VPatBinary; + defm : VPatBinaryM; } multiclass VPatBinarySwappedM_VV vtilist> { foreach vti = vtilist in { defvar kind = "V"#vti.ScalarSuffix; - defm : VPatBinary; + defm : VPatBinaryM; } } multiclass VPatBinaryM_VI vtilist> { foreach vti = vtilist in - defm : VPatBinary; + defm : VPatBinaryM; } multiclass VPatBinaryV_VV_VX_VI; multiclass VPatBinaryV_VM_XM_IM - : VPatBinaryV_VM, - VPatBinaryV_XM, - VPatBinaryV_IM; + : VPatBinaryV_VM_TAIL, + VPatBinaryV_XM_TAIL, + VPatBinaryV_IM_TAIL; multiclass VPatBinaryM_VM_XM_IM : VPatBinaryV_VM, @@ -3538,8 +3987,8 @@ multiclass VPatBinaryM_V_X_I VPatBinaryV_I; multiclass VPatBinaryV_VM_XM - : VPatBinaryV_VM, - VPatBinaryV_XM; + : VPatBinaryV_VM_TAIL, + VPatBinaryV_XM_TAIL; multiclass VPatBinaryM_VM_XM : VPatBinaryV_VM, @@ -3569,6 +4018,26 @@ multiclass VPatTernary; } +multiclass VPatTernaryNoMaskNoPolicy { + def : VPatTernaryNoMask; + def : VPatTernaryMaskPolicy; +} + multiclass VPatTernaryWithPolicy; - def : VPatTernaryMask; + def : VPatTernaryMaskPolicy; } multiclass VPatTernaryV_VV_AAXA vtilist> { foreach vti = vtilist in - defm : VPatTernary; + defm : VPatTernaryWithPolicy; } multiclass VPatTernaryV_VX_AAXA vtilist, Operand Imm_type> { foreach vti = vtilist in - defm : VPatTernary; + defm : VPatTernaryWithPolicy; } multiclass VPatTernaryW_VV, VPatTernaryV_VI; + multiclass VPatBinaryM_VV_VX_VI vtilist> : VPatBinaryM_VV, @@ -3724,19 +4194,6 @@ multiclass VPatReductionW_VS -{ - foreach fvti = AllFloatVectors in - { - defvar ivti = GetIntVTypeInfo.Vti; 
- - defm : VPatConversion; - } -} - multiclass VPatConversionVI_VF { @@ -3973,7 +4430,7 @@ defm PseudoVL : VPseudoFFLoad; //===----------------------------------------------------------------------===// // 7.8. Vector Load/Store Segment Instructions //===----------------------------------------------------------------------===// -defm PseudoVLSEG : VPseudoUSSegLoad; +defm PseudoVLSEG : VPseudoUSSegLoad; defm PseudoVLSSEG : VPseudoSSegLoad; defm PseudoVLOXSEG : VPseudoISegLoad; defm PseudoVLUXSEG : VPseudoISegLoad; @@ -3983,8 +4440,9 @@ defm PseudoVSOXSEG : VPseudoISegStore; defm PseudoVSUXSEG : VPseudoISegStore; // vlsegeff.v may update VL register -let hasSideEffects = 1, Defs = [VL] in -defm PseudoVLSEG : VPseudoUSSegLoad; +let hasSideEffects = 1, Defs = [VL] in { +defm PseudoVLSEG : VPseudoUSSegLoadFF; +} //===----------------------------------------------------------------------===// // 12. Vector Integer Arithmetic Instructions @@ -4002,13 +4460,24 @@ foreach vti = AllIntegerVectors in { // Occurs when legalizing vrsub.vx intrinsics for i64 on RV32 since we need // to use a more complex splat sequence. Add the pattern for all VTs for // consistency. - def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$rs2), + def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector (undef)), + (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), VLOpFrag)), (!cast("PseudoVSUB_VV_"#vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$merge), + (vti.Vector vti.RegClass:$rs2), + (vti.Vector vti.RegClass:$rs1), + VLOpFrag)), + (!cast("PseudoVSUB_VV_"#vti.LMul.MX#"_TU") + vti.RegClass:$merge, + vti.RegClass:$rs1, + vti.RegClass:$rs2, + GPR:$vl, + vti.Log2SEW)>; def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$merge), (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), @@ -4025,7 +4494,8 @@ foreach vti = AllIntegerVectors in { (XLenVT timm:$policy))>; // Match VSUB with a small immediate to vadd.vi by negating the immediate. - def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector (undef)), + (vti.Vector vti.RegClass:$rs1), (vti.Scalar simm5_plus1:$rs2), VLOpFrag)), (!cast("PseudoVADD_VI_"#vti.LMul.MX) vti.RegClass:$rs1, @@ -4219,33 +4689,42 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFADD : VPseudoVALU_VV_VF; defm PseudoVFSUB : VPseudoVALU_VV_VF; defm PseudoVFRSUB : VPseudoVALU_VF; +} //===----------------------------------------------------------------------===// // 14.3. Vector Widening Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFWADD : VPseudoVFWALU_VV_VF; defm PseudoVFWSUB : VPseudoVFWALU_VV_VF; defm PseudoVFWADD : VPseudoVFWALU_WV_WF; defm PseudoVFWSUB : VPseudoVFWALU_WV_WF; +} //===----------------------------------------------------------------------===// // 14.4. 
Vector Single-Width Floating-Point Multiply/Divide Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFMUL : VPseudoVFMUL_VV_VF; defm PseudoVFDIV : VPseudoVFDIV_VV_VF; defm PseudoVFRDIV : VPseudoVFRDIV_VF; +} //===----------------------------------------------------------------------===// // 14.5. Vector Widening Floating-Point Multiply //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFWMUL : VPseudoVWMUL_VV_VF; +} //===----------------------------------------------------------------------===// // 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFMACC : VPseudoVMAC_VV_VF_AAXA; defm PseudoVFNMACC : VPseudoVMAC_VV_VF_AAXA; defm PseudoVFMSAC : VPseudoVMAC_VV_VF_AAXA; @@ -4254,35 +4733,43 @@ defm PseudoVFMADD : VPseudoVMAC_VV_VF_AAXA; defm PseudoVFNMADD : VPseudoVMAC_VV_VF_AAXA; defm PseudoVFMSUB : VPseudoVMAC_VV_VF_AAXA; defm PseudoVFNMSUB : VPseudoVMAC_VV_VF_AAXA; +} //===----------------------------------------------------------------------===// // 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFWMACC : VPseudoVWMAC_VV_VF; defm PseudoVFWNMACC : VPseudoVWMAC_VV_VF; defm PseudoVFWMSAC : VPseudoVWMAC_VV_VF; defm PseudoVFWNMSAC : VPseudoVWMAC_VV_VF; +} //===----------------------------------------------------------------------===// // 14.8. Vector Floating-Point Square-Root Instruction //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in defm PseudoVFSQRT : VPseudoVSQR_V; //===----------------------------------------------------------------------===// // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction //===----------------------------------------------------------------------===// +let mayRaiseFPException = true in defm PseudoVFRSQRT7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.10. Vector Floating-Point Reciprocal Estimate Instruction //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in defm PseudoVFREC7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.11. Vector Floating-Point Min/Max Instructions //===----------------------------------------------------------------------===// +let mayRaiseFPException = true in { defm PseudoVFMIN : VPseudoVMAX_VV_VF; defm PseudoVFMAX : VPseudoVMAX_VV_VF; +} //===----------------------------------------------------------------------===// // 14.12. Vector Floating-Point Sign-Injection Instructions @@ -4294,12 +4781,14 @@ defm PseudoVFSGNJX : VPseudoVSGNJ_VV_VF; //===----------------------------------------------------------------------===// // 14.13. 
Vector Floating-Point Compare Instructions //===----------------------------------------------------------------------===// +let mayRaiseFPException = true in { defm PseudoVMFEQ : VPseudoVCMPM_VV_VF; defm PseudoVMFNE : VPseudoVCMPM_VV_VF; defm PseudoVMFLT : VPseudoVCMPM_VV_VF; defm PseudoVMFLE : VPseudoVCMPM_VV_VF; defm PseudoVMFGT : VPseudoVCMPM_VF; defm PseudoVMFGE : VPseudoVCMPM_VF; +} //===----------------------------------------------------------------------===// // 14.14. Vector Floating-Point Classify Instruction @@ -4376,15 +4865,21 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 15.3. Vector Single-Width Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// +let Uses = [FRM], mayRaiseFPException = true in { defm PseudoVFREDOSUM : VPseudoVFREDO_VS; defm PseudoVFREDUSUM : VPseudoVFRED_VS; +} +let mayRaiseFPException = true in { defm PseudoVFREDMIN : VPseudoVFRED_VS; defm PseudoVFREDMAX : VPseudoVFRED_VS; +} //===----------------------------------------------------------------------===// // 15.4. Vector Widening Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// -let IsRVVWideningReduction = 1 in { +let IsRVVWideningReduction = 1, + Uses = [FRM], + mayRaiseFPException = true in { defm PseudoVFWREDUSUM : VPseudoVFWRED_VS; defm PseudoVFWREDOSUM : VPseudoVFWRED_VS; } @@ -4611,7 +5106,8 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors, foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. - def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), (XLenVT 1), VLOpFrag)), (!cast("PseudoVADD_VV_"#vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs1, @@ -4726,10 +5222,16 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">; // 12.16. Vector Integer Move Instructions //===----------------------------------------------------------------------===// foreach vti = AllVectors in { - def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), VLOpFrag)), (!cast("PseudoVMV_V_V_"#vti.LMul.MX) $rs1, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru), + (vti.Vector vti.RegClass:$rs1), + VLOpFrag)), + (!cast("PseudoVMV_V_V_"#vti.LMul.MX#"_TU") + $passthru, $rs1, GPR:$vl, vti.Log2SEW)>; // vmv.v.x/vmv.v.i are handled in RISCInstrVInstrInfoVVLPatterns.td } @@ -4862,7 +5364,7 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE", AllFloatVectors>; //===----------------------------------------------------------------------===// // 14.14. Vector Floating-Point Classify Instruction //===----------------------------------------------------------------------===// -defm : VPatClassifyVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; +defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; //===----------------------------------------------------------------------===// // 14.15. Vector Floating-Point Merge Instruction @@ -4870,19 +5372,27 @@ defm : VPatClassifyVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. 
 // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses
 // int_riscv_vmerge. Support both for compatibility.
-defm : VPatBinaryV_VM<"int_riscv_vmerge", "PseudoVMERGE",
-                      /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
-defm : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE",
-                      /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
-defm : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE",
-                      /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm : VPatBinaryV_VM_TAIL<"int_riscv_vmerge", "PseudoVMERGE",
+                           /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm : VPatBinaryV_VM_TAIL<"int_riscv_vfmerge", "PseudoVMERGE",
+                           /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm : VPatBinaryV_XM_TAIL<"int_riscv_vfmerge", "PseudoVFMERGE",
+                           /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
 
 foreach fvti = AllFloatVectors in {
   defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
-  def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$rs2),
+  def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector undef),
+                                             (fvti.Vector fvti.RegClass:$rs2),
                                              (fvti.Scalar (fpimm0)),
                                              (fvti.Mask V0), VLOpFrag)),
             (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
+  defvar instr_tu = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX#"_TU");
+  def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$merge),
+                                             (fvti.Vector fvti.RegClass:$rs2),
+                                             (fvti.Scalar (fpimm0)),
+                                             (fvti.Mask V0), VLOpFrag)),
+            (instr_tu fvti.RegClass:$merge, fvti.RegClass:$rs2, 0,
+                      (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -5048,6 +5558,11 @@ foreach fvti = AllFloatVectors in {
                (fvti.Vector $rs1),
                (fvti.Scalar fvti.ScalarRegClass:$rs2),
                GPR:$vl, fvti.Log2SEW)>;
+
+  def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
+                         (fvti.Scalar (fpimm0)), VLOpFrag)),
+            (!cast<Instruction>("PseudoVMV_S_X_" # fvti.LMul.MX)
+             (fvti.Vector $rs1), X0, GPR:$vl, fvti.Log2SEW)>;
 }
 } // Predicates = [HasVInstructionsAnyF]
 
@@ -5097,5 +5612,5 @@ let Predicates = [HasVInstructionsAnyF] in {
 } // Predicates = [HasVInstructionsAnyF]
 
 // Include the non-intrinsic ISel patterns
-include "RISCVInstrInfoVSDPatterns.td"
 include "RISCVInstrInfoVVLPatterns.td"
+include "RISCVInstrInfoVSDPatterns.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 2b920d29ab81..06d4c4d0a9e6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -8,8 +8,7 @@
 ///
 /// This file contains the required infrastructure and SDNode patterns to
 /// support code generation for the standard 'V' (Vector) extension, version
-/// 0.10. This version is still experimental as the 'V' extension hasn't been
-/// ratified yet.
+/// 1.0.
 ///
 /// This file is included from and depends upon RISCVInstrInfoVPseudos.td
 ///
@@ -22,35 +21,9 @@
 //===----------------------------------------------------------------------===//
 // Helpers to define the SDNode patterns.
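// Illustrative note, not part of the vendored patch: the int_riscv_vfmv_s_f
// pattern added above (in RISCVInstrInfoVPseudos.td) relies on +0.0 having an
// all-zeros IEEE-754 bit pattern, so
//
//   vfmv.s.f vd, fa0        // when fa0 is known to be +0.0
//
// can instead be selected as the integer move that reads the zero register:
//
//   vmv.s.x vd, x0
//
// avoiding a floating-point scalar operand altogether. -0.0 would not
// qualify, since its sign bit is set (fpimm0 matches positive zero only).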
//===----------------------------------------------------------------------===// -def SDTSplatI64 : SDTypeProfile<1, 1, [ - SDTCVecEltisVT<0, i64>, SDTCisVT<1, i32> -]>; - -def rv32_splat_i64 : SDNode<"RISCVISD::SPLAT_VECTOR_I64", SDTSplatI64>; - -def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i1>, - SDTCisVT<1, XLenVT>]>; -def riscv_vmclr_vl : SDNode<"RISCVISD::VMCLR_VL", SDT_RISCVVMSETCLR_VL>; -def riscv_vmset_vl : SDNode<"RISCVISD::VMSET_VL", SDT_RISCVVMSETCLR_VL>; - def rvv_vnot : PatFrag<(ops node:$in), (xor node:$in, (riscv_vmset_vl (XLenVT srcvalue)))>; -// Give explicit Complexity to prefer simm5/uimm5. -def SplatPat : ComplexPattern; -def SplatPat_simm5 : ComplexPattern; -def SplatPat_uimm5 : ComplexPattern; -def SplatPat_simm5_plus1 - : ComplexPattern; -def SplatPat_simm5_plus1_nonzero - : ComplexPattern; - -class SwapHelper { - dag Value = !con(Prefix, !if(swap, B, A), !if(swap, A, B), Suffix); -} - multiclass VPatUSLoadStoreSDNode : Pat<(result_type (vop (vop_type vop_reg_class:$rs1), - (vop_type (splat_vector xop_kind:$rs2)))), + (vop_type (SplatFPOp xop_kind:$rs2)))), (!cast(instruction_name#"_"#vlmul.MX) vop_reg_class:$rs1, (xop_type xop_kind:$rs2), @@ -189,7 +162,7 @@ multiclass VPatBinaryFPSDNode_VV_VF { multiclass VPatBinaryFPSDNode_R_VF { foreach fvti = AllFloatVectors in - def : Pat<(fvti.Vector (vop (fvti.Vector (splat_vector fvti.Scalar:$rs2)), + def : Pat<(fvti.Vector (vop (fvti.Vector (SplatFPOp fvti.Scalar:$rs2)), (fvti.Vector fvti.RegClass:$rs1))), (!cast(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) fvti.RegClass:$rs1, @@ -197,67 +170,70 @@ multiclass VPatBinaryFPSDNode_R_VF { fvti.AVL, fvti.Log2SEW)>; } -multiclass VPatIntegerSetCCSDNode_VV { +multiclass VPatIntegerSetCCSDNode_VV { foreach vti = AllIntegerVectors in { defvar instruction = !cast(instruction_name#"_VV_"#vti.LMul.MX); def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), cc)), - SwapHelper<(instruction), - (instruction vti.RegClass:$rs1), - (instruction vti.RegClass:$rs2), - (instruction vti.AVL, vti.Log2SEW), - swap>.Value>; + (instruction vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, + vti.Log2SEW)>; } } -multiclass VPatIntegerSetCCSDNode_XI + : VPatIntegerSetCCSDNode_VV { + foreach vti = AllIntegerVectors in { + defvar instruction = !cast(instruction_name#"_VV_"#vti.LMul.MX); + def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs2), + (vti.Vector vti.RegClass:$rs1), invcc)), + (instruction vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, + vti.Log2SEW)>; + } +} + +multiclass VPatIntegerSetCCSDNode_XI< string instruction_name, + CondCode cc, string kind, ComplexPattern SplatPatKind, - DAGOperand xop_kind, - bit swap = 0> { + DAGOperand xop_kind> { foreach vti = AllIntegerVectors in { defvar instruction = !cast(instruction_name#_#kind#_#vti.LMul.MX); def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), (vti.Vector (SplatPatKind xop_kind:$rs2)), cc)), - SwapHelper<(instruction), - (instruction vti.RegClass:$rs1), - (instruction xop_kind:$rs2), - (instruction vti.AVL, vti.Log2SEW), - swap>.Value>; + (instruction vti.RegClass:$rs1, xop_kind:$rs2, vti.AVL, vti.Log2SEW)>; } } -multiclass VPatIntegerSetCCSDNode_VV_VX_VI { - defm : VPatIntegerSetCCSDNode_VV; - defm : VPatIntegerSetCCSDNode_XI; - defm : VPatIntegerSetCCSDNode_XI; +multiclass VPatIntegerSetCCSDNode_XI_Swappable + : VPatIntegerSetCCSDNode_XI { + foreach vti = AllIntegerVectors in { + defvar instruction = !cast(instruction_name#_#kind#_#vti.LMul.MX); + def : 
Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), + (vti.Vector (SplatPatKind xop_kind:$rs2)), cc)), + (instruction vti.RegClass:$rs1, xop_kind:$rs2, vti.AVL, vti.Log2SEW)>; + def : Pat<(vti.Mask (setcc (vti.Vector (SplatPatKind xop_kind:$rs2)), + (vti.Vector vti.RegClass:$rs1), invcc)), + (instruction vti.RegClass:$rs1, xop_kind:$rs2, vti.AVL, vti.Log2SEW)>; + } } -multiclass VPatIntegerSetCCSDNode_VV_VX { - defm : VPatIntegerSetCCSDNode_VV; - defm : VPatIntegerSetCCSDNode_XI; -} +multiclass VPatIntegerSetCCSDNode_VX_Swappable + : VPatIntegerSetCCSDNode_XI_Swappable; -multiclass VPatIntegerSetCCSDNode_VX_VI { - defm : VPatIntegerSetCCSDNode_XI; - defm : VPatIntegerSetCCSDNode_XI; -} +multiclass VPatIntegerSetCCSDNode_VI + : VPatIntegerSetCCSDNode_XI; -multiclass VPatIntegerSetCCSDNode_VIPlus1 { foreach vti = AllIntegerVectors in { defvar instruction = !cast(instruction_name#"_VI_"#vti.LMul.MX); @@ -279,12 +255,12 @@ multiclass VPatFPSetCCSDNode_VV_VF_FV(inst_name#"_VV_"#fvti.LMul.MX) fvti.RegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW)>; def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1), - (splat_vector fvti.ScalarRegClass:$rs2), + (SplatFPOp fvti.ScalarRegClass:$rs2), cc)), (!cast(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, fvti.AVL, fvti.Log2SEW)>; - def : Pat<(fvti.Mask (setcc (splat_vector fvti.ScalarRegClass:$rs2), + def : Pat<(fvti.Mask (setcc (SplatFPOp fvti.ScalarRegClass:$rs2), (fvti.Vector fvti.RegClass:$rs1), cc)), (!cast(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) @@ -363,83 +339,122 @@ multiclass VPatNConvertFP2ISDNode_V { } } -multiclass VPatWidenBinarySDNode_VV_VX_WV_WX { - foreach vti = AllWidenableIntVectors in { - def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))), - (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), - (!cast(instruction_name#"_VV_"#vti.Vti.LMul.MX) - vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; - def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))), - (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), - (!cast(instruction_name#"_VX_"#vti.Vti.LMul.MX) - vti.Vti.RegClass:$rs2, GPR:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; - def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), - (!cast(instruction_name#"_WV_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; - def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), - (!cast(instruction_name#"_WX_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, GPR:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; +multiclass VPatWidenBinarySDNode_VV_VX { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(op (wti.Vector (extop1 (vti.Vector vti.RegClass:$rs2))), + (wti.Vector (extop2 (vti.Vector vti.RegClass:$rs1)))), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + vti.RegClass:$rs2, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector (extop1 (vti.Vector vti.RegClass:$rs2))), + (wti.Vector (extop2 (vti.Vector (SplatPat GPR:$rs1))))), + (!cast(instruction_name#"_VX_"#vti.LMul.MX) + vti.RegClass:$rs2, GPR:$rs1, vti.AVL, vti.Log2SEW)>; + } +} + +multiclass VPatWidenBinarySDNode_WV_WX { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar 
wti = vtiToWti.Wti; + def : Pat<(op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1)))), + (!cast(instruction_name#"_WV_"#vti.LMul.MX) + wti.RegClass:$rs2, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (extop (vti.Vector (SplatPat GPR:$rs1))))), + (!cast(instruction_name#"_WX_"#vti.LMul.MX) + wti.RegClass:$rs2, GPR:$rs1, vti.AVL, vti.Log2SEW)>; } } +multiclass VPatWidenBinarySDNode_VV_VX_WV_WX { + defm : VPatWidenBinarySDNode_VV_VX; + defm : VPatWidenBinarySDNode_WV_WX; +} + multiclass VPatWidenMulAddSDNode_VV { - foreach vti = AllWidenableIntVectors in { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; def : Pat< - (add (vti.Wti.Vector vti.Wti.RegClass:$rd), - (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector vti.Vti.RegClass:$rs1))), - (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))), - (!cast(instruction_name#"_VV_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rd, vti.Vti.RegClass:$rs1, vti.Vti.RegClass:$rs2, - vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC + (add (wti.Vector wti.RegClass:$rd), + (mul_oneuse (wti.Vector (extop1 (vti.Vector vti.RegClass:$rs1))), + (wti.Vector (extop2 (vti.Vector vti.RegClass:$rs2))))), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC )>; } } multiclass VPatWidenMulAddSDNode_VX { - foreach vti = AllWidenableIntVectors in { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; def : Pat< - (add (vti.Wti.Vector vti.Wti.RegClass:$rd), - (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector (SplatPat GPR:$rs1)))), - (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))), - (!cast(instruction_name#"_VX_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rd, GPR:$rs1, vti.Vti.RegClass:$rs2, - vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC + (add (wti.Vector wti.RegClass:$rd), + (mul_oneuse (wti.Vector (extop1 (vti.Vector (SplatPat GPR:$rs1)))), + (wti.Vector (extop2 (vti.Vector vti.RegClass:$rs2))))), + (!cast(instruction_name#"_VX_"#vti.LMul.MX) + wti.RegClass:$rd, GPR:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC )>; } } multiclass VPatWidenBinaryFPSDNode_VV_VF { - foreach vti = AllWidenableFloatVectors in { - def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))), - (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), - (!cast(instruction_name#"_VV_"#vti.Vti.LMul.MX) - vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; - def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))), - (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))), - (!cast(instruction_name#"_V"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX) - vti.Vti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + vti.RegClass:$rs2, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse 
+ (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + vti.RegClass:$rs2, vti.ScalarRegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector (SplatFPOp (fpext_oneuse vti.ScalarRegClass:$rs1)))), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + vti.RegClass:$rs2, vti.ScalarRegClass:$rs1, vti.AVL, vti.Log2SEW)>; } } multiclass VPatWidenBinaryFPSDNode_WV_WF { - foreach vti = AllWidenableFloatVectors in { - def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), - (!cast(instruction_name#"_WV_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; - def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))), - (!cast(instruction_name#"_W"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (!cast(instruction_name#"_WV_"#vti.LMul.MX) + wti.RegClass:$rs2, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (!cast(instruction_name#"_W"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rs2, vti.ScalarRegClass:$rs1, vti.AVL, vti.Log2SEW)>; + def : Pat<(op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (SplatFPOp (fpext_oneuse vti.ScalarRegClass:$rs1)))), + (!cast(instruction_name#"_W"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rs2, vti.ScalarRegClass:$rs1, vti.AVL, vti.Log2SEW)>; } } @@ -448,6 +463,148 @@ multiclass VPatWidenBinaryFPSDNode_VV_VF_WV_WF; } +multiclass VPatWidenFPMulAccSDNode_VV_VF { + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector wti.RegClass:$rd)), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue))), + (wti.Vector wti.RegClass:$rd)), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + +multiclass VPatWidenFPNegMulAccSDNode_VV_VF { + foreach vtiToWti = AllWidenableFloatVectors in { 
+ defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(fma (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + (fneg wti.RegClass:$rd)), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue)), + (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (fneg wti.RegClass:$rd)), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + (fneg wti.RegClass:$rd)), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + +multiclass VPatWidenFPMulSacSDNode_VV_VF { + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + (fneg wti.RegClass:$rd)), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + (fneg wti.RegClass:$rd)), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + +multiclass VPatWidenFPNegMulSacSDNode_VV_VF { + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(fma (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + wti.RegClass:$rd), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue))), + (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)))), + wti.RegClass:$rd), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fma (fneg (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector 
(SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), (XLenVT srcvalue)))), + (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), + wti.RegClass:$rd), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + +multiclass VPatMultiplyAddSDNode_VV_VX { + foreach vti = AllIntegerVectors in { + defvar suffix = vti.LMul.MX; + // NOTE: We choose VMADD because it has the most commuting freedom. So it + // works best with how TwoAddressInstructionPass tries commuting. + def : Pat<(vti.Vector (op vti.RegClass:$rs2, + (mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))), + (!cast(instruction_name#"_VV_"# suffix) + vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + // The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally + // commutable. + def : Pat<(vti.Vector (op vti.RegClass:$rs2, + (mul_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rd))), + (!cast(instruction_name#"_VX_" # suffix) + vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + //===----------------------------------------------------------------------===// // Patterns. //===----------------------------------------------------------------------===// @@ -520,42 +677,45 @@ defm : VPatBinarySDNode_VV_VX_VI; foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. def : Pat<(shl (vti.Vector vti.RegClass:$rs1), - (vti.Vector (splat_vector (XLenVT 1)))), - (!cast("PseudoVADD_VV_"# vti.LMul.MX) - vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; -} -foreach vti = [VI64M1, VI64M2, VI64M4, VI64M8] in { - def : Pat<(shl (vti.Vector vti.RegClass:$rs1), - (vti.Vector (rv32_splat_i64 (XLenVT 1)))), + (vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), 1, (XLenVT srcvalue)))), (!cast("PseudoVADD_VV_"# vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; } // 12.8. 
Vector Integer Comparison Instructions -defm : VPatIntegerSetCCSDNode_VV_VX_VI; -defm : VPatIntegerSetCCSDNode_VV_VX_VI; - -defm : VPatIntegerSetCCSDNode_VV_VX; -defm : VPatIntegerSetCCSDNode_VV_VX; -defm : VPatIntegerSetCCSDNode_VIPlus1; +defm : VPatIntegerSetCCSDNode_VV<"PseudoVMSNE", SETNE>; + +defm : VPatIntegerSetCCSDNode_VV_Swappable<"PseudoVMSLT", SETLT, SETGT>; +defm : VPatIntegerSetCCSDNode_VV_Swappable<"PseudoVMSLTU", SETULT, SETUGT>; +defm : VPatIntegerSetCCSDNode_VV_Swappable<"PseudoVMSLE", SETLE, SETGE>; +defm : VPatIntegerSetCCSDNode_VV_Swappable<"PseudoVMSLEU", SETULE, SETUGE>; + +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSEQ", SETEQ, SETEQ>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSNE", SETNE, SETNE>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSLT", SETLT, SETGT>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSLTU", SETULT, SETUGT>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSLE", SETLE, SETGE>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSLEU", SETULE, SETUGE>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSGT", SETGT, SETLT>; +defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSGTU", SETUGT, SETULT>; +// There is no VMSGE(U)_VX instruction + +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSEQ", SETEQ>; +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSNE", SETNE>; +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSLE", SETLE>; +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSLEU", SETULE>; +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSGT", SETGT>; +defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSGTU", SETUGT>; + +defm : VPatIntegerSetCCSDNode_VIPlus1<"PseudoVMSLE", SETLT, SplatPat_simm5_plus1_nonzero>; -defm : VPatIntegerSetCCSDNode_VIPlus1; - -defm : VPatIntegerSetCCSDNode_VV; -defm : VPatIntegerSetCCSDNode_VV; -defm : VPatIntegerSetCCSDNode_VX_VI; -defm : VPatIntegerSetCCSDNode_VX_VI; - -defm : VPatIntegerSetCCSDNode_VV_VX_VI; -defm : VPatIntegerSetCCSDNode_VV_VX_VI; - -defm : VPatIntegerSetCCSDNode_VV; -defm : VPatIntegerSetCCSDNode_VV; -defm : VPatIntegerSetCCSDNode_VIPlus1; -defm : VPatIntegerSetCCSDNode_VIPlus1; // 12.9. Vector Integer Min/Max Instructions @@ -575,37 +735,23 @@ defm : VPatBinarySDNode_VV_VX; defm : VPatBinarySDNode_VV_VX; defm : VPatBinarySDNode_VV_VX; -// 12.13 Vector Single-Width Integer Multiply-Add Instructions. -foreach vti = AllIntegerVectors in { - // NOTE: We choose VMADD because it has the most commuting freedom. So it - // works best with how TwoAddressInstructionPass tries commuting. - defvar suffix = vti.LMul.MX; - def : Pat<(vti.Vector (add vti.RegClass:$rs2, - (mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))), - (!cast("PseudoVMADD_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (sub vti.RegClass:$rs2, - (mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))), - (!cast("PseudoVNMSUB_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; +// 12.12. Vector Widening Integer Multiply Instructions +defm : VPatWidenBinarySDNode_VV_VX; +defm : VPatWidenBinarySDNode_VV_VX; +defm : VPatWidenBinarySDNode_VV_VX; +defm : VPatWidenBinarySDNode_VV_VX; +defm : VPatWidenBinarySDNode_VV_VX; +defm : VPatWidenBinarySDNode_VV_VX; - // The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally - // commutable. 
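// Illustrative sketch (editorial addition, not part of the upstream patch;
// the vector registers are hypothetical). Per the ratified V spec, vmadd is
// destructive in one multiplicand:
//   vmadd.vv vd, vs1, vs2   ; vd = (vd * vs1) + vs2
// so the DAG (add $rs2, (mul_oneuse $rs1, $rd)) can tie either multiply
// operand to the destination, e.g.
//   (add v10, (mul v9, v8))  ==>  vmadd.vv v8, v9, v10
// which is the commuting freedom the NOTEs above refer to.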
- def : Pat<(vti.Vector (add vti.RegClass:$rs2, - (mul_oneuse (SplatPat XLenVT:$rs1), - vti.RegClass:$rd))), - (!cast("PseudoVMADD_VX_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (sub vti.RegClass:$rs2, - (mul_oneuse (SplatPat XLenVT:$rs1), - vti.RegClass:$rd))), - (!cast("PseudoVNMSUB_VX_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; -} +// 12.13 Vector Single-Width Integer Multiply-Add Instructions. +defm : VPatMultiplyAddSDNode_VV_VX; +defm : VPatMultiplyAddSDNode_VV_VX; // 12.14 Vector Widening Integer Multiply-Add Instructions defm : VPatWidenMulAddSDNode_VV; @@ -725,41 +871,47 @@ foreach fvti = AllFloatVectors in { // The choice of VFMADD here is arbitrary, vfmadd.vf and vfmacc.vf are equally // commutable. - def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1), + def : Pat<(fvti.Vector (fma (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rd, fvti.RegClass:$rs2)), (!cast("PseudoVFMADD_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1), + def : Pat<(fvti.Vector (fma (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))), (!cast("PseudoVFMSUB_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1), + def : Pat<(fvti.Vector (fma (SplatFPOp fvti.ScalarRegClass:$rs1), (fneg fvti.RegClass:$rd), (fneg fvti.RegClass:$rs2))), (!cast("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1), + def : Pat<(fvti.Vector (fma (SplatFPOp fvti.ScalarRegClass:$rs1), (fneg fvti.RegClass:$rd), fvti.RegClass:$rs2)), (!cast("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; // The splat might be negated. - def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)), + def : Pat<(fvti.Vector (fma (fneg (SplatFPOp fvti.ScalarRegClass:$rs1)), fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))), (!cast("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)), + def : Pat<(fvti.Vector (fma (fneg (SplatFPOp fvti.ScalarRegClass:$rs1)), fvti.RegClass:$rd, fvti.RegClass:$rs2)), (!cast("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix) fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>; } +// 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions +defm : VPatWidenFPMulAccSDNode_VV_VF<"PseudoVFWMACC">; +defm : VPatWidenFPNegMulAccSDNode_VV_VF<"PseudoVFWNMACC">; +defm : VPatWidenFPMulSacSDNode_VV_VF<"PseudoVFWMSAC">; +defm : VPatWidenFPNegMulSacSDNode_VV_VF<"PseudoVFWNMSAC">; + foreach vti = AllFloatVectors in { // 14.8. 
Vector Floating-Point Square-Root Instruction def : Pat<(fsqrt (vti.Vector vti.RegClass:$rs2)), @@ -780,7 +932,7 @@ foreach vti = AllFloatVectors in { (!cast("PseudoVFSGNJ_VV_"# vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (splat_vector vti.ScalarRegClass:$rs2)))), + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), (!cast("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW)>; @@ -789,7 +941,7 @@ foreach vti = AllFloatVectors in { (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (fneg (splat_vector vti.ScalarRegClass:$rs2))))), + (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), (!cast("PseudoVFSGNJN_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW)>; } @@ -822,7 +974,7 @@ foreach fvti = AllFloatVectors in { fvti.AVL, fvti.Log2SEW)>; def : Pat<(fvti.Vector (vselect (fvti.Mask V0), - (splat_vector fvti.ScalarRegClass:$rs1), + (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2)), (!cast("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) fvti.RegClass:$rs2, @@ -830,7 +982,7 @@ foreach fvti = AllFloatVectors in { (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; def : Pat<(fvti.Vector (vselect (fvti.Mask V0), - (splat_vector (fvti.Scalar fpimm0)), + (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2)), (!cast("PseudoVMERGE_VIM_"#fvti.LMul.MX) fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; @@ -847,13 +999,6 @@ defm : VPatWConvertFP2ISDNode_V; defm : VPatWConvertFP2ISDNode_V; defm : VPatWConvertI2FPSDNode_V; defm : VPatWConvertI2FPSDNode_V; -foreach fvtiToFWti = AllWidenableFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - def : Pat<(fwti.Vector (fpextend (fvti.Vector fvti.RegClass:$rs1))), - (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX) - fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>; -} // 14.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions defm : VPatNConvertFP2ISDNode_V; @@ -873,25 +1018,14 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { // Vector Splats //===----------------------------------------------------------------------===// -let Predicates = [HasVInstructions] in { -foreach vti = AllIntegerVectors in { - def : Pat<(vti.Vector (SplatPat GPR:$rs1)), - (!cast("PseudoVMV_V_X_" # vti.LMul.MX) - GPR:$rs1, vti.AVL, vti.Log2SEW)>; - def : Pat<(vti.Vector (SplatPat_simm5 simm5:$rs1)), - (!cast("PseudoVMV_V_I_" # vti.LMul.MX) - simm5:$rs1, vti.AVL, vti.Log2SEW)>; -} -} // Predicates = [HasVInstructions] - let Predicates = [HasVInstructionsAnyF] in { foreach fvti = AllFloatVectors in { - def : Pat<(fvti.Vector (splat_vector fvti.ScalarRegClass:$rs1)), + def : Pat<(fvti.Vector (SplatFPOp fvti.ScalarRegClass:$rs1)), (!cast("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) (fvti.Scalar fvti.ScalarRegClass:$rs1), fvti.AVL, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (splat_vector (fvti.Scalar fpimm0))), + def : Pat<(fvti.Vector (SplatFPOp (fvti.Scalar fpimm0))), (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) 0, fvti.AVL, fvti.Log2SEW)>; } @@ -902,6 +1036,13 @@ foreach fvti = AllFloatVectors in { //===----------------------------------------------------------------------===// let Predicates = [HasVInstructionsAnyF] in foreach vti = AllFloatVectors in { + // Fold store of vmv.f.s to a vse with VL=1. + defvar store_instr = !cast("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX); + def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), BaseAddr:$rs1), + (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; + def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), BaseAddr:$rs1), + (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; + defvar vmv_f_s_inst = !cast(!strconcat("PseudoVFMV_", vti.ScalarSuffix, "_S_", vti.LMul.MX)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index e71c498fd5f4..081f61617d59 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -8,8 +8,7 @@ /// /// This file contains the required infrastructure and VL patterns to /// support code generation for the standard 'V' (Vector) extension, version -/// 0.10. This version is still experimental as the 'V' extension hasn't been -/// ratified yet. +/// 1.0. /// /// This file is included from and depends upon RISCVInstrInfoVPseudos.td /// @@ -22,11 +21,6 @@ // Helpers to define the VL patterns. 
//===----------------------------------------------------------------------===// -def SDT_RISCVVLE_VL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>, - SDTCisVT<2, XLenVT>]>; -def SDT_RISCVVSE_VL : SDTypeProfile<0, 3, [SDTCisVec<0>, SDTCisPtrTy<1>, - SDTCisVT<2, XLenVT>]>; - def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<0>, SDTCisInt<0>, @@ -47,13 +41,15 @@ def SDT_RISCVFPBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisVT<4, XLenVT>]>; def riscv_vmv_v_x_vl : SDNode<"RISCVISD::VMV_V_X_VL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>, - SDTCisVT<1, XLenVT>, - SDTCisVT<2, XLenVT>]>>; + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, XLenVT>, + SDTCisVT<3, XLenVT>]>>; def riscv_vfmv_v_f_vl : SDNode<"RISCVISD::VFMV_V_F_VL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisEltOfVec<1, 0>, - SDTCisVT<2, XLenVT>]>>; + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0, 1>, + SDTCisEltOfVec<2, 0>, + SDTCisVT<3, XLenVT>]>>; def riscv_vmv_s_x_vl : SDNode<"RISCVISD::VMV_S_X_VL", SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, @@ -65,11 +61,6 @@ def riscv_vfmv_s_f_vl : SDNode<"RISCVISD::VFMV_S_F_VL", SDTCisEltOfVec<2, 0>, SDTCisVT<3, XLenVT>]>>; -def riscv_vle_vl : SDNode<"RISCVISD::VLE_VL", SDT_RISCVVLE_VL, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def riscv_vse_vl : SDNode<"RISCVISD::VSE_VL", SDT_RISCVVSE_VL, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - def riscv_add_vl : SDNode<"RISCVISD::ADD_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_sub_vl : SDNode<"RISCVISD::SUB_VL", SDT_RISCVIntBinOp_VL>; def riscv_mul_vl : SDNode<"RISCVISD::MUL_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; @@ -113,7 +104,10 @@ def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>, SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT>]>; -def riscv_fma_vl : SDNode<"RISCVISD::FMA_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; +def riscv_vfmadd_vl : SDNode<"RISCVISD::VFMADD_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; +def riscv_vfnmadd_vl : SDNode<"RISCVISD::VFNMADD_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; +def riscv_vfmsub_vl : SDNode<"RISCVISD::VFMSUB_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; +def riscv_vfnmsub_vl : SDNode<"RISCVISD::VFNMSUB_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; def SDT_RISCVFPRoundOp_VL : SDTypeProfile<1, 3, [ SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>, @@ -152,30 +146,33 @@ def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL", SDTCisVT<5, XLenVT>]>>; def riscv_vrgather_vx_vl : SDNode<"RISCVISD::VRGATHER_VX_VL", - SDTypeProfile<1, 4, [SDTCisVec<0>, + SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, XLenVT>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, - SDTCisVT<4, XLenVT>]>>; + SDTCisSameAs<0, 4>, + SDTCisVT<5, XLenVT>]>>; def riscv_vrgather_vv_vl : SDNode<"RISCVISD::VRGATHER_VV_VL", - SDTypeProfile<1, 4, [SDTCisVec<0>, + SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameNumEltsAs<0, 2>, SDTCisSameSizeAs<0, 2>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, - SDTCisVT<4, XLenVT>]>>; + SDTCisSameAs<0, 4>, + SDTCisVT<5, XLenVT>]>>; def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL", - SDTypeProfile<1, 4, [SDTCisVec<0>, + SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCVecEltisVT<2, i16>, SDTCisSameNumEltsAs<0, 2>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, - SDTCisVT<4, 
XLenVT>]>>; + SDTCisSameAs<0, 4>, + SDTCisVT<5, XLenVT>]>>; def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, @@ -185,6 +182,11 @@ def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; def riscv_vp_merge_vl : SDNode<"RISCVISD::VP_MERGE_VL", SDT_RISCVSelect_VL>; +def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i1>, + SDTCisVT<1, XLenVT>]>; +def riscv_vmclr_vl : SDNode<"RISCVISD::VMCLR_VL", SDT_RISCVVMSETCLR_VL>; +def riscv_vmset_vl : SDNode<"RISCVISD::VMSET_VL", SDT_RISCVVMSETCLR_VL>; + def SDT_RISCVMaskBinOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<0, i1>, @@ -229,7 +231,22 @@ def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def riscv_vwmulsu_vl : SDNode<"RISCVISD::VWMULSU_VL", SDT_RISCVVWBinOp_VL>; +def riscv_vwadd_vl : SDNode<"RISCVISD::VWADD_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; + +def SDT_RISCVVWBinOpW_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameNumEltsAs<1, 2>, + SDTCisOpSmallerThanOp<2, 1>, + SDTCisSameNumEltsAs<1, 3>, + SDTCVecEltisVT<3, i1>, + SDTCisVT<4, XLenVT>]>; +def riscv_vwadd_w_vl : SDNode<"RISCVISD::VWADD_W_VL", SDT_RISCVVWBinOpW_VL>; +def riscv_vwaddu_w_vl : SDNode<"RISCVISD::VWADDU_W_VL", SDT_RISCVVWBinOpW_VL>; +def riscv_vwsub_w_vl : SDNode<"RISCVISD::VWSUB_W_VL", SDT_RISCVVWBinOpW_VL>; +def riscv_vwsubu_w_vl : SDNode<"RISCVISD::VWSUBU_W_VL", SDT_RISCVVWBinOpW_VL>; def SDTRVVVecReduce : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, @@ -254,45 +271,69 @@ def riscv_vwmulu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D), return N->hasOneUse(); }]>; +def riscv_vwmulsu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D), + (riscv_vwmulsu_vl node:$A, node:$B, node:$C, + node:$D), [{ + return N->hasOneUse(); +}]>; + +def riscv_sext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_sext_vl node:$A, node:$B, node:$C), [{ + return N->hasOneUse(); +}]>; + +def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_zext_vl node:$A, node:$B, node:$C), [{ + return N->hasOneUse(); +}]>; + +def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ + return N->hasOneUse(); +}]>; + foreach kind = ["ADD", "UMAX", "SMAX", "UMIN", "SMIN", "AND", "OR", "XOR", "FADD", "SEQ_FADD", "FMIN", "FMAX"] in def rvv_vecreduce_#kind#_vl : SDNode<"RISCVISD::VECREDUCE_"#kind#"_VL", SDTRVVVecReduce>; +// Give explicit Complexity to prefer simm5/uimm5. +def SplatPat : ComplexPattern; +def SplatPat_simm5 : ComplexPattern; +def SplatPat_uimm5 : ComplexPattern; +def SplatPat_simm5_plus1 + : ComplexPattern; +def SplatPat_simm5_plus1_nonzero + : ComplexPattern; + // Ignore the vl operand. 
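// Illustrative note (editorial addition, not part of the upstream patch):
// with the widened VFMV_V_F_VL profile above, a floating-point splat of a
// scalar f now appears in the DAG as
//   (riscv_vfmv_v_f_vl undef, f, vl)
// so the fragment below pins the new passthru operand to undef and, per the
// comment, wildcards the vl operand with srcvalue.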
def SplatFPOp : PatFrag<(ops node:$op), - (riscv_vfmv_v_f_vl node:$op, srcvalue)>; + (riscv_vfmv_v_f_vl undef, node:$op, srcvalue)>; def sew8simm5 : ComplexPattern", []>; def sew16simm5 : ComplexPattern", []>; def sew32simm5 : ComplexPattern", []>; def sew64simm5 : ComplexPattern", []>; -multiclass VPatBinaryVL_VV { - def : Pat<(result_type (vop - (op_type op_reg_class:$rs1), - (op_type op_reg_class:$rs2), - (mask_type true_mask), - VLOpFrag)), - (!cast(instruction_name#"_VV_"# vlmul.MX) - op_reg_class:$rs1, - op_reg_class:$rs2, - GPR:$vl, sew)>; +multiclass VPatBinaryVL_V { def : Pat<(result_type (vop - (op_type op_reg_class:$rs1), - (op_type op_reg_class:$rs2), + (op1_type op1_reg_class:$rs1), + (op2_type op2_reg_class:$rs2), (mask_type V0), VLOpFrag)), - (!cast(instruction_name#"_VV_"# vlmul.MX#"_MASK") + (!cast(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK") (result_type (IMPLICIT_DEF)), - op_reg_class:$rs1, - op_reg_class:$rs2, + op1_reg_class:$rs1, + op2_reg_class:$rs2, (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>; } @@ -300,7 +341,8 @@ multiclass VPatBinaryVL_XI { def : Pat<(result_type (vop - (vop_type vop_reg_class:$rs1), - (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))), - (mask_type true_mask), - VLOpFrag)), - (!cast(instruction_name#_#suffix#_# vlmul.MX) - vop_reg_class:$rs1, - xop_kind:$rs2, - GPR:$vl, sew)>; - def : Pat<(result_type (vop - (vop_type vop_reg_class:$rs1), - (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))), + (vop1_type vop_reg_class:$rs1), + (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))), (mask_type V0), VLOpFrag)), (!cast(instruction_name#_#suffix#_# vlmul.MX#"_MASK") @@ -330,12 +363,12 @@ multiclass VPatBinaryVL_XI { foreach vti = AllIntegerVectors in { - defm : VPatBinaryVL_VV; + defm : VPatBinaryVL_V; defm : VPatBinaryVL_XI; + vti.Vector, vti.Vector, vti.Vector, vti.Mask, + vti.Log2SEW, vti.LMul, vti.RegClass, SplatPat, GPR>; } } @@ -344,8 +377,8 @@ multiclass VPatBinaryVL_VV_VX_VI { foreach vti = AllIntegerVectors in { defm : VPatBinaryVL_XI(SplatPat#_#ImmType), ImmType>; } @@ -355,12 +388,26 @@ multiclass VPatBinaryWVL_VV_VX { foreach VtiToWti = AllWidenableIntVectors in { defvar vti = VtiToWti.Vti; defvar wti = VtiToWti.Wti; - defm : VPatBinaryVL_VV; + defm : VPatBinaryVL_V; defm : VPatBinaryVL_XI; + wti.Vector, vti.Vector, vti.Vector, vti.Mask, + vti.Log2SEW, vti.LMul, vti.RegClass, SplatPat, GPR>; + } +} +multiclass VPatBinaryWVL_VV_VX_WV_WX + : VPatBinaryWVL_VV_VX { + foreach VtiToWti = AllWidenableIntVectors in { + defvar vti = VtiToWti.Vti; + defvar wti = VtiToWti.Wti; + defm : VPatBinaryVL_V; + defm : VPatBinaryVL_XI; } } @@ -373,14 +420,6 @@ multiclass VPatBinaryVL_VF { - def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1), - (vop_type (SplatFPOp scalar_reg_class:$rs2)), - (mask_type true_mask), - VLOpFrag)), - (!cast(instruction_name#"_"#vlmul.MX) - vop_reg_class:$rs1, - scalar_reg_class:$rs2, - GPR:$vl, sew)>; def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1), (vop_type (SplatFPOp scalar_reg_class:$rs2)), (mask_type V0), @@ -394,9 +433,9 @@ multiclass VPatBinaryVL_VF { foreach vti = AllFloatVectors in { - defm : VPatBinaryVL_VV; + defm : VPatBinaryVL_V; defm : VPatBinaryVL_VF; @@ -405,13 +444,6 @@ multiclass VPatBinaryFPVL_VV_VF { multiclass VPatBinaryFPVL_R_VF { foreach fvti = AllFloatVectors in { - def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), - fvti.RegClass:$rs1, - (fvti.Mask true_mask), - VLOpFrag)), - (!cast(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) - fvti.RegClass:$rs1, 
fvti.ScalarRegClass:$rs2, - GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), fvti.RegClass:$rs1, (fvti.Mask V0), @@ -427,65 +459,87 @@ multiclass VPatIntegerSetCCVL_VV { def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), vti.RegClass:$rs2, cc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_VV_"#vti.LMul.MX) - vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, - vti.Log2SEW)>; + (!cast(instruction_name#"_VV_"#vti.LMul.MX#"_MASK") + (vti.Mask (IMPLICIT_DEF)), + vti.RegClass:$rs1, + vti.RegClass:$rs2, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } // Inherits from VPatIntegerSetCCVL_VV and adds a pattern with operands swapped. multiclass VPatIntegerSetCCVL_VV_Swappable : - VPatIntegerSetCCVL_VV { + CondCode cc, CondCode invcc> + : VPatIntegerSetCCVL_VV { def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs2), vti.RegClass:$rs1, invcc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_VV_"#vti.LMul.MX) - vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, - vti.Log2SEW)>; + (!cast(instruction_name#"_VV_"#vti.LMul.MX#"_MASK") + (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } multiclass VPatIntegerSetCCVL_VX_Swappable { - defvar instruction = !cast(instruction_name#"_VX_"#vti.LMul.MX); + defvar instruction_masked = !cast(instruction_name#"_VX_"#vti.LMul.MX#"_MASK"); def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), (SplatPat (XLenVT GPR:$rs2)), cc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (instruction vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)), (vti.Vector vti.RegClass:$rs1), invcc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (instruction vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } multiclass VPatIntegerSetCCVL_VI_Swappable { - defvar instruction = !cast(instruction_name#"_VI_"#vti.LMul.MX); + defvar instruction_masked = !cast(instruction_name#"_VI_"#vti.LMul.MX#"_MASK"); def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), (SplatPat_simm5 simm5:$rs2), cc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (instruction vti.RegClass:$rs1, XLenVT:$rs2, GPR:$vl, vti.Log2SEW)>; + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + XLenVT:$rs2, (vti.Mask V0), GPR:$vl, + vti.Log2SEW)>; + + // FIXME: Can do some canonicalization to remove these patterns. 
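// Illustrative sketch (editorial addition, not part of the upstream patch;
// c is a hypothetical constant). The swapped forms below handle a splat on
// the left-hand side:  setcc (splat c), x, SETGT  is just  x < splat(c)
// read backwards. The VIPlus1 forms then rewrite the immediate: there is no
// vmslt.vi encoding, so  x < splat(c)  is matched as
//   vmsle.vi vd, x, c-1       ; via SplatPat_simm5_plus1 and DecImm
// whenever c-1 fits in simm5.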
def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2), (vti.Vector vti.RegClass:$rs1), invcc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (instruction vti.RegClass:$rs1, simm5:$rs2, GPR:$vl, vti.Log2SEW)>; + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + simm5:$rs2, (vti.Mask V0), GPR:$vl, + vti.Log2SEW)>; } -multiclass VPatIntegerSetCCVL_VIPlus1 { - defvar instruction = !cast(instruction_name#"_VI_"#vti.LMul.MX); +multiclass VPatIntegerSetCCVL_VIPlus1_Swappable { + defvar instruction_masked = !cast(instruction_name#"_VI_"#vti.LMul.MX#"_MASK"); def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), (splatpat_kind simm5:$rs2), cc, - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), - GPR:$vl, vti.Log2SEW)>; + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl, + vti.Log2SEW)>; + + // FIXME: Can do some canonicalization to remove these patterns. + def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2), + (vti.Vector vti.RegClass:$rs1), invcc, + (vti.Mask V0), + VLOpFrag)), + (instruction_masked (vti.Mask (IMPLICIT_DEF)), vti.RegClass:$rs1, + (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl, + vti.Log2SEW)>; } multiclass VPatFPSetCCVL_VV_VF_FV(inst_name#"_VV_"#fvti.LMul.MX) - fvti.RegClass:$rs1, fvti.RegClass:$rs2, GPR:$vl, fvti.Log2SEW)>; + (!cast(inst_name#"_VV_"#fvti.LMul.MX#"_MASK") + (fvti.Mask (IMPLICIT_DEF)), fvti.RegClass:$rs1, + fvti.RegClass:$rs2, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Mask (riscv_setcc_vl (fvti.Vector fvti.RegClass:$rs1), (SplatFPOp fvti.ScalarRegClass:$rs2), cc, - (fvti.Mask true_mask), + (fvti.Mask V0), VLOpFrag)), - (!cast(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) - fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, + (!cast(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") + (fvti.Mask (IMPLICIT_DEF)), fvti.RegClass:$rs1, + fvti.ScalarRegClass:$rs2, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Mask (riscv_setcc_vl (SplatFPOp fvti.ScalarRegClass:$rs2), (fvti.Vector fvti.RegClass:$rs1), cc, - (fvti.Mask true_mask), + (fvti.Mask V0), VLOpFrag)), - (!cast(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) - fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, + (!cast(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") + (fvti.Mask (IMPLICIT_DEF)), fvti.RegClass:$rs1, + fvti.ScalarRegClass:$rs2, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; } } @@ -524,9 +582,11 @@ multiclass VPatExtendSDNode_V_VL(inst_name#"_"#suffix#"_"#vti.LMul.MX) - fti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>; + (fti.Mask V0), VLOpFrag)), + (!cast(inst_name#"_"#suffix#"_"#vti.LMul.MX#"_MASK") + (vti.Vector (IMPLICIT_DEF)), + fti.RegClass:$rs2, + (fti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -534,10 +594,11 @@ multiclass VPatConvertFP2ISDNode_V_VL { foreach fvti = AllFloatVectors in { defvar ivti = GetIntVTypeInfo.Vti; def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask true_mask), + (fvti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#ivti.LMul.MX) - fvti.RegClass:$rs1, GPR:$vl, ivti.Log2SEW)>; + (!cast(instruction_name#"_"#ivti.LMul.MX#"_MASK") + (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask V0), GPR:$vl, ivti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -545,10 +606,11 @@ multiclass VPatConvertI2FPSDNode_V_VL { foreach fvti = AllFloatVectors in { defvar ivti = GetIntVTypeInfo.Vti; def : Pat<(fvti.Vector (vop 
(ivti.Vector ivti.RegClass:$rs1), - (ivti.Mask true_mask), + (ivti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#fvti.LMul.MX) - ivti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, + (ivti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -557,10 +619,11 @@ multiclass VPatWConvertFP2ISDNode_V_VL { defvar fvti = fvtiToFWti.Vti; defvar iwti = GetIntVTypeInfo.Vti; def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask true_mask), + (fvti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#fvti.LMul.MX) - fvti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") + (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -569,10 +632,11 @@ multiclass VPatWConvertI2FPSDNode_V_VL { defvar ivti = vtiToWti.Vti; defvar fwti = vtiToWti.Wti; def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1), - (ivti.Mask true_mask), + (ivti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#ivti.LMul.MX) - ivti.RegClass:$rs1, GPR:$vl, ivti.Log2SEW)>; + (!cast(instruction_name#"_"#ivti.LMul.MX#"_MASK") + (fwti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, + (ivti.Mask V0), GPR:$vl, ivti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -581,10 +645,11 @@ multiclass VPatNConvertFP2ISDNode_V_VL { defvar vti = vtiToWti.Vti; defvar fwti = vtiToWti.Wti; def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask true_mask), + (fwti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#vti.LMul.MX) - fwti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; + (!cast(instruction_name#"_"#vti.LMul.MX#"_MASK") + (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -593,10 +658,11 @@ multiclass VPatNConvertI2FPSDNode_V_VL { defvar fvti = fvtiToFWti.Vti; defvar iwti = GetIntVTypeInfo.Vti; def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1), - (iwti.Mask true_mask), + (iwti.Mask V0), VLOpFrag)), - (!cast(instruction_name#"_"#fvti.LMul.MX) - iwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1, + (iwti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -622,45 +688,286 @@ multiclass VPatReductionVL { } } -multiclass VPatBinarySDNodeExt_V_WV { - foreach vti = AllWidenableIntVectors in { +multiclass VPatBinarySDNodeExt_V_WV_WX { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat< + (vti.Vector + (riscv_trunc_vector_vl + (op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1)))), + (vti.Mask true_mask), + VLOpFrag)), + (!cast(instruction_name#"_WV_"#vti.LMul.MX) + wti.RegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; + def : Pat< + (vti.Vector + (riscv_trunc_vector_vl + (op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (extop (vti.Vector (SplatPat GPR:$rs1))))), + (vti.Mask true_mask), + VLOpFrag)), + (!cast(instruction_name#"_WX_"#vti.LMul.MX) + wti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>; + } +} + +multiclass VPatBinarySDNode_V_WV_WX_WI { + defm : VPatBinarySDNodeExt_V_WV_WX; + defm : VPatBinarySDNodeExt_V_WV_WX; + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; def : Pat< - (vti.Vti.Vector + (vti.Vector (riscv_trunc_vector_vl - (op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - 
(vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), - (riscv_vmset_vl VLMax), - VLMax)), - (!cast(instruction_name#"_WV_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; + (op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (SplatPat_uimm5 uimm5:$rs1))), (vti.Mask true_mask), + VLOpFrag)), + (!cast(instruction_name#"_WI_"#vti.LMul.MX) + wti.RegClass:$rs2, uimm5:$rs1, GPR:$vl, vti.Log2SEW)>; + } +} + +multiclass VPatWidenReductionVL { + foreach vtiToWti = !if(is_float, AllWidenableFloatVectors, AllWidenableIntVectors) in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + defvar wti_m1 = !cast(!if(is_float, "VF", "VI") # wti.SEW # "M1"); + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))), + VR:$rs2, (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX) + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>; + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))), + VR:$rs2, (vti.Mask V0), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_MASK") + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + } +} + +multiclass VPatWidenReductionVL_Ext_VL { + foreach vtiToWti = !if(is_float, AllWidenableFloatVectors, AllWidenableIntVectors) in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + defvar wti_m1 = !cast(!if(is_float, "VF", "VI") # wti.SEW # "M1"); + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)), + VR:$rs2, (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX) + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>; + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)), + VR:$rs2, (vti.Mask V0), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_MASK") + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } } -multiclass VPatBinarySDNodeExt_V_WX { - foreach vti = AllWidenableIntVectors in { +multiclass VPatWidenBinaryFPVL_VV_VF { + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + def : Pat<(fwti.Vector (op (fwti.Vector (extop (fvti.Vector fvti.RegClass:$rs2), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Vector (extop (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VV_"#fvti.LMul.MX) + fvti.RegClass:$rs2, fvti.RegClass:$rs1, + GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fwti.Vector (op (fwti.Vector (extop (fvti.Vector fvti.RegClass:$rs2), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Vector (extop (fvti.Vector (SplatFPOp fvti.ScalarRegClass:$rs1)), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) + fvti.RegClass:$rs2, fvti.ScalarRegClass:$rs1, + GPR:$vl, fvti.Log2SEW)>; + } +} + +multiclass VPatWidenBinaryFPVL_WV_WF { + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + def : 
Pat<(fwti.Vector (op (fwti.Vector fwti.RegClass:$rs2), + (fwti.Vector (extop (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_WV_"#fvti.LMul.MX) + fwti.RegClass:$rs2, fvti.RegClass:$rs1, + GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fwti.Vector (op (fwti.Vector fwti.RegClass:$rs2), + (fwti.Vector (extop (fvti.Vector (SplatFPOp fvti.ScalarRegClass:$rs1)), + (fvti.Mask true_mask), VLOpFrag)), + (fwti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_W"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) + fwti.RegClass:$rs2, fvti.ScalarRegClass:$rs1, + GPR:$vl, fvti.Log2SEW)>; + } +} + +multiclass VPatWidenBinaryFPVL_VV_VF_WV_WF { + defm : VPatWidenBinaryFPVL_VV_VF; + defm : VPatWidenBinaryFPVL_WV_WF; +} + +multiclass VPatNarrowShiftSplatExt_WX { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; def : Pat< - (vti.Vti.Vector + (vti.Vector (riscv_trunc_vector_vl - (op (vti.Wti.Vector vti.Wti.RegClass:$rs2), - (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), - (riscv_vmset_vl VLMax), - VLMax)), - (!cast(instruction_name#"_WX_"#vti.Vti.LMul.MX) - vti.Wti.RegClass:$rs2, GPR:$rs1, - vti.Vti.AVL, vti.Vti.Log2SEW)>; + (op (wti.Vector wti.RegClass:$rs2), + (wti.Vector (extop (vti.Vector (SplatPat GPR:$rs1)), + (vti.Mask true_mask), VLOpFrag)), + (wti.Mask true_mask), VLOpFrag), + (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_WX_"#vti.LMul.MX) + wti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>; + } +} + +multiclass VPatMultiplyAddVL_VV_VX { + foreach vti = AllIntegerVectors in { + defvar suffix = vti.LMul.MX; + // NOTE: We choose VMADD because it has the most commuting freedom. So it + // works best with how TwoAddressInstructionPass tries commuting. + def : Pat<(vti.Vector + (op vti.RegClass:$rs2, + (riscv_mul_vl_oneuse vti.RegClass:$rs1, + vti.RegClass:$rd, + (vti.Mask true_mask), VLOpFrag), + (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VV_"# suffix) + vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + // The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally + // commutable. 
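// Illustrative sketch (editorial addition, not part of the upstream patch;
// the values are hypothetical). The _oneuse restriction matters because
// vmadd overwrites one multiplicand. Given
//   t = mul x, y ; a = add t, z ; b = sub w, t
// folding t into a vmadd for `a` would destroy the multiply result that `b`
// still needs, so the multiply must be materialized anyway and the fusion
// would save nothing.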
+ def : Pat<(vti.Vector + (op vti.RegClass:$rs2, + (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), + vti.RegClass:$rd, + (vti.Mask true_mask), VLOpFrag), + (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VX_" # suffix) + vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + +multiclass VPatWidenMultiplyAddVL_VV_VX { + foreach vtiTowti = AllWidenableIntVectors in { + defvar vti = vtiTowti.Vti; + defvar wti = vtiTowti.Wti; + def : Pat<(wti.Vector + (riscv_add_vl wti.RegClass:$rd, + (op1 vti.RegClass:$rs1, + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), VLOpFrag), + (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VV_" # vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(wti.Vector + (riscv_add_vl wti.RegClass:$rd, + (op1 (SplatPat XLenVT:$rs1), + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), VLOpFrag), + (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VX_" # vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } +multiclass VPatNarrowShiftSplat_WX_WI { + foreach vtiTowti = AllWidenableIntVectors in { + defvar vti = vtiTowti.Vti; + defvar wti = vtiTowti.Wti; + def : Pat<(vti.Vector (riscv_trunc_vector_vl + (wti.Vector (op wti.RegClass:$rs1, (SplatPat XLenVT:$rs2), + true_mask, VLOpFrag)), true_mask, VLOpFrag)), + (!cast(instruction_name#"_WX_"#vti.LMul.MX) + wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_trunc_vector_vl + (wti.Vector (op wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2), + true_mask, VLOpFrag)), true_mask, VLOpFrag)), + (!cast(instruction_name#"_WI_"#vti.LMul.MX) + wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>; + } +} -multiclass VPatBinarySDNode_V_WV { - defm : VPatBinarySDNodeExt_V_WV; - defm : VPatBinarySDNodeExt_V_WV; +multiclass VPatFPMulAddVL_VV_VF { + foreach vti = AllFloatVectors in { + defvar suffix = vti.LMul.MX; + def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd, + vti.RegClass:$rs2, (vti.Mask true_mask), + VLOpFrag)), + (!cast(instruction_name#"_VV_"# suffix) + vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd, + vti.RegClass:$rs2, (vti.Mask V0), + VLOpFrag)), + (!cast(instruction_name#"_VV_"# suffix #"_MASK") + vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + + def : Pat<(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), + vti.RegClass:$rd, vti.RegClass:$rs2, + (vti.Mask true_mask), + VLOpFrag)), + (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix) + vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), + vti.RegClass:$rd, vti.RegClass:$rs2, + (vti.Mask V0), + VLOpFrag)), + (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") + vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + } } -multiclass VPatBinarySDNode_V_WX { - defm : VPatBinarySDNodeExt_V_WX; - defm : VPatBinarySDNodeExt_V_WX; +multiclass VPatWidenFPMulAccVL_VV_VF { + foreach vtiToWti = AllWidenableFloatVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + def : Pat<(vop + (wti.Vector 
(riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask true_mask), VLOpFrag)), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), VLOpFrag)), + (wti.Vector wti.RegClass:$rd), (vti.Mask true_mask), + VLOpFrag), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(vop + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), + (vti.Mask true_mask), VLOpFrag)), + (wti.Vector (riscv_fpextend_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), VLOpFrag)), + (wti.Vector wti.RegClass:$rd), (vti.Mask true_mask), + VLOpFrag), + (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + } } //===----------------------------------------------------------------------===// @@ -669,51 +976,18 @@ multiclass VPatBinarySDNode_V_WX { let Predicates = [HasVInstructions] in { -// 7.4. Vector Unit-Stride Instructions -foreach vti = AllVectors in { - defvar load_instr = !cast("PseudoVLE"#vti.SEW#"_V_"#vti.LMul.MX); - defvar store_instr = !cast("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX); - // Load - def : Pat<(vti.Vector (riscv_vle_vl BaseAddr:$rs1, VLOpFrag)), - (load_instr BaseAddr:$rs1, GPR:$vl, vti.Log2SEW)>; - // Store - def : Pat<(riscv_vse_vl (vti.Vector vti.RegClass:$rs2), BaseAddr:$rs1, - VLOpFrag), - (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, GPR:$vl, vti.Log2SEW)>; -} - -foreach mti = AllMasks in { - defvar load_instr = !cast("PseudoVLM_V_"#mti.BX); - defvar store_instr = !cast("PseudoVSM_V_"#mti.BX); - def : Pat<(mti.Mask (riscv_vle_vl BaseAddr:$rs1, VLOpFrag)), - (load_instr BaseAddr:$rs1, GPR:$vl, mti.Log2SEW)>; - def : Pat<(riscv_vse_vl (mti.Mask VR:$rs2), BaseAddr:$rs1, - VLOpFrag), - (store_instr VR:$rs2, BaseAddr:$rs1, GPR:$vl, mti.Log2SEW)>; -} - // 12.1. Vector Single-Width Integer Add and Subtract defm : VPatBinaryVL_VV_VX_VI; defm : VPatBinaryVL_VV_VX; // Handle VRSUB specially since it's the only integer binary op with reversed // pattern operands foreach vti = AllIntegerVectors in { - def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))), - (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), - VLOpFrag), - (!cast("PseudoVRSUB_VX_"# vti.LMul.MX) - vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))), (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag), (!cast("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)), - (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), - VLOpFrag), - (!cast("PseudoVRSUB_VI_"# vti.LMul.MX) - vti.RegClass:$rs1, simm5:$rs2, GPR:$vl, vti.Log2SEW)>; def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)), (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag), @@ -723,7 +997,10 @@ foreach vti = AllIntegerVectors in { } // 12.2. Vector Widening Integer Add/Subtract -defm : VPatBinaryWVL_VV_VX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; // 12.3. Vector Integer Extension defm : VPatExtendSDNode_V_VL; defm : VPatExtendSDNode_V_VL; + AllFractionableVF8IntVectors>; // 12.5. 
Vector Bitwise Logical Instructions defm : VPatBinaryVL_VV_VX_VI; @@ -752,7 +1029,7 @@ defm : VPatBinaryVL_VV_VX_VI; foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. def : Pat<(riscv_shl_vl (vti.Vector vti.RegClass:$rs1), - (riscv_vmv_v_x_vl 1, (XLenVT srcvalue)), + (riscv_vmv_v_x_vl (vti.Vector undef), 1, (XLenVT srcvalue)), (vti.Mask true_mask), VLOpFrag), (!cast("PseudoVADD_VV_"# vti.LMul.MX) @@ -760,49 +1037,25 @@ foreach vti = AllIntegerVectors in { } // 12.7. Vector Narrowing Integer Right Shift Instructions -defm : VPatBinarySDNode_V_WV; -defm : VPatBinarySDNode_V_WX; -defm : VPatBinarySDNode_V_WV; -defm : VPatBinarySDNode_V_WX; +defm : VPatBinarySDNode_V_WV_WX_WI; +defm : VPatBinarySDNode_V_WV_WX_WI; + +defm : VPatNarrowShiftSplat_WX_WI; +defm : VPatNarrowShiftSplat_WX_WI; +defm : VPatNarrowShiftSplatExt_WX; +defm : VPatNarrowShiftSplatExt_WX; +defm : VPatNarrowShiftSplatExt_WX; +defm : VPatNarrowShiftSplatExt_WX; foreach vtiTowti = AllWidenableIntVectors in { defvar vti = vtiTowti.Vti; defvar wti = vtiTowti.Wti; def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1), - (vti.Mask true_mask), + (vti.Mask V0), VLOpFrag)), - (!cast("PseudoVNSRL_WX_"#vti.LMul.MX) - wti.RegClass:$rs1, X0, GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector - (riscv_trunc_vector_vl - (wti.Vector - (riscv_sra_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2), - true_mask, VLOpFrag)), true_mask, VLOpFrag)), - (!cast("PseudoVNSRA_WX_"#vti.LMul.MX) - wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector - (riscv_trunc_vector_vl - (wti.Vector - (riscv_sra_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2), - true_mask, VLOpFrag)), true_mask, VLOpFrag)), - (!cast("PseudoVNSRA_WI_"#vti.LMul.MX) - wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector - (riscv_trunc_vector_vl - (wti.Vector - (riscv_srl_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2), - true_mask, VLOpFrag)), true_mask, VLOpFrag)), - (!cast("PseudoVNSRL_WX_"#vti.LMul.MX) - wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector - (riscv_trunc_vector_vl - (wti.Vector - (riscv_srl_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2), - true_mask, VLOpFrag)), true_mask, VLOpFrag)), - (!cast("PseudoVNSRL_WI_"#vti.LMul.MX) - wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVNSRL_WX_"#vti.LMul.MX#"_MASK") + (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, X0, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } // 12.8. Vector Integer Comparison Instructions @@ -832,14 +1085,14 @@ foreach vti = AllIntegerVectors in { defm : VPatIntegerSetCCVL_VI_Swappable; defm : VPatIntegerSetCCVL_VI_Swappable; - defm : VPatIntegerSetCCVL_VIPlus1; - defm : VPatIntegerSetCCVL_VIPlus1; - defm : VPatIntegerSetCCVL_VIPlus1; - defm : VPatIntegerSetCCVL_VIPlus1; + defm : VPatIntegerSetCCVL_VIPlus1_Swappable; + defm : VPatIntegerSetCCVL_VIPlus1_Swappable; + defm : VPatIntegerSetCCVL_VIPlus1_Swappable; + defm : VPatIntegerSetCCVL_VIPlus1_Swappable; } // foreach vti = AllIntegerVectors // 12.9. Vector Integer Min/Max Instructions @@ -865,92 +1118,24 @@ defm : VPatBinaryWVL_VV_VX; defm : VPatBinaryWVL_VV_VX; // 12.13 Vector Single-Width Integer Multiply-Add Instructions -foreach vti = AllIntegerVectors in { - // NOTE: We choose VMADD because it has the most commuting freedom. So it - // works best with how TwoAddressInstructionPass tries commuting. 
- defvar suffix = vti.LMul.MX; - def : Pat<(vti.Vector - (riscv_add_vl vti.RegClass:$rs2, - (riscv_mul_vl_oneuse vti.RegClass:$rs1, - vti.RegClass:$rd, - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVMADD_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector - (riscv_sub_vl vti.RegClass:$rs2, - (riscv_mul_vl_oneuse vti.RegClass:$rs1, - vti.RegClass:$rd, - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVNMSUB_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - - // The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally - // commutable. - def : Pat<(vti.Vector - (riscv_add_vl vti.RegClass:$rs2, - (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), - vti.RegClass:$rd, - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVMADD_VX_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector - (riscv_sub_vl vti.RegClass:$rs2, - (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), - vti.RegClass:$rd, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVNMSUB_VX_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; -} +defm : VPatMultiplyAddVL_VV_VX; +defm : VPatMultiplyAddVL_VV_VX; // 12.14. Vector Widening Integer Multiply-Add Instructions +defm : VPatWidenMultiplyAddVL_VV_VX; +defm : VPatWidenMultiplyAddVL_VV_VX; +defm : VPatWidenMultiplyAddVL_VV_VX; foreach vtiTowti = AllWidenableIntVectors in { defvar vti = vtiTowti.Vti; defvar wti = vtiTowti.Wti; def : Pat<(wti.Vector (riscv_add_vl wti.RegClass:$rd, - (riscv_vwmul_vl_oneuse vti.RegClass:$rs1, - (vti.Vector vti.RegClass:$rs2), - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVWMACC_VV_" # vti.LMul.MX) - wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(wti.Vector - (riscv_add_vl wti.RegClass:$rd, - (riscv_vwmulu_vl_oneuse vti.RegClass:$rs1, - (vti.Vector vti.RegClass:$rs2), - (vti.Mask true_mask), VLOpFrag), + (riscv_vwmulsu_vl_oneuse (vti.Vector vti.RegClass:$rs1), + (SplatPat XLenVT:$rs2), + (vti.Mask true_mask), VLOpFrag), (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVWMACCU_VV_" # vti.LMul.MX) - wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - - def : Pat<(wti.Vector - (riscv_add_vl wti.RegClass:$rd, - (riscv_vwmul_vl_oneuse (SplatPat XLenVT:$rs1), - (vti.Vector vti.RegClass:$rs2), - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVWMACC_VX_" # vti.LMul.MX) - wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(wti.Vector - (riscv_add_vl wti.RegClass:$rd, - (riscv_vwmulu_vl_oneuse (SplatPat XLenVT:$rs1), - (vti.Vector vti.RegClass:$rs2), - (vti.Mask true_mask), VLOpFrag), - (vti.Mask true_mask), VLOpFrag)), - (!cast("PseudoVWMACCU_VX_" # vti.LMul.MX) - wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, + (!cast("PseudoVWMACCUS_VX_" # vti.LMul.MX) + wti.RegClass:$rd, vti.ScalarRegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } @@ -1005,14 +1190,21 @@ foreach vti = AllIntegerVectors in { // 12.16. 
Vector Integer Move Instructions foreach vti = AllIntegerVectors in { - def : Pat<(vti.Vector (riscv_vmv_v_x_vl GPR:$rs2, VLOpFrag)), + def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), GPR:$rs2, VLOpFrag)), (!cast("PseudoVMV_V_X_"#vti.LMul.MX) $rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.Vector:$passthru, GPR:$rs2, VLOpFrag)), + (!cast("PseudoVMV_V_X_"#vti.LMul.MX#"_TU") + $passthru, $rs2, GPR:$vl, vti.Log2SEW)>; defvar ImmPat = !cast("sew"#vti.SEW#"simm5"); - def : Pat<(vti.Vector (riscv_vmv_v_x_vl (ImmPat XLenVT:$imm5), + def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), (ImmPat XLenVT:$imm5), VLOpFrag)), (!cast("PseudoVMV_V_I_"#vti.LMul.MX) XLenVT:$imm5, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.Vector:$passthru, (ImmPat XLenVT:$imm5), + VLOpFrag)), + (!cast("PseudoVMV_V_I_"#vti.LMul.MX#"_TU") + $passthru, XLenVT:$imm5, GPR:$vl, vti.Log2SEW)>; } // 12.1. Vector Single-Width Saturating Add and Subtract @@ -1033,6 +1225,13 @@ defm : VPatReductionVL; defm : VPatReductionVL; defm : VPatReductionVL; defm : VPatReductionVL; + +// 15.2. Vector Widening Integer Reduction Instructions +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; } // Predicates = [HasVInstructions] // 15.3. Vector Single-Width Floating-Point Reduction Instructions @@ -1041,6 +1240,12 @@ defm : VPatReductionVL; defm : VPatReductionVL; defm : VPatReductionVL; + +// 15.4. Vector Widening Floating-Point Reduction Instructions +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; } // Predicates = [HasVInstructionsAnyF] let Predicates = [HasVInstructionsAnyF] in { @@ -1050,118 +1255,29 @@ defm : VPatBinaryFPVL_VV_VF; defm : VPatBinaryFPVL_VV_VF; defm : VPatBinaryFPVL_R_VF; +// 14.3. Vector Widening Floating-Point Add/Subtract Instructions +defm : VPatWidenBinaryFPVL_VV_VF_WV_WF; +defm : VPatWidenBinaryFPVL_VV_VF_WV_WF; + // 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions defm : VPatBinaryFPVL_VV_VF; defm : VPatBinaryFPVL_VV_VF; defm : VPatBinaryFPVL_R_VF; -// 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions. -foreach vti = AllFloatVectors in { - // NOTE: We choose VFMADD because it has the most commuting freedom. So it - // works best with how TwoAddressInstructionPass tries commuting. 
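As with the integer case, the FP patterns place riscv_fneg_vl around the fma operands to select one of four fused forms. A scalar sketch of that mapping, assuming the standard RISC-V F-extension semantics (illustrative code, not LLVM's):

#include <cmath>

// vfmadd:   (a * b) + c        vfmsub:  (a * b) - c
// vfnmsub: -(a * b) + c        vfnmadd: -(a * b) - c
double vfmadd (double a, double b, double c) { return std::fma( a, b,  c); }
double vfmsub (double a, double b, double c) { return std::fma( a, b, -c); }
double vfnmsub(double a, double b, double c) { return std::fma(-a, b,  c); }
double vfnmadd(double a, double b, double c) { return std::fma(-a, b, -c); }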
- defvar suffix = vti.LMul.MX; - def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd, - vti.RegClass:$rs2, (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFMADD_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd, - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFMSUB_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$rd, - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMADD_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$rd, vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMSUB_VV_"# suffix) - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; +// 14.5. Vector Widening Floating-Point Multiply Instructions +defm : VPatWidenBinaryFPVL_VV_VF; - // The choice of VFMADD here is arbitrary, vfmadd.vf and vfmacc.vf are equally - // commutable. - def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1), - vti.RegClass:$rd, vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFMADD_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1), - vti.RegClass:$rd, - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFMSUB_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1), - (riscv_fneg_vl vti.RegClass:$rd, - (vti.Mask true_mask), - VLOpFrag), - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1), - (riscv_fneg_vl vti.RegClass:$rd, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; +// 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions. +defm : VPatFPMulAddVL_VV_VF; +defm : VPatFPMulAddVL_VV_VF; +defm : VPatFPMulAddVL_VV_VF; +defm : VPatFPMulAddVL_VV_VF; - // The splat might be negated. 
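The note covers the case where fneg lands on the splatted scalar rather than on a vector operand; algebraically the same negated forms are selected. A quick check of the identities (plain C++; the values are exact in binary floating point, so the comparisons hold exactly):

#include <cassert>
#include <cmath>

int main() {
  double s = 1.5, d = 2.0, c = 0.25;
  assert(std::fma(-s, d, -c) == -(s * d) - c); // vfnmadd form
  assert(std::fma(-s, d,  c) == -(s * d) + c); // vfnmsub form
  return 0;
}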
- def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$rd, - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$rd, vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix) - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; -} +// 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions +defm : VPatWidenFPMulAccVL_VV_VF; +defm : VPatWidenFPMulAccVL_VV_VF; +defm : VPatWidenFPMulAccVL_VV_VF; +defm : VPatWidenFPMulAccVL_VV_VF; // 14.11. Vector Floating-Point MIN/MAX Instructions defm : VPatBinaryFPVL_VV_VF; @@ -1193,10 +1309,13 @@ foreach vti = AllFloatVectors in { (!cast("PseudoVFSGNJX_VV_"# vti.LMul.MX) vti.RegClass:$rs, vti.RegClass:$rs, GPR:$vl, vti.Log2SEW)>; // Handle fneg with VFSGNJN using the same input for both operands. - def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask true_mask), + def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask V0), VLOpFrag), - (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX) - vti.RegClass:$rs, vti.RegClass:$rs, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX #"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), @@ -1276,16 +1395,26 @@ foreach fvti = AllFloatVectors in { // 14.16. Vector Floating-Point Move Instruction // If we're splatting fpimm0, use vmv.v.x vd, x0. def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Scalar (fpimm0)), VLOpFrag)), + (fvti.Vector undef), (fvti.Scalar (fpimm0)), VLOpFrag)), (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) 0, GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), + (!cast("PseudoVMV_V_I_"#fvti.LMul.MX#"_TU") + $passthru, 0, GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (fvti.Vector undef), (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # fvti.LMul.MX) (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # + fvti.LMul.MX # "_TU") + $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), + GPR:$vl, fvti.Log2SEW)>; // 14.17. 
Vector Single-Width Floating-Point/Integer Type-Convert Instructions defm : VPatConvertFP2ISDNode_V_VL; @@ -1302,10 +1431,11 @@ foreach fvti = AllFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; def : Pat<(fwti.Vector (riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask true_mask), + (fvti.Mask V0), VLOpFrag)), - (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX) - fvti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX#"_MASK") + (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } // 14.19 Narrowing Floating-Point/Integer Type-Convert Instructions @@ -1317,16 +1447,18 @@ foreach fvti = AllFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; def : Pat<(fvti.Vector (riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask true_mask), + (fwti.Mask V0), VLOpFrag)), - (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX) - fwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(fvti.Vector (riscv_fncvt_rod_vl (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask true_mask), + (fwti.Mask V0), VLOpFrag)), - (!cast("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX) - fwti.RegClass:$rs1, GPR:$vl, fvti.Log2SEW)>; + (!cast("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -1412,43 +1544,27 @@ foreach vti = AllIntegerVectors in { (!cast("PseudoVMV_S_X_"#vti.LMul.MX) vti.RegClass:$merge, (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2, - (vti.Vector vti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX) - vti.RegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX) - vti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm, - (vti.Mask true_mask), + vti.RegClass:$rs1, + (vti.Mask V0), + vti.RegClass:$merge, VLOpFrag)), - (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX) - vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgather_vv_vl - vti.RegClass:$rs2, - vti.RegClass:$rs1, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK") vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgather_vx_vl - vti.RegClass:$rs2, - uimm5:$imm, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), + def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), + (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, + uimm5:$imm, + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK") vti.RegClass:$merge, 
vti.RegClass:$rs2, uimm5:$imm, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1461,21 +1577,13 @@ foreach vti = AllIntegerVectors in { defvar emul_str = octuple_to_str.ret; defvar ivti = !cast("VI16" # emul_str); defvar inst = "PseudoVRGATHEREI16_VV_" # vti.LMul.MX # "_" # emul_str; - def : Pat<(vti.Vector (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, - (ivti.Vector ivti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag)), - (!cast(inst) - vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgatherei16_vv_vl - vti.RegClass:$rs2, - (ivti.Vector ivti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), + + def : Pat<(vti.Vector + (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, + (ivti.Vector ivti.RegClass:$rs1), + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), (!cast(inst#"_MASK") vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1500,43 +1608,29 @@ foreach vti = AllFloatVectors in { vti.RegClass:$merge, (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; defvar ivti = GetIntVTypeInfo.Vti; - def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2, - (ivti.Vector vti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX) - vti.RegClass:$rs2, vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX) - vti.RegClass:$rs2, GPR:$rs1, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm, - (vti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX) - vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgather_vv_vl - vti.RegClass:$rs2, - (ivti.Vector vti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), + def : Pat<(vti.Vector + (riscv_vrgather_vv_vl vti.RegClass:$rs2, + (ivti.Vector vti.RegClass:$rs1), + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK") vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgather_vx_vl - vti.RegClass:$rs2, - uimm5:$imm, - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), + def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), + (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(vti.Vector + (riscv_vrgather_vx_vl vti.RegClass:$rs2, + uimm5:$imm, + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK") vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1548,21 +1642,13 @@ foreach vti = AllFloatVectors in { defvar emul_str = octuple_to_str.ret; defvar ivti = !cast("VI16" # emul_str); defvar inst = "PseudoVRGATHEREI16_VV_" # vti.LMul.MX # "_" # emul_str; - def : Pat<(vti.Vector (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, - (ivti.Vector ivti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag)), - (!cast(inst) - 
vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (riscv_vrgatherei16_vv_vl - vti.RegClass:$rs2, - (ivti.Vector ivti.RegClass:$rs1), - (vti.Mask true_mask), - VLOpFrag), - vti.RegClass:$merge, - VLOpFrag)), + + def : Pat<(vti.Vector + (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, + (ivti.Vector ivti.RegClass:$rs1), + (vti.Mask V0), + vti.RegClass:$merge, + VLOpFrag)), (!cast(inst#"_MASK") vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1583,9 +1669,10 @@ def SDTRVVSlide : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisVT<3, XLenVT>, SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT> ]>; -def SDTRVVSlide1 : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisInt<0>, SDTCisVT<2, XLenVT>, - SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, SDTCisVT<4, XLenVT> +def SDTRVVSlide1 : SDTypeProfile<1, 5, [ + SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisInt<0>, + SDTCisVT<3, XLenVT>, SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<0, 4>, + SDTCisVT<5, XLenVT> ]>; def riscv_slideup_vl : SDNode<"RISCVISD::VSLIDEUP_VL", SDTRVVSlide, []>; @@ -1600,16 +1687,30 @@ foreach vti = AllIntegerVectors in { VLOpFrag)), (!cast("PseudoVID_V_"#vti.LMul.MX) GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), GPR:$rs2, (vti.Mask true_mask), VLOpFrag)), (!cast("PseudoVSLIDE1UP_VX_"#vti.LMul.MX) vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_slide1down_vl (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector vti.RegClass:$rd), + (vti.Vector vti.RegClass:$rs1), GPR:$rs2, (vti.Mask true_mask), VLOpFrag)), + (!cast("PseudoVSLIDE1UP_VX_"#vti.LMul.MX#"_TU") + vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_slide1down_vl (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), + GPR:$rs2, (vti.Mask true_mask), + VLOpFrag)), (!cast("PseudoVSLIDE1DOWN_VX_"#vti.LMul.MX) vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_slide1down_vl (vti.Vector vti.RegClass:$rd), + (vti.Vector vti.RegClass:$rs1), + GPR:$rs2, (vti.Mask true_mask), + VLOpFrag)), + (!cast("PseudoVSLIDE1DOWN_VX_"#vti.LMul.MX#"_TU") + vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; } foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in { @@ -1619,7 +1720,7 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in { VLOpFrag)), (!cast("PseudoVSLIDEUP_VI_"#vti.LMul.MX) vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2, - GPR:$vl, vti.Log2SEW)>; + GPR:$vl, vti.Log2SEW, TAIL_UNDISTURBED_MASK_UNDISTURBED)>; def : Pat<(vti.Vector (riscv_slideup_vl (vti.Vector vti.RegClass:$rs3), (vti.Vector vti.RegClass:$rs1), @@ -1627,7 +1728,7 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in { VLOpFrag)), (!cast("PseudoVSLIDEUP_VX_"#vti.LMul.MX) vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2, - GPR:$vl, vti.Log2SEW)>; + GPR:$vl, vti.Log2SEW, TAIL_UNDISTURBED_MASK_UNDISTURBED)>; def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3), (vti.Vector vti.RegClass:$rs1), @@ -1635,7 +1736,14 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in { VLOpFrag)), 
(!cast("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX) vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2, - GPR:$vl, vti.Log2SEW)>; + GPR:$vl, vti.Log2SEW, TAIL_UNDISTURBED_MASK_UNDISTURBED)>; + def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), + uimm5:$rs2, (vti.Mask true_mask), + VLOpFrag)), + (!cast("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, uimm5:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3), (vti.Vector vti.RegClass:$rs1), @@ -1643,7 +1751,14 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in { VLOpFrag)), (!cast("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX) vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2, - GPR:$vl, vti.Log2SEW)>; + GPR:$vl, vti.Log2SEW, TAIL_UNDISTURBED_MASK_UNDISTURBED)>; + def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), + GPR:$rs2, (vti.Mask true_mask), + VLOpFrag)), + (!cast("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2, + GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } // Predicates = [HasVInstructions] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 07884d35f63c..9532d1dd3dd2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -211,15 +211,16 @@ def CSImm12MulBy4 : PatLeaf<(imm), [{ return false; int64_t C = N->getSExtValue(); // Skip if C is simm12 or can be optimized by the PatLeaf AddiPair. - return !isInt<13>(C) && isInt<14>(C) && (C & 3) == 0; + return !isInt<13>(C) && isShiftedInt<12, 2>(C); }]>; def CSImm12MulBy8 : PatLeaf<(imm), [{ if (!N->hasOneUse()) return false; int64_t C = N->getSExtValue(); - // Skip if C is simm12 or can be optimized by the PatLeaf AddiPair. - return !isInt<13>(C) && isInt<15>(C) && (C & 7) == 0; + // Skip if C is simm12 or can be optimized by the PatLeaf AddiPair or + // CSImm12MulBy4. + return !isInt<14>(C) && isShiftedInt<12, 3>(C); }]>; def SimmShiftRightBy2XForm : SDNodeXFormgetValueType(0)); }]>; +// Pattern to exclude simm12 immediates from matching. 
+def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{ + auto *C = dyn_cast(N); + return !C || !isInt<12>(C->getSExtValue()); +}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -348,7 +355,7 @@ def SH2ADD_UW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; def SH3ADD_UW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; -} // Predicates = [HasStdExtZbb, IsRV64] +} // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def ROL : ALU_rr<0b0110000, 0b001, "rol">, @@ -368,7 +375,7 @@ def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[WriteRotateImm32, ReadRotateImm32]>; -} // Predicates = [HasStdExtZbbOrZbp, IsRV64] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] let Predicates = [HasStdExtZbs] in { def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, @@ -391,32 +398,48 @@ def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, } // Predicates = [HasStdExtZbs] let Predicates = [HasStdExtZbp] in { -def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>; -def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>; - -def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>; -def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>; - -def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>; -def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>; - -def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>; -def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>; - -def XPERM_H : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>; +def GORC : ALU_rr<0b0010100, 0b101, "gorc">, + Sched<[WriteORC, ReadORC, ReadORC]>; +def GREV : ALU_rr<0b0110100, 0b101, "grev">, + Sched<[WriteREV, ReadREV, ReadREV]>; + +def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, + Sched<[WriteREVImm, ReadREVImm]>; +def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, + Sched<[WriteORCImm, ReadORCImm]>; + +def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, + Sched<[WriteSHFL, ReadSHFL, ReadSHFL]>; +def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, + Sched<[WriteUNSHFL, ReadUNSHFL, ReadUNSHFL]>; + +def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, + Sched<[WriteSHFLImm, ReadSHFLImm]>; +def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, + Sched<[WriteUNSHFLImm, ReadUNSHFLImm]>; + +def XPERM_H : ALU_rr<0b0010100, 0b110, "xperm.h">, + Sched<[WriteXPERMH, ReadXPERMH, ReadXPERMH]>; } // Predicates = [HasStdExtZbp] let Predicates = [HasStdExtZbp, IsRV64] in { -def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>; -def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>; - -def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>; -def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>; - -def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>; -def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>; - -def XPERM_W : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>; +def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, + Sched<[WriteORC32, ReadORC32, ReadORC32]>; +def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, + Sched<[WriteREV32, ReadREV32, ReadREV32]>; + +def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, + 
Sched<[WriteORCImm32, ReadORCImm32]>; +def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, + Sched<[WriteREVImm32, ReadREVImm32]>; + +def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, + Sched<[WriteSHFL32, ReadSHFL32, ReadSHFL32]>; +def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, + Sched<[WriteUNSHFL32, ReadUNSHFL32, ReadUNSHFL32]>; + +def XPERM_W : ALU_rr<0b0010100, 0b000, "xperm.w">, + Sched<[WriteXPERMW, ReadXPERMW, ReadXPERMW]>; } // Predicates = [HasStdExtZbp, IsRV64] // These instructions were named xperm.n and xperm.b in the last version of @@ -429,24 +452,28 @@ def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>; let Predicates = [HasStdExtZbt] in { def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">, - Sched<[]>; + Sched<[WriteCMix, ReadCMix, ReadCMix, ReadCMix]>; def CMOV : RVBTernaryR<0b11, 0b101, OPC_OP, "cmov", "$rd, $rs2, $rs1, $rs3">, - Sched<[]>; + Sched<[WriteCMov, ReadCMov, ReadCMov, ReadCMov]>; def FSL : RVBTernaryR<0b10, 0b001, OPC_OP, "fsl", "$rd, $rs1, $rs3, $rs2">, - Sched<[]>; + Sched<[WriteFSReg, ReadFSReg, ReadFSReg, ReadFSReg]>; def FSR : RVBTernaryR<0b10, 0b101, OPC_OP, "fsr", "$rd, $rs1, $rs3, $rs2">, - Sched<[]>; + Sched<[WriteFSReg, ReadFSReg, ReadFSReg, ReadFSReg]>; def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri", - "$rd, $rs1, $rs3, $shamt">, Sched<[]>; + "$rd, $rs1, $rs3, $shamt">, + Sched<[WriteFSRImm, ReadFSRImm, ReadFSRImm]>; } // Predicates = [HasStdExtZbt] let Predicates = [HasStdExtZbt, IsRV64] in { def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32, - "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>; + "fslw", "$rd, $rs1, $rs3, $rs2">, + Sched<[WriteFSReg32, ReadFSReg32, ReadFSReg32, ReadFSReg32]>; def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw", - "$rd, $rs1, $rs3, $rs2">, Sched<[]>; + "$rd, $rs1, $rs3, $rs2">, + Sched<[WriteFSReg32, ReadFSReg32, ReadFSReg32, ReadFSReg32]>; def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32, - "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>; + "fsriw", "$rd, $rs1, $rs3, $shamt">, + Sched<[WriteFSRImm32, ReadFSRImm32, ReadFSRImm32]>; } // Predicates = [HasStdExtZbt, IsRV64] let Predicates = [HasStdExtZbb] in { @@ -476,88 +503,96 @@ def SEXT_H : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">, let Predicates = [HasStdExtZbr] in { def CRC32_B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">, - Sched<[]>; + Sched<[WriteCRCB, ReadCRCB]>; def CRC32_H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">, - Sched<[]>; + Sched<[WriteCRCH, ReadCRCH]>; def CRC32_W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">, - Sched<[]>; + Sched<[WriteCRCW, ReadCRCW]>; def CRC32C_B : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">, - Sched<[]>; + Sched<[WriteCRCCB, ReadCRCCB]>; def CRC32C_H : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">, - Sched<[]>; + Sched<[WriteCRCCH, ReadCRCCH]>; def CRC32C_W : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">, - Sched<[]>; + Sched<[WriteCRCCW, ReadCRCCW]>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in { def CRC32_D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">, - Sched<[]>; + Sched<[WriteCRCD, ReadCRCD]>; def CRC32C_D : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, - Sched<[]>; + Sched<[WriteCRCCD, ReadCRCCD]>; } // Predicates = [HasStdExtZbr, IsRV64] let Predicates = [HasStdExtZbc] in { -def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, +def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr", /*Commutable*/1>,
Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; } // Predicates = [HasStdExtZbc] let Predicates = [HasStdExtZbcOrZbkc] in { -def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, +def CLMUL : ALU_rr<0b0000101, 0b001, "clmul", /*Commutable*/1>, Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; -def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, +def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", /*Commutable*/1>, Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; } // Predicates = [HasStdExtZbcOrZbkc] let Predicates = [HasStdExtZbb] in { -def MIN : ALU_rr<0b0000101, 0b100, "min">, +def MIN : ALU_rr<0b0000101, 0b100, "min", /*Commutable*/1>, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def MINU : ALU_rr<0b0000101, 0b101, "minu">, +def MINU : ALU_rr<0b0000101, 0b101, "minu", /*Commutable*/1>, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def MAX : ALU_rr<0b0000101, 0b110, "max">, +def MAX : ALU_rr<0b0000101, 0b110, "max", /*Commutable*/1>, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, +def MAXU : ALU_rr<0b0000101, 0b111, "maxu", /*Commutable*/1>, Sched<[WriteIALU, ReadIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbp] in { -} // Predicates = [HasStdExtZbp] - let Predicates = [HasStdExtZbe] in { // NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with // bext in the 0.93 spec. -def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>; -def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>; +def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, + Sched<[WriteDecompress, ReadDecompress, ReadDecompress]>; +def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, + Sched<[WriteCompress, ReadCompress, ReadCompress]>; } // Predicates = [HasStdExtZbe] let Predicates = [HasStdExtZbe, IsRV64] in { // NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with // bextw in the 0.93 spec. 
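The bcompress/bdecompress pair named in the notes above behaves like x86 PEXT/PDEP: one gathers the rs1 bits selected by the rs2 mask into the low-order bits, the other scatters low-order bits of rs1 back out to the mask positions. A reference sketch of the semantics as described in the 0.94 draft (not LLVM code):

#include <cstdint>

static uint64_t bcompress(uint64_t rs1, uint64_t rs2) {
  uint64_t res = 0;
  for (unsigned i = 0, j = 0; i < 64; ++i)
    if ((rs2 >> i) & 1)                 // mask bit set: gather this bit
      res |= ((rs1 >> i) & 1) << j++;
  return res;
}

static uint64_t bdecompress(uint64_t rs1, uint64_t rs2) {
  uint64_t res = 0;
  for (unsigned i = 0, j = 0; i < 64; ++i)
    if ((rs2 >> i) & 1)                 // mask bit set: deposit next bit here
      res |= ((rs1 >> j++) & 1) << i;
  return res;
}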
-def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>; -def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>; +def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, + Sched<[WriteDecompress32, ReadDecompress32, ReadDecompress32]>; +def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, + Sched<[WriteCompress32, ReadCompress32, ReadCompress32]>; } // Predicates = [HasStdExtZbe, IsRV64] let Predicates = [HasStdExtZbpOrZbkb] in { -def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>; -def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>; +def PACK : ALU_rr<0b0000100, 0b100, "pack">, + Sched<[WritePACK, ReadPACK, ReadPACK]>; +def PACKH : ALU_rr<0b0000100, 0b111, "packh">, + Sched<[WritePACK, ReadPACK, ReadPACK]>; } // Predicates = [HasStdExtZbpOrZbkb] let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in -def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>; +def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, + Sched<[WritePACK32, ReadPACK32, ReadPACK32]>; let Predicates = [HasStdExtZbp] in -def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>; +def PACKU : ALU_rr<0b0100100, 0b100, "packu">, + Sched<[WritePACKU, ReadPACKU, ReadPACKU]>; let Predicates = [HasStdExtZbp, IsRV64] in -def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>; +def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, + Sched<[WritePACKU32, ReadPACKU32, ReadPACKU32]>; let Predicates = [HasStdExtZbm, IsRV64] in { def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">, - Sched<[]>; + Sched<[WriteBMatrix, ReadBMatrix]>; -def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>; -def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>; +def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, + Sched<[WriteBMatrix, ReadBMatrix, ReadBMatrix]>; +def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, + Sched<[WriteBMatrix, ReadBMatrix, ReadBMatrix]>; } // Predicates = [HasStdExtZbm, IsRV64] let Predicates = [HasStdExtZbf] in @@ -601,12 +636,15 @@ def ORC_B : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">, } // Predicates = [HasStdExtZbbOrZbp] let Predicates = [HasStdExtZbpOrZbkb] in -def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">; +def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">, + Sched<[]>; let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in { -def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">; -def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">; -} // Predicates = [HasStdExtZbkb, IsRV32] +def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">, + Sched<[]>; +def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">, + Sched<[]>; +} // Predicates = [HasStdExtZbpOrZbkb, IsRV32] //===----------------------------------------------------------------------===// @@ -615,7 +653,7 @@ def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">; let Predicates = [HasStdExtZba, IsRV64] in { def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>; -} +} // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbp] in { def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>; @@ -780,8 +818,8 @@ def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbbOrZbpOrZbkb] let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { -def : PatGprGpr; -def : PatGprGpr; +def : PatGprGpr, ROL>; +def : PatGprGpr, ROR>; def : PatGprImm; // There's no encoding for roli in the the 'B' extension 
as it can be @@ -791,8 +829,8 @@ def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt), } // Predicates = [HasStdExtZbbOrZbpOrZbkb] let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { -def : PatGprGpr; -def : PatGprGpr; +def : PatGprGpr, ROLW>; +def : PatGprGpr, RORW>; def : PatGprImm; def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>; @@ -843,23 +881,25 @@ def : Pat<(and GPR:$r, BCLRITwoBitsMask:$i), def : Pat<(and GPR:$r, BCLRIANDIMask:$i), (BCLRI (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i)), (BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>; -} +} // Predicates = [HasStdExtZbs] let Predicates = [HasStdExtZbbOrZbp] in { // We treat orc.b as a separate instruction, so match it directly. We also // lower the Zbb orc.b intrinsic to this. def : Pat<(riscv_gorc GPR:$rs1, 7), (ORC_B GPR:$rs1)>; -} +} // Predicates = [HasStdExtZbbOrZbp] let Predicates = [HasStdExtZbpOrZbkb] in { // We treat brev8 as a separate instruction, so match it directly. We also // use this for brev8 when lowering bitreverse with Zbkb. def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>; +} // Predicates = [HasStdExtZbpOrZbkb] +let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in { // We treat zip and unzip as separate instructions, so match it directly. def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>; def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>; -} +} // Predicates = [HasStdExtZbpOrZbkb, IsRV32] let Predicates = [HasStdExtZbp] in { def : PatGprGpr; @@ -880,12 +920,16 @@ def : PatGprGpr; let Predicates = [HasStdExtZbp, IsRV64] in { def : PatGprGpr; def : PatGprGpr; -def : PatGprImm; -def : PatGprImm; -// FIXME: Move to DAG combine. -def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; -def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; +// Select GREVIW/GORCIW when the immediate doesn't have bit 5 set and the result +// is sign extended. +// FIXME: Two special patterns kept when Imm is 7. +def : Pat<(i64 (sext_inreg (binop_oneuse GPR:$rs1, 7), i32)), + (GREVIW GPR:$rs1, 7)>; +def : Pat<(i64 (sext_inreg (binop_oneuse GPR:$rs1, 7), i32)), + (GORCIW GPR:$rs1, 7)>; +def : PatGprImm, GREVIW, uimm5>; +def : PatGprImm, GORCIW, uimm5>; def : PatGprGpr; def : PatGprGpr; @@ -895,10 +939,6 @@ let Predicates = [HasStdExtZbp, IsRV64] in def : PatGprGpr; let Predicates = [HasStdExtZbp, IsRV32] in { -// FIXME : Move to DAG combine. -def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; -def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; - // We treat rev8 as a separate instruction, so match it directly.
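The grev patterns here follow from the generalized-reverse butterfly: each set bit of the control swaps blocks of the matching size, so control 7 reverses the bits within every byte (brev8) and control 24 swaps the bytes of a 32-bit value (rev8 on RV32). A 32-bit sketch of the semantics as given in the Zbp drafts:

#include <cstdint>

static uint32_t grev32(uint32_t x, unsigned k) {
  if (k & 1)  x = ((x & 0x55555555u) << 1)  | ((x >> 1)  & 0x55555555u);
  if (k & 2)  x = ((x & 0x33333333u) << 2)  | ((x >> 2)  & 0x33333333u);
  if (k & 4)  x = ((x & 0x0F0F0F0Fu) << 4)  | ((x >> 4)  & 0x0F0F0F0Fu);
  if (k & 8)  x = ((x & 0x00FF00FFu) << 8)  | ((x >> 8)  & 0x00FF00FFu);
  if (k & 16) x = (x << 16) | (x >> 16);
  return x;
}
// grev32(x, 7) == brev8; grev32(x, 24) == byte swap (rev8 on RV32).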
def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>; } // Predicates = [HasStdExtZbp, IsRV32] @@ -911,6 +951,8 @@ def : Pat<(i64 (riscv_grev GPR:$rs1, 56)), (REV8_RV64 GPR:$rs1)>; let Predicates = [HasStdExtZbt] in { def : Pat<(or (and (not GPR:$rs2), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)), (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(xor (and (xor GPR:$rs1, GPR:$rs3), GPR:$rs2), GPR:$rs3), + (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>; def : Pat<(select (XLenVT (setne GPR:$rs2, 0)), GPR:$rs1, GPR:$rs3), (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; @@ -932,6 +974,13 @@ def : Pat<(select (XLenVT (setge GPR:$x, GPR:$y)), GPR:$rs3, GPR:$rs1), (CMOV GPR:$rs1, (SLT GPR:$x, GPR:$y), GPR:$rs3)>; def : Pat<(select (XLenVT (setle GPR:$y, GPR:$x)), GPR:$rs3, GPR:$rs1), (CMOV GPR:$rs1, (SLT GPR:$x, GPR:$y), GPR:$rs3)>; + +// setge X, Imm is canonicalized to setgt X, (Imm - 1). +def : Pat<(select (XLenVT (setgt GPR:$x, simm12_minus1_nonzero:$imm)), GPR:$rs3, GPR:$rs1), + (CMOV GPR:$rs1, (SLTI GPR:$x, (ImmPlus1 simm12_minus1_nonzero:$imm)), GPR:$rs3)>; +def : Pat<(select (XLenVT (setugt GPR:$x, simm12_minus1_nonzero:$imm)), GPR:$rs3, GPR:$rs1), + (CMOV GPR:$rs1, (SLTIU GPR:$x, (ImmPlus1 simm12_minus1_nonzero:$imm)), GPR:$rs3)>; + def : Pat<(select GPR:$rs2, GPR:$rs1, GPR:$rs3), (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; } // Predicates = [HasStdExtZbt] @@ -977,7 +1026,7 @@ def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; let Predicates = [HasStdExtZbb] in { def : Pat<(sext_inreg GPR:$rs1, i8), (SEXT_B GPR:$rs1)>; def : Pat<(sext_inreg GPR:$rs1, i16), (SEXT_H GPR:$rs1)>; -} +} // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbb] in { def : PatGprGpr; @@ -1018,7 +1067,7 @@ def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)), def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), (and GPR:$rs1, 0x000000000000FFFF))), (PACKW GPR:$rs1, GPR:$rs2)>; -} +} // Predicates = [HasStdExtZbpOrZbkb, IsRV64] let Predicates = [HasStdExtZbp, IsRV32] in def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))), @@ -1031,19 +1080,13 @@ def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))) def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))), (PACKUW GPR:$rs1, GPR:$rs2)>; -} +} // Predicates = [HasStdExtZbp, IsRV64] let Predicates = [HasStdExtZbbOrZbp, IsRV32] in def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV32 GPR:$rs)>; let Predicates = [HasStdExtZbbOrZbp, IsRV64] in def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV64 GPR:$rs)>; -// Pattern to exclude simm12 immediates from matching. -def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{ - auto *C = dyn_cast(N); - return !C || !isInt<12>(C->getSExtValue()); -}]>; - let Predicates = [HasStdExtZba] in { def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), non_imm12:$rs2), (SH1ADD GPR:$rs1, GPR:$rs2)>; @@ -1132,6 +1175,33 @@ def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 2)), 0x3FFFFFFFF), non_imm12:$rs2)) (SH2ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2)), (SH3ADD_UW GPR:$rs1, GPR:$rs2)>; + +def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFFE), non_imm12:$rs2)), + (SH1ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; +def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFFC), non_imm12:$rs2)), + (SH2ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; +def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFF8), non_imm12:$rs2)), + (SH3ADD (SRLIW GPR:$rs1, 3), GPR:$rs2)>; + +// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift. 
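The SRLI + SHXADD_UW patterns below rest on a simple identity: a mask like 0x1FFFFFFFE keeps bits 1..32 of rs1, which is exactly what a right shift by 1 followed by sh1add.uw reproduces, since sh1add.uw computes (zext32(rs1) << 1) + rs2. A sketch of the identity being matched (helper names are illustrative):

#include <cstdint>

static uint64_t sh1add_uw(uint64_t rs1, uint64_t rs2) {
  return ((rs1 & 0xFFFFFFFFu) << 1) + rs2; // (zext32(rs1) << 1) + rs2
}

static uint64_t lowered(uint64_t x, uint64_t y) {
  return sh1add_uw(x >> 1, y);             // == (x & 0x1FFFFFFFE) + y
}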
+def : Pat<(i64 (add (and GPR:$rs1, 0x1FFFFFFFE), non_imm12:$rs2)), + (SH1ADD_UW (SRLI GPR:$rs1, 1), GPR:$rs2)>; +def : Pat<(i64 (add (and GPR:$rs1, 0x3FFFFFFFC), non_imm12:$rs2)), + (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>; +def : Pat<(i64 (add (and GPR:$rs1, 0x7FFFFFFF8), non_imm12:$rs2)), + (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>; + +// Use SRLIW to shift out the LSBs and zero the upper 32 bits. Use SHXADD to +// shift zeros back into the LSBs, matching the shl amount of the addition. +def : Pat<(i64 (add (shl (binop_oneuse GPR:$rs1, 0xFFFFFFFE), (i64 1)), + non_imm12:$rs2)), + (SH2ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; +def : Pat<(i64 (add (shl (binop_oneuse GPR:$rs1, 0xFFFFFFFE), (i64 2)), + non_imm12:$rs2)), + (SH3ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; +def : Pat<(i64 (add (shl (binop_oneuse GPR:$rs1, 0xFFFFFFFC), (i64 1)), + non_imm12:$rs2)), + (SH3ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { @@ -1175,4 +1245,4 @@ def : PatGprGpr; let Predicates = [HasStdExtZbkx] in { def : PatGprGpr; def : PatGprGpr; -} +} // Predicates = [HasStdExtZbkx] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index a2753c132354..5a4366b0908c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -17,13 +17,71 @@ def SDT_RISCVFMV_H_X : SDTypeProfile<1, 1, [SDTCisVT<0, f16>, SDTCisVT<1, XLenVT>]>; -def SDT_RISCVFMV_X_ANYEXTH +def SDT_RISCVFMV_X_EXTH : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, f16>]>; def riscv_fmv_h_x : SDNode<"RISCVISD::FMV_H_X", SDT_RISCVFMV_H_X>; def riscv_fmv_x_anyexth - : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_ANYEXTH>; + : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_EXTH>; +def riscv_fmv_x_signexth + : SDNode<"RISCVISD::FMV_X_SIGNEXTH", SDT_RISCVFMV_X_EXTH>; + +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===// + +// Zhinxmin and Zhinx + +def FPR16INX : RegisterOperand { + let ParserMatchClass = GPRAsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +def ZfhExt : ExtInfo<0, [HasStdExtZfh]>; +def Zfh64Ext : ExtInfo<0, [HasStdExtZfh, IsRV64]>; +def ZfhminExt : ExtInfo<0, [HasStdExtZfhOrZfhmin]>; +def ZhinxExt : ExtInfo<1, [HasStdExtZhinx]>; +def ZhinxminExt : ExtInfo<1, [HasStdExtZhinxOrZhinxmin]>; +def Zhinx64Ext : ExtInfo<1, [HasStdExtZhinx, IsRV64]>; + +def ZfhminDExt : ExtInfo<0, [HasStdExtZfhOrZfhmin, HasStdExtD]>; +def ZhinxminZdinxExt : ExtInfo<1, [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx]>; + +def H : ExtInfo_r; +def H_INX : ExtInfo_r; + +def HH : ExtInfo_rr; +def HH_INX : ExtInfo_rr; +def XH : ExtInfo_rr; +def XH_INX : ExtInfo_rr; +def HX : ExtInfo_rr; +def HX_INX : ExtInfo_rr; +def XH_64 : ExtInfo_rr; +def HX_64 : ExtInfo_rr; +def XH_INX_64 : ExtInfo_rr; +def HX_INX_64 : ExtInfo_rr; +def HFmin : ExtInfo_rr; +def HF_INXmin : ExtInfo_rr; +def HF_INX : ExtInfo_rr; +def FHmin : ExtInfo_rr; +def FH_INXmin : ExtInfo_rr; +def FH_INX : ExtInfo_rr; +def DHmin : ExtInfo_rr; +def DH_INXmin : ExtInfo_rr; +def HDmin : ExtInfo_rr; +def HD_INXmin : ExtInfo_rr; + +defvar HINX = [H, H_INX]; +defvar HHINX = [HH, HH_INX]; +defvar XHINX = [XH, XH_INX]; +defvar HXINX = [HX, HX_INX]; +defvar XHIN64X = [XH_64, XH_INX_64]; +defvar HXIN64X = [HX_64, HX_INX_64]; +defvar HFINXmin = [HFmin, HF_INXmin]; +defvar FHINXmin = [FHmin, FH_INXmin]; +defvar DHINXmin = [DHmin, DH_INXmin]; +defvar HDINXmin = [HDmin, HD_INXmin]; //===----------------------------------------------------------------------===// // Instructions @@ -38,74 +96,73 @@ def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>; } // Predicates = [HasStdExtZfhOrZfhmin] -let Predicates = [HasStdExtZfh] in { let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { -def FMADD_H : FPFMA_rrr_frm; -def FMSUB_H : FPFMA_rrr_frm; -def FNMSUB_H : FPFMA_rrr_frm; -def FNMADD_H : FPFMA_rrr_frm; +defm FMADD_H : FPFMA_rrr_frm_m; +defm FMSUB_H : FPFMA_rrr_frm_m; +defm FNMSUB_H : FPFMA_rrr_frm_m; +defm FNMADD_H : FPFMA_rrr_frm_m; +} + +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU16, ReadFALU16, ReadFALU16] in { +defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX, /*Commutable*/1>; +defm FSUB_H : FPALU_rr_frm_m<0b0000110, "fsub.h", HINX>; } +let SchedRW = [WriteFMul16, ReadFMul16, ReadFMul16] in +defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX, /*Commutable*/1>; + +let SchedRW = [WriteFDiv16, ReadFDiv16, ReadFDiv16] in +defm FDIV_H : FPALU_rr_frm_m<0b0001110, "fdiv.h", HINX>; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_H : FPALU_rr_frm<0b0000010, "fadd.h", FPR16>, - Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def FSUB_H : FPALU_rr_frm<0b0000110, "fsub.h", FPR16>, - Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def FMUL_H : FPALU_rr_frm<0b0001010, "fmul.h", FPR16>, - Sched<[WriteFMul16, ReadFMul16, ReadFMul16]>; -def FDIV_H : FPALU_rr_frm<0b0001110, "fdiv.h", FPR16>, - Sched<[WriteFDiv16, ReadFDiv16, ReadFDiv16]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_H : FPUnaryOp_r_frm<0b0101110, 0b00000, FPR16, FPR16, "fsqrt.h">, - Sched<[WriteFSqrt16, ReadFSqrt16]>; -def : FPUnaryOpDynFrmAlias; 
+defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_H : FPUnaryOp_r_frm_m<0b0101110, 0b00000, HHINX, "fsqrt.h">, + Sched<[WriteFSqrt16, ReadFSqrt16]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16], mayRaiseFPException = 0 in { -def FSGNJ_H : FPALU_rr<0b0010010, 0b000, "fsgnj.h", FPR16>; -def FSGNJN_H : FPALU_rr<0b0010010, 0b001, "fsgnjn.h", FPR16>; -def FSGNJX_H : FPALU_rr<0b0010010, 0b010, "fsgnjx.h", FPR16>; +defm FSGNJ_H : FPALU_rr_m<0b0010010, 0b000, "fsgnj.h", HINX>; +defm FSGNJN_H : FPALU_rr_m<0b0010010, 0b001, "fsgnjn.h", HINX>; +defm FSGNJX_H : FPALU_rr_m<0b0010010, 0b010, "fsgnjx.h", HINX>; } let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { -def FMIN_H : FPALU_rr<0b0010110, 0b000, "fmin.h", FPR16>; -def FMAX_H : FPALU_rr<0b0010110, 0b001, "fmax.h", FPR16>; +defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX, /*Commutable*/1>; +defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX, /*Commutable*/1>; } -def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, 0b00000, GPR, FPR16, "fcvt.w.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, 0b00001, GPR, FPR16, "fcvt.wu.h">, +defm FCVT_W_H : FPUnaryOp_r_frm_m<0b1100010, 0b00000, XHINX, "fcvt.w.h">, Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, 0b00000, FPR16, GPR, "fcvt.h.w">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_WU_H : FPUnaryOp_r_frm_m<0b1100010, 0b00001, XHINX, "fcvt.wu.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">, +defm FCVT_H_W : FPUnaryOp_r_frm_m<0b1101010, 0b00000, HXINX, "fcvt.h.w">, Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtZfh] +defm : FPUnaryOpDynFrmAlias_m; -let Predicates = [HasStdExtZfhOrZfhmin] in { -def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">, - Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_H_WU : FPUnaryOp_r_frm_m<0b1101010, 0b00001, HXINX, "fcvt.h.wu">, + Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b00010, 0b000, FPR32, FPR16, "fcvt.s.h">, +defm FCVT_H_S : FPUnaryOp_r_frm_m<0b0100010, 0b00000, HFINXmin, "fcvt.h.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_H : FPUnaryOp_r_m<0b0100000, 0b00010, 0b000, FHINXmin, "fcvt.s.h">, Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; +let Predicates = [HasStdExtZfhOrZfhmin] in { let mayRaiseFPException = 0 in def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">, Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]>; @@ -115,45 +172,38 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>; } // Predicates = [HasStdExtZfhOrZfhmin] -let Predicates = [HasStdExtZfh] in { - let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { -def FEQ_H : FPCmp_rr<0b1010010, 0b010, "feq.h", FPR16>; -def FLT_H : FPCmp_rr<0b1010010, 0b001, "flt.h", FPR16>; -def FLE_H : FPCmp_rr<0b1010010, 0b000, "fle.h", FPR16>; +defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX, /*Commutable*/1>; +defm 
FLT_H : FPCmp_rr_m<0b1010010, 0b001, "flt.h", HINX>; +defm FLE_H : FPCmp_rr_m<0b1010010, 0b000, "fle.h", HINX>; } let mayRaiseFPException = 0 in -def FCLASS_H : FPUnaryOp_r<0b1110010, 0b00000, 0b001, GPR, FPR16, "fclass.h">, - Sched<[WriteFClass16, ReadFClass16]>; -} // Predicates = [HasStdExtZfh] - -let Predicates = [HasStdExtZfh, IsRV64] in { -def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, 0b00010, GPR, FPR16, "fcvt.l.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm FCLASS_H : FPUnaryOp_r_m<0b1110010, 0b00000, 0b001, XHINX, "fclass.h">, + Sched<[WriteFClass16, ReadFClass16]>; -def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, 0b00011, GPR, FPR16, "fcvt.lu.h">, +defm FCVT_L_H : FPUnaryOp_r_frm_m<0b1100010, 0b00010, XHIN64X, "fcvt.l.h">, Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, 0b00010, FPR16, GPR, "fcvt.h.l">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_LU_H : FPUnaryOp_r_frm_m<0b1100010, 0b00011, XHIN64X, "fcvt.lu.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">, +defm FCVT_H_L : FPUnaryOp_r_frm_m<0b1101010, 0b00010, HXIN64X, "fcvt.h.l">, Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtZfh, IsRV64] +defm : FPUnaryOpDynFrmAlias_m; -let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in { -def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">, - Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_H_LU : FPUnaryOp_r_frm_m<0b1101010, 0b00011, HXIN64X, "fcvt.h.lu">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">, - Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; -} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] +defm FCVT_H_D : FPUnaryOp_r_frm_m<0b0100010, 0b00001, HDINXmin, "fcvt.h.d">, + Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_D_H : FPUnaryOp_r_m<0b0100001, 0b00010, 0b000, DHINXmin, "fcvt.d.h">, + Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -186,17 +236,21 @@ def PseudoQuietFLT_H : PseudoQuietFCMP; } } // Predicates = [HasStdExtZfhOrZfhmin] +let Predicates = [HasStdExtZhinx] in { +def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; +def : InstAlias<"fabs.h $rd, $rs", (FSGNJX_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; +def : InstAlias<"fneg.h $rd, $rs", (FSGNJN_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; + +def : InstAlias<"fgt.h $rd, $rs, $rt", + (FLT_H_INX GPR:$rd, FPR16INX:$rt, FPR16INX:$rs), 0>; +def : InstAlias<"fge.h $rd, $rs, $rt", + (FLE_H_INX GPR:$rd, FPR16INX:$rt, FPR16INX:$rs), 0>; +} // Predicates = [HasStdExtZhinx] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// -/// Generic pattern classes -class PatFpr16Fpr16 - : Pat<(OpNode FPR16:$rs1, FPR16:$rs2), (Inst $rs1, $rs2)>; - -class PatFpr16Fpr16DynFrm - : Pat<(OpNode FPR16:$rs1, FPR16:$rs2), (Inst 
$rs1, $rs2, 0b111)>; - let Predicates = [HasStdExtZfh] in { /// Float constants @@ -210,17 +264,17 @@ def : Pat<(f16 (fpimmneg0)), (FSGNJN_H (FMV_H_X X0), (FMV_H_X X0))>; /// Float arithmetic operations -def : PatFpr16Fpr16DynFrm; -def : PatFpr16Fpr16DynFrm; -def : PatFpr16Fpr16DynFrm; -def : PatFpr16Fpr16DynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; +def : PatFprFprDynFrm; def : Pat<(any_fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>; def : Pat<(fneg FPR16:$rs1), (FSGNJN_H $rs1, $rs1)>; def : Pat<(fabs FPR16:$rs1), (FSGNJX_H $rs1, $rs1)>; -def : PatFpr16Fpr16; +def : PatFprFpr; def : Pat<(fcopysign FPR16:$rs1, (fneg FPR16:$rs2)), (FSGNJN_H $rs1, $rs2)>; def : Pat<(fcopysign FPR16:$rs1, FPR32:$rs2), (FSGNJ_H $rs1, (FCVT_H_S $rs2, 0b111))>; @@ -242,11 +296,15 @@ def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3), def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)), (FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; +// fnmadd: -(rs1 * rs2 + rs3) (the nsz flag on the FMA) +def : Pat<(fneg (any_fma_nsz FPR16:$rs1, FPR16:$rs2, FPR16:$rs3)), + (FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; + // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches // LLVM's fminnum and fmaxnum // . -def : PatFpr16Fpr16; -def : PatFpr16Fpr16; +def : PatFprFpr; +def : PatFprFpr; /// Setcc // FIXME: SETEQ/SETLT/SETLE imply nonans, can we pick better instructions for @@ -299,6 +357,7 @@ def : Pat<(any_fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; // Moves (no conversion) def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>; def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>; +def : Pat<(riscv_fmv_x_signexth FPR16:$src), (FMV_X_H FPR16:$src)>; } // Predicates = [HasStdExtZfhOrZfhmin] let Predicates = [HasStdExtZfh, IsRV32] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td new file mode 100644 index 000000000000..57fd74b0c0fe --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td @@ -0,0 +1,71 @@ +//===-- RISCVInstrInfoZicbo.td - RISC-V CMO instructions ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard Base Cache +// Management Operation ISA Extensions document (Zicbom, Zicboz, and Zicbop). +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand definitions. +//===----------------------------------------------------------------------===// + +// A 12-bit signed immediate where the least significant five bits are zero.
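In other words, the operand definition that follows accepts exactly the multiples of 32 that still fit in a signed 12-bit field, i.e. offsets in [-2048, 2016]. A standalone sketch of the isShiftedInt<7, 5> test it relies on (plain C++, not the llvm::isShiftedInt template itself):

#include <cstdint>

static bool isValidPrefetchOffset(int64_t off) {
  bool lsbsClear = (off & 31) == 0;              // low five bits must be zero
  bool fitsSimm12 = off >= -2048 && off < 2048;  // signed 12-bit range
  return lsbsClear && fitsSimm12;
}
// e.g. prefetch.r 64(a0) is encodable; prefetch.r 48(a0) is not.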
+def simm12_lsb00000 : Operand, + ImmLeaf(Imm);}]> { + let ParserMatchClass = SImmAsmOperand<12, "Lsb00000">; + let EncoderMethod = "getImmOpValue"; + let DecoderMethod = "decodeSImmOperand<12>"; + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedInt<7, 5>(Imm); + return MCOp.isBareSymbolRef(); + }]; + let OperandType = "OPERAND_SIMM12_LSB00000"; + let OperandNamespace = "RISCVOp"; +} + +//===----------------------------------------------------------------------===// +// Instruction Class Templates +//===----------------------------------------------------------------------===// +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in +class CBO_r optype, string opcodestr> + : RVInstI<0b010, OPC_MISC_MEM, (outs), (ins GPRMemZeroOffset:$rs1), + opcodestr, "$rs1"> { + let imm12 = optype; + let rd = 0b00000; +} + +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in +class Prefetch_ri optype, string opcodestr> + : RVInstS<0b110, OPC_OP_IMM, (outs), (ins GPR:$rs1, simm12_lsb00000:$imm12), + opcodestr, "${imm12}(${rs1})"> { + let Inst{11-7} = 0b00000; + let rs2 = optype; +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtZicbom] in { +def CBO_CLEAN : CBO_r<0b000000000001, "cbo.clean">, Sched<[]>; +def CBO_FLUSH : CBO_r<0b000000000010, "cbo.flush">, Sched<[]>; +def CBO_INVAL : CBO_r<0b000000000000, "cbo.inval">, Sched<[]>; +} // Predicates = [HasStdExtZicbom] + +let Predicates = [HasStdExtZicboz] in { +def CBO_ZERO : CBO_r<0b000000000100, "cbo.zero">, Sched<[]>; +} // Predicates = [HasStdExtZicboz] + +let Predicates = [HasStdExtZicbop] in { +def PREFETCH_I : Prefetch_ri<0b00000, "prefetch.i">, Sched<[]>; +def PREFETCH_R : Prefetch_ri<0b00001, "prefetch.r">, Sched<[]>; +def PREFETCH_W : Prefetch_ri<0b00011, "prefetch.w">, Sched<[]>; +} // Predicates = [HasStdExtZicbop] diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp index c167c095521a..c457a95544cf 100644 --- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp +++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp @@ -87,7 +87,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, return MCOperand::createExpr(ME); } -bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool llvm::lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP) { switch (MO.getType()) { @@ -145,6 +145,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + assert(TRI && "TargetRegisterInfo expected"); uint64_t TSFlags = MI->getDesc().TSFlags; @@ -158,12 +159,16 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, if (RISCVII::hasSEWOp(TSFlags)) --NumOps; + bool hasVLOutput = RISCV::isFaultFirstLoad(*MI); for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) { const MachineOperand &MO = MI->getOperand(OpNo); + // Skip vl output. It should be the second output. + if (hasVLOutput && OpNo == 1) + continue; // Skip merge op. It should be the first operand after the result.
- if (RISCVII::hasMergeOp(TSFlags) && OpNo == 1) { - assert(MI->getNumExplicitDefs() == 1); + if (RISCVII::hasMergeOp(TSFlags) && OpNo == 1U + hasVLOutput) { + assert(MI->getNumExplicitDefs() == 1U + hasVLOutput); continue; } @@ -214,7 +219,7 @@ bool llvm::lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) + if (lowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) OutMI.addOperand(MCOp); } diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.cpp b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.cpp new file mode 100644 index 000000000000..8cb046bcfbb6 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.cpp @@ -0,0 +1,37 @@ +//=- RISCVMachineFunctionInfo.cpp - RISCV machine function info ---*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares RISCV-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#include "RISCVMachineFunctionInfo.h" + +using namespace llvm; + +yaml::RISCVMachineFunctionInfo::RISCVMachineFunctionInfo( + const llvm::RISCVMachineFunctionInfo &MFI) + : VarArgsFrameIndex(MFI.getVarArgsFrameIndex()), + VarArgsSaveSize(MFI.getVarArgsSaveSize()) {} + +MachineFunctionInfo *RISCVMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} + +void yaml::RISCVMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits::mapping(YamlIO, *this); +} + +void RISCVMachineFunctionInfo::initializeBaseYamlFields( + const yaml::RISCVMachineFunctionInfo &YamlMFI) { + VarArgsFrameIndex = YamlMFI.VarArgsFrameIndex; + VarArgsSaveSize = YamlMFI.VarArgsSaveSize; +} diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index b5609e9a3890..622767540d99 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -14,11 +14,34 @@ #define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H #include "RISCVSubtarget.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" namespace llvm { +class RISCVMachineFunctionInfo; + +namespace yaml { +struct RISCVMachineFunctionInfo final : public yaml::MachineFunctionInfo { + int VarArgsFrameIndex; + int VarArgsSaveSize; + + RISCVMachineFunctionInfo() = default; + RISCVMachineFunctionInfo(const llvm::RISCVMachineFunctionInfo &MFI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~RISCVMachineFunctionInfo() = default; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, RISCVMachineFunctionInfo &MFI) { + YamlIO.mapOptional("varArgsFrameIndex", MFI.VarArgsFrameIndex); + YamlIO.mapOptional("varArgsSaveSize", MFI.VarArgsSaveSize); + } +}; +} // end namespace yaml + /// RISCVMachineFunctionInfo - This class is derived from MachineFunctionInfo /// and contains private RISCV-specific information for each MachineFunction. 
class RISCVMachineFunctionInfo : public MachineFunctionInfo { @@ -34,6 +57,8 @@ private: unsigned LibCallStackSize = 0; /// Size of RVV stack. uint64_t RVVStackSize = 0; + /// Alignment of RVV stack. + Align RVVStackAlign; /// Padding required to keep RVV stack aligned within the main stack. uint64_t RVVPadding = 0; /// Size of stack frame to save callee saved registers @@ -42,6 +67,11 @@ private: public: RISCVMachineFunctionInfo(const MachineFunction &MF) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } @@ -69,11 +99,16 @@ public: uint64_t getRVVStackSize() const { return RVVStackSize; } void setRVVStackSize(uint64_t Size) { RVVStackSize = Size; } + Align getRVVStackAlign() const { return RVVStackAlign; } + void setRVVStackAlign(Align StackAlign) { RVVStackAlign = StackAlign; } + uint64_t getRVVPadding() const { return RVVPadding; } void setRVVPadding(uint64_t Padding) { RVVPadding = Padding; } unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + + void initializeBaseYamlFields(const yaml::RISCVMachineFunctionInfo &YamlMFI); }; } // end namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp new file mode 100644 index 000000000000..3b9177bc1635 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp @@ -0,0 +1,67 @@ +//===- RISCVMacroFusion.cpp - RISCV Macro Fusion --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the RISCV implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// +// +#include "RISCVMacroFusion.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +using namespace llvm; + +// Fuse LUI followed by ADDI or ADDIW. +// rd = imm[31:0] which decomposes to +// lui rd, imm[31:12] +// addi(w) rd, rd, imm[11:0] +static bool isLUIADDI(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + if (SecondMI.getOpcode() != RISCV::ADDI && + SecondMI.getOpcode() != RISCV::ADDIW) + return false; + + // Assume the 1st instr to be a wildcard if it is unspecified. + if (!FirstMI) + return true; + + if (FirstMI->getOpcode() != RISCV::LUI) + return false; + + // The first operand of ADDI might be a frame index. + if (!SecondMI.getOperand(1).isReg()) + return false; + + Register FirstDest = FirstMI->getOperand(0).getReg(); + + // Destination of LUI should be the ADDI(W) source register. + if (SecondMI.getOperand(1).getReg() != FirstDest) + return false; + + // If the FirstMI destination is non-virtual, it should match the SecondMI + // destination. 
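+  // Illustrative example (assumed, not taken from this patch): materializing
+  // 0x12345678 as "lui a0, 0x12345" + "addiw a0, a0, 0x678" writes the same
+  // destination in both halves, so the pair is a fusion candidate.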
+  return FirstDest.isVirtual() || SecondMI.getOperand(0).getReg() == FirstDest;
+}
+
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr &SecondMI) {
+  const RISCVSubtarget &ST = static_cast<const RISCVSubtarget &>(TSI);
+
+  if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI))
+    return true;
+
+  return false;
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createRISCVMacroFusionDAGMutation() {
+  return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.h b/llvm/lib/Target/RISCV/RISCVMacroFusion.h
new file mode 100644
index 000000000000..c238dacc37f6
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.h
@@ -0,0 +1,28 @@
+//===- RISCVMacroFusion.h - RISCV Macro Fusion ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the RISCV definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H
+#define LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+///   DAG.addMutation(createRISCVMacroFusionDAGMutation());
+/// to RISCVPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createRISCVMacroFusionDAGMutation();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
new file mode 100644
index 000000000000..1fc424411c12
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
@@ -0,0 +1,382 @@
+//===-- RISCVMakeCompressible.cpp - Make more instructions compressible ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass searches for instructions that are prevented from being compressed
+// by one of the following:
+//
+//   1. The use of a single uncompressed register.
+//   2. A base register + offset where the offset is too large to be compressed
+//      and the base register may or may not be compressed.
+//
+//
+// For case 1, if a compressed register is available, then the uncompressed
+// register is copied to the compressed register and its uses are replaced.
+//
+// For example, storing zero uses the uncompressible zero register:
+//   sw zero, 0(a0)   # if zero
+//   sw zero, 8(a0)   # if zero
+//   sw zero, 4(a0)   # if zero
+//   sw zero, 24(a0)  # if zero
+//
+// If a compressed register (e.g. a1) is available, the above can be transformed
+// to the following to improve code size:
+//   li a1, 0
+//   c.sw a1, 0(a0)
+//   c.sw a1, 8(a0)
+//   c.sw a1, 4(a0)
+//   c.sw a1, 24(a0)
+//
+//
+// For case 2, if a compressed register is available, then the original base
+// is copied and adjusted such that:
+//
+//   new_base_register = base_register + adjustment
+//   base_register + large_offset = new_base_register + small_offset
+//
+// For example, the following offsets are too large for c.sw:
+//   lui a2, 983065
+//   sw a1, -236(a2)
+//   sw a1, -240(a2)
+//   sw a1, -244(a2)
+//   sw a1, -248(a2)
+//   sw a1, -252(a2)
+//   sw a0, -256(a2)
+//
+// If a compressed register is available (e.g. a3), a new base could be created
+// such that the addresses can be accessed with a compressible offset, thus
+// improving code size:
+//   lui a2, 983065
+//   addi a3, a2, -256
+//   c.sw a1, 20(a3)
+//   c.sw a1, 16(a3)
+//   c.sw a1, 12(a3)
+//   c.sw a1, 8(a3)
+//   c.sw a1, 4(a3)
+//   c.sw a0, 0(a3)
+//
+//
+// This optimization is only applied if there are enough uses of the copied
+// register for code size to be reduced.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-make-compressible"
+#define RISCV_COMPRESS_INSTRS_NAME "RISCV Make Compressible"
+
+namespace {
+
+struct RISCVMakeCompressibleOpt : public MachineFunctionPass {
+  static char ID;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  RISCVMakeCompressibleOpt() : MachineFunctionPass(ID) {
+    initializeRISCVMakeCompressibleOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return RISCV_COMPRESS_INSTRS_NAME; }
+};
+} // namespace
+
+char RISCVMakeCompressibleOpt::ID = 0;
+INITIALIZE_PASS(RISCVMakeCompressibleOpt, "riscv-make-compressible",
+                RISCV_COMPRESS_INSTRS_NAME, false, false)
+
+// Return log2(widthInBytes) of load/store done by Opcode.
+static unsigned log2LdstWidth(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case RISCV::LW:
+  case RISCV::SW:
+  case RISCV::FLW:
+  case RISCV::FSW:
+    return 2;
+  case RISCV::LD:
+  case RISCV::SD:
+  case RISCV::FLD:
+  case RISCV::FSD:
+    return 3;
+  }
+}
+
+// Return a mask for the offset bits of a non-stack-pointer based compressed
+// load/store.
+static uint8_t compressedLDSTOffsetMask(unsigned Opcode) {
+  return 0x1f << log2LdstWidth(Opcode);
+}
+
+// Return true if Offset fits within a compressed stack-pointer based
+// load/store.
+static bool compressibleSPOffset(int64_t Offset, unsigned Opcode) {
+  return log2LdstWidth(Opcode) == 2 ? isShiftedUInt<6, 2>(Offset)
+                                    : isShiftedUInt<6, 3>(Offset);
+}
+
+// Given an offset for a load/store, return the adjustment required to the base
+// register such that the address can be accessed with a compressible offset.
+// This will return 0 if the offset is already compressible.
+static int64_t getBaseAdjustForCompression(int64_t Offset, unsigned Opcode) {
+  // Return the excess bits that do not fit in a compressible offset.
+  return Offset & ~compressedLDSTOffsetMask(Opcode);
+}
+
+// Return true if Reg is in a compressed register class.
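+// (For reference: the compressed classes below model the registers most RVC
+// encodings can reach, i.e. x8-x15 and f8-f15.)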
+static bool isCompressedReg(Register Reg) { + return RISCV::GPRCRegClass.contains(Reg) || + RISCV::FPR32CRegClass.contains(Reg) || + RISCV::FPR64CRegClass.contains(Reg); +} + +// Return true if MI is a load for which there exists a compressed version. +static bool isCompressibleLoad(const MachineInstr &MI) { + const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); + const unsigned Opcode = MI.getOpcode(); + + return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || + Opcode == RISCV::LD || Opcode == RISCV::FLD; +} + +// Return true if MI is a store for which there exists a compressed version. +static bool isCompressibleStore(const MachineInstr &MI) { + const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); + const unsigned Opcode = MI.getOpcode(); + + return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || + Opcode == RISCV::SD || Opcode == RISCV::FSD; +} + +// Find a single register and/or large offset which, if compressible, would +// allow the given instruction to be compressed. +// +// Possible return values: +// +// {Reg, 0} - Uncompressed Reg needs replacing with a compressed +// register. +// {Reg, N} - Reg needs replacing with a compressed register and +// N needs adding to the new register. (Reg may be +// compressed or uncompressed). +// {RISCV::NoRegister, 0} - No suitable optimization found for this +// instruction. +static RegImmPair getRegImmPairPreventingCompression(const MachineInstr &MI) { + const unsigned Opcode = MI.getOpcode(); + + if (isCompressibleLoad(MI) || isCompressibleStore(MI)) { + const MachineOperand &MOImm = MI.getOperand(2); + if (!MOImm.isImm()) + return RegImmPair(RISCV::NoRegister, 0); + + int64_t Offset = MOImm.getImm(); + int64_t NewBaseAdjust = getBaseAdjustForCompression(Offset, Opcode); + Register Base = MI.getOperand(1).getReg(); + + // Memory accesses via the stack pointer do not have a requirement for + // either of the registers to be compressible and can take a larger offset. + if (RISCV::SPRegClass.contains(Base)) { + if (!compressibleSPOffset(Offset, Opcode) && NewBaseAdjust) + return RegImmPair(Base, NewBaseAdjust); + } else { + Register SrcDest = MI.getOperand(0).getReg(); + bool SrcDestCompressed = isCompressedReg(SrcDest); + bool BaseCompressed = isCompressedReg(Base); + + // If only Base and/or offset prevent compression, then return Base and + // any adjustment required to make the offset compressible. + if ((!BaseCompressed || NewBaseAdjust) && SrcDestCompressed) + return RegImmPair(Base, NewBaseAdjust); + + // For loads, we can only change the base register since dest is defined + // rather than used. + // + // For stores, we can change SrcDest (and Base if SrcDest == Base) but + // cannot resolve an uncompressible offset in this case. + if (isCompressibleStore(MI)) { + if (!SrcDestCompressed && (BaseCompressed || SrcDest == Base) && + !NewBaseAdjust) + return RegImmPair(SrcDest, NewBaseAdjust); + } + } + } + return RegImmPair(RISCV::NoRegister, 0); +} + +// Check all uses after FirstMI of the given register, keeping a vector of +// instructions that would be compressible if the given register (and offset if +// applicable) were compressible. +// +// If there are enough uses for this optimization to improve code size and a +// compressed register is available, return that compressed register. 
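+//
+// (Illustrative arithmetic, assuming 4-byte uncompressed and 2-byte
+// compressed encodings: a scavenged c.mv/c.li costs 2 bytes and each
+// rewritten load/store saves 2, so two rewrites are needed to come out
+// ahead; an addi base adjustment costs 4 bytes and needs three.)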
+static Register analyzeCompressibleUses(MachineInstr &FirstMI,
+                                        RegImmPair RegImm,
+                                        SmallVectorImpl<MachineInstr *> &MIs) {
+  MachineBasicBlock &MBB = *FirstMI.getParent();
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+
+  RegScavenger RS;
+  RS.enterBasicBlock(MBB);
+
+  for (MachineBasicBlock::instr_iterator I = FirstMI.getIterator(),
+                                         E = MBB.instr_end();
+       I != E; ++I) {
+    MachineInstr &MI = *I;
+
+    // Determine if this is an instruction which would benefit from using the
+    // new register.
+    RegImmPair CandidateRegImm = getRegImmPairPreventingCompression(MI);
+    if (CandidateRegImm.Reg == RegImm.Reg &&
+        CandidateRegImm.Imm == RegImm.Imm) {
+      // Advance tracking since the value in the new register must be live for
+      // this instruction too.
+      RS.forward(I);
+
+      MIs.push_back(&MI);
+    }
+
+    // If RegImm.Reg is modified by this instruction, then we cannot optimize
+    // past this instruction. If the register is already compressed, then it
+    // may be possible to optimize a large offset in the current instruction -
+    // this will have been detected by the preceding call to
+    // getRegImmPairPreventingCompression.
+    if (MI.modifiesRegister(RegImm.Reg, TRI))
+      break;
+  }
+
+  // Adjusting the base costs one new uncompressed addi and therefore three uses
+  // are required for a code size reduction. If no base adjustment is required,
+  // then copying the register costs one new c.mv (or c.li Rd, 0 for "copying"
+  // the zero register) and therefore two uses are required for a code size
+  // reduction.
+  if (MIs.size() < 2 || (RegImm.Imm != 0 && MIs.size() < 3))
+    return RISCV::NoRegister;
+
+  // Find a compressible register which will be available from the first
+  // instruction we care about to the last.
+  const TargetRegisterClass *RCToScavenge;
+
+  // Work out the compressed register class from which to scavenge.
+  if (RISCV::GPRRegClass.contains(RegImm.Reg))
+    RCToScavenge = &RISCV::GPRCRegClass;
+  else if (RISCV::FPR32RegClass.contains(RegImm.Reg))
+    RCToScavenge = &RISCV::FPR32CRegClass;
+  else if (RISCV::FPR64RegClass.contains(RegImm.Reg))
+    RCToScavenge = &RISCV::FPR64CRegClass;
+  else
+    return RISCV::NoRegister;
+
+  return RS.scavengeRegisterBackwards(*RCToScavenge, FirstMI.getIterator(),
+                                      /*RestoreAfter=*/false, /*SPAdj=*/0,
+                                      /*AllowSpill=*/false);
+}
+
+// Update uses of the old register in the given instruction to the new register.
+static void updateOperands(MachineInstr &MI, RegImmPair OldRegImm,
+                           Register NewReg) {
+  unsigned Opcode = MI.getOpcode();
+
+  // If this pass is extended to support more instructions, the check for
+  // definedness may need to be strengthened.
+  assert((isCompressibleLoad(MI) || isCompressibleStore(MI)) &&
+         "Unsupported instruction for this optimization.");
+
+  // Update registers
+  for (MachineOperand &MO : MI.operands())
+    if (MO.isReg() && MO.getReg() == OldRegImm.Reg) {
+      // Do not update operands that define the old register.
+      //
+      // The new register was scavenged for the range of instructions that are
+      // being updated, therefore it should not be defined within this range
+      // except possibly in the final instruction.
+      if (MO.isDef()) {
+        assert(isCompressibleLoad(MI));
+        continue;
+      }
+      // Update reg
+      MO.setReg(NewReg);
+    }
+
+  // Update offset
+  MachineOperand &MOImm = MI.getOperand(2);
+  int64_t NewOffset = MOImm.getImm() & compressedLDSTOffsetMask(Opcode);
+  MOImm.setImm(NewOffset);
+}
+
+bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) {
+  // This is a size optimization.
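+  // It therefore only runs on functions carrying the minsize attribute
+  // (e.g. built with -Oz), where spending a copy to unlock compression pays.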
+ if (skipFunction(Fn.getFunction()) || !Fn.getFunction().hasMinSize()) + return false; + + const RISCVSubtarget &STI = Fn.getSubtarget(); + const RISCVInstrInfo &TII = *STI.getInstrInfo(); + + // This optimization only makes sense if compressed instructions are emitted. + if (!STI.hasStdExtC()) + return false; + + for (MachineBasicBlock &MBB : Fn) { + LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); + for (MachineInstr &MI : MBB) { + // Determine if this instruction would otherwise be compressed if not for + // an uncompressible register or offset. + RegImmPair RegImm = getRegImmPairPreventingCompression(MI); + if (!RegImm.Reg && RegImm.Imm == 0) + continue; + + // Determine if there is a set of instructions for which replacing this + // register with a compressed register (and compressible offset if + // applicable) is possible and will allow compression. + SmallVector MIs; + Register NewReg = analyzeCompressibleUses(MI, RegImm, MIs); + if (!NewReg) + continue; + + // Create the appropriate copy and/or offset. + if (RISCV::GPRRegClass.contains(RegImm.Reg)) { + assert(isInt<12>(RegImm.Imm)); + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(RISCV::ADDI), NewReg) + .addReg(RegImm.Reg) + .addImm(RegImm.Imm); + } else { + // If we are looking at replacing an FPR register we don't expect to + // have any offset. The only compressible FP instructions with an offset + // are loads and stores, for which the offset applies to the GPR operand + // not the FPR operand. + assert(RegImm.Imm == 0); + unsigned Opcode = RISCV::FPR32RegClass.contains(RegImm.Reg) + ? RISCV::FSGNJ_S + : RISCV::FSGNJ_D; + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), NewReg) + .addReg(RegImm.Reg) + .addReg(RegImm.Reg); + } + + // Update the set of instructions to use the compressed register and + // compressible offset instead. These instructions should now be + // compressible. + // TODO: Update all uses if RegImm.Imm == 0? Not just those that are + // expected to become compressible. + for (MachineInstr *UpdateMI : MIs) + updateOperands(*UpdateMI, RegImm, NewReg); + } + } + return true; +} + +/// Returns an instance of the Make Compressible Optimization pass. 
+FunctionPass *llvm::createRISCVMakeCompressibleOptPass() { + return new RISCVMakeCompressibleOpt(); +} diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 5f4022439abb..b060a73846c4 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -25,6 +25,7 @@ #include "RISCV.h" #include "RISCVTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Passes.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Debug.h" @@ -37,6 +38,10 @@ using namespace llvm; namespace { struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass { +private: + const RISCVSubtarget *ST = nullptr; + +public: static char ID; bool runOnMachineFunction(MachineFunction &Fn) override; bool detectLuiAddiGlobal(MachineInstr &LUI, MachineInstr *&ADDI); @@ -45,6 +50,9 @@ struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass { void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail, int64_t Offset); bool matchLargeOffset(MachineInstr &TailAdd, Register GSReg, int64_t &Offset); + bool matchShiftedOffset(MachineInstr &TailShXAdd, Register GSReg, + int64_t &Offset); + RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} MachineFunctionProperties getRequiredProperties() const override { @@ -85,17 +93,16 @@ bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI, MachineInstr *&LoADDI) { if (HiLUI.getOpcode() != RISCV::LUI || HiLUI.getOperand(1).getTargetFlags() != RISCVII::MO_HI || - HiLUI.getOperand(1).getType() != MachineOperand::MO_GlobalAddress || + !HiLUI.getOperand(1).isGlobal() || HiLUI.getOperand(1).getOffset() != 0 || !MRI->hasOneUse(HiLUI.getOperand(0).getReg())) return false; Register HiLuiDestReg = HiLUI.getOperand(0).getReg(); - LoADDI = MRI->use_begin(HiLuiDestReg)->getParent(); + LoADDI = &*MRI->use_instr_begin(HiLuiDestReg); if (LoADDI->getOpcode() != RISCV::ADDI || LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO || - LoADDI->getOperand(2).getType() != MachineOperand::MO_GlobalAddress || - LoADDI->getOperand(2).getOffset() != 0 || - !MRI->hasOneUse(LoADDI->getOperand(0).getReg())) + !LoADDI->getOperand(2).isGlobal() || + LoADDI->getOperand(2).getOffset() != 0) return false; return true; } @@ -106,6 +113,7 @@ bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI, void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail, int64_t Offset) { + assert(isInt<32>(Offset) && "Unexpected offset"); // Put the offset back in HiLUI and the LoADDI HiLUI.getOperand(1).setOffset(Offset); LoADDI.getOperand(2).setOffset(Offset); @@ -148,7 +156,8 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, return false; // This can point to an ADDI or a LUI: MachineInstr &OffsetTail = *MRI->getVRegDef(Reg); - if (OffsetTail.getOpcode() == RISCV::ADDI) { + if (OffsetTail.getOpcode() == RISCV::ADDI || + OffsetTail.getOpcode() == RISCV::ADDIW) { // The offset value has non zero bits in both %hi and %lo parts. // Detect an ADDI that feeds from a LUI instruction. 
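     // (Worked example, illustrative and not from the patch: an offset of
     //  0x12345 would arrive as
     //    lui  voff, 0x12
     //    addi voff, voff, 0x345
     //    add  vreg4, vreg2, voff
     //  and fold as Offset = (0x12 << 12) + 0x345 = 0x12345.)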
MachineOperand &AddiImmOp = OffsetTail.getOperand(2); @@ -162,8 +171,14 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, LuiImmOp.getTargetFlags() != RISCVII::MO_None || !MRI->hasOneUse(OffsetLui.getOperand(0).getReg())) return false; - int64_t OffHi = OffsetLui.getOperand(1).getImm(); - Offset = (OffHi << 12) + OffLo; + Offset = SignExtend64<32>(LuiImmOp.getImm() << 12); + Offset += OffLo; + // RV32 ignores the upper 32 bits. ADDIW sign extends the result. + if (!ST->is64Bit() || OffsetTail.getOpcode() == RISCV::ADDIW) + Offset = SignExtend64<32>(Offset); + // We can only fold simm32 offsets. + if (!isInt<32>(Offset)) + return false; LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail << " " << OffsetLui); DeadInstrs.insert(&OffsetTail); @@ -173,98 +188,204 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, // The offset value has all zero bits in the lower 12 bits. Only LUI // exists. LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail); - Offset = OffsetTail.getOperand(1).getImm() << 12; + Offset = SignExtend64<32>(OffsetTail.getOperand(1).getImm() << 12); DeadInstrs.insert(&OffsetTail); return true; } return false; } +// Detect patterns for offsets that are passed into a SHXADD instruction. +// The offset has 1,2, or 3 trailing zeros and fits in simm13, simm14, simm15. +// The constant is created with addi voff, x0, C, and shXadd is used to +// fill insert the trailing zeros and do the addition. +// +// HiLUI: lui vreg1, %hi(s) +// LoADDI: addi vreg2, vreg1, %lo(s) +// OffsetTail: addi voff, x0, C +// TailAdd: shXadd vreg4, voff, vreg2 +bool RISCVMergeBaseOffsetOpt::matchShiftedOffset(MachineInstr &TailShXAdd, + Register GAReg, + int64_t &Offset) { + assert((TailShXAdd.getOpcode() == RISCV::SH1ADD || + TailShXAdd.getOpcode() == RISCV::SH2ADD || + TailShXAdd.getOpcode() == RISCV::SH3ADD) && + "Expected SHXADD instruction!"); + + // The first source is the shifted operand. + Register Rs1 = TailShXAdd.getOperand(1).getReg(); + + if (GAReg != TailShXAdd.getOperand(2).getReg()) + return false; + + // Can't fold if the register has more than one use. + if (!MRI->hasOneUse(Rs1)) + return false; + // This can point to an ADDI X0, C. + MachineInstr &OffsetTail = *MRI->getVRegDef(Rs1); + if (OffsetTail.getOpcode() != RISCV::ADDI) + return false; + if (!OffsetTail.getOperand(1).isReg() || + OffsetTail.getOperand(1).getReg() != RISCV::X0 || + !OffsetTail.getOperand(2).isImm()) + return false; + + Offset = OffsetTail.getOperand(2).getImm(); + assert(isInt<12>(Offset) && "Unexpected offset"); + + unsigned ShAmt; + switch (TailShXAdd.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case RISCV::SH1ADD: ShAmt = 1; break; + case RISCV::SH2ADD: ShAmt = 2; break; + case RISCV::SH3ADD: ShAmt = 3; break; + } + + Offset = (uint64_t)Offset << ShAmt; + + LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail); + DeadInstrs.insert(&OffsetTail); + return true; +} + bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI) { Register DestReg = LoADDI.getOperand(0).getReg(); - assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI"); - // LoADDI has only one use. - MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent(); - switch (Tail.getOpcode()) { - default: - LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" - << Tail); - return false; - case RISCV::ADDI: { - // Offset is simply an immediate operand. 
- int64_t Offset = Tail.getOperand(2).getImm(); - LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); - foldOffset(HiLUI, LoADDI, Tail, Offset); - return true; + + // First, look for arithmetic instructions we can get an offset from. + // We might be able to remove the arithmetic instructions by folding the + // offset into the LUI+ADDI. + if (MRI->hasOneUse(DestReg)) { + // LoADDI has only one use. + MachineInstr &Tail = *MRI->use_instr_begin(DestReg); + switch (Tail.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" + << Tail); + break; + case RISCV::ADDI: { + // Offset is simply an immediate operand. + int64_t Offset = Tail.getOperand(2).getImm(); + + // We might have two ADDIs in a row. + Register TailDestReg = Tail.getOperand(0).getReg(); + if (MRI->hasOneUse(TailDestReg)) { + MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg); + if (TailTail.getOpcode() == RISCV::ADDI) { + Offset += TailTail.getOperand(2).getImm(); + LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail); + DeadInstrs.insert(&Tail); + foldOffset(HiLUI, LoADDI, TailTail, Offset); + return true; + } + } + + LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); + foldOffset(HiLUI, LoADDI, Tail, Offset); + return true; + } + case RISCV::ADD: { + // The offset is too large to fit in the immediate field of ADDI. + // This can be in two forms: + // 1) LUI hi_Offset followed by: + // ADDI lo_offset + // This happens in case the offset has non zero bits in + // both hi 20 and lo 12 bits. + // 2) LUI (offset20) + // This happens in case the lower 12 bits of the offset are zeros. + int64_t Offset; + if (!matchLargeOffset(Tail, DestReg, Offset)) + return false; + foldOffset(HiLUI, LoADDI, Tail, Offset); + return true; + } + case RISCV::SH1ADD: + case RISCV::SH2ADD: + case RISCV::SH3ADD: { + // The offset is too large to fit in the immediate field of ADDI. + // It may be encoded as (SH2ADD (ADDI X0, C), DestReg) or + // (SH3ADD (ADDI X0, C), DestReg). + int64_t Offset; + if (!matchShiftedOffset(Tail, DestReg, Offset)) + return false; + foldOffset(HiLUI, LoADDI, Tail, Offset); + return true; + } + } } - case RISCV::ADD: { - // The offset is too large to fit in the immediate field of ADDI. - // This can be in two forms: - // 1) LUI hi_Offset followed by: - // ADDI lo_offset - // This happens in case the offset has non zero bits in - // both hi 20 and lo 12 bits. - // 2) LUI (offset20) - // This happens in case the lower 12 bits of the offset are zeros. - int64_t Offset; - if (!matchLargeOffset(Tail, DestReg, Offset)) + + // We didn't find an arithmetic instruction. 
If all the uses are memory ops + // with the same offset, we can transform + // HiLUI: lui vreg1, %hi(foo) ---> lui vreg1, %hi(foo+8) + // LoADDI: addi vreg2, vreg1, %lo(foo) ---> lw vreg3, lo(foo+8)(vreg1) + // Tail: lw vreg3, 8(vreg2) + + Optional CommonOffset; + for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) { + switch (UseMI.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI); return false; - foldOffset(HiLUI, LoADDI, Tail, Offset); - return true; + case RISCV::LB: + case RISCV::LH: + case RISCV::LW: + case RISCV::LBU: + case RISCV::LHU: + case RISCV::LWU: + case RISCV::LD: + case RISCV::FLH: + case RISCV::FLW: + case RISCV::FLD: + case RISCV::SB: + case RISCV::SH: + case RISCV::SW: + case RISCV::SD: + case RISCV::FSH: + case RISCV::FSW: + case RISCV::FSD: { + if (UseMI.getOperand(1).isFI()) + return false; + // Register defined by LoADDI should not be the value register. + if (DestReg == UseMI.getOperand(0).getReg()) + return false; + assert(DestReg == UseMI.getOperand(1).getReg() && + "Expected base address use"); + // All load/store instructions must use the same offset. + int64_t Offset = UseMI.getOperand(2).getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + } + } } - case RISCV::LB: - case RISCV::LH: - case RISCV::LW: - case RISCV::LBU: - case RISCV::LHU: - case RISCV::LWU: - case RISCV::LD: - case RISCV::FLH: - case RISCV::FLW: - case RISCV::FLD: - case RISCV::SB: - case RISCV::SH: - case RISCV::SW: - case RISCV::SD: - case RISCV::FSH: - case RISCV::FSW: - case RISCV::FSD: { - // Transforms the sequence: Into: - // HiLUI: lui vreg1, %hi(foo) ---> lui vreg1, %hi(foo+8) - // LoADDI: addi vreg2, vreg1, %lo(foo) ---> lw vreg3, lo(foo+8)(vreg1) - // Tail: lw vreg3, 8(vreg2) - if (Tail.getOperand(1).isFI()) - return false; - // Register defined by LoADDI should be used in the base part of the - // load\store instruction. Otherwise, no folding possible. - Register BaseAddrReg = Tail.getOperand(1).getReg(); - if (DestReg != BaseAddrReg) - return false; - MachineOperand &TailImmOp = Tail.getOperand(2); - int64_t Offset = TailImmOp.getImm(); - // Update the offsets in global address lowering. - HiLUI.getOperand(1).setOffset(Offset); - // Update the immediate in the Tail instruction to add the offset. - Tail.RemoveOperand(2); - MachineOperand &ImmOp = LoADDI.getOperand(2); - ImmOp.setOffset(Offset); - Tail.addOperand(ImmOp); + + // We found a common offset. + // Update the offsets in global address lowering. + HiLUI.getOperand(1).setOffset(*CommonOffset); + MachineOperand &ImmOp = LoADDI.getOperand(2); + ImmOp.setOffset(*CommonOffset); + + // Update the immediate in the load/store instructions to add the offset. + for (MachineInstr &UseMI : + llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { + UseMI.removeOperand(2); + UseMI.addOperand(ImmOp); // Update the base reg in the Tail instruction to feed from LUI. // Output of HiLUI is only used in LoADDI, no need to use // MRI->replaceRegWith(). 
- Tail.getOperand(1).setReg(HiLUI.getOperand(0).getReg()); - DeadInstrs.insert(&LoADDI); - return true; + UseMI.getOperand(1).setReg(HiLUI.getOperand(0).getReg()); } - } - return false; + + DeadInstrs.insert(&LoADDI); + return true; } bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; + ST = &Fn.getSubtarget(); + bool MadeChange = false; DeadInstrs.clear(); MRI = &Fn.getRegInfo(); @@ -274,9 +395,8 @@ bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { MachineInstr *LoADDI = nullptr; if (!detectLuiAddiGlobal(HiLUI, LoADDI)) continue; - LLVM_DEBUG(dbgs() << " Found lowered global address with one use: " + LLVM_DEBUG(dbgs() << " Found lowered global address: " << *LoADDI->getOperand(2).getGlobal() << "\n"); - // If the use count is only one, merge the offset MadeChange |= detectAndFoldOffset(HiLUI, *LoADDI); } } diff --git a/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp new file mode 100644 index 000000000000..3c4a60b81d8e --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp @@ -0,0 +1,179 @@ +//=- RISCVRedundantCopyElimination.cpp - Remove useless copy for RISCV ------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass removes unnecessary zero copies in BBs that are targets of +// beqz/bnez instructions. For instance, the copy instruction in the code below +// can be removed because the beqz jumps to BB#2 when a0 is zero. +// BB#1: +// beqz %a0, +// BB#2: +// %a0 = COPY %x0 +// This pass should be run after register allocation. +// +// This pass is based on the earliest versions of +// AArch64RedundantCopyElimination. +// +// FIXME: Support compares with constants other than zero? This is harder to +// do on RISC-V since branches can't have immediates. 
+// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-copyelim" + +STATISTIC(NumCopiesRemoved, "Number of copies removed."); + +namespace { +class RISCVRedundantCopyElimination : public MachineFunctionPass { + const MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + RISCVRedundantCopyElimination() : MachineFunctionPass(ID) { + initializeRISCVRedundantCopyEliminationPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "RISCV Redundant Copy Elimination"; + } + +private: + bool optimizeBlock(MachineBasicBlock &MBB); +}; + +} // end anonymous namespace + +char RISCVRedundantCopyElimination::ID = 0; + +INITIALIZE_PASS(RISCVRedundantCopyElimination, "riscv-copyelim", + "RISCV redundant copy elimination pass", false, false) + +static bool guaranteesZeroRegInBlock(const MachineInstr &MI, + const MachineBasicBlock &MBB) { + unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::BEQ && MI.getOperand(1).getReg() == RISCV::X0 && + &MBB == MI.getOperand(2).getMBB()) + return true; + if (Opc == RISCV::BNE && MI.getOperand(1).getReg() == RISCV::X0 && + &MBB != MI.getOperand(2).getMBB()) + return true; + + return false; +} + +bool RISCVRedundantCopyElimination::optimizeBlock(MachineBasicBlock &MBB) { + // Check if the current basic block has a single predecessor. + if (MBB.pred_size() != 1) + return false; + + // Check if the predecessor has two successors, implying the block ends in a + // conditional branch. + MachineBasicBlock *PredMBB = *MBB.pred_begin(); + if (PredMBB->succ_size() != 2) + return false; + + MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr(); + if (CondBr == PredMBB->end()) + return false; + + while (true) { + // If we run out of terminators, give up. + if (!CondBr->isTerminator()) + return false; + // If we found a branch with X0, stop searching and try to remove copies. + // TODO: Handle multiple branches with different registers. + if (guaranteesZeroRegInBlock(*CondBr, MBB)) + break; + // If we reached the beginning of the basic block, give up. + if (CondBr == PredMBB->begin()) + return false; + --CondBr; + } + + Register TargetReg = CondBr->getOperand(0).getReg(); + if (!TargetReg) + return false; + + bool Changed = false; + MachineBasicBlock::iterator LastChange = MBB.begin(); + // Remove redundant Copy instructions unless TargetReg is modified. 
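+  // (Illustrative: with "beqz a0, <this MBB>" as the guarding branch, an
+  // "$a0 = COPY $x0" below is provably redundant; any later redefinition of
+  // a0 stops the scan.)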
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr *MI = &*I; + ++I; + if (MI->isCopy() && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg()) { + Register DefReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); + + if (SrcReg == RISCV::X0 && !MRI->isReserved(DefReg) && + TargetReg == DefReg) { + LLVM_DEBUG(dbgs() << "Remove redundant Copy : "); + LLVM_DEBUG(MI->print(dbgs())); + + MI->eraseFromParent(); + Changed = true; + LastChange = I; + ++NumCopiesRemoved; + continue; + } + } + + if (MI->modifiesRegister(TargetReg, TRI)) + break; + } + + if (!Changed) + return false; + + // Otherwise, we have to fixup the use-def chain, starting with the + // BEQ/BNE. Conservatively mark as much as we can live. + CondBr->clearRegisterKills(TargetReg, TRI); + + // Add newly used reg to the block's live-in list if it isn't there already. + if (!MBB.isLiveIn(TargetReg)) + MBB.addLiveIn(TargetReg); + + // Clear any kills of TargetReg between CondBr and the last removed COPY. + for (MachineInstr &MMI : make_range(MBB.begin(), LastChange)) + MMI.clearRegisterKills(TargetReg, TRI); + + return true; +} + +bool RISCVRedundantCopyElimination::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) + Changed |= optimizeBlock(MBB); + + return Changed; +} + +FunctionPass *llvm::createRISCVRedundantCopyEliminationPass() { + return new RISCVRedundantCopyElimination(); +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp index bd3b95a98b9f..5371b790a148 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp @@ -12,9 +12,9 @@ #include "RISCVRegisterBankInfo.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL @@ -22,5 +22,4 @@ using namespace llvm; -RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) - : RISCVGenRegisterBankInfo() {} +RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) {} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h index 05fac992734d..194a1548af24 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H #define LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "RISCVGenRegisterBank.inc" diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 35363bf37c0d..0c9219076498 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -14,6 +14,7 @@ #include "RISCV.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include 
"llvm/CodeGen/MachineInstrBuilder.h" @@ -101,6 +102,7 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, RISCV::VTYPE); markSuperRegs(Reserved, RISCV::VXSAT); markSuperRegs(Reserved, RISCV::VXRM); + markSuperRegs(Reserved, RISCV::VLENB); // vlenb (constant) // Floating point environment registers. markSuperRegs(Reserved, RISCV::FRM); @@ -116,7 +118,7 @@ bool RISCVRegisterInfo::isAsmClobberable(const MachineFunction &MF, } bool RISCVRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { - return PhysReg == RISCV::X0; + return PhysReg == RISCV::X0 || PhysReg == RISCV::VLENB; } const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const { @@ -125,7 +127,7 @@ const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const { // Frame indexes representing locations of CSRs which are given a fixed location // by save/restore libcalls. -static const std::map FixedCSRFIMap = { +static const std::pair FixedCSRFIMap[] = { {/*ra*/ RISCV::X1, -1}, {/*s0*/ RISCV::X8, -2}, {/*s1*/ RISCV::X9, -3}, @@ -148,8 +150,9 @@ bool RISCVRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, if (!RVFI->useSaveRestoreLibCalls(MF)) return false; - auto FII = FixedCSRFIMap.find(Reg); - if (FII == FixedCSRFIMap.end()) + const auto *FII = + llvm::find_if(FixedCSRFIMap, [&](auto P) { return P.first == Reg; }); + if (FII == std::end(FixedCSRFIMap)) return false; FrameIdx = FII->second; @@ -171,7 +174,7 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register FrameReg; StackOffset Offset = getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg); - bool IsRVVSpill = TII->isRVVSpill(MI, /*CheckFIs*/ false); + bool IsRVVSpill = RISCV::isRVVSpill(MI); if (!IsRVVSpill) Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); @@ -270,7 +273,7 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); } - auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MI.getOpcode()); + auto ZvlssegInfo = RISCV::isRVVSpillForZvlsseg(MI.getOpcode()); if (ZvlssegInfo) { Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass); BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 8c1c03b51c24..4ff60ebda5aa 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -66,6 +66,7 @@ def sub_vrm1_5 : ComposedSubRegIndex; def sub_vrm1_6 : ComposedSubRegIndex; def sub_vrm1_7 : ComposedSubRegIndex; +def sub_32_hi : SubRegIndex<32, 32>; } // Namespace = "RISCV" // Integer registers @@ -461,6 +462,12 @@ let RegAltNameIndices = [ABIRegAltName] in { DwarfRegNum<[!add(4096, SysRegVLENB.Encoding)]>; } +def VCSR : RegisterClass<"RISCV", [XLenVT], 32, + (add VTYPE, VL, VLENB)> { + let RegInfos = XLenRI; +} + + foreach m = [1, 2, 4] in { foreach n = NFList.L in { def "VN" # n # "M" # m # "NoV0": RegisterTuples< @@ -534,6 +541,35 @@ def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; } +let RegInfos = XLenRI in { +def GPRF16 : RegisterClass<"RISCV", [f16], 16, (add GPR)>; +def GPRF32 : RegisterClass<"RISCV", [f32], 32, (add GPR)>; +def GPRF64 : RegisterClass<"RISCV", [f64], 64, (add GPR)>; +} // RegInfos = XLenRI + +let RegAltNameIndices = [ABIRegAltName] in { + foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30] in { + defvar Reg = 
!cast("X"#Index); + def X#Index#_PD : RISCVRegWithSubRegs("X"#Index), + !cast("X"#!add(Index, 1))], + Reg.AltNames> { + let SubRegIndices = [sub_32, sub_32_hi]; + } + } +} + +let RegInfos = RegInfoByHwMode<[RV64], [RegInfo<64, 64, 64>]> in +def GPRPF64 : RegisterClass<"RISCV", [f64], 64, (add + X10_PD, X12_PD, X14_PD, X16_PD, + X6_PD, + X28_PD, X30_PD, + X8_PD, + X18_PD, X20_PD, X22_PD, X24_PD, X26_PD, + X0_PD, X2_PD, X4_PD +)>; + // The register class is added for inline assembly for vector mask types. def VM : VReg DisableSExtWRemoval("riscv-disable-sextw-removal", cl::desc("Disable removal of sext.w"), @@ -55,11 +57,143 @@ FunctionPass *llvm::createRISCVSExtWRemovalPass() { return new RISCVSExtWRemoval(); } +// add uses of MI to the Worklist +static void addUses(const MachineInstr &MI, + SmallVectorImpl &Worklist, + MachineRegisterInfo &MRI) { + for (auto &UserOp : MRI.reg_operands(MI.getOperand(0).getReg())) { + const auto *User = UserOp.getParent(); + if (User == &MI) // ignore the def, current MI + continue; + Worklist.push_back(User); + } +} + +// returns true if all uses of OrigMI only depend on the lower word of its +// output, so we can transform OrigMI to the corresponding W-version. +// TODO: handle multiple interdependent transformations +static bool isAllUsesReadW(const MachineInstr &OrigMI, + MachineRegisterInfo &MRI) { + + SmallPtrSet Visited; + SmallVector Worklist; + + Visited.insert(&OrigMI); + addUses(OrigMI, Worklist, MRI); + + while (!Worklist.empty()) { + const MachineInstr *MI = Worklist.pop_back_val(); + + if (!Visited.insert(MI).second) { + // If we've looped back to OrigMI through a PHI cycle, we can't transform + // LD or LWU, because these operations use all 64 bits of input. + if (MI == &OrigMI) { + unsigned opcode = MI->getOpcode(); + if (opcode == RISCV::LD || opcode == RISCV::LWU) + return false; + } + continue; + } + + switch (MI->getOpcode()) { + case RISCV::ADDIW: + case RISCV::ADDW: + case RISCV::DIVUW: + case RISCV::DIVW: + case RISCV::MULW: + case RISCV::REMUW: + case RISCV::REMW: + case RISCV::SLLIW: + case RISCV::SLLW: + case RISCV::SRAIW: + case RISCV::SRAW: + case RISCV::SRLIW: + case RISCV::SRLW: + case RISCV::SUBW: + case RISCV::ROLW: + case RISCV::RORW: + case RISCV::RORIW: + case RISCV::CLZW: + case RISCV::CTZW: + case RISCV::CPOPW: + case RISCV::SLLI_UW: + case RISCV::FCVT_S_W: + case RISCV::FCVT_S_WU: + case RISCV::FCVT_D_W: + case RISCV::FCVT_D_WU: + continue; + + // these overwrite higher input bits, otherwise the lower word of output + // depends only on the lower word of input. So check their uses read W. + case RISCV::SLLI: + if (MI->getOperand(2).getImm() >= 32) + continue; + addUses(*MI, Worklist, MRI); + continue; + case RISCV::ANDI: + if (isUInt<11>(MI->getOperand(2).getImm())) + continue; + addUses(*MI, Worklist, MRI); + continue; + case RISCV::ORI: + if (!isUInt<11>(MI->getOperand(2).getImm())) + continue; + addUses(*MI, Worklist, MRI); + continue; + + case RISCV::BEXTI: + if (MI->getOperand(2).getImm() >= 32) + return false; + continue; + + // For these, lower word of output in these operations, depends only on + // the lower word of input. So, we check all uses only read lower word. 
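+  // (e.g. bits 31:0 of an ADD result depend only on bits 31:0 of its inputs,
+  // so an ADD whose users all read just the low word can later be rewritten
+  // to ADDW; an illustrative restatement of the rule above.)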
+  case RISCV::COPY:
+  case RISCV::PHI:
+
+  case RISCV::ADD:
+  case RISCV::ADDI:
+  case RISCV::AND:
+  case RISCV::MUL:
+  case RISCV::OR:
+  case RISCV::SLL:
+  case RISCV::SUB:
+  case RISCV::XOR:
+  case RISCV::XORI:
+
+  case RISCV::ADD_UW:
+  case RISCV::ANDN:
+  case RISCV::CLMUL:
+  case RISCV::ORC_B:
+  case RISCV::ORN:
+  case RISCV::SEXT_B:
+  case RISCV::SEXT_H:
+  case RISCV::SH1ADD:
+  case RISCV::SH1ADD_UW:
+  case RISCV::SH2ADD:
+  case RISCV::SH2ADD_UW:
+  case RISCV::SH3ADD:
+  case RISCV::SH3ADD_UW:
+  case RISCV::XNOR:
+  case RISCV::ZEXT_H_RV64:
+    addUses(*MI, Worklist, MRI);
+    continue;
+  default:
+    return false;
+  }
+  }
+  return true;
+}
+
 // This function returns true if the machine instruction always outputs a value
 // where bits 63:32 match bit 31.
+// Alternatively, if the instruction can be converted to a W variant
+// (e.g. ADD->ADDW) and all of its uses only use the lower word of its output,
+// then return true and add the instr to FixableDef to be converted later.
 // TODO: Allocate a bit in TSFlags for the W instructions?
 // TODO: Add other W instructions.
-static bool isSignExtendingOpW(const MachineInstr &MI) {
+static bool isSignExtendingOpW(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               SmallPtrSetImpl<MachineInstr *> &FixableDef) {
   switch (MI.getOpcode()) {
   case RISCV::LUI:
   case RISCV::LW:
@@ -89,8 +223,9 @@
   case RISCV::FCVT_WU_S:
   case RISCV::FCVT_W_D:
   case RISCV::FCVT_WU_D:
+  case RISCV::FMV_X_W:
   // The following aren't W instructions, but are either sign extended from a
-  // smaller size or put zeros in bits 63:31.
+  // smaller size, always output a small integer, or put zeros in bits 63:31.
   case RISCV::LBU:
   case RISCV::LHU:
   case RISCV::LB:
@@ -102,6 +237,12 @@
   case RISCV::SEXT_B:
   case RISCV::SEXT_H:
   case RISCV::ZEXT_H_RV64:
+  case RISCV::FMV_X_H:
+  case RISCV::BEXT:
+  case RISCV::BEXTI:
+  case RISCV::CLZ:
+  case RISCV::CPOP:
+  case RISCV::CTZ:
     return true;
   // shifting right sufficiently makes the value 32-bit sign-extended
   case RISCV::SRAI:
     return MI.getOperand(2).getImm() > 32;
   // The LI pattern ADDI rd, X0, imm is sign extended.
   case RISCV::ADDI:
-    return MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0;
+    if (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0)
+      return true;
+    if (isAllUsesReadW(MI, MRI)) {
+      // transform to ADDIW
+      FixableDef.insert(&MI);
+      return true;
+    }
+    return false;
   // An ANDI with an 11 bit immediate will zero bits 63:11.
   case RISCV::ANDI:
     return isUInt<11>(MI.getOperand(2).getImm());
@@ -120,28 +268,45 @@
   // Copying from X0 produces zero.
case RISCV::COPY: return MI.getOperand(1).getReg() == RISCV::X0; + + // With these opcode, we can "fix" them with the W-version + // if we know all users of the result only rely on bits 31:0 + case RISCV::SLLI: + // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits + if (MI.getOperand(2).getImm() >= 32) + return false; + LLVM_FALLTHROUGH; + case RISCV::ADD: + case RISCV::LD: + case RISCV::LWU: + case RISCV::MUL: + case RISCV::SUB: + if (isAllUsesReadW(MI, MRI)) { + FixableDef.insert(&MI); + return true; + } } return false; } -static bool isSignExtendedW(const MachineInstr &OrigMI, - MachineRegisterInfo &MRI) { +static bool isSignExtendedW(MachineInstr &OrigMI, MachineRegisterInfo &MRI, + SmallPtrSetImpl &FixableDef) { SmallPtrSet Visited; - SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(&OrigMI); while (!Worklist.empty()) { - const MachineInstr *MI = Worklist.pop_back_val(); + MachineInstr *MI = Worklist.pop_back_val(); // If we already visited this instruction, we don't need to check it again. if (!Visited.insert(MI).second) continue; // If this is a sign extending operation we don't need to look any further. - if (isSignExtendingOpW(*MI)) + if (isSignExtendingOpW(*MI, MRI, FixableDef)) continue; // Is this an instruction that propagates sign extend. @@ -157,7 +322,7 @@ static bool isSignExtendedW(const MachineInstr &OrigMI, // If this is a copy from another register, check its source instruction. if (!SrcReg.isVirtual()) return false; - const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); if (!SrcMI) return false; @@ -165,18 +330,25 @@ static bool isSignExtendedW(const MachineInstr &OrigMI, Worklist.push_back(SrcMI); break; } + + // For these, we just need to check if the 1st operand is sign extended. + case RISCV::BCLRI: + case RISCV::BINVI: + case RISCV::BSETI: + if (MI->getOperand(2).getImm() >= 31) + return false; + LLVM_FALLTHROUGH; case RISCV::REM: case RISCV::ANDI: case RISCV::ORI: case RISCV::XORI: { // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R. // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1 - // Logical operations use a sign extended 12-bit immediate. We just need - // to check if the other operand is sign extended. + // Logical operations use a sign extended 12-bit immediate. 
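+    // (Illustrative: "andi rd, rs, -8" has a sign-extended immediate, so
+    // rd is sign-extended whenever rs is.)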
Register SrcReg = MI->getOperand(1).getReg(); if (!SrcReg.isVirtual()) return false; - const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); if (!SrcMI) return false; @@ -214,7 +386,7 @@ static bool isSignExtendedW(const MachineInstr &OrigMI, Register SrcReg = MI->getOperand(I).getReg(); if (!SrcReg.isVirtual()) return false; - const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); if (!SrcMI) return false; @@ -232,6 +404,26 @@ static bool isSignExtendedW(const MachineInstr &OrigMI, return true; } +static unsigned getWOp(unsigned Opcode) { + switch (Opcode) { + case RISCV::ADDI: + return RISCV::ADDIW; + case RISCV::ADD: + return RISCV::ADDW; + case RISCV::LD: + case RISCV::LWU: + return RISCV::LW; + case RISCV::MUL: + return RISCV::MULW; + case RISCV::SLLI: + return RISCV::SLLIW; + case RISCV::SUB: + return RISCV::SUBW; + default: + llvm_unreachable("Unexpected opcode for replacement with W variant"); + } +} + bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction()) || DisableSExtWRemoval) return false; @@ -242,7 +434,10 @@ bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) { if (!ST.is64Bit()) return false; - bool MadeChange = false; + SmallPtrSet SExtWRemovalCands; + + // Replacing instructions invalidates the MI iterator + // we collect the candidates, then iterate over them separately. for (MachineBasicBlock &MBB : MF) { for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) { MachineInstr *MI = &*I++; @@ -257,21 +452,49 @@ bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) { if (!SrcReg.isVirtual()) continue; - const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg); - if (!isSignExtendedW(SrcMI, MRI)) - continue; + SExtWRemovalCands.insert(MI); + } + } - Register DstReg = MI->getOperand(0).getReg(); - if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg))) - continue; + bool MadeChange = false; + for (auto MI : SExtWRemovalCands) { + SmallPtrSet FixableDef; + Register SrcReg = MI->getOperand(1).getReg(); + MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg); + + // If all definitions reaching MI sign-extend their output, + // then sext.w is redundant + if (!isSignExtendedW(SrcMI, MRI, FixableDef)) + continue; - LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); - MRI.replaceRegWith(DstReg, SrcReg); - MRI.clearKillFlags(SrcReg); - MI->eraseFromParent(); - ++NumRemovedSExtW; - MadeChange = true; + Register DstReg = MI->getOperand(0).getReg(); + if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg))) + continue; + // Replace Fixable instructions with their W versions. 
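+    // (Illustrative: a "mul a0, a1, a2" feeding only this sext.w becomes
+    // "mulw a0, a1, a2", after which the sext.w is erased as redundant.)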
+ for (MachineInstr *Fixable : FixableDef) { + MachineBasicBlock &MBB = *Fixable->getParent(); + const DebugLoc &DL = Fixable->getDebugLoc(); + unsigned Code = getWOp(Fixable->getOpcode()); + MachineInstrBuilder Replacement = + BuildMI(MBB, Fixable, DL, ST.getInstrInfo()->get(Code)); + for (auto Op : Fixable->operands()) + Replacement.add(Op); + for (auto Op : Fixable->memoperands()) + Replacement.addMemOperand(Op); + + LLVM_DEBUG(dbgs() << "Replacing " << *Fixable); + LLVM_DEBUG(dbgs() << " with " << *Replacement); + + Fixable->eraseFromParent(); + ++NumTransformedToWInstrs; } + + LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); + MRI.replaceRegWith(DstReg, SrcReg); + MRI.clearKillFlags(SrcReg); + MI->eraseFromParent(); + ++NumRemovedSExtW; + MadeChange = true; } return MadeChange; diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index 78cf34c8c582..5a3c8deb7943 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -242,6 +242,11 @@ defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbs; +defm : UnsupportedSchedZbe; defm : UnsupportedSchedZbf; +defm : UnsupportedSchedZbm; +defm : UnsupportedSchedZbp; +defm : UnsupportedSchedZbr; +defm : UnsupportedSchedZbt; defm : UnsupportedSchedZfh; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 9f5e5ff1223c..cfbd9722d7bc 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -229,6 +229,11 @@ defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbs; +defm : UnsupportedSchedZbe; defm : UnsupportedSchedZbf; +defm : UnsupportedSchedZbm; +defm : UnsupportedSchedZbp; +defm : UnsupportedSchedZbr; +defm : UnsupportedSchedZbt; defm : UnsupportedSchedZfh; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleB.td b/llvm/lib/Target/RISCV/RISCVScheduleB.td index 193760e1e15b..4bfe7b316eeb 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleB.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleB.td @@ -33,10 +33,59 @@ def WriteCLMUL : SchedWrite; // CLMUL/CLMULR/CLMULH def WriteSingleBit : SchedWrite; // BCLR/BSET/BINV/BEXT def WriteSingleBitImm: SchedWrite; // BCLRI/BSETI/BINVI/BEXTI +// Zbe extension +def WriteDecompress : SchedWrite; // bdecompress +def WriteCompress : SchedWrite; // bcompress +def WriteDecompress32: SchedWrite; // bdecompressw +def WriteCompress32 : SchedWrite; // bcompressw + // Zbf extension def WriteBFP : SchedWrite; // BFP def WriteBFP32 : SchedWrite; // BFPW +// Zbm extension +def WriteBMatrix : SchedWrite; // bmator/bmatxor/bmatflip + +// Zbp extension +def WriteORC : SchedWrite; // gorc +def WriteREV : SchedWrite; // grev +def WriteORC32 : SchedWrite; // gorcw +def WriteREV32 : SchedWrite; // grevw +def WriteREVImm : SchedWrite; // grevi +def WriteORCImm : SchedWrite; // gorci +def WriteREVImm32 : SchedWrite; // greviw +def WriteORCImm32 : SchedWrite; // gorciw +def WriteSHFL : SchedWrite; // shfl +def WriteUNSHFL : SchedWrite; // unshfl +def WriteSHFL32 : SchedWrite; // shflw +def WriteUNSHFL32 : SchedWrite; // unshflw +def WriteSHFLImm : SchedWrite; // shfli +def WriteUNSHFLImm : SchedWrite; // unshfli +def WriteXPERMH : SchedWrite; // xperm.h +def WriteXPERMW : SchedWrite; // xperm.w +def WritePACK : SchedWrite; // pack/packh +def WritePACK32 : SchedWrite; // packw +def WritePACKU : SchedWrite; // packu +def 
WritePACKU32 : SchedWrite; // packuw +
+// Zbr extension
+def WriteCRCB : SchedWrite; // crc32.b
+def WriteCRCH : SchedWrite; // crc32.h
+def WriteCRCW : SchedWrite; // crc32.w
+def WriteCRCD : SchedWrite; // crc32.d
+def WriteCRCCB : SchedWrite; // crc32c.b
+def WriteCRCCH : SchedWrite; // crc32c.h
+def WriteCRCCW : SchedWrite; // crc32c.w
+def WriteCRCCD : SchedWrite; // crc32c.d +
+// Zbt extension
+def WriteCMix : SchedWrite; // cmix
+def WriteCMov : SchedWrite; // cmov
+def WriteFSReg : SchedWrite; // fsl/fsr
+def WriteFSRImm : SchedWrite; // fsri
+def WriteFSReg32 : SchedWrite; // fslw/fsrw
+def WriteFSRImm32 : SchedWrite; // fsriw +
/// Define scheduler resources associated with use operands. // Zba extension @@ -64,10 +113,59 @@ def ReadCLMUL : SchedRead; // CLMUL/CLMULR/CLMULH def ReadSingleBit : SchedRead; // BCLR/BSET/BINV/BEXT def ReadSingleBitImm: SchedRead; // BCLRI/BSETI/BINVI/BEXTI
+// Zbe extension
+def ReadDecompress : SchedRead; // bdecompress
+def ReadCompress : SchedRead; // bcompress
+def ReadDecompress32: SchedRead; // bdecompressw
+def ReadCompress32 : SchedRead; // bcompressw +
// Zbf extension def ReadBFP : SchedRead; // BFP def ReadBFP32 : SchedRead; // BFPW
+// Zbm extension
+def ReadBMatrix : SchedRead; // bmator/bmatxor/bmatflip +
+// Zbp extension
+def ReadORC : SchedRead; // gorc
+def ReadREV : SchedRead; // grev
+def ReadORC32 : SchedRead; // gorcw
+def ReadREV32 : SchedRead; // grevw
+def ReadREVImm : SchedRead; // grevi
+def ReadORCImm : SchedRead; // gorci
+def ReadREVImm32 : SchedRead; // greviw
+def ReadORCImm32 : SchedRead; // gorciw
+def ReadSHFL : SchedRead; // shfl
+def ReadUNSHFL : SchedRead; // unshfl
+def ReadSHFL32 : SchedRead; // shflw
+def ReadUNSHFL32 : SchedRead; // unshflw
+def ReadSHFLImm : SchedRead; // shfli
+def ReadUNSHFLImm : SchedRead; // unshfli
+def ReadXPERMH : SchedRead; // xperm.h
+def ReadXPERMW : SchedRead; // xperm.w
+def ReadPACK : SchedRead; // pack/packh
+def ReadPACK32 : SchedRead; // packw
+def ReadPACKU : SchedRead; // packu
+def ReadPACKU32 : SchedRead; // packuw +
+// Zbr extension
+def ReadCRCB : SchedRead; // crc32.b
+def ReadCRCH : SchedRead; // crc32.h
+def ReadCRCW : SchedRead; // crc32.w
+def ReadCRCD : SchedRead; // crc32.d
+def ReadCRCCB : SchedRead; // crc32c.b
+def ReadCRCCH : SchedRead; // crc32c.h
+def ReadCRCCW : SchedRead; // crc32c.w
+def ReadCRCCD : SchedRead; // crc32c.d +
+// Zbt extension
+def ReadCMix : SchedRead; // cmix
+def ReadCMov : SchedRead; // cmov
+def ReadFSReg : SchedRead; // fsl/fsr
+def ReadFSRImm : SchedRead; // fsri
+def ReadFSReg32 : SchedRead; // fslw/fsrw
+def ReadFSRImm32 : SchedRead; // fsriw +
/// Define default scheduler resources for B.
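+// Editor's sketch (hypothetical processor model, not part of this patch):
+// the Unsupported* multiclasses below mark these writes/reads unsupported;
+// a core that does implement an extension would instead bind each pair to
+// its pipelines, e.g.
+//   def : WriteRes<WriteCRCB, [MyCoreALU]> { let Latency = 3; }
+//   def : ReadAdvance<ReadCRCB, 0>;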
multiclass UnsupportedSchedZba { @@ -128,6 +226,20 @@ def : ReadAdvance<ReadSingleBitImm, 0>; } }
+multiclass UnsupportedSchedZbe { +let Unsupported = true in {
+def : WriteRes<WriteDecompress, []>;
+def : WriteRes<WriteCompress, []>;
+def : WriteRes<WriteDecompress32, []>;
+def : WriteRes<WriteCompress32, []>; +
+def : ReadAdvance<ReadDecompress, 0>;
+def : ReadAdvance<ReadCompress, 0>;
+def : ReadAdvance<ReadDecompress32, 0>;
+def : ReadAdvance<ReadCompress32, 0>; +} +} +
multiclass UnsupportedSchedZbf { let Unsupported = true in { def : WriteRes<WriteBFP, []>;
@@ -137,3 +249,97 @@ def : ReadAdvance<ReadBFP, 0>; def : ReadAdvance<ReadBFP32, 0>; } } +
+multiclass UnsupportedSchedZbm { +let Unsupported = true in {
+def : WriteRes<WriteBMatrix, []>; +
+def : ReadAdvance<ReadBMatrix, 0>; +} +} +
+multiclass UnsupportedSchedZbp { +let Unsupported = true in {
+def : WriteRes<WriteORC, []>;
+def : WriteRes<WriteREV, []>;
+def : WriteRes<WriteORC32, []>;
+def : WriteRes<WriteREV32, []>;
+def : WriteRes<WriteREVImm, []>;
+def : WriteRes<WriteORCImm, []>;
+def : WriteRes<WriteREVImm32, []>;
+def : WriteRes<WriteORCImm32, []>;
+def : WriteRes<WriteSHFL, []>;
+def : WriteRes<WriteUNSHFL, []>;
+def : WriteRes<WriteSHFL32, []>;
+def : WriteRes<WriteUNSHFL32, []>;
+def : WriteRes<WriteSHFLImm, []>;
+def : WriteRes<WriteUNSHFLImm, []>;
+def : WriteRes<WriteXPERMH, []>;
+def : WriteRes<WriteXPERMW, []>;
+def : WriteRes<WritePACK, []>;
+def : WriteRes<WritePACK32, []>;
+def : WriteRes<WritePACKU, []>;
+def : WriteRes<WritePACKU32, []>; +
+def : ReadAdvance<ReadORC, 0>;
+def : ReadAdvance<ReadREV, 0>;
+def : ReadAdvance<ReadORC32, 0>;
+def : ReadAdvance<ReadREV32, 0>;
+def : ReadAdvance<ReadREVImm, 0>;
+def : ReadAdvance<ReadORCImm, 0>;
+def : ReadAdvance<ReadREVImm32, 0>;
+def : ReadAdvance<ReadORCImm32, 0>;
+def : ReadAdvance<ReadSHFL, 0>;
+def : ReadAdvance<ReadUNSHFL, 0>;
+def : ReadAdvance<ReadSHFL32, 0>;
+def : ReadAdvance<ReadUNSHFL32, 0>;
+def : ReadAdvance<ReadSHFLImm, 0>;
+def : ReadAdvance<ReadUNSHFLImm, 0>;
+def : ReadAdvance<ReadXPERMH, 0>;
+def : ReadAdvance<ReadXPERMW, 0>;
+def : ReadAdvance<ReadPACK, 0>;
+def : ReadAdvance<ReadPACK32, 0>;
+def : ReadAdvance<ReadPACKU, 0>;
+def : ReadAdvance<ReadPACKU32, 0>; +} +} +
+multiclass UnsupportedSchedZbr { +let Unsupported = true in {
+def : WriteRes<WriteCRCB, []>;
+def : WriteRes<WriteCRCH, []>;
+def : WriteRes<WriteCRCW, []>;
+def : WriteRes<WriteCRCD, []>;
+def : WriteRes<WriteCRCCB, []>;
+def : WriteRes<WriteCRCCH, []>;
+def : WriteRes<WriteCRCCW, []>;
+def : WriteRes<WriteCRCCD, []>; +
+def : ReadAdvance<ReadCRCB, 0>;
+def : ReadAdvance<ReadCRCH, 0>;
+def : ReadAdvance<ReadCRCW, 0>;
+def : ReadAdvance<ReadCRCD, 0>;
+def : ReadAdvance<ReadCRCCB, 0>;
+def : ReadAdvance<ReadCRCCH, 0>;
+def : ReadAdvance<ReadCRCCW, 0>;
+def : ReadAdvance<ReadCRCCD, 0>; +} +} +
+multiclass UnsupportedSchedZbt { +let Unsupported = true in {
+def : WriteRes<WriteCMix, []>;
+def : WriteRes<WriteCMov, []>;
+def : WriteRes<WriteFSReg, []>;
+def : WriteRes<WriteFSRImm, []>;
+def : WriteRes<WriteFSReg32, []>;
+def : WriteRes<WriteFSRImm32, []>; +
+def : ReadAdvance<ReadCMix, 0>;
+def : ReadAdvance<ReadCMov, 0>;
+def : ReadAdvance<ReadFSReg, 0>;
+def : ReadAdvance<ReadFSRImm, 0>;
+def : ReadAdvance<ReadFSReg32, 0>;
+def : ReadAdvance<ReadFSRImm32, 0>; +} +}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 976e4ccb1422..7589b44b81d3 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -15,6 +15,7 @@ #include "RISCVCallLowering.h" #include "RISCVFrameLowering.h" #include "RISCVLegalizerInfo.h"
+#include "RISCVMacroFusion.h"
#include "RISCVRegisterBankInfo.h" #include "RISCVTargetMachine.h" #include "llvm/MC/TargetRegistry.h"
@@ -28,16 +29,21 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "RISCVGenSubtargetInfo.inc"
-static cl::opt<unsigned> RVVVectorBitsMax(
+static cl::opt<bool> EnableSubRegLiveness("riscv-enable-subreg-liveness",
+ cl::init(false), cl::Hidden); +
+static cl::opt<int> RVVVectorBitsMax(
"riscv-v-vector-bits-max",
cl::desc("Assume V extension vector registers are at most this big, " "with zero meaning no maximum size is assumed."),
cl::init(0), cl::Hidden);
-static cl::opt<unsigned> RVVVectorBitsMin(
+static cl::opt<int> RVVVectorBitsMin(
"riscv-v-vector-bits-min",
cl::desc("Assume V extension vector registers are at least this big, "
- "with zero meaning no minimum size is assumed."),
+ "with zero meaning no minimum size is assumed. A value of -1 "
+ "means use Zvl*b extension. 
This is primarily used to enable " + "autovectorization with fixed width vectors."), cl::init(0), cl::Hidden);
static cl::opt<unsigned> RVVVectorLMULMax( @@ -46,11 +52,6 @@ static cl::opt<unsigned> RVVVectorLMULMax( "Fractional LMUL values are not supported."), cl::init(8), cl::Hidden);
-static cl::opt<unsigned> RVVVectorELENMax(
- "riscv-v-fixed-length-vector-elen-max",
- cl::desc("The maximum ELEN value to use for fixed length vectors."),
- cl::init(64), cl::Hidden); -
static cl::opt<bool> RISCVDisableUsingConstantPoolForLargeInts(
"riscv-disable-using-constant-pool-for-large-ints",
cl::desc("Disable using constant pool for large integers."),
@@ -69,11 +70,8 @@ RISCVSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef CPU, StringRef ABIName) {
// Determine default and user-specified characteristics bool Is64Bit = TT.isArch64Bit();
- if (CPU.empty())
+ if (CPU.empty() || CPU == "generic")
CPU = Is64Bit ? "generic-rv64" : "generic-rv32";
- if (CPU == "generic") - report_fatal_error(Twine("CPU 'generic' is not supported. Use ") + - (Is64Bit ? "generic-rv64" : "generic-rv32"));
if (TuneCPU.empty()) TuneCPU = CPU;
@@ -144,7 +142,7 @@ unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const {
// ZvlLen specifies the minimum required vlen. The upper bound provided by // riscv-v-vector-bits-max should be no less than it.
- if (RVVVectorBitsMax < ZvlLen)
+ if (RVVVectorBitsMax < (int)ZvlLen)
report_fatal_error("riscv-v-vector-bits-max specified is lower " "than the Zvl*b limitation");
@@ -162,14 +160,18 @@ unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const { }
unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
+ assert(hasVInstructions() && + "Tried to get vector length without Zve or V extension support!"); +
+ if (RVVVectorBitsMin == -1) + return ZvlLen; +
// ZvlLen specifies the minimum required vlen. The lower bound provided by // riscv-v-vector-bits-min should be no less than it.
- if (RVVVectorBitsMin != 0 && RVVVectorBitsMin < ZvlLen)
+ if (RVVVectorBitsMin != 0 && RVVVectorBitsMin < (int)ZvlLen)
report_fatal_error("riscv-v-vector-bits-min specified is lower " "than the Zvl*b limitation");
- assert(hasVInstructions() && - "Tried to get vector length without Zve or V extension support!");
// FIXME: Change to >= 32 when VLEN = 32 is supported assert( (RVVVectorBitsMin == 0 ||
@@ -195,17 +197,19 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
std::max<unsigned>(std::min<unsigned>(RVVVectorLMULMax, 8), 1)); }
-unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
- assert(hasVInstructions() && - "Tried to get maximum ELEN without Zve or V extension support!");
- assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 && - isPowerOf2_32(RVVVectorELENMax) && - "V extension requires a ELEN to be a power of 2 between 8 and 64!");
- unsigned ELEN = hasVInstructionsI64() ? 64 : 32;
- return PowerOf2Floor( - std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, ELEN), 8)); -} -
bool RISCVSubtarget::useRVVForFixedLengthVectors() const { return hasVInstructions() && getMinRVVVectorSizeInBits() != 0; } +
+bool RISCVSubtarget::enableSubRegLiveness() const {
+ if (EnableSubRegLiveness.getNumOccurrences())
+ return EnableSubRegLiveness;
+ // Enable subregister liveness for RVV to better handle LMUL>1 and segment
+ // load/store.
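+ // Editor's illustration (assumed example): at LMUL=2 a value occupies a
+ // register group such as {v8, v9}, and a segment load like vlseg2e32.v
+ // writes several destination registers at once; subregister liveness lets
+ // the allocator track each member register instead of the whole group.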
+ return hasVInstructions(); +} +
+void RISCVSubtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(createRISCVMacroFusionDAGMutation()); +}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 34c6e8e684ac..831f7fadaa62 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -20,7 +20,7 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h"
@@ -34,22 +34,6 @@ class StringRef; class RISCVSubtarget : public RISCVGenSubtargetInfo { public:
- enum ExtZvl : unsigned { - NotSet = 0, - Zvl32b = 32, - Zvl64b = 64, - Zvl128b = 128, - Zvl256b = 256, - Zvl512b = 512, - Zvl1024b = 1024, - Zvl2048b = 2048, - Zvl4096b = 4096, - Zvl8192b = 8192, - Zvl16384b = 16384, - Zvl32768b = 32768, - Zvl65536b = 65536 - }; -
enum RISCVProcFamilyEnum : uint8_t { Others, SiFive7, @@ -65,6 +49,7 @@ private: bool HasStdExtF = false; bool HasStdExtD = false; bool HasStdExtC = false;
+ bool HasStdExtZihintpause = false;
bool HasStdExtZba = false; bool HasStdExtZbb = false; bool HasStdExtZbc = false; @@ -81,8 +66,13 @@ private: bool HasStdExtZve64x = false; bool HasStdExtZve64f = false; bool HasStdExtZve64d = false;
+ bool HasStdExtZvfh = false;
bool HasStdExtZfhmin = false; bool HasStdExtZfh = false;
+ bool HasStdExtZfinx = false; + bool HasStdExtZdinx = false; + bool HasStdExtZhinxmin = false; + bool HasStdExtZhinx = false;
bool HasStdExtZbkb = false; bool HasStdExtZbkc = false; bool HasStdExtZbkx = false; @@ -96,13 +86,19 @@ private: bool HasStdExtZks = false; bool HasStdExtZkt = false; bool HasStdExtZk = false;
+ bool HasStdExtZicbom = false; + bool HasStdExtZicboz = false; + bool HasStdExtZicbop = false;
bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; bool EnableRVCHintInstrs = true;
+ bool EnableDefaultUnroll = true;
bool EnableSaveRestore = false;
+ bool EnableUnalignedScalarMem = false; + bool HasLUIADDIFusion = false;
unsigned XLen = 32;
- ExtZvl ZvlLen = ExtZvl::NotSet;
+ unsigned ZvlLen = 0;
MVT XLenVT = MVT::i32; uint8_t MaxInterleaveFactor = 2; RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
@@ -157,6 +153,7 @@ public: bool hasStdExtD() const { return HasStdExtD; } bool hasStdExtC() const { return HasStdExtC; } bool hasStdExtV() const { return HasStdExtV; }
+ bool hasStdExtZihintpause() const { return HasStdExtZihintpause; }
bool hasStdExtZba() const { return HasStdExtZba; } bool hasStdExtZbb() const { return HasStdExtZbb; } bool hasStdExtZbc() const { return HasStdExtZbc; }
@@ -167,9 +164,14 @@ public: bool hasStdExtZbr() const { return HasStdExtZbr; } bool hasStdExtZbs() const { return HasStdExtZbs; } bool hasStdExtZbt() const { return HasStdExtZbt; }
- bool hasStdExtZvl() const { return ZvlLen != ExtZvl::NotSet; }
+ bool hasStdExtZvl() const { return ZvlLen != 0; }
+ bool hasStdExtZvfh() const { return HasStdExtZvfh; }
bool hasStdExtZfhmin() const { return HasStdExtZfhmin; } bool hasStdExtZfh() const { return HasStdExtZfh; }
+ bool hasStdExtZfinx() const { return HasStdExtZfinx; }
+ bool hasStdExtZdinx() const { return HasStdExtZdinx; }
+ bool hasStdExtZhinxmin() const { return HasStdExtZhinxmin; }
+ bool hasStdExtZhinx() const {
return HasStdExtZhinx; }
bool hasStdExtZbkb() const { return HasStdExtZbkb; } bool hasStdExtZbkc() const { return HasStdExtZbkc; } bool hasStdExtZbkx() const { return HasStdExtZbkx; }
@@ -179,11 +181,17 @@ public: bool hasStdExtZksed() const { return HasStdExtZksed; } bool hasStdExtZksh() const { return HasStdExtZksh; } bool hasStdExtZkr() const { return HasStdExtZkr; }
+ bool hasStdExtZicbom() const { return HasStdExtZicbom; }
+ bool hasStdExtZicboz() const { return HasStdExtZicboz; }
+ bool hasStdExtZicbop() const { return HasStdExtZicbop; }
bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } bool enableRVCHintInstrs() const { return EnableRVCHintInstrs; }
+ bool enableDefaultUnroll() const { return EnableDefaultUnroll; }
bool enableSaveRestore() const { return EnableSaveRestore; }
+ bool enableUnalignedScalarMem() const { return EnableUnalignedScalarMem; }
+ bool hasLUIADDIFusion() const { return HasLUIADDIFusion; }
MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } unsigned getFLen() const { @@ -195,27 +203,34 @@ public: return 0; }
- unsigned getMinVLen() const { return ZvlLen; }
+ unsigned getELEN() const {
+ assert(hasVInstructions() && "Expected V extension");
+ return hasVInstructionsI64() ? 64 : 32; + }
+ unsigned getRealMinVLen() const {
+ unsigned VLen = getMinRVVVectorSizeInBits();
+ return VLen == 0 ? getArchMinVLen() : VLen; + }
+ unsigned getRealMaxVLen() const {
+ unsigned VLen = getMaxRVVVectorSizeInBits();
+ return VLen == 0 ? getArchMaxVLen() : VLen; + }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
bool isRegisterReservedByUser(Register i) const { assert(i < RISCV::NUM_TARGET_REGS && "Register out of range"); return UserReservedRegister[i]; }
+ bool hasMacroFusion() const { return hasLUIADDIFusion(); } +
// Vector codegen related methods.
- bool hasVInstructions() const { return HasStdExtV || HasStdExtZve32x; }
- bool hasVInstructionsI64() const { return HasStdExtV || HasStdExtZve64x; }
- bool hasVInstructionsF16() const { - return (HasStdExtV || HasStdExtZve32f) && HasStdExtZfh; - }
+ bool hasVInstructions() const { return HasStdExtZve32x; }
+ bool hasVInstructionsI64() const { return HasStdExtZve64x; }
+ bool hasVInstructionsF16() const { return HasStdExtZvfh && HasStdExtZfh; }
// FIXME: Consider Zfinx in the future
- bool hasVInstructionsF32() const { - return HasStdExtV || (HasStdExtZve32f && HasStdExtF); - }
+ bool hasVInstructionsF32() const { return HasStdExtZve32f && HasStdExtF; }
// FIXME: Consider Zdinx in the future
- bool hasVInstructionsF64() const { - return HasStdExtV || (HasStdExtZve64d && HasStdExtD); - }
+ bool hasVInstructionsF64() const { return HasStdExtZve64d && HasStdExtD; }
// F16 and F64 both require F32. bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); }
unsigned getMaxInterleaveFactor() const { @@ -229,6 +244,18 @@ protected:
std::unique_ptr<LegalizerInfo> Legalizer; std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ // Return the known range for the bit length of RVV data registers as set
+ // at the command line. A value of 0 means nothing is known about that particular
+ // limit beyond what's implied by the architecture.
+ // NOTE: Please use getRealMinVLen and getRealMaxVLen instead!
+ unsigned getMaxRVVVectorSizeInBits() const;
+ unsigned getMinRVVVectorSizeInBits() const; +
+ // Return the known range for the bit length of RVV data registers as indicated
+ // by -march and -mattr.
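+ // e.g. (editor's note, assumed flags): -march=rv64gcv_zvl256b makes
+ // getArchMinVLen() return 256, while getArchMaxVLen() is the V-spec
+ // ceiling of 65536 bits.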
+ unsigned getArchMinVLen() const { return ZvlLen; }
+ unsigned getArchMaxVLen() const { return 65536; } +
public: const CallLowering *getCallLowering() const override; InstructionSelector *getInstructionSelector() const override;
@@ -241,14 +268,13 @@ public: // pool if exceeded. unsigned getMaxBuildIntsCost() const;
- // Return the known range for the bit length of RVV data registers. A value - // of 0 means nothing is known about that particular limit beyond what's - // implied by the architecture.
- unsigned getMaxRVVVectorSizeInBits() const;
- unsigned getMinRVVVectorSizeInBits() const;
unsigned getMaxLMULForFixedLengthVectors() const;
- unsigned getMaxELENForFixedLengthVectors() const;
bool useRVVForFixedLengthVectors() const; +
+ bool enableSubRegLiveness() const override; +
+ void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+ &Mutations) const override; };
} // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index db5e2f1eeb6f..b2707b753e87 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -13,6 +13,8 @@ #include "RISCVTargetMachine.h" #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h" +#include "RISCVMacroFusion.h"
#include "RISCVTargetObjectFile.h" #include "RISCVTargetTransformInfo.h" #include "TargetInfo/RISCVTargetInfo.h"
@@ -22,6 +24,8 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h"
@@ -30,13 +34,20 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
+static cl::opt<bool> EnableRedundantCopyElimination(
+ "riscv-enable-copyelim",
+ cl::desc("Enable the redundant copy elimination pass"), cl::init(true),
+ cl::Hidden); +
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
auto *PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR);
+ initializeRISCVMakeCompressibleOptPass(*PR);
initializeRISCVGatherScatterLoweringPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVSExtWRemovalPass(*PR);
@@ -53,9 +64,7 @@ static StringRef computeDataLayout(const Triple &TT) {
static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) {
- if (!RM.hasValue()) - return Reloc::Static; - return *RM;
+ return RM.value_or(Reloc::Static); }
RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
@@ -72,6 +81,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
// RISC-V supports the MachineOutliner.
setMachineOutliner(true);
+ setSupportsDefaultOutlining(true); }
const RISCVSubtarget * @@ -109,7 +119,7 @@ RISCVTargetMachine::getSubtargetImpl(const Function &F) const { }
TargetTransformInfo
-RISCVTargetMachine::getTargetTransformInfo(const Function &F) {
+RISCVTargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(RISCVTTIImpl(this, F)); }
@@ -132,7 +142,30 @@ public: return getTM<RISCVTargetMachine>(); }
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
+ if (ST.hasMacroFusion()) {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createRISCVMacroFusionDAGMutation());
+ return DAG; + }
+ return nullptr; + } +
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
+ if (ST.hasMacroFusion()) {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createRISCVMacroFusionDAGMutation());
+ return DAG; + }
+ return nullptr; + } +
void addIRPasses() override;
+ bool addPreISel() override;
bool addInstSelector() override; bool addIRTranslator() override; bool addLegalizeMachineIR() override;
@@ -143,6 +176,7 @@ public: void addPreSched2() override; void addMachineSSAOptimization() override; void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
}; } // namespace
@@ -158,8 +192,18 @@ void RISCVPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); }
+bool RISCVPassConfig::addPreISel() {
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ // Add a barrier before instruction selection so that we will not get
+ // deleted block address after enabling default outlining. See D99707 for
+ // more details.
+ addPass(createBarrierNoopPass()); + }
+ return false; +} +
bool RISCVPassConfig::addInstSelector() {
- addPass(createRISCVISelDag(getRISCVTargetMachine()));
+ addPass(createRISCVISelDag(getRISCVTargetMachine(), getOptLevel()));
return false; }
@@ -186,7 +230,10 @@ bool RISCVPassConfig::addGlobalInstructionSelect() {
void RISCVPassConfig::addPreSched2() {}
-void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
+void RISCVPassConfig::addPreEmitPass() {
+ addPass(&BranchRelaxationPassID);
+ addPass(createRISCVMakeCompressibleOptPass()); +}
void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandPseudoPass());
@@ -208,3 +255,28 @@ void RISCVPassConfig::addPreRegAlloc() { addPass(createRISCVMergeBaseOffsetOptPass()); addPass(createRISCVInsertVSETVLIPass()); } +
+void RISCVPassConfig::addPostRegAlloc() {
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
+ addPass(createRISCVRedundantCopyEliminationPass()); +} +
+yaml::MachineFunctionInfo *
+RISCVTargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::RISCVMachineFunctionInfo(); +} +
+yaml::MachineFunctionInfo *
+RISCVTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<RISCVMachineFunctionInfo>();
+ return new yaml::RISCVMachineFunctionInfo(*MFI); +} +
+bool RISCVTargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const auto &YamlMFI =
+ static_cast<const yaml::RISCVMachineFunctionInfo &>(MFI);
+ PFS.MF.getInfo<RISCVMachineFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+ return false; +}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h index 3156333f7ee1..087646fb5ed9 100644 ---
a/llvm/lib/Target/RISCV/RISCVTargetMachine.h +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -42,10 +42,18 @@ public: return TLOF.get(); }
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DstAS) const override; +
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override; };
} // namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 99e6774a02e4..29d3c5e491de 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -11,6 +11,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/TargetLowering.h"
+#include <cmath>
using namespace llvm;
#define DEBUG_TYPE "riscvtti"
@@ -131,19 +132,17 @@ bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { }
Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
- // There is no assumption of the maximum vector length in V specification.
- // We use the value specified by users as the maximum vector length.
- // This function will use the assumed maximum vector length to get the
- // maximum vscale for LoopVectorizer.
- // If users do not specify the maximum vector length, we have no way to
- // know whether the LoopVectorizer is safe to do or not.
- // We only consider to use single vector register (LMUL = 1) to vectorize.
- unsigned MaxVectorSizeInBits = ST->getMaxRVVVectorSizeInBits();
- if (ST->hasVInstructions() && MaxVectorSizeInBits != 0)
- return MaxVectorSizeInBits / RISCV::RVVBitsPerBlock;
+ if (ST->hasVInstructions())
+ return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
return BaseT::getMaxVScale(); }
+Optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
+ if (ST->hasVInstructions())
+ return ST->getRealMinVLen() / RISCV::RVVBitsPerBlock;
+ return BaseT::getVScaleForTuning(); +} +
TypeSize RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned LMUL = PowerOf2Floor( @@ -153,7 +152,7 @@ RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
return TypeSize::getFixed(ST->getXLen());
case TargetTransformInfo::RGK_FixedWidthVector: return TypeSize::getFixed(
- ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0);
+ ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
case TargetTransformInfo::RGK_ScalableVector: return TypeSize::getScalable( ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
@@ -162,6 +161,61 @@ RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); }
+InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); +
+ unsigned Cost = 2; // vslidedown+vslideup.
+ // TODO: LMUL should increase cost.
+ // TODO: Multiplying by LT.first implies this legalizes into multiple copies
+ // of similar code, but I think we expand through memory.
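+ // Editor's sketch of the assumed two-instruction lowering:
+ //   vslidedown.vx v8, v8, a0   ; drop the first Index elements of op0
+ //   vslideup.vx   v8, v9, a1   ; slide op1 in above them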
+ return Cost * LT.first; +} +
+InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *Tp, ArrayRef<int> Mask,
+ int Index, VectorType *SubTp,
+ ArrayRef<const Value *> Args) {
+ if (isa<ScalableVectorType>(Tp)) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ switch (Kind) { + default:
+ // Fallthrough to generic handling.
+ // TODO: Most of these cases will return getInvalid in generic code, and
+ // must be implemented here. + break;
+ case TTI::SK_Broadcast: { + return LT.first * 1; + }
+ case TTI::SK_Splice: + return getSpliceCost(Tp, Index);
+ case TTI::SK_Reverse:
+ // Most of the cost here is producing the vrgather index register
+ // Example sequence:
+ // csrr a0, vlenb
+ // srli a0, a0, 3
+ // addi a0, a0, -1
+ // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
+ // vid.v v9
+ // vrsub.vx v10, v9, a0
+ // vrgather.vv v9, v8, v10
+ return LT.first * 6; + } + } +
+ return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); +} +
+InstructionCost
+RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ if (!isa<ScalableVectorType>(Src))
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind); +
+ return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); +} +
InstructionCost RISCVTTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
@@ -176,31 +230,152 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I);
- // FIXME: Only supporting fixed vectors for now.
- if (!isa<FixedVectorType>(DataTy))
- return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); -
- auto *VTy = cast<FixedVectorType>(DataTy);
- unsigned NumLoads = VTy->getNumElements();
- InstructionCost MemOpCost = - getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, CostKind, I);
+ // Cost is proportional to the number of memory operations implied. For
+ // scalable vectors, we use an upper bound on that number since we don't
+ // know exactly what VL will be.
+ auto &VTy = *cast<VectorType>(DataTy);
+ InstructionCost MemOpCost = getMemoryOpCost(Opcode, VTy.getElementType(),
+ Alignment, 0, CostKind, I);
+ unsigned NumLoads = getMaxVLFor(&VTy);
+ return NumLoads * MemOpCost; }
+InstructionCost
+RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ auto *RetTy = ICA.getReturnType();
+ switch (ICA.getID()) {
+ // TODO: add more intrinsics
+ case Intrinsic::experimental_stepvector: {
+ unsigned Cost = 1; // vid
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ return Cost + (LT.first - 1); + }
+ default: + break; + }
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind); +} +
+InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
+ // FIXME: Need to compute legalizing cost for illegal types.
+ if (!isTypeLegal(Src) || !isTypeLegal(Dst))
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); +
+ // Skip if element size of Dst or Src is bigger than ELEN.
+ if (Src->getScalarSizeInBits() > ST->getELEN() ||
+ Dst->getScalarSizeInBits() > ST->getELEN())
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); +
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode"); +
+ // FIXME: Need to consider vsetvli and lmul.
+ int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
+ (int)Log2_32(Src->getScalarSizeInBits());
+ switch (ISD) {
+ case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return 1;
+ case ISD::TRUNCATE: + case ISD::FP_EXTEND: + case ISD::FP_ROUND:
+ // Counts of narrow/widen instructions.
+ return std::abs(PowDiff);
+ case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP:
+ if (std::abs(PowDiff) <= 1) + return 1;
+ // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
+ // so it only needs two conversions.
+ if (Src->isIntOrIntVectorTy()) + return 2;
+ // Counts of narrow/widen instructions.
+ return std::abs(PowDiff); + } + }
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); +} +
+unsigned RISCVTTIImpl::getMaxVLFor(VectorType *Ty) {
+ if (isa<ScalableVectorType>(Ty)) {
+ const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
+ const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
+ const unsigned VectorBitsMax = ST->getRealMaxVLen();
+ return RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize); + }
+ return cast<FixedVectorType>(Ty)->getNumElements(); +} +
+InstructionCost
+RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); +
+ // Skip if scalar size of Ty is bigger than ELEN.
+ if (Ty->getScalarSizeInBits() > ST->getELEN())
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); +
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ if (Ty->getElementType()->isIntegerTy(1))
+ // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
+ // cost 2, but we don't have enough info here, so we slightly overestimate.
+ return (LT.first - 1) + 3; +
+ // IR Reduction is composed of two vmv and one rvv reduction instruction.
+ InstructionCost BaseCost = 2;
+ unsigned VL = getMaxVLFor(Ty);
+ return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL); +} +
+InstructionCost
+RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); +
+ // Skip if scalar size of Ty is bigger than ELEN.
+ if (Ty->getScalarSizeInBits() > ST->getELEN())
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); +
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode"); +
+ if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
+ ISD != ISD::FADD)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); +
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ if (Ty->getElementType()->isIntegerTy(1))
+ // vcpop sequences, see vreduction-mask.ll
+ return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2); +
+ // IR Reduction is composed of two vmv and one rvv reduction instruction.
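+ // e.g. (editor's sketch) an unordered float fadd reduction might be:
+ //   vfmv.s.f     v9, fa0      ; vmv: move the start value in
+ //   vfredusum.vs v9, v8, v9   ; the rvv reduction proper
+ //   vfmv.f.s     fa0, v9      ; vmv: move the result back out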
+ InstructionCost BaseCost = 2;
+ unsigned VL = getMaxVLFor(Ty);
+ if (TTI::requiresOrderedReduction(FMF))
+ return (LT.first - 1) + BaseCost + VL;
+ return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL); +} +
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) {
// TODO: More tuning on benchmarks and metrics with changes as needed // would apply to all settings below to enable performance.
- // Support explicit targets enabled for SiFive with the unrolling preferences - // below
- bool UseDefaultPreferences = true;
- if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
- UseDefaultPreferences = false;
- if (UseDefaultPreferences)
+ if (ST->enableDefaultUnroll())
return BasicTTIImplBase<RISCVTTIImpl>::getUnrollingPreferences(L, SE, UP, ORE);
// Enable Upper bound unrolling universally, not dependent upon the conditions
@@ -276,14 +451,14 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); }
-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
TypeSize Size = Ty->getPrimitiveSizeInBits(); if (Ty->isVectorTy()) {
if (Size.isScalable() && ST->hasVInstructions()) return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
if (ST->useRVVForFixedLengthVectors())
- return divideCeil(Size, ST->getMinRVVVectorSizeInBits());
+ return divideCeil(Size, ST->getRealMinVLen());
} return BaseT::getRegUsageForType(Ty);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index e79c4f75712b..7caf0fedb2ca 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -37,6 +37,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
const RISCVSubtarget *getST() const { return ST; } const RISCVTargetLowering *getTLI() const { return TLI; }
+ unsigned getMaxVLFor(VectorType *Ty);
public: explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -57,10 +58,15 @@ public: bool shouldExpandReduction(const IntrinsicInst *II) const; bool supportsScalableVectors() const { return ST->hasVInstructions(); }
Optional<unsigned> getMaxVScale() const;
+ Optional<unsigned> getVScaleForTuning() const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
- InstructionCost getRegUsageForType(Type *Ty);
+ unsigned getRegUsageForType(Type *Ty); +
+ InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
@@ -73,24 +79,50 @@ public: return ST->useRVVForFixedLengthVectors() ?
16 : 0; }
+ InstructionCost getSpliceCost(VectorType *Tp, int Index);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args = None); +
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind); +
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I);
+ InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr); +
+ InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind); +
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind); +
+ bool isElementTypeLegalForScalableVector(Type *Ty) const {
+ return TLI->isLegalElementTypeForRVV(Ty); + } +
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) { if (!ST->hasVInstructions()) return false;
// Only support fixed vectors if we know the minimum vector size.
- if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+ if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
return false;
// Don't allow elements larger than the ELEN. // FIXME: How to limit for scalable vectors?
if (isa<FixedVectorType>(DataType) &&
- DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+ DataType->getScalarSizeInBits() > ST->getELEN())
return false;
if (Alignment < @@ -112,13 +144,13 @@ public: return false;
// Only support fixed vectors if we know the minimum vector size.
- if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+ if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
return false;
// Don't allow elements larger than the ELEN. // FIXME: How to limit for scalable vectors?
if (isa<FixedVectorType>(DataType) &&
- DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+ DataType->getScalarSizeInBits() > ST->getELEN())
return false;
if (Alignment < @@ -135,6 +167,16 @@ public: return isLegalMaskedGatherScatter(DataType, Alignment); }
+ bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+ // Scalarize masked gather for RV64 if EEW=64 indices aren't supported.
+ return ST->is64Bit() && !ST->hasVInstructionsI64(); + } +
+ bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+ // Scalarize masked scatter for RV64 if EEW=64 indices aren't supported.
+ return ST->is64Bit() && !ST->hasVInstructionsI64(); + } +
/// \returns How the target needs this vector-predicated operation to be /// transformed. TargetTransformInfo::VPLegalization
@@ -145,9 +187,6 @@ public: bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
- if (!ST->hasVInstructions()) - return false; -
if (!VF.isScalable()) return true;
@@ -179,18 +218,53 @@ public: return VF == 1 ? 1 : ST->getMaxInterleaveFactor(); }
- // TODO: We should define RISC-V's own register classes.
- // e.g. register class for FPR.
+ enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
unsigned getNumberOfRegisters(unsigned ClassID) const {
- bool Vector = (ClassID == 1);
- if (Vector) {
- if (ST->hasVInstructions())
+ switch (ClassID) {
+ case RISCVRegisterClass::GPRRC:
+ // 31 = 32 GPR - x0 (zero register)
+ // FIXME: Should we exclude fixed registers like SP, TP or GP?
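+ // (Editor's note: the psABI additionally fixes sp/x2, gp/x3 and tp/x4,
+ // so the number actually allocatable can be lower than 31 in practice.)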
+ return 31;
+ case RISCVRegisterClass::FPRRC:
+ if (ST->hasStdExtF()) return 32; return 0;
+ case RISCVRegisterClass::VRRC:
+ // Although there are 32 vector registers, v0 is special in that it is the
+ // only register that can be used to hold a mask.
+ // FIXME: Should we conservatively return 31 as the number of usable
+ // vector registers?
+ return ST->hasVInstructions() ? 32 : 0; + }
+ llvm_unreachable("unknown register class"); + } +
+ unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
+ if (Vector) + return RISCVRegisterClass::VRRC;
+ if (!Ty) + return RISCVRegisterClass::GPRRC; +
+ Type *ScalarTy = Ty->getScalarType();
+ if ((ScalarTy->isHalfTy() && ST->hasStdExtZfh()) ||
+ (ScalarTy->isFloatTy() && ST->hasStdExtF()) ||
+ (ScalarTy->isDoubleTy() && ST->hasStdExtD())) {
+ return RISCVRegisterClass::FPRRC; + } +
+ return RISCVRegisterClass::GPRRC; + } +
+ const char *getRegisterClassName(unsigned ClassID) const {
+ switch (ClassID) {
+ case RISCVRegisterClass::GPRRC: + return "RISCV::GPRRC";
+ case RISCVRegisterClass::FPRRC: + return "RISCV::FPRRC";
+ case RISCVRegisterClass::VRRC: + return "RISCV::VRRC"; }
- // 31 = 32 GPR - x0 (zero register)
- // FIXME: Should we exclude fixed registers like SP, TP or GP?
- return 31;
+ llvm_unreachable("unknown register class"); } };
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
new file mode 100644 index 000000000000..4156a0026411 --- /dev/null +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
@@ -0,0 +1,63 @@
+//===-- SPIRVAsmBackend.cpp - SPIR-V Assembler Backend ---------*- C++ -*--===// +//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//
+//===----------------------------------------------------------------------===// +
+#include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/EndianStream.h" +
+using namespace llvm; +
+namespace { +
+class SPIRVAsmBackend : public MCAsmBackend { +public:
+ SPIRVAsmBackend(support::endianness Endian) : MCAsmBackend(Endian) {} +
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {} +
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createSPIRVObjectTargetWriter(); + } +
+ // No instruction requires relaxation.
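+ // (Editor's note: SPIR-V modules are streams of fixed-size 32-bit words,
+ // so the relaxation hooks below can all be trivial no-ops.)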
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + + unsigned getNumFixupKinds() const override { return 1; } + + bool mayNeedRelaxation(const MCInst &Inst, + const MCSubtargetInfo &STI) const override { + return false; + } + + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override {} + + bool writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const override { + return false; + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createSPIRVAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, + const MCTargetOptions &) { + return new SPIRVAsmBackend(support::little); +} diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp new file mode 100644 index 000000000000..1a3e35a5f901 --- /dev/null +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -0,0 +1,1072 @@ +//===-- SPIRVBaseInfo.cpp - Top level definitions for SPIRV ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the SPIRV target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#include "SPIRVBaseInfo.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { +namespace SPIRV { + +#define CASE(CLASS, ATTR) \ + case CLASS::ATTR: \ + return #ATTR; +#define CASE_SUF(CLASS, SF, ATTR) \ + case CLASS::SF##_##ATTR: \ + return #ATTR; + +// Implement getEnumName(Enum e) helper functions. +// TODO: re-implement all the functions using TableGen. 
+StringRef getCapabilityName(Capability e) { + switch (e) { + CASE(Capability, Matrix) + CASE(Capability, Shader) + CASE(Capability, Geometry) + CASE(Capability, Tessellation) + CASE(Capability, Addresses) + CASE(Capability, Linkage) + CASE(Capability, Kernel) + CASE(Capability, Vector16) + CASE(Capability, Float16Buffer) + CASE(Capability, Float16) + CASE(Capability, Float64) + CASE(Capability, Int64) + CASE(Capability, Int64Atomics) + CASE(Capability, ImageBasic) + CASE(Capability, ImageReadWrite) + CASE(Capability, ImageMipmap) + CASE(Capability, Pipes) + CASE(Capability, Groups) + CASE(Capability, DeviceEnqueue) + CASE(Capability, LiteralSampler) + CASE(Capability, AtomicStorage) + CASE(Capability, Int16) + CASE(Capability, TessellationPointSize) + CASE(Capability, GeometryPointSize) + CASE(Capability, ImageGatherExtended) + CASE(Capability, StorageImageMultisample) + CASE(Capability, UniformBufferArrayDynamicIndexing) + CASE(Capability, SampledImageArrayDymnamicIndexing) + CASE(Capability, ClipDistance) + CASE(Capability, CullDistance) + CASE(Capability, ImageCubeArray) + CASE(Capability, SampleRateShading) + CASE(Capability, ImageRect) + CASE(Capability, SampledRect) + CASE(Capability, GenericPointer) + CASE(Capability, Int8) + CASE(Capability, InputAttachment) + CASE(Capability, SparseResidency) + CASE(Capability, MinLod) + CASE(Capability, Sampled1D) + CASE(Capability, Image1D) + CASE(Capability, SampledCubeArray) + CASE(Capability, SampledBuffer) + CASE(Capability, ImageBuffer) + CASE(Capability, ImageMSArray) + CASE(Capability, StorageImageExtendedFormats) + CASE(Capability, ImageQuery) + CASE(Capability, DerivativeControl) + CASE(Capability, InterpolationFunction) + CASE(Capability, TransformFeedback) + CASE(Capability, GeometryStreams) + CASE(Capability, StorageImageReadWithoutFormat) + CASE(Capability, StorageImageWriteWithoutFormat) + CASE(Capability, MultiViewport) + CASE(Capability, SubgroupDispatch) + CASE(Capability, NamedBarrier) + CASE(Capability, PipeStorage) + CASE(Capability, GroupNonUniform) + CASE(Capability, GroupNonUniformVote) + CASE(Capability, GroupNonUniformArithmetic) + CASE(Capability, GroupNonUniformBallot) + CASE(Capability, GroupNonUniformShuffle) + CASE(Capability, GroupNonUniformShuffleRelative) + CASE(Capability, GroupNonUniformClustered) + CASE(Capability, GroupNonUniformQuad) + CASE(Capability, SubgroupBallotKHR) + CASE(Capability, DrawParameters) + CASE(Capability, SubgroupVoteKHR) + CASE(Capability, StorageBuffer16BitAccess) + CASE(Capability, StorageUniform16) + CASE(Capability, StoragePushConstant16) + CASE(Capability, StorageInputOutput16) + CASE(Capability, DeviceGroup) + CASE(Capability, MultiView) + CASE(Capability, VariablePointersStorageBuffer) + CASE(Capability, VariablePointers) + CASE(Capability, AtomicStorageOps) + CASE(Capability, SampleMaskPostDepthCoverage) + CASE(Capability, StorageBuffer8BitAccess) + CASE(Capability, UniformAndStorageBuffer8BitAccess) + CASE(Capability, StoragePushConstant8) + CASE(Capability, DenormPreserve) + CASE(Capability, DenormFlushToZero) + CASE(Capability, SignedZeroInfNanPreserve) + CASE(Capability, RoundingModeRTE) + CASE(Capability, RoundingModeRTZ) + CASE(Capability, Float16ImageAMD) + CASE(Capability, ImageGatherBiasLodAMD) + CASE(Capability, FragmentMaskAMD) + CASE(Capability, StencilExportEXT) + CASE(Capability, ImageReadWriteLodAMD) + CASE(Capability, SampleMaskOverrideCoverageNV) + CASE(Capability, GeometryShaderPassthroughNV) + CASE(Capability, ShaderViewportIndexLayerEXT) + CASE(Capability, 
ShaderViewportMaskNV) + CASE(Capability, ShaderStereoViewNV) + CASE(Capability, PerViewAttributesNV) + CASE(Capability, FragmentFullyCoveredEXT) + CASE(Capability, MeshShadingNV) + CASE(Capability, ShaderNonUniformEXT) + CASE(Capability, RuntimeDescriptorArrayEXT) + CASE(Capability, InputAttachmentArrayDynamicIndexingEXT) + CASE(Capability, UniformTexelBufferArrayDynamicIndexingEXT) + CASE(Capability, StorageTexelBufferArrayDynamicIndexingEXT) + CASE(Capability, UniformBufferArrayNonUniformIndexingEXT) + CASE(Capability, SampledImageArrayNonUniformIndexingEXT) + CASE(Capability, StorageBufferArrayNonUniformIndexingEXT) + CASE(Capability, StorageImageArrayNonUniformIndexingEXT) + CASE(Capability, InputAttachmentArrayNonUniformIndexingEXT) + CASE(Capability, UniformTexelBufferArrayNonUniformIndexingEXT) + CASE(Capability, StorageTexelBufferArrayNonUniformIndexingEXT) + CASE(Capability, RayTracingNV) + CASE(Capability, SubgroupShuffleINTEL) + CASE(Capability, SubgroupBufferBlockIOINTEL) + CASE(Capability, SubgroupImageBlockIOINTEL) + CASE(Capability, SubgroupImageMediaBlockIOINTEL) + CASE(Capability, SubgroupAvcMotionEstimationINTEL) + CASE(Capability, SubgroupAvcMotionEstimationIntraINTEL) + CASE(Capability, SubgroupAvcMotionEstimationChromaINTEL) + CASE(Capability, GroupNonUniformPartitionedNV) + CASE(Capability, VulkanMemoryModelKHR) + CASE(Capability, VulkanMemoryModelDeviceScopeKHR) + CASE(Capability, ImageFootprintNV) + CASE(Capability, FragmentBarycentricNV) + CASE(Capability, ComputeDerivativeGroupQuadsNV) + CASE(Capability, ComputeDerivativeGroupLinearNV) + CASE(Capability, FragmentDensityEXT) + CASE(Capability, PhysicalStorageBufferAddressesEXT) + CASE(Capability, CooperativeMatrixNV) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getSourceLanguageName(SourceLanguage e) { + switch (e) { + CASE(SourceLanguage, Unknown) + CASE(SourceLanguage, ESSL) + CASE(SourceLanguage, GLSL) + CASE(SourceLanguage, OpenCL_C) + CASE(SourceLanguage, OpenCL_CPP) + CASE(SourceLanguage, HLSL) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getExecutionModelName(ExecutionModel e) { + switch (e) { + CASE(ExecutionModel, Vertex) + CASE(ExecutionModel, TessellationControl) + CASE(ExecutionModel, TessellationEvaluation) + CASE(ExecutionModel, Geometry) + CASE(ExecutionModel, Fragment) + CASE(ExecutionModel, GLCompute) + CASE(ExecutionModel, Kernel) + CASE(ExecutionModel, TaskNV) + CASE(ExecutionModel, MeshNV) + CASE(ExecutionModel, RayGenerationNV) + CASE(ExecutionModel, IntersectionNV) + CASE(ExecutionModel, AnyHitNV) + CASE(ExecutionModel, ClosestHitNV) + CASE(ExecutionModel, MissNV) + CASE(ExecutionModel, CallableNV) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getAddressingModelName(AddressingModel e) { + switch (e) { + CASE(AddressingModel, Logical) + CASE(AddressingModel, Physical32) + CASE(AddressingModel, Physical64) + CASE(AddressingModel, PhysicalStorageBuffer64EXT) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getMemoryModelName(MemoryModel e) { + switch (e) { + CASE(MemoryModel, Simple) + CASE(MemoryModel, GLSL450) + CASE(MemoryModel, OpenCL) + CASE(MemoryModel, VulkanKHR) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getExecutionModeName(ExecutionMode e) { + switch (e) { + CASE(ExecutionMode, Invocations) + CASE(ExecutionMode, SpacingEqual) + CASE(ExecutionMode, SpacingFractionalEven) + CASE(ExecutionMode, SpacingFractionalOdd) + CASE(ExecutionMode, VertexOrderCw) + 
CASE(ExecutionMode, VertexOrderCcw) + CASE(ExecutionMode, PixelCenterInteger) + CASE(ExecutionMode, OriginUpperLeft) + CASE(ExecutionMode, OriginLowerLeft) + CASE(ExecutionMode, EarlyFragmentTests) + CASE(ExecutionMode, PointMode) + CASE(ExecutionMode, Xfb) + CASE(ExecutionMode, DepthReplacing) + CASE(ExecutionMode, DepthGreater) + CASE(ExecutionMode, DepthLess) + CASE(ExecutionMode, DepthUnchanged) + CASE(ExecutionMode, LocalSize) + CASE(ExecutionMode, LocalSizeHint) + CASE(ExecutionMode, InputPoints) + CASE(ExecutionMode, InputLines) + CASE(ExecutionMode, InputLinesAdjacency) + CASE(ExecutionMode, Triangles) + CASE(ExecutionMode, InputTrianglesAdjacency) + CASE(ExecutionMode, Quads) + CASE(ExecutionMode, Isolines) + CASE(ExecutionMode, OutputVertices) + CASE(ExecutionMode, OutputPoints) + CASE(ExecutionMode, OutputLineStrip) + CASE(ExecutionMode, OutputTriangleStrip) + CASE(ExecutionMode, VecTypeHint) + CASE(ExecutionMode, ContractionOff) + CASE(ExecutionMode, Initializer) + CASE(ExecutionMode, Finalizer) + CASE(ExecutionMode, SubgroupSize) + CASE(ExecutionMode, SubgroupsPerWorkgroup) + CASE(ExecutionMode, SubgroupsPerWorkgroupId) + CASE(ExecutionMode, LocalSizeId) + CASE(ExecutionMode, LocalSizeHintId) + CASE(ExecutionMode, PostDepthCoverage) + CASE(ExecutionMode, DenormPreserve) + CASE(ExecutionMode, DenormFlushToZero) + CASE(ExecutionMode, SignedZeroInfNanPreserve) + CASE(ExecutionMode, RoundingModeRTE) + CASE(ExecutionMode, RoundingModeRTZ) + CASE(ExecutionMode, StencilRefReplacingEXT) + CASE(ExecutionMode, OutputLinesNV) + CASE(ExecutionMode, DerivativeGroupQuadsNV) + CASE(ExecutionMode, DerivativeGroupLinearNV) + CASE(ExecutionMode, OutputTrianglesNV) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getStorageClassName(StorageClass e) { + switch (e) { + CASE(StorageClass, UniformConstant) + CASE(StorageClass, Input) + CASE(StorageClass, Uniform) + CASE(StorageClass, Output) + CASE(StorageClass, Workgroup) + CASE(StorageClass, CrossWorkgroup) + CASE(StorageClass, Private) + CASE(StorageClass, Function) + CASE(StorageClass, Generic) + CASE(StorageClass, PushConstant) + CASE(StorageClass, AtomicCounter) + CASE(StorageClass, Image) + CASE(StorageClass, StorageBuffer) + CASE(StorageClass, CallableDataNV) + CASE(StorageClass, IncomingCallableDataNV) + CASE(StorageClass, RayPayloadNV) + CASE(StorageClass, HitAttributeNV) + CASE(StorageClass, IncomingRayPayloadNV) + CASE(StorageClass, ShaderRecordBufferNV) + CASE(StorageClass, PhysicalStorageBufferEXT) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getDimName(Dim dim) { + switch (dim) { + CASE_SUF(Dim, DIM, 1D) + CASE_SUF(Dim, DIM, 2D) + CASE_SUF(Dim, DIM, 3D) + CASE_SUF(Dim, DIM, Cube) + CASE_SUF(Dim, DIM, Rect) + CASE_SUF(Dim, DIM, Buffer) + CASE_SUF(Dim, DIM, SubpassData) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getSamplerAddressingModeName(SamplerAddressingMode e) { + switch (e) { + CASE(SamplerAddressingMode, None) + CASE(SamplerAddressingMode, ClampToEdge) + CASE(SamplerAddressingMode, Clamp) + CASE(SamplerAddressingMode, Repeat) + CASE(SamplerAddressingMode, RepeatMirrored) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getSamplerFilterModeName(SamplerFilterMode e) { + switch (e) { + CASE(SamplerFilterMode, Nearest) + CASE(SamplerFilterMode, Linear) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getImageFormatName(ImageFormat e) { + switch (e) { + CASE(ImageFormat, Unknown) + CASE(ImageFormat, Rgba32f) + 
CASE(ImageFormat, Rgba16f) + CASE(ImageFormat, R32f) + CASE(ImageFormat, Rgba8) + CASE(ImageFormat, Rgba8Snorm) + CASE(ImageFormat, Rg32f) + CASE(ImageFormat, Rg16f) + CASE(ImageFormat, R11fG11fB10f) + CASE(ImageFormat, R16f) + CASE(ImageFormat, Rgba16) + CASE(ImageFormat, Rgb10A2) + CASE(ImageFormat, Rg16) + CASE(ImageFormat, Rg8) + CASE(ImageFormat, R16) + CASE(ImageFormat, R8) + CASE(ImageFormat, Rgba16Snorm) + CASE(ImageFormat, Rg16Snorm) + CASE(ImageFormat, Rg8Snorm) + CASE(ImageFormat, R16Snorm) + CASE(ImageFormat, R8Snorm) + CASE(ImageFormat, Rgba32i) + CASE(ImageFormat, Rgba16i) + CASE(ImageFormat, Rgba8i) + CASE(ImageFormat, R32i) + CASE(ImageFormat, Rg32i) + CASE(ImageFormat, Rg16i) + CASE(ImageFormat, Rg8i) + CASE(ImageFormat, R16i) + CASE(ImageFormat, R8i) + CASE(ImageFormat, Rgba32ui) + CASE(ImageFormat, Rgba16ui) + CASE(ImageFormat, Rgba8ui) + CASE(ImageFormat, R32ui) + CASE(ImageFormat, Rgb10a2ui) + CASE(ImageFormat, Rg32ui) + CASE(ImageFormat, Rg16ui) + CASE(ImageFormat, Rg8ui) + CASE(ImageFormat, R16ui) + CASE(ImageFormat, R8ui) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getImageChannelOrderName(ImageChannelOrder e) { + switch (e) { + CASE(ImageChannelOrder, R) + CASE(ImageChannelOrder, A) + CASE(ImageChannelOrder, RG) + CASE(ImageChannelOrder, RA) + CASE(ImageChannelOrder, RGB) + CASE(ImageChannelOrder, RGBA) + CASE(ImageChannelOrder, BGRA) + CASE(ImageChannelOrder, ARGB) + CASE(ImageChannelOrder, Intensity) + CASE(ImageChannelOrder, Luminance) + CASE(ImageChannelOrder, Rx) + CASE(ImageChannelOrder, RGx) + CASE(ImageChannelOrder, RGBx) + CASE(ImageChannelOrder, Depth) + CASE(ImageChannelOrder, DepthStencil) + CASE(ImageChannelOrder, sRGB) + CASE(ImageChannelOrder, sRGBx) + CASE(ImageChannelOrder, sRGBA) + CASE(ImageChannelOrder, sBGRA) + CASE(ImageChannelOrder, ABGR) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getImageChannelDataTypeName(ImageChannelDataType e) { + switch (e) { + CASE(ImageChannelDataType, SnormInt8) + CASE(ImageChannelDataType, SnormInt16) + CASE(ImageChannelDataType, UnormInt8) + CASE(ImageChannelDataType, UnormInt16) + CASE(ImageChannelDataType, UnormShort565) + CASE(ImageChannelDataType, UnormShort555) + CASE(ImageChannelDataType, UnormInt101010) + CASE(ImageChannelDataType, SignedInt8) + CASE(ImageChannelDataType, SignedInt16) + CASE(ImageChannelDataType, SignedInt32) + CASE(ImageChannelDataType, UnsignedInt8) + CASE(ImageChannelDataType, UnsignedInt16) + CASE(ImageChannelDataType, UnsigendInt32) + CASE(ImageChannelDataType, HalfFloat) + CASE(ImageChannelDataType, Float) + CASE(ImageChannelDataType, UnormInt24) + CASE(ImageChannelDataType, UnormInt101010_2) + break; + } + llvm_unreachable("Unexpected operand"); +} + +std::string getImageOperandName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(ImageOperand::None)) + return "None"; + if (e == static_cast(ImageOperand::Bias)) + return "Bias"; + if (e & static_cast(ImageOperand::Bias)) { + nameString += sep + "Bias"; + sep = "|"; + } + if (e == static_cast(ImageOperand::Lod)) + return "Lod"; + if (e & static_cast(ImageOperand::Lod)) { + nameString += sep + "Lod"; + sep = "|"; + } + if (e == static_cast(ImageOperand::Grad)) + return "Grad"; + if (e & static_cast(ImageOperand::Grad)) { + nameString += sep + "Grad"; + sep = "|"; + } + if (e == static_cast(ImageOperand::ConstOffset)) + return "ConstOffset"; + if (e & static_cast(ImageOperand::ConstOffset)) { + nameString += sep + "ConstOffset"; + sep = "|"; 
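The flag-name builders such as `getImageOperandName` above all follow one shape: if the value equals a single flag exactly, return that flag's bare name; otherwise walk every flag and accumulate a `|`-separated list. (Note that this rendering has dropped the casts' template arguments: each `static_cast(...)` in these helpers reads `static_cast<uint32_t>(...)` in the source.) A minimal standalone sketch of the pattern, using a hypothetical two-flag enum rather than the full `ImageOperand` set:

```cpp
#include <cstdint>
#include <string>

enum class DemoFlag : uint32_t { None = 0x0, Bias = 0x1, Lod = 0x2 };

// Mirrors the getImageOperandName/getLoopControlName pattern: an exact
// single-flag match returns early; otherwise each set bit appends its
// name. The separator starts empty, so the result has no leading '|'.
std::string getDemoFlagName(uint32_t e) {
  std::string nameString;
  std::string sep;
  if (e == static_cast<uint32_t>(DemoFlag::None))
    return "None";
  if (e == static_cast<uint32_t>(DemoFlag::Bias))
    return "Bias";
  if (e & static_cast<uint32_t>(DemoFlag::Bias)) {
    nameString += sep + "Bias";
    sep = "|";
  }
  if (e == static_cast<uint32_t>(DemoFlag::Lod))
    return "Lod";
  if (e & static_cast<uint32_t>(DemoFlag::Lod)) {
    nameString += sep + "Lod";
    sep = "|";
  }
  return nameString; // e.g. 0x3 -> "Bias|Lod"
}
```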
+ } + if (e == static_cast(ImageOperand::Offset)) + return "Offset"; + if (e & static_cast(ImageOperand::Offset)) { + nameString += sep + "Offset"; + sep = "|"; + } + if (e == static_cast(ImageOperand::ConstOffsets)) + return "ConstOffsets"; + if (e & static_cast(ImageOperand::ConstOffsets)) { + nameString += sep + "ConstOffsets"; + sep = "|"; + } + if (e == static_cast(ImageOperand::Sample)) + return "Sample"; + if (e & static_cast(ImageOperand::Sample)) { + nameString += sep + "Sample"; + sep = "|"; + } + if (e == static_cast(ImageOperand::MinLod)) + return "MinLod"; + if (e & static_cast(ImageOperand::MinLod)) { + nameString += sep + "MinLod"; + sep = "|"; + } + if (e == static_cast(ImageOperand::MakeTexelAvailableKHR)) + return "MakeTexelAvailableKHR"; + if (e & static_cast(ImageOperand::MakeTexelAvailableKHR)) { + nameString += sep + "MakeTexelAvailableKHR"; + sep = "|"; + } + if (e == static_cast(ImageOperand::MakeTexelVisibleKHR)) + return "MakeTexelVisibleKHR"; + if (e & static_cast(ImageOperand::MakeTexelVisibleKHR)) { + nameString += sep + "MakeTexelVisibleKHR"; + sep = "|"; + } + if (e == static_cast(ImageOperand::NonPrivateTexelKHR)) + return "NonPrivateTexelKHR"; + if (e & static_cast(ImageOperand::NonPrivateTexelKHR)) { + nameString += sep + "NonPrivateTexelKHR"; + sep = "|"; + } + if (e == static_cast(ImageOperand::VolatileTexelKHR)) + return "VolatileTexelKHR"; + if (e & static_cast(ImageOperand::VolatileTexelKHR)) { + nameString += sep + "VolatileTexelKHR"; + sep = "|"; + } + if (e == static_cast(ImageOperand::SignExtend)) + return "SignExtend"; + if (e & static_cast(ImageOperand::SignExtend)) { + nameString += sep + "SignExtend"; + sep = "|"; + } + if (e == static_cast(ImageOperand::ZeroExtend)) + return "ZeroExtend"; + if (e & static_cast(ImageOperand::ZeroExtend)) { + nameString += sep + "ZeroExtend"; + sep = "|"; + }; + return nameString; +} + +std::string getFPFastMathModeName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(FPFastMathMode::None)) + return "None"; + if (e == static_cast(FPFastMathMode::NotNaN)) + return "NotNaN"; + if (e & static_cast(FPFastMathMode::NotNaN)) { + nameString += sep + "NotNaN"; + sep = "|"; + } + if (e == static_cast(FPFastMathMode::NotInf)) + return "NotInf"; + if (e & static_cast(FPFastMathMode::NotInf)) { + nameString += sep + "NotInf"; + sep = "|"; + } + if (e == static_cast(FPFastMathMode::NSZ)) + return "NSZ"; + if (e & static_cast(FPFastMathMode::NSZ)) { + nameString += sep + "NSZ"; + sep = "|"; + } + if (e == static_cast(FPFastMathMode::AllowRecip)) + return "AllowRecip"; + if (e & static_cast(FPFastMathMode::AllowRecip)) { + nameString += sep + "AllowRecip"; + sep = "|"; + } + if (e == static_cast(FPFastMathMode::Fast)) + return "Fast"; + if (e & static_cast(FPFastMathMode::Fast)) { + nameString += sep + "Fast"; + sep = "|"; + }; + return nameString; +} + +StringRef getFPRoundingModeName(FPRoundingMode e) { + switch (e) { + CASE(FPRoundingMode, RTE) + CASE(FPRoundingMode, RTZ) + CASE(FPRoundingMode, RTP) + CASE(FPRoundingMode, RTN) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getLinkageTypeName(LinkageType e) { + switch (e) { + CASE(LinkageType, Export) + CASE(LinkageType, Import) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getAccessQualifierName(AccessQualifier e) { + switch (e) { + CASE(AccessQualifier, ReadOnly) + CASE(AccessQualifier, WriteOnly) + CASE(AccessQualifier, ReadWrite) + break; + } + llvm_unreachable("Unexpected 
operand"); +} + +StringRef getFunctionParameterAttributeName(FunctionParameterAttribute e) { + switch (e) { + CASE(FunctionParameterAttribute, Zext) + CASE(FunctionParameterAttribute, Sext) + CASE(FunctionParameterAttribute, ByVal) + CASE(FunctionParameterAttribute, Sret) + CASE(FunctionParameterAttribute, NoAlias) + CASE(FunctionParameterAttribute, NoCapture) + CASE(FunctionParameterAttribute, NoWrite) + CASE(FunctionParameterAttribute, NoReadWrite) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getDecorationName(Decoration e) { + switch (e) { + CASE(Decoration, RelaxedPrecision) + CASE(Decoration, SpecId) + CASE(Decoration, Block) + CASE(Decoration, BufferBlock) + CASE(Decoration, RowMajor) + CASE(Decoration, ColMajor) + CASE(Decoration, ArrayStride) + CASE(Decoration, MatrixStride) + CASE(Decoration, GLSLShared) + CASE(Decoration, GLSLPacked) + CASE(Decoration, CPacked) + CASE(Decoration, BuiltIn) + CASE(Decoration, NoPerspective) + CASE(Decoration, Flat) + CASE(Decoration, Patch) + CASE(Decoration, Centroid) + CASE(Decoration, Sample) + CASE(Decoration, Invariant) + CASE(Decoration, Restrict) + CASE(Decoration, Aliased) + CASE(Decoration, Volatile) + CASE(Decoration, Constant) + CASE(Decoration, Coherent) + CASE(Decoration, NonWritable) + CASE(Decoration, NonReadable) + CASE(Decoration, Uniform) + CASE(Decoration, UniformId) + CASE(Decoration, SaturatedConversion) + CASE(Decoration, Stream) + CASE(Decoration, Location) + CASE(Decoration, Component) + CASE(Decoration, Index) + CASE(Decoration, Binding) + CASE(Decoration, DescriptorSet) + CASE(Decoration, Offset) + CASE(Decoration, XfbBuffer) + CASE(Decoration, XfbStride) + CASE(Decoration, FuncParamAttr) + CASE(Decoration, FPRoundingMode) + CASE(Decoration, FPFastMathMode) + CASE(Decoration, LinkageAttributes) + CASE(Decoration, NoContraction) + CASE(Decoration, InputAttachmentIndex) + CASE(Decoration, Alignment) + CASE(Decoration, MaxByteOffset) + CASE(Decoration, AlignmentId) + CASE(Decoration, MaxByteOffsetId) + CASE(Decoration, NoSignedWrap) + CASE(Decoration, NoUnsignedWrap) + CASE(Decoration, ExplicitInterpAMD) + CASE(Decoration, OverrideCoverageNV) + CASE(Decoration, PassthroughNV) + CASE(Decoration, ViewportRelativeNV) + CASE(Decoration, SecondaryViewportRelativeNV) + CASE(Decoration, PerPrimitiveNV) + CASE(Decoration, PerViewNV) + CASE(Decoration, PerVertexNV) + CASE(Decoration, NonUniformEXT) + CASE(Decoration, CountBuffer) + CASE(Decoration, UserSemantic) + CASE(Decoration, RestrictPointerEXT) + CASE(Decoration, AliasedPointerEXT) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getBuiltInName(BuiltIn e) { + switch (e) { + CASE(BuiltIn, Position) + CASE(BuiltIn, PointSize) + CASE(BuiltIn, ClipDistance) + CASE(BuiltIn, CullDistance) + CASE(BuiltIn, VertexId) + CASE(BuiltIn, InstanceId) + CASE(BuiltIn, PrimitiveId) + CASE(BuiltIn, InvocationId) + CASE(BuiltIn, Layer) + CASE(BuiltIn, ViewportIndex) + CASE(BuiltIn, TessLevelOuter) + CASE(BuiltIn, TessLevelInner) + CASE(BuiltIn, TessCoord) + CASE(BuiltIn, PatchVertices) + CASE(BuiltIn, FragCoord) + CASE(BuiltIn, PointCoord) + CASE(BuiltIn, FrontFacing) + CASE(BuiltIn, SampleId) + CASE(BuiltIn, SamplePosition) + CASE(BuiltIn, SampleMask) + CASE(BuiltIn, FragDepth) + CASE(BuiltIn, HelperInvocation) + CASE(BuiltIn, NumWorkgroups) + CASE(BuiltIn, WorkgroupSize) + CASE(BuiltIn, WorkgroupId) + CASE(BuiltIn, LocalInvocationId) + CASE(BuiltIn, GlobalInvocationId) + CASE(BuiltIn, LocalInvocationIndex) + CASE(BuiltIn, WorkDim) + CASE(BuiltIn, 
GlobalSize) + CASE(BuiltIn, EnqueuedWorkgroupSize) + CASE(BuiltIn, GlobalOffset) + CASE(BuiltIn, GlobalLinearId) + CASE(BuiltIn, SubgroupSize) + CASE(BuiltIn, SubgroupMaxSize) + CASE(BuiltIn, NumSubgroups) + CASE(BuiltIn, NumEnqueuedSubgroups) + CASE(BuiltIn, SubgroupId) + CASE(BuiltIn, SubgroupLocalInvocationId) + CASE(BuiltIn, VertexIndex) + CASE(BuiltIn, InstanceIndex) + CASE(BuiltIn, SubgroupEqMask) + CASE(BuiltIn, SubgroupGeMask) + CASE(BuiltIn, SubgroupGtMask) + CASE(BuiltIn, SubgroupLeMask) + CASE(BuiltIn, SubgroupLtMask) + CASE(BuiltIn, BaseVertex) + CASE(BuiltIn, BaseInstance) + CASE(BuiltIn, DrawIndex) + CASE(BuiltIn, DeviceIndex) + CASE(BuiltIn, ViewIndex) + CASE(BuiltIn, BaryCoordNoPerspAMD) + CASE(BuiltIn, BaryCoordNoPerspCentroidAMD) + CASE(BuiltIn, BaryCoordNoPerspSampleAMD) + CASE(BuiltIn, BaryCoordSmoothAMD) + CASE(BuiltIn, BaryCoordSmoothCentroid) + CASE(BuiltIn, BaryCoordSmoothSample) + CASE(BuiltIn, BaryCoordPullModel) + CASE(BuiltIn, FragStencilRefEXT) + CASE(BuiltIn, ViewportMaskNV) + CASE(BuiltIn, SecondaryPositionNV) + CASE(BuiltIn, SecondaryViewportMaskNV) + CASE(BuiltIn, PositionPerViewNV) + CASE(BuiltIn, ViewportMaskPerViewNV) + CASE(BuiltIn, FullyCoveredEXT) + CASE(BuiltIn, TaskCountNV) + CASE(BuiltIn, PrimitiveCountNV) + CASE(BuiltIn, PrimitiveIndicesNV) + CASE(BuiltIn, ClipDistancePerViewNV) + CASE(BuiltIn, CullDistancePerViewNV) + CASE(BuiltIn, LayerPerViewNV) + CASE(BuiltIn, MeshViewCountNV) + CASE(BuiltIn, MeshViewIndices) + CASE(BuiltIn, BaryCoordNV) + CASE(BuiltIn, BaryCoordNoPerspNV) + CASE(BuiltIn, FragSizeEXT) + CASE(BuiltIn, FragInvocationCountEXT) + CASE(BuiltIn, LaunchIdNV) + CASE(BuiltIn, LaunchSizeNV) + CASE(BuiltIn, WorldRayOriginNV) + CASE(BuiltIn, WorldRayDirectionNV) + CASE(BuiltIn, ObjectRayOriginNV) + CASE(BuiltIn, ObjectRayDirectionNV) + CASE(BuiltIn, RayTminNV) + CASE(BuiltIn, RayTmaxNV) + CASE(BuiltIn, InstanceCustomIndexNV) + CASE(BuiltIn, ObjectToWorldNV) + CASE(BuiltIn, WorldToObjectNV) + CASE(BuiltIn, HitTNV) + CASE(BuiltIn, HitKindNV) + CASE(BuiltIn, IncomingRayFlagsNV) + break; + } + llvm_unreachable("Unexpected operand"); +} + +std::string getSelectionControlName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(SelectionControl::None)) + return "None"; + if (e == static_cast(SelectionControl::Flatten)) + return "Flatten"; + if (e & static_cast(SelectionControl::Flatten)) { + nameString += sep + "Flatten"; + sep = "|"; + } + if (e == static_cast(SelectionControl::DontFlatten)) + return "DontFlatten"; + if (e & static_cast(SelectionControl::DontFlatten)) { + nameString += sep + "DontFlatten"; + sep = "|"; + }; + return nameString; +} + +std::string getLoopControlName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(LoopControl::None)) + return "None"; + if (e == static_cast(LoopControl::Unroll)) + return "Unroll"; + if (e & static_cast(LoopControl::Unroll)) { + nameString += sep + "Unroll"; + sep = "|"; + } + if (e == static_cast(LoopControl::DontUnroll)) + return "DontUnroll"; + if (e & static_cast(LoopControl::DontUnroll)) { + nameString += sep + "DontUnroll"; + sep = "|"; + } + if (e == static_cast(LoopControl::DependencyInfinite)) + return "DependencyInfinite"; + if (e & static_cast(LoopControl::DependencyInfinite)) { + nameString += sep + "DependencyInfinite"; + sep = "|"; + } + if (e == static_cast(LoopControl::DependencyLength)) + return "DependencyLength"; + if (e & static_cast(LoopControl::DependencyLength)) { + nameString += sep + 
"DependencyLength"; + sep = "|"; + } + if (e == static_cast(LoopControl::MinIterations)) + return "MinIterations"; + if (e & static_cast(LoopControl::MinIterations)) { + nameString += sep + "MinIterations"; + sep = "|"; + } + if (e == static_cast(LoopControl::MaxIterations)) + return "MaxIterations"; + if (e & static_cast(LoopControl::MaxIterations)) { + nameString += sep + "MaxIterations"; + sep = "|"; + } + if (e == static_cast(LoopControl::IterationMultiple)) + return "IterationMultiple"; + if (e & static_cast(LoopControl::IterationMultiple)) { + nameString += sep + "IterationMultiple"; + sep = "|"; + } + if (e == static_cast(LoopControl::PeelCount)) + return "PeelCount"; + if (e & static_cast(LoopControl::PeelCount)) { + nameString += sep + "PeelCount"; + sep = "|"; + } + if (e == static_cast(LoopControl::PartialCount)) + return "PartialCount"; + if (e & static_cast(LoopControl::PartialCount)) { + nameString += sep + "PartialCount"; + sep = "|"; + }; + return nameString; +} + +std::string getFunctionControlName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(FunctionControl::None)) + return "None"; + if (e == static_cast(FunctionControl::Inline)) + return "Inline"; + if (e & static_cast(FunctionControl::Inline)) { + nameString += sep + "Inline"; + sep = "|"; + } + if (e == static_cast(FunctionControl::DontInline)) + return "DontInline"; + if (e & static_cast(FunctionControl::DontInline)) { + nameString += sep + "DontInline"; + sep = "|"; + } + if (e == static_cast(FunctionControl::Pure)) + return "Pure"; + if (e & static_cast(FunctionControl::Pure)) { + nameString += sep + "Pure"; + sep = "|"; + } + if (e == static_cast(FunctionControl::Const)) + return "Const"; + if (e & static_cast(FunctionControl::Const)) { + nameString += sep + "Const"; + sep = "|"; + }; + return nameString; +} + +std::string getMemorySemanticsName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(MemorySemantics::None)) + return "None"; + if (e == static_cast(MemorySemantics::Acquire)) + return "Acquire"; + if (e & static_cast(MemorySemantics::Acquire)) { + nameString += sep + "Acquire"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::Release)) + return "Release"; + if (e & static_cast(MemorySemantics::Release)) { + nameString += sep + "Release"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::AcquireRelease)) + return "AcquireRelease"; + if (e & static_cast(MemorySemantics::AcquireRelease)) { + nameString += sep + "AcquireRelease"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::SequentiallyConsistent)) + return "SequentiallyConsistent"; + if (e & static_cast(MemorySemantics::SequentiallyConsistent)) { + nameString += sep + "SequentiallyConsistent"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::UniformMemory)) + return "UniformMemory"; + if (e & static_cast(MemorySemantics::UniformMemory)) { + nameString += sep + "UniformMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::SubgroupMemory)) + return "SubgroupMemory"; + if (e & static_cast(MemorySemantics::SubgroupMemory)) { + nameString += sep + "SubgroupMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::WorkgroupMemory)) + return "WorkgroupMemory"; + if (e & static_cast(MemorySemantics::WorkgroupMemory)) { + nameString += sep + "WorkgroupMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::CrossWorkgroupMemory)) + return "CrossWorkgroupMemory"; + if (e & 
static_cast(MemorySemantics::CrossWorkgroupMemory)) { + nameString += sep + "CrossWorkgroupMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::AtomicCounterMemory)) + return "AtomicCounterMemory"; + if (e & static_cast(MemorySemantics::AtomicCounterMemory)) { + nameString += sep + "AtomicCounterMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::ImageMemory)) + return "ImageMemory"; + if (e & static_cast(MemorySemantics::ImageMemory)) { + nameString += sep + "ImageMemory"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::OutputMemoryKHR)) + return "OutputMemoryKHR"; + if (e & static_cast(MemorySemantics::OutputMemoryKHR)) { + nameString += sep + "OutputMemoryKHR"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::MakeAvailableKHR)) + return "MakeAvailableKHR"; + if (e & static_cast(MemorySemantics::MakeAvailableKHR)) { + nameString += sep + "MakeAvailableKHR"; + sep = "|"; + } + if (e == static_cast(MemorySemantics::MakeVisibleKHR)) + return "MakeVisibleKHR"; + if (e & static_cast(MemorySemantics::MakeVisibleKHR)) { + nameString += sep + "MakeVisibleKHR"; + sep = "|"; + }; + return nameString; +} + +std::string getMemoryOperandName(uint32_t e) { + std::string nameString = ""; + std::string sep = ""; + if (e == static_cast(MemoryOperand::None)) + return "None"; + if (e == static_cast(MemoryOperand::Volatile)) + return "Volatile"; + if (e & static_cast(MemoryOperand::Volatile)) { + nameString += sep + "Volatile"; + sep = "|"; + } + if (e == static_cast(MemoryOperand::Aligned)) + return "Aligned"; + if (e & static_cast(MemoryOperand::Aligned)) { + nameString += sep + "Aligned"; + sep = "|"; + } + if (e == static_cast(MemoryOperand::Nontemporal)) + return "Nontemporal"; + if (e & static_cast(MemoryOperand::Nontemporal)) { + nameString += sep + "Nontemporal"; + sep = "|"; + } + if (e == static_cast(MemoryOperand::MakePointerAvailableKHR)) + return "MakePointerAvailableKHR"; + if (e & static_cast(MemoryOperand::MakePointerAvailableKHR)) { + nameString += sep + "MakePointerAvailableKHR"; + sep = "|"; + } + if (e == static_cast(MemoryOperand::MakePointerVisibleKHR)) + return "MakePointerVisibleKHR"; + if (e & static_cast(MemoryOperand::MakePointerVisibleKHR)) { + nameString += sep + "MakePointerVisibleKHR"; + sep = "|"; + } + if (e == static_cast(MemoryOperand::NonPrivatePointerKHR)) + return "NonPrivatePointerKHR"; + if (e & static_cast(MemoryOperand::NonPrivatePointerKHR)) { + nameString += sep + "NonPrivatePointerKHR"; + sep = "|"; + }; + return nameString; +} + +StringRef getScopeName(Scope e) { + switch (e) { + CASE(Scope, CrossDevice) + CASE(Scope, Device) + CASE(Scope, Workgroup) + CASE(Scope, Subgroup) + CASE(Scope, Invocation) + CASE(Scope, QueueFamilyKHR) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getGroupOperationName(GroupOperation e) { + switch (e) { + CASE(GroupOperation, Reduce) + CASE(GroupOperation, InclusiveScan) + CASE(GroupOperation, ExclusiveScan) + CASE(GroupOperation, ClusteredReduce) + CASE(GroupOperation, PartitionedReduceNV) + CASE(GroupOperation, PartitionedInclusiveScanNV) + CASE(GroupOperation, PartitionedExclusiveScanNV) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getKernelEnqueueFlagsName(KernelEnqueueFlags e) { + switch (e) { + CASE(KernelEnqueueFlags, NoWait) + CASE(KernelEnqueueFlags, WaitKernel) + CASE(KernelEnqueueFlags, WaitWorkGroup) + break; + } + llvm_unreachable("Unexpected operand"); +} + +StringRef getKernelProfilingInfoName(KernelProfilingInfo e) { + switch 
(e) { + CASE(KernelProfilingInfo, None) + CASE(KernelProfilingInfo, CmdExecTime) + break; + } + llvm_unreachable("Unexpected operand"); +} +} // namespace SPIRV +} // namespace llvm diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h new file mode 100644 index 000000000000..2aa9f076c78e --- /dev/null +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -0,0 +1,739 @@ +//===-- SPIRVBaseInfo.h - Top level definitions for SPIRV ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the SPIRV target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVBASEINFO_H +#define LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVBASEINFO_H + +#include "llvm/ADT/StringRef.h" +#include + +namespace llvm { +namespace SPIRV { +enum class Capability : uint32_t { + Matrix = 0, + Shader = 1, + Geometry = 2, + Tessellation = 3, + Addresses = 4, + Linkage = 5, + Kernel = 6, + Vector16 = 7, + Float16Buffer = 8, + Float16 = 9, + Float64 = 10, + Int64 = 11, + Int64Atomics = 12, + ImageBasic = 13, + ImageReadWrite = 14, + ImageMipmap = 15, + Pipes = 17, + Groups = 18, + DeviceEnqueue = 19, + LiteralSampler = 20, + AtomicStorage = 21, + Int16 = 22, + TessellationPointSize = 23, + GeometryPointSize = 24, + ImageGatherExtended = 25, + StorageImageMultisample = 27, + UniformBufferArrayDynamicIndexing = 28, + SampledImageArrayDymnamicIndexing = 29, + ClipDistance = 32, + CullDistance = 33, + ImageCubeArray = 34, + SampleRateShading = 35, + ImageRect = 36, + SampledRect = 37, + GenericPointer = 38, + Int8 = 39, + InputAttachment = 40, + SparseResidency = 41, + MinLod = 42, + Sampled1D = 43, + Image1D = 44, + SampledCubeArray = 45, + SampledBuffer = 46, + ImageBuffer = 47, + ImageMSArray = 48, + StorageImageExtendedFormats = 49, + ImageQuery = 50, + DerivativeControl = 51, + InterpolationFunction = 52, + TransformFeedback = 53, + GeometryStreams = 54, + StorageImageReadWithoutFormat = 55, + StorageImageWriteWithoutFormat = 56, + MultiViewport = 57, + SubgroupDispatch = 58, + NamedBarrier = 59, + PipeStorage = 60, + GroupNonUniform = 61, + GroupNonUniformVote = 62, + GroupNonUniformArithmetic = 63, + GroupNonUniformBallot = 64, + GroupNonUniformShuffle = 65, + GroupNonUniformShuffleRelative = 66, + GroupNonUniformClustered = 67, + GroupNonUniformQuad = 68, + SubgroupBallotKHR = 4423, + DrawParameters = 4427, + SubgroupVoteKHR = 4431, + StorageBuffer16BitAccess = 4433, + StorageUniform16 = 4434, + StoragePushConstant16 = 4435, + StorageInputOutput16 = 4436, + DeviceGroup = 4437, + MultiView = 4439, + VariablePointersStorageBuffer = 4441, + VariablePointers = 4442, + AtomicStorageOps = 4445, + SampleMaskPostDepthCoverage = 4447, + StorageBuffer8BitAccess = 4448, + UniformAndStorageBuffer8BitAccess = 4449, + StoragePushConstant8 = 4450, + DenormPreserve = 4464, + DenormFlushToZero = 4465, + SignedZeroInfNanPreserve = 4466, + RoundingModeRTE = 4467, + RoundingModeRTZ = 4468, + 
Float16ImageAMD = 5008, + ImageGatherBiasLodAMD = 5009, + FragmentMaskAMD = 5010, + StencilExportEXT = 5013, + ImageReadWriteLodAMD = 5015, + SampleMaskOverrideCoverageNV = 5249, + GeometryShaderPassthroughNV = 5251, + ShaderViewportIndexLayerEXT = 5254, + ShaderViewportMaskNV = 5255, + ShaderStereoViewNV = 5259, + PerViewAttributesNV = 5260, + FragmentFullyCoveredEXT = 5265, + MeshShadingNV = 5266, + ShaderNonUniformEXT = 5301, + RuntimeDescriptorArrayEXT = 5302, + InputAttachmentArrayDynamicIndexingEXT = 5303, + UniformTexelBufferArrayDynamicIndexingEXT = 5304, + StorageTexelBufferArrayDynamicIndexingEXT = 5305, + UniformBufferArrayNonUniformIndexingEXT = 5306, + SampledImageArrayNonUniformIndexingEXT = 5307, + StorageBufferArrayNonUniformIndexingEXT = 5308, + StorageImageArrayNonUniformIndexingEXT = 5309, + InputAttachmentArrayNonUniformIndexingEXT = 5310, + UniformTexelBufferArrayNonUniformIndexingEXT = 5311, + StorageTexelBufferArrayNonUniformIndexingEXT = 5312, + RayTracingNV = 5340, + SubgroupShuffleINTEL = 5568, + SubgroupBufferBlockIOINTEL = 5569, + SubgroupImageBlockIOINTEL = 5570, + SubgroupImageMediaBlockIOINTEL = 5579, + SubgroupAvcMotionEstimationINTEL = 5696, + SubgroupAvcMotionEstimationIntraINTEL = 5697, + SubgroupAvcMotionEstimationChromaINTEL = 5698, + GroupNonUniformPartitionedNV = 5297, + VulkanMemoryModelKHR = 5345, + VulkanMemoryModelDeviceScopeKHR = 5346, + ImageFootprintNV = 5282, + FragmentBarycentricNV = 5284, + ComputeDerivativeGroupQuadsNV = 5288, + ComputeDerivativeGroupLinearNV = 5350, + FragmentDensityEXT = 5291, + PhysicalStorageBufferAddressesEXT = 5347, + CooperativeMatrixNV = 5357, +}; +StringRef getCapabilityName(Capability e); + +enum class SourceLanguage : uint32_t { + Unknown = 0, + ESSL = 1, + GLSL = 2, + OpenCL_C = 3, + OpenCL_CPP = 4, + HLSL = 5, +}; +StringRef getSourceLanguageName(SourceLanguage e); + +enum class AddressingModel : uint32_t { + Logical = 0, + Physical32 = 1, + Physical64 = 2, + PhysicalStorageBuffer64EXT = 5348, +}; +StringRef getAddressingModelName(AddressingModel e); + +enum class ExecutionModel : uint32_t { + Vertex = 0, + TessellationControl = 1, + TessellationEvaluation = 2, + Geometry = 3, + Fragment = 4, + GLCompute = 5, + Kernel = 6, + TaskNV = 5267, + MeshNV = 5268, + RayGenerationNV = 5313, + IntersectionNV = 5314, + AnyHitNV = 5315, + ClosestHitNV = 5316, + MissNV = 5317, + CallableNV = 5318, +}; +StringRef getExecutionModelName(ExecutionModel e); + +enum class MemoryModel : uint32_t { + Simple = 0, + GLSL450 = 1, + OpenCL = 2, + VulkanKHR = 3, +}; +StringRef getMemoryModelName(MemoryModel e); + +enum class ExecutionMode : uint32_t { + Invocations = 0, + SpacingEqual = 1, + SpacingFractionalEven = 2, + SpacingFractionalOdd = 3, + VertexOrderCw = 4, + VertexOrderCcw = 5, + PixelCenterInteger = 6, + OriginUpperLeft = 7, + OriginLowerLeft = 8, + EarlyFragmentTests = 9, + PointMode = 10, + Xfb = 11, + DepthReplacing = 12, + DepthGreater = 14, + DepthLess = 15, + DepthUnchanged = 16, + LocalSize = 17, + LocalSizeHint = 18, + InputPoints = 19, + InputLines = 20, + InputLinesAdjacency = 21, + Triangles = 22, + InputTrianglesAdjacency = 23, + Quads = 24, + Isolines = 25, + OutputVertices = 26, + OutputPoints = 27, + OutputLineStrip = 28, + OutputTriangleStrip = 29, + VecTypeHint = 30, + ContractionOff = 31, + Initializer = 33, + Finalizer = 34, + SubgroupSize = 35, + SubgroupsPerWorkgroup = 36, + SubgroupsPerWorkgroupId = 37, + LocalSizeId = 38, + LocalSizeHintId = 39, + PostDepthCoverage = 4446, + DenormPreserve = 4459, + 
DenormFlushToZero = 4460, + SignedZeroInfNanPreserve = 4461, + RoundingModeRTE = 4462, + RoundingModeRTZ = 4463, + StencilRefReplacingEXT = 5027, + OutputLinesNV = 5269, + DerivativeGroupQuadsNV = 5289, + DerivativeGroupLinearNV = 5290, + OutputTrianglesNV = 5298, +}; +StringRef getExecutionModeName(ExecutionMode e); + +enum class StorageClass : uint32_t { + UniformConstant = 0, + Input = 1, + Uniform = 2, + Output = 3, + Workgroup = 4, + CrossWorkgroup = 5, + Private = 6, + Function = 7, + Generic = 8, + PushConstant = 9, + AtomicCounter = 10, + Image = 11, + StorageBuffer = 12, + CallableDataNV = 5328, + IncomingCallableDataNV = 5329, + RayPayloadNV = 5338, + HitAttributeNV = 5339, + IncomingRayPayloadNV = 5342, + ShaderRecordBufferNV = 5343, + PhysicalStorageBufferEXT = 5349, +}; +StringRef getStorageClassName(StorageClass e); + +enum class Dim : uint32_t { + DIM_1D = 0, + DIM_2D = 1, + DIM_3D = 2, + DIM_Cube = 3, + DIM_Rect = 4, + DIM_Buffer = 5, + DIM_SubpassData = 6, +}; +StringRef getDimName(Dim e); + +enum class SamplerAddressingMode : uint32_t { + None = 0, + ClampToEdge = 1, + Clamp = 2, + Repeat = 3, + RepeatMirrored = 4, +}; +StringRef getSamplerAddressingModeName(SamplerAddressingMode e); + +enum class SamplerFilterMode : uint32_t { + Nearest = 0, + Linear = 1, +}; +StringRef getSamplerFilterModeName(SamplerFilterMode e); + +enum class ImageFormat : uint32_t { + Unknown = 0, + Rgba32f = 1, + Rgba16f = 2, + R32f = 3, + Rgba8 = 4, + Rgba8Snorm = 5, + Rg32f = 6, + Rg16f = 7, + R11fG11fB10f = 8, + R16f = 9, + Rgba16 = 10, + Rgb10A2 = 11, + Rg16 = 12, + Rg8 = 13, + R16 = 14, + R8 = 15, + Rgba16Snorm = 16, + Rg16Snorm = 17, + Rg8Snorm = 18, + R16Snorm = 19, + R8Snorm = 20, + Rgba32i = 21, + Rgba16i = 22, + Rgba8i = 23, + R32i = 24, + Rg32i = 25, + Rg16i = 26, + Rg8i = 27, + R16i = 28, + R8i = 29, + Rgba32ui = 30, + Rgba16ui = 31, + Rgba8ui = 32, + R32ui = 33, + Rgb10a2ui = 34, + Rg32ui = 35, + Rg16ui = 36, + Rg8ui = 37, + R16ui = 38, + R8ui = 39, +}; +StringRef getImageFormatName(ImageFormat e); + +enum class ImageChannelOrder : uint32_t { + R = 0, + A = 1, + RG = 2, + RA = 3, + RGB = 4, + RGBA = 5, + BGRA = 6, + ARGB = 7, + Intensity = 8, + Luminance = 9, + Rx = 10, + RGx = 11, + RGBx = 12, + Depth = 13, + DepthStencil = 14, + sRGB = 15, + sRGBx = 16, + sRGBA = 17, + sBGRA = 18, + ABGR = 19, +}; +StringRef getImageChannelOrderName(ImageChannelOrder e); + +enum class ImageChannelDataType : uint32_t { + SnormInt8 = 0, + SnormInt16 = 1, + UnormInt8 = 2, + UnormInt16 = 3, + UnormShort565 = 4, + UnormShort555 = 5, + UnormInt101010 = 6, + SignedInt8 = 7, + SignedInt16 = 8, + SignedInt32 = 9, + UnsignedInt8 = 10, + UnsignedInt16 = 11, + UnsigendInt32 = 12, + HalfFloat = 13, + Float = 14, + UnormInt24 = 15, + UnormInt101010_2 = 16, +}; +StringRef getImageChannelDataTypeName(ImageChannelDataType e); + +enum class ImageOperand : uint32_t { + None = 0x0, + Bias = 0x1, + Lod = 0x2, + Grad = 0x4, + ConstOffset = 0x8, + Offset = 0x10, + ConstOffsets = 0x20, + Sample = 0x40, + MinLod = 0x80, + MakeTexelAvailableKHR = 0x100, + MakeTexelVisibleKHR = 0x200, + NonPrivateTexelKHR = 0x400, + VolatileTexelKHR = 0x800, + SignExtend = 0x1000, + ZeroExtend = 0x2000, +}; +std::string getImageOperandName(uint32_t e); + +enum class FPFastMathMode : uint32_t { + None = 0x0, + NotNaN = 0x1, + NotInf = 0x2, + NSZ = 0x4, + AllowRecip = 0x8, + Fast = 0x10, +}; +std::string getFPFastMathModeName(uint32_t e); + +enum class FPRoundingMode : uint32_t { + RTE = 0, + RTZ = 1, + RTP = 2, + RTN = 3, +}; +StringRef 
getFPRoundingModeName(FPRoundingMode e); + +enum class LinkageType : uint32_t { + Export = 0, + Import = 1, +}; +StringRef getLinkageTypeName(LinkageType e); + +enum class AccessQualifier : uint32_t { + ReadOnly = 0, + WriteOnly = 1, + ReadWrite = 2, +}; +StringRef getAccessQualifierName(AccessQualifier e); + +enum class FunctionParameterAttribute : uint32_t { + Zext = 0, + Sext = 1, + ByVal = 2, + Sret = 3, + NoAlias = 4, + NoCapture = 5, + NoWrite = 6, + NoReadWrite = 7, +}; +StringRef getFunctionParameterAttributeName(FunctionParameterAttribute e); + +enum class Decoration : uint32_t { + RelaxedPrecision = 0, + SpecId = 1, + Block = 2, + BufferBlock = 3, + RowMajor = 4, + ColMajor = 5, + ArrayStride = 6, + MatrixStride = 7, + GLSLShared = 8, + GLSLPacked = 9, + CPacked = 10, + BuiltIn = 11, + NoPerspective = 13, + Flat = 14, + Patch = 15, + Centroid = 16, + Sample = 17, + Invariant = 18, + Restrict = 19, + Aliased = 20, + Volatile = 21, + Constant = 22, + Coherent = 23, + NonWritable = 24, + NonReadable = 25, + Uniform = 26, + UniformId = 27, + SaturatedConversion = 28, + Stream = 29, + Location = 30, + Component = 31, + Index = 32, + Binding = 33, + DescriptorSet = 34, + Offset = 35, + XfbBuffer = 36, + XfbStride = 37, + FuncParamAttr = 38, + FPRoundingMode = 39, + FPFastMathMode = 40, + LinkageAttributes = 41, + NoContraction = 42, + InputAttachmentIndex = 43, + Alignment = 44, + MaxByteOffset = 45, + AlignmentId = 46, + MaxByteOffsetId = 47, + NoSignedWrap = 4469, + NoUnsignedWrap = 4470, + ExplicitInterpAMD = 4999, + OverrideCoverageNV = 5248, + PassthroughNV = 5250, + ViewportRelativeNV = 5252, + SecondaryViewportRelativeNV = 5256, + PerPrimitiveNV = 5271, + PerViewNV = 5272, + PerVertexNV = 5273, + NonUniformEXT = 5300, + CountBuffer = 5634, + UserSemantic = 5635, + RestrictPointerEXT = 5355, + AliasedPointerEXT = 5356, +}; +StringRef getDecorationName(Decoration e); + +enum class BuiltIn : uint32_t { + Position = 0, + PointSize = 1, + ClipDistance = 3, + CullDistance = 4, + VertexId = 5, + InstanceId = 6, + PrimitiveId = 7, + InvocationId = 8, + Layer = 9, + ViewportIndex = 10, + TessLevelOuter = 11, + TessLevelInner = 12, + TessCoord = 13, + PatchVertices = 14, + FragCoord = 15, + PointCoord = 16, + FrontFacing = 17, + SampleId = 18, + SamplePosition = 19, + SampleMask = 20, + FragDepth = 22, + HelperInvocation = 23, + NumWorkgroups = 24, + WorkgroupSize = 25, + WorkgroupId = 26, + LocalInvocationId = 27, + GlobalInvocationId = 28, + LocalInvocationIndex = 29, + WorkDim = 30, + GlobalSize = 31, + EnqueuedWorkgroupSize = 32, + GlobalOffset = 33, + GlobalLinearId = 34, + SubgroupSize = 36, + SubgroupMaxSize = 37, + NumSubgroups = 38, + NumEnqueuedSubgroups = 39, + SubgroupId = 40, + SubgroupLocalInvocationId = 41, + VertexIndex = 42, + InstanceIndex = 43, + SubgroupEqMask = 4416, + SubgroupGeMask = 4417, + SubgroupGtMask = 4418, + SubgroupLeMask = 4419, + SubgroupLtMask = 4420, + BaseVertex = 4424, + BaseInstance = 4425, + DrawIndex = 4426, + DeviceIndex = 4438, + ViewIndex = 4440, + BaryCoordNoPerspAMD = 4492, + BaryCoordNoPerspCentroidAMD = 4493, + BaryCoordNoPerspSampleAMD = 4494, + BaryCoordSmoothAMD = 4495, + BaryCoordSmoothCentroid = 4496, + BaryCoordSmoothSample = 4497, + BaryCoordPullModel = 4498, + FragStencilRefEXT = 5014, + ViewportMaskNV = 5253, + SecondaryPositionNV = 5257, + SecondaryViewportMaskNV = 5258, + PositionPerViewNV = 5261, + ViewportMaskPerViewNV = 5262, + FullyCoveredEXT = 5264, + TaskCountNV = 5274, + PrimitiveCountNV = 5275, + PrimitiveIndicesNV = 5276, 
+ ClipDistancePerViewNV = 5277, + CullDistancePerViewNV = 5278, + LayerPerViewNV = 5279, + MeshViewCountNV = 5280, + MeshViewIndices = 5281, + BaryCoordNV = 5286, + BaryCoordNoPerspNV = 5287, + FragSizeEXT = 5292, + FragInvocationCountEXT = 5293, + LaunchIdNV = 5319, + LaunchSizeNV = 5320, + WorldRayOriginNV = 5321, + WorldRayDirectionNV = 5322, + ObjectRayOriginNV = 5323, + ObjectRayDirectionNV = 5324, + RayTminNV = 5325, + RayTmaxNV = 5326, + InstanceCustomIndexNV = 5327, + ObjectToWorldNV = 5330, + WorldToObjectNV = 5331, + HitTNV = 5332, + HitKindNV = 5333, + IncomingRayFlagsNV = 5351, +}; +StringRef getBuiltInName(BuiltIn e); + +enum class SelectionControl : uint32_t { + None = 0x0, + Flatten = 0x1, + DontFlatten = 0x2, +}; +std::string getSelectionControlName(uint32_t e); + +enum class LoopControl : uint32_t { + None = 0x0, + Unroll = 0x1, + DontUnroll = 0x2, + DependencyInfinite = 0x4, + DependencyLength = 0x8, + MinIterations = 0x10, + MaxIterations = 0x20, + IterationMultiple = 0x40, + PeelCount = 0x80, + PartialCount = 0x100, +}; +std::string getLoopControlName(uint32_t e); + +enum class FunctionControl : uint32_t { + None = 0x0, + Inline = 0x1, + DontInline = 0x2, + Pure = 0x4, + Const = 0x8, +}; +std::string getFunctionControlName(uint32_t e); + +enum class MemorySemantics : uint32_t { + None = 0x0, + Acquire = 0x2, + Release = 0x4, + AcquireRelease = 0x8, + SequentiallyConsistent = 0x10, + UniformMemory = 0x40, + SubgroupMemory = 0x80, + WorkgroupMemory = 0x100, + CrossWorkgroupMemory = 0x200, + AtomicCounterMemory = 0x400, + ImageMemory = 0x800, + OutputMemoryKHR = 0x1000, + MakeAvailableKHR = 0x2000, + MakeVisibleKHR = 0x4000, +}; +std::string getMemorySemanticsName(uint32_t e); + +enum class MemoryOperand : uint32_t { + None = 0x0, + Volatile = 0x1, + Aligned = 0x2, + Nontemporal = 0x4, + MakePointerAvailableKHR = 0x8, + MakePointerVisibleKHR = 0x10, + NonPrivatePointerKHR = 0x20, +}; +std::string getMemoryOperandName(uint32_t e); + +enum class Scope : uint32_t { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4, + QueueFamilyKHR = 5, +}; +StringRef getScopeName(Scope e); + +enum class GroupOperation : uint32_t { + Reduce = 0, + InclusiveScan = 1, + ExclusiveScan = 2, + ClusteredReduce = 3, + PartitionedReduceNV = 6, + PartitionedInclusiveScanNV = 7, + PartitionedExclusiveScanNV = 8, +}; +StringRef getGroupOperationName(GroupOperation e); + +enum class KernelEnqueueFlags : uint32_t { + NoWait = 0, + WaitKernel = 1, + WaitWorkGroup = 2, +}; +StringRef getKernelEnqueueFlagsName(KernelEnqueueFlags e); + +enum class KernelProfilingInfo : uint32_t { + None = 0x0, + CmdExecTime = 0x1, +}; +StringRef getKernelProfilingInfoName(KernelProfilingInfo e); +} // namespace SPIRV +} // namespace llvm + +// Return a string representation of the operands from startIndex onwards. +// Templated to allow both MachineInstr and MCInst to use the same logic. +template <typename InstType> +std::string getSPIRVStringOperand(const InstType &MI, unsigned StartIndex) { + std::string s; // Iteratively append to this string. + + const unsigned NumOps = MI.getNumOperands(); + bool IsFinished = false; + for (unsigned i = StartIndex; i < NumOps && !IsFinished; ++i) { + const auto &Op = MI.getOperand(i); + if (!Op.isImm()) // Stop if we hit a register operand. + break; + assert((Op.getImm() >> 32) == 0 && "Imm operand should be i32 word"); + const uint32_t Imm = Op.getImm(); // Each i32 word is up to 4 characters.
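`getSPIRVStringOperand` relies on SPIR-V's convention for string literals: UTF-8 bytes are packed four per 32-bit word, lowest byte first, with a terminating NUL byte (so a string whose length is a multiple of four spills into an extra all-zero word). A self-contained round-trip sketch; the encoder half is purely illustrative, only the decode loop corresponds to the function above:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Pack a string the way SPIR-V stores literals: 4 chars per i32 word,
// low byte first, always NUL-terminated ("abcd" therefore takes 2 words).
std::vector<uint32_t> encodeSPIRVString(const std::string &S) {
  std::vector<uint32_t> Words;
  uint32_t Word = 0;
  unsigned Shift = 0;
  for (char C : S) {
    Word |= static_cast<uint32_t>(static_cast<unsigned char>(C)) << Shift;
    Shift += 8;
    if (Shift == 32) {
      Words.push_back(Word);
      Word = 0;
      Shift = 0;
    }
  }
  Words.push_back(Word); // Flush the final (possibly all-zero) word.
  return Words;
}

// Mirror of the decode loop in getSPIRVStringOperand.
std::string decodeSPIRVString(const std::vector<uint32_t> &Words) {
  std::string S;
  for (uint32_t Imm : Words)
    for (unsigned Shift = 0; Shift < 32; Shift += 8) {
      char C = (Imm >> Shift) & 0xff;
      if (C == 0)
        return S; // The NUL terminator ends the literal.
      S += C;
    }
  return S;
}

int main() {
  assert(decodeSPIRVString(encodeSPIRVString("OpenCL.std")) == "OpenCL.std");
}
```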
+ for (unsigned ShiftAmount = 0; ShiftAmount < 32; ShiftAmount += 8) { + char c = (Imm >> ShiftAmount) & 0xff; + if (c == 0) { // Stop if we hit a null-terminator character. + IsFinished = true; + break; + } else { + s += c; // Otherwise, append the character to the result string. + } + } + } + return s; +} + +#endif // LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVBASEINFO_H diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp new file mode 100644 index 000000000000..3105baa02c90 --- /dev/null +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -0,0 +1,556 @@ +//===-- SPIRVInstPrinter.cpp - Output SPIR-V MCInsts as ASM -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a SPIR-V MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "SPIRVInstPrinter.h" +#include "SPIRV.h" +#include "SPIRVBaseInfo.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#include "SPIRVGenAsmWriter.inc" + +void SPIRVInstPrinter::printRemainingVariableOps(const MCInst *MI, + unsigned StartIndex, + raw_ostream &O, + bool SkipFirstSpace, + bool SkipImmediates) { + const unsigned NumOps = MI->getNumOperands(); + for (unsigned i = StartIndex; i < NumOps; ++i) { + if (!SkipImmediates || !MI->getOperand(i).isImm()) { + if (!SkipFirstSpace || i != StartIndex) + O << ' '; + printOperand(MI, i, O); + } + } +} + +void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI, + unsigned StartIndex, + raw_ostream &O) { + O << ' '; + if (MI->getNumOperands() - StartIndex == 2) { // Handle 64 bit literals. + uint64_t Imm = MI->getOperand(StartIndex).getImm(); + Imm |= (MI->getOperand(StartIndex + 1).getImm() << 32); + O << Imm; + } else { + printRemainingVariableOps(MI, StartIndex, O, true, false); + } +} + +void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) { + llvm_unreachable("Unimplemented recordOpExtInstImport"); +} + +void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, const MCSubtargetInfo &STI, + raw_ostream &OS) { + const unsigned OpCode = MI->getOpcode(); + printInstruction(MI, Address, OS); + + if (OpCode == SPIRV::OpDecorate) { + printOpDecorate(MI, OS); + } else if (OpCode == SPIRV::OpExtInstImport) { + recordOpExtInstImport(MI); + } else if (OpCode == SPIRV::OpExtInst) { + printOpExtInst(MI, OS); + } else { + // Print any extra operands for variadic instructions. 
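`printOpConstantVarOps` above handles the case where a 64-bit constant is carried as two i32 operands, low word first, and reassembles them before printing. A quick check of that arithmetic as a standalone sketch:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // A 64-bit literal is split into two i32 operands, low word first,
  // exactly as printOpConstantVarOps rejoins them.
  uint64_t Value = 0x0123456789abcdefULL;
  uint64_t Lo = Value & 0xffffffffu; // first operand
  uint64_t Hi = Value >> 32;         // second operand
  uint64_t Rejoined = Lo | (Hi << 32);
  assert(Rejoined == Value);
}
```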
+ MCInstrDesc MCDesc = MII.get(OpCode); + if (MCDesc.isVariadic()) { + const unsigned NumFixedOps = MCDesc.getNumOperands(); + const unsigned LastFixedIndex = NumFixedOps - 1; + const int FirstVariableIndex = NumFixedOps; + if (NumFixedOps > 0 && + MCDesc.OpInfo[LastFixedIndex].OperandType == MCOI::OPERAND_UNKNOWN) { + // For instructions where a custom type (not reg or immediate) comes as + // the last operand before the variable_ops. This is usually a StringImm + // operand, but there are a few other cases. + switch (OpCode) { + case SPIRV::OpTypeImage: + OS << ' '; + printAccessQualifier(MI, FirstVariableIndex, OS); + break; + case SPIRV::OpVariable: + OS << ' '; + printOperand(MI, FirstVariableIndex, OS); + break; + case SPIRV::OpEntryPoint: { + // Print the interface ID operands, skipping the name's string + // literal. + printRemainingVariableOps(MI, NumFixedOps, OS, false, true); + break; + } + case SPIRV::OpExecutionMode: + case SPIRV::OpExecutionModeId: + case SPIRV::OpLoopMerge: { + // Print any literals after the OPERAND_UNKNOWN argument normally. + printRemainingVariableOps(MI, NumFixedOps, OS); + break; + } + default: + break; // printStringImm has already been handled + } + } else { + // For instructions with no fixed ops or a reg/immediate as the final + // fixed operand, we can usually print the rest with "printOperand", but + // check for a few cases with custom types first. + switch (OpCode) { + case SPIRV::OpLoad: + case SPIRV::OpStore: + OS << ' '; + printMemoryOperand(MI, FirstVariableIndex, OS); + printRemainingVariableOps(MI, FirstVariableIndex + 1, OS); + break; + case SPIRV::OpImageSampleImplicitLod: + case SPIRV::OpImageSampleDrefImplicitLod: + case SPIRV::OpImageSampleProjImplicitLod: + case SPIRV::OpImageSampleProjDrefImplicitLod: + case SPIRV::OpImageFetch: + case SPIRV::OpImageGather: + case SPIRV::OpImageDrefGather: + case SPIRV::OpImageRead: + case SPIRV::OpImageWrite: + case SPIRV::OpImageSparseSampleImplicitLod: + case SPIRV::OpImageSparseSampleDrefImplicitLod: + case SPIRV::OpImageSparseSampleProjImplicitLod: + case SPIRV::OpImageSparseSampleProjDrefImplicitLod: + case SPIRV::OpImageSparseFetch: + case SPIRV::OpImageSparseGather: + case SPIRV::OpImageSparseDrefGather: + case SPIRV::OpImageSparseRead: + case SPIRV::OpImageSampleFootprintNV: + OS << ' '; + printImageOperand(MI, FirstVariableIndex, OS); + printRemainingVariableOps(MI, NumFixedOps + 1, OS); + break; + case SPIRV::OpCopyMemory: + case SPIRV::OpCopyMemorySized: { + const unsigned NumOps = MI->getNumOperands(); + for (unsigned i = NumFixedOps; i < NumOps; ++i) { + OS << ' '; + printMemoryOperand(MI, i, OS); + if (MI->getOperand(i).getImm() & + static_cast(SPIRV::MemoryOperand::Aligned)) { + assert(i + 1 < NumOps && "Missing alignment operand"); + OS << ' '; + printOperand(MI, i + 1, OS); + i += 1; + } + } + break; + } + case SPIRV::OpConstantI: + case SPIRV::OpConstantF: + printOpConstantVarOps(MI, NumFixedOps, OS); + break; + default: + printRemainingVariableOps(MI, NumFixedOps, OS); + break; + } + } + } + } + + printAnnotation(OS, Annot); +} + +void SPIRVInstPrinter::printOpExtInst(const MCInst *MI, raw_ostream &O) { + llvm_unreachable("Unimplemented printOpExtInst"); +} + +void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) { + // The fixed operands have already been printed, so just need to decide what + // type of decoration operands to print based on the Decoration type. 
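The `OpCopyMemory`/`OpCopyMemorySized` case above shows the one place where variadic operands are not independent: when a memory-operand mask has the `Aligned` bit set, the immediately following immediate is that operand's alignment literal and must be consumed with it. A simplified sketch of the same walk over a plain array of immediates (function and variable names here are illustrative only):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

enum class MemoryOperand : uint32_t { None = 0x0, Volatile = 0x1, Aligned = 0x2 };

// When a mask has the Aligned bit set, the next immediate is its
// alignment value and belongs to the same memory operand. (The real
// printer asserts that the literal is actually present.)
void printMemoryOperands(const std::vector<uint32_t> &Masks) {
  for (size_t I = 0; I < Masks.size(); ++I) {
    std::cout << " mask=0x" << std::hex << Masks[I] << std::dec;
    if (Masks[I] & static_cast<uint32_t>(MemoryOperand::Aligned)) {
      std::cout << " align=" << Masks[I + 1];
      ++I; // Skip the alignment literal we just printed.
    }
  }
  std::cout << '\n';
}

int main() {
  // Prints " mask=0x2 align=16 mask=0x1".
  printMemoryOperands({0x2 /*Aligned*/, 16, 0x1 /*Volatile*/});
}
```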
+ MCInstrDesc MCDesc = MII.get(MI->getOpcode()); + unsigned NumFixedOps = MCDesc.getNumOperands(); + + if (NumFixedOps != MI->getNumOperands()) { + auto DecOp = MI->getOperand(NumFixedOps - 1); + auto Dec = static_cast<SPIRV::Decoration>(DecOp.getImm()); + + O << ' '; + + switch (Dec) { + case SPIRV::Decoration::BuiltIn: + printBuiltIn(MI, NumFixedOps, O); + break; + case SPIRV::Decoration::UniformId: + printScope(MI, NumFixedOps, O); + break; + case SPIRV::Decoration::FuncParamAttr: + printFunctionParameterAttribute(MI, NumFixedOps, O); + break; + case SPIRV::Decoration::FPRoundingMode: + printFPRoundingMode(MI, NumFixedOps, O); + break; + case SPIRV::Decoration::FPFastMathMode: + printFPFastMathMode(MI, NumFixedOps, O); + break; + case SPIRV::Decoration::LinkageAttributes: + case SPIRV::Decoration::UserSemantic: + printStringImm(MI, NumFixedOps, O); + break; + default: + printRemainingVariableOps(MI, NumFixedOps, O, true); + break; + } + } +} + +static void printExpr(const MCExpr *Expr, raw_ostream &O) { +#ifndef NDEBUG + const MCSymbolRefExpr *SRE; + + if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) + SRE = cast<MCSymbolRefExpr>(BE->getLHS()); + else + SRE = cast<MCSymbolRefExpr>(Expr); + + MCSymbolRefExpr::VariantKind Kind = SRE->getKind(); + + assert(Kind == MCSymbolRefExpr::VK_None); +#endif + O << *Expr; +} + +void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + if (OpNo < MI->getNumOperands()) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) + O << '%' << (Register::virtReg2Index(Op.getReg()) + 1); + else if (Op.isImm()) + O << formatImm((int64_t)Op.getImm()); + else if (Op.isDFPImm()) + O << formatImm((double)Op.getDFPImm()); + else if (Op.isExpr()) + printExpr(Op.getExpr(), O); + else + llvm_unreachable("Unexpected operand type"); + } +} + +void SPIRVInstPrinter::printStringImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const unsigned NumOps = MI->getNumOperands(); + unsigned StrStartIndex = OpNo; + while (StrStartIndex < NumOps) { + if (MI->getOperand(StrStartIndex).isReg()) + break; + + std::string Str = getSPIRVStringOperand(*MI, OpNo); + if (StrStartIndex != OpNo) + O << ' '; // Add a space if we're starting a new string/argument. + O << '"'; + for (char c : Str) { + if (c == '"') + O.write('\\'); // Escape " characters (might break for complex UTF-8). + O.write(c); + } + O << '"'; + + unsigned numOpsInString = (Str.size() / 4) + 1; + StrStartIndex += numOpsInString; + + // Check for final Op of "OpDecorate %x %stringImm %linkageAttribute".
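The advance `StrStartIndex += (Str.size() / 4) + 1` above encodes how many operands a string literal occupies: the NUL terminator always needs a byte, so N characters take N/4 + 1 words. A tiny check of that count, as a hedged standalone sketch:

```cpp
#include <cassert>
#include <string>

// Number of i32 operands a SPIR-V string literal occupies, matching
// the numOpsInString computation in printStringImm.
unsigned numStringWords(const std::string &Str) {
  return static_cast<unsigned>(Str.size() / 4) + 1;
}

int main() {
  assert(numStringWords("") == 1);     // just the terminator word
  assert(numStringWords("abc") == 1);  // "abc\0" fits in one word
  assert(numStringWords("abcd") == 2); // the terminator spills into word 2
}
```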
+ if (MI->getOpcode() == SPIRV::OpDecorate && + MI->getOperand(1).getImm() == + static_cast(SPIRV::Decoration::LinkageAttributes)) { + O << ' '; + printLinkageType(MI, StrStartIndex, O); + break; + } + } +} + +void SPIRVInstPrinter::printExtInst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + llvm_unreachable("Unimplemented printExtInst"); +} + +void SPIRVInstPrinter::printCapability(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::Capability e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getCapabilityName(e); + } +} + +void SPIRVInstPrinter::printSourceLanguage(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::SourceLanguage e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getSourceLanguageName(e); + } +} + +void SPIRVInstPrinter::printExecutionModel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::ExecutionModel e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getExecutionModelName(e); + } +} + +void SPIRVInstPrinter::printAddressingModel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::AddressingModel e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getAddressingModelName(e); + } +} + +void SPIRVInstPrinter::printMemoryModel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::MemoryModel e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getMemoryModelName(e); + } +} + +void SPIRVInstPrinter::printExecutionMode(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::ExecutionMode e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getExecutionModeName(e); + } +} + +void SPIRVInstPrinter::printStorageClass(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::StorageClass e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getStorageClassName(e); + } +} + +void SPIRVInstPrinter::printDim(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::Dim e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getDimName(e); + } +} + +void SPIRVInstPrinter::printSamplerAddressingMode(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::SamplerAddressingMode e = static_cast( + MI->getOperand(OpNo).getImm()); + O << SPIRV::getSamplerAddressingModeName(e); + } +} + +void SPIRVInstPrinter::printSamplerFilterMode(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::SamplerFilterMode e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getSamplerFilterModeName(e); + } +} + +void SPIRVInstPrinter::printImageFormat(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::ImageFormat e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getImageFormatName(e); + } +} + +void SPIRVInstPrinter::printImageChannelOrder(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::ImageChannelOrder e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getImageChannelOrderName(e); + } +} + +void SPIRVInstPrinter::printImageChannelDataType(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + 
SPIRV::ImageChannelDataType e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getImageChannelDataTypeName(e); + } +} + +void SPIRVInstPrinter::printImageOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getImageOperandName(e); + } +} + +void SPIRVInstPrinter::printFPFastMathMode(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getFPFastMathModeName(e); + } +} + +void SPIRVInstPrinter::printFPRoundingMode(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::FPRoundingMode e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getFPRoundingModeName(e); + } +} + +void SPIRVInstPrinter::printLinkageType(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::LinkageType e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getLinkageTypeName(e); + } +} + +void SPIRVInstPrinter::printAccessQualifier(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::AccessQualifier e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getAccessQualifierName(e); + } +} + +void SPIRVInstPrinter::printFunctionParameterAttribute(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::FunctionParameterAttribute e = + static_cast( + MI->getOperand(OpNo).getImm()); + O << SPIRV::getFunctionParameterAttributeName(e); + } +} + +void SPIRVInstPrinter::printDecoration(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::Decoration e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getDecorationName(e); + } +} + +void SPIRVInstPrinter::printBuiltIn(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::BuiltIn e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getBuiltInName(e); + } +} + +void SPIRVInstPrinter::printSelectionControl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getSelectionControlName(e); + } +} + +void SPIRVInstPrinter::printLoopControl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getLoopControlName(e); + } +} + +void SPIRVInstPrinter::printFunctionControl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getFunctionControlName(e); + } +} + +void SPIRVInstPrinter::printMemorySemantics(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getMemorySemanticsName(e); + } +} + +void SPIRVInstPrinter::printMemoryOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + unsigned e = static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getMemoryOperandName(e); + } +} + +void SPIRVInstPrinter::printScope(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::Scope e = static_cast(MI->getOperand(OpNo).getImm()); 
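Each of these enum-printing helpers has the same three steps: bounds-check `OpNo`, cast the immediate back to the typed enum (the casts' template arguments, e.g. `static_cast<SPIRV::Scope>` or `static_cast<unsigned>`, were lost in this rendering), and stream the looked-up name. As a hypothetical illustration of the shared shape, not something the file actually does, the repetition could be factored with a template while keeping the hand-written wrappers that tblgen's assembly writer calls by name:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper capturing the common pattern of the print* methods.
template <typename EnumT>
static void printEnumOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                             StringRef (*GetName)(EnumT)) {
  if (OpNo < MI->getNumOperands()) {
    auto E = static_cast<EnumT>(MI->getOperand(OpNo).getImm());
    O << GetName(E);
  }
}

// Usage equivalent to printScope(MI, OpNo, O):
//   printEnumOperand<SPIRV::Scope>(MI, OpNo, O, SPIRV::getScopeName);
```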
+ O << SPIRV::getScopeName(e); + } +} + +void SPIRVInstPrinter::printGroupOperation(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::GroupOperation e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getGroupOperationName(e); + } +} + +void SPIRVInstPrinter::printKernelEnqueueFlags(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::KernelEnqueueFlags e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getKernelEnqueueFlagsName(e); + } +} + +void SPIRVInstPrinter::printKernelProfilingInfo(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo < MI->getNumOperands()) { + SPIRV::KernelProfilingInfo e = + static_cast(MI->getOperand(OpNo).getImm()); + O << SPIRV::getKernelProfilingInfoName(e); + } +} diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h new file mode 100644 index 000000000000..cd3b6f1e6d66 --- /dev/null +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h @@ -0,0 +1,94 @@ +//===-- SPIRVInstPrinter.h - Output SPIR-V MCInsts as ASM -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a SPIR-V MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_INSTPRINTER_SPIRVINSTPRINTER_H +#define LLVM_LIB_TARGET_SPIRV_INSTPRINTER_SPIRVINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { +class SPIRVInstPrinter : public MCInstPrinter { +private: + void recordOpExtInstImport(const MCInst *MI); + +public: + using MCInstPrinter::MCInstPrinter; + + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &OS) override; + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr); + + void printStringImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printOpDecorate(const MCInst *MI, raw_ostream &O); + void printOpExtInst(const MCInst *MI, raw_ostream &O); + void printRemainingVariableOps(const MCInst *MI, unsigned StartIndex, + raw_ostream &O, bool SkipFirstSpace = false, + bool SkipImmediates = false); + void printOpConstantVarOps(const MCInst *MI, unsigned StartIndex, + raw_ostream &O); + + void printExtInst(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // SPIR-V enumerations printing. 
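The class below declares the tblgen-generated pieces (`printInstruction`, `getMnemonic`, `getRegisterName`) alongside the hand-written operand printers. For context, a sketch of the conventional wiring that makes LLVM use such a printer; the actual registration is assumed to live in SPIRVMCTargetDesc.cpp, which is not part of this hunk, and `TheSPIRVTarget` is a placeholder name:

```cpp
#include "SPIRVInstPrinter.h"
#include "llvm/MC/TargetRegistry.h"

using namespace llvm;

static MCInstPrinter *createSPIRVMCInstPrinter(const Triple &T,
                                               unsigned SyntaxVariant,
                                               const MCAsmInfo &MAI,
                                               const MCInstrInfo &MII,
                                               const MCRegisterInfo &MRI) {
  // SPIRVInstPrinter inherits MCInstPrinter's (MAI, MII, MRI) constructor.
  return new SPIRVInstPrinter(MAI, MII, MRI);
}

// Called from the target's LLVMInitializeSPIRVTargetMC():
//   TargetRegistry::RegisterMCInstPrinter(TheSPIRVTarget,
//                                         createSPIRVMCInstPrinter);
```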
+  void printCapability(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSourceLanguage(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printExecutionModel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddressingModel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemoryModel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printExecutionMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printStorageClass(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDim(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printSamplerAddressingMode(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O);
+  void printSamplerFilterMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printImageFormat(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printImageChannelOrder(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printImageChannelDataType(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O);
+  void printImageOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printFPFastMathMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printFPRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printLinkageType(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAccessQualifier(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printFunctionParameterAttribute(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O);
+
+  void printDecoration(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBuiltIn(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printSelectionControl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLoopControl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printFunctionControl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printMemorySemantics(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemoryOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printScope(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printGroupOperation(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printKernelEnqueueFlags(const MCInst *MI, unsigned OpNo,
+                               raw_ostream &O);
+  void printKernelProfilingInfo(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O);
+  // Autogenerated by tblgen.
+  std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+  void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_INSTPRINTER_SPIRVINSTPRINTER_H
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.cpp
new file mode 100644
index 000000000000..2f3462f419e5
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.cpp
@@ -0,0 +1,34 @@
+//===-- SPIRVMCAsmInfo.cpp - SPIR-V asm properties --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SPIRVMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+SPIRVMCAsmInfo::SPIRVMCAsmInfo(const Triple &TT,
+                               const MCTargetOptions &Options) {
+  IsLittleEndian = true;
+
+  HasSingleParameterDotFile = false;
+  HasDotTypeDotSizeDirective = false;
+
+  MinInstAlignment = 4;
+
+  CodePointerSize = 4;
+  CommentString = ";";
+  HasFunctionAlignment = false;
+}
+
+bool SPIRVMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
+  return true;
+}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.h
new file mode 100644
index 000000000000..08e579e1c32c
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- SPIRVMCAsmInfo.h - SPIR-V asm properties ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SPIRVMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCASMINFO_H
+#define LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+
+class Triple;
+
+class SPIRVMCAsmInfo : public MCAsmInfo {
+public:
+  explicit SPIRVMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
+  bool shouldOmitSectionDirective(StringRef SectionName) const override;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCASMINFO_H
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp
new file mode 100644
index 000000000000..d953bc590473
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp
@@ -0,0 +1,132 @@
+//===-- SPIRVMCCodeEmitter.cpp - Emit SPIR-V machine code -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPIRVMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spirv-mccodeemitter"
+
+namespace {
+
+class SPIRVMCCodeEmitter : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+
+public:
+  SPIRVMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+  SPIRVMCCodeEmitter(const SPIRVMCCodeEmitter &) = delete;
+  void operator=(const SPIRVMCCodeEmitter &) = delete;
+  ~SPIRVMCCodeEmitter() override = default;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+private:
+  FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+  void
+  verifyInstructionPredicates(const MCInst &MI,
+                              const FeatureBitset &AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSPIRVMCCodeEmitter(const MCInstrInfo &MCII,
+                                              MCContext &Ctx) {
+  return new SPIRVMCCodeEmitter(MCII);
+}
+
+using EndianWriter = support::endian::Writer;
+
+// Check if the instruction has a type argument for operand 1, and defines an
+// ID output register in operand 0. If so, we need to swap operands 0 and 1 so
+// the type comes first in the output, despite coming second in the MCInst.
+static bool hasType(const MCInst &MI, const MCInstrInfo &MII) {
+  MCInstrDesc MCDesc = MII.get(MI.getOpcode());
+  // If we define an output, and have at least one other argument.
+  if (MCDesc.getNumDefs() == 1 && MCDesc.getNumOperands() >= 2) {
+    // Check if we define an ID, and take a type as operand 1.
+    auto DefOpInfo = MCDesc.opInfo_begin();
+    auto FirstArgOpInfo = MCDesc.opInfo_begin() + 1;
+    return (DefOpInfo->RegClass == SPIRV::IDRegClassID ||
+            DefOpInfo->RegClass == SPIRV::ANYIDRegClassID) &&
+           FirstArgOpInfo->RegClass == SPIRV::TYPERegClassID;
+  }
+  return false;
+}
+
+static void emitOperand(const MCOperand &Op, EndianWriter &OSE) {
+  if (Op.isReg()) {
+    // Emit the id index starting at 1 (0 is an invalid index).
+    OSE.write<uint32_t>(Register::virtReg2Index(Op.getReg()) + 1);
+  } else if (Op.isImm()) {
+    OSE.write<uint32_t>(Op.getImm());
+  } else {
+    llvm_unreachable("Unexpected operand type in VReg");
+  }
+}
+
+// Emit the type in operand 1 before the ID in operand 0 it defines, and all
+// remaining operands in the order they come naturally.
+static void emitTypedInstrOperands(const MCInst &MI, EndianWriter &OSE) {
+  unsigned NumOps = MI.getNumOperands();
+  emitOperand(MI.getOperand(1), OSE);
+  emitOperand(MI.getOperand(0), OSE);
+  for (unsigned i = 2; i < NumOps; ++i)
+    emitOperand(MI.getOperand(i), OSE);
+}
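+
+// Illustrative sketch of the binary layout the functions here produce (the
+// operand swap above plus the word-count/opcode packing in
+// encodeInstruction below). For a hypothetical "%res = OpIAdd %i32ty %a %b",
+// the MCInst operand order is (def %res, %i32ty, %a, %b), but SPIR-V wants
+// the result type id first:
+//   word 0: (5 << 16) | 128   ; 5 words total, OpIAdd has opcode 128
+//   word 1: id of %i32ty
+//   word 2: id of %res
+//   word 3: id of %a
+//   word 4: id of %b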
+
+// Emit operands in the order they come naturally.
+static void emitUntypedInstrOperands(const MCInst &MI, EndianWriter &OSE) {
+  for (const auto &Op : MI)
+    emitOperand(Op, OSE);
+}
+
+void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  auto Features = computeAvailableFeatures(STI.getFeatureBits());
+  verifyInstructionPredicates(MI, Features);
+
+  EndianWriter OSE(OS, support::little);
+
+  // Encode the first 32-bit SPIR-V word with the number of words and the
+  // opcode.
+  const uint64_t OpCode = getBinaryCodeForInstr(MI, Fixups, STI);
+  const uint32_t NumWords = MI.getNumOperands() + 1;
+  const uint32_t FirstWord = (NumWords << 16) | OpCode;
+  OSE.write<uint32_t>(FirstWord);
+
+  // Emit the instruction arguments (emitting the output type first if
+  // present).
+  if (hasType(MI, MCII))
+    emitTypedInstrOperands(MI, OSE);
+  else
+    emitUntypedInstrOperands(MI, OSE);
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "SPIRVGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
new file mode 100644
index 000000000000..6b8b4a73af92
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -0,0 +1,102 @@
+//===-- SPIRVMCTargetDesc.cpp - SPIR-V Target Descriptions ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides SPIR-V specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVMCTargetDesc.h"
+#include "SPIRVInstPrinter.h"
+#include "SPIRVMCAsmInfo.h"
+#include "SPIRVTargetStreamer.h"
+#include "TargetInfo/SPIRVTargetInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "SPIRVGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SPIRVGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SPIRVGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createSPIRVMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSPIRVMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSPIRVMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  return X;
+}
+
+static MCSubtargetInfo *
+createSPIRVMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createSPIRVMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+}
+
+static MCStreamer *
+createSPIRVMCStreamer(const Triple &T, MCContext &Ctx,
+                      std::unique_ptr<MCAsmBackend> &&MAB,
+                      std::unique_ptr<MCObjectWriter> &&OW,
+                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
+  return createSPIRVStreamer(Ctx, std::move(MAB), std::move(OW),
+                             std::move(Emitter), RelaxAll);
+}
+
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &,
+                                                 MCInstPrinter *, bool) {
+  return new SPIRVTargetStreamer(S);
+}
+
+static MCInstPrinter *createSPIRVMCInstPrinter(const Triple &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  assert(SyntaxVariant == 0);
+  return new SPIRVInstPrinter(MAI, MII, MRI);
+}
+
+namespace {
+
+class SPIRVMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  explicit SPIRVMCInstrAnalysis(const MCInstrInfo *Info)
+      : MCInstrAnalysis(Info) {}
+};
+
+} // end anonymous namespace
+
+static MCInstrAnalysis *createSPIRVInstrAnalysis(const MCInstrInfo *Info) {
+  return new SPIRVMCInstrAnalysis(Info);
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetMC() {
+  for (Target *T : {&getTheSPIRV32Target(), &getTheSPIRV64Target()}) {
+    RegisterMCAsmInfo<SPIRVMCAsmInfo> X(*T);
+    TargetRegistry::RegisterMCInstrInfo(*T, createSPIRVMCInstrInfo);
+    TargetRegistry::RegisterMCRegInfo(*T, createSPIRVMCRegisterInfo);
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createSPIRVMCSubtargetInfo);
+    TargetRegistry::RegisterSPIRVStreamer(*T, createSPIRVMCStreamer);
+    TargetRegistry::RegisterMCInstPrinter(*T, createSPIRVMCInstPrinter);
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createSPIRVInstrAnalysis);
+    TargetRegistry::RegisterMCCodeEmitter(*T, createSPIRVMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(*T, createSPIRVAsmBackend);
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer);
+  }
+}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h
new file mode 100644
index 000000000000..4009fa96aa68
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h
@@ -0,0 +1,52 @@
+//===-- SPIRVMCTargetDesc.h - SPIR-V Target Descriptions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides SPIR-V specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <memory>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectTargetWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+
+MCCodeEmitter *createSPIRVMCCodeEmitter(const MCInstrInfo &MCII,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createSPIRVAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+                                    const MCRegisterInfo &MRI,
+                                    const MCTargetOptions &Options);
+
+std::unique_ptr<MCObjectTargetWriter> createSPIRVObjectTargetWriter();
+} // namespace llvm
+
+// Defines symbolic names for SPIR-V registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "SPIRVGenRegisterInfo.inc"
+
+// Defines symbolic names for the SPIR-V instructions.
+#define GET_INSTRINFO_ENUM
+#include "SPIRVGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SPIRVGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVMCTARGETDESC_H
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVObjectTargetWriter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVObjectTargetWriter.cpp
new file mode 100644
index 000000000000..685168b4073d
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVObjectTargetWriter.cpp
@@ -0,0 +1,25 @@
+//===- SPIRVObjectTargetWriter.cpp - SPIR-V Object Target Writer -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVMCTargetDesc.h"
+#include "llvm/MC/MCSPIRVObjectWriter.h"
+
+using namespace llvm;
+
+namespace {
+
+class SPIRVObjectTargetWriter : public MCSPIRVObjectTargetWriter {
+public:
+  SPIRVObjectTargetWriter() = default;
+};
+
+} // namespace
+
+std::unique_ptr<MCObjectTargetWriter> llvm::createSPIRVObjectTargetWriter() {
+  return std::make_unique<SPIRVObjectTargetWriter>();
+}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp
new file mode 100644
index 000000000000..0a318e0e01e5
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp
@@ -0,0 +1,18 @@
+//===-- SPIRVTargetStreamer.cpp - SPIRVTargetStreamer class ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPIRVTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVTargetStreamer.h"
+
+using namespace llvm;
+
+SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+SPIRVTargetStreamer::~SPIRVTargetStreamer() {}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h
new file mode 100644
index 000000000000..2cc8f50aba67
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h
@@ -0,0 +1,28 @@
+//===-- SPIRVTargetStreamer.h - SPIRV Target Streamer -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVTARGETSTREAMER_H
+#define LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class MCSection;
+
+class SPIRVTargetStreamer : public MCTargetStreamer {
+public:
+  SPIRVTargetStreamer(MCStreamer &S);
+  ~SPIRVTargetStreamer() override;
+
+  void changeSection(const MCSection *CurSection, MCSection *Section,
+                     const MCExpr *SubSection, raw_ostream &OS) override {}
+};
+} // namespace llvm
+
+#endif // LIB_TARGET_SPIRV_MCTARGETDESC_SPIRVTARGETSTREAMER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
new file mode 100644
index 000000000000..8da54a5d6e61
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -0,0 +1,34 @@
+//===-- SPIRV.h - Top-level interface for SPIR-V representation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRV_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRV_H
+
+#include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class SPIRVTargetMachine;
+class SPIRVSubtarget;
+class InstructionSelector;
+class RegisterBankInfo;
+
+FunctionPass *createSPIRVPreLegalizerPass();
+FunctionPass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM);
+InstructionSelector *
+createSPIRVInstructionSelector(const SPIRVTargetMachine &TM,
+                               const SPIRVSubtarget &Subtarget,
+                               const RegisterBankInfo &RBI);
+
+void initializeSPIRVModuleAnalysisPass(PassRegistry &);
+void initializeSPIRVPreLegalizerPass(PassRegistry &);
+void initializeSPIRVEmitIntrinsicsPass(PassRegistry &);
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H
diff --git a/llvm/lib/Target/SPIRV/SPIRV.td b/llvm/lib/Target/SPIRV/SPIRV.td
new file mode 100644
index 000000000000..27374acb8882
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRV.td
@@ -0,0 +1,43 @@
+//===-- SPIRV.td - Describe the SPIR-V Target Machine ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "SPIRVRegisterInfo.td"
+include "SPIRVRegisterBanks.td"
+include "SPIRVInstrInfo.td"
+
+def SPIRVInstrInfo : InstrInfo;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+    : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+def SPIRV10 : SubtargetFeature<"spirv1.0", "SPIRVVersion", "10",
+                               "Use SPIR-V version 1.0">;
+def SPIRV11 : SubtargetFeature<"spirv1.1", "SPIRVVersion", "11",
+                               "Use SPIR-V version 1.1">;
+def SPIRV12 : SubtargetFeature<"spirv1.2", "SPIRVVersion", "12",
+                               "Use SPIR-V version 1.2">;
+def SPIRV13 : SubtargetFeature<"spirv1.3", "SPIRVVersion", "13",
+                               "Use SPIR-V version 1.3">;
+def SPIRV14 : SubtargetFeature<"spirv1.4", "SPIRVVersion", "14",
+                               "Use SPIR-V version 1.4">;
+def SPIRV15 : SubtargetFeature<"spirv1.5", "SPIRVVersion", "15",
+                               "Use SPIR-V version 1.5">;
+
+def SPIRVInstPrinter : AsmWriter {
+  string AsmWriterClassName = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def SPIRV : Target {
+  let InstructionSet = SPIRVInstrInfo;
+  let AssemblyWriters = [SPIRVInstPrinter];
+}
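+
+// For illustration only: the version features above are meant to be selected
+// through the usual subtarget-feature mechanism (assumed invocation; exact
+// flags may differ for this in-progress backend):
+//   llc -mtriple=spirv64-unknown-unknown -mattr=+spirv1.3 input.ll
+// which sets SPIRVVersion to 13 in the generated subtarget.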
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
new file mode 100644
index 000000000000..0de232651377
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -0,0 +1,348 @@
+//===-- SPIRVAsmPrinter.cpp - SPIR-V LLVM assembly writer -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the SPIR-V assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SPIRVInstPrinter.h"
+#include "SPIRV.h"
+#include "SPIRVInstrInfo.h"
+#include "SPIRVMCInstLower.h"
+#include "SPIRVModuleAnalysis.h"
+#include "SPIRVSubtarget.h"
+#include "SPIRVTargetMachine.h"
+#include "SPIRVUtils.h"
+#include "TargetInfo/SPIRVTargetInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class SPIRVAsmPrinter : public AsmPrinter {
+public:
+  explicit SPIRVAsmPrinter(TargetMachine &TM,
+                           std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), ST(nullptr), TII(nullptr) {}
+  bool ModuleSectionsEmitted;
+  const SPIRVSubtarget *ST;
+  const SPIRVInstrInfo *TII;
+
+  StringRef getPassName() const override { return "SPIRV Assembly Printer"; }
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       const char *ExtraCode, raw_ostream &O) override;
+
+  void outputMCInst(MCInst &Inst);
+  void outputInstruction(const MachineInstr *MI);
+  void outputModuleSection(SPIRV::ModuleSectionType MSType);
+  void outputEntryPoints();
+  void outputDebugSourceAndStrings(const Module &M);
+  void outputOpMemoryModel();
+  void outputOpFunctionEnd();
+  void outputExtFuncDecls();
+  void outputModuleSections();
+
+  void emitInstruction(const MachineInstr *MI) override;
+  void emitFunctionEntryLabel() override {}
+  void emitFunctionHeader() override;
+  void emitFunctionBodyStart() override {}
+  void emitFunctionBodyEnd() override;
+  void emitBasicBlockStart(const MachineBasicBlock &MBB) override;
+  void emitBasicBlockEnd(const MachineBasicBlock &MBB) override {}
+  void emitGlobalVariable(const GlobalVariable *GV) override {}
+  void emitOpLabel(const MachineBasicBlock &MBB);
+  void emitEndOfAsmFile(Module &M) override;
+  bool doInitialization(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  SPIRV::ModuleAnalysisInfo *MAI;
+};
+} // namespace
+
+void SPIRVAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<SPIRVModuleAnalysis>();
+  AU.addPreserved<SPIRVModuleAnalysis>();
+  AsmPrinter::getAnalysisUsage(AU);
+}
+
+// Even if the module has no functions, we still need to output the module's
+// global info.
+void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
+  if (ModuleSectionsEmitted == false) {
+    outputModuleSections();
+    ModuleSectionsEmitted = true;
+  }
+}
+
+void SPIRVAsmPrinter::emitFunctionHeader() {
+  if (ModuleSectionsEmitted == false) {
+    outputModuleSections();
+    ModuleSectionsEmitted = true;
+  }
+  // Get the subtarget from the current MachineFunction.
+  ST = &MF->getSubtarget<SPIRVSubtarget>();
+  TII = ST->getInstrInfo();
+  const Function &F = MF->getFunction();
+
+  if (isVerbose()) {
+    OutStreamer->getCommentOS()
+        << "-- Begin function "
+        << GlobalValue::dropLLVMManglingEscape(F.getName()) << '\n';
+  }
+
+  auto Section = getObjFileLowering().SectionForGlobal(&F, TM);
+  MF->setSection(Section);
+}
+
+void SPIRVAsmPrinter::outputOpFunctionEnd() {
+  MCInst FunctionEndInst;
+  FunctionEndInst.setOpcode(SPIRV::OpFunctionEnd);
+  outputMCInst(FunctionEndInst);
+}
+
+// Emit OpFunctionEnd at the end of MF and clear BBNumToRegMap.
+void SPIRVAsmPrinter::emitFunctionBodyEnd() {
+  outputOpFunctionEnd();
+  MAI->BBNumToRegMap.clear();
+}
+
+void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) {
+  MCInst LabelInst;
+  LabelInst.setOpcode(SPIRV::OpLabel);
+  LabelInst.addOperand(MCOperand::createReg(MAI->getOrCreateMBBRegister(MBB)));
+  outputMCInst(LabelInst);
+}
+
+void SPIRVAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
+  // If it's the first MBB in MF, it has OpFunction and OpFunctionParameter, so
+  // OpLabel should be output after them.
+  if (MBB.getNumber() == MF->front().getNumber()) {
+    for (const MachineInstr &MI : MBB)
+      if (MI.getOpcode() == SPIRV::OpFunction)
+        return;
+    // TODO: this case should be checked by the verifier.
+    report_fatal_error("OpFunction is expected in the front MBB of MF");
+  }
+  emitOpLabel(MBB);
+}
+
+void SPIRVAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                   raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << SPIRVInstPrinter::getRegisterName(MO.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+
+  case MachineOperand::MO_FPImmediate:
+    O << MO.getFPImm();
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_BlockAddress: {
+    MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+    O << BA->getName();
+    break;
+  }
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  default:
+    llvm_unreachable("unexpected operand type");
+  }
+}
+
+bool SPIRVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Invalid instruction - SPIR-V does not have special modifiers
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+static bool isFuncOrHeaderInstr(const MachineInstr *MI,
+                                const SPIRVInstrInfo *TII) {
+  return TII->isHeaderInstr(*MI) || MI->getOpcode() == SPIRV::OpFunction ||
+         MI->getOpcode() == SPIRV::OpFunctionParameter;
+}
+
+void SPIRVAsmPrinter::outputMCInst(MCInst &Inst) {
+  OutStreamer->emitInstruction(Inst, *OutContext.getSubtargetInfo());
+}
+
+void SPIRVAsmPrinter::outputInstruction(const MachineInstr *MI) {
+  SPIRVMCInstLower MCInstLowering;
+  MCInst TmpInst;
+  MCInstLowering.lower(MI, TmpInst, MAI);
+  outputMCInst(TmpInst);
+}
+
+void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  if (!MAI->getSkipEmission(MI))
+    outputInstruction(MI);
+
+  // Output OpLabel after OpFunction and OpFunctionParameter in the first MBB.
+  const MachineInstr *NextMI = MI->getNextNode();
+  if (!MAI->hasMBBRegister(*MI->getParent()) && isFuncOrHeaderInstr(MI, TII) &&
+      (!NextMI || !isFuncOrHeaderInstr(NextMI, TII))) {
+    assert(MI->getParent()->getNumber() == MF->front().getNumber() &&
+           "OpFunction is not in the front MBB of MF");
+    emitOpLabel(*MI->getParent());
+  }
+}
+
+void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) {
+  for (MachineInstr *MI : MAI->getMSInstrs(MSType))
+    outputInstruction(MI);
+}
+
+void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) {
+  // Output OpSource.
+  MCInst Inst;
+  Inst.setOpcode(SPIRV::OpSource);
+  Inst.addOperand(MCOperand::createImm(static_cast<uint32_t>(MAI->SrcLang)));
+  Inst.addOperand(
+      MCOperand::createImm(static_cast<uint32_t>(MAI->SrcLangVersion)));
+  outputMCInst(Inst);
+}
+
+void SPIRVAsmPrinter::outputOpMemoryModel() {
+  MCInst Inst;
+  Inst.setOpcode(SPIRV::OpMemoryModel);
+  Inst.addOperand(MCOperand::createImm(static_cast<uint32_t>(MAI->Addr)));
+  Inst.addOperand(MCOperand::createImm(static_cast<uint32_t>(MAI->Mem)));
+  outputMCInst(Inst);
+}
+
+// Before emitting OpEntryPoint instructions, we need to compute each entry
+// point's interface: a list of IDs of global OpVariable instructions. These
+// declare the set of global variables from a module that form the interface
+// of this entry point.
+void SPIRVAsmPrinter::outputEntryPoints() {
+  // Find all OpVariable IDs with the required StorageClass.
+  DenseSet<Register> InterfaceIDs;
+  for (MachineInstr *MI : MAI->GlobalVarList) {
+    assert(MI->getOpcode() == SPIRV::OpVariable);
+    auto SC = static_cast<SPIRV::StorageClass>(MI->getOperand(2).getImm());
+    // Before version 1.4, the interface's storage classes are limited to
+    // the Input and Output storage classes. Starting with version 1.4,
+    // the interface's storage classes are all storage classes used in
+    // declaring all global variables referenced by the entry point call tree.
+    if (ST->getSPIRVVersion() >= 14 || SC == SPIRV::StorageClass::Input ||
+        SC == SPIRV::StorageClass::Output) {
+      MachineFunction *MF = MI->getMF();
+      Register Reg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg());
+      InterfaceIDs.insert(Reg);
+    }
+  }
+
+  // Output OpEntryPoints adding interface args to all of them.
+  for (MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) {
+    SPIRVMCInstLower MCInstLowering;
+    MCInst TmpInst;
+    MCInstLowering.lower(MI, TmpInst, MAI);
+    for (Register Reg : InterfaceIDs) {
+      assert(Reg.isValid());
+      TmpInst.addOperand(MCOperand::createReg(Reg));
+    }
+    outputMCInst(TmpInst);
+  }
+}
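+
+// For illustration, with two interface globals %in and %out a kernel entry
+// point would be emitted roughly as:
+//   OpEntryPoint Kernel %foo "foo" %in %out
+// (the exact operand spelling comes from SPIRVInstPrinter; this is shown
+// only to make the interface-appending loop above concrete).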
+
+void SPIRVAsmPrinter::outputExtFuncDecls() {
+  // Insert OpFunctionEnd after each declaration.
+  SmallVectorImpl<MachineInstr *>::iterator
+      I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(),
+      E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end();
+  for (; I != E; ++I) {
+    outputInstruction(*I);
+    if ((I + 1) == E || (*(I + 1))->getOpcode() == SPIRV::OpFunction)
+      outputOpFunctionEnd();
+  }
+}
+
+void SPIRVAsmPrinter::outputModuleSections() {
+  const Module *M = MMI->getModule();
+  // Get the global subtarget to output module-level info.
+  ST = static_cast<const SPIRVTargetMachine &>(TM).getSubtargetImpl();
+  TII = ST->getInstrInfo();
+  MAI = &SPIRVModuleAnalysis::MAI;
+  assert(ST && TII && MAI && M && "Module analysis is required");
+  // Output instructions according to the Logical Layout of a Module:
+  // TODO: 1,2. All OpCapability instructions, then optional OpExtension
+  // instructions.
+  // TODO: 3. Optional OpExtInstImport instructions.
+  // 4. The single required OpMemoryModel instruction.
+  outputOpMemoryModel();
+  // 5. All entry point declarations, using OpEntryPoint.
+  outputEntryPoints();
+  // 6. Execution-mode declarations, using OpExecutionMode or
+  // OpExecutionModeId.
+  // TODO:
+  // 7a. Debug: all OpString, OpSourceExtension, OpSource, and
+  // OpSourceContinued, without forward references.
+  outputDebugSourceAndStrings(*M);
+  // 7b. Debug: all OpName and all OpMemberName.
+  outputModuleSection(SPIRV::MB_DebugNames);
+  // 7c. Debug: all OpModuleProcessed instructions.
+  outputModuleSection(SPIRV::MB_DebugModuleProcessed);
+  // 8. All annotation instructions (all decorations).
+  outputModuleSection(SPIRV::MB_Annotations);
+  // 9. All type declarations (OpTypeXXX instructions), all constant
+  // instructions, and all global variable declarations. This section is
+  // the first section to allow use of: OpLine and OpNoLine debug information;
+  // non-semantic instructions with OpExtInst.
+  outputModuleSection(SPIRV::MB_TypeConstVars);
+  // 10. All function declarations (functions without a body).
+  outputExtFuncDecls();
+  // 11. All function definitions (functions with a body).
+  // This is done in regular function output.
+}
+
+bool SPIRVAsmPrinter::doInitialization(Module &M) {
+  ModuleSectionsEmitted = false;
+  // We need to call the parent's one explicitly.
+  return AsmPrinter::doInitialization(M);
+}
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVAsmPrinter() {
+  RegisterAsmPrinter<SPIRVAsmPrinter> X(getTheSPIRV32Target());
+  RegisterAsmPrinter<SPIRVAsmPrinter> Y(getTheSPIRV64Target());
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
new file mode 100644
index 000000000000..df07a126eeea
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -0,0 +1,223 @@
+//===--- SPIRVCallLowering.cpp - Call lowering ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of LLVM calls to machine code calls for
+// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVCallLowering.h"
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRV.h"
+#include "SPIRVGlobalRegistry.h"
+#include "SPIRVISelLowering.h"
+#include "SPIRVRegisterInfo.h"
+#include "SPIRVSubtarget.h"
+#include "SPIRVUtils.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+
+using namespace llvm;
+
+SPIRVCallLowering::SPIRVCallLowering(const SPIRVTargetLowering &TLI,
+                                     const SPIRVSubtarget &ST,
+                                     SPIRVGlobalRegistry *GR)
+    : CallLowering(&TLI), ST(ST), GR(GR) {}
+
+bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                    const Value *Val, ArrayRef<Register> VRegs,
+                                    FunctionLoweringInfo &FLI,
+                                    Register SwiftErrorVReg) const {
+  // Currently all return types should use a single register.
+  // TODO: handle the case of multiple registers.
+  if (VRegs.size() > 1)
+    return false;
+  if (Val)
+    return MIRBuilder.buildInstr(SPIRV::OpReturnValue)
+        .addUse(VRegs[0])
+        .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
+                          *ST.getRegBankInfo());
+  MIRBuilder.buildInstr(SPIRV::OpReturn);
+  return true;
+}
+
+// Based on the LLVM function attributes, get a SPIR-V FunctionControl.
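+// The mapping below follows the SPIR-V FunctionControl mask (Inline = 0x1,
+// DontInline = 0x2, Pure = 0x4, Const = 0x8). For example, a function marked
+// alwaysinline readonly in LLVM IR is emitted with a function-control word
+// of 0x1 | 0x8 = 0x9.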
+static uint32_t getFunctionControl(const Function &F) {
+  uint32_t FuncControl = static_cast<uint32_t>(SPIRV::FunctionControl::None);
+  if (F.hasFnAttribute(Attribute::AttrKind::AlwaysInline)) {
+    FuncControl |= static_cast<uint32_t>(SPIRV::FunctionControl::Inline);
+  }
+  if (F.hasFnAttribute(Attribute::AttrKind::ReadNone)) {
+    FuncControl |= static_cast<uint32_t>(SPIRV::FunctionControl::Pure);
+  }
+  if (F.hasFnAttribute(Attribute::AttrKind::ReadOnly)) {
+    FuncControl |= static_cast<uint32_t>(SPIRV::FunctionControl::Const);
+  }
+  if (F.hasFnAttribute(Attribute::AttrKind::NoInline)) {
+    FuncControl |= static_cast<uint32_t>(SPIRV::FunctionControl::DontInline);
+  }
+  return FuncControl;
+}
+
+bool SPIRVCallLowering::lowerFormalArguments(
+    MachineIRBuilder &MIRBuilder, const Function &F,
+    ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const {
+  assert(GR && "Must initialize the SPIRV type registry before lowering args.");
+
+  // Assign types and names to all args, and store their types for later.
+  SmallVector<Register, 4> ArgTypeVRegs;
+  if (VRegs.size() > 0) {
+    unsigned i = 0;
+    for (const auto &Arg : F.args()) {
+      // Currently formal args should use single registers.
+      // TODO: handle the case of multiple registers.
+      if (VRegs[i].size() > 1)
+        return false;
+      auto *SpirvTy =
+          GR->assignTypeToVReg(Arg.getType(), VRegs[i][0], MIRBuilder);
+      ArgTypeVRegs.push_back(GR->getSPIRVTypeID(SpirvTy));
+
+      if (Arg.hasName())
+        buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder);
+      if (Arg.getType()->isPointerTy()) {
+        auto DerefBytes = static_cast<unsigned>(Arg.getDereferenceableBytes());
+        if (DerefBytes != 0)
+          buildOpDecorate(VRegs[i][0], MIRBuilder,
+                          SPIRV::Decoration::MaxByteOffset, {DerefBytes});
+      }
+      if (Arg.hasAttribute(Attribute::Alignment)) {
+        buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Alignment,
+                        {static_cast<unsigned>(Arg.getParamAlignment())});
+      }
+      if (Arg.hasAttribute(Attribute::ReadOnly)) {
+        auto Attr =
+            static_cast<unsigned>(SPIRV::FunctionParameterAttribute::NoWrite);
+        buildOpDecorate(VRegs[i][0], MIRBuilder,
+                        SPIRV::Decoration::FuncParamAttr, {Attr});
+      }
+      if (Arg.hasAttribute(Attribute::ZExt)) {
+        auto Attr =
+            static_cast<unsigned>(SPIRV::FunctionParameterAttribute::Zext);
+        buildOpDecorate(VRegs[i][0], MIRBuilder,
+                        SPIRV::Decoration::FuncParamAttr, {Attr});
+      }
+      ++i;
+    }
+  }
+
+  // Generate a SPIR-V type for the function.
+  auto MRI = MIRBuilder.getMRI();
+  Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
+  MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
+
+  auto *FTy = F.getFunctionType();
+  auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder);
+
+  // Build the OpTypeFunction declaring it.
+  Register ReturnTypeID = FuncTy->getOperand(1).getReg();
+  uint32_t FuncControl = getFunctionControl(F);
+
+  MIRBuilder.buildInstr(SPIRV::OpFunction)
+      .addDef(FuncVReg)
+      .addUse(ReturnTypeID)
+      .addImm(FuncControl)
+      .addUse(GR->getSPIRVTypeID(FuncTy));
+
+  // Add OpFunctionParameters.
+  const unsigned NumArgs = ArgTypeVRegs.size();
+  for (unsigned i = 0; i < NumArgs; ++i) {
+    assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs");
+    MRI->setRegClass(VRegs[i][0], &SPIRV::IDRegClass);
+    MIRBuilder.buildInstr(SPIRV::OpFunctionParameter)
+        .addDef(VRegs[i][0])
+        .addUse(ArgTypeVRegs[i]);
+  }
+  // Name the function.
+  if (F.hasName())
+    buildOpName(FuncVReg, F.getName(), MIRBuilder);
+
+  // Handle entry points and function linkage.
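+  // Roughly, "define spir_kernel void @foo()" produces
+  //   OpEntryPoint Kernel %foo "foo"
+  // while an externally visible non-kernel "define void @bar()" gets
+  //   OpDecorate %bar LinkageAttributes "bar" Export
+  // (illustrative output; the exact asm spelling comes from SPIRVInstPrinter).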
+  if (F.getCallingConv() == CallingConv::SPIR_KERNEL) {
+    auto MIB =
+        MIRBuilder.buildInstr(SPIRV::OpEntryPoint)
+            .addImm(static_cast<uint32_t>(SPIRV::ExecutionModel::Kernel))
+            .addUse(FuncVReg);
+    addStringImm(F.getName(), MIB);
+  } else if (F.getLinkage() == GlobalValue::LinkageTypes::ExternalLinkage ||
+             F.getLinkage() == GlobalValue::LinkOnceODRLinkage) {
+    auto LnkTy = F.isDeclaration() ? SPIRV::LinkageType::Import
+                                   : SPIRV::LinkageType::Export;
+    buildOpDecorate(FuncVReg, MIRBuilder, SPIRV::Decoration::LinkageAttributes,
+                    {static_cast<uint32_t>(LnkTy)}, F.getGlobalIdentifier());
+  }
+
+  return true;
+}
+
+bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                  CallLoweringInfo &Info) const {
+  // Currently call returns should have single vregs.
+  // TODO: handle the case of multiple registers.
+  if (Info.OrigRet.Regs.size() > 1)
+    return false;
+
+  Register ResVReg =
+      Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
+  // Emit a regular OpFunctionCall. If it's an externally declared function,
+  // be sure to emit its type and function declaration here. It will be
+  // hoisted globally later.
+  if (Info.Callee.isGlobal()) {
+    auto *CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
+    // TODO: support constexpr casts and indirect calls.
+    if (CF == nullptr)
+      return false;
+    if (CF->isDeclaration()) {
+      // Emit the type info and forward function declaration to the first MBB
+      // to ensure VReg definition dependencies are valid across all MBBs.
+      MachineBasicBlock::iterator OldII = MIRBuilder.getInsertPt();
+      MachineBasicBlock &OldBB = MIRBuilder.getMBB();
+      MachineBasicBlock &FirstBB = *MIRBuilder.getMF().getBlockNumbered(0);
+      MIRBuilder.setInsertPt(FirstBB, FirstBB.instr_end());
+
+      SmallVector<ArrayRef<Register>, 8> VRegArgs;
+      SmallVector<SmallVector<Register, 1>, 8> ToInsert;
+      for (const Argument &Arg : CF->args()) {
+        if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero())
+          continue; // Don't handle zero sized types.
+        ToInsert.push_back({MIRBuilder.getMRI()->createGenericVirtualRegister(
+            LLT::scalar(32))});
+        VRegArgs.push_back(ToInsert.back());
+      }
+      // TODO: Reuse FunctionLoweringInfo.
+      FunctionLoweringInfo FuncInfo;
+      lowerFormalArguments(MIRBuilder, *CF, VRegArgs, FuncInfo);
+      MIRBuilder.setInsertPt(OldBB, OldII);
+    }
+  }
+
+  // Make sure there's a valid return reg, even for functions returning void.
+  if (!ResVReg.isValid()) {
+    ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass);
+  }
+  SPIRVType *RetType =
+      GR->assignTypeToVReg(Info.OrigRet.Ty, ResVReg, MIRBuilder);
+
+  // Emit the OpFunctionCall and its args.
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpFunctionCall)
+                 .addDef(ResVReg)
+                 .addUse(GR->getSPIRVTypeID(RetType))
+                 .add(Info.Callee);
+
+  for (const auto &Arg : Info.OrigArgs) {
+    // Currently call args should have single vregs.
+    if (Arg.Regs.size() > 1)
+      return false;
+    MIB.addUse(Arg.Regs[0]);
+  }
+  return MIB.constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
+                              *ST.getRegBankInfo());
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
new file mode 100644
index 000000000000..c179bb35154b
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
@@ -0,0 +1,50 @@
+//===--- SPIRVCallLowering.h - Call lowering --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes how to lower LLVM calls to machine code calls.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class SPIRVGlobalRegistry;
+class SPIRVSubtarget;
+class SPIRVTargetLowering;
+
+class SPIRVCallLowering : public CallLowering {
+private:
+  const SPIRVSubtarget &ST;
+  // Used to create and assign function, argument, and return type information.
+  SPIRVGlobalRegistry *GR;
+
+public:
+  SPIRVCallLowering(const SPIRVTargetLowering &TLI, const SPIRVSubtarget &ST,
+                    SPIRVGlobalRegistry *GR);
+
+  // Build OpReturn or OpReturnValue.
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
+                   Register SwiftErrorVReg) const override;
+
+  // Build OpFunction, OpFunctionParameter, and any EntryPoint or Linkage data.
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<ArrayRef<Register>> VRegs,
+                            FunctionLoweringInfo &FLI) const override;
+
+  // Build OpFunctionCall, or replace the call with a builtin function.
+  bool lowerCall(MachineIRBuilder &MIRBuilder,
+                 CallLoweringInfo &Info) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
new file mode 100644
index 000000000000..9624482e3622
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -0,0 +1,433 @@
+//===-- SPIRVEmitIntrinsics.cpp - emit SPIRV intrinsics ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pass emits SPIRV intrinsics keeping essential high-level information for
+// the translation of LLVM IR to SPIR-V.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "SPIRVTargetMachine.h"
+#include "SPIRVUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+
+#include <queue>
+
+// This pass performs the following transformations on the LLVM IR level that
+// are required for the later translation to SPIR-V:
+// - replaces direct usages of aggregate constants with target-specific
+//   intrinsics;
+// - replaces aggregate-related instructions (extract/insert, ld/st, etc.)
+//   with target-specific intrinsics;
+// - emits intrinsics for the global variable initializers since IRTranslator
+//   doesn't handle them and it's not very convenient to translate them
+//   ourselves;
+// - emits intrinsics to keep track of the string names assigned to the values;
+// - emits intrinsics to keep track of constants (this is necessary to have an
+//   LLVM IR constant after the IRTranslation is completed) for their further
+//   deduplication;
+// - emits intrinsics to keep track of original LLVM types of the values
+//   to be able to emit proper SPIR-V types eventually.
+//
+// TODO: consider removing spv.track.constant in favor of spv.assign.type.
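+//
+// As a rough before/after sketch (intrinsic signatures simplified for
+// illustration), an aggregate store such as:
+//   store %struct.S %v, %struct.S* %p
+// is rewritten into something like:
+//   call void @llvm.spv.store(%struct.S %v, %struct.S* %p, i16 flags, i8 align)
+// and a value named "%sum" additionally gets a
+//   call @llvm.spv.assign.name(...)
+// so the name survives into the GMIR stage.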
+
+using namespace llvm;
+
+namespace llvm {
+void initializeSPIRVEmitIntrinsicsPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+class SPIRVEmitIntrinsics
+    : public FunctionPass,
+      public InstVisitor<SPIRVEmitIntrinsics, Instruction *> {
+  SPIRVTargetMachine *TM = nullptr;
+  IRBuilder<> *IRB = nullptr;
+  Function *F = nullptr;
+  bool TrackConstants = true;
+  DenseMap<Instruction *, Constant *> AggrConsts;
+  DenseSet<Instruction *> AggrStores;
+  void preprocessCompositeConstants();
+  CallInst *buildIntrWithMD(Intrinsic::ID IntrID, ArrayRef<Type *> Types,
+                            Value *Arg, Value *Arg2) {
+    ConstantAsMetadata *CM = ValueAsMetadata::getConstant(Arg);
+    MDTuple *TyMD = MDNode::get(F->getContext(), CM);
+    MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD);
+    return IRB->CreateIntrinsic(IntrID, {Types}, {Arg2, VMD});
+  }
+  void replaceMemInstrUses(Instruction *Old, Instruction *New);
+  void processInstrAfterVisit(Instruction *I);
+  void insertAssignTypeIntrs(Instruction *I);
+  void processGlobalValue(GlobalVariable &GV);
+
+public:
+  static char ID;
+  SPIRVEmitIntrinsics() : FunctionPass(ID) {
+    initializeSPIRVEmitIntrinsicsPass(*PassRegistry::getPassRegistry());
+  }
+  SPIRVEmitIntrinsics(SPIRVTargetMachine *_TM) : FunctionPass(ID), TM(_TM) {
+    initializeSPIRVEmitIntrinsicsPass(*PassRegistry::getPassRegistry());
+  }
+  Instruction *visitInstruction(Instruction &I) { return &I; }
+  Instruction *visitSwitchInst(SwitchInst &I);
+  Instruction *visitGetElementPtrInst(GetElementPtrInst &I);
+  Instruction *visitBitCastInst(BitCastInst &I);
+  Instruction *visitInsertElementInst(InsertElementInst &I);
+  Instruction *visitExtractElementInst(ExtractElementInst &I);
+  Instruction *visitInsertValueInst(InsertValueInst &I);
+  Instruction *visitExtractValueInst(ExtractValueInst &I);
+  Instruction *visitLoadInst(LoadInst &I);
+  Instruction *visitStoreInst(StoreInst &I);
+  Instruction *visitAllocaInst(AllocaInst &I);
+  bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char SPIRVEmitIntrinsics::ID = 0;
+
+INITIALIZE_PASS(SPIRVEmitIntrinsics, "emit-intrinsics", "SPIRV emit intrinsics",
+                false, false)
+
+static inline bool isAssignTypeInstr(const Instruction *I) {
+  return isa<IntrinsicInst>(I) &&
+         cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::spv_assign_type;
+}
+
+static bool isMemInstrToReplace(Instruction *I) {
+  return isa<LoadInst>(I) || isa<StoreInst>(I) || isa<ExtractValueInst>(I) ||
+         isa<InsertValueInst>(I);
+}
+
+static bool isAggrToReplace(const Value *V) {
+  return isa<ConstantAggregate>(V) || isa<ConstantDataArray>(V) ||
+         (isa<ConstantAggregateZero>(V) && !V->getType()->isVectorTy());
+}
+
+static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) {
+  if (isa<PHINode>(I))
+    B.SetInsertPoint(I->getParent(), I->getParent()->getFirstInsertionPt());
+  else
+    B.SetInsertPoint(I);
+}
+
+static bool requireAssignType(Instruction *I) {
+  IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+  if (Intr) {
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+      return false;
+    }
+  }
+  return true;
+}
+
+void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
+                                              Instruction *New) {
+  while (!Old->user_empty()) {
+    auto *U = Old->user_back();
+    if (isMemInstrToReplace(U) || isa<GetElementPtrInst>(U)) {
+      U->replaceUsesOfWith(Old, New);
+    } else if (isAssignTypeInstr(U)) {
+      IRB->SetInsertPoint(U);
+      SmallVector<Value *, 2> Args = {New, U->getOperand(1)};
+      IRB->CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
+      U->eraseFromParent();
+    } else {
+      llvm_unreachable("illegal aggregate intrinsic user");
+    }
+  }
+  Old->eraseFromParent();
+}
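+
+// Sketch of what preprocessCompositeConstants() below achieves (illustrative
+// IR): a direct use of an aggregate constant such as
+//   ret [2 x i32] [i32 1, i32 2]
+// becomes
+//   %c = call [2 x i32] @llvm.spv.const.composite(i32 1, i32 2)
+//   ret [2 x i32] %c
+// with the original Constant remembered in AggrConsts for later type
+// assignment and deduplication.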
+
+void SPIRVEmitIntrinsics::preprocessCompositeConstants() {
+  std::queue<Instruction *> Worklist;
+  for (auto &I : instructions(F))
+    Worklist.push(&I);
+
+  while (!Worklist.empty()) {
+    auto *I = Worklist.front();
+    assert(I);
+    bool KeepInst = false;
+    for (const auto &Op : I->operands()) {
+      auto BuildCompositeIntrinsic = [&KeepInst, &Worklist, &I, &Op,
+                                      this](Constant *AggrC,
+                                            ArrayRef<Value *> Args) {
+        IRB->SetInsertPoint(I);
+        auto *CCI =
+            IRB->CreateIntrinsic(Intrinsic::spv_const_composite, {}, {Args});
+        Worklist.push(CCI);
+        I->replaceUsesOfWith(Op, CCI);
+        KeepInst = true;
+        AggrConsts[CCI] = AggrC;
+      };
+
+      if (auto *AggrC = dyn_cast<ConstantAggregate>(Op)) {
+        SmallVector<Value *> Args(AggrC->op_begin(), AggrC->op_end());
+        BuildCompositeIntrinsic(AggrC, Args);
+      } else if (auto *AggrC = dyn_cast<ConstantDataArray>(Op)) {
+        SmallVector<Value *> Args;
+        for (unsigned i = 0; i < AggrC->getNumElements(); ++i)
+          Args.push_back(AggrC->getElementAsConstant(i));
+        BuildCompositeIntrinsic(AggrC, Args);
+      } else if (isa<ConstantAggregateZero>(Op) &&
+                 !Op->getType()->isVectorTy()) {
+        auto *AggrC = cast<ConstantAggregateZero>(Op);
+        SmallVector<Value *> Args(AggrC->op_begin(), AggrC->op_end());
+        BuildCompositeIntrinsic(AggrC, Args);
+      }
+    }
+    if (!KeepInst)
+      Worklist.pop();
+  }
+}
+
+Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) {
+  SmallVector<Value *, 4> Args;
+  for (auto &Op : I.operands())
+    if (Op.get()->getType()->isSized())
+      Args.push_back(Op);
+  IRB->CreateIntrinsic(Intrinsic::spv_switch, {I.getOperand(0)->getType()},
+                       {Args});
+  return &I;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) {
+  SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()};
+  SmallVector<Value *, 4> Args;
+  Args.push_back(IRB->getInt1(I.isInBounds()));
+  for (auto &Op : I.operands())
+    Args.push_back(Op);
+  auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args});
+  I.replaceAllUsesWith(NewI);
+  I.eraseFromParent();
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
+  SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()};
+  SmallVector<Value *> Args(I.op_begin(), I.op_end());
+  auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args});
+  std::string InstName = I.hasName() ? I.getName().str() : "";
+  I.replaceAllUsesWith(NewI);
+  I.eraseFromParent();
+  NewI->setName(InstName);
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) {
+  SmallVector<Type *, 4> Types = {I.getType(), I.getOperand(0)->getType(),
+                                  I.getOperand(1)->getType(),
+                                  I.getOperand(2)->getType()};
+  SmallVector<Value *> Args(I.op_begin(), I.op_end());
+  auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args});
+  std::string InstName = I.hasName() ? I.getName().str() : "";
+  I.replaceAllUsesWith(NewI);
+  I.eraseFromParent();
+  NewI->setName(InstName);
+  return NewI;
+}
+
+Instruction *
+SPIRVEmitIntrinsics::visitExtractElementInst(ExtractElementInst &I) {
+  SmallVector<Type *, 3> Types = {I.getType(), I.getVectorOperandType(),
+                                  I.getIndexOperand()->getType()};
+  SmallVector<Value *, 2> Args = {I.getVectorOperand(), I.getIndexOperand()};
+  auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_extractelt, {Types}, {Args});
+  std::string InstName = I.hasName() ? I.getName().str() : "";
+  I.replaceAllUsesWith(NewI);
+  I.eraseFromParent();
+  NewI->setName(InstName);
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitInsertValueInst(InsertValueInst &I) {
+  SmallVector<Type *, 1> Types = {I.getInsertedValueOperand()->getType()};
+  SmallVector<Value *> Args;
+  for (auto &Op : I.operands())
+    if (isa<UndefValue>(Op))
+      Args.push_back(UndefValue::get(IRB->getInt32Ty()));
+    else
+      Args.push_back(Op);
+  for (auto &Op : I.indices())
+    Args.push_back(IRB->getInt32(Op));
+  Instruction *NewI =
+      IRB->CreateIntrinsic(Intrinsic::spv_insertv, {Types}, {Args});
+  replaceMemInstrUses(&I, NewI);
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitExtractValueInst(ExtractValueInst &I) {
+  SmallVector<Value *> Args;
+  for (auto &Op : I.operands())
+    Args.push_back(Op);
+  for (auto &Op : I.indices())
+    Args.push_back(IRB->getInt32(Op));
+  auto *NewI =
+      IRB->CreateIntrinsic(Intrinsic::spv_extractv, {I.getType()}, {Args});
+  I.replaceAllUsesWith(NewI);
+  I.eraseFromParent();
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitLoadInst(LoadInst &I) {
+  if (!I.getType()->isAggregateType())
+    return &I;
+  TrackConstants = false;
+  const auto *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  MachineMemOperand::Flags Flags =
+      TLI->getLoadMemOperandFlags(I, F->getParent()->getDataLayout());
+  auto *NewI =
+      IRB->CreateIntrinsic(Intrinsic::spv_load, {I.getOperand(0)->getType()},
+                           {I.getPointerOperand(), IRB->getInt16(Flags),
+                            IRB->getInt8(I.getAlign().value())});
+  replaceMemInstrUses(&I, NewI);
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) {
+  if (!AggrStores.contains(&I))
+    return &I;
+  TrackConstants = false;
+  const auto *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  MachineMemOperand::Flags Flags =
+      TLI->getStoreMemOperandFlags(I, F->getParent()->getDataLayout());
+  auto *PtrOp = I.getPointerOperand();
+  auto *NewI =
+      IRB->CreateIntrinsic(Intrinsic::spv_store, {PtrOp->getType()},
+                           {I.getValueOperand(), PtrOp, IRB->getInt16(Flags),
+                            IRB->getInt8(I.getAlign().value())});
+  I.eraseFromParent();
+  return NewI;
+}
+
+Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) {
+  TrackConstants = false;
+  return &I;
+}
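+
+// Illustrative effect of processGlobalValue() below: for
+//   @g = global i32 42
+// the pass emits
+//   call void @llvm.spv.init.global(i32* @g, i32 42)
+// while a never-used, uninitialized global instead gets
+//   call void @llvm.spv.unref.global(i32* @g)
+// so neither the initializer nor the variable itself is lost before ISel.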
+
+void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV) {
+  // Skip the special artificial variable llvm.global.annotations.
+  if (GV.getName() == "llvm.global.annotations")
+    return;
+  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
+    Constant *Init = GV.getInitializer();
+    Type *Ty = isAggrToReplace(Init) ? IRB->getInt32Ty() : Init->getType();
+    Constant *Const = isAggrToReplace(Init) ? IRB->getInt32(1) : Init;
+    auto *InitInst = IRB->CreateIntrinsic(Intrinsic::spv_init_global,
+                                          {GV.getType(), Ty}, {&GV, Const});
+    InitInst->setArgOperand(1, Init);
+  }
+  if ((!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer())) &&
+      GV.getNumUses() == 0)
+    IRB->CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
+}
+
+void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) {
+  Type *Ty = I->getType();
+  if (!Ty->isVoidTy() && requireAssignType(I)) {
+    setInsertPointSkippingPhis(*IRB, I->getNextNode());
+    Type *TypeToAssign = Ty;
+    if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() == Intrinsic::spv_const_composite) {
+        auto t = AggrConsts.find(II);
+        assert(t != AggrConsts.end());
+        TypeToAssign = t->second->getType();
+      }
+    }
+    Constant *Const = Constant::getNullValue(TypeToAssign);
+    buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I);
+  }
+  for (const auto &Op : I->operands()) {
+    if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) ||
+        // Check GetElementPtrConstantExpr case.
+        (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) {
+      IRB->SetInsertPoint(I);
+      buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op);
+    }
+  }
+  // StoreInst's operand type can be changed in the next stage so we need to
+  // store it in the set.
+  if (isa<StoreInst>(I) &&
+      cast<StoreInst>(I)->getValueOperand()->getType()->isAggregateType())
+    AggrStores.insert(I);
+}
+
+void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) {
+  auto *II = dyn_cast<IntrinsicInst>(I);
+  if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite &&
+      TrackConstants) {
+    IRB->SetInsertPoint(I->getNextNode());
+    Type *Ty = IRB->getInt32Ty();
+    auto t = AggrConsts.find(I);
+    assert(t != AggrConsts.end());
+    auto *NewOp =
+        buildIntrWithMD(Intrinsic::spv_track_constant, {Ty, Ty}, t->second, I);
+    I->replaceAllUsesWith(NewOp);
+    NewOp->setArgOperand(0, I);
+  }
+  for (const auto &Op : I->operands()) {
+    if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
+        isa<PHINode>(I) || isa<SwitchInst>(I))
+      TrackConstants = false;
+    if (isa<Constant>(Op) && TrackConstants) {
+      unsigned OpNo = Op.getOperandNo();
+      if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) ||
+                 (II->paramHasAttr(OpNo, Attribute::ImmArg))))
+        continue;
+      IRB->SetInsertPoint(I);
+      auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
+                                    {Op->getType(), Op->getType()}, Op, Op);
+      I->setOperand(OpNo, NewOp);
+    }
+  }
+  if (I->hasName()) {
+    setInsertPointSkippingPhis(*IRB, I->getNextNode());
+    std::vector<Value *> Args = {I};
+    addStringImm(I->getName(), *IRB, Args);
+    IRB->CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args);
+  }
+}
+
+bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
+  if (Func.isDeclaration())
+    return false;
+  F = &Func;
+  IRB = new IRBuilder<>(Func.getContext());
+  AggrConsts.clear();
+  AggrStores.clear();
+
+  IRB->SetInsertPoint(&Func.getEntryBlock().front());
+
+  for (auto &GV : Func.getParent()->globals())
+    processGlobalValue(GV);
+
+  preprocessCompositeConstants();
+  SmallVector<Instruction *> Worklist;
+  for (auto &I : instructions(Func))
+    Worklist.push_back(&I);
+
+  for (auto &I : Worklist)
+    insertAssignTypeIntrs(I);
+
+  for (auto *I : Worklist) {
+    TrackConstants = true;
+    if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
+      IRB->SetInsertPoint(I->getNextNode());
+    I = visit(*I);
+    processInstrAfterVisit(I);
+  }
+  return true;
+}
+
+FunctionPass *llvm::createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM) {
+  return new SPIRVEmitIntrinsics(TM);
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVEnums.td b/llvm/lib/Target/SPIRV/SPIRVEnums.td
new file mode 100644
index 000000000000..1d0c6ffd6e37
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVEnums.td
@@ -0,0 +1,51 @@
+//===-- SPIRVEnums.td - Describe SPIRV Enum Operands -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// All SPIRV enums defined in SPIRVBaseInfo.h should have a corresponding enum
+// operand here. This enables the correct PrintMethod to be defined so
+// its name or mask bits can be automatically printed in SPIRVInstPrinter
+// when referred to in SPIRVInstrInfo.td.
+//
+//===----------------------------------------------------------------------===//
+
+class EnumOperand<string Name> : Operand<i32> {
+  let PrintMethod = "print"#Name;
+}
+
+def ExtInst : EnumOperand<"ExtInst">;
+
+def Capability : EnumOperand<"Capability">;
+def SourceLanguage : EnumOperand<"SourceLanguage">;
+def ExecutionModel : EnumOperand<"ExecutionModel">;
+def AddressingModel : EnumOperand<"AddressingModel">;
+def MemoryModel : EnumOperand<"MemoryModel">;
+def ExecutionMode : EnumOperand<"ExecutionMode">;
+def StorageClass : EnumOperand<"StorageClass">;
+def Dim : EnumOperand<"Dim">;
+def SamplerAddressingMode : EnumOperand<"SamplerAddressingMode">;
+def SamplerFilterMode : EnumOperand<"SamplerFilterMode">;
+def ImageFormat : EnumOperand<"ImageFormat">;
+def ImageChannelOrder : EnumOperand<"ImageChannelOrder">;
+def ImageChannelDataType : EnumOperand<"ImageChannelDataType">;
+def ImageOperand : EnumOperand<"ImageOperand">;
+def FPFastMathMode : EnumOperand<"FPFastMathMode">;
+def FPRoundingMode : EnumOperand<"FPRoundingMode">;
+def LinkageType : EnumOperand<"LinkageType">;
+def AccessQualifier : EnumOperand<"AccessQualifier">;
+def FunctionParameterAttribute : EnumOperand<"FunctionParameterAttribute">;
+def Decoration : EnumOperand<"Decoration">;
+def Builtin : EnumOperand<"Builtin">;
+def SelectionControl: EnumOperand<"SelectionControl">;
+def LoopControl: EnumOperand<"LoopControl">;
+def FunctionControl : EnumOperand<"FunctionControl">;
+def MemorySemantics : EnumOperand<"MemorySemantics">;
+def MemoryOperand : EnumOperand<"MemoryOperand">;
+def Scope : EnumOperand<"Scope">;
+def GroupOperation : EnumOperand<"GroupOperation">;
+def KernelEnqueueFlags : EnumOperand<"KernelEnqueueFlags">;
+def KernelProfilingInfo : EnumOperand<"KernelProfilingInfo">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h
new file mode 100644
index 000000000000..b98f8d0928e5
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h
@@ -0,0 +1,39 @@
+//===-- SPIRVFrameLowering.h - Define frame lowering for SPIR-V -*- C++-*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements SPIRV-specific bits of TargetFrameLowering class.
+// The target uses only virtual registers. It does not operate with stack frame
+// explicitly and does not generate prologues/epilogues of functions.
+// As a result, we are not required to implement the frame lowering
+// functionality substantially.
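+// (Hence the empty emitPrologue/emitEpilogue overrides below, and hasFP()
+// always reporting that no frame pointer is needed.)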
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVFRAMELOWERING_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVFRAMELOWERING_H + +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/Support/Alignment.h" + +namespace llvm { +class SPIRVSubtarget; + +class SPIRVFrameLowering : public TargetFrameLowering { +public: + explicit SPIRVFrameLowering(const SPIRVSubtarget &sti) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0) {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override {} + void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const override {} + + bool hasFP(const MachineFunction &MF) const override { return false; } +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVFRAMELOWERING_H diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp new file mode 100644 index 000000000000..02a6905a1abc --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -0,0 +1,459 @@ +//===-- SPIRVGlobalRegistry.cpp - SPIR-V Global Registry --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the SPIRVGlobalRegistry class, +// which is used to maintain rich type information required for SPIR-V even +// after lowering from LLVM IR to GMIR. It can convert an llvm::Type into +// an OpTypeXXX instruction, and map it to a virtual register. Also it builds +// and supports consistency of constants and global variables. +// +//===----------------------------------------------------------------------===// + +#include "SPIRVGlobalRegistry.h" +#include "SPIRV.h" +#include "SPIRVSubtarget.h" +#include "SPIRVTargetMachine.h" +#include "SPIRVUtils.h" + +using namespace llvm; +SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize) + : PointerSize(PointerSize) {} + +SPIRVType *SPIRVGlobalRegistry::assignTypeToVReg( + const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder, + SPIRV::AccessQualifier AccessQual, bool EmitIR) { + + SPIRVType *SpirvType = + getOrCreateSPIRVType(Type, MIRBuilder, AccessQual, EmitIR); + assignSPIRVTypeToVReg(SpirvType, VReg, MIRBuilder.getMF()); + return SpirvType; +} + +void SPIRVGlobalRegistry::assignSPIRVTypeToVReg(SPIRVType *SpirvType, + Register VReg, + MachineFunction &MF) { + VRegToTypeMap[&MF][VReg] = SpirvType; +} + +static Register createTypeVReg(MachineIRBuilder &MIRBuilder) { + auto &MRI = MIRBuilder.getMF().getRegInfo(); + auto Res = MRI.createGenericVirtualRegister(LLT::scalar(32)); + MRI.setRegClass(Res, &SPIRV::TYPERegClass); + return Res; +} + +static Register createTypeVReg(MachineRegisterInfo &MRI) { + auto Res = MRI.createGenericVirtualRegister(LLT::scalar(32)); + MRI.setRegClass(Res, &SPIRV::TYPERegClass); + return Res; +} + +SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeBool) + .addDef(createTypeVReg(MIRBuilder)); +} + +SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(uint32_t Width, + MachineIRBuilder &MIRBuilder, + bool IsSigned) { + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeInt) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width) + .addImm(IsSigned ? 
1 : 0);
+  return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeFloat(uint32_t Width,
+                                               MachineIRBuilder &MIRBuilder) {
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeFloat)
+                 .addDef(createTypeVReg(MIRBuilder))
+                 .addImm(Width);
+  return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeVoid(MachineIRBuilder &MIRBuilder) {
+  return MIRBuilder.buildInstr(SPIRV::OpTypeVoid)
+      .addDef(createTypeVReg(MIRBuilder));
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems,
+                                                SPIRVType *ElemType,
+                                                MachineIRBuilder &MIRBuilder) {
+  auto EleOpc = ElemType->getOpcode();
+  assert((EleOpc == SPIRV::OpTypeInt || EleOpc == SPIRV::OpTypeFloat ||
+          EleOpc == SPIRV::OpTypeBool) &&
+         "Invalid vector element type");
+
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeVector)
+                 .addDef(createTypeVReg(MIRBuilder))
+                 .addUse(getSPIRVTypeID(ElemType))
+                 .addImm(NumElems);
+  return MIB;
+}
+
+Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
+                                               MachineIRBuilder &MIRBuilder,
+                                               SPIRVType *SpvType,
+                                               bool EmitIR) {
+  auto &MF = MIRBuilder.getMF();
+  Register Res;
+  const IntegerType *LLVMIntTy;
+  if (SpvType)
+    LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType));
+  else
+    LLVMIntTy = IntegerType::getInt32Ty(MF.getFunction().getContext());
+  // Find a constant in DT or build a new one.
+  const auto ConstInt =
+      ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val);
+  unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+  Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
+  assignTypeToVReg(LLVMIntTy, Res, MIRBuilder);
+  if (EmitIR)
+    MIRBuilder.buildConstant(Res, *ConstInt);
+  else
+    MIRBuilder.buildInstr(SPIRV::OpConstantI)
+        .addDef(Res)
+        .addImm(ConstInt->getSExtValue());
+  return Res;
+}
+
+Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
+                                              MachineIRBuilder &MIRBuilder,
+                                              SPIRVType *SpvType) {
+  auto &MF = MIRBuilder.getMF();
+  Register Res;
+  const Type *LLVMFPTy;
+  if (SpvType) {
+    LLVMFPTy = getTypeForSPIRVType(SpvType);
+    assert(LLVMFPTy->isFloatingPointTy());
+  } else {
+    LLVMFPTy = IntegerType::getFloatTy(MF.getFunction().getContext());
+  }
+  // Find a constant in DT or build a new one.
+  const auto ConstFP = ConstantFP::get(LLVMFPTy->getContext(), Val);
+  unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+  Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
+  assignTypeToVReg(LLVMFPTy, Res, MIRBuilder);
+  MIRBuilder.buildFConstant(Res, *ConstFP);
+  return Res;
+}
+
+Register SPIRVGlobalRegistry::buildGlobalVariable(
+    Register ResVReg, SPIRVType *BaseType, StringRef Name,
+    const GlobalValue *GV, SPIRV::StorageClass Storage,
+    const MachineInstr *Init, bool IsConst, bool HasLinkageTy,
+    SPIRV::LinkageType LinkageType, MachineIRBuilder &MIRBuilder,
+    bool IsInstSelector) {
+  const GlobalVariable *GVar = nullptr;
+  if (GV)
+    GVar = cast<const GlobalVariable>(GV);
+  else {
+    // If GV is not passed explicitly, use the name to find or construct
+    // the global variable.
+    Module *M = MIRBuilder.getMF().getFunction().getParent();
+    GVar = M->getGlobalVariable(Name);
+    if (GVar == nullptr) {
+      const Type *Ty = getTypeForSPIRVType(BaseType); // TODO: check type.
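+      // No module-level variable matched the requested name, so an external
+      // declaration is created below from the LLVM type recovered from the
+      // SPIR-V base type.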
+      GVar = new GlobalVariable(*M, const_cast<Type *>(Ty), false,
+                                GlobalValue::ExternalLinkage, nullptr,
+                                Twine(Name));
+    }
+    GV = GVar;
+  }
+  Register Reg;
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpVariable)
+                 .addDef(ResVReg)
+                 .addUse(getSPIRVTypeID(BaseType))
+                 .addImm(static_cast<uint32_t>(Storage));
+
+  if (Init != 0) {
+    MIB.addUse(Init->getOperand(0).getReg());
+  }
+
+  // ISel may introduce a new register on this step, so we need to add it to
+  // DT and correct its type avoiding fails on the next stage.
+  if (IsInstSelector) {
+    const auto &Subtarget = CurMF->getSubtarget();
+    constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(),
+                                     *Subtarget.getRegisterInfo(),
+                                     *Subtarget.getRegBankInfo());
+  }
+  Reg = MIB->getOperand(0).getReg();
+
+  // Set to Reg the same type as ResVReg has.
+  auto MRI = MIRBuilder.getMRI();
+  assert(MRI->getType(ResVReg).isPointer() && "Pointer type is expected");
+  if (Reg != ResVReg) {
+    LLT RegLLTy = LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), 32);
+    MRI->setType(Reg, RegLLTy);
+    assignSPIRVTypeToVReg(BaseType, Reg, MIRBuilder.getMF());
+  }
+
+  // If it's a global variable with name, output OpName for it.
+  if (GVar && GVar->hasName())
+    buildOpName(Reg, GVar->getName(), MIRBuilder);
+
+  // Output decorations for the GV.
+  // TODO: maybe move to GenerateDecorations pass.
+  if (IsConst)
+    buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::Constant, {});
+
+  if (GVar && GVar->getAlign().valueOrOne().value() != 1)
+    buildOpDecorate(
+        Reg, MIRBuilder, SPIRV::Decoration::Alignment,
+        {static_cast<uint32_t>(GVar->getAlign().valueOrOne().value())});
+
+  if (HasLinkageTy)
+    buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::LinkageAttributes,
+                    {static_cast<uint32_t>(LinkageType)}, Name);
+  return Reg;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
+                                               SPIRVType *ElemType,
+                                               MachineIRBuilder &MIRBuilder,
+                                               bool EmitIR) {
+  assert((ElemType->getOpcode() != SPIRV::OpTypeVoid) &&
+         "Invalid array element type");
+  Register NumElementsVReg =
+      buildConstantInt(NumElems, MIRBuilder, nullptr, EmitIR);
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeArray)
+                 .addDef(createTypeVReg(MIRBuilder))
+                 .addUse(getSPIRVTypeID(ElemType))
+                 .addUse(NumElementsVReg);
+  return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypePointer(SPIRV::StorageClass SC,
+                                                 SPIRVType *ElemType,
+                                                 MachineIRBuilder &MIRBuilder) {
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypePointer)
+                 .addDef(createTypeVReg(MIRBuilder))
+                 .addImm(static_cast<uint32_t>(SC))
+                 .addUse(getSPIRVTypeID(ElemType));
+  return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction(
+    SPIRVType *RetType, const SmallVectorImpl<SPIRVType *> &ArgTypes,
+    MachineIRBuilder &MIRBuilder) {
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeFunction)
+                 .addDef(createTypeVReg(MIRBuilder))
+                 .addUse(getSPIRVTypeID(RetType));
+  for (const SPIRVType *ArgType : ArgTypes)
+    MIB.addUse(getSPIRVTypeID(ArgType));
+  return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
+                                                MachineIRBuilder &MIRBuilder,
+                                                SPIRV::AccessQualifier AccQual,
+                                                bool EmitIR) {
+  if (auto IType = dyn_cast<IntegerType>(Ty)) {
+    const unsigned Width = IType->getBitWidth();
+    return Width == 1 ? getOpTypeBool(MIRBuilder)
+                      : getOpTypeInt(Width, MIRBuilder, false);
+  }
+  if (Ty->isFloatingPointTy())
+    return getOpTypeFloat(Ty->getPrimitiveSizeInBits(), MIRBuilder);
+  if (Ty->isVoidTy())
+    return getOpTypeVoid(MIRBuilder);
+  if (Ty->isVectorTy()) {
+    auto El = getOrCreateSPIRVType(cast<FixedVectorType>(Ty)->getElementType(),
+                                   MIRBuilder);
+    return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El,
+                           MIRBuilder);
+  }
+  if (Ty->isArrayTy()) {
+    auto *El = getOrCreateSPIRVType(Ty->getArrayElementType(), MIRBuilder);
+    return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR);
+  }
+  assert(!isa<StructType>(Ty) && "Unsupported StructType");
+  if (auto FType = dyn_cast<FunctionType>(Ty)) {
+    SPIRVType *RetTy = getOrCreateSPIRVType(FType->getReturnType(), MIRBuilder);
+    SmallVector<SPIRVType *> ParamTypes;
+    for (const auto &t : FType->params()) {
+      ParamTypes.push_back(getOrCreateSPIRVType(t, MIRBuilder));
+    }
+    return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder);
+  }
+  if (auto PType = dyn_cast<PointerType>(Ty)) {
+    SPIRVType *SpvElementType;
+    // At the moment, all opaque pointers correspond to i8 element type.
+    // TODO: change the implementation once opaque pointers are supported
+    // in the SPIR-V specification.
+    if (PType->isOpaque()) {
+      SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
+    } else {
+      Type *ElemType = PType->getNonOpaquePointerElementType();
+      // TODO: support OpenCL and SPIRV builtins like image2d_t that are passed
+      // as pointers, but should be treated as custom types like OpTypeImage.
+      assert(!isa<StructType>(ElemType) && "Unsupported StructType pointer");
+
+      // Otherwise, treat it as a regular pointer type.
+      SpvElementType = getOrCreateSPIRVType(
+          ElemType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
+    }
+    auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
+    return getOpTypePointer(SC, SpvElementType, MIRBuilder);
+  }
+  llvm_unreachable("Unable to convert LLVM type to SPIRVType");
+}
+
+SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
+  auto t = VRegToTypeMap.find(CurMF);
+  if (t != VRegToTypeMap.end()) {
+    auto tt = t->second.find(VReg);
+    if (tt != t->second.end())
+      return tt->second;
+  }
+  return nullptr;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
+    const Type *Type, MachineIRBuilder &MIRBuilder,
+    SPIRV::AccessQualifier AccessQual, bool EmitIR) {
+  SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR);
+  VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
+  SPIRVToLLVMType[SpirvType] = Type;
+  return SpirvType;
+}
+
+bool SPIRVGlobalRegistry::isScalarOfType(Register VReg,
+                                         unsigned TypeOpcode) const {
+  SPIRVType *Type = getSPIRVTypeForVReg(VReg);
+  assert(Type && "isScalarOfType VReg has no type assigned");
+  return Type->getOpcode() == TypeOpcode;
+}
+
+bool SPIRVGlobalRegistry::isScalarOrVectorOfType(Register VReg,
+                                                 unsigned TypeOpcode) const {
+  SPIRVType *Type = getSPIRVTypeForVReg(VReg);
+  assert(Type && "isScalarOrVectorOfType VReg has no type assigned");
+  if (Type->getOpcode() == TypeOpcode)
+    return true;
+  if (Type->getOpcode() == SPIRV::OpTypeVector) {
+    Register ScalarTypeVReg = Type->getOperand(1).getReg();
+    SPIRVType *ScalarType = getSPIRVTypeForVReg(ScalarTypeVReg);
+    return ScalarType->getOpcode() == TypeOpcode;
+  }
+  return false;
+}
+
+unsigned
+SPIRVGlobalRegistry::getScalarOrVectorBitWidth(const SPIRVType *Type) const {
+  assert(Type && "Invalid Type pointer");
+  if (Type->getOpcode() == SPIRV::OpTypeVector) {
+    auto EleTypeReg = Type->getOperand(1).getReg();
+    Type = getSPIRVTypeForVReg(EleTypeReg);
+  }
+  if (Type->getOpcode() == SPIRV::OpTypeInt ||
+      Type->getOpcode() == SPIRV::OpTypeFloat)
+    return Type->getOperand(1).getImm();
+  if (Type->getOpcode() == SPIRV::OpTypeBool)
+    return 1;
+  llvm_unreachable("Attempting to get bit width of non-integer/float type.");
+}
+
+bool SPIRVGlobalRegistry::isScalarOrVectorSigned(const SPIRVType *Type) const {
+  assert(Type && "Invalid Type pointer");
+  if (Type->getOpcode() == SPIRV::OpTypeVector) {
+    auto EleTypeReg = Type->getOperand(1).getReg();
+    Type = getSPIRVTypeForVReg(EleTypeReg);
+  }
+  if (Type->getOpcode() == SPIRV::OpTypeInt)
+    return Type->getOperand(2).getImm() != 0;
+  llvm_unreachable("Attempting to get sign of non-integer type.");
+}
+
+SPIRV::StorageClass
+SPIRVGlobalRegistry::getPointerStorageClass(Register VReg) const {
+  SPIRVType *Type = getSPIRVTypeForVReg(VReg);
+  assert(Type && Type->getOpcode() == SPIRV::OpTypePointer &&
+         Type->getOperand(1).isImm() && "Pointer type is expected");
+  return static_cast<SPIRV::StorageClass>(Type->getOperand(1).getImm());
+}
+
+SPIRVType *
+SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
+                                                 MachineIRBuilder &MIRBuilder) {
+  return getOrCreateSPIRVType(
+      IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), BitWidth),
+      MIRBuilder);
+}
+
+SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(Type *LLVMTy,
+                                                      MachineInstrBuilder MIB) {
+  SPIRVType *SpirvType = MIB;
+  VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType;
+  SPIRVToLLVMType[SpirvType] = LLVMTy;
+  return SpirvType;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(
+    unsigned BitWidth, MachineInstr &I, const SPIRVInstrInfo &TII) {
+  Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), BitWidth);
+  MachineBasicBlock &BB = *I.getParent();
+  auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeInt))
+                 .addDef(createTypeVReg(CurMF->getRegInfo()))
+                 .addImm(BitWidth)
+                 .addImm(0);
+  return restOfCreateSPIRVType(LLVMTy, MIB);
+}
+
+SPIRVType *
+SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder) {
+  return getOrCreateSPIRVType(
+      IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), 1),
+      MIRBuilder);
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
+    SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder) {
+  return getOrCreateSPIRVType(
+      FixedVectorType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
+                           NumElements),
+      MIRBuilder);
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
+    SPIRVType *BaseType, unsigned NumElements, MachineInstr &I,
+    const SPIRVInstrInfo &TII) {
+  Type *LLVMTy = FixedVectorType::get(
+      const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements);
+  MachineBasicBlock &BB = *I.getParent();
+  auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeVector))
+                 .addDef(createTypeVReg(CurMF->getRegInfo()))
+                 .addUse(getSPIRVTypeID(BaseType))
+                 .addImm(NumElements);
+  return restOfCreateSPIRVType(LLVMTy, MIB);
+}
+
+SPIRVType *
+SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(SPIRVType *BaseType,
+                                                 MachineIRBuilder &MIRBuilder,
+                                                 SPIRV::StorageClass SClass) {
+  return getOrCreateSPIRVType(
+      PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
+                       storageClassToAddressSpace(SClass)),
+      MIRBuilder);
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(
+    SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII,
+    SPIRV::StorageClass SC) {
+  Type *LLVMTy =
+      PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
+                       storageClassToAddressSpace(SC));
+  MachineBasicBlock &BB = *I.getParent();
+  auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypePointer))
+                 .addDef(createTypeVReg(CurMF->getRegInfo()))
+                 .addImm(static_cast<uint32_t>(SC))
+                 .addUse(getSPIRVTypeID(BaseType));
+  return restOfCreateSPIRVType(LLVMTy, MIB);
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
new file mode 100644
index 000000000000..952ab4c13e29
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -0,0 +1,174 @@
+//===-- SPIRVGlobalRegistry.h - SPIR-V Global Registry ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SPIRVGlobalRegistry is used to maintain rich type information required for
+// SPIR-V even after lowering from LLVM IR to GMIR. It can convert an llvm::Type
+// into an OpTypeXXX instruction, and map it to a virtual register. Also it
+// builds and supports consistency of constants and global variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
+
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRVInstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+namespace llvm {
+using SPIRVType = const MachineInstr;
+
+class SPIRVGlobalRegistry {
+  // Registers holding values which have types associated with them.
+  // Initialized upon VReg definition in IRTranslator.
+  // Do not confuse this with DuplicatesTracker as DT maps Type* to <MF, Reg>
+  // where Reg = OpType...
+  // while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. not
+  // type-declaring ones)
+  DenseMap<MachineFunction *, DenseMap<Register, SPIRVType *>> VRegToTypeMap;
+
+  DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType;
+
+  // Number of bits pointers and size_t integers require.
+  const unsigned PointerSize;
+
+  // Add a new OpTypeXXX instruction without checking for duplicates.
+  SPIRVType *
+  createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
+                  SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
+                  bool EmitIR = true);
+
+public:
+  SPIRVGlobalRegistry(unsigned PointerSize);
+
+  MachineFunction *CurMF;
+
+  // Get or create a SPIR-V type corresponding to the given LLVM IR type,
+  // and map it to the given VReg by creating an ASSIGN_TYPE instruction.
+  SPIRVType *assignTypeToVReg(
+      const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder,
+      SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
+      bool EmitIR = true);
+
+  // In cases where the SPIR-V type is already known, this function can be
+  // used to map it to the given VReg via an ASSIGN_TYPE instruction.
+  void assignSPIRVTypeToVReg(SPIRVType *Type, Register VReg,
+                             MachineFunction &MF);
+
+  // Either generate a new OpTypeXXX instruction or return an existing one
+  // corresponding to the given LLVM IR type.
+  // EmitIR controls if we emit GMIR or SPV constants (e.g. for array sizes)
+  // because this method may be called from InstructionSelector and we don't
+  // want to emit extra IR instructions there.
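+  // A hypothetical call site (the names GR and Ctx are illustrative, not
+  // from this patch):
+  //   SPIRVType *I32Ty = GR.getOrCreateSPIRVType(
+  //       IntegerType::get(Ctx, 32), MIRBuilder,
+  //       SPIRV::AccessQualifier::ReadWrite, /*EmitIR=*/false);
+  // With EmitIR=false, helper constants such as array lengths are emitted
+  // directly as OpConstantI rather than G_CONSTANT (see buildConstantInt).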
+  SPIRVType *getOrCreateSPIRVType(
+      const Type *Type, MachineIRBuilder &MIRBuilder,
+      SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
+      bool EmitIR = true);
+
+  const Type *getTypeForSPIRVType(const SPIRVType *Ty) const {
+    auto Res = SPIRVToLLVMType.find(Ty);
+    assert(Res != SPIRVToLLVMType.end());
+    return Res->second;
+  }
+
+  // Return the SPIR-V type instruction corresponding to the given VReg, or
+  // nullptr if no such type instruction exists.
+  SPIRVType *getSPIRVTypeForVReg(Register VReg) const;
+
+  // Whether the given VReg has a SPIR-V type mapped to it yet.
+  bool hasSPIRVTypeForVReg(Register VReg) const {
+    return getSPIRVTypeForVReg(VReg) != nullptr;
+  }
+
+  // Return the VReg holding the result of the given OpTypeXXX instruction.
+  Register getSPIRVTypeID(const SPIRVType *SpirvType) const {
+    assert(SpirvType && "Attempting to get type id for nullptr type.");
+    return SpirvType->defs().begin()->getReg();
+  }
+
+  void setCurrentFunc(MachineFunction &MF) { CurMF = &MF; }
+
+  // Whether the given VReg has an OpTypeXXX instruction mapped to it with the
+  // given opcode (e.g. OpTypeFloat).
+  bool isScalarOfType(Register VReg, unsigned TypeOpcode) const;
+
+  // Return true if the given VReg's assigned SPIR-V type is either a scalar
+  // matching the given opcode, or a vector with an element type matching that
+  // opcode (e.g. OpTypeBool, or OpTypeVector %x 4, where %x is OpTypeBool).
+  bool isScalarOrVectorOfType(Register VReg, unsigned TypeOpcode) const;
+
+  // For vectors or scalars of ints/floats, return the scalar type's bitwidth.
+  unsigned getScalarOrVectorBitWidth(const SPIRVType *Type) const;
+
+  // For integer vectors or scalars, return whether the integers are signed.
+  bool isScalarOrVectorSigned(const SPIRVType *Type) const;
+
+  // Gets the storage class of the pointer type assigned to this vreg.
+  SPIRV::StorageClass getPointerStorageClass(Register VReg) const;
+
+  // Return the number of bits SPIR-V pointers and size_t variables require.
+  unsigned getPointerSize() const { return PointerSize; }
+
+private:
+  SPIRVType *getOpTypeBool(MachineIRBuilder &MIRBuilder);
+
+  SPIRVType *getOpTypeInt(uint32_t Width, MachineIRBuilder &MIRBuilder,
+                          bool IsSigned = false);
+
+  SPIRVType *getOpTypeFloat(uint32_t Width, MachineIRBuilder &MIRBuilder);
+
+  SPIRVType *getOpTypeVoid(MachineIRBuilder &MIRBuilder);
+
+  SPIRVType *getOpTypeVector(uint32_t NumElems, SPIRVType *ElemType,
+                             MachineIRBuilder &MIRBuilder);
+
+  SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType,
+                            MachineIRBuilder &MIRBuilder, bool EmitIR = true);
+
+  SPIRVType *getOpTypePointer(SPIRV::StorageClass SC, SPIRVType *ElemType,
+                              MachineIRBuilder &MIRBuilder);
+
+  SPIRVType *getOpTypeFunction(SPIRVType *RetType,
+                               const SmallVectorImpl<SPIRVType *> &ArgTypes,
+                               MachineIRBuilder &MIRBuilder);
+  SPIRVType *restOfCreateSPIRVType(Type *LLVMTy, MachineInstrBuilder MIB);
+
+public:
+  Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
+                            SPIRVType *SpvType = nullptr, bool EmitIR = true);
+  Register buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder,
+                           SPIRVType *SpvType = nullptr);
+  Register
+  buildGlobalVariable(Register Reg, SPIRVType *BaseType, StringRef Name,
+                      const GlobalValue *GV, SPIRV::StorageClass Storage,
+                      const MachineInstr *Init, bool IsConst, bool HasLinkageTy,
+                      SPIRV::LinkageType LinkageType,
+                      MachineIRBuilder &MIRBuilder, bool IsInstSelector);
+
+  // Convenient helpers for getting types with check for duplicates.
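+  // For example, getOrCreateSPIRVVectorType(Int32Ty, 4, MIRBuilder) returns
+  // the OpTypeVector of four 32-bit integers, creating the type instruction
+  // only when an equivalent one has not already been emitted (Int32Ty being a
+  // previously created SPIRVType; illustrative usage, not from this patch).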
+  SPIRVType *getOrCreateSPIRVIntegerType(unsigned BitWidth,
+                                         MachineIRBuilder &MIRBuilder);
+  SPIRVType *getOrCreateSPIRVIntegerType(unsigned BitWidth, MachineInstr &I,
+                                         const SPIRVInstrInfo &TII);
+  SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder);
+  SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
+                                        unsigned NumElements,
+                                        MachineIRBuilder &MIRBuilder);
+  SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
+                                        unsigned NumElements, MachineInstr &I,
+                                        const SPIRVInstrInfo &TII);
+
+  SPIRVType *getOrCreateSPIRVPointerType(
+      SPIRVType *BaseType, MachineIRBuilder &MIRBuilder,
+      SPIRV::StorageClass SClass = SPIRV::StorageClass::Function);
+  SPIRVType *getOrCreateSPIRVPointerType(
+      SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII,
+      SPIRV::StorageClass SC = SPIRV::StorageClass::Function);
+};
+} // end namespace llvm
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
new file mode 100644
index 000000000000..66ff51c912b0
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -0,0 +1,45 @@
+//===- SPIRVISelLowering.cpp - SPIR-V DAG Lowering Impl ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPIRVTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVISelLowering.h"
+#include "SPIRV.h"
+
+#define DEBUG_TYPE "spirv-lower"
+
+using namespace llvm;
+
+unsigned SPIRVTargetLowering::getNumRegistersForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
+  // This code avoids CallLowering fail inside getVectorTypeBreakdown
+  // on v3i1 arguments. Maybe we need to return 1 for all types.
+  // TODO: remove it once this case is supported by the default implementation.
+  if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+      (VT.getVectorElementType() == MVT::i1 ||
+       VT.getVectorElementType() == MVT::i8))
+    return 1;
+  return getNumRegisters(Context, VT);
+}
+
+MVT SPIRVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                       CallingConv::ID CC,
+                                                       EVT VT) const {
+  // This code avoids CallLowering fail inside getVectorTypeBreakdown
+  // on v3i1 arguments. Maybe we need to return i32 for all types.
+  // TODO: remove it once this case is supported by the default implementation.
+  if (VT.isVector() && VT.getVectorNumElements() == 3) {
+    if (VT.getVectorElementType() == MVT::i1)
+      return MVT::v4i1;
+    else if (VT.getVectorElementType() == MVT::i8)
+      return MVT::v4i8;
+  }
+  return getRegisterType(Context, VT);
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
new file mode 100644
index 000000000000..bee9220f5248
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -0,0 +1,47 @@
+//===-- SPIRVISelLowering.h - SPIR-V DAG Lowering Interface -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SPIR-V uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVISELLOWERING_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVISELLOWERING_H
+
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+class SPIRVSubtarget;
+
+class SPIRVTargetLowering : public TargetLowering {
+public:
+  explicit SPIRVTargetLowering(const TargetMachine &TM,
+                               const SPIRVSubtarget &STI)
+      : TargetLowering(TM) {}
+
+  // Stop IRTranslator breaking up FMA instrs to preserve types information.
+  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                  EVT) const override {
+    return true;
+  }
+
+  // This is to prevent sexts of non-i64 vector indices which are generated
+  // within general IRTranslator hence type generation for it is omitted.
+  MVT getVectorIdxTy(const DataLayout &DL) const override {
+    return MVT::getIntegerVT(32);
+  }
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override;
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+                                    EVT VT) const override;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVISELLOWERING_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrFormats.td b/llvm/lib/Target/SPIRV/SPIRVInstrFormats.td
new file mode 100644
index 000000000000..c78c8ee11590
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrFormats.td
@@ -0,0 +1,31 @@
+//===-- SPIRVInstrFormats.td - SPIR-V Instruction Formats --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+def StringImm: Operand<i32>{
+  let PrintMethod="printStringImm";
+}
+
+class Op<bits<16> Opcode, dag outs, dag ins, string asmstr,
+         list<dag> pattern = []> : Instruction {
+  field bits<16> Inst;
+
+  let Inst = Opcode;
+
+  let Namespace = "SPIRV";
+  let DecoderNamespace = "SPIRV";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins> : Op<0, outs, ins, ""> {
+  let isPseudo = 1;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
new file mode 100644
index 000000000000..754906308114
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -0,0 +1,195 @@
+//===-- SPIRVInstrInfo.cpp - SPIR-V Instruction Information ------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPIR-V implementation of the TargetInstrInfo class.
+// +//===----------------------------------------------------------------------===// + +#include "SPIRVInstrInfo.h" +#include "SPIRV.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/ErrorHandling.h" + +#define GET_INSTRINFO_CTOR_DTOR +#include "SPIRVGenInstrInfo.inc" + +using namespace llvm; + +SPIRVInstrInfo::SPIRVInstrInfo() : SPIRVGenInstrInfo() {} + +bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case SPIRV::OpConstantTrue: + case SPIRV::OpConstantFalse: + case SPIRV::OpConstantI: + case SPIRV::OpConstantF: + case SPIRV::OpConstantComposite: + case SPIRV::OpConstantSampler: + case SPIRV::OpConstantNull: + case SPIRV::OpSpecConstantTrue: + case SPIRV::OpSpecConstantFalse: + case SPIRV::OpSpecConstant: + case SPIRV::OpSpecConstantComposite: + case SPIRV::OpSpecConstantOp: + case SPIRV::OpUndef: + return true; + default: + return false; + } +} + +bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const { + auto &MRI = MI.getMF()->getRegInfo(); + if (MI.getNumDefs() >= 1 && MI.getOperand(0).isReg()) { + auto DefRegClass = MRI.getRegClassOrNull(MI.getOperand(0).getReg()); + return DefRegClass && DefRegClass->getID() == SPIRV::TYPERegClass.getID(); + } else { + return false; + } +} + +bool SPIRVInstrInfo::isDecorationInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case SPIRV::OpDecorate: + case SPIRV::OpDecorateId: + case SPIRV::OpDecorateString: + case SPIRV::OpMemberDecorate: + case SPIRV::OpMemberDecorateString: + return true; + default: + return false; + } +} + +bool SPIRVInstrInfo::isHeaderInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case SPIRV::OpCapability: + case SPIRV::OpExtension: + case SPIRV::OpExtInstImport: + case SPIRV::OpMemoryModel: + case SPIRV::OpEntryPoint: + case SPIRV::OpExecutionMode: + case SPIRV::OpExecutionModeId: + case SPIRV::OpString: + case SPIRV::OpSourceExtension: + case SPIRV::OpSource: + case SPIRV::OpSourceContinued: + case SPIRV::OpName: + case SPIRV::OpMemberName: + case SPIRV::OpModuleProcessed: + return true; + default: + return isTypeDeclInstr(MI) || isConstantInstr(MI) || isDecorationInstr(MI); + } +} + +// Analyze the branching code at the end of MBB, returning +// true if it cannot be understood (e.g. it's a switch dispatch or isn't +// implemented for a target). Upon success, this returns false and returns +// with the following information in various cases: +// +// 1. If this block ends with no branches (it just falls through to its succ) +// just return false, leaving TBB/FBB null. +// 2. If this block ends with only an unconditional branch, it sets TBB to be +// the destination block. +// 3. If this block ends with a conditional branch and it falls through to a +// successor block, it sets TBB to be the branch destination block and a +// list of operands that evaluate the condition. These operands can be +// passed to other TargetInstrInfo methods to create new branches. +// 4. If this block ends with a conditional branch followed by an +// unconditional branch, it returns the 'true' destination in TBB, the +// 'false' destination in FBB, and a list of operands that evaluate the +// condition. These operands can be passed to other TargetInstrInfo +// methods to create new branches. 
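+//
+// For SPIR-V, case 2 corresponds to a block that ends in OpBranch, and cases
+// 3 and 4 to one that ends in OpBranchConditional, as handled below.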
+//
+// Note that removeBranch and insertBranch must be implemented to support
+// cases where this method returns success.
+//
+// If AllowModify is true, then this routine is allowed to modify the basic
+// block (e.g. delete instructions after the unconditional branch).
+//
+// The CFG information in MBB.Predecessors and MBB.Successors must be valid
+// before calling this function.
+bool SPIRVInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+  if (MBB.empty())
+    return false;
+  auto MI = MBB.getLastNonDebugInstr();
+  if (!MI.isValid())
+    return false;
+  if (MI->getOpcode() == SPIRV::OpBranch) {
+    TBB = MI->getOperand(0).getMBB();
+    return false;
+  } else if (MI->getOpcode() == SPIRV::OpBranchConditional) {
+    Cond.push_back(MI->getOperand(0));
+    TBB = MI->getOperand(1).getMBB();
+    if (MI->getNumOperands() == 3) {
+      FBB = MI->getOperand(2).getMBB();
+    }
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Remove the branching code at the end of the specific MBB.
+// This is only invoked in cases where analyzeBranch returns success. It
+// returns the number of instructions that were removed.
+// If \p BytesRemoved is non-null, report the change in code size from the
+// removed instructions.
+unsigned SPIRVInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  report_fatal_error("Branch removal not supported, as MBB info not propagated"
+                     " to OpPhi instructions. Try using -O0 instead.");
+}
+
+// Insert branch code into the end of the specified MachineBasicBlock. The
+// operands to this method are the same as those returned by analyzeBranch.
+// This is only invoked in cases where analyzeBranch returns success. It
+// returns the number of instructions inserted. If \p BytesAdded is non-null,
+// report the change in code size from the added instructions.
+//
+// It is also invoked by tail merging to add unconditional branches in
+// cases where analyzeBranch doesn't apply because there was no original
+// branch to analyze. At least this much must be implemented, else tail
+// merging needs to be disabled.
+//
+// The CFG information in MBB.Predecessors and MBB.Successors must be valid
+// before calling this function.
+unsigned SPIRVInstrInfo::insertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
+  report_fatal_error("Branch insertion not supported, as MBB info not "
+                     "propagated to OpPhi instructions. Try using "
+                     "-O0 instead.");
+}
+
+void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 const DebugLoc &DL, MCRegister DestReg,
+                                 MCRegister SrcReg, bool KillSrc) const {
+  // Actually we don't need this COPY instruction. However if we do nothing with
+  // it, post RA pseudo instrs expansion just removes it and we get the code
+  // with undef registers. Therefore, we need to replace all uses of dst with
+  // the src register. COPY instr itself will be safely removed later.
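+  // In effect, a COPY such as
+  //   %dst:id = COPY %src:id
+  // is folded away here: every use of %dst is rewritten to %src via
+  // MRI.replaceRegWith(), which leaves the COPY itself trivially dead.
+  // (Illustrative MIR snippet, not part of the original patch.)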
+  assert(I->isCopy() && "Copy instruction is expected");
+  auto DstOp = I->getOperand(0);
+  auto SrcOp = I->getOperand(1);
+  assert(DstOp.isReg() && SrcOp.isReg() &&
+         "Register operands are expected in COPY");
+  auto &MRI = I->getMF()->getRegInfo();
+  MRI.replaceRegWith(DstOp.getReg(), SrcOp.getReg());
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
new file mode 100644
index 000000000000..2600d9cfca2e
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -0,0 +1,54 @@
+//===-- SPIRVInstrInfo.h - SPIR-V Instruction Information -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPIR-V implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVINSTRINFO_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVINSTRINFO_H
+
+#include "SPIRVRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SPIRVGenInstrInfo.inc"
+
+namespace llvm {
+
+class SPIRVInstrInfo : public SPIRVGenInstrInfo {
+  const SPIRVRegisterInfo RI;
+
+public:
+  SPIRVInstrInfo();
+
+  const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
+  bool isHeaderInstr(const MachineInstr &MI) const;
+  bool isConstantInstr(const MachineInstr &MI) const;
+  bool isTypeDeclInstr(const MachineInstr &MI) const;
+  bool isDecorationInstr(const MachineInstr &MI) const;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
+                   bool KillSrc) const override;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVINSTRINFO_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
new file mode 100644
index 000000000000..d6fec5fd0785
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -0,0 +1,732 @@
+//===-- SPIRVInstrInfo.td - Target Description for SPIR-V Target ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the SPIR-V instructions in TableGen format.
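+//
+// Each instruction below instantiates the Op class from
+// SPIRVInstrFormats.td, so its SPIR-V opcode is carried in the 16-bit Inst
+// field and its AsmString mirrors SPIR-V disassembly syntax, e.g.
+// "$res = OpLoad $resType $pointer".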
+// +//===----------------------------------------------------------------------===// + +include "SPIRVInstrFormats.td" +include "SPIRVEnums.td" + +// Codegen only metadata instructions +let isCodeGenOnly=1 in { + def ASSIGN_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>; + def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>; + def GET_ID: Pseudo<(outs ID:$dst_id), (ins ANYID:$src)>; + def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>; + def GET_pID: Pseudo<(outs pID:$dst_id), (ins ANYID:$src)>; + def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>; + def GET_vfID: Pseudo<(outs vfID:$dst_id), (ins ANYID:$src)>; +} + +def SPVTypeBin : SDTypeProfile<1, 2, []>; + +def assigntype : SDNode<"SPIRVISD::AssignType", SPVTypeBin>; + +def : GINodeEquiv; + +class BinOp opCode, list pattern=[]> + : Op; + +class BinOpTyped opCode, RegisterClass CID, SDNode node> + : Op; + +class TernOpTyped opCode, RegisterClass CCond, RegisterClass CID, SDNode node> + : Op; + +multiclass BinOpTypedGen opCode, SDNode node, bit genF = 0, bit genV = 0> { + if genF then + def S: BinOpTyped; + else + def S: BinOpTyped; + if genV then { + if genF then + def V: BinOpTyped; + else + def V: BinOpTyped; + } +} + +multiclass TernOpTypedGen opCode, SDNode node, bit genI = 1, bit genF = 0, bit genV = 0> { + if genF then { + def SFSCond: TernOpTyped; + def SFVCond: TernOpTyped; + } + if genI then { + def SISCond: TernOpTyped; + def SIVCond: TernOpTyped; + } + if genV then { + if genF then { + def VFSCond: TernOpTyped; + def VFVCond: TernOpTyped; + } + if genI then { + def VISCond: TernOpTyped; + def VIVCond: TernOpTyped; + } + } +} + +class UnOp opCode, list pattern=[]> + : Op; +class UnOpTyped opCode, RegisterClass CID, SDNode node> + : Op; + +class SimpleOp opCode>: Op; + +// 3.42.1 Miscellaneous Instructions + +def OpNop: SimpleOp<"OpNop", 0>; +def OpUndef: Op<1, (outs ID:$res), (ins TYPE:$type), "$res = OpUndef $type">; +def OpSizeOf: Op<321, (outs ID:$res), (ins TYPE:$ty, ID:$ptr), "$res = OpSizeOf $ty $ptr">; + +// 3.42.2 Debug Instructions + +def OpSourceContinued: Op<2, (outs), (ins StringImm:$str, variable_ops), + "OpSourceContinued $str">; +def OpSource: Op<3, (outs), (ins SourceLanguage:$lang, i32imm:$version, variable_ops), + "OpSource $lang $version">; +def OpSourceExtension: Op<4, (outs), (ins StringImm:$extension, variable_ops), + "OpSourceExtension $extension">; +def OpName: Op<5, (outs), (ins ANY:$tar, StringImm:$name, variable_ops), "OpName $tar $name">; +def OpMemberName: Op<6, (outs), (ins TYPE:$ty, i32imm:$mem, StringImm:$name, variable_ops), + "OpMemberName $ty $mem $name">; +def OpString: Op<7, (outs ID:$r), (ins StringImm:$s, variable_ops), "$r = OpString $s">; +def OpLine: Op<8, (outs), (ins ID:$file, i32imm:$ln, i32imm:$col), "OpLine $file $ln $col">; +def OpNoLine: Op<317, (outs), (ins), "OpNoLine">; +def OpModuleProcessed: Op<330, (outs), (ins StringImm:$process, variable_ops), + "OpModuleProcessed $process">; + +// 3.42.3 Annotation Instructions + +def OpDecorate: Op<71, (outs), (ins ANY:$target, Decoration:$dec, variable_ops), + "OpDecorate $target $dec">; +def OpMemberDecorate: Op<72, (outs), (ins TYPE:$t, i32imm:$m, Decoration:$d, variable_ops), + "OpMemberDecorate $t $m $d">; + +// TODO Currently some deprecated opcodes are missing: OpDecorationGroup, +// OpGroupDecorate and OpGroupMemberDecorate + +def OpDecorateId: Op<332, (outs), (ins ANY:$target, Decoration:$dec, variable_ops), + "OpDecorateId $target $dec">; +def OpDecorateString: 
Op<5632, (outs), (ins ANY:$t, Decoration:$d, StringImm:$s, variable_ops), + "OpDecorateString $t $d $s">; +def OpMemberDecorateString: Op<5633, (outs), + (ins TYPE:$ty, i32imm:$mem, Decoration:$dec, StringImm:$str, variable_ops), + "OpMemberDecorateString $ty $mem $dec $str">; + +// 3.42.4 Extension Instructions + +def OpExtension: Op<10, (outs), (ins StringImm:$name, variable_ops), "OpExtension $name">; +def OpExtInstImport: Op<11, (outs ID:$res), (ins StringImm:$extInstsName, variable_ops), + "$res = OpExtInstImport $extInstsName">; +def OpExtInst: Op<12, (outs ID:$res), (ins TYPE:$ty, ID:$set, ExtInst:$inst, variable_ops), + "$res = OpExtInst $ty $set $inst">; + +// 3.42.5 Mode-Setting Instructions + +def OpMemoryModel: Op<14, (outs), (ins AddressingModel:$addr, MemoryModel:$mem), + "OpMemoryModel $addr $mem">; +def OpEntryPoint: Op<15, (outs), + (ins ExecutionModel:$model, ID:$entry, StringImm:$name, variable_ops), + "OpEntryPoint $model $entry $name">; +def OpExecutionMode: Op<16, (outs), (ins ID:$entry, ExecutionMode:$mode, variable_ops), + "OpExecutionMode $entry $mode">; +def OpCapability: Op<17, (outs), (ins Capability:$cap), "OpCapability $cap">; +def OpExecutionModeId: Op<331, (outs), (ins ID:$entry, ExecutionMode:$mode, variable_ops), + "OpExecutionModeId $entry $mode">; + +// 3.42.6 Type-Declaration Instructions + +def OpTypeVoid: Op<19, (outs TYPE:$type), (ins), "$type = OpTypeVoid">; +def OpTypeBool: Op<20, (outs TYPE:$type), (ins), "$type = OpTypeBool">; +def OpTypeInt: Op<21, (outs TYPE:$type), (ins i32imm:$width, i32imm:$signedness), + "$type = OpTypeInt $width $signedness">; +def OpTypeFloat: Op<22, (outs TYPE:$type), (ins i32imm:$width), + "$type = OpTypeFloat $width">; +def OpTypeVector: Op<23, (outs TYPE:$type), (ins TYPE:$compType, i32imm:$compCount), + "$type = OpTypeVector $compType $compCount">; +def OpTypeMatrix: Op<24, (outs TYPE:$type), (ins TYPE:$colType, i32imm:$colCount), + "$type = OpTypeMatrix $colType $colCount">; +def OpTypeImage: Op<25, (outs TYPE:$res), (ins TYPE:$sampTy, Dim:$dim, i32imm:$depth, + i32imm:$arrayed, i32imm:$MS, i32imm:$sampled, ImageFormat:$imFormat, variable_ops), + "$res = OpTypeImage $sampTy $dim $depth $arrayed $MS $sampled $imFormat">; +def OpTypeSampler: Op<26, (outs TYPE:$res), (ins), "$res = OpTypeSampler">; +def OpTypeSampledImage: Op<27, (outs TYPE:$res), (ins TYPE:$imageType), + "$res = OpTypeSampledImage $imageType">; +def OpTypeArray: Op<28, (outs TYPE:$type), (ins TYPE:$elementType, ID:$length), + "$type = OpTypeArray $elementType $length">; +def OpTypeRuntimeArray: Op<29, (outs TYPE:$type), (ins TYPE:$elementType), + "$type = OpTypeRuntimeArray $elementType">; +def OpTypeStruct: Op<30, (outs TYPE:$res), (ins variable_ops), "$res = OpTypeStruct">; +def OpTypeOpaque: Op<31, (outs TYPE:$res), (ins StringImm:$name, variable_ops), + "$res = OpTypeOpaque $name">; +def OpTypePointer: Op<32, (outs TYPE:$res), (ins StorageClass:$storage, TYPE:$type), + "$res = OpTypePointer $storage $type">; +def OpTypeFunction: Op<33, (outs TYPE:$funcType), (ins TYPE:$returnType, variable_ops), + "$funcType = OpTypeFunction $returnType">; +def OpTypeEvent: Op<34, (outs TYPE:$res), (ins), "$res = OpTypeEvent">; +def OpTypeDeviceEvent: Op<35, (outs TYPE:$res), (ins), "$res = OpTypeDeviceEvent">; +def OpTypeReserveId: Op<36, (outs TYPE:$res), (ins), "$res = OpTypeReserveId">; +def OpTypeQueue: Op<37, (outs TYPE:$res), (ins), "$res = OpTypeQueue">; +def OpTypePipe: Op<38, (outs TYPE:$res), (ins AccessQualifier:$a), "$res = OpTypePipe $a">; +def 
OpTypeForwardPointer: Op<39, (outs), (ins TYPE:$ptrType, StorageClass:$storageClass), + "OpTypeForwardPointer $ptrType $storageClass">; +def OpTypePipeStorage: Op<322, (outs TYPE:$res), (ins), "$res = OpTypePipeStorage">; +def OpTypeNamedBarrier: Op<327, (outs TYPE:$res), (ins), "$res = OpTypeNamedBarrier">; +def OpTypeAccelerationStructureNV: Op<5341, (outs TYPE:$res), (ins), + "$res = OpTypeAccelerationStructureNV">; +def OpTypeCooperativeMatrixNV: Op<5358, (outs TYPE:$res), + (ins TYPE:$compType, ID:$scope, ID:$rows, ID:$cols), + "$res = OpTypeCooperativeMatrixNV $compType $scope $rows $cols">; + +// 3.42.7 Constant-Creation Instructions + +def imm_to_i32 : SDNodeXFormgetTargetConstant( + N->getValueAP().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def fimm_to_i32 : SDNodeXFormgetTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def gi_bitcast_fimm_to_i32 : GICustomOperandRenderer<"renderFImm32">, + GISDNodeXFormEquiv; + +def gi_bitcast_imm_to_i32 : GICustomOperandRenderer<"renderImm32">, + GISDNodeXFormEquiv; + +def PseudoConstI: IntImmLeaf; +def PseudoConstF: FPImmLeaf; +def ConstPseudoTrue: IntImmLeaf; +def ConstPseudoFalse: IntImmLeaf; +def ConstPseudoNull: IntImmLeaf; + +multiclass IntFPImm opCode, string name> { + def I: Op; + def F: Op; +} + +def OpConstantTrue: Op<41, (outs ID:$dst), (ins TYPE:$src_ty), "$dst = OpConstantTrue $src_ty", + [(set ID:$dst, (assigntype ConstPseudoTrue, TYPE:$src_ty))]>; +def OpConstantFalse: Op<42, (outs ID:$dst), (ins TYPE:$src_ty), "$dst = OpConstantFalse $src_ty", + [(set ID:$dst, (assigntype ConstPseudoFalse, TYPE:$src_ty))]>; + +defm OpConstant: IntFPImm<43, "OpConstant">; + +def OpConstantComposite: Op<44, (outs ID:$res), (ins TYPE:$type, variable_ops), + "$res = OpConstantComposite $type">; +def OpConstantSampler: Op<45, (outs ID:$res), + (ins TYPE:$t, SamplerAddressingMode:$s, i32imm:$p, SamplerFilterMode:$f), + "$res = OpConstantSampler $t $s $p $f">; +def OpConstantNull: Op<46, (outs ID:$dst), (ins TYPE:$src_ty), "$dst = OpConstantNull $src_ty", + [(set ID:$dst, (assigntype ConstPseudoNull, TYPE:$src_ty))]>; + +def OpSpecConstantTrue: Op<48, (outs ID:$r), (ins TYPE:$t), "$r = OpSpecConstantTrue $t">; +def OpSpecConstantFalse: Op<49, (outs ID:$r), (ins TYPE:$t), "$r = OpSpecConstantFalse $t">; +def OpSpecConstant: Op<50, (outs ID:$res), (ins TYPE:$type, i32imm:$imm, variable_ops), + "$res = OpSpecConstant $type $imm">; +def OpSpecConstantComposite: Op<51, (outs ID:$res), (ins TYPE:$type, variable_ops), + "$res = OpSpecConstantComposite $type">; +def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, i32imm:$c, ID:$o, variable_ops), + "$res = OpSpecConstantOp $t $c $o">; + +// 3.42.8 Memory Instructions + +def OpVariable: Op<59, (outs ID:$res), (ins TYPE:$type, StorageClass:$sc, variable_ops), + "$res = OpVariable $type $sc">; +def OpImageTexelPointer: Op<60, (outs ID:$res), + (ins TYPE:$resType, ID:$image, ID:$coord, ID:$sample), + "$res = OpImageTexelPointer $resType $image $coord $sample">; +def OpLoad: Op<61, (outs ID:$res), (ins TYPE:$resType, ID:$pointer, variable_ops), + "$res = OpLoad $resType $pointer">; +def OpStore: Op<62, (outs), (ins ID:$pointer, ID:$objectToStore, variable_ops), + "OpStore $pointer $objectToStore">; +def OpCopyMemory: Op<63, (outs), (ins ID:$dest, ID:$src, variable_ops), + "OpCopyMemory $dest $src">; +def OpCopyMemorySized: Op<64, (outs), (ins ID:$dest, ID:$src, ID:$size, variable_ops), + "OpCopyMemorySized $dest $src $size">; +def 
OpAccessChain: Op<65, (outs ID:$res), (ins TYPE:$type, ID:$base, variable_ops), + "$res = OpAccessChain $type $base">; +def OpInBoundsAccessChain: Op<66, (outs ID:$res), + (ins TYPE:$type, ID:$base, variable_ops), + "$res = OpInBoundsAccessChain $type $base">; +def OpPtrAccessChain: Op<67, (outs ID:$res), + (ins TYPE:$type, ID:$base, ID:$element, variable_ops), + "$res = OpPtrAccessChain $type $base $element">; +def OpArrayLength: Op<68, (outs ID:$res), (ins TYPE:$resTy, ID:$struct, i32imm:$arrayMember), + "$res = OpArrayLength $resTy $struct $arrayMember">; +def OpGenericPtrMemSemantics: Op<69, (outs ID:$res), (ins TYPE:$resType, ID:$pointer), + "$res = OpGenericPtrMemSemantics $resType $pointer">; +def OpInBoundsPtrAccessChain: Op<70, (outs ID:$res), + (ins TYPE:$type, ID:$base, ID:$element, variable_ops), + "$res = OpInBoundsPtrAccessChain $type $base $element">; +def OpPtrEqual: Op<401, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b), + "$res = OpPtrEqual $resType $a $b">; +def OpPtrNotEqual: Op<402, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b), + "$res = OpPtrNotEqual $resType $a $b">; +def OpPtrDiff: Op<403, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b), + "$res = OpPtrDiff $resType $a $b">; + +// 3.42.9 Function Instructions + +def OpFunction: Op<54, (outs ID:$func), + (ins TYPE:$resType, FunctionControl:$funcControl, TYPE:$funcType), + "$func = OpFunction $resType $funcControl $funcType">; +def OpFunctionParameter: Op<55, (outs ID:$arg), (ins TYPE:$type), + "$arg = OpFunctionParameter $type">; +def OpFunctionEnd: Op<56, (outs), (ins), "OpFunctionEnd"> { + let isTerminator=1; +} +def OpFunctionCall: Op<57, (outs ID:$res), (ins TYPE:$resType, ID:$function, variable_ops), + "$res = OpFunctionCall $resType $function">; + +// 3.42.10 Image Instructions + +def OpSampledImage: BinOp<"OpSampledImage", 86>; + +def OpImageSampleImplicitLod: Op<87, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, variable_ops), + "$res = OpImageSampleImplicitLod $type $sampledImage $coord">; +def OpImageSampleExplicitLod: Op<88, (outs ID:$res), + (ins TYPE:$ty, ID:$sImage, ID:$uv, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSampleExplicitLod $ty $sImage $uv $op $i">; + +def OpImageSampleDrefImplicitLod: Op<89, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageSampleDrefImplicitLod $type $sampledImage $dref $coord">; +def OpImageSampleDrefExplicitLod: Op<90, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSampleDrefExplicitLod $ty $im $uv $d $op $i">; + +def OpImageSampleProjImplicitLod: Op<91, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, variable_ops), + "$res = OpImageSampleProjImplicitLod $type $sampledImage $coord">; +def OpImageSampleProjExplicitLod: Op<92, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSampleProjExplicitLod $ty $im $uv $op $i">; + +def OpImageSampleProjDrefImplicitLod: Op<93, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageSampleProjDrefImplicitLod $type $sampledImage $dref $coord">; +def OpImageSampleProjDrefExplicitLod: Op<94, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSampleProjDrefExplicitLod $ty $im $uv $d $op $i">; + +def OpImageFetch: Op<95, (outs ID:$res), + (ins TYPE:$type, ID:$image, ID:$coord, variable_ops), + 
"$res = OpImageFetch $type $image $coord">; +def OpImageGather: Op<96, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$component, variable_ops), + "$res = OpImageGather $type $sampledImage $coord $component">; +def OpImageDrefGather: Op<97, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageDrefGather $type $sampledImage $coord $dref">; + +def OpImageRead: Op<98, (outs ID:$res), + (ins TYPE:$type, ID:$image, ID:$coord, variable_ops), + "$res = OpImageRead $type $image $coord">; +def OpImageWrite: Op<99, (outs), (ins ID:$image, ID:$coord, ID:$texel, variable_ops), + "OpImageWrite $image $coord $texel">; + +def OpImage: UnOp<"OpImage", 100>; +def OpImageQueryFormat: UnOp<"OpImageQueryFormat", 101>; +def OpImageQueryOrder: UnOp<"OpImageQueryOrder", 102>; +def OpImageQuerySizeLod: BinOp<"OpImageQuerySizeLod", 103>; +def OpImageQuerySize: UnOp<"OpImageQuerySize", 104>; +def OpImageQueryLod: BinOp<"OpImageQueryLod", 105>; +def OpImageQueryLevels: UnOp<"OpImageQueryLevels", 106>; +def OpImageQuerySamples: UnOp<"OpImageQuerySamples", 107>; + +def OpImageSparseSampleImplicitLod: Op<305, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, variable_ops), + "$res = OpImageSparseSampleImplicitLod $type $sampledImage $coord">; +def OpImageSparseSampleExplicitLod: Op<306, (outs ID:$res), + (ins TYPE:$ty, ID:$sImage, ID:$uv, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSparseSampleExplicitLod $ty $sImage $uv $op $i">; + +def OpImageSparseSampleDrefImplicitLod: Op<307, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImg, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageSparseSampleDrefImplicitLod $type $sampledImg $dref $coord">; +def OpImageSparseSampleDrefExplicitLod: Op<308, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSparseSampleDrefExplicitLod $ty $im $uv $d $op $i">; + +def OpImageSparseSampleProjImplicitLod: Op<309, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, variable_ops), + "$res = OpImageSparseSampleProjImplicitLod $type $sampledImage $coord">; +def OpImageSparseSampleProjExplicitLod: Op<310, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSparseSampleProjExplicitLod $ty $im $uv $op $i">; + +def OpImageSparseSampleProjDrefImplicitLod: Op<311, (outs ID:$res), + (ins TYPE:$type, ID:$sImage, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageSparseSampleProjDrefImplicitLod $type $sImage $dref $coord">; +def OpImageSparseSampleProjDrefExplicitLod: Op<312, (outs ID:$res), + (ins TYPE:$ty, ID:$im, ID:$uv, ID:$d, ImageOperand:$op, ID:$i, variable_ops), + "$res = OpImageSparseSampleProjDrefExplicitLod $ty $im $uv $d $op $i">; + +def OpImageSparseFetch: Op<313, (outs ID:$res), + (ins TYPE:$type, ID:$image, ID:$coord, variable_ops), + "$res = OpImageSparseFetch $type $image $coord">; +def OpImageSparseGather: Op<314, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$component, variable_ops), + "$res = OpImageSparseGather $type $sampledImage $coord $component">; +def OpImageSparseDrefGather: Op<315, (outs ID:$res), + (ins TYPE:$type, ID:$sampledImage, ID:$coord, ID:$dref, variable_ops), + "$res = OpImageSparseDrefGather $type $sampledImage $coord $dref">; + +def OpImageSparseTexelsResident: UnOp<"OpImageSparseTexelsResident", 316>; + +def OpImageSparseRead: Op<320, (outs ID:$res), + (ins TYPE:$type, ID:$image, ID:$coord, variable_ops), + 
"$res = OpImageSparseRead $type $image $coord">; + +def OpImageSampleFootprintNV: Op<5283, (outs ID:$res), + (ins TYPE:$ty, ID:$sImg, ID:$uv, ID:$granularity, ID:$coarse, variable_ops), + "$res = OpImageSampleFootprintNV $ty $sImg $uv $granularity $coarse">; + +// 3.42.11 Conversion instructions + +def OpConvertFToU : UnOp<"OpConvertFToU", 109>; +def OpConvertFToS : UnOp<"OpConvertFToS", 110>; +def OpConvertSToF : UnOp<"OpConvertSToF", 111>; +def OpConvertUToF : UnOp<"OpConvertUToF", 112>; + +def OpUConvert : UnOp<"OpUConvert", 113>; +def OpSConvert : UnOp<"OpSConvert", 114>; +def OpFConvert : UnOp<"OpFConvert", 115>; + +def OpQuantizeToF16 : UnOp<"OpQuantizeToF16", 116>; + +def OpConvertPtrToU : UnOp<"OpConvertPtrToU", 117>; + +def OpSatConvertSToU : UnOp<"OpSatConvertSToU", 118>; +def OpSatConvertUToS : UnOp<"OpSatConvertUToS", 119>; + +def OpConvertUToPtr : UnOp<"OpConvertUToPtr", 120>; +def OpPtrCastToGeneric : UnOp<"OpPtrCastToGeneric", 121>; +def OpGenericCastToPtr : UnOp<"OpGenericCastToPtr", 122>; +def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, StorageClass:$s), + "$r = OpGenericCastToPtrExplicit $t $p $s">; +def OpBitcast : UnOp<"OpBitcast", 124>; + +// 3.42.12 Composite Instructions + +def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), + "$res = OpVectorExtractDynamic $type $vec $idx", [(set ID:$res, (assigntype (extractelt vID:$vec, ID:$idx), TYPE:$type))]>; + +def OpVectorInsertDynamic: Op<78, (outs ID:$res), (ins TYPE:$ty, ID:$vec, ID:$comp, ID:$idx), + "$res = OpVectorInsertDynamic $ty $vec $comp $idx">; +def OpVectorShuffle: Op<79, (outs ID:$res), (ins TYPE:$ty, ID:$v1, ID:$v2, variable_ops), + "$res = OpVectorShuffle $ty $v1 $v2">; +def OpCompositeConstruct: Op<80, (outs ID:$res), (ins TYPE:$type, variable_ops), + "$res = OpCompositeConstruct $type">; +def OpCompositeExtract: Op<81, (outs ID:$res), (ins TYPE:$type, ID:$base, variable_ops), + "$res = OpCompositeExtract $type $base">; +def OpCompositeInsert: Op<82, (outs ID:$r), (ins TYPE:$ty, ID:$obj, ID:$base, variable_ops), + "$r = OpCompositeInsert $ty $obj $base">; +def OpCopyObject: UnOp<"OpCopyObject", 83>; +def OpTranspose: UnOp<"OpTranspose", 84>; +def OpCopyLogical: UnOp<"OpCopyLogical", 400>; + +// 3.42.13 Arithmetic Instructions + +def OpSNegate: UnOp<"OpSNegate", 126>; +def OpFNegate: UnOpTyped<"OpFNegate", 127, fID, fneg>; +defm OpIAdd: BinOpTypedGen<"OpIAdd", 128, add, 0, 1>; +defm OpFAdd: BinOpTypedGen<"OpFAdd", 129, fadd, 1, 1>; + +defm OpISub: BinOpTypedGen<"OpISub", 130, sub, 0, 1>; +defm OpFSub: BinOpTypedGen<"OpFSub", 131, fsub, 1, 1>; + +defm OpIMul: BinOpTypedGen<"OpIMul", 132, mul, 0, 1>; +defm OpFMul: BinOpTypedGen<"OpFMul", 133, fmul, 1, 1>; + +defm OpUDiv: BinOpTypedGen<"OpUDiv", 134, udiv, 0, 1>; +defm OpSDiv: BinOpTypedGen<"OpSDiv", 135, sdiv, 0, 1>; +defm OpFDiv: BinOpTypedGen<"OpFDiv", 136, fdiv, 1, 1>; + +defm OpUMod: BinOpTypedGen<"OpUMod", 137, urem, 0, 1>; +defm OpSRem: BinOpTypedGen<"OpSRem", 138, srem, 0, 1>; + +def OpSMod: BinOp<"OpSMod", 139>; + +defm OpFRem: BinOpTypedGen<"OpFRem", 140, frem, 1, 1>; +def OpFMod: BinOp<"OpFMod", 141>; + +def OpVectorTimesScalar: BinOp<"OpVectorTimesScalar", 142>; +def OpMatrixTimesScalar: BinOp<"OpMatrixTimesScalar", 143>; +def OpVectorTimesMatrix: BinOp<"OpVectorTimesMatrix", 144>; +def OpMatrixTimesVector: BinOp<"OpMatrixTimesVector", 145>; +def OpMatrixTimesMatrix: BinOp<"OpMatrixTimesMatrix", 146>; + +def OpOuterProduct: BinOp<"OpOuterProduct", 147>; +def OpDot: BinOp<"OpDot", 148>; + 
+def OpIAddCarry: BinOpTyped<"OpIAddCarry", 149, ID, addc>; +def OpISubBorrow: BinOpTyped<"OpISubBorrow", 150, ID, subc>; +def OpUMulExtended: BinOp<"OpUMulExtended", 151>; +def OpSMulExtended: BinOp<"OpSMulExtended", 152>; + +// 3.42.14 Bit Instructions + +defm OpShiftRightLogical: BinOpTypedGen<"OpShiftRightLogical", 194, srl, 0, 1>; +defm OpShiftRightArithmetic: BinOpTypedGen<"OpShiftRightArithmetic", 195, sra, 0, 1>; +defm OpShiftLeftLogical: BinOpTypedGen<"OpShiftLeftLogical", 196, shl, 0, 1>; + +defm OpBitwiseOr: BinOpTypedGen<"OpBitwiseOr", 197, or, 0, 1>; +defm OpBitwiseXor: BinOpTypedGen<"OpBitwiseXor", 198, xor, 0, 1>; +defm OpBitwiseAnd: BinOpTypedGen<"OpBitwiseAnd", 199, and, 0, 1>; +def OpNot: UnOp<"OpNot", 200>; + +def OpBitFieldInsert: Op<201, (outs ID:$res), + (ins TYPE:$ty, ID:$base, ID:$insert, ID:$offset, ID:$count), + "$res = OpBitFieldInsert $ty $base $insert $offset $count">; +def OpBitFieldSExtract: Op<202, (outs ID:$res), + (ins TYPE:$ty, ID:$base, ID:$offset, ID:$count), + "$res = OpBitFieldSExtract $ty $base $offset $count">; +def OpBitFieldUExtract: Op<203, (outs ID:$res), + (ins TYPE:$ty, ID:$base, ID:$offset, ID:$count), + "$res = OpBitFieldUExtract $ty $base $offset $count">; +def OpBitReverse: Op<204, (outs ID:$r), (ins TYPE:$ty, ID:$b), "$r = OpBitReverse $ty $b">; +def OpBitCount: Op<205, (outs ID:$r), (ins TYPE:$ty, ID:$b), "$r = OpBitCount $ty $b">; + +// 3.42.15 Relational and Logical Instructions + +def OpAny: Op<154, (outs ID:$res), (ins TYPE:$ty, ID:$vec), + "$res = OpAny $ty $vec">; +def OpAll: Op<155, (outs ID:$res), (ins TYPE:$ty, ID:$vec), + "$res = OpAll $ty $vec">; + +def OpIsNan: UnOp<"OpIsNan", 156>; +def OpIsInf: UnOp<"OpIsInf", 157>; +def OpIsFinite: UnOp<"OpIsFinite", 158>; +def OpIsNormal: UnOp<"OpIsNormal", 159>; +def OpSignBitSet: UnOp<"OpSignBitSet", 160>; + +def OpLessOrGreater: BinOp<"OpLessOrGreater", 161>; +def OpOrdered: BinOp<"OpOrdered", 162>; +def OpUnordered: BinOp<"OpUnordered", 163>; + +def OpLogicalEqual: BinOp<"OpLogicalEqual", 164>; +def OpLogicalNotEqual: BinOp<"OpLogicalNotEqual", 165>; +def OpLogicalOr: BinOp<"OpLogicalOr", 166>; +def OpLogicalAnd: BinOp<"OpLogicalAnd", 167>; +def OpLogicalNot: UnOp<"OpLogicalNot", 168>; + +defm OpSelect: TernOpTypedGen<"OpSelect", 169, select, 1, 1, 1>; + +def OpIEqual: BinOp<"OpIEqual", 170>; +def OpINotEqual: BinOp<"OpINotEqual", 171>; + +def OpUGreaterThan: BinOp<"OpUGreaterThan", 172>; +def OpSGreaterThan: BinOp<"OpSGreaterThan", 173>; +def OpUGreaterThanEqual: BinOp<"OpUGreaterThanEqual", 174>; +def OpSGreaterThanEqual: BinOp<"OpSGreaterThanEqual", 175>; +def OpULessThan: BinOp<"OpULessThan", 176>; +def OpSLessThan: BinOp<"OpSLessThan", 177>; +def OpULessThanEqual: BinOp<"OpULessThanEqual", 178>; +def OpSLessThanEqual: BinOp<"OpSLessThanEqual", 179>; + +def OpFOrdEqual: BinOp<"OpFOrdEqual", 180>; +def OpFUnordEqual: BinOp<"OpFUnordEqual", 181>; +def OpFOrdNotEqual: BinOp<"OpFOrdNotEqual", 182>; +def OpFUnordNotEqual: BinOp<"OpFUnordNotEqual", 183>; + +def OpFOrdLessThan: BinOp<"OpFOrdLessThan", 184>; +def OpFUnordLessThan: BinOp<"OpFUnordLessThan", 185>; +def OpFOrdGreaterThan: BinOp<"OpFOrdGreaterThan", 186>; +def OpFUnordGreaterThan: BinOp<"OpFUnordGreaterThan", 187>; + +def OpFOrdLessThanEqual: BinOp<"OpFOrdLessThanEqual", 188>; +def OpFUnordLessThanEqual: BinOp<"OpFUnordLessThanEqual", 189>; +def OpFOrdGreaterThanEqual: BinOp<"OpFOrdGreaterThanEqual", 190>; +def OpFUnordGreaterThanEqual: BinOp<"OpFUnordGreaterThanEqual", 191>; + +// 3.42.16 Derivative Instructions + +def 
OpDPdx: UnOp<"OpDPdx", 207>;
+def OpDPdy: UnOp<"OpDPdy", 208>;
+def OpFwidth: UnOp<"OpFwidth", 209>;
+
+def OpDPdxFine: UnOp<"OpDPdxFine", 210>;
+def OpDPdyFine: UnOp<"OpDPdyFine", 211>;
+def OpFwidthFine: UnOp<"OpFwidthFine", 212>;
+
+def OpDPdxCoarse: UnOp<"OpDPdxCoarse", 213>;
+def OpDPdyCoarse: UnOp<"OpDPdyCoarse", 214>;
+def OpFwidthCoarse: UnOp<"OpFwidthCoarse", 215>;
+
+// 3.42.17 Control-Flow Instructions
+
+def OpPhi: Op<245, (outs ID:$res), (ins TYPE:$type, ID:$var0, ID:$block0, variable_ops),
+                  "$res = OpPhi $type $var0 $block0">;
+def OpLoopMerge: Op<246, (outs), (ins ID:$merge, ID:$continue, LoopControl:$lc, variable_ops),
+                  "OpLoopMerge $merge $continue $lc">;
+def OpSelectionMerge: Op<247, (outs), (ins ID:$merge, SelectionControl:$sc),
+                  "OpSelectionMerge $merge $sc">;
+def OpLabel: Op<248, (outs ID:$label), (ins), "$label = OpLabel">;
+let isTerminator=1 in {
+  def OpBranch: Op<249, (outs), (ins ID:$label), "OpBranch $label">;
+  def OpBranchConditional: Op<250, (outs), (ins ID:$cond, ID:$true, ID:$false, variable_ops),
+                  "OpBranchConditional $cond $true $false">;
+  def OpSwitch: Op<251, (outs), (ins ID:$sel, ID:$dflt, variable_ops), "OpSwitch $sel $dflt">;
+}
+let isReturn = 1, hasDelaySlot=0, isBarrier = 0, isTerminator=1, isNotDuplicable = 1 in {
+  def OpKill: SimpleOp<"OpKill", 252>;
+  def OpReturn: SimpleOp<"OpReturn", 253>;
+  def OpReturnValue: Op<254, (outs), (ins ANYID:$ret), "OpReturnValue $ret">;
+  def OpUnreachable: SimpleOp<"OpUnreachable", 255>;
+}
+def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr, $sz">;
+def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr, $sz">;
+
+// 3.42.18 Atomic Instructions
+
+class AtomicOp<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$sem),
+                  !strconcat("$res = ", name, " $ty $ptr $sc $sem")>;
+
+class AtomicOpVal<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$sem, ID:$val),
+                  !strconcat("$res = ", name, " $ty $ptr $sc $sem $val")>;
+
+def OpAtomicLoad: AtomicOp<"OpAtomicLoad", 227>;
+
+def OpAtomicStore: Op<228, (outs), (ins ID:$ptr, ID:$sc, ID:$sem, ID:$val),
+                  "OpAtomicStore $ptr $sc $sem $val">;
+def OpAtomicExchange: Op<229, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$sem, ID:$val),
+                  "$res = OpAtomicExchange $ty $ptr $sc $sem $val">;
+def OpAtomicCompareExchange: Op<230, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq,
+                   ID:$neq, ID:$val, ID:$cmp),
+                  "$res = OpAtomicCompareExchange $ty $ptr $sc $eq $neq $val $cmp">;
+// TODO: Currently the following deprecated opcode is missing:
+// OpAtomicCompareExchangeWeak
+
+def OpAtomicIIncrement: AtomicOp<"OpAtomicIIncrement", 232>;
+def OpAtomicIDecrement: AtomicOp<"OpAtomicIDecrement", 233>;
+
+def OpAtomicIAdd: AtomicOpVal<"OpAtomicIAdd", 234>;
+def OpAtomicISub: AtomicOpVal<"OpAtomicISub", 235>;
+
+def OpAtomicSMin: AtomicOpVal<"OpAtomicSMin", 236>;
+def OpAtomicUMin: AtomicOpVal<"OpAtomicUMin", 237>;
+def OpAtomicSMax: AtomicOpVal<"OpAtomicSMax", 238>;
+def OpAtomicUMax: AtomicOpVal<"OpAtomicUMax", 239>;
+
+def OpAtomicAnd: AtomicOpVal<"OpAtomicAnd", 240>;
+def OpAtomicOr: AtomicOpVal<"OpAtomicOr", 241>;
+def OpAtomicXor: AtomicOpVal<"OpAtomicXor", 242>;
+
+
+def OpAtomicFlagTestAndSet: AtomicOp<"OpAtomicFlagTestAndSet", 318>;
+def OpAtomicFlagClear: Op<319, (outs), (ins ID:$ptr, ID:$sc, ID:$sem),
+                  "OpAtomicFlagClear $ptr $sc $sem">;
+
+// 3.42.19 Primitive Instructions
+
+def OpEmitVertex: SimpleOp<"OpEmitVertex", 218>;
+def OpEndPrimitive: SimpleOp<"OpEndPrimitive", 219>;
+def OpEmitStreamVertex: Op<220, (outs), (ins ID:$stream), "OpEmitStreamVertex $stream">;
+def OpEndStreamPrimitive: Op<221, (outs), (ins ID:$stream), "OpEndStreamPrimitive $stream">;
+
+// 3.42.20 Barrier Instructions
+
+def OpControlBarrier: Op<224, (outs), (ins ID:$exec, ID:$mem, ID:$sem),
+                  "OpControlBarrier $exec $mem $sem">;
+def OpMemoryBarrier: Op<225, (outs), (ins ID:$mem, ID:$sem),
+                  "OpMemoryBarrier $mem $sem">;
+def OpNamedBarrierInitialize: UnOp<"OpNamedBarrierInitialize", 328>;
+def OpMemoryNamedBarrier: Op<329, (outs), (ins ID:$barr, ID:$mem, ID:$sem),
+                  "OpMemoryNamedBarrier $barr $mem $sem">;
+
+// 3.42.21. Group and Subgroup Instructions
+
+def OpGroupAll: Op<261, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
+                  "$res = OpGroupAll $ty $scope $pr">;
+def OpGroupAny: Op<262, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
+                  "$res = OpGroupAny $ty $scope $pr">;
+def OpGroupBroadcast: Op<263, (outs ID:$res), (ins TYPE:$ty, ID:$scope,
+                  ID:$val, ID:$id),
+                  "$res = OpGroupBroadcast $ty $scope $val $id">;
+class OpGroup<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$scope, GroupOperation:$groupOp, ID:$x),
+                  !strconcat("$res = OpGroup", name, " $ty $scope $groupOp $x")>;
+def OpGroupIAdd: OpGroup<"IAdd", 264>;
+def OpGroupFAdd: OpGroup<"FAdd", 265>;
+def OpGroupFMin: OpGroup<"FMin", 266>;
+def OpGroupUMin: OpGroup<"UMin", 267>;
+def OpGroupSMin: OpGroup<"SMin", 268>;
+def OpGroupFMax: OpGroup<"FMax", 269>;
+def OpGroupUMax: OpGroup<"UMax", 270>;
+def OpGroupSMax: OpGroup<"SMax", 271>;
+
+// TODO: 3.42.22. Device-Side Enqueue Instructions
+// TODO: 3.42.23. Pipe Instructions
+
+// 3.42.24. Non-Uniform Instructions
+
+def OpGroupNonUniformElect: Op<333, (outs ID:$res), (ins TYPE:$ty, ID:$scope),
+                  "$res = OpGroupNonUniformElect $ty $scope">;
+class OpGroupNU3<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$scope, ID:$val),
+                  !strconcat("$res = OpGroupNonUniform", name, " $ty $scope $val")>;
+class OpGroupNU4<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$scope, ID:$val, ID:$id),
+                  !strconcat("$res = OpGroupNonUniform", name, " $ty $scope $val $id")>;
+def OpGroupNonUniformAll: OpGroupNU3<"All", 334>;
+def OpGroupNonUniformAny: OpGroupNU3<"Any", 335>;
+def OpGroupNonUniformAllEqual: OpGroupNU3<"AllEqual", 336>;
+def OpGroupNonUniformBroadcast: OpGroupNU4<"Broadcast", 337>;
+def OpGroupNonUniformBroadcastFirst: OpGroupNU3<"BroadcastFirst", 338>;
+def OpGroupNonUniformBallot: OpGroupNU3<"Ballot", 339>;
+def OpGroupNonUniformInverseBallot: OpGroupNU3<"InverseBallot", 340>;
+def OpGroupNonUniformBallotBitExtract: OpGroupNU4<"BallotBitExtract", 341>;
+def OpGroupNonUniformBallotBitCount: Op<342, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$scope, GroupOperation:$groupOp, ID:$val),
+                  "$res = OpGroupNonUniformBallotBitCount "
+                  "$ty $scope $groupOp $val">;
+def OpGroupNonUniformBallotFindLSB: OpGroupNU3<"BallotFindLSB", 343>;
+def OpGroupNonUniformBallotFindMSB: OpGroupNU3<"BallotFindMSB", 344>;
+def OpGroupNonUniformShuffle: OpGroupNU4<"Shuffle", 345>;
+def OpGroupNonUniformShuffleXor: OpGroupNU4<"ShuffleXor", 346>;
+def OpGroupNonUniformShuffleUp: OpGroupNU4<"ShuffleUp", 347>;
+def OpGroupNonUniformShuffleDown: OpGroupNU4<"ShuffleDown", 348>;
+class OpGroupNUGroup<string name, bits<16> opCode>: Op<opCode, (outs ID:$res),
+                  (ins TYPE:$ty, ID:$scope, GroupOperation:$groupOp, ID:$val),
+                  !strconcat("$res = OpGroupNonUniform", name,
+                             " $ty $scope $groupOp $val")>;
+def OpGroupNonUniformIAdd: OpGroupNUGroup<"IAdd", 349>;
+def OpGroupNonUniformFAdd: OpGroupNUGroup<"FAdd", 350>;
+def OpGroupNonUniformIMul: OpGroupNUGroup<"IMul", 351>;
+def OpGroupNonUniformFMul: OpGroupNUGroup<"FMul", 352>;
+def OpGroupNonUniformSMin: OpGroupNUGroup<"SMin", 353>;
+def OpGroupNonUniformUMin: OpGroupNUGroup<"UMin", 354>;
+def OpGroupNonUniformFMin: OpGroupNUGroup<"FMin", 355>;
+def OpGroupNonUniformSMax: OpGroupNUGroup<"SMax", 356>;
+def OpGroupNonUniformUMax: OpGroupNUGroup<"UMax", 357>;
+def OpGroupNonUniformFMax: OpGroupNUGroup<"FMax", 358>;
+def OpGroupNonUniformBitwiseAnd: OpGroupNUGroup<"BitwiseAnd", 359>;
+def OpGroupNonUniformBitwiseOr: OpGroupNUGroup<"BitwiseOr", 360>;
+def OpGroupNonUniformBitwiseXor: OpGroupNUGroup<"BitwiseXor", 361>;
+def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
+def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
+def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
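[Editor's note: a minimal sketch, not part of the patch. The records above expand into SPIRV::Op* opcode enumerators, and defm-generated pairs carry S/V suffixes for the scalar/vector variants (the SPIRV::OpBitwiseAndS / SPIRV::OpBitwiseAndV pair used by selectIntToBool in the selector below is one such pair). The helper name emitAnd is hypothetical; it only illustrates how a hand-written selector feeds these opcodes to BuildMI, with operand order following the asm strings: def, result-type ID, then uses. Assumes the in-tree SPIRV backend headers for the generated enum.]

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
// In-tree, the SPIRV::Op* enum comes from the generated SPIRVGenInstrInfo
// headers pulled in via "SPIRVInstrInfo.h".

// Hypothetical helper: emit "$res = OpBitwiseAndS $ty $a $b" before I.
static llvm::MachineInstrBuilder emitAnd(llvm::MachineBasicBlock &BB,
                                         llvm::MachineInstr &I,
                                         const llvm::TargetInstrInfo &TII,
                                         llvm::Register Res, llvm::Register Ty,
                                         llvm::Register A, llvm::Register B) {
  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpBitwiseAndS))
      .addDef(Res)  // $res
      .addUse(Ty)   // result-type ID, as in the asm strings above
      .addUse(A)
      .addUse(B);
}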
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
new file mode 100644
index 000000000000..9294a60506a8
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -0,0 +1,1268 @@
+//===- SPIRVInstructionSelector.cpp ------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the targeting of the InstructionSelector class for
+// SPIRV.
+// TODO: This should be generated by TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "SPIRVGlobalRegistry.h"
+#include "SPIRVInstrInfo.h"
+#include "SPIRVRegisterBankInfo.h"
+#include "SPIRVRegisterInfo.h"
+#include "SPIRVTargetMachine.h"
+#include "SPIRVUtils.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "spirv-isel"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class SPIRVInstructionSelector : public InstructionSelector {
+  const SPIRVSubtarget &STI;
+  const SPIRVInstrInfo &TII;
+  const SPIRVRegisterInfo &TRI;
+  const RegisterBankInfo &RBI;
+  SPIRVGlobalRegistry &GR;
+  MachineRegisterInfo *MRI;
+
+public:
+  SPIRVInstructionSelector(const SPIRVTargetMachine &TM,
+                           const SPIRVSubtarget &ST,
+                           const RegisterBankInfo &RBI);
+  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
+               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+               BlockFrequencyInfo *BFI) override;
+  // Common selection code. Instruction-specific selection occurs in spvSelect.
+  bool select(MachineInstr &I) override;
+  static const char *getName() { return DEBUG_TYPE; }
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+
+private:
+  // tblgen-erated 'select' implementation, used as the initial selector for
+  // the patterns that don't require complex C++.
+  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+  // All instruction-specific selection that didn't happen in "select()".
+  // It is essentially one large switch/case that delegates to the other
+  // select* methods.
+ bool spvSelect(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectGlobalValue(Register ResVReg, MachineInstr &I, + const MachineInstr *Init = nullptr) const; + + bool selectUnOpWithSrc(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, Register SrcReg, + unsigned Opcode) const; + bool selectUnOp(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + unsigned Opcode) const; + + bool selectLoad(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectStore(MachineInstr &I) const; + + bool selectMemOperation(Register ResVReg, MachineInstr &I) const; + + bool selectAtomicRMW(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, unsigned NewOpcode) const; + + bool selectAtomicCmpXchg(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectFence(MachineInstr &I) const; + + bool selectAddrSpaceCast(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectBitreverse(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectConstVector(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectCmp(Register ResVReg, const SPIRVType *ResType, + unsigned comparisonOpcode, MachineInstr &I) const; + + bool selectICmp(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectFCmp(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx) const; + void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx) const; + + bool selectConst(Register ResVReg, const SPIRVType *ResType, const APInt &Imm, + MachineInstr &I) const; + + bool selectSelect(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + bool IsSigned) const; + bool selectIToF(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + bool IsSigned, unsigned Opcode) const; + bool selectExt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + bool IsSigned) const; + + bool selectTrunc(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectIntToBool(Register IntReg, Register ResVReg, + const SPIRVType *intTy, const SPIRVType *boolTy, + MachineInstr &I) const; + + bool selectOpUndef(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectIntrinsic(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectExtractVal(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectInsertVal(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectExtractElt(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectInsertElt(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectGEP(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectFrameIndex(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectBranch(MachineInstr &I) const; + bool selectBranchCond(MachineInstr &I) const; + + bool selectPhi(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + Register buildI32Constant(uint32_t Val, MachineInstr &I, + const SPIRVType *ResType = nullptr) const; + + Register buildZerosVal(const SPIRVType *ResType, MachineInstr &I) const; + Register buildOnesVal(bool AllOnes, const SPIRVType *ResType, + MachineInstr &I) const; +}; + +} // end anonymous 
namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+SPIRVInstructionSelector::SPIRVInstructionSelector(const SPIRVTargetMachine &TM,
+                                                   const SPIRVSubtarget &ST,
+                                                   const RegisterBankInfo &RBI)
+    : InstructionSelector(), STI(ST), TII(*ST.getInstrInfo()),
+      TRI(*ST.getRegisterInfo()), RBI(RBI), GR(*ST.getSPIRVGlobalRegistry()),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "SPIRVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
+                                       CodeGenCoverage &CoverageInfo,
+                                       ProfileSummaryInfo *PSI,
+                                       BlockFrequencyInfo *BFI) {
+  MRI = &MF.getRegInfo();
+  GR.setCurrentFunc(MF);
+  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
+}
+
+// Defined in SPIRVLegalizerInfo.cpp.
+extern bool isTypeFoldingSupported(unsigned Opcode);
+
+bool SPIRVInstructionSelector::select(MachineInstr &I) {
+  assert(I.getParent() && "Instruction should be in a basic block!");
+  assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+  unsigned Opcode = I.getOpcode();
+  // If it's not a GMIR instruction, we've selected it already.
+  if (!isPreISelGenericOpcode(Opcode)) {
+    if (Opcode == SPIRV::ASSIGN_TYPE) { // These pseudos aren't needed any more.
+      auto *Def = MRI->getVRegDef(I.getOperand(1).getReg());
+      if (isTypeFoldingSupported(Def->getOpcode())) {
+        auto Res = selectImpl(I, *CoverageInfo);
+        assert(Res || Def->getOpcode() == TargetOpcode::G_CONSTANT);
+        if (Res)
+          return Res;
+      }
+      MRI->replaceRegWith(I.getOperand(1).getReg(), I.getOperand(0).getReg());
+      I.removeFromParent();
+    } else if (I.getNumDefs() == 1) {
+      // Make all vregs 32 bits (for SPIR-V IDs).
+      MRI->setType(I.getOperand(0).getReg(), LLT::scalar(32));
+    }
+    return true;
+  }
+
+  if (I.getNumOperands() != I.getNumExplicitOperands()) {
+    LLVM_DEBUG(errs() << "Generic instr has unexpected implicit operands\n");
+    return false;
+  }
+
+  // Common code for getting return reg+type, and removing selected instr
+  // from parent occurs here. Instr-specific selection happens in spvSelect().
+  bool HasDefs = I.getNumDefs() > 0;
+  Register ResVReg = HasDefs ? I.getOperand(0).getReg() : Register(0);
+  SPIRVType *ResType = HasDefs ? GR.getSPIRVTypeForVReg(ResVReg) : nullptr;
+  assert(!HasDefs || ResType || I.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
+  if (spvSelect(ResVReg, ResType, I)) {
+    if (HasDefs) // Make all vregs 32 bits (for SPIR-V IDs).
+ MRI->setType(ResVReg, LLT::scalar(32)); + I.removeFromParent(); + return true; + } + return false; +} + +bool SPIRVInstructionSelector::spvSelect(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(!isTypeFoldingSupported(I.getOpcode()) || + I.getOpcode() == TargetOpcode::G_CONSTANT); + const unsigned Opcode = I.getOpcode(); + switch (Opcode) { + case TargetOpcode::G_CONSTANT: + return selectConst(ResVReg, ResType, I.getOperand(1).getCImm()->getValue(), + I); + case TargetOpcode::G_GLOBAL_VALUE: + return selectGlobalValue(ResVReg, I); + case TargetOpcode::G_IMPLICIT_DEF: + return selectOpUndef(ResVReg, ResType, I); + + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectIntrinsic(ResVReg, ResType, I); + case TargetOpcode::G_BITREVERSE: + return selectBitreverse(ResVReg, ResType, I); + + case TargetOpcode::G_BUILD_VECTOR: + return selectConstVector(ResVReg, ResType, I); + + case TargetOpcode::G_SHUFFLE_VECTOR: { + MachineBasicBlock &BB = *I.getParent(); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpVectorShuffle)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(1).getReg()) + .addUse(I.getOperand(2).getReg()); + for (auto V : I.getOperand(3).getShuffleMask()) + MIB.addImm(V); + return MIB.constrainAllUses(TII, TRI, RBI); + } + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMCPY: + return selectMemOperation(ResVReg, I); + + case TargetOpcode::G_ICMP: + return selectICmp(ResVReg, ResType, I); + case TargetOpcode::G_FCMP: + return selectFCmp(ResVReg, ResType, I); + + case TargetOpcode::G_FRAME_INDEX: + return selectFrameIndex(ResVReg, ResType, I); + + case TargetOpcode::G_LOAD: + return selectLoad(ResVReg, ResType, I); + case TargetOpcode::G_STORE: + return selectStore(I); + + case TargetOpcode::G_BR: + return selectBranch(I); + case TargetOpcode::G_BRCOND: + return selectBranchCond(I); + + case TargetOpcode::G_PHI: + return selectPhi(ResVReg, ResType, I); + + case TargetOpcode::G_FPTOSI: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToS); + case TargetOpcode::G_FPTOUI: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU); + + case TargetOpcode::G_SITOFP: + return selectIToF(ResVReg, ResType, I, true, SPIRV::OpConvertSToF); + case TargetOpcode::G_UITOFP: + return selectIToF(ResVReg, ResType, I, false, SPIRV::OpConvertUToF); + + case TargetOpcode::G_CTPOP: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitCount); + + case TargetOpcode::G_SEXT: + return selectExt(ResVReg, ResType, I, true); + case TargetOpcode::G_ANYEXT: + case TargetOpcode::G_ZEXT: + return selectExt(ResVReg, ResType, I, false); + case TargetOpcode::G_TRUNC: + return selectTrunc(ResVReg, ResType, I); + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FPEXT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpFConvert); + + case TargetOpcode::G_PTRTOINT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertPtrToU); + case TargetOpcode::G_INTTOPTR: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertUToPtr); + case TargetOpcode::G_BITCAST: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); + case TargetOpcode::G_ADDRSPACE_CAST: + return selectAddrSpaceCast(ResVReg, ResType, I); + + case TargetOpcode::G_ATOMICRMW_OR: + return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicOr); + case TargetOpcode::G_ATOMICRMW_ADD: + return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicIAdd); + case TargetOpcode::G_ATOMICRMW_AND: + return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicAnd); + 
case TargetOpcode::G_ATOMICRMW_MAX:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicSMax);
+  case TargetOpcode::G_ATOMICRMW_MIN:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicSMin);
+  case TargetOpcode::G_ATOMICRMW_SUB:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicISub);
+  case TargetOpcode::G_ATOMICRMW_XOR:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicXor);
+  case TargetOpcode::G_ATOMICRMW_UMAX:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicUMax);
+  case TargetOpcode::G_ATOMICRMW_UMIN:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicUMin);
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+    return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicExchange);
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+    return selectAtomicCmpXchg(ResVReg, ResType, I);
+
+  case TargetOpcode::G_FENCE:
+    return selectFence(I);
+
+  default:
+    return false;
+  }
+}
+
+bool SPIRVInstructionSelector::selectUnOpWithSrc(Register ResVReg,
+                                                 const SPIRVType *ResType,
+                                                 MachineInstr &I,
+                                                 Register SrcReg,
+                                                 unsigned Opcode) const {
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(SrcReg)
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectUnOp(Register ResVReg,
+                                          const SPIRVType *ResType,
+                                          MachineInstr &I,
+                                          unsigned Opcode) const {
+  return selectUnOpWithSrc(ResVReg, ResType, I, I.getOperand(1).getReg(),
+                           Opcode);
+}
+
+static SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
+  switch (Ord) {
+  case AtomicOrdering::Acquire:
+    return SPIRV::MemorySemantics::Acquire;
+  case AtomicOrdering::Release:
+    return SPIRV::MemorySemantics::Release;
+  case AtomicOrdering::AcquireRelease:
+    return SPIRV::MemorySemantics::AcquireRelease;
+  case AtomicOrdering::SequentiallyConsistent:
+    return SPIRV::MemorySemantics::SequentiallyConsistent;
+  case AtomicOrdering::Unordered:
+  case AtomicOrdering::Monotonic:
+  case AtomicOrdering::NotAtomic:
+    return SPIRV::MemorySemantics::None;
+  }
+}
+
+static SPIRV::Scope getScope(SyncScope::ID Ord) {
+  switch (Ord) {
+  case SyncScope::SingleThread:
+    return SPIRV::Scope::Invocation;
+  case SyncScope::System:
+    return SPIRV::Scope::Device;
+  default:
+    llvm_unreachable("Unsupported synchronization Scope ID.");
+  }
+}
+
+static void addMemoryOperands(MachineMemOperand *MemOp,
+                              MachineInstrBuilder &MIB) {
+  uint32_t SpvMemOp = static_cast<uint32_t>(SPIRV::MemoryOperand::None);
+  if (MemOp->isVolatile())
+    SpvMemOp |= static_cast<uint32_t>(SPIRV::MemoryOperand::Volatile);
+  if (MemOp->isNonTemporal())
+    SpvMemOp |= static_cast<uint32_t>(SPIRV::MemoryOperand::Nontemporal);
+  if (MemOp->getAlign().value())
+    SpvMemOp |= static_cast<uint32_t>(SPIRV::MemoryOperand::Aligned);
+
+  if (SpvMemOp != static_cast<uint32_t>(SPIRV::MemoryOperand::None)) {
+    MIB.addImm(SpvMemOp);
+    if (SpvMemOp & static_cast<uint32_t>(SPIRV::MemoryOperand::Aligned))
+      MIB.addImm(MemOp->getAlign().value());
+  }
+}
+
+static void addMemoryOperands(uint64_t Flags, MachineInstrBuilder &MIB) {
+  uint32_t SpvMemOp = static_cast<uint32_t>(SPIRV::MemoryOperand::None);
+  if (Flags & MachineMemOperand::Flags::MOVolatile)
+    SpvMemOp |= static_cast<uint32_t>(SPIRV::MemoryOperand::Volatile);
+  if (Flags & MachineMemOperand::Flags::MONonTemporal)
+    SpvMemOp |= static_cast<uint32_t>(SPIRV::MemoryOperand::Nontemporal);
+
+  if (SpvMemOp != static_cast<uint32_t>(SPIRV::MemoryOperand::None))
+    MIB.addImm(SpvMemOp);
+}
+
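[Editor's note: the two addMemoryOperands overloads above fold LLVM-side memory attributes into a single SPIR-V memory-operand bitmask, optionally followed by an alignment literal. Below is a self-contained sketch of the resulting encoding, with the mask bits inlined from the SPIR-V specification (Volatile = 0x1, Aligned = 0x2, Nontemporal = 0x4); the constant and variable names here are illustrative, not the backend's.]

#include <cstdint>
#include <cstdio>

// SPIR-V Memory Operands bits, per the SPIR-V specification.
constexpr uint32_t MemVolatile = 0x1;
constexpr uint32_t MemAligned = 0x2;
constexpr uint32_t MemNontemporal = 0x4;

// Compute the mask a volatile, non-temporal, 4-byte-aligned access carries.
int main() {
  bool IsVolatile = true, IsNonTemporal = true;
  uint64_t AlignBytes = 4; // emitted as a trailing literal when Aligned is set

  uint32_t Mask = 0;
  if (IsVolatile)
    Mask |= MemVolatile;
  if (IsNonTemporal)
    Mask |= MemNontemporal;
  if (AlignBytes)
    Mask |= MemAligned;

  // Textually this corresponds to e.g.
  //   OpLoad %ty %ptr Volatile|Aligned|Nontemporal 4
  std::printf("mask=0x%x align=%llu\n", Mask, (unsigned long long)AlignBytes);
  return 0;
}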
+bool SPIRVInstructionSelector::selectLoad(Register ResVReg,
+                                          const SPIRVType *ResType,
+                                          MachineInstr &I) const {
+  unsigned OpOffset =
+      I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS ? 1 : 0;
+  Register Ptr = I.getOperand(1 + OpOffset).getReg();
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
+                 .addDef(ResVReg)
+                 .addUse(GR.getSPIRVTypeID(ResType))
+                 .addUse(Ptr);
+  if (!I.getNumMemOperands()) {
+    assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+    addMemoryOperands(I.getOperand(2 + OpOffset).getImm(), MIB);
+  } else {
+    addMemoryOperands(*I.memoperands_begin(), MIB);
+  }
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectStore(MachineInstr &I) const {
+  unsigned OpOffset =
+      I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS ? 1 : 0;
+  Register StoreVal = I.getOperand(0 + OpOffset).getReg();
+  Register Ptr = I.getOperand(1 + OpOffset).getReg();
+  MachineBasicBlock &BB = *I.getParent();
+  auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpStore))
+                 .addUse(Ptr)
+                 .addUse(StoreVal);
+  if (!I.getNumMemOperands()) {
+    assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+    addMemoryOperands(I.getOperand(2 + OpOffset).getImm(), MIB);
+  } else {
+    addMemoryOperands(*I.memoperands_begin(), MIB);
+  }
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
+                                                  MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCopyMemorySized))
+                 .addDef(I.getOperand(0).getReg())
+                 .addUse(I.getOperand(1).getReg())
+                 .addUse(I.getOperand(2).getReg());
+  if (I.getNumMemOperands())
+    addMemoryOperands(*I.memoperands_begin(), MIB);
+  bool Result = MIB.constrainAllUses(TII, TRI, RBI);
+  if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg()) {
+    BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), ResVReg)
+        .addUse(MIB->getOperand(0).getReg());
+  }
+  return Result;
+}
+
+bool SPIRVInstructionSelector::selectAtomicRMW(Register ResVReg,
+                                               const SPIRVType *ResType,
+                                               MachineInstr &I,
+                                               unsigned NewOpcode) const {
+  assert(I.hasOneMemOperand());
+  const MachineMemOperand *MemOp = *I.memoperands_begin();
+  uint32_t Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
+  Register ScopeReg = buildI32Constant(Scope, I);
+
+  Register Ptr = I.getOperand(1).getReg();
+  // TODO: Changed as it's implemented in the translator. See test/atomicrmw.ll
+  // auto ScSem =
+  //     getMemSemanticsForStorageClass(GR.getPointerStorageClass(Ptr));
+  AtomicOrdering AO = MemOp->getSuccessOrdering();
+  uint32_t MemSem = static_cast<uint32_t>(getMemSemantics(AO));
+  Register MemSemReg = buildI32Constant(MemSem /*| ScSem*/, I);
+
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(NewOpcode))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(Ptr)
+      .addUse(ScopeReg)
+      .addUse(MemSemReg)
+      .addUse(I.getOperand(2).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
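[Editor's note: the scope and semantics registers materialized in selectAtomicRMW come from getScope/getMemSemantics defined earlier. For reference, a self-contained sketch of the ordering-to-semantics mapping, with the SPIR-V spec's literal bit values substituted for the backend enums (None = 0x0, Acquire = 0x2, Release = 0x4, AcquireRelease = 0x8, SequentiallyConsistent = 0x10); the Ordering enum below is an illustrative stand-in for llvm::AtomicOrdering.]

#include <cstdint>

enum class Ordering {
  NotAtomic, Unordered, Monotonic, Acquire, Release,
  AcquireRelease, SequentiallyConsistent
};

// Map an atomic ordering onto SPIR-V Memory Semantics bits (spec values).
uint32_t memSemantics(Ordering Ord) {
  switch (Ord) {
  case Ordering::Acquire:
    return 0x2; // Acquire
  case Ordering::Release:
    return 0x4; // Release
  case Ordering::AcquireRelease:
    return 0x8; // AcquireRelease
  case Ordering::SequentiallyConsistent:
    return 0x10; // SequentiallyConsistent
  default:
    return 0x0; // None: not atomic, unordered, or monotonic
  }
}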
+
+bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const {
+  AtomicOrdering AO = AtomicOrdering(I.getOperand(0).getImm());
+  uint32_t MemSem = static_cast<uint32_t>(getMemSemantics(AO));
+  Register MemSemReg = buildI32Constant(MemSem, I);
+  SyncScope::ID Ord = SyncScope::ID(I.getOperand(1).getImm());
+  uint32_t Scope = static_cast<uint32_t>(getScope(Ord));
+  Register ScopeReg = buildI32Constant(Scope, I);
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpMemoryBarrier))
+      .addUse(ScopeReg)
+      .addUse(MemSemReg)
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I) const {
+  assert(I.hasOneMemOperand());
+  const MachineMemOperand *MemOp = *I.memoperands_begin();
+  uint32_t Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
+  Register ScopeReg = buildI32Constant(Scope, I);
+
+  Register Ptr = I.getOperand(2).getReg();
+  Register Cmp = I.getOperand(3).getReg();
+  Register Val = I.getOperand(4).getReg();
+
+  SPIRVType *SpvValTy = GR.getSPIRVTypeForVReg(Val);
+  SPIRV::StorageClass SC = GR.getPointerStorageClass(Ptr);
+  uint32_t ScSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC));
+  AtomicOrdering AO = MemOp->getSuccessOrdering();
+  uint32_t MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem;
+  Register MemSemEqReg = buildI32Constant(MemSemEq, I);
+  AtomicOrdering FO = MemOp->getFailureOrdering();
+  uint32_t MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem;
+  Register MemSemNeqReg =
+      MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I);
+  const DebugLoc &DL = I.getDebugLoc();
+  return BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(SpvValTy))
+      .addUse(Ptr)
+      .addUse(ScopeReg)
+      .addUse(MemSemEqReg)
+      .addUse(MemSemNeqReg)
+      .addUse(Val)
+      .addUse(Cmp)
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+static bool isGenericCastablePtr(SPIRV::StorageClass SC) {
+  switch (SC) {
+  case SPIRV::StorageClass::Workgroup:
+  case SPIRV::StorageClass::CrossWorkgroup:
+  case SPIRV::StorageClass::Function:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// In SPIR-V, address space casting can only happen to and from the Generic
+// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function
+// pointers to and from Generic pointers. As such, we can convert e.g. from
+// Workgroup to Function by going via a Generic pointer as an intermediary. All
+// other combinations can only be done by a bitcast, and are probably not safe.
+bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I) const {
+  Register SrcPtr = I.getOperand(1).getReg();
+  SPIRVType *SrcPtrTy = GR.getSPIRVTypeForVReg(SrcPtr);
+  SPIRV::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
+  SPIRV::StorageClass DstSC = GR.getPointerStorageClass(ResVReg);
+
+  // Casting from an eligible pointer to Generic.
+  if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC))
+    return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric);
+  // Casting from Generic to an eligible pointer.
+  if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC))
+    return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr);
+  // Casting between 2 eligible pointers using Generic as an intermediary.
+  if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) {
+    Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+    SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType(
+        SrcPtrTy, I, TII, SPIRV::StorageClass::Generic);
+    MachineBasicBlock &BB = *I.getParent();
+    const DebugLoc &DL = I.getDebugLoc();
+    bool Success = BuildMI(BB, I, DL, TII.get(SPIRV::OpPtrCastToGeneric))
+                       .addDef(Tmp)
+                       .addUse(GR.getSPIRVTypeID(GenericPtrTy))
+                       .addUse(SrcPtr)
+                       .constrainAllUses(TII, TRI, RBI);
+    return Success && BuildMI(BB, I, DL, TII.get(SPIRV::OpGenericCastToPtr))
+                          .addDef(ResVReg)
+                          .addUse(GR.getSPIRVTypeID(ResType))
+                          .addUse(Tmp)
+                          .constrainAllUses(TII, TRI, RBI);
+  }
+  // TODO: Should this case just be disallowed completely?
+  // We're casting 2 other arbitrary address spaces, so have to bitcast.
+  return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
+}
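[Editor's note: the decision tree in selectAddrSpaceCast reduces to a small lookup over (source, destination) storage classes. A standalone sketch making the four outcomes explicit; the enum and function names are illustrative, not backend code.]

#include <cstdio>

enum class SC { Function, Workgroup, CrossWorkgroup, Generic, Other };

static bool genericCastable(SC C) {
  return C == SC::Function || C == SC::Workgroup || C == SC::CrossWorkgroup;
}

// Which SPIR-V opcode(s) an addrspacecast lowers to, per the rules above.
static const char *lowering(SC Src, SC Dst) {
  if (Dst == SC::Generic && genericCastable(Src))
    return "OpPtrCastToGeneric";
  if (Src == SC::Generic && genericCastable(Dst))
    return "OpGenericCastToPtr";
  if (genericCastable(Src) && genericCastable(Dst))
    return "OpPtrCastToGeneric + OpGenericCastToPtr"; // via a Generic temp
  return "OpBitcast"; // arbitrary pair: bitcast fallback, possibly unsafe
}

int main() {
  // e.g. Workgroup -> Function goes through Generic as an intermediary.
  std::printf("%s\n", lowering(SC::Workgroup, SC::Function));
  return 0;
}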
+
+static unsigned getFCmpOpcode(unsigned PredNum) {
+  auto Pred = static_cast<CmpInst::Predicate>(PredNum);
+  switch (Pred) {
+  case CmpInst::FCMP_OEQ:
+    return SPIRV::OpFOrdEqual;
+  case CmpInst::FCMP_OGE:
+    return SPIRV::OpFOrdGreaterThanEqual;
+  case CmpInst::FCMP_OGT:
+    return SPIRV::OpFOrdGreaterThan;
+  case CmpInst::FCMP_OLE:
+    return SPIRV::OpFOrdLessThanEqual;
+  case CmpInst::FCMP_OLT:
+    return SPIRV::OpFOrdLessThan;
+  case CmpInst::FCMP_ONE:
+    return SPIRV::OpFOrdNotEqual;
+  case CmpInst::FCMP_ORD:
+    return SPIRV::OpOrdered;
+  case CmpInst::FCMP_UEQ:
+    return SPIRV::OpFUnordEqual;
+  case CmpInst::FCMP_UGE:
+    return SPIRV::OpFUnordGreaterThanEqual;
+  case CmpInst::FCMP_UGT:
+    return SPIRV::OpFUnordGreaterThan;
+  case CmpInst::FCMP_ULE:
+    return SPIRV::OpFUnordLessThanEqual;
+  case CmpInst::FCMP_ULT:
+    return SPIRV::OpFUnordLessThan;
+  case CmpInst::FCMP_UNE:
+    return SPIRV::OpFUnordNotEqual;
+  case CmpInst::FCMP_UNO:
+    return SPIRV::OpUnordered;
+  default:
+    llvm_unreachable("Unknown predicate type for FCmp");
+  }
+}
+
+static unsigned getICmpOpcode(unsigned PredNum) {
+  auto Pred = static_cast<CmpInst::Predicate>(PredNum);
+  switch (Pred) {
+  case CmpInst::ICMP_EQ:
+    return SPIRV::OpIEqual;
+  case CmpInst::ICMP_NE:
+    return SPIRV::OpINotEqual;
+  case CmpInst::ICMP_SGE:
+    return SPIRV::OpSGreaterThanEqual;
+  case CmpInst::ICMP_SGT:
+    return SPIRV::OpSGreaterThan;
+  case CmpInst::ICMP_SLE:
+    return SPIRV::OpSLessThanEqual;
+  case CmpInst::ICMP_SLT:
+    return SPIRV::OpSLessThan;
+  case CmpInst::ICMP_UGE:
+    return SPIRV::OpUGreaterThanEqual;
+  case CmpInst::ICMP_UGT:
+    return SPIRV::OpUGreaterThan;
+  case CmpInst::ICMP_ULE:
+    return SPIRV::OpULessThanEqual;
+  case CmpInst::ICMP_ULT:
+    return SPIRV::OpULessThan;
+  default:
+    llvm_unreachable("Unknown predicate type for ICmp");
+  }
+}
+
+static unsigned getPtrCmpOpcode(unsigned Pred) {
+  switch (static_cast<CmpInst::Predicate>(Pred)) {
+  case CmpInst::ICMP_EQ:
+    return SPIRV::OpPtrEqual;
+  case CmpInst::ICMP_NE:
+    return SPIRV::OpPtrNotEqual;
+  default:
+    llvm_unreachable("Unknown predicate type for pointer comparison");
+  }
+}
+
+// Return the logical operation, or abort if none exists.
+static unsigned getBoolCmpOpcode(unsigned PredNum) {
+  auto Pred = static_cast<CmpInst::Predicate>(PredNum);
+  switch (Pred) {
+  case CmpInst::ICMP_EQ:
+    return SPIRV::OpLogicalEqual;
+  case CmpInst::ICMP_NE:
+    return SPIRV::OpLogicalNotEqual;
+  default:
+    llvm_unreachable("Unknown predicate type for Bool comparison");
+  }
+}
+
+bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg,
+                                                const SPIRVType *ResType,
+                                                MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpBitReverse))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(I.getOperand(1).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectConstVector(Register ResVReg,
+                                                 const SPIRVType *ResType,
+                                                 MachineInstr &I) const {
+  // TODO: only const case is supported for now.
+  assert(std::all_of(
+      I.operands_begin(), I.operands_end(), [this](const MachineOperand &MO) {
+        if (MO.isDef())
+          return true;
+        if (!MO.isReg())
+          return false;
+        SPIRVType *ConstTy = this->MRI->getVRegDef(MO.getReg());
+        assert(ConstTy && ConstTy->getOpcode() == SPIRV::ASSIGN_TYPE &&
+               ConstTy->getOperand(1).isReg());
+        Register ConstReg = ConstTy->getOperand(1).getReg();
+        const MachineInstr *Const = this->MRI->getVRegDef(ConstReg);
+        assert(Const);
+        return (Const->getOpcode() == TargetOpcode::G_CONSTANT ||
+                Const->getOpcode() == TargetOpcode::G_FCONSTANT);
+      }));
+
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                     TII.get(SPIRV::OpConstantComposite))
+                 .addDef(ResVReg)
+                 .addUse(GR.getSPIRVTypeID(ResType));
+  for (unsigned i = I.getNumExplicitDefs(); i < I.getNumExplicitOperands(); ++i)
+    MIB.addUse(I.getOperand(i).getReg());
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectCmp(Register ResVReg,
+                                         const SPIRVType *ResType,
+                                         unsigned CmpOpc,
+                                         MachineInstr &I) const {
+  Register Cmp0 = I.getOperand(2).getReg();
+  Register Cmp1 = I.getOperand(3).getReg();
+  assert(GR.getSPIRVTypeForVReg(Cmp0)->getOpcode() ==
+             GR.getSPIRVTypeForVReg(Cmp1)->getOpcode() &&
+         "CMP operands should have the same type");
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CmpOpc))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(Cmp0)
+      .addUse(Cmp1)
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectICmp(Register ResVReg,
+                                          const SPIRVType *ResType,
+                                          MachineInstr &I) const {
+  auto Pred = I.getOperand(1).getPredicate();
+  unsigned CmpOpc;
+
+  Register CmpOperand = I.getOperand(2).getReg();
+  if (GR.isScalarOfType(CmpOperand, SPIRV::OpTypePointer))
+    CmpOpc = getPtrCmpOpcode(Pred);
+  else if (GR.isScalarOrVectorOfType(CmpOperand, SPIRV::OpTypeBool))
+    CmpOpc = getBoolCmpOpcode(Pred);
+  else
+    CmpOpc = getICmpOpcode(Pred);
+  return selectCmp(ResVReg, ResType, CmpOpc, I);
+}
+
+void SPIRVInstructionSelector::renderFImm32(MachineInstrBuilder &MIB,
+                                            const MachineInstr &I,
+                                            int OpIdx) const {
+  assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+         "Expected G_FCONSTANT");
+  const ConstantFP *FPImm = I.getOperand(1).getFPImm();
+  addNumImm(FPImm->getValueAPF().bitcastToAPInt(), MIB);
+}
+
+void SPIRVInstructionSelector::renderImm32(MachineInstrBuilder &MIB,
+                                           const
MachineInstr &I, + int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); + addNumImm(I.getOperand(1).getCImm()->getValue(), MIB); +} + +Register +SPIRVInstructionSelector::buildI32Constant(uint32_t Val, MachineInstr &I, + const SPIRVType *ResType) const { + const SPIRVType *SpvI32Ty = + ResType ? ResType : GR.getOrCreateSPIRVIntegerType(32, I, TII); + Register NewReg; + NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + MachineInstr *MI; + MachineBasicBlock &BB = *I.getParent(); + if (Val == 0) { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)); + } else { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)) + .addImm(APInt(32, Val).getZExtValue()); + } + constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + return NewReg; +} + +bool SPIRVInstructionSelector::selectFCmp(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + unsigned CmpOp = getFCmpOpcode(I.getOperand(1).getPredicate()); + return selectCmp(ResVReg, ResType, CmpOp, I); +} + +Register SPIRVInstructionSelector::buildZerosVal(const SPIRVType *ResType, + MachineInstr &I) const { + return buildI32Constant(0, I, ResType); +} + +Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes, + const SPIRVType *ResType, + MachineInstr &I) const { + unsigned BitWidth = GR.getScalarOrVectorBitWidth(ResType); + APInt One = AllOnes ? APInt::getAllOnesValue(BitWidth) + : APInt::getOneBitSet(BitWidth, 0); + Register OneReg = buildI32Constant(One.getZExtValue(), I, ResType); + if (ResType->getOpcode() == SPIRV::OpTypeVector) { + const unsigned NumEles = ResType->getOperand(2).getImm(); + Register OneVec = MRI->createVirtualRegister(&SPIRV::IDRegClass); + unsigned Opcode = SPIRV::OpConstantComposite; + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(OneVec) + .addUse(GR.getSPIRVTypeID(ResType)); + for (unsigned i = 0; i < NumEles; ++i) + MIB.addUse(OneReg); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return OneVec; + } + return OneReg; +} + +bool SPIRVInstructionSelector::selectSelect(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsSigned) const { + // To extend a bool, we need to use OpSelect between constants. + Register ZeroReg = buildZerosVal(ResType, I); + Register OneReg = buildOnesVal(IsSigned, ResType, I); + bool IsScalarBool = + GR.isScalarOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool); + unsigned Opcode = + IsScalarBool ? SPIRV::OpSelectSISCond : SPIRV::OpSelectSIVCond; + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(1).getReg()) + .addUse(OneReg) + .addUse(ZeroReg) + .constrainAllUses(TII, TRI, RBI); +} + +bool SPIRVInstructionSelector::selectIToF(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, bool IsSigned, + unsigned Opcode) const { + Register SrcReg = I.getOperand(1).getReg(); + // We can convert bool value directly to float type without OpConvert*ToF, + // however the translator generates OpSelect+OpConvert*ToF, so we do the same. 
+ if (GR.isScalarOrVectorOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool)) { + unsigned BitWidth = GR.getScalarOrVectorBitWidth(ResType); + SPIRVType *TmpType = GR.getOrCreateSPIRVIntegerType(BitWidth, I, TII); + if (ResType->getOpcode() == SPIRV::OpTypeVector) { + const unsigned NumElts = ResType->getOperand(2).getImm(); + TmpType = GR.getOrCreateSPIRVVectorType(TmpType, NumElts, I, TII); + } + SrcReg = MRI->createVirtualRegister(&SPIRV::IDRegClass); + selectSelect(SrcReg, TmpType, I, false); + } + return selectUnOpWithSrc(ResVReg, ResType, I, SrcReg, Opcode); +} + +bool SPIRVInstructionSelector::selectExt(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, bool IsSigned) const { + if (GR.isScalarOrVectorOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool)) + return selectSelect(ResVReg, ResType, I, IsSigned); + unsigned Opcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert; + return selectUnOp(ResVReg, ResType, I, Opcode); +} + +bool SPIRVInstructionSelector::selectIntToBool(Register IntReg, + Register ResVReg, + const SPIRVType *IntTy, + const SPIRVType *BoolTy, + MachineInstr &I) const { + // To truncate to a bool, we use OpBitwiseAnd 1 and OpINotEqual to zero. + Register BitIntReg = MRI->createVirtualRegister(&SPIRV::IDRegClass); + bool IsVectorTy = IntTy->getOpcode() == SPIRV::OpTypeVector; + unsigned Opcode = IsVectorTy ? SPIRV::OpBitwiseAndV : SPIRV::OpBitwiseAndS; + Register Zero = buildZerosVal(IntTy, I); + Register One = buildOnesVal(false, IntTy, I); + MachineBasicBlock &BB = *I.getParent(); + BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(BitIntReg) + .addUse(GR.getSPIRVTypeID(IntTy)) + .addUse(IntReg) + .addUse(One) + .constrainAllUses(TII, TRI, RBI); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpINotEqual)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(BoolTy)) + .addUse(BitIntReg) + .addUse(Zero) + .constrainAllUses(TII, TRI, RBI); +} + +bool SPIRVInstructionSelector::selectTrunc(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + if (GR.isScalarOrVectorOfType(ResVReg, SPIRV::OpTypeBool)) { + Register IntReg = I.getOperand(1).getReg(); + const SPIRVType *ArgType = GR.getSPIRVTypeForVReg(IntReg); + return selectIntToBool(IntReg, ResVReg, ArgType, ResType, I); + } + bool IsSigned = GR.isScalarOrVectorSigned(ResType); + unsigned Opcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert; + return selectUnOp(ResVReg, ResType, I, Opcode); +} + +bool SPIRVInstructionSelector::selectConst(Register ResVReg, + const SPIRVType *ResType, + const APInt &Imm, + MachineInstr &I) const { + assert(ResType->getOpcode() != SPIRV::OpTypePointer || Imm.isNullValue()); + MachineBasicBlock &BB = *I.getParent(); + if (ResType->getOpcode() == SPIRV::OpTypePointer && Imm.isNullValue()) { + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .constrainAllUses(TII, TRI, RBI); + } + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)); + // <=32-bit integers should be caught by the sdag pattern. 
+  assert(Imm.getBitWidth() > 32);
+  addNumImm(Imm, MIB);
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectOpUndef(Register ResVReg,
+                                             const SPIRVType *ResType,
+                                             MachineInstr &I) const {
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpUndef))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI) {
+  assert(MO.isReg());
+  const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg());
+  if (TypeInst->getOpcode() != SPIRV::ASSIGN_TYPE)
+    return false;
+  assert(TypeInst->getOperand(1).isReg());
+  MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg());
+  return ImmInst->getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+static int64_t foldImm(const MachineOperand &MO, MachineRegisterInfo *MRI) {
+  const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg());
+  MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg());
+  assert(ImmInst->getOpcode() == TargetOpcode::G_CONSTANT);
+  return ImmInst->getOperand(1).getCImm()->getZExtValue();
+}
+
+bool SPIRVInstructionSelector::selectInsertVal(Register ResVReg,
+                                               const SPIRVType *ResType,
+                                               MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      // object to insert
+      .addUse(I.getOperand(3).getReg())
+      // composite to insert into
+      .addUse(I.getOperand(2).getReg())
+      // TODO: support arbitrary number of indices
+      .addImm(foldImm(I.getOperand(4), MRI))
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectExtractVal(Register ResVReg,
+                                                const SPIRVType *ResType,
+                                                MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(I.getOperand(2).getReg())
+      // TODO: support arbitrary number of indices
+      .addImm(foldImm(I.getOperand(3), MRI))
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectInsertElt(Register ResVReg,
+                                               const SPIRVType *ResType,
+                                               MachineInstr &I) const {
+  if (isImm(I.getOperand(4), MRI))
+    return selectInsertVal(ResVReg, ResType, I);
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpVectorInsertDynamic))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(I.getOperand(2).getReg())
+      .addUse(I.getOperand(3).getReg())
+      .addUse(I.getOperand(4).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectExtractElt(Register ResVReg,
+                                                const SPIRVType *ResType,
+                                                MachineInstr &I) const {
+  if (isImm(I.getOperand(3), MRI))
+    return selectExtractVal(ResVReg, ResType, I);
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpVectorExtractDynamic))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(I.getOperand(2).getReg())
+      .addUse(I.getOperand(3).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectGEP(Register ResVReg,
+                                         const SPIRVType *ResType,
+                                         MachineInstr &I) const {
+  // In general we should also support OpAccessChain instrs here (i.e. not
+  // PtrAccessChain), but the SPIRV-LLVM Translator doesn't emit them at all,
+  // so neither do we, to stay compliant with its tests and, more importantly,
+  // with its consumers.
+  unsigned Opcode = I.getOperand(2).getImm() ? SPIRV::OpInBoundsPtrAccessChain
+                                             : SPIRV::OpPtrAccessChain;
+  auto Res = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
+                 .addDef(ResVReg)
+                 .addUse(GR.getSPIRVTypeID(ResType))
+                 // Object to get a pointer to.
+                 .addUse(I.getOperand(3).getReg());
+  // Add the indices.
+  for (unsigned i = 4; i < I.getNumExplicitOperands(); ++i)
+    Res.addUse(I.getOperand(i).getReg());
+  return Res.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
+                                               const SPIRVType *ResType,
+                                               MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::spv_load:
+    return selectLoad(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_store:
+    return selectStore(I);
+    break;
+  case Intrinsic::spv_extractv:
+    return selectExtractVal(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_insertv:
+    return selectInsertVal(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_extractelt:
+    return selectExtractElt(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_insertelt:
+    return selectInsertElt(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_gep:
+    return selectGEP(ResVReg, ResType, I);
+    break;
+  case Intrinsic::spv_unref_global:
+  case Intrinsic::spv_init_global: {
+    MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg());
+    MachineInstr *Init = I.getNumExplicitOperands() > 2
+                             ? MRI->getVRegDef(I.getOperand(2).getReg())
+                             : nullptr;
+    assert(MI);
+    return selectGlobalValue(MI->getOperand(0).getReg(), *MI, Init);
+  } break;
+  case Intrinsic::spv_const_composite: {
+    // If no values are attached, the composite is a null constant.
+    bool IsNull = I.getNumExplicitDefs() + 1 == I.getNumExplicitOperands();
+    unsigned Opcode =
+        IsNull ? SPIRV::OpConstantNull : SPIRV::OpConstantComposite;
SPIRV::OpConstantNull : SPIRV::OpConstantComposite;
+    auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode))
+                   .addDef(ResVReg)
+                   .addUse(GR.getSPIRVTypeID(ResType));
+    // Skip the type metadata node; it was already used when the assign.type
+    // intrinsic for this composite was generated.
+    if (!IsNull) {
+      for (unsigned i = I.getNumExplicitDefs() + 1;
+           i < I.getNumExplicitOperands(); ++i) {
+        MIB.addUse(I.getOperand(i).getReg());
+      }
+    }
+    return MIB.constrainAllUses(TII, TRI, RBI);
+  } break;
+  case Intrinsic::spv_assign_name: {
+    auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpName));
+    MIB.addUse(I.getOperand(I.getNumExplicitDefs() + 1).getReg());
+    for (unsigned i = I.getNumExplicitDefs() + 2;
+         i < I.getNumExplicitOperands(); ++i) {
+      MIB.addImm(I.getOperand(i).getImm());
+    }
+    return MIB.constrainAllUses(TII, TRI, RBI);
+  } break;
+  case Intrinsic::spv_switch: {
+    auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSwitch));
+    for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) {
+      if (I.getOperand(i).isReg())
+        MIB.addReg(I.getOperand(i).getReg());
+      else if (I.getOperand(i).isCImm())
+        addNumImm(I.getOperand(i).getCImm()->getValue(), MIB);
+      else if (I.getOperand(i).isMBB())
+        MIB.addMBB(I.getOperand(i).getMBB());
+      else
+        llvm_unreachable("Unexpected OpSwitch operand");
+    }
+    return MIB.constrainAllUses(TII, TRI, RBI);
+  } break;
+  default:
+    llvm_unreachable("Intrinsic selection not implemented");
+  }
+  return true;
+}
+
+bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg,
+                                                const SPIRVType *ResType,
+                                                MachineInstr &I) const {
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectBranch(MachineInstr &I) const {
+  // InstructionSelector walks backwards through the instructions. We can use
+  // both a G_BR and a G_BRCOND to create an OpBranchConditional. We hit G_BR
+  // first, so we can generate an OpBranchConditional here. If there is no
+  // G_BRCOND, we just use OpBranch for a regular unconditional branch.
+  const MachineInstr *PrevI = I.getPrevNode();
+  MachineBasicBlock &MBB = *I.getParent();
+  if (PrevI != nullptr && PrevI->getOpcode() == TargetOpcode::G_BRCOND) {
+    return BuildMI(MBB, I, I.getDebugLoc(), TII.get(SPIRV::OpBranchConditional))
+        .addUse(PrevI->getOperand(0).getReg())
+        .addMBB(PrevI->getOperand(1).getMBB())
+        .addMBB(I.getOperand(0).getMBB())
+        .constrainAllUses(TII, TRI, RBI);
+  }
+  return BuildMI(MBB, I, I.getDebugLoc(), TII.get(SPIRV::OpBranch))
+      .addMBB(I.getOperand(0).getMBB())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectBranchCond(MachineInstr &I) const {
+  // InstructionSelector walks backwards through the instructions. For an
+  // explicit conditional branch with no fallthrough, we use both a G_BR and a
+  // G_BRCOND to create an OpBranchConditional. We should hit G_BR first, and
+  // generate the OpBranchConditional in selectBranch above.
+  //
+  // If an OpBranchConditional has been generated, we simply return, as the
+  // work is already done. If there is no OpBranchConditional, LLVM must be
+  // relying on implicit fallthrough to the next basic block, so we need to
+  // create an OpBranchConditional with an explicit "false" argument pointing
+  // to the next basic block that LLVM would fall through to.
+  const MachineInstr *NextI = I.getNextNode();
+  // Check if this has already been successfully selected.
+  if (NextI != nullptr && NextI->getOpcode() == SPIRV::OpBranchConditional)
+    return true;
+  // Must be relying on implicit block fallthrough, so generate an
+  // OpBranchConditional with the "next" basic block as the "false" target.
+  MachineBasicBlock &MBB = *I.getParent();
+  unsigned NextMBBNum = MBB.getNextNode()->getNumber();
+  MachineBasicBlock *NextMBB = I.getMF()->getBlockNumbered(NextMBBNum);
+  return BuildMI(MBB, I, I.getDebugLoc(), TII.get(SPIRV::OpBranchConditional))
+      .addUse(I.getOperand(0).getReg())
+      .addMBB(I.getOperand(1).getMBB())
+      .addMBB(NextMBB)
+      .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectPhi(Register ResVReg,
+                                         const SPIRVType *ResType,
+                                         MachineInstr &I) const {
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpPhi))
+                 .addDef(ResVReg)
+                 .addUse(GR.getSPIRVTypeID(ResType));
+  const unsigned NumOps = I.getNumOperands();
+  for (unsigned i = 1; i < NumOps; i += 2) {
+    MIB.addUse(I.getOperand(i + 0).getReg());
+    MIB.addMBB(I.getOperand(i + 1).getMBB());
+  }
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectGlobalValue(
+    Register ResVReg, MachineInstr &I, const MachineInstr *Init) const {
+  // FIXME: don't use MachineIRBuilder here, replace it with BuildMI.
+  MachineIRBuilder MIRBuilder(I);
+  const GlobalValue *GV = I.getOperand(1).getGlobal();
+  SPIRVType *ResType = GR.getOrCreateSPIRVType(
+      GV->getType(), MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false);
+
+  std::string GlobalIdent = GV->getGlobalIdentifier();
+  // TODO: support @llvm.global.annotations.
+  auto GlobalVar = cast<GlobalVariable>(GV);
+
+  bool HasInit = GlobalVar->hasInitializer() &&
+                 !isa<UndefValue>(GlobalVar->getInitializer());
+  // Skip the empty declaration for GVs with initializers until we get the
+  // declaration with the initializer passed in.
+  if (HasInit && !Init)
+    return true;
+
+  unsigned AddrSpace = GV->getAddressSpace();
+  SPIRV::StorageClass Storage = addressSpaceToStorageClass(AddrSpace);
+  bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage &&
+                  Storage != SPIRV::StorageClass::Function;
+  SPIRV::LinkageType LnkType =
+      (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+          ? SPIRV::LinkageType::Import
+          : SPIRV::LinkageType::Export;
+
+  Register Reg = GR.buildGlobalVariable(ResVReg, ResType, GlobalIdent, GV,
+                                        Storage, Init, GlobalVar->isConstant(),
+                                        HasLnkTy, LnkType, MIRBuilder, true);
+  return Reg.isValid();
+}
+
+namespace llvm {
+InstructionSelector *
+createSPIRVInstructionSelector(const SPIRVTargetMachine &TM,
+                               const SPIRVSubtarget &Subtarget,
+                               const RegisterBankInfo &RBI) {
+  return new SPIRVInstructionSelector(TM, Subtarget, RBI);
+}
+} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
new file mode 100644
index 000000000000..87f9e9545dd3
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -0,0 +1,301 @@
+//===- SPIRVLegalizerInfo.cpp --- SPIR-V Legalization Rules ------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the targeting of the MachineLegalizer class for SPIR-V.
+// +//===----------------------------------------------------------------------===// + +#include "SPIRVLegalizerInfo.h" +#include "SPIRV.h" +#include "SPIRVGlobalRegistry.h" +#include "SPIRVSubtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +using namespace llvm; +using namespace llvm::LegalizeActions; +using namespace llvm::LegalityPredicates; + +static const std::set TypeFoldingSupportingOpcs = { + TargetOpcode::G_ADD, + TargetOpcode::G_FADD, + TargetOpcode::G_SUB, + TargetOpcode::G_FSUB, + TargetOpcode::G_MUL, + TargetOpcode::G_FMUL, + TargetOpcode::G_SDIV, + TargetOpcode::G_UDIV, + TargetOpcode::G_FDIV, + TargetOpcode::G_SREM, + TargetOpcode::G_UREM, + TargetOpcode::G_FREM, + TargetOpcode::G_FNEG, + TargetOpcode::G_CONSTANT, + TargetOpcode::G_FCONSTANT, + TargetOpcode::G_AND, + TargetOpcode::G_OR, + TargetOpcode::G_XOR, + TargetOpcode::G_SHL, + TargetOpcode::G_ASHR, + TargetOpcode::G_LSHR, + TargetOpcode::G_SELECT, + TargetOpcode::G_EXTRACT_VECTOR_ELT, +}; + +bool isTypeFoldingSupported(unsigned Opcode) { + return TypeFoldingSupportingOpcs.count(Opcode) > 0; +} + +SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { + using namespace TargetOpcode; + + this->ST = &ST; + GR = ST.getSPIRVGlobalRegistry(); + + const LLT s1 = LLT::scalar(1); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + + const LLT v16s64 = LLT::fixed_vector(16, 64); + const LLT v16s32 = LLT::fixed_vector(16, 32); + const LLT v16s16 = LLT::fixed_vector(16, 16); + const LLT v16s8 = LLT::fixed_vector(16, 8); + const LLT v16s1 = LLT::fixed_vector(16, 1); + + const LLT v8s64 = LLT::fixed_vector(8, 64); + const LLT v8s32 = LLT::fixed_vector(8, 32); + const LLT v8s16 = LLT::fixed_vector(8, 16); + const LLT v8s8 = LLT::fixed_vector(8, 8); + const LLT v8s1 = LLT::fixed_vector(8, 1); + + const LLT v4s64 = LLT::fixed_vector(4, 64); + const LLT v4s32 = LLT::fixed_vector(4, 32); + const LLT v4s16 = LLT::fixed_vector(4, 16); + const LLT v4s8 = LLT::fixed_vector(4, 8); + const LLT v4s1 = LLT::fixed_vector(4, 1); + + const LLT v3s64 = LLT::fixed_vector(3, 64); + const LLT v3s32 = LLT::fixed_vector(3, 32); + const LLT v3s16 = LLT::fixed_vector(3, 16); + const LLT v3s8 = LLT::fixed_vector(3, 8); + const LLT v3s1 = LLT::fixed_vector(3, 1); + + const LLT v2s64 = LLT::fixed_vector(2, 64); + const LLT v2s32 = LLT::fixed_vector(2, 32); + const LLT v2s16 = LLT::fixed_vector(2, 16); + const LLT v2s8 = LLT::fixed_vector(2, 8); + const LLT v2s1 = LLT::fixed_vector(2, 1); + + const unsigned PSize = ST.getPointerSize(); + const LLT p0 = LLT::pointer(0, PSize); // Function + const LLT p1 = LLT::pointer(1, PSize); // CrossWorkgroup + const LLT p2 = LLT::pointer(2, PSize); // UniformConstant + const LLT p3 = LLT::pointer(3, PSize); // Workgroup + const LLT p4 = LLT::pointer(4, PSize); // Generic + const LLT p5 = LLT::pointer(5, PSize); // Input + + // TODO: remove copy-pasting here by using concatenation in some way. 
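+  // As a reminder of the GlobalISel LLT shorthands used above and in the
+  // lists below (a note, not new functionality): sN is an N-bit scalar,
+  // vMsN is a fixed vector of M N-bit elements, and pK is a pointer in
+  // address space K; e.g. v4s32 is LLT::fixed_vector(4, 32), printed as
+  // <4 x s32> in MIR.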
+ auto allPtrsScalarsAndVectors = { + p0, p1, p2, p3, p4, p5, s1, s8, s16, + s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, + v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, + v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + + auto allScalarsAndVectors = { + s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, + v3s1, v3s8, v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, + v8s1, v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + + auto allIntScalarsAndVectors = {s8, s16, s32, s64, v2s8, v2s16, + v2s32, v2s64, v3s8, v3s16, v3s32, v3s64, + v4s8, v4s16, v4s32, v4s64, v8s8, v8s16, + v8s32, v8s64, v16s8, v16s16, v16s32, v16s64}; + + auto allBoolScalarsAndVectors = {s1, v2s1, v3s1, v4s1, v8s1, v16s1}; + + auto allIntScalars = {s8, s16, s32, s64}; + + auto allFloatScalarsAndVectors = { + s16, s32, s64, v2s16, v2s32, v2s64, v3s16, v3s32, v3s64, + v4s16, v4s32, v4s64, v8s16, v8s32, v8s64, v16s16, v16s32, v16s64}; + + auto allFloatAndIntScalars = allIntScalars; + + auto allPtrs = {p0, p1, p2, p3, p4, p5}; + auto allWritablePtrs = {p0, p1, p3, p4}; + + for (auto Opc : TypeFoldingSupportingOpcs) + getActionDefinitionsBuilder(Opc).custom(); + + getActionDefinitionsBuilder(G_GLOBAL_VALUE).alwaysLegal(); + + // TODO: add proper rules for vectors legalization. + getActionDefinitionsBuilder({G_BUILD_VECTOR, G_SHUFFLE_VECTOR}).alwaysLegal(); + + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) + .legalIf(all(typeInSet(0, allWritablePtrs), typeInSet(1, allPtrs))); + + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .legalForCartesianProduct(allPtrs, allPtrs); + + getActionDefinitionsBuilder({G_LOAD, G_STORE}).legalIf(typeInSet(1, allPtrs)); + + getActionDefinitionsBuilder(G_BITREVERSE).legalFor(allFloatScalarsAndVectors); + + getActionDefinitionsBuilder(G_FMA).legalFor(allFloatScalarsAndVectors); + + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalForCartesianProduct(allIntScalarsAndVectors, + allFloatScalarsAndVectors); + + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalForCartesianProduct(allFloatScalarsAndVectors, + allScalarsAndVectors); + + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) + .legalFor(allIntScalarsAndVectors); + + getActionDefinitionsBuilder(G_CTPOP).legalForCartesianProduct( + allIntScalarsAndVectors, allIntScalarsAndVectors); + + getActionDefinitionsBuilder(G_PHI).legalFor(allPtrsScalarsAndVectors); + + getActionDefinitionsBuilder(G_BITCAST).legalIf(all( + typeInSet(0, allPtrsScalarsAndVectors), + typeInSet(1, allPtrsScalarsAndVectors), + LegalityPredicate(([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() == Query.Types[1].getSizeInBits(); + })))); + + getActionDefinitionsBuilder(G_IMPLICIT_DEF).alwaysLegal(); + + getActionDefinitionsBuilder(G_INTTOPTR) + .legalForCartesianProduct(allPtrs, allIntScalars); + getActionDefinitionsBuilder(G_PTRTOINT) + .legalForCartesianProduct(allIntScalars, allPtrs); + getActionDefinitionsBuilder(G_PTR_ADD).legalForCartesianProduct( + allPtrs, allIntScalars); + + // ST.canDirectlyComparePointers() for pointer args is supported in + // legalizeCustom(). 
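+  // Roughly, the rewrite done there (a sketch; the register names are
+  // hypothetical, and 32-bit pointers are assumed): a pointer compare that
+  // cannot be selected directly, e.g.
+  //   %c:_(s1) = G_ICMP intpred(ult), %a:_(p0), %b:_(p0)
+  // becomes
+  //   %ai:_(s32) = G_PTRTOINT %a:_(p0)
+  //   %bi:_(s32) = G_PTRTOINT %b:_(p0)
+  //   %c:_(s1) = G_ICMP intpred(ult), %ai:_(s32), %bi:_(s32)
+  // while eq/ne compares may stay on pointers if the target can compare them
+  // directly.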
+ getActionDefinitionsBuilder(G_ICMP).customIf( + all(typeInSet(0, allBoolScalarsAndVectors), + typeInSet(1, allPtrsScalarsAndVectors))); + + getActionDefinitionsBuilder(G_FCMP).legalIf( + all(typeInSet(0, allBoolScalarsAndVectors), + typeInSet(1, allFloatScalarsAndVectors))); + + getActionDefinitionsBuilder({G_ATOMICRMW_OR, G_ATOMICRMW_ADD, G_ATOMICRMW_AND, + G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, + G_ATOMICRMW_SUB, G_ATOMICRMW_XOR, + G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN}) + .legalForCartesianProduct(allIntScalars, allWritablePtrs); + + getActionDefinitionsBuilder(G_ATOMICRMW_XCHG) + .legalForCartesianProduct(allFloatAndIntScalars, allWritablePtrs); + + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS).lower(); + // TODO: add proper legalization rules. + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG).alwaysLegal(); + + getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_SMULO, G_UMULO}) + .alwaysLegal(); + + // Extensions. + getActionDefinitionsBuilder({G_TRUNC, G_ZEXT, G_SEXT, G_ANYEXT}) + .legalForCartesianProduct(allScalarsAndVectors); + + // FP conversions. + getActionDefinitionsBuilder({G_FPTRUNC, G_FPEXT}) + .legalForCartesianProduct(allFloatScalarsAndVectors); + + // Pointer-handling. + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + + // Control-flow. + getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); + + getActionDefinitionsBuilder({G_FPOW, + G_FEXP, + G_FEXP2, + G_FLOG, + G_FLOG2, + G_FABS, + G_FMINNUM, + G_FMAXNUM, + G_FCEIL, + G_FCOS, + G_FSIN, + G_FSQRT, + G_FFLOOR, + G_FRINT, + G_FNEARBYINT, + G_INTRINSIC_ROUND, + G_INTRINSIC_TRUNC, + G_FMINIMUM, + G_FMAXIMUM, + G_INTRINSIC_ROUNDEVEN}) + .legalFor(allFloatScalarsAndVectors); + + getActionDefinitionsBuilder(G_FCOPYSIGN) + .legalForCartesianProduct(allFloatScalarsAndVectors, + allFloatScalarsAndVectors); + + getActionDefinitionsBuilder(G_FPOWI).legalForCartesianProduct( + allFloatScalarsAndVectors, allIntScalarsAndVectors); + + getLegacyLegalizerInfo().computeTables(); + verify(*ST.getInstrInfo()); +} + +static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpirvType, + LegalizerHelper &Helper, + MachineRegisterInfo &MRI, + SPIRVGlobalRegistry *GR) { + Register ConvReg = MRI.createGenericVirtualRegister(ConvTy); + GR->assignSPIRVTypeToVReg(SpirvType, ConvReg, Helper.MIRBuilder.getMF()); + Helper.MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + .addDef(ConvReg) + .addUse(Reg); + return ConvReg; +} + +bool SPIRVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + auto Opc = MI.getOpcode(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + if (!isTypeFoldingSupported(Opc)) { + assert(Opc == TargetOpcode::G_ICMP); + assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg())); + auto &Op0 = MI.getOperand(2); + auto &Op1 = MI.getOperand(3); + Register Reg0 = Op0.getReg(); + Register Reg1 = Op1.getReg(); + CmpInst::Predicate Cond = + static_cast(MI.getOperand(1).getPredicate()); + if ((!ST->canDirectlyComparePointers() || + (Cond != CmpInst::ICMP_EQ && Cond != CmpInst::ICMP_NE)) && + MRI.getType(Reg0).isPointer() && MRI.getType(Reg1).isPointer()) { + LLT ConvT = LLT::scalar(ST->getPointerSize()); + Type *LLVMTy = IntegerType::get(MI.getMF()->getFunction().getContext(), + ST->getPointerSize()); + SPIRVType *SpirvTy = GR->getOrCreateSPIRVType(LLVMTy, Helper.MIRBuilder); + Op0.setReg(convertPtrToInt(Reg0, ConvT, SpirvTy, Helper, MRI, GR)); + Op1.setReg(convertPtrToInt(Reg1, ConvT, SpirvTy, Helper, MRI, GR)); + } + return true; + } + // TODO: implement legalization for 
other opcodes. + return true; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h new file mode 100644 index 000000000000..2541ff29edb0 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -0,0 +1,36 @@ +//===- SPIRVLegalizerInfo.h --- SPIR-V Legalization Rules --------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the targeting of the MachineLegalizer class for SPIR-V. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H + +#include "SPIRVGlobalRegistry.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +bool isTypeFoldingSupported(unsigned Opcode); + +namespace llvm { + +class LLVMContext; +class SPIRVSubtarget; + +// This class provides the information for legalizing SPIR-V instructions. +class SPIRVLegalizerInfo : public LegalizerInfo { + const SPIRVSubtarget *ST; + SPIRVGlobalRegistry *GR; + +public: + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + SPIRVLegalizerInfo(const SPIRVSubtarget &ST); +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp new file mode 100644 index 000000000000..8e4ab973bf07 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp @@ -0,0 +1,58 @@ +//=- SPIRVMCInstLower.cpp - Convert SPIR-V MachineInstr to MCInst -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower SPIR-V MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "SPIRVMCInstLower.h" +#include "SPIRV.h" +#include "SPIRVModuleAnalysis.h" +#include "SPIRVUtils.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; + +void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI, + SPIRV::ModuleAnalysisInfo *MAI) const { + OutMI.setOpcode(MI->getOpcode()); + const MachineFunction *MF = MI->getMF(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_GlobalAddress: { + Register FuncReg = MAI->getFuncReg(MO.getGlobal()->getGlobalIdentifier()); + assert(FuncReg.isValid() && "Cannot find function Id"); + MCOp = MCOperand::createReg(FuncReg); + break; + } + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createReg(MAI->getOrCreateMBBRegister(*MO.getMBB())); + break; + case MachineOperand::MO_Register: { + Register NewReg = MAI->getRegisterAlias(MF, MO.getReg()); + MCOp = MCOperand::createReg(NewReg.isValid() ? 
NewReg : MO.getReg()); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_FPImmediate: + MCOp = MCOperand::createDFPImm( + MO.getFPImm()->getValueAPF().convertToFloat()); + break; + } + + OutMI.addOperand(MCOp); + } +} diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.h b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.h new file mode 100644 index 000000000000..8392656ed067 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.h @@ -0,0 +1,29 @@ +//=- SPIRVMCInstLower.h -- Convert SPIR-V MachineInstr to MCInst --*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVMCINSTLOWER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVMCINSTLOWER_H + +#include "llvm/Support/Compiler.h" + +namespace llvm { +class MCInst; +class MachineInstr; +namespace SPIRV { +struct ModuleAnalysisInfo; +} // namespace SPIRV + +// This class is used to lower a MachineInstr into an MCInst. +class LLVM_LIBRARY_VISIBILITY SPIRVMCInstLower { +public: + void lower(const MachineInstr *MI, MCInst &OutMI, + SPIRV::ModuleAnalysisInfo *MAI) const; +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVMCINSTLOWER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp new file mode 100644 index 000000000000..fa78dd7942c6 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -0,0 +1,250 @@ +//===- SPIRVModuleAnalysis.cpp - analysis of global instrs & regs - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The analysis collects instructions that should be output at the module level +// and performs the global register numbering. +// +// The results of this analysis are used in AsmPrinter to rename registers +// globally and to output required instructions at the module level. +// +//===----------------------------------------------------------------------===// + +#include "SPIRVModuleAnalysis.h" +#include "SPIRV.h" +#include "SPIRVGlobalRegistry.h" +#include "SPIRVSubtarget.h" +#include "SPIRVTargetMachine.h" +#include "SPIRVUtils.h" +#include "TargetInfo/SPIRVTargetInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" + +using namespace llvm; + +#define DEBUG_TYPE "spirv-module-analysis" + +char llvm::SPIRVModuleAnalysis::ID = 0; + +namespace llvm { +void initializeSPIRVModuleAnalysisPass(PassRegistry &); +} // namespace llvm + +INITIALIZE_PASS(SPIRVModuleAnalysis, DEBUG_TYPE, "SPIRV module analysis", true, + true) + +// Retrieve an unsigned from an MDNode with a list of them as operands. 
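+// For example, for an MDNode holding !{i32 2, i32 0}, OpIndex 0 yields 2 and
+// OpIndex 1 yields 0, while an out-of-range OpIndex returns DefaultVal.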
+static unsigned getMetadataUInt(MDNode *MdNode, unsigned OpIndex,
+                                unsigned DefaultVal = 0) {
+  if (MdNode && OpIndex < MdNode->getNumOperands()) {
+    const auto &Op = MdNode->getOperand(OpIndex);
+    return mdconst::extract<ConstantInt>(Op)->getZExtValue();
+  }
+  return DefaultVal;
+}
+
+void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
+  MAI.MaxID = 0;
+  for (int i = 0; i < SPIRV::NUM_MODULE_SECTIONS; i++)
+    MAI.MS[i].clear();
+  MAI.RegisterAliasTable.clear();
+  MAI.InstrsToDelete.clear();
+  MAI.FuncNameMap.clear();
+  MAI.GlobalVarList.clear();
+
+  // TODO: determine memory model and source language from the configuration.
+  MAI.Mem = SPIRV::MemoryModel::OpenCL;
+  MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C;
+  unsigned PtrSize = ST->getPointerSize();
+  MAI.Addr = PtrSize == 32   ? SPIRV::AddressingModel::Physical32
+             : PtrSize == 64 ? SPIRV::AddressingModel::Physical64
+                             : SPIRV::AddressingModel::Logical;
+  // Get the OpenCL version number from metadata.
+  // TODO: support other source languages.
+  MAI.SrcLangVersion = 0;
+  if (auto VerNode = M.getNamedMetadata("opencl.ocl.version")) {
+    // Construct version literal according to OpenCL 2.2 environment spec.
+    auto VersionMD = VerNode->getOperand(0);
+    unsigned MajorNum = getMetadataUInt(VersionMD, 0, 2);
+    unsigned MinorNum = getMetadataUInt(VersionMD, 1);
+    unsigned RevNum = getMetadataUInt(VersionMD, 2);
+    MAI.SrcLangVersion = 0 | (MajorNum << 16) | (MinorNum << 8) | RevNum;
+  }
+}
+
+// True if there is an instruction in the MS list with all the same operands as
+// the given instruction has (after the given starting index).
+// TODO: maybe it needs to check Opcodes too.
+static bool findSameInstrInMS(const MachineInstr &A,
+                              SPIRV::ModuleSectionType MSType,
+                              SPIRV::ModuleAnalysisInfo &MAI,
+                              bool UpdateRegAliases,
+                              unsigned StartOpIndex = 0) {
+  for (const auto *B : MAI.MS[MSType]) {
+    const unsigned NumAOps = A.getNumOperands();
+    if (NumAOps == B->getNumOperands() && A.getNumDefs() == B->getNumDefs()) {
+      bool AllOpsMatch = true;
+      for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
+        if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
+          Register RegA = A.getOperand(i).getReg();
+          Register RegB = B->getOperand(i).getReg();
+          AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
+                        MAI.getRegisterAlias(B->getMF(), RegB);
+        } else {
+          AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
+        }
+      }
+      if (AllOpsMatch) {
+        if (UpdateRegAliases) {
+          assert(A.getOperand(0).isReg() && B->getOperand(0).isReg());
+          Register LocalReg = A.getOperand(0).getReg();
+          Register GlobalReg =
+              MAI.getRegisterAlias(B->getMF(), B->getOperand(0).getReg());
+          MAI.setRegisterAlias(A.getMF(), LocalReg, GlobalReg);
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Look for IDs declared with Import linkage, and map the imported name string
+// to the register defining that variable (which will usually be the result of
+// an OpFunction). This lets us call externally imported functions using
+// the correct ID registers.
+void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
+                                           const Function &F) {
+  if (MI.getOpcode() == SPIRV::OpDecorate) {
+    // If it's got Import linkage.
+    auto Dec = MI.getOperand(1).getImm();
+    if (Dec == static_cast<unsigned>(SPIRV::Decoration::LinkageAttributes)) {
+      auto Lnk = MI.getOperand(MI.getNumOperands() - 1).getImm();
+      if (Lnk == static_cast<unsigned>(SPIRV::LinkageType::Import)) {
+        // Map imported function name to function ID register.
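+        // E.g. (a sketch with hypothetical ids) for
+        //   OpDecorate %f LinkageAttributes "foo" Import
+        // this records FuncNameMap["foo"] = <global alias of %f>, so that
+        // calls to the external "foo" can be emitted against that id.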
+ std::string Name = getStringImm(MI, 2); + Register Target = MI.getOperand(0).getReg(); + // TODO: check defs from different MFs. + MAI.FuncNameMap[Name] = MAI.getRegisterAlias(MI.getMF(), Target); + } + } + } else if (MI.getOpcode() == SPIRV::OpFunction) { + // Record all internal OpFunction declarations. + Register Reg = MI.defs().begin()->getReg(); + Register GlobalReg = MAI.getRegisterAlias(MI.getMF(), Reg); + assert(GlobalReg.isValid()); + // TODO: check that it does not conflict with existing entries. + MAI.FuncNameMap[F.getGlobalIdentifier()] = GlobalReg; + } +} + +// Collect the given instruction in the specified MS. We assume global register +// numbering has already occurred by this point. We can directly compare reg +// arguments when detecting duplicates. +static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, + SPIRV::ModuleSectionType MSType, + bool IsConstOrType = false) { + MAI.setSkipEmission(&MI); + if (findSameInstrInMS(MI, MSType, MAI, IsConstOrType, IsConstOrType ? 1 : 0)) + return; // Found a duplicate, so don't add it. + // No duplicates, so add it. + MAI.MS[MSType].push_back(&MI); +} + +// Some global instructions make reference to function-local ID regs, so cannot +// be correctly collected until these registers are globally numbered. +void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { + for (auto F = M.begin(), E = M.end(); F != E; ++F) { + if ((*F).isDeclaration()) + continue; + MachineFunction *MF = MMI->getMachineFunction(*F); + assert(MF); + unsigned FCounter = 0; + for (MachineBasicBlock &MBB : *MF) + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == SPIRV::OpFunction) + FCounter++; + if (MAI.getSkipEmission(&MI)) + continue; + const unsigned OpCode = MI.getOpcode(); + const bool IsFuncOrParm = + OpCode == SPIRV::OpFunction || OpCode == SPIRV::OpFunctionParameter; + const bool IsConstOrType = + TII->isConstantInstr(MI) || TII->isTypeDeclInstr(MI); + if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) { + collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames); + } else if (OpCode == SPIRV::OpEntryPoint) { + collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints); + } else if (TII->isDecorationInstr(MI)) { + collectOtherInstr(MI, MAI, SPIRV::MB_Annotations); + collectFuncNames(MI, *F); + } else if (IsConstOrType || (FCounter > 1 && IsFuncOrParm)) { + // Now OpSpecConstant*s are not in DT, + // but they need to be collected anyway. + enum SPIRV::ModuleSectionType Type = + IsFuncOrParm ? SPIRV::MB_ExtFuncDecls : SPIRV::MB_TypeConstVars; + collectOtherInstr(MI, MAI, Type, IsConstOrType); + } else if (OpCode == SPIRV::OpFunction) { + collectFuncNames(MI, *F); + } + } + } +} + +// Number registers in all functions globally from 0 onwards and store +// the result in global register alias table. 
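+// E.g. (hypothetical numbering) %1 in the first function may be aliased to
+// global %0, while %1 in the next function gets the next free id, so every
+// value ends up with a module-unique SPIR-V id at emission time.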
+void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
+  for (auto F = M.begin(), E = M.end(); F != E; ++F) {
+    if ((*F).isDeclaration())
+      continue;
+    MachineFunction *MF = MMI->getMachineFunction(*F);
+    assert(MF);
+    for (MachineBasicBlock &MBB : *MF) {
+      for (MachineInstr &MI : MBB) {
+        for (MachineOperand &Op : MI.operands()) {
+          if (!Op.isReg())
+            continue;
+          Register Reg = Op.getReg();
+          if (MAI.hasRegisterAlias(MF, Reg))
+            continue;
+          Register NewReg = Register::index2VirtReg(MAI.getNextID());
+          MAI.setRegisterAlias(MF, Reg, NewReg);
+        }
+      }
+    }
+  }
+}
+
+struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI;
+
+void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetPassConfig>();
+  AU.addRequired<MachineModuleInfoWrapperPass>();
+}
+
+bool SPIRVModuleAnalysis::runOnModule(Module &M) {
+  SPIRVTargetMachine &TM =
+      getAnalysis<TargetPassConfig>().getTM<SPIRVTargetMachine>();
+  ST = TM.getSubtargetImpl();
+  GR = ST->getSPIRVGlobalRegistry();
+  TII = ST->getInstrInfo();
+
+  MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+  setBaseInfo(M);
+
+  // TODO: Process type/const/global var/func decl instructions, number their
+  // destination registers from 0 to N, collect Extensions and Capabilities.
+
+  // Number the rest of the registers from N+1 onwards.
+  numberRegistersGlobally(M);
+
+  // Collect OpName, OpEntryPoint, OpDecorate etc., process other instructions.
+  processOtherInstrs(M);
+
+  return false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
new file mode 100644
index 000000000000..1bef13d458c1
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -0,0 +1,137 @@
+//===- SPIRVModuleAnalysis.h - analysis of global instrs & regs -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The analysis collects instructions that should be output at the module level
+// and performs the global register numbering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H
+
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRVSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+class MachineFunction;
+class MachineModuleInfo;
+
+namespace SPIRV {
+// The enum contains logical module sections for the instruction collection.
+enum ModuleSectionType {
+  // MB_Capabilities, MB_Extensions, MB_ExtInstImports, MB_MemoryModel,
+  MB_EntryPoints, // All OpEntryPoint instructions (if any).
+  // MB_ExecutionModes, MB_DebugSourceAndStrings,
+  MB_DebugNames,           // All OpName and OpMemberName instrs.
+  MB_DebugModuleProcessed, // All OpModuleProcessed instructions.
+  MB_Annotations,          // OpDecorate, OpMemberDecorate etc.
+  MB_TypeConstVars,        // OpTypeXXX, OpConstantXXX, and global OpVariables.
+  MB_ExtFuncDecls,         // OpFunction etc. to declare for external funcs.
+  NUM_MODULE_SECTIONS      // Total number of sections requiring basic blocks.
+};
+
+using InstrList = SmallVector<MachineInstr *>;
+// Maps a local register to the corresponding global alias.
+using LocalToGlobalRegTable = std::map<Register, Register>;
+using RegisterAliasMapTy =
+    std::map<const MachineFunction *, LocalToGlobalRegTable>;
+
+// The struct contains results of the module analysis and methods
+// to access them.
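+// AsmPrinter is the main consumer of this struct: it uses getSkipEmission()
+// to drop instructions already collected at the module level,
+// getRegisterAlias() to rename registers, and the MS lists to emit the
+// module-level sections in order.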
+struct ModuleAnalysisInfo {
+  SPIRV::MemoryModel Mem;
+  SPIRV::AddressingModel Addr;
+  SPIRV::SourceLanguage SrcLang;
+  unsigned SrcLangVersion;
+  // Contains the list of all global OpVariables in the module.
+  SmallVector<MachineInstr *> GlobalVarList;
+  // Maps function names to corresponding function ID registers.
+  StringMap<Register> FuncNameMap;
+  // The set contains machine instructions which are necessary
+  // for correct MIR but will not be emitted in function bodies.
+  DenseSet<MachineInstr *> InstrsToDelete;
+  // The table contains global aliases of local registers for each machine
+  // function. The aliases are used to substitute local registers during
+  // code emission.
+  RegisterAliasMapTy RegisterAliasTable;
+  // The counter holds the maximum ID we have in the module.
+  unsigned MaxID;
+  // The array contains lists of MIs for each module section.
+  InstrList MS[NUM_MODULE_SECTIONS];
+  // The table maps MBB number to SPIR-V unique ID register.
+  DenseMap<int, Register> BBNumToRegMap;
+
+  Register getFuncReg(std::string FuncName) {
+    auto FuncReg = FuncNameMap.find(FuncName);
+    assert(FuncReg != FuncNameMap.end() && "Cannot find function Id");
+    return FuncReg->second;
+  }
+  InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
+  void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); }
+  bool getSkipEmission(const MachineInstr *MI) {
+    return InstrsToDelete.contains(MI);
+  }
+  void setRegisterAlias(const MachineFunction *MF, Register Reg,
+                        Register AliasReg) {
+    RegisterAliasTable[MF][Reg] = AliasReg;
+  }
+  Register getRegisterAlias(const MachineFunction *MF, Register Reg) {
+    auto RI = RegisterAliasTable[MF].find(Reg);
+    if (RI == RegisterAliasTable[MF].end()) {
+      return Register(0);
+    }
+    return RegisterAliasTable[MF][Reg];
+  }
+  bool hasRegisterAlias(const MachineFunction *MF, Register Reg) {
+    return RegisterAliasTable.find(MF) != RegisterAliasTable.end() &&
+           RegisterAliasTable[MF].find(Reg) != RegisterAliasTable[MF].end();
+  }
+  unsigned getNextID() { return MaxID++; }
+  bool hasMBBRegister(const MachineBasicBlock &MBB) {
+    return BBNumToRegMap.find(MBB.getNumber()) != BBNumToRegMap.end();
+  }
+  // Convert an MBB number to the corresponding ID register.
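+  // E.g. a branch in one function and the label of its target block both ask
+  // for the same MBB here and therefore receive the same ID register.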
+  Register getOrCreateMBBRegister(const MachineBasicBlock &MBB) {
+    auto f = BBNumToRegMap.find(MBB.getNumber());
+    if (f != BBNumToRegMap.end())
+      return f->second;
+    Register NewReg = Register::index2VirtReg(getNextID());
+    BBNumToRegMap[MBB.getNumber()] = NewReg;
+    return NewReg;
+  }
+};
+} // namespace SPIRV
+
+struct SPIRVModuleAnalysis : public ModulePass {
+  static char ID;
+
+public:
+  SPIRVModuleAnalysis() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  static struct SPIRV::ModuleAnalysisInfo MAI;
+
+private:
+  void setBaseInfo(const Module &M);
+  template <typename T> void collectTypesConstsVars();
+  void processDefInstrs(const Module &M);
+  void collectFuncNames(MachineInstr &MI, const Function &F);
+  void processOtherInstrs(const Module &M);
+  void numberRegistersGlobally(const Module &M);
+
+  const SPIRVSubtarget *ST;
+  SPIRVGlobalRegistry *GR;
+  const SPIRVInstrInfo *TII;
+  MachineModuleInfo *MMI;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
new file mode 100644
index 000000000000..687f84046650
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -0,0 +1,440 @@
+//===-- SPIRVPreLegalizer.cpp - prepare IR for legalization -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pass prepares IR for legalization: it assigns SPIR-V types to registers
+// and removes the intrinsics which held these types during IR translation.
+// It also processes constants and registers them in GR to avoid duplication.
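+//
+// For instance (an informal sketch), a
+//   %v = G_FADD %a, %b
+// tagged with an @llvm.spv.assign.type intrinsic ends up as
+//   %tmp = G_FADD %a, %b
+//   %v = ASSIGN_TYPE %tmp, %float_ty
+// so later passes can look up the SPIR-V type of %v.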
+// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "SPIRVGlobalRegistry.h" +#include "SPIRVSubtarget.h" +#include "SPIRVUtils.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +#define DEBUG_TYPE "spirv-prelegalizer" + +using namespace llvm; + +namespace { +class SPIRVPreLegalizer : public MachineFunctionPass { +public: + static char ID; + SPIRVPreLegalizer() : MachineFunctionPass(ID) { + initializeSPIRVPreLegalizerPass(*PassRegistry::getPassRegistry()); + } + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // namespace + +static bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) { + if (MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS && + MI.getIntrinsicID() == IntrinsicID) + return true; + return false; +} + +static void foldConstantsIntoIntrinsics(MachineFunction &MF) { + SmallVector ToErase; + MachineRegisterInfo &MRI = MF.getRegInfo(); + const unsigned AssignNameOperandShift = 2; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isSpvIntrinsic(MI, Intrinsic::spv_assign_name)) + continue; + unsigned NumOp = MI.getNumExplicitDefs() + AssignNameOperandShift; + while (MI.getOperand(NumOp).isReg()) { + MachineOperand &MOp = MI.getOperand(NumOp); + MachineInstr *ConstMI = MRI.getVRegDef(MOp.getReg()); + assert(ConstMI->getOpcode() == TargetOpcode::G_CONSTANT); + MI.removeOperand(NumOp); + MI.addOperand(MachineOperand::CreateImm( + ConstMI->getOperand(1).getCImm()->getZExtValue())); + if (MRI.use_empty(ConstMI->getOperand(0).getReg())) + ToErase.push_back(ConstMI); + } + } + } + for (MachineInstr *MI : ToErase) + MI->eraseFromParent(); +} + +static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { + SmallVector ToErase; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) + continue; + assert(MI.getOperand(2).isReg()); + MIB.setInsertPt(*MI.getParent(), MI); + MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); + ToErase.push_back(&MI); + } + } + for (MachineInstr *MI : ToErase) + MI->eraseFromParent(); +} + +// Translating GV, IRTranslator sometimes generates following IR: +// %1 = G_GLOBAL_VALUE +// %2 = COPY %1 +// %3 = G_ADDRSPACE_CAST %2 +// New registers have no SPIRVType and no register class info. +// +// Set SPIRVType for GV, propagate it from GV to other instructions, +// also set register classes. 
+static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIB) { + SPIRVType *SpirvTy = nullptr; + assert(MI && "Machine instr is expected"); + if (MI->getOperand(0).isReg()) { + Register Reg = MI->getOperand(0).getReg(); + SpirvTy = GR->getSPIRVTypeForVReg(Reg); + if (!SpirvTy) { + switch (MI->getOpcode()) { + case TargetOpcode::G_CONSTANT: { + MIB.setInsertPt(*MI->getParent(), MI); + Type *Ty = MI->getOperand(1).getCImm()->getType(); + SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB); + break; + } + case TargetOpcode::G_GLOBAL_VALUE: { + MIB.setInsertPt(*MI->getParent(), MI); + Type *Ty = MI->getOperand(1).getGlobal()->getType(); + SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB); + break; + } + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_ADDRSPACE_CAST: + case TargetOpcode::COPY: { + MachineOperand &Op = MI->getOperand(1); + MachineInstr *Def = Op.isReg() ? MRI.getVRegDef(Op.getReg()) : nullptr; + if (Def) + SpirvTy = propagateSPIRVType(Def, GR, MRI, MIB); + break; + } + default: + break; + } + if (SpirvTy) + GR->assignSPIRVTypeToVReg(SpirvTy, Reg, MIB.getMF()); + if (!MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(Reg, &SPIRV::IDRegClass); + } + } + return SpirvTy; +} + +// Insert ASSIGN_TYPE instuction between Reg and its definition, set NewReg as +// a dst of the definition, assign SPIRVType to both registers. If SpirvTy is +// provided, use it as SPIRVType in ASSIGN_TYPE, otherwise create it from Ty. +// TODO: maybe move to SPIRVUtils. +static Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB, + MachineRegisterInfo &MRI) { + MachineInstr *Def = MRI.getVRegDef(Reg); + assert((Ty || SpirvTy) && "Either LLVM or SPIRV type is expected."); + MIB.setInsertPt(*Def->getParent(), + (Def->getNextNode() ? Def->getNextNode()->getIterator() + : Def->getParent()->end())); + Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg)); + if (auto *RC = MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(NewReg, RC); + SpirvTy = SpirvTy ? SpirvTy : GR->getOrCreateSPIRVType(Ty, MIB); + GR->assignSPIRVTypeToVReg(SpirvTy, Reg, MIB.getMF()); + // This is to make it convenient for Legalizer to get the SPIRVType + // when processing the actual MI (i.e. not pseudo one). + GR->assignSPIRVTypeToVReg(SpirvTy, NewReg, MIB.getMF()); + MIB.buildInstr(SPIRV::ASSIGN_TYPE) + .addDef(Reg) + .addUse(NewReg) + .addUse(GR->getSPIRVTypeID(SpirvTy)); + Def->getOperand(0).setReg(NewReg); + MRI.setRegClass(Reg, &SPIRV::ANYIDRegClass); + return NewReg; +} + +static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector ToErase; + + for (MachineBasicBlock *MBB : post_order(&MF)) { + if (MBB->empty()) + continue; + + bool ReachedBegin = false; + for (auto MII = std::prev(MBB->end()), Begin = MBB->begin(); + !ReachedBegin;) { + MachineInstr &MI = *MII; + + if (isSpvIntrinsic(MI, Intrinsic::spv_assign_type)) { + Register Reg = MI.getOperand(1).getReg(); + Type *Ty = getMDOperandAsType(MI.getOperand(2).getMetadata(), 0); + MachineInstr *Def = MRI.getVRegDef(Reg); + assert(Def && "Expecting an instruction that defines the register"); + // G_GLOBAL_VALUE already has type info. 
+ if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) + insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo()); + ToErase.push_back(&MI); + } else if (MI.getOpcode() == TargetOpcode::G_CONSTANT || + MI.getOpcode() == TargetOpcode::G_FCONSTANT || + MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR) { + // %rc = G_CONSTANT ty Val + // ===> + // %cty = OpType* ty + // %rctmp = G_CONSTANT ty Val + // %rc = ASSIGN_TYPE %rctmp, %cty + Register Reg = MI.getOperand(0).getReg(); + if (MRI.hasOneUse(Reg)) { + MachineInstr &UseMI = *MRI.use_instr_begin(Reg); + if (isSpvIntrinsic(UseMI, Intrinsic::spv_assign_type) || + isSpvIntrinsic(UseMI, Intrinsic::spv_assign_name)) + continue; + } + Type *Ty = nullptr; + if (MI.getOpcode() == TargetOpcode::G_CONSTANT) + Ty = MI.getOperand(1).getCImm()->getType(); + else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) + Ty = MI.getOperand(1).getFPImm()->getType(); + else { + assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + Type *ElemTy = nullptr; + MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg()); + assert(ElemMI); + + if (ElemMI->getOpcode() == TargetOpcode::G_CONSTANT) + ElemTy = ElemMI->getOperand(1).getCImm()->getType(); + else if (ElemMI->getOpcode() == TargetOpcode::G_FCONSTANT) + ElemTy = ElemMI->getOperand(1).getFPImm()->getType(); + else + llvm_unreachable("Unexpected opcode"); + unsigned NumElts = + MI.getNumExplicitOperands() - MI.getNumExplicitDefs(); + Ty = VectorType::get(ElemTy, NumElts, false); + } + insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); + } else if (MI.getOpcode() == TargetOpcode::G_TRUNC || + MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE || + MI.getOpcode() == TargetOpcode::COPY || + MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) { + propagateSPIRVType(&MI, GR, MRI, MIB); + } + + if (MII == Begin) + ReachedBegin = true; + else + --MII; + } + } + for (MachineInstr *MI : ToErase) + MI->eraseFromParent(); +} + +static std::pair +createNewIdReg(Register ValReg, unsigned Opcode, MachineRegisterInfo &MRI, + const SPIRVGlobalRegistry &GR) { + LLT NewT = LLT::scalar(32); + SPIRVType *SpvType = GR.getSPIRVTypeForVReg(ValReg); + assert(SpvType && "VReg is expected to have SPIRV type"); + bool IsFloat = SpvType->getOpcode() == SPIRV::OpTypeFloat; + bool IsVectorFloat = + SpvType->getOpcode() == SPIRV::OpTypeVector && + GR.getSPIRVTypeForVReg(SpvType->getOperand(1).getReg())->getOpcode() == + SPIRV::OpTypeFloat; + IsFloat |= IsVectorFloat; + auto GetIdOp = IsFloat ? SPIRV::GET_fID : SPIRV::GET_ID; + auto DstClass = IsFloat ? &SPIRV::fIDRegClass : &SPIRV::IDRegClass; + if (MRI.getType(ValReg).isPointer()) { + NewT = LLT::pointer(0, 32); + GetIdOp = SPIRV::GET_pID; + DstClass = &SPIRV::pIDRegClass; + } else if (MRI.getType(ValReg).isVector()) { + NewT = LLT::fixed_vector(2, NewT); + GetIdOp = IsFloat ? SPIRV::GET_vfID : SPIRV::GET_vID; + DstClass = IsFloat ? 
&SPIRV::vfIDRegClass : &SPIRV::vIDRegClass;
+  }
+  Register IdReg = MRI.createGenericVirtualRegister(NewT);
+  MRI.setRegClass(IdReg, DstClass);
+  return {IdReg, GetIdOp};
+}
+
+static void processInstr(MachineInstr &MI, MachineIRBuilder &MIB,
+                         MachineRegisterInfo &MRI, SPIRVGlobalRegistry *GR) {
+  unsigned Opc = MI.getOpcode();
+  assert(MI.getNumDefs() > 0 && MRI.hasOneUse(MI.getOperand(0).getReg()));
+  MachineInstr &AssignTypeInst =
+      *(MRI.use_instr_begin(MI.getOperand(0).getReg()));
+  auto NewReg = createNewIdReg(MI.getOperand(0).getReg(), Opc, MRI, *GR).first;
+  AssignTypeInst.getOperand(1).setReg(NewReg);
+  MI.getOperand(0).setReg(NewReg);
+  MIB.setInsertPt(*MI.getParent(),
+                  (MI.getNextNode() ? MI.getNextNode()->getIterator()
+                                    : MI.getParent()->end()));
+  for (auto &Op : MI.operands()) {
+    if (!Op.isReg() || Op.isDef())
+      continue;
+    auto IdOpInfo = createNewIdReg(Op.getReg(), Opc, MRI, *GR);
+    MIB.buildInstr(IdOpInfo.second).addDef(IdOpInfo.first).addUse(Op.getReg());
+    Op.setReg(IdOpInfo.first);
+  }
+}
+
+// Defined in SPIRVLegalizerInfo.cpp.
+extern bool isTypeFoldingSupported(unsigned Opcode);
+
+static void processInstrsWithTypeFolding(MachineFunction &MF,
+                                         SPIRVGlobalRegistry *GR,
+                                         MachineIRBuilder MIB) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (isTypeFoldingSupported(MI.getOpcode()))
+        processInstr(MI, MIB, MRI, GR);
+    }
+  }
+}
+
+static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                            MachineIRBuilder MIB) {
+  DenseMap<Register, DenseMap<uint64_t, MachineBasicBlock *>>
+      SwitchRegToMBB;
+  DenseMap<Register, MachineBasicBlock *> DefaultMBBs;
+  DenseSet<Register> SwitchRegs;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Before the IRTranslator pass, spv_switch calls are inserted before each
+  // switch instruction. IRTranslator lowers switches to ICMP+CBr+Br triples.
+  // A switch with two cases may be translated to this MIR sequence:
+  //   intrinsic(@llvm.spv.switch), %CmpReg, %Const0, %Const1
+  //   %Dst0 = G_ICMP intpred(eq), %CmpReg, %Const0
+  //   G_BRCOND %Dst0, %bb.2
+  //   G_BR %bb.5
+  // bb.5.entry:
+  //   %Dst1 = G_ICMP intpred(eq), %CmpReg, %Const1
+  //   G_BRCOND %Dst1, %bb.3
+  //   G_BR %bb.4
+  // bb.2.sw.bb:
+  //   ...
+  // bb.3.sw.bb1:
+  //   ...
+  // bb.4.sw.epilog:
+  //   ...
+  // Walk MIs and collect information about destination MBBs to update the
+  // spv_switch call. We assume that all spv_switch calls precede their
+  // corresponding ICMPs.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (isSpvIntrinsic(MI, Intrinsic::spv_switch)) {
+        assert(MI.getOperand(1).isReg());
+        Register Reg = MI.getOperand(1).getReg();
+        SwitchRegs.insert(Reg);
+        // Set the first successor as default MBB to support empty switches.
+        DefaultMBBs[Reg] = *MBB.succ_begin();
+      }
+      // Process only ICMPs that relate to spv_switches.
+      if (MI.getOpcode() == TargetOpcode::G_ICMP && MI.getOperand(2).isReg() &&
+          SwitchRegs.contains(MI.getOperand(2).getReg())) {
+        assert(MI.getOperand(0).isReg() && MI.getOperand(1).isPredicate() &&
+               MI.getOperand(3).isReg());
+        Register Dst = MI.getOperand(0).getReg();
+        // Set type info for the destination register of the switch's ICMP
+        // instruction.
+        if (GR->getSPIRVTypeForVReg(Dst) == nullptr) {
+          MIB.setInsertPt(*MI.getParent(), MI);
+          Type *LLVMTy = IntegerType::get(MF.getFunction().getContext(), 1);
+          SPIRVType *SpirvTy = GR->getOrCreateSPIRVType(LLVMTy, MIB);
+          MRI.setRegClass(Dst, &SPIRV::IDRegClass);
+          GR->assignSPIRVTypeToVReg(SpirvTy, Dst, MIB.getMF());
+        }
+        Register CmpReg = MI.getOperand(2).getReg();
+        MachineOperand &PredOp = MI.getOperand(1);
+        const auto CC = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
+        assert(CC == CmpInst::ICMP_EQ && MRI.hasOneUse(Dst) &&
+               MRI.hasOneDef(CmpReg));
+        uint64_t Val = getIConstVal(MI.getOperand(3).getReg(), &MRI);
+        MachineInstr *CBr = MRI.use_begin(Dst)->getParent();
+        assert(CBr->getOpcode() == SPIRV::G_BRCOND &&
+               CBr->getOperand(1).isMBB());
+        SwitchRegToMBB[CmpReg][Val] = CBr->getOperand(1).getMBB();
+        // The next MI is always a BR to either the next case or the default.
+        MachineInstr *NextMI = CBr->getNextNode();
+        assert(NextMI->getOpcode() == SPIRV::G_BR &&
+               NextMI->getOperand(0).isMBB());
+        MachineBasicBlock *NextMBB = NextMI->getOperand(0).getMBB();
+        assert(NextMBB != nullptr);
+        // The default MBB does not start with a G_ICMP on the switch's
+        // compare register.
+        if (NextMBB->front().getOpcode() != SPIRV::G_ICMP ||
+            (NextMBB->front().getOperand(2).isReg() &&
+             NextMBB->front().getOperand(2).getReg() != CmpReg))
+          DefaultMBBs[CmpReg] = NextMBB;
+      }
+    }
+  }
+  // Modify spv_switch's operands with the collected values. For the example
+  // above, the result will be like this:
+  //   intrinsic(@llvm.spv.switch), %CmpReg, %bb.4, i32 0, %bb.2, i32 1, %bb.3
+  // Note that ICMP+CBr+Br sequences are not removed, but ModuleAnalysis marks
+  // them as skipped and AsmPrinter does not output them.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!isSpvIntrinsic(MI, Intrinsic::spv_switch))
+        continue;
+      assert(MI.getOperand(1).isReg());
+      Register Reg = MI.getOperand(1).getReg();
+      unsigned NumOp = MI.getNumExplicitOperands();
+      SmallVector<const ConstantInt *> Vals;
+      SmallVector<MachineBasicBlock *> MBBs;
+      for (unsigned i = 2; i < NumOp; i++) {
+        Register CReg = MI.getOperand(i).getReg();
+        uint64_t Val = getIConstVal(CReg, &MRI);
+        MachineInstr *ConstInstr = getDefInstrMaybeConstant(CReg, &MRI);
+        Vals.push_back(ConstInstr->getOperand(1).getCImm());
+        MBBs.push_back(SwitchRegToMBB[Reg][Val]);
+      }
+      for (unsigned i = MI.getNumExplicitOperands() - 1; i > 1; i--)
+        MI.removeOperand(i);
+      MI.addOperand(MachineOperand::CreateMBB(DefaultMBBs[Reg]));
+      for (unsigned i = 0; i < Vals.size(); i++) {
+        MI.addOperand(MachineOperand::CreateCImm(Vals[i]));
+        MI.addOperand(MachineOperand::CreateMBB(MBBs[i]));
+      }
+    }
+  }
+}
+
+bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
+  // Initialize the type registry.
+  const SPIRVSubtarget &ST = MF.getSubtarget<SPIRVSubtarget>();
+  SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
+  GR->setCurrentFunc(MF);
+  MachineIRBuilder MIB(MF);
+  foldConstantsIntoIntrinsics(MF);
+  insertBitcasts(MF, GR, MIB);
+  generateAssignInstrs(MF, GR, MIB);
+  processInstrsWithTypeFolding(MF, GR, MIB);
+  processSwitches(MF, GR, MIB);
+
+  return true;
+}
+
+INITIALIZE_PASS(SPIRVPreLegalizer, DEBUG_TYPE, "SPIRV pre legalizer", false,
+                false)
+
+char SPIRVPreLegalizer::ID = 0;
+
+FunctionPass *llvm::createSPIRVPreLegalizerPass() {
+  return new SPIRVPreLegalizer();
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
new file mode 100644
index 000000000000..9bf9d7fe5b39
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
@@ -0,0 +1,47 @@
+//===- SPIRVRegisterBankInfo.cpp ------------------------------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the targeting of the RegisterBankInfo class for SPIR-V.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVRegisterBankInfo.h"
+#include "SPIRVRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+
+#define GET_REGINFO_ENUM
+#include "SPIRVGenRegisterInfo.inc"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "SPIRVGenRegisterBank.inc"
+
+using namespace llvm;
+
+// This is required for the .td selection patterns to work, or we'd end up with
+// RegClass checks being redundant, as all the classes would be mapped to the
+// same bank.
+const RegisterBank &
+SPIRVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+                                              LLT Ty) const {
+  switch (RC.getID()) {
+  case SPIRV::TYPERegClassID:
+    return SPIRV::TYPERegBank;
+  case SPIRV::pIDRegClassID:
+  case SPIRV::IDRegClassID:
+    return SPIRV::IDRegBank;
+  case SPIRV::fIDRegClassID:
+    return SPIRV::fIDRegBank;
+  case SPIRV::vIDRegClassID:
+    return SPIRV::vIDRegBank;
+  case SPIRV::vfIDRegClassID:
+    return SPIRV::vfIDRegBank;
+  case SPIRV::ANYIDRegClassID:
+  case SPIRV::ANYRegClassID:
+    return SPIRV::IDRegBank;
+  }
+  llvm_unreachable("Unknown register class");
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.h b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.h
new file mode 100644
index 000000000000..67ddcdefb7dd
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.h
@@ -0,0 +1,38 @@
+//===- SPIRVRegisterBankInfo.h -----------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the targeting of the RegisterBankInfo class for SPIR-V.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "SPIRVGenRegisterBank.inc"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+class SPIRVGenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "SPIRVGenRegisterBank.inc"
+};
+
+// This class provides the information for the target register banks.
+class SPIRVRegisterBankInfo final : public SPIRVGenRegisterBankInfo {
+public:
+  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+                                             LLT Ty) const override;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERBANKINFO_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
new file mode 100644
index 000000000000..90c7f3a6e672
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
@@ -0,0 +1,15 @@
+//===-- SPIRVRegisterBanks.td - Describe SPIR-V RegBanks ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Although RegisterBankSelection is disabled, we need to distinguish the
+// banks, as the InstructionSelector's RegClass checking code relies on them.
+def IDRegBank : RegisterBank<"IDBank", [ID]>;
+def fIDRegBank : RegisterBank<"fIDBank", [fID]>;
+def vIDRegBank : RegisterBank<"vIDBank", [vID]>;
+def vfIDRegBank : RegisterBank<"vfIDBank", [vfID]>;
+def TYPERegBank : RegisterBank<"TYPEBank", [TYPE]>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.cpp
new file mode 100644
index 000000000000..cf8a967d59c4
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.cpp
@@ -0,0 +1,32 @@
+//===-- SPIRVRegisterInfo.cpp - SPIR-V Register Information -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPIR-V implementation of the TargetRegisterInfo class.
+// +//===----------------------------------------------------------------------===// + +#include "SPIRVRegisterInfo.h" +#include "SPIRV.h" +#include "SPIRVSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" + +#define GET_REGINFO_TARGET_DESC +#include "SPIRVGenRegisterInfo.inc" +using namespace llvm; + +SPIRVRegisterInfo::SPIRVRegisterInfo() : SPIRVGenRegisterInfo(SPIRV::ID0) {} + +BitVector SPIRVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + return BitVector(getNumRegs()); +} + +const MCPhysReg * +SPIRVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + static const MCPhysReg CalleeSavedReg = {0}; + return &CalleeSavedReg; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.h b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.h new file mode 100644 index 000000000000..f6f22b81e0bc --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.h @@ -0,0 +1,36 @@ +//===-- SPIRVRegisterInfo.h - SPIR-V Register Information -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the SPIR-V implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERINFO_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERINFO_H + +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "SPIRVGenRegisterInfo.inc" + +namespace llvm { + +struct SPIRVRegisterInfo : public SPIRVGenRegisterInfo { + SPIRVRegisterInfo(); + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override {} + Register getFrameRegister(const MachineFunction &MF) const override { + return 0; + } +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVREGISTERINFO_H diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td new file mode 100644 index 000000000000..d0b64b6895d0 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td @@ -0,0 +1,39 @@ +//===-- SPIRVRegisterInfo.td - SPIR-V Register defs --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations that describe the SPIR-V register file. 
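The register file declared below is purely nominal: SPIR-V IDs are never allocated to physical registers, so each class wraps one dummy register and all real values stay in virtual registers. A minimal, hypothetical snippet of how target code creates values in these classes (MRI being a function's MachineRegisterInfo; this is not code from the patch):

  // Hypothetical illustration only: one vreg per SPIR-V ID, no regalloc.
  Register TypeVReg = MRI.createVirtualRegister(&SPIRV::TYPERegClass); // OpTypeXXX result
  Register IdVReg = MRI.createVirtualRegister(&SPIRV::IDRegClass);     // any other ID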
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "SPIRV" in {
+  def p0 : PtrValueType <i32, 0>;
+  // All registers are for 32-bit identifiers, so have a single dummy register.
+
+  // Class for registers that are the result of OpTypeXXX instructions
+  def TYPE0 : Register<"TYPE0">;
+  def TYPE : RegisterClass<"SPIRV", [i32], 32, (add TYPE0)>;
+
+  // Class for every other non-type ID
+  def ID0 : Register<"ID0">;
+  def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>;
+  def fID0 : Register<"fID0">;
+  def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>;
+  def pID0 : Register<"pID0">;
+  def pID : RegisterClass<"SPIRV", [p0], 32, (add pID0)>;
+  def vID0 : Register<"vID0">;
+  def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>;
+  def vfID0 : Register<"vfID0">;
+  def vfID : RegisterClass<"SPIRV", [v2f32], 32, (add vfID0)>;
+
+  def ANYID : RegisterClass<"SPIRV", [i32, f32, p0, v2i32, v2f32], 32,
+                            (add ID, fID, pID, vID, vfID)>;
+
+  // A few instructions like OpName can take ids from both type and non-type
+  // instructions, so we need a super-class to allow for both to count as valid
+  // arguments for these instructions.
+  def ANY : RegisterClass<"SPIRV", [i32], 32, (add TYPE, ID)>;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
new file mode 100644
index 000000000000..cdf3a160f373
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -0,0 +1,68 @@
+//===-- SPIRVSubtarget.cpp - SPIR-V Subtarget Information ------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPIR-V specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVSubtarget.h"
+#include "SPIRV.h"
+#include "SPIRVGlobalRegistry.h"
+#include "SPIRVLegalizerInfo.h"
+#include "SPIRVRegisterBankInfo.h"
+#include "SPIRVTargetMachine.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spirv-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SPIRVGenSubtargetInfo.inc"
+
+// Compare version numbers, but allow 0 to mean unspecified.
+static bool isAtLeastVer(uint32_t Target, uint32_t VerToCompareTo) {
+  return Target == 0 || Target >= VerToCompareTo;
+}
+
+static unsigned computePointerSize(const Triple &TT) {
+  const auto Arch = TT.getArch();
+  // TODO: unify this with pointers legalization.
+  assert(TT.isSPIRV());
+  return Arch == Triple::spirv32 ? 32 : 64;
+}
+
+SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
+                               const std::string &FS,
+                               const SPIRVTargetMachine &TM)
+    : SPIRVGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS),
+      PointerSize(computePointerSize(TT)), SPIRVVersion(0), InstrInfo(),
+      FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
+  GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize);
+  CallLoweringInfo =
+      std::make_unique<SPIRVCallLowering>(TLInfo, *this, GR.get());
+  Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this);
+  RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>();
+  InstSelector.reset(
+      createSPIRVInstructionSelector(TM, *this, *RegBankInfo.get()));
+}
+
+SPIRVSubtarget &SPIRVSubtarget::initSubtargetDependencies(StringRef CPU,
+                                                          StringRef FS) {
+  ParseSubtargetFeatures(CPU, /*TuneCPU=*/CPU, FS);
+  if (SPIRVVersion == 0)
+    SPIRVVersion = 14;
+  return *this;
+}
+
+// If the SPIR-V version is >= 1.4 we can call OpPtrEqual and OpPtrNotEqual.
+bool SPIRVSubtarget::canDirectlyComparePointers() const {
+  return isAtLeastVer(SPIRVVersion, 14);
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
new file mode 100644
index 000000000000..a6332cfefa8e
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
@@ -0,0 +1,93 @@
+//===-- SPIRVSubtarget.h - SPIR-V Subtarget Information --------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPIR-V specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVSUBTARGET_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVSUBTARGET_H
+
+#include "SPIRVCallLowering.h"
+#include "SPIRVFrameLowering.h"
+#include "SPIRVISelLowering.h"
+#include "SPIRVInstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SPIRVGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+class SPIRVGlobalRegistry;
+class SPIRVTargetMachine;
+
+class SPIRVSubtarget : public SPIRVGenSubtargetInfo {
+private:
+  const unsigned PointerSize;
+  uint32_t SPIRVVersion;
+
+  std::unique_ptr<SPIRVGlobalRegistry> GR;
+
+  SPIRVInstrInfo InstrInfo;
+  SPIRVFrameLowering FrameLowering;
+  SPIRVTargetLowering TLInfo;
+
+  // GlobalISel related APIs.
+  std::unique_ptr<CallLowering> CallLoweringInfo;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
+  std::unique_ptr<LegalizerInfo> Legalizer;
+  std::unique_ptr<InstructionSelector> InstSelector;
+
+public:
+  // This constructor initializes the data members to match that
+  // of the specified triple.
+  SPIRVSubtarget(const Triple &TT, const std::string &CPU,
+                 const std::string &FS, const SPIRVTargetMachine &TM);
+  SPIRVSubtarget &initSubtargetDependencies(StringRef CPU, StringRef FS);
+
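A worked example of the version scheme this subtarget uses (SPIRVVersion packs major.minor into two decimal digits, so the default of 14 set in initSubtargetDependencies means SPIR-V 1.4, and isAtLeastVer treats 0 as unspecified):

  // isAtLeastVer(0, 14)  -> true   (no version requested, so allow it)
  // isAtLeastVer(13, 14) -> false  (v1.3 lacks OpPtrEqual/OpPtrNotEqual)
  // isAtLeastVer(14, 14) -> true   (v1.4 can compare pointers directly)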
+  // Parses the features string, setting the specified subtarget options.
+  // The definition of this function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+  unsigned getPointerSize() const { return PointerSize; }
+  bool canDirectlyComparePointers() const;
+  uint32_t getSPIRVVersion() const { return SPIRVVersion; }
+  SPIRVGlobalRegistry *getSPIRVGlobalRegistry() const { return GR.get(); }
+
+  const CallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    return RegBankInfo.get();
+  }
+  const LegalizerInfo *getLegalizerInfo() const override {
+    return Legalizer.get();
+  }
+  InstructionSelector *getInstructionSelector() const override {
+    return InstSelector.get();
+  }
+  const SPIRVInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const SPIRVFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const SPIRVTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SPIRVRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVSUBTARGET_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
new file mode 100644
index 000000000000..f7c88a5c6d4a
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -0,0 +1,186 @@
+//===- SPIRVTargetMachine.cpp - Define TargetMachine for SPIR-V -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPIR-V specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVTargetMachine.h"
+#include "SPIRV.h"
+#include "SPIRVCallLowering.h"
+#include "SPIRVGlobalRegistry.h"
+#include "SPIRVLegalizerInfo.h"
+#include "SPIRVTargetObjectFile.h"
+#include "SPIRVTargetTransformInfo.h"
+#include "TargetInfo/SPIRVTargetInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
+  // Register the target.
+  RegisterTargetMachine<SPIRVTargetMachine> X(getTheSPIRV32Target());
+  RegisterTargetMachine<SPIRVTargetMachine> Y(getTheSPIRV64Target());
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeGlobalISel(PR);
+  initializeSPIRVModuleAnalysisPass(PR);
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+  const auto Arch = TT.getArch();
+  if (Arch == Triple::spirv32)
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
+           "v96:128-v192:256-v256:256-v512:512-v1024:1024";
+  return "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
+         "v96:128-v192:256-v256:256-v512:512-v1024:1024";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM)
+    return Reloc::PIC_;
+  return *RM;
+}
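For reference, the strings built in computeDataLayout above use standard LLVM DataLayout syntax: "e" selects little-endian, "p:32:32" gives 32-bit pointers with 32-bit alignment on spirv32 (spirv64 omits the entry, keeping the 64-bit default), "i64:64" aligns i64 to 64 bits, and each "vN:M" entry gives an N-bit vector M-bit alignment. A hypothetical sanity check, assuming the string is handed to llvm::DataLayout:

  DataLayout DL(computeDataLayout(Triple("spirv32-unknown-unknown")));
  assert(DL.isLittleEndian() && DL.getPointerSizeInBits() == 32);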
+
+// Pin SPIRVTargetObjectFile's vtables to this file.
+SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {}
+
+SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       Optional<CodeModel::Model> CM,
+                                       CodeGenOpt::Level OL, bool JIT)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
+      TLOF(std::make_unique<SPIRVTargetObjectFile>()),
+      Subtarget(TT, CPU.str(), FS.str(), *this) {
+  initAsmInfo();
+  setGlobalISel(true);
+  setFastISel(false);
+  setO0WantsFastISel(false);
+  setRequiresStructuredCFG(false);
+}
+
+namespace {
+// SPIR-V Code Generator Pass Configuration Options.
+class SPIRVPassConfig : public TargetPassConfig {
+public:
+  SPIRVPassConfig(SPIRVTargetMachine &TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  SPIRVTargetMachine &getSPIRVTargetMachine() const {
+    return getTM<SPIRVTargetMachine>();
+  }
+  void addIRPasses() override;
+  void addISelPrepare() override;
+
+  bool addIRTranslator() override;
+  void addPreLegalizeMachineIR() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+  void addFastRegAlloc() override {}
+  void addOptimizedRegAlloc() override {}
+
+  void addPostRegAlloc() override;
+};
+} // namespace
+
+// We do not use physical registers, and maintain virtual registers throughout
+// the entire pipeline, so return nullptr to disable register allocation.
+FunctionPass *SPIRVPassConfig::createTargetRegisterAllocator(bool) {
+  return nullptr;
+}
+
+// Disable passes that break because they assume no virtual registers remain.
+void SPIRVPassConfig::addPostRegAlloc() {
+  // These passes do not work with vregs instead of physical regs.
+  disablePass(&MachineCopyPropagationID);
+  disablePass(&PostRAMachineSinkingID);
+  disablePass(&PostRASchedulerID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&StackMapLivenessID);
+  disablePass(&PatchableFunctionID);
+  disablePass(&ShrinkWrapID);
+  disablePass(&LiveDebugValuesID);
+
+  // These passes do not work with OpPhi.
+  disablePass(&BranchFolderPassID);
+  disablePass(&MachineBlockPlacementID);
+
+  TargetPassConfig::addPostRegAlloc();
+}
+
+TargetTransformInfo
+SPIRVTargetMachine::getTargetTransformInfo(const Function &F) const {
+  return TargetTransformInfo(SPIRVTTIImpl(this, F));
+}
+
+TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SPIRVPassConfig(*this, PM);
+}
+
+void SPIRVPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); }
+
+void SPIRVPassConfig::addISelPrepare() {
+  addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>()));
+  TargetPassConfig::addISelPrepare();
+}
+
+bool SPIRVPassConfig::addIRTranslator() {
+  addPass(new IRTranslator(getOptLevel()));
+  return false;
+}
+
+void SPIRVPassConfig::addPreLegalizeMachineIR() {
+  addPass(createSPIRVPreLegalizerPass());
+}
+
+// Use the default legalizer.
+bool SPIRVPassConfig::addLegalizeMachineIR() {
+  addPass(new Legalizer());
+  return false;
+}
+
+// Do not add a RegBankSelect pass, as we only ever need virtual registers.
+bool SPIRVPassConfig::addRegBankSelect() {
+  disablePass(&RegBankSelect::ID);
+  return false;
+}
+
+namespace {
+// A custom subclass of InstructionSelect, which is mostly the same, except
+// that it does not require RegBankSelect to have run previously.
+class SPIRVInstructionSelect : public InstructionSelect {
+  // We don't use register banks, so unset the requirement for them.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return InstructionSelect::getRequiredProperties().reset(
+        MachineFunctionProperties::Property::RegBankSelected);
+  }
+};
+} // namespace
+
+bool SPIRVPassConfig::addGlobalInstructionSelect() {
+  addPass(new SPIRVInstructionSelect());
+  return false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h
new file mode 100644
index 000000000000..f3597971bc95
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h
@@ -0,0 +1,47 @@
+//===-- SPIRVTargetMachine.h - Define TargetMachine for SPIR-V -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPIR-V specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVTARGETMACHINE_H
+
+#include "SPIRVSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class SPIRVTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  SPIRVSubtarget Subtarget;
+
+public:
+  SPIRVTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                     CodeGenOpt::Level OL, bool JIT);
+
+  const SPIRVSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  const SPIRVSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+
+  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool usesPhysRegsForValues() const override { return false; }
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVTARGETMACHINE_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetObjectFile.h b/llvm/lib/Target/SPIRV/SPIRVTargetObjectFile.h
new file mode 100644
index 000000000000..00c456971ef1
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetObjectFile.h
@@ -0,0 +1,45 @@
+//===-- SPIRVTargetObjectFile.h - SPIRV Object Info -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVTARGETOBJECTFILE_H
+
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+class SPIRVTargetObjectFile : public TargetLoweringObjectFile {
+public:
+  ~SPIRVTargetObjectFile() override;
+
+  void Initialize(MCContext &ctx, const TargetMachine &TM) override {
+    TargetLoweringObjectFile::Initialize(ctx, TM);
+  }
+  // All words in a SPIR-V module, except the first five header words, form a
+  // linear sequence of instructions in a specific order, so we put all the
+  // instructions in a single text section.
+  MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                   const Constant *C,
+                                   Align &Alignment) const override {
+    return TextSection;
+  }
+  MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override {
+    return TextSection;
+  }
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override {
+    return TextSection;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVTARGETOBJECTFILE_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
new file mode 100644
index 000000000000..ac351cf42f5c
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -0,0 +1,44 @@
+//===- SPIRVTargetTransformInfo.h - SPIR-V specific TTI ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+// This file contains a TargetTransformInfo::Concept conforming object specific
+// to the SPIRV target machine. It uses the target's detailed information to
+// provide more precise answers to certain TTI queries, while letting the
+// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVTARGETTRANSFORMINFO_H
+
+#include "SPIRV.h"
+#include "SPIRVTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
+  using BaseT = BasicTTIImplBase<SPIRVTTIImpl>;
+
+  friend BaseT;
+
+  const SPIRVSubtarget *ST;
+  const SPIRVTargetLowering *TLI;
+
+  const TargetSubtargetInfo *getST() const { return ST; }
+  const SPIRVTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit SPIRVTTIImpl(const SPIRVTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVTARGETTRANSFORMINFO_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
new file mode 100644
index 000000000000..b92dc12735f8
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -0,0 +1,207 @@
+//===--- SPIRVUtils.cpp ---- SPIR-V Utility Functions -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions.
+//
+//===----------------------------------------------------------------------===//
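The helpers defined just below implement SPIR-V's string-literal encoding (null-terminated UTF-8, padded to a multiple of four bytes). A worked example of the packing:

  // addStringImm("abc", MIB) emits one word:
  //   'a' | 'b' << 8 | 'c' << 16 | 0 << 24 = 0x00636261
  //   (the NUL terminator occupies the top byte).
  // addStringImm("abcd", MIB) emits two words: 0x64636261, then 0x00000000
  //   (getPaddedLen rounds 4 + 1 bytes up to 8, padding with zeros).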
+
+#include "SPIRVUtils.h"
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRV.h"
+#include "SPIRVInstrInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+
+using namespace llvm;
+
+// The following functions are used to add string literals as a series of
+// 32-bit integer operands with the correct format, and unpack them if
+// necessary when making string comparisons in compiler passes.
+// SPIR-V requires null-terminated UTF-8 strings padded to 32-bit alignment.
+static uint32_t convertCharsToWord(const StringRef &Str, unsigned i) {
+  uint32_t Word = 0u; // Build up this 32-bit word from 4 8-bit chars.
+  for (unsigned WordIndex = 0; WordIndex < 4; ++WordIndex) {
+    unsigned StrIndex = i + WordIndex;
+    uint8_t CharToAdd = 0;       // Initialize char as padding/null.
+    if (StrIndex < Str.size()) { // If it's within the string, get a real char.
+      CharToAdd = Str[StrIndex];
+    }
+    Word |= (CharToAdd << (WordIndex * 8));
+  }
+  return Word;
+}
+
+// Get length including padding and null terminator.
+static size_t getPaddedLen(const StringRef &Str) {
+  const size_t Len = Str.size() + 1;
+  return (Len % 4 == 0) ? Len : Len + (4 - (Len % 4));
+}
+
+void addStringImm(const StringRef &Str, MachineInstrBuilder &MIB) {
+  const size_t PaddedLen = getPaddedLen(Str);
+  for (unsigned i = 0; i < PaddedLen; i += 4) {
+    // Add an operand for the 32-bits of chars or padding.
+    MIB.addImm(convertCharsToWord(Str, i));
+  }
+}
+
+void addStringImm(const StringRef &Str, IRBuilder<> &B,
+                  std::vector<Value *> &Args) {
+  const size_t PaddedLen = getPaddedLen(Str);
+  for (unsigned i = 0; i < PaddedLen; i += 4) {
+    // Add a vector element for the 32-bits of chars or padding.
+    Args.push_back(B.getInt32(convertCharsToWord(Str, i)));
+  }
+}
+
+std::string getStringImm(const MachineInstr &MI, unsigned StartIndex) {
+  return getSPIRVStringOperand(MI, StartIndex);
+}
+
+void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) {
+  const auto Bitwidth = Imm.getBitWidth();
+  switch (Bitwidth) {
+  case 1:
+    break; // Already handled.
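  // Illustrative note on the 64-bit case below (not part of the upstream
  // comments): Imm = 0x0000000100000002 is emitted as two 32-bit operands,
  // low word first -- addImm(0x00000002), then addImm(0x00000001) -- matching
  // SPIR-V's rule that multi-word literals appear low-order word first.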
+  case 8:
+  case 16:
+  case 32:
+    MIB.addImm(Imm.getZExtValue());
+    break;
+  case 64: {
+    uint64_t FullImm = Imm.getZExtValue();
+    uint32_t LowBits = FullImm & 0xffffffff;
+    uint32_t HighBits = (FullImm >> 32) & 0xffffffff;
+    MIB.addImm(LowBits).addImm(HighBits);
+    break;
+  }
+  default:
+    report_fatal_error("Unsupported constant bitwidth");
+  }
+}
+
+void buildOpName(Register Target, const StringRef &Name,
+                 MachineIRBuilder &MIRBuilder) {
+  if (!Name.empty()) {
+    auto MIB = MIRBuilder.buildInstr(SPIRV::OpName).addUse(Target);
+    addStringImm(Name, MIB);
+  }
+}
+
+static void finishBuildOpDecorate(MachineInstrBuilder &MIB,
+                                  const std::vector<uint32_t> &DecArgs,
+                                  StringRef StrImm) {
+  if (!StrImm.empty())
+    addStringImm(StrImm, MIB);
+  for (const auto &DecArg : DecArgs)
+    MIB.addImm(DecArg);
+}
+
+void buildOpDecorate(Register Reg, MachineIRBuilder &MIRBuilder,
+                     llvm::SPIRV::Decoration Dec,
+                     const std::vector<uint32_t> &DecArgs, StringRef StrImm) {
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpDecorate)
+                 .addUse(Reg)
+                 .addImm(static_cast<uint32_t>(Dec));
+  finishBuildOpDecorate(MIB, DecArgs, StrImm);
+}
+
+void buildOpDecorate(Register Reg, MachineInstr &I, const SPIRVInstrInfo &TII,
+                     llvm::SPIRV::Decoration Dec,
+                     const std::vector<uint32_t> &DecArgs, StringRef StrImm) {
+  MachineBasicBlock &MBB = *I.getParent();
+  auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(SPIRV::OpDecorate))
+                 .addUse(Reg)
+                 .addImm(static_cast<uint32_t>(Dec));
+  finishBuildOpDecorate(MIB, DecArgs, StrImm);
+}
+
+// TODO: maybe the following two functions should be handled in the subtarget
+// to allow for different OpenCL vs Vulkan handling.
+unsigned storageClassToAddressSpace(SPIRV::StorageClass SC) {
+  switch (SC) {
+  case SPIRV::StorageClass::Function:
+    return 0;
+  case SPIRV::StorageClass::CrossWorkgroup:
+    return 1;
+  case SPIRV::StorageClass::UniformConstant:
+    return 2;
+  case SPIRV::StorageClass::Workgroup:
+    return 3;
+  case SPIRV::StorageClass::Generic:
+    return 4;
+  case SPIRV::StorageClass::Input:
+    return 7;
+  default:
+    llvm_unreachable("Unable to get address space id");
+  }
+}
+
+SPIRV::StorageClass addressSpaceToStorageClass(unsigned AddrSpace) {
+  switch (AddrSpace) {
+  case 0:
+    return SPIRV::StorageClass::Function;
+  case 1:
+    return SPIRV::StorageClass::CrossWorkgroup;
+  case 2:
+    return SPIRV::StorageClass::UniformConstant;
+  case 3:
+    return SPIRV::StorageClass::Workgroup;
+  case 4:
+    return SPIRV::StorageClass::Generic;
+  case 7:
+    return SPIRV::StorageClass::Input;
+  default:
+    llvm_unreachable("Unknown address space");
+  }
+}
+
+SPIRV::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass SC) {
+  switch (SC) {
+  case SPIRV::StorageClass::StorageBuffer:
+  case SPIRV::StorageClass::Uniform:
+    return SPIRV::MemorySemantics::UniformMemory;
+  case SPIRV::StorageClass::Workgroup:
+    return SPIRV::MemorySemantics::WorkgroupMemory;
+  case SPIRV::StorageClass::CrossWorkgroup:
+    return SPIRV::MemorySemantics::CrossWorkgroupMemory;
+  case SPIRV::StorageClass::AtomicCounter:
+    return SPIRV::MemorySemantics::AtomicCounterMemory;
+  case SPIRV::StorageClass::Image:
+    return SPIRV::MemorySemantics::ImageMemory;
+  default:
+    return SPIRV::MemorySemantics::None;
+  }
+}
+
+MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,
+                                       const MachineRegisterInfo *MRI) {
+  MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg);
+  if (ConstInstr->getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
+      ConstInstr->getIntrinsicID() == Intrinsic::spv_track_constant) {
+    ConstReg = ConstInstr->getOperand(2).getReg();
+    ConstInstr = MRI->getVRegDef(ConstReg);
+  } else if (ConstInstr->getOpcode() == SPIRV::ASSIGN_TYPE) {
+    ConstReg = ConstInstr->getOperand(1).getReg();
+    ConstInstr = MRI->getVRegDef(ConstReg);
+  }
+  return ConstInstr;
+}
+
+uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) {
+  const MachineInstr *MI = getDefInstrMaybeConstant(ConstReg, MRI);
+  assert(MI && MI->getOpcode() == TargetOpcode::G_CONSTANT);
+  return MI->getOperand(1).getCImm()->getValue().getZExtValue();
+}
+
+Type *getMDOperandAsType(const MDNode *N, unsigned I) {
+  return cast<ValueAsMetadata>(N->getOperand(I))->getType();
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
new file mode 100644
index 000000000000..ffa82c9c1fe4
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -0,0 +1,83 @@
+//===--- SPIRVUtils.h ---- SPIR-V Utility Functions -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
+
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include <string>
+
+namespace llvm {
+class MCInst;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+class MachineIRBuilder;
+class MachineRegisterInfo;
+class Register;
+class StringRef;
+class SPIRVInstrInfo;
+} // namespace llvm
+
+// Add the given string as a series of integer operands, inserting null
+// terminators and padding so that the operands are all 32-bit little-endian
+// words.
+void addStringImm(const llvm::StringRef &Str, llvm::MachineInstrBuilder &MIB);
+void addStringImm(const llvm::StringRef &Str, llvm::IRBuilder<> &B,
+                  std::vector<llvm::Value *> &Args);
+
+// Read the series of integer operands back as a null-terminated string using
+// the reverse of the logic in addStringImm.
+std::string getStringImm(const llvm::MachineInstr &MI, unsigned StartIndex);
+
+// Add the given numerical immediate to MIB.
+void addNumImm(const llvm::APInt &Imm, llvm::MachineInstrBuilder &MIB);
+
+// Add an OpName instruction for the given target register.
+void buildOpName(llvm::Register Target, const llvm::StringRef &Name,
+                 llvm::MachineIRBuilder &MIRBuilder);
+
+// Add an OpDecorate instruction for the given Reg.
+void buildOpDecorate(llvm::Register Reg, llvm::MachineIRBuilder &MIRBuilder,
+                     llvm::SPIRV::Decoration Dec,
+                     const std::vector<uint32_t> &DecArgs,
+                     llvm::StringRef StrImm = "");
+void buildOpDecorate(llvm::Register Reg, llvm::MachineInstr &I,
+                     const llvm::SPIRVInstrInfo &TII,
+                     llvm::SPIRV::Decoration Dec,
+                     const std::vector<uint32_t> &DecArgs,
+                     llvm::StringRef StrImm = "");
+
+// Convert a SPIR-V storage class to the corresponding LLVM IR address space.
+unsigned storageClassToAddressSpace(llvm::SPIRV::StorageClass SC);
+
+// Convert an LLVM IR address space to a SPIR-V storage class.
+llvm::SPIRV::StorageClass addressSpaceToStorageClass(unsigned AddrSpace);
+
+llvm::SPIRV::MemorySemantics
+getMemSemanticsForStorageClass(llvm::SPIRV::StorageClass SC);
+
+// Find the def instruction for the given ConstReg, walking through
+// spv_track_constant and ASSIGN_TYPE instructions. Updates ConstReg to the
+// register that holds the underlying constant definition.
+llvm::MachineInstr *
+getDefInstrMaybeConstant(llvm::Register &ConstReg,
+                         const llvm::MachineRegisterInfo *MRI);
+
+// Get constant integer value of the given ConstReg.
+uint64_t getIConstVal(llvm::Register ConstReg,
+                      const llvm::MachineRegisterInfo *MRI);
+
+// Get type of i-th operand of the metadata node.
+llvm::Type *getMDOperandAsType(const llvm::MDNode *N, unsigned I);
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
new file mode 100644
index 000000000000..fb7cab4fe779
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
@@ -0,0 +1,28 @@
+//===-- SPIRVTargetInfo.cpp - SPIR-V Target Implementation ----*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetInfo/SPIRVTargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+Target &llvm::getTheSPIRV32Target() {
+  static Target TheSPIRV32Target;
+  return TheSPIRV32Target;
+}
+Target &llvm::getTheSPIRV64Target() {
+  static Target TheSPIRV64Target;
+  return TheSPIRV64Target;
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetInfo() {
+  RegisterTarget<Triple::spirv32> X(getTheSPIRV32Target(), "spirv32",
+                                    "SPIR-V 32-bit", "SPIRV");
+  RegisterTarget<Triple::spirv64> Y(getTheSPIRV64Target(), "spirv64",
+                                    "SPIR-V 64-bit", "SPIRV");
+}
diff --git a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.h b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.h
new file mode 100644
index 000000000000..4353258e1d1a
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.h
@@ -0,0 +1,21 @@
+//===-- SPIRVTargetInfo.h - SPIRV Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_TARGETINFO_SPIRVTARGETINFO_H
+#define LLVM_LIB_TARGET_SPIRV_TARGETINFO_SPIRVTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheSPIRV32Target();
+Target &getTheSPIRV64Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_TARGETINFO_SPIRVTARGETINFO_H
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index af3304f0907d..77e9b1d96612 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -16,6 +16,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
@@ -54,6 +55,8 @@ class SparcOperand;
 class SparcAsmParser : public MCTargetAsmParser {
   MCAsmParser &Parser;
 
+  enum class TailRelocKind { Load_GOT, Add_TLS, Load_TLS, Call_TLS };
+
   /// @name Auto-generated Match Functions
   /// {
 
@@ -82,6 +85,9 @@ class SparcAsmParser : public MCTargetAsmParser {
 
   OperandMatchResultTy parseMembarTag(OperandVector &Operands);
 
+  template <TailRelocKind Kind>
+  OperandMatchResultTy parseTailRelocSym(OperandVector &Operands);
+
   template <unsigned N>
   OperandMatchResultTy parseShiftAmtImm(OperandVector &Operands);
 
@@ -112,6 +118,8 @@ class SparcAsmParser : public MCTargetAsmParser {
   bool expandSET(MCInst &Inst, SMLoc IDLoc,
                  SmallVectorImpl<MCInst> &Instructions);
 
+  SMLoc getLoc() const { return getParser().getTok().getLoc(); }
+
 public:
   SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
                  const MCInstrInfo &MII,
@@ -266,6 +274,7 @@ public:
   bool isMEMrr() const { return Kind == k_MemoryReg; }
   bool isMEMri() const { return Kind == k_MemoryImm; }
   bool isMembarTag() const { return Kind == k_Immediate; }
+  bool isTailRelocSym() const { return Kind == k_Immediate; }
 
   bool isCallTarget() const {
     if (!isImm())
@@ -426,6 +435,11 @@ public:
     addExpr(Inst, getImm());
   }
 
+  void addTailRelocSymOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
   static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
     auto Op = std::make_unique<SparcOperand>(k_Token);
     Op->Tok.Data = Str.data();
@@ -849,6 +863,97 @@ OperandMatchResultTy SparcAsmParser::parseShiftAmtImm(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
+template <SparcAsmParser::TailRelocKind Kind>
+OperandMatchResultTy
+SparcAsmParser::parseTailRelocSym(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+  auto MatchesKind = [](SparcMCExpr::VariantKind VK) -> bool {
+    switch (Kind) {
+    case TailRelocKind::Load_GOT:
+      // Non-TLS relocations on ld (or ldx).
+      // ld [%rr + %rr], %rr, %rel(sym)
+      return VK == SparcMCExpr::VK_Sparc_GOTDATA_OP;
+    case TailRelocKind::Add_TLS:
+      // TLS relocations on add.
+      // add %rr, %rr, %rr, %rel(sym)
+      switch (VK) {
+      case SparcMCExpr::VK_Sparc_TLS_GD_ADD:
+      case SparcMCExpr::VK_Sparc_TLS_IE_ADD:
+      case SparcMCExpr::VK_Sparc_TLS_LDM_ADD:
+      case SparcMCExpr::VK_Sparc_TLS_LDO_ADD:
+        return true;
+      default:
+        return false;
+      }
+    case TailRelocKind::Load_TLS:
+      // TLS relocations on ld (or ldx).
+ // ld[x] %addr, %rr, %rel(sym) + switch (VK) { + case SparcMCExpr::VK_Sparc_TLS_IE_LD: + case SparcMCExpr::VK_Sparc_TLS_IE_LDX: + return true; + default: + return false; + } + case TailRelocKind::Call_TLS: + // TLS relocations on call. + // call sym, %rel(sym) + switch (VK) { + case SparcMCExpr::VK_Sparc_TLS_GD_CALL: + case SparcMCExpr::VK_Sparc_TLS_LDM_CALL: + return true; + default: + return false; + } + } + llvm_unreachable("Unhandled SparcAsmParser::TailRelocKind enum"); + }; + + if (getLexer().getKind() != AsmToken::Percent) { + Error(getLoc(), "expected '%' for operand modifier"); + return MatchOperand_ParseFail; + } + + const AsmToken Tok = Parser.getTok(); + getParser().Lex(); // Eat '%' + + if (getLexer().getKind() != AsmToken::Identifier) { + Error(getLoc(), "expected valid identifier for operand modifier"); + return MatchOperand_ParseFail; + } + + StringRef Name = getParser().getTok().getIdentifier(); + SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(Name); + if (VK == SparcMCExpr::VK_Sparc_None) { + Error(getLoc(), "invalid operand modifier"); + return MatchOperand_ParseFail; + } + + if (!MatchesKind(VK)) { + // Did not match the specified set of relocation types, put '%' back. + getLexer().UnLex(Tok); + return MatchOperand_NoMatch; + } + + Parser.Lex(); // Eat the identifier. + if (getLexer().getKind() != AsmToken::LParen) { + Error(getLoc(), "expected '('"); + return MatchOperand_ParseFail; + } + + getParser().Lex(); // Eat '(' + const MCExpr *SubExpr; + if (getParser().parseParenExpression(SubExpr, E)) { + return MatchOperand_ParseFail; + } + + const MCExpr *Val = adjustPICRelocation(VK, SubExpr); + Operands.push_back(SparcOperand::CreateImm(Val, S, E)); + return MatchOperand_Success; +} + OperandMatchResultTy SparcAsmParser::parseMembarTag(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const MCExpr *EVal; @@ -1408,10 +1513,27 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, StringRef name = Tok.getString(); SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(name); + switch (VK) { + case SparcMCExpr::VK_Sparc_None: + Error(getLoc(), "invalid operand modifier"); + return false; - if (VK == SparcMCExpr::VK_Sparc_None) + case SparcMCExpr::VK_Sparc_GOTDATA_OP: + case SparcMCExpr::VK_Sparc_TLS_GD_ADD: + case SparcMCExpr::VK_Sparc_TLS_GD_CALL: + case SparcMCExpr::VK_Sparc_TLS_IE_ADD: + case SparcMCExpr::VK_Sparc_TLS_IE_LD: + case SparcMCExpr::VK_Sparc_TLS_IE_LDX: + case SparcMCExpr::VK_Sparc_TLS_LDM_ADD: + case SparcMCExpr::VK_Sparc_TLS_LDM_CALL: + case SparcMCExpr::VK_Sparc_TLS_LDO_ADD: + // These are special-cased at tablegen level. return false; + default: + break; + } + Parser.Lex(); // Eat the identifier. 
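To make the new syntax concrete, here are examples of operands these parsers accept (instruction shapes are taken from the parseTailRelocSym comments above; the %gdop/%tgd_add spellings follow the usual GNU-style modifier names and are an assumption here):

  // ld [%o0 + %o1], %o2, %gdop(sym)   -> GOTDATA_OP tail relocation on a load
  // add %o0, %o1, %o2, %tgd_add(sym)  -> TLS_GD_ADD tail relocation on an add
  // The variant kinds listed above as "special-cased at tablegen level" make
  // matchSparcAsmModifiers return false, so the dedicated
  // parseTailRelocSym<Kind> parsers can match them instead.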
if (Parser.getTok().getKind() != AsmToken::LParen) return false; diff --git a/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 259b37954183..cc132d46de85 100644 --- a/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -174,17 +174,20 @@ Filler::findDelayInstr(MachineBasicBlock &MBB, if (slot == MBB.begin()) return MBB.end(); - if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL) + unsigned Opc = slot->getOpcode(); + + if (Opc == SP::RET || Opc == SP::TLS_CALL) return MBB.end(); - if (slot->getOpcode() == SP::RETL) { + if (Opc == SP::RETL || Opc == SP::TAIL_CALL || Opc == SP::TAIL_CALLri) { MachineBasicBlock::iterator J = slot; --J; if (J->getOpcode() == SP::RESTORErr || J->getOpcode() == SP::RESTOREri) { // change retl to ret. - slot->setDesc(Subtarget->getInstrInfo()->get(SP::RET)); + if (Opc == SP::RETL) + slot->setDesc(Subtarget->getInstrInfo()->get(SP::RET)); return J; } } @@ -360,6 +363,8 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize) case SP::CALLrr: case SP::CALLri: structSizeOpNum = 2; break; case SP::TLS_CALL: return false; + case SP::TAIL_CALLri: + case SP::TAIL_CALL: return false; } const MachineOperand &MO = I->getOperand(structSizeOpNum); diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 142124a8e0d9..1825b95dd6ac 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -14,8 +14,8 @@ #include "TargetInfo/SparcTargetInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" @@ -32,7 +32,7 @@ class SparcDisassembler : public MCDisassembler { public: SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~SparcDisassembler() {} + virtual ~SparcDisassembler() = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, @@ -142,10 +142,9 @@ static const uint16_t CPPairDecoderTable[] = { SP::C24_C25, SP::C26_C27, SP::C28_C29, SP::C30_C31 }; -static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = IntRegDecoderTable[RegNo]; @@ -153,10 +152,9 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = IntRegDecoderTable[RegNo]; @@ -164,11 +162,9 @@ static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } - -static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = 
FPRegDecoderTable[RegNo]; @@ -176,11 +172,9 @@ static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } - -static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = DFPRegDecoderTable[RegNo]; @@ -188,11 +182,9 @@ static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } - -static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -203,10 +195,9 @@ static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; unsigned Reg = CPRegDecoderTable[RegNo]; @@ -216,7 +207,7 @@ static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst, static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 3) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(FCCRegDecoderTable[RegNo])); @@ -225,7 +216,7 @@ static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(ASRRegDecoderTable[RegNo])); @@ -233,8 +224,8 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo >= array_lengthof(PRRegDecoderTable)) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo])); @@ -242,7 +233,8 @@ static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; if (RegNo > 31) @@ -257,7 +249,8 @@ static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeCPPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -267,45 +260,52 @@ static DecodeStatus DecodeCPPairRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, + uint64_t Address, + const 
MCDisassembler *Decoder); static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeJMPL(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeTRAP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); #include "SparcGenDisassemblerTables.inc" @@ -363,13 +363,12 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return MCDisassembler::Fail; } - typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder, - bool isLoad, DecodeFunc DecodeRD) { + const MCDisassembler *Decoder, bool isLoad, + DecodeFunc 
DecodeRD) { unsigned rd = fieldFromInstruction(insn, 25, 5); unsigned rs1 = fieldFromInstruction(insn, 14, 5); bool isImm = fieldFromInstruction(insn, 13, 1); @@ -415,100 +414,106 @@ static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address, } static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeIntRegsRegisterClass); } -static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeIntPairRegisterClass); } static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeFPRegsRegisterClass); } static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeDFPRegsRegisterClass); } static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeQFPRegsRegisterClass); } static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeCPRegsRegisterClass); } -static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeCPPairRegisterClass); } static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeIntRegsRegisterClass); } static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeIntPairRegisterClass); } static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeFPRegsRegisterClass); } static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeDFPRegsRegisterClass); } static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeQFPRegsRegisterClass); } -static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeCPRegsRegisterClass); } static 
DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeCPPairRegisterClass); } -static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, +static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, uint64_t Width, MCInst &MI, - const void *Decoder) { - const MCDisassembler *Dis = static_cast(Decoder); - return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, - Offset, Width); + const MCDisassembler *Decoder) { + return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, + Width, /*InstSize=*/4); } -static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { unsigned tgt = fieldFromInstruction(insn, 0, 30); tgt <<= 2; if (!tryAddingSymbolicOperand(tgt+Address, false, Address, @@ -517,15 +522,15 @@ static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, return MCDisassembler::Success; } -static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { unsigned tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13)); MI.addOperand(MCOperand::createImm(tgt)); return MCDisassembler::Success; } static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned rd = fieldFromInstruction(insn, 25, 5); unsigned rs1 = fieldFromInstruction(insn, 14, 5); @@ -559,7 +564,7 @@ static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address, } static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned rs1 = fieldFromInstruction(insn, 14, 5); unsigned isImm = fieldFromInstruction(insn, 13, 1); @@ -587,7 +592,7 @@ static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address, } static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned rd = fieldFromInstruction(insn, 25, 5); unsigned rs1 = fieldFromInstruction(insn, 14, 5); @@ -627,7 +632,7 @@ static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address, } static DecodeStatus DecodeTRAP(MCInst &MI, unsigned insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned rs1 = fieldFromInstruction(insn, 14, 5); unsigned isImm = fieldFromInstruction(insn, 13, 1); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 4d69040a4508..7b2d8afd3605 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -47,6 +47,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case Sparc::fixup_sparc_br16_14: return (Value >> 2) & 0x3fff; + case Sparc::fixup_sparc_hix22: + return (~Value >> 10) & 0x3fffff; + case Sparc::fixup_sparc_pc22: case Sparc::fixup_sparc_got22: case Sparc::fixup_sparc_tls_gd_hi22: @@ -60,6 +63,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case Sparc::fixup_sparc_13: return Value & 0x1fff; + case 
Sparc::fixup_sparc_lox10: + return (Value & 0x3ff) | 0x1c00; + case Sparc::fixup_sparc_pc10: case Sparc::fixup_sparc_got10: case Sparc::fixup_sparc_tls_gd_lo10: @@ -98,6 +104,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case Sparc::fixup_sparc_tls_ie_ld: case Sparc::fixup_sparc_tls_ie_ldx: case Sparc::fixup_sparc_tls_ie_add: + case Sparc::fixup_sparc_gotdata_lox10: + case Sparc::fixup_sparc_gotdata_hix22: + case Sparc::fixup_sparc_gotdata_op: return 0; } } @@ -189,7 +198,12 @@ namespace { { "fixup_sparc_tls_ie_ldx", 0, 0, 0 }, { "fixup_sparc_tls_ie_add", 0, 0, 0 }, { "fixup_sparc_tls_le_hix22", 0, 0, 0 }, - { "fixup_sparc_tls_le_lox10", 0, 0, 0 } + { "fixup_sparc_tls_le_lox10", 0, 0, 0 }, + { "fixup_sparc_hix22", 10, 22, 0 }, + { "fixup_sparc_lox10", 19, 13, 0 }, + { "fixup_sparc_gotdata_hix22", 0, 0, 0 }, + { "fixup_sparc_gotdata_lox10", 0, 0, 0 }, + { "fixup_sparc_gotdata_op", 0, 0, 0 }, }; const static MCFixupKindInfo InfosLE[Sparc::NumTargetFixupKinds] = { @@ -231,7 +245,12 @@ namespace { { "fixup_sparc_tls_ie_ldx", 0, 0, 0 }, { "fixup_sparc_tls_ie_add", 0, 0, 0 }, { "fixup_sparc_tls_le_hix22", 0, 0, 0 }, - { "fixup_sparc_tls_le_lox10", 0, 0, 0 } + { "fixup_sparc_tls_le_lox10", 0, 0, 0 }, + { "fixup_sparc_hix22", 0, 22, 0 }, + { "fixup_sparc_lox10", 0, 13, 0 }, + { "fixup_sparc_gotdata_hix22", 0, 0, 0 }, + { "fixup_sparc_gotdata_lox10", 0, 0, 0 }, + { "fixup_sparc_gotdata_op", 0, 0, 0 }, }; // Fixup kinds from .reloc directive are like R_SPARC_NONE. They do diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 02261dc5c4cd..9c50c41f6bf2 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -26,7 +26,7 @@ namespace { Is64Bit ? 
ELF::EM_SPARCV9 : ELF::EM_SPARC, /*HasRelocationAddend*/ true) {} - ~SparcELFObjectWriter() override {} + ~SparcELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -112,6 +112,11 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, case Sparc::fixup_sparc_tls_ie_add: return ELF::R_SPARC_TLS_IE_ADD; case Sparc::fixup_sparc_tls_le_hix22: return ELF::R_SPARC_TLS_LE_HIX22; case Sparc::fixup_sparc_tls_le_lox10: return ELF::R_SPARC_TLS_LE_LOX10; + case Sparc::fixup_sparc_hix22: return ELF::R_SPARC_HIX22; + case Sparc::fixup_sparc_lox10: return ELF::R_SPARC_LOX10; + case Sparc::fixup_sparc_gotdata_hix22: return ELF::R_SPARC_GOTDATA_HIX22; + case Sparc::fixup_sparc_gotdata_lox10: return ELF::R_SPARC_GOTDATA_LOX10; + case Sparc::fixup_sparc_gotdata_op: return ELF::R_SPARC_GOTDATA_OP; } return ELF::R_SPARC_NONE; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h index e0a43095ec0b..701d8513e657 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h @@ -95,6 +95,18 @@ namespace llvm { fixup_sparc_tls_le_hix22, fixup_sparc_tls_le_lox10, + /// 22-bit fixup corresponding to %hix(foo) + fixup_sparc_hix22, + /// 13-bit fixup corresponding to %lox(foo) + fixup_sparc_lox10, + + /// 22-bit fixup corresponding to %gdop_hix22(foo) + fixup_sparc_gotdata_hix22, + /// 13-bit fixup corresponding to %gdop_lox10(foo) + fixup_sparc_gotdata_lox10, + /// 32-bit fixup corresponding to %gdop(foo) + fixup_sparc_gotdata_op, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 9f8522541332..d75d41b35838 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -104,17 +104,21 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, support::endian::write(OS, Bits, Ctx.getAsmInfo()->isLittleEndian() ? support::little : support::big); - unsigned tlsOpNo = 0; + + // Some instructions have phantom operands that only contribute a fixup entry. + unsigned SymOpNo = 0; switch (MI.getOpcode()) { default: break; - case SP::TLS_CALL: tlsOpNo = 1; break; + case SP::TLS_CALL: SymOpNo = 1; break; + case SP::GDOP_LDrr: + case SP::GDOP_LDXrr: case SP::TLS_ADDrr: case SP::TLS_ADDXrr: case SP::TLS_LDrr: - case SP::TLS_LDXrr: tlsOpNo = 3; break; + case SP::TLS_LDXrr: SymOpNo = 3; break; } - if (tlsOpNo != 0) { - const MCOperand &MO = MI.getOperand(tlsOpNo); + if (SymOpNo != 0) { + const MCOperand &MO = MI.getOperand(SymOpNo); uint64_t op = getMachineOpValue(MI, MO, Fixups, STI); assert(op == 0 && "Unexpected operand value!"); (void)op; // suppress warning. 
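
Note on the new %hix/%lox fixups above: adjustFixupValue stores the complemented upper bits for hix22 and ORs 0x1c00 into lox10 so that bit 12 of the 13-bit immediate is set. A minimal standalone check of that identity follows (my own illustration, not part of the patch; the constant is arbitrary): in the usual V9 "sethi %hix(v), %rd; xor %rd, %lox(v), %rd" idiom for negative 32-bit constants, the xor immediate sign-extends and flips the complemented bits back.

#include <cassert>
#include <cstdint>

int main() {
  int64_t V = -1234567;                 // any negative 32-bit constant, sign-extended
  uint64_t Hix = (~V >> 10) & 0x3fffff; // same arithmetic as fixup_sparc_hix22
  uint64_t Lox = (V & 0x3ff) | 0x1c00;  // same arithmetic as fixup_sparc_lox10

  int64_t R = (int64_t)(Hix << 10);     // sethi %hix(V), %rd: imm22 lands in bits 31..10
  int64_t Simm13 = (int64_t)(Lox ^ 0x1000) - 0x1000; // the 13-bit immediate sign-extends
  R ^= Simm13;                          // xor %rd, %lox(V), %rd
  assert(R == V);
  return 0;
}
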
@@ -253,7 +257,6 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, #include "SparcGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new SparcMCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index c2db4526ef66..cc73ea7e6120 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -80,6 +81,11 @@ bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind) case VK_Sparc_TLS_IE_ADD: OS << "%tie_add("; return true; case VK_Sparc_TLS_LE_HIX22: OS << "%tle_hix22("; return true; case VK_Sparc_TLS_LE_LOX10: OS << "%tle_lox10("; return true; + case VK_Sparc_HIX22: OS << "%hix("; return true; + case VK_Sparc_LOX10: OS << "%lox("; return true; + case VK_Sparc_GOTDATA_HIX22: OS << "%gdop_hix22("; return true; + case VK_Sparc_GOTDATA_LOX10: OS << "%gdop_lox10("; return true; + case VK_Sparc_GOTDATA_OP: OS << "%gdop("; return true; } llvm_unreachable("Unhandled SparcMCExpr::VariantKind"); } @@ -119,6 +125,11 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name) .Case("tie_add", VK_Sparc_TLS_IE_ADD) .Case("tle_hix22", VK_Sparc_TLS_LE_HIX22) .Case("tle_lox10", VK_Sparc_TLS_LE_LOX10) + .Case("hix", VK_Sparc_HIX22) + .Case("lox", VK_Sparc_LOX10) + .Case("gdop_hix22", VK_Sparc_GOTDATA_HIX22) + .Case("gdop_lox10", VK_Sparc_GOTDATA_LOX10) + .Case("gdop", VK_Sparc_GOTDATA_OP) .Default(VK_Sparc_None); } @@ -159,6 +170,11 @@ Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) { case VK_Sparc_TLS_IE_ADD: return Sparc::fixup_sparc_tls_ie_add; case VK_Sparc_TLS_LE_HIX22: return Sparc::fixup_sparc_tls_le_hix22; case VK_Sparc_TLS_LE_LOX10: return Sparc::fixup_sparc_tls_le_lox10; + case VK_Sparc_HIX22: return Sparc::fixup_sparc_hix22; + case VK_Sparc_LOX10: return Sparc::fixup_sparc_lox10; + case VK_Sparc_GOTDATA_HIX22: return Sparc::fixup_sparc_gotdata_hix22; + case VK_Sparc_GOTDATA_LOX10: return Sparc::fixup_sparc_gotdata_lox10; + case VK_Sparc_GOTDATA_OP: return Sparc::fixup_sparc_gotdata_op; } } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index 504e959194f5..d98ad26c96a9 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -58,7 +58,12 @@ public: VK_Sparc_TLS_IE_LDX, VK_Sparc_TLS_IE_ADD, VK_Sparc_TLS_LE_HIX22, - VK_Sparc_TLS_LE_LOX10 + VK_Sparc_TLS_LE_LOX10, + VK_Sparc_HIX22, + VK_Sparc_LOX10, + VK_Sparc_GOTDATA_HIX22, + VK_Sparc_GOTDATA_LOX10, + VK_Sparc_GOTDATA_OP, }; private: diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index f360946b9a79..7ef043d9df40 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -29,7 +29,6 @@ class MCTargetOptions; class Target; MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createSparcAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/Sparc/SparcCallingConv.td 
b/llvm/lib/Target/Sparc/SparcCallingConv.td index db540d6f0c42..e6d23f741ea5 100644 --- a/llvm/lib/Target/Sparc/SparcCallingConv.td +++ b/llvm/lib/Target/Sparc/SparcCallingConv.td @@ -134,7 +134,7 @@ def RetCC_Sparc64 : CallingConv<[ // Callee-saved registers are handled by the register window mechanism. def CSR : CalleeSavedRegs<(add)> { let OtherPreserved = (add (sequence "I%u", 0, 7), - (sequence "L%u", 0, 7)); + (sequence "L%u", 0, 7), O6); } // Callee-saved registers for calls with ReturnsTwice attribute. diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index a740de9123c9..000418be9a9e 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -218,8 +218,9 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, const SparcInstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); DebugLoc dl = MBBI->getDebugLoc(); - assert(MBBI->getOpcode() == SP::RETL && - "Can only put epilog before 'retl' instruction!"); + assert((MBBI->getOpcode() == SP::RETL || MBBI->getOpcode() == SP::TAIL_CALL || + MBBI->getOpcode() == SP::TAIL_CALLri) && + "Can only put epilog before 'retl' or 'tail_call' instruction!"); if (!FuncInfo->isLeafProc()) { BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0) .addReg(SP::G0); @@ -228,10 +229,19 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); int NumBytes = (int) MFI.getStackSize(); - if (NumBytes == 0) - return; - - emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); + if (NumBytes != 0) + emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); + + // Preserve return address in %o7 + if (MBBI->getOpcode() == SP::TAIL_CALL) { + MBB.addLiveIn(SP::O7); + BuildMI(MBB, MBBI, dl, TII.get(SP::ORrr), SP::G1) + .addReg(SP::G0) + .addReg(SP::O7); + BuildMI(MBB, MBBI, dl, TII.get(SP::ORrr), SP::O7) + .addReg(SP::G0) + .addReg(SP::G1); + } } bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { @@ -316,10 +326,11 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - return !(MFI.hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %sp is used - || hasFP(MF)); // need %fp + return !(MFI.hasCalls() // has calls + || MRI.isPhysRegUsed(SP::L0) // Too many registers needed + || MRI.isPhysRegUsed(SP::O6) // %sp is used + || hasFP(MF) // need %fp + || MF.hasInlineAsm()); // has inline assembly } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 6d6879bc94b3..2cb74e7709c7 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -710,6 +710,36 @@ static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee, return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice); } +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. +bool SparcTargetLowering::IsEligibleForTailCallOptimization( + CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF) const { + + auto &Outs = CLI.Outs; + auto &Caller = MF.getFunction(); + + // Do not tail call opt functions with "disable-tail-calls" attribute. 
+  if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+    return false;
+
+  // Do not tail call opt if the stack is used to pass parameters.
+  if (CCInfo.getNextStackOffset() != 0)
+    return false;
+
+  // Do not tail call opt if either the callee or caller returns
+  // a struct and the other does not.
+  if (!Outs.empty() && Caller.hasStructRetAttr() != Outs[0].Flags.isSRet())
+    return false;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call.
+  for (auto &Arg : Outs)
+    if (Arg.Flags.isByVal())
+      return false;
+
+  return true;
+}
+
 // Lower a call for the 32-bit ABI.
 SDValue
 SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
@@ -725,15 +755,15 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   CallingConv::ID CallConv = CLI.CallConv;
   bool isVarArg = CLI.IsVarArg;
 
-  // Sparc target does not yet support tail call optimization.
-  isTailCall = false;
-
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
 
+  isTailCall = isTailCall && IsEligibleForTailCallOptimization(
+                                 CCInfo, CLI, DAG.getMachineFunction());
+
   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();
 
@@ -771,7 +801,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
     }
   }
 
-  Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
+  assert(!isTailCall || ArgsSize == 0);
+
+  if (!isTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
@@ -816,6 +849,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
 
     if (Flags.isSRet()) {
       assert(VA.needsCustom());
+
+      if (isTailCall)
+        continue;
+
       // store SRet argument in %sp+64
       SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
       SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
@@ -825,9 +862,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
       hasStructRetAttr = true;
       // sret only allowed on first argument
       assert(Outs[realArgIdx].OrigArgIndex == 0);
-      PointerType *Ty = cast<PointerType>(CLI.getArgs()[0].Ty);
-      Type *ElementTy = Ty->getPointerElementType();
-      SRetArgSize = DAG.getDataLayout().getTypeAllocSize(ElementTy);
+      SRetArgSize =
+          DAG.getDataLayout().getTypeAllocSize(CLI.getArgs()[0].IndirectType);
       continue;
     }
 
@@ -929,7 +965,9 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   // stuck together.
SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Register Reg = toCallerWindow(RegsToPass[i].first); + Register Reg = RegsToPass[i].first; + if (!isTailCall) + Reg = toCallerWindow(Reg); Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } @@ -953,9 +991,12 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Callee); if (hasStructRetAttr) Ops.push_back(DAG.getTargetConstant(SRetArgSize, dl, MVT::i32)); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first), - RegsToPass[i].second.getValueType())); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Register Reg = RegsToPass[i].first; + if (!isTailCall) + Reg = toCallerWindow(Reg); + Ops.push_back(DAG.getRegister(Reg, RegsToPass[i].second.getValueType())); + } // Add a register mask operand representing the call-preserved registers. const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -969,6 +1010,11 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, if (InFlag.getNode()) Ops.push_back(InFlag); + if (isTailCall) { + DAG.getMachineFunction().getFrameInfo().setHasTailCall(); + return DAG.getNode(SPISD::TAIL_CALL, dl, MVT::Other, Ops); + } + Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); @@ -1408,7 +1454,7 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0)); + MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Instructions which use registers as conditionals examine all the // bits (as does the pseudo SELECT_CC expansion). 
I don't think it @@ -1853,6 +1899,8 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const { case SPISD::TLS_ADD: return "SPISD::TLS_ADD"; case SPISD::TLS_LD: return "SPISD::TLS_LD"; case SPISD::TLS_CALL: return "SPISD::TLS_CALL"; + case SPISD::TAIL_CALL: return "SPISD::TAIL_CALL"; + case SPISD::LOAD_GDOP: return "SPISD::LOAD_GDOP"; } return nullptr; } @@ -2178,8 +2226,10 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, RetPtr = DAG.getFrameIndex(RetFI, PtrVT); Entry.Node = RetPtr; Entry.Ty = PointerType::getUnqual(RetTy); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { Entry.IsSRet = true; + Entry.IndirectType = RetTy; + } Entry.IsReturned = false; Args.push_back(Entry); RetTyABI = Type::getVoidTy(*DAG.getContext()); @@ -3126,6 +3176,11 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case SP::SELECT_CC_DFP_ICC: case SP::SELECT_CC_QFP_ICC: return expandSelectCC(MI, BB, SP::BCOND); + case SP::SELECT_CC_Int_XCC: + case SP::SELECT_CC_FP_XCC: + case SP::SELECT_CC_DFP_XCC: + case SP::SELECT_CC_QFP_XCC: + return expandSelectCC(MI, BB, SP::BPXCC); case SP::SELECT_CC_Int_FCC: case SP::SELECT_CC_FP_FCC: case SP::SELECT_CC_DFP_FCC: @@ -3276,6 +3331,9 @@ std::pair SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + if (Constraint.empty()) + return std::make_pair(0U, nullptr); + if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -3304,46 +3362,60 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // This will generate an error message return std::make_pair(0U, nullptr); } - } else if (!Constraint.empty() && Constraint.size() <= 5 - && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { - // constraint = '{r}' - // Remove the braces from around the name. - StringRef name(Constraint.data()+1, Constraint.size()-2); - // Handle register aliases: - // r0-r7 -> g0-g7 - // r8-r15 -> o0-o7 - // r16-r23 -> l0-l7 - // r24-r31 -> i0-i7 - uint64_t intVal = 0; - if (name.substr(0, 1).equals("r") - && !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) { - const char regTypes[] = { 'g', 'o', 'l', 'i' }; - char regType = regTypes[intVal/8]; - char regIdx = '0' + (intVal % 8); - char tmp[] = { '{', regType, regIdx, '}', 0 }; - std::string newConstraint = std::string(tmp); - return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint, - VT); - } - if (name.substr(0, 1).equals("f") && - !name.substr(1).getAsInteger(10, intVal) && intVal <= 63) { - std::string newConstraint; - - if (VT == MVT::f32 || VT == MVT::Other) { - newConstraint = "{f" + utostr(intVal) + "}"; - } else if (VT == MVT::f64 && (intVal % 2 == 0)) { - newConstraint = "{d" + utostr(intVal / 2) + "}"; - } else if (VT == MVT::f128 && (intVal % 4 == 0)) { - newConstraint = "{q" + utostr(intVal / 4) + "}"; - } else { - return std::make_pair(0U, nullptr); - } - return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint, - VT); + } + + if (Constraint.front() != '{') + return std::make_pair(0U, nullptr); + + assert(Constraint.back() == '}' && "Not a brace enclosed constraint?"); + StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); + if (RegName.empty()) + return std::make_pair(0U, nullptr); + + unsigned long long RegNo; + // Handle numbered register aliases. 
+ if (RegName[0] == 'r' && + getAsUnsignedInteger(RegName.begin() + 1, 10, RegNo)) { + // r0-r7 -> g0-g7 + // r8-r15 -> o0-o7 + // r16-r23 -> l0-l7 + // r24-r31 -> i0-i7 + if (RegNo > 31) + return std::make_pair(0U, nullptr); + const char RegTypes[] = {'g', 'o', 'l', 'i'}; + char RegType = RegTypes[RegNo / 8]; + char RegIndex = '0' + (RegNo % 8); + char Tmp[] = {'{', RegType, RegIndex, '}', 0}; + return getRegForInlineAsmConstraint(TRI, Tmp, VT); + } + + // Rewrite the fN constraint according to the value type if needed. + if (VT != MVT::f32 && VT != MVT::Other && RegName[0] == 'f' && + getAsUnsignedInteger(RegName.begin() + 1, 10, RegNo)) { + if (VT == MVT::f64 && (RegNo % 2 == 0)) { + return getRegForInlineAsmConstraint( + TRI, StringRef("{d" + utostr(RegNo / 2) + "}"), VT); + } else if (VT == MVT::f128 && (RegNo % 4 == 0)) { + return getRegForInlineAsmConstraint( + TRI, StringRef("{q" + utostr(RegNo / 4) + "}"), VT); + } else { + return std::make_pair(0U, nullptr); } } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto ResultPair = + TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (!ResultPair.second) + return std::make_pair(0U, nullptr); + + // Force the use of I64Regs over IntRegs for 64-bit values. + if (Subtarget->is64Bit() && VT == MVT::i64) { + assert(ResultPair.second == &SP::IntRegsRegClass && + "Unexpected register class"); + return std::make_pair(ResultPair.first, &SP::I64RegsRegClass); + } + + return ResultPair; } bool diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 5c9703823a64..2768bb20566a 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -44,9 +44,13 @@ namespace llvm { GLOBAL_BASE_REG, // Global base reg for PIC. FLUSHW, // FLUSH register windows to stack. + TAIL_CALL, // Tail call + TLS_ADD, // For Thread Local Storage (TLS). TLS_LD, - TLS_CALL + TLS_CALL, + + LOAD_GDOP, // Load operation w/ gdop relocation. }; } @@ -182,6 +186,10 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + bool IsEligibleForTailCallOptimization(CCState &CCInfo, + CallLoweringInfo &CLI, + MachineFunction &MF) const; + bool ShouldShrinkFPConstant(EVT VT) const override { // Do not shrink FP constpool if VT == MVT::f128. // (ldd, call _Q_fdtoq) is more expensive than two ldds. 
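
A quick illustration of the constraint-rewriting arithmetic in getRegForInlineAsmConstraint above (my own sketch, not code from the tree): rN aliases select the register-window bank by RegNo / 8, and an fN constraint is retargeted to the overlapping double or quad register by halving or quartering the number.

#include <cstdio>

int main() {
  // rN aliases fold onto the windowed registers exactly as in the code above:
  // g for r0-r7, o for r8-r15, l for r16-r23, i for r24-r31.
  const char RegTypes[] = {'g', 'o', 'l', 'i'};
  for (unsigned RegNo : {0u, 7u, 8u, 15u, 16u, 24u, 31u})
    printf("{r%u} -> {%c%u}\n", RegNo, RegTypes[RegNo / 8], RegNo % 8);

  // An fN constraint names the same bits as a wider overlapping register:
  // {f20} becomes {d10} for an f64 value and {q5} for an f128 value.
  unsigned FN = 20;
  printf("{f%u} -> {d%u} (f64), {q%u} (f128)\n", FN, FN / 2, FN / 4);
  return 0;
}
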
diff --git a/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/llvm/lib/Target/Sparc/SparcInstr64Bit.td index df65c5457c1d..a471d65201c3 100644 --- a/llvm/lib/Target/Sparc/SparcInstr64Bit.td +++ b/llvm/lib/Target/Sparc/SparcInstr64Bit.td @@ -163,7 +163,7 @@ defm ADDX : F3_12<"add", 0b000000, add, I64Regs, i64, i64imm>; defm SUBX : F3_12<"sub", 0b000100, sub, I64Regs, i64, i64imm>; def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd), - (ins I64Regs:$rs1, I64Regs:$rs2, TLSSym:$sym), + (ins I64Regs:$rs1, I64Regs:$rs2, TailRelocSymTLSAdd:$sym), "add $rs1, $rs2, $rd, $sym", [(set i64:$rd, (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>; @@ -238,12 +238,20 @@ let Predicates = [Is64Bit] in { let DecoderMethod = "DecodeLoadInt" in defm LDX : Load<"ldx", 0b001011, load, I64Regs, i64>; -let mayLoad = 1, isAsmParserOnly = 1 in +let mayLoad = 1, isAsmParserOnly = 1 in { def TLS_LDXrr : F3_1<3, 0b001011, - (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym), + (outs IntRegs:$dst), + (ins MEMrr:$addr, TailRelocSymTLSLoad:$sym), "ldx [$addr], $dst, $sym", [(set i64:$dst, (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>; + def GDOP_LDXrr : F3_1<3, 0b001011, + (outs I64Regs:$dst), + (ins MEMrr:$addr, TailRelocSymGOTLoad:$sym), + "ldx [$addr], $dst, $sym", + [(set i64:$dst, + (load_gdop ADDRrr:$addr, tglobaladdr:$sym))]>; +} // Extending loads to i64. def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>; @@ -336,6 +344,7 @@ def FMOVD_XCC : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd), "fmovd$cond %xcc, $rs2, $rd", [(set f64:$rd, (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>; +let Predicates = [Is64Bit, HasHardQuad] in def FMOVQ_XCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd), (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond), "fmovq$cond %xcc, $rs2, $rd", @@ -436,11 +445,11 @@ def FXTOD : F3_3u<2, 0b110100, 0b010001000, (outs DFPRegs:$rd), (ins DFPRegs:$rs2), "fxtod $rs2, $rd", [(set DFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>; +let Predicates = [Is64Bit, HasHardQuad] in def FXTOQ : F3_3u<2, 0b110100, 0b010001100, (outs QFPRegs:$rd), (ins DFPRegs:$rs2), "fxtoq $rs2, $rd", - [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>, - Requires<[HasHardQuad]>; + [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>; def FSTOX : F3_3u<2, 0b110100, 0b010000001, (outs DFPRegs:$rd), (ins FPRegs:$rs2), @@ -450,11 +459,11 @@ def FDTOX : F3_3u<2, 0b110100, 0b010000010, (outs DFPRegs:$rd), (ins DFPRegs:$rs2), "fdtox $rs2, $rd", [(set DFPRegs:$rd, (SPftox DFPRegs:$rs2))]>; +let Predicates = [Is64Bit, HasHardQuad] in def FQTOX : F3_3u<2, 0b110100, 0b010000011, (outs DFPRegs:$rd), (ins QFPRegs:$rs2), "fqtox $rs2, $rd", - [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>, - Requires<[HasHardQuad]>; + [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>; } // Predicates = [Is64Bit] diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 5e305fc9df71..481bd7d2f7fa 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -147,7 +147,29 @@ def MEMri : Operand { let ParserMatchClass = SparcMEMriAsmOperand; } -def TLSSym : Operand; +// Represents a tail relocation operand for instructions such as add, ld, call. 
+class SparcTailRelocSymAsmOperand<string Kind> : AsmOperandClass {
+  let Name = "TailRelocSym" # Kind;
+  let RenderMethod = "addTailRelocSymOperands";
+  let PredicateMethod = "isTailRelocSym";
+  let ParserMethod = "parseTailRelocSym";
+}
+
+def TailRelocSymGOTLoad : Operand<iPTR> {
+  let ParserMatchClass = SparcTailRelocSymAsmOperand<"Load_GOT">;
+}
+
+def TailRelocSymTLSAdd : Operand<iPTR> {
+  let ParserMatchClass = SparcTailRelocSymAsmOperand<"Add_TLS">;
+}
+
+def TailRelocSymTLSLoad : Operand<iPTR> {
+  let ParserMatchClass = SparcTailRelocSymAsmOperand<"Load_TLS">;
+}
+
+def TailRelocSymTLSCall : Operand<iPTR> {
+  let ParserMatchClass = SparcTailRelocSymAsmOperand<"Call_TLS">;
+}
 
 def SparcMembarTagAsmOperand : AsmOperandClass {
   let Name = "MembarTag";
@@ -214,6 +236,9 @@ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDTSPtlsld :
 SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
 
+def SDTSPloadgdop :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
 def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
 def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
 def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
@@ -248,6 +273,10 @@ def call : SDNode<"SPISD::CALL", SDT_SPCall,
                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                    SDNPVariadic]>;
 
+def tailcall : SDNode<"SPISD::TAIL_CALL", SDT_SPCall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+
 def SDT_SPRet : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
 def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
                      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -261,6 +290,8 @@ def tlscall : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                       SDNPVariadic]>;
 
+def load_gdop : SDNode<"SPISD::LOAD_GDOP", SDTSPloadgdop>;
+
 def getPCX : Operand<iPTR> {
   let PrintMethod = "printGetPCX";
 }
@@ -484,6 +515,27 @@ let Uses = [ICC], usesCustomInserter = 1 in {
                       [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
 }
 
+let Uses = [ICC], usesCustomInserter = 1 in {
+  def SELECT_CC_Int_XCC
+      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+               "; SELECT_CC_Int_XCC PSEUDO!",
+               [(set i32:$dst, (SPselectxcc i32:$T, i32:$F, imm:$Cond))]>;
+  def SELECT_CC_FP_XCC
+      : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+               "; SELECT_CC_FP_XCC PSEUDO!",
+               [(set f32:$dst, (SPselectxcc f32:$T, f32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_DFP_XCC
+      : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+               "; SELECT_CC_DFP_XCC PSEUDO!",
+               [(set f64:$dst, (SPselectxcc f64:$T, f64:$F, imm:$Cond))]>;
+
+  def SELECT_CC_QFP_XCC
+      : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+               "; SELECT_CC_QFP_XCC PSEUDO!",
+               [(set f128:$dst, (SPselectxcc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
 let usesCustomInserter = 1, Uses = [FCC0] in {
 
   def SELECT_CC_Int_FCC
@@ -562,6 +614,15 @@ let DecoderMethod = "DecodeLoadFP" in
   }
 }
 
+let mayLoad = 1, isAsmParserOnly = 1 in {
+  def GDOP_LDrr : F3_1<3, 0b000000,
+                       (outs IntRegs:$dst),
+                       (ins MEMrr:$addr, TailRelocSymGOTLoad:$sym),
+                       "ld [$addr], $dst, $sym",
+                       [(set i32:$dst,
+                           (load_gdop ADDRrr:$addr, tglobaladdr:$sym))]>;
+}
+
 // Section B.4 - Store Integer Instructions, p.
95 let DecoderMethod = "DecodeStoreInt" in { defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>; @@ -1344,21 +1405,24 @@ let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in { let isAsmParserOnly = 1 in { def TLS_ADDrr : F3_1<2, 0b000000, (outs IntRegs:$rd), - (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym), + (ins IntRegs:$rs1, IntRegs:$rs2, TailRelocSymTLSAdd:$sym), "add $rs1, $rs2, $rd, $sym", [(set i32:$rd, (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>; -let mayLoad = 1 in +let mayLoad = 1 in { def TLS_LDrr : F3_1<3, 0b000000, - (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym), + (outs IntRegs:$dst), + (ins MEMrr:$addr, TailRelocSymTLSLoad:$sym), "ld [$addr], $dst, $sym", [(set i32:$dst, (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>; +} let Uses = [O6], isCall = 1, hasDelaySlot = 1 in def TLS_CALL : InstSP<(outs), - (ins calltarget:$disp, TLSSym:$sym, variable_ops), + (ins calltarget:$disp, TailRelocSymTLSCall:$sym, + variable_ops), "call $disp, $sym", [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)], IIC_jmp_or_call> { @@ -1368,6 +1432,31 @@ let Uses = [O6], isCall = 1, hasDelaySlot = 1 in } } +//===----------------------------------------------------------------------===// +// Instructions for tail calls. +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isReturn = 1, hasDelaySlot = 1, + isTerminator = 1, isBarrier = 1 in { + def TAIL_CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops), + "call $disp", + [(tailcall tglobaladdr:$disp)]> { + bits<30> disp; + let op = 1; + let Inst{29-0} = disp; + } +} + +def : Pat<(tailcall (iPTR texternalsym:$dst)), + (TAIL_CALL texternalsym:$dst)>; + +let isCodeGenOnly = 1, isReturn = 1, hasDelaySlot = 1, isTerminator = 1, + isBarrier = 1, rd = 0 in { + def TAIL_CALLri : F3_2<2, 0b111000, + (outs), (ins MEMri:$ptr, variable_ops), + "jmp $ptr", + [(tailcall ADDRri:$ptr)]>; +} + //===----------------------------------------------------------------------===// // V9 Instructions //===----------------------------------------------------------------------===// @@ -1415,12 +1504,12 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in { (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond), "fmovd$cond %icc, $rs2, $rd", [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>; + let Predicates = [HasV9, HasHardQuad] in def FMOVQ_ICC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd), (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond), "fmovq$cond %icc, $rs2, $rd", - [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>, - Requires<[HasHardQuad]>; + [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>; } let Uses = [FCC0], intcc = 0, opf_cc = 0b00 in { @@ -1434,12 +1523,12 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in { (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond), "fmovd$cond %fcc0, $rs2, $rd", [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>; + let Predicates = [HasV9, HasHardQuad] in def FMOVQ_FCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd), (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond), "fmovq$cond %fcc0, $rs2, $rd", - [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>, - Requires<[HasHardQuad]>; + [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>; } } @@ -1449,28 +1538,28 @@ let Predicates = [HasV9] in { def FMOVD : F3_3u<2, 0b110100, 0b000000010, (outs DFPRegs:$rd), (ins DFPRegs:$rs2), "fmovd $rs2, $rd", []>; + let Predicates = [HasV9, HasHardQuad] in def FMOVQ : F3_3u<2, 0b110100, 0b000000011, (outs QFPRegs:$rd), (ins 
QFPRegs:$rs2), - "fmovq $rs2, $rd", []>, - Requires<[HasHardQuad]>; + "fmovq $rs2, $rd", []>; def FNEGD : F3_3u<2, 0b110100, 0b000000110, (outs DFPRegs:$rd), (ins DFPRegs:$rs2), "fnegd $rs2, $rd", [(set f64:$rd, (fneg f64:$rs2))]>; + let Predicates = [HasV9, HasHardQuad] in def FNEGQ : F3_3u<2, 0b110100, 0b000000111, (outs QFPRegs:$rd), (ins QFPRegs:$rs2), "fnegq $rs2, $rd", - [(set f128:$rd, (fneg f128:$rs2))]>, - Requires<[HasHardQuad]>; + [(set f128:$rd, (fneg f128:$rs2))]>; def FABSD : F3_3u<2, 0b110100, 0b000001010, (outs DFPRegs:$rd), (ins DFPRegs:$rs2), "fabsd $rs2, $rd", [(set f64:$rd, (fabs f64:$rs2))]>; + let Predicates = [HasV9, HasHardQuad] in def FABSQ : F3_3u<2, 0b110100, 0b000001011, (outs QFPRegs:$rd), (ins QFPRegs:$rs2), "fabsq $rs2, $rd", - [(set f128:$rd, (fabs f128:$rs2))]>, - Requires<[HasHardQuad]>; + [(set f128:$rd, (fabs f128:$rs2))]>; } // Floating-point compare instruction with %fcc0-%fcc3. @@ -1517,11 +1606,11 @@ let Predicates = [HasV9] in { : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd), (ins FCCRegs:$opf_cc, DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond), "fmovd$cond $opf_cc, $rs2, $rd", []>; + let Predicates = [HasV9, HasHardQuad] in def V9FMOVQ_FCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd), (ins FCCRegs:$opf_cc, QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond), - "fmovq$cond $opf_cc, $rs2, $rd", []>, - Requires<[HasHardQuad]>; + "fmovq$cond $opf_cc, $rs2, $rd", []>; } // Constraints = "$f = $rd", ... } // let Predicates = [hasV9] diff --git a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp index 7c36c4ab865f..01db1f3747eb 100644 --- a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp @@ -11,3 +11,10 @@ using namespace llvm; void SparcMachineFunctionInfo::anchor() { } + +MachineFunctionInfo *SparcMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} diff --git a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h index d557c8ea22e2..e1a1568d28a2 100644 --- a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h +++ b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h @@ -38,6 +38,11 @@ namespace llvm { : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0), IsLeafProc(false) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + Register getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; } diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index 27c49a408a02..8bd51a703d47 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -55,7 +55,7 @@ static std::string computeDataLayout(const Triple &T, bool is64Bit) { } static Reloc::Model getEffectiveRelocModel(Optional RM) { - return RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } // Code models. Some only make sense for 64-bit code. 
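
The clone() hook added to SparcMachineFunctionInfo above relies on MachineFunction::cloneInfo, a member template that copy-constructs the concrete per-function info inside the destination function's allocator. A sketch of the pattern, assuming the cloneInfo<> member template spelling:

#include "SparcMachineFunctionInfo.h"

using namespace llvm;

// cloneInfo is a member template on MachineFunction; the concrete info type
// (here SparcMachineFunctionInfo) is passed explicitly so DestMF allocates
// and copy-constructs the right derived object.
MachineFunctionInfo *SparcMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SparcMachineFunctionInfo>(*this);
}
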
@@ -102,7 +102,7 @@ SparcTargetMachine::SparcTargetMachine( initAsmInfo(); } -SparcTargetMachine::~SparcTargetMachine() {} +SparcTargetMachine::~SparcTargetMachine() = default; const SparcSubtarget * SparcTargetMachine::getSubtargetImpl(const Function &F) const { diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h index f30ddc7b4955..28ab13918042 100644 --- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h @@ -18,7 +18,7 @@ class TargetMachine; class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SparcELFTargetObjectFile() {} + SparcELFTargetObjectFile() = default; void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 40ed417d0817..60e1b05a6d1a 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" @@ -1589,9 +1590,11 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, if (getParser().parseExpression(Expr)) return MatchOperand_NoMatch; - auto isOutOfRangeConstant = [&](const MCExpr *E) -> bool { + auto isOutOfRangeConstant = [&](const MCExpr *E, bool Negate) -> bool { if (auto *CE = dyn_cast(E)) { int64_t Value = CE->getValue(); + if (Negate) + Value = -Value; if ((Value & 1) || Value < MinVal || Value > MaxVal) return true; } @@ -1605,7 +1608,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, Error(StartLoc, "Expected PC-relative expression"); return MatchOperand_ParseFail; } - if (isOutOfRangeConstant(CE)) { + if (isOutOfRangeConstant(CE, false)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } @@ -1620,8 +1623,9 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, // For consistency with the GNU assembler, conservatively assume that a // constant offset must by itself be within the given size range. 
if (const auto *BE = dyn_cast(Expr)) - if (isOutOfRangeConstant(BE->getLHS()) || - isOutOfRangeConstant(BE->getRHS())) { + if (isOutOfRangeConstant(BE->getLHS(), false) || + isOutOfRangeConstant(BE->getRHS(), + BE->getOpcode() == MCBinaryExpr::Sub)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 5eba150dadc3..979141a1962a 100644 --- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -9,8 +9,8 @@ #include "MCTargetDesc/SystemZMCTargetDesc.h" #include "SystemZ.h" #include "TargetInfo/SystemZTargetInfo.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -73,10 +73,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZDisassembler() { static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, uint64_t Width, MCInst &MI, - const void *Decoder) { - const MCDisassembler *Dis = static_cast(Decoder); - return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, - Offset, Width); + const MCDisassembler *Decoder) { + return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, + Width, /*InstSize=*/0); } static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, @@ -91,79 +90,79 @@ static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs, 16); } static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs, 16); } static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16); } static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs, 16); } -static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16); } static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs, 16); } static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs, 16); } static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, 
SystemZMC::FP128Regs, 16); } static DecodeStatus DecodeVR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::VR32Regs, 32); } static DecodeStatus DecodeVR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::VR64Regs, 32); } static DecodeStatus DecodeVR128BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::VR128Regs, 32); } static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16); } static DecodeStatus DecodeCR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeRegisterClass(Inst, RegNo, SystemZMC::CR64Regs, 16); } @@ -184,70 +183,81 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) { } static DecodeStatus decodeU1ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<1>(Inst, Imm); } static DecodeStatus decodeU2ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<2>(Inst, Imm); } static DecodeStatus decodeU3ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<3>(Inst, Imm); } static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<4>(Inst, Imm); } static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<6>(Inst, Imm); } static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<8>(Inst, Imm); } static DecodeStatus decodeU12ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<12>(Inst, Imm); } static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<16>(Inst, Imm); } static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeUImmOperand<32>(Inst, Imm); } static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeSImmOperand<8>(Inst, Imm); } static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeSImmOperand<16>(Inst, Imm); } static DecodeStatus 
decodeS32ImmOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return decodeSImmOperand<32>(Inst, Imm); } -template +template static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm, - uint64_t Address, - bool isBranch, - const void *Decoder) { + uint64_t Address, bool isBranch, + const MCDisassembler *Decoder) { assert(isUInt(Imm) && "Invalid PC-relative offset"); uint64_t Value = SignExtend64(Imm) * 2 + Address; @@ -260,31 +270,31 @@ static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm, static DecodeStatus decodePC12DBLBranchOperand(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodePCDBLOperand<12>(Inst, Imm, Address, true, Decoder); } static DecodeStatus decodePC16DBLBranchOperand(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodePCDBLOperand<16>(Inst, Imm, Address, true, Decoder); } static DecodeStatus decodePC24DBLBranchOperand(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodePCDBLOperand<24>(Inst, Imm, Address, true, Decoder); } static DecodeStatus decodePC32DBLBranchOperand(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodePCDBLOperand<32>(Inst, Imm, Address, true, Decoder); } static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodePCDBLOperand<32>(Inst, Imm, Address, false, Decoder); } @@ -382,64 +392,61 @@ static DecodeStatus decodeBDVAddr12Operand(MCInst &Inst, uint64_t Field, static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs); } static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs); } static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs); } static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field, uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field, uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst, - uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDLAddr64Disp12Len4Operand(MCInst &Inst, uint64_t Field, 
uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst, - uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDLAddr64Disp12Len8Operand(MCInst &Inst, uint64_t Field, uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDRAddr64Disp12Operand(MCInst &Inst, - uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDRAddr64Disp12Operand(MCInst &Inst, uint64_t Field, uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDRAddr12Operand(Inst, Field, SystemZMC::GR64Regs); } -static DecodeStatus decodeBDVAddr64Disp12Operand(MCInst &Inst, uint64_t Field, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +decodeBDVAddr64Disp12Operand(MCInst &Inst, uint64_t Field, uint64_t Address, + const MCDisassembler *Decoder) { return decodeBDVAddr12Operand(Inst, Field, SystemZMC::GR64Regs); } diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index c83796b8579b..242f566da2c9 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -37,6 +37,8 @@ class SystemZMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; MCContext &Ctx; + mutable unsigned MemOpsEmitted; + public: SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : MCII(mcii), Ctx(ctx) { @@ -165,6 +167,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, verifyInstructionPredicates(MI, computeAvailableFeatures(STI.getFeatureBits())); + MemOpsEmitted = 0; uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); unsigned Size = MCII.get(MI.getOpcode()).getSize(); // Big-endian insertion of Size bytes. @@ -191,12 +194,14 @@ getDispOpValue(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, SystemZ::FixupKind Kind) const { const MCOperand &MO = MI.getOperand(OpNum); - if (MO.isImm()) + if (MO.isImm()) { + ++MemOpsEmitted; return static_cast(MO.getImm()); + } if (MO.isExpr()) { // All instructions follow the pattern where the first displacement has a // 2 bytes offset, and the second one 4 bytes. - unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4; + unsigned ByteOffs = MemOpsEmitted++ == 0 ? 
2 : 4; Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind, MI.getLoc())); assert(Fixups.size() <= 2 && "More than two memory operands in MI?"); @@ -328,7 +333,6 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, #include "SystemZGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new SystemZMCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index c7b73fd3b805..03141ecf551d 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -13,6 +13,7 @@ #include "TargetInfo/SystemZTargetInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -193,7 +194,7 @@ void SystemZTargetStreamer::emitConstantPools() { return; // Switch to the .text section. const MCObjectFileInfo &OFI = *Streamer.getContext().getObjectFileInfo(); - Streamer.SwitchSection(OFI.getTextSection()); + Streamer.switchSection(OFI.getTextSection()); for (auto &I : EXRLTargets2Sym) { Streamer.emitLabel(I.second); const MCInstSTIPair &MCI_STI = I.first; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index e76fa03af3bf..db4485423416 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -78,7 +78,6 @@ inline unsigned getRegAsVR128(unsigned Reg) { } // end namespace SystemZMC MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createSystemZMCAsmBackend(const Target &T, diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index e01adcce04ab..6fb080607f51 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -88,13 +88,19 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) { // an instruction with the corresponding hint set. static void lowerAlignmentHint(const MachineInstr *MI, MCInst &LoweredMI, unsigned Opcode) { - if (!MI->hasOneMemOperand()) + if (MI->memoperands_empty()) return; - const MachineMemOperand *MMO = *MI->memoperands_begin(); + + Align Alignment = Align(16); + for (MachineInstr::mmo_iterator MMOI = MI->memoperands_begin(), + EE = MI->memoperands_end(); MMOI != EE; ++MMOI) + if ((*MMOI)->getAlign() < Alignment) + Alignment = (*MMOI)->getAlign(); + unsigned AlignmentHint = 0; - if (MMO->getAlign() >= Align(16)) + if (Alignment >= Align(16)) AlignmentHint = 4; - else if (MMO->getAlign() >= Align(8)) + else if (Alignment >= Align(8)) AlignmentHint = 3; if (AlignmentHint == 0) return; @@ -124,17 +130,32 @@ static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) { .addImm(0); } +// The XPLINK ABI requires that a no-op encoding the call type is emitted after +// each call to a subroutine. This information can be used by the called +// function to determine its entry point, e.g. for generating a backtrace. The +// call type is encoded as a register number in the bcr instruction. See +// enumeration CallType for the possible values. 
+void SystemZAsmPrinter::emitCallInformation(CallType CT) { + EmitToStreamer(*OutStreamer, + MCInstBuilder(SystemZ::BCRAsm) + .addImm(0) + .addReg(SystemZMC::GR64Regs[static_cast(CT)])); +} + void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { SystemZMCInstLower Lower(MF->getContext(), *this); - const SystemZSubtarget *Subtarget = &MF->getSubtarget(); MCInst LoweredMI; switch (MI->getOpcode()) { case SystemZ::Return: - if (Subtarget->isTargetXPLINK64()) - LoweredMI = - MCInstBuilder(SystemZ::B).addReg(SystemZ::R7D).addImm(2).addReg(0); - else - LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D); + LoweredMI = MCInstBuilder(SystemZ::BR) + .addReg(SystemZ::R14D); + break; + + case SystemZ::Return_XPLINK: + LoweredMI = MCInstBuilder(SystemZ::B) + .addReg(SystemZ::R7D) + .addImm(2) + .addReg(0); break; case SystemZ::CondReturn: @@ -144,6 +165,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { .addReg(SystemZ::R14D); break; + case SystemZ::CondReturn_XPLINK: + LoweredMI = MCInstBuilder(SystemZ::BC) + .addImm(MI->getOperand(0).getImm()) + .addImm(MI->getOperand(1).getImm()) + .addReg(SystemZ::R7D) + .addImm(2) + .addReg(0); + break; + case SystemZ::CRBReturn: LoweredMI = MCInstBuilder(SystemZ::CRB) .addReg(MI->getOperand(0).getReg()) @@ -222,18 +252,21 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { .addReg(SystemZ::R7D) .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT))); - EmitToStreamer( - *OutStreamer, - MCInstBuilder(SystemZ::BCRAsm).addImm(0).addReg(SystemZ::R3D)); + emitCallInformation(CallType::BRASL7); return; case SystemZ::CallBASR_XPLINK64: EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BASR) .addReg(SystemZ::R7D) .addReg(MI->getOperand(0).getReg())); - EmitToStreamer( - *OutStreamer, - MCInstBuilder(SystemZ::BCRAsm).addImm(0).addReg(SystemZ::R0D)); + emitCallInformation(CallType::BASR76); + return; + + case SystemZ::CallBASR_STACKEXT: + EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BASR) + .addReg(SystemZ::R3D) + .addReg(MI->getOperand(0).getReg())); + emitCallInformation(CallType::BASR33); return; case SystemZ::CallBRASL: @@ -608,11 +641,11 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, MCContext &Ctx = MF->getContext(); if (MF->getFunction().hasFnAttribute("mrecord-mcount")) { MCSymbol *DotSym = OutContext.createTempSymbol(); - OutStreamer->PushSection(); - OutStreamer->SwitchSection( + OutStreamer->pushSection(); + OutStreamer->switchSection( Ctx.getELFSection("__mcount_loc", ELF::SHT_PROGBITS, ELF::SHF_ALLOC)); OutStreamer->emitSymbolValue(DotSym, 8); - OutStreamer->PopSection(); + OutStreamer->popSection(); OutStreamer->emitLabel(DotSym); } @@ -630,8 +663,7 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, } void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { - const SystemZInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); + auto *TII = MF->getSubtarget().getInstrInfo(); unsigned NumNOPBytes = MI.getOperand(1).getImm(); @@ -786,13 +818,253 @@ void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) { emitStackMaps(SM); } +void SystemZAsmPrinter::emitFunctionBodyEnd() { + if (TM.getTargetTriple().isOSzOS()) { + // Emit symbol for the end of function if the z/OS target streamer + // is used. This is needed to calculate the size of the function. 
+ MCSymbol *FnEndSym = createTempSymbol("func_end"); + OutStreamer->emitLabel(FnEndSym); + + OutStreamer->pushSection(); + OutStreamer->switchSection(getObjFileLowering().getPPA1Section()); + emitPPA1(FnEndSym); + OutStreamer->popSection(); + + CurrentFnPPA1Sym = nullptr; + CurrentFnEPMarkerSym = nullptr; + } +} + +static void emitPPA1Flags(std::unique_ptr &OutStreamer, bool VarArg, + bool StackProtector, bool FPRMask, bool VRMask) { + enum class PPA1Flag1 : uint8_t { + DSA64Bit = (0x80 >> 0), + VarArg = (0x80 >> 7), + LLVM_MARK_AS_BITMASK_ENUM(DSA64Bit) + }; + enum class PPA1Flag2 : uint8_t { + ExternalProcedure = (0x80 >> 0), + STACKPROTECTOR = (0x80 >> 3), + LLVM_MARK_AS_BITMASK_ENUM(ExternalProcedure) + }; + enum class PPA1Flag3 : uint8_t { + FPRMask = (0x80 >> 2), + LLVM_MARK_AS_BITMASK_ENUM(FPRMask) + }; + enum class PPA1Flag4 : uint8_t { + EPMOffsetPresent = (0x80 >> 0), + VRMask = (0x80 >> 2), + ProcedureNamePresent = (0x80 >> 7), + LLVM_MARK_AS_BITMASK_ENUM(EPMOffsetPresent) + }; + + // Declare optional section flags that can be modified. + auto Flags1 = PPA1Flag1(0); + auto Flags2 = PPA1Flag2::ExternalProcedure; + auto Flags3 = PPA1Flag3(0); + auto Flags4 = PPA1Flag4::EPMOffsetPresent | PPA1Flag4::ProcedureNamePresent; + + Flags1 |= PPA1Flag1::DSA64Bit; + + if (VarArg) + Flags1 |= PPA1Flag1::VarArg; + + if (StackProtector) + Flags2 |= PPA1Flag2::STACKPROTECTOR; + + // SavedGPRMask, SavedFPRMask, and SavedVRMask are precomputed in. + if (FPRMask) + Flags3 |= PPA1Flag3::FPRMask; // Add emit FPR mask flag. + + if (VRMask) + Flags4 |= PPA1Flag4::VRMask; // Add emit VR mask flag. + + OutStreamer->AddComment("PPA1 Flags 1"); + if ((Flags1 & PPA1Flag1::DSA64Bit) == PPA1Flag1::DSA64Bit) + OutStreamer->AddComment(" Bit 0: 1 = 64-bit DSA"); + else + OutStreamer->AddComment(" Bit 0: 0 = 32-bit DSA"); + if ((Flags1 & PPA1Flag1::VarArg) == PPA1Flag1::VarArg) + OutStreamer->AddComment(" Bit 7: 1 = Vararg function"); + OutStreamer->emitInt8(static_cast(Flags1)); // Flags 1. + + OutStreamer->AddComment("PPA1 Flags 2"); + if ((Flags2 & PPA1Flag2::ExternalProcedure) == PPA1Flag2::ExternalProcedure) + OutStreamer->AddComment(" Bit 0: 1 = External procedure"); + if ((Flags2 & PPA1Flag2::STACKPROTECTOR) == PPA1Flag2::STACKPROTECTOR) + OutStreamer->AddComment(" Bit 3: 1 = STACKPROTECT is enabled"); + else + OutStreamer->AddComment(" Bit 3: 0 = STACKPROTECT is not enabled"); + OutStreamer->emitInt8(static_cast(Flags2)); // Flags 2. + + OutStreamer->AddComment("PPA1 Flags 3"); + if ((Flags3 & PPA1Flag3::FPRMask) == PPA1Flag3::FPRMask) + OutStreamer->AddComment(" Bit 2: 1 = FP Reg Mask is in optional area"); + OutStreamer->emitInt8( + static_cast(Flags3)); // Flags 3 (optional sections). + + OutStreamer->AddComment("PPA1 Flags 4"); + if ((Flags4 & PPA1Flag4::VRMask) == PPA1Flag4::VRMask) + OutStreamer->AddComment(" Bit 2: 1 = Vector Reg Mask is in optional area"); + OutStreamer->emitInt8(static_cast( + Flags4)); // Flags 4 (optional sections, always emit these). +} + +void SystemZAsmPrinter::emitPPA1(MCSymbol *FnEndSym) { + const TargetRegisterInfo *TRI = MF->getRegInfo().getTargetRegisterInfo(); + const SystemZSubtarget &Subtarget = MF->getSubtarget(); + const auto TargetHasVector = Subtarget.hasVector(); + + const SystemZMachineFunctionInfo *ZFI = + MF->getInfo(); + const auto *ZFL = static_cast( + Subtarget.getFrameLowering()); + const MachineFrameInfo &MFFrame = MF->getFrameInfo(); + + // Get saved GPR/FPR/VPR masks. 
+ const std::vector &CSI = MFFrame.getCalleeSavedInfo(); + uint16_t SavedGPRMask = 0; + uint16_t SavedFPRMask = 0; + uint8_t SavedVRMask = 0; + int64_t OffsetFPR = 0; + int64_t OffsetVR = 0; + const int64_t TopOfStack = + MFFrame.getOffsetAdjustment() + MFFrame.getStackSize(); + + // Loop over the spilled registers. The CalleeSavedInfo can't be used because + // it does not contain all spilled registers. + for (unsigned I = ZFI->getSpillGPRRegs().LowGPR, + E = ZFI->getSpillGPRRegs().HighGPR; + I && E && I <= E; ++I) { + unsigned V = TRI->getEncodingValue((Register)I); + assert(V < 16 && "GPR index out of range"); + SavedGPRMask |= 1 << (15 - V); + } + + for (auto &CS : CSI) { + unsigned Reg = CS.getReg(); + unsigned I = TRI->getEncodingValue(Reg); + + if (SystemZ::FP64BitRegClass.contains(Reg)) { + assert(I < 16 && "FPR index out of range"); + SavedFPRMask |= 1 << (15 - I); + int64_t Temp = MFFrame.getObjectOffset(CS.getFrameIdx()); + if (Temp < OffsetFPR) + OffsetFPR = Temp; + } else if (SystemZ::VR128BitRegClass.contains(Reg)) { + assert(I >= 16 && I <= 23 && "VPR index out of range"); + unsigned BitNum = I - 16; + SavedVRMask |= 1 << (7 - BitNum); + int64_t Temp = MFFrame.getObjectOffset(CS.getFrameIdx()); + if (Temp < OffsetVR) + OffsetVR = Temp; + } + } + + // Adjust the offset. + OffsetFPR += (OffsetFPR < 0) ? TopOfStack : 0; + OffsetVR += (OffsetVR < 0) ? TopOfStack : 0; + + // Get alloca register. + uint8_t FrameReg = TRI->getEncodingValue(TRI->getFrameRegister(*MF)); + uint8_t AllocaReg = ZFL->hasFP(*MF) ? FrameReg : 0; + assert(AllocaReg < 16 && "Can't have alloca register larger than 15"); + (void)AllocaReg; + + // Build FPR save area offset. + uint32_t FrameAndFPROffset = 0; + if (SavedFPRMask) { + uint64_t FPRSaveAreaOffset = OffsetFPR; + assert(FPRSaveAreaOffset < 0x10000000 && "Offset out of range"); + + FrameAndFPROffset = FPRSaveAreaOffset & 0x0FFFFFFF; // Lose top 4 bits. + FrameAndFPROffset |= FrameReg << 28; // Put into top 4 bits. + } + + // Build VR save area offset. + uint32_t FrameAndVROffset = 0; + if (TargetHasVector && SavedVRMask) { + uint64_t VRSaveAreaOffset = OffsetVR; + assert(VRSaveAreaOffset < 0x10000000 && "Offset out of range"); + + FrameAndVROffset = VRSaveAreaOffset & 0x0FFFFFFF; // Lose top 4 bits. + FrameAndVROffset |= FrameReg << 28; // Put into top 4 bits. + } + + // Emit PPA1 section. + OutStreamer->AddComment("PPA1"); + OutStreamer->emitLabel(CurrentFnPPA1Sym); + OutStreamer->AddComment("Version"); + OutStreamer->emitInt8(0x02); // Version. + OutStreamer->AddComment("LE Signature X'CE'"); + OutStreamer->emitInt8(0xCE); // CEL signature. + OutStreamer->AddComment("Saved GPR Mask"); + OutStreamer->emitInt16(SavedGPRMask); + + emitPPA1Flags(OutStreamer, MF->getFunction().isVarArg(), + MFFrame.hasStackProtectorIndex(), SavedFPRMask != 0, + TargetHasVector && SavedVRMask != 0); + + OutStreamer->AddComment("Length/4 of Parms"); + OutStreamer->emitInt16( + static_cast(MFFrame.getMaxCallFrameSize() / 4)); // Parms/4. + OutStreamer->AddComment("Length of Code"); + OutStreamer->emitAbsoluteSymbolDiff(FnEndSym, CurrentFnEPMarkerSym, 4); + + // Emit saved FPR mask and offset to FPR save area (0x20 of flags 3). + if (SavedFPRMask) { + OutStreamer->AddComment("FPR mask"); + OutStreamer->emitInt16(SavedFPRMask); + OutStreamer->AddComment("AR mask"); + OutStreamer->emitInt16(0); // AR Mask, unused currently. 
+ OutStreamer->AddComment("FPR Save Area Locator"); + OutStreamer->AddComment(Twine(" Bit 0-3: Register R") + .concat(utostr(FrameAndFPROffset >> 28)) + .str()); + OutStreamer->AddComment(Twine(" Bit 4-31: Offset ") + .concat(utostr(FrameAndFPROffset & 0x0FFFFFFF)) + .str()); + OutStreamer->emitInt32(FrameAndFPROffset); // Offset to FPR save area with + // register to add value to + // (alloca reg). + } + + // Emit saved VR mask to VR save area. + if (TargetHasVector && SavedVRMask) { + OutStreamer->AddComment("VR mask"); + OutStreamer->emitInt8(SavedVRMask); + OutStreamer->emitInt8(0); // Reserved. + OutStreamer->emitInt16(0); // Also reserved. + OutStreamer->AddComment("VR Save Area Locator"); + OutStreamer->AddComment(Twine(" Bit 0-3: Register R") + .concat(utostr(FrameAndVROffset >> 28)) + .str()); + OutStreamer->AddComment(Twine(" Bit 4-31: Offset ") + .concat(utostr(FrameAndVROffset & 0x0FFFFFFF)) + .str()); + OutStreamer->emitInt32(FrameAndVROffset); + } + + // Emit offset to entry point optional section (0x80 of flags 4). + OutStreamer->emitAbsoluteSymbolDiff(CurrentFnEPMarkerSym, CurrentFnPPA1Sym, + 4); +} + void SystemZAsmPrinter::emitFunctionEntryLabel() { - const SystemZSubtarget &Subtarget = - static_cast(MF->getSubtarget()); + const SystemZSubtarget &Subtarget = MF->getSubtarget(); if (Subtarget.getTargetTriple().isOSzOS()) { MCContext &OutContext = OutStreamer->getContext(); - MCSymbol *EPMarkerSym = OutContext.createTempSymbol("CM_", true); + + // Save information for later use. + std::string N(MF->getFunction().hasName() + ? Twine(MF->getFunction().getName()).concat("_").str() + : ""); + + CurrentFnEPMarkerSym = + OutContext.createTempSymbol(Twine("EPM_").concat(N).str(), true); + CurrentFnPPA1Sym = + OutContext.createTempSymbol(Twine("PPA1_").concat(N).str(), true); // EntryPoint Marker const MachineFrameInfo &MFFrame = MF->getFrameInfo(); @@ -811,11 +1083,14 @@ void SystemZAsmPrinter::emitFunctionEntryLabel() { // Emit entry point marker section. OutStreamer->AddComment("XPLINK Routine Layout Entry"); - OutStreamer->emitLabel(EPMarkerSym); + OutStreamer->emitLabel(CurrentFnEPMarkerSym); OutStreamer->AddComment("Eyecatcher 0x00C300C500C500"); OutStreamer->emitIntValueInHex(0x00C300C500C500, 7); // Eyecatcher. OutStreamer->AddComment("Mark Type C'1'"); OutStreamer->emitInt8(0xF1); // Mark Type. + OutStreamer->AddComment("Offset to PPA1"); + OutStreamer->emitAbsoluteSymbolDiff(CurrentFnPPA1Sym, CurrentFnEPMarkerSym, + 4); if (OutStreamer->isVerboseAsm()) { OutStreamer->AddComment("DSA Size 0x" + Twine::utohexstr(DSASize)); OutStreamer->AddComment("Entry Flags"); diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h index 80d68d1b93ff..f14b4a184f62 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -26,6 +26,8 @@ class raw_ostream; class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter { private: StackMaps SM; + MCSymbol *CurrentFnPPA1Sym; // PPA1 Symbol. + MCSymbol *CurrentFnEPMarkerSym; // Entry Point Marker. SystemZTargetStreamer *getTargetStreamer() { MCTargetStreamer *TS = OutStreamer->getTargetStreamer(); @@ -33,9 +35,24 @@ private: return static_cast(TS); } + /// Call type information for XPLINK. 
+ enum class CallType { + BASR76 = 0, // b'x000' == BASR r7,r6 + BRAS7 = 1, // b'x001' == BRAS r7,ep + RESVD_2 = 2, // b'x010' + BRASL7 = 3, // b'x011' == BRASL r7,ep + RESVD_4 = 4, // b'x100' + RESVD_5 = 5, // b'x101' + BALR1415 = 6, // b'x110' == BALR r14,r15 + BASR33 = 7, // b'x111' == BASR r3,r3 + }; + + void emitPPA1(MCSymbol *FnEndSym); + public: SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} + : AsmPrinter(TM, std::move(Streamer)), SM(*this), + CurrentFnPPA1Sym(nullptr), CurrentFnEPMarkerSym(nullptr) {} // Override AsmPrinter. StringRef getPassName() const override { return "SystemZ Assembly Printer"; } @@ -52,8 +69,10 @@ public: return AsmPrinter::doInitialization(M); } void emitFunctionEntryLabel() override; + void emitFunctionBodyEnd() override; private: + void emitCallInformation(CallType CT); void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower); diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp index 763aa8c0e41f..9fc6765dbbf7 100644 --- a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp +++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp @@ -100,7 +100,7 @@ bool SystemZCopyPhysRegs::visitMBB(MachineBasicBlock &MBB) { } bool SystemZCopyPhysRegs::runOnMachineFunction(MachineFunction &F) { - TII = static_cast(F.getSubtarget().getInstrInfo()); + TII = F.getSubtarget().getInstrInfo(); MRI = &F.getRegInfo(); bool Modified = false; diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 4893acc81335..340dba1362af 100644 --- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -224,7 +224,7 @@ bool SystemZElimCompare::convertToBRCT( // The transformation is OK. Rebuild Branch as a BRCT(G) or BRCTH. MachineOperand Target(Branch->getOperand(2)); while (Branch->getNumOperands()) - Branch->RemoveOperand(0); + Branch->removeOperand(0); Branch->setDesc(TII->get(BRCT)); MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch); MIB.add(MI.getOperand(0)).add(MI.getOperand(1)).add(Target); @@ -267,7 +267,7 @@ bool SystemZElimCompare::convertToLoadAndTrap( // The transformation is OK. Rebuild Branch as a load-and-trap. while (Branch->getNumOperands()) - Branch->RemoveOperand(0); + Branch->removeOperand(0); Branch->setDesc(TII->get(LATOpcode)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .add(MI.getOperand(0)) @@ -649,16 +649,16 @@ bool SystemZElimCompare::fuseCompareOperations( // Clear out all current operands. int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI); assert(CCUse >= 0 && "BRC/BCR must use CC"); - Branch->RemoveOperand(CCUse); + Branch->removeOperand(CCUse); // Remove regmask (sibcall). if (Type == SystemZII::CompareAndSibcall) - Branch->RemoveOperand(3); + Branch->removeOperand(3); // Remove target (branch or sibcall). if (Type == SystemZII::CompareAndBranch || Type == SystemZII::CompareAndSibcall) - Branch->RemoveOperand(2); - Branch->RemoveOperand(1); - Branch->RemoveOperand(0); + Branch->removeOperand(2); + Branch->removeOperand(1); + Branch->removeOperand(0); // Rebuild Branch as a fused compare and branch. 
// SrcNOps is the number of MI operands of the compare instruction @@ -735,7 +735,7 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) { if (skipFunction(F.getFunction())) return false; - TII = static_cast(F.getSubtarget().getInstrInfo()); + TII = F.getSubtarget().getInstrInfo(); TRI = &TII->getRegisterInfo(); bool Changed = false; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 610627e7e3f0..43bc7426cfa8 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -13,6 +13,7 @@ #include "SystemZMachineFunctionInfo.h" #include "SystemZRegisterInfo.h" #include "SystemZSubtarget.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -95,8 +96,7 @@ typedef std::vector SZFrameObjVec; void SystemZELFFrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - const SystemZInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); + auto *TII = MF.getSubtarget().getInstrInfo(); // Make a vector of sorting objects to track all MFI objects and mark those // to be sorted as valid. @@ -1153,12 +1153,6 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize()); uint64_t StackSize = MFFrame.getStackSize(); - // FIXME: Implement support for large stack sizes, when the stack extension - // routine needs to be called. - if (StackSize > 1024 * 1024) { - llvm_unreachable("Huge Stack Frame not yet supported on z/OS"); - } - if (ZFI->getSpillGPRRegs().LowGPR) { // Skip over the GPR saves. if ((MBBI != MBB.end()) && ((MBBI->getOpcode() == SystemZ::STMG))) { @@ -1201,6 +1195,18 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, emitIncrement(MBB, InsertPt, DL, Regs.getStackPointerRegister(), Delta, ZII); + + // If the requested stack size is larger than the guard page, then we need + // to check if we need to call the stack extender. This requires adding a + // conditional branch, but splitting the prologue block is not possible at + // this point since it would invalidate the SaveBlocks / RestoreBlocks sets + // of PEI in the single block function case. Build a pseudo to be handled + // later by inlineStackProbe(). + const uint64_t GuardPageSize = 1024 * 1024; + if (StackSize > GuardPageSize) { + assert(StoreInstr && "Wrong insertion point"); + BuildMI(MBB, InsertPt, DL, ZII->get(SystemZ::XPLINK_STACKALLOC)); + } } if (HasFP) { @@ -1239,6 +1245,74 @@ void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF, } } +// Emit a compare of the stack pointer against the stack floor, and a call to +// the LE stack extender if needed. +void SystemZXPLINKFrameLowering::inlineStackProbe( + MachineFunction &MF, MachineBasicBlock &PrologMBB) const { + auto *ZII = + static_cast(MF.getSubtarget().getInstrInfo()); + + MachineInstr *StackAllocMI = nullptr; + for (MachineInstr &MI : PrologMBB) + if (MI.getOpcode() == SystemZ::XPLINK_STACKALLOC) { + StackAllocMI = &MI; + break; + } + if (StackAllocMI == nullptr) + return; + + MachineBasicBlock &MBB = PrologMBB; + const DebugLoc DL = StackAllocMI->getDebugLoc(); + + // The 2nd half of block MBB after split. 
+ MachineBasicBlock *NextMBB; + + // Add new basic block for the call to the stack overflow function. + MachineBasicBlock *StackExtMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.push_back(StackExtMBB); + + // LG r3,72(,r3) + BuildMI(StackExtMBB, DL, ZII->get(SystemZ::LG), SystemZ::R3D) + .addReg(SystemZ::R3D) + .addImm(72) + .addReg(0); + // BASR r3,r3 + BuildMI(StackExtMBB, DL, ZII->get(SystemZ::CallBASR_STACKEXT)) + .addReg(SystemZ::R3D); + + // LLGT r3,1208 + BuildMI(MBB, StackAllocMI, DL, ZII->get(SystemZ::LLGT), SystemZ::R3D) + .addReg(0) + .addImm(1208) + .addReg(0); + // CG r4,64(,r3) + BuildMI(MBB, StackAllocMI, DL, ZII->get(SystemZ::CG)) + .addReg(SystemZ::R4D) + .addReg(SystemZ::R3D) + .addImm(64) + .addReg(0); + // JLL b'0100',F'37' + BuildMI(MBB, StackAllocMI, DL, ZII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP) + .addImm(SystemZ::CCMASK_CMP_LT) + .addMBB(StackExtMBB); + + NextMBB = SystemZ::splitBlockBefore(StackAllocMI, &MBB); + MBB.addSuccessor(NextMBB); + MBB.addSuccessor(StackExtMBB); + + // Add jump back from stack extension BB. + BuildMI(StackExtMBB, DL, ZII->get(SystemZ::J)).addMBB(NextMBB); + StackExtMBB->addSuccessor(NextMBB); + + StackAllocMI->eraseFromParent(); + + // Compute the live-in lists for the new blocks. + recomputeLiveIns(*NextMBB); + recomputeLiveIns(*StackExtMBB); +} + bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { return (MF.getFrameInfo().hasVarSizedObjects()); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 2b3d7efed53b..bec83a9457e0 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -127,6 +127,9 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; + bool hasFP(const MachineFunction &MF) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index cf55318d328d..9ac7eafd5f34 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -968,7 +968,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND && RISBG.Input.getOpcode() != ISD::TRUNCATE) Count += 1; - if (Count == 0) + if (Count == 0 || isa(RISBG.Input)) return false; // Prefer to use normal shift instructions over RISBG, since they can handle @@ -1472,7 +1472,7 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const { assert(MMO && "Expected a memory operand."); // The memory access must have a proper alignment and no index register. - if (MemAccess->getAlignment() < StoreSize || + if (MemAccess->getAlign().value() < StoreSize || !MemAccess->getOffset().isUndef()) return false; @@ -1683,16 +1683,19 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_i: case InlineAsm::Constraint_Q: + case InlineAsm::Constraint_ZQ: // Accept an address with a short displacement, but no index. Form = SystemZAddressingMode::FormBD; DispRange = SystemZAddressingMode::Disp12Only; break; case InlineAsm::Constraint_R: + case InlineAsm::Constraint_ZR: // Accept an address with a short displacement and an index. 
Form = SystemZAddressingMode::FormBDXNormal; DispRange = SystemZAddressingMode::Disp12Only; break; case InlineAsm::Constraint_S: + case InlineAsm::Constraint_ZS: // Accept an address with a long displacement, but no index. Form = SystemZAddressingMode::FormBD; DispRange = SystemZAddressingMode::Disp20Only; @@ -1700,6 +1703,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, case InlineAsm::Constraint_T: case InlineAsm::Constraint_m: case InlineAsm::Constraint_o: + case InlineAsm::Constraint_p: + case InlineAsm::Constraint_ZT: // Accept an address with a long displacement and an index. // m works the same as T, as this is the most general case. // We don't really have any special handling of "offsettable" diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f10651d5c5d7..42c1c77f14e4 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -80,7 +80,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0)); + MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); auto *Regs = STI.getSpecialRegisters(); @@ -471,6 +471,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); + // Special treatment. + setOperationAction(ISD::IS_FPCLASS, VT, Custom); + // Handle constrained floating-point operations. setOperationAction(ISD::STRICT_FADD, VT, Legal); setOperationAction(ISD::STRICT_FSUB, VT, Legal); @@ -640,33 +643,33 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VAEND, MVT::Other, Expand); // Codes for which we want to perform some z-specific combinations. - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::STRICT_FP_ROUND); - setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::STRICT_FP_EXTEND); - setTargetDAGCombine(ISD::BSWAP); - setTargetDAGCombine(ISD::SDIV); - setTargetDAGCombine(ISD::UDIV); - setTargetDAGCombine(ISD::SREM); - setTargetDAGCombine(ISD::UREM); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine({ISD::ZERO_EXTEND, + ISD::SIGN_EXTEND, + ISD::SIGN_EXTEND_INREG, + ISD::LOAD, + ISD::STORE, + ISD::VECTOR_SHUFFLE, + ISD::EXTRACT_VECTOR_ELT, + ISD::FP_ROUND, + ISD::STRICT_FP_ROUND, + ISD::FP_EXTEND, + ISD::SINT_TO_FP, + ISD::UINT_TO_FP, + ISD::STRICT_FP_EXTEND, + ISD::BSWAP, + ISD::SDIV, + ISD::UDIV, + ISD::SREM, + ISD::UREM, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_W_CHAIN}); // Handle intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // We want to use MVC in preference to even a single load/store pair. - MaxStoresPerMemcpy = 0; + MaxStoresPerMemcpy = Subtarget.hasVector() ? 
2 : 0; MaxStoresPerMemcpyOptSize = 0; // The main memset sequence is a byte store followed by an MVC. @@ -674,7 +677,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // generated by target-independent code don't when the byte value is // variable. E.g. "STC ;MHI ,257;STH " is not better // than "STC;MVC". Handle the choice in target-specific code instead. - MaxStoresPerMemset = 0; + MaxStoresPerMemset = Subtarget.hasVector() ? 2 : 0; MaxStoresPerMemsetOptSize = 0; // Default to having -disable-strictnode-mutation on @@ -716,8 +719,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd( // such as VGM, VGMB or VREPI. bool SystemZVectorConstantInfo::isVectorConstantLegal( const SystemZSubtarget &Subtarget) { - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); if (!Subtarget.hasVector() || (isFP128 && !Subtarget.hasVectorEnhancements1())) return false; @@ -790,14 +792,17 @@ bool SystemZVectorConstantInfo::isVectorConstantLegal( return tryValue(SplatBitsZ | Middle); } -SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { - IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); - isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); - SplatBits = FPImm.bitcastToAPInt(); - unsigned Width = SplatBits.getBitWidth(); - IntBits <<= (SystemZ::VectorBits - Width); +SystemZVectorConstantInfo::SystemZVectorConstantInfo(APInt IntImm) { + if (IntImm.isSingleWord()) { + IntBits = APInt(128, IntImm.getZExtValue()); + IntBits <<= (SystemZ::VectorBits - IntImm.getBitWidth()); + } else + IntBits = IntImm; + assert(IntBits.getBitWidth() == 128 && "Unsupported APInt."); // Find the smallest splat. + SplatBits = IntImm; + unsigned Width = SplatBits.getBitWidth(); while (Width > 8) { unsigned HalfSize = Width / 2; APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize); @@ -973,7 +978,8 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, if (!isInt<20>(AM.BaseOffs)) return false; - AddressingMode SupportedAM(true, true); + bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy(); + AddressingMode SupportedAM(!RequireD12, true); if (I != nullptr) SupportedAM = supportedAddressingMode(I, Subtarget.hasVector()); @@ -988,6 +994,30 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, return AM.Scale == 0 || AM.Scale == 1; } +bool SystemZTargetLowering::findOptimalMemOpLowering( + std::vector &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, + unsigned SrcAS, const AttributeList &FuncAttributes) const { + const int MVCFastLen = 16; + + if (Limit != ~unsigned(0)) { + // Don't expand Op into scalar loads/stores in these cases: + if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen) + return false; // Small memcpy: Use MVC + if (Op.isMemset() && Op.size() - 1 <= MVCFastLen) + return false; // Small memset (first byte with STC/MVI): Use MVC + if (Op.isZeroMemset()) + return false; // Memset zero: Use XC + } + + return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS, + SrcAS, FuncAttributes); +} + +EVT SystemZTargetLowering::getOptimalMemOpType(const MemOp &Op, + const AttributeList &FuncAttributes) const { + return Subtarget.hasVector() ? 
MVT::v2i64 : MVT::Other; +} + bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const { if (!FromType->isIntegerTy() || !ToType->isIntegerTy()) return false; @@ -1034,6 +1064,17 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const { case 'M': // 0x7fffffff return C_Immediate; + default: + break; + } + } else if (Constraint.size() == 2 && Constraint[0] == 'Z') { + switch (Constraint[1]) { + case 'Q': // Address with base and unsigned 12-bit displacement + case 'R': // Likewise, plus an index + case 'S': // Address with base and signed 20-bit displacement + case 'T': // Likewise, plus an index + return C_Address; + default: break; } @@ -1218,12 +1259,17 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -Register SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT, - const MachineFunction &MF) const { +Register +SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT, + const MachineFunction &MF) const { + const SystemZSubtarget *Subtarget = &MF.getSubtarget(); + + Register Reg = + StringSwitch(RegName) + .Case("r4", Subtarget->isTargetXPLINK64() ? SystemZ::R4D : 0) + .Case("r15", Subtarget->isTargetELF() ? SystemZ::R15D : 0) + .Default(0); - Register Reg = StringSwitch(RegName) - .Case("r15", SystemZ::R15D) - .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); @@ -1833,6 +1879,40 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, return Chain; } +// Generate a call taking the given operands as arguments and returning a +// result of type RetVT. +std::pair SystemZTargetLowering::makeExternalCall( + SDValue Chain, SelectionDAG &DAG, const char *CalleeName, EVT RetVT, + ArrayRef Ops, CallingConv::ID CallConv, bool IsSigned, SDLoc DL, + bool DoesNotReturn, bool IsReturnValueUsed) const { + TargetLowering::ArgListTy Args; + Args.reserve(Ops.size()); + + TargetLowering::ArgListEntry Entry; + for (SDValue Op : Ops) { + Entry.Node = Op; + Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned); + Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned); + Args.push_back(Entry); + } + + SDValue Callee = + DAG.getExternalSymbol(CalleeName, getPointerTy(DAG.getDataLayout())); + + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + bool SignExtend = shouldSignExtendTypeInLibCall(RetVT, IsSigned); + CLI.setDebugLoc(DL) + .setChain(Chain) + .setCallee(CallConv, RetTy, Callee, std::move(Args)) + .setNoReturn(DoesNotReturn) + .setDiscardResult(!IsReturnValueUsed) + .setSExtResult(SignExtend) + .setZExtResult(!SignExtend); + return LowerCallTo(CLI); +} + bool SystemZTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, @@ -2237,7 +2317,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL, Load->getExtensionType() != ExtType) { C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(), Load->getBasePtr(), Load->getPointerInfo(), - Load->getMemoryVT(), Load->getAlignment(), + Load->getMemoryVT(), Load->getAlign(), Load->getMemOperand()->getFlags()); // Update the chain uses. 
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1)); @@ -3471,6 +3551,32 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op, SDValue SystemZTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { + + if (Subtarget.isTargetXPLINK64()) + return lowerVASTART_XPLINK(Op, DAG); + else + return lowerVASTART_ELF(Op, DAG); +} + +SDValue SystemZTargetLowering::lowerVASTART_XPLINK(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SystemZMachineFunctionInfo *FuncInfo = + MF.getInfo(); + + SDLoc DL(Op); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + +SDValue SystemZTargetLowering::lowerVASTART_ELF(SDValue Op, + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -3514,14 +3620,90 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); - return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL), + uint32_t Sz = + Subtarget.isTargetXPLINK64() ? getTargetMachine().getPointerSize(0) : 32; + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Sz, DL), Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false, /*isTailCall*/ false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } -SDValue SystemZTargetLowering:: -lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { +SDValue +SystemZTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget.isTargetXPLINK64()) + return lowerDYNAMIC_STACKALLOC_XPLINK(Op, DAG); + else + return lowerDYNAMIC_STACKALLOC_ELF(Op, DAG); +} + +SDValue +SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op, + SelectionDAG &DAG) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + MachineFunction &MF = DAG.getMachineFunction(); + bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack"); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); + SDLoc DL(Op); + + // If user has set the no alignment function attribute, ignore + // alloca alignments. + uint64_t AlignVal = + (RealignOpt ? cast(Align)->getZExtValue() : 0); + + uint64_t StackAlign = TFI->getStackAlignment(); + uint64_t RequiredAlign = std::max(AlignVal, StackAlign); + uint64_t ExtraAlignSpace = RequiredAlign - StackAlign; + + SDValue NeededSpace = Size; + + // Add extra space for alignment if needed. + EVT PtrVT = getPointerTy(MF.getDataLayout()); + if (ExtraAlignSpace) + NeededSpace = DAG.getNode(ISD::ADD, DL, PtrVT, NeededSpace, + DAG.getConstant(ExtraAlignSpace, DL, PtrVT)); + + bool IsSigned = false; + bool DoesNotReturn = false; + bool IsReturnValueUsed = false; + EVT VT = Op.getValueType(); + SDValue AllocaCall = + makeExternalCall(Chain, DAG, "@@ALCAXP", VT, makeArrayRef(NeededSpace), + CallingConv::C, IsSigned, DL, DoesNotReturn, + IsReturnValueUsed) + .first; + + // Perform a CopyFromReg from %GPR4 (stack pointer register). Chain and Glue + // to end of call in order to ensure it isn't broken up from the call + // sequence. 
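In outline, using the AllocaCall/SPReg names from the code that follows: take the call's output chain and glue, and feed the glue into the CopyFromReg. Glue edges cannot be scheduled across, so the stack pointer is read exactly as the allocator call left it:

  SDValue Chain = AllocaCall.getValue(1); // call's output chain
  SDValue Glue  = AllocaCall.getValue(2); // call's output glue
  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, SPReg, PtrVT, Glue);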
+ auto &Regs = Subtarget.getSpecialRegisters(); + Register SPReg = Regs.getStackPointerRegister(); + Chain = AllocaCall.getValue(1); + SDValue Glue = AllocaCall.getValue(2); + SDValue NewSPRegNode = DAG.getCopyFromReg(Chain, DL, SPReg, PtrVT, Glue); + Chain = NewSPRegNode.getValue(1); + + MVT PtrMVT = getPointerMemTy(MF.getDataLayout()); + SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, PtrMVT); + SDValue Result = DAG.getNode(ISD::ADD, DL, PtrMVT, NewSPRegNode, ArgAdjust); + + // Dynamically realign if needed. + if (ExtraAlignSpace) { + Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result, + DAG.getConstant(ExtraAlignSpace, DL, PtrVT)); + Result = DAG.getNode(ISD::AND, DL, PtrVT, Result, + DAG.getConstant(~(RequiredAlign - 1), DL, PtrVT)); + } + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, DL); +} + +SDValue +SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_ELF(SDValue Op, + SelectionDAG &DAG) const { const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); MachineFunction &MF = DAG.getMachineFunction(); bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack"); @@ -5468,6 +5650,41 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, return Op; } +SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT ResultVT = Op.getSimpleValueType(); + SDValue Arg = Op.getOperand(0); + auto CNode = cast(Op.getOperand(1)); + unsigned Check = CNode->getZExtValue(); + + unsigned TDCMask = 0; + if (Check & fcSNan) + TDCMask |= SystemZ::TDCMASK_SNAN_PLUS | SystemZ::TDCMASK_SNAN_MINUS; + if (Check & fcQNan) + TDCMask |= SystemZ::TDCMASK_QNAN_PLUS | SystemZ::TDCMASK_QNAN_MINUS; + if (Check & fcPosInf) + TDCMask |= SystemZ::TDCMASK_INFINITY_PLUS; + if (Check & fcNegInf) + TDCMask |= SystemZ::TDCMASK_INFINITY_MINUS; + if (Check & fcPosNormal) + TDCMask |= SystemZ::TDCMASK_NORMAL_PLUS; + if (Check & fcNegNormal) + TDCMask |= SystemZ::TDCMASK_NORMAL_MINUS; + if (Check & fcPosSubnormal) + TDCMask |= SystemZ::TDCMASK_SUBNORMAL_PLUS; + if (Check & fcNegSubnormal) + TDCMask |= SystemZ::TDCMASK_SUBNORMAL_MINUS; + if (Check & fcPosZero) + TDCMask |= SystemZ::TDCMASK_ZERO_PLUS; + if (Check & fcNegZero) + TDCMask |= SystemZ::TDCMASK_ZERO_MINUS; + SDValue TDCMaskV = DAG.getConstant(TDCMask, DL, MVT::i64); + + SDValue Intr = DAG.getNode(SystemZISD::TDC, DL, ResultVT, Arg, TDCMaskV); + return getCCResult(DAG, Intr); +} + SDValue SystemZTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -5585,6 +5802,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR); case ISD::SRA: return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR); + case ISD::IS_FPCLASS: + return lowerIS_FPCLASS(Op, DAG); default: llvm_unreachable("Unexpected node to lower"); } @@ -6142,6 +6361,23 @@ static bool isVectorElementSwap(ArrayRef M, EVT VT) { return true; } +static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) { + for (auto *U : StoredVal->uses()) { + if (StoreSDNode *ST = dyn_cast(U)) { + EVT CurrMemVT = ST->getMemoryVT().getScalarType(); + if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16) + continue; + } else if (isa(U)) { + SDValue BuildVector = SDValue(U, 0); + if (DAG.isSplatValue(BuildVector, true/*AllowUndefs*/) && + isOnlyUsedByStores(BuildVector, DAG)) + continue; + } + return false; + } + return true; +} + SDValue SystemZTargetLowering::combineSTORE( SDNode *N, DAGCombinerInfo &DCI) 
const { SelectionDAG &DAG = DCI.DAG; @@ -6200,6 +6436,82 @@ SDValue SystemZTargetLowering::combineSTORE( } } + // Replicate a reg or immediate with VREP instead of scalar multiply or + // immediate load. It seems best to do this during the first DAGCombine as + // it is straight-forward to handle the zero-extend node in the initial + // DAG, and also not worry about the keeping the new MemVT legal (e.g. when + // extracting an i16 element from a v16i8 vector). + if (Subtarget.hasVector() && DCI.Level == BeforeLegalizeTypes && + isOnlyUsedByStores(Op1, DAG)) { + SDValue Word = SDValue(); + EVT WordVT; + + // Find a replicated immediate and return it if found in Word and its + // type in WordVT. + auto FindReplicatedImm = [&](ConstantSDNode *C, unsigned TotBytes) { + // Some constants are better handled with a scalar store. + if (C->getAPIntValue().getBitWidth() > 64 || C->isAllOnes() || + isInt<16>(C->getSExtValue()) || MemVT.getStoreSize() <= 2) + return; + SystemZVectorConstantInfo VCI(APInt(TotBytes * 8, C->getZExtValue())); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE) { + Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32); + WordVT = VCI.VecVT.getScalarType(); + } + }; + + // Find a replicated register and return it if found in Word and its type + // in WordVT. + auto FindReplicatedReg = [&](SDValue MulOp) { + EVT MulVT = MulOp.getValueType(); + if (MulOp->getOpcode() == ISD::MUL && + (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) { + // Find a zero extended value and its type. + SDValue LHS = MulOp->getOperand(0); + if (LHS->getOpcode() == ISD::ZERO_EXTEND) + WordVT = LHS->getOperand(0).getValueType(); + else if (LHS->getOpcode() == ISD::AssertZext) + WordVT = cast(LHS->getOperand(1))->getVT(); + else + return; + // Find a replicating constant, e.g. 0x00010001. 
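A worked example of why such a multiply is a splat (illustration): multiplying a zero-extended narrow value by a constant with a 1 at each element boundary replicates the value, e.g. for an i16 element replicated across i32:

  #include <cstdint>
  static_assert(0xABCDu * 0x00010001u == 0xABCDABCDu,
                "(zext i16 X) * 0x00010001 replicates X across i32");

The store of such a product can then become a VREP of X plus a vector store.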
+ if (auto *C = dyn_cast(MulOp->getOperand(1))) { + SystemZVectorConstantInfo VCI( + APInt(MulVT.getSizeInBits(), C->getZExtValue())); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE && VCI.OpVals[0] == 1 && + WordVT == VCI.VecVT.getScalarType()) + Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT); + } + } + }; + + if (isa(Op1) && + DAG.isSplatValue(Op1, true/*AllowUndefs*/)) { + SDValue SplatVal = Op1->getOperand(0); + if (auto *C = dyn_cast(SplatVal)) + FindReplicatedImm(C, SplatVal.getValueType().getStoreSize()); + else + FindReplicatedReg(SplatVal); + } else { + if (auto *C = dyn_cast(Op1)) + FindReplicatedImm(C, MemVT.getStoreSize()); + else + FindReplicatedReg(Op1); + } + + if (Word != SDValue()) { + assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 && + "Bad type handling"); + unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits(); + EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts); + SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word); + return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal, + SN->getBasePtr(), SN->getMemOperand()); + } + } + return SDValue(); } @@ -6442,22 +6754,26 @@ SDValue SystemZTargetLowering::combineINT_TO_FP( SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.Level != BeforeLegalizeTypes) return SDValue(); + SelectionDAG &DAG = DCI.DAG; + LLVMContext &Ctx = *DAG.getContext(); unsigned Opcode = N->getOpcode(); EVT OutVT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; + Type *OutLLVMTy = OutVT.getTypeForEVT(Ctx); SDValue Op = N->getOperand(0); - unsigned OutScalarBits = OutVT.getScalarSizeInBits(); + unsigned OutScalarBits = OutLLVMTy->getScalarSizeInBits(); unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits(); // Insert an extension before type-legalization to avoid scalarization, e.g.: // v2f64 = uint_to_fp v2i16 // => // v2f64 = uint_to_fp (v2i64 zero_extend v2i16) - if (OutVT.isVector() && OutScalarBits > InScalarBits) { - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()), - OutVT.getVectorNumElements()); + if (OutLLVMTy->isVectorTy() && OutScalarBits > InScalarBits && + OutScalarBits <= 64) { + unsigned NumElts = cast(OutLLVMTy)->getNumElements(); + EVT ExtVT = EVT::getVectorVT( + Ctx, EVT::getIntegerVT(Ctx, OutLLVMTy->getScalarSizeInBits()), NumElts); unsigned ExtOpcode = - (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); + (Opcode == ISD::UINT_TO_FP ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op); return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp); } @@ -7271,8 +7587,7 @@ MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr &MI, MachineBasicBlock *MBB) const { assert(isSelectPseudo(MI) && "Bad call to emitSelect()"); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); @@ -7368,8 +7683,7 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const { - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); Register SrcReg = MI.getOperand(0).getReg(); MachineOperand Base = MI.getOperand(1); @@ -7460,8 +7774,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode, unsigned BitSize, bool Invert) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -7579,8 +7892,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode, unsigned KeepOldMask, unsigned BitSize) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -7693,8 +8005,7 @@ MachineBasicBlock * SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *MBB) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Extract the operands. Base can be a register or a frame index. 
@@ -7810,8 +8121,7 @@ MachineBasicBlock * SystemZTargetLowering::emitPair128(MachineInstr &MI, MachineBasicBlock *MBB) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -7838,8 +8148,7 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, MachineBasicBlock *MBB, bool ClearEven) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -7870,8 +8179,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode, bool IsMemset) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -8225,8 +8533,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *SystemZTargetLowering::emitStringWrapper( MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { MachineFunction &MF = *MBB->getParent(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -8331,8 +8638,7 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0( MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo *MRI = &MF.getRegInfo(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); Register SrcReg = MI.getOperand(0).getReg(); @@ -8355,8 +8661,7 @@ MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca( MachineInstr &MI, MachineBasicBlock *MBB) const { MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo *MRI = &MF.getRegInfo(); - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); const unsigned ProbeSize = getStackProbeSize(MF); Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index de446f33f5f1..b9c95274f62b 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -419,8 +419,7 @@ public: getNumRegisters(LLVMContext &Context, EVT VT, Optional RegisterVT) const override { // i128 inline assembly operand. 
- if (VT == MVT::i128 && - RegisterVT.hasValue() && RegisterVT.getValue() == MVT::Untyped) + if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped) return 1; return TargetLowering::getNumRegisters(Context, VT); } @@ -457,6 +456,12 @@ public: bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const override; + bool + findOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, + const MemOp &Op, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes) const override; + EVT getOptimalMemOpType(const MemOp &Op, + const AttributeList &FuncAttributes) const override; bool isTruncateFree(Type *, Type *) const override; bool isTruncateFree(EVT, EVT) const override; @@ -467,6 +472,8 @@ public: return VT == MVT::i32 || VT == MVT::i64; } + bool shouldConsiderGEPOffsetSplit() const override { return true; } + const char *getTargetNodeName(unsigned Opcode) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, @@ -497,6 +504,19 @@ public: case 'T': return InlineAsm::Constraint_T; } + } else if (ConstraintCode.size() == 2 && ConstraintCode[0] == 'Z') { + switch (ConstraintCode[1]) { + default: + break; + case 'Q': + return InlineAsm::Constraint_ZQ; + case 'R': + return InlineAsm::Constraint_ZR; + case 'S': + return InlineAsm::Constraint_ZS; + case 'T': + return InlineAsm::Constraint_ZT; + } } return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } @@ -553,6 +573,12 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; + std::pair + makeExternalCall(SDValue Chain, SelectionDAG &DAG, const char *CalleeName, + EVT RetVT, ArrayRef Ops, CallingConv::ID CallConv, + bool IsSigned, SDLoc DL, bool DoesNotReturn, + bool IsReturnValueUsed) const; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, @@ -622,8 +648,12 @@ private: SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVASTART_ELF(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVASTART_XPLINK(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDYNAMIC_STACKALLOC_ELF(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; @@ -657,6 +687,7 @@ private: SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; + SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; bool canTreatAsByteVector(EVT VT) const; SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, @@ -743,12 +774,15 @@ private: APInt SplatUndef; // Bits correspoding to undef operands of the BVN. 
unsigned SplatBitSize = 0; bool isFP128 = false; - public: unsigned Opcode = 0; SmallVector OpVals; MVT VecVT; - SystemZVectorConstantInfo(APFloat FPImm); + SystemZVectorConstantInfo(APInt IntImm); + SystemZVectorConstantInfo(APFloat FPImm) + : SystemZVectorConstantInfo(FPImm.bitcastToAPInt()) { + isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); + } SystemZVectorConstantInfo(BuildVectorSDNode *BVN); bool isVectorConstantLegal(const SystemZSubtarget &Subtarget); }; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 4b6aa60f5d55..1436be1e4052 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -30,6 +31,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/BranchProbability.h" @@ -119,9 +121,11 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MBB->getParent(); MachineFrameInfo &MFFrame = MF.getFrameInfo(); MachineOperand &OffsetMO = MI->getOperand(2); + SystemZCallingConventionRegisters *Regs = STI.getSpecialRegisters(); uint64_t Offset = (MFFrame.getMaxCallFrameSize() + - SystemZMC::ELFCallFrameSize + + Regs->getCallFrameSize() + + Regs->getStackPointerBias() + OffsetMO.getImm()); unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset); assert(NewOpcode && "No support for huge argument lists yet"); @@ -393,8 +397,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a JMP, delete them. - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; @@ -674,6 +677,7 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); if (Opcode == SystemZ::Return || + Opcode == SystemZ::Return_XPLINK || Opcode == SystemZ::Trap || Opcode == SystemZ::CallJG || Opcode == SystemZ::CallBR) @@ -731,18 +735,20 @@ bool SystemZInstrInfo::PredicateInstruction( .addReg(SystemZ::CC, RegState::Implicit); return true; } - if (Opcode == SystemZ::Return) { - MI.setDesc(get(SystemZ::CondReturn)); + if (Opcode == SystemZ::Return || Opcode == SystemZ::Return_XPLINK) { + MI.setDesc(get(Opcode == SystemZ::Return ? 
SystemZ::CondReturn + : SystemZ::CondReturn_XPLINK)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(CCValid).addImm(CCMask) - .addReg(SystemZ::CC, RegState::Implicit); + .addImm(CCValid) + .addImm(CCMask) + .addReg(SystemZ::CC, RegState::Implicit); return true; } if (Opcode == SystemZ::CallJG) { MachineOperand FirstOp = MI.getOperand(0); const uint32_t *RegMask = MI.getOperand(1).getRegMask(); - MI.RemoveOperand(1); - MI.RemoveOperand(0); + MI.removeOperand(1); + MI.removeOperand(0); MI.setDesc(get(SystemZ::CallBRCL)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(CCValid) @@ -755,8 +761,8 @@ bool SystemZInstrInfo::PredicateInstruction( if (Opcode == SystemZ::CallBR) { MachineOperand Target = MI.getOperand(0); const uint32_t *RegMask = MI.getOperand(1).getRegMask(); - MI.RemoveOperand(1); - MI.RemoveOperand(0); + MI.removeOperand(1); + MI.removeOperand(0); MI.setDesc(get(SystemZ::CallBCR)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(CCValid).addImm(CCMask) @@ -1626,7 +1632,8 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, } unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode, - int64_t Offset) const { + int64_t Offset, + const MachineInstr *MI) const { const MCInstrDesc &MCID = get(Opcode); int64_t Offset2 = (MCID.TSFlags & SystemZII::Is128Bit ? Offset + 8 : Offset); if (isUInt<12>(Offset) && isUInt<12>(Offset2)) { @@ -1648,6 +1655,24 @@ unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode, // Check whether Opcode allows signed 20-bit displacements. if (MCID.TSFlags & SystemZII::Has20BitOffset) return Opcode; + + // If a VR32/VR64 reg ended up in an FP register, use the FP opcode. + if (MI && MI->getOperand(0).isReg()) { + Register Reg = MI->getOperand(0).getReg(); + if (Reg.isPhysical() && SystemZMC::getFirstReg(Reg) < 16) { + switch (Opcode) { + case SystemZ::VL32: + return SystemZ::LEY; + case SystemZ::VST32: + return SystemZ::STEY; + case SystemZ::VL64: + return SystemZ::LDY; + case SystemZ::VST64: + return SystemZ::STDY; + default: break; + } + } + } } return 0; } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 9e5b2729a707..0525f5827736 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -47,8 +47,7 @@ enum { CCMaskFirst = (1 << 18), CCMaskLast = (1 << 19), IsLogical = (1 << 20), - CCIfNoSignedWrap = (1 << 21), - MemMemOp = (1 << 22) + CCIfNoSignedWrap = (1 << 21) }; static inline unsigned getAccessSize(unsigned int Flags) { @@ -309,8 +308,10 @@ public: // and the caller wants to perform that instruction's operation on an // address that has displacement Offset. Return the opcode of a suitable // instruction (which might be Opcode itself) or 0 if no such instruction - // exists. - unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const; + // exists. MI may be passed in order to allow examination of physical + // register operands (i.e. if a VR32/64 reg ended up as an FP or Vector reg). + unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset, + const MachineInstr *MI = nullptr) const; // Return true if Opcode has a mapping in 12 <-> 20 bit displacements. 
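For reference, a hedged summary of the displacement classes this hook switches between: the short instruction forms take an unsigned 12-bit displacement and the long "Y" forms a signed 20-bit one; offsets outside the signed 20-bit range make getOpcodeForOffset return 0.

  // Illustrative bounds, mirroring the isUInt<12> / isInt<20> checks above:
  constexpr bool fitsUInt12(long long O) { return O >= 0 && O < (1 << 12); }
  constexpr bool fitsInt20(long long O) {
    return O >= -(1 << 19) && O < (1 << 19);
  }
  static_assert(fitsUInt12(4092) && !fitsUInt12(4096), "short vs long form");
  static_assert(fitsInt20(-8) && !fitsInt20(1 << 19), "20-bit limit");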
bool hasDisplacementPairInsn(unsigned Opcode) const; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 84f1e0fb428c..ed7e3c02a10d 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -42,6 +42,10 @@ let Defs = [R1D, R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1, hasSideEffects = 1 in def PROBED_STACKALLOC : Pseudo<(outs), (ins i64imm:$stacksize), []>; +let Defs = [R3D, CC], Uses = [R3D, R4D], hasNoSchedulingInfo = 1, + hasSideEffects = 1 in + def XPLINK_STACKALLOC : Pseudo<(outs), (ins), []>; + //===----------------------------------------------------------------------===// // Branch instructions //===----------------------------------------------------------------------===// @@ -285,6 +289,10 @@ let Predicates = [IsTargetXPLINK64] in { def CallBASR_XPLINK64 : Alias<4, (outs), (ins ADDR64:$R2, variable_ops), [(z_call ADDR64:$R2)]>; } + + let isCall = 1, Defs = [R3D, CC], Uses = [FPC] in { + def CallBASR_STACKEXT : Alias<4, (outs), (ins ADDR64:$R2), []>; + } } // Regular calls. @@ -336,13 +344,25 @@ let isCall = 1, isTerminator = 1, isReturn = 1 in { def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3, ADDR64:$R4), []>; } -// A return instruction (br %r14) for ELF and (b 2 %r7) for XPLink. -let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in - def Return : Alias<2, (outs), (ins), [(z_retflag)]>; +let Predicates = [IsTargetXPLINK64] in { + // A return instruction (b 2(%r7)). + let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in + def Return_XPLINK : Alias<4, (outs), (ins), [(z_retflag)]>; + + // A conditional return instruction (bc , 2(%r7)). + let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in + def CondReturn_XPLINK : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>; +} + +let Predicates = [IsTargetELF] in { + // A return instruction (br %r14). + let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in + def Return : Alias<2, (outs), (ins), [(z_retflag)]>; -// A conditional return instruction (bcr , %r14). -let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in - def CondReturn : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>; + // A conditional return instruction (bcr , %r14). + let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in + def CondReturn : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>; +} // Fused compare and conditional returns. 
 let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
index d6c795985448..1e6f971906e9 100644
--- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -66,7 +66,7 @@ bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
   if (skipFunction(F.getFunction()))
     return false;

-  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
   MF = &F;
   SystemZMachineFunctionInfo* MFI = F.getInfo<SystemZMachineFunctionInfo>();
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
index 9b6aa3593ce0..cada880a82d8 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -14,3 +14,9 @@ using namespace llvm;
 // pin vtable to this file
 void SystemZMachineFunctionInfo::anchor() {}

+MachineFunctionInfo *SystemZMachineFunctionInfo::clone(
+    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+    const {
+  return DestMF.cloneInfo<SystemZMachineFunctionInfo>(*this);
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index ec4b812eb0e1..de73a5d86422 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -41,6 +41,11 @@ public:
       : VarArgsFirstGPR(0), VarArgsFirstFPR(0), VarArgsFrameIndex(0),
         RegSaveFrameIndex(0), FramePointerSaveIndex(0), NumLocalDynamics(0) {}

+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override;
+
   // Get and set the first and last call-saved GPR that should be saved by
   // this function and the SP offset for the STMG.  These are 0 if no GPRs
   // need to be saved or restored.
diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
index 5a2cfc53da49..e15f9027cc20 100644
--- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -17,6 +17,7 @@
 #include "SystemZInstrInfo.h"
 #include "SystemZSubtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 using namespace llvm;
@@ -253,7 +254,7 @@ bool SystemZPostRewrite::selectMBB(MachineBasicBlock &MBB) {
 }

 bool SystemZPostRewrite::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<SystemZSubtarget>().getInstrInfo();
   bool Modified = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/SystemZ/SystemZProcessors.td b/llvm/lib/Target/SystemZ/SystemZProcessors.td
index 4fceaa14c598..d00b94d00242 100644
--- a/llvm/lib/Target/SystemZ/SystemZProcessors.td
+++ b/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -38,5 +38,6 @@ def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
 def : ProcessorModel<"arch13", Z15Model, Arch13SupportedFeatures.List>;
 def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>;

-def : ProcessorModel<"arch14", Z15Model, Arch14SupportedFeatures.List>;
+def : ProcessorModel<"arch14", Z16Model, Arch14SupportedFeatures.List>;
+def : ProcessorModel<"z16", Z16Model, Arch14SupportedFeatures.List>;
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 48cec176b006..be65fe55c634 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -290,8 +290,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   MachineBasicBlock &MBB = *MI->getParent();
   MachineFunction &MF = *MBB.getParent();
-  auto *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  auto *TII = MF.getSubtarget<SystemZSubtarget>().getInstrInfo();
   const SystemZFrameLowering *TFI = getFrameLowering(MF);
   DebugLoc DL = MI->getDebugLoc();
@@ -321,7 +320,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   // See if the offset is in range, or if an equivalent instruction that
   // accepts the offset exists.
   unsigned Opcode = MI->getOpcode();
-  unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+  unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset, &*MI);
   if (OpcodeForOffset) {
     if (OpcodeForOffset == SystemZ::LE &&
         MF.getSubtarget<SystemZSubtarget>().hasVector()) {
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 8ce01074873a..93ffa9847f06 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -74,7 +74,7 @@ public:

   /// Destroys the object.  Bogus destructor allowing derived classes
   /// to override it.
-  virtual ~SystemZCallingConventionRegisters(){};
+  virtual ~SystemZCallingConventionRegisters() = default;
 };

 /// XPLINK64 calling convention specific use registers
@@ -102,7 +102,7 @@ public:
   int getStackPointerBias() override final { return 2048; }

   /// Destroys the object.  Bogus destructor overriding base class destructor
-  ~SystemZXPLINK64Registers(){};
+  ~SystemZXPLINK64Registers() = default;
 };

 /// ELF calling convention specific use registers
@@ -128,7 +128,7 @@ public:
   int getStackPointerBias() override final { return 0; }

   /// Destroys the object.  Bogus destructor overriding base class destructor
-  ~SystemZELFRegisters(){};
+  ~SystemZELFRegisters() = default;
 };

 struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
diff --git a/llvm/lib/Target/SystemZ/SystemZSchedule.td b/llvm/lib/Target/SystemZ/SystemZSchedule.td
index 119e3ee7c22c..d683cc042e5c 100644
--- a/llvm/lib/Target/SystemZ/SystemZSchedule.td
+++ b/llvm/lib/Target/SystemZ/SystemZSchedule.td
@@ -53,12 +53,14 @@ foreach Num = ["", "2", "3", "4", "5", "6"] in {
   def "DFU"#Num : SchedWrite;
 }

-def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
+def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit (30 cycles).
+def VecFPd20 : SchedWrite; // Blocking BFP div/sqrt unit, 20 cycles.

 def VBU : SchedWrite; // Virtual branching unit

 def MCD : SchedWrite; // Millicode

+include "SystemZScheduleZ16.td"
 include "SystemZScheduleZ15.td"
 include "SystemZScheduleZ14.td"
 include "SystemZScheduleZ13.td"
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index f4777b0097f1..fd01a8a941c9 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -168,12 +168,12 @@ def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;

 // Call
 def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;

 // Return
-def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn(_XPLINK)?$")>;

 //===----------------------------------------------------------------------===//
 // Move instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index f74c0d594482..3f406736a71f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -169,12 +169,12 @@ def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;

 // Call
 def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;

 // Return
-def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn(_XPLINK)?$")>;

 //===----------------------------------------------------------------------===//
 // Move instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index d17e58fc6318..6ae911c3f3eb 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -169,12 +169,12 @@ def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
 // Call
 def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
-def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
 def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;

 // Return
-def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn(_XPLINK)?$")>;

 //===----------------------------------------------------------------------===//
 // Move instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
new file mode 100644
index 000000000000..ca688671a7e2
--- /dev/null
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -0,0 +1,1728 @@
+//-- SystemZScheduleZ16.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z16 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
+//===----------------------------------------------------------------------===//
+
+def Z16Model : SchedMachineModel {
+
+    let UnsupportedFeatures = Arch14UnsupportedFeatures.List;
+
+    let IssueWidth = 6;              // Number of instructions decoded per cycle.
+    let MicroOpBufferSize = 60;      // Issue queues
+    let LoadLatency = 1;             // Optimistic load latency.
+
+    let PostRAScheduler = 1;
+
+    // Extra cycles for a mispredicted branch.
+    let MispredictPenalty = 20;
+}
+
+let SchedModel = Z16Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+  def : WriteRes<NormalGr, []>;
+  def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+  def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+  let NumMicroOps = 2;
+  let BeginGroup = 1;
+}
+def : WriteRes<GroupAlone, []> {
+  let NumMicroOps = 3;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes<GroupAlone2, []> {
+  let NumMicroOps = 6;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes<GroupAlone3, []> {
+  let NumMicroOps = 9;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+  foreach L = 1-30 in
+    def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
+
+// Execution units.
+def Z16_FXaUnit : ProcResource<2>;
+def Z16_FXbUnit : ProcResource<2>;
+def Z16_LSUnit : ProcResource<2>;
+def Z16_VecUnit : ProcResource<2>;
+def Z16_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z16_VBUnit : ProcResource<2>;
+def Z16_MCD : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+let NumMicroOps = 0 in {
+  def : WriteRes<FXa, [Z16_FXaUnit]>;
+  def : WriteRes<FXb, [Z16_FXbUnit]>;
+  def : WriteRes<LSU, [Z16_LSUnit]>;
+  def : WriteRes<VecBF, [Z16_VecUnit]>;
+  def : WriteRes<VecDF, [Z16_VecUnit]>;
+  def : WriteRes<VecDFX, [Z16_VecUnit]>;
+  def : WriteRes<VecMul, [Z16_VecUnit]>;
+  def : WriteRes<VecStr, [Z16_VecUnit]>;
+  def : WriteRes<VecXsPm, [Z16_VecUnit]>;
+  foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+    def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z16_FXaUnit]>;
+    def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z16_FXbUnit]>;
+    def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z16_LSUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z16_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z16_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z16_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z16_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z16_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z16_VecUnit]>;
+  }}
+
+  def : WriteRes<VecFPd, [Z16_VecFPdUnit]> { let ResourceCycles = [30]; }
+  def : WriteRes<VecFPd20, [Z16_VecFPdUnit]> { let ResourceCycles = [20]; }
+
+  def : WriteRes<VBU, [Z16_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Z16_MCD]> { let NumMicroOps = 3;
+                                 let BeginGroup = 1;
+                                 let EndGroup = 1; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
+             (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn(_XPLINK)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>;
+
+// Pseudo -> reg move
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
+
+// Loads
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
+
+// Load and trap
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+             (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
+
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
+
+// Load and trap
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
+
+// Load multiple disjoint
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
+
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address ( -> larl )
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+             (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+             (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "ALC(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "S(G|Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "SLB(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "N(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "O(G|Y)?$")>; +def : 
InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "X(G|Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Combined logical operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MS(GF|Y)?$")>; +def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>; +def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>; +def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>; +def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>; +def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>; +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>; +def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone], + (instregex "M(FY|L)?$")>; +def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>; +def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSC$")>; +def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSGC$")>; +def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>; +def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>; + 
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2],
+             (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2],
+             (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2],
+             (instregex "S(L|R)D(A|L)$")>;
+
+// Rotate
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+             (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+             (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
FXb, NormalGr], (instregex "CLR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>; +def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], + (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], + (instregex "CDS(Y)?$")>; +def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, + GroupAlone3], (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[WLat30, MCD], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[WLat30, MCD], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>; + +// Load pair disjoint +def : 
+
+//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2],
+             (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+             (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+             (instregex "KM(C|F|O|CTR|A)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+             (instregex "(KIMD|KLMD|KMAC|KDSA)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+             (instregex "(PCC|PPNO|PRNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Guarded storage
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>;
+def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2],
+             (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2],
+             (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2],
+             (instregex "(A|S|ZA)P$")>;
+def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>;
+
+// Transaction end
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>;
+
+// String instructions
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+             (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "NNPA$")>;
+
+// Execute
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; + +// Load +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; +def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], + (instregex "LTXBR(Compare)?$")>; + +// Copy sign +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>; + +// Load lengthened +def : InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], + (instregex "C(F|G)(E|D)BR(A)?$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], + (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / 
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "SQEBR$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQDBR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "A(E|D)B$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "S(E|D)B$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+             (instregex "MXDB$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+             (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+             (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[WLat20, RegReadAdv, VecFPd20, LSU, NormalGr], (instregex "DEB$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "DDB$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "DEBR$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "DDBR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
+
+// Divide to integer
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+             (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
+
+// Test Data Class
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[WLat30, MCD], (instregex "SFASR$")>; +def : InstRW<[WLat30, MCD], (instregex "LFAS$")>; +def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>; + +// Convert from fixed +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "SQER$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQDR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+             (instregex "MXD$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+             (instregex "MY(H|L)$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+             (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+             (instregex "MAY$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+             (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
+
+// Division
+def : InstRW<[WLat20, RegReadAdv, VecFPd20, LSU, NormalGr], (instregex "DE$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "DD$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "DER$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "DDR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+             (instregex "C(E|D)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>;
+def : InstRW<[WLat20, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat20, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>;
+def : InstRW<[WLat20, FXb, VecDF, Cracked], (instregex "CDLGTR$")>;
+def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>;
+def : InstRW<[WLat20, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[WLat20, WLat20, FXb, VecDF, Cracked],
+             (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat20, WLat20, FXb, VecDF2, Cracked],
+             (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat20, WLat20, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat20, WLat20, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
+
+// Convert from / to packed
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[WLat20, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+
+// --------------------------------- Vector --------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// Vector: Move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Immediate instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Loads
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+             (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+             (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone],
+             (instregex "VLM(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Stores
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+             (instregex "VLEBR(H|F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Selects and permutes
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+// Conversion and rounding
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFLLD$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>;
+
+// Sign operations
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>;
+
+// Minimum / maximum
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>;
+
+// Test data class
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
+
+// Add / subtract
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
+
+// Multiply / multiply-and-add/subtract
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
+def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
+
+// Divide / square root
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "WFDSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFDSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "WFSQSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+             (instregex "WF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+             (instregex "VF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>;
+def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: String instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+             (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+             (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>;
+
+//===----------------------------------------------------------------------===//
+// NNP assist instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCFN$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLFN(L|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VC(R)?NF$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Packed-decimal instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>;
+def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>;
+def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone],
+             (instregex "VCVB(G)?(Opt)?$")>;
+def : InstRW<[WLat15, WLat15, VecDF2, FXb, GroupAlone],
+             (instregex "VCVD(G)?$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VSDP$")>;
+def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>;
+
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VSCH(S|D|X)?P$")>;
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VSCSHP$")>;
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VCSPH")>;
+def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VCLZDP")>;
+def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRPR")>;
+def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VPKZR")>;
+def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZH")>;
+def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZL")>;
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?(Y)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Breaking-Event-Address-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LBEAR")>;
+def : InstRW<[WLat1, LSU2, FXb, GroupAlone], (instregex "STBEAR")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "IRBM$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RDP(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "QPACI$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
+
+}
+
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 0f01a4291cf7..173cf960d2bd 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -147,12 +147,12 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
 // Call
 def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRAS$")>;
 def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
-def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
 def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
 
 // Return
-def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
-def : InstRW<[WLat1, LSU, EndGroup], (instregex "CondReturn$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "CondReturn(_XPLINK)?$")>;
 
 //===----------------------------------------------------------------------===//
 // Move instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 096a95a82ec8..d2060471d65e 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -152,12 +152,12 @@ def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
 // Call
 def : InstRW<[WLat1, FXU2, VBU, GroupAlone], (instregex "(Call)?BRAS$")>;
 def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>;
-def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>;
 def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
 
 // Return
-def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
-def : InstRW<[WLat1, LSU, NormalGr], (instregex "CondReturn$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return(_XPLINK)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "CondReturn(_XPLINK)?$")>;
 
 //===----------------------------------------------------------------------===//
 // Move instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index db4b4879b33a..ce30d8ef2cba 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -88,7 +88,7 @@ static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
     SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
-    MachinePointerInfo DstPtrInfo) const {
+    bool AlwaysInline, MachinePointerInfo DstPtrInfo) const {
   EVT PtrVT = Dst.getValueType();
 
   if (IsVolatile)
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index da6725777e43..6ac5bf8c6c1a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -31,7 +31,7 @@ public:
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Dst, SDValue Byte,
                                   SDValue Size, Align Alignment,
-                                  bool IsVolatile,
+                                  bool IsVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   std::pair<SDValue, SDValue>
diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
index 92930dad80ef..30b22fa1ce92 100644
--- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -162,10 +162,10 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
     MachineOperand Src(MI.getOperand(1));
     MachineOperand Suppress(MI.getOperand(2));
     MachineOperand Mode(MI.getOperand(3));
-    MI.RemoveOperand(3);
-    MI.RemoveOperand(2);
-    MI.RemoveOperand(1);
-    MI.RemoveOperand(0);
+    MI.removeOperand(3);
+    MI.removeOperand(2);
+    MI.removeOperand(1);
+    MI.removeOperand(0);
     MI.setDesc(TII->get(Opcode));
     MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
         .add(Dest)
@@ -190,9 +190,9 @@ bool SystemZShortenInst::shortenFusedFPOp(MachineInstr &MI, unsigned Opcode) {
     MachineOperand Lhs(LHSMO);
     MachineOperand Rhs(RHSMO);
     MachineOperand Src(AccMO);
-    MI.RemoveOperand(3);
-    MI.RemoveOperand(2);
-    MI.RemoveOperand(1);
+    MI.removeOperand(3);
+    MI.removeOperand(2);
+    MI.removeOperand(1);
     MI.setDesc(TII->get(Opcode));
     MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
         .add(Src)
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 75c0d454d904..f6889035b654 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -27,13 +27,14 @@ static cl::opt<bool> UseSubRegLiveness(
 // Pin the vtable to this file.
 void SystemZSubtarget::anchor() {}
 
-SystemZSubtarget &
-SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
-  StringRef CPUName = CPU;
-  if (CPUName.empty())
-    CPUName = "generic";
+SystemZSubtarget &SystemZSubtarget::initializeSubtargetDependencies(
+    StringRef CPU, StringRef TuneCPU, StringRef FS) {
+  if (CPU.empty())
+    CPU = "generic";
+  if (TuneCPU.empty())
+    TuneCPU = CPU;
   // Parse features string.
-  ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
+  ParseSubtargetFeatures(CPU, TuneCPU, FS);
 
   // -msoft-float implies -mno-vx.
   if (HasSoftFloat)
@@ -64,9 +65,10 @@ SystemZSubtarget::initializeSpecialRegisters() {
 }
 
 SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
+                                   const std::string &TuneCPU,
                                    const std::string &FS,
                                    const TargetMachine &TM)
-    : SystemZGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+    : SystemZGenSubtargetInfo(TT, CPU, TuneCPU, FS),
       HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false),
       HasFPExtension(false), HasPopulationCount(false),
      HasMessageSecurityAssist3(false), HasMessageSecurityAssist4(false),
@@ -88,8 +90,8 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
       HasResetDATProtection(false), HasProcessorActivityInstrumentation(false),
       HasSoftFloat(false), TargetTriple(TT),
       SpecialRegisters(initializeSpecialRegisters()),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
-      FrameLowering(SystemZFrameLowering::create(*this)) {}
+      InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
+      TLInfo(TM, *this), FrameLowering(SystemZFrameLowering::create(*this)) {}
 
 bool SystemZSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 98f7094fcb48..cd16c19f9bfa 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -84,12 +84,14 @@ private:
   std::unique_ptr<const SystemZFrameLowering> FrameLowering;
 
   SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef TuneCPU,
                                                     StringRef FS);
   SystemZCallingConventionRegisters *initializeSpecialRegisters();
 
 public:
   SystemZSubtarget(const Triple &TT, const std::string &CPU,
-                   const std::string &FS, const TargetMachine &TM);
+                   const std::string &TuneCPU, const std::string &FS,
+                   const TargetMachine &TM);
 
   SystemZCallingConventionRegisters *getSpecialRegisters() const {
     assert(SpecialRegisters && "Unsupported SystemZ calling convention");
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index f1469fe8f56b..31f8ee2f894d 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -118,7 +118,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   // Static code is suitable for use in a dynamic executable; there is no
   // separate DynamicNoPIC model.
-  if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+  if (!RM || *RM == Reloc::DynamicNoPIC)
     return Reloc::Static;
   return *RM;
 }
@@ -187,10 +187,13 @@ SystemZTargetMachine::~SystemZTargetMachine() = default;
 const SystemZSubtarget *
 SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
   Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute TuneAttr = F.getFnAttribute("tune-cpu");
   Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU =
       CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+  std::string TuneCPU =
+      TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
   std::string FS =
       FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
 
@@ -202,13 +205,14 @@ SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
   if (softFloat)
     FS += FS.empty() ? "+soft-float" : ",+soft-float";
 
-  auto &I = SubtargetMap[CPU + FS];
+  auto &I = SubtargetMap[CPU + TuneCPU + FS];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
-    I = std::make_unique<SystemZSubtarget>(TargetTriple, CPU, FS, *this);
+    I = std::make_unique<SystemZSubtarget>(TargetTriple, CPU, TuneCPU, FS,
+                                           *this);
   }
 
   return I.get();
@@ -334,6 +338,6 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 TargetTransformInfo
-SystemZTargetMachine::getTargetTransformInfo(const Function &F) {
+SystemZTargetMachine::getTargetTransformInfo(const Function &F) const {
   return TargetTransformInfo(SystemZTTIImpl(this, F));
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
index 9ea03e104fc9..2cdb33a5064b 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -44,7 +44,7 @@ public:
   // Override LLVMTargetMachine
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
 
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h b/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h
index a610a90d2069..1b4e93ebe39b 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h
@@ -10,6 +10,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETSTREAMER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 
 namespace llvm {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6d66ebfced05..69914049a00c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -30,6 +30,42 @@ using namespace llvm;
 //
 //===----------------------------------------------------------------------===//
 
+static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
+  bool UsedAsMemCpySource = false;
+  for (const User *U : V->users())
+    if (const Instruction *User = dyn_cast<Instruction>(U)) {
+      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
+        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
+        continue;
+      }
+      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
+        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
+          UsedAsMemCpySource = true;
+          continue;
+        }
+      }
+      OtherUse = true;
+    }
+  return UsedAsMemCpySource;
+}
+
+unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Bonus = 0;
+
+  // Increase the threshold if an incoming argument is used only as a memcpy
+  // source.
+  if (Function *Callee = CB->getCalledFunction())
+    for (Argument &Arg : Callee->args()) {
+      bool OtherUse = false;
+      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
+        Bonus += 150;
+    }
+
+  LLVM_DEBUG(if (Bonus)
+               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
+  return Bonus;
+}
+
 InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                               TTI::TargetCostKind CostKind) {
   assert(Ty->isIntegerTy());
@@ -303,8 +339,8 @@ void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
   BaseT::getPeelingPreferences(L, SE, PP);
 }
 
-bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                                   TargetTransformInfo::LSRCost &C2) {
+bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                                   const TargetTransformInfo::LSRCost &C2) {
   // SystemZ specific: check instruction count (first), and don't care about
   // ImmCost, since offsets are checked explicitly.
   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
@@ -559,7 +595,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
 
 InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                                VectorType *Tp,
                                                ArrayRef<int> Mask, int Index,
-                                               VectorType *SubTp) {
+                                               VectorType *SubTp,
+                                               ArrayRef<const Value *> Args) {
   Kind = improveShuffleKindFromMask(Kind, Mask);
   if (ST->hasVector()) {
     unsigned NumVectors = getNumVectorRegs(Tp);
@@ -781,7 +818,11 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 
     if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
       if (SrcScalarBits >= 8) {
-        // ZExt/SExt will be handled with one unpack per doubling of width.
+        // ZExt will use either a single unpack or a vector permute.
+        if (Opcode == Instruction::ZExt)
+          return NumDstVectors;
+
+        // SExt will be handled with one unpack per doubling of width.
         unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
 
         // For types that spans multiple vector registers, some additional
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index db4ec794b3e4..33317e799eab 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -37,6 +37,7 @@ public:
   /// @{
 
   unsigned getInliningThresholdMultiplier() { return 3; }
+  unsigned adjustInliningThreshold(const CallBase *CB) const;
 
   InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
@@ -58,8 +59,8 @@ public:
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
 
-  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                     TargetTransformInfo::LSRCost &C2);
+  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                     const TargetTransformInfo::LSRCost &C2);
 
   /// @}
 
   /// \name Vector TTI Implementations
@@ -92,7 +93,8 @@ public:
                 const Instruction *CxtI = nullptr);
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask, int Index,
-                                 VectorType *SubTp);
+                                 VectorType *SubTp,
+                                 ArrayRef<const Value *> Args = None);
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
   unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
diff --git a/llvm/lib/Target/TargetIntrinsicInfo.cpp b/llvm/lib/Target/TargetIntrinsicInfo.cpp
index 256514c8c22d..d44a34984c42 100644
--- a/llvm/lib/Target/TargetIntrinsicInfo.cpp
+++ b/llvm/lib/Target/TargetIntrinsicInfo.cpp
@@ -11,15 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringMapEntry.h"
 #include "llvm/IR/Function.h"
 using namespace llvm;
 
-TargetIntrinsicInfo::TargetIntrinsicInfo() {
-}
+TargetIntrinsicInfo::TargetIntrinsicInfo() = default;
 
-TargetIntrinsicInfo::~TargetIntrinsicInfo() {
-}
+TargetIntrinsicInfo::~TargetIntrinsicInfo() = default;
 
 unsigned TargetIntrinsicInfo::getIntrinsicID(const Function *F) const {
   const ValueName *ValName = F->getValueName();
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 7954f0f09faf..8f633adbb9ef 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -24,10 +24,8 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 390457dbb2bc..8d1ad617889c 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -13,17 +13,14 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/SectionKind.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 using namespace llvm;
 
@@ -63,16 +60,13 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
   RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
   RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
   RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
+  RESET_OPTION(ApproxFuncFPMath, "approx-func-fp-math");
 }
 
 /// Returns the code generation relocation model. The choices are static, PIC,
 /// and dynamic-no-pic.
 Reloc::Model TargetMachine::getRelocationModel() const { return RM; }
 
-/// Returns the code model. The choices are small, kernel, medium, large, and
-/// target default.
-CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; }
-
 /// Get the IR-specified TLS model for Var.
 static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
   switch (GV->getThreadLocalMode()) {
@@ -189,7 +183,8 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
 
 void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
 
-TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) {
+TargetTransformInfo
+TargetMachine::getTargetTransformInfo(const Function &F) const {
   return TargetTransformInfo(F.getParent()->getDataLayout());
 }
 
@@ -217,7 +212,7 @@ MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
   return TLOF->getContext().getOrCreateSymbol(NameStr);
 }
 
-TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
+TargetIRAnalysis TargetMachine::getTargetIRAnalysis() const {
   // Since Analysis can't depend on Target, use a std::function to invert the
   // dependency.
   return TargetIRAnalysis(
diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp
index 55047a1bb3cd..b8cefbe5b6b7 100644
--- a/llvm/lib/Target/TargetMachineC.cpp
+++ b/llvm/lib/Target/TargetMachineC.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-c/Core.h"
-#include "llvm-c/Target.h"
 #include "llvm-c/TargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -20,13 +19,10 @@
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/CodeGenCWrappers.h"
 #include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <cstdlib>
 #include <cstring>
 
 using namespace llvm;
 
@@ -217,7 +213,9 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
 }
 
 LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
-    char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
+                                     const char *Filename,
+                                     LLVMCodeGenFileType codegen,
+                                     char **ErrorMessage) {
   std::error_code EC;
   raw_fd_ostream dest(Filename, EC, sys::fs::OF_None);
   if (EC) {
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index 4a318e493c52..f39be036d21f 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index 72c40cbe78c4..00487a1f5bb3 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -15,8 +15,8 @@
 #include "VE.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDecoderOps.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
 
@@ -33,7 +33,7 @@ class VEDisassembler : public MCDisassembler {
 public:
   VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
       : MCDisassembler(STI, Ctx) {}
-  virtual ~VEDisassembler() {}
+  virtual ~VEDisassembler() = default;
 
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -126,7 +126,7 @@ static const unsigned MiscRegDecoderTable[] = {
 
 static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
-                                           const void *Decoder) {
+                                           const MCDisassembler *Decoder) {
   if (RegNo > 63)
     return MCDisassembler::Fail;
   unsigned Reg = I32RegDecoderTable[RegNo];
@@ -136,7 +136,7 @@ static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeI64RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
-                                           const void *Decoder) {
+                                           const MCDisassembler *Decoder) {
   if (RegNo > 63)
     return MCDisassembler::Fail;
   unsigned Reg = I64RegDecoderTable[RegNo];
@@ -146,7 +146,7 @@ static DecodeStatus DecodeI64RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeF32RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
-                                           const void *Decoder) {
+                                           const MCDisassembler *Decoder) {
   if (RegNo > 63)
     return MCDisassembler::Fail;
   unsigned Reg = F32RegDecoderTable[RegNo];
@@ -156,7 +156,7 @@ static DecodeStatus DecodeF32RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeF128RegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
-                                            const void *Decoder) {
+                                            const MCDisassembler *Decoder) {
   if (RegNo % 2 || RegNo > 63)
     return MCDisassembler::Fail;
   unsigned Reg = F128RegDecoderTable[RegNo / 2];
@@ -166,7 +166,7 @@ static DecodeStatus DecodeF128RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeV64RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
-                                           const void *Decoder) {
+                                           const MCDisassembler *Decoder) {
   unsigned Reg = VE::NoRegister;
   if (RegNo == 255)
     Reg = VE::VIX;
@@ -180,7 +180,7 @@ static DecodeStatus DecodeV64RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeVMRegisterClass(MCInst &Inst, unsigned RegNo,
                                           uint64_t Address,
-                                          const void *Decoder) {
+                                          const MCDisassembler *Decoder) {
   if (RegNo > 15)
     return MCDisassembler::Fail;
   unsigned Reg = VMRegDecoderTable[RegNo];
@@ -190,7 +190,7 @@ static DecodeStatus DecodeVMRegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeVM512RegisterClass(MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
-                                             const void *Decoder) {
+                                             const MCDisassembler *Decoder) {
   if (RegNo % 2 || RegNo > 15)
     return MCDisassembler::Fail;
   unsigned Reg = VM512RegDecoderTable[RegNo / 2];
@@ -200,7 +200,7 @@ static DecodeStatus DecodeVM512RegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
-                                            const void *Decoder) {
+                                            const MCDisassembler *Decoder) {
   if (RegNo > 30)
     return MCDisassembler::Fail;
   unsigned Reg = MiscRegDecoderTable[RegNo];
@@ -211,47 +211,56 @@ static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeASX(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTS1AMI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTS1AMI32(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeCASI64(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCASI32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSIMM7(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSIMM32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCCOperand(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeRDOperand(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeBranchCondition(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeBranchConditionAlways(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); #include "VEGenDisassemblerTables.inc" @@ -302,10 +311,10 @@ DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, } typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeASX(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned sy = fieldFromInstruction(insn, 40, 7); bool cy = fieldFromInstruction(insn, 47, 
1); unsigned sz = fieldFromInstruction(insn, 32, 7); @@ -338,7 +347,7 @@ static DecodeStatus DecodeASX(MCInst &MI, uint64_t insn, uint64_t Address, } static DecodeStatus DecodeAS(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned sz = fieldFromInstruction(insn, 32, 7); bool cz = fieldFromInstruction(insn, 39, 1); uint64_t simm32 = SignExtend64<32>(fieldFromInstruction(insn, 0, 32)); @@ -360,7 +369,7 @@ static DecodeStatus DecodeAS(MCInst &MI, uint64_t insn, uint64_t Address, } static DecodeStatus DecodeMem(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder, bool isLoad, + const MCDisassembler *Decoder, bool isLoad, DecodeFunc DecodeSX) { unsigned sx = fieldFromInstruction(insn, 48, 7); @@ -384,7 +393,7 @@ static DecodeStatus DecodeMem(MCInst &MI, uint64_t insn, uint64_t Address, } static DecodeStatus DecodeMemAS(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder, bool isLoad, + const MCDisassembler *Decoder, bool isLoad, DecodeFunc DecodeSX) { unsigned sx = fieldFromInstruction(insn, 48, 7); @@ -408,50 +417,55 @@ static DecodeStatus DecodeMemAS(MCInst &MI, uint64_t insn, uint64_t Address, } static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI32RegisterClass); } static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeI32RegisterClass); } static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI64RegisterClass); } static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeI64RegisterClass); } static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeF32RegisterClass); } static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, DecodeF32RegisterClass); } static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMemAS(Inst, insn, Address, Decoder, true, DecodeI64RegisterClass); } static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { return DecodeMemAS(Inst, insn, Address, Decoder, false, DecodeI64RegisterClass); } static DecodeStatus DecodeCAS(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder, bool isImmOnly, bool isUImm, - DecodeFunc DecodeSX) { + const MCDisassembler *Decoder, bool isImmOnly, + bool isUImm, DecodeFunc DecodeSX) { unsigned sx = fieldFromInstruction(insn, 48, 7); bool cy = fieldFromInstruction(insn, 47, 1); unsigned sy = fieldFromInstruction(insn, 40, 7); @@ -488,43 +502,43 @@ static DecodeStatus DecodeCAS(MCInst &MI, uint64_t insn, 
uint64_t Address, } static DecodeStatus DecodeTS1AMI64(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeCAS(MI, insn, Address, Decoder, false, true, DecodeI64RegisterClass); } static DecodeStatus DecodeTS1AMI32(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeCAS(MI, insn, Address, Decoder, false, true, DecodeI32RegisterClass); } static DecodeStatus DecodeCASI64(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeCAS(MI, insn, Address, Decoder, false, false, DecodeI64RegisterClass); } static DecodeStatus DecodeCASI32(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeCAS(MI, insn, Address, Decoder, false, false, DecodeI32RegisterClass); } static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI64RegisterClass); } static DecodeStatus DecodeSIMM7(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { uint64_t tgt = SignExtend64<7>(insn); MI.addOperand(MCOperand::createImm(tgt)); return MCDisassembler::Success; } static DecodeStatus DecodeSIMM32(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { uint64_t tgt = SignExtend64<32>(insn); MI.addOperand(MCOperand::createImm(tgt)); return MCDisassembler::Success; @@ -568,14 +582,14 @@ static bool isIntegerBCKind(MCInst &MI) { // Decode CC Operand field. static DecodeStatus DecodeCCOperand(MCInst &MI, uint64_t cf, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { MI.addOperand(MCOperand::createImm(VEValToCondCode(cf, isIntegerBCKind(MI)))); return MCDisassembler::Success; } // Decode RD Operand field. static DecodeStatus DecodeRDOperand(MCInst &MI, uint64_t cf, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { MI.addOperand(MCOperand::createImm(VEValToRD(cf))); return MCDisassembler::Success; } @@ -583,7 +597,7 @@ static DecodeStatus DecodeRDOperand(MCInst &MI, uint64_t cf, uint64_t Address, // Decode branch condition instruction and CCOperand field in it. static DecodeStatus DecodeBranchCondition(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned cf = fieldFromInstruction(insn, 48, 4); bool cy = fieldFromInstruction(insn, 47, 1); unsigned sy = fieldFromInstruction(insn, 40, 7); @@ -607,7 +621,7 @@ static DecodeStatus DecodeBranchCondition(MCInst &MI, uint64_t insn, static DecodeStatus DecodeBranchConditionAlways(MCInst &MI, uint64_t insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // Decode MEMri. 
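+  // A branch-always (unconditional BCR) encodes no comparison operand; its
+  // target is a plain AS operand (base register + 32-bit displacement), so
+  // the generic DecodeAS helper below handles it.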
return DecodeAS(MI, insn, Address, Decoder); } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index ae065407409a..1c89d6444d11 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -25,7 +25,7 @@ public: : MCELFObjectTargetWriter(/* Is64Bit */ true, OSABI, ELF::EM_VE, /* HasRelocationAddend */ true) {} - ~VEELFObjectWriter() override {} + ~VEELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h index 46b995cee840..0e2d55c0182e 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h @@ -20,28 +20,28 @@ enum Fixups { /// fixup_ve_srel32 - 32-bit fixup corresponding to foo for relative branch fixup_ve_srel32, - /// fixup_ve_hi32 - 32-bit fixup corresponding to foo@hi + /// fixup_ve_hi32 - 32-bit fixup corresponding to foo\@hi fixup_ve_hi32, - /// fixup_ve_lo32 - 32-bit fixup corresponding to foo@lo + /// fixup_ve_lo32 - 32-bit fixup corresponding to foo\@lo fixup_ve_lo32, - /// fixup_ve_pc_hi32 - 32-bit fixup corresponding to foo@pc_hi + /// fixup_ve_pc_hi32 - 32-bit fixup corresponding to foo\@pc_hi fixup_ve_pc_hi32, - /// fixup_ve_pc_lo32 - 32-bit fixup corresponding to foo@pc_lo + /// fixup_ve_pc_lo32 - 32-bit fixup corresponding to foo\@pc_lo fixup_ve_pc_lo32, - /// fixup_ve_got_hi32 - 32-bit fixup corresponding to foo@got_hi + /// fixup_ve_got_hi32 - 32-bit fixup corresponding to foo\@got_hi fixup_ve_got_hi32, - /// fixup_ve_got_lo32 - 32-bit fixup corresponding to foo@got_lo + /// fixup_ve_got_lo32 - 32-bit fixup corresponding to foo\@got_lo fixup_ve_got_lo32, - /// fixup_ve_gotoff_hi32 - 32-bit fixup corresponding to foo@gotoff_hi + /// fixup_ve_gotoff_hi32 - 32-bit fixup corresponding to foo\@gotoff_hi fixup_ve_gotoff_hi32, - /// fixup_ve_gotoff_lo32 - 32-bit fixup corresponding to foo@gotoff_lo + /// fixup_ve_gotoff_lo32 - 32-bit fixup corresponding to foo\@gotoff_lo fixup_ve_gotoff_lo32, /// fixup_ve_plt_hi32/lo32 diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index 65bb0cf8b0d7..3eb246f73679 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -159,7 +159,6 @@ uint64_t VEMCCodeEmitter::getRDOpValue(const MCInst &MI, unsigned OpNo, #include "VEGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new VEMCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp index 4d45918ad0aa..a1045107a832 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h index f0bb6e3acdee..d8f9d0634c24 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -28,8 +28,7 @@ class MCSubtargetInfo; class MCTargetOptions; class Target; 
-MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); +MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createVEAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, const MCTargetOptions &Options); diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h index 2a729a1a311c..2794d1458be7 100644 --- a/llvm/lib/Target/VE/VE.h +++ b/llvm/lib/Target/VE/VE.h @@ -27,7 +27,6 @@ class MCInst; class MachineInstr; FunctionPass *createVEISelDag(VETargetMachine &TM); -FunctionPass *createVEPromoteToI1Pass(); FunctionPass *createLVLGenPass(); void LowerVEMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, @@ -370,5 +369,8 @@ inline static uint64_t mimm2Val(uint64_t Val) { inline unsigned M0(unsigned Val) { return Val + 64; } inline unsigned M1(unsigned Val) { return Val; } +static const unsigned StandardVectorWidth = 256; +static const unsigned PackedVectorWidth = 512; + } // namespace llvm #endif diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp index af3e4af13814..8f11eba6d5fd 100644 --- a/llvm/lib/Target/VE/VECustomDAG.cpp +++ b/llvm/lib/Target/VE/VECustomDAG.cpp @@ -19,17 +19,52 @@ namespace llvm { -static const int StandardVectorWidth = 256; - bool isPackedVectorType(EVT SomeVT) { if (!SomeVT.isVector()) return false; return SomeVT.getVectorNumElements() > StandardVectorWidth; } +MVT splitVectorType(MVT VT) { + if (!VT.isVector()) + return VT; + return MVT::getVectorVT(VT.getVectorElementType(), StandardVectorWidth); +} + +MVT getLegalVectorType(Packing P, MVT ElemVT) { + return MVT::getVectorVT(ElemVT, P == Packing::Normal ? StandardVectorWidth + : PackedVectorWidth); +} + +Packing getTypePacking(EVT VT) { + assert(VT.isVector()); + return isPackedVectorType(VT) ? Packing::Dense : Packing::Normal; +} + +bool isMaskType(EVT SomeVT) { + if (!SomeVT.isVector()) + return false; + return SomeVT.getVectorElementType() == MVT::i1; +} + +bool isMaskArithmetic(SDValue Op) { + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return isMaskType(Op.getValueType()); + } +} + /// \returns the VVP_* SDNode opcode corresponding to \p OC.
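+/// For example, assuming the usual VVPNodes.def entries, both ISD::ADD and
+/// its predicated twin ISD::VP_ADD are expected to map to VEISD::VVP_ADD
+/// here; opcodes without a VVP counterpart yield None.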
Optional getVVPOpcode(unsigned Opcode) { switch (Opcode) { + case ISD::MLOAD: + return VEISD::VVP_LOAD; + case ISD::MSTORE: + return VEISD::VVP_STORE; #define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \ case ISD::VPOPC: \ return VEISD::VVPNAME; @@ -38,10 +73,76 @@ Optional getVVPOpcode(unsigned Opcode) { case ISD::SDNAME: \ return VEISD::VVPNAME; #include "VVPNodes.def" + // TODO: Map those in VVPNodes.def too + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + return VEISD::VVP_LOAD; + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + return VEISD::VVP_STORE; } return None; } +bool maySafelyIgnoreMask(SDValue Op) { + auto VVPOpc = getVVPOpcode(Op->getOpcode()); + auto Opc = VVPOpc.value_or(Op->getOpcode()); + + switch (Opc) { + case VEISD::VVP_SDIV: + case VEISD::VVP_UDIV: + case VEISD::VVP_FDIV: + case VEISD::VVP_SELECT: + return false; + + default: + return true; + } +} + +bool supportsPackedMode(unsigned Opcode, EVT IdiomVT) { + bool IsPackedOp = isPackedVectorType(IdiomVT); + bool IsMaskOp = isMaskType(IdiomVT); + switch (Opcode) { + default: + return false; + + case VEISD::VEC_BROADCAST: + return true; +#define REGISTER_PACKED(VVP_NAME) case VEISD::VVP_NAME: +#include "VVPNodes.def" + return IsPackedOp && !IsMaskOp; + } +} + +bool isPackingSupportOpcode(unsigned Opc) { + switch (Opc) { + case VEISD::VEC_PACK: + case VEISD::VEC_UNPACK_LO: + case VEISD::VEC_UNPACK_HI: + return true; + } + return false; +} + +bool isVVPOrVEC(unsigned Opcode) { + switch (Opcode) { + case VEISD::VEC_BROADCAST: +#define ADD_VVP_OP(VVPNAME, ...) case VEISD::VVPNAME: +#include "VVPNodes.def" + return true; + } + return false; +} + +bool isVVPUnaryOp(unsigned VVPOpcode) { + switch (VVPOpcode) { +#define ADD_UNARY_VVP_OP(VVPNAME, ...) \ + case VEISD::VVPNAME: \ + return true; +#include "VVPNodes.def" + } + return false; +} + bool isVVPBinaryOp(unsigned VVPOpcode) { switch (VVPOpcode) { #define ADD_BINARY_VVP_OP(VVPNAME, ...) \ @@ -52,16 +153,308 @@ bool isVVPBinaryOp(unsigned VVPOpcode) { return false; } +bool isVVPReductionOp(unsigned Opcode) { + switch (Opcode) { +#define ADD_REDUCE_VVP_OP(VVP_NAME, SDNAME) case VEISD::VVP_NAME: +#include "VVPNodes.def" + return true; + } + return false; +} + +// Return the AVL operand position for this VVP or VEC Op. +Optional getAVLPos(unsigned Opc) { + // This is only available for VP SDNodes + auto PosOpt = ISD::getVPExplicitVectorLengthIdx(Opc); + if (PosOpt) + return *PosOpt; + + // VVP Opcodes. + if (isVVPBinaryOp(Opc)) + return 3; + + // VM Opcodes. + switch (Opc) { + case VEISD::VEC_BROADCAST: + return 1; + case VEISD::VVP_SELECT: + return 3; + case VEISD::VVP_LOAD: + return 4; + case VEISD::VVP_STORE: + return 5; + } + + return None; +} + +Optional getMaskPos(unsigned Opc) { + // This is only available for VP SDNodes + auto PosOpt = ISD::getVPMaskIdx(Opc); + if (PosOpt) + return *PosOpt; + + // VVP Opcodes. + if (isVVPBinaryOp(Opc)) + return 2; + + // Other opcodes. 
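+  // (Masked memory SDNodes keep their mask at a fixed operand index:
+  // MSTORE is (chain, value, base, offset, mask) and MLOAD is
+  // (chain, base, offset, mask, passthru), hence 4 and 3 below.)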
+ switch (Opc) { + case ISD::MSTORE: + return 4; + case ISD::MLOAD: + return 3; + case VEISD::VVP_SELECT: + return 2; + } + + return None; +} + +bool isLegalAVL(SDValue AVL) { return AVL->getOpcode() == VEISD::LEGALAVL; } + +/// Node Properties { + +SDValue getNodeChain(SDValue Op) { + if (MemSDNode *MemN = dyn_cast(Op.getNode())) + return MemN->getChain(); + + switch (Op->getOpcode()) { + case VEISD::VVP_LOAD: + case VEISD::VVP_STORE: + return Op->getOperand(0); + } + return SDValue(); +} + +SDValue getMemoryPtr(SDValue Op) { + if (auto *MemN = dyn_cast(Op.getNode())) + return MemN->getBasePtr(); + + switch (Op->getOpcode()) { + case VEISD::VVP_LOAD: + return Op->getOperand(1); + case VEISD::VVP_STORE: + return Op->getOperand(2); + } + return SDValue(); +} + +Optional getIdiomaticVectorType(SDNode *Op) { + unsigned OC = Op->getOpcode(); + + // For memory ops -> the transfered data type + if (auto MemN = dyn_cast(Op)) + return MemN->getMemoryVT(); + + switch (OC) { + // Standard ISD. + case ISD::SELECT: // not aliased with VVP_SELECT + case ISD::CONCAT_VECTORS: + case ISD::EXTRACT_SUBVECTOR: + case ISD::VECTOR_SHUFFLE: + case ISD::BUILD_VECTOR: + case ISD::SCALAR_TO_VECTOR: + return Op->getValueType(0); + } + + // Translate to VVP where possible. + unsigned OriginalOC = OC; + if (auto VVPOpc = getVVPOpcode(OC)) + OC = *VVPOpc; + + if (isVVPReductionOp(OC)) + return Op->getOperand(hasReductionStartParam(OriginalOC) ? 1 : 0) + .getValueType(); + + switch (OC) { + default: + case VEISD::VVP_SETCC: + return Op->getOperand(0).getValueType(); + + case VEISD::VVP_SELECT: +#define ADD_BINARY_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME: +#include "VVPNodes.def" + return Op->getValueType(0); + + case VEISD::VVP_LOAD: + return Op->getValueType(0); + + case VEISD::VVP_STORE: + return Op->getOperand(1)->getValueType(0); + + // VEC + case VEISD::VEC_BROADCAST: + return Op->getValueType(0); + } +} + +SDValue getLoadStoreStride(SDValue Op, VECustomDAG &CDAG) { + switch (Op->getOpcode()) { + case VEISD::VVP_STORE: + return Op->getOperand(3); + case VEISD::VVP_LOAD: + return Op->getOperand(2); + } + + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getStride(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getStride(); + + if (isa(Op.getNode())) { + // Regular MLOAD/MSTORE/LOAD/STORE + // No stride argument -> use the contiguous element size as stride. 
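+    // E.g. a contiguous v256i64 access yields a synthesized stride of 8
+    // bytes, a v256i32 access one of 4 bytes.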
+ uint64_t ElemStride = getIdiomaticVectorType(Op.getNode()) + ->getVectorElementType() + .getStoreSize(); + return CDAG.getConstant(ElemStride, MVT::i64); + } + return SDValue(); +} + +SDValue getGatherScatterIndex(SDValue Op) { + if (auto *N = dyn_cast(Op.getNode())) + return N->getIndex(); + if (auto *N = dyn_cast(Op.getNode())) + return N->getIndex(); + return SDValue(); +} + +SDValue getGatherScatterScale(SDValue Op) { + if (auto *N = dyn_cast(Op.getNode())) + return N->getScale(); + if (auto *N = dyn_cast(Op.getNode())) + return N->getScale(); + return SDValue(); +} + +SDValue getStoredValue(SDValue Op) { + switch (Op->getOpcode()) { + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + case VEISD::VVP_STORE: + return Op->getOperand(1); + } + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + return SDValue(); +} + +SDValue getNodePassthru(SDValue Op) { + if (auto *N = dyn_cast(Op.getNode())) + return N->getPassThru(); + if (auto *N = dyn_cast(Op.getNode())) + return N->getPassThru(); + + return SDValue(); +} + +bool hasReductionStartParam(unsigned OPC) { + // TODO: Ordered reduction opcodes. + if (ISD::isVPReduction(OPC)) + return true; + return false; +} + +unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask) { + assert(!IsMask && "Mask reduction isel"); + + switch (VVPOC) { +#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD) \ + case VEISD::VVP_RED_ISD: \ + return ISD::REDUCE_ISD; +#include "VVPNodes.def" + default: + break; + } + llvm_unreachable("Cannot not scalarize this reduction Opcode!"); +} + +/// } Node Properties + +SDValue getNodeAVL(SDValue Op) { + auto PosOpt = getAVLPos(Op->getOpcode()); + return PosOpt ? Op->getOperand(*PosOpt) : SDValue(); +} + +SDValue getNodeMask(SDValue Op) { + auto PosOpt = getMaskPos(Op->getOpcode()); + return PosOpt ? Op->getOperand(*PosOpt) : SDValue(); +} + +std::pair getAnnotatedNodeAVL(SDValue Op) { + SDValue AVL = getNodeAVL(Op); + if (!AVL) + return {SDValue(), true}; + if (isLegalAVL(AVL)) + return {AVL->getOperand(0), true}; + return {AVL, false}; +} + SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget, bool IsOpaque) const { return DAG.getConstant(Val, DL, VT, IsTarget, IsOpaque); } +SDValue VECustomDAG::getConstantMask(Packing Packing, bool AllTrue) const { + auto MaskVT = getLegalVectorType(Packing, MVT::i1); + + // VEISelDAGtoDAG will replace this pattern with the constant-true VM. + auto TrueVal = DAG.getConstant(-1, DL, MVT::i32); + auto AVL = getConstant(MaskVT.getVectorNumElements(), MVT::i32); + auto Res = getNode(VEISD::VEC_BROADCAST, MaskVT, {TrueVal, AVL}); + if (AllTrue) + return Res; + + return DAG.getNOT(DL, Res, Res.getValueType()); +} + +SDValue VECustomDAG::getMaskBroadcast(EVT ResultVT, SDValue Scalar, + SDValue AVL) const { + // Constant mask splat. + if (auto BcConst = dyn_cast(Scalar)) + return getConstantMask(getTypePacking(ResultVT), + BcConst->getSExtValue() != 0); + + // Expand the broadcast to a vector comparison. + auto ScalarBoolVT = Scalar.getSimpleValueType(); + assert(ScalarBoolVT == MVT::i32); + + // Cast to i32 ty. 
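+  // (The assert above already guarantees an i32 scalar, so the getSExtOrTrunc
+  // below is currently a no-op; it merely keeps this path correct should
+  // other scalar bool widths ever be allowed.)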
+ SDValue CmpElem = DAG.getSExtOrTrunc(Scalar, DL, MVT::i32); + unsigned ElemCount = ResultVT.getVectorNumElements(); + MVT CmpVecTy = MVT::getVectorVT(ScalarBoolVT, ElemCount); + + // Broadcast to vector. + SDValue BCVec = + DAG.getNode(VEISD::VEC_BROADCAST, DL, CmpVecTy, {CmpElem, AVL}); + SDValue ZeroVec = + getBroadcast(CmpVecTy, {DAG.getConstant(0, DL, ScalarBoolVT)}, AVL); + + MVT BoolVecTy = MVT::getVectorVT(MVT::i1, ElemCount); + + // Broadcast(Data) != Broadcast(0) + // TODO: Use a VVP operation for this. + return DAG.getSetCC(DL, BoolVecTy, BCVec, ZeroVec, ISD::CondCode::SETNE); +} + SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar, SDValue AVL) const { assert(ResultVT.isVector()); auto ScaVT = Scalar.getValueType(); - assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts"); + + if (isMaskType(ResultVT)) + return getMaskBroadcast(ResultVT, Scalar, AVL); if (isPackedVectorType(ResultVT)) { // v512x packed mode broadcast @@ -78,4 +471,119 @@ SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar, return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL}); } +SDValue VECustomDAG::annotateLegalAVL(SDValue AVL) const { + if (isLegalAVL(AVL)) + return AVL; + return getNode(VEISD::LEGALAVL, AVL.getValueType(), AVL); +} + +SDValue VECustomDAG::getUnpack(EVT DestVT, SDValue Vec, PackElem Part, + SDValue AVL) const { + assert(getAnnotatedNodeAVL(AVL).second && "Expected a pack-legalized AVL"); + + // TODO: Peek through VEC_PACK and VEC_BROADCAST(REPL_ ..) operands. + unsigned OC = + (Part == PackElem::Lo) ? VEISD::VEC_UNPACK_LO : VEISD::VEC_UNPACK_HI; + return DAG.getNode(OC, DL, DestVT, Vec, AVL); +} + +SDValue VECustomDAG::getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, + SDValue AVL) const { + assert(getAnnotatedNodeAVL(AVL).second && "Expected a pack-legalized AVL"); + + // TODO: Peek through VEC_UNPACK_LO|HI operands. + return DAG.getNode(VEISD::VEC_PACK, DL, DestVT, LoVec, HiVec, AVL); +} + +VETargetMasks VECustomDAG::getTargetSplitMask(SDValue RawMask, SDValue RawAVL, + PackElem Part) const { + // Adjust AVL for this part + SDValue NewAVL; + SDValue OneV = getConstant(1, MVT::i32); + if (Part == PackElem::Hi) + NewAVL = getNode(ISD::ADD, MVT::i32, {RawAVL, OneV}); + else + NewAVL = RawAVL; + NewAVL = getNode(ISD::SRL, MVT::i32, {NewAVL, OneV}); + + NewAVL = annotateLegalAVL(NewAVL); + + // Legalize Mask (unpack or all-true) + SDValue NewMask; + if (!RawMask) + NewMask = getConstantMask(Packing::Normal, true); + else + NewMask = getUnpack(MVT::v256i1, RawMask, Part, NewAVL); + + return VETargetMasks(NewMask, NewAVL); +} + +SDValue VECustomDAG::getSplitPtrOffset(SDValue Ptr, SDValue ByteStride, + PackElem Part) const { + // High starts at base ptr but has more significant bits in the 64bit vector + // element. + if (Part == PackElem::Hi) + return Ptr; + return getNode(ISD::ADD, MVT::i64, {Ptr, ByteStride}); +} + +SDValue VECustomDAG::getSplitPtrStride(SDValue PackStride) const { + if (auto ConstBytes = dyn_cast(PackStride)) + return getConstant(2 * ConstBytes->getSExtValue(), MVT::i64); + return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)}); +} + +SDValue VECustomDAG::getGatherScatterAddress(SDValue BasePtr, SDValue Scale, + SDValue Index, SDValue Mask, + SDValue AVL) const { + EVT IndexVT = Index.getValueType(); + + // Apply scale. 
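+  // (Scale is the constant byte multiplier of the gather/scatter index,
+  // typically the element store size, e.g. 8 for f64 data; a non-unit Scale
+  // is applied by multiplying the index vector with a Scale splat below.)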
+ SDValue ScaledIndex; + if (!Scale || isOneConstant(Scale)) + ScaledIndex = Index; + else { + SDValue ScaleBroadcast = getBroadcast(IndexVT, Scale, AVL); + ScaledIndex = + getNode(VEISD::VVP_MUL, IndexVT, {Index, ScaleBroadcast, Mask, AVL}); + } + + // Add basePtr. + if (isNullConstant(BasePtr)) + return ScaledIndex; + + // re-constitute pointer vector (basePtr + index * scale) + SDValue BaseBroadcast = getBroadcast(IndexVT, BasePtr, AVL); + auto ResPtr = + getNode(VEISD::VVP_ADD, IndexVT, {BaseBroadcast, ScaledIndex, Mask, AVL}); + return ResPtr; +} + +SDValue VECustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT, + SDValue StartV, SDValue VectorV, + SDValue Mask, SDValue AVL, + SDNodeFlags Flags) const { + + // Optionally attach the start param with a scalar op (where it is + // unsupported). + bool scalarizeStartParam = StartV && !hasReductionStartParam(VVPOpcode); + bool IsMaskReduction = isMaskType(VectorV.getValueType()); + assert(!IsMaskReduction && "TODO Implement"); + auto AttachStartValue = [&](SDValue ReductionResV) { + if (!scalarizeStartParam) + return ReductionResV; + auto ScalarOC = getScalarReductionOpcode(VVPOpcode, IsMaskReduction); + return getNode(ScalarOC, ResVT, {StartV, ReductionResV}); + }; + + // Fixup: Always use sequential 'fmul' reduction. + if (!scalarizeStartParam && StartV) { + assert(hasReductionStartParam(VVPOpcode)); + return AttachStartValue( + getNode(VVPOpcode, ResVT, {StartV, VectorV, Mask, AVL}, Flags)); + } else + return AttachStartValue( + getNode(VVPOpcode, ResVT, {VectorV, Mask, AVL}, Flags)); +} + } // namespace llvm diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h index ddd6ce783366..0d35c098048e 100644 --- a/llvm/lib/Target/VE/VECustomDAG.h +++ b/llvm/lib/Target/VE/VECustomDAG.h @@ -23,10 +23,122 @@ namespace llvm { Optional getVVPOpcode(unsigned Opcode); +bool isVVPUnaryOp(unsigned Opcode); bool isVVPBinaryOp(unsigned Opcode); +bool isVVPReductionOp(unsigned Opcode); + +MVT splitVectorType(MVT VT); bool isPackedVectorType(EVT SomeVT); +bool isMaskType(EVT SomeVT); + +bool isMaskArithmetic(SDValue Op); + +bool isVVPOrVEC(unsigned); + +bool supportsPackedMode(unsigned Opcode, EVT IdiomVT); + +bool isPackingSupportOpcode(unsigned Opc); + +bool maySafelyIgnoreMask(SDValue Op); + +/// The VE backend uses a two-stage process to lower and legalize vector +/// instructions: +// +/// 1. VP and standard vector SDNodes are lowered to SDNodes of the VVP_* layer. +// +// All VVP nodes have a mask and an Active Vector Length (AVL) parameter. +// The AVL parameter refers to the element position in the vector the VVP +// node operates on. +// +// +// 2. The VVP SDNodes are legalized. The AVL in a legal VVP node refers to +// chunks of 64bit. We track this by wrapping the AVL in a LEGALAVL node. +// +// The AVL mechanism in the VE architecture always refers to chunks of +// 64bit, regardless of the actual element type vector instructions are +// operating on. For vector types v256.32 or v256.64 nothing needs to be +// legalized since each element occupies a 64bit chunk - there is no +// difference between counting 64bit chunks or element positions. However, +// all vector types with > 256 elements store more than one logical element +// per 64bit chunk and need to be transformed. +// Regardless of how legalization is performed, the resulting legal VVP +// SDNodes will +// have a LEGALAVL node as their AVL operand.
The LEGALAVL nodes wraps +// around an AVL that refers to 64 bit chunks just as the architecture +// demands - that is, the wrapped AVL is the correct setting for the VL +// register for this VVP operation to get the desired behavior. +// +/// AVL Functions { +// The AVL operand position of this node. +Optional getAVLPos(unsigned); + +// Whether this is a LEGALAVL node. +bool isLegalAVL(SDValue AVL); + +// The AVL operand of this node. +SDValue getNodeAVL(SDValue); + +// Mask position of this node. +Optional getMaskPos(unsigned); + +SDValue getNodeMask(SDValue); + +// Return the AVL operand of this node. If it is a LEGALAVL node, unwrap it. +// Return with the boolean whether unwrapping happened. +std::pair getAnnotatedNodeAVL(SDValue); + +/// } AVL Functions + +/// Node Properties { + +Optional getIdiomaticVectorType(SDNode *Op); + +SDValue getLoadStoreStride(SDValue Op, VECustomDAG &CDAG); + +SDValue getMemoryPtr(SDValue Op); + +SDValue getNodeChain(SDValue Op); + +SDValue getStoredValue(SDValue Op); + +SDValue getNodePassthru(SDValue Op); + +SDValue getGatherScatterIndex(SDValue Op); + +SDValue getGatherScatterScale(SDValue Op); + +unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask); + +// Whether this VP_REDUCE_*/ VECREDUCE_*/VVP_REDUCE_* SDNode has a start +// parameter. +bool hasReductionStartParam(unsigned VVPOC); + +/// } Node Properties + +enum class Packing { + Normal = 0, // 256 element standard mode. + Dense = 1 // 512 element packed mode. +}; + +// Get the vector or mask register type for this packing and element type. +MVT getLegalVectorType(Packing P, MVT ElemVT); + +// Whether this type belongs to a packed mask or vector register. +Packing getTypePacking(EVT); + +enum class PackElem : int8_t { + Lo = 0, // Integer (63, 32] + Hi = 1 // Float (32, 0] +}; + +struct VETargetMasks { + SDValue Mask; + SDValue AVL; + VETargetMasks(SDValue Mask = SDValue(), SDValue AVL = SDValue()) + : Mask(Mask), AVL(AVL) {} +}; + class VECustomDAG { SelectionDAG &DAG; SDLoc DL; @@ -68,10 +180,42 @@ public: SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); } /// } getNode + /// Legalizing getNode { + SDValue getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT, SDValue StartV, + SDValue VectorV, SDValue Mask, SDValue AVL, + SDNodeFlags Flags) const; + /// } Legalizing getNode + + /// Packing { + SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const; + SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const; + /// } Packing + + SDValue getMergeValues(ArrayRef Values) const { + return DAG.getMergeValues(Values, DL); + } + SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false, bool IsOpaque = false) const; + SDValue getConstantMask(Packing Packing, bool AllTrue) const; + SDValue getMaskBroadcast(EVT ResultVT, SDValue Scalar, SDValue AVL) const; SDValue getBroadcast(EVT ResultVT, SDValue Scalar, SDValue AVL) const; + + // Wrap AVL in a LEGALAVL node (unless it is one already). 
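+  // A rough sketch of both stages for a hypothetical 512-element add:
+  //   t1: v512i32 = add t0, t2
+  //     --> (lower)    v512i32 = vvp_add t0, t2, mask, AVL=512
+  //     --> (legalize) v512i32 = vvp_add t0, t2, mask, LEGALAVL(256)
+  // since the 512 packed elements occupy 256 64-bit chunks.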
+ SDValue annotateLegalAVL(SDValue AVL) const; + VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL, + PackElem Part) const; + + // Splitting support + SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride, + PackElem Part) const; + SDValue getSplitPtrStride(SDValue PackStride) const; + SDValue getGatherScatterAddress(SDValue BasePtr, SDValue Scale, SDValue Index, + SDValue Mask, SDValue AVL) const; + EVT getVectorVT(EVT ElemVT, unsigned NumElems) const { + return EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems); + } }; } // namespace llvm diff --git a/llvm/lib/Target/VE/VEISelDAGToDAG.cpp b/llvm/lib/Target/VE/VEISelDAGToDAG.cpp index e2608e82c9d4..a4319ec1c975 100644 --- a/llvm/lib/Target/VE/VEISelDAGToDAG.cpp +++ b/llvm/lib/Target/VE/VEISelDAGToDAG.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "VE.h" #include "VETargetMachine.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" @@ -335,6 +336,42 @@ void VEDAGToDAGISel::Select(SDNode *N) { } switch (N->getOpcode()) { + + // Late eliminate the LEGALAVL wrapper + case VEISD::LEGALAVL: + ReplaceNode(N, N->getOperand(0).getNode()); + return; + + // Lower (broadcast 1) and (broadcast 0) to VM[P]0 + case VEISD::VEC_BROADCAST: { + MVT SplatResTy = N->getSimpleValueType(0); + if (SplatResTy.getVectorElementType() != MVT::i1) + break; + + // Constant non-zero broadcast. + auto BConst = dyn_cast(N->getOperand(0)); + if (!BConst) + break; + bool BCTrueMask = (BConst->getSExtValue() != 0); + if (!BCTrueMask) + break; + + // Packed or non-packed. + SDValue New; + if (SplatResTy.getVectorNumElements() == StandardVectorWidth) { + New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(N), VE::VM0, + MVT::v256i1); + } else if (SplatResTy.getVectorNumElements() == PackedVectorWidth) { + New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(N), VE::VMP0, + MVT::v512i1); + } else + break; + + // Replace. 
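+    // An all-true i1 broadcast thus selects straight to the constant-true
+    // hardware mask register: VM0 for v256i1, the VMP0 pair for v512i1.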
+ ReplaceNode(N, New.getNode()); + return; + } + case VEISD::GLOBAL_BASE_REG: ReplaceNode(N, getGlobalBaseReg()); return; diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 9137c476777e..2eea65033870 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -76,6 +76,8 @@ bool VETargetLowering::CanLowerReturn( static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64, MVT::v256f32, MVT::v512f32, MVT::v256f64}; +static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1}; + static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32}; void VETargetLowering::initRegisterClasses() { @@ -294,6 +296,12 @@ void VETargetLowering::initSPUActions() { } void VETargetLowering::initVPUActions() { + for (MVT LegalMaskVT : AllMaskVTs) + setOperationAction(ISD::BUILD_VECTOR, LegalMaskVT, Custom); + + for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR}) + setOperationAction(Opc, MVT::v512i1, Custom); + for (MVT LegalVecVT : AllVectorVTs) { setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal); @@ -307,6 +315,8 @@ void VETargetLowering::initVPUActions() { setOperationAction(ISD::VP_OPC, LegalVecVT, Custom); #define ADD_VVP_OP(VVP_NAME, ISD_NAME) \ setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, LegalVecVT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_STORE, LegalVecVT, Custom); #include "VVPNodes.def" } @@ -314,6 +324,32 @@ void VETargetLowering::initVPUActions() { setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom); } + + // vNt32, vNt64 ops (legal element types) + for (MVT VT : MVT::vector_valuetypes()) { + MVT ElemVT = VT.getVectorElementType(); + unsigned ElemBits = ElemVT.getScalarSizeInBits(); + if (ElemBits != 32 && ElemBits != 64) + continue; + + for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE}) + setOperationAction(MemOpc, VT, Custom); + + const ISD::NodeType IntReductionOCs[] = { + ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, + ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMIN, + ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX}; + + for (unsigned IntRedOpc : IntReductionOCs) + setOperationAction(IntRedOpc, VT, Custom); + } + + // v256i1 and v512i1 ops + for (MVT MaskVT : AllMaskVTs) { + // Custom lower mask ops + setOperationAction(ISD::STORE, MaskVT, Custom); + setOperationAction(ISD::LOAD, MaskVT, Custom); + } } SDValue @@ -898,10 +934,15 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { TARGET_NODE_CASE(MEMBARRIER) TARGET_NODE_CASE(RET_FLAG) TARGET_NODE_CASE(TS1AM) + TARGET_NODE_CASE(VEC_UNPACK_LO) + TARGET_NODE_CASE(VEC_UNPACK_HI) + TARGET_NODE_CASE(VEC_PACK) TARGET_NODE_CASE(VEC_BROADCAST) TARGET_NODE_CASE(REPL_I32) TARGET_NODE_CASE(REPL_F32) + TARGET_NODE_CASE(LEGALAVL) + // Register the VVP_* SDNodes. #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME) #include "VVPNodes.def" @@ -1305,9 +1346,81 @@ static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues(Ops, DL); } +// Lower a vXi1 load into following instructions +// LDrii %1, (,%addr) +// LVMxir %vm, 0, %1 +// LDrii %2, 8(,%addr) +// LVMxir %vm, 0, %2 +// ... 
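+// i.e. one 64-bit scalar load per 64 mask bits, each inserted into the mask
+// register at sub-register index i via LVM: 4 loads for v256i1, 8 for v512i1.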
+static SDValue lowerLoadI1(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + LoadSDNode *LdNode = dyn_cast(Op.getNode()); + assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type"); + + SDValue BasePtr = LdNode->getBasePtr(); + unsigned Alignment = LdNode->getAlign().value(); + if (Alignment > 8) + Alignment = 8; + + EVT AddrVT = BasePtr.getValueType(); + EVT MemVT = LdNode->getMemoryVT(); + if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) { + SDValue OutChains[4]; + SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT); + for (int i = 0; i < 4; ++i) { + // Generate load dag and prepare chains. + SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr, + DAG.getConstant(8 * i, DL, AddrVT)); + SDValue Val = + DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr, + LdNode->getPointerInfo(), Alignment, + LdNode->isVolatile() ? MachineMemOperand::MOVolatile + : MachineMemOperand::MONone); + OutChains[i] = SDValue(Val.getNode(), 1); + + VM = DAG.getMachineNode(VE::LVMir_m, DL, MVT::i64, + DAG.getTargetConstant(i, DL, MVT::i64), Val, + SDValue(VM, 0)); + } + SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); + SDValue Ops[2] = {SDValue(VM, 0), OutChain}; + return DAG.getMergeValues(Ops, DL); + } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) { + SDValue OutChains[8]; + SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT); + for (int i = 0; i < 8; ++i) { + // Generate load dag and prepare chains. + SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr, + DAG.getConstant(8 * i, DL, AddrVT)); + SDValue Val = + DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr, + LdNode->getPointerInfo(), Alignment, + LdNode->isVolatile() ? MachineMemOperand::MOVolatile + : MachineMemOperand::MONone); + OutChains[i] = SDValue(Val.getNode(), 1); + + VM = DAG.getMachineNode(VE::LVMyir_y, DL, MVT::i64, + DAG.getTargetConstant(i, DL, MVT::i64), Val, + SDValue(VM, 0)); + } + SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); + SDValue Ops[2] = {SDValue(VM, 0), OutChain}; + return DAG.getMergeValues(Ops, DL); + } else { + // Otherwise, ask llvm to expand it. + return SDValue(); + } +} + SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LdNode = cast(Op.getNode()); + EVT MemVT = LdNode->getMemoryVT(); + + // Dispatch to vector isel. + if (MemVT.isVector() && !isMaskType(MemVT)) + return lowerToVVP(Op, DAG); + SDValue BasePtr = LdNode->getBasePtr(); if (isa(BasePtr.getNode())) { // Do not expand store instruction with frame index here because of @@ -1315,9 +1428,10 @@ SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { return Op; } - EVT MemVT = LdNode->getMemoryVT(); if (MemVT == MVT::f128) return lowerLoadF128(Op, DAG); + if (isMaskType(MemVT)) + return lowerLoadI1(Op, DAG); return Op; } @@ -1358,10 +1472,68 @@ static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); } +// Lower a vXi1 store into following instructions +// SVMi %1, %vm, 0 +// STrii %1, (,%addr) +// SVMi %2, %vm, 1 +// STrii %2, 8(,%addr) +// ... 
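+// i.e. the mirror image of lowerLoadI1: SVM extracts 64 mask bits at a time
+// into a scalar register, which a plain 64-bit store then writes back:
+// 4 iterations for v256i1, 8 for v512i1.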
+static SDValue lowerStoreI1(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + StoreSDNode *StNode = dyn_cast(Op.getNode()); + assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type"); + + SDValue BasePtr = StNode->getBasePtr(); + unsigned Alignment = StNode->getAlign().value(); + if (Alignment > 8) + Alignment = 8; + EVT AddrVT = BasePtr.getValueType(); + EVT MemVT = StNode->getMemoryVT(); + if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) { + SDValue OutChains[4]; + for (int i = 0; i < 4; ++i) { + SDNode *V = + DAG.getMachineNode(VE::SVMmi, DL, MVT::i64, StNode->getValue(), + DAG.getTargetConstant(i, DL, MVT::i64)); + SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr, + DAG.getConstant(8 * i, DL, AddrVT)); + OutChains[i] = + DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr, + MachinePointerInfo(), Alignment, + StNode->isVolatile() ? MachineMemOperand::MOVolatile + : MachineMemOperand::MONone); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); + } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) { + SDValue OutChains[8]; + for (int i = 0; i < 8; ++i) { + SDNode *V = + DAG.getMachineNode(VE::SVMyi, DL, MVT::i64, StNode->getValue(), + DAG.getTargetConstant(i, DL, MVT::i64)); + SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr, + DAG.getConstant(8 * i, DL, AddrVT)); + OutChains[i] = + DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr, + MachinePointerInfo(), Alignment, + StNode->isVolatile() ? MachineMemOperand::MOVolatile + : MachineMemOperand::MONone); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); + } else { + // Otherwise, ask llvm to expand it. + return SDValue(); + } +} + SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *StNode = cast(Op.getNode()); assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type"); + // always expand non-mask vector stores to VVP + EVT MemVT = StNode->getMemoryVT(); + if (MemVT.isVector() && !isMaskType(MemVT)) + return lowerToVVP(Op, DAG); + SDValue BasePtr = StNode->getBasePtr(); if (isa(BasePtr.getNode())) { // Do not expand store instruction with frame index here because of @@ -1369,9 +1541,10 @@ SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { - EVT MemVT = StNode->getMemoryVT(); if (MemVT == MVT::f128) return lowerStoreF128(Op, DAG); + if (isMaskType(MemVT)) + return lowerStoreI1(Op, DAG); // Otherwise, ask llvm to expand it. return SDValue(); @@ -1410,9 +1583,9 @@ SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue NextPtr; if (VT == MVT::f128) { - // VE f128 values must be stored with 16 bytes alignment. We doesn't + // VE f128 values must be stored with 16 bytes alignment. We don't // know the actual alignment of VAList, so we take alignment of it - // dyanmically. + // dynamically. int Align = 16; VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -1658,25 +1831,37 @@ SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op, // Else emit a broadcast. if (SDValue ScalarV = getSplatValue(Op.getNode())) { unsigned NumEls = ResultVT.getVectorNumElements(); - // TODO: Legalize packed-mode AVL.
- auto CappedLength = std::min(256, NumEls); - auto AVL = CDAG.getConstant(CappedLength, MVT::i32); - return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL); + auto AVL = CDAG.getConstant(NumEls, MVT::i32); + return CDAG.getBroadcast(ResultVT, ScalarV, AVL); } // Expand return SDValue(); } +TargetLowering::LegalizeAction +VETargetLowering::getCustomOperationAction(SDNode &Op) const { + // Custom legalization on VVP_* and VEC_* opcodes is required to pack-legalize + // these operations (transform nodes such that their AVL parameter refers to + // packs of 64bit, instead of number of elements. + + // Packing opcodes are created with a pack-legal AVL (LEGALAVL). No need to + // re-visit them. + if (isPackingSupportOpcode(Op.getOpcode())) + return Legal; + + // Custom lower to legalize AVL for packed mode. + if (isVVPOrVEC(Op.getOpcode())) + return Custom; + return Legal; +} + SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + LLVM_DEBUG(dbgs() << "::LowerOperation"; Op->print(dbgs());); unsigned Opcode = Op.getOpcode(); - if (ISD::isVPOpcode(Opcode)) - return lowerToVVP(Op, DAG); + /// Scalar isel. switch (Opcode) { - default: - llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return lowerATOMIC_FENCE(Op, DAG); case ISD::ATOMIC_SWAP: @@ -1720,9 +1905,33 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + } + /// Vector isel. + LLVM_DEBUG(dbgs() << "::LowerOperation_VVP"; Op->print(dbgs());); + if (ISD::isVPOpcode(Opcode)) + return lowerToVVP(Op, DAG); + + switch (Opcode) { + default: + llvm_unreachable("Should not custom lower this!"); + + // Legalize the AVL of this internal node. + case VEISD::VEC_BROADCAST: +#define ADD_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME: +#include "VVPNodes.def" + // AVL already legalized. + if (getAnnotatedNodeAVL(Op).second) + return Op; + return legalizeInternalVectorOp(Op, DAG); + + // Translate into a VEC_*/VVP_* layer operation. + case ISD::MLOAD: + case ISD::MSTORE: #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" + if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType())) + return splitMaskArithmetic(Op, DAG); return lowerToVVP(Op, DAG); } } @@ -2667,52 +2876,6 @@ bool VETargetLowering::hasAndNot(SDValue Y) const { return true; } -SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { - // Can we represent this as a VVP node. - const unsigned Opcode = Op->getOpcode(); - auto VVPOpcodeOpt = getVVPOpcode(Opcode); - if (!VVPOpcodeOpt.hasValue()) - return SDValue(); - unsigned VVPOpcode = VVPOpcodeOpt.getValue(); - const bool FromVP = ISD::isVPOpcode(Opcode); - - // The representative and legalized vector type of this operation. - VECustomDAG CDAG(DAG, Op); - MVT MaskVT = MVT::v256i1; // TODO: packed mode. - EVT OpVecVT = Op.getValueType(); - EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT); - - SDValue AVL; - SDValue Mask; - - if (FromVP) { - // All upstream VP SDNodes always have a mask and avl. - auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue(); - auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue(); - Mask = Op->getOperand(MaskIdx); - AVL = Op->getOperand(AVLIdx); - - } else { - // Materialize the VL parameter. 
- AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32); - SDValue ConstTrue = CDAG.getConstant(1, MVT::i32); - Mask = CDAG.getBroadcast(MaskVT, ConstTrue, AVL); - } - - if (isVVPBinaryOp(VVPOpcode)) { - assert(LegalVecVT.isSimple()); - return CDAG.getNode(VVPOpcode, LegalVecVT, - {Op->getOperand(0), Op->getOperand(1), Mask, AVL}); - } - if (VVPOpcode == VEISD::VVP_SELECT) { - auto Mask = Op->getOperand(0); - auto OnTrue = Op->getOperand(1); - auto OnFalse = Op->getOperand(2); - return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL}); - } - llvm_unreachable("lowerToVVP called for unexpected SDNode."); -} - SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 09bd19e83717..087b0e215407 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -38,17 +38,30 @@ enum NodeType : unsigned { MEMBARRIER, // Compiler barrier only; generate a no-op. RET_FLAG, // Return with a flag operand. TS1AM, // A TS1AM instruction used for 1/2 bytes swap. - VEC_BROADCAST, // A vector broadcast instruction. - // 0: scalar value, 1: VL + VEC_UNPACK_LO, // unpack the lo v256 slice of a packed v512 vector. + VEC_UNPACK_HI, // unpack the hi v256 slice of a packed v512 vector. + // 0: v512 vector, 1: AVL + VEC_PACK, // pack a lo and a hi vector into one v512 vector + // 0: v256 lo vector, 1: v256 hi vector, 2: AVL + + VEC_BROADCAST, // A vector broadcast instruction. + // 0: scalar value, 1: VL REPL_I32, REPL_F32, // Replicate subregister to other half. + // Annotation as a wrapper. LEGALAVL(VL) means that VL refers to 64bit of + // data, whereas the raw EVL coming in from VP nodes always refers to number + // of elements, regardless of their size. + LEGALAVL, + // VVP_* nodes. #define ADD_VVP_OP(VVP_NAME, ...) 
VVP_NAME, #include "VVPNodes.def" }; } +class VECustomDAG; + class VETargetLowering : public TargetLowering { const VESubtarget *Subtarget; @@ -105,6 +118,9 @@ public: } /// Custom Lower { + TargetLoweringBase::LegalizeAction + getCustomOperationAction(SDNode &) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; unsigned getJumpTableEncoding() const override; const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, @@ -170,6 +186,15 @@ public: /// VVP Lowering { SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG &) const; + SDValue lowerVVP_GATHER_SCATTER(SDValue Op, VECustomDAG &) const; + + SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const; + SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const; + SDValue splitPackedLoadStore(SDValue Op, VECustomDAG &CDAG) const; + SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const; + SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const; /// } VVPLowering /// Custom DAGCombine { diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 7c1bd5201867..94ebb59c4c77 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -811,7 +811,7 @@ static void expandPseudoVFMK(const TargetInstrInfo &TI, MachineInstr &MI) { // replace to pvfmk.w.up and pvfmk.w.lo // replace to pvfmk.s.up and pvfmk.s.lo - static std::map> VFMKMap = { + static const std::pair> VFMKMap[] = { {VE::VFMKyal, {VE::VFMKLal, VE::VFMKLal}}, {VE::VFMKynal, {VE::VFMKLnal, VE::VFMKLnal}}, {VE::VFMKWyvl, {VE::PVFMKWUPvl, VE::PVFMKWLOvl}}, @@ -822,8 +822,9 @@ static void expandPseudoVFMK(const TargetInstrInfo &TI, MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); - auto Found = VFMKMap.find(Opcode); - if (Found == VFMKMap.end()) + const auto *Found = + llvm::find_if(VFMKMap, [&](auto P) { return P.first == Opcode; }); + if (Found == std::end(VFMKMap)) report_fatal_error("unexpected opcode for pseudo vfmk"); unsigned OpcodeUpper = (*Found).second.first; diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 717427c3f48d..85285749b4fa 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -875,14 +875,14 @@ multiclass BCRm opc, // e.g. LCR let hasSideEffects = 1 in multiclass LOADCRmopc, RegisterClass RC> { - def rr : RR; - let cy = 0 in def ri : RR; - let cz = 0 in def zr : RR; let cy = 0, cz = 0 in - def zi : RR; } @@ -890,17 +890,31 @@ multiclass LOADCRmopc, RegisterClass RC> { // e.g. SCR let hasSideEffects = 1 in multiclass STORECRmopc, RegisterClass RC> { - def rr : RR; - let cy = 0 in def ri : RR; - let cz = 0 in def zr : RR; + let cy = 0 in def irr : RR; + let cz = 0 in def rzr : RR; let cy = 0, cz = 0 in - def zi : RR; + def izr : RR; +} + +let hasSideEffects = 1, Constraints = "$sx = $sx_in", DisableEncoding = "$sx_in" in +multiclass TSCRmopc, RegisterClass RC> { + def rrr : RR; + let cy = 0 in def irr : RR; + let cz = 0 in def rzr : RR; + let cy = 0, cz = 0 in + def izr : RR; } + // Multiclass for communication register instructions. // e.g. 
FIDCR let cz = 0, hasSideEffects = 1 in @@ -1528,7 +1542,7 @@ defm LCR : LOADCRm<"lcr", 0x40, I64>; defm SCR : STORECRm<"scr", 0x50, I64>; // Section 8.19.11 - TSCR (Test & Set Communication Register) -defm TSCR : LOADCRm<"tscr", 0x41, I64>; +defm TSCR : TSCRm<"tscr", 0x41, I64>; // Section 8.19.12 - FIDCR (Fetch & Increment/Decrement CR) defm FIDCR : FIDCRm<"fidcr", 0x51, I64>; @@ -2293,6 +2307,18 @@ class IsVLVT : SDTCisVT; def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2, [SDTCisVec<0>, IsVLVT<2>]>>; +///// Packed mode Support ///// +// unpack the lo part of this vector +def vec_unpack_lo : SDNode<"VEISD::VEC_UNPACK_LO", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisVec<1>, IsVLVT<2>]>>; +// unpack the hipart of this vector +def vec_unpack_hi : SDNode<"VEISD::VEC_UNPACK_HI", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisVec<1>, IsVLVT<2>]>>; +// re-pack v256i32, v256f32 back into tone v512.32 +def vec_pack : SDNode<"VEISD::VEC_PACK", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, + SDTCisSameNumEltsAs<1,2>, IsVLVT<3>]>>; + // replicate lower 32bit to upper 32bit (f32 scalar replication). def repl_f32 : SDNode<"VEISD::REPL_F32", SDTypeProfile<1, 1, diff --git a/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td index 9ec10838db05..2ef621ae7477 100644 --- a/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td +++ b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td @@ -601,6 +601,42 @@ def : Pat<(int_ve_vl_pveqv_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVEQVrvl i64:$s def : Pat<(int_ve_vl_pveqv_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVEQVrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>; def : Pat<(int_ve_vl_pveqv_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVEQVvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>; def : Pat<(int_ve_vl_pveqv_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVEQVrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vldz_vvl v256f64:$vz, i32:$vl), (VLDZvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_vldz_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (VLDZvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vldz_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VLDZvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldzlo_vvl v256f64:$vz, i32:$vl), (PVLDZLOvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvldzlo_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVLDZLOvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldzlo_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVLDZLOvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldzup_vvl v256f64:$vz, i32:$vl), (PVLDZUPvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvldzup_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVLDZUPvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldzup_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVLDZUPvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldz_vvl v256f64:$vz, i32:$vl), (PVLDZvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvldz_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVLDZvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvldz_vvMvl v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVLDZvml_v v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vpcnt_vvl v256f64:$vz, i32:$vl), (VPCNTvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_vpcnt_vvvl 
v256f64:$vz, v256f64:$pt, i32:$vl), (VPCNTvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vpcnt_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VPCNTvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcntlo_vvl v256f64:$vz, i32:$vl), (PVPCNTLOvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvpcntlo_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVPCNTLOvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcntlo_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVPCNTLOvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcntup_vvl v256f64:$vz, i32:$vl), (PVPCNTUPvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvpcntup_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVPCNTUPvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcntup_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVPCNTUPvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcnt_vvl v256f64:$vz, i32:$vl), (PVPCNTvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvpcnt_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVPCNTvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvpcnt_vvMvl v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVPCNTvml_v v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vbrv_vvl v256f64:$vz, i32:$vl), (VBRVvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_vbrv_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (VBRVvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_vbrv_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRVvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrvlo_vvl v256f64:$vz, i32:$vl), (PVBRVLOvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvbrvlo_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVBRVLOvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrvlo_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVBRVLOvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrvup_vvl v256f64:$vz, i32:$vl), (PVBRVUPvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvbrvup_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVBRVUPvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrvup_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (PVBRVUPvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrv_vvl v256f64:$vz, i32:$vl), (PVBRVvl v256f64:$vz, i32:$vl)>; +def : Pat<(int_ve_vl_pvbrv_vvvl v256f64:$vz, v256f64:$pt, i32:$vl), (PVBRVvl_v v256f64:$vz, i32:$vl, v256f64:$pt)>; +def : Pat<(int_ve_vl_pvbrv_vvMvl v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVBRVvml_v v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>; def : Pat<(int_ve_vl_vseq_vl i32:$vl), (VSEQl i32:$vl)>; def : Pat<(int_ve_vl_vseq_vvl v256f64:$pt, i32:$vl), (VSEQl_v i32:$vl, v256f64:$pt)>; def : Pat<(int_ve_vl_pvseqlo_vl i32:$vl), (PVSEQLOl i32:$vl)>; @@ -1602,3 +1638,21 @@ def : Pat<(int_ve_vl_negm_MM v512i1:$vmy), (NEGMy v512i1:$vmy)>; def : Pat<(int_ve_vl_pcvm_sml v256i1:$vmy, i32:$vl), (PCVMml v256i1:$vmy, i32:$vl)>; def : Pat<(int_ve_vl_lzvm_sml v256i1:$vmy, i32:$vl), (LZVMml v256i1:$vmy, i32:$vl)>; def : Pat<(int_ve_vl_tovm_sml v256i1:$vmy, i32:$vl), (TOVMml v256i1:$vmy, i32:$vl)>; +def : Pat<(int_ve_vl_lcr_sss i64:$sy, i64:$sz), (LCRrr i64:$sy, i64:$sz)>; +def : Pat<(int_ve_vl_lcr_sss i64:$sy, zero:$Z), (LCRrz i64:$sy, (LO7 $Z))>; +def : Pat<(int_ve_vl_lcr_sss uimm7:$N, i64:$sz), (LCRir (ULO7 $N), i64:$sz)>; +def : Pat<(int_ve_vl_lcr_sss uimm7:$N, zero:$Z), 
(LCRiz (ULO7 $N), (LO7 $Z))>; +def : Pat<(int_ve_vl_scr_sss i64:$sx, i64:$sy, i64:$sz), (SCRrrr i64:$sy, i64:$sz, i64:$sx)>; +def : Pat<(int_ve_vl_scr_sss i64:$sx, i64:$sy, zero:$Z), (SCRrzr i64:$sy, (LO7 $Z), i64:$sx)>; +def : Pat<(int_ve_vl_scr_sss i64:$sx, uimm7:$N, i64:$sz), (SCRirr (ULO7 $N), i64:$sz, i64:$sx)>; +def : Pat<(int_ve_vl_scr_sss i64:$sx, uimm7:$N, zero:$Z), (SCRizr (ULO7 $N), (LO7 $Z), i64:$sx)>; +def : Pat<(int_ve_vl_tscr_ssss i64:$sx, i64:$sy, i64:$sz), (TSCRrrr i64:$sy, i64:$sz, i64:$sx)>; +def : Pat<(int_ve_vl_tscr_ssss i64:$sx, i64:$sy, zero:$Z), (TSCRrzr i64:$sy, (LO7 $Z), i64:$sx)>; +def : Pat<(int_ve_vl_tscr_ssss i64:$sx, uimm7:$N, i64:$sz), (TSCRirr (ULO7 $N), i64:$sz, i64:$sx)>; +def : Pat<(int_ve_vl_tscr_ssss i64:$sx, uimm7:$N, zero:$Z), (TSCRizr (ULO7 $N), (LO7 $Z), i64:$sx)>; +def : Pat<(int_ve_vl_fidcr_sss i64:$sy, uimm3:$I), (FIDCRri i64:$sy, (LO7 $I))>; +def : Pat<(int_ve_vl_fidcr_sss uimm7:$N, uimm3:$I), (FIDCRii (ULO7 $N), (LO7 $I))>; +def : Pat<(int_ve_vl_fencei ), (FENCEI )>; +def : Pat<(int_ve_vl_fencem_s uimm2:$I), (FENCEM (LO7 $I))>; +def : Pat<(int_ve_vl_fencec_s uimm3:$I), (FENCEC (LO7 $I))>; +def : Pat<(int_ve_vl_svob ), (SVOB )>; diff --git a/llvm/lib/Target/VE/VEInstrIntrinsicVL.td b/llvm/lib/Target/VE/VEInstrIntrinsicVL.td index 69ea133ceed0..fca0572cf9b1 100644 --- a/llvm/lib/Target/VE/VEInstrIntrinsicVL.td +++ b/llvm/lib/Target/VE/VEInstrIntrinsicVL.td @@ -2,9 +2,6 @@ /// Intrinsic patterns written by hand. -// SVOB pattern. -def : Pat<(int_ve_vl_svob), (SVOB)>; - // Pack patterns. def : Pat<(i64 (int_ve_vl_pack_f32p ADDRrii:$addr0, ADDRrii:$addr1)), (ORrr (f2l (LDUrii MEMrii:$addr0)), diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td index 6c5b80315efb..71199717a3a2 100644 --- a/llvm/lib/Target/VE/VEInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td @@ -105,3 +105,46 @@ defm : vbrd_elem64; defm : vbrd_elem64; defm : vbrd_elem64; defm : vbrd_elem64; + +class Mask_Binary : + Pat<(MaskVT (MaskOp MaskVT:$ma, MaskVT:$mb)), (!cast(InstName#"mm") $ma, $mb)>; + +def: Mask_Binary; +def: Mask_Binary; +def: Mask_Binary; + +///// Packing support ///// + +// v256i1 <> v512i1 +def : Pat<(v256i1 (vec_unpack_lo v512i1:$vm, (i32 srcvalue))), + (EXTRACT_SUBREG $vm, sub_vm_odd)>; +def : Pat<(v256i1 (vec_unpack_hi v512i1:$vm, (i32 srcvalue))), + (EXTRACT_SUBREG $vm, sub_vm_even)>; +def : Pat<(v512i1 (vec_pack v256i1:$vlo, v256i1:$vhi, (i32 srcvalue))), + (INSERT_SUBREG (INSERT_SUBREG + (v512i1 (IMPLICIT_DEF)), + $vlo, sub_vm_odd), + $vhi, sub_vm_even)>; + +// v256.32 <> v512.32 +multiclass Packing { + // no-op unpacks + def : Pat<(v256i32 (vec_unpack_lo PackVT:$vp, (i32 srcvalue))), + (COPY_TO_REGCLASS $vp, V64)>; + def : Pat<(v256f32 (vec_unpack_hi PackVT:$vp, (i32 srcvalue))), + (COPY_TO_REGCLASS $vp, V64)>; + + // shuffle unpacks + def : Pat<(v256f32 (vec_unpack_lo PackVT:$vp, i32:$avl)), + (VSHFvvil $vp, $vp, 4, $avl)>; // always pick lo + def : Pat<(v256i32 (vec_unpack_hi PackVT:$vp, i32:$avl)), + (VSHFvvil $vp, $vp, 0, $avl)>; // always pick hi +} + +defm : Packing; +defm : Packing; + +def : Pat<(v512i32 (vec_pack v256i32:$vlo, v256i32:$vhi, i32:$avl)), + (VSHFvvil $vlo, $vhi, 13, $avl)>; +def : Pat<(v512f32 (vec_pack v256f32:$vlo, v256f32:$vhi, i32:$avl)), + (VSHFvvil $vlo, $vhi, 8, $avl)>; diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp b/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp index 1addfc7174eb..2ada2581291d 100644 --- a/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp +++ 
b/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp @@ -11,3 +11,10 @@ using namespace llvm; void VEMachineFunctionInfo::anchor() {} + +MachineFunctionInfo *VEMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const { + return DestMF.cloneInfo<VEMachineFunctionInfo>(*this); +} diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.h b/llvm/lib/Target/VE/VEMachineFunctionInfo.h index 3160f6a552d7..d9d30ad5b8c5 100644 --- a/llvm/lib/Target/VE/VEMachineFunctionInfo.h +++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.h @@ -33,6 +33,11 @@ public: explicit VEMachineFunctionInfo(MachineFunction &MF) : VarArgsFrameOffset(0), IsLeafProc(false) {} + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const override; + Register getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; } diff --git a/llvm/lib/Target/VE/VERegisterInfo.td b/llvm/lib/Target/VE/VERegisterInfo.td index 70ff104b65b7..cca0ad26b3e9 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.td +++ b/llvm/lib/Target/VE/VERegisterInfo.td @@ -152,8 +152,10 @@ foreach I = 0-15 in def VM#I : VEMaskReg, DwarfRegNum<[!add(128,I)]>; // Aliases of VMs to use as a pair of two VM for packed instructions +def VMP0 : VEMaskReg<0, "vm0", [], ["vm0"]>; + let SubRegIndices = [sub_vm_even, sub_vm_odd], CoveredBySubRegs = 1 in -foreach I = 0-7 in +foreach I = 1-7 in def VMP#I : VEMaskReg<!shl(I,1), "vmp"#I, [!cast<VEMaskReg>("VM"#!shl(I,1)), !cast<VEMaskReg>("VM"#!add(!shl(I,1),1))], diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp index 9f294f15da91..d7c1457fb0a8 100644 --- a/llvm/lib/Target/VE/VETargetMachine.cpp +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -61,7 +61,7 @@ static std::string computeDataLayout(const Triple &T) { } static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { - return RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } class VEELFTargetObjectFile : public TargetLoweringObjectFileELF { @@ -90,9 +90,10 @@ VETargetMachine::VETargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -VETargetMachine::~VETargetMachine() {} +VETargetMachine::~VETargetMachine() = default; -TargetTransformInfo VETargetMachine::getTargetTransformInfo(const Function &F) { +TargetTransformInfo +VETargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(VETTIImpl(this, F)); } diff --git a/llvm/lib/Target/VE/VETargetMachine.h b/llvm/lib/Target/VE/VETargetMachine.h index 041d3b197ec3..9cf194444aa5 100644 --- a/llvm/lib/Target/VE/VETargetMachine.h +++ b/llvm/lib/Target/VE/VETargetMachine.h @@ -49,7 +49,7 @@ public: bool isMachineVerifierClean() const override { return false; } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; unsigned getSjLjDataSize() const override { return 64; } }; diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h index 0242fa1b0117..c68844708878 100644 --- a/llvm/lib/Target/VE/VETargetTransformInfo.h +++ b/llvm/lib/Target/VE/VETargetTransformInfo.h @@ -21,6 +21,32 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +static llvm::Type *getVectorElementType(llvm::Type *Ty) { + return llvm::cast<VectorType>(Ty)->getElementType(); +} + +static llvm::Type *getLaneType(llvm::Type *Ty) { + using namespace llvm; + if (!isa<VectorType>(Ty)) + return Ty; + return
getVectorElementType(Ty); +} + +static bool isVectorLaneType(llvm::Type &ElemTy) { + // Check element sizes for vector registers. + if (ElemTy.isIntegerTy()) { + unsigned ScaBits = ElemTy.getScalarSizeInBits(); + return ScaBits == 1 || ScaBits == 32 || ScaBits == 64; + } + if (ElemTy.isPointerTy()) { + return true; + } + if (ElemTy.isFloatTy() || ElemTy.isDoubleTy()) { + return true; + } + return false; +} + namespace llvm { class VETTIImpl : public BasicTTIImplBase<VETTIImpl> { @@ -35,6 +61,25 @@ class VETTIImpl : public BasicTTIImplBase<VETTIImpl> { bool enableVPU() const { return getST()->enableVPU(); } + static bool isSupportedReduction(Intrinsic::ID ReductionID) { +#define VEC_VP_CASE(SUFFIX) \ + case Intrinsic::vp_reduce_##SUFFIX: \ + case Intrinsic::vector_reduce_##SUFFIX: + + switch (ReductionID) { + VEC_VP_CASE(add) + VEC_VP_CASE(and) + VEC_VP_CASE(or) + VEC_VP_CASE(xor) + VEC_VP_CASE(smax) + return true; + + default: + return false; + } +#undef VEC_VP_CASE + } + public: explicit VETTIImpl(const VETargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -86,6 +131,27 @@ public: // output return false; } + + // Load & Store { + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) { + return isVectorLaneType(*getLaneType(DataType)); + } + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + return isVectorLaneType(*getLaneType(DataType)); + } + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return isVectorLaneType(*getLaneType(DataType)); + } + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return isVectorLaneType(*getLaneType(DataType)); + } + // } Load & Store + + bool shouldExpandReduction(const IntrinsicInst *II) const { + if (!enableVPU()) + return true; + return !isSupportedReduction(II->getIntrinsicID()); + } }; } // namespace llvm diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp new file mode 100644 index 000000000000..330eef4c7c2b --- /dev/null +++ b/llvm/lib/Target/VE/VVPISelLowering.cpp @@ -0,0 +1,443 @@ +//===-- VVPISelLowering.cpp - VE DAG Lowering Implementation --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the lowering and legalization of vector instructions to +// VVP_* layer SDNodes.
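// For example, under the isVectorLaneType rule in the TTI hooks above, masked
// loads and stores of <256 x i32>, <256 x i64>, <256 x float>, <256 x double>
// and <256 x i1> are reported legal, while a <256 x i16> access fails the lane
// check and is left to the generic scalarizing expansion.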
+// +//===----------------------------------------------------------------------===// + +#include "VECustomDAG.h" +#include "VEISelLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "ve-lower" + +SDValue VETargetLowering::splitMaskArithmetic(SDValue Op, + SelectionDAG &DAG) const { + VECustomDAG CDAG(DAG, Op); + SDValue AVL = + CDAG.getConstant(Op.getValueType().getVectorNumElements(), MVT::i32); + SDValue A = Op->getOperand(0); + SDValue B = Op->getOperand(1); + SDValue LoA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Lo, AVL); + SDValue HiA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Hi, AVL); + SDValue LoB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Lo, AVL); + SDValue HiB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Hi, AVL); + unsigned Opc = Op.getOpcode(); + auto LoRes = CDAG.getNode(Opc, MVT::v256i1, {LoA, LoB}); + auto HiRes = CDAG.getNode(Opc, MVT::v256i1, {HiA, HiB}); + return CDAG.getPack(MVT::v512i1, LoRes, HiRes, AVL); +} + +SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { + // Can we represent this as a VVP node. + const unsigned Opcode = Op->getOpcode(); + auto VVPOpcodeOpt = getVVPOpcode(Opcode); + if (!VVPOpcodeOpt) + return SDValue(); + unsigned VVPOpcode = VVPOpcodeOpt.getValue(); + const bool FromVP = ISD::isVPOpcode(Opcode); + + // The representative and legalized vector type of this operation. + VECustomDAG CDAG(DAG, Op); + // Dispatch to complex lowering functions. + switch (VVPOpcode) { + case VEISD::VVP_LOAD: + case VEISD::VVP_STORE: + return lowerVVP_LOAD_STORE(Op, CDAG); + case VEISD::VVP_GATHER: + case VEISD::VVP_SCATTER: + return lowerVVP_GATHER_SCATTER(Op, CDAG); + } + + EVT OpVecVT = *getIdiomaticVectorType(Op.getNode()); + EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT); + auto Packing = getTypePacking(LegalVecVT.getSimpleVT()); + + SDValue AVL; + SDValue Mask; + + if (FromVP) { + // All upstream VP SDNodes always have a mask and avl. + auto MaskIdx = ISD::getVPMaskIdx(Opcode); + auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode); + if (MaskIdx) + Mask = Op->getOperand(*MaskIdx); + if (AVLIdx) + AVL = Op->getOperand(*AVLIdx); + } + + // Materialize default mask and avl. + if (!AVL) + AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32); + if (!Mask) + Mask = CDAG.getConstantMask(Packing, true); + + assert(LegalVecVT.isSimple()); + if (isVVPUnaryOp(VVPOpcode)) + return CDAG.getNode(VVPOpcode, LegalVecVT, {Op->getOperand(0), Mask, AVL}); + if (isVVPBinaryOp(VVPOpcode)) + return CDAG.getNode(VVPOpcode, LegalVecVT, + {Op->getOperand(0), Op->getOperand(1), Mask, AVL}); + if (isVVPReductionOp(VVPOpcode)) { + auto SrcHasStart = hasReductionStartParam(Op->getOpcode()); + SDValue StartV = SrcHasStart ? Op->getOperand(0) : SDValue(); + SDValue VectorV = Op->getOperand(SrcHasStart ? 1 : 0); + return CDAG.getLegalReductionOpVVP(VVPOpcode, Op.getValueType(), StartV, + VectorV, Mask, AVL, Op->getFlags()); + } + + switch (VVPOpcode) { + default: + llvm_unreachable("lowerToVVP called for unexpected SDNode."); + case VEISD::VVP_FFMA: { + // VE has a swizzled operand order in FMA (compared to LLVM IR and + // SDNodes). 
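// Concretely: ISD::FMA(a, b, c) computes a * b + c, while the VE form takes
// the addend first, i.e. VVP_FFMA(x, y, z) = y * z + x; the reordering below
// therefore maps the generic operands (a, b, c) to (c, a, b).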
+ auto X = Op->getOperand(2); + auto Y = Op->getOperand(0); + auto Z = Op->getOperand(1); + return CDAG.getNode(VVPOpcode, LegalVecVT, {X, Y, Z, Mask, AVL}); + } + case VEISD::VVP_SELECT: { + auto Mask = Op->getOperand(0); + auto OnTrue = Op->getOperand(1); + auto OnFalse = Op->getOperand(2); + return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL}); + } + case VEISD::VVP_SETCC: { + EVT LegalResVT = getTypeToTransformTo(*DAG.getContext(), Op.getValueType()); + auto LHS = Op->getOperand(0); + auto RHS = Op->getOperand(1); + auto Pred = Op->getOperand(2); + return CDAG.getNode(VVPOpcode, LegalResVT, {LHS, RHS, Pred, Mask, AVL}); + } + } +} + +SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op, + VECustomDAG &CDAG) const { + auto VVPOpc = *getVVPOpcode(Op->getOpcode()); + const bool IsLoad = (VVPOpc == VEISD::VVP_LOAD); + + // Shares. + SDValue BasePtr = getMemoryPtr(Op); + SDValue Mask = getNodeMask(Op); + SDValue Chain = getNodeChain(Op); + SDValue AVL = getNodeAVL(Op); + // Store specific. + SDValue Data = getStoredValue(Op); + // Load specific. + SDValue PassThru = getNodePassthru(Op); + + SDValue StrideV = getLoadStoreStride(Op, CDAG); + + auto DataVT = *getIdiomaticVectorType(Op.getNode()); + auto Packing = getTypePacking(DataVT); + + // TODO: Infer lower AVL from mask. + if (!AVL) + AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32); + + // Default to the all-true mask. + if (!Mask) + Mask = CDAG.getConstantMask(Packing, true); + + if (IsLoad) { + MVT LegalDataVT = getLegalVectorType( + Packing, DataVT.getVectorElementType().getSimpleVT()); + + auto NewLoadV = CDAG.getNode(VEISD::VVP_LOAD, {LegalDataVT, MVT::Other}, + {Chain, BasePtr, StrideV, Mask, AVL}); + + if (!PassThru || PassThru->isUndef()) + return NewLoadV; + + // Convert passthru to an explicit select node. + SDValue DataV = CDAG.getNode(VEISD::VVP_SELECT, DataVT, + {NewLoadV, PassThru, Mask, AVL}); + SDValue NewLoadChainV = SDValue(NewLoadV.getNode(), 1); + + // Merge them back into one node. + return CDAG.getMergeValues({DataV, NewLoadChainV}); + } + + // VVP_STORE + assert(VVPOpc == VEISD::VVP_STORE); + return CDAG.getNode(VEISD::VVP_STORE, Op.getNode()->getVTList(), + {Chain, Data, BasePtr, StrideV, Mask, AVL}); +} + +SDValue VETargetLowering::splitPackedLoadStore(SDValue Op, + VECustomDAG &CDAG) const { + auto VVPOC = *getVVPOpcode(Op.getOpcode()); + assert((VVPOC == VEISD::VVP_LOAD) || (VVPOC == VEISD::VVP_STORE)); + + MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT(); + assert(getTypePacking(DataVT) == Packing::Dense && + "Can only split packed load/store"); + MVT SplitDataVT = splitVectorType(DataVT); + + assert(!getNodePassthru(Op) && + "Should have been folded in lowering to VVP layer"); + + // Analyze the operation + SDValue PackedMask = getNodeMask(Op); + SDValue PackedAVL = getAnnotatedNodeAVL(Op).first; + SDValue PackPtr = getMemoryPtr(Op); + SDValue PackData = getStoredValue(Op); + SDValue PackStride = getLoadStoreStride(Op, CDAG); + + unsigned ChainResIdx = PackData ? 0 : 1; + + SDValue PartOps[2]; + + SDValue UpperPartAVL; // we will use this for packing things back together + for (PackElem Part : {PackElem::Hi, PackElem::Lo}) { + // VP ops already have an explicit mask and AVL. When expanding from non-VP + // attach those additional inputs here. + auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part); + + // Keep track of the (higher) lvl. 
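// Each 64-bit lane of a packed vector holds two 32-bit values, so every
// PackElem half of this access becomes an ordinary strided access below:
// getSplitPtrOffset() biases the base pointer to the half's first element and
// getSplitPtrStride() doubles the stride. The Hi part's AVL is kept so the two
// part results can be re-packed afterwards.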
+ if (Part == PackElem::Hi) + UpperPartAVL = SplitTM.AVL; + + // Attach non-predicating value operands + SmallVector<SDValue> OpVec; + + // Chain + OpVec.push_back(getNodeChain(Op)); + + // Data + if (PackData) { + SDValue PartData = + CDAG.getUnpack(SplitDataVT, PackData, Part, SplitTM.AVL); + OpVec.push_back(PartData); + } + + // Ptr & Stride + // Push (ptr + ElemBytes * <Part>, 2 * ElemBytes) + // Stride info + // EVT DataVT = LegalizeVectorType(getMemoryDataVT(Op), Op, DAG, Mode); + OpVec.push_back(CDAG.getSplitPtrOffset(PackPtr, PackStride, Part)); + OpVec.push_back(CDAG.getSplitPtrStride(PackStride)); + + // Add predicating args and generate part node + OpVec.push_back(SplitTM.Mask); + OpVec.push_back(SplitTM.AVL); + + if (PackData) { + // Store + PartOps[(int)Part] = CDAG.getNode(VVPOC, MVT::Other, OpVec); + } else { + // Load + PartOps[(int)Part] = + CDAG.getNode(VVPOC, {SplitDataVT, MVT::Other}, OpVec); + } + } + + // Merge the chains + SDValue LowChain = SDValue(PartOps[(int)PackElem::Lo].getNode(), ChainResIdx); + SDValue HiChain = SDValue(PartOps[(int)PackElem::Hi].getNode(), ChainResIdx); + SDValue FusedChains = + CDAG.getNode(ISD::TokenFactor, MVT::Other, {LowChain, HiChain}); + + // Chain only [store] + if (PackData) + return FusedChains; + + // Re-pack into full packed vector result + MVT PackedVT = + getLegalVectorType(Packing::Dense, DataVT.getVectorElementType()); + SDValue PackedVals = CDAG.getPack(PackedVT, PartOps[(int)PackElem::Lo], + PartOps[(int)PackElem::Hi], UpperPartAVL); + + return CDAG.getMergeValues({PackedVals, FusedChains}); +} + +SDValue VETargetLowering::lowerVVP_GATHER_SCATTER(SDValue Op, + VECustomDAG &CDAG) const { + EVT DataVT = *getIdiomaticVectorType(Op.getNode()); + auto Packing = getTypePacking(DataVT); + MVT LegalDataVT = + getLegalVectorType(Packing, DataVT.getVectorElementType().getSimpleVT()); + + SDValue AVL = getAnnotatedNodeAVL(Op).first; + SDValue Index = getGatherScatterIndex(Op); + SDValue BasePtr = getMemoryPtr(Op); + SDValue Mask = getNodeMask(Op); + SDValue Chain = getNodeChain(Op); + SDValue Scale = getGatherScatterScale(Op); + SDValue PassThru = getNodePassthru(Op); + SDValue StoredValue = getStoredValue(Op); + if (PassThru && PassThru->isUndef()) + PassThru = SDValue(); + + bool IsScatter = (bool)StoredValue; + + // TODO: Infer lower AVL from mask. + if (!AVL) + AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32); + + // Default to the all-true mask. + if (!Mask) + Mask = CDAG.getConstantMask(Packing, true); + + SDValue AddressVec = + CDAG.getGatherScatterAddress(BasePtr, Scale, Index, Mask, AVL); + if (IsScatter) + return CDAG.getNode(VEISD::VVP_SCATTER, MVT::Other, + {Chain, StoredValue, AddressVec, Mask, AVL}); + + // Gather. + SDValue NewLoadV = CDAG.getNode(VEISD::VVP_GATHER, {LegalDataVT, MVT::Other}, + {Chain, AddressVec, Mask, AVL}); + + if (!PassThru) + return NewLoadV; + + // TODO: Use vvp_select + SDValue DataV = CDAG.getNode(VEISD::VVP_SELECT, LegalDataVT, + {NewLoadV, PassThru, Mask, AVL}); + SDValue NewLoadChainV = SDValue(NewLoadV.getNode(), 1); + return CDAG.getMergeValues({DataV, NewLoadChainV}); +}
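lowerVVP_GATHER_SCATTER above flattens base, scale and index into one vector of absolute addresses before the VVP_GATHER/VVP_SCATTER node is emitted. A plain-C++ sketch of that address computation, with scalar stand-ins for the SDValue operands:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // addr[i] = Base + Scale * Index[i]; the masked strided-load patterns
    // later in this patch build the same form with a VMULUL/VADDUL pair under
    // the operation's own mask and AVL.
    std::vector<uint64_t> gatherAddresses(uint64_t Base, uint64_t Scale,
                                          const std::vector<uint64_t> &Index) {
      std::vector<uint64_t> Addr(Index.size());
      for (std::size_t I = 0; I < Index.size(); ++I)
        Addr[I] = Base + Scale * Index[I];
      return Addr;
    }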
+ +SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op, + VECustomDAG &CDAG) const { + LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";); + MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT(); + + // TODO: Recognize packable load/store. + if (isPackedVectorType(DataVT)) + return splitPackedLoadStore(Op, CDAG); + + return legalizePackedAVL(Op, CDAG); +} + +SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op, + SelectionDAG &DAG) const { + LLVM_DEBUG(dbgs() << "::legalizeInternalVectorOp\n";); + VECustomDAG CDAG(DAG, Op); + + // Dispatch to specialized legalization functions. + switch (Op->getOpcode()) { + case VEISD::VVP_LOAD: + case VEISD::VVP_STORE: + return legalizeInternalLoadStoreOp(Op, CDAG); + } + + EVT IdiomVT = Op.getValueType(); + if (isPackedVectorType(IdiomVT) && + !supportsPackedMode(Op.getOpcode(), IdiomVT)) + return splitVectorOp(Op, CDAG); + + // TODO: Implement odd/even splitting. + return legalizePackedAVL(Op, CDAG); +} + +SDValue VETargetLowering::splitVectorOp(SDValue Op, VECustomDAG &CDAG) const { + MVT ResVT = splitVectorType(Op.getValue(0).getSimpleValueType()); + + auto AVLPos = getAVLPos(Op->getOpcode()); + auto MaskPos = getMaskPos(Op->getOpcode()); + + SDValue PackedMask = getNodeMask(Op); + auto AVLPair = getAnnotatedNodeAVL(Op); + SDValue PackedAVL = AVLPair.first; + assert(!AVLPair.second && "Expecting non pack-legalized operation"); + + // Request the parts. + SDValue PartOps[2]; + + SDValue UpperPartAVL; // We will use this for packing things back together. + for (PackElem Part : {PackElem::Hi, PackElem::Lo}) { + // VP ops already have an explicit mask and AVL. When expanding from non-VP + // attach those additional inputs here. + auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part); + + if (Part == PackElem::Hi) + UpperPartAVL = SplitTM.AVL; + + // Attach non-predicating value operands + SmallVector<SDValue> OpVec; + for (unsigned i = 0; i < Op.getNumOperands(); ++i) { + if (AVLPos && ((int)i) == *AVLPos) + continue; + if (MaskPos && ((int)i) == *MaskPos) + continue; + + // Value operand + auto PackedOperand = Op.getOperand(i); + auto UnpackedOpVT = splitVectorType(PackedOperand.getSimpleValueType()); + SDValue PartV = + CDAG.getUnpack(UnpackedOpVT, PackedOperand, Part, SplitTM.AVL); + OpVec.push_back(PartV); + } + + // Add predicating args and generate part node. + OpVec.push_back(SplitTM.Mask); + OpVec.push_back(SplitTM.AVL); + // Emit legal VVP nodes. + PartOps[(int)Part] = + CDAG.getNode(Op.getOpcode(), ResVT, OpVec, Op->getFlags()); + } + + // Re-package vectors. + return CDAG.getPack(Op.getValueType(), PartOps[(int)PackElem::Lo], + PartOps[(int)PackElem::Hi], UpperPartAVL); +} + +SDValue VETargetLowering::legalizePackedAVL(SDValue Op, + VECustomDAG &CDAG) const { + LLVM_DEBUG(dbgs() << "::legalizePackedAVL\n";); + // Only required for VEC and VVP ops. + if (!isVVPOrVEC(Op->getOpcode())) + return Op; + + // Operation already has a legal AVL. + auto AVL = getNodeAVL(Op); + if (isLegalAVL(AVL)) + return Op; + + // Halve and round up the AVL for 32-bit element types. + SDValue LegalAVL = AVL; + MVT IdiomVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT(); + if (isPackedVectorType(IdiomVT)) { + assert(maySafelyIgnoreMask(Op) && + "TODO: Shift predication from EVL into Mask"); + + if (auto *ConstAVL = dyn_cast<ConstantSDNode>(AVL)) { + LegalAVL = CDAG.getConstant((ConstAVL->getZExtValue() + 1) / 2, MVT::i32); + } else { + auto ConstOne = CDAG.getConstant(1, MVT::i32); + auto PlusOne = CDAG.getNode(ISD::ADD, MVT::i32, {AVL, ConstOne}); + LegalAVL = CDAG.getNode(ISD::SRL, MVT::i32, {PlusOne, ConstOne}); + } + } + + SDValue AnnotatedLegalAVL = CDAG.annotateLegalAVL(LegalAVL);
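// Worked example: a packed v512i32 operation with AVL = 7 covers seven 32-bit
// elements stored two per 64-bit lane, so the machine vector length is
// ceil(7 / 2) = (7 + 1) >> 1 = 4; the ADD/SRL pair above computes exactly
// that when the AVL is not a compile-time constant.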
+ + // Copy the operand list. + int NumOp = Op->getNumOperands(); + auto AVLPos = getAVLPos(Op->getOpcode()); + std::vector<SDValue> FixedOperands; + for (int i = 0; i < NumOp; ++i) { + if (AVLPos && (i == *AVLPos)) { + FixedOperands.push_back(AnnotatedLegalAVL); + continue; + } + FixedOperands.push_back(Op->getOperand(i)); + } + + // Clone the operation with fixed operands. + auto Flags = Op->getFlags(); + SDValue NewN = + CDAG.getNode(Op->getOpcode(), Op->getVTList(), FixedOperands, Flags); + return NewN; +} diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td index ef9c238066c0..a4e4984e3d12 100644 --- a/llvm/lib/Target/VE/VVPInstrInfo.td +++ b/llvm/lib/Target/VE/VVPInstrInfo.td @@ -18,7 +18,40 @@ // TODO: Explain how VVP nodes relate to VP SDNodes once VP ISel is upstream. //===----------------------------------------------------------------------===// -// Binary Operators { +// vvp_load(ptr, stride, mask, avl) +def SDTLoadVVP : SDTypeProfile<1, 4, [ + SDTCisVec<0>, + SDTCisPtrTy<1>, + SDTCisInt<2>, + SDTCisVec<3>, + IsVLVT<4> +]>; + +// vvp_store(data, ptr, stride, mask, avl) +def SDTStoreVVP: SDTypeProfile<0, 5, [ + SDTCisVec<0>, + SDTCisPtrTy<1>, + SDTCisInt<2>, + SDTCisVec<3>, + IsVLVT<4> +]>; + +// vvp_scatter(chain, data, addr, mask, avl) +def SDTScatterVVP: SDTypeProfile<0, 4, [ + SDTCisVec<0>, + SDTCisVec<1>, + SDTCisVec<2>, + SDTCisSameNumEltsAs<0, 2>, + IsVLVT<3> +]>; + +// vvp_gather(chain, addr, mask, avl) +def SDTGatherVVP: SDTypeProfile<1, 3, [ + SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameNumEltsAs<0, 2>, + IsVLVT<3> +]>; // BinaryOp(x,y,mask,vl) def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc. @@ -29,6 +62,15 @@ def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc. IsVLVT<4> ]>; +// UnaryFPOp(x,mask,vl) +def SDTFPUnaryOpVVP : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, + SDTCisFP<0>, + SDTCisInt<2>, + SDTCisSameNumEltsAs<0, 2>, + IsVLVT<3> +]>; + // BinaryFPOp(x,y,mask,vl) def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc. SDTCisSameAs<0, 1>, @@ -39,6 +81,17 @@ def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc. IsVLVT<4> ]>; +// TernaryFPOp(x,y,z,mask,vl) +def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [ + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisFP<0>, + SDTCisInt<4>, + SDTCisSameNumEltsAs<0, 4>, + IsVLVT<5> +]>; + // Select(OnTrue, OnFalse, SelMask, vl) def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge SDTCisVec<0>, @@ -48,6 +101,28 @@ def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge IsVLVT<4> ]>; +// SetCC (lhs, rhs, cc, mask, vl) +def SDTSetCCVVP : SDTypeProfile<1, 5, [ // vp_setcc + SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisSameAs<1, 2>, + SDTCisVT<3, OtherVT>, + SDTCisInt<4>, + SDTCisSameNumEltsAs<0, 4>, + IsVLVT<5> +]>; + +// vvp_reduce(vector, mask, vl) +def SDTReduceVVP : SDTypeProfile<1, 3, [ + SDTCisVec<1>, + SDTCisInt<2>, + SDTCisVec<2>, + SDTCisSameNumEltsAs<1,2>, + IsVLVT<3> +]>; + + // Binary operator commutative pattern. class vvp_commutative<SDNode RootOp> : PatFrags< @@ -55,6 +130,12 @@ class vvp_commutative<SDNode RootOp> : [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen), (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>; +class vvp_fma_commutative<SDNode RootOp> : + PatFrags< + (ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen), + [(RootOp node:$X, node:$Y, node:$Z, node:$mask, node:$vlen), + (RootOp node:$X, node:$Z, node:$Y, node:$mask, node:$vlen)]>; + // VVP node definitions.
def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>; def c_vvp_add : vvp_commutative; @@ -80,6 +161,8 @@ def vvp_srl : SDNode<"VEISD::VVP_SRL", SDTIntBinOpVVP>; def vvp_sra : SDNode<"VEISD::VVP_SRA", SDTIntBinOpVVP>; def vvp_shl : SDNode<"VEISD::VVP_SHL", SDTIntBinOpVVP>; +def vvp_fneg : SDNode<"VEISD::VVP_FNEG", SDTFPUnaryOpVVP>; + def vvp_fadd : SDNode<"VEISD::VVP_FADD", SDTFPBinOpVVP>; def c_vvp_fadd : vvp_commutative; def vvp_fsub : SDNode<"VEISD::VVP_FSUB", SDTFPBinOpVVP>; @@ -87,6 +170,30 @@ def vvp_fmul : SDNode<"VEISD::VVP_FMUL", SDTFPBinOpVVP>; def c_vvp_fmul : vvp_commutative; def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>; -// } Binary Operators +def vvp_ffma : SDNode<"VEISD::VVP_FFMA", SDTFPTernaryOpVVP>; +def c_vvp_ffma : vvp_fma_commutative; + +def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def vvp_gather : SDNode<"VEISD::VVP_GATHER", SDTGatherVVP, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand ]>; +def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Reductions + +// int reductions +def vvp_reduce_add : SDNode<"VEISD::VVP_REDUCE_ADD", SDTReduceVVP>; +def vvp_reduce_and : SDNode<"VEISD::VVP_REDUCE_AND", SDTReduceVVP>; +def vvp_reduce_or : SDNode<"VEISD::VVP_REDUCE_OR", SDTReduceVVP>; +def vvp_reduce_xor : SDNode<"VEISD::VVP_REDUCE_XOR", SDTReduceVVP>; +def vvp_reduce_smax : SDNode<"VEISD::VVP_REDUCE_SMAX", SDTReduceVVP>; + def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>; + +// setcc (lhs, rhs, cc, mask, vl) +def vvp_setcc : SDNode<"VEISD::VVP_SETCC", SDTSetCCVVP>; diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td index 74720fd1f419..33316ad054c6 100644 --- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td @@ -17,6 +17,167 @@ //===----------------------------------------------------------------------===// include "VVPInstrInfo.td" +multiclass VectorStore { + // Unmasked (imm stride). + def : Pat<(vvp_store + DataVT:$val, PtrVT:$addr, + (i64 simm7:$stride), (MaskVT true_mask), i32:$avl), + (!cast(STNoMask#"irvl") + (LO7 $stride), $addr, $val, $avl)>; + // Unmasked. + def : Pat<(vvp_store + DataVT:$val, PtrVT:$addr, + i64:$stride, (MaskVT true_mask), i32:$avl), + (!cast(STNoMask#"rrvl") + $stride, $addr, $val, $avl)>; + // Masked (imm stride). + def : Pat<(vvp_store + DataVT:$val, PtrVT:$addr, + (i64 simm7:$stride), MaskVT:$mask, i32:$avl), + (!cast(STWithMask#"irvml") + (LO7 $stride), $addr, $val, $mask, $avl)>; + // Masked. + def : Pat<(vvp_store + DataVT:$val, PtrVT:$addr, + i64:$stride, MaskVT:$mask, i32:$avl), + (!cast(STWithMask#"rrvml") + $stride, $addr, $val, $mask, $avl)>; +} + +defm : VectorStore; +defm : VectorStore; +defm : VectorStore; +defm : VectorStore; + +multiclass VectorLoad { + // Unmasked (imm stride). + def : Pat<(DataVT (vvp_load + PtrVT:$addr, (i64 simm7:$stride), + (MaskVT true_mask), i32:$avl)), + (!cast(LDNoMask#"irl") + (LO7 $stride), $addr, $avl)>; + // Unmasked. + def : Pat<(DataVT (vvp_load + PtrVT:$addr, i64:$stride, + (MaskVT true_mask), i32:$avl)), + (!cast(LDNoMask#"rrl") + $stride, PtrVT:$addr, $avl)>; + // Masked (imm stride). 
+ def : Pat<(DataVT (vvp_load + PtrVT:$addr, (i64 simm7:$stride), + MaskVT:$mask, i32:$avl)), + (!cast(GTWithMask#"vizml") + (VADDULrvml $addr, + (VMULULivml (LO7 $stride), (VSEQl $avl), $mask, $avl), + $mask, $avl), + 0, 0, + $mask, + $avl)>; + // Masked. + def : Pat<(DataVT (vvp_load + PtrVT:$addr, i64:$stride, MaskVT:$mask, i32:$avl)), + (!cast(GTWithMask#"vizml") + (VADDULrvml $addr, + (VMULULrvml $stride, (VSEQl $avl), $mask, $avl), + $mask, $avl), + 0, 0, + $mask, + $avl)>; +} + +defm : VectorLoad; +defm : VectorLoad; +defm : VectorLoad; +defm : VectorLoad; + +// Vector Gather and scatter +multiclass VectorGather { + // Unmasked. + def : Pat<(DataVT (vvp_gather + PtrVT:$addr, (MaskVT true_mask), i32:$avl)), + (!cast(GTPrefix#"vizl") $addr, 0, 0, $avl)>; + // Masked. + def : Pat<(DataVT (vvp_gather PtrVT:$addr, MaskVT:$mask, i32:$avl)), + (!cast(GTPrefix#"vizml") $addr, 0, 0, $mask, $avl)>; +} + +defm : VectorGather; +defm : VectorGather; +defm : VectorGather; +defm : VectorGather; + +multiclass VectorScatter { + // Unmasked. + def : Pat<(vvp_scatter + DataVT:$data, PtrVT:$addr, (MaskVT true_mask), i32:$avl), + (!cast(SCPrefix#"vizvl") $addr, 0, 0, $data, $avl)>; + // Masked. + def : Pat<(vvp_scatter + DataVT:$data, PtrVT:$addr, MaskVT:$mask, i32:$avl), + (!cast(SCPrefix#"vizvml") $addr, 0, 0, $data, $mask, $avl)>; +} + +defm : VectorScatter; +defm : VectorScatter; +defm : VectorScatter; +defm : VectorScatter; + + +/// FNEG { +// Directly modify the sign bit to flip the sign. + +// Set sign bits in a pack of <2 x f32>. +def packed_fneg_imm : OutPatFrag<(ins ), + (i64 (SLLri (i64 (ORim 1, (i32 32))), 31))>; + + +multiclass FNeg { + // Masked with select. + def : Pat<(vvp_select (vvp_fneg DataVT:$vx, (v256i1 srcvalue), (i32 srcvalue)), + DataVT:$vfalse, + v256i1:$mask, + i32:$avl), + (VXORmvml_v (i32 1), $vx, $mask, $avl, $vfalse)>; + + // Unmasked. + def : Pat<(vvp_fneg DataVT:$vx, (v256i1 true_mask), i32:$avl), + (VXORmvl (i32 1), $vx, $avl)>; + + // Masked. + def : Pat<(vvp_fneg DataVT:$vx, v256i1:$mask, i32:$avl), + (VXORmvml (i32 1), $vx, $mask, $avl)>; +} + +defm: FNeg; +defm: FNeg; + +///// Packed FNeg ///// + +// Masked with select. +def : Pat<(vvp_select (vvp_fneg v512f32:$vx, (v512i1 srcvalue), (i32 srcvalue)), + v512f32:$vfalse, + v512i1:$mask, + i32:$avl), + (v512f32 (PVXORrvml_v (packed_fneg_imm ), $vx, $mask, $avl, $vfalse))>; + +// Unmasked. +def : Pat<(vvp_fneg v512f32:$vx, (v512i1 true_mask), i32:$avl), + (v512f32 (PVXORrvl (packed_fneg_imm ), $vx, $avl))>; + +// Masked. +def : Pat<(vvp_fneg v512f32:$vx, v512i1:$mask, i32:$avl), + (v512f32 (PVXORrvml (packed_fneg_imm ), $vx, $mask, $avl))>; + +/// } FNEG + multiclass Binary_rv { @@ -237,6 +398,143 @@ defm : Binary_rv_vr_vv_ShortLong; +defm : Binary_rv_vv; +defm : Binary_rv_vv; +defm : Binary_rv_vv; + +defm : Binary_rv_vv; +defm : Binary_rv_vv; +defm : Binary_vr_vv; +defm : Binary_vr_vv; +defm : Binary_vr_vv; + +defm : Binary_rv_vv; +defm : Binary_rv_vv; +defm : Binary_rv_vv; + +multiclass Ternary_vvv< + SDPatternOperator OpNode, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with passthru. + def : Pat<(vvp_select + (OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz, + (MaskVT srcvalue), (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$avl), + (!cast(OpBaseName#"vvvml_v") + $vx, $vy, $vz, $mask, $avl, $vfalse)>; + + // Unmasked. + def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz, + (MaskVT true_mask), i32:$avl), + (!cast(OpBaseName#"vvvl") + $vx, $vy, $vz, $avl)>; + + // Masked. 
+ def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz, + MaskVT:$mask, i32:$avl), + (!cast(OpBaseName#"vvvml") + $vx, $vy, $vz, $mask, $avl)>; +} + +multiclass Ternary_rvv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with passthru, broadcast first. + def : Pat<(vvp_select + (OpNode + (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz, + (MaskVT srcvalue), (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$avl), + (!cast(OpBaseName#"rvvml_v") + $sx, $vy, $vz, $mask, $avl, $vfalse)>; + + // Unmasked, broadcast first. + def : Pat<(OpNode + (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz, + (MaskVT true_mask), i32:$avl), + (!cast(OpBaseName#"rvvl") + $sx, $vy, $vz, $avl)>; + + // Masked, broadcast first. + def : Pat<(OpNode + (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz, + MaskVT:$mask, i32:$avl), + (!cast(OpBaseName#"rvvml") + $sx, $vy, $vz, $mask, $avl)>; +} + +multiclass Ternary_vrv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with passthru, broadcast second. + def : Pat<(vvp_select + (OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz, + (MaskVT srcvalue), (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$avl), + (!cast(OpBaseName#"vrvml_v") + $vx, $sy, $vz, + $mask, $avl, $vfalse)>; + + // Unmasked, broadcast second. + def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz, + (MaskVT true_mask), i32:$avl), + (!cast(OpBaseName#"vrvl") + $vx, $sy, $vz, $avl)>; + + // Masked, broadcast second. + def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz, + MaskVT:$mask, i32:$avl), + (!cast(OpBaseName#"vrvml") + $vx, $sy, $vz, $mask, $avl)>; +} + +multiclass Ternary_rvv_vrv_vvv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + defm : Ternary_rvv; + defm : Ternary_vrv; + defm : Ternary_vvv; +} + +// Expand both 64bit and 32 bit variant (256 elements) +multiclass Ternary_ShortLong< + SDPatternOperator OpNode, + ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, + ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { + defm : Ternary_rvv_vrv_vvv; + defm : Ternary_rvv_vrv_vvv; +} + +defm : Ternary_ShortLong; +defm : Ternary_rvv_vrv_vvv; + multiclass Merge_mvv< SDPatternOperator OpNode, ValueType DataVT, ValueType MaskVT, @@ -268,3 +566,63 @@ defm : Merge_mvv_ShortLong; + +multiclass Set_CC { + // Unmasked. + def : Pat<(v256i1 (vvp_setcc + DataVT:$LHS, DataVT:$RHS, CCMatcher:$cond, (v256i1 true_mask), i32:$vl)), + (!cast(FmkBaseName#"vl") + (CCConv $cond), + (!cast(CmpBaseName#"vvl") + $LHS, $RHS, $vl), + $vl)>; + // Masked. + def : Pat<(v256i1 (vvp_setcc + DataVT:$LHS, DataVT:$RHS, CCMatcher:$cond, v256i1:$vm, i32:$vl)), + (!cast(FmkBaseName#"vml") + (CCConv $cond), + (!cast(CmpBaseName#"vvl") + $LHS, $RHS, $vl), + $vm, $vl)>; +} + +defm : Set_CC; +defm : Set_CC; +defm : Set_CC; + +defm : Set_CC; +defm : Set_CC; +defm : Set_CC; + +multiclass Reduce_GenericInt { + // Unmasked. + def : Pat <(ResVT (!cast("vvp_reduce_"#VVPRedOp) + VectorVT:$vx, (v256i1 true_mask), i32:$vl)), + (COPY_TO_REGCLASS + (!cast("LVSvi") + (!cast(RedInstName#"vl") $vx, $vl), 0), + ResRC)>; + + // Masked. 
+ def : Pat <(ResVT (!cast("vvp_reduce_"#VVPRedOp) + VectorVT:$vx, v256i1:$vm, i32:$vl)), + (COPY_TO_REGCLASS + (!cast("LVSvi") + (!cast(RedInstName#"vml") $vx, $vm, $vl), 0), + ResRC)>; +} + +multiclass IntReduce_ShortLong { + defm: Reduce_GenericInt; + defm: Reduce_GenericInt; + defm: Reduce_GenericInt; + defm: Reduce_GenericInt; + defm: Reduce_GenericInt; +} + +defm: IntReduce_ShortLong; +defm: IntReduce_ShortLong; diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def index 8000f84c5dbe..a60588672293 100644 --- a/llvm/lib/Target/VE/VVPNodes.def +++ b/llvm/lib/Target/VE/VVPNodes.def @@ -24,6 +24,14 @@ #define ADD_VVP_OP(X, Y) #endif +/// ADD_UNARY_VVP_OP(VVPNAME,SDNAME) +/// \p VVPName is a VVP Unary operator. +/// \p SDNAME is the generic SD opcode corresponding to \p VVPName. +#ifndef ADD_UNARY_VVP_OP +#define ADD_UNARY_VVP_OP(VVPNAME,SDNAME) \ + ADD_VVP_OP(VVPNAME,SDNAME) +#endif + /// ADD_BINARY_VVP_OP(VVPNAME,SDNAME) /// \p VVPName is a VVP Binary operator. /// \p SDNAME is the generic SD opcode corresponding to \p VVPName. @@ -33,38 +41,95 @@ HANDLE_VP_TO_VVP(VPNAME, VVPNAME) #endif +/// ADD_TERNARY_VVP_OP(VVPNAME,SDNAME) +/// \p VVPName is a VVP Ternary operator. +/// \p SDNAME is the generic SD opcode corresponding to \p VVPName. +#ifndef ADD_TERNARY_VVP_OP +#define ADD_TERNARY_VVP_OP(VVPNAME,SDNAME) \ + ADD_VVP_OP(VVPNAME,SDNAME) +#endif + #ifndef ADD_BINARY_VVP_OP_COMPACT #define ADD_BINARY_VVP_OP_COMPACT(NAME) \ ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME) #endif +/// REGISTER_PACKED(OPC) +/// \p OPC The VVP opcode of the operation. +#ifndef REGISTER_PACKED +#define REGISTER_PACKED(OPC) +#endif + +/// ADD_REDUCE_VVP_OP(OPC) +/// \p OPC The VVP opcode of the operation. +/// \p SDNAME The standard opcode of the operation. +#ifndef ADD_REDUCE_VVP_OP +#define ADD_REDUCE_VVP_OP(OPC, SDNAME) ADD_VVP_OP(OPC, SDNAME) +#endif + +// Scalar standard ISD to perform this reduction. +#ifndef HANDLE_VVP_REDUCE_TO_SCALAR +#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD) +#endif + +/// Reductions. +#define HELPER_REDUCTION(OPC, SCALAR_OPC) \ + ADD_REDUCE_VVP_OP(VVP_REDUCE_##OPC,VECREDUCE_##OPC) \ + HANDLE_VP_TO_VVP(VP_REDUCE_##OPC, VVP_REDUCE_##OPC) \ + HANDLE_VVP_REDUCE_TO_SCALAR(VVP_REDUCE_##OPC, SCALAR_OPC) + +HELPER_REDUCTION(ADD, ADD) +HELPER_REDUCTION(AND, AND) +HELPER_REDUCTION(OR, OR) +HELPER_REDUCTION(XOR, XOR) +HELPER_REDUCTION(SMAX, SMAX) + +#undef HELPER_REDUCTION + +ADD_VVP_OP(VVP_LOAD,LOAD) HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD) REGISTER_PACKED(VVP_LOAD) +ADD_VVP_OP(VVP_STORE,STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE) + +ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER) +ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER) + // Integer arithmetic. 
-ADD_BINARY_VVP_OP_COMPACT(ADD) -ADD_BINARY_VVP_OP_COMPACT(SUB) +ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD) +ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB) ADD_BINARY_VVP_OP_COMPACT(MUL) ADD_BINARY_VVP_OP_COMPACT(UDIV) ADD_BINARY_VVP_OP_COMPACT(SDIV) -ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) -ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) -ADD_BINARY_VVP_OP_COMPACT(SHL) +ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) REGISTER_PACKED(VVP_SRA) +ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) REGISTER_PACKED(VVP_SRL) +ADD_BINARY_VVP_OP_COMPACT(SHL) REGISTER_PACKED(VVP_SHL) -ADD_BINARY_VVP_OP_COMPACT(AND) -ADD_BINARY_VVP_OP_COMPACT(OR) -ADD_BINARY_VVP_OP_COMPACT(XOR) +ADD_BINARY_VVP_OP_COMPACT(AND) REGISTER_PACKED(VVP_AND) +ADD_BINARY_VVP_OP_COMPACT(OR) REGISTER_PACKED(VVP_OR) +ADD_BINARY_VVP_OP_COMPACT(XOR) REGISTER_PACKED(VVP_XOR) // FP arithmetic. -ADD_BINARY_VVP_OP_COMPACT(FADD) -ADD_BINARY_VVP_OP_COMPACT(FSUB) -ADD_BINARY_VVP_OP_COMPACT(FMUL) +ADD_UNARY_VVP_OP(VVP_FNEG, FNEG) HANDLE_VP_TO_VVP(VP_FNEG, VVP_FNEG) REGISTER_PACKED(VVP_FNEG) +ADD_BINARY_VVP_OP_COMPACT(FADD) REGISTER_PACKED(VVP_FADD) +ADD_BINARY_VVP_OP_COMPACT(FSUB) REGISTER_PACKED(VVP_FSUB) +ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL) ADD_BINARY_VVP_OP_COMPACT(FDIV) +ADD_TERNARY_VVP_OP(VVP_FFMA,FMA) HANDLE_VP_TO_VVP(VP_FMA, VVP_FFMA) REGISTER_PACKED(VVP_FFMA) + +ADD_VVP_OP(VVP_SETCC, SETCC) + // Shuffles. -ADD_VVP_OP(VVP_SELECT,VSELECT) +ADD_VVP_OP(VVP_SELECT,VSELECT) REGISTER_PACKED(VVP_SELECT) HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT) HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT) + #undef ADD_BINARY_VVP_OP +#undef ADD_TERNARY_VVP_OP +#undef ADD_UNARY_VVP_OP #undef ADD_BINARY_VVP_OP_COMPACT +#undef ADD_REDUCE_VVP_OP #undef ADD_VVP_OP #undef HANDLE_VP_TO_VVP +#undef HANDLE_VVP_REDUCE_TO_SCALAR +#undef REGISTER_PACKED diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 56689d3ee06b..7bafa53af2af 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCSectionWasm.h" @@ -374,7 +375,7 @@ public: auto Type = WebAssembly::parseType(Lexer.getTok().getString()); if (!Type) return error("unknown type: ", Lexer.getTok()); - Types.push_back(Type.getValue()); + Types.push_back(*Type); Parser.Lex(); if (!isNext(AsmToken::Comma)) break; @@ -670,11 +671,12 @@ public: } else { // Assume this identifier is a label. const MCExpr *Val; + SMLoc Start = Id.getLoc(); SMLoc End; if (Parser.parseExpression(Val, End)) return error("Cannot parse symbol: ", Lexer.getTok()); Operands.push_back(std::make_unique( - WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(), + WebAssemblyOperand::Symbol, Start, End, WebAssemblyOperand::SymOp{Val})); if (checkForP2AlignIfLoadStore(Operands, Name)) return true; @@ -815,8 +817,7 @@ public: // Now set this symbol with the correct type. auto WasmSym = cast(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - WasmSym->setGlobalType( - wasm::WasmGlobalType{uint8_t(Type.getValue()), Mutable}); + WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(*Type), Mutable}); // And emit the directive again. 
TOut.emitGlobalType(WasmSym); return expect(AsmToken::EndOfStatement, "EOL"); @@ -846,7 +847,7 @@ public: // symbol auto WasmSym = cast(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); - wasm::WasmTableType Type = {uint8_t(ElemType.getValue()), Limits}; + wasm::WasmTableType Type = {uint8_t(*ElemType), Limits}; WasmSym->setTableType(Type); TOut.emitTableType(WasmSym); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1016,7 +1017,7 @@ public: Inst.setOpcode(Opc64); } } - if (!SkipTypeCheck && TC.typeCheck(IDLoc, Inst)) + if (!SkipTypeCheck && TC.typeCheck(IDLoc, Inst, Operands)) return true; Out.emitInstruction(Inst, getSTI()); if (CurrentState == EndFunction) { @@ -1094,14 +1095,15 @@ public: auto *WS = getContext().getWasmSection(SecName, SectionKind::getText(), 0, Group, MCContext::GenericSectionID, nullptr); - getStreamer().SwitchSection(WS); + getStreamer().switchSection(WS); // Also generate DWARF for this section if requested. if (getContext().getGenDwarfForAssembly()) getContext().addGenDwarfSection(WS); } void onEndOfFunction(SMLoc ErrorLoc) { - TC.endOfFunction(ErrorLoc); + if (!SkipTypeCheck) + TC.endOfFunction(ErrorLoc); // Reset the type checker state. TC.Clear(); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 128ce5c4fec0..ec72c1de0503 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -86,14 +86,12 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc, Optional EVT) { if (Stack.empty()) { return typeError(ErrorLoc, - EVT.hasValue() - ? StringRef("empty stack while popping ") + - WebAssembly::typeToString(EVT.getValue()) - : StringRef( - "empty stack while popping value")); + EVT ? 
StringRef("empty stack while popping ") + + WebAssembly::typeToString(EVT.getValue()) + : StringRef("empty stack while popping value")); } auto PVT = Stack.pop_back_val(); - if (EVT.hasValue() && EVT.getValue() != PVT) { + if (EVT && EVT.getValue() != PVT) { return typeError( ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) + ", expected " + @@ -102,6 +100,19 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc, return false; } +bool WebAssemblyAsmTypeCheck::popRefType(SMLoc ErrorLoc) { + if (Stack.empty()) { + return typeError(ErrorLoc, StringRef("empty stack while popping reftype")); + } + auto PVT = Stack.pop_back_val(); + if (!WebAssembly::isRefType(PVT)) { + return typeError(ErrorLoc, StringRef("popped ") + + WebAssembly::typeToString(PVT) + + ", expected reftype"); + } + return false; +} + bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type) { auto Local = static_cast(Inst.getOperand(0).getImm()); @@ -160,7 +171,7 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, if (getSymRef(ErrorLoc, Inst, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); - switch (WasmSym->getType().getValueOr(wasm::WASM_SYMBOL_TYPE_DATA)) { + switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { case wasm::WASM_SYMBOL_TYPE_GLOBAL: Type = static_cast(WasmSym->getGlobalType().Type); break; @@ -182,6 +193,20 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, return false; } +bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, + wasm::ValType &Type) { + const MCSymbolRefExpr *SymRef; + if (getSymRef(ErrorLoc, Inst, SymRef)) + return true; + auto WasmSym = cast(&SymRef->getSymbol()); + if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != + wasm::WASM_SYMBOL_TYPE_TABLE) + return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + + " missing .tabletype"); + Type = static_cast(WasmSym->getTableType().ElemType); + return false; +} + bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) { // Check the return types. 
for (auto RVT : llvm::reverse(ReturnTypes)) { @@ -196,35 +221,58 @@ bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) { return false; } -bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) { +bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, + OperandVector &Operands) { auto Opc = Inst.getOpcode(); auto Name = GetMnemonic(Opc); dumpTypeStack("typechecking " + Name + ": "); wasm::ValType Type; if (Name == "local.get") { - if (getLocal(ErrorLoc, Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) return true; Stack.push_back(Type); } else if (Name == "local.set") { - if (getLocal(ErrorLoc, Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "local.tee") { - if (getLocal(ErrorLoc, Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) return true; if (popType(ErrorLoc, Type)) return true; Stack.push_back(Type); } else if (Name == "global.get") { - if (getGlobal(ErrorLoc, Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) return true; Stack.push_back(Type); } else if (Name == "global.set") { - if (getGlobal(ErrorLoc, Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + return true; + if (popType(ErrorLoc, Type)) + return true; + } else if (Name == "table.get") { + if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + return true; + if (popType(ErrorLoc, wasm::ValType::I32)) + return true; + Stack.push_back(Type); + } else if (Name == "table.set") { + if (getTable(Operands[1]->getStartLoc(), Inst, Type)) return true; if (popType(ErrorLoc, Type)) return true; + if (popType(ErrorLoc, wasm::ValType::I32)) + return true; + } else if (Name == "table.fill") { + if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + return true; + if (popType(ErrorLoc, wasm::ValType::I32)) + return true; + if (popType(ErrorLoc, Type)) + return true; + if (popType(ErrorLoc, wasm::ValType::I32)) + return true; } else if (Name == "drop") { if (popType(ErrorLoc, {})) return true; @@ -245,33 +293,36 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) { return true; } else if (Name == "call" || Name == "return_call") { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); auto Sig = WasmSym->getSignature(); if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_FUNCTION) - return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + - " missing .functype"); + return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + + WasmSym->getName() + + " missing .functype"); if (checkSig(ErrorLoc, *Sig)) return true; if (Name == "return_call" && endOfFunction(ErrorLoc)) return true; } else if (Name == "catch") { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) return true; const auto *WasmSym = cast(&SymRef->getSymbol()); const auto *Sig = WasmSym->getSignature(); if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_TAG) - return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + - " missing .tagtype"); + return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + + WasmSym->getName() + + " missing .tagtype"); // catch instruction pushes values whose types are specified in the tag's // "params" part Stack.insert(Stack.end(), 
Sig->Params.begin(), Sig->Params.end()); - } else if (Name == "ref.null") { - auto VT = static_cast(Inst.getOperand(0).getImm()); - Stack.push_back(VT); } else if (Name == "unreachable") { Unreachable = true; + } else if (Name == "ref.is_null") { + if (popRefType(ErrorLoc)) + return true; + Stack.push_back(wasm::ValType::I32); } else { // The current instruction is a stack instruction which doesn't have // explicit operands that indicate push/pop types, so we get those from diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index 2b07faf67a18..3be966b5739c 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -16,9 +16,10 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_ASMPARSER_TYPECHECK_H #define LLVM_LIB_TARGET_WEBASSEMBLY_ASMPARSER_TYPECHECK_H -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/BinaryFormat/Wasm.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCSymbol.h" namespace llvm { @@ -38,12 +39,14 @@ class WebAssemblyAsmTypeCheck final { void dumpTypeStack(Twine Msg); bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, Optional EVT); + bool popRefType(SMLoc ErrorLoc); bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, const MCSymbolRefExpr *&SymRef); bool getGlobal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getTable(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); public: WebAssemblyAsmTypeCheck(MCAsmParser &Parser, const MCInstrInfo &MII, bool is64); @@ -52,7 +55,7 @@ public: void localDecl(const SmallVector &Locals); void setLastSig(const wasm::WasmSignature &Sig) { LastSig = Sig; } bool endOfFunction(SMLoc ErrorLoc); - bool typeCheck(SMLoc ErrorLoc, const MCInst &Inst); + bool typeCheck(SMLoc ErrorLoc, const MCInst &Inst, OperandVector &Operands); void Clear() { Stack.clear(); diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 5d38145559da..ae65a9dc2a4e 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -17,8 +17,8 @@ #include "TargetInfo/WebAssemblyTargetInfo.h" #include "Utils/WebAssemblyTypeUtilities.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index d8122950e061..5727708a84ad 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -52,6 +52,4 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T, // we make sure this info is set correctly. 
if (WebAssembly::WasmEnableEH || WebAssembly::WasmEnableSjLj) ExceptionsType = ExceptionHandling::Wasm; - - // TODO: UseIntegratedAssembler? } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 8f670ec88897..f52545a65dbb 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -62,7 +62,6 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, } static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo & /*MRI*/, MCContext &Ctx) { return createWebAssemblyMCCodeEmitter(MCII); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index 397b9b0ee9da..2da219d54c73 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -58,8 +58,6 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef Types) { } } -void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } - void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) { assert(Sym->isFunction()); OS << "\t.functype\t" << Sym->getName() << " "; @@ -136,10 +134,6 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef Types) { } } -void WebAssemblyTargetWasmStreamer::emitEndFunc() { - llvm_unreachable(".end_func is not needed for direct wasm output"); -} - void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) { llvm_unreachable(".indidx encoding not yet implemented"); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index c0ad63c8dd50..522f6356c28b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -32,8 +32,6 @@ public: /// .local virtual void emitLocal(ArrayRef Types) = 0; - /// .endfunc - virtual void emitEndFunc() = 0; /// .functype virtual void emitFunctionType(const MCSymbolWasm *Sym) = 0; /// .indidx @@ -66,7 +64,6 @@ public: WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); void emitLocal(ArrayRef Types) override; - void emitEndFunc() override; void emitFunctionType(const MCSymbolWasm *Sym) override; void emitIndIdx(const MCExpr *Value) override; void emitGlobalType(const MCSymbolWasm *Sym) override; @@ -83,7 +80,6 @@ public: explicit WebAssemblyTargetWasmStreamer(MCStreamer &S); void emitLocal(ArrayRef Types) override; - void emitEndFunc() override; void emitFunctionType(const MCSymbolWasm *Sym) override {} void emitIndIdx(const MCExpr *Value) override; void emitGlobalType(const MCSymbolWasm *Sym) override {} @@ -104,7 +100,6 @@ public: : WebAssemblyTargetStreamer(S) {} void emitLocal(ArrayRef) override {} - void emitEndFunc() override {} void emitFunctionType(const MCSymbolWasm *) override {} void emitIndIdx(const MCExpr *) override {} void emitGlobalType(const MCSymbolWasm *) override {} diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index cdb95d48398d..8fc67d37925c 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ 
-80,6 +80,10 @@ inline bool isRefType(const Type *Ty) { return isFuncrefType(Ty) || isExternrefType(Ty); } +inline bool isRefType(wasm::ValType Type) { + return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF; +} + // Convert StringRef to ValType / HeapType / BlockType Optional<wasm::ValType> parseType(StringRef Type); diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h index 803786e0c9c2..aee8f160f38d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -26,7 +26,6 @@ class FunctionPass; // LLVM IR passes. ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(); -ModulePass *createWebAssemblyLowerGlobalDtors(); ModulePass *createWebAssemblyAddMissingPrototypes(); ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); @@ -41,7 +40,6 @@ FunctionPass *createWebAssemblySetP2AlignOperands(); // Late passes. FunctionPass *createWebAssemblyReplacePhysRegs(); FunctionPass *createWebAssemblyNullifyDebugValueLists(); -FunctionPass *createWebAssemblyPrepareForLiveIntervals(); FunctionPass *createWebAssemblyOptimizeLiveIntervals(); FunctionPass *createWebAssemblyMemIntrinsicResults(); FunctionPass *createWebAssemblyRegStackify(); @@ -61,14 +59,12 @@ ModulePass *createWebAssemblyMCLowerPrePass(); // PassRegistry initialization declarations. void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &); void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &); -void initializeLowerGlobalDtorsPass(PassRegistry &); void initializeFixFunctionBitcastsPass(PassRegistry &); void initializeOptimizeReturnedPass(PassRegistry &); void initializeWebAssemblyArgumentMovePass(PassRegistry &); void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &); void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &); void initializeWebAssemblyNullifyDebugValueListsPass(PassRegistry &); -void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &); void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &); void initializeWebAssemblyMemIntrinsicResultsPass(PassRegistry &); void initializeWebAssemblyRegStackifyPass(PassRegistry &); diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index a529c6217189..b83dcf3a8e65 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -67,6 +67,10 @@ def FeatureReferenceTypes : SubtargetFeature<"reference-types", "HasReferenceTypes", "true", "Enable reference types">; +def FeatureExtendedConst : + SubtargetFeature<"extended-const", "HasExtendedConst", "true", + "Enable extended const expressions">; + //===----------------------------------------------------------------------===// // Architectures.
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index bf326e5106be..57d51634e849 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -180,30 +180,30 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { MCSymbolWasm *Sym = cast<MCSymbolWasm>(getSymbol(GV)); if (!Sym->getType()) { - const WebAssemblyTargetLowering &TLI = *Subtarget->getTargetLowering(); SmallVector<MVT, 1> VTs; Type *GlobalVT = GV->getValueType(); - computeLegalValueVTs(TLI, GV->getParent()->getContext(), - GV->getParent()->getDataLayout(), GlobalVT, VTs); + if (Subtarget) { + // Subtarget is only set when a function is defined, because + // each function can declare a different subtarget. For example, + // on ARM a compilation unit might have a function on ARM and + // another on Thumb. Therefore only if Subtarget is non-null we + // can actually calculate the legal VTs. + const WebAssemblyTargetLowering &TLI = *Subtarget->getTargetLowering(); + computeLegalValueVTs(TLI, GV->getParent()->getContext(), + GV->getParent()->getDataLayout(), GlobalVT, VTs); + } WebAssembly::wasmSymbolSetType(Sym, GlobalVT, VTs); } - // If the GlobalVariable refers to a table, we handle it here instead of - // in emitExternalDecls if (Sym->isTable()) { - getTargetStreamer()->emitTableType(Sym); - return; - } - emitVisibility(Sym, GV->getVisibility(), !GV->isDeclaration()); + emitSymbolType(Sym); if (GV->hasInitializer()) { assert(getSymbolPreferLocal(*GV) == Sym); emitLinkage(GV, Sym); - getTargetStreamer()->emitGlobalType(Sym); OutStreamer->emitLabel(Sym); // TODO: Actually emit the initializer value. Otherwise the global has the // default value for its type (0, ref.null, etc). - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } } @@ -211,7 +211,7 @@ MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) { auto *WasmSym = cast<MCSymbolWasm>(GetExternalSymbolSymbol(Name)); // May be called multiple times, so early out. - if (WasmSym->getType().hasValue()) + if (WasmSym->getType()) return WasmSym; const WebAssemblySubtarget &Subtarget = getSubtarget(); @@ -271,31 +271,52 @@ MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) { return WasmSym; } -void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { +void WebAssemblyAsmPrinter::emitSymbolType(const MCSymbolWasm *Sym) { + Optional<wasm::WasmSymbolType> WasmTy = Sym->getType(); + if (!WasmTy) + return; + + switch (*WasmTy) { + case wasm::WASM_SYMBOL_TYPE_GLOBAL: + getTargetStreamer()->emitGlobalType(Sym); + break; + case wasm::WASM_SYMBOL_TYPE_TAG: + getTargetStreamer()->emitTagType(Sym); + break; + case wasm::WASM_SYMBOL_TYPE_TABLE: + getTargetStreamer()->emitTableType(Sym); + break; + default: + break; // We only handle globals, tags and tables here + } +} + +void WebAssemblyAsmPrinter::emitDecls(const Module &M) { if (signaturesEmitted) return; signaturesEmitted = true; // Normally symbols for globals get discovered as the MI gets lowered, - // but we need to know about them ahead of time. + // but we need to know about them ahead of time. This will, however, + // only find symbols that have been used. Unused symbols from globals will + // not be found here.
MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo(); for (const auto &Name : MMIW.MachineSymbolsUsed) { - getOrCreateWasmSymbol(Name.getKey()); + auto *WasmSym = cast(getOrCreateWasmSymbol(Name.getKey())); + if (WasmSym->isFunction()) { + // TODO(wvo): is there any case where this overlaps with the call to + // emitFunctionType in the loop below? + getTargetStreamer()->emitFunctionType(WasmSym); + } } for (auto &It : OutContext.getSymbols()) { - // Emit .globaltype, .tagtype, or .tabletype declarations. + // Emit .globaltype, .tagtype, or .tabletype declarations for extern + // declarations, i.e. those that have only been declared (but not defined) + // in the current module auto Sym = cast(It.getValue()); - if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) { - // .globaltype already handled by emitGlobalVariable for defined - // variables; here we make sure the types of external wasm globals get - // written to the file. - if (Sym->isUndefined()) - getTargetStreamer()->emitGlobalType(Sym); - } else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_TAG) - getTargetStreamer()->emitTagType(Sym); - else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_TABLE) - getTargetStreamer()->emitTableType(Sym); + if (!Sym->isDefined()) + emitSymbolType(Sym); } DenseSet InvokeSymbols; @@ -303,55 +324,56 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { if (F.isIntrinsic()) continue; - // Emit function type info for all undefined functions - if (F.isDeclarationForLinker()) { - SmallVector Results; - SmallVector Params; - computeSignatureVTs(F.getFunctionType(), &F, F, TM, Params, Results); - // At this point these MCSymbols may or may not have been created already - // and thus also contain a signature, but we need to get the signature - // anyway here in case it is an invoke that has not yet been created. We - // will discard it later if it turns out not to be necessary. - auto Signature = signatureFromMVTs(Results, Params); - bool InvokeDetected = false; - auto *Sym = getMCSymbolForFunction( - &F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, - Signature.get(), InvokeDetected); - - // Multiple functions can be mapped to the same invoke symbol. For - // example, two IR functions '__invoke_void_i8*' and '__invoke_void_i32' - // are both mapped to '__invoke_vi'. We keep them in a set once we emit an - // Emscripten EH symbol so we don't emit the same symbol twice. - if (InvokeDetected && !InvokeSymbols.insert(Sym).second) - continue; + // Emit function type info for all functions. This will emit duplicate + // information for defined functions (which already have function type + // info emitted alongside their definition), but this is necessary in + // order to enable the single-pass WebAssemblyAsmTypeCheck to succeed. + SmallVector Results; + SmallVector Params; + computeSignatureVTs(F.getFunctionType(), &F, F, TM, Params, Results); + // At this point these MCSymbols may or may not have been created already + // and thus also contain a signature, but we need to get the signature + // anyway here in case it is an invoke that has not yet been created. We + // will discard it later if it turns out not to be necessary. + auto Signature = signatureFromMVTs(Results, Params); + bool InvokeDetected = false; + auto *Sym = getMCSymbolForFunction( + &F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, + Signature.get(), InvokeDetected); + + // Multiple functions can be mapped to the same invoke symbol. 
For + // example, two IR functions '__invoke_void_i8*' and '__invoke_void_i32' + // are both mapped to '__invoke_vi'. We keep them in a set once we emit an + // Emscripten EH symbol so we don't emit the same symbol twice. + if (InvokeDetected && !InvokeSymbols.insert(Sym).second) + continue; - Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); - if (!Sym->getSignature()) { - Sym->setSignature(Signature.get()); - addSignature(std::move(Signature)); - } else { - // This symbol has already been created and had a signature. Discard it. - Signature.reset(); - } + Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + if (!Sym->getSignature()) { + Sym->setSignature(Signature.get()); + addSignature(std::move(Signature)); + } else { + // This symbol has already been created and had a signature. Discard it. + Signature.reset(); + } - getTargetStreamer()->emitFunctionType(Sym); + getTargetStreamer()->emitFunctionType(Sym); - if (F.hasFnAttribute("wasm-import-module")) { - StringRef Name = - F.getFnAttribute("wasm-import-module").getValueAsString(); - Sym->setImportModule(storeName(Name)); - getTargetStreamer()->emitImportModule(Sym, Name); - } - if (F.hasFnAttribute("wasm-import-name")) { - // If this is a converted Emscripten EH/SjLj symbol, we shouldn't use - // the original function name but the converted symbol name. - StringRef Name = - InvokeDetected - ? Sym->getName() - : F.getFnAttribute("wasm-import-name").getValueAsString(); - Sym->setImportName(storeName(Name)); - getTargetStreamer()->emitImportName(Sym, Name); - } + if (F.hasFnAttribute("wasm-import-module")) { + StringRef Name = + F.getFnAttribute("wasm-import-module").getValueAsString(); + Sym->setImportModule(storeName(Name)); + getTargetStreamer()->emitImportModule(Sym, Name); + } + if (F.hasFnAttribute("wasm-import-name")) { + // If this is a converted Emscripten EH/SjLj symbol, we shouldn't use + // the original function name but the converted symbol name. + StringRef Name = + InvokeDetected + ? Sym->getName() + : F.getFnAttribute("wasm-import-name").getValueAsString(); + Sym->setImportName(storeName(Name)); + getTargetStreamer()->emitImportName(Sym, Name); } if (F.hasFnAttribute("wasm-export-name")) { @@ -362,9 +384,12 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { } } } - + void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) { - emitExternalDecls(M); + // This is required to emit external declarations (like .functypes) when + // no functions are defined in the compilation unit and therefore, + // emitDecls() is not called until now. + emitDecls(M); // When a function's address is taken, a TABLE_INDEX relocation is emitted // against the function symbol at the use site. However the relocation @@ -401,13 +426,13 @@ void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) { if (!Name || !Contents) continue; - OutStreamer->PushSection(); + OutStreamer->pushSection(); std::string SectionName = (".custom_section." 
+ Name->getString()).str(); MCSectionWasm *MySection = OutContext.getWasmSection(SectionName, SectionKind::getMetadata()); - OutStreamer->SwitchSection(MySection); + OutStreamer->switchSection(MySection); OutStreamer->emitBytes(Contents->getString()); - OutStreamer->PopSection(); + OutStreamer->popSection(); } } @@ -445,8 +470,8 @@ void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) { if (FieldCount != 0) { MCSectionWasm *Producers = OutContext.getWasmSection( ".custom_section.producers", SectionKind::getMetadata()); - OutStreamer->PushSection(); - OutStreamer->SwitchSection(Producers); + OutStreamer->pushSection(); + OutStreamer->switchSection(Producers); OutStreamer->emitULEB128IntValue(FieldCount); for (auto &Producers : {std::make_pair("language", &Languages), std::make_pair("processed-by", &Tools)}) { @@ -462,7 +487,7 @@ void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) { OutStreamer->emitBytes(Producer.second); } } - OutStreamer->PopSection(); + OutStreamer->popSection(); } } @@ -518,8 +543,8 @@ void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) { // Emit features and linkage policies into the "target_features" section MCSectionWasm *FeaturesSection = OutContext.getWasmSection( ".custom_section.target_features", SectionKind::getMetadata()); - OutStreamer->PushSection(); - OutStreamer->SwitchSection(FeaturesSection); + OutStreamer->pushSection(); + OutStreamer->switchSection(FeaturesSection); OutStreamer->emitULEB128IntValue(EmittedFeatures.size()); for (auto &F : EmittedFeatures) { @@ -528,10 +553,11 @@ void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) { OutStreamer->emitBytes(F.Name); } - OutStreamer->PopSection(); + OutStreamer->popSection(); } void WebAssemblyAsmPrinter::emitConstantPool() { + emitDecls(*MMI->getModule()); assert(MF->getConstantPool()->getConstants().empty() && "WebAssembly disables constant pools"); } @@ -540,17 +566,6 @@ void WebAssemblyAsmPrinter::emitJumpTableInfo() { // Nothing to do; jump tables are incorporated into the instruction stream. } -void WebAssemblyAsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *Sym) - const { - AsmPrinter::emitLinkage(GV, Sym); - // This gets called before the function label and type are emitted. - // We use it to emit signatures of external functions. - // FIXME casts! - const_cast(this) - ->emitExternalDecls(*MMI->getModule()); -} - - void WebAssemblyAsmPrinter::emitFunctionBodyStart() { const Function &F = MF->getFunction(); SmallVector ResultVTs; @@ -612,7 +627,7 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) { // function body. 
if (isVerbose()) { OutStreamer->AddComment("fallthrough-return"); - OutStreamer->AddBlankLine(); + OutStreamer->addBlankLine(); } break; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h index 6b2f2000a0bd..65d6ee415180 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h @@ -66,10 +66,10 @@ public: void emitEndOfAsmFile(Module &M) override; void EmitProducerInfo(Module &M); void EmitTargetFeatures(Module &M); + void emitSymbolType(const MCSymbolWasm *Sym); void emitGlobalVariable(const GlobalVariable *GV) override; void emitJumpTableInfo() override; void emitConstantPool() override; - void emitLinkage(const GlobalValue *, MCSymbol *) const override; void emitFunctionBodyStart() override; void emitInstruction(const MachineInstr *MI) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, @@ -84,7 +84,7 @@ public: wasm::WasmSignature *Sig, bool &InvokeDetected); MCSymbol *getOrCreateWasmSymbol(StringRef Name); - void emitExternalDecls(const Module &M); + void emitDecls(const Module &M); }; } // end namespace llvm diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 17e867e4c7d8..02e873a0f9a6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1716,7 +1716,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { // Rewrite MBB operands to be depth immediates. SmallVector Ops(MI.operands()); while (MI.getNumOperands() > 0) - MI.RemoveOperand(MI.getNumOperands() - 1); + MI.removeOperand(MI.getNumOperands() - 1); for (auto MO : Ops) { if (MO.isMBB()) { if (MI.getOpcode() == WebAssembly::DELEGATE) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp index b94981245f8b..81fe5395a6de 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp @@ -14,6 +14,7 @@ #include "WebAssemblyExceptionInfo.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "Utils/WebAssemblyUtilities.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineDominanceFrontier.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp index 5bdec89f1125..fa5b4a508fa5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp @@ -130,7 +130,7 @@ MachineBasicBlock *fixBrTableDefault(MachineInstr &MI, MachineBasicBlock *MBB, return nullptr; // Remove the dummy default target and install the real one. 
- MI.RemoveOperand(MI.getNumExplicitOperands() - 1); + MI.removeOperand(MI.getNumExplicitOperands() - 1); MI.addOperand(MF, MachineOperand::CreateMBB(TBB)); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp index 1ceae59dc993..83e71d731bfa 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp @@ -55,6 +55,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblySubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -221,10 +222,8 @@ private: assert(!Enterers.count(MBB)); if (Blocks.insert(MBB).second) { for (auto *Pred : MBB->predecessors()) { - if (!AddedToWorkList.count(Pred)) { + if (AddedToWorkList.insert(Pred).second) WorkList.push_back(Pred); - AddedToWorkList.insert(Pred); - } } } } @@ -491,6 +490,46 @@ FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() { return new WebAssemblyFixIrreducibleControlFlow(); } +// Test whether the given register has an ARGUMENT def. +static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) { + for (const auto &Def : MRI.def_instructions(Reg)) + if (WebAssembly::isArgument(Def.getOpcode())) + return true; + return false; +} + +// Add a register definition with IMPLICIT_DEFs for every register to cover for +// register uses that don't have defs in every possible path. +// TODO: This is fairly heavy-handed; find a better approach. +static void addImplicitDefs(MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const auto &TII = *MF.getSubtarget().getInstrInfo(); + MachineBasicBlock &Entry = *MF.begin(); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { + Register Reg = Register::index2VirtReg(I); + + // Skip unused registers. + if (MRI.use_nodbg_empty(Reg)) + continue; + + // Skip registers that have an ARGUMENT definition. + if (hasArgumentDef(Reg, MRI)) + continue; + + BuildMI(Entry, Entry.begin(), DebugLoc(), + TII.get(WebAssembly::IMPLICIT_DEF), Reg); + } + + // Move ARGUMENT_* instructions to the top of the entry block, so that their + // liveness reflects the fact that these really are live-in values. + for (MachineInstr &MI : llvm::make_early_inc_range(Entry)) { + if (WebAssembly::isArgument(MI.getOpcode())) { + MI.removeFromParent(); + Entry.insert(Entry.begin(), &MI); + } + } +} + bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction( MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n" @@ -505,8 +544,15 @@ bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction( if (LLVM_UNLIKELY(processRegion(&*MF.begin(), AllBlocks, MF))) { // We rewrote part of the function; recompute relevant things. - MF.getRegInfo().invalidateLiveness(); MF.RenumberBlocks(); + // Now we've inserted dispatch blocks, some register uses can have incoming + // paths without a def. For example, before this pass register %a was + // defined in BB1 and used in BB2, and there was only one path from BB1 and + // BB2. But if this pass inserts a dispatch block having multiple + // predecessors between the two BBs, now there are paths to BB2 without + // visiting BB1, and %a's use in BB2 is not dominated by its def. Adding + // IMPLICIT_DEFs to all regs is one simple way to fix it. 
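The dominance hazard that the comment above describes can be reproduced outside LLVM. Below is a small self-contained C++ sketch (illustrative only: integer ids stand in for basic blocks, and the dominates() helper is invented for this example rather than taken from LLVM's MachineFunction API):

#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <vector>

// A block `Def` dominates a block `Use` iff every path from `Entry` to
// `Use` passes through `Def`. We test that by pretending `Def` has been
// removed and asking whether `Use` is still reachable from `Entry`.
static bool dominates(const std::map<int, std::vector<int>> &CFG,
                      int Entry, int Def, int Use) {
  std::set<int> Seen{Def}; // treat `Def` as deleted
  std::function<bool(int)> Reach = [&](int BB) {
    if (BB == Use)
      return true;
    if (!Seen.insert(BB).second)
      return false;
    auto It = CFG.find(BB);
    if (It == CFG.end())
      return false;
    for (int Succ : It->second)
      if (Reach(Succ))
        return true;
    return false;
  };
  return !Reach(Entry);
}

int main() {
  // Before: entry(0) -> BB1(1) -> BB2(2); %a is defined in BB1, used in BB2.
  std::map<int, std::vector<int>> Before{{0, {1}}, {1, {2}}};
  // After a dispatch block (3) is inserted: entry -> dispatch -> {BB1, BB2}.
  std::map<int, std::vector<int>> After{{0, {3}}, {3, {1, 2}}, {1, {2}}};
  std::cout << dominates(Before, 0, 1, 2) << '\n'; // 1: use is dominated
  std::cout << dominates(After, 0, 1, 2) << '\n';  // 0: bypass via dispatch
}

In the Before graph every path to BB2 runs through BB1, so BB1's def of %a dominates the use; once the dispatch block offers a direct edge to BB2, it no longer does. An IMPLICIT_DEF in the entry block, as arranged by addImplicitDefs() above, puts a def on every path and restores the property.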
+ addImplicitDefs(MF); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index a221f37cfd94..2636acaf1604 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -19,6 +19,8 @@ #include "WebAssemblySubtarget.h" #include "WebAssemblyTargetMachine.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -159,22 +161,17 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTargetDAGCombine(ISD::VECTOR_SHUFFLE); // Combine extends of extract_subvectors into widening ops - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND}); // Combine int_to_fp or fp_extend of extract_vectors and vice versa into // conversion ops - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); + setTargetDAGCombine({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_EXTEND, + ISD::EXTRACT_SUBVECTOR}); // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa // into conversion ops - setTargetDAGCombine(ISD::FP_TO_SINT_SAT); - setTargetDAGCombine(ISD::FP_TO_UINT_SAT); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, + ISD::FP_ROUND, ISD::CONCAT_VECTORS}); setTargetDAGCombine(ISD::TRUNCATE); @@ -577,7 +574,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, // Move the function pointer to the end of the arguments for indirect calls if (IsIndirect) { auto FnPtr = CallParams.getOperand(0); - CallParams.RemoveOperand(0); + CallParams.removeOperand(0); // For funcrefs, call_indirect is done through __funcref_call_table and the // funcref is always installed in slot 0 of the table, therefore instead of having @@ -909,6 +906,30 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts( + SDValue Op, const TargetLoweringOpt &TLO) const { + // ISel process runs DAGCombiner after legalization; this step is called + // SelectionDAG optimization phase. This post-legalization combining process + // runs DAGCombiner on each node, and if there was a change to be made, + // re-runs legalization again on it and its user nodes to make sure + // everything is in a legalized state. + // + // The legalization calls lowering routines, and we do our custom lowering for + // build_vectors (LowerBUILD_VECTOR), which converts undef vector elements + // into zeros. But there is a set of routines in DAGCombiner that turns unused + // (= not demanded) nodes into undef, among which SimplifyDemandedVectorElts + // turns unused vector elements into undefs. But this routine does not work + // with our custom LowerBUILD_VECTOR, which turns undefs into zeros. This + // combination can result in an infinite loop, in which undefs are converted to + // zeros in legalization and back to undefs in combining.
+ // + // So after DAG is legalized, we prevent SimplifyDemandedVectorElts from + // running for build_vectors. + if (Op.getOpcode() == ISD::BUILD_VECTOR && TLO.LegalOps && TLO.LegalTys) + return false; + return true; +} + //===----------------------------------------------------------------------===// // WebAssembly Lowering private implementation. //===----------------------------------------------------------------------===// @@ -2110,8 +2131,7 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, auto GetMostCommon = [](auto &Counts) { auto CommonIt = - std::max_element(Counts.begin(), Counts.end(), - [](auto A, auto B) { return A.second < B.second; }); + std::max_element(Counts.begin(), Counts.end(), llvm::less_second()); assert(CommonIt != Counts.end() && "Unexpected all-undef build_vector"); return *CommonIt; }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index f7b460f61dbb..d86f2e59e3d2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -113,6 +113,10 @@ private: report_fatal_error("llvm.clear_cache is not supported on wasm"); } + bool + shouldSimplifyDemandedVectorElts(SDValue Op, + const TargetLoweringOpt &TLO) const override; + // Custom lowering hooks. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 42183d1645e1..ed80ed39f09c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -15,7 +15,7 @@ let UseNamedOperandTable = 1 in multiclass ATOMIC_I pattern_r, string asmstr_r, string asmstr_s, bits<32> atomic_op, - string is64 = "false"> { + bit is64 = false> { defm "" : I, Requires<[HasAtomics]>; @@ -38,13 +38,13 @@ defm MEMORY_ATOMIC_NOTIFY_A32 : (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count), (outs), (ins P2Align:$p2align, offset32_op:$off), [], "memory.atomic.notify \t$dst, ${off}(${addr})${p2align}, $count", - "memory.atomic.notify \t${off}${p2align}", 0x00, "false">; + "memory.atomic.notify \t${off}${p2align}", 0x00, false>; defm MEMORY_ATOMIC_NOTIFY_A64 : ATOMIC_I<(outs I32:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$count), (outs), (ins P2Align:$p2align, offset64_op:$off), [], "memory.atomic.notify \t$dst, ${off}(${addr})${p2align}, $count", - "memory.atomic.notify \t${off}${p2align}", 0x00, "true">; + "memory.atomic.notify \t${off}${p2align}", 0x00, true>; let mayLoad = 1 in { defm MEMORY_ATOMIC_WAIT32_A32 : ATOMIC_I<(outs I32:$dst), @@ -52,28 +52,28 @@ defm MEMORY_ATOMIC_WAIT32_A32 : I64:$timeout), (outs), (ins P2Align:$p2align, offset32_op:$off), [], "memory.atomic.wait32 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "memory.atomic.wait32 \t${off}${p2align}", 0x01, "false">; + "memory.atomic.wait32 \t${off}${p2align}", 0x01, false>; defm MEMORY_ATOMIC_WAIT32_A64 : ATOMIC_I<(outs I32:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$exp, I64:$timeout), (outs), (ins P2Align:$p2align, offset64_op:$off), [], "memory.atomic.wait32 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "memory.atomic.wait32 \t${off}${p2align}", 0x01, "true">; + "memory.atomic.wait32 \t${off}${p2align}", 0x01, true>; defm MEMORY_ATOMIC_WAIT64_A32 : ATOMIC_I<(outs I32:$dst), (ins 
P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp, I64:$timeout), (outs), (ins P2Align:$p2align, offset32_op:$off), [], "memory.atomic.wait64 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "memory.atomic.wait64 \t${off}${p2align}", 0x02, "false">; + "memory.atomic.wait64 \t${off}${p2align}", 0x02, false>; defm MEMORY_ATOMIC_WAIT64_A64 : ATOMIC_I<(outs I32:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I64:$exp, I64:$timeout), (outs), (ins P2Align:$p2align, offset64_op:$off), [], "memory.atomic.wait64 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "memory.atomic.wait64 \t${off}${p2align}", 0x02, "true">; + "memory.atomic.wait64 \t${off}${p2align}", 0x02, true>; } // mayLoad = 1 } // hasSideEffects = 1 @@ -469,13 +469,13 @@ multiclass WebAssemblyBinRMW; + !strconcat(name, "\t${off}${p2align}"), atomic_op, false>; defm "_A64" : ATOMIC_I<(outs rc:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, rc:$val), (outs), (ins P2Align:$p2align, offset64_op:$off), [], !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $val"), - !strconcat(name, "\t${off}${p2align}"), atomic_op, "true">; + !strconcat(name, "\t${off}${p2align}"), atomic_op, true>; } defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW; @@ -767,14 +767,14 @@ multiclass WebAssemblyTerRMW; + !strconcat(name, "\t${off}${p2align}"), atomic_op, false>; defm "_A64" : ATOMIC_I<(outs rc:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, rc:$exp, rc:$new_), (outs), (ins P2Align:$p2align, offset64_op:$off), [], !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new_"), - !strconcat(name, "\t${off}${p2align}"), atomic_op, "true">; + !strconcat(name, "\t${off}${p2align}"), atomic_op, true>; } defm ATOMIC_RMW_CMPXCHG_I32 : diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 4dc0c9a46c38..f2e73dd19d6b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -14,12 +14,12 @@ // WebAssembly Instruction Format. // We instantiate 2 of these for every actual instruction (register based // and stack based), see below. -class WebAssemblyInst inst, string asmstr, string stack, string is64> +class WebAssemblyInst inst, string asmstr, bit stack, bit is64> : StackRel, RegisterRel, Wasm64Rel, Instruction { bits<32> Inst = inst; // Instruction encoding. - string StackBased = stack; + bit StackBased = stack; string BaseName = NAME; - string IsWasm64 = is64; + bit IsWasm64 = is64; string Wasm32Name = !subst("_A64", "_A32", NAME); let Namespace = "WebAssembly"; let Pattern = []; @@ -30,8 +30,8 @@ class WebAssemblyInst inst, string asmstr, string stack, string is64> } // Normal instructions. Default instantiation of a WebAssemblyInst. -class NI pattern, string stack, - string asmstr = "", bits<32> inst = -1, string is64 = "false"> +class NI pattern, bit stack, + string asmstr = "", bits<32> inst = -1, bit is64 = false> : WebAssemblyInst { dag OutOperandList = oops; dag InOperandList = iops; @@ -54,11 +54,11 @@ class NI pattern, string stack, // there is always an equivalent pair of instructions. multiclass I pattern_r, string asmstr_r = "", string asmstr_s = "", - bits<32> inst = -1, string is64 = "false"> { + bits<32> inst = -1, bit is64 = false> { let isCodeGenOnly = 1 in - def "" : NI; + def "" : NI; let BaseName = NAME in - def _S : NI; + def _S : NI; } // For instructions that have no register ops, so both sets are the same. 
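For context on how the StackBased and IsWasm64 fields just converted to bit are consumed: TableGen compiles each InstrMapping record (see the WebAssemblyInstrInfo.td hunk below, updated for the new 0/1 key encoding) into a generated lookup function in the WebAssembly namespace. A hedged sketch of typical usage follows; toStackForm() is an invented helper, and the exact parameter type of the generated function may differ between LLVM versions:

#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // declares the generated mappings
#include <cassert>

// getStackOpcode() is generated from the getStackOpcode InstrMapping and
// returns -1 for instructions that have no stack-based counterpart.
static unsigned toStackForm(unsigned RegOpc) {
  int StackOpc = llvm::WebAssembly::getStackOpcode(RegOpc);
  assert(StackOpc != -1 && "register-based opcode has no stack-based twin");
  return static_cast<unsigned>(StackOpc);
}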
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 3fb0af1d47a0..134a0efc6822 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -66,6 +66,10 @@ def HasReferenceTypes : Predicate<"Subtarget->hasReferenceTypes()">, AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">; +def HasExtendedConst : + Predicate<"Subtarget->hasExtendedConst()">, + AssemblerPredicate<(all_of FeatureExtendedConst), "extended-const">; + //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Node Types. //===----------------------------------------------------------------------===// @@ -221,8 +225,8 @@ def getStackOpcode : InstrMapping { let FilterClass = "StackRel"; let RowFields = ["BaseName"]; let ColFields = ["StackBased"]; - let KeyCol = ["false"]; - let ValueCols = [["true"]]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; } //===----------------------------------------------------------------------===// @@ -234,8 +238,8 @@ def getRegisterOpcode : InstrMapping { let FilterClass = "RegisterRel"; let RowFields = ["BaseName"]; let ColFields = ["StackBased"]; - let KeyCol = ["true"]; - let ValueCols = [["false"]]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; } //===----------------------------------------------------------------------===// @@ -247,8 +251,8 @@ def getWasm64Opcode : InstrMapping { let FilterClass = "Wasm64Rel"; let RowFields = ["Wasm32Name"]; let ColFields = ["IsWasm64"]; - let KeyCol = ["false"]; - let ValueCols = [["true"]]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index a70f62dde845..d5bb9e9e48b4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -47,13 +47,13 @@ multiclass WebAssemblyLoad, + !strconcat(Name, "\t${off}${p2align}"), Opcode, false>, Requires; defm "_A64": I<(outs rc:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr), (outs), (ins P2Align:$p2align, offset64_op:$off), [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"), - !strconcat(Name, "\t${off}${p2align}"), Opcode, "true">, + !strconcat(Name, "\t${off}${p2align}"), Opcode, true>, Requires; } } @@ -244,7 +244,7 @@ multiclass WebAssemblyStore, + !strconcat(Name, "\t${off}${p2align}"), Opcode, false>, Requires; let mayStore = 1, UseNamedOperandTable = 1 in defm "_A64" : I<(outs), @@ -252,7 +252,7 @@ multiclass WebAssemblyStore, + !strconcat(Name, "\t${off}${p2align}"), Opcode, true>, Requires; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 76a88caafc47..608963d58863 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -27,6 +27,12 @@ multiclass REF_I { vt#".select\t$dst, $lhs, $rhs, $cond", vt#".select", 0x1b>, Requires<[HasReferenceTypes]>; + defm REF_IS_NULL_#rc + : I<(outs I32:$dst), (ins rc:$ref), (outs), (ins), + [(set I32:$dst, (!cast("int_wasm_ref_is_null_" # ht) rc:$ref))], + "ref.is_null\t$ref", + "ref.is_null", 0xd1>, + Requires<[HasReferenceTypes]>; } defm "" : REF_I; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td 
b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 5bb12c7fbdc7..ed3cc7ed1c53 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1229,9 +1229,9 @@ def trunc_sat_zero_s : SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_S", trunc_sat_zero_t>; def trunc_sat_zero_u : SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_U", trunc_sat_zero_t>; -defm "" : SIMDConvert; -defm "" : SIMDConvert; // Integer to floating point: convert @@ -1307,7 +1307,7 @@ defm "" : SIMDConvert, SDTCisVec<1>]>; def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>; defm "" : SIMDConvert; + "demote_f64x2_zero", 0x5e>; def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>; @@ -1334,7 +1334,37 @@ defm Q15MULR_SAT_S : SIMDBinary; //===----------------------------------------------------------------------===// -// Fused Multiply- Add and Subtract (FMA/FMS) +// Relaxed swizzle +//===----------------------------------------------------------------------===// + +defm RELAXED_SWIZZLE : + RELAXED_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins), + [(set (v16i8 V128:$dst), + (int_wasm_relaxed_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))], + "i8x16.relaxed_swizzle\t$dst, $src, $mask", "i8x16.relaxed_swizzle", 0x100>; + +//===----------------------------------------------------------------------===// +// Relaxed floating-point to int conversions +//===----------------------------------------------------------------------===// + +multiclass RelaxedConvert simdop> { + defm op#_#vec : + RELAXED_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), + [(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))], + vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>; +} + +defm "" : RelaxedConvert; +defm "" : RelaxedConvert; +defm "" : RelaxedConvert; +defm "" : RelaxedConvert; + +//===----------------------------------------------------------------------===// +// Relaxed Fused Multiply- Add and Subtract (FMA/FMS) //===----------------------------------------------------------------------===// multiclass SIMDFM simdopA, bits<32> simdopS> { @@ -1342,16 +1372,18 @@ multiclass SIMDFM simdopA, bits<32> simdopS> { RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec.vt V128:$dst), (int_wasm_fma (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".fma\t$dst, $a, $b, $c", vec.prefix#".fma", simdopA>; + vec.prefix#".relaxed_fma\t$dst, $a, $b, $c", + vec.prefix#".relaxed_fma", simdopA>; defm FMS_#vec : RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec.vt V128:$dst), (int_wasm_fms (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".fms\t$dst, $a, $b, $c", vec.prefix#".fms", simdopS>; + vec.prefix#".relaxed_fms\t$dst, $a, $b, $c", + vec.prefix#".relaxed_fms", simdopS>; } -defm "" : SIMDFM; -defm "" : SIMDFM; +defm "" : SIMDFM; +defm "" : SIMDFM; //===----------------------------------------------------------------------===// // Laneselect @@ -1362,58 +1394,61 @@ multiclass SIMDLANESELECT op> { RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec.vt V128:$dst), (int_wasm_laneselect (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".laneselect\t$dst, $a, $b, $c", vec.prefix#".laneselect", op>; + vec.prefix#".relaxed_laneselect\t$dst, $a, $b, $c", + vec.prefix#".relaxed_laneselect", op>; } -defm "" : 
SIMDLANESELECT; -defm "" : SIMDLANESELECT; -defm "" : SIMDLANESELECT; -defm "" : SIMDLANESELECT; - - -//===----------------------------------------------------------------------===// -// Relaxed swizzle -//===----------------------------------------------------------------------===// - -defm RELAXED_SWIZZLE : - RELAXED_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins), - [(set (v16i8 V128:$dst), - (int_wasm_relaxed_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))], - "i8x16.relaxed_swizzle\t$dst, $src, $mask", "i8x16.relaxed_swizzle", 162>; +defm "" : SIMDLANESELECT; +defm "" : SIMDLANESELECT; +defm "" : SIMDLANESELECT; +defm "" : SIMDLANESELECT; //===----------------------------------------------------------------------===// // Relaxed floating-point min and max. //===----------------------------------------------------------------------===// -multiclass SIMD_RELAXED_FMINMAX simdopMin, bits<32> simdopMax> { - defm RELAXED_FMIN_#vec : - RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b), (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_relaxed_min - (vec.vt V128:$a), (vec.vt V128:$b)))], - vec.prefix#".relaxed_min\t$dst, $a, $b", vec.prefix#".relaxed_min", simdopMin>; - defm RELAXED_FMAX_#vec : - RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b), (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_relaxed_max - (vec.vt V128:$a), (vec.vt V128:$b)))], - vec.prefix#".relaxed_max\t$dst, $a, $b", vec.prefix#".relaxed_max", simdopMax>; +multiclass RelaxedBinary simdop> { + defm _#vec : RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), + (outs), (ins), + [(set (vec.vt V128:$dst), + (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))], + vec.prefix#"."#name#"\t$dst, $lhs, $rhs", + vec.prefix#"."#name, simdop>; } -defm "" : SIMD_RELAXED_FMINMAX; -defm "" : SIMD_RELAXED_FMINMAX; +defm SIMD_RELAXED_FMIN : + RelaxedBinary; +defm SIMD_RELAXED_FMAX : + RelaxedBinary; +defm SIMD_RELAXED_FMIN : + RelaxedBinary; +defm SIMD_RELAXED_FMAX : + RelaxedBinary; //===----------------------------------------------------------------------===// -// Relaxed floating-point to int conversions +// Relaxed rounding q15 multiplication //===----------------------------------------------------------------------===// -multiclass SIMD_RELAXED_CONVERT simdop> { - defm op#_#vec : - RELAXED_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), - [(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))], - vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>; -} +defm RELAXED_Q15MULR_S : + RelaxedBinary; -defm "" : SIMD_RELAXED_CONVERT; -defm "" : SIMD_RELAXED_CONVERT; +//===----------------------------------------------------------------------===// +// Relaxed integer dot product +//===----------------------------------------------------------------------===// -defm "" : SIMD_RELAXED_CONVERT; -defm "" : SIMD_RELAXED_CONVERT; +defm RELAXED_DOT : + RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), + [(set (v8i16 V128:$dst), (int_wasm_dot_i8x16_i7x16_signed + (v16i8 V128:$lhs), (v16i8 V128:$rhs)))], + "i16x8.dot_i8x16_i7x16_s\t$dst, $lhs, $rhs", + "i16x8.dot_i8x16_i7x16_s", 0x112>; + +defm RELAXED_DOT_ADD : + RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc), + (outs), (ins), + [(set (v4i32 V128:$dst), (int_wasm_dot_i8x16_i7x16_add_signed + (v16i8 V128:$lhs), (v16i8 V128:$rhs), (v4i32 V128:$acc)))], + "i32x4.dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc", + "i32x4.dot_i8x16_i7x16_add_s", 0x113>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp 
b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index 309fcaf340eb..d16bb6b6648a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -16,6 +16,7 @@ #include "WebAssembly.h" #include "WebAssemblySubtarget.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/MC/MCAsmInfo.h" @@ -72,9 +73,8 @@ WebAssemblyLateEHPrepare::getMatchingEHPad(MachineInstr *MI) { MachineBasicBlock *EHPad = nullptr; while (!WL.empty()) { MachineBasicBlock *MBB = WL.pop_back_val(); - if (Visited.count(MBB)) + if (!Visited.insert(MBB).second) continue; - Visited.insert(MBB); if (MBB->isEHPad()) { if (EHPad && EHPad != MBB) return nullptr; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index b6c43be03aba..2db4bd822349 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -406,8 +406,9 @@ static bool canThrow(const Value *V) { return true; } -// Get a global variable with the given name. If it doesn't exist declare it, -// which will generate an import and assume that it will exist at link time. +// Get a thread-local global variable with the given name. If it doesn't exist +// declare it, which will generate an import and assume that it will exist at +// link time. static GlobalVariable *getGlobalVariable(Module &M, Type *Ty, WebAssemblyTargetMachine &TM, const char *Name) { @@ -415,16 +416,11 @@ static GlobalVariable *getGlobalVariable(Module &M, Type *Ty, if (!GV) report_fatal_error(Twine("unable to create global: ") + Name); - // If the target supports TLS, make this variable thread-local. We can't just - // unconditionally make it thread-local and depend on - // CoalesceFeaturesAndStripAtomics to downgrade it, because stripping TLS has - // the side effect of disallowing the object from being linked into a - // shared-memory module, which we don't want to be responsible for. - auto *Subtarget = TM.getSubtargetImpl(); - auto TLS = Subtarget->hasAtomics() && Subtarget->hasBulkMemory() - ? GlobalValue::LocalExecTLSModel - : GlobalValue::NotThreadLocal; - GV->setThreadLocalMode(TLS); + // Variables created by this function are thread local. If the target does not + // support TLS, we depend on CoalesceFeaturesAndStripAtomics to downgrade it + // to non-thread-local ones, in which case we don't allow this object to be + // linked with other objects using shared memory. 
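A minimal sketch of the find-or-declare-and-mark-TLS pattern this function implements, through the ordinary IR API (hedged: declareHelper is an invented name, and the real getGlobalVariable() reports a fatal error rather than asserting). The helpers this pass declares this way include __THREW__ and __threwValue:

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include <cassert>

using namespace llvm;

// Find-or-declare a helper global and mark it thread-local up front; on
// targets without atomics + bulk-memory, CoalesceFeaturesAndStripAtomics
// later downgrades the TLS mode again.
static GlobalVariable *declareHelper(Module &M, Type *Ty, const char *Name) {
  auto *GV = dyn_cast_or_null<GlobalVariable>(M.getOrInsertGlobal(Name, Ty));
  assert(GV && "helper symbol already exists with a different kind");
  GV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel);
  return GV;
}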
+ GV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); return GV; } @@ -556,7 +552,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { Optional<unsigned> NEltArg; std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs(); SizeArg += 1; - if (NEltArg.hasValue()) + if (NEltArg) NEltArg = NEltArg.getValue() + 1; FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); } @@ -1064,22 +1060,16 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { nullifySetjmp(F); } - if (!Changed) { - // Delete unused global variables and functions - if (ResumeF) - ResumeF->eraseFromParent(); - if (EHTypeIDF) - EHTypeIDF->eraseFromParent(); - if (EmLongjmpF) - EmLongjmpF->eraseFromParent(); - if (SaveSetjmpF) - SaveSetjmpF->eraseFromParent(); - if (TestSetjmpF) - TestSetjmpF->eraseFromParent(); - return false; - } + // Delete unused global variables and functions + for (auto *V : {ThrewGV, ThrewValueGV}) + if (V && V->use_empty()) + V->eraseFromParent(); + for (auto *V : {GetTempRet0F, SetTempRet0F, ResumeF, EHTypeIDF, EmLongjmpF, + SaveSetjmpF, TestSetjmpF, WasmLongjmpF, CatchF}) + if (V && V->use_empty()) + V->eraseFromParent(); - return true; + return Changed; } bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { @@ -1324,9 +1314,14 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { BasicBlock *BB = CB->getParent(); if (BB->getParent() != &F) // in other function continue; - if (CB->getOperandBundle(LLVMContext::OB_funclet)) - report_fatal_error( - "setjmp within a catch clause is not supported in Wasm EH"); + if (CB->getOperandBundle(LLVMContext::OB_funclet)) { + std::string S; + raw_string_ostream SS(S); + SS << "In function " + F.getName() + + ": setjmp within a catch clause is not supported in Wasm EH:\n"; + SS << *CB; + report_fatal_error(StringRef(SS.str())); + } CallInst *CI = nullptr; // setjmp cannot throw. So if it is an invoke, lower it to a call @@ -1502,10 +1497,16 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForEmscriptenSjLj( for (unsigned I = 0; I < BBs.size(); I++) { BasicBlock *BB = BBs[I]; for (Instruction &I : *BB) { - if (isa<InvokeInst>(&I)) - report_fatal_error("When using Wasm EH with Emscripten SjLj, there is " - "a restriction that `setjmp` function call and " - "exception cannot be used within the same function"); + if (isa<InvokeInst>(&I)) { + std::string S; + raw_string_ostream SS(S); + SS << "In function " << F.getName() + << ": When using Wasm EH with Emscripten SjLj, there is a " + "restriction that `setjmp` function call and exception cannot be " + "used within the same function:\n"; + SS << I; + report_fatal_error(StringRef(SS.str())); + } auto *CI = dyn_cast<CallInst>(&I); if (!CI) continue; @@ -1829,7 +1830,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( if (auto *CPI = dyn_cast<CatchPadInst>(FromPad)) { UnwindDest = CPI->getCatchSwitch()->getUnwindDest(); break; - } else if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) { + } + if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) { // getCleanupRetUnwindDest() can return nullptr when // 1. This cleanuppad's matching cleanupret unwinds to caller // 2.
There is no matching cleanupret because it ends with diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp deleted file mode 100644 index ca6f3f194645..000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===-- WebAssemblyLowerGlobalDtors.cpp - Lower @llvm.global_dtors --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Lower @llvm.global_dtors. -/// -/// WebAssembly doesn't have a builtin way to invoke static destructors. -/// Implement @llvm.global_dtors by creating wrapper functions that are -/// registered in @llvm.global_ctors and which contain a call to -/// `__cxa_atexit` to register their destructor functions. -/// -//===----------------------------------------------------------------------===// - -#include "WebAssembly.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "wasm-lower-global-dtors" - -namespace { -class LowerGlobalDtors final : public ModulePass { - StringRef getPassName() const override { - return "WebAssembly Lower @llvm.global_dtors"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - ModulePass::getAnalysisUsage(AU); - } - - bool runOnModule(Module &M) override; - -public: - static char ID; - LowerGlobalDtors() : ModulePass(ID) {} -}; -} // End anonymous namespace - -char LowerGlobalDtors::ID = 0; -INITIALIZE_PASS(LowerGlobalDtors, DEBUG_TYPE, - "Lower @llvm.global_dtors for WebAssembly", false, false) - -ModulePass *llvm::createWebAssemblyLowerGlobalDtors() { - return new LowerGlobalDtors(); -} - -bool LowerGlobalDtors::runOnModule(Module &M) { - LLVM_DEBUG(dbgs() << "********** Lower Global Destructors **********\n"); - - GlobalVariable *GV = M.getGlobalVariable("llvm.global_dtors"); - if (!GV || !GV->hasInitializer()) - return false; - - const ConstantArray *InitList = dyn_cast(GV->getInitializer()); - if (!InitList) - return false; - - // Validate @llvm.global_dtor's type. - auto *ETy = dyn_cast(InitList->getType()->getElementType()); - if (!ETy || ETy->getNumElements() != 3 || - !ETy->getTypeAtIndex(0U)->isIntegerTy() || - !ETy->getTypeAtIndex(1U)->isPointerTy() || - !ETy->getTypeAtIndex(2U)->isPointerTy()) - return false; // Not (int, ptr, ptr). - - // Collect the contents of @llvm.global_dtors, ordered by priority. Within a - // priority, sequences of destructors with the same associated object are - // recorded so that we can register them as a group. - std::map< - uint16_t, - std::vector>> - > DtorFuncs; - for (Value *O : InitList->operands()) { - auto *CS = dyn_cast(O); - if (!CS) - continue; // Malformed. - - auto *Priority = dyn_cast(CS->getOperand(0)); - if (!Priority) - continue; // Malformed. - uint16_t PriorityValue = Priority->getLimitedValue(UINT16_MAX); - - Constant *DtorFunc = CS->getOperand(1); - if (DtorFunc->isNullValue()) - break; // Found a null terminator, skip the rest. 
- - Constant *Associated = CS->getOperand(2); - Associated = cast(Associated->stripPointerCasts()); - - auto &AtThisPriority = DtorFuncs[PriorityValue]; - if (AtThisPriority.empty() || AtThisPriority.back().first != Associated) { - std::vector NewList; - NewList.push_back(DtorFunc); - AtThisPriority.push_back(std::make_pair(Associated, NewList)); - } else { - AtThisPriority.back().second.push_back(DtorFunc); - } - } - if (DtorFuncs.empty()) - return false; - - // extern "C" int __cxa_atexit(void (*f)(void *), void *p, void *d); - LLVMContext &C = M.getContext(); - PointerType *VoidStar = Type::getInt8PtrTy(C); - Type *AtExitFuncArgs[] = {VoidStar}; - FunctionType *AtExitFuncTy = - FunctionType::get(Type::getVoidTy(C), AtExitFuncArgs, - /*isVarArg=*/false); - - FunctionCallee AtExit = M.getOrInsertFunction( - "__cxa_atexit", - FunctionType::get(Type::getInt32Ty(C), - {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar}, - /*isVarArg=*/false)); - - // Declare __dso_local. - Constant *DsoHandle = M.getNamedValue("__dso_handle"); - if (!DsoHandle) { - Type *DsoHandleTy = Type::getInt8Ty(C); - GlobalVariable *Handle = new GlobalVariable( - M, DsoHandleTy, /*isConstant=*/true, - GlobalVariable::ExternalWeakLinkage, nullptr, "__dso_handle"); - Handle->setVisibility(GlobalVariable::HiddenVisibility); - DsoHandle = Handle; - } - - // For each unique priority level and associated symbol, generate a function - // to call all the destructors at that level, and a function to register the - // first function with __cxa_atexit. - for (auto &PriorityAndMore : DtorFuncs) { - uint16_t Priority = PriorityAndMore.first; - uint64_t Id = 0; - auto &AtThisPriority = PriorityAndMore.second; - for (auto &AssociatedAndMore : AtThisPriority) { - Constant *Associated = AssociatedAndMore.first; - auto ThisId = Id++; - - Function *CallDtors = Function::Create( - AtExitFuncTy, Function::PrivateLinkage, - "call_dtors" + - (Priority != UINT16_MAX ? (Twine(".") + Twine(Priority)) - : Twine()) + - (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId) - : Twine()) + - (!Associated->isNullValue() ? (Twine(".") + Associated->getName()) - : Twine()), - &M); - BasicBlock *BB = BasicBlock::Create(C, "body", CallDtors); - FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C), - /*isVarArg=*/false); - - for (auto Dtor : reverse(AssociatedAndMore.second)) - CallInst::Create(VoidVoid, Dtor, "", BB); - ReturnInst::Create(C, BB); - - Function *RegisterCallDtors = Function::Create( - VoidVoid, Function::PrivateLinkage, - "register_call_dtors" + - (Priority != UINT16_MAX ? (Twine(".") + Twine(Priority)) - : Twine()) + - (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId) - : Twine()) + - (!Associated->isNullValue() ? (Twine(".") + Associated->getName()) - : Twine()), - &M); - BasicBlock *EntryBB = BasicBlock::Create(C, "entry", RegisterCallDtors); - BasicBlock *FailBB = BasicBlock::Create(C, "fail", RegisterCallDtors); - BasicBlock *RetBB = BasicBlock::Create(C, "return", RegisterCallDtors); - - Value *Null = ConstantPointerNull::get(VoidStar); - Value *Args[] = {CallDtors, Null, DsoHandle}; - Value *Res = CallInst::Create(AtExit, Args, "call", EntryBB); - Value *Cmp = new ICmpInst(*EntryBB, ICmpInst::ICMP_NE, Res, - Constant::getNullValue(Res->getType())); - BranchInst::Create(FailBB, RetBB, Cmp, EntryBB); - - // If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave. - // This should be very rare, because if the process is running out of - // memory before main has even started, something is wrong. 
- CallInst::Create(Intrinsic::getDeclaration(&M, Intrinsic::trap), "", - FailBB); - new UnreachableInst(C, FailBB); - - ReturnInst::Create(C, RetBB); - - // Now register the registration function with @llvm.global_ctors. - appendToGlobalCtors(M, RegisterCallDtors, Priority, Associated); - } - } - - // Now that we've lowered everything, remove @llvm.global_dtors. - GV->eraseFromParent(); - - return true; -} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp index 37ac8e75f4b7..21f6fd37d402 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp @@ -65,6 +65,9 @@ ModulePass *llvm::createWebAssemblyMCLowerPrePass() { // for all functions before AsmPrinter. If this way of doing things is ever // suboptimal, we could opt to make it a MachineFunctionPass and instead use // something like createBarrierNoopPass() to enforce ordering. +// +// The information stored here is essential for emitExternalDecls in the Wasm +// AsmPrinter. bool WebAssemblyMCLowerPrePass::runOnModule(Module &M) { auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>(); if (!MMIWP) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index ea80e96d50de..96284687971c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -24,6 +24,16 @@ using namespace llvm; WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. +MachineFunctionInfo *WebAssemblyFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const { + WebAssemblyFunctionInfo *Clone = + DestMF.cloneInfo<WebAssemblyFunctionInfo>(*this); + Clone->MF = &DestMF; + return Clone; +} + void WebAssemblyFunctionInfo::initWARegs(MachineRegisterInfo &MRI) { assert(WARegs.empty()); unsigned Reg = UnusedReg; @@ -153,7 +163,7 @@ void WebAssemblyFunctionInfo::initializeBaseYamlFields( addResult(WebAssembly::parseMVT(VT.Value)); if (WasmEHInfo) { for (auto KV : YamlMFI.SrcToUnwindDest) - WasmEHInfo->setUnwindDest(MF.getBlockNumbered(KV.first), - MF.getBlockNumbered(KV.second)); + WasmEHInfo->setUnwindDest(MF->getBlockNumbered(KV.first), - MF->getBlockNumbered(KV.second)); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 413d0d1dc554..619617049bb2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -31,7 +31,7 @@ struct WebAssemblyFunctionInfo; /// This class is derived from MachineFunctionInfo and contains private /// WebAssembly-specific information for each MachineFunction.
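The clone() override added above follows the generic recipe for the MachineFunctionInfo cloning support introduced in this import; sketched for a hypothetical target (the essential step is re-seating the cached back-pointer, which is why the MF member becomes a pointer in the header change below):

    MachineFunctionInfo *MyTargetFunctionInfo::clone(
        BumpPtrAllocator &Allocator, MachineFunction &DestMF,
        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
        const {
      // cloneInfo<T> copy-constructs this info inside DestMF's allocator.
      auto *Clone = DestMF.cloneInfo<MyTargetFunctionInfo>(*this);
      Clone->MF = &DestMF; // re-point the cached owner at the new function
      return Clone;
    }
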
class WebAssemblyFunctionInfo final : public MachineFunctionInfo { - const MachineFunction &MF; + const MachineFunction *MF; std::vector<MVT> Params; std::vector<MVT> Results; @@ -70,11 +70,16 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo { WasmEHFuncInfo *WasmEHInfo = nullptr; public: - explicit WebAssemblyFunctionInfo(MachineFunction &MF) - : MF(MF), WasmEHInfo(MF.getWasmEHFuncInfo()) {} + explicit WebAssemblyFunctionInfo(MachineFunction &MF_) + : MF(&MF_), WasmEHInfo(MF_.getWasmEHFuncInfo()) {} ~WebAssemblyFunctionInfo() override; - const MachineFunction &getMachineFunction() const { return MF; } + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const override; + + const MachineFunction &getMachineFunction() const { return *MF; } void initializeBaseYamlFields(const yaml::WebAssemblyFunctionInfo &YamlMFI); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp index 62fa089a94d4..5d8c58dcc334 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyNullifyDebugValueLists.cpp @@ -16,6 +16,7 @@ #include "WebAssembly.h" #include "WebAssemblySubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; #define DEBUG_TYPE "wasm-nullify-dbg-value-lists" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index 6a6cac6d956f..d542ddb45c2e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -49,6 +49,11 @@ class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::TracksLiveness); + } + bool runOnMachineFunction(MachineFunction &MF) override; public: @@ -102,7 +107,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( SplitLIs.clear(); } - // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF + // In FixIrreducibleControlFlow, we conservatively inserted IMPLICIT_DEF // instructions to satisfy LiveIntervals' requirement that all uses be // dominated by defs. Now that LiveIntervals has computed which of these // defs are actually needed and which are dead, remove the dead ones. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp deleted file mode 100644 index 5682cadc1a64..000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Fix up code to meet LiveInterval's requirements. -/// -/// Some CodeGen passes don't preserve LiveInterval's requirements, because -/// they run after register allocation and it isn't important.
However, -/// WebAssembly runs LiveIntervals in a late pass. This pass transforms code -/// to meet LiveIntervals' requirements; primarily, it ensures that all -/// virtual register uses have definitions (IMPLICIT_DEF definitions if -/// nothing else). -/// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" -#include "Utils/WebAssemblyUtilities.h" -#include "WebAssembly.h" -#include "WebAssemblyMachineFunctionInfo.h" -#include "WebAssemblySubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "wasm-prepare-for-live-intervals" - -namespace { -class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass { -public: - static char ID; // Pass identification, replacement for typeid - WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {} - -private: - StringRef getPassName() const override { - return "WebAssembly Prepare For LiveIntervals"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; -} // end anonymous namespace - -char WebAssemblyPrepareForLiveIntervals::ID = 0; -INITIALIZE_PASS(WebAssemblyPrepareForLiveIntervals, DEBUG_TYPE, - "Fix up code for LiveIntervals", false, false) - -FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() { - return new WebAssemblyPrepareForLiveIntervals(); -} - -// Test whether the given register has an ARGUMENT def. -static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) { - for (const auto &Def : MRI.def_instructions(Reg)) - if (WebAssembly::isArgument(Def.getOpcode())) - return true; - return false; -} - -bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( - MachineFunction &MF) { - LLVM_DEBUG({ - dbgs() << "********** Prepare For LiveIntervals **********\n" - << "********** Function: " << MF.getName() << '\n'; - }); - - bool Changed = false; - MachineRegisterInfo &MRI = MF.getRegInfo(); - const auto &TII = *MF.getSubtarget().getInstrInfo(); - MachineBasicBlock &Entry = *MF.begin(); - - assert(!mustPreserveAnalysisID(LiveIntervalsID) && - "LiveIntervals shouldn't be active yet!"); - - // We don't preserve SSA form. - MRI.leaveSSA(); - - // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF - // instructions. LiveIntervals requires that all paths to virtual register - // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to - // conservatively satisfy this. - // - // TODO: This is fairly heavy-handed; find a better approach. - // - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - Register Reg = Register::index2VirtReg(I); - - // Skip unused registers. - if (MRI.use_nodbg_empty(Reg)) - continue; - - // Skip registers that have an ARGUMENT definition. - if (hasArgumentDef(Reg, MRI)) - continue; - - BuildMI(Entry, Entry.begin(), DebugLoc(), - TII.get(WebAssembly::IMPLICIT_DEF), Reg); - Changed = true; - } - - // Move ARGUMENT_* instructions to the top of the entry block, so that their - // liveness reflects the fact that these really are live-in values. 
- for (MachineInstr &MI : llvm::make_early_inc_range(Entry)) { - if (WebAssembly::isArgument(MI.getOpcode())) { - MI.removeFromParent(); - Entry.insert(Entry.begin(), &MI); - } - } - - // Ok, we're now ready to run the LiveIntervals analysis again. - MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness); - - return Changed; -} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp index 71f0bd28e1be..1e2bee7a5c73 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp @@ -72,9 +72,6 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) { assert(!mustPreserveAnalysisID(LiveIntervalsID) && "LiveIntervals shouldn't be active yet!"); - // We don't preserve SSA or liveness. - MRI.leaveSSA(); - MRI.invalidateLiveness(); for (unsigned PReg = WebAssembly::NoRegister + 1; PReg < WebAssembly::NUM_TARGET_REGS; ++PReg) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 16e05150c64e..74af4c8873f7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -44,7 +44,7 @@ SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemmove( SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val, - SDValue Size, Align Alignment, bool IsVolatile, + SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const { auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>(); if (!ST.hasBulkMemory()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index b553c8150652..780694980523 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -48,6 +48,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasMutableGlobals = false; bool HasTailCall = false; bool HasReferenceTypes = false; + bool HasExtendedConst = false; /// What processor and OS we're targeting.
Triple TargetTriple; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 482837178f3d..76f036358ae8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -25,11 +25,12 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LowerAtomic.h" +#include "llvm/Transforms/Scalar/LowerAtomicPass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; @@ -56,13 +57,12 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { auto &PR = *PassRegistry::getPassRegistry(); initializeWebAssemblyAddMissingPrototypesPass(PR); initializeWebAssemblyLowerEmscriptenEHSjLjPass(PR); - initializeLowerGlobalDtorsPass(PR); + initializeLowerGlobalDtorsLegacyPassPass(PR); initializeFixFunctionBitcastsPass(PR); initializeOptimizeReturnedPass(PR); initializeWebAssemblyArgumentMovePass(PR); initializeWebAssemblySetP2AlignOperandsPass(PR); initializeWebAssemblyReplacePhysRegsPass(PR); - initializeWebAssemblyPrepareForLiveIntervalsPass(PR); initializeWebAssemblyOptimizeLiveIntervalsPass(PR); initializeWebAssemblyMemIntrinsicResultsPass(PR); initializeWebAssemblyRegStackifyPass(PR); @@ -87,7 +87,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM, const Triple &TT) { - if (!RM.hasValue()) { + if (!RM) { // Default to static relocation model. This should always be more optimal // than PIC since the static linker can determine all global addresses and // assume direct function calls. @@ -203,11 +203,12 @@ public: bool StrippedAtomics = false; bool StrippedTLS = false; - if (!Features[WebAssembly::FeatureAtomics]) + if (!Features[WebAssembly::FeatureAtomics]) { StrippedAtomics = stripAtomics(M); - - if (!Features[WebAssembly::FeatureBulkMemory]) StrippedTLS = stripThreadLocals(M); + } else if (!Features[WebAssembly::FeatureBulkMemory]) { + StrippedTLS |= stripThreadLocals(M); + } if (StrippedAtomics && !StrippedTLS) stripThreadLocals(M); @@ -320,6 +321,7 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override; void addIRPasses() override; + void addISelPrepare() override; bool addInstSelector() override; void addPostRegAlloc() override; bool addGCPasses() override { return false; } @@ -335,7 +337,7 @@ public: } // end anonymous namespace TargetTransformInfo -WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) { +WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); } @@ -407,17 +409,11 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - // Lower atomics and TLS if necessary - addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); - - // This is a no-op if atomics are not used in the module - addPass(createAtomicExpandPass()); - // Add signatures to prototype-less function declarations addPass(createWebAssemblyAddMissingPrototypes()); // Lower .llvm.global_dtors into .llvm_global_ctors with __cxa_atexit calls.
- addPass(createWebAssemblyLowerGlobalDtors()); + addPass(createLowerGlobalDtorsLegacyPass()); // Fix function bitcasts, as WebAssembly requires caller and callee signatures // to match. @@ -455,6 +451,16 @@ void WebAssemblyPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); } +void WebAssemblyPassConfig::addISelPrepare() { + // Lower atomics and TLS if necessary + addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); + + // This is a no-op if atomics are not used in the module + addPass(createAtomicExpandPass()); + + TargetPassConfig::addISelPrepare(); +} + bool WebAssemblyPassConfig::addInstSelector() { (void)TargetPassConfig::addInstSelector(); addPass( @@ -517,9 +523,6 @@ void WebAssemblyPassConfig::addPreEmitPass() { // Preparations and optimizations related to register stackification. if (getOptLevel() != CodeGenOpt::None) { - // LiveIntervals isn't commonly run this late. Re-establish preconditions. - addPass(createWebAssemblyPrepareForLiveIntervals()); - // Depend on LiveIntervals and perform some optimizations on it. addPass(createWebAssemblyOptimizeLiveIntervals()); @@ -588,8 +591,7 @@ yaml::MachineFunctionInfo *WebAssemblyTargetMachine::convertFuncInfoToYAML( bool WebAssemblyTargetMachine::parseMachineFunctionInfo( const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const { - const auto &YamlMFI = - reinterpret_cast<const yaml::WebAssemblyFunctionInfo &>(MFI); + const auto &YamlMFI = static_cast<const yaml::WebAssemblyFunctionInfo &>(MFI); MachineFunction &MF = PFS.MF; MF.getInfo<WebAssemblyFunctionInfo>()->initializeBaseYamlFields(YamlMFI); return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h index 29e968bfe8eb..5d5378f76567 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h @@ -46,7 +46,7 @@ public: return TLOF.get(); } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool usesPhysRegsForValues() const override { return false; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index f1ebcbc6fc51..62f7155e794a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -139,3 +139,7 @@ void WebAssemblyTTIImpl::getUnrollingPreferences( // becomes "fall through" to default value of 2.
UP.BEInsns = 2; } + +bool WebAssemblyTTIImpl::supportsTailCalls() const { + return getST()->hasTailCall(); +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 50036f7f7e98..fde58a9587b6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -74,6 +74,8 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + + bool supportsTailCalls() const; }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index e9ecff3bf514..871b23f80efe 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86IntelInstPrinter.h" #include "MCTargetDesc/X86MCExpr.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86TargetStreamer.h" #include "TargetInfo/X86TargetInfo.h" #include "X86AsmParserCommon.h" @@ -124,12 +125,12 @@ private: bool matchingInlineAsm, unsigned VariantID = 0) { // In Code16GCC mode, match as 32-bit. if (Code16GCC) - SwitchMode(X86::Mode32Bit); + SwitchMode(X86::Is32Bit); unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures, matchingInlineAsm, VariantID); if (Code16GCC) - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); return rv; } @@ -422,16 +423,18 @@ private: }; class IntelExprStateMachine { - IntelExprState State, PrevState; - unsigned BaseReg, IndexReg, TmpReg, Scale; - int64_t Imm; - const MCExpr *Sym; + IntelExprState State = IES_INIT, PrevState = IES_ERROR; + unsigned BaseReg = 0, IndexReg = 0, TmpReg = 0, Scale = 0; + int64_t Imm = 0; + const MCExpr *Sym = nullptr; StringRef SymName; InfixCalculator IC; InlineAsmIdentifierInfo Info; - short BracCount; - bool MemExpr; - bool OffsetOperator; + short BracCount = 0; + bool MemExpr = false; + bool OffsetOperator = false; + bool AttachToOperandIdx = false; + bool IsPIC = false; SMLoc OffsetOperatorLoc; AsmTypeInfo CurType; @@ -446,10 +449,7 @@ private: } public: - IntelExprStateMachine() - : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), - TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0), - MemExpr(false), OffsetOperator(false) {} + IntelExprStateMachine() = default; void addImm(int64_t imm) { Imm += imm; } short getBracCount() const { return BracCount; } @@ -469,9 +469,29 @@ private: bool isValidEndState() const { return State == IES_RBRAC || State == IES_INTEGER; } + + // Is the Intel expression appended after an operand index. + // [OperandIdx][Intel Expression] + // This is necessary for checking if it is an independent + // Intel expression at the back end when parsing inline asm. + void setAppendAfterOperand() { AttachToOperandIdx = true; } + + bool isPIC() const { return IsPIC; } + void setPIC() { IsPIC = true; } + bool hadError() const { return State == IES_ERROR; } const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; } + bool regsUseUpError(StringRef &ErrMsg) { + // This case mostly happens in inline asm, e.g. Arr[BaseReg + IndexReg] + // cannot introduce an additional register in inline asm in PIC model.
+ if (IsPIC && AttachToOperandIdx) + ErrMsg = "Don't use 2 or more regs for mem offset in PIC model!"; + else + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } + void onOr() { IntelExprState CurrState = State; switch (State) { @@ -655,10 +675,8 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -716,10 +734,8 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -777,10 +793,8 @@ private: case IES_MULTIPLY: // Index Register - Scale * Register if (PrevState == IES_INTEGER) { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); State = IES_REGISTER; IndexReg = Reg; // Get the scale and replace the 'Scale * Register' with '0'. @@ -861,10 +875,8 @@ private: State = IES_INTEGER; if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { // Index Register - Register * Scale - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = TmpInt; if (checkScale(Scale, ErrMsg)) @@ -945,7 +957,7 @@ private: BracCount++; return false; } - bool onRBrac() { + bool onRBrac(StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -955,8 +967,10 @@ case IES_OFFSET: case IES_REGISTER: case IES_RPAREN: - if (BracCount-- != 1) + if (BracCount-- != 1) { + ErrMsg = "unexpected bracket encountered"; return true; + } State = IES_RBRAC; if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with @@ -964,7 +978,8 @@ if (!BaseReg) { BaseReg = TmpReg; } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -1089,9 +1104,9 @@ private: std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); bool VerifyAndAdjustOperands(OperandVector &OrigOperands, OperandVector &FinalOperands); - bool ParseOperand(OperandVector &Operands); - bool ParseATTOperand(OperandVector &Operands); - bool ParseIntelOperand(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, StringRef Name); + bool parseATTOperand(OperandVector &Operands); + bool parseIntelOperand(OperandVector &Operands, StringRef Name); bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, InlineAsmIdentifierInfo &Info, SMLoc &End); bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); @@ -1111,6 +1126,8 @@ private: InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator = false); + void tryParseOperandIdx(AsmToken::TokenKind PrevTK, + IntelExprStateMachine &SM); bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, OperandVector &Operands); @@ -1193,19 +1210,19 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return getSTI().getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Is64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this?
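A reduced sketch of what the Mode*Bit → Is*Bit feature bits just shown are doing in SwitchMode: the three mode bits are mutually exclusive, so switching clears the old one and sets the new one (bit positions here are illustrative, not the real feature numbering):

    #include <bitset>

    enum ModeBit { Is64Bit = 0, Is32Bit = 1, Is16Bit = 2 };

    void switchMode(std::bitset<3> &Modes, ModeBit NewMode) {
      Modes.reset();      // drop whichever mode bit was set
      Modes.set(NewMode); // leave exactly the requested mode
    }
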
- return getSTI().getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Is32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return getSTI().getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Is16Bit]; } void SwitchMode(unsigned mode) { MCSubtargetInfo &STI = copySTI(); - FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); + FeatureBitset AllModes({X86::Is64Bit, X86::Is32Bit, X86::Is16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; FeatureBitset FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); @@ -1716,11 +1733,11 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; } -bool X86AsmParser::ParseOperand(OperandVector &Operands) { +bool X86AsmParser::parseOperand(OperandVector &Operands, StringRef Name) { if (isParsingIntelSyntax()) - return ParseIntelOperand(Operands); + return parseIntelOperand(Operands, Name); - return ParseATTOperand(Operands); + return parseATTOperand(Operands); } bool X86AsmParser::CreateMemForMSInlineAsm( @@ -1759,8 +1776,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm( // registers in a memory expression, and though inaccessible via rip/eip. if (IsGlobalLV && (BaseReg || IndexReg)) { Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, - End, Size, Identifier, Decl, - FrontendSize)); + End, Size, Identifier, Decl, 0, + BaseReg && IndexReg)); return false; } // Otherwise, we set the base register to a non-zero value @@ -1841,11 +1858,25 @@ bool X86AsmParser::ParseMasmNamedOperator(StringRef Name, return true; } +// Check if the current Intel expression is appended after an operand. +// Like: [Operand][Intel Expression] +void X86AsmParser::tryParseOperandIdx(AsmToken::TokenKind PrevTK, + IntelExprStateMachine &SM) { + if (PrevTK != AsmToken::RBrac) + return; + + SM.setAppendAfterOperand(); +} + bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); StringRef ErrMsg; AsmToken::TokenKind PrevTK = AsmToken::Error; + + if (getContext().getObjectFileInfo()->isPositionIndependent()) + SM.setPIC(); + bool Done = false; while (!Done) { // Get a fresh reference on each loop iteration in case the previous @@ -2123,10 +2154,12 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::LBrac: if (SM.onLBrac()) return Error(Tok.getLoc(), "unexpected bracket encountered"); + tryParseOperandIdx(PrevTK, SM); break; case AsmToken::RBrac: - if (SM.onRBrac()) - return Error(Tok.getLoc(), "unexpected bracket encountered"); + if (SM.onRBrac(ErrMsg)) { + return Error(Tok.getLoc(), ErrMsg); + } break; case AsmToken::LParen: SM.onLParen(); break; case AsmToken::RParen: SM.onRParen(); break; @@ -2477,7 +2510,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } -bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { +bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -2552,6 +2585,8 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); + if (IndexReg && BaseReg == X86::RIP) + BaseReg = 0; unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -2597,25 +2632,49 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands)
{ // When parsing x64 MS-style assembly, all non-absolute references to a named // variable default to RIP-relative. - if (Parser.isParsingMasm() && is64BitMode() && SM.getElementSize() > 0) { - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size, - /*DefaultBaseReg=*/X86::RIP)); - return false; + unsigned DefaultBaseReg = X86::NoRegister; + bool MaybeDirectBranchDest = true; + + if (Parser.isParsingMasm()) { + bool IsUnconditionalBranch = + Name.equals_insensitive("jmp") || Name.equals_insensitive("call"); + if (is64BitMode() && SM.getElementSize() > 0) { + DefaultBaseReg = X86::RIP; + } + if (IsUnconditionalBranch) { + if (PtrInOperand) { + MaybeDirectBranchDest = false; + if (is64BitMode()) + DefaultBaseReg = X86::RIP; + } else if (!BaseReg && !IndexReg && Disp && + Disp->getKind() == MCExpr::SymbolRef) { + if (is64BitMode()) { + if (SM.getSize() == 8) { + MaybeDirectBranchDest = false; + DefaultBaseReg = X86::RIP; + } + } else { + if (SM.getSize() == 4 || SM.getSize() == 2) + MaybeDirectBranchDest = false; + } + } + } } - if ((BaseReg || IndexReg || RegNo)) - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size)); + if ((BaseReg || IndexReg || RegNo || DefaultBaseReg != X86::NoRegister)) - Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, + Size, DefaultBaseReg, /*SymName=*/StringRef(), /*OpDecl=*/nullptr, + /*FrontendSize=*/0, /*UseUpRegs=*/false, MaybeDirectBranchDest)); else - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size)); + Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), Disp, Start, End, Size, /*SymName=*/StringRef(), + /*OpDecl=*/nullptr, /*FrontendSize=*/0, /*UseUpRegs=*/false, + MaybeDirectBranchDest)); return false; } -bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { +bool X86AsmParser::parseATTOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { case AsmToken::Dollar: { @@ -2722,7 +2781,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z, if (!getLexer().is(AsmToken::RCurly)) return Error(getLexer().getLoc(), "Expected } at this point"); Parser.Lex(); // Eat '}' - // Assign Z with the {z} mark opernad + // Assign Z with the {z} mark operand Z = X86Operand::CreateToken("{z}", StartLoc); return false; } @@ -3346,7 +3405,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name = Next; PatchedName = Name; - ForcedDataPrefix = X86::Mode32Bit; + ForcedDataPrefix = X86::Is32Bit; IsPrefix = false; } } @@ -3371,7 +3430,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Read the operands.
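The MaybeDirectBranchDest flag threaded through above encodes a MASM rule for unconditional branches; restated as a small standalone predicate (simplified, and the helper name is mine; sizes in bytes):

    // Returns true when a MASM memory operand on jmp/call must be treated as
    // an indirect branch target rather than a possible direct destination.
    bool mustBeIndirectBranchDest(bool PtrInOperand, bool Is64Bit,
                                  unsigned SizeInBytes) {
      if (PtrInOperand)  // "jmp dword ptr Sym" is always indirect
        return true;
      if (Is64Bit)       // a qword symbol reference is a load, not a label
        return SizeInBytes == 8;
      return SizeInBytes == 4 || SizeInBytes == 2;
    }
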
while (true) { - if (ParseOperand(Operands)) + if (parseOperand(Operands, Name)) return true; if (HandleAVX512Operand(Operands)) return true; @@ -3774,84 +3833,27 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { } bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { + using namespace X86; const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - - switch (Inst.getOpcode()) { - case X86::VGATHERDPDYrm: - case X86::VGATHERDPDrm: - case X86::VGATHERDPSYrm: - case X86::VGATHERDPSrm: - case X86::VGATHERQPDYrm: - case X86::VGATHERQPDrm: - case X86::VGATHERQPSYrm: - case X86::VGATHERQPSrm: - case X86::VPGATHERDDYrm: - case X86::VPGATHERDDrm: - case X86::VPGATHERDQYrm: - case X86::VPGATHERDQrm: - case X86::VPGATHERQDYrm: - case X86::VPGATHERQDrm: - case X86::VPGATHERQQYrm: - case X86::VPGATHERQQrm: { - unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); - unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg()); - unsigned Index = - MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg()); - if (Dest == Mask || Dest == Index || Mask == Index) - return Warning(Ops[0]->getStartLoc(), "mask, index, and destination " - "registers should be distinct"); - break; - } - case X86::VGATHERDPDZ128rm: - case X86::VGATHERDPDZ256rm: - case X86::VGATHERDPDZrm: - case X86::VGATHERDPSZ128rm: - case X86::VGATHERDPSZ256rm: - case X86::VGATHERDPSZrm: - case X86::VGATHERQPDZ128rm: - case X86::VGATHERQPDZ256rm: - case X86::VGATHERQPDZrm: - case X86::VGATHERQPSZ128rm: - case X86::VGATHERQPSZ256rm: - case X86::VGATHERQPSZrm: - case X86::VPGATHERDDZ128rm: - case X86::VPGATHERDDZ256rm: - case X86::VPGATHERDDZrm: - case X86::VPGATHERDQZ128rm: - case X86::VPGATHERDQZ256rm: - case X86::VPGATHERDQZrm: - case X86::VPGATHERQDZ128rm: - case X86::VPGATHERQDZ256rm: - case X86::VPGATHERQDZrm: - case X86::VPGATHERQQZ128rm: - case X86::VPGATHERQQZ256rm: - case X86::VPGATHERQQZrm: { - unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); - unsigned Index = - MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg()); - if (Dest == Index) - return Warning(Ops[0]->getStartLoc(), "index and destination registers " - "should be distinct"); - break; - } - case X86::V4FMADDPSrm: - case X86::V4FMADDPSrmk: - case X86::V4FMADDPSrmkz: - case X86::V4FMADDSSrm: - case X86::V4FMADDSSrmk: - case X86::V4FMADDSSrmkz: - case X86::V4FNMADDPSrm: - case X86::V4FNMADDPSrmk: - case X86::V4FNMADDPSrmkz: - case X86::V4FNMADDSSrm: - case X86::V4FNMADDSSrmk: - case X86::V4FNMADDSSrmkz: - case X86::VP4DPWSSDSrm: - case X86::VP4DPWSSDSrmk: - case X86::VP4DPWSSDSrmkz: - case X86::VP4DPWSSDrm: - case X86::VP4DPWSSDrmk: - case X86::VP4DPWSSDrmkz: { + unsigned Opcode = Inst.getOpcode(); + uint64_t TSFlags = MII.get(Opcode).TSFlags; + if (isVFCMADDCPH(Opcode) || isVFCMADDCSH(Opcode) || isVFMADDCPH(Opcode) || + isVFMADDCSH(Opcode)) { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 2; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), "Destination register should be " + "distinct from source registers"); + } else if (isVFCMULCPH(Opcode) || isVFCMULCSH(Opcode) || isVFMULCPH(Opcode) || + isVFMULCSH(Opcode)) { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 1; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), 
"Destination register should be " + "distinct from source registers"); + } else if (isV4FMADDPS(Opcode) || isV4FMADDSS(Opcode) || + isV4FNMADDPS(Opcode) || isV4FNMADDSS(Opcode) || + isVP4DPWSSDS(Opcode) || isVP4DPWSSD(Opcode)) { unsigned Src2 = Inst.getOperand(Inst.getNumOperands() - X86::AddrNumOperands - 1).getReg(); unsigned Src2Enc = MRI->getEncodingValue(Src2); @@ -3865,186 +3867,34 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { RegName.take_front(3) + Twine(GroupEnd) + "' source group"); } - break; - } - case X86::VFCMADDCPHZ128m: - case X86::VFCMADDCPHZ256m: - case X86::VFCMADDCPHZm: - case X86::VFCMADDCPHZ128mb: - case X86::VFCMADDCPHZ256mb: - case X86::VFCMADDCPHZmb: - case X86::VFCMADDCPHZ128mbk: - case X86::VFCMADDCPHZ256mbk: - case X86::VFCMADDCPHZmbk: - case X86::VFCMADDCPHZ128mbkz: - case X86::VFCMADDCPHZ256mbkz: - case X86::VFCMADDCPHZmbkz: - case X86::VFCMADDCPHZ128mk: - case X86::VFCMADDCPHZ256mk: - case X86::VFCMADDCPHZmk: - case X86::VFCMADDCPHZ128mkz: - case X86::VFCMADDCPHZ256mkz: - case X86::VFCMADDCPHZmkz: - case X86::VFCMADDCPHZ128r: - case X86::VFCMADDCPHZ256r: - case X86::VFCMADDCPHZr: - case X86::VFCMADDCPHZ128rk: - case X86::VFCMADDCPHZ256rk: - case X86::VFCMADDCPHZrk: - case X86::VFCMADDCPHZ128rkz: - case X86::VFCMADDCPHZ256rkz: - case X86::VFCMADDCPHZrkz: - case X86::VFCMADDCPHZrb: - case X86::VFCMADDCPHZrbk: - case X86::VFCMADDCPHZrbkz: - case X86::VFCMADDCSHZm: - case X86::VFCMADDCSHZmk: - case X86::VFCMADDCSHZmkz: - case X86::VFCMADDCSHZr: - case X86::VFCMADDCSHZrb: - case X86::VFCMADDCSHZrbk: - case X86::VFCMADDCSHZrbkz: - case X86::VFCMADDCSHZrk: - case X86::VFCMADDCSHZrkz: - case X86::VFMADDCPHZ128m: - case X86::VFMADDCPHZ256m: - case X86::VFMADDCPHZm: - case X86::VFMADDCPHZ128mb: - case X86::VFMADDCPHZ256mb: - case X86::VFMADDCPHZmb: - case X86::VFMADDCPHZ128mbk: - case X86::VFMADDCPHZ256mbk: - case X86::VFMADDCPHZmbk: - case X86::VFMADDCPHZ128mbkz: - case X86::VFMADDCPHZ256mbkz: - case X86::VFMADDCPHZmbkz: - case X86::VFMADDCPHZ128mk: - case X86::VFMADDCPHZ256mk: - case X86::VFMADDCPHZmk: - case X86::VFMADDCPHZ128mkz: - case X86::VFMADDCPHZ256mkz: - case X86::VFMADDCPHZmkz: - case X86::VFMADDCPHZ128r: - case X86::VFMADDCPHZ256r: - case X86::VFMADDCPHZr: - case X86::VFMADDCPHZ128rk: - case X86::VFMADDCPHZ256rk: - case X86::VFMADDCPHZrk: - case X86::VFMADDCPHZ128rkz: - case X86::VFMADDCPHZ256rkz: - case X86::VFMADDCPHZrkz: - case X86::VFMADDCPHZrb: - case X86::VFMADDCPHZrbk: - case X86::VFMADDCPHZrbkz: - case X86::VFMADDCSHZm: - case X86::VFMADDCSHZmk: - case X86::VFMADDCSHZmkz: - case X86::VFMADDCSHZr: - case X86::VFMADDCSHZrb: - case X86::VFMADDCSHZrbk: - case X86::VFMADDCSHZrbkz: - case X86::VFMADDCSHZrk: - case X86::VFMADDCSHZrkz: { - unsigned Dest = Inst.getOperand(0).getReg(); - for (unsigned i = 2; i < Inst.getNumOperands(); i++) - if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) - return Warning(Ops[0]->getStartLoc(), "Destination register should be " - "distinct from source registers"); - break; - } - case X86::VFCMULCPHZ128rm: - case X86::VFCMULCPHZ256rm: - case X86::VFCMULCPHZrm: - case X86::VFCMULCPHZ128rmb: - case X86::VFCMULCPHZ256rmb: - case X86::VFCMULCPHZrmb: - case X86::VFCMULCPHZ128rmbk: - case X86::VFCMULCPHZ256rmbk: - case X86::VFCMULCPHZrmbk: - case X86::VFCMULCPHZ128rmbkz: - case X86::VFCMULCPHZ256rmbkz: - case X86::VFCMULCPHZrmbkz: - case X86::VFCMULCPHZ128rmk: - case X86::VFCMULCPHZ256rmk: - case X86::VFCMULCPHZrmk: - case X86::VFCMULCPHZ128rmkz: - case 
X86::VFCMULCPHZ256rmkz: - case X86::VFCMULCPHZrmkz: - case X86::VFCMULCPHZ128rr: - case X86::VFCMULCPHZ256rr: - case X86::VFCMULCPHZrr: - case X86::VFCMULCPHZ128rrk: - case X86::VFCMULCPHZ256rrk: - case X86::VFCMULCPHZrrk: - case X86::VFCMULCPHZ128rrkz: - case X86::VFCMULCPHZ256rrkz: - case X86::VFCMULCPHZrrkz: - case X86::VFCMULCPHZrrb: - case X86::VFCMULCPHZrrbk: - case X86::VFCMULCPHZrrbkz: - case X86::VFCMULCSHZrm: - case X86::VFCMULCSHZrmk: - case X86::VFCMULCSHZrmkz: - case X86::VFCMULCSHZrr: - case X86::VFCMULCSHZrrb: - case X86::VFCMULCSHZrrbk: - case X86::VFCMULCSHZrrbkz: - case X86::VFCMULCSHZrrk: - case X86::VFCMULCSHZrrkz: - case X86::VFMULCPHZ128rm: - case X86::VFMULCPHZ256rm: - case X86::VFMULCPHZrm: - case X86::VFMULCPHZ128rmb: - case X86::VFMULCPHZ256rmb: - case X86::VFMULCPHZrmb: - case X86::VFMULCPHZ128rmbk: - case X86::VFMULCPHZ256rmbk: - case X86::VFMULCPHZrmbk: - case X86::VFMULCPHZ128rmbkz: - case X86::VFMULCPHZ256rmbkz: - case X86::VFMULCPHZrmbkz: - case X86::VFMULCPHZ128rmk: - case X86::VFMULCPHZ256rmk: - case X86::VFMULCPHZrmk: - case X86::VFMULCPHZ128rmkz: - case X86::VFMULCPHZ256rmkz: - case X86::VFMULCPHZrmkz: - case X86::VFMULCPHZ128rr: - case X86::VFMULCPHZ256rr: - case X86::VFMULCPHZrr: - case X86::VFMULCPHZ128rrk: - case X86::VFMULCPHZ256rrk: - case X86::VFMULCPHZrrk: - case X86::VFMULCPHZ128rrkz: - case X86::VFMULCPHZ256rrkz: - case X86::VFMULCPHZrrkz: - case X86::VFMULCPHZrrb: - case X86::VFMULCPHZrrbk: - case X86::VFMULCPHZrrbkz: - case X86::VFMULCSHZrm: - case X86::VFMULCSHZrmk: - case X86::VFMULCSHZrmkz: - case X86::VFMULCSHZrr: - case X86::VFMULCSHZrrb: - case X86::VFMULCSHZrrbk: - case X86::VFMULCSHZrrbkz: - case X86::VFMULCSHZrrk: - case X86::VFMULCSHZrrkz: { - unsigned Dest = Inst.getOperand(0).getReg(); - for (unsigned i = 1; i < Inst.getNumOperands(); i++) - if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) - return Warning(Ops[0]->getStartLoc(), "Destination register should be " - "distinct from source registers"); - break; - } + } else if (isVGATHERDPD(Opcode) || isVGATHERDPS(Opcode) || + isVGATHERQPD(Opcode) || isVGATHERQPS(Opcode) || + isVPGATHERDD(Opcode) || isVPGATHERDQ(Opcode) || + isVPGATHERQD(Opcode) || isVPGATHERQQ(Opcode)) { + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; + if (HasEVEX) { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Index = MRI->getEncodingValue( + Inst.getOperand(4 + X86::AddrIndexReg).getReg()); + if (Dest == Index) + return Warning(Ops[0]->getStartLoc(), "index and destination registers " + "should be distinct"); + } else { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + unsigned Index = MRI->getEncodingValue( + Inst.getOperand(3 + X86::AddrIndexReg).getReg()); + if (Dest == Mask || Dest == Index || Mask == Index) + return Warning(Ops[0]->getStartLoc(), "mask, index, and destination " + "registers should be distinct"); + } } - const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. 
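The table-driven rewrite above keeps the old per-opcode constraints intact; the gather check itself, restated as a self-contained function (assuming register encodings compare as the small integers MRI->getEncodingValue returns):

    #include <optional>
    #include <string>

    std::optional<std::string> checkGatherRegs(unsigned DestEnc, unsigned MaskEnc,
                                               unsigned IndexEnc, bool HasEVEX) {
      if (HasEVEX) {
        // EVEX gathers use a k-register mask, so only dest vs. index can clash.
        if (DestEnc == IndexEnc)
          return "index and destination registers should be distinct";
      } else if (DestEnc == MaskEnc || DestEnc == IndexEnc || MaskEnc == IndexEnc) {
        return "mask, index, and destination registers should be distinct";
      }
      return std::nullopt;
    }
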
- if ((MCID.TSFlags & X86II::EncodingMask) == 0) { + if ((TSFlags & X86II::EncodingMask) == 0) { MCPhysReg HReg = X86::NoRegister; - bool UsesRex = MCID.TSFlags & X86II::REX_W; + bool UsesRex = TSFlags & X86II::REX_W; unsigned NumOps = Inst.getNumOperands(); for (unsigned i = 0; i != NumOps; ++i) { const MCOperand &MO = Inst.getOperand(i); @@ -4313,15 +4163,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. - if (ForcedDataPrefix == X86::Mode32Bit) - SwitchMode(X86::Mode32Bit); + if (ForcedDataPrefix == X86::Is32Bit) + SwitchMode(X86::Is32Bit); // First, try a direct match. FeatureBitset MissingFeatures; unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, isParsingIntelSyntax()); - if (ForcedDataPrefix == X86::Mode32Bit) { - SwitchMode(X86::Mode16Bit); + if (ForcedDataPrefix == X86::Is32Bit) { + SwitchMode(X86::Is16Bit); ForcedDataPrefix = 0; } switch (OriginalError) { @@ -4840,8 +4690,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) { if (getParser().parseAbsoluteExpression(Control)) return true; } - if (getParser().parseToken(AsmToken::EndOfStatement, - "unexpected token in '.nops' directive")) + if (getParser().parseEOL()) return true; if (NumBytes <= 0) { @@ -4863,7 +4712,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) { /// parseDirectiveEven /// ::= .even bool X86AsmParser::parseDirectiveEven(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return false; const MCSection *Section = getStreamer().getCurrentSectionOnly(); @@ -4871,7 +4720,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { getStreamer().initSections(false, getSTI()); Section = getStreamer().getCurrentSectionOnly(); } - if (Section->UseCodeAlign()) + if (Section->useCodeAlign()) getStreamer().emitCodeAlignment(2, &getSTI(), 0); else getStreamer().emitValueToAlignment(2, 0, 1, 0); @@ -4886,7 +4735,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { if (IDVal == ".code16") { Parser.Lex(); if (!is16BitMode()) { - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code16gcc") { @@ -4894,19 +4743,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { Parser.Lex(); Code16GCC = true; if (!is16BitMode()) { - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code32") { Parser.Lex(); if (!is32BitMode()) { - SwitchMode(X86::Mode32Bit); + SwitchMode(X86::Is32Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code32); } } else if (IDVal == ".code64") { Parser.Lex(); if (!is64BitMode()) { - SwitchMode(X86::Mode64Bit); + SwitchMode(X86::Is64Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code64); } } else { @@ -5035,7 +4884,7 @@ bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFIPushReg(Reg, Loc); + getStreamer().emitWinCFIPushReg(Reg, Loc); return false; } @@ -5055,7 +4904,7 @@ bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); + getStreamer().emitWinCFISetFrame(Reg, Off, Loc); return false; } @@ -5075,7 +4924,7 @@ bool 
X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); + getStreamer().emitWinCFISaveReg(Reg, Off, Loc); return false; } @@ -5095,7 +4944,7 @@ bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); + getStreamer().emitWinCFISaveXMM(Reg, Off, Loc); return false; } @@ -5116,7 +4965,7 @@ bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFIPushFrame(Code, Loc); + getStreamer().emitWinCFIPushFrame(Code, Loc); return false; } diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 67b1244708a8..075b800f9e20 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -17,6 +17,8 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include <cassert> @@ -35,6 +37,10 @@ struct X86Operand final : public MCParsedAsmOperand { void *OpDecl; bool AddressOf; + /// This is used for inline asm which may specify base reg and index reg for + /// MemOp, e.g. ARR[eax + ecx*4], so no extra reg can be used for MemOp. + bool UseUpRegs = false; + struct TokOp { const char *Data; unsigned Length; @@ -66,6 +72,11 @@ struct X86Operand final : public MCParsedAsmOperand { /// If the memory operand is unsized and there are multiple instruction /// matches, prefer the one with this size. unsigned FrontendSize; + + /// If false, then this operand must be a memory operand for an indirect + /// branch instruction. Otherwise, this operand may belong to either a + /// direct or indirect branch instruction. + bool MaybeDirectBranchDest; }; union { @@ -203,6 +214,10 @@ struct X86Operand final : public MCParsedAsmOperand { assert(Kind == Memory && "Invalid access!"); return Mem.FrontendSize; } + bool isMaybeDirectBranchDest() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.MaybeDirectBranchDest; + } bool isToken() const override {return Kind == Token; } @@ -285,12 +300,6 @@ struct X86Operand final : public MCParsedAsmOperand { bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; } - bool isMemPlaceholder(const MCInstrDesc &Desc) const override { - // Only MS InlineAsm uses global variables with registers rather than - // rip/eip.
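The two booleans added to X86Operand above are easiest to read in isolation (illustrative summary, not the real class layout):

    struct MemOperandFlagsSketch {
      // MS inline asm such as "mov eax, ARR[eax + ecx*4]" already fixes both
      // base and index, so matching may not consume another register for it.
      bool UseUpRegs = false;
      // False only when the operand must be an indirect branch target (e.g.
      // "jmp dword ptr Sym"); true when "jmp Sym" may still be direct.
      bool MaybeDirectBranchDest = true;
    };
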
- return isMem() && !Mem.DefaultBaseReg && Mem.FrontendSize; - } - bool needAddressOf() const override { return AddressOf; } bool isMem() const override { return Kind == Memory; } @@ -374,8 +383,9 @@ struct X86Operand final : public MCParsedAsmOperand { bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1; + !getMemIndexReg() && getMemScale() == 1 && isMaybeDirectBranchDest(); } + bool isAVX512RC() const{ return isImm(); } @@ -384,6 +394,8 @@ struct X86Operand final : public MCParsedAsmOperand { return isAbsMem() && Mem.ModeSize == 16; } + bool isMemUseUpRegs() const override { return UseUpRegs; } + bool isSrcIdx() const { return !getMemIndexReg() && getMemScale() == 1 && (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || @@ -669,7 +681,8 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr, unsigned FrontendSize = 0) { + void *OpDecl = nullptr, unsigned FrontendSize = 0, + bool UseUpRegs = false, bool MaybeDirectBranchDest = true) { auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -680,6 +693,8 @@ struct X86Operand final : public MCParsedAsmOperand { Res->Mem.Size = Size; Res->Mem.ModeSize = ModeSize; Res->Mem.FrontendSize = FrontendSize; + Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest; + Res->UseUpRegs = UseUpRegs; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; @@ -693,7 +708,8 @@ struct X86Operand final : public MCParsedAsmOperand { SMLoc EndLoc, unsigned Size = 0, unsigned DefaultBaseReg = X86::NoRegister, StringRef SymName = StringRef(), void *OpDecl = nullptr, - unsigned FrontendSize = 0) { + unsigned FrontendSize = 0, bool UseUpRegs = false, + bool MaybeDirectBranchDest = true) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) && @@ -712,6 +728,8 @@ struct X86Operand final : public MCParsedAsmOperand { Res->Mem.Size = Size; Res->Mem.ModeSize = ModeSize; Res->Mem.FrontendSize = FrontendSize; + Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest; + Res->UseUpRegs = UseUpRegs; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 908eb6d1fab1..1da6bf86397e 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -493,16 +493,15 @@ static int readPrefixes(struct InternalInstruction *insn) { insn->displacementSize = (insn->hasAdSize ? 2 : 4); insn->immediateSize = (insn->hasOpSize ? 2 : 4); } else if (insn->mode == MODE_64BIT) { + insn->displacementSize = 4; if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { insn->registerSize = 8; insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = 4; insn->immediateSize = 4; insn->hasOpSize = false; } else { insn->registerSize = (insn->hasOpSize ? 2 : 4); insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = (insn->hasOpSize ? 2 : 4); insn->immediateSize = (insn->hasOpSize ?
2 : 4); } } @@ -1722,13 +1721,13 @@ X86GenericDisassembler::X86GenericDisassembler( std::unique_ptr<const MCInstrInfo> MII) : MCDisassembler(STI, Ctx), MII(std::move(MII)) { const FeatureBitset &FB = STI.getFeatureBits(); - if (FB[X86::Mode16Bit]) { + if (FB[X86::Is16Bit]) { fMode = MODE_16BIT; return; - } else if (FB[X86::Mode32Bit]) { + } else if (FB[X86::Is32Bit]) { fMode = MODE_32BIT; return; - } else if (FB[X86::Mode64Bit]) { + } else if (FB[X86::Is64Bit]) { fMode = MODE_64BIT; return; } @@ -1801,46 +1800,6 @@ static void translateRegister(MCInst &mcInst, Reg reg) { mcInst.addOperand(MCOperand::createReg(llvmRegnum)); } -/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the -/// immediate Value in the MCInst. -/// -/// @param Value - The immediate Value, has had any PC adjustment made by -/// the caller. -/// @param isBranch - If the instruction is a branch instruction -/// @param Address - The starting address of the instruction -/// @param Offset - The byte offset to this immediate in the instruction -/// @param Width - The byte width of this immediate in the instruction -/// -/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was -/// called then that function is called to get any symbolic information for the -/// immediate in the instruction using the Address, Offset and Width. If that -/// returns non-zero then the symbolic information it returns is used to create -/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo() -/// returns zero and isBranch is true then a symbol look up for immediate Value -/// is done and if a symbol is found an MCExpr is created with that, else -/// an MCExpr with the immediate Value is created. This function returns true -/// if it adds an operand to the MCInst and false otherwise. -static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, - uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Dis) { - return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, - Offset, Width); -} - -/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being -/// referenced by a load instruction with the base register that is the rip. -/// These can often be addresses in a literal pool. The Address of the -/// instruction and its immediate Value are used to determine the address -/// being referenced in the literal pool entry. The SymbolLookUp call back will -/// return a pointer to a literal 'C' string if the referenced address is an -/// address into a section with 'C' string literals.
-static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, - const void *Decoder) { - const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder); - Dis->tryAddingPcLoadReferenceComment(Value, Address); -} - static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { 0, // SEG_OVERRIDE_NONE X86::CS, @@ -1914,8 +1873,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, uint64_t pcrel = 0; if (type == TYPE_REL) { isBranch = true; - pcrel = insn.startLocation + - insn.immediateOffset + insn.immediateSize; + pcrel = insn.startLocation + insn.length; switch (operand.encoding) { default: break; @@ -1990,9 +1948,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, break; } - if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, - insn.immediateOffset, insn.immediateSize, - mcInst, Dis)) + if (!Dis->tryAddingSymbolicOperand( + mcInst, immediate + pcrel, insn.startLocation, isBranch, + insn.immediateOffset, insn.immediateSize, insn.length)) mcInst.addOperand(MCOperand::createImm(immediate)); if (type == TYPE_MOFFS) { @@ -2129,11 +2087,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; } if (insn.mode == MODE_64BIT){ - pcrel = insn.startLocation + - insn.displacementOffset + insn.displacementSize; - tryAddingPcLoadReferenceComment(insn.startLocation + - insn.displacementOffset, - insn.displacement + pcrel, Dis); + pcrel = insn.startLocation + insn.length; + Dis->tryAddingPcLoadReferenceComment(insn.displacement + pcrel, + insn.startLocation + + insn.displacementOffset); // Section 2.2.1.6 baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP : X86::RIP); @@ -2193,9 +2150,13 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); + + const uint8_t dispSize = + (insn.eaDisplacement == EA_DISP_NONE) ? 0 : insn.displacementSize; + + if (!Dis->tryAddingSymbolicOperand( + mcInst, insn.displacement + pcrel, insn.startLocation, false, + insn.displacementOffset, dispSize, insn.length)) mcInst.addOperand(displacement); mcInst.addOperand(segmentReg); return false; diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 24d26751f0a1..61e1b6b27a85 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -35,7 +35,7 @@ public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~X86InstrPostProcess() {} + ~X86InstrPostProcess() = default; void postProcessInstruction(std::unique_ptr &Inst, const MCInst &MCI) override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index baacf2f46183..6fd3db4515ec 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -46,7 +46,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (CommentStream) HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); - printInstFlags(MI, OS); + printInstFlags(MI, OS, STI); // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call".
@@ -55,7 +55,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, // InstrInfo.td as soon as Requires clause is supported properly // for InstAlias. if (MI->getOpcode() == X86::CALLpcrel32 && - (STI.getFeatureBits()[X86::Mode64Bit])) { + (STI.getFeatureBits()[X86::Is64Bit])) { OS << "\tcallq\t"; printPCRelImm(MI, Address, 0, OS); } @@ -65,8 +65,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, // 0x66 to be interpreted as "data16" by the asm printer. // Thus we add an adjustment here in order to print the "right" instruction. else if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { - OS << "\tdata32"; + STI.getFeatureBits()[X86::Is16Bit]) { + OS << "\tdata32"; } // Try to print any aliases first. else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3df48b466d07..2d92b8d5b574 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -8,6 +8,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86InstrRelaxTables.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" @@ -222,87 +223,7 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) { static unsigned getRelaxedOpcodeArith(const MCInst &Inst) { unsigned Op = Inst.getOpcode(); - switch (Op) { - default: - return Op; - - // IMUL - case X86::IMUL16rri8: return X86::IMUL16rri; - case X86::IMUL16rmi8: return X86::IMUL16rmi; - case X86::IMUL32rri8: return X86::IMUL32rri; - case X86::IMUL32rmi8: return X86::IMUL32rmi; - case X86::IMUL64rri8: return X86::IMUL64rri32; - case X86::IMUL64rmi8: return X86::IMUL64rmi32; - - // AND - case X86::AND16ri8: return X86::AND16ri; - case X86::AND16mi8: return X86::AND16mi; - case X86::AND32ri8: return X86::AND32ri; - case X86::AND32mi8: return X86::AND32mi; - case X86::AND64ri8: return X86::AND64ri32; - case X86::AND64mi8: return X86::AND64mi32; - - // OR - case X86::OR16ri8: return X86::OR16ri; - case X86::OR16mi8: return X86::OR16mi; - case X86::OR32ri8: return X86::OR32ri; - case X86::OR32mi8: return X86::OR32mi; - case X86::OR64ri8: return X86::OR64ri32; - case X86::OR64mi8: return X86::OR64mi32; - - // XOR - case X86::XOR16ri8: return X86::XOR16ri; - case X86::XOR16mi8: return X86::XOR16mi; - case X86::XOR32ri8: return X86::XOR32ri; - case X86::XOR32mi8: return X86::XOR32mi; - case X86::XOR64ri8: return X86::XOR64ri32; - case X86::XOR64mi8: return X86::XOR64mi32; - - // ADD - case X86::ADD16ri8: return X86::ADD16ri; - case X86::ADD16mi8: return X86::ADD16mi; - case X86::ADD32ri8: return X86::ADD32ri; - case X86::ADD32mi8: return X86::ADD32mi; - case X86::ADD64ri8: return X86::ADD64ri32; - case X86::ADD64mi8: return X86::ADD64mi32; - - // ADC - case X86::ADC16ri8: return X86::ADC16ri; - case X86::ADC16mi8: return X86::ADC16mi; - case X86::ADC32ri8: return X86::ADC32ri; - case X86::ADC32mi8: return X86::ADC32mi; - case X86::ADC64ri8: return X86::ADC64ri32; - case X86::ADC64mi8: return X86::ADC64mi32; - - // SUB - case X86::SUB16ri8: return X86::SUB16ri; - case X86::SUB16mi8: return X86::SUB16mi; - case X86::SUB32ri8: return X86::SUB32ri; - case X86::SUB32mi8: return X86::SUB32mi; - case X86::SUB64ri8: return X86::SUB64ri32; - case X86::SUB64mi8: return X86::SUB64mi32; - - // SBB - 
case X86::SBB16ri8: return X86::SBB16ri; - case X86::SBB16mi8: return X86::SBB16mi; - case X86::SBB32ri8: return X86::SBB32ri; - case X86::SBB32mi8: return X86::SBB32mi; - case X86::SBB64ri8: return X86::SBB64ri32; - case X86::SBB64mi8: return X86::SBB64mi32; - - // CMP - case X86::CMP16ri8: return X86::CMP16ri; - case X86::CMP16mi8: return X86::CMP16mi; - case X86::CMP32ri8: return X86::CMP32ri; - case X86::CMP32mi8: return X86::CMP32mi; - case X86::CMP64ri8: return X86::CMP64ri32; - case X86::CMP64mi8: return X86::CMP64mi32; - - // PUSH - case X86::PUSH32i8: return X86::PUSHi32; - case X86::PUSH16i8: return X86::PUSHi16; - case X86::PUSH64i8: return X86::PUSH64i32; - } + return X86::getRelaxedOpcodeArith(Op); } static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) { @@ -372,7 +293,7 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst, /// - If the instruction has a ESP/EBP base register, use SS. /// - Otherwise use DS. uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { - assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) && + assert((STI.hasFeature(X86::Is32Bit) || STI.hasFeature(X86::Is64Bit)) && "Prefixes can be added only in 32-bit or 64-bit mode."); const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); uint64_t TSFlags = Desc.TSFlags; @@ -413,7 +334,7 @@ uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { if (SegmentReg != 0) return X86::getSegmentOverridePrefixForReg(SegmentReg); - if (STI.hasFeature(X86::Mode64Bit)) + if (STI.hasFeature(X86::Is64Bit)) return X86::CS_Encoding; if (MemoryOperand >= 0) { @@ -572,7 +493,7 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { return false; // Branches only need to be aligned in 32-bit or 64-bit mode. - if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit))) + if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit))) return false; return true; @@ -834,7 +755,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, void X86AsmBackend::relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const { // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. - bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit]; unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode); if (RelaxedOp == Inst.getOpcode()) { @@ -853,7 +774,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst, static bool isFullyRelaxed(const MCRelaxableFragment &RF) { auto &Inst = RF.getInst(); auto &STI = *RF.getSubtargetInfo(); - bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit]; return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode(); } @@ -1077,9 +998,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, } unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const { - if (STI.hasFeature(X86::Mode16Bit)) + if (STI.hasFeature(X86::Is16Bit)) return 4; - if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Is64Bit)) return 1; if (STI.getFeatureBits()[X86::TuningFast7ByteNOP]) return 7; @@ -1134,7 +1055,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, }; const char(*Nops)[11] = - STI->getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit; + STI->getFeatureBits()[X86::Is16Bit] ? 
Nops16Bit : Nops32Bit; uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(*STI); @@ -1449,7 +1370,6 @@ public: unsigned InstrOffset = 0; unsigned StackAdjust = 0; unsigned StackSize = 0; - unsigned NumDefCFAOffsets = 0; int MinAbsOffset = std::numeric_limits::max(); for (const MCCFIInstruction &Inst : Instrs) { @@ -1457,7 +1377,7 @@ public: default: // Any other CFI directives indicate a frame that we aren't prepared // to represent via compact unwind, so just bail out. - return 0; + return CU::UNWIND_MODE_DWARF; case MCCFIInstruction::OpDefCfaRegister: { // Defines a frame pointer. E.g. // @@ -1471,7 +1391,7 @@ public: // generate a compact unwinding representation, so bail out. if (*MRI.getLLVMRegNum(Inst.getRegister(), true) != (Is64Bit ? X86::RBP : X86::EBP)) - return 0; + return CU::UNWIND_MODE_DWARF; // Reset the counts. memset(SavedRegs, 0, sizeof(SavedRegs)); @@ -1497,7 +1417,6 @@ public: // .cfi_def_cfa_offset 80 // StackSize = Inst.getOffset() / StackDivide; - ++NumDefCFAOffsets; break; } case MCCFIInstruction::OpOffset: { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 167580ec1ed0..e78e98cfc09e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -18,10 +18,11 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" -#include +#include "llvm/Support/raw_ostream.h" #include +#include using namespace llvm; @@ -349,7 +350,8 @@ void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo, } } -void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { +void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O, + const MCSubtargetInfo &STI) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; unsigned Flags = MI->getFlags(); @@ -379,6 +381,20 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { O << "\t{disp8}"; else if (Flags & X86::IP_USE_DISP32) O << "\t{disp32}"; + + // Determine where the memory operand starts, if present + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) + MemoryOperand += X86II::getOperandBias(Desc); + + // Address-Size override prefix + if (Flags & X86::IP_HAS_AD_SIZE && + !X86_MC::needsAddressSizeOverride(*MI, STI, MemoryOperand, TSFlags)) { + if (STI.hasFeature(X86::Is16Bit) || STI.hasFeature(X86::Is64Bit)) + O << "\taddr32\t"; + else if (STI.hasFeature(X86::Is32Bit)) + O << "\taddr16\t"; + } } void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index fd82bdcd1a23..0cb5bf014b20 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -33,7 +33,8 @@ public: raw_ostream &O); protected: - void printInstFlags(const MCInst *MI, raw_ostream &O); + void printInstFlags(const MCInst *MI, raw_ostream &O, + const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp 
b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp new file mode 100644 index 000000000000..901082ce6cf3 --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp @@ -0,0 +1,165 @@ +//===- X86InstrRelaxTables.cpp - X86 Instruction Relaxation Tables -*- C++ -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 instruction relaxation tables. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrRelaxTables.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/STLExtras.h" + +using namespace llvm; + +// These tables are sorted by their ShortOp value allowing them to be binary +// searched at runtime without the need for additional storage. The enum values +// are currently emitted in X86GenInstrInfo.inc in alphabetical order. Which +// makes sorting these tables a simple matter of alphabetizing the table. +static const X86InstrRelaxTableEntry InstrRelaxTable[] = { + // ADC + { X86::ADC16mi8, X86::ADC16mi }, + { X86::ADC16ri8, X86::ADC16ri }, + { X86::ADC32mi8, X86::ADC32mi }, + { X86::ADC32ri8, X86::ADC32ri }, + { X86::ADC64mi8, X86::ADC64mi32 }, + { X86::ADC64ri8, X86::ADC64ri32 }, + // ADD + { X86::ADD16mi8, X86::ADD16mi }, + { X86::ADD16ri8, X86::ADD16ri }, + { X86::ADD32mi8, X86::ADD32mi }, + { X86::ADD32ri8, X86::ADD32ri }, + { X86::ADD64mi8, X86::ADD64mi32 }, + { X86::ADD64ri8, X86::ADD64ri32 }, + // AND + { X86::AND16mi8, X86::AND16mi }, + { X86::AND16ri8, X86::AND16ri }, + { X86::AND32mi8, X86::AND32mi }, + { X86::AND32ri8, X86::AND32ri }, + { X86::AND64mi8, X86::AND64mi32 }, + { X86::AND64ri8, X86::AND64ri32 }, + // CMP + { X86::CMP16mi8, X86::CMP16mi }, + { X86::CMP16ri8, X86::CMP16ri }, + { X86::CMP32mi8, X86::CMP32mi }, + { X86::CMP32ri8, X86::CMP32ri }, + { X86::CMP64mi8, X86::CMP64mi32 }, + { X86::CMP64ri8, X86::CMP64ri32 }, + // IMUL + { X86::IMUL16rmi8, X86::IMUL16rmi }, + { X86::IMUL16rri8, X86::IMUL16rri }, + { X86::IMUL32rmi8, X86::IMUL32rmi }, + { X86::IMUL32rri8, X86::IMUL32rri }, + { X86::IMUL64rmi8, X86::IMUL64rmi32 }, + { X86::IMUL64rri8, X86::IMUL64rri32 }, + // OR + { X86::OR16mi8, X86::OR16mi }, + { X86::OR16ri8, X86::OR16ri }, + { X86::OR32mi8, X86::OR32mi }, + { X86::OR32ri8, X86::OR32ri }, + { X86::OR64mi8, X86::OR64mi32 }, + { X86::OR64ri8, X86::OR64ri32 }, + // PUSH + { X86::PUSH16i8, X86::PUSHi16 }, + { X86::PUSH32i8, X86::PUSHi32 }, + { X86::PUSH64i8, X86::PUSH64i32 }, + // SBB + { X86::SBB16mi8, X86::SBB16mi }, + { X86::SBB16ri8, X86::SBB16ri }, + { X86::SBB32mi8, X86::SBB32mi }, + { X86::SBB32ri8, X86::SBB32ri }, + { X86::SBB64mi8, X86::SBB64mi32 }, + { X86::SBB64ri8, X86::SBB64ri32 }, + // SUB + { X86::SUB16mi8, X86::SUB16mi }, + { X86::SUB16ri8, X86::SUB16ri }, + { X86::SUB32mi8, X86::SUB32mi }, + { X86::SUB32ri8, X86::SUB32ri }, + { X86::SUB64mi8, X86::SUB64mi32 }, + { X86::SUB64ri8, X86::SUB64ri32 }, + // XOR + { X86::XOR16mi8, X86::XOR16mi }, + { X86::XOR16ri8, X86::XOR16ri }, + { X86::XOR32mi8, X86::XOR32mi }, + { X86::XOR32ri8, X86::XOR32ri }, + { X86::XOR64mi8, X86::XOR64mi32 }, + { X86::XOR64ri8, X86::XOR64ri32 }, +}; + +static const X86InstrRelaxTableEntry * +lookupRelaxTableImpl(ArrayRef Table, + unsigned ShortOp) { +#ifndef NDEBUG + // Make sure the tables are sorted. 
+ static std::atomic RelaxTableChecked(false); + if (!RelaxTableChecked.load(std::memory_order_relaxed)) { + assert(llvm::is_sorted(InstrRelaxTable) && + std::adjacent_find(std::begin(InstrRelaxTable), + std::end(InstrRelaxTable)) == + std::end(InstrRelaxTable) && + "InstrRelaxTable is not sorted and unique!"); + RelaxTableChecked.store(true, std::memory_order_relaxed); + } +#endif + + const X86InstrRelaxTableEntry *Data = llvm::lower_bound(Table, ShortOp); + if (Data != Table.end() && Data->KeyOp == ShortOp) + return Data; + return nullptr; +} + +const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) { + return lookupRelaxTableImpl(InstrRelaxTable, ShortOp); +} + +namespace { + +// This class stores the short form tables. It is instantiated as a +// ManagedStatic to lazily init the short form table. +struct X86ShortFormTable { + // Stores relaxation table entries sorted by relaxed form opcode. + SmallVector Table; + + X86ShortFormTable() { + for (const X86InstrRelaxTableEntry &Entry : InstrRelaxTable) + Table.push_back({Entry.DstOp, Entry.KeyOp}); + + llvm::sort(Table); + + // Now that it's sorted, ensure its unique. + assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() && + "Short form table is not unique!"); + } +}; +} // namespace + +static ManagedStatic ShortTable; + +const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) { + auto &Table = ShortTable->Table; + auto I = llvm::lower_bound(Table, RelaxOp); + if (I != Table.end() && I->KeyOp == RelaxOp) + return &*I; + return nullptr; +} + +namespace llvm { + +/// Get the short instruction opcode for a given relaxed opcode. +unsigned X86::getShortOpcodeArith(unsigned RelaxOp) { + if (const X86InstrRelaxTableEntry *I = lookupShortTable(RelaxOp)) + return I->DstOp; + return RelaxOp; +} + +/// Get the relaxed instruction opcode for a given short opcode. +unsigned X86::getRelaxedOpcodeArith(unsigned ShortOp) { + if (const X86InstrRelaxTableEntry *I = lookupRelaxTable(ShortOp)) + return I->DstOp; + return ShortOp; +} +} // namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h new file mode 100644 index 000000000000..0551c1861a58 --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h @@ -0,0 +1,54 @@ +//===-- X86InstrRelaxTables.h - X86 Instruction Relaxation Tables -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the interface to query the X86 instruction relaxation +// tables. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H +#define LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H + +#include + +namespace llvm { + +// This struct is used for both the relaxed and short tables. The KeyOp is used +// to determine the sorting order. 
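Both lookup directions above rely on the same pattern: a table statically sorted by its key field, searched with lower_bound, plus a heterogeneous operator< so the probe can be a bare opcode rather than a full entry. A self-contained sketch of that pattern, with made-up opcode numbers:

    #include <algorithm>
    #include <cstdint>
    #include <iterator>

    struct Entry {
      uint16_t KeyOp; // sort key (short-form opcode)
      uint16_t DstOp; // mapped value (relaxed-form opcode)
    };

    // Heterogeneous comparison: lets lower_bound probe with a bare opcode.
    bool operator<(const Entry &E, unsigned Opcode) { return E.KeyOp < Opcode; }

    // Must be sorted by KeyOp; the values here are arbitrary demo numbers.
    static const Entry Table[] = {{3, 103}, {7, 107}, {9, 109}};

    unsigned relax(unsigned ShortOp) {
      const Entry *I =
          std::lower_bound(std::begin(Table), std::end(Table), ShortOp);
      return (I != std::end(Table) && I->KeyOp == ShortOp) ? I->DstOp : ShortOp;
    }

The reverse (relaxed-to-short) direction cannot reuse the static ordering, which is why the code above builds its short-form table lazily, re-sorted by the relaxed opcode, and wraps it in a ManagedStatic so the work happens at most once.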
+struct X86InstrRelaxTableEntry { + uint16_t KeyOp; + uint16_t DstOp; + + bool operator<(const X86InstrRelaxTableEntry &RHS) const { + return KeyOp < RHS.KeyOp; + } + bool operator==(const X86InstrRelaxTableEntry &RHS) const { + return KeyOp == RHS.KeyOp; + } + friend bool operator<(const X86InstrRelaxTableEntry &TE, unsigned Opcode) { + return TE.KeyOp < Opcode; + } +}; + +/// Look up the relaxed form table entry for a given \p ShortOp. +const X86InstrRelaxTableEntry *lookupRelaxTable(unsigned ShortOp); + +/// Look up the short form table entry for a given \p RelaxOp. +const X86InstrRelaxTableEntry *lookupShortTable(unsigned RelaxOp); + +namespace X86 { + +/// Get the short instruction opcode for a given relaxed opcode. +unsigned getShortOpcodeArith(unsigned RelaxOp); + +/// Get the relaxed instruction opcode for a given short opcode. +unsigned getRelaxedOpcodeArith(unsigned ShortOp); +} // namespace X86 +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 48c335f9a777..2a2afa925a9c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -40,11 +40,11 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - printInstFlags(MI, OS); + printInstFlags(MI, OS, STI); // In 16-bit mode, print data16 as data32. if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { + STI.getFeatureBits()[X86::Is16Bit]) { OS << "\tdata32"; } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 4fa8bc64b245..a21bb6da86de 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -155,65 +156,6 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { return MCFixup::getKindForSize(Size, isPCRel); } -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 16-bit memory operand. -static bool is16BitMemOperand(const MCInst &MI, unsigned Op, - const MCSubtargetInfo &STI) { - const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); - - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); - - if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0) - return true; - if ((BaseReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || - (IndexReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) - return true; - return false; -} - -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 32-bit memory operand. 
-static bool is32BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) - return true; - if (BaseReg.getReg() == X86::EIP) { - assert(IndexReg.getReg() == 0 && "Invalid eip-based address."); - return true; - } - if (IndexReg.getReg() == X86::EIZ) - return true; - return false; -} - -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 64-bit memory operand. -#ifndef NDEBUG -static bool is64BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} -#endif - enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff }; /// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is @@ -391,7 +333,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Handle %rip relative addressing. if (BaseReg == X86::RIP || BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode - assert(STI.hasFeature(X86::Mode64Bit) && + assert(STI.hasFeature(X86::Is64Bit) && "Rip-relative addressing requires 64-bit mode"); assert(IndexReg.getReg() == 0 && !ForceSIB && "Invalid rip-relative address"); @@ -462,7 +404,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // 16-bit addressing forms of the ModR/M byte have a different encoding for // the R/M field and are far more limited in which registers can be used. - if (is16BitMemOperand(MI, Op, STI)) { + if (X86_MC::is16BitMemOperand(MI, Op, STI)) { if (BaseReg) { // For 32-bit addressing, the row and column values in Table 2-2 are // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with @@ -540,7 +482,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::ESP && // If there is no base register and we're in 64-bit mode, we need a SIB // byte to emit an addr that is just 'disp32' (the non-RIP relative form). - (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) { + (!STI.hasFeature(X86::Is64Bit) || BaseReg != 0)) { if (BaseReg == 0) { // [disp32] in X86-32 mode emitByte(modRMByte(0, RegOpcodeField, 5), OS); @@ -671,75 +613,29 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, emitByte(0xF2, OS); // Emit the address size opcode prefix as needed. 
- bool NeedAddressOverride; - uint64_t AdSize = TSFlags & X86II::AdSizeMask; - if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) || - (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) || - (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) { - NeedAddressOverride = true; - } else if (MemoryOperand < 0) { - NeedAddressOverride = false; - } else if (STI.hasFeature(X86::Mode64Bit)) { - assert(!is16BitMemOperand(MI, MemoryOperand, STI)); - NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand); - } else if (STI.hasFeature(X86::Mode32Bit)) { - assert(!is64BitMemOperand(MI, MemoryOperand)); - NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI); - } else { - assert(STI.hasFeature(X86::Mode16Bit)); - assert(!is64BitMemOperand(MI, MemoryOperand)); - NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI); - } - - if (NeedAddressOverride) + if (X86_MC::needsAddressSizeOverride(MI, STI, MemoryOperand, TSFlags) || + Flags & X86::IP_HAS_AD_SIZE) emitByte(0x67, OS); - // Encoding type for this instruction. - uint64_t Encoding = TSFlags & X86II::EncodingMask; - bool HasREX = false; - if (Encoding) - emitVEXOpcodePrefix(MemoryOperand, MI, OS); - else - HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); - uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { default: break; case X86II::RawFrmDstSrc: { - unsigned siReg = MI.getOperand(1).getReg(); - assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || - (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || - (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && - "SI and DI register sizes do not match"); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(2).getReg() != X86::DS) emitSegmentOverridePrefix(2, MI, OS); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, OS); CurOp += 3; // Consume operands. break; } case X86II::RawFrmSrc: { - unsigned siReg = MI.getOperand(0).getReg(); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(1).getReg() != X86::DS) emitSegmentOverridePrefix(1, MI, OS); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, OS); CurOp += 2; // Consume operands. break; } case X86II::RawFrmDst: { - unsigned siReg = MI.getOperand(0).getReg(); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI)) - emitByte(0x67, OS); ++CurOp; // Consume operand. break; } @@ -750,6 +646,15 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, } } + // REX prefix is optional, but if used must be immediately before the opcode + // Encoding type for this instruction. + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasREX = false; + if (Encoding) + emitVEXOpcodePrefix(MemoryOperand, MI, OS); + else + HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); + return HasREX; } @@ -1347,7 +1252,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, // Emit the operand size opcode prefix as needed. if ((TSFlags & X86II::OpSizeMask) == - (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16)) + (STI.hasFeature(X86::Is16Bit) ? 
X86II::OpSize32 : X86II::OpSize16)) emitByte(0x66, OS); // Emit the LOCK opcode prefix. @@ -1371,9 +1276,9 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, } // Handle REX prefix. - assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) && + assert((STI.hasFeature(X86::Is64Bit) || !(TSFlags & X86II::REX_W)) && "REX.W requires 64bit mode."); - bool HasREX = STI.hasFeature(X86::Mode64Bit) + bool HasREX = STI.hasFeature(X86::Is64Bit) ? emitREXPrefix(MemOperand, MI, STI, OS) : false; @@ -1472,7 +1377,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::RawFrm: emitByte(BaseOpcode + OpcodeOffset, OS); - if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII)) + if (!STI.hasFeature(X86::Is64Bit) || !isPCRel32Branch(MI, MCII)) break; const MCOperand &Op = MI.getOperand(CurOp++); @@ -1842,7 +1747,6 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h index 532fecd9951b..cd2baeb1c98e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -18,6 +18,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 8913e405539e..49660883ad83 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -72,6 +72,97 @@ bool X86_MC::hasLockPrefix(const MCInst &MI) { return MI.getFlags() & X86::IP_HAS_LOCK; } +static bool isMemOperand(const MCInst &MI, unsigned Op, unsigned RegClassID) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + const MCRegisterClass &RC = X86MCRegisterClasses[RegClassID]; + + return (Base.isReg() && Base.getReg() != 0 && RC.contains(Base.getReg())) || + (Index.isReg() && Index.getReg() != 0 && RC.contains(Index.getReg())); +} + +bool X86_MC::is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + + if (STI.hasFeature(X86::Is16Bit) && Base.isReg() && Base.getReg() == 0 && + Index.isReg() && Index.getReg() == 0) + return true; + return isMemOperand(MI, Op, X86::GR16RegClassID); +} + +bool X86_MC::is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + if (Base.isReg() && Base.getReg() == X86::EIP) { + assert(Index.isReg() && Index.getReg() == 0 && "Invalid eip-based address"); + return true; + } + if (Index.isReg() && Index.getReg() == X86::EIZ) + return true; + return isMemOperand(MI, Op, X86::GR32RegClassID); +} + +#ifndef NDEBUG +bool X86_MC::is64BitMemOperand(const MCInst &MI, unsigned Op) { + return isMemOperand(MI, Op, X86::GR64RegClassID); +} +#endif + +bool X86_MC::needsAddressSizeOverride(const MCInst &MI, + const MCSubtargetInfo &STI, + int MemoryOperand, uint64_t TSFlags) 
{ + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + bool Is16BitMode = STI.hasFeature(X86::Is16Bit); + bool Is32BitMode = STI.hasFeature(X86::Is32Bit); + bool Is64BitMode = STI.hasFeature(X86::Is64Bit); + if ((Is16BitMode && AdSize == X86II::AdSize32) || + (Is32BitMode && AdSize == X86II::AdSize16) || + (Is64BitMode && AdSize == X86II::AdSize32)) + return true; + uint64_t Form = TSFlags & X86II::FormMask; + switch (Form) { + default: + break; + case X86II::RawFrmDstSrc: { + unsigned siReg = MI.getOperand(1).getReg(); + assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || + (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || + (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && + "SI and DI register sizes do not match"); + return (!Is32BitMode && siReg == X86::ESI) || + (Is32BitMode && siReg == X86::SI); + } + case X86II::RawFrmSrc: { + unsigned siReg = MI.getOperand(0).getReg(); + return (!Is32BitMode && siReg == X86::ESI) || + (Is32BitMode && siReg == X86::SI); + } + case X86II::RawFrmDst: { + unsigned siReg = MI.getOperand(0).getReg(); + return (!Is32BitMode && siReg == X86::EDI) || + (Is32BitMode && siReg == X86::DI); + } + } + + // Determine where the memory operand starts, if present. + if (MemoryOperand < 0) + return false; + + if (STI.hasFeature(X86::Is64Bit)) { + assert(!is16BitMemOperand(MI, MemoryOperand, STI)); + return is32BitMemOperand(MI, MemoryOperand); + } + if (STI.hasFeature(X86::Is32Bit)) { + assert(!is64BitMemOperand(MI, MemoryOperand)); + return is16BitMemOperand(MI, MemoryOperand, STI); + } + assert(STI.hasFeature(X86::Is16Bit)); + assert(!is64BitMemOperand(MI, MemoryOperand)); + return !is16BitMemOperand(MI, MemoryOperand, STI); +} + void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these. for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 35604cd3ec0a..d0530bd4d650 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -63,6 +63,28 @@ void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI); /// Returns true if this instruction has a LOCK prefix. bool hasLockPrefix(const MCInst &MI); +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 16-bit memory operand. +bool is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI); + +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 32-bit memory operand. +bool is32BitMemOperand(const MCInst &MI, unsigned Op); + +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 64-bit memory operand. +#ifndef NDEBUG +bool is64BitMemOperand(const MCInst &MI, unsigned Op); +#endif + +/// Returns true if this instruction needs an Address-Size override prefix. +bool needsAddressSizeOverride(const MCInst &MI, const MCSubtargetInfo &STI, + int MemoryOperand, uint64_t TSFlags); + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. 
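Once the explicit-AdSize and string-instruction special cases are peeled off, needsAddressSizeOverride condenses to a small decision table: the 0x67 prefix flips the address size to the mode's alternate width, so it is needed exactly when the memory operand's registers are that alternate width. A reduced sketch of just the mode-versus-operand-width part (the real function also honors the AdSize TSFlags and the SI/DI width of string instructions):

    enum class Mode { M16, M32, M64 };

    // Hypothetical reduced predicate: does a memory operand whose
    // base/index registers are MemBits wide need the 0x67 prefix?
    bool needs67(Mode M, unsigned MemBits) {
      switch (M) {
      case Mode::M64: return MemBits == 32; // 64-bit mode: 0x67 selects 32-bit
      case Mode::M32: return MemBits == 16; // 32-bit mode: 0x67 selects 16-bit
      case Mode::M16: return MemBits != 16; // 16-bit mode: 0x67 selects 32-bit
      }
      return false;
    }

This is also what the addr32/addr16 printing added to printInstFlags earlier runs in reverse: when the IP_HAS_AD_SIZE flag is set but the operands alone would not imply the prefix, the printer has to spell the override out.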
MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, @@ -70,7 +92,6 @@ MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, } MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, @@ -142,4 +163,7 @@ MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned, #define GET_SUBTARGETINFO_ENUM #include "X86GenSubtargetInfo.inc" +#define GET_X86_MNEMONIC_TABLES_H +#include "X86GenMnemonicTables.inc" + #endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp new file mode 100644 index 000000000000..39b7f0f4160e --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp @@ -0,0 +1,16 @@ +//===-- X86MnemonicTables.cpp - X86 Mnemonic Tables -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 mnemonic tables. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" + +#define GET_X86_MNEMONIC_TABLES_CPP +#include "X86GenMnemonicTables.inc" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c29211246123..36945d1f6746 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -9,6 +9,7 @@ #include "X86MCTargetDesc.h" #include "X86TargetStreamer.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCWin64EH.h" @@ -25,15 +26,15 @@ public: std::unique_ptr OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitWinEHHandlerData(SMLoc Loc) override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; - void EmitWindowsUnwindTables() override; - void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; + void emitWinEHHandlerData(SMLoc Loc) override; + void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void emitWindowsUnwindTables() override; + void emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; void finishImpl() override; }; -void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { - MCStreamer::EmitWinEHHandlerData(Loc); +void X86WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section. 
@@ -41,17 +42,17 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true); } -void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { +void X86WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) { EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); } -void X86WinCOFFStreamer::EmitWindowsUnwindTables() { +void X86WinCOFFStreamer::emitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; EHStreamer.Emit(*this); } -void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { +void X86WinCOFFStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { X86TargetStreamer *XTS = static_cast(getTargetStreamer()); XTS->emitFPOData(ProcSym, Loc); @@ -59,7 +60,7 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { void X86WinCOFFStreamer::finishImpl() { emitFrames(nullptr); - EmitWindowsUnwindTables(); + emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index bf3f4e990ecc..f2827c568109 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/FormattedStream.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 10e1c5d6ed38..7344900f2e31 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -79,6 +79,9 @@ FunctionPass *createX86DynAllocaExpander(); /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); +/// Return a pass that preconfig the tile registers before fast reg allocation. +FunctionPass *createX86FastPreTileConfigPass(); + /// Return a pass that config the tile registers after fast reg allocation. 
FunctionPass *createX86FastTileConfigPass(); @@ -175,6 +178,7 @@ void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86FastPreTileConfigPass(PassRegistry &); void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86TileConfigPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 380507308c3d..a5c6b40c493c 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -18,13 +18,13 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // X86 Subtarget state // - -def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", - "64-bit mode (x86_64)">; -def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", - "32-bit mode (80386)">; -def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", - "16-bit mode (i8086)">; +// disregarding specific ABI / programming model +def Is64Bit : SubtargetFeature<"64bit-mode", "Is64Bit", "true", + "64-bit mode (x86_64)">; +def Is32Bit : SubtargetFeature<"32bit-mode", "Is32Bit", "true", + "32-bit mode (80386)">; +def Is16Bit : SubtargetFeature<"16bit-mode", "Is16Bit", "true", + "16-bit mode (i8086)">; //===----------------------------------------------------------------------===// // X86 Subtarget ISA features @@ -34,16 +34,16 @@ def FeatureX87 : SubtargetFeature<"x87","HasX87", "true", "Enable X87 float instructions">; def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", - "Enable NOPL instruction">; + "Enable NOPL instruction (generally pentium pro+)">; -def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", +def FeatureCMOV : SubtargetFeature<"cmov","HasCMOV", "true", "Enable conditional move instructions">; -def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true", - "Support CMPXCHG8B instructions">; +def FeatureCX8 : SubtargetFeature<"cx8", "HasCX8", "true", + "Support CMPXCHG8B instructions">; def FeatureCRC32 : SubtargetFeature<"crc32", "HasCRC32", "true", - "Enable SSE 4.2 CRC32 instruction">; + "Enable SSE 4.2 CRC32 instruction (used when SSE4.2 is supported but function is GPR only)">; def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; @@ -98,11 +98,11 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", // feature, because SSE2 can be disabled (e.g. for compiling OS kernels) // without disabling 64-bit mode. Nothing should imply this feature bit. It // is used to enforce that only 64-bit capable CPUs are used in 64-bit mode. 
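The renames in this file keep every user-visible -mattr string stable ("64bit-mode", "cx8", "sahf", ...) while renaming the TableGen record and the generated C++ identifiers after it. Roughly, and as a hand-written sketch rather than the actual generated code, a SubtargetFeature def fans out like this:

    // def Is64Bit : SubtargetFeature<"64bit-mode", "Is64Bit", "true", ...>;
    //
    // X86GenSubtargetInfo.inc (generated) gets an enum value named after
    // the record, and the subtarget gets a field named after the second
    // template argument:
    namespace X86 { enum { /* ..., */ Is64Bit /* , ... */ }; }
    // bool X86Subtarget::Is64Bit;  // set when -mattr=+64bit-mode is active
    //
    // Both query styles seen throughout this patch resolve to that bit:
    //   STI.hasFeature(X86::Is64Bit)
    //   STI.getFeatureBits()[X86::Is64Bit]

Renaming the record (Mode64Bit to Is64Bit) is therefore a pure source-level change; encodings and feature strings are untouched.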
-def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", +def FeatureX86_64 : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; -def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", - "64-bit with cmpxchg16b", - [FeatureCMPXCHG8B]>; +def FeatureCX16 : SubtargetFeature<"cx16", "HasCX16", "true", + "64-bit with cmpxchg16b (this is true for most x86-64 chips, but not the first AMD chips)", + [FeatureCX8]>; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -119,7 +119,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions", [FeatureAVX]>; -def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", "Enable AVX-512 instructions", [FeatureAVX2, FeatureFMA, FeatureF16C]>; def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", @@ -198,7 +198,7 @@ def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", [FeatureFMA4]>; def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", "HasSSEUnalignedMem", "true", - "Allow unaligned memory operands with SSE instructions">; + "Allow unaligned memory operands with SSE instructions (this may require setting a configuration bit in the processor)">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; @@ -228,20 +228,22 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; +// Processor supports CET SHSTK - Control-Flow Enforcement Technology +// using Shadow Stack def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true", "Support CET Shadow-Stack instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; -def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", +def FeatureLAHFSAHF64 : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", "Support LAHF and SAHF instructions in 64-bit mode">; def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", "Enable MONITORX/MWAITX timer functionality">; def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", "Enable Cache Line Zero">; def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", - "Enable Cache Demote">; + "Enable Cache Line Demote">; def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", "Support ptwrite instruction">; def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true", @@ -285,9 +287,9 @@ def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true", def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true", "platform configuration instruction">; def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", - "Support movdiri instruction">; + "Support movdiri instruction (direct store integer)">; def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", - "Support movdir64b instruction">; + "Support movdir64b instruction (direct store 64 bytes)">; // Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka // "string operations"). 
See "REP String Enhancement" in the Intel Software @@ -380,6 +382,17 @@ def FeatureTaggedGlobals "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits.">; +// Control codegen mitigation against Straight Line Speculation vulnerability. +def FeatureHardenSlsRet + : SubtargetFeature< + "harden-sls-ret", "HardenSlsRet", "true", + "Harden against straight line speculation across RET instructions.">; + +def FeatureHardenSlsIJmp + : SubtargetFeature< + "harden-sls-ijmp", "HardenSlsIJmp", "true", + "Harden against straight line speculation across indirect JMP instructions.">; + //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// @@ -388,7 +401,7 @@ def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", - "PMULLD instruction is slow">; + "PMULLD instruction is slow (compared to PMULLW/PMULHW and PMULUDQ)">; def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow", "true", @@ -396,27 +409,31 @@ def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow", // FIXME: This should not apply to CPUs that do not have SSE. def TuningSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", - "IsUAMem16Slow", "true", + "IsUnalignedMem16Slow", "true", "Slow unaligned 16-byte memory access">; def TuningSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", + "IsUnalignedMem32Slow", "true", "Slow unaligned 32-byte memory access">; def TuningLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", - "Use LEA for adjusting the stack pointer">; + "Use LEA for adjusting the stack pointer (this is an optimization for Intel Atom processors)">; +// True if 8-bit divisions are significantly faster than +// 32-bit divisions and should be used when possible. def TuningSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; +// True if 32-bit divides are significantly faster than +// 64-bit divisions and should be used when possible. def TuningSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", "Use 32-bit divide for positive values less than 2^32">; def TuningPadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", - "Pad short functions">; + "Pad short functions (to prevent a stall when returning too early)">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands @@ -425,15 +442,21 @@ def TuningSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops", "SlowTwoMemOps", "true", "Two memory operand instructions are slow">; -def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", +// True if the LEA instruction inputs have to be ready at address generation +// (AG) time. 
+def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LeaUsesAG", "true", "LEA instruction needs inputs at AG stage">; def TuningSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +// True if the LEA instruction has all three source operands: base, index, +// and offset or if the LEA instruction uses base and index registers where +// the base is EBP, RBP,or R13 def TuningSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", "LEA instruction with 3 ops or certain registers is slow">; +// True if INC and DEC instructions are slow when writing to flags def TuningSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; @@ -445,6 +468,31 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt", "HasLZCNTFalseDeps", "true", "LZCNT/TZCNT have a false dependency on dest register">; +def TuningMULCFalseDeps : SubtargetFeature<"false-deps-mulc", + "HasMULCFalseDeps", "true", + "VF[C]MULCPH/SH has a false dependency on dest register">; + +def TuningPERMFalseDeps : SubtargetFeature<"false-deps-perm", + "HasPERMFalseDeps", "true", + "VPERMD/Q/PS/PD has a false dependency on dest register">; + +def TuningRANGEFalseDeps : SubtargetFeature<"false-deps-range", + "HasRANGEFalseDeps", "true", + "VRANGEPD/PS/SD/SS has a false dependency on dest register">; + +def TuningGETMANTFalseDeps : SubtargetFeature<"false-deps-getmant", + "HasGETMANTFalseDeps", "true", + "VGETMANTSS/SD/SH and VGETMANDPS/PD(memory version) has a" + " false dependency on dest register">; + +def TuningMULLQFalseDeps : SubtargetFeature<"false-deps-mullq", + "HasMULLQFalseDeps", "true", + "VPMULLQ has a false dependency on dest register">; + +def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking", + "HasSBBDepBreaking", "true", + "SBB with same register has no source dependency">; + // On recent X86 (port bound) processors, its preferable to combine to a single shuffle // using a variable mask over multiple fixed shuffles. def TuningFastVariableCrossLaneShuffle @@ -470,9 +518,14 @@ def TuningInsertVZEROUPPER // vectorized code we should care about the throughput of SQRT operations. // But if the code is scalar that probably means that the code has some kind of // dependency and we should care more about reducing the latency. + +// True if hardware SQRTSS instruction is at least as fast (latency) as +// RSQRTSS followed by a Newton-Raphson iteration. def TuningFastScalarFSQRT : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +// True if hardware SQRTPS/VSQRTPS instructions are at least as fast +// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. def TuningFastVectorFSQRT : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", "true", "Vector SQRT is fast (disable Newton-Raphson)">; @@ -529,7 +582,7 @@ def TuningMacroFusion // similar to Skylake Server (AVX-512). 
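The fsqrt tuning comments above refer to the classic reciprocal-square-root refinement: RSQRTSS yields roughly 12 bits of precision, one Newton-Raphson step nearly doubles that, and a final multiply recovers the square root. A sketch of the math being traded against the hardware SQRTSS, using a library call as a stand-in for the RSQRTSS estimate:

    #include <cmath>

    float sqrt_nr(float a) {
      float x = 1.0f / std::sqrt(a);     // stand-in for the RSQRTSS estimate
      x = x * (1.5f - 0.5f * a * x * x); // one Newton-Raphson iteration
      return a * x;                      // sqrt(a) = a * rsqrt(a)
    }

When fast-scalar-fsqrt / fast-vector-fsqrt are set, this expansion is not worthwhile and the plain hardware square root is emitted instead.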
def TuningFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", - "Indicates if gather is reasonably fast">; + "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">; def TuningPrefer128Bit : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", @@ -578,17 +631,13 @@ def TuningUseGLMDivSqrtCosts : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true", "Use Goldmont specific floating point div/sqrt costs">; -// Enable use of alias analysis during code generation. -def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", - "Use alias analysis during codegen">; - //===----------------------------------------------------------------------===// // X86 CPU Families // TODO: Remove these - use general tuning features to determine codegen. //===----------------------------------------------------------------------===// // Bonnell -def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; +def ProcIntelAtom : SubtargetFeature<"", "IsAtom", "true", "Is Intel Atom processor">; //===----------------------------------------------------------------------===// // Register File Description @@ -632,11 +681,11 @@ include "X86SchedIceLake.td" def ProcessorFeatures { // x86-64 and x86-64-v[234] list X86_64V1Features = [ - FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, Feature64Bit + FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, FeatureX86_64, ]; list X86_64V2Features = !listconcat(X86_64V1Features, [ - FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureCRC32, FeaturePOPCNT, + FeatureCX16, FeatureLAHFSAHF64, FeatureCRC32, FeaturePOPCNT, FeatureSSE42 ]); list X86_64V3Features = !listconcat(X86_64V2Features, [ @@ -862,22 +911,27 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureUINTR]; - list SPRTuning = ICXTuning; + list SPRAdditionalTuning = [TuningMULCFalseDeps, + TuningPERMFalseDeps, + TuningRANGEFalseDeps, + TuningGETMANTFalseDeps, + TuningMULLQFalseDeps]; + list SPRTuning = !listconcat(ICXTuning, SPRAdditionalTuning); list SPRFeatures = !listconcat(ICXFeatures, SPRAdditionalFeatures); // Atom list AtomFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureMOVBE, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list AtomTuning = [ProcIntelAtom, TuningSlowUAMem16, TuningLEAForSP, @@ -968,25 +1022,26 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list ADLTuning = SKLTuning; + list ADLAdditionalTuning = [TuningPERMFalseDeps]; + list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); // Knights Landing list KNLFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureCRC32, FeaturePOPCNT, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureAES, FeatureRDRAND, FeatureF16C, @@ -1018,41 +1073,43 @@ def ProcessorFeatures { // Barcelona list BarcelonaFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureNOPL, - FeatureCMPXCHG16B, + FeatureCX16, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureCMOV, - Feature64Bit]; 
+ FeatureX86_64]; list BarcelonaTuning = [TuningFastScalarShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Bobcat list BtVer1Features = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureSSE4A, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list BtVer1Tuning = [TuningFast15ByteNOP, TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Jaguar @@ -1072,17 +1129,18 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningFastMOVBE, + TuningSBBDepBreaking, TuningSlowSHLD]; list BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); // Bulldozer list BdVer1Features = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureXOP, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureAES, FeatureCRC32, FeaturePRFCHW, @@ -1094,11 +1152,12 @@ def ProcessorFeatures { FeaturePOPCNT, FeatureXSAVE, FeatureLWP, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list BdVer1Tuning = [TuningSlowSHLD, TuningFast11ByteNOP, TuningFastScalarShiftMasks, TuningBranchFusion, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // PileDriver @@ -1140,15 +1199,15 @@ def ProcessorFeatures { FeatureCLFLUSHOPT, FeatureCLZERO, FeatureCMOV, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureCRC32, FeatureF16C, FeatureFMA, FeatureFSGSBase, FeatureFXSR, FeatureNOPL, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureLZCNT, FeatureMMX, FeatureMOVBE, @@ -1169,9 +1228,13 @@ def ProcessorFeatures { TuningFastBEXTR, TuningFast15ByteNOP, TuningBranchFusion, + TuningFastScalarFSQRT, + TuningFastVectorFSQRT, TuningFastScalarShiftMasks, + TuningFastVariablePerLaneShuffle, TuningFastMOVBE, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; list ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, @@ -1184,11 +1247,9 @@ def ProcessorFeatures { FeaturePKU, FeatureVAES, FeatureVPCLMULQDQ]; - list ZN3AdditionalTuning = - [TuningMacroFusion, - TuningFastVariablePerLaneShuffle]; + list ZN3AdditionalTuning = [TuningMacroFusion]; list ZN3Tuning = - !listconcat(ZNTuning, ZN3AdditionalTuning); + !listconcat(ZN2Tuning, ZN3AdditionalTuning); list ZN3Features = !listconcat(ZN2Features, ZN3AdditionalFeatures); } @@ -1209,39 +1270,43 @@ class ProcModel; def : Proc<"i386", [FeatureX87], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"i486", [FeatureX87], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B], +def : Proc<"i586", [FeatureX87, FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B], +def : Proc<"pentium", [FeatureX87, FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], +def : Proc<"pentium-mmx", [FeatureX87, FeatureCX8, FeatureMMX], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV], +def : Proc<"i686", [FeatureX87, FeatureCX8, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, +def : Proc<"pentiumpro", [FeatureX87, FeatureCX8, FeatureCMOV, FeatureNOPL], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium2", [FeatureX87, 
FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, +def : Proc<"pentium2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureCMOV, FeatureFXSR, FeatureNOPL], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["pentium3", "pentium3m"] in { - def : Proc; } @@ -1257,42 +1322,42 @@ foreach P = ["pentium3", "pentium3m"] in { // changes slightly. def : ProcModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcModel; } // Intel Quark. -def : Proc<"lakemont", [FeatureCMPXCHG8B], +def : Proc<"lakemont", [FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; // Intel Core Duo. def : ProcModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; // NetBurst. def : ProcModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : ProcModel<"nocona", GenericPostRAModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, ], [ TuningSlowUAMem16, @@ -1302,15 +1367,15 @@ def : ProcModel<"nocona", GenericPostRAModel, [ // Intel Core 2 Solo/Duo. def : ProcModel<"core2", SandyBridgeModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureLAHFSAHF + FeatureX86_64, + FeatureCX16, + FeatureLAHFSAHF64 ], [ TuningMacroFusion, @@ -1319,15 +1384,15 @@ def : ProcModel<"core2", SandyBridgeModel, [ ]>; def : ProcModel<"penryn", SandyBridgeModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE41, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureLAHFSAHF + FeatureX86_64, + FeatureCX16, + FeatureLAHFSAHF64 ], [ TuningMacroFusion, @@ -1416,38 +1481,38 @@ def : ProcModel<"alderlake", SkylakeClientModel, // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], +def : Proc<"k6", [FeatureX87, FeatureCX8, FeatureMMX], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], +def : Proc<"k6-2", [FeatureX87, FeatureCX8, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], +def : Proc<"k6-3", [FeatureX87, FeatureCX8, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { @@ -1482,7 +1547,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features, ProcessorFeatures.ZN3Tuning>; -def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA], +def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], @@ -1491,7 +1556,7 @@ def : Proc<"winchip2", [FeatureX87, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"c3", [FeatureX87, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, +def : Proc<"c3-2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE1, FeatureFXSR, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index d48b8e458219..c205395aa084 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -60,8 +61,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { SMShadowTracker.startFunction(MF); CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(), - MF.getContext())); + *Subtarget->getInstrInfo(), MF.getContext())); EmitFPOData = Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag(); @@ -70,12 +70,12 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (Subtarget->isTargetCOFF()) { bool Local = MF.getFunction().hasLocalLinkage(); - OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass( + OutStreamer->beginCOFFSymbolDef(CurrentFnSym); + OutStreamer->emitCOFFSymbolStorageClass( Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer->endCOFFSymbolDef(); } // Emit the rest of the function body. 
@@ -249,7 +249,7 @@ void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
 void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
                                          raw_ostream &O, const char *Modifier) {
   const MachineOperand &MO = MI->getOperand(OpNo);
-  if (!Modifier || MO.getType() != MachineOperand::MO_Register)
+  if (!Modifier || !MO.isReg())
     return PrintOperand(MI, OpNo, O);
   if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
     O << '%';
@@ -336,6 +336,37 @@ void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
   }
 }
 
+static bool isSimpleReturn(const MachineInstr &MI) {
+  // We exclude all tail calls here which set both isReturn and isCall.
+  return MI.getDesc().isReturn() && !MI.getDesc().isCall();
+}
+
+static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return MI.getDesc().isIndirectBranch() /*keep the code below in good shape*/ ||
+         Opc == X86::TAILJMPr || Opc == X86::TAILJMPm ||
+         Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
+         Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
+         Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
+         Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
+}
+
+void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
+  if (Subtarget->hardenSlsRet() || Subtarget->hardenSlsIJmp()) {
+    auto I = MBB.getLastNonDebugInstr();
+    if (I != MBB.end()) {
+      if ((Subtarget->hardenSlsRet() && isSimpleReturn(*I)) ||
+          (Subtarget->hardenSlsIJmp() && isIndirectBranchOrTailCall(*I))) {
+        MCInst TmpInst;
+        TmpInst.setOpcode(X86::INT3);
+        EmitToStreamer(*OutStreamer, TmpInst);
+      }
+    }
+  }
+  AsmPrinter::emitBasicBlockEnd(MBB);
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+}
+
 void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
                                       raw_ostream &O, const char *Modifier) {
   assert(isMem(*MI, OpNo) && "Invalid memory reference!");
@@ -363,6 +394,12 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
       BaseReg.getReg() == X86::RIP)
     HasBaseReg = false;
 
+  // If we just want to print out the displacement.
+  if (Modifier && (DispSpec.isGlobal() || DispSpec.isSymbol()) &&
+      !strcmp(Modifier, "disp-only")) {
+    HasBaseReg = false;
+  }
+
   // If this has a segment register, print it.
   if (SegReg.getReg()) {
     PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
@@ -606,11 +643,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
       PrintMemReference(MI, OpNo, O, "H");
     }
     return false;
-  case 'P': // Don't print @PLT, but do print as memory.
+  // Print memory only with displacement. The modifier 'P' is used in inline
+  // asm to reference a call symbol or a global symbol which cannot use a
+  // base reg or index reg.
+  case 'P':
     if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
-      PrintIntelMemReference(MI, OpNo, O, "no-rip");
+      PrintIntelMemReference(MI, OpNo, O, "disp-only");
     } else {
-      PrintMemReference(MI, OpNo, O, "no-rip");
+      PrintMemReference(MI, OpNo, O, "disp-only");
    }
    return false;
  }
@@ -641,7 +681,7 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
       MCSection *Cur = OutStreamer->getCurrentSectionOnly();
       MCSection *Nt = MMI->getContext().getELFSection(
           ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
-      OutStreamer->SwitchSection(Nt);
+      OutStreamer->switchSection(Nt);
 
       // Emitting note header.
       const int WordSize = TT.isArch64Bit() && !TT.isX32() ? 8 : 4;
@@ -658,21 +698,21 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
       emitAlignment(WordSize == 4 ?
Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); - OutStreamer->SwitchSection(Cur); + OutStreamer->switchSection(Cur); } } if (TT.isOSBinFormatMachO()) - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); if (TT.isOSBinFormatCOFF()) { // Emit an absolute @feat.00 symbol. This appears to be some kind of // compiler features bitfield read by link.exe. MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); - OutStreamer->BeginCOFFSymbolDef(S); - OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(S); + OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->endCOFFSymbolDef(); int64_t Feat00Flags = 0; if (TT.getArch() == Triple::x86) { @@ -739,7 +779,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { // Output stubs for external and common global variables. Stubs = MMIMacho.GetGVStubList(); if (!Stubs.empty()) { - OutStreamer.SwitchSection(MMI->getContext().getMachOSection( + OutStreamer.switchSection(MMI->getContext().getMachOSection( "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, SectionKind::getMetadata())); @@ -747,7 +787,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer.AddBlankLine(); + OutStreamer.addBlankLine(); } } @@ -795,6 +835,22 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) { emitStackMaps(SM); FM.serializeToFaultMapSection(); } + + // Emit __morestack address if needed for indirect calls. 
+ if (TT.getArch() == Triple::x86_64 && TM.getCodeModel() == CodeModel::Large) { + if (MCSymbol *AddrSymbol = OutContext.lookupSymbol("__morestack_addr")) { + Align Alignment(1); + MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant( + getDataLayout(), SectionKind::getReadOnly(), + /*C=*/nullptr, Alignment); + OutStreamer->switchSection(ReadOnlySection); + OutStreamer->emitLabel(AddrSymbol); + + unsigned PtrSize = MAI->getCodePointerSize(); + OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("__morestack"), + PtrSize); + } + } } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index 94679e6e3d11..d53c26b729ef 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -131,10 +131,7 @@ public: void emitInstruction(const MachineInstr *MI) override; - void emitBasicBlockEnd(const MachineBasicBlock &MBB) override { - AsmPrinter::emitBasicBlockEnd(MBB); - SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); - } + void emitBasicBlockEnd(const MachineBasicBlock &MBB) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index 0899783d5f60..2ecf49382d29 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -35,6 +35,7 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #define AVOIDCALL_DESC "X86 avoid trailing call pass" @@ -69,8 +70,8 @@ INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. This logic conservatively assumes // they might expand to nothing. -static bool isRealInstruction(MachineInstr &MI) { - return !MI.isPseudo() && !MI.isMetaInstruction(); +static bool isCallOrRealInstruction(MachineInstr &MI) { + return MI.isCall() || (!MI.isPseudo() && !MI.isMetaInstruction()); } // Return true if this is a call instruction, but not a tail call. @@ -100,7 +101,7 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { continue; // Find the last real instruction in this block. - auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction); + auto LastRealInstr = llvm::find_if(reverse(MBB), isCallOrRealInstruction); // If the block is empty or the last real instruction is a call instruction, // insert an int3. If there is a call instruction, insert the int3 between diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index c80a5d5bb332..ded93fdc011c 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -299,7 +299,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { const MachineFunction &MF = State.getMachineFunction(); size_t ArgCount = State.getMachineFunction().getFunction().arg_size(); - bool Is64Bit = static_cast(MF.getSubtarget()).is64Bit(); + bool Is64Bit = MF.getSubtarget().is64Bit(); unsigned SlotSize = Is64Bit ? 
8 : 4;
   unsigned Offset;
   if (ArgCount == 1 && ValNo == 0) {
diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp
index 96d3d1390a59..f32891552a82 100644
--- a/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -97,6 +97,11 @@ static cl::opt<bool> ForceMemOperand(
     cl::desc("Convert cmovs to branches whenever they have memory operands."),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> ForceAll(
+    "x86-cmov-converter-force-all",
+    cl::desc("Convert all cmovs to branches."),
+    cl::init(false), cl::Hidden);
+
 namespace {
 
 /// Converts X86 cmov instructions into branches when profitable.
@@ -174,11 +179,11 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
   TSchedModel.init(&STI);
 
   // Before we handle the more subtle cases of register-register CMOVs inside
-  // of potentially hot loops, we want to quickly remove all CMOVs with
-  // a memory operand. The CMOV will risk a stall waiting for the load to
-  // complete that speculative execution behind a branch is better suited to
-  // handle on modern x86 chips.
-  if (ForceMemOperand) {
+  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll)
+  // or the ones with a memory operand (the ForceMemOperand option). The
+  // latter kind of CMOV risks a stall waiting for the load to complete, which
+  // speculative execution behind a branch is better suited to handle on
+  // modern x86 chips.
+  if (ForceMemOperand || ForceAll) {
     CmovGroups AllCmovGroups;
     SmallVector<MachineBasicBlock *, 4> Blocks;
     for (auto &MBB : MF)
@@ -186,7 +191,8 @@
     if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
       for (auto &Group : AllCmovGroups) {
         // Skip any group that doesn't do at least one memory operand cmov.
-        if (llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+        if (ForceMemOperand && !ForceAll &&
+            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
           continue;
 
         // For CMOV groups which we can rewrite and which contain a memory load,
@@ -196,12 +202,15 @@
         convertCmovInstsToBranches(Group);
       }
     }
+    // Early return as ForceAll converts all CmovGroups.
+    if (ForceAll)
+      return Changed;
   }
 
   //===--------------------------------------------------------------------===//
   // Register-operand Conversion Algorithm
   // ---------
-  //   For each inner most loop
+  //   For each innermost loop
   //     collectCmovCandidates() {
   //       Find all CMOV-group-candidates.
   //     }
@@ -230,7 +239,7 @@
       Loops.push_back(Child);
 
   for (MachineLoop *CurrLoop : Loops) {
-    // Optimize only inner most loops.
+    // Optimize only innermost loops.
     if (!CurrLoop->getSubLoops().empty())
       continue;
 
@@ -520,7 +529,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
   //===--------------------------------------------------------------------===//
   // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
   //   Worth-Optimize-Group:
-  //     Iff it worths to optimize all CMOV instructions in the group.
+  //     Iff it is worth optimizing all CMOV instructions in the group.
// // Worth-Optimize-CMOV: // Predicted branch is faster than CMOV by the difference between depth of diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp index 2ff8ee19561b..29668f4b2761 100644 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -16,6 +16,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/ProfileData/SampleProf.h" @@ -159,7 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { } // Since we were able to encode, bump the MemOpDiscriminators. ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue()); + DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); assert(DI && "DI should not be nullptr"); updateDebugInfo(&MI, DI); Changed = true; diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 9826bf4bf861..9d4338deca35 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -15,6 +15,7 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/STLExtras.h" @@ -86,7 +87,7 @@ protected: public: InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {} - virtual ~InstrConverterBase() {} + virtual ~InstrConverterBase() = default; /// \returns true if \p MI is legal to convert. virtual bool isLegal(const MachineInstr *MI, @@ -374,7 +375,7 @@ class X86DomainReassignment : public MachineFunctionPass { const X86InstrInfo *TII = nullptr; /// All edges that are included in some closure - DenseSet EnclosedEdges; + BitVector EnclosedEdges{8, false}; /// All instructions that are included in some closure. DenseMap EnclosedInstrs; @@ -429,10 +430,10 @@ char X86DomainReassignment::ID = 0; void X86DomainReassignment::visitRegister(Closure &C, Register Reg, RegDomain &Domain, SmallVectorImpl &Worklist) { - if (EnclosedEdges.count(Reg)) + if (!Reg.isVirtual()) return; - if (!Reg.isVirtual()) + if (EnclosedEdges.test(Register::virtReg2Index(Reg))) return; if (!MRI->hasOneDef(Reg)) @@ -550,7 +551,7 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { // Register already in this closure. if (!C.insertEdge(CurReg)) continue; - EnclosedEdges.insert(Reg); + EnclosedEdges.set(Register::virtReg2Index(Reg)); MachineInstr *DefMI = MRI->getVRegDef(CurReg); encloseInstr(C, DefMI); @@ -742,6 +743,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; EnclosedEdges.clear(); + EnclosedEdges.resize(MRI->getNumVirtRegs()); EnclosedInstrs.clear(); std::vector Closures; @@ -756,7 +758,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Register already in closure. - if (EnclosedEdges.count(Reg)) + if (EnclosedEdges.test(Idx)) continue; // Calculate closure starting with Reg. 
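The switch from DenseSet to BitVector above relies on virtual registers having a dense zero-based index. A minimal sketch of the membership test, assuming the vector was sized to MRI->getNumVirtRegs() beforehand (as runOnMachineFunction does above) and Reg is already known to be virtual:

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// One bit per virtual register; test-and-set by dense index.
static bool markEnclosed(llvm::BitVector &Enclosed, llvm::Register Reg) {
  unsigned Idx = llvm::Register::virtReg2Index(Reg); // dense 0-based index
  if (Enclosed.test(Idx))
    return false; // already part of some closure
  Enclosed.set(Idx);
  return true;
}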
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 6a047838f0b5..aebeec5a6d27 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,6 +19,7 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. @@ -552,7 +553,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::PTILELOADDV: case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); unsigned Opc = Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1; MI.setDesc(TII->get(Opc)); @@ -565,7 +566,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::PTDPBF16PSV: { MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); unsigned Opc; switch (Opcode) { case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -581,13 +582,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case X86::PTILESTOREDV: { for (int i = 1; i >= 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { for (int i = 2; i > 0; --i) // Remove row, col - MI.RemoveOperand(i); + MI.removeOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; } @@ -729,7 +730,7 @@ bool X86ExpandPseudo::ExpandPseudosWhichAffectControlFlow(MachineFunction &MF) { } bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); X86FI = MF.getInfo(); diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 1ac998b7ff7e..f2c362eeaa48 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -49,22 +49,11 @@ class X86FastISel final : public FastISel { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 - /// floating point ops. - /// When SSE is available, use it for f32 operations. - /// When SSE2 is available, use it for f64 operations. - bool X86ScalarSSEf64; - bool X86ScalarSSEf32; - bool X86ScalarSSEf16; - public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { Subtarget = &funcInfo.MF->getSubtarget(); - X86ScalarSSEf64 = Subtarget->hasSSE2(); - X86ScalarSSEf32 = Subtarget->hasSSE1(); - X86ScalarSSEf16 = Subtarget->hasFP16(); } bool fastSelectInstruction(const Instruction *I) override; @@ -158,9 +147,8 @@ private: /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. 
bool isScalarFPTypeInSSEReg(EVT VT) const { - return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 - (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 + return (VT == MVT::f64 && Subtarget->hasSSE2()) || + (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16; } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -292,6 +280,11 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, if (I->isTerminator() && llvm::any_of(successors(I), HasPhis)) return false; + // Make sure there are no potentially eflags clobbering constant + // materializations in between. + if (llvm::any_of(I->operands(), [](Value *V) { return isa(V); })) + return false; + CC = TmpCC; return true; } @@ -305,9 +298,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { VT = evt.getSimpleVT(); // For now, require SSE/SSE2 for performing floating-point operations, // since x87 requires additional work. - if (VT == MVT::f64 && !X86ScalarSSEf64) + if (VT == MVT::f64 && !Subtarget->hasSSE2()) return false; - if (VT == MVT::f32 && !X86ScalarSSEf32) + if (VT == MVT::f32 && !Subtarget->hasSSE1()) return false; // Similarly, no f80 support yet. if (VT == MVT::f80) @@ -325,6 +318,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); @@ -354,20 +349,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, Opc = X86::MOV64rm; break; case MVT::f32: - if (X86ScalarSSEf32) - Opc = HasAVX512 ? X86::VMOVSSZrm_alt : - HasAVX ? X86::VMOVSSrm_alt : - X86::MOVSSrm_alt; - else - Opc = X86::LD_Fp32m; + Opc = HasAVX512 ? X86::VMOVSSZrm_alt + : HasAVX ? X86::VMOVSSrm_alt + : HasSSE1 ? X86::MOVSSrm_alt + : X86::LD_Fp32m; break; case MVT::f64: - if (X86ScalarSSEf64) - Opc = HasAVX512 ? X86::VMOVSDZrm_alt : - HasAVX ? X86::VMOVSDrm_alt : - X86::MOVSDrm_alt; - else - Opc = X86::LD_Fp64m; + Opc = HasAVX512 ? X86::VMOVSDZrm_alt + : HasAVX ? X86::VMOVSDrm_alt + : HasSSE2 ? X86::MOVSDrm_alt + : X86::LD_Fp64m; break; case MVT::f80: // No f80 support yet. @@ -521,7 +512,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM, Opc = (IsNonTemporal && HasSSE2) ? 
X86::MOVNTI_64mr : X86::MOV64mr; break; case MVT::f32: - if (X86ScalarSSEf32) { + if (HasSSE1) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSS; else @@ -531,7 +522,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM, Opc = X86::ST_Fp32m; break; case MVT::f64: - if (X86ScalarSSEf32) { + if (HasSSE2) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSD; else @@ -1362,8 +1353,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { bool HasAVX512 = Subtarget->hasAVX512(); bool HasAVX = Subtarget->hasAVX(); - bool X86ScalarSSEf32 = Subtarget->hasSSE1(); - bool X86ScalarSSEf64 = Subtarget->hasSSE2(); + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); switch (VT.getSimpleVT().SimpleTy) { default: return 0; @@ -1372,15 +1363,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; case MVT::f32: - return X86ScalarSSEf32 - ? (HasAVX512 ? X86::VUCOMISSZrr - : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) - : 0; + return HasAVX512 ? X86::VUCOMISSZrr + : HasAVX ? X86::VUCOMISSrr + : HasSSE1 ? X86::UCOMISSrr + : 0; case MVT::f64: - return X86ScalarSSEf64 - ? (HasAVX512 ? X86::VUCOMISDZrr - : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) - : 0; + return HasAVX512 ? X86::VUCOMISDZrr + : HasAVX ? X86::VUCOMISDrr + : HasSSE2 ? X86::UCOMISDrr + : 0; } } @@ -2036,7 +2027,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { /// the select. bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // Check if the subtarget supports these instructions. - if (!Subtarget->hasCMov()) + if (!Subtarget->canUseCMOV()) return false; // FIXME: Add support for i8. @@ -2289,12 +2280,13 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { default: return false; case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; - case MVT::f16: Opc = X86::CMOV_FR16X; break; case MVT::i32: Opc = X86::CMOV_GR32; break; - case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X - : X86::CMOV_FR32; break; - case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X - : X86::CMOV_FR64; break; + case MVT::f16: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break; + case MVT::f32: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; + case MVT::f64: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); @@ -2495,7 +2487,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, } bool X86FastISel::X86SelectFPExt(const Instruction *I) { - if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() && I->getOperand(0)->getType()->isFloatTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fpext from float to double. @@ -2509,7 +2501,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { - if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + if (Subtarget->hasSSE2() && I->getType()->isFloatTy() && I->getOperand(0)->getType()->isDoubleTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fptrunc from double to float. @@ -3733,25 +3725,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Get opcode and regclass of the output for the given load instruction. 
 unsigned Opc = 0;
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX512 = Subtarget->hasAVX512();
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
-            HasAVX    ? X86::VMOVSSrm_alt :
-                        X86::MOVSSrm_alt;
-    else
-      Opc = X86::LD_Fp32m;
+    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+          : HasAVX  ? X86::VMOVSSrm_alt
+          : HasSSE1 ? X86::MOVSSrm_alt
+                    : X86::LD_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
-            HasAVX    ? X86::VMOVSDrm_alt :
-                        X86::MOVSDrm_alt;
-    else
-      Opc = X86::LD_Fp64m;
+    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+          : HasAVX  ? X86::VMOVSDrm_alt
+          : HasSSE2 ? X86::MOVSDrm_alt
+                    : X86::LD_Fp64m;
     break;
   case MVT::f80:
     // No f80 support yet.
@@ -3852,11 +3842,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
   default: break;
   case MVT::f32:
-    if (!X86ScalarSSEf32)
+    if (!Subtarget->hasSSE1())
       Opc = X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (!X86ScalarSSEf64)
+    if (!Subtarget->hasSSE2())
       Opc = X86::LD_Fp064;
     break;
   case MVT::f80:
@@ -3907,21 +3897,24 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
     return 0;
 
   // Get opcode and regclass for the given zero.
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX512 = Subtarget->hasAVX512();
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: return 0;
+  case MVT::f16:
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
+    break;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
-    else
-      Opc = X86::LD_Fp032;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
+          : HasSSE1 ? X86::FsFLD0SS
+                    : X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
-    else
-      Opc = X86::LD_Fp064;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
+          : HasSSE2 ? X86::FsFLD0SD
+                    : X86::LD_Fp064;
     break;
   case MVT::f80:
     // No f80 support yet.
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
new file mode 100644
index 000000000000..7e5540022cc8
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -0,0 +1,709 @@
+//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to preconfig the shape of physical tile registers.
+/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm
+/// walks the instructions of each basic block in reverse order. All tile
+/// registers that live out of the basic block are spilled and reloaded
+/// before their users. It also checks the dependency of the shape to ensure
+/// the shape is defined before ldtilecfg.
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "fastpretileconfig" + +STATISTIC(NumStores, "Number of stores added"); +STATISTIC(NumLoads, "Number of loads added"); + +namespace { + +class X86FastPreTileConfig : public MachineFunctionPass { + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + X86MachineFunctionInfo *X86FI = nullptr; + MachineFrameInfo *MFI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineBasicBlock *MBB = nullptr; + int CfgSS = -1; + struct PHIInfo { + Register Row; + Register Col; + Register StackAddr; + }; + DenseMap VisitedPHIs; + + /// Maps virtual regs to the frame index where these values are spilled. + IndexedMap StackSlotForVirtReg; + + /// Has a bit set for tile virtual register for which it was determined + /// that it is alive across blocks. + BitVector MayLiveAcrossBlocks; + + int getStackSpaceFor(Register VirtReg); + void InitializeTileConfigStackSpace(); + bool mayLiveOut(Register VirtReg, MachineInstr *CfgMI); + void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill); + void reload(MachineBasicBlock::iterator UseMI, Register VirtReg, + MachineOperand *RowMO, MachineOperand *ColMO); + void canonicalizePHIs(MachineBasicBlock &MBB); + void convertPHI(MachineBasicBlock *MBB, MachineInstr &PHI); + void convertPHIs(MachineBasicBlock &MBB); + bool configBasicBlock(MachineBasicBlock &MBB); + +public: + X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + + /// Return the pass name. + StringRef getPassName() const override { + return "Fast Tile Register Preconfigure"; + } + + /// Perform tile register configure. + bool runOnMachineFunction(MachineFunction &MFunc) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86FastPreTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) +INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) + +static bool dominates(MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator A, + MachineBasicBlock::const_iterator B) { + auto MBBEnd = MBB.end(); + if (B == MBBEnd) + return true; + + MachineBasicBlock::const_iterator I = MBB.begin(); + for (; &*I != A && &*I != B; ++I) + ; + + return &*I == A; +} + +/// This allocates space for the specified virtual register to be held on the +/// stack. +int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) { + // Find the location Reg would belong... + int SS = StackSlotForVirtReg[VirtReg]; + // Already has space allocated? + if (SS != -1) + return SS; + + // Allocate a new stack object for this spill location... 
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  unsigned Size = TRI->getSpillSize(RC);
+  Align Alignment = TRI->getSpillAlign(RC);
+  int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);
+
+  // Assign the slot.
+  StackSlotForVirtReg[VirtReg] = FrameIdx;
+  return FrameIdx;
+}
+
+/// Returns false if \p VirtReg is known to not live out of the current config.
+/// If \p VirtReg lives out of the current MBB, it must live out of the current
+/// config.
+bool X86FastPreTileConfig::mayLiveOut(Register VirtReg, MachineInstr *CfgMI) {
+  if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg)))
+    return true;
+
+  for (const MachineInstr &UseInst : MRI->use_nodbg_instructions(VirtReg)) {
+    if (UseInst.getParent() != MBB) {
+      MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+      return true;
+    }
+
+    // The use and def are in the same MBB. If the tile register is
+    // reconfigured, it is clobbered and we need to spill and reload the
+    // tile register.
+    if (CfgMI) {
+      if (dominates(*MBB, *CfgMI, UseInst)) {
+        MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void X86FastPreTileConfig::InitializeTileConfigStackSpace() {
+  MachineBasicBlock &MBB = MF->front();
+  MachineInstr *MI = &*MBB.getFirstNonPHI();
+  DebugLoc DL;
+  if (ST->hasAVX512()) {
+    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+    BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS)
+        .addReg(Zmm);
+  } else if (ST->hasAVX2()) {
+    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
+    BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS)
+        .addReg(Ymm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS,
+                      32)
+        .addReg(Ymm);
+  } else {
+    assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
+    unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
+    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
+    BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS)
+        .addReg(Xmm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16)
+        .addReg(Xmm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32)
+        .addReg(Xmm);
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48)
+        .addReg(Xmm);
+  }
+  // Fill in the palette first.
+  addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS)
+      .addImm(1);
+}
+
+/// Insert spill instruction for \p VirtReg before \p Before.
+/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot.
+void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before,
+                                 Register VirtReg, bool Kill) {
+  LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n");
+  int FI = getStackSpaceFor(VirtReg);
+  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
+
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  // Don't need shape information for tile store, because it is adjacent to
+  // the tile def instruction.
+  TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI);
+  ++NumStores;
+
+  // TODO: update DBG_VALUEs
+}
+
+/// Insert reload instruction for \p OrigReg before \p UseMI.
+void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, + Register OrigReg, MachineOperand *RowMO, + MachineOperand *ColMO) { + int FI = getStackSpaceFor(OrigReg); + const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg); + Register TileReg; + // Fold copy to tileload + // BB1: + // spill src to s + // + // BB2: + // t = copy src + // --> + // t = tileload (s) + if (UseMI->isCopy()) + TileReg = UseMI->getOperand(0).getReg(); + else + TileReg = MRI->createVirtualRegister(&RC); + // Can't use TII->loadRegFromStackSlot(), because we need the shape + // information for reload. + // tileloadd (%sp, %idx), %tmm + unsigned Opc = X86::PTILELOADDV; + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + // FIXME: MBB is not the parent of UseMI. + MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), + TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + NewMI = addFrameReference( + BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg) + .addReg(RowMO->getReg()) + .addReg(ColMO->getReg()), + FI); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + RowMO->setIsKill(false); + ColMO->setIsKill(false); + // Erase copy instruction after it is folded. + if (UseMI->isCopy()) { + UseMI->eraseFromParent(); + } else { + // Replace the register in the user MI. + for (auto &MO : UseMI->operands()) { + if (MO.isReg() && MO.getReg() == OrigReg) + MO.setReg(TileReg); + } + } + + ++NumLoads; + LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into " + << printReg(TileReg, TRI) << '\n'); +} + +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // The instruction must have 3 operands: tile def, row, col. + if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) + return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. + if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } + + return false; +} + +static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { + MachineInstr *MI = MRI->getVRegDef(TileReg); + if (isTileDef(MRI, *MI)) { + MachineOperand *RowMO = &MI->getOperand(1); + MachineOperand *ColMO = &MI->getOperand(2); + return ShapeT(RowMO, ColMO, MRI); + } else if (MI->isCopy()) { + TileReg = MI->getOperand(1).getReg(); + return getShape(MRI, TileReg); + } + + // The def should not be PHI node, because we walk the MBB in reverse post + // order. + assert(MI->isPHI() && "Unexpected PHI when get shape."); + llvm_unreachable("Unexpected MI when get shape."); +} + +// BB0: +// spill t0 to s0 +// BB1: +// spill t1 to s1 +// +// BB2: +// t = phi [t0, bb0] [t1, bb1] +// --> +// row = phi [r0, bb0] [r1, bb1] +// col = phi [c0, bb0] [c1, bb1] +// s = phi [s0, bb0] [s1, bb1] +// t = tileload row, col, s +// The new instruction is inserted at the end of the phi node. The order +// of the original phi node is not ensured. +void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB, + MachineInstr &PHI) { + // 1. Create instruction to get stack slot address of each incoming block. + // 2. Create PHI node for the stack address. + // 3. Create PHI node for shape. If one of the incoming shape is immediate + // use the immediate and delete the PHI node. + // 4. Create tileload instruction from the stack address. 
+  Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+  MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+                                        TII->get(X86::PHI), StackAddrReg);
+  Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass);
+  MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+                                       TII->get(X86::PHI), RowReg);
+  Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass);
+  MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+                                       TII->get(X86::PHI), ColReg);
+  // Record the mapping of phi node and its row/column information.
+  VisitedPHIs[&PHI] = {RowReg, ColReg, StackAddrReg};
+
+  for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) {
+    // Get the two incoming values: the tile register and its MBB.
+    Register InTileReg = PHI.getOperand(I).getReg();
+    // Mark it as live out, so that it will be spilled when we visit
+    // the incoming MBB. Otherwise, since the phi will be deleted, the
+    // spill would be missed when visiting the incoming MBB.
+    MayLiveAcrossBlocks.set(Register::virtReg2Index(InTileReg));
+    MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB();
+
+    MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg);
+    MachineBasicBlock::iterator InsertPos;
+    if (TileDefMI->isPHI()) {
+      InsertPos = TileDefMI->getParent()->getFirstNonPHI();
+      if (VisitedPHIs.count(TileDefMI)) { // circular phi reference
+        //        def t1
+        //       /       \
+        //  def t2       t3 = phi(t1, t4) <--
+        //       \       /                  |
+        //      t4 = phi(t2, t3)-------------
+        //
+        // For each (row, column and stack address) append phi incoming value.
+        // Create r3 = phi(r1, r4)
+        // Create r4 = phi(r2, r3)
+        Register InRowReg = VisitedPHIs[TileDefMI].Row;
+        Register InColReg = VisitedPHIs[TileDefMI].Col;
+        Register InStackAddrReg = VisitedPHIs[TileDefMI].StackAddr;
+        RowPHI.addReg(InRowReg).addMBB(InMBB);
+        ColPHI.addReg(InColReg).addMBB(InMBB);
+        AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+        continue;
+      } else {
+        // Recursively convert the PHI to a tileload.
+        convertPHI(TileDefMI->getParent(), *TileDefMI);
+        // The PHI node is converted to a tileload instruction. Get the stack
+        // address from the tileload operands.
+        MachineInstr *TileLoad = MRI->getVRegDef(InTileReg);
+        assert(TileLoad && TileLoad->getOpcode() == X86::PTILELOADDV);
+        Register InRowReg = TileLoad->getOperand(1).getReg();
+        Register InColReg = TileLoad->getOperand(2).getReg();
+        Register InStackAddrReg = TileLoad->getOperand(3).getReg();
+        RowPHI.addReg(InRowReg).addMBB(InMBB);
+        ColPHI.addReg(InColReg).addMBB(InMBB);
+        AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+      }
+    } else {
+      InsertPos = TileDefMI->getIterator();
+
+      // Fill the incoming operand of the row/column phi instruction.
+      ShapeT Shape = getShape(MRI, InTileReg);
+      Shape.getRow()->setIsKill(false);
+      Shape.getCol()->setIsKill(false);
+      RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB);
+      ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB);
+
+      // The incoming tile register lives out of its def BB, so it will be
+      // spilled.
+      // Create an MI to get the spill stack slot address for the tile
+      // register.
+      int FI = getStackSpaceFor(InTileReg);
+      Register InStackAddrReg =
+          MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+      addOffset(BuildMI(*TileDefMI->getParent(), InsertPos, DebugLoc(),
+                        TII->get(X86::LEA64r), InStackAddrReg)
+                    .addFrameIndex(FI),
+                0);
+      AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+    }
+  }
+
+  MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
+  Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+  BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg)
+      .addImm(64);
+  Register TileReg = PHI.getOperand(0).getReg();
+  MachineInstr *NewMI = addDirectMem(
+      BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg)
+          .addReg(RowReg)
+          .addReg(ColReg),
+      StackAddrReg);
+  MachineOperand &MO = NewMI->getOperand(5);
+  MO.setReg(StrideReg);
+  MO.setIsKill(true);
+  PHI.eraseFromParent();
+  VisitedPHIs.erase(&PHI);
+}
+
+static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+  MachineOperand &MO = MI.getOperand(0);
+  if (MO.isReg() && MO.getReg().isVirtual() &&
+      MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
+    return true;
+  return false;
+}
+
+void X86FastPreTileConfig::canonicalizePHIs(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 8> PHIs;
+
+  for (MachineInstr &MI : MBB) {
+    if (!MI.isPHI())
+      break;
+    if (!isTileRegDef(MRI, MI))
+      continue;
+    PHIs.push_back(&MI);
+  }
+  // Canonicalize the phi nodes first. One tile phi may depend on a previous
+  // phi node. For the case below, we need to convert %t4.
+  //
+  // BB0:
+  // %t3 = phi (t1 BB1, t2 BB0)
+  // %t4 = phi (t5 BB1, t3 BB0)
+  // -->
+  // %t3 = phi (t1 BB1, t2 BB0)
+  // %t4 = phi (t5 BB1, t2 BB0)
+  //
+  while (!PHIs.empty()) {
+    MachineInstr *PHI = PHIs.pop_back_val();
+
+    // Find the operand that is incoming from the same MBB and whose def
+    // is also a phi node.
+    MachineOperand *InMO = nullptr;
+    MachineInstr *DefMI = nullptr;
+    for (unsigned I = 1, E = PHI->getNumOperands(); I != E; I += 2) {
+      Register InTileReg = PHI->getOperand(I).getReg();
+      MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
+      DefMI = MRI->getVRegDef(InTileReg);
+      if (InMBB != &MBB || !DefMI->isPHI())
+        continue;
+
+      InMO = &PHI->getOperand(I);
+      break;
+    }
+    // If no such operand can be found, do nothing.
+    if (!InMO)
+      continue;
+
+    // The current phi node depends on a previous phi node. Break the
+    // dependency.
+    Register DefTileReg;
+    for (unsigned I = 1, E = DefMI->getNumOperands(); I != E; I += 2) {
+      MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
+      if (InMBB != &MBB)
+        continue;
+      DefTileReg = DefMI->getOperand(I).getReg();
+      InMO->setReg(DefTileReg);
+      break;
+    }
+  }
+}
+
+void X86FastPreTileConfig::convertPHIs(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 8> PHIs;
+  for (MachineInstr &MI : MBB) {
+    if (!MI.isPHI())
+      break;
+    if (!isTileRegDef(MRI, MI))
+      continue;
+    PHIs.push_back(&MI);
+  }
+  while (!PHIs.empty()) {
+    MachineInstr *MI = PHIs.pop_back_val();
+    VisitedPHIs.clear();
+    convertPHI(&MBB, *MI);
+  }
+}
+
+// PreTileConfig should configure the tile registers based on each basic
+// block.
+bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
+  this->MBB = &MBB;
+  bool Change = false;
+  MachineInstr *LastShapeMI = nullptr;
+  MachineInstr *LastTileCfg = nullptr;
+  bool HasUnconfigTile = false;
+
+  auto Config = [&](MachineInstr &Before) {
+    if (CfgSS == -1)
+      CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(),
+                                     ST->getTileConfigAlignment(), false);
+    LastTileCfg = addFrameReference(
+        BuildMI(MBB, Before, DebugLoc(), TII->get(X86::PLDTILECFGV)), CfgSS);
+    LastShapeMI = nullptr;
+    Change = true;
+  };
+  auto HasTileOperand = [](MachineRegisterInfo *MRI, MachineInstr &MI) {
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      Register Reg = MO.getReg();
+      if (Reg.isVirtual() &&
+          MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+        return true;
+    }
+    return false;
+  };
+  for (MachineInstr &MI : reverse(MBB)) {
+    // We have transformed the phi nodes before configuring the BB.
+    if (MI.isPHI())
+      break;
+    // Don't collect the shape of a used tile; the tile should be defined
+    // before the tile use. Spill and reload would happen if there is only a
+    // tile use after ldtilecfg, so the shape can be collected from the reload.
+    // Take the code below for example: %t would be reloaded before the
+    // tilestore call.
+    // ....
+    // tilestore %r, %c, %t
+    // -->
+    // call
+    // ldtilecfg
+    // %t = tileload %r, %c
+    // tilestore %r, %c, %t
+    if (HasTileOperand(MRI, MI))
+      HasUnconfigTile = true;
+    // According to the AMX ABI, all tile registers including the config
+    // register are volatile. Callers need to save/restore the config register.
+    if (MI.isCall() && HasUnconfigTile) {
+      MachineBasicBlock::iterator I;
+      if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
+        I = ++LastShapeMI->getIterator();
+      else
+        I = ++MI.getIterator();
+      Config(*I);
+      HasUnconfigTile = false;
+      continue;
+    }
+    if (!isTileDef(MRI, MI))
+      continue;
+    //
+    //---------------------------------------------------------------------
+    // Don't handle COPY instructions. If the src and dst of the COPY can be
+    // in the same config, as in the case below, we just check the shape of t0.
+    // def row0
+    // def col0
+    // ldtilecfg
+    // t0 = tilezero(row0, col0)
+    // t1 = copy t0
+    // ...
+    // If the src and dst of the COPY can NOT be in the same config, as in the
+    // case below, a reload is generated before the copy instruction.
+    // def row0
+    // def col0
+    // t0 = tilezero(row0, col0)
+    // spill t0
+    // ...
+    // def row1
+    // def col1
+    // ldtilecfg
+    // t1 = tilezero(row1, col1)
+    // reload t0
+    // t1 = copy t0
+    //---------------------------------------------------------------------
+    //
+    // If MI dominates the last shape def instruction, we need to insert
+    // ldtilecfg after LastShapeMI now. The config doesn't include
+    // the current MI.
+    // def row0
+    // def col0
+    // tilezero(row0, col0)  <- MI
+    // def row1
+    // def col1
+    // ldtilecfg             <- insert
+    // tilezero(row1, col1)
+    if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
+      Config(*(++LastShapeMI->getIterator()));
+    MachineOperand *RowMO = &MI.getOperand(1);
+    MachineOperand *ColMO = &MI.getOperand(2);
+    MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg());
+    MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg());
+    // If the shape is defined in the current MBB, check the domination.
+    // FIXME: how about loops?
+    if (RowMI->getParent() == &MBB) {
+      if (!LastShapeMI)
+        LastShapeMI = RowMI;
+      else if (dominates(MBB, LastShapeMI, RowMI))
+        LastShapeMI = RowMI;
+    }
+    if (ColMI->getParent() == &MBB) {
+      if (!LastShapeMI)
+        LastShapeMI = ColMI;
+      else if (dominates(MBB, LastShapeMI, ColMI))
+        LastShapeMI = ColMI;
+    }
+    // If a user of the tile lives out of the tilecfg, spill the tile and
+    // reload it before the user.
+    Register TileReg = MI.getOperand(0).getReg();
+    if (mayLiveOut(TileReg, LastTileCfg))
+      spill(++MI.getIterator(), TileReg, false);
+    for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) {
+      if (UseMI.getParent() == &MBB) {
+        // Check that the user does not cross the ldtilecfg.
+        if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI))
+          continue;
+        // Reload before UseMI.
+        reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
+      } else {
+        // Don't reload for a phi instruction; we handle phi reloads
+        // separately.
+        // TODO: merge the reloads for the same user MBB.
+        if (!UseMI.isPHI())
+          reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
+      }
+    }
+  }
+
+  // Configure tile registers at the head of the MBB.
+  if (HasUnconfigTile) {
+    MachineInstr *Before;
+    if (LastShapeMI == nullptr || LastShapeMI->isPHI())
+      Before = &*MBB.getFirstNonPHI();
+    else
+      Before = &*(++LastShapeMI->getIterator());
+
+    Config(*Before);
+  }
+
+  return Change;
+}
+
+bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+  MF = &MFunc;
+  MRI = &MFunc.getRegInfo();
+  ST = &MFunc.getSubtarget<X86Subtarget>();
+  TII = ST->getInstrInfo();
+  X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
+  MFI = &MFunc.getFrameInfo();
+  TRI = ST->getRegisterInfo();
+  CfgSS = -1;
+
+  unsigned NumVirtRegs = MRI->getNumVirtRegs();
+  // Return early if there is no tile register to configure.
+  bool HasVirtTileReg = false;
+  for (unsigned I = 0, E = NumVirtRegs; I != E; ++I) {
+    Register VirtReg = Register::index2VirtReg(I);
+    if (MRI->getRegClass(VirtReg)->getID() == X86::TILERegClassID) {
+      HasVirtTileReg = true;
+      break;
+    }
+  }
+  if (!HasVirtTileReg)
+    return false;
+
+  StackSlotForVirtReg.resize(NumVirtRegs);
+  MayLiveAcrossBlocks.clear();
+  // We will create registers during config. The *3 is to make sure
+  // the virtual register number doesn't exceed the size of
+  // the bit vector.
+  MayLiveAcrossBlocks.resize(NumVirtRegs * 3);
+  bool Change = false;
+  assert(MRI->isSSA());
+
+  // Canonicalize the phi nodes first.
+  for (MachineBasicBlock &MBB : MFunc)
+    canonicalizePHIs(MBB);
+
+  // Loop over all of the basic blocks in reverse post order and insert
+  // ldtilecfg for tile registers. The reverse post order facilitates the
+  // PHI node conversion.
+ ReversePostOrderTraversal RPOT(MF); + for (MachineBasicBlock *MBB : RPOT) { + convertPHIs(*MBB); + Change |= configBasicBlock(*MBB); + } + + if (Change) + InitializeTileConfigStackSpace(); + + StackSlotForVirtReg.clear(); + return Change; +} + +FunctionPass *llvm::createX86FastPreTileConfigPass() { + return new X86FastPreTileConfig(); +} diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 061fff50bcea..2a20cd13791d 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -40,40 +40,25 @@ namespace { class X86FastTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; X86MachineFunctionInfo *X86FI = nullptr; - MachineInstr *getTileConfigPoint(); - void tileConfig(); + bool configBasicBlock(MachineBasicBlock &MBB); public: X86FastTileConfig() : MachineFunctionPass(ID) {} - bool fastTileConfig(); - bool isTileLoad(MachineInstr &MI); - bool isTileStore(MachineInstr &MI); - bool isAMXInstr(MachineInstr &MI); - - MachineInstr *getKeyAMXInstr(MachineInstr *MI); - void getTileShapesCfg(MachineInstr *MI, - SmallVector &ShapedTiles); - void getShapeCfgInstrs(MachineInstr *MI, - std::map &RowCfgs, - std::map &ColCfgs); - /// Return the pass name. StringRef getPassName() const override { return "Fast Tile Register Configure"; } - void materializeTileCfg(MachineInstr *MI); - - void rewriteTileCfg(SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } /// Perform register allocation. bool runOnMachineFunction(MachineFunction &MFunc) override; @@ -95,209 +80,107 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTilePhysReg(MachineOperand &Op) { - if (!Op.isReg()) +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // There is no phi instruction after register allocation. + assert(MI.isPHI() == false); + // The instruction must have 3 operands: tile def, row, col. + // It should be AMX pseudo instruction that have shape operand. + if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || + !MI.isPseudo()) return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. 
+ if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } - Register Reg = Op.getReg(); - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; return false; } -static unsigned getTilePhysRegIdx(MachineOperand *Op) { - assert(isTilePhysReg(*Op) && "Tile Operand is invalid"); - return Op->getReg() - X86::TMM0; -} - -static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 48 + TIdx; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 16 + TIdx * 2; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV || - MI.getOpcode() == X86::PTILELOADDT1V; -} -bool X86FastTileConfig::isTileStore(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILESTOREDV; -} -bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { - // TODO: May need to handle some special non-tile AMX instruction. - if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) - return false; - - return llvm::any_of(MI.operands(), isTilePhysReg); -} - -MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *KeyMI = nullptr; - int KeyAMXNum = 0; - - for (auto II = Cfg; II != MBB->end(); II++) { - if (isTileLoad(*II)) { - KeyMI = &*II; +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + bool Change = false; + SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; + for (MachineInstr &MI : reverse(MBB)) { + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; + // AMX instructions that define a tile register. + if (MI.getOpcode() != X86::PLDTILECFGV) { + MachineOperand &Row = MI.getOperand(1); + MachineOperand &Col = MI.getOperand(2); + unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + } else { // PLDTILECFGV + // Rewrite the shape information to memory. Stack slot should have + // been initialized to zero in the pre-config pass. + int SS = MI.getOperand(0).getIndex(); // tile config stack slot. + for (auto &ShapeInfo : ShapeInfos) { + DebugLoc DL; + unsigned TMMIdx = ShapeInfo.first; + Register RowReg = ShapeInfo.second.getRow()->getReg(); + Register ColReg = ShapeInfo.second.getCol()->getReg(); + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero + int RowOffset = 48 + TMMIdx; + int ColOffset = 16 + TMMIdx * 2; + + Register SubRowReg = TRI->getSubReg(RowReg, X86::sub_8bit); + BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), SubRowReg); + MachineInstrBuilder StoreRow = + BuildMI(MBB, MI, DL, TII->get(X86::MOV8mr)); + addFrameReference(StoreRow, SS, RowOffset).addReg(SubRowReg); + + MachineInstrBuilder StoreCol = + BuildMI(MBB, MI, DL, TII->get(X86::MOV16mr)); + addFrameReference(StoreCol, SS, ColOffset).addReg(ColReg); + } + ShapeInfos.clear(); + Change = true; } - - if (isTileStore(*II)) { - assert(KeyMI && "Key AMX Should be found before!"); - break; - } - - if (isAMXInstr(*II)) { - assert((KeyAMXNum == 0) && "Too many Key AMX instruction!"); - KeyAMXNum++; - KeyMI = &*II; - } - } - assert(KeyMI && "There must be an AMX instruction."); - return KeyMI; -} - -// Orderly get the tiles in the key AMX instruction, uses before defs. -void X86FastTileConfig::getTileShapesCfg( - MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) { - MachineInstr *KeyMI = getKeyAMXInstr(CfgMI); - - SmallVector<MachineOperand *> DefTiles; - for (MachineOperand &MO : KeyMI->operands()) { - if (!isTilePhysReg(MO)) - continue; - if (MO.isDef()) - DefTiles.push_back(&MO); - else - ShapedTiles.push_back(&MO); - } - ShapedTiles.append(DefTiles); -} - -// We pre-config the shapes at positions named "amx.tmm.N.shape.row*" and -// "amx.shape.N.col*" in the pass "Pre AMX Tile Config". -// The 'N' implies the order of tiles in the key AMX intrinsic. -void X86FastTileConfig::getShapeCfgInstrs( - MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs, - std::map<unsigned, MachineInstr *> &ColCfgs) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - - for (auto II = Cfg; II != MBB->begin(); II--) { - if (isAMXInstr(*II) || II->isTerminator() || II->isCall()) - break; - if (!II->mayStore() || !II->hasOneMemOperand()) - continue; - const Value *MemPtr = II->memoperands()[0]->getValue(); - if (!MemPtr) - continue; - - StringRef Name = MemPtr->getName(); - if (!Name.startswith("amx.tmm.")) - continue; - - // Get the 'N'th tile shape config in the key AMX instruction. - auto N = Name.find(".shape"); - StringRef STileIdx = Name.slice(8, N); - unsigned Idx; - STileIdx.getAsInteger(10, Idx); - - // And relate them with their store instructions. - if (Name.contains("row")) - RowCfgs[Idx] = &*II; - else if (Name.contains("col")) - ColCfgs[Idx] = &*II; - else - llvm_unreachable("Invalid tile shape info!"); } - assert((RowCfgs.size() == ColCfgs.size()) && - "The number of tile row and col must be equal!"); -} - -// Here is the data format for the tile config. -// 0 palette = 1 now. -// 1 start_row = 0 now. -// 2-15 reserved, must be zero -// 16-17 tile0.colsb Tile 0 bytes per row. -// 18-19 tile1.colsb Tile 1 bytes per row. -// 20-21 tile2.colsb Tile 2 bytes per row. -// ... (sequence continues) -// 30-31 tile7.colsb Tile 7 bytes per row. -// 32-47 reserved, must be zero -// 48 tile0.rows Tile 0 rows. -// 49 tile1.rows Tile 1 rows. -// 50 tile2.rows Tile 2 rows. -// ... (sequence continues) -// 55 tile7.rows Tile 7 rows. -// 56-63 reserved, must be zero -void X86FastTileConfig::rewriteTileCfg( - SmallVector<MachineOperand *> &ShapedTiles, - std::map<unsigned, MachineInstr *> &RowCfgs, - std::map<unsigned, MachineInstr *> &ColCfgs) { - assert((RowCfgs.size() == ShapedTiles.size()) && - "The number of tile shapes not equal with the number of tiles!"); - // Orderly get the tiles and adjust the shape config.
- for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) { - MachineOperand *MO = ShapedTiles[I]; - unsigned TmmIdx = getTilePhysRegIdx(MO); - if (I == TmmIdx) - continue; - adjustRowCfg(TmmIdx, RowCfgs[I]); - adjustColCfg(TmmIdx, ColCfgs[I]); - } -} - -// We have already pre-configured the shapes before fast register allocation -// at X86PreAMXConfig::preWriteTileCfg(). Now that fast register allocation is -// done, the shapes written before may not correspond to the correct tmm -// registers anymore, so we need to adjust them. -void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) { - SmallVector<MachineOperand *> ShapedTiles; - std::map<unsigned, MachineInstr *> RowCfgs; - std::map<unsigned, MachineInstr *> ColCfgs; - - // Orderly keep the tile uses and defs in ShapedTiles. - getTileShapesCfg(CfgMI, ShapedTiles); - assert(ShapedTiles.size() && "Not find shapes config!"); - - getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs); - - rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs); -} - -bool X86FastTileConfig::fastTileConfig() { - bool Changed = false; - - for (MachineBasicBlock &MBB : *MF) { - SmallVector<MachineInstr *> CFGs; - for (MachineInstr &MI : MBB) - if (MI.getOpcode() == X86::PLDTILECFGV) - CFGs.push_back(&MI); - for (auto *MI : CFGs) - materializeTileCfg(MI); - if (!CFGs.empty()) - Changed = true; - } - if (Changed) + if (Change) X86FI->setHasVirtualTileReg(true); - return Changed; + + return Change; } bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { MF = &MFunc; MRI = &MFunc.getRegInfo(); - ST = &MFunc.getSubtarget<X86Subtarget>(); + const TargetSubtargetInfo *ST = &MFunc.getSubtarget(); TRI = ST->getRegisterInfo(); TII = MFunc.getSubtarget().getInstrInfo(); X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); + bool Change = false; + + // Loop over all of the basic blocks, configuring the tile registers in each. + for (MachineBasicBlock &MBB : MFunc) + Change |= configBasicBlock(MBB); - return fastTileConfig(); + return Change; } FunctionPass *llvm::createX86FastTileConfigPass() { diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp index 4730b936ec1f..b01145809ac6 100644 --- a/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -229,7 +229,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); bool IsSlowLEA = ST.slowLEA(); bool IsSlow3OpsLEA = ST.slow3OpsLEA(); - bool LEAUsesAG = ST.LEAusesAG(); + bool LEAUsesAG = ST.leaUsesAG(); bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize(); bool UseLEAForSP = ST.useLeaForSP(); @@ -546,7 +546,6 @@ bool FixupLEAPass::optLEAALU(MachineBasicBlock::iterator &I, if (KilledIndex) KilledIndex->setIsKill(false); - MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI1, 1); MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI2, 1); MBB.erase(I); MBB.erase(AluI); diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 2f0ab4ca9de4..33f5bb365da8 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -99,17 +99,17 @@ namespace { // but the exact mapping of FP registers to stack slots is fixed later. struct LiveBundle { // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. - unsigned Mask; + unsigned Mask = 0; // Number of pre-assigned live registers in FixStack. This is 0 when the // stack order has not yet been fixed. - unsigned FixCount; + unsigned FixCount = 0; // Assigned stack order for live-in registers. // FixStack[i] == getStackEntry(i) for all i < FixCount.
unsigned char FixStack[8]; - LiveBundle() : Mask(0), FixCount(0) {} + LiveBundle() = default; // Have the live registers been assigned a stack order yet? bool isFixed() const { return !Mask || FixCount; } @@ -866,7 +866,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr) - I->RemoveOperand(0); + I->removeOperand(0); MI.dropDebugNumber(); } else { // Insert an explicit pop // If this instruction sets FPSW, which is read in following instruction, @@ -1034,7 +1034,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { STReturns |= 1 << getFPReg(Op); // Remove the operand so that later passes don't see it. - MI.RemoveOperand(i); + MI.removeOperand(i); --i; --e; } @@ -1098,7 +1098,7 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) { LiveMask |= (1 << getFPReg(Op)); // Remove the operand so that later passes don't see it. - MI.RemoveOperand(i); + MI.removeOperand(i); --i; --e; } @@ -1162,7 +1162,7 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { unsigned DestReg = getFPReg(MI.getOperand(0)); // Change from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(0); // Remove the explicit ST(0) operand + MI.removeOperand(0); // Remove the explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.addOperand( MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true)); @@ -1210,7 +1210,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { } // Convert from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand + MI.removeOperand(NumOps - 1); // Remove explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.addOperand( MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true)); @@ -1263,8 +1263,8 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { } // Change from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(1); // Drop the source operand. - MI.RemoveOperand(0); // Drop the destination operand. + MI.removeOperand(1); // Drop the source operand. + MI.removeOperand(0); // Drop the destination operand. MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); } @@ -1464,7 +1464,7 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { // Change from the pseudo instruction to the concrete instruction. MI.getOperand(0).setReg(getSTReg(Op1)); - MI.RemoveOperand(1); + MI.removeOperand(1); MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); @@ -1489,8 +1489,8 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { // Change the second operand to the stack register that the operand is in. // Change from the pseudo instruction to the concrete instruction. 
- MI.RemoveOperand(0); - MI.RemoveOperand(1); + MI.removeOperand(0); + MI.removeOperand(1); MI.getOperand(0).setReg(getSTReg(Op1)); MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 51f2ced321bb..d524090f902e 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "X86FrameLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" @@ -19,6 +20,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -99,7 +101,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || MFI.hasStackMap() || MFI.hasPatchPoint() || - MFI.hasCopyImplyingStackAdjustment()); + (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment())); } static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { @@ -435,11 +437,13 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - const MCCFIInstruction &CFIInst) const { + const MCCFIInstruction &CFIInst, + MachineInstr::MIFlag Flag) const { MachineFunction &MF = *MBB.getParent(); unsigned CFIIndex = MF.addFrameInst(CFIInst); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) + .setMIFlag(Flag); } /// Emits Dwarf Info specifying offsets of callee saved registers and @@ -492,6 +496,87 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } } +void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const { + const MachineFunction &MF = *MBB.getParent(); + + // Insertion point. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + + // Fake a debug loc. + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + // Zero out FP stack if referenced. Do this outside of the loop below so that + // it's done only once. + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + for (MCRegister Reg : RegsToZero.set_bits()) { + if (!X86::RFP80RegClass.contains(Reg)) + continue; + + unsigned NumFPRegs = ST.is64Bit() ? 8 : 7; + for (unsigned i = 0; i != NumFPRegs; ++i) + BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0)); + + for (unsigned i = 0; i != NumFPRegs; ++i) + BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0); + break; + } + + // For GPRs, we only care to clear out the 32-bit register. + BitVector GPRsToZero(TRI->getNumRegs()); + for (MCRegister Reg : RegsToZero.set_bits()) + if (TRI->isGeneralPurposeRegister(MF, Reg)) { + GPRsToZero.set(getX86SubSuperRegisterOrZero(Reg, 32)); + RegsToZero.reset(Reg); + } + + for (MCRegister Reg : GPRsToZero.set_bits()) + BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Zero out registers. + for (MCRegister Reg : RegsToZero.set_bits()) { + if (ST.hasMMX() && X86::VR64RegClass.contains(Reg)) + // FIXME: Ignore MMX registers?
+ continue; + + unsigned XorOp; + if (X86::VR128RegClass.contains(Reg)) { + // XMM# + if (!ST.hasSSE1()) + continue; + XorOp = X86::PXORrr; + } else if (X86::VR256RegClass.contains(Reg)) { + // YMM# + if (!ST.hasAVX()) + continue; + XorOp = X86::VPXORrr; + } else if (X86::VR512RegClass.contains(Reg)) { + // ZMM# + if (!ST.hasAVX512()) + continue; + XorOp = X86::VPXORYrr; + } else if (X86::VK1RegClass.contains(Reg) || + X86::VK2RegClass.contains(Reg) || + X86::VK4RegClass.contains(Reg) || + X86::VK8RegClass.contains(Reg) || + X86::VK16RegClass.contains(Reg)) { + if (!ST.hasVLX()) + continue; + XorOp = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr; + } else { + continue; + } + + BuildMI(MBB, MBBI, DL, TII.get(XorOp), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + } +} + void X86FrameLowering::emitStackProbe( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, @@ -1289,6 +1374,9 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone); } +/// Return true if we need to use the restricted Windows x64 prologue and +/// epilogue code patterns that can be described with WinCFI (.seh_* +/// directives). bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const { return MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); } @@ -1558,12 +1646,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth), + MachineInstr::FrameSetup); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( - nullptr, DwarfFramePtr, 2 * stackGrowth)); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, + 2 * stackGrowth), + MachineInstr::FrameSetup); } if (NeedsWinCFI) { @@ -1630,7 +1721,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI( MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr), + MachineInstr::FrameSetup); } if (NeedsWinFPO) { @@ -1681,7 +1773,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset), + MachineInstr::FrameSetup); StackOffset += stackGrowth; } @@ -1962,7 +2055,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, assert(StackSize); BuildCFI( MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth), + MachineInstr::FrameSetup); } // Emit DWARF info specifying the offsets of the callee-saved registers. @@ -2145,11 +2239,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned DwarfStackPtr = TRI->getDwarfRegNum(Is64Bit ? 
X86::RSP : X86::ESP, true); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize)); + MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize), + MachineInstr::FrameDestroy); if (!MBB.succ_empty() && !MBB.isReturnBlock()) { unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, AfterPop, DL, - MCCFIInstruction::createRestore(nullptr, DwarfFramePtr)); + MCCFIInstruction::createRestore(nullptr, DwarfFramePtr), + MachineInstr::FrameDestroy); --MBBI; --AfterPop; } @@ -2226,7 +2322,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset( - nullptr, CSSize + TailCallArgReserveSize + SlotSize)); + nullptr, CSSize + TailCallArgReserveSize + SlotSize), + MachineInstr::FrameDestroy); } --MBBI; } @@ -2252,7 +2349,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Opc == X86::POP32r || Opc == X86::POP64r) { Offset += SlotSize; BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset), + MachineInstr::FrameDestroy); } } } @@ -2830,17 +2928,8 @@ void X86FrameLowering::adjustForSegmentedStacks( // prologue. StackSize = MFI.getStackSize(); - // Do not generate a prologue for leaf functions with a stack of size zero. - // For non-leaf functions we have to allow for the possibility that the - // call is to a non-split function, as in PR37807. This function could also - // take the address of a non-split function. When the linker tries to adjust - // its non-existent prologue, it would fail with an error. Mark the object - // file so that such failures are not errors. See this Go language bug-report - // https://go-review.googlesource.com/c/go/+/148819/ - if (StackSize == 0 && !MFI.hasTailCall()) { - MF.getMMI().setHasNosplitStack(true); + if (!MFI.needsSplitStackProlog()) return; - } MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); @@ -3023,7 +3112,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addReg(0) .addExternalSymbol("__morestack_addr") .addReg(0); - MF.getMMI().setUsesMorestackAddr(true); } else { if (Is64Bit) BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 987facbfeae4..9b83fe77d505 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -176,7 +176,8 @@ public: /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; + const DebugLoc &DL, const MCCFIInstruction &CFIInst, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; /// Sets up EBP and optionally ESI based on the incoming EBP value. Only /// needed for 32-bit. Used in funclet prologues and at catchret destinations. @@ -233,6 +234,10 @@ private: const DebugLoc &DL, uint64_t Offset, uint64_t Align) const; + /// Emit target zero call-used regs. + void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const override; + void adjustFrameForMsvcCxxEh(MachineFunction &MF) const; /// Aligns the stack pointer by ANDing it with -MaxAlign.
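Editorial aside, not part of the imported patch: the emitZeroCallUsedRegs hook added above clears general purpose registers with a self-XOR whose operands carry the Undef flag. A minimal sketch of that idiom, assuming the usual X86 backend headers; the helper name emitGPRZero is hypothetical:

  // Zero a 32-bit GPR via the architectural zeroing idiom "xor r32, r32".
  // Flagging both uses as Undef records that the old register value is not
  // actually read, so no false dependency on it is created.
  static void emitGPRZero(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                          const TargetInstrInfo &TII, MCRegister Reg) {
    BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  }

Clearing only the 32-bit subregister is enough for a full GPR because 32-bit writes zero the upper 32 bits on x86-64, which is why the pass maps each GPR to its 32-bit alias before emitting the XOR.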
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5b90c67deae6..f88037e95d33 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -59,30 +59,27 @@ namespace { enum { RegBase, FrameIndexBase - } BaseType; + } BaseType = RegBase; // This is really a union, discriminated by BaseType! SDValue Base_Reg; - int Base_FrameIndex; + int Base_FrameIndex = 0; - unsigned Scale; + unsigned Scale = 1; SDValue IndexReg; - int32_t Disp; + int32_t Disp = 0; SDValue Segment; - const GlobalValue *GV; - const Constant *CP; - const BlockAddress *BlockAddr; - const char *ES; - MCSymbol *MCSym; - int JT; + const GlobalValue *GV = nullptr; + const Constant *CP = nullptr; + const BlockAddress *BlockAddr = nullptr; + const char *ES = nullptr; + MCSymbol *MCSym = nullptr; + int JT = -1; Align Alignment; // CP alignment. - unsigned char SymbolFlags; // X86II::MO_* + unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* bool NegateIndex = false; - X86ISelAddressMode() - : BaseType(RegBase), Base_FrameIndex(0), Scale(1), Disp(0), GV(nullptr), - CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), - SymbolFlags(X86II::MO_NO_FLAG) {} + X86ISelAddressMode() = default; bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || @@ -446,6 +443,43 @@ namespace { return getI8Imm(InsertIdx ? 0x02 : 0x30, DL); } + SDValue getSBBZero(SDNode *N) { + SDLoc dl(N); + MVT VT = N->getSimpleValueType(0); + + // Create zero. + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); + SDValue Zero = + SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); + if (VT == MVT::i64) { + Zero = SDValue( + CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, + CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), + 0); + } + + // Copy flags to the EFLAGS register and glue it to next node. + unsigned Opcode = N->getOpcode(); + assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && + "Unexpected opcode for SBB materialization"); + unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + N->getOperand(FlagOpIndex), SDValue()); + + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; + MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + VTs = CurDAG->getVTList(SBBVT, MVT::i32); + return SDValue( + CurDAG->getMachineNode(Opc, dl, VTs, + {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}), + 0); + } + // Helper to detect unneeded and instructions on shift amounts. Called // from PatFrags in tablegen. bool isUnneededShiftMask(SDNode *N, unsigned Width) const { @@ -476,6 +510,9 @@ namespace { return Subtarget->getInstrInfo(); } + /// Return a condition code of the given SDNode + X86::CondCode getCondFromNode(SDNode *N) const; + /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. 
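Editorial aside, not part of the imported patch: the getSBBZero helper added above first materializes a zero with MOV32r0 and then emits SBB with both source operands being that zero, gluing the copied EFLAGS in as the borrow input. With equal operands, sbb computes 0 - 0 - CF, i.e. -CF. A scalar model of the produced value, for illustration only:

  #include <cstdint>

  // "sbb zero, zero" yields 0 - 0 - CF == -CF: all-ones when the carry
  // flag is set, zero otherwise.
  uint32_t sbbZeroResult(bool CarryFlag) {
    return 0u - static_cast<uint32_t>(CarryFlag); // 0xFFFFFFFF or 0x0
  }

Feeding an explicitly zeroed register into SBB, rather than using the SETB_C pseudo that reads the same register it writes, matters on targets where a same-register sbb is not recognized as dependency-breaking.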
@@ -492,7 +529,7 @@ unsigned StoreSize = N->getMemoryVT().getStoreSize(); - if (N->getAlignment() < StoreSize) + if (N->getAlign().value() < StoreSize) return false; switch (StoreSize) { @@ -2391,6 +2428,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return false; break; + case ISD::XOR: + // We want to look through a transform in InstCombine that + // turns 'add' with min_signed_val into 'xor', so we can treat this 'xor' + // exactly like an 'add'. + if (isMinSignedConstant(N.getOperand(1)) && !matchAdd(N, AM, Depth)) + return false; + break; + case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. @@ -2745,10 +2790,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: - /* TODO: These opcodes can be added safely, but we may want to justify - their inclusion for different reasons (better for reg-alloc). case X86ISD::SMUL: case X86ISD::UMUL: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: @@ -2759,10 +2804,9 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, return false; } }; - // TODO: This could be an 'or' rather than 'and' to make the transform more - // likely to happen. We might want to factor in whether there's a - // load folding opportunity for the math op that disappears with LEA. - if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) + // TODO: We might want to factor in whether there's a load folding + // opportunity for the math op that disappears with LEA. + if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1))) Complexity++; } @@ -2891,24 +2935,15 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { CR->getSignedMax().slt(1ull << Width); } -static X86::CondCode getCondFromNode(SDNode *N) { +X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { assert(N->isMachineOpcode() && "Unexpected node"); - X86::CondCode CC = X86::COND_INVALID; unsigned Opc = N->getMachineOpcode(); - if (Opc == X86::JCC_1) - CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); - else if (Opc == X86::SETCCr) - CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); - else if (Opc == X86::SETCCm) - CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); - else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || - Opc == X86::CMOV64rr) - CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); - else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || - Opc == X86::CMOV64rm) - CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); - - return CC; + const MCInstrDesc &MCID = getInstrInfo()->get(Opc); + int CondNo = X86::getCondSrcNoFromDesc(MCID); + if (CondNo < 0) + return X86::COND_INVALID; + + return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo)); } /// Test whether the given X86ISD::CMP node has any users that use a flag @@ -3464,7 +3499,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { const bool AllowExtraUsesByDefault = Subtarget->hasBMI2(); auto checkUses = [AllowExtraUsesByDefault](SDValue Op, unsigned NUses, Optional<bool> AllowExtraUses) { - return AllowExtraUses.getValueOr(AllowExtraUsesByDefault) || + return AllowExtraUses.value_or(AllowExtraUsesByDefault) || Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); }; auto checkOneUse = [checkUses](SDValue Op, @@ -5478,7 +5513,7 @@ void X86DAGToDAGISel::Select(SDNode *Node)
{ MVT CmpVT = N0.getSimpleValueType(); // Floating point needs special handling if we don't have FCOMI. - if (Subtarget->hasCMov()) + if (Subtarget->canUseCMOV()) break; bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; @@ -5518,7 +5553,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Move AH into flags. // Some 64-bit targets lack SAHF support, but they do support FCOMI. - assert(Subtarget->hasLAHFSAHF() && + assert(Subtarget->canUseLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); Chain = AH; @@ -5567,40 +5602,86 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. - if (N0.getOpcode() == ISD::AND && - N0.getNode()->hasOneUse() && + if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C) break; - uint64_t Mask = C->getZExtValue(); + auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!MaskC) + break; + + // We may have looked through a truncate so mask off any bits that + // shouldn't be part of the compare. + uint64_t Mask = MaskC->getZExtValue(); Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits()); - // Check if we can replace AND+IMM64 with a shift. This is possible for - // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero - // flag. - if (CmpVT == MVT::i64 && !isInt<32>(Mask) && + // Check if we can replace AND+IMM{32,64} with a shift. This is possible + // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the + // zero flag. + if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) && onlyUsesZeroFlag(SDValue(Node, 0))) { - if (isMask_64(~Mask)) { - unsigned TrailingZeros = countTrailingZeros(Mask); - SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); - SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, - N0.getOperand(0), Imm), 0); - MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, - MVT::i32, Shift, Shift); - ReplaceNode(Node, Test); - return; + unsigned ShiftOpcode = ISD::DELETED_NODE; + unsigned ShiftAmt; + unsigned SubRegIdx; + MVT SubRegVT; + unsigned TestOpcode; + unsigned LeadingZeros = countLeadingZeros(Mask); + unsigned TrailingZeros = countTrailingZeros(Mask); + + // With leading/trailing zeros, the transform is profitable if we can + // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without + // incurring any extra register moves. + bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse(); + if (LeadingZeros == 0 && SavesBytes) { + // If the mask covers the most significant bit, then we can replace + // TEST+AND with a SHR and check eflags. + // This emits a redundant TEST which is subsequently eliminated. + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = 0; + TestOpcode = X86::TEST64rr; + } else if (TrailingZeros == 0 && SavesBytes) { + // If the mask covers the least significant bit, then we can replace + // TEST+AND with a SHL and check eflags. + // This emits a redundant TEST which is subsequently eliminated.
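+ // (Editorial worked example, not part of the imported patch: with
+ // Mask == 0x0000FFFF00000000, LeadingZeros == 16 and TrailingZeros == 32,
+ // so the shifted-mask branch below selects SHR64ri with shift amount 32
+ // and a TEST16rr of the sub_16bit subregister, replacing a movabsq of
+ // the 64-bit immediate plus a 64-bit TEST.)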
+ ShiftOpcode = X86::SHL64ri; + ShiftAmt = LeadingZeros; + SubRegIdx = 0; + TestOpcode = X86::TEST64rr; + } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) { + // If the shifted mask extends into the high half and is 8/16/32 bits + // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. + unsigned PopCount = 64 - LeadingZeros - TrailingZeros; + if (PopCount == 8) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_8bit; + SubRegVT = MVT::i8; + TestOpcode = X86::TEST8rr; + } else if (PopCount == 16) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_16bit; + SubRegVT = MVT::i16; + TestOpcode = X86::TEST16rr; + } else if (PopCount == 32) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_32bit; + SubRegVT = MVT::i32; + TestOpcode = X86::TEST32rr; + } } - if (isMask_64(Mask)) { - unsigned LeadingZeros = countLeadingZeros(Mask); - SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); - SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, - N0.getOperand(0), Imm), 0); - MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, - MVT::i32, Shift, Shift); + if (ShiftOpcode != ISD::DELETED_NODE) { + SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64); + SDValue Shift = SDValue( + CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32, + N0.getOperand(0), ShiftC), + 0); + if (SubRegIdx != 0) { + Shift = + CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift); + } + MachineSDNode *Test = + CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } @@ -5769,21 +5850,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case X86ISD::SETCC_CARRY: { - // We have to do this manually because tblgen will put the eflags copy in - // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); + SDValue Result; + if (Subtarget->hasSBBDepBreaking()) { + // We have to do this manually because tblgen will put the eflags copy in + // the wrong place if we use an extract_subreg in the pattern. + // Copy flags to the EFLAGS register and glue it to next node. + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(1), SDValue()); - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(1), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; - MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - SDValue Result = SDValue( - CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; + MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + Result = SDValue( + CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), + 0); + } else { + // The target does not recognize sbb with the same reg operand as a + // no-source idiom, so we explicitly zero the input values. + Result = getSBBZero(Node); + } // For less than 32-bits we need to extract from the 32-bit node. 
if (VT == MVT::i8 || VT == MVT::i16) { @@ -5798,35 +5886,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::SBB: { if (isNullConstant(Node->getOperand(0)) && isNullConstant(Node->getOperand(1))) { - MVT VT = Node->getSimpleValueType(0); - - // Create zero. - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); - SDValue Zero = - SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); - if (VT == MVT::i64) { - Zero = SDValue( - CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, - CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), - 0); - } - - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(2), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; - MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - VTs = CurDAG->getVTList(SBBVT, MVT::i32); - SDValue Result = - SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, - EFLAGS.getValue(1)}), - 0); + SDValue Result = getSBBZero(Node); // Replace the flag use. ReplaceUses(SDValue(Node, 1), Result.getValue(1)); @@ -5834,6 +5894,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Replace the result use. if (!SDValue(Node, 0).use_empty()) { // For less than 32-bits we need to extract from the 32-bit node. + MVT VT = Node->getSimpleValueType(0); if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); @@ -6112,6 +6173,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: + case InlineAsm::Constraint_p: // address if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90753b5b4d33..61c1fd25031d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); - X86ScalarSSEf64 = Subtarget.hasSSE2(); - X86ScalarSSEf32 = Subtarget.hasSSE1(); - X86ScalarSSEf16 = Subtarget.hasFP16(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. @@ -170,7 +167,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. // FIXME: Should we be limiting the atomic size on other configs? Default is // 1024. - if (!Subtarget.hasCmpxchg8b()) + if (!Subtarget.canUseCMPXCHG8B()) setMaxAtomicSizeInBitsSupported(32); // Set up the register classes. @@ -200,7 +197,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Integer absolute. 
- if (Subtarget.hasCMov()) { + if (Subtarget.canUseCMOV()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) @@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. - if (!X86ScalarSSEf64) { + if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { @@ -415,14 +412,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f128, Expand); } - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f80, MVT::f16, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); + for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); + setTruncStoreAction(VT, MVT::f16, Expand); + setTruncStoreAction(VT, MVT::bf16, Expand); + + setOperationAction(ISD::BF16_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_BF16, VT, Expand); + } setOperationAction(ISD::PARITY, MVT::i8, Custom); setOperationAction(ISD::PARITY, MVT::i16, Custom); @@ -497,7 +495,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, VT, Custom); } - if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) + if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -516,9 +514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - if (Subtarget.hasCmpxchg16b()) { + if (Subtarget.canUseCMPXCHG16B()) setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); - } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && @@ -535,7 +532,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - if (Subtarget.getTargetTriple().isPS4CPU()) + if (Subtarget.isTargetPS()) setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); else setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); @@ -556,9 +553,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); - if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { - // f32 and f64 use SSE. + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { + // f16, f32 and f64 use SSE. // Set up the FP register classes. + addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass + : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? 
&X86::FR64XRegClass @@ -590,11 +591,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSINCOS, VT, Expand); } + // Half type will be promoted by default. + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FMA, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FSQRT, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); - } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 && + } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. @@ -664,6 +708,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Support fp16 0 immediate. + if (isTypeLegal(MVT::f16)) + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + // Handle constrained floating-point operations of scalar. 
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); @@ -673,7 +721,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -725,7 +772,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + if (isTypeLegal(MVT::f16)) { + setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); + } else { + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + } // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); @@ -877,7 +929,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. - if (VT.getVectorElementType() == MVT::f16) + if (VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } @@ -949,6 +1002,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); @@ -1067,6 +1122,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); + // Add 32-bit vector stores to help vectorization opportunities. + setOperationAction(ISD::STORE, MVT::v2i16, Custom); + setOperationAction(ISD::STORE, MVT::v4i8, Custom); + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); @@ -1285,13 +1344,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::v4i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Custom); + setOperationAction(ISD::FSHR, VT, Custom); } - setOperationAction(ISD::FSHL, MVT::v32i8, Custom); - setOperationAction(ISD::FSHR, MVT::v32i8, Custom); - setOperationAction(ISD::FSHL, MVT::v8i32, Custom); - setOperationAction(ISD::FSHR, MVT::v8i32, Custom); - // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1353,6 +1409,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); @@ -1446,6 +1504,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } + if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + } + // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. @@ -1652,6 +1717,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); @@ -1698,6 +1765,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHL, MVT::v64i8, Custom); setOperationAction(ISD::FSHR, MVT::v64i8, Custom); + setOperationAction(ISD::FSHL, MVT::v32i16, Custom); + setOperationAction(ISD::FSHR, MVT::v32i16, Custom); setOperationAction(ISD::FSHL, MVT::v16i32, Custom); setOperationAction(ISD::FSHR, MVT::v16i32, Custom); @@ -1970,10 +2039,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); - if (isTypeLegal(MVT::f80)) { - setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); - } setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); @@ -2059,9 +2124,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); } - - // Support fp16 0 immediate - addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { @@ -2209,55 +2271,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::INSERT_SUBVECTOR); - setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::ADD); - 
setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FNEG); - setTargetDAGCombine(ISD::FMA); - setTargetDAGCombine(ISD::STRICT_FMA); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::MLOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::MSTORE); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); - setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::MSCATTER); - setTargetDAGCombine(ISD::MGATHER); - setTargetDAGCombine(ISD::FP16_TO_FP); - setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::STRICT_FP_EXTEND); - setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine({ISD::VECTOR_SHUFFLE, + ISD::SCALAR_TO_VECTOR, + ISD::INSERT_VECTOR_ELT, + ISD::EXTRACT_VECTOR_ELT, + ISD::CONCAT_VECTORS, + ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR, + ISD::BITCAST, + ISD::VSELECT, + ISD::SELECT, + ISD::SHL, + ISD::SRA, + ISD::SRL, + ISD::OR, + ISD::AND, + ISD::ADD, + ISD::FADD, + ISD::FSUB, + ISD::FNEG, + ISD::FMA, + ISD::STRICT_FMA, + ISD::FMINNUM, + ISD::FMAXNUM, + ISD::SUB, + ISD::LOAD, + ISD::MLOAD, + ISD::STORE, + ISD::MSTORE, + ISD::TRUNCATE, + ISD::ZERO_EXTEND, + ISD::ANY_EXTEND, + ISD::SIGN_EXTEND, + ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND_VECTOR_INREG, + ISD::SIGN_EXTEND_VECTOR_INREG, + ISD::ZERO_EXTEND_VECTOR_INREG, + ISD::SINT_TO_FP, + ISD::UINT_TO_FP, + ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, + ISD::SETCC, + ISD::MUL, + ISD::XOR, + ISD::MSCATTER, + ISD::MGATHER, + ISD::FP16_TO_FP, + ISD::FP_EXTEND, + ISD::STRICT_FP_EXTEND, + ISD::FP_ROUND}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2568,9 +2630,9 @@ EVT X86TargetLowering::getOptimalMemOpType( bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) - return X86ScalarSSEf32; + return Subtarget.hasSSE1(); if (VT == MVT::f64) - return X86ScalarSSEf64; + return Subtarget.hasSSE2(); return true; } @@ -3566,10 +3628,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, MFI.setObjectSExt(FI, true); } + MaybeAlign Alignment; + if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && + ValVT != MVT::f80) + Alignment = MaybeAlign(4); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + Alignment); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) @@ -3906,7 +3973,7 @@ SDValue X86TargetLowering::LowerFormalArguments( else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f16) - RC = &X86::FR16XRegClass; + RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? 
&X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -4088,9 +4155,14 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, if (isByVal) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + MaybeAlign Alignment; + if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && + Arg.getSimpleValueType() != MVT::f80) + Alignment = MaybeAlign(4); return DAG.getStore( Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + Alignment); } /// Emit a load of return address if tail call @@ -5076,7 +5148,7 @@ bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, // If this is an unaligned vector, make sure the target supports folding it. auto *Ld = cast(Op.getNode()); if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && - Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16) + Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) return false; // TODO: If this is a non-temporal load and the target has an instruction @@ -5171,13 +5243,6 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { } } -static bool isTargetShuffleSplat(SDValue Op) { - unsigned Opcode = Op.getOpcode(); - if (Opcode == ISD::EXTRACT_SUBVECTOR) - return isTargetShuffleSplat(Op.getOperand(0)); - return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD; -} - SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -5429,6 +5494,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; + case Intrinsic::x86_atomic_bts: + case Intrinsic::x86_atomic_btc: + case Intrinsic::x86_atomic_btr: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(0); + unsigned Size = I.getType()->getScalarSizeInBits(); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); + Info.align = Align(Size); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; + return true; + } } return false; } @@ -5643,6 +5720,22 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const { + return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); +} + +bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { + // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more + // expensive than a straight movsd. On the other hand, it's important to + // shrink long double fp constant since fldt is very slow. + return !Subtarget.hasSSE2() || VT == MVT::f80; +} + +bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && Subtarget.hasSSE2()) || + (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; +} + bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const { @@ -5755,6 +5848,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask( (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); + // TODO: Should we always create i64 masks? Or only folded immediates? 
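  // Illustrative aside (editorial example, not part of the patch): for i32,
  // folding the shift pair (srl (shl X, 4), 4) yields (and X, 0x0FFFFFFF),
  // and (shl (srl X, 4), 4) yields (and X, 0xFFFFFFF0) - one shift plus a
  // mask instead of two dependent shifts.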
EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { @@ -6281,7 +6375,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, // Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. -static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { +static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops, + SelectionDAG &DAG) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { @@ -6297,21 +6392,34 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { EVT SubVT = Sub.getValueType(); // TODO - Handle more general insert_subvector chains. - if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && - Idx == (VT.getVectorNumElements() / 2)) { - // insert_subvector(insert_subvector(undef, x, lo), y, hi) - if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && - Src.getOperand(1).getValueType() == SubVT && - isNullConstant(Src.getOperand(2))) { - Ops.push_back(Src.getOperand(1)); + if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) { + // insert_subvector(undef, x, lo) + if (Idx == 0 && Src.isUndef()) { Ops.push_back(Sub); + Ops.push_back(DAG.getUNDEF(SubVT)); return true; } - // insert_subvector(x, extract_subvector(x, lo), hi) - if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && - Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { - Ops.append(2, Sub); - return true; + if (Idx == (VT.getVectorNumElements() / 2)) { + // insert_subvector(insert_subvector(undef, x, lo), y, hi) + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + // insert_subvector(x, extract_subvector(x, lo), hi) + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { + Ops.append(2, Sub); + return true; + } + // insert_subvector(undef, x, hi) + if (Src.isUndef()) { + Ops.push_back(DAG.getUNDEF(SubVT)); + Ops.push_back(Sub); + return true; + } } } } @@ -6770,7 +6878,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { } } SmallVector CatOps; - if (collectConcatOps(V.getNode(), CatOps)) { + if (collectConcatOps(V.getNode(), CatOps, DAG)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); @@ -7934,8 +8042,35 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, } } +// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask. +static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, + SDValue Cond, bool IsBLENDV = false) { + EVT CondVT = Cond.getValueType(); + unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); + unsigned NumElts = CondVT.getVectorNumElements(); + + APInt UndefElts; + SmallVector EltBits; + if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, + true, false)) + return false; + + Mask.resize(NumElts, SM_SentinelUndef); + + for (int i = 0; i != (int)NumElts; ++i) { + Mask[i] = i; + // Arbitrarily choose from the 2nd operand if the select condition element + // is undef. + // TODO: Can we do better by matching patterns such as even/odd? 
+ if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) || + (IsBLENDV && EltBits[i].isNonNegative())) + Mask[i] += NumElts; + } + + return true; +} + // Forward declaration (for getFauxShuffleMask recursive check). -// TODO: Use DemandedElts variant. static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG, unsigned Depth, @@ -7987,11 +8122,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; + // We can't assume an undef src element gives an undef dst - the other src + // might be zero. + if (!UndefElts.isZero()) + return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { - if (UndefElts[i]) { - Mask.push_back(SM_SentinelUndef); - continue; - } const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; @@ -8240,6 +8375,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } return true; } + case ISD::VSELECT: + case X86ISD::BLENDV: { + SDValue Cond = N.getOperand(0); + if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) { + Ops.push_back(N.getOperand(1)); + Ops.push_back(N.getOperand(2)); + return true; + } + return false; + } case X86ISD::VTRUNC: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); @@ -9076,7 +9221,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. - if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && + if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); @@ -9462,7 +9607,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + if (ScalarSize == 32 || + (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; @@ -11651,33 +11797,6 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef Mask, return true; } -// Attempt to create a shuffle mask from a VSELECT condition mask. -static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, - SDValue Cond) { - EVT CondVT = Cond.getValueType(); - unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); - unsigned NumElts = CondVT.getVectorNumElements(); - - APInt UndefElts; - SmallVector EltBits; - if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, - true, false)) - return false; - - Mask.resize(NumElts, SM_SentinelUndef); - - for (int i = 0; i != (int)NumElts; ++i) { - Mask[i] = i; - // Arbitrarily choose from the 2nd operand if the select condition element - // is undef. - // TODO: Can we do better by matching patterns such as even/odd? - if (UndefElts[i] || EltBits[i].isZero()) - Mask[i] += NumElts; - } - - return true; -} - // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. 
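// Illustrative aside (standard x86 semantics, shown for clarity): per 128-bit
// lane of v8i16 inputs A and B, vpunpcklwd interleaves the low words as
// {A0,B0,A1,B1,A2,B2,A3,B3} (shuffle mask {0,8,1,9,2,10,3,11}), and
// vpunpckhwd the high words (mask {4,12,5,13,6,14,7,15}).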
static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { @@ -13943,8 +14062,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { - V = peekThroughBitcasts(V); - return ISD::isNON_EXTLoad(V.getNode()); + return V->hasOneUse() && + ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } /// Try to lower insertion of a single element into a zero vector. @@ -15796,7 +15915,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, V1 = extract128BitVector(V1V2, 0, DAG, DL); V2 = extract128BitVector(V1V2, 4, DAG, DL); } else { - SmallVector DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32)); + SmallVector DWordClearOps(4, + DAG.getConstant(0, DL, MVT::i32)); for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1)) DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32); SDValue DWordClearMask = @@ -16615,9 +16735,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( // otherwise we're (probably) better off doing a split. if (VT == MVT::v4f64 && !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) - if (SDValue V = - lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG)) - return V; + return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG); // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element @@ -17229,114 +17347,135 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( return SDValue(); // Bail if we already have a repeated lane shuffle mask. - SmallVector RepeatedShuffleMask; - if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) + if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes - // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. - int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; - int NumSubLanes = NumLanes * SubLaneScale; - int NumSubLaneElts = NumLaneElts / SubLaneScale; - - // Check that all the sources are coming from the same lane and see if we can - // form a repeating shuffle mask (local to each sub-lane). At the same time, - // determine the source sub-lane for each destination sub-lane. - int TopSrcSubLane = -1; - SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); - SmallVector RepeatedSubLaneMasks[2] = { - SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), - SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; - - for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { - // Extract the sub-lane mask, check that it all comes from the same lane - // and normalize the mask entries to come from the first lane. - int SrcLane = -1; - SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); - for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { - int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; - if (M < 0) + // Helper to look for repeated mask in each split sublane, and that those + // sublanes can then be permuted into place. + auto ShuffleSubLanes = [&](int SubLaneScale) { + int NumSubLanes = NumLanes * SubLaneScale; + int NumSubLaneElts = NumLaneElts / SubLaneScale; + + // Check that all the sources are coming from the same lane and see if we + // can form a repeating shuffle mask (local to each sub-lane). 
At the same + // time, determine the source sub-lane for each destination sub-lane. + int TopSrcSubLane = -1; + SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); + SmallVector> RepeatedSubLaneMasks( + SubLaneScale, + SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)); + + for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { + // Extract the sub-lane mask, check that it all comes from the same lane + // and normalize the mask entries to come from the first lane. + int SrcLane = -1; + SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; + if (M < 0) + continue; + int Lane = (M % NumElts) / NumLaneElts; + if ((0 <= SrcLane) && (SrcLane != Lane)) + return SDValue(); + SrcLane = Lane; + int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); + SubLaneMask[Elt] = LocalM; + } + + // Whole sub-lane is UNDEF. + if (SrcLane < 0) continue; - int Lane = (M % NumElts) / NumLaneElts; - if ((0 <= SrcLane) && (SrcLane != Lane)) - return SDValue(); - SrcLane = Lane; - int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); - SubLaneMask[Elt] = LocalM; - } - // Whole sub-lane is UNDEF. - if (SrcLane < 0) - continue; + // Attempt to match against the candidate repeated sub-lane masks. + for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { + auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { + for (int i = 0; i != NumSubLaneElts; ++i) { + if (M1[i] < 0 || M2[i] < 0) + continue; + if (M1[i] != M2[i]) + return false; + } + return true; + }; + + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; + if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) + continue; - // Attempt to match against the candidate repeated sub-lane masks. - for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { - auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { + // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { - if (M1[i] < 0 || M2[i] < 0) + int M = SubLaneMask[i]; + if (M < 0) continue; - if (M1[i] != M2[i]) - return false; + assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && + "Unexpected mask element"); + RepeatedSubLaneMask[i] = M; } - return true; - }; - auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; - if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) - continue; + // Track the top most source sub-lane - by setting the remaining to + // UNDEF we can greatly simplify shuffle matching. + int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; + TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); + Dst2SrcSubLanes[DstSubLane] = SrcSubLane; + break; + } - // Merge the sub-lane mask into the matching repeated sub-lane mask. - for (int i = 0; i != NumSubLaneElts; ++i) { - int M = SubLaneMask[i]; + // Bail if we failed to find a matching repeated sub-lane mask. + if (Dst2SrcSubLanes[DstSubLane] < 0) + return SDValue(); + } + assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && + "Unexpected source lane"); + + // Create a repeating shuffle mask for the entire vector. 
+ SmallVector RepeatedMask((unsigned)NumElts, -1); + for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { + int Lane = SubLane / SubLaneScale; + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; - assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && - "Unexpected mask element"); - RepeatedSubLaneMask[i] = M; + int Idx = (SubLane * NumSubLaneElts) + Elt; + RepeatedMask[Idx] = M + (Lane * NumLaneElts); } - - // Track the top most source sub-lane - by setting the remaining to UNDEF - // we can greatly simplify shuffle matching. - int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; - TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); - Dst2SrcSubLanes[DstSubLane] = SrcSubLane; - break; } + SDValue RepeatedShuffle = + DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); - // Bail if we failed to find a matching repeated sub-lane mask. - if (Dst2SrcSubLanes[DstSubLane] < 0) - return SDValue(); - } - assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && - "Unexpected source lane"); - - // Create a repeating shuffle mask for the entire vector. - SmallVector RepeatedMask((unsigned)NumElts, -1); - for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { - int Lane = SubLane / SubLaneScale; - auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; - for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { - int M = RepeatedSubLaneMask[Elt]; - if (M < 0) + // Shuffle each source sub-lane to its destination. + SmallVector SubLaneMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumSubLaneElts) { + int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; + if (SrcSubLane < 0) continue; - int Idx = (SubLane * NumSubLaneElts) + Elt; - RepeatedMask[Idx] = M + (Lane * NumLaneElts); + for (int j = 0; j != NumSubLaneElts; ++j) + SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } - } - SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); - // Shuffle each source sub-lane to its destination. - SmallVector SubLaneMask((unsigned)NumElts, -1); - for (int i = 0; i != NumElts; i += NumSubLaneElts) { - int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; - if (SrcSubLane < 0) - continue; - for (int j = 0; j != NumSubLaneElts; ++j) - SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); - } + return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), + SubLaneMask); + }; - return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), - SubLaneMask); + // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes + // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes, + // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors. + // Otherwise we can only permute whole 128-bit lanes. + int MinSubLaneScale = 1, MaxSubLaneScale = 1; + if (Subtarget.hasAVX2() && VT.is256BitVector()) { + bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts); + MinSubLaneScale = 2; + MaxSubLaneScale = + (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 
4 : 2; + } + if (Subtarget.hasBWI() && VT == MVT::v64i8) + MinSubLaneScale = MaxSubLaneScale = 4; + + for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2) + if (SDValue Shuffle = ShuffleSubLanes(Scale)) + return Shuffle; + + return SDValue(); } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, @@ -17513,6 +17652,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Op; + bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); + bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); + // If we have lane crossing shuffles AND they don't all come from the lower // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently @@ -17521,13 +17663,11 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && (V1.getOpcode() != ISD::BUILD_VECTOR) && (V2.getOpcode() != ISD::BUILD_VECTOR)) - if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, - Mask, DAG)) - return Op; + return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG); // If we have one input in place, then we can permute the other input and // blend the result. - if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + if (V1IsInPlace || V2IsInPlace) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); @@ -17541,8 +17681,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. - if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || - isShuffleMaskInputInPlace(1, Mask)))) + if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace))) if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; @@ -17635,9 +17774,12 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; + bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); + bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); + // If we have one input in place, then we can permute the other input and // blend the result. - if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + if (V1IsInPlace || V2IsInPlace) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); @@ -17647,12 +17789,16 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return V; + // Try to lower to PERMQ(BLENDD(V1,V2)). + if (SDValue V = + lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return V; + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. 
- if (!isShuffleMaskInputInPlace(0, Mask) && - !isShuffleMaskInputInPlace(1, Mask)) + if (!V1IsInPlace && !V2IsInPlace) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; @@ -18657,20 +18803,34 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // VBMI can use VPERMV/VPERMV3 byte shuffles. - if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); - // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue Result = lowerShuffleAsLanePermuteAndPermute( + DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget)) + return Result; + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; + if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) { + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the + // PALIGNR will be cheaper than the second PSHUFB+OR. + if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + bool V1InUse, V2InUse; + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable, + DAG, V1InUse, V2InUse); + } + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (!V2.isUndef()) @@ -18678,7 +18838,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Result; - // FIXME: Implement direct support for this type! + // VBMI can use VPERMV/VPERMV3 byte shuffles. + if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -18915,7 +19078,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, Offset += NumElts; // Increment for next iteration. } - + // If we're broadcasting a SETCC result, try to broadcast the ops instead. + // TODO: What other unary shuffles would benefit from this? + if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC && + V1->hasOneUse()) { + SDValue Op0 = V1.getOperand(0); + SDValue Op1 = V1.getOperand(1); + ISD::CondCode CC = cast(V1.getOperand(2))->get(); + EVT OpVT = Op0.getValueType(); + return DAG.getSetCC( + DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask), + DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC); + } MVT ExtVT; switch (VT.SimpleTy) { @@ -19619,9 +19793,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); if (IsZeroElt || IsAllOnesElt) { - // Lower insertion of i8 -1 as an 'OR' blend. + // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend. // We don't deal with i8 0 since it appears to be handled elsewhere. 
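      // Illustrative aside (editorial example, not part of the patch):
      // inserting a -1 element at index k is just N0 | C, where C is all-zero
      // except an all-ones element at k; e.g. for v4i32, k=2: X | {0,0,-1,0}.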
-    if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
+    if (IsAllOnesElt &&
+        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
+         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
       SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
       SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
       SmallVector<SDValue> CstVectorElts(NumElts, ZeroCst);
@@ -19652,7 +19828,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     // and incur a domain crossing penalty if that's what we'll end up
     // doing anyway after extracting to a 128-bit vector.
     if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
-        (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
+        (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
       SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
       return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                          DAG.getTargetConstant(1, dl, MVT::i8));
@@ -19666,7 +19842,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     // If we are not inserting into the low 128-bit vector chunk,
     // then prefer the broadcast+blend sequence.
     // FIXME: relax the profitability check iff all N1 uses are insertions.
-    if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
+    if (IdxVal >= NumEltsIn128 &&
         ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
         (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
          X86::mayFoldLoad(N1, Subtarget)))) {
@@ -20617,6 +20793,35 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
   return Cvt;
 }
 
+template <typename T>
+static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
+  return VT == MVT::f16 && !Subtarget.hasFP16();
+}
+
+template <typename T>
+bool X86TargetLowering::isSoftFP16(T VT) const {
+  return ::isSoftFP16(VT, Subtarget);
+}
+
+static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  bool IsStrict = Op->isStrictFPOpcode();
+  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
+  MVT VT = Op.getSimpleValueType();
+  MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
+  SDLoc dl(Op);
+
+  SDValue Rnd = DAG.getIntPtrConstant(0, dl);
+  if (IsStrict)
+    return DAG.getNode(
+        ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
+        {Chain,
+         DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
+         Rnd});
+  return DAG.getNode(ISD::FP_ROUND, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
+}
+
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -20627,6 +20832,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
+  if (isSoftFP16(VT))
+    return promoteXINT_TO_FP(Op, DAG);
+
   if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
     return LowerWin64_INT128_TO_FP(Op, DAG);
 
@@ -21123,9 +21331,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   MVT DstVT = Op->getSimpleValueType(0);
   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
 
+  // Bail out when we don't have native conversion instructions.
   if (DstVT == MVT::f128)
     return SDValue();
 
+  if (isSoftFP16(DstVT))
+    return promoteXINT_TO_FP(Op, DAG);
+
   if (DstVT.isVector())
     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
 
@@ -21158,9 +21370,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   // The transform for i64->f64 isn't correct for 0 when rounding to negative
   // infinity. It produces -0.0, so disable under strictfp.
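  // Illustrative aside (editorial reasoning, sketched for clarity): the
  // lowering ends in an FSUB of two equal values when the input is 0, and
  // IEEE-754 gives x - x the sign -0.0 under round-toward-negative, hence
  // the bail-out under strictfp, where the dynamic rounding mode must be
  // honoured.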
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict) + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() && + !IsStrict) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); - if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) + // The transform for i32->f64/f32 isn't correct for 0 when rounding to + // negative infinity. So disable under strictfp. Using FILD instead. + if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 && + !IsStrict) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && (DstVT == MVT::f32 || DstVT == MVT::f64)) @@ -21819,27 +22035,25 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { - In = DAG.getBitcast(MVT::v8i32, In); - // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(4, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(2, DL)); static const int ShufMask[] = {0, 2, 4, 6}; - return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); + return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo), + DAG.getBitcast(MVT::v4i32, OpHi), ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - In = DAG.getBitcast(MVT::v32i8, In); - // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
     if (Subtarget.hasInt256()) {
       // The PSHUFB mask:
@@ -21847,27 +22061,30 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
                                       -1, -1, -1, -1, -1, -1, -1, -1,
                                       16, 17, 20, 21, 24, 25, 28, 29,
                                       -1, -1, -1, -1, -1, -1, -1, -1 };
+      In = DAG.getBitcast(MVT::v32i8, In);
       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
       In = DAG.getBitcast(MVT::v4i64, In);
 
       static const int ShufMask2[] = {0, 2, -1, -1};
       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
-                         DAG.getBitcast(MVT::v16i16, In),
-                         DAG.getIntPtrConstant(0, DL));
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0, DL));
+      return DAG.getBitcast(MVT::v8i16, In);
     }
 
-    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                                DAG.getIntPtrConstant(0, DL));
-    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
-                               DAG.getIntPtrConstant(16, DL));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4, DL));
 
     // The PSHUFB mask:
-    static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-                                    -1, -1, -1, -1, -1, -1, -1, -1};
+    static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
 
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
+    OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
+    OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
+
+    OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
 
     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -21941,6 +22158,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Res;
 
+  if (isSoftFP16(SrcVT)) {
+    MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
+    if (IsStrict)
+      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+                         {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
+                                             {NVT, MVT::Other}, {Chain, Src})});
+    return DAG.getNode(Op.getOpcode(), dl, VT,
+                       DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
+  }
+
   if (VT.isVector()) {
     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
       MVT ResVT = MVT::v4i32;
@@ -22278,6 +22505,9 @@ SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
   SDValue Src = Op.getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();
 
+  if (SrcVT == MVT::f16)
+    return SDValue();
+
   // If the source is in an SSE register, the node is Legal.
   if (isScalarFPTypeInSSEReg(SrcVT))
     return Op;
@@ -22349,7 +22579,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
   // This code is only for floats and doubles. Fall back to generic code for
   // anything else.
-  if (!isScalarFPTypeInSSEReg(SrcVT))
+  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
     return SDValue();
 
   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -22381,11 +22611,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   // floating-point values.
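  // Illustrative aside (example values assumed): for a signed 32-bit
  // saturation width widened to a 64-bit destination, MinInt =
  // sext(0x80000000) = 0xFFFFFFFF80000000 and MaxInt = sext(0x7FFFFFFF) =
  // 0x000000007FFFFFFF.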
   APInt MinInt, MaxInt;
   if (IsSigned) {
-    MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
-    MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
+    MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
+    MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
   } else {
-    MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
-    MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
+    MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
+    MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
   }
 
   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
@@ -22484,28 +22714,54 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
+  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
   MVT SVT = In.getSimpleValueType();
 
-  if (VT == MVT::f128)
+  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80))
     return SDValue();
 
-  if (VT == MVT::f80) {
-    if (SVT == MVT::f16) {
-      assert(Subtarget.hasFP16() && "Unexpected features!");
-      RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
-      MakeLibCallOptions CallOptions;
-      std::pair<SDValue, SDValue> Tmp =
-          makeLibCall(DAG, LC, VT, In, CallOptions, DL,
-                      IsStrict ? Op.getOperand(0) : SDValue());
+  if (SVT == MVT::f16) {
+    if (Subtarget.hasFP16())
+      return Op;
+
+    if (VT != MVT::f32) {
       if (IsStrict)
-        return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
-      else
-        return Tmp.first;
+        return DAG.getNode(
+            ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
+            {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
+                                {MVT::f32, MVT::Other}, {Chain, In})});
+
+      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
+                         DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
     }
-    return Op;
+
+    if (!Subtarget.hasF16C())
+      return SDValue();
+
+    In = DAG.getBitcast(MVT::i16, In);
+    In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
+                     getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
+                     DAG.getIntPtrConstant(0, DL));
+    SDValue Res;
+    if (IsStrict) {
+      Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
+                        {Chain, In});
+      Chain = Res.getValue(1);
+    } else {
+      Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
+                        DAG.getTargetConstant(4, DL, MVT::i32));
+    }
+    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
+                      DAG.getIntPtrConstant(0, DL));
+    if (IsStrict)
+      return DAG.getMergeValues({Res, Chain}, DL);
+    return Res;
   }
 
+  if (!SVT.isVector())
+    return Op;
+
   if (SVT.getVectorElementType() == MVT::f16) {
     assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
     if (SVT == MVT::v2f16)
@@ -22531,15 +22787,65 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
+
+  SDLoc DL(Op);
+  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+  SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
   MVT VT = Op.getSimpleValueType();
   MVT SVT = In.getSimpleValueType();
 
-  // It's legal except when f128 is involved or we're converting f80->f16.
- if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80)) - return Op; + if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) + return SDValue(); - return SDValue(); + if (VT == MVT::f16) { + if (Subtarget.hasFP16()) + return Op; + + if (SVT != MVT::f32) { + if (IsStrict) + return DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, + DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, + {Chain, In, Op2}), + Op2}); + + return DAG.getNode(ISD::FP_ROUND, DL, VT, + DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), + Op2); + } + + if (!Subtarget.hasF16C()) + return SDValue(); + + SDValue Res; + SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, + MVT::i32); + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, + DAG.getConstantFP(0, DL, MVT::v4f32), In, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, + {Chain, Res, Rnd}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In); + Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getBitcast(MVT::f16, Res); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + + return Res; + } + + return Op; } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -22857,6 +23163,47 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return Res; } +/// Helper for attempting to create a X86ISD::BT node. +static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) { + // If Src is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (Src.getValueType().getScalarSizeInBits() < 32) + Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); + + // No legal type found, give up. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType())) + return SDValue(); + + // See if we can use the 32-bit instruction instead of the 64-bit one for a + // shorter encoding. Since the former takes the modulo 32 of BitNo and the + // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is + // known to be zero. + if (Src.getValueType() == MVT::i64 && + DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) + Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (Src.getValueType() != BitNo.getValueType()) { + // Peek through a mask/modulo operation. + // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but + // we probably need a better IsDesirableToPromoteOp to handle this as well. 
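  // Illustrative aside (editorial example): with an i64 Src and an i32 amount
  // of the form (and idx, 63), any-extending both AND operands keeps the
  // modulo mask visible, i.e. BT X, (and (anyext idx), (anyext 63)).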
+ if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse()) + BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(), + DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), + BitNo.getOperand(0)), + DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), + BitNo.getOperand(1))); + else + BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo); + } + + return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo); +} + /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { @@ -23303,7 +23650,7 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { return true; // We never want to use both SQRT and RSQRT instructions for the same input. - if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) + if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) @@ -23439,7 +23786,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, // Only perform this transform if CMOV is supported otherwise the select // below will become a branch. - if (!Subtarget.hasCMov()) + if (!Subtarget.canUseCMOV()) return SDValue(); // fold (sdiv X, pow2) @@ -23485,9 +23832,8 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. -static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - SDValue &X86CC) { +static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, + SelectionDAG &DAG, X86::CondCode &X86CC) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); @@ -23538,30 +23884,24 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (!Src.getNode()) return SDValue(); - // If Src is i8, promote it to i32 with any_extend. There is no i8 BT - // instruction. Since the shift amount is in-range-or-undefined, we know - // that doing a bittest on the i32 value is ok. We extend to i32 because - // the encoding for the i16 version is larger than the i32 version. - // Also promote i16 to i32 for performance / code size reason. - if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) - Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); + // Remove any bit flip. + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ; + } - // See if we can use the 32-bit instruction instead of the 64-bit one for a - // shorter encoding. Since the former takes the modulo 32 of BitNo and the - // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is - // known to be zero. - if (Src.getValueType() == MVT::i64 && - DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) - Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + // Attempt to create the X86ISD::BT node. + if (SDValue BT = getBT(Src, BitNo, dl, DAG)) { + X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return BT; + } - // If the operand types disagree, extend the shift amount to match. Since - // BT ignores high bits (like shifts) we can use anyextend. - if (Src.getValueType() != BitNo.getValueType()) - BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); + return SDValue(); +} - X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B, - dl, MVT::i8); - return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); +// Check if pre-AVX condcode can be performed by a single FCMP op. +static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) { + return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask @@ -23831,7 +24171,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. - if (SSECC >= 8) { + if (!cheapX86FSETCC_SSE(Cond)) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; @@ -23996,10 +24336,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); - if (VT == MVT::v32i16 || VT == MVT::v64i8) { - assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!"); + // Break 512-bit integer vector compare into smaller ones. + // TODO: Try harder to use VPCMPx + VPMOV2x? + if (VT.is512BitVector()) return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); - } // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid // not-of-PCMPEQ: @@ -24117,12 +24457,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. - SDValue SB; - if (FlipSigns) { - SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64); - } else { - SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64); - } + SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL + : 0x0000000080000000ULL, + dl, MVT::v2i64); + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); @@ -24261,8 +24599,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC)) + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) { + X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); return BT; + } } // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0. @@ -24527,6 +24868,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; + if (isSoftFP16(VT)) + return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, + DAG.getBitcast(MVT::i16, Op1), + DAG.getBitcast(MVT::i16, Op2))); + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 
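[Editorial sketch, not part of the patch: the CMP/AND/ANDN/OR sequence referred
to above can be written directly with SSE intrinsics; the helper name and the
sample predicate below are assumptions for illustration only.]

#include <xmmintrin.h>

// select(a < b, t, f) without branches: the compare yields an all-ones or
// all-zero mask per element, which then blends t and f bitwise.
static __m128 selectLtViaMask(__m128 a, __m128 b, __m128 t, __m128 f) {
  __m128 mask = _mm_cmplt_ps(a, b);         // all-ones where a < b
  return _mm_or_ps(_mm_and_ps(mask, t),     // take t where mask is set
                   _mm_andnot_ps(mask, f)); // take f where mask is clear
}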
@@ -24591,7 +24937,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC && + !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the @@ -24608,6 +24955,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y + // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x + // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { @@ -24624,7 +24973,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); }; - if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) && + if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) && ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { // Keep Cmp. @@ -24652,7 +25001,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Sub.getValue(1)); return DAG.getNode(ISD::OR, DL, VT, SBB, Y); - } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && + } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue Src1, Src2; @@ -24688,6 +25037,22 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } + } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && + Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && + ((CondCode == X86::COND_S) || // smin(x, 0) + (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0) + // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x + // + // If the comparison is testing for a positive value, we have to invert + // the sign bit mask, so only do that transform if the target has a + // bitwise 'and not' instruction (the invert is free). + // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x + unsigned ShCt = VT.getSizeInBits() - 1; + SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT); + SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt); + if (CondCode == X86::COND_G) + Shift = DAG.getNOT(DL, Shift, VT); + return DAG.getNode(ISD::AND, DL, VT, Shift, Op1); } } @@ -24707,7 +25072,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack? + !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack? 
IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || @@ -24734,9 +25099,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue BTCC; - if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) { - CC = BTCC; + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); Cond = BT; AddTest = false; } @@ -24788,7 +25153,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // legal, but EmitLoweredSelect() can not deal with these extensions // being inserted between two CMOV's. (in i16 case too TBN) // https://bugs.llvm.org/show_bug.cgi?id=40974 - if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || + if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) || (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) && !X86::mayFoldLoad(Op2, Subtarget))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); @@ -25153,16 +25518,20 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && !Subtarget.hasBWI())) { SmallVector CatOps; - if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) + if (StoredVal.hasOneUse() && + collectConcatOps(StoredVal.getNode(), CatOps, DAG)) return splitVectorStore(St, DAG); return SDValue(); } + if (StoreVT.is32BitVector()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && - "Unexpected VT"); + assert(StoreVT.is64BitVector() && "Unexpected VT"); assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == - TargetLowering::TypeWidenVector && "Unexpected type action!"); + TargetLowering::TypeWidenVector && + "Unexpected type action!"); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, @@ -25247,8 +25616,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); SDLoc dl(Op); + // Bail out when we don't have native compare instructions. if (Cond.getOpcode() == ISD::SETCC && - Cond.getOperand(0).getValueType() != MVT::f128) { + Cond.getOperand(0).getValueType() != MVT::f128 && + !isSoftFP16(Cond.getOperand(0).getValueType())) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -25647,116 +26018,116 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { - SmallVector Elts; - unsigned NumElts = SrcOp->getNumOperands(); - + unsigned ShiftOpc; switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // Must produce 0s in the correct bits. 
- Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SHL; break; case X86ISD::VSRLI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // Must produce 0s in the correct bits. - Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SRL; break; case X86ISD::VSRAI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // All shifted in bits must be the same so use 0. - Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SRA; break; } - return DAG.getBuildVector(VT, dl, Elts); + SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT); + if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt})) + return C; } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); } -/// Handle vector element shifts where the shift amount may or may not be a -/// constant. Takes immediate version of shift as input. -/// TODO: Replace with vector + (splat) idx to avoid extract_element nodes. +/// Handle vector element shifts by a splat shift amount static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, - SDValue SrcOp, SDValue ShAmt, + SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - MVT SVT = ShAmt.getSimpleValueType(); - assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); - - // Change opcode to non-immediate version. - Opc = getTargetVShiftUniformOpcode(Opc, true); - - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - // +====================+============+=======================================+ - // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | - // +====================+============+=======================================+ - // | i64 | Yes, No | Use ShAmt as lowest elt | - // | i32 | Yes | zero-extend in-reg | - // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg | - // | (i32 zext(i16/i8)) | No | byte-shift-in-reg | - // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | - // +====================+============+=======================================+ - - if (SVT == MVT::i64) - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); - else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 || - ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) { + MVT AmtVT = ShAmt.getSimpleValueType(); + assert(AmtVT.isVector() && "Vector shift type mismatch"); + assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() && + "Illegal vector splat index"); + + // Move the splat element to the bottom element. 
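  // Illustrative aside (editorial example): for a v4i32 amount vector with
  // ShAmtIdx == 2, the shuffle below uses mask {2, -1, -1, -1}, leaving the
  // chosen amount in lane 0 and the remaining lanes undef.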
+ if (ShAmtIdx != 0) { + SmallVector Mask(AmtVT.getVectorNumElements(), -1); + Mask[0] = ShAmtIdx; + ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask); + } + + // Peek through any zext node if we can get back to a 128-bit source. + if (AmtVT.getScalarSizeInBits() == 64 && + (ShAmt.getOpcode() == ISD::ZERO_EXTEND || + ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && + ShAmt.getOperand(0).getValueType().isSimple() && + ShAmt.getOperand(0).getValueType().is128BitVector()) { ShAmt = ShAmt.getOperand(0); - MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16; - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt); - if (Subtarget.hasSSE41()) + AmtVT = ShAmt.getSimpleValueType(); + } + + // See if we can mask off the upper elements using the existing source node. + // The shift uses the entire lower 64-bits of the amount vector, so no need to + // do this for vXi64 types. + bool IsMasked = false; + if (AmtVT.getScalarSizeInBits() < 64) { + if (ShAmt.getOpcode() == ISD::BUILD_VECTOR || + ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // If the shift amount has come from a scalar, then zero-extend the scalar + // before moving to the vector. + ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt); + AmtVT = MVT::v4i32; + IsMasked = true; + } else if (ShAmt.getOpcode() == ISD::AND) { + // See if the shift amount is already masked (e.g. for rotation modulo), + // then we can zero-extend it by setting all the other mask elements to + // zero. + SmallVector MaskElts( + AmtVT.getVectorNumElements(), + DAG.getConstant(0, dl, AmtVT.getScalarType())); + MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType()); + SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts); + if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT, + {ShAmt.getOperand(1), Mask}))) { + ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask); + IsMasked = true; + } + } + } + + // Extract if the shift amount vector is larger than 128-bits. + if (AmtVT.getSizeInBits() > 128) { + ShAmt = extract128BitVector(ShAmt, 0, DAG, dl); + AmtVT = ShAmt.getSimpleValueType(); + } + + // Zero-extend bottom element to v2i64 vector type, either by extension or + // shuffle masking. 
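A sketch of the masked-amount case above, assuming a v4i32 amount that was already ANDed with a splat of 31 (e.g. rotation modulo): keeping lane 0 of the mask and zeroing the rest makes the upper lanes of the amount vector zero for free, so no extra zero-extension is needed before the 64-bit amount read of the shift.

  // ShAmt = (and X, <31,31,31,31>)  -->  (and X, <31,0,0,0>)
  SmallVector<SDValue, 4> MaskElts(4, DAG.getConstant(0, dl, MVT::i32));
  MaskElts[0] = DAG.getAllOnesConstant(dl, MVT::i32);
  SDValue Mask = DAG.getBuildVector(MVT::v4i32, dl, MaskElts);
  // FoldConstantArithmetic merges the two constant masks; lanes 1-3 of the
  // new AND are then known zero, which is what PSRLD/PSLLD require.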
+ if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) { + if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST || + ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) { + ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt); + } else if (Subtarget.hasSSE41()) { ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); - else { + } else { SDValue ByteShift = DAG.getTargetConstant( - (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); + (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); } - } else if (Subtarget.hasSSE41() && - ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), - MVT::v2i64, ShAmt); - } else { - SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), - DAG.getUNDEF(SVT)}; - ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } + // Change opcode to non-immediate version. + Opc = getTargetVShiftUniformOpcode(Opc, true); + // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); @@ -25907,8 +26278,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); + const X86Subtarget &Subtarget = DAG.getSubtarget(); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); @@ -26444,6 +26814,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case VSHIFT: { SDValue SrcOp = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); + assert(ShAmt.getValueType() == MVT::i32 && + "Unexpected VSHIFT amount type"); // Catch shift-by-constant. if (auto *CShAmt = dyn_cast(ShAmt)) @@ -26451,8 +26823,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getSimpleValueType(), SrcOp, CShAmt->getZExtValue(), DAG); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - SrcOp, ShAmt, Subtarget, DAG); + SrcOp, ShAmt, 0, Subtarget, DAG); } case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); @@ -27411,6 +27784,30 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_atomic_bts: + case Intrinsic::x86_atomic_btc: + case Intrinsic::x86_atomic_btr: { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue Chain = Op.getOperand(0); + SDValue Op1 = Op.getOperand(2); + SDValue Op2 = Op.getOperand(3); + unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS + : IntNo == Intrinsic::x86_atomic_btc ? 
X86ISD::LBTC + : X86ISD::LBTR; + SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32); + MachineMemOperand *MMO = cast(Op)->getMemOperand(); + SDValue Res = + DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), + {Chain, Op1, Op2, Size}, VT, MMO); + Chain = Res.getValue(1); + Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); + unsigned Imm = cast(Op2)->getZExtValue(); + if (Imm) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getShiftAmountConstant(Imm, VT, DL)); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain); + } } return SDValue(); } @@ -28394,11 +28791,27 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } -static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + // For AVX1 cases, split to use legal ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + + // Default to expand. + return SDValue(); +} + +static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - // For AVX1 cases, split to use legal ops (everything but v4i64). - if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) + // For AVX1 cases, split to use legal ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG); if (VT == MVT::v32i16 || VT == MVT::v64i8) @@ -29188,19 +29601,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); - unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { - if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { - MVT EltVT = VT.getVectorElementType(); - assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); - if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); - else if (EltVT.bitsLT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); - } + int BaseShAmtIdx = -1; + if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) { + if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx, + Subtarget, DAG); // vXi8 shifts - shift as v8i16 + mask result. if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || @@ -29212,13 +29618,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); // Create the mask using vXi16 shifts. For shift-rights we need to move // the upper byte down before splatting the vXi8 mask. 
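An illustrative trace of the new LBTS/LBTC/LBTR lowering above, assuming an i32 bts intrinsic whose bit-index operand (Op2) is the constant 4:

  // lock bts dword ptr [Op1], 4     ; CF = old value of bit 4
  // Res = zext(setcc(X86::COND_B))  ; 0 or 1
  // Imm = 4, so: Res <<= 4          ; reproduces (old & (1 << 4))
  // MERGE_VALUES then pairs Res with the updated memory chain.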
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, - BaseShAmt, Subtarget, DAG); + BaseShAmt, BaseShAmtIdx, Subtarget, DAG); if (Opcode != ISD::SHL) BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, 8, DAG); @@ -29228,7 +29633,7 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, DAG.getBitcast(ExtVT, R), BaseShAmt, - Subtarget, DAG); + BaseShAmtIdx, Subtarget, DAG); Res = DAG.getBitcast(VT, Res); Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); @@ -29236,8 +29641,9 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); - SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, - BaseShAmt, Subtarget, DAG); + SignMask = + getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, + BaseShAmtIdx, Subtarget, DAG); SignMask = DAG.getBitcast(VT, SignMask); Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); @@ -29247,23 +29653,6 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, } } - // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. - if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - Amt = Amt.getOperand(0); - unsigned Ratio = 64 / Amt.getScalarValueSizeInBits(); - std::vector Vals(Ratio); - for (unsigned i = 0; i != Ratio; ++i) - Vals[i] = Amt.getOperand(i); - for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) { - for (unsigned j = 0; j != Ratio; ++j) - if (Vals[j] != Amt.getOperand(i + j)) - return SDValue(); - } - - if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) - return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); - } return SDValue(); } @@ -29843,8 +30232,8 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || - VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || - VT == MVT::v16i32) && + VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || + VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && "Unexpected funnel shift type!"); // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw. @@ -29867,7 +30256,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, // Split 256-bit integers on XOP/pre-AVX2 targets. // Split 512-bit integers on non 512-bit BWI targets. - if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) || + if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) || !Subtarget.hasAVX2())) || (VT.is512BitVector() && !Subtarget.useBWIRegs() && EltSizeInBits < 32)) { @@ -29878,18 +30267,18 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { - if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) { + int ScalarAmtIdx = -1; + if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) { // Uniform vXi16 funnel shifts can be efficiently handled by default. 
if (EltSizeInBits == 16) return SDValue(); SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); - ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32); - Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget, - DAG); - Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget, - DAG); + Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, + ScalarAmtIdx, Subtarget, DAG); + Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, + ScalarAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); } } @@ -30079,18 +30468,20 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold as unpack(x,x) << zext(splat(y)): // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). - // TODO: Handle vXi16 cases on all targets. - if (EltSizeInBits == 8 || EltSizeInBits == 32 || - (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) { - if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) { + if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) { + int BaseRotAmtIdx = -1; + if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) { + if (EltSizeInBits == 16 && Subtarget.hasSSE41()) { + unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; + return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); + } unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); - BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32); Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, - Subtarget, DAG); + BaseRotAmtIdx, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, - Subtarget, DAG); + BaseRotAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); } } @@ -30273,14 +30664,15 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); + return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit(); if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); + return Subtarget.canUseCMPXCHG16B(); return false; } -bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLoweringBase::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); bool NoImplicitFloatOps = @@ -30288,9 +30680,10 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && (Subtarget.hasSSE1() || Subtarget.hasX87())) - return false; + return AtomicExpansionKind::None; - return needsCmpXchgNb(MemType); + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } // Note: this turns large loads into lock cmpxchg8b/16b. 
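shouldExpandAtomicStoreInIR now reports an expansion kind instead of a bool; as a rough decision table for a 32-bit target (a sketch, not exhaustive):

  // store atomic i64 %v, ptr %p seq_cst, align 8   -- on i686
  //   SSE1 or x87 usable (no soft-float / noimplicitfloat)
  //       -> AtomicExpansionKind::None    (single 64-bit FP/vector store)
  //   otherwise, needsCmpXchgNb(i64)
  //       -> AtomicExpansionKind::Expand  (lock cmpxchg8b loop)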
@@ -30313,6 +30706,65 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
              : AtomicExpansionKind::None;
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // If the atomicrmw's result isn't actually used, we can just add a "lock"
+  // prefix to a normal instruction for these operations.
+  if (AI->use_empty())
+    return AtomicExpansionKind::None;
+
+  // If the atomicrmw's result is used by a single-bit AND, we may be able to
+  // use a bts/btr/btc instruction for these operations.
+  auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+  Instruction *I = AI->user_back();
+  if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+      AI->getParent() != I->getParent())
+    return AtomicExpansionKind::CmpXChg;
+  // The following instruction must be an AND with a single bit.
+  auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+  unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
+  if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+    return AtomicExpansionKind::CmpXChg;
+
+  if (AI->getOperation() == AtomicRMWInst::And)
+    return ~C1->getValue() == C2->getValue()
+               ? AtomicExpansionKind::BitTestIntrinsic
+               : AtomicExpansionKind::CmpXChg;
+
+  return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
+                  : AtomicExpansionKind::CmpXChg;
+}
+
+void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
+  IRBuilder<> Builder(AI);
+  Intrinsic::ID IID = Intrinsic::not_intrinsic;
+  switch (AI->getOperation()) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Or:
+    IID = Intrinsic::x86_atomic_bts;
+    break;
+  case AtomicRMWInst::Xor:
+    IID = Intrinsic::x86_atomic_btc;
+    break;
+  case AtomicRMWInst::And:
+    IID = Intrinsic::x86_atomic_btr;
+    break;
+  }
+  Instruction *I = AI->user_back();
+  LLVMContext &Ctx = AI->getContext();
+  unsigned Imm =
+      countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
+  Function *BitTest =
+      Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
+  Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
+                                          Type::getInt8PtrTy(Ctx));
+  Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  I->replaceAllUsesWith(Result);
+  I->eraseFromParent();
+  AI->eraseFromParent();
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
@@ -30337,10 +30789,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   case AtomicRMWInst::Or:
   case AtomicRMWInst::And:
   case AtomicRMWInst::Xor:
-    // If the atomicrmw's result isn't actually used, we can just add a "lock"
-    // prefix to a normal instruction for these operations.
-    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
-                            : AtomicExpansionKind::None;
+    return shouldExpandLogicAtomicRMWInIR(AI);
   case AtomicRMWInst::Nand:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
@@ -31552,16 +32001,12 @@ SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
   // require special handling for these nodes), lower them as literal NOOPs for
   // the time being.
   SmallVector<SDValue, 2> Ops;
-  Ops.push_back(Op.getOperand(0));
   if (Op->getGluedNode())
     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
 
-  SDLoc OpDL(Op);
   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
-
-  return NOOP;
+  return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
 }
 
 // Custom split CVTPS2PH with wide types.
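A worked IR example of the new BitTestIntrinsic path (an illustrative sketch; the intrinsic name mangling is abbreviated). For or/xor the value operand must equal the single tested bit; for and it must be the inverted mask:

  //   %old = atomicrmw or ptr %p, i32 16 monotonic
  //   %bit = and i32 %old, 16
  // becomes, with Imm = countTrailingZeros(16) = 4:
  //   %bit = call i32 @llvm.x86.atomic.bts.i32(ptr %p, i8 4)
  // which selects to "lock btsl $4, (%p)". The and-form instead needs:
  //   %old = atomicrmw and ptr %p, i32 -17 monotonic   ; ~16
  //   %bit = and i32 %old, 16                          ; -> x86_atomic_btr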
@@ -31710,8 +32155,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG); case ISD::ABS: return LowerABS(Op, Subtarget, DAG); + case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); @@ -31807,9 +32253,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } - case X86ISD::VPMADDWD: - case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/VPMADDWD by widening. + case X86ISD::VPMADDWD: { + // Legalize types for X86ISD::VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -32462,7 +32907,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; - assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && + assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; @@ -32821,6 +33266,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LOR) NODE_NAME_CASE(LXOR) NODE_NAME_CASE(LAND) + NODE_NAME_CASE(LBTS) + NODE_NAME_CASE(LBTC) + NODE_NAME_CASE(LBTR) NODE_NAME_CASE(VZEXT_MOVL) NODE_NAME_CASE(VZEXT_LOAD) NODE_NAME_CASE(VEXTRACT_STORE) @@ -33041,7 +33489,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SCALEF_RND) NODE_NAME_CASE(SCALEFS) NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(AVG) NODE_NAME_CASE(MULHRS) NODE_NAME_CASE(SINT_TO_FP_RND) NODE_NAME_CASE(UINT_TO_FP_RND) @@ -33222,7 +33669,6 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const { bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. - case X86ISD::AVG: case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: @@ -33418,6 +33864,20 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { return !(VT1 == MVT::i32 && VT2 == MVT::i16); } +bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode, + EVT VT) const { + // TODO: This is too general. There are cases where pre-AVX512 codegen would + // benefit. The transform may also be profitable for scalar code. + if (!Subtarget.hasAVX512()) + return false; + if (!Subtarget.hasVLX() && !VT.is512BitVector()) + return false; + if (!VT.isVector()) + return false; + + return true; +} + /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -33460,6 +33920,16 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { return TargetLowering::areJTsAllowed(Fn); } +MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const { + // Avoid 8 and 16 bit types because they increase the chance for unnecessary + // zero-extensions. 
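A usage sketch of the new switch hook (illustrative, not from the patch): an i8 or i16 switch condition is now promoted once, up front, so the lowered compares and jump-table index math reuse one zero-extended i32 value instead of re-extending at each use.

  // switch i8 %c, label %def [ ... ]
  //   --> %cond = zext i8 %c to i32   ; single movzbl
  //       all compares / jump-table indexing then consume %cond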
+ if (ConditionVT.getSizeInBits() < 32) + return MVT::i32; + return TargetLoweringBase::getPreferredSwitchConditionType(Context, + ConditionVT); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -33871,6 +34341,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { + case X86::CMOV_FR16: case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: @@ -34090,7 +34561,7 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] - Register DestReg = FirstCMOV.getOperand(0).getReg(); + Register DestReg = SecondCascadedCMOV.getOperand(0).getReg(); Register Op1Reg = FirstCMOV.getOperand(1).getReg(); Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = @@ -34103,11 +34574,6 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); - // Copy the PHI result to the register defined by the second CMOV. - BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, - TII->get(TargetOpcode::COPY), - SecondCascadedCMOV.getOperand(0).getReg()) - .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); @@ -35546,6 +36012,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_FR16: + case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: @@ -36116,6 +36584,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } + case X86ISD::AND: { + if (Op.getResNo() == 0) { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known &= Known2; + } + break; + } case X86ISD::ANDNP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -36257,6 +36734,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.setAllZero(); break; } + case X86ISD::VBROADCAST_LOAD: { + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits, + /*AllowWholeUndefs*/ false, + /*AllowPartialUndefs*/ false)) { + Known.Zero.setAllBits(); + Known.One.setAllBits(); + for (unsigned I = 0; I != NumElts; ++I) { + if (!DemandedElts[I]) + continue; + if (UndefElts[I]) { + Known.resetAll(); + break; + } + KnownBits Known2 = KnownBits::makeConstant(EltBits[I]); + Known = KnownBits::commonBits(Known, Known2); + } + return; + } + break; + } } // Handle target shuffles. @@ -37113,9 +37612,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, unsigned NumRootElts = RootVT.getVectorNumElements(); // Canonicalize shuffle input op to the requested type. - // TODO: Support cases where Op is smaller than VT. 
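The new X86ISD::AND case above mirrors the generic ISD::AND rule; only result 0 carries a value (result 1 is EFLAGS, where bit tracking is meaningless), hence the getResNo() == 0 guard. As a sketch, `Known &= Known2` expands to:

  Known.Zero |= Known2.Zero;  // zero if known zero in either operand
  Known.One  &= Known2.One;   // one only if known one in both
  // e.g. an AND against constant 0x00FF proves bits 8-31 zero, letting a
  // later zero-extend of the result fold away.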
auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { - if (VT.getSizeInBits() < Op.getValueSizeInBits()) + if (VT.getSizeInBits() > Op.getValueSizeInBits()) + Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits()); + else if (VT.getSizeInBits() < Op.getValueSizeInBits()) Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits()); return DAG.getBitcast(VT, Op); }; @@ -37129,8 +37629,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); - assert(VT1.getSizeInBits() == RootSizeInBits && - VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); + assert((RootSizeInBits % VT1.getSizeInBits()) == 0 && + (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch"); SDValue Res; @@ -37157,12 +37657,13 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - // If we are shuffling a broadcast (and not introducing zeros) then - // we can just use the broadcast directly. This works for smaller broadcast - // elements as well as they already repeat across each mask element - if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) && + // If we are shuffling a splat (and not introducing zeros) then we can just + // use it directly. This works for smaller elements as well as they already + // repeat across each mask element. + if (UnaryShuffle && !isAnyZero(BaseMask) && + V1.getValueSizeInBits() >= RootSizeInBits && (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && - V1.getValueSizeInBits() >= RootSizeInBits) { + DAG.isSplatValue(V1, /*AllowUndefs*/ false)) { return CanonicalizeShuffleInput(RootVT, V1); } @@ -37543,7 +38044,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (RootVT.is128BitVector() && Subtarget.hasVLX())) && (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { - if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + // Bail if this was already a truncation or PACK node. + // We sometimes fail to match PACK if we demand known undef elements. + if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || + Root.getOpcode() == X86ISD::PACKSS || + Root.getOpcode() == X86ISD::PACKUS)) return SDValue(); // Nothing to do! ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); @@ -37852,6 +38357,12 @@ static SDValue combineX86ShuffleChainWithExtract( unsigned RootSizeInBits = RootVT.getSizeInBits(); assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask"); + // Bail if we have any smaller inputs. + if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) { + return Input.getValueSizeInBits() < RootSizeInBits; + })) + return SDValue(); + SmallVector WideInputs(Inputs.begin(), Inputs.end()); SmallVector Offsets(NumInputs, 0); @@ -37894,16 +38405,6 @@ static SDValue combineX86ShuffleChainWithExtract( })) return SDValue(); - for (SDValue &NewInput : WideInputs) { - assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && - "Shuffle vector size mismatch"); - if (WideSizeInBits > NewInput.getValueSizeInBits()) - NewInput = widenSubVector(NewInput, false, Subtarget, DAG, - SDLoc(NewInput), WideSizeInBits); - assert(WideSizeInBits == NewInput.getValueSizeInBits() && - "Unexpected subvector extraction"); - } - // Create new mask for larger type. 
for (unsigned i = 1; i != NumInputs; ++i) Offsets[i] += i * Scale * NumMaskElts; @@ -37928,7 +38429,10 @@ static SDValue combineX86ShuffleChainWithExtract( // Attempt to combine wider chain. // TODO: Can we use a better Root? - SDValue WideRoot = WideInputs[0]; + SDValue WideRoot = WideInputs.front().getValueSizeInBits() > + WideInputs.back().getValueSizeInBits() + ? WideInputs.front() + : WideInputs.back(); if (SDValue WideShuffle = combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, HasVariableMask, AllowVariableCrossLaneMask, @@ -38267,9 +38771,9 @@ static SDValue combineX86ShufflesRecursively( assert(RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"); - assert(Root.getSimpleValueType().isVector() && - "Shuffles operate on vector types!"); - unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); + MVT RootVT = Root.getSimpleValueType(); + assert(RootVT.isVector() && "Shuffles operate on vector types!"); + unsigned RootSizeInBits = RootVT.getSizeInBits(); // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. @@ -38298,16 +38802,27 @@ static SDValue combineX86ShufflesRecursively( APInt OpUndef, OpZero; APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); - if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, - OpZero, DAG, Depth, false)) - return SDValue(); - - // Shuffle inputs must not be larger than the shuffle result. - // TODO: Relax this for single input faux shuffles (trunc/extract_subvector). - if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { - return OpInput.getValueSizeInBits() > VT.getSizeInBits(); - })) + if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) { + // Shuffle inputs must not be larger than the shuffle result. + // TODO: Relax this for single input faux shuffles (e.g. trunc). + if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { + return OpInput.getValueSizeInBits() > VT.getSizeInBits(); + })) + return SDValue(); + } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 && + !isNullConstant(Op.getOperand(1))) { + SDValue SrcVec = Op.getOperand(0); + int ExtractIdx = Op.getConstantOperandVal(1); + unsigned NumElts = VT.getVectorNumElements(); + OpInputs.assign({SrcVec}); + OpMask.assign(NumElts, SM_SentinelUndef); + std::iota(OpMask.begin(), OpMask.end(), ExtractIdx); + OpZero = OpUndef = APInt::getNullValue(NumElts); + } else { return SDValue(); + } // If the shuffle result was smaller than the root, we need to adjust the // mask indices and pad the mask with undefs. @@ -38467,13 +38982,12 @@ static SDValue combineX86ShufflesRecursively( // Handle the all undef/zero/ones cases early. 
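A sketch of the new EXTRACT_SUBVECTOR fallback above, assuming a v4i32 extract at constant index 4 from a v8i32 source: the extract is handed to the shuffle combiner as a mask over the wider vector (index 0 is excluded, since that case is already just a subvector view):

  // extract_subvector v8i32:X, 4   --as-faux-shuffle-->
  //   OpInputs = { X }
  //   OpMask   = { 4, 5, 6, 7 }    // std::iota from ExtractIdx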
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) - return DAG.getUNDEF(Root.getValueType()); + return DAG.getUNDEF(RootVT); if (all_of(Mask, [](int Idx) { return Idx < 0; })) - return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, - SDLoc(Root)); + return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root)); if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) && none_of(Mask, [](int M) { return M == SM_SentinelZero; })) - return getOnesVector(Root.getValueType(), DAG, SDLoc(Root)); + return getOnesVector(RootVT, DAG, SDLoc(Root)); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= IsOpVariableMask; @@ -38533,7 +39047,7 @@ static SDValue combineX86ShufflesRecursively( // NOTE: This will update the Ops and Mask. if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget)) - return DAG.getBitcast(Root.getValueType(), HOp); + return DAG.getBitcast(RootVT, HOp); // Try to refine our inputs given our knowledge of target shuffle mask. for (auto I : enumerate(Ops)) { @@ -38578,6 +39092,8 @@ static SDValue combineX86ShufflesRecursively( // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now? // Widen any subvector shuffle inputs we've collected. + // TODO: Remove this to avoid generating temporary nodes, we should only + // widen once combineX86ShuffleChain has found a match. if (any_of(Ops, [RootSizeInBits](SDValue Op) { return Op.getValueSizeInBits() < RootSizeInBits; })) { @@ -38823,8 +39339,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SDValue N0 = V.getOperand(0); SDValue N1 = V.getOperand(1); unsigned Imm = V.getConstantOperandVal(2); - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); + const X86Subtarget &Subtarget = DAG.getSubtarget(); if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) || X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget)) return SDValue(); @@ -38869,21 +39384,24 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ShuffleVT = N.getValueType(); - auto IsMergeableWithShuffle = [](SDValue Op) { + auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) { // AllZeros/AllOnes constants are freely shuffled and will peek through // bitcasts. Other constant build vectors do not peek through bitcasts. Only // merge with target shuffles if it has one use so shuffle combining is - // likely to kick in. + // likely to kick in. Shuffles of splats are expected to be removed. return ISD::isBuildVectorAllOnes(Op.getNode()) || ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) || - (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()); + (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) || + (FoldLoad && isShuffleFoldableLoad(Op)) || + DAG.isSplatValue(Op, /*AllowUndefs*/ false); }; auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { // Ensure we only shuffle whole vector src elements, unless its a logical // binops where we can more aggressively move shuffles from dst to src. 
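An example of what the widened IsMergeableWithShuffle predicate now accepts when sinking a shuffle into a binop (sketch; X, C, M are placeholders):

  // shuffle(and(X, splat(C)), M)  -->  and(shuffle(X, M), splat(C))
  // Splats are free to shuffle (DAG.isSplatValue with no undefs), and for
  // non-PSHUFB shuffles a shuffle-foldable load operand also qualifies,
  // since the new shuffle can be folded into the load's use.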
return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR || + BinOp == X86ISD::ANDNP || (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits()); }; @@ -38913,7 +39431,8 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) { + if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) || + IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); @@ -39054,6 +39573,11 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector Mask; unsigned Opcode = N.getOpcode(); + // FIXME: Remove this after we support vector FP16 + if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(), + Subtarget)) + return SDValue(); + if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; @@ -39471,7 +39995,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1)); SmallVector SubOps; - if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2) + if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2) return SubOps[Idx & 1]; unsigned NumElts = Src.getValueType().getVectorNumElements(); if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && @@ -39581,7 +40105,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; - } else if (KnownUndef0[i] || KnownZero0[i]) { + } + + if (KnownUndef0[i] || KnownZero0[i]) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; @@ -40016,16 +40542,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // Simplify source operands based on shuffle mask. // TODO - merge this into combineX86ShufflesRecursively. - APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, - DCI)) + if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI)) return SDValue(N, 0); // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). // Perform this after other shuffle combines to allow inner shuffles to be // combined away first. - if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N))) + if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl)) return BinOp; } @@ -40212,6 +40736,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( Depth + 1)) return true; + // Fold shift(0,x) -> 0 + if (DemandedElts.isSubsetOf(KnownZero)) + return TLO.CombineTo( + Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); + // Aggressively peek through ops to get at the demanded elts. 
if (!DemandedElts.isAllOnes()) if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( @@ -40232,9 +40761,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; + + // Fold shift(0,x) -> 0 + if (DemandedElts.isSubsetOf(LHSZero)) + return TLO.CombineTo( + Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; + KnownZero = LHSZero; break; } @@ -40316,6 +40852,57 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( KnownZero.setHighBits(ShiftAmt); break; } + case X86ISD::ANDNP: { + // ANDNP = (~LHS & RHS); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { + APInt UndefElts; + SmallVector EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + APInt OpBits = APInt::getAllOnes(EltSizeInBits); + APInt OpElts = DemandedElts; + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + OpBits.clearAllBits(); + OpElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero()))) { + OpBits |= Invert ? ~EltBits[I] : EltBits[I]; + OpElts.setBit(I); + } + } + return std::make_pair(OpBits, OpElts); + }; + std::pair DemandLHS = GetDemandedMasks(RHS); + std::pair DemandRHS = GetDemandedMasks(LHS, true); + + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero, + TLO, Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero, + TLO, Depth + 1)) + return true; + + if (!DemandedElts.isAllOnes()) { + SDValue NewLHS = SimplifyMultipleUseDemandedBits( + LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1); + SDValue NewRHS = SimplifyMultipleUseDemandedBits( + RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1); + if (NewLHS || NewRHS) { + NewLHS = NewLHS ? NewLHS : LHS; + NewRHS = NewRHS ? NewRHS : RHS; + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); + } + } + break; + } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); @@ -40620,7 +41207,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. - case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -40651,10 +41237,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } } - // For broadcasts, unless we *only* demand the 0'th element, + // For splats, unless we *only* demand the 0'th element, // stop attempts at simplification here, we aren't going to improve things, // this is better than any potential shuffle. - if (isTargetShuffleSplat(Op) && !DemandedElts.isOne()) + if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) return false; // Get target/faux shuffle mask. @@ -40770,20 +41356,31 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( KnownBits KnownOp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + + // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast. // FIXME: Can we bound this better? 
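A worked example of the GetDemandedMasks logic in the ANDNP case above, for v4i32 ANDNP(C, X) with constant C = <0, -1, 0, -1> (values illustrative):

  // ANDNP(C, X) = ~C & X, and ~C = <-1, 0, -1, 0>:
  //   result lanes 1,3 are zero whatever X holds, so only lanes 0,2 of X
  //   are demanded; symmetrically, known-zero lanes of X would drop the
  //   matching lanes of C from the demand set.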
APInt DemandedMask = APInt::getLowBitsSet(64, 32); - if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, - TLO, Depth + 1)) + APInt DemandedMaskLHS = APInt::getAllOnes(64); + APInt DemandedMaskRHS = APInt::getAllOnes(64); + + bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512(); + if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS)) + DemandedMaskLHS = DemandedMask; + if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS)) + DemandedMaskRHS = DemandedMask; + + if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts, + KnownOp, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, - TLO, Depth + 1)) + if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts, + KnownOp, TLO, Depth + 1)) return true; // Aggressively peek through ops to get at the demanded low bits. SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( - LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1); SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( - RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1); if (DemandedLHS || DemandedRHS) { DemandedLHS = DemandedLHS ? DemandedLHS : LHS; DemandedRHS = DemandedRHS ? DemandedRHS : RHS; @@ -41084,7 +41681,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero = KnownZero.zextOrSelf(BitWidth); + Known.Zero = KnownZero.zext(BitWidth); Known.Zero.setHighBits(BitWidth - NumElts); // MOVMSK only uses the MSB from each vector element. @@ -41291,12 +41888,8 @@ bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, switch (Opc) { case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: - // TODO: Permit vXi64 types on 32-bit targets. - if (isTypeLegal(Op.getValueType().getVectorElementType())) { - UndefElts = APInt::getNullValue(NumElts); - return true; - } - return false; + UndefElts = APInt::getNullValue(NumElts); + return true; } return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, @@ -42840,10 +43433,29 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, return SDValue(); SDLoc DL(ExtElt); + unsigned NumElts = VecVT.getVectorNumElements(); + unsigned EltSizeInBits = VecVT.getScalarSizeInBits(); + + // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits. + auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) { + if (V.getValueType() == MVT::v4i8) { + if (ZeroExtend && Subtarget.hasSSE41()) { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + DAG.getConstant(0, DL, MVT::v4i32), + DAG.getBitcast(MVT::i32, V), + DAG.getIntPtrConstant(0, DL)); + return DAG.getBitcast(MVT::v16i8, V); + } + V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V, + ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8) + : DAG.getUNDEF(MVT::v4i8)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V, + DAG.getUNDEF(MVT::v8i8)); + }; // vXi8 mul reduction - promote to vXi16 mul reduction. 
   if (Opc == ISD::MUL) {
-    unsigned NumElts = VecVT.getVectorNumElements();
     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
       return SDValue();
     if (VecVT.getSizeInBits() >= 128) {
@@ -42858,11 +43470,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
       }
     } else {
-      if (VecVT == MVT::v4i8)
-        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
-                          DAG.getUNDEF(MVT::v4i8));
-      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
-                        DAG.getUNDEF(MVT::v8i8));
+      Rdx = WidenToV16I8(Rdx, false);
       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
     }
@@ -42882,24 +43490,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
 
   // vXi8 add reduction - sub 128-bit vector.
   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
-    if (VecVT == MVT::v4i8) {
-      // Pad with zero.
-      if (Subtarget.hasSSE41()) {
-        Rdx = DAG.getBitcast(MVT::i32, Rdx);
-        Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
-                          DAG.getConstant(0, DL, MVT::v4i32), Rdx,
-                          DAG.getIntPtrConstant(0, DL));
-        Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
-      } else {
-        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
-                          DAG.getConstant(0, DL, VecVT));
-      }
-    }
-    if (Rdx.getValueType() == MVT::v8i8) {
-      // Pad with undef.
-      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
-                        DAG.getUNDEF(MVT::v8i8));
-    }
+    Rdx = WidenToV16I8(Rdx, true);
     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                       DAG.getConstant(0, DL, MVT::v16i8));
     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
@@ -42907,8 +43498,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
   }
 
   // Must be a >=128-bit vector with pow2 elements.
-  if ((VecVT.getSizeInBits() % 128) != 0 ||
-      !isPowerOf2_32(VecVT.getVectorNumElements()))
+  if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
     return SDValue();
 
   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
@@ -42931,6 +43521,48 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
   }
 
+  // See if we can use vXi8 PSADBW add reduction for larger zext types.
+  // If the source vector values are 0-255, then we can use PSADBW to
+  // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
+  // TODO: See if it's worth avoiding vXi16/i32 truncations?
+  if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
+      DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
+      (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
+       Subtarget.hasAVX512())) {
+    EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
+    Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
+    if (ByteVT.getSizeInBits() < 128)
+      Rdx = WidenToV16I8(Rdx, true);
+
+    // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+    auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                            ArrayRef<SDValue> Ops) {
+      MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+      SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
+      return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
+    };
+    MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
+    Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
+
+    // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
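An end-to-end sketch of the new PSADBW reduction path above, assuming a v16i16 add reduction whose lanes are provably <= 255 (e.g. each was zero-extended from i8):

  // v16i16 Rdx --trunc--> v16i8            (values fit in a byte)
  // psadbw v16i8, 0      --> v2i64         (two 8-byte partial sums)
  // the while-loop that follows folds any >128-bit result in halves with
  // ADD, a <1,-1> shuffle + ADD combines the two i64 lanes when
  // NumElts > 8, and lane 0 is finally extracted as the scalar result.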
+ while (Rdx.getValueSizeInBits() > 128) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); + VecVT = Lo.getValueType(); + Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); + } + assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected"); + + if (NumElts > 8) { + SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1}); + Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi); + } + + VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits()); + Rdx = DAG.getBitcast(VecVT, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. if (!shouldUseHorizontalOp(true, DAG, Subtarget)) return SDValue(); @@ -42994,8 +43626,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, uint64_t Idx = CIdx->getZExtValue(); if (UndefVecElts[Idx]) return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); - return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()), - dl, VT); + return DAG.getConstant(EltBits[Idx].zext(VT.getScalarSizeInBits()), dl, + VT); } } @@ -43076,29 +43708,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, // but not // i1 = extract_vector_elt t0:1, Constant:i64<2> // since the latter would need its own MOVMSK. - if (CIdx && SrcVT.getScalarType() == MVT::i1) { + if (SrcVT.getScalarType() == MVT::i1) { + bool IsVar = !CIdx; SmallVector BoolExtracts; unsigned ResNo = InputVector.getResNo(); - auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { + auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(Use->getOperand(1)) && Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); + IsVar |= !isa(Use->getOperand(1)); return true; } return false; }; + // TODO: Can we drop the oneuse check for constant extracts? 
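A sketch of the relaxed MOVMSK combine above: a variable extract index no longer blocks it, because the per-use mask is now built with a shift instead of a constant (BCVT is the movmsk integer type from the hunk):

  // extractelement vXi1 V, idx
  //   --> ((movmsk V) & (1 << idx)) == (1 << idx)
  SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
  SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT,
                             DAG.getConstant(1, dl, BCVT), MaskIdx);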
if (all_of(InputVector->uses(), IsBoolExtract) && - BoolExtracts.size() > 1) { + (IsVar || BoolExtracts.size() > 1)) { EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { for (SDNode *Use : BoolExtracts) { // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask - unsigned MaskIdx = Use->getConstantOperandVal(1); - APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); - SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); + // Mask = 1 << MaskIdx + SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8); + SDValue MaskBit = DAG.getConstant(1, dl, BCVT); + SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx); SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); DCI.CombineTo(Use, Res); @@ -43123,7 +43758,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, auto *LoadVec = dyn_cast(InputVector); if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && - !LikelyUsedAsVector) { + !LikelyUsedAsVector && LoadVec->isSimple()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue NewPtr = TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); @@ -43133,16 +43768,111 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, SDValue Load = DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); - SDValue Chain = Load.getValue(1); - SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)}; - SDValue To[] = {Load, Chain}; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - return SDValue(N, 0); + DAG.makeEquivalentMemoryOrdering(LoadVec, Load); + return Load; } return SDValue(); } +// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). +// This is more or less the reverse of combineBitcastvxi1. +static SDValue combineToExtendBoolVectorInReg( + unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && + Opcode != ISD::ANY_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + return SDValue(); + + EVT SVT = VT.getScalarType(); + EVT InSVT = N0.getValueType().getScalarType(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + + // Input type must be extending a bool vector (bit-casted from a scalar + // integer) to legal integer types. + if (!VT.isVector()) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) + return SDValue(); + if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + EVT SclVT = N00.getValueType(); + if (!SclVT.isScalarInteger()) + return SDValue(); + + SDValue Vec; + SmallVector ShuffleMask; + unsigned NumElts = VT.getVectorNumElements(); + assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); + + // Broadcast the scalar integer to the vector elements. + if (NumElts > EltSizeInBits) { + // If the scalar integer is greater than the vector element size, then we + // must split it down into sub-sections for broadcasting. For example: + // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. 
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. + assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); + unsigned Scale = NumElts / EltSizeInBits; + EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + Vec = DAG.getBitcast(VT, Vec); + + for (unsigned i = 0; i != Scale; ++i) + ShuffleMask.append(EltSizeInBits, i); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && + (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { + // If we have register broadcast instructions, use the scalar size as the + // element type for the shuffle. Then cast to the wider element type. The + // widened bits won't be used, and this might allow the use of a broadcast + // load. + assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); + unsigned Scale = EltSizeInBits / NumElts; + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + ShuffleMask.append(NumElts * Scale, 0); + Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); + Vec = DAG.getBitcast(VT, Vec); + } else { + // For smaller scalar integers, we can simply any-extend it to the vector + // element size (we don't care about the upper bits) and broadcast it to all + // elements. + SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); + ShuffleMask.append(NumElts, 0); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } + + // Now, mask the relevant bit in each element. + SmallVector Bits; + for (unsigned i = 0; i != NumElts; ++i) { + int BitIdx = (i % EltSizeInBits); + APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); + Bits.push_back(DAG.getConstant(Bit, DL, SVT)); + } + SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); + Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); + + // Compare against the bitmask and extend the result. + EVT CCVT = VT.changeVectorElementType(MVT::i1); + Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); + Vec = DAG.getSExtOrTrunc(Vec, DL, VT); + + // For SEXT, this is now done, otherwise shift the result down for + // zero-extension. + if (Opcode == ISD::SIGN_EXTEND) + return Vec; + return DAG.getNode(ISD::SRL, DL, VT, Vec, + DAG.getConstant(EltSizeInBits - 1, DL, VT)); +} + /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? @@ -43270,8 +44000,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, SDValue FVal = N->getOperand(2); SmallVector CatOpsT, CatOpsF; if (!TVal.hasOneUse() || !FVal.hasOneUse() || - !collectConcatOps(TVal.getNode(), CatOpsT) || - !collectConcatOps(FVal.getNode(), CatOpsF)) + !collectConcatOps(TVal.getNode(), CatOpsT, DAG) || + !collectConcatOps(FVal.getNode(), CatOpsF, DAG)) return SDValue(); auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, @@ -43360,19 +44090,17 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { /// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. 
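A worked example of the combineToExtendBoolVectorInReg helper above, for sext of (v8i1 bitcast (i8 %b)) to v8i16 (lane values shown per element; an illustrative sketch):

  // broadcast: <b, b, b, b, b, b, b, b>        (any-extended scalar)
  // bit mask:  <1, 2, 4, 8, 16, 32, 64, 128>   (one bit per lane)
  // and + setcc-eq: lane i -> all-ones iff bit i of %b is set
  // SIGN_EXTEND keeps the all-ones lanes; for ZERO_EXTEND each lane is
  // then logically shifted right by 15 to leave a 0/1.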
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) || ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); - // Don't optimize before the condition has been transformed to a legal type - // and don't ever optimize vector selects that map to AVX512 mask-registers. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned BitWidth = Cond.getScalarValueSizeInBits(); - if (BitWidth < 8 || BitWidth > 64) - return SDValue(); + EVT VT = N->getValueType(0); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and @@ -43384,8 +44112,6 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = N->getValueType(0); if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and @@ -43403,6 +44129,11 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, if (VT.is512BitVector()) return SDValue(); + // Don't optimize before the condition has been transformed to a legal type + // and don't ever optimize vector selects that map to AVX512 mask-registers. + if (BitWidth < 8 || BitWidth > 64) + return SDValue(); + auto OnlyUsedAsSelectCond = [](SDValue Cond) { for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); UI != UE; ++UI) @@ -43542,9 +44273,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return V; // Convert vselects with constant condition into shuffles. - if (CondConstantVector && DCI.isBeforeLegalizeOps()) { + if (CondConstantVector && DCI.isBeforeLegalizeOps() && + (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { SmallVector Mask; - if (createShuffleMaskFromVSELECT(Mask, Cond)) + if (createShuffleMaskFromVSELECT(Mask, Cond, + N->getOpcode() == X86ISD::BLENDV)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } @@ -43565,11 +44298,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // getConstVector sets negative shuffle mask values as undef, so ensure // we hardcode SM_SentinelZero values to zero (0x80). if (CondMask[i] < NumElts) { - LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i]; + LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i]; RHSMask[i] = 0x80; } else { LHSMask[i] = 0x80; - RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i]; + RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i]; } } LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), @@ -43586,7 +44319,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
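// Aside on createShuffleMaskFromVSELECT, used in the constant-condition fold
// below: a VSELECT whose condition is a constant all-ones/all-zeros per-lane
// vector is statically a shuffle. A standalone sketch of the mask
// construction; the helper name and the "-1 selects LHS" encoding are
// illustrative assumptions, not this patch's code:

#include <cassert>
#include <cstdint>
#include <vector>

// Lane I takes LHS[I] when Cond[I] is all-ones (-1), otherwise RHS[I],
// which a two-input shuffle encodes as index I + NumElts.
static std::vector<int> shuffleMaskFromVSelect(const std::vector<int32_t> &Cond) {
  std::vector<int> Mask;
  int NumElts = static_cast<int>(Cond.size());
  for (int I = 0; I != NumElts; ++I)
    Mask.push_back(Cond[I] == -1 ? I : I + NumElts);
  return Mask;
}

int main() {
  std::vector<int> Mask = shuffleMaskFromVSelect({-1, 0, 0, -1});
  assert((Mask == std::vector<int>{0, 5, 6, 3}));
  return 0;
}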
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && + VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { @@ -43880,7 +44613,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If this an avx512 target we can improve the use of zero masking by // swapping the operands and inverting the condition. if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() && - Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorAllZeros(LHS.getNode()) && !ISD::isBuildVectorAllZeros(RHS.getNode())) { // Invert the cond to not(cond) : xor(op,allones)=not(op) @@ -43889,6 +44622,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } + // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might + // get split by legalization. + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST && + CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() && + TLI.isTypeLegal(VT.getScalarType())) { + EVT ExtCondVT = VT.changeVectorElementTypeToInteger(); + if (SDValue ExtCond = combineToExtendBoolVectorInReg( + ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) { + ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond); + return DAG.getSelect(DL, VT, ExtCond, LHS, RHS); + } + } + // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); @@ -44301,14 +45047,15 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { + bool FoundAndLSB = false; SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || - Carry.getOpcode() == ISD::SIGN_EXTEND || - Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && - isOneConstant(Carry.getOperand(1)))) + isOneConstant(Carry.getOperand(1)))) { + FoundAndLSB |= Carry.getOpcode() == ISD::AND; Carry = Carry.getOperand(0); + } if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? @@ -44339,6 +45086,14 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { CarryOp1.getOpcode() == X86ISD::ADD && isOneConstant(CarryOp1.getOperand(1))) return CarryOp1; + } else if (FoundAndLSB) { + SDLoc DL(Carry); + SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType()); + if (Carry.getOpcode() == ISD::SRL) { + BitNo = Carry.getOperand(1); + Carry = Carry.getOperand(0); + } + return getBT(Carry, BitNo, DL, DAG); } } } @@ -44533,6 +45288,12 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, if (!IsAnyOf && !IsAllOf) return SDValue(); + // TODO: Check more combining cases for me. + // Here we check the cmp use number to decide do combining or not. + // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))" + // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint. + bool IsOneUse = CmpOp.getNode()->hasOneUse(); + // See if we can peek through to a vector with a wider element type, if the // signbits extend down to all the sub-elements as well. 
// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose @@ -44561,9 +45322,9 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)). // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)). // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)). - if (VecVT.is256BitVector() && NumElts <= CmpBits) { + if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) { SmallVector Ops; - if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) && + if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) && Ops.size() == 2) { SDLoc DL(EFLAGS); EVT SubVT = Ops[0].getValueType().changeTypeToInteger(); @@ -44582,7 +45343,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)). // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)). - if (IsAllOf && Subtarget.hasSSE41()) { + if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) { MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; SDValue BC = peekThroughBitcasts(Vec); // Ensure MOVMSK was testing every signbit of BC. @@ -44734,7 +45495,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (!(FalseOp.getValueType() == MVT::f80 || (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) || (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) || - !Subtarget.hasCMov() || hasFPCMov(CC)) { + !Subtarget.canUseCMOV() || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); @@ -45181,8 +45942,6 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); - EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts); - // With AVX512 but without BWI, we would need to split v32i16. if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return SDValue(); @@ -45265,11 +46024,13 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Use SplitOpsAndApply to handle AVX splitting. 
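// Aside on the MOVMSK(CONCAT(X,Y)) folds above: the sign bit of each lane of
// OR(X,Y) / AND(X,Y) is the OR / AND of the corresponding sign bits, so the
// all-zero and all-ones tests carry over. A standalone scalar sketch over
// 4-lane "vectors" (movmsk4 is an illustrative stand-in for MOVMSK):

#include <cassert>
#include <cstdint>

static unsigned movmsk4(const int32_t V[4]) {
  unsigned M = 0;
  for (int I = 0; I != 4; ++I)
    M |= (static_cast<uint32_t>(V[I]) >> 31) << I; // gather lane sign bits
  return M;
}

int main() {
  int32_t X[4] = {1, -2, 3, -4}, Y[4] = {-1, 2, 3, -4};
  int32_t Or[4], And[4];
  for (int I = 0; I != 4; ++I) {
    Or[I] = X[I] | Y[I];
    And[I] = X[I] & Y[I];
  }
  unsigned Cat = (movmsk4(Y) << 4) | movmsk4(X); // MOVMSK(CONCAT(X,Y))
  assert((Cat == 0) == (movmsk4(Or) == 0));       // the "== 0" test
  assert((Cat == 0xFF) == (movmsk4(And) == 0xF)); // the "== -1" test
  return 0;
}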
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); + MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, + DAG.getBitcast(OpVT, Ops[0]), + DAG.getBitcast(OpVT, Ops[1])); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, + return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1}, PMADDWDBuilder); } @@ -45622,12 +46383,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; - else if (SarConst.isNegative()) + if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); - else - return DAG.getNode(ISD::SRA, DL, VT, NN, - DAG.getConstant(SarConst, DL, CVT)); + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } @@ -46034,11 +46794,9 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, EltBits[0].getZExtValue(), DAG); } - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -46461,11 +47219,17 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, FPLogic); } + if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() || + !N1.hasOneUse()) + return SDValue(); + + ISD::CondCode CC0 = cast(N0.getOperand(2))->get(); + ISD::CondCode CC1 = cast(N1.getOperand(2))->get(); + // The vector ISA for FP predicates is incomplete before AVX, so converting // COMIS* to CMPS* may not be a win before AVX. - // TODO: Check types/predicates to see if they are available with SSE/SSE2. - if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || - !N0.hasOneUse() || !N1.hasOneUse()) + if (!Subtarget.hasAVX() && + !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1))) return SDValue(); // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*) @@ -46482,10 +47246,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01); SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10); SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11); - SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, - cast(N0.getOperand(2))->get()); - SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, - cast(N1.getOperand(2))->get()); + SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0); + SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1); SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex); } @@ -46891,6 +47653,53 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; + // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) + // iff c2 is all/no bits mask - i.e. 
a select-with-zero mask. + // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW? + if (VT.isVector() && getTargetConstantFromNode(N1)) { + unsigned Opc0 = N0.getOpcode(); + if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) && + getTargetConstantFromNode(N0.getOperand(1)) && + DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() && + N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) { + SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1); + return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul); + } + } + + // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant + // avoids slow variable shift (moving shift amount to ECX etc.) + if (isOneConstant(N1) && N0->hasOneUse()) { + SDValue Src = N0; + while ((Src.getOpcode() == ISD::ZERO_EXTEND || + Src.getOpcode() == ISD::TRUNCATE) && + Src.getOperand(0)->hasOneUse()) + Src = Src.getOperand(0); + bool ContainsNOT = false; + X86::CondCode X86CC = X86::COND_B; + // Peek through AND(NOT(SRL(X,Y)),1). + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + X86CC = X86::COND_AE; + ContainsNOT = true; + } + if (Src.getOpcode() == ISD::SRL && + !isa(Src.getOperand(1))) { + SDValue BitNo = Src.getOperand(1); + Src = Src.getOperand(0); + // Peek through AND(SRL(NOT(X),Y),1). + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE; + ContainsNOT = true; + } + // If we have BMI2 then SHRX should be faster for i32/i64 cases. + if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32)) + if (SDValue BT = getBT(Src, BitNo, dl, DAG)) + return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT); + } + } + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { // Attempt to recursively combine a bitmask AND with shuffles. SDValue Op(N, 0); @@ -46899,32 +47708,44 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // If either operand is a constant mask, then only the elements that aren't // zero are actually demanded by the other operand. 
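// The AND(SRL(X,Y),1) -> SETCC(BT(X,Y)) fold above relies on the bit-test
// reading of that expression: BT sets CF to bit Y of X, i.e. (X >> Y) & 1.
// A standalone scalar sketch (testBit is illustrative, not a patch helper):

#include <cassert>
#include <cstdint>

static uint32_t testBit(uint32_t X, uint32_t Y) { return (X >> (Y & 31)) & 1u; }

int main() {
  uint32_t X = 0b1010;
  assert(testBit(X, 1) == 1 && testBit(X, 2) == 0);
  // The NOT variants peeked through above flip the expected condition
  // (COND_B <-> COND_AE) instead of materialising the NOT:
  assert(((~X >> 1) & 1u) == 1u - testBit(X, 1));
  return 0;
}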
- auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + auto GetDemandedMasks = [&](SDValue Op) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); - if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) - return false; - - APInt DemandedBits = APInt::getZero(EltSizeInBits); - APInt DemandedElts = APInt::getZero(NumElts); - for (int I = 0; I != NumElts; ++I) - if (!EltBits[I].isZero()) { - DemandedBits |= EltBits[I]; - DemandedElts.setBit(I); - } - - APInt KnownUndef, KnownZero; - return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, - KnownZero, DCI) || - TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI); + APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); + APInt DemandedElts = APInt::getAllOnes(NumElts); + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + DemandedBits.clearAllBits(); + DemandedElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isZero()) { + DemandedBits |= EltBits[I]; + DemandedElts.setBit(I); + } + } + return std::make_pair(DemandedBits, DemandedElts); }; - if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + std::pair Demand0 = GetDemandedMasks(N1); + std::pair Demand1 = GetDemandedMasks(N0); + + if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) || + TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) || + TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) || + TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } + + SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first, + Demand0.second, DAG); + SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first, + Demand1.second, DAG); + if (NewN0 || NewN1) + return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0, + NewN1 ? NewN1 : N1); } // Attempt to combine a scalar bitmask AND with an extracted shuffle. @@ -47127,8 +47948,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. -static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, - SelectionDAG &DAG) { +static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); @@ -47139,7 +47959,7 @@ static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, MVT::i8)); - return DAG.getZExtOrTrunc(Scc, dl, ExtTy); + return Scc; } // Try to transform: @@ -47199,11 +48019,10 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). 
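// The srl(ctlz) rewrite above rests on this identity for a W-bit X:
// (X == 0 ? 1 : 0) == ctlz(X) >> log2(W), since only X == 0 yields
// ctlz(X) == W and thus sets bit log2(W). A standalone C++20 sketch for
// W == 32:

#include <bit>
#include <cassert>
#include <cstdint>

static uint32_t isZeroViaCtlz(uint32_t X) {
  // countl_zero(0) == 32 and 32 >> 5 == 1; any nonzero X has ctlz < 32 -> 0.
  return static_cast<uint32_t>(std::countl_zero(X)) >> 5;
}

int main() {
  assert(isZeroViaCtlz(0) == 1);
  assert(isZeroViaCtlz(1) == 0);
  assert(isZeroViaCtlz(0x80000000u) == 0);
  return 0;
}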
- EVT VT = OR->getValueType(0); - SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); + SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG); SDValue Ret, NewRHS; - if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) - Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); + if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG))) + Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS); if (!Ret) return SDValue(); @@ -47216,21 +48035,18 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); - NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG); if (!NewRHS) return SDValue(); - Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); + Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS); } - if (Ret) - Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); - - return Ret; + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); } static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, - SDValue And1_L, SDValue And1_R, SDLoc DL, - SelectionDAG &DAG) { + SDValue And1_L, SDValue And1_R, + const SDLoc &DL, SelectionDAG &DAG) { if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) return SDValue(); SDValue NotOp = And0_L->getOperand(0); @@ -47352,7 +48168,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && N1.getConstantOperandAPInt(1) == HalfElts && - DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { + DAG.MaskedVectorIsZero(N0, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N0, 0, DAG, dl, HalfElts), @@ -47360,7 +48176,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && N0.getConstantOperandAPInt(1) == HalfElts && - DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { + DAG.MaskedVectorIsZero(N1, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N1, 0, DAG, dl, HalfElts), @@ -47389,9 +48205,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (!EltBits[I].isAllOnes()) DemandedElts.setBit(I); - APInt KnownUndef, KnownZero; - return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, - KnownZero, DCI); + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI); }; if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { if (N->getOpcode() != ISD::DELETED_NODE) @@ -47618,7 +48432,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, // clip to 0-255. if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 && VT == MVT::v16i8) { - if (auto USatVal = detectSSatPattern(In, VT, true)) { + if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, DL, DAG, Subtarget); @@ -47643,7 +48457,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, VT.getSizeInBits() >= 64 && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { - if (auto USatVal = detectSSatPattern(In, VT, true)) { + if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). 
// Only do this when the result is at least 64 bits or we'll leaving // dangling PACKSSDW nodes. @@ -47660,7 +48474,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); } - if (auto SSatVal = detectSSatPattern(In, VT)) + if (SDValue SSatVal = detectSSatPattern(In, VT)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } @@ -47671,10 +48485,10 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) { unsigned TruncOpc = 0; SDValue SatVal; - if (auto SSatVal = detectSSatPattern(In, VT)) { + if (SDValue SSatVal = detectSSatPattern(In, VT)) { SatVal = SSatVal; TruncOpc = X86ISD::VTRUNCS; - } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) { + } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) { SatVal = USatVal; TruncOpc = X86ISD::VTRUNCUS; } @@ -47706,7 +48520,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// X86ISD::AVG instruction. +/// ISD::AVGCEILU (AVG) instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -47769,7 +48583,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); + return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); }; auto AVGSplitter = [&](std::array Ops) { @@ -47872,7 +48686,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && - Ld->getAlignment() >= 16) || + Ld->getAlign() >= Align(16)) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { @@ -48340,7 +49154,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Split under-aligned vector non-temporal stores. if (St->isNonTemporal() && StVT == VT && - St->getAlignment() < VT.getStoreSize()) { + St->getAlign().value() < VT.getStoreSize()) { // ZMM/YMM nt-stores - either it can be stored as a series of shorter // vectors or the legalizer can scalarize it to use MOVNTI. if (VT.is256BitVector() || VT.is512BitVector()) { @@ -48374,9 +49188,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. - if (!St->isTruncatingStore() && StoredVal.hasOneUse() && + if (!St->isTruncatingStore() && (StoredVal.getOpcode() == X86ISD::VTRUNCUS || StoredVal.getOpcode() == X86ISD::VTRUNCS) && + StoredVal.hasOneUse() && TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; return EmitTruncSStore(IsSigned, St->getChain(), @@ -48385,15 +49200,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Try to fold a extract_element(VTRUNC) pattern into a truncating store. 
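// The AVG pattern above, c = (a + b + 1) / 2 on unsigned i8/i16 lanes, can be
// computed without widening; one overflow-safe scalar formulation, checked
// exhaustively for i8 (avgCeilU8 is illustrative, not a patch helper):

#include <cassert>
#include <cstdint>

static uint8_t avgCeilU8(uint8_t A, uint8_t B) {
  // (A | B) - ((A ^ B) >> 1) == (A + B + 1) / 2 without intermediate overflow.
  return static_cast<uint8_t>((A | B) - ((A ^ B) >> 1));
}

int main() {
  for (unsigned A = 0; A != 256; ++A)
    for (unsigned B = 0; B != 256; ++B)
      assert(avgCeilU8(static_cast<uint8_t>(A), static_cast<uint8_t>(B)) ==
             (A + B + 1) / 2);
  return 0;
}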
- if (!St->isTruncatingStore() && StoredVal.hasOneUse()) { + if (!St->isTruncatingStore()) { auto IsExtractedElement = [](SDValue V) { - if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse()) + if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse()) V = V.getOperand(0); unsigned Opc = V.getOpcode(); - if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) { - if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1))) - return V.getOperand(0); - } + if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) && + isNullConstant(V.getOperand(1)) && V.hasOneUse() && + V.getOperand(0).hasOneUse()) + return V.getOperand(0); return SDValue(); }; if (SDValue Extract = IsExtractedElement(StoredVal)) { @@ -48531,10 +49346,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits(); APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef, - KnownZero, DCI)) { + if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); @@ -49165,7 +49978,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // PACK should still be worth it for 128-bit vectors if the sources were // originally concatenated from subvectors. SmallVector ConcatOps; - if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps)) + if (VT.getSizeInBits() > 128 || + !collectConcatOps(In.getNode(), ConcatOps, DAG)) return SDValue(); } @@ -49478,9 +50292,9 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, SDValue In = N->getOperand(0); SDLoc DL(N); - if (auto SSatVal = detectSSatPattern(In, VT)) + if (SDValue SSatVal = detectSSatPattern(In, VT)) return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); - if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -49567,10 +50381,14 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (!UndefElts[I] && !EltBits[I].isSignMask()) return SDValue(); - return peekThroughBitcasts(Op0); + // Only allow bitcast from correctly-sized constant. + Op0 = peekThroughBitcasts(Op0); + if (Op0.getScalarValueSizeInBits() == ScalarSize) + return Op0; } - } - } + break; + } // case + } // switch return SDValue(); } @@ -50074,10 +50892,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); // Convert a full vector load into vzload when not all bits are needed. 
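// The VTRUNCS/VTRUNCUS folds above depend on detectSSatPattern /
// detectUSatPattern recognising clamp-then-truncate sequences. A minimal
// standalone scalar sketch of that equivalence (C++17; helper names are
// illustrative assumptions, not this patch's code):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Signed saturating truncate of i32 to i8: clamp to [-128, 127], then truncate.
static int8_t truncSSat8(int32_t X) {
  return static_cast<int8_t>(std::clamp<int32_t>(X, INT8_MIN, INT8_MAX));
}

// Unsigned saturating truncate of i32 to i8: clamp to [0, 255], then truncate.
static uint8_t truncUSat8(int32_t X) {
  return static_cast<uint8_t>(std::clamp<int32_t>(X, 0, UINT8_MAX));
}

int main() {
  assert(truncSSat8(300) == 127 && truncSSat8(-300) == -128);
  assert(truncUSat8(300) == 255 && truncUSat8(-5) == 0);
  return 0;
}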
@@ -50144,26 +50960,70 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); + // ANDNP(undef, x) -> 0 + // ANDNP(x, undef) -> 0 + if (N0.isUndef() || N1.isUndef()) + return DAG.getConstant(0, SDLoc(N), VT); + // ANDNP(0, x) -> x - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) - return N->getOperand(1); + if (ISD::isBuildVectorAllZeros(N0.getNode())) + return N1; // ANDNP(x, 0) -> 0 - if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) + if (ISD::isBuildVectorAllZeros(N1.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. - if (SDValue Not = IsNOT(N->getOperand(0), DAG)) - return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), - N->getOperand(1)); + if (SDValue Not = IsNOT(N0, DAG)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1); + + // TODO: Constant fold NOT(N0) to allow us to use AND. + // TODO: Do this in IsNOT with suitable oneuse checks? // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // zero are actually demanded by the other operand. + auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { + APInt UndefElts; + SmallVector EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); + APInt DemandedElts = APInt::getAllOnes(NumElts); + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + DemandedBits.clearAllBits(); + DemandedElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { + DemandedBits |= Invert ? ~EltBits[I] : EltBits[I]; + DemandedElts.setBit(I); + } + } + return std::make_pair(DemandedBits, DemandedElts); + }; + std::pair Demand0 = GetDemandedMasks(N1); + std::pair Demand1 = GetDemandedMasks(N0, true); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) || + TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) || + TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) || + TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } return SDValue(); @@ -50191,11 +51051,9 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, SDValue Src = N->getOperand(IsStrict ? 
1 : 0); if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getLowBitsSet(8, 4); - if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, - DCI)) { + if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); @@ -50453,110 +51311,6 @@ static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { return Res; } -// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). -// This is more or less the reverse of combineBitcastvxi1. -static SDValue -combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - unsigned Opcode = N->getOpcode(); - if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && - Opcode != ISD::ANY_EXTEND) - return SDValue(); - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - EVT InSVT = N0.getValueType().getScalarType(); - unsigned EltSizeInBits = SVT.getSizeInBits(); - - // Input type must be extending a bool vector (bit-casted from a scalar - // integer) to legal integer types. - if (!VT.isVector()) - return SDValue(); - if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) - return SDValue(); - if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) - return SDValue(); - - SDValue N00 = N0.getOperand(0); - EVT SclVT = N0.getOperand(0).getValueType(); - if (!SclVT.isScalarInteger()) - return SDValue(); - - SDLoc DL(N); - SDValue Vec; - SmallVector ShuffleMask; - unsigned NumElts = VT.getVectorNumElements(); - assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); - - // Broadcast the scalar integer to the vector elements. - if (NumElts > EltSizeInBits) { - // If the scalar integer is greater than the vector element size, then we - // must split it down into sub-sections for broadcasting. For example: - // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. - // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. - assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); - unsigned Scale = NumElts / EltSizeInBits; - EVT BroadcastVT = - EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); - Vec = DAG.getBitcast(VT, Vec); - - for (unsigned i = 0; i != Scale; ++i) - ShuffleMask.append(EltSizeInBits, i); - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); - } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && - (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { - // If we have register broadcast instructions, use the scalar size as the - // element type for the shuffle. Then cast to the wider element type. The - // widened bits won't be used, and this might allow the use of a broadcast - // load. 
- assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); - unsigned Scale = EltSizeInBits / NumElts; - EVT BroadcastVT = - EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); - ShuffleMask.append(NumElts * Scale, 0); - Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); - Vec = DAG.getBitcast(VT, Vec); - } else { - // For smaller scalar integers, we can simply any-extend it to the vector - // element size (we don't care about the upper bits) and broadcast it to all - // elements. - SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); - ShuffleMask.append(NumElts, 0); - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); - } - - // Now, mask the relevant bit in each element. - SmallVector Bits; - for (unsigned i = 0; i != NumElts; ++i) { - int BitIdx = (i % EltSizeInBits); - APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); - Bits.push_back(DAG.getConstant(Bit, DL, SVT)); - } - SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); - Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); - - // Compare against the bitmask and extend the result. - EVT CCVT = VT.changeVectorElementType(MVT::i1); - Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); - Vec = DAG.getSExtOrTrunc(Vec, DL, VT); - - // For SEXT, this is now done, otherwise shift the result down for - // zero-extension. - if (Opcode == ISD::SIGN_EXTEND) - return Vec; - return DAG.getNode(ISD::SRL, DL, VT, Vec, - DAG.getConstant(EltSizeInBits - 1, DL, VT)); -} - // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -50636,7 +51390,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0, + DAG, DCI, Subtarget)) return V; if (VT.isVector()) { @@ -50790,7 +51545,8 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0, + DAG, DCI, Subtarget)) return V; if (VT.isVector()) @@ -50832,7 +51588,7 @@ static bool isOrXorXorTree(SDValue X, bool Root = true) { /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp /// expansion. -template +template static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { SDValue Op0 = X.getOperand(0); @@ -50845,7 +51601,8 @@ static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, if (HasPT) return DAG.getNode(ISD::OR, DL, VecVT, A, B); return DAG.getNode(ISD::AND, DL, CmpVT, A, B); - } else if (X.getOpcode() == ISD::XOR) { + } + if (X.getOpcode() == ISD::XOR) { SDValue A = SToV(Op0); SDValue B = SToV(Op1); if (VecVT != CmpVT) @@ -51134,6 +51891,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, LHS.getValueType() == MVT::v4f32) return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); + // X pred 0.0 --> X pred -X + // If the negation of X already exists, use it in the comparison. 
This removes + // the need to materialize 0.0 and allows matching to SSE's MIN/MAX + // instructions in patterns with a 'select' node. + if (isNullFPScalarOrVectorConst(RHS)) { + SDVTList FNegVT = DAG.getVTList(OpVT); + if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS})) + return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC); + } + return SDValue(); } @@ -51145,16 +51912,18 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, MVT VT = N->getSimpleValueType(0); unsigned NumBits = VT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); + unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits(); + assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types"); // Perform constant folding. - if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(VT == MVT::i32 && "Unexpected result type"); + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) { APInt Imm(32, 0); - for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { - if (!Src.getOperand(Idx).isUndef() && - Src.getConstantOperandAPInt(Idx).isNegative()) + for (unsigned Idx = 0; Idx != NumElts; ++Idx) + if (!UndefElts[Idx] && EltBits[Idx].isNegative()) Imm.setBit(Idx); - } + return DAG.getConstant(Imm, SDLoc(N), VT); } @@ -51713,8 +52482,6 @@ static bool needCarryOrOverflowFlag(SDValue Flags) { CC = (X86::CondCode)User->getConstantOperandVal(0); break; case X86ISD::BRCOND: - CC = (X86::CondCode)User->getConstantOperandVal(2); - break; case X86ISD::CMOV: CC = (X86::CondCode)User->getConstantOperandVal(2); break; @@ -51743,10 +52510,14 @@ static bool onlyZeroFlagUsed(SDValue Flags) { default: // Be conservative. return false; - case X86ISD::SETCC: CCOpNo = 0; break; - case X86ISD::SETCC_CARRY: CCOpNo = 0; break; - case X86ISD::BRCOND: CCOpNo = 2; break; - case X86ISD::CMOV: CCOpNo = 2; break; + case X86ISD::SETCC: + case X86ISD::SETCC_CARRY: + CCOpNo = 0; + break; + case X86ISD::BRCOND: + case X86ISD::CMOV: + CCOpNo = 2; + break; } X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); @@ -51757,6 +52528,215 @@ static bool onlyZeroFlagUsed(SDValue Flags) { return true; } +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, + SDValue X, SDValue Y, + SelectionDAG &DAG, + bool ZeroSecondOpOnly = false) { + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // Look through a one-use zext. + if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) + Y = Y.getOperand(0); + + X86::CondCode CC; + SDValue EFLAGS; + if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) { + CC = (X86::CondCode)Y.getConstantOperandVal(0); + EFLAGS = Y.getOperand(1); + } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) && + Y.hasOneUse()) { + EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC); + } + + if (!EFLAGS) + return SDValue(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. 
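// The "X pred 0.0 --> X pred -X" fold above is sound because comparing X with
// 0.0 and with -X agree for every lane value, including signed zeros and NaNs
// (both comparisons are false for NaN under ordered predicates). A quick
// standalone check:

#include <cassert>
#include <cmath>

static void checkPred(double X) {
  assert((X < 0.0) == (X < -X));
  assert((X <= 0.0) == (X <= -X));
  assert((X > 0.0) == (X > -X));
  assert((X >= 0.0) == (X >= -X));
}

int main() {
  for (double X : {1.5, -1.5, 0.0, -0.0, double(NAN)})
    checkPred(X);
  return 0;
}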
+ auto *ConstantX = dyn_cast(X); + if (ConstantX && !ZeroSecondOpOnly) { + if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || + (IsSub && CC == X86::COND_B && ConstantX->isZero())) { + // This is a complicated way to get -1 or 0 from the carry flag: + // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax + // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + EFLAGS); + } + + if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || + (IsSub && CC == X86::COND_A && ConstantX->isZero())) { + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + // Swap the operands of a SUB, and we have the same pattern as above. + // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB + // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + NewEFLAGS); + } + } + } + + if (CC == X86::COND_B) { + // X + SETB Z --> adc X, 0 + // X - SETB Z --> sbb X, 0 + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), EFLAGS); + } + + if (ZeroSecondOpOnly) + return SDValue(); + + if (CC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + SDValue NewSub = + DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), NewEFLAGS); + } + } + + if (CC == X86::COND_AE) { + // X + SETAE --> sbb X, -1 + // X - SETAE --> adc X, -1 + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), EFLAGS); + } + + if (CC == X86::COND_BE) { + // X + SETBE --> sbb X, -1 + // X - SETBE --> adc X, -1 + // Try to convert COND_BE into COND_AE in an attempt to facilitate + // materializing "setae reg". + // + // Do not flip "e <= c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + SDValue NewSub = + DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? 
X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), NewEFLAGS); + } + } + + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() || + !X86::isZeroNode(EFLAGS.getOperand(1)) || + !EFLAGS.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue Z = EFLAGS.getOperand(0); + EVT ZVT = Z.getValueType(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + if (ConstantX) { + // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with + // fake operands: + // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) + // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) + if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || + (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { + SDValue Zero = DAG.getConstant(0, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + } + + // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' + // with fake operands: + // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) + // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) + if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || + (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { + SDValue One = DAG.getConstant(1, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + Cmp1.getValue(1)); + } + } + + // (cmp Z, 1) sets the carry flag if Z is 0. + SDValue One = DAG.getConstant(1, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); + + // Add the flags type for ADC/SBB nodes. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) + // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) + if (CC == X86::COND_NE) + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, + DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); + + // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) + // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, + DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); +} + +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { + bool IsSub = N->getOpcode() == ISD::SUB; + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) + return ADCOrSBB; + + // Commute and try again (negate the result for subtracts). 
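// The SETB/SETAE rewrites in combineAddOrSubToADCOrSBB follow from the carry
// identities below (CF = carry flag, SETB == CF, SETAE == 1 - CF); the final
// commute negates because X - Y == 0 - (Y - X). A standalone modular sketch:

#include <cassert>
#include <cstdint>

static uint32_t adc(uint32_t A, uint32_t B, uint32_t CF) { return A + B + CF; }
static uint32_t sbb(uint32_t A, uint32_t B, uint32_t CF) { return A - B - CF; }

int main() {
  uint32_t X = 0xDEADBEEF;
  for (uint32_t CF = 0; CF <= 1; ++CF) {
    assert(X + CF == adc(X, 0, CF));                  // X + SETB  --> adc X, 0
    assert(X - CF == sbb(X, 0, CF));                  // X - SETB  --> sbb X, 0
    assert(X + (1 - CF) == sbb(X, uint32_t(-1), CF)); // X + SETAE --> sbb X, -1
    assert(X - (1 - CF) == adc(X, uint32_t(-1), CF)); // X - SETAE --> adc X, -1
  }
  return 0;
}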
+ if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) { + if (IsSub) + ADCOrSBB = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB); + return ADCOrSBB; + } + + return SDValue(); +} + static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { // Only handle test patterns. if (!isNullConstant(N->getOperand(1))) @@ -51792,6 +52772,16 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { } } + // Peek through any zero-extend if we're only testing for a zero result. + if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) { + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.getScalarSizeInBits() >= 8 && + DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src, + DAG.getConstant(0, dl, SrcVT)); + } + // Look for a truncate. if (Op.getOpcode() != ISD::TRUNCATE) return SDValue(); @@ -51867,7 +52857,8 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); MVT VT = LHS.getSimpleValueType(); - unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB; + bool IsSub = X86ISD::SUB == N->getOpcode(); + unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD; // If we don't use the flag result, simplify back to a generic ADD/SUB. if (!N->hasAnyUseOfValue(1)) { @@ -51889,26 +52880,29 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, MatchGeneric(LHS, RHS, false); MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); - return SDValue(); + // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the + // EFLAGS result doesn't change. + return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG, + /*ZeroSecondOpOnly*/ true); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue BorrowIn = N->getOperand(2); + + if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); + return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags); } // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) // iff the flag result is dead. - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && + if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) && !N->hasAnyUseOfValue(1)) - return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), - Op0.getOperand(1), N->getOperand(2)); + return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0), + LHS.getOperand(1), BorrowIn); return SDValue(); } @@ -51916,228 +52910,60 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + auto *LHSC = dyn_cast(LHS); + auto *RHSC = dyn_cast(RHS); + + // Canonicalize constant to RHS. 
+ if (LHSC && !RHSC) + return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS, + CarryIn); + // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. - if (X86::isZeroNode(N->getOperand(0)) && - X86::isZeroNode(N->getOperand(1)) && + if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = - DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = DAG.getNode( + ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { - MVT VT = N->getSimpleValueType(0); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); - } - - return SDValue(); -} - -/// If this is an add or subtract where one operand is produced by a cmp+setcc, -/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} -/// with CMP+{ADC, SBB}. -static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { - bool IsSub = N->getOpcode() == ISD::SUB; - SDValue X = N->getOperand(0); - SDValue Y = N->getOperand(1); - - // If this is an add, canonicalize a zext operand to the RHS. - // TODO: Incomplete? What if both sides are zexts? - if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && - Y.getOpcode() != ISD::ZERO_EXTEND) - std::swap(X, Y); - - // Look through a one-use zext. - bool PeekedThroughZext = false; - if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { - Y = Y.getOperand(0); - PeekedThroughZext = true; - } - - // If this is an add, canonicalize a setcc operand to the RHS. - // TODO: Incomplete? What if both sides are setcc? - // TODO: Should we allow peeking through a zext of the other operand? - if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && - Y.getOpcode() != X86ISD::SETCC) - std::swap(X, Y); - - if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); - - // If X is -1 or 0, then we have an opportunity to avoid constants required in - // the general case below. - auto *ConstantX = dyn_cast(X); - if (ConstantX) { - if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || - (IsSub && CC == X86::COND_B && ConstantX->isZero())) { - // This is a complicated way to get -1 or 0 from the carry flag: - // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax - // 0 - SETB --> 0 - (CF) --> CF ? 
-1 : 0 --> SBB %eax, %eax - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - Y.getOperand(1)); - } - - if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || - (IsSub && CC == X86::COND_A && ConstantX->isZero())) { - SDValue EFLAGS = Y->getOperand(1); - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - // Swap the operands of a SUB, and we have the same pattern as above. - // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB - // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB - SDValue NewSub = DAG.getNode( - X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - NewEFLAGS); - } - } - } - - if (CC == X86::COND_B) { - // X + SETB Z --> adc X, 0 - // X - SETB Z --> sbb X, 0 - return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), Y.getOperand(1)); - } - - if (CC == X86::COND_A) { - SDValue EFLAGS = Y.getOperand(1); - // Try to convert COND_A into COND_B in an attempt to facilitate - // materializing "setb reg". - // - // Do not flip "e > c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), - EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); - return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), NewEFLAGS); - } - } - - if (CC == X86::COND_AE) { - // X + SETAE --> sbb X, -1 - // X - SETAE --> adc X, -1 - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(-1, DL, VT), Y.getOperand(1)); - } - - if (CC == X86::COND_BE) { - // X + SETBE --> sbb X, -1 - // X - SETBE --> adc X, -1 - SDValue EFLAGS = Y.getOperand(1); - // Try to convert COND_BE into COND_AE in an attempt to facilitate - // materializing "setae reg". - // - // Do not flip "e <= c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode( - X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(-1, DL, VT), NewEFLAGS); - } + // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry) + // iff the flag result is dead. + // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow. 
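// Why the ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry) fold above needs a dead flag
// result: the value is preserved mod 2^32, but the carry-out can change once
// the constants are pre-added. A standalone check:

#include <cassert>
#include <cstdint>

static uint32_t adc(uint32_t A, uint32_t B, uint32_t CF) { return A + B + CF; }

static bool carryOut(uint32_t A, uint32_t B, uint32_t CF) {
  return ((uint64_t(A) + B + CF) >> 32) != 0;
}

int main() {
  uint32_t C1 = 0x80000001u, C2 = 0x90000002u;
  for (uint32_t CF = 0; CF <= 1; ++CF)
    assert(adc(C1, C2, CF) == adc(0, C1 + C2, CF)); // values agree
  // But the flag result differs: C1 + C2 wraps before the ADC.
  assert(carryOut(C1, C2, 0) && !carryOut(0, C1 + C2, 0));
  return 0;
}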
+ if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) { + SDLoc DL(N); + APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue(); + return DAG.getNode(X86ISD::ADC, DL, N->getVTList(), + DAG.getConstant(0, DL, LHS.getValueType()), + DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn); } - if (CC != X86::COND_E && CC != X86::COND_NE) - return SDValue(); - - SDValue Cmp = Y.getOperand(1); - if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || - !X86::isZeroNode(Cmp.getOperand(1)) || - !Cmp.getOperand(0).getValueType().isInteger()) - return SDValue(); - - SDValue Z = Cmp.getOperand(0); - EVT ZVT = Z.getValueType(); - - // If X is -1 or 0, then we have an opportunity to avoid constants required in - // the general case below. - if (ConstantX) { - // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with - // fake operands: - // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) - // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) - if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || - (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { - SDValue Zero = DAG.getConstant(0, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - } - - // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' - // with fake operands: - // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) - // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) - if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || - (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { - SDValue One = DAG.getConstant(1, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - Cmp1.getValue(1)); - } + if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) { + MVT VT = N->getSimpleValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags); } - // (cmp Z, 1) sets the carry flag if Z is 0. - SDValue One = DAG.getConstant(1, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); - - // Add the flags type for ADC/SBB nodes. - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) - // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) - if (CC == X86::COND_NE) - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, - DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); + // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry) + // iff the flag result is dead. + if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() && + !N->hasAnyUseOfValue(1)) + return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0), + LHS.getOperand(1), CarryIn); - // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) - // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) - return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, VTs, X, - DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); + return SDValue(); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -52432,7 +53258,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. -static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { +static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. This transform is still // an improvement without zero operands because we trade 2 move constants and @@ -52457,6 +53284,11 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { if (!isSuitableCmov(Cmov)) return SDValue(); + // Don't remove a load folding opportunity for the add. That would neutralize + // any improvements from removing constant materializations. + if (X86::mayFoldLoad(OtherOp, Subtarget)) + return SDValue(); + EVT VT = N->getValueType(0); SDLoc DL(N); SDValue FalseOp = Cmov.getOperand(0); @@ -52499,7 +53331,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG)) + if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) @@ -52535,6 +53367,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, } } + // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W) + if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() && + X86::isZeroNode(Op0.getOperand(1))) { + assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use"); + return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1, + Op0.getOperand(0), Op0.getOperand(2)); + } + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -52617,6 +53457,25 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) return V; + // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W) + if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() && + X86::isZeroNode(Op1.getOperand(1))) { + assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); + return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0, + Op1.getOperand(0), Op1.getOperand(2)); + } + + // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y) + // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds. 
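The ADC folds above hold for the value result by plain modular arithmetic; the !N->hasAnyUseOfValue(1) guards exist because the carry flag of the two forms can differ even though the values agree (hence the TODO). A minimal standalone C++ sketch, with a hand-rolled adc model rather than anything from LLVM, checking the 8-bit case exhaustively:

    #include <cassert>
    #include <cstdint>

    // Value-only model of the X86ISD::ADC node: result = a + b + carry (mod 2^8).
    // The flag output is deliberately not modeled; the folds require it dead.
    static uint8_t adc(uint8_t a, uint8_t b, uint8_t carry) {
      return (uint8_t)(a + b + carry);
    }

    int main() {
      for (unsigned c1 = 0; c1 < 256; ++c1)
        for (unsigned c2 = 0; c2 < 256; ++c2)
          for (unsigned w = 0; w < 2; ++w) {
            // ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
            assert(adc(c1, c2, w) == adc(0, (uint8_t)(c1 + c2), w));
            // ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
            assert((uint8_t)(c1 + adc(c2, 0, w)) == adc(c1, c2, w));
          }
      return 0;
    }

The value identity holds unconditionally; it is only the carry/overflow flags that can disagree between the two forms, which is why the fold is gated on the flag result being dead.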
+ if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() && + !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) { + assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); + SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, + Op1.getOperand(1), Op1.getOperand(2)); + return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0), + Op1.getOperand(0)); + } + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -52745,6 +53604,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Subs.push_back(SubOp.getOperand(I)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); }; + auto IsConcatFree = [](MVT VT, ArrayRef SubOps, unsigned Op) { + for (unsigned I = 0, E = SubOps.size(); I != E; ++I) { + SDValue Sub = SubOps[I].getOperand(Op); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Sub.getOperand(0).getValueType() != VT || + Sub.getConstantOperandAPInt(1) != (I * NumSubElts)) + return false; + } + return true; + }; unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { @@ -52802,6 +53672,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getTargetConstant(Idx, DL, MVT::i8)); } break; + case X86ISD::PSHUFB: + if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useBWIRegs()))) { + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(VT, Ops, 0), + ConcatSubOperand(VT, Ops, 1)); + } + break; case X86ISD::VPERMV3: if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { MVT OpVT = Op0.getSimpleValueType(); @@ -52920,6 +53798,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); } break; + case ISD::VSELECT: + case X86ISD::BLENDV: + if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasInt256()) && + IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) { + EVT SelVT = Ops[0].getOperand(0).getValueType(); + SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), + ConcatSubOperand(VT, Ops, 1), + ConcatSubOperand(VT, Ops, 2)); + } + break; } } @@ -52937,12 +53828,29 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } + // Attempt to fold target constant loads. 
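The two SUB rewrites just above follow the same pattern: modeling the value results as adc(a,b,w) = a + b + w and sbb(a,b,w) = a - b - w (toy helpers, not LLVM code), X - ADC(Y,0,W) equals SBB(X,Y,W), and X - SBB(Y,Z,W) equals ADC(X,Z,W) - Y. A brute-force 8-bit check:

    #include <cassert>
    #include <cstdint>

    static uint8_t adc(uint8_t a, uint8_t b, uint8_t w) { return (uint8_t)(a + b + w); }
    static uint8_t sbb(uint8_t a, uint8_t b, uint8_t w) { return (uint8_t)(a - b - w); }

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y)
          for (unsigned z = 0; z < 256; ++z)
            for (unsigned w = 0; w < 2; ++w) {
              // SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
              assert((uint8_t)(x - adc(y, 0, w)) == sbb(x, y, w));
              // SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
              assert((uint8_t)(x - sbb(y, z, w)) == (uint8_t)(adc(x, z, w) - y));
            }
      return 0;
    }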
+ if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) { + SmallVector<APInt> EltBits; + APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements()); + for (unsigned I = 0, E = Ops.size(); I != E; ++I) { + APInt OpUndefElts; + SmallVector<APInt> OpEltBits; + if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts, + OpEltBits, true, false)) + break; + EltBits.append(OpEltBits); + UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth()); + } + if (EltBits.size() == VT.getVectorNumElements()) + return getConstVector(EltBits, UndefElts, VT, DAG, DL); + } + return SDValue(); } -static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -52961,9 +53869,9 @@ static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -53044,7 +53952,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // Match concat_vector style patterns. SmallVector<SDValue, 2> SubVectorOps; - if (collectConcatOps(N, SubVectorOps)) { + if (collectConcatOps(N, SubVectorOps, DAG)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; @@ -53103,10 +54011,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, /// This function should only be called with legal types (otherwise, the calls /// to get simple value types will assert). static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { - SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); + SDValue Sel = Ext->getOperand(0); SmallVector<SDValue, 4> CatOps; if (Sel.getOpcode() != ISD::VSELECT || - !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) + !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG)) return SDValue(); // Note: We assume simple value types because this should only be called with @@ -53154,9 +54062,9 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { return DAG.getBitcast(VT, NarrowSel); } -static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { // For AVX1 only, if we are extracting from a 256-bit and+not (which will // eventually get combined/lowered into ANDNP) with a concatenated operand, // split the 'and' into 128-bit ops to avoid the concatenate and extract.
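The constant-load fold above is pure bookkeeping: append each operand's decoded element payloads and splice its undef mask in at that operand's element offset. A toy model of that arithmetic (plain C++ standing in for the APInt-based EltBits/UndefElts machinery, assuming both operands decode to the same element count):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Toy stand-in for the EltBits/UndefElts bookkeeping: element payloads are
    // appended, and each operand's undef mask is inserted at its element offset.
    struct ConstVec {
      std::vector<uint32_t> elts; // decoded element bit payloads
      uint64_t undef = 0;         // one undef bit per element
    };

    static ConstVec concat(const ConstVec &lo, const ConstVec &hi) {
      ConstVec out;
      out.elts = lo.elts;
      out.elts.insert(out.elts.end(), hi.elts.begin(), hi.elts.end());
      out.undef = lo.undef | (hi.undef << lo.elts.size()); // UndefElts.insertBits
      return out;
    }

    int main() {
      ConstVec lo{{1, 2, 3, 4}, 0b0010}; // element 1 is undef
      ConstVec hi{{5, 6, 7, 8}, 0b1000}; // element 3 is undef
      ConstVec cat = concat(lo, hi);
      assert(cat.elts.size() == 8);
      assert(cat.undef == 0b10000010);   // undefs land at elements 1 and 7
      return 0;
    }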
@@ -53177,6 +54085,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, EVT InVecVT = InVec.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned InSizeInBits = InVecVT.getSizeInBits(); + unsigned NumSubElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && @@ -53214,22 +54123,24 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, } if (InVec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getBuildVector( - VT, SDLoc(N), - InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); + return DAG.getBuildVector(VT, SDLoc(N), + InVec->ops().slice(IdxVal, NumSubElts)); - // If we are extracting from an insert into a zero vector, replace with a - // smaller insert into zero if we don't access less than the original - // subvector. Don't do this for i1 vectors. + // If we are extracting from an insert into a larger vector, replace with a + // smaller insert if we don't access less than the original subvector. Don't + // do this for i1 vectors. + // TODO: Relax the matching indices requirement? if (VT.getVectorElementType() != MVT::i1 && - InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && - InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && - ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && + InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() && + IdxVal == InVec.getConstantOperandVal(2) && InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDLoc DL(N); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), - InVec.getOperand(1), InVec.getOperand(2)); + SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, + InVec.getOperand(0), N->getOperand(1)); + unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal; + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt, + InVec.getOperand(1), + DAG.getVectorIdxConstant(NewIdxVal, DL)); } // If we're extracting an upper subvector from a broadcast we should just @@ -53246,8 +54157,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // Attempt to extract from the source of a shuffle vector. - if ((InSizeInBits % SizeInBits) == 0 && - (IdxVal % VT.getVectorNumElements()) == 0) { + if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) { SmallVector ShuffleMask; SmallVector ScaledMask; SmallVector ShuffleInputs; @@ -53255,7 +54165,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Decode the shuffle mask and scale it so its shuffling subvectors. 
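The reworked insert/extract fold above is index arithmetic: when the extracted window starts exactly where the subvector was inserted and the inserted value fits inside the window, the insert can be replayed after a narrower extract of the base vector. A small simulation with plain vectors (illustrative insertSub/extractSub helpers, not LLVM's API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    using Vec = std::vector<int>;

    // Illustrative models of INSERT_SUBVECTOR / EXTRACT_SUBVECTOR.
    static Vec insertSub(Vec base, const Vec &sub, size_t idx) {
      for (size_t i = 0; i < sub.size(); ++i)
        base[idx + i] = sub[i];
      return base;
    }
    static Vec extractSub(const Vec &v, size_t idx, size_t n) {
      return Vec(v.begin() + idx, v.begin() + idx + n);
    }

    int main() {
      Vec base = {0, 1, 2, 3, 4, 5, 6, 7};  // v8
      Vec sub = {90, 91};                   // v2 inserted at index 4
      size_t insIdx = 4, extIdx = 4, n = 4; // matching indices, |sub| <= n
      // extract(insert(base, sub, insIdx), extIdx)
      Vec before = extractSub(insertSub(base, sub, insIdx), extIdx, n);
      // -> insert(extract(base, extIdx), sub, insIdx - extIdx)
      Vec after = insertSub(extractSub(base, extIdx, n), sub, insIdx - extIdx);
      assert(before == after);
      return 0;
    }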
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { - unsigned SubVecIdx = IdxVal / VT.getVectorNumElements(); + unsigned SubVecIdx = IdxVal / NumSubElts; if (ScaledMask[SubVecIdx] == SM_SentinelUndef) return DAG.getUNDEF(VT); if (ScaledMask[SubVecIdx] == SM_SentinelZero) @@ -53263,7 +54173,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; - unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); + unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts; return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, SDLoc(N), SizeInBits); } @@ -53273,8 +54183,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. unsigned InOpcode = InVec.getOpcode(); - if (IdxVal == 0 && InVec.hasOneUse()) { - if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { + if (InVec.hasOneUse()) { + if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -53291,7 +54201,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ANY_EXTEND || + if (IdxVal == 0 && + (InOpcode == ISD::ANY_EXTEND || InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || @@ -53306,7 +54217,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); return DAG.getNode(ExtOp, DL, VT, Ext); } - if (InOpcode == ISD::VSELECT && + if (IdxVal == 0 && InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && InVec.getOperand(1).getValueType().is256BitVector() && InVec.getOperand(2).getValueType().is256BitVector()) { @@ -53316,7 +54227,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } - if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && (VT.is128BitVector() || VT.is256BitVector())) { SDLoc DL(N); SDValue InVecSrc = InVec.getOperand(0); @@ -53324,6 +54235,13 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext); } + if (InOpcode == X86ISD::MOVDDUP && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue Ext0 = + extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext0); + } } // Always split vXi64 logical shifts where we're extracting the upper 32-bits @@ -53476,11 +54394,9 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, ISD::isBuildVectorAllZeros(RHS.getNode())) return DAG.getConstant(0, SDLoc(N), VT); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if 
(TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -53494,6 +54410,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, unsigned Opcode = N->getOpcode(); unsigned InOpcode = In.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); // Try to merge vector loads and extend_inreg to an extload. if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && @@ -53506,10 +54423,9 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, : ISD::ZEXTLOAD; EVT MemVT = VT.changeVectorElementType(SVT); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { - SDValue Load = - DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), - Ld->getMemOperand()->getFlags()); + SDValue Load = DAG.getExtLoad( + Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; } @@ -53518,7 +54434,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). if (Opcode == InOpcode) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0)); + return DAG.getNode(Opcode, DL, VT, In.getOperand(0)); // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) // -> EXTEND_VECTOR_INREG(X). @@ -53527,12 +54443,26 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) && In.getOperand(0).getOperand(0).getValueSizeInBits() == In.getValueSizeInBits()) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0)); + return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0)); - // Attempt to combine as a shuffle. - // TODO: General ZERO_EXTEND_VECTOR_INREG support. - if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG || - (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) { + // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0). + // TODO: Move to DAGCombine? + if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && + In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() && + In.getValueSizeInBits() == VT.getSizeInBits()) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits(); + EVT EltVT = In.getOperand(0).getValueType(); + SmallVector Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT)); + for (unsigned I = 0; I != NumElts; ++I) + Elts[I * Scale] = In.getOperand(I); + return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts)); + } + + // Attempt to combine as a shuffle on SSE41+ targets. 
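The BUILD_VECTOR fold above leans on x86's little-endian lane layout: zero-extending each narrow element in place yields exactly the byte image of the narrow elements interleaved with zero elements, so the extended vector can instead be built at the narrow type and bitcast to the wide one. A standalone check (assumes a little-endian host, which x86 is):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // ZERO_EXTEND_VECTOR_INREG: widen the low four u16 lanes to u32.
      uint16_t in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint32_t ext[4];
      for (int i = 0; i < 4; ++i)
        ext[i] = in[i];

      // Equivalent BUILD_VECTOR at the narrow type: interleave with zeros
      // (X,0,Y,0,...), then bitcast to the wide type.
      uint16_t interleaved[8] = {};
      for (int i = 0; i < 4; ++i)
        interleaved[2 * i] = in[i];

      assert(std::memcmp(ext, interleaved, sizeof(ext)) == 0);
      return 0;
    }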
+ if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG || + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) && + Subtarget.hasSSE41()) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; @@ -53549,11 +54479,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return DAG.getConstant(0, SDLoc(N), VT); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -53781,11 +54709,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::CONCAT_VECTORS: - return combineConcatVectors(N, DAG, DCI, Subtarget); + return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: - return combineInsertSubvector(N, DAG, DCI, Subtarget); + return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: - return combineExtractSubvector(N, DAG, DCI, Subtarget); + return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); @@ -54397,37 +55325,37 @@ TargetLowering::ConstraintWeight weight = CW_Register; break; case 'I': - if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; @@ -54439,14 +55367,14 @@ TargetLowering::ConstraintWeight } break; case 'e': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': - if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; @@ -54511,7 +55439,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, switch (ConstraintLetter) { default: break; case 'I': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': -
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54529,7 +55457,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'K': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54538,7 +55466,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'L': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), @@ -54548,7 +55476,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'M': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54557,7 +55485,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'N': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54566,7 +55494,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'O': - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54576,7 +55504,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'e': { // 32-bit signed value - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. @@ -54590,7 +55518,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } case 'Z': { // 32-bit unsigned value - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), @@ -54604,7 +55532,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } case 'i': { // Literal immediates are always ok. - if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { + if (auto *CST = dyn_cast<ConstantSDNode>(Op)) { bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) @@ -54617,8 +55545,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't - // be used as immediates. - if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) + // be used as immediates. BlockAddresses are fine though.
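For reference, the immediate ranges exercised by the two constraint-letter hunks above, gathered into one predicate. This is a simplified sketch of the x86 inline-asm constraints, not an LLVM interface; note that per the lowering code 'L' additionally admits 0xffffffff on 64-bit targets, which the sketch omits:

    #include <cassert>
    #include <cstdint>

    static bool fitsConstraint(char letter, int64_t v) {
      switch (letter) {
      case 'I': return v >= 0 && v <= 31;                // 32-bit shift counts
      case 'J': return v >= 0 && v <= 63;                // 64-bit shift counts
      case 'K': return v >= -0x80 && v <= 0x7f;          // signed 8-bit
      case 'L': return v == 0xff || v == 0xffff;         // zero-extending masks
      case 'M': return v >= 0 && v <= 3;                 // lea scale shifts
      case 'N': return v >= 0 && v <= 0xff;              // unsigned 8-bit (in/out)
      case 'O': return v >= 0 && v <= 0x7f;              // 0..127
      case 'e': return v >= INT32_MIN && v <= INT32_MAX; // sign-extended 32-bit
      case 'Z': return v >= 0 && v <= UINT32_MAX;        // zero-extended 32-bit
      default:  return false;
      }
    }

    int main() {
      assert(fitsConstraint('I', 31) && !fitsConstraint('I', 32));
      assert(fitsConstraint('K', -128) && !fitsConstraint('K', 128));
      assert(fitsConstraint('Z', 0xffffffffLL) && !fitsConstraint('e', 0x80000000LL));
      return 0;
    }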
+ if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) && + !isa(Op)) return; // If we are in non-pic codegen mode, we allow the address of a global (with diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3f6d567d3f4d..af110884049b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -249,9 +249,6 @@ namespace llvm { SCALEFS, SCALEFS_RND, - // Unsigned Integer average. - AVG, - /// Integer horizontal add/sub. HADD, HSUB, @@ -790,6 +787,9 @@ namespace llvm { LOR, LXOR, LAND, + LBTS, + LBTC, + LBTR, // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, @@ -1039,10 +1039,7 @@ namespace llvm { bool isCtlzFast() const override; - bool hasBitPreservingFPLogic(EVT VT) const override { - return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || - (VT == MVT::f16 && X86ScalarSSEf16); - } + bool hasBitPreservingFPLogic(EVT VT) const override; bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { // If the pair to store is a mixture of float and int values, we will @@ -1163,6 +1160,19 @@ namespace llvm { APInt &UndefElts, unsigned Depth) const override; + bool isTargetCanonicalConstantNode(SDValue Op) const override { + // Peek through bitcasts/extracts/inserts to see if we have a broadcast + // vector from memory. + while (Op.getOpcode() == ISD::BITCAST || + Op.getOpcode() == ISD::EXTRACT_SUBVECTOR || + (Op.getOpcode() == ISD::INSERT_SUBVECTOR && + Op.getOperand(0).isUndef())) + Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0); + + return Op.getOpcode() == X86ISD::VBROADCAST_LOAD || + TargetLowering::isTargetCanonicalConstantNode(Op); + } + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; @@ -1288,6 +1298,9 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, + EVT VT) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was @@ -1316,15 +1329,13 @@ namespace llvm { /// Returns true if lowering to a jump table is allowed. bool areJTsAllowed(const Function *Fn) const override; + MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. - bool ShouldShrinkFPConstant(EVT VT) const override { - // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more - // expensive than a straight movsd. On the other hand, it's important to - // shrink long double fp constant since fldt is very slow. - return !X86ScalarSSEf64 || VT == MVT::f80; - } + bool ShouldShrinkFPConstant(EVT VT) const override; /// Return true if we believe it is correct and profitable to reduce the /// load node to a smaller type. @@ -1333,11 +1344,7 @@ namespace llvm { /// Return true if the specified scalar FP type is computed in an SSE /// register, not on the X87 floating point stack. 
- bool isScalarFPTypeInSSEReg(EVT VT) const { - return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 - (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 - } + bool isScalarFPTypeInSSEReg(EVT VT) const; /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. @@ -1491,13 +1498,6 @@ namespace llvm { /// make the right decision when generating code for different targets. const X86Subtarget &Subtarget; - /// Select between SSE or x87 floating point ops. - /// When SSE is available, use it for f32 operations. - /// When SSE2 is available, use it for f64 operations. - bool X86ScalarSSEf32; - bool X86ScalarSSEf64; - bool X86ScalarSSEf16; - /// A list of legal FP immediates. std::vector LegalFPImmediates; @@ -1637,9 +1637,13 @@ namespace llvm { TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; + void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; @@ -1649,6 +1653,8 @@ namespace llvm { bool needsCmpXchgNb(Type *MemType) const; + template bool isSoftFP16(T VT) const; + void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp index e08b4b7c03c6..001aa2dcb879 100644 --- a/llvm/lib/Target/X86/X86IndirectThunks.cpp +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -31,6 +31,7 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 004e6fa5ebf4..08dc514a6476 100644 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -23,6 +23,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/ProfileData/SampleProf.h" diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index ff8710634e89..c098122685be 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -354,10 +354,9 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, // If the shift amount is guaranteed to be in-range we can replace it with a // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { + KnownBits KnownAmt = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmt.getMaxValue().ult(BitWidth)) { return (LogicalShift ? (ShiftLeft ? 
Builder.CreateShl(Vec, Amt) : Builder.CreateLShr(Vec, Amt)) : Builder.CreateAShr(Vec, Amt)); @@ -521,11 +520,10 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, // %int = bitcast <16 x i1> %cmp to i16 // %res = zext i16 %int to i32 unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); Type *IntegerTy = Builder.getIntNTy(NumElts); - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); + Res = Builder.CreateIsNeg(Res); Res = Builder.CreateBitCast(Res, IntegerTy); Res = Builder.CreateZExtOrTrunc(Res, ResTy); return Res; @@ -997,20 +995,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, II.getArgOperand(0)); } - if (MaskC->getValue().isShiftedMask()) { + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { // any single contingous sequence of 1s anywhere in the mask simply // describes a subset of the input bits shifted to the appropriate // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); Value *Input = II.getArgOperand(0); Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *Shifted = IC.Builder.CreateLShr(Masked, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); + Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); return IC.replaceInstUsesWith(II, Shifted); } - if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { uint64_t Src = SrcC->getZExtValue(); uint64_t Mask = MaskC->getZExtValue(); @@ -1042,15 +1038,15 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (MaskC->isAllOnesValue()) { return IC.replaceInstUsesWith(II, II.getArgOperand(0)); } - if (MaskC->getValue().isShiftedMask()) { + + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { // any single contingous sequence of 1s anywhere in the mask simply // describes a subset of the input bits shifted to the appropriate // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); Value *Input = II.getArgOperand(0); - Value *Shifted = IC.Builder.CreateShl(Input, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); + Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); return IC.replaceInstUsesWith(II, Masked); } @@ -1934,6 +1930,23 @@ Optional X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( break; } + // General per-element vector operations. 
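The pext/pdep combine above keys off the new two-argument isShiftedMask overload: for a single contiguous run of ones, bit extract and deposit collapse to shift-and-mask. A reference check using hand-rolled software models of the BMI2 PEXT/PDEP semantics (not compiler intrinsics):

    #include <cassert>
    #include <cstdint>

    // Software models of BMI2 PEXT/PDEP semantics.
    static uint32_t pext(uint32_t x, uint32_t m) {
      uint32_t r = 0;
      for (unsigned i = 0, j = 0; i < 32; ++i)
        if (m & (1u << i))
          r |= ((x >> i) & 1u) << j++;
      return r;
    }
    static uint32_t pdep(uint32_t x, uint32_t m) {
      uint32_t r = 0;
      for (unsigned i = 0, j = 0; i < 32; ++i)
        if (m & (1u << i))
          r |= ((x >> j++) & 1u) << i;
      return r;
    }

    int main() {
      uint32_t mask = 0x0ff0; // shifted mask: MaskIdx = 4, MaskLen = 8
      unsigned maskIdx = 4;
      for (uint32_t x = 0; x <= 0xffff; ++x) {
        assert(pext(x, mask) == ((x & mask) >> maskIdx)); // the and+lshr rewrite
        assert(pdep(x, mask) == ((x << maskIdx) & mask)); // the shl+and rewrite
      }
      return 0;
    }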
+ case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + UndefElts &= UndefElts2; + break; + } + case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_sse2_packuswb_128: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d825981a6b36..5da06bc87b06 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,18 +48,23 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8XD; // Pseduo instruction for RA. - def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), - [(int_x86_ldtilecfg_internal addr:$src)]>; + let isPseudo = true, mayLoad = 1, hasSideEffects = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), []>; + let isPseudo = true, mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let isPseudo = true, mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let isPseudo = true, mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in + let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, + canFoldAsLoad = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; @@ -67,9 +72,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp + let mayLoad = 1 in def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayStore = 1 in def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; @@ -99,7 +107,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in { } // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in { + let isPseudo = true, Constraints = "$src4 = $dst" in { def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), @@ -158,7 +166,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in { []>, VEX_4V, T8XS; // Pseduo instruction for RA. 
- let Constraints = "$src4 = $dst" in + let isPseudo = true, Constraints = "$src4 = $dst" in def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index bc67d1f89d7f..48da7b3ac882 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -476,6 +476,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; } @@ -508,25 +509,23 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; } -let Predicates = [HasFP16] in { -def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; -def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; -def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; -} - // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", @@ -535,12 +534,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, [(set VR128X:$dst, fp128imm0)]>; } -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in { - def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", - [(set FR16X:$dst, fp16imm0)]>; -} - //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -678,21 +671,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; // Codegen pattern with the alternative types insert VEC128 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; // Codegen pattern with the alternative types insert VEC256 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; multiclass vinsert_for_mask_cast; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, @@ -987,14 +980,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; // Codegen pattern with the alternative types extract VEC256 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, 
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; // A 128-bit extract from bits [255:128] of a 512-bit vector should use a @@ -1020,6 +1013,10 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), (v8i16 (VEXTRACTI128rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF128rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI128rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), @@ -1049,18 +1046,16 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), (v8i16 (VEXTRACTI32x4Z256rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF32x4Z256rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI32x4Z256rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), (iPTR 1)))>; } -let Predicates = [HasFP16, HasVLX] in -def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), - (v8f16 (VEXTRACTF32x4Z256rr - (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), - (iPTR 1)))>; - // Additional patterns for handling a bitcast between the vselect and the // extract_subvector. @@ -1478,7 +1473,7 @@ multiclass avx512_subvec_broadcast_rm_dq opc, string OpcodeStr, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI] in { def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZrm addr:$src)>; @@ -1487,7 +1482,7 @@ let Predicates = [HasFP16] in { def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))), (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; } -let Predicates = [HasVLX, HasFP16] in { +let Predicates = [HasVLX, HasBWI] in { def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZ128rm addr:$src)>; def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), @@ -3763,6 +3758,9 @@ let Predicates = [HasBWI, NoVLX] in { defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>; defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>; + + defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>; + defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>; } let Predicates = [HasAVX512] in { @@ -3852,7 +3850,7 @@ let Predicates = [HasVLX] in { def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI] in { def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), @@ -3887,7 +3885,7 @@ let Predicates = [HasFP16] in { def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } -let Predicates = [HasFP16, HasVLX] in { +let Predicates = [HasBWI, HasVLX] in { def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rrk 
VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), @@ -4099,14 +4097,14 @@ def : Pat<(f64 (bitconvert VK64:$src)), //===----------------------------------------------------------------------===// multiclass avx512_move_scalar prd = [HasAVX512, OptForSize]> { - let Predicates = prd in + X86VectorVTInfo _, Predicate prd = HasAVX512> { + let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + let Predicates = [prd] in { def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", @@ -4159,6 +4157,7 @@ multiclass avx512_move_scalar, EVEX, EVEX_K, Sched<[WriteFStore]>, NotMemoryFoldable; + } } defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>, @@ -4168,7 +4167,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, - [HasFP16]>, + HasFP16>, VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering; } -defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; -defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4353,6 +4347,12 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +let Predicates = [HasFP16] in { +defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; +defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, (v32i1 (insert_subvector (v32i1 immAllZerosV), @@ -4360,6 +4360,30 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, (iPTR 0))), (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), GR8, sub_8bit>; + +defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 
(bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk + (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), + VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; +} + defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4385,10 +4409,6 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4396,13 +4416,6 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (insert_subvector - (v32i1 immAllZerosV), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - (iPTR 0))), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - GR8, sub_8bit>; defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4428,16 +4441,6 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk - (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), - VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), @@ -5039,7 +5042,7 @@ defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; -defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; @@ -11651,6 +11654,14 @@ defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +// Always select FP16 instructions if available. 
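The fallback patterns below (kept behind the real AVX512-FP16 instructions by the negative AddedComplexity) treat an f16 value purely as a 16-bit payload moved through integer pinsrw/pextrw. A standalone model of that view, deliberately using uint16_t plus memcpy instead of any FP16 type:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // An f16 register modeled by its raw 16-bit payload; no FP16 arithmetic is
    // needed for load/store/bitcast, which is all these patterns cover.
    struct Half { uint16_t bits; };

    static Half loadHalf(const void *p) {    // f16 (load addr) via integer insert
      Half h;
      std::memcpy(&h.bits, p, sizeof h.bits);
      return h;
    }
    static void storeHalf(void *p, Half h) { // store f16 via integer extract
      std::memcpy(p, &h.bits, sizeof h.bits);
    }
    static uint16_t halfToBits(Half h) { return h.bits; }  // bitcast f16 -> i16
    static Half bitsToHalf(uint16_t b) { return Half{b}; } // bitcast i16 -> f16

    int main() {
      uint16_t mem = 0x3C00; // 1.0 in IEEE half precision
      Half h = loadHalf(&mem);
      uint16_t out = 0;
      storeHalf(&out, h);
      assert(out == mem);                               // round-trips bit-exactly
      assert(halfToBits(bitsToHalf(0x7E00)) == 0x7E00); // NaN payload preserved
      return 0;
    }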
+let Predicates = [HasBWI], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>; + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>; +} + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations //===----------------------------------------------------------------------===// @@ -12988,7 +12999,6 @@ def : Pat<(i16 (bitconvert FR16X:$src)), sub_16bit))>; def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>; -} // Allow "vmovw" to use GR64 let hasSideEffects = 0 in { @@ -12997,6 +13007,7 @@ let hasSideEffects = 0 in { def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; } +} // Convert 16-bit float to i16/u16 multiclass avx512_cvtph2w opc, string OpcodeStr, SDPatternOperator OpNode, diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 8337d2b37383..f08ecdf6afc9 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -541,7 +541,7 @@ class X86TypeInfo { + bit hasREX_W> { /// VT - This is the value type itself. ValueType VT = vt; @@ -596,9 +596,9 @@ class X86TypeInfo>", SDTIntLeaf,[],"<>">; @@ -634,7 +634,7 @@ class ITy opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, // Infer instruction prefixes from type info. let OpSize = typeinfo.OpSize; - let hasREX_WPrefix = typeinfo.HasREX_WPrefix; + let hasREX_W = typeinfo.HasREX_W; } // BinOpRR - Instructions like "add reg, reg, reg". diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 330b8c7a8a43..79ac2a2d8019 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -14,7 +14,7 @@ // CMOV instructions. 
let isCodeGenOnly = 1, ForceDisassemble = 1 in { -let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", +let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteCMOV] in { def CMOV16rr : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), @@ -35,7 +35,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB; } -let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", +let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst", SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { def CMOV16rm : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), @@ -52,7 +52,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), timm:$cond, EFLAGS))]>, TB; -} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst" } // isCodeGenOnly = 1, ForceDisassemble = 1 def inv_cond_XFORM : SDNodeXForm; def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS), diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 7288ce812138..a55b95960aa6 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -544,10 +544,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { // i8 register pressure. defm _GR8 : CMOVrr_PSEUDO; - let Predicates = [NoCMov] in { + let Predicates = [NoCMOV] in { defm _GR32 : CMOVrr_PSEUDO; defm _GR16 : CMOVrr_PSEUDO; - } // Predicates = [NoCMov] + } // Predicates = [NoCMOV] // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no // SSE1/SSE2. @@ -562,12 +562,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { let Predicates = [HasMMX] in defm _VR64 : CMOVrr_PSEUDO; - defm _FR16X : CMOVrr_PSEUDO; let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; - let Predicates = [HasSSE2,NoAVX512] in + let Predicates = [HasSSE2,NoAVX512] in { + defm _FR16 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; + } let Predicates = [HasAVX512] in { + defm _FR16X : CMOVrr_PSEUDO; defm _FR32X : CMOVrr_PSEUDO; defm _FR64X : CMOVrr_PSEUDO; } @@ -670,7 +672,7 @@ def OR32mi8Locked : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$zero), Requires<[Not64BitMode]>, OpSize32, LOCK, Sched<[WriteALURMW]>; -let hasSideEffects = 1 in +let hasSideEffects = 1, isMeta = 1 in def Int_MemBarrier : I<0, Pseudo, (outs), (ins), "#MEMBARRIER", [(X86MemBarrier)]>, Sched<[WriteLoad]>; @@ -839,6 +841,38 @@ let Predicates = [UseIncDec] in { def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>; } +// Atomic bit test. 
+def X86LBTest : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisVT<2, i8>, SDTCisVT<3, i32>]>; +def x86bts : SDNode<"X86ISD::LBTS", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86btc : SDNode<"X86ISD::LBTC", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86btr : SDNode<"X86ISD::LBTR", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; + +multiclass ATOMIC_LOGIC_OP { + let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteBitTestSetRegRMW] in { + def 16m : Ii8<0xBA, Form, (outs), (ins i16mem:$src1, i8imm:$src2), + !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 16)))]>, + OpSize16, TB, LOCK; + def 32m : Ii8<0xBA, Form, (outs), (ins i32mem:$src1, i8imm:$src2), + !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 32)))]>, + OpSize32, TB, LOCK; + def 64m : RIi8<0xBA, Form, (outs), (ins i64mem:$src1, i8imm:$src2), + !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 64)))]>, + TB, LOCK; + } +} + +defm LOCK_BTS : ATOMIC_LOGIC_OP; +defm LOCK_BTC : ATOMIC_LOGIC_OP; +defm LOCK_BTR : ATOMIC_LOGIC_OP; + // Atomic compare and swap. multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { @@ -863,7 +897,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX8], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, usesCustomInserter = 1 in { def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), "cmpxchg8b\t$ptr", @@ -871,7 +905,7 @@ def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), } let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), "cmpxchg16b\t$ptr", @@ -898,7 +932,7 @@ def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), // the instruction and we are sure we will have a valid register to restore // the value of RBX. let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$rbx_save = $dst" in { @@ -910,7 +944,7 @@ def LCMPXCHG16B_SAVE_RBX : // Pseudo instruction that doesn't read/write RBX. Will be turned into either // LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. 
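The cmpxchg16b patterns around here change only their gating predicate (HasCmpxchg16b to HasCX16); the operation they implement is a 16-byte compare-and-swap. For reference, the same operation at the C++ level; whether this inlines to cmpxchg16b (expected in RDX:RAX, desired in RCX:RBX) rather than calling libatomic depends on building with -mcx16 and on the standard library:

#include <atomic>
#include <cassert>
#include <cstdint>

// A 16-byte lock-free CAS candidate: trivially copyable, 16-byte aligned.
struct alignas(16) Pair {
  uint64_t Lo, Hi;
};

int main() {
  std::atomic<Pair> P{{1, 2}};
  Pair Expected{1, 2};
  Pair Desired{3, 4};
  // Succeeds because the stored value matches Expected bitwise.
  assert(P.compare_exchange_strong(Expected, Desired));
  Pair Now = P.load();
  assert(Now.Lo == 3 && Now.Hi == 4);
  return 0;
}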
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0, usesCustomInserter = 1 in { @@ -1235,6 +1269,21 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), return true; }]>; +def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 1; + const SDValue& BasePtr = cast(N->getOperand(1))->getBasePtr(); + if (isa(BasePtr)) + NumRegs = 3; + else if (BasePtr->getNumOperands() && isa(BasePtr->getOperand(0))) + NumRegs = 3; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa(N->getOperand(i)) && ( NumRegs-- == 0)) + return false; + return true; +}]>; + def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; @@ -1242,7 +1291,8 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. -def : Pat<(X86tcret (load addr:$dst), timm:$off), +// Similar to X86tcret_6regs, here we only have 1 register left +def : Pat<(X86tcret_1reg (load addr:$dst), timm:$off), (TCRETURNmi addr:$dst, timm:$off)>, Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>; @@ -1466,6 +1516,21 @@ def ADD64ri32_DB : I<0, Pseudo, } } // AddedComplexity, SchedRW +//===----------------------------------------------------------------------===// +// Pattern match XOR as ADD +//===----------------------------------------------------------------------===// + +// Prefer to pattern match XOR with min_signed_value as ADD at isel time. +// ADD can be 3-addressified into an LEA instruction to avoid copies. +let AddedComplexity = 5 in { +def : Pat<(xor GR8:$src1, -128), + (ADD8ri GR8:$src1, -128)>; +def : Pat<(xor GR16:$src1, -32768), + (ADD16ri GR16:$src1, -32768)>; +def : Pat<(xor GR32:$src1, -2147483648), + (ADD32ri GR32:$src1, -2147483648)>; +} + //===----------------------------------------------------------------------===// // Pattern match SUB as XOR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 6d969962afff..aa89a6f0ff9d 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -147,7 +147,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // Win64 wants indirect jumps leaving the function to have a REX_W prefix. // These are switched from TAILJMPr/m64_REX in MCInstLower. - let isCodeGenOnly = 1, hasREX_WPrefix = 1 in { + let isCodeGenOnly = 1, hasREX_W = 1 in { def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>; let mayLoad = 1 in @@ -384,7 +384,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, []>, Sched<[WriteJumpLd]>; // Win64 wants indirect jumps leaving the function to have a REX_W prefix. 
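Returning to the "Pattern match XOR as ADD" block above: the rewrite rests on a two's-complement identity. Flipping only the sign bit is the same as adding it, because the carry out of the top bit is discarded, and once written as ADD the operation can later be three-addressified into LEA. A standalone check of the 32-bit case:

#include <cassert>
#include <cstdint>

// Verify x ^ 0x80000000 == x + 0x80000000 for a few representative values;
// unsigned wraparound makes the addition well defined.
int main() {
  const uint32_t SignBit = 0x80000000u;
  for (uint32_t X : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xdeadbeefu}) {
    assert((X ^ SignBit) == (X + SignBit));
  }
  return 0;
}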
- let hasREX_WPrefix = 1 in { + let hasREX_W = 1 in { def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), []>, Sched<[WriteJump]>; diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td index e310f369be08..a68d61043c5c 100644 --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -423,9 +423,9 @@ def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; // Floating point cmovs. class FpIf32CMov pattern> : - FpI_, Requires<[FPStackf32, HasCMov]>; + FpI_, Requires<[FPStackf32, HasCMOV]>; class FpIf64CMov pattern> : - FpI_, Requires<[FPStackf64, HasCMov]>; + FpI_, Requires<[FPStackf64, HasCMOV]>; multiclass FPCMov { def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), @@ -440,7 +440,7 @@ multiclass FPCMov { CondMovFP, [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, cc, EFLAGS))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; } let SchedRW = [WriteFCMOV] in { @@ -455,7 +455,7 @@ defm CMOVNE : FPCMov; defm CMOVNP : FPCMov; } // Uses = [EFLAGS], Constraints = "$src1 = $dst" -let Predicates = [HasCMov] in { +let Predicates = [HasCMOV] in { // These are not factored because there's no clean way to pass DA/DB. def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op), "fcmovb\t{$op, %st|st, $op}">; @@ -473,7 +473,7 @@ def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op), "fcmovne\t{$op, %st|st, $op}">; def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op), "fcmovnu\t{$op, %st|st, $op}">; -} // Predicates = [HasCMov] +} // Predicates = [HasCMOV] } // SchedRW let mayRaiseFPException = 1 in { @@ -664,22 +664,22 @@ let SchedRW = [WriteFCom], mayRaiseFPException = 1 in { let Defs = [EFLAGS, FPSW], Uses = [FPCW] in { def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>, - Requires<[FPStackf32, HasCMov]>; + Requires<[FPStackf32, HasCMOV]>; def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>, - Requires<[FPStackf64, HasCMov]>; + Requires<[FPStackf64, HasCMOV]>; def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>, - Requires<[FPStackf32, HasCMov]>; + Requires<[FPStackf32, HasCMOV]>; def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>, - Requires<[FPStackf64, HasCMov]>; + Requires<[FPStackf64, HasCMOV]>; def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; } let Uses = [ST0, FPCW] in { diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 226349485238..27220a8d4d99 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -292,8 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD }, - { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | 
TB_NO_REVERSE }, + { X86::MMX_MOVD64from64rr, X86::MMX_MOVQ64mr, TB_FOLDED_STORE }, + { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index 0e7033fc233a..3a44b4570e9b 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -196,7 +196,7 @@ class OpSize32 { OperandSize OpSize = OpSize32; } class AdSize16 { AddressSize AdSize = AdSize16; } class AdSize32 { AddressSize AdSize = AdSize32; } class AdSize64 { AddressSize AdSize = AdSize64; } -class REX_W { bit hasREX_WPrefix = 1; } +class REX_W { bit hasREX_W = 1; } class LOCK { bit hasLockPrefix = 1; } class REP { bit hasREPPrefix = 1; } class TB { Map OpMap = TB; } @@ -316,7 +316,7 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bits<3> OpPrefixBits = OpPrefix.Value; Map OpMap = OB; // Which opcode map does this inst have? bits<4> OpMapBits = OpMap.Value; - bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + bit hasREX_W = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? Domain ExeDomain = d; @@ -375,7 +375,7 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, // No need for 3rd bit, we don't need to distinguish NoPrfx from PS. let TSFlags{12-11} = OpPrefixBits{1-0}; let TSFlags{16-13} = OpMapBits; - let TSFlags{17} = hasREX_WPrefix; + let TSFlags{17} = hasREX_W; let TSFlags{21-18} = ImmT.Value; let TSFlags{24-22} = FPForm.Value; let TSFlags{25} = hasLockPrefix; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 166f1f8c3251..57ba4683c6a4 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -287,7 +287,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<2, 1>]>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4dcd886fa3b2..ec32ac2acad1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -25,13 +25,16 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -137,298 +140,70 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - // By default, assume that the instruction is not data invariant. 
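// For context (an illustrative gloss, not text from the patch): "data
// invariant" means the instruction's latency and resource usage do not
// depend on the values of its operands, which is the property
// constant-time code such as crypto relies on. That is why
// value-dependent-latency instructions like DIV never appear below, and
// why the exhaustive opcode switch can be collapsed into the generated
// X86::is<Mnemonic>() predicates that the new body uses.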
+ if (MI.mayLoad() || MI.mayStore()) return false; - // Some target-independent operations that trivially lower to data-invariant - // instructions. - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: + // Some target-independent operations that trivially lower to data-invariant + // instructions. + if (MI.isCopyLike() || MI.isInsertSubreg()) return true; + unsigned Opcode = MI.getOpcode(); + using namespace X86; // On x86 it is believed that imul is constant time w.r.t. the loaded data. // However, they set flags and are perhaps the most surprisingly constant // time operations so we call them out here separately. - case X86::IMUL16rr: - case X86::IMUL16rri8: - case X86::IMUL16rri: - case X86::IMUL32rr: - case X86::IMUL32rri8: - case X86::IMUL32rri: - case X86::IMUL64rr: - case X86::IMUL64rri32: - case X86::IMUL64rri8: - + if (isIMUL(Opcode)) + return true; // Bit scanning and counting instructions that are somewhat surprisingly // constant time as they scan across bits and do other fairly complex // operations like popcnt, but are believed to be constant time on x86. // However, these set flags. - case X86::BSF16rr: - case X86::BSF32rr: - case X86::BSF64rr: - case X86::BSR16rr: - case X86::BSR32rr: - case X86::BSR64rr: - case X86::LZCNT16rr: - case X86::LZCNT32rr: - case X86::LZCNT64rr: - case X86::POPCNT16rr: - case X86::POPCNT32rr: - case X86::POPCNT64rr: - case X86::TZCNT16rr: - case X86::TZCNT32rr: - case X86::TZCNT64rr: - + if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) || + isTZCNT(Opcode)) + return true; // Bit manipulation instructions are effectively combinations of basic // arithmetic ops, and should still execute in constant time. These also // set flags. - case X86::BLCFILL32rr: - case X86::BLCFILL64rr: - case X86::BLCI32rr: - case X86::BLCI64rr: - case X86::BLCIC32rr: - case X86::BLCIC64rr: - case X86::BLCMSK32rr: - case X86::BLCMSK64rr: - case X86::BLCS32rr: - case X86::BLCS64rr: - case X86::BLSFILL32rr: - case X86::BLSFILL64rr: - case X86::BLSI32rr: - case X86::BLSI64rr: - case X86::BLSIC32rr: - case X86::BLSIC64rr: - case X86::BLSMSK32rr: - case X86::BLSMSK64rr: - case X86::BLSR32rr: - case X86::BLSR64rr: - case X86::TZMSK32rr: - case X86::TZMSK64rr: - + if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) || + isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) || + isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) || + isTZMSK(Opcode)) + return true; // Bit extracting and clearing instructions should execute in constant time, // and set flags. - case X86::BEXTR32rr: - case X86::BEXTR64rr: - case X86::BEXTRI32ri: - case X86::BEXTRI64ri: - case X86::BZHI32rr: - case X86::BZHI64rr: - + if (isBEXTR(Opcode) || isBZHI(Opcode)) + return true; // Shift and rotate. 
- case X86::ROL8r1: - case X86::ROL16r1: - case X86::ROL32r1: - case X86::ROL64r1: - case X86::ROL8rCL: - case X86::ROL16rCL: - case X86::ROL32rCL: - case X86::ROL64rCL: - case X86::ROL8ri: - case X86::ROL16ri: - case X86::ROL32ri: - case X86::ROL64ri: - case X86::ROR8r1: - case X86::ROR16r1: - case X86::ROR32r1: - case X86::ROR64r1: - case X86::ROR8rCL: - case X86::ROR16rCL: - case X86::ROR32rCL: - case X86::ROR64rCL: - case X86::ROR8ri: - case X86::ROR16ri: - case X86::ROR32ri: - case X86::ROR64ri: - case X86::SAR8r1: - case X86::SAR16r1: - case X86::SAR32r1: - case X86::SAR64r1: - case X86::SAR8rCL: - case X86::SAR16rCL: - case X86::SAR32rCL: - case X86::SAR64rCL: - case X86::SAR8ri: - case X86::SAR16ri: - case X86::SAR32ri: - case X86::SAR64ri: - case X86::SHL8r1: - case X86::SHL16r1: - case X86::SHL32r1: - case X86::SHL64r1: - case X86::SHL8rCL: - case X86::SHL16rCL: - case X86::SHL32rCL: - case X86::SHL64rCL: - case X86::SHL8ri: - case X86::SHL16ri: - case X86::SHL32ri: - case X86::SHL64ri: - case X86::SHR8r1: - case X86::SHR16r1: - case X86::SHR32r1: - case X86::SHR64r1: - case X86::SHR8rCL: - case X86::SHR16rCL: - case X86::SHR32rCL: - case X86::SHR64rCL: - case X86::SHR8ri: - case X86::SHR16ri: - case X86::SHR32ri: - case X86::SHR64ri: - case X86::SHLD16rrCL: - case X86::SHLD32rrCL: - case X86::SHLD64rrCL: - case X86::SHLD16rri8: - case X86::SHLD32rri8: - case X86::SHLD64rri8: - case X86::SHRD16rrCL: - case X86::SHRD32rrCL: - case X86::SHRD64rrCL: - case X86::SHRD16rri8: - case X86::SHRD32rri8: - case X86::SHRD64rri8: - + if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) || + isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode)) + return true; // Basic arithmetic is constant time on the input but does set flags. - case X86::ADC8rr: - case X86::ADC8ri: - case X86::ADC16rr: - case X86::ADC16ri: - case X86::ADC16ri8: - case X86::ADC32rr: - case X86::ADC32ri: - case X86::ADC32ri8: - case X86::ADC64rr: - case X86::ADC64ri8: - case X86::ADC64ri32: - case X86::ADD8rr: - case X86::ADD8ri: - case X86::ADD16rr: - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD32rr: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD64rr: - case X86::ADD64ri8: - case X86::ADD64ri32: - case X86::AND8rr: - case X86::AND8ri: - case X86::AND16rr: - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND32rr: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND64rr: - case X86::AND64ri8: - case X86::AND64ri32: - case X86::OR8rr: - case X86::OR8ri: - case X86::OR16rr: - case X86::OR16ri: - case X86::OR16ri8: - case X86::OR32rr: - case X86::OR32ri: - case X86::OR32ri8: - case X86::OR64rr: - case X86::OR64ri8: - case X86::OR64ri32: - case X86::SBB8rr: - case X86::SBB8ri: - case X86::SBB16rr: - case X86::SBB16ri: - case X86::SBB16ri8: - case X86::SBB32rr: - case X86::SBB32ri: - case X86::SBB32ri8: - case X86::SBB64rr: - case X86::SBB64ri8: - case X86::SBB64ri32: - case X86::SUB8rr: - case X86::SUB8ri: - case X86::SUB16rr: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB32rr: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB64rr: - case X86::SUB64ri8: - case X86::SUB64ri32: - case X86::XOR8rr: - case X86::XOR8ri: - case X86::XOR16rr: - case X86::XOR16ri: - case X86::XOR16ri8: - case X86::XOR32rr: - case X86::XOR32ri: - case X86::XOR32ri8: - case X86::XOR64rr: - case X86::XOR64ri8: - case X86::XOR64ri32: + if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) || + isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode)) + return true; // Arithmetic with 
just 32-bit and 64-bit variants and no immediates. - case X86::ADCX32rr: - case X86::ADCX64rr: - case X86::ADOX32rr: - case X86::ADOX64rr: - case X86::ANDN32rr: - case X86::ANDN64rr: + if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode)) + return true; // Unary arithmetic operations. - case X86::DEC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::INC8r: - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::NEG8r: - case X86::NEG16r: - case X86::NEG32r: - case X86::NEG64r: - + if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode)) + return true; // Unlike other arithmetic, NOT doesn't set EFLAGS. - case X86::NOT8r: - case X86::NOT16r: - case X86::NOT32r: - case X86::NOT64r: - + if (isNOT(Opcode)) + return true; // Various move instructions used to zero or sign extend things. Note that we // intentionally don't support the _NOREX variants as we can't handle that // register constraint anyways. - case X86::MOVSX16rr8: - case X86::MOVSX32rr8: - case X86::MOVSX32rr16: - case X86::MOVSX64rr8: - case X86::MOVSX64rr16: - case X86::MOVSX64rr32: - case X86::MOVZX16rr8: - case X86::MOVZX32rr8: - case X86::MOVZX32rr16: - case X86::MOVZX64rr8: - case X86::MOVZX64rr16: - case X86::MOV32rr: - + if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode)) + return true; // Arithmetic instructions that are both constant time and don't set flags. - case X86::RORX32ri: - case X86::RORX64ri: - case X86::SARX32rr: - case X86::SARX64rr: - case X86::SHLX32rr: - case X86::SHLX64rr: - case X86::SHRX32rr: - case X86::SHRX64rr: - + if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode)) + return true; // LEA doesn't actually access memory, and its arithmetic is constant time. - case X86::LEA16r: - case X86::LEA32r: - case X86::LEA64_32r: - case X86::LEA64r: + if (isLEA(Opcode)) return true; - } + // By default, assume that the instruction is not data invariant. + return false; } bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { @@ -990,6 +765,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: + case X86::FsFLD0SH: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: @@ -1192,6 +968,102 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { return ShAmt < 4 && ShAmt > 0; } +static bool findRedundantFlagInstr(MachineInstr &CmpInstr, + MachineInstr &CmpValDefInstr, + const MachineRegisterInfo *MRI, + MachineInstr **AndInstr, + const TargetRegisterInfo *TRI, + bool &NoSignFlag, bool &ClearsOverflowFlag) { + if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG) + return false; + + if (CmpInstr.getOpcode() != X86::TEST64rr) + return false; + + // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare` + // guarantees that it's analyzable only if two registers are identical. + assert( + (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) && + "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` " + "requires two reg operands are the same."); + + // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that + // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case + // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is + // redundant. 
+  assert( +      (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) && +      "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG."); + +  // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically +  // 0. +  if (CmpValDefInstr.getOperand(1).getImm() != 0) +    return false; + +  // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically +  // sub_32bit or sub_xmm. +  if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit) +    return false; + +  MachineInstr *VregDefInstr = +      MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg()); + +  assert(VregDefInstr && "Must have a definition (SSA)"); + +  // Requires that `CmpValDefInstr` and `VregDefInstr` are from the same MBB +  // to simplify the subsequent analysis. +  // +  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of +  // `CmpValDefInstr.getParent()`, this could be handled. +  if (VregDefInstr->getParent() != CmpValDefInstr.getParent()) +    return false; + +  if (X86::isAND(VregDefInstr->getOpcode())) { +    // Get a sequence of instructions like +    //   %reg = and* ...                    // Set EFLAGS +    //   ...                                // EFLAGS not changed +    //   %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit +    //   test64rr %extended_reg, %extended_reg, implicit-def $eflags +    // +    // If subsequent readers use a subset of bits that don't change +    // after `and*` instructions, it's likely that the test64rr could +    // be optimized away. +    for (const MachineInstr &Instr : +         make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)), +                    MachineBasicBlock::iterator(CmpValDefInstr))) { +      // Bail out if any instruction between 'VregDefInstr' and +      // 'CmpValDefInstr' modifies EFLAGS. +      if (Instr.modifiesRegister(X86::EFLAGS, TRI)) +        return false; +    } + +    *AndInstr = VregDefInstr; + +    // The AND instruction will essentially update SF and clear OF, so +    // NoSignFlag should be false in the sense that SF is modified by `AND`. +    // +    // However, the implementation artificially sets `NoSignFlag` to true +    // to poison the SF bit; that is to say, if SF is looked at later, the +    // optimization (to erase TEST64rr) will be disabled. +    // +    // The reason to poison the SF bit is that its value could be different +    // in the `AND` and `TEST` operations; the sign bit is not known for +    // `AND`, and is known to be 0 as a result of `TEST64rr`. +    // +    // FIXME: As opposed to poisoning the SF bit directly, consider peeking into +    // the AND instruction and using the static information to guide peephole +    // optimization if possible. For example, it's possible to fold a +    // conditional move into a copy if the relevant EFLAGS bits could be deduced +    // from an immediate operand of the AND operation. +    // +    NoSignFlag = true; +    // ClearsOverflowFlag is true for the AND operation (no surprise).
+ ClearsOverflowFlag = true; + return true; + } + return false; +} + bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, @@ -1314,8 +1186,11 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); - MIB.addReg(0).addImm(1ULL << ShAmt) - .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); + MIB.addReg(0) + .addImm(1LL << ShAmt) + .addReg(InRegLEA, RegState::Kill) + .addImm(0) + .addReg(0); break; } case X86::INC8r: @@ -1478,7 +1353,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .add(Src) .addImm(0) .addReg(0); @@ -1502,7 +1377,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .addReg(SrcReg, getKillRegState(isKill)) .addImm(0) .addReg(0); @@ -1957,14 +1832,13 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( FMAForms[0] = FMA3Group.get132Opcode(); FMAForms[1] = FMA3Group.get213Opcode(); FMAForms[2] = FMA3Group.get231Opcode(); - unsigned FormIndex; - for (FormIndex = 0; FormIndex < 3; FormIndex++) - if (Opc == FMAForms[FormIndex]) - break; // Everything is ready, just adjust the FMA opcode and return it. - FormIndex = FormMapping[Case][FormIndex]; - return FMAForms[FormIndex]; + for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + return FMAForms[FormMapping[Case][FormIndex]]; + + llvm_unreachable("Illegal FMA3 format"); } static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, @@ -2141,7 +2015,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if ((MI.getOperand(3).getImm() ^ Mask) == 1) { auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); @@ -2238,7 +2112,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::MOVSDrr)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -2813,34 +2687,37 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const { return false; } +int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) { + unsigned Opcode = MCID.getOpcode(); + if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode))) + return -1; + // Assume that condition code is always the last use operand. 
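// A worked instance of the operand arithmetic below (illustrative, not
// from the patch): CMOV32rr is (outs GR32:$dst),
// (ins GR32:$src1, GR32:$src2, ccode:$cond), giving NumOperands = 4 and
// NumDefs = 1, so the source operand number returned is 3 - 1 = 2;
// getCondFromMI then adds NumDefs back and reads MI.getOperand(3), the
// ccode immediate. For JCC_1, (outs), (ins brtarget:$dst, ccode:$cond)
// gives 2 - 0 - 1 = 1.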
+ unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs(); + return NumUses - 1; +} + +X86::CondCode X86::getCondFromMI(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + int CondNo = getCondSrcNoFromDesc(MCID); + if (CondNo < 0) + return X86::COND_INVALID; + CondNo += MCID.getNumDefs(); + return static_cast(MI.getOperand(CondNo).getImm()); +} + X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::JCC_1: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a SETCC opcode. X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::SETCCr: case X86::SETCCm: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: - case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } /// Return the inverse of the specified condition, @@ -3166,8 +3043,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // If the block has any instructions after a JMP, delete them. - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; @@ -3464,7 +3340,7 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!Subtarget.hasCMov()) + if (!Subtarget.canUseCMOV()) return false; if (Cond.size() != 1) return false; @@ -3708,10 +3584,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; - if (X86::FR16XRegClass.hasSubClassEq(RC)) { - assert(STI.hasFP16()); - return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; - } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -3739,6 +3611,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg, X86::VK8PAIRRegClass.hasSubClassEq(RC) || X86::VK16PAIRRegClass.hasSubClassEq(RC)) return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; + if ((X86::FR16RegClass.hasSubClassEq(RC) || + X86::FR16XRegClass.hasSubClassEq(RC)) && + STI.hasFP16()) + return load ? 
X86::VMOVSHZrm_alt : X86::VMOVSHZmr; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) @@ -3845,6 +3721,35 @@ X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, return AM; } +bool X86InstrInfo::verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const { + Optional AMOrNone = getAddrModeFromMemoryOp(MI, nullptr); + if (!AMOrNone) + return true; + + ExtAddrMode AM = *AMOrNone; + + if (AM.ScaledReg != X86::NoRegister) { + switch (AM.Scale) { + case 1: + case 2: + case 4: + case 8: + break; + default: + ErrInfo = "Scale factor in address must be 1, 2, 4 or 8"; + return false; + } + } + if (!isInt<32>(AM.Displacement)) { + ErrInfo = "Displacement in address must fit into 32-bit signed " + "integer"; + return false; + } + + return true; +} + bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { @@ -3949,12 +3854,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = @@ -3963,6 +3868,14 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr + : Subtarget.hasAVX() ? X86::VMOVSSmr + : X86::MOVSSmr; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3991,6 +3904,14 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm + : Subtarget.hasAVX() ? X86::VMOVSSrm + : X86::MOVSSrm; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4375,7 +4296,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } CmpInstr.setDesc(get(NewOpcode)); - CmpInstr.RemoveOperand(0); + CmpInstr.removeOperand(0); // Mutating this instruction invalidates any debug data associated with it. CmpInstr.dropDebugNumber(); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. @@ -4423,6 +4344,23 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, MI = &Inst; break; } + + // Look back for the following pattern, in which case the test64rr + // instruction could be erased. + // + // Example: + // %reg = and32ri %in_reg, 5 + // ... 
// EFLAGS not changed. + // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index + // test64rr %src_reg, %src_reg, implicit-def $eflags + MachineInstr *AndInstr = nullptr; + if (IsCmpZero && + findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI, + NoSignFlag, ClearsOverflowFlag)) { + assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode())); + MI = AndInstr; + break; + } // Cannot find other candidates before definition of SrcReg. return false; } @@ -4524,6 +4462,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: + // If SF is used, but the instruction doesn't update the SF, then we + // can't do the optimization. + if (NoSignFlag) + return false; + LLVM_FALLTHROUGH; case X86::COND_O: case X86::COND_NO: // If OF is used, the instruction needs to clear it like CmpZero does. if (!ClearsOverflowFlag) @@ -4811,7 +4754,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP32r)); } - MIB->RemoveOperand(1); + MIB->removeOperand(1); MIB->addImplicitDefUseOperands(*MBB.getParent()); // Build CFI if necessary. @@ -4918,7 +4861,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { MIB->setDesc(Desc); int64_t ShiftAmt = MIB->getOperand(2).getImm(); // Temporarily remove the immediate so we can add another source register. - MIB->RemoveOperand(2); + MIB->removeOperand(2); // Add the register. Don't copy the kill flag if there is one. MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); @@ -4949,6 +4892,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0SH: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { @@ -5026,7 +4970,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; - MI.RemoveOperand(1); + MI.removeOperand(1); MIB->setDesc(get(Opc)); // VPTERNLOG needs 3 register inputs and an immediate. // 0xff will return 1s for any input. 
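The VPTERNLOG expansion above works because the instruction's 8-bit immediate is a three-input truth table applied independently at every bit position, so an immediate of 0xff produces all-ones whatever the inputs. A standalone sketch of that encoding (the ternlog helper is illustrative):

#include <cassert>
#include <cstdint>

// Per bit position, result = (Imm >> Idx) & 1 with
// Idx = (a << 2) | (b << 1) | c, so Imm = 0xff is all-ones regardless
// of the inputs.
static uint64_t ternlog(uint64_t A, uint64_t B, uint64_t C, uint8_t Imm) {
  uint64_t R = 0;
  for (unsigned Bit = 0; Bit < 64; ++Bit) {
    unsigned Idx = (((A >> Bit) & 1) << 2) | (((B >> Bit) & 1) << 1) |
                   ((C >> Bit) & 1);
    R |= uint64_t((Imm >> Idx) & 1) << Bit;
  }
  return R;
}

int main() {
  assert(ternlog(0x1234, 0x5678, 0x9abc, 0xff) == ~uint64_t(0));
  assert(ternlog(0b1100, 0b1010, 0b1001, 0xe8) == 0b1000); // 0xe8 = majority
  return 0;
}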
@@ -5165,6 +5109,255 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; + case X86::VFCMULCPHZ128rm: + case X86::VFCMULCPHZ128rmb: + case X86::VFCMULCPHZ128rmbkz: + case X86::VFCMULCPHZ128rmkz: + case X86::VFCMULCPHZ128rr: + case X86::VFCMULCPHZ128rrkz: + case X86::VFCMULCPHZ256rm: + case X86::VFCMULCPHZ256rmb: + case X86::VFCMULCPHZ256rmbkz: + case X86::VFCMULCPHZ256rmkz: + case X86::VFCMULCPHZ256rr: + case X86::VFCMULCPHZ256rrkz: + case X86::VFCMULCPHZrm: + case X86::VFCMULCPHZrmb: + case X86::VFCMULCPHZrmbkz: + case X86::VFCMULCPHZrmkz: + case X86::VFCMULCPHZrr: + case X86::VFCMULCPHZrrb: + case X86::VFCMULCPHZrrbkz: + case X86::VFCMULCPHZrrkz: + case X86::VFMULCPHZ128rm: + case X86::VFMULCPHZ128rmb: + case X86::VFMULCPHZ128rmbkz: + case X86::VFMULCPHZ128rmkz: + case X86::VFMULCPHZ128rr: + case X86::VFMULCPHZ128rrkz: + case X86::VFMULCPHZ256rm: + case X86::VFMULCPHZ256rmb: + case X86::VFMULCPHZ256rmbkz: + case X86::VFMULCPHZ256rmkz: + case X86::VFMULCPHZ256rr: + case X86::VFMULCPHZ256rrkz: + case X86::VFMULCPHZrm: + case X86::VFMULCPHZrmb: + case X86::VFMULCPHZrmbkz: + case X86::VFMULCPHZrmkz: + case X86::VFMULCPHZrr: + case X86::VFMULCPHZrrb: + case X86::VFMULCPHZrrbkz: + case X86::VFMULCPHZrrkz: + case X86::VFCMULCSHZrm: + case X86::VFCMULCSHZrmkz: + case X86::VFCMULCSHZrr: + case X86::VFCMULCSHZrrb: + case X86::VFCMULCSHZrrbkz: + case X86::VFCMULCSHZrrkz: + case X86::VFMULCSHZrm: + case X86::VFMULCSHZrmkz: + case X86::VFMULCSHZrr: + case X86::VFMULCSHZrrb: + case X86::VFMULCSHZrrbkz: + case X86::VFMULCSHZrrkz: + return Subtarget.hasMULCFalseDeps(); + case X86::VPERMDYrm: + case X86::VPERMDYrr: + case X86::VPERMQYmi: + case X86::VPERMQYri: + case X86::VPERMPSYrm: + case X86::VPERMPSYrr: + case X86::VPERMPDYmi: + case X86::VPERMPDYri: + case X86::VPERMDZ256rm: + case X86::VPERMDZ256rmb: + case X86::VPERMDZ256rmbkz: + case X86::VPERMDZ256rmkz: + case X86::VPERMDZ256rr: + case X86::VPERMDZ256rrkz: + case X86::VPERMDZrm: + case X86::VPERMDZrmb: + case X86::VPERMDZrmbkz: + case X86::VPERMDZrmkz: + case X86::VPERMDZrr: + case X86::VPERMDZrrkz: + case X86::VPERMQZ256mbi: + case X86::VPERMQZ256mbikz: + case X86::VPERMQZ256mi: + case X86::VPERMQZ256mikz: + case X86::VPERMQZ256ri: + case X86::VPERMQZ256rikz: + case X86::VPERMQZ256rm: + case X86::VPERMQZ256rmb: + case X86::VPERMQZ256rmbkz: + case X86::VPERMQZ256rmkz: + case X86::VPERMQZ256rr: + case X86::VPERMQZ256rrkz: + case X86::VPERMQZmbi: + case X86::VPERMQZmbikz: + case X86::VPERMQZmi: + case X86::VPERMQZmikz: + case X86::VPERMQZri: + case X86::VPERMQZrikz: + case X86::VPERMQZrm: + case X86::VPERMQZrmb: + case X86::VPERMQZrmbkz: + case X86::VPERMQZrmkz: + case X86::VPERMQZrr: + case X86::VPERMQZrrkz: + case X86::VPERMPSZ256rm: + case X86::VPERMPSZ256rmb: + case X86::VPERMPSZ256rmbkz: + case X86::VPERMPSZ256rmkz: + case X86::VPERMPSZ256rr: + case X86::VPERMPSZ256rrkz: + case X86::VPERMPSZrm: + case X86::VPERMPSZrmb: + case X86::VPERMPSZrmbkz: + case X86::VPERMPSZrmkz: + case X86::VPERMPSZrr: + case X86::VPERMPSZrrkz: + case X86::VPERMPDZ256mbi: + case X86::VPERMPDZ256mbikz: + case X86::VPERMPDZ256mi: + case X86::VPERMPDZ256mikz: + case X86::VPERMPDZ256ri: + case X86::VPERMPDZ256rikz: + case X86::VPERMPDZ256rm: + case X86::VPERMPDZ256rmb: + case X86::VPERMPDZ256rmbkz: + case X86::VPERMPDZ256rmkz: + case X86::VPERMPDZ256rr: + case X86::VPERMPDZ256rrkz: + case X86::VPERMPDZmbi: + case X86::VPERMPDZmbikz: + case X86::VPERMPDZmi: + case X86::VPERMPDZmikz: + case X86::VPERMPDZri: + case 
X86::VPERMPDZrikz: + case X86::VPERMPDZrm: + case X86::VPERMPDZrmb: + case X86::VPERMPDZrmbkz: + case X86::VPERMPDZrmkz: + case X86::VPERMPDZrr: + case X86::VPERMPDZrrkz: + return Subtarget.hasPERMFalseDeps(); + case X86::VRANGEPDZ128rmbi: + case X86::VRANGEPDZ128rmbikz: + case X86::VRANGEPDZ128rmi: + case X86::VRANGEPDZ128rmikz: + case X86::VRANGEPDZ128rri: + case X86::VRANGEPDZ128rrikz: + case X86::VRANGEPDZ256rmbi: + case X86::VRANGEPDZ256rmbikz: + case X86::VRANGEPDZ256rmi: + case X86::VRANGEPDZ256rmikz: + case X86::VRANGEPDZ256rri: + case X86::VRANGEPDZ256rrikz: + case X86::VRANGEPDZrmbi: + case X86::VRANGEPDZrmbikz: + case X86::VRANGEPDZrmi: + case X86::VRANGEPDZrmikz: + case X86::VRANGEPDZrri: + case X86::VRANGEPDZrrib: + case X86::VRANGEPDZrribkz: + case X86::VRANGEPDZrrikz: + case X86::VRANGEPSZ128rmbi: + case X86::VRANGEPSZ128rmbikz: + case X86::VRANGEPSZ128rmi: + case X86::VRANGEPSZ128rmikz: + case X86::VRANGEPSZ128rri: + case X86::VRANGEPSZ128rrikz: + case X86::VRANGEPSZ256rmbi: + case X86::VRANGEPSZ256rmbikz: + case X86::VRANGEPSZ256rmi: + case X86::VRANGEPSZ256rmikz: + case X86::VRANGEPSZ256rri: + case X86::VRANGEPSZ256rrikz: + case X86::VRANGEPSZrmbi: + case X86::VRANGEPSZrmbikz: + case X86::VRANGEPSZrmi: + case X86::VRANGEPSZrmikz: + case X86::VRANGEPSZrri: + case X86::VRANGEPSZrrib: + case X86::VRANGEPSZrribkz: + case X86::VRANGEPSZrrikz: + case X86::VRANGESDZrmi: + case X86::VRANGESDZrmikz: + case X86::VRANGESDZrri: + case X86::VRANGESDZrrib: + case X86::VRANGESDZrribkz: + case X86::VRANGESDZrrikz: + case X86::VRANGESSZrmi: + case X86::VRANGESSZrmikz: + case X86::VRANGESSZrri: + case X86::VRANGESSZrrib: + case X86::VRANGESSZrribkz: + case X86::VRANGESSZrrikz: + return Subtarget.hasRANGEFalseDeps(); + case X86::VGETMANTSSZrmi: + case X86::VGETMANTSSZrmikz: + case X86::VGETMANTSSZrri: + case X86::VGETMANTSSZrrib: + case X86::VGETMANTSSZrribkz: + case X86::VGETMANTSSZrrikz: + case X86::VGETMANTSDZrmi: + case X86::VGETMANTSDZrmikz: + case X86::VGETMANTSDZrri: + case X86::VGETMANTSDZrrib: + case X86::VGETMANTSDZrribkz: + case X86::VGETMANTSDZrrikz: + case X86::VGETMANTSHZrmi: + case X86::VGETMANTSHZrmikz: + case X86::VGETMANTSHZrri: + case X86::VGETMANTSHZrrib: + case X86::VGETMANTSHZrribkz: + case X86::VGETMANTSHZrrikz: + case X86::VGETMANTPSZ128rmbi: + case X86::VGETMANTPSZ128rmbikz: + case X86::VGETMANTPSZ128rmi: + case X86::VGETMANTPSZ128rmikz: + case X86::VGETMANTPSZ256rmbi: + case X86::VGETMANTPSZ256rmbikz: + case X86::VGETMANTPSZ256rmi: + case X86::VGETMANTPSZ256rmikz: + case X86::VGETMANTPSZrmbi: + case X86::VGETMANTPSZrmbikz: + case X86::VGETMANTPSZrmi: + case X86::VGETMANTPSZrmikz: + case X86::VGETMANTPDZ128rmbi: + case X86::VGETMANTPDZ128rmbikz: + case X86::VGETMANTPDZ128rmi: + case X86::VGETMANTPDZ128rmikz: + case X86::VGETMANTPDZ256rmbi: + case X86::VGETMANTPDZ256rmbikz: + case X86::VGETMANTPDZ256rmi: + case X86::VGETMANTPDZ256rmikz: + case X86::VGETMANTPDZrmbi: + case X86::VGETMANTPDZrmbikz: + case X86::VGETMANTPDZrmi: + case X86::VGETMANTPDZrmikz: + return Subtarget.hasGETMANTFalseDeps(); + case X86::VPMULLQZ128rm: + case X86::VPMULLQZ128rmb: + case X86::VPMULLQZ128rmbkz: + case X86::VPMULLQZ128rmkz: + case X86::VPMULLQZ128rr: + case X86::VPMULLQZ128rrkz: + case X86::VPMULLQZ256rm: + case X86::VPMULLQZ256rmb: + case X86::VPMULLQZ256rmbkz: + case X86::VPMULLQZ256rmkz: + case X86::VPMULLQZ256rr: + case X86::VPMULLQZ256rrkz: + case X86::VPMULLQZrm: + case X86::VPMULLQZrmb: + case X86::VPMULLQZrmbkz: + case X86::VPMULLQZrmkz: + case X86::VPMULLQZrr: + case 
X86::VPMULLQZrrkz: + return Subtarget.hasMULLQFalseDeps(); // GPR case X86::POPCNT32rm: case X86::POPCNT32rr: @@ -5591,6 +5784,28 @@ void X86InstrInfo::breakPartialRegDependency( .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR128XRegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Since vxorps requires AVX512DQ, vpxord should be the best choice. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256XRegClass.contains(Reg) || + X86::VR512RegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Use vpxord to clear the full ymm/zmm register. + // It wants to read and write the xmm sub-register. + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. @@ -6413,6 +6628,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: Alignment = Align(2); break; @@ -6451,6 +6667,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: @@ -6490,7 +6707,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); - else if (Opc == X86::AVX512_FsFLD0SH) + else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH) Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), @@ -7170,7 +7387,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, // ENDBR instructions should not be scheduled around. unsigned Opcode = MI.getOpcode(); if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 || - Opcode == X86::LDTILECFG) + Opcode == X86::PLDTILECFGV) return true; return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); @@ -9298,12 +9515,10 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. 
unsigned CFICount = 0; - MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); - for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); - Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { - if (MBBI->isCFIInstruction()) + for (auto &I : make_range(RepeatedSequenceLocs[0].front(), + std::next(RepeatedSequenceLocs[0].back()))) { + if (I.isCFIInstruction()) CFICount++; - MBBI++; } // We compare the number of found CFI Instructions to the number of CFI @@ -9440,7 +9655,7 @@ MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const { + outliner::Candidate &C) const { // Is it a tail call? if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 33ce55bbdb2b..4943d2152fd2 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -40,13 +40,21 @@ std::pair getX86ConditionCode(CmpInst::Predicate Predicate); /// Return a cmov opcode for the given register size in bytes, and operand type. unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false); -// Turn jCC instruction into condition code. +/// Return the source operand # for condition code by \p MCID. If the +/// instruction doesn't have a condition code, return -1. +int getCondSrcNoFromDesc(const MCInstrDesc &MCID); + +/// Return the condition code of the instruction. If the instruction doesn't +/// have a condition code, return X86::COND_INVALID. +CondCode getCondFromMI(const MachineInstr &MI); + +// Turn JCC instruction into condition code. CondCode getCondFromBranch(const MachineInstr &MI); -// Turn setCC instruction into condition code. +// Turn SETCC instruction into condition code. CondCode getCondFromSETCC(const MachineInstr &MI); -// Turn CMov instruction into condition code. +// Turn CMOV instruction into condition code. 
CondCode getCondFromCMov(const MachineInstr &MI); /// GetOppositeBranchCondition - Return the inverse of the specified cond, @@ -552,8 +560,10 @@ public: MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const override; + outliner::Candidate &C) const override; + bool verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const override; #define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index fee9939b8dfc..7f6ef3479d40 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -388,17 +388,19 @@ def X86AbsMemAsmOperand : AsmOperandClass { } class X86MemOperand : Operand { + AsmOperandClass parserMatchClass = X86MemAsmOperand, + int size = 0> : Operand { let PrintMethod = printMethod; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = parserMatchClass; let OperandType = "OPERAND_MEMORY"; + int Size = size; } // Gather mem operands class X86VMemOperand - : X86MemOperand { + AsmOperandClass parserMatchClass, int size = 0> + : X86MemOperand { let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); } @@ -413,48 +415,45 @@ def opaquemem : X86MemOperand<"printMemReference">; def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>; -def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>; -def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; -def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; -def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; -def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; -def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; -def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; -def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; -def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; -def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; -def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; -def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; -def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; -def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8>; +def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>; +def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>; +def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>; +def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>; +def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>; +def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>; +def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>; +def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>; +def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>; +def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand, 80>; +def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>; +def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>; +def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>; // Gather mem operands -def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : 
X86VMemOperand; -def vy256mem : X86VMemOperand; - -def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : X86VMemOperand; -def vz512mem : X86VMemOperand; +def vx64mem : X86VMemOperand; +def vx128mem : X86VMemOperand; +def vx256mem : X86VMemOperand; +def vy128mem : X86VMemOperand; +def vy256mem : X86VMemOperand; + +def vx64xmem : X86VMemOperand; +def vx128xmem : X86VMemOperand; +def vx256xmem : X86VMemOperand; +def vy128xmem : X86VMemOperand; +def vy256xmem : X86VMemOperand; +def vy512xmem : X86VMemOperand; +def vz256mem : X86VMemOperand; +def vz512mem : X86VMemOperand; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead // of a plain GPR, so that it doesn't potentially require a REX prefix. def ptr_rc_norex : PointerLikeRegClass<2>; def ptr_rc_norex_nosp : PointerLikeRegClass<3>; -def i8mem_NOREX : Operand { - let PrintMethod = "printbytemem"; +def i8mem_NOREX : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8> { let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem8AsmOperand; - let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. @@ -840,11 +839,11 @@ def VK16Pair : RegisterOperand { // Define X86-specific addressing mode. def addr : ComplexPattern; def lea32addr : ComplexPattern; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. def lea64_32addr : ComplexPattern; @@ -855,7 +854,7 @@ def tls32baseaddr : ComplexPattern; def lea64addr : ComplexPattern; def tls64addr : ComplexPattern; -def HasCMov : Predicate<"Subtarget->hasCMov()">; -def NoCMov : Predicate<"!Subtarget->hasCMov()">; +def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; +def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; -def Has3DNow : Predicate<"Subtarget->has3DNow()">; -def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def Has3DNow : Predicate<"Subtarget->hasThreeDNow()">; +def Has3DNowA : Predicate<"Subtarget->hasThreeDNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; @@ -981,8 +980,8 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; def HasRDPID : Predicate<"Subtarget->hasRDPID()">; def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; -def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; -def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def HasCX8 : Predicate<"Subtarget->hasCX8()">; +def HasCX16 : Predicate<"Subtarget->hasCX16()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; def HasKL : Predicate<"Subtarget->hasKL()">; @@ -996,25 +995,25 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, - AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">; + AssemblerPredicate<(all_of (not Is64Bit)), "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, - AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">; + AssemblerPredicate<(all_of Is64Bit), "64-bit mode">; def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; def NotLP64 : 
Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, - AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">; + AssemblerPredicate<(all_of Is16Bit), "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, - AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">; + AssemblerPredicate<(all_of (not Is16Bit)), "Not 16-bit mode">; def In32BitMode : Predicate<"Subtarget->is32Bit()">, - AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">; + AssemblerPredicate<(all_of Is32Bit), "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" "Subtarget->getFrameLowering()->hasFP(*MF)"> { let RecomputePerFunction = 1; } -def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; -def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsPS : Predicate<"Subtarget->isTargetPS()">; +def NotPS : Predicate<"!Subtarget->isTargetPS()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; @@ -2229,13 +2228,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), - "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>; + "cmpxchg8b\t$dst", []>, TB, Requires<[HasCX8]>; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in // NOTE: In64BitMode check needed for the AssemblerPredicate. def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", []>, - TB, Requires<[HasCmpxchg16b,In64BitMode]>; + TB, Requires<[HasCX16,In64BitMode]>; } // SchedRW, mayLoad, mayStore, hasSideEffects @@ -2851,7 +2850,7 @@ let SchedRW = [WriteSystem] in { def TPAUSE : I<0xAE, MRM6r, (outs), (ins GR32orGR64:$src), "tpause\t$src", [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>, - PD, Requires<[HasWAITPKG]>, NotMemoryFoldable; + PD, Requires<[HasWAITPKG]>; } } // SchedRW @@ -2939,7 +2938,7 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; let SchedRW = [WriteSystem] in { let Uses = [EAX, EDX] in def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), - "invlpgb}", []>, + "invlpgb", []>, PS, Requires<[Not64BitMode]>; let Uses = [RAX, EDX] in def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), @@ -3124,7 +3123,7 @@ def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), let Predicates = [HasCLWB], SchedRW = [WriteLoad] in def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", - [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable; + [(int_x86_clwb addr:$src)]>, PD; let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index aeecc25ddea2..4196aff240c4 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -211,10 +211,10 @@ def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, +def MMX_MOVD64from64mr : MMXRI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, - Sched<[SchedWriteVecMoveLS.MMX.MR]>; + 
Sched<[SchedWriteVecMoveLS.MMX.MR]>, NotMemoryFoldable; let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in { let canFoldAsLoad = 1 in diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 035f139e6f33..06cb280e860a 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -112,6 +112,8 @@ multiclass sse12_fp_packed_logical_rm opc, RegisterClass RC, Domain d, // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "", + [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>; def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", @@ -3471,9 +3473,9 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, +defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, +defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, SchedWriteVecIMul, 1, NoVLX>; @@ -3965,6 +3967,20 @@ defm PINSRW : sse2_pinsrw, PD; } // ExeDomain = SSEPackedInt +// Always select FP16 instructions if available. +let Predicates = [UseSSE2], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + +let Predicates = [HasAVX, NoBWI] in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Mask Creation //===---------------------------------------------------------------------===// @@ -3997,7 +4013,10 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), //===---------------------------------------------------------------------===// let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { -let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in +// As VEX does not have separate instruction contexts for address size +// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict. +// Prefer VMASKMODDQU64. 
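+// (The EDI-based form remains reachable in 64-bit mode: the X86MCInstLower
+// change later in this patch sets X86::IP_HAS_AD_SIZE on MASKMOVDQU and
+// VMASKMOVDQU there, i.e. emits a 0x67 address-size override prefix, rather
+// than keeping a separate AdSize32 opcode in the decode tables.)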
+let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", @@ -4008,32 +4027,16 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, - VEX, VEX_WIG, AdSize64; -let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in -def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs), - (ins VR128:$src, VR128:$mask), "", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, - VEX, VEX_WIG, AdSize32 { - let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}"; - let AsmVariantName = "NonParsable"; -} + VEX, VEX_WIG; -let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in +let Uses = [EDI], Predicates = [UseSSE2] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, - AdSize64; -let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in -def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), - "addr32 maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, - AdSize32 { - let AsmVariantName = "NonParsable"; -} + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; } // ExeDomain = SSEPackedInt @@ -5206,6 +5209,12 @@ let Predicates = [HasAVX, NoBWI] in defm PEXTRW : SS41I_extract16<0x15, "pextrw">; +let Predicates = [UseSSE41] in + def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + +let Predicates = [HasAVX, NoBWI] in + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32 opc, string OpcodeStr> { @@ -7588,6 +7597,21 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { (VPBROADCASTWYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit))))>; + + def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWYrm addr:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))), + (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>; + def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))), + (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>; } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index b4dd99d08a62..3a653a56e534 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -25,18 +25,18 @@ let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in { def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2), - "ud1{w} {$src2, $src1|$src1, $src2}", []>, 
TB, OpSize16; + "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2), - "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2), - "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; + "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB; def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2), - "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16; + "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2), - "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2), - "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; + "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } let isTerminator = 1 in @@ -71,9 +71,9 @@ def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB, } // SchedRW def : Pat<(debugtrap), - (INT3)>, Requires<[NotPS4]>; + (INT3)>, Requires<[NotPS]>; def : Pat<(debugtrap), - (INT (i8 0x41))>, Requires<[IsPS4]>; + (INT (i8 0x41))>, Requires<[IsPS]>; //===----------------------------------------------------------------------===// // Input/Output Instructions. diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td index 28563eeb4484..7671eb4676ee 100644 --- a/llvm/lib/Target/X86/X86InstrTSX.td +++ b/llvm/lib/Target/X86/X86InstrTSX.td @@ -51,6 +51,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), // HLE prefixes let SchedRW = [WriteSystem] in { +// XACQUIRE and XRELEASE reuse REPNE and REP respectively. +// For now, just prefer the REP versions. let isAsmParserOnly = 1 in { def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>; def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>; diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index 2429aa113fb1..e6ecbb652100 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -17,6 +17,8 @@ let Predicates = [NoAVX512] in { // A vector extract of the first f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>; def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), @@ -34,8 +36,8 @@ let Predicates = [HasAVX512] in { } let Predicates = [NoVLX] in { - def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), - (COPY_TO_REGCLASS FR16X:$src, VR128)>; + def : Pat<(v8f16 (scalar_to_vector FR16:$src)), + (COPY_TO_REGCLASS FR16:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. 
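The UD1 fixes above restore the conventional '\t' between mnemonic and operands. For reference, a minimal stand-alone sketch of how such multi-variant asm strings are rendered, assuming only the TableGen convention that "{a|b}" holds the AT&T spelling before '|' and the Intel spelling after it, and that a group with fewer alternatives than the requested variant renders as empty (which is how "ud1{w}" drops its width suffix in Intel syntax). The helper name is ours, not an LLVM API:

#include <cstdio>
#include <string>

// Render one syntax variant of a TableGen-style asm string. Assumes the
// string is well formed (every '{' has a matching '}').
static std::string renderAsmVariant(const std::string &S, unsigned Variant) {
  std::string Out;
  for (size_t I = 0; I < S.size(); ++I) {
    if (S[I] != '{') { Out += S[I]; continue; }
    size_t End = S.find('}', I);
    std::string Group = S.substr(I + 1, End - I - 1);
    // Alternatives are separated by '|'; a missing alternative is empty.
    size_t Pos = 0;
    for (unsigned V = 0; ; ++V) {
      size_t Bar = Group.find('|', Pos);
      if (V == Variant) {
        Out += Group.substr(Pos, Bar == std::string::npos ? std::string::npos
                                                          : Bar - Pos);
        break;
      }
      if (Bar == std::string::npos) break; // fewer alternatives than Variant
      Pos = Bar + 1;
    }
    I = End; // resume after the closing '}'
  }
  return Out;
}

int main() {
  const std::string S = "ud1{w}\t{$src2, $src1|$src1, $src2}";
  std::printf("AT&T : %s\n", renderAsmVariant(S, 0).c_str()); // ud1w  $src2, $src1
  std::printf("Intel: %s\n", renderAsmVariant(S, 1).c_str()); // ud1   $src1, $src2
}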
def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index a5976b7d2d74..d89e481f4522 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -13,11 +13,11 @@ multiclass xop2op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP, XOP, Sched<[SchedWritePHAdd.XMM]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWriteVecALU.XMM]>; def rm : IXOP, XOP, - Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>; + Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } let ExeDomain = SSEPackedInt in { diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp index 28d57ca9ae3c..ff701159b95e 100644 --- a/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -21,7 +21,6 @@ #include "X86TargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -31,6 +30,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" @@ -179,6 +179,8 @@ X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const { return &X86::GR64RegClass; } if (RB.getID() == X86::VECRRegBankID) { + if (Ty.getSizeInBits() == 16) + return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; if (Ty.getSizeInBits() == 32) return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) @@ -516,7 +518,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, // is already on the instruction we're mutating, and thus we don't need to // make any changes. So long as we select an opcode which is capable of // loading or storing the appropriate size atomically, the rest of the - // backend is required to respect the MMO state. + // backend is required to respect the MMO state. 
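The bail-out that follows this comment keys off the memory operand's atomic ordering. A minimal model of that check, assuming a simplified mirror of llvm::AtomicOrdering (the real MachineMemOperand also tracks a separate failure ordering):

// Simplified mirror of llvm::AtomicOrdering; illustration only, the real
// enum lives in llvm/Support/AtomicOrdering.h.
enum class AtomicOrdering {
  NotAtomic, Unordered, Monotonic, Acquire, Release,
  AcquireRelease, SequentiallyConsistent
};

// Unordered accesses need no fences: a naturally aligned MOV of the right
// width is already single-copy atomic on x86, so plain load/store selection
// is sound for them. Anything stronger is rejected (the selector returns
// false below) and left to the fallback path.
constexpr bool selectableAsPlainLoadStore(AtomicOrdering O) {
  return O == AtomicOrdering::NotAtomic || O == AtomicOrdering::Unordered;
}

static_assert(selectableAsPlainLoadStore(AtomicOrdering::Unordered), "");
static_assert(!selectableAsPlainLoadStore(AtomicOrdering::Acquire), "");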
if (!MemOp.isUnordered()) { LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n"); return false; @@ -537,12 +539,12 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); if (Opc == TargetOpcode::G_LOAD) { - I.RemoveOperand(1); + I.removeOperand(1); addFullAddress(MIB, AM); } else { // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL) - I.RemoveOperand(1); - I.RemoveOperand(0); + I.removeOperand(1); + I.removeOperand(0); addFullAddress(MIB, AM).addUse(DefReg); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -625,7 +627,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); - I.RemoveOperand(1); + I.removeOperand(1); addFullAddress(MIB, AM); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -1412,7 +1414,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, - MF.getDataLayout().getPointerSize(), Alignment); + LLT::pointer(0, MF.getDataLayout().getPointerSizeInBits()), Alignment); LoadInst = addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 1edec96bbec3..3c8be95b43e3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -371,8 +371,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -818,8 +818,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1281,8 +1281,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, 
ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 4710e524931c..23976fb1a142 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -558,7 +558,7 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( } // Find and eliminate gadget edges that have been mitigated. - int MitigatedGadgets = 0, RemainingGadgets = 0; + int RemainingGadgets = 0; NodeSet ReachableNodes{G}; for (const Node &RootN : G.nodes()) { if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) @@ -586,7 +586,6 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( // This gadget's sink is reachable ++RemainingGadgets; } else { // This gadget's sink is unreachable, and therefore mitigated - ++MitigatedGadgets; ElimEdges.insert(E); } } diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 6b564a0356a6..70964b352b8c 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 6206d8efb3d0..540182cb7911 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,6 +74,24 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic(m_Value())); } +static bool isAMXIntrinsic(Value *I) { + auto *II = dyn_cast(I); + if (!II) + return false; + if (isAMXCast(II)) + return false; + // Check if return type or parameter is x86_amx. If it is x86_amx + // the intrinsic must be x86 amx intrinsics. + if (II->getType()->isX86_AMXTy()) + return true; + for (Value *V : II->args()) { + if (V->getType()->isX86_AMXTy()) + return true; + } + + return false; +} + static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, Type *Ty) { Function &F = *BB->getParent(); @@ -162,6 +180,36 @@ static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { return std::make_pair(Row, Col); } +static std::pair getShape(PHINode *Phi) { + Use &U = *(Phi->use_begin()); + unsigned OpNo = U.getOperandNo(); + User *V = U.getUser(); + // TODO We don't traverse all users. To make the algorithm simple, here we + // just traverse the first user. If we can find shape, then return the shape, + // otherwise just return nullptr and the optimization for undef/zero will be + // abandoned. 
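+  // For example, given
+  //   %vec = phi <256 x i32> [ zeroinitializer, %bb0 ], [ %v, %bb1 ]
+  //   %t   = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %vec)
+  //   call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ..., x86_amx %t)
+  // the walk below steps from the phi's first user through the cast to the
+  // tilestored64 intrinsic and recovers the shape (%r, %c) from its operands.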
+ while (V) { + if (isAMXCast(dyn_cast(V))) { + if (V->use_empty()) + break; + Use &U = *(V->use_begin()); + OpNo = U.getOperandNo(); + V = U.getUser(); + } else if (isAMXIntrinsic(V)) { + return getShape(cast(V), OpNo); + } else if (isa(V)) { + if (V->use_empty()) + break; + Use &U = *(V->use_begin()); + V = U.getUser(); + } else { + break; + } + } + + return std::make_pair(nullptr, nullptr); +} + namespace { class X86LowerAMXType { Function &Func; @@ -655,6 +703,9 @@ class X86LowerAMXCast { public: X86LowerAMXCast(Function &F) : Func(F) {} + void combineCastStore(IntrinsicInst *Cast, StoreInst *ST); + void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); + bool combineLdSt(SmallVectorImpl &Casts); bool combineAMXcast(TargetLibraryInfo *TLI); bool transformAMXCast(IntrinsicInst *AMXCast); bool transformAllAMXCast(); @@ -720,11 +771,33 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( OldPhiNodes.insert(PN); while (!PhiWorklist.empty()) { auto *OldPN = PhiWorklist.pop_back_val(); - for (Value *IncValue : OldPN->incoming_values()) { + for (unsigned I = 0; I < OldPN->getNumOperands(); ++I) { + Value *IncValue = OldPN->getIncomingValue(I); // TODO: currently, We ignore cases where it is a const. In the future, we // might support const. - if (isa(IncValue)) - return false; + if (isa(IncValue)) { + auto *IncConst = dyn_cast(IncValue); + if (!isa(IncValue) && !IncConst->isZeroValue()) + return false; + Value *Row = nullptr, *Col = nullptr; + std::tie(Row, Col) = getShape(OldPN); + // TODO: If it is not constant the Row and Col must domoniate tilezero + // that we are going to create. + if (!Row || !Col || !isa(Row) || !isa(Col)) + return false; + // Create tilezero at the end of incoming block. + auto *Block = OldPN->getIncomingBlock(I); + BasicBlock::iterator Iter = Block->getTerminator()->getIterator(); + Instruction *NewInst = Builder.CreateIntrinsic( + Intrinsic::x86_tilezero_internal, None, {Row, Col}); + NewInst->moveBefore(&*Iter); + NewInst = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {IncValue->getType()}, {NewInst}); + NewInst->moveBefore(&*Iter); + // Replace InValue with new Value. + OldPN->setIncomingValue(I, NewInst); + IncValue = NewInst; + } if (auto *PNode = dyn_cast(IncValue)) { if (OldPhiNodes.insert(PNode)) @@ -838,6 +911,99 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } +// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) +// store <256 x i32> %43, <256 x i32>* %p, align 64 +// --> +// call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p, +// i64 64, x86_amx %42) +void X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { + Value *Tile = Cast->getOperand(0); + // TODO: If it is cast intrinsic or phi node, we can propagate the + // shape information through def-use chain. + if (!isAMXIntrinsic(Tile)) + return; + auto *II = cast(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); + // Use the maximum column as stride. It must be the same with load + // stride. 
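+  // (A tile row is at most 64 bytes, so a constant 64-byte stride is always
+  // large enough, and it matches the stride combineLoadCast uses below.)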
+ Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy()); + std::array Args = {Row, Col, I8Ptr, Stride, Tile}; + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); +} + +// %65 = load <256 x i32>, <256 x i32>* %p, align 64 +// %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65) +// --> +// %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, +// i8* %p, i64 64) +void X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { + Value *Row = nullptr, *Col = nullptr; + Use &U = *(Cast->use_begin()); + unsigned OpNo = U.getOperandNo(); + auto *II = cast(U.getUser()); + // TODO: If it is cast intrinsic or phi node, we can propagate the + // shape information through def-use chain. + if (!isAMXIntrinsic(II)) + return; + std::tie(Row, Col) = getShape(II, OpNo); + IRBuilder<> Builder(LD); + // Use the maximun column as stride. + Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy()); + std::array Args = {Row, Col, I8Ptr, Stride}; + + Value *NewInst = + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args); + Cast->replaceAllUsesWith(NewInst); +} + +bool X86LowerAMXCast::combineLdSt(SmallVectorImpl &Casts) { + bool Change = false; + for (auto *Cast : Casts) { + auto *II = cast(Cast); + // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42) + // store <256 x i32> %43, <256 x i32>* %p, align 64 + // --> + // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p, + // i64 64, x86_amx %42) + if (II->getIntrinsicID() == Intrinsic::x86_cast_tile_to_vector) { + SmallVector DeadStores; + for (User *U : Cast->users()) { + StoreInst *Store = dyn_cast(U); + if (!Store) + continue; + combineCastStore(cast(Cast), Store); + DeadStores.push_back(Store); + Change = true; + } + for (auto *Store : DeadStores) + Store->eraseFromParent(); + } else { // x86_cast_vector_to_tile + SmallVector DeadLoads; + auto *Load = dyn_cast(Cast->getOperand(0)); + if (!Load || !Load->hasOneUse()) + continue; + // %65 = load <256 x i32>, <256 x i32>* %p, align 64 + // %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65) + // --> + // %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, + // i8* %p, i64 64) + combineLoadCast(cast(Cast), Load); + // Set the operand is null so that load instruction can be erased. + Cast->setOperand(0, nullptr); + Load->eraseFromParent(); + } + } + return Change; +} + bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) { bool Change = false; // Collect tile cast instruction. @@ -879,17 +1045,22 @@ bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) { Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector); Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile); + SmallVector LiveCasts; auto EraseInst = [&](SmallVectorImpl &Insts) { for (auto *Inst : Insts) { if (Inst->use_empty()) { Inst->eraseFromParent(); Change = true; + } else { + LiveCasts.push_back(Inst); } } }; EraseInst(Vec2TileInsts); EraseInst(Tile2VecInsts); + Change |= combineLdSt(LiveCasts); + EraseInst(LiveCasts); // Handle the A->B->A cast, and there is an intervening PHI node. 
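// For example:
//   %v  = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %t)    ; A->B
//   %p  = phi <256 x i32> [ %v, %bb1 ], [ %w, %bb2 ]
//   %t2 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %p)    ; B->A
// optimizeAMXCastFromPhi rewrites the phi to work on x86_amx directly, after
// which both casts become dead.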
for (BasicBlock &BB : Func) { @@ -947,6 +1118,10 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { // i64 60) // call void @llvm.x86.tilestored64.internal(i16 15, i16 60, // i8* %addr3, i64 60, x86_amx %2) + if (AMXCast->use_empty()) { + AMXCast->eraseFromParent(); + return true; + } Use &U = *(AMXCast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = dyn_cast(U.getUser()); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 9044f10ec630..b107de692365 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -501,7 +501,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { for (const MachineOperand &MO : MI->operands()) if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) - OutMI.addOperand(MaybeMCOp.getValue()); + OutMI.addOperand(*MaybeMCOp); // Handle a few special cases to eliminate operand modifiers. switch (OutMI.getOpcode()) { @@ -962,6 +962,12 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { // These are not truly commutable so hide them from the default case. break; + case X86::MASKMOVDQU: + case X86::VMASKMOVDQU: + if (AsmPrinter.getSubtarget().is64Bit()) + OutMI.setFlags(X86::IP_HAS_AD_SIZE); + break; + default: { // If the instruction is a commutable arithmetic instruction we might be // able to commute the operands to get a 2 byte VEX prefix. @@ -1311,7 +1317,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, E = FaultingMI.operands_end(); I != E; ++I) if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I)) - MI.addOperand(MaybeOperand.getValue()); + MI.addOperand(*MaybeOperand); OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); OutStreamer->emitInstruction(MI, getSubtargetInfo()); @@ -1347,11 +1353,12 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) { AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - std::string Name = AccessInfo.IsWrite ? "store" : "load"; - std::string Op = OrShadowOffset ? "or" : "add"; - std::string SymName = "__asan_check_" + Name + "_" + Op + "_" + - utostr(1ULL << AccessInfo.AccessSizeIndex) + "_" + - TM.getMCRegisterInfo()->getName(Reg.asMCReg()); + StringRef Name = AccessInfo.IsWrite ? "store" : "load"; + StringRef Op = OrShadowOffset ? 
"or" : "add"; + std::string SymName = ("__asan_check_" + Name + "_" + Op + "_" + + Twine(1ULL << AccessInfo.AccessSizeIndex) + "_" + + TM.getMCRegisterInfo()->getName(Reg.asMCReg())) + .str(); if (OrShadowOffset) report_fatal_error( "OrShadowOffset is not supported with optimized callbacks"); @@ -1375,7 +1382,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, MCI.setOpcode(Opcode); for (auto &MO : drop_begin(MI.operands(), 2)) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - MCI.addOperand(MaybeOperand.getValue()); + MCI.addOperand(*MaybeOperand); SmallString<256> Code; SmallVector Fixups; @@ -1751,7 +1758,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, Ret.setOpcode(OpCode); for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - Ret.addOperand(MaybeOperand.getValue()); + Ret.addOperand(*MaybeOperand); OutStreamer->emitInstruction(Ret, getSubtargetInfo()); emitX86Nops(*OutStreamer, 10, Subtarget); recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2); @@ -1790,7 +1797,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, OutStreamer->AddComment("TAILCALL"); for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - TC.addOperand(MaybeOperand.getValue()); + TC.addOperand(*MaybeOperand); OutStreamer->emitInstruction(TC, getSubtargetInfo()); } @@ -1985,34 +1992,34 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { // Otherwise, use the .seh_ directives for all other Windows platforms. switch (MI->getOpcode()) { case X86::SEH_PushReg: - OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIPushReg(MI->getOperand(0).getImm()); break; case X86::SEH_SaveReg: - OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISaveReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_SaveXMM: - OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISaveXMM(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_StackAlloc: - OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIAllocStack(MI->getOperand(0).getImm()); break; case X86::SEH_SetFrame: - OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISetFrame(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_PushFrame: - OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIPushFrame(MI->getOperand(0).getImm()); break; case X86::SEH_EndPrologue: - OutStreamer->EmitWinCFIEndProlog(); + OutStreamer->emitWinCFIEndProlog(); break; default: diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 05f846bfb219..2e88e01ce7fd 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -13,6 +13,13 @@ using namespace llvm; +MachineFunctionInfo *X86MachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} + void X86MachineFunctionInfo::anchor() { } void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 99d1a97380dd..99cc9f525b2c 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h 
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -119,7 +119,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { Optional SwiftAsyncContextFrameIdx; - ValueMap PreallocatedIds; + // Preallocated fields are only used during isel. + // FIXME: Can we find somewhere else to store these? + DenseMap PreallocatedIds; SmallVector PreallocatedStackSizes; SmallVector, 0> PreallocatedArgOffsets; @@ -132,6 +134,12 @@ public: X86MachineFunctionInfo() = default; explicit X86MachineFunctionInfo(MachineFunction &MF) {} + explicit X86MachineFunctionInfo(const X86MachineFunctionInfo &) = default; + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index 425054cfdd92..aa6e8645e092 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index e92b1b002bb0..bb59cee8badb 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -37,21 +37,20 @@ STATISTIC(NumBBsPadded, "Number of basic blocks padded"); namespace { struct VisitedBBInfo { // HasReturn - Whether the BB contains a return instruction - bool HasReturn; + bool HasReturn = false; // Cycles - Number of cycles until return if HasReturn is true, otherwise // number of cycles until end of the BB - unsigned int Cycles; + unsigned int Cycles = 0; - VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo() = default; VisitedBBInfo(bool HasReturn, unsigned int Cycles) : HasReturn(HasReturn), Cycles(Cycles) {} }; struct PadShortFunc : public MachineFunctionPass { static char ID; - PadShortFunc() : MachineFunctionPass(ID) - , Threshold(4) {} + PadShortFunc() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -82,7 +81,7 @@ namespace { MachineBasicBlock::iterator &MBBI, unsigned int NOOPsToAdd); - const unsigned int Threshold; + const unsigned int Threshold = 4; // ReturnBBs - Maps basic blocks that return to the minimum number of // cycles until the return, starting from the entry block. diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 4342ac089cae..7761f7323358 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -19,8 +19,10 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -220,16 +222,21 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - // Operand should be a select. - auto *SI = dyn_cast(Op); - if (!SI) - return false; - - // Select needs to implement absolute value. 
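trySADReplacement rewrites an absolute-difference reduction into psadbw, and the hunk below extends it to match the llvm.abs intrinsic as well as the select form. For reference, a scalar model of one 64-bit PSADBW lane, assuming nothing beyond the instruction's documented semantics (the helper name is ours):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// One 64-bit lane of PSADBW: sum of absolute differences of 8 byte pairs,
// zero-extended into the lane (the wider forms repeat this per 64-bit lane).
static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint32_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += std::abs(int(A[I]) - int(B[I]));
  return Sum; // at most 8 * 255 = 2040, so it always fits in 16 bits
}

int main() {
  const uint8_t A[8] = {0, 10, 20, 255, 1, 2, 3, 4};
  const uint8_t B[8] = {5, 10, 30, 0, 0, 0, 0, 0};
  std::printf("%llu\n", (unsigned long long)psadbwLane(A, B)); // 280
}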
- Value *LHS, *RHS; - auto SPR = matchSelectPattern(SI, LHS, RHS); - if (SPR.Flavor != SPF_ABS) - return false; + Value *LHS; + if (match(Op, PatternMatch::m_Intrinsic())) { + LHS = Op->getOperand(0); + } else { + // Operand should be a select. + auto *SI = dyn_cast(Op); + if (!SI) + return false; + + Value *RHS; + // Select needs to implement absolute value. + auto SPR = matchSelectPattern(SI, LHS, RHS); + if (SPR.Flavor != SPF_ABS) + return false; + } // Need a subtract of two values. auto *Sub = dyn_cast(LHS); @@ -253,7 +260,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!Op0 || !Op1) return false; - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Op); auto *OpTy = cast(Op->getType()); unsigned NumElts = OpTy->getNumElements(); @@ -271,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID); + Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. @@ -336,8 +343,8 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); } - SI->replaceAllUsesWith(Ops[0]); - SI->eraseFromParent(); + Op->replaceAllUsesWith(Ops[0]); + Op->eraseFromParent(); return true; } diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp index d9c6d08ada73..cd0d448238a6 100644 --- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp +++ b/llvm/lib/Target/X86/X86PreAMXConfig.cpp @@ -91,16 +91,17 @@ static bool brokenVolatile(Instruction *I) { namespace { class X86PreAMXConfig { + using PosAndShapesMap = MapVector>; + Function &F; public: X86PreAMXConfig(Function &Func) : F(Func) {} bool preTileConfig(); - bool addTileConfig(Instruction *ModelStart, SmallVector &Shapes); - bool findConfigShapes( - DenseMap> &PosAndShapes); + void addTileConfig(Instruction *ModelStart, SmallVector &Shapes); + bool findConfigShapes(PosAndShapesMap &PosAndShapes); bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector &Shapes); - bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos, + void preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, SmallVector &Shapes); BasicBlock::iterator getShapesAndConfigPosEnd(BasicBlock::iterator Iter, @@ -149,10 +150,9 @@ public: // %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) // call void @llvm.x86.tilestored64.internal(... td) area // -------------------------------------------------------------------------- -bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, +void X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, SmallVector &Shapes) { - bool Write = false; - LLVMContext &Ctx = Pos->getParent()->getContext(); + LLVMContext &Ctx = Builder.getContext(); Type *I8Ty = Type::getInt8Ty(Ctx); Type *I16Ty = Type::getInt16Ty(Ctx); @@ -160,30 +160,27 @@ bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, // other value in the future. 
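// (Layout of the 64-byte tile-configuration block filled in here: byte 0 is
// the palette id, bytes 16..47 hold one 16-bit colsb entry per tile register,
// and bytes 48..63 hold one row-count byte per tile register; hence the
// 16 + I * 2 and 48 + I offsets below.)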
Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0); Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1); - Value *PalettePos = - GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos); - new StoreInst(PaletteValue, PalettePos, Pos); + Value *PalettePos = Builder.CreateGEP(I8Ty, I8Ptr, PaletteOffset); + Builder.CreateStore(PaletteValue, PalettePos); for (int I = 0, E = Shapes.size() / 2; I < E; I++) { Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I); Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2); const std::string ShapeName = "amx.tmm." + itostr(I); - Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset, - ShapeName + ".shape.row", Pos); - Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos); - ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0), - ShapeName + ".shape.col", Pos); + Value *RowPos = Builder.CreateGEP(I8Ty, I8Ptr, RowOffset, + ShapeName + ".shape.row"); + Value *ColPos = Builder.CreateGEP(I8Ty, I8Ptr, ColOffset); + ColPos = Builder.CreateBitCast(ColPos, PointerType::get(I16Ty, 0), + ShapeName + ".shape.col"); Value *Row = Shapes[I * 2]; Value *Col = Shapes[I * 2 + 1]; - Row = new TruncInst(Row, I8Ty, "", Pos); - new StoreInst(Row, RowPos, Pos); - new StoreInst(Col, ColPos, Pos); - Write = true; + Row = Builder.CreateTrunc(Row, I8Ty); + Builder.CreateStore(Row, RowPos); + Builder.CreateStore(Col, ColPos); } - return Write; } -bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart, +void X86PreAMXConfig::addTileConfig(Instruction *ModelStart, SmallVector &Shapes) { Module *M = F.getParent(); IRBuilder<> Builder(ModelStart); @@ -198,17 +195,11 @@ bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart, Addr->setAlignment(Alignment); Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); - std::array Args = {I8Ptr}; - Instruction *Cfg = - Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args); - - Value *Val0 = Constant::getNullValue(V512Ty); - Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg); - assert(Init0 && "Not Zero initilizate the cfg mem!"); + Builder.CreateAlignedStore(Constant::getNullValue(V512Ty), Addr, Alignment); - preWriteTileCfg(I8Ptr, Cfg, Shapes); + preWriteTileCfg(I8Ptr, Builder, Shapes); - return Init0; + Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, {I8Ptr}); } // Todo: We may need to handle "more than one store" case in the future. @@ -315,8 +306,7 @@ X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter, // %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n) // call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n) // -------------------------------------------------------------------------- -bool X86PreAMXConfig::findConfigShapes( - DenseMap> &PosAndShapes) { +bool X86PreAMXConfig::findConfigShapes(PosAndShapesMap &PosAndShapes) { bool Find = false; for (BasicBlock &BB : F) { for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { @@ -365,7 +355,7 @@ bool X86PreAMXConfig::findConfigShapes( // call void @llvm.x86.tilestored64.internal(... 
td) area // -------------------------------------------------------------------------- bool X86PreAMXConfig::preTileConfig() { - DenseMap> PosAndShapes; + PosAndShapesMap PosAndShapes; bool NeedCfg = findConfigShapes(PosAndShapes); if (!NeedCfg) return false; diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 5d21f8666ec6..479db8585ca0 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -40,10 +41,15 @@ using namespace llvm; #define DEBUG_TYPE "tile-pre-config" -#define REPORT_CONFIG_FAIL \ - report_fatal_error( \ - MF.getName() + \ - ": Failed to config tile register, please define the shape earlier"); + +static void emitErrorMsg(MachineFunction &MF) { + SmallString<32> Str; + Twine ErrorMsg = + MF.getName() + + ": Failed to config tile register, please define the shape earlier"; + LLVMContext &Context = MF.getMMI().getModule()->getContext(); + Context.emitError(ErrorMsg); +} namespace { @@ -302,12 +308,19 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { SmallVector WorkList; for (auto &I : ShapeBBs) { // TODO: We can hoist shapes across BBs here. - if (BBVisitedInfo[I.first].HasAMXRegLiveIn) - REPORT_CONFIG_FAIL + if (BBVisitedInfo[I.first].HasAMXRegLiveIn) { + // We are not able to config tile registers since the shape to config + // is not defined yet. Emit error message and continue. The function + // would not config tile registers. + emitErrorMsg(MF); + return false; + } if (BBVisitedInfo[I.first].FirstAMX && BBVisitedInfo[I.first].FirstAMX < I.second.back() && - !hoistShapesInBB(I.first, I.second)) - REPORT_CONFIG_FAIL + !hoistShapesInBB(I.first, I.second)) { + emitErrorMsg(MF); + return false; + } WorkList.push_back(I.first); } while (!WorkList.empty()) { @@ -356,7 +369,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { // multi insert. if (VisitedOrInserted.insert(I).second) { auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin(); - addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)), + addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::PLDTILECFGV)), SS); } } @@ -367,33 +380,27 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { MachineInstr *MI = &*MBB.begin(); if (ST.hasAVX512()) { Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm) - .addReg(Zmm, RegState::Undef) - .addReg(Zmm, RegState::Undef); + BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS) .addReg(Zmm); } else if (ST.hasAVX2()) { Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm) - .addReg(Ymm, RegState::Undef) - .addReg(Ymm, RegState::Undef); + BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS) .addReg(Ymm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32) .addReg(Ymm); } else { assert(ST.hasSSE2() && "AMX should assume SSE2 enabled"); + unsigned StoreOpc = ST.hasAVX() ? 
X86::VMOVUPSmr : X86::MOVUPSmr; Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm) - .addReg(Xmm, RegState::Undef) - .addReg(Xmm, RegState::Undef); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS) - .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16) + BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS).addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 16) .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32) + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 32) .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48) + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 48) .addReg(Xmm); } // Fill in the palette first. diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp index 9c076d2d6769..c49fc458eab3 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp @@ -12,9 +12,9 @@ #include "X86RegisterBankInfo.h" #include "X86InstrInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL @@ -25,8 +25,7 @@ using namespace llvm; #define GET_TARGET_REGBANK_INFO_IMPL #include "X86GenRegisterBankInfo.def" -X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) - : X86GenRegisterBankInfo() { +X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) { // validate RegBank initialization. 
const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID); diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.h b/llvm/lib/Target/X86/X86RegisterBankInfo.h index d5afd2cae761..fca36a317b58 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.h +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "X86GenRegisterBank.inc" diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 130cb61cdde2..f2658f70434b 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -26,6 +26,8 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TileShapeInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -618,6 +620,66 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool X86RegisterInfo::isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const { + const X86Subtarget &ST = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + auto IsSubReg = [&](MCRegister RegA, MCRegister RegB) { + return TRI.isSuperOrSubRegisterEq(RegA, RegB); + }; + + if (!ST.is64Bit()) + return llvm::any_of( + SmallVector{X86::EAX, X86::ECX, X86::EDX}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }) || + (ST.hasMMX() && X86::VR64RegClass.contains(Reg)); + + CallingConv::ID CC = MF.getFunction().getCallingConv(); + + if (CC == CallingConv::X86_64_SysV && IsSubReg(X86::RAX, Reg)) + return true; + + if (llvm::any_of( + SmallVector{X86::RDX, X86::RCX, X86::R8, X86::R9}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + if (CC != CallingConv::Win64 && + llvm::any_of(SmallVector{X86::RDI, X86::RSI}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + if (ST.hasSSE1() && + llvm::any_of(SmallVector{X86::XMM0, X86::XMM1, X86::XMM2, + X86::XMM3, X86::XMM4, X86::XMM5, + X86::XMM6, X86::XMM7}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + return X86GenRegisterInfo::isArgumentRegister(MF, Reg); +} + +bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, + MCRegister PhysReg) const { + const X86Subtarget &ST = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + + // Stack pointer. + if (TRI.isSuperOrSubRegisterEq(X86::RSP, PhysReg)) + return true; + + // Don't use the frame pointer if it's being used. + const X86FrameLowering &TFI = *getFrameLowering(MF); + if (TFI.hasFP(MF) && TRI.isSuperOrSubRegisterEq(X86::RBP, PhysReg)) + return true; + + return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); +} + +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. 
This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 7fd10ddd1a15..6f4fb405d29f 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -115,6 +115,18 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const override; + /// isArgumentReg - Returns true if Reg can be used as an argument to a + /// function. + bool isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const override; + + /// Return true if it is tile register class. + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + + /// Returns true if PhysReg is a fixed register. + bool isFixedRegister(const MachineFunction &MF, + MCRegister PhysReg) const override; + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; bool hasBasePointer(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 1b704bcb8e08..6dc51e37d3c2 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -537,6 +537,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;} + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill @@ -599,7 +601,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; -def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;} // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], @@ -638,3 +640,14 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} + +//===----------------------------------------------------------------------===// +// Register categories. +// + +// The TILE and VK*PAIR registers may not be "fixed", but we don't want them +// anyway. 
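+// (These categories appear to drive the TableGen-generated
+// X86GenRegisterInfo::isFixedRegister / isArgumentRegister fallbacks that the
+// C++ overrides added above defer to.)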
+def FixedRegisters : RegisterCategory<[DEBUG_REG, CONTROL_REG, CCR, FPCCR, + DFCCR, TILE, VK1PAIR, VK2PAIR, VK4PAIR, + VK8PAIR, VK16PAIR]>; +def GeneralPurposeRegisters : RegisterCategory<[GR64, GR32, GR16, GR8]>; diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 8e317dc22bd6..e4b95cb0807f 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -814,12 +814,26 @@ def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { def: InstRW<[BWWriteResGroup34], (instregex "CLD")>; def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[BWWriteResGroup35], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def BWWriteResGroup36 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[BWWriteResGroup36], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def BWWriteResGroup36b : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[BWWriteResGroup36b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 1cd0b3379684..7b1a31d2a4df 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -1299,12 +1299,26 @@ def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { def: InstRW<[HWWriteResGroup58], (instregex "CLD")>; def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[HWWriteResGroup59], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def HWWriteResGroup60 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[HWWriteResGroup60], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def HWWriteResGroup60b : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[HWWriteResGroup60b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> { let Latency = 4; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 9fd986e34181..b66db7e7e73a 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -923,12 +923,26 @@ def ICXWriteResGroup43 : SchedWriteRes<[ICXPort237,ICXPort0156]> { def: InstRW<[ICXWriteResGroup43], (instrs MFENCE)>; def ICXWriteResGroup44 : SchedWriteRes<[ICXPort06,ICXPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[ICXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[ICXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def ICXWriteResGroup44b : 
SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> { + let Latency = 5; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[ICXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def ICXWriteResGroup44c : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> { + let Latency = 6; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[ICXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def ICXWriteResGroup45 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 7e619a3a8722..49858ca0a800 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -111,8 +111,17 @@ def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. def : WriteRes; -def : WriteRes { let Latency = 5; let NumMicroOps = 0; } + +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; // Arithmetic. defm : SBWriteResPair; @@ -678,13 +687,27 @@ def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> { } def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>; -def SBWriteResGroup23 : SchedWriteRes<[SBPort05]> { +def SBWriteResGroup23 : SchedWriteRes<[SBPort05,SBPort015]> { let Latency = 2; let NumMicroOps = 3; - let ResourceCycles = [3]; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup23], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SBWriteResGroup24 : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> { + let Latency = 3; + let NumMicroOps = 8; + let ResourceCycles = [1,1,4,2]; +} +def: InstRW<[SBWriteResGroup24], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SBWriteResGroup24b : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> { + let Latency = 4; + let NumMicroOps = 8; + let ResourceCycles = [1,1,4,2]; } -def: InstRW<[SBWriteResGroup23], (instregex "RCL(8|16|32|64)r1", - "RCR(8|16|32|64)r1")>; +def: InstRW<[SBWriteResGroup24b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> { let Latency = 7; @@ -727,8 +750,8 @@ def SBWriteResGroup76 : SchedWriteRes<[SBPort05]> { let NumMicroOps = 8; let ResourceCycles = [8]; } -def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)r(i|CL)", - "RCR(8|16|32|64)r(i|CL)")>; +def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)rCL", + "RCR(8|16|32|64)rCL")>; def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> { let Latency = 5; @@ -802,8 +825,7 @@ def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SBWriteResGroup48], (instrs MMX_MOVD64from64rm, - VBROADCASTSSrm)>; +def: InstRW<[SBWriteResGroup48], (instrs VBROADCASTSSrm)>; def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r", "(V?)MOV64toPQIrm", "(V?)MOVDDUPrm", diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 0a88bac5aa66..05364e3434e4 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -836,12 +836,26 @@ def 
SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> { def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>; def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[SKLWriteResGroup42], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SKLWriteResGroup42b : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKLWriteResGroup42b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SKLWriteResGroup42c : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKLWriteResGroup42c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { let Latency = 3; @@ -921,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67? + "MOVZX(16|32|64)rm(8|16)")>; def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let Latency = 5; @@ -979,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm, VPBROADCASTDrm, VPBROADCASTQrm)>; def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm", - "(V?)MOVSLDUPrm")>; + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index b28a18f0dcd7..b682b51c298a 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -905,12 +905,26 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> { def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>; def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[SKXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SKXWriteResGroup44b : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SKXWriteResGroup44c : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { let Latency = 3; @@ -1041,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71? 
+ "MOVZX(16|32|64)rm(8|16)")>; def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 5; @@ -1145,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { } def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm, VPBROADCASTDrm, - VPBROADCASTQrm, - VMOVSHDUPrm, - VMOVSLDUPrm, - MOVSHDUPrm, - MOVSLDUPrm)>; + VPBROADCASTQrm)>; +def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 4b2fa87a25b5..1e9fcf6cc8cf 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -840,8 +840,8 @@ def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JAL let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; let NumMicroOps = 63; } -def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32, - VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>; +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; /////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 52605c031617..de4e7dd3cb90 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -377,10 +377,8 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : X86WriteResPairUnsupported; -// FIXME: The below is closer to correct, but caused some perf regressions. -//defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : SLMWriteResPair; defm : SLMWriteResPair; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index fe0484afd227..aada3e0bd906 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -189,15 +189,6 @@ defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; defm : X86WriteRes; defm : X86WriteRes; @@ -227,12 +218,10 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -//defm : X86WriteRes; -//defm : X86WriteRes; // Bit counts. -defm : ZnWriteResPair; -defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; @@ -240,9 +229,8 @@ defm : ZnWriteResPair; // Treat misc copies as a move. 
def : InstRW<[WriteMove], (instrs COPY)>; -// BMI1 BEXTR/BLS, BMI2 BZHI +// BMI1 BEXTR, BMI2 BZHI defm : ZnWriteResPair; -//defm : ZnWriteResPair; defm : ZnWriteResPair; // IDIV @@ -271,13 +259,13 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; - defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -288,24 +276,24 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteResUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; @@ -346,8 +334,8 @@ defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -410,20 +398,23 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -448,7 +439,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -456,11 +447,6 @@ defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; -// Vector Shift Operations -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : X86WriteResPairUnsupported; - // Vector insert/extract operations. defm : ZnWriteResFpuPair; @@ -623,15 +609,14 @@ def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. 
-def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m16. def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let Latency = 8; } def : SchedAlias; - +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // r32. def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { let Latency = 3; @@ -639,14 +624,14 @@ def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m32. def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let Latency = 8; } def : SchedAlias; +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // r64. def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { @@ -656,8 +641,6 @@ def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m64. def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { @@ -665,6 +648,8 @@ def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let NumMicroOps = 2; } def : SchedAlias; +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // MULX // Numbers are based on the AMD SOG for Family 17h - Instruction Latencies. @@ -1101,12 +1086,11 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; // HADD, HSUB PS/PD // PHADD|PHSUB (S) W/D. -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // PCMPGTQ. def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>; @@ -1446,12 +1430,6 @@ def : InstRW<[ZnWriteSHA256RNDS2Ld], (instrs SHA256RNDS2rm)>; //-- Arithmetic instructions --// -// HADD, HSUB PS/PD -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; - // VDIVPS. // TODO - convert to ZnWriteResFpuPair // y,y,y. diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index 38908a987595..c47d235eab9b 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -195,7 +195,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; @@ -219,8 +219,8 @@ defm : X86WriteRes; defm : X86WriteRes; // Bit counts. 
-defm : Zn2WriteResPair; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; +defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; @@ -230,7 +230,7 @@ def : InstRW<[WriteMove], (instrs COPY)>; // BMI1 BEXTR, BMI2 BZHI defm : Zn2WriteResPair; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; // IDIV defm : Zn2WriteResPair; @@ -247,23 +247,17 @@ def Zn2WriteIMulH : WriteRes{ let Latency = 3; let NumMicroOps = 0; } - def : WriteRes{ let Latency = !add(Zn2WriteIMulH.Latency, Znver2Model.LoadLatency); let NumMicroOps = Zn2WriteIMulH.NumMicroOps; } - // Floating point operations defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -271,29 +265,34 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteResUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; @@ -332,8 +331,8 @@ defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; @@ -394,20 +393,23 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; @@ -440,11 +442,6 @@ defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; -// Vector Shift Operations -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : X86WriteResPairUnsupported; - 
 // Vector insert/extract operations.
 defm : Zn2WriteResFpuPair;
@@ -486,12 +483,6 @@ defm : Zn2WriteResFpuPair;
 def Zn2WriteMicrocoded : SchedWriteRes<[]> {
   let Latency = 100;
 }
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
 def : SchedAlias;
 def : SchedAlias;
@@ -1109,6 +1100,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
 //-- Arithmetic instructions --//
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+
 // PCMPGTQ.
 def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
 def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1479,6 +1478,7 @@ def : SchedAlias;
 // DPPS.
 // x,x,i / v,v,v,i.
+defm : Zn2WriteResPair;
 def : SchedAlias;
 // x,m,i / v,v,m,i.
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 5e59081c63b0..78a286ae5b28 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   const X86Subtarget &Subtarget =
@@ -67,40 +67,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
   if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
-    // Check to see if there is a specialized entry-point for memory zeroing.
-    ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
-
-    if (const char *bzeroName =
-            (ValC && ValC->isZero())
-                ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
-                : nullptr) {
-      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
-      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
-      TargetLowering::ArgListTy Args;
-      TargetLowering::ArgListEntry Entry;
-      Entry.Node = Dst;
-      Entry.Ty = IntPtrTy;
-      Args.push_back(Entry);
-      Entry.Node = Size;
-      Args.push_back(Entry);
-
-      TargetLowering::CallLoweringInfo CLI(DAG);
-      CLI.setDebugLoc(dl)
-          .setChain(Chain)
-          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                        DAG.getExternalSymbol(bzeroName, IntPtr),
-                        std::move(Args))
-          .setDiscardResult();
-
-      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-      return CallResult.second;
-    }
-
-    // Otherwise have the target-independent code call memset.
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
-  }
 
   uint64_t SizeVal = ConstantSize->getZExtValue();
   SDValue InFlag;
@@ -175,7 +143,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
         DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                     DAG.getConstant(Offset, dl, AddrVT)),
         Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-        isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+        isVolatile, AlwaysInline,
+        /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
   }
 
   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
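// The memset hunk above makes two changes: the hand-rolled bzero call
// lowering is dropped (returning SDValue() now delegates oversized or
// under-aligned memsets to the generic SelectionDAG expansion), and the
// memset emitted for the trailing BytesLeft remainder forwards the new
// AlwaysInline flag instead of hardcoding false. A minimal sketch of that
// chunk-plus-remainder split follows; expandMemset, emitWideStores, and
// emitTailMemset are illustrative stand-ins, not LLVM APIs.
#include <cstdint>
#include <cstring>

// Models the REP STOS-style bulk stores the X86 expansion emits.
static void emitWideStores(char *Dst, uint8_t Val, uint64_t Bytes) {
  std::memset(Dst, Val, Bytes);
}

// Models the follow-up memset for the tail; the AlwaysInline request must be
// forwarded here, which is what the last hunk above fixes.
static void emitTailMemset(char *Dst, uint8_t Val, uint64_t Bytes,
                           bool AlwaysInline) {
  (void)AlwaysInline; // forwarded, never hardcoded to false
  std::memset(Dst, Val, Bytes);
}

void expandMemset(char *Dst, uint8_t Val, uint64_t Size, bool AlwaysInline) {
  const uint64_t ChunkBytes = 8; // e.g. one REP STOSQ element
  uint64_t Bulk = Size / ChunkBytes * ChunkBytes;
  emitWideStores(Dst, Val, Bulk);
  if (uint64_t BytesLeft = Size - Bulk)
    emitTailMemset(Dst + Bulk, Val, BytesLeft, AlwaysInline);
}
// Forwarding matters because AlwaysInline is a promise that no libcall will
// be materialized; dropping it on the tail could reintroduce a memset call
// for the last few bytes.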
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h index dac62973636c..19136ca4f6f5 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -29,7 +29,7 @@ public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVolatile, + bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index dba11e8b4000..3317db891cf0 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -181,17 +181,18 @@ private: void tracePredStateThroughBlocksAndHarden(MachineFunction &MF); unsigned saveEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc); + MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc); void restoreEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc, Register Reg); void mergePredStateIntoSP(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - unsigned PredStateReg); + MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, unsigned PredStateReg); unsigned extractPredStateFromSP(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc); + const DebugLoc &Loc); void hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO, @@ -203,7 +204,7 @@ private: bool canHardenRegister(Register Reg); unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc); + const DebugLoc &Loc); unsigned hardenPostLoad(MachineInstr &MI); void hardenReturnInstr(MachineInstr &MI); void tracePredStateThroughCall(MachineInstr &MI); @@ -356,8 +357,8 @@ static void canonicalizePHIOperands(MachineFunction &MF) { int OpIdx = DupIndices.pop_back_val(); // Remove both the block and value operand, again in reverse order to // preserve indices. - MI.RemoveOperand(OpIdx + 1); - MI.RemoveOperand(OpIdx); + MI.removeOperand(OpIdx + 1); + MI.removeOperand(OpIdx); } Preds.clear(); @@ -1500,7 +1501,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden( /// as the save so that no PHI nodes are inserted. unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { // FIXME: Hard coding this to a 32-bit register class seems weird, but matches // what instruction selection does. Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass); @@ -1517,8 +1518,8 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( /// This must be done within the same basic block as the save in order to /// reliably lower. 
void X86SpeculativeLoadHardeningPass::restoreEFLAGS( - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - Register Reg) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, Register Reg) { BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg); ++NumInstsInserted; } @@ -1528,8 +1529,8 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS( /// a way that won't form non-canonical pointers and also will be preserved /// across normal stack adjustments. void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - unsigned PredStateReg) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, unsigned PredStateReg) { Register TmpReg = MRI->createVirtualRegister(PS->RC); // FIXME: This hard codes a shift distance based on the number of bits needed // to stay canonical on 64-bit. We should compute this somehow and support @@ -1549,7 +1550,7 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( /// Extracts the predicate state stored in the high bits of the stack pointer. unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { Register PredStateReg = MRI->createVirtualRegister(PS->RC); Register TmpReg = MRI->createVirtualRegister(PS->RC); @@ -1907,7 +1908,7 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { /// register class as `Reg`. unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); assert(Reg.isVirtual() && "Cannot harden a physical register!"); diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index a3d4d04b1e0d..0d091adc8e77 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -21,6 +21,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -247,7 +249,7 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, // the following check for Win32 should be removed. - if (In64BitMode || isTargetWin32()) + if (Is64Bit || isTargetWin32()) return false; return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } @@ -274,12 +276,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, // introduced with Intel's Nehalem/Silvermont and AMD's Family10h // micro-architectures respectively. 
if (hasSSE42() || hasSSE4A()) - IsUAMem16Slow = false; + IsUnalignedMem16Slow = false; LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); - if (In64BitMode && !HasX86_64) + if (Is64Bit && !HasX86_64) report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); @@ -289,7 +291,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, if (StackAlignOverride) stackAlignment = *StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || - isTargetNaCl() || In64BitMode) + isTargetNaCl() || Is64Bit) stackAlignment = Align(16); // Consume the vector width attribute or apply any target specific limit. @@ -357,7 +359,7 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { } bool X86Subtarget::enableEarlyIfConversion() const { - return hasCMov() && X86EarlyIfConv; + return canUseCMOV() && X86EarlyIfConv; } void X86Subtarget::getPostRAMutations( diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 5d773f0c57df..09a8b1f1aafb 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -50,24 +50,14 @@ enum class Style { } // end namespace PICStyles class X86Subtarget final : public X86GenSubtargetInfo { - // NOTE: Do not add anything new to this list. Coarse, CPU name based flags - // are not a good idea. We should be migrating away from these. - enum X86ProcFamilyEnum { - Others, - IntelAtom - }; - enum X86SSEEnum { - NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512 }; enum X863DNowEnum { NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; - /// X86 processor family: Intel Atom, and others - X86ProcFamilyEnum X86ProcFamily = Others; - /// Which PIC style to use PICStyles::Style PICStyle; @@ -79,412 +69,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel = NoThreeDNow; - /// True if the processor supports X87 instructions. - bool HasX87 = false; - - /// True if the processor supports CMPXCHG8B. - bool HasCmpxchg8b = false; - - /// True if this processor has NOPL instruction - /// (generally pentium pro+). - bool HasNOPL = false; - - /// True if this processor has conditional move instructions - /// (generally pentium pro+). - bool HasCMov = false; - - /// True if the processor supports X86-64 instructions. - bool HasX86_64 = false; - - /// True if the processor supports POPCNT. - bool HasPOPCNT = false; - - /// True if the processor supports SSE4A instructions. 
- bool HasSSE4A = false; - - /// Target has AES instructions - bool HasAES = false; - bool HasVAES = false; - - /// Target has FXSAVE/FXRESTOR instructions - bool HasFXSR = false; - - /// Target has XSAVE instructions - bool HasXSAVE = false; - - /// Target has XSAVEOPT instructions - bool HasXSAVEOPT = false; - - /// Target has XSAVEC instructions - bool HasXSAVEC = false; - - /// Target has XSAVES instructions - bool HasXSAVES = false; - - /// Target has carry-less multiplication - bool HasPCLMUL = false; - bool HasVPCLMULQDQ = false; - - /// Target has Galois Field Arithmetic instructions - bool HasGFNI = false; - - /// Target has 3-operand fused multiply-add - bool HasFMA = false; - - /// Target has 4-operand fused multiply-add - bool HasFMA4 = false; - - /// Target has XOP instructions - bool HasXOP = false; - - /// Target has TBM instructions. - bool HasTBM = false; - - /// Target has LWP instructions - bool HasLWP = false; - - /// True if the processor has the MOVBE instruction. - bool HasMOVBE = false; - - /// True if the processor has the RDRAND instruction. - bool HasRDRAND = false; - - /// Processor has 16-bit floating point conversion instructions. - bool HasF16C = false; - - /// Processor has FS/GS base insturctions. - bool HasFSGSBase = false; - - /// Processor has LZCNT instruction. - bool HasLZCNT = false; - - /// Processor has BMI1 instructions. - bool HasBMI = false; - - /// Processor has BMI2 instructions. - bool HasBMI2 = false; - - /// Processor has VBMI instructions. - bool HasVBMI = false; - - /// Processor has VBMI2 instructions. - bool HasVBMI2 = false; - - /// Processor has Integer Fused Multiply Add - bool HasIFMA = false; - - /// Processor has RTM instructions. - bool HasRTM = false; - - /// Processor has ADX instructions. - bool HasADX = false; - - /// Processor has SHA instructions. - bool HasSHA = false; - - /// Processor has PRFCHW instructions. - bool HasPRFCHW = false; - - /// Processor has RDSEED instructions. - bool HasRDSEED = false; - - /// Processor has LAHF/SAHF instructions in 64-bit mode. - bool HasLAHFSAHF64 = false; - - /// Processor has MONITORX/MWAITX instructions. - bool HasMWAITX = false; - - /// Processor has Cache Line Zero instruction - bool HasCLZERO = false; - - /// Processor has Cache Line Demote instruction - bool HasCLDEMOTE = false; - - /// Processor has MOVDIRI instruction (direct store integer). - bool HasMOVDIRI = false; - - /// Processor has MOVDIR64B instruction (direct store 64 bytes). - bool HasMOVDIR64B = false; - - /// Processor has ptwrite instruction. - bool HasPTWRITE = false; - - /// Processor has Prefetch with intent to Write instruction - bool HasPREFETCHWT1 = false; - - /// True if SHLD instructions are slow. - bool IsSHLDSlow = false; - - /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and - // PMULUDQ. - bool IsPMULLDSlow = false; - - /// True if the PMADDWD instruction is slow compared to PMULLD. - bool IsPMADDWDSlow = false; - - /// True if unaligned memory accesses of 16-bytes are slow. - bool IsUAMem16Slow = false; - - /// True if unaligned memory accesses of 32-bytes are slow. - bool IsUAMem32Slow = false; - - /// True if SSE operations can have unaligned memory operands. - /// This may require setting a configuration bit in the processor. - bool HasSSEUnalignedMem = false; - - /// True if this processor has the CMPXCHG16B instruction; - /// this is true for most x86-64 chips, but not the first AMD chips. 
- bool HasCmpxchg16b = false; - - /// True if the LEA instruction should be used for adjusting - /// the stack pointer. This is an optimization for Intel Atom processors. - bool UseLeaForSP = false; - - /// True if POPCNT instruction has a false dependency on the destination register. - bool HasPOPCNTFalseDeps = false; - - /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. - bool HasLZCNTFalseDeps = false; - - /// True if its preferable to combine to a single cross-lane shuffle - /// using a variable mask over multiple fixed shuffles. - bool HasFastVariableCrossLaneShuffle = false; - - /// True if its preferable to combine to a single per-lane shuffle - /// using a variable mask over multiple fixed shuffles. - bool HasFastVariablePerLaneShuffle = false; - - /// True if vzeroupper instructions should be inserted after code that uses - /// ymm or zmm registers. - bool InsertVZEROUPPER = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 7 bytes. - bool HasFast7ByteNOP = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 11 bytes. - bool HasFast11ByteNOP = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 15 bytes. - bool HasFast15ByteNOP = false; - - /// True if gather is reasonably fast. This is true for Skylake client and - /// all AVX-512 CPUs. - bool HasFastGather = false; - - /// True if hardware SQRTSS instruction is at least as fast (latency) as - /// RSQRTSS followed by a Newton-Raphson iteration. - bool HasFastScalarFSQRT = false; - - /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast - /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. - bool HasFastVectorFSQRT = false; - - /// True if 8-bit divisions are significantly faster than - /// 32-bit divisions and should be used when possible. - bool HasSlowDivide32 = false; - - /// True if 32-bit divides are significantly faster than - /// 64-bit divisions and should be used when possible. - bool HasSlowDivide64 = false; - - /// True if LZCNT instruction is fast. - bool HasFastLZCNT = false; - - /// True if SHLD based rotate is fast. - bool HasFastSHLDRotate = false; - - /// True if the processor supports macrofusion. - bool HasMacroFusion = false; - - /// True if the processor supports branch fusion. - bool HasBranchFusion = false; - - /// True if the processor has enhanced REP MOVSB/STOSB. - bool HasERMSB = false; - - /// True if the processor has fast short REP MOV. - bool HasFSRM = false; - - /// True if the short functions should be padded to prevent - /// a stall when returning too early. - bool PadShortFunctions = false; - - /// True if two memory operand instructions should use a temporary register - /// instead. - bool SlowTwoMemOps = false; - - /// True if the LEA instruction inputs have to be ready at address generation - /// (AG) time. 
- bool LEAUsesAG = false; - - /// True if the LEA instruction with certain arguments is slow - bool SlowLEA = false; - - /// True if the LEA instruction has all three source operands: base, index, - /// and offset or if the LEA instruction uses base and index registers where - /// the base is EBP, RBP,or R13 - bool Slow3OpsLEA = false; - - /// True if INC and DEC instructions are slow when writing to flags - bool SlowIncDec = false; - - /// Processor has AVX-512 PreFetch Instructions - bool HasPFI = false; - - /// Processor has AVX-512 Exponential and Reciprocal Instructions - bool HasERI = false; - - /// Processor has AVX-512 Conflict Detection Instructions - bool HasCDI = false; - - /// Processor has AVX-512 population count Instructions - bool HasVPOPCNTDQ = false; - - /// Processor has AVX-512 Doubleword and Quadword instructions - bool HasDQI = false; - - /// Processor has AVX-512 Byte and Word instructions - bool HasBWI = false; - - /// Processor has AVX-512 Vector Length eXtenstions - bool HasVLX = false; - - /// Processor has AVX-512 16 bit floating-point extenstions - bool HasFP16 = false; - - /// Processor has PKU extenstions - bool HasPKU = false; - - /// Processor has AVX-512 Vector Neural Network Instructions - bool HasVNNI = false; - - /// Processor has AVX Vector Neural Network Instructions - bool HasAVXVNNI = false; - - /// Processor has AVX-512 bfloat16 floating-point extensions - bool HasBF16 = false; - - /// Processor supports ENQCMD instructions - bool HasENQCMD = false; - - /// Processor has AVX-512 Bit Algorithms instructions - bool HasBITALG = false; - - /// Processor has AVX-512 vp2intersect instructions - bool HasVP2INTERSECT = false; - - /// Processor supports CET SHSTK - Control-Flow Enforcement Technology - /// using Shadow Stack - bool HasSHSTK = false; - - /// Processor supports Invalidate Process-Context Identifier - bool HasINVPCID = false; - - /// Processor has Software Guard Extensions - bool HasSGX = false; - - /// Processor supports Flush Cache Line instruction - bool HasCLFLUSHOPT = false; - - /// Processor supports Cache Line Write Back instruction - bool HasCLWB = false; - - /// Processor supports Write Back No Invalidate instruction - bool HasWBNOINVD = false; - - /// Processor support RDPID instruction - bool HasRDPID = false; - - /// Processor supports WaitPKG instructions - bool HasWAITPKG = false; - - /// Processor supports PCONFIG instruction - bool HasPCONFIG = false; - - /// Processor support key locker instructions - bool HasKL = false; - - /// Processor support key locker wide instructions - bool HasWIDEKL = false; - - /// Processor supports HRESET instruction - bool HasHRESET = false; - - /// Processor supports SERIALIZE instruction - bool HasSERIALIZE = false; - - /// Processor supports TSXLDTRK instruction - bool HasTSXLDTRK = false; - - /// Processor has AMX support - bool HasAMXTILE = false; - bool HasAMXBF16 = false; - bool HasAMXINT8 = false; - - /// Processor supports User Level Interrupt instructions - bool HasUINTR = false; - - /// Enable SSE4.2 CRC32 instruction (Used when SSE4.2 is supported but - /// function is GPR only) - bool HasCRC32 = false; - - /// Processor has a single uop BEXTR implementation. - bool HasFastBEXTR = false; - - /// Try harder to combine to horizontal vector ops if they are fast. - bool HasFastHorizontalOps = false; - - /// Prefer a left/right scalar logical shifts pair over a shift+and pair. 
- bool HasFastScalarShiftMasks = false; - - /// Prefer a left/right vector logical shifts pair over a shift+and pair. - bool HasFastVectorShiftMasks = false; - - /// Prefer a movbe over a single-use load + bswap / single-use bswap + store. - bool HasFastMOVBE = false; - - /// Use a retpoline thunk rather than indirect calls to block speculative - /// execution. - bool UseRetpolineIndirectCalls = false; - - /// Use a retpoline thunk or remove any indirect branch to block speculative - /// execution. - bool UseRetpolineIndirectBranches = false; - - /// Deprecated flag, query `UseRetpolineIndirectCalls` and - /// `UseRetpolineIndirectBranches` instead. - bool DeprecatedUseRetpoline = false; - - /// When using a retpoline thunk, call an externally provided thunk rather - /// than emitting one inside the compiler. - bool UseRetpolineExternalThunk = false; - - /// Prevent generation of indirect call/branch instructions from memory, - /// and force all indirect call/branch instructions from a register to be - /// preceded by an LFENCE. Also decompose RET instructions into a - /// POP+LFENCE+JMP sequence. - bool UseLVIControlFlowIntegrity = false; - - /// Enable Speculative Execution Side Effect Suppression - bool UseSpeculativeExecutionSideEffectSuppression = false; - - /// Insert LFENCE instructions to prevent data speculatively injected into - /// loads from being used maliciously. - bool UseLVILoadHardening = false; - - /// Use an instruction sequence for taking the address of a global that allows - /// a memory tag in the upper address bits. - bool AllowTaggedGlobals = false; - - /// Use software floating point for code generation. - bool UseSoftFloat = false; - - /// Use alias analysis during code generation. - bool UseAA = false; - +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "X86GenSubtargetInfo.inc" /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. Align stackAlignment = Align(4); @@ -496,21 +83,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { // FIXME: this is a known good value for Yonah. How about others? unsigned MaxInlineSizeThreshold = 128; - /// Indicates target prefers 128 bit instructions. - bool Prefer128Bit = false; - - /// Indicates target prefers 256 bit instructions. - bool Prefer256Bit = false; - - /// Indicates target prefers AVX512 mask registers. - bool PreferMaskRegisters = false; - - /// Use Silvermont specific arithmetic costs. - bool UseSLMArithCosts = false; - - /// Use Goldmont specific floating point div/sqrt costs. - bool UseGLMDivSqrtCosts = false; - /// What processor and OS we're targeting. Triple TargetTriple; @@ -520,7 +92,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { std::unique_ptr RegBankInfo; std::unique_ptr InstSelector; -private: /// Override the stack alignment. MaybeAlign StackAlignOverride; @@ -534,15 +105,6 @@ private: /// Required vector width from function attribute. unsigned RequiredVectorWidth; - /// True if compiling for 64-bit, false for 16-bit or 32-bit. - bool In64BitMode = false; - - /// True if compiling for 32-bit, false for 16-bit or 64-bit. - bool In32BitMode = false; - - /// True if compiling for 16-bit, false for 32-bit or 64-bit. - bool In16BitMode = false; - X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. 
@@ -608,38 +170,32 @@ private: void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); public: - /// Is this x86_64? (disregarding specific ABI / programming model) - bool is64Bit() const { - return In64BitMode; - } - bool is32Bit() const { - return In32BitMode; - } - - bool is16Bit() const { - return In16BitMode; - } +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const { return ATTRIBUTE; } +#include "X86GenSubtargetInfo.inc" /// Is this x86_64 with the ILP32 programming model (x32 ABI)? bool isTarget64BitILP32() const { - return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); + return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? bool isTarget64BitLP64() const { - return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); + return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); } PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } - bool hasX87() const { return HasX87; } - bool hasCmpxchg8b() const { return HasCmpxchg8b; } - bool hasNOPL() const { return HasNOPL; } + bool canUseCMPXCHG8B() const { return hasCX8(); } + bool canUseCMPXCHG16B() const { + // CX16 is just the CPUID bit, instruction requires 64-bit mode too. + return hasCX16() && is64Bit(); + } // SSE codegen depends on cmovs, and all SSE1+ processors support them. // All 64-bit processors support cmov. - bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); } + bool canUseCMOV() const { return hasCMOV() || hasSSE1() || is64Bit(); } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -648,146 +204,26 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } - bool hasAVX512() const { return X86SSELevel >= AVX512F; } + bool hasAVX512() const { return X86SSELevel >= AVX512; } bool hasInt256() const { return hasAVX2(); } - bool hasSSE4A() const { return HasSSE4A; } bool hasMMX() const { return X863DNowLevel >= MMX; } - bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } - bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } - bool hasPOPCNT() const { return HasPOPCNT; } - bool hasAES() const { return HasAES; } - bool hasVAES() const { return HasVAES; } - bool hasFXSR() const { return HasFXSR; } - bool hasXSAVE() const { return HasXSAVE; } - bool hasXSAVEOPT() const { return HasXSAVEOPT; } - bool hasXSAVEC() const { return HasXSAVEC; } - bool hasXSAVES() const { return HasXSAVES; } - bool hasPCLMUL() const { return HasPCLMUL; } - bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; } - bool hasGFNI() const { return HasGFNI; } - // Prefer FMA4 to FMA - its better for commutation/memory folding and - // has equal or better performance on all supported targets. 
- bool hasFMA() const { return HasFMA; } - bool hasFMA4() const { return HasFMA4; } + bool hasThreeDNow() const { return X863DNowLevel >= ThreeDNow; } + bool hasThreeDNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } - bool hasXOP() const { return HasXOP; } - bool hasTBM() const { return HasTBM; } - bool hasLWP() const { return HasLWP; } - bool hasMOVBE() const { return HasMOVBE; } - bool hasRDRAND() const { return HasRDRAND; } - bool hasF16C() const { return HasF16C; } - bool hasFSGSBase() const { return HasFSGSBase; } - bool hasLZCNT() const { return HasLZCNT; } - bool hasBMI() const { return HasBMI; } - bool hasBMI2() const { return HasBMI2; } - bool hasVBMI() const { return HasVBMI; } - bool hasVBMI2() const { return HasVBMI2; } - bool hasIFMA() const { return HasIFMA; } - bool hasRTM() const { return HasRTM; } - bool hasADX() const { return HasADX; } - bool hasSHA() const { return HasSHA; } - bool hasPRFCHW() const { return HasPRFCHW; } - bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } bool hasPrefetchW() const { // The PREFETCHW instruction was added with 3DNow but later CPUs gave it // its own CPUID bit as part of deprecating 3DNow. Intel eventually added // it and KNL has another that prefetches to L2 cache. We assume the // L1 version exists if the L2 version does. - return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); + return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1(); } bool hasSSEPrefetch() const { // We implicitly enable these when we have a write prefix supporting cache // level OR if we have prfchw, but don't already have a read prefetch from // 3dnow. - return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); - } - bool hasRDSEED() const { return HasRDSEED; } - bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } - bool hasMWAITX() const { return HasMWAITX; } - bool hasCLZERO() const { return HasCLZERO; } - bool hasCLDEMOTE() const { return HasCLDEMOTE; } - bool hasMOVDIRI() const { return HasMOVDIRI; } - bool hasMOVDIR64B() const { return HasMOVDIR64B; } - bool hasPTWRITE() const { return HasPTWRITE; } - bool isSHLDSlow() const { return IsSHLDSlow; } - bool isPMULLDSlow() const { return IsPMULLDSlow; } - bool isPMADDWDSlow() const { return IsPMADDWDSlow; } - bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } - bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } - bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } - bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } - bool useLeaForSP() const { return UseLeaForSP; } - bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } - bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } - bool hasFastVariableCrossLaneShuffle() const { - return HasFastVariableCrossLaneShuffle; - } - bool hasFastVariablePerLaneShuffle() const { - return HasFastVariablePerLaneShuffle; + return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1(); } - bool insertVZEROUPPER() const { return InsertVZEROUPPER; } - bool hasFastGather() const { return HasFastGather; } - bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } - bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } - bool hasFastLZCNT() const { return HasFastLZCNT; } - bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } - bool hasFastBEXTR() const { return HasFastBEXTR; } - bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } - bool hasFastScalarShiftMasks() const { return 
HasFastScalarShiftMasks; } - bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } - bool hasFastMOVBE() const { return HasFastMOVBE; } - bool hasMacroFusion() const { return HasMacroFusion; } - bool hasBranchFusion() const { return HasBranchFusion; } - bool hasERMSB() const { return HasERMSB; } - bool hasFSRM() const { return HasFSRM; } - bool hasSlowDivide32() const { return HasSlowDivide32; } - bool hasSlowDivide64() const { return HasSlowDivide64; } - bool padShortFunctions() const { return PadShortFunctions; } - bool slowTwoMemOps() const { return SlowTwoMemOps; } - bool LEAusesAG() const { return LEAUsesAG; } - bool slowLEA() const { return SlowLEA; } - bool slow3OpsLEA() const { return Slow3OpsLEA; } - bool slowIncDec() const { return SlowIncDec; } - bool hasCDI() const { return HasCDI; } - bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } - bool hasPFI() const { return HasPFI; } - bool hasERI() const { return HasERI; } - bool hasDQI() const { return HasDQI; } - bool hasBWI() const { return HasBWI; } - bool hasVLX() const { return HasVLX; } - bool hasFP16() const { return HasFP16; } - bool hasPKU() const { return HasPKU; } - bool hasVNNI() const { return HasVNNI; } - bool hasBF16() const { return HasBF16; } - bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } - bool hasBITALG() const { return HasBITALG; } - bool hasSHSTK() const { return HasSHSTK; } - bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } - bool hasCLWB() const { return HasCLWB; } - bool hasWBNOINVD() const { return HasWBNOINVD; } - bool hasRDPID() const { return HasRDPID; } - bool hasWAITPKG() const { return HasWAITPKG; } - bool hasPCONFIG() const { return HasPCONFIG; } - bool hasSGX() const { return HasSGX; } - bool hasINVPCID() const { return HasINVPCID; } - bool hasENQCMD() const { return HasENQCMD; } - bool hasKL() const { return HasKL; } - bool hasWIDEKL() const { return HasWIDEKL; } - bool hasHRESET() const { return HasHRESET; } - bool hasSERIALIZE() const { return HasSERIALIZE; } - bool hasTSXLDTRK() const { return HasTSXLDTRK; } - bool hasUINTR() const { return HasUINTR; } - bool hasCRC32() const { return HasCRC32; } - bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } - bool useRetpolineIndirectBranches() const { - return UseRetpolineIndirectBranches; - } - bool hasAVXVNNI() const { return HasAVXVNNI; } - bool hasAMXTILE() const { return HasAMXTILE; } - bool hasAMXBF16() const { return HasAMXBF16; } - bool hasAMXINT8() const { return HasAMXINT8; } - bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } - + bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); } // These are generic getters that OR together all of the thunk types // supported by the subtarget. Therefore useIndirectThunk*() will return true // if any respective thunk feature is enabled. 
@@ -798,16 +234,6 @@ public: return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); } - bool preferMaskRegisters() const { return PreferMaskRegisters; } - bool useSLMArithCosts() const { return UseSLMArithCosts; } - bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } - bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } - bool allowTaggedGlobals() const { return AllowTaggedGlobals; } - bool useLVILoadHardening() const { return UseLVILoadHardening; } - bool useSpeculativeExecutionSideEffectSuppression() const { - return UseSpeculativeExecutionSideEffectSuppression; - } - unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -834,11 +260,6 @@ public: bool isXRaySupported() const override { return is64Bit(); } - /// TODO: to be removed later and replaced with suitable properties - bool isAtom() const { return X86ProcFamily == IntelAtom; } - bool useSoftFloat() const { return UseSoftFloat; } - bool useAA() const override { return UseAA; } - /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor /// supports it. @@ -850,7 +271,7 @@ public: bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } - bool isTargetPS4() const { return TargetTriple.isPS4CPU(); } + bool isTargetPS() const { return TargetTriple.isPS(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } @@ -890,9 +311,9 @@ public: bool isOSWindows() const { return TargetTriple.isOSWindows(); } - bool isTargetWin64() const { return In64BitMode && isOSWindows(); } + bool isTargetWin64() const { return Is64Bit && isOSWindows(); } - bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } + bool isTargetWin32() const { return !Is64Bit && isOSWindows(); } bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; } bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; } @@ -990,8 +411,6 @@ public: AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; } - - bool enableAdvancedRASplitCost() const override { return false; } }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index e3d0128dd73d..4249788e3540 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -27,13 +27,16 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDomainFix.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" @@ -56,6 +59,11 @@ static cl::opt EnableMachineCombinerPass("x86-machine-combiner", 
cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt + EnableTileRAPass("x86-tile-ra", + cl::desc("Enable the tile register allocation pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine X(getTheX86_32Target()); @@ -65,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86LowerAMXIntrinsicsLegacyPassPass(PR); initializeX86LowerAMXTypeLegacyPassPass(PR); initializeX86PreAMXConfigPassPass(PR); + initializeX86PreTileConfigPass(PR); initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); @@ -75,6 +84,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); initializeX86TileConfigPass(PR); + initializeX86FastPreTileConfigPass(PR); initializeX86FastTileConfigPass(PR); initializeX86LowerTileCopyPass(PR); initializeX86ExpandPseudoPass(PR); @@ -154,7 +164,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT, Optional RM) { bool is64Bit = TT.getArch() == Triple::x86_64; - if (!RM.hasValue()) { + if (!RM) { // JIT codegen should use static relocations by default, since it's // typically executed in process and not relocatable. if (JIT) @@ -218,9 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) { - // On PS4, the "return address" of a 'noreturn' call must still be within + // On PS4/PS5, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. 
- if (TT.isPS4() || TT.isOSBinFormatMachO()) { + if (TT.isPS() || TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } @@ -333,7 +343,7 @@ bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, //===----------------------------------------------------------------------===// TargetTransformInfo -X86TargetMachine::getTargetTransformInfo(const Function &F) { +X86TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(X86TTIImpl(this, F)); } @@ -382,7 +392,7 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; - bool addPreRewrite() override; + bool addRegAssignAndRewriteOptimized() override; std::unique_ptr getCSEConfig() const override; }; @@ -417,9 +427,6 @@ void X86PassConfig::addIRPasses() { addPass(createX86LowerAMXIntrinsicsPass()); addPass(createX86LowerAMXTypePass()); - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(createX86PreAMXConfigPass()); - TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOpt::None) { @@ -441,6 +448,9 @@ void X86PassConfig::addIRPasses() { addPass(createCFGuardCheckPass()); } } + + if (TM->Options.JMCInstrument) + addPass(createJMCInstrumenterPass()); } bool X86PassConfig::addInstSelector() { @@ -505,9 +515,10 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86FlagsCopyLoweringPass()); addPass(createX86DynAllocaExpander()); - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(createX86PreTileConfigPass()); - } + else + addPass(createX86FastPreTileConfigPass()); } void X86PassConfig::addMachineSSAOptimization() { @@ -607,11 +618,21 @@ bool X86PassConfig::addPostFastRegAllocRewrite() { return true; } -bool X86PassConfig::addPreRewrite() { - addPass(createX86TileConfigPass()); - return true; -} - std::unique_ptr X86PassConfig::getCSEConfig() const { return getStandardCSEConfigForOpt(TM->getOptLevel()); } + +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteOptimized() { + // Don't support tile RA when RA is specified by command line "-regalloc". + if (!isCustomizedRegAlloc() && EnableTileRAPass) { + // Allocate tile register first. + addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters)); + addPass(createX86TileConfigPass()); + } + return TargetPassConfig::addRegAssignAndRewriteOptimized(); +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 69d7e48b8977..70df8da77641 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -44,7 +44,7 @@ public: // attributes of each function. const X86Subtarget *getSubtargetImpl() const = delete; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; // Set up the pass pipeline. 
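// Editor's sketch of the two-round allocation pattern that
// addRegAssignAndRewriteOptimized() above relies on: run the greedy allocator
// restricted to one register class via a filter functor, insert a fix-up pass
// between the rounds, then let the default pipeline allocate everything else.
// The 'Foo*' names are hypothetical placeholders; the usual CodeGen includes
// are assumed.
static bool onlyAllocateFooRegisters(const TargetRegisterInfo &TRI,
                                     const TargetRegisterClass &RC) {
  // Claim only the class the first round should assign.
  return StringRef(TRI.getRegClassName(&RC)) == "FOO";
}
bool FooPassConfig::addRegAssignAndRewriteOptimized() {
  addPass(createGreedyRegisterAllocator(onlyAllocateFooRegisters));
  addPass(createFooConfigPass()); // hypothetical fix-up between the rounds
  return TargetPassConfig::addRegAssignAndRewriteOptimized(); // the rest
}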
TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 5b95c10332dc..b36f8a3d06d0 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef Mask, int Index, - VectorType *SubTp) { + VectorType *SubTp, + ArrayRef Args) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); @@ -1223,6 +1224,63 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), LegalVT.getVectorNumElements()); + if (!Mask.empty() && NumOfDests.isValid()) { + // Try to perform better estimation of the permutation. + // 1. Split the source/destination vectors into real registers. + // 2. Do the mask analysis to identify which real registers are + // permuted. If more than 1 source registers are used for the + // destination register building, the cost for this destination register + // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one + // source register is used, build mask and calculate the cost as a cost + // of PermuteSingleSrc. + // Also, for the single register permute we try to identify if the + // destination register is just a copy of the source register or the + // copy of the previous destination register (the cost is + // TTI::TCC_Basic). If the source register is just reused, the cost for + // this operation is 0. + unsigned E = *NumOfDests.getValue(); + unsigned NormalizedVF = + LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); + unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); + unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); + SmallVector NormalizedMask(NormalizedVF, UndefMaskElem); + copy(Mask, NormalizedMask.begin()); + unsigned PrevSrcReg = 0; + ArrayRef PrevRegMask; + InstructionCost Cost = 0; + processShuffleMasks( + NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, + [this, SingleOpTy, &PrevSrcReg, &PrevRegMask, + &Cost](ArrayRef RegMask, unsigned SrcReg, unsigned DestReg) { + if (!ShuffleVectorInst::isIdentityMask(RegMask)) { + // Check if the previous register can be just copied to the next + // one. + if (PrevRegMask.empty() || PrevSrcReg != SrcReg || + PrevRegMask != RegMask) + Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, + RegMask, 0, nullptr); + else + // Just a copy of previous destination register. + Cost += TTI::TCC_Basic; + return; + } + if (SrcReg != DestReg && + any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { + // Just a copy of the source register. 
+ Cost += TTI::TCC_Basic; + } + PrevSrcReg = SrcReg; + PrevRegMask = RegMask; + }, + [this, SingleOpTy, &Cost](ArrayRef RegMask, + unsigned /*Unused*/, + unsigned /*Unused*/) { + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, + 0, nullptr); + }); + return Cost; + } + InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, None, 0, nullptr); @@ -1545,9 +1603,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute }; - if (ST->hasSSE2()) + static const CostTblEntry SSE3BroadcastLoadTbl[] = { + {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup + }; + + if (ST->hasSSE2()) { + bool IsLoad = + llvm::any_of(Args, [](const auto &V) { return isa(V); }); + if (ST->hasSSE3() && IsLoad) + if (const auto *Entry = + CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { + assert(isLegalBroadcastLoad(BaseTp->getElementType(), + LT.second.getVectorElementCount()) && + "Table entry missing from isLegalBroadcastLoad()"); + return LT.first * Entry->Cost; + } + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; + } static const CostTblEntry SSE1ShuffleTbl[] = { { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps @@ -2444,6 +2518,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, std::pair LTDest = TLI->getTypeLegalizationCost(DL, Dst); + // If we're truncating to the same legalized type - just assume its free. + if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) + return TTI::TCC_Free; + if (ST->useAVX512Regs()) { if (ST->hasBWI()) if (const auto *Entry = ConvertCostTableLookup( @@ -2545,7 +2623,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - unsigned ExtraCost = 0; + InstructionCost ExtraCost = 0; if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { // Some vector comparison predicates cost extra instructions. 
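// Editor's illustration (hypothetical helper, not part of the patch): the
// source shape the SSE3BroadcastLoadTbl entry above makes free. When the
// scalar being splatted is itself a load, x86 folds load and broadcast into a
// single movddup, so the shuffle costs nothing extra - hence the gate on Args
// containing a LoadInst.
#include <immintrin.h>
__m128d splat_from_memory(const double *P) {
  return _mm_loaddup_pd(P); // movddup (%rdi), %xmm0: load + broadcast in one
}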
// TODO: Should we invert this and assume worst case cmp costs @@ -2619,15 +2697,29 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16f32, 1 }, { ISD::SELECT, MVT::v8i64, 1 }, + { ISD::SELECT, MVT::v4i64, 1 }, + { ISD::SELECT, MVT::v2i64, 1 }, { ISD::SELECT, MVT::v16i32, 1 }, + { ISD::SELECT, MVT::v8i32, 1 }, + { ISD::SELECT, MVT::v4i32, 1 }, { ISD::SELECT, MVT::v8f64, 1 }, + { ISD::SELECT, MVT::v4f64, 1 }, + { ISD::SELECT, MVT::v2f64, 1 }, + { ISD::SELECT, MVT::f64, 1 }, { ISD::SELECT, MVT::v16f32, 1 }, + { ISD::SELECT, MVT::v8f32 , 1 }, + { ISD::SELECT, MVT::v4f32, 1 }, + { ISD::SELECT, MVT::f32 , 1 }, { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 - { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 - { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 + { ISD::SELECT, MVT::v32i16, 2 }, + { ISD::SELECT, MVT::v16i16, 1 }, + { ISD::SELECT, MVT::v8i16, 1 }, + { ISD::SELECT, MVT::v64i8, 2 }, + { ISD::SELECT, MVT::v32i8, 1 }, + { ISD::SELECT, MVT::v16i8, 1 }, }; static const CostTblEntry AVX2CostTbl[] = { @@ -2636,10 +2728,12 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, - { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb - { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb - { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb - { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb + { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb + { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb + { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb + { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb }; static const CostTblEntry AVX1CostTbl[] = { @@ -2651,49 +2745,54 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i16, 4 }, { ISD::SETCC, MVT::v32i8, 4 }, - { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd - { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps - { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd - { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps + { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd + { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps }; static const CostTblEntry SSE42CostTbl[] = { - { ISD::SETCC, MVT::v2f64, 1 }, - { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, }; static const CostTblEntry SSE41CostTbl[] = { - { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd - { ISD::SELECT, MVT::v4f32, 1 }, // blendvps - { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb - { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb - { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb - { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + + { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd + { ISD::SELECT, MVT::f64, 2 }, // blendvpd + { ISD::SELECT, MVT::v4f32, 2 }, // blendvps + { ISD::SELECT, MVT::f32 , 2 }, // blendvps + { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb + { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb + { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb + { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb }; static const CostTblEntry SSE2CostTbl[] = { { ISD::SETCC, MVT::v2f64, 2 }, { ISD::SETCC, MVT::f64, 1 }, - { ISD::SETCC, MVT::v2i64, 8 }, + { ISD::SETCC, 
MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion { ISD::SETCC, MVT::v4i32, 1 }, { ISD::SETCC, MVT::v8i16, 1 }, { ISD::SETCC, MVT::v16i8, 1 }, - { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd - { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por }; static const CostTblEntry SSE1CostTbl[] = { { ISD::SETCC, MVT::v4f32, 2 }, { ISD::SETCC, MVT::f32, 1 }, - { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps + { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps + { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps }; if (ST->useSLMArithCosts()) @@ -3555,7 +3654,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, assert(Val->isVectorTy() && "This must be a vector type"); Type *ScalarType = Val->getScalarType(); - int RegisterFileMoveCost = 0; + InstructionCost RegisterFileMoveCost = 0; // Non-immediate extraction/insertion can be handled as a sequence of // aliased loads+stores via the stack. @@ -3589,6 +3688,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (Index != -1U && (Opcode == Instruction::ExtractElement || Opcode == Instruction::InsertElement)) { + // Extraction of vXi1 elements are now efficiently handled by MOVMSK. + if (Opcode == Instruction::ExtractElement && + ScalarType->getScalarSizeInBits() == 1 && + cast(Val)->getNumElements() > 1) + return 1; + // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Val); @@ -3597,15 +3702,16 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 0; // The type may be split. Normalize the index to the new type. + unsigned SizeInBits = LT.second.getSizeInBits(); unsigned NumElts = LT.second.getVectorNumElements(); unsigned SubNumElts = NumElts; Index = Index % NumElts; // For >128-bit vectors, we need to extract higher 128-bit subvectors. // For inserts, we also need to insert the subvector back. - if (LT.second.getSizeInBits() > 128) { - assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); - unsigned NumSubVecs = LT.second.getSizeInBits() / 128; + if (SizeInBits > 128) { + assert((SizeInBits % 128) == 0 && "Illegal vector"); + unsigned NumSubVecs = SizeInBits / 128; SubNumElts = NumElts / NumSubVecs; if (SubNumElts <= Index) { RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); @@ -3673,20 +3779,25 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { + assert(DemandedElts.getBitWidth() == + cast(Ty)->getNumElements() && + "Vector size mismatch"); + + std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + MVT MScalarTy = LT.second.getScalarType(); + unsigned SizeInBits = LT.second.getSizeInBits(); + InstructionCost Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. 
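// Editor's illustration (hypothetical helper): why the vXi1 extractelement
// above is modeled as cost 1. A vector compare leaves each lane's result in
// its sign bit, and MOVMSK transfers all sign bits to a GPR in one
// instruction; any single i1 is then a shift-and-mask away.
#include <immintrin.h>
bool lane2_less(__m128 A, __m128 B) {
  int M = _mm_movemask_ps(_mm_cmplt_ps(A, B)); // movmskps: 4 sign bits -> GPR
  return (M >> 2) & 1;                         // extractelement ..., i32 2
}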
if (Insert) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); - MVT MScalarTy = LT.second.getScalarType(); - if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || (MScalarTy.isInteger() && ST->hasSSE41()) || (MScalarTy == MVT::f32 && ST->hasSSE41())) { // For types we can insert directly, insertion into 128-bit sub vectors is // cheap, followed by a cheap chain of concatenations. - if (LT.second.getSizeInBits() <= 128) { + if (SizeInBits <= 128) { Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); } else { @@ -3704,9 +3815,9 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. const int CostValue = *LT.first.getValue(); assert(CostValue >= 0 && "Negative cost!"); - unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue; + unsigned Num128Lanes = SizeInBits / 128 * CostValue; unsigned NumElts = LT.second.getVectorNumElements() * CostValue; - APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); + APInt WidenedDemandedElts = DemandedElts.zext(NumElts); unsigned Scale = NumElts / Num128Lanes; // We iterate each 128-lane, and check if we need a // extracti128/inserti128 for this 128-lane. @@ -3747,10 +3858,59 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, } } - // TODO: Use default extraction for now, but we should investigate extending this - // to handle repeated subvector extraction. - if (Extract) + if (Extract) { + // vXi1 can be efficiently extracted with MOVMSK. + // TODO: AVX512 predicate mask handling. + // NOTE: This doesn't work well for roundtrip scalarization. + if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { + unsigned NumElts = cast(Ty)->getNumElements(); + unsigned MaxElts = ST->hasAVX2() ? 32 : 16; + unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; + return MOVMSKCost; + } + + if (LT.second.isVector()) { + int CostValue = *LT.first.getValue(); + assert(CostValue >= 0 && "Negative cost!"); + + unsigned NumElts = LT.second.getVectorNumElements() * CostValue; + assert(NumElts >= DemandedElts.getBitWidth() && + "Vector has been legalized to smaller element count"); + + // If we're extracting elements from a 128-bit subvector lane, we only need + // to extract each lane once, not for every element. + if (SizeInBits > 128) { + assert((SizeInBits % 128) == 0 && "Illegal vector"); + unsigned NumLegal128Lanes = SizeInBits / 128; + unsigned Num128Lanes = NumLegal128Lanes * CostValue; + APInt WidenedDemandedElts = DemandedElts.zext(NumElts); + unsigned Scale = NumElts / Num128Lanes; + + // Add cost for each demanded 128-bit subvector extraction. + // Luckily this is a lot easier than for insertion. + APInt DemandedUpper128Lanes = + APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes); + auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale); + for (unsigned I = 0; I != Num128Lanes; ++I) + if (DemandedUpper128Lanes[I]) + Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + I * Scale, Ty128); + + // Add all the demanded element extractions together, but adjust the + // index to use the equivalent of the bottom 128 bit lane. + for (unsigned I = 0; I != NumElts; ++I) + if (WidenedDemandedElts[I]) { + unsigned Idx = I % Scale; + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx); + } + + return Cost; + } + } + + // Fallback to default extraction. 
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); + } return Cost; } @@ -3855,8 +4015,7 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, // if all elements that will form a single Dst vector aren't demanded, // then we won't need to do that shuffle, so adjust the cost accordingly. APInt DemandedDstVectors = APIntOps::ScaleBitMask( - DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), - NumDstVectors); + DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors); unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); InstructionCost SingleShuffleCost = @@ -5029,8 +5188,8 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost( return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } -bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) { +bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) { // X86 specific here are "instruction number 1st priority". return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds, @@ -5110,6 +5269,14 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { return true; } +bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, + ElementCount NumElements) const { + // movddup + return ST->hasSSE3() && !NumElements.isScalable() && + NumElements.getFixedValue() == 2 && + ElementTy == Type::getDoubleTy(ElementTy->getContext()); +} + bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { if (!isa(DataTy)) return false; @@ -5174,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { return IntWidth == 32 || IntWidth == 64; } +bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask) const { + // ADDSUBPS 4xf32 SSE3 + // VADDSUBPS 4xf32 AVX + // VADDSUBPS 8xf32 AVX2 + // ADDSUBPD 2xf64 SSE3 + // VADDSUBPD 2xf64 AVX + // VADDSUBPD 4xf64 AVX2 + + unsigned NumElements = cast(VecTy)->getNumElements(); + assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible"); + if (!isPowerOf2_32(NumElements)) + return false; + // Check the opcode pattern. We apply the mask on the opcode arguments and + // then check if it is what we expect. + for (int Lane : seq(0, NumElements)) { + unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; + // We expect FSub for even lanes and FAdd for odd lanes. + if (Lane % 2 == 0 && Opc != Instruction::FSub) + return false; + if (Lane % 2 == 1 && Opc != Instruction::FAdd) + return false; + } + // Now check that the pattern is supported by the target ISA. + Type *ElemTy = cast(VecTy)->getElementType(); + if (ElemTy->isFloatTy()) + return ST->hasSSE3() && NumElements % 4 == 0; + if (ElemTy->isDoubleTy()) + return ST->hasSSE3() && NumElements % 2 == 0; + return false; +} + bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { // AVX2 doesn't support scatter if (!ST->hasAVX512()) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 69715072426f..bd3c3fb1bb2f 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -38,12 +38,12 @@ class X86TTIImpl : public BasicTTIImplBase { const FeatureBitset InlineFeatureIgnoreList = { // This indicates the CPU is 64 bit capable not that we are in 64-bit // mode. 
- X86::Feature64Bit, + X86::FeatureX86_64, // These features don't have any intrinsics or ABI effect. X86::FeatureNOPL, - X86::FeatureCMPXCHG16B, - X86::FeatureLAHFSAHF, + X86::FeatureCX16, + X86::FeatureLAHFSAHF64, // Some older targets can be setup to fold unaligned loads. X86::FeatureSSEUnalignedMem, @@ -68,6 +68,11 @@ class X86TTIImpl : public BasicTTIImplBase { X86::TuningMacroFusion, X86::TuningPadShortFunctions, X86::TuningPOPCNTFalseDeps, + X86::TuningMULCFalseDeps, + X86::TuningPERMFalseDeps, + X86::TuningRANGEFalseDeps, + X86::TuningGETMANTFalseDeps, + X86::TuningMULLQFalseDeps, X86::TuningSlow3OpsLEA, X86::TuningSlowDivide32, X86::TuningSlowDivide64, @@ -131,7 +136,8 @@ public: const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef Args = None); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -219,13 +225,14 @@ public: InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2); + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); bool isLegalMaskedLoad(Type *DataType, Align Alignment); bool isLegalMaskedStore(Type *DataType, Align Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); + bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const; bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment); bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { return forceScalarizeMaskedGather(VTy, Alignment); @@ -234,6 +241,8 @@ public: bool isLegalMaskedScatter(Type *DataType, Align Alignment); bool isLegalMaskedExpandLoad(Type *DataType); bool isLegalMaskedCompressStore(Type *DataType); + bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask) const; bool hasDivRemOp(Type *DataType, bool IsSigned); bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 8114a0b2d423..5cada924e006 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -36,7 +36,7 @@ using namespace llvm; -#define DEBUG_TYPE "tile-config" +#define DEBUG_TYPE "tileconfig" namespace { @@ -70,11 +70,11 @@ struct X86TileConfig : public MachineFunctionPass { char X86TileConfig::ID = 0; -INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", +INITIALIZE_PASS_BEGIN(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", - false, false) +INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, + false) bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget(); @@ -90,7 +90,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { int SS = INT_MAX; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == X86::LDTILECFG) { + if (MI.getOpcode() == X86::PLDTILECFGV) { SS = 
MI.getOperand(0).getIndex(); break; } @@ -98,6 +98,9 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { if (SS != INT_MAX) break; } + // Didn't find PLDTILECFGV, just return false; + if (SS == INT_MAX) + return false; // Try to find a point to insert MIs for constant shapes. // Here we are leveraging the palette id inserted in PreRA pass. @@ -120,6 +123,8 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { continue; if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID) continue; + if (VRM.getPhys(VirtReg) == VirtRegMap::NO_PHYS_REG) + continue; unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0; if (!Phys2Virt[Index]) Phys2Virt[Index] = VirtReg; diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index f6b97e9e84b3..57801752f170 100644 --- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -15,8 +15,8 @@ #include "XCore.h" #include "XCoreRegisterInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -66,140 +66,116 @@ static bool readInstruction32(ArrayRef Bytes, uint64_t Address, return true; } -static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { - const XCoreDisassembler *Dis = static_cast(D); - const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); +static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { + const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); return *(RegInfo->getRegClass(RC).begin() + RegNo); } -static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus Decode2RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode2RImmInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeR2RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeRUSInstruction(MCInst &Inst, - 
unsigned Insn, +static DecodeStatus DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus +DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); -static DecodeStatus DecodeL2RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode3RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode3RImmInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode2RUSInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL3RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL6RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL5RInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); -static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const 
MCDisassembler *Decoder); -static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +static DecodeStatus +DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const MCDisassembler *Decoder); #include "XCoreGenDisassemblerTables.inc" -static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) -{ + const MCDisassembler *Decoder) { if (RegNo > 11) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo); @@ -207,11 +183,9 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) -{ + const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo); @@ -220,7 +194,8 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, } static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { if (Val > 11) return MCDisassembler::Fail; static const unsigned Values[] = { @@ -231,7 +206,8 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, } static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(-(int64_t)Val)); return MCDisassembler::Success; } @@ -270,9 +246,9 @@ Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2, return MCDisassembler::Success; } -static DecodeStatus -Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { // Try and decode as a 3R instruction. 
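// Editor's sketch (hypothetical decoder, not part of the patch): what the
// file-wide 'const void *' -> 'const MCDisassembler *' migration buys. A
// decoder callback can now reach MCContext services type-safely, without the
// unchecked static_cast the old getReg() helper needed.
static DecodeStatus DecodeFooRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               uint64_t Address,
                                               const MCDisassembler *Decoder) {
  if (RegNo > 11) // bounds check, as in the real decoders above
    return MCDisassembler::Fail;
  const MCRegisterInfo *RI = Decoder->getContext().getRegisterInfo();
  Inst.addOperand(MCOperand::createReg( // class ID reused for illustration
      RI->getRegClass(XCore::GRRegsRegClassID).getRegister(RegNo)));
  return MCDisassembler::Success;
}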
unsigned Opcode = fieldFromInstruction(Insn, 11, 5); switch (Opcode) { @@ -340,9 +316,9 @@ Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; } -static DecodeStatus -Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -353,9 +329,9 @@ Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2RImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -366,9 +342,9 @@ Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1); if (S != MCDisassembler::Success) @@ -379,9 +355,9 @@ DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -393,9 +369,9 @@ Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeRUSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -406,9 +382,9 @@ DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -421,7 +397,7 @@ DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); if (S != MCDisassembler::Success) @@ -433,9 +409,9 @@ DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { // Try and decode as 
a L3R / L2RUS instruction. unsigned Opcode = fieldFromInstruction(Insn, 16, 4) | fieldFromInstruction(Insn, 27, 5) << 4; @@ -504,9 +480,9 @@ DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; } -static DecodeStatus -DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2); @@ -518,9 +494,9 @@ DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2); @@ -532,9 +508,9 @@ DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); if (S == MCDisassembler::Success) { @@ -545,9 +521,9 @@ Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); if (S == MCDisassembler::Success) { @@ -558,9 +534,9 @@ Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); if (S == MCDisassembler::Success) { @@ -571,9 +547,9 @@ Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); if (S == MCDisassembler::Success) { @@ -584,9 +560,9 @@ Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -598,9 +574,9 @@ DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - 
const void *Decoder) { +static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -613,9 +589,9 @@ DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -627,9 +603,9 @@ DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -641,9 +617,9 @@ DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3, Op4, Op5, Op6; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -661,9 +637,9 @@ DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { // Try and decode as a L6R instruction. 
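// Editor's note: fieldFromInstruction(Insn, Start, Size), used heavily above,
// is a plain bitfield extract. A minimal equivalent (assuming Size < 32):
static unsigned fieldFromInstructionSketch(unsigned Insn, unsigned Start,
                                           unsigned Size) {
  return (Insn >> Start) & ((1u << Size) - 1); // bits [Start, Start+Size)
}
// e.g. the L6R check reads fieldFromInstruction(Insn, 27, 5), the top five
// bits of the 32-bit instruction word.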
Inst.clear(); unsigned Opcode = fieldFromInstruction(Insn, 27, 5); @@ -675,9 +651,9 @@ DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; } -static DecodeStatus -DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL5RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3, Op4, Op5; DecodeStatus S = Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); @@ -695,9 +671,9 @@ DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus -DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; unsigned Op4 = fieldFromInstruction(Insn, 16, 4); DecodeStatus S = @@ -716,7 +692,7 @@ DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Op1, Op2, Op3; unsigned Op4 = fieldFromInstruction(Insn, 16, 4); DecodeStatus S = diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h index 0ea47106434c..a8801fc2c5bc 100644 --- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h +++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h @@ -15,10 +15,10 @@ #ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H #define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { +class StringRef; class XCoreInstPrinter : public MCInstPrinter { public: @@ -39,7 +39,6 @@ private: void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); }; } // end namespace llvm diff --git a/llvm/lib/Target/XCore/XCore.h b/llvm/lib/Target/XCore/XCore.h index d31c34910ef6..6118775d16fe 100644 --- a/llvm/lib/Target/XCore/XCore.h +++ b/llvm/lib/Target/XCore/XCore.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_XCORE_XCORE_H #include "MCTargetDesc/XCoreMCTargetDesc.h" +#include "llvm/PassRegistry.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 38b613700674..8fea61d125d2 100644 --- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -110,7 +110,7 @@ void XCoreAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { return; const DataLayout &DL = getDataLayout(); - OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(GV, TM)); + OutStreamer->switchSection(getObjFileLowering().SectionForGlobal(GV, TM)); MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index 7c86262269fc..70a1901bb04f 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -167,10 +167,8 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine 
&TM, = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2; // We have target-specific dag combine patterns for the following nodes: - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine( + {ISD::STORE, ISD::ADD, ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN}); setMinFunctionAlignment(Align(2)); setPrefFunctionAlignment(Align(4)); @@ -442,7 +440,7 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } } - if (LD->getAlignment() == 2) { + if (LD->getAlign() == Align(2)) { SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr, LD->getPointerInfo(), MVT::i16, Align(2), LD->getMemOperand()->getFlags()); @@ -497,7 +495,7 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Value = ST->getValue(); SDLoc dl(Op); - if (ST->getAlignment() == 2) { + if (ST->getAlign() == Align(2)) { SDValue Low = Value; SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value, DAG.getConstant(16, dl, MVT::i32)); @@ -941,25 +939,25 @@ LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { N->getSuccessOrdering() == AtomicOrdering::Monotonic) && "setInsertFencesForAtomic(true) expects unordered / monotonic"); if (N->getMemoryVT() == MVT::i32) { - if (N->getAlignment() < 4) + if (N->getAlign() < Align(4)) report_fatal_error("atomic load must be aligned"); return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op), N->getChain(), N->getBasePtr(), N->getPointerInfo(), - N->getAlignment(), N->getMemOperand()->getFlags(), + N->getAlign(), N->getMemOperand()->getFlags(), N->getAAInfo(), N->getRanges()); } if (N->getMemoryVT() == MVT::i16) { - if (N->getAlignment() < 2) + if (N->getAlign() < Align(2)) report_fatal_error("atomic load must be aligned"); return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(), N->getBasePtr(), N->getPointerInfo(), MVT::i16, - N->getAlignment(), N->getMemOperand()->getFlags(), + N->getAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); } if (N->getMemoryVT() == MVT::i8) return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(), N->getBasePtr(), N->getPointerInfo(), MVT::i8, - N->getAlignment(), N->getMemOperand()->getFlags(), + N->getAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); return SDValue(); } @@ -972,24 +970,24 @@ LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const { N->getSuccessOrdering() == AtomicOrdering::Monotonic) && "setInsertFencesForAtomic(true) expects unordered / monotonic"); if (N->getMemoryVT() == MVT::i32) { - if (N->getAlignment() < 4) + if (N->getAlign() < Align(4)) report_fatal_error("atomic store must be aligned"); return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(), N->getBasePtr(), - N->getPointerInfo(), N->getAlignment(), + N->getPointerInfo(), N->getAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); } if (N->getMemoryVT() == MVT::i16) { - if (N->getAlignment() < 2) + if (N->getAlign() < Align(2)) report_fatal_error("atomic store must be aligned"); return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(), N->getBasePtr(), N->getPointerInfo(), MVT::i16, - N->getAlignment(), N->getMemOperand()->getFlags(), + N->getAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); } if (N->getMemoryVT() == MVT::i8) return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(), N->getBasePtr(), N->getPointerInfo(), MVT::i8, - N->getAlignment(), N->getMemOperand()->getFlags(), + N->getAlign(), 
N->getMemOperand()->getFlags(), N->getAAInfo()); return SDValue(); } @@ -1791,17 +1789,17 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits(); assert((StoreBits % 8) == 0 && "Store size in bits must be a multiple of 8"); - unsigned Alignment = ST->getAlignment(); + Align Alignment = ST->getAlign(); if (LoadSDNode *LD = dyn_cast(ST->getValue())) { if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() && - LD->getAlignment() == Alignment && + LD->getAlign() == Alignment && !LD->isVolatile() && !LD->isIndexed() && Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) { bool isTail = isInTailCallPosition(DAG, ST, Chain); return DAG.getMemmove(Chain, dl, ST->getBasePtr(), LD->getBasePtr(), DAG.getConstant(StoreBits / 8, dl, MVT::i32), - Align(Alignment), false, isTail, + Alignment, false, isTail, ST->getPointerInfo(), LD->getPointerInfo()); } } diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.td b/llvm/lib/Target/XCore/XCoreInstrInfo.td index aa3739d0335e..23f80b126404 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.td +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.td @@ -363,7 +363,7 @@ let usesCustomInserter = 1 in { (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>; } -let hasSideEffects = 1 in +let hasSideEffects = 1, isMeta = 1 in def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER", [(XCoreMemBarrier)]>; diff --git a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index ec44d2899dd5..f039f4f67955 100644 --- a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -15,6 +15,13 @@ using namespace llvm; void XCoreFunctionInfo::anchor() { } +MachineFunctionInfo *XCoreFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} + bool XCoreFunctionInfo::isLargeFrame(const MachineFunction &MF) const { if (CachedEStackSize == -1) { CachedEStackSize = MF.getFrameInfo().estimateStackSize(MF); diff --git a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h index aebe11b15b54..6cdb1239750a 100644 --- a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -45,6 +45,11 @@ public: explicit XCoreFunctionInfo(MachineFunction &MF) {} + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; + ~XCoreFunctionInfo() override = default; void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; } diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index 2e49627a19bf..3c27fcd9ba53 100644 --- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -26,7 +26,7 @@ using namespace llvm; static Reloc::Model getEffectiveRelocModel(Optional RM) { - return RM.getValueOr(Reloc::Static); + return RM.value_or(Reloc::Static); } static CodeModel::Model @@ -108,6 +108,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() { } TargetTransformInfo -XCoreTargetMachine::getTargetTransformInfo(const Function &F) { +XCoreTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(XCoreTTIImpl(this, F)); } diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.h b/llvm/lib/Target/XCore/XCoreTargetMachine.h index 
9c3bdcf78f9c..a4754fd77e65 100644 --- a/llvm/lib/Target/XCore/XCoreTargetMachine.h +++ b/llvm/lib/Target/XCore/XCoreTargetMachine.h @@ -15,13 +15,13 @@ #include "XCoreSubtarget.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" #include namespace llvm { +class StringRef; class XCoreTargetMachine : public LLVMTargetMachine { std::unique_ptr TLOF; @@ -42,7 +42,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/llvm/lib/Testing/Support/Annotations.cpp b/llvm/lib/Testing/Support/Annotations.cpp index 44d3acccfdb2..557b6cdf98ce 100644 --- a/llvm/lib/Testing/Support/Annotations.cpp +++ b/llvm/lib/Testing/Support/Annotations.cpp @@ -33,12 +33,12 @@ Annotations::Annotations(llvm::StringRef Text) { Code.reserve(Text.size()); while (!Text.empty()) { if (Text.consume_front("^")) { - Points[Name.getValueOr("")].push_back(Code.size()); + Points[Name.value_or("")].push_back(Code.size()); Name = llvm::None; continue; } if (Text.consume_front("[[")) { - OpenRanges.emplace_back(Name.getValueOr(""), Code.size()); + OpenRanges.emplace_back(Name.value_or(""), Code.size()); Name = llvm::None; continue; } diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 8f69282d3443..5f4d0cdf2b57 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -77,7 +77,7 @@ static std::vector getSearchPaths(opt::InputArgList *Args, // Add $LIB. Optional EnvOpt = sys::Process::GetEnv("LIB"); - if (!EnvOpt.hasValue()) + if (!EnvOpt) return Ret; StringRef Env = Saver.save(*EnvOpt); while (!Env.empty()) { @@ -229,10 +229,11 @@ static void appendFile(std::vector &Members, (Magic == file_magic::coff_object) ? 
getCOFFFileMachine(MB) : getBitcodeFileMachine(MB); if (!MaybeFileMachine) { - handleAllErrors(MaybeFileMachine.takeError(), [&](const ErrorInfoBase &EIB) { - llvm::errs() << MB.getBufferIdentifier() << ": " << EIB.message() - << "\n"; - }); + handleAllErrors(MaybeFileMachine.takeError(), + [&](const ErrorInfoBase &EIB) { + llvm::errs() << MB.getBufferIdentifier() << ": " + << EIB.message() << "\n"; + }); exit(1); } COFF::MachineTypes FileMachine = *MaybeFileMachine; @@ -291,10 +292,25 @@ int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) { return 0; } + // Parse /ignore: + llvm::StringSet<> IgnoredWarnings; + for (auto *Arg : Args.filtered(OPT_ignore)) + IgnoredWarnings.insert(Arg->getValue()); + // If no input files and not told otherwise, silently do nothing to match // lib.exe - if (!Args.hasArgNoClaim(OPT_INPUT) && !Args.hasArg(OPT_llvmlibempty)) + if (!Args.hasArgNoClaim(OPT_INPUT) && !Args.hasArg(OPT_llvmlibempty)) { + if (!IgnoredWarnings.contains("emptyoutput")) { + llvm::errs() << "warning: no input files, not writing output file\n"; + llvm::errs() << " pass /llvmlibempty to write empty .lib file,\n"; + llvm::errs() << " pass /ignore:emptyoutput to suppress warning\n"; + if (Args.hasFlag(OPT_WX, OPT_WX_no, false)) { + llvm::errs() << "treating warning as error due to /WX\n"; + return 1; + } + } return 0; + } if (Args.hasArg(OPT_lst)) { doList(Args); diff --git a/llvm/lib/ToolDrivers/llvm-lib/Options.td b/llvm/lib/ToolDrivers/llvm-lib/Options.td index 5891e238a328..0d97f77e525f 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/Options.td +++ b/llvm/lib/ToolDrivers/llvm-lib/Options.td @@ -9,6 +9,14 @@ class F<string name> : Flag<["/", "-", "/?", "-?"], name>; class P<string name, string help> : Joined<["/", "-", "/?", "-?"], name#":">, HelpText<help>; +// Boolean flag which can be suffixed by ":no". Using it unsuffixed turns the +// flag on and using it suffixed by ":no" turns it off. +multiclass B<string name, string help_on, string help_off> { + def "" : F<name>, HelpText<help_on>; + def _no : F<name#":no">, HelpText<help_off>; +} + +def ignore : P<"ignore", "Specify warning codes to ignore">; def libpath: P<"libpath", "Object file search path">; // Can't be called "list" since that's a keyword. @@ -23,6 +31,9 @@ def llvmlibempty : F<"llvmlibempty">, def machine: P<"machine", "Specify target platform">; +defm WX : B<"WX", "Treat warnings as errors", + "Don't treat warnings as errors (default)">; + def help : F<"help">; // /?? and -?? must be before /? and -? to not confuse lib/Options. @@ -32,7 +43,6 @@ def help_q : Flag<["/??", "-??", "/?", "-?"], "">, Alias<help>; // The flags below do nothing. They are defined only for lib.exe compatibility.
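// Editor's usage note, inferred from the libDriverMain() change above
// (illustrative invocations, not from the patch):
//   llvm-lib /out:x.lib                     -> warns, writes nothing, exit 0
//   llvm-lib /WX /out:x.lib                 -> warning becomes error, exit 1
//   llvm-lib /ignore:emptyoutput /out:x.lib -> silent, writes nothing, exit 0
//   llvm-lib /llvmlibempty /out:x.lib       -> writes an empty archive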
//============================================================================== -class QF : Joined<["/", "-", "/?", "-?"], name#":">; - -def ignore : QF<"ignore">; +def ltcg : F<"ltcg">; def nologo : F<"nologo">; +def subsystem : P<"subsystem", "">; diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7243e39c9029..1fd8b88dd776 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -22,8 +22,8 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -36,6 +36,10 @@ using namespace llvm; using namespace PatternMatch; +namespace llvm { +class DataLayout; +} + #define DEBUG_TYPE "aggressive-instcombine" STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded"); @@ -200,14 +204,13 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { /// of 'and' ops, then we also need to capture the fact that we saw an /// "and X, 1", so that's an extra return value for that case. struct MaskOps { - Value *Root; + Value *Root = nullptr; APInt Mask; bool MatchAndChain; - bool FoundAnd1; + bool FoundAnd1 = false; MaskOps(unsigned BitWidth, bool MatchAnds) - : Root(nullptr), Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds), - FoundAnd1(false) {} + : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {} }; /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a @@ -363,10 +366,72 @@ static bool tryToRecognizePopCount(Instruction &I) { return false; } +/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), provided C1 and +/// C2 saturate the value of the fp conversion. The transform is not +/// reversible, as the fptosi.sat is more defined than the input: all inputs +/// produce a valid value for the fptosi.sat, whereas inputs that were out of +/// range of the integer conversion produce poison in the original. The +/// reversed pattern may use fmax and fmin instead. As we cannot directly +/// reverse the transform, and it is not always profitable, we make it +/// conditional on the cost being reported as lower by TTI. +static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { + // Look for smin(smax(fptosi(x), C1), C2), converting to fptosi_sat. + Value *In; + const APInt *MinC, *MaxC; + if (!match(&I, m_SMax(m_OneUse(m_SMin(m_OneUse(m_FPToSI(m_Value(In))), + m_APInt(MinC))), + m_APInt(MaxC))) && + !match(&I, m_SMin(m_OneUse(m_SMax(m_OneUse(m_FPToSI(m_Value(In))), + m_APInt(MaxC))), + m_APInt(MinC)))) + return false; + + // Check that the constants clamp to a saturating range.
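// [Editor's note, a worked example not present in the patch:] for the i8
// saturate smax(smin(fptosi(x), 127), -128), MinC is 127 and MaxC is -128.
// Then *MinC + 1 == 128 == 2^7 is a power of two and -*MaxC == 128 == *MinC + 1,
// so the checks below pass, and SatTy is built with bit width
// exactLogBase2(128) + 1 == 8; the fold then emits llvm.fptosi.sat.i8 and
// sign-extends the result back to the original integer type.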
+ if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1) + return false; + + Type *IntTy = I.getType(); + Type *FpTy = In->getType(); + Type *SatTy = + IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1); + if (auto *VecTy = dyn_cast(IntTy)) + SatTy = VectorType::get(SatTy, VecTy->getElementCount()); + + // Get the cost of the intrinsic, and check that against the cost of + // fptosi+smin+smax + InstructionCost SatCost = TTI.getIntrinsicInstrCost( + IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}), + TTI::TCK_RecipThroughput); + SatCost += TTI.getCastInstrCost(Instruction::SExt, SatTy, IntTy, + TTI::CastContextHint::None, + TTI::TCK_RecipThroughput); + + InstructionCost MinMaxCost = TTI.getCastInstrCost( + Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None, + TTI::TCK_RecipThroughput); + MinMaxCost += TTI.getIntrinsicInstrCost( + IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}), + TTI::TCK_RecipThroughput); + MinMaxCost += TTI.getIntrinsicInstrCost( + IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}), + TTI::TCK_RecipThroughput); + + if (SatCost >= MinMaxCost) + return false; + + IRBuilder<> Builder(&I); + Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat, + {SatTy, FpTy}); + Value *Sat = Builder.CreateCall(Fn, In); + I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy)); + return true; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. -static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { +static bool foldUnusualPatterns(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI) { bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. @@ -382,6 +447,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { MadeChange |= foldAnyOrAllBitsSet(I); MadeChange |= foldGuardedFunnelShift(I, DT); MadeChange |= tryToRecognizePopCount(I); + MadeChange |= tryToFPToSat(I, TTI); } } @@ -395,13 +461,13 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. 
-static bool runImpl(Function &F, AssumptionCache &AC, TargetLibraryInfo &TLI, - DominatorTree &DT) { +static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI, + TargetLibraryInfo &TLI, DominatorTree &DT) { bool MadeChange = false; const DataLayout &DL = F.getParent()->getDataLayout(); TruncInstCombine TIC(AC, TLI, DL, DT); MadeChange |= TIC.run(F); - MadeChange |= foldUnusualPatterns(F, DT); + MadeChange |= foldUnusualPatterns(F, DT, TTI); return MadeChange; } @@ -411,6 +477,7 @@ void AggressiveInstCombinerLegacyPass::getAnalysisUsage( AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -421,7 +488,8 @@ bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) { auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(F); auto &DT = getAnalysis().getDomTree(); - return runImpl(F, AC, TLI, DT); + auto &TTI = getAnalysis().getTTI(F); + return runImpl(F, AC, TTI, TLI, DT); } PreservedAnalyses AggressiveInstCombinePass::run(Function &F, @@ -429,7 +497,8 @@ PreservedAnalyses AggressiveInstCombinePass::run(Function &F, auto &AC = AM.getResult(F); auto &TLI = AM.getResult(F); auto &DT = AM.getResult(F); - if (!runImpl(F, AC, TLI, DT)) { + auto &TTI = AM.getResult(F); + if (!runImpl(F, AC, TTI, TLI, DT)) { // No changes, all analyses are preserved. return PreservedAnalyses::all(); } @@ -446,6 +515,7 @@ INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass, INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine", "Combine pattern based expressions", false, false) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h index 5d69e26d6ecc..9fc103d45d98 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h @@ -23,14 +23,14 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// TruncInstCombine - looks for expression dags dominated by trunc instructions -// and for each eligible dag, it will create a reduced bit-width expression and -// replace the old expression with this new one and remove the old one. -// Eligible expression dag is such that: +// TruncInstCombine - looks for expression graphs dominated by trunc +// instructions and for each eligible graph, it will create a reduced bit-width +// expression and replace the old expression with this new one and remove the +// old one. Eligible expression graph is such that: // 1. Contains only supported instructions. // 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value. // 3. Can be evaluated into type with reduced legal bit-width (or Trunc type). -// 4. All instructions in the dag must not have users outside the dag. +// 4. All instructions in the graph must not have users outside the graph. // Only exception is for {ZExt, SExt}Inst with operand type equal to the // new reduced type chosen in (3). // @@ -61,9 +61,9 @@ class TruncInstCombine { SmallVector Worklist; /// Current processed TruncInst instruction. 
- TruncInst *CurrentTruncInst; + TruncInst *CurrentTruncInst = nullptr; - /// Information per each instruction in the expression dag. + /// Information per each instruction in the expression graph. struct Info { /// Number of LSBs that are needed to generate a valid expression. unsigned ValidBitWidth = 0; @@ -72,26 +72,26 @@ class TruncInstCombine { /// The reduced value generated to replace the old instruction. Value *NewValue = nullptr; }; - /// An ordered map representing expression dag post-dominated by current - /// processed TruncInst. It maps each instruction in the dag to its Info + /// An ordered map representing expression graph post-dominated by current + /// processed TruncInst. It maps each instruction in the graph to its Info /// structure. The map is ordered such that each instruction appears before - /// all other instructions in the dag that uses it. + /// all other instructions in the graph that uses it. MapVector InstInfoMap; public: TruncInstCombine(AssumptionCache &AC, TargetLibraryInfo &TLI, const DataLayout &DL, const DominatorTree &DT) - : AC(AC), TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {} + : AC(AC), TLI(TLI), DL(DL), DT(DT) {} /// Perform TruncInst pattern optimization on given function. bool run(Function &F); private: - /// Build expression dag dominated by the /p CurrentTruncInst and append it to - /// the InstInfoMap container. + /// Build expression graph dominated by the /p CurrentTruncInst and append it + /// to the InstInfoMap container. /// - /// \return true only if succeed to generate an eligible sub expression dag. - bool buildTruncExpressionDag(); + /// \return true only if succeed to generate an eligible sub expression graph. + bool buildTruncExpressionGraph(); /// Calculate the minimal allowed bit-width of the chain ending with the /// currently visited truncate's operand. @@ -100,12 +100,12 @@ private: /// truncate's operand can be shrunk to. unsigned getMinBitWidth(); - /// Build an expression dag dominated by the current processed TruncInst and + /// Build an expression graph dominated by the current processed TruncInst and /// Check if it is eligible to be reduced to a smaller type. /// /// \return the scalar version of the new type to be used for the reduced - /// expression dag, or nullptr if the expression dag is not eligible - /// to be reduced. + /// expression graph, or nullptr if the expression graph is not + /// eligible to be reduced. Type *getBestTruncatedType(); KnownBits computeKnownBits(const Value *V) const { @@ -128,12 +128,12 @@ private: /// \return the new reduced value. Value *getReducedOperand(Value *V, Type *SclTy); - /// Create a new expression dag using the reduced /p SclTy type and replace - /// the old expression dag with it. Also erase all instructions in the old - /// dag, except those that are still needed outside the dag. + /// Create a new expression graph using the reduced /p SclTy type and replace + /// the old expression graph with it. Also erase all instructions in the old + /// graph, except those that are still needed outside the graph. /// - /// \param SclTy scalar version of new type to reduce expression dag into. - void ReduceExpressionDag(Type *SclTy); + /// \param SclTy scalar version of new type to reduce expression graph into. + void ReduceExpressionGraph(Type *SclTy); }; } // end namespace llvm. 
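The InstInfoMap declared above is an llvm::MapVector precisely because insertion order is preserved: each instruction is inserted before the instructions that use it, so the erase loop later in TruncInstCombine.cpp can walk the map in reverse and always delete users before their operands. A minimal standalone sketch of that ordering property, assuming only llvm/ADT and llvm/Support headers (the integer keys are arbitrary placeholders):

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::MapVector<int, const char *> InstInfo;
  InstInfo.insert({42, "operand"});        // inserted first
  InstInfo.insert({7, "user-of-operand"}); // inserted second
  // Reverse iteration visits the user before the operand it depends on,
  // regardless of key values -- only insertion order matters.
  for (auto &KV : llvm::reverse(InstInfo))
    llvm::errs() << KV.second << "\n"; // prints "user-of-operand", "operand"
  return 0;
}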
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index 4624b735bef8..70ea68587b8e 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// // -// TruncInstCombine - looks for expression dags post-dominated by TruncInst and -// for each eligible dag, it will create a reduced bit-width expression, replace -// the old expression with this new one and remove the old expression. -// Eligible expression dag is such that: +// TruncInstCombine - looks for expression graphs post-dominated by TruncInst +// and for each eligible graph, it will create a reduced bit-width expression, +// replace the old expression with this new one and remove the old expression. +// Eligible expression graph is such that: // 1. Contains only supported instructions. // 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value. // 3. Can be evaluated into type with reduced legal bit-width. -// 4. All instructions in the dag must not have users outside the dag. +// 4. All instructions in the graph must not have users outside the graph. // The only exception is for {ZExt, SExt}Inst with operand type equal to // the new reduced type evaluated in (3). // @@ -28,7 +28,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -39,14 +38,13 @@ using namespace llvm; #define DEBUG_TYPE "aggressive-instcombine" -STATISTIC( - NumDAGsReduced, - "Number of truncations eliminated by reducing bit width of expression DAG"); +STATISTIC(NumExprsReduced, "Number of truncations eliminated by reducing bit " + "width of expression graph"); STATISTIC(NumInstrsReduced, "Number of instructions whose bit width was reduced"); /// Given an instruction and a container, it fills all the relevant operands of -/// that instruction, with respect to the Trunc expression dag optimizaton. +/// that instruction, with respect to the Trunc expression graph optimizaton. static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) { unsigned Opc = I->getOpcode(); switch (Opc) { @@ -78,15 +76,19 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) { Ops.push_back(I->getOperand(1)); Ops.push_back(I->getOperand(2)); break; + case Instruction::PHI: + for (Value *V : cast(I)->incoming_values()) + Ops.push_back(V); + break; default: llvm_unreachable("Unreachable!"); } } -bool TruncInstCombine::buildTruncExpressionDag() { +bool TruncInstCombine::buildTruncExpressionGraph() { SmallVector Worklist; SmallVector Stack; - // Clear old expression dag. + // Clear old instructions info. InstInfoMap.clear(); Worklist.push_back(CurrentTruncInst->getOperand(0)); @@ -150,11 +152,19 @@ bool TruncInstCombine::buildTruncExpressionDag() { append_range(Worklist, Operands); break; } + case Instruction::PHI: { + SmallVector Operands; + getRelevantOperands(I, Operands); + // Add only operands not in Stack to prevent cycle + for (auto *Op : Operands) + if (all_of(Stack, [Op](Value *V) { return Op != V; })) + Worklist.push_back(Op); + break; + } default: // TODO: Can handle more cases here: // 1. shufflevector // 2. sdiv, srem - // 3. phi node(and loop handling) // ... 
return false; } @@ -254,7 +264,7 @@ unsigned TruncInstCombine::getMinBitWidth() { } Type *TruncInstCombine::getBestTruncatedType() { - if (!buildTruncExpressionDag()) + if (!buildTruncExpressionGraph()) return nullptr; // We don't want to duplicate instructions, which isn't profitable. Thus, we @@ -367,8 +377,10 @@ Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) { return Entry.NewValue; } -void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { +void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) { NumInstrsReduced += InstInfoMap.size(); + // Pairs of old and new phi-nodes + SmallVector, 2> OldNewPHINodes; for (auto &Itr : InstInfoMap) { // Forward Instruction *I = Itr.first; TruncInstCombine::Info &NodeInfo = Itr.second; @@ -451,6 +463,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { Res = Builder.CreateSelect(Op0, LHS, RHS); break; } + case Instruction::PHI: { + Res = Builder.CreatePHI(getReducedType(I, SclTy), I->getNumOperands()); + OldNewPHINodes.push_back( + std::make_pair(cast(I), cast(Res))); + break; + } default: llvm_unreachable("Unhandled instruction"); } @@ -460,6 +478,14 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { ResI->takeName(I); } + for (auto &Node : OldNewPHINodes) { + PHINode *OldPN = Node.first; + PHINode *NewPN = Node.second; + for (auto Incoming : zip(OldPN->incoming_values(), OldPN->blocks())) + NewPN->addIncoming(getReducedOperand(std::get<0>(Incoming), SclTy), + std::get<1>(Incoming)); + } + Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy); Type *DstTy = CurrentTruncInst->getType(); if (Res->getType() != DstTy) { @@ -470,17 +496,29 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { } CurrentTruncInst->replaceAllUsesWith(Res); - // Erase old expression dag, which was replaced by the reduced expression dag. - // We iterate backward, which means we visit the instruction before we visit - // any of its operands, this way, when we get to the operand, we already - // removed the instructions (from the expression dag) that uses it. + // Erase old expression graph, which was replaced by the reduced expression + // graph. CurrentTruncInst->eraseFromParent(); + // First, erase old phi-nodes and its uses + for (auto &Node : OldNewPHINodes) { + PHINode *OldPN = Node.first; + OldPN->replaceAllUsesWith(PoisonValue::get(OldPN->getType())); + InstInfoMap.erase(OldPN); + OldPN->eraseFromParent(); + } + // Now we have expression graph turned into dag. + // We iterate backward, which means we visit the instruction before we + // visit any of its operands, this way, when we get to the operand, we already + // removed the instructions (from the expression dag) that uses it. for (auto &I : llvm::reverse(InstInfoMap)) { // We still need to check that the instruction has no users before we erase // it, because {SExt, ZExt}Inst Instruction might have other users that was // not reduced, in such case, we need to keep that instruction. if (I.first->use_empty()) I.first->eraseFromParent(); + else + assert((isa(I.first) || isa(I.first)) && + "Only {SExt, ZExt}Inst might have unreduced users"); } } @@ -498,18 +536,18 @@ bool TruncInstCombine::run(Function &F) { } // Process all TruncInst in the Worklist, for each instruction: - // 1. Check if it dominates an eligible expression dag to be reduced. - // 2. Create a reduced expression dag and replace the old one with it. + // 1. Check if it dominates an eligible expression graph to be reduced. + // 2. 
Create a reduced expression graph and replace the old one with it. while (!Worklist.empty()) { CurrentTruncInst = Worklist.pop_back_val(); if (Type *NewDstSclTy = getBestTruncatedType()) { LLVM_DEBUG( - dbgs() << "ICE: TruncInstCombine reducing type of expression dag " + dbgs() << "ICE: TruncInstCombine reducing type of expression graph " "dominated by: " << CurrentTruncInst << '\n'); - ReduceExpressionDag(NewDstSclTy); - ++NumDAGsReduced; + ReduceExpressionGraph(NewDstSclTy); + ++NumExprsReduced; MadeIRChange = true; } } diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index 67f8828e4c75..f7bbdcffd2ec 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -10,9 +10,9 @@ #include "CoroInternal.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" using namespace llvm; @@ -23,19 +23,10 @@ namespace { struct Lowerer : coro::LowererBase { IRBuilder<> Builder; Lowerer(Module &M) : LowererBase(M), Builder(Context) {} - bool lowerRemainingCoroIntrinsics(Function &F); + bool lower(Function &F); }; } -static void simplifyCFG(Function &F) { - llvm::legacy::FunctionPassManager FPM(F.getParent()); - FPM.add(createCFGSimplificationPass()); - - FPM.doInitialization(); - FPM.run(F); - FPM.doFinalization(); -} - static void lowerSubFn(IRBuilder<> &Builder, CoroSubFnInst *SubFn) { Builder.SetInsertPoint(SubFn); Value *FrameRaw = SubFn->getFrame(); @@ -53,12 +44,10 @@ static void lowerSubFn(IRBuilder<> &Builder, CoroSubFnInst *SubFn) { SubFn->replaceAllUsesWith(Load); } -bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { +bool Lowerer::lower(Function &F) { + bool IsPrivateAndUnprocessed = F.isPresplitCoroutine() && F.hasLocalLinkage(); bool Changed = false; - bool IsPrivateAndUnprocessed = - F.hasFnAttribute(CORO_PRESPLIT_ATTR) && F.hasLocalLinkage(); - for (Instruction &I : llvm::make_early_inc_range(instructions(F))) { if (auto *II = dyn_cast(&I)) { switch (II->getIntrinsicID()) { @@ -116,11 +105,6 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { } } - if (Changed) { - // After replacement were made we can cleanup the function body a little. - simplifyCFG(F); - } - return Changed; } @@ -132,50 +116,21 @@ static bool declaresCoroCleanupIntrinsics(const Module &M) { "llvm.coro.async.resume"}); } -PreservedAnalyses CoroCleanupPass::run(Function &F, - FunctionAnalysisManager &AM) { - auto &M = *F.getParent(); - if (!declaresCoroCleanupIntrinsics(M) || - !Lowerer(M).lowerRemainingCoroIntrinsics(F)) +PreservedAnalyses CoroCleanupPass::run(Module &M, + ModuleAnalysisManager &MAM) { + if (!declaresCoroCleanupIntrinsics(M)) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); -} - -namespace { - -struct CoroCleanupLegacy : FunctionPass { - static char ID; // Pass identification, replacement for typeid + FunctionAnalysisManager &FAM = + MAM.getResult(M).getManager(); - CoroCleanupLegacy() : FunctionPass(ID) { - initializeCoroCleanupLegacyPass(*PassRegistry::getPassRegistry()); - } + FunctionPassManager FPM; + FPM.addPass(SimplifyCFGPass()); - std::unique_ptr L; + Lowerer L(M); + for (auto &F : M) + if (L.lower(F)) + FPM.run(F, FAM); - // This pass has work to do only if we find intrinsics we are going to lower - // in the module. 
- bool doInitialization(Module &M) override { - if (declaresCoroCleanupIntrinsics(M)) - L = std::make_unique(M); - return false; - } - - bool runOnFunction(Function &F) override { - if (L) - return L->lowerRemainingCoroIntrinsics(F); - return false; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - if (!L) - AU.setPreservesAll(); - } - StringRef getPassName() const override { return "Coroutine Cleanup"; } -}; + return PreservedAnalyses::none(); } - -char CoroCleanupLegacy::ID = 0; -INITIALIZE_PASS(CoroCleanupLegacy, "coro-cleanup", - "Lower all coroutine related intrinsics", false, false) - -Pass *llvm::createCoroCleanupLegacyPass() { return new CoroCleanupLegacy(); } diff --git a/llvm/lib/Transforms/Coroutines/CoroConditionalWrapper.cpp b/llvm/lib/Transforms/Coroutines/CoroConditionalWrapper.cpp new file mode 100644 index 000000000000..3d26a43ceba7 --- /dev/null +++ b/llvm/lib/Transforms/Coroutines/CoroConditionalWrapper.cpp @@ -0,0 +1,24 @@ +//===- CoroConditionalWrapper.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" +#include "CoroInternal.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +CoroConditionalWrapper::CoroConditionalWrapper(ModulePassManager &&PM) + : PM(std::move(PM)) {} + +PreservedAnalyses CoroConditionalWrapper::run(Module &M, + ModuleAnalysisManager &AM) { + if (!coro::declaresAnyIntrinsic(M)) + return PreservedAnalyses::all(); + + return PM.run(M, AM); +} diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 1533e1805f17..dd7cb23f3f3d 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -8,10 +8,10 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "CoroInternal.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" -#include "llvm/Pass.h" using namespace llvm; @@ -35,7 +35,7 @@ public: AnyResumeFnPtrTy(FunctionType::get(Type::getVoidTy(Context), Int8Ptr, /*isVarArg=*/false) ->getPointerTo()) {} - bool lowerEarlyIntrinsics(Function &F); + void lowerEarlyIntrinsics(Function &F); }; } @@ -145,14 +145,16 @@ static void setCannotDuplicate(CoroIdInst *CoroId) { CB->setCannotDuplicate(); } -bool Lowerer::lowerEarlyIntrinsics(Function &F) { - bool Changed = false; +void Lowerer::lowerEarlyIntrinsics(Function &F) { CoroIdInst *CoroId = nullptr; SmallVector CoroFrees; bool HasCoroSuspend = false; for (Instruction &I : llvm::make_early_inc_range(instructions(F))) { - if (auto *CB = dyn_cast(&I)) { - switch (CB->getIntrinsicID()) { + auto *CB = dyn_cast(&I); + if (!CB) + continue; + + switch (CB->getIntrinsicID()) { default: continue; case Intrinsic::coro_free: @@ -178,12 +180,9 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { case Intrinsic::coro_id: if (auto *CII = cast(&I)) { if (CII->getInfo().isPreSplit()) { - assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) && - F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() == - UNPREPARED_FOR_SPLIT && + assert(F.isPresplitCoroutine() && "The frontend uses Swtich-Resumed ABI should emit " - "\"coroutine.presplit\" attribute with value \"0\" for the " - 
"coroutine."); + "\"coroutine.presplit\" attribute for the coroutine."); setCannotDuplicate(CII); CII->setCoroutineSelf(); CoroId = cast(&I); @@ -193,9 +192,7 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { case Intrinsic::coro_id_retcon: case Intrinsic::coro_id_retcon_once: case Intrinsic::coro_id_async: - // TODO: Remove the line once we support it in the corresponding - // frontend. - F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); + F.setPresplitCoroutine(); break; case Intrinsic::coro_resume: lowerResumeOrDestroy(*CB, CoroSubFnInst::ResumeIndex); @@ -209,16 +206,16 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { case Intrinsic::coro_done: lowerCoroDone(cast(&I)); break; - } - Changed = true; } } + // Make sure that all CoroFree reference the coro.id intrinsic. // Token type is not exposed through coroutine C/C++ builtins to plain C, so // we allow specifying none and fixing it up here. if (CoroId) for (CoroFreeInst *CF : CoroFrees) CF->setArgOperand(0, CoroId); + // Coroutine suspention could potentially lead to any argument modified // outside of the function, hence arguments should not have noalias // attributes. @@ -226,7 +223,6 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { for (Argument &A : F.args()) if (A.hasNoAliasAttr()) A.removeAttr(Attribute::NoAlias); - return Changed; } static bool declaresCoroEarlyIntrinsics(const Module &M) { @@ -238,52 +234,15 @@ static bool declaresCoroEarlyIntrinsics(const Module &M) { "llvm.coro.suspend"}); } -PreservedAnalyses CoroEarlyPass::run(Function &F, FunctionAnalysisManager &) { - Module &M = *F.getParent(); - if (!declaresCoroEarlyIntrinsics(M) || !Lowerer(M).lowerEarlyIntrinsics(F)) +PreservedAnalyses CoroEarlyPass::run(Module &M, ModuleAnalysisManager &) { + if (!declaresCoroEarlyIntrinsics(M)) return PreservedAnalyses::all(); + Lowerer L(M); + for (auto &F : M) + L.lowerEarlyIntrinsics(F); + PreservedAnalyses PA; PA.preserveSet(); return PA; } - -namespace { - -struct CoroEarlyLegacy : public FunctionPass { - static char ID; // Pass identification, replacement for typeid. - CoroEarlyLegacy() : FunctionPass(ID) { - initializeCoroEarlyLegacyPass(*PassRegistry::getPassRegistry()); - } - - std::unique_ptr L; - - // This pass has work to do only if we find intrinsics we are going to lower - // in the module. 
- bool doInitialization(Module &M) override { - if (declaresCoroEarlyIntrinsics(M)) - L = std::make_unique(M); - return false; - } - - bool runOnFunction(Function &F) override { - if (!L) - return false; - - return L->lowerEarlyIntrinsics(F); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - } - StringRef getPassName() const override { - return "Lower early coroutine intrinsics"; - } -}; -} - -char CoroEarlyLegacy::ID = 0; -INITIALIZE_PASS(CoroEarlyLegacy, "coro-early", - "Lower early coroutine intrinsics", false, false) - -Pass *llvm::createCoroEarlyLegacyPass() { return new CoroEarlyLegacy(); } diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp index 84bebb7bf42d..6f78fc8db311 100644 --- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -14,8 +14,6 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -103,21 +101,12 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) { // Given a resume function @f.resume(%f.frame* %frame), returns the size // and expected alignment of %f.frame type. -static std::pair getFrameLayout(Function *Resume) { - // Prefer to pull information from the function attributes. +static Optional> getFrameLayout(Function *Resume) { + // Pull information from the function attributes. auto Size = Resume->getParamDereferenceableBytes(0); - auto Align = Resume->getParamAlign(0); - - // If those aren't given, extract them from the type. - if (Size == 0 || !Align) { - auto *FrameTy = Resume->arg_begin()->getType()->getPointerElementType(); - - const DataLayout &DL = Resume->getParent()->getDataLayout(); - if (!Size) Size = DL.getTypeAllocSize(FrameTy); - if (!Align) Align = DL.getABITypeAlign(FrameTy); - } - - return std::make_pair(Size, *Align); + if (!Size) + return None; + return std::make_pair(Size, Resume->getParamAlign(0).valueOrOne()); } // Finds first non alloca instruction in the entry block of a function. @@ -347,56 +336,37 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA, assert(Resumers && "PostSplit coro.id Info argument must refer to an array" "of coroutine subfunctions"); auto *ResumeAddrConstant = - ConstantExpr::getExtractValue(Resumers, CoroSubFnInst::ResumeIndex); + Resumers->getAggregateElement(CoroSubFnInst::ResumeIndex); replaceWithConstant(ResumeAddrConstant, ResumeAddr); bool ShouldElide = shouldElide(CoroId->getFunction(), DT); - auto *DestroyAddrConstant = ConstantExpr::getExtractValue( - Resumers, + auto *DestroyAddrConstant = Resumers->getAggregateElement( ShouldElide ? 
CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex); for (auto &It : DestroyAddr) replaceWithConstant(DestroyAddrConstant, It.second); if (ShouldElide) { - auto FrameSizeAndAlign = getFrameLayout(cast(ResumeAddrConstant)); - elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign.first, - FrameSizeAndAlign.second, AA); - coro::replaceCoroFree(CoroId, /*Elide=*/true); - NumOfCoroElided++; + if (auto FrameSizeAndAlign = + getFrameLayout(cast(ResumeAddrConstant))) { + elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign->first, + FrameSizeAndAlign->second, AA); + coro::replaceCoroFree(CoroId, /*Elide=*/true); + NumOfCoroElided++; #ifndef NDEBUG - if (!CoroElideInfoOutputFilename.empty()) - *getOrCreateLogFile() - << "Elide " << CoroId->getCoroutine()->getName() << " in " - << CoroId->getFunction()->getName() << "\n"; + if (!CoroElideInfoOutputFilename.empty()) + *getOrCreateLogFile() + << "Elide " << CoroId->getCoroutine()->getName() << " in " + << CoroId->getFunction()->getName() << "\n"; #endif + } } return true; } -// See if there are any coro.subfn.addr instructions referring to coro.devirt -// trigger, if so, replace them with a direct call to devirt trigger function. -static bool replaceDevirtTrigger(Function &F) { - SmallVector DevirtAddr; - for (auto &I : instructions(F)) - if (auto *SubFn = dyn_cast(&I)) - if (SubFn->getIndex() == CoroSubFnInst::RestartTrigger) - DevirtAddr.push_back(SubFn); - - if (DevirtAddr.empty()) - return false; - - Module &M = *F.getParent(); - Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN); - assert(DevirtFn && "coro.devirt.fn not found"); - replaceWithConstant(DevirtFn, DevirtAddr); - - return true; -} - static bool declaresCoroElideIntrinsics(Module &M) { return coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.id.async"}); } @@ -422,62 +392,3 @@ PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) { return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } - -namespace { -struct CoroElideLegacy : FunctionPass { - static char ID; - CoroElideLegacy() : FunctionPass(ID) { - initializeCoroElideLegacyPass(*PassRegistry::getPassRegistry()); - } - - std::unique_ptr L; - - bool doInitialization(Module &M) override { - if (declaresCoroElideIntrinsics(M)) - L = std::make_unique(M); - return false; - } - - bool runOnFunction(Function &F) override { - if (!L) - return false; - - bool Changed = false; - - if (F.hasFnAttribute(CORO_PRESPLIT_ATTR)) - Changed = replaceDevirtTrigger(F); - - L->CoroIds.clear(); - L->collectPostSplitCoroIds(&F); - // If we did not find any coro.id, there is nothing to do. 
- if (L->CoroIds.empty()) - return Changed; - - AAResults &AA = getAnalysis().getAAResults(); - DominatorTree &DT = getAnalysis().getDomTree(); - - for (auto *CII : L->CoroIds) - Changed |= L->processCoroId(CII, AA, DT); - - return Changed; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - } - StringRef getPassName() const override { return "Coroutine Elision"; } -}; -} - -char CoroElideLegacy::ID = 0; -INITIALIZE_PASS_BEGIN( - CoroElideLegacy, "coro-elide", - "Coroutine frame allocation elision and indirect calls replacement", false, - false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END( - CoroElideLegacy, "coro-elide", - "Coroutine frame allocation elision and indirect calls replacement", false, - false) - -Pass *llvm::createCoroElideLegacyPass() { return new CoroElideLegacy(); } diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 9c16d3750998..d09607bb1c4c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -27,7 +27,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/OptimizedStructLayout.h" @@ -44,13 +44,6 @@ using namespace llvm; // "coro-frame", which results in leaner debug spew. #define DEBUG_TYPE "coro-suspend-crossing" -static cl::opt EnableReuseStorageInFrame( - "reuse-storage-in-coroutine-frame", cl::Hidden, - cl::desc( - "Enable the optimization which would reuse the storage in the coroutine \ - frame for allocas whose liferanges are not overlapped, for testing purposes"), - llvm::cl::init(false)); - enum { SmallVectorThreshold = 32 }; // Provides two way mapping between the blocks and numbers. @@ -347,15 +340,26 @@ struct FrameDataInfo { FieldIndexMap[V] = Index; } - uint64_t getAlign(Value *V) const { + Align getAlign(Value *V) const { auto Iter = FieldAlignMap.find(V); assert(Iter != FieldAlignMap.end()); return Iter->second; } - void setAlign(Value *V, uint64_t Align) { + void setAlign(Value *V, Align AL) { assert(FieldAlignMap.count(V) == 0); - FieldAlignMap.insert({V, Align}); + FieldAlignMap.insert({V, AL}); + } + + uint64_t getDynamicAlign(Value *V) const { + auto Iter = FieldDynamicAlignMap.find(V); + assert(Iter != FieldDynamicAlignMap.end()); + return Iter->second; + } + + void setDynamicAlign(Value *V, uint64_t Align) { + assert(FieldDynamicAlignMap.count(V) == 0); + FieldDynamicAlignMap.insert({V, Align}); } uint64_t getOffset(Value *V) const { @@ -382,7 +386,8 @@ private: DenseMap FieldIndexMap; // Map from values to their alignment on the frame. They would be set after // the frame is built. - DenseMap FieldAlignMap; + DenseMap FieldAlignMap; + DenseMap FieldDynamicAlignMap; // Map from values to their offset on the frame. They would be set after // the frame is built. DenseMap FieldOffsetMap; @@ -423,6 +428,7 @@ private: FieldIDType LayoutFieldIndex; Align Alignment; Align TyAlignment; + uint64_t DynamicAlignBuffer; }; const DataLayout &DL; @@ -489,7 +495,7 @@ public: coro::Shape &Shape); /// Add a field to this structure. 
- LLVM_NODISCARD FieldIDType addField(Type *Ty, MaybeAlign FieldAlignment, + LLVM_NODISCARD FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment, bool IsHeader = false, bool IsSpillOfValue = false) { assert(!IsFinished && "adding fields to a finished builder"); @@ -508,13 +514,21 @@ public: // to remember the type alignment anyway to build the type. // If we are spilling values we don't need to worry about ABI alignment // concerns. - auto ABIAlign = DL.getABITypeAlign(Ty); - Align TyAlignment = - (IsSpillOfValue && MaxFrameAlignment) - ? (*MaxFrameAlignment < ABIAlign ? *MaxFrameAlignment : ABIAlign) - : ABIAlign; - if (!FieldAlignment) { - FieldAlignment = TyAlignment; + Align ABIAlign = DL.getABITypeAlign(Ty); + Align TyAlignment = ABIAlign; + if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign) + TyAlignment = *MaxFrameAlignment; + Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment); + + // The field alignment could be bigger than the max frame case, in that case + // we request additional storage to be able to dynamically align the + // pointer. + uint64_t DynamicAlignBuffer = 0; + if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) { + DynamicAlignBuffer = + offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment); + FieldAlignment = *MaxFrameAlignment; + FieldSize = FieldSize + DynamicAlignBuffer; } // Lay out header fields immediately. @@ -523,12 +537,13 @@ public: Offset = alignTo(StructSize, FieldAlignment); StructSize = Offset + FieldSize; - // Everything else has a flexible offset. + // Everything else has a flexible offset. } else { Offset = OptimizedStructLayoutField::FlexibleOffset; } - Fields.push_back({FieldSize, Offset, Ty, 0, *FieldAlignment, TyAlignment}); + Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment, + DynamicAlignBuffer}); return Fields.size() - 1; } @@ -561,7 +576,12 @@ void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) { auto Updater = [&](Value *I) { auto Field = B.getLayoutField(getFieldIndex(I)); setFieldIndex(I, Field.LayoutFieldIndex); - setAlign(I, Field.Alignment.value()); + setAlign(I, Field.Alignment); + uint64_t dynamicAlign = + Field.DynamicAlignBuffer + ? 
Field.DynamicAlignBuffer + Field.Alignment.value() + : 0; + setDynamicAlign(I, dynamicAlign); setOffset(I, Field.Offset); }; LayoutIndexUpdateStarted = true; @@ -588,7 +608,7 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F, } }); - if (!Shape.OptimizeFrame && !EnableReuseStorageInFrame) { + if (!Shape.OptimizeFrame) { for (const auto &A : FrameData.Allocas) { AllocaInst *Alloca = A.Alloca; NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); @@ -755,6 +775,10 @@ void FrameTypeBuilder::finish(StructType *Ty) { F.LayoutFieldIndex = FieldTypes.size(); FieldTypes.push_back(F.Ty); + if (F.DynamicAlignBuffer) { + FieldTypes.push_back( + ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer)); + } LastOffset = Offset + F.Size; } @@ -807,9 +831,10 @@ static StringRef solveTypeName(Type *Ty) { return "__floating_type_"; } - if (Ty->isPointerTy()) { - auto *PtrTy = cast(Ty); - Type *PointeeTy = PtrTy->getPointerElementType(); + if (auto *PtrTy = dyn_cast(Ty)) { + if (PtrTy->isOpaque()) + return "PointerType"; + Type *PointeeTy = PtrTy->getNonOpaquePointerElementType(); auto Name = solveTypeName(PointeeTy); if (Name == "UnknownType") return "PointerType"; @@ -826,10 +851,9 @@ static StringRef solveTypeName(Type *Ty) { auto Name = Ty->getStructName(); SmallString<16> Buffer(Name); - for_each(Buffer, [](auto &Iter) { + for (auto &Iter : Buffer) if (Iter == '.' || Iter == ':') Iter = '_'; - }); auto *MDName = MDString::get(Ty->getContext(), Buffer.str()); return MDName->getString(); } @@ -1012,7 +1036,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, auto Index = FrameData.getFieldIndex(V); OffsetCache.insert( - {Index, {FrameData.getAlign(V), FrameData.getOffset(V)}}); + {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}}); } DenseMap DITypeCache; @@ -1078,7 +1102,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar, DBuilder.createExpression(), DILoc, - Shape.FramePtr->getNextNode()); + Shape.getInsertPtAfterFramePtr()); } // Build a struct that will keep state for an active coroutine. @@ -1367,7 +1391,7 @@ struct AllocaUseVisitor : PtrUseVisitor { bool getShouldLiveOnFrame() const { if (!ShouldLiveOnFrame) ShouldLiveOnFrame = computeShouldLiveOnFrame(); - return ShouldLiveOnFrame.getValue(); + return *ShouldLiveOnFrame; } bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; } @@ -1455,7 +1479,7 @@ private: auto Itr = AliasOffetMap.find(&I); if (Itr == AliasOffetMap.end()) { AliasOffetMap[&I] = Offset; - } else if (Itr->second.hasValue() && Itr->second.getValue() != Offset) { + } else if (Itr->second && *Itr->second != Offset) { // If we have seen two different possible values for this alias, we set // it to empty. 
AliasOffetMap[&I].reset(); @@ -1517,13 +1541,12 @@ static void createFramePtr(coro::Shape &Shape) { // whatever // // -static Instruction *insertSpills(const FrameDataInfo &FrameData, - coro::Shape &Shape) { +static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { auto *CB = Shape.CoroBegin; LLVMContext &C = CB->getContext(); IRBuilder<> Builder(C); StructType *FrameTy = Shape.FrameTy; - Instruction *FramePtr = Shape.FramePtr; + Value *FramePtr = Shape.FramePtr; DominatorTree DT(*CB->getFunction()); SmallDenseMap DbgPtrAllocaCache; @@ -1550,7 +1573,18 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, auto GEP = cast( Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); - if (isa(Orig)) { + if (auto *AI = dyn_cast(Orig)) { + if (FrameData.getDynamicAlign(Orig) != 0) { + assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value()); + auto *M = AI->getModule(); + auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType()); + auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy); + auto *AlignMask = + ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1); + PtrValue = Builder.CreateAdd(PtrValue, AlignMask); + PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask)); + return Builder.CreateIntToPtr(PtrValue, AI->getType()); + } // If the type of GEP is not equal to the type of AllocaInst, it implies // that the AllocaInst may be reused in the Frame slot of other // AllocaInst. So We cast GEP to the AllocaInst here to re-use @@ -1571,20 +1605,19 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, // Create a store instruction storing the value into the // coroutine frame. Instruction *InsertPt = nullptr; - bool NeedToCopyArgPtrValue = false; + Type *ByValTy = nullptr; if (auto *Arg = dyn_cast(Def)) { // For arguments, we will place the store instruction right after // the coroutine frame pointer instruction, i.e. bitcast of // coro.begin from i8* to %f.frame*. - InsertPt = FramePtr->getNextNode(); + InsertPt = Shape.getInsertPtAfterFramePtr(); // If we're spilling an Argument, make sure we clear 'nocapture' // from the coroutine function. Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); if (Arg->hasByValAttr()) - NeedToCopyArgPtrValue = true; - + ByValTy = Arg->getParamByValType(); } else if (auto *CSI = dyn_cast(Def)) { // Don't spill immediately after a suspend; splitting assumes // that the suspend will be followed by a branch. @@ -1594,7 +1627,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, if (!DT.dominates(CB, I)) { // If it is not dominated by CoroBegin, then spill should be // inserted immediately after CoroFrame is computed. - InsertPt = FramePtr->getNextNode(); + InsertPt = Shape.getInsertPtAfterFramePtr(); } else if (auto *II = dyn_cast(I)) { // If we are spilling the result of the invoke instruction, split // the normal edge and insert the spill in the new block. @@ -1619,11 +1652,10 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, Builder.SetInsertPoint(InsertPt); auto *G = Builder.CreateConstInBoundsGEP2_32( FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr")); - if (NeedToCopyArgPtrValue) { + if (ByValTy) { // For byval arguments, we need to store the pointed value in the frame, // instead of the pointer itself. 
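// [Editor's sketch, not part of the patch:] the ptrtoint/add/and/inttoptr
// sequence emitted by GetFramePointer above is the standard
// round-up-to-alignment trick. In plain C++ terms, for a power-of-two Align:

#include <cassert>
#include <cstdint>

inline std::uintptr_t alignUp(std::uintptr_t Ptr, std::uintptr_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "power of two required");
  std::uintptr_t Mask = Align - 1; // AlignMask in the IR above
  return (Ptr + Mask) & ~Mask;     // add, then clear the low bits
}
// e.g. alignUp(0x1003, 16) == 0x1010 and alignUp(0x1010, 16) == 0x1010; the
// extra DynamicAlignBuffer bytes requested at field-layout time are what
// guarantee the rounded-up pointer still lands inside the frame.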
- auto *Value = - Builder.CreateLoad(Def->getType()->getPointerElementType(), Def); + auto *Value = Builder.CreateLoad(ByValTy, Def); Builder.CreateAlignedStore(Value, G, SpillAlignment); } else { Builder.CreateAlignedStore(Def, G, SpillAlignment); @@ -1641,7 +1673,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, auto *GEP = GetFramePointer(E.first); GEP->setName(E.first->getName() + Twine(".reload.addr")); - if (NeedToCopyArgPtrValue) + if (ByValTy) CurrentReload = GEP; else CurrentReload = Builder.CreateAlignedLoad( @@ -1664,6 +1696,12 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, } } + // Salvage debug info on any dbg.addr that we see. We do not insert them + // into each block where we have a use though. + if (auto *DI = dyn_cast(U)) { + coro::salvageDebugInfo(DbgPtrAllocaCache, DI, Shape.OptimizeFrame); + } + // If we have a single edge PHINode, remove it and replace it with a // reload from the coroutine frame. (We already took care of multi edge // PHINodes by rewriting them in the rewritePHIs function). @@ -1682,10 +1720,10 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, } } - BasicBlock *FramePtrBB = FramePtr->getParent(); + BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent(); - auto SpillBlock = - FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); + auto SpillBlock = FramePtrBB->splitBasicBlock( + Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB"); SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); Shape.AllocaSpillBlock = SpillBlock; @@ -1704,7 +1742,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, Alloca->replaceAllUsesWith(G); Alloca->eraseFromParent(); } - return FramePtr; + return; } // If we found any alloca, replace all of their remaining uses with GEP @@ -1735,7 +1773,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, for (Instruction *I : UsersToUpdate) I->replaceUsesOfWith(Alloca, G); } - Builder.SetInsertPoint(FramePtr->getNextNode()); + Builder.SetInsertPoint(Shape.getInsertPtAfterFramePtr()); for (const auto &A : FrameData.Allocas) { AllocaInst *Alloca = A.Alloca; if (A.MayWriteBeforeCoroBegin) { @@ -1755,16 +1793,16 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, auto *FramePtr = GetFramePointer(Alloca); auto *FramePtrRaw = Builder.CreateBitCast(FramePtr, Type::getInt8PtrTy(C)); - auto *AliasPtr = Builder.CreateGEP( - Type::getInt8Ty(C), FramePtrRaw, - ConstantInt::get(Type::getInt64Ty(C), Alias.second.getValue())); + auto &Value = *Alias.second; + auto ITy = IntegerType::get(C, Value.getBitWidth()); + auto *AliasPtr = Builder.CreateGEP(Type::getInt8Ty(C), FramePtrRaw, + ConstantInt::get(ITy, Value)); auto *AliasPtrTyped = Builder.CreateBitCast(AliasPtr, Alias.first->getType()); Alias.first->replaceUsesWithIf( AliasPtrTyped, [&](Use &U) { return DT.dominates(CB, U); }); } } - return FramePtr; } // Moves the values in the PHIs in SuccBB that correspong to PredBB into a new @@ -2130,7 +2168,7 @@ static void lowerLocalAllocas(ArrayRef LocalAllocas, // Allocate memory. auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); - Alloca->setAlignment(Align(AI->getAlignment())); + Alloca->setAlignment(AI->getAlignment()); for (auto U : AI->users()) { // Replace gets with the allocation. 
@@ -2279,7 +2317,10 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); auto ArgTy = cast(Arg.getType()); - auto ValueTy = ArgTy->getPointerElementType(); + // swifterror arguments are required to have pointer-to-pointer type, + // so create a pointer-typed alloca with opaque pointers. + auto ValueTy = ArgTy->isOpaque() ? PointerType::getUnqual(F.getContext()) + : ArgTy->getNonOpaquePointerElementType(); // Reduce to the alloca case: @@ -2520,6 +2561,7 @@ void coro::salvageDebugInfo( bool SkipOutermostLoad = !isa(DVI); Value *Storage = DVI->getVariableLocationOp(0); Value *OriginalStorage = Storage; + while (auto *Inst = dyn_cast_or_null(Storage)) { if (auto *LdInst = dyn_cast(Inst)) { Storage = LdInst->getOperand(0); @@ -2559,7 +2601,7 @@ void coro::salvageDebugInfo( // // Avoid to create the alloca would be eliminated by optimization // passes and the corresponding dbg.declares would be invalid. - if (!OptimizeFrame && !EnableReuseStorageInFrame) + if (!OptimizeFrame) if (auto *Arg = dyn_cast(Storage)) { auto &Cached = DbgPtrAllocaCache[Storage]; if (!Cached) { @@ -2575,14 +2617,15 @@ void coro::salvageDebugInfo( // expression, we need to add a DW_OP_deref at the *start* of the // expression to first load the contents of the alloca before // adjusting it with the expression. - if (Expr && Expr->isComplex()) - Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); + Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); } DVI->replaceVariableLocationOp(OriginalStorage, Storage); DVI->setExpression(Expr); - /// It makes no sense to move the dbg.value intrinsic. - if (!isa(DVI)) { + // We only hoist dbg.declare today since it doesn't make sense to hoist + // dbg.value or dbg.addr since they do not have the same function wide + // guarantees that dbg.declare does. + if (!isa(DVI) && !isa(DVI)) { if (auto *II = dyn_cast(Storage)) DVI->moveBefore(II->getNormalDest()->getFirstNonPHI()); else if (auto *CBI = dyn_cast(Storage)) @@ -2661,13 +2704,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { for (User *U : I.users()) if (Checker.isDefinitionAcrossSuspend(I, U)) Spills[&I].push_back(cast(U)); - - // Manually add dbg.value metadata uses of I. 
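Shape.FramePtr is now a plain Value * rather than an Instruction *, because in the split funclets it can be a function Argument (note the cast<Argument> in getInsertPtAfterFramePtr() below). For the same reason, the CoroSplit.cpp hunks further below guard replaceAllUsesWith with a TrackingVH so the member stays valid across RAUW. A minimal sketch of what TrackingVH provides, assuming llvm/IR headers; the function and value names are illustrative only:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("demo", Ctx);
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);
  auto *F =
      llvm::Function::Create(FT, llvm::Function::ExternalLinkage, "f", M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", F));
  llvm::Value *Old = B.CreateAlloca(B.getInt32Ty(), nullptr, "old");
  llvm::Value *New = B.CreateAlloca(B.getInt32Ty(), nullptr, "new");
  llvm::TrackingVH<llvm::Value> Handle(Old);
  Old->replaceAllUsesWith(New); // the handle follows the RAUW
  assert(Handle == New && "TrackingVH now points at the replacement");
  return 0;
}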
- SmallVector DVIs; - findDbgValues(DVIs, &I); - for (auto *DVI : DVIs) - if (Checker.isDefinitionAcrossSuspend(I, DVI)) - Spills[&I].push_back(DVI); } if (Spills.empty()) @@ -2754,10 +2790,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { auto *V = Iter.first; SmallVector DVIs; findDbgValues(DVIs, V); - llvm::for_each(DVIs, [&](DbgValueInst *DVI) { + for (DbgValueInst *DVI : DVIs) if (Checker.isDefinitionAcrossSuspend(*V, DVI)) FrameData.Spills[V].push_back(DVI); - }); } LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills)); diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 9a17068df3a9..5557370c82ba 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -13,7 +13,6 @@ #include "CoroInstr.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/Transforms/Coroutines.h" namespace llvm { @@ -21,40 +20,13 @@ class CallGraph; class CallGraphSCC; class PassRegistry; -void initializeCoroEarlyLegacyPass(PassRegistry &); -void initializeCoroSplitLegacyPass(PassRegistry &); -void initializeCoroElideLegacyPass(PassRegistry &); -void initializeCoroCleanupLegacyPass(PassRegistry &); - -// CoroEarly pass marks every function that has coro.begin with a string -// attribute "coroutine.presplit"="0". CoroSplit pass processes the coroutine -// twice. First, it lets it go through complete IPO optimization pipeline as a -// single function. It forces restart of the pipeline by inserting an indirect -// call to an empty function "coro.devirt.trigger" which is devirtualized by -// CoroElide pass that triggers a restart of the pipeline by CGPassManager. -// When CoroSplit pass sees the same coroutine the second time, it splits it up, -// adds coroutine subfunctions to the SCC to be processed by IPO pipeline. -// Async lowering similarily triggers a restart of the pipeline after it has -// split the coroutine. -// -// FIXME: Refactor these attributes as LLVM attributes instead of string -// attributes since these attributes are already used outside LLVM's -// coroutine module. -// FIXME: Remove these values once we remove the Legacy PM. -#define CORO_PRESPLIT_ATTR "coroutine.presplit" -#define UNPREPARED_FOR_SPLIT "0" -#define PREPARED_FOR_SPLIT "1" -#define ASYNC_RESTART_AFTER_SPLIT "2" - -#define CORO_DEVIRT_TRIGGER_FN "coro.devirt.trigger" - namespace coro { +bool declaresAnyIntrinsic(const Module &M); bool declaresIntrinsics(const Module &M, const std::initializer_list); void replaceCoroFree(CoroIdInst *CoroId, bool Elide); -void updateCallGraph(Function &Caller, ArrayRef Funcs, - CallGraph &CG, CallGraphSCC &SCC); + /// Recover a dbg.declare prepared by the frontend and emit an alloca /// holding a pointer to the coroutine frame. void salvageDebugInfo( @@ -128,7 +100,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { StructType *FrameTy; Align FrameAlign; uint64_t FrameSize; - Instruction *FramePtr; + Value *FramePtr; BasicBlock *AllocaSpillBlock; /// This would only be true if optimization are enabled. 
@@ -210,10 +182,9 @@ struct LLVM_LIBRARY_VISIBILITY Shape { FunctionType *getResumeFunctionType() const { switch (ABI) { - case coro::ABI::Switch: { - auto *FnPtrTy = getSwitchResumePointerType(); - return cast(FnPtrTy->getPointerElementType()); - } + case coro::ABI::Switch: + return FunctionType::get(Type::getVoidTy(FrameTy->getContext()), + FrameTy->getPointerTo(), /*IsVarArg*/false); case coro::ABI::Retcon: case coro::ABI::RetconOnce: return RetconLowering.ResumePrototype->getFunctionType(); @@ -267,6 +238,12 @@ struct LLVM_LIBRARY_VISIBILITY Shape { return nullptr; } + Instruction *getInsertPtAfterFramePtr() const { + if (auto *I = dyn_cast(FramePtr)) + return I->getNextNode(); + return &cast(FramePtr)->getParent()->getEntryBlock().front(); + } + /// Allocate memory according to the rules of the active lowering. /// /// \param CG - if non-null, will be updated for the new call diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index b5129809c6a6..ead552d9be4e 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -22,15 +22,17 @@ #include "CoroInstr.h" #include "CoroInternal.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -50,13 +52,10 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PrettyStackTrace.h" @@ -869,11 +868,16 @@ void CoroCloner::create() { OrigF.getParent()->end(), ActiveSuspend); } - // Replace all args with undefs. The buildCoroutineFrame algorithm already - // rewritten access to the args that occurs after suspend points with loads - // and stores to/from the coroutine frame. - for (Argument &A : OrigF.args()) - VMap[&A] = UndefValue::get(A.getType()); + // Replace all args with dummy instructions. If an argument is the old frame + // pointer, the dummy will be replaced by the new frame pointer once it is + // computed below. Uses of all other arguments should have already been + // rewritten by buildCoroutineFrame() to use loads/stores on the coroutine + // frame. + SmallVector DummyArgs; + for (Argument &A : OrigF.args()) { + DummyArgs.push_back(new FreezeInst(UndefValue::get(A.getType()))); + VMap[&A] = DummyArgs.back(); + } SmallVector Returns; @@ -923,6 +927,12 @@ void CoroCloner::create() { NewF->setVisibility(savedVisibility); NewF->setUnnamedAddr(savedUnnamedAddr); NewF->setDLLStorageClass(savedDLLStorageClass); + // The function sanitizer metadata needs to match the signature of the + // function it is being attached to. However this does not hold for split + // functions here. Thus remove the metadata for split functions. 
+ if (Shape.ABI == coro::ABI::Switch && + NewF->hasMetadata(LLVMContext::MD_func_sanitize)) + NewF->eraseMetadata(LLVMContext::MD_func_sanitize); // Replace the attributes of the new function: auto OrigAttrs = NewF->getAttributes(); @@ -932,7 +942,8 @@ void CoroCloner::create() { case coro::ABI::Switch: // Bootstrap attributes by copying function attributes from the // original function. This should include optimization settings and so on. - NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, OrigAttrs.getFnAttrs())); + NewAttrs = NewAttrs.addFnAttributes( + Context, AttrBuilder(Context, OrigAttrs.getFnAttrs())); addFramePointerAttrs(NewAttrs, Context, 0, Shape.FrameSize, Shape.FrameAlign); @@ -1013,7 +1024,15 @@ void CoroCloner::create() { auto *NewVFrame = Builder.CreateBitCast( NewFramePtr, Type::getInt8PtrTy(Builder.getContext()), "vFrame"); Value *OldVFrame = cast(VMap[Shape.CoroBegin]); - OldVFrame->replaceAllUsesWith(NewVFrame); + if (OldVFrame != NewVFrame) + OldVFrame->replaceAllUsesWith(NewVFrame); + + // All uses of the arguments should have been resolved by this point, + // so we can safely remove the dummy values. + for (Instruction *DummyArg : DummyArgs) { + DummyArg->replaceAllUsesWith(UndefValue::get(DummyArg->getType())); + DummyArg->deleteValue(); + } switch (Shape.ABI) { case coro::ABI::Switch: @@ -1063,13 +1082,6 @@ static Function *createClone(Function &F, const Twine &Suffix, return Cloner.getFunction(); } -/// Remove calls to llvm.coro.end in the original function. -static void removeCoroEnds(const coro::Shape &Shape, CallGraph *CG) { - for (auto End : Shape.CoroEnds) { - replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, CG); - } -} - static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) { assert(Shape.ABI == coro::ABI::Async); @@ -1150,7 +1162,8 @@ static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { assert(Shape.ABI == coro::ABI::Switch); - IRBuilder<> Builder(Shape.FramePtr->getNextNode()); + IRBuilder<> Builder(Shape.getInsertPtAfterFramePtr()); + auto *ResumeAddr = Builder.CreateStructGEP( Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Resume, "resume.addr"); @@ -1559,7 +1572,8 @@ static void simplifySuspendPoints(coro::Shape &Shape) { } static void splitSwitchCoroutine(Function &F, coro::Shape &Shape, - SmallVectorImpl &Clones) { + SmallVectorImpl &Clones, + TargetTransformInfo &TTI) { assert(Shape.ABI == coro::ABI::Switch); createResumeEntryBlock(F, Shape); @@ -1574,7 +1588,13 @@ static void splitSwitchCoroutine(Function &F, coro::Shape &Shape, postSplitCleanup(*DestroyClone); postSplitCleanup(*CleanupClone); - addMustTailToCoroResumes(*ResumeClone); + // Adding musttail call to support symmetric transfer. + // Skip targets which don't support tail call. + // + // FIXME: Could we support symmetric transfer effectively without musttail + // call? + if (TTI.supportsTailCalls()) + addMustTailToCoroResumes(*ResumeClone); // Store addresses resume/destroy/cleanup functions in the coroutine frame. updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); @@ -1661,7 +1681,7 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, // Map all uses of llvm.coro.begin to the allocated frame pointer. { // Make sure we don't invalidate Shape.FramePtr. 
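The hunk that follows (and its twin in the retcon path further down) wraps Shape.FramePtr in a TrackingVH across the replaceAllUsesWith, because the frame pointer may itself be the coro.begin being replaced. The guard pattern in isolation; rauwKeepingCached and its parameter names are illustrative, not LLVM API:

  #include "llvm/IR/Value.h"
  #include "llvm/IR/ValueHandle.h"

  // Sketch: keep a cached Value* usable across RAUW. If Cached is among the
  // values replaced, the tracking handle is redirected to the replacement.
  llvm::Value *rauwKeepingCached(llvm::Value *Cached, llvm::Value *Old,
                                 llvm::Value *New) {
    llvm::TrackingVH<llvm::Value> Handle(Cached);
    Old->replaceAllUsesWith(New);
    return Handle.getValPtr(); // possibly no longer equal to Cached
  }

Note that the TrackingVH declarations below have lost their template arguments in this text; given the FramePtr type change above, the natural reading is TrackingVH&lt;Instruction&gt; before the patch and TrackingVH&lt;Value&gt; after.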
- TrackingVH Handle(Shape.FramePtr); + TrackingVH Handle(Shape.FramePtr); Shape.CoroBegin->replaceAllUsesWith(FramePtr); Shape.FramePtr = Handle.getValPtr(); } @@ -1773,7 +1793,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // Map all uses of llvm.coro.begin to the allocated frame pointer. { // Make sure we don't invalidate Shape.FramePtr. - TrackingVH Handle(Shape.FramePtr); + TrackingVH Handle(Shape.FramePtr); Shape.CoroBegin->replaceAllUsesWith(RawFramePtr); Shape.FramePtr = Handle.getValPtr(); } @@ -1879,6 +1899,7 @@ namespace { static coro::Shape splitCoroutine(Function &F, SmallVectorImpl &Clones, + TargetTransformInfo &TTI, bool OptimizeFrame) { PrettyStackTraceFunction prettyStackTrace(F); @@ -1901,7 +1922,7 @@ static coro::Shape splitCoroutine(Function &F, } else { switch (Shape.ABI) { case coro::ABI::Switch: - splitSwitchCoroutine(F, Shape, Clones); + splitSwitchCoroutine(F, Shape, Clones, TTI); break; case coro::ABI::Async: splitAsyncCoroutine(F, Shape, Clones); @@ -1917,21 +1938,27 @@ static coro::Shape splitCoroutine(Function &F, // This invalidates SwiftErrorOps in the Shape. replaceSwiftErrorOps(F, Shape, nullptr); - return Shape; -} - -static void -updateCallGraphAfterCoroutineSplit(Function &F, const coro::Shape &Shape, - const SmallVectorImpl &Clones, - CallGraph &CG, CallGraphSCC &SCC) { - if (!Shape.CoroBegin) - return; - - removeCoroEnds(Shape, &CG); - postSplitCleanup(F); + // Finally, salvage the llvm.dbg.{declare,addr} in our original function that + // point into the coroutine frame. We only do this for the current function + // since the Cloner salvaged debug info for us in the new coroutine funclets. + SmallVector Worklist; + SmallDenseMap DbgPtrAllocaCache; + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *DDI = dyn_cast(&I)) { + Worklist.push_back(DDI); + continue; + } + if (auto *DDI = dyn_cast(&I)) { + Worklist.push_back(DDI); + continue; + } + } + } + for (auto *DDI : Worklist) + coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame); - // Update call graph and add the functions we created to the SCC. - coro::updateCallGraph(F, Clones, CG, SCC); + return Shape; } static void updateCallGraphAfterCoroutineSplit( @@ -1976,70 +2003,6 @@ static void updateCallGraphAfterCoroutineSplit( updateCGAndAnalysisManagerForFunctionPass(CG, C, N, AM, UR, FAM); } -// When we see the coroutine the first time, we insert an indirect call to a -// devirt trigger function and mark the coroutine that it is now ready for -// split. -// Async lowering uses this after it has split the function to restart the -// pipeline. -static void prepareForSplit(Function &F, CallGraph &CG, - bool MarkForAsyncRestart = false) { - Module &M = *F.getParent(); - LLVMContext &Context = F.getContext(); -#ifndef NDEBUG - Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN); - assert(DevirtFn && "coro.devirt.trigger function not found"); -#endif - - F.addFnAttr(CORO_PRESPLIT_ATTR, MarkForAsyncRestart - ? ASYNC_RESTART_AFTER_SPLIT - : PREPARED_FOR_SPLIT); - - // Insert an indirect call sequence that will be devirtualized by CoroElide - // pass: - // %0 = call i8* @llvm.coro.subfn.addr(i8* null, i8 -1) - // %1 = bitcast i8* %0 to void(i8*)* - // call void %1(i8* null) - coro::LowererBase Lowerer(M); - Instruction *InsertPt = - MarkForAsyncRestart ? 
F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime() - : F.getEntryBlock().getTerminator(); - auto *Null = ConstantPointerNull::get(Type::getInt8PtrTy(Context)); - auto *DevirtFnAddr = - Lowerer.makeSubFnCall(Null, CoroSubFnInst::RestartTrigger, InsertPt); - FunctionType *FnTy = FunctionType::get(Type::getVoidTy(Context), - {Type::getInt8PtrTy(Context)}, false); - auto *IndirectCall = CallInst::Create(FnTy, DevirtFnAddr, Null, "", InsertPt); - - // Update CG graph with an indirect call we just added. - CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode()); -} - -// Make sure that there is a devirtualization trigger function that the -// coro-split pass uses to force a restart of the CGSCC pipeline. If the devirt -// trigger function is not found, we will create one and add it to the current -// SCC. -static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) { - Module &M = CG.getModule(); - if (M.getFunction(CORO_DEVIRT_TRIGGER_FN)) - return; - - LLVMContext &C = M.getContext(); - auto *FnTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C), - /*isVarArg=*/false); - Function *DevirtFn = - Function::Create(FnTy, GlobalValue::LinkageTypes::PrivateLinkage, - CORO_DEVIRT_TRIGGER_FN, &M); - DevirtFn->addFnAttr(Attribute::AlwaysInline); - auto *Entry = BasicBlock::Create(C, "entry", DevirtFn); - ReturnInst::Create(C, Entry); - - auto *Node = CG.getOrInsertFunction(DevirtFn); - - SmallVector Nodes(SCC.begin(), SCC.end()); - Nodes.push_back(Node); - SCC.initialize(Nodes); -} - /// Replace a call to llvm.coro.prepare.retcon. static void replacePrepare(CallInst *Prepare, LazyCallGraph &CG, LazyCallGraph::SCC &C) { @@ -2076,59 +2039,6 @@ static void replacePrepare(CallInst *Prepare, LazyCallGraph &CG, Cast->eraseFromParent(); } } -/// Replace a call to llvm.coro.prepare.retcon. -static void replacePrepare(CallInst *Prepare, CallGraph &CG) { - auto CastFn = Prepare->getArgOperand(0); // as an i8* - auto Fn = CastFn->stripPointerCasts(); // as its original type - - // Find call graph nodes for the preparation. - CallGraphNode *PrepareUserNode = nullptr, *FnNode = nullptr; - if (auto ConcreteFn = dyn_cast(Fn)) { - PrepareUserNode = CG[Prepare->getFunction()]; - FnNode = CG[ConcreteFn]; - } - - // Attempt to peephole this pattern: - // %0 = bitcast [[TYPE]] @some_function to i8* - // %1 = call @llvm.coro.prepare.retcon(i8* %0) - // %2 = bitcast %1 to [[TYPE]] - // ==> - // %2 = @some_function - for (Use &U : llvm::make_early_inc_range(Prepare->uses())) { - // Look for bitcasts back to the original function type. - auto *Cast = dyn_cast(U.getUser()); - if (!Cast || Cast->getType() != Fn->getType()) continue; - - // Check whether the replacement will introduce new direct calls. - // If so, we'll need to update the call graph. - if (PrepareUserNode) { - for (auto &Use : Cast->uses()) { - if (auto *CB = dyn_cast(Use.getUser())) { - if (!CB->isCallee(&Use)) - continue; - PrepareUserNode->removeCallEdgeFor(*CB); - PrepareUserNode->addCalledFunction(CB, FnNode); - } - } - } - - // Replace and remove the cast. - Cast->replaceAllUsesWith(Fn); - Cast->eraseFromParent(); - } - - // Replace any remaining uses with the function as an i8*. - // This can never directly be a callee, so we don't need to update CG. - Prepare->replaceAllUsesWith(CastFn); - Prepare->eraseFromParent(); - - // Kill dead bitcasts. 
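Context for the legacy replacePrepare being deleted around this point (its new-PM twin survives earlier in the file): llvm.coro.prepare.retcon is an IPO barrier, and once splitting is done the pass peels it away, folding bitcast(prepare(bitcast @f)) back to @f. A compact sketch of the fold without the call-graph bookkeeping; foldPrepare is an illustrative name:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"

  // Sketch: rewrite bitcast(prepare(bitcast @f)) to @f, then forward any
  // remaining i8* uses of the barrier to the casted function.
  void foldPrepare(llvm::CallInst *Prepare) {
    llvm::Value *CastFn = Prepare->getArgOperand(0); // @f as i8*
    llvm::Value *Fn = CastFn->stripPointerCasts();   // @f as its own type
    for (llvm::Use &U : llvm::make_early_inc_range(Prepare->uses())) {
      auto *Cast = llvm::dyn_cast<llvm::BitCastInst>(U.getUser());
      if (!Cast || Cast->getType() != Fn->getType())
        continue;
      Cast->replaceAllUsesWith(Fn);
      Cast->eraseFromParent();
    }
    Prepare->replaceAllUsesWith(CastFn);
    Prepare->eraseFromParent();
  }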
- while (auto *Cast = dyn_cast(CastFn)) { - if (!Cast->use_empty()) break; - CastFn = Cast->getOperand(0); - Cast->eraseFromParent(); - } -} static bool replaceAllPrepares(Function *PrepareFn, LazyCallGraph &CG, LazyCallGraph::SCC &C) { @@ -2143,30 +2053,6 @@ static bool replaceAllPrepares(Function *PrepareFn, LazyCallGraph &CG, return Changed; } -/// Remove calls to llvm.coro.prepare.retcon, a barrier meant to prevent -/// IPO from operating on calls to a retcon coroutine before it's been -/// split. This is only safe to do after we've split all retcon -/// coroutines in the module. We can do that this in this pass because -/// this pass does promise to split all retcon coroutines (as opposed to -/// switch coroutines, which are lowered in multiple stages). -static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) { - bool Changed = false; - for (Use &P : llvm::make_early_inc_range(PrepareFn->uses())) { - // Intrinsics can only be used in calls. - auto *Prepare = cast(P.getUser()); - replacePrepare(Prepare, CG); - Changed = true; - } - - return Changed; -} - -static bool declaresCoroSplitIntrinsics(const Module &M) { - return coro::declaresIntrinsics(M, {"llvm.coro.begin", - "llvm.coro.prepare.retcon", - "llvm.coro.prepare.async"}); -} - static void addPrepareFunction(const Module &M, SmallVectorImpl &Fns, StringRef Name) { @@ -2185,18 +2071,15 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, auto &FAM = AM.getResult(C, CG).getManager(); - if (!declaresCoroSplitIntrinsics(M)) - return PreservedAnalyses::all(); - // Check for uses of llvm.coro.prepare.retcon/async. SmallVector PrepareFns; addPrepareFunction(M, PrepareFns, "llvm.coro.prepare.retcon"); addPrepareFunction(M, PrepareFns, "llvm.coro.prepare.async"); // Find coroutines for processing. - SmallVector Coroutines; + SmallVector Coroutines; for (LazyCallGraph::Node &N : C) - if (N.getFunction().hasFnAttribute(CORO_PRESPLIT_ATTR)) + if (N.getFunction().isPresplitCoroutine()) Coroutines.push_back(&N); if (Coroutines.empty() && PrepareFns.empty()) @@ -2212,13 +2095,12 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, for (LazyCallGraph::Node *N : Coroutines) { Function &F = N->getFunction(); LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F.getName() - << "' state: " - << F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() << "\n"); - F.removeFnAttr(CORO_PRESPLIT_ATTR); + F.setSplittedCoroutine(); SmallVector Clones; - const coro::Shape Shape = splitCoroutine(F, Clones, OptimizeFrame); + const coro::Shape Shape = splitCoroutine( + F, Clones, FAM.getResult(F), OptimizeFrame); updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM); if (!Shape.CoroSuspends.empty()) { @@ -2237,122 +2119,3 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::none(); } - -namespace { - -// We present a coroutine to LLVM as an ordinary function with suspension -// points marked up with intrinsics. We let the optimizer party on the coroutine -// as a single function for as long as possible. Shortly before the coroutine is -// eligible to be inlined into its callers, we split up the coroutine into parts -// corresponding to initial, resume and destroy invocations of the coroutine, -// add them to the current SCC and restart the IPO pipeline to optimize the -// coroutine subfunctions we extracted before proceeding to the caller of the -// coroutine. 
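In the CoroSplitPass::run hunk above, the string-attribute protocol gives way to first-class Function helpers: isPresplitCoroutine() selects candidates and setSplittedCoroutine() clears the marker before splitting, so no string constants survive. Usage in miniature; claimForSplit is an illustrative wrapper around the two calls shown in the hunk:

  #include "llvm/IR/Function.h"

  // Sketch: claim a presplit coroutine for splitting. Returns false both
  // for ordinary functions and for coroutines that were already split.
  bool claimForSplit(llvm::Function &F) {
    if (!F.isPresplitCoroutine())
      return false;
    F.setSplittedCoroutine(); // drop the presplit marker
    return true;
  }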
-struct CoroSplitLegacy : public CallGraphSCCPass { - static char ID; // Pass identification, replacement for typeid - - CoroSplitLegacy(bool OptimizeFrame = false) - : CallGraphSCCPass(ID), OptimizeFrame(OptimizeFrame) { - initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); - } - - bool Run = false; - bool OptimizeFrame; - - // A coroutine is identified by the presence of coro.begin intrinsic, if - // we don't have any, this pass has nothing to do. - bool doInitialization(CallGraph &CG) override { - Run = declaresCoroSplitIntrinsics(CG.getModule()); - return CallGraphSCCPass::doInitialization(CG); - } - - bool runOnSCC(CallGraphSCC &SCC) override { - if (!Run) - return false; - - // Check for uses of llvm.coro.prepare.retcon. - SmallVector PrepareFns; - auto &M = SCC.getCallGraph().getModule(); - addPrepareFunction(M, PrepareFns, "llvm.coro.prepare.retcon"); - addPrepareFunction(M, PrepareFns, "llvm.coro.prepare.async"); - - // Find coroutines for processing. - SmallVector Coroutines; - for (CallGraphNode *CGN : SCC) - if (auto *F = CGN->getFunction()) - if (F->hasFnAttribute(CORO_PRESPLIT_ATTR)) - Coroutines.push_back(F); - - if (Coroutines.empty() && PrepareFns.empty()) - return false; - - CallGraph &CG = getAnalysis().getCallGraph(); - - if (Coroutines.empty()) { - bool Changed = false; - for (auto *PrepareFn : PrepareFns) - Changed |= replaceAllPrepares(PrepareFn, CG); - return Changed; - } - - createDevirtTriggerFunc(CG, SCC); - - // Split all the coroutines. - for (Function *F : Coroutines) { - Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR); - StringRef Value = Attr.getValueAsString(); - LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName() - << "' state: " << Value << "\n"); - // Async lowering marks coroutines to trigger a restart of the pipeline - // after it has split them. - if (Value == ASYNC_RESTART_AFTER_SPLIT) { - F->removeFnAttr(CORO_PRESPLIT_ATTR); - continue; - } - if (Value == UNPREPARED_FOR_SPLIT) { - prepareForSplit(*F, CG); - continue; - } - F->removeFnAttr(CORO_PRESPLIT_ATTR); - - SmallVector Clones; - const coro::Shape Shape = splitCoroutine(*F, Clones, OptimizeFrame); - updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC); - if (Shape.ABI == coro::ABI::Async) { - // Restart SCC passes. - // Mark function for CoroElide pass. It will devirtualize causing a - // restart of the SCC pipeline. 
- prepareForSplit(*F, CG, true /*MarkForAsyncRestart*/); - } - } - - for (auto *PrepareFn : PrepareFns) - replaceAllPrepares(PrepareFn, CG); - - return true; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - CallGraphSCCPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { return "Coroutine Splitting"; } -}; - -} // end anonymous namespace - -char CoroSplitLegacy::ID = 0; - -INITIALIZE_PASS_BEGIN( - CoroSplitLegacy, "coro-split", - "Split coroutine into a set of functions driving its state machine", false, - false) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_END( - CoroSplitLegacy, "coro-split", - "Split coroutine into a set of functions driving its state machine", false, - false) - -Pass *llvm::createCoroSplitLegacyPass(bool OptimizeFrame) { - return new CoroSplitLegacy(OptimizeFrame); -} diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 965a146c143f..1742e9319c3b 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -10,14 +10,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Coroutines.h" #include "CoroInstr.h" #include "CoroInternal.h" -#include "llvm-c/Transforms/Coroutines.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -26,14 +23,10 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -41,55 +34,6 @@ using namespace llvm; -void llvm::initializeCoroutines(PassRegistry &Registry) { - initializeCoroEarlyLegacyPass(Registry); - initializeCoroSplitLegacyPass(Registry); - initializeCoroElideLegacyPass(Registry); - initializeCoroCleanupLegacyPass(Registry); -} - -static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder, - legacy::PassManagerBase &PM) { - PM.add(createCoroSplitLegacyPass()); - PM.add(createCoroElideLegacyPass()); - - PM.add(createBarrierNoopPass()); - PM.add(createCoroCleanupLegacyPass()); -} - -static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder, - legacy::PassManagerBase &PM) { - PM.add(createCoroEarlyLegacyPass()); -} - -static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder, - legacy::PassManagerBase &PM) { - PM.add(createCoroElideLegacyPass()); -} - -static void addCoroutineSCCPasses(const PassManagerBuilder &Builder, - legacy::PassManagerBase &PM) { - PM.add(createCoroSplitLegacyPass(Builder.OptLevel != 0)); -} - -static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder, - legacy::PassManagerBase &PM) { - PM.add(createCoroCleanupLegacyPass()); -} - -void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) { - Builder.addExtension(PassManagerBuilder::EP_EarlyAsPossible, - addCoroutineEarlyPasses); - Builder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, - addCoroutineOpt0Passes); - 
Builder.addExtension(PassManagerBuilder::EP_CGSCCOptimizerLate, - addCoroutineSCCPasses); - Builder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate, - addCoroutineScalarOptimizerPasses); - Builder.addExtension(PassManagerBuilder::EP_OptimizerLast, - addCoroutineOptimizerLastPasses); -} - // Construct the lowerer base class and initialize its members. coro::LowererBase::LowererBase(Module &M) : TheModule(M), Context(M.getContext()), @@ -119,44 +63,55 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, return Bitcast; } +// NOTE: Must be sorted! +static const char *const CoroIntrinsics[] = { + "llvm.coro.align", + "llvm.coro.alloc", + "llvm.coro.async.context.alloc", + "llvm.coro.async.context.dealloc", + "llvm.coro.async.resume", + "llvm.coro.async.size.replace", + "llvm.coro.async.store_resume", + "llvm.coro.begin", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.end.async", + "llvm.coro.frame", + "llvm.coro.free", + "llvm.coro.id", + "llvm.coro.id.async", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.noop", + "llvm.coro.prepare.async", + "llvm.coro.prepare.retcon", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.save", + "llvm.coro.size", + "llvm.coro.subfn.addr", + "llvm.coro.suspend", + "llvm.coro.suspend.async", + "llvm.coro.suspend.retcon", +}; + #ifndef NDEBUG static bool isCoroutineIntrinsicName(StringRef Name) { - // NOTE: Must be sorted! - static const char *const CoroIntrinsics[] = { - "llvm.coro.align", - "llvm.coro.alloc", - "llvm.coro.async.context.alloc", - "llvm.coro.async.context.dealloc", - "llvm.coro.async.resume", - "llvm.coro.async.size.replace", - "llvm.coro.async.store_resume", - "llvm.coro.begin", - "llvm.coro.destroy", - "llvm.coro.done", - "llvm.coro.end", - "llvm.coro.end.async", - "llvm.coro.frame", - "llvm.coro.free", - "llvm.coro.id", - "llvm.coro.id.async", - "llvm.coro.id.retcon", - "llvm.coro.id.retcon.once", - "llvm.coro.noop", - "llvm.coro.prepare.async", - "llvm.coro.prepare.retcon", - "llvm.coro.promise", - "llvm.coro.resume", - "llvm.coro.save", - "llvm.coro.size", - "llvm.coro.subfn.addr", - "llvm.coro.suspend", - "llvm.coro.suspend.async", - "llvm.coro.suspend.retcon", - }; return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1; } #endif +bool coro::declaresAnyIntrinsic(const Module &M) { + for (StringRef Name : CoroIntrinsics) { + assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic"); + if (M.getNamedValue(Name)) + return true; + } + + return false; +} + // Verifies if a module has named values listed. Also, in debug mode verifies // that names are intrinsic names. bool coro::declaresIntrinsics(const Module &M, @@ -191,46 +146,6 @@ void coro::replaceCoroFree(CoroIdInst *CoroId, bool Elide) { } } -// FIXME: This code is stolen from CallGraph::addToCallGraph(Function *F), which -// happens to be private. It is better for this functionality exposed by the -// CallGraph. -static void buildCGN(CallGraph &CG, CallGraphNode *Node) { - Function *F = Node->getFunction(); - - // Look for calls by this function. - for (Instruction &I : instructions(F)) - if (auto *Call = dyn_cast(&I)) { - const Function *Callee = Call->getCalledFunction(); - if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID())) - // Indirect calls of intrinsics are not allowed so no need to check. - // We can be more precise here by using TargetArg returned by - // Intrinsic::isLeaf. 
- Node->addCalledFunction(Call, CG.getCallsExternalNode()); - else if (!Callee->isIntrinsic()) - Node->addCalledFunction(Call, CG.getOrInsertFunction(Callee)); - } -} - -// Rebuild CGN after we extracted parts of the code from ParentFunc into -// NewFuncs. Builds CGNs for the NewFuncs and adds them to the current SCC. -void coro::updateCallGraph(Function &ParentFunc, ArrayRef NewFuncs, - CallGraph &CG, CallGraphSCC &SCC) { - // Rebuild CGN from scratch for the ParentFunc - auto *ParentNode = CG[&ParentFunc]; - ParentNode->removeAllCalledFunctions(); - buildCGN(CG, ParentNode); - - SmallVector Nodes(SCC.begin(), SCC.end()); - - for (Function *F : NewFuncs) { - CallGraphNode *Callee = CG.getOrInsertFunction(F); - Nodes.push_back(Callee); - buildCGN(CG, Callee); - } - - SCC.initialize(Nodes); -} - static void clear(coro::Shape &Shape) { Shape.CoroBegin = nullptr; Shape.CoroEnds.clear(); @@ -735,25 +650,3 @@ void CoroAsyncEndInst::checkWellFormed() const { "match the tail arguments", MustTailCallFunc); } - -void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroEarlyLegacyPass()); -} - -void LLVMAddCoroSplitPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroSplitLegacyPass()); -} - -void LLVMAddCoroElidePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroElideLegacyPass()); -} - -void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroCleanupLegacyPass()); -} - -void -LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(LLVMPassManagerBuilderRef PMB) { - PassManagerBuilder *Builder = unwrap(PMB); - addCoroutinePassesToExtensionPoints(*Builder); -} diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index a6d9ce1033f3..58cea7ebb749 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -1,4 +1,4 @@ -//===- InlineAlways.cpp - Code to inline always_inline functions ----------===// +//===- AlwaysInliner.cpp - Code to inline always_inline functions ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -16,15 +16,10 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -60,31 +55,38 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, for (User *U : F.users()) if (auto *CB = dyn_cast(U)) if (CB->getCalledFunction() == &F && - CB->hasFnAttr(Attribute::AlwaysInline)) - Calls.insert(CB); + CB->hasFnAttr(Attribute::AlwaysInline) && + !CB->getAttributes().hasFnAttr(Attribute::NoInline)) + Calls.insert(CB); for (CallBase *CB : Calls) { Function *Caller = CB->getCaller(); OptimizationRemarkEmitter ORE(Caller); - auto OIC = shouldInline( - *CB, - [&](CallBase &CB) { - return InlineCost::getAlways("always inline attribute"); - }, - ORE); - assert(OIC); - emitInlinedIntoBasedOnCost(ORE, CB->getDebugLoc(), CB->getParent(), F, - *Caller, *OIC, false, DEBUG_TYPE); + DebugLoc DLoc = CB->getDebugLoc(); + BasicBlock *Block = CB->getParent(); InlineFunctionInfo IFI( /*cg=*/nullptr, GetAssumptionCache, &PSI, - &FAM.getResult(*(CB->getCaller())), + &FAM.getResult(*Caller), &FAM.getResult(F)); InlineResult Res = InlineFunction( *CB, IFI, &FAM.getResult(F), InsertLifetime); - assert(Res.isSuccess() && "unexpected failure to inline"); - (void)Res; + if (!Res.isSuccess()) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, + Block) + << "'" << ore::NV("Callee", &F) << "' is not inlined into '" + << ore::NV("Caller", Caller) + << "': " << ore::NV("Reason", Res.getFailureReason()); + }); + continue; + } + + emitInlinedIntoBasedOnCost( + ORE, DLoc, Block, F, *Caller, + InlineCost::getAlways("always inline attribute"), + /*ForProfileContext=*/false, DEBUG_TYPE); // Merge the attributes based on the inlining. 
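Two behavioral changes land in the AlwaysInlinerPass hunk above: a call site carrying noinline is no longer force-inlined even when the callee is always_inline, and a failed mandatory inline now emits an optimization remark instead of tripping an assert. The call-site filter on its own; isMandatoryInlineSite is an illustrative name for the condition in the hunk:

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/InstrTypes.h"

  // Sketch: the call-site noinline attribute beats the callee's
  // always_inline, so such sites are skipped by the mandatory inliner.
  bool isMandatoryInlineSite(const llvm::CallBase &CB,
                             const llvm::Function &F) {
    return CB.getCalledFunction() == &F &&
           CB.hasFnAttr(llvm::Attribute::AlwaysInline) &&
           !CB.getAttributes().hasFnAttr(llvm::Attribute::NoInline);
  }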
AttributeFuncs::mergeAttributesForInlining(*Caller, F); @@ -210,6 +212,9 @@ InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) { if (!CB.hasFnAttr(Attribute::AlwaysInline)) return InlineCost::getNever("no alwaysinline attribute"); + if (Callee->hasFnAttribute(Attribute::AlwaysInline) && CB.isNoInline()) + return InlineCost::getNever("noinline call site attribute"); + auto IsViable = isInlineViable(*Callee); if (!IsViable.isSuccess()) return InlineCost::getNever(IsViable.getFailureReason()); diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index e6a542385662..62cfc3294968 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -29,9 +29,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/ArgumentPromotion.h" + #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" @@ -40,15 +39,11 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -56,33 +51,26 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" #include #include #include -#include -#include -#include -#include #include #include @@ -91,43 +79,81 @@ using namespace llvm; #define DEBUG_TYPE "argpromotion" STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted"); -STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); -STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted"); STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated"); -/// A vector used to hold the indices of a single GEP instruction -using IndicesVector = std::vector; +namespace { + +struct ArgPart { + Type *Ty; + Align Alignment; + /// A representative guaranteed-executed load or store instruction for use by + /// metadata transfer. 
+ Instruction *MustExecInstr; +}; + +using OffsetAndArgPart = std::pair; + +} // end anonymous namespace + +static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL, + Value *Ptr, Type *ResElemTy, int64_t Offset) { + // For non-opaque pointers, try to create a "nice" GEP if possible, otherwise + // fall back to an i8 GEP to a specific offset. + unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace(); + APInt OrigOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + if (!Ptr->getType()->isOpaquePointerTy()) { + Type *OrigElemTy = Ptr->getType()->getNonOpaquePointerElementType(); + if (OrigOffset == 0 && OrigElemTy == ResElemTy) + return Ptr; + + if (OrigElemTy->isSized()) { + APInt TmpOffset = OrigOffset; + Type *TmpTy = OrigElemTy; + SmallVector IntIndices = + DL.getGEPIndicesForOffset(TmpTy, TmpOffset); + if (TmpOffset == 0) { + // Try to add trailing zero indices to reach the right type. + while (TmpTy != ResElemTy) { + Type *NextTy = GetElementPtrInst::getTypeAtIndex(TmpTy, (uint64_t)0); + if (!NextTy) + break; + + IntIndices.push_back(APInt::getZero( + isa(TmpTy) ? 32 : OrigOffset.getBitWidth())); + TmpTy = NextTy; + } + + SmallVector Indices; + for (const APInt &Index : IntIndices) + Indices.push_back(IRB.getInt(Index)); + + if (OrigOffset != 0 || TmpTy == ResElemTy) { + Ptr = IRB.CreateGEP(OrigElemTy, Ptr, Indices); + return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace)); + } + } + } + } + + if (OrigOffset != 0) { + Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(AddrSpace)); + Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(OrigOffset)); + } + return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace)); +} /// DoPromotion - This method actually performs the promotion of the specified /// arguments, and returns the new function. At this point, we know that it's /// safe to do so. static Function * -doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, - SmallPtrSetImpl &ByValArgsToTransform, - Optional> - ReplaceCallSite) { +doPromotion(Function *F, FunctionAnalysisManager &FAM, + const DenseMap> + &ArgsToPromote) { // Start by computing a new prototype for the function, which is the same as // the old function, but has modified arguments. FunctionType *FTy = F->getFunctionType(); std::vector Params; - using ScalarizeTable = std::set>; - - // ScalarizedElements - If we are promoting a pointer that has elements - // accessed out of it, keep track of which elements are accessed so that we - // can add one argument for each. - // - // Arguments that are directly loaded will have a zero element value here, to - // handle cases where there are both a direct load and GEP accesses. - std::map ScalarizedElements; - - // OriginalLoads - Keep track of a representative load instruction from the - // original function so that we can tell the alias analysis implementation - // what the new GEP/Load instructions we are inserting look like. - // We need to keep the original loads for each argument and the elements - // of the argument that are accessed. - std::map, LoadInst *> OriginalLoads; - // Attribute - Keep track of the parameter attributes for the arguments // that we are *not* promoting. For the ones that we do promote, the parameter // attributes are lost @@ -138,15 +164,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, unsigned ArgNo = 0; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++ArgNo) { - if (ByValArgsToTransform.count(&*I)) { - // Simple byval argument? 
Just add all the struct element types. - Type *AgTy = I->getParamByValType(); - StructType *STy = cast(AgTy); - llvm::append_range(Params, STy->elements()); - ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(), - AttributeSet()); - ++NumByValArgsPromoted; - } else if (!ArgsToPromote.count(&*I)) { + if (!ArgsToPromote.count(&*I)) { // Unchanged argument Params.push_back(I->getType()); ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo)); @@ -154,58 +172,12 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // Dead argument (which are always marked as promotable) ++NumArgumentsDead; } else { - // Okay, this is being promoted. This means that the only uses are loads - // or GEPs which are only used by loads - - // In this table, we will track which indices are loaded from the argument - // (where direct loads are tracked as no indices). - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - for (User *U : make_early_inc_range(I->users())) { - Instruction *UI = cast(U); - Type *SrcTy; - if (LoadInst *L = dyn_cast(UI)) - SrcTy = L->getType(); - else - SrcTy = cast(UI)->getSourceElementType(); - // Skip dead GEPs and remove them. - if (isa(UI) && UI->use_empty()) { - UI->eraseFromParent(); - continue; - } - - IndicesVector Indices; - Indices.reserve(UI->getNumOperands() - 1); - // Since loads will only have a single operand, and GEPs only a single - // non-index operand, this will record direct loads without any indices, - // and gep+loads with the GEP indices. - for (const Use &I : llvm::drop_begin(UI->operands())) - Indices.push_back(cast(I)->getSExtValue()); - // GEPs with a single 0 index can be merged with direct loads - if (Indices.size() == 1 && Indices.front() == 0) - Indices.clear(); - ArgIndices.insert(std::make_pair(SrcTy, Indices)); - LoadInst *OrigLoad; - if (LoadInst *L = dyn_cast(UI)) - OrigLoad = L; - else - // Take any load, we will use it only to update Alias Analysis - OrigLoad = cast(UI->user_back()); - OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; - } - - // Add a parameter to the function for each element passed in. - for (const auto &ArgIndex : ArgIndices) { - // not allowed to dereference ->begin() if size() is 0 - Params.push_back(GetElementPtrInst::getIndexedType( - I->getType()->getPointerElementType(), ArgIndex.second)); + const auto &ArgParts = ArgsToPromote.find(&*I)->second; + for (const auto &Pair : ArgParts) { + Params.push_back(Pair.second.Ty); ArgAttrVec.push_back(AttributeSet()); - assert(Params.back()); } - - if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) - ++NumArgumentsPromoted; - else - ++NumAggregatesPromoted; + ++NumArgumentsPromoted; } } @@ -222,24 +194,30 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // The new function will have the !dbg metadata copied from the original // function. The original function may not be deleted, and dbg metadata need - // to be unique so we need to drop it. + // to be unique, so we need to drop it. F->setSubprogram(nullptr); LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); + uint64_t LargestVectorWidth = 0; + for (auto *I : Params) + if (auto *VT = dyn_cast(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + // Recompute the parameter attributes list based on the new arguments for // the function. 
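The LargestVectorWidth loop above (where this text has lost the template argument of dyn_cast; VectorType is the intended reading) exists because promoted parts may be vectors passed in registers, and the "min-legal-vector-width" attribute must cover the widest such type or the backend may assume a narrower ABI. The computation as a free function, under that assumption:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/Support/Casting.h"
  #include <algorithm>
  #include <cstdint>

  // Sketch: widest vector type among the new parameters, in bits; feeds
  // AttributeFuncs::updateMinLegalVectorWidthAttr on callee and callers.
  uint64_t widestVectorBits(llvm::ArrayRef<llvm::Type *> Params) {
    uint64_t Largest = 0;
    for (llvm::Type *T : Params)
      if (auto *VT = llvm::dyn_cast<llvm::VectorType>(T))
        Largest = std::max(Largest,
                           VT->getPrimitiveSizeInBits().getKnownMinSize());
    return Largest;
  }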
NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrVec)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NF, LargestVectorWidth); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - // Loop over all of the callers of the function, transforming the call sites - // to pass in the loaded pointers. - // + // Loop over all the callers of the function, transforming the call sites to + // pass in the loaded pointers. SmallVector Args; const DataLayout &DL = F->getParent()->getDataLayout(); while (!F->use_empty()) { @@ -250,74 +228,34 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // Loop over the operands, inserting GEP and loads in the caller as // appropriate. - auto AI = CB.arg_begin(); + auto *AI = CB.arg_begin(); ArgNo = 0; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; - ++I, ++AI, ++ArgNo) - if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { + ++I, ++AI, ++ArgNo) { + if (!ArgsToPromote.count(&*I)) { Args.push_back(*AI); // Unmodified argument ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); - } else if (ByValArgsToTransform.count(&*I)) { - // Emit a GEP and load for each element of the struct. - Type *AgTy = I->getParamByValType(); - StructType *STy = cast(AgTy); - Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; - const StructLayout *SL = DL.getStructLayout(STy); - Align StructAlign = *I->getParamAlign(); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - auto *Idx = - IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i)); - // TODO: Tell AA about the new values? - Align Alignment = - commonAlignment(StructAlign, SL->getElementOffset(i)); - Args.push_back(IRB.CreateAlignedLoad( - STy->getElementType(i), Idx, Alignment, Idx->getName() + ".val")); - ArgAttrVec.push_back(AttributeSet()); - } } else if (!I->use_empty()) { - // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - // Store the Value* version of the indices in here, but declare it now - // for reuse. - std::vector Ops; - for (const auto &ArgIndex : ArgIndices) { - Value *V = *AI; - LoadInst *OrigLoad = - OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; - if (!ArgIndex.second.empty()) { - Ops.reserve(ArgIndex.second.size()); - Type *ElTy = V->getType(); - for (auto II : ArgIndex.second) { - // Use i32 to index structs, and i64 for others (pointers/arrays). - // This satisfies GEP constraints. - Type *IdxTy = - (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext()) - : Type::getInt64Ty(F->getContext())); - Ops.push_back(ConstantInt::get(IdxTy, II)); - // Keep track of the type we're currently indexing. - if (auto *ElPTy = dyn_cast(ElTy)) - ElTy = ElPTy->getPointerElementType(); - else - ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II); - } - // And create a GEP to extract those indices. 
- V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx"); - Ops.clear(); + Value *V = *AI; + const auto &ArgParts = ArgsToPromote.find(&*I)->second; + for (const auto &Pair : ArgParts) { + LoadInst *LI = IRB.CreateAlignedLoad( + Pair.second.Ty, + createByteGEP(IRB, DL, V, Pair.second.Ty, Pair.first), + Pair.second.Alignment, V->getName() + ".val"); + if (Pair.second.MustExecInstr) { + LI->setAAMetadata(Pair.second.MustExecInstr->getAAMetadata()); + LI->copyMetadata(*Pair.second.MustExecInstr, + {LLVMContext::MD_range, LLVMContext::MD_nonnull, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + LLVMContext::MD_align, LLVMContext::MD_noundef}); } - // Since we're replacing a load make sure we take the alignment - // of the previous load. - LoadInst *newLoad = - IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); - newLoad->setAlignment(OrigLoad->getAlign()); - // Transfer the AA info too. - newLoad->setAAMetadata(OrigLoad->getAAMetadata()); - - Args.push_back(newLoad); + Args.push_back(LI); ArgAttrVec.push_back(AttributeSet()); } } + } // Push any varargs arguments on the list. for (; AI != CB.arg_end(); ++AI, ++ArgNo) { @@ -345,9 +283,8 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, Args.clear(); ArgAttrVec.clear(); - // Update the callgraph to know that the callsite has been transformed. - if (ReplaceCallSite) - (*ReplaceCallSite)(CB, *NewCS); + AttributeFuncs::updateMinLegalVectorWidthAttr(*CB.getCaller(), + LargestVectorWidth); if (!CB.use_empty()) { CB.replaceAllUsesWith(NewCS); @@ -364,11 +301,15 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // function empty. NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + // We will collect all the new created allocas to promote them into registers + // after the following loop + SmallVector Allocas; + // Loop over the argument list, transferring uses of the old arguments over to // the new arguments, also transferring over the names as well. Function::arg_iterator I2 = NF->arg_begin(); for (Argument &Arg : F->args()) { - if (!ArgsToPromote.count(&Arg) && !ByValArgsToTransform.count(&Arg)) { + if (!ArgsToPromote.count(&Arg)) { // If this is an unmodified argument, move the name and users over to the // new version. Arg.replaceAllUsesWith(&*I2); @@ -377,37 +318,6 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, continue; } - if (ByValArgsToTransform.count(&Arg)) { - // In the callee, we create an alloca, and store each of the new incoming - // arguments into the alloca. - Instruction *InsertPt = &NF->begin()->front(); - - // Just add all the struct element types. - Type *AgTy = Arg.getParamByValType(); - Align StructAlign = *Arg.getParamAlign(); - Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, - StructAlign, "", InsertPt); - StructType *STy = cast(AgTy); - Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), - nullptr}; - const StructLayout *SL = DL.getStructLayout(STy); - - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create( - AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), - InsertPt); - I2->setName(Arg.getName() + "." + Twine(i)); - Align Alignment = commonAlignment(StructAlign, SL->getElementOffset(i)); - new StoreInst(&*I2++, Idx, false, Alignment, InsertPt); - } - - // Anything that used the arg should now use the alloca. 
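A note on the caller-side rewrite above: each promoted part becomes an aligned load through createByteGEP, and metadata is copied only from the representative access that was guaranteed to execute in the callee, because hoisting range/nonnull/noundef facts from a conditional access could introduce UB at the call site. The transfer step in isolation, with MustExec standing in for ArgPart::MustExecInstr:

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/LLVMContext.h"

  // Sketch: take both AA and value metadata from a guaranteed-executed
  // representative access, if one exists; otherwise copy nothing.
  void transferLoadMetadata(llvm::LoadInst *NewLI,
                            llvm::Instruction *MustExec) {
    if (!MustExec)
      return;
    NewLI->setAAMetadata(MustExec->getAAMetadata());
    NewLI->copyMetadata(
        *MustExec,
        {llvm::LLVMContext::MD_range, llvm::LLVMContext::MD_nonnull,
         llvm::LLVMContext::MD_dereferenceable,
         llvm::LLVMContext::MD_dereferenceable_or_null,
         llvm::LLVMContext::MD_align, llvm::LLVMContext::MD_noundef});
  }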
- Arg.replaceAllUsesWith(TheAlloca); - TheAlloca->takeName(&Arg); - continue; - } - // There potentially are metadata uses for things like llvm.dbg.value. // Replace them with undef, after handling the other regular uses. auto RauwUndefMetadata = make_scope_exit( @@ -416,57 +326,95 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, if (Arg.use_empty()) continue; - // Otherwise, if we promoted this argument, then all users are load - // instructions (or GEPs with only load users), and all loads should be - // using the new argument that we added. - ScalarizeTable &ArgIndices = ScalarizedElements[&Arg]; - - while (!Arg.use_empty()) { - if (LoadInst *LI = dyn_cast(Arg.user_back())) { - assert(ArgIndices.begin()->second.empty() && - "Load element should sort to front!"); - I2->setName(Arg.getName() + ".val"); - LI->replaceAllUsesWith(&*I2); - LI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << Arg.getName() - << "' in function '" << F->getName() << "'\n"); - } else { - GetElementPtrInst *GEP = cast(Arg.user_back()); - assert(!GEP->use_empty() && - "GEPs without uses should be cleaned up already"); - IndicesVector Operands; - Operands.reserve(GEP->getNumIndices()); - for (const Use &Idx : GEP->indices()) - Operands.push_back(cast(Idx)->getSExtValue()); - - // GEPs with a single 0 index can be merged with direct loads - if (Operands.size() == 1 && Operands.front() == 0) - Operands.clear(); - - Function::arg_iterator TheArg = I2; - for (ScalarizeTable::iterator It = ArgIndices.begin(); - It->second != Operands; ++It, ++TheArg) { - assert(It != ArgIndices.end() && "GEP not handled??"); - } + // Otherwise, if we promoted this argument, we have to create an alloca in + // the callee for every promotable part and store each of the new incoming + // arguments into the corresponding alloca, what lets the old code (the + // store instructions if they are allowed especially) a chance to work as + // before. + assert(Arg.getType()->isPointerTy() && + "Only arguments with a pointer type are promotable"); - TheArg->setName(formatv("{0}.{1:$[.]}.val", Arg.getName(), - make_range(Operands.begin(), Operands.end()))); + IRBuilder IRB(&NF->begin()->front()); - LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() - << "' of function '" << NF->getName() << "'\n"); + // Add only the promoted elements, so parts from ArgsToPromote + SmallDenseMap OffsetToAlloca; + for (const auto &Pair : ArgsToPromote.find(&Arg)->second) { + int64_t Offset = Pair.first; + const ArgPart &Part = Pair.second; - // All of the uses must be load instructions. Replace them all with - // the argument specified by ArgNo. - while (!GEP->use_empty()) { - LoadInst *L = cast(GEP->user_back()); - L->replaceAllUsesWith(&*TheArg); - L->eraseFromParent(); - } - GEP->eraseFromParent(); + Argument *NewArg = I2++; + NewArg->setName(Arg.getName() + "." + Twine(Offset) + ".val"); + + AllocaInst *NewAlloca = IRB.CreateAlloca( + Part.Ty, nullptr, Arg.getName() + "." 
+ Twine(Offset) + ".allc"); + NewAlloca->setAlignment(Pair.second.Alignment); + IRB.CreateAlignedStore(NewArg, NewAlloca, Pair.second.Alignment); + + // Collect the alloca to retarget the users to + OffsetToAlloca.insert({Offset, NewAlloca}); + } + + auto GetAlloca = [&](Value *Ptr) { + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + assert(Ptr == &Arg && "Not constant offset from arg?"); + return OffsetToAlloca.lookup(Offset.getSExtValue()); + }; + + // Cleanup the code from the dead instructions: GEPs and BitCasts in between + // the original argument and its users: loads and stores. Retarget every + // user to the new created alloca. + SmallVector Worklist; + SmallVector DeadInsts; + append_range(Worklist, Arg.users()); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (isa(V) || isa(V)) { + DeadInsts.push_back(cast(V)); + append_range(Worklist, V->users()); + continue; + } + + if (auto *LI = dyn_cast(V)) { + Value *Ptr = LI->getPointerOperand(); + LI->setOperand(LoadInst::getPointerOperandIndex(), GetAlloca(Ptr)); + continue; } + + if (auto *SI = dyn_cast(V)) { + assert(!SI->isVolatile() && "Volatile operations can't be promoted."); + Value *Ptr = SI->getPointerOperand(); + SI->setOperand(StoreInst::getPointerOperandIndex(), GetAlloca(Ptr)); + continue; + } + + llvm_unreachable("Unexpected user"); + } + + for (Instruction *I : DeadInsts) { + I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->eraseFromParent(); } - // Increment I2 past all of the arguments added for this promoted pointer. - std::advance(I2, ArgIndices.size()); + + // Collect the allocas for promotion + for (const auto &Pair : OffsetToAlloca) { + assert(isAllocaPromotable(Pair.second) && + "By design, only promotable allocas should be produced."); + Allocas.push_back(Pair.second); + } + } + + LLVM_DEBUG(dbgs() << "ARG PROMOTION: " << Allocas.size() + << " alloca(s) are promotable by Mem2Reg\n"); + + if (!Allocas.empty()) { + // And we are able to call the `promoteMemoryToRegister()` function. + // Our earlier checks have ensured that PromoteMemToReg() will + // succeed. + auto &DT = FAM.getResult(*NF); + auto &AC = FAM.getResult(*NF); + PromoteMemToReg(Allocas, DT, &AC); } return NF; @@ -474,100 +422,37 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, /// Return true if we can prove that all callees pass in a valid pointer for the /// specified function argument. -static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) { +static bool allCallersPassValidPointerForArgument(Argument *Arg, + Align NeededAlign, + uint64_t NeededDerefBytes) { Function *Callee = Arg->getParent(); const DataLayout &DL = Callee->getParent()->getDataLayout(); + APInt Bytes(64, NeededDerefBytes); - unsigned ArgNo = Arg->getArgNo(); + // Check if the argument itself is marked dereferenceable and aligned. + if (isDereferenceableAndAlignedPointer(Arg, NeededAlign, Bytes, DL)) + return true; // Look at all call sites of the function. At this point we know we only have // direct callees. - for (User *U : Callee->users()) { + return all_of(Callee->users(), [&](User *U) { CallBase &CB = cast(*U); - - if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL)) - return false; - } - return true; + return isDereferenceableAndAlignedPointer(CB.getArgOperand(Arg->getArgNo()), + NeededAlign, Bytes, DL); + }); } -/// Returns true if Prefix is a prefix of longer. 
That means, Longer has a size -/// that is greater than or equal to the size of prefix, and each of the -/// elements in Prefix is the same as the corresponding elements in Longer. -/// -/// This means it also returns true when Prefix and Longer are equal! -static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { - if (Prefix.size() > Longer.size()) - return false; - return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); -} - -/// Checks if Indices, or a prefix of Indices, is in Set. -static bool prefixIn(const IndicesVector &Indices, - std::set &Set) { - std::set::iterator Low; - Low = Set.upper_bound(Indices); - if (Low != Set.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This means - // it points to a prefix of Indices (possibly Indices itself), if such - // prefix exists. - // - // This load is safe if any prefix of its operands is safe to load. - return Low != Set.end() && isPrefix(*Low, Indices); -} - -/// Mark the given indices (ToMark) as safe in the given set of indices -/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there -/// is already a prefix of Indices in Safe, Indices are implicitely marked safe -/// already. Furthermore, any indices that Indices is itself a prefix of, are -/// removed from Safe (since they are implicitely safe because of Indices now). -static void markIndicesSafe(const IndicesVector &ToMark, - std::set &Safe) { - std::set::iterator Low; - Low = Safe.upper_bound(ToMark); - // Guard against the case where Safe is empty - if (Low != Safe.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This - // means it points to a prefix of Indices (possibly Indices itself), if - // such prefix exists. - if (Low != Safe.end()) { - if (isPrefix(*Low, ToMark)) - // If there is already a prefix of these indices (or exactly these - // indices) marked a safe, don't bother adding these indices - return; - - // Increment Low, so we can use it as a "insert before" hint - ++Low; - } - // Insert - Low = Safe.insert(Low, ToMark); - ++Low; - // If there we're a prefix of longer index list(s), remove those - std::set::iterator End = Safe.end(); - while (Low != End && isPrefix(ToMark, *Low)) { - std::set::iterator Remove = Low; - ++Low; - Safe.erase(Remove); - } -} - -/// isSafeToPromoteArgument - As you might guess from the name of this method, -/// it checks to see if it is both safe and useful to promote the argument. -/// This method limits promotion of aggregates to only promote up to three -/// elements of the aggregate in order to avoid exploding the number of -/// arguments passed in. -static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR, - unsigned MaxElements) { - using GEPIndicesSet = std::set; - +/// Determine that this argument is safe to promote, and find the argument +/// parts it can be promoted into. +static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, + unsigned MaxElements, bool IsRecursive, + SmallVectorImpl &ArgPartsVec) { // Quick exit for unused arguments if (Arg->use_empty()) return true; - // We can only promote this argument if all of the uses are loads, or are GEP - // instructions (with constant indices) that are subsequently loaded. + // We can only promote this argument if all the uses are loads at known + // offsets. // // Promoting the argument causes it to be loaded in the caller // unconditionally. 
This is only safe if we can prove that either the load @@ -578,157 +463,193 @@ static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR // anyway, in the latter case, invalid loads won't happen. This prevents us // from introducing an invalid load that wouldn't have happened in the // original code. - // - // This set will contain all sets of indices that are loaded in the entry - // block, and thus are safe to unconditionally load in the caller. - GEPIndicesSet SafeToUnconditionallyLoad; - - // This set contains all the sets of indices that we are planning to promote. - // This makes it possible to limit the number of arguments added. - GEPIndicesSet ToPromote; - - // If the pointer is always valid, any load with first index 0 is valid. - - if (ByValTy) - SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); - - // Whenever a new underlying type for the operand is found, make sure it's - // consistent with the GEPs and loads we've already seen and, if necessary, - // use it to see if all incoming pointers are valid (which implies the 0-index - // is safe). - Type *BaseTy = ByValTy; - auto UpdateBaseTy = [&](Type *NewBaseTy) { - if (BaseTy) - return BaseTy == NewBaseTy; - - BaseTy = NewBaseTy; - if (allCallersPassValidPointerForArgument(Arg, BaseTy)) { - assert(SafeToUnconditionallyLoad.empty()); - SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); + + SmallDenseMap ArgParts; + Align NeededAlign(1); + uint64_t NeededDerefBytes = 0; + + // And if this is a byval argument we also allow to have store instructions. + // Only handle in such way arguments with specified alignment; + // if it's unspecified, the actual alignment of the argument is + // target-specific. + bool AreStoresAllowed = Arg->getParamByValType() && Arg->getParamAlign(); + + // An end user of a pointer argument is a load or store instruction. + // Returns None if this load or store is not based on the argument. Return + // true if we can promote the instruction, false otherwise. + auto HandleEndUser = [&](auto *I, Type *Ty, + bool GuaranteedToExecute) -> Optional { + // Don't promote volatile or atomic instructions. + if (!I->isSimple()) + return false; + + Value *Ptr = I->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + if (Ptr != Arg) + return None; + + if (Offset.getSignificantBits() >= 64) + return false; + + TypeSize Size = DL.getTypeStoreSize(Ty); + // Don't try to promote scalable types. + if (Size.isScalable()) + return false; + + // If this is a recursive function and one of the types is a pointer, + // then promoting it might lead to recursive promotion. + if (IsRecursive && Ty->isPointerTy()) + return false; + + int64_t Off = Offset.getSExtValue(); + auto Pair = ArgParts.try_emplace( + Off, ArgPart{Ty, I->getAlign(), GuaranteedToExecute ? I : nullptr}); + ArgPart &Part = Pair.first->second; + bool OffsetNotSeenBefore = Pair.second; + + // We limit promotion to only promoting up to a fixed number of elements of + // the aggregate. + if (MaxElements > 0 && ArgParts.size() > MaxElements) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "more than " << MaxElements << " parts\n"); + return false; } - return true; - }; + // For now, we only support loading/storing one specific type at a given + // offset. 
+ if (Part.Ty != Ty) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "accessed as both " << *Part.Ty << " and " << *Ty + << " at offset " << Off << "\n"); + return false; + } - // First, iterate functions that are guaranteed to execute on function - // entry and mark loads of (geps of) arguments as safe. - BasicBlock &EntryBlock = Arg->getParent()->front(); - // Declare this here so we can reuse it - IndicesVector Indices; - for (Instruction &I : EntryBlock) { - if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - Value *V = LI->getPointerOperand(); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) { - V = GEP->getPointerOperand(); - if (V == Arg) { - // This load actually loads (part of) Arg? Check the indices then. - Indices.reserve(GEP->getNumIndices()); - for (Use &Idx : GEP->indices()) - if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) - Indices.push_back(CI->getSExtValue()); - else - // We found a non-constant GEP index for this argument? Bail out - // right away, can't promote this argument at all. - return false; - - if (!UpdateBaseTy(GEP->getSourceElementType())) - return false; - - // Indices checked out, mark them as safe - markIndicesSafe(Indices, SafeToUnconditionallyLoad); - Indices.clear(); - } - } else if (V == Arg) { - // Direct loads are equivalent to a GEP with a single 0 index. - markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); + // If this instruction is not guaranteed to execute, and we haven't seen a + // load or store at this offset before (or it had lower alignment), then we + // need to remember that requirement. + // Note that skipping instructions of previously seen offsets is only + // correct because we only allow a single type for a given offset, which + // also means that the number of accessed bytes will be the same. + if (!GuaranteedToExecute && + (OffsetNotSeenBefore || Part.Alignment < I->getAlign())) { + // We won't be able to prove dereferenceability for negative offsets. + if (Off < 0) + return false; - if (BaseTy && LI->getType() != BaseTy) - return false; + // If the offset is not aligned, an aligned base pointer won't help. + if (!isAligned(I->getAlign(), Off)) + return false; - BaseTy = LI->getType(); - } + NeededDerefBytes = std::max(NeededDerefBytes, Off + Size.getFixedValue()); + NeededAlign = std::max(NeededAlign, I->getAlign()); } + Part.Alignment = std::max(Part.Alignment, I->getAlign()); + return true; + }; + + // Look for loads and stores that are guaranteed to execute on entry. + for (Instruction &I : Arg->getParent()->getEntryBlock()) { + Optional<bool> Res{}; + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) + Res = HandleEndUser(LI, LI->getType(), /* GuaranteedToExecute */ true); + else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + Res = HandleEndUser(SI, SI->getValueOperand()->getType(), + /* GuaranteedToExecute */ true); + if (Res && !*Res) + return false; + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) break; } - // Now, iterate all uses of the argument to see if there are any uses that are - // not (GEP+)loads, or any (GEP+)loads that are not safe to promote. + // Now look at all loads of the argument. Remember the load instructions + // for the aliasing check below. + SmallVector<const Use *, 16> Worklist; + SmallPtrSet<const Use *, 16> Visited; SmallVector<LoadInst *, 16> Loads; - IndicesVector Operands; - for (Use &U : Arg->uses()) { - User *UR = U.getUser(); - Operands.clear(); - if (LoadInst *LI = dyn_cast<LoadInst>(UR)) { - // Don't hack volatile/atomic loads - if (!LI->isSimple()) - return false; - Loads.push_back(LI); - // Direct loads are equivalent to a GEP with a zero index and then a load. 
- Operands.push_back(0); + auto AppendUses = [&](const Value *V) { + for (const Use &U : V->uses()) + if (Visited.insert(&U).second) + Worklist.push_back(&U); + }; + AppendUses(Arg); + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + Value *V = U->getUser(); + if (isa<BitCastInst>(V)) { + AppendUses(V); + continue; + } - if (!UpdateBaseTy(LI->getType())) + if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) { + if (!GEP->hasAllConstantIndices()) return false; - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) { - if (GEP->use_empty()) { - // Dead GEPs cause trouble later. Just remove them if we run into - // them. - continue; - } + AppendUses(V); + continue; + } - if (!UpdateBaseTy(GEP->getSourceElementType())) + if (auto *LI = dyn_cast<LoadInst>(V)) { + if (!*HandleEndUser(LI, LI->getType(), /* GuaranteedToExecute */ false)) return false; + Loads.push_back(LI); + continue; + } - // Ensure that all of the indices are constants. - for (Use &Idx : GEP->indices()) - if (ConstantInt *C = dyn_cast<ConstantInt>(Idx)) - Operands.push_back(C->getSExtValue()); - else - return false; // Not a constant operand GEP! - - // Ensure that the only users of the GEP are load instructions. - for (User *GEPU : GEP->users()) - if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) { - // Don't hack volatile/atomic loads - if (!LI->isSimple()) - return false; - Loads.push_back(LI); - } else { - // Other uses than load? - return false; - } - } else { - return false; // Not a load or a GEP. + // Stores are allowed for byval arguments. + auto *SI = dyn_cast<StoreInst>(V); + if (AreStoresAllowed && SI && + U->getOperandNo() == StoreInst::getPointerOperandIndex()) { + if (!*HandleEndUser(SI, SI->getValueOperand()->getType(), + /* GuaranteedToExecute */ false)) + return false; + continue; + // Only stores TO the argument are allowed; all other stores are + // unknown users. } - // Now, see if it is safe to promote this load / loads of this GEP. Loading - // is safe if Operands, or a prefix of Operands, is marked as safe. - if (!prefixIn(Operands, SafeToUnconditionallyLoad)) - return false; + // Unknown user. + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "unknown user " << *V << "\n"); + return false; + } - // See if we are already promoting a load with these indices. If not, check - // to make sure that we aren't promoting too many elements. If so, nothing - // to do. - if (ToPromote.find(Operands) == ToPromote.end()) { - if (MaxElements > 0 && ToPromote.size() == MaxElements) { - LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '" - << Arg->getName() - << "' because it would require adding more " - << "than " << MaxElements - << " arguments to the function.\n"); - // We limit aggregate promotion to only promoting up to a fixed number - // of elements of the aggregate. - return false; - } - ToPromote.insert(std::move(Operands)); + if (NeededDerefBytes || NeededAlign > 1) { + // Try to prove the needed dereferenceability / alignment requirement. + if (!allCallersPassValidPointerForArgument(Arg, NeededAlign, + NeededDerefBytes)) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "not dereferenceable or aligned\n"); + return false; } } - if (Loads.empty()) + if (ArgParts.empty()) return true; // No users, this is a dead argument. - // Okay, now we know that the argument is only used by load instructions and + // Sort parts by offset. + append_range(ArgPartsVec, ArgParts); + sort(ArgPartsVec, + [](const auto &A, const auto &B) { return A.first < B.first; }); + + // Make sure the parts are non-overlapping. 
+ int64_t Offset = ArgPartsVec[0].first; + for (const auto &Pair : ArgPartsVec) { + if (Pair.first < Offset) + return false; // Overlap with previous part. + + Offset = Pair.first + DL.getTypeStoreSize(Pair.second.Ty); + } + + // If store instructions are allowed, the path from the entry of the function + // to each load may not be free of instructions that potentially invalidate + // the load; this is an admissible situation. + if (AreStoresAllowed) + return true; + + // Okay, now we know that the argument is only used by load instructions, and // it is safe to unconditionally perform all of them. Use alias analysis to // check to see if the pointer is guaranteed to not be modified from entry of // the function to each of the load instructions. @@ -762,118 +683,31 @@ static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR return true; } -bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) { - // There is no size information, so be conservative. - if (!type->isSized()) - return false; - - // If the alloc size is not equal to the storage size, then there are padding - // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. - if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) - return false; - - // FIXME: This isn't the right way to check for padding in vectors with - // non-byte-size elements. - if (VectorType *seqTy = dyn_cast<VectorType>(type)) - return isDenselyPacked(seqTy->getElementType(), DL); - - // For array types, check for padding within members. - if (ArrayType *seqTy = dyn_cast<ArrayType>(type)) - return isDenselyPacked(seqTy->getElementType(), DL); - - if (!isa<StructType>(type)) - return true; - - // Check for padding within and between elements of a struct. - StructType *StructTy = cast<StructType>(type); - const StructLayout *Layout = DL.getStructLayout(StructTy); - uint64_t StartPos = 0; - for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { - Type *ElTy = StructTy->getElementType(i); - if (!isDenselyPacked(ElTy, DL)) - return false; - if (StartPos != Layout->getElementOffsetInBits(i)) - return false; - StartPos += DL.getTypeAllocSizeInBits(ElTy); - } - - return true; -} - -/// Checks if the padding bytes of an argument could be accessed. -static bool canPaddingBeAccessed(Argument *arg) { - assert(arg->hasByValAttr()); - - // Track all the pointers to the argument to make sure they are not captured. - SmallPtrSet<Value *, 16> PtrValues; - PtrValues.insert(arg); - - // Track all of the stores. - SmallVector<StoreInst *, 16> Stores; - - // Scan through the uses recursively to make sure the pointer is always used - // sanely. - SmallVector<Value *, 16> WorkList(arg->users()); - while (!WorkList.empty()) { - Value *V = WorkList.pop_back_val(); - if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V)) { - if (PtrValues.insert(V).second) - llvm::append_range(WorkList, V->users()); - } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) { - Stores.push_back(Store); - } else if (!isa<LoadInst>(V)) { - return true; - } - } - - // Check to make sure the pointers aren't captured - for (StoreInst *Store : Stores) - if (PtrValues.count(Store->getValueOperand())) - return true; - - return false; -} - -/// Check if callers and the callee \p F agree on how promoted arguments would be -/// passed. The ones that they do not agree on are eliminated from the sets but -/// the return value has to be observed as well. -static bool areFunctionArgsABICompatible( - const Function &F, const TargetTransformInfo &TTI, - SmallPtrSetImpl<Argument *> &ArgsToPromote, - SmallPtrSetImpl<Argument *> &ByValArgsToTransform) { - // TODO: Check individual arguments so we can promote a subset? 
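An editorial aside, not part of the patch: findArgParts above records every constant-offset load or store as one "argument part". A minimal stand-alone sketch of that bookkeeping, with std::map and an integer type id standing in for LLVM's SmallDenseMap and Type *, and recordAccess as a hypothetical name:

// Sketch only; mirrors the rules of HandleEndUser above: a bounded number
// of distinct parts, a single type per offset, and merged (max) alignment.
#include <algorithm>
#include <cstdint>
#include <map>

struct Part {
  int TypeId;         // stand-in for llvm::Type *
  unsigned Alignment; // stand-in for llvm::Align
};

bool recordAccess(std::map<int64_t, Part> &Parts, int64_t Off, int TypeId,
                  unsigned Alignment, unsigned MaxElements) {
  auto [It, Inserted] = Parts.try_emplace(Off, Part{TypeId, Alignment});
  (void)Inserted; // the offset-seen-before logic is omitted in this sketch
  if (MaxElements > 0 && Parts.size() > MaxElements)
    return false; // too many distinct parts: give up on promotion
  if (It->second.TypeId != TypeId)
    return false; // two different types at one offset: unsupported
  It->second.Alignment = std::max(It->second.Alignment, Alignment);
  return true;
}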
- SmallVector<Type *, 32> Types; - for (Argument *Arg : ArgsToPromote) - Types.push_back(Arg->getType()->getPointerElementType()); - for (Argument *Arg : ByValArgsToTransform) - Types.push_back(Arg->getParamByValType()); - - for (const Use &U : F.uses()) { +/// Check if callers and callee agree on how promoted arguments would be +/// passed. +static bool areTypesABICompatible(ArrayRef<Type *> Types, const Function &F, + const TargetTransformInfo &TTI) { + return all_of(F.uses(), [&](const Use &U) { CallBase *CB = dyn_cast<CallBase>(U.getUser()); if (!CB) return false; + const Function *Caller = CB->getCaller(); const Function *Callee = CB->getCalledFunction(); - if (!TTI.areTypesABICompatible(Caller, Callee, Types)) - return false; - } - return true; + return TTI.areTypesABICompatible(Caller, Callee, Types); + }); } /// PromoteArguments - This method checks the specified function to see if there /// are any promotable arguments and if it is safe to promote the function (for /// example, all callers are direct). If safe to promote some arguments, it /// calls the DoPromotion method. -static Function * -promoteArguments(Function *F, function_ref<AAResults &(Function &)> AARGetter, - unsigned MaxElements, - Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>> - ReplaceCallSite, - const TargetTransformInfo &TTI) { +static Function *promoteArguments(Function *F, FunctionAnalysisManager &FAM, + unsigned MaxElements, bool IsRecursive) { // Don't perform argument promotion for naked functions; otherwise we can end // up removing parameters that are seemingly 'not used' as they are referred // to in the assembly. - if(F->hasFnAttribute(Attribute::Naked)) + if (F->hasFnAttribute(Attribute::Naked)) return nullptr; // Make sure that it is local to this module. @@ -903,20 +737,20 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &)> AARGetter, // Second check: make sure that all callers are direct callers. We can't // transform functions that have indirect callers. Also see if the function - // is self-recursive and check that target features are compatible. - bool isSelfRecursive = false; + // is self-recursive. for (Use &U : F->uses()) { CallBase *CB = dyn_cast<CallBase>(U.getUser()); // Must be a direct call. - if (CB == nullptr || !CB->isCallee(&U)) + if (CB == nullptr || !CB->isCallee(&U) || + CB->getFunctionType() != F->getFunctionType()) return nullptr; // Can't change signature of musttail callee if (CB->isMustTailCall()) return nullptr; - if (CB->getParent()->getParent() == F) - isSelfRecursive = true; + if (CB->getFunction() == F) + IsRecursive = true; } // Can't change signature of musttail caller @@ -926,16 +760,13 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &)> AARGetter, return nullptr; const DataLayout &DL = F->getParent()->getDataLayout(); - - AAResults &AAR = AARGetter(*F); + auto &AAR = FAM.getResult<AAManager>(*F); + const auto &TTI = FAM.getResult<TargetIRAnalysis>(*F); // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. - SmallPtrSet<Argument *, 8> ArgsToPromote; - SmallPtrSet<Argument *, 8> ByValArgsToTransform; + DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> ArgsToPromote; for (Argument *PtrArg : PointerArgs) { - Type *AgTy = PtrArg->getType()->getPointerElementType(); - // Replace sret attribute with noalias. This reduces register pressure by // avoiding a register copy. if (PtrArg->hasStructRetAttr()) { @@ -949,72 +780,25 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &)> AARGetter, } } - // If this is a byval argument, and if the aggregate type is small, just - // pass the elements, which is always safe, if the passed value is densely - // packed or if we can prove the padding bytes are never accessed. 
- // - // Only handle arguments with specified alignment; if it's unspecified, the - // actual alignment of the argument is target-specific. - bool isSafeToPromote = PtrArg->hasByValAttr() && PtrArg->getParamAlign() && - (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) || - !canPaddingBeAccessed(PtrArg)); - if (isSafeToPromote) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - if (MaxElements > 0 && STy->getNumElements() > MaxElements) { - LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '" - << PtrArg->getName() - << "' because it would require adding more" - << " than " << MaxElements - << " arguments to the function.\n"); - continue; - } - - // If all the elements are single-value types, we can promote it. - bool AllSimple = true; - for (const auto *EltTy : STy->elements()) { - if (!EltTy->isSingleValueType()) { - AllSimple = false; - break; - } - } + // See if we can promote the pointer to its value. + SmallVector<OffsetAndArgPart, 4> ArgParts; - // Safe to transform, don't even bother trying to "promote" it. - // Passing the elements as a scalar will allow sroa to hack on - // the new alloca we introduce. - if (AllSimple) { - ByValArgsToTransform.insert(PtrArg); - continue; - } - } - } + if (findArgParts(PtrArg, DL, AAR, MaxElements, IsRecursive, ArgParts)) { + SmallVector<Type *, 32> Types; + for (const auto &Pair : ArgParts) + Types.push_back(Pair.second.Ty); - // If the argument is a recursive type and we're in a recursive - // function, we could end up infinitely peeling the function argument. - if (isSelfRecursive) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - bool RecursiveType = - llvm::is_contained(STy->elements(), PtrArg->getType()); - if (RecursiveType) - continue; + if (areTypesABICompatible(Types, *F, TTI)) { + ArgsToPromote.insert({PtrArg, std::move(ArgParts)}); } } - - // Otherwise, see if we can promote the pointer to its value. - Type *ByValTy = - PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr; - if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements)) - ArgsToPromote.insert(PtrArg); } // No promotable pointer arguments. - if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) + if (ArgsToPromote.empty()) return nullptr; - if (!areFunctionArgsABICompatible( - *F, TTI, ArgsToPromote, ByValArgsToTransform)) - return nullptr; - - return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite); + return doPromotion(F, FAM, ArgsToPromote); } PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, @@ -1030,19 +814,10 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, FunctionAnalysisManager &FAM = AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + bool IsRecursive = C.size() > 1; for (LazyCallGraph::Node &N : C) { Function &OldF = N.getFunction(); - - // FIXME: This lambda must only be used with this function. We should - // skip the lambda and just get the AA results directly. - auto AARGetter = [&](Function &F) -> AAResults & { - assert(&F == &OldF && "Called with an unexpected function!"); - return FAM.getResult<AAManager>(F); - }; - - const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF); - Function *NewF = - promoteArguments(&OldF, AARGetter, MaxElements, None, TTI); + Function *NewF = promoteArguments(&OldF, FAM, MaxElements, IsRecursive); if (!NewF) continue; LocalChange = true; @@ -1077,111 +852,3 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, PA.preserveSet<AllAnalysesOn<Function>>(); return PA; } - -namespace { - -/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. 
-struct ArgPromotion : public CallGraphSCCPass { - // Pass identification, replacement for typeid - static char ID; - - explicit ArgPromotion(unsigned MaxElements = 3) - : CallGraphSCCPass(ID), MaxElements(MaxElements) { - initializeArgPromotionPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - getAAResultsAnalysisUsage(AU); - CallGraphSCCPass::getAnalysisUsage(AU); - } - - bool runOnSCC(CallGraphSCC &SCC) override; - -private: - using llvm::Pass::doInitialization; - - bool doInitialization(CallGraph &CG) override; - - /// The maximum number of elements to expand, or 0 for unlimited. - unsigned MaxElements; -}; - -} // end anonymous namespace - -char ArgPromotion::ID = 0; - -INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, false) - -Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) { - return new ArgPromotion(MaxElements); -} - -bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { - if (skipSCC(SCC)) - return false; - - // Get the callgraph information that we need to update to reflect our - // changes. - CallGraph &CG = getAnalysis().getCallGraph(); - - LegacyAARGetter AARGetter(*this); - - bool Changed = false, LocalChange; - - // Iterate until we stop promoting from this SCC. - do { - LocalChange = false; - // Attempt to promote arguments from all functions in this SCC. - for (CallGraphNode *OldNode : SCC) { - Function *OldF = OldNode->getFunction(); - if (!OldF) - continue; - - auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) { - Function *Caller = OldCS.getParent()->getParent(); - CallGraphNode *NewCalleeNode = - CG.getOrInsertFunction(NewCS.getCalledFunction()); - CallGraphNode *CallerNode = CG[Caller]; - CallerNode->replaceCallEdge(cast(OldCS), - cast(NewCS), NewCalleeNode); - }; - - const TargetTransformInfo &TTI = - getAnalysis().getTTI(*OldF); - if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements, - {ReplaceCallSite}, TTI)) { - LocalChange = true; - - // Update the call graph for the newly promoted function. - CallGraphNode *NewNode = CG.getOrInsertFunction(NewF); - NewNode->stealCalledFunctionsFrom(OldNode); - if (OldNode->getNumReferences() == 0) - delete CG.removeFunctionFromModule(OldNode); - else - OldF->setLinkage(Function::ExternalLinkage); - - // And updat ethe SCC we're iterating as well. - SCC.ReplaceNode(OldNode, NewNode); - } - } - // Remember that we changed something. 
- Changed |= LocalChange; - } while (LocalChange); - - return Changed; -} - -bool ArgPromotion::doInitialization(CallGraph &CG) { - return CallGraphSCCPass::doInitialization(CG); -} diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index d66140a726f6..b05b7990e3f0 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -15,29 +15,25 @@ #include "llvm/Transforms/IPO/Attributor.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/NoFolder.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -50,6 +46,10 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#ifdef EXPENSIVE_CHECKS +#include "llvm/IR/Verifier.h" +#endif + #include #include @@ -123,13 +123,13 @@ static cl::list SeedAllowList("attributor-seed-allow-list", cl::Hidden, cl::desc("Comma seperated list of attribute names that are " "allowed to be seeded."), - cl::ZeroOrMore, cl::CommaSeparated); + cl::CommaSeparated); static cl::list FunctionSeedAllowList( "attributor-function-seed-allow-list", cl::Hidden, cl::desc("Comma seperated list of function names that are " "allowed to be seeded."), - cl::ZeroOrMore, cl::CommaSeparated); + cl::CommaSeparated); #endif static cl::opt @@ -209,33 +209,25 @@ bool AA::isNoSyncInst(Attributor &A, const Instruction &I, } bool AA::isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA, - const Value &V) { - if (auto *C = dyn_cast(&V)) - return !C->isThreadDependent(); - // TODO: Inspect and cache more complex instructions. - if (auto *CB = dyn_cast(&V)) - return CB->getNumOperands() == 0 && !CB->mayHaveSideEffects() && - !CB->mayReadFromMemory(); - const Function *Scope = nullptr; - if (auto *I = dyn_cast(&V)) - Scope = I->getFunction(); - if (auto *A = dyn_cast(&V)) - Scope = A->getParent(); - if (!Scope) + const Value &V, bool ForAnalysisOnly) { + // TODO: See the AAInstanceInfo class comment. 
+ if (!ForAnalysisOnly) return false; - auto &NoRecurseAA = A.getAAFor( - QueryingAA, IRPosition::function(*Scope), DepClassTy::OPTIONAL); - return NoRecurseAA.isAssumedNoRecurse(); + auto &InstanceInfoAA = A.getAAFor( + QueryingAA, IRPosition::value(V), DepClassTy::OPTIONAL); + return InstanceInfoAA.isAssumedUniqueForAnalysis(); } Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty, const TargetLibraryInfo *TLI) { if (isa(Obj)) return UndefValue::get(&Ty); - if (isAllocationFn(&Obj, TLI)) - return getInitialValueOfAllocation(&cast(Obj), TLI, &Ty); + if (Constant *Init = getInitialValueOfAllocation(&Obj, TLI, &Ty)) + return Init; auto *GV = dyn_cast(&Obj); - if (!GV || !GV->hasLocalLinkage()) + if (!GV) + return nullptr; + if (!GV->hasLocalLinkage() && !(GV->isConstant() && GV->hasInitializer())) return nullptr; if (!GV->hasInitializer()) return UndefValue::get(&Ty); @@ -252,19 +244,29 @@ bool AA::isValidInScope(const Value &V, const Function *Scope) { return false; } -bool AA::isValidAtPosition(const Value &V, const Instruction &CtxI, +bool AA::isValidAtPosition(const AA::ValueAndContext &VAC, InformationCache &InfoCache) { - if (isa(V)) + if (isa(VAC.getValue()) || VAC.getValue() == VAC.getCtxI()) return true; - const Function *Scope = CtxI.getFunction(); - if (auto *A = dyn_cast(&V)) + const Function *Scope = nullptr; + const Instruction *CtxI = VAC.getCtxI(); + if (CtxI) + Scope = CtxI->getFunction(); + if (auto *A = dyn_cast(VAC.getValue())) return A->getParent() == Scope; - if (auto *I = dyn_cast(&V)) + if (auto *I = dyn_cast(VAC.getValue())) { if (I->getFunction() == Scope) { - const DominatorTree *DT = - InfoCache.getAnalysisResultForFunction(*Scope); - return DT && DT->dominates(I, &CtxI); + if (const DominatorTree *DT = + InfoCache.getAnalysisResultForFunction( + *Scope)) + return DT->dominates(I, CtxI); + // Local dominance check mostly for the old PM passes. + if (CtxI && I->getParent() == CtxI->getParent()) + return llvm::any_of( + make_range(I->getIterator(), I->getParent()->end()), + [&](const Instruction &AfterI) { return &AfterI == CtxI; }); } + } return false; } @@ -295,11 +297,11 @@ AA::combineOptionalValuesInAAValueLatice(const Optional &A, const Optional &B, Type *Ty) { if (A == B) return A; - if (!B.hasValue()) + if (!B) return A; if (*B == nullptr) return nullptr; - if (!A.hasValue()) + if (!A) return Ty ? 
getWithType(**B, *Ty) : nullptr; if (*A == nullptr) return nullptr; @@ -314,21 +316,33 @@ AA::combineOptionalValuesInAAValueLatice(const Optional &A, return nullptr; } -bool AA::getPotentialCopiesOfStoredValue( - Attributor &A, StoreInst &SI, SmallSetVector &PotentialCopies, - const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation) { +template +static bool getPotentialCopiesOfMemoryValue( + Attributor &A, Ty &I, SmallSetVector &PotentialCopies, + SmallSetVector &PotentialValueOrigins, + const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation, + bool OnlyExact) { + LLVM_DEBUG(dbgs() << "Trying to determine the potential copies of " << I + << " (only exact: " << OnlyExact << ")\n";); - Value &Ptr = *SI.getPointerOperand(); + Value &Ptr = *I.getPointerOperand(); SmallVector Objects; - if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, QueryingAA, &SI)) { + if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, QueryingAA, &I, + UsedAssumedInformation)) { LLVM_DEBUG( dbgs() << "Underlying objects stored into could not be determined\n";); return false; } + // Containers to remember the pointer infos and new copies while we are not + // sure that we can find all of them. If we abort we want to avoid spurious + // dependences and potential copies in the provided container. SmallVector PIs; SmallVector NewCopies; + SmallVector NewCopyOrigins; + const auto *TLI = + A.getInfoCache().getTargetLibraryInfoForFunction(*I.getFunction()); for (Value *Obj : Objects) { LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); if (isa(Obj)) @@ -336,7 +350,7 @@ bool AA::getPotentialCopiesOfStoredValue( if (isa(Obj)) { // A null pointer access can be undefined but any offset from null may // be OK. We do not try to optimize the latter. - if (!NullPointerIsDefined(SI.getFunction(), + if (!NullPointerIsDefined(I.getFunction(), Ptr.getType()->getPointerAddressSpace()) && A.getAssumedSimplified(Ptr, QueryingAA, UsedAssumedInformation) == Obj) @@ -345,37 +359,74 @@ bool AA::getPotentialCopiesOfStoredValue( dbgs() << "Underlying object is a valid nullptr, giving up.\n";); return false; } + // TODO: Use assumed noalias return. if (!isa(Obj) && !isa(Obj) && - !isNoAliasCall(Obj)) { + !(IsLoad ? 
isAllocationFn(Obj, TLI) : isNoAliasCall(Obj))) { LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj << "\n";); return false; } if (auto *GV = dyn_cast<GlobalVariable>(Obj)) - if (!GV->hasLocalLinkage()) { + if (!GV->hasLocalLinkage() && + !(GV->isConstant() && GV->hasInitializer())) { LLVM_DEBUG(dbgs() << "Underlying object is global with external " "linkage, not supported yet: " << *Obj << "\n";); return false; } + if (IsLoad) { + Value *InitialValue = AA::getInitialValueForObj(*Obj, *I.getType(), TLI); + if (!InitialValue) + return false; + NewCopies.push_back(InitialValue); + NewCopyOrigins.push_back(nullptr); + } + auto CheckAccess = [&](const AAPointerInfo::Access &Acc, bool IsExact) { - if (!Acc.isRead()) + if ((IsLoad && !Acc.isWrite()) || (!IsLoad && !Acc.isRead())) + return true; + if (IsLoad && Acc.isWrittenValueYetUndetermined()) return true; - auto *LI = dyn_cast<LoadInst>(Acc.getRemoteInst()); - if (!LI) { - LLVM_DEBUG(dbgs() << "Underlying object read through a non-load " - "instruction not supported yet: " - << *Acc.getRemoteInst() << "\n";); + if (OnlyExact && !IsExact && + !isa_and_nonnull<UndefValue>(Acc.getWrittenValue())) { + LLVM_DEBUG(dbgs() << "Non exact access " << *Acc.getRemoteInst() + << ", abort!\n"); return false; } - NewCopies.push_back(LI); + if (IsLoad) { + assert(isa<LoadInst>(I) && "Expected load or store instruction only!"); + if (!Acc.isWrittenValueUnknown()) { + NewCopies.push_back(Acc.getWrittenValue()); + NewCopyOrigins.push_back(Acc.getRemoteInst()); + return true; + } + auto *SI = dyn_cast<StoreInst>(Acc.getRemoteInst()); + if (!SI) { + LLVM_DEBUG(dbgs() << "Underlying object written through a non-store " + "instruction not supported yet: " + << *Acc.getRemoteInst() << "\n";); + return false; + } + NewCopies.push_back(SI->getValueOperand()); + NewCopyOrigins.push_back(SI); + } else { + assert(isa<StoreInst>(I) && "Expected load or store instruction only!"); + auto *LI = dyn_cast<LoadInst>(Acc.getRemoteInst()); + if (!LI && OnlyExact) { + LLVM_DEBUG(dbgs() << "Underlying object read through a non-load " + "instruction not supported yet: " + << *Acc.getRemoteInst() << "\n";); + return false; + } + NewCopies.push_back(Acc.getRemoteInst()); + } return true; }; auto &PI = A.getAAFor<AAPointerInfo>(QueryingAA, IRPosition::value(*Obj), DepClassTy::NONE); - if (!PI.forallInterferingAccesses(SI, CheckAccess)) { + if (!PI.forallInterferingAccesses(A, QueryingAA, I, CheckAccess)) { LLVM_DEBUG( dbgs() << "Failed to verify all interfering accesses for underlying object: " @@ -385,16 +436,40 @@ bool AA::getPotentialCopiesOfStoredValue( PIs.push_back(&PI); } + // Only if we were successful in collecting all potential copies do we + // record dependences (on non-fixed AAPointerInfo AAs). We also only then + // modify the given PotentialCopies container. 
for (auto *PI : PIs) { if (!PI->getState().isAtFixpoint()) UsedAssumedInformation = true; A.recordDependence(*PI, QueryingAA, DepClassTy::OPTIONAL); } PotentialCopies.insert(NewCopies.begin(), NewCopies.end()); + PotentialValueOrigins.insert(NewCopyOrigins.begin(), NewCopyOrigins.end()); return true; } +bool AA::getPotentiallyLoadedValues( + Attributor &A, LoadInst &LI, SmallSetVector &PotentialValues, + SmallSetVector &PotentialValueOrigins, + const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation, + bool OnlyExact) { + return getPotentialCopiesOfMemoryValue( + A, LI, PotentialValues, PotentialValueOrigins, QueryingAA, + UsedAssumedInformation, OnlyExact); +} + +bool AA::getPotentialCopiesOfStoredValue( + Attributor &A, StoreInst &SI, SmallSetVector &PotentialCopies, + const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation, + bool OnlyExact) { + SmallSetVector PotentialValueOrigins; + return getPotentialCopiesOfMemoryValue( + A, SI, PotentialCopies, PotentialValueOrigins, QueryingAA, + UsedAssumedInformation, OnlyExact); +} + static bool isAssumedReadOnlyOrReadNone(Attributor &A, const IRPosition &IRP, const AbstractAttribute &QueryingAA, bool RequireReadNone, bool &IsKnown) { @@ -449,6 +524,8 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI, SmallVector Worklist; Worklist.push_back(&FromI); + const auto &NoRecurseAA = A.getAAFor( + QueryingAA, IRPosition::function(ToFn), DepClassTy::OPTIONAL); while (!Worklist.empty()) { const Instruction *CurFromI = Worklist.pop_back_val(); if (!Visited.insert(CurFromI).second) @@ -468,7 +545,8 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI, << *ToI << " [Intra]\n"); if (Result) return true; - continue; + if (NoRecurseAA.isAssumedNoRecurse()) + continue; } // TODO: If we can go arbitrarily backwards we will eventually reach an @@ -514,10 +592,10 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI, return true; }; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; Result = !A.checkForAllCallSites(CheckCallSite, *FromFn, /* RequireAllCallSites */ true, - &QueryingAA, AllCallSitesKnown); + &QueryingAA, UsedAssumedInformation); if (Result) { LLVM_DEBUG(dbgs() << "[AA] stepping back to call sites from " << *CurFromI << " in @" << FromFn->getName() @@ -631,7 +709,7 @@ Argument *IRPosition::getAssociatedArgument() const { assert(ACS.getCalledFunction()->arg_size() > u && "ACS mapped into var-args arguments!"); - if (CBCandidateArg.hasValue()) { + if (CBCandidateArg) { CBCandidateArg = nullptr; break; } @@ -640,7 +718,7 @@ Argument *IRPosition::getAssociatedArgument() const { } // If we found a unique callback candidate argument, return it. - if (CBCandidateArg.hasValue() && CBCandidateArg.getValue()) + if (CBCandidateArg && CBCandidateArg.getValue()) return CBCandidateArg.getValue(); // If no callbacks were found, or none used the underlying call site operand @@ -949,22 +1027,24 @@ Attributor::getAssumedConstant(const IRPosition &IRP, bool &UsedAssumedInformation) { // First check all callbacks provided by outside AAs. If any of them returns // a non-null value that is different from the associated value, or None, we - // assume it's simpliied. + // assume it's simplified. 
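An editorial aside, not part of the patch: the simplification queries in this file use Optional<Value *> as a tri-state, where None means "not known yet", a present nullptr means "no single simplified value", and a present non-null pointer is the simplified value. A compact model of the combine rule from combineOptionalValuesInAAValueLatice above, using std::optional and omitting the type-adjustment step (all names here are hypothetical):

#include <optional>

struct Val {}; // stand-in for llvm::Value

std::optional<Val *> combine(std::optional<Val *> A, std::optional<Val *> B) {
  if (A == B)
    return A;       // identical states combine to themselves
  if (!B)
    return A;       // B unknown: keep A
  if (*B == nullptr)
    return nullptr; // B known-invalid: result invalid
  if (!A)
    return B;       // A unknown: take B (type adjustment omitted here)
  if (*A == nullptr)
    return nullptr; // A known-invalid: result invalid
  return *A == *B ? A : std::optional<Val *>(nullptr);
}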
for (auto &CB : SimplificationCallbacks.lookup(IRP)) { Optional SimplifiedV = CB(IRP, &AA, UsedAssumedInformation); - if (!SimplifiedV.hasValue()) + if (!SimplifiedV) return llvm::None; if (isa_and_nonnull(*SimplifiedV)) return cast(*SimplifiedV); return nullptr; } + if (auto *C = dyn_cast(&IRP.getAssociatedValue())) + return C; const auto &ValueSimplifyAA = getAAFor(AA, IRP, DepClassTy::NONE); Optional SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(*this); bool IsKnown = ValueSimplifyAA.isAtFixpoint(); UsedAssumedInformation |= !IsKnown; - if (!SimplifiedV.hasValue()) { + if (!SimplifiedV) { recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL); return llvm::None; } @@ -987,18 +1067,18 @@ Attributor::getAssumedSimplified(const IRPosition &IRP, bool &UsedAssumedInformation) { // First check all callbacks provided by outside AAs. If any of them returns // a non-null value that is different from the associated value, or None, we - // assume it's simpliied. + // assume it's simplified. for (auto &CB : SimplificationCallbacks.lookup(IRP)) return CB(IRP, AA, UsedAssumedInformation); - // If no high-level/outside simplification occured, use AAValueSimplify. + // If no high-level/outside simplification occurred, use AAValueSimplify. const auto &ValueSimplifyAA = getOrCreateAAFor(IRP, AA, DepClassTy::NONE); Optional SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(*this); bool IsKnown = ValueSimplifyAA.isAtFixpoint(); UsedAssumedInformation |= !IsKnown; - if (!SimplifiedV.hasValue()) { + if (!SimplifiedV) { if (AA) recordDependence(ValueSimplifyAA, *AA, DepClassTy::OPTIONAL); return llvm::None; @@ -1017,7 +1097,7 @@ Attributor::getAssumedSimplified(const IRPosition &IRP, Optional Attributor::translateArgumentToCallSiteContent( Optional V, CallBase &CB, const AbstractAttribute &AA, bool &UsedAssumedInformation) { - if (!V.hasValue()) + if (!V) return V; if (*V == nullptr || isa(*V)) return V; @@ -1078,6 +1158,19 @@ bool Attributor::isAssumedDead(const Use &U, BasicBlock *IncomingBB = PHI->getIncomingBlock(U); return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA, UsedAssumedInformation, CheckBBLivenessOnly, DepClass); + } else if (StoreInst *SI = dyn_cast(UserI)) { + if (!CheckBBLivenessOnly && SI->getPointerOperand() != U.get()) { + const IRPosition IRP = IRPosition::inst(*SI); + const AAIsDead &IsDeadAA = + getOrCreateAAFor(IRP, QueryingAA, DepClassTy::NONE); + if (IsDeadAA.isRemovableStore()) { + if (QueryingAA) + recordDependence(IsDeadAA, *QueryingAA, DepClass); + if (!IsDeadAA.isKnown(AAIsDead::IS_REMOVABLE)) + UsedAssumedInformation = true; + return true; + } + } } return isAssumedDead(IRPosition::inst(*UserI), QueryingAA, FnLivenessAA, @@ -1191,6 +1284,7 @@ bool Attributor::checkForAllUses( function_ref Pred, const AbstractAttribute &QueryingAA, const Value &V, bool CheckBBLivenessOnly, DepClassTy LivenessDepClass, + bool IgnoreDroppableUses, function_ref EquivalentUseCB) { // Check the trivial case first as it catches void values. 
@@ -1231,7 +1325,7 @@ bool Attributor::checkForAllUses( LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n"); continue; } - if (U->getUser()->isDroppable()) { + if (IgnoreDroppableUses && U->getUser()->isDroppable()) { LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n"); continue; } @@ -1241,9 +1335,9 @@ bool Attributor::checkForAllUses( if (!Visited.insert(U).second) continue; SmallSetVector PotentialCopies; - if (AA::getPotentialCopiesOfStoredValue(*this, *SI, PotentialCopies, - QueryingAA, - UsedAssumedInformation)) { + if (AA::getPotentialCopiesOfStoredValue( + *this, *SI, PotentialCopies, QueryingAA, UsedAssumedInformation, + /* OnlyExact */ true)) { LLVM_DEBUG(dbgs() << "[Attributor] Value is stored, continue with " << PotentialCopies.size() << " potential copies instead!\n"); @@ -1277,7 +1371,7 @@ bool Attributor::checkForAllUses( bool Attributor::checkForAllCallSites(function_ref Pred, const AbstractAttribute &QueryingAA, bool RequireAllCallSites, - bool &AllCallSitesKnown) { + bool &UsedAssumedInformation) { // We can try to determine information from // the call sites. However, this is only possible all call sites are known, // hence the function has internal linkage. @@ -1286,31 +1380,26 @@ bool Attributor::checkForAllCallSites(function_ref Pred, if (!AssociatedFunction) { LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP << "\n"); - AllCallSitesKnown = false; return false; } return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites, - &QueryingAA, AllCallSitesKnown); + &QueryingAA, UsedAssumedInformation); } bool Attributor::checkForAllCallSites(function_ref Pred, const Function &Fn, bool RequireAllCallSites, const AbstractAttribute *QueryingAA, - bool &AllCallSitesKnown) { + bool &UsedAssumedInformation) { if (RequireAllCallSites && !Fn.hasLocalLinkage()) { LLVM_DEBUG( dbgs() << "[Attributor] Function " << Fn.getName() << " has no internal linkage, hence not all call sites are known\n"); - AllCallSitesKnown = false; return false; } - // If we do not require all call sites we might not see all. - AllCallSitesKnown = RequireAllCallSites; - SmallVector Uses(make_pointer_range(Fn.uses())); for (unsigned u = 0; u < Uses.size(); ++u) { const Use &U = *Uses[u]; @@ -1322,15 +1411,13 @@ bool Attributor::checkForAllCallSites(function_ref Pred, dbgs() << "[Attributor] Check use: " << *U << " in " << *U.getUser() << "\n"; }); - bool UsedAssumedInformation = false; if (isAssumedDead(U, QueryingAA, nullptr, UsedAssumedInformation, /* CheckBBLivenessOnly */ true)) { LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n"); continue; } if (ConstantExpr *CE = dyn_cast(U.getUser())) { - if (CE->isCast() && CE->getType()->isPointerTy() && - CE->getType()->getPointerElementType()->isFunctionTy()) { + if (CE->isCast() && CE->getType()->isPointerTy()) { LLVM_DEBUG( dbgs() << "[Attributor] Use, is constant cast expression, add " << CE->getNumUses() @@ -1477,30 +1564,24 @@ static bool checkForAllInstructionsImpl( } bool Attributor::checkForAllInstructions(function_ref Pred, + const Function *Fn, const AbstractAttribute &QueryingAA, const ArrayRef &Opcodes, bool &UsedAssumedInformation, bool CheckBBLivenessOnly, bool CheckPotentiallyDead) { - - const IRPosition &IRP = QueryingAA.getIRPosition(); // Since we need to provide instructions we have to have an exact definition. 
- const Function *AssociatedFunction = IRP.getAssociatedFunction(); - if (!AssociatedFunction) - return false; - - if (AssociatedFunction->isDeclaration()) + if (!Fn || Fn->isDeclaration()) return false; // TODO: use the function scope once we have call site AAReturnedValues. - const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const IRPosition &QueryIRP = IRPosition::function(*Fn); const auto *LivenessAA = (CheckBBLivenessOnly || CheckPotentiallyDead) ? nullptr : &(getAAFor(QueryingAA, QueryIRP, DepClassTy::NONE)); - auto &OpcodeInstMap = - InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction); + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn); if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA, LivenessAA, Opcodes, UsedAssumedInformation, CheckBBLivenessOnly, CheckPotentiallyDead)) @@ -1509,6 +1590,19 @@ bool Attributor::checkForAllInstructions(function_ref Pred, return true; } +bool Attributor::checkForAllInstructions(function_ref Pred, + const AbstractAttribute &QueryingAA, + const ArrayRef &Opcodes, + bool &UsedAssumedInformation, + bool CheckBBLivenessOnly, + bool CheckPotentiallyDead) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + return checkForAllInstructions(Pred, AssociatedFunction, QueryingAA, Opcodes, + UsedAssumedInformation, CheckBBLivenessOnly, + CheckPotentiallyDead); +} + bool Attributor::checkForAllReadWriteInstructions( function_ref Pred, AbstractAttribute &QueryingAA, bool &UsedAssumedInformation) { @@ -1547,11 +1641,8 @@ void Attributor::runTillFixpoint() { // the abstract analysis. unsigned IterationCounter = 1; - unsigned MaxFixedPointIterations; - if (MaxFixpointIterations) - MaxFixedPointIterations = MaxFixpointIterations.getValue(); - else - MaxFixedPointIterations = SetFixpointIterations; + unsigned MaxIterations = + Configuration.MaxFixpointIterations.value_or(SetFixpointIterations); SmallVector ChangedAAs; SetVector Worklist, InvalidAAs; @@ -1636,21 +1727,20 @@ void Attributor::runTillFixpoint() { QueryAAsAwaitingUpdate.end()); QueryAAsAwaitingUpdate.clear(); - } while (!Worklist.empty() && (IterationCounter++ < MaxFixedPointIterations || - VerifyMaxFixpointIterations)); + } while (!Worklist.empty() && + (IterationCounter++ < MaxIterations || VerifyMaxFixpointIterations)); - if (IterationCounter > MaxFixedPointIterations && !Worklist.empty()) { + if (IterationCounter > MaxIterations && !Functions.empty()) { auto Remark = [&](OptimizationRemarkMissed ORM) { return ORM << "Attributor did not reach a fixpoint after " - << ore::NV("Iterations", MaxFixedPointIterations) - << " iterations."; + << ore::NV("Iterations", MaxIterations) << " iterations."; }; - Function *F = Worklist.front()->getIRPosition().getAssociatedFunction(); + Function *F = Functions.front(); emitRemark(F, "FixedPoint", Remark); } LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " - << IterationCounter << "/" << MaxFixpointIterations + << IterationCounter << "/" << MaxIterations << " iterations\n"); // Reset abstract arguments not settled in a sound fixpoint by now. 
This @@ -1684,11 +1774,9 @@ void Attributor::runTillFixpoint() { << " abstract attributes.\n"; }); - if (VerifyMaxFixpointIterations && - IterationCounter != MaxFixedPointIterations) { + if (VerifyMaxFixpointIterations && IterationCounter != MaxIterations) { errs() << "\n[Attributor] Fixpoint iteration done after: " - << IterationCounter << "/" << MaxFixedPointIterations - << " iterations\n"; + << IterationCounter << "/" << MaxIterations << " iterations\n"; llvm_unreachable("The fixpoint was not reached with exactly the number of " "specified iterations!"); } @@ -1725,6 +1813,9 @@ ChangeStatus Attributor::manifestAttributes() { if (!State.isValidState()) continue; + if (AA->getCtxI() && !isRunOn(*AA->getAnchorScope())) + continue; + // Skip dead code. bool UsedAssumedInformation = false; if (isAssumedDead(*AA, nullptr, UsedAssumedInformation, @@ -1774,7 +1865,7 @@ ChangeStatus Attributor::manifestAttributes() { void Attributor::identifyDeadInternalFunctions() { // Early exit if we don't intend to delete functions. - if (!DeleteFns) + if (!Configuration.DeleteFns) return; // Identify dead internal functions and delete them. This happens outside @@ -1795,7 +1886,7 @@ void Attributor::identifyDeadInternalFunctions() { if (!F) continue; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (checkForAllCallSites( [&](AbstractCallSite ACS) { Function *Callee = ACS.getInstruction()->getFunction(); @@ -1803,7 +1894,7 @@ void Attributor::identifyDeadInternalFunctions() { (Functions.count(Callee) && Callee->hasLocalLinkage() && !LiveInternalFns.count(Callee)); }, - *F, true, nullptr, AllCallSitesKnown)) { + *F, true, nullptr, UsedAssumedInformation)) { continue; } @@ -1826,7 +1917,8 @@ ChangeStatus Attributor::cleanupIR() { << ToBeDeletedBlocks.size() << " blocks and " << ToBeDeletedInsts.size() << " instructions and " << ToBeChangedValues.size() << " values and " - << ToBeChangedUses.size() << " uses. " + << ToBeChangedUses.size() << " uses. To insert " + << ToBeChangedToUnreachableInsts.size() << " unreachables." << "Preserve manifest added " << ManifestAddedBlocks.size() << " blocks\n"); @@ -1844,12 +1936,15 @@ ChangeStatus Attributor::cleanupIR() { NewV = Entry.first; } while (true); + Instruction *I = dyn_cast(U->getUser()); + assert((!I || isRunOn(*I->getFunction())) && + "Cannot replace an instruction outside the current SCC!"); + // Do not replace uses in returns if the value is a must-tail call we will // not delete. - if (auto *RI = dyn_cast(U->getUser())) { + if (auto *RI = dyn_cast_or_null(I)) { if (auto *CI = dyn_cast(OldV->stripPointerCasts())) - if (CI->isMustTailCall() && - (!ToBeDeletedInsts.count(CI) || !isRunOn(*CI->getCaller()))) + if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI)) return; // If we rewrite a return and the new value is not an argument, strip the // `returned` attribute as it is wrong now. @@ -1859,8 +1954,8 @@ ChangeStatus Attributor::cleanupIR() { } // Do not perform call graph altering changes outside the SCC. 
- if (auto *CB = dyn_cast(U->getUser())) - if (CB->isCallee(U) && !isRunOn(*CB->getCaller())) + if (auto *CB = dyn_cast_or_null(I)) + if (CB->isCallee(U)) return; LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser() @@ -1908,8 +2003,12 @@ ChangeStatus Attributor::cleanupIR() { for (auto &U : OldV->uses()) if (Entry.second || !U.getUser()->isDroppable()) Uses.push_back(&U); - for (Use *U : Uses) + for (Use *U : Uses) { + if (auto *I = dyn_cast(U->getUser())) + if (!isRunOn(*I->getFunction())) + continue; ReplaceUse(U, NewV); + } } for (auto &V : InvokeWithDeadSuccessor) @@ -1940,15 +2039,15 @@ ChangeStatus Attributor::cleanupIR() { } } for (Instruction *I : TerminatorsToFold) { - if (!isRunOn(*I->getFunction())) - continue; + assert(isRunOn(*I->getFunction()) && + "Cannot replace a terminator outside the current SCC!"); CGModifiedFunctions.insert(I->getFunction()); ConstantFoldTerminator(I->getParent()); } for (auto &V : ToBeChangedToUnreachableInsts) if (Instruction *I = dyn_cast_or_null(V)) { - if (!isRunOn(*I->getFunction())) - continue; + assert(isRunOn(*I->getFunction()) && + "Cannot replace an instruction outside the current SCC!"); CGModifiedFunctions.insert(I->getFunction()); changeToUnreachable(I); } @@ -1956,10 +2055,10 @@ ChangeStatus Attributor::cleanupIR() { for (auto &V : ToBeDeletedInsts) { if (Instruction *I = dyn_cast_or_null(V)) { if (auto *CB = dyn_cast(I)) { - if (!isRunOn(*I->getFunction())) - continue; + assert(isRunOn(*I->getFunction()) && + "Cannot delete an instruction outside the current SCC!"); if (!isa(CB)) - CGUpdater.removeCallSite(*CB); + Configuration.CGUpdater.removeCallSite(*CB); } I->dropDroppableUses(); CGModifiedFunctions.insert(I->getFunction()); @@ -1972,9 +2071,7 @@ ChangeStatus Attributor::cleanupIR() { } } - llvm::erase_if(DeadInsts, [&](WeakTrackingVH I) { - return !I || !isRunOn(*cast(I)->getFunction()); - }); + llvm::erase_if(DeadInsts, [&](WeakTrackingVH I) { return !I; }); LLVM_DEBUG({ dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size() << "\n"; @@ -2010,12 +2107,12 @@ ChangeStatus Attributor::cleanupIR() { for (Function *Fn : CGModifiedFunctions) if (!ToBeDeletedFunctions.count(Fn) && Functions.count(Fn)) - CGUpdater.reanalyzeFunction(*Fn); + Configuration.CGUpdater.reanalyzeFunction(*Fn); for (Function *Fn : ToBeDeletedFunctions) { if (!Functions.count(Fn)) continue; - CGUpdater.removeFunction(*Fn); + Configuration.CGUpdater.removeFunction(*Fn); } if (!ToBeChangedUses.empty()) @@ -2254,7 +2351,7 @@ bool Attributor::internalizeFunctions(SmallPtrSetImpl &FnSet, bool Attributor::isValidFunctionSignatureRewrite( Argument &Arg, ArrayRef ReplacementTypes) { - if (!RewriteSignatures) + if (!Configuration.RewriteSignatures) return false; Function *Fn = Arg.getParent(); @@ -2290,9 +2387,9 @@ bool Attributor::isValidFunctionSignatureRewrite( } // Avoid callbacks for now. - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr, - AllCallSitesKnown)) { + UsedAssumedInformation)) { LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n"); return false; } @@ -2305,7 +2402,6 @@ bool Attributor::isValidFunctionSignatureRewrite( // Forbid must-tail calls for now. 
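An editorial aside, not part of the patch: the AddToAssumeUsesMap helper introduced below (in initializeInformationCache) counts, per instruction, how many uses remain outside already-visited llvm.assume operands; once the count reaches zero, the instruction is recorded as assume-only and its operands are visited in turn. A stand-alone sketch of that counting over a toy use-graph (all names here are hypothetical):

#include <map>
#include <set>
#include <vector>

struct Inst {
  int NumUses;                  // total number of uses of this instruction
  std::vector<Inst *> Operands; // instructions this one depends on
};

// Visit one assume condition: decrement the outside-use count of the
// condition and, transitively, of operands that became assume-only.
void addToAssumeUses(Inst *Cond, std::map<Inst *, int> &RemainingUses,
                     std::set<Inst *> &AssumeOnlyValues) {
  std::vector<Inst *> Worklist{Cond};
  while (!Worklist.empty()) {
    Inst *I = Worklist.back();
    Worklist.pop_back();
    auto It = RemainingUses.try_emplace(I, I->NumUses).first;
    if (--It->second != 0)
      continue; // still used outside the visited assumes
    AssumeOnlyValues.insert(I);
    for (Inst *Op : I->Operands)
      Worklist.push_back(Op);
  }
}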
// TODO: - bool UsedAssumedInformation = false; auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn); if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr, nullptr, {Instruction::Call}, @@ -2370,7 +2466,7 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) { } ChangeStatus Attributor::rewriteFunctionSignatures( - SmallPtrSetImpl<Function *> &ModifiedFns) { + SmallSetVector<Function *, 8> &ModifiedFns) { ChangeStatus Changed = ChangeStatus::UNCHANGED; for (auto &It : ArgumentReplacementMap) { @@ -2403,6 +2499,12 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } } + uint64_t LargestVectorWidth = 0; + for (auto *I : NewArgumentTypes) + if (auto *VT = dyn_cast<VectorType>(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + FunctionType *OldFnTy = OldFn->getFunctionType(); Type *RetTy = OldFnTy->getReturnType(); @@ -2432,6 +2534,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewFn->setAttributes(AttributeList::get( Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), NewArgumentAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewFn, LargestVectorWidth); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2509,14 +2612,17 @@ ChangeStatus Attributor::rewriteFunctionSignatures( Ctx, OldCallAttributeList.getFnAttrs(), OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewCB->getCaller(), + LargestVectorWidth); + CallSitePairs.push_back({OldCB, NewCB}); return true; }; // Use the CallSiteReplacementCreator to create replacement call sites. - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn, - true, nullptr, AllCallSitesKnown); + true, nullptr, UsedAssumedInformation); (void)Success; assert(Success && "Assumed call site replacement to succeed!"); @@ -2529,6 +2635,9 @@ ChangeStatus Attributor::rewriteFunctionSignatures( ARIs[OldArgNum]) { if (ARI->CalleeRepairCB) ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt); + if (ARI->ReplacementTypes.empty()) + OldFnArgIt->replaceAllUsesWith( + PoisonValue::get(OldFnArgIt->getType())); NewFnArgIt += ARI->ReplacementTypes.size(); } else { NewFnArgIt->takeName(&*OldFnArgIt); @@ -2544,17 +2653,17 @@ ChangeStatus Attributor::rewriteFunctionSignatures( assert(OldCB.getType() == NewCB.getType() && "Cannot handle call sites with different types!"); ModifiedFns.insert(OldCB.getFunction()); - CGUpdater.replaceCallSite(OldCB, NewCB); + Configuration.CGUpdater.replaceCallSite(OldCB, NewCB); OldCB.replaceAllUsesWith(&NewCB); OldCB.eraseFromParent(); } // Replace the function in the call graph (if any). - CGUpdater.replaceFunctionWith(*OldFn, *NewFn); + Configuration.CGUpdater.replaceFunctionWith(*OldFn, *NewFn); // If the old function was modified and needed to be reanalyzed, the new one // does now. - if (ModifiedFns.erase(OldFn)) + if (ModifiedFns.remove(OldFn)) ModifiedFns.insert(NewFn); Changed = ChangeStatus::CHANGED; @@ -2574,6 +2683,30 @@ void InformationCache::initializeInformationCache(const Function &CF, // queried by abstract attributes during their initialization or update. // This has to happen before we create attributes. + DenseMap<const Instruction *, Optional<unsigned>> AssumeUsesMap; + + // Add \p V to the assume uses map which tracks the number of uses outside of + // "visited" assumes. 
If no outside uses are left the value is added to the + // assume only use vector. + auto AddToAssumeUsesMap = [&](const Value &V) -> void { + SmallVector Worklist; + if (auto *I = dyn_cast(&V)) + Worklist.push_back(I); + while (!Worklist.empty()) { + const Instruction *I = Worklist.pop_back_val(); + Optional &NumUses = AssumeUsesMap[I]; + if (!NumUses) + NumUses = I->getNumUses(); + NumUses = NumUses.getValue() - /* this assume */ 1; + if (NumUses.getValue() != 0) + continue; + AssumeOnlyValues.insert(I); + for (const Value *Op : I->operands()) + if (auto *OpI = dyn_cast(Op)) + Worklist.push_back(OpI); + } + }; + for (Instruction &I : instructions(&F)) { bool IsInterestingOpcode = false; @@ -2594,6 +2727,7 @@ void InformationCache::initializeInformationCache(const Function &CF, // For `must-tail` calls we remember the caller and callee. if (auto *Assume = dyn_cast(&I)) { fillMapFromAssume(*Assume, KnowledgeMap); + AddToAssumeUsesMap(*Assume->getArgOperand(0)); } else if (cast(I).isMustTailCall()) { FI.ContainsMustTailCall = true; if (const Function *Callee = cast(I).getCalledFunction()) @@ -2742,7 +2876,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { getOrCreateAAFor(RetPos); // Every function might be simplified. - getOrCreateAAFor(RetPos); + bool UsedAssumedInformation = false; + getAssumedSimplified(RetPos, nullptr, UsedAssumedInformation); // Every returned value might be marked noundef. getOrCreateAAFor(RetPos); @@ -2834,7 +2969,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) { IRPosition CBRetPos = IRPosition::callsite_returned(CB); - getOrCreateAAFor(CBRetPos); + bool UsedAssumedInformation = false; + getAssumedSimplified(CBRetPos, nullptr, UsedAssumedInformation); } for (int I = 0, E = CB.arg_size(); I < E; ++I) { @@ -2897,10 +3033,15 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { getOrCreateAAFor( IRPosition::value(*cast(I).getPointerOperand())); if (SimplifyAllLoads) - getOrCreateAAFor(IRPosition::value(I)); - } else - getOrCreateAAFor( - IRPosition::value(*cast(I).getPointerOperand())); + getAssumedSimplified(IRPosition::value(I), nullptr, + UsedAssumedInformation); + } else { + auto &SI = cast(I); + getOrCreateAAFor(IRPosition::inst(I)); + getAssumedSimplified(IRPosition::value(*SI.getValueOperand()), nullptr, + UsedAssumedInformation); + getOrCreateAAFor(IRPosition::value(*SI.getPointerOperand())); + } return true; }; Success = checkForAllInstructionsImpl( @@ -2975,8 +3116,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, if (!S.isValidState()) OS << "full-set"; else { - for (auto &it : S.getAssumedSet()) - OS << it << ", "; + for (auto &It : S.getAssumedSet()) + OS << It << ", "; if (S.undefIsContained()) OS << "undef "; } @@ -3018,8 +3159,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, OS << " [" << Acc.getKind() << "] " << *Acc.getRemoteInst(); if (Acc.getLocalInst() != Acc.getRemoteInst()) OS << " via " << *Acc.getLocalInst(); - if (Acc.getContent().hasValue()) - OS << " [" << *Acc.getContent() << "]"; + if (Acc.getContent()) { + if (*Acc.getContent()) + OS << " [" << **Acc.getContent() << "]"; + else + OS << " [ ]"; + } return OS; } ///} @@ -3032,7 +3177,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, SetVector &Functions, AnalysisGetter &AG, CallGraphUpdater &CGUpdater, - bool DeleteFns) { + bool DeleteFns, bool IsModulePass) { if (Functions.empty()) return false; @@ -3045,8 +3190,10 @@ static bool 
runAttributorOnFunctions(InformationCache &InfoCache, // Create an Attributor and initially empty information cache that is filled // while we identify default attribute opportunities. - Attributor A(Functions, InfoCache, CGUpdater, /* Allowed */ nullptr, - DeleteFns); + AttributorConfig AC(CGUpdater); + AC.IsModulePass = IsModulePass; + AC.DeleteFns = DeleteFns; + Attributor A(Functions, InfoCache, AC); // Create shallow wrappers for all functions that are not IPO amendable if (AllowShallowWrappers) @@ -3151,7 +3298,7 @@ PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { BumpPtrAllocator Allocator; InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr); if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater, - /* DeleteFns */ true)) { + /* DeleteFns */ true, /* IsModulePass */ true)) { // FIXME: Think about passes we will preserve and add them here. return PreservedAnalyses::none(); } @@ -3179,7 +3326,8 @@ PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C, BumpPtrAllocator Allocator; InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions); if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater, - /* DeleteFns */ false)) { + /* DeleteFns */ false, + /* IsModulePass */ false)) { // FIXME: Think about passes we will preserve and add them here. PreservedAnalyses PA; PA.preserve(); @@ -3255,7 +3403,8 @@ struct AttributorLegacyPass : public ModulePass { BumpPtrAllocator Allocator; InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr); return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater, - /* DeleteFns*/ true); + /* DeleteFns*/ true, + /* IsModulePass */ true); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3292,7 +3441,8 @@ struct AttributorCGSCCLegacyPass : public CallGraphSCCPass { BumpPtrAllocator Allocator; InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions); return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater, - /* DeleteFns */ false); + /* DeleteFns */ false, + /* IsModulePass */ false); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 2d88e329e093..4d99ce7e3175 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -14,9 +14,11 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -30,21 +32,29 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Assumptions.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/NoFolder.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include 
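// A sketch of the configuration-object idiom the runAttributorOnFunctions
// hunk above adopts: rather than growing the Attributor's positional
// constructor, the required CGUpdater is bound once and the optional knobs
// (IsModulePass, DeleteFns) become named fields. All types below are invented
// stand-ins, not the real Attributor API:
#include <cassert>

struct CallGraphUpdaterStub {};

struct ConfigSketch {
  explicit ConfigSketch(CallGraphUpdaterStub &CGU) : CGUpdater(CGU) {}
  CallGraphUpdaterStub &CGUpdater; // required collaborator
  bool IsModulePass = true;        // optional knobs with defaults
  bool DeleteFns = true;
};

struct AttributorSketch {
  explicit AttributorSketch(const ConfigSketch &C) : Configuration(C) {}
  ConfigSketch Configuration;
};

int main() {
  CallGraphUpdaterStub CGU;
  ConfigSketch AC(CGU);    // mirrors: AttributorConfig AC(CGUpdater);
  AC.IsModulePass = false; // mirrors: AC.IsModulePass = IsModulePass;
  AC.DeleteFns = false;    // mirrors: AC.DeleteFns = DeleteFns;
  AttributorSketch A(AC);
  assert(!A.Configuration.DeleteFns && !A.Configuration.IsModulePass);
}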
"llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include using namespace llvm; @@ -69,11 +79,11 @@ static cl::opt MaxPotentialValues( cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues), cl::init(7)); -static cl::opt - MaxInterferingWrites("attributor-max-interfering-writes", cl::Hidden, - cl::desc("Maximum number of interfering writes to " - "check before assuming all might interfere."), - cl::init(6)); +static cl::opt MaxInterferingAccesses( + "attributor-max-interfering-accesses", cl::Hidden, + cl::desc("Maximum number of interfering accesses to " + "check before assuming all might interfere."), + cl::init(6)); STATISTIC(NumAAs, "Number of abstract attributes created"); @@ -140,6 +150,7 @@ PIPE_OPERATOR(AANonNull) PIPE_OPERATOR(AANoAlias) PIPE_OPERATOR(AADereferenceable) PIPE_OPERATOR(AAAlign) +PIPE_OPERATOR(AAInstanceInfo) PIPE_OPERATOR(AANoCapture) PIPE_OPERATOR(AAValueSimplify) PIPE_OPERATOR(AANoFree) @@ -150,7 +161,7 @@ PIPE_OPERATOR(AAMemoryLocation) PIPE_OPERATOR(AAValueConstantRange) PIPE_OPERATOR(AAPrivatizablePtr) PIPE_OPERATOR(AAUndefinedBehavior) -PIPE_OPERATOR(AAPotentialValues) +PIPE_OPERATOR(AAPotentialConstantValues) PIPE_OPERATOR(AANoUndef) PIPE_OPERATOR(AACallEdges) PIPE_OPERATOR(AAFunctionReachability) @@ -170,6 +181,45 @@ ChangeStatus clampStateAndIndicateChange(DerefState &S, } // namespace llvm +/// Checks if a type could have padding bytes. +static bool isDenselyPacked(Type *Ty, const DataLayout &DL) { + // There is no size information, so be conservative. + if (!Ty->isSized()) + return false; + + // If the alloc size is not equal to the storage size, then there are padding + // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. + if (DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty)) + return false; + + // FIXME: This isn't the right way to check for padding in vectors with + // non-byte-size elements. + if (VectorType *SeqTy = dyn_cast(Ty)) + return isDenselyPacked(SeqTy->getElementType(), DL); + + // For array types, check for padding within members. + if (ArrayType *SeqTy = dyn_cast(Ty)) + return isDenselyPacked(SeqTy->getElementType(), DL); + + if (!isa(Ty)) + return true; + + // Check for padding within and between elements of a struct. + StructType *StructTy = cast(Ty); + const StructLayout *Layout = DL.getStructLayout(StructTy); + uint64_t StartPos = 0; + for (unsigned I = 0, E = StructTy->getNumElements(); I < E; ++I) { + Type *ElTy = StructTy->getElementType(I); + if (!isDenselyPacked(ElTy, DL)) + return false; + if (StartPos != Layout->getElementOffsetInBits(I)) + return false; + StartPos += DL.getTypeAllocSizeInBits(ElTy); + } + + return true; +} + /// Get pointer operand of memory accessing instruction. If \p I is /// not a memory accessing instruction, return nullptr. If \p AllowVolatile, /// is set to false and the instruction is volatile, return nullptr. @@ -236,7 +286,8 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr, } // Ensure the result has the requested type. 
- Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast"); + Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, ResTy, + Ptr->getName() + ".cast"); LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n"); return Ptr; @@ -251,25 +302,32 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr, /// once. Note that the value used for the callback may still be the value /// associated with \p IRP (due to PHIs). To limit how much effort is invested, /// we will never visit more values than specified by \p MaxValues. -/// If \p Intraprocedural is set to true only values valid in the scope of -/// \p CtxI will be visited and simplification into other scopes is prevented. +/// If \p VS does not contain the Interprocedural bit, only values valid in the +/// scope of \p CtxI will be visited and simplification into other scopes is +/// prevented. template static bool genericValueTraversal( Attributor &A, IRPosition IRP, const AbstractAttribute &QueryingAA, StateTy &State, function_ref VisitValueCB, - const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16, + const Instruction *CtxI, bool &UsedAssumedInformation, + bool UseValueSimplify = true, int MaxValues = 16, function_ref StripCB = nullptr, - bool Intraprocedural = false) { + AA::ValueScope VS = AA::Interprocedural) { - const AAIsDead *LivenessAA = nullptr; - if (IRP.getAnchorScope()) - LivenessAA = &A.getAAFor( - QueryingAA, - IRPosition::function(*IRP.getAnchorScope(), IRP.getCallBaseContext()), - DepClassTy::NONE); - bool AnyDead = false; + struct LivenessInfo { + const AAIsDead *LivenessAA = nullptr; + bool AnyDead = false; + }; + SmallMapVector LivenessAAs; + auto GetLivenessInfo = [&](const Function &F) -> LivenessInfo & { + LivenessInfo &LI = LivenessAAs[&F]; + if (!LI.LivenessAA) + LI.LivenessAA = &A.getAAFor(QueryingAA, IRPosition::function(F), + DepClassTy::NONE); + return LI; + }; Value *InitialV = &IRP.getAssociatedValue(); using Item = std::pair; @@ -319,10 +377,9 @@ static bool genericValueTraversal( // Look through select instructions, visit assumed potential values. if (auto *SI = dyn_cast(V)) { - bool UsedAssumedInformation = false; Optional C = A.getAssumedConstant( *SI->getCondition(), QueryingAA, UsedAssumedInformation); - bool NoValueYet = !C.hasValue(); + bool NoValueYet = !C; if (NoValueYet || isa_and_nonnull(*C)) continue; if (auto *CI = dyn_cast_or_null(*C)) { @@ -340,12 +397,12 @@ static bool genericValueTraversal( // Look through phi nodes, visit all live operands. 
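// The LivenessInfo map introduced above replaces the single anchor-scope
// AAIsDead: since the traversal may now cross function boundaries, one
// liveness record per touched function is materialized lazily and reused, as
// the PHI handling below does via GetLivenessInfo. The caching idiom in
// isolation (std::map standing in for SmallMapVector, a flag and counter for
// the AAIsDead lookup; names invented):
#include <cassert>
#include <map>
#include <string>

struct LivenessInfoSketch {
  bool Materialized = false; // stands in for: const AAIsDead *LivenessAA
  bool AnyDead = false;
};

int main() {
  std::map<std::string, LivenessInfoSketch> LivenessAAs;
  int ExpensiveLookups = 0;
  auto GetLivenessInfo = [&](const std::string &Fn) -> LivenessInfoSketch & {
    LivenessInfoSketch &LI = LivenessAAs[Fn];
    if (!LI.Materialized) { // first touch: would call A.getAAFor<AAIsDead>()
      LI.Materialized = true;
      ++ExpensiveLookups;
    }
    return LI;
  };
  GetLivenessInfo("f").AnyDead = true; // later: one recorded dependence per entry
  GetLivenessInfo("f");                // cache hit
  assert(ExpensiveLookups == 1 && LivenessAAs.size() == 1);
}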
if (auto *PHI = dyn_cast(V)) { - assert(LivenessAA && - "Expected liveness in the presence of instructions!"); + LivenessInfo &LI = GetLivenessInfo(*PHI->getFunction()); for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) { BasicBlock *IncomingBB = PHI->getIncomingBlock(u); - if (LivenessAA->isEdgeDead(IncomingBB, PHI->getParent())) { - AnyDead = true; + if (LI.LivenessAA->isEdgeDead(IncomingBB, PHI->getParent())) { + LI.AnyDead = true; + UsedAssumedInformation |= !LI.LivenessAA->isAtFixpoint(); continue; } Worklist.push_back( @@ -355,9 +412,9 @@ static bool genericValueTraversal( } if (auto *Arg = dyn_cast(V)) { - if (!Intraprocedural && !Arg->hasPassPointeeByValueCopyAttr()) { + if ((VS & AA::Interprocedural) && !Arg->hasPassPointeeByValueCopyAttr()) { SmallVector CallSiteValues; - bool AllCallSitesKnown = true; + bool UsedAssumedInformation = false; if (A.checkForAllCallSites( [&](AbstractCallSite ACS) { // Callbacks might not have a corresponding call site operand, @@ -368,7 +425,7 @@ static bool genericValueTraversal( CallSiteValues.push_back({CSOp, ACS.getInstruction()}); return true; }, - *Arg->getParent(), true, &QueryingAA, AllCallSitesKnown)) { + *Arg->getParent(), true, &QueryingAA, UsedAssumedInformation)) { Worklist.append(CallSiteValues); continue; } @@ -376,14 +433,13 @@ static bool genericValueTraversal( } if (UseValueSimplify && !isa(V)) { - bool UsedAssumedInformation = false; Optional SimpleV = A.getAssumedSimplified(*V, QueryingAA, UsedAssumedInformation); - if (!SimpleV.hasValue()) + if (!SimpleV) continue; Value *NewV = SimpleV.getValue(); if (NewV && NewV != V) { - if (!Intraprocedural || !CtxI || + if ((VS & AA::Interprocedural) || !CtxI || AA::isValidInScope(*NewV, CtxI->getFunction())) { Worklist.push_back({NewV, CtxI}); continue; @@ -391,6 +447,37 @@ static bool genericValueTraversal( } } + if (auto *LI = dyn_cast(V)) { + bool UsedAssumedInformation = false; + // If we ask for the potentially loaded values from the initial pointer we + // will simply end up here again. The load is as far as we can make it. + if (LI->getPointerOperand() != InitialV) { + SmallSetVector PotentialCopies; + SmallSetVector PotentialValueOrigins; + if (AA::getPotentiallyLoadedValues(A, *LI, PotentialCopies, + PotentialValueOrigins, QueryingAA, + UsedAssumedInformation, + /* OnlyExact */ true)) { + // Values have to be dynamically unique or we loose the fact that a + // single llvm::Value might represent two runtime values (e.g., stack + // locations in different recursive calls). + bool DynamicallyUnique = + llvm::all_of(PotentialCopies, [&A, &QueryingAA](Value *PC) { + return AA::isDynamicallyUnique(A, QueryingAA, *PC); + }); + if (DynamicallyUnique && + ((VS & AA::Interprocedural) || !CtxI || + llvm::all_of(PotentialCopies, [CtxI](Value *PC) { + return AA::isValidInScope(*PC, CtxI->getFunction()); + }))) { + for (auto *PotentialCopy : PotentialCopies) + Worklist.push_back({PotentialCopy, CtxI}); + continue; + } + } + } + } + // Once a leaf is reached we inform the user through the callback. if (!VisitValueCB(*V, CtxI, State, Iteration > 1)) { LLVM_DEBUG(dbgs() << "Generic value traversal visit callback failed for: " @@ -400,8 +487,10 @@ static bool genericValueTraversal( } while (!Worklist.empty()); // If we actually used liveness information so we have to record a dependence. 
- if (AnyDead) - A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + for (auto &It : LivenessAAs) + if (It.second.AnyDead) + A.recordDependence(*It.second.LivenessAA, QueryingAA, + DepClassTy::OPTIONAL); // All values have been visited. return true; @@ -411,7 +500,8 @@ bool AA::getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr, SmallVectorImpl &Objects, const AbstractAttribute &QueryingAA, const Instruction *CtxI, - bool Intraprocedural) { + bool &UsedAssumedInformation, + AA::ValueScope VS) { auto StripCB = [&](Value *V) { return getUnderlyingObject(V); }; SmallPtrSet SeenObjects; auto VisitValueCB = [&SeenObjects](Value &Val, const Instruction *, @@ -423,15 +513,16 @@ bool AA::getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr, }; if (!genericValueTraversal( A, IRPosition::value(Ptr), QueryingAA, Objects, VisitValueCB, CtxI, - true, 32, StripCB, Intraprocedural)) + UsedAssumedInformation, true, 32, StripCB, VS)) return false; return true; } -const Value *stripAndAccumulateMinimalOffsets( - Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val, - const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, - bool UseAssumed = false) { +static const Value * +stripAndAccumulateOffsets(Attributor &A, const AbstractAttribute &QueryingAA, + const Value *Val, const DataLayout &DL, APInt &Offset, + bool GetMinOffset, bool AllowNonInbounds, + bool UseAssumed = false) { auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool { const IRPosition &Pos = IRPosition::value(V); @@ -442,14 +533,20 @@ const Value *stripAndAccumulateMinimalOffsets( : DepClassTy::NONE); ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed() : ValueConstantRangeAA.getKnown(); + if (Range.isFullSet()) + return false; + // We can only use the lower part of the range because the upper part can // be higher than what the value can really be. 
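// The GetMinOffset switch introduced below generalizes the old
// "signed minimum only" behavior: for a value known to lie in [Lo, Hi], a
// conservative *base* offset takes Lo, while a conservative *maximal* access
// offset (e.g. for a dereferenceability query) takes Hi. Tiny illustration
// with plain integers rather than llvm::ConstantRange:
#include <cstdint>
#include <iostream>

int64_t pickOffset(int64_t Lo, int64_t Hi, bool GetMinOffset) {
  return GetMinOffset ? Lo : Hi;
}

int main() {
  // Index known to be in [4, 8]:
  std::cout << pickOffset(4, 8, /*GetMinOffset=*/true) << "\n";  // base: 4
  std::cout << pickOffset(4, 8, /*GetMinOffset=*/false) << "\n"; // extent: 8
}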
- ROffset = Range.getSignedMin(); + if (GetMinOffset) + ROffset = Range.getSignedMin(); + else + ROffset = Range.getSignedMax(); return true; }; return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds, - /* AllowInvariant */ false, + /* AllowInvariant */ true, AttributorAnalysis); } @@ -458,8 +555,9 @@ getMinimalBaseOfPointer(Attributor &A, const AbstractAttribute &QueryingAA, const Value *Ptr, int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); - const Value *Base = stripAndAccumulateMinimalOffsets( - A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds); + const Value *Base = + stripAndAccumulateOffsets(A, QueryingAA, Ptr, DL, OffsetAPInt, + /* GetMinOffset */ true, AllowNonInbounds); BytesOffset = OffsetAPInt.getSExtValue(); return Base; @@ -493,10 +591,9 @@ static void clampReturnedValueStates( LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr() << " @ " << RVPos << "\n"); const StateType &AAS = AA.getState(); - if (T.hasValue()) - *T &= AAS; - else - T = AAS; + if (!T) + T = StateType::getBestState(AAS); + *T &= AAS; LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T << "\n"); return T->isValidState(); @@ -504,7 +601,7 @@ static void clampReturnedValueStates( if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA)) S.indicatePessimisticFixpoint(); - else if (T.hasValue()) + else if (T) S ^= *T; } @@ -560,20 +657,19 @@ static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA, LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction() << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n"); const StateType &AAS = AA.getState(); - if (T.hasValue()) - *T &= AAS; - else - T = AAS; + if (!T) + T = StateType::getBestState(AAS); + *T &= AAS; LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T << "\n"); return T->isValidState(); }; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true, - AllCallSitesKnown)) + UsedAssumedInformation)) S.indicatePessimisticFixpoint(); - else if (T.hasValue()) + else if (T) S ^= *T; } @@ -667,7 +763,6 @@ struct AACallSiteReturnedFromReturned : public BaseType { return clampStateAndIndicateChange(S, AA.getState()); } }; -} // namespace /// Helper function to accumulate uses. template @@ -779,6 +874,7 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, S += ParentState; } } +} // namespace /// ------------------------ PointerInfo --------------------------------------- @@ -786,9 +882,6 @@ namespace llvm { namespace AA { namespace PointerInfo { -/// An access kind description as used by AAPointerInfo. -struct OffsetAndSize; - struct State; } // namespace PointerInfo @@ -806,7 +899,7 @@ struct DenseMapInfo : DenseMapInfo { /// Helper that allows OffsetAndSize as a key in a DenseMap. template <> -struct DenseMapInfo +struct DenseMapInfo : DenseMapInfo> {}; /// Helper for AA::PointerInfo::Acccess DenseMap/Set usage ignoring everythign @@ -822,90 +915,15 @@ struct AccessAsInstructionInfo : DenseMapInfo { } // namespace llvm -/// Helper to represent an access offset and size, with logic to deal with -/// uncertainty and check for overlapping accesses. 
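// The OffsetAndSize helper removed below (the patch moves it into the
// AAPointerInfo interface) treats accesses as half-open byte ranges:
// [O1, O1+S1) and [O2, O2+S2) may overlap iff each starts before the other
// ends, and any unknown offset or size conservatively reports an overlap.
// Standalone rendering of that check:
#include <cstdint>
#include <iostream>

constexpr int64_t Unknown = 1 << 31;

bool mayOverlap(int64_t O1, int64_t S1, int64_t O2, int64_t S2) {
  if (O1 == Unknown || S1 == Unknown || O2 == Unknown || S2 == Unknown)
    return true; // giving up -> overlap
  return O2 + S2 > O1 && O2 < O1 + S1;
}

int main() {
  std::cout << mayOverlap(0, 4, 4, 4) << "\n";       // 0: [0,4) vs [4,8)
  std::cout << mayOverlap(0, 8, 4, 4) << "\n";       // 1: [0,8) vs [4,8)
  std::cout << mayOverlap(0, 4, Unknown, 4) << "\n"; // 1: unknown offset
}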
-struct AA::PointerInfo::OffsetAndSize : public std::pair<int64_t, int64_t> {
-  using BaseTy = std::pair<int64_t, int64_t>;
-  OffsetAndSize(int64_t Offset, int64_t Size) : BaseTy(Offset, Size) {}
-  OffsetAndSize(const BaseTy &P) : BaseTy(P) {}
-  int64_t getOffset() const { return first; }
-  int64_t getSize() const { return second; }
-  static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); }
-
-  /// Return true if offset or size are unknown.
-  bool offsetOrSizeAreUnknown() const {
-    return getOffset() == OffsetAndSize::Unknown ||
-           getSize() == OffsetAndSize::Unknown;
-  }
-
-  /// Return true if this offset and size pair might describe an address that
-  /// overlaps with \p OAS.
-  bool mayOverlap(const OffsetAndSize &OAS) const {
-    // Any unknown value and we are giving up -> overlap.
-    if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
-      return true;
-
-    // Check if one offset point is in the other interval [offset, offset+size].
-    return OAS.getOffset() + OAS.getSize() > getOffset() &&
-           OAS.getOffset() < getOffset() + getSize();
-  }
-
-  /// Constant used to represent unknown offset or sizes.
-  static constexpr int64_t Unknown = 1 << 31;
-};
-
-/// Implementation of the DenseMapInfo.
-///
-///{
-inline llvm::AccessAsInstructionInfo::Access
-llvm::AccessAsInstructionInfo::getEmptyKey() {
-  return Access(Base::getEmptyKey(), nullptr, AAPointerInfo::AK_READ, nullptr);
-}
-inline llvm::AccessAsInstructionInfo::Access
-llvm::AccessAsInstructionInfo::getTombstoneKey() {
-  return Access(Base::getTombstoneKey(), nullptr, AAPointerInfo::AK_READ,
-                nullptr);
-}
-unsigned llvm::AccessAsInstructionInfo::getHashValue(
-    const llvm::AccessAsInstructionInfo::Access &A) {
-  return Base::getHashValue(A.getRemoteInst());
-}
-bool llvm::AccessAsInstructionInfo::isEqual(
-    const llvm::AccessAsInstructionInfo::Access &LHS,
-    const llvm::AccessAsInstructionInfo::Access &RHS) {
-  return LHS.getRemoteInst() == RHS.getRemoteInst();
-}
-inline llvm::DenseMapInfo<AAPointerInfo::Access>::Access
-llvm::DenseMapInfo<AAPointerInfo::Access>::getEmptyKey() {
-  return AAPointerInfo::Access(nullptr, nullptr, AAPointerInfo::AK_READ,
-                               nullptr);
-}
-inline llvm::DenseMapInfo<AAPointerInfo::Access>::Access
-llvm::DenseMapInfo<AAPointerInfo::Access>::getTombstoneKey() {
-  return AAPointerInfo::Access(nullptr, nullptr, AAPointerInfo::AK_WRITE,
-                               nullptr);
-}
-
-unsigned llvm::DenseMapInfo<AAPointerInfo::Access>::getHashValue(
-    const llvm::DenseMapInfo<AAPointerInfo::Access>::Access &A) {
-  return detail::combineHashValue(
-             DenseMapInfo<Instruction *>::getHashValue(A.getRemoteInst()),
-             (A.isWrittenValueYetUndetermined()
-                  ? ~0
-                  : DenseMapInfo<Value *>::getHashValue(A.getWrittenValue()))) +
-         A.getKind();
-}
-
-bool llvm::DenseMapInfo<AAPointerInfo::Access>::isEqual(
-    const llvm::DenseMapInfo<AAPointerInfo::Access>::Access &LHS,
-    const llvm::DenseMapInfo<AAPointerInfo::Access>::Access &RHS) {
-  return LHS == RHS;
-}
-///}
-
 /// A type to track pointer/struct usage and accesses for AAPointerInfo.
 struct AA::PointerInfo::State : public AbstractState {
+
+  ~State() {
+    // We do not delete the Accesses objects but need to destroy them still.
+    for (auto &It : AccessBins)
+      It.second->~Accesses();
+  }
+
   /// Return the best possible representable state.
static State getBestState(const State &SIS) { return State(); } @@ -916,9 +934,10 @@ struct AA::PointerInfo::State : public AbstractState { return R; } - State() {} - State(const State &SIS) : AccessBins(SIS.AccessBins) {} - State(State &&SIS) : AccessBins(std::move(SIS.AccessBins)) {} + State() = default; + State(State &&SIS) : AccessBins(std::move(SIS.AccessBins)) { + SIS.AccessBins.clear(); + } const State &getAssumed() const { return *this; } @@ -967,15 +986,11 @@ struct AA::PointerInfo::State : public AbstractState { return false; auto &Accs = It->getSecond(); auto &RAccs = RIt->getSecond(); - if (Accs.size() != RAccs.size()) + if (Accs->size() != RAccs->size()) return false; - auto AccIt = Accs.begin(), RAccIt = RAccs.begin(), AccE = Accs.end(); - while (AccIt != AccE) { - if (*AccIt != *RAccIt) + for (const auto &ZipIt : llvm::zip(*Accs, *RAccs)) + if (std::get<0>(ZipIt) != std::get<1>(ZipIt)) return false; - ++AccIt; - ++RAccIt; - } ++It; ++RIt; } @@ -984,42 +999,88 @@ struct AA::PointerInfo::State : public AbstractState { bool operator!=(const State &R) const { return !(*this == R); } /// We store accesses in a set with the instruction as key. - using Accesses = DenseSet; + struct Accesses { + SmallVector Accesses; + DenseMap Map; + + unsigned size() const { return Accesses.size(); } + + using vec_iterator = decltype(Accesses)::iterator; + vec_iterator begin() { return Accesses.begin(); } + vec_iterator end() { return Accesses.end(); } + + using iterator = decltype(Map)::const_iterator; + iterator find(AAPointerInfo::Access &Acc) { + return Map.find(Acc.getRemoteInst()); + } + iterator find_end() { return Map.end(); } + + AAPointerInfo::Access &get(iterator &It) { + return Accesses[It->getSecond()]; + } + + void insert(AAPointerInfo::Access &Acc) { + Map[Acc.getRemoteInst()] = Accesses.size(); + Accesses.push_back(Acc); + } + }; /// We store all accesses in bins denoted by their offset and size. - using AccessBinsTy = DenseMap; + using AccessBinsTy = DenseMap; AccessBinsTy::const_iterator begin() const { return AccessBins.begin(); } AccessBinsTy::const_iterator end() const { return AccessBins.end(); } protected: /// The bins with all the accesses for the associated pointer. - DenseMap AccessBins; + AccessBinsTy AccessBins; /// Add a new access to the state at offset \p Offset and with size \p Size. /// The access is associated with \p I, writes \p Content (if anything), and /// is of kind \p Kind. /// \Returns CHANGED, if the state changed, UNCHANGED otherwise. - ChangeStatus addAccess(int64_t Offset, int64_t Size, Instruction &I, - Optional Content, + ChangeStatus addAccess(Attributor &A, int64_t Offset, int64_t Size, + Instruction &I, Optional Content, AAPointerInfo::AccessKind Kind, Type *Ty, Instruction *RemoteI = nullptr, Accesses *BinPtr = nullptr) { - OffsetAndSize Key{Offset, Size}; - Accesses &Bin = BinPtr ? *BinPtr : AccessBins[Key]; + AAPointerInfo::OffsetAndSize Key{Offset, Size}; + Accesses *&Bin = BinPtr ? BinPtr : AccessBins[Key]; + if (!Bin) + Bin = new (A.Allocator) Accesses; AAPointerInfo::Access Acc(&I, RemoteI ? RemoteI : &I, Content, Kind, Ty); // Check if we have an access for this instruction in this bin, if not, // simply add it. - auto It = Bin.find(Acc); - if (It == Bin.end()) { - Bin.insert(Acc); + auto It = Bin->find(Acc); + if (It == Bin->find_end()) { + Bin->insert(Acc); return ChangeStatus::CHANGED; } // If the existing access is the same as then new one, nothing changed. 
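// The Accesses container defined above pairs a vector with an index map: the
// vector keeps payloads in insertion order for iteration, while the map still
// gives O(1) lookup by remote instruction. The same idiom with STL stand-ins
// (string keys in place of Instruction pointers; a sketch, not the patch's
// code):
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

struct IndexedBin {
  std::vector<std::string> Accesses;           // payloads, insertion order
  std::unordered_map<std::string, size_t> Map; // key -> vector index

  void insert(const std::string &Key) {
    Map[Key] = Accesses.size();
    Accesses.push_back(Key);
  }
  std::string *find(const std::string &Key) {
    auto It = Map.find(Key);
    return It == Map.end() ? nullptr : &Accesses[It->second];
  }
};

int main() {
  IndexedBin Bin;
  Bin.insert("store @g");
  Bin.insert("load @g");
  assert(Bin.find("load @g") && !Bin.find("load @h"));
  assert(Bin.Accesses.front() == "store @g"); // order preserved
}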
- AAPointerInfo::Access Before = *It; + AAPointerInfo::Access &Current = Bin->get(It); + AAPointerInfo::Access Before = Current; // The new one will be combined with the existing one. - *It &= Acc; - return *It == Before ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; + Current &= Acc; + return Current == Before ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; + } + + /// See AAPointerInfo::forallInterferingAccesses. + bool forallInterferingAccesses( + AAPointerInfo::OffsetAndSize OAS, + function_ref CB) const { + if (!isValidState()) + return false; + + for (auto &It : AccessBins) { + AAPointerInfo::OffsetAndSize ItOAS = It.getFirst(); + if (!OAS.mayOverlap(ItOAS)) + continue; + bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown(); + for (auto &Access : *It.getSecond()) + if (!CB(Access, IsExact)) + return false; + } + return true; } /// See AAPointerInfo::forallInterferingAccesses. @@ -1028,10 +1089,11 @@ protected: function_ref CB) const { if (!isValidState()) return false; + // First find the offset and size of I. - OffsetAndSize OAS(-1, -1); + AAPointerInfo::OffsetAndSize OAS(-1, -1); for (auto &It : AccessBins) { - for (auto &Access : It.getSecond()) { + for (auto &Access : *It.getSecond()) { if (Access.getRemoteInst() == &I) { OAS = It.getFirst(); break; @@ -1040,21 +1102,13 @@ protected: if (OAS.getSize() != -1) break; } + // No access for I was found, we are done. if (OAS.getSize() == -1) return true; // Now that we have an offset and size, find all overlapping ones and use // the callback on the accesses. - for (auto &It : AccessBins) { - OffsetAndSize ItOAS = It.getFirst(); - if (!OAS.mayOverlap(ItOAS)) - continue; - bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown(); - for (auto &Access : It.getSecond()) - if (!CB(Access, IsExact)) - return false; - } - return true; + return forallInterferingAccesses(OAS, CB); } private: @@ -1062,6 +1116,7 @@ private: BooleanState BS; }; +namespace { struct AAPointerInfoImpl : public StateWrapper { using BaseTy = StateWrapper; @@ -1084,22 +1139,18 @@ struct AAPointerInfoImpl } bool forallInterferingAccesses( - LoadInst &LI, function_ref CB) + OffsetAndSize OAS, + function_ref CB) const override { - return State::forallInterferingAccesses(LI, CB); + return State::forallInterferingAccesses(OAS, CB); } bool forallInterferingAccesses( - StoreInst &SI, function_ref CB) - const override { - return State::forallInterferingAccesses(SI, CB); - } - bool forallInterferingWrites( - Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI, + Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I, function_ref UserCB) const override { SmallPtrSet DominatingWrites; - SmallVector, 8> InterferingWrites; + SmallVector, 8> InterferingAccesses; - Function &Scope = *LI.getFunction(); + Function &Scope = *I.getFunction(); const auto &NoSyncAA = A.getAAFor( QueryingAA, IRPosition::function(Scope), DepClassTy::OPTIONAL); const auto *ExecDomainAA = A.lookupAAFor( @@ -1127,13 +1178,15 @@ struct AAPointerInfoImpl // TODO: Use inter-procedural reachability and dominance. 
const auto &NoRecurseAA = A.getAAFor( - QueryingAA, IRPosition::function(*LI.getFunction()), - DepClassTy::OPTIONAL); + QueryingAA, IRPosition::function(Scope), DepClassTy::OPTIONAL); - const bool CanUseCFGResoning = CanIgnoreThreading(LI); + const bool FindInterferingWrites = I.mayReadFromMemory(); + const bool FindInterferingReads = I.mayWriteToMemory(); + const bool UseDominanceReasoning = FindInterferingWrites; + const bool CanUseCFGResoning = CanIgnoreThreading(I); InformationCache &InfoCache = A.getInfoCache(); const DominatorTree *DT = - NoRecurseAA.isKnownNoRecurse() + NoRecurseAA.isKnownNoRecurse() && UseDominanceReasoning ? InfoCache.getAnalysisResultForFunction( Scope) : nullptr; @@ -1189,33 +1242,37 @@ struct AAPointerInfoImpl } auto AccessCB = [&](const Access &Acc, bool Exact) { - if (!Acc.isWrite()) + if ((!FindInterferingWrites || !Acc.isWrite()) && + (!FindInterferingReads || !Acc.isRead())) return true; // For now we only filter accesses based on CFG reasoning which does not // work yet if we have threading effects, or the access is complicated. if (CanUseCFGResoning) { - if (!AA::isPotentiallyReachable(A, *Acc.getLocalInst(), LI, QueryingAA, - IsLiveInCalleeCB)) + if ((!Acc.isWrite() || + !AA::isPotentiallyReachable(A, *Acc.getLocalInst(), I, QueryingAA, + IsLiveInCalleeCB)) && + (!Acc.isRead() || + !AA::isPotentiallyReachable(A, I, *Acc.getLocalInst(), QueryingAA, + IsLiveInCalleeCB))) return true; - if (DT && Exact && - (Acc.getLocalInst()->getFunction() == LI.getFunction()) && + if (DT && Exact && (Acc.getLocalInst()->getFunction() == &Scope) && IsSameThreadAsLoad(Acc)) { - if (DT->dominates(Acc.getLocalInst(), &LI)) + if (DT->dominates(Acc.getLocalInst(), &I)) DominatingWrites.insert(&Acc); } } - InterferingWrites.push_back({&Acc, Exact}); + InterferingAccesses.push_back({&Acc, Exact}); return true; }; - if (!State::forallInterferingAccesses(LI, AccessCB)) + if (!State::forallInterferingAccesses(I, AccessCB)) return false; // If we cannot use CFG reasoning we only filter the non-write accesses // and are done here. if (!CanUseCFGResoning) { - for (auto &It : InterferingWrites) + for (auto &It : InterferingAccesses) if (!UserCB(*It.first, It.second)) return false; return true; @@ -1242,47 +1299,52 @@ struct AAPointerInfoImpl return false; }; - // Run the user callback on all writes we cannot skip and return if that + // Run the user callback on all accesses we cannot skip and return if that // succeeded for all or not. 
- unsigned NumInterferingWrites = InterferingWrites.size(); - for (auto &It : InterferingWrites) - if (!DT || NumInterferingWrites > MaxInterferingWrites || - !CanSkipAccess(*It.first, It.second)) + unsigned NumInterferingAccesses = InterferingAccesses.size(); + for (auto &It : InterferingAccesses) { + if (!DT || NumInterferingAccesses > MaxInterferingAccesses || + !CanSkipAccess(*It.first, It.second)) { if (!UserCB(*It.first, It.second)) return false; + } + } return true; } - ChangeStatus translateAndAddCalleeState(Attributor &A, - const AAPointerInfo &CalleeAA, - int64_t CallArgOffset, CallBase &CB) { + ChangeStatus translateAndAddState(Attributor &A, const AAPointerInfo &OtherAA, + int64_t Offset, CallBase &CB, + bool FromCallee = false) { using namespace AA::PointerInfo; - if (!CalleeAA.getState().isValidState() || !isValidState()) + if (!OtherAA.getState().isValidState() || !isValidState()) return indicatePessimisticFixpoint(); - const auto &CalleeImplAA = static_cast(CalleeAA); - bool IsByval = CalleeImplAA.getAssociatedArgument()->hasByValAttr(); + const auto &OtherAAImpl = static_cast(OtherAA); + bool IsByval = + FromCallee && OtherAAImpl.getAssociatedArgument()->hasByValAttr(); // Combine the accesses bin by bin. ChangeStatus Changed = ChangeStatus::UNCHANGED; - for (auto &It : CalleeImplAA.getState()) { + for (auto &It : OtherAAImpl.getState()) { OffsetAndSize OAS = OffsetAndSize::getUnknown(); - if (CallArgOffset != OffsetAndSize::Unknown) - OAS = OffsetAndSize(It.first.getOffset() + CallArgOffset, - It.first.getSize()); - Accesses &Bin = AccessBins[OAS]; - for (const AAPointerInfo::Access &RAcc : It.second) { + if (Offset != OffsetAndSize::Unknown) + OAS = OffsetAndSize(It.first.getOffset() + Offset, It.first.getSize()); + Accesses *Bin = AccessBins.lookup(OAS); + for (const AAPointerInfo::Access &RAcc : *It.second) { if (IsByval && !RAcc.isRead()) continue; bool UsedAssumedInformation = false; - Optional Content = A.translateArgumentToCallSiteContent( - RAcc.getContent(), CB, *this, UsedAssumedInformation); - AccessKind AK = - AccessKind(RAcc.getKind() & (IsByval ? AccessKind::AK_READ - : AccessKind::AK_READ_WRITE)); + AccessKind AK = RAcc.getKind(); + Optional Content = RAcc.getContent(); + if (FromCallee) { + Content = A.translateArgumentToCallSiteContent( + RAcc.getContent(), CB, *this, UsedAssumedInformation); + AK = AccessKind( + AK & (IsByval ? AccessKind::AK_READ : AccessKind::AK_READ_WRITE)); + } Changed = - Changed | addAccess(OAS.getOffset(), OAS.getSize(), CB, Content, AK, - RAcc.getType(), RAcc.getRemoteInst(), &Bin); + Changed | addAccess(A, OAS.getOffset(), OAS.getSize(), CB, Content, + AK, RAcc.getType(), RAcc.getRemoteInst(), Bin); } } return Changed; @@ -1305,7 +1367,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { bool handleAccess(Attributor &A, Instruction &I, Value &Ptr, Optional Content, AccessKind Kind, int64_t Offset, ChangeStatus &Changed, Type *Ty, - int64_t Size = AA::PointerInfo::OffsetAndSize::Unknown) { + int64_t Size = OffsetAndSize::Unknown) { using namespace AA::PointerInfo; // No need to find a size if one is given or the offset is unknown. 
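// translateAndAddState above imports another attribute's access bins into
// this one: each known (offset, size) bin is shifted by the byte offset the
// call site applies to the pointer, while an unknown offset collapses the bin
// into the unknown bin. Sketch of that translation over a plain map, with
// access payloads reduced to counts (names invented):
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

constexpr int64_t Unknown = 1 << 31;
using OffsetAndSize = std::pair<int64_t, int64_t>; // {offset, size}

void translateBins(const std::map<OffsetAndSize, int> &Callee,
                   std::map<OffsetAndSize, int> &Caller, int64_t Offset) {
  for (const auto &It : Callee) {
    OffsetAndSize OAS{Unknown, Unknown};
    if (Offset != Unknown)
      OAS = {It.first.first + Offset, It.first.second};
    Caller[OAS] += It.second; // merge into the (possibly shared) target bin
  }
}

int main() {
  std::map<OffsetAndSize, int> Callee{{{0, 4}, 1}, {{8, 4}, 2}}, Caller;
  translateBins(Callee, Caller, /*Offset=*/16);
  for (const auto &It : Caller) // prints [16,4]:1 and [24,4]:2
    std::cout << "[" << It.first.first << "," << It.first.second
              << "]:" << It.second << "\n";
}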
if (Offset != OffsetAndSize::Unknown && Size == OffsetAndSize::Unknown && @@ -1315,13 +1377,13 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { if (!AccessSize.isScalable()) Size = AccessSize.getFixedSize(); } - Changed = Changed | addAccess(Offset, Size, I, Content, Kind, Ty); + Changed = Changed | addAccess(A, Offset, Size, I, Content, Kind, Ty); return true; }; /// Helper struct, will support ranges eventually. struct OffsetInfo { - int64_t Offset = AA::PointerInfo::OffsetAndSize::Unknown; + int64_t Offset = OffsetAndSize::Unknown; bool operator==(const OffsetInfo &OI) const { return Offset == OI.Offset; } }; @@ -1329,7 +1391,6 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { using namespace AA::PointerInfo; - State S = getState(); ChangeStatus Changed = ChangeStatus::UNCHANGED; Value &AssociatedValue = getAssociatedValue(); @@ -1337,7 +1398,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { DenseMap OffsetInfoMap; OffsetInfoMap[&AssociatedValue] = OffsetInfo{0}; - auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo &PtrOI, + auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo PtrOI, bool &Follow) { OffsetInfo &UsrOI = OffsetInfoMap[Usr]; UsrOI = PtrOI; @@ -1475,8 +1536,8 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { const auto &CSArgPI = A.getAAFor( *this, IRPosition::callsite_argument(*CB, ArgNo), DepClassTy::REQUIRED); - Changed = translateAndAddCalleeState( - A, CSArgPI, OffsetInfoMap[CurPtr].Offset, *CB) | + Changed = translateAndAddState(A, CSArgPI, + OffsetInfoMap[CurPtr].Offset, *CB) | Changed; return true; } @@ -1497,7 +1558,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { }; if (!A.checkForAllUses(UsePred, *this, AssociatedValue, /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL, - EquivalentUseCB)) + /* IgnoreDroppableUses */ true, EquivalentUseCB)) return indicatePessimisticFixpoint(); LLVM_DEBUG({ @@ -1505,15 +1566,19 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { for (auto &It : AccessBins) { dbgs() << "[" << It.first.getOffset() << "-" << It.first.getOffset() + It.first.getSize() - << "] : " << It.getSecond().size() << "\n"; - for (auto &Acc : It.getSecond()) { + << "] : " << It.getSecond()->size() << "\n"; + for (auto &Acc : *It.getSecond()) { dbgs() << " - " << Acc.getKind() << " - " << *Acc.getLocalInst() << "\n"; if (Acc.getLocalInst() != Acc.getRemoteInst()) dbgs() << " --> " << *Acc.getRemoteInst() << "\n"; - if (!Acc.isWrittenValueYetUndetermined()) - dbgs() << " - " << Acc.getWrittenValue() << "\n"; + if (!Acc.isWrittenValueYetUndetermined()) { + if (Acc.getWrittenValue()) + dbgs() << " - c: " << *Acc.getWrittenValue() << "\n"; + else + dbgs() << " - c: \n"; + } } } }); @@ -1576,7 +1641,7 @@ struct AAPointerInfoCallSiteArgument final : AAPointerInfoFloating { LengthVal = Length->getSExtValue(); Value &Ptr = getAssociatedValue(); unsigned ArgNo = getIRPosition().getCallSiteArgNo(); - ChangeStatus Changed; + ChangeStatus Changed = ChangeStatus::UNCHANGED; if (ArgNo == 0) { handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_WRITE, 0, Changed, nullptr, LengthVal); @@ -1601,7 +1666,8 @@ struct AAPointerInfoCallSiteArgument final : AAPointerInfoFloating { const IRPosition &ArgPos = IRPosition::argument(*Arg); auto &ArgAA = A.getAAFor(*this, ArgPos, DepClassTy::REQUIRED); - return translateAndAddCalleeState(A, ArgAA, 0, *cast(getCtxI())); + return translateAndAddState(A, ArgAA, 0, 
*cast(getCtxI()), + /* FromCallee */ true); } /// See AbstractAttribute::trackStatistics() @@ -1619,9 +1685,11 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating { AAPointerInfoImpl::trackPointerInfoStatistics(getIRPosition()); } }; +} // namespace /// -----------------------NoUnwind Function Attribute-------------------------- +namespace { struct AANoUnwindImpl : AANoUnwind { AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {} @@ -1693,9 +1761,11 @@ struct AANoUnwindCallSite final : AANoUnwindImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); } }; +} // namespace /// --------------------- Function Return Values ------------------------------- +namespace { /// "Attribute" that collects all potential returned values and the return /// instructions that they arise from. /// @@ -1821,7 +1891,7 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Check if we have an assumed unique return value that we could manifest. Optional UniqueRV = getAssumedUniqueReturnValue(A); - if (!UniqueRV.hasValue() || !UniqueRV.getValue()) + if (!UniqueRV || !UniqueRV.getValue()) return Changed; // Bookkeeping. @@ -1893,17 +1963,18 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { return true; }; + bool UsedAssumedInformation = false; auto ReturnInstCB = [&](Instruction &I) { ReturnInst &Ret = cast(I); return genericValueTraversal( A, IRPosition::value(*Ret.getReturnValue()), *this, Ret, ReturnValueCB, - &I, /* UseValueSimplify */ true, /* MaxValues */ 16, - /* StripCB */ nullptr, /* Intraprocedural */ true); + &I, UsedAssumedInformation, /* UseValueSimplify */ true, + /* MaxValues */ 16, + /* StripCB */ nullptr, AA::Intraprocedural); }; // Discover returned values from all live returned instructions in the // associated function. - bool UsedAssumedInformation = false; if (!A.checkForAllInstructions(ReturnInstCB, *this, {Instruction::Ret}, UsedAssumedInformation)) return indicatePessimisticFixpoint(); @@ -1941,20 +2012,10 @@ struct AAReturnedValuesCallSite final : AAReturnedValuesImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override {} }; +} // namespace /// ------------------------ NoSync Function Attribute ------------------------- -struct AANoSyncImpl : AANoSync { - AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {} - - const std::string getAsStr() const override { - return getAssumed() ? "nosync" : "may-sync"; - } - - /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; -}; - bool AANoSync::isNonRelaxedAtomic(const Instruction *I) { if (!I->isAtomic()) return false; @@ -1997,6 +2058,18 @@ bool AANoSync::isNoSyncIntrinsic(const Instruction *I) { return false; } +namespace { +struct AANoSyncImpl : AANoSync { + AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {} + + const std::string getAsStr() const override { + return getAssumed() ? "nosync" : "may-sync"; + } + + /// See AbstractAttribute::updateImpl(...). 
+ ChangeStatus updateImpl(Attributor &A) override; +}; + ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { auto CheckRWInstForNoSync = [&](Instruction &I) { @@ -2059,9 +2132,11 @@ struct AANoSyncCallSite final : AANoSyncImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); } }; +} // namespace /// ------------------------ No-Free Attributes ---------------------------- +namespace { struct AANoFreeImpl : public AANoFree { AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {} @@ -2243,8 +2318,10 @@ struct AANoFreeCallSiteReturned final : AANoFreeFloating { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) } }; +} // namespace /// ------------------------ NonNull Argument Attribute ------------------------ +namespace { static int64_t getKnownNonNullAndDerefBytesForUse( Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) { @@ -2332,7 +2409,7 @@ struct AANonNullImpl : AANonNull { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Value &V = getAssociatedValue(); + Value &V = *getAssociatedValue().stripPointerCasts(); if (!NullIsDefined && hasAttr({Attribute::NonNull, Attribute::Dereferenceable}, /* IgnoreSubsumingPositions */ false, &A)) { @@ -2356,7 +2433,7 @@ struct AANonNullImpl : AANonNull { } } - if (isa(&getAssociatedValue())) { + if (isa(V)) { indicatePessimisticFixpoint(); return; } @@ -2419,8 +2496,10 @@ struct AANonNullFloating : public AANonNullImpl { }; StateType T; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, T, - VisitValueCB, getCtxI())) + VisitValueCB, getCtxI(), + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return clampStateAndIndicateChange(getState(), T); @@ -2472,9 +2551,11 @@ struct AANonNullCallSiteReturned final /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) } }; +} // namespace /// ------------------------ No-Recurse Attributes ---------------------------- +namespace { struct AANoRecurseImpl : public AANoRecurse { AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {} @@ -2498,14 +2579,15 @@ struct AANoRecurseFunction final : AANoRecurseImpl { DepClassTy::NONE); return NoRecurseAA.isKnownNoRecurse(); }; - bool AllCallSitesKnown; - if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) { + bool UsedAssumedInformation = false; + if (A.checkForAllCallSites(CallSitePred, *this, true, + UsedAssumedInformation)) { // If we know all call sites and all are known no-recurse, we are done. // If all known call sites, which might not be all that exist, are known // to be no-recurse, we are not done but we can continue to assume // no-recurse. If one of the call sites we have not visited will become // live, another update is triggered. 
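// The change right below is one instance of a migration running through this
// whole patch: the AllCallSitesKnown out-parameter becomes
// UsedAssumedInformation, i.e. "could this answer still change?". An
// optimistic fixpoint may only be locked in when the positive answer rested
// on no assumed facts. Toy model of that rule (invented types, not the
// Attributor API):
#include <iostream>
#include <vector>

struct Answer {
  bool Holds;   // does the predicate hold at this call site?
  bool Assumed; // did the answer rely on information that may still change?
};

bool checkAllCallSites(const std::vector<Answer> &As,
                       bool &UsedAssumedInformation) {
  for (const Answer &A : As) {
    if (!A.Holds)
      return false;
    UsedAssumedInformation |= A.Assumed;
  }
  return true;
}

int main() {
  bool UsedAssumed = false;
  std::vector<Answer> As{{true, false}, {true, true}};
  if (checkAllCallSites(As, UsedAssumed) && !UsedAssumed)
    std::cout << "indicateOptimisticFixpoint()\n"; // not taken here
  else
    std::cout << "stay optimistic, keep iterating\n"; // one answer was assumed
}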
- if (AllCallSitesKnown) + if (!UsedAssumedInformation) indicateOptimisticFixpoint(); return ChangeStatus::UNCHANGED; } @@ -2549,9 +2631,11 @@ struct AANoRecurseCallSite final : AANoRecurseImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); } }; +} // namespace /// -------------------- Undefined-Behavior Attributes ------------------------ +namespace { struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A) : AAUndefinedBehavior(IRP, A) {} @@ -2582,7 +2666,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { // Either we stopped and the appropriate action was taken, // or we got back a simplified value to continue. Optional SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I); - if (!SimplifiedPtrOp.hasValue() || !SimplifiedPtrOp.getValue()) + if (!SimplifiedPtrOp || !SimplifiedPtrOp.getValue()) return true; const Value *PtrOpVal = SimplifiedPtrOp.getValue(); @@ -2627,7 +2711,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { // or we got back a simplified value to continue. Optional SimplifiedCond = stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst); - if (!SimplifiedCond.hasValue() || !SimplifiedCond.getValue()) + if (!SimplifiedCond || !*SimplifiedCond) return true; AssumedNoUBInsts.insert(&I); return true; @@ -2673,10 +2757,9 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { IRPosition::value(*ArgVal), *this, UsedAssumedInformation); if (UsedAssumedInformation) continue; - if (SimplifiedVal.hasValue() && !SimplifiedVal.getValue()) + if (SimplifiedVal && !SimplifiedVal.getValue()) return true; - if (!SimplifiedVal.hasValue() || - isa(*SimplifiedVal.getValue())) { + if (!SimplifiedVal || isa(*SimplifiedVal.getValue())) { KnownUBInsts.insert(&I); continue; } @@ -2691,40 +2774,38 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { return true; }; - auto InspectReturnInstForUB = - [&](Value &V, const SmallSetVector RetInsts) { - // Check if a return instruction always cause UB or not - // Note: It is guaranteed that the returned position of the anchor - // scope has noundef attribute when this is called. - // We also ensure the return position is not "assumed dead" - // because the returned value was then potentially simplified to - // `undef` in AAReturnedValues without removing the `noundef` - // attribute yet. - - // When the returned position has noundef attriubte, UB occur in the - // following cases. - // (1) Returned value is known to be undef. - // (2) The value is known to be a null pointer and the returned - // position has nonnull attribute (because the returned value is - // poison). - bool FoundUB = false; - if (isa(V)) { - FoundUB = true; - } else { - if (isa(V)) { - auto &NonNullAA = A.getAAFor( - *this, IRPosition::returned(*getAnchorScope()), - DepClassTy::NONE); - if (NonNullAA.isKnownNonNull()) - FoundUB = true; - } - } + auto InspectReturnInstForUB = [&](Instruction &I) { + auto &RI = cast(I); + // Either we stopped and the appropriate action was taken, + // or we got back a simplified return value to continue. 
+ Optional SimplifiedRetValue = + stopOnUndefOrAssumed(A, RI.getReturnValue(), &I); + if (!SimplifiedRetValue || !*SimplifiedRetValue) + return true; - if (FoundUB) - for (ReturnInst *RI : RetInsts) - KnownUBInsts.insert(RI); - return true; - }; + // Check if a return instruction always cause UB or not + // Note: It is guaranteed that the returned position of the anchor + // scope has noundef attribute when this is called. + // We also ensure the return position is not "assumed dead" + // because the returned value was then potentially simplified to + // `undef` in AAReturnedValues without removing the `noundef` + // attribute yet. + + // When the returned position has noundef attriubte, UB occurs in the + // following cases. + // (1) Returned value is known to be undef. + // (2) The value is known to be a null pointer and the returned + // position has nonnull attribute (because the returned value is + // poison). + if (isa(*SimplifiedRetValue)) { + auto &NonNullAA = A.getAAFor( + *this, IRPosition::returned(*getAnchorScope()), DepClassTy::NONE); + if (NonNullAA.isKnownNonNull()) + KnownUBInsts.insert(&I); + } + + return true; + }; bool UsedAssumedInformation = false; A.checkForAllInstructions(InspectMemAccessInstForUB, *this, @@ -2747,8 +2828,9 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { auto &RetPosNoUndefAA = A.getAAFor(*this, ReturnIRP, DepClassTy::NONE); if (RetPosNoUndefAA.isKnownNoUndef()) - A.checkForAllReturnedValuesAndReturnInsts(InspectReturnInstForUB, - *this); + A.checkForAllInstructions(InspectReturnInstForUB, *this, + {Instruction::Ret}, UsedAssumedInformation, + /* CheckBBLivenessOnly */ true); } } @@ -2776,7 +2858,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { case Instruction::AtomicRMW: return !AssumedNoUBInsts.count(I); case Instruction::Br: { - auto BrInst = cast(I); + auto *BrInst = cast(I); if (BrInst->isUnconditional()) return false; return !AssumedNoUBInsts.count(I); @@ -2847,13 +2929,13 @@ private: IRPosition::value(*V), *this, UsedAssumedInformation); if (!UsedAssumedInformation) { // Don't depend on assumed values. - if (!SimplifiedV.hasValue()) { + if (!SimplifiedV) { // If it is known (which we tested above) but it doesn't have a value, // then we can assume `undef` and hence the instruction is UB. KnownUBInsts.insert(I); return llvm::None; } - if (!SimplifiedV.getValue()) + if (!*SimplifiedV) return nullptr; V = *SimplifiedV; } @@ -2877,9 +2959,11 @@ struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl { KnownUBInsts.size(); } }; +} // namespace /// ------------------------ Will-Return Attributes ---------------------------- +namespace { // Helper function that checks whether a function has any cycle which we don't // know if it is bounded or not. // Loops with maximum trip count are considered bounded, any other cycle not. @@ -3018,9 +3102,11 @@ struct AAWillReturnCallSite final : AAWillReturnImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); } }; +} // namespace /// -------------------AAReachability Attribute-------------------------- +namespace { struct AAReachabilityImpl : AAReachability { AAReachabilityImpl(const IRPosition &IRP, Attributor &A) : AAReachability(IRP, A) {} @@ -3032,10 +3118,6 @@ struct AAReachabilityImpl : AAReachability { /// See AbstractAttribute::updateImpl(...). 
ChangeStatus updateImpl(Attributor &A) override { - const auto &NoRecurseAA = A.getAAFor( - *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!NoRecurseAA.isAssumedNoRecurse()) - return indicatePessimisticFixpoint(); return ChangeStatus::UNCHANGED; } }; @@ -3047,9 +3129,11 @@ struct AAReachabilityFunction final : public AAReachabilityImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); } }; +} // namespace /// ------------------------ NoAlias Argument Attribute ------------------------ +namespace { struct AANoAliasImpl : AANoAlias { AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) { assert(getAssociatedType()->isPointerTy() && @@ -3146,10 +3230,10 @@ struct AANoAliasArgument final // If the argument is never passed through callbacks, no-alias cannot break // synchronization. - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (A.checkForAllCallSites( [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this, - true, AllCallSitesKnown)) + true, UsedAssumedInformation)) return Base::updateImpl(A); // TODO: add no-alias but make sure it doesn't break synchronization by @@ -3246,14 +3330,20 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { return false; } + auto IsDereferenceableOrNull = [&](Value *O, const DataLayout &DL) { + const auto &DerefAA = A.getAAFor( + *this, IRPosition::value(*O), DepClassTy::OPTIONAL); + return DerefAA.getAssumedDereferenceableBytes(); + }; + A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL); const IRPosition &VIRP = IRPosition::value(getAssociatedValue()); const Function *ScopeFn = VIRP.getAnchorScope(); auto &NoCaptureAA = A.getAAFor(*this, VIRP, DepClassTy::NONE); // Check whether the value is captured in the scope using AANoCapture. - // Look at CFG and check only uses possibly executed before this - // callsite. + // Look at CFG and check only uses possibly executed before this + // callsite. auto UsePred = [&](const Use &U, bool &Follow) -> bool { Instruction *UserI = cast(U.getUser()); @@ -3265,12 +3355,6 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { return true; if (ScopeFn) { - const auto &ReachabilityAA = A.getAAFor( - *this, IRPosition::function(*ScopeFn), DepClassTy::OPTIONAL); - - if (!ReachabilityAA.isAssumedReachable(A, *UserI, *getCtxI())) - return true; - if (auto *CB = dyn_cast(UserI)) { if (CB->isArgOperand(&U)) { @@ -3284,17 +3368,26 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { return true; } } + + if (!AA::isPotentiallyReachable(A, *UserI, *getCtxI(), *this)) + return true; } - // For cases which can potentially have more users - if (isa(U) || isa(U) || isa(U) || - isa(U)) { + // TODO: We should track the capturing uses in AANoCapture but the problem + // is CGSCC runs. For those we would need to "allow" AANoCapture for + // a value in the module slice. 
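// The switch below replaces an ad-hoc isa<> list with the three-way
// DetermineUseCaptureKind classification. The control-flow pattern in
// isolation, with a toy classifier standing in for LLVM's capture tracking:
#include <iostream>

enum class UseCaptureKind { NO_CAPTURE, MAY_CAPTURE, PASSTHROUGH };

// 'c' = comparison-like benign use, 'g' = gep/cast/phi-like passthrough,
// anything else may capture. (Invented encoding, for the sketch only.)
UseCaptureKind classifyUse(char UseKind) {
  switch (UseKind) {
  case 'c': return UseCaptureKind::NO_CAPTURE;
  case 'g': return UseCaptureKind::PASSTHROUGH;
  default:  return UseCaptureKind::MAY_CAPTURE;
  }
}

// Visit one use: true = keep scanning, false = the value may be captured;
// Follow requests traversal of the user's own uses.
bool visitUse(char UseKind, bool &Follow) {
  switch (classifyUse(UseKind)) {
  case UseCaptureKind::NO_CAPTURE:
    return true;
  case UseCaptureKind::MAY_CAPTURE:
    return false;
  case UseCaptureKind::PASSTHROUGH:
    Follow = true;
    return true;
  }
  return false; // unreachable; silences -Wreturn-type
}

int main() {
  bool Follow = false;
  std::cout << visitUse('g', Follow) << Follow << "\n"; // prints 11
}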
+ switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + return true; + case UseCaptureKind::MAY_CAPTURE: + LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI + << "\n"); + return false; + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - - LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n"); - return false; + llvm_unreachable("unknown UseCaptureKind"); }; if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) { @@ -3423,12 +3516,21 @@ struct AANoAliasCallSiteReturned final : AANoAliasImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); } }; +} // namespace /// -------------------AAIsDead Function Attribute----------------------- +namespace { struct AAIsDeadValueImpl : public AAIsDead { AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (auto *Scope = getAnchorScope()) + if (!A.isRunOn(*Scope)) + indicatePessimisticFixpoint(); + } + /// See AAIsDead::isAssumedDead(). bool isAssumedDead() const override { return isAssumed(IS_DEAD); } @@ -3452,22 +3554,25 @@ struct AAIsDeadValueImpl : public AAIsDead { } /// See AbstractAttribute::getAsStr(). - const std::string getAsStr() const override { + virtual const std::string getAsStr() const override { return isAssumedDead() ? "assumed-dead" : "assumed-live"; } /// Check if all uses are assumed dead. bool areAllUsesAssumedDead(Attributor &A, Value &V) { // Callers might not check the type, void has no uses. - if (V.getType()->isVoidTy()) + if (V.getType()->isVoidTy() || V.use_empty()) return true; // If we replace a value with a constant there are no uses left afterwards. if (!isa(V)) { + if (auto *I = dyn_cast(&V)) + if (!A.isRunOn(*I->getFunction())) + return false; bool UsedAssumedInformation = false; Optional C = A.getAssumedConstant(V, *this, UsedAssumedInformation); - if (!C.hasValue() || *C) + if (!C || *C) return true; } @@ -3477,7 +3582,8 @@ struct AAIsDeadValueImpl : public AAIsDead { // without going through N update cycles. This is not required for // correctness. return A.checkForAllUses(UsePred, *this, V, /* CheckBBLivenessOnly */ false, - DepClassTy::REQUIRED); + DepClassTy::REQUIRED, + /* IgnoreDroppableUses */ false); } /// Determine if \p I is assumed to be side-effect free. @@ -3508,6 +3614,8 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { + AAIsDeadValueImpl::initialize(A); + if (isa(getAssociatedValue())) { indicatePessimisticFixpoint(); return; @@ -3538,6 +3646,15 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { }); } + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + Instruction *I = dyn_cast(&getAssociatedValue()); + if (isa_and_nonnull(I)) + if (isValidState()) + return "assumed-dead-store"; + return AAIsDeadValueImpl::getAsStr(); + } + /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { Instruction *I = dyn_cast(&getAssociatedValue()); @@ -3553,6 +3670,10 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { return ChangeStatus::UNCHANGED; } + bool isRemovableStore() const override { + return isAssumed(IS_REMOVABLE) && isa(&getAssociatedValue()); + } + /// See AbstractAttribute::manifest(...). 
ChangeStatus manifest(Attributor &A) override { Value &V = getAssociatedValue(); @@ -3567,21 +3688,7 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl { return ChangeStatus::CHANGED; } } - if (V.use_empty()) - return ChangeStatus::UNCHANGED; - - bool UsedAssumedInformation = false; - Optional C = - A.getAssumedConstant(V, *this, UsedAssumedInformation); - if (C.hasValue() && C.getValue()) - return ChangeStatus::UNCHANGED; - - // Replace the value with undef as it is dead but keep droppable uses around - // as they provide information we don't want to give up on just yet. - UndefValue &UV = *UndefValue::get(V.getType()); - bool AnyChange = - A.changeValueAfterManifest(V, UV, /* ChangeDropppable */ false); - return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + return ChangeStatus::UNCHANGED; } /// See AbstractAttribute::trackStatistics() @@ -3596,23 +3703,22 @@ struct AAIsDeadArgument : public AAIsDeadFloating { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { + AAIsDeadFloating::initialize(A); if (!A.isFunctionIPOAmendable(*getAnchorScope())) indicatePessimisticFixpoint(); } /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { - ChangeStatus Changed = AAIsDeadFloating::manifest(A); Argument &Arg = *getAssociatedArgument(); if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {})) if (A.registerFunctionSignatureRewrite( Arg, /* ReplacementTypes */ {}, Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{}, Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) { - Arg.dropDroppableUses(); return ChangeStatus::CHANGED; } - return Changed; + return ChangeStatus::UNCHANGED; } /// See AbstractAttribute::trackStatistics() @@ -3625,6 +3731,7 @@ struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { + AAIsDeadValueImpl::initialize(A); if (isa(getAssociatedValue())) indicatePessimisticFixpoint(); } @@ -3661,7 +3768,7 @@ struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A) - : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {} + : AAIsDeadFloating(IRP, A) {} /// See AAIsDead::isAssumedDead(). bool isAssumedDead() const override { @@ -3670,6 +3777,7 @@ struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { + AAIsDeadFloating::initialize(A); if (isa(getAssociatedValue())) { indicatePessimisticFixpoint(); return; @@ -3707,7 +3815,7 @@ struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { } private: - bool IsAssumedSideEffectFree; + bool IsAssumedSideEffectFree = true; }; struct AAIsDeadReturned : public AAIsDeadValueImpl { @@ -3727,9 +3835,8 @@ struct AAIsDeadReturned : public AAIsDeadValueImpl { return areAllUsesAssumedDead(A, *ACS.getInstruction()); }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(PredForCallSite, *this, true, - AllCallSitesKnown)) + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return ChangeStatus::UNCHANGED; @@ -3761,17 +3868,13 @@ struct AAIsDeadFunction : public AAIsDead { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - const Function *F = getAnchorScope(); - if (F && !F->isDeclaration()) { - // We only want to compute liveness once. If the function is not part of - // the SCC, skip it. 
-    if (A.isRunOn(*const_cast<Function *>(F))) {
-      ToBeExploredFrom.insert(&F->getEntryBlock().front());
-      assumeLive(A, F->getEntryBlock());
-    } else {
-      indicatePessimisticFixpoint();
-    }
+    Function *F = getAnchorScope();
+    if (!F || F->isDeclaration() || !A.isRunOn(*F)) {
+      indicatePessimisticFixpoint();
+      return;
     }
+    ToBeExploredFrom.insert(&F->getEntryBlock().front());
+    assumeLive(A, F->getEntryBlock());
   }
 
   /// See AbstractAttribute::getAsStr().
@@ -3834,6 +3937,9 @@ struct AAIsDeadFunction : public AAIsDead {
   ChangeStatus updateImpl(Attributor &A) override;
 
   bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const override {
+    assert(From->getParent() == getAnchorScope() &&
+           To->getParent() == getAnchorScope() &&
+           "Used AAIsDead of the wrong function");
     return isValidState() && !AssumedLiveEdges.count(std::make_pair(From, To));
   }
 
@@ -3973,7 +4079,7 @@ identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
   } else {
     Optional<Constant *> C =
         A.getAssumedConstant(*BI.getCondition(), AA, UsedAssumedInformation);
-    if (!C.hasValue() || isa_and_nonnull<UndefValue>(C.getValue())) {
+    if (!C || isa_and_nonnull<UndefValue>(*C)) {
       // No value yet, assume both edges are dead.
     } else if (isa_and_nonnull<ConstantInt>(*C)) {
       const BasicBlock *SuccBB =
@@ -3995,7 +4101,7 @@ identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
   bool UsedAssumedInformation = false;
   Optional<Constant *> C =
       A.getAssumedConstant(*SI.getCondition(), AA, UsedAssumedInformation);
-  if (!C.hasValue() || isa_and_nonnull<UndefValue>(C.getValue())) {
+  if (!C || isa_and_nonnull<UndefValue>(C.getValue())) {
     // No value yet, assume all edges are dead.
   } else if (isa_and_nonnull<ConstantInt>(C.getValue())) {
     for (auto &CaseIt : SI.cases()) {
@@ -4142,9 +4248,11 @@ struct AAIsDeadCallSite final : AAIsDeadFunction {
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override {}
 };
+} // namespace
 
 /// -------------------- Dereferenceable Argument Attribute --------------------
 
+namespace {
 struct AADereferenceableImpl : AADereferenceable {
   AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
       : AADereferenceable(IRP, A) {}
 
   /// See AbstractAttribute::initialize(...).
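// Illustrative sketch, not part of the vendored diff: AAIsDeadFunction seeds
// its search at the entry instruction and only explores edges it has proven
// may execute; blocks never reached stay "assumed dead". The exploration in
// std-only form, with ints standing in for basic blocks:
#include <map>
#include <set>
#include <vector>

using SketchBlock = int;
// Map from block to the successors currently assumed live.
using SketchCFG = std::map<SketchBlock, std::vector<SketchBlock>>;

inline std::set<SketchBlock> assumedLiveBlocks(const SketchCFG &LiveSucc,
                                               SketchBlock Entry) {
  std::set<SketchBlock> Live{Entry};
  std::vector<SketchBlock> Worklist{Entry};
  while (!Worklist.empty()) {
    SketchBlock B = Worklist.back();
    Worklist.pop_back();
    auto It = LiveSucc.find(B);
    if (It == LiveSucc.end())
      continue;
    for (SketchBlock Succ : It->second)
      if (Live.insert(Succ).second) // first time this block is reached
        Worklist.push_back(Succ);
  }
  return Live;
}
// End of sketch.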
void initialize(Attributor &A) override { + Value &V = *getAssociatedValue().stripPointerCasts(); SmallVector Attrs; getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull}, Attrs, /* IgnoreSubsumingPositions */ false, &A); @@ -4162,9 +4271,8 @@ struct AADereferenceableImpl : AADereferenceable { NonNullAA = &A.getAAFor(*this, IRP, DepClassTy::NONE); bool CanBeNull, CanBeFreed; - takeKnownDerefBytesMaximum( - IRP.getAssociatedValue().getPointerDereferenceableBytes( - A.getDataLayout(), CanBeNull, CanBeFreed)); + takeKnownDerefBytesMaximum(V.getPointerDereferenceableBytes( + A.getDataLayout(), CanBeNull, CanBeFreed)); bool IsFnInterface = IRP.isFnInterfaceKind(); Function *FnScope = IRP.getAnchorScope(); @@ -4263,8 +4371,9 @@ struct AADereferenceableFloating : AADereferenceableImpl { unsigned IdxWidth = DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace()); APInt Offset(IdxWidth, 0); - const Value *Base = - stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false); + const Value *Base = stripAndAccumulateOffsets( + A, *this, &V, DL, Offset, /* GetMinOffset */ false, + /* AllowNonInbounds */ true); const auto &AA = A.getAAFor( *this, IRPosition::value(*Base), DepClassTy::REQUIRED); @@ -4312,8 +4421,10 @@ struct AADereferenceableFloating : AADereferenceableImpl { }; DerefState T; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, T, - VisitValueCB, getCtxI())) + VisitValueCB, getCtxI(), + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return clampStateAndIndicateChange(getState(), T); @@ -4377,9 +4488,11 @@ struct AADereferenceableCallSiteReturned final STATS_DECLTRACK_CS_ATTR(dereferenceable); } }; +} // namespace // ------------------------ Align Argument Attribute ------------------------ +namespace { static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -4450,14 +4563,8 @@ struct AAAlignImpl : AAAlign { for (const Attribute &Attr : Attrs) takeKnownMaximum(Attr.getValueAsInt()); - Value &V = getAssociatedValue(); - // TODO: This is a HACK to avoid getPointerAlignment to introduce a ptr2int - // use of the function pointer. This was caused by D73131. We want to - // avoid this for function pointers especially because we iterate - // their uses and int2ptr is not handled. It is not a correctness - // problem though! 
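// Illustrative sketch, not part of the vendored diff: the dereferenceable
// hunk above walks from a pointer through casts and constant-offset GEPs,
// summing byte offsets until the underlying base is reached (the job of
// stripAndAccumulateOffsets). The same walk over a toy node type:
#include <cstdint>
#include <utility>

struct SketchPtr {
  const SketchPtr *Base = nullptr; // non-null for casts and constant GEPs
  int64_t ByteOffset = 0;          // zero for pure casts
};

inline std::pair<const SketchPtr *, int64_t>
stripAndAccumulateSketch(const SketchPtr *V) {
  int64_t Offset = 0;
  while (V->Base) { // peel one cast/GEP layer per iteration
    Offset += V->ByteOffset;
    V = V->Base;
  }
  return {V, Offset};
}
// End of sketch.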
- if (!V.getType()->getPointerElementType()->isFunctionTy()) - takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value()); + Value &V = *getAssociatedValue().stripPointerCasts(); + takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value()); if (getIRPosition().isFnInterfaceKind() && (!getAnchorScope() || @@ -4479,16 +4586,16 @@ struct AAAlignImpl : AAAlign { for (const Use &U : AssociatedValue.uses()) { if (auto *SI = dyn_cast(U.getUser())) { if (SI->getPointerOperand() == &AssociatedValue) - if (SI->getAlignment() < getAssumedAlign()) { + if (SI->getAlign() < getAssumedAlign()) { STATS_DECLTRACK(AAAlign, Store, "Number of times alignment added to a store"); - SI->setAlignment(Align(getAssumedAlign())); + SI->setAlignment(getAssumedAlign()); LoadStoreChanged = ChangeStatus::CHANGED; } } else if (auto *LI = dyn_cast(U.getUser())) { if (LI->getPointerOperand() == &AssociatedValue) - if (LI->getAlignment() < getAssumedAlign()) { - LI->setAlignment(Align(getAssumedAlign())); + if (LI->getAlign() < getAssumedAlign()) { + LI->setAlignment(getAssumedAlign()); STATS_DECLTRACK(AAAlign, Load, "Number of times alignment added to a load"); LoadStoreChanged = ChangeStatus::CHANGED; @@ -4532,9 +4639,8 @@ struct AAAlignImpl : AAAlign { /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { - return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) + - "-" + std::to_string(getAssumedAlign()) + ">") - : "unknown-align"; + return "align<" + std::to_string(getKnownAlign().value()) + "-" + + std::to_string(getAssumedAlign().value()) + ">"; } }; @@ -4548,6 +4654,8 @@ struct AAAlignFloating : AAAlignImpl { auto VisitValueCB = [&](Value &V, const Instruction *, AAAlign::StateType &T, bool Stripped) -> bool { + if (isa(V) || isa(V)) + return true; const auto &AA = A.getAAFor(*this, IRPosition::value(V), DepClassTy::REQUIRED); if (!Stripped && this == &AA) { @@ -4555,6 +4663,7 @@ struct AAAlignFloating : AAAlignImpl { unsigned Alignment = 1; if (const Value *Base = GetPointerBaseWithConstantOffset(&V, Offset, DL)) { + // TODO: Use AAAlign for the base too. Align PA = Base->getPointerAlignment(DL); // BasePointerAddr + Offset = Alignment * Q for some integer Q. // So we can say that the maximum power of two which is a divisor of @@ -4578,8 +4687,10 @@ struct AAAlignFloating : AAAlignImpl { }; StateType T; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, T, - VisitValueCB, getCtxI())) + VisitValueCB, getCtxI(), + UsedAssumedInformation)) return indicatePessimisticFixpoint(); // TODO: If we know we visited all incoming values, thus no are assumed @@ -4657,7 +4768,7 @@ struct AAAlignCallSiteArgument final : AAAlignFloating { // so we do not need to track a dependence. 
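// Illustrative sketch, not part of the vendored diff: the comment in the
// AAAlignFloating hunk ("BasePointerAddr + Offset = Alignment * Q") boils
// down to this rule: the guaranteed alignment of Base+Offset is the base
// alignment capped by the lowest set bit of the offset. As a checked helper:
#include <cassert>
#include <cstdint>

inline uint64_t alignAfterOffset(uint64_t BaseAlign, int64_t Offset) {
  if (Offset == 0)
    return BaseAlign; // the full base alignment survives
  uint64_t Mag = uint64_t(Offset < 0 ? -Offset : Offset);
  uint64_t OffsetAlign = Mag & (~Mag + 1); // isolate the lowest set bit
  return OffsetAlign < BaseAlign ? OffsetAlign : BaseAlign;
}

inline void alignAfterOffsetDemo() {
  assert(alignAfterOffset(16, 4) == 4);   // 16-aligned base, +4 => 4-aligned
  assert(alignAfterOffset(16, 32) == 16); // +32 keeps the 16-byte guarantee
}
// End of sketch.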
const auto &ArgAlignAA = A.getAAFor( *this, IRPosition::argument(*Arg), DepClassTy::NONE); - takeKnownMaximum(ArgAlignAA.getKnownAlign()); + takeKnownMaximum(ArgAlignAA.getKnownAlign().value()); } return Changed; } @@ -4684,8 +4795,10 @@ struct AAAlignCallSiteReturned final /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; +} // namespace /// ------------------ Function No-Return Attribute ---------------------------- +namespace { struct AANoReturnImpl : public AANoReturn { AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {} @@ -4712,31 +4825,175 @@ struct AANoReturnImpl : public AANoReturn { return indicatePessimisticFixpoint(); return ChangeStatus::UNCHANGED; } -}; - -struct AANoReturnFunction final : AANoReturnImpl { - AANoReturnFunction(const IRPosition &IRP, Attributor &A) - : AANoReturnImpl(IRP, A) {} +}; + +struct AANoReturnFunction final : AANoReturnImpl { + AANoReturnFunction(const IRPosition &IRP, Attributor &A) + : AANoReturnImpl(IRP, A) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) } +}; + +/// NoReturn attribute deduction for a call sites. +struct AANoReturnCallSite final : AANoReturnImpl { + AANoReturnCallSite(const IRPosition &IRP, Attributor &A) + : AANoReturnImpl(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoReturnImpl::initialize(A); + if (Function *F = getAssociatedFunction()) { + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); + if (!FnAA.isAssumedNoReturn()) + indicatePessimisticFixpoint(); + } + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); + return clampStateAndIndicateChange(getState(), FnAA.getState()); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); } +}; +} // namespace + +/// ----------------------- Instance Info --------------------------------- + +namespace { +/// A class to hold the state of for no-capture attributes. +struct AAInstanceInfoImpl : public AAInstanceInfo { + AAInstanceInfoImpl(const IRPosition &IRP, Attributor &A) + : AAInstanceInfo(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + Value &V = getAssociatedValue(); + if (auto *C = dyn_cast(&V)) { + if (C->isThreadDependent()) + indicatePessimisticFixpoint(); + else + indicateOptimisticFixpoint(); + return; + } + if (auto *CB = dyn_cast(&V)) + if (CB->arg_size() == 0 && !CB->mayHaveSideEffects() && + !CB->mayReadFromMemory()) { + indicateOptimisticFixpoint(); + return; + } + } + + /// See AbstractAttribute::updateImpl(...). 
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+    Value &V = getAssociatedValue();
+    const Function *Scope = nullptr;
+    if (auto *I = dyn_cast<Instruction>(&V))
+      Scope = I->getFunction();
+    if (auto *A = dyn_cast<Argument>(&V)) {
+      Scope = A->getParent();
+      if (!Scope->hasLocalLinkage())
+        return Changed;
+    }
+    if (!Scope)
+      return indicateOptimisticFixpoint();
+
+    auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
+        *this, IRPosition::function(*Scope), DepClassTy::OPTIONAL);
+    if (NoRecurseAA.isAssumedNoRecurse())
+      return Changed;
+
+    auto UsePred = [&](const Use &U, bool &Follow) {
+      const Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+      if (!UserI || isa<CastInst>(UserI) || isa<GetElementPtrInst>(UserI) ||
+          isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+        Follow = true;
+        return true;
+      }
+      if (isa<LoadInst>(UserI) || isa<CmpInst>(UserI) ||
+          (isa<StoreInst>(UserI) &&
+           cast<StoreInst>(UserI)->getValueOperand() != U.get()))
+        return true;
+      if (auto *CB = dyn_cast<CallBase>(UserI)) {
+        // This check is not guaranteeing uniqueness but for now that we cannot
+        // end up with two versions of \p U thinking it was one.
+        if (!CB->getCalledFunction() ||
+            !CB->getCalledFunction()->hasLocalLinkage())
+          return true;
+        if (!CB->isArgOperand(&U))
+          return false;
+        const auto &ArgInstanceInfoAA = A.getAAFor<AAInstanceInfo>(
+            *this, IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U)),
+            DepClassTy::OPTIONAL);
+        if (!ArgInstanceInfoAA.isAssumedUniqueForAnalysis())
+          return false;
+        // If this call base might reach the scope again we might forward the
+        // argument back here. This is very conservative.
+        if (AA::isPotentiallyReachable(A, *CB, *Scope, *this, nullptr))
+          return false;
+        return true;
+      }
+      return false;
+    };
+
+    auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) {
+      if (auto *SI = dyn_cast<StoreInst>(OldU.getUser())) {
+        auto *Ptr = SI->getPointerOperand()->stripPointerCasts();
+        if (isa<AllocaInst>(Ptr) && AA::isDynamicallyUnique(A, *this, *Ptr))
+          return true;
+        auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(
+            *SI->getFunction());
+        if (isAllocationFn(Ptr, TLI) && AA::isDynamicallyUnique(A, *this, *Ptr))
+          return true;
+      }
+      return false;
+    };
+
+    if (!A.checkForAllUses(UsePred, *this, V, /* CheckBBLivenessOnly */ true,
+                           DepClassTy::OPTIONAL,
+                           /* IgnoreDroppableUses */ true, EquivalentUseCB))
+      return indicatePessimisticFixpoint();
+
+    return Changed;
+  }
+
+  /// See AbstractState::getAsStr().
+  const std::string getAsStr() const override {
+    return isAssumedUniqueForAnalysis() ? "<unique [fAa]>" : "<unknown>";
+  }
 
   /// See AbstractAttribute::trackStatistics()
-  void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
+  void trackStatistics() const override {}
 };
 
-/// NoReturn attribute deduction for a call sites.
-struct AANoReturnCallSite final : AANoReturnImpl {
-  AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
-      : AANoReturnImpl(IRP, A) {}
+/// InstanceInfo attribute for floating values.
+struct AAInstanceInfoFloating : AAInstanceInfoImpl {
+  AAInstanceInfoFloating(const IRPosition &IRP, Attributor &A)
+      : AAInstanceInfoImpl(IRP, A) {}
+};
 
-  /// See AbstractAttribute::initialize(...).
-  void initialize(Attributor &A) override {
-    AANoReturnImpl::initialize(A);
-    if (Function *F = getAssociatedFunction()) {
-      const IRPosition &FnPos = IRPosition::function(*F);
-      auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos, DepClassTy::REQUIRED);
-      if (!FnAA.isAssumedNoReturn())
-        indicatePessimisticFixpoint();
-    }
-  }
+/// NoCapture attribute for function arguments.
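// Illustrative sketch, not part of the vendored diff: all of these abstract
// attributes plug into one driver that re-runs updateImpl until nothing
// changes. Stripped of dependence tracking, the fixpoint loop is just:
#include <vector>

enum class SketchStatus { UNCHANGED, CHANGED };

struct SketchAA {
  virtual ~SketchAA() = default;
  virtual SketchStatus update() = 0; // one monotone step per round
};

// Iterate all attributes until a full round makes no progress; the real
// Attributor also bounds the iteration count and only re-runs attributes
// whose recorded dependences changed.
inline void runToFixpoint(std::vector<SketchAA *> &AAs, unsigned MaxRounds) {
  for (unsigned Round = 0; Round < MaxRounds; ++Round) {
    bool Progress = false;
    for (SketchAA *AA : AAs)
      Progress |= AA->update() == SketchStatus::CHANGED;
    if (!Progress)
      break;
  }
}
// End of sketch.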
+struct AAInstanceInfoArgument final : AAInstanceInfoFloating { + AAInstanceInfoArgument(const IRPosition &IRP, Attributor &A) + : AAInstanceInfoFloating(IRP, A) {} +}; + +/// InstanceInfo attribute for call site arguments. +struct AAInstanceInfoCallSiteArgument final : AAInstanceInfoImpl { + AAInstanceInfoCallSiteArgument(const IRPosition &IRP, Attributor &A) + : AAInstanceInfoImpl(IRP, A) {} /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { @@ -4744,18 +5001,44 @@ struct AANoReturnCallSite final : AANoReturnImpl { // call site specific liveness information and then it makes // sense to specialize attributes for call sites arguments instead of // redirecting requests to the callee argument. - Function *F = getAssociatedFunction(); - const IRPosition &FnPos = IRPosition::function(*F); - auto &FnAA = A.getAAFor(*this, FnPos, DepClassTy::REQUIRED); - return clampStateAndIndicateChange(getState(), FnAA.getState()); + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = + A.getAAFor(*this, ArgPos, DepClassTy::REQUIRED); + return clampStateAndIndicateChange(getState(), ArgAA.getState()); + } +}; + +/// InstanceInfo attribute for function return value. +struct AAInstanceInfoReturned final : AAInstanceInfoImpl { + AAInstanceInfoReturned(const IRPosition &IRP, Attributor &A) + : AAInstanceInfoImpl(IRP, A) { + llvm_unreachable("InstanceInfo is not applicable to function returns!"); } - /// See AbstractAttribute::trackStatistics() - void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); } + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + llvm_unreachable("InstanceInfo is not applicable to function returns!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("InstanceInfo is not applicable to function returns!"); + } +}; + +/// InstanceInfo attribute deduction for a call site return value. +struct AAInstanceInfoCallSiteReturned final : AAInstanceInfoFloating { + AAInstanceInfoCallSiteReturned(const IRPosition &IRP, Attributor &A) + : AAInstanceInfoFloating(IRP, A) {} }; +} // namespace /// ----------------------- Variable Capturing --------------------------------- +namespace { /// A class to hold the state of for no-capture attributes. struct AANoCaptureImpl : public AANoCapture { AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {} @@ -4863,143 +5146,69 @@ struct AANoCaptureImpl : public AANoCapture { return "assumed not-captured-maybe-returned"; return "assumed-captured"; } -}; - -/// Attributor-aware capture tracker. -struct AACaptureUseTracker final : public CaptureTracker { - - /// Create a capture tracker that can lookup in-flight abstract attributes - /// through the Attributor \p A. - /// - /// If a use leads to a potential capture, \p CapturedInMemory is set and the - /// search is stopped. If a use leads to a return instruction, - /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed. - /// If a use leads to a ptr2int which may capture the value, - /// \p CapturedInInteger is set. If a use is found that is currently assumed - /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies - /// set. All values in \p PotentialCopies are later tracked as well. For every - /// explored use we decrement \p RemainingUsesToExplore. 
Once it reaches 0, - /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger - /// conservatively set to true. - AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA, - const AAIsDead &IsDeadAA, AANoCapture::StateType &State, - SmallSetVector &PotentialCopies, - unsigned &RemainingUsesToExplore) - : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State), - PotentialCopies(PotentialCopies), - RemainingUsesToExplore(RemainingUsesToExplore) {} - - /// Determine if \p V maybe captured. *Also updates the state!* - bool valueMayBeCaptured(const Value *V) { - if (V->getType()->isPointerTy()) { - PointerMayBeCaptured(V, this); - } else { - State.indicatePessimisticFixpoint(); - } - return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); - } - - /// See CaptureTracker::tooManyUses(). - void tooManyUses() override { - State.removeAssumedBits(AANoCapture::NO_CAPTURE); - } - - bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override { - if (CaptureTracker::isDereferenceableOrNull(O, DL)) - return true; - const auto &DerefAA = A.getAAFor( - NoCaptureAA, IRPosition::value(*O), DepClassTy::OPTIONAL); - return DerefAA.getAssumedDereferenceableBytes(); - } - - /// See CaptureTracker::captured(...). - bool captured(const Use *U) override { - Instruction *UInst = cast(U->getUser()); - LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst - << "\n"); - // Because we may reuse the tracker multiple times we keep track of the - // number of explored uses ourselves as well. - if (RemainingUsesToExplore-- == 0) { - LLVM_DEBUG(dbgs() << " - too many uses to explore!\n"); - return isCapturedIn(/* Memory */ true, /* Integer */ true, - /* Return */ true); - } + /// Check the use \p U and update \p State accordingly. Return true if we + /// should continue to update the state. + bool checkUse(Attributor &A, AANoCapture::StateType &State, const Use &U, + bool &Follow) { + Instruction *UInst = cast(U.getUser()); + LLVM_DEBUG(dbgs() << "[AANoCapture] Check use: " << *U.get() << " in " + << *UInst << "\n"); // Deal with ptr2int by following uses. if (isa(UInst)) { LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n"); - return valueMayBeCaptured(UInst); + return isCapturedIn(State, /* Memory */ true, /* Integer */ true, + /* Return */ true); } - // For stores we check if we can follow the value through memory or not. - if (auto *SI = dyn_cast(UInst)) { - if (SI->isVolatile()) - return isCapturedIn(/* Memory */ true, /* Integer */ false, - /* Return */ false); - bool UsedAssumedInformation = false; - if (!AA::getPotentialCopiesOfStoredValue( - A, *SI, PotentialCopies, NoCaptureAA, UsedAssumedInformation)) - return isCapturedIn(/* Memory */ true, /* Integer */ false, - /* Return */ false); - // Not captured directly, potential copies will be checked. - return isCapturedIn(/* Memory */ false, /* Integer */ false, + // For stores we already checked if we can follow them, if they make it + // here we give up. + if (isa(UInst)) + return isCapturedIn(State, /* Memory */ true, /* Integer */ false, /* Return */ false); - } // Explicitly catch return instructions. 
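// Illustrative sketch, not part of the vendored diff: the rewritten checkUse
// above classifies each user into the three capture routes (memory, integer,
// return). The same decision table over a stand-in enum; the real code
// consults the callee argument's AANoCapture state instead of a plain bool:
struct SketchCaps { bool Mem = false, Int = false, Ret = false; };

enum class SketchUserKind { PtrToInt, Store, ReturnInOwnFn, CallArg, Other };

inline SketchCaps capturesOf(SketchUserKind K, bool CalleeArgNoCapture) {
  switch (K) {
  case SketchUserKind::PtrToInt:      // the address escapes as an integer
    return {true, true, true};
  case SketchUserKind::Store:         // conservatively captured in memory
    return {true, false, false};
  case SketchUserKind::ReturnInOwnFn: // only "captured" via the return value
    return {false, false, true};
  case SketchUserKind::CallArg:       // defer to the callee argument's state
    return CalleeArgNoCapture ? SketchCaps{} : SketchCaps{true, true, true};
  case SketchUserKind::Other:
    return {true, true, true};
  }
  return {true, true, true};
}
// End of sketch.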
if (isa(UInst)) { - if (UInst->getFunction() == NoCaptureAA.getAnchorScope()) - return isCapturedIn(/* Memory */ false, /* Integer */ false, + if (UInst->getFunction() == getAnchorScope()) + return isCapturedIn(State, /* Memory */ false, /* Integer */ false, /* Return */ true); - return isCapturedIn(/* Memory */ true, /* Integer */ true, + return isCapturedIn(State, /* Memory */ true, /* Integer */ true, /* Return */ true); } // For now we only use special logic for call sites. However, the tracker // itself knows about a lot of other non-capturing cases already. auto *CB = dyn_cast(UInst); - if (!CB || !CB->isArgOperand(U)) - return isCapturedIn(/* Memory */ true, /* Integer */ true, + if (!CB || !CB->isArgOperand(&U)) + return isCapturedIn(State, /* Memory */ true, /* Integer */ true, /* Return */ true); - unsigned ArgNo = CB->getArgOperandNo(U); + unsigned ArgNo = CB->getArgOperandNo(&U); const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo); // If we have a abstract no-capture attribute for the argument we can use // it to justify a non-capture attribute here. This allows recursion! auto &ArgNoCaptureAA = - A.getAAFor(NoCaptureAA, CSArgPos, DepClassTy::REQUIRED); + A.getAAFor(*this, CSArgPos, DepClassTy::REQUIRED); if (ArgNoCaptureAA.isAssumedNoCapture()) - return isCapturedIn(/* Memory */ false, /* Integer */ false, + return isCapturedIn(State, /* Memory */ false, /* Integer */ false, /* Return */ false); if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { - addPotentialCopy(*CB); - return isCapturedIn(/* Memory */ false, /* Integer */ false, + Follow = true; + return isCapturedIn(State, /* Memory */ false, /* Integer */ false, /* Return */ false); } // Lastly, we could not find a reason no-capture can be assumed so we don't. - return isCapturedIn(/* Memory */ true, /* Integer */ true, + return isCapturedIn(State, /* Memory */ true, /* Integer */ true, /* Return */ true); } - /// Register \p CS as potential copy of the value we are checking. - void addPotentialCopy(CallBase &CB) { PotentialCopies.insert(&CB); } - - /// See CaptureTracker::shouldExplore(...). - bool shouldExplore(const Use *U) override { - // Check liveness and ignore droppable users. - bool UsedAssumedInformation = false; - return !U->getUser()->isDroppable() && - !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA, - UsedAssumedInformation); - } - - /// Update the state according to \p CapturedInMem, \p CapturedInInt, and - /// \p CapturedInRet, then return the appropriate value for use in the - /// CaptureTracker::captured() interface. - bool isCapturedIn(bool CapturedInMem, bool CapturedInInt, - bool CapturedInRet) { + /// Update \p State according to \p CapturedInMem, \p CapturedInInt, and + /// \p CapturedInRet, then return true if we should continue updating the + /// state. + static bool isCapturedIn(AANoCapture::StateType &State, bool CapturedInMem, + bool CapturedInInt, bool CapturedInRet) { LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int " << CapturedInInt << "|Ret " << CapturedInRet << "]\n"); if (CapturedInMem) @@ -5008,27 +5217,8 @@ struct AACaptureUseTracker final : public CaptureTracker { State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT); if (CapturedInRet) State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET); - return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); + return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); } - -private: - /// The attributor providing in-flight abstract attributes. 
- Attributor &A; - - /// The abstract attribute currently updated. - AANoCapture &NoCaptureAA; - - /// The abstract liveness state. - const AAIsDead &IsDeadAA; - - /// The state currently updated. - AANoCapture::StateType &State; - - /// Set of potential copies of the tracked value. - SmallSetVector &PotentialCopies; - - /// Global counter to limit the number of explored uses. - unsigned &RemainingUsesToExplore; }; ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { @@ -5042,7 +5232,6 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { isArgumentPosition() ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); assert(F && "Expected a function!"); const IRPosition &FnPos = IRPosition::function(*F); - const auto &IsDeadAA = A.getAAFor(*this, FnPos, DepClassTy::NONE); AANoCapture::StateType T; @@ -5059,6 +5248,8 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { // AAReturnedValues, e.g., track all values that escape through returns // directly somehow. auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) { + if (!RVAA.getState().isValidState()) + return false; bool SeenConstant = false; for (auto &It : RVAA.returned_values()) { if (isa(It.first)) { @@ -5094,21 +5285,27 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { } } - // Use the CaptureTracker interface and logic with the specialized tracker, - // defined in AACaptureUseTracker, that can look at in-flight abstract - // attributes and directly updates the assumed state. - SmallSetVector PotentialCopies; - unsigned RemainingUsesToExplore = - getDefaultMaxUsesToExploreForCaptureTracking(); - AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies, - RemainingUsesToExplore); + auto IsDereferenceableOrNull = [&](Value *O, const DataLayout &DL) { + const auto &DerefAA = A.getAAFor( + *this, IRPosition::value(*O), DepClassTy::OPTIONAL); + return DerefAA.getAssumedDereferenceableBytes(); + }; + + auto UseCheck = [&](const Use &U, bool &Follow) -> bool { + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + return true; + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, T, U, Follow); + case UseCaptureKind::PASSTHROUGH: + Follow = true; + return true; + } + llvm_unreachable("Unexpected use capture kind!"); + }; - // Check all potential copies of the associated value until we can assume - // none will be captured or we have to assume at least one might be. 
- unsigned Idx = 0; - PotentialCopies.insert(V); - while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size()) - Tracker.valueMayBeCaptured(PotentialCopies[Idx++]); + if (!A.checkForAllUses(UseCheck, *this, *V)) + return indicatePessimisticFixpoint(); AANoCapture::StateType &S = getState(); auto Assumed = S.getAssumed(); @@ -5208,6 +5405,7 @@ struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { STATS_DECLTRACK_CSRET_ATTR(nocapture) } }; +} // namespace /// ------------------ Value Simplify Attribute ---------------------------- @@ -5219,7 +5417,7 @@ bool ValueSimplifyStateType::unionAssumed(Optional Other) { return false; LLVM_DEBUG({ - if (SimplifiedAssociatedValue.hasValue()) + if (SimplifiedAssociatedValue) dbgs() << "[ValueSimplify] is assumed to be " << **SimplifiedAssociatedValue << "\n"; else @@ -5228,6 +5426,7 @@ bool ValueSimplifyStateType::unionAssumed(Optional Other) { return true; } +namespace { struct AAValueSimplifyImpl : AAValueSimplify { AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A) : AAValueSimplify(IRP, A) {} @@ -5243,9 +5442,9 @@ struct AAValueSimplifyImpl : AAValueSimplify { /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { LLVM_DEBUG({ - errs() << "SAV: " << SimplifiedAssociatedValue << " "; + dbgs() << "SAV: " << (bool)SimplifiedAssociatedValue << " "; if (SimplifiedAssociatedValue && *SimplifiedAssociatedValue) - errs() << "SAV: " << **SimplifiedAssociatedValue << " "; + dbgs() << "SAV: " << **SimplifiedAssociatedValue << " "; }); return isValidState() ? (isAtFixpoint() ? "simplified" : "maybe-simple") : "not-simple"; @@ -5259,24 +5458,101 @@ struct AAValueSimplifyImpl : AAValueSimplify { return SimplifiedAssociatedValue; } + /// Ensure the return value is \p V with type \p Ty, if not possible return + /// nullptr. If \p Check is true we will only verify such an operation would + /// suceed and return a non-nullptr value if that is the case. No IR is + /// generated or modified. + static Value *ensureType(Attributor &A, Value &V, Type &Ty, Instruction *CtxI, + bool Check) { + if (auto *TypedV = AA::getWithType(V, Ty)) + return TypedV; + if (CtxI && V.getType()->canLosslesslyBitCastTo(&Ty)) + return Check ? &V + : BitCastInst::CreatePointerBitCastOrAddrSpaceCast(&V, &Ty, + "", CtxI); + return nullptr; + } + + /// Reproduce \p I with type \p Ty or return nullptr if that is not posisble. + /// If \p Check is true we will only verify such an operation would suceed and + /// return a non-nullptr value if that is the case. No IR is generated or + /// modified. + static Value *reproduceInst(Attributor &A, + const AbstractAttribute &QueryingAA, + Instruction &I, Type &Ty, Instruction *CtxI, + bool Check, ValueToValueMapTy &VMap) { + assert(CtxI && "Cannot reproduce an instruction without context!"); + if (Check && (I.mayReadFromMemory() || + !isSafeToSpeculativelyExecute(&I, CtxI, /* DT */ nullptr, + /* TLI */ nullptr))) + return nullptr; + for (Value *Op : I.operands()) { + Value *NewOp = reproduceValue(A, QueryingAA, *Op, Ty, CtxI, Check, VMap); + if (!NewOp) { + assert(Check && "Manifest of new value unexpectedly failed!"); + return nullptr; + } + if (!Check) + VMap[Op] = NewOp; + } + if (Check) + return &I; + + Instruction *CloneI = I.clone(); + // TODO: Try to salvage debug information here. 
+ CloneI->setDebugLoc(DebugLoc()); + VMap[&I] = CloneI; + CloneI->insertBefore(CtxI); + RemapInstruction(CloneI, VMap); + return CloneI; + } + + /// Reproduce \p V with type \p Ty or return nullptr if that is not posisble. + /// If \p Check is true we will only verify such an operation would suceed and + /// return a non-nullptr value if that is the case. No IR is generated or + /// modified. + static Value *reproduceValue(Attributor &A, + const AbstractAttribute &QueryingAA, Value &V, + Type &Ty, Instruction *CtxI, bool Check, + ValueToValueMapTy &VMap) { + if (const auto &NewV = VMap.lookup(&V)) + return NewV; + bool UsedAssumedInformation = false; + Optional SimpleV = + A.getAssumedSimplified(V, QueryingAA, UsedAssumedInformation); + if (!SimpleV) + return PoisonValue::get(&Ty); + Value *EffectiveV = &V; + if (SimpleV.getValue()) + EffectiveV = SimpleV.getValue(); + if (auto *C = dyn_cast(EffectiveV)) + if (!C->canTrap()) + return C; + if (CtxI && AA::isValidAtPosition(AA::ValueAndContext(*EffectiveV, *CtxI), + A.getInfoCache())) + return ensureType(A, *EffectiveV, Ty, CtxI, Check); + if (auto *I = dyn_cast(EffectiveV)) + if (Value *NewV = reproduceInst(A, QueryingAA, *I, Ty, CtxI, Check, VMap)) + return ensureType(A, *NewV, Ty, CtxI, Check); + return nullptr; + } + /// Return a value we can use as replacement for the associated one, or /// nullptr if we don't have one that makes sense. - Value *getReplacementValue(Attributor &A) const { - Value *NewV; - NewV = SimplifiedAssociatedValue.hasValue() - ? SimplifiedAssociatedValue.getValue() - : UndefValue::get(getAssociatedType()); - if (!NewV) - return nullptr; - NewV = AA::getWithType(*NewV, *getAssociatedType()); - if (!NewV || NewV == &getAssociatedValue()) - return nullptr; - const Instruction *CtxI = getCtxI(); - if (CtxI && !AA::isValidAtPosition(*NewV, *CtxI, A.getInfoCache())) - return nullptr; - if (!CtxI && !AA::isValidInScope(*NewV, getAnchorScope())) - return nullptr; - return NewV; + Value *manifestReplacementValue(Attributor &A, Instruction *CtxI) const { + Value *NewV = SimplifiedAssociatedValue + ? SimplifiedAssociatedValue.getValue() + : UndefValue::get(getAssociatedType()); + if (NewV && NewV != &getAssociatedValue()) { + ValueToValueMapTy VMap; + // First verify we can reprduce the value with the required type at the + // context location before we actually start modifying the IR. + if (reproduceValue(A, *this, *NewV, *getAssociatedType(), CtxI, + /* CheckOnly */ true, VMap)) + return reproduceValue(A, *this, *NewV, *getAssociatedType(), CtxI, + /* CheckOnly */ false, VMap); + } + return nullptr; } /// Helper function for querying AAValueSimplify and updating candicate. 
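// Illustrative sketch, not part of the vendored diff: reproduceInst above
// clones an instruction after its (possibly cloned) operands and records the
// clone in a value map so later users pick it up. The shape of that recursion
// over a toy expression node:
#include <map>
#include <memory>
#include <vector>

struct SketchExpr { std::vector<const SketchExpr *> Ops; };

inline const SketchExpr *
reproduceSketch(const SketchExpr *E,
                std::map<const SketchExpr *, const SketchExpr *> &VMap,
                std::vector<std::unique_ptr<SketchExpr>> &Storage) {
  if (auto It = VMap.find(E); It != VMap.end())
    return It->second; // already materialized for an earlier user
  auto Clone = std::make_unique<SketchExpr>();
  for (const SketchExpr *Op : E->Ops) // operands are reproduced first
    Clone->Ops.push_back(reproduceSketch(Op, VMap, Storage));
  const SketchExpr *Res = Clone.get();
  VMap[E] = Res;
  Storage.push_back(std::move(Clone));
  return Res;
}
// Note: the real code runs the same walk twice, first in a check-only mode
// that refuses memory-reading or unsafe-to-speculate instructions, and only
// then materializes IR.
// End of sketch.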
@@ -5300,14 +5576,14 @@ struct AAValueSimplifyImpl : AAValueSimplify { const auto &AA = A.getAAFor(*this, getIRPosition(), DepClassTy::NONE); - Optional COpt = AA.getAssumedConstantInt(A); + Optional COpt = AA.getAssumedConstant(A); - if (!COpt.hasValue()) { + if (!COpt) { SimplifiedAssociatedValue = llvm::None; A.recordDependence(AA, *this, DepClassTy::OPTIONAL); return true; } - if (auto *C = COpt.getValue()) { + if (auto *C = *COpt) { SimplifiedAssociatedValue = C; A.recordDependence(AA, *this, DepClassTy::OPTIONAL); return true; @@ -5318,7 +5594,7 @@ struct AAValueSimplifyImpl : AAValueSimplify { bool askSimplifiedValueForOtherAAs(Attributor &A) { if (askSimplifiedValueFor(A)) return true; - if (askSimplifiedValueFor(A)) + if (askSimplifiedValueFor(A)) return true; return false; } @@ -5326,14 +5602,18 @@ struct AAValueSimplifyImpl : AAValueSimplify { /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - if (getAssociatedValue().user_empty()) - return Changed; - - if (auto *NewV = getReplacementValue(A)) { - LLVM_DEBUG(dbgs() << "[ValueSimplify] " << getAssociatedValue() << " -> " - << *NewV << " :: " << *this << "\n"); - if (A.changeValueAfterManifest(getAssociatedValue(), *NewV)) - Changed = ChangeStatus::CHANGED; + for (auto &U : getAssociatedValue().uses()) { + // Check if we need to adjust the insertion point to make sure the IR is + // valid. + Instruction *IP = dyn_cast(U.getUser()); + if (auto *PHI = dyn_cast_or_null(IP)) + IP = PHI->getIncomingBlock(U)->getTerminator(); + if (auto *NewV = manifestReplacementValue(A, IP)) { + LLVM_DEBUG(dbgs() << "[ValueSimplify] " << getAssociatedValue() + << " -> " << *NewV << " :: " << *this << "\n"); + if (A.changeUseAfterManifest(U, *NewV)) + Changed = ChangeStatus::CHANGED; + } } return Changed | AAValueSimplify::manifest(A); @@ -5344,73 +5624,6 @@ struct AAValueSimplifyImpl : AAValueSimplify { SimplifiedAssociatedValue = &getAssociatedValue(); return AAValueSimplify::indicatePessimisticFixpoint(); } - - static bool handleLoad(Attributor &A, const AbstractAttribute &AA, - LoadInst &L, function_ref Union) { - auto UnionWrapper = [&](Value &V, Value &Obj) { - if (isa(Obj)) - return Union(V); - if (!AA::isDynamicallyUnique(A, AA, V)) - return false; - if (!AA::isValidAtPosition(V, L, A.getInfoCache())) - return false; - return Union(V); - }; - - Value &Ptr = *L.getPointerOperand(); - SmallVector Objects; - if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L)) - return false; - - const auto *TLI = - A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction()); - for (Value *Obj : Objects) { - LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); - if (isa(Obj)) - continue; - if (isa(Obj)) { - // A null pointer access can be undefined but any offset from null may - // be OK. We do not try to optimize the latter. 
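// Illustrative sketch, not part of the vendored diff: the manifest hunk above
// adjusts the insertion point for uses inside PHI nodes, because a PHI's
// incoming value is only evaluated at the end of the matching predecessor
// block. Assuming LLVM's Use/PHINode API, the adjustment in isolation:
#include "llvm/IR/Instructions.h"

static llvm::Instruction *insertionPointForUse(llvm::Use &U) {
  auto *UserI = llvm::cast<llvm::Instruction>(U.getUser());
  if (auto *PHI = llvm::dyn_cast<llvm::PHINode>(UserI))
    return PHI->getIncomingBlock(U)->getTerminator();
  return UserI; // ordinary users: materialize right before the user
}
// End of sketch.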
- bool UsedAssumedInformation = false; - if (!NullPointerIsDefined(L.getFunction(), - Ptr.getType()->getPointerAddressSpace()) && - A.getAssumedSimplified(Ptr, AA, UsedAssumedInformation) == Obj) - continue; - return false; - } - Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI); - if (!InitialVal || !Union(*InitialVal)) - return false; - - LLVM_DEBUG(dbgs() << "Underlying object amenable to load-store " - "propagation, checking accesses next.\n"); - - auto CheckAccess = [&](const AAPointerInfo::Access &Acc, bool IsExact) { - LLVM_DEBUG(dbgs() << " - visit access " << Acc << "\n"); - if (Acc.isWrittenValueYetUndetermined()) - return true; - Value *Content = Acc.getWrittenValue(); - if (!Content) - return false; - Value *CastedContent = - AA::getWithType(*Content, *AA.getAssociatedType()); - if (!CastedContent) - return false; - if (IsExact) - return UnionWrapper(*CastedContent, *Obj); - if (auto *C = dyn_cast(CastedContent)) - if (C->isNullValue() || C->isAllOnesValue() || isa(C)) - return UnionWrapper(*CastedContent, *Obj); - return false; - }; - - auto &PI = A.getAAFor(AA, IRPosition::value(*Obj), - DepClassTy::REQUIRED); - if (!PI.forallInterferingWrites(A, AA, L, CheckAccess)) - return false; - } - return true; - } }; struct AAValueSimplifyArgument final : AAValueSimplifyImpl { @@ -5425,15 +5638,6 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { Attribute::StructRet, Attribute::Nest, Attribute::ByVal}, /* IgnoreSubsumingPositions */ true)) indicatePessimisticFixpoint(); - - // FIXME: This is a hack to prevent us from propagating function poiner in - // the new pass manager CGSCC pass as it creates call edges the - // CallGraphUpdater cannot handle yet. - Value &V = getAssociatedValue(); - if (V.getType()->isPointerTy() && - V.getType()->getPointerElementType()->isFunctionTy() && - !A.isModulePass()) - indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). @@ -5466,7 +5670,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { bool UsedAssumedInformation = false; Optional SimpleArgOp = A.getAssumedConstant(ACSArgPos, *this, UsedAssumedInformation); - if (!SimpleArgOp.hasValue()) + if (!SimpleArgOp) return true; if (!SimpleArgOp.getValue()) return false; @@ -5477,14 +5681,14 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { // Generate a answer specific to a call site context. 
bool Success; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (hasCallBaseContext() && getCallBaseContext()->getCalledFunction() == Arg->getParent()) Success = PredForCallSite( AbstractCallSite(&getCallBaseContext()->getCalledOperandUse())); else Success = A.checkForAllCallSites(PredForCallSite, *this, true, - AllCallSitesKnown); + UsedAssumedInformation); if (!Success) if (!askSimplifiedValueForOtherAAs(A)) @@ -5516,12 +5720,16 @@ struct AAValueSimplifyReturned : AAValueSimplifyImpl { ChangeStatus updateImpl(Attributor &A) override { auto Before = SimplifiedAssociatedValue; - auto PredForReturned = [&](Value &V) { - return checkAndUpdate(A, *this, - IRPosition::value(V, getCallBaseContext())); + auto ReturnInstCB = [&](Instruction &I) { + auto &RI = cast(I); + return checkAndUpdate( + A, *this, + IRPosition::value(*RI.getReturnValue(), getCallBaseContext())); }; - if (!A.checkForAllReturnedValues(PredForReturned, *this)) + bool UsedAssumedInformation = false; + if (!A.checkForAllInstructions(ReturnInstCB, *this, {Instruction::Ret}, + UsedAssumedInformation)) if (!askSimplifiedValueForOtherAAs(A)) return indicatePessimisticFixpoint(); @@ -5531,29 +5739,9 @@ struct AAValueSimplifyReturned : AAValueSimplifyImpl { } ChangeStatus manifest(Attributor &A) override { - ChangeStatus Changed = ChangeStatus::UNCHANGED; - - if (auto *NewV = getReplacementValue(A)) { - auto PredForReturned = - [&](Value &, const SmallSetVector &RetInsts) { - for (ReturnInst *RI : RetInsts) { - Value *ReturnedVal = RI->getReturnValue(); - if (ReturnedVal == NewV || isa(ReturnedVal)) - return true; - assert(RI->getFunction() == getAnchorScope() && - "ReturnInst in wrong function!"); - LLVM_DEBUG(dbgs() - << "[ValueSimplify] " << *ReturnedVal << " -> " - << *NewV << " in " << *RI << " :: " << *this << "\n"); - if (A.changeUseAfterManifest(RI->getOperandUse(0), *NewV)) - Changed = ChangeStatus::CHANGED; - } - return true; - }; - A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this); - } - - return Changed | AAValueSimplify::manifest(A); + // We queried AAValueSimplify for the returned values so they will be + // replaced if a simplified form was found. Nothing to do here. + return ChangeStatus::UNCHANGED; } /// See AbstractAttribute::trackStatistics() @@ -5597,7 +5785,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { const auto &SimplifiedLHS = A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return true; if (!SimplifiedLHS.getValue()) return false; @@ -5606,7 +5794,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return true; if (!SimplifiedRHS.getValue()) return false; @@ -5662,15 +5850,6 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { return true; } - bool updateWithLoad(Attributor &A, LoadInst &L) { - auto Union = [&](Value &V) { - SimplifiedAssociatedValue = AA::combineOptionalValuesInAAValueLatice( - SimplifiedAssociatedValue, &V, L.getType()); - return SimplifiedAssociatedValue != Optional(nullptr); - }; - return handleLoad(A, *this, L, Union); - } - /// Use the generic, non-optimistic InstSimplfy functionality if we managed to /// simplify any operand of the instruction \p I. Return true if successful, /// in that case SimplifiedAssociatedValue will be updated. 
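// Illustrative sketch, not part of the vendored diff: the simplification
// lattice behind combineOptionalValuesInAAValueLatice has three levels,
// "nothing seen yet", a single agreed-on value, and "conflict" (no single
// simplified value exists). A std-only model with ints standing in for
// llvm::Value pointers:
#include <optional>

struct SketchSimplifyState {
  std::optional<int> Candidate; // nullopt: nothing contributed yet
  bool Conflict = false;

  // Returns true while a single simplified value is still possible.
  bool unionAssumed(int V) {
    if (Conflict)
      return false;
    if (!Candidate) {
      Candidate = V;
      return true;
    }
    if (*Candidate == V)
      return true;
    Conflict = true; // two contributions disagree
    return false;
  }
};
// End of sketch.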
@@ -5686,7 +5865,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { *this, UsedAssumedInformation); // If we are not sure about any operand we are not sure about the entire // instruction, we'll wait. - if (!SimplifiedOp.hasValue()) + if (!SimplifiedOp) return true; if (SimplifiedOp.getValue()) @@ -5714,7 +5893,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { const DataLayout &DL = I.getModule()->getDataLayout(); SimplifyQuery Q(DL, TLI, DT, AC, &I); if (Value *SimplifiedI = - SimplifyInstructionWithOperands(&I, NewOps, Q, ORE)) { + simplifyInstructionWithOperands(&I, NewOps, Q, ORE)) { SimplifiedAssociatedValue = AA::combineOptionalValuesInAAValueLatice( SimplifiedAssociatedValue, SimplifiedI, I.getType()); return SimplifiedAssociatedValue != Optional(nullptr); @@ -5726,6 +5905,36 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { ChangeStatus updateImpl(Attributor &A) override { auto Before = SimplifiedAssociatedValue; + // Do not simplify loads that are only used in llvm.assume if we cannot also + // remove all stores that may feed into the load. The reason is that the + // assume is probably worth something as long as the stores are around. + if (auto *LI = dyn_cast(&getAssociatedValue())) { + InformationCache &InfoCache = A.getInfoCache(); + if (InfoCache.isOnlyUsedByAssume(*LI)) { + SmallSetVector PotentialCopies; + SmallSetVector PotentialValueOrigins; + bool UsedAssumedInformation = false; + if (AA::getPotentiallyLoadedValues(A, *LI, PotentialCopies, + PotentialValueOrigins, *this, + UsedAssumedInformation, + /* OnlyExact */ true)) { + if (!llvm::all_of(PotentialValueOrigins, [&](Instruction *I) { + if (!I) + return true; + if (auto *SI = dyn_cast(I)) + return A.isAssumedDead(SI->getOperandUse(0), this, + /* LivenessAA */ nullptr, + UsedAssumedInformation, + /* CheckBBLivenessOnly */ false); + return A.isAssumedDead(*I, this, /* LivenessAA */ nullptr, + UsedAssumedInformation, + /* CheckBBLivenessOnly */ false); + })) + return indicatePessimisticFixpoint(); + } + } + } + auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &, bool Stripped) -> bool { auto &AA = A.getAAFor( @@ -5734,9 +5943,6 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { if (!Stripped && this == &AA) { if (auto *I = dyn_cast(&V)) { - if (auto *LI = dyn_cast(&V)) - if (updateWithLoad(A, *LI)) - return true; if (auto *Cmp = dyn_cast(&V)) if (handleCmp(A, *Cmp)) return true; @@ -5754,8 +5960,10 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { }; bool Dummy = false; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(), + UsedAssumedInformation, /* UseValueSimplify */ false)) if (!askSimplifiedValueForOtherAAs(A)) return indicatePessimisticFixpoint(); @@ -5806,8 +6014,23 @@ struct AAValueSimplifyCallSiteReturned : AAValueSimplifyImpl { void initialize(Attributor &A) override { AAValueSimplifyImpl::initialize(A); - if (!getAssociatedFunction()) + Function *Fn = getAssociatedFunction(); + if (!Fn) { indicatePessimisticFixpoint(); + return; + } + for (Argument &Arg : Fn->args()) { + if (Arg.hasReturnedAttr()) { + auto IRP = IRPosition::callsite_argument(*cast(getCtxI()), + Arg.getArgNo()); + if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE_ARGUMENT && + checkAndUpdate(A, *this, IRP)) + indicateOptimisticFixpoint(); + else + indicatePessimisticFixpoint(); + return; + } + } } /// See AbstractAttribute::updateImpl(...). 
@@ -5845,8 +6068,13 @@ struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; + // TODO: We should avoid simplification duplication to begin with. + auto *FloatAA = A.lookupAAFor( + IRPosition::value(getAssociatedValue()), this, DepClassTy::NONE); + if (FloatAA && FloatAA->getState().isValidState()) + return Changed; - if (auto *NewV = getReplacementValue(A)) { + if (auto *NewV = manifestReplacementValue(A, getCtxI())) { Use &U = cast(&getAnchorValue()) ->getArgOperandUse(getCallSiteArgNo()); if (A.changeUseAfterManifest(U, *NewV)) @@ -5860,8 +6088,10 @@ struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { STATS_DECLTRACK_CSARG_ATTR(value_simplify) } }; +} // namespace /// ----------------------- Heap-To-Stack Conversion --------------------------- +namespace { struct AAHeapToStackFunction final : public AAHeapToStack { struct AllocationInfo { @@ -5883,7 +6113,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { bool HasPotentiallyFreeingUnknownUses = false; /// The set of free calls that use this allocation. - SmallPtrSet PotentialFreeCalls{}; + SmallSetVector PotentialFreeCalls{}; }; struct DeallocationInfo { @@ -5895,7 +6125,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { bool MightFreeUnknownObjects = false; /// The set of allocation calls that are potentially freed. - SmallPtrSet PotentialAllocationCalls{}; + SmallSetVector PotentialAllocationCalls{}; }; AAHeapToStackFunction(const IRPosition &IRP, Attributor &A) @@ -5905,9 +6135,9 @@ struct AAHeapToStackFunction final : public AAHeapToStack { // Ensure we call the destructor so we release any memory allocated in the // sets. 
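// Illustrative sketch, not part of the vendored diff: the hunks above switch
// DenseMap/SmallPtrSet to MapVector/SmallSetVector so iteration follows
// insertion order and the transformation is deterministic across runs. The
// MapVector idea in std-only form:
#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

template <typename K, typename V> struct SketchOrderedMap {
  std::unordered_map<K, std::size_t> Index; // key -> slot in Entries
  std::vector<std::pair<K, V>> Entries;     // iteration = insertion order

  V &operator[](const K &Key) {
    auto It = Index.find(Key);
    if (It != Index.end())
      return Entries[It->second].second;
    Index.emplace(Key, Entries.size());
    Entries.emplace_back(Key, V{});
    return Entries.back().second;
  }
};
// Iterating Entries visits keys in first-insertion order, which keeps the
// emitted IR and remarks stable run to run, unlike a hash map's order.
// End of sketch.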
for (auto &It : AllocationInfos) - It.getSecond()->~AllocationInfo(); + It.second->~AllocationInfo(); for (auto &It : DeallocationInfos) - It.getSecond()->~DeallocationInfo(); + It.second->~DeallocationInfo(); } void initialize(Attributor &A) override { @@ -5932,7 +6162,8 @@ struct AAHeapToStackFunction final : public AAHeapToStack { if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) { AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB}; AllocationInfos[CB] = AI; - TLI->getLibFunc(*CB, AI->LibraryFunctionId); + if (TLI) + TLI->getLibFunc(*CB, AI->LibraryFunctionId); } } return true; @@ -5945,6 +6176,16 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /* CheckPotentiallyDead */ true); (void)Success; assert(Success && "Did not expect the call base visit callback to fail!"); + + Attributor::SimplifictionCallbackTy SCB = + [](const IRPosition &, const AbstractAttribute *, + bool &) -> Optional { return nullptr; }; + for (const auto &It : AllocationInfos) + A.registerSimplificationCallback(IRPosition::callsite_returned(*It.first), + SCB); + for (const auto &It : DeallocationInfos) + A.registerSimplificationCallback(IRPosition::callsite_returned(*It.first), + SCB); } const std::string getAsStr() const override { @@ -5971,7 +6212,8 @@ struct AAHeapToStackFunction final : public AAHeapToStack { bool isAssumedHeapToStack(const CallBase &CB) const override { if (isValidState()) - if (AllocationInfo *AI = AllocationInfos.lookup(&CB)) + if (AllocationInfo *AI = + AllocationInfos.lookup(const_cast(&CB))) return AI->Status != AllocationInfo::INVALID; return false; } @@ -6000,6 +6242,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Function *F = getAnchorScope(); const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + LoopInfo *LI = + A.getInfoCache().getAnalysisResultForFunction(*F); + Optional MayContainIrreducibleControl; + auto IsInLoop = [&](BasicBlock &BB) { + if (!MayContainIrreducibleControl.has_value()) + MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI); + if (MayContainIrreducibleControl.value()) + return true; + return LI->getLoopFor(&BB) != nullptr; + }; + for (auto &It : AllocationInfos) { AllocationInfo &AI = *It.second; if (AI.Status == AllocationInfo::INVALID) @@ -6026,13 +6279,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack { else A.emitRemark(AI.CB, "HeapToStack", Remark); + const DataLayout &DL = A.getInfoCache().getDL(); Value *Size; Optional SizeAPI = getSize(A, *this, AI); - if (SizeAPI.hasValue()) { + if (SizeAPI) { Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI); } else { LLVMContext &Ctx = AI.CB->getContext(); - auto &DL = A.getInfoCache().getDL(); ObjectSizeOpts Opts; ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts); SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB); @@ -6041,32 +6294,36 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Size = SizeOffsetPair.first; } + Instruction *IP = (!SizeAPI.has_value() || IsInLoop(*AI.CB->getParent())) + ? 
AI.CB + : &F->getEntryBlock().front(); + Align Alignment(1); if (MaybeAlign RetAlign = AI.CB->getRetAlign()) - Alignment = max(Alignment, RetAlign); + Alignment = std::max(Alignment, *RetAlign); if (Value *Align = getAllocAlignment(AI.CB, TLI)) { Optional AlignmentAPI = getAPInt(A, *this, *Align); - assert(AlignmentAPI.hasValue() && + assert(AlignmentAPI && AlignmentAPI.getValue().getZExtValue() > 0 && "Expected an alignment during manifest!"); - Alignment = - max(Alignment, MaybeAlign(AlignmentAPI.getValue().getZExtValue())); + Alignment = std::max( + Alignment, assumeAligned(AlignmentAPI.getValue().getZExtValue())); } - unsigned AS = cast(AI.CB->getType())->getAddressSpace(); - Instruction *Alloca = - new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment, - "", AI.CB->getNextNode()); + // TODO: Hoist the alloca towards the function entry. + unsigned AS = DL.getAllocaAddrSpace(); + Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, + Size, Alignment, "", IP); if (Alloca->getType() != AI.CB->getType()) - Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc", - Alloca->getNextNode()); + Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + Alloca, AI.CB->getType(), "malloc_cast", AI.CB); auto *I8Ty = Type::getInt8Ty(F->getContext()); auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty); assert(InitVal && "Must be able to materialize initial memory state of allocation"); - A.changeValueAfterManifest(*AI.CB, *Alloca); + A.changeAfterManifest(IRPosition::inst(*AI.CB), *Alloca); if (auto *II = dyn_cast(AI.CB)) { auto *NBB = II->getNormalDest(); @@ -6095,7 +6352,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { bool UsedAssumedInformation = false; Optional SimpleV = A.getAssumedConstant(V, AA, UsedAssumedInformation); - if (!SimpleV.hasValue()) + if (!SimpleV) return APInt(64, 0); if (auto *CI = dyn_cast_or_null(SimpleV.getValue())) return CI->getValue(); @@ -6120,11 +6377,11 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /// Collection of all malloc-like calls in a function with associated /// information. - DenseMap AllocationInfos; + MapVector AllocationInfos; /// Collection of all free-like calls in a function with associated /// information. - DenseMap DeallocationInfos; + MapVector DeallocationInfos; ChangeStatus updateImpl(Attributor &A) override; }; @@ -6167,7 +6424,8 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { // branches etc. 
SmallVector Objects; if (!AA::getAssumedUnderlyingObjects(A, *DI.CB->getArgOperand(0), Objects, - *this, DI.CB)) { + *this, DI.CB, + UsedAssumedInformation)) { LLVM_DEBUG( dbgs() << "[H2S] Unexpected failure in getAssumedUnderlyingObjects!\n"); @@ -6239,6 +6497,8 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { dbgs() << "[H2S] unique free call might free unknown allocations\n"); return false; } + if (DI->PotentialAllocationCalls.empty()) + return true; if (DI->PotentialAllocationCalls.size() > 1) { LLVM_DEBUG(dbgs() << "[H2S] unique free call might free " << DI->PotentialAllocationCalls.size() @@ -6316,7 +6576,7 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { if (ValidUsesOnly && AI.LibraryFunctionId == LibFunc___kmpc_alloc_shared) - A.emitRemark(AI.CB, "OMP113", Remark); + A.emitRemark(CB, "OMP113", Remark); LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n"); ValidUsesOnly = false; @@ -6348,7 +6608,8 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { continue; if (Value *Align = getAllocAlignment(AI.CB, TLI)) { - if (!getAPInt(A, *this, *Align)) { + Optional APAlign = getAPInt(A, *this, *Align); + if (!APAlign) { // Can't generate an alloca which respects the required alignment // on the allocation. LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB @@ -6356,14 +6617,23 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { AI.Status = AllocationInfo::INVALID; Changed = ChangeStatus::CHANGED; continue; + } else { + if (APAlign->ugt(llvm::Value::MaximumAlignment) || + !APAlign->isPowerOf2()) { + LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign + << "\n"); + AI.Status = AllocationInfo::INVALID; + Changed = ChangeStatus::CHANGED; + continue; + } } } if (MaxHeapToStackSize != -1) { Optional Size = getSize(A, *this, AI); - if (!Size.hasValue() || Size.getValue().ugt(MaxHeapToStackSize)) { + if (!Size || Size.getValue().ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ - if (!Size.hasValue()) + if (!Size) dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n"; else dbgs() << "[H2S] Allocation size too large: " << *AI.CB << " vs. " @@ -6395,8 +6665,10 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { return Changed; } +} // namespace /// ----------------------- Privatizable Pointers ------------------------------ +namespace { struct AAPrivatizablePtrImpl : public AAPrivatizablePtr { AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A) : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {} @@ -6414,9 +6686,9 @@ struct AAPrivatizablePtrImpl : public AAPrivatizablePtr { /// Return a privatizable type that encloses both T0 and T1. /// TODO: This is merely a stub for now as we should manage a mapping as well. Optional combineTypes(Optional T0, Optional T1) { - if (!T0.hasValue()) + if (!T0) return T1; - if (!T1.hasValue()) + if (!T1) return T0; if (T0 == T1) return T0; @@ -6445,11 +6717,13 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Optional identifyPrivatizableType(Attributor &A) override { // If this is a byval argument and we know all the call sites (so we can // rewrite them), there is no need to check them explicitly. 
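// Illustrative sketch, not part of the vendored diff: the new rejection path
// above refuses alignments that are not a power of two or that exceed
// llvm::Value::MaximumAlignment (1 << 32 at the time of this import). The
// same predicate in miniature:
#include <cassert>
#include <cstdint>

inline bool isUsableAlignment(uint64_t Alignment) {
  const uint64_t MaximumAlignment = 1ULL << 32; // mirrors Value::MaximumAlignment
  bool IsPowerOfTwo = Alignment != 0 && (Alignment & (Alignment - 1)) == 0;
  return IsPowerOfTwo && Alignment <= MaximumAlignment;
}

inline void alignmentDemo() {
  assert(isUsableAlignment(16));
  assert(!isUsableAlignment(24));         // not a power of two
  assert(!isUsableAlignment(1ULL << 33)); // above the maximum
}
// End of sketch.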
- bool AllCallSitesKnown; - if (getIRPosition().hasAttr(Attribute::ByVal) && + bool UsedAssumedInformation = false; + SmallVector Attrs; + getAttrs({Attribute::ByVal}, Attrs, /* IgnoreSubsumingPositions */ true); + if (!Attrs.empty() && A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this, - true, AllCallSitesKnown)) - return getAssociatedValue().getType()->getPointerElementType(); + true, UsedAssumedInformation)) + return Attrs[0].getValueAsType(); Optional Ty; unsigned ArgNo = getIRPosition().getCallSiteArgNo(); @@ -6474,9 +6748,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: "; - if (CSTy.hasValue() && CSTy.getValue()) + if (CSTy && CSTy.getValue()) CSTy.getValue()->print(dbgs()); - else if (CSTy.hasValue()) + else if (CSTy) dbgs() << ""; else dbgs() << ""; @@ -6486,19 +6760,20 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << " : New Type: "; - if (Ty.hasValue() && Ty.getValue()) + if (Ty && Ty.getValue()) Ty.getValue()->print(dbgs()); - else if (Ty.hasValue()) + else if (Ty) dbgs() << ""; else dbgs() << ""; dbgs() << "\n"; }); - return !Ty.hasValue() || Ty.getValue(); + return !Ty || Ty.getValue(); }; - if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown)) + if (!A.checkForAllCallSites(CallSiteCheck, *this, true, + UsedAssumedInformation)) return nullptr; return Ty; } @@ -6506,7 +6781,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { PrivatizableType = identifyPrivatizableType(A); - if (!PrivatizableType.hasValue()) + if (!PrivatizableType) return ChangeStatus::UNCHANGED; if (!PrivatizableType.getValue()) return indicatePessimisticFixpoint(); @@ -6518,8 +6793,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Avoid arguments with padding for now. if (!getIRPosition().hasAttr(Attribute::ByVal) && - !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(), - A.getInfoCache().getDL())) { + !isDenselyPacked(*PrivatizableType, A.getInfoCache().getDL())) { LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n"); return indicatePessimisticFixpoint(); } @@ -6527,7 +6801,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Collect the types that will replace the privatizable type in the function // signature. SmallVector ReplacementTypes; - identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); + identifyReplacementTypes(*PrivatizableType, ReplacementTypes); // Verify callee and caller agree on how the promoted argument would be // passed. 
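// Illustrative sketch, not part of the vendored diff: the byval hunk above
// stops reading the pointee type off the pointer (getPointerElementType) and
// takes it from the attribute instead, which is what the opaque-pointer
// transition requires. Assuming LLVM's Argument API, the lookup boils down to:
#include "llvm/IR/Function.h"

static llvm::Type *byValElementType(const llvm::Argument &Arg) {
  // Non-null only for byval arguments; the type is stored on the attribute.
  return Arg.hasByValAttr() ? Arg.getParamByValType() : nullptr;
}
// End of sketch.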
@@ -6545,9 +6819,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return TTI->areTypesABICompatible( CB->getCaller(), CB->getCalledFunction(), ReplacementTypes); }; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; if (!A.checkForAllCallSites(CallSiteCheck, *this, true, - AllCallSitesKnown)) { + UsedAssumedInformation)) { LLVM_DEBUG( dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for " << Fn.getName() << "\n"); @@ -6595,7 +6869,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { *this, IRPosition::argument(CBArg), DepClassTy::REQUIRED); if (CBArgPrivAA.isValidState()) { auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType(); - if (!CBArgPrivTy.hasValue()) + if (!CBArgPrivTy) continue; if (CBArgPrivTy.getValue() == PrivatizableType) continue; @@ -6642,7 +6916,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { DepClassTy::REQUIRED); if (DCArgPrivAA.isValidState()) { auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType(); - if (!DCArgPrivTy.hasValue()) + if (!DCArgPrivTy) return true; if (DCArgPrivTy.getValue() == PrivatizableType) return true; @@ -6674,7 +6948,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { }; if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true, - AllCallSitesKnown)) + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return ChangeStatus::UNCHANGED; @@ -6749,8 +7023,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Type *PrivPtrType = PrivType->getPointerTo(); if (Base->getType() != PrivPtrType) - Base = BitCastInst::CreateBitOrPointerCast(Base, PrivPtrType, "", - ACS.getInstruction()); + Base = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + Base, PrivPtrType, "", ACS.getInstruction()); // Traverse the type, build GEPs and loads. if (auto *PrivStructType = dyn_cast(PrivType)) { @@ -6784,7 +7058,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { /// See AbstractAttribute::manifest(...) ChangeStatus manifest(Attributor &A) override { - if (!PrivatizableType.hasValue()) + if (!PrivatizableType) return ChangeStatus::UNCHANGED; assert(PrivatizableType.getValue() && "Expected privatizable type!"); @@ -6817,14 +7091,16 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Function &ReplacementFn, Function::arg_iterator ArgIt) { BasicBlock &EntryBB = ReplacementFn.getEntryBlock(); Instruction *IP = &*EntryBB.getFirstInsertionPt(); - Instruction *AI = new AllocaInst(PrivatizableType.getValue(), 0, + const DataLayout &DL = IP->getModule()->getDataLayout(); + unsigned AS = DL.getAllocaAddrSpace(); + Instruction *AI = new AllocaInst(PrivatizableType.getValue(), AS, Arg->getName() + ".priv", IP); createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn, ArgIt->getArgNo(), *IP); if (AI->getType() != Arg->getType()) - AI = - BitCastInst::CreateBitOrPointerCast(AI, Arg->getType(), "", IP); + AI = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + AI, Arg->getType(), "", IP); Arg->replaceAllUsesWith(AI); for (CallInst *CI : TailCalls) @@ -6841,8 +7117,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // When no alignment is specified for the load instruction, // natural alignment is assumed. 
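
The manifest changes above make the privatization alloca address-space correct: the slot is now created in the DataLayout's alloca address space, and any pointer-type mismatch is bridged with an addrspacecast-aware cast rather than a plain bitcast. A sketch of that pattern against LLVM 15-era APIs (createStackSlot is a hypothetical helper, not part of the pass):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Create a stack slot of type 'Ty' before 'IP' and return it with pointer
// type 'ExpectedTy'. On targets whose allocas live in a non-zero address
// space (e.g. AMDGPU), a plain bitcast between the two pointer types would
// be invalid; CreatePointerBitCastOrAddrSpaceCast picks the legal cast.
static Value *createStackSlot(Type *Ty, Type *ExpectedTy, Instruction *IP) {
  const DataLayout &DL = IP->getModule()->getDataLayout();
  Instruction *AI = new AllocaInst(Ty, DL.getAllocaAddrSpace(), "priv", IP);
  if (AI->getType() != ExpectedTy)
    return CastInst::CreatePointerBitCastOrAddrSpaceCast(AI, ExpectedTy, "", IP);
  return AI;
}
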
createReplacementValues( - assumeAligned(AlignAA.getAssumedAlign()), - PrivatizableType.getValue(), ACS, + AlignAA.getAssumedAlign(), *PrivatizableType, ACS, ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()), NewArgOperands); }; @@ -6850,7 +7125,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Collect the types that will replace the privatizable type in the function // signature. SmallVector ReplacementTypes; - identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); + identifyReplacementTypes(*PrivatizableType, ReplacementTypes); // Register a rewrite of the argument. if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes, @@ -6897,7 +7172,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl { auto &PrivArgAA = A.getAAFor( *this, IRPosition::argument(*Arg), DepClassTy::REQUIRED); if (PrivArgAA.isAssumedPrivatizablePtr()) - return Obj->getType()->getPointerElementType(); + return PrivArgAA.getPrivatizableType(); } LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid " @@ -6926,7 +7201,7 @@ struct AAPrivatizablePtrCallSiteArgument final /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { PrivatizableType = identifyPrivatizableType(A); - if (!PrivatizableType.hasValue()) + if (!PrivatizableType) return ChangeStatus::UNCHANGED; if (!PrivatizableType.getValue()) return indicatePessimisticFixpoint(); @@ -6992,10 +7267,12 @@ struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating { STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr); } }; +} // namespace /// -------------------- Memory Behavior Attributes ---------------------------- /// Includes read-none, read-only, and write-only. /// ---------------------------------------------------------------------------- +namespace { struct AAMemoryBehaviorImpl : public AAMemoryBehavior { AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A) : AAMemoryBehavior(IRP, A) {} @@ -7495,6 +7772,7 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U, if (UserI->mayWriteToMemory()) removeAssumedBits(NO_WRITES); } +} // namespace /// -------------------- Memory Locations Attributes --------------------------- /// Includes read-none, argmemonly, inaccessiblememonly, @@ -7528,6 +7806,7 @@ std::string AAMemoryLocation::getMemoryLocationsAsStr( return S; } +namespace { struct AAMemoryLocationImpl : public AAMemoryLocation { AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A) @@ -7772,8 +8051,10 @@ void AAMemoryLocationImpl::categorizePtrValue( << getMemoryLocationsAsStr(State.getAssumed()) << "]\n"); SmallVector Objects; + bool UsedAssumedInformation = false; if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, *this, &I, - /* Intraprocedural */ true)) { + UsedAssumedInformation, + AA::Intraprocedural)) { LLVM_DEBUG( dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n"); updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed, @@ -8042,9 +8323,11 @@ struct AAMemoryLocationCallSite final : AAMemoryLocationImpl { STATS_DECLTRACK_CS_ATTR(readnone) } }; +} // namespace /// ------------------ Value Constant Range Attribute ------------------------- +namespace { struct AAValueConstantRangeImpl : AAValueConstantRange { using StateType = IntegerRangeState; AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A) @@ -8379,7 +8662,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedLHS = 
A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return true; if (!SimplifiedLHS.getValue()) return false; @@ -8388,7 +8671,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return true; if (!SimplifiedRHS.getValue()) return false; @@ -8432,7 +8715,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedOpV = A.getAssumedSimplified(IRPosition::value(*OpV, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedOpV.hasValue()) + if (!SimplifiedOpV) return true; if (!SimplifiedOpV.getValue()) return false; @@ -8462,7 +8745,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedLHS = A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return true; if (!SimplifiedLHS.getValue()) return false; @@ -8471,7 +8754,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return true; if (!SimplifiedRHS.getValue()) return false; @@ -8536,7 +8819,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { const auto &SimplifiedOpV = A.getAssumedSimplified(IRPosition::value(V, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedOpV.hasValue()) + if (!SimplifiedOpV) return true; if (!SimplifiedOpV.getValue()) return false; @@ -8588,8 +8871,10 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { IntegerRangeState T(getBitWidth()); + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, T, VisitValueCB, getCtxI(), + UsedAssumedInformation, /* UseValueSimplify */ false)) return indicatePessimisticFixpoint(); @@ -8683,21 +8968,23 @@ struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating { STATS_DECLTRACK_CSARG_ATTR(value_range) } }; +} // namespace /// ------------------ Potential Values Attribute ------------------------- -struct AAPotentialValuesImpl : AAPotentialValues { +namespace { +struct AAPotentialConstantValuesImpl : AAPotentialConstantValues { using StateType = PotentialConstantIntValuesState; - AAPotentialValuesImpl(const IRPosition &IRP, Attributor &A) - : AAPotentialValues(IRP, A) {} + AAPotentialConstantValuesImpl(const IRPosition &IRP, Attributor &A) + : AAPotentialConstantValues(IRP, A) {} /// See AbstractAttribute::initialize(..). void initialize(Attributor &A) override { if (A.hasSimplificationCallback(getIRPosition())) indicatePessimisticFixpoint(); else - AAPotentialValues::initialize(A); + AAPotentialConstantValues::initialize(A); } /// See AbstractAttribute::getAsStr(). 
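
Most of the churn in these hunks is mechanical: llvm::Optional gained a std::optional-compatible surface in this release cycle, so O.hasValue() becomes O.has_value() or a plain boolean test, and O.getValueOr(X) becomes O.value_or(X). The tri-state convention the Attributor layers on top is worth spelling out; a standard-C++ model:

#include <optional>

// Attributor simplification convention: nullopt = "nothing known yet, keep
// iterating"; an engaged value holding nullptr = "cannot simplify, give
// up"; a non-null pointer = the simplified value.
enum class Action { Wait, GiveUp, Use };

static Action classify(std::optional<int *> Simplified) {
  if (!Simplified) // identical to !Simplified.has_value()
    return Action::Wait;
  if (!*Simplified)
    return Action::GiveUp;
  return Action::Use;
}
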
@@ -8714,13 +9001,14 @@ struct AAPotentialValuesImpl : AAPotentialValues { } }; -struct AAPotentialValuesArgument final - : AAArgumentFromCallSiteArguments { - using Base = - AAArgumentFromCallSiteArguments; - AAPotentialValuesArgument(const IRPosition &IRP, Attributor &A) + using Base = AAArgumentFromCallSiteArguments; + AAPotentialConstantValuesArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} /// See AbstractAttribute::initialize(..). @@ -8738,11 +9026,12 @@ struct AAPotentialValuesArgument final } }; -struct AAPotentialValuesReturned - : AAReturnedFromReturnedValues { - using Base = - AAReturnedFromReturnedValues; - AAPotentialValuesReturned(const IRPosition &IRP, Attributor &A) +struct AAPotentialConstantValuesReturned + : AAReturnedFromReturnedValues { + using Base = AAReturnedFromReturnedValues; + AAPotentialConstantValuesReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} /// See AbstractAttribute::trackStatistics() @@ -8751,13 +9040,13 @@ struct AAPotentialValuesReturned } }; -struct AAPotentialValuesFloating : AAPotentialValuesImpl { - AAPotentialValuesFloating(const IRPosition &IRP, Attributor &A) - : AAPotentialValuesImpl(IRP, A) {} +struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { + AAPotentialConstantValuesFloating(const IRPosition &IRP, Attributor &A) + : AAPotentialConstantValuesImpl(IRP, A) {} /// See AbstractAttribute::initialize(..). void initialize(Attributor &A) override { - AAPotentialValuesImpl::initialize(A); + AAPotentialConstantValuesImpl::initialize(A); if (isAtFixpoint()) return; @@ -8783,7 +9072,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { indicatePessimisticFixpoint(); - LLVM_DEBUG(dbgs() << "[AAPotentialValues] We give up: " + LLVM_DEBUG(dbgs() << "[AAPotentialConstantValues] We give up: " << getAssociatedValue() << "\n"); } @@ -8891,7 +9180,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedLHS = A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; if (!SimplifiedLHS.getValue()) return indicatePessimisticFixpoint(); @@ -8900,7 +9189,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; if (!SimplifiedRHS.getValue()) return indicatePessimisticFixpoint(); @@ -8909,18 +9198,18 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy()) return indicatePessimisticFixpoint(); - auto &LHSAA = A.getAAFor(*this, IRPosition::value(*LHS), - DepClassTy::REQUIRED); + auto &LHSAA = A.getAAFor( + *this, IRPosition::value(*LHS), DepClassTy::REQUIRED); if (!LHSAA.isValidState()) return indicatePessimisticFixpoint(); - auto &RHSAA = A.getAAFor(*this, IRPosition::value(*RHS), - DepClassTy::REQUIRED); + auto &RHSAA = A.getAAFor( + *this, IRPosition::value(*RHS), DepClassTy::REQUIRED); if (!RHSAA.isValidState()) return indicatePessimisticFixpoint(); - const DenseSet &LHSAAPVS = LHSAA.getAssumedSet(); - const DenseSet &RHSAAPVS = RHSAA.getAssumedSet(); + const SetTy &LHSAAPVS = LHSAA.getAssumedSet(); + const SetTy &RHSAAPVS = RHSAA.getAssumedSet(); // TODO: make use of undef flag to limit potential values aggressively. 
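
Renames aside, the binary-operator handling that follows works on bounded sets of candidate constants: every pairing of a potential LHS and RHS value is folded and unioned into the result, and the state degrades to overdefined once the set outgrows its cap. A standalone model with int64_t in place of APInt (the cap of 7 mirrors the pass's default limit but is illustrative here):

#include <cstddef>
#include <cstdint>
#include <optional>
#include <set>

// Folds '+' over every pairing of potential operand values; nullopt means
// the result set exceeded the cap and the state is overdefined.
static std::optional<std::set<int64_t>>
addPotentialValues(const std::set<int64_t> &L, const std::set<int64_t> &R,
                   std::size_t Cap = 7) {
  std::set<int64_t> Out;
  for (int64_t A : L)
    for (int64_t B : R) {
      Out.insert(A + B);
      if (Out.size() > Cap)
        return std::nullopt;
    }
  return Out;
}
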
bool MaybeTrue = false, MaybeFalse = false; @@ -8974,7 +9263,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedLHS = A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; if (!SimplifiedLHS.getValue()) return indicatePessimisticFixpoint(); @@ -8983,7 +9272,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; if (!SimplifiedRHS.getValue()) return indicatePessimisticFixpoint(); @@ -8997,21 +9286,21 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { // Check if we only need one operand. bool OnlyLeft = false, OnlyRight = false; - if (C.hasValue() && *C && (*C)->isOneValue()) + if (C && *C && (*C)->isOneValue()) OnlyLeft = true; - else if (C.hasValue() && *C && (*C)->isZeroValue()) + else if (C && *C && (*C)->isZeroValue()) OnlyRight = true; - const AAPotentialValues *LHSAA = nullptr, *RHSAA = nullptr; + const AAPotentialConstantValues *LHSAA = nullptr, *RHSAA = nullptr; if (!OnlyRight) { - LHSAA = &A.getAAFor(*this, IRPosition::value(*LHS), - DepClassTy::REQUIRED); + LHSAA = &A.getAAFor( + *this, IRPosition::value(*LHS), DepClassTy::REQUIRED); if (!LHSAA->isValidState()) return indicatePessimisticFixpoint(); } if (!OnlyLeft) { - RHSAA = &A.getAAFor(*this, IRPosition::value(*RHS), - DepClassTy::REQUIRED); + RHSAA = &A.getAAFor( + *this, IRPosition::value(*RHS), DepClassTy::REQUIRED); if (!RHSAA->isValidState()) return indicatePessimisticFixpoint(); } @@ -9049,17 +9338,17 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedSrc = A.getAssumedSimplified(IRPosition::value(*Src, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedSrc.hasValue()) + if (!SimplifiedSrc) return ChangeStatus::UNCHANGED; if (!SimplifiedSrc.getValue()) return indicatePessimisticFixpoint(); Src = *SimplifiedSrc; - auto &SrcAA = A.getAAFor(*this, IRPosition::value(*Src), - DepClassTy::REQUIRED); + auto &SrcAA = A.getAAFor( + *this, IRPosition::value(*Src), DepClassTy::REQUIRED); if (!SrcAA.isValidState()) return indicatePessimisticFixpoint(); - const DenseSet &SrcAAPVS = SrcAA.getAssumedSet(); + const SetTy &SrcAAPVS = SrcAA.getAssumedSet(); if (SrcAA.undefIsContained()) unionAssumedWithUndef(); else { @@ -9082,7 +9371,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedLHS = A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedLHS.hasValue()) + if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; if (!SimplifiedLHS.getValue()) return indicatePessimisticFixpoint(); @@ -9091,7 +9380,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedRHS = A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedRHS.hasValue()) + if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; if (!SimplifiedRHS.getValue()) return indicatePessimisticFixpoint(); @@ -9100,18 +9389,18 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy()) return indicatePessimisticFixpoint(); - auto &LHSAA = A.getAAFor(*this, IRPosition::value(*LHS), - 
DepClassTy::REQUIRED); + auto &LHSAA = A.getAAFor( + *this, IRPosition::value(*LHS), DepClassTy::REQUIRED); if (!LHSAA.isValidState()) return indicatePessimisticFixpoint(); - auto &RHSAA = A.getAAFor(*this, IRPosition::value(*RHS), - DepClassTy::REQUIRED); + auto &RHSAA = A.getAAFor( + *this, IRPosition::value(*RHS), DepClassTy::REQUIRED); if (!RHSAA.isValidState()) return indicatePessimisticFixpoint(); - const DenseSet &LHSAAPVS = LHSAA.getAssumedSet(); - const DenseSet &RHSAAPVS = RHSAA.getAssumedSet(); + const SetTy &LHSAAPVS = LHSAA.getAssumedSet(); + const SetTy &RHSAAPVS = RHSAA.getAssumedSet(); const APInt Zero = APInt(LHS->getType()->getIntegerBitWidth(), 0); // TODO: make use of undef flag to limit potential values aggressively. @@ -9150,13 +9439,13 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { const auto &SimplifiedIncomingValue = A.getAssumedSimplified( IRPosition::value(*IncomingValue, getCallBaseContext()), *this, UsedAssumedInformation); - if (!SimplifiedIncomingValue.hasValue()) + if (!SimplifiedIncomingValue) continue; if (!SimplifiedIncomingValue.getValue()) return indicatePessimisticFixpoint(); IncomingValue = *SimplifiedIncomingValue; - auto &PotentialValuesAA = A.getAAFor( + auto &PotentialValuesAA = A.getAAFor( *this, IRPosition::value(*IncomingValue), DepClassTy::REQUIRED); if (!PotentialValuesAA.isValidState()) return indicatePessimisticFixpoint(); @@ -9169,30 +9458,6 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { : ChangeStatus::CHANGED; } - ChangeStatus updateWithLoad(Attributor &A, LoadInst &L) { - if (!L.getType()->isIntegerTy()) - return indicatePessimisticFixpoint(); - - auto Union = [&](Value &V) { - if (isa(V)) { - unionAssumedWithUndef(); - return true; - } - if (ConstantInt *CI = dyn_cast(&V)) { - unionAssumed(CI->getValue()); - return true; - } - return false; - }; - auto AssumedBefore = getAssumed(); - - if (!AAValueSimplifyImpl::handleLoad(A, *this, L, Union)) - return indicatePessimisticFixpoint(); - - return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; - } - /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { Value &V = getAssociatedValue(); @@ -9213,9 +9478,6 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { if (auto *PHI = dyn_cast(I)) return updateWithPHINode(A, PHI); - if (auto *L = dyn_cast(I)) - return updateWithLoad(A, *L); - return indicatePessimisticFixpoint(); } @@ -9225,14 +9487,15 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl { } }; -struct AAPotentialValuesFunction : AAPotentialValuesImpl { - AAPotentialValuesFunction(const IRPosition &IRP, Attributor &A) - : AAPotentialValuesImpl(IRP, A) {} +struct AAPotentialConstantValuesFunction : AAPotentialConstantValuesImpl { + AAPotentialConstantValuesFunction(const IRPosition &IRP, Attributor &A) + : AAPotentialConstantValuesImpl(IRP, A) {} /// See AbstractAttribute::initialize(...). 
ChangeStatus updateImpl(Attributor &A) override { - llvm_unreachable("AAPotentialValues(Function|CallSite)::updateImpl will " - "not be called"); + llvm_unreachable( + "AAPotentialConstantValues(Function|CallSite)::updateImpl will " + "not be called"); } /// See AbstractAttribute::trackStatistics() @@ -9241,9 +9504,9 @@ struct AAPotentialValuesFunction : AAPotentialValuesImpl { } }; -struct AAPotentialValuesCallSite : AAPotentialValuesFunction { - AAPotentialValuesCallSite(const IRPosition &IRP, Attributor &A) - : AAPotentialValuesFunction(IRP, A) {} +struct AAPotentialConstantValuesCallSite : AAPotentialConstantValuesFunction { + AAPotentialConstantValuesCallSite(const IRPosition &IRP, Attributor &A) + : AAPotentialConstantValuesFunction(IRP, A) {} /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { @@ -9251,11 +9514,13 @@ struct AAPotentialValuesCallSite : AAPotentialValuesFunction { } }; -struct AAPotentialValuesCallSiteReturned - : AACallSiteReturnedFromReturned { - AAPotentialValuesCallSiteReturned(const IRPosition &IRP, Attributor &A) - : AACallSiteReturnedFromReturned(IRP, A) {} +struct AAPotentialConstantValuesCallSiteReturned + : AACallSiteReturnedFromReturned { + AAPotentialConstantValuesCallSiteReturned(const IRPosition &IRP, + Attributor &A) + : AACallSiteReturnedFromReturned(IRP, A) {} /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { @@ -9263,13 +9528,15 @@ struct AAPotentialValuesCallSiteReturned } }; -struct AAPotentialValuesCallSiteArgument : AAPotentialValuesFloating { - AAPotentialValuesCallSiteArgument(const IRPosition &IRP, Attributor &A) - : AAPotentialValuesFloating(IRP, A) {} +struct AAPotentialConstantValuesCallSiteArgument + : AAPotentialConstantValuesFloating { + AAPotentialConstantValuesCallSiteArgument(const IRPosition &IRP, + Attributor &A) + : AAPotentialConstantValuesFloating(IRP, A) {} /// See AbstractAttribute::initialize(..). void initialize(Attributor &A) override { - AAPotentialValuesImpl::initialize(A); + AAPotentialConstantValuesImpl::initialize(A); if (isAtFixpoint()) return; @@ -9292,8 +9559,8 @@ struct AAPotentialValuesCallSiteArgument : AAPotentialValuesFloating { ChangeStatus updateImpl(Attributor &A) override { Value &V = getAssociatedValue(); auto AssumedBefore = getAssumed(); - auto &AA = A.getAAFor(*this, IRPosition::value(V), - DepClassTy::REQUIRED); + auto &AA = A.getAAFor( + *this, IRPosition::value(V), DepClassTy::REQUIRED); const auto &S = AA.getAssumed(); unionAssumed(S); return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED @@ -9365,7 +9632,7 @@ struct AANoUndefImpl : AANoUndef { // considered to be dead. We don't manifest noundef in such positions for // the same reason above. if (!A.getAssumedSimplified(getIRPosition(), *this, UsedAssumedInformation) - .hasValue()) + .has_value()) return ChangeStatus::UNCHANGED; return AANoUndef::manifest(A); } @@ -9400,8 +9667,10 @@ struct AANoUndefFloating : public AANoUndefImpl { }; StateType T; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, getIRPosition(), *this, T, - VisitValueCB, getCtxI())) + VisitValueCB, getCtxI(), + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return clampStateAndIndicateChange(getState(), T); @@ -9518,9 +9787,10 @@ struct AACallEdgesCallSite : public AACallEdgesImpl { // Process any value that we might call. 
auto ProcessCalledOperand = [&](Value *V) { bool DummyValue = false; + bool UsedAssumedInformation = false; if (!genericValueTraversal(A, IRPosition::value(*V), *this, DummyValue, VisitValue, nullptr, - false)) { + UsedAssumedInformation, false)) { // If we haven't gone through all values, assume that there are unknown // callees. setHasUnknownCallee(true, Change); @@ -9530,7 +9800,9 @@ struct AACallEdgesCallSite : public AACallEdgesImpl { CallBase *CB = cast(getCtxI()); if (CB->isInlineAsm()) { - setHasUnknownCallee(false, Change); + if (!hasAssumption(*CB->getCaller(), "ompx_no_call_asm") && + !hasAssumption(*CB, "ompx_no_call_asm")) + setHasUnknownCallee(false, Change); return Change; } @@ -9584,7 +9856,8 @@ struct AACallEdgesFunction : public AACallEdgesImpl { // Visit all callable instructions. bool UsedAssumedInformation = false; if (!A.checkForAllCallLikeInstructions(ProcessCallInst, *this, - UsedAssumedInformation)) { + UsedAssumedInformation, + /* CheckBBLivenessOnly */ true)) { // If we haven't looked at all call like instructions, assume that there // are unknown callees. setHasUnknownCallee(true, Change); @@ -9656,7 +9929,7 @@ private: ArrayRef AAEdgesList, const Function &Fn) { Optional Cached = isCachedReachable(Fn); - if (Cached.hasValue()) + if (Cached) return Cached.getValue(); // The query was not cached, thus it is new. We need to request an update @@ -9691,6 +9964,10 @@ private: const SetVector &Edges = AAEdges->getOptimisticEdges(); for (Function *Edge : Edges) { + // Functions that do not call back into the module can be ignored. + if (Edge->hasFnAttribute(Attribute::NoCallback)) + continue; + // We don't need a dependency if the result is reachable. const AAFunctionReachability &EdgeReachability = A.getAAFor( @@ -9820,22 +10097,21 @@ public: } // Update the Instruction queries. - const AAReachability *Reachability; if (!InstQueries.empty()) { - Reachability = &A.getAAFor( + const AAReachability *Reachability = &A.getAAFor( *this, IRPosition::function(*getAssociatedFunction()), DepClassTy::REQUIRED); - } - // Check for local callbases first. - for (auto &InstPair : InstQueries) { - SmallVector CallEdges; - bool AllKnown = - getReachableCallEdges(A, *Reachability, *InstPair.first, CallEdges); - // Update will return change if we this effects any queries. - if (!AllKnown) - InstPair.second.CanReachUnknownCallee = true; - Change |= InstPair.second.update(A, *this, CallEdges); + // Check for local callbases first. + for (auto &InstPair : InstQueries) { + SmallVector CallEdges; + bool AllKnown = + getReachableCallEdges(A, *Reachability, *InstPair.first, CallEdges); + // Update will return change if we this effects any queries. + if (!AllKnown) + InstPair.second.CanReachUnknownCallee = true; + Change |= InstPair.second.update(A, *this, CallEdges); + } } return Change; @@ -9862,13 +10138,15 @@ private: /// Used to answer if a call base inside this function can reach a specific /// function. - DenseMap CBQueries; + MapVector CBQueries; /// This is for instruction queries than scan "forward". 
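
The inline-asm hunk above changes when a call edge to an unknown callee is recorded: inline assembly is now trusted not to call anything if either the caller or the call site carries the "ompx_no_call_asm" assumption (queried through llvm::hasAssumption in the real code). A plain-C++ model of the decision:

#include <algorithm>
#include <string>
#include <vector>

// True if the inline asm must be assumed to reach an unknown callee.
static bool asmMayCall(const std::vector<std::string> &CallerAssumptions,
                       const std::vector<std::string> &SiteAssumptions) {
  auto Has = [](const std::vector<std::string> &A) {
    return std::find(A.begin(), A.end(), "ompx_no_call_asm") != A.end();
  };
  return !Has(CallerAssumptions) && !Has(SiteAssumptions);
}
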
- DenseMap InstQueries; + MapVector InstQueries; }; +} // namespace /// ---------------------- Assumption Propagation ------------------------------ +namespace { struct AAAssumptionInfoImpl : public AAAssumptionInfo { AAAssumptionInfoImpl(const IRPosition &IRP, Attributor &A, const DenseSet &Known) @@ -9938,12 +10216,13 @@ struct AAAssumptionInfoFunction final : AAAssumptionInfoImpl { return !getAssumed().empty() || !getKnown().empty(); }; - bool AllCallSitesKnown; + bool UsedAssumedInformation = false; // Get the intersection of all assumptions held by this node's predecessors. // If we don't know all the call sites then this is either an entry into the // call graph or an empty node. This node is known to only contain its own // assumptions and can be propagated to its successors. - if (!A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) + if (!A.checkForAllCallSites(CallSitePred, *this, true, + UsedAssumedInformation)) return indicatePessimisticFixpoint(); return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; @@ -10001,6 +10280,7 @@ private: return Assumptions; } }; +} // namespace AACallGraphNode *AACallEdgeIterator::operator*() const { return static_cast(const_cast( @@ -10023,6 +10303,7 @@ const char AANoReturn::ID = 0; const char AAIsDead::ID = 0; const char AADereferenceable::ID = 0; const char AAAlign::ID = 0; +const char AAInstanceInfo::ID = 0; const char AANoCapture::ID = 0; const char AAValueSimplify::ID = 0; const char AAHeapToStack::ID = 0; @@ -10030,7 +10311,7 @@ const char AAPrivatizablePtr::ID = 0; const char AAMemoryBehavior::ID = 0; const char AAMemoryLocation::ID = 0; const char AAValueConstantRange::ID = 0; -const char AAPotentialValues::ID = 0; +const char AAPotentialConstantValues::ID = 0; const char AANoUndef::ID = 0; const char AACallEdges::ID = 0; const char AAFunctionReachability::ID = 0; @@ -10145,9 +10426,10 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInstanceInfo) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange) -CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialConstantValues) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo) diff --git a/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/llvm/lib/Transforms/IPO/BlockExtractor.cpp index 7c178f9a9834..9e27ae49a901 100644 --- a/llvm/lib/Transforms/IPO/BlockExtractor.cpp +++ b/llvm/lib/Transforms/IPO/BlockExtractor.cpp @@ -135,7 +135,8 @@ void BlockExtractor::loadFile() { if (LineSplit.empty()) continue; if (LineSplit.size()!=2) - report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'"); + report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'", + /*GenCrashDiag=*/false); SmallVector BBNames; LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1, /*KeepEmpty=*/false); @@ -194,13 +195,15 @@ bool BlockExtractor::runOnModule(Module &M) { for (const auto &BInfo : BlocksByName) { Function *F = M.getFunction(BInfo.first); if (!F) - report_fatal_error("Invalid function name specified in the input file"); + report_fatal_error("Invalid function name specified in the input file", + 
                         /*GenCrashDiag=*/false);
     for (const auto &BBInfo : BInfo.second) {
       auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
         return BB.getName().equals(BBInfo);
       });
       if (Res == F->end())
-        report_fatal_error("Invalid block name specified in the input file");
+        report_fatal_error("Invalid block name specified in the input file",
+                           /*GenCrashDiag=*/false);
       GroupsOfBlocks[NextGroupIdx].push_back(&*Res);
     }
     ++NextGroupIdx;
@@ -212,7 +215,7 @@ bool BlockExtractor::runOnModule(Module &M) {
     for (BasicBlock *BB : BBs) {
       // Check if the module contains BB.
       if (BB->getParent()->getParent() != &M)
-        report_fatal_error("Invalid basic block");
+        report_fatal_error("Invalid basic block", /*GenCrashDiag=*/false);
       LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
                         << BB->getParent()->getName() << ":" << BB->getName()
                         << "\n");
diff --git a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
index 927dceec8865..64bfcb2a9a9f 100644
--- a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -19,11 +19,13 @@
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
 #include "llvm/Analysis/SparsePropagation.h"
 #include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "called-value-propagation"
@@ -68,7 +70,7 @@ public:
     }
   };
 
-  CVPLatticeVal() : LatticeState(Undefined) {}
+  CVPLatticeVal() = default;
   CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
   CVPLatticeVal(std::vector &&Functions)
       : LatticeState(FunctionSet), Functions(std::move(Functions)) {
@@ -94,7 +96,7 @@ public:
 
 private:
   /// Holds the state this lattice value is in.
-  CVPLatticeStateTy LatticeState;
+  CVPLatticeStateTy LatticeState = Undefined;
 
   /// Holds functions indicating the possible targets of call sites.
This set /// is empty for lattice values in the undefined, overdefined, and untracked diff --git a/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 178d3f41963e..73af30ece47c 100644 --- a/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -85,7 +85,7 @@ static void copyDebugLocMetadata(const GlobalVariable *From, } static Align getAlign(GlobalVariable *GV) { - return GV->getAlign().getValueOr( + return GV->getAlign().value_or( GV->getParent()->getDataLayout().getPreferredAlign(GV)); } diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 2fe9a59ad210..dfe33ac9da0d 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -15,21 +15,16 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalObject.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" using namespace llvm; diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 2a6e38b0437f..99fa4baf355d 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -16,18 +16,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO/DeadArgumentElimination.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" @@ -44,9 +43,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/DeadArgumentElimination.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include -#include #include #include @@ -55,36 +54,36 @@ using namespace llvm; #define DEBUG_TYPE "deadargelim" STATISTIC(NumArgumentsEliminated, "Number of unread args removed"); -STATISTIC(NumRetValsEliminated , "Number of unused return values removed"); -STATISTIC(NumArgumentsReplacedWithUndef, - "Number of unread args replaced with undef"); +STATISTIC(NumRetValsEliminated, "Number of unused return values removed"); +STATISTIC(NumArgumentsReplacedWithPoison, + "Number of unread args replaced with poison"); namespace { - /// DAE - The dead argument elimination pass. - class DAE : public ModulePass { - protected: - // DAH uses this to specify a different ID. - explicit DAE(char &ID) : ModulePass(ID) {} +/// The dead argument elimination pass. +class DAE : public ModulePass { +protected: + // DAH uses this to specify a different ID. 
+ explicit DAE(char &ID) : ModulePass(ID) {} - public: - static char ID; // Pass identification, replacement for typeid +public: + static char ID; // Pass identification, replacement for typeid - DAE() : ModulePass(ID) { - initializeDAEPass(*PassRegistry::getPassRegistry()); - } + DAE() : ModulePass(ID) { + initializeDAEPass(*PassRegistry::getPassRegistry()); + } - bool runOnModule(Module &M) override { - if (skipModule(M)) - return false; - DeadArgumentEliminationPass DAEP(ShouldHackArguments()); - ModuleAnalysisManager DummyMAM; - PreservedAnalyses PA = DAEP.run(M, DummyMAM); - return !PA.areAllPreserved(); - } + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + DeadArgumentEliminationPass DAEP(shouldHackArguments()); + ModuleAnalysisManager DummyMAM; + PreservedAnalyses PA = DAEP.run(M, DummyMAM); + return !PA.areAllPreserved(); + } - virtual bool ShouldHackArguments() const { return false; } - }; + virtual bool shouldHackArguments() const { return false; } +}; } // end anonymous namespace @@ -94,51 +93,51 @@ INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false) namespace { - /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but - /// deletes arguments to functions which are external. This is only for use - /// by bugpoint. - struct DAH : public DAE { - static char ID; +/// The DeadArgumentHacking pass, same as dead argument elimination, but deletes +/// arguments to functions which are external. This is only for use by bugpoint. +struct DAH : public DAE { + static char ID; - DAH() : DAE(ID) {} + DAH() : DAE(ID) {} - bool ShouldHackArguments() const override { return true; } - }; + bool shouldHackArguments() const override { return true; } +}; } // end anonymous namespace char DAH::ID = 0; INITIALIZE_PASS(DAH, "deadarghaX0r", - "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)", - false, false) + "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)", false, + false) -/// createDeadArgEliminationPass - This pass removes arguments from functions -/// which are not used by the body of the function. +/// This pass removes arguments from functions which are not used by the body of +/// the function. ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); } ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); } -/// DeleteDeadVarargs - If this is an function that takes a ... list, and if -/// llvm.vastart is never called, the varargs list is dead for the function. -bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { - assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!"); - if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false; +/// If this is an function that takes a ... list, and if llvm.vastart is never +/// called, the varargs list is dead for the function. +bool DeadArgumentEliminationPass::deleteDeadVarargs(Function &F) { + assert(F.getFunctionType()->isVarArg() && "Function isn't varargs!"); + if (F.isDeclaration() || !F.hasLocalLinkage()) + return false; // Ensure that the function is only directly called. - if (Fn.hasAddressTaken()) + if (F.hasAddressTaken()) return false; // Don't touch naked functions. The assembly might be using an argument, or // otherwise rely on the frame layout in a way that this analysis will not // see. - if (Fn.hasFnAttribute(Attribute::Naked)) { + if (F.hasFnAttribute(Attribute::Naked)) { return false; } // Okay, we know we can transform this function if safe. 
Scan its body // looking for calls marked musttail or calls to llvm.vastart. - for (BasicBlock &BB : Fn) { + for (BasicBlock &BB : F) { for (Instruction &I : BB) { CallInst *CI = dyn_cast(&I); if (!CI) @@ -157,25 +156,24 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // Start by computing a new prototype for the function, which is the same as // the old function, but doesn't have isVarArg set. - FunctionType *FTy = Fn.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); std::vector Params(FTy->param_begin(), FTy->param_end()); - FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), - Params, false); + FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), Params, false); unsigned NumArgs = Params.size(); // Create the new function body and insert it into the module... - Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace()); - NF->copyAttributesFrom(&Fn); - NF->setComdat(Fn.getComdat()); - Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF); - NF->takeName(&Fn); + Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace()); + NF->copyAttributesFrom(&F); + NF->setComdat(F.getComdat()); + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + NF->takeName(&F); - // Loop over all of the callers of the function, transforming the call sites + // Loop over all the callers of the function, transforming the call sites // to pass in a smaller number of arguments into the new function. // std::vector Args; - for (User *U : llvm::make_early_inc_range(Fn.users())) { + for (User *U : llvm::make_early_inc_range(F.users())) { CallBase *CB = dyn_cast(U); if (!CB) continue; @@ -189,7 +187,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { SmallVector ArgAttrs; for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo) ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); - PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttrs(), + PAL = AttributeList::get(F.getContext(), PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs); } @@ -224,64 +222,67 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the // function empty. - NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList()); + NF->getBasicBlockList().splice(NF->begin(), F.getBasicBlockList()); // Loop over the argument list, transferring uses of the old arguments over to - // the new arguments, also transferring over the names as well. While we're at - // it, remove the dead arguments from the DeadArguments list. - for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), - I2 = NF->arg_begin(); I != E; ++I, ++I2) { + // the new arguments, also transferring over the names as well. While we're + // at it, remove the dead arguments from the DeadArguments list. + for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(), + I2 = NF->arg_begin(); + I != E; ++I, ++I2) { // Move the name and users over to the new version. I->replaceAllUsesWith(&*I2); I2->takeName(&*I); } - // Clone metadatas from the old function, including debug info descriptor. + // Clone metadata from the old function, including debug info descriptor. SmallVector, 1> MDs; - Fn.getAllMetadata(MDs); + F.getAllMetadata(MDs); for (auto MD : MDs) NF->addMetadata(MD.first, *MD.second); // Fix up any BlockAddresses that refer to the function. 
- Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType())); + F.replaceAllUsesWith(ConstantExpr::getBitCast(NF, F.getType())); // Delete the bitcast that we just created, so that NF does not // appear to be address-taken. NF->removeDeadConstantUsers(); // Finally, nuke the old function. - Fn.eraseFromParent(); + F.eraseFromParent(); return true; } -/// RemoveDeadArgumentsFromCallers - Checks if the given function has any -/// arguments that are unused, and changes the caller parameters to be undefined -/// instead. -bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) { +/// Checks if the given function has any arguments that are unused, and changes +/// the caller parameters to be poison instead. +bool DeadArgumentEliminationPass::removeDeadArgumentsFromCallers(Function &F) { // We cannot change the arguments if this TU does not define the function or // if the linker may choose a function body from another TU, even if the // nominal linkage indicates that other copies of the function have the same // semantics. In the below example, the dead load from %p may not have been - // eliminated from the linker-chosen copy of f, so replacing %p with undef + // eliminated from the linker-chosen copy of f, so replacing %p with poison // in callers may introduce undefined behavior. // // define linkonce_odr void @f(i32* %p) { // %v = load i32 %p // ret void // } - if (!Fn.hasExactDefinition()) + if (!F.hasExactDefinition()) return false; - // Functions with local linkage should already have been handled, except the - // fragile (variadic) ones which we can improve here. - if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) + // Functions with local linkage should already have been handled, except if + // they are fully alive (e.g., called indirectly) and except for the fragile + // (variadic) ones. In these cases, we may still be able to improve their + // statically known call sites. + if ((F.hasLocalLinkage() && !LiveFunctions.count(&F)) && + !F.getFunctionType()->isVarArg()) return false; // Don't touch naked functions. The assembly might be using an argument, or // otherwise rely on the frame layout in a way that this analysis will not // see. - if (Fn.hasFnAttribute(Attribute::Naked)) + if (F.hasFnAttribute(Attribute::Naked)) return false; - if (Fn.use_empty()) + if (F.use_empty()) return false; SmallVector UnusedArgs; @@ -289,35 +290,36 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) { AttributeMask UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes(); - for (Argument &Arg : Fn.args()) { + for (Argument &Arg : F.args()) { if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasPassPointeeByValueCopyAttr()) { if (Arg.isUsedByMetadata()) { - Arg.replaceAllUsesWith(UndefValue::get(Arg.getType())); + Arg.replaceAllUsesWith(PoisonValue::get(Arg.getType())); Changed = true; } UnusedArgs.push_back(Arg.getArgNo()); - Fn.removeParamAttrs(Arg.getArgNo(), UBImplyingAttributes); + F.removeParamAttrs(Arg.getArgNo(), UBImplyingAttributes); } } if (UnusedArgs.empty()) return false; - for (Use &U : Fn.uses()) { + for (Use &U : F.uses()) { CallBase *CB = dyn_cast(U.getUser()); - if (!CB || !CB->isCallee(&U)) + if (!CB || !CB->isCallee(&U) || + CB->getFunctionType() != F.getFunctionType()) continue; - // Now go through all unused args and replace them with "undef". + // Now go through all unused args and replace them with poison. 
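
The loop that follows replaces each dead argument with poison rather than undef; poison is the stronger "don't care" value and cannot be refined back into a concrete one by later passes. Condensed to its core (a sketch only; the full loop also strips UB-implying parameter attributes and bumps the statistic):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Overwrite each dead argument slot of the call with poison.
static void poisonUnusedArgs(CallBase &CB, ArrayRef<unsigned> UnusedArgNos) {
  for (unsigned ArgNo : UnusedArgNos) {
    Value *Old = CB.getArgOperand(ArgNo);
    CB.setArgOperand(ArgNo, PoisonValue::get(Old->getType()));
  }
}
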
for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) { unsigned ArgNo = UnusedArgs[I]; Value *Arg = CB->getArgOperand(ArgNo); - CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType())); + CB->setArgOperand(ArgNo, PoisonValue::get(Arg->getType())); CB->removeParamAttrs(ArgNo, UBImplyingAttributes); - ++NumArgumentsReplacedWithUndef; + ++NumArgumentsReplacedWithPoison; Changed = true; } } @@ -328,16 +330,15 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) { /// Convenience function that returns the number of return values. It returns 0 /// for void functions and 1 for functions not returning a struct. It returns /// the number of struct elements for functions returning a struct. -static unsigned NumRetVals(const Function *F) { +static unsigned numRetVals(const Function *F) { Type *RetTy = F->getReturnType(); if (RetTy->isVoidTy()) return 0; - else if (StructType *STy = dyn_cast(RetTy)) + if (StructType *STy = dyn_cast(RetTy)) return STy->getNumElements(); - else if (ArrayType *ATy = dyn_cast(RetTy)) + if (ArrayType *ATy = dyn_cast(RetTy)) return ATy->getNumElements(); - else - return 1; + return 1; } /// Returns the sub-type a function will return at a given Idx. Should @@ -349,20 +350,18 @@ static Type *getRetComponentType(const Function *F, unsigned Idx) { if (StructType *STy = dyn_cast(RetTy)) return STy->getElementType(Idx); - else if (ArrayType *ATy = dyn_cast(RetTy)) + if (ArrayType *ATy = dyn_cast(RetTy)) return ATy->getElementType(); - else - return RetTy; + return RetTy; } -/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not -/// live, it adds Use to the MaybeLiveUses argument. Returns the determined -/// liveness of Use. +/// Checks Use for liveness in LiveValues. If Use is not live, it adds Use to +/// the MaybeLiveUses argument. Returns the determined liveness of Use. DeadArgumentEliminationPass::Liveness -DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use, +DeadArgumentEliminationPass::markIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) { // We're live if our use or its Function is already marked as live. - if (IsLive(Use)) + if (isLive(Use)) return Live; // We're maybe live otherwise, but remember that we must become live if @@ -371,127 +370,127 @@ DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use, return MaybeLive; } -/// SurveyUse - This looks at a single use of an argument or return value -/// and determines if it should be alive or not. Adds this use to MaybeLiveUses -/// if it causes the used value to become MaybeLive. +/// Looks at a single use of an argument or return value and determines if it +/// should be alive or not. Adds this use to MaybeLiveUses if it causes the +/// used value to become MaybeLive. /// /// RetValNum is the return value number to use when this use is used in a /// return instruction. This is used in the recursion, you should always leave /// it at 0. DeadArgumentEliminationPass::Liveness -DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses, +DeadArgumentEliminationPass::surveyUse(const Use *U, UseVector &MaybeLiveUses, unsigned RetValNum) { - const User *V = U->getUser(); - if (const ReturnInst *RI = dyn_cast(V)) { - // The value is returned from a function. It's only live when the - // function's return value is live. We use RetValNum here, for the case - // that U is really a use of an insertvalue instruction that uses the - // original Use. 
- const Function *F = RI->getParent()->getParent(); - if (RetValNum != -1U) { - RetOrArg Use = CreateRet(F, RetValNum); - // We might be live, depending on the liveness of Use. - return MarkIfNotLive(Use, MaybeLiveUses); - } else { - DeadArgumentEliminationPass::Liveness Result = MaybeLive; - for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) { - RetOrArg Use = CreateRet(F, Ri); - // We might be live, depending on the liveness of Use. If any - // sub-value is live, then the entire value is considered live. This - // is a conservative choice, and better tracking is possible. - DeadArgumentEliminationPass::Liveness SubResult = - MarkIfNotLive(Use, MaybeLiveUses); - if (Result != Live) - Result = SubResult; - } - return Result; - } + const User *V = U->getUser(); + if (const ReturnInst *RI = dyn_cast(V)) { + // The value is returned from a function. It's only live when the + // function's return value is live. We use RetValNum here, for the case + // that U is really a use of an insertvalue instruction that uses the + // original Use. + const Function *F = RI->getParent()->getParent(); + if (RetValNum != -1U) { + RetOrArg Use = createRet(F, RetValNum); + // We might be live, depending on the liveness of Use. + return markIfNotLive(Use, MaybeLiveUses); } - if (const InsertValueInst *IV = dyn_cast(V)) { - if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex() - && IV->hasIndices()) - // The use we are examining is inserted into an aggregate. Our liveness - // depends on all uses of that aggregate, but if it is used as a return - // value, only index at which we were inserted counts. - RetValNum = *IV->idx_begin(); - - // Note that if we are used as the aggregate operand to the insertvalue, - // we don't change RetValNum, but do survey all our uses. - - Liveness Result = MaybeLive; - for (const Use &UU : IV->uses()) { - Result = SurveyUse(&UU, MaybeLiveUses, RetValNum); - if (Result == Live) - break; - } - return Result; + + DeadArgumentEliminationPass::Liveness Result = MaybeLive; + for (unsigned Ri = 0; Ri < numRetVals(F); ++Ri) { + RetOrArg Use = createRet(F, Ri); + // We might be live, depending on the liveness of Use. If any + // sub-value is live, then the entire value is considered live. This + // is a conservative choice, and better tracking is possible. + DeadArgumentEliminationPass::Liveness SubResult = + markIfNotLive(Use, MaybeLiveUses); + if (Result != Live) + Result = SubResult; + } + return Result; + } + + if (const InsertValueInst *IV = dyn_cast(V)) { + if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex() && + IV->hasIndices()) + // The use we are examining is inserted into an aggregate. Our liveness + // depends on all uses of that aggregate, but if it is used as a return + // value, only index at which we were inserted counts. + RetValNum = *IV->idx_begin(); + + // Note that if we are used as the aggregate operand to the insertvalue, + // we don't change RetValNum, but do survey all our uses. + + Liveness Result = MaybeLive; + for (const Use &UU : IV->uses()) { + Result = surveyUse(&UU, MaybeLiveUses, RetValNum); + if (Result == Live) + break; } + return Result; + } - if (const auto *CB = dyn_cast(V)) { - const Function *F = CB->getCalledFunction(); - if (F) { - // Used in a direct call. + if (const auto *CB = dyn_cast(V)) { + const Function *F = CB->getCalledFunction(); + if (F) { + // Used in a direct call. - // The function argument is live if it is used as a bundle operand. 
- if (CB->isBundleOperand(U)) - return Live; + // The function argument is live if it is used as a bundle operand. + if (CB->isBundleOperand(U)) + return Live; - // Find the argument number. We know for sure that this use is an - // argument, since if it was the function argument this would be an - // indirect call and the we know can't be looking at a value of the - // label type (for the invoke instruction). - unsigned ArgNo = CB->getArgOperandNo(U); + // Find the argument number. We know for sure that this use is an + // argument, since if it was the function argument this would be an + // indirect call and that we know can't be looking at a value of the + // label type (for the invoke instruction). + unsigned ArgNo = CB->getArgOperandNo(U); - if (ArgNo >= F->getFunctionType()->getNumParams()) - // The value is passed in through a vararg! Must be live. - return Live; + if (ArgNo >= F->getFunctionType()->getNumParams()) + // The value is passed in through a vararg! Must be live. + return Live; - assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) && - "Argument is not where we expected it"); + assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) && + "Argument is not where we expected it"); - // Value passed to a normal call. It's only live when the corresponding - // argument to the called function turns out live. - RetOrArg Use = CreateArg(F, ArgNo); - return MarkIfNotLive(Use, MaybeLiveUses); - } + // Value passed to a normal call. It's only live when the corresponding + // argument to the called function turns out live. + RetOrArg Use = createArg(F, ArgNo); + return markIfNotLive(Use, MaybeLiveUses); } - // Used in any other way? Value must be live. - return Live; + } + // Used in any other way? Value must be live. + return Live; } -/// SurveyUses - This looks at all the uses of the given value +/// Looks at all the uses of the given value /// Returns the Liveness deduced from the uses of this value. /// /// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If /// the result is Live, MaybeLiveUses might be modified but its content should /// be ignored (since it might not be complete). DeadArgumentEliminationPass::Liveness -DeadArgumentEliminationPass::SurveyUses(const Value *V, +DeadArgumentEliminationPass::surveyUses(const Value *V, UseVector &MaybeLiveUses) { // Assume it's dead (which will only hold if there are no uses at all..). Liveness Result = MaybeLive; // Check each use. for (const Use &U : V->uses()) { - Result = SurveyUse(&U, MaybeLiveUses); + Result = surveyUse(&U, MaybeLiveUses); if (Result == Live) break; } return Result; } -// SurveyFunction - This performs the initial survey of the specified function, -// checking out whether or not it uses any of its incoming arguments or whether -// any callers use the return value. This fills in the LiveValues set and Uses -// map. -// -// We consider arguments of non-internal functions to be intrinsically alive as -// well as arguments to functions which have their "address taken". -void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { +/// Performs the initial survey of the specified function, checking out whether +/// it uses any of its incoming arguments or whether any callers use the return +/// value. This fills in the LiveValues set and Uses map. +/// +/// We consider arguments of non-internal functions to be intrinsically alive as +/// well as arguments to functions which have their "address taken". 
+void DeadArgumentEliminationPass::surveyFunction(const Function &F) { // Functions with inalloca/preallocated parameters are expecting args in a // particular register and memory layout. if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) || F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { - MarkLive(F); + markLive(F); return; } @@ -499,11 +498,11 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // otherwise rely on the frame layout in a way that this analysis will not // see. if (F.hasFnAttribute(Attribute::Naked)) { - MarkLive(F); + markLive(F); return; } - unsigned RetCount = NumRetVals(&F); + unsigned RetCount = numRetVals(&F); // Assume all return values are dead using RetVals = SmallVector; @@ -518,20 +517,10 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { RetUses MaybeLiveRetUses(RetCount); bool HasMustTailCalls = false; - - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (const ReturnInst *RI = dyn_cast(BB->getTerminator())) { - if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType() - != F.getFunctionType()->getReturnType()) { - // We don't support old style multiple return values. - MarkLive(F); - return; - } - } - + for (const BasicBlock &BB : F) { // If we have any returns of `musttail` results - the signature can't // change - if (BB->getTerminatingMustTailCall() != nullptr) + if (BB.getTerminatingMustTailCall() != nullptr) HasMustTailCalls = true; } @@ -541,7 +530,7 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { } if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) { - MarkLive(F); + markLive(F); return; } @@ -559,8 +548,9 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // If the function is PASSED IN as an argument, its address has been // taken. const auto *CB = dyn_cast(U.getUser()); - if (!CB || !CB->isCallee(&U)) { - MarkLive(F); + if (!CB || !CB->isCallee(&U) || + CB->getFunctionType() != F.getFunctionType()) { + markLive(F); return; } @@ -577,13 +567,13 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { continue; // Check all uses of the return value. - for (const Use &U : CB->uses()) { - if (ExtractValueInst *Ext = dyn_cast(U.getUser())) { + for (const Use &UU : CB->uses()) { + if (ExtractValueInst *Ext = dyn_cast(UU.getUser())) { // This use uses a part of our return value, survey the uses of // that part and store the results for this index only. unsigned Idx = *Ext->idx_begin(); if (RetValLiveness[Idx] != Live) { - RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); + RetValLiveness[Idx] = surveyUses(Ext, MaybeLiveRetUses[Idx]); if (RetValLiveness[Idx] == Live) NumLiveRetVals++; } @@ -591,16 +581,16 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // Used by something else than extractvalue. Survey, but assume that the // result applies to all sub-values. 
UseVector MaybeLiveAggregateUses; - if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) { + if (surveyUse(&UU, MaybeLiveAggregateUses) == Live) { NumLiveRetVals = RetCount; RetValLiveness.assign(RetCount, Live); break; - } else { - for (unsigned Ri = 0; Ri != RetCount; ++Ri) { - if (RetValLiveness[Ri] != Live) - MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(), - MaybeLiveAggregateUses.end()); - } + } + + for (unsigned Ri = 0; Ri != RetCount; ++Ri) { + if (RetValLiveness[Ri] != Live) + MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(), + MaybeLiveAggregateUses.end()); } } } @@ -613,7 +603,7 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // Now we've inspected all callers, record the liveness of our return values. for (unsigned Ri = 0; Ri != RetCount; ++Ri) - MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]); + markValue(createRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]); LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: " << F.getName() << "\n"); @@ -641,81 +631,77 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { } else { // See what the effect of this use is (recording any uses that cause // MaybeLive in MaybeLiveArgUses). - Result = SurveyUses(&*AI, MaybeLiveArgUses); + Result = surveyUses(&*AI, MaybeLiveArgUses); } // Mark the result. - MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses); + markValue(createArg(&F, ArgI), Result, MaybeLiveArgUses); // Clear the vector again for the next iteration. MaybeLiveArgUses.clear(); } } -/// MarkValue - This function marks the liveness of RA depending on L. If L is -/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses, -/// such that RA will be marked live if any use in MaybeLiveUses gets marked -/// live later on. -void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L, +/// Marks the liveness of RA depending on L. If L is MaybeLive, it also takes +/// all uses in MaybeLiveUses and records them in Uses, such that RA will be +/// marked live if any use in MaybeLiveUses gets marked live later on. +void DeadArgumentEliminationPass::markValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses) { switch (L) { - case Live: - MarkLive(RA); - break; - case MaybeLive: - assert(!IsLive(RA) && "Use is already live!"); - for (const auto &MaybeLiveUse : MaybeLiveUses) { - if (IsLive(MaybeLiveUse)) { - // A use is live, so this value is live. - MarkLive(RA); - break; - } else { - // Note any uses of this value, so this value can be - // marked live whenever one of the uses becomes live. - Uses.insert(std::make_pair(MaybeLiveUse, RA)); - } + case Live: + markLive(RA); + break; + case MaybeLive: + assert(!isLive(RA) && "Use is already live!"); + for (const auto &MaybeLiveUse : MaybeLiveUses) { + if (isLive(MaybeLiveUse)) { + // A use is live, so this value is live. + markLive(RA); + break; } - break; + // Note any uses of this value, so this value can be + // marked live whenever one of the uses becomes live. + Uses.emplace(MaybeLiveUse, RA); + } + break; } } -/// MarkLive - Mark the given Function as alive, meaning that it cannot be -/// changed in any way. Additionally, -/// mark any values that are used as this function's parameters or by its return -/// values (according to Uses) live as well. -void DeadArgumentEliminationPass::MarkLive(const Function &F) { +/// Mark the given Function as alive, meaning that it cannot be changed in any +/// way. 
Additionally, mark any values that are used as this function's +/// parameters or by its return values (according to Uses) live as well. +void DeadArgumentEliminationPass::markLive(const Function &F) { LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: " << F.getName() << "\n"); // Mark the function as live. LiveFunctions.insert(&F); // Mark all arguments as live. for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI) - PropagateLiveness(CreateArg(&F, ArgI)); + propagateLiveness(createArg(&F, ArgI)); // Mark all return values as live. - for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri) - PropagateLiveness(CreateRet(&F, Ri)); + for (unsigned Ri = 0, E = numRetVals(&F); Ri != E; ++Ri) + propagateLiveness(createRet(&F, Ri)); } -/// MarkLive - Mark the given return value or argument as live. Additionally, -/// mark any values that are used by this value (according to Uses) live as -/// well. -void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) { - if (IsLive(RA)) +/// Mark the given return value or argument as live. Additionally, mark any +/// values that are used by this value (according to Uses) live as well. +void DeadArgumentEliminationPass::markLive(const RetOrArg &RA) { + if (isLive(RA)) return; // Already marked Live. LiveValues.insert(RA); LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking " << RA.getDescription() << " live\n"); - PropagateLiveness(RA); + propagateLiveness(RA); } -bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) { +bool DeadArgumentEliminationPass::isLive(const RetOrArg &RA) { return LiveFunctions.count(RA.F) || LiveValues.count(RA); } -/// PropagateLiveness - Given that RA is a live value, propagate it's liveness -/// to any other values it uses (according to Uses). -void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) { +/// Given that RA is a live value, propagate it's liveness to any other values +/// it uses (according to Uses). +void DeadArgumentEliminationPass::propagateLiveness(const RetOrArg &RA) { // We don't use upper_bound (or equal_range) here, because our recursive call // to ourselves is likely to cause the upper_bound (which is the first value // not belonging to RA) to become erased and the iterator invalidated. @@ -723,18 +709,17 @@ void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) { UseMap::iterator E = Uses.end(); UseMap::iterator I; for (I = Begin; I != E && I->first == RA; ++I) - MarkLive(I->second); + markLive(I->second); // Erase RA from the Uses map (from the lower bound to wherever we ended up // after the loop). Uses.erase(Begin, I); } -// RemoveDeadStuffFromFunction - Remove any arguments and return values from F -// that are not in LiveValues. Transform the function and all of the callees of -// the function to not have these arguments and return values. -// -bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { +/// Remove any arguments and return values from F that are not in LiveValues. +/// Transform the function and all the callees of the function to not have these +/// arguments and return values. +bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // Don't modify fully live functions if (LiveFunctions.count(F)) return false; @@ -742,7 +727,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Start by computing a new prototype for the function, which is the same as // the old function, but has fewer arguments and a different return type. 
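// A standalone sketch (plain C++ with a stand-in key type, not the code from
// this patch) of the propagation scheme above: Uses maps a maybe-live value
// to the values whose liveness depends on it. Marking a value live drains
// its bucket and recurses; the loop re-checks against end() on each step
// instead of caching an upper bound that erase() could invalidate, which is
// exactly the hazard the comment above describes.
#include <map>
#include <set>

using RetOrArg = int; // stand-in identifying a return index or argument

inline void markLive(RetOrArg RA, std::set<RetOrArg> &LiveValues,
                     std::multimap<RetOrArg, RetOrArg> &Uses) {
  if (!LiveValues.insert(RA).second)
    return; // already live, nothing to propagate
  auto It = Uses.lower_bound(RA);
  while (It != Uses.end() && It->first == RA) {
    RetOrArg Dependent = It->second;
    It = Uses.erase(It); // erase first; recursion never touches key RA again
    markLive(Dependent, LiveValues, Uses);
  }
}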
FunctionType *FTy = F->getFunctionType(); - std::vector Params; + std::vector Params; // Keep track of if we have a live 'returned' argument bool HasLiveReturnedArg = false; @@ -759,7 +744,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { unsigned ArgI = 0; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++ArgI) { - RetOrArg Arg = CreateArg(F, ArgI); + RetOrArg Arg = createArg(F, ArgI); if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[ArgI] = true; @@ -776,11 +761,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Find out the new return value. Type *RetTy = FTy->getReturnType(); Type *NRetTy = nullptr; - unsigned RetCount = NumRetVals(F); + unsigned RetCount = numRetVals(F); // -1 means unused, other numbers are the new index SmallVector NewRetIdxs(RetCount, -1); - std::vector RetTypes; + std::vector RetTypes; // If there is a function with a live 'returned' argument but a dead return // value, then there are two possible actions: @@ -792,9 +777,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // It's not clear in the general case which option is more profitable because, // even in the absence of explicit uses of the return value, code generation // is free to use the 'returned' attribute to do things like eliding - // save/restores of registers across calls. Whether or not this happens is - // target and ABI-specific as well as depending on the amount of register - // pressure, so there's no good way for an IR-level pass to figure this out. + // save/restores of registers across calls. Whether this happens is target and + // ABI-specific as well as depending on the amount of register pressure, so + // there's no good way for an IR-level pass to figure this out. // // Fortunately, the only places where 'returned' is currently generated by // the FE are places where 'returned' is basically free and almost always a @@ -806,7 +791,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { } else { // Look at each of the original return values individually. for (unsigned Ri = 0; Ri != RetCount; ++Ri) { - RetOrArg Ret = CreateRet(F, Ri); + RetOrArg Ret = createRet(F, Ri); if (LiveValues.erase(Ret)) { RetTypes.push_back(getRetComponentType(F, Ri)); NewRetIdxs[Ri] = RetTypes.size() - 1; @@ -879,9 +864,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - // Loop over all of the callers of the function, transforming the call sites - // to pass in a smaller number of arguments into the new function. - std::vector Args; + // Loop over all the callers of the function, transforming the call sites to + // pass in a smaller number of arguments into the new function. + std::vector Args; while (!F->use_empty()) { CallBase &CB = cast(*F->user_back()); @@ -896,7 +881,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Declare these outside of the loops, so we can reuse them for the second // loop, which loops the varargs. - auto I = CB.arg_begin(); + auto *I = CB.arg_begin(); unsigned Pi = 0; // Loop over those operands, corresponding to the normal arguments to the // original function, and add those that are still alive. 
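// A standalone sketch (plain C++, with a string standing in for llvm::Type *)
// of the signature shrinking performed above: the liveness mask computed
// during the survey selects which of the original parameter types survive
// into the new prototype; dead arguments are simply not carried over.
#include <cstddef>
#include <vector>
#include <string>

using TypeId = std::string; // stand-in for llvm::Type *

inline std::vector<TypeId>
buildLiveParams(const std::vector<TypeId> &OldParams,
                const std::vector<bool> &ArgAlive) {
  std::vector<TypeId> Params;
  for (std::size_t I = 0; I != OldParams.size(); ++I)
    if (ArgAlive[I])
      Params.push_back(OldParams[I]);
  return Params;
}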
@@ -909,11 +894,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // If the return type has changed, then get rid of 'returned' on the // call site. The alternative is to make all 'returned' attributes on // call sites keep the return value alive just like 'returned' - // attributes on function declaration but it's less clearly a win and + // attributes on function declaration, but it's less clearly a win and // this is not an expected case anyway ArgAttrVec.push_back(AttributeSet::get( - F->getContext(), - AttrBuilder(F->getContext(), Attrs).removeAttribute(Attribute::Returned))); + F->getContext(), AttrBuilder(F->getContext(), Attrs) + .removeAttribute(Attribute::Returned))); } else { // Otherwise, use the original attributes. ArgAttrVec.push_back(Attrs); @@ -921,7 +906,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { } // Push any varargs arguments on the list. Don't forget their attributes. - for (auto E = CB.arg_end(); I != E; ++I, ++Pi) { + for (auto *E = CB.arg_end(); I != E; ++I, ++Pi) { Args.push_back(*I); ArgAttrVec.push_back(CallPAL.getParamAttrs(Pi)); } @@ -934,8 +919,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { AttributeSet FnAttrs = CallPAL.getFnAttrs().removeAttribute( F->getContext(), Attribute::AllocSize); - AttributeList NewCallPAL = AttributeList::get( - F->getContext(), FnAttrs, RetAttrs, ArgAttrVec); + AttributeList NewCallPAL = + AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec); SmallVector OpBundles; CB.getOperandBundlesAsDefs(OpBundles); @@ -961,10 +946,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { CB.replaceAllUsesWith(NewCB); NewCB->takeName(&CB); } else if (NewCB->getType()->isVoidTy()) { - // If the return value is dead, replace any uses of it with undef + // If the return value is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). if (!CB.getType()->isX86_MMXTy()) - CB.replaceAllUsesWith(UndefValue::get(CB.getType())); + CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); } else { assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" @@ -980,8 +965,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // with all the uses, we will just rebuild it using extract/insertvalue // chaining and let instcombine clean that up. // - // Start out building up our return value from undef - Value *RetVal = UndefValue::get(RetTy); + // Start out building up our return value from poison + Value *RetVal = PoisonValue::get(RetTy); for (unsigned Ri = 0; Ri != RetCount; ++Ri) if (NewRetIdxs[Ri] != -1) { Value *V; @@ -1026,10 +1011,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { I2->takeName(&*I); ++I2; } else { - // If this argument is dead, replace any uses of it with undef + // If this argument is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). if (!I->getType()->isX86_MMXTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); } // If we change the return value of the function we must rewrite any return @@ -1048,8 +1033,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // This does generate messy code, but we'll let it to instcombine to // clean that up. 
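// A standalone sketch (plain C++) of the index remapping used when rebuilding
// aggregate returns above: each original return index either maps to its
// position in the compacted return struct or to -1 when that sub-value was
// found dead, matching the NewRetIdxs convention in the hunks.
#include <cstddef>
#include <vector>

inline std::vector<int> buildRetIndexMap(const std::vector<bool> &RetAlive) {
  std::vector<int> NewRetIdxs(RetAlive.size(), -1);
  int Next = 0;
  for (std::size_t Ri = 0; Ri != RetAlive.size(); ++Ri)
    if (RetAlive[Ri])
      NewRetIdxs[Ri] = Next++; // live sub-values keep a slot, shifted down
  return NewRetIdxs;
}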
Value *OldRet = RI->getOperand(0); - // Start out building up our return value from undef - RetVal = UndefValue::get(NRetTy); + // Start out building up our return value from poison + RetVal = PoisonValue::get(NRetTy); for (unsigned RetI = 0; RetI != RetCount; ++RetI) if (NewRetIdxs[RetI] != -1) { Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret"); @@ -1074,12 +1059,22 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { BB.getInstList().erase(RI); } - // Clone metadatas from the old function, including debug info descriptor. + // Clone metadata from the old function, including debug info descriptor. SmallVector, 1> MDs; F->getAllMetadata(MDs); for (auto MD : MDs) NF->addMetadata(MD.first, *MD.second); + // If either the return value(s) or argument(s) are removed, then probably the + // function does not follow standard calling conventions anymore. Hence, add + // DW_CC_nocall to DISubroutineType to inform debugger that it may not be safe + // to call this function or try to interpret the return value. + if (NFTy != FTy && NF->getSubprogram()) { + DISubprogram *SP = NF->getSubprogram(); + auto Temp = SP->getType()->cloneWithCC(llvm::dwarf::DW_CC_nocall); + SP->replaceType(MDNode::replaceWithPermanent(std::move(Temp))); + } + // Now that the old function is dead, delete it. F->eraseFromParent(); @@ -1097,26 +1092,25 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M, LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n"); for (Function &F : llvm::make_early_inc_range(M)) if (F.getFunctionType()->isVarArg()) - Changed |= DeleteDeadVarargs(F); + Changed |= deleteDeadVarargs(F); - // Second phase:loop through the module, determining which arguments are live. - // We assume all arguments are dead unless proven otherwise (allowing us to - // determine that dead arguments passed into recursive functions are dead). - // + // Second phase: Loop through the module, determining which arguments are + // live. We assume all arguments are dead unless proven otherwise (allowing us + // to determine that dead arguments passed into recursive functions are dead). LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n"); for (auto &F : M) - SurveyFunction(F); + surveyFunction(F); // Now, remove all dead arguments and return values from each function in // turn. We use make_early_inc_range here because functions will probably get // removed (i.e. replaced by new ones). for (Function &F : llvm::make_early_inc_range(M)) - Changed |= RemoveDeadStuffFromFunction(&F); + Changed |= removeDeadStuffFromFunction(&F); // Finally, look for any unused parameters in functions with non-local - // linkage and replace the passed in parameters with undef. + // linkage and replace the passed in parameters with poison. 
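// A standalone skeleton (plain C++ with trivial stubs) of the four-phase
// pipeline the run() hunks above describe. The phase names match the
// functions in this patch; everything else is a stand-in. The real pass
// iterates with make_early_inc_range because rewritten functions are
// replaced by new ones mid-walk; the stub module below sidesteps that.
#include <vector>

struct Function {};
struct Module { std::vector<Function> Fns; };

// Trivial stubs standing in for the phases named above.
inline bool deleteDeadVarargs(Function &) { return false; }
inline void surveyFunction(const Function &) {}
inline bool removeDeadStuffFromFunction(Function *) { return false; }
inline bool removeDeadArgumentsFromCallers(Function &) { return false; }

inline bool runDeadArgElim(Module &M) {
  bool Changed = false;
  for (Function &F : M.Fns)
    Changed |= deleteDeadVarargs(F);              // phase 1: dead varargs
  for (Function &F : M.Fns)
    surveyFunction(F);                            // phase 2: liveness survey
  for (Function &F : M.Fns)
    Changed |= removeDeadStuffFromFunction(&F);   // phase 3: rewrite
  for (Function &F : M.Fns)
    Changed |= removeDeadArgumentsFromCallers(F); // phase 4: caller cleanup
  return Changed;
}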
for (auto &F : M) - Changed |= RemoveDeadArgumentsFromCallers(F); + Changed |= removeDeadArgumentsFromCallers(F); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/IPO/ExtractGV.cpp b/llvm/lib/Transforms/IPO/ExtractGV.cpp index 387f114f6ffa..84280781ee70 100644 --- a/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SetVector.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" diff --git a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index 16d00a0c89e1..b10c2ea13469 100644 --- a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -8,9 +8,9 @@ #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 213a998d5bba..49077f92884f 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -30,7 +30,6 @@ #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/LazyCallGraph.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -69,6 +69,7 @@ using namespace llvm; #define DEBUG_TYPE "function-attrs" +STATISTIC(NumArgMemOnly, "Number of functions marked argmemonly"); STATISTIC(NumReadNone, "Number of functions marked readnone"); STATISTIC(NumReadOnly, "Number of functions marked readonly"); STATISTIC(NumWriteOnly, "Number of functions marked writeonly"); @@ -121,28 +122,28 @@ using SCCNodeSet = SmallSetVector; /// result will be based only on AA results for the function declaration; it /// will be assumed that some other (perhaps less optimized) version of the /// function may be selected at link time. -static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, - AAResults &AAR, - const SCCNodeSet &SCCNodes) { +static FunctionModRefBehavior +checkFunctionMemoryAccess(Function &F, bool ThisBody, AAResults &AAR, + const SCCNodeSet &SCCNodes) { FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); if (MRB == FMRB_DoesNotAccessMemory) // Already perfect! - return MAK_ReadNone; + return MRB; - if (!ThisBody) { - if (AliasAnalysis::onlyReadsMemory(MRB)) - return MAK_ReadOnly; - - if (AliasAnalysis::onlyWritesMemory(MRB)) - return MAK_WriteOnly; - - // Conservatively assume it reads and writes to memory. - return MAK_MayWrite; - } + if (!ThisBody) + return MRB; // Scan the function body for instructions that may read or write memory. bool ReadsMemory = false; bool WritesMemory = false; + // Track if the function accesses memory not based on pointer arguments or + // allocas. 
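// A standalone sketch (plain C++, simplified object model, not the LLVM code)
// of the classification the FunctionAttrs hunks below introduce: an access
// counts as "argument or alloca memory" when the pointer, traced to its
// underlying object, is a function argument or a local allocation.
enum class ObjKind { Argument, Alloca, Global, Unknown };
struct Obj { ObjKind Kind = ObjKind::Unknown; const Obj *Base = nullptr; };

// Walk simple derivation chains, a stand-in for getUnderlyingObject.
inline const Obj *underlyingObject(const Obj *P) {
  while (P->Base)
    P = P->Base;
  return P;
}

inline bool isArgumentOrAllocaMemory(const Obj *Ptr) {
  ObjKind K = underlyingObject(Ptr)->Kind;
  return K == ObjKind::Argument || K == ObjKind::Alloca;
}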
+ bool AccessesNonArgsOrAlloca = false; + // Returns true if Ptr is not based on a function argument. + auto IsArgumentOrAlloca = [](const Value *Ptr) { + const Value *UO = getUnderlyingObject(Ptr); + return isa(UO) || isa(UO); + }; for (Instruction &I : instructions(F)) { // Some instructions can be ignored even if they read or write memory. // Detect these now, skipping to the next instruction if one is found. @@ -175,6 +176,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, // If it reads, note it. if (isRefSet(MRI)) ReadsMemory = true; + AccessesNonArgsOrAlloca = true; continue; } @@ -187,12 +189,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, I.getAAMetadata()); - // Skip accesses to local or constant memory as they don't impact the // externally visible mod/ref behavior. if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; + AccessesNonArgsOrAlloca |= !IsArgumentOrAlloca(Loc.Ptr); + if (isModSet(MRI)) // Writes non-local memory. WritesMemory = true; @@ -202,24 +205,29 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, } continue; } else if (LoadInst *LI = dyn_cast(&I)) { + MemoryLocation Loc = MemoryLocation::get(LI); // Ignore non-volatile loads from local memory. (Atomic is okay here.) - if (!LI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(LI); - if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } + if (!LI->isVolatile() && + AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + AccessesNonArgsOrAlloca |= !IsArgumentOrAlloca(Loc.Ptr); } else if (StoreInst *SI = dyn_cast(&I)) { + MemoryLocation Loc = MemoryLocation::get(SI); // Ignore non-volatile stores to local memory. (Atomic is okay here.) - if (!SI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(SI); - if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } + if (!SI->isVolatile() && + AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + AccessesNonArgsOrAlloca |= !IsArgumentOrAlloca(Loc.Ptr); } else if (VAArgInst *VI = dyn_cast(&I)) { // Ignore vaargs on local memory. MemoryLocation Loc = MemoryLocation::get(VI); if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; + AccessesNonArgsOrAlloca |= !IsArgumentOrAlloca(Loc.Ptr); + } else { + // If AccessesNonArgsOrAlloca has not been updated above, set it + // conservatively. + AccessesNonArgsOrAlloca |= I.mayReadOrWriteMemory(); } // Any remaining instructions need to be taken seriously! Check if they @@ -232,61 +240,74 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, ReadsMemory |= I.mayReadFromMemory(); } - if (WritesMemory) { - if (!ReadsMemory) - return MAK_WriteOnly; - else - return MAK_MayWrite; - } - - return ReadsMemory ? 
MAK_ReadOnly : MAK_ReadNone;
+  if (!WritesMemory && !ReadsMemory)
+    return FMRB_DoesNotAccessMemory;
+
+  FunctionModRefBehavior Result = FunctionModRefBehavior(FMRL_Anywhere);
+  if (!AccessesNonArgsOrAlloca)
+    Result = FunctionModRefBehavior(FMRL_ArgumentPointees);
+  if (WritesMemory)
+    Result = FunctionModRefBehavior(Result | static_cast<int>(ModRefInfo::Mod));
+  if (ReadsMemory)
+    Result = FunctionModRefBehavior(Result | static_cast<int>(ModRefInfo::Ref));
+  return Result;
 }

-MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
-                                                       AAResults &AAR) {
+FunctionModRefBehavior llvm::computeFunctionBodyMemoryAccess(Function &F,
+                                                             AAResults &AAR) {
   return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
 }

-/// Deduce readonly/readnone attributes for the SCC.
+/// Deduce readonly/readnone/writeonly attributes for the SCC.
 template <typename AARGetterT>
-static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
-                         SmallSet<Function *, 8> &Changed) {
+static void addMemoryAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
+                           SmallSet<Function *, 8> &Changed) {
   // Check if any of the functions in the SCC read or write memory. If they
   // write memory then they can't be marked readnone or readonly.
   bool ReadsMemory = false;
   bool WritesMemory = false;
+  // Check if all functions only access memory through their arguments.
+  bool ArgMemOnly = true;
   for (Function *F : SCCNodes) {
     // Call the callable parameter to look up AA results for this function.
     AAResults &AAR = AARGetter(*F);
-
     // Non-exact function definitions may not be selected at link time, and an
     // alternative version that writes to memory may be selected. See the
     // comment on GlobalValue::isDefinitionExact for more details.
-    switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
-                                      AAR, SCCNodes)) {
-    case MAK_MayWrite:
+    FunctionModRefBehavior FMRB =
+        checkFunctionMemoryAccess(*F, F->hasExactDefinition(), AAR, SCCNodes);
+    if (FMRB == FMRB_DoesNotAccessMemory)
+      continue;
+    ModRefInfo MR = createModRefInfo(FMRB);
+    ReadsMemory |= isRefSet(MR);
+    WritesMemory |= isModSet(MR);
+    ArgMemOnly &= AliasAnalysis::onlyAccessesArgPointees(FMRB);
+    // At this point, neither readnone, readonly, writeonly nor argmemonly can
+    // be inferred. Exit.
+    if (ReadsMemory && WritesMemory && !ArgMemOnly)
       return;
-    case MAK_ReadOnly:
-      ReadsMemory = true;
-      break;
-    case MAK_WriteOnly:
-      WritesMemory = true;
-      break;
-    case MAK_ReadNone:
-      // Nothing to do!
-      break;
-    }
   }

-  // If the SCC contains both functions that read and functions that write, then
-  // we cannot add readonly attributes.
-  if (ReadsMemory && WritesMemory)
-    return;
-
-  // Success! Functions in this SCC do not access memory, or only read memory.
-  // Give them the appropriate attribute.
+  assert((!ReadsMemory || !WritesMemory || ArgMemOnly) &&
+         "no memory attributes can be added for this SCC, should have exited "
+         "earlier");
+  // Success! Functions in this SCC do not access memory, only read memory,
+  // only write memory, or only access memory through their arguments. Give
+  // them the appropriate attribute.
   for (Function *F : SCCNodes) {
+    // If possible, add the argmemonly attribute to F if it accesses memory.
+    if (ArgMemOnly && !F->onlyAccessesArgMemory() &&
+        (ReadsMemory || WritesMemory)) {
+      NumArgMemOnly++;
+      F->addFnAttr(Attribute::ArgMemOnly);
+      Changed.insert(F);
+    }
+
+    // The SCC contains functions both writing and reading from memory. We
+    // cannot add readonly or writeonly attributes.
+    if (ReadsMemory && WritesMemory)
+      continue;
     if (F->doesNotAccessMemory())
       // Already perfect!
continue; @@ -1614,6 +1635,26 @@ static bool basicBlockCanReturn(BasicBlock &BB) { return none_of(BB, instructionDoesNotReturn); } +// FIXME: this doesn't handle recursion. +static bool canReturn(Function &F) { + SmallVector Worklist; + SmallPtrSet Visited; + + Visited.insert(&F.front()); + Worklist.push_back(&F.front()); + + do { + BasicBlock *BB = Worklist.pop_back_val(); + if (basicBlockCanReturn(*BB)) + return true; + for (BasicBlock *Succ : successors(BB)) + if (Visited.insert(Succ).second) + Worklist.push_back(Succ); + } while (!Worklist.empty()); + + return false; +} + // Set the noreturn function attribute if possible. static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, SmallSet &Changed) { @@ -1622,9 +1663,7 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, F->doesNotReturn()) continue; - // The function can return if any basic blocks can return. - // FIXME: this doesn't handle recursion or unreachable blocks. - if (none_of(*F, basicBlockCanReturn)) { + if (!canReturn(*F)) { F->setDoesNotReturn(); Changed.insert(F); } @@ -1792,7 +1831,7 @@ deriveAttrsInPostOrder(ArrayRef Functions, AARGetterT &&AARGetter) { SmallSet Changed; addArgumentReturnedAttrs(Nodes.SCCNodes, Changed); - addReadAttrs(Nodes.SCCNodes, AARGetter, Changed); + addMemoryAttrs(Nodes.SCCNodes, AARGetter, Changed); addArgumentAttrs(Nodes.SCCNodes, Changed); inferConvergent(Nodes.SCCNodes, Changed); addNoReturnAttrs(Nodes.SCCNodes, Changed); @@ -1896,6 +1935,7 @@ struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass { char PostOrderFunctionAttrsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "function-attrs", "Deduce function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "function-attrs", @@ -1975,12 +2015,13 @@ static bool addNoRecurseAttrsTopDown(Function &F) { // this function could be recursively (indirectly) called. Note that this // also detects if F is directly recursive as F is not yet marked as // a norecurse function. 
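// A standalone sketch (plain C++) of the reachability walk the canReturn
// hunk above adds: starting from the entry block, visit each successor once
// and report whether any *reachable* block can return. This is the point of
// the fix, since scanning all blocks would count unreachable returns too.
#include <unordered_set>
#include <vector>

struct Block { std::vector<Block *> Succs; bool CanReturn = false; };

inline bool anyReachableCanReturn(Block *Entry) {
  std::vector<Block *> Worklist{Entry};
  std::unordered_set<Block *> Visited{Entry};
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (B->CanReturn)
      return true;
    for (Block *Succ : B->Succs)
      if (Visited.insert(Succ).second) // enqueue on first visit only
        Worklist.push_back(Succ);
  }
  return false;
}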
- for (auto *U : F.users()) { - auto *I = dyn_cast(U); + for (auto &U : F.uses()) { + auto *I = dyn_cast(U.getUser()); if (!I) return false; CallBase *CB = dyn_cast(I); - if (!CB || !CB->getParent()->getParent()->doesNotRecurse()) + if (!CB || !CB->isCallee(&U) || + !CB->getParent()->getParent()->doesNotRecurse()) return false; } F.setDoesNotRecurse(); diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index d9b43109f629..56e2df14ff38 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/Constants.h" @@ -33,8 +32,6 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/InitializePasses.h" #include "llvm/Linker/IRMover.h" -#include "llvm/Object/ModuleSymbolTable.h" -#include "llvm/Object/SymbolicFile.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -1112,12 +1109,13 @@ void llvm::thinLTOFinalizeInModule(Module &TheModule, llvm_unreachable("Expected GV to be converted"); } else { // If all copies of the original symbol had global unnamed addr and - // linkonce_odr linkage, it should be an auto hide symbol. In that case - // the thin link would have marked it as CanAutoHide. Add hidden visibility - // to the symbol to preserve the property. + // linkonce_odr linkage, or if all of them had local unnamed addr linkage + // and are constants, then it should be an auto hide symbol. In that case + // the thin link would have marked it as CanAutoHide. Add hidden + // visibility to the symbol to preserve the property. if (NewLinkage == GlobalValue::WeakODRLinkage && GS->second->canAutoHide()) { - assert(GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr()); + assert(GV.canBeOmittedFromSymbolTable()); GV.setVisibility(GlobalValue::HiddenVisibility); } @@ -1330,10 +1328,9 @@ Expected FunctionImporter::importFunctions( << " from " << SrcModule->getSourceFileName() << "\n"; } - if (Error Err = Mover.move( - std::move(SrcModule), GlobalsToImport.getArrayRef(), - [](GlobalValue &, IRMover::ValueAdder) {}, - /*IsPerformingImport=*/true)) + if (Error Err = Mover.move(std::move(SrcModule), + GlobalsToImport.getArrayRef(), nullptr, + /*IsPerformingImport=*/true)) report_fatal_error(Twine("Function Import: link error: ") + toString(std::move(Err))); diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 6c3cc3914337..dafd0dc865a2 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -19,11 +19,8 @@ // Current limitations: // - It does not yet handle integer ranges. We do support "literal constants", // but that's off by default under an option. -// - Only 1 argument per function is specialised, // - The cost-model could be further looked into (it mainly focuses on inlining // benefits), -// - We are not yet caching analysis results, but profiling and checking where -// extra compile time is spent didn't suggest this to be a problem. 
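// A standalone sketch (plain C++, simplified stand-in types) of the
// strengthened norecurse check above: a function may be marked norecurse
// only if every use is the callee operand of a call (so its address never
// escapes) and every caller is itself already known not to recurse.
#include <vector>

struct Fn;
struct UseSite { const Fn *Caller; bool IsCalleeOperand; };
struct Fn { bool DoesNotRecurse = false; std::vector<UseSite> Uses; };

inline bool canMarkNoRecurse(const Fn &F) {
  for (const UseSite &U : F.Uses)
    // A non-callee use lets the address escape; a possibly recursive caller
    // could reach F again through itself.
    if (!U.IsCalleeOperand || !U.Caller->DoesNotRecurse)
      return false;
  return true;
}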
// // Ideas: // - With a function specialization attribute for arguments, we could have @@ -49,15 +46,16 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueLattice.h" +#include "llvm/Analysis/ValueLatticeUtils.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SCCPSolver.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include @@ -98,8 +96,13 @@ static cl::opt SpecializeOnAddresses( "func-specialization-on-address", cl::init(false), cl::Hidden, cl::desc("Enable function specialization on the address of global values")); -// TODO: This needs checking to see the impact on compile-times, which is why -// this is off by default for now. +// Disabled by default as it can significantly increase compilation times. +// Running nikic's compile time tracker on x86 with instruction count as the +// metric shows 3-4% regression for SPASS while being neutral for all other +// benchmarks of the llvm test suite. +// +// https://llvm-compile-time-tracker.com +// https://github.com/nikic/llvm-compile-time-tracker static cl::opt EnableSpecializationForLiteralConstant( "function-specialization-for-literal-constant", cl::init(false), cl::Hidden, cl::desc("Enable specialization of functions that take a literal constant " @@ -108,24 +111,18 @@ static cl::opt EnableSpecializationForLiteralConstant( namespace { // Bookkeeping struct to pass data from the analysis and profitability phase // to the actual transform helper functions. -struct ArgInfo { - Function *Fn; // The function to perform specialisation on. - Argument *Arg; // The Formal argument being analysed. - Constant *Const; // A corresponding actual constant argument. - InstructionCost Gain; // Profitability: Gain = Bonus - Cost. - - // Flag if this will be a partial specialization, in which case we will need - // to keep the original function around in addition to the added - // specializations. - bool Partial = false; - - ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G) - : Fn(F), Arg(A), Const(C), Gain(G){}; +struct SpecializationInfo { + SmallVector Args; // Stores the {formal,actual} argument pairs. + InstructionCost Gain; // Profitability: Gain = Bonus - Cost. }; } // Anonymous namespace using FuncList = SmallVectorImpl; -using ConstList = SmallVectorImpl; +using CallArgBinding = std::pair; +using CallSpecBinding = std::pair; +// We are using MapVector because it guarantees deterministic iteration +// order across executions. +using SpecializationMap = SmallMapVector; // Helper to check if \p LV is either a constant or a constant // range with a single element. This should cover exactly the same cases as the @@ -204,41 +201,45 @@ static Constant *getConstantStackValue(CallInst *Call, Value *Val, // ret void // } // -static void constantArgPropagation(FuncList &WorkList, - Module &M, SCCPSolver &Solver) { +static void constantArgPropagation(FuncList &WorkList, Module &M, + SCCPSolver &Solver) { // Iterate over the argument tracked functions see if there // are any new constant values for the call instruction via // stack variables. 
for (auto *F : WorkList) { - // TODO: Generalize for any read only arguments. - if (F->arg_size() != 1) - continue; - - auto &Arg = *F->arg_begin(); - if (!Arg.onlyReadsMemory() || !Arg.getType()->isPointerTy()) - continue; for (auto *User : F->users()) { + auto *Call = dyn_cast(User); if (!Call) - break; - auto *ArgOp = Call->getArgOperand(0); - auto *ArgOpType = ArgOp->getType(); - auto *ConstVal = getConstantStackValue(Call, ArgOp, Solver); - if (!ConstVal) - break; + continue; - Value *GV = new GlobalVariable(M, ConstVal->getType(), true, - GlobalValue::InternalLinkage, ConstVal, - "funcspec.arg"); + bool Changed = false; + for (const Use &U : Call->args()) { + unsigned Idx = Call->getArgOperandNo(&U); + Value *ArgOp = Call->getArgOperand(Idx); + Type *ArgOpType = ArgOp->getType(); - if (ArgOpType != ConstVal->getType()) - GV = ConstantExpr::getBitCast(cast(GV), ArgOp->getType()); + if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy()) + continue; - Call->setArgOperand(0, GV); + auto *ConstVal = getConstantStackValue(Call, ArgOp, Solver); + if (!ConstVal) + continue; + + Value *GV = new GlobalVariable(M, ConstVal->getType(), true, + GlobalValue::InternalLinkage, ConstVal, + "funcspec.arg"); + if (ArgOpType != ConstVal->getType()) + GV = ConstantExpr::getBitCast(cast(GV), ArgOpType); + + Call->setArgOperand(Idx, GV); + Changed = true; + } // Add the changed CallInst to Solver Worklist - Solver.visitCall(*Call); + if (Changed) + Solver.visitCall(*Call); } } } @@ -275,7 +276,10 @@ class FunctionSpecializer { std::function GetTTI; std::function GetTLI; - SmallPtrSet SpecializedFuncs; + SmallPtrSet SpecializedFuncs; + SmallPtrSet FullySpecialized; + SmallVector ReplacedWithConstant; + DenseMap FunctionMetrics; public: FunctionSpecializer(SCCPSolver &Solver, @@ -284,42 +288,66 @@ public: std::function GetTLI) : Solver(Solver), GetAC(GetAC), GetTTI(GetTTI), GetTLI(GetTLI) {} + ~FunctionSpecializer() { + // Eliminate dead code. + removeDeadInstructions(); + removeDeadFunctions(); + } + /// Attempt to specialize functions in the module to enable constant /// propagation across function boundaries. /// /// \returns true if at least one function is specialized. 
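// A standalone sketch (plain C++, heavily simplified) of the generalization
// in constantArgPropagation above: instead of special-casing a single
// argument, every operand is examined, each read-only pointer argument with
// a solver-proven constant pointee is redirected to an interned constant,
// and the call is revisited only when something actually changed.
#include <optional>
#include <vector>

struct CallArg {
  bool ReadOnlyPointer = false;       // attribute on the call operand
  std::optional<int> ConstantPointee; // what the solver proved, if anything
  int Operand = 0;                    // stand-in for the operand value
};

inline bool propagateConstantStackArgs(std::vector<CallArg> &Args) {
  bool Changed = false;
  for (CallArg &A : Args) {
    if (!A.ReadOnlyPointer || !A.ConstantPointee)
      continue; // only provably constant, read-only pointees qualify
    A.Operand = *A.ConstantPointee; // stand-in for the new constant global
    Changed = true;
  }
  return Changed; // caller re-runs the solver on the call only when true
}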
- bool - specializeFunctions(FuncList &FuncDecls, - FuncList &CurrentSpecializations) { + bool specializeFunctions(FuncList &Candidates, FuncList &WorkList) { bool Changed = false; - for (auto *F : FuncDecls) { - if (!isCandidateFunction(F, CurrentSpecializations)) + for (auto *F : Candidates) { + if (!isCandidateFunction(F)) continue; auto Cost = getSpecializationCost(F); if (!Cost.isValid()) { LLVM_DEBUG( - dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + dbgs() << "FnSpecialization: Invalid specialization cost.\n"); continue; } - auto ConstArgs = calculateGains(F, Cost); - if (ConstArgs.empty()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); + LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for " + << F->getName() << " is " << Cost << "\n"); + + SmallVector Specializations; + if (!calculateGains(F, Cost, Specializations)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: No possible constants found\n"); continue; } - for (auto &CA : ConstArgs) { - specializeFunction(CA, CurrentSpecializations); - Changed = true; - } + Changed = true; + for (auto &Entry : Specializations) + specializeFunction(F, Entry.second, WorkList); } - updateSpecializedFuncs(FuncDecls, CurrentSpecializations); + updateSpecializedFuncs(Candidates, WorkList); NumFuncSpecialized += NbFunctionsSpecialized; return Changed; } + void removeDeadInstructions() { + for (auto *I : ReplacedWithConstant) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead instruction " << *I + << "\n"); + I->eraseFromParent(); + } + ReplacedWithConstant.clear(); + } + + void removeDeadFunctions() { + for (auto *F : FullySpecialized) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function " + << F->getName() << "\n"); + F->eraseFromParent(); + } + FullySpecialized.clear(); + } + bool tryToReplaceWithConstant(Value *V) { if (!V->getType()->isSingleValueType() || isa(V) || V->user_empty()) @@ -330,17 +358,26 @@ public: return false; auto *Const = isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType()); - V->replaceAllUsesWith(Const); - for (auto *U : Const->users()) + LLVM_DEBUG(dbgs() << "FnSpecialization: Replacing " << *V + << "\nFnSpecialization: with " << *Const << "\n"); + + // Record uses of V to avoid visiting irrelevant uses of const later. + SmallVector UseInsts; + for (auto *U : V->users()) if (auto *I = dyn_cast(U)) if (Solver.isBlockExecutable(I->getParent())) - Solver.visit(I); + UseInsts.push_back(I); + + V->replaceAllUsesWith(Const); + + for (auto *I : UseInsts) + Solver.visit(I); // Remove the instruction from Block and Solver. if (auto *I = dyn_cast(V)) { if (I->isSafeToRemove()) { - I->eraseFromParent(); + ReplacedWithConstant.push_back(I); Solver.removeLatticeValueFor(I); } } @@ -352,92 +389,108 @@ private: // also in the cost model. unsigned NbFunctionsSpecialized = 0; + // Compute the code metrics for function \p F. + CodeMetrics &analyzeFunction(Function *F) { + auto I = FunctionMetrics.insert({F, CodeMetrics()}); + CodeMetrics &Metrics = I.first->second; + if (I.second) { + // The code metrics were not cached. 
+ SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(F, &(GetAC)(*F), EphValues); + for (BasicBlock &BB : *F) + Metrics.analyzeBasicBlock(&BB, (GetTTI)(*F), EphValues); + + LLVM_DEBUG(dbgs() << "FnSpecialization: Code size of function " + << F->getName() << " is " << Metrics.NumInsts + << " instructions\n"); + } + return Metrics; + } + /// Clone the function \p F and remove the ssa_copy intrinsics added by /// the SCCPSolver in the cloned version. - Function *cloneCandidateFunction(Function *F) { - ValueToValueMapTy EmptyMap; - Function *Clone = CloneFunction(F, EmptyMap); + Function *cloneCandidateFunction(Function *F, ValueToValueMapTy &Mappings) { + Function *Clone = CloneFunction(F, Mappings); removeSSACopy(*Clone); return Clone; } - /// This function decides whether it's worthwhile to specialize function \p F - /// based on the known constant values its arguments can take on, i.e. it - /// calculates a gain and returns a list of actual arguments that are deemed - /// profitable to specialize. Specialization is performed on the first - /// interesting argument. Specializations based on additional arguments will - /// be evaluated on following iterations of the main IPSCCP solve loop. - SmallVector calculateGains(Function *F, InstructionCost Cost) { - SmallVector Worklist; + /// This function decides whether it's worthwhile to specialize function + /// \p F based on the known constant values its arguments can take on. It + /// only discovers potential specialization opportunities without actually + /// applying them. + /// + /// \returns true if any specializations have been found. + bool calculateGains(Function *F, InstructionCost Cost, + SmallVectorImpl &WorkList) { + SpecializationMap Specializations; // Determine if we should specialize the function based on the values the // argument can take on. If specialization is not profitable, we continue // on to the next argument. for (Argument &FormalArg : F->args()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " - << FormalArg.getName() << "\n"); // Determine if this argument is interesting. If we know the argument can - // take on any constant values, they are collected in Constants. If the - // argument can only ever equal a constant value in Constants, the - // function will be completely specialized, and the IsPartial flag will - // be set to false by isArgumentInteresting (that function only adds - // values to the Constants list that are deemed profitable). - bool IsPartial = true; - SmallVector ActualConstArg; - if (!isArgumentInteresting(&FormalArg, ActualConstArg, IsPartial)) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); + // take on any constant values, they are collected in Constants. + SmallVector ActualArgs; + if (!isArgumentInteresting(&FormalArg, ActualArgs)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Argument " + << FormalArg.getNameOrAsOperand() + << " is not interesting\n"); continue; } - for (auto *ActualArg : ActualConstArg) { - InstructionCost Gain = - ForceFunctionSpecialization - ? 1 - : getSpecializationBonus(&FormalArg, ActualArg) - Cost; - - if (Gain <= 0) - continue; - Worklist.push_back({F, &FormalArg, ActualArg, Gain}); - } + for (const auto &Entry : ActualArgs) { + CallBase *Call = Entry.first; + Constant *ActualArg = Entry.second; - if (Worklist.empty()) - continue; + auto I = Specializations.insert({Call, SpecializationInfo()}); + SpecializationInfo &S = I.first->second; - // Sort the candidates in descending order. 
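// A standalone sketch (plain C++) of the per-call-site gain bookkeeping in
// calculateGains above: candidates are grouped by call site, the one-time
// clone cost is charged when an entry is first created, and each
// (formal, actual) constant pair then adds its bonus. The patch keeps these
// in a MapVector so iteration order stays deterministic; a plain std::map
// is used here only to keep the sketch self-contained.
#include <map>
#include <utility>
#include <vector>

using CallSite = int;                // stand-in for CallBase *
using ArgPair = std::pair<int, int>; // {formal index, constant id}

struct SpecializationInfo {
  std::vector<ArgPair> Args;
  long Gain = 0;
};

inline void addCandidate(std::map<CallSite, SpecializationInfo> &Specs,
                         CallSite CS, ArgPair Arg, long Bonus, long Cost) {
  auto [It, Inserted] = Specs.try_emplace(CS);
  if (Inserted)
    It->second.Gain = -Cost; // one-time cost per prospective clone
  It->second.Gain += Bonus;  // per-argument benefit
  It->second.Args.push_back(Arg);
}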
- llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) { - return L.Gain > R.Gain; - }); - - // Truncate the worklist to 'MaxClonesThreshold' candidates if - // necessary. - if (Worklist.size() > MaxClonesThreshold) { - LLVM_DEBUG(dbgs() << "FnSpecialization: number of candidates exceed " - << "the maximum number of clones threshold.\n" - << "Truncating worklist to " << MaxClonesThreshold - << " candidates.\n"); - Worklist.erase(Worklist.begin() + MaxClonesThreshold, - Worklist.end()); + if (I.second) + S.Gain = ForceFunctionSpecialization ? 1 : 0 - Cost; + if (!ForceFunctionSpecialization) + S.Gain += getSpecializationBonus(&FormalArg, ActualArg); + S.Args.push_back({&FormalArg, ActualArg}); } + } - if (IsPartial || Worklist.size() < ActualConstArg.size()) - for (auto &ActualArg : Worklist) - ActualArg.Partial = true; - - LLVM_DEBUG(dbgs() << "Sorted list of candidates by gain:\n"; - for (auto &C - : Worklist) { - dbgs() << "- Function = " << C.Fn->getName() << ", "; - dbgs() << "FormalArg = " << C.Arg->getName() << ", "; - dbgs() << "ActualArg = " << C.Const->getName() << ", "; - dbgs() << "Gain = " << C.Gain << "\n"; - }); - - // FIXME: Only one argument per function. - break; + // Remove unprofitable specializations. + Specializations.remove_if( + [](const auto &Entry) { return Entry.second.Gain <= 0; }); + + // Clear the MapVector and return the underlying vector. + WorkList = Specializations.takeVector(); + + // Sort the candidates in descending order. + llvm::stable_sort(WorkList, [](const auto &L, const auto &R) { + return L.second.Gain > R.second.Gain; + }); + + // Truncate the worklist to 'MaxClonesThreshold' candidates if necessary. + if (WorkList.size() > MaxClonesThreshold) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed " + << "the maximum number of clones threshold.\n" + << "FnSpecialization: Truncating worklist to " + << MaxClonesThreshold << " candidates.\n"); + WorkList.erase(WorkList.begin() + MaxClonesThreshold, WorkList.end()); } - return Worklist; + + LLVM_DEBUG(dbgs() << "FnSpecialization: Specializations for function " + << F->getName() << "\n"; + for (const auto &Entry + : WorkList) { + dbgs() << "FnSpecialization: Gain = " << Entry.second.Gain + << "\n"; + for (const ArgInfo &Arg : Entry.second.Args) + dbgs() << "FnSpecialization: FormalArg = " + << Arg.Formal->getNameOrAsOperand() + << ", ActualArg = " + << Arg.Actual->getNameOrAsOperand() << "\n"; + }); + + return !WorkList.empty(); } - bool isCandidateFunction(Function *F, FuncList &Specializations) { + bool isCandidateFunction(Function *F) { // Do not specialize the cloned function again. if (SpecializedFuncs.contains(F)) return false; @@ -461,44 +514,45 @@ private: return true; } - void specializeFunction(ArgInfo &AI, FuncList &Specializations) { - Function *Clone = cloneCandidateFunction(AI.Fn); - Argument *ClonedArg = Clone->getArg(AI.Arg->getArgNo()); + void specializeFunction(Function *F, SpecializationInfo &S, + FuncList &WorkList) { + ValueToValueMapTy Mappings; + Function *Clone = cloneCandidateFunction(F, Mappings); // Rewrite calls to the function so that they call the clone instead. - rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Const); + rewriteCallSites(Clone, S.Args, Mappings); // Initialize the lattice state of the arguments of the function clone, // marking the argument on which we specialized the function constant // with the given value. 
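// A standalone sketch (plain C++) of the ranking step above: order the
// surviving candidates by descending gain with a stable sort, so that ties
// keep their discovery order, then cap the number of clones at a threshold.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Candidate { long Gain = 0; };

inline void rankAndTruncate(std::vector<Candidate> &WorkList,
                            std::size_t MaxClones) {
  std::stable_sort(WorkList.begin(), WorkList.end(),
                   [](const Candidate &L, const Candidate &R) {
                     return L.Gain > R.Gain;
                   });
  if (WorkList.size() > MaxClones)
    WorkList.resize(MaxClones); // drop the least profitable candidates
}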
- Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Const); + Solver.markArgInFuncSpecialization(Clone, S.Args); // Mark all the specialized functions - Specializations.push_back(Clone); + WorkList.push_back(Clone); NbFunctionsSpecialized++; // If the function has been completely specialized, the original function // is no longer needed. Mark it unreachable. - if (!AI.Partial) - Solver.markFunctionUnreachable(AI.Fn); + if (F->getNumUses() == 0 || all_of(F->users(), [F](User *U) { + if (auto *CS = dyn_cast(U)) + return CS->getFunction() == F; + return false; + })) { + Solver.markFunctionUnreachable(F); + FullySpecialized.insert(F); + } } /// Compute and return the cost of specializing function \p F. InstructionCost getSpecializationCost(Function *F) { - // Compute the code metrics for the function. - SmallPtrSet EphValues; - CodeMetrics::collectEphemeralValues(F, &(GetAC)(*F), EphValues); - CodeMetrics Metrics; - for (BasicBlock &BB : *F) - Metrics.analyzeBasicBlock(&BB, (GetTTI)(*F), EphValues); - + CodeMetrics &Metrics = analyzeFunction(F); // If the code metrics reveal that we shouldn't duplicate the function, we // shouldn't specialize it. Set the specialization cost to Invalid. // Or if the lines of codes implies that this function is easy to get // inlined so that we shouldn't specialize it. - if (Metrics.notDuplicatable || + if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() || (!ForceFunctionSpecialization && - Metrics.NumInsts < SmallFunctionThreshold)) { + *Metrics.NumInsts.getValue() < SmallFunctionThreshold)) { InstructionCost C{}; C.setInvalid(); return C; @@ -539,31 +593,20 @@ private: DominatorTree DT(*F); LoopInfo LI(DT); auto &TTI = (GetTTI)(*F); - LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for: " << *A - << "\n"); + LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: " + << C->getNameOrAsOperand() << "\n"); InstructionCost TotalCost = 0; for (auto *U : A->users()) { TotalCost += getUserBonus(U, TTI, LI); - LLVM_DEBUG(dbgs() << "FnSpecialization: User cost "; + LLVM_DEBUG(dbgs() << "FnSpecialization: User cost "; TotalCost.print(dbgs()); dbgs() << " for: " << *U << "\n"); } // The below heuristic is only concerned with exposing inlining // opportunities via indirect call promotion. If the argument is not a - // function pointer, give up. - if (!isa(A->getType()) || - !isa(A->getType()->getPointerElementType())) - return TotalCost; - - // Since the argument is a function pointer, its incoming constant values - // should be functions or constant expressions. The code below attempts to - // look through cast expressions to find the function that will be called. - Value *CalledValue = C; - while (isa(CalledValue) && - cast(CalledValue)->isCast()) - CalledValue = cast(CalledValue)->getOperand(0); - Function *CalledFunction = dyn_cast(CalledValue); + // (potentially casted) function pointer, give up. + Function *CalledFunction = dyn_cast(C->stripPointerCasts()); if (!CalledFunction) return TotalCost; @@ -603,6 +646,9 @@ private: Bonus += Params.DefaultThreshold; else if (IC.isVariable() && IC.getCostDelta() > 0) Bonus += IC.getCostDelta(); + + LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << Bonus + << " for user " << *U << "\n"); } return TotalCost + Bonus; @@ -615,15 +661,12 @@ private: /// specializing the function based on the incoming values of argument \p A /// would result in any significant optimization opportunities. 
If /// optimization opportunities exist, the constant values of \p A on which to - /// specialize the function are collected in \p Constants. If the values in - /// \p Constants represent the complete set of values that \p A can take on, - /// the function will be completely specialized, and the \p IsPartial flag is - /// set to false. + /// specialize the function are collected in \p Constants. /// /// \returns true if the function should be specialized on the given /// argument. - bool isArgumentInteresting(Argument *A, ConstList &Constants, - bool &IsPartial) { + bool isArgumentInteresting(Argument *A, + SmallVectorImpl &Constants) { // For now, don't attempt to specialize functions based on the values of // composite types. if (!A->getType()->isSingleValueType() || A->user_empty()) @@ -632,8 +675,9 @@ private: // If the argument isn't overdefined, there's nothing to do. It should // already be constant. if (!Solver.getLatticeValueFor(A).isOverdefined()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: nothing to do, arg is already " - << "constant?\n"); + LLVM_DEBUG(dbgs() << "FnSpecialization: Nothing to do, argument " + << A->getNameOrAsOperand() + << " is already constant?\n"); return false; } @@ -650,20 +694,26 @@ private: // // TODO 2: this currently does not support constants, i.e. integer ranges. // - IsPartial = !getPossibleConstants(A, Constants); - LLVM_DEBUG(dbgs() << "FnSpecialization: interesting arg: " << *A << "\n"); + getPossibleConstants(A, Constants); + + if (Constants.empty()) + return false; + + LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument " + << A->getNameOrAsOperand() << "\n"); return true; } /// Collect in \p Constants all the constant values that argument \p A can /// take on. - /// - /// \returns true if all of the values the argument can take on are constant - /// (e.g., the argument's parent function cannot be called with an - /// overdefined value). - bool getPossibleConstants(Argument *A, ConstList &Constants) { + void getPossibleConstants(Argument *A, + SmallVectorImpl &Constants) { Function *F = A->getParent(); - bool AllConstant = true; + + // SCCP solver does not record an argument that will be constructed on + // stack. + if (A->hasByValAttr() && !F->onlyReadsMemory()) + return; // Iterate over all the call sites of the argument's parent function. for (User *U : F->users()) { @@ -672,10 +722,8 @@ private: auto &CS = *cast(U); // If the call site has attribute minsize set, that callsite won't be // specialized. - if (CS.hasFnAttr(Attribute::MinSize)) { - AllConstant = false; + if (CS.hasFnAttr(Attribute::MinSize)) continue; - } // If the parent of the call site will never be executed, we don't need // to worry about the passed value. @@ -684,13 +732,7 @@ private: auto *V = CS.getArgOperand(A->getArgNo()); if (isa(V)) - return false; - - // For now, constant expressions are fine but only if they are function - // calls. - if (auto *CE = dyn_cast(V)) - if (!isa(CE->getOperand(0))) - return false; + return; // TrackValueOfGlobalVariable only tracks scalar global variables. if (auto *GV = dyn_cast(V)) { @@ -698,36 +740,32 @@ private: // global values. 
if (!GV->isConstant()) if (!SpecializeOnAddresses) - return false; + return; if (!GV->getValueType()->isSingleValueType()) - return false; + return; } if (isa(V) && (Solver.getLatticeValueFor(V).isConstant() || EnableSpecializationForLiteralConstant)) - Constants.push_back(cast(V)); - else - AllConstant = false; + Constants.push_back({&CS, cast(V)}); } - - // If the argument can only take on constant values, AllConstant will be - // true. - return AllConstant; } /// Rewrite calls to function \p F to call function \p Clone instead. /// - /// This function modifies calls to function \p F whose argument at index \p - /// ArgNo is equal to constant \p C. The calls are rewritten to call function - /// \p Clone instead. + /// This function modifies calls to function \p F as long as the actual + /// arguments match those in \p Args. Note that for recursive calls we + /// need to compare against the cloned formal arguments. /// /// Callsites that have been marked with the MinSize function attribute won't /// be specialized and rewritten. - void rewriteCallSites(Function *F, Function *Clone, Argument &Arg, - Constant *C) { - unsigned ArgNo = Arg.getArgNo(); - SmallVector CallSitesToRewrite; + void rewriteCallSites(Function *Clone, const SmallVectorImpl &Args, + ValueToValueMapTy &Mappings) { + assert(!Args.empty() && "Specialization without arguments"); + Function *F = Args[0].Formal->getParent(); + + SmallVector CallSitesToRewrite; for (auto *U : F->users()) { if (!isa(U) && !isa(U)) continue; @@ -736,35 +774,50 @@ private: continue; CallSitesToRewrite.push_back(&CS); } + + LLVM_DEBUG(dbgs() << "FnSpecialization: Replacing call sites of " + << F->getName() << " with " << Clone->getName() << "\n"); + for (auto *CS : CallSitesToRewrite) { - if ((CS->getFunction() == Clone && CS->getArgOperand(ArgNo) == &Arg) || - CS->getArgOperand(ArgNo) == C) { + LLVM_DEBUG(dbgs() << "FnSpecialization: " + << CS->getFunction()->getName() << " ->" << *CS + << "\n"); + if (/* recursive call */ + (CS->getFunction() == Clone && + all_of(Args, + [CS, &Mappings](const ArgInfo &Arg) { + unsigned ArgNo = Arg.Formal->getArgNo(); + return CS->getArgOperand(ArgNo) == Mappings[Arg.Formal]; + })) || + /* normal call */ + all_of(Args, [CS](const ArgInfo &Arg) { + unsigned ArgNo = Arg.Formal->getArgNo(); + return CS->getArgOperand(ArgNo) == Arg.Actual; + })) { CS->setCalledFunction(Clone); Solver.markOverdefined(CS); } } } - void updateSpecializedFuncs(FuncList &FuncDecls, - FuncList &CurrentSpecializations) { - for (auto *SpecializedFunc : CurrentSpecializations) { - SpecializedFuncs.insert(SpecializedFunc); + void updateSpecializedFuncs(FuncList &Candidates, FuncList &WorkList) { + for (auto *F : WorkList) { + SpecializedFuncs.insert(F); // Initialize the state of the newly created functions, marking them // argument-tracked and executable. - if (SpecializedFunc->hasExactDefinition() && - !SpecializedFunc->hasFnAttribute(Attribute::Naked)) - Solver.addTrackedFunction(SpecializedFunc); + if (F->hasExactDefinition() && !F->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(F); - Solver.addArgumentTrackedFunction(SpecializedFunc); - FuncDecls.push_back(SpecializedFunc); - Solver.markBlockExecutable(&SpecializedFunc->front()); + Solver.addArgumentTrackedFunction(F); + Candidates.push_back(F); + Solver.markBlockExecutable(&F->front()); // Replace the function arguments for the specialized functions. 
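// A standalone sketch (plain C++) of the call-site filter in rewriteCallSites
// above: a call is redirected to the clone only when every specialized
// argument position carries exactly the constant the clone was built for;
// unmatched calls keep targeting the original function.
#include <algorithm>
#include <cstddef>
#include <vector>

struct ArgSpec { std::size_t ArgNo; int Expected; };
struct Call { std::vector<int> Operands; int Target = 0; };

inline void rewriteMatchingCalls(std::vector<Call> &Calls,
                                 const std::vector<ArgSpec> &Specs,
                                 int CloneId) {
  for (Call &C : Calls) {
    bool AllMatch =
        std::all_of(Specs.begin(), Specs.end(), [&](const ArgSpec &S) {
          return C.Operands[S.ArgNo] == S.Expected;
        });
    if (AllMatch)
      C.Target = CloneId;
  }
}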
- for (Argument &Arg : SpecializedFunc->args()) + for (Argument &Arg : F->args()) if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " - << Arg.getName() << "\n"); + << Arg.getNameOrAsOperand() << "\n"); } } }; @@ -871,22 +924,26 @@ bool llvm::runFunctionSpecialization( // Initially resolve the constants in all the argument tracked functions. RunSCCPSolver(FuncDecls); - SmallVector CurrentSpecializations; + SmallVector WorkList; unsigned I = 0; while (FuncSpecializationMaxIters != I++ && - FS.specializeFunctions(FuncDecls, CurrentSpecializations)) { + FS.specializeFunctions(FuncDecls, WorkList)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Finished iteration " << I << "\n"); // Run the solver for the specialized functions. - RunSCCPSolver(CurrentSpecializations); + RunSCCPSolver(WorkList); // Replace some unresolved constant arguments. constantArgPropagation(FuncDecls, M, Solver); - CurrentSpecializations.clear(); + WorkList.clear(); Changed = true; } - // Clean up the IR by removing ssa_copy intrinsics. + LLVM_DEBUG(dbgs() << "FnSpecialization: Number of specializations = " + << NumFuncSpecialized << "\n"); + + // Remove any ssa_copy intrinsics that may have been introduced. removeSSACopy(M); return Changed; } diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 5e5d2086adc2..f35827220bb6 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -21,7 +21,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -34,7 +33,7 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" static cl::opt - ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore, + ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::desc("Enable virtual function elimination")); STATISTIC(NumAliases , "Number of global aliases removed"); @@ -86,6 +85,9 @@ ModulePass *llvm::createGlobalDCEPass() { /// Returns true if F is effectively empty. static bool isEmptyFunction(Function *F) { + // Skip external functions. + if (F->isDeclaration()) + return false; BasicBlock &Entry = F->getEntryBlock(); for (auto &I : Entry) { if (I.isDebugOrPseudoInst()) @@ -214,14 +216,14 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, if (!Ptr) { LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); VFESafeVTables.erase(VTable); - return; + continue; } auto Callee = dyn_cast(Ptr->stripPointerCasts()); if (!Callee) { LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n"); VFESafeVTables.erase(VTable); - return; + continue; } LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> " @@ -298,7 +300,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // marked as alive are discarded. // Remove empty functions from the global ctors list. - Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); + Changed |= optimizeGlobalCtorsList( + M, [](uint32_t, Function *F) { return isEmptyFunction(F); }); // Collect the set of members for each comdat. for (Function &F : M) @@ -317,7 +320,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Loop over the module, adding globals which are obviously necessary. 
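// Aside: a sketch of the behavioral fix in ScanVTableLoad above (return ->
// continue). One unanalyzable slot must pessimize only its own vtable, not
// abort the scan of every remaining candidate. Names here are illustrative
// stand-ins, not the pass's API.
#include <functional>
#include <set>
#include <string>
#include <vector>

static void scanVTables(const std::vector<std::string> &Candidates,
                        std::set<std::string> &VFESafeVTables,
                        const std::function<bool(const std::string &)> &SlotOk) {
  for (const std::string &VTable : Candidates) {
    if (!SlotOk(VTable)) {
      VFESafeVTables.erase(VTable); // this vtable is no longer VFE-safe ...
      continue;                     // ... but keep scanning the others
    }
    // otherwise: record the virtual-function dependency for VTable
  }
}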
for (GlobalObject &GO : M.global_objects()) { - Changed |= RemoveUnusedGlobalValue(GO); + GO.removeDeadConstantUsers(); // Functions with external linkage are needed if they have a body. // Externally visible & appending globals are needed, if they have an // initializer. @@ -330,7 +333,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Compute direct dependencies of aliases. for (GlobalAlias &GA : M.aliases()) { - Changed |= RemoveUnusedGlobalValue(GA); + GA.removeDeadConstantUsers(); // Externally visible aliases are needed. if (!GA.isDiscardableIfUnused()) MarkLive(GA); @@ -340,7 +343,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Compute direct dependencies of ifuncs. for (GlobalIFunc &GIF : M.ifuncs()) { - Changed |= RemoveUnusedGlobalValue(GIF); + GIF.removeDeadConstantUsers(); // Externally visible ifuncs are needed. if (!GIF.isDiscardableIfUnused()) MarkLive(GIF); @@ -403,7 +406,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Now that all interferences have been dropped, delete the actual objects // themselves. auto EraseUnusedGlobalValue = [&](GlobalValue *GV) { - RemoveUnusedGlobalValue(*GV); + GV->removeDeadConstantUsers(); GV->eraseFromParent(); Changed = true; }; @@ -455,16 +458,3 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { return PreservedAnalyses::none(); return PreservedAnalyses::all(); } - -// RemoveUnusedGlobalValue - Loop over all of the uses of the specified -// GlobalValue, looking for the constant pointer ref that may be pointing to it. -// If found, check to see if the constant pointer ref is safe to destroy, and if -// so, nuke it. This will reduce the reference count on the global value, which -// might make it deader. 
-// -bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) { - if (GV.use_empty()) - return false; - GV.removeDeadConstantUsers(); - return GV.use_empty(); -} diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 1cb32e32c895..1a1bde4f0668 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" @@ -37,7 +38,6 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" @@ -60,7 +60,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" @@ -100,7 +99,7 @@ static cl::opt cl::init(false), cl::Hidden); static cl::opt ColdCCRelFreq( - "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, + "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::desc( "Maximum block frequency, expressed as a percentage of caller's " "entry frequency, for a call site to be considered cold for enabling" @@ -232,7 +231,7 @@ CleanupPointerRootUsers(GlobalVariable *GV, if (MemSrc && MemSrc->isConstant()) { Changed = true; MTI->eraseFromParent(); - } else if (Instruction *I = dyn_cast(MemSrc)) { + } else if (Instruction *I = dyn_cast(MTI->getSource())) { if (I->hasOneUse()) Dead.push_back(std::make_pair(I, MTI)); } @@ -405,9 +404,37 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, for (auto *GVE : GVs) { DIVariable *Var = GVE->getVariable(); DIExpression *Expr = GVE->getExpression(); + int64_t CurVarOffsetInBytes = 0; + uint64_t CurVarOffsetInBits = 0; + + // Calculate the offset (Bytes), Continue if unknown. + if (!Expr->extractIfOffset(CurVarOffsetInBytes)) + continue; + + // Ignore negative offset. + if (CurVarOffsetInBytes < 0) + continue; + + // Convert offset to bits. + CurVarOffsetInBits = CHAR_BIT * (uint64_t)CurVarOffsetInBytes; + + // Current var starts after the fragment, ignore. + if (CurVarOffsetInBits >= (FragmentOffsetInBits + FragmentSizeInBits)) + continue; + + uint64_t CurVarSize = Var->getType()->getSizeInBits(); + // Current variable ends before start of fragment, ignore. + if (CurVarSize != 0 && + (CurVarOffsetInBits + CurVarSize) <= FragmentOffsetInBits) + continue; + + // Current variable fits in the fragment. + if (CurVarOffsetInBits == FragmentOffsetInBits && + CurVarSize == FragmentSizeInBits) + Expr = DIExpression::get(Expr->getContext(), {}); // If the FragmentSize is smaller than the variable, // emit a fragment expression. - if (FragmentSizeInBits < VarSize) { + else if (FragmentSizeInBits < VarSize) { if (auto E = DIExpression::createFragmentExpression( Expr, FragmentOffsetInBits, FragmentSizeInBits)) Expr = *E; @@ -581,17 +608,14 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, // Will trap. } else if (const StoreInst *SI = dyn_cast(U)) { if (SI->getOperand(0) == V) { - //cerr << "NONTRAPPING USE: " << *U; return false; // Storing the value. 
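// Aside: the interval arithmetic behind the new checks in transferSRADebugInfo
// above, as a self-contained sketch. A variable at bit offset VarOff with size
// VarSize (0 meaning unknown) matters to a fragment
// [FragOff, FragOff + FragSize) only if the two bit ranges intersect, and an
// exact fit drops the fragment expression entirely.
#include <cstdint>

enum class FragmentUse { Ignore, ExactFit, PartialOverlap };

static FragmentUse classify(uint64_t VarOff, uint64_t VarSize,
                            uint64_t FragOff, uint64_t FragSize) {
  if (VarOff >= FragOff + FragSize)
    return FragmentUse::Ignore;   // variable starts after the fragment
  if (VarSize != 0 && VarOff + VarSize <= FragOff)
    return FragmentUse::Ignore;   // variable ends before the fragment
  if (VarOff == FragOff && VarSize == FragSize)
    return FragmentUse::ExactFit; // use an empty expression
  return FragmentUse::PartialOverlap; // emit a fragment expression
}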
 }
 } else if (const CallInst *CI = dyn_cast(U)) {
 if (CI->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
 return false; // Not calling the ptr
 }
 } else if (const InvokeInst *II = dyn_cast(U)) {
 if (II->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
 return false; // Not calling the ptr
 }
 } else if (const BitCastInst *CI = dyn_cast(U)) {
@@ -615,7 +639,6 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
 // the comparing of the value of the created global init bool later in
 // optimizeGlobalAddressOfAllocation for the global variable.
 } else {
- //cerr << "NONTRAPPING USE: " << *U;
 return false;
 }
 }
@@ -878,7 +901,7 @@ OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI,
 }
 }

- SmallPtrSet RepValues;
+ SmallSetVector RepValues;
 RepValues.insert(NewGV);

 // If there is a comparison against null, we will insert a global bool to
@@ -1015,7 +1038,6 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI,
 /// accessing the data, and exposes the resultant global to further GlobalOpt.
 static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV,
 CallInst *CI,
- AtomicOrdering Ordering,
 const DataLayout &DL,
 TargetLibraryInfo *TLI) {
 if (!isAllocRemovable(CI, TLI))
@@ -1062,7 +1084,7 @@ static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV,
 // its initializer) is ever stored to the global.
 static bool
 optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
- AtomicOrdering Ordering, const DataLayout &DL,
+ const DataLayout &DL,
 function_ref GetTLI) {
 // Ignore no-op GEPs and bitcasts.
 StoredOnceVal = StoredOnceVal->stripPointerCasts();
@@ -1087,7 +1109,7 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
 } else if (isAllocationFn(StoredOnceVal, GetTLI)) {
 if (auto *CI = dyn_cast(StoredOnceVal)) {
 auto *TLI = &GetTLI(*CI->getFunction());
- if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, Ordering, DL, TLI))
+ if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, DL, TLI))
 return true;
 }
 }
@@ -1257,8 +1279,10 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
 return true;
 }

-static bool deleteIfDead(
- GlobalValue &GV, SmallPtrSetImpl &NotDiscardableComdats) {
+static bool
+deleteIfDead(GlobalValue &GV,
+ SmallPtrSetImpl &NotDiscardableComdats,
+ function_ref DeleteFnCallback = nullptr) {
 GV.removeDeadConstantUsers();

 if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
@@ -1277,6 +1301,10 @@ static bool deleteIfDead(
 return false;

 LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ if (auto *F = dyn_cast(&GV)) {
+ if (DeleteFnCallback)
+ DeleteFnCallback(*F);
+ }
 GV.eraseFromParent();
 ++NumDeleted;
 return true;
@@ -1416,6 +1444,42 @@ static void makeAllConstantUsesInstructions(Constant *C) {
 }
 }

+// For a global variable with one store, if the store dominates any loads,
+// those loads will always load the stored value (as opposed to the
+// initializer), even in the presence of recursion.
+static bool forwardStoredOnceStore(
+ GlobalVariable *GV, const StoreInst *StoredOnceStore,
+ function_ref LookupDomTree) {
+ const Value *StoredOnceValue = StoredOnceStore->getValueOperand();
+ // We can do this optimization for non-constants in nosync + norecurse
+ // functions, but globals used in exactly one norecurse function are already
+ // promoted to an alloca.
+ if (!isa(StoredOnceValue)) + return false; + const Function *F = StoredOnceStore->getFunction(); + SmallVector Loads; + for (User *U : GV->users()) { + if (auto *LI = dyn_cast(U)) { + if (LI->getFunction() == F && + LI->getType() == StoredOnceValue->getType() && LI->isSimple()) + Loads.push_back(LI); + } + } + // Only compute DT if we have any loads to examine. + bool MadeChange = false; + if (!Loads.empty()) { + auto &DT = LookupDomTree(*const_cast(F)); + for (auto *LI : Loads) { + if (DT.dominates(StoredOnceStore, LI)) { + LI->replaceAllUsesWith(const_cast(StoredOnceValue)); + LI->eraseFromParent(); + MadeChange = true; + } + } + } + return MadeChange; +} + /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. static bool @@ -1572,9 +1636,15 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, StoredOnceValue, GS.Ordering, DL, GetTLI)) + if (optimizeOnceStoredGlobal(GV, StoredOnceValue, DL, GetTLI)) return true; + // Try to forward the store to any loads. If we have more than one store, we + // may have a store of the initializer between StoredOnceStore and a load. + if (GS.NumStores == 1) + if (forwardStoredOnceStore(GV, GS.StoredOnceStore, LookupDomTree)) + return true; + // Otherwise, if the global was not a boolean, we can shrink it to be a // boolean. Skip this optimization for AS that doesn't allow an initializer. if (SOVConstant && GS.Ordering == AtomicOrdering::NotAtomic && @@ -1755,7 +1825,7 @@ hasOnlyColdCalls(Function &F, return false; if (!CalledFn->hasLocalLinkage()) return false; - // Skip over instrinsics since they won't remain as function calls. + // Skip over intrinsics since they won't remain as function calls. if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic) continue; // Check if it's valid to use coldcc calling convention. @@ -1884,7 +1954,9 @@ OptimizeFunctions(Module &M, function_ref GetTTI, function_ref GetBFI, function_ref LookupDomTree, - SmallPtrSetImpl &NotDiscardableComdats) { + SmallPtrSetImpl &NotDiscardableComdats, + function_ref ChangedCFGCallback, + function_ref DeleteFnCallback) { bool Changed = false; @@ -1904,7 +1976,7 @@ OptimizeFunctions(Module &M, if (!F.hasName() && !F.isDeclaration() && !F.hasLocalLinkage()) F.setLinkage(GlobalValue::InternalLinkage); - if (deleteIfDead(F, NotDiscardableComdats)) { + if (deleteIfDead(F, NotDiscardableComdats, DeleteFnCallback)) { Changed = true; continue; } @@ -1917,13 +1989,11 @@ OptimizeFunctions(Module &M, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. - // Removing unreachable blocks might invalidate the dominator so we - // recalculate it. + // Notify the analysis manager that we've modified the function's CFG. if (!F.isDeclaration()) { if (removeUnreachableBlocks(F)) { - auto &DT = LookupDomTree(F); - DT.recalculate(F); Changed = true; + ChangedCFGCallback(F); } } @@ -2031,6 +2101,9 @@ OptimizeGlobalVars(Module &M, /// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, TargetLibraryInfo *TLI) { + // Skip external functions. + if (F->isDeclaration()) + return false; // Call the function. 
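// Aside: the source-level situation forwardStoredOnceStore (above) handles,
// as a tiny self-contained example (hypothetical code, not from the patch):
int *G;        // a global with exactly one store, in init() below
void use(int);

void init(int *P) {
  G = P;       // the single store to G
  use(*G);     // this load is dominated by the store, so it can only observe
               // P (never G's initializer, even under recursion) and may be
               // rewritten to use(*P), deleting the load
}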
Evaluator Eval(DL, TLI); Constant *RetValDummy; @@ -2383,15 +2456,19 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { return Changed; } -static bool optimizeGlobalsInModule( - Module &M, const DataLayout &DL, - function_ref GetTLI, - function_ref GetTTI, - function_ref GetBFI, - function_ref LookupDomTree) { +static bool +optimizeGlobalsInModule(Module &M, const DataLayout &DL, + function_ref GetTLI, + function_ref GetTTI, + function_ref GetBFI, + function_ref LookupDomTree, + function_ref ChangedCFGCallback, + function_ref DeleteFnCallback) { SmallPtrSet NotDiscardableComdats; bool Changed = false; bool LocalChange = true; + Optional FirstNotFullyEvaluatedPriority; + while (LocalChange) { LocalChange = false; @@ -2411,12 +2488,20 @@ static bool optimizeGlobalsInModule( // Delete functions that are trivially dead, ccc -> fastcc LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree, - NotDiscardableComdats); + NotDiscardableComdats, ChangedCFGCallback, + DeleteFnCallback); // Optimize global_ctors list. - LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { - return EvaluateStaticConstructor(F, DL, &GetTLI(*F)); - }); + LocalChange |= + optimizeGlobalCtorsList(M, [&](uint32_t Priority, Function *F) { + if (FirstNotFullyEvaluatedPriority && + *FirstNotFullyEvaluatedPriority != Priority) + return false; + bool Evaluated = EvaluateStaticConstructor(F, DL, &GetTLI(*F)); + if (!Evaluated) + FirstNotFullyEvaluatedPriority = Priority; + return Evaluated; + }); // Optimize non-address-taken globals. LocalChange |= OptimizeGlobalVars(M, GetTTI, GetTLI, LookupDomTree, @@ -2457,10 +2542,23 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { return FAM.getResult(F); }; + auto ChangedCFGCallback = [&FAM](Function &F) { + FAM.invalidate(F, PreservedAnalyses::none()); + }; + auto DeleteFnCallback = [&FAM](Function &F) { FAM.clear(F, F.getName()); }; - if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree)) + if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree, + ChangedCFGCallback, DeleteFnCallback)) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + // We made sure to clear analyses for deleted functions. + PA.preserve(); + // The only place we modify the CFG is when calling + // removeUnreachableBlocks(), but there we make sure to invalidate analyses + // for modified functions. + PA.preserveSet(); + return PA; } namespace { @@ -2491,8 +2589,13 @@ struct GlobalOptLegacyPass : public ModulePass { return this->getAnalysis(F).getBFI(); }; - return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, - LookupDomTree); + auto ChangedCFGCallback = [&LookupDomTree](Function &F) { + auto &DT = LookupDomTree(F); + DT.recalculate(F); + }; + + return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree, + ChangedCFGCallback, nullptr); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp index e7d698c42fcf..7d9e6135b2eb 100644 --- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -134,9 +134,9 @@ static bool splitGlobal(GlobalVariable &GV) { } // Finally, remove the original global. Any remaining uses refer to invalid - // elements of the global, so replace with undef. 
+ // elements of the global, so replace with poison. if (!GV.use_empty()) - GV.replaceAllUsesWith(UndefValue::get(GV.getType())); + GV.replaceAllUsesWith(PoisonValue::get(GV.getType())); GV.eraseFromParent(); return true; } diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index a964fcde0396..95e8ae0fd22f 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -29,46 +29,33 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/BlockFrequency.h" -#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/ValueMapper.h" #include -#include #include +#include #include #define DEBUG_TYPE "hotcoldsplit" @@ -126,7 +113,8 @@ bool unlikelyExecuted(BasicBlock &BB) { // mark sanitizer traps as cold. for (Instruction &I : BB) if (auto *CB = dyn_cast(&I)) - if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize")) + if (CB->hasFnAttr(Attribute::Cold) && + !CB->getMetadata(LLVMContext::MD_nosanitize)) return true; // The block is cold if it has an unreachable terminator, unless it's @@ -352,7 +340,7 @@ Function *HotColdSplitting::extractColdRegion( // TODO: Pass BFI and BPI to update profile information. CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, /* BPI */ nullptr, AC, /* AllowVarArgs */ false, - /* AllowAlloca */ false, + /* AllowAlloca */ false, /* AllocaBlock */ nullptr, /* Suffix */ "cold." 
+ std::to_string(Count)); // Perform a simple cost/benefit analysis to decide whether or not to permit @@ -740,7 +728,7 @@ bool HotColdSplittingLegacyPass::runOnModule(Module &M) { std::function GetORE = [&ORE](Function &F) -> OptimizationRemarkEmitter & { ORE.reset(new OptimizationRemarkEmitter(&F)); - return *ORE.get(); + return *ORE; }; auto LookupAC = [this](Function &F) -> AssumptionCache * { if (auto *ACT = getAnalysisIfAvailable()) @@ -772,7 +760,7 @@ HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) { std::function GetORE = [&ORE](Function &F) -> OptimizationRemarkEmitter & { ORE.reset(new OptimizationRemarkEmitter(&F)); - return *ORE.get(); + return *ORE; }; ProfileSummaryInfo *PSI = &AM.getResult(M); diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp index de1c1d379502..ec2b80012ed6 100644 --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -24,7 +24,6 @@ using namespace llvm; void llvm::initializeIPO(PassRegistry &Registry) { initializeOpenMPOptCGSCCLegacyPassPass(Registry); - initializeArgPromotionPass(Registry); initializeAnnotation2MetadataLegacyPass(Registry); initializeCalledValuePropagationLegacyPassPass(Registry); initializeConstantMergeLegacyPassPass(Registry); @@ -70,10 +69,6 @@ void LLVMInitializeIPO(LLVMPassRegistryRef R) { initializeIPO(*unwrap(R)); } -void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createArgumentPromotionPass()); -} - void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCalledValuePropagationPass()); } diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index faf7cb7d566a..d75d99e307fd 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -16,8 +16,9 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/PassManager.h" @@ -25,8 +26,6 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" -#include -#include #include #define DEBUG_TYPE "iroutliner" @@ -183,11 +182,24 @@ static void getSortedConstantKeys(std::vector &SortedKeys, Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other, Value *V) { Optional GVN = Candidate->getGVN(V); - assert(GVN.hasValue() && "No GVN for incoming value"); + assert(GVN && "No GVN for incoming value"); Optional CanonNum = Candidate->getCanonicalNum(*GVN); Optional FirstGVN = Other.Candidate->fromCanonicalNum(*CanonNum); Optional FoundValueOpt = Other.Candidate->fromGVN(*FirstGVN); - return FoundValueOpt.getValueOr(nullptr); + return FoundValueOpt.value_or(nullptr); +} + +BasicBlock * +OutlinableRegion::findCorrespondingBlockIn(const OutlinableRegion &Other, + BasicBlock *BB) { + Instruction *FirstNonPHI = BB->getFirstNonPHI(); + assert(FirstNonPHI && "block is empty?"); + Value *CorrespondingVal = findCorrespondingValueIn(Other, FirstNonPHI); + if (!CorrespondingVal) + return nullptr; + BasicBlock *CorrespondingBlock = + cast(CorrespondingVal)->getParent(); + return CorrespondingBlock; } /// Rewrite the BranchInsts in the incoming blocks to \p PHIBlock that are found @@ -264,13 +276,33 @@ void OutlinableRegion::splitCandidate() { // We iterate 
over the instructions in the region, if we find a PHINode, we
 // check if there are predecessors outside of the region, if there are,
 // we ignore this region since we are unable to handle the severing of the
- // phi node right now.
+ // phi node right now.
+
+ // TODO: Handle extraneous inputs for PHINodes through variable number of
+ // inputs, similar to how outputs are handled.
 BasicBlock::iterator It = StartInst->getIterator();
+ EndBB = BackInst->getParent();
+ BasicBlock *IBlock;
+ BasicBlock *PHIPredBlock = nullptr;
+ bool EndBBTermAndBackInstDifferent = EndBB->getTerminator() != BackInst;
 while (PHINode *PN = dyn_cast(&*It)) {
 unsigned NumPredsOutsideRegion = 0;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!BBSet.contains(PN->getIncomingBlock(i)))
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (!BBSet.contains(PN->getIncomingBlock(i))) {
+ PHIPredBlock = PN->getIncomingBlock(i);
+ ++NumPredsOutsideRegion;
+ continue;
+ }
+
+ // We must consider the case where the incoming block to the PHINode is
+ // the same as the final block of the OutlinableRegion. If this is the
+ // case, the branch from this block must also be outlined to be valid.
+ IBlock = PN->getIncomingBlock(i);
+ if (IBlock == EndBB && EndBBTermAndBackInstDifferent) {
+ PHIPredBlock = PN->getIncomingBlock(i);
 ++NumPredsOutsideRegion;
+ }
+ }

 if (NumPredsOutsideRegion > 1)
 return;
@@ -285,11 +317,9 @@ void OutlinableRegion::splitCandidate() {
 // If the region ends with a PHINode, but does not contain all of the phi node
 // instructions of the region, we ignore it for now.
- if (isa(BackInst)) {
- EndBB = BackInst->getParent();
- if (BackInst != &*std::prev(EndBB->getFirstInsertionPt()))
- return;
- }
+ if (isa(BackInst) &&
+ BackInst != &*std::prev(EndBB->getFirstInsertionPt()))
+ return;

 // The basic block gets split like so:
 // block: block:
@@ -310,6 +340,10 @@ void OutlinableRegion::splitCandidate() {
 StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline");
 PrevBB->replaceSuccessorsPhiUsesWith(PrevBB, StartBB);
+ // If there was a PHINode with an incoming block outside the region,
+ // make sure it is correctly updated in the newly split block.
+ if (PHIPredBlock)
+ PrevBB->replaceSuccessorsPhiUsesWith(PHIPredBlock, PrevBB);

 CandidateSplit = true;
 if (!BackInst->isTerminator()) {
@@ -353,6 +387,25 @@ void OutlinableRegion::reattachCandidate() {
 assert(StartBB != nullptr && "StartBB for Candidate is not defined!");
 assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!");
+ // Make sure PHINode references to the block we are merging into are
+ // updated to be incoming blocks from the predecessor to the current block.
+
+ // NOTE: If this is updated such that the outlined block can have more than
+ // one incoming block to a PHINode, this logic will have to be updated
+ // to handle multiple predecessors instead.
+
+ // We only need to update this if the outlined section contains a PHINode, if
+ // it does not, then the incoming block was never changed in the first place.
+ // On the other hand, if PrevBB has no predecessors, it means that all
+ // incoming blocks to the first block are contained in the region, and there
+ // will be nothing to update.
+ Instruction *StartInst = (*Candidate->begin()).Inst;
+ if (isa(StartInst) && !PrevBB->hasNPredecessors(0)) {
+ assert(!PrevBB->hasNPredecessorsOrMore(2) &&
+ "PrevBB has more than one predecessor. Should be 0 or 1.");
+ BasicBlock *BeforePrevBB = PrevBB->getSinglePredecessor();
+ PrevBB->replaceSuccessorsPhiUsesWith(PrevBB, BeforePrevBB);
+ }

 PrevBB->getTerminator()->eraseFromParent();

 // If we are reattaching after outlining, we iterate over the phi nodes to
@@ -501,7 +554,7 @@ collectRegionsConstants(OutlinableRegion &Region,
 // the number has been found to be not the same value in each instance.
 for (Value *V : ID.OperVals) {
 Optional GVNOpt = C.getGVN(V);
- assert(GVNOpt.hasValue() && "Expected a GVN for operand?");
+ assert(GVNOpt && "Expected a GVN for operand?");
 unsigned GVN = GVNOpt.getValue();

 // Check if this global value has been found to not be the same already.
@@ -516,7 +569,7 @@ collectRegionsConstants(OutlinableRegion &Region,
 // global value number. If the global value does not map to a Constant,
 // it is considered to not be the same value.
 Optional ConstantMatches = constantMatches(V, GVN, GVNToConstant);
- if (ConstantMatches.hasValue()) {
+ if (ConstantMatches) {
 if (ConstantMatches.getValue())
 continue;
 else
@@ -597,7 +650,7 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
 "outlined_ir_func_" + std::to_string(FunctionNameSuffix), M);

 // Transfer the swifterr attribute to the correct function parameter.
- if (Group.SwiftErrorArgument.hasValue())
+ if (Group.SwiftErrorArgument)
 Group.OutlinedFunction->addParamAttr(Group.SwiftErrorArgument.getValue(),
 Attribute::SwiftError);
@@ -666,6 +719,18 @@ static void moveFunctionData(Function &Old, Function &New,
 if (!isa(&Val)) {
 // Remove the debug information for outlined functions.
 Val.setDebugLoc(DebugLoc());
+
+ // Loop info metadata may contain line locations. Update them to have no
+ // value in the new subprogram since the outlined code could be from
+ // several locations.
+ auto updateLoopInfoLoc = [&New](Metadata *MD) -> Metadata * {
+ if (DISubprogram *SP = New.getSubprogram())
+ if (auto *Loc = dyn_cast_or_null(MD))
+ return DILocation::get(New.getContext(), Loc->getLine(),
+ Loc->getColumn(), SP, nullptr);
+ return MD;
+ };
+ updateLoopMetadataDebugLocations(Val, updateLoopInfoLoc);
 continue;
 }
@@ -691,8 +756,6 @@ static void moveFunctionData(Function &Old, Function &New,
 for (Instruction *I : DebugInsts)
 I->eraseFromParent();
 }
-
- assert(NewEnds.size() > 0 && "No return instruction for new function?");
 }

 /// Find the constants that will need to be lifted into arguments
@@ -714,7 +777,7 @@ static void findConstants(IRSimilarityCandidate &C, DenseSet &NotSame,
 for (Value *V : (*IDIt).OperVals) {
 // Since these are stored before any outlining, they will be in the
 // global value numbering.
- unsigned GVN = C.getGVN(V).getValue();
+ unsigned GVN = *C.getGVN(V);
 if (isa(V))
 if (NotSame.contains(GVN) && !Seen.contains(GVN)) {
 Inputs.push_back(GVN);
@@ -745,8 +808,7 @@ static void mapInputsToGVNs(IRSimilarityCandidate &C,
 assert(Input && "Have a nullptr as an input");
 if (OutputMappings.find(Input) != OutputMappings.end())
 Input = OutputMappings.find(Input)->second;
- assert(C.getGVN(Input).hasValue() &&
- "Could not find a numbering for the given input");
+ assert(C.getGVN(Input) && "Could not find a numbering for the given input");
 EndInputNumbers.push_back(C.getGVN(Input).getValue());
 }
 }
@@ -885,11 +947,11 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
 // numbering overrides any discovered location for the extracted code.
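// Aside: a plain-C++ model of mapInputsToGVNs above (hypothetical stand-in
// types). Each input is first remapped through OutputMappings, because an
// output of an earlier extraction may now be flowing in as an input, and only
// then is its global value number looked up.
#include <cstdint>
#include <map>
#include <optional>

using ValueKey = std::uintptr_t; // stand-in for Value *

static std::optional<unsigned>
inputToGVN(ValueKey Input, const std::map<ValueKey, ValueKey> &OutputMappings,
           const std::map<ValueKey, unsigned> &GVNs) {
  if (auto It = OutputMappings.find(Input); It != OutputMappings.end())
    Input = It->second; // follow the output-to-original mapping first
  if (auto It = GVNs.find(Input); It != GVNs.end())
    return It->second;
  return std::nullopt;  // the pass asserts this cannot happen
}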
for (unsigned InputVal : InputGVNs) { Optional CanonicalNumberOpt = C.getCanonicalNum(InputVal); - assert(CanonicalNumberOpt.hasValue() && "Canonical number not found?"); + assert(CanonicalNumberOpt && "Canonical number not found?"); unsigned CanonicalNumber = CanonicalNumberOpt.getValue(); Optional InputOpt = C.fromGVN(InputVal); - assert(InputOpt.hasValue() && "Global value number not found?"); + assert(InputOpt && "Global value number not found?"); Value *Input = InputOpt.getValue(); DenseMap::iterator AggArgIt = @@ -901,7 +963,7 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, // argument in the overall function. if (Input->isSwiftError()) { assert( - !Group.SwiftErrorArgument.hasValue() && + !Group.SwiftErrorArgument && "Argument already marked with swifterr for this OutlinableGroup!"); Group.SwiftErrorArgument = TypeIndex; } @@ -969,12 +1031,11 @@ static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN, // We check to see if the value is used by the PHINode from some other // predecessor not included in the region. If it is, we make sure // to keep it as an output. - SmallVector IncomingNumbers(PN.getNumIncomingValues()); - std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0); - if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) { - return (Idx != PHILoc && V == PN.getIncomingValue(Idx) && - !BlocksInRegion.contains(PN.getIncomingBlock(Idx))); - })) + if (any_of(llvm::seq(0, PN.getNumIncomingValues()), + [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) { + return (Idx != PHILoc && V == PN.getIncomingValue(Idx) && + !BlocksInRegion.contains(PN.getIncomingBlock(Idx))); + })) return true; // Check if the value is used by any other instructions outside the region. @@ -1098,30 +1159,72 @@ static hash_code encodePHINodeData(PHINodeData &PND) { /// /// \param Region - The region that \p PN is an output for. /// \param PN - The PHINode we are analyzing. +/// \param Blocks - The blocks for the region we are analyzing. /// \param AggArgIdx - The argument \p PN will be stored into. /// \returns An optional holding the assigned canonical number, or None if /// there is some attribute of the PHINode blocking it from being used. static Optional getGVNForPHINode(OutlinableRegion &Region, - PHINode *PN, unsigned AggArgIdx) { + PHINode *PN, + DenseSet &Blocks, + unsigned AggArgIdx) { OutlinableGroup &Group = *Region.Parent; IRSimilarityCandidate &Cand = *Region.Candidate; BasicBlock *PHIBB = PN->getParent(); CanonList PHIGVNs; - for (Value *Incoming : PN->incoming_values()) { - // If we cannot find a GVN, this means that the input to the PHINode is - // not included in the region we are trying to analyze, meaning, that if - // it was outlined, we would be adding an extra input. We ignore this - // case for now, and so ignore the region. + Value *Incoming; + BasicBlock *IncomingBlock; + for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) { + Incoming = PN->getIncomingValue(Idx); + IncomingBlock = PN->getIncomingBlock(Idx); + // If we cannot find a GVN, and the incoming block is included in the region + // this means that the input to the PHINode is not included in the region we + // are trying to analyze, meaning, that if it was outlined, we would be + // adding an extra input. We ignore this case for now, and so ignore the + // region. 
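// Aside: a sketch of the split-block fallback implemented just below. When the
// incoming block is the candidate's (split) start block it has no numbering of
// its own, so the numbering of the first predecessor outside the region (the
// block it was split from) is used instead. Stand-in types, not the pass's API.
#include <set>
#include <vector>

template <typename Block>
static const Block *blockToNumber(const Block *Incoming,
                                  const std::set<const Block *> &RegionBlocks,
                                  const std::vector<const Block *> &Preds) {
  for (const Block *Pred : Preds)
    if (RegionBlocks.count(Pred) == 0)
      return Pred;   // the block Incoming was split from
  return Incoming;   // not split: Incoming already has a valid numbering
}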
Optional OGVN = Cand.getGVN(Incoming);
- if (!OGVN.hasValue()) {
+ if (!OGVN && Blocks.contains(IncomingBlock)) {
 Region.IgnoreRegion = true;
 return None;
 }
+
+ // If the incoming block isn't in the region, we don't have to worry about
+ // this incoming value.
+ if (!Blocks.contains(IncomingBlock))
+ continue;
+
 // Collect the canonical numbers of the values in the PHINode.
- unsigned GVN = OGVN.getValue();
+ unsigned GVN = *OGVN;
 OGVN = Cand.getCanonicalNum(GVN);
- assert(OGVN.hasValue() && "No GVN found for incoming value?");
+ assert(OGVN && "No GVN found for incoming value?");
+ PHIGVNs.push_back(*OGVN);
+
+ // Find the incoming block and use the canonical numbering as well to define
+ // the hash for the PHINode.
+ OGVN = Cand.getGVN(IncomingBlock);
+
+ // If there is no number for the incoming block, it is because we have
+ // split the candidate basic blocks. So we use the previous block that it
+ // was split from to find the valid global value numbering for the PHINode.
+ if (!OGVN) {
+ assert(Cand.getStartBB() == IncomingBlock &&
+ "Unknown basic block used in exit path PHINode.");
+
+ BasicBlock *PrevBlock = nullptr;
+ // Iterate over the predecessors to the incoming block of the
+ // PHINode; when we find a block that is not contained in the region,
+ // we know that this is the first block that we split from, and should
+ // have a valid global value numbering.
+ for (BasicBlock *Pred : predecessors(IncomingBlock))
+ if (!Blocks.contains(Pred)) {
+ PrevBlock = Pred;
+ break;
+ }
+ assert(PrevBlock && "Expected a predecessor not in the region!");
+ OGVN = Cand.getGVN(PrevBlock);
+ }
+ GVN = *OGVN;
+ OGVN = Cand.getCanonicalNum(GVN);
+ assert(OGVN && "No GVN found for incoming block?");
 PHIGVNs.push_back(*OGVN);
 }
@@ -1131,11 +1234,10 @@ static Optional getGVNForPHINode(OutlinableRegion &Region,
 DenseMap::iterator GVNToPHIIt;
 DenseMap::iterator PHIToGVNIt;
 Optional BBGVN = Cand.getGVN(PHIBB);
- assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+ assert(BBGVN && "Could not find GVN for the incoming block!");

 BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
- assert(BBGVN.hasValue() &&
- "Could not find canonical number for the incoming block!");
+ assert(BBGVN && "Could not find canonical number for the incoming block!");
 // Create a pair of the exit block canonical value, and the aggregate
 // argument location, connected to the canonical numbers stored in the
 // PHINode.
@@ -1262,9 +1364,9 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
 // If two PHINodes have the same canonical values, but different aggregate
 // argument locations, then they will have distinct Canonical Values.
- GVN = getGVNForPHINode(Region, PN, AggArgIdx);
- if (!GVN.hasValue())
- return;
+ GVN = getGVNForPHINode(Region, PN, BlocksInRegion, AggArgIdx);
+ if (!GVN)
+ return;
 } else {
 // If we do not have a PHINode we use the global value numbering for the
 // output value, to find the canonical number to add to the set of stored
@@ -1413,7 +1515,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {

 // Make sure that the argument in the new function has the SwiftError
 // argument.
- if (Group.SwiftErrorArgument.hasValue())
+ if (Group.SwiftErrorArgument)
 Call->addParamAttr(Group.SwiftErrorArgument.getValue(),
 Attribute::SwiftError);
@@ -1520,17 +1622,18 @@ getPassedArgumentAndAdjustArgumentLocation(const Argument *A,
 /// \param OutputMappings [in] - The mapping of output values from outlined
 /// region to their original values.
/// \param CanonNums [out] - The canonical numbering for the incoming values to
-/// \p PN.
+/// \p PN paired with their incoming block.
 /// \param ReplacedWithOutlinedCall - A flag to use the extracted function call
 /// of \p Region rather than the overall function's call.
-static void
-findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
- const DenseMap &OutputMappings,
- DenseSet &CanonNums,
- bool ReplacedWithOutlinedCall = true) {
+static void findCanonNumsForPHI(
+ PHINode *PN, OutlinableRegion &Region,
+ const DenseMap &OutputMappings,
+ SmallVector> &CanonNums,
+ bool ReplacedWithOutlinedCall = true) {
 // Iterate over the incoming values.
 for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
 Value *IVal = PN->getIncomingValue(Idx);
+ BasicBlock *IBlock = PN->getIncomingBlock(Idx);
 // If we have an argument as incoming value, we need to grab the passed
 // value from the call itself.
 if (Argument *A = dyn_cast(IVal)) {
@@ -1545,10 +1648,10 @@ findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,

 // Find and add the canonical number for the incoming value.
 Optional GVN = Region.Candidate->getGVN(IVal);
- assert(GVN.hasValue() && "No GVN for incoming value");
+ assert(GVN && "No GVN for incoming value");
 Optional CanonNum = Region.Candidate->getCanonicalNum(*GVN);
- assert(CanonNum.hasValue() && "No Canonical Number for GVN");
- CanonNums.insert(*CanonNum);
+ assert(CanonNum && "No Canonical Number for GVN");
+ CanonNums.push_back(std::make_pair(*CanonNum, IBlock));
 }
 }
@@ -1557,19 +1660,26 @@ findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
 /// function.
 ///
 /// \param PN [in] - The PHINode that we are finding the canonical numbers for.
-/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
 /// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
 /// \p PN in.
 /// \param OutputMappings [in] - The mapping of output values from outlined
 /// region to their original values.
+/// \param UsedPHIs [in, out] - The PHINodes in the block that have already been
+/// matched.
 /// \return the newly found or created PHINode in \p OverallPhiBlock.
 static PHINode*
 findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
 BasicBlock *OverallPhiBlock,
- const DenseMap &OutputMappings) {
+ const DenseMap &OutputMappings,
+ DenseSet &UsedPHIs) {
 OutlinableGroup &Group = *Region.Parent;
- DenseSet PNCanonNums;
+
+ // A list of the canonical numbering assigned to each incoming value, paired
+ // with the incoming block for the PHINode passed into this function.
+ SmallVector> PNCanonNums;
+
 // We have to use the extracted function since we have not merged this region
 // into the overall function yet. We make sure to reassign the argument numbering
 // since it is possible that the argument ordering is different between the
@@ -1578,18 +1688,61 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
 /* ReplacedWithOutlinedCall = */ false);
 OutlinableRegion *FirstRegion = Group.Regions[0];
- DenseSet CurrentCanonNums;
+
+ // A list of the canonical numbering assigned to each incoming value, paired
+ // with the incoming block for the PHINode that we are currently comparing
+ // the passed PHINode to.
+ SmallVector> CurrentCanonNums;
+
 // Find the Canonical Numbering for each PHINode, if it matches, we replace
 // the uses of the PHINode we are searching for, with the found PHINode.
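// Aside: a model of the strengthened matching in the loop that follows. Two
// PHINodes may be merged only if they have the same number of incoming values
// and every (canonical value, incoming block) pair agrees positionally, with
// blocks compared after translation into the first region via
// findCorrespondingBlockIn. Plain C++ with stand-in types.
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

template <typename Block>
static bool phiCanonListsMatch(
    const std::vector<std::pair<unsigned, const Block *>> &Candidate,
    const std::vector<std::pair<unsigned, const Block *>> &Existing,
    const std::function<const Block *(const Block *)> &CorrespondingBlock) {
  if (Candidate.size() != Existing.size())
    return false; // no analogue for some incoming value
  for (std::size_t I = 0; I != Candidate.size(); ++I)
    if (Candidate[I].first != Existing[I].first ||
        CorrespondingBlock(Candidate[I].second) != Existing[I].second)
      return false;
  return true;
}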
for (PHINode &CurrPN : OverallPhiBlock->phis()) { + // If this PHINode has already been matched to another PHINode to be merged, + // we skip it. + if (UsedPHIs.contains(&CurrPN)) + continue; + CurrentCanonNums.clear(); findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums, /* ReplacedWithOutlinedCall = */ true); - if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) { - return CurrentCanonNums.contains(CanonNum); - })) + // If the list of incoming values is not the same length, then they cannot + // match since there is not an analogue for each incoming value. + if (PNCanonNums.size() != CurrentCanonNums.size()) + continue; + + bool FoundMatch = true; + + // We compare the canonical value for each incoming value in the passed + // in PHINode to one already present in the outlined region. If the + // incoming values do not match, then the PHINodes do not match. + + // We also check to make sure that the incoming block matches as well by + // finding the corresponding incoming block in the combined outlined region + // for the current outlined region. + for (unsigned Idx = 0, Edx = PNCanonNums.size(); Idx < Edx; ++Idx) { + std::pair ToCompareTo = CurrentCanonNums[Idx]; + std::pair ToAdd = PNCanonNums[Idx]; + if (ToCompareTo.first != ToAdd.first) { + FoundMatch = false; + break; + } + + BasicBlock *CorrespondingBlock = + Region.findCorrespondingBlockIn(*FirstRegion, ToAdd.second); + assert(CorrespondingBlock && "Found block is nullptr"); + if (CorrespondingBlock != ToCompareTo.second) { + FoundMatch = false; + break; + } + } + + // If all incoming values and branches matched, then we can merge + // into the found PHINode. + if (FoundMatch) { + UsedPHIs.insert(&CurrPN); return &CurrPN; + } } // If we've made it here, it means we weren't able to replace the PHINode, so @@ -1603,12 +1756,8 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, // Find corresponding basic block in the overall function for the incoming // block. 
- Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI(); - assert(FirstNonPHI && "Incoming block is empty?"); - Value *CorrespondingVal = - Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI); - assert(CorrespondingVal && "Value is nullptr?"); - BasicBlock *BlockToUse = cast(CorrespondingVal)->getParent(); + BasicBlock *BlockToUse = + Region.findCorrespondingBlockIn(*FirstRegion, IncomingBlock); NewPN->setIncomingBlock(Idx, BlockToUse); // If we have an argument we make sure we replace using the argument from @@ -1623,6 +1772,10 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, IncomingVal = findOutputMapping(OutputMappings, IncomingVal); Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal); assert(Val && "Value is nullptr?"); + DenseMap::iterator RemappedIt = + FirstRegion->RemappedArguments.find(Val); + if (RemappedIt != FirstRegion->RemappedArguments.end()) + Val = RemappedIt->second; NewPN->setIncomingValue(Idx, Val); } return NewPN; @@ -1649,6 +1802,7 @@ replaceArgumentUses(OutlinableRegion &Region, if (FirstFunction) DominatingFunction = Group.OutlinedFunction; DominatorTree DT(*DominatingFunction); + DenseSet UsedPHIs; for (unsigned ArgIdx = 0; ArgIdx < Region.ExtractedFunction->arg_size(); ArgIdx++) { @@ -1665,6 +1819,8 @@ replaceArgumentUses(OutlinableRegion &Region, << *Region.ExtractedFunction << " with " << *AggArg << " in function " << *Group.OutlinedFunction << "\n"); Arg->replaceAllUsesWith(AggArg); + Value *V = Region.Call->getArgOperand(ArgIdx); + Region.RemappedArguments.insert(std::make_pair(V, AggArg)); continue; } @@ -1713,7 +1869,7 @@ replaceArgumentUses(OutlinableRegion &Region, // If this is storing a PHINode, we must make sure it is included in the // overall function. if (!isa(ValueOperand) || - Region.Candidate->getGVN(ValueOperand).hasValue()) { + Region.Candidate->getGVN(ValueOperand).has_value()) { if (FirstFunction) continue; Value *CorrVal = @@ -1725,7 +1881,7 @@ replaceArgumentUses(OutlinableRegion &Region, PHINode *PN = cast(SI->getValueOperand()); // If it has a value, it was not split by the code extractor, which // is what we are looking for. - if (Region.Candidate->getGVN(PN).hasValue()) + if (Region.Candidate->getGVN(PN)) continue; // We record the parent block for the PHINode in the Region so that @@ -1748,8 +1904,8 @@ replaceArgumentUses(OutlinableRegion &Region, // For our PHINode, we find the combined canonical numbering, and // attempt to find a matching PHINode in the overall PHIBlock. If we // cannot, we copy the PHINode and move it into this new block. - PHINode *NewPN = - findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings); + PHINode *NewPN = findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, + OutputMappings, UsedPHIs); NewI->setOperand(0, NewPN); } @@ -1923,7 +2079,7 @@ static void alignOutputBlockWithAggFunc( // If there is, we remove the new output blocks. If it does not, // we add it to our list of sets of output blocks. 
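// Aside: the reuse-or-remember step described just above, as a small sketch
// with stand-in types. If an equivalent set of output blocks already exists,
// the freshly created blocks are discarded and the existing set is reused;
// otherwise the new set is recorded.
#include <cstddef>
#include <set>
#include <vector>

using OutputBlockSet = std::set<int>; // stand-in for a set of basic blocks

static std::size_t findOrAddOutputSet(std::vector<OutputBlockSet> &Known,
                                      OutputBlockSet Fresh) {
  for (std::size_t I = 0; I != Known.size(); ++I)
    if (Known[I] == Fresh)
      return I;            // match found: caller deletes the fresh blocks
  Known.push_back(std::move(Fresh));
  return Known.size() - 1; // no match: keep the new output block set
}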
- if (MatchingBB.hasValue()) { + if (MatchingBB) { LLVM_DEBUG(dbgs() << "Set output block for region in function" << Region.ExtractedFunction << " to " << MatchingBB.getValue()); @@ -2279,6 +2435,9 @@ void IROutliner::pruneIncompatibleRegions( if (BBHasAddressTaken) continue; + if (IRSC.getFunction()->hasOptNone()) + continue; + if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() && !OutlineFromLinkODRs) continue; @@ -2343,9 +2502,9 @@ static Value *findOutputValueInRegion(OutlinableRegion &Region, OutputCanon = *It->second.second.begin(); } Optional OGVN = Region.Candidate->fromCanonicalNum(OutputCanon); - assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?"); + assert(OGVN && "Could not find GVN for Canonical Number?"); Optional OV = Region.Candidate->fromGVN(*OGVN); - assert(OV.hasValue() && "Could not find value for GVN?"); + assert(OV && "Could not find value for GVN?"); return *OV; } @@ -2400,11 +2559,8 @@ static InstructionCost findCostForOutputBlocks(Module &M, for (Value *V : ID.OperVals) { BasicBlock *BB = static_cast(V); - DenseSet::iterator CBIt = CandidateBlocks.find(BB); - if (CBIt != CandidateBlocks.end() || FoundBlocks.contains(BB)) - continue; - FoundBlocks.insert(BB); - NumOutputBranches++; + if (!CandidateBlocks.contains(BB) && FoundBlocks.insert(BB).second) + NumOutputBranches++; } } @@ -2520,7 +2676,7 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region, // If we found an output register, place a mapping of the new value // to the original in the mapping. - if (!OutputIdx.hasValue()) + if (!OutputIdx) return; if (OutputMappings.find(Outputs[OutputIdx.getValue()]) == @@ -2680,7 +2836,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, "outlined"); + false, nullptr, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); @@ -2791,7 +2947,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, "outlined"); + false, nullptr, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); @@ -2874,7 +3030,7 @@ bool IROutlinerLegacyPass::runOnModule(Module &M) { std::unique_ptr ORE; auto GORE = [&ORE](Function &F) -> OptimizationRemarkEmitter & { ORE.reset(new OptimizationRemarkEmitter(&F)); - return *ORE.get(); + return *ORE; }; auto GTTI = [this](Function &F) -> TargetTransformInfo & { @@ -2905,7 +3061,7 @@ PreservedAnalyses IROutlinerPass::run(Module &M, ModuleAnalysisManager &AM) { std::function GORE = [&ORE](Function &F) -> OptimizationRemarkEmitter & { ORE.reset(new OptimizationRemarkEmitter(&F)); - return *ORE.get(); + return *ORE; }; if (IROutliner(GTTI, GIRSI, GORE).run(M)) diff --git a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index c32e09875a12..76f8f1a7a482 100644 --- a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -9,11 +9,8 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" -#include 
"llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -32,7 +29,7 @@ static bool inferAllPrototypeAttributes( // explicitly visited by CGSCC passes in the new pass manager.) if (F.isDeclaration() && !F.hasOptNone()) { if (!F.hasFnAttribute(Attribute::NoBuiltin)) - Changed |= inferLibFuncAttributes(F, GetTLI(F)); + Changed |= inferNonMandatoryLibFuncAttrs(F, GetTLI(F)); Changed |= inferAttributesFromOthers(F); } diff --git a/llvm/lib/Transforms/IPO/InlineSimple.cpp b/llvm/lib/Transforms/IPO/InlineSimple.cpp index 76f1d0c54d08..2143e39d488d 100644 --- a/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -12,14 +12,8 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 49babc24cb82..4d32266eb9ea 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -14,21 +14,21 @@ #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InlineOrder.h" @@ -37,11 +37,9 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" @@ -67,8 +65,6 @@ #include #include #include -#include -#include #include #include @@ -92,11 +88,28 @@ static cl::opt DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +static cl::opt IntraSCCCostMultiplier( + "intra-scc-cost-multiplier", cl::init(2), cl::Hidden, + cl::desc( + "Cost multiplier to multiply onto inlined call sites where the " + "new call was previously an intra-SCC call (not relevant when the " + "original call was already intra-SCC). This can accumulate over " + "multiple inlinings (e.g. 
if a call site already had a cost "
 "multiplier and one of its inlined calls was also subject to "
 "this, the inlined call would have the original multiplier "
 "multiplied by intra-scc-cost-multiplier). This is to prevent tons of "
 "inlining through a child SCC which can cause terrible compile times"));
+
 /// A flag for test, so we can print the content of the advisor when running it
 /// as part of the default (e.g. -O3) pipeline.
 static cl::opt KeepAdvisorForPrinting("keep-inline-advisor-for-printing",
 cl::init(false), cl::Hidden);

+/// Allows printing the contents of the advisor after each SCC inliner pass.
+static cl::opt
+ EnablePostSCCAdvisorPrinting("enable-scc-inline-advisor-printing",
+ cl::init(false), cl::Hidden);
+
 extern cl::opt InlinerFunctionImportStats;

 static cl::opt CGSCCInlineReplayFile(
@@ -150,10 +163,6 @@ static cl::opt CGSCCInlineReplayFormat(
 ":. (default)")),
 cl::desc("How cgscc inline replay file is formatted"), cl::Hidden);

-static cl::opt InlineEnablePriorityOrder(
- "inline-enable-priority-order", cl::Hidden, cl::init(false),
- cl::desc("Enable the priority inline order for the inliner"));
-
 LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}

 LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
@@ -708,8 +717,9 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
 // duration of the inliner pass, and thus the lifetime of the owned advisor.
 // The one we would get from the MAM can be invalidated as a result of the
 // inliner's activity.
- OwnedAdvisor =
- std::make_unique(M, FAM, getInlineParams());
+ OwnedAdvisor = std::make_unique(
+ M, FAM, getInlineParams(),
+ InlineContext{LTOPhase, InlinePass::CGSCCInliner});

 if (!CGSCCInlineReplayFile.empty())
 OwnedAdvisor = getReplayInlineAdvisor(
@@ -718,7 +728,9 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
 CGSCCInlineReplayScope,
 CGSCCInlineReplayFallback,
 {CGSCCInlineReplayFormat}},
- /*EmitRemarks=*/true);
+ /*EmitRemarks=*/true,
+ InlineContext{LTOPhase,
+ InlinePass::ReplayCGSCCInliner});

 return *OwnedAdvisor;
 }
@@ -744,7 +756,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
 .getManager();

 InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
- Advisor.onPassEntry();
+ Advisor.onPassEntry(&InitialC);

 auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(&InitialC); });
@@ -773,12 +785,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
 // this model, but it is uniformly spread across all the functions in the SCC
 // and eventually they all become too large to inline, rather than
 // incrementally making a single function grow in a super linear fashion.
- std::unique_ptr>> Calls;
- if (InlineEnablePriorityOrder)
- Calls = std::make_unique>();
- else
- Calls = std::make_unique>>();
- assert(Calls != nullptr && "Expected an initialized InlineOrder");
+ DefaultInlineOrder> Calls;

 // Populate the initial list of calls in this SCC.
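// Aside: the accumulation described by the intra-scc-cost-multiplier option
// above, sketched in plain C++ (hypothetical helpers, not the pass's API).
// The multiplier is stored on the call as a string attribute, read back with
// value_or(1), and scaled each time a newly exposed call would keep chewing
// through a child SCC, so repeated inlining gets exponentially costlier.
#include <cstdint>

template <typename SCC>
static bool shouldBumpMultiplier(const SCC *CurrentSCC, const SCC *CalleeSCC,
                                 const SCC *NewCalleeSCC) {
  // Bump only when the inlined call had crossed into another SCC and the new
  // call stays within that same SCC; self-recursion inside the current SCC is
  // already handled by the inliner itself.
  return CalleeSCC != CurrentSCC && CalleeSCC == NewCalleeSCC;
}

static std::int64_t bumpedMultiplier(std::int64_t Existing /* value_or(1) */,
                                     std::int64_t Factor /* default 2 */) {
  return Existing * Factor; // 1 -> 2 -> 4 -> 8 -> ...
}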
for (auto &N : InitialC) { @@ -793,7 +800,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (auto *CB = dyn_cast(&I)) if (Function *Callee = CB->getCalledFunction()) { if (!Callee->isDeclaration()) - Calls->push({CB, -1}); + Calls.push({CB, -1}); else if (!isa(I)) { using namespace ore; setInlineRemark(*CB, "unavailable definition"); @@ -807,7 +814,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, } } } - if (Calls->empty()) + if (Calls.empty()) return PreservedAnalyses::all(); // Capture updatable variable for the current SCC. @@ -833,15 +840,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, SmallVector DeadFunctionsInComdats; // Loop forward over all of the calls. - while (!Calls->empty()) { + while (!Calls.empty()) { // We expect the calls to typically be batched with sequences of calls that // have the same caller, so we first set up some shared infrastructure for // this caller. We also do any pruning we can at this layer on the caller // alone. - Function &F = *Calls->front().first->getCaller(); + Function &F = *Calls.front().first->getCaller(); LazyCallGraph::Node &N = *CG.lookup(F); if (CG.lookupSCC(N) != C) { - Calls->pop(); + Calls.pop(); continue; } @@ -857,8 +864,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // We bail out as soon as the caller has to change so we can update the // call graph and prepare the context of that new caller. bool DidInline = false; - while (!Calls->empty() && Calls->front().first->getCaller() == &F) { - auto P = Calls->pop(); + while (!Calls.empty() && Calls.front().first->getCaller() == &F) { + auto P = Calls.pop(); CallBase *CB = P.first; const int InlineHistoryID = P.second; Function &Callee = *CB->getCalledFunction(); @@ -876,8 +883,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // trigger infinite inlining, much like is prevented within the inliner // itself by the InlineHistory above, but spread across CGSCC iterations // and thus hidden from the full inline history. - if (CG.lookupSCC(*CG.lookup(Callee)) == C && - UR.InlinedInternalEdges.count({&N, C})) { + LazyCallGraph::SCC *CalleeSCC = CG.lookupSCC(*CG.lookup(Callee)); + if (CalleeSCC == C && UR.InlinedInternalEdges.count({&N, C})) { LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node " "previously split out of this SCC by inlining: " << F.getName() << " -> " << Callee.getName() << "\n"); @@ -897,6 +904,11 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, continue; } + int CBCostMult = + getStringFnAttrAsInt( + *CB, InlineConstants::FunctionInlineCostMultiplierAttributeName) + .value_or(1); + // Setup the data structure used to plumb customization into the // `InlineFunction` routine. InlineFunctionInfo IFI( @@ -935,9 +947,28 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (tryPromoteCall(*ICB)) NewCallee = ICB->getCalledFunction(); } - if (NewCallee) - if (!NewCallee->isDeclaration()) - Calls->push({ICB, NewHistoryID}); + if (NewCallee) { + if (!NewCallee->isDeclaration()) { + Calls.push({ICB, NewHistoryID}); + // Continually inlining through an SCC can result in huge compile + // times and bloated code since we arbitrarily stop at some point + // when the inliner decides it's not profitable to inline anymore. + // We attempt to mitigate this by making these calls exponentially + // more expensive. 
+ // This doesn't apply to calls in the same SCC since if we do + // inline through the SCC the function will end up being + // self-recursive which the inliner bails out on, and inlining + // within an SCC is necessary for performance. + if (CalleeSCC != C && + CalleeSCC == CG.lookupSCC(CG.get(*NewCallee))) { + Attribute NewCBCostMult = Attribute::get( + M.getContext(), + InlineConstants::FunctionInlineCostMultiplierAttributeName, + itostr(CBCostMult * IntraSCCCostMultiplier)); + ICB->addFnAttr(NewCBCostMult); + } + } + } } } @@ -953,7 +984,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (Callee.isDiscardableIfUnused() && Callee.hasZeroLiveUses() && !CG.isLibFunction(Callee)) { if (Callee.hasLocalLinkage() || !Callee.hasComdat()) { - Calls->erase_if([&](const std::pair &Call) { + Calls.erase_if([&](const std::pair &Call) { return Call.first->getCaller() == &Callee; }); // Clear the body and queue the function itself for deletion when we @@ -1083,17 +1114,24 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params, bool MandatoryFirst, + InlineContext IC, InliningAdvisorMode Mode, unsigned MaxDevirtIterations) - : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations) { + : Params(Params), IC(IC), Mode(Mode), + MaxDevirtIterations(MaxDevirtIterations) { // Run the inliner first. The theory is that we are walking bottom-up and so // the callees have already been fully optimized, and we want to inline them // into the callers so that our optimizations can reflect that. // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO // because it makes profile annotation in the backend inaccurate. - if (MandatoryFirst) + if (MandatoryFirst) { PM.addPass(InlinerPass(/*OnlyMandatory*/ true)); + if (EnablePostSCCAdvisorPrinting) + PM.addPass(InlineAdvisorAnalysisPrinterPass(dbgs())); + } PM.addPass(InlinerPass()); + if (EnablePostSCCAdvisorPrinting) + PM.addPass(InlineAdvisorAnalysisPrinterPass(dbgs())); } PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, @@ -1103,7 +1141,8 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, {CGSCCInlineReplayFile, CGSCCInlineReplayScope, CGSCCInlineReplayFallback, - {CGSCCInlineReplayFormat}})) { + {CGSCCInlineReplayFormat}}, + IC)) { M.getContext().emitError( "Could not setup Inlining Advisor for the requested " "mode and/or options"); diff --git a/llvm/lib/Transforms/IPO/Internalize.cpp b/llvm/lib/Transforms/IPO/Internalize.cpp index 692e445cb7cb..5aa5b905f06c 100644 --- a/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/llvm/lib/Transforms/IPO/Internalize.cpp @@ -19,7 +19,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/Internalize.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Triple.h" @@ -33,8 +32,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Utils/GlobalStatus.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; #define DEBUG_TYPE "internalize" diff --git a/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/llvm/lib/Transforms/IPO/LoopExtractor.cpp index d9a59dd35fde..ad1927c09803 100644 --- a/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -23,14 +23,9 @@ #include "llvm/IR/PassManager.h" #include 
"llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeExtractor.h" -#include -#include using namespace llvm; #define DEBUG_TYPE "loop-extract" diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 8e83d7bcb6c2..d5f1d291f41f 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1223,6 +1223,7 @@ void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) { static const unsigned kX86JumpTableEntrySize = 8; static const unsigned kARMJumpTableEntrySize = 4; static const unsigned kARMBTIJumpTableEntrySize = 8; +static const unsigned kRISCVJumpTableEntrySize = 8; unsigned LowerTypeTestsModule::getJumpTableEntrySize() { switch (Arch) { @@ -1238,6 +1239,9 @@ unsigned LowerTypeTestsModule::getJumpTableEntrySize() { if (BTE->getZExtValue()) return kARMBTIJumpTableEntrySize; return kARMJumpTableEntrySize; + case Triple::riscv32: + case Triple::riscv64: + return kRISCVJumpTableEntrySize; default: report_fatal_error("Unsupported architecture for jump tables"); } @@ -1265,6 +1269,9 @@ void LowerTypeTestsModule::createJumpTableEntry( AsmOS << "b $" << ArgIndex << "\n"; } else if (JumpTableArch == Triple::thumb) { AsmOS << "b.w $" << ArgIndex << "\n"; + } else if (JumpTableArch == Triple::riscv32 || + JumpTableArch == Triple::riscv64) { + AsmOS << "tail $" << ArgIndex << "@plt\n"; } else { report_fatal_error("Unsupported architecture for jump tables"); } @@ -1282,7 +1289,8 @@ Type *LowerTypeTestsModule::getJumpTableEntryType() { void LowerTypeTestsModule::buildBitSetsFromFunctions( ArrayRef TypeIds, ArrayRef Functions) { if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || - Arch == Triple::thumb || Arch == Triple::aarch64) + Arch == Triple::thumb || Arch == Triple::aarch64 || + Arch == Triple::riscv32 || Arch == Triple::riscv64) buildBitSetsFromFunctionsNative(TypeIds, Functions); else if (Arch == Triple::wasm32 || Arch == Triple::wasm64) buildBitSetsFromFunctionsWASM(TypeIds, Functions); @@ -1427,6 +1435,11 @@ void LowerTypeTestsModule::createJumpTable( F->addFnAttr("branch-target-enforcement", "false"); F->addFnAttr("sign-return-address", "none"); } + if (JumpTableArch == Triple::riscv32 || JumpTableArch == Triple::riscv64) { + // Make sure the jump table assembly is not modified by the assembler or + // the linker. + F->addFnAttr("target-features", "-c,-relax"); + } // Make sure we don't emit .eh_frame for this function. F->addFnAttr(Attribute::NoUnwind); @@ -2187,11 +2200,7 @@ bool LowerTypeTestsModule::lower() { } Sets.emplace_back(I, MaxUniqueId); } - llvm::sort(Sets, - [](const std::pair &S1, - const std::pair &S2) { - return S1.second < S2.second; - }); + llvm::sort(Sets, llvm::less_second()); // For each disjoint set we found... 
for (const auto &S : Sets) { diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 97ef872c5499..b850591b4aa6 100644 --- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -88,12 +88,11 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Argument.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -113,7 +112,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/IR/ValueMap.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -121,8 +119,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/Utils/FunctionComparator.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include @@ -139,10 +137,10 @@ STATISTIC(NumThunksWritten, "Number of thunks generated"); STATISTIC(NumAliasesWritten, "Number of aliases generated"); STATISTIC(NumDoubleWeak, "Number of new functions created"); -static cl::opt NumFunctionsForSanityCheck( - "mergefunc-sanity", - cl::desc("How many functions in module could be used for " - "MergeFunctions pass sanity check. " +static cl::opt NumFunctionsForVerificationCheck( + "mergefunc-verify", + cl::desc("How many functions in a module could be used for " + "MergeFunctions to pass a basic correctness check. " "'0' disables this check. Works only with '-debug' key."), cl::init(0), cl::Hidden); @@ -228,10 +226,13 @@ private: /// analyzed again. std::vector Deferred; + /// Set of values marked as used in llvm.used and llvm.compiler.used. + SmallPtrSet Used; + #ifndef NDEBUG /// Checks the rules of order relation introduced among functions set. - /// Returns true, if sanity check has been passed, and false if failed. - bool doSanityCheck(std::vector &Worklist); + /// Returns true, if check has been passed, and false if failed. + bool doFunctionalCheck(std::vector &Worklist); #endif /// Insert a ComparableFunction into the FnTree, or merge it away if it's @@ -330,12 +331,12 @@ PreservedAnalyses MergeFunctionsPass::run(Module &M, } #ifndef NDEBUG -bool MergeFunctions::doSanityCheck(std::vector &Worklist) { - if (const unsigned Max = NumFunctionsForSanityCheck) { +bool MergeFunctions::doFunctionalCheck(std::vector &Worklist) { + if (const unsigned Max = NumFunctionsForVerificationCheck) { unsigned TripleNumber = 0; bool Valid = true; - dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n"; + dbgs() << "MERGEFUNC-VERIFY: Started for first " << Max << " functions.\n"; unsigned i = 0; for (std::vector::iterator I = Worklist.begin(), @@ -351,7 +352,7 @@ bool MergeFunctions::doSanityCheck(std::vector &Worklist) { // If F1 <= F2, then F2 >= F1, otherwise report failure. 
if (Res1 != -Res2) { - dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber + dbgs() << "MERGEFUNC-VERIFY: Non-symmetric; triple: " << TripleNumber << "\n"; dbgs() << *F1 << '\n' << *F2 << '\n'; Valid = false; @@ -384,7 +385,7 @@ bool MergeFunctions::doSanityCheck(std::vector &Worklist) { } if (!Transitive) { - dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: " + dbgs() << "MERGEFUNC-VERIFY: Non-transitive; triple: " << TripleNumber << "\n"; dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " << Res4 << "\n"; @@ -395,7 +396,7 @@ bool MergeFunctions::doSanityCheck(std::vector &Worklist) { } } - dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n"; + dbgs() << "MERGEFUNC-VERIFY: " << (Valid ? "Passed." : "Failed.") << "\n"; return Valid; } return true; @@ -410,6 +411,11 @@ static bool isEligibleForMerging(Function &F) { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; + SmallVector UsedV; + collectUsedGlobalVariables(M, UsedV, /*CompilerUsed=*/false); + collectUsedGlobalVariables(M, UsedV, /*CompilerUsed=*/true); + Used.insert(UsedV.begin(), UsedV.end()); + // All functions in the module, ordered by hash. Functions with a unique // hash value are easily eliminated. std::vector> @@ -436,7 +442,7 @@ bool MergeFunctions::runOnModule(Module &M) { std::vector Worklist; Deferred.swap(Worklist); - LLVM_DEBUG(doSanityCheck(Worklist)); + LLVM_DEBUG(doFunctionalCheck(Worklist)); LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n'); LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n'); @@ -456,6 +462,7 @@ bool MergeFunctions::runOnModule(Module &M) { FnTree.clear(); FNodesInTree.clear(); GlobalNumbers.clear(); + Used.clear(); return Changed; } @@ -484,7 +491,7 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { if (SrcTy->isStructTy()) { assert(DestTy->isStructTy()); assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); - Value *Result = UndefValue::get(DestTy); + Value *Result = PoisonValue::get(DestTy); for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { Value *Element = createCast( Builder, Builder.CreateExtractValue(V, makeArrayRef(I)), @@ -828,7 +835,10 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { // For better debugability, under MergeFunctionsPDI, we do not modify G's // call sites to point to F even when within the same translation unit. if (!G->isInterposable() && !MergeFunctionsPDI) { - if (G->hasGlobalUnnamedAddr()) { + // Functions referred to by llvm.used/llvm.compiler.used are special: + // there are uses of the symbol name that are not visible to LLVM, + // usually from inline asm. + if (G->hasGlobalUnnamedAddr() && !Used.contains(G)) { // G might have been a key in our GlobalNumberState, and it's illegal // to replace a key in ValueMap with a non-global. 
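
The Used set threaded through MergeFunctions above exists because unnamed_addr alone no longer justifies replacing G: a symbol named in llvm.used or llvm.compiler.used may still be referenced from inline asm that LLVM cannot see. A hedged sketch of the same guard in isolation; the helper name is invented, while the library calls are the ones the patch itself uses:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/ModuleUtils.h"
    using namespace llvm;

    // Hypothetical helper: may G's direct callers be redirected and G's
    // body dropped? Collect llvm.used and llvm.compiler.used first, as the
    // patched runOnModule does.
    static bool mayDropBody(const Module &M, const Function &G) {
      SmallVector<GlobalValue *, 8> UsedV;
      collectUsedGlobalVariables(M, UsedV, /*CompilerUsed=*/false);
      collectUsedGlobalVariables(M, UsedV, /*CompilerUsed=*/true);
      SmallPtrSet<const GlobalValue *, 8> Used(UsedV.begin(), UsedV.end());
      // Same condition as the patched mergeTwoFunctions.
      return G.hasGlobalUnnamedAddr() && !Used.contains(&G);
    }
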
GlobalNumbers.erase(G); diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp index d515303e4911..143715006512 100644 --- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp +++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -14,43 +14,33 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/ModuleInliner.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InlineOrder.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" #include -#include using namespace llvm; @@ -94,7 +84,9 @@ InlineAdvisor &ModuleInlinerPass::getAdvisor(const ModuleAnalysisManager &MAM, // inliner pass, and thus the lifetime of the owned advisor. The one we // would get from the MAM can be invalidated as a result of the inliner's // activity. - OwnedAdvisor = std::make_unique(M, FAM, Params); + OwnedAdvisor = std::make_unique( + M, FAM, Params, + InlineContext{LTOPhase, InlinePass::ModuleInliner}); return *OwnedAdvisor; } @@ -119,7 +111,9 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, LLVM_DEBUG(dbgs() << "---- Module Inliner is Running ---- \n"); auto &IAA = MAM.getResult(M); - if (!IAA.tryCreate(Params, Mode, {})) { + if (!IAA.tryCreate( + Params, Mode, {}, + InlineContext{LTOPhase, InlinePass::ModuleInliner})) { M.getContext().emitError( "Could not setup Inlining Advisor for the requested " "mode and/or options"); @@ -153,7 +147,8 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, // the SCC inliner, which need some refactoring. 
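
Several hunks in this patch thread an InlineContext, the LTO phase plus the requesting inliner pass, into every advisor constructor so remarks and replay can tell the pipelines apart. A minimal sketch of building one, assuming the aggregate has exactly the two fields the braced initializers above supply; the helper is hypothetical:

    #include "llvm/Analysis/InlineAdvisor.h"
    #include "llvm/Pass.h"
    using namespace llvm;

    // Hypothetical helper mirroring InlineContext{LTOPhase, InlinePass::...}
    // as used by the CGSCC and module inliners in this diff.
    static InlineContext makeContext(ThinOrFullLTOPhase Phase, InlinePass P) {
      return InlineContext{Phase, P};
    }
    // e.g. makeContext(ThinOrFullLTOPhase::None, InlinePass::ModuleInliner)
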
  std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
  if (InlineEnablePriorityOrder)
-    Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
+    Calls = std::make_unique<PriorityInlineOrder>(
+        std::make_unique<SizePriority>());
  else
    Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
  assert(Calls != nullptr && "Expected an initialized InlineOrder");
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 2d765fb6ce6d..227ad8501f25 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -49,7 +49,6 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
 #include
@@ -59,17 +58,16 @@ using namespace omp;
 
 #define DEBUG_TYPE "openmp-opt"
 
 static cl::opt<bool> DisableOpenMPOptimizations(
-    "openmp-opt-disable", cl::ZeroOrMore,
-    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
-    cl::init(false));
+    "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
+    cl::Hidden, cl::init(false));
 
 static cl::opt<bool> EnableParallelRegionMerging(
-    "openmp-opt-enable-merging", cl::ZeroOrMore,
+    "openmp-opt-enable-merging",
     cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
     cl::init(false));
 
 static cl::opt<bool>
-    DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore,
+    DisableInternalization("openmp-opt-disable-internalization",
                            cl::desc("Disable function internalization."),
                            cl::Hidden, cl::init(false));
 
@@ -85,42 +83,47 @@ static cl::opt<bool> HideMemoryTransferLatency(
     cl::Hidden, cl::init(false));
 
 static cl::opt<bool> DisableOpenMPOptDeglobalization(
-    "openmp-opt-disable-deglobalization", cl::ZeroOrMore,
+    "openmp-opt-disable-deglobalization",
     cl::desc("Disable OpenMP optimizations involving deglobalization."),
     cl::Hidden, cl::init(false));
 
 static cl::opt<bool> DisableOpenMPOptSPMDization(
-    "openmp-opt-disable-spmdization", cl::ZeroOrMore,
+    "openmp-opt-disable-spmdization",
     cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
     cl::Hidden, cl::init(false));
 
 static cl::opt<bool> DisableOpenMPOptFolding(
-    "openmp-opt-disable-folding", cl::ZeroOrMore,
+    "openmp-opt-disable-folding",
     cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
     cl::init(false));
 
 static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
-    "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore,
+    "openmp-opt-disable-state-machine-rewrite",
     cl::desc("Disable OpenMP optimizations that replace the state machine."),
     cl::Hidden, cl::init(false));
 
 static cl::opt<bool> DisableOpenMPOptBarrierElimination(
-    "openmp-opt-disable-barrier-elimination", cl::ZeroOrMore,
+    "openmp-opt-disable-barrier-elimination",
     cl::desc("Disable OpenMP optimizations that eliminate barriers."),
     cl::Hidden, cl::init(false));
 
 static cl::opt<bool> PrintModuleAfterOptimizations(
-    "openmp-opt-print-module", cl::ZeroOrMore,
+    "openmp-opt-print-module-after",
     cl::desc("Print the current module after OpenMP optimizations."),
     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> PrintModuleBeforeOptimizations(
+    "openmp-opt-print-module-before",
+    cl::desc("Print the current module before OpenMP optimizations."),
+    cl::Hidden, cl::init(false));
+
 static cl::opt<bool> AlwaysInlineDeviceFunctions(
-    "openmp-opt-inline-device", cl::ZeroOrMore,
+    "openmp-opt-inline-device",
     cl::desc("Inline all applicable functions on the device."), cl::Hidden,
     cl::init(false));
 
 static cl::opt<bool>
-    EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore,
+    EnableVerboseRemarks("openmp-opt-verbose-remarks",
                          cl::desc("Enables more
verbose remarks."), cl::Hidden, cl::init(false)); @@ -129,6 +132,11 @@ static cl::opt cl::desc("Maximal number of attributor iterations."), cl::init(256)); +static cl::opt + SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, + cl::desc("Maximum amount of shared memory to use."), + cl::init(std::numeric_limits::max())); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -493,11 +501,14 @@ struct OMPInformationCache : public InformationCache { // Remove the `noinline` attribute from `__kmpc`, `_OMP::` and `omp_` // functions, except if `optnone` is present. - for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"}) - if (F.getName().startswith(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); + if (isOpenMPDevice(M)) { + for (Function &F : M) { + for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"}) + if (F.hasFnAttribute(Attribute::NoInline) && + F.getName().startswith(Prefix) && + !F.hasFnAttribute(Attribute::OptimizeNone)) + F.removeFnAttr(Attribute::NoInline); + } } // TODO: We should attach the attributes defined in OMPKinds.def. @@ -591,7 +602,7 @@ struct KernelInfoState : AbstractState { /// Abstract State interface ///{ - KernelInfoState() {} + KernelInfoState() = default; KernelInfoState(bool BestState) { if (!BestState) indicatePessimisticFixpoint(); @@ -926,8 +937,7 @@ private: SmallDenseMap> BB2PRMap; BasicBlock *StartBB = nullptr, *EndBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -966,8 +976,7 @@ private: const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); ParentBB->getTerminator()->eraseFromParent(); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1107,10 +1116,8 @@ private: // callbacks. 
SmallVector Args; for (auto *CI : MergableCIs) { - Value *Callee = - CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); - FunctionType *FT = - cast(Callee->getType()->getPointerElementType()); + Value *Callee = CI->getArgOperand(CallbackCalleeOperand); + FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask; Args.clear(); Args.push_back(OutlinedFn->getArg(0)); Args.push_back(OutlinedFn->getArg(1)); @@ -1458,7 +1465,6 @@ private: case Intrinsic::nvvm_barrier0_and: case Intrinsic::nvvm_barrier0_or: case Intrinsic::nvvm_barrier0_popc: - case Intrinsic::amdgcn_s_barrier: return true; default: break; @@ -2120,6 +2126,8 @@ private: OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII NumThreads( + OMPInfoCache, OMPRTL___kmpc_get_hardware_num_threads_in_block); ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); @@ -2407,8 +2415,7 @@ struct AAICVTrackerFunction : public AAICVTracker { auto CallCheck = [&](Instruction &I) { Optional ReplVal = getValueForCall(A, I, ICV); - if (ReplVal.hasValue() && - ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) + if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) HasChanged = ChangeStatus::CHANGED; return true; @@ -2468,7 +2475,8 @@ struct AAICVTrackerFunction : public AAICVTracker { if (ICVTrackingAA.isAssumedTracked()) { Optional URV = ICVTrackingAA.getUniqueReplacementValue(ICV); - if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache))) + if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I), + OMPInfoCache))) return URV; } @@ -2509,13 +2517,13 @@ struct AAICVTrackerFunction : public AAICVTracker { if (ValuesMap.count(CurrInst)) { Optional NewReplVal = ValuesMap.lookup(CurrInst); // Unknown value, track new. - if (!ReplVal.hasValue()) { + if (!ReplVal) { ReplVal = NewReplVal; break; } // If we found a new value, we can't know the icv value anymore. - if (NewReplVal.hasValue()) + if (NewReplVal) if (ReplVal != NewReplVal) return nullptr; @@ -2523,11 +2531,11 @@ struct AAICVTrackerFunction : public AAICVTracker { } Optional NewReplVal = getValueForCall(A, *CurrInst, ICV); - if (!NewReplVal.hasValue()) + if (!NewReplVal) continue; // Unknown value, track new. - if (!ReplVal.hasValue()) { + if (!ReplVal) { ReplVal = NewReplVal; break; } @@ -2539,7 +2547,7 @@ struct AAICVTrackerFunction : public AAICVTracker { } // If we are in the same BB and we have a value, we are done. - if (CurrBB == I->getParent() && ReplVal.hasValue()) + if (CurrBB == I->getParent() && ReplVal) return ReplVal; // Go through all predecessors and add terminators for analysis. @@ -2597,7 +2605,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { ICVTrackingAA.getReplacementValue(ICV, &I, A); // If we found a second ICV value there is no unique returned value. 
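
The hasValue()/getValueOr() cleanups running through the hunks around this point, including the UniqueICVValue change just below, all follow one pattern: rely on the Optional's contextual bool and value_or(). A standalone equivalent with std::optional, whose interface llvm::Optional mirrors:

    #include <cstdio>
    #include <optional>

    int main() {
      std::optional<int> ReplVal;               // "no value tracked yet"
      if (!ReplVal)                             // was: !ReplVal.hasValue()
        ReplVal = 42;                           // track the newly found value
      std::printf("%d\n", ReplVal.value_or(0)); // was: ReplVal.getValueOr(0)
    }
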
- if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal) + if (UniqueICVValue && UniqueICVValue != NewReplVal) return false; UniqueICVValue = NewReplVal; @@ -2648,10 +2656,10 @@ struct AAICVTrackerCallSite : AAICVTracker { } ChangeStatus manifest(Attributor &A) override { - if (!ReplVal.hasValue() || !ReplVal.getValue()) + if (!ReplVal || !*ReplVal) return ChangeStatus::UNCHANGED; - A.changeValueAfterManifest(*getCtxI(), **ReplVal); + A.changeAfterManifest(IRPosition::inst(*getCtxI()), **ReplVal); A.deleteAfterManifest(*getCtxI()); return ChangeStatus::CHANGED; @@ -2789,7 +2797,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { SmallSetVector SingleThreadedBBs; /// Total number of basic blocks in this function. - long unsigned NumBBs; + long unsigned NumBBs = 0; }; ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { @@ -2952,12 +2960,23 @@ struct AAHeapToSharedFunction : public AAHeapToShared { } void initialize(Attributor &A) override { + if (DisableOpenMPOptDeglobalization) { + indicatePessimisticFixpoint(); + return; + } + auto &OMPInfoCache = static_cast(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + Attributor::SimplifictionCallbackTy SCB = + [](const IRPosition &, const AbstractAttribute *, + bool &) -> Optional { return nullptr; }; for (User *U : RFI.Declaration->users()) - if (CallBase *CB = dyn_cast(U)) + if (CallBase *CB = dyn_cast(U)) { MallocCalls.insert(CB); + A.registerSimplificationCallback(IRPosition::callsite_returned(*CB), + SCB); + } findPotentialRemovedFreeCalls(A); } @@ -2999,6 +3018,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto *AllocSize = cast(CB->getArgOperand(0)); + if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) { + LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB + << " with shared memory." + << " Shared memory usage is limited to " + << SharedMemoryLimit << " bytes\n"); + continue; + } + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); @@ -3029,11 +3056,12 @@ struct AAHeapToSharedFunction : public AAHeapToShared { "HeapToShared on allocation without alignment attribute"); SharedMem->setAlignment(MaybeAlign(Alignment)); - A.changeValueAfterManifest(*CB, *NewBuffer); + A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer); A.deleteAfterManifest(*CB); A.deleteAfterManifest(*FreeCalls.front()); - NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); + SharedMemoryUsed += AllocSize->getZExtValue(); + NumBytesMovedToSharedMemory = SharedMemoryUsed; Changed = ChangeStatus::CHANGED; } @@ -3069,6 +3097,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { SmallSetVector MallocCalls; /// Collection of potentially removed free calls in a function. SmallPtrSet PotentialRemovedFreeCalls; + /// The total amount of shared memory that has been used for HeapToShared. + unsigned SharedMemoryUsed = 0; }; struct AAKernelInfo : public StateWrapper { @@ -3137,12 +3167,6 @@ struct AAKernelInfoFunction : AAKernelInfo { auto &OMPInfoCache = static_cast(A.getInfoCache()); Function *Fn = getAnchorScope(); - if (!OMPInfoCache.Kernels.count(Fn)) - return; - - // Add itself to the reaching kernel and set IsKernelEntry. 
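
The SharedMemoryUsed bookkeeping added to AAHeapToShared above enforces a cumulative budget rather than a per-allocation one: once the running total would exceed -openmp-opt-shared-limit, further promotions are skipped. A standalone sketch with invented sizes:

    #include <cstdio>

    int main() {
      const unsigned SharedMemoryLimit = 1024; // bytes, from the new cl::opt
      unsigned SharedMemoryUsed = 0;
      for (unsigned AllocSize : {256u, 512u, 512u, 128u}) {
        if (AllocSize + SharedMemoryUsed > SharedMemoryLimit) {
          std::printf("skip %u-byte alloc: over the %u-byte limit\n",
                      AllocSize, SharedMemoryLimit);
          continue;
        }
        SharedMemoryUsed += AllocSize; // mirrors the patched manifest() logic
        std::printf("moved %u bytes to shared memory (total %u)\n", AllocSize,
                    SharedMemoryUsed);
      }
    }
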
- ReachingKernelEntries.insert(Fn); - IsKernelEntry = true; OMPInformationCache::RuntimeFunctionInfo &InitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; @@ -3176,10 +3200,12 @@ struct AAKernelInfoFunction : AAKernelInfo { Fn); // Ignore kernels without initializers such as global constructors. - if (!KernelInitCB || !KernelDeinitCB) { - indicateOptimisticFixpoint(); + if (!KernelInitCB || !KernelDeinitCB) return; - } + + // Add itself to the reaching kernel and set IsKernelEntry. + ReachingKernelEntries.insert(Fn); + IsKernelEntry = true; // For kernels we might need to initialize/finalize the IsSPMD state and // we need to register a simplification callback so that the Attributor @@ -3345,8 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo { return false; } - // Check if the kernel is already in SPMD mode, if so, return success. + // Get the actual kernel, could be the caller of the anchor scope if we have + // a debug wrapper. Function *Kernel = getAnchorScope(); + if (Kernel->hasLocalLinkage()) { + assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper."); + auto *CB = cast(Kernel->user_back()); + Kernel = CB->getCaller(); + } + assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); + + // Check if the kernel is already in SPMD mode, if so, return success. GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( (Kernel->getName() + "_exec_mode").str()); assert(ExecMode && "Kernel without exec mode?"); @@ -3711,9 +3746,9 @@ struct AAKernelInfoFunction : AAKernelInfo { // __kmpc_get_hardware_num_threads_in_block(); // WarpSize = __kmpc_get_warp_size(); // BlockSize = BlockHwSize - WarpSize; - // if (InitCB >= BlockSize) return; - // IsWorkerCheckBB: bool IsWorker = InitCB >= 0; + // IsWorkerCheckBB: bool IsWorker = InitCB != -1; // if (IsWorker) { + // if (InitCB >= BlockSize) return; // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; // bool Active = __kmpc_kernel_parallel(&WorkFn); @@ -3770,6 +3805,13 @@ struct AAKernelInfoFunction : AAKernelInfo { ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); InitBB->getTerminator()->eraseFromParent(); + Instruction *IsWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), -1), + "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); + Module &M = *Kernel->getParent(); auto &OMPInfoCache = static_cast(A.getInfoCache()); FunctionCallee BlockHwSizeFn = @@ -3779,29 +3821,22 @@ struct AAKernelInfoFunction : AAKernelInfo { OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_warp_size); CallInst *BlockHwSize = - CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize); BlockHwSize->setDebugLoc(DLoc); - CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + CallInst *WarpSize = + CallInst::Create(WarpSizeFn, "warp.size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize); WarpSize->setDebugLoc(DLoc); - Instruction *BlockSize = - BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + Instruction *BlockSize = BinaryOperator::CreateSub( + BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB); BlockSize->setDebugLoc(DLoc); - Instruction *IsMainOrWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, 
KernelInitCB, - BlockSize, "thread.is_main_or_worker", InitBB); + Instruction *IsMainOrWorker = ICmpInst::Create( + ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize, + "thread.is_main_or_worker", IsWorkerCheckBB); IsMainOrWorker->setDebugLoc(DLoc); - BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker, - InitBB); - - Instruction *IsWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", IsWorkerCheckBB); - IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, - IsWorkerCheckBB); + BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB, + IsMainOrWorker, IsWorkerCheckBB); // Create local storage for the work function pointer. const DataLayout &DL = M.getDataLayout(); @@ -4241,10 +4276,10 @@ struct AAKernelInfoCallSite : AAKernelInfo { unsigned ScheduleTypeVal = ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; switch (OMPScheduleType(ScheduleTypeVal)) { - case OMPScheduleType::Static: - case OMPScheduleType::StaticChunked: - case OMPScheduleType::Distribute: - case OMPScheduleType::DistributeChunked: + case OMPScheduleType::UnorderedStatic: + case OMPScheduleType::UnorderedStaticChunked: + case OMPScheduleType::OrderedDistribute: + case OMPScheduleType::OrderedDistributeChunked: break; default: SPMDCompatibilityTracker.indicatePessimisticFixpoint(); @@ -4390,7 +4425,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { std::string Str("simplified value: "); - if (!SimplifiedValue.hasValue()) + if (!SimplifiedValue) return Str + std::string("none"); if (!SimplifiedValue.getValue()) @@ -4420,8 +4455,8 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { IRPosition::callsite_returned(CB), [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional { - assert((isValidState() || (SimplifiedValue.hasValue() && - SimplifiedValue.getValue() == nullptr)) && + assert((isValidState() || + (SimplifiedValue && SimplifiedValue.getValue() == nullptr)) && "Unexpected invalid state!"); if (!isAtFixpoint()) { @@ -4461,9 +4496,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { ChangeStatus manifest(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) { + if (SimplifiedValue && *SimplifiedValue) { Instruction &I = *getCtxI(); - A.changeValueAfterManifest(I, **SimplifiedValue); + A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue); A.deleteAfterManifest(I); CallBase *CB = dyn_cast(&I); @@ -4549,7 +4584,7 @@ private: // We have empty reaching kernels, therefore we cannot tell if the // associated call site can be folded. At this moment, SimplifiedValue // must be none. - assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none"); + assert(!SimplifiedValue && "SimplifiedValue should be none"); } return SimplifiedValue == SimplifiedValueBefore ? 
ChangeStatus::UNCHANGED @@ -4592,7 +4627,7 @@ private: return indicatePessimisticFixpoint(); if (CallerKernelInfoAA.ReachingKernelEntries.empty()) { - assert(!SimplifiedValue.hasValue() && + assert(!SimplifiedValue && "SimplifiedValue should keep none at this point"); return ChangeStatus::UNCHANGED; } @@ -4700,18 +4735,23 @@ void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { void OpenMPOpt::registerAAs(bool IsModulePass) { if (SCC.empty()) - return; + if (IsModulePass) { // Ensure we create the AAKernelInfo AAs first and without triggering an // update. This will make sure we register all value simplification // callbacks before any other AA has the chance to create an AAValueSimplify // or similar. - for (Function *Kernel : OMPInfoCache.Kernels) + auto CreateKernelInfoCB = [&](Use &, Function &Kernel) { A.getOrCreateAAFor( - IRPosition::function(*Kernel), /* QueryingAA */ nullptr, + IRPosition::function(Kernel), /* QueryingAA */ nullptr, DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); + return false; + }; + OMPInformationCache::RuntimeFunctionInfo &InitRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + InitRFI.foreachUse(SCC, CreateKernelInfoCB); registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); @@ -4899,6 +4939,9 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M).getManager(); KernelSet Kernels = getDeviceKernels(M); + if (PrintModuleBeforeOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M); + auto IsCalled = [&](Function &F) { if (Kernels.contains(&F)) return true; @@ -4958,8 +5001,15 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); @@ -5001,6 +5051,9 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, Module &M = *C.begin()->getFunction().getParent(); + if (PrintModuleBeforeOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M); + KernelSet Kernels = getDeviceKernels(M); FunctionAnalysisManager &FAM = @@ -5022,8 +5075,16 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.IsModulePass = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(false); @@ -5093,8 +5154,16 @@ struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 
SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.IsModulePass = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Result = OMPOpt.run(false); diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 5f2223e4047e..54c72bdbb203 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -14,7 +14,6 @@ #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -40,6 +39,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -55,8 +55,6 @@ #include #include #include -#include -#include #include #include #include @@ -99,7 +97,7 @@ static cl::opt // This is an option used by testing: static cl::opt SkipCostAnalysis("skip-partial-inlining-cost-analysis", - cl::init(false), cl::ZeroOrMore, + cl::ReallyHidden, cl::desc("Skip Cost Analysis")); // Used to determine if a cold region is worth outlining based on @@ -129,7 +127,7 @@ static cl::opt MaxNumInlineBlocks( // Command line option to set the maximum number of partial inlining allowed // for the module. The default value of -1 means no limit. static cl::opt MaxNumPartialInlining( - "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, + "max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited")); // Used only when PGO or user annotated branch data is absent. It is @@ -137,7 +135,7 @@ static cl::opt MaxNumPartialInlining( // produces larger value, the BFI value will be used. 
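
The AttributorConfig rewrite above replaces a long positional constructor with named fields, which is why the three call sites in this file read almost identically. A sketch of packaging the configuration used for the CGSCC runs; the helper name is invented, and OptimizationRemarkGetter is assumed to be the callback typedef Attributor.h declares:

    #include "llvm/Transforms/IPO/Attributor.h"
    using namespace llvm;

    // Hypothetical helper assembling the field values this diff sets for
    // OpenMPOpt's CGSCC flavor.
    static AttributorConfig
    makeOpenMPOptConfig(CallGraphUpdater &CGUpdater,
                        OptimizationRemarkGetter OREGetter,
                        unsigned MaxFixpointIterations) {
      AttributorConfig AC(CGUpdater);
      AC.DefaultInitializeLiveInternals = false;
      AC.IsModulePass = false; // CGSCC, not module, pass
      AC.RewriteSignatures = false;
      AC.MaxFixpointIterations = MaxFixpointIterations;
      AC.OREGetter = OREGetter;
      AC.PassName = "openmp-opt";
      return AC;
    }

Named fields make it obvious which knobs differ between the module and CGSCC instantiations, something the old nine-argument constructor hid.
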
static cl::opt
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
-                             cl::Hidden, cl::ZeroOrMore,
+                             cl::Hidden,
                              cl::desc("Relative frequency of outline region to "
                                       "the entry block"));
 
@@ -169,7 +167,7 @@ struct FunctionOutliningInfo {
 };
 
 struct FunctionOutliningMultiRegionInfo {
-  FunctionOutliningMultiRegionInfo() {}
+  FunctionOutliningMultiRegionInfo() = default;
 
   // Container for outline regions
   struct OutlineRegionInfo {
@@ -440,7 +438,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(
   };
 
   auto BBProfileCount = [BFI](BasicBlock *BB) {
-    return BFI->getBlockProfileCount(BB).getValueOr(0);
+    return BFI->getBlockProfileCount(BB).value_or(0);
   };
 
   // Use the same computeBBInlineCost function to compute the cost savings of
@@ -741,7 +739,7 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
 
-  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
+  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
     return OutlineRegionRelFreq;
 
   // When profile data is not available, we need to be conservative in
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 74f68531b89a..ae787be40c55 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -15,19 +15,13 @@
 #include "llvm-c/Transforms/PassManagerBuilder.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CFLAndersAliasAnalysis.h"
 #include "llvm/Analysis/CFLSteensAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Target/CGPassBuilderOption.h"
@@ -41,22 +35,16 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
 #include "llvm/Transforms/Scalar/LICM.h"
 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
-#include "llvm/Transforms/Scalar/SCCP.h"
 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/Transforms/Vectorize/VectorCombine.h"
 
 using namespace llvm;
 
 namespace llvm {
-cl::opt<bool> RunPartialInlining("enable-partial-inlining", cl::init(false),
-                                 cl::Hidden, cl::ZeroOrMore,
+cl::opt<bool> RunPartialInlining("enable-partial-inlining", cl::Hidden,
                                  cl::desc("Run Partial inlining pass"));
 
 static cl::opt<bool>
@@ -111,8 +99,8 @@ static cl::opt<bool>
     EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
                          cl::desc("Enable performing ThinLTO."));
 
-cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
-                                 cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
+cl::opt<bool> EnableHotColdSplit("hot-cold-split",
+                                 cl::desc("Enable hot-cold splitting pass"));
 
 cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false),
cl::Hidden, cl::desc("Enable ir outliner pass")); @@ -126,12 +114,12 @@ cl::opt cl::desc("Disable pre-instrumentation inliner")); cl::opt PreInlineThreshold( - "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore, + "preinline-threshold", cl::Hidden, cl::init(75), cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); cl::opt - EnableGVNHoist("enable-gvn-hoist", cl::init(false), cl::ZeroOrMore, + EnableGVNHoist("enable-gvn-hoist", cl::desc("Enable the GVN hoisting pass (default = off)")); static cl::opt @@ -139,13 +127,8 @@ static cl::opt cl::Hidden, cl::desc("Disable shrink-wrap library calls")); -static cl::opt EnableSimpleLoopUnswitch( - "enable-simple-loop-unswitch", cl::init(false), cl::Hidden, - cl::desc("Enable the simple loop unswitch pass. Also enables independent " - "cleanup passes integrated into the loop pass manager pipeline.")); - cl::opt - EnableGVNSink("enable-gvn-sink", cl::init(false), cl::ZeroOrMore, + EnableGVNSink("enable-gvn-sink", cl::desc("Enable the GVN sinking pass (default = off)")); // This option is used in simplifying testing SampleFDO optimizations for @@ -336,59 +319,6 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createEarlyCSEPass()); } -// Do PGO instrumentation generation or use pass as the option specified. -void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM, - bool IsCS = false) { - if (IsCS) { - if (!EnablePGOCSInstrGen && !EnablePGOCSInstrUse) - return; - } else if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty()) - return; - - // Perform the preinline and cleanup passes for O1 and above. - // We will not do this inline for context sensitive PGO (when IsCS is true). - if (OptLevel > 0 && !DisablePreInliner && PGOSampleUse.empty() && !IsCS) { - // Create preinline pass. We construct an InlineParams object and specify - // the threshold here to avoid the command line options of the regular - // inliner to influence pre-inlining. The only fields of InlineParams we - // care about are DefaultThreshold and HintThreshold. - InlineParams IP; - IP.DefaultThreshold = PreInlineThreshold; - // FIXME: The hint threshold has the same value used by the regular inliner - // when not optimzing for size. This should probably be lowered after - // performance testing. - // Use PreInlineThreshold for both -Os and -Oz. Not running preinliner makes - // the instrumented binary unusably large. Even if PreInlineThreshold is not - // correct thresold for -Oz, it is better than not running preinliner. - IP.HintThreshold = SizeLevel > 0 ? PreInlineThreshold : 325; - - MPM.add(createFunctionInliningPass(IP)); - MPM.add(createSROAPass()); - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs - MPM.add(createInstructionCombiningPass()); // Combine silly seq's - addExtensionsToPM(EP_Peephole, MPM); - } - if ((EnablePGOInstrGen && !IsCS) || (EnablePGOCSInstrGen && IsCS)) { - MPM.add(createPGOInstrumentationGenLegacyPass(IsCS)); - // Add the profile lowering pass. - InstrProfOptions Options; - if (!PGOInstrGen.empty()) - Options.InstrProfileOutput = PGOInstrGen; - Options.DoCounterPromotion = true; - Options.UseBFIInPromotion = IsCS; - MPM.add(createLoopRotatePass()); - MPM.add(createInstrProfilingLegacyPass(Options, IsCS)); - } - if (!PGOInstrUse.empty()) - MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse, IsCS)); - // Indirect call promotion that promotes intra-module targets only. 
- // For ThinLTO this is done earlier due to interactions with globalopt - // for imported functions. We don't run this at -O0. - if (OptLevel > 0 && !IsCS) - MPM.add( - createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty())); -} void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. @@ -404,7 +334,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createGVNHoistPass()); if (EnableGVNSink) { MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); } } @@ -418,7 +349,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createJumpThreadingPass()); // Thread jumps. MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals } - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) MPM.add(createAggressiveInstCombinerPass()); @@ -427,14 +360,12 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); - // Optimize memory intrinsic calls based on the profiled size information. - if (SizeLevel == 0) - MPM.add(createPGOMemOPSizeOptLegacyPass()); - // TODO: Investigate the cost/benefit of tail call elimination on debugging. if (OptLevel > 1) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // The matrix extension can introduce large vector operations early, which can @@ -443,29 +374,32 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createVectorCombinePass()); // Begin the loop pass pipeline. - if (EnableSimpleLoopUnswitch) { - // The simple loop unswitch pass relies on separate cleanup passes. Schedule - // them first so when we re-process a loop they run before other loop - // passes. - MPM.add(createLoopInstSimplifyPass()); - MPM.add(createLoopSimplifyCFGPass()); - } + + // The simple loop unswitch pass relies on separate cleanup passes. Schedule + // them first so when we re-process a loop they run before other loop + // passes. + MPM.add(createLoopInstSimplifyPass()); + MPM.add(createLoopSimplifyCFGPass()); + // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO)); // TODO: Investigate promotion cap for O1. 
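
The reshaped LICM scheduling above runs hoisting twice: with speculation off before loop rotation, so rotation-relevant metadata survives, and with speculation on afterwards (the second, speculation-enabled run follows just below). Summarized in isolation as a hedged sketch; the helper name and the caps being plain unsigneds are assumptions:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    // Hypothetical helper showing the ordering this patch establishes.
    static void addRotatedLICM(legacy::PassManagerBase &MPM, unsigned OptCap,
                               unsigned PromotionCap) {
      MPM.add(createLICMPass(OptCap, PromotionCap,
                             /*AllowSpeculation=*/false)); // pre-rotation
      MPM.add(createLoopRotatePass());                     // duplicate header
      MPM.add(createLICMPass(OptCap, PromotionCap,
                             /*AllowSpeculation=*/true));  // post-rotation
    }
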
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - if (EnableSimpleLoopUnswitch) - MPM.add(createSimpleLoopUnswitchLegacyPass()); - else - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + MPM.add(createSimpleLoopUnswitchLegacyPass(OptLevel == 3)); // FIXME: We break the loop pass pipeline here in order to do full // simplifycfg. Eventually loop-simplifycfg should be enhanced to replace the // need for this. - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); MPM.add(createInstructionCombiningPass()); // We resume loop passes creating a second loop pipeline here. if (EnableLoopFlatten) { @@ -521,7 +455,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // TODO: Investigate if this is too expensive at O1. if (OptLevel > 1) { MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -580,9 +515,11 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, PM.add(createEarlyCSEPass()); PM.add(createCorrelatedValuePropagationPass()); PM.add(createInstructionCombiningPass()); - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - PM.add(createCFGSimplificationPass()); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + PM.add(createSimpleLoopUnswitchLegacyPass()); + PM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); PM.add(createInstructionCombiningPass()); } @@ -597,6 +534,7 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // before SLP vectorization. PM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -641,7 +579,8 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the // checked value is loop invariant. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } PM.add(createWarnMissedTransformationsPass()); @@ -657,10 +596,6 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { - // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link - // is handled separately, so just check this is not the ThinLTO post-link. - bool DefaultOrPreLinkPipeline = !PerformThinLTO; - MPM.add(createAnnotation2MetadataLegacyPass()); if (!PGOSampleUse.empty()) { @@ -678,7 +613,6 @@ void PassManagerBuilder::populateModulePassManager( // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. 
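
Every createCFGSimplificationPass call site in this file now spells out its options through the SimplifyCFGOptions fluent builder rather than relying on defaults. A sketch reproducing the most elaborate option set used above, the pre-SLP cleanup; the helper name is invented:

    #include "llvm/Transforms/Scalar.h"
    #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
    using namespace llvm;

    static FunctionPass *makeAggressiveSimplifyCFG() {
      // Same chained setters the patch uses before SLP vectorization.
      return createCFGSimplificationPass(SimplifyCFGOptions()
                                             .forwardSwitchCondToPhi(true)
                                             .convertSwitchRangeToICmp(true)
                                             .convertSwitchToLookupTable(true)
                                             .needCanonicalLoops(false)
                                             .hoistCommonInsts(true)
                                             .sinkCommonInsts(true));
    }

Each setter returns the options object itself, which is what lets a pass pipeline state its intent inline instead of through a separately named configuration.
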
if (OptLevel == 0) { - addPGOInstrPasses(MPM); if (Inliner) { MPM.add(Inliner); Inliner = nullptr; @@ -732,8 +666,6 @@ void PassManagerBuilder::populateModulePassManager( // earlier in the pass pipeline, here before globalopt. Otherwise imported // available_externally functions look unreferenced and are removed. if (PerformThinLTO) { - MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true, - !PGOSampleUse.empty())); MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true)); } @@ -772,20 +704,9 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE - - // For SamplePGO in ThinLTO compile phase, we do not want to do indirect - // call promotion as it will change the CFG too much to make the 2nd - // profile annotation in backend more difficult. - // PGO instrumentation is added during the compile phase for ThinLTO, do - // not run it a second time - if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile) - addPGOInstrPasses(MPM); - - // Create profile COMDAT variables. Lld linker wants to see all variables - // before the LTO/ThinLTO link since it needs to resolve symbols/comdats. - if (!PerformThinLTO && EnablePGOCSInstrGen) - MPM.add(createPGOInstrumentationGenCreateVarLegacyPass(PGOInstrGen)); + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Clean up after IPCP & DAE // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive @@ -811,8 +732,6 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createOpenMPOptCGSCCLegacyPass()); MPM.add(createPostOrderFunctionAttrsLegacyPass()); - if (OptLevel > 2) - MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args addExtensionsToPM(EP_CGSCCOptimizerLate, MPM); addFunctionSimplificationPasses(MPM); @@ -837,14 +756,6 @@ void PassManagerBuilder::populateModulePassManager( // and saves running remaining passes on the eliminated functions. MPM.add(createEliminateAvailableExternallyPass()); - // CSFDO instrumentation and use pass. Don't invoke this for Prepare pass - // for LTO and ThinLTO -- The actual pass will be called after all inlines - // are performed. - // Need to do this after COMDAT variables have been eliminated, - // (i.e. after EliminateAvailableExternallyPass). - if (!(PrepareForLTO || PrepareForThinLTO)) - addPGOInstrPasses(MPM, /* IsCS */ true); - if (EnableOrderFileInstrumentation) MPM.add(createInstrOrderFilePass()); @@ -886,7 +797,8 @@ void PassManagerBuilder::populateModulePassManager( // later might get benefit of no-alias assumption in clone loop. if (UseLoopVersioningLICM) { MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } // We add a fresh GlobalsModRef run at this point. This is particularly @@ -972,7 +884,8 @@ void PassManagerBuilder::populateModulePassManager( // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 
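For orientation while reading these hunks: populateModulePassManager() is the entry point a legacy-pass-manager driver invokes after configuring the builder. A minimal sketch of that usage (driver code assumed for illustration, not part of this patch; the new pass manager is the supported path upstream):

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  using namespace llvm;

  // Configure the builder, populate a legacy module pass manager, run it.
  void runDefaultPipeline(Module &M) {
    PassManagerBuilder Builder;
    Builder.OptLevel = 2;  // corresponds to -O2
    Builder.SizeLevel = 0; // not -Os/-Oz
    legacy::PassManager MPM;
    Builder.populateModulePassManager(MPM);
    MPM.run(M);
  }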
- MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); addExtensionsToPM(EP_OptimizerLast, MPM); @@ -1009,13 +922,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Split call-site with more constrained arguments. PM.add(createCallSiteSplittingPass()); - // Indirect call promotion. This should promote all the targets that are - // left by the earlier promotion pass that promotes intra-module targets. - // This two-step promotion is to save the compile time. For LTO, it should - // produce the same result as if we only do promotion here. - PM.add( - createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); - // Propage constant function arguments by specializing the functions. if (EnableFunctionSpecialization && OptLevel > 2) PM.add(createFunctionSpecializationPass()); @@ -1081,9 +987,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createPruneEHPass()); // Remove dead EH info. - // CSFDO instrumentation and use pass. - addPGOInstrPasses(PM, /* IsCS */ true); - // Infer attributes on declarations, call sites, arguments, etc. for an SCC. if (AttributorRun & AttributorRunOption::CGSCC) PM.add(createAttributorCGSCCLegacyPass()); @@ -1098,14 +1001,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createGlobalOptimizerPass()); PM.add(createGlobalDCEPass()); // Remove dead functions. - // If we didn't decide to inline a function, check to see if we can - // transform it to pass arguments by value instead of by reference. - PM.add(createArgumentPromotionPass()); - // The IPO passes may leave cruft around. Clean up after them. PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); + PM.add(createJumpThreadingPass()); // Break up allocas PM.add(createSROAPass()); @@ -1120,7 +1019,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. @@ -1149,7 +1049,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); + PM.add(createJumpThreadingPass()); } void PassManagerBuilder::addLateLTOOptimizationPasses( @@ -1175,80 +1075,6 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( PM.add(createMergeFunctionsPass()); } -void PassManagerBuilder::populateThinLTOPassManager( - legacy::PassManagerBase &PM) { - PerformThinLTO = true; - if (LibraryInfo) - PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); - - if (VerifyInput) - PM.add(createVerifierPass()); - - if (ImportSummary) { - // This pass imports type identifier resolutions for whole-program - // devirtualization and CFI. It must run early because other passes may - // disturb the specific instruction patterns that these passes look for, - // creating dependencies on resolutions that may not appear in the summary. 
- // - // For example, GVN may transform the pattern assume(type.test) appearing in - // two basic blocks into assume(phi(type.test, type.test)), which would - // transform a dependency on a WPD resolution into a dependency on a type - // identifier resolution for CFI. - // - // Also, WPD has access to more precise information than ICP and can - // devirtualize more effectively, so it should operate on the IR first. - PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary)); - PM.add(createLowerTypeTestsPass(nullptr, ImportSummary)); - } - - populateModulePassManager(PM); - - if (VerifyOutput) - PM.add(createVerifierPass()); - PerformThinLTO = false; -} - -void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { - if (LibraryInfo) - PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); - - if (VerifyInput) - PM.add(createVerifierPass()); - - addExtensionsToPM(EP_FullLinkTimeOptimizationEarly, PM); - - if (OptLevel != 0) - addLTOOptimizationPasses(PM); - else { - // The whole-program-devirt pass needs to run at -O0 because only it knows - // about the llvm.type.checked.load intrinsic: it needs to both lower the - // intrinsic itself and handle it in the summary. - PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); - } - - // Create a function that performs CFI checks for cross-DSO calls with targets - // in the current module. - PM.add(createCrossDSOCFIPass()); - - // Lower type metadata and the type.test intrinsic. This pass supports Clang's - // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at - // link time if CFI is enabled. The pass does nothing if CFI is disabled. - PM.add(createLowerTypeTestsPass(ExportSummary, nullptr)); - // Run a second time to clean up any type tests left behind by WPD for use - // in ICP (which is performed earlier than this in the regular LTO pipeline). - PM.add(createLowerTypeTestsPass(nullptr, nullptr, true)); - - if (OptLevel != 0) - addLateLTOOptimizationPasses(PM); - - addExtensionsToPM(EP_FullLinkTimeOptimizationLast, PM); - - PM.add(createAnnotationRemarksLegacyPass()); - - if (VerifyOutput) - PM.add(createVerifierPass()); -} - LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() { PassManagerBuilder *PMB = new PassManagerBuilder(); return wrap(PMB); @@ -1314,18 +1140,3 @@ LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, legacy::PassManagerBase *MPM = unwrap(PM); Builder->populateModulePassManager(*MPM); } - -void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, - LLVMPassManagerRef PM, - LLVMBool Internalize, - LLVMBool RunInliner) { - PassManagerBuilder *Builder = unwrap(PMB); - legacy::PassManagerBase *LPM = unwrap(PM); - - // A small backwards compatibility hack. populateLTOPassManager used to take - // an RunInliner option. 
- if (RunInliner && !Builder->Inliner) - Builder->Inliner = createFunctionInliningPass(); - - Builder->populateLTOPassManager(*LPM); -} diff --git a/llvm/lib/Transforms/IPO/PruneEH.cpp b/llvm/lib/Transforms/IPO/PruneEH.cpp index 39de19ca9e9d..e0836a9fd699 100644 --- a/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" @@ -24,9 +23,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/Local.h" @@ -246,7 +243,7 @@ static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU) { } if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); } if (TokenInst) { diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index 5779553ee732..26fb7d676429 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -18,6 +18,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar/SCCP.h" +#include "llvm/Transforms/Utils/SCCPSolver.h" using namespace llvm; diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 7334bf695b67..6859953de962 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -14,7 +14,8 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Instructions.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/ProfileData/SampleProf.h" #include #include @@ -62,23 +63,24 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { return ChildNodeRet; } -ContextTrieNode &ContextTrieNode::moveToChildContext( - const LineLocation &CallSite, ContextTrieNode &&NodeToMove, - uint32_t ContextFramesToRemove, bool DeleteNode) { +ContextTrieNode & +SampleContextTracker::moveContextSamples(ContextTrieNode &ToNodeParent, + const LineLocation &CallSite, + ContextTrieNode &&NodeToMove) { uint64_t Hash = FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite); + std::map &AllChildContext = + ToNodeParent.getAllChildContext(); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); - LineLocation OldCallSite = NodeToMove.CallSiteLoc; - ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); AllChildContext[Hash] = NodeToMove; ContextTrieNode &NewNode = AllChildContext[Hash]; - NewNode.CallSiteLoc = CallSite; + NewNode.setCallSiteLoc(CallSite); // Walk through nodes in the moved subtree, and update // FunctionSamples' context as part of the context promotion. // We also need to set the new parent link for all children.
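That walk is a plain breadth-first traversal with parent-link fixup; in isolation, over a deliberately simplified node type, it looks like the sketch below (the real code follows immediately after):

  #include <cstdint>
  #include <map>
  #include <queue>

  struct TrieNodeSketch {
    TrieNodeSketch *Parent = nullptr;
    std::map<uint64_t, TrieNodeSketch> Children; // keyed by call-site hash
  };

  // After a node is copied under a new parent, relink every node in the
  // moved subtree to its (possibly new) parent, breadth-first.
  inline void relinkSubtree(TrieNodeSketch &Moved, TrieNodeSketch &NewParent) {
    Moved.Parent = &NewParent;
    std::queue<TrieNodeSketch *> Work;
    Work.push(&Moved);
    while (!Work.empty()) {
      TrieNodeSketch *Cur = Work.front();
      Work.pop();
      for (auto &It : Cur->Children) {
        It.second.Parent = Cur;
        Work.push(&It.second);
      }
    }
  }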
std::queue NodeToUpdate; - NewNode.setParentContext(this); + NewNode.setParentContext(&ToNodeParent); NodeToUpdate.push(&NewNode); while (!NodeToUpdate.empty()) { @@ -87,10 +89,8 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( FunctionSamples *FSamples = Node->getFunctionSamples(); if (FSamples) { - FSamples->getContext().promoteOnPath(ContextFramesToRemove); + setContextNode(FSamples, Node); FSamples->getContext().setState(SyntheticContext); - LLVM_DEBUG(dbgs() << " Context promoted to: " - << FSamples->getContext().toString() << "\n"); } for (auto &It : Node->getAllChildContext()) { @@ -100,10 +100,6 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( } } - // Original context no longer needed, destroy if requested. - if (DeleteNode) - OldParentContext.removeChildContext(OldCallSite, NewNode.getFuncName()); - return NewNode; } @@ -131,7 +127,7 @@ void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) { Optional ContextTrieNode::getFunctionSize() const { return FuncSize; } void ContextTrieNode::addFunctionSize(uint32_t FSize) { - if (!FuncSize.hasValue()) + if (!FuncSize) FuncSize = 0; FuncSize = FuncSize.getValue() + FSize; @@ -147,6 +143,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) { ParentContext = Parent; } +void ContextTrieNode::setCallSiteLoc(const LineLocation &Loc) { + CallSiteLoc = Loc; +} + void ContextTrieNode::dumpNode() { dbgs() << "Node: " << FuncName << "\n" << " Callsite: " << CallSiteLoc << "\n" @@ -202,13 +202,23 @@ SampleContextTracker::SampleContextTracker( SampleContext Context = FuncSample.first; LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context.toString() << "\n"); - if (!Context.isBaseContext()) - FuncToCtxtProfiles[Context.getName()].insert(FSamples); ContextTrieNode *NewNode = getOrCreateContextPath(Context, true); assert(!NewNode->getFunctionSamples() && "New node can't have sample profile"); NewNode->setFunctionSamples(FSamples); } + populateFuncToCtxtMap(); +} + +void SampleContextTracker::populateFuncToCtxtMap() { + for (auto *Node : *this) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + FSamples->getContext().setState(RawContext); + setContextNode(FSamples, Node); + FuncToCtxtProfiles[Node->getFuncName()].push_back(FSamples); + } + } } FunctionSamples * @@ -231,7 +241,7 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); LLVM_DEBUG(if (FSamples) { - dbgs() << " Callee context found: " << FSamples->getContext().toString() + dbgs() << " Callee context found: " << getContextString(CalleeContext) << "\n"; }); return FSamples; @@ -333,7 +343,7 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, if (Context.hasState(InlinedContext) || Context.hasState(MergedContext)) continue; - ContextTrieNode *FromNode = getContextFor(Context); + ContextTrieNode *FromNode = getContextNodeForProfile(CSamples); if (FromNode == Node) continue; @@ -354,7 +364,7 @@ void SampleContextTracker::markContextSamplesInlined( const FunctionSamples *InlinedSamples) { assert(InlinedSamples && "Expect non-null inlined samples"); LLVM_DEBUG(dbgs() << "Marking context profile as inlined: " - << InlinedSamples->getContext().toString() << "\n"); + << getContextString(*InlinedSamples) << "\n"); InlinedSamples->getContext().setState(InlinedContext); } @@ -405,17 +415,43 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( // the context profile in the 
base (context-less) profile. FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples(); assert(FromSamples && "Shouldn't promote a context without profile"); + (void)FromSamples; // Unused in release build. + LLVM_DEBUG(dbgs() << " Found context tree root to promote: " - << FromSamples->getContext().toString() << "\n"); + << getContextString(&NodeToPromo) << "\n"); assert(!FromSamples->getContext().hasState(InlinedContext) && "Shouldn't promote inlined context profile"); - uint32_t ContextFramesToRemove = - FromSamples->getContext().getContextFrames().size() - 1; - return promoteMergeContextSamplesTree(NodeToPromo, RootContext, - ContextFramesToRemove); + return promoteMergeContextSamplesTree(NodeToPromo, RootContext); +} + +#ifndef NDEBUG +std::string +SampleContextTracker::getContextString(const FunctionSamples &FSamples) const { + return getContextString(getContextNodeForProfile(&FSamples)); } +std::string +SampleContextTracker::getContextString(ContextTrieNode *Node) const { + SampleContextFrameVector Res; + if (Node == &RootContext) + return std::string(); + Res.emplace_back(Node->getFuncName(), LineLocation(0, 0)); + + ContextTrieNode *PreNode = Node; + Node = Node->getParentContext(); + while (Node && Node != &RootContext) { + Res.emplace_back(Node->getFuncName(), PreNode->getCallSiteLoc()); + PreNode = Node; + Node = Node->getParentContext(); + } + + std::reverse(Res.begin(), Res.end()); + + return SampleContext::getContextString(Res); +} +#endif + void SampleContextTracker::dump() { RootContext.dumpTree(); } StringRef SampleContextTracker::getFuncNameFor(ContextTrieNode *Node) const { @@ -526,8 +562,7 @@ ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) { } void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, - ContextTrieNode &ToNode, - uint32_t ContextFramesToRemove) { + ContextTrieNode &ToNode) { FunctionSamples *FromSamples = FromNode.getFunctionSamples(); FunctionSamples *ToSamples = ToNode.getFunctionSamples(); if (FromSamples && ToSamples) { @@ -540,16 +575,13 @@ void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, } else if (FromSamples) { // Transfer FromSamples from FromNode to ToNode ToNode.setFunctionSamples(FromSamples); + setContextNode(FromSamples, &ToNode); FromSamples->getContext().setState(SyntheticContext); - FromSamples->getContext().promoteOnPath(ContextFramesToRemove); - FromNode.setFunctionSamples(nullptr); } } ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( - ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent, - uint32_t ContextFramesToRemove) { - assert(ContextFramesToRemove && "Context to remove can't be empty"); + ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent) { // Ignore call site location if destination is top level under root LineLocation NewCallSiteLoc = LineLocation(0, 0); @@ -566,22 +598,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( if (!ToNode) { // Do not delete node to move from its parent here because // caller is iterating over children of that parent node. 
- ToNode = &ToNodeParent.moveToChildContext( - NewCallSiteLoc, std::move(FromNode), ContextFramesToRemove, false); + ToNode = + &moveContextSamples(ToNodeParent, NewCallSiteLoc, std::move(FromNode)); + LLVM_DEBUG({ + dbgs() << " Context promoted and merged to: " << getContextString(ToNode) + << "\n"; + }); } else { // Destination node exists, merge samples for the context tree - mergeContextNode(FromNode, *ToNode, ContextFramesToRemove); + mergeContextNode(FromNode, *ToNode); LLVM_DEBUG({ if (ToNode->getFunctionSamples()) dbgs() << " Context promoted and merged to: " - << ToNode->getFunctionSamples()->getContext().toString() << "\n"; + << getContextString(ToNode) << "\n"; }); // Recursively promote and merge children for (auto &It : FromNode.getAllChildContext()) { ContextTrieNode &FromChildNode = It.second; - promoteMergeContextSamplesTree(FromChildNode, *ToNode, - ContextFramesToRemove); + promoteMergeContextSamplesTree(FromChildNode, *ToNode); } // Remove children once they're all merged @@ -594,4 +629,14 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( return *ToNode; } + +void SampleContextTracker::createContextLessProfileMap( + SampleProfileMap &ContextLessProfiles) { + for (auto *Node : *this) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + // Profile's context can be empty, use ContextNode's func name. + if (FProfile) + ContextLessProfiles[Node->getFuncName()].merge(*FProfile); + } +} } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index bc6051de90c4..40de69bbf2cf 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -25,11 +25,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" @@ -38,22 +35,16 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ReplayInlineAdvisor.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstrTypes.h" @@ -64,6 +55,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -73,9 +65,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include 
"llvm/Transforms/IPO/ProfiledCallGraph.h" @@ -84,7 +74,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" #include @@ -151,8 +140,7 @@ static cl::opt ProfileSampleBlockAccurate( "them conservatively as unknown. ")); static cl::opt ProfileAccurateForSymsInList( - "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, - cl::init(true), + "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. ")); @@ -183,6 +171,15 @@ static cl::opt ProfileSizeInline( cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size.")); +// Since profiles are consumed by many passes, turning on this option has +// side effects. For instance, pre-link SCC inliner would see merged profiles +// and inline the hot functions (that are skipped in this pass). +static cl::opt DisableSampleLoaderInlining( + "disable-sample-loader-inlining", cl::Hidden, cl::init(false), + cl::desc("If true, artifically skip inline transformation in sample-loader " + "pass, and merge (or scale) profiles (as configured by " + "--sample-profile-merge-inlinee).")); + cl::opt ProfileInlineGrowthLimit( "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), cl::desc("The size growth ratio limit for proirity-based sample profile " @@ -219,19 +216,19 @@ static cl::opt ProfileICPRelativeHotnessSkip( "Skip relative hotness check for ICP up to given number of targets.")); static cl::opt CallsitePrioritizedInline( - "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore, - cl::init(false), + "sample-profile-prioritized-inline", cl::Hidden, + cl::desc("Use call site prioritized inlining for sample profile loader." 
"Currently only CSSPGO is supported.")); static cl::opt UsePreInlinerDecision( - "sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore, - cl::init(false), + "sample-profile-use-preinliner", cl::Hidden, + cl::desc("Use the preinliner decisions stored in profile context.")); static cl::opt AllowRecursiveInline( - "sample-profile-recursive-inline", cl::Hidden, cl::ZeroOrMore, - cl::init(false), + "sample-profile-recursive-inline", cl::Hidden, + cl::desc("Allow sample loader inliner to inline recursive calls.")); static cl::opt ProfileInlineReplayFile( @@ -287,7 +284,6 @@ static cl::opt ProfileInlineReplayFormat( static cl::opt MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, - cl::ZeroOrMore, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader")); @@ -295,6 +291,13 @@ static cl::opt OverwriteExistingWeights( "overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite.")); +static cl::opt AnnotateSampleProfileInlinePhase( + "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), + cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " + "sample-profile inline pass name.")); + +extern cl::opt EnableExtTspBlockPlacement; + namespace { using BlockWeightMap = DenseMap; @@ -425,7 +428,11 @@ public: : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)), GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), - LTOPhase(LTOPhase) {} + LTOPhase(LTOPhase), + AnnotatedPassName(AnnotateSampleProfileInlinePhase + ? llvm::AnnotateInlinePassName(InlineContext{ + LTOPhase, InlinePass::SampleProfileInliner}) + : CSINLINE_DEBUG) {} bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -487,15 +494,13 @@ protected: /// Profile tracker for different context. std::unique_ptr ContextTracker; - /// Flag indicating whether input profile is context-sensitive - bool ProfileIsCSFlat = false; - /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. /// /// We need to know the LTO phase because for example in ThinLTOPrelink /// phase, in annotation, we should not promote indirect calls. Instead, /// we will mark GUIDs that needs to be annotated to the function. - ThinOrFullLTOPhase LTOPhase; + const ThinOrFullLTOPhase LTOPhase; + const std::string AnnotatedPassName; /// Profle Symbol list tells whether a function name appears in the binary /// used to generate the current profile. @@ -535,6 +540,11 @@ protected: // A pseudo probe helper to correlate the imported sample counts. std::unique_ptr ProbeManager; + +private: + const char *getAnnotatedRemarkPassName() const { + return AnnotatedPassName.c_str(); + } }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -605,7 +615,7 @@ ErrorOr SampleProfileLoader::getInstWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCSFlat) + if (!FunctionSamples::ProfileIsCS) if (const auto *CB = dyn_cast(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -644,7 +654,7 @@ ErrorOr SampleProfileLoader::getProbeWeight(const Instruction &Inst) { // call instruction should have 0 count. 
// For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCSFlat) + if (!FunctionSamples::ProfileIsCS) if (const auto *CB = dyn_cast(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -698,7 +708,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { if (Function *Callee = Inst.getCalledFunction()) CalleeName = Callee->getName(); - if (ProfileIsCSFlat) + if (FunctionSamples::ProfileIsCS) return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); const FunctionSamples *FS = findFunctionSamples(Inst); @@ -730,7 +740,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples( FunctionSamples::getGUID(R->getName()); }; - if (ProfileIsCSFlat) { + if (FunctionSamples::ProfileIsCS) { auto CalleeSamples = ContextTracker->getIndirectCalleeContextSamplesFor(DIL); if (CalleeSamples.empty()) @@ -783,7 +793,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) { - if (ProfileIsCSFlat) + if (FunctionSamples::ProfileIsCS) it.first->second = ContextTracker->getContextSamplesFor(DIL); else it.first->second = @@ -839,6 +849,13 @@ static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl &CallTargets, uint64_t Sum) { + // Bail out early if MaxNumPromotions is zero. + // This prevents allocating an array of zero length below. + // + // Note `updateIDTMetaData` is called in two places so check + // `MaxNumPromotions` inside it. + if (MaxNumPromotions == 0) + return; uint32_t NumVals = 0; // OldSum is the existing total count in the value profile data. uint64_t OldSum = 0; @@ -922,6 +939,14 @@ updateIDTMetaData(Instruction &Inst, bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, SmallVector *InlinedCallSite) { + // Bail out early if sample-loader inliner is disabled. + if (DisableSampleLoaderInlining) + return false; + + // Bail out early if MaxNumPromotions is zero. + // This prevents allocating an array of zero length in callees below. + if (MaxNumPromotions == 0) + return false; auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R == SymbolMap.end() || !R->getValue()) @@ -1009,8 +1034,9 @@ void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( for (auto I : Candidates) { Function *CalledFunction = I->getCalledFunction(); if (CalledFunction) { - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt", - I->getDebugLoc(), I->getParent()) + ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), + "InlineAttempt", I->getDebugLoc(), + I->getParent()) << "previous inlining reattempted for " << (Hot ? "hotness: '" : "size: '") << ore::NV("Callee", CalledFunction) << "' into '" @@ -1042,13 +1068,12 @@ void SampleProfileLoader::findExternalInlineCandidate( // For AutoFDO profile, retrieve candidate profiles by walking over // the nested inlinee profiles. 
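Stepping back from the hunks above: several of them swap the fixed CSINLINE_DEBUG pass name for getAnnotatedRemarkPassName() when emitting remarks, so the reported pass name can reflect the LTO phase. The emission pattern itself is unchanged; a minimal sketch of it (helper name and message text are illustrative, not from this patch):

  #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  #include "llvm/IR/DiagnosticInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Emit an analysis remark against a call site, tagging it with whatever
  // pass name the loader computed for the current LTO phase.
  void emitInlineAttemptRemark(OptimizationRemarkEmitter &ORE,
                               const char *PassName, const CallBase &CB,
                               const Function *Callee) {
    ORE.emit(OptimizationRemarkAnalysis(PassName, "InlineAttempt",
                                        CB.getDebugLoc(), CB.getParent())
             << "attempting to inline " << ore::NV("Callee", Callee));
  }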
- if (!ProfileIsCSFlat) { + if (!FunctionSamples::ProfileIsCS) { Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold); return; } - ContextTrieNode *Caller = - ContextTracker->getContextFor(Samples->getContext()); + ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples); std::queue CalleeList; CalleeList.push(Caller); while (!CalleeList.empty()) { @@ -1098,11 +1123,20 @@ void SampleProfileLoader::findExternalInlineCandidate( /// Iteratively inline hot callsites of a function. /// -/// Iteratively traverse all callsites of the function \p F, and find if -/// the corresponding inlined instance exists and is hot in profile. If -/// it is hot enough, inline the callsites and adds new callsites of the -/// callee into the caller. If the call is an indirect call, first promote -/// it to direct call. Each indirect call is limited with a single target. +/// Iteratively traverse all callsites of the function \p F to find the +/// callsites that have corresponding inlined instances. +/// +/// For such callsites, +/// - If it is hot enough, inline the callsite and add new callsites of the +/// callee into the caller. If the call is an indirect call, first promote +/// it to a direct call. Each indirect call is limited to a single target. +/// +/// - If a callsite is not inlined, merge its profile into the outline +/// version (if --sample-profile-merge-inlinee is true), or scale the +/// counters of the standalone function based on the profile of inlined +/// instances (if --sample-profile-merge-inlinee is false). +/// +/// Later passes may consume the updated profiles. /// /// \param F function to perform iterative inlining. /// \param InlinedGUIDs a set to be updated to include all GUIDs that are @@ -1137,7 +1171,7 @@ bool SampleProfileLoader::inlineHotFunctions( assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCSFlat) + if (FS->getEntrySamples() > 0 || FunctionSamples::ProfileIsCS) LocalNotInlinedCallSites.try_emplace(CB, FS); if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) Hot = true; @@ -1200,13 +1234,17 @@ bool SampleProfileLoader::inlineHotFunctions( // For CS profile, profile for not inlined context will be merged when // base profile is being retrieved. - if (!FunctionSamples::ProfileIsCSFlat) + if (!FunctionSamples::ProfileIsCS) promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } bool SampleProfileLoader::tryInlineCandidate( InlineCandidate &Candidate, SmallVector *InlinedCallSites) { + // Do not attempt to inline a candidate if + // --disable-sample-loader-inlining is true. + if (DisableSampleLoaderInlining) + return false; CallBase &CB = *Candidate.CallInstr; Function *CalledFunction = CB.getCalledFunction(); @@ -1216,7 +1254,8 @@ bool SampleProfileLoader::tryInlineCandidate( InlineCost Cost = shouldInlineCandidate(Candidate); if (Cost.isNever()) { - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) + ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), + "InlineFail", DLoc, BB) << "incompatible inlining"); return false; } @@ -1226,45 +1265,45 @@ bool SampleProfileLoader::tryInlineCandidate( InlineFunctionInfo IFI(nullptr, GetAC); IFI.UpdateProfile = false; - if (InlineFunction(CB, IFI).isSuccess()) { - // Merge the attributes based on the inlining.
- AttributeFuncs::mergeAttributesForInlining(*BB->getParent(), - *CalledFunction); - - // The call to InlineFunction erases I, so we can't pass it here. - emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, - *BB->getParent(), Cost, true, CSINLINE_DEBUG); - - // Now populate the list of newly exposed call sites. - if (InlinedCallSites) { - InlinedCallSites->clear(); - for (auto &I : IFI.InlinedCallSites) - InlinedCallSites->push_back(I); - } + if (!InlineFunction(CB, IFI).isSuccess()) + return false; - if (ProfileIsCSFlat) - ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); - ++NumCSInlined; - - // Prorate inlined probes for a duplicated inlining callsite which probably - // has a distribution less than 100%. Samples for an inlinee should be - // distributed among the copies of the original callsite based on each - // callsite's distribution factor for counts accuracy. Note that an inlined - // probe may come with its own distribution factor if it has been duplicated - // in the inlinee body. The two factor are multiplied to reflect the - // aggregation of duplication. - if (Candidate.CallsiteDistribution < 1) { - for (auto &I : IFI.InlinedCallSites) { - if (Optional Probe = extractProbe(*I)) - setProbeDistributionFactor(*I, Probe->Factor * - Candidate.CallsiteDistribution); - } - NumDuplicatedInlinesite++; - } + // Merge the attributes based on the inlining. + AttributeFuncs::mergeAttributesForInlining(*BB->getParent(), + *CalledFunction); - return true; + // The call to InlineFunction erases I, so we can't pass it here. + emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), + Cost, true, getAnnotatedRemarkPassName()); + + // Now populate the list of newly exposed call sites. + if (InlinedCallSites) { + InlinedCallSites->clear(); + for (auto &I : IFI.InlinedCallSites) + InlinedCallSites->push_back(I); } - return false; + + if (FunctionSamples::ProfileIsCS) + ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); + ++NumCSInlined; + + // Prorate inlined probes for a duplicated inlining callsite which probably + // has a distribution less than 100%. Samples for an inlinee should be + // distributed among the copies of the original callsite based on each + // callsite's distribution factor for counts accuracy. Note that an inlined + // probe may come with its own distribution factor if it has been duplicated + // in the inlinee body. The two factors are multiplied to reflect the + // aggregation of duplication. + if (Candidate.CallsiteDistribution < 1) { + for (auto &I : IFI.InlinedCallSites) { + if (Optional Probe = extractProbe(*I)) + setProbeDistributionFactor(*I, Probe->Factor * + Candidate.CallsiteDistribution); + } + NumDuplicatedInlinesite++; + } + + return true; } bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, @@ -1285,14 +1324,8 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, if (Optional Probe = extractProbe(*CB)) Factor = Probe->Factor; - uint64_t CallsiteCount = 0; - ErrorOr Weight = getBlockWeight(CB->getParent()); - if (Weight) - CallsiteCount = Weight.get(); - if (CalleeSamples) - CallsiteCount = std::max( - CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor)); - + uint64_t CallsiteCount = + CalleeSamples ?
CalleeSamples->getEntrySamples() * Factor : 0; *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor}; return true; } @@ -1387,7 +1420,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet &InlinedGUIDs) { - // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. assert((!ProfAccForSymsInList || @@ -1513,7 +1545,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( // For CS profile, profile for not inlined context will be merged when // base profile is being retrieved. - if (!FunctionSamples::ProfileIsCSFlat) + if (!FunctionSamples::ProfileIsCS) promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } @@ -1528,11 +1560,11 @@ void SampleProfileLoader::promoteMergeNotInlinedContextSamples( if (!Callee || Callee->isDeclaration()) continue; - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", - I->getDebugLoc(), I->getParent()) - << "previous inlining not repeated: '" - << ore::NV("Callee", Callee) << "' into '" - << ore::NV("Caller", &F) << "'"); + ORE->emit( + OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" << ore::NV("Callee", Callee) + << "' into '" << ore::NV("Caller", &F) << "'"); ++NumCSNotInlined; const FunctionSamples *FS = Pair.getSecond(); @@ -1540,6 +1572,10 @@ void SampleProfileLoader::promoteMergeNotInlinedContextSamples( continue; } + // Do not merge a context that is already duplicated into the base profile. + if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase)) + continue; + if (ProfileMergeInlinee) { // A function call can be replicated by optimizations like callsite // splitting or jump threading and the replicates end up sharing the @@ -1623,7 +1659,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { // With CSSPGO all indirect call targets are counted torwards the // original indirect call site in the profile, including both // inlined and non-inlined targets. - if (!FunctionSamples::ProfileIsCSFlat) { + if (!FunctionSamples::ProfileIsCS) { if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) { for (const auto &NameFS : *M) @@ -1714,6 +1750,11 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { } } + // FIXME: Re-enable for sample profiling after investigating why the sum + // of branch weights can be 0 + // + // misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false); + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. @@ -1798,7 +1839,7 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", std::unique_ptr SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) { std::unique_ptr ProfiledCG; - if (ProfileIsCSFlat) + if (FunctionSamples::ProfileIsCS) ProfiledCG = std::make_unique(*ContextTracker); else ProfiledCG = std::make_unique(Reader->getProfiles()); @@ -1843,8 +1884,8 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { assert(&CG->getModule() == &M); - if (UseProfiledCallGraph || - (ProfileIsCSFlat && !UseProfiledCallGraph.getNumOccurrences())) { + if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS && + !UseProfiledCallGraph.getNumOccurrences())) { // Use profiled call edges to augment the top-down order. 
There are cases // that the top-down order computed based on the static call graph doesn't // reflect real execution order. For example @@ -1973,40 +2014,50 @@ bool SampleProfileLoader::doInitialization(Module &M, ProfileInlineReplayScope, ProfileInlineReplayFallback, {ProfileInlineReplayFormat}}, - /*EmitRemarks=*/false); + /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner}); } - // Apply tweaks if context-sensitive profile is available. - if (Reader->profileIsCSFlat() || Reader->profileIsCSNested()) { - ProfileIsCSFlat = Reader->profileIsCSFlat(); + // Apply tweaks if context-sensitive or probe-based profile is available. + if (Reader->profileIsCS() || Reader->profileIsPreInlined() || + Reader->profileIsProbeBased()) { + if (!UseIterativeBFIInference.getNumOccurrences()) + UseIterativeBFIInference = true; + if (!SampleProfileUseProfi.getNumOccurrences()) + SampleProfileUseProfi = true; + if (!EnableExtTspBlockPlacement.getNumOccurrences()) + EnableExtTspBlockPlacement = true; // Enable priority-based inliner and size inline by default for CSSPGO. if (!ProfileSizeInline.getNumOccurrences()) ProfileSizeInline = true; if (!CallsitePrioritizedInline.getNumOccurrences()) CallsitePrioritizedInline = true; - - // For CSSPGO, use preinliner decision by default when available. - if (!UsePreInlinerDecision.getNumOccurrences()) - UsePreInlinerDecision = true; - // For CSSPGO, we also allow recursive inline to best use context profile. if (!AllowRecursiveInline.getNumOccurrences()) AllowRecursiveInline = true; - // Enable iterative-BFI by default for CSSPGO. - if (!UseIterativeBFIInference.getNumOccurrences()) - UseIterativeBFIInference = true; - // Enable Profi by default for CSSPGO. - if (!SampleProfileUseProfi.getNumOccurrences()) - SampleProfileUseProfi = true; + if (Reader->profileIsPreInlined()) { + if (!UsePreInlinerDecision.getNumOccurrences()) + UsePreInlinerDecision = true; + } - if (FunctionSamples::ProfileIsCSFlat) { - // Tracker for profiles under different context - ContextTracker = std::make_unique( - Reader->getProfiles(), &GUIDToFuncNameMap); + if (!Reader->profileIsCS()) { + // Non-CS profile should be fine without a function size budget for the + // inliner since the contexts in the profile are either all from inlining + // in the previous build or pre-computed by the preinliner with a size + // cap, thus they are bounded. + if (!ProfileInlineLimitMin.getNumOccurrences()) + ProfileInlineLimitMin = std::numeric_limits::max(); + if (!ProfileInlineLimitMax.getNumOccurrences()) + ProfileInlineLimitMax = std::numeric_limits::max(); } } + if (Reader->profileIsCS()) { + // Tracker for profiles under different contexts + ContextTracker = std::make_unique( + Reader->getProfiles(), &GUIDToFuncNameMap); + } + // Load pseudo probe descriptors for probe-based function samples. if (Reader->profileIsProbeBased()) { ProbeManager = std::make_unique(M); @@ -2082,7 +2133,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } // Account for cold calls not inlined.... - if (!ProfileIsCSFlat) + if (!FunctionSamples::ProfileIsCS) for (const std::pair &pair : notInlinedCallInfo) updateProfileCallee(pair.first, pair.second.entryCount); @@ -2145,7 +2196,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) // Initialize entry count when the function has no existing entry // count value.
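The doInitialization() hunks above repeatedly guard default-flips with getNumOccurrences(). Reduced to a single flag, the idiom is the following (option name hypothetical, for illustration only):

  #include "llvm/Support/CommandLine.h"
  using namespace llvm;

  static cl::opt<bool> ExampleTunable("example-tunable", cl::init(false),
                                      cl::desc("Hypothetical tunable flag"));

  // Flip the default only when the user did not pass the flag explicitly;
  // an explicit command-line setting always wins over profile-driven tweaks.
  void applyProfileDrivenDefault(bool ProfileSuggestsOn) {
    if (ProfileSuggestsOn && !ExampleTunable.getNumOccurrences())
      ExampleTunable = true;
  }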
- if (!F.getEntryCount().hasValue()) + if (!F.getEntryCount()) F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr OwnedORE; if (AM) { @@ -2158,7 +2209,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) ORE = OwnedORE.get(); } - if (ProfileIsCSFlat) + if (FunctionSamples::ProfileIsCS) Samples = ContextTracker->getBaseSamplesFor(F); else Samples = Reader->getSamplesFor(F); diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index e104ae00e916..d1ab2649ee2e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -13,21 +13,19 @@ #include "llvm/Transforms/IPO/SampleProfileProbe.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/CRC.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include @@ -416,7 +414,7 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F, FunctionAnalysisManager &FAM) { BlockFrequencyInfo &BFI = FAM.getResult(F); auto BBProfileCount = [&BFI](BasicBlock *BB) { - return BFI.getBlockProfileCount(BB).getValueOr(0); + return BFI.getBlockProfileCount(BB).value_or(0); }; // Collect the sum of execution weight for each probe. diff --git a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp index 95393d9476e0..c7d54b8cdeb0 100644 --- a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp +++ b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp @@ -25,18 +25,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/SyntheticCountsUtils.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; using Scaled64 = ScaledNumber; @@ -47,18 +42,17 @@ using ProfileCount = Function::ProfileCount; namespace llvm { cl::opt InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10), - cl::ZeroOrMore, cl::desc("Initial value of synthetic entry count")); } // namespace llvm /// Initial synthetic count assigned to inline functions. static cl::opt InlineSyntheticCount( - "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore, + "inline-synthetic-count", cl::Hidden, cl::init(15), cl::desc("Initial synthetic entry count for inline functions.")); /// Initial synthetic count assigned to cold functions. 
static cl::opt ColdSyntheticCount( - "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore, + "cold-synthetic-count", cl::Hidden, cl::init(5), cl::desc("Initial synthetic entry count for cold functions.")); // Assign initial synthetic entry counts to functions. diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 52708ff2f226..a360a768a2bc 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -21,7 +21,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Pass.h" -#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" @@ -311,7 +310,8 @@ void splitAndWriteThinLTOBitcode( return; } if (!F->isDeclaration() && - computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone) + computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == + FMRB_DoesNotAccessMemory) EligibleVirtualFns.insert(F); }); } @@ -542,11 +542,11 @@ class WriteThinLTOBitcode : public ModulePass { raw_ostream &OS; // raw_ostream to print on // The output stream on which to emit a minimized module for use // just in the thin link, if requested. - raw_ostream *ThinLinkOS; + raw_ostream *ThinLinkOS = nullptr; public: static char ID; // Pass identification, replacement for typeid - WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) { + WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) { initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); } diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 8b30f0e989a1..898a213d0849 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -57,6 +57,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" @@ -79,6 +80,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" @@ -95,6 +97,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include #include @@ -107,6 +110,15 @@ using namespace wholeprogramdevirt; #define DEBUG_TYPE "wholeprogramdevirt" +STATISTIC(NumDevirtTargets, "Number of whole program devirtualization targets"); +STATISTIC(NumSingleImpl, "Number of single implementation devirtualizations"); +STATISTIC(NumBranchFunnel, "Number of branch funnels"); +STATISTIC(NumUniformRetVal, "Number of uniform return value optimizations"); +STATISTIC(NumUniqueRetVal, "Number of unique return value optimizations"); +STATISTIC(NumVirtConstProp1Bit, + "Number of 1 bit virtual constant propagations"); +STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); + static cl::opt ClSummaryAction( "wholeprogramdevirt-summary-action", cl::desc("What to do with the summary when running this pass"), @@ -132,13 +144,12 @@ static cl::opt ClWriteSummary( static cl::opt 
ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden, - cl::init(10), cl::ZeroOrMore, + cl::init(10), cl::desc("Maximum number of call targets per " "call site to enable branch funnels")); static cl::opt PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden, - cl::init(false), cl::ZeroOrMore, cl::desc("Print index-based devirtualization messages")); /// Provide a way to force enable whole program visibility in tests. @@ -146,30 +157,34 @@ static cl::opt /// !vcall_visibility metadata (the mere presense of type tests /// previously implied hidden visibility). static cl::opt - WholeProgramVisibility("whole-program-visibility", cl::init(false), - cl::Hidden, cl::ZeroOrMore, + WholeProgramVisibility("whole-program-visibility", cl::Hidden, cl::desc("Enable whole program visibility")); /// Provide a way to force disable whole program for debugging or workarounds, /// when enabled via the linker. static cl::opt DisableWholeProgramVisibility( - "disable-whole-program-visibility", cl::init(false), cl::Hidden, - cl::ZeroOrMore, + "disable-whole-program-visibility", cl::Hidden, cl::desc("Disable whole program visibility (overrides enabling options)")); /// Provide way to prevent certain function from being devirtualized static cl::list SkipFunctionNames("wholeprogramdevirt-skip", cl::desc("Prevent function(s) from being devirtualized"), - cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated); - -/// Mechanism to add runtime checking of devirtualization decisions, trapping on -/// any that are not correct. Useful for debugging undefined behavior leading to -/// failures with WPD. -static cl::opt - CheckDevirt("wholeprogramdevirt-check", cl::init(false), cl::Hidden, - cl::ZeroOrMore, - cl::desc("Add code to trap on incorrect devirtualizations")); + cl::Hidden, cl::CommaSeparated); + +/// Mechanism to add runtime checking of devirtualization decisions, optionally +/// trapping or falling back to indirect call on any that are not correct. +/// Trapping mode is useful for debugging undefined behavior leading to failures +/// with WPD. Fallback mode is useful for ensuring safety when whole program +/// visibility may be compromised. +enum WPDCheckMode { None, Trap, Fallback }; +static cl::opt DevirtCheckMode( + "wholeprogramdevirt-check", cl::Hidden, + cl::desc("Type of checking for incorrect devirtualizations"), + cl::values(clEnumValN(WPDCheckMode::None, "none", "No checking"), + clEnumValN(WPDCheckMode::Trap, "trap", "Trap when incorrect"), + clEnumValN(WPDCheckMode::Fallback, "fallback", + "Fallback to indirect when incorrect"))); namespace { struct PatternList { @@ -866,13 +881,14 @@ void updateVCallVisibilityInIndex( if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { + // Don't upgrade the visibility for symbols exported to the dynamic + // linker, as we have no information on their eventual use. + if (DynamicExportSymbols.count(P.first)) + continue; for (auto &S : P.second.SummaryList) { auto *GVar = dyn_cast(S.get()); if (!GVar || - GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic || - // Don't upgrade the visibility for symbols exported to the dynamic - // linker, as we have no information on their eventual use. 
- DynamicExportSymbols.count(P.first)) + GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } @@ -1133,16 +1149,17 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, if (RemarksEnabled) VCallSite.emitRemark("single-impl", TheFn->stripPointerCasts()->getName(), OREGetter); + NumSingleImpl++; auto &CB = VCallSite.CB; assert(!CB.getCalledFunction() && "devirtualizing direct call?"); IRBuilder<> Builder(&CB); Value *Callee = Builder.CreateBitCast(TheFn, CB.getCalledOperand()->getType()); - // If checking is enabled, add support to compare the virtual function - // pointer to the devirtualized target. In case of a mismatch, perform a - // debug trap. - if (CheckDevirt) { + // If trap checking is enabled, add support to compare the virtual + // function pointer to the devirtualized target. In case of a mismatch, + // perform a debug trap. + if (DevirtCheckMode == WPDCheckMode::Trap) { auto *Cond = Builder.CreateICmpNE(CB.getCalledOperand(), Callee); Instruction *ThenTerm = SplitBlockAndInsertIfThen(Cond, &CB, /*Unreachable=*/false); @@ -1152,8 +1169,38 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, CallTrap->setDebugLoc(CB.getDebugLoc()); } - // Devirtualize. - CB.setCalledOperand(Callee); + // If fallback checking is enabled, add support to compare the virtual + // function pointer to the devirtualized target. In case of a mismatch, + // fall back to indirect call. + if (DevirtCheckMode == WPDCheckMode::Fallback) { + MDNode *Weights = + MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1); + // Version the indirect call site. If the called value is equal to the + // given callee, 'NewInst' will be executed, otherwise the original call + // site will be executed. + CallBase &NewInst = versionCallSite(CB, Callee, Weights); + NewInst.setCalledOperand(Callee); + // Since the new call site is direct, we must clear metadata that + // is only appropriate for indirect calls. This includes !prof and + // !callees metadata. + NewInst.setMetadata(LLVMContext::MD_prof, nullptr); + NewInst.setMetadata(LLVMContext::MD_callees, nullptr); + // Additionally, we should remove them from the fallback indirect call, + // so that we don't attempt to perform indirect call promotion later. + CB.setMetadata(LLVMContext::MD_prof, nullptr); + CB.setMetadata(LLVMContext::MD_callees, nullptr); + } + + // In either trapping or non-checking mode, devirtualize original call. + else { + // Devirtualize unconditionally. + CB.setCalledOperand(Callee); + // Since the call site is now direct, we must clear metadata that + // is only appropriate for indirect calls. This includes !prof and + // !callees metadata. + CB.setMetadata(LLVMContext::MD_prof, nullptr); + CB.setMetadata(LLVMContext::MD_callees, nullptr); + } // This use is no longer unsafe. if (VCallSite.NumUnsafeUses) @@ -1208,7 +1255,7 @@ bool DevirtModule::trySingleImplDevirt( return false; // If so, update each call site to call that implementation directly. - if (RemarksEnabled) + if (RemarksEnabled || AreStatisticsEnabled()) TargetsForSlot[0].WasDevirt = true; bool IsExported = false; @@ -1279,7 +1326,7 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef TargetsForSlot, return false; // Collect functions devirtualized at least for one call site for stats. 
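The new STATISTIC counters and the widened RemarksEnabled || AreStatisticsEnabled() guards in these hunks follow the stock llvm/ADT/Statistic.h pattern; reduced to essentials (names illustrative, not from this patch):

  #define DEBUG_TYPE "example"
  #include "llvm/ADT/Statistic.h"
  using namespace llvm;

  STATISTIC(NumExampleDevirt, "Number of example devirtualizations");

  void recordDevirt(bool RemarksEnabled) {
    ++NumExampleDevirt; // counters are cheap; bump unconditionally
    // Extra bookkeeping done only for reporting is gated, as in the
    // WasDevirt / DevirtTargets updates above.
    if (RemarksEnabled || AreStatisticsEnabled()) {
      // ... populate the per-target map used for remarks and final stats ...
    }
  }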
- if (PrintSummaryDevirt) + if (PrintSummaryDevirt || AreStatisticsEnabled()) DevirtTargets.insert(TheFn); auto &S = TheFn.getSummaryList()[0]; @@ -1385,6 +1432,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, !FSAttr.getValueAsString().contains("+retpoline")) continue; + NumBranchFunnel++; if (RemarksEnabled) VCallSite.emitRemark("branch-funnel", JT->stripPointerCasts()->getName(), OREGetter); @@ -1476,6 +1524,7 @@ void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, for (auto Call : CSInfo.CallSites) { if (!OptimizedCalls.insert(&Call.CB).second) continue; + NumUniformRetVal++; Call.replaceAndErase( "uniform-ret-val", FnName, RemarksEnabled, OREGetter, ConstantInt::get(cast(Call.CB.getType()), TheRetVal)); @@ -1499,7 +1548,7 @@ bool DevirtModule::tryUniformRetValOpt( } applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal); - if (RemarksEnabled) + if (RemarksEnabled || AreStatisticsEnabled()) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; return true; @@ -1592,6 +1641,7 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable, B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType())); Cmp = B.CreateZExt(Cmp, Call.CB.getType()); + NumUniqueRetVal++; Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter, Cmp); } @@ -1636,7 +1686,7 @@ bool DevirtModule::tryUniqueRetValOpt( UniqueMemberAddr); // Update devirtualization statistics for targets. - if (RemarksEnabled) + if (RemarksEnabled || AreStatisticsEnabled()) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; @@ -1665,11 +1715,13 @@ void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, Value *Bits = B.CreateLoad(Int8Ty, Addr); Value *BitsAndBit = B.CreateAnd(Bits, Bit); auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); + NumVirtConstProp1Bit++; Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled, OREGetter, IsBitSet); } else { Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); Value *Val = B.CreateLoad(RetType, ValAddr); + NumVirtConstProp++; Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, OREGetter, Val); } @@ -1701,7 +1753,7 @@ bool DevirtModule::tryVirtualConstProp( for (VirtualCallTarget &Target : TargetsForSlot) { if (Target.Fn->isDeclaration() || computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) != - MAK_ReadNone || + FMRB_DoesNotAccessMemory || Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() || Target.Fn->getReturnType() != RetType) return false; @@ -1755,7 +1807,7 @@ bool DevirtModule::tryVirtualConstProp( setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte, OffsetBit); - if (RemarksEnabled) + if (RemarksEnabled || AreStatisticsEnabled()) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; @@ -1963,7 +2015,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { // (although this is unlikely). In that case, explicitly build a pair and // RAUW it. 
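// The hunk just below seeds its insertvalue chain with poison instead of
// undef: every element is overwritten before the aggregate is used, and
// poison is the stronger placeholder. A minimal standalone sketch of the same
// pattern, assuming LLVM's IRBuilder (buildPair and PairTy are hypothetical
// names used only for this example):

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *buildPair(llvm::IRBuilder<> &B, llvm::StructType *PairTy,
                              llvm::Value *V0, llvm::Value *V1) {
  // The seed aggregate is never read; both elements are inserted below.
  llvm::Value *Pair = llvm::PoisonValue::get(PairTy);
  Pair = B.CreateInsertValue(Pair, V0, {0});
  Pair = B.CreateInsertValue(Pair, V1, {1});
  return Pair;
}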
if (!CI->use_empty()) { - Value *Pair = UndefValue::get(CI->getType()); + Value *Pair = PoisonValue::get(CI->getType()); IRBuilder<> B(CI); Pair = B.CreateInsertValue(Pair, LoadedValue, {0}); Pair = B.CreateInsertValue(Pair, TypeTestCall, {1}); @@ -2151,9 +2203,9 @@ bool DevirtModule::run() { removeRedundantTypeTests(); - // We have lowered or deleted the type instrinsics, so we will no - // longer have enough information to reason about the liveness of virtual - // function pointers in GlobalDCE. + // We have lowered or deleted the type intrinsics, so we will no longer have + // enough information to reason about the liveness of virtual function + // pointers in GlobalDCE. for (GlobalVariable &GV : M.globals()) GV.eraseMetadata(LLVMContext::MD_vcall_visibility); @@ -2243,7 +2295,7 @@ bool DevirtModule::run() { } // Collect functions devirtualized at least for one call site for stats. - if (RemarksEnabled) + if (RemarksEnabled || AreStatisticsEnabled()) for (const auto &T : TargetsForSlot) if (T.WasDevirt) DevirtTargets[std::string(T.Fn->getName())] = T.Fn; @@ -2276,6 +2328,8 @@ bool DevirtModule::run() { } } + NumDevirtTargets += DevirtTargets.size(); + removeRedundantTypeTests(); // Rebuild each global we touched as part of virtual constant propagation to @@ -2284,9 +2338,9 @@ bool DevirtModule::run() { for (VTableBits &B : Bits) rebuildGlobal(B); - // We have lowered or deleted the type instrinsics, so we will no - // longer have enough information to reason about the liveness of virtual - // function pointers in GlobalDCE. + // We have lowered or deleted the type intrinsics, so we will no longer have + // enough information to reason about the liveness of virtual function + // pointers in GlobalDCE. for (GlobalVariable &GV : M.globals()) GV.eraseMetadata(LLVMContext::MD_vcall_visibility); @@ -2367,4 +2421,6 @@ void DevirtIndex::run() { if (PrintSummaryDevirt) for (const auto &DT : DevirtTargets) errs() << "Devirtualized call to " << DT << "\n"; + + NumDevirtTargets += DevirtTargets.size(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 0598f751febe..f4d8b79a5311 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -693,9 +693,6 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { unsigned OpndNum = Opnds.size(); unsigned InstrNeeded = OpndNum - 1; - // The number of addends in the form of "(-1)*x". - unsigned NegOpndNum = 0; - // Adjust the number of instructions needed to emit the N-ary add. for (const FAddend *Opnd : Opnds) { if (Opnd->isConstant()) @@ -707,9 +704,6 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { continue; const FAddendCoef &CE = Opnd->getCoef(); - if (CE.isMinusOne() || CE.isMinusTwo()) - NegOpndNum++; - // Let the addend be "c * x". If "c == +/-1", the value of the addend // is immediately available; otherwise, it needs exactly one instruction // to evaluate the value. 
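// Standalone arithmetic checks (illustration only) for two of the sub folds
// introduced in the visitSub hunk below; unsigned wraparound in C++ mirrors
// the IR semantics, and usub_sat() stands in for @llvm.usub.sat:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t usub_sat(uint32_t A, uint32_t B) { return A > B ? A - B : 0; }

int main() {
  for (uint32_t X : {0u, 1u, 7u, 0xFFFFFFF0u})
    for (uint32_t Y : {0u, 3u, 100u})
      for (uint32_t Z : {0u, 5u, 200u}) {
        // sub(add(X,Y), umin(Y,Z)) --> add(X, usub.sat(Y,Z))
        assert(X + Y - std::min(Y, Z) == X + usub_sat(Y, Z));
        // sub(add(X,Y), umin(X,Y)) --> umax(X,Y)
        assert(X + Y - std::min(X, Y) == std::max(X, Y));
      }
  return 0;
}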
@@ -1277,7 +1271,7 @@ static Instruction *factorizeMathWithShlOps(BinaryOperator &I,
 }
 
 Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
-  if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
+  if (Value *V = simplifyAddInst(I.getOperand(0), I.getOperand(1),
                                  I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
                                  SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
@@ -1375,6 +1369,13 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
     }
   }
 
+  // (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
+  if (match(&I, m_c_Add(m_And(m_Value(A), m_APInt(C1)), m_Deferred(A))) &&
+      C1->isPowerOf2() && (ComputeNumSignBits(A) > C1->countLeadingZeros())) {
+    Constant *NewMask = ConstantInt::get(RHS->getType(), *C1 - 1);
+    return BinaryOperator::CreateAnd(A, NewMask);
+  }
+
   // A+B --> A|B iff A and B have no bits set in common.
   if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
     return BinaryOperator::CreateOr(LHS, RHS);
@@ -1528,7 +1529,7 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
 }
 
 Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
-  if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
+  if (Value *V = simplifyFAddInst(I.getOperand(0), I.getOperand(1),
                                   I.getFastMathFlags(),
                                   SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
@@ -1687,7 +1688,8 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   // Require at least one GEP with a common base pointer on both sides.
   if (auto *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
     // (gep X, ...) - X
-    if (LHSGEP->getOperand(0) == RHS) {
+    if (LHSGEP->getOperand(0)->stripPointerCasts() ==
+            RHS->stripPointerCasts()) {
       GEP1 = LHSGEP;
     } else if (auto *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
       // (gep X, ...) - (gep X, ...)
@@ -1749,7 +1751,7 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
 }
 
 Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
-  if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
+  if (Value *V = simplifySubInst(I.getOperand(0), I.getOperand(1),
                                  I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
                                  SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
@@ -2014,6 +2016,37 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     }
   }
 
+  if (auto *II = dyn_cast<MinMaxIntrinsic>(Op1)) {
+    {
+      // sub(add(X,Y), s/umin(X,Y)) --> s/umax(X,Y)
+      // sub(add(X,Y), s/umax(X,Y)) --> s/umin(X,Y)
+      Value *X = II->getLHS();
+      Value *Y = II->getRHS();
+      if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) &&
+          (Op0->hasOneUse() || Op1->hasOneUse())) {
+        Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+        Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y);
+        return replaceInstUsesWith(I, InvMaxMin);
+      }
+    }
+
+    {
+      // sub(add(X,Y),umin(Y,Z)) --> add(X,usub.sat(Y,Z))
+      // sub(add(X,Z),umin(Y,Z)) --> add(X,usub.sat(Z,Y))
+      Value *X, *Y, *Z;
+      if (match(Op1, m_OneUse(m_UMin(m_Value(Y), m_Value(Z))))) {
+        if (match(Op0, m_OneUse(m_c_Add(m_Specific(Y), m_Value(X)))))
+          return BinaryOperator::CreateAdd(
+              X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(),
+                                         {Y, Z}));
+        if (match(Op0, m_OneUse(m_c_Add(m_Specific(Z), m_Value(X)))))
+          return BinaryOperator::CreateAdd(
+              X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(),
+                                         {Z, Y}));
+      }
+    }
+  }
+
   {
     // If we have a subtraction between some value and a select between
     // said value and something else, sink subtraction into select hands, i.e.:
@@ -2089,36 +2122,6 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
       return BinaryOperator::CreateSub(X,
Not); } - // TODO: This is the same logic as above but handles the cmp-select idioms - // for min/max, so the use checks are increased to account for the - // extra instructions. If we canonicalize to intrinsics, this block - // can likely be removed. - { - Value *LHS, *RHS, *A; - Value *NotA = Op0, *MinMax = Op1; - SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor; - if (!SelectPatternResult::isMinOrMax(SPF)) { - NotA = Op1; - MinMax = Op0; - SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor; - } - if (SelectPatternResult::isMinOrMax(SPF) && - match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) { - if (NotA == LHS) - std::swap(LHS, RHS); - // LHS is now Y above and expected to have at least 2 uses (the min/max) - // NotA is expected to have 2 uses from the min/max and 1 from the sub. - if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && - !NotA->hasNUsesOrMore(4)) { - Value *Not = Builder.CreateNot(MinMax); - if (NotA == Op0) - return BinaryOperator::CreateSub(Not, A); - else - return BinaryOperator::CreateSub(A, Not); - } - } - } - // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". Value *LHSOp, *RHSOp; @@ -2149,11 +2152,11 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { // B = ashr i32 A, 31 ; smear the sign bit // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1) // --> (A < 0) ? -A : A - Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty)); + Value *IsNeg = Builder.CreateIsNeg(A); // Copy the nuw/nsw flags from the sub to the negate. - Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(), - I.hasNoSignedWrap()); - return SelectInst::Create(Cmp, Neg, A); + Value *NegA = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(), + I.hasNoSignedWrap()); + return SelectInst::Create(IsNeg, NegA, A); } // If we are subtracting a low-bit masked subset of some value from an add @@ -2187,12 +2190,23 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { return replaceInstUsesWith( I, Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op1})); + // Op0 - umin(X, Op0) --> usub.sat(Op0, X) + if (match(Op1, m_OneUse(m_c_UMin(m_Value(X), m_Specific(Op0))))) + return replaceInstUsesWith( + I, Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {Op0, X})); + // Op0 - umax(X, Op0) --> 0 - usub.sat(X, Op0) if (match(Op1, m_OneUse(m_c_UMax(m_Value(X), m_Specific(Op0))))) { Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op0}); return BinaryOperator::CreateNeg(USub); } + // umin(X, Op1) - Op1 --> 0 - usub.sat(Op1, X) + if (match(Op0, m_OneUse(m_c_UMin(m_Value(X), m_Specific(Op1))))) { + Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {Op1, X}); + return BinaryOperator::CreateNeg(USub); + } + // C - ctpop(X) => ctpop(~X) if C is bitwidth if (match(Op0, m_SpecificInt(Ty->getScalarSizeInBits())) && match(Op1, m_OneUse(m_Intrinsic(m_Value(X))))) @@ -2264,7 +2278,7 @@ static Instruction *hoistFNegAboveFMulFDiv(Instruction &I, Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); - if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(), + if (Value *V = simplifyFNegInst(Op, I.getFastMathFlags(), getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -2287,10 +2301,11 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { // Unlike most transforms, this one is not safe to propagate nsz unless // it is present on the original select. 
(We are conservatively intersecting // the nsz flags from the select and root fneg instruction.) - auto propagateSelectFMF = [&](SelectInst *S) { + auto propagateSelectFMF = [&](SelectInst *S, bool CommonOperand) { S->copyFastMathFlags(&I); if (auto *OldSel = dyn_cast(Op)) - if (!OldSel->hasNoSignedZeros()) + if (!OldSel->hasNoSignedZeros() && !CommonOperand && + !isGuaranteedNotToBeUndefOrPoison(OldSel->getCondition())) S->setHasNoSignedZeros(false); }; // -(Cond ? -P : Y) --> Cond ? P : -Y @@ -2298,14 +2313,14 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { if (match(X, m_FNeg(m_Value(P)))) { Value *NegY = Builder.CreateFNegFMF(Y, &I, Y->getName() + ".neg"); SelectInst *NewSel = SelectInst::Create(Cond, P, NegY); - propagateSelectFMF(NewSel); + propagateSelectFMF(NewSel, P == Y); return NewSel; } // -(Cond ? X : -P) --> Cond ? -X : P if (match(Y, m_FNeg(m_Value(P)))) { Value *NegX = Builder.CreateFNegFMF(X, &I, X->getName() + ".neg"); SelectInst *NewSel = SelectInst::Create(Cond, NegX, P); - propagateSelectFMF(NewSel); + propagateSelectFMF(NewSel, P == X); return NewSel; } } @@ -2314,7 +2329,7 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { } Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { - if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyFSubInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 6bbb0251f2bc..ae8865651ece 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -24,32 +24,6 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into -/// a four bit mask. -static unsigned getFCmpCode(FCmpInst::Predicate CC) { - assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE && - "Unexpected FCmp predicate!"); - // Take advantage of the bit pattern of FCmpInst::Predicate here. - // U L G E - static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0 - static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1 - static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0 - static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1 - static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0 - static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1 - static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0 - static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1 - static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0 - static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1 - static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0 - static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1 - static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0 - static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1 - static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0 - static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1 - return CC; -} - /// This is the complement of getICmpCode, which turns an opcode and two /// operands into either a constant true or false, or a brand new ICmp /// instruction. The sign is passed in to determine which kind of predicate to @@ -66,14 +40,10 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, /// operands into either a FCmp instruction, or a true/false constant. 
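// The static_asserts removed above document the 4-bit "U L G E" encoding that
// the shared getFCmpCode/getPredForFCmpCode helpers (used below) still rely
// on. A standalone spot-check of two compositions, with enumerator values
// copied from that table (illustration only):

#include <cassert>

int main() {
  enum { FCMP_OEQ = 1, FCMP_OGT = 2, FCMP_OGE = 3, FCMP_OLT = 4,
         FCMP_OLE = 5, FCMP_ONE = 6 };
  // OR of codes composes predicates joined by '|':
  assert((FCMP_OLT | FCMP_OGT) == FCMP_ONE); // (x < y) | (x > y) -> x != y
  // AND of codes composes predicates joined by '&':
  assert((FCMP_OLE & FCMP_OGE) == FCMP_OEQ); // (x <= y) & (x >= y) -> x == y
  return 0;
}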
 static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
                            InstCombiner::BuilderTy &Builder) {
-  const auto Pred = static_cast<FCmpInst::Predicate>(Code);
-  assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
-         "Unexpected FCmp predicate!");
-  if (Pred == FCmpInst::FCMP_FALSE)
-    return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
-  if (Pred == FCmpInst::FCMP_TRUE)
-    return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
-  return Builder.CreateFCmp(Pred, LHS, RHS);
+  FCmpInst::Predicate NewPred;
+  if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred))
+    return TorF;
+  return Builder.CreateFCmp(NewPred, LHS, RHS);
 }
 
 /// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
@@ -395,6 +365,7 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
 /// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
 /// and the right hand side is of type BMask_Mixed. For example,
 /// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
+/// Also used for logical and/or, must be poison safe.
 static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
     ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
     Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
@@ -409,9 +380,9 @@
   //
   // We currently handle the case of B, C, D, E are constant.
   //
-  ConstantInt *BCst, *CCst, *DCst, *ECst;
-  if (!match(B, m_ConstantInt(BCst)) || !match(C, m_ConstantInt(CCst)) ||
-      !match(D, m_ConstantInt(DCst)) || !match(E, m_ConstantInt(ECst)))
+  const APInt *BCst, *CCst, *DCst, *OrigECst;
+  if (!match(B, m_APInt(BCst)) || !match(C, m_APInt(CCst)) ||
+      !match(D, m_APInt(DCst)) || !match(E, m_APInt(OrigECst)))
     return nullptr;
 
   ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
 
@@ -420,19 +391,20 @@
   // canonicalized as,
   // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
   // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
+  APInt ECst = *OrigECst;
   if (PredR != NewCC)
-    ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+    ECst ^= *DCst;
 
   // If B or D is zero, skip because if LHS or RHS can be trivially folded by
   // other folding rules and this pattern won't apply any more.
-  if (BCst->getValue() == 0 || DCst->getValue() == 0)
+  if (*BCst == 0 || *DCst == 0)
     return nullptr;
 
   // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
   // deduce anything from it.
   // For example,
   // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
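// Exhaustive spot-check (illustration only) of the mixed-mask fold this
// function implements, using an example quoted in the comments below:
//   (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    assert((((A & 12) != 0) && ((A & 7) == 1)) == ((A & 15) == 9));
  return 0;
}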
- if ((BCst->getValue() & DCst->getValue()) == 0) + if ((*BCst & *DCst) == 0) return nullptr; // If the following two conditions are met: @@ -451,22 +423,21 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // For example, // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9) // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8) - if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) && - (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) { - APInt BorD = BCst->getValue() | DCst->getValue(); - APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) | - ECst->getValue(); - Value *NewMask = ConstantInt::get(BCst->getType(), BorD); - Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE); + if ((((*BCst & *DCst) & ECst) == 0) && + (*BCst & (*BCst ^ *DCst)).isPowerOf2()) { + APInt BorD = *BCst | *DCst; + APInt BandBxorDorE = (*BCst & (*BCst ^ *DCst)) | ECst; + Value *NewMask = ConstantInt::get(A->getType(), BorD); + Value *NewMaskedValue = ConstantInt::get(A->getType(), BandBxorDorE); Value *NewAnd = Builder.CreateAnd(A, NewMask); return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue); } - auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) { - return (C1->getValue() & C2->getValue()) == C1->getValue(); + auto IsSubSetOrEqual = [](const APInt *C1, const APInt *C2) { + return (*C1 & *C2) == *C1; }; - auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) { - return (C1->getValue() & C2->getValue()) == C2->getValue(); + auto IsSuperSetOrEqual = [](const APInt *C1, const APInt *C2) { + return (*C1 & *C2) == *C2; }; // In the following, we consider only the cases where B is a superset of D, B @@ -486,7 +457,7 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // For example, // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false. // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding. - if (ECst->isZero()) { + if (ECst.isZero()) { if (IsSubSetOrEqual(BCst, DCst)) return ConstantInt::get(LHS->getType(), !IsAnd); return nullptr; @@ -504,7 +475,7 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // ie. (B & E) != 0, then LHS is subsumed by RHS. For example. // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code"); - if ((BCst->getValue() & ECst->getValue()) != 0) + if ((*BCst & ECst) != 0) return RHS; // Otherwise, LHS and RHS contradict and the whole expression becomes false // (or true if negated.) For example, @@ -516,6 +487,7 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( /// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single /// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side /// aren't of the common mask pattern type. +/// Also used for logical and/or, must be poison safe. static Value *foldLogOpOfMaskedICmpsAsymmetric( ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, @@ -550,6 +522,7 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric( /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y). 
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, + bool IsLogical, InstCombiner::BuilderTy &Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); @@ -594,6 +567,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, if (Mask & Mask_AllZeros) { // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) // -> (icmp eq (A & (B|D)), 0) + if (IsLogical && !isGuaranteedNotToBeUndefOrPoison(D)) + return nullptr; // TODO: Use freeze? Value *NewOr = Builder.CreateOr(B, D); Value *NewAnd = Builder.CreateAnd(A, NewOr); // We can't use C as zero because we might actually handle @@ -605,6 +580,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, if (Mask & BMask_AllOnes) { // (icmp eq (A & B), B) & (icmp eq (A & D), D) // -> (icmp eq (A & (B|D)), (B|D)) + if (IsLogical && !isGuaranteedNotToBeUndefOrPoison(D)) + return nullptr; // TODO: Use freeze? Value *NewOr = Builder.CreateOr(B, D); Value *NewAnd = Builder.CreateAnd(A, NewOr); return Builder.CreateICmp(NewCC, NewAnd, NewOr); @@ -612,6 +589,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, if (Mask & AMask_AllOnes) { // (icmp eq (A & B), A) & (icmp eq (A & D), A) // -> (icmp eq (A & (B&D)), A) + if (IsLogical && !isGuaranteedNotToBeUndefOrPoison(D)) + return nullptr; // TODO: Use freeze? Value *NewAnd1 = Builder.CreateAnd(B, D); Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1); return Builder.CreateICmp(NewCC, NewAnd2, A); @@ -736,47 +715,6 @@ Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, return Builder.CreateICmp(NewPred, Input, RangeEnd); } -static Value * -foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS, - bool JoinedByAnd, - InstCombiner::BuilderTy &Builder) { - Value *X = LHS->getOperand(0); - if (X != RHS->getOperand(0)) - return nullptr; - - const APInt *C1, *C2; - if (!match(LHS->getOperand(1), m_APInt(C1)) || - !match(RHS->getOperand(1), m_APInt(C2))) - return nullptr; - - // We only handle (X != C1 && X != C2) and (X == C1 || X == C2). - ICmpInst::Predicate Pred = LHS->getPredicate(); - if (Pred != RHS->getPredicate()) - return nullptr; - if (JoinedByAnd && Pred != ICmpInst::ICMP_NE) - return nullptr; - if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ) - return nullptr; - - // The larger unsigned constant goes on the right. - if (C1->ugt(*C2)) - std::swap(C1, C2); - - APInt Xor = *C1 ^ *C2; - if (Xor.isPowerOf2()) { - // If LHSC and RHSC differ by only one bit, then set that bit in X and - // compare against the larger constant: - // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2 - // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2 - // We choose an 'or' with a Pow2 constant rather than the inverse mask with - // 'and' because that may lead to smaller codegen from a smaller constant. 
-    Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
-    return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
-  }
-
-  return nullptr;
-}
-
 // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
 // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
 Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
@@ -941,7 +879,29 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
                            CxtI.getName() + ".simplified");
 }
 
+/// Fold (icmp eq ctpop(X) 1) | (icmp eq X 0) into (icmp ult ctpop(X) 2) and
+/// fold (icmp ne ctpop(X) 1) & (icmp ne X 0) into (icmp ugt ctpop(X) 1).
+/// Also used for logical and/or, must be poison safe.
+static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd,
+                                   InstCombiner::BuilderTy &Builder) {
+  CmpInst::Predicate Pred0, Pred1;
+  Value *X;
+  if (!match(Cmp0, m_ICmp(Pred0, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
+                          m_SpecificInt(1))) ||
+      !match(Cmp1, m_ICmp(Pred1, m_Specific(X), m_ZeroInt())))
    return nullptr;
+
+  Value *CtPop = Cmp0->getOperand(0);
+  if (IsAnd && Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_NE)
+    return Builder.CreateICmpUGT(CtPop, ConstantInt::get(CtPop->getType(), 1));
+  if (!IsAnd && Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_EQ)
+    return Builder.CreateICmpULT(CtPop, ConstantInt::get(CtPop->getType(), 2));
+
+  return nullptr;
+}
+
 /// Reduce a pair of compares that check if a value has exactly 1 bit set.
+/// Also used for logical and/or, must be poison safe.
 static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
                              InstCombiner::BuilderTy &Builder) {
   // Handle 'and' / 'or' commutation: make the equality check the first operand.
@@ -1001,22 +961,13 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
   };
 
   // Given  ZeroCmpOp = (A + B)
-  //   ZeroCmpOp <=  A && ZeroCmpOp != 0  -->  (0-B) <  A
-  //   ZeroCmpOp  >  A || ZeroCmpOp == 0  -->  (0-B) >= A
-  //
   //   ZeroCmpOp <   A && ZeroCmpOp != 0  -->  (0-X) <  Y  iff
   //   ZeroCmpOp >=  A || ZeroCmpOp == 0  -->  (0-X) >= Y  iff
   //   with X being the value (A/B) that is known to be non-zero,
   //   and Y being remaining value.
-  if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
-      IsAnd)
-    return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
   if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE &&
       IsAnd && GetKnownNonZeroAndOther(B, A))
     return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
-  if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
-      !IsAnd)
-    return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
   if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ &&
       !IsAnd && GetKnownNonZeroAndOther(B, A))
     return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
@@ -1143,12 +1094,9 @@ Value *InstCombinerImpl::foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1,
 /// common operand with the constant. Callers are expected to call this with
 /// Cmp0/Cmp1 switched to handle logic op commutativity.
 static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
-                                          BinaryOperator &Logic,
+                                          bool IsAnd,
                                           InstCombiner::BuilderTy &Builder,
                                           const SimplifyQuery &Q) {
-  bool IsAnd = Logic.getOpcode() == Instruction::And;
-  assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
-
   // Match an equality compare with a non-poison constant as Cmp0.
   // Also, give up if the compare can be constant-folded to avoid looping.
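// Standalone check (illustration only) of the ctpop fold added in
// foldIsPowerOf2OrZero above; std::popcount stands in for @llvm.ctpop:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < (1u << 16); ++X) {
    unsigned P = std::popcount(X);
    // (ctpop(X) == 1) | (X == 0)  -->  ctpop(X) u< 2
    assert(((P == 1) || (X == 0)) == (P < 2));
    // (ctpop(X) != 1) & (X != 0)  -->  ctpop(X) u> 1
    assert(((P != 1) && (X != 0)) == (P > 1));
  }
  return 0;
}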
ICmpInst::Predicate Pred0; @@ -1174,7 +1122,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C) // Can think of the 'or' substitution with the 'and' bool equivalent: // A || B --> A || (!A && B) - Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q); + Value *SubstituteCmp = simplifyICmpInst(Pred1, Y, C, Q); if (!SubstituteCmp) { // If we need to create a new instruction, require that the old compare can // be removed. @@ -1182,16 +1130,24 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; SubstituteCmp = Builder.CreateICmp(Pred1, Y, C); } - return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp); + return Builder.CreateBinOp(IsAnd ? Instruction::And : Instruction::Or, Cmp0, + SubstituteCmp); } /// Fold (icmp Pred1 V1, C1) & (icmp Pred2 V2, C2) /// or (icmp Pred1 V1, C1) | (icmp Pred2 V2, C2) /// into a single comparison using range-based reasoning. -static Value *foldAndOrOfICmpsUsingRanges( - ICmpInst::Predicate Pred1, Value *V1, const APInt &C1, - ICmpInst::Predicate Pred2, Value *V2, const APInt &C2, - IRBuilderBase &Builder, bool IsAnd) { +/// NOTE: This is also used for logical and/or, must be poison-safe! +Value *InstCombinerImpl::foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, + ICmpInst *ICmp2, + bool IsAnd) { + ICmpInst::Predicate Pred1, Pred2; + Value *V1, *V2; + const APInt *C1, *C2; + if (!match(ICmp1, m_ICmp(Pred1, m_Value(V1), m_APInt(C1))) || + !match(ICmp2, m_ICmp(Pred2, m_Value(V2), m_APInt(C2)))) + return nullptr; + // Look through add of a constant offset on V1, V2, or both operands. This // allows us to interpret the V + C' < C'' range idiom into a proper range. const APInt *Offset1 = nullptr, *Offset2 = nullptr; @@ -1206,152 +1162,51 @@ static Value *foldAndOrOfICmpsUsingRanges( if (V1 != V2) return nullptr; - ConstantRange CR1 = ConstantRange::makeExactICmpRegion(Pred1, C1); + ConstantRange CR1 = ConstantRange::makeExactICmpRegion( + IsAnd ? ICmpInst::getInversePredicate(Pred1) : Pred1, *C1); if (Offset1) CR1 = CR1.subtract(*Offset1); - ConstantRange CR2 = ConstantRange::makeExactICmpRegion(Pred2, C2); + ConstantRange CR2 = ConstantRange::makeExactICmpRegion( + IsAnd ? ICmpInst::getInversePredicate(Pred2) : Pred2, *C2); if (Offset2) CR2 = CR2.subtract(*Offset2); - Optional CR = - IsAnd ? CR1.exactIntersectWith(CR2) : CR1.exactUnionWith(CR2); - if (!CR) - return nullptr; - - CmpInst::Predicate NewPred; - APInt NewC, Offset; - CR->getEquivalentICmp(NewPred, NewC, Offset); - Type *Ty = V1->getType(); Value *NewV = V1; - if (Offset != 0) - NewV = Builder.CreateAdd(NewV, ConstantInt::get(Ty, Offset)); - return Builder.CreateICmp(NewPred, NewV, ConstantInt::get(Ty, NewC)); -} - -/// Fold (icmp)&(icmp) if possible. -Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &And) { - const SimplifyQuery Q = SQ.getWithInstruction(&And); - - // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) - // if K1 and K2 are a one-bit mask. 
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &And, - /* IsAnd */ true)) - return V; - - ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); - - // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) - if (predicatesFoldable(PredL, PredR)) { - if (LHS->getOperand(0) == RHS->getOperand(1) && - LHS->getOperand(1) == RHS->getOperand(0)) - LHS->swapOperands(); - if (LHS->getOperand(0) == RHS->getOperand(0) && - LHS->getOperand(1) == RHS->getOperand(1)) { - Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); - unsigned Code = getICmpCode(LHS) & getICmpCode(RHS); - bool IsSigned = LHS->isSigned() || RHS->isSigned(); - return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); - } - } - - // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder)) - return V; - - if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q)) - return V; - if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q)) - return V; - - // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n - if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false)) - return V; - - // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n - if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false)) - return V; - - if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder)) - return V; - - if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder)) - return V; - - if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder)) - return V; - - if (Value *X = - foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder)) - return X; - if (Value *X = - foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder)) - return X; - - if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/true)) - return X; + Optional CR = CR1.exactUnionWith(CR2); + if (!CR) { + if (!(ICmp1->hasOneUse() && ICmp2->hasOneUse()) || CR1.isWrappedSet() || + CR2.isWrappedSet()) + return nullptr; - // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). - Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); + // Check whether we have equal-size ranges that only differ by one bit. + // In that case we can apply a mask to map one range onto the other. + APInt LowerDiff = CR1.getLower() ^ CR2.getLower(); + APInt UpperDiff = (CR1.getUpper() - 1) ^ (CR2.getUpper() - 1); + APInt CR1Size = CR1.getUpper() - CR1.getLower(); + if (!LowerDiff.isPowerOf2() || LowerDiff != UpperDiff || + CR1Size != CR2.getUpper() - CR2.getLower()) + return nullptr; - // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) - // TODO: Remove this when foldLogOpOfMaskedICmps can handle undefs. - if (PredL == ICmpInst::ICMP_EQ && match(LHS->getOperand(1), m_ZeroInt()) && - PredR == ICmpInst::ICMP_EQ && match(RHS->getOperand(1), m_ZeroInt()) && - LHS0->getType() == RHS0->getType()) { - Value *NewOr = Builder.CreateOr(LHS0, RHS0); - return Builder.CreateICmp(PredL, NewOr, - Constant::getNullValue(NewOr->getType())); + CR = CR1.getLower().ult(CR2.getLower()) ? CR1 : CR2; + NewV = Builder.CreateAnd(NewV, ConstantInt::get(Ty, ~LowerDiff)); } - const APInt *LHSC, *RHSC; - if (!match(LHS->getOperand(1), m_APInt(LHSC)) || - !match(RHS->getOperand(1), m_APInt(RHSC))) - return nullptr; - - // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 - // where CMAX is the all ones value for the truncated type, - // iff the lower bits of C2 and CA are zero. 
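// Worked example (illustration only; constants chosen for the demo) of the
// "equal-size ranges that only differ by one bit" mapping added above: for
// x u< 8 or x in [16,24), LowerDiff = 0 ^ 16 and UpperDiff = 7 ^ 23 are both
// 16, so clearing that bit maps one range onto the other:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 64; ++X) {
    bool TwoRanges = (X < 8) || (X >= 16 && X < 24); // [0,8) u [16,24)
    bool Masked = (X & ~16u) < 8;                    // X & ~LowerDiff
    assert(TwoRanges == Masked);
  }
  return 0;
}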
- if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() && - RHS->hasOneUse()) { - Value *V; - const APInt *AndC, *SmallC = nullptr, *BigC = nullptr; - - // (trunc x) == C1 & (and x, CA) == C2 - // (and x, CA) == C2 & (trunc x) == C1 - if (match(RHS0, m_Trunc(m_Value(V))) && - match(LHS0, m_And(m_Specific(V), m_APInt(AndC)))) { - SmallC = RHSC; - BigC = LHSC; - } else if (match(LHS0, m_Trunc(m_Value(V))) && - match(RHS0, m_And(m_Specific(V), m_APInt(AndC)))) { - SmallC = LHSC; - BigC = RHSC; - } - - if (SmallC && BigC) { - unsigned BigBitSize = BigC->getBitWidth(); - unsigned SmallBitSize = SmallC->getBitWidth(); + if (IsAnd) + CR = CR->inverse(); - // Check that the low bits are zero. - APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize); - if ((Low & *AndC).isZero() && (Low & *BigC).isZero()) { - Value *NewAnd = Builder.CreateAnd(V, Low | *AndC); - APInt N = SmallC->zext(BigBitSize) | *BigC; - Value *NewVal = ConstantInt::get(NewAnd->getType(), N); - return Builder.CreateICmp(PredL, NewAnd, NewVal); - } - } - } + CmpInst::Predicate NewPred; + APInt NewC, Offset; + CR->getEquivalentICmp(NewPred, NewC, Offset); - return foldAndOrOfICmpsUsingRanges(PredL, LHS0, *LHSC, PredR, RHS0, *RHSC, - Builder, /* IsAnd */ true); + if (Offset != 0) + NewV = Builder.CreateAdd(NewV, ConstantInt::get(Ty, Offset)); + return Builder.CreateICmp(NewPred, NewV, ConstantInt::get(Ty, NewC)); } Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, - bool IsAnd) { + bool IsAnd, bool IsLogicalSelect) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); @@ -1380,11 +1235,22 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, unsigned FCmpCodeL = getFCmpCode(PredL); unsigned FCmpCodeR = getFCmpCode(PredR); unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR; + + // Intersect the fast math flags. + // TODO: We can union the fast math flags unless this is a logical select. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + FastMathFlags FMF = LHS->getFastMathFlags(); + FMF &= RHS->getFastMathFlags(); + Builder.setFastMathFlags(FMF); + return getFCmpValue(NewPred, LHS0, LHS1, Builder); } - if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || - (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) { + // This transform is not valid for a logical select. + if (!IsLogicalSelect && + ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || + (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && + !IsAnd))) { if (LHS0->getType() != RHS0->getType()) return nullptr; @@ -1574,9 +1440,10 @@ Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) { Value *Cast1Src = Cast1->getOperand(0); // fold logic(cast(A), cast(B)) -> cast(logic(A, B)) - if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) { + if ((Cast0->hasOneUse() || Cast1->hasOneUse()) && + shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) { Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src, - I.getName()); + I.getName()); return CastInst::Create(CastOpcode, NewOp, DestTy); } @@ -1589,9 +1456,8 @@ Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) { ICmpInst *ICmp0 = dyn_cast(Cast0Src); ICmpInst *ICmp1 = dyn_cast(Cast1Src); if (ICmp0 && ICmp1) { - Value *Res = LogicOpc == Instruction::And ? 
foldAndOfICmps(ICmp0, ICmp1, I) - : foldOrOfICmps(ICmp0, ICmp1, I); - if (Res) + if (Value *Res = + foldAndOrOfICmps(ICmp0, ICmp1, I, LogicOpc == Instruction::And)) return CastInst::Create(CastOpcode, Res, DestTy); return nullptr; } @@ -1862,7 +1728,7 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { Type *Ty = I.getType(); - if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyAndInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1930,25 +1796,6 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { return BinaryOperator::CreateOr(And, ConstantInt::get(Ty, Together)); } - // If the mask is only needed on one incoming arm, push the 'and' op up. - if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) || - match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { - APInt NotAndMask(~(*C)); - BinaryOperator::BinaryOps BinOp = cast(Op0)->getOpcode(); - if (MaskedValueIsZero(X, NotAndMask, 0, &I)) { - // Not masking anything out for the LHS, move mask to RHS. - // and ({x}or X, Y), C --> {x}or X, (and Y, C) - Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked"); - return BinaryOperator::Create(BinOp, X, NewRHS); - } - if (!isa(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) { - // Not masking anything out for the RHS, move mask to LHS. - // and ({x}or X, Y), C --> {x}or (and X, C), Y - Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked"); - return BinaryOperator::Create(BinOp, NewLHS, Y); - } - } - unsigned Width = Ty->getScalarSizeInBits(); const APInt *ShiftC; if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) { @@ -1961,6 +1808,12 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { } } + // If this 'and' clears the sign-bits added by ashr, replace with lshr: + // and (ashr X, ShiftC), C --> lshr X, ShiftC + if (match(Op0, m_AShr(m_Value(X), m_APInt(ShiftC))) && ShiftC->ult(Width) && + C->isMask(Width - ShiftC->getZExtValue())) + return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, *ShiftC)); + const APInt *AddC; if (match(Op0, m_Add(m_Value(X), m_APInt(AddC)))) { // If we add zeros to every bit below a mask, the add has no effect: @@ -1983,7 +1836,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { // ((C1 OP zext(X)) & C2) -> zext((C1 OP X) & C2) if C2 fits in the // bitwidth of X and OP behaves well when given trunc(C1) and X. - auto isSuitableBinOpcode = [](BinaryOperator *B) { + auto isNarrowableBinOpcode = [](BinaryOperator *B) { switch (B->getOpcode()) { case Instruction::Xor: case Instruction::Or: @@ -1996,22 +1849,125 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { } }; BinaryOperator *BO; - if (match(Op0, m_OneUse(m_BinOp(BO))) && isSuitableBinOpcode(BO)) { + if (match(Op0, m_OneUse(m_BinOp(BO))) && isNarrowableBinOpcode(BO)) { + Instruction::BinaryOps BOpcode = BO->getOpcode(); Value *X; const APInt *C1; // TODO: The one-use restrictions could be relaxed a little if the AND // is going to be removed. + // Try to narrow the 'and' and a binop with constant operand: + // and (bo (zext X), C1), C --> zext (and (bo X, TruncC1), TruncC) if (match(BO, m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))), m_APInt(C1))) && C->isIntN(X->getType()->getScalarSizeInBits())) { unsigned XWidth = X->getType()->getScalarSizeInBits(); Constant *TruncC1 = ConstantInt::get(X->getType(), C1->trunc(XWidth)); Value *BinOp = isa(BO->getOperand(0)) - ? 
Builder.CreateBinOp(BO->getOpcode(), X, TruncC1) - : Builder.CreateBinOp(BO->getOpcode(), TruncC1, X); + ? Builder.CreateBinOp(BOpcode, X, TruncC1) + : Builder.CreateBinOp(BOpcode, TruncC1, X); Constant *TruncC = ConstantInt::get(X->getType(), C->trunc(XWidth)); Value *And = Builder.CreateAnd(BinOp, TruncC); return new ZExtInst(And, Ty); } + + // Similar to above: if the mask matches the zext input width, then the + // 'and' can be eliminated, so we can truncate the other variable op: + // and (bo (zext X), Y), C --> zext (bo X, (trunc Y)) + if (isa(BO->getOperand(0)) && + match(BO->getOperand(0), m_OneUse(m_ZExt(m_Value(X)))) && + C->isMask(X->getType()->getScalarSizeInBits())) { + Y = BO->getOperand(1); + Value *TrY = Builder.CreateTrunc(Y, X->getType(), Y->getName() + ".tr"); + Value *NewBO = + Builder.CreateBinOp(BOpcode, X, TrY, BO->getName() + ".narrow"); + return new ZExtInst(NewBO, Ty); + } + // and (bo Y, (zext X)), C --> zext (bo (trunc Y), X) + if (isa(BO->getOperand(1)) && + match(BO->getOperand(1), m_OneUse(m_ZExt(m_Value(X)))) && + C->isMask(X->getType()->getScalarSizeInBits())) { + Y = BO->getOperand(0); + Value *TrY = Builder.CreateTrunc(Y, X->getType(), Y->getName() + ".tr"); + Value *NewBO = + Builder.CreateBinOp(BOpcode, TrY, X, BO->getName() + ".narrow"); + return new ZExtInst(NewBO, Ty); + } + } + + // This is intentionally placed after the narrowing transforms for + // efficiency (transform directly to the narrow logic op if possible). + // If the mask is only needed on one incoming arm, push the 'and' op up. + if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) || + match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + APInt NotAndMask(~(*C)); + BinaryOperator::BinaryOps BinOp = cast(Op0)->getOpcode(); + if (MaskedValueIsZero(X, NotAndMask, 0, &I)) { + // Not masking anything out for the LHS, move mask to RHS. + // and ({x}or X, Y), C --> {x}or X, (and Y, C) + Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked"); + return BinaryOperator::Create(BinOp, X, NewRHS); + } + if (!isa(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) { + // Not masking anything out for the RHS, move mask to LHS. + // and ({x}or X, Y), C --> {x}or (and X, C), Y + Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked"); + return BinaryOperator::Create(BinOp, NewLHS, Y); + } + } + + // When the mask is a power-of-2 constant and op0 is a shifted-power-of-2 + // constant, test if the shift amount equals the offset bit index: + // (ShiftC << X) & C --> X == (log2(C) - log2(ShiftC)) ? C : 0 + // (ShiftC >> X) & C --> X == (log2(ShiftC) - log2(C)) ? C : 0 + if (C->isPowerOf2() && + match(Op0, m_OneUse(m_LogicalShift(m_Power2(ShiftC), m_Value(X))))) { + int Log2ShiftC = ShiftC->exactLogBase2(); + int Log2C = C->exactLogBase2(); + bool IsShiftLeft = + cast(Op0)->getOpcode() == Instruction::Shl; + int BitNum = IsShiftLeft ? 
Log2C - Log2ShiftC : Log2ShiftC - Log2C; + assert(BitNum >= 0 && "Expected demanded bits to handle impossible mask"); + Value *Cmp = Builder.CreateICmpEQ(X, ConstantInt::get(Ty, BitNum)); + return SelectInst::Create(Cmp, ConstantInt::get(Ty, *C), + ConstantInt::getNullValue(Ty)); + } + + Constant *C1, *C2; + const APInt *C3 = C; + Value *X; + if (C3->isPowerOf2()) { + Constant *Log2C3 = ConstantInt::get(Ty, C3->countTrailingZeros()); + if (match(Op0, m_OneUse(m_LShr(m_Shl(m_ImmConstant(C1), m_Value(X)), + m_ImmConstant(C2)))) && + match(C1, m_Power2())) { + Constant *Log2C1 = ConstantExpr::getExactLogBase2(C1); + Constant *LshrC = ConstantExpr::getAdd(C2, Log2C3); + KnownBits KnownLShrc = computeKnownBits(LshrC, 0, nullptr); + if (KnownLShrc.getMaxValue().ult(Width)) { + // iff C1,C3 is pow2 and C2 + cttz(C3) < BitWidth: + // ((C1 << X) >> C2) & C3 -> X == (cttz(C3)+C2-cttz(C1)) ? C3 : 0 + Constant *CmpC = ConstantExpr::getSub(LshrC, Log2C1); + Value *Cmp = Builder.CreateICmpEQ(X, CmpC); + return SelectInst::Create(Cmp, ConstantInt::get(Ty, *C3), + ConstantInt::getNullValue(Ty)); + } + } + + if (match(Op0, m_OneUse(m_Shl(m_LShr(m_ImmConstant(C1), m_Value(X)), + m_ImmConstant(C2)))) && + match(C1, m_Power2())) { + Constant *Log2C1 = ConstantExpr::getExactLogBase2(C1); + Constant *Cmp = + ConstantExpr::getCompare(ICmpInst::ICMP_ULT, Log2C3, C2); + if (Cmp->isZeroValue()) { + // iff C1,C3 is pow2 and Log2(C3) >= C2: + // ((C1 >> X) << C2) & C3 -> X == (cttz(C1)+C2-cttz(C3)) ? C3 : 0 + Constant *ShlC = ConstantExpr::getAdd(C2, Log2C1); + Constant *CmpC = ConstantExpr::getSub(ShlC, Log2C3); + Value *Cmp = Builder.CreateICmpEQ(X, CmpC); + return SelectInst::Create(Cmp, ConstantInt::get(Ty, *C3), + ConstantInt::getNullValue(Ty)); + } + } } } @@ -2121,32 +2077,50 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { ICmpInst *LHS = dyn_cast(Op0); ICmpInst *RHS = dyn_cast(Op1); if (LHS && RHS) - if (Value *Res = foldAndOfICmps(LHS, RHS, I)) + if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ true)) return replaceInstUsesWith(I, Res); // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'and' instructions might have to be created. - if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { + if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op1); + // LHS & (X && Y) --> (LHS && X) && Y if (auto *Cmp = dyn_cast(X)) - if (Value *Res = foldAndOfICmps(LHS, Cmp, I)) - return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y)); + if (Value *Res = + foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical + ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // LHS & (X && Y) --> X && (LHS & Y) if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOfICmps(LHS, Cmp, I)) - return replaceInstUsesWith(I, Builder.CreateAnd(Res, X)); - } - if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { + if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical + ? 
Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); + } + if (RHS && match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op0); + // (X && Y) & RHS --> (X && RHS) && Y if (auto *Cmp = dyn_cast(X)) - if (Value *Res = foldAndOfICmps(Cmp, RHS, I)) - return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y)); + if (Value *Res = + foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical + ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // (X && Y) & RHS --> X && (Y & RHS) if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOfICmps(Cmp, RHS, I)) - return replaceInstUsesWith(I, Builder.CreateAnd(Res, X)); + if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical + ? Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); } } if (FCmpInst *LHS = dyn_cast(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast(I.getOperand(1))) - if (Value *Res = foldLogicOfFCmps(LHS, RHS, true)) + if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true)) return replaceInstUsesWith(I, Res); if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) @@ -2175,18 +2149,16 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { unsigned FullShift = Ty->getScalarSizeInBits() - 1; if (match(&I, m_c_And(m_OneUse(m_AShr(m_Value(X), m_SpecificInt(FullShift))), m_Value(Y)))) { - Constant *Zero = ConstantInt::getNullValue(Ty); - Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); - return SelectInst::Create(Cmp, Y, Zero); + Value *IsNeg = Builder.CreateIsNeg(X, "isneg"); + return SelectInst::Create(IsNeg, Y, ConstantInt::getNullValue(Ty)); } // If there's a 'not' of the shifted value, swap the select operands: // ~(iN X s>> (N-1)) & Y --> (X s< 0) ? 0 : Y if (match(&I, m_c_And(m_OneUse(m_Not( m_AShr(m_Value(X), m_SpecificInt(FullShift)))), m_Value(Y)))) { - Constant *Zero = ConstantInt::getNullValue(Ty); - Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); - return SelectInst::Create(Cmp, Zero, Y); + Value *IsNeg = Builder.CreateIsNeg(X, "isneg"); + return SelectInst::Create(IsNeg, ConstantInt::getNullValue(Ty), Y); } // (~x) & y --> ~(x | (~y)) iff that gets rid of inversions @@ -2482,8 +2454,12 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B, // not create unnecessary casts if the types already match. Type *SelTy = A->getType(); if (auto *VecTy = dyn_cast(Cond->getType())) { + // For a fixed or scalable vector get N from <{vscale x} N x iM> unsigned Elts = VecTy->getElementCount().getKnownMinValue(); - Type *EltTy = Builder.getIntNTy(SelTy->getPrimitiveSizeInBits() / Elts); + // For a fixed or scalable vector, get the size in bits of N x iM; for a + // scalar this is just M. + unsigned SelEltSize = SelTy->getPrimitiveSizeInBits().getKnownMinSize(); + Type *EltTy = Builder.getIntNTy(SelEltSize / Elts); SelTy = VectorType::get(EltTy, VecTy->getElementCount()); } Value *BitcastC = Builder.CreateBitCast(C, SelTy); @@ -2495,15 +2471,46 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B, return nullptr; } -/// Fold (icmp)|(icmp) if possible. 
-Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &Or) { - const SimplifyQuery Q = SQ.getWithInstruction(&Or); +// (icmp eq X, 0) | (icmp ult Other, X) -> (icmp ule Other, X-1) +// (icmp ne X, 0) & (icmp uge Other, X) -> (icmp ugt Other, X-1) +Value *foldAndOrOfICmpEqZeroAndICmp(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, + IRBuilderBase &Builder) { + ICmpInst::Predicate LPred = + IsAnd ? LHS->getInversePredicate() : LHS->getPredicate(); + ICmpInst::Predicate RPred = + IsAnd ? RHS->getInversePredicate() : RHS->getPredicate(); + Value *LHS0 = LHS->getOperand(0); + if (LPred != ICmpInst::ICMP_EQ || !match(LHS->getOperand(1), m_Zero()) || + !LHS0->getType()->isIntOrIntVectorTy() || + !(LHS->hasOneUse() || RHS->hasOneUse())) + return nullptr; + + Value *Other; + if (RPred == ICmpInst::ICMP_ULT && RHS->getOperand(1) == LHS0) + Other = RHS->getOperand(0); + else if (RPred == ICmpInst::ICMP_UGT && RHS->getOperand(0) == LHS0) + Other = RHS->getOperand(1); + else + return nullptr; + + return Builder.CreateICmp( + IsAnd ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE, + Builder.CreateAdd(LHS0, Constant::getAllOnesValue(LHS0->getType())), + Other); +} + +/// Fold (icmp)&(icmp) or (icmp)|(icmp) if possible. +/// If IsLogical is true, then the and/or is in select form and the transform +/// must be poison-safe. +Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, + Instruction &I, bool IsAnd, + bool IsLogical) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) + // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) // if K1 and K2 are a one-bit mask. - if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &Or, - /* IsAnd */ false)) + if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical)) return V; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); @@ -2513,64 +2520,16 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, match(LHS1, m_APInt(LHSC)); match(RHS1, m_APInt(RHSC)); - // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3) - // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3) - // The original condition actually refers to the following two ranges: - // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3] - // We can fold these two ranges if: - // 1) C1 and C2 is unsigned greater than C3. - // 2) The two ranges are separated. - // 3) C1 ^ C2 is one-bit mask. - // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit mask. - // This implies all values in the two ranges differ by exactly one bit. 
- if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) && - PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() && - LHSC->getBitWidth() == RHSC->getBitWidth() && *LHSC == *RHSC) { - - Value *AddOpnd; - const APInt *LAddC, *RAddC; - if (match(LHS0, m_Add(m_Value(AddOpnd), m_APInt(LAddC))) && - match(RHS0, m_Add(m_Specific(AddOpnd), m_APInt(RAddC))) && - LAddC->ugt(*LHSC) && RAddC->ugt(*LHSC)) { - - APInt DiffC = *LAddC ^ *RAddC; - if (DiffC.isPowerOf2()) { - const APInt *MaxAddC = nullptr; - if (LAddC->ult(*RAddC)) - MaxAddC = RAddC; - else - MaxAddC = LAddC; - - APInt RRangeLow = -*RAddC; - APInt RRangeHigh = RRangeLow + *LHSC; - APInt LRangeLow = -*LAddC; - APInt LRangeHigh = LRangeLow + *LHSC; - APInt LowRangeDiff = RRangeLow ^ LRangeLow; - APInt HighRangeDiff = RRangeHigh ^ LRangeHigh; - APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow - : RRangeLow - LRangeLow; - - if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff && - RangeDiff.ugt(*LHSC)) { - Type *Ty = AddOpnd->getType(); - Value *MaskC = ConstantInt::get(Ty, ~DiffC); - - Value *NewAnd = Builder.CreateAnd(AddOpnd, MaskC); - Value *NewAdd = Builder.CreateAdd(NewAnd, - ConstantInt::get(Ty, *MaxAddC)); - return Builder.CreateICmp(LHS->getPredicate(), NewAdd, - ConstantInt::get(Ty, *LHSC)); - } - } - } - } - // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) + // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) if (predicatesFoldable(PredL, PredR)) { - if (LHS0 == RHS1 && LHS1 == RHS0) - LHS->swapOperands(); + if (LHS0 == RHS1 && LHS1 == RHS0) { + PredL = ICmpInst::getSwappedPredicate(PredL); + std::swap(LHS0, LHS1); + } if (LHS0 == RHS0 && LHS1 == RHS1) { - unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); + unsigned Code = IsAnd ? getICmpCode(PredL) & getICmpCode(PredR) + : getICmpCode(PredL) | getICmpCode(PredR); bool IsSigned = LHS->isSigned() || RHS->isSigned(); return getNewICmpValue(Code, IsSigned, LHS0, LHS1, Builder); } @@ -2578,68 +2537,70 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder)) + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder)) return V; - if (LHS->hasOneUse() || RHS->hasOneUse()) { - // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) - // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) - Value *A = nullptr, *B = nullptr; - if (PredL == ICmpInst::ICMP_EQ && match(LHS1, m_Zero())) { - B = LHS0; - if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS1) - A = RHS0; - else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0) - A = RHS1; - } - // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1) - // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1) - else if (PredR == ICmpInst::ICMP_EQ && match(RHS1, m_Zero())) { - B = RHS0; - if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS1) - A = LHS0; - else if (PredL == ICmpInst::ICMP_UGT && RHS0 == LHS0) - A = LHS1; - } - if (A && B && B->getType()->isIntOrIntVectorTy()) - return Builder.CreateICmp( - ICmpInst::ICMP_UGE, - Builder.CreateAdd(B, Constant::getAllOnesValue(B->getType())), A); - } - - if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q)) + // TODO: One of these directions is fine with logical and/or, the other could + // be supported by inserting freeze. 
+ if (!IsLogical) { + if (Value *V = foldAndOrOfICmpEqZeroAndICmp(LHS, RHS, IsAnd, Builder)) + return V; + if (Value *V = foldAndOrOfICmpEqZeroAndICmp(RHS, LHS, IsAnd, Builder)) + return V; + } + + // TODO: Verify whether this is safe for logical and/or. + if (!IsLogical) { + if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, Builder, Q)) + return V; + if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd, Builder, Q)) + return V; + } + + if (Value *V = foldIsPowerOf2OrZero(LHS, RHS, IsAnd, Builder)) return V; - if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q)) + if (Value *V = foldIsPowerOf2OrZero(RHS, LHS, IsAnd, Builder)) return V; - // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n - if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true)) - return V; + // TODO: One of these directions is fine with logical and/or, the other could + // be supported by inserting freeze. + if (!IsLogical) { + // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n + // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/!IsAnd)) + return V; - // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n - if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true)) - return V; + // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n + // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/!IsAnd)) + return V; + } - if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder)) - return V; + // TODO: Add conjugated or fold, check whether it is safe for logical and/or. + if (IsAnd && !IsLogical) + if (Value *V = foldSignedTruncationCheck(LHS, RHS, I, Builder)) + return V; - if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder)) + if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder)) return V; - if (Value *X = - foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder)) - return X; - if (Value *X = - foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder)) - return X; + // TODO: Verify whether this is safe for logical and/or. + if (!IsLogical) { + if (Value *X = foldUnsignedUnderflowCheck(LHS, RHS, IsAnd, Q, Builder)) + return X; + if (Value *X = foldUnsignedUnderflowCheck(RHS, LHS, IsAnd, Q, Builder)) + return X; + } - if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/false)) + if (Value *X = foldEqOfParts(LHS, RHS, IsAnd)) return X; // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) + // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) // TODO: Remove this when foldLogOpOfMaskedICmps can handle undefs. - if (PredL == ICmpInst::ICMP_NE && match(LHS1, m_ZeroInt()) && - PredR == ICmpInst::ICMP_NE && match(RHS1, m_ZeroInt()) && + if (!IsLogical && PredL == (IsAnd ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && + PredL == PredR && match(LHS1, m_ZeroInt()) && match(RHS1, m_ZeroInt()) && LHS0->getType() == RHS0->getType()) { Value *NewOr = Builder.CreateOr(LHS0, RHS0); return Builder.CreateICmp(PredL, NewOr, @@ -2650,15 +2611,83 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (!LHSC || !RHSC) return nullptr; - return foldAndOrOfICmpsUsingRanges(PredL, LHS0, *LHSC, PredR, RHS0, *RHSC, - Builder, /* IsAnd */ false); + // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 + // (trunc x) != C1 | (and x, CA) != C2 -> (and x, CA|CMAX) != C1|C2 + // where CMAX is the all ones value for the truncated type, + // iff the lower bits of C2 and CA are zero. + if (PredL == (IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && + PredL == PredR && LHS->hasOneUse() && RHS->hasOneUse()) { + Value *V; + const APInt *AndC, *SmallC = nullptr, *BigC = nullptr; + + // (trunc x) == C1 & (and x, CA) == C2 + // (and x, CA) == C2 & (trunc x) == C1 + if (match(RHS0, m_Trunc(m_Value(V))) && + match(LHS0, m_And(m_Specific(V), m_APInt(AndC)))) { + SmallC = RHSC; + BigC = LHSC; + } else if (match(LHS0, m_Trunc(m_Value(V))) && + match(RHS0, m_And(m_Specific(V), m_APInt(AndC)))) { + SmallC = LHSC; + BigC = RHSC; + } + + if (SmallC && BigC) { + unsigned BigBitSize = BigC->getBitWidth(); + unsigned SmallBitSize = SmallC->getBitWidth(); + + // Check that the low bits are zero. + APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize); + if ((Low & *AndC).isZero() && (Low & *BigC).isZero()) { + Value *NewAnd = Builder.CreateAnd(V, Low | *AndC); + APInt N = SmallC->zext(BigBitSize) | *BigC; + Value *NewVal = ConstantInt::get(NewAnd->getType(), N); + return Builder.CreateICmp(PredL, NewAnd, NewVal); + } + } + } + + // Match naive pattern (and its inverted form) for checking if two values + // share same sign. An example of the pattern: + // (icmp slt (X & Y), 0) | (icmp sgt (X | Y), -1) -> (icmp sgt (X ^ Y), -1) + // Inverted form (example): + // (icmp slt (X | Y), 0) & (icmp sgt (X & Y), -1) -> (icmp slt (X ^ Y), 0) + bool TrueIfSignedL, TrueIfSignedR; + if (InstCombiner::isSignBitCheck(PredL, *LHSC, TrueIfSignedL) && + InstCombiner::isSignBitCheck(PredR, *RHSC, TrueIfSignedR) && + (RHS->hasOneUse() || LHS->hasOneUse())) { + Value *X, *Y; + if (IsAnd) { + if ((TrueIfSignedL && !TrueIfSignedR && + match(LHS0, m_Or(m_Value(X), m_Value(Y))) && + match(RHS0, m_c_And(m_Specific(X), m_Specific(Y)))) || + (!TrueIfSignedL && TrueIfSignedR && + match(LHS0, m_And(m_Value(X), m_Value(Y))) && + match(RHS0, m_c_Or(m_Specific(X), m_Specific(Y))))) { + Value *NewXor = Builder.CreateXor(X, Y); + return Builder.CreateIsNeg(NewXor); + } + } else { + if ((TrueIfSignedL && !TrueIfSignedR && + match(LHS0, m_And(m_Value(X), m_Value(Y))) && + match(RHS0, m_c_Or(m_Specific(X), m_Specific(Y)))) || + (!TrueIfSignedL && TrueIfSignedR && + match(LHS0, m_Or(m_Value(X), m_Value(Y))) && + match(RHS0, m_c_And(m_Specific(X), m_Specific(Y))))) { + Value *NewXor = Builder.CreateXor(X, Y); + return Builder.CreateIsNotNeg(NewXor); + } + } + } + + return foldAndOrOfICmpsUsingRanges(LHS, RHS, IsAnd); } // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. 
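// The shared-sign pattern added above is easy to sanity-check outside the
// compiler. A minimal standalone C++ check (illustrative sketch only, not part
// of the patch; i8 stands in for any integer width):

#include <cassert>
#include <cstdint>

int main() {
  for (int X = -128; X < 128; ++X) {
    for (int Y = -128; Y < 128; ++Y) {
      int8_t A = (int8_t)X, B = (int8_t)Y;
      // (icmp slt (X & Y), 0) | (icmp sgt (X | Y), -1)
      bool Orig = ((int8_t)(A & B) < 0) || ((int8_t)(A | B) > -1);
      // (icmp sgt (X ^ Y), -1)
      bool Folded = (int8_t)(A ^ B) > -1;
      assert(Orig == Folded && "same-sign fold must be equivalent");
    }
  }
  return 0;
}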
Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
-  if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
+  if (Value *V = simplifyOrInst(I.getOperand(0), I.getOperand(1),
                                 SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
 
@@ -2824,6 +2853,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
     if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
       return BinaryOperator::CreateOr(Op1, C);
 
+  // ((A & B) ^ C) | B -> C | B
+  if (match(Op0, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op1)), m_Value(C))))
+    return BinaryOperator::CreateOr(C, Op1);
+
+  // B | ((A & B) ^ C) -> B | C
+  if (match(Op1, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op0)), m_Value(C))))
+    return BinaryOperator::CreateOr(Op0, C);
+
   // ((B | C) & A) | B -> B | (A & C)
   if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
     return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
@@ -2885,33 +2922,51 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
   ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
   if (LHS && RHS)
-    if (Value *Res = foldOrOfICmps(LHS, RHS, I))
+    if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ false))
       return replaceInstUsesWith(I, Res);
 
   // TODO: Make this recursive; it's a little tricky because an arbitrary
   // number of 'or' instructions might have to be created.
   Value *X, *Y;
-  if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+  if (LHS && match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) {
+    bool IsLogical = isa<SelectInst>(Op1);
+    // LHS | (X || Y) --> (LHS || X) || Y
     if (auto *Cmp = dyn_cast<ICmpInst>(X))
-      if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
-        return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+      if (Value *Res =
+              foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false, IsLogical))
+        return replaceInstUsesWith(I, IsLogical
+                                          ? Builder.CreateLogicalOr(Res, Y)
+                                          : Builder.CreateOr(Res, Y));
+    // LHS | (X || Y) --> X || (LHS | Y)
     if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-      if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
-        return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
-  }
-  if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+      if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false,
+                                        /* IsLogical */ false))
+        return replaceInstUsesWith(I, IsLogical
+                                          ? Builder.CreateLogicalOr(X, Res)
+                                          : Builder.CreateOr(X, Res));
+  }
+  if (RHS && match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) {
+    bool IsLogical = isa<SelectInst>(Op0);
+    // (X || Y) | RHS --> (X || RHS) || Y
    if (auto *Cmp = dyn_cast<ICmpInst>(X))
-      if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
-        return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+      if (Value *Res =
+              foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false, IsLogical))
+        return replaceInstUsesWith(I, IsLogical
+                                          ? Builder.CreateLogicalOr(Res, Y)
+                                          : Builder.CreateOr(Res, Y));
+    // (X || Y) | RHS --> X || (Y | RHS)
    if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-      if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
-        return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+      if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false,
+                                        /* IsLogical */ false))
+        return replaceInstUsesWith(I, IsLogical
+                                          ?
+                                          Builder.CreateLogicalOr(X, Res)
+                                          : Builder.CreateOr(X, Res));
   }
 }
 
   if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
     if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
-      if (Value *Res = foldLogicOfFCmps(LHS, RHS, false))
+      if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false))
        return replaceInstUsesWith(I, Res);
 
   if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
@@ -3025,6 +3080,36 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (matchSimpleRecurrence(&I, PN, Start, Step) && DT.dominates(Step, PN))
     return replaceInstUsesWith(I, Builder.CreateOr(Start, Step));
 
+  // (A & B) | (C | D) or (C | D) | (A & B)
+  // Can be combined if C or D is of type (A/B & X)
+  if (match(&I, m_c_Or(m_OneUse(m_And(m_Value(A), m_Value(B))),
+                       m_OneUse(m_Or(m_Value(C), m_Value(D)))))) {
+    // (A & B) | (C | ?) -> C | (? | (A & B))
+    // (A & B) | (C | ?) -> C | (? | (A & B))
+    // (A & B) | (C | ?) -> C | (? | (A & B))
+    // (A & B) | (C | ?) -> C | (? | (A & B))
+    // (C | ?) | (A & B) -> C | (? | (A & B))
+    // (C | ?) | (A & B) -> C | (? | (A & B))
+    // (C | ?) | (A & B) -> C | (? | (A & B))
+    // (C | ?) | (A & B) -> C | (? | (A & B))
+    if (match(D, m_OneUse(m_c_And(m_Specific(A), m_Value()))) ||
+        match(D, m_OneUse(m_c_And(m_Specific(B), m_Value()))))
+      return BinaryOperator::CreateOr(
+          C, Builder.CreateOr(D, Builder.CreateAnd(A, B)));
+    // (A & B) | (? | D) -> (? | (A & B)) | D
+    // (A & B) | (? | D) -> (? | (A & B)) | D
+    // (A & B) | (? | D) -> (? | (A & B)) | D
+    // (A & B) | (? | D) -> (? | (A & B)) | D
+    // (? | D) | (A & B) -> (? | (A & B)) | D
+    // (? | D) | (A & B) -> (? | (A & B)) | D
+    // (? | D) | (A & B) -> (? | (A & B)) | D
+    // (? | D) | (A & B) -> (? | (A & B)) | D
+    if (match(C, m_OneUse(m_c_And(m_Specific(A), m_Value()))) ||
+        match(C, m_OneUse(m_c_And(m_Specific(B), m_Value()))))
+      return BinaryOperator::CreateOr(
+          Builder.CreateOr(C, Builder.CreateAnd(A, B)), D);
+  }
+
   return nullptr;
 }
 
@@ -3086,26 +3171,26 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
          I.getOperand(1) == RHS && "Should be 'xor' with these operands");
 
-  if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
-    if (LHS->getOperand(0) == RHS->getOperand(1) &&
-        LHS->getOperand(1) == RHS->getOperand(0))
-      LHS->swapOperands();
-    if (LHS->getOperand(0) == RHS->getOperand(0) &&
-        LHS->getOperand(1) == RHS->getOperand(1)) {
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+  Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+  Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+
+  if (predicatesFoldable(PredL, PredR)) {
+    if (LHS0 == RHS1 && LHS1 == RHS0) {
+      std::swap(LHS0, LHS1);
+      PredL = ICmpInst::getSwappedPredicate(PredL);
+    }
+    if (LHS0 == RHS0 && LHS1 == RHS1) {
       // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
-      Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
-      unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+      unsigned Code = getICmpCode(PredL) ^ getICmpCode(PredR);
       bool IsSigned = LHS->isSigned() || RHS->isSigned();
-      return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
+      return getNewICmpValue(Code, IsSigned, LHS0, LHS1, Builder);
     }
   }
 
   // TODO: This can be generalized to compares of non-signbits using
   // decomposeBitTestICmp(). It could be enhanced more by using (something like)
   // foldLogOpOfMaskedICmps().
-  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-  Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
-  Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
   if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
       LHS0->getType() == RHS0->getType() &&
       LHS0->getType()->isIntOrIntVectorTy()) {
@@ -3114,19 +3199,17 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
          PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
         (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
-         PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
-      Value *Zero = ConstantInt::getNullValue(LHS0->getType());
-      return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
-    }
+         PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())))
+      return Builder.CreateIsNeg(Builder.CreateXor(LHS0, RHS0));
+
     // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
     // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
     if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
          PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
         (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
-         PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
-      Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
-      return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
-    }
+         PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())))
+      return Builder.CreateIsNotNeg(Builder.CreateXor(LHS0, RHS0));
+
   }
 
   // Instead of trying to imitate the folds for and/or, decompose this 'xor'
@@ -3135,10 +3218,10 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   //
   // This is based on a truth table definition of xor:
   // X ^ Y --> (X | Y) & !(X & Y)
-  if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
+  if (Value *OrICmp = simplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
     // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
     // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
-    if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
+    if (Value *AndICmp = simplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
       // TODO: Independently handle cases where the 'and' side is a constant.
       ICmpInst *X = nullptr, *Y = nullptr;
       if (OrICmp == LHS && AndICmp == RHS) {
@@ -3274,12 +3357,12 @@ static Instruction *canonicalizeAbs(BinaryOperator &Xor,
   // Op1 = ashr i32 A, 31                   ; smear the sign bit
   // xor (add A, Op1), Op1                  ; add -1 and flip bits if negative
   // --> (A < 0) ? -A : A
-    Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+    Value *IsNeg = Builder.CreateIsNeg(A);
   // Copy the nuw/nsw flags from the add to the negate.
     auto *Add = cast<BinaryOperator>(Op0);
-    Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
+    Value *NegA = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
                                    Add->hasNoSignedWrap());
-    return SelectInst::Create(Cmp, Neg, A);
+    return SelectInst::Create(IsNeg, NegA, A);
   }
   return nullptr;
 }
@@ -3465,51 +3548,7 @@ Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) {
     }
   }
 
-  // TODO: Remove folds if we canonicalize to intrinsics (see above).
-  // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
-  //
-  //   %notx = xor i32 %x, -1
-  //   %cmp1 = icmp sgt i32 %notx, %y
-  //   %smax = select i1 %cmp1, i32 %notx, i32 %y
-  //   %res = xor i32 %smax, -1
-  // =>
-  //   %noty = xor i32 %y, -1
-  //   %cmp2 = icmp slt %x, %noty
-  //   %res = select i1 %cmp2, i32 %x, i32 %noty
-  //
-  // Same is applicable for smin/umax/umin.
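// The two xor-of-sign-test rewrites above reduce to a single sign test of
// X ^ Y. A brute-force check of both identities over i8 (illustrative sketch
// only, not patch content):

#include <cassert>
#include <cstdint>

int main() {
  for (int X = -128; X < 128; ++X) {
    for (int Y = -128; Y < 128; ++Y) {
      int8_t A = (int8_t)X, B = (int8_t)Y;
      int8_t AxB = (int8_t)(A ^ B);
      // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0, i.e. CreateIsNeg on the xor.
      assert(((A > -1) != (B > -1)) == (AxB < 0));
      // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1, i.e. CreateIsNotNeg on the xor.
      assert(((A > -1) != (B < 0)) == (AxB > -1));
    }
  }
  return 0;
}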
if (NotOp->hasOneUse()) { - Value *LHS, *RHS; - SelectPatternFlavor SPF = matchSelectPattern(NotOp, LHS, RHS).Flavor; - if (SelectPatternResult::isMinOrMax(SPF)) { - // It's possible we get here before the not has been simplified, so make - // sure the input to the not isn't freely invertible. - if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) { - Value *NotY = Builder.CreateNot(RHS); - return SelectInst::Create( - Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY); - } - - // It's possible we get here before the not has been simplified, so make - // sure the input to the not isn't freely invertible. - if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) { - Value *NotX = Builder.CreateNot(LHS); - return SelectInst::Create( - Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y); - } - - // If both sides are freely invertible, then we can get rid of the xor - // completely. - if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && - isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { - Value *NotLHS = Builder.CreateNot(LHS); - Value *NotRHS = Builder.CreateNot(RHS); - return SelectInst::Create( - Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS), - NotLHS, NotRHS); - } - } - // Pull 'not' into operands of select if both operands are one-use compares // or one is one-use compare and the other one is a constant. // Inverting the predicates eliminates the 'not' operation. @@ -3549,7 +3588,7 @@ Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) { // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { - if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyXorInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -3596,8 +3635,20 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { Value *X, *Y; Constant *C1; if (match(Op1, m_Constant(C1))) { - // Use DeMorgan and reassociation to eliminate a 'not' op. Constant *C2; + + if (match(Op0, m_OneUse(m_Or(m_Value(X), m_ImmConstant(C2)))) && + match(C1, m_ImmConstant())) { + // (X | C2) ^ C1 --> (X & ~C2) ^ (C1^C2) + C2 = Constant::replaceUndefsWith( + C2, Constant::getAllOnesValue(C2->getType()->getScalarType())); + Value *And = Builder.CreateAnd( + X, Constant::mergeUndefsWith(ConstantExpr::getNot(C2), C1)); + return BinaryOperator::CreateXor( + And, Constant::mergeUndefsWith(ConstantExpr::getXor(C1, C2), C1)); + } + + // Use DeMorgan and reassociation to eliminate a 'not' op. 
     if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
       // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
       Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2));
@@ -3619,9 +3670,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
         *CA == X->getType()->getScalarSizeInBits() - 1 &&
         !match(C1, m_AllOnes())) {
       assert(!C1->isZeroValue() && "Unexpected xor with 0");
-      Value *ICmp =
-          Builder.CreateICmpSGT(X, Constant::getAllOnesValue(X->getType()));
-      return SelectInst::Create(ICmp, Op1, Builder.CreateNot(Op1));
+      Value *IsNotNeg = Builder.CreateIsNotNeg(X);
+      return SelectInst::Create(IsNotNeg, Op1, Builder.CreateNot(Op1));
     }
   }
 
@@ -3677,9 +3727,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
       APInt FoldConst = C1->getValue().lshr(C2->getValue());
       FoldConst ^= C3->getValue();
       // Prepare the two operands.
-      auto *Opnd0 = cast<Instruction>(Builder.CreateLShr(X, C2));
-      Opnd0->takeName(cast<Instruction>(Op0));
-      Opnd0->setDebugLoc(I.getDebugLoc());
+      auto *Opnd0 = Builder.CreateLShr(X, C2);
+      Opnd0->takeName(Op0);
       return BinaryOperator::CreateXor(Opnd0, ConstantInt::get(Ty, FoldConst));
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
index 495493aab4b5..2540e545ae4d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
@@ -12,7 +12,6 @@
 
 #include "InstCombineInternal.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/InstCombine/InstCombiner.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 05b28328afbf..67ef2e895b6c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -15,21 +15,18 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Attributes.h"
@@ -74,7 +71,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -108,6 +104,19 @@ static Type *getPromotedType(Type *Ty) {
   return Ty;
 }
 
+/// Recognize a memcpy/memmove from a trivially otherwise unused alloca.
+/// TODO: This should probably be integrated with visitAllocSites, but that
+/// requires a deeper change to allow either unread or unwritten objects.
+static bool hasUndefSource(AnyMemTransferInst *MI) {
+  auto *Src = MI->getRawSource();
+  while (isa<GetElementPtrInst>(Src) || isa<BitCastInst>(Src)) {
+    if (!Src->hasOneUse())
+      return false;
+    Src = cast<Instruction>(Src)->getOperand(0);
+  }
+  return isa<AllocaInst>(Src) && Src->hasOneUse();
+}
+
 Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
   Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
   MaybeAlign CopyDstAlign = MI->getDestAlign();
@@ -132,6 +141,14 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
     return MI;
   }
 
+  // If the source is provably undef, the memcpy/memmove doesn't do anything
+  // (unless the transfer is volatile).
+  if (hasUndefSource(MI) && !MI->isVolatile()) {
+    // Set the size of the copy to 0, it will be deleted on the next iteration.
+    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+    return MI;
+  }
+
   // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
   // load/store.
   ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
@@ -241,6 +258,15 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
     return MI;
   }
 
+  // Remove memset with an undef value.
+  // FIXME: This is technically incorrect because it might overwrite a poison
+  // value. Change to PoisonValue once #52930 is resolved.
+  if (isa<UndefValue>(MI->getValue())) {
+    // Set the size of the copy to 0, it will be deleted on the next iteration.
+    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+    return MI;
+  }
+
   // Extract the length and alignment and fill if they are constant.
   ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
   ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
@@ -248,7 +274,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
     return nullptr;
   const uint64_t Len = LenC->getLimitedValue();
   assert(Len && "0-sized memory setting should be removed already.");
-  const Align Alignment = assumeAligned(MI->getDestAlignment());
+  const Align Alignment = MI->getDestAlign().valueOrOne();
 
   // If it is an atomic and alignment is less than the size then we will
   // introduce the unaligned memory access which will be later transformed
@@ -769,7 +795,7 @@ static CallInst *canonicalizeConstantArg0ToArg1(CallInst &Call) {
 /// \p Result and a constant \p Overflow value.
 static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
                                         Constant *Overflow) {
-  Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+  Constant *V[] = {PoisonValue::get(Result->getType()), Overflow};
   StructType *ST = cast<StructType>(II->getType());
   Constant *Struct = ConstantStruct::get(ST, V);
   return InsertValueInst::Create(Struct, Result, 0);
@@ -795,6 +821,10 @@ static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
   if (Known.isNegative())
     return true;
 
+  Value *X, *Y;
+  if (match(Op, m_NSWSub(m_Value(X), m_Value(Y))))
+    return isImpliedByDomCondition(ICmpInst::ICMP_SLT, X, Y, CxtI, DL);
+
   return isImpliedByDomCondition(
       ICmpInst::ICMP_SLT, Op, Constant::getNullValue(Op->getType()), CxtI, DL);
 }
@@ -837,6 +867,67 @@ static Instruction *moveAddAfterMinMax(IntrinsicInst *II,
   return IsSigned ? BinaryOperator::CreateNSWAdd(NewMinMax, Add->getOperand(1))
                   : BinaryOperator::CreateNUWAdd(NewMinMax, Add->getOperand(1));
 }
+
+/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
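// For reference, the clamp shape matchSAddSubSat (defined next) looks for,
// written out in plain C++ at assumed i8/i16 widths (an illustrative sketch,
// not part of the patch): widen, add, clamp to [INT8_MIN, INT8_MAX], then
// truncate -- which is exactly a saturating i8 add.

#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>

// smax(INT_MIN, smin(INT_MAX, add(sext(A), sext(B)))), then trunc.
static int8_t clampedAdd(int8_t A, int8_t B) {
  int16_t Wide = (int16_t)A + (int16_t)B;    // add(sext(A), sext(B))
  Wide = std::min<int16_t>(Wide, INT8_MAX);  // smin(..., MaxValue)
  Wide = std::max<int16_t>(Wide, INT8_MIN);  // smax(..., MinValue)
  return (int8_t)Wide;                       // trunc to the new type
}

int main() {
  // Spot-check saturation at both rails and a plain in-range add.
  assert(clampedAdd(100, 100) == 127);
  assert(clampedAdd(-100, -100) == -128);
  assert(clampedAdd(1, 2) == 3);
  return 0;
}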
+Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) { + Type *Ty = MinMax1.getType(); + + // We are looking for a tree of: + // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B)))) + // Where the min and max could be reversed + Instruction *MinMax2; + BinaryOperator *AddSub; + const APInt *MinValue, *MaxValue; + if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) { + if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue)))) + return nullptr; + } else if (match(&MinMax1, + m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) { + if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue)))) + return nullptr; + } else + return nullptr; + + // Check that the constants clamp a saturate, and that the new type would be + // sensible to convert to. + if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1) + return nullptr; + // In what bitwidth can this be treated as saturating arithmetics? + unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1; + // FIXME: This isn't quite right for vectors, but using the scalar type is a + // good first approximation for what should be done there. + if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) + return nullptr; + + // Also make sure that the inner min/max and the add/sub have one use. + if (!MinMax2->hasOneUse() || !AddSub->hasOneUse()) + return nullptr; + + // Create the new type (which can be a vector type) + Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth); + + Intrinsic::ID IntrinsicID; + if (AddSub->getOpcode() == Instruction::Add) + IntrinsicID = Intrinsic::sadd_sat; + else if (AddSub->getOpcode() == Instruction::Sub) + IntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + // The two operands of the add/sub must be nsw-truncatable to the NewTy. This + // is usually achieved via a sext from a smaller type. + if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) > + NewBitWidth || + ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) + return nullptr; + + // Finally create and return the sat intrinsic, truncated to the new type + Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); + Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); + Value *Sat = Builder.CreateCall(F, {AT, BT}); + return CastInst::Create(Instruction::SExt, Sat, Ty); +} + /// If we have a clamp pattern like max (min X, 42), 41 -- where the output /// can only be one of two possible constant values -- turn that into a select @@ -879,6 +970,59 @@ static Instruction *foldClampRangeOfTwo(IntrinsicInst *II, return SelectInst::Create(Cmp, ConstantInt::get(II->getType(), *C0), I1); } +/// If this min/max has a constant operand and an operand that is a matching +/// min/max with a constant operand, constant-fold the 2 constant operands. 
+static Instruction *reassociateMinMaxWithConstants(IntrinsicInst *II) {
+  Intrinsic::ID MinMaxID = II->getIntrinsicID();
+  auto *LHS = dyn_cast<IntrinsicInst>(II->getArgOperand(0));
+  if (!LHS || LHS->getIntrinsicID() != MinMaxID)
+    return nullptr;
+
+  Constant *C0, *C1;
+  if (!match(LHS->getArgOperand(1), m_ImmConstant(C0)) ||
+      !match(II->getArgOperand(1), m_ImmConstant(C1)))
+    return nullptr;
+
+  // max (max X, C0), C1 --> max X, (max C0, C1) --> max X, NewC
+  ICmpInst::Predicate Pred = MinMaxIntrinsic::getPredicate(MinMaxID);
+  Constant *CondC = ConstantExpr::getICmp(Pred, C0, C1);
+  Constant *NewC = ConstantExpr::getSelect(CondC, C0, C1);
+
+  Module *Mod = II->getModule();
+  Function *MinMax = Intrinsic::getDeclaration(Mod, MinMaxID, II->getType());
+  return CallInst::Create(MinMax, {LHS->getArgOperand(0), NewC});
+}
+
+/// If this min/max has a matching min/max operand with a constant, try to push
+/// the constant operand into this instruction. This can enable more folds.
+static Instruction *
+reassociateMinMaxWithConstantInOperand(IntrinsicInst *II,
+                                       InstCombiner::BuilderTy &Builder) {
+  // Match and capture a min/max operand candidate.
+  Value *X, *Y;
+  Constant *C;
+  Instruction *Inner;
+  if (!match(II, m_c_MaxOrMin(m_OneUse(m_CombineAnd(
+                                  m_Instruction(Inner),
+                                  m_MaxOrMin(m_Value(X), m_ImmConstant(C)))),
+                              m_Value(Y))))
+    return nullptr;
+
+  // The inner op must match. Check for constants to avoid infinite loops.
+  Intrinsic::ID MinMaxID = II->getIntrinsicID();
+  auto *InnerMM = dyn_cast<IntrinsicInst>(Inner);
+  if (!InnerMM || InnerMM->getIntrinsicID() != MinMaxID ||
+      match(X, m_ImmConstant()) || match(Y, m_ImmConstant()))
+    return nullptr;
+
+  // max (max X, C), Y --> max (max X, Y), C
+  Function *MinMax =
+      Intrinsic::getDeclaration(II->getModule(), MinMaxID, II->getType());
+  Value *NewInner = Builder.CreateBinaryIntrinsic(MinMaxID, X, Y);
+  NewInner->takeName(Inner);
+  return CallInst::Create(MinMax, {NewInner, C});
+}
+
 /// Reduce a sequence of min/max intrinsics with a common operand.
 static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
   // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
@@ -936,6 +1080,56 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
   return CallInst::Create(MinMax, { MinMaxOp, ThirdOp });
 }
 
+/// If all arguments of the intrinsic are unary shuffles with the same mask,
+/// try to shuffle after the intrinsic.
+static Instruction *
+foldShuffledIntrinsicOperands(IntrinsicInst *II,
+                              InstCombiner::BuilderTy &Builder) {
+  // TODO: This should be extended to handle other intrinsics like fshl, ctpop,
+  //       etc. Use llvm::isTriviallyVectorizable() and related to determine
+  //       which intrinsics are safe to shuffle?
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::smax:
+  case Intrinsic::smin:
+  case Intrinsic::umax:
+  case Intrinsic::umin:
+  case Intrinsic::fma:
+  case Intrinsic::fshl:
+  case Intrinsic::fshr:
+    break;
+  default:
+    return nullptr;
+  }
+
+  Value *X;
+  ArrayRef<int> Mask;
+  if (!match(II->getArgOperand(0),
+             m_Shuffle(m_Value(X), m_Undef(), m_Mask(Mask))))
+    return nullptr;
+
+  // At least 1 operand must have 1 use because we are creating 2 instructions.
+  if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); }))
+    return nullptr;
+
+  // See if all arguments are shuffled with the same mask.
+  SmallVector<Value *, 4> NewArgs(II->arg_size());
+  NewArgs[0] = X;
+  Type *SrcTy = X->getType();
+  for (unsigned i = 1, e = II->arg_size(); i != e; ++i) {
+    if (!match(II->getArgOperand(i),
+               m_Shuffle(m_Value(X), m_Undef(), m_SpecificMask(Mask))) ||
+        X->getType() != SrcTy)
+      return nullptr;
+    NewArgs[i] = X;
+  }
+
+  // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M
+  Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr;
+  Value *NewIntrinsic =
+      Builder.CreateIntrinsic(II->getIntrinsicID(), SrcTy, NewArgs, FPI);
+  return new ShuffleVectorInst(NewIntrinsic, Mask);
+}
+
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallBase to do the heavy
 /// lifting.
@@ -943,14 +1137,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   // Don't try to simplify calls without uses. It will not do anything useful,
   // but will result in the following folds being skipped.
   if (!CI.use_empty())
-    if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
+    if (Value *V = simplifyCall(&CI, SQ.getWithInstruction(&CI)))
       return replaceInstUsesWith(CI, V);
 
   if (isFreeCall(&CI, &TLI))
     return visitFree(CI);
 
-  // If the caller function is nounwind, mark the call as nounwind, even if the
-  // callee isn't.
+  // If the caller function (i.e. us, the function that contains this CallInst)
+  // is nounwind, mark the call as nounwind, even if the callee isn't.
   if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
     CI.setDoesNotThrow();
     return &CI;
@@ -980,13 +1174,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
       if (NumBytes->isNullValue())
         return eraseInstFromFunction(CI);
-
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
-        if (CI->getZExtValue() == 1) {
-          // Replace the instruction with just byte operations.  We would
-          // transform other cases to loads/stores, but we don't know if
-          // alignment is sufficient.
-        }
     }
 
     // No other transformations apply to volatile transfers.
@@ -1050,10 +1237,19 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     return NewCall;
   }
 
+  // Unused constrained FP intrinsic calls may have declared side effect, which
+  // prevents it from being removed. In some cases however the side effect is
+  // actually absent. To detect this case, call SimplifyConstrainedFPCall. If it
+  // returns a replacement, the call may be removed.
+  if (CI.use_empty() && isa<ConstrainedFPIntrinsic>(CI)) {
+    if (simplifyConstrainedFPCall(&CI, SQ.getWithInstruction(&CI)))
+      return eraseInstFromFunction(CI);
+  }
+
   Intrinsic::ID IID = II->getIntrinsicID();
   switch (IID) {
   case Intrinsic::objectsize:
-    if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
+    if (Value *V = lowerObjectSizeCall(II, DL, &TLI, AA, /*MustSucceed=*/false))
       return replaceInstUsesWith(CI, V);
     return nullptr;
   case Intrinsic::abs: {
@@ -1224,6 +1420,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (Instruction *R = FoldOpIntoSelect(*II, Sel))
       return R;
 
+    if (Instruction *NewMinMax = reassociateMinMaxWithConstants(II))
+      return NewMinMax;
+
+    if (Instruction *R = reassociateMinMaxWithConstantInOperand(II, Builder))
+      return R;
+
     if (Instruction *NewMinMax = factorizeMinMaxTree(II))
        return NewMinMax;
 
@@ -1231,14 +1433,35 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   }
   case Intrinsic::bswap: {
     Value *IIOperand = II->getArgOperand(0);
-    Value *X = nullptr;
+
+    // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
+    // inverse-shift-of-bswap:
+    // bswap (shl X, Y) --> lshr (bswap X), Y
+    // bswap (lshr X, Y) --> shl (bswap X), Y
+    Value *X, *Y;
+    if (match(IIOperand, m_OneUse(m_LogicalShift(m_Value(X), m_Value(Y))))) {
+      // The transform allows undef vector elements, so try a constant match
+      // first. If knownbits can handle that case, that clause could be removed.
+      unsigned BitWidth = IIOperand->getType()->getScalarSizeInBits();
+      const APInt *C;
+      if ((match(Y, m_APIntAllowUndef(C)) && (*C & 7) == 0) ||
+          MaskedValueIsZero(Y, APInt::getLowBitsSet(BitWidth, 3))) {
+        Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X);
+        BinaryOperator::BinaryOps InverseShift =
+            cast<BinaryOperator>(IIOperand)->getOpcode() == Instruction::Shl
+                ? Instruction::LShr
+                : Instruction::Shl;
+        return BinaryOperator::Create(InverseShift, NewSwap, Y);
+      }
+    }
 
     KnownBits Known = computeKnownBits(IIOperand, 0, II);
     uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8);
     uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8);
+    unsigned BW = Known.getBitWidth();
 
     // bswap(x) -> shift(x) if x has exactly one "active byte"
-    if (Known.getBitWidth() - LZ - TZ == 8) {
+    if (BW - LZ - TZ == 8) {
       assert(LZ != TZ && "active byte cannot be in the middle");
       if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x
         return BinaryOperator::CreateNUWShl(
@@ -1250,8 +1473,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
     if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
-      unsigned C = X->getType()->getScalarSizeInBits() -
-                   IIOperand->getType()->getScalarSizeInBits();
+      unsigned C = X->getType()->getScalarSizeInBits() - BW;
       Value *CV = ConstantInt::get(X->getType(), C);
       Value *V = Builder.CreateLShr(X, CV);
       return new TruncInst(V, IIOperand->getType());
@@ -1618,7 +1840,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
 
     // Try to simplify the underlying FMul.
-    if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
+    if (Value *V = simplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
                                     II->getFastMathFlags(),
                                     SQ.getWithInstruction(II))) {
       auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
@@ -1649,7 +1871,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // Try to simplify the underlying FMul. We can only apply simplifications
     // that do not require rounding.
-    if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
+    if (Value *V = simplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
                                    II->getFastMathFlags(),
                                    SQ.getWithInstruction(II))) {
       auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
@@ -2135,7 +2357,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::experimental_vector_insert: {
+  case Intrinsic::vector_insert: {
     Value *Vec = II->getArgOperand(0);
     Value *SubVec = II->getArgOperand(1);
     Value *Idx = II->getArgOperand(2);
@@ -2181,7 +2403,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::experimental_vector_extract: {
+  case Intrinsic::vector_extract: {
     Value *Vec = II->getArgOperand(0);
     Value *Idx = II->getArgOperand(1);
@@ -2456,11 +2678,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   default: {
     // Handle target specific intrinsics
     Optional<Instruction *> V = targetInstCombineIntrinsic(*II);
-    if (V.hasValue())
+    if (V)
       return V.getValue();
     break;
   }
   }
+
+  if (Instruction *Shuf = foldShuffledIntrinsicOperands(II, Builder))
+    return Shuf;
+
   // Some intrinsics (like experimental_gc_statepoint) can be used in invoke
   // context, so it is handled in visitCallBase and we should trigger it.
   return visitCallBase(*II);
@@ -2648,47 +2874,56 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
   return nullptr;
 }
 
-void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
+bool InstCombinerImpl::annotateAnyAllocSite(CallBase &Call,
+                                            const TargetLibraryInfo *TLI) {
   // Note: We only handle cases which can't be driven from generic attributes
   // here.  So, for example, nonnull and noalias (which are common properties
   // of some allocation functions) are expected to be handled via annotation
   // of the respective allocator declaration with generic attributes.
+  bool Changed = false;
 
-  uint64_t Size;
-  ObjectSizeOpts Opts;
-  if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) {
-    // TODO: We really should just emit deref_or_null here and then
-    // let the generic inference code combine that with nonnull.
-    if (Call.hasRetAttr(Attribute::NonNull))
-      Call.addRetAttr(Attribute::getWithDereferenceableBytes(
-          Call.getContext(), Size));
-    else
-      Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
-          Call.getContext(), Size));
+  if (isAllocationFn(&Call, TLI)) {
+    uint64_t Size;
+    ObjectSizeOpts Opts;
+    if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) {
+      // TODO: We really should just emit deref_or_null here and then
+      // let the generic inference code combine that with nonnull.
+      if (Call.hasRetAttr(Attribute::NonNull)) {
+        Changed = !Call.hasRetAttr(Attribute::Dereferenceable);
+        Call.addRetAttr(
+            Attribute::getWithDereferenceableBytes(Call.getContext(), Size));
+      } else {
+        Changed = !Call.hasRetAttr(Attribute::DereferenceableOrNull);
+        Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+            Call.getContext(), Size));
+      }
+    }
   }
 
   // Add alignment attribute if alignment is a power of two constant.
   Value *Alignment = getAllocAlignment(&Call, TLI);
   if (!Alignment)
-    return;
+    return Changed;
 
   ConstantInt *AlignOpC = dyn_cast<ConstantInt>(Alignment);
   if (AlignOpC && AlignOpC->getValue().ult(llvm::Value::MaximumAlignment)) {
     uint64_t AlignmentVal = AlignOpC->getZExtValue();
     if (llvm::isPowerOf2_64(AlignmentVal)) {
-      Call.removeRetAttr(Attribute::Alignment);
-      Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
-                                                  Align(AlignmentVal)));
+      Align ExistingAlign = Call.getRetAlign().valueOrOne();
+      Align NewAlign = Align(AlignmentVal);
+      if (NewAlign > ExistingAlign) {
+        Call.addRetAttr(
+            Attribute::getWithAlignment(Call.getContext(), NewAlign));
+        Changed = true;
+      }
     }
   }
+  return Changed;
 }
 
 /// Improvements for call, callbr and invoke instructions.
 Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
-  if (isAllocationFn(&Call, &TLI))
-    annotateAnyAllocSite(Call, &TLI);
-
-  bool Changed = false;
+  bool Changed = annotateAnyAllocSite(Call, &TLI);
 
   // Mark any parameters that are known to be non-null with the nonnull
   // attribute.  This is helpful for inlining calls to functions with null
@@ -2718,10 +2953,12 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
   // If the callee is a pointer to a function, attempt to move any casts to the
   // arguments of the call/callbr/invoke.
   Value *Callee = Call.getCalledOperand();
-  if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
+  Function *CalleeF = dyn_cast<Function>(Callee);
+  if ((!CalleeF || CalleeF->getFunctionType() != Call.getFunctionType()) &&
+      transformConstExprCastCall(Call))
     return nullptr;
 
-  if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+  if (CalleeF) {
     // Remove the convergent attr on calls when the callee is not convergent.
     if (Call.isConvergent() && !CalleeF->isConvergent() &&
         !CalleeF->isIntrinsic()) {
@@ -2905,7 +3142,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
       Optional<OperandBundleUse> Bundle =
           GCSP.getOperandBundle(LLVMContext::OB_gc_live);
       unsigned NumOfGCLives = LiveGcValues.size();
-      if (!Bundle.hasValue() || NumOfGCLives == Bundle->Inputs.size())
+      if (!Bundle || NumOfGCLives == Bundle->Inputs.size())
        break;
       // We can reduce the size of gc live bundle.
       DenseMap<Value *, unsigned> Val2Idx;
@@ -3026,8 +3263,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
   //
   // Similarly, avoid folding away bitcasts of byval calls.
   if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
-      Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
-      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
+      Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated))
     return false;
 
   auto AI = Call.arg_begin();
@@ -3038,12 +3274,15 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
     if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
       return false;   // Cannot transform this parameter value.
 
+    // Check if there are any incompatible attributes we cannot drop safely.
     if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i))
-            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+            .overlaps(AttributeFuncs::typeIncompatible(
+                ParamTy, AttributeFuncs::ASK_UNSAFE_TO_DROP)))
       return false;   // Attribute not compatible with transformed value.
 
-    if (Call.isInAllocaArgument(i))
-      return false;   // Cannot transform to and from inalloca.
+    if (Call.isInAllocaArgument(i) ||
+        CallerPAL.hasParamAttr(i, Attribute::Preallocated))
+      return false; // Cannot transform to and from inalloca/preallocated.
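// The alignment-attribute update in annotateAnyAllocSite above is deliberately
// monotonic: it never weakens an existing guarantee. In isolation the decision
// looks like this (a sketch under assumed semantics; the names are
// illustrative, not LLVM API):

#include <cassert>
#include <cstdint>

// Only a power-of-two constant align argument is usable, and an existing
// stronger align attribute must never be replaced by a weaker one.
static uint64_t mergeRetAlign(uint64_t Existing /* getRetAlign().valueOrOne() */,
                              uint64_t Requested) {
  bool IsPow2 = Requested != 0 && (Requested & (Requested - 1)) == 0;
  if (!IsPow2)
    return Existing;  // not representable as an Align
  return Requested > Existing ? Requested : Existing;
}

int main() {
  assert(mergeRetAlign(16, 8) == 16);  // never weaken
  assert(mergeRetAlign(8, 32) == 32);  // strengthen when larger
  assert(mergeRetAlign(8, 24) == 8);   // 24 is not a power of two
  return 0;
}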
     if (CallerPAL.hasParamAttr(i, Attribute::SwiftError))
       return false;
 
@@ -3052,13 +3291,18 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
     //  sized type and the sized type has to have the same size as the old type.
     if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
       PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
-      if (!ParamPTy || !ParamPTy->getPointerElementType()->isSized())
+      if (!ParamPTy)
         return false;
 
-      Type *CurElTy = Call.getParamByValType(i);
-      if (DL.getTypeAllocSize(CurElTy) !=
-          DL.getTypeAllocSize(ParamPTy->getPointerElementType()))
-        return false;
+      if (!ParamPTy->isOpaque()) {
+        Type *ParamElTy = ParamPTy->getNonOpaquePointerElementType();
+        if (!ParamElTy->isSized())
+          return false;
+
+        Type *CurElTy = Call.getParamByValType(i);
+        if (DL.getTypeAllocSize(CurElTy) != DL.getTypeAllocSize(ParamElTy))
+          return false;
+      }
     }
   }
 
@@ -3116,13 +3360,20 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
       NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
     Args.push_back(NewArg);
 
-    // Add any parameter attributes.
-    if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
-      AttrBuilder AB(FT->getContext(), CallerPAL.getParamAttrs(i));
-      AB.addByValAttr(NewArg->getType()->getPointerElementType());
+    // Add any parameter attributes except the ones incompatible with the new
+    // type. Note that we made sure all incompatible ones are safe to drop.
+    AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(
+        ParamTy, AttributeFuncs::ASK_SAFE_TO_DROP);
+    if (CallerPAL.hasParamAttr(i, Attribute::ByVal) &&
+        !ParamTy->isOpaquePointerTy()) {
+      AttrBuilder AB(Ctx, CallerPAL.getParamAttrs(i).removeAttributes(
+                              Ctx, IncompatibleAttrs));
+      AB.addByValAttr(ParamTy->getNonOpaquePointerElementType());
       ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
-    } else
-      ArgAttrs.push_back(CallerPAL.getParamAttrs(i));
+    } else {
+      ArgAttrs.push_back(
+          CallerPAL.getParamAttrs(i).removeAttributes(Ctx, IncompatibleAttrs));
+    }
   }
 
   // If the function takes more arguments than the call was taking, add them
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f11ba8772f3c..e9e779b8619b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -13,13 +13,10 @@
 #include "InstCombineInternal.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <numeric>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -39,8 +36,10 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
 
   if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
     // Cannot look past anything that might overflow.
+    // We specifically require nuw because we store the Scale in an unsigned
+    // and perform an unsigned divide on it.
     OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
-    if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
+    if (OBI && !OBI->hasNoUnsignedWrap()) {
       Scale = 1;
       Offset = 0;
       return Val;
@@ -639,10 +638,12 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
 
 /// Try to narrow the width of math or bitwise logic instructions by pulling a
 /// truncate ahead of binary operators.
-/// TODO: Transforms for truncated shifts should be moved into here.
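// A quick numeric spot-check of the trunc-of-shift fold that narrowBinOp now
// hosts (trunc (*shr (trunc A), C) --> trunc (*shr A, C); see the LShr/AShr
// case in the function below). Illustrative sketch at assumed i32/i16/i8
// widths, not patch content:

#include <cassert>
#include <cstdint>

int main() {
  // trunc i8 (lshr i16 (trunc i16 A), C) == trunc i8 (lshr i32 A, C)
  // whenever C <= SrcWidth - DestWidth = 16 - 8.
  for (uint32_t A = 0; A < (1u << 17); ++A) {
    for (unsigned C = 0; C <= 8; ++C) {
      uint8_t Narrow = (uint8_t)((uint16_t)((uint16_t)A >> C));
      uint8_t Wide = (uint8_t)(A >> C);
      assert(Narrow == Wide && "narrowed shift must match the wide shift");
    }
  }
  return 0;
}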
 Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
   Type *SrcTy = Trunc.getSrcTy();
   Type *DestTy = Trunc.getType();
+  unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+  unsigned DestWidth = DestTy->getScalarSizeInBits();
+
   if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
     return nullptr;
 
@@ -685,7 +686,30 @@ Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
     }
     break;
   }
-
+  case Instruction::LShr:
+  case Instruction::AShr: {
+    // trunc (*shr (trunc A), C) --> trunc(*shr A, C)
+    Value *A;
+    Constant *C;
+    if (match(BinOp0, m_Trunc(m_Value(A))) && match(BinOp1, m_Constant(C))) {
+      unsigned MaxShiftAmt = SrcWidth - DestWidth;
+      // If the shift is small enough, all zero/sign bits created by the shift
+      // are removed by the trunc.
+      if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
+                                      APInt(SrcWidth, MaxShiftAmt)))) {
+        auto *OldShift = cast<Instruction>(Trunc.getOperand(0));
+        bool IsExact = OldShift->isExact();
+        auto *ShAmt = ConstantExpr::getIntegerCast(C, A->getType(), true);
+        ShAmt = Constant::mergeUndefsWith(ShAmt, C);
+        Value *Shift =
+            OldShift->getOpcode() == Instruction::AShr
+                ? Builder.CreateAShr(A, ShAmt, OldShift->getName(), IsExact)
+                : Builder.CreateLShr(A, ShAmt, OldShift->getName(), IsExact);
+        return CastInst::CreateTruncOrBitCast(Shift, DestTy);
+      }
+    }
+    break;
+  }
   default: break;
   }
 
@@ -873,26 +897,6 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
     // TODO: Mask high bits with 'and'.
   }
 
-  // trunc (*shr (trunc A), C) --> trunc(*shr A, C)
-  if (match(Src, m_OneUse(m_Shr(m_Trunc(m_Value(A)), m_Constant(C))))) {
-    unsigned MaxShiftAmt = SrcWidth - DestWidth;
-
-    // If the shift is small enough, all zero/sign bits created by the shift are
-    // removed by the trunc.
-    if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
-                                    APInt(SrcWidth, MaxShiftAmt)))) {
-      auto *OldShift = cast<Instruction>(Src);
-      bool IsExact = OldShift->isExact();
-      auto *ShAmt = ConstantExpr::getIntegerCast(C, A->getType(), true);
-      ShAmt = Constant::mergeUndefsWith(ShAmt, C);
-      Value *Shift =
-          OldShift->getOpcode() == Instruction::AShr
-              ? Builder.CreateAShr(A, ShAmt, OldShift->getName(), IsExact)
-              : Builder.CreateLShr(A, ShAmt, OldShift->getName(), IsExact);
-      return CastInst::CreateTruncOrBitCast(Shift, DestTy);
-    }
-  }
-
   if (Instruction *I = narrowBinOp(Trunc))
     return I;
 
@@ -971,7 +975,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
     Attribute Attr = Trunc.getFunction()->getFnAttribute(Attribute::VScaleRange);
     if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
-      if (Log2_32(MaxVScale.getValue()) < DestWidth) {
+      if (Log2_32(*MaxVScale) < DestWidth) {
        Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
        return replaceInstUsesWith(Trunc, VScale);
      }
@@ -986,13 +990,18 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext)
   // If we are just checking for a icmp eq of a single bit and zext'ing it
   // to an integer, then shift the bit to the appropriate place and then
   // cast to integer to avoid the comparison.
+
+  // FIXME: This set of transforms does not check for extra uses and/or creates
+  //        an extra instruction (an optional final cast is not included
+  //        in the transform comments). We may also want to favor icmp over
+  //        shifts in cases of equal instructions because icmp has better
+  //        analysis in general (invert the transform).
+
   const APInt *Op1CV;
   if (match(Cmp->getOperand(1), m_APInt(Op1CV))) {
 
     // zext (x <s  0) to i32 --> x>>u31      true if signbit set.
-    // zext (x >s -1) to i32 --> (x>>u31)^1  true if signbit clear.
-    if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isZero()) ||
-        (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnes())) {
+    if (Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isZero()) {
       Value *In = Cmp->getOperand(0);
       Value *Sh = ConstantInt::get(In->getType(),
                                    In->getType()->getScalarSizeInBits() - 1);
@@ -1000,11 +1009,6 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext)
       if (In->getType() != Zext.getType())
         In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/);
 
-      if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) {
-        Constant *One = ConstantInt::get(In->getType(), 1);
-        In = Builder.CreateXor(In, One, In->getName() + ".not");
-      }
-
       return replaceInstUsesWith(Zext, In);
     }
 
@@ -1080,7 +1084,7 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext)
       KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext);
       KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext);
 
-      if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
+      if (KnownLHS == KnownRHS) {
         APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
         APInt UnknownBit = ~KnownBits;
         if (UnknownBit.countPopulation() == 1) {
@@ -1343,7 +1347,7 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
     Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange);
     if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
       unsigned TypeWidth = Src->getType()->getScalarSizeInBits();
-      if (Log2_32(MaxVScale.getValue()) < TypeWidth) {
+      if (Log2_32(*MaxVScale) < TypeWidth) {
         Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
         return replaceInstUsesWith(CI, VScale);
       }
@@ -1506,10 +1510,8 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DestBitSize = DestTy->getScalarSizeInBits();
 
-  // If we know that the value being extended is positive, we can use a zext
-  // instead.
-  KnownBits Known = computeKnownBits(Src, 0, &CI);
-  if (Known.isNonNegative())
+  // If the value being extended is zero or positive, use a zext instead.
+  if (isKnownNonNegative(Src, DL, 0, &AC, &CI, &DT))
     return CastInst::Create(Instruction::ZExt, Src, DestTy);
 
   // Try to extend the entire expression tree to the wide destination type.
@@ -1597,14 +1599,20 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
 
   // Splatting a bit of constant-index across a value:
   // sext (ashr (trunc iN X to iM), M-1) to iN --> ashr (shl X, N-M), N-1
-  // TODO: If the dest type is different, use a cast (adjust use check).
+  // If the dest type is different, use a cast (adjust use check).
   if (match(Src, m_OneUse(m_AShr(m_Trunc(m_Value(X)),
-                                 m_SpecificInt(SrcBitSize - 1)))) &&
-      X->getType() == DestTy) {
-    Constant *ShlAmtC = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
-    Constant *AshrAmtC = ConstantInt::get(DestTy, DestBitSize - 1);
-    Value *Shl = Builder.CreateShl(X, ShlAmtC);
-    return BinaryOperator::CreateAShr(Shl, AshrAmtC);
+                                 m_SpecificInt(SrcBitSize - 1))))) {
+    Type *XTy = X->getType();
+    unsigned XBitSize = XTy->getScalarSizeInBits();
+    Constant *ShlAmtC = ConstantInt::get(XTy, XBitSize - SrcBitSize);
+    Constant *AshrAmtC = ConstantInt::get(XTy, XBitSize - 1);
+    if (XTy == DestTy)
+      return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShlAmtC),
+                                        AshrAmtC);
+    if (cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+      Value *Ashr = Builder.CreateAShr(Builder.CreateShl(X, ShlAmtC), AshrAmtC);
+      return CastInst::CreateIntegerCast(Ashr, DestTy, /* isSigned */ true);
+    }
   }
 
   if (match(Src, m_VScale(DL))) {
@@ -1612,7 +1620,7 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
         CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
       Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange);
       if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
-        if (Log2_32(MaxVScale.getValue()) < (SrcBitSize - 1)) {
+        if (Log2_32(*MaxVScale) < (SrcBitSize - 1)) {
           Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
           return replaceInstUsesWith(CI, VScale);
         }
@@ -1712,7 +1720,7 @@ static Type *getMinimumFPType(Value *V) {
 
 /// Return true if the cast from integer to FP can be proven to be exact for all
 /// possible inputs (the conversion does not lose any precision).
-static bool isKnownExactCastIntToFP(CastInst &I) {
+static bool isKnownExactCastIntToFP(CastInst &I, InstCombinerImpl &IC) {
   CastInst::CastOps Opcode = I.getOpcode();
   assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
          "Unexpected cast");
@@ -1749,6 +1757,12 @@ static bool isKnownExactCastIntToFP(CastInst &I, InstCombinerImpl &IC) {
   // TODO:
   // Try harder to find if the source integer type has less significant bits.
   // For example, compute number of sign bits or compute low bit mask.
+  KnownBits SrcKnown = IC.computeKnownBits(Src, 0, &I);
+  int LowBits =
+      (int)SrcTy->getScalarSizeInBits() - SrcKnown.countMinLeadingZeros();
+  if (LowBits <= DestNumSigBits)
+    return true;
+
   return false;
 }
 
@@ -1929,7 +1943,7 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
   Value *Src = FPT.getOperand(0);
   if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
     auto *FPCast = cast<CastInst>(Src);
-    if (isKnownExactCastIntToFP(*FPCast))
+    if (isKnownExactCastIntToFP(*FPCast, *this))
       return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
   }
 
@@ -1943,7 +1957,7 @@ Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
   Value *Src = FPExt.getOperand(0);
   if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
     auto *FPCast = cast<CastInst>(Src);
-    if (isKnownExactCastIntToFP(*FPCast))
+    if (isKnownExactCastIntToFP(*FPCast, *this))
       return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
   }
 
@@ -1970,13 +1984,13 @@ Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
   // This means this is also safe for a signed input and unsigned output, since
   // a negative input would lead to undefined behavior.
-  if (!isKnownExactCastIntToFP(*OpI)) {
+  if (!isKnownExactCastIntToFP(*OpI, *this)) {
     // The first cast may not round exactly based on the source integer width
     // and FP width, but the overflow UB rules can still allow this to fold.
@@ -1929,7 +1943,7 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
   Value *Src = FPT.getOperand(0);
   if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
     auto *FPCast = cast<CastInst>(Src);
-    if (isKnownExactCastIntToFP(*FPCast))
+    if (isKnownExactCastIntToFP(*FPCast, *this))
       return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
   }
 
@@ -1943,7 +1957,7 @@ Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
   Value *Src = FPExt.getOperand(0);
   if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
     auto *FPCast = cast<CastInst>(Src);
-    if (isKnownExactCastIntToFP(*FPCast))
+    if (isKnownExactCastIntToFP(*FPCast, *this))
       return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
   }
 
@@ -1970,13 +1984,13 @@ Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
   // This means this is also safe for a signed input and unsigned output, since
   // a negative input would lead to undefined behavior.
-  if (!isKnownExactCastIntToFP(*OpI)) {
+  if (!isKnownExactCastIntToFP(*OpI, *this)) {
     // The first cast may not round exactly based on the source integer width
     // and FP width, but the overflow UB rules can still allow this to fold.
     // If the destination type is narrow, that means the intermediate FP value
     // must be large enough to hold the source value exactly.
     // For example, (uint8_t)((float)(uint32_t 16777217) is undefined behavior.
-    int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
+    int OutputSize = (int)DestType->getScalarSizeInBits();
     if (OutputSize > OpI->getType()->getFPMantissaWidth())
       return nullptr;
   }
@@ -2150,14 +2164,10 @@ optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
   // Now that the element types match, get the shuffle mask and RHS of the
   // shuffle to use, which depends on whether we're increasing or decreasing the
   // size of the input.
-  SmallVector<int, 16> ShuffleMaskStorage;
+  auto ShuffleMaskStorage = llvm::to_vector<16>(llvm::seq<int>(0, SrcElts));
   ArrayRef<int> ShuffleMask;
   Value *V2;
 
-  // Produce an identify shuffle mask for the src vector.
-  ShuffleMaskStorage.resize(SrcElts);
-  std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0);
-
   if (SrcElts > DestElts) {
     // If we're shrinking the number of elements (rewriting an integer
     // truncate), just shuffle in the elements corresponding to the least
@@ -2278,6 +2288,8 @@ static bool collectInsertionElements(Value *V, unsigned Shift,
   switch (I->getOpcode()) {
   default: return false; // Unhandled case.
   case Instruction::BitCast:
+    if (I->getOperand(0)->getType()->isVectorTy())
+      return false;
     return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
                                     isBigEndian);
   case Instruction::ZExt:
@@ -2351,21 +2363,28 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
 /// usually not type-specific like scalar integer or scalar floating-point.
 static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
                                               InstCombinerImpl &IC) {
-  // TODO: Create and use a pattern matcher for ExtractElementInst.
-  auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
-  if (!ExtElt || !ExtElt->hasOneUse())
+  Value *VecOp, *Index;
+  if (!match(BitCast.getOperand(0),
+             m_OneUse(m_ExtractElt(m_Value(VecOp), m_Value(Index)))))
     return nullptr;
 
   // The bitcast must be to a vectorizable type, otherwise we can't make a new
   // type to extract from.
   Type *DestType = BitCast.getType();
-  if (!VectorType::isValidElementType(DestType))
-    return nullptr;
+  VectorType *VecType = cast<VectorType>(VecOp->getType());
+  if (VectorType::isValidElementType(DestType)) {
+    auto *NewVecType = VectorType::get(DestType, VecType);
+    auto *NewBC = IC.Builder.CreateBitCast(VecOp, NewVecType, "bc");
+    return ExtractElementInst::Create(NewBC, Index);
+  }
+
+  // Only solve DestType is vector to avoid inverse transform in visitBitCast.
+  // bitcast (extractelement <1 x elt>, dest) -> bitcast(<1 x elt>, dest)
+  auto *FixedVType = dyn_cast<FixedVectorType>(VecType);
+  if (DestType->isVectorTy() && FixedVType && FixedVType->getNumElements() == 1)
+    return CastInst::Create(Instruction::BitCast, VecOp, DestType);
 
-  auto *NewVecType = VectorType::get(DestType, ExtElt->getVectorOperandType());
-  auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
-                                         NewVecType, "bc");
-  return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
+  return nullptr;
 }
 
 /// Change the type of a bitwise logic operation if we can eliminate a bitcast.
@@ -2373,8 +2392,8 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
                                             InstCombiner::BuilderTy &Builder) {
   Type *DestTy = BitCast.getType();
   BinaryOperator *BO;
-  if (!DestTy->isIntOrIntVectorTy() ||
-      !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
+
+  if (!match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
       !BO->isBitwiseLogicOp())
     return nullptr;
 
@@ -2384,6 +2403,32 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
   if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
     return nullptr;
 
+  if (DestTy->isFPOrFPVectorTy()) {
+    Value *X, *Y;
+    // bitcast(logic(bitcast(X), bitcast(Y))) -> bitcast'(logic(bitcast'(X), Y))
+    if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
+        match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(Y))))) {
+      if (X->getType()->isFPOrFPVectorTy() &&
+          Y->getType()->isIntOrIntVectorTy()) {
+        Value *CastedOp =
+            Builder.CreateBitCast(BO->getOperand(0), Y->getType());
+        Value *NewBO = Builder.CreateBinOp(BO->getOpcode(), CastedOp, Y);
+        return CastInst::CreateBitOrPointerCast(NewBO, DestTy);
+      }
+      if (X->getType()->isIntOrIntVectorTy() &&
+          Y->getType()->isFPOrFPVectorTy()) {
+        Value *CastedOp =
+            Builder.CreateBitCast(BO->getOperand(1), X->getType());
+        Value *NewBO = Builder.CreateBinOp(BO->getOpcode(), CastedOp, X);
+        return CastInst::CreateBitOrPointerCast(NewBO, DestTy);
+      }
+    }
+    return nullptr;
+  }
+
+  if (!DestTy->isIntOrIntVectorTy())
+    return nullptr;
+
   Value *X;
   if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
       X->getType() == DestTy && !isa<Constant>(X)) {
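// ---------------------------------------------------------------------------
// A standalone model, not part of the patch: the new FP branch of
// foldBitCastBitwiseLogic keeps the bitwise logic in the integer domain with
// a single round-trip bitcast. The C equivalent of that shape, clearing a
// float's sign bit through an integer AND (AndBits is a made-up name):
#include <cassert>
#include <cstdint>
#include <cstring>

static float AndBits(float F, uint32_t Mask) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // bitcast float -> i32
  Bits &= Mask;                         // the bitwise logic op
  std::memcpy(&F, &Bits, sizeof(F));    // bitcast i32 -> float
  return F;
}

int main() {
  assert(AndBits(-2.5f, 0x7fffffffu) == 2.5f); // fabs via sign-bit clear
  return 0;
}
// ---------------------------------------------------------------------------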
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index e45be5745fcc..d1f89973caa1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -17,13 +17,11 @@
 #include "llvm/Analysis/CmpInstAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 
@@ -105,10 +103,14 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
 ///
 /// If AndCst is non-null, then the loaded value is masked with that constant
 /// before doing the comparison. This handles cases like "A[i]&4 == 0".
-Instruction *
-InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
-                                               GlobalVariable *GV, CmpInst &ICI,
-                                               ConstantInt *AndCst) {
+Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
+    LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI,
+    ConstantInt *AndCst) {
+  if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() ||
+      GV->getValueType() != GEP->getSourceElementType() ||
+      !GV->isConstant() || !GV->hasDefinitiveInitializer())
+    return nullptr;
+
   Constant *Init = GV->getInitializer();
   if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
     return nullptr;
@@ -188,8 +190,11 @@ InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
     if (!Elt)
      return nullptr;
 
     // If this is indexing an array of structures, get the structure element.
-    if (!LaterIndices.empty())
-      Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
+    if (!LaterIndices.empty()) {
+      Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices);
+      if (!Elt)
+        return nullptr;
+    }
 
     // If the element is masked, handle it.
     if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
@@ -757,7 +762,7 @@ getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) {
       V = GEP->getOperand(0);
       Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
       Index = ConstantExpr::getAdd(
-          Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+          Index, ConstantExpr::getSExtOrTrunc(GEPIndex, IndexType));
       continue;
     }
     break;
@@ -887,7 +892,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     if (PtrBase != GEPRHS->getOperand(0)) {
       bool IndicesTheSame =
           GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
-          GEPLHS->getType() == GEPRHS->getType() &&
+          GEPLHS->getPointerOperand()->getType() ==
+              GEPRHS->getPointerOperand()->getType() &&
           GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType();
       if (IndicesTheSame)
         for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
@@ -950,7 +956,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
 
     bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
-    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
+        GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
       // If the GEPs only differ by one index, compare it.
       unsigned NumDifferences = 0; // Keep track of # differences.
       unsigned DiffOperand = 0;    // The operand that differs.
@@ -1001,8 +1008,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 }
 
 Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI,
-                                             const AllocaInst *Alloca,
-                                             const Value *Other) {
+                                             const AllocaInst *Alloca) {
   assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
 
   // It would be tempting to fold away comparisons between allocas and any
@@ -1071,10 +1077,9 @@ Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI,
     }
   }
 
-  Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
-  return replaceInstUsesWith(
-      ICI,
-      ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
+  auto *Res = ConstantInt::get(ICI.getType(),
+                               !CmpInst::isTrueWhenEqual(ICI.getPredicate()));
+  return replaceInstUsesWith(ICI, Res);
 }
 
 /// Fold "icmp pred (X+C), X".
@@ -1376,8 +1381,7 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
   // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
   if (Pred == ICmpInst::ICMP_SGT) {
     Value *A, *B;
-    SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
-    if (SPR.Flavor == SPF_SMIN) {
+    if (match(Cmp.getOperand(0), m_SMin(m_Value(A), m_Value(B)))) {
       if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
         return new ICmpInst(Pred, B, Cmp.getOperand(1));
       if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
@@ -1530,7 +1534,7 @@ Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
   return nullptr;
 }
 
-/// Fold icmp (trunc X, Y), C.
+/// Fold icmp (trunc X), C.
 Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
                                                      TruncInst *Trunc,
                                                      const APInt &C) {
@@ -1547,6 +1551,16 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
   unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
            SrcBits = X->getType()->getScalarSizeInBits();
   if (Cmp.isEquality() && Trunc->hasOneUse()) {
+    // Canonicalize to a mask and wider compare if the wide type is suitable:
+    // (trunc X to i8) == C --> (X & 0xff) == (zext C)
+    if (!X->getType()->isVectorTy() && shouldChangeType(DstBits, SrcBits)) {
+      Constant *Mask = ConstantInt::get(X->getType(),
+                                        APInt::getLowBitsSet(SrcBits, DstBits));
+      Value *And = Builder.CreateAnd(X, Mask);
+      Constant *WideC = ConstantInt::get(X->getType(), C.zext(SrcBits));
+      return new ICmpInst(Pred, And, WideC);
+    }
+
     // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
     // of the high bits truncated out of x are known.
     KnownBits Known = computeKnownBits(X, 0, &Cmp);
@@ -1865,15 +1879,13 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
   // Try to optimize things like "A[i] & 42 == 0" to index computations.
   Value *X = And->getOperand(0);
   Value *Y = And->getOperand(1);
-  if (auto *LI = dyn_cast<LoadInst>(X))
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
-      if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-        if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
-            !LI->isVolatile() && isa<ConstantInt>(Y)) {
-          ConstantInt *C2 = cast<ConstantInt>(Y);
-          if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
+  if (auto *C2 = dyn_cast<ConstantInt>(Y))
+    if (auto *LI = dyn_cast<LoadInst>(X))
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+          if (Instruction *Res =
+                  foldCmpLoadFromIndexedGlobal(LI, GEP, GV, Cmp, C2))
             return Res;
-        }
 
   if (!Cmp.isEquality())
     return nullptr;
@@ -2216,22 +2228,41 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
   if (Cmp.isEquality() && Shr->isExact() && C.isZero())
     return new ICmpInst(Pred, X, Cmp.getOperand(1));
 
-  const APInt *ShiftVal;
-  if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
-    return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
-
-  const APInt *ShiftAmt;
-  if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
+  bool IsAShr = Shr->getOpcode() == Instruction::AShr;
+  const APInt *ShiftValC;
+  if (match(Shr->getOperand(0), m_APInt(ShiftValC))) {
+    if (Cmp.isEquality())
+      return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftValC);
+
+    // If the shifted constant is a power-of-2, test the shift amount directly:
+    // (ShiftValC >> X) >u C --> X <u (LZ(C) - LZ(ShiftValC))
+    // (ShiftValC >> X) <u C --> X >=u (LZ(C-1) - LZ(ShiftValC))
+    if (!IsAShr && ShiftValC->isPowerOf2() &&
+        (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_ULT)) {
+      bool IsUGT = Pred == CmpInst::ICMP_UGT;
+      assert(ShiftValC->uge(C) && "Expected simplify of compare");
+      assert((IsUGT || !C.isZero()) && "Expected X u< 0 to simplify");
+
+      unsigned CmpLZ =
+          IsUGT ? C.countLeadingZeros() : (C - 1).countLeadingZeros();
+      unsigned ShiftLZ = ShiftValC->countLeadingZeros();
+      Constant *NewC = ConstantInt::get(Shr->getType(), CmpLZ - ShiftLZ);
+      auto NewPred = IsUGT ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+      return new ICmpInst(NewPred, Shr->getOperand(1), NewC);
+    }
+  }
+
+  const APInt *ShiftAmtC;
+  if (!match(Shr->getOperand(1), m_APInt(ShiftAmtC)))
     return nullptr;
 
   // Check that the shift amount is in range. If not, don't perform undefined
   // shifts. When the shift is visited it will be simplified.
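// ---------------------------------------------------------------------------
// A standalone check, not part of the patch, of the shifted-power-of-2 fold
// above at 32-bit width: (32 >> X) >u 3 must agree with
// X <u (LZ(3) - LZ(32)) = 30 - 26 = 4 for every shift amount.
#include <cassert>
#include <bit> // std::countl_zero, C++20

int main() {
  const unsigned ShiftValC = 32, C = 3;
  const unsigned Bound = std::countl_zero(C) - std::countl_zero(ShiftValC);
  assert(Bound == 4);
  for (unsigned X = 0; X < 32; ++X)
    assert(((ShiftValC >> X) > C) == (X < Bound));
  return 0;
}
// ---------------------------------------------------------------------------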
   unsigned TypeBits = C.getBitWidth();
-  unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
+  unsigned ShAmtVal = ShiftAmtC->getLimitedValue(TypeBits);
   if (ShAmtVal >= TypeBits || ShAmtVal == 0)
     return nullptr;
 
-  bool IsAShr = Shr->getOpcode() == Instruction::AShr;
   bool IsExact = Shr->isExact();
   Type *ShrTy = Shr->getType();
   // TODO: If we could guarantee that InstSimplify would handle all of the
@@ -2256,8 +2287,11 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
     }
     if (Pred == CmpInst::ICMP_UGT) {
       // icmp ugt (ashr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
+      // 'C + 1 << ShAmtC' can overflow as a signed number, so the 2nd
+      // clause accounts for that pattern.
       APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
-      if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
+      if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1) ||
+          (C + 1).shl(ShAmtVal).isMinSignedValue())
         return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
     }
 
@@ -2337,7 +2371,8 @@ Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
   // constant power-of-2 value:
   // (X % pow2C) sgt/slt 0
   const ICmpInst::Predicate Pred = Cmp.getPredicate();
-  if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT)
+  if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT &&
+      Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
     return nullptr;
 
   // TODO: The one-use check is standard because we do not typically want to
@@ -2347,7 +2382,15 @@ Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
     return nullptr;
 
   const APInt *DivisorC;
-  if (!C.isZero() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
+  if (!match(SRem->getOperand(1), m_Power2(DivisorC)))
+    return nullptr;
+
+  // For cmp_sgt/cmp_slt only zero valued C is handled.
+  // For cmp_eq/cmp_ne only positive valued C is handled.
+  if (((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT) &&
+       !C.isZero()) ||
+      ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+       !C.isStrictlyPositive()))
     return nullptr;
 
   // Mask off the sign bit and the modulo bits (low-bits).
@@ -2356,6 +2399,9 @@ Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
   Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1));
   Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC);
 
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    return new ICmpInst(Pred, And, ConstantInt::get(Ty, C));
+
   // For 'is positive?' check that the sign-bit is clear and at least 1 masked
   // bit is set. Example:
   // (i8 X % 32) s> 0 --> (X & 159) s> 0
@@ -2372,26 +2418,30 @@ Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
 Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp,
                                                     BinaryOperator *UDiv,
                                                     const APInt &C) {
+  ICmpInst::Predicate Pred = Cmp.getPredicate();
+  Value *X = UDiv->getOperand(0);
+  Value *Y = UDiv->getOperand(1);
+  Type *Ty = UDiv->getType();
+
   const APInt *C2;
-  if (!match(UDiv->getOperand(0), m_APInt(C2)))
+  if (!match(X, m_APInt(C2)))
     return nullptr;
 
   assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
 
   // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
-  Value *Y = UDiv->getOperand(1);
-  if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
+  if (Pred == ICmpInst::ICMP_UGT) {
     assert(!C.isMaxValue() &&
            "icmp ugt X, UINT_MAX should have been simplified already.");
     return new ICmpInst(ICmpInst::ICMP_ULE, Y,
-                        ConstantInt::get(Y->getType(), C2->udiv(C + 1)));
+                        ConstantInt::get(Ty, C2->udiv(C + 1)));
   }
 
   // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
-  if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
+  if (Pred == ICmpInst::ICMP_ULT) {
     assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
     return new ICmpInst(ICmpInst::ICMP_UGT, Y,
-                        ConstantInt::get(Y->getType(), C2->udiv(C)));
+                        ConstantInt::get(Ty, C2->udiv(C)));
   }
 
   return nullptr;
@@ -2401,6 +2451,28 @@ Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp,
 Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
                                                    BinaryOperator *Div,
                                                    const APInt &C) {
+  ICmpInst::Predicate Pred = Cmp.getPredicate();
+  Value *X = Div->getOperand(0);
+  Value *Y = Div->getOperand(1);
+  Type *Ty = Div->getType();
+  bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
+
+  // If unsigned division and the compare constant is bigger than
+  // UMAX/2 (negative), there's only one pair of values that satisfies an
+  // equality check, so eliminate the division:
+  // (X u/ Y) == C --> (X == C) && (Y == 1)
+  // (X u/ Y) != C --> (X != C) || (Y != 1)
+  // Similarly, if signed division and the compare constant is exactly SMIN:
+  // (X s/ Y) == SMIN --> (X == SMIN) && (Y == 1)
+  // (X s/ Y) != SMIN --> (X != SMIN) || (Y != 1)
+  if (Cmp.isEquality() && Div->hasOneUse() && C.isSignBitSet() &&
+      (!DivIsSigned || C.isMinSignedValue())) {
+    Value *XBig = Builder.CreateICmp(Pred, X, ConstantInt::get(Ty, C));
+    Value *YOne = Builder.CreateICmp(Pred, Y, ConstantInt::get(Ty, 1));
+    auto Logic = Pred == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+    return BinaryOperator::Create(Logic, XBig, YOne);
+  }
+
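// ---------------------------------------------------------------------------
// A standalone exhaustive check, not part of the patch, of the new equality
// elimination above at 8 bits: with a compare constant whose sign bit is set,
// X u/ Y can only equal it when Y is 1 and X is the constant itself.
#include <cassert>

int main() {
  const unsigned C = 0x90; // > UMAX/2 for i8
  for (unsigned X = 0; X <= 255; ++X)
    for (unsigned Y = 1; Y <= 255; ++Y) // Y == 0 is UB for udiv; skipped
      assert(((X / Y) == C) == (X == C && Y == 1));
  return 0;
}
// ---------------------------------------------------------------------------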
   // Fold: icmp pred ([us]div X, C2), C -> range test
   // Fold this div into the comparison, producing a range check.
   // Determine, based on the divide type, what the range is being
   // checked.  If there is an overflow on the low or high side, remember
   // it, otherwise compute the range [low, hi) bounding the new value.
   // See: InsertRangeTest above for the kinds of replacements possible.
   const APInt *C2;
-  if (!match(Div->getOperand(1), m_APInt(C2)))
+  if (!match(Y, m_APInt(C2)))
     return nullptr;
 
   // FIXME: If the operand types don't match the type of the divide
@@ -2419,7 +2491,6 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
   // (x /u C2) <u C.  Simply casting the operands and result won't
   // work. :(  The if statement below tests that condition and bails
   // if it finds it.
-  bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
   if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
     return nullptr;
@@ -2441,8 +2512,6 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
   // instruction that we're folding.
   bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
 
-  ICmpInst::Predicate Pred = Cmp.getPredicate();
-
   // If the division is known to be exact, then there is no remainder from the
   // divide, so the covered range size is unit, otherwise it is the divisor.
   APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
@@ -2457,7 +2526,7 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
   int LoOverflow = 0, HiOverflow = 0;
   APInt LoBound, HiBound;
 
-  if (!DivIsSigned) {  // udiv
+  if (!DivIsSigned) { // udiv
     // e.g. X/5 op 3  --> [15, 20)
     LoBound = Prod;
     HiOverflow = LoOverflow = ProdOV;
@@ -2472,7 +2541,7 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
       LoBound = -(RangeSize - 1);
       HiBound = RangeSize;
     } else if (C.isStrictlyPositive()) { // (X / pos) op pos
-      LoBound = Prod;     // e.g.   X/5 op 3 --> [15, 20)
+      LoBound = Prod; // e.g.   X/5 op 3 --> [15, 20)
       HiOverflow = LoOverflow = ProdOV;
       if (!HiOverflow)
         HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
@@ -2492,18 +2561,19 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
       // e.g. X/-5 op 0  --> [-4, 5)
       LoBound = RangeSize + 1;
       HiBound = -RangeSize;
-      if (HiBound == *C2) {        // -INTMIN = INTMIN
-        HiOverflow = 1;            // [INTMIN+1, overflow)
-        HiBound = APInt();         // e.g. X/INTMIN = 0 --> X > INTMIN
+      if (HiBound == *C2) { // -INTMIN = INTMIN
+        HiOverflow = 1;     // [INTMIN+1, overflow)
+        HiBound = APInt();  // e.g. X/INTMIN = 0 --> X > INTMIN
       }
     } else if (C.isStrictlyPositive()) { // (X / neg) op pos
       // e.g. X/-5 op 3  --> [-19, -14)
       HiBound = Prod + 1;
       HiOverflow = LoOverflow = ProdOV ? -1 : 0;
       if (!LoOverflow)
-        LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
-    } else {                       // (X / neg) op neg
-      LoBound = Prod;       // e.g. X/-5 op -3  --> [15, 20)
+        LoOverflow =
+            addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1 : 0;
+    } else { // (X / neg) op neg
+      LoBound = Prod; // e.g. X/-5 op -3  --> [15, 20)
       LoOverflow = HiOverflow = ProdOV;
       if (!HiOverflow)
         HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
@@ -2513,54 +2583,47 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
     Pred = ICmpInst::getSwappedPredicate(Pred);
   }
 
-  Value *X = Div->getOperand(0);
   switch (Pred) {
-  default: llvm_unreachable("Unhandled icmp opcode!");
-  case ICmpInst::ICMP_EQ:
-    if (LoOverflow && HiOverflow)
-      return replaceInstUsesWith(Cmp, Builder.getFalse());
-    if (HiOverflow)
-      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
-                          ICmpInst::ICMP_UGE, X,
-                          ConstantInt::get(Div->getType(), LoBound));
-    if (LoOverflow)
-      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
-                          ICmpInst::ICMP_ULT, X,
-                          ConstantInt::get(Div->getType(), HiBound));
-    return replaceInstUsesWith(
-        Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
-  case ICmpInst::ICMP_NE:
-    if (LoOverflow && HiOverflow)
-      return replaceInstUsesWith(Cmp, Builder.getTrue());
-    if (HiOverflow)
-      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
-                          ICmpInst::ICMP_ULT, X,
-                          ConstantInt::get(Div->getType(), LoBound));
-    if (LoOverflow)
-      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
-                          ICmpInst::ICMP_UGE, X,
-                          ConstantInt::get(Div->getType(), HiBound));
-    return replaceInstUsesWith(Cmp,
-                               insertRangeTest(X, LoBound, HiBound,
-                                               DivIsSigned, false));
-  case ICmpInst::ICMP_ULT:
-  case ICmpInst::ICMP_SLT:
-    if (LoOverflow == +1)   // Low bound is greater than input range.
-      return replaceInstUsesWith(Cmp, Builder.getTrue());
-    if (LoOverflow == -1)   // Low bound is less than input range.
-      return replaceInstUsesWith(Cmp, Builder.getFalse());
-    return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
-  case ICmpInst::ICMP_UGT:
-  case ICmpInst::ICMP_SGT:
-    if (HiOverflow == +1)       // High bound greater than input range.
-      return replaceInstUsesWith(Cmp, Builder.getFalse());
-    if (HiOverflow == -1)       // High bound less than input range.
-      return replaceInstUsesWith(Cmp, Builder.getTrue());
-    if (Pred == ICmpInst::ICMP_UGT)
-      return new ICmpInst(ICmpInst::ICMP_UGE, X,
-                          ConstantInt::get(Div->getType(), HiBound));
-    return new ICmpInst(ICmpInst::ICMP_SGE, X,
-                        ConstantInt::get(Div->getType(), HiBound));
+  default:
+    llvm_unreachable("Unhandled icmp predicate!");
+  case ICmpInst::ICMP_EQ:
+    if (LoOverflow && HiOverflow)
+      return replaceInstUsesWith(Cmp, Builder.getFalse());
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE,
+                          X, ConstantInt::get(Ty, LoBound));
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                          X, ConstantInt::get(Ty, HiBound));
+    return replaceInstUsesWith(
+        Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
+  case ICmpInst::ICMP_NE:
+    if (LoOverflow && HiOverflow)
+      return replaceInstUsesWith(Cmp, Builder.getTrue());
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                          X, ConstantInt::get(Ty, LoBound));
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE,
+                          X, ConstantInt::get(Ty, HiBound));
+    return replaceInstUsesWith(
+        Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, false));
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    if (LoOverflow == +1) // Low bound is greater than input range.
+      return replaceInstUsesWith(Cmp, Builder.getTrue());
+    if (LoOverflow == -1) // Low bound is less than input range.
+      return replaceInstUsesWith(Cmp, Builder.getFalse());
+    return new ICmpInst(Pred, X, ConstantInt::get(Ty, LoBound));
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    if (HiOverflow == +1) // High bound greater than input range.
+      return replaceInstUsesWith(Cmp, Builder.getFalse());
+    if (HiOverflow == -1) // High bound less than input range.
+      return replaceInstUsesWith(Cmp, Builder.getTrue());
+    if (Pred == ICmpInst::ICMP_UGT)
+      return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, HiBound));
+    return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, HiBound));
   }
 
   return nullptr;
@@ -2593,18 +2656,24 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
       !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
     return new ICmpInst(SwappedPred, Y, ConstantInt::get(Ty, SubResult));
 
+  // X - Y == 0 --> X == Y.
+  // X - Y != 0 --> X != Y.
+  // TODO: We allow this with multiple uses as long as the other uses are not
+  //       in phis. The phi use check is guarding against a codegen regression
+  //       for a loop test. If the backend could undo this (and possibly
+  //       subsequent transforms), we would not need this hack.
+  if (Cmp.isEquality() && C.isZero() &&
+      none_of((Sub->users()), [](const User *U) { return isa<PHINode>(U); }))
+    return new ICmpInst(Pred, X, Y);
+
   // The following transforms are only worth it if the only user of the subtract
   // is the icmp.
   // TODO: This is an artificial restriction for all of the transforms below
-  //       that only need a single replacement icmp.
+  //       that only need a single replacement icmp. Can these use the phi test
+  //       like the transform above here?
   if (!Sub->hasOneUse())
     return nullptr;
 
-  // X - Y == 0 --> X == Y.
-  // X - Y != 0 --> X != Y.
-  if (Cmp.isEquality() && C.isZero())
-    return new ICmpInst(Pred, X, Y);
-
   if (Sub->hasNoSignedWrap()) {
     // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
     if (Pred == ICmpInst::ICMP_SGT && C.isAllOnes())
@@ -2855,10 +2924,13 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
   ICmpInst::Predicate Pred = Cmp.getPredicate();
   Value *Op1 = Cmp.getOperand(1);
   Value *BCSrcOp = Bitcast->getOperand(0);
+  Type *SrcType = Bitcast->getSrcTy();
+  Type *DstType = Bitcast->getType();
 
-  // Make sure the bitcast doesn't change the number of vector elements.
-  if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
-      Bitcast->getDestTy()->getScalarSizeInBits()) {
+  // Make sure the bitcast doesn't change between scalar and vector and
+  // doesn't change the number of vector elements.
+  if (SrcType->isVectorTy() == DstType->isVectorTy() &&
+      SrcType->getScalarSizeInBits() == DstType->getScalarSizeInBits()) {
     // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
     Value *X;
     if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
@@ -2903,8 +2975,7 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
       Type *XType = X->getType();
 
       // We can't currently handle Power style floating point operations here.
-      if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
-
+      if (!(XType->isPPC_FP128Ty() || SrcType->isPPC_FP128Ty())) {
         Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
         if (auto *XVTy = dyn_cast<VectorType>(XType))
           NewType = VectorType::get(NewType, XVTy->getElementCount());
@@ -2922,21 +2993,19 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
 
   // Test to see if the operands of the icmp are casted versions of other
   // values. If the ptr->ptr cast can be stripped off both arguments, do so.
-  if (Bitcast->getType()->isPointerTy() &&
-      (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+  if (DstType->isPointerTy() && (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
     // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
     // so eliminate it as well.
     if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
       Op1 = BC2->getOperand(0);
 
-    Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
+    Op1 = Builder.CreateBitCast(Op1, SrcType);
     return new ICmpInst(Pred, BCSrcOp, Op1);
   }
 
   const APInt *C;
-  if (!match(Cmp.getOperand(1), m_APInt(C)) ||
-      !Bitcast->getType()->isIntegerTy() ||
-      !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+  if (!match(Cmp.getOperand(1), m_APInt(C)) || !DstType->isIntegerTy() ||
+      !SrcType->isIntOrIntVectorTy())
    return nullptr;
 
   // If this is checking if all elements of a vector compare are set or not,
@@ -2948,9 +3017,8 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
   // TODO: Try harder to reduce compare of 2 freely invertible operands?
   if (Cmp.isEquality() && C->isAllOnes() && Bitcast->hasOneUse() &&
       isFreeToInvert(BCSrcOp, BCSrcOp->hasOneUse())) {
-    Type *ScalarTy = Bitcast->getType();
-    Value *Cast = Builder.CreateBitCast(Builder.CreateNot(BCSrcOp), ScalarTy);
-    return new ICmpInst(Pred, Cast, ConstantInt::getNullValue(ScalarTy));
+    Value *Cast = Builder.CreateBitCast(Builder.CreateNot(BCSrcOp), DstType);
+    return new ICmpInst(Pred, Cast, ConstantInt::getNullValue(DstType));
   }
 
   // If this is checking if all elements of an extended vector are clear or not,
@@ -2978,7 +3046,7 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
   if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
     // Check whether every element of Mask is the same constant
     if (is_splat(Mask)) {
-      auto *VecTy = cast<VectorType>(BCSrcOp->getType());
+      auto *VecTy = cast<VectorType>(SrcType);
       auto *EltTy = cast<IntegerType>(VecTy->getElementType());
       if (C->isSplat(EltTy->getBitWidth())) {
         // Fold the icmp based on the value of C
@@ -3000,83 +3068,31 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
 /// where X is some kind of instruction.
 Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) {
   const APInt *C;
-  if (!match(Cmp.getOperand(1), m_APInt(C)))
-    return nullptr;
 
-  if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
-    switch (BO->getOpcode()) {
-    case Instruction::Xor:
-      if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::And:
-      if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::Or:
-      if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::Mul:
-      if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::Shl:
-      if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::LShr:
-    case Instruction::AShr:
-      if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::SRem:
-      if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::UDiv:
-      if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
-        return I;
-      LLVM_FALLTHROUGH;
-    case Instruction::SDiv:
-      if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
+  if (match(Cmp.getOperand(1), m_APInt(C))) {
+    if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0)))
+      if (Instruction *I = foldICmpBinOpWithConstant(Cmp, BO, *C))
         return I;
-      break;
-    case Instruction::Sub:
-      if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
-        return I;
-      break;
-    case Instruction::Add:
-      if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
-        return I;
-      break;
-    default:
-      break;
-    }
 
-    // TODO: These folds could be refactored to be part of the above calls.
-    if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
-      return I;
-  }
-
-  // Match against CmpInst LHS being instructions other than binary operators.
+    if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0)))
+      // For now, we only support constant integers while folding the
+      // ICMP(SELECT)) pattern. We can extend this to support vector of integers
+      // similar to the cases handled by binary ops above.
+      if (auto *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
+        if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
+          return I;
 
-  if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
-    // For now, we only support constant integers while folding the
-    // ICMP(SELECT)) pattern. We can extend this to support vector of integers
-    // similar to the cases handled by binary ops above.
-    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
-      if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
+    if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0)))
+      if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
         return I;
-  }
 
-  if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
-    if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
-      return I;
+    if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
+      if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
+        return I;
   }
 
-  if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
-    if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
-      return I;
+  if (match(Cmp.getOperand(1), m_APIntAllowUndef(C)))
+    return foldICmpInstWithConstantAllowUndef(Cmp, *C);
 
   return nullptr;
 }
@@ -3233,12 +3249,6 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
   case Intrinsic::fshl:
   case Intrinsic::fshr:
     if (II->getArgOperand(0) == II->getArgOperand(1)) {
-      // (rot X, ?) == 0/-1 --> X == 0/-1
-      // TODO: This transform is safe to re-use undef elts in a vector, but
-      //       the constant value passed in by the caller doesn't allow that.
-      if (C.isZero() || C.isAllOnes())
-        return new ICmpInst(Pred, II->getArgOperand(0), Cmp.getOperand(1));
-
       const APInt *RotAmtC;
       // ror(X, RotAmtC) == C --> X == rol(C, RotAmtC)
       // rol(X, RotAmtC) == C --> X == ror(C, RotAmtC)
@@ -3311,6 +3321,89 @@ static Instruction *foldICmpIntrinsicWithIntrinsic(ICmpInst &Cmp) {
   return nullptr;
 }
 
+/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
+/// where X is some kind of instruction and C is AllowUndef.
+/// TODO: Move more folds which allow undef to this function.
+Instruction *
+InstCombinerImpl::foldICmpInstWithConstantAllowUndef(ICmpInst &Cmp,
+                                                     const APInt &C) {
+  const ICmpInst::Predicate Pred = Cmp.getPredicate();
+  if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0))) {
+    switch (II->getIntrinsicID()) {
+    default:
+      break;
+    case Intrinsic::fshl:
+    case Intrinsic::fshr:
+      if (Cmp.isEquality() && II->getArgOperand(0) == II->getArgOperand(1)) {
+        // (rot X, ?) == 0/-1 --> X == 0/-1
+        if (C.isZero() || C.isAllOnes())
+          return new ICmpInst(Pred, II->getArgOperand(0), Cmp.getOperand(1));
+      }
+      break;
+    }
+  }
+
+  return nullptr;
+}
+
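// ---------------------------------------------------------------------------
// A standalone check, not part of the patch: the rotate fold moved into
// foldICmpInstWithConstantAllowUndef relies on rotation being a permutation
// of bits, so all-zeros and all-ones are the only compare constants that are
// fixed points for every rotate amount. Exhaustive at 8 bits (Rol8 is a
// made-up helper):
#include <cassert>
#include <cstdint>

static uint8_t Rol8(uint8_t X, unsigned R) {
  R &= 7;
  return (uint8_t)((X << R) | (X >> ((8 - R) & 7)));
}

int main() {
  for (unsigned X = 0; X <= 255; ++X)
    for (unsigned R = 0; R < 8; ++R) {
      assert((Rol8((uint8_t)X, R) == 0x00) == (X == 0x00));
      assert((Rol8((uint8_t)X, R) == 0xff) == (X == 0xff));
    }
  return 0;
}
// ---------------------------------------------------------------------------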
+/// Fold an icmp with BinaryOp and constant operand: icmp Pred BO, C.
+Instruction *InstCombinerImpl::foldICmpBinOpWithConstant(ICmpInst &Cmp,
+                                                         BinaryOperator *BO,
+                                                         const APInt &C) {
+  switch (BO->getOpcode()) {
+  case Instruction::Xor:
+    if (Instruction *I = foldICmpXorConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::And:
+    if (Instruction *I = foldICmpAndConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::Or:
+    if (Instruction *I = foldICmpOrConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::Mul:
+    if (Instruction *I = foldICmpMulConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::Shl:
+    if (Instruction *I = foldICmpShlConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Instruction *I = foldICmpShrConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::SRem:
+    if (Instruction *I = foldICmpSRemConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::UDiv:
+    if (Instruction *I = foldICmpUDivConstant(Cmp, BO, C))
+      return I;
+    LLVM_FALLTHROUGH;
+  case Instruction::SDiv:
+    if (Instruction *I = foldICmpDivConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::Sub:
+    if (Instruction *I = foldICmpSubConstant(Cmp, BO, C))
+      return I;
+    break;
+  case Instruction::Add:
+    if (Instruction *I = foldICmpAddConstant(Cmp, BO, C))
+      return I;
+    break;
+  default:
+    break;
+  }
+
+  // TODO: These folds could be refactored to be part of the above calls.
+  return foldICmpBinOpEqualityWithConstant(Cmp, BO, C);
+}
+
 /// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
 Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
                                                              IntrinsicInst *II,
@@ -3406,64 +3499,6 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
     if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
       return NV;
     break;
-  case Instruction::Select: {
-    // If either operand of the select is a constant, we can fold the
-    // comparison into the select arms, which will cause one to be
-    // constant folded and the select turned into a bitwise or.
-    Value *Op1 = nullptr, *Op2 = nullptr;
-    ConstantInt *CI = nullptr;
-
-    auto SimplifyOp = [&](Value *V) {
-      Value *Op = nullptr;
-      if (Constant *C = dyn_cast<Constant>(V)) {
-        Op = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
-      } else if (RHSC->isNullValue()) {
-        // If null is being compared, check if it can be further simplified.
-        Op = SimplifyICmpInst(I.getPredicate(), V, RHSC, SQ);
-      }
-      return Op;
-    };
-    Op1 = SimplifyOp(LHSI->getOperand(1));
-    if (Op1)
-      CI = dyn_cast<ConstantInt>(Op1);
-
-    Op2 = SimplifyOp(LHSI->getOperand(2));
-    if (Op2)
-      CI = dyn_cast<ConstantInt>(Op2);
-
-    // We only want to perform this transformation if it will not lead to
-    // additional code. This is true if either both sides of the select
-    // fold to a constant (in which case the icmp is replaced with a select
-    // which will usually simplify) or this is the only user of the
-    // select (in which case we are trading a select+icmp for a simpler
-    // select+icmp) or all uses of the select can be replaced based on
-    // dominance information ("Global cases").
-    bool Transform = false;
-    if (Op1 && Op2)
-      Transform = true;
-    else if (Op1 || Op2) {
-      // Local case
-      if (LHSI->hasOneUse())
-        Transform = true;
-      // Global cases
-      else if (CI && !CI->isZero())
-        // When Op1 is constant try replacing select with second operand.
-        // Otherwise Op2 is constant and try replacing select with first
-        // operand.
-        Transform =
-            replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
-    }
-    if (Transform) {
-      if (!Op1)
-        Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
-                                 I.getName());
-      if (!Op2)
-        Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
-                                 I.getName());
-      return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
-    }
-    break;
-  }
   case Instruction::IntToPtr:
     // icmp pred inttoptr(X), null -> icmp pred X, 0
     if (RHSC->isNullValue() &&
@@ -3476,19 +3511,72 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
   case Instruction::Load:
     // Try to optimize things like "A[i] > 4" to index computations.
     if (GetElementPtrInst *GEP =
-            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-        if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
-            !cast<LoadInst>(LHSI)->isVolatile())
-          if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
-            return Res;
-    }
+        if (Instruction *Res =
+                foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, GV, I))
+          return Res;
     break;
   }
 
   return nullptr;
 }
 
+Instruction *InstCombinerImpl::foldSelectICmp(ICmpInst::Predicate Pred,
+                                              SelectInst *SI, Value *RHS,
+                                              const ICmpInst &I) {
+  // Try to fold the comparison into the select arms, which will cause the
+  // select to be converted into a logical and/or.
+  auto SimplifyOp = [&](Value *Op, bool SelectCondIsTrue) -> Value * {
+    if (Value *Res = simplifyICmpInst(Pred, Op, RHS, SQ))
+      return Res;
+    if (Optional<bool> Impl = isImpliedCondition(SI->getCondition(), Pred, Op,
+                                                 RHS, DL, SelectCondIsTrue))
+      return ConstantInt::get(I.getType(), *Impl);
+    return nullptr;
+  };
+
+  ConstantInt *CI = nullptr;
+  Value *Op1 = SimplifyOp(SI->getOperand(1), true);
+  if (Op1)
+    CI = dyn_cast<ConstantInt>(Op1);
+
+  Value *Op2 = SimplifyOp(SI->getOperand(2), false);
+  if (Op2)
+    CI = dyn_cast<ConstantInt>(Op2);
+
+  // We only want to perform this transformation if it will not lead to
+  // additional code. This is true if either both sides of the select
+  // fold to a constant (in which case the icmp is replaced with a select
+  // which will usually simplify) or this is the only user of the
+  // select (in which case we are trading a select+icmp for a simpler
+  // select+icmp) or all uses of the select can be replaced based on
+  // dominance information ("Global cases").
+  bool Transform = false;
+  if (Op1 && Op2)
+    Transform = true;
+  else if (Op1 || Op2) {
+    // Local case
+    if (SI->hasOneUse())
+      Transform = true;
+    // Global cases
+    else if (CI && !CI->isZero())
+      // When Op1 is constant try replacing select with second operand.
+      // Otherwise Op2 is constant and try replacing select with first
+      // operand.
+      Transform = replacedSelectWithOperand(SI, &I, Op1 ? 2 : 1);
+  }
+  if (Transform) {
+    if (!Op1)
+      Op1 = Builder.CreateICmp(Pred, SI->getOperand(1), RHS, I.getName());
+    if (!Op2)
+      Op2 = Builder.CreateICmp(Pred, SI->getOperand(2), RHS, I.getName());
+    return SelectInst::Create(SI->getOperand(0), Op1, Op2);
+  }
+
+  return nullptr;
+}
+
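// ---------------------------------------------------------------------------
// A standalone model, not part of the patch, of what foldSelectICmp achieves
// when both select arms fold against the compare operand: the icmp collapses
// onto the select condition. Scalar analogue of
//   icmp ult (select C, 10, 20), 15  -->  C
// (Before/After are illustrative names):
#include <cassert>

static bool Before(bool C) { return (C ? 10 : 20) < 15; }
static bool After(bool C) { return C; }

int main() {
  assert(Before(false) == After(false));
  assert(Before(true) == After(true));
  return 0;
}
// ---------------------------------------------------------------------------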
 /// Some comparisons can be simplified.
 /// In this case, we are looking for comparisons that look like
 /// a check for a lossy truncation.
@@ -3756,7 +3844,7 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
 
   // Can we fold (XShAmt+YShAmt) ?
   auto *NewShAmt = dyn_cast_or_null<Constant>(
-      SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
+      simplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
                       /*isNUW=*/false, SQ.getWithInstruction(&I)));
   if (!NewShAmt)
     return nullptr;
@@ -3956,6 +4044,24 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
       (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
     return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
 
+  {
+    // (Op1 + X) + C u</u>= Op1 --> ~C - X u</u>= Op1
+    Constant *C;
+    if (match(Op0, m_OneUse(m_Add(m_c_Add(m_Specific(Op1), m_Value(X)),
+                                  m_ImmConstant(C)))) &&
+        (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
+      Constant *C2 = ConstantExpr::getNot(C);
+      return new ICmpInst(Pred, Builder.CreateSub(C2, X), Op1);
+    }
+    // Op0 u>/u<= (Op0 + X) + C --> Op0 u>/u<= ~C - X
+    if (match(Op1, m_OneUse(m_Add(m_c_Add(m_Specific(Op0), m_Value(X)),
+                                  m_ImmConstant(C)))) &&
+        (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) {
+      Constant *C2 = ConstantExpr::getNot(C);
+      return new ICmpInst(Pred, Op0, Builder.CreateSub(C2, X));
+    }
+  }
+
   {
     // Similar to above: an unsigned overflow comparison may use offset + mask:
     // ((Op1 + C) & C) u<  Op1 --> Op1 != 0
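// ---------------------------------------------------------------------------
// A standalone exhaustive check, not part of the patch, of the new offset
// rewrite above at 8 bits with C fixed to 5: (Op1 + X) + C u< Op1 must agree
// with (~C - X) u< Op1 for all inputs (everything wraps mod 256, matching
// LLVM's unsigned i8 semantics).
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C = 5;
  for (unsigned Op1 = 0; Op1 <= 255; ++Op1)
    for (unsigned X = 0; X <= 255; ++X) {
      uint8_t Lhs = (uint8_t)((uint8_t)(Op1 + X) + C);
      uint8_t Rhs = (uint8_t)((uint8_t)~C - (uint8_t)X);
      assert((Lhs < (uint8_t)Op1) == (Rhs < (uint8_t)Op1));
    }
  return 0;
}
// ---------------------------------------------------------------------------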
@@ -4114,29 +4220,38 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
   // icmp (A + C1), (C + C2) -> icmp A, (C + C3)
   // s.t. C3 = C2 - C1
   if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
-      (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
-    if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
-      if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
-        const APInt &AP1 = C1->getValue();
-        const APInt &AP2 = C2->getValue();
-        if (AP1.isNegative() == AP2.isNegative()) {
-          APInt AP1Abs = C1->getValue().abs();
-          APInt AP2Abs = C2->getValue().abs();
-          if (AP1Abs.uge(AP2Abs)) {
-            ConstantInt *C3 = Builder.getInt(AP1 - AP2);
-            bool HasNUW = BO0->hasNoUnsignedWrap() && C3->getValue().ule(AP1);
-            bool HasNSW = BO0->hasNoSignedWrap();
-            Value *NewAdd = Builder.CreateAdd(A, C3, "", HasNUW, HasNSW);
-            return new ICmpInst(Pred, NewAdd, C);
-          } else {
-            ConstantInt *C3 = Builder.getInt(AP2 - AP1);
-            bool HasNUW = BO1->hasNoUnsignedWrap() && C3->getValue().ule(AP2);
-            bool HasNSW = BO1->hasNoSignedWrap();
-            Value *NewAdd = Builder.CreateAdd(C, C3, "", HasNUW, HasNSW);
-            return new ICmpInst(Pred, A, NewAdd);
-          }
-        }
+      (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned()) {
+    const APInt *AP1, *AP2;
+    // TODO: Support non-uniform vectors.
+    // TODO: Allow undef passthrough if B AND D's element is undef.
+    if (match(B, m_APIntAllowUndef(AP1)) && match(D, m_APIntAllowUndef(AP2)) &&
+        AP1->isNegative() == AP2->isNegative()) {
+      APInt AP1Abs = AP1->abs();
+      APInt AP2Abs = AP2->abs();
+      if (AP1Abs.uge(AP2Abs)) {
+        APInt Diff = *AP1 - *AP2;
+        bool HasNUW = BO0->hasNoUnsignedWrap() && Diff.ule(*AP1);
+        bool HasNSW = BO0->hasNoSignedWrap();
+        Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff);
+        Value *NewAdd = Builder.CreateAdd(A, C3, "", HasNUW, HasNSW);
+        return new ICmpInst(Pred, NewAdd, C);
+      } else {
+        APInt Diff = *AP2 - *AP1;
+        bool HasNUW = BO1->hasNoUnsignedWrap() && Diff.ule(*AP2);
+        bool HasNSW = BO1->hasNoSignedWrap();
+        Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff);
+        Value *NewAdd = Builder.CreateAdd(C, C3, "", HasNUW, HasNSW);
+        return new ICmpInst(Pred, A, NewAdd);
       }
+    }
+    Constant *Cst1, *Cst2;
+    if (match(B, m_ImmConstant(Cst1)) && match(D, m_ImmConstant(Cst2)) &&
+        ICmpInst::isEquality(Pred)) {
+      Constant *Diff = ConstantExpr::getSub(Cst2, Cst1);
+      Value *NewAdd = Builder.CreateAdd(C, Diff);
+      return new ICmpInst(Pred, A, NewAdd);
+    }
+  }
 
   // Analyze the case when either Op0 or Op1 is a sub instruction.
   // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
@@ -4524,18 +4639,21 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
 
   // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
   // For lshr and ashr pairs.
-  if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
-       match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
-      (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
-       match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
-    unsigned TypeBits = Cst1->getBitWidth();
-    unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+  const APInt *AP1, *AP2;
+  if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_APIntAllowUndef(AP1)))) &&
+       match(Op1, m_OneUse(m_LShr(m_Value(B), m_APIntAllowUndef(AP2))))) ||
+      (match(Op0, m_OneUse(m_AShr(m_Value(A), m_APIntAllowUndef(AP1)))) &&
+       match(Op1, m_OneUse(m_AShr(m_Value(B), m_APIntAllowUndef(AP2)))))) {
+    if (AP1 != AP2)
+      return nullptr;
+    unsigned TypeBits = AP1->getBitWidth();
+    unsigned ShAmt = AP1->getLimitedValue(TypeBits);
     if (ShAmt < TypeBits && ShAmt != 0) {
       ICmpInst::Predicate NewPred =
          Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
       Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
       APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
-      return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
+      return new ICmpInst(NewPred, Xor, ConstantInt::get(A->getType(), CmpVal));
     }
   }
 
@@ -4665,8 +4783,7 @@ static Instruction *foldICmpWithTrunc(ICmpInst &ICmp,
   return nullptr;
 }
 
-static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
-                                           InstCombiner::BuilderTy &Builder) {
+Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) {
   assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0");
   auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0));
   Value *X;
@@ -4675,25 +4792,37 @@ static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
   bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt;
   bool IsSignedCmp = ICmp.isSigned();
 
-  if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) {
-    // If the signedness of the two casts doesn't agree (i.e. one is a sext
-    // and the other is a zext), then we can't handle this.
-    // TODO: This is too strict. We can handle some predicates (equality?).
-    if (CastOp0->getOpcode() != CastOp1->getOpcode())
-      return nullptr;
+
+  // icmp Pred (ext X), (ext Y)
+  Value *Y;
+  if (match(ICmp.getOperand(1), m_ZExtOrSExt(m_Value(Y)))) {
+    bool IsZext0 = isa<ZExtInst>(ICmp.getOperand(0));
+    bool IsZext1 = isa<ZExtInst>(ICmp.getOperand(1));
+
+    // If we have mismatched casts, treat the zext of a non-negative source as
+    // a sext to simulate matching casts. Otherwise, we are done.
+    // TODO: Can we handle some predicates (equality) without non-negative?
+    if (IsZext0 != IsZext1) {
+      if ((IsZext0 && isKnownNonNegative(X, DL, 0, &AC, &ICmp, &DT)) ||
+          (IsZext1 && isKnownNonNegative(Y, DL, 0, &AC, &ICmp, &DT)))
+        IsSignedExt = true;
+      else
+        return nullptr;
+    }
 
     // Not an extension from the same type?
-    Value *Y = CastOp1->getOperand(0);
     Type *XTy = X->getType(), *YTy = Y->getType();
     if (XTy != YTy) {
       // One of the casts must have one use because we are creating a new cast.
-      if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse())
+      if (!ICmp.getOperand(0)->hasOneUse() && !ICmp.getOperand(1)->hasOneUse())
         return nullptr;
 
       // Extend the narrower operand to the type of the wider operand.
+      CastInst::CastOps CastOpcode =
+          IsSignedExt ? Instruction::SExt : Instruction::ZExt;
       if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits())
-        X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy);
+        X = Builder.CreateCast(CastOpcode, X, YTy);
       else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits())
-        Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy);
+        Y = Builder.CreateCast(CastOpcode, Y, XTy);
       else
         return nullptr;
     }
@@ -4742,7 +4871,7 @@ static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
   // or could not be determined to be equal (in the case of a constant
   // expression), so the constant cannot be represented in the shorter type.
   // All the cases that fold to true or false will have already been handled
-  // by SimplifyICmpInst, so only deal with the tricky case.
+  // by simplifyICmpInst, so only deal with the tricky case.
   if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C))
     return nullptr;
 
@@ -4811,7 +4940,7 @@ Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
   if (Instruction *R = foldICmpWithTrunc(ICmp, Builder))
     return R;
 
-  return foldICmpWithZextOrSext(ICmp, Builder);
+  return foldICmpWithZextOrSext(ICmp);
 }
 
 static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
@@ -5449,35 +5578,23 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
           LHS = Op0;
 
         Value *X;
-        if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
-          APInt ValToCheck = Op0KnownZeroInverted;
+        const APInt *C1;
+        if (match(LHS, m_Shl(m_Power2(C1), m_Value(X)))) {
           Type *XTy = X->getType();
-          if (ValToCheck.isPowerOf2()) {
-            // ((1 << X) & 8) == 0 -> X != 3
-            // ((1 << X) & 8) != 0 -> X == 3
-            auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
-            auto NewPred = ICmpInst::getInversePredicate(Pred);
-            return new ICmpInst(NewPred, X, CmpC);
-          } else if ((++ValToCheck).isPowerOf2()) {
-            // ((1 << X) & 7) == 0 -> X >= 3
-            // ((1 << X) & 7) != 0 -> X < 3
-            auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+          unsigned Log2C1 = C1->countTrailingZeros();
+          APInt C2 = Op0KnownZeroInverted;
+          APInt C2Pow2 = (C2 & ~(*C1 - 1)) + *C1;
+          if (C2Pow2.isPowerOf2()) {
+            // iff (C1 is pow2) & ((C2 & ~(C1-1)) + C1) is pow2):
+            // ((C1 << X) & C2) == 0 -> X >= (Log2(C2+C1) - Log2(C1))
+            // ((C1 << X) & C2) != 0 -> X < (Log2(C2+C1) - Log2(C1))
+            unsigned Log2C2 = C2Pow2.countTrailingZeros();
+            auto *CmpC = ConstantInt::get(XTy, Log2C2 - Log2C1);
             auto NewPred =
                 Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
             return new ICmpInst(NewPred, X, CmpC);
           }
         }
-
-        // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
-        const APInt *CI;
-        if (Op0KnownZeroInverted.isOne() &&
-            match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
-          // ((8 >>u X) & 1) == 0 -> X != 3
-          // ((8 >>u X) & 1) != 0 -> X == 3
-          unsigned CmpVal = CI->countTrailingZeros();
-          auto NewPred = ICmpInst::getInversePredicate(Pred);
-          return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
-        }
       }
       break;
     }
@@ -5557,6 +5674,28 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
   return nullptr;
 }
 
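// ---------------------------------------------------------------------------
// A standalone check, not part of the patch, of the generalized shl-mask fold
// above with C1 = 2 and C2 = 14: C1 is a power of 2 and
// (C2 & ~(C1 - 1)) + C1 = 16 is a power of 2, so ((C1 << X) & C2) == 0 must
// agree with X >= log2(16) - log2(2) = 3.
#include <cassert>

int main() {
  const unsigned C1 = 2, C2 = 14;
  for (unsigned X = 0; X < 8; ++X)
    assert((((C1 << X) & C2) == 0) == (X >= 3));
  return 0;
}
// ---------------------------------------------------------------------------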
+/// If one operand of an icmp is effectively a bool (value range of {0,1}),
+/// then try to reduce patterns based on that limit.
+static Instruction *foldICmpUsingBoolRange(ICmpInst &I,
+                                           InstCombiner::BuilderTy &Builder) {
+  Value *X, *Y;
+  ICmpInst::Predicate Pred;
+
+  // X must be 0 and bool must be true for "ULT":
+  // X <u (zext i1 Y) --> (X == 0) & Y
+  if (match(&I, m_c_ICmp(Pred, m_Value(X), m_OneUse(m_ZExt(m_Value(Y))))) &&
+      Y->getType()->isIntOrIntVectorTy(1) && Pred == ICmpInst::ICMP_ULT)
+    return BinaryOperator::CreateAnd(Builder.CreateIsNull(X), Y);
+
+  // X must be 0 or bool must be true for "ULE":
+  // X <=u (sext i1 Y) --> (X == 0) | Y
+  if (match(&I, m_c_ICmp(Pred, m_Value(X), m_OneUse(m_SExt(m_Value(Y))))) &&
+      Y->getType()->isIntOrIntVectorTy(1) && Pred == ICmpInst::ICMP_ULE)
+    return BinaryOperator::CreateOr(Builder.CreateIsNull(X), Y);
+
+  return nullptr;
+}
+
 llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
 InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
                                                        Constant *C) {
@@ -5948,7 +6087,7 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
     Changed = true;
   }
 
-  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
+  if (Value *V = simplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
     return replaceInstUsesWith(I, V);
 
   // Comparing -val or val with non-zero is the same as just comparing val
@@ -5984,6 +6123,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
   if (Instruction *Res = foldICmpWithDominatingICmp(I))
     return Res;
 
+  if (Instruction *Res = foldICmpUsingBoolRange(I, Builder))
+    return Res;
+
   if (Instruction *Res = foldICmpUsingKnownBits(I))
     return Res;
 
@@ -6057,14 +6199,21 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
       if (Instruction *NI = foldGEPICmp(GEP, Op0, I.getSwappedPredicate(), I))
         return NI;
 
+  if (auto *SI = dyn_cast<SelectInst>(Op0))
+    if (Instruction *NI = foldSelectICmp(I.getPredicate(), SI, Op1, I))
+      return NI;
+  if (auto *SI = dyn_cast<SelectInst>(Op1))
+    if (Instruction *NI = foldSelectICmp(I.getSwappedPredicate(), SI, Op0, I))
+      return NI;
+
   // Try to optimize equality comparisons against alloca-based pointers.
   if (Op0->getType()->isPointerTy() && I.isEquality()) {
     assert(Op1->getType()->isPointerTy() &&
            "Comparing pointer with non-pointer?");
     if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op0)))
-      if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
+      if (Instruction *New = foldAllocaCmp(I, Alloca))
        return New;
     if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op1)))
-      if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
+      if (Instruction *New = foldAllocaCmp(I, Alloca))
        return New;
   }
 
@@ -6529,6 +6678,25 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
   }
 }
 
+static Instruction *foldFCmpFNegCommonOp(FCmpInst &I) {
+  CmpInst::Predicate Pred = I.getPredicate();
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Canonicalize fneg as Op1.
+  if (match(Op0, m_FNeg(m_Value())) && !match(Op1, m_FNeg(m_Value()))) {
+    std::swap(Op0, Op1);
+    Pred = I.getSwappedPredicate();
+  }
+
+  if (!match(Op1, m_FNeg(m_Specific(Op0))))
+    return nullptr;
+
+  // Replace the negated operand with 0.0:
+  // fcmp Pred Op0, -Op0 --> fcmp Pred Op0, 0.0
+  Constant *Zero = ConstantFP::getNullValue(Op0->getType());
+  return new FCmpInst(Pred, Op0, Zero, "", &I);
+}
+
 Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
   bool Changed = false;
 
@@ -6542,7 +6710,7 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
   const CmpInst::Predicate Pred = I.getPredicate();
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
-  if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
+  if (Value *V = simplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
                                   SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
 
@@ -6587,6 +6755,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
   if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
     return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
 
+  if (Instruction *R = foldFCmpFNegCommonOp(I))
+    return R;
+
   // Test if the FCmpInst instruction is used exclusively by a select as
   // part of a minimum or maximum operation. If so, refrain from doing
   // any other folding. This helps out other analyses which understand
@@ -6632,10 +6803,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
       case Instruction::Load:
         if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
           if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
-                !cast<LoadInst>(LHSI)->isVolatile())
-              if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
-                return Res;
+            if (Instruction *Res = foldCmpLoadFromIndexedGlobal(
+                    cast<LoadInst>(LHSI), GEP, GV, I))
+              return Res;
         break;
       }
   }
@@ -6657,7 +6827,6 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
     if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
       return new FCmpInst(Pred, X, Y, "", &I);
 
-    // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
     const APFloat *C;
     if (match(Op1, m_APFloat(C))) {
       const fltSemantics &FPSem =
@@ -6666,6 +6835,31 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
       APFloat TruncC = *C;
       TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
 
+      if (Lossy) {
+        // X can't possibly equal the higher-precision constant, so reduce any
+        // equality comparison.
+        // TODO: Other predicates can be handled via getFCmpCode().
+        switch (Pred) {
+        case FCmpInst::FCMP_OEQ:
+          // X is ordered and equal to an impossible constant --> false
+          return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+        case FCmpInst::FCMP_ONE:
+          // X is ordered and not equal to an impossible constant --> ordered
+          return new FCmpInst(FCmpInst::FCMP_ORD, X,
+                              ConstantFP::getNullValue(X->getType()));
+        case FCmpInst::FCMP_UEQ:
+          // X is unordered or equal to an impossible constant --> unordered
+          return new FCmpInst(FCmpInst::FCMP_UNO, X,
+                              ConstantFP::getNullValue(X->getType()));
+        case FCmpInst::FCMP_UNE:
+          // X is unordered or not equal to an impossible constant --> true
+          return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+        default:
+          break;
+        }
+      }
+
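// ---------------------------------------------------------------------------
// A standalone illustration, not part of the patch, of the lossy-constant
// reasoning above: a double constant that does not survive truncation to
// float cannot equal any float extended to double, so the OEQ form folds to
// false. This only spot-checks the nearest float candidate.
#include <cassert>

int main() {
  const double C = 0.1;          // lossy when truncated to float
  assert((double)(float)C != C); // confirms the conversion is lossy
  float Nearest = (float)C;      // closest float to C
  assert(!((double)Nearest == C));
  return 0;
}
// ---------------------------------------------------------------------------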
APFloat Fabs = TruncC; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 7743b4c41555..271154bb3f5a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -71,7 +71,7 @@ public: : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE, BFI, PSI, DL, LI) {} - virtual ~InstCombinerImpl() {} + virtual ~InstCombinerImpl() = default; /// Run the combiner over the entire worklist until it is empty. /// @@ -172,7 +172,8 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); Instruction *visitVAEndInst(VAEndInst &I); Value *pushFreezeToPreventPoisonFromPropagating(FreezeInst &FI); - bool freezeDominatedUses(FreezeInst &FI); + bool freezeOtherUses(FreezeInst &FI); + Instruction *foldFreezeIntoRecurrence(FreezeInst &I, PHINode *PN); Instruction *visitFreeze(FreezeInst &I); /// Specify what to return for unhandled instructions. @@ -192,7 +193,7 @@ public: const Twine &Suffix = ""); private: - void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI); + bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI); bool isDesirableIntType(unsigned BitWidth) const; bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; @@ -325,7 +326,7 @@ private: Instruction *narrowMathIfNoOverflow(BinaryOperator &I); Instruction *narrowFunnelShift(TruncInst &Trunc); Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); - Instruction *matchSAddSubSat(Instruction &MinMax1); + Instruction *matchSAddSubSat(IntrinsicInst &MinMax1); Instruction *foldNot(BinaryOperator &I); void freelyInvertAllUsersOf(Value *V); @@ -344,16 +345,20 @@ private: const CastInst *CI2); Value *simplifyIntToPtrRoundTripCast(Value *Val); - Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And); - Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or); + Value *foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &I, + bool IsAnd, bool IsLogical = false); Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor); Value *foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd); + Value *foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, ICmpInst *ICmp2, + bool IsAnd); + /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp). /// NOTE: Unlike most of instcombine, this returns a Value which should /// already be inserted into the function. - Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd); + Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd, + bool IsLogicalSelect = false); Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI, bool IsAnd, @@ -407,7 +412,7 @@ public: // If we are replacing the instruction with itself, this must be in a // segment of unreachable code, so just clobber the instruction. if (&I == V) - V = UndefValue::get(I.getType()); + V = PoisonValue::get(I.getType()); LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n" << " with " << *V << '\n'); @@ -435,7 +440,7 @@ public: void CreateNonTerminatorUnreachable(Instruction *InsertAt) { auto &Ctx = InsertAt->getContext(); new StoreInst(ConstantInt::getTrue(Ctx), - UndefValue::get(Type::getInt1PtrTy(Ctx)), + PoisonValue::get(Type::getInt1PtrTy(Ctx)), InsertAt); } @@ -621,7 +626,8 @@ public: /// other operand, try to fold the binary operator into the select arguments. 
/// This also works for Cast instructions, which obviously do not have a /// second operand. - Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); + Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI, + bool FoldWithMultiUse = false); /// This is a convenience wrapper function for the above two functions. Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I); @@ -650,22 +656,27 @@ public: Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); - Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca, - const Value *Other); - Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, + Instruction *foldSelectICmp(ICmpInst::Predicate Pred, SelectInst *SI, + Value *RHS, const ICmpInst &I); + Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca); + Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI, + GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst = nullptr); Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, Constant *RHSC); Instruction *foldICmpAddOpConst(Value *X, const APInt &C, ICmpInst::Predicate Pred); - Instruction *foldICmpWithCastOp(ICmpInst &ICI); + Instruction *foldICmpWithCastOp(ICmpInst &ICmp); + Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp); Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); + Instruction *foldICmpInstWithConstantAllowUndef(ICmpInst &Cmp, + const APInt &C); Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpEquality(ICmpInst &Cmp); Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); @@ -674,6 +685,8 @@ public: Value *foldMultiplicationOverflowCheck(ICmpInst &Cmp); + Instruction *foldICmpBinOpWithConstant(ICmpInst &Cmp, BinaryOperator *BO, + const APInt &C); Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select, ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 756792918dba..e03b7026f802 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -16,15 +16,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" -#include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -775,7 +772,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize, uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType()); // Make sure that, even if the multiplication below would wrap as an // uint64_t, we still do the right thing. 
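// For example (hypothetical): an alloca with element count 2^61 and an 8-byte element // type would make the 64-bit product wrap to 0 and falsely pass the size check; // doing the multiply in 128 bits keeps the ugt(MaxSize) comparison sound.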
- if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize)) + if ((CS->getValue().zext(128) * APInt(128, TypeSize)).ugt(MaxSize)) return false; continue; } @@ -1395,8 +1392,10 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), - SI.getOperand(1))) { + if (PrevSI->isUnordered() && + equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1)) && + PrevSI->getValueOperand()->getType() == + SI.getValueOperand()->getType()) { ++NumDeadStore; // Manually add back the original store to the worklist now, so it will // be processed after the operands of the removed store, as this may @@ -1436,6 +1435,8 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { } // store undef, Ptr -> noop + // FIXME: This is technically incorrect because it might overwrite a poison + // value. Change to PoisonValue once #52930 is resolved. if (isa<UndefValue>(Val)) return eraseInstFromFunction(SI); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 1aa10b550fc4..2a34edbf6cb8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" -#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -30,13 +29,9 @@ #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include -#include -#include -#include #define DEBUG_TYPE "instcombine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -145,7 +140,7 @@ static Value *foldMulSelectToNegate(BinaryOperator &I, } Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { - if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -297,15 +292,24 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem : Instruction::SRem; - Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1); + // X must be frozen because we are increasing its number of uses. + Value *XFreeze = Builder.CreateFreeze(X, X->getName() + ".fr"); + Value *Rem = Builder.CreateBinOp(RemOpc, XFreeze, DivOp1); if (DivOp1 == Y) - return BinaryOperator::CreateSub(X, Rem); - return BinaryOperator::CreateSub(Rem, X); + return BinaryOperator::CreateSub(XFreeze, Rem); + return BinaryOperator::CreateSub(Rem, XFreeze); } } - /// i1 mul -> i1 and. - if (I.getType()->isIntOrIntVectorTy(1)) + // Fold the following two scenarios: + // 1) i1 mul -> i1 and. + // 2) X * Y --> X & Y, iff X, Y can be only {0,1}. 
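+ // For instance (hypothetical IR, not part of this change): with + // %p = and i32 %x, 1 and %q = and i32 %y, 1, mul i32 %p, %q folds to + // and i32 %p, %q, since a product of values in {0,1} is their logical AND.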
+ // Note: We could use known bits to generalize this and related patterns with + // shifts/truncs + Type *Ty = I.getType(); + if (Ty->isIntOrIntVectorTy(1) || + (match(Op0, m_And(m_Value(), m_One())) && + match(Op1, m_And(m_Value(), m_One())))) return BinaryOperator::CreateAnd(Op0, Op1); // X*(1 << Y) --> X << Y @@ -338,7 +342,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() && (Op0->hasOneUse() || Op1->hasOneUse() || X == Y)) { Value *And = Builder.CreateAnd(X, Y, "mulbool"); - return CastInst::Create(Instruction::ZExt, And, I.getType()); + return CastInst::Create(Instruction::ZExt, And, Ty); } // (sext bool X) * (zext bool Y) --> sext (and X, Y) // (zext bool X) * (sext bool Y) --> sext (and X, Y) @@ -348,42 +352,56 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() && (Op0->hasOneUse() || Op1->hasOneUse())) { Value *And = Builder.CreateAnd(X, Y, "mulbool"); - return CastInst::Create(Instruction::SExt, And, I.getType()); + return CastInst::Create(Instruction::SExt, And, Ty); } // (zext bool X) * Y --> X ? Y : 0 // Y * (zext bool X) --> X ? Y : 0 if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0)); + return SelectInst::Create(X, Op1, ConstantInt::getNullValue(Ty)); if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0)); + return SelectInst::Create(X, Op0, ConstantInt::getNullValue(Ty)); - // (sext bool X) * C --> X ? -C : 0 Constant *ImmC; - if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1) && - match(Op1, m_ImmConstant(ImmC))) { - Constant *NegC = ConstantExpr::getNeg(ImmC); - return SelectInst::Create(X, NegC, ConstantInt::getNullValue(I.getType())); + if (match(Op1, m_ImmConstant(ImmC))) { + // (sext bool X) * C --> X ? -C : 0 + if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + Constant *NegC = ConstantExpr::getNeg(ImmC); + return SelectInst::Create(X, NegC, ConstantInt::getNullValue(Ty)); + } + + // (ashr i32 X, 31) * C --> (X < 0) ? -C : 0 + const APInt *C; + if (match(Op0, m_OneUse(m_AShr(m_Value(X), m_APInt(C)))) && + *C == C->getBitWidth() - 1) { + Constant *NegC = ConstantExpr::getNeg(ImmC); + Value *IsNeg = Builder.CreateIsNeg(X, "isneg"); + return SelectInst::Create(IsNeg, NegC, ConstantInt::getNullValue(Ty)); + } } - // (lshr X, 31) * Y --> (ashr X, 31) & Y - // Y * (lshr X, 31) --> (ashr X, 31) & Y + // (lshr X, 31) * Y --> (X < 0) ? Y : 0 // TODO: We are not checking one-use because the elimination of the multiply // is better for analysis? - // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be - // more similar to what we're doing above. const APInt *C; - if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1) - return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1); - if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1) - return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0); + if (match(&I, m_c_BinOp(m_LShr(m_Value(X), m_APInt(C)), m_Value(Y))) && + *C == C->getBitWidth() - 1) { + Value *IsNeg = Builder.CreateIsNeg(X, "isneg"); + return SelectInst::Create(IsNeg, Y, ConstantInt::getNullValue(Ty)); + } + + // (and X, 1) * Y --> (trunc X) ? 
Y : 0 + if (match(&I, m_c_BinOp(m_OneUse(m_And(m_Value(X), m_One())), m_Value(Y)))) { + Value *Tr = Builder.CreateTrunc(X, CmpInst::makeCmpResultType(Ty)); + return SelectInst::Create(Tr, Y, ConstantInt::getNullValue(Ty)); + } // ((ashr X, 31) | 1) * X --> abs(X) // X * ((ashr X, 31) | 1) --> abs(X) if (match(&I, m_c_BinOp(m_Or(m_AShr(m_Value(X), - m_SpecificIntAllowUndef(BitWidth - 1)), - m_One()), - m_Deferred(X)))) { + m_SpecificIntAllowUndef(BitWidth - 1)), + m_One()), + m_Deferred(X)))) { Value *Abs = Builder.CreateBinaryIntrinsic( Intrinsic::abs, X, ConstantInt::getBool(I.getContext(), I.hasNoSignedWrap())); @@ -442,7 +460,7 @@ Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) { } Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { - if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyFMulInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -532,9 +550,8 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { // sqrt(X) * sqrt(Y) -> sqrt(X * Y) // nnan disallows the possibility of returning a number if both operands are // negative (in that case, we should return NaN). - if (I.hasNoNaNs() && - match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) && - match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) { + if (I.hasNoNaNs() && match(Op0, m_OneUse(m_Sqrt(m_Value(X)))) && + match(Op1, m_OneUse(m_Sqrt(m_Value(Y))))) { Value *XY = Builder.CreateFMulFMF(X, Y, &I); Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I); return replaceInstUsesWith(I, Sqrt); @@ -548,11 +565,11 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { // has the necessary (reassoc) fast-math-flags. if (I.hasNoSignedZeros() && match(Op0, (m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) && - match(Y, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) && Op1 == X) + match(Y, m_Sqrt(m_Value(X))) && Op1 == X) return BinaryOperator::CreateFDivFMF(X, Y, &I); if (I.hasNoSignedZeros() && match(Op1, (m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) && - match(Y, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) && Op0 == X) + match(Y, m_Sqrt(m_Value(X))) && Op0 == X) return BinaryOperator::CreateFDivFMF(X, Y, &I); // Like the similar transform in instsimplify, this requires 'nsz' because @@ -561,14 +578,12 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { Op0->hasNUses(2)) { // Peek through fdiv to find squaring of square root: // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y - if (match(Op0, m_FDiv(m_Value(X), - m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) { + if (match(Op0, m_FDiv(m_Value(X), m_Sqrt(m_Value(Y))))) { Value *XX = Builder.CreateFMulFMF(X, X, &I); return BinaryOperator::CreateFDivFMF(XX, Y, &I); } // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X) - if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)), - m_Value(X)))) { + if (match(Op0, m_FDiv(m_Sqrt(m_Value(Y)), m_Value(X)))) { Value *XX = Builder.CreateFMulFMF(X, X, &I); return BinaryOperator::CreateFDivFMF(Y, XX, &I); } @@ -777,7 +792,8 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. 
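// For example (hypothetical): udiv i32 12, (select i1 %c, i32 4, i32 6) // becomes select i1 %c, i32 3, i32 2, evaluating the udiv on both constant arms.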
if (match(Op0, m_ImmConstant()) && match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { - if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1), + /*FoldWithMultiUse*/ true)) return R; } @@ -853,12 +869,13 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { if (match(Op0, m_One())) { assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?"); if (IsSigned) { - // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the - // result is one, if Op1 is -1 then the result is minus one, otherwise - // it's zero. - Value *Inc = Builder.CreateAdd(Op1, Op0); + // 1 / 0 --> undef ; 1 / 1 --> 1 ; 1 / -1 --> -1 ; 1 / anything else --> 0 + // (Op1 + 1) u< 3 ? Op1 : 0 + // Op1 must be frozen because we are increasing its number of uses. + Value *F1 = Builder.CreateFreeze(Op1, Op1->getName() + ".fr"); + Value *Inc = Builder.CreateAdd(F1, Op0); Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3)); - return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0)); + return SelectInst::Create(Cmp, F1, ConstantInt::get(Ty, 0)); } else { // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the // result is one, otherwise it's zero. @@ -900,113 +917,69 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { static const unsigned MaxDepth = 6; -namespace { - -using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1, - const BinaryOperator &I, - InstCombinerImpl &IC); - -/// Used to maintain state for visitUDivOperand(). -struct UDivFoldAction { - /// Informs visitUDiv() how to fold this operand. This can be zero if this - /// action joins two actions together. - FoldUDivOperandCb FoldAction; - - /// Which operand to fold. - Value *OperandToFold; - - union { - /// The instruction returned when FoldAction is invoked. - Instruction *FoldResult; - - /// Stores the LHS action index if this action joins two actions together. - size_t SelectLHSIdx; +// Take the exact integer log2 of the value. If DoFold is true, create the +// actual instructions, otherwise return a non-null dummy value. Return nullptr +// on failure. 
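+// For example (hypothetical): takeLog2 of (shl i32 4, %n) yields (add i32 2, %n), +// and takeLog2 of (select i1 %c, i32 8, i32 16) yields (select i1 %c, i32 3, i32 4).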
+static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth, + bool DoFold) { + auto IfFold = [DoFold](function_ref<Value *()> Fn) { + if (!DoFold) + return reinterpret_cast<Value *>(-1); + return Fn(); }; - UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand) : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {} - UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS) : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {} -}; - -} // end anonymous namespace - -// X udiv 2^C -> X >> C -static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1, - const BinaryOperator &I, - InstCombinerImpl &IC) { - Constant *C1 = ConstantExpr::getExactLogBase2(cast<Constant>(Op1)); - if (!C1) - llvm_unreachable("Failed to constant fold udiv -> logbase2"); - BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1); - if (I.isExact()) - LShr->setIsExact(); - return LShr; -} - -// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) -// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2) -static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I, - InstCombinerImpl &IC) { - Value *ShiftLeft; - if (!match(Op1, m_ZExt(m_Value(ShiftLeft)))) - ShiftLeft = Op1; - - Constant *CI; - Value *N; - if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N)))) - llvm_unreachable("match should never fail here!"); - Constant *Log2Base = ConstantExpr::getExactLogBase2(CI); - if (!Log2Base) - llvm_unreachable("getLogBase2 should never fail here!"); - N = IC.Builder.CreateAdd(N, Log2Base); - if (Op1 != ShiftLeft) - N = IC.Builder.CreateZExt(N, Op1->getType()); - BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N); - if (I.isExact()) - LShr->setIsExact(); - return LShr; -} - -// Recursively visits the possible right hand operands of a udiv -// instruction, seeing through select instructions, to determine if we can -// replace the udiv with something simpler. If we find that an operand is not -// able to simplify the udiv, we abort the entire transformation. -static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, - SmallVectorImpl<UDivFoldAction> &Actions, - unsigned Depth = 0) { // FIXME: assert that Op1 isn't/doesn't contain undef. - // Check to see if this is an unsigned division with an exact power of 2, - // if so, convert to a right shift. - if (match(Op1, m_Power2())) { - Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1)); - return Actions.size(); - } - - // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) - if (match(Op1, m_Shl(m_Power2(), m_Value())) || - match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) { - Actions.push_back(UDivFoldAction(foldUDivShl, Op1)); - return Actions.size(); - } + // log2(2^C) -> C + if (match(Op, m_Power2())) + return IfFold([&]() { + Constant *C = ConstantExpr::getExactLogBase2(cast<Constant>(Op)); + if (!C) + llvm_unreachable("Failed to constant fold udiv -> logbase2"); + return C; + }); // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ == MaxDepth) - return 0; - - if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) - // FIXME: missed optimization: if one of the hands of select is/contains - // undef, just directly pick the other one. - // FIXME: can both hands contain undef? - if (size_t LHSIdx = - visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth)) - if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) { - Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1)); - return Actions.size(); - } + return nullptr; + + // log2(zext X) -> zext log2(X) + // FIXME: Require one use? 
+ Value *X, *Y; + if (match(Op, m_ZExt(m_Value(X)))) + if (Value *LogX = takeLog2(Builder, X, Depth, DoFold)) + return IfFold([&]() { return Builder.CreateZExt(LogX, Op->getType()); }); + + // log2(X << Y) -> log2(X) + Y + // FIXME: Require one use unless X is 1? + if (match(Op, m_Shl(m_Value(X), m_Value(Y)))) + if (Value *LogX = takeLog2(Builder, X, Depth, DoFold)) + return IfFold([&]() { return Builder.CreateAdd(LogX, Y); }); + + // log2(Cond ? X : Y) -> Cond ? log2(X) : log2(Y) + // FIXME: missed optimization: if one of the hands of select is/contains + // undef, just directly pick the other one. + // FIXME: can both hands contain undef? + // FIXME: Require one use? + if (SelectInst *SI = dyn_cast<SelectInst>(Op)) + if (Value *LogX = takeLog2(Builder, SI->getOperand(1), Depth, DoFold)) + if (Value *LogY = takeLog2(Builder, SI->getOperand(2), Depth, DoFold)) + return IfFold([&]() { + return Builder.CreateSelect(SI->getOperand(0), LogX, LogY); + }); + + // log2(umin(X, Y)) -> umin(log2(X), log2(Y)) + // log2(umax(X, Y)) -> umax(log2(X), log2(Y)) + auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op); + if (MinMax && MinMax->hasOneUse() && !MinMax->isSigned()) + if (Value *LogX = takeLog2(Builder, MinMax->getLHS(), Depth, DoFold)) + if (Value *LogY = takeLog2(Builder, MinMax->getRHS(), Depth, DoFold)) + return IfFold([&]() { + return Builder.CreateBinaryIntrinsic( + MinMax->getIntrinsicID(), LogX, LogY); + }); - return 0; + return nullptr; } /// If we have zero-extended operands of an unsigned div or rem, we may be able @@ -1047,7 +1020,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I, } Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { - if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyUDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1106,42 +1079,18 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { return BinaryOperator::CreateUDiv(A, X); } - // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...)))) - SmallVector<UDivFoldAction, 6> UDivActions; - if (visitUDivOperand(Op0, Op1, I, UDivActions)) - for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) { - FoldUDivOperandCb Action = UDivActions[i].FoldAction; - Value *ActionOp1 = UDivActions[i].OperandToFold; - Instruction *Inst; - if (Action) - Inst = Action(Op0, ActionOp1, I, *this); - else { - // This action joins two actions together. The RHS of this action is - // simply the last action we processed, we saved the LHS action index in - // the joining action. - size_t SelectRHSIdx = i - 1; - Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult; - size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx; - Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult; - Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(), - SelectLHS, SelectRHS); - } - - // If this is the last action to process, return it to the InstCombiner. - // Otherwise, we insert it before the UDiv and record it so that we may - // use it as part of a joining action (i.e., a SelectInst). - if (e - i != 1) { - Inst->insertBefore(&I); - UDivActions[i].FoldResult = Inst; - } else - return Inst; - } + // Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away. 
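+ // For example (hypothetical IR): udiv i32 %x, (shl i32 2, %n) becomes + // lshr i32 %x, (add i32 1, %n), since log2(2 << %n) is 1 + %n.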
+ if (takeLog2(Builder, Op1, /*Depth*/0, /*DoFold*/false)) { + Value *Res = takeLog2(Builder, Op1, /*Depth*/0, /*DoFold*/true); + return replaceInstUsesWith( + I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact())); + } return nullptr; } Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) { - if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifySDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1223,9 +1172,9 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) { if (match(&I, m_c_BinOp( m_OneUse(m_Intrinsic<Intrinsic::abs>(m_Value(X), m_One())), m_Deferred(X)))) { - Constant *NegOne = ConstantInt::getAllOnesValue(Ty); - Value *Cond = Builder.CreateICmpSGT(X, NegOne); - return SelectInst::Create(Cond, ConstantInt::get(Ty, 1), NegOne); + Value *Cond = Builder.CreateIsNotNeg(X); + return SelectInst::Create(Cond, ConstantInt::get(Ty, 1), + ConstantInt::getAllOnesValue(Ty)); } // If the sign bits of both operands are zero (i.e. we can prove they are @@ -1242,8 +1191,10 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) { if (match(Op1, m_NegatedPower2())) { // X sdiv (-(1 << C)) -> -(X sdiv (1 << C)) -> // -> -(X udiv (1 << C)) -> -(X u>> C) - return BinaryOperator::CreateNeg(Builder.Insert(foldUDivPow2Cst( - Op0, ConstantExpr::getNeg(cast<Constant>(Op1)), I, *this))); + Constant *CNegLog2 = ConstantExpr::getExactLogBase2( + ConstantExpr::getNeg(cast<Constant>(Op1))); + Value *Shr = Builder.CreateLShr(Op0, CNegLog2, I.getName(), I.isExact()); + return BinaryOperator::CreateNeg(Shr); } if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { @@ -1368,7 +1319,9 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I, } Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { - if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), + Module *M = I.getModule(); + + if (Value *V = simplifyFDivInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1433,8 +1386,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); - if ((IsTan || IsCot) && - hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && hasFloatFn(M, &TLI, I.getType(), LibFunc_tan, + LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); @@ -1498,7 +1451,8 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. if (match(Op0, m_ImmConstant()) && match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { - if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1), + /*FoldWithMultiUse*/ true)) return R; } @@ -1530,7 +1484,7 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { } Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) { - if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyURemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1560,11 +1514,13 @@ Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) { return CastInst::CreateZExtOrBitCast(Cmp, Ty); } - // X urem C -> X < C ? X : X - C, where C >= signbit. + // Op0 urem C -> Op0 < C ? 
Op0 : Op0 - C, where C >= signbit. + // Op0 must be frozen because we are increasing its number of uses. if (match(Op1, m_Negative())) { - Value *Cmp = Builder.CreateICmpULT(Op0, Op1); - Value *Sub = Builder.CreateSub(Op0, Op1); - return SelectInst::Create(Cmp, Op0, Sub); + Value *F0 = Builder.CreateFreeze(Op0, Op0->getName() + ".fr"); + Value *Cmp = Builder.CreateICmpULT(F0, Op1); + Value *Sub = Builder.CreateSub(F0, Op1); + return SelectInst::Create(Cmp, F0, Sub); } // If the divisor is a sext of a boolean, then the divisor must be max @@ -1581,7 +1537,7 @@ Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) { } Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) { - if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifySRemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1653,7 +1609,7 @@ Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) { } Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) { - if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyFRemInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp index 42ba4a34a5a9..c573b03f31a6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -248,6 +248,20 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) { return nullptr; switch (I->getOpcode()) { + case Instruction::And: { + Constant *ShAmt; + // sub(y,and(lshr(x,C),1)) --> add(ashr(shl(x,(BW-1)-C),BW-1),y) + if (match(I, m_c_And(m_OneUse(m_TruncOrSelf( + m_LShr(m_Value(X), m_ImmConstant(ShAmt)))), + m_One()))) { + unsigned BW = X->getType()->getScalarSizeInBits(); + Constant *BWMinusOne = ConstantInt::get(X->getType(), BW - 1); + Value *R = Builder.CreateShl(X, Builder.CreateSub(BWMinusOne, ShAmt)); + R = Builder.CreateAShr(R, BWMinusOne); + return Builder.CreateTruncOrBitCast(R, I->getType()); + } + break; + } case Instruction::SDiv: // `sdiv` is negatible if divisor is not undef/INT_MIN/1. // While this is normally not behind a use-check, diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 09694d50468f..90a796a0939e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -511,7 +511,8 @@ Instruction *InstCombinerImpl::foldPHIArgGEPIntoPHI(PHINode &PN) { // Scan to see if all operands are the same opcode, and all have one user. for (Value *V : drop_begin(PN.incoming_values())) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V); - if (!GEP || !GEP->hasOneUser() || GEP->getType() != FirstInst->getType() || + if (!GEP || !GEP->hasOneUser() || + GEP->getSourceElementType() != FirstInst->getSourceElementType() || GEP->getNumOperands() != FirstInst->getNumOperands()) return nullptr; @@ -657,6 +658,10 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0)); + // Can't forward swifterror through a phi. + if (FirstLI->getOperand(0)->isSwiftError()) + return nullptr; + // FIXME: This is overconservative; this transform is allowed in some cases // for atomic operations. 
if (FirstLI->isAtomic()) @@ -693,6 +698,10 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { LI->getPointerAddressSpace() != LoadAddrSpace) return nullptr; + // Can't forward swifterror through a phi. + if (LI->getOperand(0)->isSwiftError()) + return nullptr; + // We can't sink the load if the loaded value could be modified between // the load and the PHI. if (LI->getParent() != InBB || !isSafeAndProfitableToSinkLoad(LI)) @@ -1112,6 +1121,13 @@ Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { return nullptr; } + // If the incoming value is a PHI node before a catchswitch, we cannot + // extract the value within that BB because we cannot insert any non-PHI + // instructions in the BB. + for (auto *Pred : PN->blocks()) + if (Pred->getFirstInsertionPt() == Pred->end()) + return nullptr; + for (User *U : PN->users()) { Instruction *UserI = cast<Instruction>(U); @@ -1260,12 +1276,12 @@ static Value *simplifyUsingControlFlow(InstCombiner &Self, PHINode &PN, // ... ... // \ / // phi [true] [false] - if (!PN.getType()->isIntegerTy(1)) - return nullptr; - - if (PN.getNumOperands() != 2) - return nullptr; - + // and + // switch (cond) + // case v1: / \ case v2: + // ... ... + // \ / + // phi [v1] [v2] // Make sure all inputs are constants. if (!all_of(PN.operands(), [](Value *V) { return isa<ConstantInt>(V); })) return nullptr; @@ -1275,50 +1291,77 @@ static Value *simplifyUsingControlFlow(InstCombiner &Self, PHINode &PN, if (!DT.isReachableFromEntry(BB)) return nullptr; - // Same inputs. - if (PN.getOperand(0) == PN.getOperand(1)) - return PN.getOperand(0); + // Determine which value the condition of the idom has for which successor. + LLVMContext &Context = PN.getContext(); + auto *IDom = DT.getNode(BB)->getIDom()->getBlock(); + Value *Cond; + SmallDenseMap<ConstantInt *, BasicBlock *, 8> SuccForValue; + SmallDenseMap<BasicBlock *, unsigned> SuccCount; + auto AddSucc = [&](ConstantInt *C, BasicBlock *Succ) { + SuccForValue[C] = Succ; + ++SuccCount[Succ]; + }; + if (auto *BI = dyn_cast<BranchInst>(IDom->getTerminator())) { + if (BI->isUnconditional()) + return nullptr; - BasicBlock *TruePred = nullptr, *FalsePred = nullptr; - for (auto *Pred : predecessors(BB)) { - auto *Input = cast<ConstantInt>(PN.getIncomingValueForBlock(Pred)); - if (Input->isAllOnesValue()) - TruePred = Pred; - else - FalsePred = Pred; + Cond = BI->getCondition(); + AddSucc(ConstantInt::getTrue(Context), BI->getSuccessor(0)); + AddSucc(ConstantInt::getFalse(Context), BI->getSuccessor(1)); + } else if (auto *SI = dyn_cast<SwitchInst>(IDom->getTerminator())) { + Cond = SI->getCondition(); + ++SuccCount[SI->getDefaultDest()]; + for (auto Case : SI->cases()) + AddSucc(Case.getCaseValue(), Case.getCaseSuccessor()); + } else { + return nullptr; } - assert(TruePred && FalsePred && "Must be!"); - // Check which edge of the dominator dominates the true input. If it is the - // false edge, we should invert the condition. - auto *IDom = DT.getNode(BB)->getIDom()->getBlock(); - auto *BI = dyn_cast<BranchInst>(IDom->getTerminator()); - if (!BI || BI->isUnconditional()) + if (Cond->getType() != PN.getType()) return nullptr; // Check that edges outgoing from the idom's terminators dominate respective // inputs of the Phi. - BasicBlockEdge TrueOutEdge(IDom, BI->getSuccessor(0)); - BasicBlockEdge FalseOutEdge(IDom, BI->getSuccessor(1)); + Optional<bool> Invert; + for (auto Pair : zip(PN.incoming_values(), PN.blocks())) { + auto *Input = cast<ConstantInt>(std::get<0>(Pair)); + BasicBlock *Pred = std::get<1>(Pair); + auto IsCorrectInput = [&](ConstantInt *Input) { + // The input needs to be dominated by the corresponding edge of the idom. 
+ // This edge cannot be a multi-edge, as that would imply that multiple + // different condition values follow the same edge. + auto It = SuccForValue.find(Input); + return It != SuccForValue.end() && SuccCount[It->second] == 1 && + DT.dominates(BasicBlockEdge(IDom, It->second), + BasicBlockEdge(Pred, BB)); + }; + + // Depending on the constant, the condition may need to be inverted. + bool NeedsInvert; + if (IsCorrectInput(Input)) + NeedsInvert = false; + else if (IsCorrectInput(cast<ConstantInt>(ConstantExpr::getNot(Input)))) + NeedsInvert = true; + else + return nullptr; + + // Make sure the inversion requirement is always the same. + if (Invert && *Invert != NeedsInvert) + return nullptr; - BasicBlockEdge TrueIncEdge(TruePred, BB); - BasicBlockEdge FalseIncEdge(FalsePred, BB); + Invert = NeedsInvert; + } - auto *Cond = BI->getCondition(); - if (DT.dominates(TrueOutEdge, TrueIncEdge) && - DT.dominates(FalseOutEdge, FalseIncEdge)) - // This Phi is actually equivalent to branching condition of IDom. + if (!*Invert) return Cond; - if (DT.dominates(TrueOutEdge, FalseIncEdge) && - DT.dominates(FalseOutEdge, TrueIncEdge)) { - // This Phi is actually opposite to branching condition of IDom. We invert - // the condition that will potentially open up some opportunities for - // sinking. - auto InsertPt = BB->getFirstInsertionPt(); - if (InsertPt != BB->end()) { - Self.Builder.SetInsertPoint(&*InsertPt); - return Self.Builder.CreateNot(Cond); - } + + // This Phi is actually opposite to branching condition of IDom. We invert + // the condition that will potentially open up some opportunities for + // sinking. + auto InsertPt = BB->getFirstInsertionPt(); + if (InsertPt != BB->end()) { + Self.Builder.SetInsertPoint(&*InsertPt); + return Self.Builder.CreateNot(Cond); } return nullptr; @@ -1327,7 +1370,7 @@ static Value *simplifyUsingControlFlow(InstCombiner &Self, PHINode &PN, // PHINode simplification // Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { - if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN))) + if (Value *V = simplifyInstruction(&PN, SQ.getWithInstruction(&PN))) return replaceInstUsesWith(PN, V); if (Instruction *Result = foldPHIArgZextsIntoPHI(PN)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 65e60498ff95..ad96a5f475f1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -49,13 +50,6 @@ using namespace llvm; using namespace PatternMatch; -static Value *createMinMax(InstCombiner::BuilderTy &Builder, - SelectPatternFlavor SPF, Value *A, Value *B) { - CmpInst::Predicate Pred = getMinMaxPred(SPF); - assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate"); - return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B); -} - /// Replace a select operand based on an equality comparison with the identity /// constant of a binop. static Instruction *foldSelectBinOpIdentity(SelectInst &Sel, @@ -370,6 +364,7 @@ Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI, // one-use constraint, but that needs to be examined carefully since it may not // reduce the total number of instructions. 
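// For example (a hypothetical sketch): select i1 %c, (add i32 %x, 1), (add i32 %y, 1) // can become %s = select i1 %c, i32 %x, i32 %y followed by add i32 %s, 1.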
if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 || + !TI->isSameOperationAs(FI) || (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) || !TI->hasOneUse() || !FI->hasOneUse()) return nullptr; @@ -444,69 +439,56 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the // transformation we are doing here. - if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) { - if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) { - if (unsigned SFO = getSelectFoldableOperands(TVI)) { - unsigned OpToFold = 0; - if ((SFO & 1) && FalseVal == TVI->getOperand(0)) { - OpToFold = 1; - } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) { - OpToFold = 2; - } - - if (OpToFold) { - Constant *C = ConstantExpr::getBinOpIdentity(TVI->getOpcode(), - TVI->getType(), true); - Value *OOp = TVI->getOperand(2-OpToFold); - // Avoid creating select between 2 constants unless it's selecting - // between 0, 1 and -1. - const APInt *OOpC; - bool OOpIsAPInt = match(OOp, m_APInt(OOpC)); - if (!isa<Constant>(OOp) || - (OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) { - Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C); - NewSel->takeName(TVI); - BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(), - FalseVal, NewSel); - BO->copyIRFlags(TVI); - return BO; + auto TryFoldSelectIntoOp = [&](SelectInst &SI, Value *TrueVal, + Value *FalseVal, + bool Swapped) -> Instruction * { + if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) { + if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) { + if (unsigned SFO = getSelectFoldableOperands(TVI)) { + unsigned OpToFold = 0; + if ((SFO & 1) && FalseVal == TVI->getOperand(0)) + OpToFold = 1; + else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) + OpToFold = 2; + + if (OpToFold) { + FastMathFlags FMF; + // TODO: We probably ought to revisit cases where the select and FP + // instructions have different flags and add tests to ensure the + // behaviour is correct. + if (isa<FPMathOperator>(&SI)) + FMF = SI.getFastMathFlags(); + Constant *C = ConstantExpr::getBinOpIdentity( + TVI->getOpcode(), TVI->getType(), true, FMF.noSignedZeros()); + Value *OOp = TVI->getOperand(2 - OpToFold); + // Avoid creating select between 2 constants unless it's selecting + // between 0, 1 and -1. + const APInt *OOpC; + bool OOpIsAPInt = match(OOp, m_APInt(OOpC)); + if (!isa<Constant>(OOp) || + (OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) { + Value *NewSel = Builder.CreateSelect( + SI.getCondition(), Swapped ? C : OOp, Swapped ? OOp : C); + if (isa<FPMathOperator>(&SI)) + cast<Instruction>(NewSel)->setFastMathFlags(FMF); + NewSel->takeName(TVI); + BinaryOperator *BO = + BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel); + BO->copyIRFlags(TVI); + return BO; + } } } } } - } + return nullptr; + }; - if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) { - if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) { - if (unsigned SFO = getSelectFoldableOperands(FVI)) { - unsigned OpToFold = 0; - if ((SFO & 1) && TrueVal == FVI->getOperand(0)) { - OpToFold = 1; - } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) { - OpToFold = 2; - } + if (Instruction *R = TryFoldSelectIntoOp(SI, TrueVal, FalseVal, false)) + return R; - if (OpToFold) { - Constant *C = ConstantExpr::getBinOpIdentity(FVI->getOpcode(), - FVI->getType(), true); - Value *OOp = FVI->getOperand(2-OpToFold); - // Avoid creating select between 2 constants unless it's selecting - // between 0, 1 and -1. 
- const APInt *OOpC; - bool OOpIsAPInt = match(OOp, m_APInt(OOpC)); - if (!isa<Constant>(OOp) || - (OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) { - Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp); - NewSel->takeName(FVI); - BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(), - TrueVal, NewSel); - BO->copyIRFlags(FVI); - return BO; - } - } - } - } + if (Instruction *R = TryFoldSelectIntoOp(SI, FalseVal, TrueVal, true)) + return R; return nullptr; } @@ -535,6 +517,16 @@ static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp, // Where %B may be optionally shifted: lshr %X, %Z. Value *X, *Z; const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z)))); + + // The shift must be valid. + // TODO: This restricts the fold to constant shift amounts. Is there a way to + // handle variable shifts safely? PR47012 + if (HasShift && + !match(Z, m_SpecificInt_ICMP(CmpInst::ICMP_ULT, + APInt(SelType->getScalarSizeInBits(), + SelType->getScalarSizeInBits())))) + return nullptr; + if (!HasShift) X = B; @@ -1096,74 +1088,55 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) { return true; } -/// If this is an integer min/max (icmp + select) with a constant operand, -/// create the canonical icmp for the min/max operation and canonicalize the -/// constant to the 'false' operand of the select: -/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2 -/// Note: if C1 != C2, this will change the icmp constant to the existing -/// constant operand of the select. -static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel, - ICmpInst &Cmp, - InstCombinerImpl &IC) { - if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1))) - return nullptr; - - // Canonicalize the compare predicate based on whether we have min or max. +static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp, + InstCombinerImpl &IC) { Value *LHS, *RHS; - SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS); - if (!SelectPatternResult::isMinOrMax(SPR.Flavor)) + // TODO: What to do with pointer min/max patterns? + if (!Sel.getType()->isIntOrIntVectorTy()) return nullptr; - // Is this already canonical? - ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor); - if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS && - Cmp.getPredicate() == CanonicalPred) - return nullptr; - - // Bail out on unsimplified X-0 operand (due to some worklist management bug), - // as this may cause an infinite combine loop. Let the sub be folded first. - if (match(LHS, m_Sub(m_Value(), m_Zero())) || - match(RHS, m_Sub(m_Value(), m_Zero()))) - return nullptr; - - // Create the canonical compare and plug it into the select. - IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS)); - - // If the select operands did not change, we're done. - if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS) - return &Sel; - - // If we are swapping the select operands, swap the metadata too. 
- assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS && - "Unexpected results from matchSelectPattern"); - Sel.swapValues(); - Sel.swapProfMetadata(); - return &Sel; -} - -static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, - InstCombinerImpl &IC) { - if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1))) - return nullptr; - - Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor; - if (SPF != SelectPatternFlavor::SPF_ABS && - SPF != SelectPatternFlavor::SPF_NABS) - return nullptr; - - // Note that NSW flag can only be propagated for normal, non-negated abs! - bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS && - match(RHS, m_NSWNeg(m_Specific(LHS))); - Constant *IntMinIsPoisonC = - ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison); - Instruction *Abs = - IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC); - - if (SPF == SelectPatternFlavor::SPF_NABS) - return BinaryOperator::CreateNeg(Abs); // Always without NSW flag! + if (SPF == SelectPatternFlavor::SPF_ABS || + SPF == SelectPatternFlavor::SPF_NABS) { + if (!Cmp.hasOneUse() && !RHS->hasOneUse()) + return nullptr; // TODO: Relax this restriction. + + // Note that NSW flag can only be propagated for normal, non-negated abs! + bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS && + match(RHS, m_NSWNeg(m_Specific(LHS))); + Constant *IntMinIsPoisonC = + ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison); + Instruction *Abs = + IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC); + + if (SPF == SelectPatternFlavor::SPF_NABS) + return BinaryOperator::CreateNeg(Abs); // Always without NSW flag! + return IC.replaceInstUsesWith(Sel, Abs); + } + + if (SelectPatternResult::isMinOrMax(SPF)) { + Intrinsic::ID IntrinsicID; + switch (SPF) { + case SelectPatternFlavor::SPF_UMIN: + IntrinsicID = Intrinsic::umin; + break; + case SelectPatternFlavor::SPF_UMAX: + IntrinsicID = Intrinsic::umax; + break; + case SelectPatternFlavor::SPF_SMIN: + IntrinsicID = Intrinsic::smin; + break; + case SelectPatternFlavor::SPF_SMAX: + IntrinsicID = Intrinsic::smax; + break; + default: + llvm_unreachable("Unexpected SPF"); + } + return IC.replaceInstUsesWith( + Sel, IC.Builder.CreateBinaryIntrinsic(IntrinsicID, LHS, RHS)); + } - return IC.replaceInstUsesWith(Sel, Abs); + return nullptr; } /// If we have a select with an equality comparison, then we know the value in @@ -1336,6 +1309,7 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, ICmpInst::Predicate::ICMP_NE, APInt::getAllOnes(C0->getType()->getScalarSizeInBits())))) return nullptr; // Can't do, have all-ones element[s]. + Pred0 = ICmpInst::getFlippedStrictnessPredicate(Pred0); C0 = InstCombiner::AddOne(C0); break; default: @@ -1401,15 +1375,22 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, case ICmpInst::Predicate::ICMP_SGE: // Also non-canonical, but here we don't need to change C2, // so we don't have any restrictions on C2, so we can just handle it. + Pred1 = ICmpInst::Predicate::ICMP_SLT; std::swap(ReplacementLow, ReplacementHigh); break; default: return nullptr; // Unknown predicate. } + assert(Pred1 == ICmpInst::Predicate::ICMP_SLT && + "Unexpected predicate type."); // The thresholds of this clamp-like pattern. 
auto *ThresholdLowIncl = ConstantExpr::getNeg(C1); auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1); + + assert((Pred0 == ICmpInst::Predicate::ICMP_ULT || + Pred0 == ICmpInst::Predicate::ICMP_UGE) && + "Unexpected predicate type."); if (Pred0 == ICmpInst::Predicate::ICMP_UGE) std::swap(ThresholdLowIncl, ThresholdHighExcl); @@ -1530,17 +1511,71 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, return &Sel; } +static Instruction *foldSelectZeroOrOnes(ICmpInst *Cmp, Value *TVal, + Value *FVal, + InstCombiner::BuilderTy &Builder) { + if (!Cmp->hasOneUse()) + return nullptr; + + const APInt *CmpC; + if (!match(Cmp->getOperand(1), m_APIntAllowUndef(CmpC))) + return nullptr; + + // (X u< 2) ? -X : -1 --> sext (X != 0) + Value *X = Cmp->getOperand(0); + if (Cmp->getPredicate() == ICmpInst::ICMP_ULT && *CmpC == 2 && + match(TVal, m_Neg(m_Specific(X))) && match(FVal, m_AllOnes())) + return new SExtInst(Builder.CreateIsNotNull(X), TVal->getType()); + + // (X u> 1) ? -1 : -X --> sext (X != 0) + if (Cmp->getPredicate() == ICmpInst::ICMP_UGT && *CmpC == 1 && + match(FVal, m_Neg(m_Specific(X))) && match(TVal, m_AllOnes())) + return new SExtInst(Builder.CreateIsNotNull(X), TVal->getType()); + + return nullptr; +} + +static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI) { + const APInt *CmpC; + Value *V; + CmpInst::Predicate Pred; + if (!match(ICI, m_ICmp(Pred, m_Value(V), m_APInt(CmpC)))) + return nullptr; + + BinaryOperator *BO; + const APInt *C; + CmpInst::Predicate CPred; + if (match(&SI, m_Select(m_Specific(ICI), m_APInt(C), m_BinOp(BO)))) + CPred = ICI->getPredicate(); + else if (match(&SI, m_Select(m_Specific(ICI), m_BinOp(BO), m_APInt(C)))) + CPred = ICI->getInversePredicate(); + else + return nullptr; + + const APInt *BinOpC; + if (!match(BO, m_BinOp(m_Specific(V), m_APInt(BinOpC)))) + return nullptr; + + ConstantRange R = ConstantRange::makeExactICmpRegion(CPred, *CmpC) + .binaryOp(BO->getOpcode(), *BinOpC); + if (R == *C) { + BO->dropPoisonGeneratingFlags(); + return BO; + } + return nullptr; +} + /// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI)) return NewSel; - if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this)) - return NewSel; + if (Instruction *NewSPF = canonicalizeSPF(SI, *ICI, *this)) + return NewSPF; - if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this)) - return NewAbs; + if (Value *V = foldSelectInstWithICmpConst(SI, ICI)) + return replaceInstUsesWith(SI, V); if (Value *V = canonicalizeClampLike(SI, *ICI, Builder)) return replaceInstUsesWith(SI, V); @@ -1572,6 +1607,22 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, } } + // Canonicalize a signbit condition to use zero constant by swapping: + // (CmpLHS > -1) ? TV : FV --> (CmpLHS < 0) ? FV : TV + // To avoid conflicts (infinite loops) with other canonicalizations, this is + // not applied with any constant select arm. 
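+ // For example (hypothetical): select (icmp sgt i32 %x, -1), %a, %b becomes + // select (icmp slt i32 %x, 0), %b, %a, with profile metadata swapped to match.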
+ if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes()) && + !match(TrueVal, m_Constant()) && !match(FalseVal, m_Constant()) && + ICI->hasOneUse()) { + InstCombiner::BuilderTy::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(&SI); + Value *IsNeg = Builder.CreateIsNeg(CmpLHS, ICI->getName()); + replaceOperand(SI, 0, IsNeg); + SI.swapValues(); + SI.swapProfMetadata(); + return &SI; + } + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring // decomposeBitTestICmp() might help. { @@ -1629,6 +1680,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder)) return V; + if (Instruction *V = foldSelectZeroOrOnes(ICI, TrueVal, FalseVal, Builder)) + return V; + if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -1698,114 +1752,6 @@ Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner, // TODO: This could be done in instsimplify. if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1)) return replaceInstUsesWith(Outer, Inner); - - // MAX(MIN(a, b), a) -> a - // MIN(MAX(a, b), a) -> a - // TODO: This could be done in instsimplify. - if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) || - (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) || - (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) || - (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN)) - return replaceInstUsesWith(Outer, C); - } - - if (SPF1 == SPF2) { - const APInt *CB, *CC; - if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) { - // MIN(MIN(A, 23), 97) -> MIN(A, 23) - // MAX(MAX(A, 97), 23) -> MAX(A, 97) - // TODO: This could be done in instsimplify. - if ((SPF1 == SPF_UMIN && CB->ule(*CC)) || - (SPF1 == SPF_SMIN && CB->sle(*CC)) || - (SPF1 == SPF_UMAX && CB->uge(*CC)) || - (SPF1 == SPF_SMAX && CB->sge(*CC))) - return replaceInstUsesWith(Outer, Inner); - - // MIN(MIN(A, 97), 23) -> MIN(A, 23) - // MAX(MAX(A, 23), 97) -> MAX(A, 97) - if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) || - (SPF1 == SPF_SMIN && CB->sgt(*CC)) || - (SPF1 == SPF_UMAX && CB->ult(*CC)) || - (SPF1 == SPF_SMAX && CB->slt(*CC))) { - Outer.replaceUsesOfWith(Inner, A); - return &Outer; - } - } - } - - // max(max(A, B), min(A, B)) --> max(A, B) - // min(min(A, B), max(A, B)) --> min(A, B) - // TODO: This could be done in instsimplify. - if (SPF1 == SPF2 && - ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) || - (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) || - (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) || - (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B)))))) - return replaceInstUsesWith(Outer, Inner); - - // ABS(ABS(X)) -> ABS(X) - // NABS(NABS(X)) -> NABS(X) - // TODO: This could be done in instsimplify. - if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) { - return replaceInstUsesWith(Outer, Inner); - } - - // ABS(NABS(X)) -> ABS(X) - // NABS(ABS(X)) -> NABS(X) - if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) || - (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) { - SelectInst *SI = cast<SelectInst>(Inner); - Value *NewSI = - Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(), - SI->getTrueValue(), SI->getName(), SI); - return replaceInstUsesWith(Outer, NewSI); - } - - auto IsFreeOrProfitableToInvert = - [&](Value *V, Value *&NotV, bool &ElidesXor) { - if (match(V, m_Not(m_Value(NotV)))) { - // If V has at most 2 uses then we can get rid of the xor operation - // entirely. 
- ElidesXor |= !V->hasNUsesOrMore(3); - return true; - } - - if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) { - NotV = nullptr; - return true; - } - - return false; - }; - - Value *NotA, *NotB, *NotC; - bool ElidesXor = false; - - // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C) - // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C) - // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C) - // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C) - // - // This transform is performance neutral if we can elide at least one xor from - // the set of three operands, since we'll be tacking on an xor at the very - // end. - if (SelectPatternResult::isMinOrMax(SPF1) && - SelectPatternResult::isMinOrMax(SPF2) && - IsFreeOrProfitableToInvert(A, NotA, ElidesXor) && - IsFreeOrProfitableToInvert(B, NotB, ElidesXor) && - IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) { - if (!NotA) - NotA = Builder.CreateNot(A); - if (!NotB) - NotB = Builder.CreateNot(B); - if (!NotC) - NotC = Builder.CreateNot(C); - - Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA, - NotB); - Value *NewOuter = Builder.CreateNot( - createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC)); - return replaceInstUsesWith(Outer, NewOuter); } return nullptr; @@ -2238,163 +2184,6 @@ static Value *foldSelectCmpXchg(SelectInst &SI) { return nullptr; } -static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, - Value *Y, - InstCombiner::BuilderTy &Builder) { - assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern"); - bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN || - SPF == SelectPatternFlavor::SPF_UMAX; - // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change - // the constant value check to an assert. - Value *A; - const APInt *C1, *C2; - if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) && - match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) { - // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1 - // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1 - Value *NewMinMax = createMinMax(Builder, SPF, A, - ConstantInt::get(X->getType(), *C2 - *C1)); - return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax, - ConstantInt::get(X->getType(), *C1)); - } - - if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) && - match(Y, m_APInt(C2)) && X->hasNUses(2)) { - bool Overflow; - APInt Diff = C2->ssub_ov(*C1, Overflow); - if (!Overflow) { - // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1 - // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1 - Value *NewMinMax = createMinMax(Builder, SPF, A, - ConstantInt::get(X->getType(), Diff)); - return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax, - ConstantInt::get(X->getType(), *C1)); - } - } - - return nullptr; -} - -/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. 
-Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) { - Type *Ty = MinMax1.getType(); - - // We are looking for a tree of: - // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B)))) - // Where the min and max could be reversed - Instruction *MinMax2; - BinaryOperator *AddSub; - const APInt *MinValue, *MaxValue; - if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) { - if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue)))) - return nullptr; - } else if (match(&MinMax1, - m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) { - if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue)))) - return nullptr; - } else - return nullptr; - - // Check that the constants clamp a saturate, and that the new type would be - // sensible to convert to. - if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1) - return nullptr; - // In what bitwidth can this be treated as saturating arithmetics? - unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1; - // FIXME: This isn't quite right for vectors, but using the scalar type is a - // good first approximation for what should be done there. - if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) - return nullptr; - - // Also make sure that the number of uses is as expected. The 3 is for the - // the two items of the compare and the select, or 2 from a min/max. - unsigned ExpUses = isa(MinMax1) ? 2 : 3; - if (MinMax2->hasNUsesOrMore(ExpUses) || AddSub->hasNUsesOrMore(ExpUses)) - return nullptr; - - // Create the new type (which can be a vector type) - Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth); - - Intrinsic::ID IntrinsicID; - if (AddSub->getOpcode() == Instruction::Add) - IntrinsicID = Intrinsic::sadd_sat; - else if (AddSub->getOpcode() == Instruction::Sub) - IntrinsicID = Intrinsic::ssub_sat; - else - return nullptr; - - // The two operands of the add/sub must be nsw-truncatable to the NewTy. This - // is usually achieved via a sext from a smaller type. - if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) > - NewBitWidth || - ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) - return nullptr; - - // Finally create and return the sat intrinsic, truncated to the new type - Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); - Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); - Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); - Value *Sat = Builder.CreateCall(F, {AT, BT}); - return CastInst::Create(Instruction::SExt, Sat, Ty); -} - -/// Reduce a sequence of min/max with a common operand. -static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS, - Value *RHS, - InstCombiner::BuilderTy &Builder) { - assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max"); - // TODO: Allow FP min/max with nnan/nsz. - if (!LHS->getType()->isIntOrIntVectorTy()) - return nullptr; - - // Match 3 of the same min/max ops. Example: umin(umin(), umin()). - Value *A, *B, *C, *D; - SelectPatternResult L = matchSelectPattern(LHS, A, B); - SelectPatternResult R = matchSelectPattern(RHS, C, D); - if (SPF != L.Flavor || L.Flavor != R.Flavor) - return nullptr; - - // Look for a common operand. The use checks are different than usual because - // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by - // the select. 
- Value *MinMaxOp = nullptr; - Value *ThirdOp = nullptr; - if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) { - // If the LHS is only used in this chain and the RHS is used outside of it, - // reuse the RHS min/max because that will eliminate the LHS. - if (D == A || C == A) { - // min(min(a, b), min(c, a)) --> min(min(c, a), b) - // min(min(a, b), min(a, d)) --> min(min(a, d), b) - MinMaxOp = RHS; - ThirdOp = B; - } else if (D == B || C == B) { - // min(min(a, b), min(c, b)) --> min(min(c, b), a) - // min(min(a, b), min(b, d)) --> min(min(b, d), a) - MinMaxOp = RHS; - ThirdOp = A; - } - } else if (!RHS->hasNUsesOrMore(3)) { - // Reuse the LHS. This will eliminate the RHS. - if (D == A || D == B) { - // min(min(a, b), min(c, a)) --> min(min(a, b), c) - // min(min(a, b), min(c, b)) --> min(min(a, b), c) - MinMaxOp = LHS; - ThirdOp = C; - } else if (C == A || C == B) { - // min(min(a, b), min(b, d)) --> min(min(a, b), d) - // min(min(a, b), min(c, b)) --> min(min(a, b), d) - MinMaxOp = LHS; - ThirdOp = D; - } - } - if (!MinMaxOp || !ThirdOp) - return nullptr; - - CmpInst::Predicate P = getMinMaxPred(SPF); - Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp); - return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp); -} - /// Try to reduce a funnel/rotate pattern that includes a compare and select /// into a funnel shift intrinsic. Example: /// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b))) @@ -2484,7 +2273,8 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, // Match select ?, TC, FC where the constants are equal but negated. // TODO: Generalize to handle a negated variable operand? const APFloat *TC, *FC; - if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) || + if (!match(TVal, m_APFloatAllowUndef(TC)) || + !match(FVal, m_APFloatAllowUndef(FC)) || !abs(*TC).bitwiseIsEqual(abs(*FC))) return nullptr; @@ -2504,17 +2294,16 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X) // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X) // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X) + // Note: FMF from the select can not be propagated to the new instructions. if (IsTrueIfSignSet ^ TC->isNegative()) - X = Builder.CreateFNegFMF(X, &Sel); + X = Builder.CreateFNeg(X); // Canonicalize the magnitude argument as the positive constant since we do // not care about its sign. - Value *MagArg = TC->isNegative() ? FVal : TVal; + Value *MagArg = ConstantFP::get(SelType, abs(*TC)); Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign, Sel.getType()); - Instruction *CopySign = CallInst::Create(F, { MagArg, X }); - CopySign->setFastMathFlags(Sel.getFastMathFlags()); - return CopySign; + return CallInst::Create(F, { MagArg, X }); } Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { @@ -2715,29 +2504,144 @@ Instruction *InstCombinerImpl::foldAndOrOfSelectUsingImpliedCond(Value *Op, } } -Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { +// Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need +// fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. +static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI, + InstCombinerImpl &IC) { Value *CondVal = SI.getCondition(); - Value *TrueVal = SI.getTrueValue(); - Value *FalseVal = SI.getFalseValue(); - Type *SelType = SI.getType(); - // FIXME: Remove this workaround when freeze related patches are done. 
- // For select with undef operand which feeds into an equality comparison,
- // don't simplify it so loop unswitch can know the equality comparison
- // may have an undef operand. This is a workaround for PR31652 caused by
- // discrepancy about branch on undef between LoopUnswitch and GVN.
- if (match(TrueVal, m_Undef()) || match(FalseVal, m_Undef())) {
- if (llvm::any_of(SI.users(), [&](User *U) {
- ICmpInst *CI = dyn_cast<ICmpInst>(U);
- if (CI && CI->isEquality())
- return true;
- return false;
- })) {
+ for (bool Swap : {false, true}) {
+ Value *TrueVal = SI.getTrueValue();
+ Value *X = SI.getFalseValue();
+ CmpInst::Predicate Pred;
+
+ if (Swap)
+ std::swap(TrueVal, X);
+
+ if (!match(CondVal, m_FCmp(Pred, m_Specific(X), m_AnyZeroFP())))
+ continue;
+
+ // fold (X <= +/-0.0) ? (0.0 - X) : X to fabs(X), when 'Swap' is false
+ // fold (X > +/-0.0) ? X : (0.0 - X) to fabs(X), when 'Swap' is true
+ if (match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(X)))) {
+ if (!Swap && (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
+ Value *Fabs = IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, X, &SI);
+ return IC.replaceInstUsesWith(SI, Fabs);
+ }
+ if (Swap && (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
+ Value *Fabs = IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, X, &SI);
+ return IC.replaceInstUsesWith(SI, Fabs);
+ }
+ }
+
+ // With nsz, when 'Swap' is false:
+ // fold (X < +/-0.0) ? -X : X or (X <= +/-0.0) ? -X : X to fabs(X)
+ // fold (X > +/-0.0) ? -X : X or (X >= +/-0.0) ? -X : X to -fabs(X)
+ // when 'Swap' is true:
+ // fold (X > +/-0.0) ? X : -X or (X >= +/-0.0) ? X : -X to fabs(X)
+ // fold (X < +/-0.0) ? X : -X or (X <= +/-0.0) ? X : -X to -fabs(X)
+ if (!match(TrueVal, m_FNeg(m_Specific(X))) || !SI.hasNoSignedZeros())
 return nullptr;
+
+ if (Swap)
+ Pred = FCmpInst::getSwappedPredicate(Pred);
+
+ bool IsLTOrLE = Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
+ Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE;
+ bool IsGTOrGE = Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
+ Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE;
+
+ if (IsLTOrLE) {
+ Value *Fabs = IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, X, &SI);
+ return IC.replaceInstUsesWith(SI, Fabs);
+ }
+ if (IsGTOrGE) {
+ Value *Fabs = IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, X, &SI);
+ Instruction *NewFNeg = UnaryOperator::CreateFNeg(Fabs);
+ NewFNeg->setFastMathFlags(SI.getFastMathFlags());
+ return NewFNeg;
 }
 }
- if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+ return nullptr;
+}
+
+// Match the following IR pattern:
+// %x.lowbits = and i8 %x, %lowbitmask
+// %x.lowbits.are.zero = icmp eq i8 %x.lowbits, 0
+// %x.biased = add i8 %x, %bias
+// %x.biased.highbits = and i8 %x.biased, %highbitmask
+// %x.roundedup = select i1 %x.lowbits.are.zero, i8 %x, i8 %x.biased.highbits
+// Define:
+// %alignment = add i8 %lowbitmask, 1
+// Iff 1. %alignment is a power-of-two (aka, %lowbitmask is a low bit mask)
+// and 2. %bias is equal to either %lowbitmask or %alignment,
+// and 3.
%highbitmask is equal to ~%lowbitmask (aka, to -%alignment)
+// then this pattern can be transformed into:
+// %x.offset = add i8 %x, %lowbitmask
+// %x.roundedup = and i8 %x.offset, %highbitmask
+static Value *
+foldRoundUpIntegerWithPow2Alignment(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = SI.getCondition();
+ Value *X = SI.getTrueValue();
+ Value *XBiasedHighBits = SI.getFalseValue();
+
+ ICmpInst::Predicate Pred;
+ Value *XLowBits;
+ if (!match(Cond, m_ICmp(Pred, m_Value(XLowBits), m_ZeroInt())) ||
+ !ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ if (Pred == ICmpInst::Predicate::ICMP_NE)
+ std::swap(X, XBiasedHighBits);
+
+ // FIXME: we could support non-splats here.
+
+ const APInt *LowBitMaskCst;
+ if (!match(XLowBits, m_And(m_Specific(X), m_APIntAllowUndef(LowBitMaskCst))))
+ return nullptr;
+
+ const APInt *BiasCst, *HighBitMaskCst;
+ if (!match(XBiasedHighBits,
+ m_And(m_Add(m_Specific(X), m_APIntAllowUndef(BiasCst)),
+ m_APIntAllowUndef(HighBitMaskCst))))
+ return nullptr;
+
+ if (!LowBitMaskCst->isMask())
+ return nullptr;
+
+ APInt InvertedLowBitMaskCst = ~*LowBitMaskCst;
+ if (InvertedLowBitMaskCst != *HighBitMaskCst)
+ return nullptr;
+
+ APInt AlignmentCst = *LowBitMaskCst + 1;
+
+ if (*BiasCst != AlignmentCst && *BiasCst != *LowBitMaskCst)
+ return nullptr;
+
+ if (!XBiasedHighBits->hasOneUse()) {
+ if (*BiasCst == *LowBitMaskCst)
+ return XBiasedHighBits;
+ return nullptr;
+ }
+
+ // FIXME: could we preserve undefs here?
+ Type *Ty = X->getType();
+ Value *XOffset = Builder.CreateAdd(X, ConstantInt::get(Ty, *LowBitMaskCst),
+ X->getName() + ".biased");
+ Value *R = Builder.CreateAnd(XOffset, ConstantInt::get(Ty, *HighBitMaskCst));
+ R->takeName(&SI);
+ return R;
+}
+
+Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ Type *SelType = SI.getType();
+
+ if (Value *V = simplifySelectInst(CondVal, TrueVal, FalseVal,
 SQ.getWithInstruction(&SI)))
 return replaceInstUsesWith(SI, V);
@@ -2747,8 +2651,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
 if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
 return I;
- CmpInst::Predicate Pred;
-
 // Avoid potential infinite loops by checking for non-constant condition.
 // TODO: Can we assert instead by improving canonicalizeSelectToShuffle()?
 // Scalar select must have simplified?
@@ -2757,13 +2659,29 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
 // Folding select to and/or i1 isn't poison safe in general. impliesPoison
 // checks whether folding it does not convert a well-defined value into
 // poison.
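 // Example of the hazard (an illustrative sketch, not from the source): in
 // "%s = select i1 %b, i1 true, i1 %c", the select never observes %c when
 // %b is true, but "or i1 %b, %c" always does; if %c were poison on that
 // path, the fold would turn a well-defined 'true' into poison.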
- if (match(TrueVal, m_One()) && impliesPoison(FalseVal, CondVal)) { - // Change: A = select B, true, C --> A = or B, C - return BinaryOperator::CreateOr(CondVal, FalseVal); + if (match(TrueVal, m_One())) { + if (impliesPoison(FalseVal, CondVal)) { + // Change: A = select B, true, C --> A = or B, C + return BinaryOperator::CreateOr(CondVal, FalseVal); + } + + if (auto *LHS = dyn_cast(CondVal)) + if (auto *RHS = dyn_cast(FalseVal)) + if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false, + /*IsSelectLogical*/ true)) + return replaceInstUsesWith(SI, V); } - if (match(FalseVal, m_Zero()) && impliesPoison(TrueVal, CondVal)) { - // Change: A = select B, C, false --> A = and B, C - return BinaryOperator::CreateAnd(CondVal, TrueVal); + if (match(FalseVal, m_Zero())) { + if (impliesPoison(TrueVal, CondVal)) { + // Change: A = select B, C, false --> A = and B, C + return BinaryOperator::CreateAnd(CondVal, TrueVal); + } + + if (auto *LHS = dyn_cast(CondVal)) + if (auto *RHS = dyn_cast(TrueVal)) + if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true, + /*IsSelectLogical*/ true)) + return replaceInstUsesWith(SI, V); } auto *One = ConstantInt::getTrue(SelType); @@ -2821,6 +2739,20 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { match(TrueVal, m_Specific(B)) && match(FalseVal, m_Zero())) return replaceOperand(SI, 0, A); + Value *C; + // select (~a | c), a, b -> and a, (or c, freeze(b)) + if (match(CondVal, m_c_Or(m_Not(m_Specific(TrueVal)), m_Value(C))) && + CondVal->hasOneUse()) { + FalseVal = Builder.CreateFreeze(FalseVal); + return BinaryOperator::CreateAnd(TrueVal, Builder.CreateOr(C, FalseVal)); + } + // select (~c & b), a, b -> and b, (or freeze(a), c) + if (match(CondVal, m_c_And(m_Not(m_Value(C)), m_Specific(FalseVal))) && + CondVal->hasOneUse()) { + TrueVal = Builder.CreateFreeze(TrueVal); + return BinaryOperator::CreateAnd(FalseVal, Builder.CreateOr(C, TrueVal)); + } + if (!SelType->isVectorTy()) { if (Value *S = simplifyWithOpReplaced(TrueVal, CondVal, One, SQ, /* AllowRefinement */ true)) @@ -2846,16 +2778,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { /* IsAnd */ IsAnd)) return I; - if (auto *ICmp0 = dyn_cast(CondVal)) { - if (auto *ICmp1 = dyn_cast(Op1)) { - if (auto *V = foldAndOrOfICmpsOfAndWithPow2(ICmp0, ICmp1, &SI, IsAnd, - /* IsLogical */ true)) + if (auto *ICmp0 = dyn_cast(CondVal)) + if (auto *ICmp1 = dyn_cast(Op1)) + if (auto *V = foldAndOrOfICmps(ICmp0, ICmp1, SI, IsAnd, + /* IsLogical */ true)) return replaceInstUsesWith(SI, V); - - if (auto *V = foldEqOfParts(ICmp0, ICmp1, IsAnd)) - return replaceInstUsesWith(SI, V); - } - } } // select (select a, true, b), c, false -> select a, c, false @@ -2959,42 +2886,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { } } - // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need - // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. - // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X) - if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) && - match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) && - (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) { - Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, &SI); - return replaceInstUsesWith(SI, Fabs); - } - // (X > +/-0.0) ? 
X : (0.0 - X) --> fabs(X) - if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) && - match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) && - (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) { - Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, &SI); - return replaceInstUsesWith(SI, Fabs); - } - // With nnan and nsz: - // (X < +/-0.0) ? -X : X --> fabs(X) - // (X <= +/-0.0) ? -X : X --> fabs(X) - if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) && - match(TrueVal, m_FNeg(m_Specific(FalseVal))) && SI.hasNoSignedZeros() && - (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE || - Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) { - Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, &SI); - return replaceInstUsesWith(SI, Fabs); - } - // With nnan and nsz: - // (X > +/-0.0) ? X : -X --> fabs(X) - // (X >= +/-0.0) ? X : -X --> fabs(X) - if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) && - match(FalseVal, m_FNeg(m_Specific(TrueVal))) && SI.hasNoSignedZeros() && - (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE || - Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) { - Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, &SI); - return replaceInstUsesWith(SI, Fabs); - } + // Fold selecting to fabs. + if (Instruction *Fabs = foldSelectWithFCmpToFabs(SI, *this)) + return Fabs; // See if we are selecting two values based on a comparison of the two values. if (ICmpInst *ICI = dyn_cast(CondVal)) @@ -3066,8 +2960,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Instruction *R = foldSPFofSPF(cast(RHS), SPF2, LHS2, RHS2, SI, SPF, LHS)) return R; - // TODO. - // ABS(-X) -> ABS(X) } if (SelectPatternResult::isMinOrMax(SPF)) { @@ -3102,46 +2994,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType); return replaceInstUsesWith(SI, NewCast); } - - // MAX(~a, ~b) -> ~MIN(a, b) - // MAX(~a, C) -> ~MIN(a, ~C) - // MIN(~a, ~b) -> ~MAX(a, b) - // MIN(~a, C) -> ~MAX(a, ~C) - auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { - Value *A; - if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && - !isFreeToInvert(A, A->hasOneUse()) && - // Passing false to only consider m_Not and constants. - isFreeToInvert(Y, false)) { - Value *B = Builder.CreateNot(Y); - Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF), - A, B); - // Copy the profile metadata. - if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) { - cast(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD); - // Swap the metadata if the operands are swapped. 
- if (X == SI.getFalseValue() && Y == SI.getTrueValue()) - cast(NewMinMax)->swapProfMetadata(); - } - - return BinaryOperator::CreateNot(NewMinMax); - } - - return nullptr; - }; - - if (Instruction *I = moveNotAfterMinMax(LHS, RHS)) - return I; - if (Instruction *I = moveNotAfterMinMax(RHS, LHS)) - return I; - - if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder)) - return I; - - if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) - return I; - if (Instruction *I = matchSAddSubSat(SI)) - return I; } } @@ -3307,35 +3159,42 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder)) return replaceInstUsesWith(SI, Fr); + if (Value *V = foldRoundUpIntegerWithPow2Alignment(SI, Builder)) + return replaceInstUsesWith(SI, V); + // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0) // Load inst is intentionally not checked for hasOneUse() if (match(FalseVal, m_Zero()) && - match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal), - m_CombineOr(m_Undef(), m_Zero())))) { - auto *MaskedLoad = cast(TrueVal); - if (isa(MaskedLoad->getArgOperand(3))) - MaskedLoad->setArgOperand(3, FalseVal /* Zero */); - return replaceInstUsesWith(SI, MaskedLoad); + (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal), + m_CombineOr(m_Undef(), m_Zero()))) || + match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal), + m_CombineOr(m_Undef(), m_Zero()))))) { + auto *MaskedInst = cast(TrueVal); + if (isa(MaskedInst->getArgOperand(3))) + MaskedInst->setArgOperand(3, FalseVal /* Zero */); + return replaceInstUsesWith(SI, MaskedInst); } Value *Mask; if (match(TrueVal, m_Zero()) && - match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask), - m_CombineOr(m_Undef(), m_Zero()))) && + (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask), + m_CombineOr(m_Undef(), m_Zero()))) || + match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask), + m_CombineOr(m_Undef(), m_Zero())))) && (CondVal->getType() == Mask->getType())) { // We can remove the select by ensuring the load zeros all lanes the // select would have. We determine this by proving there is no overlap // between the load and select masks. 
// (i.e (load_mask & select_mask) == 0 == no overlap) bool CanMergeSelectIntoLoad = false; - if (Value *V = SimplifyAndInst(CondVal, Mask, SQ.getWithInstruction(&SI))) + if (Value *V = simplifyAndInst(CondVal, Mask, SQ.getWithInstruction(&SI))) CanMergeSelectIntoLoad = match(V, m_Zero()); if (CanMergeSelectIntoLoad) { - auto *MaskedLoad = cast(FalseVal); - if (isa(MaskedLoad->getArgOperand(3))) - MaskedLoad->setArgOperand(3, TrueVal /* Zero */); - return replaceInstUsesWith(SI, MaskedLoad); + auto *MaskedInst = cast(FalseVal); + if (isa(MaskedInst->getArgOperand(3))) + MaskedInst->setArgOperand(3, TrueVal /* Zero */); + return replaceInstUsesWith(SI, MaskedInst); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 17f0c5c4cff0..f4e2d1239f0f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" @@ -108,7 +107,7 @@ Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts( // Can we fold (ShAmt0+ShAmt1) ? auto *NewShAmt = dyn_cast_or_null( - SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false, + simplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false, SQ.getWithInstruction(Sh0))); if (!NewShAmt) return nullptr; // Did not simplify. @@ -232,7 +231,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift, return nullptr; // Can we simplify (MaskShAmt+ShiftShAmt) ? - auto *SumOfShAmts = dyn_cast_or_null(SimplifyAddInst( + auto *SumOfShAmts = dyn_cast_or_null(simplifyAddInst( MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); if (!SumOfShAmts) return nullptr; // Did not simplify. @@ -264,7 +263,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift, return nullptr; // Can we simplify (ShiftShAmt-MaskShAmt) ? - auto *ShAmtsDiff = dyn_cast_or_null(SimplifySubInst( + auto *ShAmtsDiff = dyn_cast_or_null(simplifySubInst( ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); if (!ShAmtsDiff) return nullptr; // Did not simplify. @@ -374,11 +373,12 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); + Type *Ty = I.getType(); // If the shift amount is a one-use `sext`, we can demote it to `zext`. Value *Y; if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) { - Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName()); + Value *NewExt = Builder.CreateZExt(Y, Ty, Op1->getName()); return BinaryOperator::Create(I.getOpcode(), Op0, NewExt); } @@ -400,15 +400,56 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))) return NewShift; - // (C1 shift (A add C2)) -> (C1 shift C2) shift A) - // iff A and C2 are both positive. 
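+ // Illustrative sketch (not part of this change): with C = 1 and C1 = 3,
+ // "shl i32 1, (add nuw i32 %A, 3)" becomes "shl i32 8, %A". The nuw on
+ // the add is what guarantees the combined shift amount never wrapped,
+ // replacing the old requirement that A and C2 both be non-negative.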
+ // Pre-shift a constant shifted by a variable amount with constant offset: + // C shift (A add nuw C1) --> (C shift C1) shift A Value *A; - Constant *C; - if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C)))) - if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) && - isKnownNonNegative(C, DL, 0, &AC, &I, &DT)) - return BinaryOperator::Create( - I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A); + Constant *C, *C1; + if (match(Op0, m_Constant(C)) && + match(Op1, m_NUWAdd(m_Value(A), m_Constant(C1)))) { + Value *NewC = Builder.CreateBinOp(I.getOpcode(), C, C1); + return BinaryOperator::Create(I.getOpcode(), NewC, A); + } + + unsigned BitWidth = Ty->getScalarSizeInBits(); + + const APInt *AC, *AddC; + // Try to pre-shift a constant shifted by a variable amount added with a + // negative number: + // C << (X - AddC) --> (C >> AddC) << X + // and + // C >> (X - AddC) --> (C << AddC) >> X + if (match(Op0, m_APInt(AC)) && match(Op1, m_Add(m_Value(A), m_APInt(AddC))) && + AddC->isNegative() && (-*AddC).ult(BitWidth)) { + assert(!AC->isZero() && "Expected simplify of shifted zero"); + unsigned PosOffset = (-*AddC).getZExtValue(); + + auto isSuitableForPreShift = [PosOffset, &I, AC]() { + switch (I.getOpcode()) { + default: + return false; + case Instruction::Shl: + return (I.hasNoSignedWrap() || I.hasNoUnsignedWrap()) && + AC->eq(AC->lshr(PosOffset).shl(PosOffset)); + case Instruction::LShr: + return I.isExact() && AC->eq(AC->shl(PosOffset).lshr(PosOffset)); + case Instruction::AShr: + return I.isExact() && AC->eq(AC->shl(PosOffset).ashr(PosOffset)); + } + }; + if (isSuitableForPreShift()) { + Constant *NewC = ConstantInt::get(Ty, I.getOpcode() == Instruction::Shl + ? AC->lshr(PosOffset) + : AC->shl(PosOffset)); + BinaryOperator *NewShiftOp = + BinaryOperator::Create(I.getOpcode(), NewC, A); + if (I.getOpcode() == Instruction::Shl) { + NewShiftOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + } else { + NewShiftOp->setIsExact(); + } + return NewShiftOp; + } + } // X shift (A srem C) -> X shift (A and (C - 1)) iff C is a power of 2. // Because shifts by negative values (which could occur if A were negative) @@ -417,7 +458,7 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { match(C, m_Power2())) { // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't // demand the sign bit (and many others) here?? 
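 // Worked sketch (illustrative): for A = -3 and C = 8, "A srem C" is -3,
 // so the original shift amount is negative and the shift is undefined;
 // rewriting it as "A and 7" (= 5) only refines that undefined behavior,
 // and for non-negative A the two forms agree exactly.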
- Constant *Mask = ConstantExpr::getSub(C, ConstantInt::get(I.getType(), 1)); + Constant *Mask = ConstantExpr::getSub(C, ConstantInt::get(Ty, 1)); Value *Rem = Builder.CreateAnd(A, Mask, Op1->getName()); return replaceOperand(I, 1, Rem); } @@ -661,10 +702,18 @@ static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift, } } -Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, +Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *C1, BinaryOperator &I) { + // (C2 << X) << C1 --> (C2 << C1) << X + // (C2 >> X) >> C1 --> (C2 >> C1) >> X + Constant *C2; + Value *X; + if (match(Op0, m_BinOp(I.getOpcode(), m_Constant(C2), m_Value(X)))) + return BinaryOperator::Create( + I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), C2, C1), X); + const APInt *Op1C; - if (!match(Op1, m_APInt(Op1C))) + if (!match(C1, m_APInt(Op1C))) return nullptr; // See if we can propagate this shift into the input, this covers the trivial @@ -701,11 +750,11 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, const APInt *Op0C; if (match(Op0BO->getOperand(1), m_APInt(Op0C))) { if (canShiftBinOpWithConstantRHS(I, Op0BO)) { - Constant *NewRHS = ConstantExpr::get( - I.getOpcode(), cast(Op0BO->getOperand(1)), Op1); + Value *NewRHS = + Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(1), C1); Value *NewShift = - Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1); + Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), C1); NewShift->takeName(Op0BO); return BinaryOperator::Create(Op0BO->getOpcode(), NewShift, NewRHS); @@ -730,10 +779,10 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, if (!isa(FalseVal) && TBO->getOperand(0) == FalseVal && match(TBO->getOperand(1), m_APInt(C)) && canShiftBinOpWithConstantRHS(I, TBO)) { - Constant *NewRHS = ConstantExpr::get( - I.getOpcode(), cast(TBO->getOperand(1)), Op1); + Value *NewRHS = + Builder.CreateBinOp(I.getOpcode(), TBO->getOperand(1), C1); - Value *NewShift = Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1); + Value *NewShift = Builder.CreateBinOp(I.getOpcode(), FalseVal, C1); Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift, NewRHS); return SelectInst::Create(Cond, NewOp, NewShift); } @@ -747,10 +796,10 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, if (!isa(TrueVal) && FBO->getOperand(0) == TrueVal && match(FBO->getOperand(1), m_APInt(C)) && canShiftBinOpWithConstantRHS(I, FBO)) { - Constant *NewRHS = ConstantExpr::get( - I.getOpcode(), cast(FBO->getOperand(1)), Op1); + Value *NewRHS = + Builder.CreateBinOp(I.getOpcode(), FBO->getOperand(1), C1); - Value *NewShift = Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1); + Value *NewShift = Builder.CreateBinOp(I.getOpcode(), TrueVal, C1); Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift, NewRHS); return SelectInst::Create(Cond, NewShift, NewOp); } @@ -762,7 +811,7 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) { const SimplifyQuery Q = SQ.getWithInstruction(&I); - if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), + if (Value *V = simplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q)) return replaceInstUsesWith(I, V); @@ -968,10 +1017,6 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) { if (match(Op1, m_Constant(C1))) { Constant *C2; Value *X; - // (C2 << X) << C1 --> (C2 << C1) << X - if 
(match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
- return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
-
 // (X * C2) << C1 --> X * (C2 << C1)
 if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
 return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
@@ -993,7 +1038,7 @@
 }
 Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
- if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ if (Value *V = simplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
 SQ.getWithInstruction(&I)))
 return replaceInstUsesWith(I, V);
@@ -1164,15 +1209,54 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
 }
 }
- // Look for a "splat" mul pattern - it replicates bits across each half of
- // a value, so a right shift is just a mask of the low bits:
- // lshr i32 (mul nuw X, Pow2+1), 16 --> and X, Pow2-1
- // TODO: Generalize to allow more than just half-width shifts?
 const APInt *MulC;
- if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC))) &&
- ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
- MulC->logBase2() == ShAmtC)
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+ if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC)))) {
+ // Look for a "splat" mul pattern - it replicates bits across each half of
+ // a value, so a right shift is just a mask of the low bits:
+ // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
+ // TODO: Generalize to allow more than just half-width shifts?
+ if (BitWidth > 2 && ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
+ MulC->logBase2() == ShAmtC)
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+
+ // The one-use check is not strictly necessary, but codegen may not be
+ // able to invert the transform and perf may suffer with an extra mul
+ // instruction.
+ if (Op0->hasOneUse()) {
+ APInt NewMulC = MulC->lshr(ShAmtC);
+ // if c is divisible by (1 << ShAmtC):
+ // lshr (mul nuw x, MulC), ShAmtC -> mul nuw x, (MulC >> ShAmtC)
+ if (MulC->eq(NewMulC.shl(ShAmtC))) {
+ auto *NewMul =
+ BinaryOperator::CreateNUWMul(X, ConstantInt::get(Ty, NewMulC));
+ BinaryOperator *OrigMul = cast<BinaryOperator>(Op0);
+ NewMul->setHasNoSignedWrap(OrigMul->hasNoSignedWrap());
+ return NewMul;
+ }
+ }
+ }
+
+ // Try to narrow bswap.
+ // In the case where the shift amount equals the bitwidth difference, the
+ // shift is eliminated.
+ if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::bswap>(
+ m_OneUse(m_ZExt(m_Value(X))))))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ unsigned WidthDiff = BitWidth - SrcWidth;
+ if (SrcWidth % 16 == 0) {
+ Value *NarrowSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X);
+ if (ShAmtC >= WidthDiff) {
+ // (bswap (zext X)) >> C --> zext (bswap X >> C')
+ Value *NewShift = Builder.CreateLShr(NarrowSwap, ShAmtC - WidthDiff);
+ return new ZExtInst(NewShift, Ty);
+ } else {
+ // (bswap (zext X)) >> C --> (zext (bswap X)) << C'
+ Value *NewZExt = Builder.CreateZExt(NarrowSwap, Ty);
+ Constant *ShiftDiff = ConstantInt::get(Ty, WidthDiff - ShAmtC);
+ return BinaryOperator::CreateShl(NewZExt, ShiftDiff);
+ }
+ }
+ }
 // If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() && @@ -1263,7 +1347,7 @@ InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract( } Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) { - if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), + if (Value *V = simplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 3f064cfda712..9d4c01ac03e2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -154,6 +154,29 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (Depth == 0 && !V->hasOneUse()) DemandedMask.setAllBits(); + // If the high-bits of an ADD/SUB/MUL are not demanded, then we do not care + // about the high bits of the operands. + auto simplifyOperandsBasedOnUnusedHighBits = [&](APInt &DemandedFromOps) { + unsigned NLZ = DemandedMask.countLeadingZeros(); + // Right fill the mask of bits for the operands to demand the most + // significant bit and all those below it. + DemandedFromOps = APInt::getLowBitsSet(BitWidth, BitWidth - NLZ); + if (ShrinkDemandedConstant(I, 0, DemandedFromOps) || + SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) || + ShrinkDemandedConstant(I, 1, DemandedFromOps) || + SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) { + if (NLZ > 0) { + // Disable the nsw and nuw flags here: We can no longer guarantee that + // we won't wrap after simplification. Removing the nsw/nuw flags is + // legal here because the top bit is not demanded. + I->setHasNoSignedWrap(false); + I->setHasNoUnsignedWrap(false); + } + return true; + } + return false; + }; + switch (I->getOpcode()) { default: computeKnownBits(I, Known, Depth, CxtI); @@ -297,13 +320,11 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) { APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask); - Constant *AndC = - ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); + Constant *AndC = ConstantInt::get(VTy, NewMask & AndRHS->getValue()); Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC); InsertNewInstWith(NewAnd, *I); - Constant *XorC = - ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); + Constant *XorC = ConstantInt::get(VTy, NewMask & XorRHS->getValue()); Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC); return InsertNewInstWith(NewXor, *I); } @@ -311,33 +332,6 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; } case Instruction::Select: { - Value *LHS, *RHS; - SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; - if (SPF == SPF_UMAX) { - // UMax(A, C) == A if ... - // The lowest non-zero bit of DemandMask is higher than the highest - // non-zero bit of C. 
- const APInt *C; - unsigned CTZ = DemandedMask.countTrailingZeros(); - if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits()) - return LHS; - } else if (SPF == SPF_UMIN) { - // UMin(A, C) == A if ... - // The lowest non-zero bit of DemandMask is higher than the highest - // non-one bit of C. - // This comes from using DeMorgans on the above umax example. - const APInt *C; - unsigned CTZ = DemandedMask.countTrailingZeros(); - if (match(RHS, m_APInt(C)) && - CTZ >= C->getBitWidth() - C->countLeadingOnes()) - return LHS; - } - - // If this is a select as part of any other min/max pattern, don't simplify - // any further in case we break the structure. - if (SPF != SPF_UNKNOWN) - return nullptr; - if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) return I; @@ -393,12 +387,12 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (match(I->getOperand(0), m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) { // The shift amount must be valid (not poison) in the narrow type, and // it must not be greater than the high bits demanded of the result. - if (C->ult(I->getType()->getScalarSizeInBits()) && + if (C->ult(VTy->getScalarSizeInBits()) && C->ule(DemandedMask.countLeadingZeros())) { // trunc (lshr X, C) --> lshr (trunc X), C IRBuilderBase::InsertPointGuard Guard(Builder); Builder.SetInsertPoint(I); - Value *Trunc = Builder.CreateTrunc(X, I->getType()); + Value *Trunc = Builder.CreateTrunc(X, VTy); return Builder.CreateLShr(Trunc, C->getZExtValue()); } } @@ -420,9 +414,8 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) return nullptr; // vector->int or fp->int? - if (VectorType *DstVTy = dyn_cast(I->getType())) { - if (VectorType *SrcVTy = - dyn_cast(I->getOperand(0)->getType())) { + if (auto *DstVTy = dyn_cast(VTy)) { + if (auto *SrcVTy = dyn_cast(I->getOperand(0)->getType())) { if (cast(DstVTy)->getNumElements() != cast(SrcVTy)->getNumElements()) // Don't touch a bitcast between vectors of different element counts. @@ -507,26 +500,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } LLVM_FALLTHROUGH; case Instruction::Sub: { - /// If the high-bits of an ADD/SUB are not demanded, then we do not care - /// about the high bits of the operands. - unsigned NLZ = DemandedMask.countLeadingZeros(); - // Right fill the mask of bits for this ADD/SUB to demand the most - // significant bit and all those below it. - APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); - if (ShrinkDemandedConstant(I, 0, DemandedFromOps) || - SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) || - ShrinkDemandedConstant(I, 1, DemandedFromOps) || - SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) { - if (NLZ > 0) { - // Disable the nsw and nuw flags here: We can no longer guarantee that - // we won't wrap after simplification. Removing the nsw/nuw flags is - // legal here because the top bit is not demanded. - BinaryOperator &BinOP = *cast(I); - BinOP.setHasNoSignedWrap(false); - BinOP.setHasNoUnsignedWrap(false); - } + APInt DemandedFromOps; + if (simplifyOperandsBasedOnUnusedHighBits(DemandedFromOps)) return I; - } // If we are known to be adding/subtracting zeros to every bit below // the highest demanded bit, we just return the other side. 
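 // Worked sketch (illustrative, not from this change): if DemandedMask for
 // "add i32 %X, 32" is 0x1F, the constant is zero in bit 4 and below, so
 // no carry can reach the demanded bits and %X can be returned directly.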
@@ -544,6 +520,36 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 NSW, LHSKnown, RHSKnown);
 break;
 }
+ case Instruction::Mul: {
+ APInt DemandedFromOps;
+ if (simplifyOperandsBasedOnUnusedHighBits(DemandedFromOps))
+ return I;
+
+ if (DemandedMask.isPowerOf2()) {
+ // The LSB of X*Y is set only if (X & 1) == 1 and (Y & 1) == 1.
+ // If we demand exactly one bit N and we have "X * (C' << N)" where C' is
+ // odd (has LSB set), then the left-shifted low bit of X is the answer.
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ const APInt *C;
+ if (match(I->getOperand(1), m_APInt(C)) &&
+ C->countTrailingZeros() == CTZ) {
+ Constant *ShiftC = ConstantInt::get(VTy, CTZ);
+ Instruction *Shl = BinaryOperator::CreateShl(I->getOperand(0), ShiftC);
+ return InsertNewInstWith(Shl, *I);
+ }
+ }
+ // For a squared value "X * X", the bottom 2 bits are 0 and X[0] because:
+ // X * X is odd iff X is odd.
+ // 'Quadratic Reciprocity': X * X -> 0 for bit[1]
+ if (I->getOperand(0) == I->getOperand(1) && DemandedMask.ult(4)) {
+ Constant *One = ConstantInt::get(VTy, 1);
+ Instruction *And1 = BinaryOperator::CreateAnd(I->getOperand(0), One);
+ return InsertNewInstWith(And1, *I);
+ }
+
+ computeKnownBits(I, Known, Depth, CxtI);
+ break;
+ }
 case Instruction::Shl: {
 const APInt *SA;
 if (match(I->getOperand(1), m_APInt(SA))) {
@@ -554,7 +560,26 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 DemandedMask, Known))
 return R;
+ // TODO: If we only want bits that already match the signbit then we don't
+ // need to shift.
+
+ // If we can pre-shift a right-shifted constant to the left without
+ // losing any high bits and we don't demand the low bits, then eliminate
+ // the left-shift:
+ // (C >> X) << LeftShiftAmtC --> (C << LeftShiftAmtC) >> X
 uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ Value *X;
+ Constant *C;
+ if (DemandedMask.countTrailingZeros() >= ShiftAmt &&
+ match(I->getOperand(0), m_LShr(m_ImmConstant(C), m_Value(X)))) {
+ Constant *LeftShiftAmtC = ConstantInt::get(VTy, ShiftAmt);
+ Constant *NewC = ConstantExpr::getShl(C, LeftShiftAmtC);
+ if (ConstantExpr::getLShr(NewC, LeftShiftAmtC) == C) {
+ Instruction *Lshr = BinaryOperator::CreateLShr(NewC, X);
+ return InsertNewInstWith(Lshr, *I);
+ }
+ }
+
 APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
 // If the shift is NUW/NSW, then it does demand the high bits.
@@ -584,7 +609,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 else if (SignBitOne)
 Known.One.setSignBit();
 if (Known.hasConflict())
- return UndefValue::get(I->getType());
+ return UndefValue::get(VTy);
 }
 } else {
 // This is a variable shift, so we can't shift the demand mask by a known
@@ -607,6 +632,34 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 if (match(I->getOperand(1), m_APInt(SA))) {
 uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ // If we are just demanding the shifted sign bit and below, then this can
+ // be treated as an ASHR in disguise.
+ if (DemandedMask.countLeadingZeros() >= ShiftAmt) {
+ // If we only want bits that already match the signbit then we don't
+ // need to shift.
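+ // Sketch (illustrative): take "lshr i8 %X, 2" where only bits 5..4 are
+ // demanded and %X is known to have at least 4 sign bits. Bits 7..4 of %X
+ // are then all copies of the sign, so the shifted bits equal the
+ // unshifted ones and %X can be returned unchanged.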
+ unsigned NumHiDemandedBits = + BitWidth - DemandedMask.countTrailingZeros(); + unsigned SignBits = + ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI); + if (SignBits >= NumHiDemandedBits) + return I->getOperand(0); + + // If we can pre-shift a left-shifted constant to the right without + // losing any low bits (we already know we don't demand the high bits), + // then eliminate the right-shift: + // (C << X) >> RightShiftAmtC --> (C >> RightShiftAmtC) << X + Value *X; + Constant *C; + if (match(I->getOperand(0), m_Shl(m_ImmConstant(C), m_Value(X)))) { + Constant *RightShiftAmtC = ConstantInt::get(VTy, ShiftAmt); + Constant *NewC = ConstantExpr::getLShr(C, RightShiftAmtC); + if (ConstantExpr::getShl(NewC, RightShiftAmtC) == C) { + Instruction *Shl = BinaryOperator::CreateShl(NewC, X); + return InsertNewInstWith(Shl, *I); + } + } + } + // Unsigned shift right. APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); @@ -628,6 +681,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; } case Instruction::AShr: { + unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI); + + // If we only want bits that already match the signbit then we don't need + // to shift. + unsigned NumHiDemandedBits = BitWidth - DemandedMask.countTrailingZeros(); + if (SignBits >= NumHiDemandedBits) + return I->getOperand(0); + // If this is an arithmetic shift right and only the low-bit is set, we can // always convert this into a logical shr, even if the shift amount is // variable. The low bit of the shift cannot be an input sign bit unless @@ -639,11 +700,6 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return InsertNewInstWith(NewVal, *I); } - // If the sign bit is the only bit demanded by this ashr, then there is no - // need to do it, the shift doesn't change the high bit. - if (DemandedMask.isSignMask()) - return I->getOperand(0); - const APInt *SA; if (match(I->getOperand(1), m_APInt(SA))) { uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1); @@ -663,8 +719,6 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI); - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now plus sign bits. APInt HighBits(APInt::getHighBitsSet( @@ -713,13 +767,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; } case Instruction::SRem: { - ConstantInt *Rem; - if (match(I->getOperand(1), m_ConstantInt(Rem))) { + const APInt *Rem; + if (match(I->getOperand(1), m_APInt(Rem))) { // X % -1 demands all the bits because we don't want to introduce // INT_MIN % -1 (== undef) by accident. 
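 // Worked sketch (illustrative): "srem i32 %X, 8" subtracts a multiple of
 // 8 from %X, so bits 2..0 of the result always equal bits 2..0 of %X;
 // e.g. -13 srem 8 = -5, and both values end in 0b011. If only those bits
 // are demanded, %X can be used directly.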
- if (Rem->isMinusOne()) + if (Rem->isAllOnes()) break; - APInt RA = Rem->getValue().abs(); + APInt RA = Rem->abs(); if (RA.isPowerOf2()) { if (DemandedMask.ult(RA)) // srem won't affect demanded bits return I->getOperand(0); @@ -786,7 +840,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (DemandedMask == 1 && VTy->getScalarSizeInBits() % 2 == 0 && match(II->getArgOperand(0), m_Not(m_Value(X)))) { Function *Ctpop = Intrinsic::getDeclaration( - II->getModule(), Intrinsic::ctpop, II->getType()); + II->getModule(), Intrinsic::ctpop, VTy); return InsertNewInstWith(CallInst::Create(Ctpop, {X}), *I); } break; @@ -809,12 +863,10 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *NewVal; if (NLZ > NTZ) NewVal = BinaryOperator::CreateLShr( - II->getArgOperand(0), - ConstantInt::get(I->getType(), NLZ - NTZ)); + II->getArgOperand(0), ConstantInt::get(VTy, NLZ - NTZ)); else NewVal = BinaryOperator::CreateShl( - II->getArgOperand(0), - ConstantInt::get(I->getType(), NTZ - NLZ)); + II->getArgOperand(0), ConstantInt::get(VTy, NTZ - NLZ)); NewVal->takeName(I); return InsertNewInstWith(NewVal, *I); } @@ -872,7 +924,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Handle target specific intrinsics Optional V = targetSimplifyDemandedUseBitsIntrinsic( *II, DemandedMask, Known, KnownBitsComputed); - if (V.hasValue()) + if (V) return V.getValue(); break; } @@ -1583,7 +1635,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, Optional V = targetSimplifyDemandedVectorEltsIntrinsic( *II, DemandedElts, UndefElts, UndefElts2, UndefElts3, simplifyAndSetOp); - if (V.hasValue()) + if (V) return V.getValue(); break; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 736cf9c825d5..22659a8e4951 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -42,7 +42,6 @@ #include #define DEBUG_TYPE "instcombine" -#include "llvm/Transforms/Utils/InstructionWorklist.h" using namespace llvm; using namespace PatternMatch; @@ -378,7 +377,7 @@ ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) { Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); - if (Value *V = SimplifyExtractElementInst(SrcVec, Index, + if (Value *V = simplifyExtractElementInst(SrcVec, Index, SQ.getWithInstruction(&EI))) return replaceInstUsesWith(EI, V); @@ -879,7 +878,7 @@ Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse( // of an aggregate. If we did, that means the CurrIVI will later be // overwritten with the already-recorded value. But if not, let's record it! Optional &Elt = AggElts[Indices.front()]; - Elt = Elt.getValueOr(InsertedValue); + Elt = Elt.value_or(InsertedValue); // FIXME: should we handle chain-terminating undef base operand? 
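 // Behavior note (a sketch of the renamed API): Optional::getValueOr was
 // renamed to value_or for std::optional parity, so "Elt.value_or(V)"
 // keeps an already-recorded element and only stores V when none was seen.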
} @@ -1489,7 +1488,7 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { Value *ScalarOp = IE.getOperand(1); Value *IdxOp = IE.getOperand(2); - if (auto *V = SimplifyInsertElementInst( + if (auto *V = simplifyInsertElementInst( VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) return replaceInstUsesWith(IE, V); @@ -1919,24 +1918,29 @@ static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) { Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1); Type *Ty = BO->getType(); switch (BO->getOpcode()) { - case Instruction::Shl: { - // shl X, C --> mul X, (1 << C) - Constant *C; - if (match(BO1, m_Constant(C))) { - Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C); - return { Instruction::Mul, BO0, ShlOne }; - } - break; - } - case Instruction::Or: { - // or X, C --> add X, C (when X and C have no common bits set) - const APInt *C; - if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL)) - return { Instruction::Add, BO0, BO1 }; - break; + case Instruction::Shl: { + // shl X, C --> mul X, (1 << C) + Constant *C; + if (match(BO1, m_Constant(C))) { + Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C); + return {Instruction::Mul, BO0, ShlOne}; } - default: - break; + break; + } + case Instruction::Or: { + // or X, C --> add X, C (when X and C have no common bits set) + const APInt *C; + if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL)) + return {Instruction::Add, BO0, BO1}; + break; + } + case Instruction::Sub: + // sub 0, X --> mul X, -1 + if (match(BO0, m_ZeroInt())) + return {Instruction::Mul, BO1, ConstantInt::getAllOnesValue(Ty)}; + break; + default: + break; } return {}; } @@ -2053,15 +2057,20 @@ Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { !match(Shuf.getOperand(1), m_BinOp(B1))) return nullptr; + // If one operand is "0 - X", allow that to be viewed as "X * -1" + // (ConstantsAreOp1) by getAlternateBinop below. If the neg is not paired + // with a multiply, we will exit because C0/C1 will not be set. Value *X, *Y; - Constant *C0, *C1; + Constant *C0 = nullptr, *C1 = nullptr; bool ConstantsAreOp1; - if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) && - match(B1, m_BinOp(m_Value(Y), m_Constant(C1)))) - ConstantsAreOp1 = true; - else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) && - match(B1, m_BinOp(m_Constant(C1), m_Value(Y)))) + if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) && + match(B1, m_BinOp(m_Constant(C1), m_Value(Y)))) ConstantsAreOp1 = false; + else if (match(B0, m_CombineOr(m_BinOp(m_Value(X), m_Constant(C0)), + m_Neg(m_Value(X)))) && + match(B1, m_CombineOr(m_BinOp(m_Value(Y), m_Constant(C1)), + m_Neg(m_Value(Y))))) + ConstantsAreOp1 = true; else return nullptr; @@ -2086,7 +2095,7 @@ Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { } } - if (Opc0 != Opc1) + if (Opc0 != Opc1 || !C0 || !C1) return nullptr; // The opcodes must be the same. Use a new name to make that clear. @@ -2233,6 +2242,88 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, return SelectInst::Create(NarrowCond, NarrowX, NarrowY); } +/// Canonicalize FP negate after shuffle. 
+static Instruction *foldFNegShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction *FNeg0;
+ Value *X;
+ if (!match(Shuf.getOperand(0), m_CombineAnd(m_Instruction(FNeg0),
+ m_FNeg(m_Value(X)))))
+ return nullptr;
+
+ // shuffle (fneg X), Mask --> fneg (shuffle X, Mask)
+ if (FNeg0->hasOneUse() && match(Shuf.getOperand(1), m_Undef())) {
+ Value *NewShuf = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());
+ return UnaryOperator::CreateFNegFMF(NewShuf, FNeg0);
+ }
+
+ Instruction *FNeg1;
+ Value *Y;
+ if (!match(Shuf.getOperand(1), m_CombineAnd(m_Instruction(FNeg1),
+ m_FNeg(m_Value(Y)))))
+ return nullptr;
+
+ // shuffle (fneg X), (fneg Y), Mask --> fneg (shuffle X, Y, Mask)
+ if (FNeg0->hasOneUse() || FNeg1->hasOneUse()) {
+ Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask());
+ Instruction *NewFNeg = UnaryOperator::CreateFNeg(NewShuf);
+ NewFNeg->copyIRFlags(FNeg0);
+ NewFNeg->andIRFlags(FNeg1);
+ return NewFNeg;
+ }
+
+ return nullptr;
+}
+
+/// Canonicalize casts after shuffle.
+static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ // Do we have 2 matching cast operands?
+ auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0));
+ auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
+ if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
+ Cast0->getSrcTy() != Cast1->getSrcTy())
+ return nullptr;
+
+ // TODO: Allow other opcodes? That would require easing the type restrictions
+ // below here.
+ CastInst::CastOps CastOpcode = Cast0->getOpcode();
+ switch (CastOpcode) {
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ break;
+ default:
+ return nullptr;
+ }
+
+ VectorType *ShufTy = Shuf.getType();
+ VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType());
+ VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
+
+ // TODO: Allow length-increasing shuffles?
+ if (ShufTy->getElementCount().getKnownMinValue() >
+ ShufOpTy->getElementCount().getKnownMinValue())
+ return nullptr;
+
+ // TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)?
+ assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) &&
+ "Expected fixed vector operands for casts and binary shuffle");
+ if (CastSrcTy->getPrimitiveSizeInBits() > ShufOpTy->getPrimitiveSizeInBits())
+ return nullptr;
+
+ // At least one of the operands must have only one use (the shuffle).
+ if (!Cast0->hasOneUse() && !Cast1->hasOneUse())
+ return nullptr;
+
+ // shuffle (cast X), (cast Y), Mask --> cast (shuffle X, Y, Mask)
+ Value *X = Cast0->getOperand(0);
+ Value *Y = Cast1->getOperand(0);
+ Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask());
+ return CastInst::Create(CastOpcode, NewShuf, ShufTy);
+}
+
 /// Try to fold an extract subvector operation.
static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
 Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
@@ -2442,7 +2533,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
 Value *LHS = SVI.getOperand(0);
 Value *RHS = SVI.getOperand(1);
 SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
- if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
+ if (auto *V = simplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
 SVI.getType(), ShufQuery))
 return replaceInstUsesWith(SVI, V);
@@ -2497,7 +2588,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
 if (!ScaledMask.empty()) {
 // If the shuffled source vector simplifies, cast that value to this
 // shuffle's type.
- if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
+ if (auto *V = simplifyShuffleVectorInst(X, UndefValue::get(XType),
 ScaledMask, XType, ShufQuery))
 return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
 }
@@ -2528,6 +2619,12 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
 if (Instruction *I = narrowVectorSelect(SVI, Builder))
 return I;
+ if (Instruction *I = foldFNegShuffle(SVI, Builder))
+ return I;
+
+ if (Instruction *I = foldCastShuffle(SVI, Builder))
+ return I;
+
 APInt UndefElts(VWidth, 0);
 APInt AllOnesEltMask(APInt::getAllOnes(VWidth));
 if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3091905ca534..0816a4a575d9 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -42,7 +42,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -60,6 +59,7 @@
 #include "llvm/Analysis/TargetFolder.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/BasicBlock.h"
@@ -90,8 +90,6 @@
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
@@ -140,6 +138,10 @@
 static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true));
+static cl::opt<unsigned> MaxSinkNumUsers(
+ "instcombine-max-sink-users", cl::init(32),
+ cl::desc("Maximum number of undroppable users for instruction sinking"));
+
 static cl::opt<unsigned> LimitMaxIterations(
 "instcombine-max-iterations",
 cl::desc("Limit the maximum number of instruction combining iterations"),
@@ -424,7 +426,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
 Value *C = I.getOperand(1);
 // Does "B op C" simplify?
- if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
+ if (Value *V = simplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
 // It simplifies to V. Form "A op V".
replaceOperand(I, 0, A); replaceOperand(I, 1, V); @@ -457,7 +459,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = Op1->getOperand(1); // Does "A op B" simplify? - if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) { + if (Value *V = simplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) { // It simplifies to V. Form "V op C". replaceOperand(I, 0, V); replaceOperand(I, 1, C); @@ -485,7 +487,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = I.getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { + if (Value *V = simplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { // It simplifies to V. Form "V op B". replaceOperand(I, 0, V); replaceOperand(I, 1, B); @@ -505,7 +507,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = Op1->getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { + if (Value *V = simplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { // It simplifies to V. Form "B op V". replaceOperand(I, 0, B); replaceOperand(I, 1, V); @@ -652,7 +654,7 @@ Value *InstCombinerImpl::tryFactorization(BinaryOperator &I, std::swap(C, D); // Consider forming "A op' (B op D)". // If "B op D" simplifies then it can be formed with no cost. - V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I)); + V = simplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I)); // If "B op D" doesn't simplify then only go on if both of the existing // operations "A op' B" and "C op' D" will be zapped as no longer used. if (!V && LHS->hasOneUse() && RHS->hasOneUse()) @@ -671,7 +673,7 @@ Value *InstCombinerImpl::tryFactorization(BinaryOperator &I, std::swap(C, D); // Consider forming "(A op C) op' B". // If "A op C" simplifies then it can be formed with no cost. - V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + V = simplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); // If "A op C" doesn't simplify then only go on if both of the existing // operations "A op' B" and "C op' D" will be zapped as no longer used. @@ -780,8 +782,8 @@ Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) { // Disable the use of undef because it's not safe to distribute undef. auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef(); - Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive); - Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQDistributive); + Value *L = simplifyBinOp(TopLevelOpcode, A, C, SQDistributive); + Value *R = simplifyBinOp(TopLevelOpcode, B, C, SQDistributive); // Do "A op C" and "B op C" both simplify? if (L && R) { @@ -819,8 +821,8 @@ Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) { // Disable the use of undef because it's not safe to distribute undef. auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef(); - Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQDistributive); - Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive); + Value *L = simplifyBinOp(TopLevelOpcode, A, B, SQDistributive); + Value *R = simplifyBinOp(TopLevelOpcode, A, C, SQDistributive); // Do "A op B" and "A op C" both simplify? if (L && R) { @@ -876,8 +878,8 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, if (LHSIsSelect && RHSIsSelect && A == D) { // (A ? B : C) op (A ? E : F) -> A ? 
(B op E) : (C op F) Cond = A; - True = SimplifyBinOp(Opcode, B, E, FMF, Q); - False = SimplifyBinOp(Opcode, C, F, FMF, Q); + True = simplifyBinOp(Opcode, B, E, FMF, Q); + False = simplifyBinOp(Opcode, C, F, FMF, Q); if (LHS->hasOneUse() && RHS->hasOneUse()) { if (False && !True) @@ -888,13 +890,13 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, } else if (LHSIsSelect && LHS->hasOneUse()) { // (A ? B : C) op Y -> A ? (B op Y) : (C op Y) Cond = A; - True = SimplifyBinOp(Opcode, B, RHS, FMF, Q); - False = SimplifyBinOp(Opcode, C, RHS, FMF, Q); + True = simplifyBinOp(Opcode, B, RHS, FMF, Q); + False = simplifyBinOp(Opcode, C, RHS, FMF, Q); } else if (RHSIsSelect && RHS->hasOneUse()) { // X op (D ? E : F) -> D ? (X op E) : (X op F) Cond = D; - True = SimplifyBinOp(Opcode, LHS, E, FMF, Q); - False = SimplifyBinOp(Opcode, LHS, F, FMF, Q); + True = simplifyBinOp(Opcode, LHS, E, FMF, Q); + False = simplifyBinOp(Opcode, LHS, F, FMF, Q); } if (!True || !False) @@ -986,8 +988,8 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { // bo (sext i1 X), C --> select X, (bo -1, C), (bo 0, C) Constant *Ones = ConstantInt::getAllOnesValue(BO.getType()); Constant *Zero = ConstantInt::getNullValue(BO.getType()); - Constant *TVal = ConstantExpr::get(BO.getOpcode(), Ones, C); - Constant *FVal = ConstantExpr::get(BO.getOpcode(), Zero, C); + Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C); + Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C); return SelectInst::Create(X, TVal, FVal); } @@ -1018,12 +1020,6 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, bool ConstIsRHS = isa(I.getOperand(1)); Constant *ConstOperand = cast(I.getOperand(ConstIsRHS)); - if (auto *SOC = dyn_cast(SO)) { - if (ConstIsRHS) - return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand); - return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC); - } - Value *Op0 = SO, *Op1 = ConstOperand; if (!ConstIsRHS) std::swap(Op0, Op1); @@ -1035,10 +1031,10 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, return NewBO; } -Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, - SelectInst *SI) { - // Don't modify shared select instructions. - if (!SI->hasOneUse()) +Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, + bool FoldWithMultiUse) { + // Don't modify shared select instructions unless set FoldWithMultiUse + if (!SI->hasOneUse() && !FoldWithMultiUse) return nullptr; Value *TV = SI->getTrueValue(); @@ -1114,12 +1110,6 @@ static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV, bool ConstIsRHS = isa(I->getOperand(1)); Constant *C = cast(I->getOperand(ConstIsRHS)); - if (auto *InC = dyn_cast(InV)) { - if (ConstIsRHS) - return ConstantExpr::get(I->getOpcode(), InC, C); - return ConstantExpr::get(I->getOpcode(), C, InC); - } - Value *Op0 = InV, *Op1 = C; if (!ConstIsRHS) std::swap(Op0, Op1); @@ -1175,10 +1165,11 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { if (cast(InVal)->getParent() == NonConstBB) return nullptr; - // If the incoming non-constant value is in I's block, we will remove one - // instruction, but insert another equivalent one, leading to infinite - // instcombine. - if (isPotentiallyReachable(I.getParent(), NonConstBB, nullptr, &DT, LI)) + // If the incoming non-constant value is reachable from the phis block, + // we'll push the operation across a loop backedge. 
This could result in + // an infinite combine loop, and is generally non-profitable (especially + // if the operation was originally outside the loop). + if (isPotentiallyReachable(PN->getParent(), NonConstBB, nullptr, &DT, LI)) return nullptr; } @@ -1941,10 +1932,8 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP, SmallVector IndexC(GEP.indices()); bool IsInBounds = GEP.isInBounds(); Type *Ty = GEP.getSourceElementType(); - Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(Ty, TrueC, IndexC) - : Builder.CreateGEP(Ty, TrueC, IndexC); - Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(Ty, FalseC, IndexC) - : Builder.CreateGEP(Ty, FalseC, IndexC); + Value *NewTrueC = Builder.CreateGEP(Ty, TrueC, IndexC, "", IsInBounds); + Value *NewFalseC = Builder.CreateGEP(Ty, FalseC, IndexC, "", IsInBounds); return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } @@ -1953,13 +1942,11 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, // Combine Indices - If the source pointer to this getelementptr instruction // is a getelementptr instruction with matching element type, combine the // indices of the two getelementptr instructions into a single instruction. - if (Src->getResultElementType() != GEP.getSourceElementType()) - return nullptr; - if (!shouldMergeGEPs(*cast(&GEP), *Src)) return nullptr; - if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && + if (Src->getResultElementType() == GEP.getSourceElementType() && + Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && Src->hasOneUse()) { Value *GO1 = GEP.getOperand(1); Value *SO1 = Src->getOperand(1); @@ -1971,45 +1958,21 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, // invariant: this breaks the dependence between GEPs and allows LICM // to hoist the invariant part out of the loop. if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { - // We have to be careful here. - // We have something like: - // %src = getelementptr , * %base, %idx - // %gep = getelementptr , * %src, %idx2 - // If we just swap idx & idx2 then we could inadvertantly - // change %src from a vector to a scalar, or vice versa. - // Cases: - // 1) %base a scalar & idx a scalar & idx2 a vector - // => Swapping idx & idx2 turns %src into a vector type. - // 2) %base a scalar & idx a vector & idx2 a scalar - // => Swapping idx & idx2 turns %src in a scalar type - // 3) %base, %idx, and %idx2 are scalars - // => %src & %gep are scalars - // => swapping idx & idx2 is safe - // 4) %base a vector - // => %src is a vector - // => swapping idx & idx2 is safe. - auto *SO0 = Src->getOperand(0); - auto *SO0Ty = SO0->getType(); - if (!isa(GEP.getType()) || // case 3 - isa(SO0Ty)) { // case 4 - Src->setOperand(1, GO1); - GEP.setOperand(1, SO1); - return &GEP; - } else { - // Case 1 or 2 - // -- have to recreate %src & %gep - // put NewSrc at same location as %src - Builder.SetInsertPoint(cast(Src)); - Value *NewSrc = Builder.CreateGEP( - GEP.getSourceElementType(), SO0, GO1, Src->getName()); - // Propagate 'inbounds' if the new source was not constant-folded. - if (auto *NewSrcGEPI = dyn_cast(NewSrc)) - NewSrcGEPI->setIsInBounds(Src->isInBounds()); - GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - GEP.getSourceElementType(), NewSrc, {SO1}); - NewGEP->setIsInBounds(GEP.isInBounds()); - return NewGEP; - } + // The swapped GEPs are inbounds if both original GEPs are inbounds + // and the sign of the offsets is the same. 
For simplicity, only
+        // handle both offsets being non-negative.
+        bool IsInBounds = Src->isInBounds() && GEP.isInBounds() &&
+                          isKnownNonNegative(SO1, DL, 0, &AC, &GEP, &DT) &&
+                          isKnownNonNegative(GO1, DL, 0, &AC, &GEP, &DT);
+        // Put NewSrc at same location as %src.
+        Builder.SetInsertPoint(cast<Instruction>(Src));
+        Value *NewSrc = Builder.CreateGEP(GEP.getSourceElementType(),
+                                          Src->getPointerOperand(), GO1,
+                                          Src->getName(), IsInBounds);
+        GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+            GEP.getSourceElementType(), NewSrc, {SO1});
+        NewGEP->setIsInBounds(IsInBounds);
+        return NewGEP;
       }
     }
   }
@@ -2022,6 +1985,87 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
   if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
     return nullptr; // Wait until our source is folded to completion.
 
+  // For constant GEPs, use a more general offset-based folding approach.
+  // Only do this for opaque pointers, as the result element type may change.
+  Type *PtrTy = Src->getType()->getScalarType();
+  if (PtrTy->isOpaquePointerTy() && GEP.hasAllConstantIndices() &&
+      (Src->hasOneUse() || Src->hasAllConstantIndices())) {
+    // Split Src into a variable part and a constant suffix.
+    gep_type_iterator GTI = gep_type_begin(*Src);
+    Type *BaseType = GTI.getIndexedType();
+    bool IsFirstType = true;
+    unsigned NumVarIndices = 0;
+    for (auto Pair : enumerate(Src->indices())) {
+      if (!isa<ConstantInt>(Pair.value())) {
+        BaseType = GTI.getIndexedType();
+        IsFirstType = false;
+        NumVarIndices = Pair.index() + 1;
+      }
+      ++GTI;
+    }
+
+    // Determine the offset for the constant suffix of Src.
+    APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0);
+    if (NumVarIndices != Src->getNumIndices()) {
+      // FIXME: getIndexedOffsetInType() does not handle scalable vectors.
+      if (isa<ScalableVectorType>(BaseType))
+        return nullptr;
+
+      SmallVector<Value *> ConstantIndices;
+      if (!IsFirstType)
+        ConstantIndices.push_back(
+            Constant::getNullValue(Type::getInt32Ty(GEP.getContext())));
+      append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices));
+      Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices);
+    }
+
+    // Add the offset for GEP (which is fully constant).
+    if (!GEP.accumulateConstantOffset(DL, Offset))
+      return nullptr;
+
+    APInt OffsetOld = Offset;
+    // Convert the total offset back into indices.
+    SmallVector<APInt> ConstIndices =
+        DL.getGEPIndicesForOffset(BaseType, Offset);
+    if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero())) {
+      // If both GEPs are constant-indexed, and cannot be merged in either way,
+      // convert them to a GEP of i8.
+      if (Src->hasAllConstantIndices())
+        return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
+                   ? GetElementPtrInst::CreateInBounds(
+                         Builder.getInt8Ty(), Src->getOperand(0),
+                         Builder.getInt(OffsetOld), GEP.getName())
+                   : GetElementPtrInst::Create(
+                         Builder.getInt8Ty(), Src->getOperand(0),
+                         Builder.getInt(OffsetOld), GEP.getName());
+      return nullptr;
+    }
+
+    bool IsInBounds = isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP));
+    SmallVector<Value *> Indices;
+    append_range(Indices, drop_end(Src->indices(),
+                                   Src->getNumIndices() - NumVarIndices));
+    for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) {
+      Indices.push_back(ConstantInt::get(GEP.getContext(), Idx));
+      // Even if the total offset is inbounds, we may end up representing it
+      // by first performing a larger negative offset, and then a smaller
+      // positive one. The large negative offset might go out of bounds. Only
+      // preserve inbounds if all signs are the same.
+ IsInBounds &= Idx.isNonNegative() == ConstIndices[0].isNonNegative(); + } + + return IsInBounds + ? GetElementPtrInst::CreateInBounds(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); + } + + if (Src->getResultElementType() != GEP.getSourceElementType()) + return nullptr; + SmallVector Indices; // Find out whether the last index in the source GEP is a sequential idx. @@ -2045,7 +2089,7 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, return nullptr; Value *Sum = - SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); // Only do the combine when we are sure the cost after the // merge is never more than that before the merge. if (Sum == nullptr) @@ -2116,9 +2160,8 @@ Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI, // existing GEP Value. Causing issues if this Value is accessed when // constructing an AddrSpaceCastInst SmallVector Indices(GEP.indices()); - Value *NGEP = GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, Indices) - : Builder.CreateGEP(SrcEltType, SrcOp, Indices); + Value *NGEP = + Builder.CreateGEP(SrcEltType, SrcOp, Indices, "", GEP.isInBounds()); NGEP->takeName(&GEP); // Preserve GEP address space to satisfy users @@ -2169,12 +2212,10 @@ Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI, // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. If so, we can pull the cast through the // GEP. - SmallVector NewIndices; + SmallVector NewIndices; if (findElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices, DL)) { - Value *NGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) - : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); + Value *NGEP = Builder.CreateGEP(SrcEltType, SrcOp, NewIndices, "", + GEP.isInBounds()); if (NGEP->getType() == GEP.getType()) return replaceInstUsesWith(GEP, NGEP); @@ -2195,7 +2236,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Type *GEPType = GEP.getType(); Type *GEPEltType = GEP.getSourceElementType(); bool IsGEPSrcEleScalable = isa(GEPEltType); - if (Value *V = SimplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(), + if (Value *V = simplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(), SQ.getWithInstruction(&GEP))) return replaceInstUsesWith(GEP, V); @@ -2280,7 +2321,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { auto *Op2 = dyn_cast(*I); - if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands()) + if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands() || + Op1->getSourceElementType() != Op2->getSourceElementType()) return nullptr; // As for Op1 above, don't try to fold a GEP into itself. @@ -2476,11 +2518,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // addrspacecast i8 addrspace(1)* %0 to i8* SmallVector Idx(GEP.indices()); Value *NewGEP = - GEP.isInBounds() - ? 
Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, - Idx, GEP.getName()) - : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, - GEP.getName()); + Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, + GEP.getName(), GEP.isInBounds()); return new AddrSpaceCastInst(NewGEP, GEPType); } } @@ -2495,13 +2534,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) == DL.getTypeAllocSize(GEPEltType)) { Type *IdxType = DL.getIndexType(GEPType); - Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; - Value *NewGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx, - GEP.getName()) - : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, - GEP.getName()); + Value *Idx[2] = {Constant::getNullValue(IdxType), GEP.getOperand(1)}; + Value *NewGEP = Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, + GEP.getName(), GEP.isInBounds()); // V and GEP are both pointer types --> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType); @@ -2533,11 +2568,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". Value *NewGEP = - GEP.isInBounds() && NSW - ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, - NewIdx, GEP.getName()) - : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx, - GEP.getName()); + Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx, + GEP.getName(), GEP.isInBounds() && NSW); // The NewGEP must be pointer typed, so must the old one -> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, @@ -2578,11 +2610,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx}; Value *NewGEP = - GEP.isInBounds() && NSW - ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, - Off, GEP.getName()) - : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off, - GEP.getName()); + Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off, + GEP.getName(), GEP.isInBounds() && NSW); // The NewGEP must be pointer typed, so must the old one -> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType); @@ -2672,6 +2701,7 @@ static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl &Users, const TargetLibraryInfo &TLI) { SmallVector Worklist; + const Optional Family = getAllocationFamily(AI, &TLI); Worklist.push_back(AI); do { @@ -2740,12 +2770,15 @@ static bool isAllocSiteRemovable(Instruction *AI, continue; } - if (isFreeCall(I, &TLI)) { + if (isFreeCall(I, &TLI) && getAllocationFamily(I, &TLI) == Family) { + assert(Family); Users.emplace_back(I); continue; } - if (isReallocLikeFn(I, &TLI)) { + if (isReallocLikeFn(I, &TLI) && + getAllocationFamily(I, &TLI) == Family) { + assert(Family); Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2803,7 +2836,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { if (IntrinsicInst *II = dyn_cast(I)) { if (II->getIntrinsicID() == Intrinsic::objectsize) { Value *Result = - lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true); + lowerObjectSizeCall(II, DL, &TLI, AA, /*MustSucceed=*/true); replaceInstUsesWith(*I, Result); eraseInstFromFunction(*I); Users[i] = nullptr; // Skip examining in the next loop. 
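[Editor's note: illustration only, not part of the vendored patch. The isAllocSiteRemovable() hunk above records the allocation family of the site via getAllocationFamily() and only accepts a free/realloc user from the same family. A source-level sketch of the case that check guards against; matched and mismatched are hypothetical names.]

    #include <cstdlib>

    // Same family: malloc paired with free. The whole site may be removed.
    void matched() {
      void *P = std::malloc(16);
      std::free(P);
    }

    // Mixed families: malloc paired with operator delete. This is undefined
    // behavior in the source program, so the optimizer must not treat the
    // pair as a removable allocation site.
    void mismatched() {
      int *P = static_cast<int *>(std::malloc(sizeof(int)));
      delete P;
    }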
@@ -3192,7 +3225,7 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
   if (!EV.hasIndices())
     return replaceInstUsesWith(EV, Agg);
 
-  if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
+  if (Value *V = simplifyExtractValueInst(Agg, EV.getIndices(),
                                           SQ.getWithInstruction(&EV)))
     return replaceInstUsesWith(EV, V);
 
@@ -3248,6 +3281,15 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
                                      makeArrayRef(exti, exte));
   }
   if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
+    // extractvalue (any_mul_with_overflow X, -1), 0 --> -X
+    Intrinsic::ID OvID = WO->getIntrinsicID();
+    if (*EV.idx_begin() == 0 &&
+        (OvID == Intrinsic::smul_with_overflow ||
+         OvID == Intrinsic::umul_with_overflow) &&
+        match(WO->getArgOperand(1), m_AllOnes())) {
+      return BinaryOperator::CreateNeg(WO->getArgOperand(0));
+    }
+
     // We're extracting from an overflow intrinsic, see if we're the only user,
     // which allows us to simplify multiple result intrinsics to simpler
     // things that just get one value.
@@ -3723,21 +3765,116 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) {
   if (!MaybePoisonOperand)
     return OrigOp;
 
-  auto *FrozenMaybePoisonOperand = new FreezeInst(
+  Builder.SetInsertPoint(OrigOpInst);
+  auto *FrozenMaybePoisonOperand = Builder.CreateFreeze(
       MaybePoisonOperand->get(), MaybePoisonOperand->get()->getName() + ".fr");
   replaceUse(*MaybePoisonOperand, FrozenMaybePoisonOperand);
-  FrozenMaybePoisonOperand->insertBefore(OrigOpInst);
   return OrigOp;
 }
 
-bool InstCombinerImpl::freezeDominatedUses(FreezeInst &FI) {
+Instruction *InstCombinerImpl::foldFreezeIntoRecurrence(FreezeInst &FI,
+                                                        PHINode *PN) {
+  // Detect whether this is a recurrence with a start value and some number of
+  // backedge values. We'll check whether we can push the freeze through the
+  // backedge values (possibly dropping poison flags along the way) until we
+  // reach the phi again. In that case, we can move the freeze to the start
+  // value.
+  Use *StartU = nullptr;
+  SmallVector<Value *> Worklist;
+  for (Use &U : PN->incoming_values()) {
+    if (DT.dominates(PN->getParent(), PN->getIncomingBlock(U))) {
+      // Add backedge value to worklist.
+      Worklist.push_back(U.get());
+      continue;
+    }
+
+    // Don't bother handling multiple start values.
+    if (StartU)
+      return nullptr;
+    StartU = &U;
+  }
+
+  if (!StartU || Worklist.empty())
+    return nullptr; // Not a recurrence.
+
+  Value *StartV = StartU->get();
+  BasicBlock *StartBB = PN->getIncomingBlock(*StartU);
+  bool StartNeedsFreeze = !isGuaranteedNotToBeUndefOrPoison(StartV);
+  // We can't insert freeze if the start value is the result of the
+  // terminator (e.g. an invoke).
+  if (StartNeedsFreeze && StartBB->getTerminator() == StartV)
+    return nullptr;
+
+  SmallPtrSet<Value *, 32> Visited;
+  SmallVector<Instruction *> DropFlags;
+  while (!Worklist.empty()) {
+    Value *V = Worklist.pop_back_val();
+    if (!Visited.insert(V).second)
+      continue;
+
+    if (Visited.size() > 32)
+      return nullptr; // Limit the total number of values we inspect.
+
+    // Assume that PN is non-poison, because it will be after the transform.
+ if (V == PN || isGuaranteedNotToBeUndefOrPoison(V)) + continue; + + Instruction *I = dyn_cast(V); + if (!I || canCreateUndefOrPoison(cast(I), + /*ConsiderFlags*/ false)) + return nullptr; + + DropFlags.push_back(I); + append_range(Worklist, I->operands()); + } + + for (Instruction *I : DropFlags) + I->dropPoisonGeneratingFlags(); + + if (StartNeedsFreeze) { + Builder.SetInsertPoint(StartBB->getTerminator()); + Value *FrozenStartV = Builder.CreateFreeze(StartV, + StartV->getName() + ".fr"); + replaceUse(*StartU, FrozenStartV); + } + return replaceInstUsesWith(FI, PN); +} + +bool InstCombinerImpl::freezeOtherUses(FreezeInst &FI) { Value *Op = FI.getOperand(0); - if (isa(Op)) + if (isa(Op) || Op->hasOneUse()) return false; + // Move the freeze directly after the definition of its operand, so that + // it dominates the maximum number of uses. Note that it may not dominate + // *all* uses if the operand is an invoke/callbr and the use is in a phi on + // the normal/default destination. This is why the domination check in the + // replacement below is still necessary. + Instruction *MoveBefore = nullptr; + if (isa(Op)) { + MoveBefore = &FI.getFunction()->getEntryBlock().front(); + while (isa(MoveBefore)) + MoveBefore = MoveBefore->getNextNode(); + } else if (auto *PN = dyn_cast(Op)) { + MoveBefore = PN->getParent()->getFirstNonPHI(); + } else if (auto *II = dyn_cast(Op)) { + MoveBefore = II->getNormalDest()->getFirstNonPHI(); + } else if (auto *CB = dyn_cast(Op)) { + MoveBefore = CB->getDefaultDest()->getFirstNonPHI(); + } else { + auto *I = cast(Op); + assert(!I->isTerminator() && "Cannot be a terminator"); + MoveBefore = I->getNextNode(); + } + bool Changed = false; + if (&FI != MoveBefore) { + FI.moveBefore(MoveBefore); + Changed = true; + } + Op->replaceUsesWithIf(&FI, [&](Use &U) -> bool { bool Dominates = DT.dominates(&FI, U); Changed |= Dominates; @@ -3750,48 +3887,63 @@ bool InstCombinerImpl::freezeDominatedUses(FreezeInst &FI) { Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { Value *Op0 = I.getOperand(0); - if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) + if (Value *V = simplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); // freeze (phi const, x) --> phi const, (freeze x) if (auto *PN = dyn_cast(Op0)) { if (Instruction *NV = foldOpIntoPhi(I, PN)) return NV; + if (Instruction *NV = foldFreezeIntoRecurrence(I, PN)) + return NV; } if (Value *NI = pushFreezeToPreventPoisonFromPropagating(I)) return replaceInstUsesWith(I, NI); - if (match(Op0, m_Undef())) { - // If I is freeze(undef), see its uses and fold it to the best constant. - // - or: pick -1 - // - select's condition: pick the value that leads to choosing a constant - // - other ops: pick 0 + // If I is freeze(undef), check its uses and fold it to a fixed constant. + // - or: pick -1 + // - select's condition: if the true value is constant, choose it by making + // the condition true. + // - default: pick 0 + // + // Note that this transform is intentionally done here rather than + // via an analysis in InstSimplify or at individual user sites. That is + // because we must produce the same value for all uses of the freeze - + // it's the reason "freeze" exists! + // + // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid + // duplicating logic for binops at least. 
+ auto getUndefReplacement = [&I](Type *Ty) { Constant *BestValue = nullptr; - Constant *NullValue = Constant::getNullValue(I.getType()); + Constant *NullValue = Constant::getNullValue(Ty); for (const auto *U : I.users()) { Constant *C = NullValue; - if (match(U, m_Or(m_Value(), m_Value()))) - C = Constant::getAllOnesValue(I.getType()); - else if (const auto *SI = dyn_cast(U)) { - if (SI->getCondition() == &I) { - APInt CondVal(1, isa(SI->getFalseValue()) ? 0 : 1); - C = Constant::getIntegerValue(I.getType(), CondVal); - } - } + C = ConstantInt::getAllOnesValue(Ty); + else if (match(U, m_Select(m_Specific(&I), m_Constant(), m_Value()))) + C = ConstantInt::getTrue(Ty); if (!BestValue) BestValue = C; else if (BestValue != C) BestValue = NullValue; } + assert(BestValue && "Must have at least one use"); + return BestValue; + }; - return replaceInstUsesWith(I, BestValue); + if (match(Op0, m_Undef())) + return replaceInstUsesWith(I, getUndefReplacement(I.getType())); + + Constant *C; + if (match(Op0, m_Constant(C)) && C->containsUndefOrPoisonElement()) { + Constant *ReplaceC = getUndefReplacement(I.getType()->getScalarType()); + return replaceInstUsesWith(I, Constant::replaceUndefsWith(C, ReplaceC)); } - // Replace all dominated uses of Op to freeze(Op). - if (freezeDominatedUses(I)) + // Replace uses of Op with freeze(Op). + if (freezeOtherUses(I)) return &I; return nullptr; @@ -3847,7 +3999,6 @@ static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) { /// block. static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock, TargetLibraryInfo &TLI) { - assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!"); BasicBlock *SrcBlock = I->getParent(); // Cannot move control-flow-involving, volatile loads, vaarg, etc. @@ -4014,48 +4165,68 @@ bool InstCombinerImpl::run() { [this](Instruction *I) -> Optional { if (!EnableCodeSinking) return None; - auto *UserInst = cast_or_null(I->getUniqueUndroppableUser()); - if (!UserInst) - return None; BasicBlock *BB = I->getParent(); BasicBlock *UserParent = nullptr; + unsigned NumUsers = 0; - // Special handling for Phi nodes - get the block the use occurs in. - if (PHINode *PN = dyn_cast(UserInst)) { - for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { - if (PN->getIncomingValue(i) == I) { - // Bail out if we have uses in different blocks. We don't do any - // sophisticated analysis (i.e finding NearestCommonDominator of these - // use blocks). - if (UserParent && UserParent != PN->getIncomingBlock(i)) - return None; - UserParent = PN->getIncomingBlock(i); + for (auto *U : I->users()) { + if (U->isDroppable()) + continue; + if (NumUsers > MaxSinkNumUsers) + return None; + + Instruction *UserInst = cast(U); + // Special handling for Phi nodes - get the block the use occurs in. + if (PHINode *PN = dyn_cast(UserInst)) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { + if (PN->getIncomingValue(i) == I) { + // Bail out if we have uses in different blocks. We don't do any + // sophisticated analysis (i.e finding NearestCommonDominator of + // these use blocks). + if (UserParent && UserParent != PN->getIncomingBlock(i)) + return None; + UserParent = PN->getIncomingBlock(i); + } } + assert(UserParent && "expected to find user block!"); + } else { + if (UserParent && UserParent != UserInst->getParent()) + return None; + UserParent = UserInst->getParent(); } - assert(UserParent && "expected to find user block!"); - } else - UserParent = UserInst->getParent(); - // Try sinking to another block. 
If that block is unreachable, then do - // not bother. SimplifyCFG should handle it. - if (UserParent == BB || !DT.isReachableFromEntry(UserParent)) - return None; + // Make sure these checks are done only once, naturally we do the checks + // the first time we get the userparent, this will save compile time. + if (NumUsers == 0) { + // Try sinking to another block. If that block is unreachable, then do + // not bother. SimplifyCFG should handle it. + if (UserParent == BB || !DT.isReachableFromEntry(UserParent)) + return None; + + auto *Term = UserParent->getTerminator(); + // See if the user is one of our successors that has only one + // predecessor, so that we don't have to split the critical edge. + // Another option where we can sink is a block that ends with a + // terminator that does not pass control to other block (such as + // return or unreachable or resume). In this case: + // - I dominates the User (by SSA form); + // - the User will be executed at most once. + // So sinking I down to User is always profitable or neutral. + if (UserParent->getUniquePredecessor() != BB && !succ_empty(Term)) + return None; + + assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); + } - auto *Term = UserParent->getTerminator(); - // See if the user is one of our successors that has only one - // predecessor, so that we don't have to split the critical edge. - // Another option where we can sink is a block that ends with a - // terminator that does not pass control to other block (such as - // return or unreachable or resume). In this case: - // - I dominates the User (by SSA form); - // - the User will be executed at most once. - // So sinking I down to User is always profitable or neutral. - if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) { - assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); - return UserParent; + NumUsers++; } - return None; + + // No user or only has droppable users. 
+ if (!UserParent) + return None; + + return UserParent; }; auto OptBB = getOptionalSinkBlockForInst(I); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8f94172a6402..7a5a74aa4fff 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -42,14 +43,12 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" @@ -63,15 +62,12 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" @@ -87,7 +83,6 @@ #include #include #include -#include #include #include #include @@ -116,7 +111,7 @@ static const uint64_t kFreeBSDKasan_ShadowOffset64 = 0xdffff7c000000000; static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46; static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000; -static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40; +static const uint64_t kPS_ShadowOffset64 = 1ULL << 40; static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; static const uint64_t kEmscriptenShadowOffset = 0; @@ -335,6 +330,11 @@ static cl::opt ClMemoryAccessCallbackPrefix( cl::desc("Prefix for memory access callbacks"), cl::Hidden, cl::init("__asan_")); +static cl::opt ClKasanMemIntrinCallbackPrefix( + "asan-kernel-mem-intrinsic-prefix", + cl::desc("Use prefix for memory intrinsics in KASAN mode"), cl::Hidden, + cl::init(false)); + static cl::opt ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas", cl::desc("instrument dynamic allocas"), @@ -465,11 +465,12 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, bool IsKasan) { bool IsAndroid = TargetTriple.isAndroid(); - bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS(); + bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS() || + TargetTriple.isDriverKit(); bool IsMacOS = TargetTriple.isMacOSX(); bool IsFreeBSD = TargetTriple.isOSFreeBSD(); bool IsNetBSD = TargetTriple.isOSNetBSD(); - bool IsPS4CPU = TargetTriple.isPS4CPU(); + bool IsPS = TargetTriple.isPS(); bool IsLinux = TargetTriple.isOSLinux(); bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 || TargetTriple.getArch() == Triple::ppc64le; @@ -528,8 +529,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, 
Mapping.Offset = kNetBSDKasan_ShadowOffset64; else Mapping.Offset = kNetBSD_ShadowOffset64; - } else if (IsPS4CPU) - Mapping.Offset = kPS4CPU_ShadowOffset64; + } else if (IsPS) + Mapping.Offset = kPS_ShadowOffset64; else if (IsLinux && IsX86_64) { if (IsKasan) Mapping.Offset = kLinuxKasan_ShadowOffset64; @@ -568,7 +569,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, // offset is not necessary 1/8-th of the address space. On SystemZ, // we could OR the constant in a single instruction, but it's more // efficient to load it once and use indexed addressing. - Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU && + Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS && !IsRISCV64 && !(Mapping.Offset & (Mapping.Offset - 1)) && Mapping.Offset != kDynamicShadowSentinel; @@ -621,41 +622,9 @@ static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { namespace { -/// Module analysis for getting various metadata about the module. -class ASanGlobalsMetadataWrapperPass : public ModulePass { -public: - static char ID; - - ASanGlobalsMetadataWrapperPass() : ModulePass(ID) { - initializeASanGlobalsMetadataWrapperPassPass( - *PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - GlobalsMD = GlobalsMetadata(M); - return false; - } - - StringRef getPassName() const override { - return "ASanGlobalsMetadataWrapperPass"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } - - GlobalsMetadata &getGlobalsMD() { return GlobalsMD; } - -private: - GlobalsMetadata GlobalsMD; -}; - -char ASanGlobalsMetadataWrapperPass::ID = 0; - /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer { - AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, - const StackSafetyGlobalInfo *SSGI, + AddressSanitizer(Module &M, const StackSafetyGlobalInfo *SSGI, bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false, AsanDetectStackUseAfterReturnMode UseAfterReturn = @@ -666,7 +635,7 @@ struct AddressSanitizer { UseAfterScope(UseAfterScope || ClUseAfterScope), UseAfterReturn(ClUseAfterReturn.getNumOccurrences() ? 
ClUseAfterReturn : UseAfterReturn), - GlobalsMD(*GlobalsMD), SSGI(SSGI) { + SSGI(SSGI) { C = &(M.getContext()); LongSize = M.getDataLayout().getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -779,7 +748,6 @@ private: FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset; Value *LocalDynamicShadow = nullptr; - const GlobalsMetadata &GlobalsMD; const StackSafetyGlobalInfo *SSGI; DenseMap ProcessedAllocas; @@ -787,60 +755,13 @@ private: FunctionCallee AMDGPUAddressPrivate; }; -class AddressSanitizerLegacyPass : public FunctionPass { -public: - static char ID; - - explicit AddressSanitizerLegacyPass( - bool CompileKernel = false, bool Recover = false, - bool UseAfterScope = false, - AsanDetectStackUseAfterReturnMode UseAfterReturn = - AsanDetectStackUseAfterReturnMode::Runtime) - : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover), - UseAfterScope(UseAfterScope), UseAfterReturn(UseAfterReturn) { - initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { - return "AddressSanitizerFunctionPass"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - if (ClUseStackSafety) - AU.addRequired(); - AU.addRequired(); - } - - bool runOnFunction(Function &F) override { - GlobalsMetadata &GlobalsMD = - getAnalysis().getGlobalsMD(); - const StackSafetyGlobalInfo *const SSGI = - ClUseStackSafety - ? &getAnalysis().getResult() - : nullptr; - const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(F); - AddressSanitizer ASan(*F.getParent(), &GlobalsMD, SSGI, CompileKernel, - Recover, UseAfterScope, UseAfterReturn); - return ASan.instrumentFunction(F, TLI); - } - -private: - bool CompileKernel; - bool Recover; - bool UseAfterScope; - AsanDetectStackUseAfterReturnMode UseAfterReturn; -}; - class ModuleAddressSanitizer { public: - ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, - bool CompileKernel = false, bool Recover = false, - bool UseGlobalsGC = true, bool UseOdrIndicator = false, + ModuleAddressSanitizer(Module &M, bool CompileKernel = false, + bool Recover = false, bool UseGlobalsGC = true, + bool UseOdrIndicator = false, AsanDtorKind DestructorKind = AsanDtorKind::Global) - : GlobalsMD(*GlobalsMD), - CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan + : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel), Recover(ClRecover.getNumOccurrences() > 0 ? 
ClRecover : Recover), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel), @@ -906,7 +827,6 @@ private: uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const; int GetAsanVersion(const Module &M) const; - const GlobalsMetadata &GlobalsMD; bool CompileKernel; bool Recover; bool UseGlobalsGC; @@ -931,44 +851,6 @@ private: Function *AsanDtorFunction = nullptr; }; -class ModuleAddressSanitizerLegacyPass : public ModulePass { -public: - static char ID; - - explicit ModuleAddressSanitizerLegacyPass( - bool CompileKernel = false, bool Recover = false, bool UseGlobalGC = true, - bool UseOdrIndicator = false, - AsanDtorKind DestructorKind = AsanDtorKind::Global) - : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover), - UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator), - DestructorKind(DestructorKind) { - initializeModuleAddressSanitizerLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "ModuleAddressSanitizer"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - bool runOnModule(Module &M) override { - GlobalsMetadata &GlobalsMD = - getAnalysis().getGlobalsMD(); - ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover, - UseGlobalGC, UseOdrIndicator, - DestructorKind); - return ASanModule.instrumentModule(M); - } - -private: - bool CompileKernel; - bool Recover; - bool UseGlobalGC; - bool UseOdrIndicator; - AsanDtorKind DestructorKind; -}; - // Stack poisoning does not play well with exception handling. // When an exception is thrown, we essentially bypass the code // that unpoisones the stack. This is why the run-time library has @@ -1221,85 +1103,6 @@ struct FunctionStackPoisoner : public InstVisitor { } // end anonymous namespace -void LocationMetadata::parse(MDNode *MDN) { - assert(MDN->getNumOperands() == 3); - MDString *DIFilename = cast(MDN->getOperand(0)); - Filename = DIFilename->getString(); - LineNo = mdconst::extract(MDN->getOperand(1))->getLimitedValue(); - ColumnNo = - mdconst::extract(MDN->getOperand(2))->getLimitedValue(); -} - -// FIXME: It would be cleaner to instead attach relevant metadata to the globals -// we want to sanitize instead and reading this metadata on each pass over a -// function instead of reading module level metadata at first. -GlobalsMetadata::GlobalsMetadata(Module &M) { - NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); - if (!Globals) - return; - for (auto MDN : Globals->operands()) { - // Metadata node contains the global and the fields of "Entry". - assert(MDN->getNumOperands() == 5); - auto *V = mdconst::extract_or_null(MDN->getOperand(0)); - // The optimizer may optimize away a global entirely. - if (!V) - continue; - auto *StrippedV = V->stripPointerCasts(); - auto *GV = dyn_cast(StrippedV); - if (!GV) - continue; - // We can already have an entry for GV if it was merged with another - // global. 
- Entry &E = Entries[GV]; - if (auto *Loc = cast_or_null(MDN->getOperand(1))) - E.SourceLoc.parse(Loc); - if (auto *Name = cast_or_null(MDN->getOperand(2))) - E.Name = Name->getString(); - ConstantInt *IsDynInit = mdconst::extract(MDN->getOperand(3)); - E.IsDynInit |= IsDynInit->isOne(); - ConstantInt *IsExcluded = - mdconst::extract(MDN->getOperand(4)); - E.IsExcluded |= IsExcluded->isOne(); - } -} - -AnalysisKey ASanGlobalsMetadataAnalysis::Key; - -GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { - return GlobalsMetadata(M); -} - -PreservedAnalyses AddressSanitizerPass::run(Function &F, - AnalysisManager &AM) { - auto &MAMProxy = AM.getResult(F); - Module &M = *F.getParent(); - if (auto *R = MAMProxy.getCachedResult(M)) { - const TargetLibraryInfo *TLI = &AM.getResult(F); - AddressSanitizer Sanitizer(M, R, nullptr, Options.CompileKernel, - Options.Recover, Options.UseAfterScope, - Options.UseAfterReturn); - if (Sanitizer.instrumentFunction(F, TLI)) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); - } - - report_fatal_error( - "The ASanGlobalsMetadataAnalysis is required to run before " - "AddressSanitizer can run"); - return PreservedAnalyses::all(); -} - -void AddressSanitizerPass::printPipeline( - raw_ostream &OS, function_ref MapClassName2PassName) { - static_cast *>(this)->printPipeline( - OS, MapClassName2PassName); - OS << "<"; - if (Options.CompileKernel) - OS << "kernel"; - OS << ">"; -} - void ModuleAddressSanitizerPass::printPipeline( raw_ostream &OS, function_ref MapClassName2PassName) { static_cast *>(this)->printPipeline( @@ -1318,8 +1121,7 @@ ModuleAddressSanitizerPass::ModuleAddressSanitizerPass( PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, ModuleAnalysisManager &MAM) { - GlobalsMetadata &GlobalsMD = MAM.getResult(M); - ModuleAddressSanitizer ModuleSanitizer(M, &GlobalsMD, Options.CompileKernel, + ModuleAddressSanitizer ModuleSanitizer(M, Options.CompileKernel, Options.Recover, UseGlobalGC, UseOdrIndicator, DestructorKind); bool Modified = false; @@ -1327,9 +1129,9 @@ PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, const StackSafetyGlobalInfo *const SSGI = ClUseStackSafety ? &MAM.getResult(M) : nullptr; for (Function &F : M) { - AddressSanitizer FunctionSanitizer( - M, &GlobalsMD, SSGI, Options.CompileKernel, Options.Recover, - Options.UseAfterScope, Options.UseAfterReturn); + AddressSanitizer FunctionSanitizer(M, SSGI, Options.CompileKernel, + Options.Recover, Options.UseAfterScope, + Options.UseAfterReturn); const TargetLibraryInfo &TLI = FAM.getResult(F); Modified |= FunctionSanitizer.instrumentFunction(F, &TLI); } @@ -1337,75 +1139,20 @@ PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, return Modified ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } -INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md", - "Read metadata to mark which globals should be instrumented " - "when running ASan.", - false, true) - -char AddressSanitizerLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN( - AddressSanitizerLegacyPass, "asan", - "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, - false) -INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass) -INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END( - AddressSanitizerLegacyPass, "asan", - "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, - false) - -FunctionPass *llvm::createAddressSanitizerFunctionPass( - bool CompileKernel, bool Recover, bool UseAfterScope, - AsanDetectStackUseAfterReturnMode UseAfterReturn) { - assert(!CompileKernel || Recover); - return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope, - UseAfterReturn); -} - -char ModuleAddressSanitizerLegacyPass::ID = 0; - -INITIALIZE_PASS( - ModuleAddressSanitizerLegacyPass, "asan-module", - "AddressSanitizer: detects use-after-free and out-of-bounds bugs." - "ModulePass", - false, false) - -ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass( - bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator, - AsanDtorKind Destructor) { - assert(!CompileKernel || Recover); - return new ModuleAddressSanitizerLegacyPass( - CompileKernel, Recover, UseGlobalsGC, UseOdrIndicator, Destructor); -} - static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { size_t Res = countTrailingZeros(TypeSize / 8); assert(Res < kNumberOfAccessSizes); return Res; } -/// Create a global describing a source location. -static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M, - LocationMetadata MD) { - Constant *LocData[] = { - createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix), - ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo), - ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo), - }; - auto LocStruct = ConstantStruct::getAnon(LocData); - auto GV = new GlobalVariable(M, LocStruct->getType(), true, - GlobalValue::PrivateLinkage, LocStruct, - kAsanGenPrefix); - GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - return GV; -} - /// Check if \p G has been created by a trusted compiler pass. static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) { // Do not instrument @llvm.global_ctors, @llvm.used, etc. - if (G->getName().startswith("llvm.")) + if (G->getName().startswith("llvm.") || + // Do not instrument gcov counter arrays. + G->getName().startswith("__llvm_gcov_ctr") || + // Do not instrument rtti proxy symbols for function sanitizer. + G->getName().startswith("__llvm_rtti_proxy")) return true; // Do not instrument asan globals. @@ -1414,10 +1161,6 @@ static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) { G->getName().startswith(kODRGenPrefix)) return true; - // Do not instrument gcov counter arrays. - if (G->getName() == "__llvm_gcov_ctr") - return true; - return false; } @@ -1518,10 +1261,6 @@ bool AddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { void AddressSanitizer::getInterestingMemoryOperands( Instruction *I, SmallVectorImpl &Interesting) { - // Skip memory accesses inserted by another instrumentation. - if (I->hasMetadata("nosanitize")) - return; - // Do not instrument the load fetching the dynamic shadow address. 
if (LocalDynamicShadow == I) return; @@ -1613,10 +1352,13 @@ bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { // If a global variable does not have dynamic initialization we don't // have to instrument it. However, if a global does not have initializer // at all, we assume it has dynamic initializer (in other TU). - // - // FIXME: Metadata should be attched directly to the global directly instead - // of being added to llvm.asan.globals. - return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit; + if (!G->hasInitializer()) + return false; + + if (G->hasSanitizerMetadata() && G->getSanitizerMetadata().IsDynInit) + return false; + + return true; } void AddressSanitizer::instrumentPointerComparisonOrSubtraction( @@ -1977,9 +1719,8 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const { Type *Ty = G->getValueType(); LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); - // FIXME: Metadata should be attched directly to the global directly instead - // of being added to llvm.asan.globals. - if (GlobalsMD.get(G).IsExcluded) return false; + if (G->hasSanitizerMetadata() && G->getSanitizerMetadata().NoAddress) + return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; // Globals in address space 1 and 4 are supported for AMDGPU. @@ -2125,6 +1866,8 @@ bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const { return true; if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2)) return true; + if (TargetTriple.isDriverKit()) + return true; return false; } @@ -2136,7 +1879,9 @@ StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const { case Triple::MachO: return "__DATA,__asan_globals,regular"; case Triple::Wasm: case Triple::GOFF: + case Triple::SPIRV: case Triple::XCOFF: + case Triple::DXContainer: report_fatal_error( "ModuleAddressSanitizer not implemented for object file format"); case Triple::UnknownObjectFormat: @@ -2470,7 +2215,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, // const char *name; // const char *module_name; // size_t has_dynamic_init; - // void *source_location; + // size_t padding_for_windows_msvc_incremental_link; // size_t odr_indicator; // We initialize an array of such structures and pass it to a run-time call. StructType *GlobalStructTy = @@ -2489,15 +2234,16 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, for (size_t i = 0; i < n; i++) { GlobalVariable *G = GlobalsToChange[i]; - // FIXME: Metadata should be attched directly to the global directly instead - // of being added to llvm.asan.globals. - auto MD = GlobalsMD.get(G); - StringRef NameForGlobal = G->getName(); - // Create string holding the global name (use global name from metadata - // if it's available, otherwise just write the name of global variable). - GlobalVariable *Name = createPrivateGlobalForString( - M, MD.Name.empty() ? NameForGlobal : MD.Name, - /*AllowMerging*/ true, kAsanGenPrefix); + GlobalValue::SanitizerMetadata MD; + if (G->hasSanitizerMetadata()) + MD = G->getSanitizerMetadata(); + + // TODO: Symbol names in the descriptor can be demangled by the runtime + // library. This could save ~0.4% of VM size for a private large binary. 
+ std::string NameForGlobal = llvm::demangle(G->getName().str()); + GlobalVariable *Name = + createPrivateGlobalForString(M, NameForGlobal, + /*AllowMerging*/ true, kAsanGenPrefix); Type *Ty = G->getValueType(); const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); @@ -2545,14 +2291,6 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, G->eraseFromParent(); NewGlobals[i] = NewGlobal; - Constant *SourceLoc; - if (!MD.SourceLoc.empty()) { - auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc); - SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy); - } else { - SourceLoc = ConstantInt::get(IntptrTy, 0); - } - Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy()); GlobalValue *InstrumentedGlobal = NewGlobal; @@ -2593,10 +2331,12 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), ConstantExpr::getPointerCast(Name, IntptrTy), ConstantExpr::getPointerCast(ModuleName, IntptrTy), - ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, + ConstantInt::get(IntptrTy, MD.IsDynInit), + Constant::getNullValue(IntptrTy), ConstantExpr::getPointerCast(ODRIndicator, IntptrTy)); - if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; + if (ClInitializers && MD.IsDynInit) + HasDynamicallyInitializedGlobals = true; LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); @@ -2759,7 +2499,9 @@ void AddressSanitizer::initializeCallbacks(Module &M) { } const std::string MemIntrinCallbackPrefix = - CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix; + (CompileKernel && !ClKasanMemIntrinCallbackPrefix) + ? std::string("") + : ClMemoryAccessCallbackPrefix; AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); @@ -2888,6 +2630,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, // Leave if the function doesn't need instrumentation. if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified; + if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) + return FunctionModified; + LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); @@ -2908,7 +2653,6 @@ bool AddressSanitizer::instrumentFunction(Function &F, SmallVector NoReturnCalls; SmallVector AllBlocks; SmallVector PointerComparisonsOrSubtracts; - int NumAllocas = 0; // Fill the set of memory operations to instrument. for (auto &BB : F) { @@ -2917,6 +2661,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, int NumInsnsPerBB = 0; for (auto &Inst : BB) { if (LooksLikeCodeInBug11395(&Inst)) return false; + // Skip instructions inserted by another instrumentation. + if (Inst.hasMetadata(LLVMContext::MD_nosanitize)) + continue; SmallVector InterestingOperands; getInterestingMemoryOperands(&Inst, InterestingOperands); @@ -2948,11 +2695,10 @@ bool AddressSanitizer::instrumentFunction(Function &F, IntrinToInstrument.push_back(MI); NumInsnsPerBB++; } else { - if (isa(Inst)) NumAllocas++; if (auto *CB = dyn_cast(&Inst)) { // A call inside BB. 
TempsToInstrument.clear(); - if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize")) + if (CB->doesNotReturn()) NoReturnCalls.push_back(CB); } if (CallInst *CI = dyn_cast(&Inst)) @@ -3347,7 +3093,7 @@ void FunctionStackPoisoner::processStaticAllocas() { ASanStackVariableDescription D = {AI->getName().data(), ASan.getAllocaSizeInBytes(*AI), 0, - AI->getAlignment(), + AI->getAlign().value(), AI, 0, 0}; @@ -3611,7 +3357,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { IRBuilder<> IRB(AI); - const uint64_t Alignment = std::max(kAllocaRzSize, AI->getAlignment()); + const Align Alignment = std::max(Align(kAllocaRzSize), AI->getAlign()); const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1; Value *Zero = Constant::getNullValue(IntptrTy); @@ -3642,17 +3388,19 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { // Alignment is added to locate left redzone, PartialPadding for possible // partial redzone and kAllocaRzSize for right redzone respectively. Value *AdditionalChunkSize = IRB.CreateAdd( - ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding); + ConstantInt::get(IntptrTy, Alignment.value() + kAllocaRzSize), + PartialPadding); Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize); // Insert new alloca with new NewSize and Alignment params. AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); - NewAlloca->setAlignment(Align(Alignment)); + NewAlloca->setAlignment(Alignment); // NewAddress = Address + Alignment - Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), - ConstantInt::get(IntptrTy, Alignment)); + Value *NewAddress = + IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), + ConstantInt::get(IntptrTy, Alignment.value())); // Insert __asan_alloca_poison call for new created alloca. 
IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize}); diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 4ad07cab001a..1eadafb4e4b4 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -19,7 +19,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" @@ -29,7 +28,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -142,6 +140,9 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) { static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, ScalarEvolution &SE) { + if (F.hasFnAttribute(Attribute::NoSanitizeBounds)) + return false; + const DataLayout &DL = F.getParent()->getDataLayout(); ObjectSizeOpts EvalOpts; EvalOpts.RoundToAlign = true; diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp index 1a7f7a365ce4..b11b84d65d23 100644 --- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -13,15 +13,12 @@ #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Transforms/Instrumentation.h" -#include - using namespace llvm; static bool diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 497aac30c3f6..e5c0705b916e 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -145,27 +146,27 @@ FunctionPass *llvm::createControlHeightReductionLegacyPass() { namespace { struct CHRStats { - CHRStats() : NumBranches(0), NumBranchesDelta(0), - WeightedNumBranchesDelta(0) {} + CHRStats() = default; void print(raw_ostream &OS) const { OS << "CHRStats: NumBranches " << NumBranches << " NumBranchesDelta " << NumBranchesDelta << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta; } - uint64_t NumBranches; // The original number of conditional branches / - // selects - uint64_t NumBranchesDelta; // The decrease of the number of conditional - // branches / selects in the hot paths due to CHR. - uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile - // count at the scope entry. + // The original number of conditional branches / selects + uint64_t NumBranches = 0; + // The decrease of the number of conditional branches / selects in the hot + // paths due to CHR. + uint64_t NumBranchesDelta = 0; + // NumBranchesDelta weighted by the profile count at the scope entry. + uint64_t WeightedNumBranchesDelta = 0; }; // RegInfo - some properties of a Region. 
struct RegInfo { - RegInfo() : R(nullptr), HasBranch(false) {} - RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {} - Region *R; - bool HasBranch; + RegInfo() = default; + RegInfo(Region *RegionIn) : R(RegionIn) {} + Region *R = nullptr; + bool HasBranch = false; SmallVector Selects; }; @@ -769,9 +770,21 @@ CHRScope * CHR::findScope(Region *R) { return nullptr; // If any of the basic blocks have their address taken, we must skip this region // because we cannot clone basic blocks whose address is taken. - for (BasicBlock *BB : R->blocks()) + for (BasicBlock *BB : R->blocks()) { if (BB->hasAddressTaken()) return nullptr; + // If we encounter llvm.coro.id, skip this region because if the basic block + // is cloned, we end up inserting a token-type PHI node into the block with + // llvm.coro.begin. + // FIXME: This could lead to less optimal codegen: because the region is + // excluded, it can prevent CHR from merging adjacent regions into a bigger + // scope and hoisting more branches. + for (Instruction &I : *BB) + if (auto *II = dyn_cast(&I)) + if (II->getIntrinsicID() == Intrinsic::coro_id) + return nullptr; + } + if (Exit) { // Try to find an if-then block (check if R is an if-then). // if (cond) { @@ -1752,7 +1765,7 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet &TrivialPHIs) { // Create the combined branch condition and constant-fold the branches/selects // in the hot path. fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr, - ProfileCount.getValueOr(0)); + ProfileCount.value_or(0)); } // A helper for transformScopes. Clone the blocks in the scope (excluding the @@ -1949,28 +1962,27 @@ void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope, // A helper for fixupBranch/fixupSelect. Add a branch condition to the merged // condition. void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond, - Instruction *BranchOrSelect, - CHRScope *Scope, - IRBuilder<> &IRB, - Value *&MergedCondition) { - if (IsTrueBiased) { - MergedCondition = IRB.CreateAnd(MergedCondition, Cond); - } else { + Instruction *BranchOrSelect, CHRScope *Scope, + IRBuilder<> &IRB, Value *&MergedCondition) { + if (!IsTrueBiased) { // If Cond is an icmp and all users of V except for BranchOrSelect are // branches, negate the icmp predicate and swap the branch targets and avoid // inserting an Xor to negate Cond. - bool Done = false; - if (auto *ICmp = dyn_cast(Cond)) - if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) { - MergedCondition = IRB.CreateAnd(MergedCondition, Cond); - Done = true; - } - if (!Done) { - Value *Negate = IRB.CreateXor( - ConstantInt::getTrue(F.getContext()), Cond); - MergedCondition = IRB.CreateAnd(MergedCondition, Negate); - } + auto *ICmp = dyn_cast(Cond); + if (!ICmp || + !negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) + Cond = IRB.CreateXor(ConstantInt::getTrue(F.getContext()), Cond); } + + // Select conditions can be poison, while branching on poison is immediate + // undefined behavior. As such, we need to freeze potentially poisonous + // conditions derived from selects. + if (isa(BranchOrSelect) && + !isGuaranteedNotToBeUndefOrPoison(Cond)) + Cond = IRB.CreateFreeze(Cond); + + // Use logical and to avoid propagating poison from later conditions.
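// A three-valued sketch, not from this patch, of why the logical form used
// below matters: CreateLogicalAnd(A, B) emits `select i1 A, i1 B, i1 false`,
// so a false A hides poison in B, whereas a bitwise `and` would propagate it.
enum class Tri { False, True, Poison };
static Tri logicalAnd(Tri A, Tri B) {
  if (A == Tri::Poison) return Tri::Poison; // selecting on poison stays poison
  return A == Tri::True ? B : Tri::False;   // B is never observed when A is false
}
static Tri bitwiseAnd(Tri A, Tri B) {
  if (A == Tri::Poison || B == Tri::Poison) return Tri::Poison; // poison infects
  return (A == Tri::True && B == Tri::True) ? Tri::True : Tri::False;
}
// logicalAnd(Tri::False, Tri::Poison) == Tri::False, whereas
// bitwiseAnd(Tri::False, Tri::Poison) == Tri::Poison.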
+ MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond); } void CHR::transformScopes(SmallVectorImpl &CHRScopes) { @@ -2080,7 +2092,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) { RegionInfo &RI = getAnalysis().getRegionInfo(); std::unique_ptr OwnedORE = std::make_unique(&F); - return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run(); + return CHR(F, BFI, DT, PSI, RI, *OwnedORE).run(); } namespace llvm { diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index ff3aa14a2a83..6815688827d2 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -66,8 +66,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/iterator.h" #include "llvm/Analysis/ValueTracking.h" @@ -84,13 +84,11 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" @@ -112,7 +110,6 @@ #include #include #include -#include #include #include #include @@ -187,6 +184,15 @@ static cl::opt ClCombineOffsetLabelsOnGEP( "doing pointer arithmetic."), cl::Hidden, cl::init(true)); +static cl::list ClCombineTaintLookupTables( + "dfsan-combine-taint-lookup-table", + cl::desc( + "When dfsan-combine-offset-labels-on-gep and/or " + "dfsan-combine-pointer-labels-on-load are false, this flag can " + "be used to re-enable combining offset and/or pointer taint when " + "loading specific constant global variables (i.e. lookup tables)."), + cl::Hidden); + static cl::opt ClDebugNonzeroLabels( "dfsan-debug-nonzero-labels", cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, " @@ -433,6 +439,7 @@ class DataFlowSanitizer { FunctionType *DFSanUnionLoadFnTy; FunctionType *DFSanLoadLabelAndOriginFnTy; FunctionType *DFSanUnimplementedFnTy; + FunctionType *DFSanWrapperExternWeakNullFnTy; FunctionType *DFSanSetLabelFnTy; FunctionType *DFSanNonzeroLabelFnTy; FunctionType *DFSanVarargWrapperFnTy; @@ -448,6 +455,7 @@ class DataFlowSanitizer { FunctionCallee DFSanUnionLoadFn; FunctionCallee DFSanLoadLabelAndOriginFn; FunctionCallee DFSanUnimplementedFn; + FunctionCallee DFSanWrapperExternWeakNullFn; FunctionCallee DFSanSetLabelFn; FunctionCallee DFSanNonzeroLabelFn; FunctionCallee DFSanVarargWrapperFn; @@ -467,6 +475,7 @@ class DataFlowSanitizer { DFSanABIList ABIList; DenseMap UnwrappedFnMap; AttributeMask ReadOnlyNoneAttrs; + StringSet<> CombineTaintLookupTableNames; /// Memory map parameters used in calculation mapping application addresses /// to shadow addresses and origin addresses. 
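// A sketch, not from this patch, of the pattern the new
// -dfsan-combine-taint-lookup-table flag introduced above is aimed at. With
// pointer/offset label combining disabled, the taint of the index would
// otherwise not reach the loaded value (identifiers are invented):
static const int kTable[256] = {0 /* ... */};
static int lookupTainted(unsigned char TaintedIdx) {
  // The result inherits TaintedIdx's label only if kTable is listed via
  // -dfsan-combine-taint-lookup-table=kTable (or combining is on globally).
  return kTable[TaintedIdx];
}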
@@ -480,14 +489,13 @@ class DataFlowSanitizer { bool isInstrumented(const Function *F); bool isInstrumented(const GlobalAlias *GA); bool isForceZeroLabels(const Function *F); - FunctionType *getTrampolineFunctionType(FunctionType *T); TransformedFunction getCustomFunctionType(FunctionType *T); WrapperKind getWrapperKind(Function *F); void addGlobalNameSuffix(GlobalValue *GV); + void buildExternWeakCheckIfNeeded(IRBuilder<> &IRB, Function *F); Function *buildWrapperFunction(Function *F, StringRef NewFName, GlobalValue::LinkageTypes NewFLink, FunctionType *NewFT); - Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); void initializeCallbackFunctions(Module &M); void initializeRuntimeFunctions(Module &M); void injectMetadataGlobals(Module &M); @@ -658,6 +666,8 @@ struct DFSanFunction { // branch instruction using the given conditional expression. void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition); + bool isLookupTableConstant(Value *P); + private: /// Collapses the shadow with aggregate type into a single primitive shadow /// value. @@ -792,25 +802,9 @@ DataFlowSanitizer::DataFlowSanitizer( // FIXME: should we propagate vfs::FileSystem to this constructor? ABIList.set( SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem())); -} -FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { - assert(!T->isVarArg()); - SmallVector ArgTypes; - ArgTypes.push_back(T->getPointerTo()); - ArgTypes.append(T->param_begin(), T->param_end()); - ArgTypes.append(T->getNumParams(), PrimitiveShadowTy); - Type *RetType = T->getReturnType(); - if (!RetType->isVoidTy()) - ArgTypes.push_back(PrimitiveShadowPtrTy); - - if (shouldTrackOrigins()) { - ArgTypes.append(T->getNumParams(), OriginTy); - if (!RetType->isVoidTy()) - ArgTypes.push_back(OriginPtrTy); - } - - return FunctionType::get(T->getReturnType(), ArgTypes, false); + for (StringRef v : ClCombineTaintLookupTables) + CombineTaintLookupTableNames.insert(v); } TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { @@ -823,16 +817,8 @@ TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { std::vector ArgumentIndexMapping; for (unsigned I = 0, E = T->getNumParams(); I != E; ++I) { Type *ParamType = T->getParamType(I); - FunctionType *FT; - if (isa(ParamType) && - (FT = dyn_cast(ParamType->getPointerElementType()))) { - ArgumentIndexMapping.push_back(ArgTypes.size()); - ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo()); - ArgTypes.push_back(Type::getInt8PtrTy(*Ctx)); - } else { - ArgumentIndexMapping.push_back(ArgTypes.size()); - ArgTypes.push_back(ParamType); - } + ArgumentIndexMapping.push_back(ArgTypes.size()); + ArgTypes.push_back(ParamType); } for (unsigned I = 0, E = T->getNumParams(); I != E; ++I) ArgTypes.push_back(PrimitiveShadowTy); @@ -1058,6 +1044,10 @@ bool DataFlowSanitizer::initializeModule(Module &M) { /*isVarArg=*/false); DFSanUnimplementedFnTy = FunctionType::get( Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + Type *DFSanWrapperExternWeakNullArgs[2] = {Int8Ptr, Int8Ptr}; + DFSanWrapperExternWeakNullFnTy = + FunctionType::get(Type::getVoidTy(*Ctx), DFSanWrapperExternWeakNullArgs, + /*isVarArg=*/false); Type *DFSanSetLabelArgs[4] = {PrimitiveShadowTy, OriginTy, Type::getInt8PtrTy(*Ctx), IntptrTy}; DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx), @@ -1149,6 +1139,23 @@ void DataFlowSanitizer::addGlobalNameSuffix(GlobalValue *GV) { } } +void 
DataFlowSanitizer::buildExternWeakCheckIfNeeded(IRBuilder<> &IRB, + Function *F) { + // If the function we are wrapping was ExternWeak, it may be null. + // The original code before calling this wrapper may have checked for null, + // but replacing with a known-to-not-be-null wrapper can break this check. + // When replacing uses of the extern weak function with the wrapper we try + // to avoid replacing uses in conditionals, but this is not perfect. + // In the case where we fail and accidentally optimize out a null check + // for an extern weak function, add a check here to help identify the issue. + if (GlobalValue::isExternalWeakLinkage(F->getLinkage())) { + std::vector Args; + Args.push_back(IRB.CreatePointerCast(F, IRB.getInt8PtrTy())); + Args.push_back(IRB.CreateGlobalStringPtr(F->getName())); + IRB.CreateCall(DFSanWrapperExternWeakNullFn, Args); + } +} + Function * DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, GlobalValue::LinkageTypes NewFLink, @@ -1181,61 +1188,6 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, return NewF; } -Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, - StringRef FName) { - FunctionType *FTT = getTrampolineFunctionType(FT); - FunctionCallee C = Mod->getOrInsertFunction(FName, FTT); - Function *F = dyn_cast(C.getCallee()); - if (F && F->isDeclaration()) { - F->setLinkage(GlobalValue::LinkOnceODRLinkage); - BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); - std::vector Args; - Function::arg_iterator AI = F->arg_begin() + 1; - for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N) - Args.push_back(&*AI); - CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB); - Type *RetType = FT->getReturnType(); - ReturnInst *RI = RetType->isVoidTy() ? ReturnInst::Create(*Ctx, BB) - : ReturnInst::Create(*Ctx, CI, BB); - - // F is called by a wrapped custom function with primitive shadows. So - // its arguments and return value need conversion.
- DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true, - /*IsForceZeroLabels=*/false); - Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; - ++ValAI; - for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) { - Value *Shadow = - DFSF.expandFromPrimitiveShadow(ValAI->getType(), &*ShadowAI, CI); - DFSF.ValShadowMap[&*ValAI] = Shadow; - } - Function::arg_iterator RetShadowAI = ShadowAI; - const bool ShouldTrackOrigins = shouldTrackOrigins(); - if (ShouldTrackOrigins) { - ValAI = F->arg_begin(); - ++ValAI; - Function::arg_iterator OriginAI = ShadowAI; - if (!RetType->isVoidTy()) - ++OriginAI; - for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++OriginAI, --N) { - DFSF.ValOriginMap[&*ValAI] = &*OriginAI; - } - } - DFSanVisitor(DFSF).visitCallInst(*CI); - if (!RetType->isVoidTy()) { - Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow( - DFSF.getShadow(RI->getReturnValue()), RI); - new StoreInst(PrimitiveShadow, &*RetShadowAI, RI); - if (ShouldTrackOrigins) { - Value *Origin = DFSF.getOrigin(RI->getReturnValue()); - new StoreInst(Origin, &*std::prev(F->arg_end()), RI); - } - } - } - - return cast(C.getCallee()); -} - // Initialize DataFlowSanitizer runtime functions and declare them in the module void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { { @@ -1256,6 +1208,8 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { } DFSanUnimplementedFn = Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); + DFSanWrapperExternWeakNullFn = Mod->getOrInsertFunction( + "__dfsan_wrapper_extern_weak_null", DFSanWrapperExternWeakNullFnTy); { AttributeList AL; AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt); @@ -1299,6 +1253,8 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanLoadLabelAndOriginFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanUnimplementedFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( + DFSanWrapperExternWeakNullFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanSetLabelFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( @@ -1500,7 +1456,40 @@ bool DataFlowSanitizer::runImpl(Module &M) { Value *WrappedFnCst = ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); - F.replaceAllUsesWith(WrappedFnCst); + + // Extern weak functions can sometimes be null at execution time. + // Code will sometimes check if an extern weak function is null. + // This could look something like: + // declare extern_weak i8 @my_func(i8) + // br i1 icmp ne (i8 (i8)* @my_func, i8 (i8)* null), label %use_my_func, + // label %avoid_my_func + // The @"dfsw$my_func" wrapper is never null, so if we replace this use + // in the comparison, the icmp will simplify to false and we have + // accidentally optimized away a null check that is necessary. + // This can lead to a crash when the null extern_weak my_func is called. + // + // To prevent (the most common pattern of) this problem, + // do not replace uses in comparisons with the wrapper. + // We definitely want to replace uses in call instructions. + // Other uses (e.g. storing the function address somewhere) might be + // called or compared or both - this case may not be handled correctly. + // We will default to replacing with the wrapper in cases where we are unsure.
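// A source-level sketch, not from this patch, of the pattern the comment
// above describes; the icmp that must survive comes from code like this:
extern "C" int my_func(int) __attribute__((weak)); // extern_weak declaration
static int callIfPresent(int X) {
  // If this use of `my_func` were rewritten to the never-null
  // "dfsw$my_func" wrapper, the null check would fold away and a truly
  // absent my_func would be called, crashing at run time.
  return my_func ? my_func(X) : 0;
}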
+ auto IsNotCmpUse = [](Use &U) -> bool { + User *Usr = U.getUser(); + if (ConstantExpr *CE = dyn_cast(Usr)) { + // This is the most common case for icmp ne null + if (CE->getOpcode() == Instruction::ICmp) { + return false; + } + } + if (Instruction *I = dyn_cast(Usr)) { + if (I->getOpcode() == Instruction::ICmp) { + return false; + } + } + return true; + }; + F.replaceUsesWithIf(WrappedFnCst, IsNotCmpUse); UnwrappedFnMap[WrappedFnCst] = &F; *FI = NewF; @@ -1919,6 +1908,14 @@ Align DFSanFunction::getOriginAlign(Align InstAlignment) { return Align(std::max(MinOriginAlignment, Alignment)); } +bool DFSanFunction::isLookupTableConstant(Value *P) { + if (GlobalVariable *GV = dyn_cast(P->stripPointerCasts())) + if (GV->isConstant() && GV->hasName()) + return DFS.CombineTaintLookupTableNames.count(GV->getName()); + + return false; +} + bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size, Align InstAlignment) { // When enabling tracking load instructions, we always use @@ -2172,6 +2169,29 @@ static AtomicOrdering addAcquireOrdering(AtomicOrdering AO) { llvm_unreachable("Unknown ordering"); } +Value *StripPointerGEPsAndCasts(Value *V) { + if (!V->getType()->isPointerTy()) + return V; + + // DFSan pass should be running on valid IR, but we'll + // keep a seen set to ensure there are no issues. + SmallPtrSet Visited; + Visited.insert(V); + do { + if (auto *GEP = dyn_cast(V)) { + V = GEP->getPointerOperand(); + } else if (Operator::getOpcode(V) == Instruction::BitCast) { + V = cast(V)->getOperand(0); + if (!V->getType()->isPointerTy()) + return V; + } else if (isa(V)) { + V = cast(V)->getAliasee(); + } + } while (Visited.insert(V).second); + + return V; +} + void DFSanVisitor::visitLoadInst(LoadInst &LI) { auto &DL = LI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(LI.getType()); @@ -2200,7 +2220,9 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) { Shadows.push_back(PrimitiveShadow); Origins.push_back(Origin); } - if (ClCombinePointerLabelsOnLoad) { + if (ClCombinePointerLabelsOnLoad || + DFSF.isLookupTableConstant( + StripPointerGEPsAndCasts(LI.getPointerOperand()))) { Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); PrimitiveShadow = DFSF.combineShadows(PrimitiveShadow, PtrShadow, Pos); if (ShouldTrackOrigins) { @@ -2562,7 +2584,9 @@ void DFSanVisitor::visitLandingPadInst(LandingPadInst &LPI) { } void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (ClCombineOffsetLabelsOnGEP) { + if (ClCombineOffsetLabelsOnGEP || + DFSF.isLookupTableConstant( + StripPointerGEPsAndCasts(GEPI.getPointerOperand()))) { visitInstOperands(GEPI); return; } @@ -2722,13 +2746,8 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { auto *MTI = cast( IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(), {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()})); - if (ClPreserveAlignment) { - MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes); - MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes); - } else { - MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes)); - MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes)); - } + MTI->setDestAlignment(DFSF.getShadowAlign(I.getDestAlign().valueOrOne())); + MTI->setSourceAlignment(DFSF.getShadowAlign(I.getSourceAlign().valueOrOne())); if (ClEventCallbacks) { IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn, {RawDestShadow, @@ -2864,16 +2883,19 @@ bool DFSanVisitor::visitWrappedCallBase(Function &F, CallBase &CB) { CB.setCalledFunction(&F); 
IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, IRB.CreateGlobalStringPtr(F.getName())); + DFSF.DFS.buildExternWeakCheckIfNeeded(IRB, &F); DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB)); DFSF.setOrigin(&CB, DFSF.DFS.ZeroOrigin); return true; case DataFlowSanitizer::WK_Discard: CB.setCalledFunction(&F); + DFSF.DFS.buildExternWeakCheckIfNeeded(IRB, &F); DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB)); DFSF.setOrigin(&CB, DFSF.DFS.ZeroOrigin); return true; case DataFlowSanitizer::WK_Functional: CB.setCalledFunction(&F); + DFSF.DFS.buildExternWeakCheckIfNeeded(IRB, &F); visitInstOperands(CB); return true; case DataFlowSanitizer::WK_Custom: @@ -2905,22 +2927,7 @@ bool DFSanVisitor::visitWrappedCallBase(Function &F, CallBase &CB) { // Adds non-variable arguments. auto *I = CB.arg_begin(); for (unsigned N = FT->getNumParams(); N != 0; ++I, --N) { - Type *T = (*I)->getType(); - FunctionType *ParamFT; - if (isa(T) && - (ParamFT = dyn_cast(T->getPointerElementType()))) { - std::string TName = "dfst"; - TName += utostr(FT->getNumParams() - N); - TName += "$"; - TName += F.getName(); - Constant *Trampoline = - DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName); - Args.push_back(Trampoline); - Args.push_back( - IRB.CreateBitCast(*I, Type::getInt8PtrTy(*DFSF.DFS.Ctx))); - } else { - Args.push_back(*I); - } + Args.push_back(*I); } // Adds shadow arguments. diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 325089fc4402..ac4a1fd6bb7e 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -14,19 +14,15 @@ //===----------------------------------------------------------------------===// #include "CFGMST.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" @@ -34,8 +30,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/CRC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -87,7 +81,7 @@ GCOVOptions GCOVOptions::getDefault() { if (DefaultGCOVVersion.size() != 4) { llvm::report_fatal_error(Twine("Invalid -default-gcov-version: ") + - DefaultGCOVVersion); + DefaultGCOVVersion, /*GenCrashDiag=*/false); } memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4); return Options; @@ -169,39 +163,6 @@ private: StringMap InstrumentedFiles; }; -class GCOVProfilerLegacyPass : public ModulePass { -public: - static char ID; - GCOVProfilerLegacyPass() - : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {} - GCOVProfilerLegacyPass(const GCOVOptions &Opts) - : ModulePass(ID), Profiler(Opts) { - initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); - } - StringRef getPassName() const override { return "GCOV Profiler"; } - - bool runOnModule(Module &M) override { - auto GetBFI = [this](Function &F) { - return &this->getAnalysis(F).getBFI(); - }; - auto GetBPI = [this](Function &F) { - return 
&this->getAnalysis(F).getBPI(); - }; - auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { - return this->getAnalysis().getTLI(F); - }; - return Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - } - -private: - GCOVProfiler Profiler; -}; - struct BBInfo { BBInfo *Group; uint32_t Index; @@ -237,21 +198,6 @@ struct Edge { }; } -char GCOVProfilerLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN( - GCOVProfilerLegacyPass, "insert-gcov-profiling", - "Insert instrumentation for GCOV profiling", false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END( - GCOVProfilerLegacyPass, "insert-gcov-profiling", - "Insert instrumentation for GCOV profiling", false, false) - -ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { - return new GCOVProfilerLegacyPass(Options); -} - static StringRef getFunctionName(const DISubprogram *SP) { if (!SP->getLinkageName().empty()) return SP->getLinkageName(); @@ -862,7 +808,8 @@ bool GCOVProfiler::emitProfileNotes( // Split indirectbr critical edges here before computing the MST rather // than later in getInstrBB() to avoid invalidating it. - SplitIndirectBrCriticalEdges(F, BPI, BFI); + SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI, + BFI); CFGMST MST(F, /*InstrumentFuncEntry_=*/false, BPI, BFI); diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 7b3741d19a1b..218b4bbfb6c0 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -13,14 +13,15 @@ #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -33,7 +34,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -43,19 +44,15 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/PassRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include using namespace llvm; @@ -83,6 +80,11 @@ static cl::opt cl::desc("Prefix for memory access callbacks"), cl::Hidden, cl::init("__hwasan_")); +static cl::opt 
ClKasanMemIntrinCallbackPrefix( + "hwasan-kernel-mem-intrinsic-prefix", + cl::desc("Use prefix for memory intrinsics in KASAN mode"), cl::Hidden, + cl::init(false)); + static cl::opt ClInstrumentWithCalls( "hwasan-instrument-with-calls", cl::desc("instrument reads and writes with callbacks"), cl::Hidden, @@ -145,7 +147,7 @@ static cl::opt ClGenerateTagsWithCalls( cl::init(false)); static cl::opt ClGlobals("hwasan-globals", cl::desc("Instrument globals"), - cl::Hidden, cl::init(false), cl::ZeroOrMore); + cl::Hidden, cl::init(false)); static cl::opt ClMatchAllTag( "hwasan-match-all-tag", @@ -191,17 +193,16 @@ static cl::opt static cl::opt ClInstrumentLandingPads("hwasan-instrument-landing-pads", cl::desc("instrument landing pads"), cl::Hidden, - cl::init(false), cl::ZeroOrMore); + cl::init(false)); static cl::opt ClUseShortGranules( "hwasan-use-short-granules", cl::desc("use short granules in allocas and outlined checks"), cl::Hidden, - cl::init(false), cl::ZeroOrMore); + cl::init(false)); static cl::opt ClInstrumentPersonalityFunctions( "hwasan-instrument-personality-functions", - cl::desc("instrument personality functions"), cl::Hidden, cl::init(false), - cl::ZeroOrMore); + cl::desc("instrument personality functions"), cl::Hidden); static cl::opt ClInlineAllChecks("hwasan-inline-all-checks", cl::desc("inline all checks"), @@ -244,13 +245,6 @@ bool shouldDetectUseAfterScope(const Triple &TargetTriple) { /// An instrumentation pass implementing detection of addressability bugs /// using tagged pointers. class HWAddressSanitizer { -private: - struct AllocaInfo { - AllocaInst *AI; - SmallVector LifetimeStart; - SmallVector LifetimeEnd; - }; - public: HWAddressSanitizer(Module &M, bool CompileKernel, bool Recover, const StackSafetyGlobalInfo *SSI) @@ -265,11 +259,7 @@ public: void setSSI(const StackSafetyGlobalInfo *S) { SSI = S; } - DenseMap padInterestingAllocas( - const MapVector &AllocasToInstrument); - bool sanitizeFunction(Function &F, - llvm::function_ref GetDT, - llvm::function_ref GetPDT); + bool sanitizeFunction(Function &F, FunctionAnalysisManager &FAM); void initializeModule(); void createHwasanCtorComdat(); @@ -301,16 +291,9 @@ public: void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag); Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); - static bool isStandardLifetime(const AllocaInfo &AllocaInfo, - const DominatorTree &DT); - bool instrumentStack( - bool ShouldDetectUseAfterScope, - MapVector &AllocasToInstrument, - SmallVector &UnrecognizedLifetimes, - DenseMap> &AllocaDbgMap, - SmallVectorImpl &RetVec, Value *StackTag, - llvm::function_ref GetDT, - llvm::function_ref GetPDT); + bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, + const DominatorTree &DT, const PostDominatorTree &PDT, + const LoopInfo &LI); Value *readRegister(IRBuilder<> &IRB, StringRef Name); bool instrumentLandingPads(SmallVectorImpl &RetVec); Value *getNextTagWithCall(IRBuilder<> &IRB); @@ -328,6 +311,9 @@ public: void instrumentGlobal(GlobalVariable *GV, uint8_t Tag); void instrumentGlobals(); + Value *getPC(IRBuilder<> &IRB); + Value *getSP(IRBuilder<> &IRB); + void instrumentPersonalityFunctions(); private: @@ -397,96 +383,12 @@ private: Value *ShadowBase = nullptr; Value *StackBaseTag = nullptr; + Value *CachedSP = nullptr; GlobalValue *ThreadPtrGlobal = nullptr; }; -class HWAddressSanitizerLegacyPass : public FunctionPass { -public: - // Pass identification, replacement for typeid. 
- static char ID; - - explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false, - bool Recover = false, - bool DisableOptimization = false) - : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover), - DisableOptimization(DisableOptimization) { - initializeHWAddressSanitizerLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "HWAddressSanitizer"; } - - bool doInitialization(Module &M) override { - HWASan = std::make_unique(M, CompileKernel, Recover, - /*SSI=*/nullptr); - return true; - } - - bool runOnFunction(Function &F) override { - auto TargetTriple = Triple(F.getParent()->getTargetTriple()); - if (shouldUseStackSafetyAnalysis(TargetTriple, DisableOptimization)) { - // We cannot call getAnalysis in doInitialization, that would cause a - // crash as the required analyses are not initialized yet. - HWASan->setSSI( - &getAnalysis().getResult()); - } - return HWASan->sanitizeFunction( - F, - [&]() -> const DominatorTree & { - return getAnalysis().getDomTree(); - }, - [&]() -> const PostDominatorTree & { - return getAnalysis().getPostDomTree(); - }); - } - - bool doFinalization(Module &M) override { - HWASan.reset(); - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - // This is an over-estimation of, in case we are building for an - // architecture that doesn't allow stack tagging we will still load the - // analysis. - // This is so we don't need to plumb TargetTriple all the way to here. - if (mightUseStackSafetyAnalysis(DisableOptimization)) - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - -private: - std::unique_ptr HWASan; - bool CompileKernel; - bool Recover; - bool DisableOptimization; -}; - } // end anonymous namespace -char HWAddressSanitizerLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN( - HWAddressSanitizerLegacyPass, "hwasan", - "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, - false) -INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END( - HWAddressSanitizerLegacyPass, "hwasan", - "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, - false) - -FunctionPass * -llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel, bool Recover, - bool DisableOptimization) { - assert(!CompileKernel || Recover); - return new HWAddressSanitizerLegacyPass(CompileKernel, Recover, - DisableOptimization); -} - PreservedAnalyses HWAddressSanitizerPass::run(Module &M, ModuleAnalysisManager &MAM) { const StackSafetyGlobalInfo *SSI = nullptr; @@ -497,16 +399,8 @@ PreservedAnalyses HWAddressSanitizerPass::run(Module &M, HWAddressSanitizer HWASan(M, Options.CompileKernel, Options.Recover, SSI); bool Modified = false; auto &FAM = MAM.getResult(M).getManager(); - for (Function &F : M) { - Modified |= HWASan.sanitizeFunction( - F, - [&]() -> const DominatorTree & { - return FAM.getResult(F); - }, - [&]() -> const PostDominatorTree & { - return FAM.getResult(F); - }); - } + for (Function &F : M) + Modified |= HWASan.sanitizeFunction(F, FAM); if (Modified) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -739,7 +633,9 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { ArrayType::get(IRB.getInt8Ty(), 0)); const std::string MemIntrinCallbackPrefix = - CompileKernel ? 
std::string("") : ClMemoryAccessCallbackPrefix; + (CompileKernel && !ClKasanMemIntrinCallbackPrefix) + ? std::string("") + : ClMemoryAccessCallbackPrefix; HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); @@ -812,7 +708,7 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { void HWAddressSanitizer::getInterestingMemoryOperands( Instruction *I, SmallVectorImpl &Interesting) { // Skip memory accesses inserted by another instrumentation. - if (I->hasMetadata("nosanitize")) + if (I->hasMetadata(LLVMContext::MD_nosanitize)) return; // Do not instrument the load fetching the dynamic shadow address. @@ -1056,18 +952,6 @@ bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) { return true; } -static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { - uint64_t ArraySize = 1; - if (AI.isArrayAllocation()) { - const ConstantInt *CI = dyn_cast(AI.getArraySize()); - assert(CI && "non-constant array size"); - ArraySize = CI->getZExtValue(); - } - Type *Ty = AI.getAllocatedType(); - uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty); - return SizeInBytes * ArraySize; -} - void HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size) { size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); @@ -1141,19 +1025,10 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { return getNextTagWithCall(IRB); if (StackBaseTag) return StackBaseTag; - // FIXME: use addressofreturnaddress (but implement it in aarch64 backend - // first). - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - auto GetStackPointerFn = Intrinsic::getDeclaration( - M, Intrinsic::frameaddress, - IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); - Value *StackPointer = IRB.CreateCall( - GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}); - // Extract some entropy from the stack pointer for the tags. // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ // between functions). - Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy); + Value *StackPointerLong = getSP(IRB); Value *StackTag = applyTagMask(IRB, IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20))); @@ -1233,6 +1108,30 @@ Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) { return nullptr; } +Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) { + if (TargetTriple.getArch() == Triple::aarch64) + return readRegister(IRB, "pc"); + else + return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy); +} + +Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { + if (!CachedSP) { + // FIXME: use addressofreturnaddress (but implement it in aarch64 backend + // first). + Function *F = IRB.GetInsertBlock()->getParent(); + Module *M = F->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); + CachedSP = IRB.CreatePtrToInt( + IRB.CreateCall(GetStackPointerFn, + {Constant::getNullValue(IRB.getInt32Ty())}), + IntptrTy); + } + return CachedSP; +} + void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { if (!Mapping.InTls) ShadowBase = getShadowNonTls(IRB); @@ -1251,23 +1150,12 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { TargetTriple.isAArch64() ? 
ThreadLong : untagPointer(IRB, ThreadLong); if (WithFrameRecord) { - Function *F = IRB.GetInsertBlock()->getParent(); StackBaseTag = IRB.CreateAShr(ThreadLong, 3); // Prepare ring buffer data. - Value *PC; - if (TargetTriple.getArch() == Triple::aarch64) - PC = readRegister(IRB, "pc"); - else - PC = IRB.CreatePtrToInt(F, IntptrTy); - Module *M = F->getParent(); - auto GetStackPointerFn = Intrinsic::getDeclaration( - M, Intrinsic::frameaddress, - IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); - Value *SP = IRB.CreatePtrToInt( - IRB.CreateCall(GetStackPointerFn, - {Constant::getNullValue(IRB.getInt32Ty())}), - IntptrTy); + Value *PC = getPC(IRB); + Value *SP = getSP(IRB); + // Mix SP and PC. // Assumptions: // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) @@ -1330,43 +1218,16 @@ bool HWAddressSanitizer::instrumentLandingPads( return true; } -static bool -maybeReachableFromEachOther(const SmallVectorImpl &Insts, - const DominatorTree &DT) { - // If we have too many lifetime ends, give up, as the algorithm below is N^2. - if (Insts.size() > ClMaxLifetimes) - return true; - for (size_t I = 0; I < Insts.size(); ++I) { - for (size_t J = 0; J < Insts.size(); ++J) { - if (I == J) - continue; - if (isPotentiallyReachable(Insts[I], Insts[J], nullptr, &DT)) - return true; - } - } - return false; -} - -// static -bool HWAddressSanitizer::isStandardLifetime(const AllocaInfo &AllocaInfo, - const DominatorTree &DT) { - // An alloca that has exactly one start and end in every possible execution. - // If it has multiple ends, they have to be unreachable from each other, so - // at most one of them is actually used for each execution of the function. - return AllocaInfo.LifetimeStart.size() == 1 && - (AllocaInfo.LifetimeEnd.size() == 1 || - (AllocaInfo.LifetimeEnd.size() > 0 && - !maybeReachableFromEachOther(AllocaInfo.LifetimeEnd, DT))); +static bool isLifetimeIntrinsic(Value *V) { + auto *II = dyn_cast(V); + return II && II->isLifetimeStartOrEnd(); } -bool HWAddressSanitizer::instrumentStack( - bool ShouldDetectUseAfterScope, - MapVector &AllocasToInstrument, - SmallVector &UnrecognizedLifetimes, - DenseMap> &AllocaDbgMap, - SmallVectorImpl &RetVec, Value *StackTag, - llvm::function_ref GetDT, - llvm::function_ref GetPDT) { +bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, + Value *StackTag, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI) { // Ideally, we want to calculate tagged stack base pointer, and rewrite all // alloca addresses using that. Unfortunately, offsets are not known yet // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a @@ -1374,10 +1235,10 @@ bool HWAddressSanitizer::instrumentStack( // This generates one extra instruction per alloca use. unsigned int I = 0; - for (auto &KV : AllocasToInstrument) { + for (auto &KV : SInfo.AllocasToInstrument) { auto N = I++; auto *AI = KV.first; - AllocaInfo &Info = KV.second; + memtag::AllocaInfo &Info = KV.second; IRBuilder<> IRB(AI->getNextNode()); // Replace uses of the alloca with tagged address. @@ -1388,10 +1249,34 @@ bool HWAddressSanitizer::instrumentStack( AI->hasName() ? AI->getName().str() : "alloca." 
+ itostr(N); Replacement->setName(Name + ".hwasan"); - AI->replaceUsesWithIf(Replacement, - [AILong](Use &U) { return U.getUser() != AILong; }); + size_t Size = memtag::getAllocaSizeInBytes(*AI); + size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + + Value *AICast = IRB.CreatePointerCast(AI, Int8PtrTy); + + auto HandleLifetime = [&](IntrinsicInst *II) { + // Set the lifetime intrinsic to cover the whole alloca. This reduces the + // set of assumptions we need to make about the lifetime. Without this we + // would need to ensure that we can track the lifetime pointer to a + // constant offset from the alloca, and would still need to change the + // size to include the extra alignment we use for the untagging to make + // the size consistent. + // + // The check for standard lifetime below makes sure that we have exactly + // one set of start / end in any execution (i.e. the ends are not + // reachable from each other), so this will not cause any problems. + II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); + II->setArgOperand(1, AICast); + }; + llvm::for_each(Info.LifetimeStart, HandleLifetime); + llvm::for_each(Info.LifetimeEnd, HandleLifetime); - for (auto *DDI : AllocaDbgMap.lookup(AI)) { + AI->replaceUsesWithIf(Replacement, [AICast, AILong](Use &U) { + auto *User = U.getUser(); + return User != AILong && User != AICast && !isLifetimeIntrinsic(User); + }); + + for (auto *DDI : Info.DbgVariableIntrinsics) { // Prepend "tag_offset, N" to the dwarf expression. // Tag offset logically applies to the alloca pointer, and it makes sense // to put it at the beginning of the expression. @@ -1403,37 +1288,47 @@ bool HWAddressSanitizer::instrumentStack( NewOps, LocNo)); } - size_t Size = getAllocaSizeInBytes(*AI); - size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); auto TagEnd = [&](Instruction *Node) { IRB.SetInsertPoint(Node); Value *UARTag = getUARTag(IRB, StackTag); + // When untagging, use the `AlignedSize` because we need to set the tags + // for the entire alloca to zero. If we used `Size` here, we would + // keep the last granule tagged, and store zero in the last byte of the + // last granule, due to how short granules are implemented. tagAlloca(IRB, AI, UARTag, AlignedSize); }; + // Calls to functions that may return twice (e.g. setjmp) confuse the + // postdominator analysis, and will leave us to keep memory tagged after + // function return. Work around this by always untagging at every return + // statement if return_twice functions are called. 
bool StandardLifetime = - UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT()); - if (ShouldDetectUseAfterScope && StandardLifetime) { + SInfo.UnrecognizedLifetimes.empty() && + memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, &DT, + &LI, ClMaxLifetimes) && + !SInfo.CallsReturnTwice; + if (DetectUseAfterScope && StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; IRB.SetInsertPoint(Start->getNextNode()); tagAlloca(IRB, AI, Tag, Size); - if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd, - RetVec, TagEnd)) { + if (!memtag::forAllReachableExits(DT, PDT, LI, Start, Info.LifetimeEnd, + SInfo.RetVec, TagEnd)) { for (auto *End : Info.LifetimeEnd) End->eraseFromParent(); } } else { tagAlloca(IRB, AI, Tag, Size); - for (auto *RI : RetVec) + for (auto *RI : SInfo.RetVec) TagEnd(RI); - if (!StandardLifetime) { - for (auto &II : Info.LifetimeStart) - II->eraseFromParent(); - for (auto &II : Info.LifetimeEnd) - II->eraseFromParent(); - } + // We inserted tagging outside of the lifetimes, so we have to remove + // them. + for (auto &II : Info.LifetimeStart) + II->eraseFromParent(); + for (auto &II : Info.LifetimeEnd) + II->eraseFromParent(); } + memtag::alignAndPadAlloca(Info, Align(Mapping.getObjectAlignment())); } - for (auto &I : UnrecognizedLifetimes) + for (auto &I : SInfo.UnrecognizedLifetimes) I->eraseFromParent(); return true; } @@ -1443,7 +1338,7 @@ bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // FIXME: instrument dynamic allocas, too AI.isStaticAlloca() && // alloca() may be called with 0 size, ignore it. - getAllocaSizeInBytes(AI) > 0 && + memtag::getAllocaSizeInBytes(AI) > 0 && // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. 
!isAllocaPromotable(&AI) && @@ -1456,42 +1351,8 @@ bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { !(SSI && SSI->isSafe(AI)); } -DenseMap HWAddressSanitizer::padInterestingAllocas( - const MapVector &AllocasToInstrument) { - DenseMap AllocaToPaddedAllocaMap; - for (auto &KV : AllocasToInstrument) { - AllocaInst *AI = KV.first; - uint64_t Size = getAllocaSizeInBytes(*AI); - uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - AI->setAlignment( - Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment()))); - if (Size != AlignedSize) { - Type *AllocatedType = AI->getAllocatedType(); - if (AI->isArrayAllocation()) { - uint64_t ArraySize = - cast(AI->getArraySize())->getZExtValue(); - AllocatedType = ArrayType::get(AllocatedType, ArraySize); - } - Type *TypeWithPadding = StructType::get( - AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size)); - auto *NewAI = new AllocaInst( - TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI); - NewAI->takeName(AI); - NewAI->setAlignment(AI->getAlign()); - NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca()); - NewAI->setSwiftError(AI->isSwiftError()); - NewAI->copyMetadata(*AI); - auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI); - AI->replaceAllUsesWith(Bitcast); - AllocaToPaddedAllocaMap[AI] = NewAI; - } - } - return AllocaToPaddedAllocaMap; -} - -bool HWAddressSanitizer::sanitizeFunction( - Function &F, llvm::function_ref GetDT, - llvm::function_ref GetPDT) { +bool HWAddressSanitizer::sanitizeFunction(Function &F, + FunctionAnalysisManager &FAM) { if (&F == HwasanCtorFunction) return false; @@ -1502,72 +1363,27 @@ bool HWAddressSanitizer::sanitizeFunction( SmallVector OperandsToInstrument; SmallVector IntrinToInstrument; - MapVector AllocasToInstrument; - SmallVector RetVec; SmallVector LandingPadVec; - SmallVector UnrecognizedLifetimes; - DenseMap> AllocaDbgMap; - bool CallsReturnTwice = false; - for (auto &BB : F) { - for (auto &Inst : BB) { - if (CallInst *CI = dyn_cast(&Inst)) { - if (CI->canReturnTwice()) { - CallsReturnTwice = true; - } - } - if (InstrumentStack) { - if (AllocaInst *AI = dyn_cast(&Inst)) { - if (isInterestingAlloca(*AI)) - AllocasToInstrument.insert({AI, {}}); - continue; - } - auto *II = dyn_cast(&Inst); - if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end)) { - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); - if (!AI) { - UnrecognizedLifetimes.push_back(&Inst); - continue; - } - if (!isInterestingAlloca(*AI)) - continue; - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - AllocasToInstrument[AI].LifetimeStart.push_back(II); - else - AllocasToInstrument[AI].LifetimeEnd.push_back(II); - continue; - } - } - if (isa(Inst)) { - if (CallInst *CI = Inst.getParent()->getTerminatingMustTailCall()) - RetVec.push_back(CI); - else - RetVec.push_back(&Inst); - } else if (isa(Inst)) { - RetVec.push_back(&Inst); - } - - if (auto *DVI = dyn_cast(&Inst)) { - for (Value *V : DVI->location_ops()) { - if (auto *Alloca = dyn_cast_or_null(V)) - if (!AllocaDbgMap.count(Alloca) || - AllocaDbgMap[Alloca].back() != DVI) - AllocaDbgMap[Alloca].push_back(DVI); - } - } + memtag::StackInfoBuilder SIB( + [this](const AllocaInst &AI) { return isInterestingAlloca(AI); }); + for (auto &Inst : instructions(F)) { + if (InstrumentStack) { + SIB.visit(Inst); + } - if (InstrumentLandingPads && isa(Inst)) - LandingPadVec.push_back(&Inst); + if (InstrumentLandingPads && isa(Inst)) + 
LandingPadVec.push_back(&Inst); - getInterestingMemoryOperands(&Inst, OperandsToInstrument); + getInterestingMemoryOperands(&Inst, OperandsToInstrument); - if (MemIntrinsic *MI = dyn_cast(&Inst)) - if (!ignoreMemIntrinsic(MI)) - IntrinToInstrument.push_back(MI); - } + if (MemIntrinsic *MI = dyn_cast(&Inst)) + if (!ignoreMemIntrinsic(MI)) + IntrinToInstrument.push_back(MI); } + memtag::StackInfo &SInfo = SIB.get(); + initializeCallbacks(*F.getParent()); bool Changed = false; @@ -1575,7 +1391,7 @@ bool HWAddressSanitizer::sanitizeFunction( if (!LandingPadVec.empty()) Changed |= instrumentLandingPads(LandingPadVec); - if (AllocasToInstrument.empty() && F.hasPersonalityFn() && + if (SInfo.AllocasToInstrument.empty() && F.hasPersonalityFn() && F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) { // __hwasan_personality_thunk is a no-op for functions without an // instrumented stack, so we can drop it. @@ -1583,7 +1399,7 @@ bool HWAddressSanitizer::sanitizeFunction( Changed = true; } - if (AllocasToInstrument.empty() && OperandsToInstrument.empty() && + if (SInfo.AllocasToInstrument.empty() && OperandsToInstrument.empty() && IntrinToInstrument.empty()) return Changed; @@ -1593,42 +1409,16 @@ bool HWAddressSanitizer::sanitizeFunction( IRBuilder<> EntryIRB(InsertPt); emitPrologue(EntryIRB, /*WithFrameRecord*/ ClRecordStackHistory && - Mapping.WithFrameRecord && !AllocasToInstrument.empty()); + Mapping.WithFrameRecord && + !SInfo.AllocasToInstrument.empty()); - if (!AllocasToInstrument.empty()) { + if (!SInfo.AllocasToInstrument.empty()) { + const DominatorTree &DT = FAM.getResult(F); + const PostDominatorTree &PDT = FAM.getResult(F); + const LoopInfo &LI = FAM.getResult(F); Value *StackTag = ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB); - // Calls to functions that may return twice (e.g. setjmp) confuse the - // postdominator analysis, and will leave us to keep memory tagged after - // function return. Work around this by always untagging at every return - // statement if return_twice functions are called. - instrumentStack(DetectUseAfterScope && !CallsReturnTwice, - AllocasToInstrument, UnrecognizedLifetimes, AllocaDbgMap, - RetVec, StackTag, GetDT, GetPDT); - } - // Pad and align each of the allocas that we instrumented to stop small - // uninteresting allocas from hiding in instrumented alloca's padding and so - // that we have enough space to store real tags for short granules. 
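// A minimal model, not from this patch, of the padding rule described in the
// comment above, which the removed code below implemented and
// memtag::alignAndPadAlloca now provides. Sizes round up to the tag granule
// (16 bytes for AArch64 HWASan) so a short granule still has room to store
// the real tag in its last byte:
#include <cstdint>
static uint64_t paddedAllocaSize(uint64_t Size, uint64_t Granule = 16) {
  const uint64_t AlignedSize = (Size + Granule - 1) & ~(Granule - 1);
  // e.g. Size = 5 -> 16; the alloca is rebuilt as
  // { OriginalType, [AlignedSize - Size] x i8 } at >= Granule alignment.
  return AlignedSize;
}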
- DenseMap AllocaToPaddedAllocaMap = - padInterestingAllocas(AllocasToInstrument); - - if (!AllocaToPaddedAllocaMap.empty()) { - for (auto &BB : F) { - for (auto &Inst : BB) { - if (auto *DVI = dyn_cast(&Inst)) { - SmallDenseSet LocationOps(DVI->location_ops().begin(), - DVI->location_ops().end()); - for (Value *V : LocationOps) { - if (auto *AI = dyn_cast_or_null(V)) { - if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI)) - DVI->replaceVariableLocationOp(V, NewAI); - } - } - } - } - } - for (auto &P : AllocaToPaddedAllocaMap) - P.first->eraseFromParent(); + instrumentStack(SInfo, StackTag, DT, PDT, LI); } // If we split the entry block, move any allocas that were originally in the @@ -1654,6 +1444,7 @@ bool HWAddressSanitizer::sanitizeFunction( ShadowBase = nullptr; StackBaseTag = nullptr; + CachedSP = nullptr; return true; } @@ -1735,34 +1526,10 @@ void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) { GV->eraseFromParent(); } -static DenseSet getExcludedGlobals(Module &M) { - NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); - if (!Globals) - return DenseSet(); - DenseSet Excluded(Globals->getNumOperands()); - for (auto MDN : Globals->operands()) { - // Metadata node contains the global and the fields of "Entry". - assert(MDN->getNumOperands() == 5); - auto *V = mdconst::extract_or_null(MDN->getOperand(0)); - // The optimizer may optimize away a global entirely. - if (!V) - continue; - auto *StrippedV = V->stripPointerCasts(); - auto *GV = dyn_cast(StrippedV); - if (!GV) - continue; - ConstantInt *IsExcluded = mdconst::extract(MDN->getOperand(4)); - if (IsExcluded->isOne()) - Excluded.insert(GV); - } - return Excluded; -} - void HWAddressSanitizer::instrumentGlobals() { std::vector Globals; - auto ExcludedGlobals = getExcludedGlobals(M); for (GlobalVariable &GV : M.globals()) { - if (ExcludedGlobals.count(&GV)) + if (GV.hasSanitizerMetadata() && GV.getSanitizerMetadata().NoHWAddress) continue; if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") || diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 9a3afa9cc924..3ef06907dfee 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -13,30 +13,20 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/IndirectCallPromotionAnalysis.h" #include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -45,7 +35,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include 
"llvm/Transforms/Instrumentation/PGOInstrumentation.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include #include @@ -71,13 +60,13 @@ static cl::opt DisableICP("disable-icp", cl::init(false), cl::Hidden, // value. // For debug use only. static cl::opt - ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore, + ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::desc("Max number of promotions for this compilation")); // If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped. // For debug use only. static cl::opt - ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore, + ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::desc("Skip Callsite up to this number for this compilation")); // Set if the pass is called in LTO optimization. The difference for LTO mode @@ -115,55 +104,6 @@ static cl::opt namespace { -class PGOIndirectCallPromotionLegacyPass : public ModulePass { -public: - static char ID; - - PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false) - : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) { - initializePGOIndirectCallPromotionLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - StringRef getPassName() const override { return "PGOIndirectCallPromotion"; } - -private: - bool runOnModule(Module &M) override; - - // If this pass is called in LTO. We need to special handling the PGOFuncName - // for the static variables due to LTO's internalization. - bool InLTO; - - // If this pass is called in SamplePGO. We need to add the prof metadata to - // the promoted direct call. - bool SamplePGO; -}; - -} // end anonymous namespace - -char PGOIndirectCallPromotionLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom", - "Use PGO instrumentation profile to promote indirect " - "calls to direct calls.", - false, false) -INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom", - "Use PGO instrumentation profile to promote indirect " - "calls to direct calls.", - false, false) - -ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO, - bool SamplePGO) { - return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO); -} - -namespace { - // The class for main data structure to promote indirect calls to conditional // direct calls. class ICallPromotionFunc { @@ -428,15 +368,6 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, return Changed; } -bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) { - ProfileSummaryInfo *PSI = - &getAnalysis().getPSI(); - - // Command-line option has the priority for InLTO. 
- return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode, - SamplePGO | ICPSamplePGOMode); -} - PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, ModuleAnalysisManager &AM) { ProfileSummaryInfo *PSI = &AM.getResult(M); diff --git a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp index 3ea314329079..2091881c29fe 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -9,29 +9,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" -#include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include -#include #include -#include #include using namespace llvm; @@ -61,7 +54,7 @@ private: ArrayType *MapTy; public: - InstrOrderFile() {} + InstrOrderFile() = default; void createOrderFileData(Module &M) { LLVMContext &Ctx = M.getContext(); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 6868408ef5f5..7843b1522830 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -47,12 +47,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include #include -#include #include #include @@ -62,7 +60,7 @@ using namespace llvm; namespace llvm { cl::opt - DebugInfoCorrelate("debug-info-correlate", cl::ZeroOrMore, + DebugInfoCorrelate("debug-info-correlate", cl::desc("Use debug info to correlate profiles."), cl::init(false)); } // namespace llvm @@ -95,18 +93,18 @@ cl::opt NumCountersPerValueSite( cl::init(1.0)); cl::opt AtomicCounterUpdateAll( - "instrprof-atomic-counter-update-all", cl::ZeroOrMore, + "instrprof-atomic-counter-update-all", cl::desc("Make all profile counter updates atomic (for testing only)"), cl::init(false)); cl::opt AtomicCounterUpdatePromoted( - "atomic-counter-update-promoted", cl::ZeroOrMore, + "atomic-counter-update-promoted", cl::desc("Do counter update using atomic fetch add " " for promoted counters only"), cl::init(false)); cl::opt AtomicFirstCounter( - "atomic-first-counter", cl::ZeroOrMore, + "atomic-first-counter", cl::desc("Use atomic fetch add for first counter in a function (usually " "the entry counter)"), cl::init(false)); @@ -116,37 +114,37 @@ cl::opt AtomicFirstCounter( // pipeline is setup, i.e., the default value of true of this option // does not mean the promotion will be done by default. Explicitly // setting this option can override the default behavior. 
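With the legacy wrapper deleted above, only the new-pass-manager entry point (PGOIndirectCallPromotion::run) remains. For reference, the minimal shape such a replacement takes; ExampleModulePass is hypothetical, the run() signature is the real API:

// Sketch: the new-PM shape that replaces a deleted ModulePass wrapper.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct ExampleModulePass : PassInfoMixin<ExampleModulePass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
    bool Changed = false;
    // Analyses are pulled from AM here instead of being declared through
    // getAnalysisUsage(); there is no INITIALIZE_PASS registration at all.
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};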
-cl::opt DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore, +cl::opt DoCounterPromotion("do-counter-promotion", cl::desc("Do counter register promotion"), cl::init(false)); cl::opt MaxNumOfPromotionsPerLoop( - cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20), + "max-counter-promotions-per-loop", cl::init(20), cl::desc("Max number counter promotions per loop to avoid" " increasing register pressure too much")); // A debug option cl::opt - MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1), + MaxNumOfPromotions("max-counter-promotions", cl::init(-1), cl::desc("Max number of allowed counter promotions")); cl::opt SpeculativeCounterPromotionMaxExiting( - cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3), + "speculative-counter-promotion-max-exiting", cl::init(3), cl::desc("The max number of exiting blocks of a loop to allow " " speculative counter promotion")); cl::opt SpeculativeCounterPromotionToLoop( - cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false), + "speculative-counter-promotion-to-loop", cl::desc("When the option is false, if the target block is in a loop, " "the promotion will be disallowed unless the promoted counter " " update can be further/iteratively promoted into an acyclic " " region.")); cl::opt IterativeCounterPromotion( - cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true), + "iterative-counter-promotion", cl::init(true), cl::desc("Allow counter promotion across the whole loop nest.")); cl::opt SkipRetExitBlock( - cl::ZeroOrMore, "skip-ret-exit-block", cl::init(true), + "skip-ret-exit-block", cl::init(true), cl::desc("Suppress counter promotion if exit blocks contain ret.")); class InstrProfilingLegacyPass : public ModulePass { @@ -211,6 +209,18 @@ public: Value *Addr = cast(Store)->getPointerOperand(); Type *Ty = LiveInValue->getType(); IRBuilder<> Builder(InsertPos); + if (auto *AddrInst = dyn_cast_or_null(Addr)) { + // If isRuntimeCounterRelocationEnabled() is true then the address of + // the store instruction is computed with two instructions in + // InstrProfiling::getCounterAddress(). We need to copy those + // instructions to this block to compute Addr correctly. + // %BiasAdd = add i64 ptrtoint <__profc_>, <__llvm_profile_counter_bias> + // %Addr = inttoptr i64 %BiasAdd to i64* + auto *OrigBiasInst = dyn_cast(AddrInst->getOperand(0)); + assert(OrigBiasInst->getOpcode() == Instruction::BinaryOps::Add); + Value *BiasInst = Builder.Insert(OrigBiasInst->clone()); + Addr = Builder.CreateIntToPtr(BiasInst, Ty->getPointerTo()); + } if (AtomicCounterUpdatePromoted) // automic update currently can only be promoted across the current // loop, not the whole loop nest. @@ -303,8 +313,7 @@ public: auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader()); // If the average loop trip count is not greater than 1.5, we skip // promotion. 
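The PGOCounterPromoter hunk above clones the bias computation (ptrtoint, add, inttoptr) so a promoted store still targets the relocated counter. A standalone sketch of the arithmetic being rematerialized, with plain C++ stand-ins rather than the pass itself:

// Standalone sketch of the address computation the promoter copies:
// counter address = (intptr)&__profc_<fn> + __llvm_profile_counter_bias.
#include <cstdint>
#include <cstdio>

static uint64_t ProfileCounters[4]; // stands in for __profc_<fn>
static int64_t CounterBias = 0;     // stands in for __llvm_profile_counter_bias

static uint64_t *relocatedCounter(unsigned Idx) {
  uint64_t Base = reinterpret_cast<uint64_t>(&ProfileCounters[Idx]);
  return reinterpret_cast<uint64_t *>(Base + CounterBias); // add + inttoptr
}

int main() {
  ++*relocatedCounter(0); // with a zero bias this hits the static buffer
  std::printf("%llu\n", (unsigned long long)ProfileCounters[0]);
}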
- if (PreheaderCount && - (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2)) + if (PreheaderCount && (*PreheaderCount * 3) >= (*InstrCount * 2)) continue; } @@ -705,10 +714,9 @@ Value *InstrProfiling::getCounterAddress(InstrProfInstBase *I) { Type *Int64Ty = Type::getInt64Ty(M->getContext()); Function *Fn = I->getParent()->getParent(); - Instruction &EntryI = Fn->getEntryBlock().front(); - LoadInst *LI = dyn_cast(&EntryI); - if (!LI) { - IRBuilder<> EntryBuilder(&EntryI); + LoadInst *&BiasLI = FunctionToProfileBiasMap[Fn]; + if (!BiasLI) { + IRBuilder<> EntryBuilder(&Fn->getEntryBlock().front()); auto *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName()); if (!Bias) { // Compiler must define this variable when runtime counter relocation @@ -725,9 +733,9 @@ Value *InstrProfiling::getCounterAddress(InstrProfInstBase *I) { if (TT.supportsCOMDAT()) Bias->setComdat(M->getOrInsertComdat(Bias->getName())); } - LI = EntryBuilder.CreateLoad(Int64Ty, Bias); + BiasLI = EntryBuilder.CreateLoad(Int64Ty, Bias); } - auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI); + auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), BiasLI); return Builder.CreateIntToPtr(Add, Addr->getType()); } @@ -769,7 +777,8 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { Name->setLinkage(GlobalValue::PrivateLinkage); ReferencedNames.push_back(Name); - NC->dropAllReferences(); + if (isa(NC)) + NC->dropAllReferences(); } CoverageNamesVar->eraseFromParent(); } @@ -856,8 +865,8 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { if (TT.isOSDarwin()) return false; // Use linker script magic to get data/cnts/name start/end. - if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || - TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || TT.isOSWindows()) + if (TT.isOSAIX() || TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || + TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS() || TT.isOSWindows()) return false; return true; @@ -1236,7 +1245,7 @@ bool InstrProfiling::emitRuntimeHook() { new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - if (TT.isOSBinFormatELF()) { + if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. CompilerUsedVars.push_back(Var); } else { diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index dda242492391..9ff0e632bd7f 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -91,23 +91,13 @@ Comdat *llvm::getOrCreateFunctionComdat(Function &F, Triple &T) { /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. 
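Two things happen in the counter-promotion guard a little further up: Optional::getValue() gives way to operator*, and the condition keeps encoding "average trip count no greater than 1.5" as a cross-multiplied integer compare, avoiding a division. A standalone sketch under those assumptions:

// Skip counter promotion when the average trip count (header count divided
// by preheader count) is <= 1.5; std::optional mirrors llvm::Optional here.
#include <cstdint>
#include <cstdio>
#include <optional>

static bool skipPromotion(std::optional<uint64_t> PreheaderCount,
                          std::optional<uint64_t> InstrCount) {
  return PreheaderCount && InstrCount &&
         (*PreheaderCount * 3) >= (*InstrCount * 2);
}

int main() {
  std::printf("%d\n", skipPromotion(100, 140)); // 1.4 avg trips -> skip (1)
  std::printf("%d\n", skipPromotion(100, 400)); // 4.0 avg trips -> promote (0)
}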
void llvm::initializeInstrumentation(PassRegistry &Registry) { - initializeAddressSanitizerLegacyPassPass(Registry); - initializeModuleAddressSanitizerLegacyPassPass(Registry); initializeMemProfilerLegacyPassPass(Registry); initializeModuleMemProfilerLegacyPassPass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); - initializeGCOVProfilerLegacyPassPass(Registry); - initializePGOInstrumentationGenLegacyPassPass(Registry); - initializePGOInstrumentationUseLegacyPassPass(Registry); - initializePGOIndirectCallPromotionLegacyPassPass(Registry); - initializePGOMemOPSizeOptLegacyPassPass(Registry); initializeCGProfileLegacyPassPass(Registry); initializeInstrOrderFileLegacyPassPass(Registry); initializeInstrProfilingLegacyPassPass(Registry); - initializeMemorySanitizerLegacyPassPass(Registry); - initializeHWAddressSanitizerLegacyPassPass(Registry); - initializeThreadSanitizerLegacyPassPass(Registry); initializeModuleSanitizerCoverageLegacyPassPass(Registry); initializeDataFlowSanitizerLegacyPassPass(Registry); } diff --git a/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h deleted file mode 100644 index 892a6a26da91..000000000000 --- a/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h +++ /dev/null @@ -1,109 +0,0 @@ -//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This module provides means for calculating a maximum spanning tree for a -// given set of weighted edges. The type parameter T is the type of a node. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H -#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H - -#include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/IR/BasicBlock.h" -#include -#include - -namespace llvm { - - /// MaximumSpanningTree - A MST implementation. - /// The type parameter T determines the type of the nodes of the graph. - template - class MaximumSpanningTree { - public: - typedef std::pair Edge; - typedef std::pair EdgeWeight; - typedef std::vector EdgeWeights; - protected: - typedef std::vector MaxSpanTree; - - MaxSpanTree MST; - - private: - // A comparing class for comparing weighted edges. - struct EdgeWeightCompare { - static bool getBlockSize(const T *X) { - const BasicBlock *BB = dyn_cast_or_null(X); - return BB ? BB->size() : 0; - } - - bool operator()(EdgeWeight X, EdgeWeight Y) const { - if (X.second > Y.second) return true; - if (X.second < Y.second) return false; - - // Equal edge weights: break ties by comparing block sizes. - size_t XSizeA = getBlockSize(X.first.first); - size_t YSizeA = getBlockSize(Y.first.first); - if (XSizeA > YSizeA) return true; - if (XSizeA < YSizeA) return false; - - size_t XSizeB = getBlockSize(X.first.second); - size_t YSizeB = getBlockSize(Y.first.second); - if (XSizeB > YSizeB) return true; - if (XSizeB < YSizeB) return false; - - return false; - } - }; - - public: - static char ID; // Class identification, replacement for typeinfo - - /// MaximumSpanningTree() - Takes a vector of weighted edges and returns a - /// spanning tree. 
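The MaximumSpanningTree.h header being deleted here (its body continues below) implemented Kruskal's algorithm: stable-sort the edges by descending weight, then accept an edge only when union-find shows its endpoints in different components. A standalone sketch of that computation:

// Standalone maximum spanning tree in the style of the deleted header:
// descending-weight Kruskal with a tiny union-find in place of
// EquivalenceClasses.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct DSU {
  std::vector<int> Parent;
  explicit DSU(int N) : Parent(N) { std::iota(Parent.begin(), Parent.end(), 0); }
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  bool unite(int A, int B) {
    A = find(A); B = find(B);
    if (A == B) return false; // already in a common subtree
    Parent[A] = B;
    return true;
  }
};

struct Edge { int A, B; unsigned W; };

static std::vector<Edge> maxSpanningTree(int N, std::vector<Edge> Edges) {
  std::stable_sort(Edges.begin(), Edges.end(),
                   [](const Edge &X, const Edge &Y) { return X.W > Y.W; });
  DSU Forest(N);
  std::vector<Edge> MST;
  for (const Edge &E : Edges)
    if (Forest.unite(E.A, E.B)) // biggest weight first, no cycles
      MST.push_back(E);
  return MST;
}

int main() {
  auto MST = maxSpanningTree(4, {{0, 1, 5}, {1, 2, 3}, {0, 2, 9}, {2, 3, 1}});
  for (const Edge &E : MST)
    std::printf("%d-%d (%u)\n", E.A, E.B, E.W);
}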
- MaximumSpanningTree(EdgeWeights &EdgeVector) { - llvm::stable_sort(EdgeVector, EdgeWeightCompare()); - - // Create spanning tree, Forest contains a special data structure - // that makes checking if two nodes are already in a common (sub-)tree - // fast and cheap. - EquivalenceClasses Forest; - for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), - EWe = EdgeVector.end(); EWi != EWe; ++EWi) { - Edge e = (*EWi).first; - - Forest.insert(e.first); - Forest.insert(e.second); - } - - // Iterate over the sorted edges, biggest first. - for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), - EWe = EdgeVector.end(); EWi != EWe; ++EWi) { - Edge e = (*EWi).first; - - if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) { - Forest.unionSets(e.first, e.second); - // So we know now that the edge is not already in a subtree, so we push - // the edge to the MST. - MST.push_back(e); - } - } - } - - typename MaxSpanTree::iterator begin() { - return MST.begin(); - } - - typename MaxSpanTree::iterator end() { - return MST.end(); - } - }; - -} // End llvm namespace - -#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 5e078f2c4212..01e3b2c20218 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -27,15 +27,14 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -156,7 +155,6 @@ static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) { struct InterestingMemoryAccess { Value *Addr = nullptr; bool IsWrite; - unsigned Alignment; Type *AccessTy; uint64_t TypeSize; Value *MaybeMask = nullptr; @@ -182,8 +180,7 @@ public: void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite); void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, - Instruction *I, Value *Addr, - unsigned Alignment, Type *AccessTy, + Instruction *I, Value *Addr, Type *AccessTy, bool IsWrite); void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); @@ -255,7 +252,7 @@ public: } // end anonymous namespace -MemProfilerPass::MemProfilerPass() {} +MemProfilerPass::MemProfilerPass() = default; PreservedAnalyses MemProfilerPass::run(Function &F, AnalysisManager &AM) { @@ -266,7 +263,7 @@ PreservedAnalyses MemProfilerPass::run(Function &F, return PreservedAnalyses::all(); } -ModuleMemProfilerPass::ModuleMemProfilerPass() {} +ModuleMemProfilerPass::ModuleMemProfilerPass() = default; PreservedAnalyses ModuleMemProfilerPass::run(Module &M, AnalysisManager &AM) { @@ -341,28 +338,24 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { return None; Access.IsWrite = false; Access.AccessTy = LI->getType(); - Access.Alignment = LI->getAlignment(); Access.Addr = LI->getPointerOperand(); } else if (StoreInst *SI = dyn_cast(I)) { if (!ClInstrumentWrites) return None; Access.IsWrite = true; Access.AccessTy = 
SI->getValueOperand()->getType(); - Access.Alignment = SI->getAlignment(); Access.Addr = SI->getPointerOperand(); } else if (AtomicRMWInst *RMW = dyn_cast(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; Access.AccessTy = RMW->getValOperand()->getType(); - Access.Alignment = 0; Access.Addr = RMW->getPointerOperand(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; Access.AccessTy = XCHG->getCompareOperand()->getType(); - Access.Alignment = 0; Access.Addr = XCHG->getPointerOperand(); } else if (auto *CI = dyn_cast(I)) { auto *F = CI->getCalledFunction(); @@ -384,11 +377,6 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { } auto *BasePtr = CI->getOperand(0 + OpOffset); - if (auto *AlignmentConstant = - dyn_cast(CI->getOperand(1 + OpOffset))) - Access.Alignment = (unsigned)AlignmentConstant->getZExtValue(); - else - Access.Alignment = 1; // No alignment guarantees. We probably got Undef Access.MaybeMask = CI->getOperand(2 + OpOffset); Access.Addr = BasePtr; } @@ -410,6 +398,25 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { if (Access.Addr->isSwiftError()) return None; + // Peel off GEPs and BitCasts. + auto *Addr = Access.Addr->stripInBoundsOffsets(); + + if (GlobalVariable *GV = dyn_cast(Addr)) { + // Do not instrument PGO counter updates. + if (GV->hasSection()) { + StringRef SectionName = GV->getSection(); + // Check if the global is in the PGO counters section. + auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat(); + if (SectionName.endswith( + getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false))) + return None; + } + + // Do not instrument accesses to LLVM internal variables. + if (GV->getName().startswith("__llvm")) + return None; + } + const DataLayout &DL = I->getModule()->getDataLayout(); Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy); return Access; @@ -417,7 +424,6 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, Instruction *I, Value *Addr, - unsigned Alignment, Type *AccessTy, bool IsWrite) { auto *VTy = cast(AccessTy); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); @@ -468,8 +474,7 @@ void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL, if (Access.MaybeMask) { instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr, - Access.Alignment, Access.AccessTy, - Access.IsWrite); + Access.AccessTy, Access.IsWrite); } else { // Since the access counts will be accumulated across the entire allocation, // we only update the shadow access count for the first location and thus @@ -615,8 +620,6 @@ bool MemProfiler::instrumentFunction(Function &F) { initializeCallbacks(*F.getParent()); - FunctionModified |= insertDynamicShadowAtFunctionEntry(F); - SmallVector ToInstrument; // Fill the set of memory operations to instrument. 
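The isInterestingMemoryAccess() additions above peel GEPs and casts off the address and then refuse to profile PGO counter storage and other LLVM-internal globals. A hedged sketch of that filter (LLVM 15 API; the helper name is illustrative):

// Sketch: detect accesses whose base object is profiling bookkeeping.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/ProfileData/InstrProf.h"

using namespace llvm;

static bool isProfilingBookkeeping(const Instruction *I, Value *Addr) {
  auto *GV = dyn_cast<GlobalVariable>(Addr->stripInBoundsOffsets());
  if (!GV)
    return false;
  if (GV->getName().startswith("__llvm")) // LLVM-internal variables
    return true;
  if (!GV->hasSection())
    return false;
  // Compare against the object-format-specific PGO counters section name.
  auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat();
  StringRef Section = GV->getSection();
  return Section.endswith(
      getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false));
}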
@@ -627,6 +630,15 @@ bool MemProfiler::instrumentFunction(Function &F) { } } + if (ToInstrument.empty()) { + LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified + << " " << F << "\n"); + + return FunctionModified; + } + + FunctionModified |= insertDynamicShadowAtFunctionEntry(F); + int NumInstrumented = 0; for (auto *Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index c51acdf52f14..4d72f6c3d1a9 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -174,24 +174,19 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsX86.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueMap.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -634,33 +629,6 @@ void insertModuleCtor(Module &M) { }); } -/// A legacy function pass for msan instrumentation. -/// -/// Instruments functions to detect uninitialized reads. -struct MemorySanitizerLegacyPass : public FunctionPass { - // Pass identification, replacement for typeid. - static char ID; - - MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {}) - : FunctionPass(ID), Options(Options) { - initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry()); - } - StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - bool runOnFunction(Function &F) override { - return MSan->sanitizeFunction( - F, getAnalysis().getTLI(F)); - } - bool doInitialization(Module &M) override; - - Optional MSan; - MemorySanitizerOptions Options; -}; - template T getOptOrDefault(const cl::opt &Opt, T Default) { return (Opt.getNumOccurrences() > 0) ? Opt : Default; } @@ -705,21 +673,6 @@ void MemorySanitizerPass::printPipeline( OS << ">"; } -char MemorySanitizerLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan", - "MemorySanitizer: detects uninitialized reads.", false, - false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan", - "MemorySanitizer: detects uninitialized reads.", false, - false) - -FunctionPass * -llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) { - return new MemorySanitizerLegacyPass(Options); -} - /// Create a non-const global initialized with the given string. 
/// /// Creates a writable global for Str so that we can pass it to the @@ -1017,13 +970,6 @@ void MemorySanitizer::initializeModule(Module &M) { } } -bool MemorySanitizerLegacyPass::doInitialization(Module &M) { - if (!Options.Kernel) - insertModuleCtor(M); - MSan.emplace(M, Options); - return true; -} - namespace { /// A helper class that handles instrumentation of VarArg @@ -1674,7 +1620,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// or extracts if from ParamTLS (for function arguments). Value *getShadow(Value *V) { if (Instruction *I = dyn_cast(V)) { - if (!PropagateShadow || I->getMetadata("nosanitize")) + if (!PropagateShadow || I->getMetadata(LLVMContext::MD_nosanitize)) return getCleanShadow(V); // For instructions the shadow is already stored in the map. Value *Shadow = ShadowMap[V]; @@ -1694,9 +1640,9 @@ struct MemorySanitizerVisitor : public InstVisitor { } if (Argument *A = dyn_cast(V)) { // For arguments we compute the shadow on demand and store it in the map. - Value **ShadowPtr = &ShadowMap[V]; - if (*ShadowPtr) - return *ShadowPtr; + Value *&ShadowPtr = ShadowMap[V]; + if (ShadowPtr) + return ShadowPtr; Function *F = A->getParent(); IRBuilder<> EntryIRB(FnPrologueEnd); unsigned ArgOffset = 0; @@ -1753,12 +1699,12 @@ struct MemorySanitizerVisitor : public InstVisitor { if (!PropagateShadow || Overflow || FArg.hasByValAttr() || (MS.EagerChecks && FArg.hasAttribute(Attribute::NoUndef))) { - *ShadowPtr = getCleanShadow(V); + ShadowPtr = getCleanShadow(V); setOrigin(A, getCleanOrigin()); } else { // Shadow over TLS Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, + ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, kShadowTLSAlignment); if (MS.TrackOrigins) { Value *OriginPtr = @@ -1767,14 +1713,14 @@ struct MemorySanitizerVisitor : public InstVisitor { } } LLVM_DEBUG(dbgs() - << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); + << " ARG: " << FArg << " ==> " << *ShadowPtr << "\n"); break; } ArgOffset += alignTo(Size, kShadowTLSAlignment); } - assert(*ShadowPtr && "Could not find shadow for an argument"); - return *ShadowPtr; + assert(ShadowPtr && "Could not find shadow for an argument"); + return ShadowPtr; } // For everything else the shadow is zero. return getCleanShadow(V); @@ -1793,7 +1739,7 @@ struct MemorySanitizerVisitor : public InstVisitor { assert((isa(V) || isa(V)) && "Unexpected value type in getOrigin()"); if (Instruction *I = dyn_cast(V)) { - if (I->getMetadata("nosanitize")) + if (I->getMetadata(LLVMContext::MD_nosanitize)) return getCleanOrigin(); } Value *Origin = OriginMap[V]; @@ -1916,7 +1862,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // ------------------- Visitors. using InstVisitor::visit; void visit(Instruction &I) { - if (I.getMetadata("nosanitize")) + if (I.getMetadata(LLVMContext::MD_nosanitize)) return; // Don't want to visit if we're in the prologue if (isInPrologue(I)) @@ -1930,12 +1876,12 @@ struct MemorySanitizerVisitor : public InstVisitor { /// Optionally, checks that the load address is fully defined. 
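In the getShadow() hunk above, the code now binds a Value *& to the ShadowMap slot instead of holding a Value **; either way the point is a single hash lookup whose slot is later written through. A standalone illustration of the idiom with a plain map (cache and key types are arbitrary):

// One operator[] call default-constructs the slot; the reference lets the
// code test, fill, and return the cached value without a second lookup.
#include <cstdio>
#include <string>
#include <unordered_map>

static std::unordered_map<int, std::string> Cache;

static const std::string &getOrCompute(int Key) {
  std::string &Slot = Cache[Key]; // single lookup, inserts "" if absent
  if (Slot.empty())
    Slot = "computed:" + std::to_string(Key); // fill through the reference
  return Slot;
}

int main() {
  std::printf("%s\n", getOrCompute(7).c_str());
  std::printf("%s\n", getOrCompute(7).c_str()); // second call hits the cache
}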
void visitLoadInst(LoadInst &I) { assert(I.getType()->isSized() && "Load type must have size"); - assert(!I.getMetadata("nosanitize")); + assert(!I.getMetadata(LLVMContext::MD_nosanitize)); IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); Value *ShadowPtr = nullptr, *OriginPtr = nullptr; - const Align Alignment = assumeAligned(I.getAlignment()); + const Align Alignment = I.getAlign(); if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); @@ -2573,6 +2519,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// /// Similar situation exists for memcpy and memset. void visitMemMoveInst(MemMoveInst &I) { + getShadow(I.getArgOperand(1)); // Ensure shadow initialized IRBuilder<> IRB(&I); IRB.CreateCall( MS.MemmoveFn, @@ -2587,6 +2534,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // FIXME: consider doing manual inline for small constant sizes and proper // alignment. void visitMemCpyInst(MemCpyInst &I) { + getShadow(I.getArgOperand(1)); // Ensure shadow initialized IRBuilder<> IRB(&I); IRB.CreateCall( MS.MemcpyFn, @@ -3252,27 +3200,37 @@ struct MemorySanitizerVisitor : public InstVisitor { SOC.Done(&I); } - // Instrument _mm_*_sd intrinsics - void handleUnarySdIntrinsic(IntrinsicInst &I) { + // Instrument _mm_*_sd|ss intrinsics + void handleUnarySdSsIntrinsic(IntrinsicInst &I) { IRBuilder<> IRB(&I); + unsigned Width = + cast(I.getArgOperand(0)->getType())->getNumElements(); Value *First = getShadow(&I, 0); Value *Second = getShadow(&I, 1); - // High word of first operand, low word of second - Value *Shadow = - IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef({2, 1})); + // First element of second operand, remaining elements of first operand + SmallVector Mask; + Mask.push_back(Width); + for (unsigned i = 1; i < Width; i++) + Mask.push_back(i); + Value *Shadow = IRB.CreateShuffleVector(First, Second, Mask); setShadow(&I, Shadow); setOriginForNaryOp(I); } - void handleBinarySdIntrinsic(IntrinsicInst &I) { + void handleBinarySdSsIntrinsic(IntrinsicInst &I) { IRBuilder<> IRB(&I); + unsigned Width = + cast(I.getArgOperand(0)->getType())->getNumElements(); Value *First = getShadow(&I, 0); Value *Second = getShadow(&I, 1); Value *OrShadow = IRB.CreateOr(First, Second); - // High word of first operand, low word of both OR'd together - Value *Shadow = IRB.CreateShuffleVector(First, OrShadow, - llvm::makeArrayRef({2, 1})); + // First element of both OR'd together, remaining elements of first operand + SmallVector Mask; + Mask.push_back(Width); + for (unsigned i = 1; i < Width; i++) + Mask.push_back(i); + Value *Shadow = IRB.CreateShuffleVector(First, OrShadow, Mask); setShadow(&I, Shadow); setOriginForNaryOp(I); @@ -3547,11 +3505,14 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case Intrinsic::x86_sse41_round_sd: - handleUnarySdIntrinsic(I); + case Intrinsic::x86_sse41_round_ss: + handleUnarySdSsIntrinsic(I); break; case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse_max_ss: case Intrinsic::x86_sse2_min_sd: - handleBinarySdIntrinsic(I); + case Intrinsic::x86_sse_min_ss: + handleBinarySdSsIntrinsic(I); break; case Intrinsic::fshl: @@ -3630,7 +3591,7 @@ struct MemorySanitizerVisitor : public InstVisitor { } void visitCallBase(CallBase &CB) { - assert(!CB.getMetadata("nosanitize")); + assert(!CB.getMetadata(LLVMContext::MD_nosanitize)); if (CB.isInlineAsm()) { // For inline asm (either a call to asm function, or callbr instruction), 
// do the usual thing: check argument shadow and mark all outputs as @@ -4083,8 +4044,9 @@ struct MemorySanitizerVisitor : public InstVisitor { // Nothing to do here. } - void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB, - const DataLayout &DL, bool isOutput) { + void instrumentAsmArgument(Value *Operand, Type *ElemTy, Instruction &I, + IRBuilder<> &IRB, const DataLayout &DL, + bool isOutput) { // For each assembly argument, we check its value for being initialized. // If the argument is a pointer, we assume it points to a single element // of the corresponding type (or to a 8-byte word, if the type is unsized). @@ -4096,10 +4058,9 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(!isOutput); return; } - Type *ElType = OpType->getPointerElementType(); - if (!ElType->isSized()) + if (!ElemTy->isSized()) return; - int Size = DL.getTypeStoreSize(ElType); + int Size = DL.getTypeStoreSize(ElemTy); Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy()); Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size); IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal}); @@ -4159,14 +4120,16 @@ struct MemorySanitizerVisitor : public InstVisitor { // that we won't overwrite uninit values before checking them. for (int i = OutputArgs; i < NumOperands; i++) { Value *Operand = CB->getOperand(i); - instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false); + instrumentAsmArgument(Operand, CB->getParamElementType(i), I, IRB, DL, + /*isOutput*/ false); } // Unpoison output arguments. This must happen before the actual InlineAsm // call, so that the shadow for memory published in the asm() statement // remains valid. for (int i = 0; i < OutputArgs; i++) { Value *Operand = CB->getOperand(i); - instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true); + instrumentAsmArgument(Operand, CB->getParamElementType(i), I, IRB, DL, + /*isOutput*/ true); } setShadow(&I, getCleanShadow(&I)); @@ -4885,8 +4848,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper { assert(A->getType()->isPointerTy()); Type *RealTy = CB.getParamByValType(ArgNo); uint64_t ArgSize = DL.getTypeAllocSize(RealTy); - MaybeAlign ArgAlign = CB.getParamAlign(ArgNo); - if (!ArgAlign || *ArgAlign < Align(8)) + Align ArgAlign = CB.getParamAlign(ArgNo).value_or(Align(8)); + if (ArgAlign < 8) ArgAlign = Align(8); VAArgOffset = alignTo(VAArgOffset, ArgAlign); if (!IsFixed) { @@ -4902,27 +4865,27 @@ struct VarArgPowerPC64Helper : public VarArgHelper { kShadowTLSAlignment, ArgSize); } } - VAArgOffset += alignTo(ArgSize, 8); + VAArgOffset += alignTo(ArgSize, Align(8)); } else { Value *Base; uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); - uint64_t ArgAlign = 8; + Align ArgAlign = Align(8); if (A->getType()->isArrayTy()) { // Arrays are aligned to element size, except for long double // arrays, which are aligned to 8 bytes. Type *ElementTy = A->getType()->getArrayElementType(); if (!ElementTy->isPPC_FP128Ty()) - ArgAlign = DL.getTypeAllocSize(ElementTy); + ArgAlign = Align(DL.getTypeAllocSize(ElementTy)); } else if (A->getType()->isVectorTy()) { // Vectors are naturally aligned. 
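The handleUnarySdSsIntrinsic/handleBinarySdSsIntrinsic hunks further up generalize a hard-coded {2, 1} mask to {Width, 1, ..., Width-1}: lane 0 of the shadow comes from the second vector, all other lanes from the first, which is what lets one handler serve both the _sd and _ss intrinsics. A standalone simulation of that shufflevector mask:

// Indices >= Width address the second operand, matching LLVM shufflevector
// semantics; here Width elements are simulated with plain vectors.
#include <cstdio>
#include <vector>

static std::vector<int> shuffle(const std::vector<int> &First,
                                const std::vector<int> &Second) {
  unsigned Width = First.size();
  std::vector<int> Mask;
  Mask.push_back(Width);            // lane 0 <- Second[0]
  for (unsigned I = 1; I < Width; ++I)
    Mask.push_back(I);              // lane I <- First[I]
  std::vector<int> Out;
  for (int Idx : Mask)
    Out.push_back(Idx < (int)Width ? First[Idx] : Second[Idx - Width]);
  return Out;
}

int main() {
  auto R = shuffle({10, 11, 12, 13}, {20, 21, 22, 23}); // ss case, Width = 4
  for (int V : R)
    std::printf("%d ", V); // prints: 20 11 12 13
  std::printf("\n");
}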
- ArgAlign = DL.getTypeAllocSize(A->getType()); + ArgAlign = Align(ArgSize); } if (ArgAlign < 8) - ArgAlign = 8; + ArgAlign = Align(8); VAArgOffset = alignTo(VAArgOffset, ArgAlign); if (DL.isBigEndian()) { - // Adjusting the shadow for argument with size < 8 to match the placement - // of bits in big endian system + // Adjusting the shadow for argument with size < 8 to match the + // placement of bits in big endian system if (ArgSize < 8) VAArgOffset += (8 - ArgSize); } @@ -4933,7 +4896,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } VAArgOffset += ArgSize; - VAArgOffset = alignTo(VAArgOffset, 8); + VAArgOffset = alignTo(VAArgOffset, Align(8)); } if (IsFixed) VAArgBase = VAArgOffset; diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 0902a94452e3..3a29cd70e42e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -52,7 +52,6 @@ #include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -68,6 +67,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -94,8 +94,6 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/Support/BranchProbability.h" @@ -110,6 +108,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -173,14 +172,14 @@ static cl::opt DisableValueProfiling("disable-vp", cl::init(false), // Command line option to set the maximum number of VP annotations to write to // the metadata for a single indirect call callsite. static cl::opt MaxNumAnnotations( - "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore, + "icp-max-annotations", cl::init(3), cl::Hidden, cl::desc("Max number of annotations for a single indirect " "call callsite")); // Command line option to set the maximum number of value annotations // to write to the metadata for a single memop intrinsic. 
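The VarArgPowerPC64Helper hunks above move the shadow layout onto the Align type: each slot is rounded up to an at-least-8-byte alignment, and on big-endian targets a sub-8-byte argument is shifted to the high end of its 8-byte slot. A standalone sketch of those two rules:

// alignTo rounds an offset up to a boundary; slotOffset applies the PPC64
// va_arg shadow rules from the hunk above.
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static uint64_t slotOffset(uint64_t Offset, uint64_t ArgSize,
                           uint64_t ArgAlign, bool BigEndian) {
  if (ArgAlign < 8)
    ArgAlign = 8;
  Offset = alignTo(Offset, ArgAlign);
  if (BigEndian && ArgSize < 8)
    Offset += 8 - ArgSize; // bits sit in the high bytes of the slot
  return Offset;
}

int main() {
  std::printf("%llu\n", (unsigned long long)slotOffset(12, 4, 4, true));  // 20
  std::printf("%llu\n", (unsigned long long)slotOffset(12, 4, 4, false)); // 16
}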
static cl::opt MaxNumMemOPAnnotations( - "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore, + "memop-max-annotations", cl::init(4), cl::Hidden, cl::desc("Max number of preicise value annotations for a single memop" "intrinsic")); @@ -256,7 +255,7 @@ static cl::opt PGOInstrumentEntry( cl::desc("Force to instrument function entry basicblock.")); static cl::opt PGOFunctionEntryCoverage( - "pgo-function-entry-coverage", cl::init(false), cl::Hidden, cl::ZeroOrMore, + "pgo-function-entry-coverage", cl::Hidden, cl::desc( "Use this option to enable function entry coverage instrumentation.")); @@ -431,125 +430,8 @@ struct SelectInstVisitor : public InstVisitor { unsigned getNumOfSelectInsts() const { return NSIs; } }; - -class PGOInstrumentationGenLegacyPass : public ModulePass { -public: - static char ID; - - PGOInstrumentationGenLegacyPass(bool IsCS = false) - : ModulePass(ID), IsCS(IsCS) { - initializePGOInstrumentationGenLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "PGOInstrumentationGenPass"; } - -private: - // Is this is context-sensitive instrumentation. - bool IsCS; - bool runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - } -}; - -class PGOInstrumentationUseLegacyPass : public ModulePass { -public: - static char ID; - - // Provide the profile filename as the parameter. - PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false) - : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) { - if (!PGOTestProfileFile.empty()) - ProfileFileName = PGOTestProfileFile; - initializePGOInstrumentationUseLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "PGOInstrumentationUsePass"; } - -private: - std::string ProfileFileName; - // Is this is context-sensitive instrumentation use. - bool IsCS; - - bool runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } -}; - -class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass { -public: - static char ID; - StringRef getPassName() const override { - return "PGOInstrumentationGenCreateVarPass"; - } - PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "") - : ModulePass(ID), InstrProfileOutput(CSInstrName) { - initializePGOInstrumentationGenCreateVarLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - -private: - bool runOnModule(Module &M) override { - createProfileFileNameVar(M, InstrProfileOutput); - // The variable in a comdat may be discarded by LTO. Ensure the - // declaration will be retained. 
- appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true)); - return false; - } - std::string InstrProfileOutput; -}; - } // end anonymous namespace -char PGOInstrumentationGenLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen", - "PGO instrumentation.", false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen", - "PGO instrumentation.", false, false) - -ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) { - return new PGOInstrumentationGenLegacyPass(IsCS); -} - -char PGOInstrumentationUseLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use", - "Read PGO instrumentation profile.", false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use", - "Read PGO instrumentation profile.", false, false) - -ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename, - bool IsCS) { - return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS); -} - -char PGOInstrumentationGenCreateVarLegacyPass::ID = 0; - -INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass, - "pgo-instr-gen-create-var", - "Create PGO instrumentation version variable for CSPGO.", false, - false) - -ModulePass * -llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) { - return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName)); -} - namespace { /// An MST based instrumentation for PGO @@ -940,7 +822,7 @@ static void instrumentOneFunc( bool IsCS) { // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. - SplitIndirectBrCriticalEdges(F, BPI, BFI); + SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI, BFI); FuncPGOInstrumentation FuncInfo( F, TLI, ComdatMembers, true, BPI, BFI, IsCS, PGOInstrumentEntry); @@ -1457,6 +1339,7 @@ void PGOUseFunc::populateCounters() { } LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n"); + (void) NumPasses; #ifndef NDEBUG // Assert every BB has a valid counter. 
for (auto &BB : F) { @@ -1697,22 +1580,6 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::all(); } -bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) { - if (skipModule(M)) - return false; - - auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { - return this->getAnalysis().getTLI(F); - }; - auto LookupBPI = [this](Function &F) { - return &this->getAnalysis(F).getBPI(); - }; - auto LookupBFI = [this](Function &F) { - return &this->getAnalysis(F).getBFI(); - }; - return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS); -} - PreservedAnalyses PGOInstrumentationGen::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); @@ -1740,7 +1607,7 @@ static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI, BlockFrequencyInfo NBFI(F, NBPI, LI); #ifndef NDEBUG auto BFIEntryCount = F.getEntryCount(); - assert(BFIEntryCount.hasValue() && (BFIEntryCount->getCount() > 0) && + assert(BFIEntryCount && (BFIEntryCount->getCount() > 0) && "Invalid BFI Entrycount"); #endif auto SumCount = APFloat::getZero(APFloat::IEEEdouble()); @@ -1752,7 +1619,7 @@ static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI, continue; auto BFICount = NBFI.getBlockProfileCount(&BBI); CountValue = Func.getBBInfo(&BBI).CountValue; - BFICountValue = BFICount.getValue(); + BFICountValue = *BFICount; SumCount.add(APFloat(CountValue * 1.0), APFloat::rmNearestTiesToEven); SumBFICount.add(APFloat(BFICountValue * 1.0), APFloat::rmNearestTiesToEven); } @@ -1805,7 +1672,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI, NonZeroBBNum++; auto BFICount = NBFI.getBlockProfileCount(&BBI); if (BFICount) - BFICountValue = BFICount.getValue(); + BFICountValue = *BFICount; if (HotBBOnly) { bool rawIsHot = CountValue >= HotCountThreshold; @@ -1929,7 +1796,7 @@ static bool annotateAllFunctions( auto *BFI = LookupBFI(F); // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. 
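In the populateCounters() hunk further up, NumPasses is now only read inside LLVM_DEBUG, so a (void) cast was added to keep release builds from warning about an unused variable. A standalone illustration; the NDEBUG macro below merely stands in for the LLVM_DEBUG machinery:

// The void cast is a no-op "use" that silences -Wunused-variable when the
// only real use is compiled out.
#include <cstdio>

#ifndef NDEBUG
#define DEBUG_ONLY(X) do { X; } while (false)
#else
#define DEBUG_ONLY(X) do { } while (false)
#endif

int main() {
  unsigned NumPasses = 0;
  for (int I = 0; I < 3; ++I)
    ++NumPasses;
  DEBUG_ONLY(std::printf("Populate counts in %u passes.\n", NumPasses));
  (void)NumPasses; // keeps NDEBUG builds warning-free
  return 0;
}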
- SplitIndirectBrCriticalEdges(F, BPI, BFI); + SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI, BFI); PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS, InstrumentFuncEntry); // When AllMinusOnes is true, it means the profile for the function @@ -2073,25 +1940,6 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, return PreservedAnalyses::none(); } -bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { - if (skipModule(M)) - return false; - - auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { - return this->getAnalysis().getTLI(F); - }; - auto LookupBPI = [this](Function &F) { - return &this->getAnalysis(F).getBPI(); - }; - auto LookupBFI = [this](Function &F) { - return &this->getAnalysis(F).getBFI(); - }; - - auto *PSI = &getAnalysis().getPSI(); - return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI, - LookupBFI, PSI, IsCS); -} - static std::string getSimpleNodeName(const BasicBlock *Node) { if (!Node->getName().empty()) return std::string(Node->getName()); @@ -2117,6 +1965,8 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, dbgs() << W << " "; } dbgs() << "\n";); + misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false); + TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index d4b78f2c14b0..b11f16894669 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -20,7 +20,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" @@ -29,15 +28,11 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" #define INSTR_PROF_VALUE_PROF_MEMOP_API #include "llvm/ProfileData/InstrProfData.inc" @@ -46,8 +41,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/WithColor.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include @@ -63,8 +56,7 @@ STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); // The minimum call count to optimize memory intrinsic calls. static cl::opt - MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, - cl::init(1000), + MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), cl::desc("The minimum count to optimize memory " "intrinsic calls")); @@ -76,14 +68,13 @@ static cl::opt DisableMemOPOPT("disable-memop-opt", cl::init(false), // The percent threshold to optimize memory intrinsic calls. 
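setProfMetadata() above now runs the misexpect check before attaching branch weights exactly as before. A hedged sketch of the metadata attachment at its core, assuming LLVM 15; TI and Weights would come from the surrounding pass:

// Sketch: attach "branch_weights" profile metadata to a terminator.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

static void setBranchWeights(Instruction *TI, ArrayRef<uint32_t> Weights) {
  MDBuilder MDB(TI->getContext());
  // MD_prof carries the branch_weights node later consumed by BPI and PGO.
  TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
}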
static cl::opt MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), - cl::Hidden, cl::ZeroOrMore, + cl::Hidden, cl::desc("The percentage threshold for the " "memory intrinsic calls optimization")); // Maximum number of versions for optimizing memory intrinsic call. static cl::opt MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, - cl::ZeroOrMore, cl::desc("The max version for the optimized memory " " intrinsic calls")); @@ -102,43 +93,6 @@ static cl::opt MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value")); -namespace { -class PGOMemOPSizeOptLegacyPass : public FunctionPass { -public: - static char ID; - - PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { - initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "PGOMemOPSize"; } - -private: - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); - AU.addRequired(); - } -}; -} // end anonymous namespace - -char PGOMemOPSizeOptLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) - -FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { - return new PGOMemOPSizeOptLegacyPass(); -} - namespace { static const char *getMIName(const MemIntrinsic *MI) { @@ -517,20 +471,6 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, return MemOPSizeOpt.isChanged(); } -bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { - BlockFrequencyInfo &BFI = - getAnalysis().getBFI(); - auto &ORE = getAnalysis().getORE(); - auto *DTWP = getAnalysisIfAvailable(); - DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - TargetLibraryInfo &TLI = - getAnalysis().getTLI(F); - return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); -} - -namespace llvm { -char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; - PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, FunctionAnalysisManager &FAM) { auto &BFI = FAM.getResult(F); @@ -544,4 +484,3 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, PA.preserve(); return PA; } -} // namespace llvm diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index fc5267261851..0e39fe266369 100644 --- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -60,15 +60,9 @@ #include "llvm/Transforms/Instrumentation/PoisonChecking.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index d3b60c7add34..d9d11cc90d3d 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -13,30 +13,24 @@ #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/VirtualFileSystem.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -247,8 +241,7 @@ private: Type *Ty); void SetNoSanitizeMetadata(Instruction *I) { - I->setMetadata(I->getModule()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); + I->setMetadata(LLVMContext::MD_nosanitize, MDNode::get(*C, None)); } std::string getSectionName(const std::string &Section) const; @@ -694,7 +687,7 @@ void ModuleSanitizerCoverage::instrumentFunction( for (auto &Inst : BB) { if (Options.IndirectCalls) { CallBase *CB = dyn_cast(&Inst); - if (CB && !CB->getCalledFunction()) + if (CB && CB->isIndirectCall()) IndirCalls.push_back(&Inst); } if (Options.TraceCmp) { @@ -996,15 +989,11 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, // if we aren't splitting the block, it's nice for allocas to be before // calls. 
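Two small migrations sit in the SanitizerCoverage hunks above: nosanitize metadata is set through the fixed LLVMContext::MD_nosanitize kind ID instead of a string lookup, and indirect calls are found with CallBase::isIndirectCall(), which, unlike a null called function, is not fooled by inline asm or other non-function callees. A sketch of both, assuming LLVM 15:

// Sketch: fixed metadata kind ID plus the tighter indirect-call test.
#include "llvm/ADT/None.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

static void markNoSanitize(Instruction *I) {
  I->setMetadata(LLVMContext::MD_nosanitize,
                 MDNode::get(I->getContext(), None));
}

static bool isInterestingIndirectCall(Instruction &Inst) {
  auto *CB = dyn_cast<CallBase>(&Inst);
  return CB && CB->isIndirectCall(); // excludes inline asm, direct calls
}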
IP = PrepareToSplitEntryBlock(BB, IP); - } else { - EntryLoc = IP->getDebugLoc(); - if (!EntryLoc) - if (auto *SP = F.getSubprogram()) - EntryLoc = DILocation::get(SP->getContext(), 0, 0, SP); } - IRBuilder<> IRB(&*IP); - IRB.SetCurrentDebugLocation(EntryLoc); + InstrumentationIRBuilder IRB(&*IP); + if (EntryLoc) + IRB.SetCurrentDebugLocation(EntryLoc); if (Options.TracePC) { IRB.CreateCall(SanCovTracePC) ->setCannotMerge(); // gets the PC using GET_CALLER_PC. diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 180012198c42..c33b1b3b1a5c 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -174,19 +173,6 @@ private: FunctionCallee MemmoveFn, MemcpyFn, MemsetFn; }; -struct ThreadSanitizerLegacyPass : FunctionPass { - ThreadSanitizerLegacyPass() : FunctionPass(ID) { - initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry()); - } - StringRef getPassName() const override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnFunction(Function &F) override; - bool doInitialization(Module &M) override; - static char ID; // Pass identification, replacement for typeid. -private: - Optional TSan; -}; - void insertModuleCtor(Module &M) { getOrCreateSanitizerCtorAndInitFunctions( M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, @@ -195,7 +181,6 @@ void insertModuleCtor(Module &M) { // time. Hook them into the global ctors list in that case: [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); } - } // namespace PreservedAnalyses ThreadSanitizerPass::run(Function &F, @@ -211,38 +196,6 @@ PreservedAnalyses ModuleThreadSanitizerPass::run(Module &M, insertModuleCtor(M); return PreservedAnalyses::none(); } - -char ThreadSanitizerLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan", - "ThreadSanitizer: detects data races.", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan", - "ThreadSanitizer: detects data races.", false, false) - -StringRef ThreadSanitizerLegacyPass::getPassName() const { - return "ThreadSanitizerLegacyPass"; -} - -void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); -} - -bool ThreadSanitizerLegacyPass::doInitialization(Module &M) { - insertModuleCtor(M); - TSan.emplace(); - return true; -} - -bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis().getTLI(F); - TSan->sanitizeFunction(F, TLI); - return true; -} - -FunctionPass *llvm::createThreadSanitizerLegacyPassPass() { - return new ThreadSanitizerLegacyPass(); -} - void ThreadSanitizer::initialize(Module &M) { const DataLayout &DL = M.getDataLayout(); IntptrTy = DL.getIntPtrType(M.getContext()); @@ -527,26 +480,22 @@ void ThreadSanitizer::chooseInstructionsToInstrument( Local.clear(); } -static bool isAtomic(Instruction *I) { +static bool isTsanAtomic(const Instruction *I) { // TODO: Ask TTI whether synchronization scope is between threads. 
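The isTsanAtomic() rewrite that continues below folds a chain of per-opcode dyn_casts into one query: getAtomicSyncScopeID() returns None for non-atomic instructions, and atomic loads and stores in single-thread scope are still skipped. A sketch of the resulting predicate, assuming LLVM 15:

// Sketch of the new predicate; getAtomicSyncScopeID is the real LLVM API.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static bool isTsanAtomicSketch(const Instruction *I) {
  auto SSID = getAtomicSyncScopeID(I); // None => not atomic at all
  if (!SSID)
    return false;
  if (isa<LoadInst>(I) || isa<StoreInst>(I))
    return *SSID != SyncScope::SingleThread; // skip single-thread scope
  return true; // RMW, cmpxchg, and fences are always interesting
}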
- if (LoadInst *LI = dyn_cast(I)) - return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread; - if (StoreInst *SI = dyn_cast(I)) - return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread; - if (isa(I)) - return true; - if (isa(I)) - return true; - if (isa(I)) - return true; - return false; + auto SSID = getAtomicSyncScopeID(I); + if (!SSID) + return false; + if (isa(I) || isa(I)) + return SSID.getValue() != SyncScope::SingleThread; + return true; } void ThreadSanitizer::InsertRuntimeIgnores(Function &F) { - IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); IRB.CreateCall(TsanIgnoreBegin); EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions); while (IRBuilder<> *AtExit = EE.Next()) { + InstrumentationIRBuilder::ensureDebugInfo(*AtExit, F); AtExit->CreateCall(TsanIgnoreEnd); } } @@ -581,7 +530,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Traverse all instructions, collect loads/stores/returns, check for calls. for (auto &BB : F) { for (auto &Inst : BB) { - if (isAtomic(&Inst)) + if (isTsanAtomic(&Inst)) AtomicAccesses.push_back(&Inst); else if (isa(Inst) || isa(Inst)) LocalLoadsAndStores.push_back(&Inst); @@ -629,7 +578,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { - IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); Value *ReturnAddress = IRB.CreateCall( Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), IRB.getInt32(0)); @@ -637,6 +586,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); while (IRBuilder<> *AtExit = EE.Next()) { + InstrumentationIRBuilder::ensureDebugInfo(*AtExit, F); AtExit->CreateCall(TsanFuncExit, {}); } Res = true; @@ -646,7 +596,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II, const DataLayout &DL) { - IRBuilder<> IRB(II.Inst); + InstrumentationIRBuilder IRB(II.Inst); const bool IsWrite = isa(*II.Inst); Value *Addr = IsWrite ? cast(II.Inst)->getPointerOperand() : cast(II.Inst)->getPointerOperand(); @@ -686,8 +636,8 @@ bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II, return true; } - const unsigned Alignment = IsWrite ? cast(II.Inst)->getAlignment() - : cast(II.Inst)->getAlignment(); + const Align Alignment = IsWrite ? 
cast(II.Inst)->getAlign() + : cast(II.Inst)->getAlign(); const bool IsCompoundRW = ClCompoundReadBeforeWrite && (II.Flags & InstructionInfo::kCompoundRW); const bool IsVolatile = ClDistinguishVolatile && @@ -697,7 +647,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II, const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); FunctionCallee OnAccessFunc = nullptr; - if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) { + if (Alignment >= Align(8) || (Alignment.value() % (TypeSize / 8)) == 0) { if (IsCompoundRW) OnAccessFunc = TsanCompoundRW[Idx]; else if (IsVolatile) @@ -775,7 +725,7 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) { // http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { - IRBuilder<> IRB(I); + InstrumentationIRBuilder IRB(I); if (LoadInst *LI = dyn_cast(I)) { Value *Addr = LI->getPointerOperand(); Type *OrigTy = LI->getType(); diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp index fb6216bb2177..32633bbc941b 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -10,12 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "ValueProfileCollector.h" #include "ValueProfilePlugins.inc" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/InitializePasses.h" -#include +#include "llvm/ProfileData/InstrProf.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h index 584a60ab451e..10e5e4d128b1 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -16,7 +16,6 @@ #ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H #define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/ProfileData/InstrProf.h" #include #include @@ -25,6 +24,7 @@ namespace llvm { class Function; class Instruction; +class TargetLibraryInfo; class Value; /// Utility analysis that determines what values are worth profiling. 
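// --- Illustrative aside -----------------------------------------------------
// Several hunks above (ValueProfileCollector.h, ThreadSanitizer.cpp, etc.)
// swap #includes for forward declarations. A minimal sketch of why that is
// enough, with invented names: a header that mentions a type only by pointer
// or reference never needs the type's definition, so dropping the include
// cuts rebuild cascades for every file that includes the header.

// report.h (hypothetical)
class Widget;                  // forward declaration suffices here
void report(const Widget &W);  // only names the type, never peeks inside

// report.cpp (hypothetical) would then do:
//   #include "widget.h"       // full definition needed only where members
//   ...                       // of Widget are actually accessed
// -----------------------------------------------------------------------------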
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc index 6a2c473a596a..3a129de1acd0 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -15,6 +15,7 @@ #include "ValueProfileCollector.h" #include "llvm/Analysis/IndirectCallVisitor.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/InstVisitor.h" using namespace llvm; diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 126845bb3308..70f150c9461a 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -16,7 +16,6 @@ #include "llvm-c/Initialization.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 62f88a8cc02b..2bc0c8f87d77 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -22,7 +22,6 @@ #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H #define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H -#include "ARCRuntimeEntryPoints.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/Analysis/ObjCARCUtil.h" diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index 210ec60f2f87..03e5fb18d5ac 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -23,11 +23,14 @@ /// //===----------------------------------------------------------------------===// -#include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/ObjCARC.h" diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 2985ae004d3c..f64c26ef2bed 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -102,11 +102,8 @@ public: }; class ObjCARCContractLegacyPass : public FunctionPass { - ObjCARCContract OCARCC; - public: void getAnalysisUsage(AnalysisUsage &AU) const override; - bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; static char ID; @@ -737,11 +734,9 @@ Pass *llvm::createObjCARCContractPass() { return new ObjCARCContractLegacyPass(); } -bool ObjCARCContractLegacyPass::doInitialization(Module &M) { - return OCARCC.init(M); -} - bool ObjCARCContractLegacyPass::runOnFunction(Function &F) { + ObjCARCContract OCARCC; + OCARCC.init(*F.getParent()); auto *AA = &getAnalysis().getAAResults(); auto *DT = &getAnalysis().getDomTree(); return OCARCC.run(F, AA, DT); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 6b074ac5adab..efcdc51ef5e3 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -22,7 +22,7 @@ /// 
//===----------------------------------------------------------------------===// -#include "ObjCARC.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index 1cda206a7e14..cdf9de8d78d5 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -35,7 +35,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index e4ec5f266eb8..9571e99dfb19 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,8 +15,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Instructions.h" -#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -26,12 +24,11 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp index a5e65ffc45fe..155f47b49357 100644 --- a/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp +++ b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp @@ -16,11 +16,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/MemoryOpRemark.h" diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 95de59fa8262..cc12033fb677 100644 --- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -57,6 +57,7 @@ #include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IntrinsicInst.h" @@ -65,7 +66,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -123,8 +123,8 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) { return false; } -typedef std::pair ConditionTy; -typedef SmallVector ConditionsTy; +using ConditionTy = std::pair; +using ConditionsTy = SmallVector; /// If From has a conditional jump to To, add the 
condition to Conditions, /// if it is relevant to any argument at CB. @@ -301,10 +301,9 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI, /// Note that in case any arguments at the call-site are constrained by its /// predecessors, new call-sites with more constrained arguments will be /// created in createCallSitesOnPredicatedArgument(). -static void splitCallSite( - CallBase &CB, - const SmallVectorImpl> &Preds, - DomTreeUpdater &DTU) { +static void splitCallSite(CallBase &CB, + ArrayRef> Preds, + DomTreeUpdater &DTU) { BasicBlock *TailBB = CB.getParent(); bool IsMustTailCall = CB.isMustTailCall(); diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 25e8c3ef3b48..8a1761505d59 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -52,6 +52,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 13963657d183..6dfa2440023f 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -19,15 +19,16 @@ #include "llvm/Analysis/ConstraintSystem.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Scalar.h" #include @@ -42,48 +43,129 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated", "Controls which conditions are eliminated"); static int64_t MaxConstraintValue = std::numeric_limits::max(); +static int64_t MinSignedConstraintValue = std::numeric_limits::min(); namespace { -struct ConstraintTy { - SmallVector Coefficients; - ConstraintTy(SmallVector Coefficients) - : Coefficients(Coefficients) {} +class ConstraintInfo; - unsigned size() const { return Coefficients.size(); } +struct StackEntry { + unsigned NumIn; + unsigned NumOut; + bool IsNot; + bool IsSigned = false; + /// Variables that can be removed from the system once the stack entry gets + /// removed. + SmallVector ValuesToRelease; + + StackEntry(unsigned NumIn, unsigned NumOut, bool IsNot, bool IsSigned, + SmallVector ValuesToRelease) + : NumIn(NumIn), NumOut(NumOut), IsNot(IsNot), IsSigned(IsSigned), + ValuesToRelease(ValuesToRelease) {} }; -/// Struct to manage a list of constraints. -struct ConstraintListTy { - SmallVector Constraints; +/// Struct to express a pre-condition of the form %Op0 Pred %Op1. 
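// --- Illustrative aside -----------------------------------------------------
// StackEntry above records dominator-tree DFS numbers (NumIn/NumOut) so that
// facts can be popped once the traversal leaves their scope. A standalone
// model of that interval check (invented names, not the LLVM API): a fact
// recorded at a node with DFS interval [In, Out] applies exactly to the nodes
// whose own interval nests inside it.
#include <cassert>

struct DFSInterval { unsigned In, Out; };

bool factInScope(DFSInterval Fact, DFSInterval Node) {
  return Fact.In <= Node.In && Node.Out <= Fact.Out;
}

int main() {
  DFSInterval Root{1, 10}, Then{2, 5}, Else{6, 9}; // a diamond CFG's intervals
  assert(factInScope(Then, {3, 4}));  // fact from Then: valid in nested block
  assert(!factInScope(Then, Else));   // sibling branch: fact must be popped
  assert(factInScope(Root, Else));    // fact from the root: still in scope
}
// -----------------------------------------------------------------------------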
+struct PreconditionTy { + CmpInst::Predicate Pred; + Value *Op0; + Value *Op1; - ConstraintListTy() {} + PreconditionTy(CmpInst::Predicate Pred, Value *Op0, Value *Op1) + : Pred(Pred), Op0(Op0), Op1(Op1) {} +}; - ConstraintListTy(const SmallVector &Constraints) - : Constraints(Constraints) {} +struct ConstraintTy { + SmallVector Coefficients; + SmallVector Preconditions; - void mergeIn(const ConstraintListTy &Other) { - append_range(Constraints, Other.Constraints); - } + bool IsSigned = false; + bool IsEq = false; + + ConstraintTy() = default; - unsigned size() const { return Constraints.size(); } + ConstraintTy(SmallVector Coefficients, bool IsSigned) + : Coefficients(Coefficients), IsSigned(IsSigned) {} + + unsigned size() const { return Coefficients.size(); } - unsigned empty() const { return Constraints.empty(); } + unsigned empty() const { return Coefficients.empty(); } /// Returns true if any constraint has a non-zero coefficient for any of the /// newly added indices. Zero coefficients for new indices are removed. If it /// returns true, no new variable need to be added to the system. bool needsNewIndices(const DenseMap &NewIndices) { - assert(size() == 1); for (unsigned I = 0; I < NewIndices.size(); ++I) { - int64_t Last = get(0).Coefficients.pop_back_val(); + int64_t Last = Coefficients.pop_back_val(); if (Last != 0) return true; } return false; } - ConstraintTy &get(unsigned I) { return Constraints[I]; } + /// Returns true if all preconditions for this list of constraints are + /// satisfied given \p CS and the corresponding \p Value2Index mapping. + bool isValid(const ConstraintInfo &Info) const; +}; + +/// Wrapper encapsulating separate constraint systems and corresponding value +/// mappings for both unsigned and signed information. Facts are added to and +/// conditions are checked against the corresponding system depending on the +/// signed-ness of their predicates. While the information is kept separate +/// based on signed-ness, certain conditions can be transferred between the two +/// systems. +class ConstraintInfo { + DenseMap UnsignedValue2Index; + DenseMap SignedValue2Index; + + ConstraintSystem UnsignedCS; + ConstraintSystem SignedCS; + +public: + DenseMap &getValue2Index(bool Signed) { + return Signed ? SignedValue2Index : UnsignedValue2Index; + } + const DenseMap &getValue2Index(bool Signed) const { + return Signed ? SignedValue2Index : UnsignedValue2Index; + } + + ConstraintSystem &getCS(bool Signed) { + return Signed ? SignedCS : UnsignedCS; + } + const ConstraintSystem &getCS(bool Signed) const { + return Signed ? SignedCS : UnsignedCS; + } + + void popLastConstraint(bool Signed) { getCS(Signed).popLastConstraint(); } + void popLastNVariables(bool Signed, unsigned N) { + getCS(Signed).popLastNVariables(N); + } + + bool doesHold(CmpInst::Predicate Pred, Value *A, Value *B) const; + + void addFact(CmpInst::Predicate Pred, Value *A, Value *B, bool IsNegated, + unsigned NumIn, unsigned NumOut, + SmallVectorImpl &DFSInStack); + + /// Turn a comparison of the form \p Op0 \p Pred \p Op1 into a vector of + /// constraints, using indices from the corresponding constraint system. + /// Additional indices for newly discovered values are added to \p NewIndices. + ConstraintTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, + DenseMap &NewIndices) const; + + /// Turn a condition \p CmpI into a vector of constraints, using indices from + /// the corresponding constraint system. Additional indices for newly + /// discovered values are added to \p NewIndices. 
+ ConstraintTy getConstraint(CmpInst *Cmp, + DenseMap &NewIndices) const { + return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), + Cmp->getOperand(1), NewIndices); + } + + /// Try to add information from \p A \p Pred \p B to the unsigned/signed + /// system if \p Pred is signed/unsigned. + void transferToOtherSystem(CmpInst::Predicate Pred, Value *A, Value *B, + bool IsNegated, unsigned NumIn, unsigned NumOut, + SmallVectorImpl &DFSInStack); }; } // namespace @@ -92,11 +174,28 @@ struct ConstraintListTy { // sum of the pairs equals \p V. The first pair is the constant-factor and X // must be nullptr. If the expression cannot be decomposed, returns an empty // vector. -static SmallVector, 4> decompose(Value *V) { +static SmallVector, 4> +decompose(Value *V, SmallVector &Preconditions, + bool IsSigned) { + + auto CanUseSExt = [](ConstantInt *CI) { + const APInt &Val = CI->getValue(); + return Val.sgt(MinSignedConstraintValue) && Val.slt(MaxConstraintValue); + }; + // Decompose \p V used with a signed predicate. + if (IsSigned) { + if (auto *CI = dyn_cast(V)) { + if (CanUseSExt(CI)) + return {{CI->getSExtValue(), nullptr}}; + } + + return {{0, nullptr}, {1, V}}; + } + if (auto *CI = dyn_cast(V)) { - if (CI->isNegative() || CI->uge(MaxConstraintValue)) + if (CI->uge(MaxConstraintValue)) return {}; - return {{CI->getSExtValue(), nullptr}}; + return {{CI->getZExtValue(), nullptr}}; } auto *GEP = dyn_cast(V); if (GEP && GEP->getNumOperands() == 2 && GEP->isInBounds()) { @@ -106,11 +205,13 @@ static SmallVector, 4> decompose(Value *V) { // If the index is zero-extended, it is guaranteed to be positive. if (match(GEP->getOperand(GEP->getNumOperands() - 1), m_ZExt(m_Value(Op0)))) { - if (match(Op0, m_NUWShl(m_Value(Op1), m_ConstantInt(CI)))) + if (match(Op0, m_NUWShl(m_Value(Op1), m_ConstantInt(CI))) && + CanUseSExt(CI)) return {{0, nullptr}, {1, GEP->getPointerOperand()}, {std::pow(int64_t(2), CI->getSExtValue()), Op1}}; - if (match(Op0, m_NSWAdd(m_Value(Op1), m_ConstantInt(CI)))) + if (match(Op0, m_NSWAdd(m_Value(Op1), m_ConstantInt(CI))) && + CanUseSExt(CI)) return {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}, {1, Op1}}; @@ -118,17 +219,19 @@ static SmallVector, 4> decompose(Value *V) { } if (match(GEP->getOperand(GEP->getNumOperands() - 1), m_ConstantInt(CI)) && - !CI->isNegative()) + !CI->isNegative() && CanUseSExt(CI)) return {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}}; SmallVector, 4> Result; if (match(GEP->getOperand(GEP->getNumOperands() - 1), - m_NUWShl(m_Value(Op0), m_ConstantInt(CI)))) + m_NUWShl(m_Value(Op0), m_ConstantInt(CI))) && + CanUseSExt(CI)) Result = {{0, nullptr}, {1, GEP->getPointerOperand()}, {std::pow(int64_t(2), CI->getSExtValue()), Op0}}; else if (match(GEP->getOperand(GEP->getNumOperands() - 1), - m_NSWAdd(m_Value(Op0), m_ConstantInt(CI)))) + m_NSWAdd(m_Value(Op0), m_ConstantInt(CI))) && + CanUseSExt(CI)) Result = {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}, {1, Op0}}; @@ -136,6 +239,10 @@ static SmallVector, 4> decompose(Value *V) { Op0 = GEP->getOperand(GEP->getNumOperands() - 1); Result = {{0, nullptr}, {1, GEP->getPointerOperand()}, {1, Op0}}; } + // If Op0 is signed non-negative, the GEP is increasing monotonically and + // can be de-composed. 
+ Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, + ConstantInt::get(Op0->getType(), 0)); return Result; } @@ -145,12 +252,20 @@ static SmallVector, 4> decompose(Value *V) { Value *Op1; ConstantInt *CI; - if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI)))) + if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI))) && + !CI->uge(MaxConstraintValue)) + return {{CI->getZExtValue(), nullptr}, {1, Op0}}; + if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && + CanUseSExt(CI)) { + Preconditions.emplace_back( + CmpInst::ICMP_UGE, Op0, + ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); return {{CI->getSExtValue(), nullptr}, {1, Op0}}; + } if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1)))) return {{0, nullptr}, {1, Op0}, {1, Op1}}; - if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI)))) + if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))) && CanUseSExt(CI)) return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}}; if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) return {{0, nullptr}, {1, Op0}, {-1, Op1}}; @@ -158,73 +273,73 @@ static SmallVector, 4> decompose(Value *V) { return {{0, nullptr}, {1, V}}; } -/// Turn a condition \p CmpI into a vector of constraints, using indices from \p -/// Value2Index. Additional indices for newly discovered values are added to \p -/// NewIndices. -static ConstraintListTy -getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, - const DenseMap &Value2Index, - DenseMap &NewIndices) { - int64_t Offset1 = 0; - int64_t Offset2 = 0; - - // First try to look up \p V in Value2Index and NewIndices. Otherwise add a - // new entry to NewIndices. - auto GetOrAddIndex = [&Value2Index, &NewIndices](Value *V) -> unsigned { - auto V2I = Value2Index.find(V); - if (V2I != Value2Index.end()) - return V2I->second; - auto NewI = NewIndices.find(V); - if (NewI != NewIndices.end()) - return NewI->second; - auto Insert = - NewIndices.insert({V, Value2Index.size() + NewIndices.size() + 1}); - return Insert.first->second; - }; - - if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE) - return getConstraint(CmpInst::getSwappedPredicate(Pred), Op1, Op0, - Value2Index, NewIndices); - - if (Pred == CmpInst::ICMP_EQ) { - if (match(Op1, m_Zero())) - return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, - NewIndices); - - auto A = - getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices); - auto B = - getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices); - A.mergeIn(B); - return A; +ConstraintTy +ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, + DenseMap &NewIndices) const { + bool IsEq = false; + // Try to convert Pred to one of ULE/ULT/SLE/SLT. + switch (Pred) { + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: { + Pred = CmpInst::getSwappedPredicate(Pred); + std::swap(Op0, Op1); + break; } - - if (Pred == CmpInst::ICMP_NE && match(Op1, m_Zero())) { - return getConstraint(CmpInst::ICMP_UGT, Op0, Op1, Value2Index, NewIndices); + case CmpInst::ICMP_EQ: + if (match(Op1, m_Zero())) { + Pred = CmpInst::ICMP_ULE; + } else { + IsEq = true; + Pred = CmpInst::ICMP_ULE; + } + break; + case CmpInst::ICMP_NE: + if (!match(Op1, m_Zero())) + return {}; + Pred = CmpInst::getSwappedPredicate(CmpInst::ICMP_UGT); + std::swap(Op0, Op1); + break; + default: + break; } // Only ULE and ULT predicates are supported at the moment.
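// --- Illustrative aside -----------------------------------------------------
// A standalone sketch of the canonicalization the rewritten getConstraint()
// performs above, using a plain enum instead of the LLVM API: >-flavored
// predicates are mirrored into their <-flavored forms by swapping operands,
// so the solver only ever has to handle ULE/ULT/SLE/SLT.
#include <cassert>
#include <utility>

enum Pred { ULT, ULE, UGT, UGE, SLT, SLE, SGT, SGE };

Pred swapped(Pred P) { // mirror the relation: (a P b) == (b swapped(P) a)
  switch (P) {
  case UGT: return ULT;
  case UGE: return ULE;
  case SGT: return SLT;
  case SGE: return SLE;
  case ULT: return UGT;
  case ULE: return UGE;
  case SLT: return SGT;
  case SLE: return SGE;
  }
  return P;
}

void canonicalize(Pred &P, int &A, int &B) {
  if (P == UGT || P == UGE || P == SGT || P == SGE) {
    P = swapped(P);
    std::swap(A, B);
  }
}

int main() {
  Pred P = SGT;
  int A = 7, B = 3;      // 7 s> 3
  canonicalize(P, A, B); // becomes 3 s< 7
  assert(P == SLT && A == 3 && B == 7);
}
// -----------------------------------------------------------------------------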
- if (Pred != CmpInst::ICMP_ULE && Pred != CmpInst::ICMP_ULT) + if (Pred != CmpInst::ICMP_ULE && Pred != CmpInst::ICMP_ULT && + Pred != CmpInst::ICMP_SLE && Pred != CmpInst::ICMP_SLT) return {}; - auto ADec = decompose(Op0->stripPointerCastsSameRepresentation()); - auto BDec = decompose(Op1->stripPointerCastsSameRepresentation()); + SmallVector Preconditions; + bool IsSigned = CmpInst::isSigned(Pred); + auto &Value2Index = getValue2Index(IsSigned); + auto ADec = decompose(Op0->stripPointerCastsSameRepresentation(), + Preconditions, IsSigned); + auto BDec = decompose(Op1->stripPointerCastsSameRepresentation(), + Preconditions, IsSigned); // Skip if decomposing either of the values failed. if (ADec.empty() || BDec.empty()) return {}; - // Skip trivial constraints without any variables. - if (ADec.size() == 1 && BDec.size() == 1) - return {}; - - Offset1 = ADec[0].first; - Offset2 = BDec[0].first; + int64_t Offset1 = ADec[0].first; + int64_t Offset2 = BDec[0].first; Offset1 *= -1; // Create iterator ranges that skip the constant-factor. auto VariablesA = llvm::drop_begin(ADec); auto VariablesB = llvm::drop_begin(BDec); + // First try to look up \p V in Value2Index and NewIndices. Otherwise add a + // new entry to NewIndices. + auto GetOrAddIndex = [&Value2Index, &NewIndices](Value *V) -> unsigned { + auto V2I = Value2Index.find(V); + if (V2I != Value2Index.end()) + return V2I->second; + auto Insert = + NewIndices.insert({V, Value2Index.size() + NewIndices.size() + 1}); + return Insert.first->second; + }; + // Make sure all variables have entries in Value2Index or NewIndices. for (const auto &KV : concat>(VariablesA, VariablesB)) @@ -232,22 +347,85 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, // Build result constraint, by first adding all coefficients from A and then // subtracting all coefficients from B. - SmallVector R(Value2Index.size() + NewIndices.size() + 1, 0); + ConstraintTy Res( + SmallVector(Value2Index.size() + NewIndices.size() + 1, 0), + IsSigned); + Res.IsEq = IsEq; + auto &R = Res.Coefficients; for (const auto &KV : VariablesA) R[GetOrAddIndex(KV.second)] += KV.first; for (const auto &KV : VariablesB) R[GetOrAddIndex(KV.second)] -= KV.first; - R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0); - return {{R}}; + int64_t OffsetSum; + if (AddOverflow(Offset1, Offset2, OffsetSum)) + return {}; + if (Pred == (IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT)) + if (AddOverflow(OffsetSum, int64_t(-1), OffsetSum)) + return {}; + R[0] = OffsetSum; + Res.Preconditions = std::move(Preconditions); + return Res; +} + +bool ConstraintTy::isValid(const ConstraintInfo &Info) const { + return Coefficients.size() > 0 && + all_of(Preconditions, [&Info](const PreconditionTy &C) { + return Info.doesHold(C.Pred, C.Op0, C.Op1); + }); +} + +bool ConstraintInfo::doesHold(CmpInst::Predicate Pred, Value *A, + Value *B) const { + DenseMap NewIndices; + auto R = getConstraint(Pred, A, B, NewIndices); + + if (!NewIndices.empty()) + return false; + + // TODO: properly check NewIndices. 
+ return NewIndices.empty() && R.Preconditions.empty() && !R.IsEq && + !R.empty() && + getCS(CmpInst::isSigned(Pred)).isConditionImplied(R.Coefficients); } -static ConstraintListTy -getConstraint(CmpInst *Cmp, const DenseMap &Value2Index, - DenseMap &NewIndices) { - return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Value2Index, NewIndices); +void ConstraintInfo::transferToOtherSystem( + CmpInst::Predicate Pred, Value *A, Value *B, bool IsNegated, unsigned NumIn, + unsigned NumOut, SmallVectorImpl &DFSInStack) { + // Check if we can combine facts from the signed and unsigned systems to + // derive additional facts. + if (!A->getType()->isIntegerTy()) + return; + // FIXME: This currently depends on the order we add facts. Ideally we + // would first add all known facts and only then try to add additional + // facts. + switch (Pred) { + default: + break; + case CmpInst::ICMP_ULT: + // If B is a signed positive constant, A >=s 0 and A getType(), 0))) { + addFact(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0), + IsNegated, NumIn, NumOut, DFSInStack); + addFact(CmpInst::ICMP_SLT, A, B, IsNegated, NumIn, NumOut, DFSInStack); + } + break; + case CmpInst::ICMP_SLT: + if (doesHold(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0))) + addFact(CmpInst::ICMP_ULT, A, B, IsNegated, NumIn, NumOut, DFSInStack); + break; + case CmpInst::ICMP_SGT: + if (doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), -1))) + addFact(CmpInst::ICMP_UGE, A, ConstantInt::get(B->getType(), 0), + IsNegated, NumIn, NumOut, DFSInStack); + break; + case CmpInst::ICMP_SGE: + if (doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0))) { + addFact(CmpInst::ICMP_UGE, A, B, IsNegated, NumIn, NumOut, DFSInStack); + } + break; + } } namespace { @@ -271,134 +449,253 @@ struct ConstraintOrBlock { Not(Not), Condition(Condition) {} }; -struct StackEntry { - unsigned NumIn; - unsigned NumOut; - CmpInst *Condition; - bool IsNot; +/// Keep state required to build worklist. +struct State { + DominatorTree &DT; + SmallVector WorkList; - StackEntry(unsigned NumIn, unsigned NumOut, CmpInst *Condition, bool IsNot) - : NumIn(NumIn), NumOut(NumOut), Condition(Condition), IsNot(IsNot) {} + State(DominatorTree &DT) : DT(DT) {} + + /// Process block \p BB and add known facts to work-list. + void addInfoFor(BasicBlock &BB); + + /// Returns true if we can add a known condition from BB to its successor + /// block Succ. Each predecessor of Succ can either be BB or be dominated + /// by Succ (e.g. the case when adding a condition from a pre-header to a + /// loop header). 
+ bool canAddSuccessor(BasicBlock &BB, BasicBlock *Succ) const { + if (BB.getSingleSuccessor()) { + assert(BB.getSingleSuccessor() == Succ); + return DT.properlyDominates(&BB, Succ); + } + return any_of(successors(&BB), + [Succ](const BasicBlock *S) { return S != Succ; }) && + all_of(predecessors(Succ), [&BB, Succ, this](BasicBlock *Pred) { + return Pred == &BB || DT.dominates(Succ, Pred); + }); + } }; + } // namespace #ifndef NDEBUG -static void dumpWithNames(ConstraintTy &C, +static void dumpWithNames(const ConstraintSystem &CS, DenseMap &Value2Index) { SmallVector Names(Value2Index.size(), ""); for (auto &KV : Value2Index) { Names[KV.second - 1] = std::string("%") + KV.first->getName().str(); } - ConstraintSystem CS; - CS.addVariableRowFill(C.Coefficients); CS.dump(Names); } -#endif -static bool eliminateConstraints(Function &F, DominatorTree &DT) { - bool Changed = false; - DT.updateDFSNumbers(); +static void dumpWithNames(ArrayRef C, + DenseMap &Value2Index) { ConstraintSystem CS; + CS.addVariableRowFill(C); + dumpWithNames(CS, Value2Index); +} +#endif - SmallVector WorkList; - - // First, collect conditions implied by branches and blocks with their - // Dominator DFS in and out numbers. - for (BasicBlock &BB : F) { - if (!DT.getNode(&BB)) - continue; - WorkList.emplace_back(DT.getNode(&BB)); - - // True as long as long as the current instruction is guaranteed to execute. - bool GuaranteedToExecute = true; - // Scan BB for assume calls. - // TODO: also use this scan to queue conditions to simplify, so we can - // interleave facts from assumes and conditions to simplify in a single - // basic block. And to skip another traversal of each basic block when - // simplifying. - for (Instruction &I : BB) { - Value *Cond; - // For now, just handle assumes with a single compare as condition. - if (match(&I, m_Intrinsic(m_Value(Cond))) && - isa(Cond)) { - if (GuaranteedToExecute) { - // The assume is guaranteed to execute when BB is entered, hence Cond - // holds on entry to BB. - WorkList.emplace_back(DT.getNode(&BB), cast(Cond), false); - } else { - // Otherwise the condition only holds in the successors. - for (BasicBlock *Succ : successors(&BB)) - WorkList.emplace_back(DT.getNode(Succ), cast(Cond), false); +void State::addInfoFor(BasicBlock &BB) { + WorkList.emplace_back(DT.getNode(&BB)); + + // True as long as the current instruction is guaranteed to execute. + bool GuaranteedToExecute = true; + // Scan BB for assume calls. + // TODO: also use this scan to queue conditions to simplify, so we can + // interleave facts from assumes and conditions to simplify in a single + // basic block. And to skip another traversal of each basic block when + // simplifying. + for (Instruction &I : BB) { + Value *Cond; + // For now, just handle assumes with a single compare as condition. + if (match(&I, m_Intrinsic(m_Value(Cond))) && + isa(Cond)) { + if (GuaranteedToExecute) { + // The assume is guaranteed to execute when BB is entered, hence Cond + // holds on entry to BB. + WorkList.emplace_back(DT.getNode(&BB), cast(Cond), false); + } else { + // Otherwise the condition only holds in the successors.
+ for (BasicBlock *Succ : successors(&BB)) { + if (!canAddSuccessor(BB, Succ)) + continue; + WorkList.emplace_back(DT.getNode(Succ), cast(Cond), false); } } - GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); } + GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); + } - auto *Br = dyn_cast(BB.getTerminator()); - if (!Br || !Br->isConditional()) - continue; + auto *Br = dyn_cast(BB.getTerminator()); + if (!Br || !Br->isConditional()) + return; + + // If the condition is an OR of 2 compares and the false successor only has + // the current block as predecessor, queue both negated conditions for the + // false successor. + Value *Op0, *Op1; + if (match(Br->getCondition(), m_LogicalOr(m_Value(Op0), m_Value(Op1))) && + isa(Op0) && isa(Op1)) { + BasicBlock *FalseSuccessor = Br->getSuccessor(1); + if (canAddSuccessor(BB, FalseSuccessor)) { + WorkList.emplace_back(DT.getNode(FalseSuccessor), cast(Op0), + true); + WorkList.emplace_back(DT.getNode(FalseSuccessor), cast(Op1), + true); + } + return; + } - // Returns true if we can add a known condition from BB to its successor - // block Succ. Each predecessor of Succ can either be BB or be dominated by - // Succ (e.g. the case when adding a condition from a pre-header to a loop - // header). - auto CanAdd = [&BB, &DT](BasicBlock *Succ) { - return all_of(predecessors(Succ), [&BB, &DT, Succ](BasicBlock *Pred) { - return Pred == &BB || DT.dominates(Succ, Pred); - }); - }; - // If the condition is an OR of 2 compares and the false successor only has - // the current block as predecessor, queue both negated conditions for the - // false successor. - Value *Op0, *Op1; - if (match(Br->getCondition(), m_LogicalOr(m_Value(Op0), m_Value(Op1))) && - match(Op0, m_Cmp()) && match(Op1, m_Cmp())) { - BasicBlock *FalseSuccessor = Br->getSuccessor(1); - if (CanAdd(FalseSuccessor)) { - WorkList.emplace_back(DT.getNode(FalseSuccessor), cast(Op0), - true); - WorkList.emplace_back(DT.getNode(FalseSuccessor), cast(Op1), - true); - } - continue; + // If the condition is an AND of 2 compares and the true successor only has + // the current block as predecessor, queue both conditions for the true + // successor. + if (match(Br->getCondition(), m_LogicalAnd(m_Value(Op0), m_Value(Op1))) && + isa(Op0) && isa(Op1)) { + BasicBlock *TrueSuccessor = Br->getSuccessor(0); + if (canAddSuccessor(BB, TrueSuccessor)) { + WorkList.emplace_back(DT.getNode(TrueSuccessor), cast(Op0), + false); + WorkList.emplace_back(DT.getNode(TrueSuccessor), cast(Op1), + false); } + return; + } - // If the condition is an AND of 2 compares and the true successor only has - // the current block as predecessor, queue both conditions for the true - // successor. 
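// --- Illustrative aside -----------------------------------------------------
// A standalone check (plain booleans, not the LLVM API) of the reasoning in
// the two comments above: for "br (c1 || c2) T, F", reaching F implies both
// !c1 and !c2; for "br (c1 && c2) T, F", reaching T implies both c1 and c2.
// On the opposite edges nothing is known about the individual conjuncts,
// which is why addInfoFor() queues facts for only one successor each.
#include <cassert>

int main() {
  for (bool C1 : {false, true})
    for (bool C2 : {false, true}) {
      if (!(C1 || C2)) // the OR's false edge
        assert(!C1 && !C2);
      if (C1 && C2)    // the AND's true edge
        assert(C1 && C2);
    }
  // Counterexample for the other edges: c1=false, c2=true reaches the OR's
  // true edge with c1 false, and the AND's false edge with c2 true.
}
// -----------------------------------------------------------------------------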
- if (match(Br->getCondition(), m_LogicalAnd(m_Value(Op0), m_Value(Op1))) && - match(Op0, m_Cmp()) && match(Op1, m_Cmp())) { - BasicBlock *TrueSuccessor = Br->getSuccessor(0); - if (CanAdd(TrueSuccessor)) { - WorkList.emplace_back(DT.getNode(TrueSuccessor), cast(Op0), - false); - WorkList.emplace_back(DT.getNode(TrueSuccessor), cast(Op1), - false); + auto *CmpI = dyn_cast(Br->getCondition()); + if (!CmpI) + return; + if (canAddSuccessor(BB, Br->getSuccessor(0))) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false); + if (canAddSuccessor(BB, Br->getSuccessor(1))) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true); +} + +void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, + bool IsNegated, unsigned NumIn, unsigned NumOut, + SmallVectorImpl &DFSInStack) { + // If the constraint has a pre-condition, skip the constraint if it does not + // hold. + DenseMap NewIndices; + auto R = getConstraint(Pred, A, B, NewIndices); + if (!R.isValid(*this)) + return; + + //LLVM_DEBUG(dbgs() << "Adding " << *Condition << " " << IsNegated << "\n"); + bool Added = false; + assert(CmpInst::isSigned(Pred) == R.IsSigned && + "condition and constraint signs must match"); + auto &CSToUse = getCS(R.IsSigned); + if (R.Coefficients.empty()) + return; + + Added |= CSToUse.addVariableRowFill(R.Coefficients); + + // If R has been added to the system, queue it for removal once it goes + // out-of-scope. + if (Added) { + SmallVector ValuesToRelease; + for (auto &KV : NewIndices) { + getValue2Index(R.IsSigned).insert(KV); + ValuesToRelease.push_back(KV.first); + } + + LLVM_DEBUG({ + dbgs() << " constraint: "; + dumpWithNames(R.Coefficients, getValue2Index(R.IsSigned)); + }); + + DFSInStack.emplace_back(NumIn, NumOut, IsNegated, R.IsSigned, + ValuesToRelease); + + if (R.IsEq) { + // Also add the inverted constraint for equality constraints. + for (auto &Coeff : R.Coefficients) + Coeff *= -1; + CSToUse.addVariableRowFill(R.Coefficients); + + DFSInStack.emplace_back(NumIn, NumOut, IsNegated, R.IsSigned, + SmallVector()); + } + } +} + +static void +tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info, + SmallVectorImpl &ToRemove) { + auto DoesConditionHold = [](CmpInst::Predicate Pred, Value *A, Value *B, + ConstraintInfo &Info) { + DenseMap NewIndices; + auto R = Info.getConstraint(Pred, A, B, NewIndices); + if (R.size() < 2 || R.needsNewIndices(NewIndices) || !R.isValid(Info)) + return false; + + auto &CSToUse = Info.getCS(CmpInst::isSigned(Pred)); + return CSToUse.isConditionImplied(R.Coefficients); + }; + + if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) { + // If A s>= B && B s>= 0, ssub.with.overflow(a, b) should not overflow and + // can be simplified to a regular sub. 
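// --- Illustrative aside -----------------------------------------------------
// A standalone sketch (using the GCC/Clang __builtin_sub_overflow intrinsic,
// not the LLVM API) of why the proven facts A s>= B and B s>= 0 let
// tryToSimplifyOverflowMath() below drop the overflow check: the difference
// then lies in [0, A], which is always representable, so the intrinsic's
// overflow bit is statically false and a plain sub suffices.
#include <cassert>
#include <cstdint>

bool ssubOverflows(int64_t A, int64_t B) {
  int64_t R;
  return __builtin_sub_overflow(A, B, &R); // the flag ssub.with.overflow yields
}

int main() {
  // Under the preconditions (A >= B >= 0) no overflow is possible:
  assert(!ssubOverflows(INT64_MAX, 5));
  assert(!ssubOverflows(7, 7));
  // Dropping the B >= 0 precondition would make the rewrite unsound:
  assert(ssubOverflows(INT64_MAX, -1));
}
// -----------------------------------------------------------------------------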
+ Value *A = II->getArgOperand(0); + Value *B = II->getArgOperand(1); + if (!DoesConditionHold(CmpInst::ICMP_SGE, A, B, Info) || + !DoesConditionHold(CmpInst::ICMP_SGE, B, + ConstantInt::get(A->getType(), 0), Info)) + return; + + IRBuilder<> Builder(II->getParent(), II->getIterator()); + Value *Sub = nullptr; + for (User *U : make_early_inc_range(II->users())) { + if (match(U, m_ExtractValue<0>(m_Value()))) { + if (!Sub) + Sub = Builder.CreateSub(A, B); + U->replaceAllUsesWith(Sub); + } else if (match(U, m_ExtractValue<1>(m_Value()))) + U->replaceAllUsesWith(Builder.getFalse()); + else + continue; + + if (U->use_empty()) { + auto *I = cast(U); + ToRemove.push_back(I); + I->setOperand(0, PoisonValue::get(II->getType())); } - continue; } - auto *CmpI = dyn_cast(Br->getCondition()); - if (!CmpI) + if (II->use_empty()) + II->eraseFromParent(); + } +} + +static bool eliminateConstraints(Function &F, DominatorTree &DT) { + bool Changed = false; + DT.updateDFSNumbers(); + + ConstraintInfo Info; + State S(DT); + + // First, collect conditions implied by branches and blocks with their + // Dominator DFS in and out numbers. + for (BasicBlock &BB : F) { + if (!DT.getNode(&BB)) continue; - if (CanAdd(Br->getSuccessor(0))) - WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false); - if (CanAdd(Br->getSuccessor(1))) - WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true); + S.addInfoFor(BB); } // Next, sort worklist by dominance, so that dominating blocks and conditions // come before blocks and conditions dominated by them. If a block and a // condition have the same numbers, the condition comes before the block, as // it holds on entry to the block. - sort(WorkList, [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) { + stable_sort(S.WorkList, [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) { return std::tie(A.NumIn, A.IsBlock) < std::tie(B.NumIn, B.IsBlock); }); + SmallVector ToRemove; + // Finally, process ordered worklist and eliminate implied conditions. SmallVector DFSInStack; - DenseMap Value2Index; - for (ConstraintOrBlock &CB : WorkList) { + for (ConstraintOrBlock &CB : S.WorkList) { // First, pop entries from the stack that are out-of-scope for CB. Remove // the corresponding entry from the constraint system. while (!DFSInStack.empty()) { @@ -409,10 +706,20 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { assert(E.NumIn <= CB.NumIn); if (CB.NumOut <= E.NumOut) break; - LLVM_DEBUG(dbgs() << "Removing " << *E.Condition << " " << E.IsNot - << "\n"); + LLVM_DEBUG({ + dbgs() << "Removing "; + dumpWithNames(Info.getCS(E.IsSigned).getLastConstraint(), + Info.getValue2Index(E.IsSigned)); + dbgs() << "\n"; + }); + + Info.popLastConstraint(E.IsSigned); + // Remove variables in the system that went out of scope. + auto &Mapping = Info.getValue2Index(E.IsSigned); + for (Value *V : E.ValuesToRelease) + Mapping.erase(V); + Info.popLastNVariables(E.IsSigned, E.ValuesToRelease.size()); DFSInStack.pop_back(); - CS.popLastConstraint(); } LLVM_DEBUG({ @@ -427,28 +734,30 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { // For a block, check if any CmpInsts become known based on the current set // of constraints. 
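// --- Illustrative aside -----------------------------------------------------
// A simplified model of what "implied by dominating constraints" means below
// (ConstraintSystem itself uses Fourier-Motzkin elimination; this shows only
// the underlying arithmetic): facts are rows {c0, cx, cy} encoding
// cx*x + cy*y <= c0, and any sum of valid rows is again valid, so deriving
// the row of a later compare proves that compare true.
#include <cassert>
#include <cstddef>
#include <vector>

using Row = std::vector<long>;

Row addRows(const Row &A, const Row &B) {
  Row R(A.size());
  for (std::size_t I = 0; I < A.size(); ++I)
    R[I] = A[I] + B[I];
  return R;
}

int main() {
  Row XleY = {0, 1, -1}; // fact: x - y <= 0, i.e. x <= y
  Row Yle5 = {5, 0, 1};  // fact: y <= 5
  // Their sum is {5, 1, 0}, i.e. x <= 5: a dominated "x <= 5" icmp folds to
  // true.
  assert((addRows(XleY, Yle5) == Row{5, 1, 0}));
}
// -----------------------------------------------------------------------------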
if (CB.IsBlock) { - for (Instruction &I : *CB.BB) { - auto *Cmp = dyn_cast(&I); + for (Instruction &I : make_early_inc_range(*CB.BB)) { + if (auto *II = dyn_cast(&I)) { + tryToSimplifyOverflowMath(II, Info, ToRemove); + continue; + } + auto *Cmp = dyn_cast(&I); if (!Cmp) continue; DenseMap NewIndices; - auto R = getConstraint(Cmp, Value2Index, NewIndices); - if (R.size() != 1) - continue; - - if (R.needsNewIndices(NewIndices)) + auto R = Info.getConstraint(Cmp, NewIndices); + if (R.IsEq || R.empty() || R.needsNewIndices(NewIndices) || + !R.isValid(Info)) continue; - if (CS.isConditionImplied(R.get(0).Coefficients)) { + auto &CSToUse = Info.getCS(R.IsSigned); + if (CSToUse.isConditionImplied(R.Coefficients)) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; - LLVM_DEBUG(dbgs() << "Condition " << *Cmp - << " implied by dominating constraints\n"); LLVM_DEBUG({ - for (auto &E : reverse(DFSInStack)) - dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + dbgs() << "Condition " << *Cmp + << " implied by dominating constraints\n"; + dumpWithNames(CSToUse, Info.getValue2Index(R.IsSigned)); }); Cmp->replaceUsesWithIf( ConstantInt::getTrue(F.getParent()->getContext()), [](Use &U) { @@ -460,16 +769,15 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { NumCondsRemoved++; Changed = true; } - if (CS.isConditionImplied( - ConstraintSystem::negate(R.get(0).Coefficients))) { + if (CSToUse.isConditionImplied( + ConstraintSystem::negate(R.Coefficients))) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; - LLVM_DEBUG(dbgs() << "Condition !" << *Cmp - << " implied by dominating constraints\n"); LLVM_DEBUG({ - for (auto &E : reverse(DFSInStack)) - dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + dbgs() << "Condition !" << *Cmp + << " implied by dominating constraints\n"; + dumpWithNames(CSToUse, Info.getValue2Index(R.IsSigned)); }); Cmp->replaceAllUsesWith( ConstantInt::getFalse(F.getParent()->getContext())); @@ -482,7 +790,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { // Set up a function to restore the predicate at the end of the scope if it // has been negated. Negate the predicate in-place, if required. - auto *CI = dyn_cast(CB.Condition); + auto *CI = dyn_cast(CB.Condition); auto PredicateRestorer = make_scope_exit([CI, &CB]() { if (CB.Not && CI) CI->setPredicate(CI->getInversePredicate()); @@ -496,34 +804,28 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { } } - // Otherwise, add the condition to the system and stack, if we can transform - // it into a constraint. - DenseMap NewIndices; - auto R = getConstraint(CB.Condition, Value2Index, NewIndices); - if (R.empty()) - continue; - - for (auto &KV : NewIndices) - Value2Index.insert(KV); - - LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); - bool Added = false; - for (auto &C : R.Constraints) { - auto Coeffs = C.Coefficients; - LLVM_DEBUG({ - dbgs() << " constraint: "; - dumpWithNames(C, Value2Index); - }); - Added |= CS.addVariableRowFill(Coeffs); - // If R has been added to the system, queue it for removal once it goes - // out-of-scope. - if (Added) - DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not); + ICmpInst::Predicate Pred; + Value *A, *B; + if (match(CB.Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + // Otherwise, add the condition to the system and stack, if we can + // transform it into a constraint. 
+ Info.addFact(Pred, A, B, CB.Not, CB.NumIn, CB.NumOut, DFSInStack); + Info.transferToOtherSystem(Pred, A, B, CB.Not, CB.NumIn, CB.NumOut, + DFSInStack); } } - assert(CS.size() == DFSInStack.size() && +#ifndef NDEBUG + unsigned SignedEntries = + count_if(DFSInStack, [](const StackEntry &E) { return E.IsSigned; }); + assert(Info.getCS(false).size() == DFSInStack.size() - SignedEntries && + "updates to CS and DFSInStack are out of sync"); + assert(Info.getCS(true).size() == SignedEntries && "updates to CS and DFSInStack are out of sync"); +#endif + + for (Instruction *I : ToRemove) + I->eraseFromParent(); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index a3fd97079b1d..64bd4241f37c 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -41,8 +41,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include @@ -215,6 +213,53 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, return true; } +static Value *getValueOnEdge(LazyValueInfo *LVI, Value *Incoming, + BasicBlock *From, BasicBlock *To, + Instruction *CxtI) { + if (Constant *C = LVI->getConstantOnEdge(Incoming, From, To, CxtI)) + return C; + + // Look if the incoming value is a select with a scalar condition for which + // LVI can tell us the value. In that case replace the incoming value with + // the appropriate value of the select. This often allows us to remove the + // select later. + auto *SI = dyn_cast(Incoming); + if (!SI) + return nullptr; + + // Once LVI learns to handle vector types, we could also add support + // for vector type constants that are not all zeroes or all ones. + Value *Condition = SI->getCondition(); + if (!Condition->getType()->isVectorTy()) { + if (Constant *C = LVI->getConstantOnEdge(Condition, From, To, CxtI)) { + if (C->isOneValue()) + return SI->getTrueValue(); + if (C->isZeroValue()) + return SI->getFalseValue(); + } + } + + // Look if the select has a constant but LVI tells us that the incoming + // value can never be that constant. In that case replace the incoming + // value with the other value of the select. This often allows us to + // remove the select later. + + // The "false" case + if (auto *C = dyn_cast(SI->getFalseValue())) + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == + LazyValueInfo::False) + return SI->getTrueValue(); + + // The "true" case, + // similar to the select "false" case, but try the select "true" value + if (auto *C = dyn_cast(SI->getTrueValue())) + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == + LazyValueInfo::False) + return SI->getFalseValue(); + + return nullptr; +} + static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, const SimplifyQuery &SQ) { bool Changed = false; @@ -224,53 +269,14 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, Value *Incoming = P->getIncomingValue(i); if (isa(Incoming)) continue; - Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P); - - // Look if the incoming value is a select with a scalar condition for which - // LVI can tells us the value. In that case replace the incoming value with - the appropriate value of the select.
This often allows us to remove the - // select later. - if (!V) { - SelectInst *SI = dyn_cast(Incoming); - if (!SI) continue; - - Value *Condition = SI->getCondition(); - if (!Condition->getType()->isVectorTy()) { - if (Constant *C = LVI->getConstantOnEdge( - Condition, P->getIncomingBlock(i), BB, P)) { - if (C->isOneValue()) { - V = SI->getTrueValue(); - } else if (C->isZeroValue()) { - V = SI->getFalseValue(); - } - // Once LVI learns to handle vector types, we could also add support - // for vector type constants that are not all zeroes or all ones. - } - } - - // Look if the select has a constant but LVI tells us that the incoming - // value can never be that constant. In that case replace the incoming - // value with the other value of the select. This often allows us to - // remove the select later. - if (!V) { - Constant *C = dyn_cast(SI->getFalseValue()); - if (!C) continue; - - if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, - P->getIncomingBlock(i), BB, P) != - LazyValueInfo::False) - continue; - V = SI->getTrueValue(); - } - - LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n'); + Value *V = getValueOnEdge(LVI, Incoming, P->getIncomingBlock(i), BB, P); + if (V) { + P->setIncomingValue(i, V); + Changed = true; } - - P->setIncomingValue(i, V); - Changed = true; } - if (Value *V = SimplifyInstruction(P, SQ)) { + if (Value *V = simplifyInstruction(P, SQ)) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; @@ -575,7 +581,7 @@ static bool processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { StructType *ST = cast(WO->getType()); Constant *Struct = ConstantStruct::get(ST, - { UndefValue::get(ST->getElementType(0)), + { PoisonValue::get(ST->getElementType(0)), ConstantInt::getFalse(ST->getElementType(1)) }); Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); @@ -735,8 +741,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { // sdiv/srem is UB if divisor is -1 and divident is INT_MIN, so unless we can // prove that such a combination is impossible, we need to bump the bitwidth. if (CRs[1]->contains(APInt::getAllOnes(OrigWidth)) && - CRs[0]->contains( - APInt::getSignedMinValue(MinSignedBits).sextOrSelf(OrigWidth))) + CRs[0]->contains(APInt::getSignedMinValue(MinSignedBits).sext(OrigWidth))) ++MinSignedBits; // Don't shrink below 8 bits wide. 
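// --- Illustrative aside -----------------------------------------------------
// A standalone illustration (plain integers, not the LLVM API) of the
// INT_MIN / -1 hazard handled in the narrowSDivOrSRem() hunk above: at width
// N, signed division overflows only for SMIN / -1, since |SMIN| = 2^(N-1)
// exceeds SMAX = 2^(N-1) - 1. When value analysis cannot rule that pair out,
// the pass bumps MinSignedBits by one, which always makes the quotient
// representable.
#include <cassert>
#include <cstdint>

int64_t signedMin(unsigned Bits) { return -(int64_t(1) << (Bits - 1)); }
int64_t signedMax(unsigned Bits) { return (int64_t(1) << (Bits - 1)) - 1; }

int main() {
  // Computed at 64 bits to sidestep UB: the 8-bit SMIN / -1 quotient...
  int64_t Q = signedMin(8) / -1;
  assert(Q == 128 && Q > signedMax(8)); // ...does not fit in 8 signed bits,
  assert(Q <= signedMax(9));            // but always fits with one extra bit.
}
// -----------------------------------------------------------------------------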
@@ -955,7 +960,8 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), - SDI->getName(), SDI); + "", SDI); + BO->takeName(SDI); BO->setDebugLoc(SDI->getDebugLoc()); BO->setIsExact(SDI->isExact()); SDI->replaceAllUsesWith(BO); @@ -974,8 +980,8 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { return false; ++NumSExt; - auto *ZExt = - CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + auto *ZExt = CastInst::CreateZExtOrBitCast(Base, SDI->getType(), "", SDI); + ZExt->takeName(SDI); ZExt->setDebugLoc(SDI->getDebugLoc()); SDI->replaceAllUsesWith(ZExt); SDI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 143a78f604fc..5667eefabad5 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -60,30 +60,31 @@ #include "llvm/Transforms/Scalar/DFAJumpThreading.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include +#ifdef EXPENSIVE_CHECKS +#include "llvm/IR/Verifier.h" +#endif + using namespace llvm; #define DEBUG_TYPE "dfa-jump-threading" @@ -102,6 +103,11 @@ static cl::opt MaxPathLength( cl::desc("Max number of blocks searched to find a threading path"), cl::Hidden, cl::init(20)); +static cl::opt MaxNumPaths( + "dfa-max-num-paths", + cl::desc("Max number of paths enumerated around a switch"), + cl::Hidden, cl::init(200)); + static cl::opt CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), @@ -414,7 +420,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { struct MainSwitch { MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) { - if (isPredictable(SI)) { + if (isCandidate(SI)) { Instr = SI; } else { ORE->emit([&]() { @@ -432,83 +438,60 @@ struct MainSwitch { } private: - /// Do a use-def chain traversal. Make sure the value of the switch variable - /// is always a known constant. This means that all conditional jumps based on - /// switch variable can be converted to unconditional jumps. - bool isPredictable(const SwitchInst *SI) { - std::deque Q; + /// Do a use-def chain traversal starting from the switch condition to see if + /// \p SI is a potential candidate. + /// + /// Also, collect select instructions to unfold.
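// --- Illustrative aside -----------------------------------------------------
// A simplified model (invented Node type, not the LLVM API) of the
// isCandidate() walk defined just below: starting from the switch condition
// (which must be a phi), traverse phi/select operands breadth-first.
// Constants are fine, and, unlike the stricter isPredictable() this replaces,
// unknown operands no longer reject the switch; getStateDefMap() re-checks
// them later once the paths are known.
#include <cassert>
#include <deque>
#include <set>
#include <vector>

enum class Kind { Phi, Select, Const, Other };
struct Node {
  Kind K;
  std::vector<const Node *> Ops;
};

bool isCandidateSketch(const Node &Cond) {
  if (Cond.K != Kind::Phi)
    return false;
  std::deque<const Node *> Q{&Cond};
  std::set<const Node *> Seen{&Cond};
  while (!Q.empty()) {
    const Node *N = Q.front();
    Q.pop_front();
    if (N->K == Kind::Phi || N->K == Kind::Select)
      for (const Node *Op : N->Ops)
        if (Seen.insert(Op).second)
          Q.push_back(Op);
    // Kind::Const and Kind::Other are tolerated but not traversed further.
  }
  return true;
}

int main() {
  Node C{Kind::Const, {}}, Unknown{Kind::Other, {}};
  Node Sel{Kind::Select, {&C, &Unknown}};
  Node Phi{Kind::Phi, {&C, &Sel}};
  assert(isCandidateSketch(Phi)); // an unknown operand no longer disqualifies
  assert(!isCandidateSketch(C));  // a non-phi switch condition still does
}
// -----------------------------------------------------------------------------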
+ bool isCandidate(const SwitchInst *SI) { + std::deque Q; SmallSet SeenValues; SelectInsts.clear(); - Value *FirstDef = SI->getOperand(0); - auto *Inst = dyn_cast(FirstDef); - - // If this is a function argument or another non-instruction, then give up. - // We are interested in loop local variables. - if (!Inst) - return false; - - // Require the first definition to be a PHINode - if (!isa(Inst)) + Value *SICond = SI->getCondition(); + LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n"); + if (!isa(SICond)) return false; - LLVM_DEBUG(dbgs() << "\tisPredictable() FirstDef: " << *Inst << "\n"); - - Q.push_back(Inst); - SeenValues.insert(FirstDef); + addToQueue(SICond, Q, SeenValues); while (!Q.empty()) { - Instruction *Current = Q.front(); + Value *Current = Q.front(); Q.pop_front(); if (auto *Phi = dyn_cast(Current)) { for (Value *Incoming : Phi->incoming_values()) { - if (!isPredictableValue(Incoming, SeenValues)) - return false; - addInstToQueue(Incoming, Q, SeenValues); + addToQueue(Incoming, Q, SeenValues); } - LLVM_DEBUG(dbgs() << "\tisPredictable() phi: " << *Phi << "\n"); + LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n"); } else if (SelectInst *SelI = dyn_cast(Current)) { if (!isValidSelectInst(SelI)) return false; - if (!isPredictableValue(SelI->getTrueValue(), SeenValues) || - !isPredictableValue(SelI->getFalseValue(), SeenValues)) { - return false; - } - addInstToQueue(SelI->getTrueValue(), Q, SeenValues); - addInstToQueue(SelI->getFalseValue(), Q, SeenValues); - LLVM_DEBUG(dbgs() << "\tisPredictable() select: " << *SelI << "\n"); + addToQueue(SelI->getTrueValue(), Q, SeenValues); + addToQueue(SelI->getFalseValue(), Q, SeenValues); + LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n"); if (auto *SelIUse = dyn_cast(SelI->user_back())) SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse)); + } else if (isa(Current)) { + LLVM_DEBUG(dbgs() << "\tconst: " << *Current << "\n"); + continue; } else { - // If it is neither a phi nor a select, then we give up. - return false; + LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n"); + // Allow unpredictable values. The hope is that those will be the + // initial switch values that can be ignored (they will hit the + // unthreaded switch) but this assumption will get checked later after + // paths have been enumerated (in function getStateDefMap). + continue; } } return true; } - bool isPredictableValue(Value *InpVal, SmallSet &SeenValues) { - if (SeenValues.contains(InpVal)) - return true; - - if (isa(InpVal)) - return true; - - // If this is a function argument or another non-instruction, then give up. 
- if (!isa(InpVal)) - return false; - - return true; - } - - void addInstToQueue(Value *Val, std::deque &Q, - SmallSet &SeenValues) { + void addToQueue(Value *Val, std::deque &Q, + SmallSet &SeenValues) { if (SeenValues.contains(Val)) return; - if (Instruction *I = dyn_cast(Val)) - Q.push_back(I); + Q.push_back(Val); SeenValues.insert(Val); } @@ -562,7 +545,16 @@ struct AllSwitchPaths { void run() { VisitedBlocks Visited; PathsType LoopPaths = paths(SwitchBlock, Visited, /* PathDepth = */ 1); - StateDefMap StateDef = getStateDefMap(); + StateDefMap StateDef = getStateDefMap(LoopPaths); + + if (StateDef.empty()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", + Switch) + << "Switch instruction is not predictable."; + }); + return; + } for (PathType Path : LoopPaths) { ThreadingPath TPath; @@ -637,6 +629,9 @@ private: PathType NewPath(Path); NewPath.push_front(BB); Res.push_back(NewPath); + if (Res.size() >= MaxNumPaths) { + return Res; + } } } // This block could now be visited again from a different predecessor. Note @@ -647,14 +642,22 @@ private: } /// Walk the use-def chain and collect all the state-defining instructions. - StateDefMap getStateDefMap() const { + /// + /// Return an empty map if unpredictable values encountered inside the basic + /// blocks of \p LoopPaths. + StateDefMap getStateDefMap(const PathsType &LoopPaths) const { StateDefMap Res; + // Basic blocks belonging to any of the loops around the switch statement. + SmallPtrSet LoopBBs; + for (const PathType &Path : LoopPaths) { + for (BasicBlock *BB : Path) + LoopBBs.insert(BB); + } + Value *FirstDef = Switch->getOperand(0); - assert(isa(FirstDef) && "After select unfolding, all state " - "definitions are expected to be phi " - "nodes."); + assert(isa(FirstDef) && "The first definition must be a phi."); SmallVector Stack; Stack.push_back(dyn_cast(FirstDef)); @@ -666,15 +669,17 @@ private: Res[CurPhi->getParent()] = CurPhi; SeenValues.insert(CurPhi); - for (Value *Incoming : CurPhi->incoming_values()) { + for (BasicBlock *IncomingBB : CurPhi->blocks()) { + Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB); + bool IsOutsideLoops = LoopBBs.count(IncomingBB) == 0; if (Incoming == FirstDef || isa(Incoming) || - SeenValues.contains(Incoming)) { + SeenValues.contains(Incoming) || IsOutsideLoops) { continue; } - assert(isa(Incoming) && "After select unfolding, all state " - "definitions are expected to be phi " - "nodes."); + // Any unpredictable value inside the loops means we must bail out. + if (!isa(Incoming)) + return StateDefMap(); Stack.push_back(cast(Incoming)); } @@ -823,6 +828,16 @@ private: }); return false; } + + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "instructions with invalid cost.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch) + << "Contains instructions with invalid cost."; + }); + return false; + } } unsigned DuplicationCost = 0; @@ -836,7 +851,7 @@ private: // using binary search, hence the LogBase2(). unsigned CondBranches = APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); - DuplicationCost = Metrics.NumInsts / CondBranches; + DuplicationCost = *Metrics.NumInsts.getValue() / CondBranches; } else { // Compared with jump tables, the DFA optimizer removes an indirect branch // on each loop iteration, thus making branch prediction more precise. 
The @@ -844,7 +859,7 @@ private: // predictor to make a mistake, and the more benefit there is in the DFA // optimizer. Thus, the more branch targets there are, the lower is the // cost of the DFA opt. - DuplicationCost = Metrics.NumInsts / JumpTableSize; + DuplicationCost = *Metrics.NumInsts.getValue() / JumpTableSize; } LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " @@ -1197,7 +1212,7 @@ private: PhiToRemove.push_back(Phi); } for (PHINode *PN : PhiToRemove) { - PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->replaceAllUsesWith(PoisonValue::get(PN->getType())); PN->eraseFromParent(); } return; @@ -1246,7 +1261,7 @@ private: /// Returns true if IncomingBB is a predecessor of BB. bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { - return llvm::find(predecessors(BB), IncomingBB) != pred_end(BB); + return llvm::is_contained(predecessors(BB), IncomingBB); } AllSwitchPaths *SwitchPaths; @@ -1278,7 +1293,7 @@ bool DFAJumpThreading::run(Function &F) { continue; LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() - << " is predictable\n"); + << " is a candidate\n"); MainSwitch Switch(SI, ORE); if (!Switch.getInstr()) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index ae636e7b61f7..4c42869dbd58 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -38,7 +38,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -62,8 +64,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -75,7 +75,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" @@ -83,7 +82,6 @@ #include "llvm/Transforms/Utils/Local.h" #include #include -#include #include #include #include @@ -766,20 +764,27 @@ struct DSEState { // Post-order numbers for each basic block. Used to figure out if memory // accesses are executed before another access. DenseMap PostOrderNumbers; + // Values that are only used with assumes. Used to refine pointer escape + // analysis. + SmallPtrSet EphValues; /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. MapVector IOLs; + // Check if there are root nodes that are terminated by UnreachableInst. + // Those roots pessimize post-dominance queries. If there are such roots, + // fall back to CFG scan starting from all non-unreachable roots. + bool AnyUnreachableExit; // Class contains self-reference, make sure it's not copied/moved. 
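// The EphValues member added above caches "ephemeral" values: values that
// only feed llvm.assume and therefore should not make a pointer count as
// escaped. A minimal sketch of collecting and consulting them; the wrapper
// name mayBeCaptured is illustrative, and the EphValues-taking overload of
// PointerMayBeCaptured is the one this patch threads through DSE.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CodeMetrics.h"

static bool mayBeCaptured(llvm::Function &F, llvm::AssumptionCache &AC,
                          const llvm::Value *Ptr) {
  llvm::SmallPtrSet<const llvm::Value *, 16> EphValues;
  llvm::CodeMetrics::collectEphemeralValues(&F, &AC, EphValues);
  return llvm::PointerMayBeCaptured(Ptr, /*ReturnCaptures=*/true,
                                    /*StoreCaptures=*/false, EphValues);
}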
DSEState(const DSEState &) = delete; DSEState &operator=(const DSEState &) = delete; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI, - const LoopInfo &LI) - : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT), - PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { + PostDominatorTree &PDT, AssumptionCache &AC, + const TargetLibraryInfo &TLI, const LoopInfo &LI) + : F(F), AA(AA), EI(DT, LI, EphValues), BatchAA(AA, &EI), MSSA(MSSA), + DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -805,6 +810,12 @@ struct DSEState { // Collect whether there is any irreducible control flow in the function. ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); + + AnyUnreachableExit = any_of(PDT.roots(), [](const BasicBlock *E) { + return isa(E->getTerminator()); + }); + + CodeMetrics::collectEphemeralValues(&F, &AC, EphValues); } /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p @@ -951,7 +962,7 @@ struct DSEState { if (!isInvisibleToCallerOnUnwind(V)) { I.first->second = false; } else if (isNoAliasCall(V)) { - I.first->second = !PointerMayBeCaptured(V, true, false); + I.first->second = !PointerMayBeCaptured(V, true, false, EphValues); } } return I.first->second; @@ -970,7 +981,7 @@ struct DSEState { // with the killing MemoryDef. But we refrain from doing so for now to // limit compile-time and this does not cause any changes to the number // of stores removed on a large test set in practice. - I.first->second = PointerMayBeCaptured(V, false, true); + I.first->second = PointerMayBeCaptured(V, false, true, EphValues); return !I.first->second; } @@ -1003,7 +1014,8 @@ struct DSEState { if (CB->isLifetimeStartOrEnd()) return false; - return CB->use_empty() && CB->willReturn() && CB->doesNotThrow(); + return CB->use_empty() && CB->willReturn() && CB->doesNotThrow() && + !CB->isTerminator(); } return false; @@ -1233,6 +1245,9 @@ struct DSEState { // Reached TOP. if (MSSA.isLiveOnEntryDef(Current)) { LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n"); + if (CanOptimize && Current != KillingDef->getDefiningAccess()) + // The first clobbering def is... none. + KillingDef->setOptimized(Current); return None; } @@ -1309,7 +1324,6 @@ struct DSEState { // memory location and not located in different loops. if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); - WalkerStepLimit -= 1; CanOptimize = false; continue; } @@ -1508,54 +1522,56 @@ struct DSEState { CommonPred = PDT.findNearestCommonDominator(CommonPred, BB); } - // If CommonPred is in the set of killing blocks, just check if it - // post-dominates MaybeDeadAccess. - if (KillingBlocks.count(CommonPred)) { - if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) - return {MaybeDeadAccess}; - return None; - } - // If the common post-dominator does not post-dominate MaybeDeadAccess, // there is a path from MaybeDeadAccess to an exit not going through a // killing block. - if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) { - SetVector WorkList; - - // If CommonPred is null, there are multiple exits from the function. - // They all have to be added to the worklist. 
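// The worklist scan being restructured around this hunk, in isolation:
// starting from the function's exit blocks, walk predecessors backwards; if
// the dead-store candidate's block is reachable without first crossing a
// killing block, some path escapes and the store must be kept. Names and the
// limit below are illustrative, not the pass's real constants.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/CFG.h"

static bool allExitPathsKilled(
    llvm::ArrayRef<llvm::BasicBlock *> Exits,
    const llvm::SmallPtrSetImpl<llvm::BasicBlock *> &KillingBlocks,
    llvm::BasicBlock *DeadAccessBB, unsigned Limit = 50) {
  llvm::SetVector<llvm::BasicBlock *> WorkList(Exits.begin(), Exits.end());
  for (unsigned I = 0; I < WorkList.size(); ++I) {
    llvm::BasicBlock *Current = WorkList[I];
    if (KillingBlocks.count(Current))
      continue; // This path is already blocked by a killing store.
    if (Current == DeadAccessBB)
      return false; // Reached the candidate without crossing a kill.
    for (llvm::BasicBlock *Pred : llvm::predecessors(Current))
      WorkList.insert(Pred);
    if (WorkList.size() >= Limit)
      return false; // Give up conservatively on large CFGs.
  }
  return true;
}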
- if (CommonPred) - WorkList.insert(CommonPred); - else - for (BasicBlock *R : PDT.roots()) + if (!PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) { + if (!AnyUnreachableExit) + return None; + + // Fall back to CFG scan starting at all non-unreachable roots if not + // all paths to the exit go through CommonPred. + CommonPred = nullptr; + } + + // If CommonPred itself is in the set of killing blocks, we're done. + if (KillingBlocks.count(CommonPred)) + return {MaybeDeadAccess}; + + SetVector WorkList; + // If CommonPred is null, there are multiple exits from the function. + // They all have to be added to the worklist. + if (CommonPred) + WorkList.insert(CommonPred); + else + for (BasicBlock *R : PDT.roots()) { + if (!isa(R->getTerminator())) WorkList.insert(R); + } - NumCFGTries++; - // Check if all paths starting from an exit node go through one of the - // killing blocks before reaching MaybeDeadAccess. - for (unsigned I = 0; I < WorkList.size(); I++) { - NumCFGChecks++; - BasicBlock *Current = WorkList[I]; - if (KillingBlocks.count(Current)) - continue; - if (Current == MaybeDeadAccess->getBlock()) - return None; + NumCFGTries++; + // Check if all paths starting from an exit node go through one of the + // killing blocks before reaching MaybeDeadAccess. + for (unsigned I = 0; I < WorkList.size(); I++) { + NumCFGChecks++; + BasicBlock *Current = WorkList[I]; + if (KillingBlocks.count(Current)) + continue; + if (Current == MaybeDeadAccess->getBlock()) + return None; - // MaybeDeadAccess is reachable from the entry, so we don't have to - // explore unreachable blocks further. - if (!DT.isReachableFromEntry(Current)) - continue; + // MaybeDeadAccess is reachable from the entry, so we don't have to + // explore unreachable blocks further. + if (!DT.isReachableFromEntry(Current)) + continue; - for (BasicBlock *Pred : predecessors(Current)) - WorkList.insert(Pred); + for (BasicBlock *Pred : predecessors(Current)) + WorkList.insert(Pred); - if (WorkList.size() >= MemorySSAPathCheckLimit) - return None; - } - NumCFGSuccess++; - return {MaybeDeadAccess}; + if (WorkList.size() >= MemorySSAPathCheckLimit) + return None; } - return None; + NumCFGSuccess++; } // No aliasing MemoryUses of MaybeDeadAccess found, MaybeDeadAccess is @@ -1780,10 +1796,9 @@ struct DSEState { if (!isRemovable(DefI)) return false; - if (StoredConstant && isAllocationFn(DefUO, &TLI)) { - auto *CB = cast(DefUO); - auto *InitC = getInitialValueOfAllocation(CB, &TLI, - StoredConstant->getType()); + if (StoredConstant) { + Constant *InitC = + getInitialValueOfAllocation(DefUO, &TLI, StoredConstant->getType()); // If the clobbering access is LiveOnEntry, no instructions between them // can modify the memory location. 
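// The StoredConstant path above in isolation: a store is removable when it
// merely re-writes the value the allocation already starts with (for
// example, storing zero into calloc'ed memory), provided nothing clobbers
// the location in between. A sketch of the core test; the helper name
// storesInitialValue is illustrative.
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constants.h"

static bool storesInitialValue(const llvm::Value *Alloc,
                               const llvm::Constant *StoredC,
                               const llvm::TargetLibraryInfo &TLI) {
  llvm::Constant *InitC =
      llvm::getInitialValueOfAllocation(Alloc, &TLI, StoredC->getType());
  return InitC && InitC == StoredC;
}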
if (InitC && InitC == StoredConstant) @@ -1921,11 +1936,13 @@ struct DSEState { static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, + AssumptionCache &AC, const TargetLibraryInfo &TLI, const LoopInfo &LI) { bool MadeChange = false; - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); + MSSA.ensureOptimizedUses(); + DSEState State(F, AA, MSSA, DT, PDT, AC, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2105,9 +2122,10 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult(F); MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); + AssumptionCache &AC = AM.getResult(F); LoopInfo &LI = AM.getResult(F); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2147,9 +2165,11 @@ public: MemorySSA &MSSA = getAnalysis().getMSSA(); PostDominatorTree &PDT = getAnalysis().getPostDomTree(); + AssumptionCache &AC = + getAnalysis().getAssumptionCache(F); LoopInfo &LI = getAnalysis().getLoopInfo(); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2173,6 +2193,7 @@ public: AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); } }; @@ -2190,6 +2211,7 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 59b934c16c8a..cf2824954122 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -30,19 +29,16 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -55,7 +51,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" -#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -781,6 +776,21 @@ private: return getLoadStorePointerOperand(Inst); } + Type *getValueType() const { + // TODO: handle target-specific intrinsics. 
+ if (IntrinsicInst *II = dyn_cast(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + return II->getType(); + case Intrinsic::masked_store: + return II->getArgOperand(0)->getType(); + default: + return nullptr; + } + } + return getLoadStoreType(Inst); + } + bool mayReadFromMemory() const { if (IntrID != 0) return Info.ReadMem; @@ -1162,6 +1172,9 @@ bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier, "Violated invariant"); if (Earlier.getPointerOperand() != Later.getPointerOperand()) return false; + if (!Earlier.getValueType() || !Later.getValueType() || + Earlier.getValueType() != Later.getValueType()) + return false; if (Earlier.getMatchingId() != Later.getMatchingId()) return false; // At the moment, we don't remove ordered stores, but do remove @@ -1334,7 +1347,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(&Inst, SQ)) { + if (Value *V = simplifyInstruction(&Inst, SQ)) { LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V << '\n'); if (!DebugCounter::shouldExecute(CSECounter)) { diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 44017b555769..ad2041cd4253 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,8 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index a98bb8358aef..56f2a3b3004d 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,24 +11,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include -#include // For std::function #define DEBUG_TYPE "float2int" @@ -236,116 +234,111 @@ void Float2IntPass::walkBackwards() { } } -// Walk forwards down the list of seen instructions, so we visit defs before -// uses. -void Float2IntPass::walkForwards() { - for (auto &It : reverse(SeenInsts)) { - if (It.second != unknownRange()) - continue; +// Calculate result range from operand ranges. +// Return None if the range cannot be calculated yet. +Optional Float2IntPass::calcRange(Instruction *I) { + SmallVector OpRanges; + for (Value *O : I->operands()) { + if (Instruction *OI = dyn_cast(O)) { + auto OpIt = SeenInsts.find(OI); + assert(OpIt != SeenInsts.end() && "def not seen before use!"); + if (OpIt->second == unknownRange()) + return None; // Wait until operand range has been calculated. 
+ OpRanges.push_back(OpIt->second); + } else if (ConstantFP *CF = dyn_cast(O)) { + // Work out if the floating point number can be losslessly represented + // as an integer. + // APFloat::convertToInteger(&Exact) purports to do what we want, but + // the exactness can be too precise. For example, negative zero can + // never be exactly converted to an integer. + // + // Instead, we ask APFloat to round itself to an integral value - this + // preserves sign-of-zero - then compare the result with the original. + // + const APFloat &F = CF->getValueAPF(); + + // First, weed out obviously incorrect values. Non-finite numbers + // can't be represented and neither can negative zero, unless + // we're in fast math mode. + if (!F.isFinite() || + (F.isZero() && F.isNegative() && isa(I) && + !I->hasNoSignedZeros())) + return badRange(); + + APFloat NewF = F; + auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven); + if (Res != APFloat::opOK || NewF != F) + return badRange(); + + // OK, it's representable. Now get it. + APSInt Int(MaxIntegerBW+1, false); + bool Exact; + CF->getValueAPF().convertToInteger(Int, + APFloat::rmNearestTiesToEven, + &Exact); + OpRanges.push_back(ConstantRange(Int)); + } else { + llvm_unreachable("Should have already marked this as badRange!"); + } + } - Instruction *I = It.first; - std::function)> Op; - switch (I->getOpcode()) { - // FIXME: Handle select and phi nodes. - default: - case Instruction::UIToFP: - case Instruction::SIToFP: - llvm_unreachable("Should have been handled in walkForwards!"); + switch (I->getOpcode()) { + // FIXME: Handle select and phi nodes. + default: + case Instruction::UIToFP: + case Instruction::SIToFP: + llvm_unreachable("Should have been handled in walkForwards!"); - case Instruction::FNeg: - Op = [](ArrayRef Ops) { - assert(Ops.size() == 1 && "FNeg is a unary operator!"); - unsigned Size = Ops[0].getBitWidth(); - auto Zero = ConstantRange(APInt::getZero(Size)); - return Zero.sub(Ops[0]); - }; - break; + case Instruction::FNeg: { + assert(OpRanges.size() == 1 && "FNeg is a unary operator!"); + unsigned Size = OpRanges[0].getBitWidth(); + auto Zero = ConstantRange(APInt::getZero(Size)); + return Zero.sub(OpRanges[0]); + } - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - Op = [I](ArrayRef Ops) { - assert(Ops.size() == 2 && "its a binary operator!"); - auto BinOp = (Instruction::BinaryOps) I->getOpcode(); - return Ops[0].binaryOp(BinOp, Ops[1]); - }; - break; + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: { + assert(OpRanges.size() == 2 && "its a binary operator!"); + auto BinOp = (Instruction::BinaryOps) I->getOpcode(); + return OpRanges[0].binaryOp(BinOp, OpRanges[1]); + } - // - // Root-only instructions - we'll only see these if they're the - // first node in a walk. - // - case Instruction::FPToUI: - case Instruction::FPToSI: - Op = [I](ArrayRef Ops) { - assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!"); - // Note: We're ignoring the casts output size here as that's what the - // caller expects. - auto CastOp = (Instruction::CastOps)I->getOpcode(); - return Ops[0].castOp(CastOp, MaxIntegerBW+1); - }; - break; + // + // Root-only instructions - we'll only see these if they're the + // first node in a walk. + // + case Instruction::FPToUI: + case Instruction::FPToSI: { + assert(OpRanges.size() == 1 && "FPTo[US]I is a unary operator!"); + // Note: We're ignoring the casts output size here as that's what the + // caller expects. 
+ auto CastOp = (Instruction::CastOps)I->getOpcode(); + return OpRanges[0].castOp(CastOp, MaxIntegerBW+1); + } - case Instruction::FCmp: - Op = [](ArrayRef Ops) { - assert(Ops.size() == 2 && "FCmp is a binary operator!"); - return Ops[0].unionWith(Ops[1]); - }; - break; - } + case Instruction::FCmp: + assert(OpRanges.size() == 2 && "FCmp is a binary operator!"); + return OpRanges[0].unionWith(OpRanges[1]); + } +} - bool Abort = false; - SmallVector OpRanges; - for (Value *O : I->operands()) { - if (Instruction *OI = dyn_cast(O)) { - assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); - OpRanges.push_back(SeenInsts.find(OI)->second); - } else if (ConstantFP *CF = dyn_cast(O)) { - // Work out if the floating point number can be losslessly represented - // as an integer. - // APFloat::convertToInteger(&Exact) purports to do what we want, but - // the exactness can be too precise. For example, negative zero can - // never be exactly converted to an integer. - // - // Instead, we ask APFloat to round itself to an integral value - this - // preserves sign-of-zero - then compare the result with the original. - // - const APFloat &F = CF->getValueAPF(); - - // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless - // we're in fast math mode. - if (!F.isFinite() || - (F.isZero() && F.isNegative() && isa(I) && - !I->hasNoSignedZeros())) { - seen(I, badRange()); - Abort = true; - break; - } +// Walk forwards down the list of seen instructions, so we visit defs before +// uses. +void Float2IntPass::walkForwards() { + std::deque Worklist; + for (const auto &Pair : SeenInsts) + if (Pair.second == unknownRange()) + Worklist.push_back(Pair.first); - APFloat NewF = F; - auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven); - if (Res != APFloat::opOK || NewF != F) { - seen(I, badRange()); - Abort = true; - break; - } - // OK, it's representable. Now get it. - APSInt Int(MaxIntegerBW+1, false); - bool Exact; - CF->getValueAPF().convertToInteger(Int, - APFloat::rmNearestTiesToEven, - &Exact); - OpRanges.push_back(ConstantRange(Int)); - } else { - llvm_unreachable("Should have already marked this as badRange!"); - } - } + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + Worklist.pop_back(); - // Reduce the operands' ranges to a single range and return. - if (!Abort) - seen(I, Op(OpRanges)); + if (Optional Range = calcRange(I)) + seen(I, *Range); + else + Worklist.push_front(I); // Reprocess later. 
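// The control shape of the rewritten walkForwards() above, in isolation:
// take an instruction from the back of the deque; if calcRange() can already
// answer, record the range, otherwise park the instruction at the front and
// revisit it once its operands have resolved. Termination is assumed because
// every chain bottoms out in roots seeded by walkBackwards(). The helper name
// propagateRanges is illustrative.
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include <deque>

static void propagateRanges(
    std::deque<llvm::Instruction *> &Worklist,
    llvm::function_ref<llvm::Optional<llvm::ConstantRange>(llvm::Instruction *)>
        Calc,
    llvm::function_ref<void(llvm::Instruction *, const llvm::ConstantRange &)>
        Record) {
  while (!Worklist.empty()) {
    llvm::Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (llvm::Optional<llvm::ConstantRange> R = Calc(I))
      Record(I, *R);
    else
      Worklist.push_front(I); // Operand ranges not known yet; retry later.
  }
}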
} } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 398c93e8758c..783301fe589e 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -32,6 +31,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -42,12 +42,10 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -55,11 +53,9 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -72,7 +68,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -112,16 +107,16 @@ static cl::opt GVNEnableLoadInLoopPRE("enable-load-in-loop-pre", cl::init(true)); static cl::opt GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre", - cl::init(true)); + cl::init(false)); static cl::opt GVNEnableMemDep("enable-gvn-memdep", cl::init(true)); static cl::opt MaxNumDeps( - "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore, + "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::desc("Max number of dependences to attempt Load PRE (default = 100)")); // This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat. static cl::opt MaxBBSpeculations( - "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore, + "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::desc("Max number of blocks we're willing to speculate on (and recurse " "into) when deducing if a value is fully available or not in GVN " "(default = 600)")); @@ -129,6 +124,8 @@ static cl::opt MaxBBSpeculations( struct llvm::GVNPass::Expression { uint32_t opcode; bool commutative = false; + // The type is not necessarily the result type of the expression, it may be + // any additional type needed to disambiguate the expression. Type *type = nullptr; SmallVector varargs; @@ -178,70 +175,88 @@ template <> struct DenseMapInfo { /// implicitly associated with a rematerialization point which is the /// location of the instruction from which it was formed. struct llvm::gvn::AvailableValue { - enum ValType { + enum class ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. MemIntrin, // A memory intrinsic which is loaded from. 
- UndefVal // A UndefValue representing a value from dead block (which + UndefVal, // A UndefValue representing a value from dead block (which // is not yet physically removed from the CFG). + SelectVal, // A pointer select which is loaded from and for which the load + // can be replace by a value select. }; - /// V - The value that is live out of the block. - PointerIntPair Val; + /// Val - The value that is live out of the block. + Value *Val; + /// Kind of the live-out value. + ValType Kind; /// Offset - The byte offset in Val that is interesting for the load query. unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); + Res.Val = V; + Res.Kind = ValType::SimpleVal; Res.Offset = Offset; return Res; } static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); + Res.Val = MI; + Res.Kind = ValType::MemIntrin; Res.Offset = Offset; return Res; } static AvailableValue getLoad(LoadInst *Load, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(Load); - Res.Val.setInt(LoadVal); + Res.Val = Load; + Res.Kind = ValType::LoadVal; Res.Offset = Offset; return Res; } static AvailableValue getUndef() { AvailableValue Res; - Res.Val.setPointer(nullptr); - Res.Val.setInt(UndefVal); + Res.Val = nullptr; + Res.Kind = ValType::UndefVal; Res.Offset = 0; return Res; } - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - bool isUndefValue() const { return Val.getInt() == UndefVal; } + static AvailableValue getSelect(SelectInst *Sel) { + AvailableValue Res; + Res.Val = Sel; + Res.Kind = ValType::SelectVal; + Res.Offset = 0; + return Res; + } + + bool isSimpleValue() const { return Kind == ValType::SimpleVal; } + bool isCoercedLoadValue() const { return Kind == ValType::LoadVal; } + bool isMemIntrinValue() const { return Kind == ValType::MemIntrin; } + bool isUndefValue() const { return Kind == ValType::UndefVal; } + bool isSelectValue() const { return Kind == ValType::SelectVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); + return Val; } LoadInst *getCoercedLoadValue() const { assert(isCoercedLoadValue() && "Wrong accessor"); - return cast(Val.getPointer()); + return cast(Val); } MemIntrinsic *getMemIntrinValue() const { assert(isMemIntrinValue() && "Wrong accessor"); - return cast(Val.getPointer()); + return cast(Val); + } + + SelectInst *getSelectValue() const { + assert(isSelectValue() && "Wrong accessor"); + return cast(Val); } /// Emit code at the specified insertion point to adjust the value defined @@ -275,6 +290,10 @@ struct llvm::gvn::AvailableValueInBlock { return get(BB, AvailableValue::getUndef()); } + static AvailableValueInBlock getSelect(BasicBlock *BB, SelectInst *Sel) { + return get(BB, AvailableValue::getSelect(Sel)); + } + /// Emit code at the end of this block to adjust the value defined here to /// the specified type. This handles various coercion cases. 
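// A note on the representation change above: the old
// PointerIntPair<Value *, 2, ValType> packed the tag into the two low
// alignment bits of the pointer, which allows at most four tag states.
// SelectVal is a fifth state, so the tag presumably had to move into a plain
// field. A minimal illustration of the old packing limit:
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/IR/Value.h"

enum OldValType { SimpleVal, LoadVal, MemIntrin, UndefVal }; // four states
// Two tag bits hold the values 0-3; a fifth enumerator would overflow the
// bits PointerIntPair reserves and assert at runtime.
using PackedVal = llvm::PointerIntPair<llvm::Value *, 2, OldValType>;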
Value *MaterializeAdjustedValue(LoadInst *Load, GVNPass &gvn) const { @@ -379,6 +398,39 @@ GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { return e; } +GVNPass::Expression GVNPass::ValueTable::createGEPExpr(GetElementPtrInst *GEP) { + Expression E; + Type *PtrTy = GEP->getType()->getScalarType(); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + unsigned BitWidth = DL.getIndexTypeSizeInBits(PtrTy); + MapVector VariableOffsets; + APInt ConstantOffset(BitWidth, 0); + if (PtrTy->isOpaquePointerTy() && + GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) { + // For opaque pointers, convert into offset representation, to recognize + // equivalent address calculations that use different type encoding. + LLVMContext &Context = GEP->getContext(); + E.opcode = GEP->getOpcode(); + E.type = nullptr; + E.varargs.push_back(lookupOrAdd(GEP->getPointerOperand())); + for (const auto &Pair : VariableOffsets) { + E.varargs.push_back(lookupOrAdd(Pair.first)); + E.varargs.push_back(lookupOrAdd(ConstantInt::get(Context, Pair.second))); + } + if (!ConstantOffset.isZero()) + E.varargs.push_back( + lookupOrAdd(ConstantInt::get(Context, ConstantOffset))); + } else { + // If converting to offset representation fails (for typed pointers and + // scalable vectors), fall back to type-based implementation: + E.opcode = GEP->getOpcode(); + E.type = GEP->getSourceElementType(); + for (Use &Op : GEP->operands()) + E.varargs.push_back(lookupOrAdd(Op)); + } + return E; +} + //===----------------------------------------------------------------------===// // ValueTable External Functions //===----------------------------------------------------------------------===// @@ -562,9 +614,11 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { case Instruction::InsertElement: case Instruction::ShuffleVector: case Instruction::InsertValue: - case Instruction::GetElementPtr: exp = createExpr(I); break; + case Instruction::GetElementPtr: + exp = createGEPExpr(cast(I)); + break; case Instruction::ExtractValue: exp = createExtractvalueExpr(cast(I)); break; @@ -639,24 +693,24 @@ void GVNPass::ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// bool GVNPass::isPREEnabled() const { - return Options.AllowPRE.getValueOr(GVNEnablePRE); + return Options.AllowPRE.value_or(GVNEnablePRE); } bool GVNPass::isLoadPREEnabled() const { - return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE); + return Options.AllowLoadPRE.value_or(GVNEnableLoadPRE); } bool GVNPass::isLoadInLoopPREEnabled() const { - return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE); + return Options.AllowLoadInLoopPRE.value_or(GVNEnableLoadInLoopPRE); } bool GVNPass::isLoadPRESplitBackedgeEnabled() const { - return Options.AllowLoadPRESplitBackedge.getValueOr( + return Options.AllowLoadPRESplitBackedge.value_or( GVNEnableSplitBackedgeInLoadPRE); } bool GVNPass::isMemDepEnabled() const { - return Options.AllowMemDep.getValueOr(GVNEnableMemDep); + return Options.AllowMemDep.value_or(GVNEnableMemDep); } PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -897,6 +951,17 @@ ConstructSSAForLoadSet(LoadInst *Load, return SSAUpdate.GetValueInMiddleOfBlock(Load->getParent()); } +static LoadInst *findDominatingLoad(Value *Ptr, Type *LoadTy, SelectInst *Sel, + DominatorTree &DT) { + for (Value *U : Ptr->users()) { + auto *LI = dyn_cast(U); + if (LI && LI->getType() == LoadTy && LI->getParent() == Sel->getParent() && + 
DT.dominates(LI, Sel)) + return LI; + } + return nullptr; +} + Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, Instruction *InsertPt, GVNPass &gvn) const { @@ -937,6 +1002,17 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); + } else if (isSelectValue()) { + // Introduce a new value select for a load from an eligible pointer select. + SelectInst *Sel = getSelectValue(); + LoadInst *L1 = findDominatingLoad(Sel->getOperand(1), LoadTy, Sel, + gvn.getDominatorTree()); + LoadInst *L2 = findDominatingLoad(Sel->getOperand(2), LoadTy, Sel, + gvn.getDominatorTree()); + assert(L1 && L2 && + "must be able to obtain dominating loads for both value operands of " + "the select"); + Res = SelectInst::Create(Sel->getCondition(), L1, L2, "", Sel); } else { llvm_unreachable("Should not materialize value from dead block"); } @@ -1023,8 +1099,54 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo, ORE->emit(R); } +/// Check if a load from pointer-select \p Address in \p DepBB can be converted +/// to a value select. The following conditions need to be satisfied: +/// 1. The pointer select (\p Address) must be defined in \p DepBB. +/// 2. Both value operands of the pointer select must be loaded in the same +/// basic block, before the pointer select. +/// 3. There must be no instructions between the found loads and \p End that may +/// clobber the loads. +static Optional +tryToConvertLoadOfPtrSelect(BasicBlock *DepBB, BasicBlock::iterator End, + Value *Address, Type *LoadTy, DominatorTree &DT, + AAResults *AA) { + + auto *Sel = dyn_cast_or_null(Address); + if (!Sel || DepBB != Sel->getParent()) + return None; + + LoadInst *L1 = findDominatingLoad(Sel->getOperand(1), LoadTy, Sel, DT); + LoadInst *L2 = findDominatingLoad(Sel->getOperand(2), LoadTy, Sel, DT); + if (!L1 || !L2) + return None; + + // Ensure there are no accesses that may modify the locations referenced by + // either L1 or L2 between L1, L2 and the specified End iterator. + Instruction *EarlierLoad = L1->comesBefore(L2) ? L1 : L2; + MemoryLocation L1Loc = MemoryLocation::get(L1); + MemoryLocation L2Loc = MemoryLocation::get(L2); + if (any_of(make_range(EarlierLoad->getIterator(), End), [&](Instruction &I) { + return isModSet(AA->getModRefInfo(&I, L1Loc)) || + isModSet(AA->getModRefInfo(&I, L2Loc)); + })) + return None; + + return AvailableValue::getSelect(Sel); +} + bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, Value *Address, AvailableValue &Res) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { + assert(isa(Address)); + if (auto R = tryToConvertLoadOfPtrSelect( + Load->getParent(), Load->getIterator(), Address, Load->getType(), + getDominatorTree(), getAliasAnalysis())) { + Res = *R; + return true; + } + return false; + } + assert((DepInfo.isDef() || DepInfo.isClobber()) && "expected a local dependence"); assert(Load->isUnordered() && "rules below are incorrect for ordered access"); @@ -1066,9 +1188,7 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) { const auto ClobberOff = MD->getClobberOffset(DepLoad); // GVN has no deal with a negative offset. - Offset = (ClobberOff == None || ClobberOff.getValue() < 0) - ? -1 - : ClobberOff.getValue(); + Offset = (ClobberOff == None || *ClobberOff < 0) ? 
-1 : *ClobberOff; } if (Offset == -1) Offset = @@ -1092,6 +1212,7 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, } } } + // Nothing known about this clobber, have to be conservative LLVM_DEBUG( // fast print dep, using operator<< on instruction is too slow. @@ -1111,12 +1232,11 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, return true; } - if (isAllocationFn(DepInst, TLI)) - if (auto *InitVal = getInitialValueOfAllocation(cast(DepInst), - TLI, Load->getType())) { - Res = AvailableValue::get(InitVal); - return true; - } + if (Constant *InitVal = + getInitialValueOfAllocation(DepInst, TLI, Load->getType())) { + Res = AvailableValue::get(InitVal); + return true; + } if (StoreInst *S = dyn_cast(DepInst)) { // Reject loads and stores that are to the same address but are of @@ -1176,16 +1296,23 @@ void GVNPass::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps, continue; } - if (!DepInfo.isDef() && !DepInfo.isClobber()) { - UnavailableBlocks.push_back(DepBB); - continue; - } - // The address being loaded in this non-local block may not be the same as // the pointer operand of the load if PHI translation occurs. Make sure // to consider the right address. Value *Address = Deps[i].getAddress(); + if (!DepInfo.isDef() && !DepInfo.isClobber()) { + if (auto R = tryToConvertLoadOfPtrSelect( + DepBB, DepBB->end(), Address, Load->getType(), getDominatorTree(), + getAliasAnalysis())) { + ValuesPerBlock.push_back( + AvailableValueInBlock::get(DepBB, std::move(*R))); + continue; + } + UnavailableBlocks.push_back(DepBB); + continue; + } + AvailableValue AV; if (AnalyzeLoadAvailability(Load, DepInfo, Address, AV)) { // subtlety: because we know this was a non-local dependency, we know @@ -1923,8 +2050,9 @@ bool GVNPass::processLoad(LoadInst *L) { if (Dep.isNonLocal()) return processNonLocalLoad(L); + Value *Address = L->getPointerOperand(); // Only handle the local case below - if (!Dep.isDef() && !Dep.isClobber()) { + if (!Dep.isDef() && !Dep.isClobber() && !isa(Address)) { // This might be a NonFuncLocal or an Unknown LLVM_DEBUG( // fast print dep, using operator<< on instruction is too slow. @@ -1934,7 +2062,7 @@ bool GVNPass::processLoad(LoadInst *L) { } AvailableValue AV; - if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) { + if (AnalyzeLoadAvailability(L, Dep, Address, AV)) { Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this); // Replace the load! @@ -2324,7 +2452,7 @@ bool GVNPass::processInstruction(Instruction *I) { // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. const DataLayout &DL = I->getModule()->getDataLayout(); - if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) { + if (Value *V = simplifyInstruction(I, {DL, TLI, DT, AC})) { bool Changed = false; if (!I->use_empty()) { // Simplification can cause a special instruction to become not special. 
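// The new select handling above, reduced to its essence: when both pointer
// operands of a select are already loaded before the select and nothing
// clobbers them in between, a load of the select becomes a select of the
// loaded values. Assuming L1/L2 are the dominating loads found by
// findDominatingLoad():
//
//   %va = load %a ; %vb = load %b ; %p = select i1 %c, ptr %a, ptr %b
//   %v  = load ptr %p    ==>    %v = select i1 %c, %va, %vb
#include "llvm/IR/Instructions.h"

static llvm::Value *rewriteLoadOfSelect(llvm::SelectInst *Sel,
                                        llvm::LoadInst *L1,
                                        llvm::LoadInst *L2) {
  return llvm::SelectInst::Create(Sel->getCondition(), L1, L2, "", Sel);
}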
@@ -2491,6 +2619,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, unsigned Iteration = 0; while (ShouldContinue) { LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); + (void) Iteration; ShouldContinue = iterateOnFunction(F); Changed |= ShouldContinue; ++Iteration; diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index fdc3afd9348a..6cdc671ddb64 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -54,11 +54,9 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" @@ -126,7 +124,7 @@ using HoistingPointInfo = std::pair; using HoistingPointList = SmallVector; // A map from a pair of VNs to all the instructions with those VNs. -using VNType = std::pair; +using VNType = std::pair; using VNtoInsns = DenseMap>; @@ -161,7 +159,7 @@ using InValuesType = // An invalid value number Used when inserting a single value number into // VNtoInsns. -enum : unsigned { InvalidVN = ~2U }; +enum : uintptr_t { InvalidVN = ~(uintptr_t)2 }; // Records all scalar instructions candidate for code hoisting. class InsnInfo { @@ -187,7 +185,9 @@ public: void insert(LoadInst *Load, GVNPass::ValueTable &VN) { if (Load->isSimple()) { unsigned V = VN.lookupOrAdd(Load->getPointerOperand()); - VNtoLoads[{V, InvalidVN}].push_back(Load); + // With opaque pointers we may have loads from the same pointer with + // different result types, which should be disambiguated. + VNtoLoads[{V, (uintptr_t)Load->getType()}].push_back(Load); } } @@ -261,7 +261,9 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(std::make_unique(MSSA)) {} + MSSAUpdater(std::make_unique(MSSA)) { + MSSA->ensureOptimizedUses(); + } bool run(Function &F); @@ -1147,6 +1149,8 @@ std::pair GVNHoist::hoist(HoistingPointList &HPL) { DFSNumber[Repl] = DFSNumber[Last]++; } + // Drop debug location as per debug info update guide. + Repl->dropLocation(); NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess); if (isa(Repl)) diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index e612a82fc89a..720b8e71fd56 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -35,7 +35,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/None.h" @@ -45,7 +44,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -383,6 +381,8 @@ public: } }; +using BasicBlocksSet = SmallPtrSet; + class ValueTable { DenseMap ValueNumbering; DenseMap ExpressionNumbering; @@ -390,6 +390,7 @@ class ValueTable { BumpPtrAllocator Allocator; ArrayRecycler Recycler; uint32_t nextValueNumber = 1; + BasicBlocksSet ReachableBBs; /// Create an expression for I based on its opcode and its uses. 
If I /// touches or reads memory, the expression is also based upon its memory @@ -421,6 +422,11 @@ class ValueTable { public: ValueTable() = default; + /// Set basic blocks reachable from entry block. + void setReachableBBs(const BasicBlocksSet &ReachableBBs) { + this->ReachableBBs = ReachableBBs; + } + /// Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t lookupOrAdd(Value *V) { @@ -434,6 +440,9 @@ public: } Instruction *I = cast(V); + if (!ReachableBBs.contains(I->getParent())) + return ~0U; + InstructionUseExpr *exp = nullptr; switch (I->getOpcode()) { case Instruction::Load: @@ -570,6 +579,7 @@ public: unsigned NumSunk = 0; ReversePostOrderTraversal RPOT(&F); + VN.setReachableBBs(BasicBlocksSet(RPOT.begin(), RPOT.end())); for (auto *N : RPOT) NumSunk += sinkBB(N); @@ -648,12 +658,7 @@ Optional GVNSink::analyzeInstructionForSinking( VNums[N]++; } unsigned VNumToSink = - std::max_element(VNums.begin(), VNums.end(), - [](const std::pair &I, - const std::pair &J) { - return I.second < J.second; - }) - ->first; + std::max_element(VNums.begin(), VNums.end(), llvm::less_second())->first; if (VNums[VNumToSink] == 1) // Can't sink anything! @@ -776,12 +781,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. - for (auto I = Preds.begin(); I != Preds.end();) { - if ((*I)->getTerminator()->getNumSuccessors() != 1) - I = Preds.erase(I); - else - ++I; - } + llvm::erase_if(Preds, [](BasicBlock *BB) { + return BB->getTerminator()->getNumSuccessors() != 1; + }); LockstepReverseIterator LRI(Preds); SmallVector Candidates; diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 82b81003ef21..af6062d142f0 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -42,7 +42,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -496,6 +495,8 @@ void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const { makeAvailableAt(Op, Loc); Inst->moveBefore(Loc); + // If we moved instruction before guard we must clean poison generating flags. 
+ Inst->dropPoisonGeneratingFlags(); } bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, diff --git a/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp b/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp index e2022aba97c4..26f2db183fbf 100644 --- a/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp +++ b/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/IVUsersPrinter.h" #include "llvm/Analysis/IVUsers.h" -#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "iv-users" diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index ceb03eb17f6d..e977dd18be9f 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -25,10 +25,7 @@ #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -74,11 +71,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -387,7 +382,7 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get()); // Delete the old floating point increment. - Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); + Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType())); RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get()); // If the FP induction variable still has uses, this is because something else @@ -605,10 +600,10 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L, Intrinsic::getName(Intrinsic::experimental_guard)); bool HasGuards = GuardDecl && !GuardDecl->use_empty(); - SmallVector LoopPhis; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) { - LoopPhis.push_back(cast(I)); - } + SmallVector LoopPhis; + for (PHINode &PN : L->getHeader()->phis()) + LoopPhis.push_back(&PN); + // Each round of simplification iterates through the SimplifyIVUsers worklist // for all current phis, then determines whether any IVs can be // widened. 
Widening adds new phis to LoopPhis, inducing another round of diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 0e5653eeb7d5..799669a19796 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -56,8 +56,6 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/BasicBlock.h" @@ -1411,12 +1409,12 @@ bool LoopConstrainer::run() { bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate; Optional MaybeSR = calculateSubRanges(IsSignedPredicate); - if (!MaybeSR.hasValue()) { + if (!MaybeSR) { LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n"); return false; } - SubRanges SR = MaybeSR.getValue(); + SubRanges SR = *MaybeSR; bool Increasing = MainLoopStructure.IndVarIncreasing; IntegerType *IVTy = cast(Range.getBegin()->getType()); @@ -1429,9 +1427,9 @@ bool LoopConstrainer::run() { // constructor. ClonedLoop PreLoop, PostLoop; bool NeedsPreLoop = - Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue(); + Increasing ? SR.LowLimit.has_value() : SR.HighLimit.has_value(); bool NeedsPostLoop = - Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue(); + Increasing ? SR.HighLimit.has_value() : SR.LowLimit.has_value(); Value *ExitPreLoopAt = nullptr; Value *ExitMainLoopAt = nullptr; @@ -1710,7 +1708,7 @@ IntersectSignedRange(ScalarEvolution &SE, const InductiveRangeCheck::Range &R2) { if (R2.isEmpty(SE, /* IsSigned */ true)) return None; - if (!R1.hasValue()) + if (!R1) return R2; auto &R1Value = R1.getValue(); // We never return empty ranges from this function, and R1 is supposed to be @@ -1739,7 +1737,7 @@ IntersectUnsignedRange(ScalarEvolution &SE, const InductiveRangeCheck::Range &R2) { if (R2.isEmpty(SE, /* IsSigned */ false)) return None; - if (!R1.hasValue()) + if (!R1) return R2; auto &R1Value = R1.getValue(); // We never return empty ranges from this function, and R1 is supposed to be @@ -1763,10 +1761,14 @@ IntersectUnsignedRange(ScalarEvolution &SE, } PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult(F); auto &DT = AM.getResult(F); - auto &BPI = AM.getResult(F); LoopInfo &LI = AM.getResult(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult(F); + auto &BPI = AM.getResult(F); // Get BFI analysis result on demand. Please note that modification of // CFG invalidates this analysis and we should handle it. 
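// Preview of the profitability hunk that follows: IRCE compares how often
// the loop header runs against its preheader; the ratio approximates the
// average trip count, and cold or short-running loops are not worth the
// transform. A sketch under that reading; MinRatio is an illustrative
// threshold, not the pass's real constant.
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include <cstdint>

static bool headerLooksHot(llvm::BlockFrequencyInfo &BFI,
                           llvm::BasicBlock *Header,
                           llvm::BasicBlock *Preheader,
                           uint64_t MinRatio = 2) {
  uint64_t HFreq = BFI.getBlockFreq(Header).getFrequency();
  uint64_t PHFreq = BFI.getBlockFreq(Preheader).getFrequency();
  return PHFreq != 0 && HFreq / PHFreq >= MinRatio;
}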
@@ -1854,7 +1856,7 @@ InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L, LoopStructure &LS) { if (SkipProfitabilityChecks) return true; - if (GetBFI.hasValue()) { + if (GetBFI) { BlockFrequencyInfo &BFI = (*GetBFI)(); uint64_t hFreq = BFI.getBlockFreq(LS.Header).getFrequency(); uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency(); @@ -1920,12 +1922,12 @@ bool InductiveRangeCheckElimination::run( const char *FailureReason = nullptr; Optional MaybeLoopStructure = LoopStructure::parseLoopStructure(SE, *L, FailureReason); - if (!MaybeLoopStructure.hasValue()) { + if (!MaybeLoopStructure) { LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason << "\n";); return false; } - LoopStructure LS = MaybeLoopStructure.getValue(); + LoopStructure LS = *MaybeLoopStructure; if (!isProfitableToTransform(*L, LS)) return false; const SCEVAddRecExpr *IndVar = @@ -1946,10 +1948,10 @@ bool InductiveRangeCheckElimination::run( for (InductiveRangeCheck &IRC : RangeChecks) { auto Result = IRC.computeSafeIterationSpace(SE, IndVar, LS.IsSignedPredicate); - if (Result.hasValue()) { + if (Result) { auto MaybeSafeIterRange = IntersectRange(SE, SafeIterRange, Result.getValue()); - if (MaybeSafeIterRange.hasValue()) { + if (MaybeSafeIterRange) { assert( !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && "We should never return empty ranges!"); @@ -1959,7 +1961,7 @@ bool InductiveRangeCheckElimination::run( } } - if (!SafeIterRange.hasValue()) + if (!SafeIterRange) return false; LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 8f5933b7bd71..5eefde2e37a1 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -92,8 +92,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AssumptionCache.h" @@ -182,7 +180,7 @@ public: class InferAddressSpacesImpl { AssumptionCache &AC; - DominatorTree *DT = nullptr; + const DominatorTree *DT = nullptr; const TargetTransformInfo *TTI = nullptr; const DataLayout *DL = nullptr; @@ -213,10 +211,11 @@ class InferAddressSpacesImpl { // Changes the flat address expressions in function F to point to specific // address spaces if InferredAddrSpace says so. Postorder is the postorder of // all flat expressions in the use-def graph of function F. 
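// Context for the isNoopPtrIntCastPair change a little further below: a
// ptrtoint/inttoptr round-trip can be dropped only when both casts preserve
// the bit pattern and the two address spaces are either identical or
// declared interchangeable by the target. The extracted predicate, with the
// two no-op-cast checks left out of the sketch:
#include "llvm/Analysis/TargetTransformInfo.h"

static bool addrSpacesInterchangeable(const llvm::TargetTransformInfo &TTI,
                                      unsigned SrcAS, unsigned DstAS) {
  return SrcAS == DstAS || TTI.isNoopAddrSpaceCast(SrcAS, DstAS);
}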
- bool rewriteWithNewAddressSpaces( - const TargetTransformInfo &TTI, ArrayRef Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, - const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const; + bool + rewriteWithNewAddressSpaces(ArrayRef Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, + Function *F) const; void appendsFlatAddressExpressionToPostorderStack( Value *V, PostorderStackTy &PostorderStack, @@ -240,7 +239,7 @@ class InferAddressSpacesImpl { unsigned getPredicatedAddrSpace(const Value &V, Value *Opnd) const; public: - InferAddressSpacesImpl(AssumptionCache &AC, DominatorTree *DT, + InferAddressSpacesImpl(AssumptionCache &AC, const DominatorTree *DT, const TargetTransformInfo *TTI, unsigned FlatAddrSpace) : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {} bool run(Function &F); @@ -280,15 +279,15 @@ static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL, // arithmetic may also be undefined after invalid pointer reinterpret cast. // However, as we confirm through the target hooks that it's a no-op // addrspacecast, it doesn't matter since the bits should be the same. + unsigned P2IOp0AS = P2I->getOperand(0)->getType()->getPointerAddressSpace(); + unsigned I2PAS = I2P->getType()->getPointerAddressSpace(); return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()), I2P->getOperand(0)->getType(), I2P->getType(), DL) && CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()), P2I->getOperand(0)->getType(), P2I->getType(), DL) && - TTI->isNoopAddrSpaceCast( - P2I->getOperand(0)->getType()->getPointerAddressSpace(), - I2P->getType()->getPointerAddressSpace()); + (P2IOp0AS == I2PAS || TTI->isNoopAddrSpaceCast(P2IOp0AS, I2PAS)); } // Returns true if V is an address expression. @@ -332,8 +331,7 @@ getPointerOperands(const Value &V, const DataLayout &DL, switch (Op.getOpcode()) { case Instruction::PHI: { auto IncomingValues = cast(Op).incoming_values(); - return SmallVector(IncomingValues.begin(), - IncomingValues.end()); + return {IncomingValues.begin(), IncomingValues.end()}; } case Instruction::BitCast: case Instruction::AddrSpaceCast: @@ -655,10 +653,13 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( case Instruction::IntToPtr: { assert(isNoopPtrIntCastPair(cast(I), *DL, TTI)); Value *Src = cast(I->getOperand(0))->getOperand(0); - assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace); - if (Src->getType() != NewPtrType) - return new BitCastInst(Src, NewPtrType); - return Src; + if (Src->getType() == NewPtrType) + return Src; + + // If we had a no-op inttoptr/ptrtoint pair, we may still have inferred a + // source address space from a generic pointer source need to insert a cast + // back. + return CastInst::CreatePointerBitCastOrAddrSpaceCast(Src, NewPtrType); } default: llvm_unreachable("Unexpected opcode"); @@ -726,7 +727,7 @@ static Value *cloneConstantExprWithNewAddressSpace( NewOperands.push_back(cast(NewOperand)); continue; } - if (auto CExpr = dyn_cast(Operand)) + if (auto *CExpr = dyn_cast(Operand)) if (Value *NewOperand = cloneConstantExprWithNewAddressSpace( CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) { IsNew = true; @@ -738,7 +739,7 @@ static Value *cloneConstantExprWithNewAddressSpace( } // If !IsNew, we will replace the Value with itself. However, replaced values - // are assumed to wrapped in a addrspace cast later so drop it now. + // are assumed to wrapped in an addrspacecast cast later so drop it now. 
if (!IsNew) return nullptr; @@ -821,8 +822,8 @@ bool InferAddressSpacesImpl::run(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, - PredicatedAS, &F); + return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, PredicatedAS, + &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -1010,7 +1011,7 @@ static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI, } /// Update memory intrinsic uses that require more complex processing than -/// simple memory instructions. Thse require re-mangling and may have multiple +/// simple memory instructions. These require re-mangling and may have multiple /// pointer operands. static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, Value *NewV) { @@ -1020,8 +1021,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), - MaybeAlign(MSI->getDestAlignment()), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), MSI->getDestAlign(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast(MI)) { @@ -1104,7 +1104,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I, } bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( - const TargetTransformInfo &TTI, ArrayRef Postorder, + ArrayRef Postorder, const ValueToAddrSpaceMapTy &InferredAddrSpace, const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const { // For each address expression to be modified, creates a clone of it with its @@ -1178,7 +1178,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( I = skipToNextUser(I, E); if (isSimplePointerUseValidToReplace( - TTI, U, V->getType()->getPointerAddressSpace())) { + *TTI, U, V->getType()->getPointerAddressSpace())) { // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. @@ -1239,8 +1239,16 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( if (!cast(ASC->getType()) ->hasSameElementTypeAs( cast(NewV->getType()))) { + BasicBlock::iterator InsertPos; + if (Instruction *NewVInst = dyn_cast(NewV)) + InsertPos = std::next(NewVInst->getIterator()); + else if (Instruction *VInst = dyn_cast(V)) + InsertPos = std::next(VInst->getIterator()); + else + InsertPos = ASC->getIterator(); + NewV = CastInst::Create(Instruction::BitCast, NewV, - ASC->getType(), "", ASC); + ASC->getType(), "", &*InsertPos); } ASC->replaceAllUsesWith(NewV); DeadInstructions.push_back(ASC); @@ -1249,12 +1257,18 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( } // Otherwise, replaces the use with flat(NewV). - if (Instruction *Inst = dyn_cast(V)) { + if (Instruction *VInst = dyn_cast(V)) { // Don't create a copy of the original addrspacecast. if (U == V && isa(V)) continue; - BasicBlock::iterator InsertPos = std::next(Inst->getIterator()); + // Insert the addrspacecast after NewV. 
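The insertion-point logic added above, a variant of which appears again just below, picks the earliest point that is dominated by the new value: right after `NewV` when it is an instruction, after `V` as a fallback, and at the old addrspacecast only when both are constants. A condensed sketch of that selection, using the same iterator idiom as the patch:

    #include "llvm/IR/Instructions.h"
    #include <iterator>

    using namespace llvm;

    // Where should a cast of NewV (replacing uses of the addrspacecast
    // ASC) be inserted? Mirrors the selection order in the hunk above.
    BasicBlock::iterator pickInsertPos(Value *NewV, Value *V,
                                       Instruction *ASC) {
      if (auto *NewVInst = dyn_cast<Instruction>(NewV))
        return std::next(NewVInst->getIterator()); // just after NewV's def
      if (auto *VInst = dyn_cast<Instruction>(V))
        return std::next(VInst->getIterator());    // fall back to after V
      return ASC->getIterator();                   // constants: at the cast
    }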
+ BasicBlock::iterator InsertPos; + if (Instruction *NewVInst = dyn_cast(NewV)) + InsertPos = std::next(NewVInst->getIterator()); + else + InsertPos = std::next(VInst->getIterator()); + while (isa(InsertPos)) ++InsertPos; U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index c11d2e4c1d6b..4644905adba3 100644 --- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -7,21 +7,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/InstSimplifyPass.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -55,7 +51,7 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, DeadInstsInBB.push_back(&I); Changed = true; } else if (!I.use_empty()) { - if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { + if (Value *V = simplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. for (User *U : I.users()) Next->insert(cast(U)); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index a3efad104ca6..5caefc422921 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -56,7 +56,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -74,7 +73,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include #include #include @@ -106,11 +104,6 @@ static cl::opt PrintLVIAfterJumpThreading( cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), cl::Hidden); -static cl::opt JumpThreadingFreezeSelectCond( - "jump-threading-freeze-select-cond", - cl::desc("Freeze the condition when unfolding select"), cl::init(false), - cl::Hidden); - static cl::opt ThreadAcrossLoopHeaders( "jump-threading-across-loop-headers", cl::desc("Allow JumpThreading to thread across loop headers, for testing"), @@ -140,8 +133,7 @@ namespace { public: static char ID; // Pass identification - JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1) - : FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) { + JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -175,12 +167,11 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) { - return new JumpThreading(InsertFr, Threshold); +FunctionPass *llvm::createJumpThreadingPass(int Threshold) { + return new JumpThreading(Threshold); } -JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) { - 
InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr; +JumpThreadingPass::JumpThreadingPass(int T) { DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } @@ -326,7 +317,7 @@ bool JumpThreading::runOnFunction(Function &F) { std::unique_ptr BFI; std::unique_ptr BPI; if (F.hasProfileData()) { - LoopInfo LI{DominatorTree(F)}; + LoopInfo LI{*DT}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } @@ -491,14 +482,16 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, // at the end of block. RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. -static void replaceFoldableUses(Instruction *Cond, Value *ToVal) { +static bool replaceFoldableUses(Instruction *Cond, Value *ToVal, + BasicBlock *KnownAtEndOfBB) { + bool Changed = false; assert(Cond->getType() == ToVal->getType()); - auto *BB = Cond->getParent(); // We can unconditionally replace all uses in non-local blocks (i.e. uses // strictly dominated by BB), since LVI information is true from the // terminator of BB. - replaceNonLocalUsesWith(Cond, ToVal); - for (Instruction &I : reverse(*BB)) { + if (Cond->getParent() == KnownAtEndOfBB) + Changed |= replaceNonLocalUsesWith(Cond, ToVal); + for (Instruction &I : reverse(*KnownAtEndOfBB)) { // Reached the Cond whose uses we are trying to replace, so there are no // more uses. if (&I == Cond) @@ -507,10 +500,13 @@ static void replaceFoldableUses(Instruction *Cond, Value *ToVal) { // of BB, where we know Cond is ToVal. if (!isGuaranteedToTransferExecutionToSuccessor(&I)) break; - I.replaceUsesOfWith(Cond, ToVal); + Changed |= I.replaceUsesOfWith(Cond, ToVal); } - if (Cond->use_empty() && !Cond->mayHaveSideEffects()) + if (Cond->use_empty() && !Cond->mayHaveSideEffects()) { Cond->eraseFromParent(); + Changed = true; + } + return Changed; } /// Return the cost of duplicating a piece of this block from first non-phi @@ -792,6 +788,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( if (Preference != WantInteger) return false; if (ConstantInt *CI = dyn_cast(BO->getOperand(1))) { + const DataLayout &DL = BO->getModule()->getDataLayout(); PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, WantInteger, RecursionSet, CxtI); @@ -799,7 +796,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Try to use constant folding to simplify the binary operator. for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); + Constant *Folded = + ConstantFoldBinaryOpOperands(BO->getOpcode(), V, CI, DL); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.emplace_back(KC, LHSVal.second); @@ -835,7 +833,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( LHS = CmpLHS->DoPHITranslation(BB, PredBB); RHS = PN->getIncomingValue(i); } - Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); + Value *Res = simplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa(RHS)) continue; @@ -1135,34 +1133,21 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { return ConstantFolded; } - if (CmpInst *CondCmp = dyn_cast(CondInst)) { + // Some of the following optimization can safely work on the unfrozen cond. 
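In `computeValueKnownInPredecessorsImpl` the patch stops building a `ConstantExpr` and asks the constant folder directly. `ConstantFoldBinaryOpOperands` takes the `DataLayout` and may return `nullptr` when the operands do not fold, which the existing `getKnownConstant` check already tolerates. A sketch of the call shape (a hypothetical wrapper; `BO`, `V`, `CI` play the same roles as in the hunk):

    #include "llvm/Analysis/ConstantFolding.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Fold "V <op> CI" if the folder can; nullptr means "did not fold",
    // unlike the old ConstantExpr::get path, which always built a node.
    Constant *foldBinOp(BinaryOperator *BO, Constant *V, ConstantInt *CI) {
      const DataLayout &DL = BO->getModule()->getDataLayout();
      return ConstantFoldBinaryOpOperands(BO->getOpcode(), V, CI, DL);
    }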
+ Value *CondWithoutFreeze = CondInst; + if (auto *FI = dyn_cast(CondInst)) + CondWithoutFreeze = FI->getOperand(0); + + if (CmpInst *CondCmp = dyn_cast(CondWithoutFreeze)) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. - BranchInst *CondBr = dyn_cast(BB->getTerminator()); - Constant *CondConst = dyn_cast(CondCmp->getOperand(1)); - if (CondBr && CondConst) { - // We should have returned as soon as we turn a conditional branch to - // unconditional. Because its no longer interesting as far as jump - // threading is concerned. - assert(CondBr->isConditional() && "Threading on unconditional terminator"); - + if (Constant *CondConst = dyn_cast(CondCmp->getOperand(1))) { LazyValueInfo::Tristate Ret = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), - CondConst, CondBr, /*UseBlockValue=*/false); + CondConst, BB->getTerminator(), + /*UseBlockValue=*/false); if (Ret != LazyValueInfo::Unknown) { - unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0; - unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1; - BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove); - ToRemoveSucc->removePredecessor(BB, true); - BranchInst *UncondBr = - BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); - UncondBr->setDebugLoc(CondBr->getDebugLoc()); - ++NumFolds; - CondBr->eraseFromParent(); - if (CondCmp->use_empty()) - CondCmp->eraseFromParent(); // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This @@ -1170,17 +1155,11 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. - else if (CondCmp->getParent() == BB) { - auto *CI = Ret == LazyValueInfo::True ? - ConstantInt::getTrue(CondCmp->getType()) : - ConstantInt::getFalse(CondCmp->getType()); - replaceFoldableUses(CondCmp, CI); - } - DTU->applyUpdatesPermissive( - {{DominatorTree::Delete, BB, ToRemoveSucc}}); - if (HasProfileData) - BPI->eraseBlock(BB); - return true; + auto *CI = Ret == LazyValueInfo::True ? + ConstantInt::getTrue(CondCmp->getType()) : + ConstantInt::getFalse(CondCmp->getType()); + if (replaceFoldableUses(CondCmp, CI, BB)) + return true; } // We did not manage to simplify this branch, try to see whether @@ -1198,11 +1177,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // for loads that are used by a switch or by the condition for the branch. If // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. - Value *SimplifyValue = CondInst; - - if (auto *FI = dyn_cast(SimplifyValue)) - // Look into freeze's operand - SimplifyValue = FI->getOperand(0); + Value *SimplifyValue = CondWithoutFreeze; if (CmpInst *CondCmp = dyn_cast(SimplifyValue)) if (isa(CondCmp->getOperand(1))) @@ -1227,10 +1202,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in // the current block, see if we can simplify. - PHINode *PN = dyn_cast( - isa(CondInst) ? 
cast(CondInst)->getOperand(0) - : CondInst); - + PHINode *PN = dyn_cast(CondWithoutFreeze); if (PN && PN->getParent() == BB && isa(BB->getTerminator())) return processBranchOnPHI(PN); @@ -1253,6 +1225,17 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { return false; Value *Cond = BI->getCondition(); + // Assuming that predecessor's branch was taken, if pred's branch condition + // (V) implies Cond, Cond can be either true, undef, or poison. In this case, + // freeze(Cond) is either true or a nondeterministic value. + // If freeze(Cond) has only one use, we can freely fold freeze(Cond) to true + // without affecting other instructions. + auto *FICond = dyn_cast(Cond); + if (FICond && FICond->hasOneUse()) + Cond = FICond->getOperand(0); + else + FICond = nullptr; + BasicBlock *CurrentBB = BB; BasicBlock *CurrentPred = BB->getSinglePredecessor(); unsigned Iter = 0; @@ -1269,6 +1252,15 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; Optional Implication = isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); + + // If the branch condition of BB (which is Cond) and CurrentPred are + // exactly the same freeze instruction, Cond can be folded into CondIsTrue. + if (!Implication && FICond && isa(PBI->getCondition())) { + if (cast(PBI->getCondition())->getOperand(0) == + FICond->getOperand(0)) + Implication = CondIsTrue; + } + if (Implication) { BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1); BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0); @@ -1277,6 +1269,9 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { UncondBI->setDebugLoc(BI->getDebugLoc()); ++NumFolds; BI->eraseFromParent(); + if (FICond) + FICond->eraseFromParent(); + DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}}); if (HasProfileData) BPI->eraseBlock(BB); @@ -1338,10 +1333,10 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { combineMetadataForCSE(NLoadI, LoadI, false); }; - // If the returned value is the load itself, replace with an undef. This can + // If the returned value is the load itself, replace with poison. This can // only happen in dead loops. if (AvailableVal == LoadI) - AvailableVal = UndefValue::get(LoadI->getType()); + AvailableVal = PoisonValue::get(LoadI->getType()); if (AvailableVal->getType() != LoadI->getType()) AvailableVal = CastInst::CreateBitOrPointerCast( AvailableVal, LoadI->getType(), "", LoadI); @@ -1566,10 +1561,8 @@ findMostPopularDest(BasicBlock *BB, DestPopularity[PredToDest.second]++; // Find the most popular dest. - using VT = decltype(DestPopularity)::value_type; auto MostPopular = std::max_element( - DestPopularity.begin(), DestPopularity.end(), - [](const VT &L, const VT &R) { return L.second < R.second; }); + DestPopularity.begin(), DestPopularity.end(), llvm::less_second()); // Okay, we have finally picked the most popular destination. return MostPopular->first; @@ -1742,9 +1735,8 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. 
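Several of the hunks above converge on a single idiom: compute `CondWithoutFreeze` once by looking through a `freeze`, then run the comparison, load, and PHI analyses on the unfrozen value. That is sound for analysis purposes because freeze only blocks undef and poison propagation; it never changes a well-defined value. The peeling step, as a minimal sketch:

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Return the value a branch condition is really computed from,
    // looking through at most one freeze, as processBlock now does.
    Value *peelFreeze(Value *Cond) {
      if (auto *FI = dyn_cast<FreezeInst>(Cond))
        return FI->getOperand(0);
      return Cond;
    }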
- else if (OnlyVal && OnlyVal != MultipleVal && - CondInst->getParent() == BB) - replaceFoldableUses(CondInst, OnlyVal); + else if (OnlyVal && OnlyVal != MultipleVal) + replaceFoldableUses(CondInst, OnlyVal, BB); } return true; } @@ -2672,7 +2664,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. - if (Value *IV = SimplifyInstruction( + if (Value *IV = simplifyInstruction( New, {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { ValueMapping[&*BI] = IV; @@ -2912,9 +2904,7 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { continue; // Expand the select. Value *Cond = SI->getCondition(); - if (InsertFreezeWhenUnfoldingSelect && - !isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI, - &DTU->getDomTree())) + if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) Cond = new FreezeInst(Cond, "cond.fr", SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false); BasicBlock *SplitBB = SI->getParent(); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 7fb1a25bdf13..492f4e40395a 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -37,29 +37,27 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -78,7 +76,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -88,6 +85,11 @@ #include using namespace llvm; +namespace llvm { +class BlockFrequencyInfo; +class LPMUpdater; +} // namespace llvm + #define DEBUG_TYPE "licm" STATISTIC(NumCreatedBlocks, "Number of blocks created"); @@ -114,8 +116,7 @@ static cl::opt MaxNumUsesTraversed( // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to -// address the same issue. This flag applies only when LICM uses MemorySSA -// instead on AliasSetTracker. 
LICM calls MemorySSAWalker's +// address the same issue. LICM calls MemorySSAWalker's // getClobberingMemoryAccess, up to the value of the Cap, getting perfect // accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess, // which may not be precise, since optimizeUses is capped. The result is @@ -143,37 +144,32 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, bool LoopNestMode); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, OptimizationRemarkEmitter *ORE); -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI = nullptr); -static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, - AliasSetTracker *CurAST, Loop *CurLoop, - AAResults *AA); -static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, - Loop *CurLoop, Instruction &I, - SinkAndHoistLICMFlags &Flags); -static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA, - MemoryUse &MU); +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation); +static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU, + Loop *CurLoop, Instruction &I, + SinkAndHoistLICMFlags &Flags); +static bool pointerInvalidatedByBlock(BasicBlock &BB, MemorySSA &MSSA, + MemoryUse &MU); static Instruction *cloneInstructionInExitBlock( Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU); + const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU); static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater &MSSAU); static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE); + MemorySSAUpdater &MSSAU, ScalarEvolution *SE); static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref Fn); @@ -188,21 +184,26 @@ struct LoopInvariantCodeMotion { OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} private: unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; }; struct LegacyLICMPass : public LoopPass { static char ID; // Pass identification, replacement for typeid LegacyLICMPass( unsigned LicmMssaOptCap = SetLicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap) - : LoopPass(ID), 
LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) { + unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation = true) + : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation) { initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry()); } @@ -265,7 +266,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, + Opts.AllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); @@ -279,6 +281,16 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, return PA; } +void LICMPass::printPipeline( + raw_ostream &OS, function_ref MapClassName2PassName) { + static_cast *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (Opts.AllowSpeculation ? "" : "no-") << "allowspeculation"; + OS << ">"; +} + PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { @@ -290,7 +302,8 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(LN.getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, + Opts.AllowSpeculation); Loop &OutermostLoop = LN.getOutermostLoop(); bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI, @@ -308,6 +321,16 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, return PA; } +void LNICMPass::printPipeline( + raw_ostream &OS, function_ref MapClassName2PassName) { + static_cast *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (Opts.AllowSpeculation ? "" : "no-") << "allowspeculation"; + OS << ">"; +} + char LegacyLICMPass::ID = 0; INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -321,8 +344,10 @@ INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) { - return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) { + return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); } llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L, @@ -365,6 +390,7 @@ bool LoopInvariantCodeMotion::runOnLoop( bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + MSSA->ensureOptimizedUses(); // If this loop has metadata indicating that LICM is not to be performed then // just exit. @@ -411,14 +437,15 @@ bool LoopInvariantCodeMotion::runOnLoop( if (L->hasDedicatedExits()) Changed |= LoopNestMode ? 
sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI, - DT, BFI, TLI, TTI, L, &MSSAU, + DT, BFI, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, - TLI, TTI, L, &MSSAU, &SafetyInfo, Flags, ORE); + TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE); Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, - &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode); + MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -451,8 +478,7 @@ bool LoopInvariantCodeMotion::runOnLoop( PredIteratorCache PIC; // Promoting one set of accesses may make the pointers for another set - // loop invariant, so run this in a loop (with the MaybePromotable set - // decreasing in size over time). + // loop invariant, so run this in a loop. bool Promoted = false; bool LocalPromoted; do { @@ -460,8 +486,8 @@ bool LoopInvariantCodeMotion::runOnLoop( for (const SmallSetVector &PointerMustAliases : collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, + DT, TLI, L, MSSAU, &SafetyInfo, ORE, LicmAllowSpeculation); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -502,17 +528,17 @@ bool LoopInvariantCodeMotion::runOnLoop( bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, Loop *OutermostLoop) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && "Unexpected input to sinkRegion."); - // We want to visit children before parents. We will enque all the parents + // We want to visit children before parents. We will enqueue all the parents // before their children in the worklist and process the worklist in reverse // order. SmallVector Worklist = collectChildrenInLoop(N, CurLoop); @@ -550,8 +576,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (!I.mayHaveSideEffects() && isNotUsedOrFreeInLoop(I, LoopNestMode ? 
OutermostLoop : CurLoop, SafetyInfo, TTI, FreeInLoop, LoopNestMode) && - canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/nullptr, MSSAU, true, - &Flags, ORE)) { + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE)) { if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; @@ -564,14 +589,14 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, } } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); return Changed; } bool llvm::sinkRegionForLoopNest( DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { bool Changed = false; @@ -600,7 +625,7 @@ private: LoopInfo *LI; DominatorTree *DT; Loop *CurLoop; - MemorySSAUpdater *MSSAU; + MemorySSAUpdater &MSSAU; // A map of blocks in the loop to the block their instructions will be hoisted // to. @@ -612,7 +637,7 @@ private: public: ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop, - MemorySSAUpdater *MSSAU) + MemorySSAUpdater &MSSAU) : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {} void registerPossiblyHoistableBranch(BranchInst *BI) { @@ -788,7 +813,7 @@ public: if (HoistTarget == InitialPreheader) { // Phis in the loop header now need to use the new preheader. InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc); - MSSAU->wireOldPredecessorsToNewImmediatePredecessor( + MSSAU.wireOldPredecessorsToNewImmediatePredecessor( HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget}); // The new preheader dominates the loop header. DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc); @@ -822,13 +847,14 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, Loop *CurLoop, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE, bool LoopNestMode) { + OptimizationRemarkEmitter *ORE, bool LoopNestMode, + bool AllowSpeculation) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion."); ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU); @@ -873,11 +899,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // and we have accurately duplicated the control flow from the loop header // to that block. 
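`hoistRegion` now threads an `AllowSpeculation` flag down to `isSafeToExecuteUnconditionally`, so a pipeline can run LICM with speculative hoisting disabled (spelled `licm<no-allowspeculation>`, going by the `printPipeline` hunks earlier). The gate itself is a one-line change; a reduced sketch with stand-in predicates for the two existing queries:

    #include "llvm/IR/Instruction.h"

    // Stand-ins for isSafeToSpeculativelyExecute and the loop-safety
    // query; assume their usual semantics, conservatively stubbed here.
    static bool isSafeToSpeculate(const llvm::Instruction &) { return false; }
    static bool guaranteedToExecuteInLoop(const llvm::Instruction &) {
      return false;
    }

    // With AllowSpeculation off, only instructions that provably run on
    // every iteration may be hoisted; speculation becomes an opt-in.
    bool safeToExecuteUnconditionally(const llvm::Instruction &I,
                                      bool AllowSpeculation) {
      if (AllowSpeculation && isSafeToSpeculate(I))
        return true;
      return guaranteedToExecuteInLoop(I);
    }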
if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, - true, &Flags, ORE) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) { + CurLoop->getLoopPreheader()->getTerminator(), AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -982,7 +1007,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, } } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // Now that we've finished hoisting make sure that LI and DT are still // valid. @@ -1083,30 +1108,19 @@ bool isHoistableAndSinkableInst(Instruction &I) { isa(I) || isa(I) || isa(I) || isa(I)); } -/// Return true if all of the alias sets within this AST are known not to -/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop. -bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU, - const Loop *L) { - if (CurAST) { - for (AliasSet &AS : *CurAST) { - if (!AS.isForwardingAliasSet() && AS.isMod()) { - return false; - } - } - return true; - } else { /*MSSAU*/ - for (auto *BB : L->getBlocks()) - if (MSSAU->getMemorySSA()->getBlockDefs(BB)) - return false; - return true; - } +/// Return true if MSSA knows there are no MemoryDefs in the loop. +bool isReadOnly(const MemorySSAUpdater &MSSAU, const Loop *L) { + for (auto *BB : L->getBlocks()) + if (MSSAU.getMemorySSA()->getBlockDefs(BB)) + return false; + return true; } /// Return true if I is the only Instruction with a MemoryAccess in L. bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, - const MemorySSAUpdater *MSSAU) { + const MemorySSAUpdater &MSSAU) { for (auto *BB : L->getBlocks()) - if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) { + if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) { int NotAPhi = 0; for (const auto &Acc : *Accs) { if (isa(&Acc)) @@ -1121,22 +1135,15 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, } bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, - Loop *CurLoop, AliasSetTracker *CurAST, - MemorySSAUpdater *MSSAU, + Loop *CurLoop, MemorySSAUpdater &MSSAU, bool TargetExecutesOncePerLoop, - SinkAndHoistLICMFlags *Flags, + SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { - assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) && - "Either AliasSetTracker or MemorySSA should be initialized."); - // If we don't understand the instruction, bail early. if (!isHoistableAndSinkableInst(I)) return false; - MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr; - if (MSSA) - assert(Flags != nullptr && "Flags cannot be null."); - + MemorySSA *MSSA = MSSAU.getMemorySSA(); // Loads have extra constraints we have to verify before we can hoist them. 
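With the AliasSetTracker arm deleted, `isReadOnly` collapses to a pure MemorySSA query: a loop is read-only exactly when none of its blocks carries a defs list. The equivalent walk, spelled out as a standalone helper (same calls as the hunk):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/MemorySSA.h"

    using namespace llvm;

    // A non-null defs list means the block holds a MemoryDef (or a
    // MemoryPhi merging one), i.e. something in the loop may write.
    bool loopIsReadOnly(const MemorySSA &MSSA, const Loop *L) {
      for (BasicBlock *BB : L->getBlocks())
        if (MSSA.getBlockDefs(BB))
          return false;
      return true;
    }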
if (LoadInst *LI = dyn_cast(&I)) { if (!LI->isUnordered()) @@ -1156,13 +1163,8 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (isLoadInvariantInLoop(LI, DT, CurLoop)) return true; - bool Invalidated; - if (CurAST) - Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST, - CurLoop, AA); - else - Invalidated = pointerInvalidatedByLoopWithMSSA( - MSSA, cast(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags); + bool Invalidated = pointerInvalidatedByLoop( + MSSA, cast(MSSA->getMemoryAccess(LI)), CurLoop, I, Flags); // Check loop-invariant address because this may also be a sinkable load // whose address is not necessarily loop-invariant. if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand())) @@ -1210,24 +1212,17 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (AAResults::onlyAccessesArgPointees(Behavior)) { // TODO: expand to writeable arguments for (Value *Op : CI->args()) - if (Op->getType()->isPointerTy()) { - bool Invalidated; - if (CurAST) - Invalidated = pointerInvalidatedByLoop( - MemoryLocation::getBeforeOrAfter(Op), CurAST, CurLoop, AA); - else - Invalidated = pointerInvalidatedByLoopWithMSSA( + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop( MSSA, cast(MSSA->getMemoryAccess(CI)), CurLoop, I, - *Flags); - if (Invalidated) - return false; - } + Flags)) + return false; return true; } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate. - if (isReadOnly(CurAST, MSSAU, CurLoop)) + if (isReadOnly(MSSAU, CurLoop)) return true; } @@ -1238,21 +1233,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, } else if (auto *FI = dyn_cast(&I)) { // Fences alias (most) everything to provide ordering. For the moment, // just give up if there are any other memory operations in the loop. - if (CurAST) { - auto Begin = CurAST->begin(); - assert(Begin != CurAST->end() && "must contain FI"); - if (std::next(Begin) != CurAST->end()) - // constant memory for instance, TODO: handle better - return false; - auto *UniqueI = Begin->getUniqueInstruction(); - if (!UniqueI) - // other memory op, give up - return false; - (void)FI; // suppress unused variable warning - assert(UniqueI == FI && "AS must contain FI"); - return true; - } else // MSSAU - return isOnlyMemoryAccess(FI, CurLoop, MSSAU); + return isOnlyMemoryAccess(FI, CurLoop, MSSAU); } else if (auto *SI = dyn_cast(&I)) { if (!SI->isUnordered()) return false; // Don't sink/hoist volatile or ordered atomic store! @@ -1262,68 +1243,54 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // load store promotion instead. TODO: We can extend this to cases where // there is exactly one write to the location and that write dominates an // arbitrary number of reads in the loop. - if (CurAST) { - auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI)); - - if (AS.isRef() || !AS.isMustAlias()) - // Quick exit test, handled by the full path below as well. 
- return false; - auto *UniqueI = AS.getUniqueInstruction(); - if (!UniqueI) - // other memory op, give up - return false; - assert(UniqueI == SI && "AS must contain SI"); + if (isOnlyMemoryAccess(SI, CurLoop, MSSAU)) return true; - } else { // MSSAU - if (isOnlyMemoryAccess(SI, CurLoop, MSSAU)) - return true; - // If there are more accesses than the Promotion cap or no "quota" to - // check clobber, then give up as we're not walking a list that long. - if (Flags->tooManyMemoryAccesses() || Flags->tooManyClobberingCalls()) - return false; - // If there are interfering Uses (i.e. their defining access is in the - // loop), or ordered loads (stored as Defs!), don't move this store. - // Could do better here, but this is conservatively correct. - // TODO: Cache set of Uses on the first walk in runOnLoop, update when - // moving accesses. Can also extend to dominating uses. - auto *SIMD = MSSA->getMemoryAccess(SI); - for (auto *BB : CurLoop->getBlocks()) - if (auto *Accesses = MSSA->getBlockAccesses(BB)) { - for (const auto &MA : *Accesses) - if (const auto *MU = dyn_cast(&MA)) { - auto *MD = MU->getDefiningAccess(); - if (!MSSA->isLiveOnEntryDef(MD) && - CurLoop->contains(MD->getBlock())) - return false; - // Disable hoisting past potentially interfering loads. Optimized - // Uses may point to an access outside the loop, as getClobbering - // checks the previous iteration when walking the backedge. - // FIXME: More precise: no Uses that alias SI. - if (!Flags->getIsSink() && !MSSA->dominates(SIMD, MU)) - return false; - } else if (const auto *MD = dyn_cast(&MA)) { - if (auto *LI = dyn_cast(MD->getMemoryInst())) { - (void)LI; // Silence warning. - assert(!LI->isUnordered() && "Expected unordered load"); + // If there are more accesses than the Promotion cap or no "quota" to + // check clobber, then give up as we're not walking a list that long. + if (Flags.tooManyMemoryAccesses() || Flags.tooManyClobberingCalls()) + return false; + // If there are interfering Uses (i.e. their defining access is in the + // loop), or ordered loads (stored as Defs!), don't move this store. + // Could do better here, but this is conservatively correct. + // TODO: Cache set of Uses on the first walk in runOnLoop, update when + // moving accesses. Can also extend to dominating uses. + auto *SIMD = MSSA->getMemoryAccess(SI); + for (auto *BB : CurLoop->getBlocks()) + if (auto *Accesses = MSSA->getBlockAccesses(BB)) { + for (const auto &MA : *Accesses) + if (const auto *MU = dyn_cast(&MA)) { + auto *MD = MU->getDefiningAccess(); + if (!MSSA->isLiveOnEntryDef(MD) && + CurLoop->contains(MD->getBlock())) + return false; + // Disable hoisting past potentially interfering loads. Optimized + // Uses may point to an access outside the loop, as getClobbering + // checks the previous iteration when walking the backedge. + // FIXME: More precise: no Uses that alias SI. + if (!Flags.getIsSink() && !MSSA->dominates(SIMD, MU)) + return false; + } else if (const auto *MD = dyn_cast(&MA)) { + if (auto *LI = dyn_cast(MD->getMemoryInst())) { + (void)LI; // Silence warning. + assert(!LI->isUnordered() && "Expected unordered load"); + return false; + } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast(MD->getMemoryInst())) { + // Check if the call may read from the memory location written + // to by SI. Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. 
+ ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) return false; - } - // Any call, while it may not be clobbering SI, it may be a use. - if (auto *CI = dyn_cast(MD->getMemoryInst())) { - // Check if the call may read from the memory location written - // to by SI. Check CI's attributes and arguments; the number of - // such checks performed is limited above by NoOfMemAccTooLarge. - ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); - if (isModOrRefSet(MRI)) - return false; - } } - } - auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); - Flags->incrementClobberingCalls(); - // If there are no clobbering Defs in the loop, store is safe to hoist. - return MSSA->isLiveOnEntryDef(Source) || - !CurLoop->contains(Source->getBlock()); - } + } + } + auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); + Flags.incrementClobberingCalls(); + // If there are no clobbering Defs in the loop, store is safe to hoist. + return MSSA->isLiveOnEntryDef(Source) || + !CurLoop->contains(Source->getBlock()); } assert(!I.mayReadOrWriteMemory() && "unhandled aliasing"); @@ -1421,7 +1388,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, static Instruction *cloneInstructionInExitBlock( Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) { + const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU) { Instruction *New; if (auto *CI = dyn_cast(&I)) { const auto &BlockColors = SafetyInfo->getBlockColors(); @@ -1457,16 +1424,16 @@ static Instruction *cloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { + if (MSSAU.getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. 
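The re-indented store case ends with the decisive MemorySSA query: ask the skip-self walker for the store's clobbering access and hoist only if that access is live-on-entry or defined outside the loop. That final test, condensed into a helper (the API calls are the ones used in the hunk):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/MemorySSA.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // A store is hoistable w.r.t. memory if nothing in the loop clobbers
    // it; the skip-self walker ignores the store's own access.
    bool storeHasNoClobberInLoop(MemorySSA &MSSA, const Loop *CurLoop,
                                 StoreInst *SI) {
      MemoryAccess *Source =
          MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(SI);
      return MSSA.isLiveOnEntryDef(Source) ||
             !CurLoop->contains(Source->getBlock());
    }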
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( + MemoryAccess *NewMemAcc = MSSAU.createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); if (NewMemAcc) { if (auto *MemDef = dyn_cast(NewMemAcc)) - MSSAU->insertDef(MemDef, /*RenameUses=*/true); + MSSAU.insertDef(MemDef, /*RenameUses=*/true); else { auto *MemUse = cast(NewMemAcc); - MSSAU->insertUse(MemUse, /*RenameUses=*/true); + MSSAU.insertUse(MemUse, /*RenameUses=*/true); } } } @@ -1492,25 +1459,22 @@ static Instruction *cloneInstructionInExitBlock( } static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { - if (MSSAU) - MSSAU->removeMemoryAccess(&I); + MemorySSAUpdater &MSSAU) { + MSSAU.removeMemoryAccess(&I); SafetyInfo.removeInstruction(&I); I.eraseFromParent(); } static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); - if (MSSAU) - if (MemoryUseOrDef *OldMemAcc = cast_or_null( - MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), - MemorySSA::BeforeTerminator); + if (MemoryUseOrDef *OldMemAcc = cast_or_null( + MSSAU.getMemorySSA()->getMemoryAccess(&I))) + MSSAU.moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::BeforeTerminator); if (SE) SE->forgetValue(&I); } @@ -1519,7 +1483,7 @@ static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap &SunkCopies, const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater &MSSAU) { assert(isTriviallyReplaceablePHI(*TPN, *I) && "Expect only trivially replaceable PHI"); BasicBlock *ExitBlock = TPN->getParent(); @@ -1625,7 +1589,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, OptimizationRemarkEmitter *ORE) { bool Changed = false; LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); @@ -1642,7 +1606,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, continue; if (!DT->isReachableFromEntry(User->getParent())) { - U = UndefValue::get(I.getType()); + U = PoisonValue::get(I.getType()); Changed = true; continue; } @@ -1655,7 +1619,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // unreachable. BasicBlock *BB = PN->getIncomingBlock(U); if (!DT->isReachableFromEntry(BB)) { - U = UndefValue::get(I.getType()); + U = PoisonValue::get(I.getType()); Changed = true; continue; } @@ -1669,7 +1633,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // Split predecessors of the PHI so that we can make users trivially // replaceable. - splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU); + splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, &MSSAU); // Should rebuild the iterators, as they may be invalidated by // splitPredecessorsOfLoopExit(). 
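The pointer-to-reference migration pays off in helpers like `eraseInstruction` above: with the AliasSetTracker path gone, the updater can never be null, so the `if (MSSAU)` guards disappear along with it. The before/after shape on a reduced helper (the real one also notifies `SafetyInfo`):

    #include "llvm/Analysis/MemorySSAUpdater.h"
    #include "llvm/IR/Instruction.h"

    // Before: void erase(Instruction &I, MemorySSAUpdater *MSSAU) {
    //           if (MSSAU) MSSAU->removeMemoryAccess(&I); ...
    //         }
    // After: the reference documents the invariant and deletes the branch.
    void erase(llvm::Instruction &I, llvm::MemorySSAUpdater &MSSAU) {
      MSSAU.removeMemoryAccess(&I); // always valid now
      I.eraseFromParent();
    }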
@@ -1720,7 +1684,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); PN->replaceAllUsesWith(New); - eraseInstruction(*PN, *SafetyInfo, nullptr); + eraseInstruction(*PN, *SafetyInfo, MSSAU); Changed = true; } return Changed; @@ -1731,7 +1695,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": " << I << "\n"); @@ -1774,14 +1738,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI) { - if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation) { + if (AllowSpeculation && isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) return true; bool GuaranteedToExecute = @@ -1809,7 +1771,7 @@ class LoopPromoter : public LoadAndStorePromoter { SmallVectorImpl &LoopInsertPts; SmallVectorImpl &MSSAInsertPts; PredIteratorCache &PredCache; - MemorySSAUpdater *MSSAU; + MemorySSAUpdater &MSSAU; LoopInfo &LI; DebugLoc DL; Align Alignment; @@ -1841,7 +1803,7 @@ public: SmallVectorImpl &LEB, SmallVectorImpl &LIP, SmallVectorImpl &MSSAIP, PredIteratorCache &PIC, - MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, + MemorySSAUpdater &MSSAU, LoopInfo &li, DebugLoc dl, Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), @@ -1883,14 +1845,14 @@ public: MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i]; MemoryAccess *NewMemAcc; if (!MSSAInsertPoint) { - NewMemAcc = MSSAU->createMemoryAccessInBB( + NewMemAcc = MSSAU.createMemoryAccessInBB( NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning); } else { NewMemAcc = - MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); + MSSAU.createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); } MSSAInsertPts[i] = NewMemAcc; - MSSAU->insertDef(cast(NewMemAcc), true); + MSSAU.insertDef(cast(NewMemAcc), true); // FIXME: true for safety, false may still be correct. 
} } @@ -1902,7 +1864,7 @@ public: void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); - MSSAU->removeMemoryAccess(I); + MSSAU.removeMemoryAccess(I); } bool shouldDelete(Instruction *I) const override { @@ -1948,8 +1910,8 @@ bool llvm::promoteLoopAccessesToScalars( SmallVectorImpl &InsertPts, SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE) { + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, bool AllowSpeculation) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -1997,6 +1959,7 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool StoreIsGuanteedToExecute = false; bool FoundLoadToPromote = false; SmallVector LoopUses; @@ -2031,9 +1994,9 @@ bool llvm::promoteLoopAccessesToScalars( // different sizes. While we are at it, collect alignment and AA info. Type *AccessTy = nullptr; for (Value *ASIV : PointerMustAliases) { - for (User *U : ASIV->users()) { + for (Use &U : ASIV->uses()) { // Ignore instructions that are outside the loop. - Instruction *UI = dyn_cast(U); + Instruction *UI = dyn_cast(U.getUser()); if (!UI || !CurLoop->contains(UI)) continue; @@ -2054,16 +2017,16 @@ bool llvm::promoteLoopAccessesToScalars( // to execute does as well. Thus we can increase our guaranteed // alignment as well. if (!DereferenceableInPH || (InstAlignment > Alignment)) - if (isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop, - SafetyInfo, ORE, - Preheader->getTerminator())) { + if (isSafeToExecuteUnconditionally( + *Load, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AllowSpeculation)) { DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); } } else if (const StoreInst *Store = dyn_cast(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. - if (UI->getOperand(1) != ASIV) + if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) continue; if (!Store->isUnordered()) return false; @@ -2077,10 +2040,12 @@ bool llvm::promoteLoopAccessesToScalars( // alignment than any other guaranteed stores, in which case we can // raise the alignment on the promoted store. Align InstAlignment = Store->getAlign(); - + bool GuaranteedToExecute = + SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop); + StoreIsGuanteedToExecute |= GuaranteedToExecute; if (!DereferenceableInPH || !SafeToInsertStore || (InstAlignment > Alignment)) { - if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) { + if (GuaranteedToExecute) { DereferenceableInPH = true; SafeToInsertStore = true; Alignment = std::max(Alignment, InstAlignment); @@ -2194,32 +2159,37 @@ bool llvm::promoteLoopAccessesToScalars( // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
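The promotion loop above switches from iterating `users()` to iterating `uses()`: with a `Use` in hand, `U.getOperandNo()` distinguishes a store *to* the promoted pointer from a store *of* it, which matters when the same value appears as both the stored value and the address. A sketch of the per-use test (hypothetical helper name):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Use.h"

    using namespace llvm;

    // Only the address operand of a store is interesting for promotion;
    // visiting uses (not users) lets each operand position be checked.
    bool isStoreToPointer(const Use &U) {
      auto *Store = dyn_cast<StoreInst>(U.getUser());
      return Store &&
             U.getOperandNo() == StoreInst::getPointerOperandIndex();
    }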
- LoadInst *PreheaderLoad = new LoadInst( - AccessTy, SomePtr, SomePtr->getName() + ".promoted", - Preheader->getTerminator()); - if (SawUnorderedAtomic) - PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); - PreheaderLoad->setDebugLoc(DebugLoc()); - if (AATags) - PreheaderLoad->setAAMetadata(AATags); - SSA.AddAvailableValue(Preheader, PreheaderLoad); - - MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( - PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); - MemoryUse *NewMemUse = cast(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); + LoadInst *PreheaderLoad = nullptr; + if (FoundLoadToPromote || !StoreIsGuanteedToExecute) { + PreheaderLoad = + new LoadInst(AccessTy, SomePtr, SomePtr->getName() + ".promoted", + Preheader->getTerminator()); + if (SawUnorderedAtomic) + PreheaderLoad->setOrdering(AtomicOrdering::Unordered); + PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setDebugLoc(DebugLoc()); + if (AATags) + PreheaderLoad->setAAMetadata(AATags); + + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU.createMemoryAccessInBB( + PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); + MemoryUse *NewMemUse = cast(PreheaderLoadMemoryAccess); + MSSAU.insertUse(NewMemUse, /*RenameUses=*/true); + SSA.AddAvailableValue(Preheader, PreheaderLoad); + } else { + SSA.AddAvailableValue(Preheader, PoisonValue::get(AccessTy)); + } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // If the SSAUpdater didn't use the load in the preheader, just zap it now. - if (PreheaderLoad->use_empty()) + if (PreheaderLoad && PreheaderLoad->use_empty()) eraseInstruction(*PreheaderLoad, *SafetyInfo, MSSAU); return true; @@ -2246,8 +2216,7 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { return false; }; - // Populate AST with potentially promotable accesses and remove them from - // MaybePromotable, so they will not be checked again on the next iteration. + // Populate AST with potentially promotable accesses. SmallPtrSet AttemptingPromotion; foreachMemoryAccess(MSSA, L, [&](Instruction *I) { if (IsPotentiallyPromotable(I)) { @@ -2286,15 +2255,9 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { return Result; } -static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, - AliasSetTracker *CurAST, Loop *CurLoop, - AAResults *AA) { - return CurAST->getAliasSetFor(MemLoc).isMod(); -} - -bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, - Loop *CurLoop, Instruction &I, - SinkAndHoistLICMFlags &Flags) { +static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU, + Loop *CurLoop, Instruction &I, + SinkAndHoistLICMFlags &Flags) { // For hoisting, use the walker to determine safety if (!Flags.getIsSink()) { MemoryAccess *Source; @@ -2329,17 +2292,16 @@ bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, if (Flags.tooManyMemoryAccesses()) return true; for (auto *BB : CurLoop->getBlocks()) - if (pointerInvalidatedByBlockWithMSSA(*BB, *MSSA, *MU)) + if (pointerInvalidatedByBlock(*BB, *MSSA, *MU)) return true; // When sinking, the source block may not be part of the loop so check it. 
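The final promotion hunk makes the preheader load conditional: when there are no promotable loads and some store is guaranteed to execute (tracked in the flag the hunk spells `StoreIsGuanteedToExecute`), every value leaving the loop comes from one of the stores, so the pre-loop memory contents are never observed and the SSAUpdater can be seeded with `poison` instead. The decision, reduced to its core (stand-in names for the two accumulated flags):

    #include "llvm/IR/Constants.h"

    // Only materialize a preheader load when the initial value can
    // actually be observed inside or after the loop.
    bool needsPreheaderLoad(bool FoundLoadToPromote,
                            bool StoreIsGuaranteedToExecute) {
      return FoundLoadToPromote || !StoreIsGuaranteedToExecute;
    }

    // Otherwise the seed is never read and may be poison, as in the hunk:
    //   SSA.AddAvailableValue(Preheader, PoisonValue::get(AccessTy));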
if (!CurLoop->contains(&I)) - return pointerInvalidatedByBlockWithMSSA(*I.getParent(), *MSSA, *MU); + return pointerInvalidatedByBlock(*I.getParent(), *MSSA, *MU); return false; } -bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA, - MemoryUse &MU) { +bool pointerInvalidatedByBlock(BasicBlock &BB, MemorySSA &MSSA, MemoryUse &MU) { if (const auto *Accesses = MSSA.getBlockDefs(&BB)) for (const auto &MA : *Accesses) if (const auto *MD = dyn_cast(&MA)) diff --git a/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp index 1c3ff1a61b7e..c063c0d3c88a 100644 --- a/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp +++ b/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" using namespace llvm; #define DEBUG_TYPE "loop-accesses" diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index d438d56e38ca..2b9800f11912 100644 --- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -8,20 +8,15 @@ #include "llvm/Transforms/Scalar/LoopBoundSplit.h" #include "llvm/ADT/Sequence.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-bound-split" @@ -33,26 +28,23 @@ using namespace PatternMatch; namespace { struct ConditionInfo { /// Branch instruction with this condition - BranchInst *BI; + BranchInst *BI = nullptr; /// ICmp instruction with this condition - ICmpInst *ICmp; + ICmpInst *ICmp = nullptr; /// Preciate info - ICmpInst::Predicate Pred; + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; /// AddRec llvm value - Value *AddRecValue; + Value *AddRecValue = nullptr; /// Non PHI AddRec llvm value Value *NonPHIAddRecValue; /// Bound llvm value - Value *BoundValue; + Value *BoundValue = nullptr; /// AddRec SCEV - const SCEVAddRecExpr *AddRecSCEV; + const SCEVAddRecExpr *AddRecSCEV = nullptr; /// Bound SCEV - const SCEV *BoundSCEV; + const SCEV *BoundSCEV = nullptr; - ConditionInfo() - : BI(nullptr), ICmp(nullptr), Pred(ICmpInst::BAD_ICMP_PREDICATE), - AddRecValue(nullptr), BoundValue(nullptr), AddRecSCEV(nullptr), - BoundSCEV(nullptr) {} + ConditionInfo() = default; }; } // namespace diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 57e36e5b9b90..9590fbbb1994 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include 
"llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -30,9 +29,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -236,15 +233,14 @@ struct Prefetch { /// The address formula for this prefetch as returned by ScalarEvolution. const SCEVAddRecExpr *LSCEVAddRec; /// The point of insertion for the prefetch instruction. - Instruction *InsertPt; + Instruction *InsertPt = nullptr; /// True if targeting a write memory access. - bool Writes; + bool Writes = false; /// The (first seen) prefetched instruction. - Instruction *MemI; + Instruction *MemI = nullptr; /// Constructor to create a new Prefetch for \p I. - Prefetch(const SCEVAddRecExpr *L, Instruction *I) - : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) { + Prefetch(const SCEVAddRecExpr *L, Instruction *I) : LSCEVAddRec(L) { addInstruction(I); }; @@ -303,7 +299,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { } Metrics.analyzeBasicBlock(BB, *TTI, EphValues); } - unsigned LoopSize = Metrics.NumInsts; + + if (!Metrics.NumInsts.isValid()) + return MadeChange; + + unsigned LoopSize = *Metrics.NumInsts.getValue(); if (!LoopSize) LoopSize = 1; diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 361d6c0d9381..93f3cd704196 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,12 +17,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" @@ -192,13 +192,13 @@ getValueOnFirstIteration(Value *V, DenseMap &FirstIterValue, getValueOnFirstIteration(BO->getOperand(0), FirstIterValue, SQ); Value *RHS = getValueOnFirstIteration(BO->getOperand(1), FirstIterValue, SQ); - FirstIterV = SimplifyBinOp(BO->getOpcode(), LHS, RHS, SQ); + FirstIterV = simplifyBinOp(BO->getOpcode(), LHS, RHS, SQ); } else if (auto *Cmp = dyn_cast(V)) { Value *LHS = getValueOnFirstIteration(Cmp->getOperand(0), FirstIterValue, SQ); Value *RHS = getValueOnFirstIteration(Cmp->getOperand(1), FirstIterValue, SQ); - FirstIterV = SimplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ); + FirstIterV = simplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ); } else if (auto *Select = dyn_cast(V)) { Value *Cond = getValueOnFirstIteration(Select->getCondition(), FirstIterValue, SQ); @@ -458,13 +458,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, if (ExitBlock && isLoopNeverExecuted(L)) { LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); // We need to forget the loop before setting the incoming values of the exit - // phis to undef, so we properly invalidate the SCEV expressions for those + // phis to poison, so we properly invalidate the SCEV expressions for those // phis. SE.forgetLoop(L); - // Set incoming value to undef for phi nodes in the exit block. + // Set incoming value to poison for phi nodes in the exit block. 
for (PHINode &P : ExitBlock->phis()) { std::fill(P.incoming_values().begin(), P.incoming_values().end(), - UndefValue::get(P.getType())); + PoisonValue::get(P.getType())); } ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(), diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0f4c767c1e4c..03a10cb36bb6 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -47,7 +47,6 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" @@ -231,7 +230,7 @@ public: // having to update as many def-use and use-def chains. for (auto *Inst : reverse(Unused)) { if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType())); Inst->eraseFromParent(); } } @@ -601,7 +600,7 @@ private: {LLVMLoopDistributeFollowupAll, Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential : LLVMLoopDistributeFollowupCoincident}); - if (PartitionID.hasValue()) { + if (PartitionID) { Loop *NewLoop = Part->getDistributedLoop(); NewLoop->setLoopID(PartitionID.getValue()); } @@ -770,19 +769,19 @@ public: // Don't distribute the loop if we need too many SCEV run-time checks, or // any if it's illegal. - const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); + const SCEVPredicate &Pred = LAI->getPSE().getPredicate(); if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); } - if (Pred.getComplexity() > (IsForced.getValueOr(false) + if (Pred.getComplexity() > (IsForced.value_or(false) ? PragmaDistributeSCEVCheckThreshold : DistributeSCEVCheckThreshold)) return fail("TooManySCEVRuntimeChecks", "too many SCEV run-time checks needed.\n"); - if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) + if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L)) return fail("HeuristicDisabled", "distribution heuristic disabled"); LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); @@ -859,7 +858,7 @@ public: /// Provide diagnostics then \return with false. bool fail(StringRef RemarkName, StringRef Message) { LLVMContext &Ctx = F->getContext(); - bool Forced = isForced().getValueOr(false); + bool Forced = isForced().value_or(false); LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); @@ -991,7 +990,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, // If distribution was forced for the specific loop to be // enabled/disabled, follow that. Otherwise use the global flag. 
- if (LDL.isForced().getValueOr(EnableLoopDistribute)) + if (LDL.isForced().value_or(EnableLoopDistribute)) Changed |= LDL.processLoop(GetLAA); } diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index c46db4e63bfe..f36193fc468e 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -64,12 +65,12 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -210,8 +211,9 @@ struct FlattenInfo { if (!MatchedItCount) return false; - // Look through extends if the IV has been widened. - if (Widened && + // Look through extends if the IV has been widened. Don't look through + // extends if we already looked through a trunc. + if (Widened && IsAdd && (isa(MatchedItCount) || isa(MatchedItCount))) { assert(MatchedItCount->getType() == InnerInductionPHI->getType() && "Unexpected type mismatch in types after widening"); @@ -410,7 +412,7 @@ static bool findLoopComponents( // pre-header and one from the latch. The incoming latch value is the // increment variable. Increment = - dyn_cast(InductionPHI->getIncomingValueForBlock(Latch)); + cast(InductionPHI->getIncomingValueForBlock(Latch)); if (Increment->hasNUsesOrMore(3)) { LLVM_DEBUG(dbgs() << "Could not find valid increment\n"); return false; @@ -921,7 +923,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + MSSAU ? MSSAU.getPointer() : nullptr); if (!Changed) return PreservedAnalyses::all(); @@ -987,7 +989,7 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) { for (Loop *L : *LI) { auto LN = LoopNest::getLoopNest(*L, *SE); Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + MSSAU ? 
MSSAU.getPointer() : nullptr); } return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index bf4d275e04ba..d94b767c7b63 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -117,7 +117,7 @@ static cl::opt FusionDependenceAnalysis( "Use the dependence analysis interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", "Use all available analyses")), - cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); + cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); static cl::opt FusionPeelMaxCount( "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, @@ -128,7 +128,7 @@ static cl::opt FusionPeelMaxCount( static cl::opt VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), - cl::Hidden, cl::init(false), cl::ZeroOrMore); + cl::Hidden, cl::init(false)); #endif namespace { @@ -178,12 +178,12 @@ struct FusionCandidate { /// FusionCandidateCompare function, required by FusionCandidateSet to /// determine where the FusionCandidate should be inserted into the set. These /// are used to establish ordering of the FusionCandidates based on dominance. - const DominatorTree *DT; + DominatorTree &DT; const PostDominatorTree *PDT; OptimizationRemarkEmitter &ORE; - FusionCandidate(Loop *L, const DominatorTree *DT, + FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), @@ -192,7 +192,6 @@ struct FusionCandidate { GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { - assert(DT && "Expected non-null DT!"); // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect // the memory reads and writes If any instructions that prevent fusion are @@ -391,7 +390,7 @@ struct FusionCandidateCompare { /// IF RHS dominates LHS and LHS post-dominates RHS, return false; bool operator()(const FusionCandidate &LHS, const FusionCandidate &RHS) const { - const DominatorTree *DT = LHS.DT; + const DominatorTree *DT = &(LHS.DT); BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); @@ -646,7 +645,7 @@ private: for (Loop *L : LV) { TTI::PeelingPreferences PP = gatherPeelingPreferences(L, SE, TTI, None, None); - FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); + FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); if (!CurrCand.isEligibleForFusion(SE)) continue; @@ -991,7 +990,7 @@ private: FuseCounter); FusionCandidate FusedCand( - performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE, + performFusion((Peel ? 
FC0Copy : *FC0), *FC1), DT, &PDT, ORE, FC0Copy.PP); FusedCand.verify(); assert(FusedCand.isEligibleForFusion(SE) && diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 318c4c06f0f7..88d6a7aff3c9 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -61,7 +61,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -346,7 +345,7 @@ INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom", Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); } static void deleteDeadInstruction(Instruction *I) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } @@ -798,7 +797,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl &SL, } /// processLoopMemIntrinsic - Template function for calling different processor -/// functions based on mem instrinsic type. +/// functions based on mem intrinsic type. template bool LoopIdiomRecognize::processLoopMemIntrinsic( BasicBlock *BB, @@ -995,9 +994,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet MSIs; MSIs.insert(MSI); return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()), - MaybeAlign(MSI->getDestAlignment()), - SplatValue, MSI, MSIs, Ev, BECount, - IsNegStride, /*IsLoopMemset=*/true); + MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev, + BECount, IsNegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -1101,6 +1099,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { + Module *M = TheStore->getModule(); Value *SplatValue = isBytewiseValue(StoredVal, *DL); Constant *PatternValue = nullptr; @@ -1173,6 +1172,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( CallInst *NewCall; if (SplatValue) { AAMDNodes AATags = TheStore->getAAMetadata(); + for (Instruction *Store : Stores) + AATags = AATags.merge(Store->getAAMetadata()); if (auto CI = dyn_cast(NumBytes)) AATags = AATags.extendTo(CI->getZExtValue()); else @@ -1181,15 +1182,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( NewCall = Builder.CreateMemSet( BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); - } else { + } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; - FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntIdxTy); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, + Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
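The processLoopStridedStore hunks above change the non-splat path so that memset_pattern16 is only formed when isLibFuncEmittable says the target library actually provides it; otherwise the routine bails out, keeping whatever progress was already made. The following standalone sketch illustrates that control flow; TargetLibrary and the function names are hypothetical stand-ins, not the LLVM TargetLibraryInfo API.

// Minimal sketch of the "check the libcall is emittable before forming it"
// guard introduced above. All types here are toy stand-ins.
#include <iostream>
#include <set>
#include <string>

struct TargetLibrary {
  std::set<std::string> Available;
  bool isEmittable(const std::string &Fn) const {
    return Available.count(Fn) != 0;
  }
};

// Returns true if a call was emitted; otherwise returns the progress made
// so far (mirroring the `return Changed;` added in the hunk).
bool emitStridedStore(const TargetLibrary &TLI, bool HaveSplatValue,
                      bool Changed) {
  if (HaveSplatValue) {
    std::cout << "emit memset\n"; // always available
  } else if (TLI.isEmittable("memset_pattern16")) {
    std::cout << "emit memset_pattern16\n"; // target provides it
  } else {
    return Changed; // no legal lowering; keep only earlier progress
  }
  return true;
}

int main() {
  TargetLibrary WithPattern{{"memset_pattern16"}};
  TargetLibrary Without{};
  std::cout << emitStridedStore(WithPattern, false, false) << '\n'; // 1
  std::cout << emitStridedStore(Without, false, false) << '\n';     // 0
}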
@@ -1200,7 +1200,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - } + } else + return Changed; + NewCall->setDebugLoc(TheStore->getDebugLoc()); if (MSSAU) { @@ -1275,9 +1277,8 @@ class MemmoveVerifier { public: explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, const DataLayout &DL) - : DL(DL), LoadOff(0), StoreOff(0), - BP1(llvm::GetPointerBaseWithConstantOffset( - LoadBasePtr.stripPointerCasts(), LoadOff, DL)), + : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset( + LoadBasePtr.stripPointerCasts(), LoadOff, DL)), BP2(llvm::GetPointerBaseWithConstantOffset( StoreBasePtr.stripPointerCasts(), StoreOff, DL)), IsSameObject(BP1 == BP2) {} @@ -1307,8 +1308,8 @@ public: private: const DataLayout &DL; - int64_t LoadOff; - int64_t StoreOff; + int64_t LoadOff = 0; + int64_t StoreOff = 0; const Value *BP1; const Value *BP2; @@ -1420,26 +1421,19 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // If the store is a memcpy instruction, we must check if it will write to // the load memory locations. So remove it from the ignored stores. - if (IsMemCpy) - IgnoredInsts.erase(TheStore); MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL); + if (IsMemCpy && !Verifier.IsSameObject) + IgnoredInsts.erase(TheStore); if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, StoreSizeSCEV, *AA, IgnoredInsts)) { - if (!IsMemCpy) { - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", - TheLoad) - << ore::NV("Inst", InstRemark) << " in " - << ore::NV("Function", TheStore->getFunction()) - << " function will not be hoisted: " - << ore::NV("Reason", "The loop may access load location"); - }); - return Changed; - } - // At this point loop may access load only for memcpy in same underlying - // object. If that's not the case bail out. - if (!Verifier.IsSameObject) - return Changed; + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad) + << ore::NV("Inst", InstRemark) << " in " + << ore::NV("Function", TheStore->getFunction()) + << " function will not be hoisted: " + << ore::NV("Reason", "The loop may access load location"); + }); + return Changed; } bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore; @@ -1487,7 +1481,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( return Changed; // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. 
- assert((StoreAlign.hasValue() && LoadAlign.hasValue()) && + assert((StoreAlign && LoadAlign) && "Expect unordered load/store to have align."); if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize) return Changed; diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index b9e63a4bc06f..4249512ea0f8 100644 --- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopInstSimplify.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -25,21 +24,17 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include #include using namespace llvm; @@ -101,7 +96,7 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, if (!IsFirstIteration && !ToSimplify->count(&I)) continue; - Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I)); + Value *V = simplifyInstruction(&I, SQ.getWithInstruction(&I)); if (!V || !LI.replacementPreservesLCSSAForm(&I, V)) continue; @@ -109,6 +104,10 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, auto *UserI = cast(U.getUser()); U.set(V); + // Do not bother dealing with unreachable code. + if (!DT.isReachableFromEntry(UserI->getParent())) + continue; + // If the instruction is used by a PHI node we have already processed // we'll need to iterate on the loop body to converge, so add it to // the next set. @@ -222,7 +221,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr)) + MSSAU ? 
MSSAU.getPointer() : nullptr)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index c2b065c4eb31..1d3023d04463 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" @@ -33,7 +34,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -44,7 +44,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include @@ -120,8 +119,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, std::vector Dep; Instruction *Src = cast(*I); Instruction *Dst = cast(*J); - if (Src == Dst) - continue; // Ignore Input dependencies. if (isa(Src) && isa(Dst)) continue; @@ -270,26 +267,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, return true; } -static LoopVector populateWorklist(Loop &L) { +static void populateWorklist(Loop &L, LoopVector &LoopList) { LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); - LoopVector LoopList; + assert(LoopList.empty() && "LoopList should initially be empty!"); Loop *CurrentLoop = &L; const std::vector *Vec = &CurrentLoop->getSubLoops(); while (!Vec->empty()) { // The current loop has multiple subloops in it hence it is not tightly // nested. // Discard all loops above it added into Worklist. - if (Vec->size() != 1) - return {}; + if (Vec->size() != 1) { + LoopList = {}; + return; + } LoopList.push_back(CurrentLoop); CurrentLoop = Vec->front(); Vec = &CurrentLoop->getSubLoops(); } LoopList.push_back(CurrentLoop); - return LoopList; + return; } namespace { @@ -360,8 +359,10 @@ public: : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. - bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, + unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const DenseMap &CostMap); private: int getInstrOrderCost(); @@ -412,23 +413,26 @@ struct LoopInterchange { LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + std::unique_ptr CC = nullptr; /// Interface to emit optimization remarks. 
OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} + DominatorTree *DT, std::unique_ptr<CacheCost> &CC, + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) return false; - - return processLoopList(populateWorklist(*L)); + SmallVector LoopList; + populateWorklist(*L, LoopList); + return processLoopList(LoopList); } bool run(LoopNest &LN) { - const auto &LoopList = LN.getLoops(); + SmallVector LoopList(LN.getLoops().begin(), LN.getLoops().end()); for (unsigned I = 1; I < LoopList.size(); ++I) if (LoopList[I]->getParentLoop() != LoopList[I - 1]) return false; @@ -460,7 +464,7 @@ struct LoopInterchange { return LoopList.size() - 1; } - bool processLoopList(ArrayRef<Loop *> LoopList) { + bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -500,27 +504,55 @@ struct LoopInterchange { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best possible position. - Loop *LoopToBeInterchanged = LoopList[SelecLoopId]; - for (unsigned i = SelecLoopId; i > 0; i--) { - bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i, - i - 1, DependencyMatrix); - if (!Interchanged) - return Changed; - // Update the DependencyMatrix - interChangeDependencies(DependencyMatrix, i, i - 1); + // Obtain the loop vector returned from loop cache analysis beforehand, + // and put each <Loop, index> pair into a map for constant time query + // later. Indices in loop vector represent the optimal order of the + // corresponding loop, e.g., given a loopnest with depth N, index 0 + // indicates the loop should be placed as the outermost loop and index N + // indicates the loop should be placed as the innermost loop. + // + // For the old pass manager CacheCost would be null. + DenseMap<const Loop *, unsigned> CostMap; + if (CC != nullptr) { + const auto &LoopCosts = CC->getLoopCosts(); + for (unsigned i = 0; i < LoopCosts.size(); i++) { + CostMap[LoopCosts[i].first] = i; + } + } + // We try to achieve the globally optimal memory access for the loopnest, + // and do interchange based on a bubble-sort fashion. We start from + // the innermost loop, move it outwards to the best possible position + // and repeat this process. + for (unsigned j = SelecLoopId; j > 0; j--) { + bool ChangedPerIter = false; + for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { + bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, + DependencyMatrix, CostMap); + if (!Interchanged) + continue; + // Loops interchanged, update LoopList accordingly. + std::swap(LoopList[i - 1], LoopList[i]); + // Update the DependencyMatrix + interChangeDependencies(DependencyMatrix, i, i - 1); #ifdef DUMP_DEP_MATRICIES - LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); - printDepMatrix(DependencyMatrix); + LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); + printDepMatrix(DependencyMatrix); #endif - Changed |= Interchanged; + ChangedPerIter |= Interchanged; + Changed |= Interchanged; + } + // Early abort if there was no interchange during an entire round of + // moving loops outwards.
+ if (!ChangedPerIter) + break; } return Changed; } bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - std::vector> &DependencyMatrix) { + std::vector> &DependencyMatrix, + const DenseMap &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); @@ -530,7 +562,8 @@ struct LoopInterchange { } LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); - if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, + DependencyMatrix, CostMap)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -733,8 +766,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) { if (PHI->getNumIncomingValues() == 1) continue; RecurrenceDescriptor RD; - if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { + // Detect floating point reduction only when it can be reordered. + if (RD.getExactFPMathInst() != nullptr) + return nullptr; return PHI; + } return nullptr; } } @@ -893,28 +930,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { - // FIXME: We currently are not able to detect floating point reductions - // and have to use floating point PHIs as a proxy to prevent - // interchanging in the presence of floating point reductions. - if (PHI.getType()->isFloatingPointTy()) - return false; for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { - Instruction *IncomingI = dyn_cast(PHI.getIncomingValue(i)); - if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) - continue; - - // The incoming value is defined in the outer loop latch. Currently we - // only support that in case the outer loop latch has a single predecessor. - // This guarantees that the outer loop latch is executed if and only if - // the inner loop is executed (because tightlyNested() guarantees that the - // outer loop header only branches to the inner loop or the outer loop - // latch). - // FIXME: We could weaken this logic and allow multiple predecessors, - // if the values are produced outside the loop latch. We would need - // additional logic to update the PHI nodes in the exit block as - // well. - if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) - return false; + Instruction *IncomingI = dyn_cast(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; + + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. 
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; } } return true; @@ -1125,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, return !DepMatrix.empty(); } -bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. - // e.g - // 1) Construct dependency matrix and move the one with no loop carried dep - // inside to enable vectorization. - - // This is rough cost estimation algorithm. It counts the good and bad order - // of induction variables in the instruction and allows reordering if number - // of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap) { + // TODO: Remove the legacy cost model. + + // This is the new cost model returned from loop cache analysis. + // A smaller index means the loop should be placed as an outer loop, and vice + // versa. + if (CostMap.find(InnerLoop) != CostMap.end() && + CostMap.find(OuterLoop) != CostMap.end()) { + unsigned InnerIndex = 0, OuterIndex = 0; + InnerIndex = CostMap.find(InnerLoop)->second; + OuterIndex = CostMap.find(OuterLoop)->second; + LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex + << ", OuterIndex = " << OuterIndex << "\n"); + if (InnerIndex < OuterIndex) + return true; + } else { + // Legacy cost model: this is a rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < -LoopInterchangeCostThreshold) + return true; + } // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. @@ -1150,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Interchanging loops is too costly (cost=" - << ore::NV("Cost", Cost) << ", threshold=" - << ore::NV("Threshold", LoopInterchangeCostThreshold) - << ") and it does not improve parallelism."; + << "Interchanging loops is too costly and it does not improve " + "parallelism."; }); return false; } @@ -1424,9 +1466,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, // Incoming values are guaranteed be instructions currently. auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); + // In case of multi-level nested loops, follow LCSSA to find the incoming + // value defined from the innermost loop. + auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI)); // Skip phis with incoming values from the inner loop body, excluding the // header and latch.
- if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader) + if (IncIInnerMost->getParent() != InnerLatch && + IncIInnerMost->getParent() != InnerHeader) continue; assert(all_of(P.users(), @@ -1695,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass { auto *DI = &getAnalysis().getDI(); auto *DT = &getAnalysis().getDomTree(); auto *ORE = &getAnalysis().getORE(); - - return LoopInterchange(SE, LI, DI, DT, ORE).run(L); + std::unique_ptr CC = nullptr; + return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L); } }; } // namespace @@ -1723,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, Function &F = *LN.getParent(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + std::unique_ptr CC = + CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 21d59936616b..1877ac1dfd08 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -213,7 +212,8 @@ public: continue; // Only progagate the value if they are of the same type. - if (Store->getPointerOperandType() != Load->getPointerOperandType()) + if (Store->getPointerOperandType() != Load->getPointerOperandType() || + getLoadStoreType(Store) != getLoadStoreType(Load)) continue; Candidates.emplace_front(Load, Store); @@ -528,7 +528,7 @@ public: return false; } - if (LAI.getPSE().getUnionPredicate().getComplexity() > + if (LAI.getPSE().getPredicate().getComplexity() > LoadElimSCEVCheckThreshold) { LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; @@ -539,7 +539,7 @@ public: return false; } - if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { + if (!Checks.empty() || !LAI.getPSE().getPredicate().isAlwaysTrue()) { if (LAI.hasConvergentOp()) { LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with " "convergent calls\n"); @@ -706,8 +706,12 @@ FunctionPass *llvm::createLoopLoadEliminationPass() { PreservedAnalyses LoopLoadEliminationPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult(F); auto &LI = AM.getResult(F); + // There are no loops in the function. Return before computing other expensive + // analyses. 
+ if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult(F); auto &TTI = AM.getResult(F); auto &DT = AM.getResult(F); auto &TLI = AM.getResult(F); diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 6c783848432b..d20d275ea60c 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -8,14 +8,12 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Support/Debug.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/TimeProfiler.h" using namespace llvm; @@ -311,12 +309,12 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, #ifndef NDEBUG // LoopAnalysisResults should always be valid. - // Note that we don't LAR.SE.verify() because that can change observed SE - // queries. See PR44815. if (VerifyDomInfo) LAR.DT.verify(); if (VerifyLoopInfo) LAR.LI.verify(LAR.DT); + if (VerifySCEV) + LAR.SE.verify(); if (LAR.MSSA && VerifyMemorySSA) LAR.MSSA->verifyMemorySSA(); #endif diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index aa7e79a589f2..d0ee5b47a8ca 100644 --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -188,7 +188,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" @@ -244,7 +243,7 @@ struct LoopICmp { LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV, const SCEV *Limit) : Pred(Pred), IV(IV), Limit(Limit) {} - LoopICmp() {} + LoopICmp() = default; void dump() { dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV << ", Limit = " << *Limit << "\n"; @@ -778,7 +777,7 @@ unsigned LoopPredication::collectChecks(SmallVectorImpl &Checks, if (ICmpInst *ICI = dyn_cast(Condition)) { if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Guard)) { - Checks.push_back(NewRangeCheck.getValue()); + Checks.push_back(*NewRangeCheck); NumWidened++; continue; } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 9d22eceb987f..f4ef22562341 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -29,15 +29,11 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -59,7 +55,6 @@ #include #include #include -#include #include #include #include @@ -559,12 +554,12 @@ bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { } // Must be a 
CMP or an ext (of a value with nsw) then CMP else { - Instruction *UUser = dyn_cast(UU); + auto *UUser = cast(UU); // Skip SExt if we are extending an nsw value // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() && + if (BO->hasNoSignedWrap() && UUser->hasOneUse() && isa(UUser)) - UUser = dyn_cast(*(UUser->user_begin())); + UUser = cast(*(UUser->user_begin())); if (!isCompareUsedByBranch(UUser)) return false; } diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 5ba137b1c85f..d9c33b5f335a 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,10 +11,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopRotation.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -22,9 +22,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopRotationUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -62,8 +60,8 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, MSSAU = MemorySSAUpdater(AR.MSSA); bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false, - Threshold, false, PrepareForLTO || PrepareForLTOOption); + MSSAU ? MSSAU.getPointer() : nullptr, SQ, false, Threshold, + false, PrepareForLTO || PrepareForLTOOption); if (!Changed) return PreservedAnalyses::all(); @@ -133,9 +131,8 @@ public: : MaxHeaderSize; return LoopRotation(L, LI, TTI, AC, &DT, &SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, - false, Threshold, false, - PrepareForLTO || PrepareForLTOOption); + MSSAU ? 
MSSAU.getPointer() : nullptr, SQ, false, + Threshold, false, PrepareForLTO || PrepareForLTOOption); } }; } // end namespace diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index d3fcba10c275..b7e0e32780b4 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -16,28 +16,21 @@ #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -261,13 +254,17 @@ private: assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && "Malformed block sets?"); - // Now, all exit blocks that are not marked as live are dead. + // Now, all exit blocks that are not marked as live are dead, if all their + // predecessors are in the loop. This may not be the case, as the input loop + // may not be in loop-simplify/canonical form. SmallVector ExitBlocks; L.getExitBlocks(ExitBlocks); SmallPtrSet UniqueDeadExits; for (auto *ExitBlock : ExitBlocks) if (!LiveExitBlocks.count(ExitBlock) && - UniqueDeadExits.insert(ExitBlock).second) + UniqueDeadExits.insert(ExitBlock).second && + all_of(predecessors(ExitBlock), + [this](BasicBlock *Pred) { return L.contains(Pred); })) DeadExitBlocks.push_back(ExitBlock); // Whether or not the edge From->To will still be present in graph after the @@ -374,7 +371,7 @@ private: DeadInstructions.emplace_back(LandingPad); for (Instruction *I : DeadInstructions) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } @@ -704,8 +701,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, - DeleteCurrentLoop)) + MSSAU ? MSSAU.getPointer() : nullptr, DeleteCurrentLoop)) return PreservedAnalyses::all(); if (DeleteCurrentLoop) @@ -739,9 +735,9 @@ public: if (MSSAA && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); bool DeleteCurrentLoop = false; bool Changed = simplifyLoopCFG(*L, DT, LI, SE, MSSAU ?
MSSAU.getPointer() : nullptr, + DeleteCurrentLoop); if (DeleteCurrentLoop) LPM.markLoopAsDeleted(*L); return Changed; diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp index c9c9e60d0921..dce1af475fb1 100644 --- a/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -34,24 +34,18 @@ #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -70,14 +64,6 @@ static cl::opt MaxNumberOfUseBBsForSinking( "max-uses-for-sinking", cl::Hidden, cl::init(30), cl::desc("Do not sink instructions that have too many uses.")); -static cl::opt EnableMSSAInLoopSink( - "enable-mssa-in-loop-sink", cl::Hidden, cl::init(true), - cl::desc("Enable MemorySSA for LoopSink in new pass manager")); - -static cl::opt EnableMSSAInLegacyLoopSink( - "enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false), - cl::desc("Enable MemorySSA for LoopSink in legacy pass manager")); - /// Return adjusted total frequency of \p BBs. /// /// * If there is only one BB, sinking instruction will not introduce code @@ -279,9 +265,8 @@ static bool sinkInstruction( static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI, - ScalarEvolution *SE, - AliasSetTracker *CurAST, - MemorySSA *MSSA) { + MemorySSA &MSSA, + ScalarEvolution *SE) { BasicBlock *Preheader = L.getLoopPreheader(); assert(Preheader && "Expected loop to have preheader"); @@ -297,13 +282,8 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, })) return false; - std::unique_ptr MSSAU; - std::unique_ptr LICMFlags; - if (MSSA) { - MSSAU = std::make_unique(MSSA); - LICMFlags = - std::make_unique(/*IsSink=*/true, &L, MSSA); - } + MemorySSAUpdater MSSAU(&MSSA); + SinkAndHoistLICMFlags LICMFlags(/*IsSink=*/true, &L, &MSSA); bool Changed = false; @@ -324,14 +304,15 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // on B (A appears after B), A needs to be sinked first before B can be // sinked. for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + if (isa(&I)) + continue; // No need to check for instruction's operands are loop invariant. 
assert(L.hasLoopInvariantOperands(&I) && "Insts in a loop's preheader should have loop invariant operands!"); - if (!canSinkOrHoistInst(I, &AA, &DT, &L, CurAST, MSSAU.get(), false, - LICMFlags.get())) + if (!canSinkOrHoistInst(I, &AA, &DT, &L, MSSAU, false, LICMFlags)) continue; if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, - MSSAU.get())) + &MSSAU)) Changed = true; } @@ -340,13 +321,6 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, return Changed; } -static void computeAliasSet(Loop &L, BasicBlock &Preheader, - AliasSetTracker &CurAST) { - for (BasicBlock *BB : L.blocks()) - CurAST.add(*BB); - CurAST.add(Preheader); -} - PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { LoopInfo &LI = FAM.getResult(F); // Nothing to do if there are no loops. @@ -356,10 +330,7 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { AAResults &AA = FAM.getResult(F); DominatorTree &DT = FAM.getResult(F); BlockFrequencyInfo &BFI = FAM.getResult(F); - - MemorySSA *MSSA = EnableMSSAInLoopSink - ? &FAM.getResult(F).getMSSA() - : nullptr; + MemorySSA &MSSA = FAM.getResult(F).getMSSA(); // We want to do a postorder walk over the loops. Since loops are a tree this // is equivalent to a reversed preorder walk and preorder is easy to compute @@ -381,18 +352,11 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { if (!Preheader->getParent()->hasProfileData()) continue; - std::unique_ptr CurAST; - if (!EnableMSSAInLoopSink) { - CurAST = std::make_unique(AA); - computeAliasSet(L, *Preheader, *CurAST.get()); - } - // Note that we don't pass SCEV here because it is only used to invalidate // loops in SCEV and we don't preserve (or request) SCEV at all making that // unnecessary. - Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, - /*ScalarEvolution*/ nullptr, - CurAST.get(), MSSA); + Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, MSSA, + /*ScalarEvolution*/ nullptr); } while (!PreorderLoops.empty()); if (!Changed) @@ -400,13 +364,10 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { PreservedAnalyses PA; PA.preserveSet(); + PA.preserve(); - if (MSSA) { - PA.preserve(); - - if (VerifyMemorySSA) - MSSA->verifyMemorySSA(); - } + if (VerifyMemorySSA) + MSSA.verifyMemorySSA(); return PA; } @@ -432,24 +393,16 @@ struct LegacyLoopSinkPass : public LoopPass { return false; AAResults &AA = getAnalysis().getAAResults(); + MemorySSA &MSSA = getAnalysis().getMSSA(); auto *SE = getAnalysisIfAvailable(); - std::unique_ptr CurAST; - MemorySSA *MSSA = nullptr; - if (EnableMSSAInLegacyLoopSink) - MSSA = &getAnalysis().getMSSA(); - else { - CurAST = std::make_unique(AA); - computeAliasSet(*L, *Preheader, *CurAST.get()); - } - bool Changed = sinkLoopInvariantInstructions( *L, AA, getAnalysis().getLoopInfo(), getAnalysis().getDomTree(), getAnalysis().getBFI(), - SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA); + MSSA, SE ? 
&SE->getSE() : nullptr); - if (MSSA && VerifyMemorySSA) - MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + MSSA.verifyMemorySSA(); return Changed; } @@ -458,10 +411,8 @@ struct LegacyLoopSinkPass : public LoopPass { AU.setPreservesCFG(); AU.addRequired(); getLoopAnalysisUsage(AU); - if (EnableMSSAInLegacyLoopSink) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); } }; } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 654f0d2a03a8..9959e408e2e2 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -78,6 +78,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -91,9 +92,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -114,12 +113,12 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include #include #include #include -#include #include #include #include @@ -142,10 +141,7 @@ static const unsigned MaxIVUsers = 200; /// the salvaging is not too expensive for the compiler. static const unsigned MaxSCEVSalvageExpressionSize = 64; -// Temporary flag to cleanup congruent phis after LSR phi expansion. -// It's currently disabled until we can determine whether it's truly useful or -// not. The flag should be removed after the v3.0 release. -// This is now needed for ivchains. +// Cleanup congruent phis after LSR phi expansion. static cl::opt EnablePhiElim( "enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination")); @@ -481,6 +477,12 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { canonicalize(*L); } +static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) { + return SCEVExprContains(S, [&L](const SCEV *S) { + return isa(S) && (cast(S)->getLoop() == &L); + }); +} + /// Check whether or not this formula satisfies the canonical /// representation. /// \see Formula::BaseRegs. @@ -494,18 +496,15 @@ bool Formula::isCanonical(const Loop &L) const { if (Scale == 1 && BaseRegs.empty()) return false; - const SCEVAddRecExpr *SAR = dyn_cast(ScaledReg); - if (SAR && SAR->getLoop() == &L) + if (containsAddRecDependentOnLoop(ScaledReg, L)) return true; // If ScaledReg is not a recurrent expr, or it is but its loop is not current // loop, meanwhile BaseRegs contains a recurrent expr reg related with current // loop, we want to swap the reg in BaseRegs with ScaledReg. - auto I = find_if(BaseRegs, [&](const SCEV *S) { - return isa(S) && - (cast(S)->getLoop() == &L); + return none_of(BaseRegs, [&L](const SCEV *S) { + return containsAddRecDependentOnLoop(S, L); }); - return I == BaseRegs.end(); } /// Helper method to morph a formula into its canonical representation. 
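The Formula::isCanonical hunk above replaces two copies of an inline "is this an AddRec on loop L" check with the new containsAddRecDependentOnLoop helper, which uses SCEVExprContains to search the whole expression tree rather than only the root. Below is a self-contained sketch of that containment test; Expr and the integer loop ids are toy stand-ins for LLVM's SCEV nodes and Loop pointers, not the real API.

// Toy model of containsAddRecDependentOnLoop: recursively search an
// expression tree for a recurrence over a given loop.
#include <algorithm>
#include <iostream>
#include <vector>

struct Expr {
  bool IsAddRec = false;
  int LoopId = -1;       // loop of the recurrence, if IsAddRec
  std::vector<Expr> Ops; // sub-expressions
};

static bool containsAddRecOnLoop(const Expr &E, int L) {
  if (E.IsAddRec && E.LoopId == L)
    return true;
  return std::any_of(E.Ops.begin(), E.Ops.end(), [L](const Expr &Op) {
    return containsAddRecOnLoop(Op, L);
  });
}

int main() {
  // (x + {0,+,1}<L1>): the recurrence is nested one level down, so a
  // root-only check would miss it while the recursive search finds it.
  Expr AddRec{true, 1, {}};
  Expr Sum{false, -1, {Expr{false, -1, {}}, AddRec}};
  std::vector<Expr> BaseRegs = {Sum};
  // isCanonical-style test: no base register may recur over loop 1.
  bool Canonical = std::none_of(
      BaseRegs.begin(), BaseRegs.end(),
      [](const Expr &E) { return containsAddRecOnLoop(E, /*L=*/1); });
  std::cout << std::boolalpha << Canonical << '\n'; // false
}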
@@ -537,11 +536,9 @@ void Formula::canonicalize(const Loop &L) { // If ScaledReg is an invariant with respect to L, find the reg from // BaseRegs containing the recurrent expr related with Loop L. Swap the // reg with ScaledReg. - const SCEVAddRecExpr *SAR = dyn_cast(ScaledReg); - if (!SAR || SAR->getLoop() != &L) { - auto I = find_if(BaseRegs, [&](const SCEV *S) { - return isa(S) && - (cast(S)->getLoop() == &L); + if (!containsAddRecDependentOnLoop(ScaledReg, L)) { + auto I = find_if(BaseRegs, [&L](const SCEV *S) { + return containsAddRecDependentOnLoop(S, L); }); if (I != BaseRegs.end()) std::swap(ScaledReg, *I); @@ -1070,7 +1067,7 @@ public: C.ScaleCost = 0; } - bool isLess(Cost &Other); + bool isLess(const Cost &Other); void Lose(); @@ -1358,6 +1355,8 @@ void Cost::RateFormula(const Formula &F, const DenseSet &VisitedRegs, const LSRUse &LU, SmallPtrSetImpl *LoserRegs) { + if (isLoser()) + return; assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula"); // Tally up the registers. unsigned PrevAddRecCost = C.AddRecCost; @@ -1467,7 +1466,7 @@ void Cost::Lose() { } /// Choose the lower cost. -bool Cost::isLess(Cost &Other) { +bool Cost::isLess(const Cost &Other) { if (InsnsCost.getNumOccurrences() > 0 && InsnsCost && C.Insns != Other.C.Insns) return C.Insns < Other.C.Insns; @@ -4081,23 +4080,24 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { continue; // Divide out the factor, ignoring high bits, since we'll be // scaling the value back up in the end. - if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { - // TODO: This could be optimized to avoid all the copying. - Formula F = Base; - F.ScaledReg = Quotient; - F.deleteBaseReg(F.BaseRegs[i]); - // The canonical representation of 1*reg is reg, which is already in - // Base. In that case, do not try to insert the formula, it will be - // rejected anyway. - if (F.Scale == 1 && (F.BaseRegs.empty() || - (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) - continue; - // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate - // non canonical Formula with ScaledReg's loop not being L. - if (F.Scale == 1 && LU.AllFixupsOutsideLoop) - F.canonicalize(*L); - (void)InsertFormula(LU, LUIdx, F); - } + if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) + if (!Quotient->isZero()) { + // TODO: This could be optimized to avoid all the copying. + Formula F = Base; + F.ScaledReg = Quotient; + F.deleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula, it will be + // rejected anyway. + if (F.Scale == 1 && (F.BaseRegs.empty() || + (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) + continue; + // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate + // non canonical Formula with ScaledReg's loop not being L. + if (F.Scale == 1 && LU.AllFixupsOutsideLoop) + F.canonicalize(*L); + (void)InsertFormula(LU, LUIdx, F); + } } } } @@ -5601,6 +5601,27 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, DeadInsts.emplace_back(OperandIsInstr); } +// Check if there are any loop exit values which are only used once within the +// loop which may potentially be optimized with a call to rewriteLoopExitValue. 
+static bool LoopExitValHasSingleUse(Loop *L) { + BasicBlock *ExitBB = L->getExitBlock(); + if (!ExitBB) + return false; + + for (PHINode &ExitPhi : ExitBB->phis()) { + if (ExitPhi.getNumIncomingValues() != 1) + break; + + BasicBlock *Pred = ExitPhi.getIncomingBlock(0); + Value *IVNext = ExitPhi.getIncomingValueForBlock(Pred); + // One use would be the exit phi node, and there should be only one other + // use for this to be considered. + if (IVNext->getNumUses() == 2) + return true; + } + return false; +} + /// Rewrite all the fixup locations with new values, following the chosen /// solution. void LSRInstance::ImplementSolution( @@ -5894,40 +5915,57 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { } namespace { + +/// Enables more convenient iteration over a DWARF expression vector. +static iterator_range +ToDwarfOpIter(SmallVectorImpl &Expr) { + llvm::DIExpression::expr_op_iterator Begin = + llvm::DIExpression::expr_op_iterator(Expr.begin()); + llvm::DIExpression::expr_op_iterator End = + llvm::DIExpression::expr_op_iterator(Expr.end()); + return {Begin, End}; +} + struct SCEVDbgValueBuilder { SCEVDbgValueBuilder() = default; - SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { - Values = Base.Values; + SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); } + + void clone(const SCEVDbgValueBuilder &Base) { + LocationOps = Base.LocationOps; Expr = Base.Expr; } + void clear() { + LocationOps.clear(); + Expr.clear(); + } + /// The DIExpression as we translate the SCEV. SmallVector Expr; /// The location ops of the DIExpression. - SmallVector Values; + SmallVector LocationOps; void pushOperator(uint64_t Op) { Expr.push_back(Op); } void pushUInt(uint64_t Operand) { Expr.push_back(Operand); } /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value /// in the set of values referenced by the expression. - void pushValue(llvm::Value *V) { + void pushLocation(llvm::Value *V) { Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg); - auto *It = - std::find(Values.begin(), Values.end(), llvm::ValueAsMetadata::get(V)); + auto *It = std::find(LocationOps.begin(), LocationOps.end(), V); unsigned ArgIndex = 0; - if (It != Values.end()) { - ArgIndex = std::distance(Values.begin(), It); + if (It != LocationOps.end()) { + ArgIndex = std::distance(LocationOps.begin(), It); } else { - ArgIndex = Values.size(); - Values.push_back(llvm::ValueAsMetadata::get(V)); + ArgIndex = LocationOps.size(); + LocationOps.push_back(V); } Expr.push_back(ArgIndex); } void pushValue(const SCEVUnknown *U) { llvm::Value *V = cast(U)->getValue(); - pushValue(V); + pushLocation(V); } bool pushConst(const SCEVConstant *C) { @@ -5938,6 +5976,12 @@ struct SCEVDbgValueBuilder { return true; } + // Iterating the expression as DWARF ops is convenient when updating + // DWARF_OP_LLVM_args. + iterator_range expr_ops() { + return ToDwarfOpIter(Expr); + } + /// Several SCEV types are sequences of the same arithmetic operator applied /// to constants and values that may be extended or truncated. 
  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
@@ -5979,7 +6023,7 @@ struct SCEVDbgValueBuilder {
     } else if (const SCEVUnknown *U = dyn_cast(S)) {
       if (!U->getValue())
         return false;
-      pushValue(U->getValue());
+      pushLocation(U->getValue());
 
     } else if (const SCEVMulExpr *MulRec = dyn_cast(S)) {
       Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
@@ -6010,52 +6054,6 @@ struct SCEVDbgValueBuilder {
     return Success;
   }
 
-  void setFinalExpression(llvm::DbgValueInst &DI, const DIExpression *OldExpr) {
-    // Re-state assumption that this dbg.value is not variadic. Any remaining
-    // opcodes in its expression operate on a single value already on the
-    // expression stack. Prepend our operations, which will re-compute and
-    // place that value on the expression stack.
-    assert(!DI.hasArgList());
-    auto *NewExpr =
-        DIExpression::prependOpcodes(OldExpr, Expr, /*StackValue*/ true);
-    DI.setExpression(NewExpr);
-
-    auto ValArrayRef = llvm::ArrayRef(Values);
-    DI.setRawLocation(llvm::DIArgList::get(DI.getContext(), ValArrayRef));
-  }
-
-  /// If a DVI can be emitted without a DIArgList, omit DW_OP_llvm_arg and the
-  /// location op index 0.
-  void setShortFinalExpression(llvm::DbgValueInst &DI,
-                               const DIExpression *OldExpr) {
-    assert((Expr[0] == llvm::dwarf::DW_OP_LLVM_arg && Expr[1] == 0) &&
-           "Expected DW_OP_llvm_arg and 0.");
-    DI.replaceVariableLocationOp(
-        0u, llvm::MetadataAsValue::get(DI.getContext(), Values[0]));
-
-    // See setFinalExpression: prepend our opcodes on the start of any old
-    // expression opcodes.
-    assert(!DI.hasArgList());
-    llvm::SmallVector FinalExpr(llvm::drop_begin(Expr, 2));
-    auto *NewExpr =
-        DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true);
-    DI.setExpression(NewExpr);
-  }
-
-  /// Once the IV and variable SCEV translation is complete, write it to the
-  /// source DVI.
-  void applyExprToDbgValue(llvm::DbgValueInst &DI,
-                           const DIExpression *OldExpr) {
-    assert(!Expr.empty() && "Unexpected empty expression.");
-    // Emit a simpler form if only a single location is referenced.
-    if (Values.size() == 1 && Expr[0] == llvm::dwarf::DW_OP_LLVM_arg &&
-        Expr[1] == 0) {
-      setShortFinalExpression(DI, OldExpr);
-    } else {
-      setFinalExpression(DI, OldExpr);
-    }
-  }
-
   /// Return true if the combination of arithmetic operator and underlying
   /// SCEV constant value is an identity function.
   bool isIdentityFunction(uint64_t Op, const SCEV *S) {
@@ -6104,6 +6102,48 @@ struct SCEVDbgValueBuilder {
     return true;
   }
 
+  /// Create an expression that is an offset from a value (usually the IV).
+  void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
+    pushLocation(OffsetValue);
+    DIExpression::appendOffset(Expr, Offset);
+    LLVM_DEBUG(
+        dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
+               << std::to_string(Offset) << "\n");
+  }
+
+  /// Combine a translation of the SCEV and the IV to create an expression that
+  /// recovers a location's value.
+  /// Returns true if an expression was created.
+  bool createIterCountExpr(const SCEV *S,
+                           const SCEVDbgValueBuilder &IterationCount,
+                           ScalarEvolution &SE) {
+    // SCEVs for SSA values are most frequently of the form
+    // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
+    // This is because %a is a PHI node that is not the IV. However, these
+    // SCEVs have not been observed to result in debuginfo-lossy optimisations,
+    // so it's not expected this point will be reached.
+ if (!isa(S)) + return false; + + LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S + << '\n'); + + const auto *Rec = cast(S); + if (!Rec->isAffine()) + return false; + + if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize) + return false; + + // Initialise a new builder with the iteration count expression. In + // combination with the value's SCEV this enables recovery. + clone(IterationCount); + if (!SCEVToValueExpr(*Rec, SE)) + return false; + + return true; + } + /// Convert a SCEV of a value to a DIExpression that is pushed onto the /// builder's expression stack. The stack should already contain an /// expression for the iteration count, so that it can be multiplied by @@ -6133,74 +6173,294 @@ struct SCEVDbgValueBuilder { } return true; } + + // Append the current expression and locations to a location list and an + // expression list. Modify the DW_OP_LLVM_arg indexes to account for + // the locations already present in the destination list. + void appendToVectors(SmallVectorImpl &DestExpr, + SmallVectorImpl &DestLocations) { + assert(!DestLocations.empty() && + "Expected the locations vector to contain the IV"); + // The DWARF_OP_LLVM_arg arguments of the expression being appended must be + // modified to account for the locations already in the destination vector. + // All builders contain the IV as the first location op. + assert(!LocationOps.empty() && + "Expected the location ops to contain the IV."); + // DestIndexMap[n] contains the index in DestLocations for the nth + // location in this SCEVDbgValueBuilder. + SmallVector DestIndexMap; + for (const auto &Op : LocationOps) { + auto It = find(DestLocations, Op); + if (It != DestLocations.end()) { + // Location already exists in DestLocations, reuse existing ArgIndex. + DestIndexMap.push_back(std::distance(DestLocations.begin(), It)); + continue; + } + // Location is not in DestLocations, add it. + DestIndexMap.push_back(DestLocations.size()); + DestLocations.push_back(Op); + } + + for (const auto &Op : expr_ops()) { + if (Op.getOp() != dwarf::DW_OP_LLVM_arg) { + Op.appendToVector(DestExpr); + continue; + } + + DestExpr.push_back(dwarf::DW_OP_LLVM_arg); + // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV, + // DestIndexMap[n] contains its new index in DestLocations. + uint64_t NewIndex = DestIndexMap[Op.getArg(0)]; + DestExpr.push_back(NewIndex); + } + } }; +/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs +/// and DIExpression. struct DVIRecoveryRec { + DVIRecoveryRec(DbgValueInst *DbgValue) + : DVI(DbgValue), Expr(DbgValue->getExpression()), + HadLocationArgList(false) {} + DbgValueInst *DVI; DIExpression *Expr; - Metadata *LocationOp; - const llvm::SCEV *SCEV; + bool HadLocationArgList; + SmallVector LocationOps; + SmallVector SCEVs; + SmallVector, 2> RecoveryExprs; + + void clear() { + for (auto &RE : RecoveryExprs) + RE.reset(); + RecoveryExprs.clear(); + } + + ~DVIRecoveryRec() { clear(); } }; } // namespace -static void RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI, - const SCEVDbgValueBuilder &IterationCount, - ScalarEvolution &SE) { - // LSR may add locations to previously single location-op DVIs which - // are currently not supported. - if (CachedDVI.DVI->getNumVariableLocationOps() != 1) - return; +/// Returns the total number of DW_OP_llvm_arg operands in the expression. +/// This helps in determining if a DIArglist is necessary or can be omitted from +/// the dbg.value. 
+static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
+  auto expr_ops = ToDwarfOpIter(Expr);
+  unsigned Count = 0;
+  for (auto Op : expr_ops)
+    if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
+      Count++;
+  return Count;
+}
+
+/// Overwrites DVI with the location and Ops as the DIExpression. This will
+/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
+/// because a DIArglist is not created for the first argument of the dbg.value.
+static void updateDVIWithLocation(DbgValueInst &DVI, Value *Location,
+                                  SmallVectorImpl<uint64_t> &Ops) {
+  assert(
+      numLLVMArgOps(Ops) == 0 &&
+      "Expected expression that does not contain any DW_OP_llvm_arg operands.");
+  DVI.setRawLocation(ValueAsMetadata::get(Location));
+  DVI.setExpression(DIExpression::get(DVI.getContext(), Ops));
+}
+
+/// Overwrite DVI with locations placed into a DIArglist.
+static void updateDVIWithLocations(DbgValueInst &DVI,
+                                   SmallVectorImpl<Value *> &Locations,
+                                   SmallVectorImpl<uint64_t> &Ops) {
+  assert(numLLVMArgOps(Ops) != 0 &&
+         "Expected expression that references DIArglist locations using "
+         "DW_OP_llvm_arg operands.");
+  SmallVector MetadataLocs;
+  for (Value *V : Locations)
+    MetadataLocs.push_back(ValueAsMetadata::get(V));
+  auto ValArrayRef = llvm::ArrayRef(MetadataLocs);
+  DVI.setRawLocation(llvm::DIArgList::get(DVI.getContext(), ValArrayRef));
+  DVI.setExpression(DIExpression::get(DVI.getContext(), Ops));
+}
+
+/// Write the new expression and new location ops for the dbg.value. If
+/// possible, reduce the size of the dbg.value intrinsic by omitting DIArglist.
+/// This can be omitted if:
+/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
+/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
+static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
+                               SmallVectorImpl<Value *> &NewLocationOps,
+                               SmallVectorImpl<uint64_t> &NewExpr) {
+  unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
+  if (NumLLVMArgs == 0) {
+    // Location assumed to be on the stack.
+    updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], NewExpr);
+  } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
+    // There is only a single DW_OP_llvm_arg at the start of the expression,
+    // so it can be omitted along with DIArglist.
+    assert(NewExpr[1] == 0 &&
+           "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
+    llvm::SmallVector ShortenedOps(llvm::drop_begin(NewExpr, 2));
+    updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], ShortenedOps);
+  } else {
+    // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
+    updateDVIWithLocations(*DVIRec.DVI, NewLocationOps, NewExpr);
+  }
+
+  // If the DIExpression was previously empty then add the stack terminator.
+  // Non-empty expressions have only had elements inserted into them and so the
+  // terminator should already be present, e.g. stack_value or fragment.
+  DIExpression *SalvageExpr = DVIRec.DVI->getExpression();
+  if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
+    SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
+    DVIRec.DVI->setExpression(SalvageExpr);
+  }
+}
+
+/// Cached location ops may be erased during LSR, in which case an undef is
+/// required when restoring from the cache. The type of that location is no
+/// longer available, so just use int8. The undef will be replaced by one or
+/// more locations later when a SCEVDbgValueBuilder selects alternative
+/// locations to use for the salvage.
+static Value *getValueOrUndef(WeakVH &VH, LLVMContext &C) {
+  return (VH) ?
VH : UndefValue::get(llvm::Type::getInt8Ty(C)); +} + +/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values. +static void restorePreTransformState(DVIRecoveryRec &DVIRec) { + LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" + << "scev-salvage: post-LSR: " << *DVIRec.DVI << '\n'); + assert(DVIRec.Expr && "Expected an expression"); + DVIRec.DVI->setExpression(DVIRec.Expr); + + // Even a single location-op may be inside a DIArgList and referenced with + // DW_OP_LLVM_arg, which is valid only with a DIArgList. + if (!DVIRec.HadLocationArgList) { + assert(DVIRec.LocationOps.size() == 1 && + "Unexpected number of location ops."); + // LSR's unsuccessful salvage attempt may have added DIArgList, which in + // this case was not present before, so force the location back to a single + // uncontained Value. + Value *CachedValue = + getValueOrUndef(DVIRec.LocationOps[0], DVIRec.DVI->getContext()); + DVIRec.DVI->setRawLocation(ValueAsMetadata::get(CachedValue)); + } else { + SmallVector MetadataLocs; + for (WeakVH VH : DVIRec.LocationOps) { + Value *CachedValue = getValueOrUndef(VH, DVIRec.DVI->getContext()); + MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); + } + auto ValArrayRef = llvm::ArrayRef(MetadataLocs); + DVIRec.DVI->setRawLocation( + llvm::DIArgList::get(DVIRec.DVI->getContext(), ValArrayRef)); + } + LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DVIRec.DVI << '\n'); +} - // SCEVs for SSA values are most frquently of the form - // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..). - // This is because %a is a PHI node that is not the IV. However, these - // SCEVs have not been observed to result in debuginfo-lossy optimisations, - // so its not expected this point will be reached. - if (!isa(CachedDVI.SCEV)) - return; +static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, + llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, + const SCEV *SCEVInductionVar, + SCEVDbgValueBuilder IterCountExpr) { + if (!DVIRec.DVI->isUndef()) + return false; - LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: " - << *CachedDVI.SCEV << '\n'); + // LSR may have caused several changes to the dbg.value in the failed salvage + // attempt. So restore the DIExpression, the location ops and also the + // location ops format, which is always DIArglist for multiple ops, but only + // sometimes for a single op. + restorePreTransformState(DVIRec); + + // LocationOpIndexMap[i] will store the post-LSR location index of + // the non-optimised out location at pre-LSR index i. + SmallVector LocationOpIndexMap; + LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1); + SmallVector NewLocationOps; + NewLocationOps.push_back(LSRInductionVar); + + for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) { + WeakVH VH = DVIRec.LocationOps[i]; + // Place the locations not optimised out in the list first, avoiding + // inserts later. The map is used to update the DIExpression's + // DW_OP_LLVM_arg arguments as the expression is updated. + if (VH && !isa(VH)) { + NewLocationOps.push_back(VH); + LocationOpIndexMap[i] = NewLocationOps.size() - 1; + LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i + << " now at index " << LocationOpIndexMap[i] << "\n"); + continue; + } - const auto *Rec = cast(CachedDVI.SCEV); - if (!Rec->isAffine()) - return; + // It's possible that a value referred to in the SCEV may have been + // optimised out by LSR. 
+    if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
+        SE.containsUndefs(DVIRec.SCEVs[i])) {
+      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
+                        << " refers to a location that is now undef or erased. "
+                           "Salvage abandoned.\n");
+      return false;
+    }
 
-  LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: "
-                    << *CachedDVI.SCEV << '\n');
+    LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
+                      << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
+
+    DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
+    SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
+
+    // Create an offset-based salvage expression if possible, as it requires
+    // fewer DWARF ops than an iteration count-based expression.
+    if (Optional<APInt> Offset =
+            SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
+      if (Offset.getValue().getMinSignedBits() <= 64)
+        SalvageExpr->createOffsetExpr(Offset.getValue().getSExtValue(),
+                                      LSRInductionVar);
+    } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
+                                                 SE))
+      return false;
+  }
 
-  // Initialise a new builder with the iteration count expression. In
-  // combination with the value's SCEV this enables recovery.
-  SCEVDbgValueBuilder RecoverValue(IterationCount);
-  if (!RecoverValue.SCEVToValueExpr(*Rec, SE))
-    return;
+  // Merge the DbgValueBuilder generated expressions and the original
+  // DIExpression, place the result into a new vector.
+  SmallVector NewExpr;
+  if (DVIRec.Expr->getNumElements() == 0) {
+    assert(DVIRec.RecoveryExprs.size() == 1 &&
+           "Expected only a single recovery expression for an empty "
+           "DIExpression.");
+    assert(DVIRec.RecoveryExprs[0] &&
+           "Expected a SCEVDbgValueBuilder for location 0");
+    SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
+    B->appendToVectors(NewExpr, NewLocationOps);
+  }
+  for (const auto &Op : DVIRec.Expr->expr_ops()) {
+    // Most Ops needn't be updated.
+    if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
+      Op.appendToVector(NewExpr);
+      continue;
+    }
 
-  LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n');
-  RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr);
-  LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n');
-}
+    uint64_t LocationArgIndex = Op.getArg(0);
+    SCEVDbgValueBuilder *DbgBuilder =
+        DVIRec.RecoveryExprs[LocationArgIndex].get();
+    // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
+    // optimise it away. So just translate the argument to the updated
+    // location index.
+    if (!DbgBuilder) {
+      NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
+      assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
+             "Expected a positive index for the location-op position.");
+      NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
+      continue;
+    }
+    // The location has a recovery expression.
+ DbgBuilder->appendToVectors(NewExpr, NewLocationOps); + } -static void RewriteDVIUsingOffset(DVIRecoveryRec &DVIRec, llvm::PHINode &IV, - int64_t Offset) { - assert(!DVIRec.DVI->hasArgList() && "Expected single location-op dbg.value."); - DbgValueInst *DVI = DVIRec.DVI; - SmallVector Ops; - DIExpression::appendOffset(Ops, Offset); - DIExpression *Expr = DIExpression::prependOpcodes(DVIRec.Expr, Ops, true); - LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *DVIRec.DVI << '\n'); - DVI->setExpression(Expr); - llvm::Value *ValIV = dyn_cast(&IV); - DVI->replaceVariableLocationOp( - 0u, llvm::MetadataAsValue::get(DVI->getContext(), - llvm::ValueAsMetadata::get(ValIV))); - LLVM_DEBUG(dbgs() << "scev-salvage: updated with offset to IV: " - << *DVIRec.DVI << '\n'); + UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr); + LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DVI << "\n"); + return true; } +/// Obtain an expression for the iteration count, then attempt to salvage the +/// dbg.value intrinsics. static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, - SmallVector &DVIToUpdate) { + SmallVector, 2> &DVIToUpdate) { if (DVIToUpdate.empty()) return; @@ -6213,49 +6473,22 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, if (!IVAddRec->isAffine()) return; + // Prevent translation using excessive resources. if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize) return; // The iteration count is required to recover location values. SCEVDbgValueBuilder IterCountExpr; - IterCountExpr.pushValue(LSRInductionVar); + IterCountExpr.pushLocation(LSRInductionVar); if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE)) return; LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar << '\n'); - // Needn't salvage if the location op hasn't been undef'd by LSR. for (auto &DVIRec : DVIToUpdate) { - if (!DVIRec.DVI->isUndef()) - continue; - - // Some DVIs that were single location-op when cached are now multi-op, - // due to LSR optimisations. However, multi-op salvaging is not yet - // supported by SCEV salvaging. But, we can attempt a salvage by restoring - // the pre-LSR single-op expression. - if (DVIRec.DVI->hasArgList()) { - if (!DVIRec.DVI->getVariableLocationOp(0)) - continue; - llvm::Type *Ty = DVIRec.DVI->getVariableLocationOp(0)->getType(); - DVIRec.DVI->setRawLocation( - llvm::ValueAsMetadata::get(UndefValue::get(Ty))); - DVIRec.DVI->setExpression(DVIRec.Expr); - } - - LLVM_DEBUG(dbgs() << "scev-salvage: value to recover SCEV: " - << *DVIRec.SCEV << '\n'); - - // Create a simple expression if the IV and value to salvage SCEVs - // start values differ by only a constant value. - if (Optional Offset = - SE.computeConstantDifference(DVIRec.SCEV, SCEVInductionVar)) { - if (Offset.getValue().getMinSignedBits() <= 64) - RewriteDVIUsingOffset(DVIRec, *LSRInductionVar, - Offset.getValue().getSExtValue()); - } else { - RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE); - } + SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar, + IterCountExpr); } } } @@ -6263,39 +6496,53 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, /// Identify and cache salvageable DVI locations and expressions along with the /// corresponding SCEV(s). Also ensure that the DVI is not deleted between /// cacheing and salvaging. 
-static void
-DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
-                       SmallVector &SalvageableDVISCEVs,
-                       SmallSet, 2> &DVIHandles) {
+static void DbgGatherSalvagableDVI(
+    Loop *L, ScalarEvolution &SE,
+    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
+    SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
   for (auto &B : L->getBlocks()) {
     for (auto &I : *B) {
       auto DVI = dyn_cast(&I);
       if (!DVI)
         continue;
-
+      // Ensure that if any location op is undef, the dbg.value is not
+      // cached.
       if (DVI->isUndef())
         continue;
 
-      if (DVI->hasArgList())
-        continue;
+      // Check that the location op SCEVs are suitable for translation to
+      // DIExpression.
+      const auto &HasTranslatableLocationOps =
+          [&](const DbgValueInst *DVI) -> bool {
+        for (const auto LocOp : DVI->location_ops()) {
+          if (!LocOp)
+            return false;
 
-      if (!DVI->getVariableLocationOp(0) ||
-          !SE.isSCEVable(DVI->getVariableLocationOp(0)->getType()))
-        continue;
+          if (!SE.isSCEVable(LocOp->getType()))
+            return false;
 
-      // SCEVUnknown wraps an llvm::Value, it does not have a start and stride.
-      // Therefore no translation to DIExpression is performed.
-      const SCEV *S = SE.getSCEV(DVI->getVariableLocationOp(0));
-      if (isa(S))
-        continue;
+          const SCEV *S = SE.getSCEV(LocOp);
+          if (SE.containsUndefs(S))
+            return false;
+        }
+        return true;
+      };
 
-      // Avoid wasting resources generating an expression containing undef.
-      if (SE.containsUndefs(S))
+      if (!HasTranslatableLocationOps(DVI))
         continue;
 
-      SalvageableDVISCEVs.push_back(
-          {DVI, DVI->getExpression(), DVI->getRawLocation(),
-           SE.getSCEV(DVI->getVariableLocationOp(0))});
+      std::unique_ptr<DVIRecoveryRec> NewRec =
+          std::make_unique<DVIRecoveryRec>(DVI);
+      // Each location Op may need a SCEVDbgValueBuilder in order to recover it.
+      // Pre-allocating a vector will enable quick lookups of the builder later
+      // during the salvage.
+      NewRec->RecoveryExprs.resize(DVI->getNumVariableLocationOps());
+      for (const auto LocOp : DVI->location_ops()) {
+        NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
+        NewRec->LocationOps.push_back(LocOp);
+        NewRec->HadLocationArgList = DVI->hasArgList();
+      }
+      SalvageableDVISCEVs.push_back(std::move(NewRec));
       DVIHandles.insert(DVI);
     }
   }
@@ -6344,9 +6591,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
 
   // Debug preservation - before we start removing anything identify which DVI
   // meet the salvageable criteria and store their DIExpression and SCEVs.
-  SmallVector SalvageableDVI;
+  SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
   SmallSet, 2> DVIHandles;
-  DbgGatherSalvagableDVI(L, SE, SalvageableDVI, DVIHandles);
+  DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
 
   bool Changed = false;
   std::unique_ptr MSSAU;
@@ -6375,8 +6622,26 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
       DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
     }
   }
+  // LSR may at times remove all uses of an induction variable from a loop.
+  // The only remaining use is the PHI in the exit block.
+  // When this is the case, if the exit value of the IV can be calculated using
+  // SCEV, we can replace the exit block PHI with the final value of the IV and
+  // skip the updates in each loop iteration.
+ if (L->isRecursivelyLCSSAForm(DT, LI) && LoopExitValHasSingleUse(L)) { + SmallVector DeadInsts; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Rewriter(SE, DL, "lsr", false); + int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT, + OnlyCheapRepl, DeadInsts); + if (Rewrites) { + Changed = true; + RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, + MSSAU.get()); + DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); + } + } - if (SalvageableDVI.empty()) + if (SalvageableDVIRecords.empty()) return Changed; // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with @@ -6384,13 +6649,16 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // TODO: Allow for multiple IV references for nested AddRecSCEVs for (auto &L : LI) { if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer)) - DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVI); + DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords); else { LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV " "could not be identified.\n"); } } + for (auto &Rec : SalvageableDVIRecords) + Rec->clear(); + SalvageableDVIRecords.clear(); DVIHandles.clear(); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 1ecbb86724e1..8c2868563227 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -42,10 +43,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/LCSSA.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopPeel.h" -#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include @@ -331,14 +330,23 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); Loop *SubLoop = L->getSubLoops()[0]; - unsigned InnerLoopSize = + InstructionCost InnerLoopSizeIC = ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - unsigned OuterLoopSize = + InstructionCost OuterLoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n"); - LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n"); + LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSizeIC << "\n"); + LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSizeIC << "\n"); + + if (!InnerLoopSizeIC.isValid() || !OuterLoopSizeIC.isValid()) { + LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" + << " with invalid cost.\n"); + return LoopUnrollResult::Unmodified; + } + unsigned InnerLoopSize = *InnerLoopSizeIC.getValue(); + unsigned OuterLoopSize = *OuterLoopSizeIC.getValue(); + if (NotDuplicatable) { LLVM_DEBUG(dbgs() << " Not unrolling loop which 
contains non-duplicatable " "instructions.\n"); @@ -364,7 +372,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional NewInnerEpilogueLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderInner}); - if (NewInnerEpilogueLoopID.hasValue()) + if (NewInnerEpilogueLoopID) SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); // Find trip count and trip multiple @@ -394,14 +402,14 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional NewOuterEpilogueLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderOuter}); - if (NewOuterEpilogueLoopID.hasValue()) + if (NewOuterEpilogueLoopID) EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue()); } Optional NewInnerLoopID = makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupInner}); - if (NewInnerLoopID.hasValue()) + if (NewInnerLoopID) SubLoop->setLoopID(NewInnerLoopID.getValue()); else SubLoop->setLoopID(OrigSubLoopID); @@ -410,7 +418,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional NewOuterLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter}); - if (NewOuterLoopID.hasValue()) { + if (NewOuterLoopID) { L->setLoopID(NewOuterLoopID.getValue()); // Do not setLoopAlreadyUnrolled if a followup was given. diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 9beb2281cf0f..fda86afe5f9d 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -133,7 +132,7 @@ static cl::opt UnrollAllowRemainder( "when unrolling a loop.")); static cl::opt - UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, + UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); static cl::opt UnrollMaxUpperBound( @@ -254,19 +253,19 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; // Apply user values provided by argument - if (UserThreshold.hasValue()) { + if (UserThreshold) { UP.Threshold = *UserThreshold; UP.PartialThreshold = *UserThreshold; } - if (UserCount.hasValue()) + if (UserCount) UP.Count = *UserCount; - if (UserAllowPartial.hasValue()) + if (UserAllowPartial) UP.Partial = *UserAllowPartial; - if (UserRuntime.hasValue()) + if (UserRuntime) UP.Runtime = *UserRuntime; - if (UserUpperBound.hasValue()) + if (UserUpperBound) UP.UpperBound = *UserUpperBound; - if (UserFullUnrollMaxCount.hasValue()) + if (UserFullUnrollMaxCount) UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; @@ -664,7 +663,7 @@ static Optional analyzeLoopUnrollCost( } /// ApproximateLoopSize - Approximate the size of the loop. 
-unsigned llvm::ApproximateLoopSize(
+InstructionCost llvm::ApproximateLoopSize(
     const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
     const TargetTransformInfo &TTI,
     const SmallPtrSetImpl &EphValues, unsigned BEInsns) {
@@ -675,7 +674,7 @@ unsigned llvm::ApproximateLoopSize(
   NotDuplicatable = Metrics.notDuplicatable;
   Convergent = Metrics.convergent;
 
-  unsigned LoopSize = Metrics.NumInsts;
+  InstructionCost LoopSize = Metrics.NumInsts;
 
   // Don't allow an estimate of size zero. This would allow unrolling of loops
   // with huge iteration counts, which is a compile time problem even if it's
@@ -683,7 +682,9 @@ unsigned llvm::ApproximateLoopSize(
   // that each loop has at least three instructions (likely a conditional
   // branch, a comparison feeding that branch, and some kind of loop increment
   // feeding that comparison instruction).
-  LoopSize = std::max(LoopSize, BEInsns + 1);
+  if (LoopSize.isValid() && *LoopSize.getValue() < BEInsns + 1)
+    // This is an open-coded max() on InstructionCost
+    LoopSize = BEInsns + 1;
 
   return LoopSize;
 }
@@ -788,15 +789,13 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
 
   // 2nd priority is unroll count set by pragma.
   if (PInfo.PragmaCount > 0) {
-    if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)) &&
-        UCE.getUnrolledLoopSize(UP, PInfo.PragmaCount) < PragmaUnrollThreshold)
+    if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
       return PInfo.PragmaCount;
   }
 
-  if (PInfo.PragmaFullUnroll && TripCount != 0) {
-    if (UCE.getUnrolledLoopSize(UP, TripCount) < PragmaUnrollThreshold)
-      return TripCount;
-  }
+  if (PInfo.PragmaFullUnroll && TripCount != 0)
+    return TripCount;
+
+  // If we didn't return until here, continue with other priorities.
   return None;
 }
@@ -912,7 +911,7 @@ bool llvm::computeUnrollCount(
   if (PP.PeelCount) {
     if (UnrollCount.getNumOccurrences() > 0) {
       report_fatal_error("Cannot specify both explicit peel count and "
-                         "explicit unroll count");
+                         "explicit unroll count", /*GenCrashDiag=*/false);
     }
     UP.Count = 1;
     UP.Runtime = false;
@@ -1192,10 +1191,18 @@ static LoopUnrollResult tryToUnrollLoop(
   SmallPtrSet EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
 
-  unsigned LoopSize =
+  InstructionCost LoopSizeIC =
       ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                           TTI, EphValues, UP.BEInsns);
-  LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
+  LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSizeIC << "\n");
+
+  if (!LoopSizeIC.isValid()) {
+    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains instructions"
+                      << " with invalid cost.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+  unsigned LoopSize = *LoopSizeIC.getValue();
+
   if (NotDuplicatable) {
     LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
                       << " instructions.\n");
@@ -1316,7 +1323,7 @@ static LoopUnrollResult tryToUnrollLoop(
     Optional RemainderLoopID =
         makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                         LLVMLoopUnrollFollowupRemainder});
-    if (RemainderLoopID.hasValue())
+    if (RemainderLoopID)
      RemainderLoop->setLoopID(RemainderLoopID.getValue());
   }
 
@@ -1324,7 +1331,7 @@ static LoopUnrollResult tryToUnrollLoop(
   Optional NewLoopID =
       makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                       LLVMLoopUnrollFollowupUnrolled});
-  if (NewLoopID.hasValue()) {
+  if (NewLoopID) {
    L->setLoopID(NewLoopID.getValue());
 
     // Do not setLoopAlreadyUnrolled if loop attributes have been specified
@@ -1548,8 +1555,12 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager
&AM, PreservedAnalyses LoopUnrollPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult(F); auto &LI = AM.getResult(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult(F); auto &TTI = AM.getResult(F); auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp deleted file mode 100644 index 76bb5497c2c2..000000000000 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ /dev/null @@ -1,1774 +0,0 @@ -//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass transforms loops that contain branches on loop-invariant conditions -// to multiple loops. For example, it turns the left into the right code: -// -// for (...) if (lic) -// A for (...) -// if (lic) A; B; C -// B else -// C for (...) -// A; C -// -// This can increase the size of the code exponentially (doubling it every time -// a loop is unswitched) so we only unswitch if the resultant code will be -// smaller than a threshold. -// -// This pass expects LICM to be run before it to hoist invariant conditions out -// of the loop, to make the unswitching opportunity obvious. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyBlockFrequencyInfo.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/MemorySSAUpdater.h" -#include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/ValueMapper.h" 
-#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "loop-unswitch" - -STATISTIC(NumBranches, "Number of branches unswitched"); -STATISTIC(NumSwitches, "Number of switches unswitched"); -STATISTIC(NumGuards, "Number of guards unswitched"); -STATISTIC(NumSelects , "Number of selects unswitched"); -STATISTIC(NumTrivial , "Number of unswitches that are trivial"); -STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); -STATISTIC(TotalInsts, "Total number of instructions analyzed"); - -// The specific value of 100 here was chosen based only on intuition and a -// few specific examples. -static cl::opt -Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), - cl::init(100), cl::Hidden); - -static cl::opt - MSSAThreshold("loop-unswitch-memoryssa-threshold", - cl::desc("Max number of memory uses to explore during " - "partial unswitching analysis"), - cl::init(100), cl::Hidden); - -namespace { - - class LUAnalysisCache { - using UnswitchedValsMap = - DenseMap>; - using UnswitchedValsIt = UnswitchedValsMap::iterator; - - struct LoopProperties { - unsigned CanBeUnswitchedCount; - unsigned WasUnswitchedCount; - unsigned SizeEstimation; - UnswitchedValsMap UnswitchedVals; - }; - - // Here we use std::map instead of DenseMap, since we need to keep valid - // LoopProperties pointer for current loop for better performance. - using LoopPropsMap = std::map; - using LoopPropsMapIt = LoopPropsMap::iterator; - - LoopPropsMap LoopsProperties; - UnswitchedValsMap *CurLoopInstructions = nullptr; - LoopProperties *CurrentLoopProperties = nullptr; - - // A loop unswitching with an estimated cost above this threshold - // is not performed. MaxSize is turned into unswitching quota for - // the current loop, and reduced correspondingly, though note that - // the quota is returned by releaseMemory() when the loop has been - // processed, so that MaxSize will return to its previous - // value. So in most cases MaxSize will equal the Threshold flag - // when a new loop is processed. An exception to that is that - // MaxSize will have a smaller value while processing nested loops - // that were introduced due to loop unswitching of an outer loop. - // - // FIXME: The way that MaxSize works is subtle and depends on the - // pass manager processing loops and calling releaseMemory() in a - // specific order. It would be good to find a more straightforward - // way of doing what MaxSize does. - unsigned MaxSize; - - public: - LUAnalysisCache() : MaxSize(Threshold) {} - - // Analyze loop. Check its size, calculate is it possible to unswitch - // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionCache *AC); - - // Clean all data related to given loop. - void forgetLoop(const Loop *L); - - // Mark case value as unswitched. - // Since SI instruction can be partly unswitched, in order to avoid - // extra unswitching in cloned loops keep track all unswitched values. - void setUnswitched(const SwitchInst *SI, const Value *V); - - // Check was this case value unswitched before or not. - bool isUnswitched(const SwitchInst *SI, const Value *V); - - // Returns true if another unswitching could be done within the cost - // threshold. - bool costAllowsUnswitching(); - - // Clone all loop-unswitch related loop properties. - // Redistribute unswitching quotas. - // Note, that new loop data is stored inside the VMap. 
- void cloneData(const Loop *NewLoop, const Loop *OldLoop, - const ValueToValueMapTy &VMap); - }; - - class LoopUnswitch : public LoopPass { - LoopInfo *LI; // Loop information - LPPassManager *LPM; - AssumptionCache *AC; - - // Used to check if second loop needs processing after - // rewriteLoopBodyWithConditionConstant rewrites first loop. - std::vector LoopProcessWorklist; - - LUAnalysisCache BranchesInfo; - - bool OptimizeForSize; - bool RedoLoop = false; - - Loop *CurrentLoop = nullptr; - DominatorTree *DT = nullptr; - MemorySSA *MSSA = nullptr; - AAResults *AA = nullptr; - std::unique_ptr MSSAU; - BasicBlock *LoopHeader = nullptr; - BasicBlock *LoopPreheader = nullptr; - - bool SanitizeMemory; - SimpleLoopSafetyInfo SafetyInfo; - - // LoopBlocks contains all of the basic blocks of the loop, including the - // preheader of the loop, the body of the loop, and the exit blocks of the - // loop, in that order. - std::vector LoopBlocks; - // NewBlocks contained cloned copy of basic blocks from LoopBlocks. - std::vector NewBlocks; - - bool HasBranchDivergence; - - public: - static char ID; // Pass ID, replacement for typeid - - explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false) - : LoopPass(ID), OptimizeForSize(Os), - HasBranchDivergence(HasBranchDivergence) { - initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool processCurrentLoop(); - bool isUnreachableDueToPreviousUnswitching(BasicBlock *); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - // Lazy BFI and BPI are marked as preserved here so Loop Unswitching - // can remain part of the same loop pass as LICM - AU.addPreserved(); - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - if (HasBranchDivergence) - AU.addRequired(); - getLoopAnalysisUsage(AU); - } - - private: - void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); } - - void initLoopData() { - LoopHeader = CurrentLoop->getHeader(); - LoopPreheader = CurrentLoop->getLoopPreheader(); - } - - /// Split all of the edges from inside the loop to their exit blocks. - /// Update the appropriate Phi nodes as we do so. - void splitExitEdges(Loop *L, - const SmallVectorImpl &ExitBlocks); - - bool tryTrivialLoopUnswitch(bool &Changed); - - bool unswitchIfProfitable(Value *LoopCond, Constant *Val, - Instruction *TI = nullptr, - ArrayRef ToDuplicate = {}); - void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, - BasicBlock *ExitBlock, Instruction *TI); - void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L, - Instruction *TI, - ArrayRef ToDuplicate = {}); - - void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, - Constant *Val, bool IsEqual); - - void - emitPreheaderBranchOnCondition(Value *LIC, Constant *Val, - BasicBlock *TrueDest, BasicBlock *FalseDest, - BranchInst *OldBranch, Instruction *TI, - ArrayRef ToDuplicate = {}); - - void simplifyCode(std::vector &Worklist, Loop *L); - - /// Given that the Invariant is not equal to Val. Simplify instructions - /// in the loop. - Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant, - Constant *Val); - }; - -} // end anonymous namespace - -// Analyze loop. Check its size, calculate is it possible to unswitch -// it. Returns true if we can unswitch this loop. 
-bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionCache *AC) { - LoopPropsMapIt PropsIt; - bool Inserted; - std::tie(PropsIt, Inserted) = - LoopsProperties.insert(std::make_pair(L, LoopProperties())); - - LoopProperties &Props = PropsIt->second; - - if (Inserted) { - // New loop. - - // Limit the number of instructions to avoid causing significant code - // expansion, and the number of basic blocks, to avoid loops with - // large numbers of branches which cause loop unswitching to go crazy. - // This is a very ad-hoc heuristic. - - SmallPtrSet EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - // FIXME: This is overly conservative because it does not take into - // consideration code simplification opportunities and code that can - // be shared by the resultant unswitched loops. - CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); - - Props.SizeEstimation = Metrics.NumInsts; - Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); - Props.WasUnswitchedCount = 0; - MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; - - if (Metrics.notDuplicatable) { - LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName() - << ", contents cannot be " - << "duplicated!\n"); - return false; - } - } - - // Be careful. This links are good only before new loop addition. - CurrentLoopProperties = &Props; - CurLoopInstructions = &Props.UnswitchedVals; - - return true; -} - -// Clean all data related to given loop. -void LUAnalysisCache::forgetLoop(const Loop *L) { - LoopPropsMapIt LIt = LoopsProperties.find(L); - - if (LIt != LoopsProperties.end()) { - LoopProperties &Props = LIt->second; - MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) * - Props.SizeEstimation; - LoopsProperties.erase(LIt); - } - - CurrentLoopProperties = nullptr; - CurLoopInstructions = nullptr; -} - -// Mark case value as unswitched. -// Since SI instruction can be partly unswitched, in order to avoid -// extra unswitching in cloned loops keep track all unswitched values. -void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) { - (*CurLoopInstructions)[SI].insert(V); -} - -// Check was this case value unswitched before or not. -bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) { - return (*CurLoopInstructions)[SI].count(V); -} - -bool LUAnalysisCache::costAllowsUnswitching() { - return CurrentLoopProperties->CanBeUnswitchedCount > 0; -} - -// Clone all loop-unswitch related loop properties. -// Redistribute unswitching quotas. -// Note, that new loop data is stored inside the VMap. -void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, - const ValueToValueMapTy &VMap) { - LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; - LoopProperties &OldLoopProps = *CurrentLoopProperties; - UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; - - // Reallocate "can-be-unswitched quota" - - --OldLoopProps.CanBeUnswitchedCount; - ++OldLoopProps.WasUnswitchedCount; - NewLoopProps.WasUnswitchedCount = 0; - unsigned Quota = OldLoopProps.CanBeUnswitchedCount; - NewLoopProps.CanBeUnswitchedCount = Quota / 2; - OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2; - - NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation; - - // Clone unswitched values info: - // for new loop switches we clone info about values that was - // already unswitched and has redundant successors. 
- for (const auto &I : Insts) { - const SwitchInst *OldInst = I.first; - Value *NewI = VMap.lookup(OldInst); - const SwitchInst *NewInst = cast_or_null(NewI); - assert(NewInst && "All instructions that are in SrcBB must be in VMap."); - - NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; - } -} - -char LoopUnswitch::ID = 0; - -INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) -INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", - false, false) - -Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) { - return new LoopUnswitch(Os, HasBranchDivergence); -} - -/// Operator chain lattice. -enum OperatorChain { - OC_OpChainNone, ///< There is no operator. - OC_OpChainOr, ///< There are only ORs. - OC_OpChainAnd, ///< There are only ANDs. - OC_OpChainMixed ///< There are ANDs and ORs. -}; - -/// Cond is a condition that occurs in L. If it is invariant in the loop, or has -/// an invariant piece, return the invariant. Otherwise, return null. -// -/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a -/// mixed operator chain, as we can not reliably find a value which will -/// simplify the operator chain. If the chain is AND-only or OR-only, we can use -/// 0 or ~0 to simplify the chain. -/// -/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to -/// simplify the condition itself to a loop variant condition, but at the -/// cost of creating an entirely new loop. -static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, - OperatorChain &ParentChain, - DenseMap &Cache, - MemorySSAUpdater *MSSAU) { - auto CacheIt = Cache.find(Cond); - if (CacheIt != Cache.end()) - return CacheIt->second; - - // We started analyze new instruction, increment scanned instructions counter. - ++TotalInsts; - - // We can never unswitch on vector conditions. - if (Cond->getType()->isVectorTy()) - return nullptr; - - // Constants should be folded, not unswitched on! - if (isa(Cond)) return nullptr; - - // TODO: Handle: br (VARIANT|INVARIANT). - - // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) { - Cache[Cond] = Cond; - return Cond; - } - - // Walk up the operator chain to find partial invariant conditions. - if (BinaryOperator *BO = dyn_cast(Cond)) - if (BO->getOpcode() == Instruction::And || - BO->getOpcode() == Instruction::Or) { - // Given the previous operator, compute the current operator chain status. - OperatorChain NewChain; - switch (ParentChain) { - case OC_OpChainNone: - NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd : - OC_OpChainOr; - break; - case OC_OpChainOr: - NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr : - OC_OpChainMixed; - break; - case OC_OpChainAnd: - NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd : - OC_OpChainMixed; - break; - case OC_OpChainMixed: - NewChain = OC_OpChainMixed; - break; - } - - // If we reach a Mixed state, we do not want to keep walking up as we can not - // reliably find a value that will simplify the chain. With this check, we - // will return null on the first sight of mixed chain and the caller will - // either backtrack to find partial LIV in other operand or return null. 
-      if (NewChain != OC_OpChainMixed) {
-        // Update the current operator chain type before we search up the
-        // chain.
-        ParentChain = NewChain;
-        // If either the left or right side is invariant, we can unswitch on
-        // this, which will cause the branch to go away in one loop and the
-        // condition to simplify in the other one.
-        if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
-                                              ParentChain, Cache, MSSAU)) {
-          Cache[Cond] = LHS;
-          return LHS;
-        }
-        // We did not manage to find a partial LIV in operand(0). Backtrack
-        // and try operand(1).
-        ParentChain = NewChain;
-        if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
-                                              ParentChain, Cache, MSSAU)) {
-          Cache[Cond] = RHS;
-          return RHS;
-        }
-      }
-    }
-
-  Cache[Cond] = nullptr;
-  return nullptr;
-}
-
-/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
-/// an invariant piece, return the invariant along with the operator chain type.
-/// Otherwise, return null.
-static std::pair<Value *, OperatorChain>
-findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
-                     MemorySSAUpdater *MSSAU) {
-  DenseMap<Value *, Value *> Cache;
-  OperatorChain OpChain = OC_OpChainNone;
-  Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
-
-  // In case we do find a LIV, it can not be obtained by walking up a mixed
-  // operator chain.
-  assert((!FCond || OpChain != OC_OpChainMixed) &&
-         "Do not expect a partial LIV with mixed operator chain");
-  return {FCond, OpChain};
-}
-
-bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
-  if (skipLoop(L))
-    return false;
-
-  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
-      *L->getHeader()->getParent());
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  LPM = &LPMRef;
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-  MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-  CurrentLoop = L;
-  Function *F = CurrentLoop->getHeader()->getParent();
-
-  SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
-  if (SanitizeMemory)
-    SafetyInfo.computeLoopSafetyInfo(L);
-
-  if (VerifyMemorySSA)
-    MSSA->verifyMemorySSA();
-
-  bool Changed = false;
-  do {
-    assert(CurrentLoop->isLCSSAForm(*DT));
-    if (VerifyMemorySSA)
-      MSSA->verifyMemorySSA();
-    RedoLoop = false;
-    Changed |= processCurrentLoop();
-  } while (RedoLoop);
-
-  if (VerifyMemorySSA)
-    MSSA->verifyMemorySSA();
-
-  return Changed;
-}
-
-// Return true if the BasicBlock BB is unreachable from the loop header.
-// Return false otherwise.
-bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
-  auto *Node = DT->getNode(BB)->getIDom();
-  BasicBlock *DomBB = Node->getBlock();
-  while (CurrentLoop->contains(DomBB)) {
-    BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
-
-    Node = DT->getNode(DomBB)->getIDom();
-    DomBB = Node->getBlock();
-
-    if (!BInst || !BInst->isConditional())
-      continue;
-
-    Value *Cond = BInst->getCondition();
-    if (!isa<ConstantInt>(Cond))
-      continue;
-
-    BasicBlock *UnreachableSucc =
-        Cond == ConstantInt::getTrue(Cond->getContext())
-            ? BInst->getSuccessor(1)
-            : BInst->getSuccessor(0);
-
-    if (DT->dominates(UnreachableSucc, BB))
-      return true;
-  }
-  return false;
-}
-
-/// FIXME: Remove this workaround when freeze related patches are done.
-/// LoopUnswitch and Equality propagation in GVN have a discrepancy about
-/// whether a branch on undef/poison has undefined behavior. This is here to
-/// rule out some common cases where we have already found such a discrepancy
-/// causing problems. Details can be found in PR31652. Note that if this
-/// function returns true, it is unsafe; a false return, however, does not
-/// mean it is necessarily safe.
-static bool equalityPropUnSafe(Value &LoopCond) {
-  ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
-  if (!CI || !CI->isEquality())
-    return false;
-
-  Value *LHS = CI->getOperand(0);
-  Value *RHS = CI->getOperand(1);
-  if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
-    return true;
-
-  auto HasUndefInPHI = [](PHINode &PN) {
-    for (Value *Opd : PN.incoming_values()) {
-      if (isa<UndefValue>(Opd))
-        return true;
-    }
-    return false;
-  };
-  PHINode *LPHI = dyn_cast<PHINode>(LHS);
-  PHINode *RPHI = dyn_cast<PHINode>(RHS);
-  if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
-    return true;
-
-  auto HasUndefInSelect = [](SelectInst &SI) {
-    if (isa<UndefValue>(SI.getTrueValue()) ||
-        isa<UndefValue>(SI.getFalseValue()))
-      return true;
-    return false;
-  };
-  SelectInst *LSI = dyn_cast<SelectInst>(LHS);
-  SelectInst *RSI = dyn_cast<SelectInst>(RHS);
-  if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
-    return true;
-  return false;
-}
-
-/// Do the actual work and unswitch the loop if possible and profitable.
-bool LoopUnswitch::processCurrentLoop() {
-  bool Changed = false;
-
-  initLoopData();
-
-  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
-  if (!LoopPreheader)
-    return false;
-
-  // Loops with indirectbr cannot be cloned.
-  if (!CurrentLoop->isSafeToClone())
-    return false;
-
-  // Without dedicated exits, splitting the exit edge may fail.
-  if (!CurrentLoop->hasDedicatedExits())
-    return false;
-
-  LLVMContext &Context = LoopHeader->getContext();
-
-  // Analyze the loop cost, and stop unswitching if the loop's contents cannot
-  // be duplicated.
-  if (!BranchesInfo.countLoop(
-          CurrentLoop,
-          getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
-              *CurrentLoop->getHeader()->getParent()),
-          AC))
-    return false;
-
-  // Try trivial unswitch first, before looping over the other basic blocks in
-  // the loop.
-  if (tryTrivialLoopUnswitch(Changed)) {
-    return true;
-  }
-
-  // Do not do non-trivial unswitch while optimizing for size.
-  // FIXME: Use Function::hasOptSize().
-  if (OptimizeForSize ||
-      LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
-    return Changed;
-
-  // Run through the instructions in the loop, keeping track of three things:
-  //
-  //  - That we do not unswitch loops containing convergent operations, as we
-  //    might be making them control dependent on the unswitch value when they
-  //    were not before.
-  //    FIXME: This could be refined to only bail if the convergent operation
-  //    is not already control-dependent on the unswitch value.
-  //
-  //  - That basic blocks in the loop contain invokes whose predecessor edges
-  //    we cannot split.
-  //
-  //  - The set of guard intrinsics encountered (these are non-terminator
-  //    instructions that are also profitable to unswitch).
-
-  SmallVector<IntrinsicInst *, 4> Guards;
-
-  for (const auto BB : CurrentLoop->blocks()) {
-    for (auto &I : *BB) {
-      auto *CB = dyn_cast<CallBase>(&I);
-      if (!CB)
-        continue;
-      if (CB->isConvergent())
-        return Changed;
-      if (auto *II = dyn_cast<InvokeInst>(&I))
-        if (!II->getUnwindDest()->canSplitPredecessors())
-          return Changed;
-      if (auto *II = dyn_cast<IntrinsicInst>(&I))
-        if (II->getIntrinsicID() == Intrinsic::experimental_guard)
-          Guards.push_back(II);
-    }
-  }
-
-  for (IntrinsicInst *Guard : Guards) {
-    Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
-                                           Changed, MSSAU.get())
-                          .first;
-    if (LoopCond &&
-        unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
-      // NB! Unswitching (if successful) could have erased some of the
-      // instructions in Guards, leaving dangling pointers there.
-      // This is fine because we're returning now and won't look at Guards
-      // again.
-      ++NumGuards;
-      return true;
-    }
-  }
-
-  // Loop over all of the basic blocks in the loop. If we find an interior
-  // block that is branching on a loop-invariant condition, we can unswitch
-  // this loop.
-  for (Loop::block_iterator I = CurrentLoop->block_begin(),
-                            E = CurrentLoop->block_end();
-       I != E; ++I) {
-    Instruction *TI = (*I)->getTerminator();
-
-    // Unswitching on a potentially uninitialized predicate is not
-    // MSan-friendly. Limit this to the cases when the original predicate is
-    // guaranteed to execute, to avoid creating a use-of-uninitialized-value
-    // in the code that did not have one.
-    // This is a workaround for the discrepancy between LLVM IR and MSan
-    // semantics. See PR28054 for more details.
-    if (SanitizeMemory &&
-        !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
-      continue;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-      // Some branches may be rendered unreachable because of previous
-      // unswitching.
-      // Unswitch only those branches that are reachable.
-      if (isUnreachableDueToPreviousUnswitching(*I))
-        continue;
-
-      // If this isn't branching on an invariant condition, we can't unswitch
-      // it.
-      if (BI->isConditional()) {
-        // See if this, or some part of it, is loop invariant. If so, we can
-        // unswitch on it if we desire.
-        Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
-                                               Changed, MSSAU.get())
-                              .first;
-        if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
-            unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
-          ++NumBranches;
-          return true;
-        }
-      }
-    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
-      Value *SC = SI->getCondition();
-      Value *LoopCond;
-      OperatorChain OpChain;
-      std::tie(LoopCond, OpChain) =
-          findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
-
-      unsigned NumCases = SI->getNumCases();
-      if (LoopCond && NumCases) {
-        // Find a value to unswitch on:
-        // FIXME: this should choose the most expensive case!
-        // FIXME: scan for a case with a non-critical edge?
-        Constant *UnswitchVal = nullptr;
-        // Find a case value such that at least one case value is unswitched
-        // out.
-        if (OpChain == OC_OpChainAnd) {
-          // If the chain only has ANDs and the switch has a case value of 0,
-          // dropping a 0 into the chain will unswitch the 0 case out.
-          auto *AllZero =
-              cast<ConstantInt>(Constant::getNullValue(SC->getType()));
-          if (BranchesInfo.isUnswitched(SI, AllZero))
-            continue;
-          // We are unswitching 0 out.
-          UnswitchVal = AllZero;
-        } else if (OpChain == OC_OpChainOr) {
-          // If the chain only has ORs and the switch has a case value of ~0,
-          // dropping a ~0 into the chain will unswitch the ~0 case out.
-          auto *AllOne =
-              cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
-          if (BranchesInfo.isUnswitched(SI, AllOne))
-            continue;
-          // We are unswitching ~0 out.
-          UnswitchVal = AllOne;
-        } else {
-          assert(OpChain == OC_OpChainNone &&
-                 "Expect to unswitch on trivial chain");
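A hypothetical illustration of the OC_OpChainAnd case handled just above (invented names; a sketch, not code from this patch): the switch condition is an AND-only chain containing the invariant %inv, so unswitching on %inv == 0 produces a loop copy in which %chain folds to 0 and the 0 case is taken unconditionally.

define void @sketch(i32 %inv, i32 %n) {
entry:
  br label %loop
loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
  %chain = and i32 %inv, %i        ; AND-only chain with invariant %inv
  switch i32 %chain, label %latch [
    i32 0, label %zero             ; case peeled off by unswitching on 0
  ]
zero:
  br label %latch
latch:
  %i.next = add i32 %i, 1
  %cmp = icmp slt i32 %i.next, %n
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}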
-          // Do not process the same value again and again. At this point we
-          // have some cases already unswitched and some not yet unswitched;
-          // find the first not-yet-unswitched one.
-          for (auto Case : SI->cases()) {
-            Constant *UnswitchValCandidate = Case.getCaseValue();
-            if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
-              UnswitchVal = UnswitchValCandidate;
-              break;
-            }
-          }
-        }
-
-        if (!UnswitchVal)
-          continue;
-
-        if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
-          ++NumSwitches;
-          // In case of a full LIV, UnswitchVal is the value we unswitched out.
-          // In case of a partial LIV, we only unswitch when it is an AND-chain
-          // or OR-chain. In both cases the switch input value simplifies to
-          // UnswitchVal.
-          BranchesInfo.setUnswitched(SI, UnswitchVal);
-          return true;
-        }
-      }
-    }
-
-    // Scan the instructions to check for unswitchable values.
-    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
-         BBI != E; ++BBI)
-      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
-        Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
-                                               Changed, MSSAU.get())
-                              .first;
-        if (LoopCond &&
-            unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
-          ++NumSelects;
-          return true;
-        }
-      }
-  }
-
-  // Check if there is a header condition that is invariant along the paths
-  // from either the true or false successors back to the header. This allows
-  // unswitching conditions that depend on memory accesses, if there's a path
-  // not clobbering the memory locations. Check if this transform has been
-  // disabled using metadata, to avoid unswitching the same loop multiple
-  // times.
-  if (MSSA &&
-      !findOptionMDForLoop(CurrentLoop, "llvm.loop.unswitch.partial.disable")) {
-    if (auto Info =
-            hasPartialIVCondition(*CurrentLoop, MSSAThreshold, *MSSA, *AA)) {
-      assert(!Info->InstToDuplicate.empty() &&
-             "need at least a partially invariant condition");
-      LLVM_DEBUG(dbgs() << "loop-unswitch: Found partially invariant condition "
-                        << *Info->InstToDuplicate[0] << "\n");
-
-      Instruction *TI = CurrentLoop->getHeader()->getTerminator();
-      Value *LoopCond = Info->InstToDuplicate[0];
-
-      // If the partially unswitched path is a no-op and has a single exit
-      // block, we do not need to do full unswitching. Instead, we can directly
-      // branch to the exit.
-      // TODO: Instead of duplicating the checks, we could also just directly
-      // branch to the exit from the conditional branch in the loop.
-      if (Info->PathIsNoop) {
-        if (HasBranchDivergence &&
-            getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
-          LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
-                            << CurrentLoop->getHeader()->getName()
-                            << " at non-trivial condition '"
-                            << *Info->KnownValue << "' == " << *LoopCond << "\n"
-                            << ". Condition is divergent.\n");
-          return false;
-        }
-
-        ++NumBranches;
-
-        BasicBlock *TrueDest = LoopHeader;
-        BasicBlock *FalseDest = Info->ExitForPath;
-        if (Info->KnownValue->isOneValue())
-          std::swap(TrueDest, FalseDest);
-
-        auto *OldBr =
-            cast<BranchInst>(CurrentLoop->getLoopPreheader()->getTerminator());
-        emitPreheaderBranchOnCondition(LoopCond, Info->KnownValue, TrueDest,
-                                       FalseDest, OldBr, TI,
-                                       Info->InstToDuplicate);
-        delete OldBr;
-        RedoLoop = false;
-        return true;
-      }
-
-      // Otherwise, the path is not a no-op. Run regular unswitching.
-      if (unswitchIfProfitable(LoopCond, Info->KnownValue,
-                               CurrentLoop->getHeader()->getTerminator(),
-                               Info->InstToDuplicate)) {
-        ++NumBranches;
-        RedoLoop = false;
-        return true;
-      }
-    }
-  }
-
-  return Changed;
-}
-
-/// Check to see if all paths from BB exit the loop with no side effects
-/// (including infinite loops).
-///
-/// If so, return true and set ExitBB to the block we exit through.
-///
-static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
-                                         BasicBlock *&ExitBB,
-                                         std::set<BasicBlock *> &Visited) {
-  if (!Visited.insert(BB).second) {
-    // Already visited. Without more analysis, this could indicate an infinite
-    // loop.
-    return false;
-  }
-  if (!L->contains(BB)) {
-    // Otherwise, this is a loop exit, which is fine so long as it is the
-    // first exit.
-    if (ExitBB) return false;
-    ExitBB = BB;
-    return true;
-  }
-
-  // Otherwise, this is an unvisited intra-loop node. Check all successors.
-  for (BasicBlock *Succ : successors(BB)) {
-    // Check to see if the successor is a trivial loop exit.
-    if (!isTrivialLoopExitBlockHelper(L, Succ, ExitBB, Visited))
-      return false;
-  }
-
-  // Okay, everything after this looks good, check to make sure that this block
-  // doesn't include any side effects.
-  for (Instruction &I : *BB)
-    if (I.mayHaveSideEffects())
-      return false;
-
-  return true;
-}
-
-/// Return true if the specified block unconditionally leads to an exit from
-/// the specified loop, and has no side-effects in the process. If so, return
-/// the block that is exited to, otherwise return null.
-static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
-  std::set<BasicBlock *> Visited;
-  Visited.insert(L->getHeader()); // Branches to header make infinite loops.
-  BasicBlock *ExitBB = nullptr;
-  if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
-    return ExitBB;
-  return nullptr;
-}
-
-/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
-/// simplify the loop. If we decide that this is profitable,
-/// unswitch the loop, reprocess the pieces, then return true.
-bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
-                                        Instruction *TI,
-                                        ArrayRef<Instruction *> ToDuplicate) {
-  // Check to see if it would be profitable to unswitch the current loop.
-  if (!BranchesInfo.costAllowsUnswitching()) {
-    LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
-                      << CurrentLoop->getHeader()->getName()
-                      << " at non-trivial condition '" << *Val
-                      << "' == " << *LoopCond << "\n"
-                      << ". Cost too high.\n");
-    return false;
-  }
-  if (HasBranchDivergence &&
-      getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
-    LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
-                      << CurrentLoop->getHeader()->getName()
-                      << " at non-trivial condition '" << *Val
-                      << "' == " << *LoopCond << "\n"
-                      << ". Condition is divergent.\n");
-    return false;
-  }
-
-  unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI, ToDuplicate);
-  return true;
-}
-
-/// Emit a conditional branch on two values: if LIC == Val, branch to TrueDest,
-/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
-/// and remove (but do not erase!) it from the function.
-void LoopUnswitch::emitPreheaderBranchOnCondition(
-    Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest,
-    BranchInst *OldBranch, Instruction *TI,
-    ArrayRef<Instruction *> ToDuplicate) {
-  assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
-  assert(TrueDest != FalseDest && "Branch targets should be different");
-
-  // Insert a conditional branch on LIC to the two preheaders. The original
-  // code is the true version and the new code is the false version.
-  Value *BranchVal = LIC;
-  bool Swapped = false;
-
-  if (!ToDuplicate.empty()) {
-    ValueToValueMapTy Old2New;
-    for (Instruction *I : reverse(ToDuplicate)) {
-      auto *New = I->clone();
-      New->insertBefore(OldBranch);
-      RemapInstruction(New, Old2New,
-                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-      Old2New[I] = New;
-
-      if (MSSAU) {
-        MemorySSA *MSSA = MSSAU->getMemorySSA();
-        auto *MemA = dyn_cast_or_null<MemoryUse>(MSSA->getMemoryAccess(I));
-        if (!MemA)
-          continue;
-
-        Loop *L = LI->getLoopFor(I->getParent());
-        auto *DefiningAccess = MemA->getDefiningAccess();
-        // Get the first defining access before the loop.
-        while (L->contains(DefiningAccess->getBlock())) {
-          // If the defining access is a MemoryPhi, get the incoming
-          // value for the pre-header as the defining access.
-          if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
-            DefiningAccess =
-                MemPhi->getIncomingValueForBlock(L->getLoopPreheader());
-          } else {
-            DefiningAccess =
-                cast<MemoryDef>(DefiningAccess)->getDefiningAccess();
-          }
-        }
-        MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(),
-                                      MemorySSA::BeforeTerminator);
-      }
-    }
-    BranchVal = Old2New[ToDuplicate[0]];
-  } else {
-    if (!isa<ConstantInt>(Val) ||
-        Val->getType() != Type::getInt1Ty(LIC->getContext()))
-      BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
-    else if (Val != ConstantInt::getTrue(Val->getContext())) {
-      // We want to enter the new loop when the condition is true.
-      std::swap(TrueDest, FalseDest);
-      Swapped = true;
-    }
-  }
-
-  // The old branch will be removed, so save its parent and successor to
-  // update the DomTree.
-  auto *OldBranchSucc = OldBranch->getSuccessor(0);
-  auto *OldBranchParent = OldBranch->getParent();
-
-  // Insert the new branch.
-  BranchInst *BI =
-      IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
-  if (Swapped)
-    BI->swapProfMetadata();
-
-  // Remove the old branch so there is only one branch at the end. This is
-  // needed to perform DomTree's internal DFS walk on the function's CFG.
-  OldBranch->removeFromParent();
-
-  // Inform the DT about the new branch.
-  if (DT) {
-    // First, add both successors.
-    SmallVector<DominatorTree::UpdateType, 3> Updates;
-    if (TrueDest != OldBranchSucc)
-      Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
-    if (FalseDest != OldBranchSucc)
-      Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
-    // If both of the new successors are different from the old one, inform the
-    // DT that the edge was deleted.
-    if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
-      Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
-    }
-
-    if (MSSAU)
-      MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
-    else
-      DT->applyUpdates(Updates);
-  }
-
-  // If either edge is critical, split it. This helps preserve LoopSimplify
-  // form for enclosing loops.
-  auto Options =
-      CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
-  SplitCriticalEdge(BI, 0, Options);
-  SplitCriticalEdge(BI, 1, Options);
-}
-
-/// Given a loop that has a trivial unswitchable condition in it (a cond branch
-/// from its header block to its latch block, where the path through the loop
-/// that doesn't execute its body has no side-effects), unswitch it. This
-/// doesn't involve any code duplication, just moving the conditional branch
-/// outside of the loop and updating loop info.
-void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
-                                            BasicBlock *ExitBlock,
-                                            Instruction *TI) {
-  LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
-                    << LoopHeader->getName() << " [" << L->getBlocks().size()
-                    << " blocks] in Function "
-                    << L->getHeader()->getParent()->getName()
-                    << " on cond: " << *Val << " == " << *Cond << "\n");
-  // We are going to make essential changes to the CFG. This may invalidate
-  // cached information for L or one of its parent loops in SCEV.
-  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
-    SEWP->getSE().forgetTopmostLoop(L);
-
-  // First step: split the preheader, so that we know there is a safe place
-  // to insert the conditional branch. We will change LoopPreheader to have a
-  // conditional branch on Cond.
-  BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
-
-  // Now that we have a place to insert the conditional branch, create a place
-  // to branch to: this is the exit block out of the loop that we should
-  // short-circuit to.
-
-  // Split this block now, so that the loop maintains its exit block, and so
-  // that the jump from the preheader can execute the contents of the exit block
-  // without actually branching to it (the exit block should be dominated by the
-  // loop header, not the preheader).
-  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
-  BasicBlock *NewExit =
-      SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());
-
-  // Okay, now we have a position to branch from and a position to branch to,
-  // insert the new conditional branch.
-  auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
-  assert(OldBranch && "Failed to split the preheader");
-  emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
-
-  // emitPreheaderBranchOnCondition removed the OldBranch from the function.
-  // Delete it, as it is no longer needed.
-  delete OldBranch;
-
-  // We need to reprocess this loop, it could be unswitched again.
-  RedoLoop = true;
-
-  // Now that we know that the loop is never entered when this condition is a
-  // particular value, rewrite the loop with this info. We know that this will
-  // at least eliminate the old branch.
-  rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
-
-  ++NumTrivial;
-}
-
-/// Check if the first non-constant condition starting from the loop header is
-/// a trivial unswitch condition: that is, a condition that controls whether or
-/// not the loop does anything at all. If it is a trivial condition, unswitching
-/// produces no code duplication (equivalently, it produces a simpler loop and
-/// a new empty loop, which gets deleted). Therefore always unswitch a trivial
-/// condition.
-bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
-  BasicBlock *CurrentBB = CurrentLoop->getHeader();
-  Instruction *CurrentTerm = CurrentBB->getTerminator();
-  LLVMContext &Context = CurrentBB->getContext();
-
-  // If the loop header has only one reachable successor (currently via an
-  // unconditional branch or a constant-foldable conditional branch, but we
-  // should also consider adding constant-foldable switch instructions in the
-  // future), we should keep looking for trivial condition candidates in
-  // the successor as well. An alternative is to constant fold conditions
-  // and merge successors into the loop header (then we only need to check the
-  // header's terminator). The reason for not doing this in the LoopUnswitch
-  // pass is that it could potentially break LoopPassManager's invariants.
-  // Folding dead branches could either eliminate the current loop or make
-  // other loops unreachable. LCSSA form might also not be preserved after
-  // deleting branches. The following code keeps traversing the loop header's
-  // successors until it finds the trivial condition candidate (a condition
-  // that is not a constant). Since unswitching generates branches with
-  // constant conditions, this scenario could be very common in practice.
-  SmallPtrSet<BasicBlock *, 8> Visited;
-
-  while (true) {
-    // If we exit the loop or reach a previously visited block, then
-    // we can not reach any trivial condition candidates (unfoldable
-    // branch instructions or switch instructions) and no unswitch
-    // can happen. Exit and return false.
-    if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
-      return false;
-
-    // Check if this loop will execute any side-effecting instructions (e.g.
-    // stores, calls, volatile loads) in the part of the loop that the code
-    // *would* execute. Check the header first.
-    for (Instruction &I : *CurrentBB)
-      if (I.mayHaveSideEffects())
-        return false;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
-      if (BI->isUnconditional()) {
-        CurrentBB = BI->getSuccessor(0);
-      } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
-        CurrentBB = BI->getSuccessor(0);
-      } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
-        CurrentBB = BI->getSuccessor(1);
-      } else {
-        // Found a trivial condition candidate: a non-foldable conditional
-        // branch.
-        break;
-      }
-    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
-      // At this point, any constant-foldable instructions should probably
-      // have been folded.
-      ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
-      if (!Cond)
-        break;
-      // Find the target block we are definitely going to.
-      CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
-    } else {
-      // We do not understand these terminator instructions.
-      break;
-    }
-
-    CurrentTerm = CurrentBB->getTerminator();
-  }
-
-  // CondVal is the condition that controls the trivial condition.
-  // LoopExitBB is the block through which the loop exits when the trivial
-  // condition is met.
-  Constant *CondVal = nullptr;
-  BasicBlock *LoopExitBB = nullptr;
-
-  if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
-    // If this isn't branching on an invariant condition, we can't unswitch it.
-    if (!BI->isConditional())
-      return false;
-
-    Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
-                                           Changed, MSSAU.get())
-                          .first;
-
-    // Unswitch only if the trivial condition itself is an LIV (not a
-    // partial LIV, which could occur in an and/or chain).
-    if (!LoopCond || LoopCond != BI->getCondition())
-      return false;
-
-    // Check to see if a successor of the branch is guaranteed to
-    // exit through a unique exit block without having any
-    // side-effects. If so, determine the value of Cond that causes
-    // it to do this.
-    if ((LoopExitBB =
-             isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
-      CondVal = ConstantInt::getTrue(Context);
-    } else if ((LoopExitBB =
-                    isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
-      CondVal = ConstantInt::getFalse(Context);
-    }
-
-    // If we didn't find a single unique LoopExit block, or if the loop exit
-    // block contains phi nodes, this isn't trivial.
-    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
-      return false; // Can't handle this.
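Before the final safety checks below, the branch shape this routine matches can be illustrated with a minimal, hypothetical example (invented names; not from the patch): the header tests the loop-invariant %inv and exits directly, with no side effects on that path, so the test can be hoisted to the preheader without duplicating any code.

define void @sketch(i1 %inv, i32 %n) {
entry:
  br label %header
header:
  %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
  br i1 %inv, label %exit, label %latch   ; trivial unswitch candidate
latch:
  %i.next = add i32 %i, 1
  %cmp = icmp slt i32 %i.next, %n
  br i1 %cmp, label %header, label %exit
exit:
  ret void
}

After the transform, the preheader would branch on %inv straight to %exit or into the loop, and the branch inside the loop folds away.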
-
-    if (equalityPropUnSafe(*LoopCond))
-      return false;
-
-    unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
-                             CurrentTerm);
-    ++NumBranches;
-    return true;
-  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
-    // If this isn't switching on an invariant condition, we can't unswitch it.
-    Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
-                                           Changed, MSSAU.get())
-                          .first;
-
-    // Unswitch only if the trivial condition itself is an LIV (not a
-    // partial LIV, which could occur in an and/or chain).
-    if (!LoopCond || LoopCond != SI->getCondition())
-      return false;
-
-    // Check to see if a successor of the switch is guaranteed to go to the
-    // latch block or exit through a single exit block without having any
-    // side-effects. If so, determine the value of Cond that causes it to do
-    // this.
-    // Note that we can't trivially unswitch on the default case or
-    // on already unswitched cases.
-    for (auto Case : SI->cases()) {
-      BasicBlock *LoopExitCandidate;
-      if ((LoopExitCandidate =
-               isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
-        // Okay, we found a trivial case, remember the value that is trivial.
-        ConstantInt *CaseVal = Case.getCaseValue();
-
-        // Check that it was not unswitched before, since values that were
-        // already unswitched still look trivial.
-        if (BranchesInfo.isUnswitched(SI, CaseVal))
-          continue;
-        LoopExitBB = LoopExitCandidate;
-        CondVal = CaseVal;
-        break;
-      }
-    }
-
-    // If we didn't find a single unique LoopExit block, or if the loop exit
-    // block contains phi nodes, this isn't trivial.
-    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
-      return false; // Can't handle this.
-
-    unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
-                             nullptr);
-
-    // We are only unswitching full LIVs.
-    BranchesInfo.setUnswitched(SI, CondVal);
-    ++NumSwitches;
-    return true;
-  }
-  return false;
-}
-
-/// Split all of the edges from inside the loop to their exit blocks.
-/// Update the appropriate Phi nodes as we do so.
-void LoopUnswitch::splitExitEdges(
-    Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
-
-  for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
-    BasicBlock *ExitBlock = ExitBlocks[I];
-    SmallVector<BasicBlock *, 2> Preds(predecessors(ExitBlock));
-
-    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
-    // general, if we call it on all predecessors of all exits then it does.
-    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
-                           /*PreserveLCSSA*/ true);
-  }
-}
-
-/// We determined that the loop is profitable to unswitch when LIC equals Val.
-/// Split it into loop versions and test the condition outside of either loop.
-/// Return the loops created as Out1/Out2.
-void LoopUnswitch::unswitchNontrivialCondition(
-    Value *LIC, Constant *Val, Loop *L, Instruction *TI,
-    ArrayRef<Instruction *> ToDuplicate) {
-  Function *F = LoopHeader->getParent();
-  LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
-                    << LoopHeader->getName() << " [" << L->getBlocks().size()
-                    << " blocks] in Function " << F->getName() << " when '"
-                    << *Val << "' == " << *LIC << "\n");
-
-  // We are going to make essential changes to the CFG. This may invalidate
-  // cached information for L or one of its parent loops in SCEV.
-  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
-    SEWP->getSE().forgetTopmostLoop(L);
-
-  LoopBlocks.clear();
-  NewBlocks.clear();
-
-  if (MSSAU && VerifyMemorySSA)
-    MSSA->verifyMemorySSA();
-
-  // First step: split the preheader and exit blocks, and add these blocks to
-  // the LoopBlocks list.
-  BasicBlock *NewPreheader =
-      SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
-  LoopBlocks.push_back(NewPreheader);
-
-  // We want the loop to come after the preheader, but before the exit blocks.
-  llvm::append_range(LoopBlocks, L->blocks());
-
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L->getUniqueExitBlocks(ExitBlocks);
-
-  // Split all of the edges from inside the loop to their exit blocks. Update
-  // the appropriate Phi nodes as we do so.
-  splitExitEdges(L, ExitBlocks);
-
-  // The exit blocks may have been changed due to edge splitting, recompute.
-  ExitBlocks.clear();
-  L->getUniqueExitBlocks(ExitBlocks);
-
-  // Add exit blocks to the loop blocks.
-  llvm::append_range(LoopBlocks, ExitBlocks);
-
-  // Next step: clone all of the basic blocks that make up the loop (including
-  // the loop preheader and exit blocks), keeping track of the mapping between
-  // the instructions and blocks.
-  NewBlocks.reserve(LoopBlocks.size());
-  ValueToValueMapTy VMap;
-  for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
-    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
-
-    NewBlocks.push_back(NewBB);
-    VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
-  }
-
-  // Splice the newly inserted blocks into the function right before the
-  // original preheader.
-  F->getBasicBlockList().splice(NewPreheader->getIterator(),
-                                F->getBasicBlockList(),
-                                NewBlocks[0]->getIterator(), F->end());
-
-  // Now we create the new Loop object for the versioned loop.
-  Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
-
-  // Recalculate the unswitching quota and inherit simplified-switch info for
-  // NewBB; probably clone more loop-unswitch related loop properties.
-  BranchesInfo.cloneData(NewLoop, L, VMap);
-
-  Loop *ParentLoop = L->getParentLoop();
-  if (ParentLoop) {
-    // Make sure to add the cloned preheader and exit blocks to the parent loop
-    // as well.
-    ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
-  }
-
-  for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
-    BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
-    // The new exit block should be in the same loop as the old one.
-    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
-      ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
-
-    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
-           "Exit block should have been split to have one successor!");
-    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
-
-    // If the successor of the exit block had PHI nodes, add an entry for
-    // NewExit.
-    for (PHINode &PN : ExitSucc->phis()) {
-      Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
-      ValueToValueMapTy::iterator It = VMap.find(V);
-      if (It != VMap.end()) V = It->second;
-      PN.addIncoming(V, NewExit);
-    }
-
-    if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
-      PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
-                                    &*ExitSucc->getFirstInsertionPt());
-
-      for (BasicBlock *BB : predecessors(ExitSucc)) {
-        LandingPadInst *LPI = BB->getLandingPadInst();
-        LPI->replaceAllUsesWith(PN);
-        PN->addIncoming(LPI, BB);
-      }
-    }
-  }
-
-  // Rewrite the code to refer to itself.
-  for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
-    for (Instruction &I : *NewBlocks[NBI]) {
-      RemapInstruction(&I, VMap,
-                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-      if (auto *II = dyn_cast<AssumeInst>(&I))
-        AC->registerAssumption(II);
-    }
-  }
-
-  // Rewrite the original preheader to select between versions of the loop.
-  BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
-  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
-         "Preheader splitting did not work correctly!");
-
-  if (MSSAU) {
-    // Update MemorySSA after cloning, and before splitting to unreachables,
-    // since that invalidates the 1:1 mapping of clones in VMap.
-    LoopBlocksRPO LBRPO(L);
-    LBRPO.perform(LI);
-    MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap);
-  }
-
-  // Emit the new branch that selects between the two versions of this loop.
-  emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
-                                 TI, ToDuplicate);
-  if (MSSAU) {
-    // Update MemoryPhis in Exit blocks.
-    MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
-    if (VerifyMemorySSA)
-      MSSA->verifyMemorySSA();
-  }
-
-  // The OldBr was replaced by a new one and removed (but not erased) by
-  // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
-  delete OldBR;
-
-  LoopProcessWorklist.push_back(NewLoop);
-  RedoLoop = true;
-
-  // Keep a WeakTrackingVH holding onto LIC. If the first call to
-  // rewriteLoopBodyWithConditionConstant deletes the instruction (for example
-  // by simplifying a PHI that feeds into the condition that we're unswitching
-  // on), we don't rewrite the second iteration.
-  WeakTrackingVH LICHandle(LIC);
-
-  if (ToDuplicate.empty()) {
-    // Now we rewrite the original code to know that the condition is true and
-    // the new code to know that the condition is false.
-    rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
-
-    // It's possible that simplifying one loop could cause the other to be
-    // changed to another value or a constant. If it is a constant, don't
-    // simplify it.
-    if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
-        LICHandle && !isa<Constant>(LICHandle))
-      rewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val,
-                                           /*IsEqual=*/true);
-  } else {
-    // Partial unswitching. Update the condition in the right loop with the
-    // constant.
-    auto *CC = cast<ConstantInt>(Val);
-    if (CC->isOneValue()) {
-      rewriteLoopBodyWithConditionConstant(NewLoop, VMap[LIC], Val,
-                                           /*IsEqual=*/true);
-    } else
-      rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/true);
-
-    // Mark the new loop as partially unswitched, to avoid unswitching on the
-    // same condition again.
-    auto &Context = NewLoop->getHeader()->getContext();
-    MDNode *DisableUnswitchMD = MDNode::get(
-        Context, MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
-    MDNode *NewLoopID = makePostTransformationMetadata(
-        Context, L->getLoopID(), {"llvm.loop.unswitch.partial"},
-        {DisableUnswitchMD});
-    NewLoop->setLoopID(NewLoopID);
-  }
-
-  if (MSSA && VerifyMemorySSA)
-    MSSA->verifyMemorySSA();
-}
-
-/// Remove all instances of I from the worklist vector specified.
-static void removeFromWorklist(Instruction *I,
-                               std::vector<Instruction *> &Worklist) {
-  llvm::erase_value(Worklist, I);
-}
-
-/// When we find that I really equals V, remove I from the
-/// program, replacing all uses with V, and update the worklist.
-static void replaceUsesOfWith(Instruction *I, Value *V,
-                              std::vector<Instruction *> &Worklist, Loop *L,
-                              LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
-  LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
-
-  // Add operands to the worklist; they may be dead now.
-  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
-    if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
-      Worklist.push_back(Use);
-
-  // Add users to the worklist; they may be simplified now.
-  for (User *U : I->users())
-    Worklist.push_back(cast<Instruction>(U));
-  removeFromWorklist(I, Worklist);
-  I->replaceAllUsesWith(V);
-  if (!I->mayHaveSideEffects()) {
-    if (MSSAU)
-      MSSAU->removeMemoryAccess(I);
-    I->eraseFromParent();
-  }
-  ++NumSimplify;
-}
-
-/// We know either that the value LIC has the value specified by Val in the
-/// specified loop, or we know it does NOT have that value.
-/// Rewrite any uses of LIC or of properties correlated to it.
-void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
-                                                        Constant *Val,
-                                                        bool IsEqual) {
-  assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
-  // FIXME: Support correlated properties, like:
-  //  for (...)
-  //    if (li1 < li2)
-  //      ...
-  //    if (li1 > li2)
-  //      ...
-
-  // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
-  // selects, switches.
-  std::vector<Instruction *> Worklist;
-  LLVMContext &Context = Val->getContext();
-
-  // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
-  // in the loop with the appropriate one directly.
-  if (IsEqual || (isa<ConstantInt>(Val) &&
-                  Val->getType()->isIntegerTy(1))) {
-    Value *Replacement;
-    if (IsEqual)
-      Replacement = Val;
-    else
-      Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
-                                     !cast<ConstantInt>(Val)->getZExtValue());
-
-    for (User *U : LIC->users()) {
-      Instruction *UI = dyn_cast<Instruction>(U);
-      if (!UI || !L->contains(UI))
-        continue;
-      Worklist.push_back(UI);
-    }
-
-    for (Instruction *UI : Worklist)
-      UI->replaceUsesOfWith(LIC, Replacement);
-
-    simplifyCode(Worklist, L);
-    return;
-  }
-
-  // Otherwise, we don't know the precise value of LIC, but we do know that it
-  // is certainly NOT "Val". As such, simplify any uses in the loop that we
-  // can. This case occurs when we unswitch switch statements.
-  for (User *U : LIC->users()) {
-    Instruction *UI = dyn_cast<Instruction>(U);
-    if (!UI || !L->contains(UI))
-      continue;
-
-    // At this point, we know LIC is definitely not Val. Try to use some simple
-    // logic to simplify the user w.r.t. the context.
-    if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
-      if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
-        // This in-loop instruction has been simplified w.r.t. its context,
-        // i.e. LIC != Val, so make sure we propagate its replacement value to
-        // all its users.
-        //
-        // We cannot delete UI, the LIC user, yet, because that would
-        // invalidate the LIC->users() iterator! However, we can make this
-        // instruction dead by replacing all its users, and push it onto the
-        // worklist so that it can be properly deleted and its operands
-        // simplified.
-        UI->replaceAllUsesWith(Replacement);
-      }
-    }
-
-    // This is a LIC user, push it into the worklist so that simplifyCode can
-    // attempt to simplify it.
-    Worklist.push_back(UI);
-
-    // If we know that LIC is not Val, use this info to simplify code.
-    SwitchInst *SI = dyn_cast<SwitchInst>(UI);
-    if (!SI || !isa<ConstantInt>(Val)) continue;
-
-    // NOTE: if a case value for the switch is unswitched out, we record it
-    // after the unswitch finishes. We can not record it here as the switch
-    // is not a direct user of the partial LIV.
-    SwitchInst::CaseHandle DeadCase =
-        *SI->findCaseValue(cast<ConstantInt>(Val));
-    // The default case is live for multiple values.
-    if (DeadCase == *SI->case_default())
-      continue;
-
-    // Found a dead case value. Don't remove PHI nodes in the
-    // successor if they become single-entry; those PHI nodes may
-    // be in the Users list.
-
-    BasicBlock *Switch = SI->getParent();
-    BasicBlock *SISucc = DeadCase.getCaseSuccessor();
-    BasicBlock *Latch = L->getLoopLatch();
-
-    if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
-    // If the DeadCase successor dominates the loop latch, then the
-    // transformation isn't safe since it will delete the sole predecessor edge
-    // to the latch.
-    if (Latch && DT->dominates(SISucc, Latch))
-      continue;
-
-    // FIXME: This is a hack. We need to keep the successor around
-    // and hooked up so as to preserve the loop structure, because
-    // trying to update it is complicated. So instead we preserve the
-    // loop structure and put the block on a dead code path.
-    SplitEdge(Switch, SISucc, DT, LI, MSSAU.get());
-    // Compute the successors instead of relying on the return value
-    // of SplitEdge, since it may have split the switch successor
-    // after PHI nodes.
-    BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
-    BasicBlock *OldSISucc = *succ_begin(NewSISucc);
-    // Create an "unreachable" destination.
-    BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
-                                           Switch->getParent(),
-                                           OldSISucc);
-    new UnreachableInst(Context, Abort);
-    // Force the new case destination to branch to the "unreachable"
-    // block while maintaining a (dead) CFG edge to the old block.
-    NewSISucc->getTerminator()->eraseFromParent();
-    BranchInst::Create(Abort, OldSISucc,
-                       ConstantInt::getTrue(Context), NewSISucc);
-    // Release the PHI operands for this edge.
-    for (PHINode &PN : NewSISucc->phis())
-      PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType()));
-    // Tell the domtree about the new block. We don't fully update the
-    // domtree here -- instead we force it to do a full recomputation
-    // after the pass is complete -- but we do need to inform it of
-    // new blocks.
-    DT->addNewBlock(Abort, NewSISucc);
-  }
-
-  simplifyCode(Worklist, L);
-}
-
-/// Now that we have simplified some instructions in the loop, walk over it and
-/// constant prop, dce, and fold control flow where possible. Note that this is
-/// effectively a very simple loop-structure-aware optimizer. During processing
-/// of this loop, L could very well be deleted, so it must not be used.
-///
-/// FIXME: When the loop optimizer is more mature, separate this out to a new
-/// pass.
-///
-void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
-  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-  while (!Worklist.empty()) {
-    Instruction *I = Worklist.back();
-    Worklist.pop_back();
-
-    // Simple DCE.
-    if (isInstructionTriviallyDead(I)) {
-      LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
-
-      // Add operands to the worklist; they may be dead now.
-      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
-        if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
-          Worklist.push_back(Use);
-      removeFromWorklist(I, Worklist);
-      if (MSSAU)
-        MSSAU->removeMemoryAccess(I);
-      I->eraseFromParent();
-      ++NumSimplify;
-      continue;
-    }
-
-    // See if instruction simplification can hack this up. This is common for
-    // things like "select false, X, Y" after unswitching made the condition be
-    // 'false'. TODO: update the domtree properly so we can pass it here.
-    if (Value *V = SimplifyInstruction(I, DL))
-      if (LI->replacementPreservesLCSSAForm(I, V)) {
-        replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
-        continue;
-      }
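As a hypothetical example of what the SimplifyInstruction call above cleans up (a sketch, not taken from the patch): in the loop copy where the unswitched condition %inv has been rewritten to true, a select like the one below folds to %a and is then removed as dead by the worklist loop.

define i32 @sketch(i1 %inv, i32 %a, i32 %b) {
entry:
  %s = select i1 %inv, i32 %a, i32 %b  ; folds to %a once %inv is known true
  ret i32 %s
}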
-    // Special case hacks that appear commonly in unswitched code.
-    if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-      if (BI->isUnconditional()) {
-        // If BI's parent is the only pred of the successor, fold the two blocks
-        // together.
-        BasicBlock *Pred = BI->getParent();
-        (void)Pred;
-        BasicBlock *Succ = BI->getSuccessor(0);
-        BasicBlock *SinglePred = Succ->getSinglePredecessor();
-        if (!SinglePred) continue; // Nothing to do.
-        assert(SinglePred == Pred && "CFG broken");
-
-        // Make the LPM and Worklist updates specific to LoopUnswitch.
-        removeFromWorklist(BI, Worklist);
-        auto SuccIt = Succ->begin();
-        while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
-          for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
-            if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
-              Worklist.push_back(Use);
-          for (User *U : PN->users())
-            Worklist.push_back(cast<Instruction>(U));
-          removeFromWorklist(PN, Worklist);
-          ++NumSimplify;
-        }
-        // Merge the block and make the remaining analysis updates.
-        DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-        MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
-        ++NumSimplify;
-        continue;
-      }
-
-      continue;
-    }
-  }
-}
-
-/// Simple simplifications we can do given the information that Cond is
-/// definitely not equal to Val.
-Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
-                                                     Value *Invariant,
-                                                     Constant *Val) {
-  // icmp eq cond, val -> false
-  ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
-  if (CI && CI->isEquality()) {
-    Value *Op0 = CI->getOperand(0);
-    Value *Op1 = CI->getOperand(1);
-    if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
-      LLVMContext &Ctx = Inst->getContext();
-      if (CI->getPredicate() == CmpInst::ICMP_EQ)
-        return ConstantInt::getFalse(Ctx);
-      else
-        return ConstantInt::getTrue(Ctx);
-    }
-  }
-
-  // FIXME: there may be other opportunities, e.g. comparison with floating
-  // point, or Invariant - Val != 0, etc.
-  return nullptr;
-}
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 2ff1e8480749..c733aa4701ed 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -70,14 +70,12 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
diff --git a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
deleted file mode 100644
index 4063e4fe0472..000000000000
--- a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers atomic intrinsics to non-atomic form for use in a known
-// non-preemptible environment.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerAtomic.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loweratomic"
-
-static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
-  IRBuilder<> Builder(CXI);
-  Value *Ptr = CXI->getPointerOperand();
-  Value *Cmp = CXI->getCompareOperand();
-  Value *Val = CXI->getNewValOperand();
-
-  LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
-  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
-  Value *Res = Builder.CreateSelect(Equal, Val, Orig);
-  Builder.CreateStore(Res, Ptr);
-
-  Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
-  Res = Builder.CreateInsertValue(Res, Equal, 1);
-
-  CXI->replaceAllUsesWith(Res);
-  CXI->eraseFromParent();
-  return true;
-}
-
-bool llvm::lowerAtomicRMWInst(AtomicRMWInst *RMWI) {
-  IRBuilder<> Builder(RMWI);
-  Value *Ptr = RMWI->getPointerOperand();
-  Value *Val = RMWI->getValOperand();
-
-  LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
-  Value *Res = nullptr;
-
-  switch (RMWI->getOperation()) {
-  default: llvm_unreachable("Unexpected RMW operation");
-  case AtomicRMWInst::Xchg:
-    Res = Val;
-    break;
-  case AtomicRMWInst::Add:
-    Res = Builder.CreateAdd(Orig, Val);
-    break;
-  case AtomicRMWInst::Sub:
-    Res = Builder.CreateSub(Orig, Val);
-    break;
-  case AtomicRMWInst::And:
-    Res = Builder.CreateAnd(Orig, Val);
-    break;
-  case AtomicRMWInst::Nand:
-    Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
-    break;
-  case AtomicRMWInst::Or:
-    Res = Builder.CreateOr(Orig, Val);
-    break;
-  case AtomicRMWInst::Xor:
-    Res = Builder.CreateXor(Orig, Val);
-    break;
-  case AtomicRMWInst::Max:
-    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
-                               Val, Orig);
-    break;
-  case AtomicRMWInst::Min:
-    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
-                               Orig, Val);
-    break;
-  case AtomicRMWInst::UMax:
-    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
-                               Val, Orig);
-    break;
-  case AtomicRMWInst::UMin:
-    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
-                               Orig, Val);
-    break;
-  case AtomicRMWInst::FAdd:
-    Res = Builder.CreateFAdd(Orig, Val);
-    break;
-  case AtomicRMWInst::FSub:
-    Res = Builder.CreateFSub(Orig, Val);
-    break;
-  }
-  Builder.CreateStore(Res, Ptr);
-  RMWI->replaceAllUsesWith(Orig);
-  RMWI->eraseFromParent();
-  return true;
-}
-
-static bool LowerFenceInst(FenceInst *FI) {
-  FI->eraseFromParent();
-  return true;
-}
-
-static bool LowerLoadInst(LoadInst *LI) {
-  LI->setAtomic(AtomicOrdering::NotAtomic);
-  return true;
-}
-
-static bool LowerStoreInst(StoreInst *SI) {
-  SI->setAtomic(AtomicOrdering::NotAtomic);
-  return true;
-}
-
-static bool runOnBasicBlock(BasicBlock &BB) {
-  bool Changed = false;
-  for (Instruction &Inst : make_early_inc_range(BB)) {
-    if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
-      Changed |= LowerFenceInst(FI);
-    else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
-      Changed |= LowerAtomicCmpXchgInst(CXI);
-    else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
-      Changed |= lowerAtomicRMWInst(RMWI);
-    else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
-      if (LI->isAtomic())
-        LowerLoadInst(LI);
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
-      if (SI->isAtomic())
-        LowerStoreInst(SI);
-    }
-  }
-  return Changed;
-}
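As a hypothetical before/after sketch of what this lowering does to a cmpxchg (invented names; a sketch, not from the patch), the atomic operation is replaced by a plain load, compare, select, and store, which is only sound in a known non-preemptible environment:

define i32 @sketch(ptr %p, i32 %cmp, i32 %new) {
entry:
  ; equivalent of the lowered cmpxchg: no atomicity remains
  %orig = load i32, ptr %p
  %equal = icmp eq i32 %orig, %cmp
  %res = select i1 %equal, i32 %new, i32 %orig
  store i32 %res, ptr %p
  ret i32 %orig
}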
-static bool lowerAtomics(Function &F) {
-  bool Changed = false;
-  for (BasicBlock &BB : F) {
-    Changed |= runOnBasicBlock(BB);
-  }
-  return Changed;
-}
-
-PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
-  if (lowerAtomics(F))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
-
-namespace {
-class LowerAtomicLegacyPass : public FunctionPass {
-public:
-  static char ID;
-
-  LowerAtomicLegacyPass() : FunctionPass(ID) {
-    initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    // Don't skip optnone functions; atomics still need to be lowered.
-    FunctionAnalysisManager DummyFAM;
-    auto PA = Impl.run(F, DummyFAM);
-    return !PA.areAllPreserved();
-  }
-
-private:
-  LowerAtomicPass Impl;
-};
-}
-
-char LowerAtomicLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
-                "Lower atomic intrinsics to non-atomic form", false, false)
-
-Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
diff --git a/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp b/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp
new file mode 100644
index 000000000000..6aba913005d0
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp
@@ -0,0 +1,99 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerAtomicPass.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LowerAtomic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerFenceInst(FenceInst *FI) {
+  FI->eraseFromParent();
+  return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+  LI->setAtomic(AtomicOrdering::NotAtomic);
+  return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+  SI->setAtomic(AtomicOrdering::NotAtomic);
+  return true;
+}
+
+static bool runOnBasicBlock(BasicBlock &BB) {
+  bool Changed = false;
+  for (Instruction &Inst : make_early_inc_range(BB)) {
+    if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
+      Changed |= LowerFenceInst(FI);
+    else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
+      Changed |= lowerAtomicCmpXchgInst(CXI);
+    else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
+      Changed |= lowerAtomicRMWInst(RMWI);
+    else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
+      if (LI->isAtomic())
+        LowerLoadInst(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
+      if (SI->isAtomic())
+        LowerStoreInst(SI);
+    }
+  }
+  return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+  bool Changed = false;
+  for (BasicBlock &BB : F) {
+    Changed |= runOnBasicBlock(BB);
+  }
+  return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+  if (lowerAtomics(F))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  LowerAtomicLegacyPass() : FunctionPass(ID) {
+    initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    // Don't skip optnone functions; atomics still need to be lowered.
+    FunctionAnalysisManager DummyFAM;
+    auto PA = Impl.run(F, DummyFAM);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  LowerAtomicPass Impl;
+};
+}
+
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+                "Lower atomic intrinsics to non-atomic form", false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 186065db327e..47493b54a527 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -26,11 +26,9 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -96,7 +94,7 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
   return HasDeadBlocks;
 }
-static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI,
+static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI,
                                     DominatorTree *DT) {
   Optional<DomTreeUpdater> DTU;
   if (DT)
@@ -140,21 +138,21 @@ static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI,
       IsConstantIntrinsicsHandled++;
       break;
     case Intrinsic::objectsize:
-      NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+      NewValue = lowerObjectSizeCall(II, DL, &TLI, true);
       ObjectSizeIntrinsicsHandled++;
      break;
    }
    HasDeadBlocks |= replaceConditionalBranchesOnConstant(
-        II, NewValue, DTU.hasValue() ? DTU.getPointer() : nullptr);
+        II, NewValue, DTU ? DTU.getPointer() : nullptr);
  }
  if (HasDeadBlocks)
-    removeUnreachableBlocks(F, DTU.hasValue() ? DTU.getPointer() : nullptr);
+    removeUnreachableBlocks(F, DTU ? DTU.getPointer() : nullptr);
  return !Worklist.empty();
 }
 PreservedAnalyses
 LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
-  if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F),
+  if (lowerConstantIntrinsics(F, AM.getResult<TargetLibraryAnalysis>(F),
                               AM.getCachedResult<DominatorTreeAnalysis>(F))) {
     PreservedAnalyses PA;
     PA.preserve<DominatorTreeAnalysis>();
@@ -178,8 +176,8 @@ public:
   }
   bool runOnFunction(Function &F) override {
-    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-    const TargetLibraryInfo *TLI = TLIP ?
&TLIP->getTLI(F) : nullptr; + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); DominatorTree *DT = nullptr; if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) DT = &DTWP->getDomTree(); @@ -187,6 +185,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); } @@ -196,6 +195,7 @@ public: char LowerConstantIntrinsics::ID = 0; INITIALIZE_PASS_BEGIN(LowerConstantIntrinsics, "lower-constant-intrinsics", "Lower constant intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(LowerConstantIntrinsics, "lower-constant-intrinsics", "Lower constant intrinsics", false, false) diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index a7eb60b5e032..88fad9896c59 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -21,12 +21,11 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/MisExpect.h" using namespace llvm; @@ -101,6 +100,8 @@ static bool handleSwitchExpect(SwitchInst &SI) { uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1; Weights[Index] = LikelyBranchWeightVal; + misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true); + SI.setCondition(ArgValue); SI.setMetadata(LLVMContext::MD_prof, @@ -315,13 +316,16 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(Fn->getIntrinsicID(), CI, 2); + SmallVector<uint32_t, 4> ExpectedWeights; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal); + ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal}; } else { Node = MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal); + ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal}; } if (CmpI) @@ -329,6 +333,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { else BSI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(BSI, ExpectedWeights); + BSI.setMetadata(LLVMContext::MD_prof, Node); return true; @@ -409,7 +415,7 @@ public: bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } }; -} +} // namespace char LowerExpectIntrinsic::ID = 0; INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 45f5929e3b90..8dc037b10cc8 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -49,9 +48,13 @@ static bool lowerGuardIntrinsic(Function &F) { return false; SmallVector<CallInst *, 8> ToLower; - for (auto &I : instructions(F)) - if (isGuard(&I)) - ToLower.push_back(cast<CallInst>(&I)); + // Traverse 
through the users of GuardDecl. + // This is presumably cheaper than traversing all instructions in the + // function. + for (auto *U : GuardDecl->users()) + if (auto *CI = dyn_cast(U)) + if (CI->getFunction() == &F) + ToLower.push_back(CI); if (ToLower.empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 296becb31e8f..c05906649f16 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -18,11 +18,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -704,10 +704,10 @@ public: // We may remove II. By default continue on the next/prev instruction. ++II; // If we were to erase II, move again. - auto EraseFromParent = [&II](Value *V) { + auto EraseFromParent = [&II, &BB](Value *V) { auto *Inst = cast(V); if (Inst->use_empty()) { - if (Inst == &*II) { + if (II != BB.rend() && Inst == &*II) { ++II; } Inst->eraseFromParent(); @@ -718,7 +718,7 @@ public: Instruction *NewInst = nullptr; IRBuilder<> IB(&I); - MatrixBuilder> Builder(IB); + MatrixBuilder Builder(IB); Value *TA, *TAMA, *TAMB; ConstantInt *R, *K, *C; @@ -766,28 +766,25 @@ public: // If we have a TT matmul, lift the transpose. We may be able to fold into // consuming multiply. for (BasicBlock &BB : Func) { - for (BasicBlock::iterator II = BB.begin(); II != BB.end();) { - Instruction *I = &*II; - // We may remove I. - ++II; + for (Instruction &I : llvm::make_early_inc_range(BB)) { Value *A, *B, *AT, *BT; ConstantInt *R, *K, *C; // A^t * B ^t -> (B * A)^t - if (match(&*I, m_Intrinsic( - m_Value(A), m_Value(B), m_ConstantInt(R), - m_ConstantInt(K), m_ConstantInt(C))) && + if (match(&I, m_Intrinsic( + m_Value(A), m_Value(B), m_ConstantInt(R), + m_ConstantInt(K), m_ConstantInt(C))) && match(A, m_Intrinsic(m_Value(AT))) && match(B, m_Intrinsic(m_Value((BT))))) { - IRBuilder<> IB(&*I); - MatrixBuilder> Builder(IB); + IRBuilder<> IB(&I); + MatrixBuilder Builder(IB); Value *M = Builder.CreateMatrixMultiply( BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue()); setShapeInfo(M, {C, R}); Instruction *NewInst = Builder.CreateMatrixTranspose( M, C->getZExtValue(), R->getZExtValue()); - ReplaceAllUsesWith(*I, NewInst); - if (I->use_empty()) - I->eraseFromParent(); + ReplaceAllUsesWith(I, NewInst); + if (I.use_empty()) + I.eraseFromParent(); if (A->use_empty()) cast(A)->eraseFromParent(); if (A != B && B->use_empty()) @@ -891,27 +888,27 @@ public: // having to update as many def-use and use-def chains. // // Because we add to ToRemove during fusion we can't guarantee that defs - // are before uses. Change uses to undef temporarily as these should get + // are before uses. Change uses to poison temporarily as these should get // removed as well. // - // For verification, we keep track of where we changed uses to undefs in - // UndefedInsts and then check that we in fact remove them. 
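// [Editorial sketch, not part of the patch] The LowerGuardIntrinsic change
// above replaces a whole-function instruction walk with a scan over the
// users of the intrinsic declaration; only calls inside F are kept, since a
// declaration's use list spans the whole module. A minimal standalone form,
// assuming F is the function being lowered:
//
// SmallVector<CallInst *, 8> ToLower;
// if (Function *GuardDecl = F.getParent()->getFunction(
//         Intrinsic::getName(Intrinsic::experimental_guard)))
//   for (User *U : GuardDecl->users())
//     if (auto *CI = dyn_cast<CallInst>(U))
//       if (CI->getFunction() == &F) // users may live in other functions
//         ToLower.push_back(CI);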
- SmallSet UndefedInsts; + // For verification, we keep track of where we changed uses to poison in + // PoisonedInsts and then check that we in fact remove them. + SmallSet PoisonedInsts; for (auto *Inst : reverse(ToRemove)) { for (Use &U : llvm::make_early_inc_range(Inst->uses())) { - if (auto *Undefed = dyn_cast(U.getUser())) - UndefedInsts.insert(Undefed); - U.set(UndefValue::get(Inst->getType())); + if (auto *Poisoned = dyn_cast(U.getUser())) + PoisonedInsts.insert(Poisoned); + U.set(PoisonValue::get(Inst->getType())); } Inst->eraseFromParent(); - UndefedInsts.erase(Inst); + PoisonedInsts.erase(Inst); } - if (!UndefedInsts.empty()) { - // If we didn't remove all undefed instructions, it's a hard error. - dbgs() << "Undefed but present instructions:\n"; - for (auto *I : UndefedInsts) + if (!PoisonedInsts.empty()) { + // If we didn't remove all poisoned instructions, it's a hard error. + dbgs() << "Poisoned but present instructions:\n"; + for (auto *I : PoisonedInsts) dbgs() << *I << "\n"; - llvm_unreachable("Undefed but instruction not removed"); + llvm_unreachable("Poisoned but instruction not removed"); } return Changed; @@ -1670,7 +1667,7 @@ public: for (unsigned I = 0; I < NewNumVecs; ++I) { // Build a single result vector. First initialize it. - Value *ResultVector = UndefValue::get( + Value *ResultVector = PoisonValue::get( FixedVectorType::get(VectorTy->getElementType(), NewNumElts)); // Go through the old elements and insert it into the resulting vector. for (auto J : enumerate(InputMatrix.vectors())) { diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 73b2cd06fa23..e2de322933bc 100644 --- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -13,8 +13,6 @@ #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -24,7 +22,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/GuardUtils.h" using namespace llvm; @@ -50,9 +47,13 @@ static bool lowerWidenableCondition(Function &F) { using namespace llvm::PatternMatch; SmallVector ToLower; - for (auto &I : instructions(F)) - if (match(&I, m_Intrinsic())) - ToLower.push_back(cast(&I)); + // Traverse through the users of WCDecl. + // This is presumably cheaper than traversing all instructions in the + // function. 
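// [Editorial sketch, not part of the patch] The undef-to-poison rename above
// is part of a pattern worth calling out: before bulk-erasing the fused
// instructions, remaining uses are redirected to poison and the touched
// users are recorded, so the code can assert that every poisoned
// instruction is itself erased. ToRemove is assumed to hold the
// instructions scheduled for deletion:
//
// SmallSet<Instruction *, 16> PoisonedInsts;
// for (Instruction *Inst : reverse(ToRemove)) {
//   for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
//     if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))
//       PoisonedInsts.insert(Poisoned);
//     U.set(PoisonValue::get(Inst->getType()));
//   }
//   Inst->eraseFromParent();
//   PoisonedInsts.erase(Inst);
// }
// assert(PoisonedInsts.empty() && "poisoned instruction was not removed");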
+ for (auto *U : WCDecl->users()) + if (auto *CI = dyn_cast(U)) + if (CI->getFunction() == &F) + ToLower.push_back(CI); if (ToLower.empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 5ffae128f5f0..a3f09a5a33c3 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,13 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 6698db26626b..1f5bc69acecd 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -28,14 +28,12 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" @@ -45,7 +43,6 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -61,15 +58,13 @@ #include #include #include -#include using namespace llvm; #define DEBUG_TYPE "memcpyopt" static cl::opt EnableMemCpyOptWithoutLibcalls( - "enable-memcpyopt-without-libcalls", cl::init(false), cl::Hidden, - cl::ZeroOrMore, + "enable-memcpyopt-without-libcalls", cl::Hidden, cl::desc("Enable memcpyopt even when libcalls are disabled")); STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); @@ -100,7 +95,7 @@ struct MemsetRange { Value *StartPtr; /// Alignment - The known alignment of the first store. - unsigned Alignment; + MaybeAlign Alignment; /// TheStores - The actual stores that make up this range. SmallVector TheStores; @@ -182,16 +177,16 @@ public: TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType()); assert(!StoreSize.isScalable() && "Can't track scalable-typed stores"); addRange(OffsetFromFirst, StoreSize.getFixedSize(), SI->getPointerOperand(), - SI->getAlign().value(), SI); + SI->getAlign(), SI); } void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { int64_t Size = cast(MSI->getLength())->getZExtValue(); - addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlign(), MSI); } - void addRange(int64_t Start, int64_t Size, Value *Ptr, - unsigned Alignment, Instruction *Inst); + void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment, + Instruction *Inst); }; } // end anonymous namespace @@ -200,7 +195,7 @@ public: /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. 
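// [Editorial sketch, not part of the patch] The MemsetRanges changes above
// migrate from a raw unsigned to MaybeAlign, which distinguishes "alignment
// unknown" from a concrete power-of-two Align:
//
// MaybeAlign MA;              // unset: nothing is known yet
// assert(!MA && "no alignment recorded");
// MA = Align(8);              // a known 8-byte alignment
// Align A = MA.valueOrOne();  // Align(8) here; Align(1) if MA were unset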
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, - unsigned Alignment, Instruction *Inst) { + MaybeAlign Alignment, Instruction *Inst) { int64_t End = Start+Size; range_iterator I = partition_point( @@ -352,9 +347,25 @@ static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc, // Check for mod of Loc between Start and End, excluding both boundaries. // Start and End can be in different blocks. -static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc, - const MemoryUseOrDef *Start, +static bool writtenBetween(MemorySSA *MSSA, AliasAnalysis &AA, + MemoryLocation Loc, const MemoryUseOrDef *Start, const MemoryUseOrDef *End) { + if (isa(End)) { + // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes. + // Manually check read accesses between Start and End, if they are in the + // same block, for clobbers. Otherwise assume Loc is clobbered. + return Start->getBlock() != End->getBlock() || + any_of( + make_range(std::next(Start->getIterator()), End->getIterator()), + [&AA, Loc](const MemoryAccess &Acc) { + if (isa(&Acc)) + return false; + Instruction *AccInst = + cast(&Acc)->getMemoryInst(); + return isModSet(AA.getModRefInfo(AccInst, Loc)); + }); + } + // TODO: Only walk until we hit Start. MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( End->getDefiningAccess(), Loc); @@ -492,7 +503,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, - MaybeAlign(Range.Alignment)); + Range.Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -749,36 +760,25 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - CallInst *C = nullptr; - if (auto *LoadClobber = dyn_cast( - MSSA->getWalker()->getClobberingMemoryAccess(LI))) { - // The load most post-dom the call. Limit to the same block for now. - // TODO: Support non-local call-slot optimization? - if (LoadClobber->getBlock() == SI->getParent()) - C = dyn_cast_or_null(LoadClobber->getMemoryInst()); - } - - if (C) { - // Check that nothing touches the dest of the "copy" between - // the call and the store. - MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C), - MSSA->getMemoryAccess(SI))) - C = nullptr; - } - - if (C) { - bool changed = performCallSlotOptzn( - LI, SI, SI->getPointerOperand()->stripPointerCasts(), - LI->getPointerOperand()->stripPointerCasts(), - DL.getTypeStoreSize(SI->getOperand(0)->getType()), - commonAlignment(SI->getAlign(), LI->getAlign()), C); - if (changed) { - eraseInstruction(SI); - eraseInstruction(LI); - ++NumMemCpyInstr; - return true; - } + auto GetCall = [&]() -> CallInst * { + // We defer this expensive clobber walk until the cheap checks + // have been done on the source inside performCallSlotOptzn. 
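// [Editorial sketch, not part of the patch] The comment above describes the
// new deferral idiom: performCallSlotOptzn now receives a callable instead
// of a CallInst *, so the MemorySSA clobber walk below only runs once the
// cheap structural checks have passed. MSSA and LI are assumed in scope:
//
// auto GetCall = [&]() -> CallInst * {
//   if (auto *LoadClobber = dyn_cast<MemoryDef>(
//           MSSA->getWalker()->getClobberingMemoryAccess(LI)))
//     return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
//   return nullptr;
// };
// // performCallSlotOptzn(..., GetCall) invokes GetCall() only after its
// // early bail-outs (scalable size, non-alloca source, size mismatch) pass.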
+ if (auto *LoadClobber = dyn_cast( + MSSA->getWalker()->getClobberingMemoryAccess(LI))) + return dyn_cast_or_null(LoadClobber->getMemoryInst()); + return nullptr; + }; + + bool changed = performCallSlotOptzn( + LI, SI, SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + DL.getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(SI->getAlign(), LI->getAlign()), GetCall); + if (changed) { + eraseInstruction(SI); + eraseInstruction(LI); + ++NumMemCpyInstr; + return true; } } } @@ -853,7 +853,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore, Value *cpyDest, Value *cpySrc, TypeSize cpySize, - Align cpyAlign, CallInst *C) { + Align cpyAlign, + std::function GetC) { // The general transformation to keep in mind is // // call @func(..., src, ...) @@ -872,11 +873,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (cpySize.isScalable()) return false; - // Lifetime marks shouldn't be operated on. - if (Function *F = C->getCalledFunction()) - if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) - return false; - // Require that src be an alloca. This simplifies the reasoning considerably. auto *srcAlloca = dyn_cast(cpySrc); if (!srcAlloca) @@ -893,6 +889,33 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (cpySize < srcSize) return false; + CallInst *C = GetC(); + if (!C) + return false; + + // Lifetime marks shouldn't be operated on. + if (Function *F = C->getCalledFunction()) + if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) + return false; + + + if (C->getParent() != cpyStore->getParent()) { + LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n"); + return false; + } + + MemoryLocation DestLoc = isa(cpyStore) ? + MemoryLocation::get(cpyStore) : + MemoryLocation::getForDest(cast(cpyStore)); + + // Check that nothing touches the dest of the copy between + // the call and the store/memcpy. + if (accessedBetween(*AA, DestLoc, MSSA->getMemoryAccess(C), + MSSA->getMemoryAccess(cpyStore))) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n"); + return false; + } + // Check that accessing the first srcSize bytes of dest will not cause a // trap. Otherwise the transform is invalid since it might cause a trap // to occur earlier than it otherwise would. @@ -902,6 +925,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; } + // Make sure that nothing can observe cpyDest being written early. There are // a number of cases to consider: // 1. cpyDest cannot be accessed between C and cpyStore as a precondition of @@ -1118,7 +1142,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // then we could still perform the xform by moving M up to the first memcpy. // TODO: It would be sufficient to check the MDep source up to the memcpy // size of M, rather than MDep. - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) return false; @@ -1215,14 +1239,14 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, } // By default, create an unaligned memset. - unsigned Align = 1; + Align Alignment = Align(1); // If Dest is aligned, and SrcSize is constant, use the minimum alignment // of the sum. 
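// [Editorial sketch, not part of the patch] The alignment logic referenced
// above: a memset emitted at Dest + SrcSize can only rely on the alignment
// common to the destination alignment and the constant offset, e.g.:
//
// Align DestAlign(16);  // assumed destination alignment
// uint64_t Offset = 24; // assumed constant SrcSize
// Align NewAlign = commonAlignment(DestAlign, Offset); // Align(8)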
- const unsigned DestAlign = - std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment()); + const Align DestAlign = std::max(MemSet->getDestAlign().valueOrOne(), + MemCpy->getDestAlign().valueOrOne()); if (DestAlign > 1) if (auto *SrcSizeC = dyn_cast(SrcSize)) - Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); + Alignment = commonAlignment(DestAlign, SrcSizeC->getZExtValue()); IRBuilder<> Builder(MemCpy); @@ -1241,11 +1265,11 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff); unsigned DestAS = Dest->getType()->getPointerAddressSpace(); Instruction *NewMemSet = Builder.CreateMemSet( - Builder.CreateGEP(Builder.getInt8Ty(), - Builder.CreatePointerCast(Dest, - Builder.getInt8PtrTy(DestAS)), - SrcSize), - MemSet->getOperand(1), MemsetLen, MaybeAlign(Align)); + Builder.CreateGEP( + Builder.getInt8Ty(), + Builder.CreatePointerCast(Dest, Builder.getInt8PtrTy(DestAS)), + SrcSize), + MemSet->getOperand(1), MemsetLen, Alignment); assert(isa(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && "MemCpy must be a MemoryDef"); @@ -1402,7 +1426,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } MemoryUseOrDef *MA = MSSA->getMemoryAccess(M); - MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA); + // FIXME: Not using getClobberingMemoryAccess() here due to PR54682. + MemoryAccess *AnyClobber = MA->getDefiningAccess(); MemoryLocation DestLoc = MemoryLocation::getForDest(M); const MemoryAccess *DestClobber = MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc); @@ -1431,28 +1456,20 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { if (Instruction *MI = MD->getMemoryInst()) { if (auto *CopySize = dyn_cast(M->getLength())) { if (auto *C = dyn_cast(MI)) { - // The memcpy must post-dom the call. Limit to the same block for - // now. Additionally, we need to ensure that there are no accesses - // to dest between the call and the memcpy. Accesses to src will be - // checked by performCallSlotOptzn(). - // TODO: Support non-local call-slot optimization? - if (C->getParent() == M->getParent() && - !accessedBetween(*AA, DestLoc, MD, MA)) { - // FIXME: Can we pass in either of dest/src alignment here instead - // of conservatively taking the minimum? - Align Alignment = std::min(M->getDestAlign().valueOrOne(), - M->getSourceAlign().valueOrOne()); - if (performCallSlotOptzn( - M, M, M->getDest(), M->getSource(), - TypeSize::getFixed(CopySize->getZExtValue()), Alignment, - C)) { - LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" - << " call: " << *C << "\n" - << " memcpy: " << *M << "\n"); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } + // FIXME: Can we pass in either of dest/src alignment here instead + // of conservatively taking the minimum? + Align Alignment = std::min(M->getDestAlign().valueOrOne(), + M->getSourceAlign().valueOrOne()); + if (performCallSlotOptzn( + M, M, M->getDest(), M->getSource(), + TypeSize::getFixed(CopySize->getZExtValue()), Alignment, + [C]() -> CallInst * { return C; })) { + LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" + << " call: " << *C << "\n" + << " memcpy: " << *M << "\n"); + eraseInstruction(M); + ++NumMemCpyInstr; + return true; } } } @@ -1557,7 +1574,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). 
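// [Editorial sketch, not part of the patch] For the MemoryDef case,
// writtenBetween above reduces to one MemorySSA query: find the access that
// clobbers Loc starting from End's defining access, and report a write iff
// that clobber is not dominated by Start:
//
// static bool clobberedBetween(MemorySSA *MSSA, MemoryLocation Loc,
//                              const MemoryUseOrDef *Start,
//                              const MemoryUseOrDef *End) {
//   MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
//       End->getDefiningAccess(), Loc);
//   return !MSSA->dominates(Clobber, Start);
// }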
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB))) return false; diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index aac0deea5be3..ce01ae5b2692 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -144,31 +144,33 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { LLVM_DEBUG(dbgs() << "volatile or atomic\n"); return {}; } - Value *const Addr = LoadI->getOperand(0); + Value *Addr = LoadI->getOperand(0); if (Addr->getType()->getPointerAddressSpace() != 0) { LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n"); return {}; } - auto *const GEP = dyn_cast(Addr); - if (!GEP) - return {}; - LLVM_DEBUG(dbgs() << "GEP\n"); - if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) { - LLVM_DEBUG(dbgs() << "used outside of block\n"); - return {}; - } - const auto &DL = GEP->getModule()->getDataLayout(); - if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) { + const auto &DL = LoadI->getModule()->getDataLayout(); + if (!isDereferenceablePointer(Addr, LoadI->getType(), DL)) { LLVM_DEBUG(dbgs() << "not dereferenceable\n"); // We need to make sure that we can do comparison in any order, so we // require memory to be unconditionnally dereferencable. return {}; } - APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0); - if (!GEP->accumulateConstantOffset(DL, Offset)) - return {}; - return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()), - Offset); + + APInt Offset = APInt(DL.getPointerTypeSizeInBits(Addr->getType()), 0); + Value *Base = Addr; + auto *GEP = dyn_cast(Addr); + if (GEP) { + LLVM_DEBUG(dbgs() << "GEP\n"); + if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) { + LLVM_DEBUG(dbgs() << "used outside of block\n"); + return {}; + } + if (!GEP->accumulateConstantOffset(DL, Offset)) + return {}; + Base = GEP->getPointerOperand(); + } + return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset); } // A comparison between two BCE atoms, e.g. `a == o.a` in the example at the @@ -244,7 +246,7 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, auto MayClobber = [&](LoadInst *LI) { // If a potentially clobbering instruction comes before the load, // we can still safely sink the load. - return !Inst->comesBefore(LI) && + return (Inst->getParent() != LI->getParent() || !Inst->comesBefore(LI)) && isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); }; if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) @@ -270,9 +272,8 @@ void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const { } // Do the actual spliting. 
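// [Editorial sketch, not part of the patch] The MergeICmps change above lets
// a load form a BCEAtom even without a GEP: the offset defaults to zero and
// the loaded pointer itself becomes the base. A fragment in the style of
// visitICmpLoadOperand, with Addr and DL assumed in scope:
//
// APInt Offset(DL.getPointerTypeSizeInBits(Addr->getType()), 0);
// Value *Base = Addr;
// if (auto *GEP = dyn_cast<GetElementPtrInst>(Addr)) {
//   if (!GEP->accumulateConstantOffset(DL, Offset))
//     return {}; // bail out on a non-constant offset
//   Base = GEP->getPointerOperand();
// }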
- for (Instruction *Inst : reverse(OtherInsts)) { - Inst->moveBefore(&*NewParent->begin()); - } + for (Instruction *Inst : reverse(OtherInsts)) + Inst->moveBefore(*NewParent, NewParent->begin()); } bool BCECmpBlock::canSplit(AliasAnalysis &AA) const { @@ -368,8 +369,11 @@ Optional visitCmpBlock(Value *const Val, BasicBlock *const Block, return None; BCECmpBlock::InstructionSet BlockInsts( - {Result->Lhs.GEP, Result->Rhs.GEP, Result->Lhs.LoadI, Result->Rhs.LoadI, - Result->CmpI, BranchI}); + {Result->Lhs.LoadI, Result->Rhs.LoadI, Result->CmpI, BranchI}); + if (Result->Lhs.GEP) + BlockInsts.insert(Result->Lhs.GEP); + if (Result->Rhs.GEP) + BlockInsts.insert(Result->Rhs.GEP); return BCECmpBlock(std::move(*Result), Block, BlockInsts); } @@ -604,8 +608,15 @@ static BasicBlock *mergeComparisons(ArrayRef Comparisons, NextCmpBlock->getParent(), InsertBefore); IRBuilder<> Builder(BB); // Add the GEPs from the first BCECmpBlock. - Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); - Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); + Value *Lhs, *Rhs; + if (FirstCmp.Lhs().GEP) + Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); + else + Lhs = FirstCmp.Lhs().LoadI->getPointerOperand(); + if (FirstCmp.Rhs().GEP) + Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); + else + Rhs = FirstCmp.Rhs().LoadI->getPointerOperand(); Value *IsEqual = nullptr; LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> " diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 734532a6670c..6383d6ea838b 100644 --- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -76,13 +76,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Metadata.h" +#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index f35c9212a6f9..876ef3c427a6 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -88,8 +88,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -1076,6 +1074,9 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, Value *Arg1, Value *Arg2, Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(2); + // TODO: we need to remove context instruction after Value Tracking + // can run without context instruction + const SimplifyQuery Q = SQ.getWithInstruction(I); E->setType(T); E->setOpcode(Opcode); @@ -1091,7 +1092,7 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, E->op_push_back(lookupOperandLeader(Arg1)); E->op_push_back(lookupOperandLeader(Arg2)); - Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ); + Value *V = simplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = 
checkExprResults(E, I, V)) { addAdditionalUsers(Simplified, I); return Simplified.Expr; @@ -1147,6 +1148,9 @@ NewGVN::ExprResult NewGVN::checkExprResults(Expression *E, Instruction *I, NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); + // TODO: we need to remove context instruction after Value Tracking + // can run without context instruction + const SimplifyQuery Q = SQ.getWithInstruction(I); bool AllConstant = setBasicExpressionInfo(I, E); @@ -1169,13 +1173,13 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { Predicate = CmpInst::getSwappedPredicate(Predicate); } E->setOpcode((CI->getOpcode() << 8) | Predicate); - // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands + // TODO: 25% of our time is spent in simplifyCmpInst with pointer operands assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() && "Wrong types on cmp instruction"); assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() && E->getOperand(1)->getType() == I->getOperand(1)->getType())); Value *V = - SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ); + simplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (isa(I)) { @@ -1183,26 +1187,26 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { E->getOperand(1) == E->getOperand(2)) { assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() && E->getOperand(2)->getType() == I->getOperand(2)->getType()); - Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1), - E->getOperand(2), SQ); + Value *V = simplifySelectInst(E->getOperand(0), E->getOperand(1), + E->getOperand(2), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } } else if (I->isBinaryOp()) { Value *V = - SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ); + simplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (auto *CI = dyn_cast(I)) { Value *V = - SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ); + simplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (auto *GEPI = dyn_cast(I)) { Value *V = - SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), + simplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), makeArrayRef(std::next(E->op_begin()), E->op_end()), - GEPI->isInBounds(), SQ); + GEPI->isInBounds(), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -1453,10 +1457,12 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (Offset >= 0) { if (auto *C = dyn_cast( lookupOperandLeader(DepSI->getValueOperand()))) { - LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI - << " to constant " << *C << "\n"); - return createConstantExpression( - getConstantStoreValueForLoad(C, Offset, LoadType, DL)); + if (Constant *Res = + getConstantStoreValueForLoad(C, Offset, LoadType, DL)) { + LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI + << " to constant " << *Res << "\n"); + return createConstantExpression(Res); + } } } } else if (auto *DepLI = dyn_cast(DepInst)) { @@ -1503,9 +1509,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, else if (auto *II = 
dyn_cast(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) return createConstantExpression(UndefValue::get(LoadType)); - } else if (isAllocationFn(DepInst, TLI)) - if (auto *InitVal = getInitialValueOfAllocation(cast(DepInst), - TLI, LoadType)) + } else if (auto *InitVal = + getInitialValueOfAllocation(DepInst, TLI, LoadType)) return createConstantExpression(InitVal); return nullptr; @@ -3142,9 +3147,8 @@ bool NewGVN::singleReachablePHIPath( // connected component finding in this routine, and it's probably not worth // the complexity for the time being. So, we just keep a set of visited // MemoryAccess and return true when we hit a cycle. - if (Visited.count(First)) + if (!Visited.insert(First).second) return true; - Visited.insert(First); const auto *EndDef = First; for (auto *ChainDef : optimized_def_chain(First)) { @@ -3353,7 +3357,7 @@ void NewGVN::verifyStoreExpressions() const { // instruction set, propagating value numbers, marking things touched, etc, // until the set of touched instructions is completely empty. void NewGVN::iterateTouchedInstructions() { - unsigned int Iterations = 0; + uint64_t Iterations = 0; // Figure out where touchedinstructions starts int FirstInstr = TouchedInstructions.find_first(); // Nothing set, nothing to iterate, just return. diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index e0d0301c1ef6..689a2a286cb9 100644 --- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -125,6 +125,9 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, if (Call->isNoBuiltin() || Call->isStrictFP()) continue; + if (Call->isMustTailCall()) + continue; + // Skip if function either has local linkage or is not a known library // function. LibFunc LF; @@ -137,7 +140,7 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, case LibFunc_sqrt: if (TTI->haveFastSqrt(Call->getType()) && optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI, - DTU.hasValue() ? DTU.getPointer() : nullptr)) + DTU ? 
DTU.getPointer() : nullptr)) break; continue; default: diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index a110f7d5c241..e1cc3fc71c3e 100644 --- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -53,9 +53,9 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManager.h" @@ -65,6 +65,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "safepoint-placement" diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index c354fa177a60..da1737979305 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -24,7 +24,6 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -42,7 +41,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -54,7 +52,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -183,7 +180,7 @@ void ReassociatePass::BuildRankMap(Function &F, // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (Instruction &I : *BB) - if (mayBeMemoryDependent(I)) + if (mayHaveNonDefUseDependency(I)) ValueRankMap[&I] = ++BBRank; } } @@ -1076,7 +1073,7 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { BinaryOperator *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); - Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Shl->setOperand(0, PoisonValue::get(Shl->getType())); // Drop use of op. Mul->takeName(Shl); // Everyone now refers to the mul instruction. diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index a49b9ad3f62b..9dc64493a9ee 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -24,8 +24,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index b795ad3899bc..51e4a5773f3e 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -258,6 +258,7 @@ struct GCPtrLivenessData { // base relation will remain. 
Internally, we add a mixture of the two // types, then update all the second type to the first type using DefiningValueMapTy = MapVector; +using IsKnownBaseMapTy = MapVector; using PointerToBaseTy = MapVector; using StatepointLiveSetTy = SetVector; using RematerializedValueMapTy = @@ -281,19 +282,29 @@ struct PartiallyConstructedSafepointRecord { RematerializedValueMapTy RematerializedValues; }; +struct RematerizlizationCandidateRecord { + // Chain from derived pointer to base. + SmallVector ChainToBase; + // Original base. + Value *RootOfChain; + // Cost of chain. + InstructionCost Cost; +}; +using RematCandTy = MapVector; + } // end anonymous namespace static ArrayRef GetDeoptBundleOperands(const CallBase *Call) { Optional DeoptBundle = Call->getOperandBundle(LLVMContext::OB_deopt); - if (!DeoptBundle.hasValue()) { + if (!DeoptBundle) { assert(AllowStatepointWithNoDeoptInfo && "Found non-leaf call without deopt info!"); return None; } - return DeoptBundle.getValue().Inputs; + return DeoptBundle->Inputs; } /// Compute the live-in set for every basic block in the function @@ -385,45 +396,16 @@ static void analyzeParsePointLiveness( Result.LiveSet = LiveSet; } -// Returns true is V is a knownBaseResult. -static bool isKnownBaseResult(Value *V); - -// Returns true if V is a BaseResult that already exists in the IR, i.e. it is -// not created by the findBasePointers algorithm. -static bool isOriginalBaseResult(Value *V); - -namespace { - -/// A single base defining value - An immediate base defining value for an -/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. -/// For instructions which have multiple pointer [vector] inputs or that -/// transition between vector and scalar types, there is no immediate base -/// defining value. The 'base defining value' for 'Def' is the transitive -/// closure of this relation stopping at the first instruction which has no -/// immediate base defining value. The b.d.v. might itself be a base pointer, -/// but it can also be an arbitrary derived pointer. -struct BaseDefiningValueResult { - /// Contains the value which is the base defining value. - Value * const BDV; - - /// True if the base defining value is also known to be an actual base - /// pointer. - const bool IsKnownBase; - - BaseDefiningValueResult(Value *BDV, bool IsKnownBase) - : BDV(BDV), IsKnownBase(IsKnownBase) { -#ifndef NDEBUG - // Check consistency between new and old means of checking whether a BDV is - // a base. - bool MustBeBase = isKnownBaseResult(BDV); - assert(!MustBeBase || MustBeBase == IsKnownBase); -#endif - } -}; +/// Returns true if V is a known base. +static bool isKnownBase(Value *V, const IsKnownBaseMapTy &KnownBases); -} // end anonymous namespace +/// Caches the IsKnownBase flag for a value and asserts that it wasn't present +/// in the cache before. +static void setKnownBase(Value *V, bool IsKnownBase, + IsKnownBaseMapTy &KnownBases); -static BaseDefiningValueResult findBaseDefiningValue(Value *I); +static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector @@ -434,76 +416,122 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I); /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the later, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'. 
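// [Editorial sketch, not part of the patch] The RS4GC rewrite below replaces
// the old BaseDefiningValueResult (value plus IsKnownBase flag) with two
// memoizing side tables, so repeated walks over long derived-pointer chains
// stay linear. The shape of each case, with an illustrative base check:
//
// static Value *findBDVCached(Value *I, MapVector<Value *, Value *> &Cache,
//                             MapVector<Value *, bool> &KnownBases) {
//   auto Cached = Cache.find(I);
//   if (Cached != Cache.end())
//     return Cached->second;       // memoized answer
//   Value *BDV = I;                // real code dispatches on many kinds
//   bool IsKnownBase = isa<Argument>(I) || isa<GlobalValue>(I);
//   Cache[I] = BDV;
//   KnownBases[BDV] = IsKnownBase; // setKnownBase equivalent
//   return BDV;
// }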
-static BaseDefiningValueResult -findBaseDefiningValueOfVector(Value *I) { +static Value *findBaseDefiningValueOfVector(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { // Each case parallels findBaseDefiningValue below, see that code for // detailed motivation. - if (isa(I)) + auto Cached = Cache.find(I); + if (Cached != Cache.end()) + return Cached->second; + + if (isa(I)) { // An incoming argument to the function is a base pointer - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (isa(I)) + if (isa(I)) { // Base of constant vector consists only of constant null pointers. // For reasoning see similar case inside 'findBaseDefiningValue' function. - return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()), - true); + auto *CAZ = ConstantAggregateZero::get(I->getType()); + Cache[I] = CAZ; + setKnownBase(CAZ, /* IsKnownBase */true, KnownBases); + return CAZ; + } - if (isa(I)) - return BaseDefiningValueResult(I, true); + if (isa(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (isa(I)) + if (isa(I)) { // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; + } - if (isa(I)) + if (isa(I)) { // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. // TODO: There a number of local optimizations which could be applied here // for particular sufflevector patterns. - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; + } // The behavior of getelementptr instructions is the same for vector and // non-vector data types. - if (auto *GEP = dyn_cast(I)) - return findBaseDefiningValue(GEP->getPointerOperand()); + if (auto *GEP = dyn_cast(I)) { + auto *BDV = + findBaseDefiningValue(GEP->getPointerOperand(), Cache, KnownBases); + Cache[GEP] = BDV; + return BDV; + } + + // The behavior of freeze instructions is the same for vector and + // non-vector data types. + if (auto *Freeze = dyn_cast(I)) { + auto *BDV = findBaseDefiningValue(Freeze->getOperand(0), Cache, KnownBases); + Cache[Freeze] = BDV; + return BDV; + } // If the pointer comes through a bitcast of a vector of pointers to // a vector of another type of pointer, then look through the bitcast - if (auto *BC = dyn_cast(I)) - return findBaseDefiningValue(BC->getOperand(0)); + if (auto *BC = dyn_cast(I)) { + auto *BDV = findBaseDefiningValue(BC->getOperand(0), Cache, KnownBases); + Cache[BC] = BDV; + return BDV; + } // We assume that functions in the source language only return base // pointers. This should probably be generalized via attributes to support // both source language and internal functions. - if (isa(I) || isa(I)) - return BaseDefiningValueResult(I, true); + if (isa(I) || isa(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. 
assert((isa(I) || isa(I)) && "unknown vector instruction - no base found for vector element"); - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; } /// Helper function for findBasePointer - Will return a value which either a) /// defines the base pointer for the input, b) blocks the simple search /// (i.e. a PHI or Select of two derived pointers), or c) involves a change /// from pointer to vector type or back. -static BaseDefiningValueResult findBaseDefiningValue(Value *I) { +static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { assert(I->getType()->isPtrOrPtrVectorTy() && "Illegal to ask for the base pointer of a non-pointer type"); + auto Cached = Cache.find(I); + if (Cached != Cache.end()) + return Cached->second; if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I); + return findBaseDefiningValueOfVector(I, Cache, KnownBases); - if (isa(I)) + if (isa(I)) { // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't an gc value - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } if (isa(I)) { // We assume that objects with a constant base (e.g. a global) can't move @@ -516,8 +544,10 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // "phi (const1, const2)" or "phi (const, regular gc ptr)". // See constant.ll file for relevant test cases. - return BaseDefiningValueResult( - ConstantPointerNull::get(cast(I->getType())), true); + auto *CPN = ConstantPointerNull::get(cast(I->getType())); + Cache[I] = CPN; + setKnownBase(CPN, /* IsKnownBase */true, KnownBases); + return CPN; } // inttoptrs in an integral address space are currently ill-defined. We @@ -525,8 +555,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // constant rule above and because we don't really have a better semantic // to give them. Note that the optimizer is always free to insert undefined // behavior on dynamically dead paths as well. - if (isa(I)) - return BaseDefiningValueResult(I, true); + if (isa(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } if (CastInst *CI = dyn_cast(I)) { Value *Def = CI->stripPointerCasts(); @@ -539,16 +572,31 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // not simply a pointer cast (i.e. an inttoptr). We don't know how to // handle int->ptr conversion. 
assert(!isa(Def) && "shouldn't find another cast here"); - return findBaseDefiningValue(Def); + auto *BDV = findBaseDefiningValue(Def, Cache, KnownBases); + Cache[CI] = BDV; + return BDV; } - if (isa(I)) + if (isa(I)) { // The value loaded is an gc base itself - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (GetElementPtrInst *GEP = dyn_cast(I)) + if (GetElementPtrInst *GEP = dyn_cast(I)) { // The base of this GEP is the base - return findBaseDefiningValue(GEP->getPointerOperand()); + auto *BDV = + findBaseDefiningValue(GEP->getPointerOperand(), Cache, KnownBases); + Cache[GEP] = BDV; + return BDV; + } + + if (auto *Freeze = dyn_cast(I)) { + auto *BDV = findBaseDefiningValue(Freeze->getOperand(0), Cache, KnownBases); + Cache[Freeze] = BDV; + return BDV; + } if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { @@ -569,24 +617,32 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { llvm_unreachable( "interaction with the gcroot mechanism is not supported"); case Intrinsic::experimental_gc_get_pointer_base: - return findBaseDefiningValue(II->getOperand(0)); + auto *BDV = findBaseDefiningValue(II->getOperand(0), Cache, KnownBases); + Cache[II] = BDV; + return BDV; } } // We assume that functions in the source language only return base // pointers. This should probably be generalized via attributes to support // both source language and internal functions. - if (isa(I) || isa(I)) - return BaseDefiningValueResult(I, true); + if (isa(I) || isa(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // TODO: I have absolutely no idea how to implement this part yet. It's not // necessarily hard, I just haven't really looked at it yet. assert(!isa(I) && "Landing Pad is unimplemented"); - if (isa(I)) + if (isa(I)) { // A CAS is effectively a atomic store and load combined under a // predicate. From the perspective of base pointers, we just treat it // like a load. - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } assert(!isa(I) && "Xchg handled above, all others are " "binary ops which don't apply to pointers"); @@ -594,8 +650,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // The aggregate ops. Aggregates can either be in the heap or on the // stack, but in either case, this is simply a field load. As a result, // this is a defining definition of the base just like a load is. - if (isa(I)) - return BaseDefiningValueResult(I, true); + if (isa(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // We should never see an insert vector since that would require we be // tracing back a struct value not a pointer value. @@ -606,6 +665,8 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // substituting gc.get.pointer.base() intrinsic. bool IsKnownBase = isa(I) && cast(I)->getMetadata("is_base_value"); + setKnownBase(I, /* IsKnownBase */IsKnownBase, KnownBases); + Cache[I] = I; // An extractelement produces a base result exactly when it's input does. // We may need to insert a parallel instruction to extract the appropriate @@ -615,33 +676,38 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // Note: There a lot of obvious peephole cases here. This are deliberately // handled after the main base pointer inference algorithm to make writing // test cases to exercise that code easier. 
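// [Editorial sketch, not part of the patch] The "is_base_value" marker used
// above: base phis/selects manufactured by RS4GC are tagged with an empty
// MDNode so that a later pass (or rerun) recognizes them as known bases.
// BaseInst and I are assumed in scope:
//
// BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
// bool IsKnownBase = isa<Instruction>(I) &&
//                    cast<Instruction>(I)->getMetadata("is_base_value");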
- return BaseDefiningValueResult(I, IsKnownBase); + return I; // The last two cases here don't return a base pointer. Instead, they // return a value which dynamically selects from among several base // derived pointers (each with it's own base potentially). It's the job of // the caller to resolve these. assert((isa(I) || isa(I)) && - "missing instruction case in findBaseDefiningValing"); - return BaseDefiningValueResult(I, IsKnownBase); + "missing instruction case in findBaseDefiningValue"); + return I; } /// Returns the base defining value for this value. -static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { - Value *&Cached = Cache[I]; - if (!Cached) { - Cached = findBaseDefiningValue(I).BDV; +static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + if (Cache.find(I) == Cache.end()) { + auto *BDV = findBaseDefiningValue(I, Cache, KnownBases); + Cache[I] = BDV; LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " - << Cached->getName() << "\n"); + << Cache[I]->getName() << ", is known base = " + << KnownBases[I] << "\n"); } assert(Cache[I] != nullptr); - return Cached; + assert(KnownBases.find(Cache[I]) != KnownBases.end() && + "Cached value must be present in known bases map"); + return Cache[I]; } /// Return a base pointer for this value if known. Otherwise, return it's /// base defining value. -static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { - Value *Def = findBaseDefiningValueCached(I, Cache); +static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + Value *Def = findBaseDefiningValueCached(I, Cache, KnownBases); auto Found = Cache.find(Def); if (Found != Cache.end()) { // Either a base-of relation, or a self reference. Caller must check. @@ -651,6 +717,7 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { return Def; } +#ifndef NDEBUG /// This value is a base pointer that is not generated by RS4GC, i.e. it already /// exists in the code. static bool isOriginalBaseResult(Value *V) { @@ -659,21 +726,22 @@ static bool isOriginalBaseResult(Value *V) { !isa(V) && !isa(V) && !isa(V); } +#endif -/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, -/// is it known to be a base pointer? Or do we need to continue searching. -static bool isKnownBaseResult(Value *V) { - if (isOriginalBaseResult(V)) - return true; - if (isa(V) && - cast(V)->getMetadata("is_base_value")) { - // This is a previously inserted base phi or select. We know - // that this is a base value. - return true; - } +static bool isKnownBase(Value *V, const IsKnownBaseMapTy &KnownBases) { + auto It = KnownBases.find(V); + assert(It != KnownBases.end() && "Value not present in the map"); + return It->second; +} - // We need to keep searching - return false; +static void setKnownBase(Value *V, bool IsKnownBase, + IsKnownBaseMapTy &KnownBases) { +#ifndef NDEBUG + auto It = KnownBases.find(V); + if (It != KnownBases.end()) + assert(It->second == IsKnownBase && "Changing already present value"); +#endif + KnownBases[V] = IsKnownBase; } // Returns true if First and Second values are both scalar or both vector. @@ -801,10 +869,11 @@ static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { /// For gc objects, this is simply itself. On success, returns a value which is /// the base pointer. (This is reliable and can be used for relocation.) On /// failure, returns nullptr. 
-static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { - Value *Def = findBaseOrBDV(I, Cache); +static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + Value *Def = findBaseOrBDV(I, Cache, KnownBases); - if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I)) + if (isKnownBase(Def, KnownBases) && areBothVectorOrScalar(Def, I)) return Def; // Here's the rough algorithm: @@ -887,8 +956,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { assert(!isOriginalBaseResult(Current) && "why did it get added?"); auto visitIncomingValue = [&](Value *InVal) { - Value *Base = findBaseOrBDV(InVal, Cache); - if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal)) + Value *Base = findBaseOrBDV(InVal, Cache, KnownBases); + if (isKnownBase(Base, KnownBases) && areBothVectorOrScalar(Base, InVal)) // Known bases won't need new instructions introduced and can be // ignored safely. However, this can only be done when InVal and Base // are both scalar or both vector. Otherwise, we need to find a @@ -924,12 +993,16 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { for (auto Pair : States) { Value *BDV = Pair.first; auto canPruneInput = [&](Value *V) { - Value *BDV = findBaseOrBDV(V, Cache); - if (V->stripPointerCasts() != BDV) + // If the input of the BDV is the BDV itself we can prune it. This is + // only possible if the BDV is a PHI node. + if (V->stripPointerCasts() == BDV) + return true; + Value *VBDV = findBaseOrBDV(V, Cache, KnownBases); + if (V->stripPointerCasts() != VBDV) return false; // The assumption is that anything not in the state list is // propagates a base pointer. - return States.count(BDV) == 0; + return States.count(VBDV) == 0; }; bool CanPrune = true; @@ -975,13 +1048,13 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || + assert((!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) && "why did it get added?"); BDVState NewState(BDV); visitBDVOperands(BDV, [&](Value *Op) { - Value *BDV = findBaseOrBDV(Op, Cache); + Value *BDV = findBaseOrBDV(Op, Cache, KnownBases); auto OpState = GetStateForBDV(BDV, Op); NewState.meet(OpState); }); @@ -1014,8 +1087,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. 
- assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) && - "why did it get added?"); + assert( + (!isKnownBase(I, KnownBases) || !areBothVectorOrScalar(I, BaseValue)) && + "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); if (!State.isBase() || !isa(BaseValue->getType())) @@ -1033,6 +1107,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE); BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); States[I] = BDVState(I, BDVState::Base, BaseInst); + setKnownBase(BaseInst, /* IsKnownBase */true, KnownBases); } else if (!isa(I->getType())) { // We need to handle cases that have a vector base but the instruction is // a scalar type (these could be phis or selects or any instruction that @@ -1055,7 +1130,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) && + assert((!isKnownBase(I, KnownBases) || + !areBothVectorOrScalar(I, State.getBaseValue())) && "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); @@ -1087,6 +1163,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Add metadata marking this as a base value BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); States[I] = BDVState(I, BDVState::Conflict, BaseInst); + setKnownBase(BaseInst, /* IsKnownBase */true, KnownBases); } #ifndef NDEBUG @@ -1102,7 +1179,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // assured to be able to determine an instruction which produces it's base // pointer. auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { - Value *BDV = findBaseOrBDV(Input, Cache); + Value *BDV = findBaseOrBDV(Input, Cache, KnownBases); Value *Base = nullptr; if (!States.count(BDV)) { assert(areBothVectorOrScalar(BDV, Input)); @@ -1129,7 +1206,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || + assert((!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, State.getBaseValue())) && "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); @@ -1154,13 +1231,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { #ifndef NDEBUG Value *OldBase = BlockToValue[InBB]; Value *Base = getBaseForInput(InVal, nullptr); + + // We can't use `stripPointerCasts` instead of this function because + // `stripPointerCasts` doesn't handle vectors of pointers. + auto StripBitCasts = [](Value *V) -> Value * { + while (auto *BC = dyn_cast(V)) + V = BC->getOperand(0); + return V; + }; // In essence this assert states: the only way two values // incoming from the same basic block may be different is by // being different bitcasts of the same value. A cleanup // that remains TODO is changing findBaseOrBDV to return an // llvm::Value of the correct type (and still remain pure). // This will remove the need to add bitcasts. 
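The local StripBitCasts lambda exists because Value::stripPointerCasts declines to look through bitcasts of vectors of pointers, which is exactly the case the assert needs to tolerate. The same walk restated as a reusable helper, assuming the usual LLVM headers:

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Walks through bitcasts only; unlike Value::stripPointerCasts this also
    // looks through bitcasts between vectors of pointers.
    static Value *stripBitCastsOnly(Value *V) {
      while (auto *BC = dyn_cast<BitCastInst>(V))
        V = BC->getOperand(0);
      return V;
    }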
- assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() && + assert(StripBitCasts(Base) == StripBitCasts(OldBase) && "findBaseOrBDV should be pure!"); #endif } @@ -1223,8 +1308,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) && - "why did it get added?"); + assert( + (!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, Base)) && + "why did it get added?"); LLVM_DEBUG( dbgs() << "Updating base value cache" @@ -1255,9 +1341,10 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, PointerToBaseTy &PointerToBase, DominatorTree *DT, - DefiningValueMapTy &DVCache) { + DefiningValueMapTy &DVCache, + IsKnownBaseMapTy &KnownBases) { for (Value *ptr : live) { - Value *base = findBasePointer(ptr, DVCache); + Value *base = findBasePointer(ptr, DVCache, KnownBases); assert(base && "failed to find base pointer"); PointerToBase[ptr] = base; assert((!isa(base) || !isa(ptr) || @@ -1272,7 +1359,8 @@ static void findBasePointers(const StatepointLiveSetTy &live, static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, CallBase *Call, PartiallyConstructedSafepointRecord &result, - PointerToBaseTy &PointerToBase) { + PointerToBaseTy &PointerToBase, + IsKnownBaseMapTy &KnownBases) { StatepointLiveSetTy PotentiallyDerivedPointers = result.LiveSet; // We assume that all pointers passed to deopt are base pointers; as an // optimization, we can use this to avoid seperately materializing the base @@ -1286,7 +1374,8 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, PotentiallyDerivedPointers.remove(V); PointerToBase[V] = V; } - findBasePointers(PotentiallyDerivedPointers, PointerToBase, &DT, DVCache); + findBasePointers(PotentiallyDerivedPointers, PointerToBase, &DT, DVCache, + KnownBases); } /// Given an updated version of the dataflow liveness results, update the @@ -1349,23 +1438,23 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] = // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeList legalizeCallAttributes(LLVMContext &Ctx, - AttributeList AL) { - if (AL.isEmpty()) - return AL; + AttributeList OrigAL, + AttributeList StatepointAL) { + if (OrigAL.isEmpty()) + return StatepointAL; // Remove the readonly, readnone, and statepoint function attributes. 
- AttrBuilder FnAttrs(Ctx, AL.getFnAttrs()); + AttrBuilder FnAttrs(Ctx, OrigAL.getFnAttrs()); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); - for (Attribute A : AL.getFnAttrs()) { + for (Attribute A : OrigAL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) FnAttrs.removeAttribute(A); } // Just skip parameter and return attributes for now - return AttributeList::get(Ctx, AttributeList::FunctionIndex, - AttributeSet::get(Ctx, FnAttrs)); + return StatepointAL.addFnAttributes(Ctx, FnAttrs); } /// Helper function to place all gc relocates necessary for the given @@ -1570,8 +1659,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ assert(DeoptLowering.equals("live-through") && "Unsupported value!"); } - Value *CallTarget = Call->getCalledOperand(); - if (Function *F = dyn_cast(CallTarget)) { + FunctionCallee CallTarget(Call->getFunctionType(), Call->getCalledOperand()); + if (Function *F = dyn_cast(CallTarget.getCallee())) { auto IID = F->getIntrinsicID(); if (IID == Intrinsic::experimental_deoptimize) { // Calls to llvm.experimental.deoptimize are lowered to calls to the @@ -1589,8 +1678,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // the same module. This is fine -- we assume the frontend knew what it // was doing when generating this kind of IR. CallTarget = F->getParent() - ->getOrInsertFunction("__llvm_deoptimize", FTy) - .getCallee(); + ->getOrInsertFunction("__llvm_deoptimize", FTy); IsDeoptimize = true; } else if (IID == Intrinsic::memcpy_element_unordered_atomic || @@ -1686,8 +1774,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ CallTarget = F->getParent() - ->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy) - .getCallee(); + ->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy); } } @@ -1705,8 +1792,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // function attributes. In case if we can handle this set of attributes - // set up function attrs directly on statepoint and return attrs later for // gc_result intrinsic. - SPCall->setAttributes( - legalizeCallAttributes(CI->getContext(), CI->getAttributes())); + SPCall->setAttributes(legalizeCallAttributes( + CI->getContext(), CI->getAttributes(), SPCall->getAttributes())); Token = cast(SPCall); @@ -1732,8 +1819,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // function attributes. In case if we can handle this set of attributes - // set up function attrs directly on statepoint and return attrs later for // gc_result intrinsic. - SPInvoke->setAttributes( - legalizeCallAttributes(II->getContext(), II->getAttributes())); + SPInvoke->setAttributes(legalizeCallAttributes( + II->getContext(), II->getAttributes(), SPInvoke->getAttributes())); Token = cast(SPInvoke); @@ -2071,6 +2158,7 @@ static void relocationViaAlloca( assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); + (void) NumRematerializedValues; if (!PromotableAllocas.empty()) { // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); @@ -2221,27 +2309,25 @@ static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPh return true; } -// From the statepoint live set pick values that are cheaper to recompute then -// to relocate. Remove this values from the live set, rematerialize them after -// statepoint and record them in "Info" structure. Note that similar to -// relocated values we don't do any user adjustments here. 
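Carrying the call target as a FunctionCallee rather than a bare Value is an opaque-pointer accommodation: the callee's FunctionType can no longer be recovered from its pointer type, so the type must travel with the value. A reduced sketch of the pattern (the pass builds a dedicated FunctionType for __llvm_deoptimize; the call site's own type is reused here only to keep the sketch short):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static FunctionCallee getRewrittenTarget(CallBase *Call) {
      // Pair the callee operand with the type recorded on the call site.
      FunctionCallee Target(Call->getFunctionType(), Call->getCalledOperand());
      // Swapping in another target keeps a FunctionType alongside the new
      // callee, so later code never rederives it from an opaque pointer.
      if (auto *F = dyn_cast<Function>(Target.getCallee()))
        return F->getParent()->getOrInsertFunction("__llvm_deoptimize",
                                                   Call->getFunctionType());
      return Target;
    }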
-static void rematerializeLiveValues(CallBase *Call,
-                                    PartiallyConstructedSafepointRecord &Info,
-                                    PointerToBaseTy &PointerToBase,
-                                    TargetTransformInfo &TTI) {
+// Find derived pointers that can be recomputed cheap enough and fill
+// RematerizationCandidates with such candidates.
+static void
+findRematerializationCandidates(PointerToBaseTy PointerToBase,
+                                RematCandTy &RematerizationCandidates,
+                                TargetTransformInfo &TTI) {
   const unsigned int ChainLengthThreshold = 10;
 
-  // Record values we are going to delete from this statepoint live set.
-  // We can not di this in following loop due to iterator invalidation.
-  SmallVector<Value *, 32> LiveValuesToBeDeleted;
+  for (auto P2B : PointerToBase) {
+    auto *Derived = P2B.first;
+    auto *Base = P2B.second;
+    // Consider only derived pointers.
+    if (Derived == Base)
+      continue;
 
-  for (Value *LiveValue: Info.LiveSet) {
-    // For each live pointer find its defining chain
+    // For each live pointer find its defining chain.
     SmallVector<Instruction *, 3> ChainToBase;
-    assert(PointerToBase.count(LiveValue));
     Value *RootOfChain =
-      findRematerializableChainToBasePointer(ChainToBase,
-                                             LiveValue);
+        findRematerializableChainToBasePointer(ChainToBase, Derived);
 
     // Nothing to do, or chain is too long
     if ( ChainToBase.size() == 0 ||
@@ -2250,9 +2336,9 @@ static void rematerializeLiveValues(CallBase *Call,
 
     // Handle the scenario where the RootOfChain is not equal to the
     // Base Value, but they are essentially the same phi values.
-    if (RootOfChain != PointerToBase[LiveValue]) {
+    if (RootOfChain != PointerToBase[Derived]) {
       PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
-      PHINode *AlternateRootPhi = dyn_cast<PHINode>(PointerToBase[LiveValue]);
+      PHINode *AlternateRootPhi = dyn_cast<PHINode>(PointerToBase[Derived]);
       if (!OrigRootPhi || !AlternateRootPhi)
         continue;
       // PHI nodes that have the same incoming values, and belonging to the same
@@ -2266,33 +2352,61 @@ static void rematerializeLiveValues(CallBase *Call,
       // deficiency in the findBasePointer algorithm.
       if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
         continue;
-      // Now that the phi nodes are proved to be the same, assert that
-      // findBasePointer's newly generated AlternateRootPhi is present in the
-      // liveset of the call.
-      assert(Info.LiveSet.count(AlternateRootPhi));
     }
-    // Compute cost of this chain
+    // Compute cost of this chain.
     InstructionCost Cost = chainToBasePointerCost(ChainToBase, TTI);
     // TODO: We can also account for cases when we will be able to remove some
     //       of the rematerialized values by later optimization passes. I.e if
     //       we rematerialized several intersecting chains. Or if original values
     //       don't have any uses besides this statepoint.
 
+    // Ok, there is a candidate.
+    RematerizlizationCandidateRecord Record;
+    Record.ChainToBase = ChainToBase;
+    Record.RootOfChain = RootOfChain;
+    Record.Cost = Cost;
+    RematerizationCandidates.insert({ Derived, Record });
+  }
+}
+
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate. Remove these values from the live set, rematerialize them after
+// statepoint and record them in "Info" structure. Note that similar to
+// relocated values we don't do any user adjustments here.
+static void rematerializeLiveValues(CallBase *Call,
+                                    PartiallyConstructedSafepointRecord &Info,
+                                    PointerToBaseTy &PointerToBase,
+                                    RematCandTy &RematerizationCandidates,
+                                    TargetTransformInfo &TTI) {
+  // Record values we are going to delete from this statepoint live set.
+  // We cannot do this in the following loop due to iterator invalidation.
+  SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+  for (Value *LiveValue : Info.LiveSet) {
+    auto It = RematerizationCandidates.find(LiveValue);
+    if (It == RematerizationCandidates.end())
+      continue;
+
+    RematerizlizationCandidateRecord &Record = It->second;
+
+    InstructionCost Cost = Record.Cost;
     // For invokes we need to rematerialize each chain twice - for normal and
     // for unwind basic blocks. Model this by multiplying cost by two.
-    if (isa<InvokeInst>(Call)) {
+    if (isa<InvokeInst>(Call))
       Cost *= 2;
-    }
-    // If it's too expensive - skip it
+
+    // If it's too expensive - skip it.
     if (Cost >= RematerializationThreshold)
       continue;
 
     // Remove value from the live set
     LiveValuesToBeDeleted.push_back(LiveValue);
 
-    // Clone instructions and record them inside "Info" structure
+    // Clone instructions and record them inside "Info" structure.
 
-    // Walk backwards to visit top-most instructions first
+    // For each live pointer get its defining chain.
+    SmallVector<Instruction *, 3> ChainToBase = Record.ChainToBase;
+    // Walk backwards to visit top-most instructions first.
     std::reverse(ChainToBase.begin(), ChainToBase.end());
 
     // Utility function which clones all instructions from "ChainToBase"
@@ -2352,7 +2466,7 @@ static void rematerializeLiveValues(CallBase *Call,
       Instruction *InsertBefore = Call->getNextNode();
       assert(InsertBefore);
       Instruction *RematerializedValue = rematerializeChain(
-          InsertBefore, RootOfChain, PointerToBase[LiveValue]);
+          InsertBefore, Record.RootOfChain, PointerToBase[LiveValue]);
       Info.RematerializedValues[RematerializedValue] = LiveValue;
     } else {
       auto *Invoke = cast<InvokeInst>(Call);
@@ -2363,9 +2477,9 @@ static void rematerializeLiveValues(CallBase *Call,
         &*Invoke->getUnwindDest()->getFirstInsertionPt();
 
       Instruction *NormalRematerializedValue = rematerializeChain(
-          NormalInsertBefore, RootOfChain, PointerToBase[LiveValue]);
+          NormalInsertBefore, Record.RootOfChain, PointerToBase[LiveValue]);
       Instruction *UnwindRematerializedValue = rematerializeChain(
-          UnwindInsertBefore, RootOfChain, PointerToBase[LiveValue]);
+          UnwindInsertBefore, Record.RootOfChain, PointerToBase[LiveValue]);
 
       Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
       Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
@@ -2380,7 +2494,8 @@ static void rematerializeLiveValues(CallBase *Call,
 
 static bool inlineGetBaseAndOffset(Function &F,
                                    SmallVectorImpl<CallInst *> &Intrinsics,
-                                   DefiningValueMapTy &DVCache) {
+                                   DefiningValueMapTy &DVCache,
+                                   IsKnownBaseMapTy &KnownBases) {
   auto &Context = F.getContext();
   auto &DL = F.getParent()->getDataLayout();
   bool Changed = false;
@@ -2389,7 +2504,8 @@
     switch (Callsite->getIntrinsicID()) {
     case Intrinsic::experimental_gc_get_pointer_base: {
       Changed = true;
-      Value *Base = findBasePointer(Callsite->getOperand(0), DVCache);
+      Value *Base =
+          findBasePointer(Callsite->getOperand(0), DVCache, KnownBases);
       assert(!DVCache.count(Callsite));
       auto *BaseBC = IRBuilder<>(Callsite).CreateBitCast(
           Base, Callsite->getType(), suffixed_name_or(Base, ".cast", ""));
@@ -2404,7 +2520,7 @@
     case Intrinsic::experimental_gc_get_pointer_offset: {
       Changed = true;
       Value *Derived = Callsite->getOperand(0);
-      Value *Base = findBasePointer(Derived, DVCache);
+      Value *Base = findBasePointer(Derived, DVCache, KnownBases);
       assert(!DVCache.count(Callsite));
       unsigned AddressSpace = Derived->getType()->getPointerAddressSpace();
       unsigned IntPtrSize = DL.getPointerSizeInBits(AddressSpace);
@@ -2431,7 +2547,8 @@ static bool
inlineGetBaseAndOffset(Function &F, static bool insertParsePoints(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, SmallVectorImpl &ToUpdate, - DefiningValueMapTy &DVCache) { + DefiningValueMapTy &DVCache, + IsKnownBaseMapTy &KnownBases) { #ifndef NDEBUG // Validate the input std::set Uniqued; @@ -2487,7 +2604,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // B) Find the base pointers for each live pointer for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &info = Records[i]; - findBasePointers(DT, DVCache, ToUpdate[i], info, PointerToBase); + findBasePointers(DT, DVCache, ToUpdate[i], info, PointerToBase, KnownBases); } if (PrintBasePointers) { errs() << "Base Pairs (w/o Relocation):\n"; @@ -2563,11 +2680,16 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Holders.clear(); + // Compute the cost of possible re-materialization of derived pointers. + RematCandTy RematerizationCandidates; + findRematerializationCandidates(PointerToBase, RematerizationCandidates, TTI); + // In order to reduce live set of statepoint we might choose to rematerialize // some values instead of relocating them. This is purely an optimization and // does not influence correctness. for (size_t i = 0; i < Records.size(); i++) - rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase, TTI); + rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase, + RematerizationCandidates, TTI); // We need this to safely RAUW and delete call or invoke return values that // may themselves be live over a statepoint. For details, please see usage in @@ -2930,13 +3052,18 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // inlineGetBaseAndOffset() and insertParsePoints(). DefiningValueMapTy DVCache; + // Mapping between a base values and a flag indicating whether it's a known + // base or not. + IsKnownBaseMapTy KnownBases; + if (!Intrinsics.empty()) // Inline @gc.get.pointer.base() and @gc.get.pointer.offset() before finding // live references. 
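Hoisting findRematerializationCandidates out of the per-statepoint loop means the chain walk and TTI costing run once per function; rematerializeLiveValues then replays only the cheap per-site decisions, such as doubling the cost for invokes. The shape of that split, as a standalone sketch with STL types standing in for the pass's record and map:

    #include <map>
    #include <vector>

    struct CandidateRecord {
      int Cost = 0;                 // stand-in for InstructionCost
      std::vector<int> ChainToBase; // stand-in for the instruction chain
    };

    // Phase 1 output: one record per derived pointer, computed once.
    using CandidateMap = std::map<int, CandidateRecord>;

    // Phase 2: per call site, pick live values cheap enough to recompute.
    static std::vector<int>
    pickRematerializable(const CandidateMap &Candidates,
                         const std::vector<int> &LiveSet, bool IsInvoke,
                         int Threshold) {
      std::vector<int> Picked;
      for (int Live : LiveSet) {
        auto It = Candidates.find(Live);
        if (It == Candidates.end())
          continue;
        int Cost = It->second.Cost;
        if (IsInvoke) // the chain is cloned on both normal and unwind edges
          Cost *= 2;
        if (Cost < Threshold)
          Picked.push_back(Live);
      }
      return Picked;
    }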
- MadeChange |= inlineGetBaseAndOffset(F, Intrinsics, DVCache); + MadeChange |= inlineGetBaseAndOffset(F, Intrinsics, DVCache, KnownBases); if (!ParsePointNeeded.empty()) - MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded, DVCache); + MadeChange |= + insertParsePoints(F, DT, TTI, ParsePointNeeded, DVCache, KnownBases); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index c34da51e6dc1..2282ef636076 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -17,20 +17,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/SCCP.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" @@ -38,14 +33,13 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -59,7 +53,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/PredicateInfo.h" +#include "llvm/Transforms/Utils/SCCPSolver.h" #include #include #include @@ -97,6 +91,18 @@ static bool isOverdefined(const ValueLatticeElement &LV) { return !LV.isUnknownOrUndef() && !isConstant(LV); } +static bool canRemoveInstruction(Instruction *I) { + if (wouldInstructionBeTriviallyDead(I)) + return true; + + // Some instructions can be handled but are rejected above. Catch + // those cases by falling through to here. + // TODO: Mark globals as being constant earlier, so + // TODO: wouldInstructionBeTriviallyDead() knows that atomic loads + // TODO: are safe to remove. + return isa(I); +} + static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { @@ -127,7 +133,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { // Calls with "clang.arc.attachedcall" implicitly use the return value and // those uses cannot be updated with a constant. 
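canRemoveInstruction deliberately answers yes for loads that wouldInstructionBeTriviallyDead rejects (for example, atomic loads), because by the time SCCP asks, the load has already been proven to produce a constant. A condensed model of how the two call sites above use it; the real code also threads a TargetLibraryInfo into the triviality check:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    static bool replaceAndMaybeErase(Instruction &I, Constant &C) {
      // All uses now see the constant, so erasing I cannot break users.
      I.replaceAllUsesWith(&C);
      if (wouldInstructionBeTriviallyDead(&I) || isa<LoadInst>(&I)) {
        I.eraseFromParent();
        return true;
      }
      return false;
    }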
CallBase *CB = dyn_cast(V); - if (CB && ((CB->isMustTailCall() && !CB->isSafeToRemove()) || + if (CB && ((CB->isMustTailCall() && + !canRemoveInstruction(CB)) || CB->getOperandBundle(LLVMContext::OB_clang_arc_attachedcall))) { Function *F = CB->getCalledFunction(); @@ -156,7 +163,7 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, if (Inst.getType()->isVoidTy()) continue; if (tryToReplaceWithConstant(Solver, &Inst)) { - if (Inst.isSafeToRemove()) + if (canRemoveInstruction(&Inst)) Inst.eraseFromParent(); MadeChanges = true; @@ -170,6 +177,7 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, continue; if (IV.getConstantRange().isAllNonNegative()) { auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst); + ZExt->takeName(&Inst); InsertedValues.insert(ZExt); Inst.replaceAllUsesWith(ZExt); Solver.removeLatticeValueFor(&Inst); @@ -182,10 +190,14 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, return MadeChanges; } +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU, + BasicBlock *&NewUnreachableBB); + // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. static bool runSCCP(Function &F, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, DomTreeUpdater &DTU) { LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); SCCPSolver Solver( DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }, @@ -213,13 +225,12 @@ static bool runSCCP(Function &F, const DataLayout &DL, // as we cannot modify the CFG of the function. SmallPtrSet InsertedValues; + SmallVector BlocksToErase; for (BasicBlock &BB : F) { if (!Solver.isBlockExecutable(&BB)) { LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB); - ++NumDeadBlocks; - NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB).first; - + BlocksToErase.push_back(&BB); MadeChanges = true; continue; } @@ -228,17 +239,32 @@ static bool runSCCP(Function &F, const DataLayout &DL, NumInstRemoved, NumInstReplaced); } + // Remove unreachable blocks and non-feasible edges. + for (BasicBlock *DeadBB : BlocksToErase) + NumInstRemoved += changeToUnreachable(DeadBB->getFirstNonPHI(), + /*PreserveLCSSA=*/false, &DTU); + + BasicBlock *NewUnreachableBB = nullptr; + for (BasicBlock &BB : F) + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB); + + for (BasicBlock *DeadBB : BlocksToErase) + if (!DeadBB->hasAddressTaken()) + DTU.deleteBB(DeadBB); + return MadeChanges; } PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout &DL = F.getParent()->getDataLayout(); auto &TLI = AM.getResult(F); - if (!runSCCP(F, DL, &TLI)) + auto *DT = AM.getCachedResult(F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + if (!runSCCP(F, DL, &TLI, DTU)) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); - PA.preserveSet(); + PA.preserve(); return PA; } @@ -261,7 +287,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); - AU.setPreservesCFG(); + AU.addPreserved(); } // runOnFunction - Run the Sparse Conditional Constant Propagation @@ -272,7 +298,10 @@ public: const DataLayout &DL = F.getParent()->getDataLayout(); const TargetLibraryInfo *TLI = &getAnalysis().getTLI(F); - return runSCCP(F, DL, TLI); + auto *DTWP = getAnalysisIfAvailable(); + DomTreeUpdater DTU(DTWP ? 
&DTWP->getDomTree() : nullptr, + DomTreeUpdater::UpdateStrategy::Lazy); + return runSCCP(F, DL, TLI, DTU); } }; @@ -342,7 +371,8 @@ static void findReturnsToZap(Function &F, } static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, - DomTreeUpdater &DTU) { + DomTreeUpdater &DTU, + BasicBlock *&NewUnreachableBB) { SmallPtrSet FeasibleSuccessors; bool HasNonFeasibleEdges = false; for (BasicBlock *Succ : successors(BB)) { @@ -362,7 +392,19 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, isa(TI)) && "Terminator must be a br, switch or indirectbr"); - if (FeasibleSuccessors.size() == 1) { + if (FeasibleSuccessors.size() == 0) { + // Branch on undef/poison, replace with unreachable. + SmallPtrSet SeenSuccs; + SmallVector Updates; + for (BasicBlock *Succ : successors(BB)) { + Succ->removePredecessor(BB); + if (SeenSuccs.insert(Succ).second) + Updates.push_back({DominatorTree::Delete, BB, Succ}); + } + TI->eraseFromParent(); + new UnreachableInst(BB->getContext(), BB); + DTU.applyUpdatesPermissive(Updates); + } else if (FeasibleSuccessors.size() == 1) { // Replace with an unconditional branch to the only feasible successor. BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); SmallVector Updates; @@ -385,6 +427,23 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, } else if (FeasibleSuccessors.size() > 1) { SwitchInstProfUpdateWrapper SI(*cast(TI)); SmallVector Updates; + + // If the default destination is unfeasible it will never be taken. Replace + // it with a new block with a single Unreachable instruction. + BasicBlock *DefaultDest = SI->getDefaultDest(); + if (!FeasibleSuccessors.contains(DefaultDest)) { + if (!NewUnreachableBB) { + NewUnreachableBB = + BasicBlock::Create(DefaultDest->getContext(), "default.unreachable", + DefaultDest->getParent(), DefaultDest); + new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + } + + SI->setDefaultDest(NewUnreachableBB); + Updates.push_back({DominatorTree::Delete, BB, DefaultDest}); + Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB}); + } + for (auto CI = SI->case_begin(); CI != SI->case_end();) { if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) { ++CI; @@ -532,11 +591,13 @@ bool llvm::runIPSCCP( NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(), /*PreserveLCSSA=*/false, &DTU); + BasicBlock *NewUnreachableBB = nullptr; for (BasicBlock &BB : F) - MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB); for (BasicBlock *DeadBB : BlocksToErase) - DTU.deleteBB(DeadBB); + if (!DeadBB->hasAddressTaken()) + DTU.deleteBB(DeadBB); for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 8be8946702be..143a035749c7 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -57,11 +57,9 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -78,14 +76,12 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include 
"llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include #include -#include #include #include #include @@ -1016,7 +1012,7 @@ private: I.getParent()->getFirstInsertionPt() == I.getParent()->end()) return PI.setAborted(&I); - // TODO: We could use SimplifyInstruction here to fold PHINodes and + // TODO: We could use simplifyInstruction here to fold PHINodes and // SelectInsts. However, doing so requires to change the current // dead-operand-tracking mechanism. For instance, suppose neither loading // from %U nor %other traps. Then "load (select undef, %U, %other)" does not @@ -1987,13 +1983,22 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; + Use *U = S.getUse(); + + // Lifetime intrinsics operate over the whole alloca whose sizes are usually + // larger than other load/store slices (RelEnd > Size). But lifetime are + // always promotable and should not impact other slices' promotability of the + // partition. + if (IntrinsicInst *II = dyn_cast(U->getUser())) { + if (II->isLifetimeStartOrEnd() || II->isDroppable()) + return true; + } + // We can't reasonably handle cases where the load or store extends past // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; - Use *U = S.getUse(); - if (LoadInst *LI = dyn_cast(U->getUser())) { if (LI->isVolatile()) return false; @@ -2048,9 +2053,6 @@ static bool isIntegerWideningViableForSlice(const Slice &S, return false; if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. - } else if (IntrinsicInst *II = dyn_cast(U->getUser())) { - if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) - return false; } else { return false; } @@ -2179,10 +2181,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, return V; } - SmallVector Mask; - Mask.reserve(NumElements); - for (unsigned i = BeginIndex; i != EndIndex; ++i) - Mask.push_back(i); + auto Mask = llvm::to_vector<8>(llvm::seq(BeginIndex, EndIndex)); V = IRB.CreateShuffleVector(V, Mask, Name + ".extract"); LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; @@ -2734,10 +2733,9 @@ private: Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); V = IRB.CreateMul( IRB.CreateZExt(V, SplatIntTy, "zext"), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), - SplatIntTy)), + IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy), + IRB.CreateZExt(Constant::getAllOnesValue(V->getType()), + SplatIntTy)), "isplat"); return V; } @@ -2887,7 +2885,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - MaybeAlign SliceAlign = getSliceAlign(); + Align SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -3481,19 +3479,13 @@ private: Type *Ty = GEPI.getSourceElementType(); Value *True = Sel->getTrueValue(); - Value *NTrue = - IsInBounds - ? 
IRB.CreateInBoundsGEP(Ty, True, Index, - True->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); + Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", + IsInBounds); Value *False = Sel->getFalseValue(); - Value *NFalse = - IsInBounds - ? IRB.CreateInBoundsGEP(Ty, False, Index, - False->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep"); + Value *NFalse = IRB.CreateGEP(Ty, False, Index, + False->getName() + ".sroa.gep", IsInBounds); Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, Sel->getName() + ".sroa.sel"); @@ -3547,10 +3539,8 @@ private: IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); Type *Ty = GEPI.getSourceElementType(); - NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index, - In->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, In, Index, - In->getName() + ".sroa.gep"); + NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep", + IsInBounds); } NewPN->addIncoming(NewVal, B); } @@ -3972,16 +3962,15 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { for (LoadInst *LI : Loads) { SplitLoads.clear(); - IntegerType *Ty = cast(LI->getType()); - assert(Ty->getBitWidth() % 8 == 0); - uint64_t LoadSize = Ty->getBitWidth() / 8; - assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); - auto &Offsets = SplitOffsetsMap[LI]; - assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && - "Slice size should always match load size exactly!"); + unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset(); + assert(LI->getType()->getIntegerBitWidth() % 8 == 0 && + "Load must have type size equal to store size"); + assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize && + "Load must be >= slice size"); + uint64_t BaseOffset = Offsets.S->beginOffset(); - assert(BaseOffset + LoadSize > BaseOffset && + assert(BaseOffset + SliceSize > BaseOffset && "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast(LI->getPointerOperand()); @@ -3992,7 +3981,7 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); int Idx = 0, Size = Offsets.Splits.size(); for (;;) { - auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8); auto AS = LI->getPointerAddressSpace(); auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( @@ -4025,7 +4014,7 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Setup the next partition. PartOffset = Offsets.Splits[Idx]; ++Idx; - PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + PartSize = (Idx < Size ? 
Offsets.Splits[Idx] : SliceSize) - PartOffset; } // Now that we have the split loads, do the slow walk over all uses of the diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index f9650efc051f..008ddfc72740 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,16 +16,13 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/Scalarizer.h" -#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" using namespace llvm; @@ -76,7 +73,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopRerollLegacyPassPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnrollAndJamPass(Registry); - initializeLoopUnswitchPass(Registry); initializeWarnMissedTransformationsLegacyPass(Registry); initializeLoopVersioningLICMLegacyPassPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); @@ -104,6 +100,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); + initializeTLSVariableHoistLegacyPassPass(Registry); initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); @@ -214,10 +211,6 @@ void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollAndJamPass()); } -void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLoopUnswitchPass()); -} - void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLowerAtomicPass()); } diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 29cea42e4a00..e2976ace3a4a 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1,5 +1,5 @@ //===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ----===// -// instrinsics +// intrinsics // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -24,11 +24,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -36,7 +34,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include #include using namespace llvm; @@ -876,7 +873,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, for (BasicBlock &BB : llvm::make_early_inc_range(F)) { bool ModifiedDTOnIteration = false; MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL, - DTU.hasValue() ? 
DTU.getPointer() : nullptr); + DTU ? DTU.getPointer() : nullptr); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 3606c8a4b073..08f4b2173da2 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -39,8 +39,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -52,7 +50,7 @@ using namespace llvm; #define DEBUG_TYPE "scalarizer" -static cl::opt ScalarizeVariableInsertExtract( +static cl::opt ClScalarizeVariableInsertExtract( "scalarize-variable-insert-extract", cl::init(true), cl::Hidden, cl::desc("Allow the scalarizer pass to scalarize " "insertelement/extractelement with variable index")); @@ -60,9 +58,9 @@ static cl::opt ScalarizeVariableInsertExtract( // This is disabled by default because having separate loads and stores // makes it more likely that the -combiner-alias-analysis limits will be // reached. -static cl::opt - ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden, - cl::desc("Allow the scalarizer pass to scalarize loads and store")); +static cl::opt ClScalarizeLoadStore( + "scalarize-load-store", cl::init(false), cl::Hidden, + cl::desc("Allow the scalarizer pass to scalarize loads and store")); namespace { @@ -96,7 +94,7 @@ public: // Scatter V into Size components. If new instructions are needed, // insert them before BBI in BB. If Cache is nonnull, use it to cache // the results. - Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Type *PtrElemTy, ValueVector *cachePtr = nullptr); // Return component I, creating a new Value for it if necessary. @@ -109,8 +107,8 @@ private: BasicBlock *BB; BasicBlock::iterator BBI; Value *V; + Type *PtrElemTy; ValueVector *CachePtr; - PointerType *PtrTy; ValueVector Tmp; unsigned Size; }; @@ -188,10 +186,23 @@ struct VectorLayout { uint64_t ElemSize = 0; }; +template +T getWithDefaultOverride(const cl::opt &ClOption, + const llvm::Optional &DefaultOverride) { + return ClOption.getNumOccurrences() ? 
ClOption + : DefaultOverride.value_or(ClOption); +} + class ScalarizerVisitor : public InstVisitor { public: - ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT) - : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) { + ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT, + ScalarizerPassOptions Options) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT), + ScalarizeVariableInsertExtract( + getWithDefaultOverride(ClScalarizeVariableInsertExtract, + Options.ScalarizeVariableInsertExtract)), + ScalarizeLoadStore(getWithDefaultOverride(ClScalarizeLoadStore, + Options.ScalarizeLoadStore)) { } bool visit(Function &F); @@ -216,8 +227,9 @@ public: bool visitCallInst(CallInst &ICI); private: - Scatterer scatter(Instruction *Point, Value *V); + Scatterer scatter(Instruction *Point, Value *V, Type *PtrElemTy = nullptr); void gather(Instruction *Op, const ValueVector &CV); + void replaceUses(Instruction *Op, Value *CV); bool canTransferMetadata(unsigned Kind); void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV); Optional getVectorLayout(Type *Ty, Align Alignment, @@ -231,12 +243,16 @@ private: ScatterMap Scattered; GatherList Gathered; + bool Scalarized; SmallVector PotentiallyDeadInstrs; unsigned ParallelLoopAccessMDKind; DominatorTree *DT; + + const bool ScalarizeVariableInsertExtract; + const bool ScalarizeLoadStore; }; class ScalarizerLegacyPass : public FunctionPass { @@ -265,12 +281,14 @@ INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, - ValueVector *cachePtr) - : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) { + Type *PtrElemTy, ValueVector *cachePtr) + : BB(bb), BBI(bbi), V(v), PtrElemTy(PtrElemTy), CachePtr(cachePtr) { Type *Ty = V->getType(); - PtrTy = dyn_cast(Ty); - if (PtrTy) - Ty = PtrTy->getPointerElementType(); + if (Ty->isPointerTy()) { + assert(cast(Ty)->isOpaqueOrPointeeTypeMatches(PtrElemTy) && + "Pointer element type mismatch"); + Ty = PtrElemTy; + } Size = cast(Ty)->getNumElements(); if (!CachePtr) Tmp.resize(Size, nullptr); @@ -287,15 +305,15 @@ Value *Scatterer::operator[](unsigned I) { if (CV[I]) return CV[I]; IRBuilder<> Builder(BB, BBI); - if (PtrTy) { - Type *ElTy = - cast(PtrTy->getPointerElementType())->getElementType(); + if (PtrElemTy) { + Type *VectorElemTy = cast(PtrElemTy)->getElementType(); if (!CV[0]) { - Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace()); + Type *NewPtrTy = PointerType::get( + VectorElemTy, V->getType()->getPointerAddressSpace()); CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0"); } if (I != 0) - CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I, + CV[I] = Builder.CreateConstGEP1_32(VectorElemTy, CV[0], I, V->getName() + ".i" + Twine(I)); } else { // Search through a chain of InsertElementInsts looking for element I. 
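getWithDefaultOverride, used in the constructor above, resolves each flag in a fixed order: an explicit command-line occurrence wins, then the pass-constructor option, then the compiled-in cl::opt default. The same precedence as a small self-contained model (std::optional standing in for llvm::Optional, Occurrences for the cl::opt bookkeeping):

    #include <optional>

    // Mirrors just enough of cl::opt to show the precedence.
    struct FlagModel {
      bool Value;      // current value: the default, or what was parsed
      int Occurrences; // times the flag appeared on the command line
    };

    static bool resolve(const FlagModel &Cl, std::optional<bool> PassOption) {
      if (Cl.Occurrences > 0) // explicit command line beats everything
        return Cl.Value;
      return PassOption.value_or(Cl.Value); // pass option beats the default
    }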
@@ -334,7 +352,7 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) { unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); DominatorTree *DT = &getAnalysis().getDomTree(); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT, ScalarizerPassOptions()); return Impl.visit(F); } @@ -345,6 +363,8 @@ FunctionPass *llvm::createScalarizerPass() { bool ScalarizerVisitor::visit(Function &F) { assert(Gathered.empty() && Scattered.empty()); + Scalarized = false; + // To ensure we replace gathered components correctly we need to do an ordered // traversal of the basic blocks in the function. ReversePostOrderTraversal RPOT(&F.getEntryBlock()); @@ -362,13 +382,14 @@ bool ScalarizerVisitor::visit(Function &F) { // Return a scattered form of V that can be accessed by Point. V must be a // vector or a pointer to a vector. -Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { +Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V, + Type *PtrElemTy) { if (Argument *VArg = dyn_cast(V)) { // Put the scattered form of arguments in the entry block, // so that it can be used everywhere. Function *F = VArg->getParent(); BasicBlock *BB = &F->getEntryBlock(); - return Scatterer(BB, BB->begin(), V, &Scattered[V]); + return Scatterer(BB, BB->begin(), V, PtrElemTy, &Scattered[V]); } if (Instruction *VOp = dyn_cast(V)) { // When scalarizing PHI nodes we might try to examine/rewrite InsertElement @@ -379,17 +400,17 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { // need to analyse them further. if (!DT->isReachableFromEntry(VOp->getParent())) return Scatterer(Point->getParent(), Point->getIterator(), - UndefValue::get(V->getType())); + PoisonValue::get(V->getType()), PtrElemTy); // Put the scattered form of an instruction directly after the // instruction, skipping over PHI nodes and debug intrinsics. BasicBlock *BB = VOp->getParent(); return Scatterer( BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V, - &Scattered[V]); + PtrElemTy, &Scattered[V]); } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point->getIterator(), V); + return Scatterer(Point->getParent(), Point->getIterator(), V, PtrElemTy); } // Replace Op with the gathered form of the components in CV. Defer the @@ -419,6 +440,15 @@ void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) { Gathered.push_back(GatherList::value_type(Op, &SV)); } +// Replace Op with CV and collect Op has a potentially dead instruction. +void ScalarizerVisitor::replaceUses(Instruction *Op, Value *CV) { + if (CV != Op) { + Op->replaceAllUsesWith(CV); + PotentiallyDeadInstrs.emplace_back(Op); + Scalarized = true; + } +} + // Return true if it is safe to transfer the given metadata tag from // vector to scalar instructions. 
bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) { @@ -558,9 +588,11 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (OpI->getType()->isVectorTy()) { Scattered[I] = scatter(&CI, OpI); assert(Scattered[I].size() == NumElems && "mismatched call operands"); + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) + Tys.push_back(OpI->getType()->getScalarType()); } else { ScalarOperands[I] = OpI; - if (hasVectorInstrinsicOverloadedScalarOpd(ID, I)) + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) Tys.push_back(OpI->getType()); } } @@ -576,7 +608,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ScalarCallOps.clear(); for (unsigned J = 0; J != NumArgs; ++J) { - if (hasVectorInstrinsicScalarOpd(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) ScalarCallOps.push_back(ScalarOperands[J]); else ScalarCallOps.push_back(Scattered[J][Elem]); @@ -809,7 +841,7 @@ bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { if (auto *CI = dyn_cast(ExtIdx)) { Value *Res = Op0[CI->getValue().getZExtValue()]; - gather(&EEI, {Res}); + replaceUses(&EEI, Res); return true; } @@ -825,7 +857,7 @@ bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { Res = Builder.CreateSelect(ShouldExtract, Elt, Res, EEI.getName() + ".upto" + Twine(I)); } - gather(&EEI, {Res}); + replaceUses(&EEI, Res); return true; } @@ -891,7 +923,7 @@ bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) { unsigned NumElems = cast(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&LI); - Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand(), LI.getType()); ValueVector Res; Res.resize(NumElems); @@ -917,7 +949,7 @@ bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) { unsigned NumElems = cast(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&SI); - Scatterer VPtr = scatter(&SI, SI.getPointerOperand()); + Scatterer VPtr = scatter(&SI, SI.getPointerOperand(), FullValue->getType()); Scatterer VVal = scatter(&SI, FullValue); ValueVector Stores; @@ -940,7 +972,7 @@ bool ScalarizerVisitor::visitCallInst(CallInst &CI) { bool ScalarizerVisitor::finish() { // The presence of data in Gathered or Scattered indicates changes // made to the Function. 
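Passing PtrElemTy into scatter() is another opaque-pointer change: the vector type being accessed can no longer be read off the pointer operand, so the load/store visitors above must supply it from the access they are scalarizing. A reduced sketch of where that type comes from:

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // With typed pointers the element type fell out of the pointer operand;
    // with opaque pointers the access type is the only source of truth.
    static Type *scatterElementType(Instruction &I) {
      if (auto *LI = dyn_cast<LoadInst>(&I))
        return LI->getType(); // the vector type actually loaded
      if (auto *SI = dyn_cast<StoreInst>(&I))
        return SI->getValueOperand()->getType(); // the vector type stored
      return nullptr; // non-memory users scatter values, not pointers
    }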
- if (Gathered.empty() && Scattered.empty()) + if (Gathered.empty() && Scattered.empty() && !Scalarized) return false; for (const auto &GMI : Gathered) { Instruction *Op = GMI.first; @@ -971,6 +1003,7 @@ bool ScalarizerVisitor::finish() { } Gathered.clear(); Scattered.clear(); + Scalarized = false; RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs); @@ -982,7 +1015,7 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); DominatorTree *DT = &AM.getResult(F); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT, Options); bool Changed = Impl.visit(F); PreservedAnalyses PA; PA.preserve(); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index d23925042b0a..7da5a78772ad 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -189,7 +189,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index a27da047bfd3..0535608244cc 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -28,6 +27,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -49,7 +49,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -81,7 +83,6 @@ static cl::opt EnableNonTrivialUnswitch( static cl::opt UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, - cl::ZeroOrMore, cl::desc("The cost threshold for unswitching a loop.")); static cl::opt EnableUnswitchCostMultiplier( @@ -110,17 +111,27 @@ static cl::opt "partial unswitching analysis"), cl::init(100), cl::Hidden); static cl::opt FreezeLoopUnswitchCond( - "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden, + "freeze-loop-unswitch-cond", cl::init(true), cl::Hidden, cl::desc("If enabled, the freeze instruction will be added to condition " "of loop unswitch to prevent miscompilation.")); +// Helper to skip (select x, true, false), which matches both a logical AND and +// OR and can confuse code that tries to determine if \p Cond is either a +// logical AND or OR but not both. 
+static Value *skipTrivialSelect(Value *Cond) { + Value *CondNext; + while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero()))) + Cond = CondNext; + return Cond; +} + /// Collect all of the loop invariant input values transitively used by the /// homogeneous instruction graph from a given root. /// /// This essentially walks from a root recursively through loop variant operands -/// which have the exact same opcode and finds all inputs which are loop -/// invariant. For some operations these can be re-associated and unswitched out -/// of the loop entirely. +/// which have perform the same logical operation (AND or OR) and finds all +/// inputs which are loop invariant. For some operations these can be +/// re-associated and unswitched out of the loop entirely. static TinyPtrVector collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, LoopInfo &LI) { @@ -150,7 +161,7 @@ collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, } // If not an instruction with the same opcode, nothing we can do. - Instruction *OpI = dyn_cast(OpV); + Instruction *OpI = dyn_cast(skipTrivialSelect(OpV)); if (OpI && ((IsRootAnd && match(OpI, m_LogicalAnd())) || (IsRootOr && match(OpI, m_LogicalOr())))) { @@ -202,13 +213,19 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, /// branch on a single value. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef Invariants, bool Direction, - BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) { + BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, + Instruction *I, AssumptionCache *AC, DominatorTree &DT) { IRBuilder<> IRB(&BB); - Value *Cond = Direction ? IRB.CreateOr(Invariants) : - IRB.CreateAnd(Invariants); - if (InsertFreeze) - Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr"); + SmallVector FrozenInvariants; + for (Value *Inv : Invariants) { + if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT)) + Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr"); + FrozenInvariants.push_back(Inv); + } + + Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) + : IRB.CreateAnd(FrozenInvariants); IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc); } @@ -442,11 +459,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // some input conditions to the branch. bool FullUnswitch = false; - if (L.isLoopInvariant(BI.getCondition())) { - Invariants.push_back(BI.getCondition()); + Value *Cond = skipTrivialSelect(BI.getCondition()); + if (L.isLoopInvariant(Cond)) { + Invariants.push_back(Cond); FullUnswitch = true; } else { - if (auto *CondInst = dyn_cast(BI.getCondition())) + if (auto *CondInst = dyn_cast(Cond)) Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI); if (Invariants.empty()) { LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n"); @@ -480,8 +498,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // is a graph of `or` operations, or the exit block is along the false edge // and the condition is a graph of `and` operations. if (!FullUnswitch) { - if (ExitDirection ? !match(BI.getCondition(), m_LogicalOr()) - : !match(BI.getCondition(), m_LogicalAnd())) { + if (ExitDirection ? 
!match(Cond, m_LogicalOr()) + : !match(Cond, m_LogicalAnd())) { LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " "non-full unswitch!\n"); return false; @@ -546,6 +564,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // its successors. OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), BI); + BI.setCondition(Cond); if (MSSAU) { // Temporarily clone the terminator, to make MSSA update cheaper by // separating "insert edge" updates from "remove edge" ones. @@ -561,15 +580,16 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // Only unswitching a subset of inputs to the condition, so we will need to // build a new branch that merges the invariant inputs. if (ExitDirection) - assert(match(BI.getCondition(), m_LogicalOr()) && + assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " "condition!"); else - assert(match(BI.getCondition(), m_LogicalAnd()) && + assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" " condition!"); - buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection, - *UnswitchedBB, *NewPH, false); + buildPartialUnswitchConditionalBranch( + *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); } // Update the dominator tree with the added edge. @@ -1019,7 +1039,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, // Don't bother trying to unswitch past an unconditional branch or a branch // with a constant value. These should be removed by simplifycfg prior to // running this pass. - if (!BI->isConditional() || isa(BI->getCondition())) + if (!BI->isConditional() || + isa(skipTrivialSelect(BI->getCondition()))) return Changed; // Found a trivial condition candidate: non-foldable conditional branch. If @@ -1663,7 +1684,7 @@ deleteDeadBlocksFromLoop(Loop &L, // uses in other blocks. 
for (auto &I : *BB) if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.replaceAllUsesWith(PoisonValue::get(I.getType())); BB->dropAllReferences(); } @@ -2042,12 +2063,13 @@ static void unswitchNontrivialInvariants( "Can only unswitch switches and conditional branch!"); bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); bool FullUnswitch = - SI || (BI->getCondition() == Invariants[0] && !PartiallyInvariant); + SI || (skipTrivialSelect(BI->getCondition()) == Invariants[0] && + !PartiallyInvariant); if (FullUnswitch) assert(Invariants.size() == 1 && "Cannot have other invariants with full unswitching!"); else - assert(isa<Instruction>(BI->getCondition()) && + assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && "Partial unswitching requires an instruction as the condition!"); if (MSSAU && VerifyMemorySSA) @@ -2062,14 +2084,14 @@ static void unswitchNontrivialInvariants( bool Direction = true; int ClonedSucc = 0; if (!FullUnswitch) { - Value *Cond = BI->getCondition(); + Value *Cond = skipTrivialSelect(BI->getCondition()); (void)Cond; assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || PartiallyInvariant) && "Only `or`, `and`, `select`, or partially invariant instructions " "can combine invariants being unswitched."); - if (!match(BI->getCondition(), m_LogicalOr())) { - if (match(BI->getCondition(), m_LogicalAnd()) || + if (!match(Cond, m_LogicalOr())) { + if (match(Cond, m_LogicalAnd()) || (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { Direction = false; ClonedSucc = 1; @@ -2209,11 +2231,12 @@ static void unswitchNontrivialInvariants( BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); + Value *Cond = skipTrivialSelect(BI->getCondition()); if (InsertFreeze) { - auto Cond = BI->getCondition(); if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT)) - BI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", BI)); + Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI); } + BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { assert(SI && "Must either be a branch or switch!"); @@ -2311,9 +2334,11 @@ static void unswitchNontrivialInvariants( if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); - else - buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, - *ClonedPH, *LoopPH, InsertFreeze); + else { + buildPartialUnswitchConditionalBranch( + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, + FreezeLoopUnswitchCond, BI, &AC, DT); + } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); if (MSSAU) { @@ -2745,22 +2770,16 @@ static bool unswitchBestCondition( BI->getSuccessor(0) == BI->getSuccessor(1)) continue; - // If BI's condition is 'select _, true, false', simplify it to avoid - // confusing the matchers - Value *Cond = BI->getCondition(), *CondNext; - while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero()))) - Cond = CondNext; - BI->setCondition(Cond); - + Value *Cond = skipTrivialSelect(BI->getCondition()); if (isa<Constant>(Cond)) continue; - if (L.isLoopInvariant(BI->getCondition())) { - UnswitchCandidates.push_back({BI, {BI->getCondition()}}); + if (L.isLoopInvariant(Cond)) { + UnswitchCandidates.push_back({BI, {Cond}}); continue; } - Instruction &CondI = *cast<Instruction>(BI->getCondition()); + Instruction &CondI = *cast<Instruction>(Cond); if (match(&CondI, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) {
TinyPtrVector<Value *> Invariants = collectHomogenousInstGraphLoopInvariants(L, CondI, LI); @@ -2785,8 +2804,7 @@ static bool unswitchBestCondition( PartialIVInfo = *Info; PartialIVCondBranch = L.getHeader()->getTerminator(); TinyPtrVector<Value *> ValsToDuplicate; - for (auto *Inst : Info->InstToDuplicate) - ValsToDuplicate.push_back(Inst); + llvm::append_range(ValsToDuplicate, Info->InstToDuplicate); UnswitchCandidates.push_back( {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); } @@ -2902,10 +2920,11 @@ static bool unswitchBestCondition( // its cost. if (!FullUnswitch) { auto &BI = cast<BranchInst>(TI); - if (match(BI.getCondition(), m_LogicalAnd())) { + Value *Cond = skipTrivialSelect(BI.getCondition()); + if (match(Cond, m_LogicalAnd())) { if (SuccBB == BI.getSuccessor(1)) continue; - } else if (match(BI.getCondition(), m_LogicalOr())) { + } else if (match(Cond, m_LogicalOr())) { if (SuccBB == BI.getSuccessor(0)) continue; } else if ((PartialIVInfo.KnownValue->isOneValue() && @@ -2947,8 +2966,9 @@ static bool unswitchBestCondition( ArrayRef<Value *> Invariants = TerminatorAndInvariants.second; BranchInst *BI = dyn_cast<BranchInst>(&TI); InstructionCost CandidateCost = ComputeUnswitchedCost( - TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 && - Invariants[0] == BI->getCondition())); + TI, /*FullUnswitch*/ !BI || + (Invariants.size() == 1 && + Invariants[0] == skipTrivialSelect(BI->getCondition()))); // Calculate cost multiplier which is a tool to limit potentially // exponential behavior of loop-unswitch. if (EnableUnswitchCostMultiplier) { @@ -3131,8 +3151,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - UnswitchCB, &AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, + UnswitchCB, &AR.SE, MSSAU ?
MSSAU.getPointer() : nullptr, DestroyLoopCB)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index ee17da1875e5..fb2d812a186d 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -31,19 +31,16 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" #include @@ -59,6 +56,11 @@ static cl::opt<bool> UserKeepLoops( "keep-loops", cl::Hidden, cl::init(true), cl::desc("Preserve canonical loop structure (default = true)")); +static cl::opt<bool> UserSwitchRangeToICmp( + "switch-range-to-icmp", cl::Hidden, cl::init(false), + cl::desc( + "Convert switches into an integer range comparison (default = false)")); + static cl::opt<bool> UserSwitchToLookup( "switch-to-lookup", cl::Hidden, cl::init(false), cl::desc("Convert switches to lookup tables (default = false)")); @@ -311,6 +313,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.BonusInstThreshold = UserBonusInstThreshold; if (UserForwardSwitchCond.getNumOccurrences()) Options.ForwardSwitchCondToPhi = UserForwardSwitchCond; + if (UserSwitchRangeToICmp.getNumOccurrences()) + Options.ConvertSwitchRangeToICmp = UserSwitchRangeToICmp; if (UserSwitchToLookup.getNumOccurrences()) Options.ConvertSwitchToLookupTable = UserSwitchToLookup; if (UserKeepLoops.getNumOccurrences()) @@ -337,6 +341,8 @@ void SimplifyCFGPass::printPipeline( OS << "<"; OS << "bonus-inst-threshold=" << Options.BonusInstThreshold << ";"; OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;"; + OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-") + << "switch-range-to-icmp;"; OS << (Options.ConvertSwitchToLookupTable ? "" : "no-") << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ?
"" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp index 8600aacdb056..e8fde53005f0 100644 --- a/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/llvm/lib/Transforms/Scalar/Sink.cpp @@ -15,12 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -48,7 +43,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, } if (Inst->isTerminator() || isa(Inst) || Inst->isEHPad() || - Inst->mayThrow()) + Inst->mayThrow() || !Inst->willReturn()) return false; if (auto *Call = dyn_cast(Inst)) { diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index 06169a7834f6..9ac4608134c2 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -63,10 +63,10 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -275,7 +275,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( }); } - // Usially debug label instrinsic corresponds to label in LLVM IR. In these + // Usially debug label intrinsic corresponds to label in LLVM IR. In these // cases we should not move it here. // TODO: Possible special processing needed to detect it is related to a // hoisted instruction. @@ -301,7 +301,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( if (TotalSpeculationCost > SpecExecMaxSpeculationCost) return false; // too much to hoist } else { - // Debug info instrinsics should not be counted for threshold. + // Debug info intrinsics should not be counted for threshold. 
if (!isa<DbgInfoIntrinsic>(I)) NotHoistedInstCount++; if (NotHoistedInstCount > SpecExecMaxNotHoisted) diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index b47378808216..70df0cec0dca 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -68,7 +68,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -683,24 +682,16 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis( unsigned AS = Basis.Ins->getType()->getPointerAddressSpace(); Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS); Reduced = Builder.CreateBitCast(Basis.Ins, CharTy); - if (InBounds) - Reduced = - Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump); - else - Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump); + Reduced = + Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump, "", InBounds); Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType()); } else { // C = gep Basis, Bump // Canonicalize bump to pointer size. Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy); - if (InBounds) - Reduced = Builder.CreateInBoundsGEP( - cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), - Basis.Ins, Bump); - else - Reduced = Builder.CreateGEP( - cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), - Basis.Ins, Bump); + Reduced = Builder.CreateGEP( + cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), + Basis.Ins, Bump, "", InBounds); } break; } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index b3a445368537..f6525ad7de9b 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -18,10 +18,8 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" -#include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -33,7 +31,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" @@ -41,7 +38,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" @@ -72,6 +68,11 @@ static cl::opt<bool> cl::desc("Allow relaxed uniform region checks"), cl::init(true)); +static cl::opt<unsigned> + ReorderNodeSize("structurizecfg-node-reorder-size", + cl::desc("Limit region size for reordering nodes"), + cl::init(100), cl::Hidden); + // Definition of the complex types used in this pass. using BBValuePair = std::pair<BasicBlock *, Value *>; @@ -266,6 +267,8 @@ class StructurizeCFG { void orderNodes(); + void reorderNodes(); + void analyzeLoops(RegionNode *N); Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); @@ -424,6 +427,57 @@ void StructurizeCFG::orderNodes() { } } +/// Change the node ordering to decrease the range of live values, especially +/// the values that capture the control flow path for branches.
We do this +/// by moving blocks with a single predecessor and successor to appear after +/// the predecessor. The motivation is to move some loop exit blocks into a loop. +/// In cases where a loop has a large number of exit blocks, this reduces the +/// amount of values needed across the loop boundary. +void StructurizeCFG::reorderNodes() { + SmallVector<RegionNode *, 8> NewOrder; + DenseMap<BasicBlock *, unsigned> MoveTo; + BitVector Moved(Order.size()); + + // The benefits of reordering nodes occur for large regions. + if (Order.size() <= ReorderNodeSize) + return; + + // The algorithm works with two passes over Order. The first pass identifies + // the blocks to move and the position to move them to. The second pass + // creates the new order based upon this information. We move blocks with + // a single predecessor and successor. If there are multiple candidates then + // maintain the original order. + BBSet Seen; + for (int I = Order.size() - 1; I >= 0; --I) { + auto *BB = Order[I]->getEntry(); + Seen.insert(BB); + auto *Pred = BB->getSinglePredecessor(); + auto *Succ = BB->getSingleSuccessor(); + // Consider only those basic blocks that have a predecessor in Order and a + // successor that exits the region. The region may contain subregions that + // have been structurized and are not included in Order. + if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() && + !MoveTo.count(Pred)) { + MoveTo[Pred] = I; + Moved.set(I); + } + } + + // If no blocks have been moved then the original order is good. + if (!Moved.count()) + return; + + for (size_t I = 0, E = Order.size(); I < E; ++I) { + auto *BB = Order[I]->getEntry(); + if (MoveTo.count(BB)) + NewOrder.push_back(Order[MoveTo[BB]]); + if (!Moved[I]) + NewOrder.push_back(Order[I]); + } + + Order.assign(NewOrder); +} + /// Determine the end of the loops void StructurizeCFG::analyzeLoops(RegionNode *N) { if (N->isSubRegion()) { @@ -685,7 +739,7 @@ void StructurizeCFG::simplifyAffectedPhis() { Q.DT = DT; for (WeakVH VH : AffectedPhis) { if (auto Phi = dyn_cast_or_null<PHINode>(VH)) { - if (auto NewValue = SimplifyInstruction(Phi, Q)) { + if (auto NewValue = simplifyInstruction(Phi, Q)) { Phi->replaceAllUsesWith(NewValue); Phi->eraseFromParent(); Changed = true; @@ -1085,12 +1139,13 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { ParentRegion = R; orderNodes(); + reorderNodes(); collectInfos(); createFlow(); insertConditions(false); insertConditions(true); - simplifyConditions(); setPhiValues(); + simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); diff --git a/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp new file mode 100644 index 000000000000..16b3483f9687 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp @@ -0,0 +1,306 @@ +//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates redundant TLS loads if the related option is set. +// For an example, please refer to the comment at the head of TLSVariableHoist.h.
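To make the new pass's intent concrete, a hedged C-level picture of the transformation (illustrative only; the function names are made up): in PIC code each use of a thread_local variable may go through a TLS address-resolution call, so a use inside a loop re-resolves the address every iteration, and hoisting a single address computation removes that redundancy.

    extern thread_local int Counter;

    int sumBefore(int N) {
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += Counter; // address of Counter may be re-resolved on every use
      return S;
    }

    int sumAfter(int N) {
      int *P = &Counter; // resolved once, outside the loop (what tlshoist arranges)
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += *P;
      return S;
    }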
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace tlshoist; + +#define DEBUG_TYPE "tlshoist" + +static cl::opt<bool> TLSLoadHoist( + "tls-load-hoist", cl::init(false), cl::Hidden, + cl::desc("hoist the TLS loads in PIC model to eliminate redundant " + "TLS address calculation.")); + +namespace { + +/// The TLS Variable hoist pass. +class TLSVariableHoistLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + TLSVariableHoistLegacyPass() : FunctionPass(ID) { + initializeTLSVariableHoistLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + StringRef getPassName() const override { return "TLS Variable Hoist"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + } + +private: + TLSVariableHoistPass Impl; +}; + +} // end anonymous namespace + +char TLSVariableHoistLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) + +FunctionPass *llvm::createTLSVariableHoistPass() { + return new TLSVariableHoistLegacyPass(); +} + +/// Perform the TLS Variable Hoist optimization for the given function. +bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) + return false; + + LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n"); + LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); + + bool MadeChange = + Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo()); + + if (MadeChange) { + LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: " + << Fn.getName() << '\n'); + LLVM_DEBUG(dbgs() << Fn); + } + LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n"); + + return MadeChange; +} + +void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) { + // Skip all cast instructions. They are visited indirectly later on. + if (Inst->isCast()) + return; + + // Scan all operands. + for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx)); + if (!GV || !GV->isThreadLocal()) + continue; + + // Add Candidate to TLSCandMap (GV --> Candidate). + TLSCandMap[GV].addUser(Inst, Idx); + } +} + +void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) { + // First, quickly check if there is any TLS variable. + Module *M = Fn.getParent(); + + bool HasTLS = llvm::any_of( + M->globals(), [](GlobalVariable &GV) { return GV.isThreadLocal(); }); + + // If none, directly return.
+ if (!HasTLS) + return; + + TLSCandMap.clear(); + + // Then, collect TLS Variable info. + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; + + for (Instruction &Inst : BB) + collectTLSCandidate(&Inst); + } +} + +static bool oneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) { + if (Cand.Users.size() != 1) + return false; + + BasicBlock *BB = Cand.Users[0].Inst->getParent(); + if (LI->getLoopFor(BB)) + return false; + + return true; +} + +Instruction *TLSVariableHoistPass::getNearestLoopDomInst(BasicBlock *BB, + Loop *L) { + assert(L && "Unexpected Loop status!"); + + // Get the outermost loop. + while (Loop *Parent = L->getParentLoop()) + L = Parent; + + BasicBlock *PreHeader = L->getLoopPreheader(); + + // There is a unique predecessor outside the loop. + if (PreHeader) + return PreHeader->getTerminator(); + + BasicBlock *Header = L->getHeader(); + BasicBlock *Dom = Header; + for (BasicBlock *PredBB : predecessors(Header)) + Dom = DT->findNearestCommonDominator(Dom, PredBB); + + assert(Dom && "Did not find dominator BB!"); + Instruction *Term = Dom->getTerminator(); + + return Term; +} + +Instruction *TLSVariableHoistPass::getDomInst(Instruction *I1, + Instruction *I2) { + if (!I1) + return I2; + if (DT->dominates(I1, I2)) + return I1; + if (DT->dominates(I2, I1)) + return I2; + + // If there is no dominance relation, use common dominator. + BasicBlock *DomBB = + DT->findNearestCommonDominator(I1->getParent(), I2->getParent()); + + Instruction *Dom = DomBB->getTerminator(); + assert(Dom && "Common dominator not found!"); + + return Dom; +} + +BasicBlock::iterator TLSVariableHoistPass::findInsertPos(Function &Fn, + GlobalVariable *GV, + BasicBlock *&PosBB) { + tlshoist::TLSCandidate &Cand = TLSCandMap[GV]; + + // We should hoist the TLS use out of the loop, so choose the nearest + // instruction which dominates the loop and the enclosing loops (if any). + Instruction *LastPos = nullptr; + for (auto &User : Cand.Users) { + BasicBlock *BB = User.Inst->getParent(); + Instruction *Pos = User.Inst; + if (Loop *L = LI->getLoopFor(BB)) { + Pos = getNearestLoopDomInst(BB, L); + assert(Pos && "Did not find an insert position outside the loop!"); + } + Pos = getDomInst(LastPos, Pos); + LastPos = Pos; + } + + assert(LastPos && "Unexpected insert position!"); + BasicBlock *Parent = LastPos->getParent(); + PosBB = Parent; + return LastPos->getIterator(); +} + +// Generate a bitcast (no type change) to replace the uses of TLS Candidate. +Instruction *TLSVariableHoistPass::genBitCastInst(Function &Fn, + GlobalVariable *GV) { + BasicBlock *PosBB = &Fn.getEntryBlock(); + BasicBlock::iterator Iter = findInsertPos(Fn, GV, PosBB); + Type *Ty = GV->getType(); + auto *CastInst = new BitCastInst(GV, Ty, "tls_bitcast"); + PosBB->getInstList().insert(Iter, CastInst); + return CastInst; +} + +bool TLSVariableHoistPass::tryReplaceTLSCandidate(Function &Fn, + GlobalVariable *GV) { + + tlshoist::TLSCandidate &Cand = TLSCandMap[GV]; + + // If it is only used once and not in a loop, there is no need to replace it.
+ if (oneUseOutsideLoop(Cand, LI)) + return false; + + // Generate a bitcast (no type change) + auto *CastInst = genBitCastInst(Fn, GV); + + // to replace the uses of TLS Candidate + for (auto &User : Cand.Users) + User.Inst->setOperand(User.OpndIdx, CastInst); + + return true; +} + +bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) { + if (TLSCandMap.empty()) + return false; + + bool Replaced = false; + for (auto &GV2Cand : TLSCandMap) { + GlobalVariable *GV = GV2Cand.first; + Replaced |= tryReplaceTLSCandidate(Fn, GV); + } + + return Replaced; +} + +/// Optimize expensive TLS variables in the given function. +bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT, + LoopInfo &LI) { + if (Fn.hasOptNone()) + return false; + + if (!TLSLoadHoist && !Fn.getAttributes().hasFnAttr("tls-load-hoist")) + return false; + + this->LI = &LI; + this->DT = &DT; + assert(this->LI && this->DT && "Unexpected requirement!"); + + // Collect all TLS variable candidates. + collectTLSCandidates(Fn); + + bool MadeChange = tryReplaceTLSCandidates(Fn); + + return MadeChange; +} + +PreservedAnalyses TLSVariableHoistPass::run(Function &F, + FunctionAnalysisManager &AM) { + + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + + if (!runImpl(F, DT, LI)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3bcf92e28a21..27c04177e894 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -53,11 +53,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -76,14 +73,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -248,10 +243,10 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { isa<DbgInfoIntrinsic>(&I)) continue; - // Special-case operand bundle "clang.arc.attachedcall". + // Special-case operand bundles "clang.arc.attachedcall" and "ptrauth". bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundlesOtherThan( - LLVMContext::OB_clang_arc_attachedcall); + {LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_ptrauth}); if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed @@ -531,7 +526,7 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { } // If the function doesn't return void, create the RetPN and RetKnownPN PHI - // nodes to track our return value. We initialize RetPN with undef and + // nodes to track our return value. We initialize RetPN with poison and // RetKnownPN with false since we can't know our return value at function // entry.
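On the undef-to-poison change just above: the entry-edge value of RetPN is never observed, because every read of it is guarded by RetKnownPN, which starts out false; poison is the preferred placeholder for such don't-care values. A rough C++ analogue of the two PHIs (illustrative only, not the pass's code):

    // The accumulated return value is read only after the flag is set, so
    // its initial contents are irrelevant -- hence poison rather than undef.
    struct TailLoopState {
      bool RetKnown = false; // mirrors RetKnownPN on the entry edge
      int Ret;               // mirrors RetPN: deliberately uninitialized
    };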
Type *RetType = F.getReturnType(); @@ -540,7 +535,7 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos); RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos); - RetPN->addIncoming(UndefValue::get(RetType), NewEntry); + RetPN->addIncoming(PoisonValue::get(RetType), NewEntry); RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry); } @@ -734,7 +729,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { // call. for (PHINode *PN : ArgumentPHIs) { // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { + if (Value *PNV = simplifyInstruction(PN, F.getParent()->getDataLayout())) { PN->replaceAllUsesWith(PNV); PN->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index 80a7d3a43ad6..8367e61c1a47 100644 --- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -61,7 +61,7 @@ static void warnAboutLeftoverTransformations(Loop *L, << "loop not vectorized: the optimizer was unable to perform the " "requested transformation; the transformation might be disabled " "or specified as part of an unsupported transformation ordering"); - else if (InterleaveCount.getValueOr(0) != 1) + else if (InterleaveCount.value_or(0) != 1) ORE->emit( DiagnosticInfoOptimizationFailure(DEBUG_TYPE, "FailedRequestedInterleaving", diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp index c734611836eb..24972db404be 100644 --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -50,9 +50,6 @@ static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) { auto Int64Ty = Builder.getInt64Ty(); auto M = Builder.GetInsertBlock()->getModule(); auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty); - if (!M->getModuleFlag("amdgpu_hostcall")) { - M->addModuleFlag(llvm::Module::Override, "amdgpu_hostcall", 1); - } return Builder.CreateCall(Fn, Version); } diff --git a/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index cbc508bb863a..0318429a76a7 100644 --- a/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index e789194eb3ab..e6372fc5ab86 100644 --- a/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -222,7 +222,7 @@ static bool addDiscriminators(Function &F) { << DIL->getColumn() << ":" << Discriminator << " " << I << "\n"); } else { - I.setDebugLoc(NewDIL.getValue()); + I.setDebugLoc(*NewDIL); LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" << DIL->getColumn() << ":" << Discriminator << " " << I << "\n"); @@ -260,7 +260,7 @@ static bool addDiscriminators(Function &F) { << CurrentDIL->getLine() << ":" << 
CurrentDIL->getColumn() << ":" << Discriminator << " " << I << "\n"); } else { - I.setDebugLoc(NewDIL.getValue()); + I.setDebugLoc(*NewDIL); Changed = true; } } diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index f910f7c3c31f..02ea17825c2f 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DebugCounter.h" diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 15c4a64eb794..e9983ff82176 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSAUpdater.h" -#include "llvm/Analysis/PostDominators.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -33,7 +32,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -1164,7 +1162,11 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, if (NewLatch != OldLatch) { MDNode *MD = OldLatch->getTerminator()->getMetadata("llvm.loop"); NewLatch->getTerminator()->setMetadata("llvm.loop", MD); - OldLatch->getTerminator()->setMetadata("llvm.loop", nullptr); + // It's still possible that OldLatch is the latch of another inner loop, + // in which case we do not remove the metadata. + Loop *IL = LI->getLoopFor(OldLatch); + if (IL && IL->getLoopLatch() != OldLatch) + OldLatch->getTerminator()->setMetadata("llvm.loop", nullptr); } } diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 1bb80be8ef99..0b36e8708a03 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -27,9 +27,7 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -317,18 +315,11 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // predecessors of BB. static BasicBlock * findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { - // If the block doesn't have any PHIs, we don't care about it, since there's - // no point in splitting it. - PHINode *PN = dyn_cast<PHINode>(BB->begin()); - if (!PN) - return nullptr; - // Verify we have exactly one IBR predecessor. // Conservatively bail out if one of the other predecessors is not a "regular" // terminator (that is, not a switch or a br).
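For orientation on the new parameter threaded through below: edges out of an indirectbr cannot be split in the usual way (the successors' addresses are taken), so SplitIndirectBrCriticalEdges instead rewires the target block, and the new flag lets callers skip targets with no PHI nodes, where the rewiring buys nothing. A hedged sketch of a call with the new signature (argument values are examples; F is assumed to be a Function&):

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    // Split only where a PHI makes it worthwhile; the analyses are optional.
    bool Changed = llvm::SplitIndirectBrCriticalEdges(
        F, /*IgnoreBlocksWithoutPHI=*/true, /*BPI=*/nullptr, /*BFI=*/nullptr);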
BasicBlock *IBB = nullptr; - for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { - BasicBlock *PredBB = PN->getIncomingBlock(Pred); + for (BasicBlock *PredBB : predecessors(BB)) { Instruction *PredTerm = PredBB->getTerminator(); switch (PredTerm->getOpcode()) { case Instruction::IndirectBr: @@ -349,6 +340,7 @@ findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { } bool llvm::SplitIndirectBrCriticalEdges(Function &F, + bool IgnoreBlocksWithoutPHI, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI) { // Check whether the function has any indirectbrs, and collect which blocks @@ -370,6 +362,9 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, bool ShouldUpdateAnalysis = BPI && BFI; bool Changed = false; for (BasicBlock *Target : Targets) { + if (IgnoreBlocksWithoutPHI && Target->phis().empty()) + continue; + SmallVector<BasicBlock *> OtherPreds; BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); // If we did not find an indirectbr, or the indirectbr is the only diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 97f11ca71726..c4a58f36c171 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -13,16 +13,17 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Support/TypeSize.h" using namespace llvm; @@ -41,7 +42,6 @@ STATISTIC(NumInaccessibleMemOrArgMemOnly, STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly"); -STATISTIC(NumSExtArg, "Number of arguments inferred as signext"); STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns"); @@ -149,14 +149,6 @@ static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) { return true; } -static bool setSignExtendedArg(Function &F, unsigned ArgNo) { - if (F.hasParamAttribute(ArgNo, Attribute::SExt)) - return false; - F.addParamAttr(ArgNo, Attribute::SExt); - ++NumSExtArg; - return true; -} - static bool setRetNoUndef(Function &F) { if (!F.getReturnType()->isVoidTy() && !F.hasRetAttribute(Attribute::NoUndef)) { @@ -224,15 +216,54 @@ static bool setWillReturn(Function &F) { return true; } -bool llvm::inferLibFuncAttributes(Module *M, StringRef Name, - const TargetLibraryInfo &TLI) { +static bool setAlignedAllocParam(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::AllocAlign)) + return false; + F.addParamAttr(ArgNo, Attribute::AllocAlign); + return true; +} + +static bool setAllocatedPointerParam(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::AllocatedPointer)) + return false; + F.addParamAttr(ArgNo, Attribute::AllocatedPointer); + return true; +} + +static bool setAllocSize(Function &F, unsigned ElemSizeArg, +
Optional<unsigned> NumElemsArg) { + if (F.hasFnAttribute(Attribute::AllocSize)) + return false; + F.addFnAttr(Attribute::getWithAllocSizeArgs(F.getContext(), ElemSizeArg, + NumElemsArg)); + return true; +} + +static bool setAllocFamily(Function &F, StringRef Family) { + if (F.hasFnAttribute("alloc-family")) + return false; + F.addFnAttr("alloc-family", Family); + return true; +} + +static bool setAllocKind(Function &F, AllocFnKind K) { + if (F.hasFnAttribute(Attribute::AllocKind)) + return false; + F.addFnAttr( + Attribute::get(F.getContext(), Attribute::AllocKind, uint64_t(K))); + return true; +} + +bool llvm::inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, + const TargetLibraryInfo &TLI) { Function *F = M->getFunction(Name); if (!F) return false; - return inferLibFuncAttributes(*F, TLI); + return inferNonMandatoryLibFuncAttrs(*F, TLI); } -bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { +bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, + const TargetLibraryInfo &TLI) { LibFunc TheLibFunc; if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc))) return false; @@ -360,6 +391,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setArgNoUndef(F, 1); LLVM_FALLTHROUGH; case LibFunc_strdup: + Changed |= setAllocFamily(F, "malloc"); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); @@ -416,9 +448,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_aligned_alloc: + Changed |= setAlignedAllocParam(F, 0); + Changed |= setAllocSize(F, 1, None); + Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Uninitialized | AllocFnKind::Aligned); + LLVM_FALLTHROUGH; case LibFunc_valloc: case LibFunc_malloc: case LibFunc_vec_malloc: + Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_malloc ? "vec_malloc" + : "malloc"); + Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Uninitialized); + Changed |= setAllocSize(F, 0, None); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -481,6 +521,11 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memalign: + Changed |= setAllocFamily(F, "malloc"); + Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Aligned | + AllocFnKind::Uninitialized); + Changed |= setAllocSize(F, 1, None); + Changed |= setAlignedAllocParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetNoUndef(F); Changed |= setDoesNotThrow(F); @@ -500,8 +545,13 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_realloc: - case LibFunc_vec_realloc: case LibFunc_reallocf: + case LibFunc_vec_realloc: + Changed |= setAllocFamily( + F, TheLibFunc == LibFunc_vec_realloc ?
"vec_malloc" : "malloc"); + Changed |= setAllocKind(F, AllocFnKind::Realloc); + Changed |= setAllocatedPointerParam(F, 0); + Changed |= setAllocSize(F, 1, None); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setRetNoUndef(F); Changed |= setDoesNotThrow(F); @@ -575,6 +625,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_calloc: case LibFunc_vec_calloc: + Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_calloc ? "vec_malloc" + : "malloc"); + Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Zeroed); + Changed |= setAllocSize(F, 0, 1); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -633,6 +687,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_free: case LibFunc_vec_free: + Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_free ? "vec_malloc" + : "malloc"); + Changed |= setAllocKind(F, AllocFnKind::Free); + Changed |= setAllocatedPointerParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -1041,7 +1099,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_ldexp: case LibFunc_ldexpf: case LibFunc_ldexpl: - Changed |= setSignExtendedArg(F, 1); Changed |= setWillReturn(F); return Changed; case LibFunc_abs: @@ -1178,34 +1235,179 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, +static void setArgExtAttr(Function &F, unsigned ArgNo, + const TargetLibraryInfo &TLI, bool Signed = true) { + Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed); + if (ExtAttr != Attribute::None && !F.hasParamAttribute(ArgNo, ExtAttr)) + F.addParamAttr(ArgNo, ExtAttr); +} + +// Modeled after X86TargetLowering::markLibCallAttributes. +static void markRegisterParameterAttributes(Function *F) { + if (!F->arg_size() || F->isVarArg()) + return; + + const CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) + return; + + const Module *M = F->getParent(); + unsigned N = M->getNumberRegisterParameters(); + if (!N) + return; + + const DataLayout &DL = M->getDataLayout(); + + for (Argument &A : F->args()) { + Type *T = A.getType(); + if (!T->isIntOrPtrTy()) + continue; + + const TypeSize &TS = DL.getTypeAllocSize(T); + if (TS > 8) + continue; + + assert(TS <= 4 && "Need to account for parameters larger than word size"); + const unsigned NumRegs = TS > 4 ? 2 : 1; + if (N < NumRegs) + return; + + N -= NumRegs; + F->addParamAttr(A.getArgNo(), Attribute::InReg); + } +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T, + AttributeList AttributeList) { + assert(TLI.has(TheLibFunc) && + "Creating call to non-existing library function."); + StringRef Name = TLI.getName(TheLibFunc); + FunctionCallee C = M->getOrInsertFunction(Name, T, AttributeList); + + // Make sure any mandatory argument attributes are added. + + // Any outgoing i32 argument should be handled with setArgExtAttr() which + // will add an extension attribute if the target ABI requires it. Adding + // argument extensions is typically done by the front end but when an + // optimizer is building a library call on its own it has to take care of + // this. 
Each such generated function must be handled here with sign or + // zero extensions as needed. F is retrieved with cast<> because we require + // the caller to have called isLibFuncEmittable() first. + Function *F = cast<Function>(C.getCallee()); + assert(F->getFunctionType() == T && "Function type does not match."); + switch (TheLibFunc) { + case LibFunc_fputc: + case LibFunc_putchar: + setArgExtAttr(*F, 0, TLI); + break; + case LibFunc_ldexp: + case LibFunc_ldexpf: + case LibFunc_ldexpl: + case LibFunc_memchr: + case LibFunc_memrchr: + case LibFunc_strchr: + setArgExtAttr(*F, 1, TLI); + break; + case LibFunc_memccpy: + setArgExtAttr(*F, 2, TLI); + break; + + // These are functions that are known to not need any argument extension + // on any target: a size_t argument (which may be an i32 on some targets) + // should not trigger the assert below. + case LibFunc_bcmp: + case LibFunc_calloc: + case LibFunc_fwrite: + case LibFunc_malloc: + case LibFunc_memcmp: + case LibFunc_memcpy_chk: + case LibFunc_mempcpy: + case LibFunc_memset_pattern16: + case LibFunc_snprintf: + case LibFunc_stpncpy: + case LibFunc_strlcat: + case LibFunc_strlcpy: + case LibFunc_strncat: + case LibFunc_strncmp: + case LibFunc_strncpy: + case LibFunc_vsnprintf: + break; + + default: +#ifndef NDEBUG + for (unsigned i = 0; i < T->getNumParams(); i++) + assert(!isa<IntegerType>(T->getParamType(i)) && + "Unhandled integer argument."); +#endif + break; + } + + markRegisterParameterAttributes(F); + + return C; +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T) { + return getOrInsertLibFunc(M, TLI, TheLibFunc, T, AttributeList()); +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + LibFunc TheLibFunc) { + StringRef FuncName = TLI->getName(TheLibFunc); + if (!TLI->has(TheLibFunc)) + return false; + + // Check if the Module already has a GlobalValue with the same name, in + // which case it must be a Function with the expected type.
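To illustrate why this mandatory-attribute handling matters: on targets whose ABI requires i32 arguments to be sign- or zero-extended, a libcall synthesized by the optimizer must carry the same signext/zeroext attributes the front end would have emitted. A usage sketch modeled on the emitPutChar changes later in this patch (M, TLI, B, and Char are assumed to be in scope):

    // getOrInsertLibFunc (rather than Module::getOrInsertFunction) adds any
    // extension attribute the target ABI mandates for the i32 argument.
    if (isLibFuncEmittable(M, TLI, LibFunc_putchar)) {
      FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar,
                                                  B.getInt32Ty(), B.getInt32Ty());
      B.CreateCall(PutChar,
                   B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned=*/true));
    }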
+ if (GlobalValue *GV = M->getNamedValue(FuncName)) { + if (auto *F = dyn_cast(GV)) + return TLI->isValidProtoForLibFunc(*F->getFunctionType(), TheLibFunc, *M); + return false; + } + + return true; +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + StringRef Name) { + LibFunc TheLibFunc; + return TLI->getLibFunc(Name, TheLibFunc) && + isLibFuncEmittable(M, TLI, TheLibFunc); +} + +bool llvm::hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case Type::HalfTyID: return false; case Type::FloatTyID: - return TLI->has(FloatFn); + return isLibFuncEmittable(M, TLI, FloatFn); case Type::DoubleTyID: - return TLI->has(DoubleFn); + return isLibFuncEmittable(M, TLI, DoubleFn); default: - return TLI->has(LongDoubleFn); + return isLibFuncEmittable(M, TLI, LongDoubleFn); } } -StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFn(const Module *M, const TargetLibraryInfo *TLI, + Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, LibFunc &TheLibFunc) { + assert(hasFloatFn(M, TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { case Type::HalfTyID: llvm_unreachable("No name for HalfTy!"); case Type::FloatTyID: + TheLibFunc = FloatFn; return TLI->getName(FloatFn); case Type::DoubleTyID: + TheLibFunc = DoubleFn; return TLI->getName(DoubleFn); default: + TheLibFunc = LongDoubleFn; return TLI->getName(LongDoubleFn); } } @@ -1222,14 +1424,14 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType, ArrayRef Operands, IRBuilderBase &B, const TargetLibraryInfo *TLI, bool IsVaArgs = false) { - if (!TLI->has(TheLibFunc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, TheLibFunc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FuncName = TLI->getName(TheLibFunc); FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs); - FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, FuncType); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); CallInst *CI = B.CreateCall(Callee, Operands, FuncName); if (const Function *F = dyn_cast(Callee.getCallee()->stripPointerCasts())) @@ -1298,16 +1500,16 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B, Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_memcpy_chk)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_memcpy_chk)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); AttributeList AS; AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee MemCpy = M->getOrInsertFunction( - "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), + FunctionCallee MemCpy = getOrInsertLibFunc(M, *TLI, LibFunc_memcpy_chk, + AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), 
DL.getIntPtrType(Context)); Dst = castToCStr(Dst, B); @@ -1337,6 +1539,15 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B, {castToCStr(Ptr, B), Val, Len}, B, TLI); } +Value *llvm::emitMemRChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { + LLVMContext &Context = B.GetInsertBlock()->getContext(); + return emitLibCall( + LibFunc_memrchr, B.getInt8PtrTy(), + {B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)}, + {castToCStr(Ptr, B), Val, Len}, B, TLI); +} + Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); @@ -1441,14 +1652,15 @@ static void appendTypeSuffix(Value *Op, StringRef &Name, } } -static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, - IRBuilderBase &B, - const AttributeList &Attrs) { +static Value *emitUnaryFloatFnCallHelper(Value *Op, LibFunc TheLibFunc, + StringRef Name, IRBuilderBase &B, + const AttributeList &Attrs, + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = - M->getOrInsertFunction(Name, Op->getType(), Op->getType()); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op->getType(), + Op->getType()); CallInst *CI = B.CreateCall(Callee, Op, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1463,12 +1675,16 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, return CI; } -Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B, +Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { SmallString<20> NameBuffer; appendTypeSuffix(Op, Name, NameBuffer); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + LibFunc TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, @@ -1476,23 +1692,25 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. 
- StringRef Name = getFloatFnName(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + LibFunc TheLibFunc, StringRef Name, IRBuilderBase &B, const AttributeList &Attrs, - const TargetLibraryInfo *TLI = nullptr) { + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), - Op1->getType(), Op2->getType()); - if (TLI != nullptr) - inferLibFuncAttributes(M, Name, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op1->getType(), + Op1->getType(), Op2->getType()); + inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1507,15 +1725,19 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, return CI; } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilderBase &B, +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); SmallString<20> NameBuffer; appendTypeSuffix(Op1, Name, NameBuffer); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); + LibFunc TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, @@ -1524,22 +1746,24 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. 
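As a usage note for the type-dispatching helpers being reworked here: callers hand in a double/float/long double LibFunc triple and the helper picks the variant matching the operand's type, e.g. sin vs. sinf vs. sinl. A hedged sketch (assuming X is a float-typed Value, TLI a TargetLibraryInfo pointer, and B an IRBuilder):

    // Emits a call to sinf(x), sin(x), or sinl(x) depending on X's type,
    // provided the TargetLibraryInfo reports the function as available.
    Value *R = emitUnaryFloatFnCall(X, TLI, LibFunc_sin, LibFunc_sinf,
                                    LibFunc_sinl, B, AttributeList());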
- StringRef Name = getFloatFnName(TLI, Op1->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op1->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI); + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_putchar)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_putchar)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutCharName = TLI->getName(LibFunc_putchar); - FunctionCallee PutChar = - M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty()); - inferLibFuncAttributes(M, PutCharName, *TLI); + FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar, + B.getInt32Ty(), B.getInt32Ty()); + inferNonMandatoryLibFuncAttrs(M, PutCharName, *TLI); CallInst *CI = B.CreateCall(PutChar, B.CreateIntCast(Char, B.getInt32Ty(), @@ -1555,14 +1779,14 @@ Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_puts)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_puts)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutsName = TLI->getName(LibFunc_puts); - FunctionCallee PutS = - M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy()); - inferLibFuncAttributes(M, PutsName, *TLI); + FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, LibFunc_puts, B.getInt32Ty(), + B.getInt8PtrTy()); + inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI); CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName); if (const Function *F = dyn_cast(PutS.getCallee()->stripPointerCasts())) @@ -1572,15 +1796,15 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fputc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutcName = TLI->getName(LibFunc_fputc); - FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(), - B.getInt32Ty(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputc, B.getInt32Ty(), + B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutcName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutcName, *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName); @@ -1593,15 +1817,15 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputs)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fputs)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutsName = TLI->getName(LibFunc_fputs); - FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), - B.getInt8PtrTy(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputs, B.getInt32Ty(), + B.getInt8PtrTy(), File->getType()); if 
(File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutsName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutsName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName); if (const Function *Fn = @@ -1612,18 +1836,18 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fwrite)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fwrite)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc_fwrite); - FunctionCallee F = M->getOrInsertFunction( - FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, + DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FWriteName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, ConstantInt::get(DL.getIntPtrType(Context), 1), File}); @@ -1636,15 +1860,15 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_malloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_malloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef MallocName = TLI->getName(LibFunc_malloc); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(), - DL.getIntPtrType(Context)); - inferLibFuncAttributes(M, MallocName, *TLI); + FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc, + B.getInt8PtrTy(), DL.getIntPtrType(Context)); + inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI); CallInst *CI = B.CreateCall(Malloc, Num, MallocName); if (const Function *F = @@ -1656,16 +1880,16 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B, const TargetLibraryInfo &TLI) { - if (!TLI.has(LibFunc_calloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, &TLI, LibFunc_calloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef CallocName = TLI.getName(LibFunc_calloc); const DataLayout &DL = M->getDataLayout(); IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); - FunctionCallee Calloc = - M->getOrInsertFunction(CallocName, B.getInt8PtrTy(), PtrType, PtrType); - inferLibFuncAttributes(M, CallocName, TLI); + FunctionCallee Calloc = getOrInsertLibFunc(M, TLI, LibFunc_calloc, + B.getInt8PtrTy(), PtrType, PtrType); + inferNonMandatoryLibFuncAttrs(M, CallocName, TLI); CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName); if (const auto *F = diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp index ac3839f2a4ab..1840f26add2d 100644 --- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp +++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp @@ -14,6 +14,9 @@ #include 
"llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/IR/Constants.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 56b6e4bc46a5..e530afc277db 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -279,8 +279,8 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) { /// ; The original call instruction stays in its original block. /// %t0 = musttail call i32 %ptr() /// ret %t0 -static CallBase &versionCallSite(CallBase &CB, Value *Callee, - MDNode *BranchWeights) { +CallBase &llvm::versionCallSite(CallBase &CB, Value *Callee, + MDNode *BranchWeights) { IRBuilder<> Builder(&CB); CallBase *OrigInst = &CB; diff --git a/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp b/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp index 6b01c0c71d00..f229d4bf14e9 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp @@ -30,8 +30,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CanonicalizeAliases.h" -#include "llvm/IR/Operator.h" -#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/Constants.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 049c7d113521..a1ee3df907ec 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -29,7 +29,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/IVDescriptors.h" diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 86413df664a0..8f053cd56e0e 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -23,7 +22,6 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -324,6 +322,9 @@ struct PruningFunctionCloner { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; + bool HostFuncIsStrictFP; + + Instruction *cloneInstruction(BasicBlock::const_iterator II); public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, @@ -331,7 +332,10 @@ public: const char *nameSuffix, ClonedCodeInfo *codeInfo) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), - CodeInfo(codeInfo) {} + CodeInfo(codeInfo) { + HostFuncIsStrictFP = + newFunc->getAttributes().hasFnAttr(Attribute::StrictFP); + } /// The specified block is found to be reachable, 
clone it and /// anything that it can reach. @@ -340,6 +344,89 @@ public: }; } // namespace +static bool hasRoundingModeOperand(Intrinsic::ID CIID) { + switch (CIID) { +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::INTRINSIC: \ + return ROUND_MODE == 1; +#define FUNCTION INSTRUCTION +#include "llvm/IR/ConstrainedOps.def" + default: + llvm_unreachable("Unexpected constrained intrinsic id"); + } +} + +Instruction * +PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) { + const Instruction &OldInst = *II; + Instruction *NewInst = nullptr; + if (HostFuncIsStrictFP) { + Intrinsic::ID CIID = getConstrainedIntrinsicID(OldInst); + if (CIID != Intrinsic::not_intrinsic) { + // Instead of cloning the instruction, a call to constrained intrinsic + // should be created. + // Assume the first arguments of constrained intrinsics are the same as + // the operands of original instruction. + + // Determine overloaded types of the intrinsic. + SmallVector TParams; + SmallVector Descriptor; + getIntrinsicInfoTableEntries(CIID, Descriptor); + for (unsigned I = 0, E = Descriptor.size(); I != E; ++I) { + Intrinsic::IITDescriptor Operand = Descriptor[I]; + switch (Operand.Kind) { + case Intrinsic::IITDescriptor::Argument: + if (Operand.getArgumentKind() != + Intrinsic::IITDescriptor::AK_MatchType) { + if (I == 0) + TParams.push_back(OldInst.getType()); + else + TParams.push_back(OldInst.getOperand(I - 1)->getType()); + } + break; + case Intrinsic::IITDescriptor::SameVecWidthArgument: + ++I; + break; + default: + break; + } + } + + // Create intrinsic call. + LLVMContext &Ctx = NewFunc->getContext(); + Function *IFn = + Intrinsic::getDeclaration(NewFunc->getParent(), CIID, TParams); + SmallVector Args; + unsigned NumOperands = OldInst.getNumOperands(); + if (isa(OldInst)) + --NumOperands; + for (unsigned I = 0; I < NumOperands; ++I) { + Value *Op = OldInst.getOperand(I); + Args.push_back(Op); + } + if (const auto *CmpI = dyn_cast(&OldInst)) { + FCmpInst::Predicate Pred = CmpI->getPredicate(); + StringRef PredName = FCmpInst::getPredicateName(Pred); + Args.push_back(MetadataAsValue::get(Ctx, MDString::get(Ctx, PredName))); + } + + // The last arguments of a constrained intrinsic are metadata that + // represent rounding mode (absents in some intrinsics) and exception + // behavior. The inlined function uses default settings. + if (hasRoundingModeOperand(CIID)) + Args.push_back( + MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.tonearest"))); + Args.push_back( + MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"))); + + NewInst = CallInst::Create(IFn, Args, OldInst.getName() + ".strict"); + } + } + if (!NewInst) + NewInst = II->clone(); + return NewInst; +} + /// The specified block is found to be reachable, clone it and /// anything that it can reach. void PruningFunctionCloner::CloneBlock( @@ -379,7 +466,14 @@ void PruningFunctionCloner::CloneBlock( for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { - Instruction *NewInst = II->clone(); + Instruction *NewInst = cloneInstruction(II); + + if (HostFuncIsStrictFP) { + // All function calls in the inlined function must get 'strictfp' + // attribute to prevent undesirable optimizations. + if (auto *Call = dyn_cast(NewInst)) + Call->addFnAttr(Attribute::StrictFP); + } // Eagerly remap operands to the newly cloned instruction, except for PHI // nodes for which we defer processing until we update the CFG. 
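[The cloning path above re-emits ordinary FP instructions as constrained intrinsics when the host function is strictfp, appending "round.tonearest" and "fpexcept.ignore" metadata. A hedged sketch of the equivalent construction through IRBuilder; illustrative only, since the cloner builds the call directly via Intrinsic::getDeclaration:]

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    // Roughly what the new path produces for a plain 'fadd' cloned into a
    // strictfp host: a constrained intrinsic with the default FP
    // environment the cloner assumes.
    static CallInst *emitConstrainedFAddSketch(IRBuilderBase &B, Value *L,
                                               Value *R) {
      B.setIsFPConstrained(true); // the host function carries strictfp
      return B.CreateConstrainedFPBinOp(
          Intrinsic::experimental_constrained_fadd, L, R,
          /*FMFSource=*/nullptr, /*Name=*/"", /*FPMathTag=*/nullptr,
          RoundingMode::NearestTiesToEven, fp::ebIgnore);
    }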
@@ -391,7 +485,7 @@ void PruningFunctionCloner::CloneBlock( // a mapping to that value rather than inserting a new instruction into // the basic block. if (Value *V = - SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) { + simplifyInstruction(NewInst, BB->getModule()->getDataLayout())) { // On the off-chance that this simplifies to an instruction in the old // function, map it back into the new function. if (NewFunc != OldFunc) @@ -674,7 +768,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, continue; // See if this instruction simplifies. - Value *SimpleV = SimplifyInstruction(I, DL); + Value *SimpleV = simplifyInstruction(I, DL); if (!SimpleV) continue; diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp index 57c273a0e3c5..55cda0f11e47 100644 --- a/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -11,13 +11,16 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; +namespace llvm { +class Constant; +} + static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) { const Comdat *SC = Src->getComdat(); if (!SC) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index cec159f6a448..f94d854f7ee8 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -53,7 +53,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" -#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" @@ -62,12 +61,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include #include #include #include -#include #include #include @@ -249,9 +246,10 @@ CodeExtractor::CodeExtractor(ArrayRef BBs, DominatorTree *DT, bool AggregateArgs, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, AssumptionCache *AC, bool AllowVarArgs, bool AllowAlloca, - std::string Suffix) + BasicBlock *AllocationBlock, std::string Suffix) : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), - BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs), + BPI(BPI), AC(AC), AllocationBlock(AllocationBlock), + AllowVarArgs(AllowVarArgs), Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)), Suffix(Suffix) {} @@ -260,7 +258,7 @@ CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs, BranchProbabilityInfo *BPI, AssumptionCache *AC, std::string Suffix) : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), - BPI(BPI), AC(AC), AllowVarArgs(false), + BPI(BPI), AC(AC), AllocationBlock(nullptr), AllowVarArgs(false), Blocks(buildExtractionBlockSet(L.getBlocks(), &DT, /* AllowVarArgs */ false, /* AllowAlloca */ false)), @@ -922,6 +920,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::StackAlignment: case Attribute::WillReturn: case Attribute::WriteOnly: + case Attribute::AllocKind: + case Attribute::PresplitCoroutine: continue; // Those attributes should be safe to propagate to the extracted function. 
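[CodeExtractor now takes an optional AllocationBlock so clients can decide where the aggregated-argument struct alloca is placed. A construction sketch under assumed inputs; the wrapper name is illustrative, and every analysis pointer may stay null as before:]

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Transforms/Utils/CodeExtractor.h"
    using namespace llvm;

    // Outline Blocks into a new function, directing the "structArg"
    // alloca into AllocBB rather than the enclosing function's entry.
    static Function *extractWithAllocaBlock(ArrayRef<BasicBlock *> Blocks,
                                            BasicBlock *AllocBB) {
      CodeExtractor CE(Blocks, /*DT=*/nullptr, /*AggregateArgs=*/true,
                       /*BFI=*/nullptr, /*BPI=*/nullptr, /*AC=*/nullptr,
                       /*AllowVarArgs=*/false, /*AllowAlloca=*/false,
                       /*AllocationBlock=*/AllocBB, /*Suffix=*/"outlined");
      if (!CE.isEligible())
        return nullptr;
      CodeExtractorAnalysisCache CEAC(*Blocks.front()->getParent());
      return CE.extractCodeRegion(CEAC);
    }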
   case Attribute::AlwaysInline:
@@ -939,6 +939,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
   case Attribute::NonLazyBind:
   case Attribute::NoRedZone:
   case Attribute::NoUnwind:
+  case Attribute::NoSanitizeBounds:
   case Attribute::NoSanitizeCoverage:
   case Attribute::NullPointerIsValid:
   case Attribute::OptForFuzzing:
@@ -964,6 +965,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
     break;
   // These attributes cannot be applied to functions.
   case Attribute::Alignment:
+  case Attribute::AllocatedPointer:
+  case Attribute::AllocAlign:
   case Attribute::ByVal:
   case Attribute::Dereferenceable:
   case Attribute::DereferenceableOrNull:
@@ -1190,9 +1193,10 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
     // Allocate a struct at the beginning of this function
     StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
-    Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
-                            "structArg",
-                            &codeReplacer->getParent()->front().front());
+    Struct = new AllocaInst(
+        StructArgTy, DL.getAllocaAddrSpace(), nullptr, "structArg",
+        AllocationBlock ? &*AllocationBlock->getFirstInsertionPt()
+                        : &codeReplacer->getParent()->front().front());
     params.push_back(Struct);

     // Store aggregated inputs in the struct.
@@ -1771,7 +1775,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
   // Update the entry count of the function.
   if (BFI) {
     auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
-    if (Count.hasValue())
+    if (Count)
       newFunction->setEntryCount(
           ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
     BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index dfb9f608eab2..1ff0f148b3a9 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -40,11 +40,20 @@
 #include "llvm/Transforms/Utils/CodeLayout.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"

 using namespace llvm;

 #define DEBUG_TYPE "code-layout"

+cl::opt<bool> EnableExtTspBlockPlacement(
+    "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
+    cl::desc("Enable machine block placement based on the ext-tsp model, "
+             "optimizing I-cache utilization."));
+
+cl::opt<bool> ApplyExtTspWithoutProfile(
+    "ext-tsp-apply-without-profile",
+    cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
+    cl::init(true), cl::Hidden);
+
 // Algorithm-specific constants. The values are tuned for the best performance
 // of large-scale front-end bound binaries.
 static cl::opt<unsigned>
@@ -63,6 +72,12 @@ static cl::opt<unsigned> BackwardDistance(
     "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));

+// The maximum size of a chain created by the algorithm. The size is bounded
+// so that the algorithm can efficiently process extremely large instances.
+static cl::opt<unsigned>
+    MaxChainSize("ext-tsp-max-chain-size", cl::Hidden, cl::init(4096),
+                 cl::desc("The maximum size of a chain to create."));
+
 // The maximum size of a chain for splitting. Larger values of the threshold
 // may yield better quality at the cost of worse run-time.
 static cl::opt<unsigned> ChainSplitThreshold(
@@ -115,7 +130,7 @@ enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
 /// together with the corresponding merge 'type' and 'offset'.
class MergeGainTy { public: - explicit MergeGainTy() {} + explicit MergeGainTy() = default; explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType) : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} @@ -142,7 +157,6 @@ private: MergeTypeTy MergeType{MergeTypeTy::X_Y}; }; -class Block; class Jump; class Chain; class ChainEdge; @@ -223,6 +237,8 @@ public: const std::vector &blocks() const { return Blocks; } + size_t numBlocks() const { return Blocks.size(); } + const std::vector> &edges() const { return Edges; } @@ -499,7 +515,7 @@ private: AllEdges.reserve(AllJumps.size()); for (auto &Block : AllBlocks) { for (auto &Jump : Block.OutJumps) { - const auto SuccBlock = Jump->Target; + auto SuccBlock = Jump->Target; auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); // this edge is already present in the graph if (CurEdge != nullptr) { @@ -589,6 +605,10 @@ private: if (ChainPred == ChainSucc) continue; + // Stop early if the combined chain violates the maximum allowed size + if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) + continue; + // Compute the gain of merging the two chains auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge); if (CurGain.score() <= EPS) diff --git a/llvm/lib/Transforms/Utils/CtorUtils.cpp b/llvm/lib/Transforms/Utils/CtorUtils.cpp index 069a86f6ab33..c997f39508e3 100644 --- a/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define DEBUG_TYPE "ctor_utils" @@ -62,21 +63,20 @@ static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemov /// Given a llvm.global_ctors list that we can understand, /// return a list of the functions and null terminator as a vector. -static std::vector parseGlobalCtors(GlobalVariable *GV) { - if (GV->getInitializer()->isNullValue()) - return std::vector(); +static std::vector> +parseGlobalCtors(GlobalVariable *GV) { ConstantArray *CA = cast(GV->getInitializer()); - std::vector Result; + std::vector> Result; Result.reserve(CA->getNumOperands()); for (auto &V : CA->operands()) { ConstantStruct *CS = cast(V); - Result.push_back(dyn_cast(CS->getOperand(1))); + Result.emplace_back(cast(CS->getOperand(0))->getZExtValue(), + dyn_cast(CS->getOperand(1))); } return Result; } -/// Find the llvm.global_ctors list, verifying that all initializers have an -/// init priority of 65535. +/// Find the llvm.global_ctors list. static GlobalVariable *findGlobalCtors(Module &M) { GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); if (!GV) @@ -87,9 +87,11 @@ static GlobalVariable *findGlobalCtors(Module &M) { if (!GV->hasUniqueInitializer()) return nullptr; - if (isa(GV->getInitializer())) - return GV; - ConstantArray *CA = cast(GV->getInitializer()); + // If there are no ctors, then the initializer might be null/undef/poison. + // Ignore anything but an array. + ConstantArray *CA = dyn_cast(GV->getInitializer()); + if (!CA) + return nullptr; for (auto &V : CA->operands()) { if (isa(V)) @@ -98,54 +100,47 @@ static GlobalVariable *findGlobalCtors(Module &M) { if (isa(CS->getOperand(1))) continue; - // Must have a function or null ptr. - if (!isa(CS->getOperand(1))) - return nullptr; - - // Init priority must be standard. - ConstantInt *CI = cast(CS->getOperand(0)); - if (CI->getZExtValue() != 65535) + // Can only handle global constructors with no arguments. 
+ Function *F = dyn_cast(CS->getOperand(1)); + if (!F || F->arg_size() != 0) return nullptr; } - return GV; } /// Call "ShouldRemove" for every entry in M's global_ctor list and remove the /// entries for which it returns true. Return true if anything changed. bool llvm::optimizeGlobalCtorsList( - Module &M, function_ref ShouldRemove) { + Module &M, function_ref ShouldRemove) { GlobalVariable *GlobalCtors = findGlobalCtors(M); if (!GlobalCtors) return false; - std::vector Ctors = parseGlobalCtors(GlobalCtors); + std::vector> Ctors = + parseGlobalCtors(GlobalCtors); if (Ctors.empty()) return false; bool MadeChange = false; - // Loop over global ctors, optimizing them when we can. - unsigned NumCtors = Ctors.size(); - BitVector CtorsToRemove(NumCtors); - for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) { - Function *F = Ctors[i]; - // Found a null terminator in the middle of the list, prune off the rest of - // the list. + BitVector CtorsToRemove(Ctors.size()); + std::vector CtorsByPriority(Ctors.size()); + std::iota(CtorsByPriority.begin(), CtorsByPriority.end(), 0); + stable_sort(CtorsByPriority, [&](size_t LHS, size_t RHS) { + return Ctors[LHS].first < Ctors[RHS].first; + }); + for (unsigned CtorIndex : CtorsByPriority) { + const uint32_t Priority = Ctors[CtorIndex].first; + Function *F = Ctors[CtorIndex].second; if (!F) continue; LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); - // We cannot simplify external ctor functions. - if (F->empty()) - continue; - // If we can evaluate the ctor at compile time, do. - if (ShouldRemove(F)) { - Ctors[i] = nullptr; - CtorsToRemove.set(i); - NumCtors--; + if (ShouldRemove(Priority, F)) { + Ctors[CtorIndex].second = nullptr; + CtorsToRemove.set(CtorIndex); MadeChange = true; continue; } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 589622d69578..205f7a7d9ed2 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -37,12 +37,16 @@ namespace { cl::opt Quiet("debugify-quiet", cl::desc("Suppress verbose debugify output")); +cl::opt DebugifyFunctionsLimit( + "debugify-func-limit", + cl::desc("Set max number of processed functions per pass."), + cl::init(UINT_MAX)); + enum class Level { Locations, LocationsAndVariables }; -// Used for the synthetic mode only. 
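[With the priority made explicit, callers of optimizeGlobalCtorsList now receive it alongside the function and see ctors in ascending priority order. A usage sketch with an illustrative removal policy; real callers such as GlobalOpt decide by actually evaluating the ctor:]

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/CtorUtils.h"
    using namespace llvm;

    // Drop default-priority constructors whose body is a lone 'ret'.
    static bool dropTrivialCtors(Module &M) {
      return optimizeGlobalCtorsList(M, [](uint32_t Priority, Function *F) {
        // The callback now sees the priority; F is non-null but may be a
        // bare declaration, so check for a body before inspecting it.
        if (Priority != 65535 || F->empty())
          return false;
        const BasicBlock &Entry = F->front();
        return Entry.size() == 1 && isa<ReturnInst>(Entry.front());
      });
    }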
cl::opt DebugifyLevel( "debugify-level", cl::desc("Kind of debug info to add"), cl::values(clEnumValN(Level::Locations, "locations", "Locations only"), @@ -210,15 +214,15 @@ bool llvm::applyDebugifyMetadata( static bool applyDebugify(Function &F, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, StringRef NameOfWrappedPass = "") { Module &M = *F.getParent(); auto FuncIt = F.getIterator(); if (Mode == DebugifyMode::SyntheticDebugInfo) return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), "FunctionDebugify: ", /*ApplyToMF*/ nullptr); - assert(DIPreservationMap); - return collectDebugInfoMetadata(M, M.functions(), *DIPreservationMap, + assert(DebugInfoBeforePass); + return collectDebugInfoMetadata(M, M.functions(), *DebugInfoBeforePass, "FunctionDebugify (original debuginfo)", NameOfWrappedPass); } @@ -226,12 +230,12 @@ applyDebugify(Function &F, static bool applyDebugify(Module &M, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, StringRef NameOfWrappedPass = "") { if (Mode == DebugifyMode::SyntheticDebugInfo) return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ", /*ApplyToMF*/ nullptr); - return collectDebugInfoMetadata(M, M.functions(), *DIPreservationMap, + return collectDebugInfoMetadata(M, M.functions(), *DebugInfoBeforePass, "ModuleDebugify (original debuginfo)", NameOfWrappedPass); } @@ -267,7 +271,7 @@ bool llvm::stripDebugifyMetadata(Module &M) { SmallVector Flags(NMD->operands()); NMD->clearOperands(); for (MDNode *Flag : Flags) { - MDString *Key = dyn_cast_or_null(Flag->getOperand(1)); + auto *Key = cast(Flag->getOperand(1)); if (Key->getString() == "Debug Info Version") { Changed = true; continue; @@ -283,32 +287,37 @@ bool llvm::stripDebugifyMetadata(Module &M) { bool llvm::collectDebugInfoMetadata(Module &M, iterator_range Functions, - DebugInfoPerPassMap &DIPreservationMap, + DebugInfoPerPass &DebugInfoBeforePass, StringRef Banner, StringRef NameOfWrappedPass) { LLVM_DEBUG(dbgs() << Banner << ": (before) " << NameOfWrappedPass << '\n'); - // Clear the map with the debug info before every single pass. - DIPreservationMap.clear(); - if (!M.getNamedMetadata("llvm.dbg.cu")) { dbg() << Banner << ": Skipping module without debug info\n"; return false; } + uint64_t FunctionsCnt = DebugInfoBeforePass.DIFunctions.size(); // Visit each instruction. for (Function &F : Functions) { + // Use DI collected after previous Pass (when -debugify-each is used). + if (DebugInfoBeforePass.DIFunctions.count(&F)) + continue; + if (isFunctionSkipped(F)) continue; + // Stop collecting DI if the Functions number reached the limit. + if (++FunctionsCnt >= DebugifyFunctionsLimit) + break; // Collect the DISubprogram. auto *SP = F.getSubprogram(); - DIPreservationMap[NameOfWrappedPass].DIFunctions.insert({F.getName(), SP}); + DebugInfoBeforePass.DIFunctions.insert({&F, SP}); if (SP) { LLVM_DEBUG(dbgs() << " Collecting subprogram: " << *SP << '\n'); for (const DINode *DN : SP->getRetainedNodes()) { if (const auto *DV = dyn_cast(DN)) { - DIPreservationMap[NameOfWrappedPass].DIVariables[DV] = 0; + DebugInfoBeforePass.DIVariables[DV] = 0; } } } @@ -320,20 +329,22 @@ bool llvm::collectDebugInfoMetadata(Module &M, if (isa(I)) continue; - // Collect dbg.values and dbg.declares. - if (auto *DVI = dyn_cast(&I)) { - if (!SP) - continue; - // Skip inlined variables. 
- if (I.getDebugLoc().getInlinedAt()) + // Cllect dbg.values and dbg.declare. + if (DebugifyLevel > Level::Locations) { + if (auto *DVI = dyn_cast(&I)) { + if (!SP) + continue; + // Skip inlined variables. + if (I.getDebugLoc().getInlinedAt()) + continue; + // Skip undef values. + if (DVI->isUndef()) + continue; + + auto *Var = DVI->getVariable(); + DebugInfoBeforePass.DIVariables[Var]++; continue; - // Skip undef values. - if (DVI->isUndef()) - continue; - - auto *Var = DVI->getVariable(); - DIPreservationMap[NameOfWrappedPass].DIVariables[Var]++; - continue; + } } // Skip debug instructions other than dbg.value and dbg.declare. @@ -341,11 +352,11 @@ bool llvm::collectDebugInfoMetadata(Module &M, continue; LLVM_DEBUG(dbgs() << " Collecting info for inst: " << I << '\n'); - DIPreservationMap[NameOfWrappedPass].InstToDelete.insert({&I, &I}); + DebugInfoBeforePass.InstToDelete.insert({&I, &I}); const DILocation *Loc = I.getDebugLoc().get(); bool HasLoc = Loc != nullptr; - DIPreservationMap[NameOfWrappedPass].DILocations.insert({&I, HasLoc}); + DebugInfoBeforePass.DILocations.insert({&I, HasLoc}); } } } @@ -367,12 +378,12 @@ static bool checkFunctions(const DebugFnMap &DIFunctionsBefore, if (SPIt == DIFunctionsBefore.end()) { if (ShouldWriteIntoJSON) Bugs.push_back(llvm::json::Object({{"metadata", "DISubprogram"}, - {"name", F.first}, + {"name", F.first->getName()}, {"action", "not-generate"}})); else dbg() << "ERROR: " << NameOfWrappedPass - << " did not generate DISubprogram for " << F.first << " from " - << FileNameFromCU << '\n'; + << " did not generate DISubprogram for " << F.first->getName() + << " from " << FileNameFromCU << '\n'; Preserved = false; } else { auto SP = SPIt->second; @@ -382,11 +393,11 @@ static bool checkFunctions(const DebugFnMap &DIFunctionsBefore, // a debug info bug. if (ShouldWriteIntoJSON) Bugs.push_back(llvm::json::Object({{"metadata", "DISubprogram"}, - {"name", F.first}, + {"name", F.first->getName()}, {"action", "drop"}})); else dbg() << "ERROR: " << NameOfWrappedPass << " dropped DISubprogram of " - << F.first << " from " << FileNameFromCU << '\n'; + << F.first->getName() << " from " << FileNameFromCU << '\n'; Preserved = false; } } @@ -515,7 +526,7 @@ static void writeJSON(StringRef OrigDIVerifyBugsReportFilePath, bool llvm::checkDebugInfoMetadata(Module &M, iterator_range Functions, - DebugInfoPerPassMap &DIPreservationMap, + DebugInfoPerPass &DebugInfoBeforePass, StringRef Banner, StringRef NameOfWrappedPass, StringRef OrigDIVerifyBugsReportFilePath) { LLVM_DEBUG(dbgs() << Banner << ": (after) " << NameOfWrappedPass << '\n'); @@ -526,24 +537,26 @@ bool llvm::checkDebugInfoMetadata(Module &M, } // Map the debug info holding DIs after a pass. - DebugInfoPerPassMap DIPreservationAfter; + DebugInfoPerPass DebugInfoAfterPass; // Visit each instruction. for (Function &F : Functions) { if (isFunctionSkipped(F)) continue; + // Don't process functions without DI collected before the Pass. + if (!DebugInfoBeforePass.DIFunctions.count(&F)) + continue; // TODO: Collect metadata other than DISubprograms. // Collect the DISubprogram. 
auto *SP = F.getSubprogram(); - DIPreservationAfter[NameOfWrappedPass].DIFunctions.insert( - {F.getName(), SP}); + DebugInfoAfterPass.DIFunctions.insert({&F, SP}); if (SP) { LLVM_DEBUG(dbgs() << " Collecting subprogram: " << *SP << '\n'); for (const DINode *DN : SP->getRetainedNodes()) { if (const auto *DV = dyn_cast(DN)) { - DIPreservationAfter[NameOfWrappedPass].DIVariables[DV] = 0; + DebugInfoAfterPass.DIVariables[DV] = 0; } } } @@ -556,19 +569,21 @@ bool llvm::checkDebugInfoMetadata(Module &M, continue; // Collect dbg.values and dbg.declares. - if (auto *DVI = dyn_cast(&I)) { - if (!SP) - continue; - // Skip inlined variables. - if (I.getDebugLoc().getInlinedAt()) - continue; - // Skip undef values. - if (DVI->isUndef()) + if (DebugifyLevel > Level::Locations) { + if (auto *DVI = dyn_cast(&I)) { + if (!SP) + continue; + // Skip inlined variables. + if (I.getDebugLoc().getInlinedAt()) + continue; + // Skip undef values. + if (DVI->isUndef()) + continue; + + auto *Var = DVI->getVariable(); + DebugInfoAfterPass.DIVariables[Var]++; continue; - - auto *Var = DVI->getVariable(); - DIPreservationAfter[NameOfWrappedPass].DIVariables[Var]++; - continue; + } } // Skip debug instructions other than dbg.value and dbg.declare. @@ -580,7 +595,7 @@ bool llvm::checkDebugInfoMetadata(Module &M, const DILocation *Loc = I.getDebugLoc().get(); bool HasLoc = Loc != nullptr; - DIPreservationAfter[NameOfWrappedPass].DILocations.insert({&I, HasLoc}); + DebugInfoAfterPass.DILocations.insert({&I, HasLoc}); } } } @@ -590,16 +605,16 @@ bool llvm::checkDebugInfoMetadata(Module &M, (cast(M.getNamedMetadata("llvm.dbg.cu")->getOperand(0))) ->getFilename(); - auto DIFunctionsBefore = DIPreservationMap[NameOfWrappedPass].DIFunctions; - auto DIFunctionsAfter = DIPreservationAfter[NameOfWrappedPass].DIFunctions; + auto DIFunctionsBefore = DebugInfoBeforePass.DIFunctions; + auto DIFunctionsAfter = DebugInfoAfterPass.DIFunctions; - auto DILocsBefore = DIPreservationMap[NameOfWrappedPass].DILocations; - auto DILocsAfter = DIPreservationAfter[NameOfWrappedPass].DILocations; + auto DILocsBefore = DebugInfoBeforePass.DILocations; + auto DILocsAfter = DebugInfoAfterPass.DILocations; - auto InstToDelete = DIPreservationMap[NameOfWrappedPass].InstToDelete; + auto InstToDelete = DebugInfoBeforePass.InstToDelete; - auto DIVarsBefore = DIPreservationMap[NameOfWrappedPass].DIVariables; - auto DIVarsAfter = DIPreservationAfter[NameOfWrappedPass].DIVariables; + auto DIVarsBefore = DebugInfoBeforePass.DIVariables; + auto DIVarsAfter = DebugInfoAfterPass.DIVariables; bool ShouldWriteIntoJSON = !OrigDIVerifyBugsReportFilePath.empty(); llvm::json::Array Bugs; @@ -626,6 +641,11 @@ bool llvm::checkDebugInfoMetadata(Module &M, else dbg() << ResultBanner << ": FAIL\n"; + // In the case of the `debugify-each`, no need to go over all the instructions + // again in the collectDebugInfoMetadata(), since as an input we can use + // the debugging information from the previous pass. + DebugInfoBeforePass = DebugInfoAfterPass; + LLVM_DEBUG(dbgs() << "\n\n"); return Result; } @@ -770,14 +790,14 @@ bool checkDebugifyMetadata(Module &M, /// legacy module pass manager. 
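[The map keyed by pass name is gone; a single DebugInfoPerPass snapshot travels from collection to check, and, per the hand-off above, the post-pass state seeds the next collection when -debugify-each chains passes. A wiring sketch for the original-debuginfo mode, with an assumed pass name and no bug-report file:]

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/Debugify.h"
    using namespace llvm;

    // Snapshot debug info, run a pass (elided), then verify preservation.
    static bool verifyDIPreservation(Module &M) {
      DebugInfoPerPass DIBeforePass; // one snapshot, reused across passes
      collectDebugInfoMetadata(M, M.functions(), DIBeforePass,
                               "ModuleDebugify (original debuginfo)",
                               "SomePass");
      // ... run the wrapped pass over M here ...
      return checkDebugInfoMetadata(
          M, M.functions(), DIBeforePass,
          "CheckModuleDebugify (original debuginfo)", "SomePass",
          /*OrigDIVerifyBugsReportFilePath=*/"");
    }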
struct DebugifyModulePass : public ModulePass { bool runOnModule(Module &M) override { - return applyDebugify(M, Mode, DIPreservationMap, NameOfWrappedPass); + return applyDebugify(M, Mode, DebugInfoBeforePass, NameOfWrappedPass); } DebugifyModulePass(enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, StringRef NameOfWrappedPass = "", - DebugInfoPerPassMap *DIPreservationMap = nullptr) + DebugInfoPerPass *DebugInfoBeforePass = nullptr) : ModulePass(ID), NameOfWrappedPass(NameOfWrappedPass), - DIPreservationMap(DIPreservationMap), Mode(Mode) {} + DebugInfoBeforePass(DebugInfoBeforePass), Mode(Mode) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -787,7 +807,7 @@ struct DebugifyModulePass : public ModulePass { private: StringRef NameOfWrappedPass; - DebugInfoPerPassMap *DIPreservationMap; + DebugInfoPerPass *DebugInfoBeforePass; enum DebugifyMode Mode; }; @@ -795,15 +815,15 @@ private: /// single function, used with the legacy module pass manager. struct DebugifyFunctionPass : public FunctionPass { bool runOnFunction(Function &F) override { - return applyDebugify(F, Mode, DIPreservationMap, NameOfWrappedPass); + return applyDebugify(F, Mode, DebugInfoBeforePass, NameOfWrappedPass); } DebugifyFunctionPass( enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, StringRef NameOfWrappedPass = "", - DebugInfoPerPassMap *DIPreservationMap = nullptr) + DebugInfoPerPass *DebugInfoBeforePass = nullptr) : FunctionPass(ID), NameOfWrappedPass(NameOfWrappedPass), - DIPreservationMap(DIPreservationMap), Mode(Mode) {} + DebugInfoBeforePass(DebugInfoBeforePass), Mode(Mode) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -813,7 +833,7 @@ struct DebugifyFunctionPass : public FunctionPass { private: StringRef NameOfWrappedPass; - DebugInfoPerPassMap *DIPreservationMap; + DebugInfoPerPass *DebugInfoBeforePass; enum DebugifyMode Mode; }; @@ -825,7 +845,7 @@ struct CheckDebugifyModulePass : public ModulePass { return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, "CheckModuleDebugify", Strip, StatsMap); return checkDebugInfoMetadata( - M, M.functions(), *DIPreservationMap, + M, M.functions(), *DebugInfoBeforePass, "CheckModuleDebugify (original debuginfo)", NameOfWrappedPass, OrigDIVerifyBugsReportFilePath); } @@ -834,11 +854,11 @@ struct CheckDebugifyModulePass : public ModulePass { bool Strip = false, StringRef NameOfWrappedPass = "", DebugifyStatsMap *StatsMap = nullptr, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, StringRef OrigDIVerifyBugsReportFilePath = "") : ModulePass(ID), NameOfWrappedPass(NameOfWrappedPass), OrigDIVerifyBugsReportFilePath(OrigDIVerifyBugsReportFilePath), - StatsMap(StatsMap), DIPreservationMap(DIPreservationMap), Mode(Mode), + StatsMap(StatsMap), DebugInfoBeforePass(DebugInfoBeforePass), Mode(Mode), Strip(Strip) {} void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -851,7 +871,7 @@ private: StringRef NameOfWrappedPass; StringRef OrigDIVerifyBugsReportFilePath; DebugifyStatsMap *StatsMap; - DebugInfoPerPassMap *DIPreservationMap; + DebugInfoPerPass *DebugInfoBeforePass; enum DebugifyMode Mode; bool Strip; }; @@ -867,7 +887,7 @@ struct CheckDebugifyFunctionPass : public FunctionPass { NameOfWrappedPass, "CheckFunctionDebugify", Strip, StatsMap); return checkDebugInfoMetadata( - M, make_range(FuncIt, std::next(FuncIt)), *DIPreservationMap, + M, make_range(FuncIt, 
std::next(FuncIt)), *DebugInfoBeforePass, "CheckFunctionDebugify (original debuginfo)", NameOfWrappedPass, OrigDIVerifyBugsReportFilePath); } @@ -876,11 +896,11 @@ struct CheckDebugifyFunctionPass : public FunctionPass { bool Strip = false, StringRef NameOfWrappedPass = "", DebugifyStatsMap *StatsMap = nullptr, enum DebugifyMode Mode = DebugifyMode::SyntheticDebugInfo, - DebugInfoPerPassMap *DIPreservationMap = nullptr, + DebugInfoPerPass *DebugInfoBeforePass = nullptr, StringRef OrigDIVerifyBugsReportFilePath = "") : FunctionPass(ID), NameOfWrappedPass(NameOfWrappedPass), OrigDIVerifyBugsReportFilePath(OrigDIVerifyBugsReportFilePath), - StatsMap(StatsMap), DIPreservationMap(DIPreservationMap), Mode(Mode), + StatsMap(StatsMap), DebugInfoBeforePass(DebugInfoBeforePass), Mode(Mode), Strip(Strip) {} void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -893,7 +913,7 @@ private: StringRef NameOfWrappedPass; StringRef OrigDIVerifyBugsReportFilePath; DebugifyStatsMap *StatsMap; - DebugInfoPerPassMap *DIPreservationMap; + DebugInfoPerPass *DebugInfoBeforePass; enum DebugifyMode Mode; bool Strip; }; @@ -923,21 +943,21 @@ void llvm::exportDebugifyStats(StringRef Path, const DebugifyStatsMap &Map) { ModulePass *createDebugifyModulePass(enum DebugifyMode Mode, llvm::StringRef NameOfWrappedPass, - DebugInfoPerPassMap *DIPreservationMap) { + DebugInfoPerPass *DebugInfoBeforePass) { if (Mode == DebugifyMode::SyntheticDebugInfo) return new DebugifyModulePass(); assert(Mode == DebugifyMode::OriginalDebugInfo && "Must be original mode"); - return new DebugifyModulePass(Mode, NameOfWrappedPass, DIPreservationMap); + return new DebugifyModulePass(Mode, NameOfWrappedPass, DebugInfoBeforePass); } FunctionPass * createDebugifyFunctionPass(enum DebugifyMode Mode, llvm::StringRef NameOfWrappedPass, - DebugInfoPerPassMap *DIPreservationMap) { + DebugInfoPerPass *DebugInfoBeforePass) { if (Mode == DebugifyMode::SyntheticDebugInfo) return new DebugifyFunctionPass(); assert(Mode == DebugifyMode::OriginalDebugInfo && "Must be original mode"); - return new DebugifyFunctionPass(Mode, NameOfWrappedPass, DIPreservationMap); + return new DebugifyFunctionPass(Mode, NameOfWrappedPass, DebugInfoBeforePass); } PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { @@ -948,25 +968,25 @@ PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { ModulePass *createCheckDebugifyModulePass( bool Strip, StringRef NameOfWrappedPass, DebugifyStatsMap *StatsMap, - enum DebugifyMode Mode, DebugInfoPerPassMap *DIPreservationMap, + enum DebugifyMode Mode, DebugInfoPerPass *DebugInfoBeforePass, StringRef OrigDIVerifyBugsReportFilePath) { if (Mode == DebugifyMode::SyntheticDebugInfo) return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap); assert(Mode == DebugifyMode::OriginalDebugInfo && "Must be original mode"); return new CheckDebugifyModulePass(false, NameOfWrappedPass, nullptr, Mode, - DIPreservationMap, + DebugInfoBeforePass, OrigDIVerifyBugsReportFilePath); } FunctionPass *createCheckDebugifyFunctionPass( bool Strip, StringRef NameOfWrappedPass, DebugifyStatsMap *StatsMap, - enum DebugifyMode Mode, DebugInfoPerPassMap *DIPreservationMap, + enum DebugifyMode Mode, DebugInfoPerPass *DebugInfoBeforePass, StringRef OrigDIVerifyBugsReportFilePath) { if (Mode == DebugifyMode::SyntheticDebugInfo) return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap); assert(Mode == DebugifyMode::OriginalDebugInfo && "Must be original mode"); return new 
CheckDebugifyFunctionPass(false, NameOfWrappedPass, nullptr, Mode, - DIPreservationMap, + DebugInfoBeforePass, OrigDIVerifyBugsReportFilePath); } diff --git a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 5f53d794fe8a..f6f80540ad95 100644 --- a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -8,11 +8,10 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; /// DemoteRegToStack - This function takes a virtual register computed by an diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index e73287c060ae..7b8d8553bac2 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -29,7 +29,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -37,7 +36,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #define DEBUG_TYPE "evaluator" @@ -219,10 +217,13 @@ Constant *Evaluator::ComputeLoadResult(Constant *P, Type *Ty) { P = cast(P->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true)); Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(P->getType())); - auto *GV = dyn_cast(P); - if (!GV) - return nullptr; + if (auto *GV = dyn_cast(P)) + return ComputeLoadResult(GV, Ty, Offset); + return nullptr; +} +Constant *Evaluator::ComputeLoadResult(GlobalVariable *GV, Type *Ty, + const APInt &Offset) { auto It = MutatedMemory.find(GV); if (It != MutatedMemory.end()) return It->second.read(Ty, Offset, DL); @@ -335,50 +336,6 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, auto Res = MutatedMemory.try_emplace(GV, GV->getInitializer()); if (!Res.first->second.write(Val, Offset, DL)) return false; - } else if (BinaryOperator *BO = dyn_cast(CurInst)) { - InstResult = ConstantExpr::get(BO->getOpcode(), - getVal(BO->getOperand(0)), - getVal(BO->getOperand(1))); - LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " - << *InstResult << "\n"); - } else if (CmpInst *CI = dyn_cast(CurInst)) { - InstResult = ConstantExpr::getCompare(CI->getPredicate(), - getVal(CI->getOperand(0)), - getVal(CI->getOperand(1))); - LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult - << "\n"); - } else if (CastInst *CI = dyn_cast(CurInst)) { - InstResult = ConstantExpr::getCast(CI->getOpcode(), - getVal(CI->getOperand(0)), - CI->getType()); - LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult - << "\n"); - } else if (SelectInst *SI = dyn_cast(CurInst)) { - InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)), - getVal(SI->getOperand(1)), - getVal(SI->getOperand(2))); - LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult - << "\n"); - } else if (auto *EVI = dyn_cast(CurInst)) { - InstResult = ConstantExpr::getExtractValue( - getVal(EVI->getAggregateOperand()), EVI->getIndices()); - LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! 
Simplifying: " - << *InstResult << "\n"); - } else if (auto *IVI = dyn_cast(CurInst)) { - InstResult = ConstantExpr::getInsertValue( - getVal(IVI->getAggregateOperand()), - getVal(IVI->getInsertedValueOperand()), IVI->getIndices()); - LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " - << *InstResult << "\n"); - } else if (GetElementPtrInst *GEP = dyn_cast(CurInst)) { - Constant *P = getVal(GEP->getOperand(0)); - SmallVector GEPOps; - for (Use &Op : llvm::drop_begin(GEP->operands())) - GEPOps.push_back(getVal(Op)); - InstResult = - ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps, - cast(GEP)->isInBounds()); - LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n"); } else if (LoadInst *LI = dyn_cast(CurInst)) { if (!LI->isSimple()) { LLVM_DEBUG( @@ -438,16 +395,39 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, << "intrinsic.\n"); return false; } + + auto *LenC = dyn_cast(getVal(MSI->getLength())); + if (!LenC) { + LLVM_DEBUG(dbgs() << "Memset with unknown length.\n"); + return false; + } + Constant *Ptr = getVal(MSI->getDest()); + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = cast(Ptr->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true)); + auto *GV = dyn_cast(Ptr); + if (!GV) { + LLVM_DEBUG(dbgs() << "Memset with unknown base.\n"); + return false; + } + Constant *Val = getVal(MSI->getValue()); - Constant *DestVal = - ComputeLoadResult(getVal(Ptr), MSI->getValue()->getType()); - if (Val->isNullValue() && DestVal && DestVal->isNullValue()) { - // This memset is a no-op. - LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n"); - ++CurInst; - continue; + APInt Len = LenC->getValue(); + while (Len != 0) { + Constant *DestVal = ComputeLoadResult(GV, Val->getType(), Offset); + if (DestVal != Val) { + LLVM_DEBUG(dbgs() << "Memset is not a no-op at offset " + << Offset << " of " << *GV << ".\n"); + return false; + } + ++Offset; + --Len; } + + LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n"); + ++CurInst; + continue; } if (II->isLifetimeStartOrEnd()) { @@ -602,11 +582,16 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n"); return true; } else { - // Did not know how to evaluate this! - LLVM_DEBUG( - dbgs() << "Failed to evaluate block due to unhandled instruction." - "\n"); - return false; + SmallVector Ops; + for (Value *Op : CurInst->operands()) + Ops.push_back(getVal(Op)); + InstResult = ConstantFoldInstOperands(&*CurInst, Ops, DL, TLI); + if (!InstResult) { + LLVM_DEBUG(dbgs() << "Cannot fold instruction: " << *CurInst << "\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Folded instruction " << *CurInst << " to " + << *InstResult << "\n"); } if (!CurInst->use_empty()) { @@ -631,6 +616,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, /// function. bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl &ActualArgs) { + assert(ActualArgs.size() == F->arg_size() && "wrong number of arguments"); + // Check to see if this function is already executing (recursion). If so, // bail out. TODO: we might want to accept limited recursion. 
if (is_contained(CallStack, F)) diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 8de3ce876bab..24539bd231c6 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -68,6 +68,7 @@ #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -137,10 +138,18 @@ static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop, // not be necessary if we can retain such backedges. if (Headers.count(Child->getHeader())) { for (auto BB : Child->blocks()) { + if (LI.getLoopFor(BB) != Child) + continue; LI.changeLoopFor(BB, NewLoop); LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName() << "\n"); } + std::vector GrandChildLoops; + std::swap(GrandChildLoops, Child->getSubLoopsVector()); + for (auto GrandChildLoop : GrandChildLoops) { + GrandChildLoop->setParentLoop(nullptr); + NewLoop->addChildLoop(GrandChildLoop); + } LI.destroy(Child); LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n"); continue; diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 2946c0018c31..193806d9cc87 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -12,8 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FunctionImportUtils.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/InstIterator.h" using namespace llvm; /// Checks if we should import SGV as a definition, otherwise import as a diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index c1c5f5cc879f..c5aded3c45f4 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -38,22 +38,26 @@ static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) { } /// It is safe to destroy a constant iff it is only used by constants itself. -/// Note that constants cannot be cyclic, so this test is pretty easy to -/// implement recursively. -/// +/// Note that while constants cannot be cyclic, they can be tree-like, so we +/// should keep a visited set to avoid exponential runtime. 
bool llvm::isSafeToDestroyConstant(const Constant *C) { - if (isa(C)) - return false; - - if (isa(C)) - return false; + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(C); + while (!Worklist.empty()) { + const Constant *C = Worklist.pop_back_val(); + if (!Visited.insert(C).second) + continue; + if (isa(C) || isa(C)) + return false; - for (const User *U : C->users()) - if (const Constant *CU = dyn_cast(U)) { - if (!isSafeToDestroyConstant(CU)) + for (const User *U : C->users()) { + if (const Constant *CU = dyn_cast(U)) + Worklist.push_back(CU); + else return false; - } else - return false; + } + } return true; } @@ -100,6 +104,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, if (SI->isVolatile()) return true; + ++GS.NumStores; + GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering()); // If this is a direct store to the global (i.e., the global is a scalar diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp index 047bf5569ded..55bcb6f3b121 100644 --- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp +++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 923bcc781e47..2fb00f95b749 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -37,7 +37,6 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -85,7 +84,7 @@ EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true), static cl::opt UseNoAliasIntrinsic("use-noalias-intrinsic-during-inlining", cl::Hidden, - cl::ZeroOrMore, cl::init(true), + cl::init(true), cl::desc("Use the llvm.experimental.noalias.scope.decl " "intrinsic during inlining.")); @@ -1044,12 +1043,10 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, } for (Value *Arg : Call->args()) { - // We need to check the underlying objects of all arguments, not just - // the pointer arguments, because we might be passing pointers as - // integers, etc. - // However, if we know that the call only accesses pointer arguments, - // then we only need to check the pointer arguments. - if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy()) + // Only care about pointer arguments. If a noalias argument is + // accessed through a non-pointer argument, it must be captured + // first (e.g. via ptrtoint), and we protect against captures below. + if (!Arg->getType()->isPointerTy()) continue; PtrArgs.push_back(Arg); @@ -1080,7 +1077,8 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // Figure out if we're derived from anything that is not a noalias // argument. 
- bool CanDeriveViaCapture = false, UsesAliasingPtr = false; + bool RequiresNoCaptureBefore = false, UsesAliasingPtr = false, + UsesUnknownObject = false; for (const Value *V : ObjSet) { // Is this value a constant that cannot be derived from any pointer // value (we need to exclude constant expressions, for example, that @@ -1101,19 +1099,28 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, UsesAliasingPtr = true; } - // If this is not some identified function-local object (which cannot - // directly alias a noalias argument), or some other argument (which, - // by definition, also cannot alias a noalias argument), then we could - // alias a noalias argument that has been captured). - if (!isa(V) && - !isIdentifiedFunctionLocal(const_cast(V))) - CanDeriveViaCapture = true; + if (isEscapeSource(V)) { + // An escape source can only alias with a noalias argument if it has + // been captured beforehand. + RequiresNoCaptureBefore = true; + } else if (!isa(V) && !isIdentifiedObject(V)) { + // If this is neither an escape source, nor some identified object + // (which cannot directly alias a noalias argument), nor some other + // argument (which, by definition, also cannot alias a noalias + // argument), conservatively do not make any assumptions. + UsesUnknownObject = true; + } } + // Nothing we can do if the used underlying object cannot be reliably + // determined. + if (UsesUnknownObject) + continue; + // A function call can always get captured noalias pointers (via other // parameters, globals, etc.). if (IsFuncCall && !IsArgMemOnlyCall) - CanDeriveViaCapture = true; + RequiresNoCaptureBefore = true; // First, we want to figure out all of the sets with which we definitely // don't alias. Iterate over all noalias set, and add those for which: @@ -1124,16 +1131,16 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // noalias arguments via other noalias arguments or globals, and so we // must always check for prior capture. for (const Argument *A : NoAliasArgs) { - if (!ObjSet.count(A) && (!CanDeriveViaCapture || - // It might be tempting to skip the - // PointerMayBeCapturedBefore check if - // A->hasNoCaptureAttr() is true, but this is - // incorrect because nocapture only guarantees - // that no copies outlive the function, not - // that the value cannot be locally captured. - !PointerMayBeCapturedBefore(A, - /* ReturnCaptures */ false, - /* StoreCaptures */ false, I, &DT))) + if (ObjSet.contains(A)) + continue; // May be based on a noalias argument. + + // It might be tempting to skip the PointerMayBeCapturedBefore check if + // A->hasNoCaptureAttr() is true, but this is incorrect because + // nocapture only guarantees that no copies outlive the function, not + // that the value cannot be locally captured. + if (!RequiresNoCaptureBefore || + !PointerMayBeCapturedBefore(A, /* ReturnCaptures */ false, + /* StoreCaptures */ false, I, &DT)) NoAliases.push_back(NewScopes[A]); } @@ -1422,7 +1429,8 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg, // If the byval had an alignment specified, we *must* use at least that // alignment, as it is required by the byval argument (and uses of the // pointer inside the callee). 
- Alignment = max(Alignment, MaybeAlign(ByValAlignment)); + if (ByValAlignment > 0) + Alignment = std::max(Alignment, Align(ByValAlignment)); Value *NewAlloca = new AllocaInst(ByValType, DL.getAllocaAddrSpace(), nullptr, Alignment, @@ -1601,7 +1609,7 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, return; auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None; int64_t CallCount = - std::min(CallSiteCount.getValueOr(0), CalleeEntryCount.getCount()); + std::min(CallSiteCount.value_or(0), CalleeEntryCount.getCount()); updateProfileCallee(Callee, -CallCount, &VMap); } @@ -1609,7 +1617,7 @@ void llvm::updateProfileCallee( Function *Callee, int64_t EntryDelta, const ValueMap *VMap) { auto CalleeCount = Callee->getEntryCount(); - if (!CalleeCount.hasValue()) + if (!CalleeCount) return; const uint64_t PriorEntryCount = CalleeCount->getCount(); @@ -1789,6 +1797,13 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, BasicBlock *OrigBB = CB.getParent(); Function *Caller = OrigBB->getParent(); + // Do not inline strictfp function into non-strictfp one. It would require + // conversion of all FP operations in host function to constrained intrinsics. + if (CalledFunc->getAttributes().hasFnAttr(Attribute::StrictFP) && + !Caller->getAttributes().hasFnAttr(Attribute::StrictFP)) { + return InlineResult::failure("incompatible strictfp attributes"); + } + // GC poses two hazards to inlining, which only occur when the callee has GC: // 1. If the caller has no GC, then the callee's GC must be propagated to the // caller. @@ -2644,7 +2659,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AssumptionCache *AC = IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr; auto &DL = Caller->getParent()->getDataLayout(); - if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) { + if (Value *V = simplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 9082049c82da..47ab30f03d14 100644 --- a/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -18,7 +18,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index 72b864dc3e48..84d377d835f3 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -33,14 +33,13 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 6958a89f5be6..6e87da9fb168 100644 --- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -30,14 +30,12 @@ #include 
"llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 9a10535c9310..b203259db1c6 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -29,7 +29,6 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -63,9 +62,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -80,7 +77,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include #include #include @@ -489,7 +485,7 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, if (auto *FPI = dyn_cast(I)) { Optional ExBehavior = FPI->getExceptionBehavior(); - return ExBehavior.getValue() != fp::ebStrict; + return *ExBehavior != fp::ebStrict; } } @@ -504,15 +500,12 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, if (isMathLibCallNoop(Call, TLI)) return true; - // To express possible interaction with floating point environment constrained - // intrinsics are described as if they access memory. So they look like having - // side effect but actually do not have it unless they raise floating point - // exception. If FP exceptions are ignored, the intrinsic may be deleted. - if (auto *CI = dyn_cast(I)) { - Optional EB = CI->getExceptionBehavior(); - if (!EB || *EB == fp::ExceptionBehavior::ebIgnore) - return true; - } + // Non-volatile atomic loads from constants can be removed. + if (auto *LI = dyn_cast(I)) + if (auto *GV = dyn_cast( + LI->getPointerOperand()->stripPointerCasts())) + if (!LI->isVolatile() && GV->isConstant()) + return true; return false; } @@ -682,7 +675,7 @@ simplifyAndDCEInstruction(Instruction *I, return true; } - if (Value *SimpleV = SimplifyInstruction(I, DL)) { + if (Value *SimpleV = simplifyInstruction(I, DL)) { // Add the users to the worklist. CAREFUL: an instruction can use itself, // in the case of a phi node. for (User *U : I->users()) { @@ -1133,7 +1126,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, // If there is more than one pred of succ, and there are PHI nodes in // the successor, then we need to add incoming edges for the PHI nodes // - const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB)); + const PredBlockVector BBPreds(predecessors(BB)); // Loop over all of the PHI nodes in the successor of BB. 
for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { @@ -1393,7 +1386,7 @@ Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, static bool PhiHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, PHINode *APN) { - // Since we can't guarantee that the original dbg.declare instrinsic + // Since we can't guarantee that the original dbg.declare intrinsic // is removed by LowerDbgDeclare(), we need to make sure that we are // not inserting the same dbg.value intrinsic over and over. SmallVector DbgValues; @@ -1472,7 +1465,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DII << '\n'); // For now, when there is a store to parts of the variable (but we do not - // know which part) we insert an dbg.value instrinsic to indicate that we + // know which part) we insert an dbg.value intrinsic to indicate that we // know nothing about the variable's content. DV = UndefValue::get(DV->getType()); Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); @@ -2240,6 +2233,7 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); + II->setMetadata(LLVMContext::MD_prof, CI->getMetadata(LLVMContext::MD_prof)); if (DTU) DTU->applyUpdates({{DominatorTree::Insert, BB, UnwindEdge}}); @@ -2349,19 +2343,42 @@ static bool markAliveBlocks(Function &F, isa(Callee)) { changeToUnreachable(II, false, DTU); Changed = true; - } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) { - if (II->use_empty() && II->onlyReadsMemory()) { - // jump to the normal destination branch. - BasicBlock *NormalDestBB = II->getNormalDest(); - BasicBlock *UnwindDestBB = II->getUnwindDest(); - BranchInst::Create(NormalDestBB, II); - UnwindDestBB->removePredecessor(II->getParent()); - II->eraseFromParent(); + } else { + if (II->doesNotReturn() && + !isa(II->getNormalDest()->front())) { + // If we found an invoke of a no-return function, + // create a new empty basic block with an `unreachable` terminator, + // and set it as the normal destination for the invoke, + // unless that is already the case. + // Note that the original normal destination could have other uses. + BasicBlock *OrigNormalDest = II->getNormalDest(); + OrigNormalDest->removePredecessor(II->getParent()); + LLVMContext &Ctx = II->getContext(); + BasicBlock *UnreachableNormalDest = BasicBlock::Create( + Ctx, OrigNormalDest->getName() + ".unreachable", + II->getFunction(), OrigNormalDest); + new UnreachableInst(Ctx, UnreachableNormalDest); + II->setNormalDest(UnreachableNormalDest); if (DTU) - DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); - } else - changeToCall(II, DTU); - Changed = true; + DTU->applyUpdates( + {{DominatorTree::Delete, BB, OrigNormalDest}, + {DominatorTree::Insert, BB, UnreachableNormalDest}}); + Changed = true; + } + if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) { + if (II->use_empty() && !II->mayHaveSideEffects()) { + // jump to the normal destination branch. 
+ BasicBlock *NormalDestBB = II->getNormalDest(); + BasicBlock *UnwindDestBB = II->getUnwindDest(); + BranchInst::Create(NormalDestBB, II); + UnwindDestBB->removePredecessor(II->getParent()); + II->eraseFromParent(); + if (DTU) + DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); + } else + changeToCall(II, DTU); + Changed = true; + } } } else if (auto *CatchSwitch = dyn_cast(Terminator)) { // Remove catchpads which cannot be reached. diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 5b66da1e7082..f093fea19c4d 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -38,12 +37,10 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include #include -#include using namespace llvm; using namespace llvm::PatternMatch; @@ -389,6 +386,10 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (!PP.AllowPeeling) return; + // Check that we can peel at least one iteration. + if (2 * LoopSize > Threshold) + return; + unsigned AlreadyPeeled = 0; if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) AlreadyPeeled = *Peeled; @@ -401,47 +402,45 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // which every Phi is guaranteed to become an invariant, and try to peel the // maximum number of iterations among these values, thus turning all those // Phis into invariants. - // First, check that we can peel at least one iteration. - if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) { - // Store the pre-calculated values here. - SmallDenseMap > IterationsToInvariance; - // Now go through all Phis to calculate their the number of iterations they - // need to become invariants. - // Start the max computation with the PP.PeelCount value set by the target - // in TTI.getPeelingPreferences or by the flag -unroll-peel-count. - unsigned DesiredPeelCount = TargetPeelCount; - BasicBlock *BackEdge = L->getLoopLatch(); - assert(BackEdge && "Loop is not in simplified form?"); - for (auto BI = L->getHeader()->begin(); isa(&*BI); ++BI) { - PHINode *Phi = cast(&*BI); - auto ToInvariance = calculateIterationsToInvariance( - Phi, L, BackEdge, IterationsToInvariance); - if (ToInvariance) - DesiredPeelCount = std::max(DesiredPeelCount, *ToInvariance); - } - // Pay respect to limitations implied by loop size and the max peel count. - unsigned MaxPeelCount = UnrollPeelMaxCount; - MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1); - - DesiredPeelCount = std::max(DesiredPeelCount, - countToEliminateCompares(*L, MaxPeelCount, SE)); - - if (DesiredPeelCount == 0) - DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT); - - if (DesiredPeelCount > 0) { - DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount); - // Consider max peel count limitation. 
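// Aside: the peeling budget used in computePeelCount, in concrete numbers —
// a hypothetical worked example, not taken from the patch. With
// Threshold = 400 and LoopSize = 50, peeling is considered at all
// (2 * 50 <= 400), and the size-derived cap is Threshold / LoopSize - 1 = 7
// iterations, further clamped by UnrollPeelMaxCount.
#include <algorithm>

static unsigned peelBudgetSketch(unsigned LoopSize, unsigned Threshold,
                                 unsigned UnrollPeelMaxCount) {
  if (2 * LoopSize > Threshold)
    return 0; // cannot afford to peel even one iteration
  return std::min(UnrollPeelMaxCount, Threshold / LoopSize - 1);
}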
- assert(DesiredPeelCount > 0 && "Wrong loop size estimation?"); - if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) { - LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount - << " iteration(s) to turn" - << " some Phis into invariants.\n"); - PP.PeelCount = DesiredPeelCount; - PP.PeelProfiledIterations = false; - return; - } + // Store the pre-calculated values here. + SmallDenseMap> IterationsToInvariance; + // Now go through all Phis to calculate their the number of iterations they + // need to become invariants. + // Start the max computation with the PP.PeelCount value set by the target + // in TTI.getPeelingPreferences or by the flag -unroll-peel-count. + unsigned DesiredPeelCount = TargetPeelCount; + BasicBlock *BackEdge = L->getLoopLatch(); + assert(BackEdge && "Loop is not in simplified form?"); + for (auto BI = L->getHeader()->begin(); isa(&*BI); ++BI) { + PHINode *Phi = cast(&*BI); + auto ToInvariance = calculateIterationsToInvariance(Phi, L, BackEdge, + IterationsToInvariance); + if (ToInvariance) + DesiredPeelCount = std::max(DesiredPeelCount, *ToInvariance); + } + + // Pay respect to limitations implied by loop size and the max peel count. + unsigned MaxPeelCount = UnrollPeelMaxCount; + MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1); + + DesiredPeelCount = std::max(DesiredPeelCount, + countToEliminateCompares(*L, MaxPeelCount, SE)); + + if (DesiredPeelCount == 0) + DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT); + + if (DesiredPeelCount > 0) { + DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount); + // Consider max peel count limitation. + assert(DesiredPeelCount > 0 && "Wrong loop size estimation?"); + if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) { + LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount + << " iteration(s) to turn" + << " some Phis into invariants.\n"); + PP.PeelCount = DesiredPeelCount; + PP.PeelProfiledIterations = false; + return; } } @@ -461,27 +460,26 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (L->getHeader()->getParent()->hasProfileData()) { if (violatesLegacyMultiExitLoopCheck(L)) return; - Optional PeelCount = getLoopEstimatedTripCount(L); - if (!PeelCount) + Optional EstimatedTripCount = getLoopEstimatedTripCount(L); + if (!EstimatedTripCount) return; - LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount - << "\n"); + LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " + << *EstimatedTripCount << "\n"); - if (*PeelCount) { - if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) && - (LoopSize * (*PeelCount + 1) <= Threshold)) { - LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount - << " iterations.\n"); - PP.PeelCount = *PeelCount; + if (*EstimatedTripCount) { + if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) { + unsigned PeelCount = *EstimatedTripCount; + LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n"); + PP.PeelCount = PeelCount; return; } - LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n"); LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); - LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) - << "\n"); + LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n"); LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n"); + LLVM_DEBUG(dbgs() << "Max peel count by cost: " + << (Threshold / LoopSize - 1) << "\n"); } } } @@ -579,7 +577,8 @@ static void cloneLoopBlocks( 
SmallVectorImpl> &ExitEdges, SmallVectorImpl &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, - LoopInfo *LI, ArrayRef LoopLocalNoAliasDeclScopes) { + LoopInfo *LI, ArrayRef LoopLocalNoAliasDeclScopes, + ScalarEvolution &SE) { BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); BasicBlock *PreHeader = L->getLoopPreheader(); @@ -685,6 +684,7 @@ static void cloneLoopBlocks( if (LatchInst && L->contains(LatchInst)) LatchVal = VMap[LatchVal]; PHI.addIncoming(LatchVal, cast(VMap[Edge.first])); + SE.forgetValue(&PHI); } // LastValueMap is updated with the values for the current loop @@ -719,9 +719,9 @@ TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences( } // User specifed values provided by argument. - if (UserAllowPeeling.hasValue()) + if (UserAllowPeeling) PP.AllowPeeling = *UserAllowPeeling; - if (UserAllowProfileBasedPeeling.hasValue()) + if (UserAllowProfileBasedPeeling) PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; return PP; @@ -851,7 +851,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, LoopBlocks, VMap, LVMap, &DT, LI, - LoopLocalNoAliasDeclScopes); + LoopLocalNoAliasDeclScopes, *SE); // Remap to use values from the current iteration instead of the // previous one. @@ -907,8 +907,10 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, // We modified the loop, update SE. SE->forgetTopmostLoop(L); +#ifdef EXPENSIVE_CHECKS // Finally DomtTree must be correct. assert(DT.verify(DominatorTree::VerificationLevel::Fast)); +#endif // FIXME: Incrementally update loop-simplify simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA); diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index c66fd7bb0588..0f33559c7e70 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -13,31 +13,24 @@ #include "llvm/Transforms/Utils/LoopRotationUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -317,7 +310,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { L->dump()); return Rotated; } - if (Metrics.NumInsts > MaxHeaderSize) { + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: 
NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); + return Rotated; + } + if (*Metrics.NumInsts.getValue() > MaxHeaderSize) { LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " << Metrics.NumInsts << " instructions, which is more than the threshold (" @@ -446,7 +445,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - Value *V = SimplifyInstruction(C, SQ); + Value *V = simplifyInstruction(C, SQ); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 67311ab4cd02..55d5c733733b 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -40,8 +40,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopSimplify.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -59,14 +57,11 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -181,7 +176,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT, for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ) { PHINode *PN = cast(I); ++I; - if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) { + if (Value *V = simplifyInstruction(PN, {DL, nullptr, DT, AC})) { // This is a degenerate PHI already, don't modify it! 
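// Aside: the degenerate-PHI folding used here (and in several hunks below)
// in isolation — simplifyInstruction returns the value a trivial PHI
// collapses to, e.g. when all incoming values are identical. A minimal
// sketch; the wrapper name is hypothetical.
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Instructions.h"

static bool foldDegeneratePHI(llvm::PHINode *PN,
                              const llvm::SimplifyQuery &SQ) {
  if (llvm::Value *V = llvm::simplifyInstruction(PN, SQ)) {
    PN->replaceAllUsesWith(V);
    PN->eraseFromParent();
    return true;
  }
  return false;
}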
PN->replaceAllUsesWith(V); PN->eraseFromParent(); @@ -602,7 +597,7 @@ ReprocessLoop: PHINode *PN; for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast(I++)); ) - if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) { + if (Value *V = simplifyInstruction(PN, {DL, nullptr, DT, AC})) { if (SE) SE->forgetValue(PN); if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) { PN->replaceAllUsesWith(V); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 9ca1f4f44b97..1be1082002fc 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -236,7 +236,7 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, SmallVector DeadInsts; for (BasicBlock *BB : L->getBlocks()) { for (Instruction &Inst : llvm::make_early_inc_range(*BB)) { - if (Value *V = SimplifyInstruction(&Inst, {DL, nullptr, DT, AC})) + if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC})) if (LI->replacementPreservesLCSSAForm(&Inst, V)) Inst.replaceAllUsesWith(V); if (isInstructionTriviallyDead(&Inst)) @@ -513,7 +513,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (const DILocation *DIL = I.getDebugLoc()) { auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count); if (NewDIL) - I.setDebugLoc(NewDIL.getValue()); + I.setDebugLoc(*NewDIL); else LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index 6efaa012aeca..96485d15c75b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -39,7 +38,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -358,7 +356,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, if (const DILocation *DIL = I.getDebugLoc()) { auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count); if (NewDIL) - I.setDebugLoc(NewDIL.getValue()); + I.setDebugLoc(*NewDIL); else LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index bb719a499a4c..cd3b6c1a095a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -20,20 +20,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" 
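// Aside: many hunks in this import are the same mechanical llvm::Optional
// migration — hasValue() becomes a boolean test, getValue() becomes
// operator*, and getValueOr(X) becomes value_or(X). A minimal sketch:
#include "llvm/ADT/Optional.h"

static int optionalMigrationSketch(llvm::Optional<int> O) {
  int Fallback = O.value_or(0); // was: O.getValueOr(0)
  if (!O)                       // was: if (!O.hasValue())
    return Fallback;
  return *O;                    // was: O.getValue()
}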
#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -74,7 +73,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *OriginalLoopLatchExit, BasicBlock *PreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + LoopInfo *LI, bool PreserveLCSSA, + ScalarEvolution &SE) { // Loop structure should be the following: // Preheader // PrologHeader @@ -134,6 +134,7 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, PN.setIncomingValueForBlock(NewPreHeader, NewPN); else PN.addIncoming(NewPN, PrologExit); + SE.forgetValue(&PN); } } @@ -192,7 +193,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *Exit, BasicBlock *PreHeader, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + LoopInfo *LI, bool PreserveLCSSA, + ScalarEvolution &SE) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast(VMap[Latch]); @@ -233,6 +235,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // Add incoming PreHeader from branch around the Loop PN.addIncoming(UndefValue::get(PN.getType()), PreHeader); + SE.forgetValue(&PN); Value *V = PN.getIncomingValueForBlock(Latch); Instruction *I = dyn_cast(V); @@ -398,7 +401,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Optional NewLoopID = makeFollowupLoopID( LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (NewLoopID.hasValue()) { + if (NewLoopID) { NewLoop->setLoopID(NewLoopID.getValue()); // Do not setLoopAlreadyUnrolled if loop attributes have been defined @@ -739,11 +742,28 @@ bool llvm::UnrollRuntimeLoopRemainder( // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % loop unroll factor PreHeaderBR = cast(PreHeader->getTerminator()); + IRBuilder<> B(PreHeaderBR); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); - Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), - PreHeaderBR); - IRBuilder<> B(PreHeaderBR); + Value *BECount; + // If there are other exits before the latch, that may cause the latch exit + // branch to never be executed, and the latch exit count may be poison. + // In this case, freeze the TripCount and base BECount on the frozen + // TripCount. We will introduce two branches using these values, and it's + // important that they see a consistent value (which would not be guaranteed + // if were frozen independently.) + if ((!OtherExits.empty() || !SE->loopHasNoAbnormalExits(L)) && + !isGuaranteedNotToBeUndefOrPoison(TripCount, AC, PreHeaderBR, DT)) { + TripCount = B.CreateFreeze(TripCount); + BECount = + B.CreateAdd(TripCount, ConstantInt::get(TripCount->getType(), -1)); + } else { + // If we don't need to freeze, use SCEVExpander for BECount as well, to + // allow slightly better value reuse. + BECount = + Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR); + } + Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count); Value *BranchVal = @@ -884,9 +904,8 @@ bool llvm::UnrollRuntimeLoopRemainder( if (UseEpilogRemainder) { // Connect the epilog code to the original loop and update the // PHI functions. 
- ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, - EpilogPreHeader, NewPreHeader, VMap, DT, LI, - PreserveLCSSA); + ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. @@ -910,7 +929,7 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the prolog code to the original loop and update the // PHI functions. ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE); } // If this loop is nested, then the loop unroller changes the code in the any @@ -941,7 +960,7 @@ bool llvm::UnrollRuntimeLoopRemainder( SmallVector DeadInsts; for (BasicBlock *BB : RemainderBlocks) { for (Instruction &Inst : llvm::make_early_inc_range(*BB)) { - if (Value *V = SimplifyInstruction(&Inst, {DL, nullptr, DT, AC})) + if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC})) if (LI->replacementPreservesLCSSAForm(&Inst, V)) Inst.replaceAllUsesWith(V); if (isInstructionTriviallyDead(&Inst)) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 95db2fe8d310..ec898c463574 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -23,31 +23,25 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstSimplifyFolder.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" -#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -260,10 +254,10 @@ llvm::getOptionalElementCountLoopAttribute(const Loop *TheLoop) { Optional Width = getOptionalIntLoopAttribute(TheLoop, "llvm.loop.vectorize.width"); - if (Width.hasValue()) { + if (Width) { Optional IsScalable = getOptionalIntLoopAttribute( TheLoop, "llvm.loop.vectorize.scalable.enable"); - return ElementCount::get(*Width, IsScalable.getValueOr(false)); + return ElementCount::get(*Width, IsScalable.value_or(false)); } return None; @@ -364,7 +358,7 @@ TransformationMode llvm::hasUnrollTransformation(const Loop *L) { Optional Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count"); - if (Count.hasValue()) + if (Count) return Count.getValue() == 1 ? 
TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable")) @@ -385,7 +379,7 @@ TransformationMode llvm::hasUnrollAndJamTransformation(const Loop *L) { Optional Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count"); - if (Count.hasValue()) + if (Count) return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable")) @@ -497,9 +491,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, if (SE) SE->forgetLoop(L); - auto *OldBr = dyn_cast(Preheader->getTerminator()); - assert(OldBr && "Preheader must end with a branch"); - assert(OldBr->isUnconditional() && "Preheader must have a single successor"); + Instruction *OldTerm = Preheader->getTerminator(); + assert(!OldTerm->mayHaveSideEffects() && + "Preheader must end with a side-effect-free terminator"); + assert(OldTerm->getNumSuccessors() == 1 && + "Preheader must have a single successor"); // Connect the preheader to the exit block. Keep the old edge to the header // around to perform the dominator tree update in two separate steps // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge @@ -525,7 +521,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, // coming to this inner loop, this will break the outer loop structure (by // deleting the backedge of the outer loop). If the outer loop is indeed a // non-loop, it will be deleted in a future iteration of loop deletion pass. - IRBuilder<> Builder(OldBr); + IRBuilder<> Builder(OldTerm); auto *ExitBlock = L->getUniqueExitBlock(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); @@ -535,7 +531,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, Builder.CreateCondBr(Builder.getFalse(), L->getHeader(), ExitBlock); // Remove the old branch. The conditional branch becomes a new terminator. - OldBr->eraseFromParent(); + OldTerm->eraseFromParent(); // Rewrite phis in the exit block to get their inputs from the Preheader // instead of the exiting block. 
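// Aside: the relaxed precondition above, as a predicate. deleteDeadLoop no
// longer insists on an unconditional BranchInst in the preheader — any
// side-effect-free terminator with exactly one successor will do. A sketch;
// the helper name is hypothetical.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

static bool hasDeletableLoopPreheader(const llvm::BasicBlock *Preheader) {
  const llvm::Instruction *T = Preheader->getTerminator();
  return !T->mayHaveSideEffects() && T->getNumSuccessors() == 1;
}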
@@ -579,7 +575,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, assert(L->hasNoExitBlocks() && "Loop should have either zero or one exit blocks."); - Builder.SetInsertPoint(OldBr); + Builder.SetInsertPoint(OldTerm); Builder.CreateUnreachable(); Preheader->getTerminator()->eraseFromParent(); } @@ -692,18 +688,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, } } -static Loop *getOutermostLoop(Loop *L) { - while (Loop *Parent = L->getParentLoop()) - L = Parent; - return L; -} - void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI, MemorySSA *MSSA) { auto *Latch = L->getLoopLatch(); assert(Latch && "multiple latches not yet supported"); auto *Header = L->getHeader(); - Loop *OutermostLoop = getOutermostLoop(L); + Loop *OutermostLoop = L->getOutermostLoop(); SE.forgetLoop(L); @@ -1103,7 +1093,8 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B, return B.CreateFAddReduce(Start, Src); } -void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue) { +void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue, + bool IncludeWrapFlags) { auto *VecOp = dyn_cast(I); if (!VecOp) return; @@ -1112,7 +1103,7 @@ void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue) { if (!Intersection) return; const unsigned Opcode = Intersection->getOpcode(); - VecOp->copyIRFlags(Intersection); + VecOp->copyIRFlags(Intersection, IncludeWrapFlags); for (auto *V : VL) { auto *Instr = dyn_cast(V); if (!Instr) @@ -1536,6 +1527,11 @@ static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc); End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc); + if (CG->NeedsFreeze) { + IRBuilder<> Builder(Loc); + Start = Builder.CreateFreeze(Start, Start->getName() + ".fr"); + End = Builder.CreateFreeze(End, End->getName() + ".fr"); + } LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n"); return {Start, End}; } @@ -1614,6 +1610,45 @@ Value *llvm::addRuntimeChecks( return MemoryRuntimeCheck; } +Value *llvm::addDiffRuntimeChecks( + Instruction *Loc, Loop *TheLoop, ArrayRef Checks, + SCEVExpander &Expander, + function_ref GetVF, unsigned IC) { + + LLVMContext &Ctx = Loc->getContext(); + IRBuilder ChkBuilder(Ctx, + Loc->getModule()->getDataLayout()); + ChkBuilder.SetInsertPoint(Loc); + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = nullptr; + + for (auto &C : Checks) { + Type *Ty = C.SinkStart->getType(); + // Compute VF * IC * AccessSize. 
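// Aside: the scalar form of the pointer-difference check emitted below. A
// hypothetical worked example, not from the patch: with VF = 4, IC = 2 and
// AccessSize = 4 the bound is 32 bytes, and a pair conflicts iff
// (SinkStart - SrcStart) <u 32; a wrapped (huge) difference, i.e. the sink
// starting before the source, passes the check as no conflict.
#include <cstdint>

static bool diffCheckConflicts(uint64_t SinkStart, uint64_t SrcStart,
                               uint64_t VF, uint64_t IC, uint64_t AccessSize) {
  // Unsigned wrap on the subtraction is intended: a negative distance
  // becomes a huge value and the compare reports no conflict.
  return (SinkStart - SrcStart) < VF * IC * AccessSize;
}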
+ auto *VFTimesUFTimesSize = + ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()), + ConstantInt::get(Ty, IC * C.AccessSize)); + Value *Sink = Expander.expandCodeFor(C.SinkStart, Ty, Loc); + Value *Src = Expander.expandCodeFor(C.SrcStart, Ty, Loc); + if (C.NeedsFreeze) { + IRBuilder<> Builder(Loc); + Sink = Builder.CreateFreeze(Sink, Sink->getName() + ".fr"); + Src = Builder.CreateFreeze(Src, Src->getName() + ".fr"); + } + Value *Diff = ChkBuilder.CreateSub(Sink, Src); + Value *IsConflict = + ChkBuilder.CreateICmpULT(Diff, VFTimesUFTimesSize, "diff.check"); + + if (MemoryRuntimeCheck) { + IsConflict = + ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); + } + MemoryRuntimeCheck = IsConflict; + } + + return MemoryRuntimeCheck; +} + Optional llvm::hasPartialIVCondition(Loop &L, unsigned MSSAThreshold, MemorySSA &MSSA, diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index f0bf625fa18e..97f29527bb95 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -41,9 +41,8 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, ArrayRef Checks, Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE) - : VersionedLoop(L), NonVersionedLoop(nullptr), - AliasChecks(Checks.begin(), Checks.end()), - Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT), + : VersionedLoop(L), AliasChecks(Checks.begin(), Checks.end()), + Preds(LAI.getPSE().getPredicate()), LAI(LAI), LI(LI), DT(DT), SE(SE) { } @@ -277,7 +276,7 @@ bool runImpl(LoopInfo *LI, function_ref GetLAA, const LoopAccessInfo &LAI = GetLAA(*L); if (!LAI.hasConvergentOp() && (LAI.getNumRuntimePointerChecks() || - !LAI.getPSE().getUnionPredicate().isAlwaysTrue())) { + !LAI.getPSE().getPredicate().isAlwaysTrue())) { LoopVersioning LVer(LAI, LAI.getRuntimePointerChecking()->getChecks(), L, LI, DT, SE); LVer.versionLoop(); diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp new file mode 100644 index 000000000000..8641581c8039 --- /dev/null +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -0,0 +1,93 @@ +//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers atomic intrinsics to non-atomic form for use in a known +// non-preemptible environment. 
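// Aside: what "non-atomic form" means for cmpxchg concretely — the lowering
// below produces a plain load/compare/select/store sequence, which is only
// sound when nothing can preempt between the load and the store. The same
// semantics as a C++ sketch (illustrative only):
template <typename T> struct CmpXchgResultSketch {
  T Original;
  bool Success;
};

template <typename T>
static CmpXchgResultSketch<T> cmpxchgSketch(T *Ptr, T Cmp, T New) {
  T Orig = *Ptr;             // plain, non-atomic load
  bool Equal = (Orig == Cmp);
  *Ptr = Equal ? New : Orig; // plain, non-atomic store
  return {Orig, Equal};
}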
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LowerAtomic.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +using namespace llvm; + +#define DEBUG_TYPE "loweratomic" + +bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); + Res = Builder.CreateInsertValue(Res, Equal, 1); + + CXI->replaceAllUsesWith(Res); + CXI->eraseFromParent(); + return true; +} + +Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op, + IRBuilderBase &Builder, Value *Loaded, + Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::FAdd: + return Builder.CreateFAdd(Loaded, Inc, "new"); + case AtomicRMWInst::FSub: + return Builder.CreateFSub(Loaded, Inc, "new"); + default: + llvm_unreachable("Unknown atomic op"); + } +} + +bool llvm::lowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr); + Value *Res = buildAtomicRMWValue(RMWI->getOperation(), Builder, Orig, Val); + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp new file mode 100644 index 000000000000..010deb77a883 --- /dev/null +++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp @@ -0,0 +1,221 @@ +//===-- LowerGlobalDtors.cpp - Lower @llvm.global_dtors -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Lower @llvm.global_dtors. 
+/// +/// Implement @llvm.global_dtors by creating wrapper functions that are +/// registered in @llvm.global_ctors and which contain a call to +/// `__cxa_atexit` to register their destructor functions. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LowerGlobalDtors.h" + +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "lower-global-dtors" + +namespace { +class LowerGlobalDtorsLegacyPass final : public ModulePass { + StringRef getPassName() const override { + return "Lower @llvm.global_dtors via `__cxa_atexit`"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + ModulePass::getAnalysisUsage(AU); + } + + bool runOnModule(Module &M) override; + +public: + static char ID; + LowerGlobalDtorsLegacyPass() : ModulePass(ID) { + initializeLowerGlobalDtorsLegacyPassPass(*PassRegistry::getPassRegistry()); + } +}; +} // End anonymous namespace + +char LowerGlobalDtorsLegacyPass::ID = 0; +INITIALIZE_PASS(LowerGlobalDtorsLegacyPass, DEBUG_TYPE, + "Lower @llvm.global_dtors via `__cxa_atexit`", false, false) + +ModulePass *llvm::createLowerGlobalDtorsLegacyPass() { + return new LowerGlobalDtorsLegacyPass(); +} + +static bool runImpl(Module &M); +bool LowerGlobalDtorsLegacyPass::runOnModule(Module &M) { return runImpl(M); } + +PreservedAnalyses LowerGlobalDtorsPass::run(Module &M, + ModuleAnalysisManager &AM) { + bool Changed = runImpl(M); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +static bool runImpl(Module &M) { + GlobalVariable *GV = M.getGlobalVariable("llvm.global_dtors"); + if (!GV || !GV->hasInitializer()) + return false; + + const ConstantArray *InitList = dyn_cast(GV->getInitializer()); + if (!InitList) + return false; + + // Validate @llvm.global_dtor's type. + auto *ETy = dyn_cast(InitList->getType()->getElementType()); + if (!ETy || ETy->getNumElements() != 3 || + !ETy->getTypeAtIndex(0U)->isIntegerTy() || + !ETy->getTypeAtIndex(1U)->isPointerTy() || + !ETy->getTypeAtIndex(2U)->isPointerTy()) + return false; // Not (int, ptr, ptr). + + // Collect the contents of @llvm.global_dtors, ordered by priority. Within a + // priority, sequences of destructors with the same associated object are + // recorded so that we can register them as a group. + std::map< + uint16_t, + std::vector>> + > DtorFuncs; + for (Value *O : InitList->operands()) { + auto *CS = dyn_cast(O); + if (!CS) + continue; // Malformed. + + auto *Priority = dyn_cast(CS->getOperand(0)); + if (!Priority) + continue; // Malformed. + uint16_t PriorityValue = Priority->getLimitedValue(UINT16_MAX); + + Constant *DtorFunc = CS->getOperand(1); + if (DtorFunc->isNullValue()) + break; // Found a null terminator, skip the rest. 
+ + Constant *Associated = CS->getOperand(2); + Associated = cast(Associated->stripPointerCasts()); + + auto &AtThisPriority = DtorFuncs[PriorityValue]; + if (AtThisPriority.empty() || AtThisPriority.back().first != Associated) { + std::vector NewList; + NewList.push_back(DtorFunc); + AtThisPriority.push_back(std::make_pair(Associated, NewList)); + } else { + AtThisPriority.back().second.push_back(DtorFunc); + } + } + if (DtorFuncs.empty()) + return false; + + // extern "C" int __cxa_atexit(void (*f)(void *), void *p, void *d); + LLVMContext &C = M.getContext(); + PointerType *VoidStar = Type::getInt8PtrTy(C); + Type *AtExitFuncArgs[] = {VoidStar}; + FunctionType *AtExitFuncTy = + FunctionType::get(Type::getVoidTy(C), AtExitFuncArgs, + /*isVarArg=*/false); + + FunctionCallee AtExit = M.getOrInsertFunction( + "__cxa_atexit", + FunctionType::get(Type::getInt32Ty(C), + {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar}, + /*isVarArg=*/false)); + + // Declare __dso_local. + Type *DsoHandleTy = Type::getInt8Ty(C); + Constant *DsoHandle = M.getOrInsertGlobal("__dso_handle", DsoHandleTy, [&] { + auto *GV = new GlobalVariable(M, DsoHandleTy, /*isConstant=*/true, + GlobalVariable::ExternalWeakLinkage, nullptr, + "__dso_handle"); + GV->setVisibility(GlobalVariable::HiddenVisibility); + return GV; + }); + + // For each unique priority level and associated symbol, generate a function + // to call all the destructors at that level, and a function to register the + // first function with __cxa_atexit. + for (auto &PriorityAndMore : DtorFuncs) { + uint16_t Priority = PriorityAndMore.first; + uint64_t Id = 0; + auto &AtThisPriority = PriorityAndMore.second; + for (auto &AssociatedAndMore : AtThisPriority) { + Constant *Associated = AssociatedAndMore.first; + auto ThisId = Id++; + + Function *CallDtors = Function::Create( + AtExitFuncTy, Function::PrivateLinkage, + "call_dtors" + + (Priority != UINT16_MAX ? (Twine(".") + Twine(Priority)) + : Twine()) + + (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId) + : Twine()) + + (!Associated->isNullValue() ? (Twine(".") + Associated->getName()) + : Twine()), + &M); + BasicBlock *BB = BasicBlock::Create(C, "body", CallDtors); + FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C), + /*isVarArg=*/false); + + for (auto Dtor : reverse(AssociatedAndMore.second)) + CallInst::Create(VoidVoid, Dtor, "", BB); + ReturnInst::Create(C, BB); + + Function *RegisterCallDtors = Function::Create( + VoidVoid, Function::PrivateLinkage, + "register_call_dtors" + + (Priority != UINT16_MAX ? (Twine(".") + Twine(Priority)) + : Twine()) + + (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId) + : Twine()) + + (!Associated->isNullValue() ? (Twine(".") + Associated->getName()) + : Twine()), + &M); + BasicBlock *EntryBB = BasicBlock::Create(C, "entry", RegisterCallDtors); + BasicBlock *FailBB = BasicBlock::Create(C, "fail", RegisterCallDtors); + BasicBlock *RetBB = BasicBlock::Create(C, "return", RegisterCallDtors); + + Value *Null = ConstantPointerNull::get(VoidStar); + Value *Args[] = {CallDtors, Null, DsoHandle}; + Value *Res = CallInst::Create(AtExit, Args, "call", EntryBB); + Value *Cmp = new ICmpInst(*EntryBB, ICmpInst::ICMP_NE, Res, + Constant::getNullValue(Res->getType())); + BranchInst::Create(FailBB, RetBB, Cmp, EntryBB); + + // If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave. + // This should be very rare, because if the process is running out of + // memory before main has even started, something is wrong. 
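// Aside: the runtime contract targeted here, for reference. __cxa_atexit
// runs handlers in reverse registration order, which is why each group's
// destructors are emitted in reverse above. A minimal sketch of what one
// generated register_call_dtors.* function boils down to (names are
// illustrative; linking requires a CRT that provides __dso_handle):
extern "C" int __cxa_atexit(void (*Fn)(void *), void *Arg, void *DsoHandle);
extern "C" char __dso_handle;

static void callDtorsGroup(void *) { /* run this group's destructors */ }

static int registerDtorsGroup() {
  // The generated code branches to a trap on a non-zero (failed) return.
  return __cxa_atexit(&callDtorsGroup, nullptr, &__dso_handle);
}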
+ CallInst::Create(Intrinsic::getDeclaration(&M, Intrinsic::trap), "", + FailBB); + new UnreachableInst(C, FailBB); + + ReturnInst::Create(C, RetBB); + + // Now register the registration function with @llvm.global_ctors. + appendToGlobalCtors(M, RegisterCallDtors, Priority, Associated); + } + } + + // Now that we've lowered everything, remove @llvm.global_dtors. + GV->eraseFromParent(); + + return true; +} diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp index fe0ff5899d8f..59cfa41fb7fd 100644 --- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -17,8 +17,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 3d75dd57456d..b4acb1b2ae90 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -18,7 +20,9 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, - const TargetTransformInfo &TTI) { + bool CanOverlap, + const TargetTransformInfo &TTI, + Optional AtomicElementSize) { // No need to expand zero length copies. 
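// Aside: the alias-scope plumbing used by the memcpy expansion when
// CanOverlap is false, in isolation — one anonymous domain and scope; loads
// get !alias.scope and stores get !noalias with that scope, telling later
// passes that the stores of the copy loop never clobber its loads. The
// helper name is hypothetical.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

static void tagNonOverlappingCopy(llvm::LLVMContext &Ctx,
                                  llvm::LoadInst *Load,
                                  llvm::StoreInst *Store) {
  using namespace llvm;
  MDBuilder MDB(Ctx);
  MDNode *Domain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  MDNode *Scope = MDB.createAnonymousAliasScope(Domain, "MemCopyAliasScope");
  Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, Scope));
  Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, Scope));
}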
if (CopyLen->isZero()) return; @@ -28,15 +32,25 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Function *ParentFunc = PreLoopBB->getParent(); LLVMContext &Ctx = PreLoopBB->getContext(); const DataLayout &DL = ParentFunc->getParent()->getDataLayout(); + MDBuilder MDB(Ctx); + MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain"); + StringRef Name = "MemCopyAliasScope"; + MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name); unsigned SrcAS = cast(SrcAddr->getType())->getAddressSpace(); unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); Type *TypeOfCopyLen = CopyLen->getType(); Type *LoopOpType = TTI.getMemcpyLoopLoweringType( - Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(), + AtomicElementSize); + assert((!AtomicElementSize || !LoopOpType->isVectorTy()) && + "Atomic memcpy lowering is not supported for vector operand type"); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); + assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) && + "Atomic memcpy lowering is not supported for selected operand size"); + uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize; if (LoopEndCount != 0) { @@ -68,12 +82,25 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, // Loop Body Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); - Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, - PartSrcAlign, SrcIsVolatile); + LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, + PartSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); - LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); - + StoreInst *Store = LoopBuilder.CreateAlignedStore( + Load, DstGEP, PartDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. 
+ Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } Value *NewIndex = LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U)); LoopIndex->addIncoming(NewIndex, LoopBB); @@ -93,7 +120,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, SrcAS, DstAS, SrcAlign.value(), - DstAlign.value()); + DstAlign.value(), AtomicElementSize); for (auto OpTy : RemainingOps) { Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); @@ -101,6 +128,10 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, // Calaculate the new index unsigned OperandSize = DL.getTypeStoreSize(OpTy); + assert( + (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) && + "Atomic memcpy lowering is not supported for selected operand size"); + uint64_t GepIndex = BytesCopied / OperandSize; assert(GepIndex * OperandSize == BytesCopied && "Division should have no Remainder!"); @@ -111,9 +142,13 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, : RBuilder.CreateBitCast(SrcAddr, SrcPtrType); Value *SrcGEP = RBuilder.CreateInBoundsGEP( OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex)); - Value *Load = + LoadInst *Load = RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); - + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } // Cast destination to operand type and store. PointerType *DstPtrType = PointerType::get(OpTy, DstAS); Value *CastedDst = DstAddr->getType() == DstPtrType @@ -121,8 +156,16 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, : RBuilder.CreateBitCast(DstAddr, DstPtrType); Value *DstGEP = RBuilder.CreateInBoundsGEP( OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex)); - RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); - + StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, + DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. 
+ Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } BytesCopied += OperandSize; } } @@ -134,8 +177,9 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, - bool DstIsVolatile, - const TargetTransformInfo &TTI) { + bool DstIsVolatile, bool CanOverlap, + const TargetTransformInfo &TTI, + Optional AtomicElementSize) { BasicBlock *PreLoopBB = InsertBefore->getParent(); BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion"); @@ -143,12 +187,22 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Function *ParentFunc = PreLoopBB->getParent(); const DataLayout &DL = ParentFunc->getParent()->getDataLayout(); LLVMContext &Ctx = PreLoopBB->getContext(); + MDBuilder MDB(Ctx); + MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain"); + StringRef Name = "MemCopyAliasScope"; + MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name); + unsigned SrcAS = cast(SrcAddr->getType())->getAddressSpace(); unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); Type *LoopOpType = TTI.getMemcpyLoopLoweringType( - Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(), + AtomicElementSize); + assert((!AtomicElementSize || !LoopOpType->isVectorTy()) && + "Atomic memcpy lowering is not supported for vector operand type"); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); + assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) && + "Atomic memcpy lowering is not supported for selected operand size"); IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); @@ -183,19 +237,40 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); - Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign, - SrcIsVolatile); + LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, + PartSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope)); + } Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); - LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); - + StoreInst *Store = + LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. + Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } Value *NewIndex = LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U)); LoopIndex->addIncoming(NewIndex, LoopBB); - if (!LoopOpIsInt8) { - // Add in the - Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize); - Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual); + bool requiresResidual = + !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize); + if (requiresResidual) { + Type *ResLoopOpType = AtomicElementSize + ? 
Type::getIntNTy(Ctx, *AtomicElementSize * 8) + : Int8Type; + unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType); + assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) && + "Store size is expected to match type size"); + + // Add in the + Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize); + Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual); // Loop body for the residual copy. BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual", @@ -230,21 +305,34 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index"); ResidualIndex->addIncoming(Zero, ResHeaderBB); - Value *SrcAsInt8 = - ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS)); - Value *DstAsInt8 = - ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS)); + Value *SrcAsResLoopOpType = ResBuilder.CreateBitCast( + SrcAddr, PointerType::get(ResLoopOpType, SrcAS)); + Value *DstAsResLoopOpType = ResBuilder.CreateBitCast( + DstAddr, PointerType::get(ResLoopOpType, DstAS)); Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex); - Value *SrcGEP = - ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset); - Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign, - SrcIsVolatile); - Value *DstGEP = - ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset); - ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); - - Value *ResNewIndex = - ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U)); + Value *SrcGEP = ResBuilder.CreateInBoundsGEP( + ResLoopOpType, SrcAsResLoopOpType, FullOffset); + LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP, + PartSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } + Value *DstGEP = ResBuilder.CreateInBoundsGEP( + ResLoopOpType, DstAsResLoopOpType, FullOffset); + StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, + DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. + Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } + Value *ResNewIndex = ResBuilder.CreateAdd( + ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize)); ResidualIndex->addIncoming(ResNewIndex, ResLoopBB); // Create the loop branch condition. @@ -297,7 +385,13 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, Function *F = OrigBB->getParent(); const DataLayout &DL = F->getParent()->getDataLayout(); - Type *EltTy = SrcAddr->getType()->getPointerElementType(); + // TODO: Use different element type if possible? 
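// Aside: what the i8-based memmove expansion computes, in plain C++ — the
// copy direction depends on comparing the two addresses, so overlapping
// ranges are handled without a temporary buffer. A sketch mirroring the
// unsigned pointer compare the emitted IR performs (illustrative only):
#include <cstddef>

static void memmoveSketch(unsigned char *Dst, const unsigned char *Src,
                          std::size_t N) {
  if (Src >= Dst) {
    for (std::size_t I = 0; I != N; ++I) // forward copy
      Dst[I] = Src[I];
  } else {
    for (std::size_t I = N; I != 0; --I) // backward copy
      Dst[I - 1] = Src[I - 1];
  }
}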
+ IRBuilder<> CastBuilder(InsertBefore); + Type *EltTy = CastBuilder.getInt8Ty(); + Type *PtrTy = + CastBuilder.getInt8PtrTy(SrcAddr->getType()->getPointerAddressSpace()); + SrcAddr = CastBuilder.CreateBitCast(SrcAddr, PtrTy); + DstAddr = CastBuilder.CreateBitCast(DstAddr, PtrTy); // Create the a comparison of src and dst, based on which we jump to either // the forward-copy part of the function (if src >= dst) or the backwards-copy @@ -419,8 +513,21 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, NewBB); } +template +static bool canOverlap(MemTransferBase *Memcpy, ScalarEvolution *SE) { + if (SE) { + auto *SrcSCEV = SE->getSCEV(Memcpy->getRawSource()); + auto *DestSCEV = SE->getSCEV(Memcpy->getRawDest()); + if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy)) + return false; + } + return true; +} + void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + ScalarEvolution *SE) { + bool CanOverlap = canOverlap(Memcpy, SE); if (ConstantInt *CI = dyn_cast(Memcpy->getLength())) { createMemCpyLoopKnownSize( /* InsertBefore */ Memcpy, @@ -431,6 +538,7 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), /* SrcIsVolatile */ Memcpy->isVolatile(), /* DstIsVolatile */ Memcpy->isVolatile(), + /* CanOverlap */ CanOverlap, /* TargetTransformInfo */ TTI); } else { createMemCpyLoopUnknownSize( @@ -442,6 +550,7 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), /* SrcIsVolatile */ Memcpy->isVolatile(), /* DstIsVolatile */ Memcpy->isVolatile(), + /* CanOverlap */ CanOverlap, /* TargetTransformInfo */ TTI); } } @@ -465,3 +574,35 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) { /* Alignment */ Memset->getDestAlign().valueOrOne(), Memset->isVolatile()); } + +void llvm::expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemcpy, + const TargetTransformInfo &TTI, + ScalarEvolution *SE) { + if (ConstantInt *CI = dyn_cast(AtomicMemcpy->getLength())) { + createMemCpyLoopKnownSize( + /* InsertBefore */ AtomicMemcpy, + /* SrcAddr */ AtomicMemcpy->getRawSource(), + /* DstAddr */ AtomicMemcpy->getRawDest(), + /* CopyLen */ CI, + /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ AtomicMemcpy->isVolatile(), + /* DstIsVolatile */ AtomicMemcpy->isVolatile(), + /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec. + /* TargetTransformInfo */ TTI, + /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes()); + } else { + createMemCpyLoopUnknownSize( + /* InsertBefore */ AtomicMemcpy, + /* SrcAddr */ AtomicMemcpy->getRawSource(), + /* DstAddr */ AtomicMemcpy->getRawDest(), + /* CopyLen */ AtomicMemcpy->getLength(), + /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ AtomicMemcpy->isVolatile(), + /* DstIsVolatile */ AtomicMemcpy->isVolatile(), + /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec. 
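[Annotation] expandAtomicMemCpyAsLoop passes CanOverlap = false because the element-wise atomic memcpy intrinsic is specified with non-overlapping operands, and each element access in the expanded loops is an ordinary load/store downgraded to an unordered atomic, as the setAtomic calls above show. A minimal sketch of the per-element pair, assuming a builder B and the two GEPs are in scope; i32 elements with 4-byte alignment are illustrative:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static void emitAtomicElementCopy(IRBuilder<> &B, Value *SrcGEP,
                                  Value *DstGEP) {
  LoadInst *Load = B.CreateAlignedLoad(B.getInt32Ty(), SrcGEP, Align(4));
  StoreInst *Store = B.CreateAlignedStore(Load, DstGEP, Align(4));
  // Unordered is the weakest atomic ordering: each element is tear-free,
  // but no ordering is imposed between elements.
  Load->setAtomic(AtomicOrdering::Unordered);
  Store->setAtomic(AtomicOrdering::Unordered);
}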
+ /* TargetTransformInfo */ TTI, + /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes()); + } +} diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index aff9d1311688..44aeb26fadf9 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -119,25 +119,27 @@ raw_ostream &operator<<(raw_ostream &O, const CaseVector &C) { void FixPhis( BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, const unsigned NumMergedCases = std::numeric_limits::max()) { - for (BasicBlock::iterator I = SuccBB->begin(), - IE = SuccBB->getFirstNonPHI()->getIterator(); - I != IE; ++I) { - PHINode *PN = cast(I); + for (auto &I : SuccBB->phis()) { + PHINode *PN = cast(&I); - // Only update the first occurrence. + // Only update the first occurrence if NewBB exists. unsigned Idx = 0, E = PN->getNumIncomingValues(); unsigned LocalNumMergedCases = NumMergedCases; - for (; Idx != E; ++Idx) { + for (; Idx != E && NewBB; ++Idx) { if (PN->getIncomingBlock(Idx) == OrigBB) { PN->setIncomingBlock(Idx, NewBB); break; } } + // Skip the updated incoming block so that it will not be removed. + if (NewBB) + ++Idx; + // Remove additional occurrences coming from condensed cases and keep the // number of incoming values equal to the number of branches to SuccBB. SmallVector Indices; - for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) + for (; LocalNumMergedCases > 0 && Idx < E; ++Idx) if (PN->getIncomingBlock(Idx) == OrigBB) { Indices.push_back(Idx); LocalNumMergedCases--; @@ -195,6 +197,13 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound, BasicBlock *Succ = Leaf.BB; BranchInst::Create(Succ, Default, Comp, NewLeaf); + // Update the PHI incoming value/block for the default. + for (auto &I : Default->phis()) { + PHINode *PN = cast(&I); + auto *V = PN->getIncomingValueForBlock(OrigBlock); + PN->addIncoming(V, NewLeaf); + } + // If there were any PHI nodes in this successor, rewrite one entry // from OrigBlock to come from NewLeaf. for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { @@ -494,19 +503,17 @@ void ProcessSwitchInst(SwitchInst *SI, Val = SI->getCondition(); } - // Create a new, empty default block so that the new hierarchy of - // if-then statements go to this and the PHI nodes are happy. - BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default->getIterator(), NewDefault); - BranchInst::Create(Default, NewDefault); - BasicBlock *SwitchBlock = SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, - OrigBlock, OrigBlock, NewDefault, UnreachableRanges); - - // If there are entries in any PHI nodes for the default edge, make sure - // to update them as well. - FixPhis(Default, OrigBlock, NewDefault); + OrigBlock, OrigBlock, Default, UnreachableRanges); + + // We have added incoming values for newly-created predecessors in + // NewLeafBlock(). The only meaningful work we offload to FixPhis() is to + // remove the incoming values from OrigBlock. There might be a special case + // that SwitchBlock is the same as Default, under which the PHIs in Default + // are fixed inside SwitchConvert(). + if (SwitchBlock != Default) + FixPhis(Default, OrigBlock, nullptr); // Branch to our shiny new if-then stuff... 
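[Annotation] The FixPhis rework above hinges on a small PHI invariant: after an edge is split or retargeted, each PHI in the successor must keep exactly one incoming entry per predecessor edge. A sketch of the retargeting step in isolation (names hypothetical; duplicate entries from condensed cases are removed separately, as FixPhis does above):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void retargetPhis(BasicBlock *SuccBB, BasicBlock *OldPred,
                         BasicBlock *NewPred) {
  for (PHINode &PN : SuccBB->phis()) {
    // Rewrite the first entry coming from OldPred to come from NewPred.
    int Idx = PN.getBasicBlockIndex(OldPred);
    if (Idx >= 0)
      PN.setIncomingBlock(static_cast<unsigned>(Idx), NewPred);
  }
}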
BranchInst::Create(SwitchBlock, OrigBlock); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp new file mode 100644 index 000000000000..a1029475cf1d --- /dev/null +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -0,0 +1,195 @@ +//== MemoryTaggingSupport.cpp - helpers for memory tagging implementations ===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares common infrastructure for HWAddressSanitizer and +// Aarch64StackTagging. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/MemoryTaggingSupport.h" + +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IntrinsicInst.h" + +namespace llvm { +namespace memtag { +namespace { +bool maybeReachableFromEachOther(const SmallVectorImpl &Insts, + const DominatorTree *DT, const LoopInfo *LI, + size_t MaxLifetimes) { + // If we have too many lifetime ends, give up, as the algorithm below is N^2. + if (Insts.size() > MaxLifetimes) + return true; + for (size_t I = 0; I < Insts.size(); ++I) { + for (size_t J = 0; J < Insts.size(); ++J) { + if (I == J) + continue; + if (isPotentiallyReachable(Insts[I], Insts[J], nullptr, DT, LI)) + return true; + } + } + return false; +} +} // namespace + +bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT, + const LoopInfo &LI, const Instruction *Start, + const SmallVectorImpl &Ends, + const SmallVectorImpl &RetVec, + llvm::function_ref Callback) { + if (Ends.size() == 1 && PDT.dominates(Ends[0], Start)) { + Callback(Ends[0]); + return true; + } + SmallPtrSet EndBlocks; + for (auto *End : Ends) { + EndBlocks.insert(End->getParent()); + } + SmallVector ReachableRetVec; + unsigned NumCoveredExits = 0; + for (auto *RI : RetVec) { + if (!isPotentiallyReachable(Start, RI, nullptr, &DT, &LI)) + continue; + ReachableRetVec.push_back(RI); + // If there is an end in the same basic block as the return, we know for + // sure that the return is covered. Otherwise, we can check whether there + // is a way to reach the RI from the start of the lifetime without passing + // through an end. + if (EndBlocks.count(RI->getParent()) > 0 || + !isPotentiallyReachable(Start, RI, &EndBlocks, &DT, &LI)) { + ++NumCoveredExits; + } + } + // If there's a mix of covered and non-covered exits, just put the untag + // on exits, so we avoid the redundancy of untagging twice. + if (NumCoveredExits == ReachableRetVec.size()) { + for (auto *End : Ends) + Callback(End); + } else { + for (auto *RI : ReachableRetVec) + Callback(RI); + // We may have inserted untag outside of the lifetime interval. + // Signal the caller to remove the lifetime end call for this alloca. + return false; + } + return true; +} + +bool isStandardLifetime(const SmallVectorImpl &LifetimeStart, + const SmallVectorImpl &LifetimeEnd, + const DominatorTree *DT, const LoopInfo *LI, + size_t MaxLifetimes) { + // An alloca that has exactly one start and end in every possible execution. + // If it has multiple ends, they have to be unreachable from each other, so + // at most one of them is actually used for each execution of the function. 
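[Annotation] The covered-exit test in forAllReachableExits above leans on the exclusion-set form of isPotentiallyReachable: a return needs no extra untag if every path from the lifetime start to it already crosses a block containing a lifetime.end. A condensed sketch of that query, assuming the relevant analyses are available:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool returnIsCovered(const Instruction *Start, const Instruction *Ret,
                            ArrayRef<IntrinsicInst *> Ends,
                            const DominatorTree &DT, const LoopInfo &LI) {
  SmallPtrSet<BasicBlock *, 4> EndBlocks; // blocks holding lifetime.end calls
  for (IntrinsicInst *End : Ends)
    EndBlocks.insert(End->getParent());
  // Unreachable once the end blocks are excluded means every path to the
  // return already passes through a lifetime.end.
  return !isPotentiallyReachable(Start, Ret, &EndBlocks, &DT, &LI);
}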
+ return LifetimeStart.size() == 1 && + (LifetimeEnd.size() == 1 || + (LifetimeEnd.size() > 0 && + !maybeReachableFromEachOther(LifetimeEnd, DT, LI, MaxLifetimes))); +} + +Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) { + if (isa(Inst)) { + if (CallInst *CI = Inst.getParent()->getTerminatingMustTailCall()) + return CI; + return &Inst; + } + if (isa(Inst)) { + return &Inst; + } + return nullptr; +} + +void StackInfoBuilder::visit(Instruction &Inst) { + if (CallInst *CI = dyn_cast(&Inst)) { + if (CI->canReturnTwice()) { + Info.CallsReturnTwice = true; + } + } + if (AllocaInst *AI = dyn_cast(&Inst)) { + if (IsInterestingAlloca(*AI)) { + Info.AllocasToInstrument[AI].AI = AI; + } + return; + } + auto *II = dyn_cast(&Inst); + if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end)) { + AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); + if (!AI) { + Info.UnrecognizedLifetimes.push_back(&Inst); + return; + } + if (!IsInterestingAlloca(*AI)) + return; + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Info.AllocasToInstrument[AI].LifetimeStart.push_back(II); + else + Info.AllocasToInstrument[AI].LifetimeEnd.push_back(II); + return; + } + if (auto *DVI = dyn_cast(&Inst)) { + for (Value *V : DVI->location_ops()) { + if (auto *AI = dyn_cast_or_null(V)) { + if (!IsInterestingAlloca(*AI)) + continue; + AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; + auto &DVIVec = AInfo.DbgVariableIntrinsics; + if (DVIVec.empty() || DVIVec.back() != DVI) + DVIVec.push_back(DVI); + } + } + } + Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); + if (ExitUntag) + Info.RetVec.push_back(ExitUntag); +} + +uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { + auto DL = AI.getModule()->getDataLayout(); + return *AI.getAllocationSizeInBits(DL) / 8; +} + +void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { + const Align NewAlignment = std::max(Info.AI->getAlign(), Alignment); + Info.AI->setAlignment(NewAlignment); + auto &Ctx = Info.AI->getFunction()->getContext(); + + uint64_t Size = getAllocaSizeInBytes(*Info.AI); + uint64_t AlignedSize = alignTo(Size, Alignment); + if (Size == AlignedSize) + return; + + // Add padding to the alloca. + Type *AllocatedType = + Info.AI->isArrayAllocation() + ? ArrayType::get( + Info.AI->getAllocatedType(), + cast(Info.AI->getArraySize())->getZExtValue()) + : Info.AI->getAllocatedType(); + Type *PaddingType = ArrayType::get(Type::getInt8Ty(Ctx), AlignedSize - Size); + Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType); + auto *NewAI = + new AllocaInst(TypeWithPadding, Info.AI->getType()->getAddressSpace(), + nullptr, "", Info.AI); + NewAI->takeName(Info.AI); + NewAI->setAlignment(Info.AI->getAlign()); + NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); + NewAI->setSwiftError(Info.AI->isSwiftError()); + NewAI->copyMetadata(*Info.AI); + + auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI); + Info.AI->replaceAllUsesWith(NewPtr); + Info.AI->eraseFromParent(); + Info.AI = NewAI; +} + +} // namespace memtag +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp new file mode 100644 index 000000000000..b73d68ebec7c --- /dev/null +++ b/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -0,0 +1,249 @@ +//===--- MisExpect.cpp - Check the use of llvm.expect with PGO data -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit warnings for potentially incorrect usage of the +// llvm.expect intrinsic. This utility extracts the threshold values from +// metadata associated with the instrumented Branch or Switch instruction. The +// threshold values are then used to determine if a warning should be emmited. +// +// MisExpect's implementation relies on two assumptions about how branch weights +// are managed in LLVM. +// +// 1) Frontend profiling weights are always in place before llvm.expect is +// lowered in LowerExpectIntrinsic.cpp. Frontend based instrumentation therefore +// needs to extract the branch weights and then compare them to the weights +// being added by the llvm.expect intrinsic lowering. +// +// 2) Sampling and IR based profiles will *only* have branch weight metadata +// before profiling data is consulted if they are from a lowered llvm.expect +// intrinsic. These profiles thus always extract the expected weights and then +// compare them to the weights collected during profiling to determine if a +// diagnostic message is warranted. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/MisExpect.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +#define DEBUG_TYPE "misexpect" + +using namespace llvm; +using namespace misexpect; + +namespace llvm { + +// Command line option to enable/disable the warning when profile data suggests +// a mismatch with the use of the llvm.expect intrinsic +static cl::opt PGOWarnMisExpect( + "pgo-warn-misexpect", cl::init(false), cl::Hidden, + cl::desc("Use this option to turn on/off " + "warnings about incorrect usage of llvm.expect intrinsics.")); + +static cl::opt MisExpectTolerance( + "misexpect-tolerance", cl::init(0), + cl::desc("Prevents emiting diagnostics when profile counts are " + "within N% of the threshold..")); + +} // namespace llvm + +namespace { + +bool isMisExpectDiagEnabled(LLVMContext &Ctx) { + return PGOWarnMisExpect || Ctx.getMisExpectWarningRequested(); +} + +uint64_t getMisExpectTolerance(LLVMContext &Ctx) { + return std::max(static_cast(MisExpectTolerance), + Ctx.getDiagnosticsMisExpectTolerance()); +} + +Instruction *getInstCondition(Instruction *I) { + assert(I != nullptr && "MisExpect target Instruction cannot be nullptr"); + Instruction *Ret = nullptr; + if (auto *B = dyn_cast(I)) { + Ret = dyn_cast(B->getCondition()); + } + // TODO: Find a way to resolve condition location for switches + // Using the condition of the switch seems to often resolve to an earlier + // point in the program, i.e. the calculation of the switch condition, rather + // than the switch's location in the source code. Thus, we should use the + // instruction to get source code locations rather than the condition to + // improve diagnostic output, such as the caret. 
If the same problem exists + // for branch instructions, then we should remove this function and directly + // use the instruction + // + else if (auto *S = dyn_cast(I)) { + Ret = dyn_cast(S->getCondition()); + } + return Ret ? Ret : I; +} + +void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { + double PercentageCorrect = (double)ProfCount / TotalCount; + auto PerString = + formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); + auto RemStr = formatv( + "Potential performance regression from use of the llvm.expect intrinsic: " + "Annotation was correct on {0} of profiled executions.", + PerString); + Twine Msg(PerString); + Instruction *Cond = getInstCondition(I); + if (isMisExpectDiagEnabled(Ctx)) + Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Msg)); + OptimizationRemarkEmitter ORE(I->getParent()->getParent()); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); +} + +} // namespace + +namespace llvm { +namespace misexpect { + +// Helper function to extract branch weights into a vector +Optional> extractWeights(Instruction *I, + LLVMContext &Ctx) { + assert(I && "MisExpect::extractWeights given invalid pointer"); + + auto *ProfileData = I->getMetadata(LLVMContext::MD_prof); + if (!ProfileData) + return None; + + unsigned NOps = ProfileData->getNumOperands(); + if (NOps < 3) + return None; + + auto *ProfDataName = dyn_cast(ProfileData->getOperand(0)); + if (!ProfDataName || !ProfDataName->getString().equals("branch_weights")) + return None; + + SmallVector Weights(NOps - 1); + for (unsigned Idx = 1; Idx < NOps; Idx++) { + ConstantInt *Value = + mdconst::dyn_extract(ProfileData->getOperand(Idx)); + uint32_t V = Value->getZExtValue(); + Weights[Idx - 1] = V; + } + + return Weights; +} + +// TODO: when clang allows c++17, use std::clamp instead +uint32_t clamp(uint64_t value, uint32_t low, uint32_t hi) { + if (value > hi) + return hi; + if (value < low) + return low; + return value; +} + +void verifyMisExpect(Instruction &I, ArrayRef RealWeights, + ArrayRef ExpectedWeights) { + // To determine if we emit a diagnostic, we need to compare the branch weights + // from the profile to those added by the llvm.expect intrinsic. + // So first, we extract the "likely" and "unlikely" weights from + // ExpectedWeights And determine the correct weight in the profile to compare + // against. + uint64_t LikelyBranchWeight = 0, + UnlikelyBranchWeight = std::numeric_limits::max(); + size_t MaxIndex = 0; + for (size_t Idx = 0, End = ExpectedWeights.size(); Idx < End; Idx++) { + uint32_t V = ExpectedWeights[Idx]; + if (LikelyBranchWeight < V) { + LikelyBranchWeight = V; + MaxIndex = Idx; + } + if (UnlikelyBranchWeight > V) { + UnlikelyBranchWeight = V; + } + } + + const uint64_t ProfiledWeight = RealWeights[MaxIndex]; + const uint64_t RealWeightsTotal = + std::accumulate(RealWeights.begin(), RealWeights.end(), (uint64_t)0, + std::plus()); + const uint64_t NumUnlikelyTargets = RealWeights.size() - 1; + + uint64_t TotalBranchWeight = + LikelyBranchWeight + (UnlikelyBranchWeight * NumUnlikelyTargets); + + // FIXME: When we've addressed sample profiling, restore the assertion + // + // We cannot calculate branch probability if either of these invariants aren't + // met. However, MisExpect diagnostics should not prevent code from compiling, + // so we simply forgo emitting diagnostics here, and return early. 
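[Annotation] A worked instance of the threshold computation below, with invented numbers: suppose llvm.expect lowering attached weights {2000, 1} (so LikelyBranchWeight = 2000, UnlikelyBranchWeight = 1, TotalBranchWeight = 2001) while the profile recorded {60, 40}.

#include "llvm/Support/BranchProbability.h"
#include <cstdint>
using namespace llvm;

static bool exampleMisexpectCheck() {
  uint64_t LikelyBranchWeight = 2000, TotalBranchWeight = 2001;
  // Probability the annotation claims for the likely target: 2000/2001.
  auto LikelyProbability = BranchProbability::getBranchProbability(
      LikelyBranchWeight, TotalBranchWeight);
  uint64_t RealWeightsTotal = 100;                        // 60 + 40
  uint64_t ScaledThreshold = LikelyProbability.scale(RealWeightsTotal); // ~99
  uint64_t ProfiledWeight = 60;      // profile count of the "likely" target
  return ProfiledWeight < ScaledThreshold;                // true -> diagnose
}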
+ if ((TotalBranchWeight == 0) || (TotalBranchWeight <= LikelyBranchWeight)) + return; + + // To determine our threshold value we need to obtain the branch probability + // for the weights added by llvm.expect and use that proportion to calculate + // our threshold based on the collected profile data. + auto LikelyProbablilty = BranchProbability::getBranchProbability( + LikelyBranchWeight, TotalBranchWeight); + + uint64_t ScaledThreshold = LikelyProbablilty.scale(RealWeightsTotal); + + // clamp tolerance range to [0, 100) + auto Tolerance = getMisExpectTolerance(I.getContext()); + Tolerance = clamp(Tolerance, 0, 99); + + // Allow users to relax checking by N% i.e., if they use a 5% tolerance, + // then we check against 0.95*ScaledThreshold + if (Tolerance > 0) + ScaledThreshold *= (1.0 - Tolerance / 100.0); + + // When the profile weight is below the threshold, we emit the diagnostic + if (ProfiledWeight < ScaledThreshold) + emitMisexpectDiagnostic(&I, I.getContext(), ProfiledWeight, + RealWeightsTotal); +} + +void checkBackendInstrumentation(Instruction &I, + const ArrayRef RealWeights) { + auto ExpectedWeightsOpt = extractWeights(&I, I.getContext()); + if (!ExpectedWeightsOpt) + return; + auto ExpectedWeights = ExpectedWeightsOpt.getValue(); + verifyMisExpect(I, RealWeights, ExpectedWeights); +} + +void checkFrontendInstrumentation(Instruction &I, + const ArrayRef ExpectedWeights) { + auto RealWeightsOpt = extractWeights(&I, I.getContext()); + if (!RealWeightsOpt) + return; + auto RealWeights = RealWeightsOpt.getValue(); + verifyMisExpect(I, RealWeights, ExpectedWeights); +} + +void checkExpectAnnotations(Instruction &I, + const ArrayRef ExistingWeights, + bool IsFrontendInstr) { + if (IsFrontendInstr) { + checkFrontendInstrumentation(I, ExistingWeights); + } else { + checkBackendInstrumentation(I, ExistingWeights); + } +} + +} // namespace misexpect +} // namespace llvm +#undef DEBUG_TYPE diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d6a6be2762c7..5120ade70e16 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -237,8 +236,8 @@ std::string llvm::getUniqueModuleId(Module *M) { return ("." + Str).str(); } -void VFABI::setVectorVariantNames( - CallInst *CI, const SmallVector &VariantMappings) { +void VFABI::setVectorVariantNames(CallInst *CI, + ArrayRef VariantMappings) { if (VariantMappings.empty()) return; @@ -255,7 +254,7 @@ void VFABI::setVectorVariantNames( for (const std::string &VariantMapping : VariantMappings) { LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n"); Optional VI = VFABI::tryDemangleForVFABI(VariantMapping, *M); - assert(VI.hasValue() && "Cannot add an invalid VFABI name."); + assert(VI && "Cannot add an invalid VFABI name."); assert(M->getNamedValue(VI.getValue().VectorName) && "Cannot add variant to attribute: " "vector function declaration is missing."); @@ -266,14 +265,15 @@ void VFABI::setVectorVariantNames( } void llvm::embedBufferInModule(Module &M, MemoryBufferRef Buf, - StringRef SectionName) { - // Embed the buffer into the module. + StringRef SectionName, Align Alignment) { + // Embed the memory buffer into the module. 
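[Annotation] The Alignment parameter added to embedBufferInModule above matters because consumers of the embedded bytes (offloading runtimes, for instance) may reinterpret them in place. A sketch of the same embedding pattern, assuming a Module M and payload Data; the section name and alignment below are illustrative values, not fixed by the API:

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;

static void embedBytes(Module &M, StringRef Data) {
  Constant *Payload =
      ConstantDataArray::getString(M.getContext(), Data, /*AddNull=*/false);
  auto *GV = new GlobalVariable(M, Payload->getType(), /*isConstant=*/true,
                                GlobalValue::PrivateLinkage, Payload,
                                "llvm.embedded.object");
  GV->setSection(".llvm.offloading");  // example section name
  GV->setAlignment(Align(8));          // example alignment
  appendToCompilerUsed(M, GV);         // keep it alive past GlobalDCE
}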
Constant *ModuleConstant = ConstantDataArray::get( M.getContext(), makeArrayRef(Buf.getBufferStart(), Buf.getBufferSize())); GlobalVariable *GV = new GlobalVariable( M, ModuleConstant->getType(), true, GlobalValue::PrivateLinkage, ModuleConstant, "llvm.embedded.object"); GV->setSection(SectionName); + GV->setAlignment(Alignment); appendToCompilerUsed(M, GV); } diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index bd2b6fafdf2e..53334bc2a369 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -15,19 +15,12 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CFG.h" #include "llvm/IR/AssemblyAnnotationWriter.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" @@ -35,7 +28,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Transforms/Utils.h" #include #define DEBUG_TYPE "predicateinfo" using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 01b433b4782a..aff692b36288 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -20,7 +20,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -32,7 +31,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" @@ -68,7 +66,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { if (const LoadInst *LI = dyn_cast(U)) { // Note that atomic loads can be transformed; atomic semantics do // not have any meaning for a local alloca. - if (LI->isVolatile()) + if (LI->isVolatile() || LI->getType() != AI->getAllocatedType()) return false; } else if (const StoreInst *SI = dyn_cast(U)) { if (SI->getValueOperand() == AI || @@ -678,7 +676,7 @@ void PromoteMem2Reg::run() { A->eraseFromParent(); } - // Remove alloca's dbg.declare instrinsics from the function. + // Remove alloca's dbg.declare intrinsics from the function. for (auto &DbgUsers : AllocaDbgUsers) { for (auto *DII : DbgUsers) if (DII->isAddressOfVariable() || DII->getExpression()->startsWithDeref()) @@ -704,7 +702,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. 
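[Annotation] The isAllocaPromotable tightening above (a load must now read the alloca's exact allocated type, which rules out type-punning loads under opaque pointers) feeds the usual mem2reg driver loop. A minimal usage sketch, assuming a Function F with an up-to-date DominatorTree DT and AssumptionCache AC:

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
using namespace llvm;

static void promoteEntryAllocas(Function &F, DominatorTree &DT,
                                AssumptionCache &AC) {
  std::vector<AllocaInst *> Allocas;
  for (Instruction &I : F.getEntryBlock()) // allocas live in the entry block
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      if (isAllocaPromotable(AI))          // skips volatile/type-punned uses
        Allocas.push_back(AI);
  if (!Allocas.empty())
    PromoteMemToReg(Allocas, DT, &AC);     // rewrite to SSA, drop the allocas
}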
- if (Value *V = SimplifyInstruction(PN, SQ)) { + if (Value *V = simplifyInstruction(PN, SQ)) { PN->replaceAllUsesWith(V); PN->eraseFromParent(); NewPhiNodes.erase(I++); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 65207056a3f4..926427450682 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -18,9 +18,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -38,11 +35,13 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { GetElementPtrInst *GEP = dyn_cast(GV.use_begin()->getUser()); - if (!GEP || !GEP->hasOneUse()) + if (!GEP || !GEP->hasOneUse() || + GV.getValueType() != GEP->getSourceElementType()) return false; LoadInst *Load = dyn_cast(GEP->use_begin()->getUser()); - if (!Load || !Load->hasOneUse()) + if (!Load || !Load->hasOneUse() || + Load->getType() != GEP->getResultElementType()) return false; // If the original lookup table does not have local linkage and is @@ -144,7 +143,7 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Value *Offset = Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); - // Insert the call to load.relative instrinsic before LOAD. + // Insert the call to load.relative intrinsic before LOAD. // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. Builder.SetInsertPoint(Load); @@ -171,13 +170,17 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { // Convert lookup tables to relative lookup tables in the module. static bool convertToRelativeLookupTables( Module &M, function_ref GetTTI) { - Module::iterator FI = M.begin(); - if (FI == M.end()) - return false; + for (Function &F : M) { + if (F.isDeclaration()) + continue; - // Check if we have a target that supports relative lookup tables. - if (!GetTTI(*FI).shouldBuildRelLookupTables()) - return false; + // Check if we have a target that supports relative lookup tables. + if (!GetTTI(F).shouldBuildRelLookupTables()) + return false; + + // We assume that the result is independent of the checked function. 
+ break; + } bool Changed = false; diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index d7e8eaf677c6..eee91e70292e 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -15,14 +15,12 @@ #include "llvm/Transforms/Utils/SCCPSolver.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" +#include "llvm/Analysis/ValueLattice.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" #include #include #include @@ -452,7 +450,8 @@ public: return TrackingIncomingArguments; } - void markArgInFuncSpecialization(Function *F, Argument *A, Constant *C); + void markArgInFuncSpecialization(Function *F, + const SmallVectorImpl &Args); void markFunctionUnreachable(Function *F) { for (auto &BB : *F) @@ -526,29 +525,38 @@ Constant *SCCPInstVisitor::getConstant(const ValueLatticeElement &LV) const { return nullptr; } -void SCCPInstVisitor::markArgInFuncSpecialization(Function *F, Argument *A, - Constant *C) { - assert(F->arg_size() == A->getParent()->arg_size() && +void SCCPInstVisitor::markArgInFuncSpecialization( + Function *F, const SmallVectorImpl &Args) { + assert(!Args.empty() && "Specialization without arguments"); + assert(F->arg_size() == Args[0].Formal->getParent()->arg_size() && "Functions should have the same number of arguments"); - // Mark the argument constant in the new function. - markConstant(A, C); - - // For the remaining arguments in the new function, copy the lattice state - // over from the old function. - for (auto I = F->arg_begin(), J = A->getParent()->arg_begin(), - E = F->arg_end(); - I != E; ++I, ++J) - if (J != A && ValueState.count(I)) { + auto Iter = Args.begin(); + Argument *NewArg = F->arg_begin(); + Argument *OldArg = Args[0].Formal->getParent()->arg_begin(); + for (auto End = F->arg_end(); NewArg != End; ++NewArg, ++OldArg) { + + LLVM_DEBUG(dbgs() << "SCCP: Marking argument " + << NewArg->getNameOrAsOperand() << "\n"); + + if (Iter != Args.end() && OldArg == Iter->Formal) { + // Mark the argument constants in the new function. + markConstant(NewArg, Iter->Actual); + ++Iter; + } else if (ValueState.count(OldArg)) { + // For the remaining arguments in the new function, copy the lattice state + // over from the old function. + // // Note: This previously looked like this: - // ValueState[J] = ValueState[I]; + // ValueState[NewArg] = ValueState[OldArg]; // This is incorrect because the DenseMap class may resize the underlying - // memory when inserting `J`, which will invalidate the reference to `I`. - // Instead, we make sure `J` exists, then set it to `I` afterwards. - auto &NewValue = ValueState[J]; - NewValue = ValueState[I]; - pushToWorkList(NewValue, J); + // memory when inserting `NewArg`, which will invalidate the reference to + // `OldArg`. Instead, we make sure `NewArg` exists before setting it. + auto &NewValue = ValueState[NewArg]; + NewValue = ValueState[OldArg]; + pushToWorkList(NewValue, NewArg); } + } } void SCCPInstVisitor::visitInstruction(Instruction &I) { @@ -988,7 +996,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { if ((V1State.isConstant() || V2State.isConstant())) { Value *V1 = isConstant(V1State) ? 
getConstant(V1State) : I.getOperand(0); Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1); - Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL)); + Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL)); auto *C = dyn_cast_or_null(R); if (C) { // X op Y -> undef. @@ -1287,17 +1295,6 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return; } - // TODO: Actually filp MayIncludeUndef for the created range to false, - // once most places in the optimizer respect the branches on - // undef/poison are UB rule. The reason why the new range cannot be - // undef is as follows below: - // The new range is based on a branch condition. That guarantees that - // neither of the compare operands can be undef in the branch targets, - // unless we have conditions that are always true/false (e.g. icmp ule - // i32, %a, i32_max). For the latter overdefined/empty range will be - // inferred, but the branch will get folded accordingly anyways. - bool MayIncludeUndef = !isa(PI); - ValueLatticeElement CondVal = getValueState(OtherOp); ValueLatticeElement &IV = ValueState[&CB]; if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { @@ -1322,9 +1319,15 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) NewCR = CopyOfCR; + // The new range is based on a branch condition. That guarantees that + // neither of the compare operands can be undef in the branch targets, + // unless we have conditions that are always true/false (e.g. icmp ule + // i32, %a, i32_max). For the latter overdefined/empty range will be + // inferred, but the branch will get folded accordingly anyways. addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, - ValueLatticeElement::getRange(NewCR, MayIncludeUndef)); + mergeInValue( + IV, &CB, + ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); return; } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) { // For non-integer values or integer constant expressions, only @@ -1332,8 +1335,7 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { addAdditionalUser(OtherOp, &CB); mergeInValue(IV, &CB, CondVal); return; - } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant() && - !MayIncludeUndef) { + } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { // Propagate inequalities. addAdditionalUser(OtherOp, &CB); mergeInValue(IV, &CB, @@ -1442,22 +1444,19 @@ void SCCPInstVisitor::solve() { } } -/// resolvedUndefsIn - While solving the dataflow for a function, we assume -/// that branches on undef values cannot reach any of their successors. -/// However, this is not a safe assumption. After we solve dataflow, this -/// method should be use to handle this. If this returns true, the solver -/// should be rerun. +/// While solving the dataflow for a function, we don't compute a result for +/// operations with an undef operand, to allow undef to be lowered to a +/// constant later. For example, constant folding of "zext i8 undef to i16" +/// would result in "i16 0", and if undef is later lowered to "i8 1", then the +/// zext result would become "i16 1" and would result into an overdefined +/// lattice value once merged with the previous result. Not computing the +/// result of the zext (treating undef the same as unknown) allows us to handle +/// a later undef->constant lowering more optimally. 
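[Annotation] The hazard the rewritten comment describes is easy to see with two inconsistent resolutions of the same undef operand (values illustrative):

#include "llvm/ADT/APInt.h"
using llvm::APInt;

// Folding "zext i8 undef to i16" early commits to one concrete value.
static bool zextFoldWasSafe() {
  APInt FoldedEarly = APInt(8, 0).zext(16); // assume undef == 0 -> i16 0
  APInt FoldedLate = APInt(8, 1).zext(16);  // undef later lowered to 1 -> i16 1
  // The two results disagree, so merging them drives the lattice to
  // overdefined; leaving the zext unknown keeps the undef->constant
  // choice open.
  return FoldedEarly == FoldedLate; // false
}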
/// -/// This method handles this by finding an unresolved branch and marking it one -/// of the edges from the block as being feasible, even though the condition -/// doesn't say it would otherwise be. This allows SCCP to find the rest of the -/// CFG and only slightly pessimizes the analysis results (by marking one, -/// potentially infeasible, edge feasible). This cannot usefully modify the -/// constraints on the condition of the branch, as that would impact other users -/// of the value. -/// -/// This scan also checks for values that use undefs. It conservatively marks -/// them as overdefined. +/// However, if the operand remains undef when the solver returns, we do need +/// to assign some result to the instruction (otherwise we would treat it as +/// unreachable). For simplicity, we mark any instructions that are still +/// unknown as overdefined. bool SCCPInstVisitor::resolvedUndefsIn(Function &F) { bool MadeChange = false; for (BasicBlock &BB : F) { @@ -1486,7 +1485,7 @@ bool SCCPInstVisitor::resolvedUndefsIn(Function &F) { // more precise than this but it isn't worth bothering. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { ValueLatticeElement &LV = getStructValueState(&I, i); - if (LV.isUnknownOrUndef()) { + if (LV.isUnknown()) { markOverdefined(LV, &I); MadeChange = true; } @@ -1495,7 +1494,7 @@ bool SCCPInstVisitor::resolvedUndefsIn(Function &F) { } ValueLatticeElement &LV = getValueState(&I); - if (!LV.isUnknownOrUndef()) + if (!LV.isUnknown()) continue; // There are two reasons a call can have an undef result @@ -1518,91 +1517,6 @@ bool SCCPInstVisitor::resolvedUndefsIn(Function &F) { markOverdefined(&I); MadeChange = true; } - - // Check to see if we have a branch or switch on an undefined value. If so - // we force the branch to go one way or the other to make the successor - // values live. It doesn't really matter which way we force it. - Instruction *TI = BB.getTerminator(); - if (auto *BI = dyn_cast(TI)) { - if (!BI->isConditional()) - continue; - if (!getValueState(BI->getCondition()).isUnknownOrUndef()) - continue; - - // If the input to SCCP is actually branch on undef, fix the undef to - // false. - if (isa(BI->getCondition())) { - BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(&BB, TI->getSuccessor(1)); - MadeChange = true; - continue; - } - - // Otherwise, it is a branch on a symbolic value which is currently - // considered to be undef. Make sure some edge is executable, so a - // branch on "undef" always flows somewhere. - // FIXME: Distinguish between dead code and an LLVM "undef" value. - BasicBlock *DefaultSuccessor = TI->getSuccessor(1); - if (markEdgeExecutable(&BB, DefaultSuccessor)) - MadeChange = true; - - continue; - } - - if (auto *IBR = dyn_cast(TI)) { - // Indirect branch with no successor ?. Its ok to assume it branches - // to no target. - if (IBR->getNumSuccessors() < 1) - continue; - - if (!getValueState(IBR->getAddress()).isUnknownOrUndef()) - continue; - - // If the input to SCCP is actually branch on undef, fix the undef to - // the first successor of the indirect branch. - if (isa(IBR->getAddress())) { - IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0))); - markEdgeExecutable(&BB, IBR->getSuccessor(0)); - MadeChange = true; - continue; - } - - // Otherwise, it is a branch on a symbolic value which is currently - // considered to be undef. Make sure some edge is executable, so a - // branch on "undef" always flows somewhere. 
- // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere: - // we can assume the branch has undefined behavior instead. - BasicBlock *DefaultSuccessor = IBR->getSuccessor(0); - if (markEdgeExecutable(&BB, DefaultSuccessor)) - MadeChange = true; - - continue; - } - - if (auto *SI = dyn_cast(TI)) { - if (!SI->getNumCases() || - !getValueState(SI->getCondition()).isUnknownOrUndef()) - continue; - - // If the input to SCCP is actually switch on undef, fix the undef to - // the first constant. - if (isa(SI->getCondition())) { - SI->setCondition(SI->case_begin()->getCaseValue()); - markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor()); - MadeChange = true; - continue; - } - - // Otherwise, it is a branch on a symbolic value which is currently - // considered to be undef. Make sure some edge is executable, so a - // branch on "undef" always flows somewhere. - // FIXME: Distinguish between dead code and an LLVM "undef" value. - BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor(); - if (markEdgeExecutable(&BB, DefaultSuccessor)) - MadeChange = true; - - continue; - } } return MadeChange; @@ -1618,7 +1532,7 @@ SCCPSolver::SCCPSolver( LLVMContext &Ctx) : Visitor(new SCCPInstVisitor(DL, std::move(GetTLI), Ctx)) {} -SCCPSolver::~SCCPSolver() {} +SCCPSolver::~SCCPSolver() = default; void SCCPSolver::addAnalysis(Function &F, AnalysisResultsForFn A) { return Visitor->addAnalysis(F, std::move(A)); @@ -1713,9 +1627,9 @@ SmallPtrSetImpl &SCCPSolver::getArgumentTrackedFunctions() { return Visitor->getArgumentTrackedFunctions(); } -void SCCPSolver::markArgInFuncSpecialization(Function *F, Argument *A, - Constant *C) { - Visitor->markArgInFuncSpecialization(F, A, C); +void SCCPSolver::markArgInFuncSpecialization( + Function *F, const SmallVectorImpl &Args) { + Visitor->markArgInFuncSpecialization(F, Args); } void SCCPSolver::markFunctionUnreachable(Function *F) { diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 7d9992176658..37019e3bf95b 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -25,7 +25,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -166,7 +165,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // See if the PHI node can be merged to a single value. This can happen in // loop cases when we get a PHI of itself and one other value. 
if (Value *V = - SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) { + simplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) { InsertedPHI->eraseFromParent(); return V; } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 961adf2570a7..5e92b9852a9f 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -15,15 +15,46 @@ #include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/ADT/BitVector.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include #include +#include using namespace llvm; #define DEBUG_TYPE "sample-profile-inference" namespace { +static cl::opt SampleProfileEvenCountDistribution( + "sample-profile-even-count-distribution", cl::init(true), cl::Hidden, + cl::desc("Try to evenly distribute counts when there are multiple equally " + "likely options.")); + +static cl::opt SampleProfileMaxDfsCalls( + "sample-profile-max-dfs-calls", cl::init(10), cl::Hidden, + cl::desc("Maximum number of dfs iterations for even count distribution.")); + +static cl::opt SampleProfileProfiCostInc( + "sample-profile-profi-cost-inc", cl::init(10), cl::Hidden, + cl::desc("A cost of increasing a block's count by one.")); + +static cl::opt SampleProfileProfiCostDec( + "sample-profile-profi-cost-dec", cl::init(20), cl::Hidden, + cl::desc("A cost of decreasing a block's count by one.")); + +static cl::opt SampleProfileProfiCostIncZero( + "sample-profile-profi-cost-inc-zero", cl::init(11), cl::Hidden, + cl::desc("A cost of increasing a count of zero-weight block by one.")); + +static cl::opt SampleProfileProfiCostIncEntry( + "sample-profile-profi-cost-inc-entry", cl::init(40), cl::Hidden, + cl::desc("A cost of increasing the entry block's count by one.")); + +static cl::opt SampleProfileProfiCostDecEntry( + "sample-profile-profi-cost-dec-entry", cl::init(10), cl::Hidden, + cl::desc("A cost of decreasing the entry block's count by one.")); + /// A value indicating an infinite flow/capacity/weight of a block/edge. /// Not using numeric_limits::max(), as the values can be summed up /// during the execution. @@ -52,16 +83,16 @@ public: Nodes = std::vector(NodeCount); Edges = std::vector>(NodeCount, std::vector()); + if (SampleProfileEvenCountDistribution) + AugmentingEdges = + std::vector>(NodeCount, std::vector()); } // Run the algorithm. int64_t run() { - // Find an augmenting path and update the flow along the path - size_t AugmentationIters = 0; - while (findAugmentingPath()) { - augmentFlowAlongPath(); - AugmentationIters++; - } + // Iteratively find an augmentation path/dag in the network and send the + // flow along its edges + size_t AugmentationIters = applyFlowAugmentation(); // Compute the total flow and its cost int64_t TotalCost = 0; @@ -79,6 +110,7 @@ public: << " iterations with " << TotalFlow << " total flow" << " of " << TotalCost << " cost\n"); (void)TotalFlow; + (void)AugmentationIters; return TotalCost; } @@ -134,20 +166,61 @@ public: return Flow; } - /// A cost of increasing a block's count by one. - static constexpr int64_t AuxCostInc = 10; - /// A cost of decreasing a block's count by one. - static constexpr int64_t AuxCostDec = 20; - /// A cost of increasing a count of zero-weight block by one. - static constexpr int64_t AuxCostIncZero = 11; - /// A cost of increasing the entry block's count by one. 
- static constexpr int64_t AuxCostIncEntry = 40; - /// A cost of decreasing the entry block's count by one. - static constexpr int64_t AuxCostDecEntry = 10; /// A cost of taking an unlikely jump. static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30; + /// Minimum BaseDistance for the jump distance values in island joining. + static constexpr uint64_t MinBaseDistance = 10000; private: + /// Iteratively find an augmentation path/dag in the network and send the + /// flow along its edges. The method returns the number of applied iterations. + size_t applyFlowAugmentation() { + size_t AugmentationIters = 0; + while (findAugmentingPath()) { + uint64_t PathCapacity = computeAugmentingPathCapacity(); + while (PathCapacity > 0) { + bool Progress = false; + if (SampleProfileEvenCountDistribution) { + // Identify node/edge candidates for augmentation + identifyShortestEdges(PathCapacity); + + // Find an augmenting DAG + auto AugmentingOrder = findAugmentingDAG(); + + // Apply the DAG augmentation + Progress = augmentFlowAlongDAG(AugmentingOrder); + PathCapacity = computeAugmentingPathCapacity(); + } + + if (!Progress) { + augmentFlowAlongPath(PathCapacity); + PathCapacity = 0; + } + + AugmentationIters++; + } + } + return AugmentationIters; + } + + /// Compute the capacity of the cannonical augmenting path. If the path is + /// saturated (that is, no flow can be sent along the path), then return 0. + uint64_t computeAugmentingPathCapacity() { + uint64_t PathCapacity = INF; + uint64_t Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + + assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow"); + uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow); + PathCapacity = std::min(PathCapacity, EdgeCapacity); + + Now = Pred; + } + return PathCapacity; + } + /// Check for existence of an augmenting path with a positive capacity. bool findAugmentingPath() { // Initialize data structures @@ -180,7 +253,7 @@ private: // from Source to Target; it follows from inequalities // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] // >= Dist[Source, V] - if (Nodes[Target].Distance == 0) + if (!SampleProfileEvenCountDistribution && Nodes[Target].Distance == 0) break; if (Nodes[Src].Distance > Nodes[Target].Distance) continue; @@ -210,21 +283,9 @@ private: } /// Update the current flow along the augmenting path. - void augmentFlowAlongPath() { - // Find path capacity - int64_t PathCapacity = INF; - uint64_t Now = Target; - while (Now != Source) { - uint64_t Pred = Nodes[Now].ParentNode; - auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; - PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow); - Now = Pred; - } - + void augmentFlowAlongPath(uint64_t PathCapacity) { assert(PathCapacity > 0 && "found an incorrect augmenting path"); - - // Update the flow along the path - Now = Target; + uint64_t Now = Target; while (Now != Source) { uint64_t Pred = Nodes[Now].ParentNode; auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; @@ -237,6 +298,220 @@ private: } } + /// Find an Augmenting DAG order using a modified version of DFS in which we + /// can visit a node multiple times. In the DFS search, when scanning each + /// edge out of a node, continue search at Edge.Dst endpoint if it has not + /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm + /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time. 
+ /// It returns an Augmenting Order (Taken nodes in decreasing Finish time) + /// that starts with Source and ends with Target. + std::vector findAugmentingDAG() { + // We use a stack based implemenation of DFS to avoid recursion. + // Defining DFS data structures: + // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that + // - we are currently visiting Nodes[NodeIdx] and + // - the next edge to scan is Edges[NodeIdx][EdgeIdx] + typedef std::pair StackItemType; + std::stack Stack; + std::vector AugmentingOrder; + + // Phase 0: Initialize Node attributes and Time for DFS run + for (auto &Node : Nodes) { + Node.Discovery = 0; + Node.Finish = 0; + Node.NumCalls = 0; + Node.Taken = false; + } + uint64_t Time = 0; + // Mark Target as Taken + // Taken attribute will be propagated backwards from Target towards Source + Nodes[Target].Taken = true; + + // Phase 1: Start DFS traversal from Source + Stack.emplace(Source, 0); + Nodes[Source].Discovery = ++Time; + while (!Stack.empty()) { + auto NodeIdx = Stack.top().first; + auto EdgeIdx = Stack.top().second; + + // If we haven't scanned all edges out of NodeIdx, continue scanning + if (EdgeIdx < Edges[NodeIdx].size()) { + auto &Edge = Edges[NodeIdx][EdgeIdx]; + auto &Dst = Nodes[Edge.Dst]; + Stack.top().second++; + + if (Edge.OnShortestPath) { + // If we haven't seen Edge.Dst so far, continue DFS search there + if (Dst.Discovery == 0 && Dst.NumCalls < SampleProfileMaxDfsCalls) { + Dst.Discovery = ++Time; + Stack.emplace(Edge.Dst, 0); + Dst.NumCalls++; + } else if (Dst.Taken && Dst.Finish != 0) { + // Else, if Edge.Dst already have a path to Target, so that NodeIdx + Nodes[NodeIdx].Taken = true; + } + } + } else { + // If we are done scanning all edge out of NodeIdx + Stack.pop(); + // If we haven't found a path from NodeIdx to Target, forget about it + if (!Nodes[NodeIdx].Taken) { + Nodes[NodeIdx].Discovery = 0; + } else { + // If we have found a path from NodeIdx to Target, then finish NodeIdx + // and propagate Taken flag to DFS parent unless at the Source + Nodes[NodeIdx].Finish = ++Time; + // NodeIdx == Source if and only if the stack is empty + if (NodeIdx != Source) { + assert(!Stack.empty() && "empty stack while running dfs"); + Nodes[Stack.top().first].Taken = true; + } + AugmentingOrder.push_back(NodeIdx); + } + } + } + // Nodes are collected decreasing Finish time, so the order is reversed + std::reverse(AugmentingOrder.begin(), AugmentingOrder.end()); + + // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges + for (size_t Src : AugmentingOrder) { + AugmentingEdges[Src].clear(); + for (auto &Edge : Edges[Src]) { + uint64_t Dst = Edge.Dst; + if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken && + Nodes[Dst].Finish < Nodes[Src].Finish) { + AugmentingEdges[Src].push_back(&Edge); + } + } + assert((Src == Target || !AugmentingEdges[Src].empty()) && + "incorrectly constructed augmenting edges"); + } + + return AugmentingOrder; + } + + /// Update the current flow along the given (acyclic) subgraph specified by + /// the vertex order, AugmentingOrder. The objective is to send as much flow + /// as possible while evenly distributing flow among successors of each node. + /// After the update at least one edge is saturated. 
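[Annotation] augmentFlowAlongDAG, below, rounds the per-successor split up so no flow is stranded at a node, then pushes any excess back in a reverse pass. The rounding step of Phase 2 in isolation, with illustrative numbers:

#include <cstdint>

static void evenSplitExample() {
  uint64_t IntFlow = 10, Degree = 3;
  uint64_t SuccFlow = (IntFlow + Degree - 1) / Degree; // ceil(10/3) == 4
  // Each edge takes min(remaining flow, SuccFlow), further clamped by its
  // residual capacity: 4, 4, then 2, so all 10 units leave the node. This
  // is why IntFlow <= SuccFlow * Degree is guaranteed.
  (void)SuccFlow;
}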
+ bool augmentFlowAlongDAG(const std::vector &AugmentingOrder) { + // Phase 0: Initialization + for (uint64_t Src : AugmentingOrder) { + Nodes[Src].FracFlow = 0; + Nodes[Src].IntFlow = 0; + for (auto &Edge : AugmentingEdges[Src]) { + Edge->AugmentedFlow = 0; + } + } + + // Phase 1: Send a unit of fractional flow along the DAG + uint64_t MaxFlowAmount = INF; + Nodes[Source].FracFlow = 1.0; + for (uint64_t Src : AugmentingOrder) { + assert((Src == Target || Nodes[Src].FracFlow > 0.0) && + "incorrectly computed fractional flow"); + // Distribute flow evenly among successors of Src + uint64_t Degree = AugmentingEdges[Src].size(); + for (auto &Edge : AugmentingEdges[Src]) { + double EdgeFlow = Nodes[Src].FracFlow / Degree; + Nodes[Edge->Dst].FracFlow += EdgeFlow; + if (Edge->Capacity == INF) + continue; + uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow; + MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow); + } + } + // Stop early if we cannot send any (integral) flow from Source to Target + if (MaxFlowAmount == 0) + return false; + + // Phase 2: Send an integral flow of MaxFlowAmount + Nodes[Source].IntFlow = MaxFlowAmount; + for (uint64_t Src : AugmentingOrder) { + if (Src == Target) + break; + // Distribute flow evenly among successors of Src, rounding up to make + // sure all flow is sent + uint64_t Degree = AugmentingEdges[Src].size(); + // We are guaranteeed that Node[Src].IntFlow <= SuccFlow * Degree + uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree; + for (auto &Edge : AugmentingEdges[Src]) { + uint64_t Dst = Edge->Dst; + uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow); + EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow)); + Nodes[Dst].IntFlow += EdgeFlow; + Nodes[Src].IntFlow -= EdgeFlow; + Edge->AugmentedFlow += EdgeFlow; + } + } + assert(Nodes[Target].IntFlow <= MaxFlowAmount); + Nodes[Target].IntFlow = 0; + + // Phase 3: Send excess flow back traversing the nodes backwards. + // Because of rounding, not all flow can be sent along the edges of Src. + // Hence, sending the remaining flow back to maintain flow conservation + for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) { + uint64_t Src = AugmentingOrder[Idx - 1]; + // Try to send excess flow back along each edge. + // Make sure we only send back flow we just augmented (AugmentedFlow). + for (auto &Edge : AugmentingEdges[Src]) { + uint64_t Dst = Edge->Dst; + if (Nodes[Dst].IntFlow == 0) + continue; + uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow); + Nodes[Dst].IntFlow -= EdgeFlow; + Nodes[Src].IntFlow += EdgeFlow; + Edge->AugmentedFlow -= EdgeFlow; + } + } + + // Phase 4: Update flow values along all edges + bool HasSaturatedEdges = false; + for (uint64_t Src : AugmentingOrder) { + // Verify that we have sent all the excess flow from the node + assert(Src == Source || Nodes[Src].IntFlow == 0); + for (auto &Edge : AugmentingEdges[Src]) { + assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow); + // Update flow values along the edge and its reverse copy + auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex]; + Edge->Flow += Edge->AugmentedFlow; + RevEdge.Flow -= Edge->AugmentedFlow; + if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0) + HasSaturatedEdges = true; + } + } + + // The augmentation is successful iff at least one edge becomes saturated + return HasSaturatedEdges; + } + + /// Identify candidate (shortest) edges for augmentation. 
+ void identifyShortestEdges(uint64_t PathCapacity) { + assert(PathCapacity > 0 && "found an incorrect augmenting DAG"); + // To make sure the augmentation DAG contains only edges with large residual + // capacity, we prune all edges whose capacity is below a fraction of + // the capacity of the augmented path. + // (All edges of the path itself are always in the DAG) + uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1)); + + // Decide which edges are on a shortest path from Source to Target + for (size_t Src = 0; Src < Nodes.size(); Src++) { + // An edge cannot be augmenting if the endpoint has large distance + if (Nodes[Src].Distance > Nodes[Target].Distance) + continue; + + for (auto &Edge : Edges[Src]) { + uint64_t Dst = Edge.Dst; + Edge.OnShortestPath = + Src != Target && Dst != Source && + Nodes[Dst].Distance <= Nodes[Target].Distance && + Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost && + Edge.Capacity > Edge.Flow && + uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity; + } + } + } + /// A node in a flow network. struct Node { /// The cost of the cheapest path from the source to the current node. @@ -247,7 +522,20 @@ private: uint64_t ParentEdgeIndex; /// An indicator of whether the current node is in a queue. bool Taken; + + /// Data fields utilized in DAG-augmentation: + /// Fractional flow. + double FracFlow; + /// Integral flow. + uint64_t IntFlow; + /// Discovery time. + uint64_t Discovery; + /// Finish time. + uint64_t Finish; + /// NumCalls. + uint64_t NumCalls; }; + /// An edge in a flow network. struct Edge { /// The cost of the edge. @@ -260,6 +548,12 @@ private: uint64_t Dst; /// The index of the reverse edge between Dst and the current node. uint64_t RevEdgeIndex; + + /// Data fields utilized in DAG-augmentation: + /// Whether the edge is currently on a shortest path from Source to Target. + bool OnShortestPath; + /// Extra flow along the edge. + uint64_t AugmentedFlow; }; /// The set of network nodes. @@ -270,8 +564,13 @@ private: uint64_t Source; /// Target (sink) node of the flow. uint64_t Target; + /// Augmenting edges. + std::vector> AugmentingEdges; }; +constexpr int64_t MinCostMaxFlow::AuxCostUnlikely; +constexpr uint64_t MinCostMaxFlow::MinBaseDistance; + /// A post-processing adjustment of control flow. It applies two steps by /// rerouting some flow and making it more realistic: /// @@ -433,19 +732,22 @@ private: /// A distance of a path for a given jump. /// In order to incite the path to use blocks/jumps with large positive flow, /// and avoid changing branch probability of outgoing edges drastically, - /// set the distance as follows: - /// if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0) - /// if Block.Weight > 0, then distance = 1 - /// otherwise distance >> 1 + /// set the jump distance so as: + /// - to minimize the number of unlikely jumps used and subject to that, + /// - to minimize the number of Flow == 0 jumps used and subject to that, + /// - minimizes total multiplicative Flow increase for the remaining edges. + /// To capture this objective with integer distances, we round off fractional + /// parts to a multiple of 1 / BaseDistance. 
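[Annotation] With the new distance function below, high-flow jumps cost barely more than BaseDistance while zero-flow jumps cost BaseDistance * NumBlocks(), so the shortest-path search reroutes along jumps that already carry flow. A worked instance with illustrative numbers (BaseDistance clamped to MinBaseDistance = 10000):

#include <cstdint>

// Assumes Flow > 0; Flow == 0 jumps get BaseDistance * NumBlocks() instead.
static uint64_t exampleJumpDistance(uint64_t Flow) {
  const uint64_t BaseDistance = 10000;
  return BaseDistance + BaseDistance / Flow;
}
// exampleJumpDistance(100) == 10100 and exampleJumpDistance(1) == 20000,
// so a path through the Flow == 100 jump is strongly preferred.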
   int64_t jumpDistance(FlowJump *Jump) const {
-    int64_t BaseDistance = 100;
+    uint64_t BaseDistance =
+        std::max(static_cast<uint64_t>(MinCostMaxFlow::MinBaseDistance),
+                 std::min(Func.Blocks[Func.Entry].Flow,
+                          MinCostMaxFlow::AuxCostUnlikely / NumBlocks()));
     if (Jump->IsUnlikely)
       return MinCostMaxFlow::AuxCostUnlikely;
     if (Jump->Flow > 0)
-      return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0);
-    if (Func.Blocks[Jump->Target].Weight > 0)
-      return BaseDistance;
-    return BaseDistance * (NumBlocks() + 1);
+      return BaseDistance + BaseDistance / Jump->Flow;
+    return BaseDistance * NumBlocks();
   };

   uint64_t NumBlocks() const { return Func.Blocks.size(); }

@@ -511,7 +813,7 @@ private:
                            std::vector<FlowBlock *> &KnownDstBlocks,
                            std::vector<FlowBlock *> &UnknownBlocks) {
     // Run BFS from SrcBlock and make sure all paths are going through unknown
-    // blocks and end at a non-unknown DstBlock
+    // blocks and end at a known DstBlock
     auto Visited = BitVector(NumBlocks(), false);
     std::queue<uint64_t> Queue;

@@ -778,8 +1080,8 @@ void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
     // We assume that decreasing block counts is more expensive than increasing,
    // and thus set separate costs here. In the future we may want to tune
    // the relative costs so as to maximize the quality of generated profiles.
-    int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
-    int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
+    int64_t AuxCostInc = SampleProfileProfiCostInc;
+    int64_t AuxCostDec = SampleProfileProfiCostDec;
     if (Block.UnknownWeight) {
       // Do not penalize changing weights of blocks w/o known profile count
       AuxCostInc = 0;
@@ -788,12 +1090,12 @@ void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
       // Increasing the count for "cold" blocks with zero initial count is more
       // expensive than for "hot" ones
       if (Block.Weight == 0) {
-        AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
+        AuxCostInc = SampleProfileProfiCostIncZero;
       }
       // Modifying the count of the entry block is expensive
       if (Block.isEntry()) {
-        AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
-        AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
+        AuxCostInc = SampleProfileProfiCostIncEntry;
+        AuxCostDec = SampleProfileProfiCostDecEntry;
       }
     }
     // For blocks with self-edges, do not penalize a reduction of the count,
diff --git a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
index ea0e8343eb88..a2588b8cec7d 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
@@ -11,6 +11,10 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"

 namespace llvm {

@@ -35,9 +39,13 @@ cl::opt<bool> NoWarnSampleUnused(
             "samples but without debug information to use those samples. "));

 cl::opt<bool> SampleProfileUseProfi(
-    "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+    "sample-profile-use-profi", cl::Hidden,
     cl::desc("Use profi to infer block and edge counts."));

+cl::opt<bool> SampleProfileInferEntryCount(
+    "sample-profile-infer-entry-count", cl::init(true), cl::Hidden,
+    cl::desc("Use profi to infer function entry count."));
+
 namespace sampleprofutil {

 /// Return true if the given callsite is hot wrt the hot cutoff threshold.
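[Editorial note: a standalone sketch of the new jumpDistance() arithmetic from the hunk above; it is not part of the patch. The numeric values for MinBaseDistance and AuxCostUnlikely are hypothetical stand-ins, and the function is flattened into free parameters — only the shape of the formula is taken from the patch.]

#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for MinCostMaxFlow::MinBaseDistance and
// MinCostMaxFlow::AuxCostUnlikely; the real constants are defined elsewhere
// in SampleProfileInference.
static constexpr uint64_t MinBaseDistance = 10000;
static constexpr uint64_t AuxCostUnlikely = uint64_t(1) << 30;

static uint64_t jumpDistance(uint64_t EntryFlow, uint64_t NumBlocks,
                             uint64_t JumpFlow, bool IsUnlikely) {
  uint64_t BaseDistance =
      std::max(MinBaseDistance,
               std::min(EntryFlow, AuxCostUnlikely / NumBlocks));
  if (IsUnlikely)
    return AuxCostUnlikely;
  if (JumpFlow > 0)
    return BaseDistance + BaseDistance / JumpFlow;
  return BaseDistance * NumBlocks;
}

int main() {
  // Entry flow 1,000,000 over 100 blocks: BaseDistance = 1,000,000.
  const uint64_t Entry = 1000000, NB = 100;
  // A jump as hot as the entry costs just over one BaseDistance unit.
  assert(jumpDistance(Entry, NB, /*JumpFlow=*/1000000, false) == 1000001);
  // A Flow == 1 jump costs twice as much ...
  assert(jumpDistance(Entry, NB, 1, false) == 2000000);
  // ... a Flow == 0 jump costs NumBlocks BaseDistance units ...
  assert(jumpDistance(Entry, NB, 0, false) == 100000000);
  // ... and an unlikely jump dwarfs them all.
  assert(jumpDistance(Entry, NB, 0, true) == (uint64_t(1) << 30));
  return 0;
}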
diff --git a/llvm/lib/Transforms/Utils/SanitizerStats.cpp b/llvm/lib/Transforms/Utils/SanitizerStats.cpp index a1313c77ed77..fd21ee4cc408 100644 --- a/llvm/lib/Transforms/Utils/SanitizerStats.cpp +++ b/llvm/lib/Transforms/Utils/SanitizerStats.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SanitizerStats.h" -#include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 5363a851fc27..401f1ee5a55d 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -22,11 +22,8 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -276,7 +273,9 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, } // If we haven't found this binop, insert it. - Instruction *BO = cast(Builder.CreateBinOp(Opcode, LHS, RHS)); + // TODO: Use the Builder, which will make CreateBinOp below fold with + // InstSimplifyFolder. + Instruction *BO = Builder.Insert(BinaryOperator::Create(Opcode, LHS, RHS)); BO->setDebugLoc(Loc); if (Flags & SCEV::FlagNUW) BO->setHasNoUnsignedWrap(); @@ -591,7 +590,9 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, if (isa(IP)) ScanLimit++; if (IP->getOpcode() == Instruction::GetElementPtr && - IP->getOperand(0) == V && IP->getOperand(1) == Idx) + IP->getOperand(0) == V && IP->getOperand(1) == Idx && + cast(&*IP)->getSourceElementType() == + Type::getInt8Ty(Ty->getContext())) return &*IP; if (IP == BlockBegin) break; } @@ -1633,7 +1634,6 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { NewS = Ext; const SCEV *V = cast(NewS)->evaluateAtIteration(IH, SE); - //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n"; // Truncate the result down to the original type, if needed. const SCEV *T = SE.getTruncateOrNoop(V, Ty); @@ -1671,154 +1671,49 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { return Builder.CreateSExt(V, Ty); } -Value *SCEVExpander::expandSMaxExpr(const SCEVNAryExpr *S) { - Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); - Type *Ty = LHS->getType(); - for (int i = S->getNumOperands()-2; i >= 0; --i) { - // In the case of mixed integer and pointer types, do the - // rest of the comparisons as integer. - Type *OpTy = S->getOperand(i)->getType(); - if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { - Ty = SE.getEffectiveSCEVType(Ty); - LHS = InsertNoopCastOfTo(LHS, Ty); - } - Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); - Value *Sel; - if (Ty->isIntegerTy()) - Sel = Builder.CreateIntrinsic(Intrinsic::smax, {Ty}, {LHS, RHS}, - /*FMFSource=*/nullptr, "smax"); - else { - Value *ICmp = Builder.CreateICmpSGT(LHS, RHS); - Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax"); - } - LHS = Sel; - } - // In the case of mixed integer and pointer types, cast the - // final result back to the pointer type. 
- if (LHS->getType() != S->getType()) - LHS = InsertNoopCastOfTo(LHS, S->getType()); - return LHS; -} - -Value *SCEVExpander::expandUMaxExpr(const SCEVNAryExpr *S) { - Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); - Type *Ty = LHS->getType(); - for (int i = S->getNumOperands()-2; i >= 0; --i) { - // In the case of mixed integer and pointer types, do the - // rest of the comparisons as integer. - Type *OpTy = S->getOperand(i)->getType(); - if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { - Ty = SE.getEffectiveSCEVType(Ty); - LHS = InsertNoopCastOfTo(LHS, Ty); - } - Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); - Value *Sel; - if (Ty->isIntegerTy()) - Sel = Builder.CreateIntrinsic(Intrinsic::umax, {Ty}, {LHS, RHS}, - /*FMFSource=*/nullptr, "umax"); - else { - Value *ICmp = Builder.CreateICmpUGT(LHS, RHS); - Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax"); - } - LHS = Sel; - } - // In the case of mixed integer and pointer types, cast the - // final result back to the pointer type. - if (LHS->getType() != S->getType()) - LHS = InsertNoopCastOfTo(LHS, S->getType()); - return LHS; -} - -Value *SCEVExpander::expandSMinExpr(const SCEVNAryExpr *S) { - Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); - Type *Ty = LHS->getType(); - for (int i = S->getNumOperands() - 2; i >= 0; --i) { - // In the case of mixed integer and pointer types, do the - // rest of the comparisons as integer. - Type *OpTy = S->getOperand(i)->getType(); - if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { - Ty = SE.getEffectiveSCEVType(Ty); - LHS = InsertNoopCastOfTo(LHS, Ty); - } - Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); - Value *Sel; - if (Ty->isIntegerTy()) - Sel = Builder.CreateIntrinsic(Intrinsic::smin, {Ty}, {LHS, RHS}, - /*FMFSource=*/nullptr, "smin"); - else { - Value *ICmp = Builder.CreateICmpSLT(LHS, RHS); - Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin"); - } - LHS = Sel; - } - // In the case of mixed integer and pointer types, cast the - // final result back to the pointer type. - if (LHS->getType() != S->getType()) - LHS = InsertNoopCastOfTo(LHS, S->getType()); - return LHS; -} - -Value *SCEVExpander::expandUMinExpr(const SCEVNAryExpr *S) { +Value *SCEVExpander::expandMinMaxExpr(const SCEVNAryExpr *S, + Intrinsic::ID IntrinID, Twine Name, + bool IsSequential) { Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); + if (IsSequential) + LHS = Builder.CreateFreeze(LHS); for (int i = S->getNumOperands() - 2; i >= 0; --i) { - // In the case of mixed integer and pointer types, do the - // rest of the comparisons as integer. - Type *OpTy = S->getOperand(i)->getType(); - if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { - Ty = SE.getEffectiveSCEVType(Ty); - LHS = InsertNoopCastOfTo(LHS, Ty); - } Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); + if (IsSequential && i != 0) + RHS = Builder.CreateFreeze(RHS); Value *Sel; if (Ty->isIntegerTy()) - Sel = Builder.CreateIntrinsic(Intrinsic::umin, {Ty}, {LHS, RHS}, - /*FMFSource=*/nullptr, "umin"); + Sel = Builder.CreateIntrinsic(IntrinID, {Ty}, {LHS, RHS}, + /*FMFSource=*/nullptr, Name); else { - Value *ICmp = Builder.CreateICmpULT(LHS, RHS); - Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin"); + Value *ICmp = + Builder.CreateICmp(MinMaxIntrinsic::getPredicate(IntrinID), LHS, RHS); + Sel = Builder.CreateSelect(ICmp, LHS, RHS, Name); } LHS = Sel; } - // In the case of mixed integer and pointer types, cast the - // final result back to the pointer type. 
- if (LHS->getType() != S->getType()) - LHS = InsertNoopCastOfTo(LHS, S->getType()); return LHS; } Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { - return expandSMaxExpr(S); + return expandMinMaxExpr(S, Intrinsic::smax, "smax"); } Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { - return expandUMaxExpr(S); + return expandMinMaxExpr(S, Intrinsic::umax, "umax"); } Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { - return expandSMinExpr(S); + return expandMinMaxExpr(S, Intrinsic::smin, "smin"); } Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { - return expandUMinExpr(S); + return expandMinMaxExpr(S, Intrinsic::umin, "umin"); } Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) { - SmallVector Ops; - for (const SCEV *Op : S->operands()) - Ops.emplace_back(expand(Op)); - - Value *SaturationPoint = - MinMaxIntrinsic::getSaturationPoint(Intrinsic::umin, S->getType()); - - SmallVector OpIsZero; - for (Value *Op : ArrayRef(Ops).drop_back()) - OpIsZero.emplace_back(Builder.CreateICmpEQ(Op, SaturationPoint)); - - Value *AnyOpIsZero = Builder.CreateLogicalOr(OpIsZero); - - Value *NaiveUMin = expandUMinExpr(S); - return Builder.CreateSelect(AnyOpIsZero, SaturationPoint, NaiveUMin); + return expandMinMaxExpr(S, Intrinsic::umin, "umin", /*IsSequential*/true); } Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, @@ -1868,35 +1763,33 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { return V; } -ScalarEvolution::ValueOffsetPair -SCEVExpander::FindValueInExprValueMap(const SCEV *S, - const Instruction *InsertPt) { - auto *Set = SE.getSCEVValues(S); +Value *SCEVExpander::FindValueInExprValueMap(const SCEV *S, + const Instruction *InsertPt) { // If the expansion is not in CanonicalMode, and the SCEV contains any // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally. - if (CanonicalMode || !SE.containsAddRecurrence(S)) { - // If S is scConstant, it may be worse to reuse an existing Value. - if (S->getSCEVType() != scConstant && Set) { - // Choose a Value from the set which dominates the InsertPt. - // InsertPt should be inside the Value's parent loop so as not to break - // the LCSSA form. - for (auto const &VOPair : *Set) { - Value *V = VOPair.first; - ConstantInt *Offset = VOPair.second; - Instruction *EntInst = dyn_cast_or_null(V); - if (!EntInst) - continue; + if (!CanonicalMode && SE.containsAddRecurrence(S)) + return nullptr; - assert(EntInst->getFunction() == InsertPt->getFunction()); - if (S->getType() == V->getType() && - SE.DT.dominates(EntInst, InsertPt) && - (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || - SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) - return {V, Offset}; - } - } + // If S is a constant, it may be worse to reuse an existing Value. + if (isa(S)) + return nullptr; + + // Choose a Value from the set which dominates the InsertPt. + // InsertPt should be inside the Value's parent loop so as not to break + // the LCSSA form. 
+ for (Value *V : SE.getSCEVValues(S)) { + Instruction *EntInst = dyn_cast(V); + if (!EntInst) + continue; + + assert(EntInst->getFunction() == InsertPt->getFunction()); + if (S->getType() == V->getType() && + SE.DT.dominates(EntInst, InsertPt) && + (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || + SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) + return V; } - return {nullptr, nullptr}; + return nullptr; } // The expansion of SCEV will either reuse a previous Value in ExprValueMap, @@ -1965,9 +1858,7 @@ Value *SCEVExpander::expand(const SCEV *S) { Builder.SetInsertPoint(InsertPt); // Expand the expression into instructions. - ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt); - Value *V = VO.first; - + Value *V = FindValueInExprValueMap(S, InsertPt); if (!V) V = visit(S); else { @@ -1978,21 +1869,6 @@ Value *SCEVExpander::expand(const SCEV *S) { if (auto *I = dyn_cast(V)) if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)) I->dropPoisonGeneratingFlags(); - - if (VO.second) { - if (PointerType *Vty = dyn_cast(V->getType())) { - int64_t Offset = VO.second->getSExtValue(); - ConstantInt *Idx = - ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); - } else { - V = Builder.CreateSub(V, VO.second); - } - } } // Remember the expanded value for this SCEV at this location. // @@ -2058,7 +1934,7 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, // so narrow phis can reuse them. for (PHINode *Phi : Phis) { auto SimplifyPHINode = [&](PHINode *PN) -> Value * { - if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC})) + if (Value *V = simplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC})) return V; if (!SE.isSCEVable(PN->getType())) return nullptr; @@ -2174,9 +2050,9 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, return NumElim; } -Optional -SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, - Loop *L) { +Value *SCEVExpander::getRelatedExistingExpansion(const SCEV *S, + const Instruction *At, + Loop *L) { using namespace llvm::PatternMatch; SmallVector ExitingBlocks; @@ -2193,25 +2069,17 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, continue; if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At)) - return ScalarEvolution::ValueOffsetPair(LHS, nullptr); + return LHS; if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At)) - return ScalarEvolution::ValueOffsetPair(RHS, nullptr); + return RHS; } // Use expand's logic which is used for reusing a previous Value in // ExprValueMap. Note that we don't currently model the cost of // needing to drop poison generating flags on the instruction if we // want to reuse it. We effectively assume that has zero cost. - ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At); - if (VO.first) - return VO; - - // There is potential to make this significantly smarter, but this simple - // heuristic already gets some interesting cases. - - // Can not find suitable value. 
- return None; + return FindValueInExprValueMap(S, At); } template static InstructionCost costAndCollectOperands( @@ -2469,8 +2337,8 @@ Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred, switch (Pred->getKind()) { case SCEVPredicate::P_Union: return expandUnionPredicate(cast(Pred), IP); - case SCEVPredicate::P_Equal: - return expandEqualPredicate(cast(Pred), IP); + case SCEVPredicate::P_Compare: + return expandComparePredicate(cast(Pred), IP); case SCEVPredicate::P_Wrap: { auto *AddRecPred = cast(Pred); return expandWrapPredicate(AddRecPred, IP); @@ -2479,15 +2347,16 @@ Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred, llvm_unreachable("Unknown SCEV predicate type"); } -Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred, - Instruction *IP) { +Value *SCEVExpander::expandComparePredicate(const SCEVComparePredicate *Pred, + Instruction *IP) { Value *Expr0 = expandCodeForImpl(Pred->getLHS(), Pred->getLHS()->getType(), IP, false); Value *Expr1 = expandCodeForImpl(Pred->getRHS(), Pred->getRHS()->getType(), IP, false); Builder.SetInsertPoint(IP); - auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check"); + auto InvPred = ICmpInst::getInversePredicate(Pred->getPredicate()); + auto *I = Builder.CreateICmp(InvPred, Expr0, Expr1, "ident.check"); return I; } @@ -2496,7 +2365,8 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, assert(AR->isAffine() && "Cannot generate RT check for " "non-affine expression"); - SCEVUnionPredicate Pred; + // FIXME: It is highly suspicious that we're ignoring the predicates here. + SmallVector Pred; const SCEV *ExitCount = SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred); @@ -2710,10 +2580,10 @@ namespace { struct SCEVFindUnsafe { ScalarEvolution &SE; bool CanonicalMode; - bool IsUnsafe; + bool IsUnsafe = false; SCEVFindUnsafe(ScalarEvolution &SE, bool CanonicalMode) - : SE(SE), CanonicalMode(CanonicalMode), IsUnsafe(false) {} + : SE(SE), CanonicalMode(CanonicalMode) {} bool follow(const SCEV *S) { if (const SCEVUDivExpr *D = dyn_cast(S)) { diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 335ac03ccb52..567b866f7777 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -27,7 +27,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemorySSA.h" @@ -50,7 +50,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" @@ -58,7 +57,6 @@ #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -74,7 +72,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -94,8 +91,8 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" cl::opt llvm::RequireAndPreserveDomTree( - 
"simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::ZeroOrMore, - cl::init(false), + "simplifycfg-require-and-preserve-domtree", cl::Hidden, + cl::desc("Temorary development switch used to gradually uplift SimplifyCFG " "into preserving DomTree,")); @@ -167,6 +164,14 @@ static cl::opt BranchFoldToCommonDestVectorMultiplier( "to fold branch to common destination when vector operations are " "present")); +static cl::opt EnableMergeCompatibleInvokes( + "simplifycfg-merge-compatible-invokes", cl::Hidden, cl::init(true), + cl::desc("Allow SimplifyCFG to merge invokes together when appropriate")); + +static cl::opt MaxSwitchCasesPerResult( + "max-switch-cases-per-result", cl::Hidden, cl::init(16), + cl::desc("Limit cases to analyze when converting a switch to select")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -192,6 +197,8 @@ STATISTIC(NumSinkCommonInstrs, STATISTIC(NumSpeculations, "Number of speculative executed instructions"); STATISTIC(NumInvokes, "Number of invokes with empty resume blocks simplified into calls"); +STATISTIC(NumInvokesMerged, "Number of invokes that were merged together"); +STATISTIC(NumInvokeSetsFormed, "Number of invoke sets that were formed"); namespace { @@ -291,6 +298,34 @@ public: } // end anonymous namespace +/// Return true if all the PHI nodes in the basic block \p BB +/// receive compatible (identical) incoming values when coming from +/// all of the predecessor blocks that are specified in \p IncomingBlocks. +/// +/// Note that if the values aren't exactly identical, but \p EquivalenceSet +/// is provided, and *both* of the values are present in the set, +/// then they are considered equal. +static bool IncomingValuesAreCompatible( + BasicBlock *BB, ArrayRef IncomingBlocks, + SmallPtrSetImpl *EquivalenceSet = nullptr) { + assert(IncomingBlocks.size() == 2 && + "Only for a pair of incoming blocks at the time!"); + + // FIXME: it is okay if one of the incoming values is an `undef` value, + // iff the other incoming value is guaranteed to be a non-poison value. + // FIXME: it is okay if one of the incoming values is a `poison` value. + return all_of(BB->phis(), [IncomingBlocks, EquivalenceSet](PHINode &PN) { + Value *IV0 = PN.getIncomingValueForBlock(IncomingBlocks[0]); + Value *IV1 = PN.getIncomingValueForBlock(IncomingBlocks[1]); + if (IV0 == IV1) + return true; + if (EquivalenceSet && EquivalenceSet->contains(IV0) && + EquivalenceSet->contains(IV1)) + return true; + return false; + }); +} + /// Return true if it is safe to merge these two /// terminator instructions together. 
static bool @@ -307,17 +342,17 @@ SafeToMergeTerminators(Instruction *SI1, Instruction *SI2, SmallPtrSet SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); bool Fail = false; - for (BasicBlock *Succ : successors(SI2BB)) - if (SI1Succs.count(Succ)) - for (BasicBlock::iterator BBI = Succ->begin(); isa(BBI); ++BBI) { - PHINode *PN = cast(BBI); - if (PN->getIncomingValueForBlock(SI1BB) != - PN->getIncomingValueForBlock(SI2BB)) { - if (FailBlocks) - FailBlocks->insert(Succ); - Fail = true; - } - } + for (BasicBlock *Succ : successors(SI2BB)) { + if (!SI1Succs.count(Succ)) + continue; + if (IncomingValuesAreCompatible(Succ, {SI1BB, SI2BB})) + continue; + Fail = true; + if (FailBlocks) + FailBlocks->insert(Succ); + else + break; + } return !Fail; } @@ -347,6 +382,13 @@ static InstructionCost computeSpeculationCost(const User *I, return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency); } +/// Check whether this is a potentially trapping constant. +static bool canTrap(const Value *V) { + if (auto *C = dyn_cast(V)) + return C->canTrap(); + return false; +} + /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -381,10 +423,7 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, if (!I) { // Non-instructions all dominate instructions, but not all constantexprs // can be executed unconditionally. - if (ConstantExpr *C = dyn_cast(V)) - if (C->canTrap()) - return false; - return true; + return !canTrap(V); } BasicBlock *PBB = I->getParent(); @@ -1459,7 +1498,7 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, return false; if (!I1NonDbg->isTerminator()) return false; - // Now we know that we only need to hoist debug instrinsics and the + // Now we know that we only need to hoist debug intrinsics and the // terminator. Let the loop below handle those 2 cases. } @@ -2212,6 +2251,320 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB, return Changed; } +namespace { + +struct CompatibleSets { + using SetTy = SmallVector; + + SmallVector Sets; + + static bool shouldBelongToSameSet(ArrayRef Invokes); + + SetTy &getCompatibleSet(InvokeInst *II); + + void insert(InvokeInst *II); +}; + +CompatibleSets::SetTy &CompatibleSets::getCompatibleSet(InvokeInst *II) { + // Perform a linear scan over all the existing sets, see if the new `invoke` + // is compatible with any particular set. Since we know that all the `invokes` + // within a set are compatible, only check the first `invoke` in each set. + // WARNING: at worst, this has quadratic complexity. + for (CompatibleSets::SetTy &Set : Sets) { + if (CompatibleSets::shouldBelongToSameSet({Set.front(), II})) + return Set; + } + + // Otherwise, we either had no sets yet, or this invoke forms a new set. + return Sets.emplace_back(); +} + +void CompatibleSets::insert(InvokeInst *II) { + getCompatibleSet(II).emplace_back(II); +} + +bool CompatibleSets::shouldBelongToSameSet(ArrayRef Invokes) { + assert(Invokes.size() == 2 && "Always called with exactly two candidates."); + + // Can we theoretically merge these `invoke`s? + auto IsIllegalToMerge = [](InvokeInst *II) { + return II->cannotMerge() || II->isInlineAsm(); + }; + if (any_of(Invokes, IsIllegalToMerge)) + return false; + + // Either both `invoke`s must be direct, + // or both `invoke`s must be indirect. 
+ auto IsIndirectCall = [](InvokeInst *II) { return II->isIndirectCall(); }; + bool HaveIndirectCalls = any_of(Invokes, IsIndirectCall); + bool AllCallsAreIndirect = all_of(Invokes, IsIndirectCall); + if (HaveIndirectCalls) { + if (!AllCallsAreIndirect) + return false; + } else { + // All callees must be identical. + Value *Callee = nullptr; + for (InvokeInst *II : Invokes) { + Value *CurrCallee = II->getCalledOperand(); + assert(CurrCallee && "There is always a called operand."); + if (!Callee) + Callee = CurrCallee; + else if (Callee != CurrCallee) + return false; + } + } + + // Either both `invoke`s must not have a normal destination, + // or both `invoke`s must have a normal destination, + auto HasNormalDest = [](InvokeInst *II) { + return !isa(II->getNormalDest()->getFirstNonPHIOrDbg()); + }; + if (any_of(Invokes, HasNormalDest)) { + // Do not merge `invoke` that does not have a normal destination with one + // that does have a normal destination, even though doing so would be legal. + if (!all_of(Invokes, HasNormalDest)) + return false; + + // All normal destinations must be identical. + BasicBlock *NormalBB = nullptr; + for (InvokeInst *II : Invokes) { + BasicBlock *CurrNormalBB = II->getNormalDest(); + assert(CurrNormalBB && "There is always a 'continue to' basic block."); + if (!NormalBB) + NormalBB = CurrNormalBB; + else if (NormalBB != CurrNormalBB) + return false; + } + + // In the normal destination, the incoming values for these two `invoke`s + // must be compatible. + SmallPtrSet EquivalenceSet(Invokes.begin(), Invokes.end()); + if (!IncomingValuesAreCompatible( + NormalBB, {Invokes[0]->getParent(), Invokes[1]->getParent()}, + &EquivalenceSet)) + return false; + } + +#ifndef NDEBUG + // All unwind destinations must be identical. + // We know that because we have started from said unwind destination. + BasicBlock *UnwindBB = nullptr; + for (InvokeInst *II : Invokes) { + BasicBlock *CurrUnwindBB = II->getUnwindDest(); + assert(CurrUnwindBB && "There is always an 'unwind to' basic block."); + if (!UnwindBB) + UnwindBB = CurrUnwindBB; + else + assert(UnwindBB == CurrUnwindBB && "Unexpected unwind destination."); + } +#endif + + // In the unwind destination, the incoming values for these two `invoke`s + // must be compatible. + if (!IncomingValuesAreCompatible( + Invokes.front()->getUnwindDest(), + {Invokes[0]->getParent(), Invokes[1]->getParent()})) + return false; + + // Ignoring arguments, these `invoke`s must be identical, + // including operand bundles. + const InvokeInst *II0 = Invokes.front(); + for (auto *II : Invokes.drop_front()) + if (!II->isSameOperationAs(II0)) + return false; + + // Can we theoretically form the data operands for the merged `invoke`? + auto IsIllegalToMergeArguments = [](auto Ops) { + Type *Ty = std::get<0>(Ops)->getType(); + assert(Ty == std::get<1>(Ops)->getType() && "Incompatible types?"); + return Ty->isTokenTy() && std::get<0>(Ops) != std::get<1>(Ops); + }; + assert(Invokes.size() == 2 && "Always called with exactly two candidates."); + if (any_of(zip(Invokes[0]->data_ops(), Invokes[1]->data_ops()), + IsIllegalToMergeArguments)) + return false; + + return true; +} + +} // namespace + +// Merge all invokes in the provided set, all of which are compatible +// as per the `CompatibleSets::shouldBelongToSameSet()`. 
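[Editorial note: before the full implementation that follows, here is a self-contained sketch of the CompatibleSets grouping pattern described above; it is not part of the patch. The helper groupCompatible and the parity predicate are invented for the example; the real code groups InvokeInst pointers using shouldBelongToSameSet(), checking only the first member of each set because compatibility is assumed transitive.]

#include <functional>
#include <vector>

// Group items into sets of mutually "compatible" elements using the same
// linear scan (quadratic in the worst case) as CompatibleSets::getCompatibleSet().
template <typename T>
std::vector<std::vector<T>>
groupCompatible(const std::vector<T> &Items,
                const std::function<bool(const T &, const T &)> &Compatible) {
  std::vector<std::vector<T>> Sets;
  for (const T &Item : Items) {
    bool Placed = false;
    for (auto &Set : Sets) {
      // Checking the first member of the set is enough, exactly as in
      // getCompatibleSet(), because all members of a set are compatible.
      if (Compatible(Set.front(), Item)) {
        Set.push_back(Item);
        Placed = true;
        break;
      }
    }
    // Otherwise this item starts a new set of its own.
    if (!Placed)
      Sets.push_back({Item});
  }
  return Sets;
}

int main() {
  // Group integers by parity: {1, 3, 5} and {2, 4} come out as two sets.
  auto Sets = groupCompatible<int>(
      {1, 2, 3, 4, 5},
      [](const int &A, const int &B) { return A % 2 == B % 2; });
  return Sets.size() == 2 ? 0 : 1;
}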
+static void MergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
+                                       DomTreeUpdater *DTU) {
+  assert(Invokes.size() >= 2 && "Must have at least two invokes to merge.");
+
+  SmallVector<DominatorTree::UpdateType, 8> Updates;
+  if (DTU)
+    Updates.reserve(2 + 3 * Invokes.size());
+
+  bool HasNormalDest =
+      !isa<UnreachableInst>(Invokes[0]->getNormalDest()->getFirstNonPHIOrDbg());
+
+  // Clone one of the invokes into a new basic block.
+  // Since they are all compatible, it doesn't matter which invoke is cloned.
+  InvokeInst *MergedInvoke = [&Invokes, HasNormalDest]() {
+    InvokeInst *II0 = Invokes.front();
+    BasicBlock *II0BB = II0->getParent();
+    BasicBlock *InsertBeforeBlock =
+        II0->getParent()->getIterator()->getNextNode();
+    Function *Func = II0BB->getParent();
+    LLVMContext &Ctx = II0->getContext();
+
+    BasicBlock *MergedInvokeBB = BasicBlock::Create(
+        Ctx, II0BB->getName() + ".invoke", Func, InsertBeforeBlock);
+
+    auto *MergedInvoke = cast<InvokeInst>(II0->clone());
+    // NOTE: all invokes have the same attributes, so no handling needed.
+    MergedInvokeBB->getInstList().push_back(MergedInvoke);
+
+    if (!HasNormalDest) {
+      // This set does not have a normal destination,
+      // so just form a new block with unreachable terminator.
+      BasicBlock *MergedNormalDest = BasicBlock::Create(
+          Ctx, II0BB->getName() + ".cont", Func, InsertBeforeBlock);
+      new UnreachableInst(Ctx, MergedNormalDest);
+      MergedInvoke->setNormalDest(MergedNormalDest);
+    }
+
+    // The unwind destination, however, remains identical for all invokes here.
+
+    return MergedInvoke;
+  }();
+
+  if (DTU) {
+    // Predecessor blocks that contained these invokes will now branch to
+    // the new block that contains the merged invoke, ...
+    for (InvokeInst *II : Invokes)
+      Updates.push_back(
+          {DominatorTree::Insert, II->getParent(), MergedInvoke->getParent()});
+
+    // ... which has the new `unreachable` block as normal destination,
+    // or unwinds to the (same for all `invoke`s in this set) `landingpad`,
+    for (BasicBlock *SuccBBOfMergedInvoke : successors(MergedInvoke))
+      Updates.push_back({DominatorTree::Insert, MergedInvoke->getParent(),
+                         SuccBBOfMergedInvoke});
+
+    // Since predecessor blocks now unconditionally branch to a new block,
+    // they no longer branch to their original successors.
+    for (InvokeInst *II : Invokes)
+      for (BasicBlock *SuccOfPredBB : successors(II->getParent()))
+        Updates.push_back(
+            {DominatorTree::Delete, II->getParent(), SuccOfPredBB});
+  }
+
+  bool IsIndirectCall = Invokes[0]->isIndirectCall();
+
+  // Form the merged operands for the merged invoke.
+  for (Use &U : MergedInvoke->operands()) {
+    // Only PHI together the indirect callees and data operands.
+    if (MergedInvoke->isCallee(&U)) {
+      if (!IsIndirectCall)
+        continue;
+    } else if (!MergedInvoke->isDataOperand(&U))
+      continue;
+
+    // Don't create trivial PHI's with all-identical incoming values.
+    bool NeedPHI = any_of(Invokes, [&U](InvokeInst *II) {
+      return II->getOperand(U.getOperandNo()) != U.get();
+    });
+    if (!NeedPHI)
+      continue;
+
+    // Form a PHI out of all the data ops under this index.
+    PHINode *PN = PHINode::Create(
+        U->getType(), /*NumReservedValues=*/Invokes.size(), "", MergedInvoke);
+    for (InvokeInst *II : Invokes)
+      PN->addIncoming(II->getOperand(U.getOperandNo()), II->getParent());
+
+    U.set(PN);
+  }
+
+  // We've ensured that each PHI node has compatible (identical) incoming values
+  // when coming from each of the `invoke`s in the current merge set,
+  // so update the PHI nodes accordingly.
+  for (BasicBlock *Succ : successors(MergedInvoke))
+    AddPredecessorToBlock(Succ, /*NewPred=*/MergedInvoke->getParent(),
+                          /*ExistPred=*/Invokes.front()->getParent());
+
+  // And finally, replace the original `invoke`s with an unconditional branch
+  // to the block with the merged `invoke`. Also, give that merged `invoke`
+  // the merged debugloc of all the original `invoke`s.
+  const DILocation *MergedDebugLoc = nullptr;
+  for (InvokeInst *II : Invokes) {
+    // Compute the debug location common to all the original `invoke`s.
+    if (!MergedDebugLoc)
+      MergedDebugLoc = II->getDebugLoc();
+    else
+      MergedDebugLoc =
+          DILocation::getMergedLocation(MergedDebugLoc, II->getDebugLoc());
+
+    // And replace the old `invoke` with an unconditional branch
+    // to the block with the merged `invoke`.
+    for (BasicBlock *OrigSuccBB : successors(II->getParent()))
+      OrigSuccBB->removePredecessor(II->getParent());
+    BranchInst::Create(MergedInvoke->getParent(), II->getParent());
+    II->replaceAllUsesWith(MergedInvoke);
+    II->eraseFromParent();
+    ++NumInvokesMerged;
+  }
+  MergedInvoke->setDebugLoc(MergedDebugLoc);
+  ++NumInvokeSetsFormed;
+
+  if (DTU)
+    DTU->applyUpdates(Updates);
+}
+
+/// If this block is a `landingpad` exception handling block, categorize all
+/// the predecessor `invoke`s into sets, with all `invoke`s in each set
+/// being "mergeable" together, and then merge invokes in each set together.
+///
+/// This is a weird mix of hoisting and sinking. Visually, it goes from:
+///           [...]                [...]
+///             |                    |
+///        [invoke0]            [invoke1]
+///           / \                  / \
+///     [cont0] [landingpad] [cont1]
+/// to:
+///      [...] [...]
+///          \ /
+///       [invoke]
+///          / \
+///     [cont] [landingpad]
+///
+/// But of course we can only do that if the invokes share the `landingpad`,
+/// edges invoke0->cont0 and invoke1->cont1 are "compatible",
+/// and the invoked functions are "compatible".
+static bool MergeCompatibleInvokes(BasicBlock *BB, DomTreeUpdater *DTU) {
+  if (!EnableMergeCompatibleInvokes)
+    return false;
+
+  bool Changed = false;
+
+  // FIXME: generalize to all exception handling blocks?
+  if (!BB->isLandingPad())
+    return Changed;
+
+  CompatibleSets Grouper;
+
+  // Record all the predecessors of this `landingpad`. As per verifier,
+  // the only allowed predecessor is the unwind edge of an `invoke`.
+  // We want to group "compatible" `invokes` into the same set to be merged.
+  for (BasicBlock *PredBB : predecessors(BB))
+    Grouper.insert(cast<InvokeInst>(PredBB->getTerminator()));
+
+  // And now, merge `invoke`s that were grouped together.
+  for (ArrayRef<InvokeInst *> Invokes : Grouper.Sets) {
+    if (Invokes.size() < 2)
+      continue;
+    Changed = true;
+    MergeCompatibleInvokesImpl(Invokes, DTU);
+  }
+
+  return Changed;
+}
+
 /// Determine if we can hoist or sink a sole store instruction out of a
 /// conditional block.
 ///
@@ -2326,15 +2679,15 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
         passingValueIsAlwaysUndefined(ThenV, &PN))
       return false;

+    if (canTrap(OrigV) || canTrap(ThenV))
+      return false;
+
     HaveRewritablePHIs = true;
     ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV);
     ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV);
     if (!OrigCE && !ThenCE)
-      continue; // Known safe and cheap.
+      continue; // Known cheap (FIXME: Maybe not true for aggregates).

-    if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
-        (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
-      return false;
     InstructionCost OrigCost = OrigCE ? computeSpeculationCost(OrigCE, TTI) : 0;
     InstructionCost ThenCost = ThenCE ?
computeSpeculationCost(ThenCE, TTI) : 0; InstructionCost MaxCost = @@ -2626,40 +2979,85 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { return true; } -/// If we have a conditional branch on a PHI node value that is defined in the -/// same block as the branch and if any PHI entries are constants, thread edges -/// corresponding to that entry to be branches to their ultimate destination. -static Optional FoldCondBranchOnPHIImpl(BranchInst *BI, - DomTreeUpdater *DTU, - const DataLayout &DL, - AssumptionCache *AC) { +static ConstantInt * +getKnownValueOnEdge(Value *V, BasicBlock *From, BasicBlock *To, + SmallDenseMap, + ConstantInt *> &Visited) { + // Don't look past the block defining the value, we might get the value from + // a previous loop iteration. + auto *I = dyn_cast(V); + if (I && I->getParent() == To) + return nullptr; + + // We know the value if the From block branches on it. + auto *BI = dyn_cast(From->getTerminator()); + if (BI && BI->isConditional() && BI->getCondition() == V && + BI->getSuccessor(0) != BI->getSuccessor(1)) + return BI->getSuccessor(0) == To ? ConstantInt::getTrue(BI->getContext()) + : ConstantInt::getFalse(BI->getContext()); + + // Limit the amount of blocks we inspect. + if (Visited.size() >= 8) + return nullptr; + + auto Pair = Visited.try_emplace({From, To}, nullptr); + if (!Pair.second) + return Pair.first->second; + + // Check whether the known value is the same for all predecessors. + ConstantInt *Common = nullptr; + for (BasicBlock *Pred : predecessors(From)) { + ConstantInt *C = getKnownValueOnEdge(V, Pred, From, Visited); + if (!C || (Common && Common != C)) + return nullptr; + Common = C; + } + return Visited[{From, To}] = Common; +} + +/// If we have a conditional branch on something for which we know the constant +/// value in predecessors (e.g. a phi node in the current block), thread edges +/// from the predecessor to their ultimate destination. +static Optional +FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, + const DataLayout &DL, + AssumptionCache *AC) { + SmallMapVector KnownValues; BasicBlock *BB = BI->getParent(); - PHINode *PN = dyn_cast(BI->getCondition()); - // NOTE: we currently cannot transform this case if the PHI node is used - // outside of the block. - if (!PN || PN->getParent() != BB || !PN->hasOneUse()) - return false; + Value *Cond = BI->getCondition(); + PHINode *PN = dyn_cast(Cond); + if (PN && PN->getParent() == BB) { + // Degenerate case of a single entry PHI. + if (PN->getNumIncomingValues() == 1) { + FoldSingleEntryPHINodes(PN->getParent()); + return true; + } - // Degenerate case of a single entry PHI. - if (PN->getNumIncomingValues() == 1) { - FoldSingleEntryPHINodes(PN->getParent()); - return true; + for (Use &U : PN->incoming_values()) + if (auto *CB = dyn_cast(U)) + KnownValues.insert({PN->getIncomingBlock(U), CB}); + } else { + SmallDenseMap, ConstantInt *> Visited; + for (BasicBlock *Pred : predecessors(BB)) { + if (ConstantInt *CB = getKnownValueOnEdge(Cond, Pred, BB, Visited)) + KnownValues.insert({Pred, CB}); + } } + if (KnownValues.empty()) + return false; + // Now we know that this block has multiple preds and two succs. + // Check that the block is small enough and values defined in the block are + // not used outside of it. if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false; - // Okay, this is a simple enough basic block. See if any phi values are - // constants. 
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - ConstantInt *CB = dyn_cast(PN->getIncomingValue(i)); - if (!CB || !CB->getType()->isIntegerTy(1)) - continue; - + for (const auto &Pair : KnownValues) { // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. - BasicBlock *PredBB = PN->getIncomingBlock(i); + ConstantInt *CB = Pair.second; + BasicBlock *PredBB = Pair.first; BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); if (RealDest == BB) @@ -2690,6 +3088,7 @@ static Optional FoldCondBranchOnPHIImpl(BranchInst *BI, // cloned instructions outside of EdgeBB. BasicBlock::iterator InsertPt = EdgeBB->begin(); DenseMap TranslateMap; // Track translated values. + TranslateMap[Cond] = Pair.second; for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (PHINode *PN = dyn_cast(BBI)) { TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB); @@ -2708,7 +3107,7 @@ static Optional FoldCondBranchOnPHIImpl(BranchInst *BI, } // Check for trivial simplification. - if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) { + if (Value *V = simplifyInstruction(N, {DL, nullptr, nullptr, AC})) { if (!BBI->use_empty()) TranslateMap[&*BBI] = V; if (!N->mayHaveSideEffects()) { @@ -2746,6 +3145,12 @@ static Optional FoldCondBranchOnPHIImpl(BranchInst *BI, DTU->applyUpdates(Updates); } + // For simplicity, we created a separate basic block for the edge. Merge + // it back into the predecessor if possible. This not only avoids + // unnecessary SimplifyCFG iterations, but also makes sure that we don't + // bypass the check for trivial cycles above. + MergeBlockIntoPredecessor(EdgeBB, DTU); + // Signal repeat, simplifying any other constants. return None; } @@ -2753,13 +3158,15 @@ static Optional FoldCondBranchOnPHIImpl(BranchInst *BI, return false; } -static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU, - const DataLayout &DL, AssumptionCache *AC) { +static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI, + DomTreeUpdater *DTU, + const DataLayout &DL, + AssumptionCache *AC) { Optional Result; bool EverChanged = false; do { // Note that None means "we changed things, but recurse further." - Result = FoldCondBranchOnPHIImpl(BI, DTU, DL, AC); + Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC); EverChanged |= Result == None || *Result; } while (Result == None); return EverChanged; @@ -2847,7 +3254,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, bool Changed = false; for (BasicBlock::iterator II = BB->begin(); isa(II);) { PHINode *PN = cast(II++); - if (Value *V = SimplifyInstruction(PN, {DL, PN})) { + if (Value *V = simplifyInstruction(PN, {DL, PN})) { PN->replaceAllUsesWith(V); PN->eraseFromParent(); Changed = true; @@ -3186,18 +3593,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, Instruction *Cond = dyn_cast(BI->getCondition()); - if (!Cond || (!isa(Cond) && !isa(Cond)) || + if (!Cond || + (!isa(Cond) && !isa(Cond) && + !isa(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) return false; // Cond is known to be a compare or binary operator. Check to make sure that // neither operand is a potentially-trapping constant expression. 
- if (ConstantExpr *CE = dyn_cast(Cond->getOperand(0))) - if (CE->canTrap()) - return false; - if (ConstantExpr *CE = dyn_cast(Cond->getOperand(1))) - if (CE->canTrap()) - return false; + if (canTrap(Cond->getOperand(0))) + return false; + if (canTrap(Cond->getOperand(1))) + return false; // Finally, don't infinitely unroll conditional loops. if (is_contained(successors(BB), BB)) @@ -3384,7 +3791,9 @@ static bool mergeConditionalStoreToAddress( return false; // Now check the stores are compatible. - if (!QStore->isUnordered() || !PStore->isUnordered()) + if (!QStore->isUnordered() || !PStore->isUnordered() || + PStore->getValueOperand()->getType() != + QStore->getValueOperand()->getType()) return false; // Check that sinking the store won't cause program behavior changes. Sinking @@ -3687,7 +4096,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (PBI->getCondition() == BI->getCondition() && PBI->getSuccessor(0) != PBI->getSuccessor(1)) { // Okay, the outcome of this conditional branch is statically - // knowable. If this block had a single pred, handle specially. + // knowable. If this block had a single pred, handle specially, otherwise + // FoldCondBranchOnValueKnownInPredecessor() will handle it. if (BB->getSinglePredecessor()) { // Turn this into a branch on constant. bool CondIsTrue = PBI->getSuccessor(0) == BB; @@ -3695,35 +4105,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue)); return true; // Nuke the branch on constant. } - - // Otherwise, if there are multiple predecessors, insert a PHI that merges - // in the constant and simplify the block result. Subsequent passes of - // simplifycfg will thread the block. - if (BlockIsSimpleEnoughToThreadThrough(BB)) { - pred_iterator PB = pred_begin(BB), PE = pred_end(BB); - PHINode *NewPN = PHINode::Create( - Type::getInt1Ty(BB->getContext()), std::distance(PB, PE), - BI->getCondition()->getName() + ".pr", &BB->front()); - // Okay, we're going to insert the PHI node. Since PBI is not the only - // predecessor, compute the PHI'd conditional value for all of the preds. - // Any predecessor where the condition is not computable we keep symbolic. 
-    for (pred_iterator PI = PB; PI != PE; ++PI) {
-      BasicBlock *P = *PI;
-      if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
-          PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
-          PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
-        bool CondIsTrue = PBI->getSuccessor(0) == BB;
-        NewPN->addIncoming(
-            ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
-            P);
-      } else {
-        NewPN->addIncoming(BI->getCondition(), P);
-      }
-    }
-
-    BI->setCondition(NewPN);
-    return true;
-  }
 }

 // If the previous block ended with a widenable branch, determine if reusing
@@ -3732,9 +4113,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
   if (tryWidenCondBranchToCondBranch(PBI, BI, DTU))
     return true;

-  if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
-    if (CE->canTrap())
-      return false;
+  if (canTrap(BI->getCondition()))
+    return false;

   // If both branches are conditional and both contain stores to the same
   // address, remove the stores from the conditionals and create a conditional
@@ -3791,15 +4171,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
     PHINode *PN = cast<PHINode>(II);
     Value *BIV = PN->getIncomingValueForBlock(BB);
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
-      if (CE->canTrap())
-        return false;
+    if (canTrap(BIV))
+      return false;

     unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
     Value *PBIV = PN->getIncomingValue(PBBIdx);
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
-      if (CE->canTrap())
-        return false;
+    if (canTrap(PBIV))
+      return false;
   }

   // Finally, if everything is ok, fold the branches to logical ops.
@@ -4116,7 +4494,7 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
   assert(VVal && "Should have a unique destination value");
   ICI->setOperand(0, VVal);

-  if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
+  if (Value *V = simplifyInstruction(ICI, {DL, ICI})) {
     ICI->replaceAllUsesWith(V);
     ICI->eraseFromParent();
   }
@@ -4812,8 +5190,9 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
   }
 }

-/// Turn a switch with two reachable destinations into an integer range
-/// comparison and branch.
+/// Turn a switch into an integer range comparison and branch.
+/// Switches with more than 2 destinations are ignored.
+/// Switches with 1 destination are also ignored.
 bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
                                              IRBuilder<> &Builder) {
   assert(SI->getNumCases() > 1 && "Degenerate switch?");
@@ -4845,6 +5224,8 @@ bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
     }
     return false; // More than two destinations.
   }
+  if (!DestB)
+    return false; // All destinations are the same and the default is unreachable

   assert(DestA && DestB &&
          "Single-destination switch should have been folded.");
@@ -5169,11 +5550,6 @@ ConstantFold(Instruction *I, const DataLayout &DL,
       return nullptr;
   }

-  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
-    return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
-                                           COps[1], DL);
-  }
-
   return ConstantFoldInstOperands(I, COps, DL);
 }

@@ -5182,7 +5558,7 @@ ConstantFold(Instruction *I, const DataLayout &DL,
 /// destinations CaseDest corresponding to value CaseVal (0 for the default
 /// case), of a switch instruction SI.
static bool -GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, +getCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, BasicBlock **CommonDest, SmallVectorImpl> &Res, const DataLayout &DL, const TargetTransformInfo &TTI) { @@ -5253,9 +5629,9 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, // Helper function used to add CaseVal to the list of cases that generate // Result. Returns the updated number of cases that generate this result. -static uintptr_t MapCaseToResult(ConstantInt *CaseVal, - SwitchCaseResultVectorTy &UniqueResults, - Constant *Result) { +static size_t mapCaseToResult(ConstantInt *CaseVal, + SwitchCaseResultVectorTy &UniqueResults, + Constant *Result) { for (auto &I : UniqueResults) { if (I.first == Result) { I.second.push_back(CaseVal); @@ -5271,18 +5647,19 @@ static uintptr_t MapCaseToResult(ConstantInt *CaseVal, // results for the PHI node of the common destination block for a switch // instruction. Returns false if multiple PHI nodes have been found or if // there is not a common destination block for the switch. -static bool -InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest, - SwitchCaseResultVectorTy &UniqueResults, - Constant *&DefaultResult, const DataLayout &DL, - const TargetTransformInfo &TTI, - uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) { +static bool initializeUniqueCases(SwitchInst *SI, PHINode *&PHI, + BasicBlock *&CommonDest, + SwitchCaseResultVectorTy &UniqueResults, + Constant *&DefaultResult, + const DataLayout &DL, + const TargetTransformInfo &TTI, + uintptr_t MaxUniqueResults) { for (auto &I : SI->cases()) { ConstantInt *CaseVal = I.getCaseValue(); // Resulting value at phi nodes for this case value. SwitchCaseResultsTy Results; - if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results, + if (!getCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results, DL, TTI)) return false; @@ -5291,11 +5668,11 @@ InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest, return false; // Add the case->result mapping to UniqueResults. - const uintptr_t NumCasesForResult = - MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second); + const size_t NumCasesForResult = + mapCaseToResult(CaseVal, UniqueResults, Results.begin()->second); // Early out if there are too many cases for this result. - if (NumCasesForResult > MaxCasesPerResult) + if (NumCasesForResult > MaxSwitchCasesPerResult) return false; // Early out if there are too many unique results. @@ -5311,7 +5688,7 @@ InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest, // Find the default result value. SmallVector, 1> DefaultResults; BasicBlock *DefaultDest = SI->getDefaultDest(); - GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults, + getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults, DL, TTI); // If the default value is not found abort unless the default destination // is unreachable. @@ -5326,48 +5703,76 @@ InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest, // Helper function that checks if it is possible to transform a switch with only // two cases (or two cases + default) that produces a result into a select. 
-// Example:
-// switch (a) {
-//   case 10:                %0 = icmp eq i32 %a, 10
-//     return 10;            %1 = select i1 %0, i32 10, i32 4
-//   case 20:        ---->   %2 = icmp eq i32 %a, 20
-//     return 2;             %3 = select i1 %2, i32 2, i32 %1
-//   default:
-//     return 4;
-// }
-static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
-                                   Constant *DefaultResult, Value *Condition,
-                                   IRBuilder<> &Builder) {
+// TODO: Handle switches with more than 2 cases that map to the same result.
+static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
+                                 Constant *DefaultResult, Value *Condition,
+                                 IRBuilder<> &Builder) {
   // If we are selecting between only two cases transform into a simple
   // select or a two-way select if default is possible.
+  // Example:
+  // switch (a) {                  %0 = icmp eq i32 %a, 10
+  //   case 10: return 42;         %1 = select i1 %0, i32 42, i32 4
+  //   case 20: return 2;   ---->  %2 = icmp eq i32 %a, 20
+  //   default: return 4;          %3 = select i1 %2, i32 2, i32 %1
+  // }
   if (ResultVector.size() == 2 && ResultVector[0].second.size() == 1 &&
       ResultVector[1].second.size() == 1) {
-    ConstantInt *const FirstCase = ResultVector[0].second[0];
-    ConstantInt *const SecondCase = ResultVector[1].second[0];
-
-    bool DefaultCanTrigger = DefaultResult;
+    ConstantInt *FirstCase = ResultVector[0].second[0];
+    ConstantInt *SecondCase = ResultVector[1].second[0];
     Value *SelectValue = ResultVector[1].first;
-    if (DefaultCanTrigger) {
-      Value *const ValueCompare =
+    if (DefaultResult) {
+      Value *ValueCompare =
           Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
       SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
                                          DefaultResult, "switch.select");
     }
-    Value *const ValueCompare =
+    Value *ValueCompare =
         Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
     return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
                                 SelectValue, "switch.select");
   }

-  // Handle the degenerate case where two cases have the same value.
-  if (ResultVector.size() == 1 && ResultVector[0].second.size() == 2 &&
-      DefaultResult) {
-    Value *Cmp1 = Builder.CreateICmpEQ(
-        Condition, ResultVector[0].second[0], "switch.selectcmp.case1");
-    Value *Cmp2 = Builder.CreateICmpEQ(
-        Condition, ResultVector[0].second[1], "switch.selectcmp.case2");
-    Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
-    return Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+  // Handle the degenerate case where two cases have the same result value.
+  if (ResultVector.size() == 1 && DefaultResult) {
+    ArrayRef<ConstantInt *> CaseValues = ResultVector[0].second;
+    unsigned CaseCount = CaseValues.size();
+    // Cases that form an n-bit group map to the same result:
+    //   case 0,4      -> Cond & 0b1..1011 == 0 ? result : default
+    //   case 0,2,4,6  -> Cond & 0b1..1001 == 0 ? result : default
+    //   case 0,2,8,10 -> Cond & 0b1..0101 == 0 ? result : default
+    if (isPowerOf2_32(CaseCount)) {
+      ConstantInt *MinCaseVal = CaseValues[0];
+      // Find the minimal case value.
+      for (auto Case : CaseValues)
+        if (Case->getValue().slt(MinCaseVal->getValue()))
+          MinCaseVal = Case;
+
+      // Mark the bits touched by the case values.
+      APInt BitMask = APInt::getZero(MinCaseVal->getBitWidth());
+      for (auto Case : CaseValues)
+        BitMask |= (Case->getValue() - MinCaseVal->getValue());
+
+      // Check whether the cases with the same result cover all numbers
+      // in the touched bits.
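[Editorial note: a worked instance of the bitmask check above, using the hypothetical case values 0, 2, 8, 10 from the comment.]
      // With cases {0, 2, 8, 10}: CaseCount = 4, MinCaseVal = 0, and
      //   BitMask = (0-0) | (2-0) | (8-0) | (10-0) = 0b1010.
      // popcount(0b1010) = 2 == Log2_32(4), so the four cases are exactly
      // the values whose bits outside 0b1010 are clear, and the switch
      // folds to (Cond & ~0b1010) == 0 ? result : default.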
+ if (BitMask.countPopulation() == Log2_32(CaseCount)) { + if (!MinCaseVal->isNullValue()) + Condition = Builder.CreateSub(Condition, MinCaseVal); + Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and"); + Value *Cmp = Builder.CreateICmpEQ( + And, Constant::getNullValue(And->getType()), "switch.selectcmp"); + return Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + } + } + + // Handle the degenerate case where two cases have the same value. + if (CaseValues.size() == 2) { + Value *Cmp1 = Builder.CreateICmpEQ(Condition, CaseValues[0], + "switch.selectcmp.case1"); + Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1], + "switch.selectcmp.case2"); + Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp"); + return Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + } } return nullptr; @@ -5375,10 +5780,10 @@ static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector, // Helper function to cleanup a switch instruction that has been converted into // a select, fixing up PHI nodes and basic blocks. -static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI, - Value *SelectValue, - IRBuilder<> &Builder, - DomTreeUpdater *DTU) { +static void removeSwitchAfterSelectFold(SwitchInst *SI, PHINode *PHI, + Value *SelectValue, + IRBuilder<> &Builder, + DomTreeUpdater *DTU) { std::vector Updates; BasicBlock *SelectBB = SI->getParent(); @@ -5409,33 +5814,31 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI, DTU->applyUpdates(Updates); } -/// If the switch is only used to initialize one or more -/// phi nodes in a common successor block with only two different -/// constant values, replace the switch with select. -static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder, - DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { +/// If a switch is only used to initialize one or more phi nodes in a common +/// successor block with only two different constant values, try to replace the +/// switch with a select. Returns true if the fold was made. +static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, + DomTreeUpdater *DTU, const DataLayout &DL, + const TargetTransformInfo &TTI) { Value *const Cond = SI->getCondition(); PHINode *PHI = nullptr; BasicBlock *CommonDest = nullptr; Constant *DefaultResult; SwitchCaseResultVectorTy UniqueResults; // Collect all the cases that will deliver the same value from the switch. - if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult, - DL, TTI, /*MaxUniqueResults*/2, - /*MaxCasesPerResult*/2)) + if (!initializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult, + DL, TTI, /*MaxUniqueResults*/ 2)) return false; - assert(PHI != nullptr && "PHI for value select not found"); + assert(PHI != nullptr && "PHI for value select not found"); Builder.SetInsertPoint(SI); Value *SelectValue = - ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder); - if (SelectValue) { - RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder, DTU); - return true; - } - // The switch couldn't be converted into a select. 
- return false; + foldSwitchToSelect(UniqueResults, DefaultResult, Cond, Builder); + if (!SelectValue) + return false; + + removeSwitchAfterSelectFold(SI, PHI, SelectValue, Builder, DTU); + return true; } namespace { @@ -5655,7 +6058,7 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { IntegerType *IT = cast(Index->getType()); uint64_t TableSize = Array->getInitializer()->getType()->getArrayNumElements(); - if (TableSize > (1ULL << (IT->getBitWidth() - 1))) + if (TableSize > (1ULL << std::min(IT->getBitWidth() - 1, 63u))) Index = Builder.CreateZExt( Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1), "switch.tableidx.zext"); @@ -5707,6 +6110,27 @@ static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI, DL.fitsInLegalInteger(IT->getBitWidth()); } +static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { + // 40% is the default density for building a jump table in optsize/minsize + // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this + // function was based on. + const uint64_t MinDensity = 40; + + if (CaseRange >= UINT64_MAX / 100) + return false; // Avoid multiplication overflows below. + + return NumCases * 100 >= CaseRange * MinDensity; +} + +static bool isSwitchDense(ArrayRef Values) { + uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front(); + uint64_t Range = Diff + 1; + if (Range < Diff) + return false; // Overflow. + + return isSwitchDense(Values.size(), Range); +} + /// Determine whether a lookup table should be built for this switch, based on /// the number of cases, size of the table, and the types of the results. // TODO: We could support larger than legal types by limiting based on the @@ -5716,8 +6140,8 @@ static bool ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, const TargetTransformInfo &TTI, const DataLayout &DL, const SmallDenseMap &ResultTypes) { - if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10) - return false; // TableSize overflowed, or mul below might overflow. + if (SI->getNumCases() > TableSize) + return false; // TableSize overflowed. bool AllTablesFitInRegister = true; bool HasIllegalType = false; @@ -5747,10 +6171,7 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, if (HasIllegalType) return false; - // The table density should be at least 40%. This is the same criterion as for - // jump tables, see SelectionDAGBuilder::handleJTSwitchCase. - // FIXME: Find the best cut-off. - return SI->getNumCases() * 10 >= TableSize * 4; + return isSwitchDense(SI->getNumCases(), TableSize); } /// Try to reuse the switch table index compare. Following pattern: @@ -5888,7 +6309,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Resulting value at phi nodes for this case value. using ResultsTy = SmallVector, 4>; ResultsTy Results; - if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest, + if (!getCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest, Results, DL, TTI)) return false; @@ -5916,7 +6337,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // or a bitmask that fits in a register. 
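As a worked instance of the shared density test above (numbers chosen for illustration): 40 cases over a range of 100 values sit exactly at the 40% threshold and count as dense, while 39 cases over the same range do not. A standalone sketch of the same arithmetic:

    #include <cassert>
    #include <cstdint>

    // Mirrors the patch's isSwitchDense(NumCases, CaseRange) logic.
    static bool isDense(uint64_t NumCases, uint64_t CaseRange) {
      const uint64_t MinDensity = 40; // percent
      if (CaseRange >= UINT64_MAX / 100)
        return false; // guard the CaseRange * MinDensity product, as in the patch
      return NumCases * 100 >= CaseRange * MinDensity;
    }

    int main() {
      assert(isDense(40, 100));  // 4000 >= 4000 -> dense
      assert(!isDense(39, 100)); // 3900 <  4000 -> sparse
    }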
   SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
   bool HasDefaultResults =
-      GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
+      getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
                      DefaultResultsList, DL, TTI);
 
   bool NeedMask = (TableHasHoles && !HasDefaultResults);
@@ -6086,17 +6507,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   return true;
 }
 
-static bool isSwitchDense(ArrayRef<int64_t> Values) {
-  // See also SelectionDAGBuilder::isDense(), which this function was based on.
-  uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
-  uint64_t Range = Diff + 1;
-  uint64_t NumCases = Values.size();
-  // 40% is the default density for building a jump table in optsize/minsize mode.
-  uint64_t MinDensity = 40;
-
-  return NumCases * 100 >= Range * MinDensity;
-}
-
 /// Try to transform a switch that has "holes" in it to a contiguous sequence
 /// of cases.
 ///
@@ -6211,14 +6621,16 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   }
 
   // Try to transform the switch into an icmp and a branch.
-  if (TurnSwitchRangeIntoICmp(SI, Builder))
+  // The conversion from switch to comparison may lose information on
+  // impossible switch values, so disable it early in the pipeline.
+  if (Options.ConvertSwitchRangeToICmp && TurnSwitchRangeIntoICmp(SI, Builder))
     return requestResimplify();
 
   // Remove unreachable cases.
   if (eliminateDeadSwitchCases(SI, DTU, Options.AC, DL))
     return requestResimplify();
 
-  if (switchToSelect(SI, Builder, DTU, DL, TTI))
+  if (trySwitchToSelect(SI, Builder, DTU, DL, TTI))
     return requestResimplify();
 
   if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI))
@@ -6521,12 +6933,11 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
   }
 
-  // If this is a branch on a phi node in the current block, thread control
-  // through this block if any PHI node entries are constants.
-  if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
-    if (PN->getParent() == BI->getParent())
-      if (FoldCondBranchOnPHI(BI, DTU, DL, Options.AC))
-        return requestResimplify();
+  // If this is a branch on something for which we know the constant value in
+  // predecessors (e.g. a phi node in the current block), thread control
+  // through this block.
+  if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+    return requestResimplify();
 
   // Scan predecessor blocks for conditional branches.
   for (BasicBlock *Pred : predecessors(BB))
@@ -6725,7 +7136,8 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
     return true;
 
   if (SinkCommon && Options.SinkCommonInsts)
-    if (SinkCommonCodeFromPredecessors(BB, DTU)) {
+    if (SinkCommonCodeFromPredecessors(BB, DTU) ||
+        MergeCompatibleInvokes(BB, DTU)) {
       // SinkCommonCodeFromPredecessors() does not automatically CSE PHI's,
      // so we may now have duplicate PHI's.
      // Let's rerun EliminateDuplicatePHINodes() first,
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 5b7fd4349c6c..dbef1ff2e739 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -13,11 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -58,7 +56,7 @@ namespace {
     SCEVExpander &Rewriter;
     SmallVectorImpl<WeakTrackingVH> &DeadInsts;
 
-    bool Changed;
+    bool Changed = false;
 
   public:
     SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
@@ -66,7 +64,7 @@ namespace {
                    SCEVExpander &Rewriter,
                    SmallVectorImpl<WeakTrackingVH> &Dead)
         : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
-          DeadInsts(Dead), Changed(false) {
+          DeadInsts(Dead) {
       assert(LI && "IV simplification requires LoopInfo");
     }
 
@@ -161,11 +159,12 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
       D = ConstantInt::get(UseInst->getContext(),
                           APInt::getOneBitSet(BitWidth, D->getZExtValue()));
     }
-    FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+    const auto *LHS = SE->getSCEV(IVSrc);
+    const auto *RHS = SE->getSCEV(D);
+    FoldedExpr = SE->getUDivExpr(LHS, RHS);
     // We might have 'exact' flag set at this point which will no longer be
    // correct after we make the replacement.
-    if (UseInst->isExact() &&
-        SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
+    if (UseInst->isExact() && LHS != SE->getMulExpr(FoldedExpr, RHS))
       MustDropExactFlag = true;
   }
   // We have something that might fold its operand. Compare SCEVs.
@@ -872,6 +871,7 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
     Instruction *IVOperand = UseOper.second;
     for (unsigned N = 0; IVOperand; ++N) {
       assert(N <= Simplified.size() && "runaway iteration");
+      (void) N;
 
       Value *NewOper = foldIVUser(UseInst, IVOperand);
       if (!NewOper)
@@ -1757,10 +1757,6 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri
     truncateIVUse(DU, DT, LI);
     return nullptr;
   }
-  // Assume block terminators cannot evaluate to a recurrence. We can't to
-  // insert a Trunc after a terminator if there happens to be a critical edge.
-  assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
-         "SCEV is not expected to evaluate a block terminator");
 
   // Reuse the IV increment that SCEVExpander created as long as it dominates
   // NarrowUse.
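The `(void) N;` added to simplifyUsers is the standard idiom for a variable consumed only by an assert; a minimal self-contained sketch of the same pattern (all names here are illustrative):

    #include <cassert>

    void zeroBuffer(int *Begin, int *End) {
      unsigned Seen = 0;
      for (int *I = Begin; I != End; ++I) {
        *I = 0;
        ++Seen;
      }
      assert(Seen <= 1024 && "runaway iteration");
      (void)Seen; // assert() compiles away under NDEBUG; the cast keeps
                  // release builds free of unused-variable warnings.
    }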
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e02d02a05752..f4306bb43dfd 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -14,28 +14,23 @@ #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" using namespace llvm; @@ -206,6 +201,11 @@ static Value *copyFlags(const CallInst &Old, Value *New) { return New; } +// Helper to avoid truncating the length if size_t is 32-bits. +static StringRef substr(StringRef Str, uint64_t Len) { + return Len >= Str.size() ? Str : Str.substr(0, Len); +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -242,7 +242,7 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // Now that we have the destination's length, we must index into the // destination's pointer to get the actual memcpy destination (end of // the string .. we're concatenating). - Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr"); + Value *CpyDst = B.CreateInBoundsGEP(B.getInt8Ty(), Dst, DstLen, "endptr"); // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. @@ -326,7 +326,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI)) - return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr"); + return B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr"); return nullptr; } @@ -339,35 +339,29 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { return Constant::getNullValue(CI->getType()); // strchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr"); + return B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr"); } Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); - ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); + Value *CharVal = CI->getArgOperand(1); + ConstantInt *CharC = dyn_cast(CharVal); annotateNonNullNoUndefBasedOnAccess(CI, 0); - // Cannot fold anything if we're not looking for a constant. 
- if (!CharC) - return nullptr; - StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) - if (CharC->isZero()) + if (CharC && CharC->isZero()) return copyFlags(*CI, emitStrChr(SrcStr, '\0', B, TLI)); return nullptr; } - // Compute the offset. - size_t I = (0xFF & CharC->getSExtValue()) == 0 - ? Str.size() - : Str.rfind(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. Return null. - return Constant::getNullValue(CI->getType()); - - // strrchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr"); + // Try to expand strrchr to the memrchr nonstandard extension if it's + // available, or simply fail otherwise. + uint64_t NBytes = Str.size() + 1; // Include the terminating nul. + Type *IntPtrType = DL.getIntPtrType(CI->getContext()); + Value *Size = ConstantInt::get(IntPtrType, NBytes); + return copyFlags(*CI, emitMemRChr(SrcStr, CharVal, Size, B, DL, TLI)); } Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { @@ -428,6 +422,12 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { return nullptr; } +// Optimize a memcmp or, when StrNCmp is true, strncmp call CI with constant +// arrays LHS and RHS and nonconstant Size. +static Value *optimizeMemCmpVarSize(CallInst *CI, Value *LHS, Value *RHS, + Value *Size, bool StrNCmp, + IRBuilderBase &B, const DataLayout &DL); + Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { Value *Str1P = CI->getArgOperand(0); Value *Str2P = CI->getArgOperand(1); @@ -442,7 +442,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (ConstantInt *LengthArg = dyn_cast(Size)) Length = LengthArg->getZExtValue(); else - return nullptr; + return optimizeMemCmpVarSize(CI, Str1P, Str2P, Size, true, B, DL); if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -456,8 +456,9 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { // strncmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) { - StringRef SubStr1 = Str1.substr(0, Length); - StringRef SubStr2 = Str2.substr(0, Length); + // Avoid truncating the 64-bit Length to 32 bits in ILP32. + StringRef SubStr1 = substr(Str1, Length); + StringRef SubStr2 = substr(Str2, Length); return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); } @@ -557,8 +558,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { Type *PT = Callee->getFunctionType()->getParamType(0); Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); - Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst, - ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); + Value *DstEnd = B.CreateInBoundsGEP( + B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. 
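The rewritten optimizeStrRChr above defers to memrchr over the whole constant string, including its terminating nul, so every byte strrchr may legally inspect is covered. A small sketch of the equivalence it relies on (values are illustrative; memrchr itself is a nonstandard extension, so the sketch only exercises strrchr):

    #include <cassert>
    #include <cstring>

    int main() {
      const char S[] = "abcabc"; // 6 characters plus the terminating nul
      // Conceptually: strrchr(S, c) == memrchr(S, c, sizeof S) for any c,
      // because both scan the same 7 bytes.
      assert(strrchr(S, 'b') == S + 4);  // last 'b'
      assert(strrchr(S, '\0') == S + 6); // the terminator itself is findable
      assert(strrchr(S, 'z') == nullptr);
    }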
@@ -634,12 +635,51 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, - unsigned CharSize) { + unsigned CharSize, + Value *Bound) { Value *Src = CI->getArgOperand(0); + Type *CharTy = B.getIntNTy(CharSize); + + if (isOnlyUsedInZeroEqualityComparison(CI) && + (!Bound || isKnownNonZero(Bound, DL))) { + // Fold strlen: + // strlen(x) != 0 --> *x != 0 + // strlen(x) == 0 --> *x == 0 + // and likewise strnlen with constant N > 0: + // strnlen(x, N) != 0 --> *x != 0 + // strnlen(x, N) == 0 --> *x == 0 + return B.CreateZExt(B.CreateLoad(CharTy, Src, "char0"), + CI->getType()); + } + + if (Bound) { + if (ConstantInt *BoundCst = dyn_cast(Bound)) { + if (BoundCst->isZero()) + // Fold strnlen(s, 0) -> 0 for any s, constant or otherwise. + return ConstantInt::get(CI->getType(), 0); + + if (BoundCst->isOne()) { + // Fold strnlen(s, 1) -> *s ? 1 : 0 for any s. + Value *CharVal = B.CreateLoad(CharTy, Src, "strnlen.char0"); + Value *ZeroChar = ConstantInt::get(CharTy, 0); + Value *Cmp = B.CreateICmpNE(CharVal, ZeroChar, "strnlen.char0cmp"); + return B.CreateZExt(Cmp, CI->getType()); + } + } + } + + if (uint64_t Len = GetStringLength(Src, CharSize)) { + Value *LenC = ConstantInt::get(CI->getType(), Len - 1); + // Fold strlen("xyz") -> 3 and strnlen("xyz", 2) -> 2 + // and strnlen("xyz", Bound) -> min(3, Bound) for nonconstant Bound. + if (Bound) + return B.CreateBinaryIntrinsic(Intrinsic::umin, LenC, Bound); + return LenC; + } - // Constant folding: strlen("xyz") -> 3 - if (uint64_t Len = GetStringLength(Src, CharSize)) - return ConstantInt::get(CI->getType(), Len - 1); + if (Bound) + // Punt for strnlen for now. + return nullptr; // If s is a constant pointer pointing to a string literal, we can fold // strlen(s + x) to strlen(s) - x, when x is known to be in the range @@ -650,6 +690,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, // very useful because calling strlen for a pointer of other types is // very uncommon. if (GEPOperator *GEP = dyn_cast(Src)) { + // TODO: Handle subobjects. if (!isGEPBasedOnPointerToString(GEP, CharSize)) return nullptr; @@ -674,22 +715,15 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, Value *Offset = GEP->getOperand(2); KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr); - Known.Zero.flipAllBits(); uint64_t ArrSize = cast(GEP->getSourceElementType())->getNumElements(); - // KnownZero's bits are flipped, so zeros in KnownZero now represent - // bits known to be zeros in Offset, and ones in KnowZero represent - // bits unknown in Offset. Therefore, Offset is known to be in range - // [0, NullTermIdx] when the flipped KnownZero is non-negative and - // unsigned-less-than NullTermIdx. - // // If Offset is not provably in the range [0, NullTermIdx], we can still // optimize if we can prove that the program has undefined behavior when // Offset is outside that range. That is the case when GEP->getOperand(0) // is a pointer to an object whose memory extent is NullTermIdx+1. 
- if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) || - (GEP->isInBounds() && isa(GEP->getOperand(0)) && + if ((Known.isNonNegative() && Known.getMaxValue().ule(NullTermIdx)) || + (isa(GEP->getOperand(0)) && NullTermIdx == ArrSize - 1)) { Offset = B.CreateSExtOrTrunc(Offset, CI->getType()); return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx), @@ -713,12 +747,6 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, } } - // strlen(x) != 0 --> *x != 0 - // strlen(x) == 0 --> *x == 0 - if (isOnlyUsedInZeroEqualityComparison(CI)) - return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"), - CI->getType()); - return nullptr; } @@ -729,6 +757,16 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) { return nullptr; } +Value *LibCallSimplifier::optimizeStrNLen(CallInst *CI, IRBuilderBase &B) { + Value *Bound = CI->getArgOperand(1); + if (Value *V = optimizeStringLength(CI, B, 8, Bound)) + return V; + + if (isKnownNonZero(Bound, DL)) + annotateNonNullNoUndefBasedOnAccess(CI, 0); + return nullptr; +} + Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) { Module &M = *CI->getModule(); unsigned WCharSize = TLI->getWCharSize(M) * 8; @@ -755,8 +793,8 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) { if (I == StringRef::npos) // No match. return Constant::getNullValue(CI->getType()); - return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), - "strpbrk"); + return B.CreateInBoundsGEP(B.getInt8Ty(), CI->getArgOperand(0), + B.getInt64(I), "strpbrk"); } // strpbrk(s, "a") -> strchr(s, 'a') @@ -880,35 +918,190 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) { - if (isKnownNonZero(CI->getOperand(2), DL)) - annotateNonNullNoUndefBasedOnAccess(CI, 0); - return nullptr; + Value *SrcStr = CI->getArgOperand(0); + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); + Value *CharVal = CI->getArgOperand(1); + ConstantInt *LenC = dyn_cast(Size); + Value *NullPtr = Constant::getNullValue(CI->getType()); + + if (LenC) { + if (LenC->isZero()) + // Fold memrchr(x, y, 0) --> null. + return NullPtr; + + if (LenC->isOne()) { + // Fold memrchr(x, y, 1) --> *x == y ? x : null for any x and y, + // constant or otherwise. + Value *Val = B.CreateLoad(B.getInt8Ty(), SrcStr, "memrchr.char0"); + // Slice off the character's high end bits. + CharVal = B.CreateTrunc(CharVal, B.getInt8Ty()); + Value *Cmp = B.CreateICmpEQ(Val, CharVal, "memrchr.char0cmp"); + return B.CreateSelect(Cmp, SrcStr, NullPtr, "memrchr.sel"); + } + } + + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) + return nullptr; + + if (Str.size() == 0) + // If the array is empty fold memrchr(A, C, N) to null for any value + // of C and N on the basis that the only valid value of N is zero + // (otherwise the call is undefined). + return NullPtr; + + uint64_t EndOff = UINT64_MAX; + if (LenC) { + EndOff = LenC->getZExtValue(); + if (Str.size() < EndOff) + // Punt out-of-bounds accesses to sanitizers and/or libc. + return nullptr; + } + + if (ConstantInt *CharC = dyn_cast(CharVal)) { + // Fold memrchr(S, C, N) for a constant C. + size_t Pos = Str.rfind(CharC->getZExtValue(), EndOff); + if (Pos == StringRef::npos) + // When the character is not in the source array fold the result + // to null regardless of Size. 
+ return NullPtr; + + if (LenC) + // Fold memrchr(s, c, N) --> s + Pos for constant N > Pos. + return B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, B.getInt64(Pos)); + + if (Str.find(Str[Pos]) == Pos) { + // When there is just a single occurrence of C in S, i.e., the one + // in Str[Pos], fold + // memrchr(s, c, N) --> N <= Pos ? null : s + Pos + // for nonconstant N. + Value *Cmp = B.CreateICmpULE(Size, ConstantInt::get(Size->getType(), Pos), + "memrchr.cmp"); + Value *SrcPlus = B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, + B.getInt64(Pos), "memrchr.ptr_plus"); + return B.CreateSelect(Cmp, NullPtr, SrcPlus, "memrchr.sel"); + } + } + + // Truncate the string to search at most EndOff characters. + Str = Str.substr(0, EndOff); + if (Str.find_first_not_of(Str[0]) != StringRef::npos) + return nullptr; + + // If the source array consists of all equal characters, then for any + // C and N (whether in bounds or not), fold memrchr(S, C, N) to + // N != 0 && *S == C ? S + N - 1 : null + Type *SizeTy = Size->getType(); + Type *Int8Ty = B.getInt8Ty(); + Value *NNeZ = B.CreateICmpNE(Size, ConstantInt::get(SizeTy, 0)); + // Slice off the sought character's high end bits. + CharVal = B.CreateTrunc(CharVal, Int8Ty); + Value *CEqS0 = B.CreateICmpEQ(ConstantInt::get(Int8Ty, Str[0]), CharVal); + Value *And = B.CreateLogicalAnd(NNeZ, CEqS0); + Value *SizeM1 = B.CreateSub(Size, ConstantInt::get(SizeTy, 1)); + Value *SrcPlus = + B.CreateInBoundsGEP(Int8Ty, SrcStr, SizeM1, "memrchr.ptr_plus"); + return B.CreateSelect(And, SrcPlus, NullPtr, "memrchr.sel"); } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); - annotateNonNullAndDereferenceable(CI, 0, Size, DL); - ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); + if (isKnownNonZero(Size, DL)) + annotateNonNullNoUndefBasedOnAccess(CI, 0); + + Value *CharVal = CI->getArgOperand(1); + ConstantInt *CharC = dyn_cast(CharVal); ConstantInt *LenC = dyn_cast(Size); + Value *NullPtr = Constant::getNullValue(CI->getType()); // memchr(x, y, 0) -> null if (LenC) { if (LenC->isZero()) - return Constant::getNullValue(CI->getType()); - } else { - // From now on we need at least constant length and string. - return nullptr; + return NullPtr; + + if (LenC->isOne()) { + // Fold memchr(x, y, 1) --> *x == y ? x : null for any x and y, + // constant or otherwise. + Value *Val = B.CreateLoad(B.getInt8Ty(), SrcStr, "memchr.char0"); + // Slice off the character's high end bits. + CharVal = B.CreateTrunc(CharVal, B.getInt8Ty()); + Value *Cmp = B.CreateICmpEQ(Val, CharVal, "memchr.char0cmp"); + return B.CreateSelect(Cmp, SrcStr, NullPtr, "memchr.sel"); + } } StringRef Str; if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) return nullptr; - // Truncate the string to LenC. If Str is smaller than LenC we will still only - // scan the string, as reading past the end of it is undefined and we can just - // return null if we don't find the char. - Str = Str.substr(0, LenC->getZExtValue()); + if (CharC) { + size_t Pos = Str.find(CharC->getZExtValue()); + if (Pos == StringRef::npos) + // When the character is not in the source array fold the result + // to null regardless of Size. + return NullPtr; + + // Fold memchr(s, c, n) -> n <= Pos ? null : s + Pos + // When the constant Size is less than or equal to the character + // position also fold the result to null. 
+ Value *Cmp = B.CreateICmpULE(Size, ConstantInt::get(Size->getType(), Pos), + "memchr.cmp"); + Value *SrcPlus = B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, B.getInt64(Pos), + "memchr.ptr"); + return B.CreateSelect(Cmp, NullPtr, SrcPlus); + } + + if (Str.size() == 0) + // If the array is empty fold memchr(A, C, N) to null for any value + // of C and N on the basis that the only valid value of N is zero + // (otherwise the call is undefined). + return NullPtr; + + if (LenC) + Str = substr(Str, LenC->getZExtValue()); + + size_t Pos = Str.find_first_not_of(Str[0]); + if (Pos == StringRef::npos + || Str.find_first_not_of(Str[Pos], Pos) == StringRef::npos) { + // If the source array consists of at most two consecutive sequences + // of the same characters, then for any C and N (whether in bounds or + // not), fold memchr(S, C, N) to + // N != 0 && *S == C ? S : null + // or for the two sequences to: + // N != 0 && *S == C ? S : (N > Pos && S[Pos] == C ? S + Pos : null) + // ^Sel2 ^Sel1 are denoted above. + // The latter makes it also possible to fold strchr() calls with strings + // of the same characters. + Type *SizeTy = Size->getType(); + Type *Int8Ty = B.getInt8Ty(); + + // Slice off the sought character's high end bits. + CharVal = B.CreateTrunc(CharVal, Int8Ty); + + Value *Sel1 = NullPtr; + if (Pos != StringRef::npos) { + // Handle two consecutive sequences of the same characters. + Value *PosVal = ConstantInt::get(SizeTy, Pos); + Value *StrPos = ConstantInt::get(Int8Ty, Str[Pos]); + Value *CEqSPos = B.CreateICmpEQ(CharVal, StrPos); + Value *NGtPos = B.CreateICmp(ICmpInst::ICMP_UGT, Size, PosVal); + Value *And = B.CreateAnd(CEqSPos, NGtPos); + Value *SrcPlus = B.CreateInBoundsGEP(B.getInt8Ty(), SrcStr, PosVal); + Sel1 = B.CreateSelect(And, SrcPlus, NullPtr, "memchr.sel1"); + } + + Value *Str0 = ConstantInt::get(Int8Ty, Str[0]); + Value *CEqS0 = B.CreateICmpEQ(Str0, CharVal); + Value *NNeZ = B.CreateICmpNE(Size, ConstantInt::get(SizeTy, 0)); + Value *And = B.CreateAnd(NNeZ, CEqS0); + return B.CreateSelect(And, SrcStr, Sel1, "memchr.sel2"); + } + + if (!LenC) + // From now on we need a constant length and constant array. + return nullptr; // If the char is variable but the input str and length are not we can turn // this memchr call into a simple bit field test. Of course this only works @@ -920,60 +1113,93 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { // memchr("\r\n", C, 2) != nullptr -> (1 << C & ((1 << '\r') | (1 << '\n'))) // != 0 // after bounds check. - if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) { - unsigned char Max = - *std::max_element(reinterpret_cast(Str.begin()), - reinterpret_cast(Str.end())); - - // Make sure the bit field we're about to create fits in a register on the - // target. - // FIXME: On a 64 bit architecture this prevents us from using the - // interesting range of alpha ascii chars. We could do better by emitting - // two bitfields or shifting the range by 64 if no lower chars are used. - if (!DL.fitsInLegalInteger(Max + 1)) - return nullptr; + if (Str.empty() || !isOnlyUsedInZeroEqualityComparison(CI)) + return nullptr; + + unsigned char Max = + *std::max_element(reinterpret_cast(Str.begin()), + reinterpret_cast(Str.end())); - // For the bit field use a power-of-2 type with at least 8 bits to avoid - // creating unnecessary illegal types. - unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max)); + // Make sure the bit field we're about to create fits in a register on the + // target. 
+ // FIXME: On a 64 bit architecture this prevents us from using the + // interesting range of alpha ascii chars. We could do better by emitting + // two bitfields or shifting the range by 64 if no lower chars are used. + if (!DL.fitsInLegalInteger(Max + 1)) + return nullptr; - // Now build the bit field. - APInt Bitfield(Width, 0); - for (char C : Str) - Bitfield.setBit((unsigned char)C); - Value *BitfieldC = B.getInt(Bitfield); + // For the bit field use a power-of-2 type with at least 8 bits to avoid + // creating unnecessary illegal types. + unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max)); - // Adjust width of "C" to the bitfield width, then mask off the high bits. - Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType()); - C = B.CreateAnd(C, B.getIntN(Width, 0xFF)); + // Now build the bit field. + APInt Bitfield(Width, 0); + for (char C : Str) + Bitfield.setBit((unsigned char)C); + Value *BitfieldC = B.getInt(Bitfield); - // First check that the bit field access is within bounds. - Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width), - "memchr.bounds"); + // Adjust width of "C" to the bitfield width, then mask off the high bits. + Value *C = B.CreateZExtOrTrunc(CharVal, BitfieldC->getType()); + C = B.CreateAnd(C, B.getIntN(Width, 0xFF)); - // Create code that checks if the given bit is set in the field. - Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C); - Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits"); + // First check that the bit field access is within bounds. + Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width), + "memchr.bounds"); - // Finally merge both checks and cast to pointer type. The inttoptr - // implicitly zexts the i1 to intptr type. - return B.CreateIntToPtr(B.CreateLogicalAnd(Bounds, Bits, "memchr"), - CI->getType()); - } + // Create code that checks if the given bit is set in the field. + Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C); + Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits"); - // Check if all arguments are constants. If so, we can constant fold. - if (!CharC) - return nullptr; + // Finally merge both checks and cast to pointer type. The inttoptr + // implicitly zexts the i1 to intptr type. + return B.CreateIntToPtr(B.CreateLogicalAnd(Bounds, Bits, "memchr"), + CI->getType()); +} - // Compute the offset. - size_t I = Str.find(CharC->getSExtValue() & 0xFF); - if (I == StringRef::npos) // Didn't find the char. memchr returns null. +// Optimize a memcmp or, when StrNCmp is true, strncmp call CI with constant +// arrays LHS and RHS and nonconstant Size. +static Value *optimizeMemCmpVarSize(CallInst *CI, Value *LHS, Value *RHS, + Value *Size, bool StrNCmp, + IRBuilderBase &B, const DataLayout &DL) { + if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); - // memchr(s+n,c,l) -> gep(s+n+i,c) - return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr"); + StringRef LStr, RStr; + if (!getConstantStringInfo(LHS, LStr, 0, /*TrimAtNul=*/false) || + !getConstantStringInfo(RHS, RStr, 0, /*TrimAtNul=*/false)) + return nullptr; + + // If the contents of both constant arrays are known, fold a call to + // memcmp(A, B, N) to + // N <= Pos ? 0 : (A < B ? -1 : B < A ? +1 : 0) + // where Pos is the first mismatch between A and B, determined below. 
+ + uint64_t Pos = 0; + Value *Zero = ConstantInt::get(CI->getType(), 0); + for (uint64_t MinSize = std::min(LStr.size(), RStr.size()); ; ++Pos) { + if (Pos == MinSize || + (StrNCmp && (LStr[Pos] == '\0' && RStr[Pos] == '\0'))) { + // One array is a leading part of the other of equal or greater + // size, or for strncmp, the arrays are equal strings. + // Fold the result to zero. Size is assumed to be in bounds, since + // otherwise the call would be undefined. + return Zero; + } + + if (LStr[Pos] != RStr[Pos]) + break; + } + + // Normalize the result. + typedef unsigned char UChar; + int IRes = UChar(LStr[Pos]) < UChar(RStr[Pos]) ? -1 : 1; + Value *MaxSize = ConstantInt::get(Size->getType(), Pos); + Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULE, Size, MaxSize); + Value *Res = ConstantInt::get(CI->getType(), IRes); + return B.CreateSelect(Cmp, Zero, Res); } +// Optimize a memcmp call CI with constant size Len. static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, uint64_t Len, IRBuilderBase &B, const DataLayout &DL) { @@ -1028,25 +1254,6 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, } } - // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const). - // TODO: This is limited to i8 arrays. - StringRef LHSStr, RHSStr; - if (getConstantStringInfo(LHS, LHSStr) && - getConstantStringInfo(RHS, RHSStr)) { - // Make sure we're not reading out-of-bounds memory. - if (Len > LHSStr.size() || Len > RHSStr.size()) - return nullptr; - // Fold the memcmp and normalize the result. This way we get consistent - // results across multiple platforms. - uint64_t Ret = 0; - int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len); - if (Cmp < 0) - Ret = -1; - else if (Cmp > 0) - Ret = 1; - return ConstantInt::get(CI->getType(), Ret); - } - return nullptr; } @@ -1056,33 +1263,29 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - if (LHS == RHS) // memcmp(s,s,x) -> 0 - return Constant::getNullValue(CI->getType()); - annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); - // Handle constant lengths. + + if (Value *Res = optimizeMemCmpVarSize(CI, LHS, RHS, Size, false, B, DL)) + return Res; + + // Handle constant Size. ConstantInt *LenC = dyn_cast(Size); if (!LenC) return nullptr; - // memcmp(d,s,0) -> 0 - if (LenC->getZExtValue() == 0) - return Constant::getNullValue(CI->getType()); - - if (Value *Res = - optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL)) - return Res; - return nullptr; + return optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL); } Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); if (Value *V = optimizeMemCmpBCmpCommon(CI, B)) return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 // bcmp can be more efficient than memcmp because it only has to know that // there is a difference, not how different one is to the other. 
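To make the variable-size memcmp fold concrete: with both arrays constant and the first mismatch at position Pos, the call collapses to a select on Size. A sketch of the source-level behavior being modeled (arrays chosen for illustration):

    #include <cassert>
    #include <cstring>

    int main() {
      const char A[] = "hello, world";
      const char B[] = "hello- world"; // first mismatch at Pos == 5
      for (size_t N = 0; N <= 12; ++N) {
        int R = memcmp(A, B, N);
        // The fold rewrites this to: N <= 5 ? 0 : -1   (',' < '-')
        if (N <= 5)
          assert(R == 0);
        else
          assert(R < 0);
      }
    }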
- if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { + if (isLibFuncEmittable(M, TLI, LibFunc_bcmp) && + isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -1125,6 +1328,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { return Constant::getNullValue(CI->getType()); if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0, /*TrimAtNul=*/false) || + // TODO: Handle zeroinitializer. !StopChar) return nullptr; } else { @@ -1246,7 +1450,8 @@ static Value *valueHasFloatPrecision(Value *Val) { /// Shrink double -> float functions. static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, - bool isBinary, bool isPrecise = false) { + bool isBinary, const TargetLibraryInfo *TLI, + bool isPrecise = false) { Function *CalleeFn = CI->getCalledFunction(); if (!CI->getType()->isDoubleTy() || !CalleeFn) return nullptr; @@ -1296,22 +1501,25 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) - : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B, + CalleeAttrs) + : emitUnaryFloatFnCall(V[0], TLI, CalleeName, B, CalleeAttrs); } return B.CreateFPExt(R, B.getDoubleTy()); } /// Shrink double -> float for unary functions. static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, false, isPrecise); + return optimizeDoubleFP(CI, B, false, TLI, isPrecise); } /// Shrink double -> float for binary functions. static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, true, isPrecise); + return optimizeDoubleFP(CI, B, true, TLI, isPrecise); } // cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z))) @@ -1427,6 +1635,7 @@ static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { /// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); /// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). 
Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { + Module *M = Pow->getModule(); Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs; // Attributes are only meaningful on the original call Module *Mod = Pow->getModule(); @@ -1454,7 +1663,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { Function *CalleeFn = BaseFn->getCalledFunction(); if (CalleeFn && - TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) { + TLI->getLibFunc(CalleeFn->getName(), LibFn) && + isLibFuncEmittable(M, TLI, LibFn)) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; @@ -1506,7 +1716,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(2.0, itofp(x)) -> ldexp(1.0, x) if (match(Base, m_SpecificFP(2.0)) && (isa(Expo) || isa(Expo)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) return copyFlags(*Pow, emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, @@ -1515,7 +1725,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { } // pow(2.0 ** n, x) -> exp2(n * x) - if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1542,7 +1752,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(M, TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs)); @@ -1567,7 +1777,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( Mod, Intrinsic::exp2, Ty), FMul, "exp2")); - else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + else if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l)) return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l, B, Attrs)); @@ -1588,7 +1799,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) + if (hasFloatFn(M, TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, + LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1778,8 +1990,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { // Shrink pow() to powf() if the arguments are single precision, // unless the result is expected to be double precision. 
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) && - hasFloatVersion(Name)) { - if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, true)) + hasFloatVersion(M, Name)) { + if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, TLI, true)) return Shrunk; } @@ -1787,13 +1999,14 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); AttributeList Attrs; // Attributes are only meaningful on the original call StringRef Name = Callee->getName(); Value *Ret = nullptr; if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && - hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + hasFloatVersion(M, Name)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); @@ -1801,7 +2014,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= IntSize // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < IntSize if ((isa(Op) || isa(Op)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *Exp = getIntToFPVal(Op, B, TLI->getIntSize())) return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, @@ -1812,12 +2025,14 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); + // If we can shrink the call to a float function rather than a double // function, do that first. Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); - if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) - if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) + if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(M, Name)) + if (Value *Ret = optimizeBinaryDoubleFP(CI, B, TLI)) return Ret; // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to @@ -1848,8 +2063,8 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Type *Ty = Log->getType(); Value *Ret = nullptr; - if (UnsafeFPShrink && hasFloatVersion(LogNm)) - Ret = optimizeUnaryDoubleFP(Log, B, true); + if (UnsafeFPShrink && hasFloatVersion(Mod, LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, TLI, true); // The earlier call must also be 'fast' in order to do these transforms. CallInst *Arg = dyn_cast(Log->getArgOperand(0)); @@ -1957,7 +2172,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Log->doesNotAccessMemory() ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), Arg->getOperand(0), "log") - : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + : emitUnaryFloatFnCall(Arg->getOperand(0), TLI, LogNm, B, Attrs); Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); // Since pow() may have side effects, e.g. errno, // dead code elimination may not be trusted to remove it. @@ -1980,7 +2195,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Value *LogE = Log->doesNotAccessMemory() ? 
                     B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
                                  Eul, "log")
-                  : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
+                  : emitUnaryFloatFnCall(Eul, TLI, LogNm, B, Attrs);
     Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
     // Since exp() may have side effects, e.g. errno,
     // dead code elimination may not be trusted to remove it.
@@ -1992,14 +2207,16 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
 }
 
 Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
+  Module *M = CI->getModule();
   Function *Callee = CI->getCalledFunction();
   Value *Ret = nullptr;
   // TODO: Once we have a way (other than checking for the existence of the
   // libcall) to tell whether our target can lower @llvm.sqrt, relax the
   // condition below.
-  if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
-                                  Callee->getIntrinsicID() == Intrinsic::sqrt))
-    Ret = optimizeUnaryDoubleFP(CI, B, true);
+  if (isLibFuncEmittable(M, TLI, LibFunc_sqrtf) &&
+      (Callee->getName() == "sqrt" ||
+       Callee->getIntrinsicID() == Intrinsic::sqrt))
+    Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
 
   if (!CI->isFast())
     return Ret;
@@ -2044,7 +2261,6 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
 
   // If we found a repeated factor, hoist it out of the square root and
   // replace it with the fabs of that factor.
-  Module *M = Callee->getParent();
   Type *ArgType = I->getType();
   Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
   Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
@@ -2061,11 +2277,12 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
 
 // TODO: Generalize to handle any trig function and its inverse.
 Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
+  Module *M = CI->getModule();
   Function *Callee = CI->getCalledFunction();
   Value *Ret = nullptr;
   StringRef Name = Callee->getName();
-  if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
-    Ret = optimizeUnaryDoubleFP(CI, B, true);
+  if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(M, Name))
+    Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
 
   Value *Op1 = CI->getArgOperand(0);
   auto *OpC = dyn_cast<CallInst>(Op1);
@@ -2081,7 +2298,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
   // tanl(atanl(x)) -> x
   LibFunc Func;
   Function *F = OpC->getCalledFunction();
-  if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+  if (F && TLI->getLibFunc(F->getName(), Func) &&
+      isLibFuncEmittable(M, TLI, Func) &&
       ((Func == LibFunc_atan && Callee->getName() == "tan") ||
        (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
       (Func == LibFunc_atanl && Callee->getName() == "tanl")))
@@ -2097,9 +2315,10 @@ static bool isTrigLibCall(CallInst *CI) {
          CI->hasFnAttr(Attribute::ReadNone);
 }
 
-static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
+static bool insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
                              bool UseFloat, Value *&Sin, Value *&Cos,
-                             Value *&SinCos) {
+                             Value *&SinCos, const TargetLibraryInfo *TLI) {
+  Module *M = OrigCallee->getParent();
   Type *ArgTy = Arg->getType();
   Type *ResTy;
   StringRef Name;
@@ -2119,9 +2338,12 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
     ResTy = StructType::get(ArgTy, ArgTy);
   }
 
-  Module *M = OrigCallee->getParent();
-  FunctionCallee Callee =
-      M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
+  if (!isLibFuncEmittable(M, TLI, Name))
+    return false;
+  LibFunc TheLibFunc;
TLI->getLibFunc(Name, TheLibFunc); + FunctionCallee Callee = getOrInsertLibFunc( + M, *TLI, TheLibFunc, OrigCallee->getAttributes(), ResTy, ArgTy); if (Instruction *ArgInst = dyn_cast(Arg)) { // If the argument is an instruction, it must dominate all uses so put our @@ -2145,6 +2367,8 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), "cospi"); } + + return true; } Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { @@ -2172,7 +2396,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { return nullptr; Value *Sin, *Cos, *SinCos; - insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos); + if (!insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, + SinCos, TLI)) + return nullptr; auto replaceTrigInsts = [this](SmallVectorImpl &Calls, Value *Res) { @@ -2193,6 +2419,7 @@ void LibCallSimplifier::classifyArgUse( SmallVectorImpl &CosCalls, SmallVectorImpl &SinCosCalls) { CallInst *CI = dyn_cast(Val); + Module *M = CI->getModule(); if (!CI || CI->use_empty()) return; @@ -2203,7 +2430,8 @@ void LibCallSimplifier::classifyArgUse( Function *Callee = CI->getCalledFunction(); LibFunc Func; - if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) || + if (!Callee || !TLI->getLibFunc(*Callee, Func) || + !isLibFuncEmittable(M, TLI, Func) || !isTrigLibCall(CI)) return; @@ -2258,7 +2486,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) { // abs(x) -> x getArgOperand(0); - Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType())); + Value *IsNeg = B.CreateIsNeg(X); Value *NegX = B.CreateNSWNeg(X, "neg"); return B.CreateSelect(IsNeg, NegX, X); } @@ -2418,6 +2646,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizePrintFString(CI, B)) { @@ -2426,10 +2655,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> iprintf(format, ...) if no floating point // arguments. - if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee IPrintFFn = - M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_iprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee IPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_iprintf, FT, + Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(IPrintFFn); B.Insert(New); @@ -2438,11 +2667,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point // arguments. 
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_printf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_printf) && + !callHasFP128Argument(CI)) { + auto SmallPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_printf, FT, + Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(SmallPrintFFn); B.Insert(New); @@ -2489,7 +2717,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = castToCStr(Dest, B); B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); + Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); B.CreateStore(B.getInt8(0), Ptr); return ConstantInt::get(CI->getType(), 1); @@ -2541,6 +2769,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeSPrintFString(CI, B)) { @@ -2549,10 +2778,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating // point arguments. - if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee SIPrintFFn = - M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_siprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee SIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_siprintf, + FT, Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(SIPrintFFn); B.Insert(New); @@ -2561,11 +2790,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit // floating point arguments. 
- if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallSPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_sprintf) && + !callHasFP128Argument(CI)) { + auto SmallSPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_sprintf, FT, + Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(SmallSPrintFFn); B.Insert(New); @@ -2629,7 +2857,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); Value *Ptr = castToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); + Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); B.CreateStore(B.getInt8(0), Ptr); return ConstantInt::get(CI->getType(), 1); @@ -2721,6 +2949,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeFPrintFString(CI, B)) { @@ -2729,10 +2958,10 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no // floating point arguments. - if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee FIPrintFFn = - M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_fiprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee FIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_fiprintf, + FT, Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(FIPrintFFn); B.Insert(New); @@ -2741,11 +2970,11 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no // 128-bit floating point arguments. - if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); + if (isLibFuncEmittable(M, TLI, LibFunc_small_fprintf) && + !callHasFP128Argument(CI)) { auto SmallFPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf), - FT, Callee->getAttributes()); + getOrInsertLibFunc(M, *TLI, LibFunc_small_fprintf, FT, + Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(SmallFPrintFFn); B.Insert(New); @@ -2830,21 +3059,19 @@ Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(2))); } -bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { - LibFunc Func; +bool LibCallSimplifier::hasFloatVersion(const Module *M, StringRef FuncName) { SmallString<20> FloatFuncName = FuncName; FloatFuncName += 'f'; - if (TLI->getLibFunc(FloatFuncName, Func)) - return TLI->has(Func); - return false; + return isLibFuncEmittable(M, TLI, FloatFuncName); } Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); LibFunc Func; Function *Callee = CI->getCalledFunction(); // Check for string/memory library functions. 
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // Make sure we never change the calling convention. assert( (ignoreCallingConv(Func) || @@ -2871,6 +3098,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrNCpy(CI, Builder); case LibFunc_strlen: return optimizeStrLen(CI, Builder); + case LibFunc_strnlen: + return optimizeStrNLen(CI, Builder); case LibFunc_strpbrk: return optimizeStrPBrk(CI, Builder); case LibFunc_strndup: @@ -2923,6 +3152,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func, IRBuilderBase &Builder) { + const Module *M = CI->getModule(); + // Don't optimize calls that require strict floating point semantics. if (CI->isStrictFP()) return nullptr; @@ -3001,12 +3232,12 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sin: case LibFunc_sinh: case LibFunc_tanh: - if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeUnaryDoubleFP(CI, Builder, true); + if (UnsafeFPShrink && hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeUnaryDoubleFP(CI, Builder, TLI, true); return nullptr; case LibFunc_copysign: - if (hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeBinaryDoubleFP(CI, Builder); + if (hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeBinaryDoubleFP(CI, Builder, TLI); return nullptr; case LibFunc_fminf: case LibFunc_fmin: @@ -3025,6 +3256,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); // TODO: Split out the code below that operates on FP calls so that @@ -3103,7 +3335,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { } // Then check for known library functions. - if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // We never change the calling convention. if (!ignoreCallingConv(Func) && !IsCallingConvC) return nullptr; @@ -3170,7 +3402,7 @@ LibCallSimplifier::LibCallSimplifier( function_ref Replacer, function_ref Eraser) : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI), - UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {} + Replacer(Replacer), Eraser(Eraser) {} void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // Indirect through the replacer used in this instance. @@ -3361,7 +3593,8 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // If the function was an __stpcpy_chk, and we were able to fold it into // a __memcpy_chk, we still need to return the correct end pointer. 
if (Ret && Func == LibFunc_stpcpy_chk) - return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); + return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, + ConstantInt::get(SizeTTy, Len - 1)); return copyFlags(*CI, cast(Ret)); } diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp index 08a29ea16ba1..1242380f73c1 100644 --- a/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -48,12 +48,12 @@ cl::opt llvm::ForcePGSO( cl::desc("Force the (profiled-guided) size optimizations. ")); cl::opt llvm::PgsoCutoffInstrProf( - "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore, + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::desc("The profile guided size optimization profile summary cutoff " "for instrumentation profile.")); cl::opt llvm::PgsoCutoffSampleProf( - "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore, + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::desc("The profile guided size optimization profile summary cutoff " "for sample profile.")); diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp index 1fa574f04c37..0ff88e8b4612 100644 --- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp @@ -9,7 +9,7 @@ // This is a little utility pass that removes the gc.relocates inserted by // RewriteStatepointsForGC. Note that the generated IR is incorrect, // but this is useful as a single pass in itself, for analysis of IR, without -// the GC.relocates. The statepoint and gc.result instrinsics would still be +// the GC.relocates. The statepoint and gc.result intrinsics would still be // present. //===----------------------------------------------------------------------===// @@ -18,10 +18,8 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Statepoint.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index 6a0eb34a7999..4ad16d622e8d 100644 --- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -57,7 +57,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SymbolRewriter.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 0b718ed6136e..832353741500 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -18,7 +18,9 @@ #include "llvm/Transforms/Utils/UnifyLoopExits.h" #include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils.h" @@ -143,6 +145,8 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { // locate the exit blocks. SetVector ExitingBlocks; SetVector Exits; + // Record the exit blocks that branch to the same block. + MapVector > CommonSuccs; // We need SetVectors, but the Loop API takes a vector, so we use a temporary. 
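To see what the CommonSuccs bookkeeping just declared will compute, here is a standalone model of the grouping rule, with a hypothetical Block type and plain STL standing in for LLVM's MapVector/SetVector. Only exits with a single predecessor and a single successor are recorded, since those are the ones the retargeting loop further below can safely move the control-flow hub past:

#include <map>
#include <set>
#include <vector>

struct Block {
  std::vector<Block *> Preds, Succs;
};

// Record each eligible exit under the block it branches to.
std::map<Block *, std::set<Block *>>
groupExitsBySuccessor(const std::vector<Block *> &Exits) {
  std::map<Block *, std::set<Block *>> CommonSuccs;
  for (Block *Exit : Exits)
    if (Exit->Preds.size() == 1 && Exit->Succs.size() == 1)
      CommonSuccs[Exit->Succs.front()].insert(Exit);
  return CommonSuccs;
}

When two or more exits land in the same bucket (and the bucket is not itself an exit), the exits are treated as the exiting blocks and their shared successor becomes the single exit, so guard blocks after the loop are created once rather than once per exit.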
SmallVector Temp; @@ -156,6 +160,11 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { if (SL == L || L->contains(SL)) continue; Exits.insert(S); + // The typical case for reducing the number of guard blocks occurs when + // the exit block has a single predecessor and successor. + if (S->getSinglePredecessor()) + if (auto *Succ = S->getSingleSuccessor()) + CommonSuccs[Succ].insert(S); } } @@ -170,13 +179,39 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { for (auto EB : ExitingBlocks) { dbgs() << " " << EB->getName(); } - dbgs() << "\n";); + dbgs() << "\n"; + + dbgs() << "Exit blocks with a common successor:\n"; + for (auto CS : CommonSuccs) { + dbgs() << " Succ " << CS.first->getName() << ", exits:"; + for (auto Exit : CS.second) + dbgs() << " " << Exit->getName(); + dbgs() << "\n"; + }); if (Exits.size() <= 1) { LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n"); return false; } + // When multiple exit blocks branch to the same block, change the control + // flow hub to after the exit blocks rather than before. This reduces the + // number of guard blocks needed after the loop. + for (auto CS : CommonSuccs) { + auto CB = CS.first; + auto Preds = CS.second; + if (Exits.contains(CB)) + continue; + if (Preds.size() < 2 || Preds.size() == Exits.size()) + continue; + for (auto Exit : Preds) { + Exits.remove(Exit); + ExitingBlocks.remove(Exit->getSinglePredecessor()); + ExitingBlocks.insert(Exit); + } + Exits.insert(CB); + } + SmallVector GuardBlocks; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks, @@ -196,6 +231,17 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { if (auto ParentLoop = L->getParentLoop()) { for (auto G : GuardBlocks) { ParentLoop->addBasicBlockToLoop(G, LI); + // Ensure the guard block predecessors are in a valid loop. After the + // change to the control flow hub for common successors, a guard block + // predecessor may not be in a loop or may be in an outer loop. 
+ for (auto Pred : predecessors(G)) { + auto PredLoop = LI.getLoopFor(Pred); + if (!ParentLoop->contains(PredLoop)) { + if (PredLoop) + LI.removeBlock(Pred); + ParentLoop->addBasicBlockToLoop(Pred, LI); + } + } } ParentLoop->verifyLoop(); } diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp index 43eb5c87acee..f34f2df971b1 100644 --- a/llvm/lib/Transforms/Utils/Utils.cpp +++ b/llvm/lib/Transforms/Utils/Utils.cpp @@ -34,6 +34,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeLCSSAWrapperPassPass(Registry); initializeLibCallsShrinkWrapLegacyPassPass(Registry); initializeLoopSimplifyPass(Registry); + initializeLowerGlobalDtorsLegacyPassPass(Registry); initializeLowerInvokeLegacyPassPass(Registry); initializeLowerSwitchLegacyPassPass(Registry); initializeNameAnonGlobalLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp index 637181722f63..42be67f3cfc0 100644 --- a/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -64,10 +64,15 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, return true; } -template -static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, - HelperClass &Helper, - const DataLayout &DL) { +/// If we saw a store of a value to memory, and +/// then a load from a must-aliased pointer of a different type, try to coerce +/// the stored value. LoadedTy is the type of the load we want to replace. +/// IRB is IRBuilder used to insert new instructions. +/// +/// If we can't do it, return null. +Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, + IRBuilderBase &Helper, + const DataLayout &DL) { assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) && "precondition violation - materialization can't fail"); if (auto *C = dyn_cast(StoredVal)) @@ -154,18 +159,6 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, return StoredVal; } -/// If we saw a store of a value to memory, and -/// then a load from a must-aliased pointer of a different type, try to coerce -/// the stored value. LoadedTy is the type of the load we want to replace. -/// IRB is IRBuilder used to insert new instructions. -/// -/// If we can't do it, return null. -Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, - IRBuilderBase &IRB, - const DataLayout &DL) { - return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL); -} - /// This function is called when we have a memdep query of a load that ends up /// being a clobbering memory write (store, memset, memcpy, memmove). This /// means that the write *may* provide bits used by the load but we can't be @@ -277,7 +270,7 @@ static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase, // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it // to i16. 
- unsigned LoadAlign = LI->getAlignment(); + unsigned LoadAlign = LI->getAlign().value(); int64_t MemLocEnd = MemLocOffs + MemLocSize; @@ -400,10 +393,9 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, return -1; } -template -static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, - HelperClass &Helper, - const DataLayout &DL) { +static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset, + Type *LoadTy, IRBuilderBase &Builder, + const DataLayout &DL) { LLVMContext &Ctx = SrcVal->getType()->getContext(); // If two pointers are in the same address space, they have the same size, @@ -421,9 +413,11 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPtrOrPtrVectorTy()) - SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType())); + SrcVal = + Builder.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8)); + SrcVal = + Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; @@ -432,12 +426,12 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, else ShiftAmt = (StoreSize - LoadSize - Offset) * 8; if (ShiftAmt) - SrcVal = Helper.CreateLShr(SrcVal, - ConstantInt::get(SrcVal->getType(), ShiftAmt)); + SrcVal = Builder.CreateLShr(SrcVal, + ConstantInt::get(SrcVal->getType(), ShiftAmt)); if (LoadSize != StoreSize) - SrcVal = Helper.CreateTruncOrBitCast(SrcVal, - IntegerType::get(Ctx, LoadSize * 8)); + SrcVal = Builder.CreateTruncOrBitCast(SrcVal, + IntegerType::get(Ctx, LoadSize * 8)); return SrcVal; } @@ -450,14 +444,12 @@ Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, IRBuilder<> Builder(InsertPt); SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL); - return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL); + return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL); } Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset, Type *LoadTy, const DataLayout &DL) { - ConstantFolder F; - SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL); - return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL); + return ConstantFoldLoadFromConst(SrcVal, LoadTy, APInt(32, Offset), DL); } /// This function is called when we have a memdep query of a load that ends up @@ -522,75 +514,77 @@ Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset, return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL); } -template -T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset, - Type *LoadTy, HelperClass &Helper, - const DataLayout &DL) { +/// This function is called when we have a +/// memdep query of a load that ends up being a clobbering mem intrinsic. +Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, + Type *LoadTy, Instruction *InsertPt, + const DataLayout &DL) { LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8; + IRBuilder<> Builder(InsertPt); // We know that this method is only called when the mem transfer fully // provides the bits for the load. 
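Two bit-level tricks in this file can be sanity-checked with plain integers. The first models the shift-and-truncate sequence shown above, which carves a narrow load out of a wider stored value; the second models the byte-splat loop in the memset path just below. Both functions are illustrative stand-ins, assuming at most 64-bit scalars in place of IRBuilder values:

#include <cassert>
#include <cstdint>

// Model of getStoreValueForLoadHelper: what a LoadSize-byte load at byte
// Offset reads out of a StoreSize-byte stored integer. The shift mirrors
// CreateLShr, the mask mirrors CreateTruncOrBitCast, and LittleEndian
// stands in for DataLayout::isLittleEndian().
uint64_t extractLoadBits(uint64_t StoredVal, unsigned StoreSize,
                         unsigned LoadSize, unsigned Offset,
                         bool LittleEndian) {
  assert(Offset + LoadSize <= StoreSize && StoreSize <= 8);
  unsigned ShiftAmt =
      LittleEndian ? Offset * 8 : (StoreSize - LoadSize - Offset) * 8;
  uint64_t Val = StoredVal >> ShiftAmt;
  if (LoadSize < 8)
    Val &= (uint64_t(1) << (LoadSize * 8)) - 1;
  return Val; // extractLoadBits(0x1122334455667788, 8, 2, 1, true) == 0x6677
}

// Model of the memset splat loop in getMemInstValueForLoad: widen one memset
// byte to LoadSize bytes by shift-and-or, doubling the filled prefix when
// possible. The constant path reaches the same value in one step via
// APInt::getSplat.
uint64_t splatByte(uint8_t Byte, unsigned LoadSize) {
  assert(LoadSize >= 1 && LoadSize <= 8);
  uint64_t Val = Byte, OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) { // Double the number of bytes set.
      Val |= Val << (NumBytesSet * 8);
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8); // Otherwise append a single byte.
    ++NumBytesSet;
  }
  return Val; // splatByte(0xAB, 4) == 0xABABABAB
}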
if (MemSetInst *MSI = dyn_cast(SrcInst)) { // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and // independently of what the offset is. - T *Val = cast(MSI->getValue()); + Value *Val = MSI->getValue(); if (LoadSize != 1) Val = - Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8)); - T *OneElt = Val; + Builder.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8)); + Value *OneElt = Val; // Splat the value out to the right number of bits. for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) { // If we can double the number of bytes set, do it. if (NumBytesSet * 2 <= LoadSize) { - T *ShVal = Helper.CreateShl( + Value *ShVal = Builder.CreateShl( Val, ConstantInt::get(Val->getType(), NumBytesSet * 8)); - Val = Helper.CreateOr(Val, ShVal); + Val = Builder.CreateOr(Val, ShVal); NumBytesSet <<= 1; continue; } // Otherwise insert one byte at a time. - T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8)); - Val = Helper.CreateOr(OneElt, ShVal); + Value *ShVal = + Builder.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8)); + Val = Builder.CreateOr(OneElt, ShVal); ++NumBytesSet; } - return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL); + return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL); } // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast(SrcInst); Constant *Src = cast(MTI->getSource()); - - // Otherwise, see if we can constant fold a load from the constant with the - // offset applied as appropriate. unsigned IndexSize = DL.getIndexTypeSizeInBits(Src->getType()); - return ConstantFoldLoadFromConstPtr( - Src, LoadTy, APInt(IndexSize, Offset), DL); -} - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering mem intrinsic. -Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, - Type *LoadTy, Instruction *InsertPt, - const DataLayout &DL) { - IRBuilder<> Builder(InsertPt); - return getMemInstValueForLoadHelper>(SrcInst, Offset, - LoadTy, Builder, DL); + return ConstantFoldLoadFromConstPtr(Src, LoadTy, APInt(IndexSize, Offset), + DL); } Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, const DataLayout &DL) { - // The only case analyzeLoadFromClobberingMemInst cannot be converted to a - // constant is when it's a memset of a non-constant. - if (auto *MSI = dyn_cast(SrcInst)) - if (!isa(MSI->getValue())) + LLVMContext &Ctx = LoadTy->getContext(); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8; + + // We know that this method is only called when the mem transfer fully + // provides the bits for the load. + if (MemSetInst *MSI = dyn_cast(SrcInst)) { + auto *Val = dyn_cast(MSI->getValue()); + if (!Val) return nullptr; - ConstantFolder F; - return getMemInstValueForLoadHelper(SrcInst, Offset, - LoadTy, F, DL); + + Val = ConstantInt::get(Ctx, APInt::getSplat(LoadSize * 8, Val->getValue())); + return ConstantFoldLoadFromConst(Val, LoadTy, DL); + } + + // Otherwise, this is a memcpy/memmove from a constant global. 
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+  unsigned IndexSize = DL.getIndexTypeSizeInBits(Src->getType());
+  return ConstantFoldLoadFromConstPtr(Src, LoadTy, APInt(IndexSize, Offset),
+                                      DL);
 }
 } // namespace VNCoercion
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97c2acb7d4c7..f59fc3a6dd60 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -62,14 +62,13 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -497,7 +496,7 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
   if (PtrDelta.urem(Stride) != 0)
     return false;
   unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
-  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);

   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
@@ -1298,10 +1297,16 @@ bool Vectorizer::vectorizeLoadChain(
     CV->replaceAllUsesWith(V);
   }

-  // Bitcast might not be an Instruction, if the value being loaded is a
-  // constant. In that case, no need to reorder anything.
-  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
-    reorder(BitcastInst);
+  // Since we might have opaque pointers we might end up using the pointer
+  // operand of the first load (wrt. memory loaded) for the vector load. Since
+  // this first load might not be the first in the block we potentially need
+  // to reorder the pointer operand (and its operands). If we have a bitcast,
+  // though, it might be before the load and should be the reorder start
+  // instruction. "Might" because for opaque pointers the "bitcast" is just
+  // the first load's pointer operand, as opposed to something we inserted at
+  // the right position ourselves.
+  Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
+  reorder((BCInst && BCInst != L0->getPointerOperand()) ?
+              BCInst : LI);

   eraseInstructions(Chain);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 81e5aa223c07..6242d9a93fc1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -17,7 +17,9 @@
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -31,8 +33,6 @@ using namespace PatternMatch;
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME

-extern cl::opt<bool> EnableVPlanPredication;
-
 static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
@@ -439,6 +439,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
   return false;
 }

+/// Returns true if A and B have the same pointer operand or the same SCEV
+/// address.
+static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
+                               StoreInst *B) {
+  // Compare the store instructions themselves.
+  if (A == B)
+    return true;
+
+  // Otherwise compare the pointer operands.
+  Value *APtr = A->getPointerOperand();
+  Value *BPtr = B->getPointerOperand();
+  if (APtr == BPtr)
+    return true;
+
+  // Otherwise compare the address SCEVs.
+  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
+    return true;
+
+  return false;
+}
+
 int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                 Value *Ptr) const {
   const ValueToValueMap &Strides =
@@ -487,7 +507,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
     // FIXME: We skip these checks when VPlan predication is enabled as we
     // want to allow divergent branches. This whole check will be removed
     // once VPlan predication is on by default.
-    if (!EnableVPlanPredication && Br && Br->isConditional() &&
+    if (Br && Br->isConditional() &&
         !TheLoop->isLoopInvariant(Br->getCondition()) &&
         !LI->isLoopHeader(Br->getSuccessor(0)) &&
         !LI->isLoopHeader(Br->getSuccessor(1))) {
@@ -572,7 +592,7 @@ void LoopVectorizationLegality::addInductionPhi(
   // on predicates that only hold within the loop, since allowing the exit
   // currently means re-using this SCEV outside the loop (see PR33706 for more
   // details).
-  if (PSE.getUnionPredicate().isAlwaysTrue()) {
+  if (PSE.getPredicate().isAlwaysTrue()) {
     AllowedExit.insert(Phi);
     AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
   }
@@ -676,7 +696,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         RecurrenceDescriptor RedDes;
         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
-                                                 DT)) {
+                                                 DT, PSE.getSE())) {
           Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
           AllowedExit.insert(RedDes.getLoopExitInstr());
           Reductions[Phi] = RedDes;
@@ -770,7 +790,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         auto *SE = PSE.getSE();
         Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
         for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
-          if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
@@ -849,7 +869,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // used outside the loop only if the SCEV predicates within the loop is
       // same as outside the loop. Allowing the exit means reusing the SCEV
      // outside the loop.
-      if (PSE.getUnionPredicate().isAlwaysTrue()) {
+      if (PSE.getPredicate().isAlwaysTrue()) {
         AllowedExit.insert(&I);
         continue;
       }
@@ -911,15 +931,70 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;

-  if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
-    reportVectorizationFailure("Stores to a uniform address",
-        "write to a loop invariant address could not be vectorized",
-        "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
-    return false;
+  // We can vectorize stores to an invariant address when the final reduction
+  // value is guaranteed to be stored at the end of the loop. Also, once the
+  // decision to vectorize the loop is made, runtime checks are added to make
+  // sure that the invariant address won't alias with any other objects.
+  if (!LAI->getStoresToInvariantAddresses().empty()) {
+    // For each invariant address, check that its last stored value is
+    // unconditional.
+    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+      if (isInvariantStoreOfReduction(SI) &&
+          blockNeedsPredication(SI->getParent())) {
+        reportVectorizationFailure(
+            "We don't allow storing to uniform addresses",
+            "write of conditional recurring variant value to a loop "
+            "invariant address could not be vectorized",
+            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+        return false;
+      }
+    }
+
+    if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+      // For each invariant address, check that its last stored value is the
+      // result of one of our reductions.
+      //
+      // We do not check whether dependences with loads exist, because they
+      // are currently rejected earlier in LoopAccessInfo::analyzeLoop. In
+      // case this behaviour changes, we have to modify this code.
+      ScalarEvolution *SE = PSE.getSE();
+      SmallVector<StoreInst *, 4> UnhandledStores;
+      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+        if (isInvariantStoreOfReduction(SI)) {
+          // Earlier stores to this address are effectively dead code.
+          // With opaque pointers it is possible for one pointer to be used
+          // with different sizes of stored values:
+          //    store i32 0, ptr %x
+          //    store i8 0, ptr %x
+          // The latest store doesn't completely overwrite the first one in
+          // the example. That is why we have to make sure that the types of
+          // the stored values are the same.
+          // TODO: Check that the bitwidth of an unhandled store is smaller
+          // than the one that overwrites it, and add a test.
+          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+            return storeToSameAddress(SE, SI, I) &&
+                   I->getValueOperand()->getType() ==
+                       SI->getValueOperand()->getType();
+          });
+          continue;
+        }
+        UnhandledStores.push_back(SI);
+      }
+
+      bool IsOK = UnhandledStores.empty();
+      // TODO: we should also validate against InvariantMemSets.
+      if (!IsOK) {
+        reportVectorizationFailure(
+            "We don't allow storing to uniform addresses",
+            "write to a loop invariant address could not "
+            "be vectorized",
+            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+        return false;
+      }
+    }
   }

   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
-  PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+  PSE.addPredicate(LAI->getPSE().getPredicate());
   return true;
 }
@@ -949,6 +1024,26 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
       }));
 }

+bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
+  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+    const RecurrenceDescriptor &RdxDesc = Reduction.second;
+    return RdxDesc.IntermediateStore == SI;
+  });
+}
+
+bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
+  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+    const RecurrenceDescriptor &RdxDesc = Reduction.second;
+    if (!RdxDesc.IntermediateStore)
+      return false;
+
+    ScalarEvolution *SE = PSE.getSE();
+    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
+    return V == InvariantAddress ||
+           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
+  });
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
@@ -969,6 +1064,16 @@ LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
   return nullptr;
 }

+const InductionDescriptor *
+LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
+  if (!isInductionPhi(Phi))
+    return nullptr;
+  auto &ID = getInductionVars().find(Phi)->second;
+  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
+    return &ID;
+  return nullptr;
+}
+
 bool LoopVectorizationLegality::isCastedInductionVariable(
     const Value *V) const {
   auto *Inst = dyn_cast<Instruction>(V);
@@ -1266,7 +1371,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

-  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
     reportVectorizationFailure("Too many SCEV checks needed",
         "Too many SCEV assumptions need to be made and checked at runtime",
         "TooManySCEVRunTimeChecks", ORE, TheLoop);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 71eb39a18d2f..0cb2032fa45a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -25,6 +25,7 @@
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H

 #include "VPlan.h"
+#include "llvm/Support/InstructionCost.h"

 namespace llvm {

@@ -59,7 +60,7 @@ class VPBuilder {
   }

 public:
-  VPBuilder() {}
+  VPBuilder() = default;

   /// Clear the insertion point: created instructions will not be inserted into
   /// a block.
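The ScalarCost field added to VectorizationFactor in the hunk just below lets the planner weigh a candidate directly against the scalar loop, per original iteration. A minimal sketch of that trade-off; the in-tree comparison in LoopVectorize.cpp is more involved (it also considers tail folding and scalable widths), so this only illustrates the core inequality:

#include <cstdint>

struct Factor {
  uint64_t Width;      // Lanes per vector iteration.
  uint64_t Cost;       // Cost of one vector iteration.
  uint64_t ScalarCost; // Cost of one scalar iteration.
};

// Vectorizing at Width wins when Cost / Width < ScalarCost; cross-multiply
// to compare without integer division: Cost < ScalarCost * Width.
bool beatsScalarLoop(const Factor &F) {
  return F.Cost < F.ScalarCost * F.Width;
}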
@@ -187,12 +188,16 @@ struct VectorizationFactor { /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -298,8 +303,12 @@ public: /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. + /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue + /// vectorization re-using plans for both the main and epilogue vector loops. + /// It should be removed once the re-use issue has been fixed. void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, - InnerLoopVectorizer &LB, DominatorTree *DT); + InnerLoopVectorizer &LB, DominatorTree *DT, + bool IsEpilogueVectorization); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); @@ -319,6 +328,9 @@ public: getDecisionAndClampRange(const std::function &Predicate, VFRange &Range); + /// Check if the number of runtime checks exceeds the threshold. + bool requiresTooManyRuntimeChecks() const; + protected: /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3290439ecd07..b637b2d5ddae 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,7 +58,6 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanPredicator.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -112,7 +111,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -144,10 +142,10 @@ #include #include #include -#include #include #include #include +#include #include #include #include @@ -346,13 +344,6 @@ cl::opt EnableVPlanNativePath( cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); -// FIXME: Remove this switch once we have divergence analysis. Currently we -// assume divergent non-backedge branches when this switch is true. -cl::opt EnableVPlanPredication( - "enable-vplan-predication", cl::init(false), cl::Hidden, - cl::desc("Enable VPlan-native vectorization path predicator with " - "support for outer loop vectorization.")); - // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the @@ -481,7 +472,7 @@ public: VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(VPTransformState &State); + void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); // Return true if any runtime check is added. 
bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -491,12 +482,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector; - /// Vectorize a single first-order recurrence or pointer induction PHINode in - /// a block. This method handles the induction variable canonicalization. It - /// supports both VF = 1 for unrolled loops and arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, - VPTransformState &State); - /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -506,13 +491,6 @@ public: const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. \p CanonicalIV is the scalar value generated for - /// the canonical induction variable. - void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, - VPTransformState &State, Value *CanonicalIV); - /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State); @@ -527,13 +505,8 @@ public: ArrayRef StoredValues, VPValue *BlockInMask = nullptr); - /// Set the debug location in the builder \p Ptr using the debug location in - /// \p V. If \p Ptr is None then it uses the class member's Builder. - void setDebugLocFromInst(const Value *V, - Optional *> CustomBuilder = None); - - /// Fix the non-induction PHIs in the OrigPHIsToFix vector. - void fixNonInductionPHIs(VPTransformState &State); + /// Fix the non-induction PHIs in \p Plan. + void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. @@ -546,17 +519,6 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef To, Instruction *From); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); @@ -575,13 +537,9 @@ protected: /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock); - - /// Introduce a conditional branch (on true, condition to be set later) at the - /// end of the header=latch connecting it to itself (across the backedge) and - /// to the exit block of \p L. - void createHeaderBranch(Loop *L); + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, BasicBlock *VectorHeader, + VPlan &Plan); /// Handle all cross-iteration phis in the header. 
void fixCrossIterationPHIs(VPTransformState &State); @@ -595,16 +553,9 @@ protected: void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, + void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. - void fixLCSSAPHIs(VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -613,30 +564,11 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction - /// variable on which to base the steps, \p Step is the size of the step, and - /// \p EntryVal is the value from the original loop that maps to the steps. - /// Note that \p EntryVal doesn't have to be an induction variable - it - /// can also be a truncate instruction. - void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID, VPValue *Def, - VPTransformState &State); - - /// Create a vector induction phi node based on an existing scalar one. \p - /// EntryVal is the value from the original loop that maps to the vector phi - /// node, and \p Step is the loop-invariant step. If \p EntryVal is a - /// truncate instruction, instead of widening the original IV, we widen a - /// version of the IV truncated to \p EntryVal's type. - void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, - VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(Loop *NewLoop); + Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector <-> vector types. @@ -645,33 +577,21 @@ protected: /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. Returns the block containing the checks or /// nullptr if no checks have been added. - BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitSCEVChecks(BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. /// Returns the block containing the checks or nullptr if no checks have been /// added. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. 
- /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. - Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, - const DataLayout &DL, - const InductionDescriptor &ID, - BasicBlock *VectorHeader) const; + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop and return it. - Loop *createVectorLoopSkeleton(StringRef Prefix); + /// vector loop preheader, middle block and scalar preheader. + void createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count /// in the scalar epilogue, from where the vectorized loop left off. @@ -680,21 +600,12 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and - /// running the verifier. Take in the vector loop \p L as argument, and return - /// the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); - - /// Add additional metadata to \p To that was not present on \p Orig. - /// - /// Currently this is used to add the noalias annotations based on the - /// inserted memchecks. Use this for instructions that are *cloned* into the - /// vector loop. - void addNewMetadata(Instruction *To, const Instruction *Orig); + /// running the verifier. Return the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -741,13 +652,6 @@ protected: /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - /// LoopVersioning. It's only set up (non-null) if memchecks were - /// used. - /// - /// This is currently only used to add no-alias metadata based on the - /// memchecks. The actually versioning is performed manually. - std::unique_ptr LVer; - /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; @@ -774,9 +678,6 @@ protected: /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; - /// The vector loop body. - BasicBlock *LoopVectorBody; - /// The scalar loop body. BasicBlock *LoopScalarBody; @@ -805,10 +706,6 @@ protected: // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; - // Vector of original scalar PHIs whose corresponding widened PHIs need to be - // fixed up at the end of vector code generation. - SmallVector OrigPHIsToFix; - /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; @@ -936,8 +833,7 @@ protected: /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). 
- BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, - bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -956,7 +852,9 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI, Checks) {} + EPI, LVL, CM, BFI, PSI, Checks) { + TripCount = EPI.TripCount; + } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). std::pair @@ -966,7 +864,7 @@ protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -993,31 +891,6 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -void InnerLoopVectorizer::setDebugLocFromInst( - const Value *V, Optional *> CustomBuilder) { - IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; - if (const Instruction *Inst = dyn_cast_or_null(V)) { - const DILocation *DIL = Inst->getDebugLoc(); - - // When a FSDiscriminator is enabled, we don't need to add the multiply - // factors to the discriminators. - if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa(Inst) && !EnableFSDiscriminator) { - // FIXME: For scalable vectors, assume vscale=1. - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); - if (NewDIL) - B->SetCurrentDebugLocation(NewDIL.getValue()); - else - LLVM_DEBUG(dbgs() - << "Failed to create new discriminator: " - << DIL->getFilename() << " Line: " << DIL->getLine()); - } else - B->SetCurrentDebugLocation(DIL); - } else - B->SetCurrentDebugLocation(DebugLoc()); -} - /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I /// is passed, the message relates to that particular instruction. #ifndef NDEBUG @@ -1059,7 +932,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, namespace llvm { /// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); @@ -1067,12 +940,13 @@ Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, } /// Return the runtime value for VF. -Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); return VF.isScalable() ? 
B.CreateVScale(EC) : EC; } -static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { assert(FTy->isFloatingPointTy() && "Expected floating point type!"); Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); @@ -1119,14 +993,6 @@ static std::string getDebugLocString(const Loop *L) { } #endif -void InnerLoopVectorizer::addNewMetadata(Instruction *To, - const Instruction *Orig) { - // If the loop was versioned with memchecks, add the corresponding no-alias - // metadata. - if (LVer && (isa(Orig) || isa(Orig))) - LVer->annotateInstWithNoAlias(To, Orig); -} - void InnerLoopVectorizer::collectPoisonGeneratingRecipes( VPTransformState &State) { @@ -1151,6 +1017,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // handled. if (isa(CurRec) || isa(CurRec) || + isa(CurRec) || isa(CurRec)) continue; @@ -1176,10 +1043,10 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { if (auto *WidenRec = dyn_cast(&Recipe)) { - Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPDef *AddrDef = WidenRec->getAddr()->getDef(); - if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && - Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + if (AddrDef && WidenRec->isConsecutive() && + Legal->blockNeedsPredication(UnderlyingInstr.getParent())) collectPoisonGeneratingInstrsInBackwardSlice( cast(AddrDef)); } else if (auto *InterleaveRec = dyn_cast(&Recipe)) { @@ -1206,20 +1073,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -void InnerLoopVectorizer::addMetadata(Instruction *To, - Instruction *From) { - propagateMetadata(To, From); - addNewMetadata(To, From); -} - -void InnerLoopVectorizer::addMetadata(ArrayRef To, - Instruction *From) { - for (Value *V : To) { - if (Instruction *I = dyn_cast(V)) - addMetadata(I, From); - } -} - PHINode *InnerLoopVectorizer::getReductionResumeValue( const RecurrenceDescriptor &RdxDesc) { auto It = ReductionResumeValues.find(&RdxDesc); @@ -1363,7 +1216,7 @@ public: /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. - bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { return !Hints->allowReordering() && RdxDesc.isOrdered(); } @@ -1701,6 +1554,11 @@ public: private: unsigned NumPredStores = 0; + /// Convenience function that returns the value of vscale_range iff + /// vscale_range.min == vscale_range.max or otherwise returns the value + /// returned by the corresponding TLI method. + Optional getVScaleForTuning() const; + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is @@ -1713,15 +1571,10 @@ private: /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. 
- /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure - /// issue that occurred on one of the buildbots which cannot be reproduced - /// without having access to the properietary compiler (see comments on - /// D98509). The issue is currently under investigation and this workaround - /// will be removed as soon as possible. ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, + ElementCount MaxSafeVF, bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number @@ -2012,7 +1865,7 @@ public: /// there is no vector code generation, the check blocks are removed /// completely. void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVUnionPredicate &UnionPred) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2035,9 +1888,19 @@ public: MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + auto DiffChecks = RtPtrChecking.getDiffChecks(); + if (DiffChecks) { + MemRuntimeCheckCond = addDiffRuntimeChecks( + MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, + [VF](IRBuilderBase &B, unsigned Bits) { + return getRuntimeVF(B, B.getIntNTy(Bits), VF); + }, + IC); + } else { + MemRuntimeCheckCond = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); @@ -2109,12 +1972,16 @@ public: /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and /// adjusts the branches to branch to the vector preheader or \p Bypass, /// depending on the generated condition. - BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitSCEVChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader, BasicBlock *LoopExitBlock) { if (!SCEVCheckCond) return nullptr; - if (auto *C = dyn_cast(SCEVCheckCond)) + + Value *Cond = SCEVCheckCond; + // Mark the check as used, to prevent it from being removed during cleanup. + SCEVCheckCond = nullptr; + if (auto *C = dyn_cast(Cond)) if (C->isZero()) return nullptr; @@ -2133,18 +2000,15 @@ public: DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst( - SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); - // Mark the check as used, to prevent it from being removed during cleanup. - SCEVCheckCond = nullptr; + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); return SCEVCheckBlock; } /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts /// the branches to branch to the vector preheader or \p Bypass, depending on /// the generated condition. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader) { // Check if we generated code that checks in runtime if arrays overlap. if (!MemRuntimeCheckCond) @@ -2341,7 +2205,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { /// \p Opcode is relevant for FP induction variable. 
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilder<> &Builder) { + IRBuilderBase &Builder) { assert(VF.isVector() && "only vector VFs are supported"); // Create and check the types. @@ -2357,9 +2221,8 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, // Create a vector of consecutive numbers from zero to VF. VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; if (STy->isFloatingPointTy()) { - InitVecValSTy = + Type *InitVecValSTy = IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); InitVecValVTy = VectorType::get(InitVecValSTy, VLen); } @@ -2389,199 +2252,12 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } -void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; - assert((isa(EntryVal) || isa(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - if (isa(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = II.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. 
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", - &*LoopVectorBody->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); - Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < UF; ++Part) { - State.set(Def, LastInduction, Part); - - if (isa(EntryVal)) - addMetadata(LastInduction, EntryVal); - - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } - - // Move the last step to the end of the latch block. This ensures consistent - // placement of all induction updates. - auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - auto *Br = cast(LoopVectorLatch->getTerminator()); - LastInduction->moveBefore(Br); - LastInduction->setName("vec.ind.next"); - - VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); - VecInd->addIncoming(LastInduction, LoopVectorLatch); -} - -void InnerLoopVectorizer::widenIntOrFpInduction( - PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, - Value *CanonicalIV) { - Value *Start = Def->getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = Def->getInductionDescriptor(); - TruncInst *Trunc = Def->getTruncInst(); - IRBuilder<> &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - assert(!State.VF.isZero() && "VF must be non-zero"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? cast(Trunc) : IV; - - auto &DL = EntryVal->getModule()->getDataLayout(); - - // Generate code for the induction step. Note that induction steps are - // required to be loop-invariant - auto CreateStepValue = [&](const SCEV *Step) -> Value * { - assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && - "Induction step should be loop invariant"); - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), - State.CFG.VectorPreHeader->getTerminator()); - } - return cast(Step)->getValue(); - }; - - // The scalar value to broadcast. This is derived from the canonical - // induction variable. If a truncation type is given, truncate the canonical - // induction variable and step. Otherwise, derive these values from the - // induction descriptor. - auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = CanonicalIV; - Type *NeededType = IV->getType(); - if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { - ScalarIV = - NeededType->isIntegerTy() - ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) - : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, - State.CFG.PrevBB); - ScalarIV->setName("offset.idx"); - } - if (Trunc) { - auto *TruncType = cast(Trunc->getType()); - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); - Step = Builder.CreateTrunc(Step, TruncType); - } - return ScalarIV; - }; - - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (ID.getInductionBinOp() && isa(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with creating the step value. 
- Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isScalar()) { - Value *ScalarIV = CreateScalarIV(Step); - Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), - Step->getType()->getScalarSizeInBits()); - - Instruction::BinaryOps IncOp = ID.getInductionOpcode(); - if (IncOp == Instruction::BinaryOpsEnd) - IncOp = Instruction::Add; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx = ConstantInt::get(ScalarTy, Part); - Instruction::BinaryOps MulOp = Instruction::Mul; - if (Step->getType()->isFloatingPointTy()) { - StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); - MulOp = Instruction::FMul; - } - - Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); - State.set(Def, EntryPart, Part); - if (Trunc) { - assert(!Step->getType()->isFloatingPointTy() && - "fp inductions shouldn't be truncated"); - addMetadata(EntryPart, Trunc); - } - } - return; - } - - // Create a new independent vector induction variable, if one is needed. - if (Def->needsVectorIV()) - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); - - if (Def->needsScalarIV()) { - // Create scalar steps that can be used by instructions we will later - // scalarize. Note that the addition of the scalar steps will not increase - // the number of instructions in the loop in the common case prior to - // InstCombine. We will be trading one vector extract for each scalar step. - Value *ScalarIV = CreateScalarIV(Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); - } -} - -void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Instruction *EntryVal, - const InductionDescriptor &ID, - VPValue *Def, - VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; +/// Compute scalar induction steps. \p ScalarIV is the scalar induction +/// variable on which to base the steps, \p Step is the size of the step. +static void buildScalarSteps(Value *ScalarIV, Value *Step, + const InductionDescriptor &ID, VPValue *Def, + VPTransformState &State) { + IRBuilderBase &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. @@ -2652,6 +2328,103 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, } } +// Generate code for the induction step. Note that induction steps are +// required to be loop-invariant +static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, + Instruction *InsertBefore, + Loop *OrigLoop = nullptr) { + const DataLayout &DL = SE.getDataLayout(); + assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && + "Induction step should be loop invariant"); + if (auto *E = dyn_cast<SCEVUnknown>(Step)) + return E->getValue(); + + SCEVExpander Exp(SE, DL, "induction"); + return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); +} + +/// Compute the transformed value of Index at offset StartValue using step +/// StepValue. +/// For integer induction, returns StartValue + Index * StepValue. +/// For pointer induction, returns StartValue[Index * StepValue]. +/// FIXME: The newly created binary instructions should contain nsw/nuw +/// flags, which can be found from the original scalar operations.
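Put differently, the helper the next hunk introduces computes, per the doc comment above, StartValue + Index * StepValue for integer inductions; pointer inductions feed the same product to a GEP. A one-line sketch with hypothetical names, assuming plain 64-bit integers:

#include <cstdint>

// transformedIndexModel(Start, Index, Step) == Start + Index * Step,
// mirroring the IK_IntInduction case of the implementation below.
int64_t transformedIndexModel(int64_t StartValue, int64_t Index,
                              int64_t StepValue) {
  return StartValue + Index * StepValue;
}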
+static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, + Value *StartValue, Value *Step, + const InductionDescriptor &ID) { + assert(Index->getType()->getScalarType() == Step->getType() && + "Index scalar type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + // We allow X to be a vector type, in which case Y will potentially be + // splatted into a vector with the same element count. + auto CreateMul = [&B](Value *X, Value *Y) { + assert(X->getType()->getScalarType() == Y->getType() && + "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isOne()) + return X; + VectorType *XVTy = dyn_cast<VectorType>(X->getType()); + if (XVTy && !isa<VectorType>(Y->getType())) + Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); + return B.CreateMul(X, Y); + }; + + switch (ID.getKind()) { + case InductionDescriptor::IK_IntInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for integer inductions yet"); + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) + return B.CreateSub(StartValue, Index); + auto *Offset = CreateMul(Index, Step); + return CreateAdd(StartValue, Offset); + } + case InductionDescriptor::IK_PtrInduction: { + assert(isa<Constant>(Step) && + "Expected constant step for pointer induction"); + return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + } + case InductionDescriptor::IK_FpInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for FP inductions yet"); + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + auto InductionBinOp = ID.getInductionBinOp(); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *MulExp = B.CreateFMul(Step, Index); + return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); + } + case InductionDescriptor::IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2734,7 +2507,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - setDebugLocFromInst(AddrPart); + State.setDebugLocFromInst(AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0.
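The address adjustment mentioned above can be pictured with a small sketch (hypothetical helper, unit element stride within the group assumed): if the first member of an interleave group that the vectorizer visits sits at index k within the group, the emitted base address is shifted back k elements so it points at member 0.

// Given the address of the group member at index MemberIndex, return the
// address of the group's index-0 member.
const int *groupBaseAddress(const int *MemberAddr, unsigned MemberIndex) {
  return MemberAddr - MemberIndex;
}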
@@ -2760,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } - setDebugLocFromInst(Instr); + State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; @@ -2915,8 +2688,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (!Instance.isFirstIteration()) return; - setDebugLocFromInst(Instr); - // Does this instruction return a value? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2933,21 +2704,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); - State.Builder.SetInsertPoint(Builder.GetInsertBlock(), - Builder.GetInsertPoint()); + if (Instr->getDebugLoc()) + State.setDebugLocFromInst(Instr); + // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; VPValue *Operand = I.value(); - if (State.Plan->isUniformAfterVectorization(Operand)) + VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); + if (OperandR && OperandR->isUniform()) InputInstance.Lane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } - addNewMetadata(Cloned, Instr); + State.addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + State.Builder.Insert(Cloned); State.set(RepRecipe, Cloned, Instance); @@ -2960,29 +2733,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -void InnerLoopVectorizer::createHeaderBranch(Loop *L) { - BasicBlock *Header = L->getHeader(); - assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - - IRBuilder<> B(Header->getTerminator()); - Instruction *OldInst = - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - setDebugLocFromInst(OldInst, &B); - - // Connect the header to the exit and header blocks and replace the old - // terminator. - B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); - - // Now we have two terminators. Remove the old one from the block. - Header->getTerminator()->eraseFromParent(); -} - -Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { +Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { if (TripCount) return TripCount; - assert(L && "Create Trip Count for null loop."); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + assert(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); @@ -3006,7 +2762,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. @@ -3014,22 +2770,23 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); if (TripCount->getType()->isPointerTy()) TripCount = CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { +Value * +InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(L); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *TC = getOrCreateTripCount(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. @@ -3041,6 +2798,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // overflows: the vector induction variable will eventually wrap to zero given // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); @@ -3103,9 +2862,8 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, - BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(L); +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -3120,10 +2878,23 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. + Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); + Value *Step = createStepForVF(Builder, CountTy, VF, UF); + if (!Cost->foldTailByMasking()) CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + else if (VF.isScalable()) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. + Value *MaxUIntTripCount = + ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); + Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } // Create new preheader for vector loop.
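A worked instance of the (UMax - n) < (VF * UF) guard above, assuming an i8 trip count for readability: with n = 250 and VF * UF = 16, UMax - n = 255 - 250 = 5, and 5 < 16, so the vector loop is bypassed because stepping the induction variable by 16 could wrap past the trip count. A sketch with hypothetical names:

#include <cstdint>

// Returns true when the vector loop must be skipped: fewer than Step
// iterations remain before the unsigned counter would wrap.
bool skipVectorLoopModel(uint8_t N, uint8_t Step) {
  return static_cast<uint8_t>(UINT8_MAX - N) < Step;
}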
LoopVectorPreHeader = @@ -3148,10 +2919,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, LoopBypassBlocks.push_back(TCCheckBlock); } -BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { BasicBlock *const SCEVCheckBlock = - RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); + RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) return nullptr; @@ -3176,14 +2947,13 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { return SCEVCheckBlock; } -BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, - BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) return nullptr; BasicBlock *const MemCheckBlock = - RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); + RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); // Check if we generated code that checks in runtime if arrays overlap. We put // the checks into a separate block to make the more common case of few @@ -3197,7 +2967,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, "to vectorize."); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", - L->getStartLoc(), L->getHeader()) + OrigLoop->getStartLoc(), + OrigLoop->getHeader()) << "Code-size may be reduced by not forcing " "vectorization, or by source-code modifications " "eliminating the need for runtime checks " @@ -3209,116 +2980,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, AddedSafetyChecks = true; - // We currently don't use LoopVersioning for the actual loop cloning but we - // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); - LVer->prepareNoAliasMetadata(); return MemCheckBlock; } -Value *InnerLoopVectorizer::emitTransformedIndex( - IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID, BasicBlock *VectorHeader) const { - - SCEVExpander Exp(*SE, DL, "induction"); - auto Step = ID.getStep(); - auto StartValue = ID.getStartValue(); - assert(Index->getType()->getScalarType() == Step->getType() && - "Index scalar type does not match StepValue type"); - - // Note: the IR at this point is broken. We cannot use SE to create any new - // SCEV and then expand it, hoping that SCEV's simplification will give us - // a more optimal code. Unfortunately, attempt of doing so on invalid IR may - // lead to various SCEV crashes. So all we can do is to use builder and rely - // on InstCombine for future simplifications. Here we handle some trivial - // cases only. - auto CreateAdd = [&B](Value *X, Value *Y) { - assert(X->getType() == Y->getType() && "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isZero()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isZero()) - return X; - return B.CreateAdd(X, Y); - }; - - // We allow X to be a vector type, in which case Y will potentially be - // splatted into a vector with the same element count.
- auto CreateMul = [&B](Value *X, Value *Y) { - assert(X->getType()->getScalarType() == Y->getType() && - "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isOne()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isOne()) - return X; - VectorType *XVTy = dyn_cast<VectorType>(X->getType()); - if (XVTy && !isa<VectorType>(Y->getType())) - Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); - return B.CreateMul(X, Y); - }; - - // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=VectorHeader), because - // the DomTree is not kept up-to-date for additional blocks generated in the - // vector loop. By using the header as insertion point, we guarantee that the - // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B, VectorHeader]() { - BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); - if (InsertBB != LoopVectorBody && - LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) - return VectorHeader->getTerminator(); - return &*B.GetInsertPoint(); - }; - - switch (ID.getKind()) { - case InductionDescriptor::IK_IntInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for integer inductions yet"); - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) - return B.CreateSub(StartValue, Index); - auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); - return CreateAdd(StartValue, Offset); - } - case InductionDescriptor::IK_PtrInduction: { - assert(isa<SCEVConstant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP( - ID.getElementType(), StartValue, - CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType()->getScalarType(), - GetInsertPoint()))); - } - case InductionDescriptor::IK_FpInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for FP inductions yet"); - assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); - auto InductionBinOp = ID.getInductionBinOp(); - assert(InductionBinOp && - (InductionBinOp->getOpcode() == Instruction::FAdd || - InductionBinOp->getOpcode() == Instruction::FSub) && - "Original bin op should be defined for FP induction"); - - Value *StepValue = cast<SCEVConstant>(Step)->getValue(); - Value *MulExp = B.CreateFMul(StepValue, Index); - return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, - "induction"); - } - case InductionDescriptor::IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); -} - -Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -3350,43 +3015,24 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - // We intentionally don't let SplitBlock update LoopInfo since - // LoopVectorBody should belong to another loop than LoopVectorPreHeader. - // LoopVectorBody is explicitly added to the correct place few lines later. - LoopVectorBody = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, Twine(Prefix) + "vector.body"); - - // Update dominator for loop exit.
+ // Update dominator for loop exit. During skeleton creation, only the vector + // pre-header and the middle block are created. The vector loop is entirely + // created during VPlan execution. if (!Cost->requiresScalarEpilogue(VF)) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); - - // Create and register the new vector loop. - Loop *Lp = LI->AllocateLoop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks - // before calling any utilities such as SCEV that require valid LoopInfo. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - } else { - LI->addTopLevelLoop(Lp); - } - Lp->addBasicBlockToLoop(LoopVectorBody, *LI); - return Lp; } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { + std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - assert(VectorTripCount && L && "Expected valid arguments"); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + assert(VectorTripCount && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3399,19 +3045,13 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = - PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getTerminator()); - // Copy original phi DL over to the new one. - BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = VectorTripCount; } else { - IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(LoopVectorPreHeader->getTerminator()); // Fast-math-flags propagate from the original induction instruction. if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) @@ -3420,10 +3060,10 @@ void InnerLoopVectorizer::createInductionResumeValues( Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); - const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable).
@@ -3431,13 +3071,23 @@ void InnerLoopVectorizer::createInductionResumeValues( B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + VTC = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); } } + + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); + // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); @@ -3456,13 +3106,10 @@ void InnerLoopVectorizer::createInductionResumeValues( } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, - MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3487,14 +3134,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } - // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == L->getLoopPreheader() && - "Inconsistent vector loop preheader"); - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); - LI->verify(*DT); #endif return LoopVectorPreHeader; @@ -3517,7 +3158,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { |/ | | v | [ ] \ - | [ ]_| <-- vector loop. + | [ ]_| <-- vector loop (created during VPlan execution). | | | v \ -[ ] <--- middle-block. @@ -3544,34 +3185,32 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() // simply happens to be prone to hitting this in practice. In theory, we // can hit the same issue for any SCEV, or ValueTracking query done during // mutation. See PR49900. - getOrCreateTripCount(OrigLoop); + getOrCreateTripCount(OrigLoop->getLoopPreheader()); // Create an empty vector loop, and prepare basic blocks for the runtime // checks. - Loop *Lp = createVectorLoopSkeleton(""); + createVectorLoopSkeleton(""); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. This check also covers the case where the // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop.
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); + emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - createHeaderBranch(Lp); + emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp); + createInductionResumeValues(); - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -3580,8 +3219,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() // value for the IV when arriving directly from the middle block. void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock) { + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, + BasicBlock *VectorHeader, VPlan &Plan) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3608,8 +3248,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, for (User *U : OrigPhi->users()) { auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { - const DataLayout &DL = - OrigLoop->getHeader()->getModule()->getDataLayout(); assert(isa<PHINode>(UI) && "Expected LCSSA form"); IRBuilder<> B(MiddleBlock->getTerminator()); @@ -3619,15 +3257,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); Value *CountMinusOne = B.CreateSub( - CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); + VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); Value *CMO = !II.getStep()->getType()->isIntegerTy() ? B.CreateCast(Instruction::SIToFP, CountMinusOne, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); + + Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), + VectorHeader->getTerminator()); Value *Escape = - emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3640,8 +3281,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // In this case, if IV1 has an external use, we need to avoid adding both // "last value of IV1" and "penultimate value of IV2". So, verify that we // don't already have an incoming value for the middle block. - if (PHI->getBasicBlockIndex(MiddleBlock) == -1) + if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { PHI->addIncoming(I.second, MiddleBlock); + Plan.removeLiveOut(PHI); + } } } @@ -3920,18 +3563,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { } } -void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, + VPlan &Plan) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine.
if (VF.isVector()) truncateToMinimalBitwidths(State); // Fix widened non-induction PHIs by setting up the PHI operands. - if (OrigPHIsToFix.size()) { - assert(EnableVPlanNativePath && - "Unexpected non-induction PHIs for fixup in non VPlan-native path"); - fixNonInductionPHIs(State); - } + if (EnableVPlanNativePath) + fixNonInductionPHIs(Plan, State); // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI @@ -3942,24 +3583,37 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); - // If we inserted an edge from the middle block to the unique exit block, - // update uses outside the loop (phis) to account for the newly inserted - // edge. - if (!Cost->requiresScalarEpilogue(VF)) { + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); + if (Cost->requiresScalarEpilogue(VF)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + Plan.clearLiveOuts(); + } else { + // If we inserted an edge from the middle block to the unique exit block, + // update uses outside the loop (phis) to account for the newly inserted + // edge. + // Fix-up external users of the induction variables. for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), - IVEndValues[Entry.first], LoopMiddleBlock); - - fixLCSSAPHIs(State); + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), + IVEndValues[Entry.first], LoopMiddleBlock, + VectorLoop->getHeader(), Plan); } + // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated + // in the exit block, so update the builder. + State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); + for (auto &KV : Plan.getLiveOuts()) + KV.second->fixPhi(Plan, State); + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); // Remove redundant induction instructions. - cse(LoopVectorBody); + cse(VectorLoop->getHeader()); // Set/update profile weights for the vector and remainder loops as original // loop iterations are now distributed among them. Note that original loop @@ -3974,9 +3628,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // For scalable vectorization we can't know at compile time how many iterations // of the loop are handled in one vector iteration, so instead assume a pessimistic // vscale of '1'. - setProfileInfoAfterUnrolling( - LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, + LI->getLoopFor(LoopScalarBody), + VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { @@ -3986,7 +3640,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. 
- VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *Header = + State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) fixReduction(ReductionPhi, State); @@ -4102,8 +3757,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence( // and thus no phis which needed updated. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -4117,14 +3774,14 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - setDebugLocFromInst(ReductionStartValue); + State.setDebugLocFromInst(ReductionStartValue); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. - clearReductionWrapFlags(RdxDesc, State); + clearReductionWrapFlags(PhiR, State); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. @@ -4132,9 +3789,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - setDebugLocFromInst(LoopExitInst); + State.setDebugLocFromInst(LoopExitInst); Type *PhiTy = OrigPhi->getType(); + + VPBasicBlock *LatchVPBB = + PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); + BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former.
For an inloop reduction the reduction will already @@ -4142,17 +3803,20 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); - Value *Sel = nullptr; + SelectInst *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa<SelectInst>(U)) { assert(!Sel && "Reduction exit feeding two selects"); - Sel = U; + Sel = cast<SelectInst>(U); } else assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); State.reset(LoopExitInstDef, Sel, Part); + if (isa<FPMathOperator>(Sel)) + Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); + // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, @@ -4164,8 +3828,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part)); - VecRdxPhi->setIncomingValueForBlock( - LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); } } } @@ -4176,8 +3839,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint( - LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); + Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = State.get(LoopExitInstDef, Part); @@ -4208,7 +3870,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); if (PhiR->isOrdered()) ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); else { @@ -4265,6 +3927,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4273,8 +3946,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // fixFirstOrderRecurrence for a more complete explanation of the logic.
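The Select that tail folding introduces (located in the foldTailByMasking block above by scanning the users of VecLoopExitInst) has simple per-lane semantics; a scalar sketch with hypothetical names:

#include <cstdint>

// Active lanes take the freshly computed reduction value; masked-off tail
// lanes keep the value the phi carried in, so they do not perturb the result.
int64_t maskedReductionLane(bool LaneActive, int64_t NewRdxVal,
                            int64_t PhiVal) {
  return LaneActive ? NewRdxVal : PhiVal;
}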
if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. @@ -4287,63 +3962,35 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, +void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State) { + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); if (RK != RecurKind::Add && RK != RecurKind::Mul) return; - Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); - assert(LoopExitInstr && "null loop exit instruction"); - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - Worklist.push_back(LoopExitInstr); - Visited.insert(LoopExitInstr); + SmallVector<VPValue *, 8> Worklist; + SmallPtrSet<VPValue *, 8> Visited; + Worklist.push_back(PhiR); + Visited.insert(PhiR); while (!Worklist.empty()) { - Instruction *Cur = Worklist.pop_back_val(); - if (isa<OverflowingBinaryOperator>(Cur)) - for (unsigned Part = 0; Part < UF; ++Part) { - // FIXME: Should not rely on getVPValue at this point. - Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); - cast<Instruction>(V)->dropPoisonGeneratingFlags(); + VPValue *Cur = Worklist.pop_back_val(); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = State.get(Cur, Part); + if (!isa<OverflowingBinaryOperator>(V)) + break; + cast<Instruction>(V)->dropPoisonGeneratingFlags(); } - for (User *U : Cur->users()) { - Instruction *UI = cast<Instruction>(U); - if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && - Visited.insert(UI).second) - Worklist.push_back(UI); - } - } -} - -void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - - VPLane Lane = VPLane::getFirstLane(); - if (isa<Instruction>(IncomingValue) && - !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), - VF)) - Lane = VPLane::getLastLaneForVF(VF); - - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - // FIXME: Should not rely on getVPValue at this point. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - OrigLoop->isLoopInvariant(IncomingValue) - ?
IncomingValue - : State.get(State.Plan->getVPValue(IncomingValue, true), - VPIteration(UF - 1, Lane)); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + for (VPUser *U : Cur->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + for (VPValue *V : UserRecipe->definedValues()) + if (Visited.insert(V).second) + Worklist.push_back(V); + } } } @@ -4421,17 +4068,23 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { - for (PHINode *OrigPhi : OrigPHIsToFix) { - VPWidenPHIRecipe *VPPhi = - cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); - PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); - // Make sure the builder has a valid insert point. - Builder.SetInsertPoint(NewPhi); - for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { - VPValue *Inc = VPPhi->getIncomingValue(i); - VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); +void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, + VPTransformState &State) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &P : VPBB->phis()) { + VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); + if (!VPPhi) + continue; + PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + } } } } @@ -4441,139 +4094,6 @@ bool InnerLoopVectorizer::useOrderedReductions( return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - VPWidenPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *P = cast<PHINode>(PN); - if (EnableVPlanNativePath) { - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - Type *VecTy = (State.VF.isScalar()) - ? PN->getType() - : VectorType::get(PN->getType(), State.VF); - Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); - State.set(PhiR, VecPhi, 0); - OrigPHIsToFix.push_back(P); - - return; - } - - assert(PN->getParent() == OrigLoop->getHeader() && - "Non-header phis should have been handled elsewhere"); - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - - assert(!Legal->isReductionVariable(P) && - "reductions should be handled elsewhere"); - - setDebugLocFromInst(P); - - // This PHINode must be an induction variable. - // Make sure that we know about it.
- assert(Legal->getInductionVars().count(P) && "Not an induction variable"); - - InductionDescriptor II = Legal->getInductionVars().lookup(P); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - - auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); - - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - switch (II.getKind()) { - case InductionDescriptor::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case InductionDescriptor::IK_IntInduction: - case InductionDescriptor::IK_FpInduction: - llvm_unreachable("Integer/fp induction is handled elsewhere."); - case InductionDescriptor::IK_PtrInduction: { - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - if (Cost->isScalarAfterVectorization(P, State.VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = - Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); - assert((IsUniform || !State.VF.isScalable()) && - "Cannot scalarize a scalable VF"); - unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *PartStart = - createStepForVF(Builder, PtrInd->getType(), VF, Part); - - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Value *Idx = Builder.CreateAdd( - PartStart, ConstantInt::get(PtrInd->getType(), Lane)); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), - DL, II, State.CFG.PrevBB); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, VPIteration(Part, Lane)); - } - } - return; - } - assert(isa<SCEVConstant>(II.getStep()) && - "Induction step not a SCEV constant!"); - Type *PhiType = II.getStep()->getType(); - - // Build a pointer phi - Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); - Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); - NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); - - // A pointer induction, performed by using a gep - BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - Instruction *InductionLoc = LoopLatch->getTerminator(); - const SCEV *ScalarStep = II.getStep(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *ScalarStepValue = - Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); - Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); - Value *NumUnrolledElems = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); - Value *InductionGEP = GetElementPtrInst::Create( - II.getElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", - InductionLoc); - NewPointerPhi->addIncoming(InductionGEP, LoopLatch); - - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, step*1, ...>) as offset.
- for (unsigned Part = 0; Part < State.UF; ++Part) { - Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); - Value *StartOffset = - Builder.CreateVectorSplat(State.VF, StartOffsetScalar); - // Create a vector of consecutive numbers from zero to VF. - StartOffset = - Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); - - Value *GEP = Builder.CreateGEP( - II.getElementType(), NewPointerPhi, - Builder.CreateMul( - StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), - "vector.gep")); - State.set(PhiR, GEP, Part); - } - } - } -} - /// A helper function for checking whether an integer division-related /// instruction may divide by zero (in which case it must be predicated if /// executed conditionally in the scalar code). @@ -4597,7 +4117,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); Module *M = I.getParent()->getParent()->getParent(); auto *CI = cast<CallInst>(&I); @@ -4627,13 +4147,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) + if (!UseVectorIntrinsic || + !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) Arg = State.get(I.value(), Part); - else { + else Arg = State.get(I.value(), VPIteration(0, 0)); - if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - } + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } @@ -4661,7 +4181,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, V->copyFastMathFlags(CI); State.set(Def, V, Part); - addMetadata(V, &I); + State.addMetadata(V, &I); } } @@ -4672,6 +4192,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); + // This avoids any chances of creating a REPLICATE recipe during planning + // since that would result in generation of scalarized code during execution, + // which is not supported for scalable vectors. + if (VF.isScalable()) { + Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + return; + } + SmallSetVector<Instruction *, 8> Worklist; // These sets are used to seed the analysis with pointers used by memory @@ -4761,7 +4289,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { } // Insert the forced scalars. - // FIXME: Currently widenPHIInstruction() often creates a dead vector + // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. auto ForcedScalar = ForcedScalars.find(VF); if (ForcedScalar != ForcedScalars.end()) @@ -4888,6 +4416,27 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; + // If the group involves a non-integral pointer, we may not be able to + // losslessly cast all values to a common type.
+ unsigned InterleaveFactor = Group->getFactor(); + bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); + for (unsigned i = 0; i < InterleaveFactor; i++) { + Instruction *Member = Group->getMember(i); + if (!Member) + continue; + auto *MemberTy = getLoadStoreType(Member); + bool MemberNI = DL.isNonIntegralPointerType(MemberTy); + // Don't coerce non-integral pointers to integers or vice versa. + if (MemberNI != ScalarNI) { + // TODO: Consider adding special nullptr value case here + return false; + } else if (MemberNI && ScalarNI && + ScalarTy->getPointerAddressSpace() != + MemberTy->getPointerAddressSpace()) { + return false; + } + } + // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps @@ -5170,7 +4719,7 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return true; } - if (!PSE.getUnionPredicate().getPredicates().empty()) { + if (!PSE.getPredicate().isAlwaysTrue()) { reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", "runtime SCEV checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " @@ -5461,14 +5010,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors don't use tail folding for low trip counts or - // optimizing for code size. We only permit this if the user has explicitly - // requested it. - if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && - ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && - MaxFactors.ScalableVF.isVector()) - MaxFactors.ScalableVF = ElementCount::getScalable(0); - // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. @@ -5511,7 +5052,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, bool FoldTailByMasking) { + ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector @@ -5556,9 +5097,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return ElementCount::getFixed(ClampedConstTripCount); } + TargetTransformInfo::RegisterKind RegKind = + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (TTI.shouldMaximizeVectorBandwidth() || - (MaximizeBandwidth && isScalarEpilogueAllowed())) { + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && + TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), ComputeScalableMaxVF); @@ -5596,10 +5140,27 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( MaxVF = MinVF; } } + + // Invalidate any widening decisions we might have made, in case the loop + // requires predication (decided later), but we have already made some + // load/store widening decisions.
+ invalidateCostModelingDecisions(); } return MaxVF; } +Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { + if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + auto Max = Attr.getVScaleRangeMax(); + if (Max && Min == Max) + return Max; + } + + return TTI.getVScaleForTuning(); +} + bool LoopVectorizationCostModel::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; @@ -5624,7 +5185,7 @@ bool LoopVectorizationCostModel::isMoreProfitable( // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { + if (Optional<unsigned> VScale = getVScaleForTuning()) { if (A.Width.isScalable()) EstimatedWidthA *= VScale.getValue(); if (B.Width.isScalable()) @@ -5651,7 +5212,8 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5669,12 +5231,12 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) - AssumedMinimumVscale = VScale.getValue(); + if (Optional<unsigned> VScale = getVScaleForTuning()) + AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale @@ -5862,7 +5424,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0}; + return {ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5885,8 +5447,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( return Result; } + // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know + // the main loop handles 8 lanes per iteration. We could still benefit from + // vectorizing the epilogue loop with VF=4.
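Numerically, the estimate described in the comment above: with MainLoopVF = vscale x 2 and a tuning vscale of 4, the main loop covers 2 * 4 = 8 lanes per iteration, so a fixed epilogue VF of 4 still qualifies even though 4 is not provably less than vscale x 2. A sketch with hypothetical names:

// estimatedRuntimeLanes(2, 4) == 8; a candidate epilogue VF of 4 passes
// the 4 < 8 comparison performed via EstimatedRuntimeVF in the code below.
unsigned estimatedRuntimeLanes(unsigned KnownMinLanes, unsigned VScale) {
  return KnownMinLanes * VScale;
}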
+ ElementCount EstimatedRuntimeVF = MainLoopVF; + if (MainLoopVF.isScalable()) { + EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + if (Optional VScale = getVScaleForTuning()) + EstimatedRuntimeVF *= *VScale; + } + for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && LVP.hasPlanWithVF(NextVF.Width)) Result = NextVF; @@ -6006,6 +5580,18 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + InstructionCost C = expectedCost(VF).first; + assert(C.isValid() && "Expected to have chosen a VF with valid cost"); + LoopCost = *C.getValue(); + + // Loop body is free and there is no need for interleaving. + if (LoopCost == 0) + return 1; + } + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. @@ -6097,16 +5683,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, assert(IC > 0 && "Interleave count must be greater than 0."); - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) { - InstructionCost C = expectedCost(VF).first; - assert(C.isValid() && "Expected to have chosen a VF with valid cost"); - LoopCost = *C.getValue(); - } - - assert(LoopCost && "Non-zero loop cost expected"); - // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. if (VF.isVector() && HasReductions) { @@ -6114,9 +5690,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return IC; } - // Note that if we've already vectorized the loop we will have done the - // runtime check and so interleaving won't require further checks. - bool InterleavingRequiresRuntimePointerCheck = + // For any scalar loop that either requires runtime checks or predication we + // are better off leaving this to the unroller. Note that if we've already + // vectorized the loop we will have done the runtime check and so interleaving + // won't require further checks. 
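
As a concrete instance of the case the comment above carves out, consider a scalar loop whose body needs predication (a hypothetical example, not taken from the patch); interleaving it in the vectorizer would only replicate the branchy body, which the generic unroller already handles:

// VF = 1 and the guarded block needs predication, so with the change
// below the vectorizer declines to interleave this loop itself.
void sparseIncrement(int *Out, const int *Idx, const bool *Cond, int N) {
  for (int I = 0; I < N; ++I)
    if (Cond[I])        // block needs predication
      Out[Idx[I]] += 1; // indirect store, runtime checks likely too
}
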
+ bool ScalarInterleavingRequiresPredication =
+ (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ }));
+ bool ScalarInterleavingRequiresRuntimePointerCheck =
 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
 
 // We want to interleave small loops in order to reduce the loop overhead and
@@ -6126,7 +5708,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
 << "LV: VF is " << VF << '\n');
 const bool AggressivelyInterleaveReductions =
 TTI.enableAggressiveInterleaving(HasReductions);
- if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ if (!ScalarInterleavingRequiresRuntimePointerCheck &&
+ !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
 // We assume that the cost overhead is 1 and we use the cost model
 // to estimate the cost of the loop and interleave until the cost of the
 // loop overhead is about 5% of the cost of the loop.
@@ -6289,16 +5872,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
 
 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
- // A lambda that gets the register usage for the given type and VF.
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
 return 0;
- InstructionCost::CostType RegUsage =
- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
- "Nonsensical values for register usage.");
- return RegUsage;
+ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
 };
 
 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -7049,10 +6626,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
 
 bool TypeNotScalarized = false;
 if (VF.isVector() && VectorTy->isVectorTy()) {
- unsigned NumParts = TTI.getNumberOfParts(VectorTy);
- if (NumParts)
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- else
+ if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
+ if (VF.isScalable())
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar ones.
+ // If we ever find a target which wants to lower scalable vectors
+ // back to scalars, we'll need to update this code to explicitly
+ // ask TTI about the register class uses for each part.
+ TypeNotScalarized = NumParts <= VF.getKnownMinValue(); + else + TypeNotScalarized = NumParts < VF.getKnownMinValue(); + } else C = InstructionCost::getInvalid(); } return VectorizationCostTy(C, TypeNotScalarized); @@ -7128,8 +6712,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { - assert((isa(&I) || !VF.isScalable()) && - "Cannot yet scalarize uniform stores"); Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); } @@ -7487,8 +7069,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (Decision == CM_Scalarize) + if (Decision == CM_Scalarize) { + if (VF.isScalable() && isa(I)) + // We can't scalarize a scalable vector store (even a uniform one + // currently), return an invalid cost so as to prevent vectorization. + return InstructionCost::getInvalid(); Width = ElementCount::getFixed(1); + } } VectorTy = ToVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); @@ -7656,6 +7243,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -7757,7 +7354,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7766,6 +7363,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } +bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { + unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); + return (NumRuntimePointerChecks > + VectorizerParams::RuntimeMemoryCheckThreshold && + !Hints.allowReordering()) || + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; +} + Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7800,7 +7405,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7834,30 +7439,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); - - // Check if it is profitable to vectorize with runtime checks. 
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); - } - } - return SelectedVF; + return CM.selectVectorizationFactor(VFCandidates); } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7910,17 +7492,36 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, - DominatorTree *DT) { + DominatorTree *DT, + bool IsEpilogueVectorization) { LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); // Perform the actual loop transformation. - // 1. Create a new empty loop. Unlink the old loop and connect the new one. + // 1. Set up the skeleton for vectorization, including vector pre-header and + // middle block. The vector loop is created during VPlan execution. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(); + + // Only use noalias metadata when using memory checks guaranteeing no overlap + // across all iterations. + const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && + !LAI->getRuntimePointerChecking()->getDiffChecks()) { + + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + // TODO: Find a better way to re-use LoopVersioning functionality to add + // metadata. + State.LVer = std::make_unique( + *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, + PSE.getSE()); + State.LVer->prepareNoAliasMetadata(); + } + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7936,7 +7537,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 2. Copy and widen instructions from the old loop into the new loop. 
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State); + CanonicalIVStartValue, State, + IsEpilogueVectorization); + BestVPlan.execute(&State); // Keep all loop hints from the original loop on the vector loop (we'll @@ -7947,8 +7550,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - Loop *L = LI->getLoopFor(State.CFG.PrevBB); - if (VectorizedLoopID.hasValue()) + VPBasicBlock *HeaderVPBB = + BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) L->setLoopID(VectorizedLoopID.getValue()); else { // Keep all loop hints from the original loop on the vector loop (we'll @@ -7965,7 +7570,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(State); + ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); } @@ -8036,22 +7641,31 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } std::pair EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton(""); + + // Workaround! Compute the trip count of the original loop and cache it + // before we start modifying the CFG. This code has a systemic problem + // wherein it tries to run analysis over partially constructed IR; this is + // wrong, and not simply for SCEV. The trip count of the original loop + // simply happens to be prone to hitting this in practice. In theory, we + // can hit the same issue for any SCEV, or ValueTracking query done during + // mutation. See PR49900. + getOrCreateTripCount(OrigLoop->getLoopPreheader()); + createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + emitIterationCountCheck(LoopScalarPreHeader, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); // Generate the code to check any assumptions that we've made for SCEV // expressions. - EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); + EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -8060,19 +7674,17 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. 
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; - createHeaderBranch(Lp); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8092,13 +7704,13 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( - Loop *L, BasicBlock *Bypass, bool ForEpilogue) { - assert(L && "Expected valid Loop."); +BasicBlock * +EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, + bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -8157,7 +7769,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( std::pair EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. @@ -8166,7 +7778,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopVectorPreHeader = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, "vec.epilog.ph"); - emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); // Adjust the control flow taking the state info from the main loop @@ -8238,9 +7850,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); - // Generate the induction variable. - createHeaderBranch(Lp); - // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail // iterations left once the vector loop has completed. @@ -8248,15 +7857,15 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. 
- createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues({VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; + return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); @@ -8397,7 +8006,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // constructing the desired canonical IV in the header block as its first // non-phi instructions. assert(CM.foldTailByMasking() && "must fold the tail"); - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = + Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); @@ -8439,8 +8049,6 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - if (VF.isScalar()) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && @@ -8477,11 +8085,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Mask, Consecutive, Reverse); } -static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - LoopVectorizationCostModel &CM, Loop &OrigLoop, - VFRange &Range) { +/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also +/// insert a recipe to expand the step for the induction recipe. +static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( + PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, + const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { @@ -8489,18 +8098,6 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, CM.isProfitableToScalarize(I, VF); }; - bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - // Returns true if we should generate a scalar version of \p IV. 
- if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
- };
- return any_of(PhiOrTrunc->users(), isScalarInst);
- },
- Range);
 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
 [&](ElementCount VF) {
 return ShouldScalarizeInstruction(PhiOrTrunc, VF);
@@ -8508,30 +8105,38 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
 Range);
 assert(IndDesc.getStartValue() ==
 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+ assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
+ "step must be loop invariant");
+
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
- NeedsScalarIV, !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
+ !NeedsScalarIVOnly);
 }
 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
 !NeedsScalarIVOnly);
 }
 
-VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
+VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
 // Check if this is an integer or fp induction. If so, build the recipe that
 // produces its scalar and vector values.
 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
- Range);
+ return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
 
+ // Check if this is pointer induction. If so, build the recipe for it.
+ if (auto *II = Legal->getPointerInductionDescriptor(Phi))
+ return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
+ *PSE.getSE());
 return nullptr;
 }
 
 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
- VPlan &Plan) const {
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
 // Optimize the special case where the source is a constant integer
 // induction variable. Notice that we can only optimize the 'trunc' case
 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8552,7 +8157,8 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
 auto *Phi = cast<PHINode>(I->getOperand(0));
 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
+ return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
 }
 return nullptr;
 }
@@ -8569,13 +8175,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
 return Operands[0];
 }
 
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ // For in-loop reductions, we do not need to create an additional select.
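
The short-circuit implemented below simply forwards the non-reduction operand: the in-loop reduction recipe already folds the edge mask into its own computation, so an extra select would be redundant. A toy model of the two-incoming case (standalone C++ sketch with hypothetical names):

#include <array>
#include <cassert>

// Mirrors "return Operands[Operands[0] == InLoopVal ? 1 : 0];" below.
int blendWithInLoopReduction(std::array<int, 2> Incoming, int InLoopVal) {
  return Incoming[0] == InLoopVal ? Incoming[1] : Incoming[0];
}

int main() {
  assert(blendWithInLoopReduction({7, 42}, 7) == 42);
  assert(blendWithInLoopReduction({7, 42}, 42) == 7);
}
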
+ VPValue *InLoopVal = nullptr;
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ PHINode *PhiOp =
+ dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
+ if (PhiOp && CM.isInLoopReduction(PhiOp)) {
+ assert(!InLoopVal && "Found more than one in-loop reduction!");
+ InLoopVal = Operands[In];
+ }
+ }
+
+ assert((!InLoopVal || NumIncoming == 2) &&
+ "Found an in-loop reduction for PHI with unexpected number of "
+ "incoming values");
+ if (InLoopVal)
+ return Operands[Operands[0] == InLoopVal ? 1 : 0];
+
 // We know that all PHIs in non-header blocks are converted into selects, so
 // we don't have to worry about the insertion order and we can just use the
 // builder. At this point we generate the predication tree. There may be
 // duplications since this is a simple recursive scan, but future
 // optimizations will clean it up.
 
 SmallVector<VPValue *, 2> OperandsWithMask;
- unsigned NumIncoming = Phi->getNumIncomingValues();
 
 for (unsigned In = 0; In < NumIncoming; In++) {
 VPValue *EdgeMask =
@@ -8681,6 +8304,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
 case Instruction::URem:
 case Instruction::Xor:
 case Instruction::ZExt:
+ case Instruction::Freeze:
 return true;
 }
 return false;
@@ -8806,14 +8430,14 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
 Plan->removeVPValueFor(Instr);
 Plan->addVPValue(Instr, PHIRecipe);
 }
- auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
 
 // Note: first set Entry as region entry and then connect successors starting
 // from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
- VPBlockUtils::connectBlocks(Pred, Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exiting);
 
 return Region;
 }
@@ -8822,52 +8446,37 @@ VPRecipeOrVPValueTy
 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 ArrayRef<VPValue *> Operands,
 VFRange &Range, VPlanPtr &Plan) {
- // First, check for specific widening recipes that deal with calls, memory
- // operations, inductions and Phi nodes.
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
-
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
-
+ // First, check for specific widening recipes that deal with inductions, Phi
+ // nodes, calls and memory operations.
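
The VF.isScalar() bail-out added a few lines below relies on LoopVectorizationPlanner::getDecisionAndClampRange, which takes the decision of the first VF in a range and shrinks the range so that every VF left in it agrees with that decision. A simplified standalone model, assuming VFs are consecutive powers of two (ToyRange and decideAndClamp are hypothetical names, not the real implementation):

#include <cassert>
#include <functional>

struct ToyRange { unsigned Start, End; }; // VFs in [Start, End)

bool decideAndClamp(std::function<bool(unsigned)> Pred, ToyRange &R) {
  bool Decision = Pred(R.Start);
  unsigned VF = R.Start * 2;
  while (VF < R.End && Pred(VF) == Decision)
    VF *= 2;
  R.End = VF; // every VF still in [Start, End) shares Decision
  return Decision;
}

int main() {
  ToyRange R{1, 16};
  // "Is this VF scalar?" holds for 1 only, so the range collapses to
  // {1, 2} and the caller returns no widening recipe for it.
  assert(decideAndClamp([](unsigned VF) { return VF == 1; }, R));
  assert(R.End == 2);
}
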
 VPRecipeBase *Recipe;
 if (auto Phi = dyn_cast<PHINode>(Instr)) {
 if (Phi->getParent() != OrigLoop->getHeader())
 return tryToBlend(Phi, Operands, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
 return toVPRecipeResult(Recipe);
 
 VPHeaderPHIRecipe *PhiRecipe = nullptr;
- if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
- VPValue *StartV = Operands[0];
- if (Legal->isReductionVariable(Phi)) {
- const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars().find(Phi)->second;
- assert(RdxDesc.getRecurrenceStartValue() ==
- Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
- CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc));
- } else {
- PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
- }
-
- // Record the incoming value from the backedge, so we can add the incoming
- // value from the backedge after all recipes have been created.
- recordRecipeOf(cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
- PhisToFix.push_back(PhiRecipe);
+ assert((Legal->isReductionVariable(Phi) ||
+ Legal->isFirstOrderRecurrence(Phi)) &&
+ "can only widen reductions and first-order recurrences here");
+ VPValue *StartV = Operands[0];
+ if (Legal->isReductionVariable(Phi)) {
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
+ assert(RdxDesc.getRecurrenceStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc));
 } else {
- // TODO: record backedge value for remaining pointer induction phis.
- assert(Phi->getType()->isPointerTy() &&
- "only pointer phis should be handled here");
- assert(Legal->getInductionVars().count(Phi) &&
- "Not an induction variable");
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
- PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
+ PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
 }
 
+ // Record the incoming value from the backedge, so we can add the incoming
+ // value from the backedge after all recipes have been created.
+ recordRecipeOf(cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ PhisToFix.push_back(PhiRecipe);
 return toVPRecipeResult(PhiRecipe);
 }
 
@@ -8876,6 +8485,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 Range, *Plan)))
 return toVPRecipeResult(Recipe);
 
+ // All widen recipes below deal only with VF > 1.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return nullptr;
+
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+
 if (!shouldWiden(Instr, Range))
 return nullptr;
 
@@ -8949,15 +8569,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
 // BranchOnCount VPInstruction to the latch.
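
Operationally, the canonical-IV recipes described above reduce to the following scalar schema (a behavioral sketch in plain C++, not VPlan code; it assumes the vector loop is entered and the vector trip count is a multiple of VF * UF):

#include <cassert>

unsigned runVectorLoop(unsigned VectorTripCount, unsigned VF, unsigned UF) {
  unsigned CanonicalIV = 0; // VPCanonicalIVPHIRecipe starts at 0
  unsigned Iterations = 0;
  do {
    CanonicalIV += VF * UF; // CanonicalIVIncrement
    ++Iterations;
  } while (CanonicalIV != VectorTripCount); // BranchOnCount exits on equality
  return Iterations;
}

int main() {
  // 64 scalar iterations at VF=4, UF=2: eight lanes retire per iteration.
  assert(runVectorLoop(64, 4, 2) == 8);
}
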
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, bool IsVPlanNative) { + bool HasNUW) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); - if (IsVPlanNative) - Header = cast(Header->getSingleSuccessor()); Header->insert(CanonicalIVPHI, Header->begin()); auto *CanonicalIVIncrement = @@ -8966,11 +8584,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, {CanonicalIVPHI}, DL); CanonicalIVPHI->addOperand(CanonicalIVIncrement); - VPBasicBlock *EB = TopRegion->getExitBasicBlock(); - if (IsVPlanNative) { - EB = cast(EB->getSinglePredecessor()); - EB->setCondBit(nullptr); - } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); auto *BranchOnCount = @@ -8979,6 +8593,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, EB->appendRecipe(BranchOnCount); } +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, + VPBasicBlock *MiddleVPBB, Loop *OrigLoop, + VPlan &Plan) { + BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); + BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); + // Only handle single-exit loops with unique exit blocks for now. + if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) + return; + + // Introduce VPUsers modeling the exit values. + for (PHINode &ExitPhi : ExitBB->phis()) { + Value *IncomingValue = + ExitPhi.getIncomingValueForBlock(ExitingBB); + VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + Plan.addLiveOut(&ExitPhi, V); + } +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter) { @@ -9007,7 +8641,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Phi); for (auto &R : ReductionOperations) { RecipeBuilder.recordRecipeOf(R); - // For min/max reducitons, where we have a pair of icmp/select, we also + // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); @@ -9039,18 +8673,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, with separate header and latch blocks. - VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + // Create initial VPlan skeleton, starting with a block for the pre-header, + // followed by a region for the vector loop, followed by the middle block. The + // skeleton vector loop region contains a header and latch block. 
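
For orientation, the skeleton built below has this shape (VPlan blocks only; the bracketed part is the vector-loop region, and the names match the strings used in the code):

  vector.ph -> [ vector.body -> vector.latch ] -> middle.block
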
+ VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); + auto Plan = std::make_unique(Preheader); + + VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - auto Plan = std::make_unique(TopRegion); + VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), false); + !CM.foldTailByMasking()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9063,11 +8704,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - VPBB->setName(BB->getName()); + if (VPBB != HeaderVPBB) + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. - // TODO: Model and preserve debug instrinsics in VPlan. + // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -9085,6 +8727,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto OpRange = Plan->mapToVPValues(Instr->operands()); Operands = {OpRange.begin(), OpRange.end()}; } + + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( Instr, Operands, Range, Plan)) { // If Instr can be simplified to an existing VPValue, use it. @@ -9135,14 +8785,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = cast(VPBB->getSingleSuccessor()); } + HeaderVPBB->setName("vector.body"); + // Fold the last, empty block into its predecessor. VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); assert(VPBB && "expected to fold last (empty) block"); // After here, VPBB should not be used. VPBB = nullptr; - assert(isa(Plan->getEntry()) && - !Plan->getEntry()->getEntryBasicBlock()->empty() && + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + + assert(isa(Plan->getVectorLoopRegion()) && + !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); @@ -9222,12 +8876,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast(TopRegion->getExit()), Plan, + adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. 
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *RecurPhi = dyn_cast(&R); if (!RecurPhi) continue; @@ -9236,7 +8891,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBasicBlock *InsertBlock = PrevRecipe->getParent(); auto *Region = GetReplicateRegion(PrevRecipe); if (Region) - InsertBlock = cast(Region->getSingleSuccessor()); + InsertBlock = dyn_cast(Region->getSingleSuccessor()); + if (!InsertBlock) { + InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); + VPBlockUtils::insertBlockAfter(InsertBlock, Region); + } if (Region || PrevRecipe->isPhi()) Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); else @@ -9283,13 +8942,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - // From this point onwards, VPlan-to-VPlan transformations may change the plan - // in ways that accessing values using original IR values is incorrect. - Plan->disableValue2VPValue(); - - VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); - std::string PlanName; raw_string_ostream RSO(PlanName); ElementCount VF = Range.Start; @@ -9303,10 +8955,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // From this point onwards, VPlan-to-VPlan transformations may change the plan + // in ways that accessing values using original IR values is incorrect. + Plan->disableValue2VPValue(); + + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); + VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); + VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); + // Fold Exit block into its predecessor if possible. // TODO: Fold block earlier once all VPlan transforms properly maintain a // VPBasicBlock as exit. - VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9331,23 +8993,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VF *= 2) Plan->addVF(VF); - if (EnableVPlanPredication) { - VPlanPredicator VPP(*Plan); - VPP.predicate(); - - // Avoid running transformation to recipes until masked code generation in - // VPlan-native path is in place. - return Plan; - } - SmallPtrSet DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + // Remove the existing terminator of the exiting block of the top-most region. + // A BranchOnCount will be added instead when adding the canonical IV recipes. + auto *Term = + Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); + Term->eraseFromParent(); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, true); + true); return Plan; } @@ -9399,7 +9058,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - auto *CondOp = CM.foldTailByMasking() + auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; @@ -9441,7 +9100,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // dedicated latch block. if (CM.foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); - for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast(&R); if (!PhiR || PhiR->isInLoop()) continue; @@ -9493,7 +9153,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { void VPWidenSelectRecipe::execute(VPTransformState &State) { auto &I = *cast(getUnderlyingInstr()); - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -9508,7 +9168,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.ILV->addMetadata(Sel, &I); + State.addMetadata(Sel, &I); } } @@ -9542,7 +9202,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector Ops; @@ -9565,17 +9225,28 @@ void VPWidenRecipe::execute(VPTransformState &State) { // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.ILV->addMetadata(V, &I); + State.addMetadata(V, &I); } break; } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = cast(&I); - State.ILV->setDebugLocFromInst(Cmp); + State.setDebugLocFromInst(Cmp); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -9589,7 +9260,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } State.set(this, C, Part); - State.ILV->addMetadata(C, &I); + State.addMetadata(C, &I); } break; @@ -9608,7 +9279,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::FPTrunc: case Instruction::BitCast: { auto *CI = cast(&I); - State.ILV->setDebugLocFromInst(CI); + State.setDebugLocFromInst(CI); /// Vectorize casts. 
Type *DestTy = (State.VF.isScalar()) @@ -9619,7 +9290,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { Value *A = State.get(getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); State.set(this, Cast, Part); - State.ILV->addMetadata(Cast, &I); + State.addMetadata(Cast, &I); } break; } @@ -9655,7 +9326,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); State.set(this, EntryPart, Part); - State.ILV->addMetadata(EntryPart, GEP); + State.addMetadata(EntryPart, GEP); } } else { // If the GEP has at least one loop-varying operand, we are sure to @@ -9693,32 +9364,276 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. - auto *NewGEP = IsInBounds - ? State.Builder.CreateInBoundsGEP( - GEP->getSourceElementType(), Ptr, Indices) - : State.Builder.CreateGEP(GEP->getSourceElementType(), - Ptr, Indices); + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP, Part); - State.ILV->addMetadata(NewGEP, GEP); + State.addMetadata(NewGEP, GEP); } } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); - State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa(EntryVal) || isa(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. 
Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa(Mul) + ? ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", + &*State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. + VecInd->addIncoming(LastInduction, VectorPH); +} + +void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { + assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && + "Not a pointer induction according to InductionDescriptor!"); + assert(cast(getUnderlyingInstr())->getType()->isPointerTy() && + "Unexpected type."); + + auto *IVR = getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast(State.get(IVR, 0)); + + if (onlyScalarsGenerated(State.VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = State.Builder.CreateSExtOrTrunc( + CanonicalIV, IndDesc.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + bool IsUniform = vputils::onlyFirstLaneUsed(this); + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *PartStart = + createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); + + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Value *Idx = State.Builder.CreateAdd( + PartStart, ConstantInt::get(PtrInd->getType(), Lane)); + Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); + + Value *Step = CreateStepValue(IndDesc.getStep(), SE, + State.CFG.PrevBB->getTerminator()); + Value *SclrGep = emitTransformedIndex( + State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); + SclrGep->setName("next.gep"); + State.set(this, SclrGep, VPIteration(Part, Lane)); + } + } + return; + } + + assert(isa(IndDesc.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = IndDesc.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + + // A pointer induction, performed by using a gep + const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); + Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); + + const SCEV *ScalarStep = IndDesc.getStep(); + SCEVExpander Exp(SE, DL, "induction"); + Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); + Value *NumUnrolledElems = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); + Value *InductionGEP = GetElementPtrInst::Create( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", + InductionLoc); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. + NewPointerPhi->addIncoming(InductionGEP, VectorPH); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // () as offset. + for (unsigned Part = 0; Part < State.UF; ++Part) { + Type *VecPhiType = VectorType::get(PhiType, State.VF); + Value *StartOffsetScalar = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); + Value *StartOffset = + State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); + // Create a vector of consecutive numbers from zero to VF. + StartOffset = State.Builder.CreateAdd( + StartOffset, State.Builder.CreateStepVector(VecPhiType)); + + Value *GEP = State.Builder.CreateGEP( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul( + StartOffset, + State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), + "vector.gep")); + State.set(this, GEP, Part); + } } -void VPWidenPHIRecipe::execute(VPTransformState &State) { - State.ILV->widenPHIInstruction(cast(getUnderlyingValue()), this, - State); +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); + + // Fast-math-flags propagate from the original induction instruction. 
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (IndDesc.getInductionBinOp() && + isa(IndDesc.getInductionBinOp())) + State.Builder.setFastMathFlags( + IndDesc.getInductionBinOp()->getFastMathFlags()); + + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + auto CreateScalarIV = [&](Value *&Step) -> Value * { + Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + if (!isCanonical() || CanonicalIV->getType() != Ty) { + ScalarIV = + Ty->isIntegerTy() + ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) + : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); + ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, + getStartValue()->getLiveInIRValue(), Step, + IndDesc); + ScalarIV->setName("offset.idx"); + } + if (TruncToTy) { + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); + Step = State.Builder.CreateTrunc(Step, TruncToTy); + } + return ScalarIV; + }; + + Value *ScalarIV = CreateScalarIV(Step); + if (State.VF.isVector()) { + buildScalarSteps(ScalarIV, Step, IndDesc, this, State); + return; + } + + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); + Value *EntryPart; + if (Step->getType()->isFloatingPointTy()) { + Value *StartIdx = + getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); + // Floating-point operations inherit FMF via the builder's flags. + Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); + EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), + ScalarIV, MulOp); + } else { + Value *StartIdx = + getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); + EntryPart = State.Builder.CreateAdd( + ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); + } + State.set(this, EntryPart, Part); + } } void VPBlendRecipe::execute(VPTransformState &State) { - State.ILV->setDebugLocFromInst(Phi, &State.Builder); + State.setDebugLocFromInst(Phi); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. @@ -9979,7 +9894,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // Handle Stores: if (SI) { - State.ILV->setDebugLocFromInst(SI); + State.setDebugLocFromInst(SI); for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; @@ -10005,14 +9920,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } - State.ILV->addMetadata(NewSI, SI); + State.addMetadata(NewSI, SI); } return; } // Handle loads. 
assert(LI && "Must have a load instruction"); - State.ILV->setDebugLocFromInst(LI); + State.setDebugLocFromInst(LI); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { @@ -10020,7 +9935,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Value *VectorGep = State.get(getAddr(), Part); NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); @@ -10033,12 +9948,12 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); if (Reverse) NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(this, NewLI, Part); + State.set(getVPSingleValue(), NewLI, Part); } } @@ -10119,7 +10034,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. - assert(isa(Def->getDef()) && + assert((isa(Def->getDef()) || + isa(Def->getDef())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -10201,8 +10117,7 @@ static bool processLoopInVPlanNativePath( // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. // Also, do not attempt to vectorize if no vector code will be produced. - if (VPlanBuildStressTest || EnableVPlanPredication || - VectorizationFactor::Disabled() == VF) + if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); @@ -10214,7 +10129,7 @@ static bool processLoopInVPlanNativePath( &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); + LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -10282,8 +10197,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ - LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " + LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" + << L->getHeader()->getParent()->getName() << "' from " << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); @@ -10438,10 +10353,30 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); if (MaybeVF) { + if (LVP.requiresTooManyRuntimeChecks()) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return false; + } VF = *MaybeVF; // Select the interleave count. 
IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. + if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10529,14 +10464,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -10546,7 +10473,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10571,12 +10498,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, - DT); + DT, true); ++LoopsVectorized; - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; @@ -10586,23 +10510,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); + Header->setName("vec.epilog.vector.body"); // Ensure that the start values for any VPReductionPHIRecipes are // updated before vectorising the epilogue loop. 
- VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast(&R)) { if (auto *Resume = MainILV.getReductionResumeValue( ReductionPhi->getRecurrenceDescriptor())) { - VPValue *StartVal = new VPValue(Resume); - BestEpiPlan.addExternalDef(StartVal); + VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); ReductionPhi->setOperand(0, StartVal); } } } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT); + DT, true); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) @@ -10612,7 +10537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &LVL, &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10638,7 +10563,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Optional RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID.hasValue()) { + if (RemainderLoopID) { L->setLoopID(RemainderLoopID.getValue()); } else { if (DisableRuntimeUnroll) @@ -10720,8 +10645,12 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult(F); auto &LI = AM.getResult(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult(F); auto &TTI = AM.getResult(F); auto &DT = AM.getResult(F); auto &BFI = AM.getResult(F); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 15b349f53fd9..019a09665a67 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -53,7 +53,6 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -64,7 +63,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -72,8 +70,9 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#ifdef EXPENSIVE_CHECKS #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" +#endif #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -87,6 +86,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include @@ -164,13 +164,14 @@ static cl::opt LookAheadMaxDepth( "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores")); -// The Look-ahead heuristic goes through the users of the bundle to calculate -// the users cost in getExternalUsesCost(). To avoid compilation time increase -// we limit the number of users visited to this value. 
-static cl::opt<int> LookAheadUsersBudget(
-    "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
-    cl::desc("The maximum number of users to visit while visiting the "
-             "predecessors. This prevents compilation time increase."));
+// The maximum depth that the look-ahead score heuristic will explore
+// when probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead, but unlike
+// the similar limit for operand reordering this is used less frequently, so
+// the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+    cl::desc("The maximum look-ahead depth for searching best rooting option"));
 
 static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -571,7 +572,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
             areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
           continue;
       }
-      if (BaseIndex == AltIndex) {
+      if (BaseIndex == AltIndex && BasePred != CurrentPred) {
         assert(isValidForAlternation(Opcode) &&
                isValidForAlternation(InstOpcode) &&
                "Cast isn't safe for alternation, logic needs to be updated!");
@@ -640,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
     CallInst *CI = cast<CallInst>(UserInst);
     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
     for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
-      if (hasVectorInstrinsicScalarOpd(ID, i))
+      if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
         return (CI->getArgOperand(i) == Scalar);
     }
     LLVM_FALLTHROUGH;
@@ -736,29 +737,28 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
 
 /// \returns inserting index of InsertElement or InsertValue instruction,
 /// using Offset as base offset for index.
-static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
+static Optional<unsigned> getInsertIndex(const Value *InsertInst,
+                                         unsigned Offset = 0) {
   int Index = Offset;
-  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
-    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+    if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
       auto *VT = cast<FixedVectorType>(IE->getType());
       if (CI->getValue().uge(VT->getNumElements()))
-        return UndefMaskElem;
+        return None;
       Index *= VT->getNumElements();
       Index += CI->getZExtValue();
       return Index;
     }
-    if (isa<UndefValue>(IE->getOperand(2)))
-      return UndefMaskElem;
     return None;
   }
 
-  auto *IV = cast<InsertValueInst>(InsertInst);
+  const auto *IV = cast<InsertValueInst>(InsertInst);
   Type *CurrentType = IV->getType();
   for (unsigned I : IV->indices()) {
-    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
       Index *= ST->getNumElements();
       CurrentType = ST->getElementType(I);
-    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
       Index *= AT->getNumElements();
       CurrentType = AT->getElementType();
     } else {
@@ -769,11 +769,7 @@ static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
   return Index;
 }
 
-/// Reorders the list of scalars in accordance with the given \p Order and then
-/// the \p Mask. \p Order - is the original order of the scalars, need to
-/// reorder scalars into an unordered state at first according to the given
-/// order. Then the ordered scalars are shuffled once again in accordance with
-/// the provided mask.
+/// Reorders the list of scalars in accordance with the given \p Mask.
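// Concretely (a standalone toy with hypothetical values, mirroring the loop in
// reorderScalars below): the scalar in lane I of the previous order moves to
// lane Mask[I].
#include <array>
#include <cassert>

int main() {
  std::array<int, 3> Scalars = {10, 20, 30};
  const std::array<int, 3> Mask = {1, 2, 0};
  const std::array<int, 3> Prev = Scalars;
  for (int I = 0; I != 3; ++I)
    Scalars[Mask[I]] = Prev[I]; // the same statement the function executes
  assert((Scalars == std::array<int, 3>{30, 10, 20}));
  return 0;
}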
static void reorderScalars(SmallVectorImpl &Scalars, ArrayRef Mask) { assert(!Mask.empty() && "Expected non-empty mask."); @@ -785,6 +781,58 @@ static void reorderScalars(SmallVectorImpl &Scalars, Scalars[Mask[I]] = Prev[I]; } +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return true; + return !mayHaveNonDefUseDependency(*I) && + all_of(I->operands(), [I](Value *V) { + auto *IO = dyn_cast(V); + if (!IO) + return true; + return isa(IO) || IO->getParent() != I->getParent(); + }); +} + +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return true; + // Limits the number of uses to save compile time. + constexpr int UsesLimit = 8; + return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && + all_of(I->users(), [I](User *U) { + auto *IU = dyn_cast(U); + if (!IU) + return true; + return IU->getParent() != I->getParent() || isa(IU); + }); +} + +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V) { + return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); +} + +/// Checks if the specified array of instructions does not require scheduling. +/// It is so if all either instructions have operands that do not require +/// scheduling or their users do not require scheduling since they are phis or +/// in other basic blocks. +static bool doesNotNeedToSchedule(ArrayRef VL) { + return !VL.empty() && + (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -805,8 +853,8 @@ public: TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), - DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { + : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), + DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -847,7 +895,10 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef Roots, - ArrayRef UserIgnoreLst = None); + const SmallDenseSet &UserIgnoreLst); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef Roots); /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. 
\p @@ -868,6 +919,7 @@ public: } MinBWs.clear(); InstrElementSize.clear(); + UserIgnoreList = nullptr; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -881,6 +933,9 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. Optional findReusedOrderedScalars(const TreeEntry &TE); + /// Sort loads into increasing pointers offsets to allow greater clustering. + Optional findPartiallyOrderedLoads(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. @@ -995,96 +1050,18 @@ public: #endif }; - /// A helper data structure to hold the operands of a vector of instructions. - /// This supports a fixed vector length for all operand vectors. - class VLOperands { - /// For each operand we need (i) the value, and (ii) the opcode that it - /// would be attached to if the expression was in a left-linearized form. - /// This is required to avoid illegal operand reordering. - /// For example: - /// \verbatim - /// 0 Op1 - /// |/ - /// Op1 Op2 Linearized + Op2 - /// \ / ----------> |/ - /// - - - /// - /// Op1 - Op2 (0 + Op1) - Op2 - /// \endverbatim - /// - /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. - /// - /// Another way to think of this is to track all the operations across the - /// path from the operand all the way to the root of the tree and to - /// calculate the operation that corresponds to this path. For example, the - /// path from Op2 to the root crosses the RHS of the '-', therefore the - /// corresponding operation is a '-' (which matches the one in the - /// linearized tree, as shown above). - /// - /// For lack of a better term, we refer to this operation as Accumulated - /// Path Operation (APO). - struct OperandData { - OperandData() = default; - OperandData(Value *V, bool APO, bool IsUsed) - : V(V), APO(APO), IsUsed(IsUsed) {} - /// The operand value. - Value *V = nullptr; - /// TreeEntries only allow a single opcode, or an alternate sequence of - /// them (e.g, +, -). Therefore, we can safely use a boolean value for the - /// APO. It is set to 'true' if 'V' is attached to an inverse operation - /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise - /// (e.g., Add/Mul) - bool APO = false; - /// Helper data for the reordering function. - bool IsUsed = false; - }; - - /// During operand reordering, we are trying to select the operand at lane - /// that matches best with the operand at the neighboring lane. Our - /// selection is based on the type of value we are looking for. For example, - /// if the neighboring lane has a load, we need to look for a load that is - /// accessing a consecutive address. These strategies are summarized in the - /// 'ReorderingMode' enumerator. - enum class ReorderingMode { - Load, ///< Matching loads to consecutive memory addresses - Opcode, ///< Matching instructions based on opcode (same or alternate) - Constant, ///< Matching constants - Splat, ///< Matching the same instruction multiple times (broadcast) - Failed, ///< We failed to create a vectorizable group - }; - - using OperandDataVec = SmallVector; - - /// A vector of operand vectors. - SmallVector OpsVec; - + /// A helper class used for scoring candidates for two consecutive lanes. + class LookAheadHeuristics { const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + int NumLanes; // Total number of lanes (aka vectorization factor). 
+ int MaxLevel; // The maximum recursion depth for accumulating score. - /// \returns the operand data at \p OpIdx and \p Lane. - OperandData &getData(unsigned OpIdx, unsigned Lane) { - return OpsVec[OpIdx][Lane]; - } - - /// \returns the operand data at \p OpIdx and \p Lane. Const version. - const OperandData &getData(unsigned OpIdx, unsigned Lane) const { - return OpsVec[OpIdx][Lane]; - } - - /// Clears the used flag for all entries. - void clearUsed() { - for (unsigned OpIdx = 0, NumOperands = getNumOperands(); - OpIdx != NumOperands; ++OpIdx) - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) - OpsVec[OpIdx][Lane].IsUsed = false; - } - - /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. - void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { - std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); - } + public: + LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R, int NumLanes, int MaxLevel) + : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1099,6 +1076,11 @@ public: /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). static const int ScoreConsecutiveLoads = 4; + /// The same load multiple times. This should have a better score than + /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it + /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for + /// a vector load and 1.0 for a broadcast. + static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. @@ -1117,43 +1099,67 @@ public: static const int ScoreUndef = 1; /// Score for failing to find a decent match. static const int ScoreFail = 0; - /// User exteranl to the vectorized code. - static const int ExternalUseCost = 1; - /// The user is internal but in a different lane. - static const int UserInDiffLaneCost = ExternalUseCost; + /// Score if all users are vectorized. + static const int ScoreAllUserVectorized = 1; /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. - static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE, int NumLanes) { - if (V1 == V2) - return VLOperands::ScoreSplat; + /// \p U1 and \p U2 are the users of \p V1 and \p V2. + /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p + /// MainAltOps. + int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, + ArrayRef MainAltOps) const { + if (V1 == V2) { + if (isa(V1)) { + // Retruns true if the users of V1 and V2 won't need to be extracted. + auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) { + // Bail out if we have too many uses to save compilation time. + static constexpr unsigned Limit = 8; + if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit)) + return false; + + auto AllUsersVectorized = [U1, U2, this](Value *V) { + return llvm::all_of(V->users(), [U1, U2, this](Value *U) { + return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr; + }); + }; + return AllUsersVectorized(V1) && AllUsersVectorized(V2); + }; + // A broadcast of a load can be cheaper on some targets. 
+ if (R.TTI->isLegalBroadcastLoad(V1->getType(), + ElementCount::getFixed(NumLanes)) && + ((int)V1->getNumUses() == NumLanes || + AllUsersAreInternal(V1, V2))) + return LookAheadHeuristics::ScoreSplatLoads; + } + return LookAheadHeuristics::ScoreSplat; + } auto *LI1 = dyn_cast(V1); auto *LI2 = dyn_cast(V2); if (LI1 && LI2) { if (LI1->getParent() != LI2->getParent()) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; Optional Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - if (!Dist) - return VLOperands::ScoreFail; + if (!Dist || *Dist == 0) + return LookAheadHeuristics::ScoreFail; // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreAltOpcodes; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. - return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreReversedLoads; + return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads + : LookAheadHeuristics::ScoreReversedLoads; } auto *C1 = dyn_cast(V1); auto *C2 = dyn_cast(V2); if (C1 && C2) - return VLOperands::ScoreConstants; + return LookAheadHeuristics::ScoreConstants; // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. @@ -1162,7 +1168,7 @@ public: if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { // Undefs are always profitable for extractelements. if (isa(V2)) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; Value *EV2 = nullptr; ConstantInt *Ex2Idx = nullptr; if (match(V2, @@ -1170,108 +1176,62 @@ public: m_Undef())))) { // Undefs are always profitable for extractelements. if (!Ex2Idx) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (EV2 == EV1) { int Idx1 = Ex1Idx->getZExtValue(); int Idx2 = Ex2Idx->getZExtValue(); int Dist = Idx2 - Idx1; // The distance is too large - still may be profitable to use // shuffles. + if (std::abs(Dist) == 0) + return LookAheadHeuristics::ScoreSplat; if (std::abs(Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; - return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts - : VLOperands::ScoreReversedExtracts; + return LookAheadHeuristics::ScoreSameOpcode; + return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts + : LookAheadHeuristics::ScoreReversedExtracts; } + return LookAheadHeuristics::ScoreAltOpcodes; } + return LookAheadHeuristics::ScoreFail; } auto *I1 = dyn_cast(V1); auto *I2 = dyn_cast(V2); if (I1 && I2) { if (I1->getParent() != I2->getParent()) - return VLOperands::ScoreFail; - InstructionsState S = getSameOpcode({I1, I2}); + return LookAheadHeuristics::ScoreFail; + SmallVector Ops(MainAltOps.begin(), MainAltOps.end()); + Ops.push_back(I1); + Ops.push_back(I2); + InstructionsState S = getSameOpcode(Ops); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. - if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) - return S.isAltShuffle() ? 
VLOperands::ScoreAltOpcodes - : VLOperands::ScoreSameOpcode; + if (S.getOpcode() && + (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() || + !S.isAltShuffle()) && + all_of(Ops, [&S](Value *V) { + return cast(V)->getNumOperands() == + S.MainOp->getNumOperands(); + })) + return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes + : LookAheadHeuristics::ScoreSameOpcode; } if (isa(V2)) - return VLOperands::ScoreUndef; - - return VLOperands::ScoreFail; - } - - /// Holds the values and their lanes that are taking part in the look-ahead - /// score calculation. This is used in the external uses cost calculation. - /// Need to hold all the lanes in case of splat/broadcast at least to - /// correctly check for the use in the different lane. - SmallDenseMap> InLookAheadValues; - - /// \returns the additional cost due to uses of \p LHS and \p RHS that are - /// either external to the vectorized code, or require shuffling. - int getExternalUsesCost(const std::pair &LHS, - const std::pair &RHS) { - int Cost = 0; - std::array, 2> Values = {{LHS, RHS}}; - for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { - Value *V = Values[Idx].first; - if (isa(V)) { - // Since this is a function pass, it doesn't make semantic sense to - // walk the users of a subclass of Constant. The users could be in - // another function, or even another module that happens to be in - // the same LLVMContext. - continue; - } + return LookAheadHeuristics::ScoreUndef; - // Calculate the absolute lane, using the minimum relative lane of LHS - // and RHS as base and Idx as the offset. - int Ln = std::min(LHS.second, RHS.second) + Idx; - assert(Ln >= 0 && "Bad lane calculation"); - unsigned UsersBudget = LookAheadUsersBudget; - for (User *U : V->users()) { - if (const TreeEntry *UserTE = R.getTreeEntry(U)) { - // The user is in the VectorizableTree. Check if we need to insert. - int UserLn = UserTE->findLaneForValue(U); - assert(UserLn >= 0 && "Bad lane"); - // If the values are different, check just the line of the current - // value. If the values are the same, need to add UserInDiffLaneCost - // only if UserLn does not match both line numbers. - if ((LHS.first != RHS.first && UserLn != Ln) || - (LHS.first == RHS.first && UserLn != LHS.second && - UserLn != RHS.second)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // Check if the user is in the look-ahead code. - auto It2 = InLookAheadValues.find(U); - if (It2 != InLookAheadValues.end()) { - // The user is in the look-ahead code. Check the lane. - if (!It2->getSecond().contains(Ln)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // The user is neither in SLP tree nor in the look-ahead code. - Cost += ExternalUseCost; - break; - } - } - // Limit the number of visited uses to cap compilation time. - if (--UsersBudget == 0) - break; - } - } - return Cost; + return LookAheadHeuristics::ScoreFail; } - /// Go through the operands of \p LHS and \p RHS recursively until \p - /// MaxLevel, and return the cummulative score. For example: + /// Go through the operands of \p LHS and \p RHS recursively until + /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are + /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands + /// of \p U1 and \p U2), except at the beginning of the recursion where + /// these are set to nullptr. + /// + /// For example: /// \verbatim /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] /// \ / \ / \ / \ / @@ -1282,8 +1242,8 @@ public: /// each level recursively, accumulating the score. 
It starts from matching /// the additions at level 0, then moves on to the loads (level 1). The /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and - /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while - /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. /// Please note that the order of the operands does not matter, as we /// evaluate the score of all profitable combinations of operands. In /// other words the score of G1 and G4 is the same as G1 and G2. This @@ -1291,18 +1251,13 @@ public: /// Look-ahead SLP: Auto-vectorization in the presence of commutative /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, /// Luís F. W. Góes - int getScoreAtLevelRec(const std::pair &LHS, - const std::pair &RHS, int CurrLevel, - int MaxLevel) { + int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, + Instruction *U2, int CurrLevel, + ArrayRef MainAltOps) const { - Value *V1 = LHS.first; - Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = std::max( - (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - - getExternalUsesCost(LHS, RHS)); - int Lane1 = LHS.second; - int Lane2 = RHS.second; + int ShallowScoreAtThisLevel = + getShallowScore(LHS, RHS, U1, U2, MainAltOps); // If reached MaxLevel, // or if V1 and V2 are not instructions, @@ -1310,20 +1265,17 @@ public: // or if they are not consecutive, // or if profitable to vectorize loads or extractelements, early return // the current cost. - auto *I1 = dyn_cast(V1); - auto *I2 = dyn_cast(V2); + auto *I1 = dyn_cast(LHS); + auto *I2 = dyn_cast(RHS); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || - ShallowScoreAtThisLevel == VLOperands::ScoreFail || + ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || (((isa(I1) && isa(I2)) || + (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || (isa(I1) && isa(I2))) && ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); - // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1].insert(Lane1); - InLookAheadValues[V2].insert(Lane2); - // Contains the I2 operand indexes that got matched with I1 operands. SmallSet Op2Used; @@ -1346,11 +1298,12 @@ public: if (Op2Used.count(OpIdx2)) continue; // Recursively calculate the cost at each level - int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, - {I2->getOperand(OpIdx2), Lane2}, - CurrLevel + 1, MaxLevel); + int TmpScore = + getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), + I1, I2, CurrLevel + 1, None); // Look for the best score. - if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + if (TmpScore > LookAheadHeuristics::ScoreFail && + TmpScore > MaxTmpScore) { MaxTmpScore = TmpScore; MaxOpIdx2 = OpIdx2; FoundBest = true; @@ -1364,24 +1317,213 @@ public: } return ShallowScoreAtThisLevel; } + }; + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. 
+ /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector; + + /// A vector of operand vectors. + SmallVector OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + const BoUpSLP &R; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score due to possible broadcasting of the + /// elements in the lane. It is more profitable to have power-of-2 unique + /// elements in the lane, it will be vectorized with higher probability + /// after removing duplicates. 
Currently the SLP vectorizer supports only + /// vectorization of the power-of-2 number of unique scalars. + int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + if (!isa(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) + return 0; + SmallPtrSet Uniques; + for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { + if (Ln == Lane) + continue; + Value *OpIdxLnV = getData(OpIdx, Ln).V; + if (!isa(OpIdxLnV)) + return 0; + Uniques.insert(OpIdxLnV); + } + int UniquesCount = Uniques.size(); + int UniquesCntWithIdxLaneV = + Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + int UniquesCntWithOpIdxLaneV = + Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; + if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) + return 0; + return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - + UniquesCntWithOpIdxLaneV) - + (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score for the scalar which users are all + /// vectorized. + int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + // Do not care about number of uses for vector-like instructions + // (extractelement/extractvalue with constant indices), they are extracts + // themselves and already externally used. Vectorization of such + // instructions does not add extra extractelement instruction, just may + // remove it. + if (isVectorLikeInstWithConstOps(IdxLaneV) && + isVectorLikeInstWithConstOps(OpIdxLaneV)) + return LookAheadHeuristics::ScoreAllUserVectorized; + auto *IdxLaneI = dyn_cast(IdxLaneV); + if (!IdxLaneI || !isa(OpIdxLaneV)) + return 0; + return R.areAllUsersVectorized(IdxLaneI, None) + ? LookAheadHeuristics::ScoreAllUserVectorized + : 0; + } + + /// Score scaling factor for fully compatible instructions but with + /// different number of external uses. Allows better selection of the + /// instructions with less external uses. + static const int ScoreScaleFactor = 10; /// \Returns the look-ahead score, which tells us how much the sub-trees /// rooted at \p LHS and \p RHS match, the more they match the higher the /// score. This helps break ties in an informed way when we cannot decide on /// the order of the operands by just considering the immediate /// predecessors. - int getLookAheadScore(const std::pair &LHS, - const std::pair &RHS) { - InLookAheadValues.clear(); - return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef MainAltOps, + int Lane, unsigned OpIdx, unsigned Idx, + bool &IsUsed) { + LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadMaxDepth); + // Keep track of the instruction stack as we recurse into the operands + // during the look-ahead score exploration. + int Score = + LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, + /*CurrLevel=*/1, MainAltOps); + if (Score) { + int SplatScore = getSplatScore(Lane, OpIdx, Idx); + if (Score <= -SplatScore) { + // Set the minimum score for splat-like sequence to avoid setting + // failed state. 
+ Score = 1; + } else { + Score += SplatScore; + // Scale score to see the difference between different operands + // and similar operands but all vectorized/not all vectorized + // uses. It does not affect actual selection of the best + // compatible operand in general, just allows to select the + // operand with all vectorized uses. + Score *= ScoreScaleFactor; + Score += getExternalUseScore(Lane, OpIdx, Idx); + IsUsed = true; + } + } + return Score; } + /// Best defined scores per lanes between the passes. Used to choose the + /// best operand (with the highest score) between the passes. + /// The key - {Operand Index, Lane}. + /// The value - the best score between the passes for the lane and the + /// operand. + SmallDenseMap, unsigned, 8> + BestScoresPerLanes; + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. - Optional - getBestOperand(unsigned OpIdx, int Lane, int LastLane, - ArrayRef ReorderingModes) { + Optional getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef ReorderingModes, + ArrayRef MainAltOps) { unsigned NumOperands = getNumOperands(); // The operand of the previous lane at OpIdx. @@ -1389,6 +1531,8 @@ public: // Our strategy mode for OpIdx. ReorderingMode RMode = ReorderingModes[OpIdx]; + if (RMode == ReorderingMode::Failed) + return None; // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; @@ -1400,7 +1544,15 @@ public: Optional Idx = None; unsigned Score = 0; } BestOp; - + BestOp.Score = + BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0) + .first->second; + + // Track if the operand must be marked as used. If the operand is set to + // Score 1 explicitly (because of non power-of-2 unique scalars, we may + // want to reestimate the operands again on the following iterations). + bool IsUsed = + RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant; // Iterate through all unused operands and look for the best. for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { // Get the operand at Idx and Lane. @@ -1426,11 +1578,12 @@ public: bool LeftToRight = Lane > LastLane; Value *OpLeft = (LeftToRight) ? OpLastLane : Op; Value *OpRight = (LeftToRight) ? Op : OpLastLane; - unsigned Score = - getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); - if (Score > BestOp.Score) { + int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, + OpIdx, Idx, IsUsed); + if (Score > static_cast(BestOp.Score)) { BestOp.Idx = Idx; BestOp.Score = Score; + BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; } break; } @@ -1439,12 +1592,12 @@ public: BestOp.Idx = Idx; break; case ReorderingMode::Failed: - return None; + llvm_unreachable("Not expected Failed reordering mode."); } } if (BestOp.Idx) { - getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + getData(*BestOp.Idx, Lane).IsUsed = IsUsed; return BestOp.Idx; } // If we could not find a good match return None. @@ -1761,6 +1914,10 @@ public: // rest of the lanes. We are visiting the nodes in a circular fashion, // using FirstLane as the center point and increasing the radius // distance. + SmallVector> MainAltOps(NumOperands); + for (unsigned I = 0; I < NumOperands; ++I) + MainAltOps[I].push_back(getData(I, FirstLane).V); + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { // Visit the lane on the right and then the lane on the left. 
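// The resulting visit order can be seen with a standalone toy; NumLanes and
// FirstLane are hypothetical, and it assumes radii that leave the lane range
// are simply skipped.
#include <cstdio>
#include <initializer_list>

int main() {
  const int NumLanes = 4, FirstLane = 1;
  std::printf("%d", FirstLane); // the seed lane is handled first
  for (int Distance = 1; Distance != NumLanes; ++Distance)
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= NumLanes)
        continue; // this radius leaves the lane range in this direction
      std::printf(" %d", Lane);
    }
  std::printf("\n"); // prints: 1 2 0 3
  return 0;
}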
for (int Direction : {+1, -1}) { @@ -1773,21 +1930,29 @@ public: // Look for a good match for each operand. for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { // Search for the operand that matches SortedOps[OpIdx][Lane-1]. - Optional BestIdx = - getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + Optional BestIdx = getBestOperand( + OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); // By not selecting a value, we allow the operands that follow to // select a better matching value. We will get a non-null value in // the next run of getBestOperand(). if (BestIdx) { // Swap the current operand with the one returned by // getBestOperand(). - swap(OpIdx, BestIdx.getValue(), Lane); + swap(OpIdx, *BestIdx, Lane); } else { // We failed to find a best operand, set mode to 'Failed'. ReorderingModes[OpIdx] = ReorderingMode::Failed; // Enable the second pass. StrategyFailed = true; } + // Try to get the alternate opcode and follow it during analysis. + if (MainAltOps[OpIdx].size() != 2) { + OperandData &AltOp = getData(OpIdx, Lane); + InstructionsState OpS = + getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); + if (OpS.getOpcode() && OpS.isAltShuffle()) + MainAltOps[OpIdx].push_back(AltOp.V); + } } } } @@ -1851,15 +2016,109 @@ public: #endif }; + /// Evaluate each pair in \p Candidates and return index into \p Candidates + /// for a pair which have highest score deemed to have best chance to form + /// root of profitable tree to vectorize. Return None if no candidate scored + /// above the LookAheadHeuristics::ScoreFail. + /// \param Limit Lower limit of the cost, considered to be good enough score. + Optional + findBestRootPair(ArrayRef> Candidates, + int Limit = LookAheadHeuristics::ScoreFail) { + LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + RootLookAheadMaxDepth); + int BestScore = Limit; + Optional Index = None; + for (int I : seq(0, Candidates.size())) { + int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, + Candidates[I].second, + /*U1=*/nullptr, /*U2=*/nullptr, + /*Level=*/1, None); + if (Score > BestScore) { + BestScore = Score; + Index = I; + } + } + return Index; + } + /// Checks if the instruction is marked for deletion. bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } - /// Marks values operands for later deletion by replacing them with Undefs. - void eraseInstructions(ArrayRef AV); + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + void eraseInstruction(Instruction *I) { + DeletedInstructions.insert(I); + } + + /// Checks if the instruction was already analyzed for being possible + /// reduction root. + bool isAnalyzedReductionRoot(Instruction *I) const { + return AnalyzedReductionsRoots.count(I); + } + /// Register given instruction as already analyzed for being possible + /// reduction root. + void analyzedReductionRoot(Instruction *I) { + AnalyzedReductionsRoots.insert(I); + } + /// Checks if the provided list of reduced values was checked already for + /// vectorization. + bool areAnalyzedReductionVals(ArrayRef VL) { + return AnalyzedReductionVals.contains(hash_value(VL)); + } + /// Adds the list of reduced values to list of already checked values for the + /// vectorization. + void analyzedReductionVals(ArrayRef VL) { + AnalyzedReductionVals.insert(hash_value(VL)); + } + /// Clear the list of the analyzed reduction root instructions. 
+  void clearReductionData() {
+    AnalyzedReductionsRoots.clear();
+    AnalyzedReductionVals.clear();
+  }
+  /// Checks if the given value is gathered in one of the nodes.
+  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
+    return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
+  }
 
   ~BoUpSLP();
 
 private:
+  /// Check if the operands on the edges \p Edges of the \p UserTE allow
+  /// reordering (i.e. the operands can be reordered because they have only
+  /// one user and are reorderable).
+  /// \param ReorderableGathers List of all gather nodes that require
+  /// reordering (e.g., gather of extractelements or partially vectorizable
+  /// loads).
+  /// \param GatherOps List of gather operand nodes for \p UserTE that require
+  /// reordering, subset of \p NonVectorized.
+  bool
+  canReorderOperands(TreeEntry *UserTE,
+                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+                     ArrayRef<TreeEntry *> ReorderableGathers,
+                     SmallVectorImpl<TreeEntry *> &GatherOps);
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
+    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
+    TreeEntry *TE = nullptr;
+    const auto *It = find_if(VL, [this, &TE](Value *V) {
+      TE = getTreeEntry(V);
+      return TE;
+    });
+    if (It != VL.end() && TE->isSame(VL))
+      return TE;
+    return nullptr;
+  }
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
+                                        unsigned OpIdx) const {
+    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
+        const_cast<TreeEntry *>(UserTE), OpIdx);
+  }
+
   /// Checks if all users of \p I are the part of the vectorization tree.
   bool areAllUsersVectorized(Instruction *I,
                              ArrayRef<Value *> VectorizedVals) const;
@@ -1886,12 +2145,17 @@ private:
   /// Vectorize a single entry in the tree, starting in \p VL.
   Value *vectorizeTree(ArrayRef<Value *> VL);
 
+  /// Create a new vector from a list of scalar values. Produces a sequence
+  /// which exploits values reused across lanes, and arranges the inserts
+  /// for ease of later optimization.
+  Value *createBuildVector(ArrayRef<Value *> VL);
+
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars. If \p
   /// NeedToShuffle is true, need to add a cost of reshuffling some of the
   /// vector elements.
   InstructionCost getGatherCost(FixedVectorType *Ty,
-                                const DenseSet<unsigned> &ShuffledIndices,
+                                const APInt &ShuffledIndices,
                                 bool NeedToShuffle) const;
 
   /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
@@ -1926,6 +2190,29 @@ private:
                                              const DataLayout &DL,
                                              ScalarEvolution &SE,
                                              const BoUpSLP &R);
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
+  /// users of \p TE and collects the stores. It returns the map from the store
+  /// pointers to the collected stores.
+  DenseMap<Value *, SmallVector<StoreInst *, 4>>
+  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
+  /// stores in \p StoresVec can form a vector instruction. If so it returns
+  /// true and populates \p ReorderIndices with the shuffle indices of the
+  /// stores when compared to the sorted vector.
+  bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+                     OrdersType &ReorderIndices) const;
+
+  /// Iterates through the users of \p TE, looking for scalar stores that can be
+  /// potentially vectorized in a future SLP-tree.
If found, it keeps track of + /// their order and builds an order index vector for each store bundle. It + /// returns all these order vectors found. + /// We run this after the tree has formed, otherwise we may come across user + /// instructions that are not yet in the tree. + SmallVector + findExternalStoreUsersReorderIndices(TreeEntry *TE) const; + struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2270,15 +2557,21 @@ private: ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. - unsigned Lane = 0; - for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; - BundleMember = BundleMember->NextInBundle) { - BundleMember->TE = Last; - BundleMember->Lane = Lane; - ++Lane; - } - assert((!Bundle.getValue() || Lane == VL.size()) && + ScheduleData *BundleMember = *Bundle; + assert((BundleMember || isa(S.MainOp) || + isVectorLikeInstWithConstOps(S.MainOp) || + doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); + if (BundleMember) { + for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; + assert(BundleMember && "Unexpected end of bundle."); + BundleMember->TE = Last; + BundleMember = BundleMember->NextInBundle; + } + } + assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -2312,7 +2605,7 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; - /// Maps a value to the proposed vectorizable size. + /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; /// A list of scalars that we found that we need to keep as scalars. @@ -2343,12 +2636,12 @@ private: // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional &result = AliasCache[key]; - if (result.hasValue()) { + if (result) { return result.getValue(); } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1)); + aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; @@ -2360,20 +2653,23 @@ private: /// TODO: consider moving this to the AliasAnalysis itself. DenseMap> AliasCache; - /// Removes an instruction from its block and eventually deletes it. - /// It's like Instruction::eraseFromParent() except that the actual deletion - /// is delayed until BoUpSLP is destructed. - /// This is required to ensure that there are no incorrect collisions in the - /// AliasCache, which can happen if a new instruction is allocated at the - /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { - auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; - It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; - } + // Cache for pointerMayBeCaptured calls inside AA. This is preserved + // globally through SLP because we don't perform any action which + // invalidates capture results. + BatchAAResults BatchAA; /// Temporary store for deleted instructions. Instructions will be deleted - /// eventually when the BoUpSLP is destructed. - DenseMap DeletedInstructions; + /// eventually when the BoUpSLP is destructed. The deferral is required to + /// ensure that there are no incorrect collisions in the AliasCache, which + /// can happen if a new instruction is allocated at the same address as a + /// previously deleted instruction. 
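// The address-reuse hazard that this deferral avoids can be illustrated
// standalone (allocator behaviour varies; reuse is only one plausible
// outcome): a cache keyed on the first pointer could silently "hit" the
// second, unrelated object.
#include <cstdio>

int main() {
  int *A = new int(1);
  std::printf("A = %p\n", static_cast<void *>(A));
  delete A;            // the allocator is free to recycle this address...
  int *B = new int(2); // ...so B may compare equal to the stale key A
  std::printf("B = %p\n", static_cast<void *>(B));
  delete B;
  return 0;
}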
+ DenseSet DeletedInstructions; + + /// Set of the instruction, being analyzed already for reductions. + SmallPtrSet AnalyzedReductionsRoots; + + /// Set of hashes for the list of reduction values already being analyzed. + DenseSet AnalyzedReductionVals; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -2407,14 +2703,39 @@ private: NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; TE = nullptr; - Lane = -1; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(isSchedulingEntity() && + "unexpected scheduled state"); + for (const ScheduleData *BundleMember = this; BundleMember; + BundleMember = BundleMember->NextInBundle) { + assert(BundleMember->hasValidDependencies() && + BundleMember->UnscheduledDeps == 0 && + "unexpected scheduled state"); + assert((BundleMember == this || !BundleMember->IsScheduled) && + "only bundle is marked scheduled"); + } + } + + assert(Inst->getParent() == FirstInBundle->Inst->getParent() && + "all bundle members must be in same basic block"); } /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives @@ -2424,7 +2745,7 @@ private: /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { - return NextInBundle != nullptr || FirstInBundle != this; + return NextInBundle != nullptr || FirstInBundle != this || TE; } /// Returns true if it is ready for scheduling, i.e. it has no more @@ -2432,20 +2753,23 @@ private: bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); - return UnscheduledDepsInBundle == 0 && !IsScheduled; + return unscheduledDepsInBundle() == 0 && !IsScheduled; } - /// Modifies the number of unscheduled dependencies, also updating it for - /// the whole bundle. + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); UnscheduledDeps += Incr; - return FirstInBundle->UnscheduledDepsInBundle += Incr; + return FirstInBundle->unscheduledDepsInBundle(); } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { - incrementUnscheduledDeps(Dependencies - UnscheduledDeps); + UnscheduledDeps = Dependencies; } /// Clears all dependency information. 
@@ -2453,6 +2777,19 @@
       Dependencies = InvalidDeps;
       resetUnscheduledDeps();
       MemoryDependencies.clear();
+      ControlDependencies.clear();
+    }
+
+    int unscheduledDepsInBundle() const {
+      assert(isSchedulingEntity() && "only meaningful on the bundle");
+      int Sum = 0;
+      for (const ScheduleData *BundleMember = this; BundleMember;
+           BundleMember = BundleMember->NextInBundle) {
+        if (BundleMember->UnscheduledDeps == InvalidDeps)
+          return InvalidDeps;
+        Sum += BundleMember->UnscheduledDeps;
+      }
+      return Sum;
     }
 
     void dump(raw_ostream &os) const {
@@ -2473,6 +2810,12 @@
 
     Instruction *Inst = nullptr;
 
+    /// Opcode of the current instruction in the schedule data.
+    Value *OpValue = nullptr;
+
+    /// The TreeEntry that this instruction corresponds to.
+    TreeEntry *TE = nullptr;
+
     /// Points to the head in an instruction bundle (and always to this for
     /// single instructions).
     ScheduleData *FirstInBundle = nullptr;
@@ -2489,6 +2832,12 @@
     /// This list is derived on demand in calculateDependencies().
     SmallVector<ScheduleData *, 4> MemoryDependencies;
 
+    /// List of instructions which this instruction could be control dependent
+    /// on. Allowing such nodes to be scheduled below this one could introduce
+    /// a runtime fault which didn't exist in the original program.
+    /// E.g. a load or udiv following a readonly call which infinitely loops.
+    SmallVector<ScheduleData *, 4> ControlDependencies;
+
     /// This ScheduleData is in the current scheduling region if this matches
     /// the current SchedulingRegionID of BlockScheduling.
     int SchedulingRegionID = 0;
@@ -2508,22 +2857,9 @@
     /// Note that this is negative as long as Dependencies is not calculated.
     int UnscheduledDeps = InvalidDeps;
 
-    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
-    /// single instructions.
-    int UnscheduledDepsInBundle = InvalidDeps;
-
     /// True if this instruction is scheduled (or considered as scheduled in the
     /// dry-run).
     bool IsScheduled = false;
-
-    /// Opcode of the current instruction in the schedule data.
-    Value *OpValue = nullptr;
-
-    /// The TreeEntry that this instruction corresponds to.
-    TreeEntry *TE = nullptr;
-
-    /// The lane of this node in the TreeEntry.
-    int Lane = -1;
   };
 
 #ifndef NDEBUG
@@ -2538,6 +2874,21 @@ private:
   friend struct DOTGraphTraits<BoUpSLP *>;
 
   /// Contains all scheduling data for a basic block.
+  /// It does not schedule instructions that are not memory read/write
+  /// instructions and whose operands are either constants, arguments, phis, or
+  /// instructions from other blocks, or whose users are phis or in other
+  /// blocks. The resulting vector instructions can be placed at the beginning
+  /// of the basic block without scheduling (if their operands do not need to
+  /// be scheduled) or at the end of the block (if their users are outside of
+  /// the block). This saves some compile time and memory used by the compiler.
+  /// ScheduleData is assigned for each instruction in between the boundaries
+  /// of the tree entry, even for those which are not part of the graph. It is
+  /// required to correctly follow the dependencies between the instructions
+  /// and schedule them correctly. ScheduleData is not allocated for
+  /// instructions which do not require scheduling, like phis, nodes with
+  /// extractelements/insertelements only, or nodes whose instructions have
+  /// uses/operands outside of the block.
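// The counter mechanics that drive the ready list can be modelled with a
// standalone toy (hypothetical nodes; the real ScheduleData also tracks
// memory and control dependencies, as described above).
#include <cstdio>
#include <vector>

struct Node {
  const char *Name;
  int UnscheduledDeps = 0;        // dependencies not yet scheduled
  std::vector<Node *> Dependents; // nodes waiting on this one
};

int main() {
  Node A{"A"}, B{"B"}, C{"C"}; // edges: A->B, A->C, B->C
  A.Dependents = {&B, &C};
  B.Dependents = {&C};
  B.UnscheduledDeps = 1;
  C.UnscheduledDeps = 2;
  std::vector<Node *> Ready = {&A}; // A has no outstanding dependencies
  while (!Ready.empty()) {
    Node *N = Ready.back();
    Ready.pop_back(); // "schedule" N
    std::printf("%s ", N->Name);
    for (Node *D : N->Dependents)
      if (--D->UnscheduledDeps == 0)
        Ready.push_back(D); // all inputs scheduled: D becomes ready
  }
  std::printf("\n"); // prints: A B C
  return 0;
}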
struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} @@ -2548,6 +2899,7 @@ private: ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + RegionHasStackSave = false; // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -2561,20 +2913,29 @@ private: ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(Instruction *I) { + if (BB != I->getParent()) + // Avoid lookup if can't possibly be in map. + return nullptr; + ScheduleData *SD = ScheduleDataMap.lookup(I); + if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; } + ScheduleData *getScheduleData(Value *V) { + if (auto *I = dyn_cast(V)) + return getScheduleData(I); + return nullptr; + } + ScheduleData *getScheduleData(Value *V, Value *Key) { if (V == Key) return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *SD = I->second.lookup(Key); + if (SD && isInSchedulingRegion(SD)) return SD; } return nullptr; @@ -2595,7 +2956,7 @@ private: BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; - + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2617,10 +2978,12 @@ private: }; // If BundleMember is a vector bundle, its operands may have been - // reordered duiring buildTree(). We therefore need to get its operands + // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { - int Lane = BundleMember->Lane; + // Need to search for the lane since the tree entry can be reordered. + int Lane = std::distance(TE->Scalars.begin(), + find(TE->Scalars, BundleMember->Inst)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -2629,7 +2992,7 @@ private: // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. - auto *In = TE->getMainOp(); + auto *In = BundleMember->Inst; assert(In && (isa(In) || isa(In) || In->getNumOperands() == TE->getNumOperands()) && @@ -2649,7 +3012,8 @@ private: } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + if (MemoryDepSD->hasValidDependencies() && + MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; @@ -2660,6 +3024,48 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } + // Handle the control dependencies. + for (ScheduleData *DepSD : BundleMember->ControlDependencies) { + if (DepSD->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after decrementing, + // so we can put the dependent instruction into the ready list. 
+ ScheduleData *DepBundle = DepSD->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + LLVM_DEBUG(dbgs() + << "SLP: gets ready (ctl): " << *DepBundle << "\n"); + } + } + + } + } + + /// Verify basic self consistency properties of the data structure. + void verify() { + if (!ScheduleStart) + return; + + assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && + ScheduleStart->comesBefore(ScheduleEnd) && + "Not a valid scheduling region?"); + + for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { + auto *SD = getScheduleData(I); + if (!SD) + continue; + assert(isInSchedulingRegion(SD) && + "primary schedule data not in window?"); + assert(isInSchedulingRegion(SD->FirstInBundle) && + "entire bundle in window!"); + (void)SD; + doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); + } + + for (auto *SD : ReadyInsts) { + assert(SD->isSchedulingEntity() && SD->isReady() && + "item in ready list not ready?"); + (void)SD; } } @@ -2670,7 +3076,7 @@ private: auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) + if (isInSchedulingRegion(P.second)) Action(P.second); } @@ -2679,10 +3085,11 @@ private: void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies() && + SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() - << "SLP: initially in ready list: " << *I << "\n"); + << "SLP: initially in ready list: " << *SD << "\n"); } }); } @@ -2740,18 +3147,14 @@ private: /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; + DenseMap ScheduleDataMap; /// Attaches ScheduleData to Instruction with the leading key. DenseMap> ExtraScheduleDataMap; - struct ReadyList : SmallVector { - void insert(ScheduleData *SD) { push_back(SD); } - }; - /// The ready-list for scheduling (only used for the dry-run). - ReadyList ReadyInsts; + SetVector ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart = nullptr; @@ -2767,6 +3170,11 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion = nullptr; + /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling + /// region? Used to optimize the dependence calculation for the + /// common case where there isn't. + bool RegionHasStackSave = false; + /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -2775,8 +3183,8 @@ private: /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. - // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + /// Make sure that the initial SchedulingRegionID is greater than the + /// initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -2788,7 +3196,7 @@ private: void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. 
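// Editor's aside: a hedged, standalone sketch (plain C++, not the LLVM API)
// of the ready-list discipline used by the release loops above. Each kind
// of out-edge (def-use, memory, and the new control dependencies)
// decrements the successor bundle's unscheduled counter; a bundle whose
// counter reaches zero becomes ready. Names are illustrative only.
#include <vector>

struct Bundle {
  int UnscheduledDeps = 0;
  bool IsScheduled = false;
};

// Decrement every dependent bundle once; collect those that became ready.
std::vector<Bundle *> releaseDependencies(std::vector<Bundle *> &Deps) {
  std::vector<Bundle *> NowReady;
  for (Bundle *B : Deps)
    if (--B->UnscheduledDeps == 0 && !B->IsScheduled)
      NowReady.push_back(B); // analogous to ReadyList.insert(DepBundle)
  return NowReady;
}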
- ArrayRef UserIgnoreList; + const SmallDenseSet *UserIgnoreList = nullptr; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. @@ -2819,7 +3227,6 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2936,20 +3343,25 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { } // end namespace llvm BoUpSLP::~BoUpSLP() { - for (const auto &Pair : DeletedInstructions) { - // Replace operands of ignored instructions with Undefs in case if they were - // marked for deletion. - if (Pair.getSecond()) { - Value *Undef = UndefValue::get(Pair.getFirst()->getType()); - Pair.getFirst()->replaceAllUsesWith(Undef); - } - Pair.getFirst()->dropAllReferences(); - } - for (const auto &Pair : DeletedInstructions) { - assert(Pair.getFirst()->use_empty() && + SmallVector DeadInsts; + for (auto *I : DeletedInstructions) { + for (Use &U : I->operands()) { + auto *Op = dyn_cast(U.get()); + if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && + wouldInstructionBeTriviallyDead(Op, TLI)) + DeadInsts.emplace_back(Op); + } + I->dropAllReferences(); + } + for (auto *I : DeletedInstructions) { + assert(I->use_empty() && "trying to erase instruction with users."); - Pair.getFirst()->eraseFromParent(); + I->eraseFromParent(); } + + // Cleanup any dead scalar code feeding the vectorized instructions + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); + #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). @@ -2957,13 +3369,6 @@ BoUpSLP::~BoUpSLP() { #endif } -void BoUpSLP::eraseInstructions(ArrayRef AV) { - for (auto *V : AV) { - if (auto *I = dyn_cast(V)) - eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); - }; -} - /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses /// contains original mask for the scalars reused in the node. Procedure /// transform this mask in accordance with the given \p Mask. @@ -3068,6 +3473,189 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + LoopInfo &LI, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. 
+ PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands or that all pointers are the same. + bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); + if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) { + if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front())) + return false; + auto *GEP = dyn_cast(P); + if (!GEP) + return false; + auto *GEP0 = cast(PointerOps.front()); + return GEP->getNumOperands() == 2 && + ((isConstant(GEP->getOperand(1)) && + isConstant(GEP0->getOperand(1))) || + getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)}) + .getOpcode()); + })) { + if (IsSorted) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + } + // TODO: need to improve analysis of the pointers, if not all of them are + // GEPs or have > 2 operands, we end up with a gather node, which just + // increases the cost. + Loop *L = LI.getLoopFor(cast(VL0)->getParent()); + bool ProfitableGatherPointers = + static_cast(count_if(PointerOps, [L](Value *V) { + return L && L->isLoopInvariant(V); + })) <= VL.size() / 2 && VL.size() > 2; + if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { + auto *GEP = dyn_cast(P); + return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || + (GEP && GEP->getNumOperands() == 2); + })) { + Align CommonAlignment = cast(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + std::min(CommonAlignment, cast(V)->getAlign()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && + !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) + return LoadsState::ScatterVectorize; + } + } + + return LoadsState::Gather; +} + +bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl &SortedIndices) { + assert(llvm::all_of( + VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && + "Expected list of pointer operands."); + // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each + // Ptr into, sort and return the sorted indices with values next to one + // another. + MapVector>> Bases; + Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); + + unsigned Cnt = 1; + for (Value *Ptr : VL.drop_front()) { + bool Found = any_of(Bases, [&](auto &Base) { + Optional Diff = + getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, + /*StrictCheck=*/true); + if (!Diff) + return false; + + Base.second.emplace_back(Ptr, *Diff, Cnt++); + return true; + }); + + if (!Found) { + // If we haven't found enough to usefully cluster, return early. + if (Bases.size() > VL.size() / 2 - 1) + return false; + + // Not found already - add a new Base + Bases[Ptr].emplace_back(Ptr, 0, Cnt++); + } + } + + // For each of the bases sort the pointers by Offset and check if any of the + // base become consecutively allocated. 
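// Editor's aside: a small standalone illustration (plain C++, not the LLVM
// API) of the consecutiveness test that canVectorizeLoads() above and the
// base-clustering loop that follows both rely on: reduce each pointer to an
// integer element offset from a common base (the role getPointersDiff()
// plays here), sort the offsets, and require a dense run with stride 1.
// The function name is hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>

bool areConsecutiveOffsets(std::vector<int> Offsets) {
  std::sort(Offsets.begin(), Offsets.end());
  for (std::size_t I = 1; I < Offsets.size(); ++I)
    if (Offsets[I] != Offsets[I - 1] + 1)
      return false; // gap or duplicate: gather/scatter instead of a wide load
  return true;      // dense run: a single wide load/store is possible
}

// E.g. offsets {3, 1, 2, 0} sort to {0, 1, 2, 3} and qualify;
// {0, 2, 3, 5} has gaps and does not.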
+  bool AnyConsecutive = false;
+  for (auto &Base : Bases) {
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+                                const std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int InitialOffset = std::get<1>(Vec[0]);
+      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+      });
+    }
+  }
+
+  // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  for (auto &Base : Bases) {
+    for (auto &T : Base.second)
+      SortedIndices.push_back(std::get<2>(T));
+  }
+
+  assert(SortedIndices.size() == VL.size() &&
+         "Expected SortedIndices to be the size of VL");
+  return true;
+}
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+  Type *ScalarTy = TE.Scalars[0]->getType();
+
+  SmallVector<Value *> Ptrs;
+  Ptrs.reserve(TE.Scalars.size());
+  for (Value *V : TE.Scalars) {
+    auto *L = dyn_cast<LoadInst>(V);
+    if (!L || !L->isSimple())
+      return None;
+    Ptrs.push_back(L->getPointerOperand());
+  }
+
+  BoUpSLP::OrdersType Order;
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+    return Order;
+  return None;
+}
+
 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                          bool TopToBottom) {
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3108,6 +3696,9 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
     }
     if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
       return CurrentOrder;
+    if (TE.Scalars.size() >= 4)
+      if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+        return Order;
   }
   return None;
 }
@@ -3118,13 +3709,55 @@ void BoUpSLP::reorderTopToBottom() {
   // ExtractElement gather nodes which can be vectorized and need to handle
   // their ordering.
   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+
+  // AltShuffles can also have a preferred ordering that leads to fewer
+  // instructions, e.g., the addsub instruction in x86.
+  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
+
+  // Maps a TreeEntry to the reorder indices of external users.
+  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
+      ExternalUserReorderMap;
+  // FIXME: Workaround for syntax error reported by MSVC buildbots.
+  TargetTransformInfo &TTIRef = *TTI;
   // Find all reorderable nodes with the given VF.
   // Currently these are vectorized stores, loads, extracts + some gathering
   // of extracts.
-  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+  for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
+                              &GathersToOrders, &ExternalUserReorderMap,
+                              &AltShufflesToOrders](
                                  const std::unique_ptr<TreeEntry> &TE) {
+    // Look for external users that will probably be vectorized.
+    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+        findExternalStoreUsersReorderIndices(TE.get());
+    if (!ExternalUserReorderIndices.empty()) {
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      ExternalUserReorderMap.try_emplace(TE.get(),
+                                         std::move(ExternalUserReorderIndices));
+    }
+
+    // Patterns like [fadd,fsub] can be combined into a single instruction in
+    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
+    // to take into account their order when looking for the most used order.
+    if (TE->isAltShuffle()) {
+      VectorType *VecTy =
+          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+      unsigned Opcode0 = TE->getOpcode();
+      unsigned Opcode1 = TE->getAltOpcode();
+      // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(TE->Scalars.size(), 0); + for (unsigned Lane : seq(0, TE->Scalars.size())) + if (cast(TE->Scalars[Lane])->getOpcode() == Opcode1) + OpcodeMask.set(Lane); + // If this pattern is supported by the target then we consider the order. + if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); + } + // TODO: Check the reverse order too. + } + if (Optional CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + getReorderingData(*TE, /*TopToBottom=*/true)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. If follow the order // here, it causes reordering of the whole graph though actually it is @@ -3142,10 +3775,7 @@ void BoUpSLP::reorderTopToBottom() { EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; })) return; - if (UserTE->UserTreeIndices.empty()) - UserTE = nullptr; - else - UserTE = UserTE->UserTreeIndices.back().UserTE; + UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); @@ -3176,11 +3806,30 @@ void BoUpSLP::reorderTopToBottom() { if (!OpTE->ReuseShuffleIndices.empty()) continue; // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) - return GathersToOrders.find(OpTE)->second; + const auto &Order = [OpTE, &GathersToOrders, + &AltShufflesToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(OpTE); + if (It != GathersToOrders.end()) + return It->second; + } + if (OpTE->isAltShuffle()) { + auto It = AltShufflesToOrders.find(OpTE); + if (It != AltShufflesToOrders.end()) + return It->second; + } return OpTE->ReorderIndices; }(); + // First consider the order of the external scalar users. + auto It = ExternalUserReorderMap.find(OpTE); + if (It != ExternalUserReorderMap.end()) { + const auto &ExternalUserReorderIndices = It->second; + for (const OrdersType &ExtOrder : ExternalUserReorderIndices) + ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + // No other useful reorder data in this entry. + if (Order.empty()) + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3270,6 +3919,57 @@ void BoUpSLP::reorderTopToBottom() { } } +bool BoUpSLP::canReorderOperands( + TreeEntry *UserTE, SmallVectorImpl> &Edges, + ArrayRef ReorderableGathers, + SmallVectorImpl &GatherOps) { + for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { + if (any_of(Edges, [I](const std::pair &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // Do not reorder if operand node is used by many user nodes. + if (any_of(TE->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) + return false; + // Add the node to the list of the ordered nodes with the identity + // order. + Edges.emplace_back(I, TE); + // Add ScatterVectorize nodes to the list of operands, where just + // reordering of the scalars is required. Similar to the gathers, so + // simply add to the list of gathered ops. 
+ // If there are reused scalars, process this node as a regular vectorize + // node, just reorder reuses mask. + if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + GatherOps.push_back(TE); + continue; + } + TreeEntry *Gather = nullptr; + if (count_if(ReorderableGathers, + [&Gather, UserTE, I](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (any_of(TE->UserTreeIndices, + [UserTE, I](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx == I; + })) { + assert(TE->isSame(UserTE->getOperand(I)) && + "Operand entry does not match operands."); + Gather = TE; + return true; + } + return false; + }) > 1 && + !all_of(UserTE->getOperand(I), isConstant)) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; +} + void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector OrderedEntries; DenseMap GathersToOrders; @@ -3283,49 +3983,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); if (Optional CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/false)) { + getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); - // Checks if the operands of the users are reordarable and have only single - // use. - auto &&CheckOperands = - [this, &NonVectorized](const auto &Data, - SmallVectorImpl &GatherOps) { - for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { - if (any_of(Data.second, - [I](const std::pair &OpData) { - return OpData.first == I && - OpData.second->State == TreeEntry::Vectorize; - })) - continue; - ArrayRef VL = Data.first->getOperand(I); - const TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { - TE = getTreeEntry(V); - return TE; - }); - if (It != VL.end() && TE->isSame(VL)) - return false; - TreeEntry *Gather = nullptr; - if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { - assert(TE->State != TreeEntry::Vectorize && - "Only non-vectorized nodes are expected."); - if (TE->isSame(VL)) { - Gather = TE; - return true; - } - return false; - }) > 1) - return false; - if (Gather) - GatherOps.push_back(Gather); - } - return true; - }; // 1. Propagate order to the graph nodes, which use only reordered nodes. // I.e., if the node has operands, that are reordered, try to make at least // one operand order in the natural order and reorder others + reorder the @@ -3334,7 +3998,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { while (!OrderedEntries.empty()) { // 1. Filter out only reordered nodes. // 2. If the entry has multiple uses - skip it and jump to the next node. - MapVector>> Users; + DenseMap>> Users; SmallVector Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || @@ -3362,10 +4026,17 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Erase filtered entries. for_each(Filtered, [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); - for (const auto &Data : Users) { + SmallVector< + std::pair>>> + UsersVec(Users.begin(), Users.end()); + sort(UsersVec, [](const auto &Data1, const auto &Data2) { + return Data1.first->Idx > Data2.first->Idx; + }); + for (auto &Data : UsersVec) { // Check that operands are used only in the User node. 
SmallVector GatherOps; - if (!CheckOperands(Data, GatherOps)) { + if (!canReorderOperands(Data.first, Data.second, NonVectorized, + GatherOps)) { for_each(Data.second, [&OrderedEntries](const std::pair &Op) { OrderedEntries.remove(Op.second); @@ -3381,18 +4052,22 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // the same node my be considered several times, though might be not // profitable. SmallPtrSet VisitedOps; + SmallPtrSet VisitedUsers; for (const auto &Op : Data.second) { TreeEntry *OpTE = Op.second; if (!VisitedOps.insert(OpTE).second) continue; - if (!OpTE->ReuseShuffleIndices.empty() || - (IgnoreReorder && OpTE == VectorizableTree.front().get())) + if (!OpTE->ReuseShuffleIndices.empty()) continue; const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { if (OpTE->State == TreeEntry::NeedToGather) return GathersToOrders.find(OpTE)->second; return OpTE->ReorderIndices; }(); + unsigned NumOps = count_if( + Data.second, [OpTE](const std::pair &P) { + return P.second == OpTE; + }); // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3404,14 +4079,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == UndefMaskElem ? E : static_cast(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += + NumOps; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + } + auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( + const TreeEntry *TE) { + if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || + (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || + (IgnoreReorder && TE->Idx == 0)) + return true; + if (TE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(TE); + if (It != GathersToOrders.end()) + return !It->second.empty(); + return true; + } + return false; + }; + for (const EdgeInfo &EI : OpTE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + if (!VisitedUsers.insert(UserTE).second) + continue; + // May reorder user node if it requires reordering, has reused + // scalars, is an alternate op vectorize node or its op nodes require + // reordering. + if (AllowsReordering(UserTE)) + continue; + // Check if users allow reordering. + // Currently look up just 1 level of operands to avoid increase of + // the compile time. + // Profitable to reorder if definitely more operands allow + // reordering rather than those with natural order. + ArrayRef> Ops = Users[UserTE]; + if (static_cast(count_if( + Ops, [UserTE, &AllowsReordering]( + const std::pair &Op) { + return AllowsReordering(Op.second) && + all_of(Op.second->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { + return EI.UserTE == UserTE; + }); + })) <= Ops.size() / 2) + ++Res.first->second; } - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += - OpTE->UserTreeIndices.size(); - assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0."); - --OrdersUses[{}]; } // If no orders - skip current nodes and jump to the next one, if any. 
if (OrdersUses.empty()) { @@ -3452,7 +4165,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { OrderedEntries.remove(TE); if (!VisitedOps.insert(TE).second) continue; - if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { // Just reorder reuses indices. reorderReuses(TE->ReuseShuffleIndices, Mask); continue; @@ -3464,6 +4177,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."); reorderOrder(TE->ReorderIndices, Mask); + if (IgnoreReorder && TE == VectorizableTree.front().get()) + IgnoreReorder = false; } // For gathers just need to reorder its scalars. for (TreeEntry *Gather : GatherOps) { @@ -3554,90 +4269,282 @@ void BoUpSLP::buildExternalUses( } } - // Ignore users in the user ignore list. - if (is_contained(UserIgnoreList, UserInst)) - continue; + // Ignore users in the user ignore list. + if (UserIgnoreList && UserIgnoreList->contains(UserInst)) + continue; + + LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " + << Lane << " from " << *Scalar << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + } + } + } +} + +DenseMap> +BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { + DenseMap> PtrToStoresMap; + for (unsigned Lane : seq(0, TE->Scalars.size())) { + Value *V = TE->Scalars[Lane]; + // To save compilation time we don't visit if we have too many users. + static constexpr unsigned UsersLimit = 4; + if (V->hasNUsesOrMore(UsersLimit)) + break; + + // Collect stores per pointer object. + for (User *U : V->users()) { + auto *SI = dyn_cast(U); + if (SI == nullptr || !SI->isSimple() || + !isValidElementType(SI->getValueOperand()->getType())) + continue; + // Skip entry if already + if (getTreeEntry(U)) + continue; + + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + auto &StoresVec = PtrToStoresMap[Ptr]; + // For now just keep one store per pointer object per lane. + // TODO: Extend this to support multiple stores per pointer per lane + if (StoresVec.size() > Lane) + continue; + // Skip if in different BBs. + if (!StoresVec.empty() && + SI->getParent() != StoresVec.back()->getParent()) + continue; + // Make sure that the stores are of the same type. + if (!StoresVec.empty() && + SI->getValueOperand()->getType() != + StoresVec.back()->getValueOperand()->getType()) + continue; + StoresVec.push_back(SI); + } + } + return PtrToStoresMap; +} + +bool BoUpSLP::CanFormVector(const SmallVector &StoresVec, + OrdersType &ReorderIndices) const { + // We check whether the stores in StoreVec can form a vector by sorting them + // and checking whether they are consecutive. + + // To avoid calling getPointersDiff() while sorting we create a vector of + // pairs {store, offset from first} and sort this instead. + SmallVector, 4> StoreOffsetVec(StoresVec.size()); + StoreInst *S0 = StoresVec[0]; + StoreOffsetVec[0] = {S0, 0}; + Type *S0Ty = S0->getValueOperand()->getType(); + Value *S0Ptr = S0->getPointerOperand(); + for (unsigned Idx : seq(1, StoresVec.size())) { + StoreInst *SI = StoresVec[Idx]; + Optional Diff = + getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), + SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + // We failed to compare the pointers so just abandon this StoresVec. + if (!Diff) + return false; + StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; + } + + // Sort the vector based on the pointers. 
We create a copy because we may + // need the original later for calculating the reorder (shuffle) indices. + stable_sort(StoreOffsetVec, [](const std::pair &Pair1, + const std::pair &Pair2) { + int Offset1 = Pair1.second; + int Offset2 = Pair2.second; + return Offset1 < Offset2; + }); + + // Check if the stores are consecutive by checking if their difference is 1. + for (unsigned Idx : seq(1, StoreOffsetVec.size())) + if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1) + return false; + + // Calculate the shuffle indices according to their offset against the sorted + // StoreOffsetVec. + ReorderIndices.reserve(StoresVec.size()); + for (StoreInst *SI : StoresVec) { + unsigned Idx = find_if(StoreOffsetVec, + [SI](const std::pair &Pair) { + return Pair.first == SI; + }) - + StoreOffsetVec.begin(); + ReorderIndices.push_back(Idx); + } + // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in + // reorderTopToBottom() and reorderBottomToTop(), so we are following the + // same convention here. + auto IsIdentityOrder = [](const OrdersType &Order) { + for (unsigned Idx : seq(0, Order.size())) + if (Idx != Order[Idx]) + return false; + return true; + }; + if (IsIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + + return true; +} - LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); - } - } +#ifndef NDEBUG +LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { + for (unsigned Idx : Order) + dbgs() << Idx << ", "; + dbgs() << "\n"; +} +#endif + +SmallVector +BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { + unsigned NumLanes = TE->Scalars.size(); + + DenseMap> PtrToStoresMap = + collectUserStores(TE); + + // Holds the reorder indices for each candidate store vector that is a user of + // the current TreeEntry. + SmallVector ExternalReorderIndices; + + // Now inspect the stores collected per pointer and look for vectorization + // candidates. For each candidate calculate the reorder index vector and push + // it into `ExternalReorderIndices` + for (const auto &Pair : PtrToStoresMap) { + auto &StoresVec = Pair.second; + // If we have fewer than NumLanes stores, then we can't form a vector. + if (StoresVec.size() != NumLanes) + continue; + + // If the stores are not consecutive then abandon this StoresVec. + OrdersType ReorderIndices; + if (!CanFormVector(StoresVec, ReorderIndices)) + continue; + + // We now know that the scalars in StoresVec can form a vector instruction, + // so set the reorder indices. + ExternalReorderIndices.push_back(ReorderIndices); } + return ExternalReorderIndices; } void BoUpSLP::buildTree(ArrayRef Roots, - ArrayRef UserIgnoreLst) { + const SmallDenseSet &UserIgnoreLst) { deleteTree(); - UserIgnoreList = UserIgnoreLst; + UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); } -namespace { -/// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; -} // anonymous namespace - -/// Checks if the given array of loads can be represented as a vectorized, -/// scatter or just simple gather. 
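// Editor's aside: a hedged sketch, in plain C++, of how CanFormVector()
// above derives ReorderIndices: each store's index is its rank in the
// offset-sorted sequence, and an identity permutation is canonicalized to
// an empty order, matching the reorderTopToBottom()/reorderBottomToTop()
// convention. Function and variable names are illustrative only.
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<unsigned> reorderFromOffsets(const std::vector<int> &Offsets) {
  // Sorted[pos] = index of the element that belongs at position pos.
  std::vector<unsigned> Sorted(Offsets.size());
  std::iota(Sorted.begin(), Sorted.end(), 0u);
  std::stable_sort(Sorted.begin(), Sorted.end(), [&](unsigned A, unsigned B) {
    return Offsets[A] < Offsets[B];
  });
  // Invert: Order[i] = rank of element i in the sorted sequence.
  std::vector<unsigned> Order(Offsets.size());
  for (unsigned Pos = 0; Pos < Sorted.size(); ++Pos)
    Order[Sorted[Pos]] = Pos;
  // Canonicalize the identity order to an empty vector, as the patch does.
  if (std::is_sorted(Offsets.begin(), Offsets.end()))
    Order.clear();
  return Order;
}

// Stores at offsets {8, 0, 4} (memory order: elements 1, 2, 0) yield the
// reorder indices {2, 0, 1}; offsets {0, 4, 8} yield the empty order.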
-static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps) { - // Check that a vectorized load would load the same memory as a scalar - // load. For example, we don't want to vectorize loads that are smaller - // than 8-bit. Even though we have a packed struct {} LLVM - // treats loading/storing it as an i8 struct. If we vectorize loads/stores - // from such a struct, we read/write packed bits disagreeing with the - // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) - return LoadsState::Gather; +void BoUpSLP::buildTree(ArrayRef Roots) { + deleteTree(); + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - PointerOps.clear(); - PointerOps.resize(VL.size()); - auto *POIter = PointerOps.begin(); +/// \return true if the specified list of values has only one instruction that +/// requires scheduling, false otherwise. +#ifndef NDEBUG +static bool needToScheduleSingleInstruction(ArrayRef VL) { + Value *NeedsScheduling = nullptr; for (Value *V : VL) { - auto *L = cast(V); - if (!L->isSimple()) - return LoadsState::Gather; - *POIter = L->getPointerOperand(); - ++POIter; + if (doesNotNeedToBeScheduled(V)) + continue; + if (!NeedsScheduling) { + NeedsScheduling = V; + continue; + } + return false; } + return NeedsScheduling; +} +#endif - Order.clear(); - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref LoadsSubkeyGenerator, + bool AllowAlternate) { + hash_code Key = hash_value(V->getValueID() + 2); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast(V)) { + Key = hash_combine(hash_value(Instruction::Load), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. + if (isa(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa(I) || isa(I)) && + isValidForAlternation(I->getOpcode())) { + if (AllowAlternate) + Key = hash_value(isa(I) ? 1 : 0); + else + Key = hash_combine(hash_value(I->getOpcode()), Key); + SubKey = hash_combine( + hash_value(I->getOpcode()), hash_value(I->getType()), + hash_value(isa(I) + ? I->getType() + : cast(I)->getOperand(0)->getType())); + // For casts, look through the only operand to improve compile time. 
+ if (isa(I)) { + std::pair OpVals = + generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, + /*=AllowAlternate*/ true); + Key = hash_combine(OpVals.first, Key); + SubKey = hash_combine(OpVals.first, SubKey); + } + } else if (auto *CI = dyn_cast(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + if (CI->isCommutative()) + Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), + hash_value(SwapPred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) { + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + } else if (!VFDatabase(*Call).getMappings(*Call).empty()) { + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + } else { + Key = hash_combine(hash_value(Call), Key); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + } + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast(I)) { + if (Gep->getNumOperands() == 2 && isa(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; + SubKey = hash_value(I->getOpcode()); } - Optional Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) - return LoadsState::Vectorize; - Align CommonAlignment = cast(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) - return LoadsState::ScatterVectorize; + Key = hash_combine(hash_value(I->getParent()), Key); } - - return LoadsState::Gather; + return std::make_pair(Key, SubKey); } void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, @@ -3722,10 +4629,84 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || - (isa(S.MainOp) && - !all_of(VL, isVectorLikeInstWithConstOps))) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + // If alternate op node with 2 elements with gathered operands - do not + // vectorize. + auto &&NotProfitableForVectorization = [&S, this, + Depth](ArrayRef VL) { + if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + return false; + if (VectorizableTree.size() < MinTreeSize) + return false; + if (Depth >= RecursionMaxDepth - 1) + return true; + // Check if all operands are extracts, part of vector node or can build a + // regular vectorize node. 
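// Editor's aside: a minimal sketch, outside of LLVM, of the two-level
// grouping that generateKeySubkey() above enables. Values are bucketed
// first by a coarse Key (roughly "what kind of instruction is this") and
// then, within a bucket, by a finer SubKey (opcode, predicate, pointer
// base, ...). The string-based keys and hashing here are assumptions for
// illustration, not the actual hash_combine scheme.
#include <cstddef>
#include <functional>
#include <map>
#include <string>
#include <utility>
#include <vector>

using KeyPair = std::pair<std::size_t, std::size_t>;

KeyPair keySubkey(const std::string &Kind, const std::string &Detail) {
  std::hash<std::string> H;
  return {H(Kind), H(Kind + "/" + Detail)}; // coarse key, finer subkey
}

// Group candidate values so that likely-compatible ones land together;
// each group is a candidate list for building one vectorizable bundle.
std::map<KeyPair, std::vector<std::size_t>>
groupByKey(const std::vector<std::pair<std::string, std::string>> &Vals) {
  std::map<KeyPair, std::vector<std::size_t>> Groups;
  for (std::size_t I = 0; I != Vals.size(); ++I)
    Groups[keySubkey(Vals[I].first, Vals[I].second)].push_back(I);
  return Groups;
}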
+ SmallVector InstsCount(VL.size(), 0); + for (Value *V : VL) { + auto *I = cast(V); + InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) { + return isa(Op) || isVectorLikeInstWithConstOps(Op); + })); + } + bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp); + if ((IsCommutative && + std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || + (!IsCommutative && + all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; }))) + return true; + assert(VL.size() == 2 && "Expected only 2 alternate op instructions."); + SmallVector>> Candidates; + auto *I1 = cast(VL.front()); + auto *I2 = cast(VL.back()); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand(Op)); + if (static_cast(count_if( + Candidates, [this](ArrayRef> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) >= S.MainOp->getNumOperands() / 2) + return false; + if (S.MainOp->getNumOperands() > 2) + return true; + if (IsCommutative) { + // Check permuted operands. + Candidates.clear(); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand((Op + 1) % E)); + if (any_of( + Candidates, [this](ArrayRef> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) + return false; + } + return true; + }; + SmallVector SortedIndices; + BasicBlock *BB = nullptr; + bool AreAllSameInsts = + (S.getOpcode() && allSameBlock(VL)) || + (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + VL.size() > 2 && + all_of(VL, + [&BB](Value *V) { + auto *I = dyn_cast(V); + if (!I) + return doesNotNeedToBeScheduled(V); + if (!BB) + BB = I->getParent(); + return BB == I->getParent() && I->getNumOperands() == 2; + }) && + BB && + sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, + SortedIndices)); + if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts || + (isa( + S.OpValue) && + !all_of(VL, isVectorLikeInstWithConstOps)) || + NotProfitableForVectorization(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3736,12 +4717,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } } } @@ -3779,20 +4762,37 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 
- for (Value *V : VL) { - if (is_contained(UserIgnoreList, V)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; + if (UserIgnoreList && !UserIgnoreList->empty()) { + for (Value *V : VL) { + if (UserIgnoreList && UserIgnoreList->contains(V)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } } } + // Special processing for sorted pointers for ScatterVectorize node with + // constant indeces only. + if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) && + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) { + assert(S.OpValue->getType()->isPointerTy() && + count_if(VL, [](Value *V) { return isa(V); }) >= + 2 && + "Expected pointers only."); + // Reset S to make it GetElementPtr kind of node. + const auto *It = find_if(VL, [](Value *V) { return isa(V); }); + assert(It != VL.end() && "Expected at least one GEP."); + S = getSameOpcode(*It); + } + // Check that all of the users of the scalars that we want to vectorize are // schedulable. auto *VL0 = cast(S.OpValue); - BasicBlock *BB = VL0->getParent(); + BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with @@ -3810,9 +4810,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (!BSRef) BSRef = std::make_unique(BB); - BlockScheduling &BS = *BSRef.get(); + BlockScheduling &BS = *BSRef; Optional Bundle = BS.tryScheduleBundle(VL, this, S); +#ifdef EXPENSIVE_CHECKS + // Make sure we didn't break any internal invariants + BS.verify(); +#endif if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || @@ -3832,10 +4836,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check for terminator values (e.g. invoke). for (Value *V : VL) - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { - Instruction *Term = dyn_cast( - cast(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + for (Value *Incoming : cast(V)->incoming_values()) { + Instruction *Term = dyn_cast(Incoming); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -3918,13 +4920,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that we have a buildvector and not a shuffle of 2 or more // different vectors. 
ValueSet SourceVectors; - int MinIdx = std::numeric_limits::max(); for (Value *V : VL) { SourceVectors.insert(cast(V)->getOperand(0)); - Optional Idx = *getInsertIndex(V, 0); - if (!Idx || *Idx == UndefMaskElem) - continue; - MinIdx = std::min(MinIdx, *Idx); + assert(getInsertIndex(V) != None && "Non-constant or undef index?"); } if (count_if(VL, [&SourceVectors](Value *V) { @@ -3946,10 +4944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, decltype(OrdCompare)> Indices(OrdCompare); for (int I = 0, E = VL.size(); I < E; ++I) { - Optional Idx = *getInsertIndex(VL[I], 0); - if (!Idx || *Idx == UndefMaskElem) - continue; - Indices.emplace(*Idx, I); + unsigned Idx = *getInsertIndex(VL[I]); + Indices.emplace(Idx, I); } OrdersType CurrentOrder(VL.size(), VL.size()); bool IsIdentity = true; @@ -3985,7 +4981,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, SmallVector PointerOps; OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: if (CurrentOrder.empty()) { @@ -4166,7 +5162,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (Value *V : VL) { - if (cast(V)->getNumOperands() != 2) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (I->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4177,9 +5176,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. - Type *Ty0 = VL0->getOperand(0)->getType(); + Type *Ty0 = cast(VL0)->getSourceElementType(); for (Value *V : VL) { - Type *CurTy = cast(V)->getOperand(0)->getType(); + auto *GEP = dyn_cast(V); + if (!GEP) + continue; + Type *CurTy = GEP->getSourceElementType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); @@ -4190,15 +5192,22 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } } + bool IsScatterUser = + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; // We don't combine GEPs with non-constant indexes. Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { - auto Op = cast(V)->getOperand(1); - if (!isa(Op) || + auto *I = dyn_cast(V); + if (!I) + continue; + auto *Op = I->getOperand(1); + if ((!IsScatterUser && !isa(Op)) || (Op->getType() != Ty1 && - Op->getType()->getScalarSizeInBits() > - DL->getIndexSizeInBits( - V->getType()->getPointerAddressSpace()))) { + ((IsScatterUser && !isa(Op)) || + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace())))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -4213,9 +5222,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); SmallVector Operands(2); // Prepare the operand vector for pointer operands. 
- for (Value *V : VL) - Operands.front().push_back( - cast(V)->getPointerOperand()); + for (Value *V : VL) { + auto *GEP = dyn_cast(V); + if (!GEP) { + Operands.front().push_back(V); + continue; + } + Operands.front().push_back(GEP->getPointerOperand()); + } TE->setOperand(0, Operands.front()); // Need to cast all indices to the same type before vectorization to // avoid crash. @@ -4226,9 +5240,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); Type *Ty = all_of(VL, [VL0Ty, IndexIdx](Value *V) { - return VL0Ty == cast(V) - ->getOperand(IndexIdx) - ->getType(); + auto *GEP = dyn_cast(V); + if (!GEP) + return true; + return VL0Ty == GEP->getOperand(IndexIdx)->getType(); }) ? VL0Ty : DL->getIndexType(cast(VL0) @@ -4236,10 +5251,19 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ->getScalarType()); // Prepare the operand vector. for (Value *V : VL) { - auto *Op = cast(V)->getOperand(IndexIdx); - auto *CI = cast(Op); - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + auto *I = dyn_cast(V); + if (!I) { + Operands.back().push_back( + ConstantInt::get(Ty, 0, /*isSigned=*/false)); + continue; + } + auto *Op = I->getOperand(IndexIdx); + auto *CI = dyn_cast(Op); + if (!CI) + Operands.back().push_back(Op); + else + Operands.back().push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); } TE->setOperand(IndexIdx, Operands.back()); @@ -4345,7 +5369,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, unsigned NumArgs = CI->arg_size(); SmallVector ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) - if (hasVectorInstrinsicScalarOpd(ID, j)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast(V); @@ -4364,7 +5388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. for (unsigned j = 0; j != NumArgs; ++j) { - if (hasVectorInstrinsicScalarOpd(ID, j)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); @@ -4397,7 +5421,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { // For scalar operands no need to to create an entry since no need to // vectorize it. - if (hasVectorInstrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) continue; ValueList Operands; // Prepare the operand vector. 
@@ -4434,6 +5458,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } else { CmpInst::Predicate P0 = CI->getPredicate(); CmpInst::Predicate AltP0 = cast(S.AltOp)->getPredicate(); + assert(P0 != AltP0 && + "Expected different main/alternate predicates."); CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); Value *BaseOp0 = VL0->getOperand(0); Value *BaseOp1 = VL0->getOperand(1); @@ -4443,16 +5469,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *Cmp = cast(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - if ((P0 == CurrentPred && - !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || - (P0 == CurrentPredSwapped && - !areCompatibleCmpOps(BaseOp0, BaseOp1, RHS, LHS))) + CmpInst::Predicate CurrentPred = Cmp->getPredicate(); + if (P0 == AltP0Swapped) { + if (CI != Cmp && S.AltOp != Cmp && + ((P0 == CurrentPred && + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || + (AltP0 == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) std::swap(LHS, RHS); - } else if (!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) { + } else if (P0 != CurrentPred && AltP0 != CurrentPred) { std::swap(LHS, RHS); } Left.push_back(LHS); @@ -4602,7 +5627,9 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); + return ScalarToTreeEntry.count(U) > 0 || + isVectorLikeInstWithConstOps(U) || + (isa(U) && MustGather.contains(U)); }); } @@ -4659,19 +5686,21 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a shuffle // to extract the values into a vector register. + SmallVector RegMask(EltsPerVector, UndefMaskElem); for (auto *V : VL) { ++Idx; - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == UndefMaskElem) - continue; - // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, UndefMaskElem); AllConsecutive = true; continue; } + // Need to exclude undefs from analysis. + if (isa(V) || Mask[Idx] == UndefMaskElem) + continue; + // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast(V)); @@ -4679,6 +5708,7 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); AllConsecutive &= PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } if (AllConsecutive) @@ -4690,10 +5720,10 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, // If we have a series of extracts which are not consecutive and hence // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the a vector with EltsPerVector elements. + // cost to extract the vector with EltsPerVector elements. 
Cost += TTI.getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); } return Cost; } @@ -4701,12 +5731,12 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void -buildSuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, - ArrayRef ReusesIndices, - const function_ref IsAltOp, - SmallVectorImpl &Mask, - SmallVectorImpl *OpScalars = nullptr, - SmallVectorImpl *AltScalars = nullptr) { +buildShuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, + ArrayRef ReusesIndices, + const function_ref IsAltOp, + SmallVectorImpl &Mask, + SmallVectorImpl *OpScalars = nullptr, + SmallVectorImpl *AltScalars = nullptr) { unsigned Sz = VL.size(); Mask.assign(Sz, UndefMaskElem); SmallVector OrderMask; @@ -4736,6 +5766,29 @@ buildSuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, } } +/// Checks if the specified instruction \p I is an alternate operation for the +/// given \p MainOp and \p AltOp instructions. +static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp) { + if (auto *CI0 = dyn_cast(MainOp)) { + auto *AltCI0 = cast(AltOp); + auto *CI = cast(I); + CmpInst::Predicate P0 = CI0->getPredicate(); + CmpInst::Predicate AltP0 = AltCI0->getPredicate(); + assert(P0 != AltP0 && "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + CmpInst::Predicate CurrentPred = CI->getPredicate(); + if (P0 == AltP0Swapped) + return I == AltCI0 || + (I != MainOp && + !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))); + return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + } + return I->getOpcode() == AltOp->getOpcode(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -4849,7 +5902,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector Entries; Optional Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { InstructionCost GatherCost = 0; if (ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized @@ -4885,7 +5938,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector Mask; Optional ShuffleKind = isFixedVectorShuffle(VL, Mask); - if (ShuffleKind.hasValue()) { + if (ShuffleKind) { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. @@ -4903,7 +5956,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/None, /*Index=*/0, + /*SubTp=*/nullptr, /*Args=*/VL[0]); } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) @@ -4927,8 +5982,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector PointerOps; OrdersType CurrentOrder; - LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, - *SE, CurrentOrder, PointerOps); + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, + CurrentOrder, PointerOps); switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: @@ -5018,7 +6074,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); - assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(E->getOpcode() && + ((allSameType(VL) && allSameBlock(VL)) || + (E->getOpcode() == Instruction::GetElementPtr && + E->getMainOp()->getType()->isPointerTy())) && + "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -5090,30 +6150,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); auto *SrcVecTy = cast(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + for (Value *V : VL.drop_front()) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = + (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * + VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector. + if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. 
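The sizing logic above narrows the insert-cost computation to a register-aligned window that covers all insert positions. A standalone sketch of that arithmetic with hypothetical numbers (std::bit_ceil standing in for LLVM's PowerOf2Ceil; the NumOfParts > 0 guard and the mask-alignment fallback are omitted):

// Standalone sketch: sizing the register-aligned window an insertelement
// bundle touches, mirroring the OffsetBeg/OffsetEnd/VecScalarsSz arithmetic.
#include <algorithm>
#include <bit>     // std::bit_ceil (C++20)
#include <cstdio>
#include <vector>

int main() {
  unsigned NumElts = 16;    // elements in the destination vector
  unsigned NumOfParts = 2;  // registers the target splits the vector into
  std::vector<unsigned> InsertIdx = {5, 6, 7, 8}; // lanes the bundle writes
  unsigned OffsetBeg = InsertIdx.front(), OffsetEnd = InsertIdx.front();
  for (unsigned Idx : InsertIdx) {
    OffsetBeg = std::min(OffsetBeg, Idx);
    OffsetEnd = std::max(OffsetEnd, Idx);
  }
  unsigned VecScalarsSz = std::bit_ceil((NumElts + NumOfParts - 1) / NumOfParts);
  // Number of register-sized pieces the touched range spans, in elements.
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * VecScalarsSz;
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
  // 16 elements in 2 parts: VecScalarsSz=8; lanes 5..8 cross a register
  // boundary, so the window is the whole vector (VecSz=16, Offset=0).
  std::printf("VecScalarsSz=%u VecSz=%u Offset=%u\n", VecScalarsSz, VecSz,
              Offset);
  return 0;
}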
SmallVector Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); } - unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector PrevMask(NumElts, UndefMaskElem); + SmallVector PrevMask(InsertVecSz, UndefMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Optional InsertIdx = getInsertIndex(VL[PrevMask[I]], 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - DemandedElts.setBit(*InsertIdx); - IsIdentity &= *InsertIdx - Offset == I; - Mask[*InsertIdx - Offset] = I; + unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); + DemandedElts.setBit(InsertIdx); + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -5121,32 +6211,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { - // FIXME: Replace with SK_InsertSubvector once it is properly supported. - unsigned Sz = PowerOf2Ceil(Offset + NumScalars); - Cost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(SrcVecTy->getElementType(), Sz)); - } else if (!IsIdentity) { - auto *FirstInsert = - cast(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast(V)->getOperand(0)); - })); - if (isUndefVector(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. 
+ if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts && + !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, + None, OffsetBeg - Offset, InsertVecTy); } else { - SmallVector InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } - return Cost; } case Instruction::ZExt: @@ -5227,9 +6326,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // If the selects are the only uses of the compares, they will be dead // and we can adjust the cost by removing their cost. if (IntrinsicAndUse.second) - IntrinsicCost -= - TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, + MaskTy, VecPred, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); @@ -5309,7 +6407,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + any_of(VL, + [](Value *V) { + return isa(V) && + !isConstant( + cast(V)->getOperand(1)); + }) + ? TargetTransformInfo::OK_AnyValue + : TargetTransformInfo::OK_UniformConstantValue; InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); @@ -5340,7 +6445,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Align CommonAlignment = Alignment; for (Value *V : VL) CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); + std::min(CommonAlignment, cast(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, VL0); @@ -5458,39 +6563,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector Mask; - buildSuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - // Alternate cmps have same/swapped predicate as main cmps but - // different order of compatible operands. 
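The any_of above downgrades the second-operand kind from "uniform constant" to "any value" as soon as a single GEP in the bundle has a non-constant index, since the arithmetic cost model may differ for variable operands. A standalone sketch of the same classification (Gep and OperandKind are stand-ins, not LLVM types):

// Standalone sketch: choosing the operand-value kind for the whole bundle
// based on whether any member has a non-constant second operand.
#include <algorithm>
#include <cstdio>
#include <vector>

enum OperandKind { OK_AnyValue, OK_UniformConstantValue };

struct Gep { bool ConstIdx; }; // stand-in: does this GEP use a constant index?

int main() {
  std::vector<Gep> VL = {{true}, {true}, {false}}; // hypothetical bundle
  OperandKind Op2VK =
      std::any_of(VL.begin(), VL.end(),
                  [](const Gep &G) { return !G.ConstIdx; })
          ? OK_AnyValue
          : OK_UniformConstantValue;
  std::printf("%d\n", Op2VK == OK_AnyValue); // 1: one index is variable
  return 0;
}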
- return !(
- (P0 == CurrentPred &&
- areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
- I->getOperand(0), I->getOperand(1))) ||
- (P0 == CurrentPredSwapped &&
- areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
- I->getOperand(1), I->getOperand(0))));
- }
- return CurrentPred != P0 && CurrentPredSwapped != P0;
- }
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- CommonCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
+ if (E->ReuseShuffleIndices.empty()) {
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+ } else {
+ SmallVector Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
+ }
 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
 return CommonCost + VecCost - ScalarCost;
 }
@@ -5618,7 +6705,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
 // No need to vectorize inserts of gathered values.
 if (VectorizableTree.size() == 2 &&
 isa(VectorizableTree[0]->Scalars[0]) &&
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ (VectorizableTree[1]->getVectorFactor() <= 2 ||
+ !(isSplat(VectorizableTree[1]->Scalars) ||
+ allConstant(VectorizableTree[1]->Scalars))))
 return true;
 // We can vectorize the tree if its size is greater than or equal to the
@@ -5748,20 +6838,26 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
 return false;
 auto *IE1 = VU;
 auto *IE2 = V;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
 // Go through the vector operand of insertelement instructions trying to find
 // either VU as the original vector for IE2 or V as the original vector for
 // IE1.
 do {
- if (IE2 == VU || IE1 == V)
- return true;
+ if (IE2 == VU)
+ return VU->hasOneUse();
+ if (IE1 == V)
+ return V->hasOneUse();
 if (IE1) {
- if (IE1 != VU && !IE1->hasOneUse())
+ if ((IE1 != VU && !IE1->hasOneUse()) ||
+ getInsertIndex(IE1).value_or(Idx2) == Idx2)
 IE1 = nullptr;
 else
 IE1 = dyn_cast(IE1->getOperand(0));
 }
 if (IE2) {
- if (IE2 != V && !IE2->hasOneUse())
+ if ((IE2 != V && !IE2->hasOneUse()) ||
+ getInsertIndex(IE2).value_or(Idx1) == Idx1)
 IE2 = nullptr;
 else
 IE2 = dyn_cast(IE2->getOperand(0));
@@ -5770,6 +6866,153 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
 return false;
 }
 
+/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
+/// the buildvector sequence.
+static bool isFirstInsertElement(const InsertElementInst *IE1,
+ const InsertElementInst *IE2) {
+ if (IE1 == IE2)
+ return false;
+ const auto *I1 = IE1;
+ const auto *I2 = IE2;
+ const InsertElementInst *PrevI1;
+ const InsertElementInst *PrevI2;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
+ do {
+ if (I2 == IE1)
+ return true;
+ if (I1 == IE2)
+ return false;
+ PrevI1 = I1;
+ PrevI2 = I2;
+ if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
+ getInsertIndex(I1).value_or(Idx2) != Idx2)
+ I1 = dyn_cast(I1->getOperand(0));
+ if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
+ getInsertIndex(I2).value_or(Idx1) != Idx1)
+ I2 = dyn_cast(I2->getOperand(0));
+ } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
+ llvm_unreachable("Two different buildvectors not expected.");
+}
+
+namespace {
+/// Returns the incoming Value * if the requested type is Value * too, or a
+/// default value otherwise.
+struct ValueSelect {
+ template
+ static typename std::enable_if::value, Value *>::type
+ get(Value *V) {
+ return V;
+ }
+ template
+ static typename std::enable_if::value, U>::type
+ get(Value *) {
+ return U();
+ }
+};
+} // namespace
+
+/// Does the analysis of the provided shuffle masks and performs the requested
+/// actions on the vectors with the given shuffle masks. It tries to do it in
+/// several steps.
+/// 1. If the Base vector is not an undef vector, resize the very first mask to
+/// have a common VF and perform the action for 2 input vectors (including the
+/// non-undef Base). Other shuffle masks are combined with the result of the
+/// first stage and processed as a shuffle of 2 elements.
+/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
+/// the action only for 1 vector with the given mask, if it is not the identity
+/// mask.
+/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
+/// vectors, combining the masks properly between the steps.
+template
+static T *performExtractsShuffleAction(
+ MutableArrayRef>> ShuffleMask, Value *Base,
+ function_ref GetVF,
+ function_ref(T *, ArrayRef)> ResizeAction,
+ function_ref, ArrayRef)> Action) {
+ assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
+ SmallVector Mask(ShuffleMask.begin()->second);
+ auto VMIt = std::next(ShuffleMask.begin());
+ T *Prev = nullptr;
+ bool IsBaseNotUndef = !isUndefVector(Base);
+ if (IsBaseNotUndef) {
+ // Base is not undef, need to combine it with the next subvectors.
+ std::pair Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
+ if (Mask[Idx] == UndefMaskElem)
+ Mask[Idx] = Idx;
+ else
+ Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
+ }
+ auto *V = ValueSelect::get(Base);
+ (void)V;
+ assert((!V || GetVF(V) == Mask.size()) &&
+ "Expected base vector of VF number of elements.");
+ Prev = Action(Mask, {nullptr, Res.first});
+ } else if (ShuffleMask.size() == 1) {
+ // Base is undef and only 1 vector is shuffled - perform the action only
+ // for a single vector, if the mask is not the identity mask.
+ std::pair Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ if (Res.second)
+ // Identity mask is found.
+ Prev = Res.first;
+ else
+ Prev = Action(Mask, {ShuffleMask.begin()->first});
+ } else {
+ // Base is undef and at least 2 input vectors are shuffled - perform
+ // shuffles of 2 vectors step by step, combining the shuffles between the
+ // steps.
+ unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
+ unsigned Vec2VF = GetVF(VMIt->first);
+ if (Vec1VF == Vec2VF) {
+ // No need to resize the input vectors since they are of the same size;
+ // we can shuffle them directly.
+ ArrayRef SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = SecMask[I] + Vec1VF;
+ }
+ }
+ Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
+ } else {
+ // Vectors of different sizes - resize and reshuffle.
+ std::pair Res1 =
+ ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair Res2 = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem) {
+ assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Res1.second)
+ Mask[I] = I;
+ } else if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
+ }
+ }
+ Prev = Action(Mask, {Res1.first, Res2.first});
+ }
+ VMIt = std::next(VMIt);
+ }
+ // Perform requested actions for the remaining masks/vectors.
+ for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
+ // Shuffle other input vectors, if any.
+ std::pair Res = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ "Multiple uses of scalars.");
+ Mask[I] = (Res.second ? I : SecMask[I]) + VF;
+ } else if (Mask[I] != UndefMaskElem) {
+ Mask[I] = I;
+ }
+ }
+ Prev = Action(Mask, {Prev, Res.first});
+ }
+ return Prev;
+}
+
 InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) {
 InstructionCost Cost = 0;
 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5778,7 +7021,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) {
 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I].get();
+ TreeEntry &TE = *VectorizableTree[I];
 InstructionCost C = getEntryCost(&TE, VectorizedVals);
 Cost += C;
@@ -5790,9 +7033,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) {
 SmallPtrSet ExtractCostCalculated;
 InstructionCost ExtractCost = 0;
- SmallVector VF;
- SmallVector> ShuffleMask;
- SmallVector FirstUsers;
+ SmallVector>> ShuffleMasks;
+ SmallVector> FirstUsers;
 SmallVector DemandedElts;
 for (ExternalUser &EU : ExternalUses) {
 // We only add extract cost once for the same scalar.
@@ -5819,42 +7061,59 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) {
 // to detect it as a final shuffled/identity match.
 if (auto *VU = dyn_cast_or_null(EU.User)) {
 if (auto *FTy = dyn_cast(VU->getType())) {
- Optional InsertIdx = getInsertIndex(VU, 0);
- if (!InsertIdx || *InsertIdx == UndefMaskElem)
- continue;
- auto *It = find_if(FirstUsers, [VU](Value *V) {
- return areTwoInsertFromSameBuildVector(VU,
- cast(V));
- });
- int VecId = -1;
- if (It == FirstUsers.end()) {
- VF.push_back(FTy->getNumElements());
- ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
- // Find the insertvector, vectorized in tree, if any.
- Value *Base = VU;
- while (isa(Base)) {
- // Build the mask for the vectorized insertelement instructions.
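When the two shuffled inputs have the same VF, performExtractsShuffleAction above folds the second mask into the first by offsetting its lane indices by VF, producing a single two-source mask. A standalone sketch with hypothetical masks:

// Standalone sketch: folding a second shuffle mask into a two-source mask,
// as done above when both inputs have the same vector factor.
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int Undef = -1;
  unsigned Vec1VF = 4;
  std::vector<int> Mask    = {0, Undef, 2, Undef}; // lanes from vector 1
  std::vector<int> SecMask = {Undef, 1, Undef, 3}; // lanes from vector 2
  for (unsigned I = 0; I < Mask.size(); ++I) {
    if (SecMask[I] != Undef) {
      assert(Mask[I] == Undef && "Multiple uses of scalars.");
      Mask[I] = SecMask[I] + Vec1VF; // second source starts at index VF
    }
  }
  for (int M : Mask)
    std::printf("%d ", M); // 0 5 2 7 -> shufflevector(v1, v2) mask
  return 0;
}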
- if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast(Base); - do { - int Idx = E->findLaneForValue(Base); - ShuffleMask.back()[Idx] = Idx; - Base = cast(Base)->getOperand(0); - } while (E == getTreeEntry(Base)); - break; + Optional InsertIdx = getInsertIndex(VU); + if (InsertIdx) { + const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); + auto *It = + find_if(FirstUsers, + [VU](const std::pair &Pair) { + return areTwoInsertFromSameBuildVector( + VU, cast(Pair.first)); + }); + int VecId = -1; + if (It == FirstUsers.end()) { + (void)ShuffleMasks.emplace_back(); + SmallVectorImpl &Mask = ShuffleMasks.back()[ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (auto *IEBase = dyn_cast(Base)) { + if (IEBase != EU.User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) + break; + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(IEBase)) { + VU = IEBase; + do { + IEBase = cast(Base); + int Idx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[Idx] = Idx; + Base = IEBase->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast(Base)->getOperand(0); } - Base = cast(Base)->getOperand(0); + FirstUsers.emplace_back(VU, ScalarTE); + DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); + VecId = FirstUsers.size() - 1; + } else { + if (isFirstInsertElement(VU, cast(It->first))) + It->first = VU; + VecId = std::distance(FirstUsers.begin(), It); } - FirstUsers.push_back(VU); - DemandedElts.push_back(APInt::getZero(VF.back())); - VecId = FirstUsers.size() - 1; - } else { - VecId = std::distance(FirstUsers.begin(), It); + int InIdx = *InsertIdx; + SmallVectorImpl &Mask = ShuffleMasks[VecId][ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[InIdx] = EU.Lane; + DemandedElts[VecId].setBit(InIdx); + continue; } - int Idx = *InsertIdx; - ShuffleMask[VecId][Idx] = EU.Lane; - DemandedElts[VecId].setBit(Idx); - continue; } } @@ -5878,86 +7137,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - if (FirstUsers.size() == 1) { - int Limit = ShuffleMask.front().size() * 2; - if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { - InstructionCost C = TTI->getShuffleCost( + auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef Mask) { + InstructionCost C = 0; + unsigned VF = Mask.size(); + unsigned VecVF = TE->getVectorFactor(); + if (VF != VecVF && + (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }) || + (all_of(Mask, + [VF](int Idx) { return Idx < 2 * static_cast(VF); }) && + !ShuffleVectorInst::isIdentityMask(Mask)))) { + SmallVector OrigMask(VecVF, UndefMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast(FirstUsers.front()->getType()), - ShuffleMask.front()); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask); + LLVM_DEBUG( 
+ dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + return std::make_pair(TE, true); } + return std::make_pair(TE, false); + }; + // Calculate the cost of the reshuffled vectors, if any. + for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + Value *Base = cast(FirstUsers[I].first)->getOperand(0); + unsigned VF = ShuffleMasks[I].begin()->second.size(); + auto *FTy = FixedVectorType::get( + cast(FirstUsers[I].first->getType())->getElementType(), VF); + auto Vector = ShuffleMasks[I].takeVector(); + auto &&EstimateShufflesCost = [this, FTy, + &Cost](ArrayRef Mask, + ArrayRef TEs) { + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected exactly 1 or 2 tree entries."); + if (TEs.size() == 1) { + int Limit = 2 * Mask.size(); + if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || + !ShuffleVectorInst::isIdentityMask(Mask)) { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement " + "external users.\n"; + TEs.front()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + } else { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users.\n"; + if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + return TEs.back(); + }; + (void)performExtractsShuffleAction( + makeMutableArrayRef(Vector.data(), Vector.size()), Base, + [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, + EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers.front()->getType()), - DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } else if (FirstUsers.size() >= 2) { - unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); - // Combined masks of the first 2 vectors. 
- SmallVector CombinedMask(MaxVF, UndefMaskElem); - copy(ShuffleMask.front(), CombinedMask.begin()); - APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); - auto *VecTy = FixedVectorType::get( - cast(FirstUsers.front()->getType())->getElementType(), - MaxVF); - for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { - if (ShuffleMask[1][I] != UndefMaskElem) { - CombinedMask[I] = ShuffleMask[1][I] + MaxVF; - CombinedDemandedElts.setBit(I); - } - } - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); + cast(FirstUsers[I].first->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); Cost -= InsertCost; - for (int I = 2, E = FirstUsers.size(); I < E; ++I) { - // Other elements - permutation of 2 vectors (the initial one and the - // next Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int Mask = ShuffleMask[I][Idx]; - if (Mask != UndefMaskElem) - CombinedMask[Idx] = MaxVF + Mask; - else if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - } - for (unsigned Idx = VF; Idx < MaxVF; ++Idx) - if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } } #ifndef NDEBUG @@ -6050,6 +7298,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. 
@@ -6109,17 +7363,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask,
 return None;
 }
-InstructionCost
-BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const DenseSet &ShuffledIndices,
- bool NeedToShuffle) const {
- unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getZero(NumElts);
- for (unsigned I = 0; I < NumElts; ++I)
- if (!ShuffledIndices.count(I))
- DemandedElts.setBit(I);
+InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
+ const APInt &ShuffledIndices,
+ bool NeedToShuffle) const {
 InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
 /*Extract*/ false);
 if (NeedToShuffle)
 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
@@ -6136,19 +7384,19 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL) const {
 // Find the cost of inserting/extracting values from the vector.
 // Check if the same elements are inserted several times and count them as
 // shuffle candidates.
- DenseSet ShuffledElements;
+ APInt ShuffledElements = APInt::getZero(VL.size());
 DenseSet UniqueElements;
 // Iterate in reverse order to consider insert elements with the high cost.
 for (unsigned I = VL.size(); I > 0; --I) {
 unsigned Idx = I - 1;
 // No need to shuffle duplicates for constants.
 if (isConstant(VL[Idx])) {
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
 continue;
 }
 if (!UniqueElements.insert(VL[Idx]).second) {
 DuplicateNonConst = true;
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
 }
 }
 return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
@@ -6173,14 +7421,83 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL,
 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
 // Get the basic block this bundle is in. All instructions in the bundle
- // should be in this block.
+ // should be in this block (except for extractelement-like instructions with
+ // constant indices).
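Switching ShuffledIndices from a DenseSet to an APInt lets the demanded-element set be computed as a plain bitwise complement (~ShuffledIndices). A standalone sketch of the same idea, using std::bitset in place of APInt and hypothetical duplicate lanes:

// Standalone sketch: tracking "already shuffled" gather lanes as a bitset so
// the demanded-element set is just its complement.
#include <bitset>
#include <iostream>
#include <vector>

int main() {
  constexpr unsigned VF = 8;
  std::bitset<VF> Shuffled; // lanes served by a shuffle (dups/constants)
  std::vector<unsigned> Duplicated = {2, 5};
  for (unsigned Idx : Duplicated)
    Shuffled.set(Idx);
  std::bitset<VF> Demanded = ~Shuffled; // lanes that still need an insert
  std::cout << Demanded << '\n';        // 11011011 (bit 7 first)
  return 0;
}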
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa(V)) + return true; auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); })); + auto &&FindLastInst = [E, Front, this, &BB]() { + Instruction *LastInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (LastInst->getParent() == I->getParent()) { + if (LastInst->comesBefore(I)) + LastInst = I; + continue; + } + assert(isVectorLikeInstWithConstOps(LastInst) && + isVectorLikeInstWithConstOps(I) && + "Expected vector-like insts only."); + if (!DT->isReachableFromEntry(LastInst->getParent())) { + LastInst = I; + continue; + } + if (!DT->isReachableFromEntry(I->getParent())) + continue; + auto *NodeA = DT->getNode(LastInst->getParent()); + auto *NodeB = DT->getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) + LastInst = I; + } + BB = LastInst->getParent(); + return LastInst; + }; + + auto &&FindFirstInst = [E, Front]() { + Instruction *FirstInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (I->comesBefore(FirstInst)) + FirstInst = I; + } + return FirstInst; + }; + + // Set the insert point to the beginning of the basic block if the entry + // should not be scheduled. + if (E->State != TreeEntry::NeedToGather && + doesNotNeedToSchedule(E->Scalars)) { + Instruction *InsertInst; + if (all_of(E->Scalars, isUsedOutsideBlock)) + InsertInst = FindLastInst(); + else + InsertInst = FindFirstInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa(InsertInst)) + InsertInst = BB->getFirstNonPHI(); + BasicBlock::iterator InsertPt = InsertInst->getIterator(); + Builder.SetInsertPoint(BB, InsertPt); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); + return; + } + // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -6189,8 +7506,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + Value *V = E->isOneOf(E->Scalars.back()); + if (doesNotNeedToBeScheduled(V)) + V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); + auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -6216,19 +7535,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). 
if (!LastInst) { - SmallPtrSet Bundle(E->Scalars.begin(), E->Scalars.end()); - for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) - LastInst = &I; - if (Bundle.empty()) - break; - } + LastInst = FindLastInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa(LastInst)) + LastInst = BB->getFirstNonPHI()->getPrevNode(); } assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(BB, ++LastInst->getIterator()); + Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -6358,8 +7674,15 @@ public: } // namespace Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - unsigned VF = VL.size(); + const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); + // Special processing for GEPs bundle, which may include non-gep values. + if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + const auto *It = + find_if(VL, [](Value *V) { return isa(V); }); + if (It != VL.end()) + S = getSameOpcode(*It); + } if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) if (E->isSame(VL)) { @@ -6414,7 +7737,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { } } - // Check that every instruction appears once in this bundle. + // Can't vectorize this, so simply build a new vector with each lane + // corresponding to the requested value. + return createBuildVector(VL); +} +Value *BoUpSLP::createBuildVector(ArrayRef VL) { + assert(any_of(VectorizableTree, + [VL](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); + }) && + "Non-matching gather node."); + unsigned VF = VL.size(); + // Exploit possible reuse of values across lanes. SmallVector ReuseShuffleIndicies; SmallVector UniqueValues; if (VL.size() > 2) { @@ -6447,6 +7781,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), UndefMaskElem); } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { + if (UniqueValues.empty()) { + assert(all_of(VL, UndefValue::classof) && "Expected list of undefs."); + NumValues = VF; + } ReuseShuffleIndicies.clear(); UniqueValues.clear(); UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); @@ -6486,7 +7824,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector Entries; Optional Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, @@ -6520,14 +7858,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { - assert( - (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && - "PHI reordering is free."); + assert((E->ReorderIndices.empty() || + E != VectorizableTree.front().get() || + !E->UserTreeIndices.empty()) && + "PHI reordering is free."); auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. 
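createBuildVector above exploits reuse across lanes: repeated scalars are vectorized once and re-broadcast through the ReuseShuffleIndicies mask. A standalone sketch of that deduplication with hypothetical scalars (strings standing in for Value pointers):

// Standalone sketch: deduplicating gather lanes into unique values plus a
// reuse mask, the shape ReuseShuffleIndicies takes above.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> VL = {"a", "b", "a", "b"}; // scalars, with repeats
  std::vector<std::string> Unique;
  std::vector<int> Reuse;
  std::map<std::string, int> Pos; // first lane each value occupies
  for (const std::string &V : VL) {
    auto It = Pos.find(V);
    if (It == Pos.end()) {
      int Idx = (int)Unique.size();
      Pos[V] = Idx;
      Unique.push_back(V);
      Reuse.push_back(Idx);
    } else {
      Reuse.push_back(It->second);
    }
  }
  // Vectorize {"a","b"} once, then replicate lanes via the mask 0 1 0 1.
  for (int I : Reuse)
    std::printf("%d ", I);
  return 0;
}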
+ Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6593,7 +7937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { cast(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); - unsigned Offset = *getInsertIndex(VL0, 0); + unsigned Offset = *getInsertIndex(VL0); assert(Offset < NumElts && "Failed to find vector index offset"); // Create shuffle to resize vector @@ -6611,11 +7955,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { Value *Scalar = E->Scalars[PrevMask[I]]; - Optional InsertIdx = getInsertIndex(Scalar, 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - IsIdentity &= *InsertIdx - Offset == I; - Mask[*InsertIdx - Offset] = I; + unsigned InsertIdx = *getInsertIndex(Scalar); + IsIdentity &= InsertIdx - Offset == I; + Mask[InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); @@ -6802,19 +8144,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned AS = LI->getPointerAddressSpace(); Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); // The pointer operand uses an in-tree scalar so we add the new BitCast - // to ExternalUses list to make sure that an extract will be generated - // in the future. + // or LoadInst to ExternalUses list to make sure that an extract will + // be generated in the future. if (TreeEntry *Entry = getTreeEntry(PO)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, cast(VecPtr), FoundLane); + ExternalUses.emplace_back( + PO, PO != VecPtr ? cast(VecPtr) : NewLI, FoundLane); } - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); Value *VecPtr = vectorizeTree(E->getOperand(0)); @@ -6822,7 +8163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = LI->getAlign(); for (Value *V : E->Scalars) CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); + std::min(CommonAlignment, cast(V)->getAlign()); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } Value *V = propagateMetadata(NewLI, E->Scalars); @@ -6847,17 +8188,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, - SI->getAlign()); + StoreInst *ST = + Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast to - // ExternalUses to make sure that an extract will be generated in the - // future. + // The pointer operand uses an in-tree scalar, so add the new BitCast or + // StoreInst to ExternalUses to make sure that an extract will be + // generated in the future. if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { // Find which lane we need to extract. 
unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back( - ExternalUser(ScalarPtr, cast(VecPtr), FoundLane)); + ExternalUses.push_back(ExternalUser( + ScalarPtr, ScalarPtr != VecPtr ? cast(VecPtr) : ST, + FoundLane)); } Value *V = propagateMetadata(ST, E->Scalars); @@ -6879,8 +8221,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); - if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + if (Instruction *I = dyn_cast(V)) { + SmallVector GEPs; + for (Value *V : E->Scalars) { + if (isa(V)) + GEPs.push_back(V); + } + V = propagateMetadata(I, GEPs); + } ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); @@ -6913,11 +8261,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. - if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); - if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -6925,6 +8273,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + TysForDecl.push_back(OpVec->getType()); } Function *CF; @@ -6997,10 +8347,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); auto *AltCI = cast(E->getAltOp()); CmpInst::Predicate AltPred = AltCI->getPredicate(); - unsigned AltIdx = - std::distance(E->Scalars.begin(), find(E->Scalars, AltCI)); - if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx]) - AltPred = CmpInst::getSwappedPredicate(AltPred); V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( @@ -7022,34 +8368,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. ValueList OpScalars, AltScalars; SmallVector Mask; - buildSuffleEntryMask( + buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - // Alternate cmps have same/swapped predicate as main cmps but - // different order of compatible operands. 
- return !( - (P0 == CurrentPred && - areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), - I->getOperand(0), I->getOperand(1))) || - (P0 == CurrentPredSwapped && - areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), - I->getOperand(1), I->getOperand(0)))); - } - return CurrentPred != P0 && CurrentPredSwapped != P0; - } - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask, &OpScalars, &AltScalars); @@ -7080,6 +8403,17 @@ Value *BoUpSLP::vectorizeTree() { return vectorizeTree(ExternallyUsedValues); } +namespace { +/// Data type for handling buildvector sequences with the reused scalars from +/// other tree entries. +struct ShuffledInsertData { + /// List of insertelements to be replaced by shuffles. + SmallVector InsertElements; + /// The parent vectors and shuffle mask for the given list of inserts. + MapVector> ValueMasks; +}; +} // namespace + Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. @@ -7113,6 +8447,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + SmallVector ShuffledInserts; + // Maps vector instruction to original insertelement instruction + DenseMap VectorToInsertElement; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -7126,6 +8463,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(E && "Invalid scalar"); assert(E->State != TreeEntry::NeedToGather && "Extracting from a gather list"); + // Non-instruction pointers are not deleted, just skip them. + if (E->getOpcode() == Instruction::GetElementPtr && + !isa(Scalar)) + continue; Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -7152,6 +8493,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(isa(Scalar->getType()) && isa(Scalar) && "In-tree scalar of vector type is not insertelement?"); + auto *IE = cast(Scalar); + VectorToInsertElement.try_emplace(Vec, IE); return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate @@ -7180,6 +8523,69 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; } + if (auto *VU = dyn_cast(User)) { + // Skip if the scalar is another vector op or Vec is not an instruction. + if (!Scalar->getType()->isVectorTy() && isa(Vec)) { + if (auto *FTy = dyn_cast(User->getType())) { + Optional InsertIdx = getInsertIndex(VU); + if (InsertIdx) { + // Need to use original vector, if the root is truncated. + if (MinBWs.count(Scalar) && + VectorizableTree[0]->VectorizedValue == Vec) + Vec = VectorRoot; + auto *It = + find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { + // Checks if 2 insertelements are from the same buildvector. + InsertElementInst *VecInsert = Data.InsertElements.front(); + return areTwoInsertFromSameBuildVector(VU, VecInsert); + }); + unsigned Idx = *InsertIdx; + if (It == ShuffledInserts.end()) { + (void)ShuffledInserts.emplace_back(); + It = std::next(ShuffledInserts.begin(), + ShuffledInserts.size() - 1); + SmallVectorImpl &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + // Find the insertvector, vectorized in tree, if any. 
+ Value *Base = VU; + while (auto *IEBase = dyn_cast(Base)) { + if (IEBase != User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(Idx) == Idx)) + break; + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(IEBase)) { + do { + IEBase = cast(Base); + int IEIdx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[IEIdx] = IEIdx; + Base = IEBase->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast(Base)->getOperand(0); + // After the vectorization the def-use chain has changed, need + // to look through original insertelement instructions, if they + // get replaced by vector instructions. + auto It = VectorToInsertElement.find(Base); + if (It != VectorToInsertElement.end()) + Base = It->second; + } + } + SmallVectorImpl &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[Idx] = ExternalUse.Lane; + It->InsertElements.push_back(cast(User)); + continue; + } + } + } + } + // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast(Vec)) { @@ -7215,6 +8621,221 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } + // Checks if the mask is an identity mask. + auto &&IsIdentityMask = [](ArrayRef Mask, FixedVectorType *VecTy) { + int Limit = Mask.size(); + return VecTy->getNumElements() == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + }; + // Tries to combine 2 different masks into single one. + auto &&CombineMasks = [](SmallVectorImpl &Mask, ArrayRef ExtMask) { + SmallVector NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = Mask[ExtMask[I]]; + } + Mask.swap(NewMask); + }; + // Peek through shuffles, trying to simplify the final shuffle code. + auto &&PeekThroughShuffles = + [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl &Mask, + bool CheckForLengthChange = false) { + while (auto *SV = dyn_cast(V)) { + // Exit if not a fixed vector type or changing size shuffle. + if (!isa(SV->getType()) || + (CheckForLengthChange && SV->changesLength())) + break; + // Exit if the identity or broadcast mask is found. + if (IsIdentityMask(Mask, cast(SV->getType())) || + SV->isZeroEltSplat()) + break; + bool IsOp1Undef = isUndefVector(SV->getOperand(0)); + bool IsOp2Undef = isUndefVector(SV->getOperand(1)); + if (!IsOp1Undef && !IsOp2Undef) + break; + SmallVector ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + CombineMasks(ShuffleMask, Mask); + Mask.swap(ShuffleMask); + if (IsOp2Undef) + V = SV->getOperand(0); + else + V = SV->getOperand(1); + } + }; + // Smart shuffle instruction emission, walks through shuffles trees and + // tries to find the best matching vector for the actual shuffle + // instruction. + auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles, + &CombineMasks](Value *V1, Value *V2, + ArrayRef Mask) -> Value * { + assert(V1 && "Expected at least one vector value."); + if (V2 && !isUndefVector(V2)) { + // Peek through shuffles. 
+ Value *Op1 = V1; + Value *Op2 = V2; + int VF = + cast(V1->getType())->getElementCount().getKnownMinValue(); + SmallVector CombinedMask1(Mask.size(), UndefMaskElem); + SmallVector CombinedMask2(Mask.size(), UndefMaskElem); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (Mask[I] < VF) + CombinedMask1[I] = Mask[I]; + else + CombinedMask2[I] = Mask[I] - VF; + } + Value *PrevOp1; + Value *PrevOp2; + do { + PrevOp1 = Op1; + PrevOp2 = Op2; + PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true); + PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true); + // Check if we have 2 resizing shuffles - need to peek through operands + // again. + if (auto *SV1 = dyn_cast(Op1)) + if (auto *SV2 = dyn_cast(Op2)) + if (SV1->getOperand(0)->getType() == + SV2->getOperand(0)->getType() && + SV1->getOperand(0)->getType() != SV1->getType() && + isUndefVector(SV1->getOperand(1)) && + isUndefVector(SV2->getOperand(1))) { + Op1 = SV1->getOperand(0); + Op2 = SV2->getOperand(0); + SmallVector ShuffleMask1(SV1->getShuffleMask().begin(), + SV1->getShuffleMask().end()); + CombineMasks(ShuffleMask1, CombinedMask1); + CombinedMask1.swap(ShuffleMask1); + SmallVector ShuffleMask2(SV2->getShuffleMask().begin(), + SV2->getShuffleMask().end()); + CombineMasks(ShuffleMask2, CombinedMask2); + CombinedMask2.swap(ShuffleMask2); + } + } while (PrevOp1 != Op1 || PrevOp2 != Op2); + VF = cast(Op1->getType()) + ->getElementCount() + .getKnownMinValue(); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (CombinedMask2[I] != UndefMaskElem) { + assert(CombinedMask1[I] == UndefMaskElem && + "Expected undefined mask element"); + CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); + } + } + Value *Vec = Builder.CreateShuffleVector( + Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, + CombinedMask1); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + if (isa(V1)) + return PoisonValue::get(FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size())); + Value *Op = V1; + SmallVector CombinedMask(Mask.begin(), Mask.end()); + PeekThroughShuffles(Op, CombinedMask); + if (!isa(Op->getType()) || + !IsIdentityMask(CombinedMask, cast(Op->getType()))) { + Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + return Op; + }; + + auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef Mask) { + unsigned VF = Mask.size(); + unsigned VecVF = cast(Vec->getType())->getNumElements(); + if (VF != VecVF) { + if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); })) { + Vec = CreateShuffle(Vec, nullptr, Mask); + return std::make_pair(Vec, true); + } + SmallVector ResizeMask(VF, UndefMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != UndefMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + Vec = CreateShuffle(Vec, nullptr, ResizeMask); + } + + return std::make_pair(Vec, false); + }; + // Perform shuffling of the vectorize tree entries for better handling of + // external extracts. + for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { + // Find the first and the last instruction in the list of insertelements. 
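Before peeking through each operand, CreateShuffle above splits the combined two-source mask into one single-source mask per operand, with indices >= VF rebased onto the second vector. A standalone sketch with hypothetical values:

// Standalone sketch: splitting a two-source shuffle mask into per-operand
// masks, as done above before simplifying each operand separately.
#include <cstdio>
#include <vector>

int main() {
  const int Undef = -1;
  int VF = 4;                           // lanes per source vector
  std::vector<int> Mask = {0, 5, 2, 7}; // indices >= VF pick from source 2
  std::vector<int> M1(Mask.size(), Undef), M2(Mask.size(), Undef);
  for (int I = 0, E = (int)Mask.size(); I < E; ++I) {
    if (Mask[I] < VF)
      M1[I] = Mask[I];
    else
      M2[I] = Mask[I] - VF; // rebase onto the second vector
  }
  for (int M : M1) std::printf("%d ", M); // 0 -1 2 -1
  std::printf("| ");
  for (int M : M2) std::printf("%d ", M); // -1 1 -1 3
  return 0;
}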
+ sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); + InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); + InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); + Builder.SetInsertPoint(LastInsert); + auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); + Value *NewInst = performExtractsShuffleAction( + makeMutableArrayRef(Vector.data(), Vector.size()), + FirstInsert->getOperand(0), + [](Value *Vec) { + return cast(Vec->getType()) + ->getElementCount() + .getKnownMinValue(); + }, + ResizeToVF, + [FirstInsert, &CreateShuffle](ArrayRef Mask, + ArrayRef Vals) { + assert((Vals.size() == 1 || Vals.size() == 2) && + "Expected exactly 1 or 2 input values."); + if (Vals.size() == 1) { + // Do not create shuffle if the mask is a simple identity + // non-resizing mask. + if (Mask.size() != cast(Vals.front()->getType()) + ->getNumElements() || + !ShuffleVectorInst::isIdentityMask(Mask)) + return CreateShuffle(Vals.front(), nullptr, Mask); + return Vals.front(); + } + return CreateShuffle(Vals.front() ? Vals.front() + : FirstInsert->getOperand(0), + Vals.back(), Mask); + }); + auto It = ShuffledInserts[I].InsertElements.rbegin(); + // Rebuild buildvector chain. + InsertElementInst *II = nullptr; + if (It != ShuffledInserts[I].InsertElements.rend()) + II = *It; + SmallVector Inserts; + while (It != ShuffledInserts[I].InsertElements.rend()) { + assert(II && "Must be an insertelement instruction."); + if (*It == II) + ++It; + else + Inserts.push_back(cast(II)); + II = dyn_cast(II->getOperand(0)); + } + for (Instruction *II : reverse(Inserts)) { + II->replaceUsesOfWith(II->getOperand(0), NewInst); + if (auto *NewI = dyn_cast(NewInst)) + if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) + II->moveAfter(NewI); + NewInst = II; + } + LastInsert->replaceAllUsesWith(NewInst); + for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { + IE->replaceUsesOfWith(IE->getOperand(0), + PoisonValue::get(IE->getOperand(0)->getType())); + IE->replaceUsesOfWith(IE->getOperand(1), + PoisonValue::get(IE->getOperand(1)->getType())); + eraseInstruction(IE); + } + CSEBlocks.insert(LastInsert->getParent()); + } + // For each vectorized value: for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -7229,6 +8850,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (Entry->getOpcode() == Instruction::GetElementPtr && + !isa(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -7236,7 +8860,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to delete users in the ignorelist. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) || + assert((getTreeEntry(U) || + (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull(U) && isDeleted(cast(U)))) && "Deleting out-of-tree value"); @@ -7404,9 +9029,11 @@ void BoUpSLP::optimizeGatherSequence() { BoUpSLP::ScheduleData * BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { - ScheduleData *Bundle = nullptr; + ScheduleData *Bundle = nullptr; ScheduleData *PrevInBundle = nullptr; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " @@ -7418,8 +9045,6 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { } else { Bundle = BundleMember; } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; @@ -7436,7 +9061,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) + if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || + doesNotNeedToSchedule(VL)) return nullptr; // Initialize the instruction bundle. @@ -7455,16 +9081,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); ReSchedule = true; } - if (ReSchedule) { - resetSchedule(); - initialFillReadyList(ReadyInsts); - } if (Bundle) { LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); } + if (ReSchedule) { + resetSchedule(); + initialFillReadyList(ReadyInsts); + } + // Now try to schedule the new bundle or (if no bundle) just calculate // dependencies. As soon as the bundle is "ready" it means that there are no // cyclic dependencies and we can schedule it. Note that's important that we @@ -7472,14 +9099,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && !ReadyInsts.empty()) { ScheduleData *Picked = ReadyInsts.pop_back_val(); - if (Picked->isSchedulingEntity() && Picked->isReady()) - schedule(Picked, ReadyInsts); + assert(Picked->isSchedulingEntity() && Picked->isReady() && + "must be ready to schedule"); + schedule(Picked, ReadyInsts); } }; // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it // is a new region for the first bundle). This makes it necessary to @@ -7494,9 +9124,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + + // Make sure we don't leave the pieces of the bundle in the ready list when + // whole bundle might not be ready. 
+ ReadyInsts.remove(BundleMember); + if (!BundleMember->IsScheduled) continue; // A bundle member was scheduled as single instruction before and now @@ -7518,16 +9155,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { - if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue)) + if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue) || + doesNotNeedToSchedule(VL)) return; + if (doesNotNeedToBeScheduled(OpValue)) + OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && "tried to unbundle something which is not a bundle"); + // Remove the bundle from the ready list. + if (Bundle->isReady()) + ReadyInsts.remove(Bundle); + // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { @@ -7535,8 +9180,8 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; - BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { + BundleMember->TE = nullptr; + if (BundleMember->unscheduledDepsInBundle() == 0) { ReadyInsts.insert(BundleMember); } BundleMember = Next; @@ -7559,9 +9204,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && !isVectorLikeInstWithConstOps(I) && + !doesNotNeedToBeScheduled(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to " "be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { + auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) return false; @@ -7573,7 +9219,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ExtraScheduleDataMap[I][S.OpValue] = SD; return true; }; - if (CheckSheduleForI(I)) + if (CheckScheduleForI(I)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
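The hunks above pull a bundle member out of the ready list as soon as it is folded into a bundle, because readiness is a property of the whole bundle, not of any single member. A minimal, self-contained C++ sketch of that invariant (illustrative only; the Node type and its fields are hypothetical stand-ins for LLVM's ScheduleData, not the real class):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Node {
      int UnscheduledDeps = 0;
      Node *FirstInBundle = this; // every node starts as its own bundle
      Node *NextInBundle = nullptr;
    };

    // A bundle is ready only when no member has unscheduled dependencies.
    bool bundleIsReady(Node *Head) {
      for (Node *N = Head; N; N = N->NextInBundle)
        if (N->UnscheduledDeps != 0)
          return false;
      return true;
    }

    int main() {
      Node A, B;
      B.UnscheduledDeps = 1;            // B still waits on something
      std::vector<Node *> Ready = {&A}; // A alone looked ready...

      // ...but once A and B are bundled, A must leave the ready list:
      A.NextInBundle = &B;
      B.FirstInBundle = &A;
      Ready.erase(std::remove(Ready.begin(), Ready.end(), &A), Ready.end());

      assert(!bundleIsReady(&A)); // the bundle as a whole is not ready yet
      B.UnscheduledDeps = 0;
      assert(bundleIsReady(&A)); // now every member is unblocked
      return 0;
    }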
@@ -7581,7 +9227,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
     return true;
@@ -7609,7 +9255,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
     ScheduleStart = I;
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
     return true;
@@ -7623,7 +9269,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
                     nullptr);
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
     return true;
@@ -7635,7 +9281,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                 ScheduleData *NextLoadStore) {
   ScheduleData *CurrentLoadStore = PrevLoadStore;
   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
-    ScheduleData *SD = ScheduleDataMap[I];
+    // No need to allocate data for non-schedulable instructions.
+    if (doesNotNeedToBeScheduled(I))
+      continue;
+    ScheduleData *SD = ScheduleDataMap.lookup(I);
     if (!SD) {
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
@@ -7658,6 +9307,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
       }
       CurrentLoadStore = SD;
     }
+
+    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+      RegionHasStackSave = true;
   }
   if (NextLoadStore) {
     if (CurrentLoadStore)
@@ -7690,8 +9343,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
       // Handle def-use chain dependencies.
       if (BundleMember->OpValue != BundleMember->Inst) {
-        ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
-        if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
           BundleMember->Dependencies++;
           ScheduleData *DestBundle = UseSD->FirstInBundle;
           if (!DestBundle->IsScheduled)
@@ -7701,10 +9353,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         }
       } else {
         for (User *U : BundleMember->Inst->users()) {
-          assert(isa<Instruction>(U) &&
-                 "user of instruction must be instruction");
-          ScheduleData *UseSD = getScheduleData(U);
-          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
             BundleMember->Dependencies++;
             ScheduleData *DestBundle = UseSD->FirstInBundle;
             if (!DestBundle->IsScheduled)
@@ -7715,6 +9364,75 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         }
       }

+      auto makeControlDependent = [&](Instruction *I) {
+        auto *DepDest = getScheduleData(I);
+        assert(DepDest && "must be in schedule window");
+        DepDest->ControlDependencies.push_back(BundleMember);
+        BundleMember->Dependencies++;
+        ScheduleData *DestBundle = DepDest->FirstInBundle;
+        if (!DestBundle->IsScheduled)
+          BundleMember->incrementUnscheduledDeps(1);
+        if (!DestBundle->hasValidDependencies())
+          WorkList.push_back(DestBundle);
+      };
+
+      // Any instruction which isn't safe to speculate at the beginning of the
+      // block is control dependent on any early exit or non-willreturn call
+      // which precedes it.
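+      // For illustration (hypothetical IR, not taken from this patch):
+      //   call void @may_not_return()
+      //   %d = udiv i32 %a, %b
+      // The udiv may fault, so it is not speculatable at block entry and must
+      // stay control dependent on the call; hoisting it above the call could
+      // introduce a fault on a path where the call never returned.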
+      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
+        for (Instruction *I = BundleMember->Inst->getNextNode();
+             I != ScheduleEnd; I = I->getNextNode()) {
+          if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
+            continue;
+
+          // Add the dependency
+          makeControlDependent(I);
+
+          if (!isGuaranteedToTransferExecutionToSuccessor(I))
+            // Everything past here must be control dependent on I.
+            break;
+        }
+      }
+
+      if (RegionHasStackSave) {
+        // If we have an inalloca alloca instruction, it needs to be scheduled
+        // after any preceding stacksave. We also need to prevent any alloca
+        // from reordering above a preceding stackrestore.
+        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
+            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
+          for (Instruction *I = BundleMember->Inst->getNextNode();
+               I != ScheduleEnd; I = I->getNextNode()) {
+            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+              // Any allocas past here must be control dependent on I, and I
+              // must be memory dependent on BundleMember->Inst.
+              break;
+
+            if (!isa<AllocaInst>(I))
+              continue;
+
+            // Add the dependency
+            makeControlDependent(I);
+          }
+        }
+
+        // In addition to the cases handled just above, we need to prevent
+        // allocas from moving below a stacksave. The stackrestore case is
+        // currently thought to be overly conservative.
+        if (isa<AllocaInst>(BundleMember->Inst)) {
+          for (Instruction *I = BundleMember->Inst->getNextNode();
+               I != ScheduleEnd; I = I->getNextNode()) {
+            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
+                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+              continue;
+
+            // Add the dependency
+            makeControlDependent(I);
+            break;
+          }
+        }
+      }
+
       // Handle the memory dependencies (if any).
       ScheduleData *DepDest = BundleMember->NextLoadStore;
       if (!DepDest)
@@ -7777,7 +9495,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
     }
   }
   if (InsertInReadyList && SD->isReady()) {
-    ReadyInsts.push_back(SD);
+    ReadyInsts.insert(SD);
     LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
   }
@@ -7804,11 +9522,18 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

+  // A key point - if we got here, pre-scheduling was able to find a valid
+  // scheduling of the sub-graph of the scheduling window which consists
+  // of all vector bundles and their transitive users. As such, we do not
+  // need to reschedule anything *outside of* that subgraph.
+
   BS->resetSchedule();

   // For the real scheduling we use a more sophisticated ready-list: it is
   // sorted by the original instruction location. This lets the final schedule
   // be as close as possible to the original instruction order.
+  // WARNING: If changing this order causes a correctness issue, that means
+  // there is some missing dependence edge in the schedule data graph.
   struct ScheduleDataCompare {
     bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
       return SD2->SchedulingPriority < SD1->SchedulingPriority;
@@ -7816,21 +9541,22 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

-  // Ensure that all dependency data is updated and fill the ready-list with
-  // initial instructions.
+  // Ensure that all dependency data is updated (for nodes in the sub-graph)
+  // and fill the ready-list with initial instructions.
int Idx = 0; - int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { + TreeEntry *SDTE = getTreeEntry(SD->Inst); + (void)SDTE; assert((isVectorLikeInstWithConstOps(SD->Inst) || - SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && + SD->isPartOfBundle() == + (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { + + if (SD->isSchedulingEntity() && SD->isPartOfBundle()) BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } }); } BS->initialFillReadyList(ReadyInsts); @@ -7853,9 +9579,23 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } BS->schedule(picked, ReadyInsts); - NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); + + // Check that we didn't break any of our invariants. +#ifdef EXPENSIVE_CHECKS + BS->verify(); +#endif + +#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) + // Check that all schedulable entities got scheduled + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { + assert(SD->IsScheduled && "must be scheduled at this point"); + } + }); + } +#endif // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -7865,11 +9605,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value (or value // truncated just before storing) without traversing the expression tree. // This is the common case. - if (auto *Store = dyn_cast(V)) { - if (auto *Trunc = dyn_cast(Store->getValueOperand())) - return DL->getTypeSizeInBits(Trunc->getSrcTy()); + if (auto *Store = dyn_cast(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); - } if (auto *IEI = dyn_cast(V)) return getVectorElementSize(IEI->getOperand(1)); @@ -8271,6 +10008,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Start new block - clear the list of reduction roots. + R.clearReductionData(); collectSeedInstructions(BB); // Vectorize trees that end at stores. 
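The scheduleBlock changes above rely on a ready list ordered by each entry's original position, so the emitted schedule stays close to source order while still honoring dependence edges. The following self-contained C++ sketch (illustrative only; SD, Cmp, and Users are hypothetical names, not LLVM's types) shows the shape of that loop: pop the earliest ready entry, retire it, and release its dependents:

    #include <cassert>
    #include <set>
    #include <vector>

    struct SD {
      int SchedulingPriority; // position in the original block
      int UnscheduledDeps;
      std::vector<SD *> Users; // entries that depend on this one
    };

    struct Cmp {
      bool operator()(const SD *A, const SD *B) const {
        return A->SchedulingPriority < B->SchedulingPriority;
      }
    };

    int main() {
      SD A{0, 0, {}}, B{1, 1, {}}, C{2, 0, {}};
      A.Users = {&B}; // B depends on A, so B starts out not ready
      std::set<SD *, Cmp> Ready = {&A, &C};
      std::vector<int> Order;
      while (!Ready.empty()) {
        SD *Picked = *Ready.begin(); // earliest original position wins
        Ready.erase(Ready.begin());
        Order.push_back(Picked->SchedulingPriority);
        for (SD *U : Picked->Users)
          if (--U->UnscheduledDeps == 0)
            Ready.insert(U); // dependent becomes ready only now
      }
      // Despite B's dependence edge, the result matches source order.
      assert((Order == std::vector<int>{0, 1, 2}));
      return 0;
    }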
@@ -8301,11 +10040,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, - unsigned Idx) { + unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) @@ -8444,9 +10182,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? @@ -8456,7 +10200,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, ArrayRef Slice = makeArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt)) { + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Slice.begin(), Slice.end()); Changed = true; @@ -8516,6 +10260,8 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; + if (isa(A) || isa(B)) + return false; Value *VL[] = {A, B}; return tryToVectorizeList(VL, R); } @@ -8658,7 +10404,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!I) return false; - if (!isa(I) && !isa(I)) + if ((!isa(I) && !isa(I)) || + isa(I->getType())) return false; Value *P = I->getParent(); @@ -8669,32 +10416,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; - // Try to vectorize V. - if (tryToVectorizePair(Op0, Op1, R)) - return true; + // First collect all possible candidates + SmallVector, 4> Candidates; + Candidates.emplace_back(Op0, Op1); auto *A = dyn_cast(Op0); auto *B = dyn_cast(Op1); // Try to skip B. - if (B && B->hasOneUse()) { + if (A && B && B->hasOneUse()) { auto *B0 = dyn_cast(B->getOperand(0)); auto *B1 = dyn_cast(B->getOperand(1)); - if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) - return true; - if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) - return true; + if (B0 && B0->getParent() == P) + Candidates.emplace_back(A, B0); + if (B1 && B1->getParent() == P) + Candidates.emplace_back(A, B1); } - // Try to skip A. 
- if (A && A->hasOneUse()) { + if (B && A && A->hasOneUse()) { auto *A0 = dyn_cast(A->getOperand(0)); auto *A1 = dyn_cast(A->getOperand(1)); - if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) - return true; - if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) - return true; + if (A0 && A0->getParent() == P) + Candidates.emplace_back(A0, B); + if (A1 && A1->getParent() == P) + Candidates.emplace_back(A1, B); } - return false; + + if (Candidates.size() == 1) + return tryToVectorizePair(Op0, Op1, R); + + // We have multiple options. Try to pick the single best. + Optional BestCandidate = R.findBestRootPair(Candidates); + if (!BestCandidate) + return false; + return tryToVectorizePair(Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second, R); } namespace { @@ -8729,15 +10484,16 @@ class HorizontalReduction { using ReductionOpsType = SmallVector; using ReductionOpsListType = SmallVector; ReductionOpsListType ReductionOps; - SmallVector ReducedVals; + /// List of possibly reduced values. + SmallVector> ReducedVals; + /// Maps reduced value to the corresponding reduction operation. + DenseMap> ReducedValsToOps; // Use map vector to make stable output. MapVector ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -8781,26 +10537,6 @@ class HorizontalReduction { return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -8859,7 +10595,7 @@ class HorizontalReduction { } /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. + /// from \p ReductionOps, dropping nuw/nsw flags. 
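+  /// Dropping the wrap flags is required because the vectorized reduction
+  /// reassociates the scalar chain. As an illustrative i8 example, the chain
+  /// ((-100 + 100) + 100) keeps every partial sum in range, while the
+  /// reassociated -100 + (100 + 100) overflows in its intermediate sum, so
+  /// an nsw flag that held on the original chain may not hold afterwards.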
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { @@ -8873,31 +10609,21 @@ class HorizontalReduction { Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { if (auto *Sel = dyn_cast(Op)) { - propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); + propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, + /*IncludeWrapFlags=*/false); + propagateIRFlags(Op, ReductionOps[1], nullptr, + /*IncludeWrapFlags=*/false); return Op; } } - propagateIRFlags(Op, ReductionOps[0]); - return Op; - } - - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. - static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { - auto *SelI = dyn_cast(I); - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); - if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast(Op)) - propagateIRFlags(Sel->getCondition(), SelI->getCondition()); - } - propagateIRFlags(Op, I); + propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return RecurKind::None; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -9059,7 +10785,9 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa(Inst) || isa(Inst) || @@ -9103,124 +10831,178 @@ public: ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. - return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. 
-          markExtraArg(Stack[Stack.size() - 2], TreeN);
-          ExtraArgs.erase(TreeN);
-        } else
-          addReductionOps(TreeN);
-      }
-      // Retract.
-      Stack.pop_back();
-      continue;
-    }
-
-    // Visit operands.
-    Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
-    auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
-    if (!EdgeInst) {
-      // Edge value is not a reduction instruction or a leaf instruction.
-      // (It may be a constant, function argument, or something else.)
-      markExtraArg(Stack.back(), EdgeVal);
-      continue;
+    // Iterate through all the operands of the possible reduction tree and
+    // gather all the reduced values, sorting them by their value id.
+    BasicBlock *BB = Inst->getParent();
+    bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
+    SmallVector<Instruction *> Worklist(1, Inst);
+    // Checks if the operands of the \p TreeN instruction are also reduction
+    // operations or should be treated as reduced values or an extra argument,
+    // which is not part of the reduction.
+    auto &&CheckOperands = [this, IsCmpSelMinMax,
+                            BB](Instruction *TreeN,
+                                SmallVectorImpl<Value *> &ExtraArgs,
+                                SmallVectorImpl<Value *> &PossibleReducedVals,
+                                SmallVectorImpl<Instruction *> &ReductionOps) {
+      for (int I = getFirstOperandIndex(TreeN),
+               End = getNumberOfOperands(TreeN);
+           I < End; ++I) {
+        Value *EdgeVal = getRdxOperand(TreeN, I);
+        ReducedValsToOps[EdgeVal].push_back(TreeN);
+        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+        // Edge has wrong parent - mark as an extra argument.
+        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
+            !hasSameParent(EdgeInst, BB)) {
+          ExtraArgs.push_back(EdgeVal);
+          continue;
+        }
+        // If the edge is not an instruction, or it is different from the main
+        // reduction opcode or has too many uses - possible reduced value.
+        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+            !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+          PossibleReducedVals.push_back(EdgeVal);
+          continue;
+        }
+        ReductionOps.push_back(EdgeInst);
       }
-      RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
-      // Continue analysis if the next operand is a reduction operation or
-      // (possibly) a leaf value. If the leaf value opcode is not set,
-      // the first met operation != reduction operation is considered as the
-      // leaf opcode.
-      // Only handle trees in the current basic block.
-      // Each tree node needs to have minimal number of users except for the
-      // ultimate reduction.
-      const bool IsRdxInst = EdgeRdxKind == RdxKind;
-      if (EdgeInst != Phi && EdgeInst != Inst &&
-          hasSameParent(EdgeInst, Inst->getParent()) &&
-          hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
-          (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
-        if (IsRdxInst) {
-          // We need to be able to reassociate the reduction operations.
-          if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
-            // I is an extra argument for TreeN (its parent operation).
-            markExtraArg(Stack.back(), EdgeInst);
-            continue;
-          }
-        } else if (!LeafOpcode) {
-          LeafOpcode = EdgeInst->getOpcode();
+    };
+    // Try to regroup reduced values so that it gets more profitable to try to
+    // reduce them. Values are grouped by their value ids, instructions - by
+    // instruction op id and/or alternate op id, plus do extra analysis for
+    // loads (grouping them by the distance between pointers) and cmp
+    // instructions (grouping them by the predicate).
+ MapVector>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.pop_back_val(); + SmallVector Args; + SmallVector PossibleRedVals; + SmallVector PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. + if (!Args.empty()) { + assert(Args.size() == 1 && "Expected only single argument."); + ExtraArgs[TreeN] = Args.front(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + V, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), + RLI->getPointerOperand(), LI->getType(), + LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(V, 0)) + .first->second; + } + Worklist.append(PossibleReductionOps.rbegin(), + PossibleReductionOps.rend()); + } else { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + TreeN, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), + DL, SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(TreeN, 0)) + .first->second; + } + } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort values by the total number of values kinds to start the reduction + // from the longest possible reduced values sequences. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + SmallVector> PossibleRedValsVect; + for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); + It != E; ++It) { + PossibleRedValsVect.emplace_back(); + auto RedValsVect = It->second.takeVector(); + stable_sort(RedValsVect, [](const auto &P1, const auto &P2) { + return P1.second < P2.second; + }); + for (const std::pair &Data : RedValsVect) + PossibleRedValsVect.back().append(Data.second, Data.first); } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - } + stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { + return P1.size() > P2.size(); + }); + ReducedVals.emplace_back(); + for (ArrayRef Data : PossibleRedValsVect) + ReducedVals.back().append(Data.rbegin(), Data.rend()); + } + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. 
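+    // For instance (hypothetical input): when summing loads a[0], a[1], b[0],
+    // a[2], the loads from `a` land in one (key, subkey) bucket because
+    // getPointersDiff recognizes their common base, so {a[0], a[1], a[2]}
+    // form one candidate group, {b[0]} another, and larger groups are tried
+    // first below.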
+ stable_sort(ReducedVals, [](ArrayRef P1, ArrayRef P2) { + return P1.size() > P2.size(); + }); return true; } /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + constexpr int ReductionLimit = 4; + constexpr unsigned RegMaxNumber = 4; + constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. - unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef Vals) { return Num + Vals.size(); }); + if (NumReducedVals < ReductionLimit) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast(RdxRootInst)->getCondition(); @@ -9232,164 +11014,390 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallVector IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. 
-    stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
-      CmpInst::Predicate PredA, PredB;
-      if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
-          match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
-        return PredCountMap[PredA] > PredCountMap[PredB];
-      }
-      return false;
-    });
-  }
+    SmallDenseSet<Value *> IgnoreList;
+    for (ReductionOpsType &RdxOps : ReductionOps)
+      for (Value *RdxOp : RdxOps) {
+        if (!RdxOp)
+          continue;
+        IgnoreList.insert(RdxOp);
+      }
+    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
+
+    // Need to track reduced vals, they may be changed during vectorization of
+    // subvectors.
+    for (ArrayRef<Value *> Candidates : ReducedVals)
+      for (Value *V : Candidates)
+        TrackedVals.try_emplace(V, V);
+    DenseMap<Value *, unsigned> VectorizedVals;
     Value *VectorizedTree = nullptr;
-    unsigned i = 0;
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
-      ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, IgnoreList);
-      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
-        break;
-      if (V.isLoadCombineReductionCandidate(RdxKind))
-        break;
-      V.reorderTopToBottom();
-      V.reorderBottomToTop(/*IgnoreReorder=*/true);
-      V.buildExternalUses(ExternallyUsedValues);
-
-      // For a poison-safe boolean logic reduction, do not replace select
-      // instructions with logic ops. All reduced values will be frozen (see
-      // below) to prevent leaking poison.
-      if (isa<SelectInst>(ReductionRoot) &&
-          isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
-          NumReducedVals != ReduxWidth)
-        break;
+    bool CheckForReusedReductionOps = false;
+    // Try to vectorize elements based on their type.
+    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
+      InstructionsState S = getSameOpcode(OrigReducedVals);
+      SmallVector<Value *> Candidates;
+      DenseMap<Value *, Value *> TrackedToOrig;
+      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
+        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+        // Check if the reduction value was not overridden by the
+        // extractelement instruction because of the vectorization and exclude
+        // it, if it is not compatible with other values.
+        if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+          if (isVectorLikeInstWithConstOps(Inst) &&
+              (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
+            continue;
+        Candidates.push_back(RdxVal);
+        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
+      }
+      bool ShuffledExtracts = false;
+      // Try to handle shuffled extractelements.
+      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
+          I + 1 < E) {
+        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+        if (NextS.getOpcode() == Instruction::ExtractElement &&
+            !NextS.isAltShuffle()) {
+          SmallVector<Value *> CommonCandidates(Candidates);
+          for (Value *RV : ReducedVals[I + 1]) {
+            Value *RdxVal = TrackedVals.find(RV)->second;
+            // Check if the reduction value was not overridden by the
+            // extractelement instruction because of the vectorization and
+            // exclude it, if it is not compatible with other values.
+            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
+                continue;
+            CommonCandidates.push_back(RdxVal);
+            TrackedToOrig.try_emplace(RdxVal, RV);
+          }
+          SmallVector<int> Mask;
+          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+            ++I;
+            Candidates.swap(CommonCandidates);
+            ShuffledExtracts = true;
+          }
+        }
+      }
+      unsigned NumReducedVals = Candidates.size();
+      if (NumReducedVals < ReductionLimit)
+        continue;

-      V.computeMinimumValueSizes();
+      unsigned MaxVecRegSize = V.getMaxVecRegSize();
+      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
+      unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+
+      unsigned ReduxWidth = std::min<unsigned>(
+          PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+      unsigned Start = 0;
+      unsigned Pos = Start;
+      // Restarts vectorization attempt with lower vector factor.
+      unsigned PrevReduxWidth = ReduxWidth;
+      bool CheckForReusedReductionOpsLocal = false;
+      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+                                  &CheckForReusedReductionOpsLocal,
+                                  &PrevReduxWidth, &V,
+                                  &IgnoreList](bool IgnoreVL = false) {
+        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
+        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
+          // Check if any of the reduction ops are gathered. If so, worth
+          // trying again with a smaller number of reduction ops.
+          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
+        }
+        ++Pos;
+        if (Pos < NumReducedVals - ReduxWidth + 1)
+          return IsAnyRedOpGathered;
+        Pos = Start;
+        ReduxWidth /= 2;
+        return IsAnyRedOpGathered;
+      };
+      while (Pos < NumReducedVals - ReduxWidth + 1 &&
+             ReduxWidth >= ReductionLimit) {
+        // Dependency in tree of the reduction ops - drop this attempt, try
+        // later.
+        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
+            Start == 0) {
+          CheckForReusedReductionOps = true;
+          break;
+        }
+        PrevReduxWidth = ReduxWidth;
+        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
+        // Being analyzed already - skip.
+        if (V.areAnalyzedReductionVals(VL)) {
+          (void)AdjustReducedVals(/*IgnoreVL=*/true);
+          continue;
+        }
+        // Early exit if any of the reduction values were deleted during
+        // previous vectorization attempts.
+        if (any_of(VL, [&V](Value *RedVal) {
+              auto *RedValI = dyn_cast<Instruction>(RedVal);
+              if (!RedValI)
+                return false;
+              return V.isDeleted(RedValI);
+            }))
+          break;
+        V.buildTree(VL, IgnoreList);
+        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+          if (!AdjustReducedVals())
+            V.analyzedReductionVals(VL);
+          continue;
+        }
+        if (V.isLoadCombineReductionCandidate(RdxKind)) {
+          if (!AdjustReducedVals())
+            V.analyzedReductionVals(VL);
+          continue;
+        }
+        V.reorderTopToBottom();
+        // No need to reorder the root node at all.
+        V.reorderBottomToTop(/*IgnoreReorder=*/true);
+        // Keep extracted other reduction values, if they are used in the
+        // vectorization trees.
+        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
+            ExternallyUsedValues);
+        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
+          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
+            continue;
+          for_each(ReducedVals[Cnt],
+                   [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
+                     if (isa<Instruction>(V))
+                       LocalExternallyUsedValues[TrackedVals[V]];
+                   });
+        }
+        // Number of uses of the candidates in the vector of values.
+ SmallDenseMap NumUses; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + // Gather externally used values. + SmallPtrSet Visited; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + V.buildExternalUses(LocalExternallyUsedValues); + + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast(U)) + RdxFMF &= FPMO->getFastMathFlags(); + // Estimate cost. + InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL, ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". 
(HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast(ReductionRoot); + if (IsCmpSelMinMax) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. - if (isa(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + Pos += ReduxWidth; + Start = Pos; + ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + // Need to add extra arguments and not vectorized possible reduction + // values. + // Try to avoid dependencies between the scalar remainders after + // reductions. 
+ auto &&FinalGen = + [this, &Builder, + &TrackedVals](ArrayRef> InstVals) { + unsigned Sz = InstVals.size(); + SmallVector> ExtraReds(Sz / 2 + + Sz % 2); + for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { + Instruction *RedOp = InstVals[I + 1].first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal1 = InstVals[I].second; + Value *StableRdxVal1 = RdxVal1; + auto It1 = TrackedVals.find(RdxVal1); + if (It1 != TrackedVals.end()) + StableRdxVal1 = It1->second; + Value *RdxVal2 = InstVals[I + 1].second; + Value *StableRdxVal2 = RdxVal2; + auto It2 = TrackedVals.find(RdxVal2); + if (It2 != TrackedVals.end()) + StableRdxVal2 = It2->second; + Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, + StableRdxVal2, "op.rdx", ReductionOps); + ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); + } + if (Sz % 2 == 1) + ExtraReds[Sz / 2] = InstVals.back(); + return ExtraReds; + }; + SmallVector> ExtraReductions; + SmallPtrSet Visited; + for (ArrayRef Candidates : ReducedVals) { + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + unsigned NumOps = VectorizedVals.lookup(RdxVal); + for (Instruction *RedOp : + makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + .drop_back(NumOps)) + ExtraReductions.emplace_back(RedOp, RdxVal); + } } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. - for (auto *I : Pair.second) { - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); - } + for (auto *I : Pair.second) + ExtraReductions.emplace_back(I, Pair.first); + } + // Iterate through all not-vectorized reduction values/extra arguments. + while (ExtraReductions.size() > 1) { + SmallVector> NewReds = + FinalGen(ExtraReductions); + ExtraReductions.swap(NewReds); + } + // Final reduction. + if (ExtraReductions.size() == 1) { + Instruction *RedOp = ExtraReductions.back().first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal = ExtraReductions.back().second; + Value *StableRdxVal = RdxVal; + auto It = TrackedVals.find(RdxVal); + if (It != TrackedVals.end()) + StableRdxVal = It->second; + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", ReductionOps); } ReductionRoot->replaceAllUsesWith(VectorizedTree); - // Mark all scalar reduction ops for deletion, they are replaced by the - // vector reductions. - V.eraseInstructions(IgnoreList); + // The original scalar reduction is expected to have no remaining + // uses outside the reduction tree itself. Assert that we got this + // correct, replace internal uses with undef, and mark for eventual + // deletion. 
+#ifndef NDEBUG + SmallSet IgnoreSet; + for (ArrayRef RdxOps : ReductionOps) + IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); +#endif + for (ArrayRef RdxOps : ReductionOps) { + for (Value *Ignore : RdxOps) { + if (!Ignore) + continue; +#ifndef NDEBUG + for (auto *U : Ignore->users()) { + assert(IgnoreSet.count(U) && + "All users must be either in the reduction ops list."); + } +#endif + if (!Ignore->use_empty()) { + Value *Undef = UndefValue::get(Ignore->getType()); + Ignore->replaceAllUsesWith(Undef); + } + V.eraseInstruction(cast(Ignore)); + } + } + } else if (!CheckForReusedReductionOps) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast(RdxOp)); } return VectorizedTree; } - unsigned numReductionValues() const { return ReducedVals.size(); } - private: /// Calculate the cost of a reduction. InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, unsigned ReduxWidth, - FastMathFlags FMF) { + ArrayRef ReducedVals, + unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Value *FirstReducedVal = ReducedVals.front(); Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); - InstructionCost VectorCost, ScalarCost; + InstructionCost VectorCost = 0, ScalarCost; + // If all of the reduced values are constant, the vector cost is 0, since + // the reduction value can be calculated at the compile time. + bool AllConsts = all_of(ReducedVals, isConstant); switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -9399,17 +11407,22 @@ private: case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = - TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); + if (!AllConsts) + VectorCost = + TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); break; } case RecurKind::FMax: case RecurKind::FMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsUnsigned=*/false, CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*IsUnsigned=*/false, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9422,11 +11435,14 @@ private: case RecurKind::UMax: case RecurKind::UMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, - CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + IsUnsigned, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9490,21 +11506,22 @@ static Optional getAggregateSize(Instruction *InsertInst) { } while (true); 
} -static bool findBuildAggregate_rec(Instruction *LastInsertInst, +static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts, unsigned OperandOffset) { do { Value *InsertedOperand = LastInsertInst->getOperand(1); - Optional OperandIndex = getInsertIndex(LastInsertInst, OperandOffset); + Optional OperandIndex = + getInsertIndex(LastInsertInst, OperandOffset); if (!OperandIndex) - return false; + return; if (isa(InsertedOperand) || isa(InsertedOperand)) { - if (!findBuildAggregate_rec(cast(InsertedOperand), TTI, - BuildVectorOpds, InsertElts, *OperandIndex)) - return false; + findBuildAggregate_rec(cast(InsertedOperand), TTI, + BuildVectorOpds, InsertElts, *OperandIndex); + } else { BuildVectorOpds[*OperandIndex] = InsertedOperand; InsertElts[*OperandIndex] = LastInsertInst; @@ -9514,7 +11531,6 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst, (isa(LastInsertInst) || isa(LastInsertInst)) && LastInsertInst->hasOneUse()); - return true; } /// Recognize construction of vectors like @@ -9549,13 +11565,11 @@ static bool findBuildAggregate(Instruction *LastInsertInst, BuildVectorOpds.resize(*AggregateSize); InsertElts.resize(*AggregateSize); - if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, - 0)) { - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); - if (BuildVectorOpds.size() >= 2) - return true; - } + findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); + llvm::erase_value(BuildVectorOpds, nullptr); + llvm::erase_value(InsertElts, nullptr); + if (BuildVectorOpds.size() >= 2) + return true; return false; } @@ -9642,7 +11656,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI, const function_ref Vectorize) { if (!ShouldVectorizeHor) return false; @@ -9661,7 +11676,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts.Compiler implements postanalysis of the + // Skip the analysis of CmpInsts. Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. 
  std::queue<std::pair<Instruction *, unsigned>> Stack;
@@ -9669,13 +11684,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
  SmallPtrSet<Value *, 8> VisitedInstrs;
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = false;
- auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
-                                    Value *&B1) -> Value * {
+ auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+                                                    Value *&B0,
+                                                    Value *&B1) -> Value * {
+   if (R.isAnalyzedReductionRoot(Inst))
+     return nullptr;
    bool IsBinop = matchRdxBop(Inst, B0, B1);
    bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
    if (IsBinop || IsSelect) {
      HorizontalReduction HorRdx;
-     if (HorRdx.matchAssociativeReduction(P, Inst))
+     if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
        return HorRdx.tryToReduce(R, TTI);
    }
    return nullptr;
@@ -9720,7 +11738,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
    // Do not try to vectorize CmpInst operands, this is done separately.
    // Final attempt for binop args vectorization should happen after the loop
    // to try to find reductions.
-   if (!isa<CmpInst>(Inst))
+   if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
      PostponedInsts.push_back(Inst);
  }

@@ -9733,8 +11751,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
      if (auto *I = dyn_cast<Instruction>(Op))
        // Do not try to vectorize CmpInst operands, this is done
        // separately.
-       if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
-           I->getParent() == BB)
+       if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+           !R.isDeleted(I) && I->getParent() == BB)
          Stack.emplace(I, Level);
  }
  // Try to vectorized binops where reductions were not found.
@@ -9758,8 +11776,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
  auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
    return tryToVectorize(I, R);
  };
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
-                                                 ExtraVectorization);
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+                                                 *TLI, ExtraVectorization);
 }

 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -9927,12 +11945,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I))
      continue;
-   if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+   if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
-   else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+   } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-   else if (isa<CmpInst>(I))
+   } else if (isa<CmpInst>(I)) {
      PostponedCmps.push_back(I);
+     continue;
+   }
+   // Try to find reductions in buildvector sequences.
+   OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
  }
  if (AtTerminator) {
    // Try to find reductions first.
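The bounded operand walk in tryToVectorizeHorReductionOrInstOperands above is easier to see in isolation. This self-contained C++ sketch (illustrative only; Inst, tryMatchReduction, and walk are hypothetical stand-ins rather than LLVM APIs) mirrors its shape: try a reduction match at each node first, and only descend into operands while the depth stays under a fixed bound:

    #include <cstdio>
    #include <queue>
    #include <utility>
    #include <vector>

    struct Inst {
      bool IsReductionRoot;
      std::vector<Inst *> Operands;
    };

    bool tryMatchReduction(Inst *I) { return I->IsReductionRoot; }

    bool walk(Inst *Root, unsigned MaxDepth) {
      bool Changed = false;
      std::queue<std::pair<Inst *, unsigned>> Stack;
      Stack.push({Root, 0});
      while (!Stack.empty()) {
        auto [I, Level] = Stack.front();
        Stack.pop();
        if (tryMatchReduction(I)) { // the reduction attempt comes first
          Changed = true;
          continue; // do not descend into a matched reduction
        }
        if (Level + 1 >= MaxDepth)
          continue; // bound the search to keep compile time in check
        for (Inst *Op : I->Operands)
          Stack.push({Op, Level + 1});
      }
      return Changed;
    }

    int main() {
      Inst Add{true, {}};          // pretend this roots a horizontal add
      Inst Store{false, {&Add}};   // walk starts at its user
      std::printf("%s\n", walk(&Store, 3) ? "matched" : "no match");
      return 0;
    }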
@@ -10350,7 +12372,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { DomTreeNodeBase *NodeI2 = DT->getNode(I2->getParent()); assert(NodeI1 && "Should only process reachable instructions"); - assert(NodeI1 && "Should only process reachable instructions"); + assert(NodeI2 && "Should only process reachable instructions"); assert((NodeI1 == NodeI2) == (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && "Different nodes should have different DFS numbers"); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 8822c0004eb2..97f2b1a93815 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -72,17 +72,17 @@ class VPRecipeBuilder { VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, VPlanPtr &Plan); - /// Check if an induction recipe should be constructed for \I. If so build and - /// return it. If not, return null. - VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef Operands, - VFRange &Range) const; + /// Check if an induction recipe should be constructed for \p Phi. If so build + /// and return it. If not, return null. + VPRecipeBase *tryToOptimizeInductionPHI(PHINode *Phi, + ArrayRef Operands, + VPlan &Plan, VFRange &Range); /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. VPWidenIntOrFpInductionRecipe * tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef Operands, - VFRange &Range, VPlan &Plan) const; + VFRange &Range, VPlan &Plan); /// Handle non-loop phi nodes. Return a VPValue, if all incoming values match /// or a new VPBlendRecipe otherwise. Currently all such phi nodes are turned diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 342d4a074e10..4d709097c306 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -23,11 +23,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -35,13 +34,13 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include -#include #include #include @@ -60,7 +59,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { } #endif -Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder, +Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const { switch (LaneKind) { case VPLane::Kind::ScalableLast: @@ -158,25 +157,25 @@ void VPBlockBase::setPlan(VPlan *ParentPlan) { } /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. 
-const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { +const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const { const VPBlockBase *Block = this; while (const VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast(Block); } -VPBasicBlock *VPBlockBase::getExitBasicBlock() { +VPBasicBlock *VPBlockBase::getExitingBasicBlock() { VPBlockBase *Block = this; while (VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast(Block); } VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { if (!Successors.empty() || !Parent) return this; - assert(Parent->getExit() == this && - "Block w/o successors not the exit of its parent."); + assert(Parent->getExiting() == this && + "Block w/o successors not the exiting block of its parent."); return Parent->getEnclosingBlockWithSuccessors(); } @@ -188,28 +187,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -VPValue *VPBlockBase::getCondBit() { - return CondBitUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getCondBit() const { - return CondBitUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setCondBit(VPValue *CV) { CondBitUser.resetSingleOpUser(CV); } - -VPValue *VPBlockBase::getPredicate() { - return PredicateUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getPredicate() const { - return PredicateUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setPredicate(VPValue *CV) { - PredicateUser.resetSingleOpUser(CV); -} - void VPBlockBase::deleteCFG(VPBlockBase *Entry) { SmallVector Blocks(depth_first(Entry)); @@ -245,6 +222,52 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { // set(Def, Extract, Instance); return Extract; } +BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { + VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); + return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; +} + +void VPTransformState::addNewMetadata(Instruction *To, + const Instruction *Orig) { + // If the loop was versioned with memchecks, add the corresponding no-alias + // metadata. + if (LVer && (isa(Orig) || isa(Orig))) + LVer->annotateInstWithNoAlias(To, Orig); +} + +void VPTransformState::addMetadata(Instruction *To, Instruction *From) { + propagateMetadata(To, From); + addNewMetadata(To, From); +} + +void VPTransformState::addMetadata(ArrayRef To, Instruction *From) { + for (Value *V : To) { + if (Instruction *I = dyn_cast(V)) + addMetadata(I, From); + } +} + +void VPTransformState::setDebugLocFromInst(const Value *V) { + if (const Instruction *Inst = dyn_cast_or_null(V)) { + const DILocation *DIL = Inst->getDebugLoc(); + + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. 
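The discriminator handling that continues below scales sample-profile counts: each scalar instruction becomes UF x VF instructions in the vector loop, so its DILocation is cloned with a matching duplication factor unless FSDiscriminators are enabled. A standalone statement of the factor (duplicationFactor is an illustrative name; per the FIXME above, scalable VFs contribute only their known minimum):

  #include <cstdint>

  // Duplication factor for a vectorized and unrolled instruction's debug
  // location: UF copies of a VF-wide operation stand in for UF * VF scalar
  // executions. vscale is assumed to be 1 for scalable vectors.
  static uint64_t duplicationFactor(unsigned UF, unsigned KnownMinVF) {
    return uint64_t(UF) * uint64_t(KnownMinVF);
  }

The continuation of setDebugLocFromInst below passes exactly this product to cloneByMultiplyingDuplicationFactor.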
+      auto NewDIL =
+          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
+      if (NewDIL)
+        Builder.SetCurrentDebugLocation(*NewDIL);
+      else
+        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
+                          << DIL->getFilename() << " Line: " << DIL->getLine());
+    } else
+      Builder.SetCurrentDebugLocation(DIL);
+  } else
+    Builder.SetCurrentDebugLocation(DebugLoc());
+}
 
 BasicBlock *
 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
@@ -252,43 +275,36 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
   // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
   BasicBlock *PrevBB = CFG.PrevBB;
   BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
-                                         PrevBB->getParent(), CFG.LastBB);
+                                         PrevBB->getParent(), CFG.ExitBB);
   LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
 
   // Hook up the new basic block to its predecessors.
   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
-    VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
-    auto &PredVPSuccessors = PredVPBB->getSuccessors();
+    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
+    auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
-    // In outer loop vectorization scenario, the predecessor BBlock may not yet
-    // be visited(backedge). Mark the VPBasicBlock for fixup at the end of
-    // vectorization. We do not encounter this case in inner loop vectorization
-    // as we start out by building a loop skeleton with the vector loop header
-    // and latch blocks. As a result, we never enter this function for the
-    // header block in the non VPlan-native path.
-    if (!PredBB) {
-      assert(EnableVPlanNativePath &&
-             "Unexpected null predecessor in non VPlan-native path");
-      CFG.VPBBsToFix.push_back(PredVPBB);
-      continue;
-    }
-
     assert(PredBB && "Predecessor basic-block not found building successor.");
     auto *PredBBTerminator = PredBB->getTerminator();
     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+
+    auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
     if (isa<UnreachableInst>(PredBBTerminator)) {
       assert(PredVPSuccessors.size() == 1 &&
              "Predecessor ending w/o branch must have single successor.");
+      DebugLoc DL = PredBBTerminator->getDebugLoc();
       PredBBTerminator->eraseFromParent();
-      BranchInst::Create(NewBB, PredBB);
+      auto *Br = BranchInst::Create(NewBB, PredBB);
+      Br->setDebugLoc(DL);
+    } else if (TermBr && !TermBr->isConditional()) {
+      TermBr->setSuccessor(0, NewBB);
     } else {
-      assert(PredVPSuccessors.size() == 2 &&
-             "Predecessor ending with branch must have two successors.");
+      // Set each forward successor here when it is created, excluding
+      // backedges. A backward successor is set when the branch is created.
       unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-      assert(!PredBBTerminator->getSuccessor(idx) &&
+      assert(!TermBr->getSuccessor(idx) &&
              "Trying to reset an existing successor block.");
-      PredBBTerminator->setSuccessor(idx, NewBB);
+      TermBr->setSuccessor(idx, NewBB);
     }
   }
   return NewBB;
@@ -300,27 +316,51 @@ void VPBasicBlock::execute(VPTransformState *State) {
   VPBlockBase *SingleHPred = nullptr;
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
 
-  // 1. Create an IR basic block, or reuse the last one if possible.
-  // The last IR basic block is reused, as an optimization, in three cases:
-  // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
-  // B.
when the current VPBB has a single (hierarchical) predecessor which - // is PrevVPBB and the latter has a single (hierarchical) successor; and - // C. when the current VPBB is an entry of a region replica - where PrevVPBB - // is the exit of this region from a previous instance, or the predecessor - // of this region. - if (PrevVPBB && /* A */ - !((SingleHPred = getSingleHierarchicalPredecessor()) && - SingleHPred->getExitBasicBlock() == PrevVPBB && - PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ - !(Replica && getPredecessors().empty())) { /* C */ + auto IsLoopRegion = [](VPBlockBase *BB) { + auto *R = dyn_cast(BB); + return R && !R->isReplicator(); + }; + + // 1. Create an IR basic block, or reuse the last one or ExitBB if possible. + if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) { + // ExitBB can be re-used for the exit block of the Plan. + NewBB = State->CFG.ExitBB; + State->CFG.PrevBB = NewBB; + + // Update the branch instruction in the predecessor to branch to ExitBB. + VPBlockBase *PredVPB = getSingleHierarchicalPredecessor(); + VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock(); + assert(PredVPB->getSingleSuccessor() == this && + "predecessor must have the current block as only successor"); + BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB]; + // The Exit block of a loop is always set to be successor 0 of the Exiting + // block. + cast(ExitingBB->getTerminator())->setSuccessor(0, NewBB); + } else if (PrevVPBB && /* A */ + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitingBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor() && + (SingleHPred->getParent() == getEnclosingLoopRegion() && + !IsLoopRegion(SingleHPred))) && /* B */ + !(Replica && getPredecessors().empty())) { /* C */ + // The last IR basic block is reused, as an optimization, in three cases: + // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null; + // B. when the current VPBB has a single (hierarchical) predecessor which + // is PrevVPBB and the latter has a single (hierarchical) successor which + // both are in the same non-replicator region; and + // C. when the current VPBB is an entry of a region replica - where PrevVPBB + // is the exiting VPBB of this region from a previous instance, or the + // predecessor of this region. + NewBB = createEmptyBasicBlock(State->CFG); State->Builder.SetInsertPoint(NewBB); // Temporarily terminate with unreachable until CFG is rewired. UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + // Register NewBB in its loop. In innermost loops its the same for all + // BB's. + if (State->CurrentVectorLoop) + State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); - // Register NewBB in its loop. In innermost loops its the same for all BB's. - Loop *L = State->LI->getLoopFor(State->CFG.LastBB); - L->addBasicBlockToLoop(NewBB, *State->LI); State->CFG.PrevBB = NewBB; } @@ -334,29 +374,6 @@ void VPBasicBlock::execute(VPTransformState *State) { for (VPRecipeBase &Recipe : Recipes) Recipe.execute(*State); - VPValue *CBV; - if (EnableVPlanNativePath && (CBV = getCondBit())) { - assert(CBV->getUnderlyingValue() && - "Unexpected null underlying value for condition bit"); - - // Condition bit value in a VPBasicBlock is used as the branch selector. 
In - // the VPlan-native path case, since all branches are uniform we generate a - // branch instruction using the condition value from vector lane 0 and dummy - // successors. The successors are fixed later when the successor blocks are - // visited. - Value *NewCond = State->get(CBV, {0, 0}); - - // Replace the temporary unreachable terminator with the new conditional - // branch. - auto *CurrentTerminator = NewBB->getTerminator(); - assert(isa(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional " - "branch."); - auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); - } - LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } @@ -395,6 +412,61 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { return SplitBlock; } +VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() { + VPRegionBlock *P = getParent(); + if (P && P->isReplicator()) { + P = P->getParent(); + assert(!cast(P)->isReplicator() && + "unexpected nested replicate regions"); + } + return P; +} + +static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { + if (VPBB->empty()) { + assert( + VPBB->getNumSuccessors() < 2 && + "block with multiple successors doesn't have a recipe as terminator"); + return false; + } + + const VPRecipeBase *R = &VPBB->back(); + auto *VPI = dyn_cast(R); + bool IsCondBranch = + isa(R) || + (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond || + VPI->getOpcode() == VPInstruction::BranchOnCount)); + (void)IsCondBranch; + + if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) { + assert(IsCondBranch && "block with multiple successors not terminated by " + "conditional branch recipe"); + + return true; + } + + assert( + !IsCondBranch && + "block with 0 or 1 successors terminated by conditional branch recipe"); + return false; +} + +VPRecipeBase *VPBasicBlock::getTerminator() { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +const VPRecipeBase *VPBasicBlock::getTerminator() const { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +bool VPBasicBlock::isExiting() const { + return getParent()->getExitingBasicBlock() == this; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { if (getSuccessors().empty()) { @@ -411,13 +483,6 @@ void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << getName() << ":\n"; - if (const VPValue *Pred = getPredicate()) { - O << Indent << "BlockPredicate:"; - Pred->printAsOperand(O, SlotTracker); - if (const auto *PredInst = dyn_cast(Pred)) - O << " (" << PredInst->getParent()->getName() << ")"; - O << '\n'; - } auto RecipeIndent = Indent + " "; for (const VPRecipeBase &Recipe : *this) { @@ -426,14 +491,6 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, } printSuccessors(O, Indent); - - if (const VPValue *CBV = getCondBit()) { - O << Indent << "CondBit: "; - CBV->printAsOperand(O, SlotTracker); - if (const auto *CBI = dyn_cast(CBV)) - O << " (" << CBI->getParent()->getName() << ")"; - O << '\n'; - } } #endif @@ -448,25 +505,26 @@ void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal RPOT(Entry); if (!isReplicator()) { + // Create and register the new vector loop. 
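The hunk that follows makes each non-replicator region materialize its own Loop object instead of relying on a pre-built skeleton loop. The LoopInfo bookkeeping it performs is the standard registration pattern, shown here standalone (registerVectorLoop is an illustrative name):

  #include "llvm/Analysis/LoopInfo.h"

  using namespace llvm;

  // Sketch: create a loop and register it with LoopInfo before any blocks
  // are added. Nesting is derived from the preheader's enclosing loop; a
  // parentless loop becomes a new top-level loop.
  static Loop *registerVectorLoop(LoopInfo &LI, BasicBlock *Preheader) {
    Loop *NewLoop = LI.AllocateLoop();
    if (Loop *Parent = LI.getLoopFor(Preheader))
      Parent->addChildLoop(NewLoop);
    else
      LI.addTopLevelLoop(NewLoop);
    return NewLoop; // blocks join via addBasicBlockToLoop as they are built
  }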
+ Loop *PrevLoop = State->CurrentVectorLoop; + State->CurrentVectorLoop = State->LI->AllocateLoop(); + BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()]; + Loop *ParentLoop = State->LI->getLoopFor(VectorPH); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. + if (ParentLoop) + ParentLoop->addChildLoop(State->CurrentVectorLoop); + else + State->LI->addTopLevelLoop(State->CurrentVectorLoop); + // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { - if (EnableVPlanNativePath) { - // The inner loop vectorization path does not represent loop preheader - // and exit blocks as part of the VPlan. In the VPlan-native path, skip - // vectorizing loop preheader block. In future, we may replace this - // check with the check for loop preheader. - if (Block->getNumPredecessors() == 0) - continue; - - // Skip vectorizing loop exit block. In future, we may replace this - // check with the check for loop exit. - if (Block->getNumSuccessors() == 0) - continue; - } - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } + + State->CurrentVectorLoop = PrevLoop; return; } @@ -508,341 +566,32 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif -bool VPRecipeBase::mayWriteToMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return cast(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast(getVPSingleValue()->getUnderlyingValue()) - ->mayWriteToMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayWriteToMemory()) && - "underlying instruction may write to memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayReadFromMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return !cast(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast(getVPSingleValue()->getUnderlyingValue()) - ->mayReadFromMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayReadFromMemory()) && - "underlying instruction may read from memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayHaveSideEffects() const { - switch (getVPDefID()) { - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayHaveSideEffects()) && - "underlying instruction has side-effects"); - return false; - } - case VPReplicateSC: { - auto *R = cast(this); - return R->getUnderlyingInstr()->mayHaveSideEffects(); - } - default: - return true; - } -} - -void 
VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insert(InsertPos->getIterator(), this); -} - -void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); -} - -void VPRecipeBase::removeFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - getParent()->getRecipeList().remove(getIterator()); - Parent = nullptr; -} - -iplist::iterator VPRecipeBase::eraseFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - return getParent()->getRecipeList().erase(getIterator()); -} - -void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - removeFromParent(); - insertAfter(InsertPos); -} - -void VPRecipeBase::moveBefore(VPBasicBlock &BB, - iplist::iterator I) { - assert(I == BB.end() || I->getParent() == &BB); - removeFromParent(); - Parent = &BB; - BB.getRecipeList().insert(I, this); -} - -void VPInstruction::generateInstruction(VPTransformState &State, - unsigned Part) { - IRBuilder<> &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); - - if (Instruction::isBinaryOp(getOpcode())) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); - State.set(this, V, Part); - return; - } - - switch (getOpcode()) { - case VPInstruction::Not: { - Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); - State.set(this, V, Part); - break; - } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); - State.set(this, V, Part); - break; - } - case Instruction::Select: { - Value *Cond = State.get(getOperand(0), Part); - Value *Op1 = State.get(getOperand(1), Part); - Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); - State.set(this, V, Part); - break; - } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), Part); - - auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); - Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); - State.set(this, Call, Part); - break; - } - case VPInstruction::FirstOrderRecurrenceSplice: { - // Generate code to combine the previous and current values in vector v3. - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - - // For the first part, use the recurrence phi (v1), otherwise v2. - auto *V1 = State.get(getOperand(0), 0); - Value *PartMinus1 = Part == 0 ? 
V1 : State.get(getOperand(1), Part - 1); - if (!PartMinus1->getType()->isVectorTy()) { - State.set(this, PartMinus1, Part); - } else { - Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); - } - break; - } - - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - Value *Next = nullptr; - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); - } else { - Next = State.get(this, 0); - } - - State.set(this, Next, Part); - break; - } - case VPInstruction::BranchOnCount: { - if (Part != 0) - break; - // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. - auto *Plan = getParent()->getPlan(); - VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); - if (Header->empty()) { - assert(EnableVPlanNativePath && - "empty entry block only expected in VPlanNativePath"); - Header = cast(Header->getSingleSuccessor()); +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State, + bool IsEpilogueVectorization) { + + VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + auto *Term = dyn_cast(&ExitingVPBB->back()); + // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when + // preparing to execute the plan for the main vector loop. + if (!IsEpilogueVectorization && Term && + Term->getOpcode() == VPInstruction::BranchOnCount && + isa(TripCountV)) { + ConstantInt *C = cast(TripCountV); + uint64_t TCVal = C->getZExtValue(); + if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { + auto *BOC = + new VPInstruction(VPInstruction::BranchOnCond, + {getOrAddExternalDef(State.Builder.getTrue())}); + Term->eraseFromParent(); + ExitingVPBB->appendRecipe(BOC); + // TODO: Further simplifications are possible + // 1. Replace inductions with constants. + // 2. Replace vector loop region with VPBasicBlock. } - // TODO: Once the exit block is modeled in VPlan, use it instead of going - // through State.CFG.LastBB. 
- BasicBlock *Exit = - cast(State.CFG.LastBB->getTerminator())->getSuccessor(0); - - Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); - Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - break; - } - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -void VPInstruction::execute(VPTransformState &State) { - assert(!State.Instance && "VPInstruction executing an Instance"); - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); - for (unsigned Part = 0; Part < State.UF; ++Part) - generateInstruction(State, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - print(dbgs(), "", SlotTracker); -} - -void VPInstruction::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - - if (hasResult()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - - switch (getOpcode()) { - case VPInstruction::Not: - O << "not"; - break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; - case VPInstruction::SLPLoad: - O << "combined load"; - break; - case VPInstruction::SLPStore: - O << "combined store"; - break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; - case VPInstruction::FirstOrderRecurrenceSplice: - O << "first-order splice"; - break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; - case VPInstruction::BranchOnCount: - O << "branch-on-count "; - break; - default: - O << Instruction::getOpcodeName(getOpcode()); - } - - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); } - if (DL) { - O << ", !dbg "; - DL.print(O); - } -} -#endif - -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - -void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, - Value *CanonicalIVStartValue, - VPTransformState &State) { // Check if the trip count is needed, and if so build it. if (TripCount && TripCount->getNumUsers()) { for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) @@ -868,111 +617,78 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. 
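Before the epilogue handling resumes below, note the fold prepareToExecute gains above: when the trip count is a compile-time constant that fits in a single vector step, the BranchOnCount latch test can only ever exit, so it is rewritten to BranchOnCond on a true literal. The guard condition, restated standalone (vectorLoopRunsOnce is an illustrative name):

  #include <cstdint>

  // The vector loop body executes exactly once when the constant trip count
  // is non-zero and no larger than one vector step, i.e. VF * UF elements.
  static bool vectorLoopRunsOnce(uint64_t TripCount, unsigned KnownMinVF,
                                 unsigned UF) {
    return TripCount != 0 && TripCount <= uint64_t(KnownMinVF) * UF;
  }

The code below then retargets the canonical IV's start value, as the preceding comment describes.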
if (CanonicalIVStartValue) { - VPValue *VPV = new VPValue(CanonicalIVStartValue); - addExternalDef(VPV); + VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue); auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { + if (isa(U)) + return true; auto *VPI = cast(U); return VPI->getOpcode() == VPInstruction::CanonicalIVIncrement || VPI->getOpcode() == VPInstruction::CanonicalIVIncrementNUW; }) && - "the canonical IV should only be used by its increments when " + "the canonical IV should only be used by its increments or " + "ScalarIVSteps when " "resetting the start value"); IV->setOperand(0, VPV); } } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. +/// Generate the code inside the preheader and body of the vectorized loop. +/// Assumes a single pre-header basic-block was created for this. Introduce +/// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - // 0. Set the reverse mapping from VPValues to Values for code generation. + // Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; - BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; - State->CFG.VectorPreHeader = VectorPreHeaderBB; - BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); - assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - - // 1. Make room to generate basic-blocks inside loop body if needed. - BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( - VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); - Loop *L = State->LI->getLoopFor(VectorHeaderBB); - L->addBasicBlockToLoop(VectorLatchBB, *State->LI); - // Remove the edge between Header and Latch to allow other connections. - // Temporarily terminate with unreachable until CFG is rewired. - // Note: this asserts the generated code's assumption that - // getFirstInsertionPt() can be dereferenced into an Instruction. - VectorHeaderBB->getTerminator()->eraseFromParent(); - State->Builder.SetInsertPoint(VectorHeaderBB); - UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - State->Builder.SetInsertPoint(Terminator); - - // 2. Generate code in loop body. + // Initialize CFG state. State->CFG.PrevVPBB = nullptr; - State->CFG.PrevBB = VectorHeaderBB; - State->CFG.LastBB = VectorLatchBB; + State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); + BasicBlock *VectorPreHeader = State->CFG.PrevBB; + State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); + // Generate code in the loop pre-header and body. for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Setup branch terminator successors for VPBBs in VPBBsToFix based on - // VPBB's successors. - for (auto VPBB : State->CFG.VPBBsToFix) { - assert(EnableVPlanNativePath && - "Unexpected VPBBsToFix in non VPlan-native path"); - BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; - assert(BB && "Unexpected null basic block for VPBB"); - - unsigned Idx = 0; - auto *BBTerminator = BB->getTerminator(); - - for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { - VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); - BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); - ++Idx; - } - } - - // 3. Merge the temporary latch created with the last basic-block filled. 
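The removals that follow delete the old latch bookkeeping wholesale: with the pre-header, latch and exit now represented in the plan itself (CFG.ExitBB above, and VPBB2IRBB lookups below), there is no temporary vector.body.latch to split off, no branch and compare to shuttle between blocks, and no MergeBlockIntoPredecessor call to undo the split.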
-  BasicBlock *LastBB = State->CFG.PrevBB;
-  assert(isa<BranchInst>(LastBB->getTerminator()) &&
-         "Expected VPlan CFG to terminate with branch");
-
-  // Move both the branch and check from LastBB to VectorLatchBB.
-  auto *LastBranch = cast<BranchInst>(LastBB->getTerminator());
-  LastBranch->moveBefore(VectorLatchBB->getTerminator());
-  VectorLatchBB->getTerminator()->eraseFromParent();
-  // Move condition so it is guaranteed to be next to branch. This is only done
-  // to avoid excessive test updates.
-  // TODO: Remove special handling once the increments for all inductions are
-  // modeled explicitly in VPlan.
-  cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch);
-  // Connect LastBB to VectorLatchBB to facilitate their merge.
-  BranchInst::Create(VectorLatchBB, LastBB);
-
-  // Merge LastBB with Latch.
-  bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
-  (void)Merged;
-  assert(Merged && "Could not merge last basic block with latch.");
-  VectorLatchBB = LastBB;
+  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
+  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
   // Fix the latch value of canonical, reduction and first-order recurrence
   // phis in the vector loop.
-  VPBasicBlock *Header = Entry->getEntryBasicBlock();
-  if (Header->empty()) {
-    assert(EnableVPlanNativePath);
-    Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
-  }
+  VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
   for (VPRecipeBase &R : Header->phis()) {
     // Skip phi-like recipes that generate their backedge values themselves.
-    // TODO: Model their backedge values explicitly.
-    if (isa<VPWidenPHIRecipe>(&R) || isa<VPWidenIntOrFpInductionRecipe>(&R))
+    if (isa<VPWidenPHIRecipe>(&R))
+      continue;
+
+    if (isa<VPWidenIntOrFpInductionRecipe>(&R) ||
+        isa<VPWidenPointerInductionRecipe>(&R)) {
+      PHINode *Phi = nullptr;
+      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+        Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
+      } else {
+        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+        // TODO: Split off the case that all users of a pointer phi are scalar
+        // from the VPWidenPointerInductionRecipe.
+        if (WidenPhi->onlyScalarsGenerated(State->VF))
+          continue;
+
+        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
+        Phi = cast<PHINode>(GEP->getPointerOperand());
+      }
+
+      Phi->setIncomingBlock(1, VectorLatchBB);
+
+      // Move the last step to the end of the latch block. This ensures
+      // consistent placement of all induction updates.
+      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
       continue;
+    }
 
     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
     // For canonical IV, first-order recurrences and in-order reduction phis,
@@ -993,9 +709,12 @@ void VPlan::execute(VPTransformState *State) {
   }
 
   // We do not attempt to preserve DT for outer loop vectorization currently.
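The widened-induction fix-up above reduces, for each affected header phi, to two IR edits: repoint the backedge at the materialized latch and pin the increment to the end of that block. Distilled (fixBackedge is an illustrative name; the code above additionally places the increment in front of the latch's exit compare via getPrevNode):

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Sketch: incoming index 1 is assumed to be the backedge value/block.
  static void fixBackedge(PHINode *Phi, BasicBlock *VectorLatchBB) {
    Phi->setIncomingBlock(1, VectorLatchBB);
    auto *Inc = cast<Instruction>(Phi->getIncomingValue(1));
    Inc->moveBefore(VectorLatchBB->getTerminator());
  }

Dominator-tree maintenance, next, remains restricted to the inner-loop path: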
- if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, - L->getExitBlock()); + if (!EnableVPlanNativePath) { + BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header]; + State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader); + updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB, + State->CFG.ExitBB); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1021,6 +740,17 @@ void VPlan::print(raw_ostream &O) const { O << '\n'; Block->print(O, "", SlotTracker); } + + if (!LiveOuts.empty()) + O << "\n"; + for (auto &KV : LiveOuts) { + O << "Live-out "; + KV.second->getPhi()->printAsOperand(O); + O << " = "; + KV.second->getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; + } + O << "}\n"; } @@ -1034,11 +764,14 @@ LLVM_DUMP_METHOD void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, +void VPlan::addLiveOut(PHINode *PN, VPValue *V) { + assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); + LiveOuts.insert({PN, new VPLiveOut(PN, V)}); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB, BasicBlock *LoopLatchBB, BasicBlock *LoopExitBB) { - BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); - assert(LoopHeaderBB && "Loop preheader does not have a single successor."); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -1075,6 +808,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Twine VPlanPrinter::getUID(const VPBlockBase *Block) { return (isa(Block) ? "cluster_N" : "N") + Twine(getOrCreateBID(Block)); @@ -1122,8 +856,8 @@ void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label) { // Due to "dot" we print an edge between two regions as an edge between the - // exit basic block and the entry basic of the respective regions. - const VPBlockBase *Tail = From->getExitBasicBlock(); + // exiting basic block and the entry basic of the respective regions. + const VPBlockBase *Tail = From->getExitingBasicBlock(); const VPBlockBase *Head = To->getEntryBasicBlock(); OS << Indent << getUID(Tail) << " -> " << getUID(Head); OS << " [ label=\"" << Label << '\"'; @@ -1213,328 +947,6 @@ void VPlanIngredient::print(raw_ostream &O) const { V->printAsOperand(O, false); } -void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CALL "; - - auto *CI = cast(getUnderlyingInstr()); - if (CI->getType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call @" << CI->getCalledFunction()->getName() << "("; - printOperands(O, SlotTracker); - O << ")"; -} - -void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? 
" (condition is loop invariant)" : ""); -} - -void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; - printOperands(O, SlotTracker); -} - -void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INDUCTION"; - if (getTruncInst()) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" "; - getVPValue(0)->printAsOperand(O, SlotTracker); - } else - O << " " << VPlanIngredient(IV); -} -#endif - -bool VPWidenIntOrFpInductionRecipe::isCanonical() const { - auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); - auto *StepC = dyn_cast(getInductionDescriptor().getStep()); - return StartC && StartC->isZero() && StepC && StepC->isOne(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-GEP "; - O << (IsPtrLoopInvariant ? "Inv" : "Var"); - size_t IndicesNumber = IsIndexLoopInvariant.size(); - for (size_t I = 0; I < IndicesNumber; ++I) - O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr "; - printOperands(O, SlotTracker); -} - -void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-PHI "; - - auto *OriginalPhi = cast(getUnderlyingValue()); - // Unless all incoming values are modeled in VPlan print the original PHI - // directly. - // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming - // values as VPValues. - if (getNumOperands() != OriginalPhi->getNumOperands()) { - O << VPlanIngredient(OriginalPhi); - return; - } - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} - -void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "BLEND "; - Phi->printAsOperand(O, false); - O << " ="; - if (getNumIncomingValues() == 1) { - // Not a User of any mask: not really blending, this is a - // single-predecessor phi. - O << " "; - getIncomingValue(0)->printAsOperand(O, SlotTracker); - } else { - for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { - O << " "; - getIncomingValue(I)->printAsOperand(O, SlotTracker); - O << "/"; - getMask(I)->printAsOperand(O, SlotTracker); - } - } -} - -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - if (isa(getUnderlyingInstr())) - O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (IsUniform ? 
"CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; - printOperands(O, SlotTracker); - - if (AlsoPack) - O << " (S->V)"; -} - -void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PHI-PREDICATED-INSTRUCTION "; - printAsOperand(O, SlotTracker); - O << " = "; - printOperands(O, SlotTracker); -} - -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - - if (!isStore()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - - printOperands(O, SlotTracker); -} -#endif - -void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { - Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); - EntryPart->setDebugLoc(DL); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; -} -#endif - -void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0); - Type *STy = CanonicalIV->getType(); - IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - Value *VStart = VF.isScalar() - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, Part); - if (VF.isVector()) { - VStep = Builder.CreateVectorSplat(VF, VStep); - VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); - } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION "; - printOperands(O, SlotTracker); -} -#endif - -void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - // Create a vector from the initial value. - auto *VectorInit = getStartValue()->getLiveInIRValue(); - - Type *VecTy = State.VF.isScalar() - ? VectorInit->getType() - : VectorType::get(VectorInit->getType(), State.VF); - - if (State.VF.isVector()) { - auto *IdxTy = Builder.getInt32Ty(); - auto *One = ConstantInt::get(IdxTy, 1); - IRBuilder<>::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); - auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); - } - - // Create a phi node for the new recurrence. 
- PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(VectorInit, State.CFG.VectorPreHeader); - State.set(this, EntryPart, 0); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - -void VPReductionPHIRecipe::execute(VPTransformState &State) { - PHINode *PN = cast(getUnderlyingValue()); - auto &Builder = State.Builder; - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); - - BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.LI->getLoopFor(HeaderBB)->getHeader() == HeaderBB && - "recipe must be in the vector loop header"); - unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part); - } - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - VPValue *StartVPV = getStartValue(); - Value *StartV = StartVPV->getLiveInIRValue(); - - Value *Iden = nullptr; - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. - if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - StartV = Iden = - Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); - } - } else { - Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), - RdxDesc.getFastMathFlags()); - - if (!ScalarPHI) { - Iden = Builder.CreateVectorSplat(State.VF, Iden); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? 
StartV : Iden; - cast(EntryPart)->addIncoming(StartVal, State.CFG.VectorPreHeader); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-REDUCTION-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} #endif template void DomTreeBuilder::Calculate(VPDominatorTree &DT); @@ -1594,7 +1006,10 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, continue; assert(isa(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast(&VPI); - auto *Inst = cast(VPInst->getUnderlyingValue()); + + auto *Inst = dyn_cast_or_null(VPInst->getUnderlyingValue()); + if (!Inst) + continue; auto *IG = IAI.getInterleaveGroup(Inst); if (!IG) continue; @@ -1622,7 +1037,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI) { Old2NewTy Old2New; - visitRegion(cast(Plan.getEntry()), Old2New, IAI); + visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI); } void VPSlotTracker::assignSlot(const VPValue *V) { @@ -1632,8 +1047,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) { void VPSlotTracker::assignSlots(const VPlan &Plan) { - for (const VPValue *V : Plan.VPExternalDefs) - assignSlot(V); + for (const auto &P : Plan.VPExternalDefs) + assignSlot(P.second); assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) @@ -1651,7 +1066,19 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { } bool vputils::onlyFirstLaneUsed(VPValue *Def) { - return all_of(Def->users(), [Def](VPUser *U) { - return cast(U)->onlyFirstLaneUsed(Def); - }); + return all_of(Def->users(), + [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); +} + +VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, + ScalarEvolution &SE) { + if (auto *E = dyn_cast(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + if (auto *E = dyn_cast(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPValue *Step = new VPExpandSCEVRecipe(Expr, SE); + Preheader->appendRecipe(cast(Step->getDef())); + return Step; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bcaabca692cc..09da4a545d0d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -25,27 +25,26 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H -#include "VPlanLoopInfo.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/IR/FMF.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include #include #include -#include #include namespace llvm { @@ -54,6 +53,7 @@ class BasicBlock; class DominatorTree; class InductionDescriptor; class InnerLoopVectorizer; +class IRBuilderBase; class 
LoopInfo; class raw_ostream; class RecurrenceDescriptor; @@ -67,10 +67,11 @@ class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. /// For fixed width vectors this value is a constant, whereas for scalable /// vectors it is an expression determined at runtime. -Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); /// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step); +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, + int64_t Step); /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: @@ -151,7 +152,7 @@ public: /// Returns an expression describing the lane index that can be used at /// runtime. - Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const; + Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; /// Returns the Kind of lane offset. Kind getKind() const { return LaneKind; } @@ -199,10 +200,10 @@ struct VPIteration { /// needed for generating the output IR. struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, + DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan) - : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) { - } + : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), + LVer(nullptr) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -298,6 +299,27 @@ struct VPTransformState { Iter->second[Instance.Part][CacheIdx] = V; } + /// Add additional metadata to \p To that was not present on \p Orig. + /// + /// Currently this is used to add the noalias annotations based on the + /// inserted memchecks. Use this for instructions that are *cloned* into the + /// vector loop. + void addNewMetadata(Instruction *To, const Instruction *Orig); + + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef To, Instruction *From); + + /// Set the debug location in the builder using the debug location in \p V. + void setDebugLocFromInst(const Value *V); + /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. struct CFGState { @@ -308,26 +330,19 @@ struct VPTransformState { /// header BasicBlock. BasicBlock *PrevBB = nullptr; - /// The last IR BasicBlock in the output IR. Set to the new latch - /// BasicBlock, used for placing the newly created BasicBlocks. - BasicBlock *LastBB = nullptr; - - /// The IR BasicBlock that is the preheader of the vector loop in the output - /// IR. - /// FIXME: The vector preheader should also be modeled in VPlan, so any code - /// that needs to be added to the preheader gets directly generated by - /// VPlan. There should be no need to manage a pointer to the IR BasicBlock. - BasicBlock *VectorPreHeader = nullptr; + /// The last IR BasicBlock in the output IR. 
Set to the exit block of the + /// vector loop. + BasicBlock *ExitBB = nullptr; /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case /// of replication, maps the BasicBlock of the last replica created. SmallDenseMap VPBB2IRBB; - /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed - /// up at the end of vector code generation. - SmallVector VPBBsToFix; - CFGState() = default; + + /// Returns the BasicBlock* mapped to the pre-header of the loop region + /// containing \p R. + BasicBlock *getPreheaderBBFor(VPRecipeBase *R); } CFG; /// Hold a pointer to LoopInfo to register new basic blocks in the loop. @@ -337,7 +352,7 @@ struct VPTransformState { DominatorTree *DT; /// Hold a reference to the IRBuilder used to generate output IR code. - IRBuilder<> &Builder; + IRBuilderBase &Builder; VPValue2ValueTy VPValue2Value; @@ -353,41 +368,16 @@ struct VPTransformState { /// Holds recipes that may generate a poison value that is used after /// vectorization, even when their operands are not poison. SmallPtrSet MayGeneratePoisonRecipes; -}; - -/// VPUsers instance used by VPBlockBase to manage CondBit and the block -/// predicate. Currently VPBlockUsers are used in VPBlockBase for historical -/// reasons, but in the future the only VPUsers should either be recipes or -/// live-outs.VPBlockBase uses. -struct VPBlockUser : public VPUser { - VPBlockUser() : VPUser({}, VPUserID::Block) {} - VPValue *getSingleOperandOrNull() { - if (getNumOperands() == 1) - return getOperand(0); + /// The loop object for the current parent region, or nullptr. + Loop *CurrentVectorLoop = nullptr; - return nullptr; - } - const VPValue *getSingleOperandOrNull() const { - if (getNumOperands() == 1) - return getOperand(0); - - return nullptr; - } - - void resetSingleOpUser(VPValue *NewVal) { - assert(getNumOperands() <= 1 && "Didn't expect more than one operand!"); - if (!NewVal) { - if (getNumOperands() == 1) - removeLastOperand(); - return; - } - - if (getNumOperands() == 1) - setOperand(0, NewVal); - else - addOperand(NewVal); - } + /// LoopVersioning. It's only set up (non-null) if memchecks were + /// used. + /// + /// This is currently only used to add no-alias metadata based on the + /// memchecks. The actually versioning is performed manually. + std::unique_ptr LVer; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -410,16 +400,6 @@ class VPBlockBase { /// List of successor blocks. SmallVector Successors; - /// Successor selector managed by a VPUser. For blocks with zero or one - /// successors, there is no operand. Otherwise there is exactly one operand - /// which is the branch condition. - VPBlockUser CondBitUser; - - /// If the block is predicated, its predicate is stored as an operand of this - /// VPUser to maintain the def-use relations. Otherwise there is no operand - /// here. - VPBlockUser PredicateUser; - /// VPlan containing the block. Can only be set on the entry block of the /// plan. VPlan *Plan = nullptr; @@ -493,11 +473,11 @@ public: const VPBasicBlock *getEntryBasicBlock() const; VPBasicBlock *getEntryBasicBlock(); - /// \return the VPBasicBlock that is the exit of this VPBlockBase, + /// \return the VPBasicBlock that is the exiting this VPBlockBase, /// recursively, if the latter is a VPRegionBlock. Otherwise, if this /// VPBlockBase is a VPBasicBlock, it is returned. 
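The LVer member declared just above is consumed by the addNewMetadata definition shown earlier in VPlan.cpp: memory instructions cloned into the vector loop inherit the scoped no-alias metadata that the runtime memchecks justify. The shape of that use, standalone (annotateClone is an illustrative name):

  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/LoopVersioning.h"

  using namespace llvm;

  // Only loads and stores cloned into the versioned (vector) loop receive
  // the no-alias scopes; LoopVersioning here annotates, it does not version.
  static void annotateClone(LoopVersioning *LVer, Instruction *To,
                            const Instruction *Orig) {
    if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
      LVer->annotateInstWithNoAlias(To, Orig);
  }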
- const VPBasicBlock *getExitBasicBlock() const; - VPBasicBlock *getExitBasicBlock(); + const VPBasicBlock *getExitingBasicBlock() const; + VPBasicBlock *getExitingBasicBlock(); const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } @@ -565,20 +545,6 @@ public: return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); } - /// \return the condition bit selecting the successor. - VPValue *getCondBit(); - /// \return the condition bit selecting the successor. - const VPValue *getCondBit() const; - /// Set the condition bit selecting the successor. - void setCondBit(VPValue *CV); - - /// \return the block's predicate. - VPValue *getPredicate(); - /// \return the block's predicate. - const VPValue *getPredicate() const; - /// Set the block's predicate. - void setPredicate(VPValue *Pred); - /// Set a given VPBlockBase \p Successor as the single successor of this /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. /// This VPBlockBase must have no successors. @@ -588,14 +554,11 @@ public: } /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two - /// successors of this VPBlockBase. \p Condition is set as the successor - /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p - /// IfFalse. This VPBlockBase must have no successors. - void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse, - VPValue *Condition) { + /// successors of this VPBlockBase. This VPBlockBase is not added as + /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no + /// successors. + void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) { assert(Successors.empty() && "Setting two successors when others exist."); - assert(Condition && "Setting two successors without condition!"); - setCondBit(Condition); appendSuccessor(IfTrue); appendSuccessor(IfFalse); } @@ -612,11 +575,8 @@ public: /// Remove all the predecessor of this block. void clearPredecessors() { Predecessors.clear(); } - /// Remove all the successors of this block and set to null its condition bit - void clearSuccessors() { - Successors.clear(); - setCondBit(nullptr); - } + /// Remove all the successors of this block. + void clearSuccessors() { Successors.clear(); } /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. @@ -665,6 +625,32 @@ public: #endif }; +/// A value that is used outside the VPlan. The operand of the user needs to be +/// added to the associated LCSSA phi node. +class VPLiveOut : public VPUser { + PHINode *Phi; + +public: + VPLiveOut(PHINode *Phi, VPValue *Op) + : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + + /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. + void fixPhi(VPlan &Plan, VPTransformState &State); + + /// Returns true if the VPLiveOut uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + PHINode *getPhi() const { return Phi; } +}; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. 
/// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -699,6 +685,9 @@ public: /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked recipe into \p BB immediately before the insertion + /// point \p IP. + void insertBefore(VPBasicBlock &BB, iplist::iterator IP); /// Insert an unlinked Recipe into a basic block immediately after /// the specified Recipe. @@ -759,14 +748,6 @@ public: bool mayReadOrWriteMemory() const { return mayReadFromMemory() || mayWriteToMemory(); } - - /// Returns true if the recipe only uses the first lane of operand \p Op. - /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return false; - } }; inline bool VPUser::classof(const VPDef *Def) { @@ -804,6 +785,7 @@ public: CanonicalIVIncrement, CanonicalIVIncrementNUW, BranchOnCount, + BranchOnCond }; private: @@ -892,6 +874,7 @@ public: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: + case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: return false; default: @@ -1049,27 +1032,25 @@ public: }; /// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector and scalar values. +/// producing their vector values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; const InductionDescriptor &IndDesc; - bool NeedsScalarIV; bool NeedsVectorIV; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool NeedsScalarIV, bool NeedsVectorIV) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), - IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV), + bool NeedsVectorIV) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}), + VPValue(IV, this), IV(IV), IndDesc(IndDesc), NeedsVectorIV(NeedsVectorIV) {} - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc, bool NeedsScalarIV, - bool NeedsVectorIV) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), - IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV), + TruncInst *Trunc, bool NeedsVectorIV) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}), + VPValue(Trunc, this), IV(IV), IndDesc(IndDesc), NeedsVectorIV(NeedsVectorIV) {} ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1093,6 +1074,10 @@ public: VPValue *getStartValue() { return getOperand(0); } const VPValue *getStartValue() const { return getOperand(0); } + /// Returns the step value of the induction. + VPValue *getStepValue() { return getOperand(1); } + const VPValue *getStepValue() const { return getOperand(1); } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1102,6 +1087,8 @@ public: return dyn_cast_or_null(getVPValue(0)->getUnderlyingValue()); } + PHINode *getPHINode() { return IV; } + /// Returns the induction descriptor for the recipe. const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } @@ -1115,9 +1102,6 @@ public: return TruncI ?
TruncI->getType() : IV->getType(); } - /// Returns true if a scalar phi needs to be created for the induction. - bool needsScalarIV() const { return NeedsScalarIV; } - /// Returns true if a vector phi needs to be created for the induction. bool needsVectorIV() const { return NeedsVectorIV; } }; @@ -1167,6 +1151,9 @@ public: VPValue *getStartValue() { return getNumOperands() == 0 ? nullptr : getOperand(0); } + VPValue *getStartValue() const { + return getNumOperands() == 0 ? nullptr : getOperand(0); + } /// Returns the incoming value from the loop backedge. VPValue *getBackedgeValue() { @@ -1180,6 +1167,52 @@ public: } }; +class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { + const InductionDescriptor &IndDesc; + + /// SCEV used to expand step. + /// FIXME: move expansion of step to the pre-header, once it is modeled + /// explicitly. + ScalarEvolution &SE; + +public: + /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p + /// Start. + VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, + const InductionDescriptor &IndDesc, + ScalarEvolution &SE) + : VPHeaderPHIRecipe(VPVWidenPointerInductionSC, VPWidenPointerInductionSC, + Phi), + IndDesc(IndDesc), SE(SE) { + addOperand(Start); + } + + ~VPWidenPointerInductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *B) { + return B->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC; + } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenPointerInductionSC; + } + + /// Generate vector values for the pointer induction. + void execute(VPTransformState &State) override; + + /// Returns true if only scalar values will be generated. + bool onlyScalarsGenerated(ElementCount VF); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for handling header phis that are widened in the vector loop. /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are /// managed in the recipe directly. @@ -1363,9 +1396,8 @@ public: "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. - return all_of(users(), [this](VPUser *U) { - return cast(U)->onlyFirstLaneUsed(this); - }); + return all_of(users(), + [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); } }; @@ -1440,6 +1472,15 @@ public: unsigned getNumStoreOperands() const { return getNumOperands() - (HasMask ? 2 : 1); } + + /// The recipe only uses the first lane of the address. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() && all_of(getStoredValues(), [Op](VPValue *StoredV) { + return Op != StoredV; + }); + } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -1551,6 +1592,13 @@ public: "Op must be an operand of the recipe"); return isUniform(); } + + /// Returns true if the recipe uses scalars of operand \p Op. 
+ bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A recipe for generating conditional branches on the bits of a mask. @@ -1590,6 +1638,13 @@ public: // Mask is optional. return getNumOperands() == 1 ? getOperand(0) : nullptr; } + + /// Returns true if the recipe uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when @@ -1619,6 +1674,13 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + /// Returns true if the recipe uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A Recipe for widening load/store operations. @@ -1627,7 +1689,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. @@ -1649,10 +1711,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, bool Consecutive, bool Reverse) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), - VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } @@ -1660,7 +1722,6 @@ public: VPValue *StoredValue, VPValue *Mask, bool Consecutive, bool Reverse) : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), - VPValue(VPValue::VPVMemoryInstructionSC, &Store, this), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); setMask(Mask); @@ -1714,9 +1775,42 @@ public: "Op must be an operand of the recipe"); // Widened, consecutive memory operations only demand the first lane of - // their address. - return Op == getAddr() && isConsecutive(); + // their address, unless the same operand is also stored. That latter can + // happen with opaque pointers. + return Op == getAddr() && isConsecutive() && + (!isStore() || Op != getStoredValue()); + } + + Instruction &getIngredient() const { return Ingredient; } +}; + +/// Recipe to expand a SCEV expression. +class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue { + const SCEV *Expr; + ScalarEvolution &SE; + +public: + VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE) + : VPRecipeBase(VPExpandSCEVSC, {}), VPValue(nullptr, this), Expr(Expr), + SE(SE) {} + + ~VPExpandSCEVRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPExpandSCEVSC; } + + /// Expand the SCEV expression and generate its value. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + const SCEV *getSCEV() const { return Expr; } }; /// Canonical scalar induction phi of the vector loop. Starting at the specified @@ -1738,6 +1832,12 @@ public: static inline bool classof(const VPDef *D) { return D->getVPDefID() == VPCanonicalIVPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPCanonicalIVPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC; + } /// Generate the canonical scalar induction phi of the vector loop. void execute(VPTransformState &State) override; @@ -1803,6 +1903,64 @@ public: } }; +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their scalar values. +class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue { + /// Scalar type to use for the generated values. + Type *Ty; + /// If not nullptr, truncate the generated values to TruncToTy. + Type *TruncToTy; + const InductionDescriptor &IndDesc; + +public: + VPScalarIVStepsRecipe(Type *Ty, const InductionDescriptor &IndDesc, + VPValue *CanonicalIV, VPValue *Start, VPValue *Step, + Type *TruncToTy) + : VPRecipeBase(VPScalarIVStepsSC, {CanonicalIV, Start, Step}), + VPValue(nullptr, this), Ty(Ty), TruncToTy(TruncToTy), IndDesc(IndDesc) { + } + + ~VPScalarIVStepsRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPScalarIVStepsRecipe. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + + /// Generate the scalarized versions of the phi node as needed by its users. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the induction is canonical, i.e. starting at 0 and + /// incremented by UF * VF (= the original IV is incremented by 1). + bool isCanonical() const; + + VPCanonicalIVPHIRecipe *getCanonicalIV() const; + VPValue *getStartValue() const { return getOperand(1); } + VPValue *getStepValue() const { return getOperand(2); } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
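VPExpandSCEVRecipe above pairs with the vputils::getOrCreateVPValueForSCEVExpr helper declared near the end of this header: SCEV constants and unknowns come back as live-in external definitions, while other expressions get an expansion recipe in the pre-header that is reused on later queries. A usage sketch assuming this patch; the wrapper name is hypothetical:

    #include "VPlan.h"
    using namespace llvm;

    // Materialize a SCEV (e.g. an induction step) as a VPValue that other
    // recipes can consume as an operand.
    static VPValue *stepAsVPValue(VPlan &Plan, const SCEV *Step,
                                  ScalarEvolution &SE) {
      return vputils::getOrCreateVPValueForSCEVExpr(Plan, Step, SE);
    }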
@@ -1895,6 +2053,8 @@ public: /// SplitAt to the new block. Returns the new block. VPBasicBlock *splitAt(iterator SplitAt); + VPRegionBlock *getEnclosingLoopRegion(); + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p /// SlotTracker is used to print unnamed VPValues using consecutive numbers. @@ -1906,6 +2066,14 @@ public: using VPBlockBase::print; // Get the print(raw_ostream &O) version. #endif + /// If the block has multiple successors, return the branch recipe terminating + /// the block. If there are no or only a single successor, return nullptr. + VPRecipeBase *getTerminator(); + const VPRecipeBase *getTerminator() const; + + /// Returns true if the block is exiting its parent region. + bool isExiting() const; + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1913,7 +2081,7 @@ private: }; /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks -/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG. +/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG. /// A VPRegionBlock may indicate that its contents are to be replicated several /// times. This is designed to support predicated scalarization, in which a /// scalar if-then code structure needs to be generated VF * UF times. Having @@ -1924,25 +2092,26 @@ class VPRegionBlock : public VPBlockBase { /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. VPBlockBase *Entry; - /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock. - VPBlockBase *Exit; + /// Hold the Single Exiting block of the SESE region modelled by the + /// VPRegionBlock. + VPBlockBase *Exiting; /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. bool IsReplicator; public: - VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit, + VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) - : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit), + : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting), IsReplicator(IsReplicator) { assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); - assert(Exit->getSuccessors().empty() && "Exit block has successors."); + assert(Exiting->getSuccessors().empty() && "Exit block has successors."); Entry->setParent(this); - Exit->setParent(this); + Exiting->setParent(this); } VPRegionBlock(const std::string &Name = "", bool IsReplicator = false) - : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr), + : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} ~VPRegionBlock() override { @@ -1976,16 +2145,22 @@ public: // DominatorTreeBase representing the Graph type. VPBlockBase &front() const { return *Entry; } - const VPBlockBase *getExit() const { return Exit; } - VPBlockBase *getExit() { return Exit; } + const VPBlockBase *getExiting() const { return Exiting; } + VPBlockBase *getExiting() { return Exiting; } - /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p - /// ExitBlock must have no successors. - void setExit(VPBlockBase *ExitBlock) { - assert(ExitBlock->getSuccessors().empty() && + /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p + /// ExitingBlock must have no successors.
+ void setExiting(VPBlockBase *ExitingBlock) { + assert(ExitingBlock->getSuccessors().empty() && "Exit block cannot have successors."); - Exit = ExitBlock; - ExitBlock->setParent(this); + Exiting = ExitingBlock; + ExitingBlock->setParent(this); + } + + /// Returns the pre-header VPBasicBlock of the loop region. + VPBasicBlock *getPreheaderVPBB() { + assert(!isReplicator() && "should only get pre-header of loop regions"); + return getSinglePredecessor()->getExitingBasicBlock(); } /// An indicator whether this region is to generate multiple replicated @@ -2119,11 +2294,11 @@ struct GraphTraits> using nodes_iterator = df_iterator; static NodeRef getEntryNode(Inverse N) { - return N.Graph->getExit(); + return N.Graph->getExiting(); } static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); + return nodes_iterator::begin(N->getExiting()); } static nodes_iterator nodes_end(GraphRef N) { @@ -2281,12 +2456,9 @@ class VPlan { /// Holds the name of the VPlan, for printing. std::string Name; - /// Holds all the external definitions created for this VPlan. - // TODO: Introduce a specific representation for external definitions in - // VPlan. External definitions must be immutable and hold a pointer to its - // underlying IR that will be used to implement its structural comparison - // (operators '==' and '<'). - SetVector VPExternalDefs; + /// Holds all the external definitions created for this VPlan. External + /// definitions must be immutable and hold a pointer to their underlying IR. + DenseMap VPExternalDefs; /// Represents the trip count of the original loop, for folding /// the tail. @@ -2307,13 +2479,13 @@ class VPlan { /// to be free when the plan's destructor is called. SmallVector VPValuesToFree; - /// Holds the VPLoopInfo analysis for this VPlan. - VPLoopInfo VPLInfo; - /// Indicates whether it is safe use the Value2VPValue mapping or if the /// mapping cannot be used any longer, because it is stale. bool Value2VPValueEnabled = true; + /// Values used outside the plan. + MapVector LiveOuts; + public: VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { if (Entry) @@ -2321,6 +2493,8 @@ public: } ~VPlan() { + clearLiveOuts(); + if (Entry) { VPValue DummyValue; for (VPBlockBase *Block : depth_first(Entry)) @@ -2334,13 +2508,14 @@ public: delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; - for (VPValue *Def : VPExternalDefs) - delete Def; + for (auto &P : VPExternalDefs) + delete P.second; } /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State); + Value *CanonicalIVStartValue, VPTransformState &State, + bool IsEpilogueVectorization); /// Generate the IR code for this VPlan. void execute(struct VPTransformState *State); @@ -2383,9 +2558,13 @@ public: void setName(const Twine &newName) { Name = newName.str(); } - /// Add \p VPVal to the pool of external definitions if it's not already - /// in the pool. - void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); } + /// Get the existing or add a new external definition for \p V. + VPValue *getOrAddExternalDef(Value *V) { + auto I = VPExternalDefs.insert({V, nullptr}); + if (I.second) + I.first->second = new VPValue(V); + return I.first->second; + } void addVPValue(Value *V) { assert(Value2VPValueEnabled && @@ -2432,10 +2611,6 @@ public: Value2VPValue.erase(V); } - /// Return the VPLoopInfo analysis for this VPlan. 
- VPLoopInfo &getVPLoopInfo() { return VPLInfo; } - const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPlan to \p O. void print(raw_ostream &O) const; @@ -2465,7 +2640,10 @@ public: /// Returns the VPRegionBlock of the vector loop. VPRegionBlock *getVectorLoopRegion() { - return cast(getEntry()); + return cast(getEntry()->getSingleSuccessor()); + } + const VPRegionBlock *getVectorLoopRegion() const { + return cast(getEntry()->getSingleSuccessor()); } /// Returns the canonical induction recipe of the vector loop. @@ -2478,6 +2656,23 @@ public: return cast(&*EntryVPBB->begin()); } + void addLiveOut(PHINode *PN, VPValue *V); + + void clearLiveOuts() { + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); + } + + void removeLiveOut(PHINode *PN) { + delete LiveOuts[PN]; + LiveOuts.erase(PN); + } + + const MapVector &getLiveOuts() const { + return LiveOuts; + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2567,9 +2762,8 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's - /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's - /// conditional bit is propagated to \p NewBlock. \p NewBlock must have - /// neither successors nor predecessors. + /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must + /// have neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && NewBlock->getPredecessors().empty() && "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); SmallVector Succs(BlockPtr->successors()); for (VPBlockBase *Succ : Succs) { disconnectBlocks(BlockPtr, Succ); connectBlocks(NewBlock, Succ); } - NewBlock->setCondBit(BlockPtr->getCondBit()); - BlockPtr->setCondBit(nullptr); connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr - /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor - /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse - /// must have neither successors nor predecessors. + /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors + /// and \p IfTrue and \p IfFalse must have neither successors nor + /// predecessors.
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, - VPValue *Condition, VPBlockBase *BlockPtr) { + VPBlockBase *BlockPtr) { assert(IfTrue->getSuccessors().empty() && "Can't insert IfTrue with successors."); assert(IfFalse->getSuccessors().empty() && "Can't insert IfFalse with successors."); - BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition); + BlockPtr->setTwoSuccessors(IfTrue, IfFalse); IfTrue->setPredecessors({BlockPtr}); IfFalse->setPredecessors({BlockPtr}); IfTrue->setParent(BlockPtr->getParent()); @@ -2639,8 +2831,8 @@ public: R.moveBefore(*PredVPBB, PredVPBB->end()); VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); auto *ParentRegion = cast(Block->getParent()); - if (ParentRegion->getExit() == Block) - ParentRegion->setExit(PredVPBB); + if (ParentRegion->getExiting() == Block) + ParentRegion->setExiting(PredVPBB); SmallVector Successors(Block->successors()); for (auto *Succ : Successors) { VPBlockUtils::disconnectBlocks(Block, Succ); @@ -2650,41 +2842,6 @@ public: return PredVPBB; } - /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. - static bool isBackEdge(const VPBlockBase *FromBlock, - const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { - assert(FromBlock->getParent() == ToBlock->getParent() && - FromBlock->getParent() && "Must be in same region"); - const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock); - const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock); - if (!FromLoop || !ToLoop || FromLoop != ToLoop) - return false; - - // A back-edge is a branch from the loop latch to its header. - return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader(); - } - - /// Returns true if \p Block is a loop latch - static bool blockIsLoopLatch(const VPBlockBase *Block, - const VPLoopInfo *VPLInfo) { - if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block)) - return ParentVPL->isLoopLatch(Block); - - return false; - } - - /// Count and return the number of succesors of \p PredBlock excluding any - /// backedges. - static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock, - VPLoopInfo *VPLI) { - unsigned Count = 0; - for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) { - if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI)) - Count++; - } - return Count; - } - /// Return an iterator range over \p Range which only includes \p BlockTy /// blocks. The accesses are casted to \p BlockTy. template @@ -2845,6 +3002,13 @@ namespace vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(VPValue *Def); +/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p +/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in +/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's +/// pre-header already contains a recipe expanding \p Expr, return it. If not, +/// create a new one. +VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, + ScalarEvolution &SE); } // end namespace vputils } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 379988733312..84b0dac862b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -42,9 +42,6 @@ private: // Vectorization plan that we are working on. VPlan &Plan; - // Output Top Region. - VPRegionBlock *TopRegion = nullptr; - // Builder of the VPlan instruction-level representation. 
VPBuilder VPIRBuilder; @@ -59,6 +56,9 @@ private: // Hold phi nodes that need to be fixed once the plain CFG has been built. SmallVector PhisToFix; + /// Maps loops in the original IR to their corresponding region. + DenseMap Loop2Region; + // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); @@ -73,8 +73,9 @@ public: PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) : TheLoop(Lp), LI(LI), Plan(P) {} - // Build the plain CFG and return its Top Region. - VPRegionBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected + /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. + VPBasicBlock *buildPlainCFG(); }; } // anonymous namespace @@ -106,19 +107,32 @@ void PlainCFGBuilder::fixPhiNodes() { } } -// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an -// existing one if it was already created. +// Create a new empty VPBasicBlock for an incoming BasicBlock in the region +// corresponding to the containing loop or retrieve an existing one if it was +// already created. If no region exists yet for the loop containing \p BB, a new +// one is created. VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { auto BlockIt = BB2VPBB.find(BB); if (BlockIt != BB2VPBB.end()) // Retrieve existing VPBB. return BlockIt->second; + // Get or create a region for the loop containing BB. + Loop *CurrentLoop = LI->getLoopFor(BB); + VPRegionBlock *ParentR = nullptr; + if (CurrentLoop) { + auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); + if (Iter.second) + Iter.first->second = new VPRegionBlock( + CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/); + ParentR = Iter.first->second; + } + // Create new VPBB. LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n"); VPBasicBlock *VPBB = new VPBasicBlock(BB->getName()); BB2VPBB[BB] = VPBB; - VPBB->setParent(TopRegion); + VPBB->setParent(ParentR); return VPBB; } @@ -182,8 +196,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) { // A and B: Create VPValue and add it to the pool of external definitions and // to the Value->VPValue map. - VPValue *NewVPVal = new VPValue(IRVal); - Plan.addExternalDef(NewVPVal); + VPValue *NewVPVal = Plan.getOrAddExternalDef(IRVal); IRDef2VPValue[IRVal] = NewVPVal; return NewVPVal; } @@ -203,10 +216,13 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, "Instruction shouldn't have been visited."); if (auto *Br = dyn_cast(Inst)) { - // Branch instruction is not explicitly represented in VPlan but we need - // to represent its condition bit when it's conditional. - if (Br->isConditional()) - getOrCreateVPOperand(Br->getCondition()); + // Conditional branch instructions are represented using BranchOnCond + // recipes. + if (Br->isConditional()) { + VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); + VPBB->appendRecipe( + new VPInstruction(VPInstruction::BranchOnCond, {Cond})); + } // Skip the rest of the Instruction processing for Branch instructions. continue; @@ -238,11 +254,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, }
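Note that getOrCreateVPOperand above now funnels through VPlan::getOrAddExternalDef, which keys external definitions by the underlying IR value rather than allocating a fresh VPValue per call. The observable effect, sketched with a hypothetical helper:

    #include "VPlan.h"
    #include <cassert>
    using namespace llvm;

    // Repeated queries for the same IR value yield the same external def;
    // the VPlan owns these VPValues and deletes them in its destructor.
    static VPValue *externalDefFor(VPlan &Plan, Value *V) {
      VPValue *Def = Plan.getOrAddExternalDef(V);
      assert(Def == Plan.getOrAddExternalDef(V) && "defs are deduplicated");
      return Def;
    }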
// Main interface to build the plain CFG. -VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { - // 1. Create the Top Region. It will be the parent of all VPBBs. - TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/); - - // 2. Scan the body of the loop in a topological order to visit each basic +VPBasicBlock *PlainCFGBuilder::buildPlainCFG() { + // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for // each BB and link it to its successor and predecessor VPBBs. Note that // predecessors must be set in the same order as they are in the incoming IR. @@ -251,21 +264,20 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { // Loop PH needs to be explicitly visited since it's not taken into account by // LoopBlocksDFS. - BasicBlock *PreheaderBB = TheLoop->getLoopPreheader(); - assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) && + BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); + assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && "Unexpected loop preheader"); - VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB); - for (auto &I : *PreheaderBB) { + VPBasicBlock *ThePreheaderVPBB = getOrCreateVPBB(ThePreheaderBB); + ThePreheaderVPBB->setName("vector.ph"); + for (auto &I : *ThePreheaderBB) { if (I.getType()->isVoidTy()) continue; - VPValue *VPV = new VPValue(&I); - Plan.addExternalDef(VPV); - IRDef2VPValue[&I] = VPV; + IRDef2VPValue[&I] = Plan.getOrAddExternalDef(&I); } // Create empty VPBB for Loop H so that we can link PH->H. VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader()); - // Preheader's predecessors will be set during the loop RPO traversal below. - PreheaderVPBB->setOneSuccessor(HeaderVPBB); + HeaderVPBB->setName("vector.body"); + ThePreheaderVPBB->setOneSuccessor(HeaderVPBB); LoopBlocksRPO RPO(TheLoop); RPO.perform(LI); @@ -295,16 +307,13 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { // Get VPBB's condition bit. assert(isa(TI) && "Unsupported terminator!"); - auto *Br = cast(TI); - Value *BrCond = Br->getCondition(); // Look up the branch condition to get the corresponding VPValue // representing the condition bit in VPlan (which may be in another VPBB). - assert(IRDef2VPValue.count(BrCond) && + assert(IRDef2VPValue.count(cast(TI)->getCondition()) && "Missing condition bit in IRDef2VPValue!"); - VPValue *VPCondBit = IRDef2VPValue[BrCond]; - // Link successors using condition bit. - VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit); + // Link successors. + VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1); } else llvm_unreachable("Number of successors not supported."); @@ -312,30 +321,61 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { setVPBBPredsFromBB(VPBB, BB); } - // 3. Process outermost loop exit. We created an empty VPBB for the loop + // 2. Process outermost loop exit. We created an empty VPBB for the loop // single exit BB during the RPO traversal of the loop body but Instructions // weren't visited because it's not part of the loop. BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); assert(LoopExitBB && "Loops with multiple exits are not supported."); VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB]; - createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB); // Loop exit was already set as successor of the loop exiting BB. // We only set its predecessor VPBB now. setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB); + // 3. Fix up region blocks for loops. For each loop, + // * use the header block as entry to the corresponding region, + // * use the latch block as exit of the corresponding region, + // * set the region as successor of the loop pre-header, and + // * set the exit block as successor to the region.
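The worklist that follows implements this fix-up. Schematically, the plain chain vector.ph -> vector.body -> ... -> latch -> exit, with a latch-to-header back-edge, becomes vector.ph -> Region[entry: vector.body, exiting: latch] -> exit; the back-edge is implied by the region. Once regions exist, the navigation helpers added earlier in this patch apply; a sketch with a hypothetical wrapper:

    #include "VPlan.h"
    using namespace llvm;

    // For a loop region produced by this fix-up, the pre-header is simply
    // the region's single predecessor (getPreheaderVPBB asserts this).
    static VPBasicBlock *preheaderOf(VPRegionBlock *Region) {
      return Region->getPreheaderVPBB();
    }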
+ SmallVector LoopWorkList; + LoopWorkList.push_back(TheLoop); + while (!LoopWorkList.empty()) { + Loop *L = LoopWorkList.pop_back_val(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Exiting = L->getLoopLatch(); + assert(Exiting == L->getExitingBlock() && + "Latch must be the only exiting block"); + VPRegionBlock *Region = Loop2Region[L]; + VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header); + VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting); + + // Disconnect backedge and pre-header from header. + VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader()); + VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB); + VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB); + + Region->setParent(PreheaderVPBB->getParent()); + Region->setEntry(HeaderVPBB); + VPBlockUtils::connectBlocks(PreheaderVPBB, Region); + + // Disconnect exit block from exiting (=latch) block, set exiting block and + // connect region to exit block. + VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock()); + VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB); + Region->setExiting(ExitingVPBB); + VPBlockUtils::connectBlocks(Region, ExitVPBB); + + // Queue sub-loops for processing. + LoopWorkList.append(L->begin(), L->end()); + } // 4. The whole CFG has been built at this point so all the input Values must // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding // VPlan operands. fixPhiNodes(); - // 5. Final Top Region setup. Set outermost loop pre-header and single exit as - // Top Region entry and exit. - TopRegion->setEntry(PreheaderVPBB); - TopRegion->setExit(LoopExitVPBB); - return TopRegion; + return ThePreheaderVPBB; } -VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() { +VPBasicBlock *VPlanHCFGBuilder::buildPlainCFG() { PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan); return PCFGBuilder.buildPlainCFG(); } @@ -343,20 +383,15 @@ VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() { // Public interface to build a H-CFG. void VPlanHCFGBuilder::buildHierarchicalCFG() { // Build Top Region enclosing the plain CFG and set it as VPlan entry. - VPRegionBlock *TopRegion = buildPlainCFG(); - Plan.setEntry(TopRegion); + VPBasicBlock *EntryVPBB = buildPlainCFG(); + Plan.setEntry(EntryVPBB); LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); Verifier.verifyHierarchicalCFG(TopRegion); // Compute plain CFG dom tree for VPLInfo. VPDomTree.recalculate(*TopRegion); LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n"; VPDomTree.print(dbgs())); - - // Compute VPLInfo and keep it in Plan. - VPLoopInfo &VPLInfo = Plan.getVPLoopInfo(); - VPLInfo.analyze(VPDomTree); - LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n"; - VPLInfo.print(dbgs())); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 238ee7e6347c..2d52990af268 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -24,13 +24,15 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H -#include "VPlan.h" #include "VPlanDominatorTree.h" #include "VPlanVerifier.h" namespace llvm { class Loop; +class LoopInfo; +class VPRegionBlock; +class VPlan; class VPlanTestBase; /// Main class to build the VPlan H-CFG for an incoming IR. @@ -55,9 +57,9 @@ private: // are introduced. VPDominatorTree VPDomTree; - /// Build plain CFG for TheLoop.
Return a new VPRegionBlock (TopRegion) - /// enclosing the plain CFG. - VPRegionBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected + /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. + VPBasicBlock *buildPlainCFG(); public: VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) diff --git a/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h b/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h deleted file mode 100644 index 5208f2d58e2b..000000000000 --- a/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a -/// specialization of LoopInfoBase for VPBlockBase. VPLoops is a specialization -/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further -/// information can be found in VectorizationPlanner.rst. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H - -#include "llvm/Analysis/LoopInfoImpl.h" - -namespace llvm { -class VPBlockBase; - -/// Hold analysis information for every loop detected by VPLoopInfo. It is an -/// instantiation of LoopBase. -class VPLoop : public LoopBase { -private: - friend class LoopInfoBase; - explicit VPLoop(VPBlockBase *VPB) : LoopBase(VPB) {} -}; - -/// VPLoopInfo provides analysis of natural loop for VPBlockBase-based -/// Hierarchical CFG. It is a specialization of LoopInfoBase class. -// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which -// is the same as the incoming IR CFG. If it's more efficient than running the -// whole loop detection algorithm, we may want to create a mechanism to -// translate LoopInfo into VPLoopInfo. However, that would require significant -// changes in LoopInfoBase class. -typedef LoopInfoBase VPLoopInfo; - -} // namespace llvm - -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp deleted file mode 100644 index e879a33db6ee..000000000000 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ /dev/null @@ -1,248 +0,0 @@ -//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements the VPlanPredicator class which contains the public -/// interfaces to predicate and linearize the VPlan region. 
-/// -//===----------------------------------------------------------------------===// - -#include "VPlanPredicator.h" -#include "VPlan.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "VPlanPredicator" - -using namespace llvm; - -// Generate VPInstructions at the beginning of CurrBB that calculate the -// predicate being propagated from PredBB to CurrBB depending on the edge type -// between them. For example if: -// i. PredBB is controlled by predicate %BP, and -// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition -// bit value %CBV then this function will generate the following two -// VPInstructions at the start of CurrBB: -// %IntermediateVal = not %CBV -// %FinalVal = and %BP %IntermediateVal -// It returns %FinalVal. -VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, - VPBasicBlock *CurrBB) { - VPValue *CBV = PredBB->getCondBit(); - - // Set the intermediate value - this is either 'CBV', or 'not CBV' - // depending on the edge type. - EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB); - VPValue *IntermediateVal = nullptr; - switch (ET) { - case EdgeType::TRUE_EDGE: - // CurrBB is the true successor of PredBB - nothing to do here. - IntermediateVal = CBV; - break; - - case EdgeType::FALSE_EDGE: - // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV, {}); - break; - } - - // Now AND intermediate value with PredBB's block predicate if it has one. - VPValue *BP = PredBB->getPredicate(); - if (BP) - return Builder.createAnd(BP, IntermediateVal, {}); - else - return IntermediateVal; -} - -// Generate a tree of ORs for all IncomingPredicates in WorkList. -// Note: This function destroys the original Worklist. -// -// P1 P2 P3 P4 P5 -// \ / \ / / -// OR1 OR2 / -// \ | / -// \ +/-+ -// \ / | -// OR3 | -// \ | -// OR4 <- Returns this -// | -// -// The algorithm uses a worklist of predicates as its main data structure. -// We pop a pair of values from the front (e.g. P1 and P2), generate an OR -// (in this example OR1), and push it back. In this example the worklist -// contains {P3, P4, P5, OR1}. -// The process iterates until we have only one element in the Worklist (OR4). -// The last element is the root predicate which is returned. -VPValue *VPlanPredicator::genPredicateTree(std::list &Worklist) { - if (Worklist.empty()) - return nullptr; - - // The worklist initially contains all the leaf nodes. Initialize the tree - // using them. - while (Worklist.size() >= 2) { - // Pop a pair of values from the front. - VPValue *LHS = Worklist.front(); - Worklist.pop_front(); - VPValue *RHS = Worklist.front(); - Worklist.pop_front(); - - // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS, {}); - - // Push OR to the back of the worklist. - Worklist.push_back(Or); - } - - assert(Worklist.size() == 1 && "Expected 1 item in worklist"); - - // The root is the last node in the worklist. - VPValue *Root = Worklist.front(); - - // This root needs to replace the existing block predicate. This is done in - // the caller function. 
- return Root; -} - -// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE -VPlanPredicator::EdgeType -VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock, - VPBlockBase *ToBlock) { - unsigned Count = 0; - for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) { - if (SuccBlock == ToBlock) { - assert(Count < 2 && "Switch not supported currently"); - return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE; - } - Count++; - } - - llvm_unreachable("Broken getEdgeTypeBetween"); -} - -// Generate all predicates needed for CurrBlock by going through its immediate -// predecessor blocks. -void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock, - VPRegionBlock *Region) { - // Blocks that dominate region exit inherit the predicate from the region. - // Return after setting the predicate. - if (VPDomTree.dominates(CurrBlock, Region->getExit())) { - VPValue *RegionBP = Region->getPredicate(); - CurrBlock->setPredicate(RegionBP); - return; - } - - // Collect all incoming predicates in a worklist. - std::list IncomingPredicates; - - // Set the builder's insertion point to the top of the current BB - VPBasicBlock *CurrBB = cast(CurrBlock->getEntryBasicBlock()); - Builder.setInsertPoint(CurrBB, CurrBB->begin()); - - // For each predecessor, generate the VPInstructions required for - // computing 'BP AND (not) CBV" at the top of CurrBB. - // Collect the outcome of this calculation for all predecessors - // into IncomingPredicates. - for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) { - // Skip back-edges - if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI)) - continue; - - VPValue *IncomingPredicate = nullptr; - unsigned NumPredSuccsNoBE = - VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI); - - // If there is an unconditional branch to the currBB, then we don't create - // edge predicates. We use the predecessor's block predicate instead. - if (NumPredSuccsNoBE == 1) - IncomingPredicate = PredBlock->getPredicate(); - else if (NumPredSuccsNoBE == 2) { - // Emit recipes into CurrBlock if required - assert(isa(PredBlock) && "Only BBs have multiple exits"); - IncomingPredicate = - getOrCreateNotPredicate(cast(PredBlock), CurrBB); - } else - llvm_unreachable("FIXME: switch statement ?"); - - if (IncomingPredicate) - IncomingPredicates.push_back(IncomingPredicate); - } - - // Logically OR all incoming predicates by building the Predicate Tree. - VPValue *Predicate = genPredicateTree(IncomingPredicates); - - // Now update the block's predicate with the new one. - CurrBlock->setPredicate(Predicate); -} - -// Generate all predicates needed for Region. -void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { - VPBasicBlock *EntryBlock = cast(Region->getEntry()); - ReversePostOrderTraversal RPOT(EntryBlock); - - // Generate edge predicates and append them to the block predicate. RPO is - // necessary since the predecessor blocks' block predicate needs to be set - // before the current block's block predicate can be computed. - for (VPBlockBase *Block : RPOT) { - // TODO: Handle nested regions once we start generating the same. - assert(!isa(Block) && "Nested region not expected"); - createOrPropagatePredicates(Block, Region); - } -} - -// Linearize the CFG within Region. -// TODO: Predication and linearization need RPOT for every region. -// This traversal is expensive. Since predication is not adding new -// blocks, we should be able to compute RPOT once in predication and -// reuse it here. 
This becomes even more important once we have nested -// regions. -void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { - ReversePostOrderTraversal RPOT(Region->getEntry()); - VPBlockBase *PrevBlock = nullptr; - - for (VPBlockBase *CurrBlock : RPOT) { - // TODO: Handle nested regions once we start generating the same. - assert(!isa(CurrBlock) && "Nested region not expected"); - - // Linearize control flow by adding an unconditional edge between PrevBlock - // and CurrBlock skipping loop headers and latches to keep intact loop - // header predecessors and loop latch successors. - if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) && - !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) { - - LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->" - << CurrBlock->getName() << "\n"); - - PrevBlock->clearSuccessors(); - CurrBlock->clearPredecessors(); - VPBlockUtils::connectBlocks(PrevBlock, CurrBlock); - } - - PrevBlock = CurrBlock; - } -} - -// Entry point. The driver function for the predicator. -void VPlanPredicator::predicate() { - // Predicate the blocks within Region. - predicateRegionRec(cast(Plan.getEntry())); - - // Linearlize the blocks with Region. - linearizeRegionRec(cast(Plan.getEntry())); -} - -VPlanPredicator::VPlanPredicator(VPlan &Plan) - : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) { - // FIXME: Predicator is currently computing the dominator information for the - // top region. Once we start storing dominator information in a VPRegionBlock, - // we can avoid this recalculation. - VPDomTree.recalculate(*(cast(Plan.getEntry()))); -} diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h deleted file mode 100644 index a5db9a54da3c..000000000000 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h +++ /dev/null @@ -1,74 +0,0 @@ -//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines the VPlanPredicator class which contains the public -/// interfaces to predicate and linearize the VPlan region. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H - -#include "LoopVectorizationPlanner.h" -#include "VPlan.h" -#include "VPlanDominatorTree.h" - -namespace llvm { - -class VPlanPredicator { -private: - enum class EdgeType { - TRUE_EDGE, - FALSE_EDGE, - }; - - // VPlan being predicated. - VPlan &Plan; - - // VPLoopInfo for Plan's HCFG. - VPLoopInfo *VPLI; - - // Dominator tree for Plan's HCFG. - VPDominatorTree VPDomTree; - - // VPlan builder used to generate VPInstructions for block predicates. - VPBuilder Builder; - - /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if - /// \p ToBlock is either the unconditional successor or the conditional true - /// successor of \p FromBlock and FALSE_EDGE otherwise. - EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock); - - /// Create and return VPValue corresponding to the predicate for the edge from - /// \p PredBB to \p CurrentBlock. 
- VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB); - - /// Generate and return the result of ORing all the predicate VPValues in \p - /// Worklist. - VPValue *genPredicateTree(std::list &Worklist); - - /// Create or propagate predicate for \p CurrBlock in region \p Region using - /// predicate(s) of its predecessor(s) - void createOrPropagatePredicates(VPBlockBase *CurrBlock, - VPRegionBlock *Region); - - /// Predicate the CFG within \p Region. - void predicateRegionRec(VPRegionBlock *Region); - - /// Linearize the CFG within \p Region. - void linearizeRegionRec(VPRegionBlock *Region); - -public: - VPlanPredicator(VPlan &Plan); - - /// Predicate Plan's HCFG. - void predicate(); -}; -} // end namespace llvm -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp new file mode 100644 index 000000000000..92422b17457c --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -0,0 +1,840 @@ +//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains implementations for different VPlan recipes. +/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include + +using namespace llvm; + +extern cl::opt EnableVPlanNativePath; + +bool VPRecipeBase::mayWriteToMemory() const { + switch (getVPDefID()) { + case VPWidenMemoryInstructionSC: { + return cast(this)->isStore(); + } + case VPReplicateSC: + case VPWidenCallSC: + return cast(getVPSingleValue()->getUnderlyingValue()) + ->mayWriteToMemory(); + case VPBranchOnMaskSC: + return false; + case VPWidenIntOrFpInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: { + const Instruction *I = + dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayWriteToMemory()) && + "underlying instruction may write to memory"); + return false; + } + default: + return true; + } +} + +bool VPRecipeBase::mayReadFromMemory() const { + switch (getVPDefID()) { + case VPWidenMemoryInstructionSC: { + return !cast(this)->isStore(); + } + case VPReplicateSC: + case VPWidenCallSC: + return cast(getVPSingleValue()->getUnderlyingValue()) + ->mayReadFromMemory(); + case VPBranchOnMaskSC: + return false; + case VPWidenIntOrFpInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: { + const Instruction *I = + 
dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayReadFromMemory()) && + "underlying instruction may read from memory"); + return false; + } + default: + return true; + } +} + +bool VPRecipeBase::mayHaveSideEffects() const { + switch (getVPDefID()) { + case VPWidenIntOrFpInductionSC: + case VPWidenPointerInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: + case VPScalarIVStepsSC: { + const Instruction *I = + dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayHaveSideEffects()) && + "underlying instruction has side-effects"); + return false; + } + case VPReplicateSC: { + auto *R = cast(this); + return R->getUnderlyingInstr()->mayHaveSideEffects(); + } + default: + return true; + } +} + +void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { + auto Lane = VPLane::getLastLaneForVF(State.VF); + VPValue *ExitValue = getOperand(0); + if (Plan.isUniformAfterVectorization(ExitValue)) + Lane = VPLane::getFirstLane(); + Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), + State.Builder.GetInsertBlock()); +} + +void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insert(InsertPos->getIterator(), this); +} + +void VPRecipeBase::insertBefore(VPBasicBlock &BB, + iplist::iterator I) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(I == BB.end() || I->getParent() == &BB); + Parent = &BB; + BB.getRecipeList().insert(I, this); +} + +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + +iplist::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + return getParent()->getRecipeList().erase(getIterator()); +} + +void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { + removeFromParent(); + insertAfter(InsertPos); +} + +void VPRecipeBase::moveBefore(VPBasicBlock &BB, + iplist::iterator I) { + removeFromParent(); + insertBefore(BB, I); +} + +void VPInstruction::generateInstruction(VPTransformState &State, + unsigned Part) { + IRBuilderBase &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); + + if (Instruction::isBinaryOp(getOpcode())) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + State.set(this, V, Part); + return; + } + + switch (getOpcode()) { + case VPInstruction::Not: { + Value *A = State.get(getOperand(0), Part); + Value *V = Builder.CreateNot(A); + State.set(this, V, Part); + break; + } + case VPInstruction::ICmpULE: { + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *V = Builder.CreateICmpULE(IV, TC); + State.set(this, V, Part); + break; + } + case Instruction::Select: { + Value 
*Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } + case VPInstruction::ActiveLaneMask: { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + // Get the original loop tripcount. + Value *ScalarTC = State.get(getOperand(1), Part); + + auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); + Instruction *Call = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + State.set(this, Call, Part); + break; + } + case VPInstruction::FirstOrderRecurrenceSplice: { + // Generate code to combine the previous and current values in vector v3. + // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + + // For the first part, use the recurrence phi (v1), otherwise v2. + auto *V1 = State.get(getOperand(0), 0); + Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); + if (!PartMinus1->getType()->isVectorTy()) { + State.set(this, PartMinus1, Part); + } else { + Value *V2 = State.get(getOperand(1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); + } + break; + } + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + Value *Next = nullptr; + if (Part == 0) { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCond: { + if (Part != 0) + break; + + Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); + VPRegionBlock *ParentRegion = getParent()->getParent(); + VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); + + // Replace the temporary unreachable terminator with a new conditional + // branch, hooking it up to backward destination for exiting blocks now and + // to forward destination(s) later when they are created. + BranchInst *CondBr = + Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); + + if (getParent()->isExiting()) + CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); + + CondBr->setSuccessor(0, nullptr); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } + case VPInstruction::BranchOnCount: { + if (Part != 0) + break; + // First create the compare. + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *Cond = Builder.CreateICmpEQ(IV, TC); + + // Now create the branch. 
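The `CanonicalIVIncrement`/`BranchOnCount` pair above implements the vector loop's latch: the canonical induction variable advances by VF * UF per vector iteration, and the loop exits once it equals the trip count. Below is a minimal scalar model of that control scheme; VF, UF, and the trip count are illustrative assumptions (not values from this patch), and the trip count is assumed to divide evenly by VF * UF.

```cpp
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;  // vectorization and unroll factors
  const unsigned TripCount = 24;  // assumed to be a multiple of VF * UF
  unsigned Index = 0;             // canonical induction variable
  do {
    // The vector body would process lanes [Index, Index + VF*UF) here.
    printf("vector iteration covers [%u, %u)\n", Index, Index + VF * UF);
    Index += VF * UF;             // CanonicalIVIncrement: step = VF * UF
  } while (Index != TripCount);   // BranchOnCount: icmp eq IV, TC, exit on eq
  return 0;
}
```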
+ auto *Plan = getParent()->getPlan(); + VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + + // Replace the temporary unreachable terminator with a new conditional + // branch, hooking it up to backward destination (the header) now and to the + // forward destination (the exit/middle block) later when it is created. + // Note that CreateCondBr expects a valid BB as first argument, so we need + // to set it to nullptr later. + BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), + State.CFG.VPBB2IRBB[Header]); + CondBr->setSuccessor(0, nullptr); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } + default: + llvm_unreachable("Unsupported opcode for instruction"); + } +} + +void VPInstruction::execute(VPTransformState &State) { + assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); + for (unsigned Part = 0; Part < State.UF; ++Part) + generateInstruction(State, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInstruction::dump() const { + VPSlotTracker SlotTracker(getParent()->getPlan()); + print(dbgs(), "", SlotTracker); +} + +void VPInstruction::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + + if (hasResult()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + + switch (getOpcode()) { + case VPInstruction::Not: + O << "not"; + break; + case VPInstruction::ICmpULE: + O << "icmp ule"; + break; + case VPInstruction::SLPLoad: + O << "combined load"; + break; + case VPInstruction::SLPStore: + O << "combined store"; + break; + case VPInstruction::ActiveLaneMask: + O << "active lane mask"; + break; + case VPInstruction::FirstOrderRecurrenceSplice: + O << "first-order splice"; + break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; + case VPInstruction::BranchOnCond: + O << "branch-on-cond"; + break; + case VPInstruction::BranchOnCount: + O << "branch-on-count "; + break; + default: + O << Instruction::getOpcodeName(getOpcode()); + } + + O << FMF; + + for (const VPValue *Operand : operands()) { + O << " "; + Operand->printAsOperand(O, SlotTracker); + } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } +} +#endif + +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-CALL "; + + auto *CI = cast(getUnderlyingInstr()); + if (CI->getType()->isVoidTy()) + O << "void "; + else { + printAsOperand(O, SlotTracker); + O << " = "; + } + + O << "call @" << CI->getCalledFunction()->getName() << "("; + printOperands(O, SlotTracker); + O << ")"; +} + +void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-SELECT "; + printAsOperand(O, SlotTracker); + O << " = select "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (InvariantCond ? " (condition is loop invariant)" : ""); +} + +void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); +} + +void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-INDUCTION"; + if (getTruncInst()) { + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; + O << " +\n" << Indent << "\" "; + getVPValue(0)->printAsOperand(O, SlotTracker); + } else + O << " " << VPlanIngredient(IV); + + O << ", "; + getStepValue()->printAsOperand(O, SlotTracker); +} +#endif + +bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); + auto *StepC = dyn_cast(getInductionDescriptor().getStep()); + return StartC && StartC->isZero() && StepC && StepC->isOne(); +} + +VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const { + return cast(getOperand(0)); +} + +bool VPScalarIVStepsRecipe::isCanonical() const { + auto *CanIV = getCanonicalIV(); + // The start value of the steps-recipe must match the start value of the + // canonical induction and it must step by 1. + if (CanIV->getStartValue() != getStartValue()) + return false; + auto *StepVPV = getStepValue(); + if (StepVPV->getDef()) + return false; + auto *StepC = dyn_cast_or_null(StepVPV->getLiveInIRValue()); + return StepC && StepC->isOne(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent; + printAsOperand(O, SlotTracker); + O << Indent << "= SCALAR-STEPS "; + printOperands(O, SlotTracker); +} + +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? 
"Inv" : "Var") << "]"; + + O << " "; + printAsOperand(O, SlotTracker); + O << " = getelementptr "; + printOperands(O, SlotTracker); +} + +void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "BLEND "; + Phi->printAsOperand(O, false); + O << " ="; + if (getNumIncomingValues() == 1) { + // Not a User of any mask: not really blending, this is a + // single-predecessor phi. + O << " "; + getIncomingValue(0)->printAsOperand(O, SlotTracker); + } else { + for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { + O << " "; + getIncomingValue(I)->printAsOperand(O, SlotTracker); + O << "/"; + getMask(I)->printAsOperand(O, SlotTracker); + } + } +} + +void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + if (isa(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + if (getCondOp()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; +} + +void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); + + if (!getUnderlyingInstr()->getType()->isVoidTy()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + if (auto *CB = dyn_cast(getUnderlyingInstr())) { + O << "call @" << CB->getCalledFunction()->getName() << "("; + interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), + O, [&O, &SlotTracker](VPValue *Op) { + Op->printAsOperand(O, SlotTracker); + }); + O << ")"; + } else { + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + printOperands(O, SlotTracker); + } + + if (AlsoPack) + O << " (S->V)"; +} + +void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PHI-PREDICATED-INSTRUCTION "; + printAsOperand(O, SlotTracker); + O << " = "; + printOperands(O, SlotTracker); +} + +void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + + if (!isStore()) { + getVPSingleValue()->printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); +} +#endif + +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + EntryPart->addIncoming(Start, VectorPH); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + +bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { + bool IsUniform = 
vputils::onlyFirstLaneUsed(this); + return all_of(users(), + [&](const VPUser *U) { return U->usesScalars(this); }) && + (IsUniform || !VF.isScalable()); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = WIDEN-POINTER-INDUCTION "; + getStartValue()->printAsOperand(O, SlotTracker); + O << ", " << *IndDesc.getStep(); +} +#endif + +void VPExpandSCEVRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "cannot be used in per-lane"); + const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); + SCEVExpander Exp(SE, DL, "induction"); + + Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), + &*State.Builder.GetInsertPoint()); + + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, Res, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getVPSingleValue()->printAsOperand(O, SlotTracker); + O << " = EXPAND SCEV " << *Expr; +} +#endif + +void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { + Value *CanonicalIV = State.get(getOperand(0), 0); + Type *STy = CanonicalIV->getType(); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + ElementCount VF = State.VF; + Value *VStart = VF.isScalar() + ? CanonicalIV + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = + Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); + } + Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(this, CanonicalVectorIV, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = WIDEN-CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); +} +#endif + +void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { + auto &Builder = State.Builder; + // Create a vector from the initial value. + auto *VectorInit = getStartValue()->getLiveInIRValue(); + + Type *VecTy = State.VF.isScalar() + ? VectorInit->getType() + : VectorType::get(VectorInit->getType(), State.VF); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + if (State.VF.isVector()) { + auto *IdxTy = Builder.getInt32Ty(); + auto *One = ConstantInt::get(IdxTy, 1); + IRBuilder<>::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); + auto *LastIdx = Builder.CreateSub(RuntimeVF, One); + VectorInit = Builder.CreateInsertElement( + PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); + } + + // Create a phi node for the new recurrence. 
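`VPFirstOrderRecurrencePHIRecipe::execute` above seeds the recurrence vector with the scalar initial value in its last lane, so the `FirstOrderRecurrenceSplice` shown earlier can pair that lane with the first VF-1 lanes of the current part. A standalone sketch of the splice on plain arrays, assuming VF = 4 and invented values (this models `CreateVectorSplice(Prev, Cur, -1)`, not the real intrinsic):

```cpp
#include <array>
#include <cstdio>

template <size_t VF>
std::array<int, VF> splice(const std::array<int, VF> &Prev,
                           const std::array<int, VF> &Cur) {
  std::array<int, VF> Out{};
  Out[0] = Prev[VF - 1];       // last lane of the previous part
  for (size_t I = 1; I < VF; ++I)
    Out[I] = Cur[I - 1];       // first VF-1 lanes of the current part
  return Out;
}

int main() {
  std::array<int, 4> VInit{0, 0, 0, 42}; // a[-1] == 42 sits in the last lane
  std::array<int, 4> V2{1, 2, 3, 4};     // a[i..i+3]
  auto V3 = splice(VInit, V2);           // yields {42, 1, 2, 3}
  for (int X : V3)
    printf("%d ", X);
  printf("\n");
  return 0;
}
```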
+ PHINode *EntryPart = PHINode::Create( + VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(VectorInit, VectorPH); + State.set(this, EntryPart, 0); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + +void VPReductionPHIRecipe::execute(VPTransformState &State) { + PHINode *PN = cast(getUnderlyingValue()); + auto &Builder = State.Builder; + + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #1: We create a new vector PHI node with no incoming edges. We'll use + // this value when we vectorize all of the instructions that use the PHI. + bool ScalarPHI = State.VF.isScalar() || IsInLoop; + Type *VecTy = + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); + + BasicBlock *HeaderBB = State.CFG.PrevBB; + assert(State.CurrentVectorLoop->getHeader() == HeaderBB && + "recipe must be in the vector loop header"); + unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *EntryPart = + PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); + State.set(this, EntryPart, Part); + } + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VPValue *StartVPV = getStartValue(); + Value *StartV = StartVPV->getLiveInIRValue(); + + Value *Iden = nullptr; + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || + RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. + if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + StartV = Iden = + Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); + } + } else { + Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); + + if (!ScalarPHI) { + Iden = Builder.CreateVectorSplat(State.VF, Iden); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *EntryPart = State.get(this, Part); + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast(EntryPart)->addIncoming(StartVal, VectorPH); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-REDUCTION-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + +void VPWidenPHIRecipe::execute(VPTransformState &State) { + assert(EnableVPlanNativePath && + "Non-native vplans are not expected to have VPWidenPHIRecipes."); + + // Currently we enter here in the VPlan-native path for non-induction + // PHIs where all control flow is uniform. 
We simply widen these PHIs. + // Create a vector phi with no operands - the vector phi operands will be + // set at the end of vector code generation. + VPBasicBlock *Parent = getParent(); + VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); + unsigned StartIdx = 0; + // For phis in header blocks of loop regions, use the index of the value + // coming from the preheader. + if (LoopRegion->getEntryBasicBlock() == Parent) { + for (unsigned I = 0; I < getNumOperands(); ++I) { + if (getIncomingBlock(I) == + LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) + StartIdx = I; + } + } + Value *Op0 = State.get(getOperand(StartIdx), 0); + Type *VecTy = Op0->getType(); + Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); + State.set(this, VecPhi, 0); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-PHI "; + + auto *OriginalPhi = cast(getUnderlyingValue()); + // Unless all incoming values are modeled in VPlan print the original PHI + // directly. + // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming + // values as VPValues. + if (getNumOperands() != OriginalPhi->getNumOperands()) { + O << VPlanIngredient(OriginalPhi); + return; + } + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 9e19e172dea5..3a7e77fd9efd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -15,16 +15,10 @@ //===----------------------------------------------------------------------===// #include "VPlan.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" +#include "VPlanValue.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -32,12 +26,9 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include #include -#include #include using namespace llvm; @@ -396,7 +387,7 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { return markFailed(); assert(getOpcode(Values) && "Opcodes for all values must match"); - unsigned ValuesOpcode = getOpcode(Values).getValue(); + unsigned ValuesOpcode = *getOpcode(Values); SmallVector CombinedOperands; if (areCommutative(Values)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 70ce773a8a85..cca484e13bf1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -13,6 +13,8 @@ #include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/IVDescriptors.h" using namespace llvm; @@ -22,17 +24,15 @@ void VPlanTransforms::VPInstructionsToVPRecipes( GetIntOrFpInductionDescriptor, SmallPtrSetImpl &DeadInstructions, ScalarEvolution &SE) { - auto *TopRegion = 
cast(Plan->getEntry()); - ReversePostOrderTraversal RPOT(TopRegion->getEntry()); - - for (VPBlockBase *Base : RPOT) { - // Do not widen instructions in pre-header and exit blocks. - if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0) - continue; - - VPBasicBlock *VPBB = Base->getEntryBasicBlock(); + ReversePostOrderTraversal> + RPOT(Plan->getEntry()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + VPRecipeBase *Term = VPBB->getTerminator(); + auto EndIter = Term ? Term->getIterator() : VPBB->end(); // Introduce each ingredient into VPlan. - for (VPRecipeBase &Ingredient : llvm::make_early_inc_range(*VPBB)) { + for (VPRecipeBase &Ingredient : + make_early_inc_range(make_range(VPBB->begin(), EndIter))) { + VPValue *VPV = Ingredient.getVPSingleValue(); Instruction *Inst = cast(VPV->getUnderlyingValue()); if (DeadInstructions.count(Inst)) { @@ -47,8 +47,10 @@ void VPlanTransforms::VPInstructionsToVPRecipes( auto *Phi = cast(VPPhi->getUnderlyingValue()); if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); NewRecipe = - new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true); + new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -295,14 +297,19 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { } void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { - SmallVector> CastsToRemove; - for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *IV = dyn_cast(&Phi); if (!IV || IV->getTruncInst()) continue; - // Visit all casts connected to IV and in Casts. Collect them. - // remember them for removal. + // A sequence of IR Casts has potentially been recorded for IV, which + // *must be bypassed* when the IV is vectorized, because the vectorized IV + // will produce the desired casted value. This sequence forms a def-use + // chain and is provided in reverse order, ending with the cast that uses + // the IV phi. Search for the recipe of the last cast in the chain and + // replace it with the original IV. Note that only the final cast is + // expected to have users outside the cast-chain and the dead casts left + // over will be cleaned up later. auto &Casts = IV->getInductionDescriptor().getCastInsts(); VPValue *FindMyCast = IV; for (Instruction *IRCast : reverse(Casts)) { @@ -315,14 +322,9 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { break; } } - assert(FoundUserCast && "Missing a cast to remove"); - CastsToRemove.emplace_back(FoundUserCast, IV); FindMyCast = FoundUserCast->getVPSingleValue(); } - } - for (auto &E : CastsToRemove) { - E.first->getVPSingleValue()->replaceAllUsesWith(E.second); - E.first->eraseFromParent(); + FindMyCast->replaceAllUsesWith(IV); } } @@ -358,3 +360,73 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) { } } } + +void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { + ReversePostOrderTraversal> + RPOT(Plan.getEntry()); + + for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly(RPOT))) { + // The recipes in the block are processed in reverse order, to catch chains + // of dead recipes. 
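`removeDeadRecipes` walks each block bottom-up so that erasing a dead user exposes its now-dead operands within the same pass. The same idea on a hypothetical node/use-count structure (a sketch, not the real VPlan types):

```cpp
#include <cstdio>
#include <vector>

struct Node {
  const char *Name;
  bool HasSideEffects;
  std::vector<int> Operands; // indices of earlier nodes in the block
  int NumUsers = 0;
  bool Erased = false;
};

int main() {
  std::vector<Node> Block = {
      {"t0 = add", false, {}},     // used only by t1
      {"t1 = mul t0", false, {0}}, // dead: no users
      {"store", true, {}},         // has side effects, always kept
  };
  for (const Node &N : Block)
    for (int Op : N.Operands)
      ++Block[Op].NumUsers;

  // Reverse sweep: erasing t1 first drops t0's use count to zero, so the
  // whole chain dies in a single pass.
  for (int I = (int)Block.size() - 1; I >= 0; --I) {
    Node &N = Block[I];
    if (N.HasSideEffects || N.NumUsers > 0)
      continue;
    N.Erased = true;
    for (int Op : N.Operands)
      --Block[Op].NumUsers;
  }
  for (const Node &N : Block)
    printf("%-12s %s\n", N.Name, N.Erased ? "erased" : "kept");
  return 0;
}
```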
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) { + return V->getNumUsers() > 0; + })) + continue; + R.eraseFromParent(); + } + } +} + +void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { + SmallVector ToRemove; + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); + for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *IV = dyn_cast(&Phi); + if (!IV) + continue; + if (HasOnlyVectorVFs && + none_of(IV->users(), [IV](VPUser *U) { return U->usesScalars(IV); })) + continue; + + const InductionDescriptor &ID = IV->getInductionDescriptor(); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE); + Instruction *TruncI = IV->getTruncInst(); + VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe( + IV->getPHINode()->getType(), ID, Plan.getCanonicalIV(), + IV->getStartValue(), Step, TruncI ? TruncI->getType() : nullptr); + HeaderVPBB->insert(Steps, HeaderVPBB->getFirstNonPhi()); + + // Update scalar users of IV to use Step instead. Use SetVector to ensure + // the list of users doesn't contain duplicates. + SetVector Users(IV->user_begin(), IV->user_end()); + for (VPUser *U : Users) { + if (HasOnlyVectorVFs && !U->usesScalars(IV)) + continue; + for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) { + if (U->getOperand(I) != IV) + continue; + U->setOperand(I, Steps); + } + } + } +} + +void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) { + DenseMap SCEV2VPV; + + for (VPRecipeBase &R : + make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) { + auto *ExpR = dyn_cast(&R); + if (!ExpR) + continue; + + auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR}); + if (I.second) + continue; + ExpR->replaceAllUsesWith(I.first->second); + ExpR->eraseFromParent(); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e74409a86466..3372e255dff7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,8 +14,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/ADT/STLFunctionalExtras.h" namespace llvm { @@ -23,6 +22,7 @@ class InductionDescriptor; class Instruction; class PHINode; class ScalarEvolution; +class Loop; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding @@ -49,6 +49,18 @@ struct VPlanTransforms { /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV /// recipe, if it exists. static void removeRedundantCanonicalIVs(VPlan &Plan); + + static void removeDeadRecipes(VPlan &Plan); + + /// If any user of a VPWidenIntOrFpInductionRecipe needs scalar values, + /// provide them by building scalar steps off of the canonical scalar IV and + /// update the original IV's users. This is an optional optimization to reduce + /// the needs of vector extracts. + static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE); + + /// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing + /// them with already existing recipes expanding the same SCEV expression. 
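`removeRedundantExpandSCEVRecipes` above relies on the try-insert idiom: `DenseMap::insert` reports whether the key was new, and on a duplicate all uses are redirected to the first expansion. A sketch of the idiom with plain strings standing in for SCEV expressions (names invented):

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> Expands = {"(4 * %n)", "(%n + 1)", "(4 * %n)"};
  std::unordered_map<std::string, size_t> Seen;
  for (size_t I = 0; I < Expands.size(); ++I) {
    auto It = Seen.insert({Expands[I], I});
    if (It.second)
      continue; // first expansion of this expression; keep it
    // Duplicate: redirect users to the earlier recipe, then erase this one.
    printf("recipe %zu reuses recipe %zu for %s\n", I, It.first->second,
           Expands[I].c_str());
  }
  return 0;
}
```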
+ static void removeRedundantExpandSCEVRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5296d2b9485c..5fc676834331 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -106,6 +106,7 @@ public: VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, + VPVWidenPointerInductionSC, VPVPredInstPHI, VPVReductionPHISC, }; @@ -207,9 +208,7 @@ public: /// Subclass identifier (for isa/dyn_cast). enum class VPUserID { Recipe, - // TODO: Currently VPUsers are used in VPBlockBase, but in the future the - // only VPUsers should either be recipes or live-outs. - Block + LiveOut, }; private: @@ -286,6 +285,22 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPDef *Recipe); + + /// Returns true if the VPUser uses scalars of operand \p Op. Conservatively + /// returns if only first (scalar) lane is used, as default. + virtual bool usesScalars(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return onlyFirstLaneUsed(Op); + } + + /// Returns true if the VPUser only uses the first lane of operand \p Op. + /// Conservatively returns false. + virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return false; + } }; /// This class augments a recipe with a set of VPValues defined by the recipe. @@ -327,10 +342,12 @@ public: /// type identification. using VPRecipeTy = enum { VPBranchOnMaskSC, + VPExpandSCEVSC, VPInstructionSC, VPInterleaveSC, VPReductionSC, VPReplicateSC, + VPScalarIVStepsSC, VPWidenCallSC, VPWidenCanonicalIVSC, VPWidenGEPSC, @@ -344,6 +361,7 @@ public: VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, + VPWidenPointerInductionSC, VPPredInstPHISC, VPReductionPHISC, VPFirstPHISC = VPBlendSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index d36f250995e1..f917883145c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -43,17 +43,20 @@ static bool hasDuplicates(const SmallVectorImpl &VPBlockVec) { /// \p Region. Checks in this function are generic for VPBlockBases. They are /// not specific for VPBasicBlocks or VPRegionBlocks. static void verifyBlocksInRegion(const VPRegionBlock *Region) { - for (const VPBlockBase *VPB : - make_range(df_iterator::begin(Region->getEntry()), - df_iterator::end(Region->getExit()))) { + for (const VPBlockBase *VPB : make_range( + df_iterator::begin(Region->getEntry()), + df_iterator::end(Region->getExiting()))) { // Check block's parent. assert(VPB->getParent() == Region && "VPBlockBase has wrong parent"); + auto *VPBB = dyn_cast(VPB); // Check block's condition bit. - if (VPB->getNumSuccessors() > 1) - assert(VPB->getCondBit() && "Missing condition bit!"); + if (VPB->getNumSuccessors() > 1 || (VPBB && VPBB->isExiting())) + assert(VPBB && VPBB->getTerminator() && + "Block has multiple successors but doesn't " + "have a proper branch recipe!"); else - assert(!VPB->getCondBit() && "Unexpected condition bit!"); + assert((!VPBB || !VPBB->getTerminator()) && "Unexpected branch recipe!"); // Check block's successors. 
const auto &Successors = VPB->getSuccessors(); @@ -94,13 +97,14 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { /// VPBlockBases. Do not recurse inside nested VPRegionBlocks. static void verifyRegion(const VPRegionBlock *Region) { const VPBlockBase *Entry = Region->getEntry(); - const VPBlockBase *Exit = Region->getExit(); + const VPBlockBase *Exiting = Region->getExiting(); - // Entry and Exit shouldn't have any predecessor/successor, respectively. + // Entry and Exiting shouldn't have any predecessor/successor, respectively. assert(!Entry->getNumPredecessors() && "Region entry has predecessors."); - assert(!Exit->getNumSuccessors() && "Region exit has successors."); + assert(!Exiting->getNumSuccessors() && + "Region exiting block has successors."); (void)Entry; - (void)Exit; + (void)Exiting; verifyBlocksInRegion(Region); } @@ -111,9 +115,9 @@ static void verifyRegionRec(const VPRegionBlock *Region) { verifyRegion(Region); // Recurse inside nested regions. - for (const VPBlockBase *VPB : - make_range(df_iterator::begin(Region->getEntry()), - df_iterator::end(Region->getExit()))) { + for (const VPBlockBase *VPB : make_range( + df_iterator::begin(Region->getEntry()), + df_iterator::end(Region->getExiting()))) { if (const auto *SubRegion = dyn_cast(VPB)) verifyRegionRec(SubRegion); } @@ -157,7 +161,7 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } } - const VPRegionBlock *TopRegion = cast(Plan.getEntry()); + const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); const VPBasicBlock *Entry = dyn_cast(TopRegion->getEntry()); if (!Entry) { errs() << "VPlan entry block is not a VPBasicBlock\n"; @@ -170,19 +174,19 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { return false; } - const VPBasicBlock *Exit = dyn_cast(TopRegion->getExit()); - if (!Exit) { - errs() << "VPlan exit block is not a VPBasicBlock\n"; + const VPBasicBlock *Exiting = dyn_cast(TopRegion->getExiting()); + if (!Exiting) { + errs() << "VPlan exiting block is not a VPBasicBlock\n"; return false; } - if (Exit->empty()) { - errs() << "VPlan vector loop exit must end with BranchOnCount " + if (Exiting->empty()) { + errs() << "VPlan vector loop exiting block must end with BranchOnCount " "VPInstruction but is empty\n"; return false; } - auto *LastInst = dyn_cast(std::prev(Exit->end())); + auto *LastInst = dyn_cast(std::prev(Exiting->end())); if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { errs() << "VPlan vector loop exit must end with BranchOnCount " "VPInstruction\n"; @@ -197,10 +201,17 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { errs() << "region entry block has predecessors\n"; return false; } - if (Region->getExit()->getNumSuccessors() != 0) { - errs() << "region exit block has successors\n"; + if (Region->getExiting()->getNumSuccessors() != 0) { + errs() << "region exiting block has successors\n"; return false; } } + + for (auto &KV : Plan.getLiveOuts()) + if (KV.second->getNumOperands() != 1) { + errs() << "live outs must have a single operand\n"; + return false; + } + return true; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 620d388199e0..90598937affc 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -103,11 +103,13 @@ private: bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); bool foldShuffleOfBinops(Instruction &I); + bool 
foldShuffleFromReductions(Instruction &I); + bool foldSelectShuffle(Instruction &I, bool FromReduction = false); void replaceValue(Value &Old, Value &New) { Old.replaceAllUsesWith(&New); - New.takeName(&Old); if (auto *NewI = dyn_cast(&New)) { + New.takeName(&Old); Worklist.pushUsersToWorkList(*NewI); Worklist.pushValue(NewI); } @@ -152,12 +154,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); assert(isa(SrcPtr->getType()) && "Expected a pointer type"); - // If original AS != Load's AS, we can't bitcast the original pointer and have - // to use Load's operand instead. Ideally we would want to strip pointer casts - // without changing AS, but there's no API to do that ATM. unsigned AS = Load->getPointerAddressSpace(); - if (AS != SrcPtr->getType()->getPointerAddressSpace()) - SrcPtr = Load->getPointerOperand(); // We are potentially transforming byte-sized (8-bit) memory accesses, so make // sure we have all of our type-based constraints in place for this target. @@ -245,7 +242,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // It is safe and potentially profitable to load a vector directly: // inselt undef, load Scalar, 0 --> load VecPtr IRBuilder<> Builder(Load); - Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); + Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + SrcPtr, MinVecTy->getPointerTo(AS)); Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); VecLd = Builder.CreateShuffleVector(VecLd, Mask); @@ -259,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { ExtractElementInst *VectorCombine::getShuffleExtract( ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex = InvalidIndex) const { - assert(isa(Ext0->getIndexOperand()) && - isa(Ext1->getIndexOperand()) && - "Expected constant extract indexes"); + auto *Index0C = dyn_cast(Ext0->getIndexOperand()); + auto *Index1C = dyn_cast(Ext1->getIndexOperand()); + assert(Index0C && Index1C && "Expected constant extract indexes"); - unsigned Index0 = cast(Ext0->getIndexOperand())->getZExtValue(); - unsigned Index1 = cast(Ext1->getIndexOperand())->getZExtValue(); + unsigned Index0 = Index0C->getZExtValue(); + unsigned Index1 = Index1C->getZExtValue(); // If the extract indexes are identical, no shuffle is needed. if (Index0 == Index1) @@ -310,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex) { - assert(isa(Ext0->getOperand(1)) && - isa(Ext1->getOperand(1)) && - "Expected constant extract indexes"); + auto *Ext0IndexC = dyn_cast(Ext0->getOperand(1)); + auto *Ext1IndexC = dyn_cast(Ext1->getOperand(1)); + assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); + unsigned Opcode = I.getOpcode(); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast(Ext0->getOperand(0)->getType()); @@ -335,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // Get cost estimates for the extract elements. These costs will factor into // both sequences. 
- unsigned Ext0Index = cast(Ext0->getOperand(1))->getZExtValue(); - unsigned Ext1Index = cast(Ext1->getOperand(1))->getZExtValue(); + unsigned Ext0Index = Ext0IndexC->getZExtValue(); + unsigned Ext1Index = Ext1IndexC->getZExtValue(); InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); @@ -698,8 +697,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { ScalarInst->copyIRFlags(&I); // Fold the vector constants in the original vectors into a new base vector. - Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1) - : ConstantExpr::get(Opcode, VecC0, VecC1); + Value *NewVecC = + IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) + : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); replaceValue(I, *Insert); return true; @@ -1019,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return false; NumInstChecked++; } - } - - if (!LastCheckedInst) - LastCheckedInst = UI; - else if (LastCheckedInst->comesBefore(UI)) LastCheckedInst = UI; + } auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); if (!ScalarIdx.isSafe()) { @@ -1121,6 +1117,339 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { return true; } +/// Given a commutative reduction, the order of the input lanes does not alter +/// the results. We can use this to remove certain shuffles feeding the +/// reduction, removing the need to shuffle at all. +bool VectorCombine::foldShuffleFromReductions(Instruction &I) { + auto *II = dyn_cast(&I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + break; + default: + return false; + } + + // Find all the inputs when looking through operations that do not alter the + // lane order (binops, for example). Currently we look for a single shuffle, + // and can ignore splat values. + std::queue Worklist; + SmallPtrSet Visited; + ShuffleVectorInst *Shuffle = nullptr; + if (auto *Op = dyn_cast(I.getOperand(0))) + Worklist.push(Op); + + while (!Worklist.empty()) { + Value *CV = Worklist.front(); + Worklist.pop(); + if (Visited.contains(CV)) + continue; + + // Splats don't change the order, so can be safely ignored. + if (isSplatValue(CV)) + continue; + + Visited.insert(CV); + + if (auto *CI = dyn_cast(CV)) { + if (CI->isBinaryOp()) { + for (auto *Op : CI->operand_values()) + Worklist.push(Op); + continue; + } else if (auto *SV = dyn_cast(CI)) { + if (Shuffle && Shuffle != SV) + return false; + Shuffle = SV; + continue; + } + } + + // Anything else is currently an unknown node. + return false; + } + + if (!Shuffle) + return false; + + // Check all uses of the binary ops and shuffles are also included in the + // lane-invariant operations (Visited should be the list of lanewise + // instructions, including the shuffle that we found). 
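Because the reductions handled above are commutative, lane order cannot change the result, so the shuffle feeding the reduction may be re-sorted toward a cheaper identity/concat mask. The unsigned comparison in the sort pushes undef lanes (-1) past every real index. A standalone example with an invented mask:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> ConcatMask = {3, 1, -1, 0, 2, -1};
  // (unsigned)-1 is the largest value, so undef lanes sort to the end.
  std::sort(ConcatMask.begin(), ConcatMask.end(),
            [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
  for (int M : ConcatMask)
    printf("%d ", M); // prints: 0 1 2 3 -1 -1
  printf("\n");
  return 0;
}
```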
+ for (auto *V : Visited) + for (auto *U : V->users()) + if (!Visited.contains(U) && U != &I) + return false; + + FixedVectorType *VecType = + dyn_cast(II->getOperand(0)->getType()); + if (!VecType) + return false; + FixedVectorType *ShuffleInputType = + dyn_cast(Shuffle->getOperand(0)->getType()); + if (!ShuffleInputType) + return false; + int NumInputElts = ShuffleInputType->getNumElements(); + + // Find the mask from sorting the lanes into order. This is most likely to + // become a identity or concat mask. Undef elements are pushed to the end. + SmallVector ConcatMask; + Shuffle->getShuffleMask(ConcatMask); + sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); + bool UsesSecondVec = + any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); + InstructionCost OldCost = TTI.getShuffleCost( + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, + Shuffle->getShuffleMask()); + InstructionCost NewCost = TTI.getShuffleCost( + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, + ConcatMask); + + LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle + << "\n"); + LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost < OldCost) { + Builder.SetInsertPoint(Shuffle); + Value *NewShuffle = Builder.CreateShuffleVector( + Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); + LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); + replaceValue(*Shuffle, *NewShuffle); + } + + // See if we can re-use foldSelectShuffle, getting it to reduce the size of + // the shuffle into a nicer order, as it can ignore the order of the shuffles. + return foldSelectShuffle(*Shuffle, true); +} + +/// This method looks for groups of shuffles acting on binops, of the form: +/// %x = shuffle ... +/// %y = shuffle ... +/// %a = binop %x, %y +/// %b = binop %x, %y +/// shuffle %a, %b, selectmask +/// We may, especially if the shuffle is wider than legal, be able to convert +/// the shuffle to a form where only parts of a and b need to be computed. On +/// architectures with no obvious "select" shuffle, this can reduce the total +/// number of operations if the target reports them as cheaper. +bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { + auto *SVI = dyn_cast(&I); + auto *VT = dyn_cast(I.getType()); + if (!SVI || !VT) + return false; + auto *Op0 = dyn_cast(SVI->getOperand(0)); + auto *Op1 = dyn_cast(SVI->getOperand(1)); + if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || + VT != Op0->getType()) + return false; + auto *SVI0A = dyn_cast(Op0->getOperand(0)); + auto *SVI0B = dyn_cast(Op0->getOperand(1)); + auto *SVI1A = dyn_cast(Op1->getOperand(0)); + auto *SVI1B = dyn_cast(Op1->getOperand(1)); + auto checkSVNonOpUses = [&](Instruction *I) { + if (!I || I->getOperand(0)->getType() != VT) + return true; + return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + }; + if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || + checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) + return false; + + // Collect all the uses that are shuffles that we can transform together. We + // may not have a single shuffle, but a group that can all be transformed + // together profitably. 
+ SmallVector Shuffles; + auto collectShuffles = [&](Instruction *I) { + for (auto *U : I->users()) { + auto *SV = dyn_cast(U); + if (!SV || SV->getType() != VT) + return false; + if (!llvm::is_contained(Shuffles, SV)) + Shuffles.push_back(SV); + } + return true; + }; + if (!collectShuffles(Op0) || !collectShuffles(Op1)) + return false; + // From a reduction, we need to be processing a single shuffle, otherwise the + // other uses will not be lane-invariant. + if (FromReduction && Shuffles.size() > 1) + return false; + + // For each of the output shuffles, we try to sort all the first vector + // elements to the beginning, followed by the second array elements at the + // end. If the binops are legalized to smaller vectors, this may reduce total + // number of binops. We compute the ReconstructMask mask needed to convert + // back to the original lane order. + SmallVector V1, V2; + SmallVector> ReconstructMasks; + int MaxV1Elt = 0, MaxV2Elt = 0; + unsigned NumElts = VT->getNumElements(); + for (ShuffleVectorInst *SVN : Shuffles) { + SmallVector Mask; + SVN->getShuffleMask(Mask); + + // Check the operands are the same as the original, or reversed (in which + // case we need to commute the mask). + Value *SVOp0 = SVN->getOperand(0); + Value *SVOp1 = SVN->getOperand(1); + if (SVOp0 == Op1 && SVOp1 == Op0) { + std::swap(SVOp0, SVOp1); + ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); + } + if (SVOp0 != Op0 || SVOp1 != Op1) + return false; + + // Calculate the reconstruction mask for this shuffle, as the mask needed to + // take the packed values from Op0/Op1 and reconstructing to the original + // order. + SmallVector ReconstructMask; + for (unsigned I = 0; I < Mask.size(); I++) { + if (Mask[I] < 0) { + ReconstructMask.push_back(-1); + } else if (Mask[I] < static_cast(NumElts)) { + MaxV1Elt = std::max(MaxV1Elt, Mask[I]); + auto It = find(V1, Mask[I]); + if (It != V1.end()) + ReconstructMask.push_back(It - V1.begin()); + else { + ReconstructMask.push_back(V1.size()); + V1.push_back(Mask[I]); + } + } else { + MaxV2Elt = std::max(MaxV2Elt, Mask[I] - NumElts); + auto It = find(V2, Mask[I] - NumElts); + if (It != V2.end()) + ReconstructMask.push_back(NumElts + It - V2.begin()); + else { + ReconstructMask.push_back(NumElts + V2.size()); + V2.push_back(Mask[I] - NumElts); + } + } + } + + // For reductions, we know that the lane ordering out doesn't alter the + // result. In-order can help simplify the shuffle away. + if (FromReduction) + sort(ReconstructMask); + ReconstructMasks.push_back(ReconstructMask); + } + + // If the Maximum element used from V1 and V2 are not larger than the new + // vectors, the vectors are already packes and performing the optimization + // again will likely not help any further. This also prevents us from getting + // stuck in a cycle in case the costs do not also rule it out. 
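The reconstruction masks computed above pack each used lane of the two inputs exactly once and record where the packed copy landed, so the original lane order can be rebuilt afterwards. The same computation on a toy mask (values invented for illustration):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {2, 2, 5, -1}; // picks from two 4-wide inputs
  std::vector<int> V1, V2, Reconstruct;
  for (int M : Mask) {
    if (M < 0) {
      Reconstruct.push_back(-1);
    } else if (M < NumElts) {            // element of the first input
      auto It = std::find(V1.begin(), V1.end(), M);
      if (It != V1.end())
        Reconstruct.push_back(It - V1.begin());
      else {
        Reconstruct.push_back(V1.size());
        V1.push_back(M);
      }
    } else {                             // element of the second input
      auto It = std::find(V2.begin(), V2.end(), M - NumElts);
      if (It != V2.end())
        Reconstruct.push_back(NumElts + (It - V2.begin()));
      else {
        Reconstruct.push_back(NumElts + V2.size());
        V2.push_back(M - NumElts);
      }
    }
  }
  // Here: V1 = {2}, V2 = {1}, Reconstruct = {0, 0, 4, -1}.
  printf("packed V1=%zu lanes, V2=%zu lanes; reconstruct: ", V1.size(),
         V2.size());
  for (int R : Reconstruct)
    printf("%d ", R);
  printf("\n");
  return 0;
}
```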
+ if (V1.empty() || V2.empty() || + (MaxV1Elt == static_cast(V1.size()) - 1 && + MaxV2Elt == static_cast(V2.size()) - 1)) + return false; + + // Calculate the masks needed for the new input shuffles, which get padded + // with undef + SmallVector V1A, V1B, V2A, V2B; + for (unsigned I = 0; I < V1.size(); I++) { + V1A.push_back(SVI0A->getMaskValue(V1[I])); + V1B.push_back(SVI0B->getMaskValue(V1[I])); + } + for (unsigned I = 0; I < V2.size(); I++) { + V2A.push_back(SVI1A->getMaskValue(V2[I])); + V2B.push_back(SVI1B->getMaskValue(V2[I])); + } + while (V1A.size() < NumElts) { + V1A.push_back(UndefMaskElem); + V1B.push_back(UndefMaskElem); + } + while (V2A.size() < NumElts) { + V2A.push_back(UndefMaskElem); + V2B.push_back(UndefMaskElem); + } + + auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { + return C + + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + }; + auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef Mask) { + return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); + }; + + // Get the costs of the shuffles + binops before and after with the new + // shuffle masks. + InstructionCost CostBefore = + TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); + CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), + InstructionCost(0), AddShuffleCost); + // This set helps us only cost each unique shuffle once. + SmallPtrSet InputShuffles( + {SVI0A, SVI0B, SVI1A, SVI1B}); + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleCost); + + // The new binops will be unused for lanes past the used shuffle lengths. + // These types attempt to get the correct cost for that from the target. + FixedVectorType *Op0SmallVT = + FixedVectorType::get(VT->getScalarType(), V1.size()); + FixedVectorType *Op1SmallVT = + FixedVectorType::get(VT->getScalarType(), V2.size()); + InstructionCost CostAfter = + TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); + CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), + InstructionCost(0), AddShuffleMaskCost); + std::set> OutputShuffleMasks({V1A, V1B, V2A, V2B}); + CostAfter += + std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), + InstructionCost(0), AddShuffleMaskCost); + + if (CostBefore <= CostAfter) + return false; + + // The cost model has passed, create the new instructions. 
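The gate above (`CostBefore <= CostAfter` returns early) fires the rewrite only when the re-shuffled form is strictly cheaper. Reduced to its shape, with placeholder numbers standing in for real TTI costs:

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> OldCosts = {4, 4, 2, 2};    // wide binops + shuffles
  std::vector<int> NewCosts = {2, 2, 2, 2, 1}; // narrow binops + new shuffles
  int Before = std::accumulate(OldCosts.begin(), OldCosts.end(), 0);
  int After = std::accumulate(NewCosts.begin(), NewCosts.end(), 0);
  printf(After < Before ? "transform\n" : "keep original\n");
  return 0;
}
```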
+ Builder.SetInsertPoint(SVI0A); + Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), + SVI0A->getOperand(1), V1A); + Builder.SetInsertPoint(SVI0B); + Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), + SVI0B->getOperand(1), V1B); + Builder.SetInsertPoint(SVI1A); + Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), + SVI1A->getOperand(1), V2A); + Builder.SetInsertPoint(SVI1B); + Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), + SVI1B->getOperand(1), V2B); + Builder.SetInsertPoint(Op0); + Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), + NSV0A, NSV0B); + if (auto *I = dyn_cast(NOp0)) + I->copyIRFlags(Op0, true); + Builder.SetInsertPoint(Op1); + Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), + NSV1A, NSV1B); + if (auto *I = dyn_cast(NOp1)) + I->copyIRFlags(Op1, true); + + for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { + Builder.SetInsertPoint(Shuffles[S]); + Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); + replaceValue(*Shuffles[S], *NSV); + } + + Worklist.pushValue(NSV0A); + Worklist.pushValue(NSV0B); + Worklist.pushValue(NSV1A); + Worklist.pushValue(NSV1B); + for (auto *S : Shuffles) + Worklist.add(S); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -1140,6 +1469,8 @@ bool VectorCombine::run() { MadeChange |= foldBitcastShuf(I); MadeChange |= foldExtractedCmps(I); MadeChange |= foldShuffleOfBinops(I); + MadeChange |= foldShuffleFromReductions(I); + MadeChange |= foldSelectShuffle(I); } MadeChange |= scalarizeBinopOrCmp(I); MadeChange |= scalarizeLoadExtract(I); diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp index 010ca28fc237..208e5eeea864 100644 --- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Vectorize.h" -#include "llvm/Analysis/Passes.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp new file mode 100644 index 000000000000..0661ed7c6ae1 --- /dev/null +++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp @@ -0,0 +1,719 @@ +//===-- MSVCPaths.cpp - MSVC path-parsing helpers -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/WindowsDriver/MSVCPaths.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/VirtualFileSystem.h" +#include + +#ifdef _WIN32 +#include "llvm/Support/ConvertUTF.h" +#endif + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#define NOGDI +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#ifdef _MSC_VER +// Don't support SetupApi on MinGW. +#define USE_MSVC_SETUP_API + +// Make sure this comes before MSVCSetupApi.h +#include + +#include "llvm/Support/COM.h" +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" +#endif +#include "llvm/WindowsDriver/MSVCSetupApi.h" +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +_COM_SMARTPTR_TYPEDEF(ISetupConfiguration, __uuidof(ISetupConfiguration)); +_COM_SMARTPTR_TYPEDEF(ISetupConfiguration2, __uuidof(ISetupConfiguration2)); +_COM_SMARTPTR_TYPEDEF(ISetupHelper, __uuidof(ISetupHelper)); +_COM_SMARTPTR_TYPEDEF(IEnumSetupInstances, __uuidof(IEnumSetupInstances)); +_COM_SMARTPTR_TYPEDEF(ISetupInstance, __uuidof(ISetupInstance)); +_COM_SMARTPTR_TYPEDEF(ISetupInstance2, __uuidof(ISetupInstance2)); +#endif + +static std::string +getHighestNumericTupleInDirectory(llvm::vfs::FileSystem &VFS, + llvm::StringRef Directory) { + std::string Highest; + llvm::VersionTuple HighestTuple; + + std::error_code EC; + for (llvm::vfs::directory_iterator DirIt = VFS.dir_begin(Directory, EC), + DirEnd; + !EC && DirIt != DirEnd; DirIt.increment(EC)) { + auto Status = VFS.status(DirIt->path()); + if (!Status || !Status->isDirectory()) + continue; + llvm::StringRef CandidateName = llvm::sys::path::filename(DirIt->path()); + llvm::VersionTuple Tuple; + if (Tuple.tryParse(CandidateName)) // tryParse() returns true on error. + continue; + if (Tuple > HighestTuple) { + HighestTuple = Tuple; + Highest = CandidateName.str(); + } + } + + return Highest; +} + +static bool getWindows10SDKVersionFromPath(llvm::vfs::FileSystem &VFS, + const std::string &SDKPath, + std::string &SDKVersion) { + llvm::SmallString<128> IncludePath(SDKPath); + llvm::sys::path::append(IncludePath, "Include"); + SDKVersion = getHighestNumericTupleInDirectory(VFS, IncludePath); + return !SDKVersion.empty(); +} + +static bool getWindowsSDKDirViaCommandLine( + llvm::vfs::FileSystem &VFS, llvm::Optional WinSdkDir, + llvm::Optional WinSdkVersion, + llvm::Optional WinSysRoot, std::string &Path, int &Major, + std::string &Version) { + if (WinSdkDir || WinSysRoot) { + // Don't validate the input; trust the value supplied by the user. + // The motivation is to prevent unnecessary file and registry access. 
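`getHighestNumericTupleInDirectory` above keeps the directory whose name parses as the largest version tuple. A self-contained equivalent, with the VFS walk replaced by a fixed list of dotted version strings (the names are illustrative, and well-formed input is assumed for `stoi`):

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

static std::vector<int> parseTuple(const std::string &Name) {
  std::vector<int> Parts;
  std::istringstream SS(Name);
  std::string Piece;
  while (std::getline(SS, Piece, '.'))
    Parts.push_back(std::stoi(Piece)); // assumes well-formed dotted names
  return Parts;
}

int main() {
  std::vector<std::string> Dirs = {"10.0.17763.0", "10.0.19041.0",
                                   "10.0.18362.0"};
  std::string Highest;
  std::vector<int> HighestTuple;
  for (const auto &D : Dirs) {
    auto T = parseTuple(D);
    if (T > HighestTuple) { // lexicographic compare, like VersionTuple
      HighestTuple = T;
      Highest = D;
    }
  }
  printf("highest SDK version: %s\n", Highest.c_str()); // 10.0.19041.0
  return 0;
}
```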
+    llvm::VersionTuple SDKVersion;
+    if (WinSdkVersion)
+      SDKVersion.tryParse(*WinSdkVersion);
+
+    if (WinSysRoot) {
+      llvm::SmallString<128> SDKPath(*WinSysRoot);
+      llvm::sys::path::append(SDKPath, "Windows Kits");
+      if (!SDKVersion.empty())
+        llvm::sys::path::append(SDKPath, llvm::Twine(SDKVersion.getMajor()));
+      else
+        llvm::sys::path::append(
+            SDKPath, getHighestNumericTupleInDirectory(VFS, SDKPath));
+      Path = std::string(SDKPath.str());
+    } else {
+      Path = WinSdkDir->str();
+    }
+
+    if (!SDKVersion.empty()) {
+      Major = SDKVersion.getMajor();
+      Version = SDKVersion.getAsString();
+    } else if (getWindows10SDKVersionFromPath(VFS, Path, Version)) {
+      Major = 10;
+    }
+    return true;
+  }
+  return false;
+}
+
+#ifdef _WIN32
+static bool readFullStringValue(HKEY hkey, const char *valueName,
+                                std::string &value) {
+  std::wstring WideValueName;
+  if (!llvm::ConvertUTF8toWide(valueName, WideValueName))
+    return false;
+
+  DWORD result = 0;
+  DWORD valueSize = 0;
+  DWORD type = 0;
+  // First just query for the required size.
+  result = RegQueryValueExW(hkey, WideValueName.c_str(), NULL, &type, NULL,
+                            &valueSize);
+  if (result != ERROR_SUCCESS || type != REG_SZ || !valueSize)
+    return false;
+  std::vector<BYTE> buffer(valueSize);
+  result = RegQueryValueExW(hkey, WideValueName.c_str(), NULL, NULL, &buffer[0],
+                            &valueSize);
+  if (result == ERROR_SUCCESS) {
+    std::wstring WideValue(reinterpret_cast<const wchar_t *>(buffer.data()),
+                           valueSize / sizeof(wchar_t));
+    if (valueSize && WideValue.back() == L'\0') {
+      WideValue.pop_back();
+    }
+    // The destination buffer must be empty as an invariant of the conversion
+    // function, but this function is sometimes called in a loop that reuses
+    // the same buffer. Simply clear it out so we can overwrite it.
+    value.clear();
+    return llvm::convertWideToUTF8(WideValue, value);
+  }
+  return false;
+}
+#endif
+
+/// Read registry string.
+/// This also supports a means to look for high-versioned keys by use
+/// of a $VERSION placeholder in the key path.
+/// $VERSION in the key path is a placeholder for the version number,
+/// causing the highest value path to be searched for and used.
+/// I.e. "SOFTWARE\\Microsoft\\VisualStudio\\$VERSION".
+/// There can be additional characters in the component. Only the numeric
+/// characters are compared. This function only searches HKLM.
+static bool getSystemRegistryString(const char *keyPath, const char *valueName,
+                                    std::string &value, std::string *phValue) {
+#ifndef _WIN32
+  return false;
+#else
+  HKEY hRootKey = HKEY_LOCAL_MACHINE;
+  HKEY hKey = NULL;
+  long lResult;
+  bool returnValue = false;
+
+  const char *placeHolder = strstr(keyPath, "$VERSION");
+  std::string bestName;
+  // If we have a $VERSION placeholder, do the highest-version search.
+  if (placeHolder) {
+    const char *keyEnd = placeHolder - 1;
+    const char *nextKey = placeHolder;
+    // Find end of previous key.
+    while ((keyEnd > keyPath) && (*keyEnd != '\\'))
+      keyEnd--;
+    // Find end of key containing $VERSION.
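+    // For example, for "SOFTWARE\\Microsoft\\VisualStudio\\$VERSION", the
+    // partial key is "SOFTWARE\\Microsoft\\VisualStudio"; its numeric
+    // subkeys (e.g. "7.1", "14.0") are enumerated below and the highest one
+    // that yields the requested value wins.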
+    while (*nextKey && (*nextKey != '\\'))
+      nextKey++;
+    size_t partialKeyLength = keyEnd - keyPath;
+    char partialKey[256];
+    if (partialKeyLength >= sizeof(partialKey))
+      partialKeyLength = sizeof(partialKey) - 1;
+    strncpy(partialKey, keyPath, partialKeyLength);
+    partialKey[partialKeyLength] = '\0';
+    HKEY hTopKey = NULL;
+    lResult = RegOpenKeyExA(hRootKey, partialKey, 0,
+                            KEY_READ | KEY_WOW64_32KEY, &hTopKey);
+    if (lResult == ERROR_SUCCESS) {
+      char keyName[256];
+      double bestValue = 0.0;
+      DWORD index, size = sizeof(keyName) - 1;
+      for (index = 0; RegEnumKeyExA(hTopKey, index, keyName, &size, NULL, NULL,
+                                    NULL, NULL) == ERROR_SUCCESS;
+           index++) {
+        const char *sp = keyName;
+        while (*sp && !llvm::isDigit(*sp))
+          sp++;
+        if (!*sp)
+          continue;
+        const char *ep = sp + 1;
+        while (*ep && (llvm::isDigit(*ep) || (*ep == '.')))
+          ep++;
+        char numBuf[32];
+        strncpy(numBuf, sp, sizeof(numBuf) - 1);
+        numBuf[sizeof(numBuf) - 1] = '\0';
+        double dvalue = strtod(numBuf, NULL);
+        if (dvalue > bestValue) {
+          // Test that InstallDir is indeed there before keeping this index.
+          // Open the chosen key path remainder.
+          bestName = keyName;
+          // Append rest of key.
+          bestName.append(nextKey);
+          lResult = RegOpenKeyExA(hTopKey, bestName.c_str(), 0,
+                                  KEY_READ | KEY_WOW64_32KEY, &hKey);
+          if (lResult == ERROR_SUCCESS) {
+            if (readFullStringValue(hKey, valueName, value)) {
+              bestValue = dvalue;
+              if (phValue)
+                *phValue = bestName;
+              returnValue = true;
+            }
+            RegCloseKey(hKey);
+          }
+        }
+        size = sizeof(keyName) - 1;
+      }
+      RegCloseKey(hTopKey);
+    }
+  } else {
+    lResult =
+        RegOpenKeyExA(hRootKey, keyPath, 0, KEY_READ | KEY_WOW64_32KEY, &hKey);
+    if (lResult == ERROR_SUCCESS) {
+      if (readFullStringValue(hKey, valueName, value))
+        returnValue = true;
+      if (phValue)
+        phValue->clear();
+      RegCloseKey(hKey);
+    }
+  }
+  return returnValue;
+#endif // _WIN32
+}
+
+namespace llvm {
+
+const char *archToWindowsSDKArch(Triple::ArchType Arch) {
+  switch (Arch) {
+  case Triple::ArchType::x86:
+    return "x86";
+  case Triple::ArchType::x86_64:
+    return "x64";
+  case Triple::ArchType::arm:
+    return "arm";
+  case Triple::ArchType::aarch64:
+    return "arm64";
+  default:
+    return "";
+  }
+}
+
+const char *archToLegacyVCArch(Triple::ArchType Arch) {
+  switch (Arch) {
+  case Triple::ArchType::x86:
+    // x86 is default in legacy VC toolchains.
+    // e.g. x86 libs are directly in /lib as opposed to /lib/x86.
+    return "";
+  case Triple::ArchType::x86_64:
+    return "amd64";
+  case Triple::ArchType::arm:
+    return "arm";
+  case Triple::ArchType::aarch64:
+    return "arm64";
+  default:
+    return "";
+  }
+}
+
+const char *archToDevDivInternalArch(Triple::ArchType Arch) {
+  switch (Arch) {
+  case Triple::ArchType::x86:
+    return "i386";
+  case Triple::ArchType::x86_64:
+    return "amd64";
+  case Triple::ArchType::arm:
+    return "arm";
+  case Triple::ArchType::aarch64:
+    return "arm64";
+  default:
+    return "";
+  }
+}
+
+bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath,
+                                   Triple::ArchType Arch, std::string &path) {
+  if (SDKMajor >= 8) {
+    sys::path::append(LibPath, archToWindowsSDKArch(Arch));
+  } else {
+    switch (Arch) {
+    // In Windows SDK 7.x, x86 libraries are directly in the Lib folder.
+    case Triple::x86:
+      break;
+    case Triple::x86_64:
+      sys::path::append(LibPath, "x64");
+      break;
+    case Triple::arm:
+      // It is not necessary to link against Windows SDK 7.x when targeting ARM.
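+      // Windows SDK 7.x ships no ARM libraries, so the return below simply
+      // reports this configuration as unsupported to the caller.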
+      return false;
+    default:
+      return false;
+    }
+  }
+
+  path = std::string(LibPath.str());
+  return true;
+}
+
+std::string getSubDirectoryPath(SubDirectoryType Type, ToolsetLayout VSLayout,
+                                const std::string &VCToolChainPath,
+                                Triple::ArchType TargetArch,
+                                StringRef SubdirParent) {
+  const char *SubdirName;
+  const char *IncludeName;
+  switch (VSLayout) {
+  case ToolsetLayout::OlderVS:
+    SubdirName = archToLegacyVCArch(TargetArch);
+    IncludeName = "include";
+    break;
+  case ToolsetLayout::VS2017OrNewer:
+    SubdirName = archToWindowsSDKArch(TargetArch);
+    IncludeName = "include";
+    break;
+  case ToolsetLayout::DevDivInternal:
+    SubdirName = archToDevDivInternalArch(TargetArch);
+    IncludeName = "inc";
+    break;
+  }
+
+  SmallString<256> Path(VCToolChainPath);
+  if (!SubdirParent.empty())
+    sys::path::append(Path, SubdirParent);
+
+  switch (Type) {
+  case SubDirectoryType::Bin:
+    if (VSLayout == ToolsetLayout::VS2017OrNewer) {
+      // MSVC ships with two linkers: a 32-bit x86 and 64-bit x86 linker.
+      // On x86, pick the linker that corresponds to the current process.
+      // On ARM64, pick the 32-bit x86 linker; the 64-bit one doesn't run
+      // on Windows 10.
+      //
+      // FIXME: Consider using IsWow64GuestMachineSupported to figure out
+      // if we can invoke the 64-bit linker. It's generally preferable
+      // because it won't run out of address-space.
+      const bool HostIsX64 =
+          Triple(sys::getProcessTriple()).getArch() == Triple::x86_64;
+      const char *const HostName = HostIsX64 ? "Hostx64" : "Hostx86";
+      sys::path::append(Path, "bin", HostName, SubdirName);
+    } else { // OlderVS or DevDivInternal
+      sys::path::append(Path, "bin", SubdirName);
+    }
+    break;
+  case SubDirectoryType::Include:
+    sys::path::append(Path, IncludeName);
+    break;
+  case SubDirectoryType::Lib:
+    sys::path::append(Path, "lib", SubdirName);
+    break;
+  }
+  return std::string(Path.str());
+}
+
+bool useUniversalCRT(ToolsetLayout VSLayout, const std::string &VCToolChainPath,
+                     Triple::ArchType TargetArch, vfs::FileSystem &VFS) {
+  SmallString<128> TestPath(getSubDirectoryPath(
+      SubDirectoryType::Include, VSLayout, VCToolChainPath, TargetArch));
+  sys::path::append(TestPath, "stdlib.h");
+  return !VFS.exists(TestPath);
+}
+
+bool getWindowsSDKDir(vfs::FileSystem &VFS, Optional<StringRef> WinSdkDir,
+                      Optional<StringRef> WinSdkVersion,
+                      Optional<StringRef> WinSysRoot, std::string &Path,
+                      int &Major, std::string &WindowsSDKIncludeVersion,
+                      std::string &WindowsSDKLibVersion) {
+  // Trust /winsdkdir and /winsdkversion if present.
+  if (getWindowsSDKDirViaCommandLine(VFS, WinSdkDir, WinSdkVersion, WinSysRoot,
+                                     Path, Major, WindowsSDKIncludeVersion)) {
+    WindowsSDKLibVersion = WindowsSDKIncludeVersion;
+    return true;
+  }
+
+  // FIXME: Try env vars (%WindowsSdkDir%, %UCRTVersion%) before going to
+  // registry.
+
+  // Try the Windows registry.
+  std::string RegistrySDKVersion;
+  if (!getSystemRegistryString(
+          "SOFTWARE\\Microsoft\\Microsoft SDKs\\Windows\\$VERSION",
+          "InstallationFolder", Path, &RegistrySDKVersion))
+    return false;
+  if (Path.empty() || RegistrySDKVersion.empty())
+    return false;
+
+  WindowsSDKIncludeVersion.clear();
+  WindowsSDKLibVersion.clear();
+  Major = 0;
+  std::sscanf(RegistrySDKVersion.c_str(), "v%d.", &Major);
+  if (Major <= 7)
+    return true;
+  if (Major == 8) {
+    // Windows SDK 8.x installs libraries in a folder whose names depend on the
+    // version of the OS you're targeting. By default choose the newest, which
+    // usually corresponds to the version of the OS you've installed the SDK on.
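+    // "winv6.3" is the folder targeting Windows 8.1, "win8" targets 8.0, so
+    // the search below runs newest-first and stops at the first Lib subfolder
+    // that exists.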
+    const char *Tests[] = {"winv6.3", "win8", "win7"};
+    for (const char *Test : Tests) {
+      SmallString<128> TestPath(Path);
+      sys::path::append(TestPath, "Lib", Test);
+      if (VFS.exists(TestPath)) {
+        WindowsSDKLibVersion = Test;
+        break;
+      }
+    }
+    return !WindowsSDKLibVersion.empty();
+  }
+  if (Major == 10) {
+    if (!getWindows10SDKVersionFromPath(VFS, Path, WindowsSDKIncludeVersion))
+      return false;
+    WindowsSDKLibVersion = WindowsSDKIncludeVersion;
+    return true;
+  }
+  // Unsupported SDK version
+  return false;
+}
+
+bool getUniversalCRTSdkDir(vfs::FileSystem &VFS, Optional<StringRef> WinSdkDir,
+                           Optional<StringRef> WinSdkVersion,
+                           Optional<StringRef> WinSysRoot, std::string &Path,
+                           std::string &UCRTVersion) {
+  // If /winsdkdir is passed, use it as location for the UCRT too.
+  // FIXME: Should there be a dedicated /ucrtdir to override /winsdkdir?
+  int Major;
+  if (getWindowsSDKDirViaCommandLine(VFS, WinSdkDir, WinSdkVersion, WinSysRoot,
+                                     Path, Major, UCRTVersion))
+    return true;
+
+  // FIXME: Try env vars (%UniversalCRTSdkDir%, %UCRTVersion%) before going to
+  // registry.
+
+  // vcvarsqueryregistry.bat for Visual Studio 2015 queries the registry
+  // for the specific key "KitsRoot10". So do we.
+  if (!getSystemRegistryString(
+          "SOFTWARE\\Microsoft\\Windows Kits\\Installed Roots", "KitsRoot10",
+          Path, nullptr))
+    return false;
+
+  return getWindows10SDKVersionFromPath(VFS, Path, UCRTVersion);
+}
+
+bool findVCToolChainViaCommandLine(vfs::FileSystem &VFS,
+                                   Optional<StringRef> VCToolsDir,
+                                   Optional<StringRef> VCToolsVersion,
+                                   Optional<StringRef> WinSysRoot,
+                                   std::string &Path, ToolsetLayout &VSLayout) {
+  // Don't validate the input; trust the value supplied by the user.
+  // The primary motivation is to prevent unnecessary file and registry access.
+  if (VCToolsDir || WinSysRoot) {
+    if (WinSysRoot) {
+      SmallString<128> ToolsPath(*WinSysRoot);
+      sys::path::append(ToolsPath, "VC", "Tools", "MSVC");
+      std::string ToolsVersion;
+      if (VCToolsVersion)
+        ToolsVersion = VCToolsVersion->str();
+      else
+        ToolsVersion = getHighestNumericTupleInDirectory(VFS, ToolsPath);
+      sys::path::append(ToolsPath, ToolsVersion);
+      Path = std::string(ToolsPath.str());
+    } else {
+      Path = VCToolsDir->str();
+    }
+    VSLayout = ToolsetLayout::VS2017OrNewer;
+    return true;
+  }
+  return false;
+}
+
+bool findVCToolChainViaEnvironment(vfs::FileSystem &VFS, std::string &Path,
+                                   ToolsetLayout &VSLayout) {
+  // These variables are typically set by vcvarsall.bat
+  // when launching a developer command prompt.
+  if (Optional<std::string> VCToolsInstallDir =
+          sys::Process::GetEnv("VCToolsInstallDir")) {
+    // This is only set by newer Visual Studios, and it leads straight to
+    // the toolchain directory.
+    Path = std::move(*VCToolsInstallDir);
+    VSLayout = ToolsetLayout::VS2017OrNewer;
+    return true;
+  }
+  if (Optional<std::string> VCInstallDir =
+          sys::Process::GetEnv("VCINSTALLDIR")) {
+    // If the previous variable isn't set but this one is, then we've found
+    // an older Visual Studio. This variable is set by newer Visual Studios
+    // too, so this check has to appear second.
+    // In older Visual Studios, the VC directory is the toolchain.
+    Path = std::move(*VCInstallDir);
+    VSLayout = ToolsetLayout::OlderVS;
+    return true;
+  }
+
+  // We couldn't find any VC environment variables. Let's walk through PATH and
+  // see if it leads us to a VC toolchain bin directory. If it does, pick the
+  // first one that we find.
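+  // Three layouts are recognized below: <VS>\VC\bin (pre-2017 toolchains),
+  // DevDiv internal builds (bin\x86ret, bin\x86chk, bin\amd64ret,
+  // bin\amd64chk), and VS2017+ (VC\Tools\MSVC\<version>\bin\Host<arch>\<arch>).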
+  if (Optional<std::string> PathEnv = sys::Process::GetEnv("PATH")) {
+    SmallVector<StringRef, 8> PathEntries;
+    StringRef(*PathEnv).split(PathEntries, sys::EnvPathSeparator);
+    for (StringRef PathEntry : PathEntries) {
+      if (PathEntry.empty())
+        continue;
+
+      SmallString<256> ExeTestPath;
+
+      // If cl.exe doesn't exist, then this definitely isn't a VC toolchain.
+      ExeTestPath = PathEntry;
+      sys::path::append(ExeTestPath, "cl.exe");
+      if (!VFS.exists(ExeTestPath))
+        continue;
+
+      // cl.exe existing isn't a conclusive test for a VC toolchain; clang also
+      // has a cl.exe. So let's check for link.exe too.
+      ExeTestPath = PathEntry;
+      sys::path::append(ExeTestPath, "link.exe");
+      if (!VFS.exists(ExeTestPath))
+        continue;
+
+      // whatever/VC/bin --> old toolchain, VC dir is toolchain dir.
+      StringRef TestPath = PathEntry;
+      bool IsBin = sys::path::filename(TestPath).equals_insensitive("bin");
+      if (!IsBin) {
+        // Strip any architecture subdir like "amd64".
+        TestPath = sys::path::parent_path(TestPath);
+        IsBin = sys::path::filename(TestPath).equals_insensitive("bin");
+      }
+      if (IsBin) {
+        StringRef ParentPath = sys::path::parent_path(TestPath);
+        StringRef ParentFilename = sys::path::filename(ParentPath);
+        if (ParentFilename.equals_insensitive("VC")) {
+          Path = std::string(ParentPath);
+          VSLayout = ToolsetLayout::OlderVS;
+          return true;
+        }
+        if (ParentFilename.equals_insensitive("x86ret") ||
+            ParentFilename.equals_insensitive("x86chk") ||
+            ParentFilename.equals_insensitive("amd64ret") ||
+            ParentFilename.equals_insensitive("amd64chk")) {
+          Path = std::string(ParentPath);
+          VSLayout = ToolsetLayout::DevDivInternal;
+          return true;
+        }
+
+      } else {
+        // This could be a new (>=VS2017) toolchain. If it is, we should find
+        // path components with these prefixes when walking backwards through
+        // the path.
+        // Note: empty strings match anything.
+        StringRef ExpectedPrefixes[] = {"",     "Host",  "bin", "",
+                                        "MSVC", "Tools", "VC"};
+
+        auto It = sys::path::rbegin(PathEntry);
+        auto End = sys::path::rend(PathEntry);
+        for (StringRef Prefix : ExpectedPrefixes) {
+          if (It == End)
+            goto NotAToolChain;
+          if (!It->startswith_insensitive(Prefix))
+            goto NotAToolChain;
+          ++It;
+        }
+
+        // We've found a new toolchain!
+        // Back up 3 times (/bin/Host/arch) to get the root path.
+        StringRef ToolChainPath(PathEntry);
+        for (int i = 0; i < 3; ++i)
+          ToolChainPath = sys::path::parent_path(ToolChainPath);
+
+        Path = std::string(ToolChainPath);
+        VSLayout = ToolsetLayout::VS2017OrNewer;
+        return true;
+      }
+
+    NotAToolChain:
+      continue;
+    }
+  }
+  return false;
+}
+
+bool findVCToolChainViaSetupConfig(vfs::FileSystem &VFS, std::string &Path,
+                                   ToolsetLayout &VSLayout) {
+#if !defined(USE_MSVC_SETUP_API)
+  return false;
+#else
+  // FIXME: This really should be done once in the top-level program's main
+  // function, as it may have already been initialized with a different
+  // threading model otherwise.
+  sys::InitializeCOMRAII COM(sys::COMThreadingMode::SingleThreaded);
+  HRESULT HR;
+
+  // _com_ptr_t will throw a _com_error if a COM call fails.
+  // The LLVM coding standards forbid exception handling, so we'll have to
+  // stop them from being thrown in the first place.
+  // The destructor will put the regular error handler back when we leave
+  // this scope.
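+  // Note that _set_com_error_handler swaps a process-global callback, which
+  // is why the RAII type below restores _com_raise_error unconditionally on
+  // destruction.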
+  struct SuppressCOMErrorsRAII {
+    static void __stdcall handler(HRESULT hr, IErrorInfo *perrinfo) {}
+
+    SuppressCOMErrorsRAII() { _set_com_error_handler(handler); }
+
+    ~SuppressCOMErrorsRAII() { _set_com_error_handler(_com_raise_error); }
+
+  } COMErrorSuppressor;
+
+  ISetupConfigurationPtr Query;
+  HR = Query.CreateInstance(__uuidof(SetupConfiguration));
+  if (FAILED(HR))
+    return false;
+
+  IEnumSetupInstancesPtr EnumInstances;
+  HR = ISetupConfiguration2Ptr(Query)->EnumAllInstances(&EnumInstances);
+  if (FAILED(HR))
+    return false;
+
+  ISetupInstancePtr Instance;
+  HR = EnumInstances->Next(1, &Instance, nullptr);
+  if (HR != S_OK)
+    return false;
+
+  ISetupInstancePtr NewestInstance;
+  Optional<uint64_t> NewestVersionNum;
+  do {
+    bstr_t VersionString;
+    uint64_t VersionNum;
+    HR = Instance->GetInstallationVersion(VersionString.GetAddress());
+    if (FAILED(HR))
+      continue;
+    HR = ISetupHelperPtr(Query)->ParseVersion(VersionString, &VersionNum);
+    if (FAILED(HR))
+      continue;
+    if (!NewestVersionNum || (VersionNum > NewestVersionNum)) {
+      NewestInstance = Instance;
+      NewestVersionNum = VersionNum;
+    }
+  } while ((HR = EnumInstances->Next(1, &Instance, nullptr)) == S_OK);
+
+  if (!NewestInstance)
+    return false;
+
+  bstr_t VCPathWide;
+  HR = NewestInstance->ResolvePath(L"VC", VCPathWide.GetAddress());
+  if (FAILED(HR))
+    return false;
+
+  std::string VCRootPath;
+  convertWideToUTF8(std::wstring(VCPathWide), VCRootPath);
+
+  SmallString<256> ToolsVersionFilePath(VCRootPath);
+  sys::path::append(ToolsVersionFilePath, "Auxiliary", "Build",
+                    "Microsoft.VCToolsVersion.default.txt");
+
+  auto ToolsVersionFile = MemoryBuffer::getFile(ToolsVersionFilePath);
+  if (!ToolsVersionFile)
+    return false;
+
+  SmallString<256> ToolchainPath(VCRootPath);
+  sys::path::append(ToolchainPath, "Tools", "MSVC",
+                    ToolsVersionFile->get()->getBuffer().rtrim());
+  auto Status = VFS.status(ToolchainPath);
+  if (!Status || !Status->isDirectory())
+    return false;
+
+  Path = std::string(ToolchainPath.str());
+  VSLayout = ToolsetLayout::VS2017OrNewer;
+  return true;
+#endif
+}
+
+bool findVCToolChainViaRegistry(std::string &Path, ToolsetLayout &VSLayout) {
+  std::string VSInstallPath;
+  if (getSystemRegistryString(R"(SOFTWARE\Microsoft\VisualStudio\$VERSION)",
+                              "InstallDir", VSInstallPath, nullptr) ||
+      getSystemRegistryString(R"(SOFTWARE\Microsoft\VCExpress\$VERSION)",
+                              "InstallDir", VSInstallPath, nullptr)) {
+    if (!VSInstallPath.empty()) {
+      SmallString<256> VCPath(StringRef(VSInstallPath.c_str(),
+                                        VSInstallPath.find(R"(\Common7\IDE)")));
+      sys::path::append(VCPath, "VC");
+
+      Path = std::string(VCPath.str());
+      VSLayout = ToolsetLayout::OlderVS;
+      return true;
+    }
+  }
+  return false;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
index 40c03f7b0de7..8f5c53faf91e 100644
--- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
+++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
@@ -14,8 +14,6 @@
 #include "llvm/Config/config.h"
 #include "llvm/Support/MemoryBuffer.h"
 
-#include <map>
-
 #if LLVM_ENABLE_LIBXML2
 #include <libxml/xmlreader.h>
 #endif
@@ -706,7 +704,7 @@ bool windows_manifest::isAvailable() { return false; }
 WindowsManifestMerger::WindowsManifestMerger()
     : Impl(std::make_unique<WindowsManifestMergerImpl>()) {}
 
-WindowsManifestMerger::~WindowsManifestMerger() {}
+WindowsManifestMerger::~WindowsManifestMerger() = default;
 
 Error WindowsManifestMerger::merge(MemoryBufferRef Manifest) {
   return Impl->merge(Manifest);
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp
b/llvm/lib/XRay/FDRTraceWriter.cpp
index 71c09bd4fce4..2b80740ed436 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -74,7 +74,7 @@ FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
   OS.write(FreeFormBytes);
 }
 
-FDRTraceWriter::~FDRTraceWriter() {}
+FDRTraceWriter::~FDRTraceWriter() = default;
 
 Error FDRTraceWriter::visit(BufferExtents &R) {
   return writeMetadata<7u>(OS, R.size());
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index d127ea0945f2..9912f59f0ba6 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -270,7 +270,7 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
   // First, remove aliases to functions we're about to purge.
   for (GlobalAlias &Alias : M->aliases()) {
     GlobalObject *Root = Alias.getAliaseeObject();
-    Function *F = dyn_cast_or_null<Function>(Root);
+    auto *F = dyn_cast<Function>(Root);
     if (F) {
       if (Functions.count(F))
         // We're keeping this function.
@@ -278,7 +278,7 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
     } else if (Root->isNullValue()) {
       // This referenced a globalalias that we've already replaced,
       // so we still need to replace this alias.
-    } else if (!F) {
+    } else {
       // Not a function, therefore not something we mess with.
       continue;
     }
diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp
index f06f378962d9..2b06e8f3b365 100644
--- a/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -105,7 +105,7 @@ namespace llvm {
 // program being debugged.
 cl::list<std::string> InputArgv("args", cl::Positional,
                                 cl::desc("<program arguments>..."),
-                                cl::ZeroOrMore, cl::PositionalEatsArgs);
+                                cl::PositionalEatsArgs);
 
 cl::opt<std::string>
     OutputPrefix("output-prefix", cl::init("bugpoint"),
@@ -114,19 +114,19 @@ cl::opt
 namespace {
 cl::list<std::string> ToolArgv("tool-args", cl::Positional,
-                               cl::desc("<tool arguments>..."), cl::ZeroOrMore,
+                               cl::desc("<tool arguments>..."),
                                cl::PositionalEatsArgs);
 
 cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
                                    cl::desc("<safe-tool arguments>..."),
-                                   cl::ZeroOrMore, cl::PositionalEatsArgs);
+                                   cl::PositionalEatsArgs);
 
 cl::opt<std::string> CCBinary("gcc", cl::init(""),
                               cl::desc("The gcc binary to use."));
 
 cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
                                  cl::desc("<gcc-tool arguments>..."),
-                                 cl::ZeroOrMore, cl::PositionalEatsArgs);
+                                 cl::PositionalEatsArgs);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp
index e67e877c13af..d425a8c5b49a 100644
--- a/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -117,7 +117,7 @@ cl::opt SilencePasses(
 
 static cl::list<std::string> OptArgs("opt-args", cl::Positional,
                                      cl::desc("<opt arguments>..."),
-                                     cl::ZeroOrMore, cl::PositionalEatsArgs);
+                                     cl::PositionalEatsArgs);
 
 /// runPasses - Run the specified passes on Program, outputting a bitcode file
 /// and writing the filename into OutputFile if successful. If the
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index 937ec23231b0..6e3f237d0a39 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -65,11 +65,7 @@ static cl::opt
 // PassNameParser.
 //
 static cl::list<const PassInfo *, bool, PassNameParser>
-    PassList(cl::desc("Passes available:"), cl::ZeroOrMore);
-
-static cl::opt<bool>
-    StandardLinkOpts("std-link-opts",
-                     cl::desc("Include the standard link time optimizations"));
+    PassList(cl::desc("Passes available:"));
 
 static cl::opt<bool>
     OptLevelO1("O1", cl::desc("Optimization level 1. Identical to 'opt -O1'"));
@@ -203,12 +199,6 @@ int main(int argc, char **argv) {
 
   AddToDriver PM(D);
 
-  if (StandardLinkOpts) {
-    PassManagerBuilder Builder;
-    Builder.Inliner = createFunctionInliningPass();
-    Builder.populateLTOPassManager(PM);
-  }
-
   if (OptLevelO1)
     AddOptimizationPasses(PM, 1, 0);
   else if (OptLevelO2)
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index c07f4e66486c..8d82d78b15b5 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Pass.h"
@@ -117,12 +118,10 @@ static cl::opt
 // Determine optimization level.
 static cl::opt<char>
-OptLevel("O",
-         cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
-                  "(default = '-O2')"),
-         cl::Prefix,
-         cl::ZeroOrMore,
-         cl::init(' '));
+    OptLevel("O",
+             cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
+                      "(default = '-O2')"),
+             cl::Prefix, cl::init(' '));
 
 static cl::opt<std::string>
     TargetTriple("mtriple", cl::desc("Override target triple for module"));
@@ -212,7 +211,7 @@ static RunPassOption RunPassOpt;
 
 static cl::opt<RunPassOption, true, cl::parser<std::string>> RunPass(
     "run-pass",
     cl::desc("Run compiler only for specified passes (comma separated list)"),
-    cl::value_desc("pass-name"), cl::ZeroOrMore, cl::location(RunPassOpt));
+    cl::value_desc("pass-name"), cl::location(RunPassOpt));
 
 static int compileModule(char **, LLVMContext &);
@@ -369,6 +368,7 @@ int main(int argc, char **argv) {
   initializeHardwareLoopsPass(*Registry);
   initializeTransformUtils(*Registry);
   initializeReplaceWithVeclibLegacyPass(*Registry);
+  initializeTLSVariableHoistLegacyPassPass(*Registry);
 
   // Initialize debugging passes.
   initializeScavengerTestPass(*Registry);
@@ -501,14 +501,26 @@ static int compileModule(char **argv, LLVMContext &Context) {
     TargetMachine::parseBinutilsVersion(BinutilsVersion);
     Options.DisableIntegratedAS = NoIntegratedAssembler;
     Options.MCOptions.ShowMCEncoding = ShowMCEncoding;
-    Options.MCOptions.MCUseDwarfDirectory = DwarfDirectory;
     Options.MCOptions.AsmVerbose = AsmVerbose;
     Options.MCOptions.PreserveAsmComments = PreserveComments;
     Options.MCOptions.IASSearchPaths = IncludeDirs;
     Options.MCOptions.SplitDwarfFile = SplitDwarfFile;
+    if (DwarfDirectory.getPosition()) {
+      Options.MCOptions.MCUseDwarfDirectory =
+          DwarfDirectory ? MCTargetOptions::EnableDwarfDirectory
+                         : MCTargetOptions::DisableDwarfDirectory;
+    } else {
+      // -dwarf-directory is not set explicitly. Some assemblers
+      // (e.g. GNU as or ptxas) do not support `.file directory'
+      // syntax prior to DWARFv5. Let the target decide the default
+      // value.
+      Options.MCOptions.MCUseDwarfDirectory =
+          MCTargetOptions::DefaultDwarfDirectory;
+    }
   };
 
   Optional<Reloc::Model> RM = codegen::getExplicitRelocModel();
+  Optional<CodeModel::Model> CM = codegen::getExplicitCodeModel();
 
   const Target *TheTarget = nullptr;
   std::unique_ptr<TargetMachine> Target;
@@ -535,14 +547,13 @@ static int compileModule(char **argv, LLVMContext &Context) {
 
     // On AIX, setting the relocation model to anything other than PIC is
     // considered a user error.
-    if (TheTriple.isOSAIX() && RM.hasValue() && *RM != Reloc::PIC_)
+    if (TheTriple.isOSAIX() && RM && *RM != Reloc::PIC_)
       reportError("invalid relocation model, AIX only supports PIC",
                   InputFilename);
 
     InitializeOptions(TheTriple);
     Target = std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-        TheTriple.getTriple(), CPUStr, FeaturesStr, Options, RM,
-        codegen::getExplicitCodeModel(), OLvl));
+        TheTriple.getTriple(), CPUStr, FeaturesStr, Options, RM, CM, OLvl));
     assert(Target && "Could not allocate target machine!");
 
     return Target->createDataLayout().getStringRepresentation();
@@ -562,6 +573,10 @@ static int compileModule(char **argv, LLVMContext &Context) {
     }
     if (!TargetTriple.empty())
       M->setTargetTriple(Triple::normalize(TargetTriple));
+
+    Optional<CodeModel::Model> CM_IR = M->getCodeModel();
+    if (!CM && CM_IR)
+      Target->setCodeModel(CM_IR.getValue());
   } else {
     TheTriple = Triple(Triple::normalize(TargetTriple));
     if (TheTriple.getTriple().empty())
@@ -578,7 +593,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
 
     // On AIX, setting the relocation model to anything other than PIC is
     // considered a user error.
-    if (TheTriple.isOSAIX() && RM.hasValue() && *RM != Reloc::PIC_) {
+    if (TheTriple.isOSAIX() && RM && *RM != Reloc::PIC_) {
       WithColor::error(errs(), argv[0])
           << "invalid relocation model, AIX only supports PIC.\n";
       return 1;
@@ -586,8 +601,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
 
     InitializeOptions(TheTriple);
     Target = std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-        TheTriple.getTriple(), CPUStr, FeaturesStr, Options, RM,
-        codegen::getExplicitCodeModel(), OLvl));
+        TheTriple.getTriple(), CPUStr, FeaturesStr, Options, RM, CM, OLvl));
     assert(Target && "Could not allocate target machine!");
 
     // If we don't have a module then just exit now. We do this down
diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index d20daa07196b..f2e3886bdf07 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -28,12 +28,15 @@
 #include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h"
 #include "llvm/ExecutionEngine/Orc/DebugUtils.h"
+#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
 #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h"
+#include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h"
 #include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h"
 #include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h"
 #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
@@ -120,6 +123,9 @@ namespace {
                    "RuntimeDyld"),
         clEnumValN(JITLinkerKind::JITLink, "jitlink",
                    "Orc-specific linker")));
+  cl::opt<std::string> OrcRuntime("orc-runtime",
+                                  cl::desc("Use ORC runtime from given path"),
+                                  cl::init(""));
 
   cl::opt<unsigned>
   LazyJITCompileThreads("compile-threads",
@@ -144,8 +150,7 @@ namespace {
                  "-extra-module arguments."));
 
   cl::list<std::string>
-      Dylibs("dlopen", cl::desc("Dynamic libraries to load before linking"),
-             cl::ZeroOrMore);
+      Dylibs("dlopen", cl::desc("Dynamic libraries to load before linking"));
 
   // The MCJIT supports building for a target address space separate from
  // the JIT compilation process. Use a forked process and a copying
@@ -166,13 +171,10 @@ namespace {
                 cl::value_desc("filename"), cl::init(""));
 
   // Determine optimization level.
-  cl::opt<char>
-  OptLevel("O",
-           cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
-                    "(default = '-O2')"),
-           cl::Prefix,
-           cl::ZeroOrMore,
-           cl::init(' '));
+  cl::opt<char> OptLevel("O",
+                         cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
+                                  "(default = '-O2')"),
+                         cl::Prefix, cl::init(' '));
 
   cl::opt<std::string>
   TargetTriple("mtriple", cl::desc("Override target triple for module"));
@@ -234,13 +236,15 @@ namespace {
                     cl::desc("Do not resolve lli process symbols in JIT'd code"),
                     cl::init(false));
 
-  enum class LLJITPlatform { Inactive, DetectHost, GenericIR };
+  enum class LLJITPlatform { Inactive, DetectHost, ORC, GenericIR };
 
   cl::opt<LLJITPlatform>
       Platform("lljit-platform", cl::desc("Platform to use with LLJIT"),
                cl::init(LLJITPlatform::DetectHost),
               cl::values(clEnumValN(LLJITPlatform::DetectHost, "DetectHost",
                                      "Select based on JIT target triple"),
+                          clEnumValN(LLJITPlatform::ORC, "ORC",
+                                     "Use ORCPlatform with the ORC runtime"),
                           clEnumValN(LLJITPlatform::GenericIR, "GenericIR",
                                      "Use LLJITGenericIRPlatform"),
                           clEnumValN(LLJITPlatform::Inactive, "Inactive",
@@ -369,6 +373,53 @@ private:
   }
 };
 
+class ORCPlatformSupport : public orc::LLJIT::PlatformSupport {
+public:
+  ORCPlatformSupport(orc::LLJIT &J) : J(J) {}
+
+  Error initialize(orc::JITDylib &JD) override {
+    using llvm::orc::shared::SPSExecutorAddr;
+    using llvm::orc::shared::SPSString;
+    using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t);
+    enum dlopen_mode : int32_t {
+      ORC_RT_RTLD_LAZY = 0x1,
+      ORC_RT_RTLD_NOW = 0x2,
+      ORC_RT_RTLD_LOCAL = 0x4,
+      ORC_RT_RTLD_GLOBAL = 0x8
+    };
+
+    if (auto WrapperAddr = J.lookup("__orc_rt_jit_dlopen_wrapper")) {
+      return J.getExecutionSession().callSPSWrapper<SPSDLOpenSig>(
+          *WrapperAddr, DSOHandles[&JD], JD.getName(),
+          int32_t(ORC_RT_RTLD_LAZY));
+    } else
+      return WrapperAddr.takeError();
+  }
+
+  Error deinitialize(orc::JITDylib &JD) override {
+    using llvm::orc::shared::SPSExecutorAddr;
+    using SPSDLCloseSig = int32_t(SPSExecutorAddr);
+
+    if (auto WrapperAddr = J.lookup("__orc_rt_jit_dlclose_wrapper")) {
+      int32_t result;
+      auto E = J.getExecutionSession().callSPSWrapper<SPSDLCloseSig>(
+          *WrapperAddr, result, DSOHandles[&JD]);
+      if (E)
+        return E;
+      else if (result)
+        return make_error<StringError>("dlclose failed",
+                                       inconvertibleErrorCode());
+      DSOHandles.erase(&JD);
+    } else
+      return WrapperAddr.takeError();
+    return Error::success();
+  }
+
+private:
+  orc::LLJIT &J;
+  DenseMap<orc::JITDylib *, orc::ExecutorAddr> DSOHandles;
+};
+
 // On Mingw and Cygwin, an external symbol named '__main' is called from the
 // generated 'main' function to allow static initialization. To avoid linking
 // problems with remote targets (because lli's remote target support does not
@@ -881,7 +932,7 @@ int runOrcJIT(const char *ProgName) {
   }
 
   Builder.setLazyCompileFailureAddr(
-      pointerToJITTargetAddress(exitOnLazyCallThroughFailure));
+      orc::ExecutorAddr::fromPtr(exitOnLazyCallThroughFailure));
   Builder.setNumCompileThreads(LazyJITCompileThreads);
 
   // If the object cache is enabled then set a custom compile function
@@ -908,21 +959,29 @@ int runOrcJIT(const char *ProgName) {
   }
 
   // Set up LLJIT platform.
-  {
-    LLJITPlatform P = Platform;
-    if (P == LLJITPlatform::DetectHost)
+  LLJITPlatform P = Platform;
+  if (P == LLJITPlatform::DetectHost) {
+    if (JITLinker == JITLinkerKind::JITLink && !OrcRuntime.empty() &&
+        (TT->isOSBinFormatMachO() || TT->isOSBinFormatELF()))
+      P = LLJITPlatform::ORC;
+    else
       P = LLJITPlatform::GenericIR;
-
-    switch (P) {
-    case LLJITPlatform::GenericIR:
-      // Nothing to do: LLJITBuilder will use this by default.
-      break;
-    case LLJITPlatform::Inactive:
-      Builder.setPlatformSetUp(orc::setUpInactivePlatform);
-      break;
-    default:
-      llvm_unreachable("Unrecognized platform value");
-    }
+  }
+  switch (P) {
+  case LLJITPlatform::ORC:
+    Builder.setPlatformSetUp([](llvm::orc::LLJIT &J) -> llvm::Error {
+      J.setPlatformSupport(std::make_unique<ORCPlatformSupport>(J));
+      return Error::success();
+    });
+    break;
+  case LLJITPlatform::GenericIR:
+    // Nothing to do: LLJITBuilder will use this by default.
+    break;
+  case LLJITPlatform::Inactive:
+    Builder.setPlatformSetUp(orc::setUpInactivePlatform);
+    break;
+  default:
+    llvm_unreachable("Unrecognized platform value");
   }
 
   std::unique_ptr<orc::ExecutorProcessControl> EPC = nullptr;
   if (JITLinker == JITLinkerKind::JITLink) {
     EPC = ExitOnErr(orc::SelfExecutorProcessControl::Create(
         std::make_shared<orc::SymbolStringPool>()));
 
-    Builder.setObjectLinkingLayerCreator([&EPC](orc::ExecutionSession &ES,
-                                                const Triple &) {
+    Builder.setObjectLinkingLayerCreator([&EPC, &P](orc::ExecutionSession &ES,
+                                                    const Triple &TT) {
       auto L = std::make_unique<orc::ObjectLinkingLayer>(ES, EPC->getMemMgr());
-      L->addPlugin(std::make_unique<orc::EHFrameRegistrationPlugin>(
-          ES, ExitOnErr(orc::EPCEHFrameRegistrar::Create(ES))));
-      L->addPlugin(std::make_unique<orc::DebugObjectManagerPlugin>(
-          ES, ExitOnErr(orc::createJITLoaderGDBRegistrar(ES))));
+      if (P != LLJITPlatform::ORC) {
+        L->addPlugin(std::make_unique<orc::EHFrameRegistrationPlugin>(
+            ES, ExitOnErr(orc::EPCEHFrameRegistrar::Create(ES))));
+        L->addPlugin(std::make_unique<orc::DebugObjectManagerPlugin>(
+            ES, ExitOnErr(orc::createJITLoaderGDBRegistrar(ES))));
+      }
       return L;
     });
   }
@@ -983,6 +1044,31 @@ int runOrcJIT(const char *ProgName) {
       std::make_unique<LLIBuiltinFunctionGenerator>(GenerateBuiltinFunctions,
                                                     Mangle));
 
+  if (P == LLJITPlatform::ORC) {
+    if (auto *OLL = llvm::dyn_cast<llvm::orc::ObjectLinkingLayer>(ObjLayer)) {
+      auto &ES = J->getExecutionSession();
+      if (TT->isOSBinFormatMachO()) {
+        if (auto P = llvm::orc::MachOPlatform::Create(
+                ES, *OLL, J->getMainJITDylib(), OrcRuntime.c_str()))
+          ES.setPlatform(std::move(*P));
+        else
+          ExitOnErr(P.takeError());
+      } else if (TT->isOSBinFormatELF()) {
+        if (auto P = llvm::orc::ELFNixPlatform::Create(
+                ES, *OLL, J->getMainJITDylib(), OrcRuntime.c_str()))
+          ES.setPlatform(std::move(*P));
+        else
+          ExitOnErr(P.takeError());
+      } else {
+        errs() << "No ORC platform support\n";
+        exit(1);
+      }
+    } else {
+      errs() << "ORC platform requires JITLink\n";
+      exit(1);
+    }
+  }
+
   // Regular modules are greedy: They materialize as a whole and trigger
   // materialization for all required symbols recursively. Lazy modules go
   // through partitioning and they replace outgoing calls with reexport stubs
@@ -1049,23 +1135,21 @@ int runOrcJIT(const char *ProgName) {
   for (auto &ThreadEntryPoint : ThreadEntryPoints) {
     auto EntryPointSym = ExitOnErr(J->lookup(ThreadEntryPoint));
     typedef void (*EntryPointPtr)();
-    auto EntryPoint = reinterpret_cast<EntryPointPtr>(
-        static_cast<uintptr_t>(EntryPointSym.getAddress()));
+    auto EntryPoint = EntryPointSym.toPtr<EntryPointPtr>();
     AltEntryThreads.push_back(std::thread([EntryPoint]() { EntryPoint(); }));
   }
 
   // Resolve and run the main function.
-  JITEvaluatedSymbol MainSym = ExitOnErr(J->lookup(EntryFunc));
+  auto MainAddr = ExitOnErr(J->lookup(EntryFunc));
 
   int Result;
 
   if (EPC) {
     // ExecutorProcessControl-based execution with JITLink.
-    Result = ExitOnErr(
-        EPC->runAsMain(orc::ExecutorAddr(MainSym.getAddress()), InputArgv));
+    Result = ExitOnErr(EPC->runAsMain(MainAddr, InputArgv));
   } else {
     // Manual in-process execution with RuntimeDyld.
     using MainFnTy = int(int, char *[]);
-    auto MainFn = jitTargetAddressToFunction<MainFnTy *>(MainSym.getAddress());
+    auto MainFn = MainAddr.toPtr<MainFnTy *>();
     Result = orc::runAsMain(MainFn, InputArgv, StringRef(InputFile));
   }
 
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index 8842162f5216..e964dc8256a5 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/SymbolicFile.h"
+#include "llvm/Object/XCOFFObjectFile.h"
 #include "llvm/Support/Chrono.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConvertUTF.h"
@@ -61,32 +62,30 @@ static StringRef ToolName;
 
 // The basename of this program.
 static StringRef Stem;
 
-const char RanlibHelp[] = R"(OVERVIEW: LLVM Ranlib (llvm-ranlib)
-
-  This program generates an index to speed access to archives
-
-USAGE: llvm-ranlib <archive-file>
-
-OPTIONS:
-  -h --help             - Display available options
-  -v --version          - Display the version of this program
-  -D                    - Use zero for timestamps and uids/gids (default)
-  -U                    - Use actual timestamps and uids/gids
-)";
-
-const char ArHelp[] = R"(OVERVIEW: LLVM Archiver
-
-USAGE: llvm-ar [options] [-]<operation>[modifiers] [relpos] [count] <archive> [files]
-       llvm-ar -M [<mri-script]
+static void printRanlibHelp(StringRef ToolName) {
+  outs() << "OVERVIEW: LLVM Ranlib\n\n"
+         << "This program generates an index to speed access to archives\n\n"
+         << "USAGE: " + ToolName + " <archive-file>\n\n"
+         << "OPTIONS:\n"
+         << "  -h --help             - Display available options\n"
+         << "  -v --version          - Display the version of this program\n"
+         << "  -D                    - Use zero for timestamps and uids/gids "
+            "(default)\n"
+         << "  -U                    - Use actual timestamps and uids/gids\n";
+}
 
-OPTIONS:
+static void printArHelp(StringRef ToolName) {
+  const char ArOptions[] =
+      R"(OPTIONS:
   --format              - archive format to create
     =default            -   default
     =gnu                -   gnu
     =darwin             -   darwin
     =bsd                -   bsd
+    =aix                -   aix (big archive)
   --plugin=<p>          - ignored for compatibility
   -h --help             - display this help and exit
+  --output              - the directory to extract archive members to
   --rsp-quoting         - quoting style for response files
    =posix              -   posix
     =windows            -   windows
@@ -126,11 +125,20 @@ MODIFIERS:
   [V]                   - display the version and exit
 )";
+
+  outs() << "OVERVIEW: LLVM Archiver\n\n"
+         << "USAGE: " + ToolName +
+                " [options] [-]<operation>[modifiers] [relpos] "
+                "[count] <archive> [files]\n"
+         << "       " + ToolName + " -M [<mri-script]\n\n";
+
+  outs() << ArOptions;
+}
 
 static SmallVector<const char *, 256> PositionalArgs;
 static bool MRI;
 
 namespace {
-enum Format { Default, GNU, BSD, DARWIN, Unknown };
+enum Format { Default, GNU, BSD, DARWIN, BIGARCHIVE, Unknown };
 }
 
 static Format FormatType = Default;
@@ -230,6 +238,9 @@ static int CountParam = 0;
 // command line.
 static std::string ArchiveName;
 
+// Output directory specified by --output.
+static std::string OutputDir;
+
 static std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
 static std::vector<std::unique_ptr<object::Archive>> Archives;
 
@@ -447,6 +458,19 @@ static ArchiveOperation parseCommandLine() {
   if (AddLibrary && Operation != QuickAppend)
     badUsage("the 'L' modifier is only applicable to the 'q' operation");
 
+  if (!OutputDir.empty()) {
+    if (Operation != Extract)
+      badUsage("--output is only applicable to the 'x' operation");
+    bool IsDir = false;
+    // If OutputDir is not a directory, create_directories may still succeed if
+    // all components of the path prefix are directories. Test is_directory as
+    // well.
+    if (!sys::fs::create_directories(OutputDir))
+      sys::fs::is_directory(OutputDir, IsDir);
+    if (!IsDir)
+      fail("'" + OutputDir + "' is not a directory");
+  }
+
   // Return the parsed operation to the caller
   return Operation;
 }
@@ -547,7 +571,15 @@ static void doExtract(StringRef Name, const object::Archive::Child &C) {
   failIfError(ModeOrErr.takeError());
   sys::fs::perms Mode = ModeOrErr.get();
 
-  llvm::StringRef outputFilePath = sys::path::filename(Name);
+  StringRef outputFilePath;
+  SmallString<128> path;
+  if (OutputDir.empty()) {
+    outputFilePath = sys::path::filename(Name);
+  } else {
+    sys::path::append(path, OutputDir, sys::path::filename(Name));
+    outputFilePath = path.str();
+  }
+
   if (Verbose)
     outs() << "x - " << outputFilePath << '\n';
@@ -652,8 +684,6 @@ static void performReadOperation(ArchiveOperation Operation,
 static void addChildMember(std::vector<NewArchiveMember> &Members,
                            const object::Archive::Child &M,
                            bool FlattenArchive = false) {
-  if (Thin && !M.getParent()->isThin())
-    fail("cannot convert a regular archive to a thin one");
   Expected<NewArchiveMember> NMOrErr =
       NewArchiveMember::getOldMember(M, Deterministic);
   failIfError(NMOrErr.takeError());
@@ -875,48 +905,18 @@ computeNewArchiveMembers(ArchiveOperation Operation,
   return Ret;
 }
 
-static object::Archive::Kind getDefaultForHost() {
-  return Triple(sys::getProcessTriple()).isOSDarwin()
-             ? object::Archive::K_DARWIN
-             : object::Archive::K_GNU;
-}
-
-static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
-  auto MemBufferRef = Member.Buf->getMemBufferRef();
-  Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
-      object::ObjectFile::createObjectFile(MemBufferRef);
-
-  if (OptionalObject)
-    return isa<object::MachOObjectFile>(**OptionalObject)
-               ? object::Archive::K_DARWIN
-               : object::Archive::K_GNU;
-
-  // squelch the error in case we had a non-object file
-  consumeError(OptionalObject.takeError());
-
-  // If we're adding a bitcode file to the archive, detect the Archive kind
-  // based on the target triple.
-  LLVMContext Context;
-  if (identify_magic(MemBufferRef.getBuffer()) == file_magic::bitcode) {
-    if (auto ObjOrErr = object::SymbolicFile::createSymbolicFile(
-            MemBufferRef, file_magic::bitcode, &Context)) {
-      auto &IRObject = cast<object::IRObjectFile>(**ObjOrErr);
-      return Triple(IRObject.getTargetTriple()).isOSDarwin()
-                 ? object::Archive::K_DARWIN
-                 : object::Archive::K_GNU;
-    } else {
-      // Squelch the error in case this was not a SymbolicFile.
-      consumeError(ObjOrErr.takeError());
-    }
-  }
-
-  return getDefaultForHost();
-}
-
 static void performWriteOperation(ArchiveOperation Operation,
                                   object::Archive *OldArchive,
                                   std::unique_ptr<MemoryBuffer> OldArchiveBuf,
                                   std::vector<NewArchiveMember> *NewMembersP) {
+  if (OldArchive) {
+    if (Thin && !OldArchive->isThin())
+      fail("cannot convert a regular archive to a thin one");
+
+    if (OldArchive->isThin())
+      Thin = true;
+  }
+
   std::vector<NewArchiveMember> NewMembers;
   if (!NewMembersP)
     NewMembers = computeNewArchiveMembers(Operation, OldArchive);
@@ -926,14 +926,23 @@ static void performWriteOperation(ArchiveOperation Operation,
   case Default:
     if (Thin)
      Kind = object::Archive::K_GNU;
-    else if (OldArchive)
+    else if (OldArchive) {
       Kind = OldArchive->kind();
-    else if (NewMembersP)
-      Kind = !NewMembersP->empty() ? getKindFromMember(NewMembersP->front())
-                                   : getDefaultForHost();
+      if (Kind == object::Archive::K_BSD) {
+        auto InferredKind = object::Archive::K_BSD;
+        if (NewMembersP && !NewMembersP->empty())
+          InferredKind = NewMembersP->front().detectKindFromObject();
+        else if (!NewMembers.empty())
+          InferredKind = NewMembers.front().detectKindFromObject();
+        if (InferredKind == object::Archive::K_DARWIN)
+          Kind = object::Archive::K_DARWIN;
+      }
+    } else if (NewMembersP)
+      Kind = !NewMembersP->empty() ? NewMembersP->front().detectKindFromObject()
+                                   : object::Archive::getDefaultKindForHost();
     else
-      Kind = !NewMembers.empty() ? getKindFromMember(NewMembers.front())
-                                 : getDefaultForHost();
+      Kind = !NewMembers.empty() ? NewMembers.front().detectKindFromObject()
+                                 : object::Archive::getDefaultKindForHost();
     break;
   case GNU:
     Kind = object::Archive::K_GNU;
@@ -948,6 +957,11 @@ static void performWriteOperation(ArchiveOperation Operation,
       fail("only the gnu format has a thin mode");
     Kind = object::Archive::K_DARWIN;
     break;
+  case BIGARCHIVE:
+    if (Thin)
+      fail("only the gnu format has a thin mode");
+    Kind = object::Archive::K_AIXBIG;
+    break;
   case Unknown:
     llvm_unreachable("");
   }
@@ -1073,8 +1087,12 @@ static void runMRIScript() {
 
     switch (Command) {
     case MRICommand::AddLib: {
+      if (!Create)
+        fail("no output archive has been opened");
      object::Archive &Lib = readLibrary(Rest);
       {
+        if (Thin && !Lib.isThin())
+          fail("cannot add a regular archive's contents to a thin archive");
         Error Err = Error::success();
         for (auto &Member : Lib.children(Err))
           addChildMember(NewMembers, Member, /*FlattenArchive=*/Thin);
@@ -1083,6 +1101,8 @@ static void runMRIScript() {
       break;
     }
     case MRICommand::AddMod:
+      if (!Create)
+        fail("no output archive has been opened");
       addMember(NewMembers, Rest);
       break;
     case MRICommand::CreateThin:
@@ -1095,6 +1115,8 @@ static void runMRIScript() {
       if (Saved)
        fail("file already saved");
       ArchiveName = std::string(Rest);
+      if (ArchiveName.empty())
+        fail("missing archive name");
       break;
     case MRICommand::Delete: {
       llvm::erase_if(NewMembers, [=](NewArchiveMember &M) {
@@ -1116,7 +1138,8 @@ static void runMRIScript() {
 
   // Nothing to do if not saved.
   if (Saved)
-    performOperation(ReplaceOrInsert, &NewMembers);
+    performOperation(ReplaceOrInsert, /*OldArchive=*/nullptr,
+                     /*OldArchiveBuf=*/nullptr, &NewMembers);
   exit(0);
 }
@@ -1219,12 +1242,18 @@ static int ar_main(int argc, char **argv) {
                       .Case("gnu", GNU)
                       .Case("darwin", DARWIN)
                       .Case("bsd", BSD)
+                      .Case("bigarchive", BIGARCHIVE)
                       .Default(Unknown);
       if (FormatType == Unknown)
        fail(std::string("Invalid format ") + Match);
       continue;
     }
 
+    if ((Match = matchFlagWithArg("output", ArgIt, Argv))) {
+      OutputDir = Match;
+      continue;
+    }
+
     if (matchFlagWithArg("plugin", ArgIt, Argv) ||
         matchFlagWithArg("rsp-quoting", ArgIt, Argv))
       continue;
@@ -1274,7 +1303,7 @@ static int ranlib_main(int argc, char **argv) {
   return performOperation(CreateSymTab, nullptr);
 }
 
-int main(int argc, char **argv) {
+int llvm_ar_main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   ToolName = argv[0];
 
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index ef801287c1be..6932e9b5bd31 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -265,8 +265,7 @@ bool CodeCoverageTool::isEquivalentFile(StringRef FilePath1,
                                         StringRef FilePath2) {
   auto Status1 = getFileStatus(FilePath1);
   auto Status2 = getFileStatus(FilePath2);
-  return Status1.hasValue() && Status2.hasValue() &&
-         sys::fs::equivalent(Status1.getValue(), Status2.getValue());
+  return Status1 && Status2 && sys::fs::equivalent(*Status1, *Status2);
 }
 
 ErrorOr
@@ -621,14 +620,14 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       cl::Positional, cl::desc("Covered executable or object file."));
 
   cl::list<std::string> CovFilenames(
-      "object", cl::desc("Coverage executable or object file"), cl::ZeroOrMore);
+      "object", cl::desc("Coverage executable or object file"));
 
   cl::opt<bool> DebugDumpCollectedObjects(
       "dump-collected-objects", cl::Optional, cl::Hidden,
       cl::desc("Show the collected coverage object files"));
 
-  cl::list<std::string> InputSourceFiles(
-      cl::Positional, cl::desc("<SOURCES>"), cl::ZeroOrMore);
+  cl::list<std::string> InputSourceFiles(cl::Positional,
+                                         cl::desc("<SOURCES>"));
 
   cl::opt<bool> DebugDumpCollectedPaths(
       "dump-collected-paths", cl::Optional, cl::Hidden,
@@ -665,32 +664,32 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
   cl::list<std::string> NameFilters(
       "name", cl::Optional,
       cl::desc("Show code coverage only for functions with the given name"),
-      cl::ZeroOrMore, cl::cat(FilteringCategory));
 +      cl::cat(FilteringCategory));
 
   cl::list<std::string> NameFilterFiles(
       "name-allowlist", cl::Optional,
       cl::desc("Show code coverage only for functions listed in the given "
               "file"),
-      cl::ZeroOrMore, cl::cat(FilteringCategory));
+      cl::cat(FilteringCategory));
 
   // Allow for accepting previous option name.
   cl::list<std::string> NameFilterFilesDeprecated(
       "name-whitelist", cl::Optional, cl::Hidden,
       cl::desc("Show code coverage only for functions listed in the given "
               "file. Deprecated, use -name-allowlist instead"),
-      cl::ZeroOrMore, cl::cat(FilteringCategory));
+      cl::cat(FilteringCategory));
 
   cl::list<std::string> NameRegexFilters(
       "name-regex", cl::Optional,
       cl::desc("Show code coverage only for functions that match the given "
               "regular expression"),
-      cl::ZeroOrMore, cl::cat(FilteringCategory));
+      cl::cat(FilteringCategory));
 
   cl::list<std::string> IgnoreFilenameRegexFilters(
       "ignore-filename-regex", cl::Optional,
       cl::desc("Skip source code files with file paths that match the given "
               "regular expression"),
-      cl::ZeroOrMore, cl::cat(FilteringCategory));
+      cl::cat(FilteringCategory));
 
   cl::opt<double> RegionCoverageLtFilter(
       "region-coverage-lt", cl::Optional,
@@ -883,6 +882,9 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       }
       CoverageArches.emplace_back(Arch);
     }
+    if (CoverageArches.size() == 1)
+      CoverageArches.insert(CoverageArches.end(), ObjectFilenames.size() - 1,
+                            CoverageArches[0]);
     if (CoverageArches.size() != ObjectFilenames.size()) {
       error("Number of architectures doesn't match the number of objects");
       return 1;
@@ -973,6 +975,11 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
       "project-title", cl::Optional,
       cl::desc("Set project title for the coverage report"));
 
+  cl::opt<std::string> CovWatermark(
+      "coverage-watermark", cl::Optional,
+      cl::desc("<high>,<low> values indicate thresholds for high and low "
+               "coverage watermark"));
+
   auto Err = commandLineParser(argc, argv);
   if (Err)
     return Err;
@@ -982,6 +989,47 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     return 1;
   }
 
+  ViewOpts.HighCovWatermark = 100.0;
+  ViewOpts.LowCovWatermark = 80.0;
+  if (!CovWatermark.empty()) {
+    auto WaterMarkPair = StringRef(CovWatermark).split(',');
+    if (WaterMarkPair.first.empty() || WaterMarkPair.second.empty()) {
+      error("invalid argument '" + CovWatermark +
+                "', must be in format 'high,low'",
+            "-coverage-watermark");
+      return 1;
+    }
+
+    char *EndPointer = nullptr;
+    ViewOpts.HighCovWatermark =
+        strtod(WaterMarkPair.first.begin(), &EndPointer);
+    if (EndPointer != WaterMarkPair.first.end()) {
+      error("invalid number '" + WaterMarkPair.first +
+                "', invalid value for 'high'",
+            "-coverage-watermark");
+      return 1;
+    }
+
+    ViewOpts.LowCovWatermark =
+        strtod(WaterMarkPair.second.begin(), &EndPointer);
+    if (EndPointer != WaterMarkPair.second.end()) {
+      error("invalid number '" + WaterMarkPair.second +
+                "', invalid value for 'low'",
+            "-coverage-watermark");
+      return 1;
+    }
+
+    if (ViewOpts.HighCovWatermark > 100 || ViewOpts.LowCovWatermark < 0 ||
+        ViewOpts.HighCovWatermark <= ViewOpts.LowCovWatermark) {
+      error("invalid number range '" + CovWatermark +
+                "', both high and low should be between 0-100, and high "
+                "> low",
+            "-coverage-watermark");
+      return 1;
+    }
+  }
+
   ViewOpts.ShowLineNumbers = true;
   ViewOpts.ShowLineStats = ShowLineExecutionCounts.getNumOccurrences() != 0 ||
                            !ShowRegions || ShowBestLineRegionsCounts;
diff --git a/llvm/tools/llvm-cov/CoverageViewOptions.h b/llvm/tools/llvm-cov/CoverageViewOptions.h
index 045fb1787bce..c6e99819f319 100644
--- a/llvm/tools/llvm-cov/CoverageViewOptions.h
+++ b/llvm/tools/llvm-cov/CoverageViewOptions.h
@@ -50,6 +50,8 @@ struct CoverageViewOptions {
   std::string CreatedTimeStr;
   unsigned NumThreads;
   std::string CompilationDirectory;
+  float HighCovWatermark;
+  float LowCovWatermark;
 
   /// Change the output's stream color if the colors are enabled.
   ColoredRawOstream colored_ostream(raw_ostream &OS,
diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
index 56efc40b9349..46782c9b3c9a 100644
--- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
+++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
@@ -338,24 +338,24 @@ void CoveragePrinterHTML::emitFileSummary(raw_ostream &OS, StringRef SF,
   SmallVector Columns;
 
   // Format a coverage triple and add the result to the list of columns.
-  auto AddCoverageTripleToColumn = [&Columns](unsigned Hit, unsigned Total,
-                                              float Pctg) {
-    std::string S;
-    {
-      raw_string_ostream RSO{S};
-      if (Total)
-        RSO << format("%*.2f", 7, Pctg) << "% ";
-      else
-        RSO << "- ";
-      RSO << '(' << Hit << '/' << Total << ')';
-    }
-    const char *CellClass = "column-entry-yellow";
-    if (Hit == Total)
-      CellClass = "column-entry-green";
-    else if (Pctg < 80.0)
-      CellClass = "column-entry-red";
-    Columns.emplace_back(tag("td", tag("pre", S), CellClass));
-  };
+  auto AddCoverageTripleToColumn =
+      [&Columns, this](unsigned Hit, unsigned Total, float Pctg) {
+        std::string S;
+        {
+          raw_string_ostream RSO{S};
+          if (Total)
+            RSO << format("%*.2f", 7, Pctg) << "% ";
+          else
+            RSO << "- ";
+          RSO << '(' << Hit << '/' << Total << ')';
+        }
+        const char *CellClass = "column-entry-yellow";
+        if (Pctg >= Opts.HighCovWatermark)
+          CellClass = "column-entry-green";
+        else if (Pctg < Opts.LowCovWatermark)
+          CellClass = "column-entry-red";
+        Columns.emplace_back(tag("td", tag("pre", S), CellClass));
+      };
 
   // Simplify the display file path, and wrap it in a link if requested.
   std::string Filename;
@@ -538,7 +538,7 @@ void SourceCoverageViewHTML::renderLine(raw_ostream &OS, LineRef L,
   auto Highlight = [&](const std::string &Snippet, unsigned LC, unsigned RC) {
     if (getOptions().Debug)
       HighlightedRanges.emplace_back(LC, RC);
-    return tag("span", Snippet, std::string(Color.getValue()));
+    return tag("span", Snippet, std::string(*Color));
   };
 
   auto CheckIfUncovered = [&](const CoverageSegment *S) {
@@ -561,12 +561,12 @@ void SourceCoverageViewHTML::renderLine(raw_ostream &OS, LineRef L,
     else
       Color = None;
 
-    if (Color.hasValue())
+    if (Color)
       Snippets[I + 1] = Highlight(Snippets[I + 1], CurSeg->Col,
                                   CurSeg->Col + Snippets[I + 1].size());
   }
 
-  if (Color.hasValue() && Segments.empty())
+  if (Color && Segments.empty())
     Snippets.back() = Highlight(Snippets.back(), 1, 1 + Snippets.back().size());
 
   if (getOptions().Debug) {
diff --git a/llvm/tools/llvm-cov/TestingSupport.cpp b/llvm/tools/llvm-cov/TestingSupport.cpp
index 9c6b25f2f585..289a1621660b 100644
--- a/llvm/tools/llvm-cov/TestingSupport.cpp
+++ b/llvm/tools/llvm-cov/TestingSupport.cpp
@@ -12,6 +12,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include
 #include
diff --git a/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp b/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
index 1430674dbadc..02f4c8493903 100644
--- a/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -36,7 +36,7 @@ namespace opts {
 cl::OptionCategory CXXDumpCategory("CXX Dump Options");
 cl::list<std::string> InputFilenames(cl::Positional,
                                      cl::desc("<input object files>"),
-                                     cl::ZeroOrMore, cl::cat(CXXDumpCategory));
+                                     cl::cat(CXXDumpCategory));
 } // namespace opts
 
 namespace llvm {
diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td
index 93f865245fe6..f652a1a7f88b 100644
--- a/llvm/tools/llvm-cxxfilt/Opts.td
+++ b/llvm/tools/llvm-cxxfilt/Opts.td
@@ -16,7 +16,7 @@ multiclass Eq {
 def help : FF<"help", "Display this help">;
 defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore",
                            "Don't strip the leading underscore">;
-def types : FF<"types", "">;
+def types : FF<"types", "Attempt to demangle types as well as function names">;
 def version : FF<"version", "Display the version">;
 
 defm : Eq<"format", "Specify mangling format. Currently ignored because only 'gnu' is supported">;
diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
index ccfaaa96deb2..1cea9e29faa4 100644
--- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
+++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
@@ -140,7 +140,7 @@ static void demangleLine(llvm::raw_ostream &OS, StringRef Mangled, bool Split) {
   OS.flush();
 }
 
-int main(int argc, char **argv) {
+int llvm_cxxfilt_main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   BumpPtrAllocator A;
   StringSaver Saver(A);
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 7b3c3e7706a6..4996fc12ae32 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
@@ -37,7 +38,7 @@ using namespace llvm;
 
 static cl::OptionCategory DisCategory("Disassembler Options");
 
-static cl::list<std::string> InputFilenames(cl::Positional, cl::ZeroOrMore,
+static cl::list<std::string> InputFilenames(cl::Positional,
                                             cl::desc("[input bitcode]..."),
                                             cl::cat(DisCategory));
 
@@ -179,8 +180,13 @@ int main(int argc, char **argv) {
   }
 
   for (std::string InputFilename : InputFilenames) {
-    std::unique_ptr<MemoryBuffer> MB = ExitOnErr(
-        errorOrToExpected(MemoryBuffer::getFileOrSTDIN(InputFilename)));
+    ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+        MemoryBuffer::getFileOrSTDIN(InputFilename);
+    if (std::error_code EC = BufferOrErr.getError()) {
+      WithColor::error() << InputFilename << ": " << EC.message() << '\n';
+      return 1;
+    }
+    std::unique_ptr<MemoryBuffer> MB = std::move(BufferOrErr.get());
 
     BitcodeFileContents IF = ExitOnErr(llvm::getBitcodeFileContents(*MB));
 
diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp
index 5c08e43b4b09..ed92665e0483 100644
--- a/llvm/tools/llvm-dwarfdump/Statistics.cpp
+++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/JSON.h"
 
@@ -1043,14 +1044,19 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
                       LocStats.LocalVarNonEntryValLocStats);
   J.objectEnd();
   OS << '\n';
-  LLVM_DEBUG(llvm::dbgs() << "Total Availability: "
-                          << (int)std::round((VarParamWithLoc.Value * 100.0) /
+  LLVM_DEBUG(
+      llvm::dbgs() << "Total Availability: "
+                   << (VarParamTotal.Value
+                           ? (int)std::round((VarParamWithLoc.Value * 100.0) /
                                              VarParamTotal.Value)
-                          << "%\n";
-             llvm::dbgs() << "PC Ranges covered: "
-                          << (int)std::round(
+                           : 0)
+                   << "%\n";
+      llvm::dbgs() << "PC Ranges covered: "
+                   << (GlobalStats.ScopeBytes.Value
+                           ? (int)std::round(
(int)std::round(
+                                 (GlobalStats.ScopeBytesCovered.Value * 100.0) /
+                                 GlobalStats.ScopeBytes.Value)
-                          << "%\n");
+                           : 0)
+                   << "%\n");
   return true;
 }
diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 9c2ddc3867a5..f7d3052c8c4d 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -15,6 +15,8 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/MachOUniversal.h"
@@ -24,6 +26,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
@@ -119,7 +122,7 @@ using namespace cl;
 
 OptionCategory DwarfDumpCategory("Specific Options");
 static list<std::string> InputFilenames(Positional, desc("<input object files>"),
-                                        ZeroOrMore, cat(DwarfDumpCategory));
+                                        cat(DwarfDumpCategory));
 
 cl::OptionCategory SectionCategory("Section-specific Dump Options",
                                    "These control which sections are dumped. "
@@ -245,6 +248,10 @@ static cl::opt<bool>
                      cl::desc("Show the sizes of all debug sections, "
                               "expressed in bytes."),
                      cat(DwarfDumpCategory));
+static cl::opt<bool>
+    ShowSources("show-sources",
+                cl::desc("Show the sources across all compilation units."),
+                cat(DwarfDumpCategory));
 static opt<bool> Verify("verify", desc("Verify the DWARF debug info."),
                         cat(DwarfDumpCategory));
 static opt<bool> Quiet("quiet", desc("Use with -verify to not emit to STDOUT."),
@@ -464,6 +471,87 @@ static bool lookup(ObjectFile &Obj, DWARFContext &DICtx, uint64_t Address,
   return true;
 }
 
+// Collect all sources referenced from the given line table, scoped to the given
+// CU compilation directory.
+static bool collectLineTableSources(const DWARFDebugLine::LineTable &LT,
+                                    StringRef CompDir,
+                                    std::vector<std::string> &Sources) {
+  bool Result = true;
+  llvm::Optional<uint64_t> LastIndex = LT.getLastValidFileIndex();
+  for (uint64_t I = LT.hasFileAtIndex(0) ? 0 : 1,
+                E = LastIndex ? *LastIndex + 1 : 0;
+       I < E; ++I) {
+    std::string Path;
+    Result &= LT.getFileNameByIndex(
+        I, CompDir, DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
+        Path);
+    Sources.push_back(std::move(Path));
+  }
+  return Result;
+}
+
+static bool collectObjectSources(ObjectFile &Obj, DWARFContext &DICtx,
+                                 const Twine &Filename, raw_ostream &OS) {
+  bool Result = true;
+  std::vector<std::string> Sources;
+
+  bool HasCompileUnits = false;
+  for (const auto &CU : DICtx.compile_units()) {
+    HasCompileUnits = true;
+    // Extract paths from the line table for this CU. This allows combining the
+    // compilation directory with the line information, in case both the include
+    // directory and file names in the line table are relative.
+    const DWARFDebugLine::LineTable *LT = DICtx.getLineTableForUnit(CU.get());
+    StringRef CompDir = CU->getCompilationDir();
+    if (LT) {
+      Result &= collectLineTableSources(*LT, CompDir, Sources);
+    } else {
+      // Since there's no line table for this CU, collect the name from the CU
+      // itself.
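// A minimal sketch of the path combination this fallback performs, assuming
// a hypothetical CU whose DW_AT_comp_dir is "/src" and whose DW_AT_name is
// the relative path "foo.c" (the values are illustrative only):
//
//   SmallString<64> AbsName;
//   StringRef CompDir = "/src"; // CU->getCompilationDir()
//   StringRef Name = "foo.c";   // CU->getUnitDIE().getShortName()
//   if (sys::path::is_relative(Name))
//     AbsName = CompDir;              // seed with the compilation directory
//   sys::path::append(AbsName, Name); // yields "/src/foo.c"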
+ const char *Name = CU->getUnitDIE().getShortName(); + if (!Name) { + WithColor::warning() + << Filename << ": missing name for compilation unit\n"; + continue; + } + SmallString<64> AbsName; + if (sys::path::is_relative(Name, sys::path::Style::posix) && + sys::path::is_relative(Name, sys::path::Style::windows)) + AbsName = CompDir; + sys::path::append(AbsName, Name); + Sources.push_back(std::string(AbsName)); + } + } + + if (!HasCompileUnits) { + // Since there's no compile units available, walk the line tables and + // extract out any referenced paths. + DWARFDataExtractor LineData(DICtx.getDWARFObj(), + DICtx.getDWARFObj().getLineSection(), + DICtx.isLittleEndian(), 0); + DWARFDebugLine::SectionParser Parser(LineData, DICtx, DICtx.normal_units()); + while (!Parser.done()) { + const auto RecoverableErrorHandler = [&](Error Err) { + Result = false; + WithColor::defaultErrorHandler(std::move(Err)); + }; + void (*UnrecoverableErrorHandler)(Error Err) = error; + + DWARFDebugLine::LineTable LT = + Parser.parseNext(RecoverableErrorHandler, UnrecoverableErrorHandler); + Result &= collectLineTableSources(LT, /*CompDir=*/"", Sources); + } + } + + // Dedup and order the sources. + llvm::sort(Sources.begin(), Sources.end()); + Sources.erase(std::unique(Sources.begin(), Sources.end()), Sources.end()); + + for (StringRef Name : Sources) + OS << Name << "\n"; + return Result; +} + static bool dumpObjectFile(ObjectFile &Obj, DWARFContext &DICtx, const Twine &Filename, raw_ostream &OS) { logAllUnhandledErrors(DICtx.loadRegisterInfo(Obj), errs(), @@ -677,6 +765,9 @@ int main(int argc, char **argv) { } else if (ShowSectionSizes) { for (auto Object : Objects) Success &= handleFile(Object, collectObjectSectionSizes, OutputFile.os()); + } else if (ShowSources) { + for (auto Object : Objects) + Success &= handleFile(Object, collectObjectSources, OutputFile.os()); } else { for (auto Object : Objects) Success &= handleFile(Object, dumpObjectFile, OutputFile.os()); diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp index 4b6f7bc8dd34..d2d162d648c0 100644 --- a/llvm/tools/llvm-dwp/llvm-dwp.cpp +++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp @@ -19,11 +19,14 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptionsCommandFlags.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" @@ -33,13 +36,13 @@ using namespace llvm::object; static mc::RegisterMCTargetOptionsFlags MCTargetOptionsFlags; cl::OptionCategory DwpCategory("Specific Options"); -static cl::list InputFiles(cl::Positional, cl::ZeroOrMore, - cl::desc(""), - cl::cat(DwpCategory)); +static cl::list + InputFiles(cl::Positional, cl::desc(""), cl::cat(DwpCategory)); static cl::list ExecFilenames( - "e", cl::ZeroOrMore, - cl::desc("Specify the executable/library files to get the list of *.dwo from"), + "e", + cl::desc( + "Specify the executable/library files to get the list of *.dwo from"), cl::value_desc("filename"), cl::cat(DwpCategory)); static cl::opt OutputFilename(cl::Required, "o", @@ -162,7 +165,7 @@ int main(int argc, char **argv) { if (!MII) return error("no instr info info for target " + TripleName, Context); - MCCodeEmitter *MCE = 
TheTarget->createMCCodeEmitter(*MII, *MRI, MC); + MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, MC); if (!MCE) return error("no code emitter for target " + TripleName, Context); @@ -193,7 +196,7 @@ int main(int argc, char **argv) { return 1; } - MS->Finish(); + MS->finish(); OutFile.keep(); return 0; } diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp index 3cdef529504e..ffd2a390d9c3 100644 --- a/llvm/tools/llvm-extract/llvm-extract.cpp +++ b/llvm/tools/llvm-extract/llvm-extract.cpp @@ -66,8 +66,7 @@ static cl::opt // ExtractFuncs - The functions to extract from the module. static cl::list ExtractFuncs("func", cl::desc("Specify function to extract"), - cl::ZeroOrMore, cl::value_desc("function"), - cl::cat(ExtractCat)); + cl::value_desc("function"), cl::cat(ExtractCat)); // ExtractRegExpFuncs - The functions, matched via regular expression, to // extract from the module. @@ -75,8 +74,7 @@ static cl::list ExtractRegExpFuncs("rfunc", cl::desc("Specify function(s) to extract using a " "regular expression"), - cl::ZeroOrMore, cl::value_desc("rfunction"), - cl::cat(ExtractCat)); + cl::value_desc("rfunction"), cl::cat(ExtractCat)); // ExtractBlocks - The blocks to extract from the module. static cl::list ExtractBlocks( @@ -90,14 +88,12 @@ static cl::list ExtractBlocks( " --bb=f:bb1;bb2 will extract one function with both bb1 and bb2;\n" " --bb=f:bb1 --bb=f:bb2 will extract two functions, one with bb1, one " "with bb2."), - cl::ZeroOrMore, cl::value_desc("function:bb1[;bb2...]"), - cl::cat(ExtractCat)); + cl::value_desc("function:bb1[;bb2...]"), cl::cat(ExtractCat)); // ExtractAlias - The alias to extract from the module. static cl::list ExtractAliases("alias", cl::desc("Specify alias to extract"), - cl::ZeroOrMore, cl::value_desc("alias"), - cl::cat(ExtractCat)); + cl::value_desc("alias"), cl::cat(ExtractCat)); // ExtractRegExpAliases - The aliases, matched via regular expression, to // extract from the module. @@ -105,14 +101,12 @@ static cl::list ExtractRegExpAliases("ralias", cl::desc("Specify alias(es) to extract using a " "regular expression"), - cl::ZeroOrMore, cl::value_desc("ralias"), - cl::cat(ExtractCat)); + cl::value_desc("ralias"), cl::cat(ExtractCat)); // ExtractGlobals - The globals to extract from the module. static cl::list ExtractGlobals("glob", cl::desc("Specify global to extract"), - cl::ZeroOrMore, cl::value_desc("global"), - cl::cat(ExtractCat)); + cl::value_desc("global"), cl::cat(ExtractCat)); // ExtractRegExpGlobals - The globals, matched via regular expression, to // extract from the module... 
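// Note: cl::ZeroOrMore is now the default for cl::list options, so removing
// the explicit flag above does not change behavior. A minimal sketch of an
// equivalent declaration (the option name "example" is illustrative only):
//
//   static cl::list<std::string>
//       Example("example", cl::desc("Accepts zero or more values"),
//               cl::value_desc("value"), cl::cat(ExtractCat));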
@@ -120,8 +114,7 @@ static cl::list ExtractRegExpGlobals("rglob", cl::desc("Specify global(s) to extract using a " "regular expression"), - cl::ZeroOrMore, cl::value_desc("rglobal"), - cl::cat(ExtractCat)); + cl::value_desc("rglobal"), cl::cat(ExtractCat)); static cl::opt OutputAssembly("S", cl::desc("Write output as LLVM assembly"), diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index 9abe8efaa4e8..6585b193b2cb 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -48,7 +48,7 @@ static cl::list InputFilenames(cl::Positional, cl::OneOrMore, cl::cat(LinkCategory)); static cl::list OverridingInputs( - "override", cl::ZeroOrMore, cl::value_desc("filename"), + "override", cl::value_desc("filename"), cl::desc( "input bitcode file which can override previously defined symbol(s)"), cl::cat(LinkCategory)); @@ -56,7 +56,7 @@ static cl::list OverridingInputs( // Option to simulate function importing for testing. This enables using // llvm-link to simulate ThinLTO backend processes. static cl::list Imports( - "import", cl::ZeroOrMore, cl::value_desc("function:filename"), + "import", cl::value_desc("function:filename"), cl::desc("Pair of function name and filename, where function should be " "imported from bitcode in filename"), cl::cat(LinkCategory)); @@ -124,6 +124,11 @@ static cl::opt NoVerify("disable-verify", cl::desc("Do not run the verifier"), cl::Hidden, cl::cat(LinkCategory)); +static cl::opt IgnoreNonBitcode( + "ignore-non-bitcode", + cl::desc("Do not report an error for non-bitcode files in archives"), + cl::Hidden); + static ExitOnError ExitOnErr; // Read the specified bitcode file in and return it. This routine searches the @@ -164,11 +169,16 @@ static std::unique_ptr loadArFile(const char *Argv0, if (Verbose) errs() << "Reading library archive file '" << ArchiveName << "' to memory\n"; - Error Err = Error::success(); - object::Archive Archive(*Buffer, Err); - ExitOnErr(std::move(Err)); + Expected> ArchiveOrError = + object::Archive::create(Buffer->getMemBufferRef()); + if (!ArchiveOrError) + ExitOnErr(ArchiveOrError.takeError()); + + std::unique_ptr Archive = std::move(ArchiveOrError.get()); + Linker L(*Result); - for (const object::Archive::Child &C : Archive.children(Err)) { + Error Err = Error::success(); + for (const object::Archive::Child &C : Archive->children(Err)) { Expected Ename = C.getName(); if (Error E = Ename.takeError()) { errs() << Argv0 << ": "; @@ -194,6 +204,8 @@ static std::unique_ptr loadArFile(const char *Argv0, MemBuf.get().getBufferStart()), reinterpret_cast( MemBuf.get().getBufferEnd()))) { + if (IgnoreNonBitcode) + continue; errs() << Argv0 << ": "; WithColor::error() << " member of archive is not a bitcode file: '" << ChildName << "'\n"; diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index 8fc3a5d68500..c8266616b73d 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -71,7 +71,7 @@ static cl::opt OptLevel("O", cl::desc("Optimization level. 
[-O0, -O1, -O2, or -O3] " "(default = '-O2')"), - cl::Prefix, cl::ZeroOrMore, cl::init('2'), cl::cat(LTOCategory)); + cl::Prefix, cl::init('2'), cl::cat(LTOCategory)); static cl::opt IndexStats("thinlto-index-stats", @@ -210,12 +210,12 @@ static cl::opt OutputFilename("o", cl::init(""), static cl::list ExportedSymbols( "exported-symbol", cl::desc("List of symbols to export from the resulting object file"), - cl::ZeroOrMore, cl::cat(LTOCategory)); + cl::cat(LTOCategory)); static cl::list DSOSymbols("dso-symbol", cl::desc("Symbol to put in the symtab in the resulting dso"), - cl::ZeroOrMore, cl::cat(LTOCategory)); + cl::cat(LTOCategory)); static cl::opt ListSymbolsOnly( "list-symbols-only", cl::init(false), @@ -256,10 +256,6 @@ static cl::opt PrintMachOCPUOnly( cl::desc("Instead of running LTO, print the mach-o cpu in each IR file"), cl::cat(LTOCategory)); -static cl::opt UseNewPM( - "use-new-pm", cl::desc("Run LTO passes using the new pass manager"), - cl::init(LLVM_ENABLE_NEW_PASS_MANAGER), cl::Hidden, cl::cat(LTOCategory)); - static cl::opt DebugPassManager("debug-pass-manager", cl::init(false), cl::Hidden, cl::desc("Print pass management debugging information"), @@ -604,7 +600,6 @@ public: ThinGenerator.setCacheMaxSizeFiles(ThinLTOCacheMaxSizeFiles); ThinGenerator.setCacheMaxSizeBytes(ThinLTOCacheMaxSizeBytes); ThinGenerator.setFreestanding(EnableFreestanding); - ThinGenerator.setUseNewPM(UseNewPM); ThinGenerator.setDebugPassManager(DebugPassManager); // Add all the exported symbols to the table of symbols to preserve. @@ -1015,6 +1010,7 @@ int main(int argc, char **argv) { CodeGen.setCodePICModel(codegen::getExplicitRelocModel()); CodeGen.setFreestanding(EnableFreestanding); + CodeGen.setDebugPassManager(DebugPassManager); CodeGen.setDebugInfo(LTO_DEBUG_MODEL_DWARF); CodeGen.setTargetOptions(Options); @@ -1069,10 +1065,8 @@ int main(int argc, char **argv) { CodeGen.setOptLevel(OptLevel - '0'); CodeGen.setAttrs(codegen::getMAttrs()); - CodeGen.setUseNewPM(UseNewPM); - if (auto FT = codegen::getExplicitFileType()) - CodeGen.setFileType(FT.getValue()); + CodeGen.setFileType(*FT); if (!OutputFilename.empty()) { if (SaveLinkedModuleFile) { diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 7416e5850944..f79db36d2d2d 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -37,9 +37,10 @@ using namespace lto; static codegen::RegisterCodeGenFlags CGF; static cl::opt - OptLevel("O", cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] " - "(default = '-O2')"), - cl::Prefix, cl::ZeroOrMore, cl::init('2')); + OptLevel("O", + cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] " + "(default = '-O2')"), + cl::Prefix, cl::init('2')); static cl::opt CGOptLevel( "cg-opt-level", @@ -67,11 +68,23 @@ static cl::opt AAPipeline("aa-pipeline", static cl::opt SaveTemps("save-temps", cl::desc("Save temporary files")); static cl::opt - ThinLTODistributedIndexes("thinlto-distributed-indexes", cl::init(false), + ThinLTODistributedIndexes("thinlto-distributed-indexes", cl::desc("Write out individual index and " "import files for the " "distributed backend case")); +static cl::opt + ThinLTOEmitIndexes("thinlto-emit-indexes", + cl::desc("Write out individual index files via " + "InProcessThinLTO")); + +static cl::opt + ThinLTOEmitImports("thinlto-emit-imports", + cl::desc("Write out individual imports files via " + "InProcessThinLTO. 
Has no effect unless " + "specified with -thinlto-emit-indexes or " + "-thinlto-distributed-indexes")); + // Default to using all available threads in the system, but using only one // thread per core (no SMT). // Use -thinlto-threads=all to use hardware_concurrency() instead, which means @@ -89,8 +102,7 @@ static cl::list SymbolResolutions( " runtime and is known to be in this linkage unit\n" " x - externally visible: the definition of this symbol is\n" " visible outside of the LTO unit\n" - "A resolution for each symbol must be specified."), - cl::ZeroOrMore); + "A resolution for each symbol must be specified")); static cl::opt OverrideTriple( "override-triple", @@ -141,15 +153,14 @@ static cl::opt static cl::opt RunCSIRInstr("lto-cspgo-gen", cl::desc("Run PGO context sensitive IR instrumentation"), - cl::init(false), cl::Hidden); + cl::Hidden); -static cl::opt - UseNewPM("use-new-pm", - cl::desc("Run LTO passes using the new pass manager"), - cl::init(LLVM_ENABLE_NEW_PASS_MANAGER), cl::Hidden); +static cl::opt LtoOpaquePointers("lto-opaque-pointers", + cl::desc("Enable opaque pointer types"), + cl::init(true), cl::Hidden); static cl::opt - DebugPassManager("debug-pass-manager", cl::init(false), cl::Hidden, + DebugPassManager("debug-pass-manager", cl::Hidden, cl::desc("Print pass management debugging information")); static cl::opt @@ -162,7 +173,7 @@ static cl::list static cl::opt EnableFreestanding( "lto-freestanding", cl::desc("Enable Freestanding (disable builtins / TLI) during LTO"), - cl::init(false), cl::Hidden); + cl::Hidden); static void check(Error E, std::string Msg) { if (!E) @@ -242,7 +253,7 @@ static int run(int argc, char **argv) { Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple()); Conf.MAttrs = codegen::getMAttrs(); if (auto RM = codegen::getExplicitRelocModel()) - Conf.RelocModel = RM.getValue(); + Conf.RelocModel = *RM; Conf.CodeModel = codegen::getExplicitCodeModel(); Conf.DebugPassManager = DebugPassManager; @@ -267,7 +278,6 @@ static int run(int argc, char **argv) { Conf.AAPipeline = AAPipeline; Conf.OptLevel = OptLevel - '0'; - Conf.UseNewPM = UseNewPM; Conf.Freestanding = EnableFreestanding; for (auto &PluginFN : PassPlugins) Conf.PassPlugins.push_back(PluginFN); @@ -290,24 +300,27 @@ static int run(int argc, char **argv) { } if (auto FT = codegen::getExplicitFileType()) - Conf.CGFileType = FT.getValue(); + Conf.CGFileType = *FT; Conf.OverrideTriple = OverrideTriple; Conf.DefaultTriple = DefaultTriple; Conf.StatsFile = StatsFile; Conf.PTO.LoopVectorization = Conf.OptLevel > 1; Conf.PTO.SLPVectorization = Conf.OptLevel > 1; + Conf.OpaquePointers = LtoOpaquePointers; ThinBackend Backend; if (ThinLTODistributedIndexes) - Backend = createWriteIndexesThinBackend(/* OldPrefix */ "", - /* NewPrefix */ "", - /* ShouldEmitImportsFiles */ true, - /* LinkedObjectsFile */ nullptr, - /* OnWrite */ {}); + Backend = + createWriteIndexesThinBackend(/* OldPrefix */ "", + /* NewPrefix */ "", ThinLTOEmitImports, + /* LinkedObjectsFile */ nullptr, + /* OnWrite */ {}); else Backend = createInProcessThinBackend( - llvm::heavyweight_hardware_concurrency(Threads)); + llvm::heavyweight_hardware_concurrency(Threads), + /* OnWrite */ {}, ThinLTOEmitIndexes, ThinLTOEmitImports); + // Track whether we hit an error; in particular, in the multi-threaded case, // we can't exit() early because the rest of the threads wouldn't have had a // change to be join-ed, and that would result in a "terminate called without diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp 
b/llvm/tools/llvm-mc/llvm-mc.cpp
index 4e5a12e53a6b..2a525f53ec29 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -541,7 +541,7 @@ int main(int argc, char **argv) {
   // Set up the AsmStreamer.
   std::unique_ptr<MCCodeEmitter> CE;
   if (ShowEncoding)
-    CE.reset(TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx));
+    CE.reset(TheTarget->createMCCodeEmitter(*MCII, Ctx));
 
   std::unique_ptr<MCAsmBackend> MAB(
       TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
@@ -561,7 +561,7 @@ int main(int argc, char **argv) {
       OS = BOS.get();
     }
 
-    MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
+    MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, Ctx);
     MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
     Str.reset(TheTarget->createMCObjectStreamer(
         TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB),
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
index 6cdd0ba797aa..cb8e1822ee30 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -16,6 +16,7 @@
 #include "CodeRegionGenerator.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCTargetOptions.h"
@@ -62,10 +63,10 @@ public:
                     uint64_t Size = 0, unsigned ByteAlignment = 0,
                     SMLoc Loc = SMLoc()) override {}
   void emitGPRel32Value(const MCExpr *Value) override {}
-  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
-  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
-  void EmitCOFFSymbolType(int Type) override {}
-  void EndCOFFSymbolDef() override {}
+  void beginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void emitCOFFSymbolStorageClass(int StorageClass) override {}
+  void emitCOFFSymbolType(int Type) override {}
+  void endCOFFSymbolDef() override {}
 
   ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const {
     return Regions.getInstructionSequence(Index);
diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
index caa8554a416a..67b636737b97 100644
--- a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -70,7 +70,7 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
     else if (IIVDEntry.Latency < 100)
       TempStream << ' ';
 
-    if (IIVDEntry.RThroughput.hasValue()) {
+    if (IIVDEntry.RThroughput) {
       double RT = IIVDEntry.RThroughput.getValue();
       TempStream << format("%.2f", RT) << ' ';
       if (RT < 10.0)
@@ -152,7 +152,7 @@ InstructionInfoView::toJSON(const InstructionInfoViewData &IIVD) const {
                  {"mayLoad", IIVD.mayLoad},
                  {"mayStore", IIVD.mayStore},
                  {"hasUnmodeledSideEffects", IIVD.hasUnmodeledSideEffects}});
-  JO.try_emplace("RThroughput", IIVD.RThroughput.getValueOr(0.0));
+  JO.try_emplace("RThroughput", IIVD.RThroughput.value_or(0.0));
   return JO;
 }
diff --git a/llvm/tools/llvm-mca/Views/InstructionView.h b/llvm/tools/llvm-mca/Views/InstructionView.h
index cec07eef6a80..ae57246fc35f 100644
--- a/llvm/tools/llvm-mca/Views/InstructionView.h
+++ b/llvm/tools/llvm-mca/Views/InstructionView.h
@@ -17,9 +17,10 @@
 
 #include "llvm/MCA/View.h"
 #include "llvm/Support/JSON.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
+class MCInstPrinter;
+
 namespace mca {
 
 // The base class for views that deal with individual machine instructions.
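// Forward-declaring MCInstPrinter above works because this header only forms
// references and pointers to it; the full definition is needed only where
// members are actually used. A minimal sketch of the pattern, with
// illustrative names:
//
//   class Widget;          // forward declaration, no #include required
//   struct View {
//     Widget &W;           // a reference does not need Widget's definition
//     void print() const;  // defined in the .cpp, which includes Widget.h
//   };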
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp index 7a341d4c2079..06caeda344c8 100644 --- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp +++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp @@ -48,23 +48,23 @@ void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) { } else if (Event.Type == HWInstructionEvent::Dispatched) { const Instruction &Inst = *Event.IR.getInstruction(); const unsigned Index = Event.IR.getSourceIndex(); - if (LQResourceID && Inst.getDesc().MayLoad && + if (LQResourceID && Inst.getMayLoad() && MostRecentLoadDispatched != Index) { Usage[LQResourceID].SlotsInUse++; MostRecentLoadDispatched = Index; } - if (SQResourceID && Inst.getDesc().MayStore && + if (SQResourceID && Inst.getMayStore() && MostRecentStoreDispatched != Index) { Usage[SQResourceID].SlotsInUse++; MostRecentStoreDispatched = Index; } } else if (Event.Type == HWInstructionEvent::Executed) { const Instruction &Inst = *Event.IR.getInstruction(); - if (LQResourceID && Inst.getDesc().MayLoad) { + if (LQResourceID && Inst.getMayLoad()) { assert(Usage[LQResourceID].SlotsInUse); Usage[LQResourceID].SlotsInUse--; } - if (SQResourceID && Inst.getDesc().MayStore) { + if (SQResourceID && Inst.getMayStore()) { assert(Usage[SQResourceID].SlotsInUse); Usage[SQResourceID].SlotsInUse--; } diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index 1826491f3f30..409de283e5a1 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -465,6 +465,21 @@ int main(int argc, char **argv) { const MCSchedModel &SM = STI->getSchedModel(); + std::unique_ptr IPP; + if (!DisableCustomBehaviour) { + // TODO: It may be a good idea to separate CB and IPP so that they can + // be used independently of each other. What I mean by this is to add + // an extra command-line arg --disable-ipp so that CB and IPP can be + // toggled without needing to toggle both of them together. + IPP = std::unique_ptr( + TheTarget->createInstrPostProcess(*STI, *MCII)); + } + if (!IPP) { + // If the target doesn't have its own IPP implemented (or the -disable-cb + // flag is set) then we use the base class (which does nothing). + IPP = std::make_unique(*STI, *MCII); + } + // Create an instruction builder. mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get()); @@ -479,7 +494,7 @@ int main(int argc, char **argv) { unsigned RegionIdx = 0; std::unique_ptr MCE( - TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx)); + TheTarget->createMCCodeEmitter(*MCII, Ctx)); assert(MCE && "Unable to create code emitter!"); std::unique_ptr MAB(TheTarget->createMCAsmBackend( @@ -498,16 +513,7 @@ int main(int argc, char **argv) { ArrayRef Insts = Region->getInstructions(); mca::CodeEmitter CE(*STI, *MAB, *MCE, Insts); - std::unique_ptr IPP; - if (!DisableCustomBehaviour) { - IPP = std::unique_ptr( - TheTarget->createInstrPostProcess(*STI, *MCII)); - } - if (!IPP) - // If the target doesn't have its own IPP implemented (or the - // -disable-cb flag is set) then we use the base class - // (which does nothing). - IPP = std::make_unique(*STI, *MCII); + IPP->resetState(); SmallVector> LoweredSequence; for (const MCInst &MCI : Insts) { @@ -536,7 +542,8 @@ int main(int argc, char **argv) { LoweredSequence.emplace_back(std::move(Inst.get())); } - mca::SourceMgr S(LoweredSequence, PrintInstructionTables ? 1 : Iterations); + mca::CircularSourceMgr S(LoweredSequence, + PrintInstructionTables ? 
1 : Iterations); if (PrintInstructionTables) { // Create a pipeline, stages, and a printer. diff --git a/llvm/tools/llvm-modextract/llvm-modextract.cpp b/llvm/tools/llvm-modextract/llvm-modextract.cpp index b1d6bfb790ec..50f503ae0ac4 100644 --- a/llvm/tools/llvm-modextract/llvm-modextract.cpp +++ b/llvm/tools/llvm-modextract/llvm-modextract.cpp @@ -17,6 +17,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" diff --git a/llvm/tools/llvm-nm/Opts.td b/llvm/tools/llvm-nm/Opts.td index 3a790890909a..60ac134269b3 100644 --- a/llvm/tools/llvm-nm/Opts.td +++ b/llvm/tools/llvm-nm/Opts.td @@ -13,10 +13,12 @@ multiclass Eq { def : Separate<["--"], name>, Alias(NAME #_EQ)>; } +def X : JoinedOrSeparate<["-"], "X">, HelpText<"Specifies the type of ELF, XCOFF, or IR object file to examine. The value must be one of: 32, 64, 32_64, any (default)">; def debug_syms : FF<"debug-syms", "Show all symbols, even debugger only">; def defined_only : FF<"defined-only", "Show only defined symbols">; defm demangle : BB<"demangle", "Demangle C++ symbol names", "Don't demangle symbol names">; def dynamic : FF<"dynamic", "Display dynamic symbols instead of normal symbols">; +def export_symbols : FF<"export-symbols", "Export symbol list for all inputs">; def extern_only : FF<"extern-only", "Show only external symbols">; defm format : Eq<"format", "Specify output format: bsd (default), posix, sysv, darwin, just-symbols">, MetaVarName<"">; def help : FF<"help", "Display this help">; @@ -48,6 +50,11 @@ def no_dyldinfo : FF<"no-dyldinfo", "Don't add any symbols from the dyldinfo">, def s : F<"s", "Dump only symbols from this segment and section name">, Group; def x : F<"x", "Print symbol entry in hex">, Group; +// XCOFF specific options. 
+def grp_xcoff_o : OptionGroup<"kind">, HelpText<"llvm-nm XCOFF Specific Options">; + +def no_rsrc : FF<"no-rsrc", "Exclude resource file symbols (__rsrc) from the export symbol list.">, Group; + def : FF<"just-symbol-name", "Alias for --format=just-symbols">, Alias, AliasArgs<["just-symbols"]>, Flags<[HelpHidden]>; def : FF<"portability", "Alias for --format=posix">, Alias, AliasArgs<["posix"]>; @@ -70,7 +77,7 @@ def : F<"r", "Alias for --reverse-sort">, Alias; def : F<"S", "Alias for --print-size">, Alias; def : JoinedOrSeparate<["-"], "t">, HelpText<"Alias for --radix">, Alias, MetaVarName<"">; def : F<"u", "Alias for --undefined-only">, Alias; -def : F<"U", "Deprecated alias for --defined-only">, Alias, Flags<[HelpHidden]>; +def : F<"U", "Alias for --defined-only">, Alias; def : F<"v", "Alias for --numeric-sort">, Alias; def : F<"V", "Alias for --version">, Alias; -def : F<"W", "Deprecated alias for --no-weak">, Alias, Flags<[HelpHidden]>; +def : F<"W", "Alias for --no-weak">, Alias; diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index f1d8b0026429..f0def8b74e60 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -83,13 +84,16 @@ public: }; enum OutputFormatTy { bsd, sysv, posix, darwin, just_symbols }; +enum class BitModeTy { Bit32, Bit64, Bit32_64, Any }; } // namespace static bool ArchiveMap; +static BitModeTy BitMode; static bool DebugSyms; static bool DefinedOnly; static bool Demangle; static bool DynamicSyms; +static bool ExportSymbols; static bool ExternalOnly; static OutputFormatTy OutputFormat; static bool NoLLVMBitcode; @@ -105,6 +109,9 @@ static bool SizeSort; static bool UndefinedOnly; static bool WithoutAliases; +// XCOFF-specific options. +static bool NoRsrc; + namespace { enum Radix { d, o, x }; } // namespace @@ -128,7 +135,8 @@ static bool HadError = false; static StringRef ToolName; -static void warn(Error Err, Twine FileName, Twine Context = Twine()) { +static void warn(Error Err, Twine FileName, Twine Context = Twine(), + Twine Archive = Twine()) { assert(Err); // Flush the standard output so that the warning isn't interleaved with other @@ -137,8 +145,9 @@ static void warn(Error Err, Twine FileName, Twine Context = Twine()) { handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { WithColor::warning(errs(), ToolName) - << FileName << ": " << (Context.str().empty() ? "" : Context + ": ") - << EI.message() << "\n"; + << (Archive.str().empty() ? FileName : Archive + "(" + FileName + ")") + << ": " << (Context.str().empty() ? "" : Context + ": ") << EI.message() + << "\n"; }); } @@ -211,6 +220,8 @@ struct NMSymbol { StringRef SectionName; StringRef TypeName; BasicSymbolRef Sym; + StringRef Visibility; + // The Sym field above points to the native symbol in the object file, // for Mach-O when we are creating symbols from the dyld info the above // pointer is null as there is no native symbol. In these cases the fields @@ -222,40 +233,59 @@ struct NMSymbol { uint8_t NSect; uint16_t NDesc; std::string IndirectName; -}; -} // anonymous namespace -static bool compareSymbolAddress(const NMSymbol &A, const NMSymbol &B) { - bool ADefined; - // Symbol flags have been checked in the caller. 
- if (A.Sym.getRawDataRefImpl().p) { - uint32_t AFlags = cantFail(A.Sym.getFlags()); - ADefined = !(AFlags & SymbolRef::SF_Undefined); - } else { - ADefined = A.TypeChar != 'U'; + bool isDefined() const { + if (Sym.getRawDataRefImpl().p) { + uint32_t Flags = cantFail(Sym.getFlags()); + return !(Flags & SymbolRef::SF_Undefined); + } + return TypeChar != 'U'; } - bool BDefined; - // Symbol flags have been checked in the caller. - if (B.Sym.getRawDataRefImpl().p) { - uint32_t BFlags = cantFail(B.Sym.getFlags()); - BDefined = !(BFlags & SymbolRef::SF_Undefined); - } else { - BDefined = B.TypeChar != 'U'; + + bool initializeFlags(const SymbolicFile &Obj) { + Expected SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) { + // TODO: Test this error. + error(SymFlagsOrErr.takeError(), Obj.getFileName()); + return false; + } + SymFlags = *SymFlagsOrErr; + return true; } - return std::make_tuple(ADefined, A.Address, A.Name, A.Size) < - std::make_tuple(BDefined, B.Address, B.Name, B.Size); -} -static bool compareSymbolSize(const NMSymbol &A, const NMSymbol &B) { - return std::make_tuple(A.Size, A.Name, A.Address) < - std::make_tuple(B.Size, B.Name, B.Address); -} + bool shouldPrint() const { + bool Undefined = SymFlags & SymbolRef::SF_Undefined; + bool Global = SymFlags & SymbolRef::SF_Global; + bool Weak = SymFlags & SymbolRef::SF_Weak; + bool FormatSpecific = SymFlags & SymbolRef::SF_FormatSpecific; + if ((!Undefined && UndefinedOnly) || (Undefined && DefinedOnly) || + (!Global && ExternalOnly) || (Weak && NoWeakSymbols) || + (FormatSpecific && !(SpecialSyms || DebugSyms))) + return false; + return true; + } +}; -static bool compareSymbolName(const NMSymbol &A, const NMSymbol &B) { +bool operator<(const NMSymbol &A, const NMSymbol &B) { + if (NumericSort) + return std::make_tuple(A.isDefined(), A.Address, A.Name, A.Size) < + std::make_tuple(B.isDefined(), B.Address, B.Name, B.Size); + if (SizeSort) + return std::make_tuple(A.Size, A.Name, A.Address) < + std::make_tuple(B.Size, B.Name, B.Address); + if (ExportSymbols) + return std::make_tuple(A.Name, A.Visibility) < + std::make_tuple(B.Name, B.Visibility); return std::make_tuple(A.Name, A.Size, A.Address) < std::make_tuple(B.Name, B.Size, B.Address); } +bool operator>(const NMSymbol &A, const NMSymbol &B) { return B < A; } +bool operator==(const NMSymbol &A, const NMSymbol &B) { + return !(A < B) && !(B < A); +} +} // anonymous namespace + static char isSymbolList64Bit(SymbolicFile &Obj) { if (auto *IRObj = dyn_cast(&Obj)) return Triple(IRObj->getTargetTriple()).isArch64Bit(); @@ -263,7 +293,6 @@ static char isSymbolList64Bit(SymbolicFile &Obj) { return false; if (XCOFFObjectFile *XCOFFObj = dyn_cast(&Obj)) return XCOFFObj->is64Bit(); - if (isa(Obj)) return false; if (TapiFile *Tapi = dyn_cast(&Obj)) @@ -274,7 +303,6 @@ static char isSymbolList64Bit(SymbolicFile &Obj) { } static StringRef CurrentFilename; -static std::vector SymbolList; static char getSymbolNMTypeChar(IRObjectFile &Obj, basic_symbol_iterator I); @@ -658,27 +686,28 @@ static void writeFileName(raw_ostream &S, StringRef ArchiveName, } } -static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName, - StringRef ArchiveName, - StringRef ArchitectureName) { - if (!NoSort) { - using Comparator = bool (*)(const NMSymbol &, const NMSymbol &); - Comparator Cmp; - if (NumericSort) - Cmp = &compareSymbolAddress; - else if (SizeSort) - Cmp = &compareSymbolSize; - else - Cmp = &compareSymbolName; +static void sortSymbolList(std::vector &SymbolList) { + if (NoSort) + return; - if 
(ReverseSort) - llvm::sort(SymbolList, [=](const NMSymbol &A, const NMSymbol &B) -> bool { - return Cmp(B, A); - }); - else - llvm::sort(SymbolList, Cmp); + if (ReverseSort) + llvm::sort(SymbolList, std::greater<>()); + else + llvm::sort(SymbolList); +} + +static void printExportSymbolList(const std::vector &SymbolList) { + for (const NMSymbol &Sym : SymbolList) { + outs() << Sym.Name; + if (!Sym.Visibility.empty()) + outs() << ' ' << Sym.Visibility; + outs() << '\n'; } +} +static void printSymbolList(SymbolicFile &Obj, + std::vector &SymbolList, bool printName, + StringRef ArchiveName, StringRef ArchitectureName) { if (!PrintFileName) { if ((OutputFormat == bsd || OutputFormat == posix || OutputFormat == just_symbols) && @@ -725,7 +754,9 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName, } for (const NMSymbol &S : SymbolList) { - uint32_t SymFlags; + if (!S.shouldPrint()) + continue; + std::string Name = S.Name; MachOObjectFile *MachO = dyn_cast(&Obj); if (Demangle) { @@ -737,25 +768,7 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName, if (Optional Opt = Fn(S.Name)) Name = *Opt; } - if (S.Sym.getRawDataRefImpl().p) { - Expected SymFlagsOrErr = S.Sym.getFlags(); - if (!SymFlagsOrErr) { - // TODO: Test this error. - error(SymFlagsOrErr.takeError(), Obj.getFileName()); - return; - } - SymFlags = *SymFlagsOrErr; - } else - SymFlags = S.SymFlags; - bool Undefined = SymFlags & SymbolRef::SF_Undefined; - bool Global = SymFlags & SymbolRef::SF_Global; - bool Weak = SymFlags & SymbolRef::SF_Weak; - bool FormatSpecific = SymFlags & SymbolRef::SF_FormatSpecific; - if ((!Undefined && UndefinedOnly) || (Undefined && DefinedOnly) || - (!Global && ExternalOnly) || (Weak && NoWeakSymbols) || - (FormatSpecific && !(SpecialSyms || DebugSyms))) - continue; if (PrintFileName) writeFileName(outs(), ArchiveName, ArchitectureName); if ((OutputFormat == just_symbols || @@ -1141,7 +1154,7 @@ static char getNMSectionTagAndName(SymbolicFile &Obj, basic_symbol_iterator I, // getNsectForSegSect() is used to implement the Mach-O "-s segname sectname" // option to dump only those symbols from that section in a Mach-O file. -// It is called once for each Mach-O file from dumpSymbolNamesFromObject() +// It is called once for each Mach-O file from getSymbolNamesFromObject() // to get the section number for that named section from the command line // arguments. It returns the section number for that section in the Mach-O // file or zero it is not present. @@ -1163,7 +1176,7 @@ static unsigned getNsectForSegSect(MachOObjectFile *Obj) { // getNsectInMachO() is used to implement the Mach-O "-s segname sectname" // option to dump only those symbols from that section in a Mach-O file. // It is called once for each symbol in a Mach-O file from -// dumpSymbolNamesFromObject() and returns the section number for that symbol +// getSymbolNamesFromObject() and returns the section number for that symbol // if it is in a section, else it returns 0. static unsigned getNsectInMachO(MachOObjectFile &Obj, BasicSymbolRef Sym) { DataRefImpl Symb = Sym.getRawDataRefImpl(); @@ -1175,7 +1188,8 @@ static unsigned getNsectInMachO(MachOObjectFile &Obj, BasicSymbolRef Sym) { return (STE.n_type & MachO::N_TYPE) == MachO::N_SECT ? 
STE.n_sect : 0; } -static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO) { +static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO, + std::vector &SymbolList) { size_t I = SymbolList.size(); std::string ExportsNameBuffer; raw_string_ostream EOS(ExportsNameBuffer); @@ -1642,28 +1656,127 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO) { } } -static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, - StringRef ArchiveName = {}, - StringRef ArchitectureName = {}) { +static bool shouldDump(SymbolicFile &Obj) { + // The -X option is currently only implemented for XCOFF, ELF, and IR object + // files. The option isn't fundamentally impossible with other formats, just + // isn't implemented. + if (!isa(Obj) && !isa(Obj) && + !isa(Obj)) + return true; + + return isSymbolList64Bit(Obj) ? BitMode != BitModeTy::Bit32 + : BitMode != BitModeTy::Bit64; +} + +static void getXCOFFExports(XCOFFObjectFile *XCOFFObj, + std::vector &SymbolList, + StringRef ArchiveName) { + // Skip Shared object file. + if (XCOFFObj->getFlags() & XCOFF::F_SHROBJ) + return; + + for (SymbolRef Sym : XCOFFObj->symbols()) { + // There is no visibility in old 32 bit XCOFF object file interpret. + bool HasVisibilityAttr = + XCOFFObj->is64Bit() || (XCOFFObj->auxiliaryHeader32() && + (XCOFFObj->auxiliaryHeader32()->getVersion() == + XCOFF::NEW_XCOFF_INTERPRET)); + + if (HasVisibilityAttr) { + XCOFFSymbolRef XCOFFSym = XCOFFObj->toSymbolRef(Sym.getRawDataRefImpl()); + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_INTERNAL) + continue; + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_HIDDEN) + continue; + } + + Expected SymSecOrErr = Sym.getSection(); + if (!SymSecOrErr) { + warn(SymSecOrErr.takeError(), XCOFFObj->getFileName(), + "for symbol with index " + + Twine(XCOFFObj->getSymbolIndex(Sym.getRawDataRefImpl().p)), + ArchiveName); + continue; + } + section_iterator SecIter = *SymSecOrErr; + // If the symbol is not in a text or data section, it is not exported. + if (SecIter == XCOFFObj->section_end()) + continue; + if (!(SecIter->isText() || SecIter->isData() || SecIter->isBSS())) + continue; + + StringRef SymName = cantFail(Sym.getName()); + if (SymName.empty()) + continue; + if (SymName.startswith("__sinit") || SymName.startswith("__sterm") || + SymName.front() == '.' || SymName.front() == '(') + continue; + + // Check the SymName regex matching with "^__[0-9]+__". 
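// The "regex" above is implemented without a regex engine: a name matches
// "^__[0-9]+__$" iff it is longer than four characters, starts and ends with
// "__", and everything in between is a digit, so "__42__" is skipped while
// "__init__" is kept.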
+ if (SymName.size() > 4 && SymName.startswith("__") && + SymName.endswith("__")) { + if (std::all_of(SymName.begin() + 2, SymName.end() - 2, isDigit)) + continue; + } + + if (SymName == "__rsrc" && NoRsrc) + continue; + + if (SymName.startswith("__tf1")) + SymName = SymName.substr(6); + else if (SymName.startswith("__tf9")) + SymName = SymName.substr(14); + + NMSymbol S = {}; + S.Name = SymName.str(); + S.Sym = Sym; + + if (HasVisibilityAttr) { + XCOFFSymbolRef XCOFFSym = XCOFFObj->toSymbolRef(Sym.getRawDataRefImpl()); + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_PROTECTED) + S.Visibility = "protected"; + else if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_EXPORTED) + S.Visibility = "export"; + } + if (S.initializeFlags(*XCOFFObj)) + SymbolList.push_back(S); + } +} + +static Expected +getDynamicSyms(SymbolicFile &Obj) { + const auto *E = dyn_cast(&Obj); + if (!E) + return createError("File format has no dynamic symbol table"); + return E->getDynamicSymbolIterators(); +} + +// Returns false if there is error found or true otherwise. +static bool getSymbolNamesFromObject(SymbolicFile &Obj, + std::vector &SymbolList) { auto Symbols = Obj.symbols(); std::vector SymbolVersions; + if (DynamicSyms) { - const auto *E = dyn_cast(&Obj); - if (!E) { - error("File format has no dynamic symbol table", Obj.getFileName()); - return; + Expected SymbolsOrErr = + getDynamicSyms(Obj); + if (!SymbolsOrErr) { + error(SymbolsOrErr.takeError(), Obj.getFileName()); + return false; + } + Symbols = *SymbolsOrErr; + if (const auto *E = dyn_cast(&Obj)) { + if (Expected> VersionsOrErr = + E->readDynsymVersions()) + SymbolVersions = std::move(*VersionsOrErr); + else + WithColor::warning(errs(), ToolName) + << "unable to read symbol versions: " + << toString(VersionsOrErr.takeError()) << "\n"; } - Symbols = E->getDynamicSymbolIterators(); - - if (Expected> VersionsOrErr = - E->readDynsymVersions()) - SymbolVersions = std::move(*VersionsOrErr); - else - WithColor::warning(errs(), ToolName) - << "unable to read symbol versions: " - << toString(VersionsOrErr.takeError()) << "\n"; } - // If a "-s segname sectname" option was specified and this is a Mach-O // file get the section number for that section in this object file. unsigned int Nsect = 0; @@ -1672,8 +1785,9 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, Nsect = getNsectForSegSect(MachO); // If this section is not in the object file no symbols are printed. if (Nsect == 0) - return; + return false; } + if (!(MachO && DyldInfoOnly)) { size_t I = -1; for (BasicSymbolRef Sym : Symbols) { @@ -1681,7 +1795,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, Expected SymFlagsOrErr = Sym.getFlags(); if (!SymFlagsOrErr) { error(SymFlagsOrErr.takeError(), Obj.getFileName()); - return; + return false; } // Don't drop format specifc symbols for ARM and AArch64 ELF targets, they @@ -1734,7 +1848,8 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, (SymbolVersions[I].IsVerDef ? "@@" : "@") + SymbolVersions[I].Name; S.Sym = Sym; - SymbolList.push_back(S); + if (S.initializeFlags(Obj)) + SymbolList.push_back(S); } } @@ -1745,16 +1860,66 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, // language symbols for example. The option -only-dyldinfo will fake up // all symbols from the dyld export trie as well as the bind info. 
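// A minimal sketch of walking the dyld export trie that these faked-up
// symbols are derived from, assuming a valid MachOObjectFile `MachO` (error
// handling abbreviated):
//
//   Error Err = Error::success();
//   for (const object::ExportEntry &Entry : MachO.exports(Err))
//     outs() << Entry.name() << " @ " << Entry.address() << '\n';
//   if (Err)
//     report_fatal_error(std::move(Err));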
if (MachO && !NoDyldInfo) - dumpSymbolsFromDLInfoMachO(*MachO); + dumpSymbolsFromDLInfoMachO(*MachO, SymbolList); + return true; +} + +static void printObjectLabel(bool PrintArchiveName, StringRef ArchiveName, + StringRef ArchitectureName, + StringRef ObjectFileName) { + outs() << "\n"; + if (ArchiveName.empty() || !PrintArchiveName) + outs() << ObjectFileName; + else + outs() << ArchiveName << "(" << ObjectFileName << ")"; + if (!ArchitectureName.empty()) + outs() << " (for architecture " << ArchitectureName << ")"; + outs() << ":\n"; +} + +static Expected hasSymbols(SymbolicFile &Obj) { + if (DynamicSyms) { + Expected DynamicSymsOrErr = + getDynamicSyms(Obj); + if (!DynamicSymsOrErr) + return DynamicSymsOrErr.takeError(); + return !DynamicSymsOrErr->empty(); + } + return !Obj.symbols().empty(); +} + +static void dumpSymbolNamesFromObject( + SymbolicFile &Obj, std::vector &SymbolList, + bool PrintSymbolObject, bool PrintObjectLabel, StringRef ArchiveName = {}, + StringRef ArchitectureName = {}, StringRef ObjectName = {}, + bool PrintArchiveName = true) { + if (!shouldDump(Obj)) + return; + + if (ExportSymbols && Obj.isXCOFF()) { + XCOFFObjectFile *XCOFFObj = cast(&Obj); + getXCOFFExports(XCOFFObj, SymbolList, ArchiveName); + return; + } + + if (PrintObjectLabel && !ExportSymbols) + printObjectLabel(PrintArchiveName, ArchiveName, ArchitectureName, + ObjectName.empty() ? Obj.getFileName() : ObjectName); + if (!getSymbolNamesFromObject(Obj, SymbolList) || ExportSymbols) + return; CurrentFilename = Obj.getFileName(); - if (Symbols.empty() && SymbolList.empty() && !Quiet) { + // If there is an error in hasSymbols(), the error should be encountered in + // function getSymbolNamesFromObject first. + if (!cantFail(hasSymbols(Obj)) && SymbolList.empty() && !Quiet) { writeFileName(errs(), ArchiveName, ArchitectureName); errs() << "no symbols\n"; } - sortAndPrintSymbolList(Obj, printName, ArchiveName, ArchitectureName); + sortSymbolList(SymbolList); + printSymbolList(Obj, SymbolList, PrintSymbolObject, ArchiveName, + ArchitectureName); } // checkMachOAndArchFlags() checks to see if the SymbolicFile is a Mach-O file @@ -1762,7 +1927,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, // check to make sure this Mach-O file is one of those architectures or all // architectures was specificed. If not then an error is generated and this // routine returns false. Else it returns true. -static bool checkMachOAndArchFlags(SymbolicFile *O, std::string &Filename) { +static bool checkMachOAndArchFlags(SymbolicFile *O, StringRef Filename) { auto *MachO = dyn_cast(O); if (!MachO || ArchAll || ArchFlags.empty()) @@ -1789,282 +1954,172 @@ static bool checkMachOAndArchFlags(SymbolicFile *O, std::string &Filename) { return true; } -static void dumpSymbolNamesFromFile(std::string &Filename) { - ErrorOr> BufferOrErr = - MemoryBuffer::getFileOrSTDIN(Filename); - if (error(BufferOrErr.getError(), Filename)) - return; - - LLVMContext Context; - LLVMContext *ContextPtr = NoLLVMBitcode ? 
nullptr : &Context; - Expected> BinaryOrErr = - createBinary(BufferOrErr.get()->getMemBufferRef(), ContextPtr); - if (!BinaryOrErr) { - error(BinaryOrErr.takeError(), Filename); - return; - } - Binary &Bin = *BinaryOrErr.get(); - - if (Archive *A = dyn_cast(&Bin)) { - if (ArchiveMap) { - Archive::symbol_iterator I = A->symbol_begin(); - Archive::symbol_iterator E = A->symbol_end(); - if (I != E) { - outs() << "Archive map\n"; - for (; I != E; ++I) { - Expected C = I->getMember(); - if (!C) { - error(C.takeError(), Filename); - break; - } - Expected FileNameOrErr = C->getName(); - if (!FileNameOrErr) { - error(FileNameOrErr.takeError(), Filename); - break; - } - StringRef SymName = I->getName(); - outs() << SymName << " in " << FileNameOrErr.get() << "\n"; - } - outs() << "\n"; +static void dumpArchiveMap(Archive *A, StringRef Filename) { + Archive::symbol_iterator I = A->symbol_begin(); + Archive::symbol_iterator E = A->symbol_end(); + if (I != E) { + outs() << "Archive map\n"; + for (; I != E; ++I) { + Expected C = I->getMember(); + if (!C) { + error(C.takeError(), Filename); + break; } + Expected FileNameOrErr = C->getName(); + if (!FileNameOrErr) { + error(FileNameOrErr.takeError(), Filename); + break; + } + StringRef SymName = I->getName(); + outs() << SymName << " in " << FileNameOrErr.get() << "\n"; } + outs() << "\n"; + } +} - { - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - error(std::move(E), Filename, C); - continue; - } - if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { - if (!MachOPrintSizeWarning && PrintSize && isa(O)) { - WithColor::warning(errs(), ToolName) - << "sizes with -print-size for Mach-O files are always zero.\n"; - MachOPrintSizeWarning = true; - } - if (!checkMachOAndArchFlags(O, Filename)) - return; - if (!PrintFileName) { - outs() << "\n"; - if (isa(O)) { - outs() << Filename << "(" << O->getFileName() << ")"; - } else - outs() << O->getFileName(); - outs() << ":\n"; - } - dumpSymbolNamesFromObject(*O, false, Filename); - } +static void dumpArchive(Archive *A, std::vector &SymbolList, + StringRef Filename, LLVMContext *ContextPtr) { + if (ArchiveMap) + dumpArchiveMap(A, Filename); + + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C); + continue; + } + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + if (!MachOPrintSizeWarning && PrintSize && isa(O)) { + WithColor::warning(errs(), ToolName) + << "sizes with -print-size for Mach-O files are always zero.\n"; + MachOPrintSizeWarning = true; } - if (Err) - error(std::move(Err), A->getFileName()); + if (!checkMachOAndArchFlags(O, Filename)) + return; + dumpSymbolNamesFromObject(*O, SymbolList, /*PrintSymbolObject=*/false, + !PrintFileName, Filename, + /*ArchitectureName=*/{}, O->getFileName(), + /*PrintArchiveName=*/false); } - return; } - if (MachOUniversalBinary *UB = dyn_cast(&Bin)) { - // If we have a list of architecture flags specified dump only those. - if (!ArchAll && !ArchFlags.empty()) { - // Look for a slice in the universal binary that matches each ArchFlag. 
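// A minimal sketch of the per-flag slice matching that is being moved into
// the dumpMachOUniversalBinaryMatchArchFlags helper below, assuming a
// MachOUniversalBinary `UB` and a single flag `Flag` (names illustrative):
//
//   bool Found = false;
//   for (const MachOUniversalBinary::ObjectForArch &O : UB.objects())
//     if (O.getArchFlagName() == Flag) {
//       Found = true; // dump the symbols of this slice
//       break;
//     }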
- bool ArchFound; - for (unsigned i = 0; i < ArchFlags.size(); ++i) { - ArchFound = false; - for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), - E = UB->end_objects(); - I != E; ++I) { - if (ArchFlags[i] == I->getArchFlagName()) { - ArchFound = true; - Expected> ObjOrErr = - I->getAsObjectFile(); - std::string ArchiveName; - std::string ArchitectureName; - ArchiveName.clear(); - ArchitectureName.clear(); - if (ObjOrErr) { - ObjectFile &Obj = *ObjOrErr.get(); - if (ArchFlags.size() > 1) { - if (PrintFileName) - ArchitectureName = I->getArchFlagName(); - else - outs() << "\n" << Obj.getFileName() << " (for architecture " - << I->getArchFlagName() << ")" - << ":\n"; + if (Err) + error(std::move(Err), A->getFileName()); +} + +static void dumpMachOUniversalBinaryMatchArchFlags( + MachOUniversalBinary *UB, std::vector &SymbolList, + StringRef Filename, LLVMContext *ContextPtr) { + // Look for a slice in the universal binary that matches each ArchFlag. + bool ArchFound; + for (unsigned i = 0; i < ArchFlags.size(); ++i) { + ArchFound = false; + for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), + E = UB->end_objects(); + I != E; ++I) { + if (ArchFlags[i] == I->getArchFlagName()) { + ArchFound = true; + Expected> ObjOrErr = I->getAsObjectFile(); + std::string ArchiveName; + std::string ArchitectureName; + ArchiveName.clear(); + ArchitectureName.clear(); + if (ObjOrErr) { + ObjectFile &Obj = *ObjOrErr.get(); + if (ArchFlags.size() > 1) + ArchitectureName = I->getArchFlagName(); + dumpSymbolNamesFromObject(Obj, SymbolList, + /*PrintSymbolObject=*/false, + (ArchFlags.size() > 1) && !PrintFileName, + ArchiveName, ArchitectureName); + } else if (auto E = + isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, + ArchFlags.size() > 1 ? StringRef(I->getArchFlagName()) + : StringRef()); + continue; + } else if (Expected> AOrErr = + I->getAsArchive()) { + std::unique_ptr &A = *AOrErr; + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = + C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = + isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) { + error(std::move(E), Filename, C, + ArchFlags.size() > 1 ? StringRef(I->getArchFlagName()) + : StringRef()); } - dumpSymbolNamesFromObject(Obj, false, ArchiveName, - ArchitectureName); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename, ArchFlags.size() > 1 ? - StringRef(I->getArchFlagName()) : StringRef()); continue; - } else if (Expected> AOrErr = - I->getAsArchive()) { - std::unique_ptr &A = *AOrErr; - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) { - error(std::move(E), Filename, C, ArchFlags.size() > 1 ? 
- StringRef(I->getArchFlagName()) : StringRef()); - } - continue; - } - if (SymbolicFile *O = - dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) { - ArchiveName = std::string(A->getFileName()); - if (ArchFlags.size() > 1) - ArchitectureName = I->getArchFlagName(); - } else { - outs() << "\n" << A->getFileName(); - outs() << "(" << O->getFileName() << ")"; - if (ArchFlags.size() > 1) { - outs() << " (for architecture " << I->getArchFlagName() - << ")"; - } - outs() << ":\n"; - } - dumpSymbolNamesFromObject(*O, false, ArchiveName, - ArchitectureName); - } - } - if (Err) - error(std::move(Err), A->getFileName()); - } else { - consumeError(AOrErr.takeError()); - error(Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file", - "Mach-O universal file"); } - } - } - if (!ArchFound) { - error(ArchFlags[i], - "file: " + Filename + " does not contain architecture"); - return; - } - } - return; - } - // No architecture flags were specified so if this contains a slice that - // matches the host architecture dump only that. - if (!ArchAll) { - Triple HostTriple = MachOObjectFile::getHostArch(); - StringRef HostArchName = HostTriple.getArchName(); - for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), - E = UB->end_objects(); - I != E; ++I) { - if (HostArchName == I->getArchFlagName()) { - Expected> ObjOrErr = I->getAsObjectFile(); - std::string ArchiveName; - if (ObjOrErr) { - ObjectFile &Obj = *ObjOrErr.get(); - dumpSymbolNamesFromObject(Obj, false); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename); - return; - } else if (Expected> AOrErr = - I->getAsArchive()) { - std::unique_ptr &A = *AOrErr; - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) - error(std::move(E), Filename, C); - continue; - } - if (SymbolicFile *O = - dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) - ArchiveName = std::string(A->getFileName()); - else - outs() << "\n" << A->getFileName() << "(" << O->getFileName() - << ")" - << ":\n"; - dumpSymbolNamesFromObject(*O, false, ArchiveName); - } + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + ArchiveName = std::string(A->getFileName()); + if (ArchFlags.size() > 1) + ArchitectureName = I->getArchFlagName(); + dumpSymbolNamesFromObject( + *O, SymbolList, /*PrintSymbolObject=*/false, !PrintFileName, + ArchiveName, ArchitectureName); } - if (Err) - error(std::move(Err), A->getFileName()); - } else { - consumeError(AOrErr.takeError()); - error(Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file", - "Mach-O universal file"); } - return; + if (Err) + error(std::move(Err), A->getFileName()); + } else { + consumeError(AOrErr.takeError()); + error(Filename + " for architecture " + + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file", + "Mach-O universal file"); } } } - // Either all architectures have been specified or none have been specified - // and this does not contain the host architecture so dump all the slices. 
- bool moreThanOneArch = UB->getNumberOfObjects() > 1; - for (const MachOUniversalBinary::ObjectForArch &O : UB->objects()) { - Expected> ObjOrErr = O.getAsObjectFile(); + if (!ArchFound) { + error(ArchFlags[i], + "file: " + Filename + " does not contain architecture"); + return; + } + } +} + +// Returns true If the binary contains a slice that matches the host +// architecture, or false otherwise. +static bool dumpMachOUniversalBinaryMatchHost(MachOUniversalBinary *UB, + std::vector &SymbolList, + StringRef Filename, + LLVMContext *ContextPtr) { + Triple HostTriple = MachOObjectFile::getHostArch(); + StringRef HostArchName = HostTriple.getArchName(); + for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), + E = UB->end_objects(); + I != E; ++I) { + if (HostArchName == I->getArchFlagName()) { + Expected> ObjOrErr = I->getAsObjectFile(); std::string ArchiveName; - std::string ArchitectureName; - ArchiveName.clear(); - ArchitectureName.clear(); if (ObjOrErr) { ObjectFile &Obj = *ObjOrErr.get(); - if (PrintFileName) { - if (isa(Obj) && moreThanOneArch) - ArchitectureName = O.getArchFlagName(); - } else { - if (moreThanOneArch) - outs() << "\n"; - outs() << Obj.getFileName(); - if (isa(Obj) && moreThanOneArch) - outs() << " (for architecture " << O.getArchFlagName() << ")"; - outs() << ":\n"; - } - dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename, moreThanOneArch ? - StringRef(O.getArchFlagName()) : StringRef()); - continue; - } else if (Expected> AOrErr = - O.getAsArchive()) { + dumpSymbolNamesFromObject(Obj, SymbolList, /*PrintSymbolObject=*/false, + /*PrintObjectLabel=*/false); + } else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) + error(std::move(E), Filename); + else if (Expected> AOrErr = I->getAsArchive()) { std::unique_ptr &A = *AOrErr; Error Err = Error::success(); for (auto &C : A->children(Err)) { Expected> ChildOrErr = - C.getAsBinary(ContextPtr); + C.getAsBinary(ContextPtr); if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) - error(std::move(E), Filename, C, moreThanOneArch ? 
- StringRef(ArchitectureName) : StringRef()); + if (auto E = + isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C); continue; } - if (SymbolicFile *F = dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) { - ArchiveName = std::string(A->getFileName()); - if (isa(F) && moreThanOneArch) - ArchitectureName = O.getArchFlagName(); - } else { - outs() << "\n" << A->getFileName(); - if (isa(F)) { - outs() << "(" << F->getFileName() << ")"; - if (moreThanOneArch) - outs() << " (for architecture " << O.getArchFlagName() - << ")"; - } else - outs() << ":" << F->getFileName(); - outs() << ":\n"; - } - dumpSymbolNamesFromObject(*F, false, ArchiveName, ArchitectureName); + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + ArchiveName = std::string(A->getFileName()); + dumpSymbolNamesFromObject(*O, SymbolList, + /*PrintSymbolObject=*/false, + !PrintFileName, ArchiveName); } } if (Err) @@ -2072,49 +2127,176 @@ static void dumpSymbolNamesFromFile(std::string &Filename) { } else { consumeError(AOrErr.takeError()); error(Filename + " for architecture " + - StringRef(O.getArchFlagName()) + - " is not a Mach-O file or an archive file", + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file", "Mach-O universal file"); } + return true; } - return; } + return false; +} - if (TapiUniversal *TU = dyn_cast(&Bin)) { - for (const TapiUniversal::ObjectForArch &I : TU->objects()) { - StringRef ArchName = I.getArchFlagName(); - const bool ShowArch = - ArchFlags.empty() || llvm::is_contained(ArchFlags, ArchName); - if (!ShowArch) - continue; - if (!AddInlinedInfo && !I.isTopLevelLib()) - continue; - if (auto ObjOrErr = I.getAsObjectFile()) { - outs() << "\n" - << I.getInstallName() << " (for architecture " << ArchName << ")" - << ":\n"; - dumpSymbolNamesFromObject(*ObjOrErr.get(), false, {}, ArchName); - } else if (Error E = - isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { - error(std::move(E), Filename, ArchName); +static void dumpMachOUniversalBinaryArchAll(MachOUniversalBinary *UB, + std::vector &SymbolList, + StringRef Filename, + LLVMContext *ContextPtr) { + bool moreThanOneArch = UB->getNumberOfObjects() > 1; + for (const MachOUniversalBinary::ObjectForArch &O : UB->objects()) { + Expected> ObjOrErr = O.getAsObjectFile(); + std::string ArchiveName; + std::string ArchitectureName; + ArchiveName.clear(); + ArchitectureName.clear(); + if (ObjOrErr) { + ObjectFile &Obj = *ObjOrErr.get(); + if (isa(Obj) && moreThanOneArch) + ArchitectureName = O.getArchFlagName(); + dumpSymbolNamesFromObject(Obj, SymbolList, /*PrintSymbolObject=*/false, + !PrintFileName, ArchiveName, ArchitectureName); + } else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, + moreThanOneArch ? StringRef(O.getArchFlagName()) : StringRef()); + continue; + } else if (Expected> AOrErr = O.getAsArchive()) { + std::unique_ptr &A = *AOrErr; + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = + C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C, + moreThanOneArch ? 
StringRef(ArchitectureName) : StringRef()); + continue; + } + if (SymbolicFile *F = dyn_cast(&*ChildOrErr.get())) { + ArchiveName = std::string(A->getFileName()); + if (isa(F) && moreThanOneArch) + ArchitectureName = O.getArchFlagName(); + dumpSymbolNamesFromObject(*F, SymbolList, /*PrintSymbolObject=*/false, + !PrintFileName, ArchiveName, + ArchitectureName); + } } + if (Err) + error(std::move(Err), A->getFileName()); + } else { + consumeError(AOrErr.takeError()); + error(Filename + " for architecture " + StringRef(O.getArchFlagName()) + + " is not a Mach-O file or an archive file", + "Mach-O universal file"); } + } +} +static void dumpMachOUniversalBinary(MachOUniversalBinary *UB, + std::vector &SymbolList, + StringRef Filename, + LLVMContext *ContextPtr) { + // If we have a list of architecture flags specified dump only those. + if (!ArchAll && !ArchFlags.empty()) { + dumpMachOUniversalBinaryMatchArchFlags(UB, SymbolList, Filename, + ContextPtr); return; } - if (SymbolicFile *O = dyn_cast(&Bin)) { - if (!MachOPrintSizeWarning && PrintSize && isa(O)) { - WithColor::warning(errs(), ToolName) - << "sizes with --print-size for Mach-O files are always zero.\n"; - MachOPrintSizeWarning = true; + // No architecture flags were specified so if this contains a slice that + // matches the host architecture dump only that. + if (!ArchAll && + dumpMachOUniversalBinaryMatchHost(UB, SymbolList, Filename, ContextPtr)) + return; + + // Either all architectures have been specified or none have been specified + // and this does not contain the host architecture so dump all the slices. + dumpMachOUniversalBinaryArchAll(UB, SymbolList, Filename, ContextPtr); +} + +static void dumpTapiUniversal(TapiUniversal *TU, + std::vector &SymbolList, + StringRef Filename) { + for (const TapiUniversal::ObjectForArch &I : TU->objects()) { + StringRef ArchName = I.getArchFlagName(); + const bool ShowArch = + ArchFlags.empty() || llvm::is_contained(ArchFlags, ArchName); + if (!ShowArch) + continue; + if (!AddInlinedInfo && !I.isTopLevelLib()) + continue; + if (auto ObjOrErr = I.getAsObjectFile()) + dumpSymbolNamesFromObject( + *ObjOrErr.get(), SymbolList, /*PrintSymbolObject=*/false, + /*PrintObjectLabel=*/true, + /*ArchiveName=*/{}, ArchName, I.getInstallName()); + else if (Error E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, ArchName); } - if (!checkMachOAndArchFlags(O, Filename)) - return; - dumpSymbolNamesFromObject(*O, true); } } +static void dumpSymbolicFile(SymbolicFile *O, std::vector &SymbolList, + StringRef Filename) { + if (!MachOPrintSizeWarning && PrintSize && isa(O)) { + WithColor::warning(errs(), ToolName) + << "sizes with --print-size for Mach-O files are always zero.\n"; + MachOPrintSizeWarning = true; + } + if (!checkMachOAndArchFlags(O, Filename)) + return; + dumpSymbolNamesFromObject(*O, SymbolList, /*PrintSymbolObject=*/true, + /*PrintObjectLabel=*/false); +} + +static std::vector dumpSymbolNamesFromFile(StringRef Filename) { + std::vector SymbolList; + ErrorOr> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (error(BufferOrErr.getError(), Filename)) + return SymbolList; + + // Always enable opaque pointers, to handle archives with mixed typed and + // opaque pointer bitcode files gracefully. As we're only reading symbols, + // the used pointer types don't matter. + LLVMContext Context; + Context.setOpaquePointers(true); + LLVMContext *ContextPtr = NoLLVMBitcode ? 
nullptr : &Context; + Expected> BinaryOrErr = + createBinary(BufferOrErr.get()->getMemBufferRef(), ContextPtr); + if (!BinaryOrErr) { + error(BinaryOrErr.takeError(), Filename); + return SymbolList; + } + Binary &Bin = *BinaryOrErr.get(); + if (Archive *A = dyn_cast(&Bin)) + dumpArchive(A, SymbolList, Filename, ContextPtr); + else if (MachOUniversalBinary *UB = dyn_cast(&Bin)) + dumpMachOUniversalBinary(UB, SymbolList, Filename, ContextPtr); + else if (TapiUniversal *TU = dyn_cast(&Bin)) + dumpTapiUniversal(TU, SymbolList, Filename); + else if (SymbolicFile *O = dyn_cast(&Bin)) + dumpSymbolicFile(O, SymbolList, Filename); + return SymbolList; +} + +static void +exportSymbolNamesFromFiles(const std::vector &InputFilenames) { + std::vector SymbolList; + for (const auto &FileName : InputFilenames) { + std::vector FileSymList = dumpSymbolNamesFromFile(FileName); + SymbolList.insert(SymbolList.end(), FileSymList.begin(), FileSymList.end()); + } + + // Delete symbols which should not be printed from SymolList. + SymbolList.erase( + llvm::remove_if(SymbolList, + [](const NMSymbol &s) { return !s.shouldPrint(); }), + SymbolList.end()); + sortSymbolList(SymbolList); + SymbolList.erase(std::unique(SymbolList.begin(), SymbolList.end()), + SymbolList.end()); + printExportSymbolList(SymbolList); +} + int main(int argc, char **argv) { InitLLVM X(argc, argv); BumpPtrAllocator A; @@ -2169,6 +2351,12 @@ int main(int argc, char **argv) { PrintFileName = Args.hasArg(OPT_print_file_name); PrintSize = Args.hasArg(OPT_print_size); ReverseSort = Args.hasArg(OPT_reverse_sort); + ExportSymbols = Args.hasArg(OPT_export_symbols); + if (ExportSymbols) { + ExternalOnly = true; + DefinedOnly = true; + } + Quiet = Args.hasArg(OPT_quiet); V = Args.getLastArgValue(OPT_radix_EQ, "x"); if (V == "o") @@ -2185,6 +2373,18 @@ int main(int argc, char **argv) { UndefinedOnly = Args.hasArg(OPT_undefined_only); WithoutAliases = Args.hasArg(OPT_without_aliases); + StringRef Mode = Args.getLastArgValue(OPT_X, "any"); + if (Mode == "32") + BitMode = BitModeTy::Bit32; + else if (Mode == "64") + BitMode = BitModeTy::Bit64; + else if (Mode == "32_64") + BitMode = BitModeTy::Bit32_64; + else if (Mode == "any") + BitMode = BitModeTy::Any; + else + error("-X value should be one of: 32, 64, 32_64, (default) any"); + // Mach-O specific options. FormatMachOasHex = Args.hasArg(OPT_x); AddDyldInfo = Args.hasArg(OPT_add_dyldinfo); @@ -2192,6 +2392,9 @@ int main(int argc, char **argv) { DyldInfoOnly = Args.hasArg(OPT_dyldinfo_only); NoDyldInfo = Args.hasArg(OPT_no_dyldinfo); + // XCOFF specific options. + NoRsrc = Args.hasArg(OPT_no_rsrc); + // llvm-nm only reads binary files. 
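
// A minimal sketch, on plain strings, of the filter/sort/unique pipeline that
// exportSymbolNamesFromFiles above applies to NMSymbol: llvm-nm uses its own
// shouldPrint() predicate, comparator, and printer, but the shape is the same.
#include <algorithm>
#include <string>
#include <vector>

static std::vector<std::string> exportList(std::vector<std::string> Syms) {
  // Drop entries that should not be printed.
  Syms.erase(std::remove_if(Syms.begin(), Syms.end(),
                            [](const std::string &S) { return S.empty(); }),
             Syms.end());
  // Sort so duplicates from different input files become adjacent...
  std::sort(Syms.begin(), Syms.end());
  // ...because std::unique only collapses *adjacent* duplicates.
  Syms.erase(std::unique(Syms.begin(), Syms.end()), Syms.end());
  return Syms;
}
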
if (error(sys::ChangeStdinToBinary())) return 1; @@ -2249,7 +2452,10 @@ int main(int argc, char **argv) { if (NoDyldInfo && (AddDyldInfo || DyldInfoOnly)) error("--no-dyldinfo can't be used with --add-dyldinfo or --dyldinfo-only"); - llvm::for_each(InputFilenames, dumpSymbolNamesFromFile); + if (ExportSymbols) + exportSymbolNamesFromFiles(InputFilenames); + else + llvm::for_each(InputFilenames, dumpSymbolNamesFromFile); if (HadError) return 1; diff --git a/llvm/tools/llvm-objcopy/BitcodeStripOpts.td b/llvm/tools/llvm-objcopy/BitcodeStripOpts.td index cc178164b03c..21db854b1e6f 100644 --- a/llvm/tools/llvm-objcopy/BitcodeStripOpts.td +++ b/llvm/tools/llvm-objcopy/BitcodeStripOpts.td @@ -17,8 +17,14 @@ def help : Flag<["--"], "help">; def h : Flag<["-"], "h">, Alias; def version : Flag<["--"], "version">, - HelpText<"Print the version and exit.">; + HelpText<"Print the version and exit">; def V : Flag<["-"], "V">, Alias, HelpText<"Alias for --version">; + +def remove : Flag<["-"], "r">, + HelpText<"Remove the __LLVM bitcode segment entirely">; + +def output : JoinedOrSeparate<["-"], "o">, HelpText<"Write output to ">, + MetaVarName<"">; diff --git a/llvm/tools/llvm-objcopy/COFF/COFFConfig.h b/llvm/tools/llvm-objcopy/COFF/COFFConfig.h deleted file mode 100644 index 7bf673fa4af9..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/COFFConfig.h +++ /dev/null @@ -1,27 +0,0 @@ -//===- COFFConfig.h ---------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H - -#include "llvm/ADT/Optional.h" - -namespace llvm { -namespace objcopy { - -// Coff specific configuration for copying/stripping a single file. -struct COFFConfig { - Optional Subsystem; - Optional MajorSubsystemVersion; - Optional MinorSubsystemVersion; -}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp deleted file mode 100644 index e0039cd3a675..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp +++ /dev/null @@ -1,297 +0,0 @@ -//===- COFFObjcopy.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "COFFObjcopy.h" -#include "COFFConfig.h" -#include "CommonConfig.h" -#include "Object.h" -#include "Reader.h" -#include "Writer.h" - -#include "llvm/Object/Binary.h" -#include "llvm/Object/COFF.h" -#include "llvm/Support/CRC.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Path.h" -#include - -namespace llvm { -namespace objcopy { -namespace coff { - -using namespace object; -using namespace COFF; - -static bool isDebugSection(const Section &Sec) { - return Sec.Name.startswith(".debug"); -} - -static uint64_t getNextRVA(const Object &Obj) { - if (Obj.getSections().empty()) - return 0; - const Section &Last = Obj.getSections().back(); - return alignTo(Last.Header.VirtualAddress + Last.Header.VirtualSize, - Obj.IsPE ? Obj.PeHeader.SectionAlignment : 1); -} - -static Expected> -createGnuDebugLinkSectionContents(StringRef File) { - ErrorOr> LinkTargetOrErr = - MemoryBuffer::getFile(File); - if (!LinkTargetOrErr) - return createFileError(File, LinkTargetOrErr.getError()); - auto LinkTarget = std::move(*LinkTargetOrErr); - uint32_t CRC32 = llvm::crc32(arrayRefFromStringRef(LinkTarget->getBuffer())); - - StringRef FileName = sys::path::filename(File); - size_t CRCPos = alignTo(FileName.size() + 1, 4); - std::vector Data(CRCPos + 4); - memcpy(Data.data(), FileName.data(), FileName.size()); - support::endian::write32le(Data.data() + CRCPos, CRC32); - return Data; -} - -// Adds named section with given contents to the object. -static void addSection(Object &Obj, StringRef Name, ArrayRef Contents, - uint32_t Characteristics) { - bool NeedVA = Characteristics & (IMAGE_SCN_MEM_EXECUTE | IMAGE_SCN_MEM_READ | - IMAGE_SCN_MEM_WRITE); - - Section Sec; - Sec.setOwnedContents(Contents); - Sec.Name = Name; - Sec.Header.VirtualSize = NeedVA ? Sec.getContents().size() : 0u; - Sec.Header.VirtualAddress = NeedVA ? getNextRVA(Obj) : 0u; - Sec.Header.SizeOfRawData = - NeedVA ? alignTo(Sec.Header.VirtualSize, - Obj.IsPE ? Obj.PeHeader.FileAlignment : 1) - : Sec.getContents().size(); - // Sec.Header.PointerToRawData is filled in by the writer. - Sec.Header.PointerToRelocations = 0; - Sec.Header.PointerToLinenumbers = 0; - // Sec.Header.NumberOfRelocations is filled in by the writer. - Sec.Header.NumberOfLinenumbers = 0; - Sec.Header.Characteristics = Characteristics; - - Obj.addSections(Sec); -} - -static Error addGnuDebugLink(Object &Obj, StringRef DebugLinkFile) { - Expected> Contents = - createGnuDebugLinkSectionContents(DebugLinkFile); - if (!Contents) - return Contents.takeError(); - - addSection(Obj, ".gnu_debuglink", *Contents, - IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | - IMAGE_SCN_MEM_DISCARDABLE); - - return Error::success(); -} - -static uint32_t flagsToCharacteristics(SectionFlag AllFlags, uint32_t OldChar) { - // Need to preserve alignment flags. - const uint32_t PreserveMask = - IMAGE_SCN_ALIGN_1BYTES | IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_ALIGN_4BYTES | - IMAGE_SCN_ALIGN_8BYTES | IMAGE_SCN_ALIGN_16BYTES | - IMAGE_SCN_ALIGN_32BYTES | IMAGE_SCN_ALIGN_64BYTES | - IMAGE_SCN_ALIGN_128BYTES | IMAGE_SCN_ALIGN_256BYTES | - IMAGE_SCN_ALIGN_512BYTES | IMAGE_SCN_ALIGN_1024BYTES | - IMAGE_SCN_ALIGN_2048BYTES | IMAGE_SCN_ALIGN_4096BYTES | - IMAGE_SCN_ALIGN_8192BYTES; - - // Setup new section characteristics based on the flags provided in command - // line. 
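
// A minimal sketch of the .gnu_debuglink payload that
// createGnuDebugLinkSectionContents above builds: the debug file's name,
// NUL-padded so the checksum lands on a 4-byte boundary, followed by the
// little-endian CRC32 of the debug file. debugLinkPayload is a hypothetical
// stand-in that takes a precomputed CRC.
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

static std::vector<uint8_t> debugLinkPayload(const std::string &Name,
                                             uint32_t CRC32) {
  size_t CRCPos = (Name.size() + 1 + 3) & ~size_t(3); // alignTo(size + 1, 4)
  std::vector<uint8_t> Data(CRCPos + 4, 0);           // zero padding included
  std::memcpy(Data.data(), Name.data(), Name.size());
  for (int I = 0; I < 4; ++I)                         // write32le
    Data[CRCPos + I] = uint8_t(CRC32 >> (8 * I));
  return Data;
}
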
- uint32_t NewCharacteristics = (OldChar & PreserveMask) | IMAGE_SCN_MEM_READ; - - if ((AllFlags & SectionFlag::SecAlloc) && !(AllFlags & SectionFlag::SecLoad)) - NewCharacteristics |= IMAGE_SCN_CNT_UNINITIALIZED_DATA; - if (AllFlags & SectionFlag::SecNoload) - NewCharacteristics |= IMAGE_SCN_LNK_REMOVE; - if (!(AllFlags & SectionFlag::SecReadonly)) - NewCharacteristics |= IMAGE_SCN_MEM_WRITE; - if (AllFlags & SectionFlag::SecDebug) - NewCharacteristics |= - IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE; - if (AllFlags & SectionFlag::SecCode) - NewCharacteristics |= IMAGE_SCN_CNT_CODE | IMAGE_SCN_MEM_EXECUTE; - if (AllFlags & SectionFlag::SecData) - NewCharacteristics |= IMAGE_SCN_CNT_INITIALIZED_DATA; - if (AllFlags & SectionFlag::SecShare) - NewCharacteristics |= IMAGE_SCN_MEM_SHARED; - if (AllFlags & SectionFlag::SecExclude) - NewCharacteristics |= IMAGE_SCN_LNK_REMOVE; - - return NewCharacteristics; -} - -static Error handleArgs(const CommonConfig &Config, - const COFFConfig &COFFConfig, Object &Obj) { - // Perform the actual section removals. - Obj.removeSections([&Config](const Section &Sec) { - // Contrary to --only-keep-debug, --only-section fully removes sections that - // aren't mentioned. - if (!Config.OnlySection.empty() && !Config.OnlySection.matches(Sec.Name)) - return true; - - if (Config.StripDebug || Config.StripAll || Config.StripAllGNU || - Config.DiscardMode == DiscardType::All || Config.StripUnneeded) { - if (isDebugSection(Sec) && - (Sec.Header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) != 0) - return true; - } - - if (Config.ToRemove.matches(Sec.Name)) - return true; - - return false; - }); - - if (Config.OnlyKeepDebug) { - // For --only-keep-debug, we keep all other sections, but remove their - // content. The VirtualSize field in the section header is kept intact. - Obj.truncateSections([](const Section &Sec) { - return !isDebugSection(Sec) && Sec.Name != ".buildid" && - ((Sec.Header.Characteristics & - (IMAGE_SCN_CNT_CODE | IMAGE_SCN_CNT_INITIALIZED_DATA)) != 0); - }); - } - - // StripAll removes all symbols and thus also removes all relocations. - if (Config.StripAll || Config.StripAllGNU) - for (Section &Sec : Obj.getMutableSections()) - Sec.Relocs.clear(); - - // If we need to do per-symbol removals, initialize the Referenced field. - if (Config.StripUnneeded || Config.DiscardMode == DiscardType::All || - !Config.SymbolsToRemove.empty()) - if (Error E = Obj.markSymbols()) - return E; - - for (Symbol &Sym : Obj.getMutableSymbols()) { - auto I = Config.SymbolsToRename.find(Sym.Name); - if (I != Config.SymbolsToRename.end()) - Sym.Name = I->getValue(); - } - - auto ToRemove = [&](const Symbol &Sym) -> Expected { - // For StripAll, all relocations have been stripped and we remove all - // symbols. - if (Config.StripAll || Config.StripAllGNU) - return true; - - if (Config.SymbolsToRemove.matches(Sym.Name)) { - // Explicitly removing a referenced symbol is an error. - if (Sym.Referenced) - return createStringError( - llvm::errc::invalid_argument, - "'" + Config.OutputFilename + "': not stripping symbol '" + - Sym.Name.str() + "' because it is named in a relocation"); - return true; - } - - if (!Sym.Referenced) { - // With --strip-unneeded, GNU objcopy removes all unreferenced local - // symbols, and any unreferenced undefined external. - // With --strip-unneeded-symbol we strip only specific unreferenced - // local symbol instead of removing all of such. 
- if (Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC || - Sym.Sym.SectionNumber == 0) - if (Config.StripUnneeded || - Config.UnneededSymbolsToRemove.matches(Sym.Name)) - return true; - - // GNU objcopy keeps referenced local symbols and external symbols - // if --discard-all is set, similar to what --strip-unneeded does, - // but undefined local symbols are kept when --discard-all is set. - if (Config.DiscardMode == DiscardType::All && - Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC && - Sym.Sym.SectionNumber != 0) - return true; - } - - return false; - }; - - // Actually do removals of symbols. - if (Error Err = Obj.removeSymbols(ToRemove)) - return Err; - - if (!Config.SetSectionFlags.empty()) - for (Section &Sec : Obj.getMutableSections()) { - const auto It = Config.SetSectionFlags.find(Sec.Name); - if (It != Config.SetSectionFlags.end()) - Sec.Header.Characteristics = flagsToCharacteristics( - It->second.NewFlags, Sec.Header.Characteristics); - } - - for (const auto &Flag : Config.AddSection) { - StringRef SecName, FileName; - std::tie(SecName, FileName) = Flag.split("="); - - auto BufOrErr = MemoryBuffer::getFile(FileName); - if (!BufOrErr) - return createFileError(FileName, errorCodeToError(BufOrErr.getError())); - auto Buf = std::move(*BufOrErr); - - uint32_t Characteristics; - const auto It = Config.SetSectionFlags.find(SecName); - if (It != Config.SetSectionFlags.end()) - Characteristics = flagsToCharacteristics(It->second.NewFlags, 0); - else - Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES; - - addSection( - Obj, SecName, - makeArrayRef(reinterpret_cast(Buf->getBufferStart()), - Buf->getBufferSize()), - Characteristics); - } - - if (!Config.AddGnuDebugLink.empty()) - if (Error E = addGnuDebugLink(Obj, Config.AddGnuDebugLink)) - return E; - - if (COFFConfig.Subsystem || COFFConfig.MajorSubsystemVersion || - COFFConfig.MinorSubsystemVersion) { - if (!Obj.IsPE) - return createStringError( - errc::invalid_argument, - "'" + Config.OutputFilename + - "': unable to set subsystem on a relocatable object file"); - if (COFFConfig.Subsystem) - Obj.PeHeader.Subsystem = *COFFConfig.Subsystem; - if (COFFConfig.MajorSubsystemVersion) - Obj.PeHeader.MajorSubsystemVersion = *COFFConfig.MajorSubsystemVersion; - if (COFFConfig.MinorSubsystemVersion) - Obj.PeHeader.MinorSubsystemVersion = *COFFConfig.MinorSubsystemVersion; - } - - return Error::success(); -} - -Error executeObjcopyOnBinary(const CommonConfig &Config, - const COFFConfig &COFFConfig, COFFObjectFile &In, - raw_ostream &Out) { - COFFReader Reader(In); - Expected> ObjOrErr = Reader.create(); - if (!ObjOrErr) - return createFileError(Config.InputFilename, ObjOrErr.takeError()); - Object *Obj = ObjOrErr->get(); - assert(Obj && "Unable to deserialize COFF object"); - if (Error E = handleArgs(Config, COFFConfig, *Obj)) - return createFileError(Config.InputFilename, std::move(E)); - COFFWriter Writer(*Obj, Out); - if (Error E = Writer.write()) - return createFileError(Config.OutputFilename, std::move(E)); - return Error::success(); -} - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h deleted file mode 100644 index 2c7ccd34653d..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h +++ /dev/null @@ -1,33 +0,0 @@ -//===- COFFObjcopy.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H - -namespace llvm { -class Error; -class raw_ostream; - -namespace object { -class COFFObjectFile; -} // end namespace object - -namespace objcopy { -struct CommonConfig; -struct COFFConfig; - -namespace coff { - -Error executeObjcopyOnBinary(const CommonConfig &Config, const COFFConfig &, - object::COFFObjectFile &In, raw_ostream &Out); - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/COFF/Object.cpp b/llvm/tools/llvm-objcopy/COFF/Object.cpp deleted file mode 100644 index ec2628c7eca9..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/Object.cpp +++ /dev/null @@ -1,132 +0,0 @@ -//===- Object.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Object.h" -#include "llvm/ADT/DenseSet.h" -#include - -namespace llvm { -namespace objcopy { -namespace coff { - -using namespace object; - -void Object::addSymbols(ArrayRef NewSymbols) { - for (Symbol S : NewSymbols) { - S.UniqueId = NextSymbolUniqueId++; - Symbols.emplace_back(S); - } - updateSymbols(); -} - -void Object::updateSymbols() { - SymbolMap = DenseMap(Symbols.size()); - for (Symbol &Sym : Symbols) - SymbolMap[Sym.UniqueId] = &Sym; -} - -const Symbol *Object::findSymbol(size_t UniqueId) const { - return SymbolMap.lookup(UniqueId); -} - -Error Object::removeSymbols( - function_ref(const Symbol &)> ToRemove) { - Error Errs = Error::success(); - llvm::erase_if(Symbols, [ToRemove, &Errs](const Symbol &Sym) { - Expected ShouldRemove = ToRemove(Sym); - if (!ShouldRemove) { - Errs = joinErrors(std::move(Errs), ShouldRemove.takeError()); - return false; - } - return *ShouldRemove; - }); - - updateSymbols(); - return Errs; -} - -Error Object::markSymbols() { - for (Symbol &Sym : Symbols) - Sym.Referenced = false; - for (const Section &Sec : Sections) { - for (const Relocation &R : Sec.Relocs) { - auto It = SymbolMap.find(R.Target); - if (It == SymbolMap.end()) - return createStringError(object_error::invalid_symbol_index, - "relocation target %zu not found", R.Target); - It->second->Referenced = true; - } - } - return Error::success(); -} - -void Object::addSections(ArrayRef
<Section>
NewSections) { - for (Section S : NewSections) { - S.UniqueId = NextSectionUniqueId++; - Sections.emplace_back(S); - } - updateSections(); -} - -void Object::updateSections() { - SectionMap = DenseMap(Sections.size()); - size_t Index = 1; - for (Section &S : Sections) { - SectionMap[S.UniqueId] = &S; - S.Index = Index++; - } -} - -const Section *Object::findSection(ssize_t UniqueId) const { - return SectionMap.lookup(UniqueId); -} - -void Object::removeSections(function_ref ToRemove) { - DenseSet AssociatedSections; - auto RemoveAssociated = [&AssociatedSections](const Section &Sec) { - return AssociatedSections.contains(Sec.UniqueId); - }; - do { - DenseSet RemovedSections; - llvm::erase_if(Sections, [ToRemove, &RemovedSections](const Section &Sec) { - bool Remove = ToRemove(Sec); - if (Remove) - RemovedSections.insert(Sec.UniqueId); - return Remove; - }); - // Remove all symbols referring to the removed sections. - AssociatedSections.clear(); - llvm::erase_if( - Symbols, [&RemovedSections, &AssociatedSections](const Symbol &Sym) { - // If there are sections that are associative to a removed - // section, - // remove those as well as nothing will include them (and we can't - // leave them dangling). - if (RemovedSections.contains(Sym.AssociativeComdatTargetSectionId)) - AssociatedSections.insert(Sym.TargetSectionId); - return RemovedSections.contains(Sym.TargetSectionId); - }); - ToRemove = RemoveAssociated; - } while (!AssociatedSections.empty()); - updateSections(); - updateSymbols(); -} - -void Object::truncateSections(function_ref ToTruncate) { - for (Section &Sec : Sections) { - if (ToTruncate(Sec)) { - Sec.clearContents(); - Sec.Relocs.clear(); - Sec.Header.SizeOfRawData = 0; - } - } -} - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/COFF/Object.h b/llvm/tools/llvm-objcopy/COFF/Object.h deleted file mode 100644 index 0e854b58cbdb..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/Object.h +++ /dev/null @@ -1,211 +0,0 @@ -//===- Object.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H -#define LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Object/COFF.h" -#include -#include -#include - -namespace llvm { -namespace objcopy { -namespace coff { - -struct Relocation { - Relocation() = default; - Relocation(const object::coff_relocation &R) : Reloc(R) {} - - object::coff_relocation Reloc; - size_t Target = 0; - StringRef TargetName; // Used for diagnostics only -}; - -struct Section { - object::coff_section Header; - std::vector Relocs; - StringRef Name; - ssize_t UniqueId; - size_t Index; - - ArrayRef getContents() const { - if (!OwnedContents.empty()) - return OwnedContents; - return ContentsRef; - } - - void setContentsRef(ArrayRef Data) { - OwnedContents.clear(); - ContentsRef = Data; - } - - void setOwnedContents(std::vector &&Data) { - ContentsRef = ArrayRef(); - OwnedContents = std::move(Data); - } - - void clearContents() { - ContentsRef = ArrayRef(); - OwnedContents.clear(); - } - -private: - ArrayRef ContentsRef; - std::vector OwnedContents; -}; - -struct AuxSymbol { - AuxSymbol(ArrayRef In) { - assert(In.size() == sizeof(Opaque)); - std::copy(In.begin(), In.end(), Opaque); - } - - ArrayRef getRef() const { - return ArrayRef(Opaque, sizeof(Opaque)); - } - - uint8_t Opaque[sizeof(object::coff_symbol16)]; -}; - -struct Symbol { - object::coff_symbol32 Sym; - StringRef Name; - std::vector AuxData; - StringRef AuxFile; - ssize_t TargetSectionId; - ssize_t AssociativeComdatTargetSectionId = 0; - Optional WeakTargetSymbolId; - size_t UniqueId; - size_t RawIndex; - bool Referenced; -}; - -struct Object { - bool IsPE = false; - - object::dos_header DosHeader; - ArrayRef DosStub; - - object::coff_file_header CoffFileHeader; - - bool Is64 = false; - object::pe32plus_header PeHeader; - uint32_t BaseOfData = 0; // pe32plus_header lacks this field. - - std::vector DataDirectories; - - ArrayRef getSymbols() const { return Symbols; } - // This allows mutating individual Symbols, but not mutating the list - // of symbols itself. - iterator_range::iterator> getMutableSymbols() { - return make_range(Symbols.begin(), Symbols.end()); - } - - const Symbol *findSymbol(size_t UniqueId) const; - - void addSymbols(ArrayRef NewSymbols); - Error removeSymbols(function_ref(const Symbol &)> ToRemove); - - // Set the Referenced field on all Symbols, based on relocations in - // all sections. - Error markSymbols(); - - ArrayRef
<Section> getSections() const { return Sections; }
-  // This allows mutating individual Sections, but not mutating the list
-  // of sections itself.
-  iterator_range<std::vector<Section>::iterator> getMutableSections() {
-    return make_range(Sections.begin(), Sections.end());
-  }
-
-  const Section *findSection(ssize_t UniqueId) const;
-
-  void addSections(ArrayRef<Section> NewSections);
-  void removeSections(function_ref<bool(const Section &)> ToRemove);
-  void truncateSections(function_ref<bool(const Section &)> ToTruncate);
-
-private:
-  std::vector<Symbol> Symbols;
-  DenseMap<size_t, Symbol *> SymbolMap;
-
-  size_t NextSymbolUniqueId = 0;
-
-  std::vector<Section>
Sections; - DenseMap SectionMap; - - ssize_t NextSectionUniqueId = 1; // Allow a UniqueId 0 to mean undefined. - - // Update SymbolMap. - void updateSymbols(); - - // Update SectionMap and Index in each Section. - void updateSections(); -}; - -// Copy between coff_symbol16 and coff_symbol32. -// The source and destination files can use either coff_symbol16 or -// coff_symbol32, while we always store them as coff_symbol32 in the -// intermediate data structure. -template -void copySymbol(Symbol1Ty &Dest, const Symbol2Ty &Src) { - static_assert(sizeof(Dest.Name.ShortName) == sizeof(Src.Name.ShortName), - "Mismatched name sizes"); - memcpy(Dest.Name.ShortName, Src.Name.ShortName, sizeof(Dest.Name.ShortName)); - Dest.Value = Src.Value; - Dest.SectionNumber = Src.SectionNumber; - Dest.Type = Src.Type; - Dest.StorageClass = Src.StorageClass; - Dest.NumberOfAuxSymbols = Src.NumberOfAuxSymbols; -} - -// Copy between pe32_header and pe32plus_header. -// We store the intermediate state in a pe32plus_header. -template -void copyPeHeader(PeHeader1Ty &Dest, const PeHeader2Ty &Src) { - Dest.Magic = Src.Magic; - Dest.MajorLinkerVersion = Src.MajorLinkerVersion; - Dest.MinorLinkerVersion = Src.MinorLinkerVersion; - Dest.SizeOfCode = Src.SizeOfCode; - Dest.SizeOfInitializedData = Src.SizeOfInitializedData; - Dest.SizeOfUninitializedData = Src.SizeOfUninitializedData; - Dest.AddressOfEntryPoint = Src.AddressOfEntryPoint; - Dest.BaseOfCode = Src.BaseOfCode; - Dest.ImageBase = Src.ImageBase; - Dest.SectionAlignment = Src.SectionAlignment; - Dest.FileAlignment = Src.FileAlignment; - Dest.MajorOperatingSystemVersion = Src.MajorOperatingSystemVersion; - Dest.MinorOperatingSystemVersion = Src.MinorOperatingSystemVersion; - Dest.MajorImageVersion = Src.MajorImageVersion; - Dest.MinorImageVersion = Src.MinorImageVersion; - Dest.MajorSubsystemVersion = Src.MajorSubsystemVersion; - Dest.MinorSubsystemVersion = Src.MinorSubsystemVersion; - Dest.Win32VersionValue = Src.Win32VersionValue; - Dest.SizeOfImage = Src.SizeOfImage; - Dest.SizeOfHeaders = Src.SizeOfHeaders; - Dest.CheckSum = Src.CheckSum; - Dest.Subsystem = Src.Subsystem; - Dest.DLLCharacteristics = Src.DLLCharacteristics; - Dest.SizeOfStackReserve = Src.SizeOfStackReserve; - Dest.SizeOfStackCommit = Src.SizeOfStackCommit; - Dest.SizeOfHeapReserve = Src.SizeOfHeapReserve; - Dest.SizeOfHeapCommit = Src.SizeOfHeapCommit; - Dest.LoaderFlags = Src.LoaderFlags; - Dest.NumberOfRvaAndSize = Src.NumberOfRvaAndSize; -} - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/COFF/Reader.cpp b/llvm/tools/llvm-objcopy/COFF/Reader.cpp deleted file mode 100644 index d1beacb3bd67..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/Reader.cpp +++ /dev/null @@ -1,226 +0,0 @@ -//===- Reader.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Reader.h" -#include "Object.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Object/COFF.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include - -namespace llvm { -namespace objcopy { -namespace coff { - -using namespace object; -using namespace COFF; - -Error COFFReader::readExecutableHeaders(Object &Obj) const { - const dos_header *DH = COFFObj.getDOSHeader(); - Obj.Is64 = COFFObj.is64(); - if (!DH) - return Error::success(); - - Obj.IsPE = true; - Obj.DosHeader = *DH; - if (DH->AddressOfNewExeHeader > sizeof(*DH)) - Obj.DosStub = ArrayRef(reinterpret_cast(&DH[1]), - DH->AddressOfNewExeHeader - sizeof(*DH)); - - if (COFFObj.is64()) { - Obj.PeHeader = *COFFObj.getPE32PlusHeader(); - } else { - const pe32_header *PE32 = COFFObj.getPE32Header(); - copyPeHeader(Obj.PeHeader, *PE32); - // The pe32plus_header (stored in Object) lacks the BaseOfData field. - Obj.BaseOfData = PE32->BaseOfData; - } - - for (size_t I = 0; I < Obj.PeHeader.NumberOfRvaAndSize; I++) { - const data_directory *Dir = COFFObj.getDataDirectory(I); - if (!Dir) - return errorCodeToError(object_error::parse_failed); - Obj.DataDirectories.emplace_back(*Dir); - } - return Error::success(); -} - -Error COFFReader::readSections(Object &Obj) const { - std::vector
<Section> Sections;
-  // Section indexing starts from 1.
-  for (size_t I = 1, E = COFFObj.getNumberOfSections(); I <= E; I++) {
-    Expected<const coff_section *> SecOrErr = COFFObj.getSection(I);
-    if (!SecOrErr)
-      return SecOrErr.takeError();
-    const coff_section *Sec = *SecOrErr;
-    Sections.push_back(Section());
-    Section &S = Sections.back();
-    S.Header = *Sec;
-    S.Header.Characteristics &= ~COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
-    ArrayRef<uint8_t> Contents;
-    if (Error E = COFFObj.getSectionContents(Sec, Contents))
-      return E;
-    S.setContentsRef(Contents);
-    ArrayRef<coff_relocation> Relocs = COFFObj.getRelocations(Sec);
-    for (const coff_relocation &R : Relocs)
-      S.Relocs.push_back(R);
-    if (Expected<StringRef> NameOrErr = COFFObj.getSectionName(Sec))
-      S.Name = *NameOrErr;
-    else
-      return NameOrErr.takeError();
-  }
-  Obj.addSections(Sections);
-  return Error::success();
-}
-
-Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
-  std::vector<Symbol> Symbols;
-  Symbols.reserve(COFFObj.getRawNumberOfSymbols());
-  ArrayRef<Section>
Sections = Obj.getSections(); - for (uint32_t I = 0, E = COFFObj.getRawNumberOfSymbols(); I < E;) { - Expected SymOrErr = COFFObj.getSymbol(I); - if (!SymOrErr) - return SymOrErr.takeError(); - COFFSymbolRef SymRef = *SymOrErr; - - Symbols.push_back(Symbol()); - Symbol &Sym = Symbols.back(); - // Copy symbols from the original form into an intermediate coff_symbol32. - if (IsBigObj) - copySymbol(Sym.Sym, - *reinterpret_cast(SymRef.getRawPtr())); - else - copySymbol(Sym.Sym, - *reinterpret_cast(SymRef.getRawPtr())); - auto NameOrErr = COFFObj.getSymbolName(SymRef); - if (!NameOrErr) - return NameOrErr.takeError(); - Sym.Name = *NameOrErr; - - ArrayRef AuxData = COFFObj.getSymbolAuxData(SymRef); - size_t SymSize = IsBigObj ? sizeof(coff_symbol32) : sizeof(coff_symbol16); - assert(AuxData.size() == SymSize * SymRef.getNumberOfAuxSymbols()); - // The auxillary symbols are structs of sizeof(coff_symbol16) each. - // In the big object format (where symbols are coff_symbol32), each - // auxillary symbol is padded with 2 bytes at the end. Copy each - // auxillary symbol to the Sym.AuxData vector. For file symbols, - // the whole range of aux symbols are interpreted as one null padded - // string instead. - if (SymRef.isFileRecord()) - Sym.AuxFile = StringRef(reinterpret_cast(AuxData.data()), - AuxData.size()) - .rtrim('\0'); - else - for (size_t I = 0; I < SymRef.getNumberOfAuxSymbols(); I++) - Sym.AuxData.push_back(AuxData.slice(I * SymSize, sizeof(AuxSymbol))); - - // Find the unique id of the section - if (SymRef.getSectionNumber() <= - 0) // Special symbol (undefined/absolute/debug) - Sym.TargetSectionId = SymRef.getSectionNumber(); - else if (static_cast(SymRef.getSectionNumber() - 1) < - Sections.size()) - Sym.TargetSectionId = Sections[SymRef.getSectionNumber() - 1].UniqueId; - else - return createStringError(object_error::parse_failed, - "section number out of range"); - // For section definitions, check if it is comdat associative, and if - // it is, find the target section unique id. - const coff_aux_section_definition *SD = SymRef.getSectionDefinition(); - const coff_aux_weak_external *WE = SymRef.getWeakExternal(); - if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) { - int32_t Index = SD->getNumber(IsBigObj); - if (Index <= 0 || static_cast(Index - 1) >= Sections.size()) - return createStringError(object_error::parse_failed, - "unexpected associative section index"); - Sym.AssociativeComdatTargetSectionId = Sections[Index - 1].UniqueId; - } else if (WE) { - // This is a raw symbol index for now, but store it in the Symbol - // until we've added them to the Object, which assigns the final - // unique ids. - Sym.WeakTargetSymbolId = WE->TagIndex; - } - I += 1 + SymRef.getNumberOfAuxSymbols(); - } - Obj.addSymbols(Symbols); - return Error::success(); -} - -Error COFFReader::setSymbolTargets(Object &Obj) const { - std::vector RawSymbolTable; - for (const Symbol &Sym : Obj.getSymbols()) { - RawSymbolTable.push_back(&Sym); - for (size_t I = 0; I < Sym.Sym.NumberOfAuxSymbols; I++) - RawSymbolTable.push_back(nullptr); - } - for (Symbol &Sym : Obj.getMutableSymbols()) { - // Convert WeakTargetSymbolId from the original raw symbol index to - // a proper unique id. 
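
// A minimal sketch of the raw-slot table built just above: COFF relocations
// and weak externals index the symbol table by raw slot, where each symbol
// occupies 1 + NumberOfAuxSymbols entries, so aux slots map to null and a
// SymbolTableIndex resolves with a plain vector lookup. Sym is a hypothetical
// stand-in for this patch's Symbol struct.
#include <vector>

struct Sym { unsigned NumberOfAuxSymbols; };

static std::vector<const Sym *> rawSlotTable(const std::vector<Sym> &Syms) {
  std::vector<const Sym *> Raw;
  for (const Sym &S : Syms) {
    Raw.push_back(&S);          // the symbol's own slot
    for (unsigned I = 0; I < S.NumberOfAuxSymbols; ++I)
      Raw.push_back(nullptr);   // slots occupied by its aux records
  }
  return Raw;                   // Raw[SymbolTableIndex] resolves an index
}
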
- if (Sym.WeakTargetSymbolId) { - if (*Sym.WeakTargetSymbolId >= RawSymbolTable.size()) - return createStringError(object_error::parse_failed, - "weak external reference out of range"); - const Symbol *Target = RawSymbolTable[*Sym.WeakTargetSymbolId]; - if (Target == nullptr) - return createStringError(object_error::parse_failed, - "invalid SymbolTableIndex"); - Sym.WeakTargetSymbolId = Target->UniqueId; - } - } - for (Section &Sec : Obj.getMutableSections()) { - for (Relocation &R : Sec.Relocs) { - if (R.Reloc.SymbolTableIndex >= RawSymbolTable.size()) - return createStringError(object_error::parse_failed, - "SymbolTableIndex out of range"); - const Symbol *Sym = RawSymbolTable[R.Reloc.SymbolTableIndex]; - if (Sym == nullptr) - return createStringError(object_error::parse_failed, - "invalid SymbolTableIndex"); - R.Target = Sym->UniqueId; - R.TargetName = Sym->Name; - } - } - return Error::success(); -} - -Expected> COFFReader::create() const { - auto Obj = std::make_unique(); - - bool IsBigObj = false; - if (const coff_file_header *CFH = COFFObj.getCOFFHeader()) { - Obj->CoffFileHeader = *CFH; - } else { - const coff_bigobj_file_header *CBFH = COFFObj.getCOFFBigObjHeader(); - if (!CBFH) - return createStringError(object_error::parse_failed, - "no COFF file header returned"); - // Only copying the few fields from the bigobj header that we need - // and won't recreate in the end. - Obj->CoffFileHeader.Machine = CBFH->Machine; - Obj->CoffFileHeader.TimeDateStamp = CBFH->TimeDateStamp; - IsBigObj = true; - } - - if (Error E = readExecutableHeaders(*Obj)) - return std::move(E); - if (Error E = readSections(*Obj)) - return std::move(E); - if (Error E = readSymbols(*Obj, IsBigObj)) - return std::move(E); - if (Error E = setSymbolTargets(*Obj)) - return std::move(E); - - return std::move(Obj); -} - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/COFF/Reader.h b/llvm/tools/llvm-objcopy/COFF/Reader.h deleted file mode 100644 index 48c050b6ea11..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/Reader.h +++ /dev/null @@ -1,41 +0,0 @@ -//===- Reader.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_COFF_READER_H -#define LLVM_TOOLS_OBJCOPY_COFF_READER_H - -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Object/COFF.h" -#include "llvm/Support/Error.h" - -namespace llvm { -namespace objcopy { -namespace coff { - -struct Object; - -using object::COFFObjectFile; - -class COFFReader { - const COFFObjectFile &COFFObj; - - Error readExecutableHeaders(Object &Obj) const; - Error readSections(Object &Obj) const; - Error readSymbols(Object &Obj, bool IsBigObj) const; - Error setSymbolTargets(Object &Obj) const; - -public: - explicit COFFReader(const COFFObjectFile &O) : COFFObj(O) {} - Expected> create() const; -}; - -} // end namespace coff -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_COFF_READER_H diff --git a/llvm/tools/llvm-objcopy/COFF/Writer.cpp b/llvm/tools/llvm-objcopy/COFF/Writer.cpp deleted file mode 100644 index cbd0e4261238..000000000000 --- a/llvm/tools/llvm-objcopy/COFF/Writer.cpp +++ /dev/null @@ -1,457 +0,0 @@ -//===- Writer.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Writer.h" -#include "Object.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Object/COFF.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include - -namespace llvm { -namespace objcopy { -namespace coff { - -using namespace object; -using namespace COFF; - -Error COFFWriter::finalizeRelocTargets() { - for (Section &Sec : Obj.getMutableSections()) { - for (Relocation &R : Sec.Relocs) { - const Symbol *Sym = Obj.findSymbol(R.Target); - if (Sym == nullptr) - return createStringError(object_error::invalid_symbol_index, - "relocation target '%s' (%zu) not found", - R.TargetName.str().c_str(), R.Target); - R.Reloc.SymbolTableIndex = Sym->RawIndex; - } - } - return Error::success(); -} - -Error COFFWriter::finalizeSymbolContents() { - for (Symbol &Sym : Obj.getMutableSymbols()) { - if (Sym.TargetSectionId <= 0) { - // Undefined, or a special kind of symbol. These negative values - // are stored in the SectionNumber field which is unsigned. - Sym.Sym.SectionNumber = static_cast(Sym.TargetSectionId); - } else { - const Section *Sec = Obj.findSection(Sym.TargetSectionId); - if (Sec == nullptr) - return createStringError(object_error::invalid_symbol_index, - "symbol '%s' points to a removed section", - Sym.Name.str().c_str()); - Sym.Sym.SectionNumber = Sec->Index; - - if (Sym.Sym.NumberOfAuxSymbols == 1 && - Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC) { - coff_aux_section_definition *SD = - reinterpret_cast( - Sym.AuxData[0].Opaque); - uint32_t SDSectionNumber; - if (Sym.AssociativeComdatTargetSectionId == 0) { - // Not a comdat associative section; just set the Number field to - // the number of the section itself. 
- SDSectionNumber = Sec->Index; - } else { - Sec = Obj.findSection(Sym.AssociativeComdatTargetSectionId); - if (Sec == nullptr) - return createStringError( - object_error::invalid_symbol_index, - "symbol '%s' is associative to a removed section", - Sym.Name.str().c_str()); - SDSectionNumber = Sec->Index; - } - // Update the section definition with the new section number. - SD->NumberLowPart = static_cast(SDSectionNumber); - SD->NumberHighPart = static_cast(SDSectionNumber >> 16); - } - } - // Check that we actually have got AuxData to match the weak symbol target - // we want to set. Only >= 1 would be required, but only == 1 makes sense. - if (Sym.WeakTargetSymbolId && Sym.Sym.NumberOfAuxSymbols == 1) { - coff_aux_weak_external *WE = - reinterpret_cast(Sym.AuxData[0].Opaque); - const Symbol *Target = Obj.findSymbol(*Sym.WeakTargetSymbolId); - if (Target == nullptr) - return createStringError(object_error::invalid_symbol_index, - "symbol '%s' is missing its weak target", - Sym.Name.str().c_str()); - WE->TagIndex = Target->RawIndex; - } - } - return Error::success(); -} - -void COFFWriter::layoutSections() { - for (auto &S : Obj.getMutableSections()) { - if (S.Header.SizeOfRawData > 0) - S.Header.PointerToRawData = FileSize; - FileSize += S.Header.SizeOfRawData; // For executables, this is already - // aligned to FileAlignment. - if (S.Relocs.size() >= 0xffff) { - S.Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL; - S.Header.NumberOfRelocations = 0xffff; - S.Header.PointerToRelocations = FileSize; - FileSize += sizeof(coff_relocation); - } else { - S.Header.NumberOfRelocations = S.Relocs.size(); - S.Header.PointerToRelocations = S.Relocs.size() ? FileSize : 0; - } - - FileSize += S.Relocs.size() * sizeof(coff_relocation); - FileSize = alignTo(FileSize, FileAlignment); - - if (S.Header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA) - SizeOfInitializedData += S.Header.SizeOfRawData; - } -} - -size_t COFFWriter::finalizeStringTable() { - for (const auto &S : Obj.getSections()) - if (S.Name.size() > COFF::NameSize) - StrTabBuilder.add(S.Name); - - for (const auto &S : Obj.getSymbols()) - if (S.Name.size() > COFF::NameSize) - StrTabBuilder.add(S.Name); - - StrTabBuilder.finalize(); - - for (auto &S : Obj.getMutableSections()) { - memset(S.Header.Name, 0, sizeof(S.Header.Name)); - if (S.Name.size() > COFF::NameSize) { - snprintf(S.Header.Name, sizeof(S.Header.Name), "/%d", - (int)StrTabBuilder.getOffset(S.Name)); - } else { - memcpy(S.Header.Name, S.Name.data(), S.Name.size()); - } - } - for (auto &S : Obj.getMutableSymbols()) { - if (S.Name.size() > COFF::NameSize) { - S.Sym.Name.Offset.Zeroes = 0; - S.Sym.Name.Offset.Offset = StrTabBuilder.getOffset(S.Name); - } else { - strncpy(S.Sym.Name.ShortName, S.Name.data(), COFF::NameSize); - } - } - return StrTabBuilder.getSize(); -} - -template -std::pair COFFWriter::finalizeSymbolTable() { - size_t RawSymIndex = 0; - for (auto &S : Obj.getMutableSymbols()) { - // Symbols normally have NumberOfAuxSymbols set correctly all the time. - // For file symbols, we need to know the output file's symbol size to be - // able to calculate the number of slots it occupies. 
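
// A minimal sketch of COFF's long-name encoding that finalizeStringTable above
// implements: a name longer than the 8-byte header field is spilled to the
// string table and the header stores "/<decimal offset>" instead.
// encodeSectionName is a hypothetical stand-in; the real code takes its
// offsets from a StringTableBuilder.
#include <cstdio>
#include <cstring>

static void encodeSectionName(char Header[8], const char *Name,
                              unsigned StrTabOffset) {
  std::memset(Header, 0, 8);
  if (std::strlen(Name) > 8)                       // COFF::NameSize == 8
    std::snprintf(Header, 8, "/%u", StrTabOffset); // e.g. ".gnu_debuglink"
  else
    std::memcpy(Header, Name, std::strlen(Name));  // short names fit inline
}
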
- if (!S.AuxFile.empty()) - S.Sym.NumberOfAuxSymbols = - alignTo(S.AuxFile.size(), sizeof(SymbolTy)) / sizeof(SymbolTy); - S.RawIndex = RawSymIndex; - RawSymIndex += 1 + S.Sym.NumberOfAuxSymbols; - } - return std::make_pair(RawSymIndex * sizeof(SymbolTy), sizeof(SymbolTy)); -} - -Error COFFWriter::finalize(bool IsBigObj) { - size_t SymTabSize, SymbolSize; - std::tie(SymTabSize, SymbolSize) = IsBigObj - ? finalizeSymbolTable() - : finalizeSymbolTable(); - - if (Error E = finalizeRelocTargets()) - return E; - if (Error E = finalizeSymbolContents()) - return E; - - size_t SizeOfHeaders = 0; - FileAlignment = 1; - size_t PeHeaderSize = 0; - if (Obj.IsPE) { - Obj.DosHeader.AddressOfNewExeHeader = - sizeof(Obj.DosHeader) + Obj.DosStub.size(); - SizeOfHeaders += Obj.DosHeader.AddressOfNewExeHeader + sizeof(PEMagic); - - FileAlignment = Obj.PeHeader.FileAlignment; - Obj.PeHeader.NumberOfRvaAndSize = Obj.DataDirectories.size(); - - PeHeaderSize = Obj.Is64 ? sizeof(pe32plus_header) : sizeof(pe32_header); - SizeOfHeaders += - PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size(); - } - Obj.CoffFileHeader.NumberOfSections = Obj.getSections().size(); - SizeOfHeaders += - IsBigObj ? sizeof(coff_bigobj_file_header) : sizeof(coff_file_header); - SizeOfHeaders += sizeof(coff_section) * Obj.getSections().size(); - SizeOfHeaders = alignTo(SizeOfHeaders, FileAlignment); - - Obj.CoffFileHeader.SizeOfOptionalHeader = - PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size(); - - FileSize = SizeOfHeaders; - SizeOfInitializedData = 0; - - layoutSections(); - - if (Obj.IsPE) { - Obj.PeHeader.SizeOfHeaders = SizeOfHeaders; - Obj.PeHeader.SizeOfInitializedData = SizeOfInitializedData; - - if (!Obj.getSections().empty()) { - const Section &S = Obj.getSections().back(); - Obj.PeHeader.SizeOfImage = - alignTo(S.Header.VirtualAddress + S.Header.VirtualSize, - Obj.PeHeader.SectionAlignment); - } - - // If the PE header had a checksum, clear it, since it isn't valid - // any longer. (We don't calculate a new one.) - Obj.PeHeader.CheckSum = 0; - } - - size_t StrTabSize = finalizeStringTable(); - - size_t PointerToSymbolTable = FileSize; - // StrTabSize <= 4 is the size of an empty string table, only consisting - // of the length field. - if (SymTabSize == 0 && StrTabSize <= 4 && Obj.IsPE) { - // For executables, don't point to the symbol table and skip writing - // the length field, if both the symbol and string tables are empty. - PointerToSymbolTable = 0; - StrTabSize = 0; - } - - size_t NumRawSymbols = SymTabSize / SymbolSize; - Obj.CoffFileHeader.PointerToSymbolTable = PointerToSymbolTable; - Obj.CoffFileHeader.NumberOfSymbols = NumRawSymbols; - FileSize += SymTabSize + StrTabSize; - FileSize = alignTo(FileSize, FileAlignment); - - return Error::success(); -} - -void COFFWriter::writeHeaders(bool IsBigObj) { - uint8_t *Ptr = reinterpret_cast(Buf->getBufferStart()); - if (Obj.IsPE) { - memcpy(Ptr, &Obj.DosHeader, sizeof(Obj.DosHeader)); - Ptr += sizeof(Obj.DosHeader); - memcpy(Ptr, Obj.DosStub.data(), Obj.DosStub.size()); - Ptr += Obj.DosStub.size(); - memcpy(Ptr, PEMagic, sizeof(PEMagic)); - Ptr += sizeof(PEMagic); - } - if (!IsBigObj) { - memcpy(Ptr, &Obj.CoffFileHeader, sizeof(Obj.CoffFileHeader)); - Ptr += sizeof(Obj.CoffFileHeader); - } else { - // Generate a coff_bigobj_file_header, filling it in with the values - // from Obj.CoffFileHeader. All extra fields that don't exist in - // coff_file_header can be set to hardcoded values. 
- coff_bigobj_file_header BigObjHeader; - BigObjHeader.Sig1 = IMAGE_FILE_MACHINE_UNKNOWN; - BigObjHeader.Sig2 = 0xffff; - BigObjHeader.Version = BigObjHeader::MinBigObjectVersion; - BigObjHeader.Machine = Obj.CoffFileHeader.Machine; - BigObjHeader.TimeDateStamp = Obj.CoffFileHeader.TimeDateStamp; - memcpy(BigObjHeader.UUID, BigObjMagic, sizeof(BigObjMagic)); - BigObjHeader.unused1 = 0; - BigObjHeader.unused2 = 0; - BigObjHeader.unused3 = 0; - BigObjHeader.unused4 = 0; - // The value in Obj.CoffFileHeader.NumberOfSections is truncated, thus - // get the original one instead. - BigObjHeader.NumberOfSections = Obj.getSections().size(); - BigObjHeader.PointerToSymbolTable = Obj.CoffFileHeader.PointerToSymbolTable; - BigObjHeader.NumberOfSymbols = Obj.CoffFileHeader.NumberOfSymbols; - - memcpy(Ptr, &BigObjHeader, sizeof(BigObjHeader)); - Ptr += sizeof(BigObjHeader); - } - if (Obj.IsPE) { - if (Obj.Is64) { - memcpy(Ptr, &Obj.PeHeader, sizeof(Obj.PeHeader)); - Ptr += sizeof(Obj.PeHeader); - } else { - pe32_header PeHeader; - copyPeHeader(PeHeader, Obj.PeHeader); - // The pe32plus_header (stored in Object) lacks the BaseOfData field. - PeHeader.BaseOfData = Obj.BaseOfData; - - memcpy(Ptr, &PeHeader, sizeof(PeHeader)); - Ptr += sizeof(PeHeader); - } - for (const auto &DD : Obj.DataDirectories) { - memcpy(Ptr, &DD, sizeof(DD)); - Ptr += sizeof(DD); - } - } - for (const auto &S : Obj.getSections()) { - memcpy(Ptr, &S.Header, sizeof(S.Header)); - Ptr += sizeof(S.Header); - } -} - -void COFFWriter::writeSections() { - for (const auto &S : Obj.getSections()) { - uint8_t *Ptr = reinterpret_cast(Buf->getBufferStart()) + - S.Header.PointerToRawData; - ArrayRef Contents = S.getContents(); - std::copy(Contents.begin(), Contents.end(), Ptr); - - // For executable sections, pad the remainder of the raw data size with - // 0xcc, which is int3 on x86. - if ((S.Header.Characteristics & IMAGE_SCN_CNT_CODE) && - S.Header.SizeOfRawData > Contents.size()) - memset(Ptr + Contents.size(), 0xcc, - S.Header.SizeOfRawData - Contents.size()); - - Ptr += S.Header.SizeOfRawData; - - if (S.Relocs.size() >= 0xffff) { - object::coff_relocation R; - R.VirtualAddress = S.Relocs.size() + 1; - R.SymbolTableIndex = 0; - R.Type = 0; - memcpy(Ptr, &R, sizeof(R)); - Ptr += sizeof(R); - } - for (const auto &R : S.Relocs) { - memcpy(Ptr, &R.Reloc, sizeof(R.Reloc)); - Ptr += sizeof(R.Reloc); - } - } -} - -template void COFFWriter::writeSymbolStringTables() { - uint8_t *Ptr = reinterpret_cast(Buf->getBufferStart()) + - Obj.CoffFileHeader.PointerToSymbolTable; - for (const auto &S : Obj.getSymbols()) { - // Convert symbols back to the right size, from coff_symbol32. - copySymbol(*reinterpret_cast(Ptr), - S.Sym); - Ptr += sizeof(SymbolTy); - if (!S.AuxFile.empty()) { - // For file symbols, just write the string into the aux symbol slots, - // assuming that the unwritten parts are initialized to zero in the memory - // mapped file. - std::copy(S.AuxFile.begin(), S.AuxFile.end(), Ptr); - Ptr += S.Sym.NumberOfAuxSymbols * sizeof(SymbolTy); - } else { - // For other auxillary symbols, write their opaque payload into one symbol - // table slot each. For big object files, the symbols are larger than the - // opaque auxillary symbol struct and we leave padding at the end of each - // entry. 
-
-template <class SymbolTy> void COFFWriter::writeSymbolStringTables() {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
-                 Obj.CoffFileHeader.PointerToSymbolTable;
-  for (const auto &S : Obj.getSymbols()) {
-    // Convert symbols back to the right size, from coff_symbol32.
-    copySymbol<SymbolTy, coff_symbol32>(*reinterpret_cast<SymbolTy *>(Ptr),
-                                        S.Sym);
-    Ptr += sizeof(SymbolTy);
-    if (!S.AuxFile.empty()) {
-      // For file symbols, just write the string into the aux symbol slots,
-      // assuming that the unwritten parts are initialized to zero in the memory
-      // mapped file.
-      std::copy(S.AuxFile.begin(), S.AuxFile.end(), Ptr);
-      Ptr += S.Sym.NumberOfAuxSymbols * sizeof(SymbolTy);
-    } else {
-      // For other auxiliary symbols, write their opaque payload into one symbol
-      // table slot each. For big object files, the symbols are larger than the
-      // opaque auxiliary symbol struct and we leave padding at the end of each
-      // entry.
-      for (const AuxSymbol &AuxSym : S.AuxData) {
-        ArrayRef<uint8_t> Ref = AuxSym.getRef();
-        std::copy(Ref.begin(), Ref.end(), Ptr);
-        Ptr += sizeof(SymbolTy);
-      }
-    }
-  }
-  if (StrTabBuilder.getSize() > 4 || !Obj.IsPE) {
-    // Always write a string table in object files, even an empty one.
-    StrTabBuilder.write(Ptr);
-    Ptr += StrTabBuilder.getSize();
-  }
-}
-
-Error COFFWriter::write(bool IsBigObj) {
-  if (Error E = finalize(IsBigObj))
-    return E;
-
-  Buf = WritableMemoryBuffer::getNewMemBuffer(FileSize);
-  if (!Buf)
-    return createStringError(llvm::errc::not_enough_memory,
-                             "failed to allocate memory buffer of " +
-                                 Twine::utohexstr(FileSize) + " bytes.");
-
-  writeHeaders(IsBigObj);
-  writeSections();
-  if (IsBigObj)
-    writeSymbolStringTables<coff_symbol32>();
-  else
-    writeSymbolStringTables<coff_symbol16>();
-
-  if (Obj.IsPE)
-    if (Error E = patchDebugDirectory())
-      return E;
-
-  // TODO: Implement direct writing to the output stream (without intermediate
-  // memory buffer Buf).
-  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
-  return Error::success();
-}
-
-Expected<uint32_t> COFFWriter::virtualAddressToFileAddress(uint32_t RVA) {
-  for (const auto &S : Obj.getSections()) {
-    if (RVA >= S.Header.VirtualAddress &&
-        RVA < S.Header.VirtualAddress + S.Header.SizeOfRawData)
-      return S.Header.PointerToRawData + RVA - S.Header.VirtualAddress;
-  }
-  return createStringError(object_error::parse_failed,
-                           "debug directory payload not found");
-}
-
-// Locate which sections contain the debug directories, iterate over all
-// the debug_directory structs in there, and set the PointerToRawData field
-// in all of them, according to their new physical location in the file.
-Error COFFWriter::patchDebugDirectory() {
-  if (Obj.DataDirectories.size() <= DEBUG_DIRECTORY)
-    return Error::success();
-  const data_directory *Dir = &Obj.DataDirectories[DEBUG_DIRECTORY];
-  if (Dir->Size <= 0)
-    return Error::success();
-  for (const auto &S : Obj.getSections()) {
-    if (Dir->RelativeVirtualAddress >= S.Header.VirtualAddress &&
-        Dir->RelativeVirtualAddress <
-            S.Header.VirtualAddress + S.Header.SizeOfRawData) {
-      if (Dir->RelativeVirtualAddress + Dir->Size >
-          S.Header.VirtualAddress + S.Header.SizeOfRawData)
-        return createStringError(object_error::parse_failed,
-                                 "debug directory extends past end of section");
-
-      size_t Offset = Dir->RelativeVirtualAddress - S.Header.VirtualAddress;
-      uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart()) +
-                     S.Header.PointerToRawData + Offset;
-      uint8_t *End = Ptr + Dir->Size;
-      while (Ptr < End) {
-        debug_directory *Debug = reinterpret_cast<debug_directory *>(Ptr);
-        if (Debug->PointerToRawData) {
-          if (Expected<uint32_t> FilePosOrErr =
-                  virtualAddressToFileAddress(Debug->AddressOfRawData))
-            Debug->PointerToRawData = *FilePosOrErr;
-          else
-            return FilePosOrErr.takeError();
-        }
-        Ptr += sizeof(debug_directory);
-        Offset += sizeof(debug_directory);
-      }
-      // Debug directory found and patched, all done.
-      return Error::success();
-    }
-  }
-  return createStringError(object_error::parse_failed,
-                           "debug directory not found");
-}
-
-Error COFFWriter::write() {
-  bool IsBigObj = Obj.getSections().size() > MaxNumberOfSections16;
-  if (IsBigObj && Obj.IsPE)
-    return createStringError(object_error::parse_failed,
-                             "too many sections for executable");
-  return write(IsBigObj);
-}
-
-} // end namespace coff
-} // end namespace objcopy
-} // end namespace llvm
diff --git a/llvm/tools/llvm-objcopy/COFF/Writer.h b/llvm/tools/llvm-objcopy/COFF/Writer.h
deleted file mode 100644
index eed43b3e5814..000000000000
--- a/llvm/tools/llvm-objcopy/COFF/Writer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-//===- Writer.h -------------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
-#define LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
-
-#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <cstddef>
-#include <utility>
-
-namespace llvm {
-namespace objcopy {
-namespace coff {
-
-struct Object;
-
-class COFFWriter {
-  Object &Obj;
-  std::unique_ptr<WritableMemoryBuffer> Buf;
-  raw_ostream &Out;
-
-  size_t FileSize;
-  size_t FileAlignment;
-  size_t SizeOfInitializedData;
-  StringTableBuilder StrTabBuilder;
-
-  template <class SymbolTy> std::pair<size_t, size_t> finalizeSymbolTable();
-  Error finalizeRelocTargets();
-  Error finalizeSymbolContents();
-  void layoutSections();
-  size_t finalizeStringTable();
-
-  Error finalize(bool IsBigObj);
-
-  void writeHeaders(bool IsBigObj);
-  void writeSections();
-  template <class SymbolTy> void writeSymbolStringTables();
-
-  Error write(bool IsBigObj);
-
-  Error patchDebugDirectory();
-  Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA);
-
-public:
-  virtual ~COFFWriter() {}
-  Error write();
-
-  COFFWriter(Object &Obj, raw_ostream &Out)
-      : Obj(Obj), Out(Out), StrTabBuilder(StringTableBuilder::WinCOFF) {}
-};
-
-} // end namespace coff
-} // end namespace objcopy
-} // end namespace llvm
-
-#endif // LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
diff --git a/llvm/tools/llvm-objcopy/CommonConfig.h b/llvm/tools/llvm-objcopy/CommonConfig.h
deleted file mode 100644
index ea39a6da2ba5..000000000000
--- a/llvm/tools/llvm-objcopy/CommonConfig.h
+++ /dev/null
@@ -1,260 +0,0 @@
-//===- CommonConfig.h -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/CachedHashString.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Object/ELFTypes.h" -#include "llvm/Support/GlobPattern.h" -#include "llvm/Support/Regex.h" -// Necessary for llvm::DebugCompressionType::None -#include "llvm/Target/TargetOptions.h" -#include - -namespace llvm { -namespace objcopy { - -enum class FileFormat { - Unspecified, - ELF, - Binary, - IHex, -}; - -// This type keeps track of the machine info for various architectures. This -// lets us map architecture names to ELF types and the e_machine value of the -// ELF file. -struct MachineInfo { - MachineInfo(uint16_t EM, uint8_t ABI, bool Is64, bool IsLittle) - : EMachine(EM), OSABI(ABI), Is64Bit(Is64), IsLittleEndian(IsLittle) {} - // Alternative constructor that defaults to NONE for OSABI. - MachineInfo(uint16_t EM, bool Is64, bool IsLittle) - : MachineInfo(EM, ELF::ELFOSABI_NONE, Is64, IsLittle) {} - // Default constructor for unset fields. - MachineInfo() : MachineInfo(0, 0, false, false) {} - uint16_t EMachine; - uint8_t OSABI; - bool Is64Bit; - bool IsLittleEndian; -}; - -// Flags set by --set-section-flags or --rename-section. Interpretation of these -// is format-specific and not all flags are meaningful for all object file -// formats. This is a bitmask; many section flags may be set. -enum SectionFlag { - SecNone = 0, - SecAlloc = 1 << 0, - SecLoad = 1 << 1, - SecNoload = 1 << 2, - SecReadonly = 1 << 3, - SecDebug = 1 << 4, - SecCode = 1 << 5, - SecData = 1 << 6, - SecRom = 1 << 7, - SecMerge = 1 << 8, - SecStrings = 1 << 9, - SecContents = 1 << 10, - SecShare = 1 << 11, - SecExclude = 1 << 12, - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/SecExclude) -}; - -struct SectionRename { - StringRef OriginalName; - StringRef NewName; - Optional NewFlags; -}; - -struct SectionFlagsUpdate { - StringRef Name; - SectionFlag NewFlags; -}; - -enum class DiscardType { - None, // Default - All, // --discard-all (-x) - Locals, // --discard-locals (-X) -}; - -enum class MatchStyle { - Literal, // Default for symbols. - Wildcard, // Default for sections, or enabled with --wildcard (-w). - Regex, // Enabled with --regex. -}; - -class NameOrPattern { - StringRef Name; - // Regex is shared between multiple CommonConfig instances. - std::shared_ptr R; - std::shared_ptr G; - bool IsPositiveMatch = true; - - NameOrPattern(StringRef N) : Name(N) {} - NameOrPattern(std::shared_ptr R) : R(R) {} - NameOrPattern(std::shared_ptr G, bool IsPositiveMatch) - : G(G), IsPositiveMatch(IsPositiveMatch) {} - -public: - // ErrorCallback is used to handle recoverable errors. An Error returned - // by the callback aborts the parsing and is then returned by this function. - static Expected - create(StringRef Pattern, MatchStyle MS, - llvm::function_ref ErrorCallback); - - bool isPositiveMatch() const { return IsPositiveMatch; } - Optional getName() const { - if (!R && !G) - return Name; - return None; - } - bool operator==(StringRef S) const { - return R ? R->match(S) : G ? 
G->match(S) : Name == S; - } - bool operator!=(StringRef S) const { return !operator==(S); } -}; - -// Matcher that checks symbol or section names against the command line flags -// provided for that option. -class NameMatcher { - DenseSet PosNames; - std::vector PosPatterns; - std::vector NegMatchers; - -public: - Error addMatcher(Expected Matcher) { - if (!Matcher) - return Matcher.takeError(); - if (Matcher->isPositiveMatch()) { - if (Optional MaybeName = Matcher->getName()) - PosNames.insert(CachedHashStringRef(*MaybeName)); - else - PosPatterns.push_back(std::move(*Matcher)); - } else { - NegMatchers.push_back(std::move(*Matcher)); - } - return Error::success(); - } - bool matches(StringRef S) const { - return (PosNames.contains(CachedHashStringRef(S)) || - is_contained(PosPatterns, S)) && - !is_contained(NegMatchers, S); - } - bool empty() const { - return PosNames.empty() && PosPatterns.empty() && NegMatchers.empty(); - } -}; - -enum class SymbolFlag { - Global, - Local, - Weak, - Default, - Hidden, - Protected, - File, - Section, - Object, - Function, - IndirectFunction, - Debug, - Constructor, - Warning, - Indirect, - Synthetic, - UniqueObject, -}; - -// Symbol info specified by --add-symbol option. Symbol flags not supported -// by a concrete format should be ignored. -struct NewSymbolInfo { - StringRef SymbolName; - StringRef SectionName; - uint64_t Value = 0; - std::vector Flags; - std::vector BeforeSyms; -}; - -// Configuration for copying/stripping a single file. -struct CommonConfig { - // Main input/output options - StringRef InputFilename; - FileFormat InputFormat = FileFormat::Unspecified; - StringRef OutputFilename; - FileFormat OutputFormat = FileFormat::Unspecified; - - // Only applicable when --output-format!=binary (e.g. elf64-x86-64). - Optional OutputArch; - - // Advanced options - StringRef AddGnuDebugLink; - // Cached gnu_debuglink's target CRC - uint32_t GnuDebugLinkCRC32; - Optional ExtractPartition; - StringRef SplitDWO; - StringRef SymbolsPrefix; - StringRef AllocSectionsPrefix; - DiscardType DiscardMode = DiscardType::None; - - // Repeated options - std::vector AddSection; - std::vector DumpSection; - std::vector UpdateSection; - - // Section matchers - NameMatcher KeepSection; - NameMatcher OnlySection; - NameMatcher ToRemove; - - // Symbol matchers - NameMatcher SymbolsToGlobalize; - NameMatcher SymbolsToKeep; - NameMatcher SymbolsToLocalize; - NameMatcher SymbolsToRemove; - NameMatcher UnneededSymbolsToRemove; - NameMatcher SymbolsToWeaken; - NameMatcher SymbolsToKeepGlobal; - - // Map options - StringMap SectionsToRename; - StringMap SetSectionAlignment; - StringMap SetSectionFlags; - StringMap SymbolsToRename; - - // Symbol info specified by --add-symbol option. 
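A usage sketch of the NameMatcher semantics defined above: positive names and patterns select, '!'-prefixed wildcard patterns veto, and matches() requires a positive hit with no veto. The pattern strings here are hypothetical:

    // Assumes the NameMatcher/NameOrPattern declarations above.
    auto PropagateErr = [](Error E) -> Error { return E; };
    NameMatcher Matcher;
    cantFail(Matcher.addMatcher(
        NameOrPattern::create("foo*", MatchStyle::Wildcard, PropagateErr)));
    cantFail(Matcher.addMatcher(
        NameOrPattern::create("!foo_keep", MatchStyle::Wildcard, PropagateErr)));
    Matcher.matches("foo_bar");  // true: matches "foo*", no veto
    Matcher.matches("foo_keep"); // false: vetoed by "!foo_keep"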
- std::vector SymbolsToAdd; - - // Boolean options - bool DeterministicArchives = true; - bool ExtractDWO = false; - bool ExtractMainPartition = false; - bool OnlyKeepDebug = false; - bool PreserveDates = false; - bool StripAll = false; - bool StripAllGNU = false; - bool StripDWO = false; - bool StripDebug = false; - bool StripNonAlloc = false; - bool StripSections = false; - bool StripUnneeded = false; - bool Weaken = false; - bool DecompressDebugSections = false; - - DebugCompressionType CompressionType = DebugCompressionType::None; -}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H diff --git a/llvm/tools/llvm-objcopy/ConfigManager.cpp b/llvm/tools/llvm-objcopy/ConfigManager.cpp deleted file mode 100644 index 90730c421a46..000000000000 --- a/llvm/tools/llvm-objcopy/ConfigManager.cpp +++ /dev/null @@ -1,1432 +0,0 @@ -//===- ConfigManager.cpp --------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ConfigManager.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/Option/Arg.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Support/CRC.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compression.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/StringSaver.h" -#include - -using namespace llvm; -using namespace llvm::objcopy; - -namespace { -enum ObjcopyID { - OBJCOPY_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - OBJCOPY_##ID, -#include "ObjcopyOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE; -#include "ObjcopyOpts.inc" -#undef PREFIX - -const opt::OptTable::Info ObjcopyInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {OBJCOPY_##PREFIX, \ - NAME, \ - HELPTEXT, \ - METAVAR, \ - OBJCOPY_##ID, \ - opt::Option::KIND##Class, \ - PARAM, \ - FLAGS, \ - OBJCOPY_##GROUP, \ - OBJCOPY_##ALIAS, \ - ALIASARGS, \ - VALUES}, -#include "ObjcopyOpts.inc" -#undef OPTION -}; - -class ObjcopyOptTable : public opt::OptTable { -public: - ObjcopyOptTable() : OptTable(ObjcopyInfoTable) { - setGroupedShortOptions(true); - } -}; - -enum InstallNameToolID { - INSTALL_NAME_TOOL_INVALID = 0, // This is not an option ID. 
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - INSTALL_NAME_TOOL_##ID, -#include "InstallNameToolOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) \ - const char *const INSTALL_NAME_TOOL_##NAME[] = VALUE; -#include "InstallNameToolOpts.inc" -#undef PREFIX - -const opt::OptTable::Info InstallNameToolInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {INSTALL_NAME_TOOL_##PREFIX, \ - NAME, \ - HELPTEXT, \ - METAVAR, \ - INSTALL_NAME_TOOL_##ID, \ - opt::Option::KIND##Class, \ - PARAM, \ - FLAGS, \ - INSTALL_NAME_TOOL_##GROUP, \ - INSTALL_NAME_TOOL_##ALIAS, \ - ALIASARGS, \ - VALUES}, -#include "InstallNameToolOpts.inc" -#undef OPTION -}; - -class InstallNameToolOptTable : public opt::OptTable { -public: - InstallNameToolOptTable() : OptTable(InstallNameToolInfoTable) {} -}; - -enum BitcodeStripID { - BITCODE_STRIP_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - BITCODE_STRIP_##ID, -#include "BitcodeStripOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const BITCODE_STRIP_##NAME[] = VALUE; -#include "BitcodeStripOpts.inc" -#undef PREFIX - -const opt::OptTable::Info BitcodeStripInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {BITCODE_STRIP_##PREFIX, \ - NAME, \ - HELPTEXT, \ - METAVAR, \ - BITCODE_STRIP_##ID, \ - opt::Option::KIND##Class, \ - PARAM, \ - FLAGS, \ - BITCODE_STRIP_##GROUP, \ - BITCODE_STRIP_##ALIAS, \ - ALIASARGS, \ - VALUES}, -#include "BitcodeStripOpts.inc" -#undef OPTION -}; - -class BitcodeStripOptTable : public opt::OptTable { -public: - BitcodeStripOptTable() : OptTable(BitcodeStripInfoTable) {} -}; - -enum StripID { - STRIP_INVALID = 0, // This is not an option ID. 
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - STRIP_##ID, -#include "StripOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE; -#include "StripOpts.inc" -#undef PREFIX - -const opt::OptTable::Info StripInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {STRIP_##PREFIX, NAME, HELPTEXT, \ - METAVAR, STRIP_##ID, opt::Option::KIND##Class, \ - PARAM, FLAGS, STRIP_##GROUP, \ - STRIP_##ALIAS, ALIASARGS, VALUES}, -#include "StripOpts.inc" -#undef OPTION -}; - -class StripOptTable : public opt::OptTable { -public: - StripOptTable() : OptTable(StripInfoTable) { setGroupedShortOptions(true); } -}; - -} // namespace - -static SectionFlag parseSectionRenameFlag(StringRef SectionName) { - return llvm::StringSwitch(SectionName) - .CaseLower("alloc", SectionFlag::SecAlloc) - .CaseLower("load", SectionFlag::SecLoad) - .CaseLower("noload", SectionFlag::SecNoload) - .CaseLower("readonly", SectionFlag::SecReadonly) - .CaseLower("debug", SectionFlag::SecDebug) - .CaseLower("code", SectionFlag::SecCode) - .CaseLower("data", SectionFlag::SecData) - .CaseLower("rom", SectionFlag::SecRom) - .CaseLower("merge", SectionFlag::SecMerge) - .CaseLower("strings", SectionFlag::SecStrings) - .CaseLower("contents", SectionFlag::SecContents) - .CaseLower("share", SectionFlag::SecShare) - .CaseLower("exclude", SectionFlag::SecExclude) - .Default(SectionFlag::SecNone); -} - -static Expected -parseSectionFlagSet(ArrayRef SectionFlags) { - SectionFlag ParsedFlags = SectionFlag::SecNone; - for (StringRef Flag : SectionFlags) { - SectionFlag ParsedFlag = parseSectionRenameFlag(Flag); - if (ParsedFlag == SectionFlag::SecNone) - return createStringError( - errc::invalid_argument, - "unrecognized section flag '%s'. Flags supported for GNU " - "compatibility: alloc, load, noload, readonly, exclude, debug, " - "code, data, rom, share, contents, merge, strings", - Flag.str().c_str()); - ParsedFlags |= ParsedFlag; - } - - return ParsedFlags; -} - -static Expected parseRenameSectionValue(StringRef FlagValue) { - if (!FlagValue.contains('=')) - return createStringError(errc::invalid_argument, - "bad format for --rename-section: missing '='"); - - // Initial split: ".foo" = ".bar,f1,f2,..." - auto Old2New = FlagValue.split('='); - SectionRename SR; - SR.OriginalName = Old2New.first; - - // Flags split: ".bar" "f1" "f2" ... 
- SmallVector NameAndFlags; - Old2New.second.split(NameAndFlags, ','); - SR.NewName = NameAndFlags[0]; - - if (NameAndFlags.size() > 1) { - Expected ParsedFlagSet = - parseSectionFlagSet(makeArrayRef(NameAndFlags).drop_front()); - if (!ParsedFlagSet) - return ParsedFlagSet.takeError(); - SR.NewFlags = *ParsedFlagSet; - } - - return SR; -} - -static Expected> -parseSetSectionAlignment(StringRef FlagValue) { - if (!FlagValue.contains('=')) - return createStringError( - errc::invalid_argument, - "bad format for --set-section-alignment: missing '='"); - auto Split = StringRef(FlagValue).split('='); - if (Split.first.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --set-section-alignment: missing section name"); - uint64_t NewAlign; - if (Split.second.getAsInteger(0, NewAlign)) - return createStringError( - errc::invalid_argument, - "invalid alignment for --set-section-alignment: '%s'", - Split.second.str().c_str()); - return std::make_pair(Split.first, NewAlign); -} - -static Expected -parseSetSectionFlagValue(StringRef FlagValue) { - if (!StringRef(FlagValue).contains('=')) - return createStringError(errc::invalid_argument, - "bad format for --set-section-flags: missing '='"); - - // Initial split: ".foo" = "f1,f2,..." - auto Section2Flags = StringRef(FlagValue).split('='); - SectionFlagsUpdate SFU; - SFU.Name = Section2Flags.first; - - // Flags split: "f1" "f2" ... - SmallVector SectionFlags; - Section2Flags.second.split(SectionFlags, ','); - Expected ParsedFlagSet = parseSectionFlagSet(SectionFlags); - if (!ParsedFlagSet) - return ParsedFlagSet.takeError(); - SFU.NewFlags = *ParsedFlagSet; - - return SFU; -} - -namespace { -struct TargetInfo { - FileFormat Format; - MachineInfo Machine; -}; -} // namespace - -// FIXME: consolidate with the bfd parsing used by lld. 
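Before the target-name table below, a usage sketch for parseRenameSectionValue above (the section names are hypothetical):

    // --rename-section=.foo=.bar,alloc,readonly
    Expected<SectionRename> SR =
        parseRenameSectionValue(".foo=.bar,alloc,readonly");
    // On success: SR->OriginalName == ".foo", SR->NewName == ".bar", and
    // *SR->NewFlags == (SectionFlag::SecAlloc | SectionFlag::SecReadonly).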
-static const StringMap TargetMap{ - // Name, {EMachine, 64bit, LittleEndian} - // x86 - {"elf32-i386", {ELF::EM_386, false, true}}, - {"elf32-x86-64", {ELF::EM_X86_64, false, true}}, - {"elf64-x86-64", {ELF::EM_X86_64, true, true}}, - // Intel MCU - {"elf32-iamcu", {ELF::EM_IAMCU, false, true}}, - // ARM - {"elf32-littlearm", {ELF::EM_ARM, false, true}}, - // ARM AArch64 - {"elf64-aarch64", {ELF::EM_AARCH64, true, true}}, - {"elf64-littleaarch64", {ELF::EM_AARCH64, true, true}}, - // RISC-V - {"elf32-littleriscv", {ELF::EM_RISCV, false, true}}, - {"elf64-littleriscv", {ELF::EM_RISCV, true, true}}, - // PowerPC - {"elf32-powerpc", {ELF::EM_PPC, false, false}}, - {"elf32-powerpcle", {ELF::EM_PPC, false, true}}, - {"elf64-powerpc", {ELF::EM_PPC64, true, false}}, - {"elf64-powerpcle", {ELF::EM_PPC64, true, true}}, - // MIPS - {"elf32-bigmips", {ELF::EM_MIPS, false, false}}, - {"elf32-ntradbigmips", {ELF::EM_MIPS, false, false}}, - {"elf32-ntradlittlemips", {ELF::EM_MIPS, false, true}}, - {"elf32-tradbigmips", {ELF::EM_MIPS, false, false}}, - {"elf32-tradlittlemips", {ELF::EM_MIPS, false, true}}, - {"elf64-tradbigmips", {ELF::EM_MIPS, true, false}}, - {"elf64-tradlittlemips", {ELF::EM_MIPS, true, true}}, - // SPARC - {"elf32-sparc", {ELF::EM_SPARC, false, false}}, - {"elf32-sparcel", {ELF::EM_SPARC, false, true}}, - {"elf32-hexagon", {ELF::EM_HEXAGON, false, true}}, -}; - -static Expected -getOutputTargetInfoByTargetName(StringRef TargetName) { - StringRef OriginalTargetName = TargetName; - bool IsFreeBSD = TargetName.consume_back("-freebsd"); - auto Iter = TargetMap.find(TargetName); - if (Iter == std::end(TargetMap)) - return createStringError(errc::invalid_argument, - "invalid output format: '%s'", - OriginalTargetName.str().c_str()); - MachineInfo MI = Iter->getValue(); - if (IsFreeBSD) - MI.OSABI = ELF::ELFOSABI_FREEBSD; - - FileFormat Format; - if (TargetName.startswith("elf")) - Format = FileFormat::ELF; - else - // This should never happen because `TargetName` is valid (it certainly - // exists in the TargetMap). - llvm_unreachable("unknown target prefix"); - - return {TargetInfo{Format, MI}}; -} - -static Error addSymbolsFromFile(NameMatcher &Symbols, BumpPtrAllocator &Alloc, - StringRef Filename, MatchStyle MS, - function_ref ErrorCallback) { - StringSaver Saver(Alloc); - SmallVector Lines; - auto BufOrErr = MemoryBuffer::getFile(Filename); - if (!BufOrErr) - return createFileError(Filename, BufOrErr.getError()); - - BufOrErr.get()->getBuffer().split(Lines, '\n'); - for (StringRef Line : Lines) { - // Ignore everything after '#', trim whitespace, and only add the symbol if - // it's not empty. - auto TrimmedLine = Line.split('#').first.trim(); - if (!TrimmedLine.empty()) - if (Error E = Symbols.addMatcher(NameOrPattern::create( - Saver.save(TrimmedLine), MS, ErrorCallback))) - return E; - } - - return Error::success(); -} - -Expected -NameOrPattern::create(StringRef Pattern, MatchStyle MS, - function_ref ErrorCallback) { - switch (MS) { - case MatchStyle::Literal: - return NameOrPattern(Pattern); - case MatchStyle::Wildcard: { - SmallVector Data; - bool IsPositiveMatch = true; - if (Pattern[0] == '!') { - IsPositiveMatch = false; - Pattern = Pattern.drop_front(); - } - Expected GlobOrErr = GlobPattern::create(Pattern); - - // If we couldn't create it as a glob, report the error, but try again with - // a literal if the error reporting is non-fatal. 
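A sketch of that fallback with a non-fatal callback (callback and pattern are hypothetical): a swallowed glob error makes create() below retry the pattern as a literal.

    auto Warn = [](Error E) -> Error {
      logAllUnhandledErrors(std::move(E), errs(), "warning: ");
      return Error::success(); // swallow the glob error
    };
    Expected<NameOrPattern> P =
        NameOrPattern::create("sym[", MatchStyle::Wildcard, Warn);
    // "sym[" is not a valid glob; with the error swallowed, P ends up holding
    // the literal matcher for the string "sym[".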
- if (!GlobOrErr) { - if (Error E = ErrorCallback(GlobOrErr.takeError())) - return std::move(E); - return create(Pattern, MatchStyle::Literal, ErrorCallback); - } - - return NameOrPattern(std::make_shared(*GlobOrErr), - IsPositiveMatch); - } - case MatchStyle::Regex: { - SmallVector Data; - return NameOrPattern(std::make_shared( - ("^" + Pattern.ltrim('^').rtrim('$') + "$").toStringRef(Data))); - } - } - llvm_unreachable("Unhandled llvm.objcopy.MatchStyle enum"); -} - -static Error addSymbolsToRenameFromFile(StringMap &SymbolsToRename, - BumpPtrAllocator &Alloc, - StringRef Filename) { - StringSaver Saver(Alloc); - SmallVector Lines; - auto BufOrErr = MemoryBuffer::getFile(Filename); - if (!BufOrErr) - return createFileError(Filename, BufOrErr.getError()); - - BufOrErr.get()->getBuffer().split(Lines, '\n'); - size_t NumLines = Lines.size(); - for (size_t LineNo = 0; LineNo < NumLines; ++LineNo) { - StringRef TrimmedLine = Lines[LineNo].split('#').first.trim(); - if (TrimmedLine.empty()) - continue; - - std::pair Pair = Saver.save(TrimmedLine).split(' '); - StringRef NewName = Pair.second.trim(); - if (NewName.empty()) - return createStringError(errc::invalid_argument, - "%s:%zu: missing new symbol name", - Filename.str().c_str(), LineNo + 1); - SymbolsToRename.insert({Pair.first, NewName}); - } - return Error::success(); -} - -template static ErrorOr getAsInteger(StringRef Val) { - T Result; - if (Val.getAsInteger(0, Result)) - return errc::invalid_argument; - return Result; -} - -namespace { - -enum class ToolType { Objcopy, Strip, InstallNameTool, BitcodeStrip }; - -} // anonymous namespace - -static void printHelp(const opt::OptTable &OptTable, raw_ostream &OS, - ToolType Tool) { - StringRef HelpText, ToolName; - switch (Tool) { - case ToolType::Objcopy: - ToolName = "llvm-objcopy"; - HelpText = " [options] input [output]"; - break; - case ToolType::Strip: - ToolName = "llvm-strip"; - HelpText = " [options] inputs..."; - break; - case ToolType::InstallNameTool: - ToolName = "llvm-install-name-tool"; - HelpText = " [options] input"; - break; - case ToolType::BitcodeStrip: - ToolName = "llvm-bitcode-strip"; - HelpText = " [options] input"; - break; - } - OptTable.printHelp(OS, (ToolName + HelpText).str().c_str(), - (ToolName + " tool").str().c_str()); - // TODO: Replace this with libOption call once it adds extrahelp support. - // The CommandLine library has a cl::extrahelp class to support this, - // but libOption does not have that yet. - OS << "\nPass @FILE as argument to read options from FILE.\n"; -} - -static Expected parseNewSymbolInfo(StringRef FlagValue) { - // Parse value given with --add-symbol option and create the - // new symbol if possible. The value format for --add-symbol is: - // - // =[
<section>:]<value>[,<flags>]
-  //
-  // where:
-  //   <name> - symbol name, can be empty string
-  //   <section>
- optional section name. If not given ABS symbol is created - // - symbol value, can be decimal or hexadecimal number prefixed - // with 0x. - // - optional flags affecting symbol type, binding or visibility. - NewSymbolInfo SI; - StringRef Value; - std::tie(SI.SymbolName, Value) = FlagValue.split('='); - if (Value.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --add-symbol, missing '=' after '%s'", - SI.SymbolName.str().c_str()); - - if (Value.contains(':')) { - std::tie(SI.SectionName, Value) = Value.split(':'); - if (SI.SectionName.empty() || Value.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --add-symbol, missing section name or symbol value"); - } - - SmallVector Flags; - Value.split(Flags, ','); - if (Flags[0].getAsInteger(0, SI.Value)) - return createStringError(errc::invalid_argument, "bad symbol value: '%s'", - Flags[0].str().c_str()); - - using Functor = std::function; - SmallVector UnsupportedFlags; - for (size_t I = 1, NumFlags = Flags.size(); I < NumFlags; ++I) - static_cast( - StringSwitch(Flags[I]) - .CaseLower("global", - [&] { SI.Flags.push_back(SymbolFlag::Global); }) - .CaseLower("local", [&] { SI.Flags.push_back(SymbolFlag::Local); }) - .CaseLower("weak", [&] { SI.Flags.push_back(SymbolFlag::Weak); }) - .CaseLower("default", - [&] { SI.Flags.push_back(SymbolFlag::Default); }) - .CaseLower("hidden", - [&] { SI.Flags.push_back(SymbolFlag::Hidden); }) - .CaseLower("protected", - [&] { SI.Flags.push_back(SymbolFlag::Protected); }) - .CaseLower("file", [&] { SI.Flags.push_back(SymbolFlag::File); }) - .CaseLower("section", - [&] { SI.Flags.push_back(SymbolFlag::Section); }) - .CaseLower("object", - [&] { SI.Flags.push_back(SymbolFlag::Object); }) - .CaseLower("function", - [&] { SI.Flags.push_back(SymbolFlag::Function); }) - .CaseLower( - "indirect-function", - [&] { SI.Flags.push_back(SymbolFlag::IndirectFunction); }) - .CaseLower("debug", [&] { SI.Flags.push_back(SymbolFlag::Debug); }) - .CaseLower("constructor", - [&] { SI.Flags.push_back(SymbolFlag::Constructor); }) - .CaseLower("warning", - [&] { SI.Flags.push_back(SymbolFlag::Warning); }) - .CaseLower("indirect", - [&] { SI.Flags.push_back(SymbolFlag::Indirect); }) - .CaseLower("synthetic", - [&] { SI.Flags.push_back(SymbolFlag::Synthetic); }) - .CaseLower("unique-object", - [&] { SI.Flags.push_back(SymbolFlag::UniqueObject); }) - .StartsWithLower("before=", - [&] { - StringRef SymNamePart = - Flags[I].split('=').second; - - if (!SymNamePart.empty()) - SI.BeforeSyms.push_back(SymNamePart); - }) - .Default([&] { UnsupportedFlags.push_back(Flags[I]); }))(); - if (!UnsupportedFlags.empty()) - return createStringError(errc::invalid_argument, - "unsupported flag%s for --add-symbol: '%s'", - UnsupportedFlags.size() > 1 ? 
"s" : "", - join(UnsupportedFlags, "', '").c_str()); - - return SI; -} - -Expected ConfigManager::getELFConfig() const { - return ELF; -} - -Expected ConfigManager::getCOFFConfig() const { - if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || !Common.DumpSection.empty() || - !Common.KeepSection.empty() || !Common.SymbolsToGlobalize.empty() || - !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() || - !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || - !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripDWO || - Common.StripNonAlloc || Common.StripSections || Common.Weaken || - Common.DecompressDebugSections || - Common.DiscardMode == DiscardType::Locals || - !Common.SymbolsToAdd.empty()) { - return createStringError(llvm::errc::invalid_argument, - "option not supported by llvm-objcopy for COFF"); - } - - return COFF; -} - -Expected ConfigManager::getMachOConfig() const { - if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() || - !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() || - !Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() || - !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || - !Common.UnneededSymbolsToRemove.empty() || - !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU || - Common.StripDWO || Common.StripNonAlloc || Common.StripSections || - Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded || - Common.DiscardMode == DiscardType::Locals || - !Common.SymbolsToAdd.empty()) { - return createStringError(llvm::errc::invalid_argument, - "option not supported by llvm-objcopy for MachO"); - } - - return MachO; -} - -Expected ConfigManager::getWasmConfig() const { - if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition || - !Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || - Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() || - !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() || - !Common.SymbolsToKeep.empty() || !Common.SymbolsToRemove.empty() || - !Common.UnneededSymbolsToRemove.empty() || - !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || - !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty()) { - return createStringError( - llvm::errc::invalid_argument, - "only flags for section dumping, removal, and addition are supported"); - } - - return Wasm; -} - -// ParseObjcopyOptions returns the config and sets the input arguments. If a -// help flag is set then ParseObjcopyOptions will print the help messege and -// exit. 
-Expected -objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, - function_ref ErrorCallback) { - DriverConfig DC; - ObjcopyOptTable T; - - const char *const *DashDash = - std::find_if(RawArgsArr.begin(), RawArgsArr.end(), - [](StringRef Str) { return Str == "--"; }); - ArrayRef ArgsArr = makeArrayRef(RawArgsArr.begin(), DashDash); - if (DashDash != RawArgsArr.end()) - DashDash = std::next(DashDash); - - unsigned MissingArgumentIndex, MissingArgumentCount; - llvm::opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) { - printHelp(T, errs(), ToolType::Objcopy); - exit(1); - } - - if (InputArgs.hasArg(OBJCOPY_help)) { - printHelp(T, outs(), ToolType::Objcopy); - exit(0); - } - - if (InputArgs.hasArg(OBJCOPY_version)) { - outs() << "llvm-objcopy, compatible with GNU objcopy\n"; - cl::PrintVersionMessage(); - exit(0); - } - - SmallVector Positional; - - for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - - for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT)) - Positional.push_back(Arg->getValue()); - std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); - - if (Positional.empty()) - return createStringError(errc::invalid_argument, "no input file specified"); - - if (Positional.size() > 2) - return createStringError(errc::invalid_argument, - "too many positional arguments"); - - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - COFFConfig &COFFConfig = ConfigMgr.COFF; - ELFConfig &ELFConfig = ConfigMgr.ELF; - MachOConfig &MachOConfig = ConfigMgr.MachO; - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1]; - if (InputArgs.hasArg(OBJCOPY_target) && - (InputArgs.hasArg(OBJCOPY_input_target) || - InputArgs.hasArg(OBJCOPY_output_target))) - return createStringError( - errc::invalid_argument, - "--target cannot be used with --input-target or --output-target"); - - if (InputArgs.hasArg(OBJCOPY_regex) && InputArgs.hasArg(OBJCOPY_wildcard)) - return createStringError(errc::invalid_argument, - "--regex and --wildcard are incompatible"); - - MatchStyle SectionMatchStyle = InputArgs.hasArg(OBJCOPY_regex) - ? MatchStyle::Regex - : MatchStyle::Wildcard; - MatchStyle SymbolMatchStyle = InputArgs.hasArg(OBJCOPY_regex) - ? MatchStyle::Regex - : InputArgs.hasArg(OBJCOPY_wildcard) - ? MatchStyle::Wildcard - : MatchStyle::Literal; - StringRef InputFormat, OutputFormat; - if (InputArgs.hasArg(OBJCOPY_target)) { - InputFormat = InputArgs.getLastArgValue(OBJCOPY_target); - OutputFormat = InputArgs.getLastArgValue(OBJCOPY_target); - } else { - InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); - OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); - } - - // FIXME: Currently, we ignore the target for non-binary/ihex formats - // explicitly specified by -I option (e.g. -Ielf32-x86-64) and guess the - // format by llvm::object::createBinary regardless of the option value. 
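parseObjcopyOptions above first splits the raw argument list at a literal "--", so that anything after it is always treated as a positional input. A condensed sketch with hypothetical arguments:

    const char *Argv[] = {"--strip-debug", "--", "--odd-file-name"};
    ArrayRef<const char *> Raw(Argv);
    const char *const *DashDash = std::find_if(
        Raw.begin(), Raw.end(), [](StringRef S) { return S == "--"; });
    ArrayRef<const char *> Opts = makeArrayRef(Raw.begin(), DashDash);
    // Opts holds only "--strip-debug"; "--odd-file-name" is consumed as an
    // input file even though it looks like an option.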
- Config.InputFormat = StringSwitch(InputFormat) - .Case("binary", FileFormat::Binary) - .Case("ihex", FileFormat::IHex) - .Default(FileFormat::Unspecified); - - if (InputArgs.hasArg(OBJCOPY_new_symbol_visibility)) { - const uint8_t Invalid = 0xff; - StringRef VisibilityStr = - InputArgs.getLastArgValue(OBJCOPY_new_symbol_visibility); - - ELFConfig.NewSymbolVisibility = StringSwitch(VisibilityStr) - .Case("default", ELF::STV_DEFAULT) - .Case("hidden", ELF::STV_HIDDEN) - .Case("internal", ELF::STV_INTERNAL) - .Case("protected", ELF::STV_PROTECTED) - .Default(Invalid); - - if (ELFConfig.NewSymbolVisibility == Invalid) - return createStringError(errc::invalid_argument, - "'%s' is not a valid symbol visibility", - VisibilityStr.str().c_str()); - } - - for (const auto *Arg : InputArgs.filtered(OBJCOPY_subsystem)) { - StringRef Subsystem, Version; - std::tie(Subsystem, Version) = StringRef(Arg->getValue()).split(':'); - COFFConfig.Subsystem = - StringSwitch(Subsystem.lower()) - .Case("boot_application", - COFF::IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION) - .Case("console", COFF::IMAGE_SUBSYSTEM_WINDOWS_CUI) - .Case("efi_application", COFF::IMAGE_SUBSYSTEM_EFI_APPLICATION) - .Case("efi_boot_service_driver", - COFF::IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER) - .Case("efi_rom", COFF::IMAGE_SUBSYSTEM_EFI_ROM) - .Case("efi_runtime_driver", - COFF::IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER) - .Case("native", COFF::IMAGE_SUBSYSTEM_NATIVE) - .Case("posix", COFF::IMAGE_SUBSYSTEM_POSIX_CUI) - .Case("windows", COFF::IMAGE_SUBSYSTEM_WINDOWS_GUI) - .Default(COFF::IMAGE_SUBSYSTEM_UNKNOWN); - if (*COFFConfig.Subsystem == COFF::IMAGE_SUBSYSTEM_UNKNOWN) - return createStringError(errc::invalid_argument, - "'%s' is not a valid subsystem", - Subsystem.str().c_str()); - if (!Version.empty()) { - StringRef Major, Minor; - std::tie(Major, Minor) = Version.split('.'); - unsigned Number; - if (Major.getAsInteger(10, Number)) - return createStringError(errc::invalid_argument, - "'%s' is not a valid subsystem major version", - Major.str().c_str()); - COFFConfig.MajorSubsystemVersion = Number; - Number = 0; - if (!Minor.empty() && Minor.getAsInteger(10, Number)) - return createStringError(errc::invalid_argument, - "'%s' is not a valid subsystem minor version", - Minor.str().c_str()); - COFFConfig.MinorSubsystemVersion = Number; - } - } - - Config.OutputFormat = StringSwitch(OutputFormat) - .Case("binary", FileFormat::Binary) - .Case("ihex", FileFormat::IHex) - .Default(FileFormat::Unspecified); - if (Config.OutputFormat == FileFormat::Unspecified) { - if (OutputFormat.empty()) { - Config.OutputFormat = Config.InputFormat; - } else { - Expected Target = - getOutputTargetInfoByTargetName(OutputFormat); - if (!Target) - return Target.takeError(); - Config.OutputFormat = Target->Format; - Config.OutputArch = Target->Machine; - } - } - - if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, - OBJCOPY_compress_debug_sections_eq)) { - Config.CompressionType = DebugCompressionType::Z; - - if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) { - Config.CompressionType = - StringSwitch( - InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)) - .Case("zlib-gnu", DebugCompressionType::GNU) - .Case("zlib", DebugCompressionType::Z) - .Default(DebugCompressionType::None); - if (Config.CompressionType == DebugCompressionType::None) - return createStringError( - errc::invalid_argument, - "invalid or unsupported --compress-debug-sections format: %s", - 
InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq) - .str() - .c_str()); - } - if (!zlib::isAvailable()) - return createStringError( - errc::invalid_argument, - "LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress"); - } - - Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink); - // The gnu_debuglink's target is expected to not change or else its CRC would - // become invalidated and get rejected. We can avoid recalculating the - // checksum for every target file inside an archive by precomputing the CRC - // here. This prevents a significant amount of I/O. - if (!Config.AddGnuDebugLink.empty()) { - auto DebugOrErr = MemoryBuffer::getFile(Config.AddGnuDebugLink); - if (!DebugOrErr) - return createFileError(Config.AddGnuDebugLink, DebugOrErr.getError()); - auto Debug = std::move(*DebugOrErr); - Config.GnuDebugLinkCRC32 = - llvm::crc32(arrayRefFromStringRef(Debug->getBuffer())); - } - Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo); - Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols); - Config.AllocSectionsPrefix = - InputArgs.getLastArgValue(OBJCOPY_prefix_alloc_sections); - if (auto Arg = InputArgs.getLastArg(OBJCOPY_extract_partition)) - Config.ExtractPartition = Arg->getValue(); - - for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) { - if (!StringRef(Arg->getValue()).contains('=')) - return createStringError(errc::invalid_argument, - "bad format for --redefine-sym"); - auto Old2New = StringRef(Arg->getValue()).split('='); - if (!Config.SymbolsToRename.insert(Old2New).second) - return createStringError(errc::invalid_argument, - "multiple redefinition of symbol '%s'", - Old2New.first.str().c_str()); - } - - for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbols)) - if (Error E = addSymbolsToRenameFromFile(Config.SymbolsToRename, DC.Alloc, - Arg->getValue())) - return std::move(E); - - for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) { - Expected SR = - parseRenameSectionValue(StringRef(Arg->getValue())); - if (!SR) - return SR.takeError(); - if (!Config.SectionsToRename.try_emplace(SR->OriginalName, *SR).second) - return createStringError(errc::invalid_argument, - "multiple renames of section '%s'", - SR->OriginalName.str().c_str()); - } - for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_alignment)) { - Expected> NameAndAlign = - parseSetSectionAlignment(Arg->getValue()); - if (!NameAndAlign) - return NameAndAlign.takeError(); - Config.SetSectionAlignment[NameAndAlign->first] = NameAndAlign->second; - } - for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_flags)) { - Expected SFU = - parseSetSectionFlagValue(Arg->getValue()); - if (!SFU) - return SFU.takeError(); - if (!Config.SetSectionFlags.try_emplace(SFU->Name, *SFU).second) - return createStringError( - errc::invalid_argument, - "--set-section-flags set multiple times for section '%s'", - SFU->Name.str().c_str()); - } - // Prohibit combinations of --set-section-flags when the section name is used - // by --rename-section, either as a source or a destination. 
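The gnu_debuglink CRC precomputation above uses real LLVM APIs (MemoryBuffer::getFile, arrayRefFromStringRef, llvm::crc32); a standalone sketch with a hypothetical file name:

    auto DebugOrErr = MemoryBuffer::getFile("app.debug"); // hypothetical path
    if (DebugOrErr) {
      uint32_t CRC =
          llvm::crc32(arrayRefFromStringRef((*DebugOrErr)->getBuffer()));
      (void)CRC; // the value later stored alongside the .gnu_debuglink name
    }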
- for (const auto &E : Config.SectionsToRename) { - const SectionRename &SR = E.second; - if (Config.SetSectionFlags.count(SR.OriginalName)) - return createStringError( - errc::invalid_argument, - "--set-section-flags=%s conflicts with --rename-section=%s=%s", - SR.OriginalName.str().c_str(), SR.OriginalName.str().c_str(), - SR.NewName.str().c_str()); - if (Config.SetSectionFlags.count(SR.NewName)) - return createStringError( - errc::invalid_argument, - "--set-section-flags=%s conflicts with --rename-section=%s=%s", - SR.NewName.str().c_str(), SR.OriginalName.str().c_str(), - SR.NewName.str().c_str()); - } - - for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section)) - if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( - Arg->getValue(), SectionMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_section)) - if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( - Arg->getValue(), SectionMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_only_section)) - if (Error E = Config.OnlySection.addMatcher(NameOrPattern::create( - Arg->getValue(), SectionMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) { - StringRef ArgValue(Arg->getValue()); - if (!ArgValue.contains('=')) - return createStringError(errc::invalid_argument, - "bad format for --add-section: missing '='"); - if (ArgValue.split("=").second.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --add-section: missing file name"); - Config.AddSection.push_back(ArgValue); - } - for (auto Arg : InputArgs.filtered(OBJCOPY_update_section)) { - StringRef ArgValue(Arg->getValue()); - if (!ArgValue.contains('=')) - return createStringError(errc::invalid_argument, - "bad format for --update-section: missing '='"); - if (ArgValue.split("=").second.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --update-section: missing file name"); - Config.UpdateSection.push_back(ArgValue); - } - for (auto *Arg : InputArgs.filtered(OBJCOPY_dump_section)) { - StringRef Value(Arg->getValue()); - if (Value.split('=').second.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --dump-section, expected section=file"); - Config.DumpSection.push_back(Value); - } - Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all); - Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu); - Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug); - Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo); - Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections); - Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc); - Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded); - Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo); - Config.ExtractMainPartition = - InputArgs.hasArg(OBJCOPY_extract_main_partition); - ELFConfig.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden); - Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken); - if (InputArgs.hasArg(OBJCOPY_discard_all, OBJCOPY_discard_locals)) - Config.DiscardMode = - InputArgs.hasFlag(OBJCOPY_discard_all, OBJCOPY_discard_locals) - ? 
DiscardType::All - : DiscardType::Locals; - Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug); - ELFConfig.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols); - MachOConfig.KeepUndefined = InputArgs.hasArg(OBJCOPY_keep_undefined); - Config.DecompressDebugSections = - InputArgs.hasArg(OBJCOPY_decompress_debug_sections); - if (Config.DiscardMode == DiscardType::All) { - Config.StripDebug = true; - ELFConfig.KeepFileSymbols = true; - } - for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) - if (Error E = Config.SymbolsToLocalize.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToLocalize, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol)) - if (Error E = Config.SymbolsToKeepGlobal.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToKeepGlobal, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) - if (Error E = Config.SymbolsToGlobalize.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToGlobalize, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol)) - if (Error E = Config.SymbolsToWeaken.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToWeaken, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol)) - if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToRemove, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbol)) - if (Error E = - Config.UnneededSymbolsToRemove.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbols)) - if (Error E = addSymbolsFromFile(Config.UnneededSymbolsToRemove, DC.Alloc, - Arg->getValue(), SymbolMatchStyle, - ErrorCallback)) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol)) - if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbols)) - if (Error E = - addSymbolsFromFile(Config.SymbolsToKeep, DC.Alloc, Arg->getValue(), - SymbolMatchStyle, ErrorCallback)) - return std::move(E); - for (auto *Arg : InputArgs.filtered(OBJCOPY_add_symbol)) { - Expected SymInfo = 
parseNewSymbolInfo(Arg->getValue()); - if (!SymInfo) - return SymInfo.takeError(); - - Config.SymbolsToAdd.push_back(*SymInfo); - } - - ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links); - - Config.DeterministicArchives = InputArgs.hasFlag( - OBJCOPY_enable_deterministic_archives, - OBJCOPY_disable_deterministic_archives, /*default=*/true); - - Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates); - - if (Config.PreserveDates && - (Config.OutputFilename == "-" || Config.InputFilename == "-")) - return createStringError(errc::invalid_argument, - "--preserve-dates requires a file"); - - for (auto Arg : InputArgs) - if (Arg->getOption().matches(OBJCOPY_set_start)) { - auto EAddr = getAsInteger(Arg->getValue()); - if (!EAddr) - return createStringError( - EAddr.getError(), "bad entry point address: '%s'", Arg->getValue()); - - ELFConfig.EntryExpr = [EAddr](uint64_t) { return *EAddr; }; - } else if (Arg->getOption().matches(OBJCOPY_change_start)) { - auto EIncr = getAsInteger(Arg->getValue()); - if (!EIncr) - return createStringError(EIncr.getError(), - "bad entry point increment: '%s'", - Arg->getValue()); - auto Expr = ELFConfig.EntryExpr ? std::move(ELFConfig.EntryExpr) - : [](uint64_t A) { return A; }; - ELFConfig.EntryExpr = [Expr, EIncr](uint64_t EAddr) { - return Expr(EAddr) + *EIncr; - }; - } - - if (Config.DecompressDebugSections && - Config.CompressionType != DebugCompressionType::None) { - return createStringError( - errc::invalid_argument, - "cannot specify both --compress-debug-sections and " - "--decompress-debug-sections"); - } - - if (Config.DecompressDebugSections && !zlib::isAvailable()) - return createStringError( - errc::invalid_argument, - "LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress"); - - if (Config.ExtractPartition && Config.ExtractMainPartition) - return createStringError(errc::invalid_argument, - "cannot specify --extract-partition together with " - "--extract-main-partition"); - - DC.CopyConfigs.push_back(std::move(ConfigMgr)); - return std::move(DC); -} - -// ParseInstallNameToolOptions returns the config and sets the input arguments. -// If a help flag is set then ParseInstallNameToolOptions will print the help -// messege and exit. 
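The --set-start/--change-start handling above builds EntryExpr by wrapping closures, so later options compose with earlier ones. A standalone sketch of the same composition with hypothetical addresses:

    #include <cassert>
    #include <cstdint>
    #include <functional>

    int main() {
      std::function<uint64_t(uint64_t)> Entry; // unset until an option is seen
      // --set-start=0x400000 discards the old entry point.
      Entry = [](uint64_t) { return UINT64_C(0x400000); };
      // --change-start=0x10 wraps whatever expression is already installed.
      auto Prev = std::move(Entry);
      Entry = [Prev](uint64_t Addr) { return Prev(Addr) + 0x10; };
      assert(Entry(0) == 0x400010);
    }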
-Expected -objcopy::parseInstallNameToolOptions(ArrayRef ArgsArr) { - DriverConfig DC; - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - MachOConfig &MachOConfig = ConfigMgr.MachO; - InstallNameToolOptTable T; - unsigned MissingArgumentIndex, MissingArgumentCount; - llvm::opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - if (MissingArgumentCount) - return createStringError( - errc::invalid_argument, - "missing argument to " + - StringRef(InputArgs.getArgString(MissingArgumentIndex)) + - " option"); - - if (InputArgs.size() == 0) { - printHelp(T, errs(), ToolType::InstallNameTool); - exit(1); - } - - if (InputArgs.hasArg(INSTALL_NAME_TOOL_help)) { - printHelp(T, outs(), ToolType::InstallNameTool); - exit(0); - } - - if (InputArgs.hasArg(INSTALL_NAME_TOOL_version)) { - outs() << "llvm-install-name-tool, compatible with cctools " - "install_name_tool\n"; - cl::PrintVersionMessage(); - exit(0); - } - - for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_add_rpath)) - MachOConfig.RPathToAdd.push_back(Arg->getValue()); - - for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_prepend_rpath)) - MachOConfig.RPathToPrepend.push_back(Arg->getValue()); - - for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_delete_rpath)) { - StringRef RPath = Arg->getValue(); - - // Cannot add and delete the same rpath at the same time. - if (is_contained(MachOConfig.RPathToAdd, RPath)) - return createStringError( - errc::invalid_argument, - "cannot specify both -add_rpath '%s' and -delete_rpath '%s'", - RPath.str().c_str(), RPath.str().c_str()); - if (is_contained(MachOConfig.RPathToPrepend, RPath)) - return createStringError( - errc::invalid_argument, - "cannot specify both -prepend_rpath '%s' and -delete_rpath '%s'", - RPath.str().c_str(), RPath.str().c_str()); - - MachOConfig.RPathsToRemove.insert(RPath); - } - - for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_rpath)) { - StringRef Old = Arg->getValue(0); - StringRef New = Arg->getValue(1); - - auto Match = [=](StringRef RPath) { return RPath == Old || RPath == New; }; - - // Cannot specify duplicate -rpath entries - auto It1 = find_if( - MachOConfig.RPathsToUpdate, - [&Match](const DenseMap::value_type &OldNew) { - return Match(OldNew.getFirst()) || Match(OldNew.getSecond()); - }); - if (It1 != MachOConfig.RPathsToUpdate.end()) - return createStringError(errc::invalid_argument, - "cannot specify both -rpath '" + - It1->getFirst() + "' '" + It1->getSecond() + - "' and -rpath '" + Old + "' '" + New + "'"); - - // Cannot specify the same rpath under both -delete_rpath and -rpath - auto It2 = find_if(MachOConfig.RPathsToRemove, Match); - if (It2 != MachOConfig.RPathsToRemove.end()) - return createStringError(errc::invalid_argument, - "cannot specify both -delete_rpath '" + *It2 + - "' and -rpath '" + Old + "' '" + New + "'"); - - // Cannot specify the same rpath under both -add_rpath and -rpath - auto It3 = find_if(MachOConfig.RPathToAdd, Match); - if (It3 != MachOConfig.RPathToAdd.end()) - return createStringError(errc::invalid_argument, - "cannot specify both -add_rpath '" + *It3 + - "' and -rpath '" + Old + "' '" + New + "'"); - - // Cannot specify the same rpath under both -prepend_rpath and -rpath. 
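Each of the -rpath conflict checks above and below follows the same shape: reject the new old/new pair if either string already occurs in a previously recorded set. A condensed sketch with hypothetical paths:

    DenseMap<StringRef, StringRef> Updates;
    Updates.insert({"/old", "/new"});
    StringRef Old = "/new", New = "/newer";
    auto Match = [=](StringRef R) { return R == Old || R == New; };
    bool Conflict = llvm::any_of(Updates, [&](const auto &P) {
      return Match(P.getFirst()) || Match(P.getSecond());
    });
    // Conflict == true: "/new" is already recorded as the replacement of "/old".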
- auto It4 = find_if(MachOConfig.RPathToPrepend, Match); - if (It4 != MachOConfig.RPathToPrepend.end()) - return createStringError(errc::invalid_argument, - "cannot specify both -prepend_rpath '" + *It4 + - "' and -rpath '" + Old + "' '" + New + "'"); - - MachOConfig.RPathsToUpdate.insert({Old, New}); - } - - if (auto *Arg = InputArgs.getLastArg(INSTALL_NAME_TOOL_id)) { - MachOConfig.SharedLibId = Arg->getValue(); - if (MachOConfig.SharedLibId->empty()) - return createStringError(errc::invalid_argument, - "cannot specify an empty id"); - } - - for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_change)) - MachOConfig.InstallNamesToUpdate.insert( - {Arg->getValue(0), Arg->getValue(1)}); - - MachOConfig.RemoveAllRpaths = - InputArgs.hasArg(INSTALL_NAME_TOOL_delete_all_rpaths); - - SmallVector Positional; - for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_INPUT)) - Positional.push_back(Arg->getValue()); - if (Positional.empty()) - return createStringError(errc::invalid_argument, "no input file specified"); - if (Positional.size() > 1) - return createStringError( - errc::invalid_argument, - "llvm-install-name-tool expects a single input file"); - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[0]; - - DC.CopyConfigs.push_back(std::move(ConfigMgr)); - return std::move(DC); -} - -Expected -objcopy::parseBitcodeStripOptions(ArrayRef ArgsArr) { - DriverConfig DC; - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - BitcodeStripOptTable T; - unsigned MissingArgumentIndex, MissingArgumentCount; - opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - if (InputArgs.size() == 0) { - printHelp(T, errs(), ToolType::BitcodeStrip); - exit(1); - } - - if (InputArgs.hasArg(BITCODE_STRIP_help)) { - printHelp(T, outs(), ToolType::BitcodeStrip); - exit(0); - } - - if (InputArgs.hasArg(BITCODE_STRIP_version)) { - outs() << "llvm-bitcode-strip, compatible with cctools " - "bitcode_strip\n"; - cl::PrintVersionMessage(); - exit(0); - } - - for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - - SmallVector Positional; - for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - if (Positional.size() > 1) - return createStringError(errc::invalid_argument, - "llvm-bitcode-strip expects a single input file"); - assert(!Positional.empty()); - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[0]; - - DC.CopyConfigs.push_back(std::move(ConfigMgr)); - return std::move(DC); -} - -// ParseStripOptions returns the config and sets the input arguments. If a -// help flag is set then ParseStripOptions will print the help messege and -// exit. 
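parseStripOptions below enables --strip-all implicitly when no other stripping option was requested; a condensed restatement of that default:

    // True only if nothing explicitly asked for stripping.
    bool NothingRequested = !Config.StripDebug && !Config.StripUnneeded &&
                            !Config.StripAllGNU &&
                            Config.DiscardMode == DiscardType::None &&
                            Config.SymbolsToRemove.empty();
    if (!InputArgs.hasArg(STRIP_no_strip_all) && NothingRequested)
      Config.StripAll = true; // bare `llvm-strip file` means --strip-all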
-Expected -objcopy::parseStripOptions(ArrayRef RawArgsArr, - function_ref ErrorCallback) { - const char *const *DashDash = - std::find_if(RawArgsArr.begin(), RawArgsArr.end(), - [](StringRef Str) { return Str == "--"; }); - ArrayRef ArgsArr = makeArrayRef(RawArgsArr.begin(), DashDash); - if (DashDash != RawArgsArr.end()) - DashDash = std::next(DashDash); - - StripOptTable T; - unsigned MissingArgumentIndex, MissingArgumentCount; - llvm::opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) { - printHelp(T, errs(), ToolType::Strip); - exit(1); - } - - if (InputArgs.hasArg(STRIP_help)) { - printHelp(T, outs(), ToolType::Strip); - exit(0); - } - - if (InputArgs.hasArg(STRIP_version)) { - outs() << "llvm-strip, compatible with GNU strip\n"; - cl::PrintVersionMessage(); - exit(0); - } - - SmallVector Positional; - for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - for (auto Arg : InputArgs.filtered(STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); - - if (Positional.empty()) - return createStringError(errc::invalid_argument, "no input file specified"); - - if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) - return createStringError( - errc::invalid_argument, - "multiple input files cannot be used in combination with -o"); - - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - ELFConfig &ELFConfig = ConfigMgr.ELF; - MachOConfig &MachOConfig = ConfigMgr.MachO; - - if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard)) - return createStringError(errc::invalid_argument, - "--regex and --wildcard are incompatible"); - MatchStyle SectionMatchStyle = - InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex : MatchStyle::Wildcard; - MatchStyle SymbolMatchStyle = InputArgs.hasArg(STRIP_regex) - ? MatchStyle::Regex - : InputArgs.hasArg(STRIP_wildcard) - ? MatchStyle::Wildcard - : MatchStyle::Literal; - ELFConfig.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links); - Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); - - if (InputArgs.hasArg(STRIP_discard_all, STRIP_discard_locals)) - Config.DiscardMode = - InputArgs.hasFlag(STRIP_discard_all, STRIP_discard_locals) - ? 
DiscardType::All - : DiscardType::Locals; - Config.StripSections = InputArgs.hasArg(STRIP_strip_sections); - Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded); - if (auto Arg = InputArgs.getLastArg(STRIP_strip_all, STRIP_no_strip_all)) - Config.StripAll = Arg->getOption().getID() == STRIP_strip_all; - Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu); - MachOConfig.StripSwiftSymbols = InputArgs.hasArg(STRIP_strip_swift_symbols); - Config.OnlyKeepDebug = InputArgs.hasArg(STRIP_only_keep_debug); - ELFConfig.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols); - MachOConfig.KeepUndefined = InputArgs.hasArg(STRIP_keep_undefined); - - for (auto Arg : InputArgs.filtered(STRIP_keep_section)) - if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( - Arg->getValue(), SectionMatchStyle, ErrorCallback))) - return std::move(E); - - for (auto Arg : InputArgs.filtered(STRIP_remove_section)) - if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( - Arg->getValue(), SectionMatchStyle, ErrorCallback))) - return std::move(E); - - for (auto Arg : InputArgs.filtered(STRIP_strip_symbol)) - if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - - for (auto Arg : InputArgs.filtered(STRIP_keep_symbol)) - if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( - Arg->getValue(), SymbolMatchStyle, ErrorCallback))) - return std::move(E); - - if (!InputArgs.hasArg(STRIP_no_strip_all) && !Config.StripDebug && - !Config.StripUnneeded && Config.DiscardMode == DiscardType::None && - !Config.StripAllGNU && Config.SymbolsToRemove.empty()) - Config.StripAll = true; - - if (Config.DiscardMode == DiscardType::All) { - Config.StripDebug = true; - ELFConfig.KeepFileSymbols = true; - } - - Config.DeterministicArchives = - InputArgs.hasFlag(STRIP_enable_deterministic_archives, - STRIP_disable_deterministic_archives, /*default=*/true); - - Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates); - Config.InputFormat = FileFormat::Unspecified; - Config.OutputFormat = FileFormat::Unspecified; - - DriverConfig DC; - if (Positional.size() == 1) { - Config.InputFilename = Positional[0]; - Config.OutputFilename = - InputArgs.getLastArgValue(STRIP_output, Positional[0]); - DC.CopyConfigs.push_back(std::move(ConfigMgr)); - } else { - StringMap InputFiles; - for (StringRef Filename : Positional) { - if (InputFiles[Filename]++ == 1) { - if (Filename == "-") - return createStringError( - errc::invalid_argument, - "cannot specify '-' as an input file more than once"); - if (Error E = ErrorCallback(createStringError( - errc::invalid_argument, "'%s' was already specified", - Filename.str().c_str()))) - return std::move(E); - } - Config.InputFilename = Filename; - Config.OutputFilename = Filename; - DC.CopyConfigs.push_back(ConfigMgr); - } - } - - if (Config.PreserveDates && (is_contained(Positional, "-") || - InputArgs.getLastArgValue(STRIP_output) == "-")) - return createStringError(errc::invalid_argument, - "--preserve-dates requires a file"); - - return std::move(DC); -} diff --git a/llvm/tools/llvm-objcopy/ConfigManager.h b/llvm/tools/llvm-objcopy/ConfigManager.h deleted file mode 100644 index c0d0e8bbc721..000000000000 --- a/llvm/tools/llvm-objcopy/ConfigManager.h +++ /dev/null @@ -1,80 +0,0 @@ -//===- ConfigManager.h ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H
-#define LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H
-
-#include "COFF/COFFConfig.h"
-#include "CommonConfig.h"
-#include "ELF/ELFConfig.h"
-#include "MachO/MachOConfig.h"
-#include "MultiFormatConfig.h"
-#include "wasm/WasmConfig.h"
-#include "llvm/Support/Allocator.h"
-#include <vector>
-
-namespace llvm {
-namespace objcopy {
-
-// ConfigManager keeps all configurations and prepares
-// format-specific options.
-struct ConfigManager : public MultiFormatConfig {
-  virtual ~ConfigManager() {}
-
-  const CommonConfig &getCommonConfig() const override { return Common; }
-  Expected<const ELFConfig &> getELFConfig() const override;
-  Expected<const COFFConfig &> getCOFFConfig() const override;
-  Expected<const MachOConfig &> getMachOConfig() const override;
-  Expected<const WasmConfig &> getWasmConfig() const override;
-
-  // All configs.
-  CommonConfig Common;
-  ELFConfig ELF;
-  COFFConfig COFF;
-  MachOConfig MachO;
-  WasmConfig Wasm;
-};
-
-// Configuration for the overall invocation of this tool. When invoked as
-// objcopy, will always contain exactly one CopyConfig. When invoked as strip,
-// will contain one or more CopyConfigs.
-struct DriverConfig {
-  SmallVector<ConfigManager, 1> CopyConfigs;
-  BumpPtrAllocator Alloc;
-};
-
-// ParseObjcopyOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseObjcopyOptions will print the help message and
-// exit. ErrorCallback is used to handle recoverable errors. An Error returned
-// by the callback aborts the parsing and is then returned by this function.
-Expected<DriverConfig>
-parseObjcopyOptions(ArrayRef<const char *> ArgsArr,
-                    llvm::function_ref<Error(Error)> ErrorCallback);
-
-// ParseInstallNameToolOptions returns the config and sets the input arguments.
-// If a help flag is set then ParseInstallNameToolOptions will print the help
-// message and exit.
-Expected<DriverConfig>
-parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr);
-
-// ParseBitcodeStripOptions returns the config and sets the input arguments.
-// If a help flag is set then ParseBitcodeStripOptions will print the help
-// message and exit.
-Expected<DriverConfig> parseBitcodeStripOptions(ArrayRef<const char *> ArgsArr);
-
-// ParseStripOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseStripOptions will print the help message and
-// exit. ErrorCallback is used to handle recoverable errors. An Error returned
-// by the callback aborts the parsing and is then returned by this function.
-Expected<DriverConfig>
-parseStripOptions(ArrayRef<const char *> ArgsArr,
-                  llvm::function_ref<Error(Error)> ErrorCallback);
-} // namespace objcopy
-} // namespace llvm
-
-#endif // LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H
diff --git a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h b/llvm/tools/llvm-objcopy/ELF/ELFConfig.h
deleted file mode 100644
index 229a8d61fb83..000000000000
--- a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===- ELFConfig.h ----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H - -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Object/ELFTypes.h" -#include - -namespace llvm { -namespace objcopy { - -// ELF specific configuration for copying/stripping a single file. -struct ELFConfig { - uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT; - - // ELF entry point address expression. The input parameter is an entry point - // address in the input ELF file. The entry address in the output file is - // calculated with EntryExpr(input_address), when either --set-start or - // --change-start is used. - std::function EntryExpr; - - bool AllowBrokenLinks = false; - bool KeepFileSymbols = false; - bool LocalizeHidden = false; -}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp deleted file mode 100644 index f8521fa0d5b7..000000000000 --- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp +++ /dev/null @@ -1,833 +0,0 @@ -//===- ELFObjcopy.cpp -----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ELFObjcopy.h" -#include "CommonConfig.h" -#include "ELFConfig.h" -#include "Object.h" -#include "llvm-objcopy.h" -#include "llvm/ADT/BitmaskEnum.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/Object/Binary.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/ELFTypes.h" -#include "llvm/Object/Error.h" -#include "llvm/Option/Option.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compression.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Memory.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; -using namespace llvm::ELF; -using namespace llvm::objcopy; -using namespace llvm::objcopy::elf; -using namespace llvm::object; - -using SectionPred = std::function; - -static bool isDebugSection(const SectionBase &Sec) { - return StringRef(Sec.Name).startswith(".debug") || - StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index"; -} - -static bool isDWOSection(const SectionBase &Sec) { - return StringRef(Sec.Name).endswith(".dwo"); -} - -static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) { - // We can't remove the section header string table. - if (&Sec == Obj.SectionNames) - return false; - // Short of keeping the string table we want to keep everything that is a DWO - // section and remove everything else. 
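// --strip-dwo and --extract-dwo are two sides of split DWARF: one removes
// every .dwo section, the other keeps only them (plus the section header
// string table, as noted above). A minimal sketch of the name test behind
// that split, assuming a plain std::string instead of SectionBase:
#include <cstddef>
#include <string>

// Hypothetical helper: a section is on the .dwo side if its name ends in
// ".dwo", e.g. ".debug_info.dwo".
static bool nameIsDWO(const std::string &Name) {
  static const char Suffix[] = ".dwo";
  const std::size_t N = sizeof(Suffix) - 1;
  return Name.size() >= N && Name.compare(Name.size() - N, N, Suffix) == 0;
}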
- return !isDWOSection(Sec);
-}
-
-static uint64_t getNewShfFlags(SectionFlag AllFlags) {
-  uint64_t NewFlags = 0;
-  if (AllFlags & SectionFlag::SecAlloc)
-    NewFlags |= ELF::SHF_ALLOC;
-  if (!(AllFlags & SectionFlag::SecReadonly))
-    NewFlags |= ELF::SHF_WRITE;
-  if (AllFlags & SectionFlag::SecCode)
-    NewFlags |= ELF::SHF_EXECINSTR;
-  if (AllFlags & SectionFlag::SecMerge)
-    NewFlags |= ELF::SHF_MERGE;
-  if (AllFlags & SectionFlag::SecStrings)
-    NewFlags |= ELF::SHF_STRINGS;
-  if (AllFlags & SectionFlag::SecExclude)
-    NewFlags |= ELF::SHF_EXCLUDE;
-  return NewFlags;
-}
-
-static uint64_t getSectionFlagsPreserveMask(uint64_t OldFlags,
-                                            uint64_t NewFlags) {
-  // Preserve some flags which should not be dropped when setting flags.
-  // Also, preserve anything OS/processor dependent.
-  const uint64_t PreserveMask =
-      (ELF::SHF_COMPRESSED | ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
-       ELF::SHF_MASKOS | ELF::SHF_MASKPROC | ELF::SHF_TLS |
-       ELF::SHF_INFO_LINK) &
-      ~ELF::SHF_EXCLUDE;
-  return (OldFlags & PreserveMask) | (NewFlags & ~PreserveMask);
-}
-
-static void setSectionFlagsAndType(SectionBase &Sec, SectionFlag Flags) {
-  Sec.Flags = getSectionFlagsPreserveMask(Sec.Flags, getNewShfFlags(Flags));
-
-  // In GNU objcopy, certain flags promote SHT_NOBITS to SHT_PROGBITS. This rule
-  // may promote more non-ALLOC sections than GNU objcopy, but it is fine as
-  // non-ALLOC SHT_NOBITS sections do not make much sense.
-  if (Sec.Type == SHT_NOBITS &&
-      (!(Sec.Flags & ELF::SHF_ALLOC) ||
-       Flags & (SectionFlag::SecContents | SectionFlag::SecLoad)))
-    Sec.Type = SHT_PROGBITS;
-}
-
-static ElfType getOutputElfType(const Binary &Bin) {
-  // Infer output ELF type from the input ELF object
-  if (isa<ELFObjectFile<ELF32LE>>(Bin))
-    return ELFT_ELF32LE;
-  if (isa<ELFObjectFile<ELF64LE>>(Bin))
-    return ELFT_ELF64LE;
-  if (isa<ELFObjectFile<ELF32BE>>(Bin))
-    return ELFT_ELF32BE;
-  if (isa<ELFObjectFile<ELF64BE>>(Bin))
-    return ELFT_ELF64BE;
-  llvm_unreachable("Invalid ELFType");
-}
-
-static ElfType getOutputElfType(const MachineInfo &MI) {
-  // Infer output ELF type from the binary arch specified
-  if (MI.Is64Bit)
-    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
-  else
-    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
-}
-
-static std::unique_ptr<Writer> createELFWriter(const CommonConfig &Config,
-                                               Object &Obj, raw_ostream &Out,
-                                               ElfType OutputElfType) {
-  // Depending on the initial ELFT and OutputFormat we need a different Writer.
-  switch (OutputElfType) {
-  case ELFT_ELF32LE:
-    return std::make_unique<ELFWriter<ELF32LE>>(Obj, Out, !Config.StripSections,
-                                                Config.OnlyKeepDebug);
-  case ELFT_ELF64LE:
-    return std::make_unique<ELFWriter<ELF64LE>>(Obj, Out, !Config.StripSections,
-                                                Config.OnlyKeepDebug);
-  case ELFT_ELF32BE:
-    return std::make_unique<ELFWriter<ELF32BE>>(Obj, Out, !Config.StripSections,
-                                                Config.OnlyKeepDebug);
-  case ELFT_ELF64BE:
-    return std::make_unique<ELFWriter<ELF64BE>>(Obj, Out, !Config.StripSections,
-                                                Config.OnlyKeepDebug);
-  }
-  llvm_unreachable("Invalid output format");
-}
-
-static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
-                                            Object &Obj, raw_ostream &Out,
-                                            ElfType OutputElfType) {
-  switch (Config.OutputFormat) {
-  case FileFormat::Binary:
-    return std::make_unique<BinaryWriter>(Obj, Out);
-  case FileFormat::IHex:
-    return std::make_unique<IHexWriter>(Obj, Out);
-  default:
-    return createELFWriter(Config, Obj, Out, OutputElfType);
-  }
-}
-
-template <class... Ts>
-static Error makeStringError(std::error_code EC, const Twine &Msg,
-                             Ts &&...
Args) { - std::string FullMsg = (EC.message() + ": " + Msg).str(); - return createStringError(EC, FullMsg.c_str(), std::forward(Args)...); -} - -static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { - for (auto &Sec : Obj.sections()) { - if (Sec.Name == SecName) { - if (Sec.Type == SHT_NOBITS) - return createStringError(object_error::parse_failed, - "cannot dump section '%s': it has no contents", - SecName.str().c_str()); - Expected> BufferOrErr = - FileOutputBuffer::create(Filename, Sec.OriginalData.size()); - if (!BufferOrErr) - return BufferOrErr.takeError(); - std::unique_ptr Buf = std::move(*BufferOrErr); - std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), - Buf->getBufferStart()); - if (Error E = Buf->commit()) - return E; - return Error::success(); - } - } - return createStringError(object_error::parse_failed, "section '%s' not found", - SecName.str().c_str()); -} - -static bool isCompressable(const SectionBase &Sec) { - return !(Sec.Flags & ELF::SHF_COMPRESSED) && - StringRef(Sec.Name).startswith(".debug"); -} - -static Error replaceDebugSections( - Object &Obj, function_ref ShouldReplace, - function_ref(const SectionBase *)> AddSection) { - // Build a list of the debug sections we are going to replace. - // We can't call `AddSection` while iterating over sections, - // because it would mutate the sections array. - SmallVector ToReplace; - for (auto &Sec : Obj.sections()) - if (ShouldReplace(Sec)) - ToReplace.push_back(&Sec); - - // Build a mapping from original section to a new one. - DenseMap FromTo; - for (SectionBase *S : ToReplace) { - Expected NewSection = AddSection(S); - if (!NewSection) - return NewSection.takeError(); - - FromTo[S] = *NewSection; - } - - return Obj.replaceSections(FromTo); -} - -static bool isAArch64MappingSymbol(const Symbol &Sym) { - if (Sym.Binding != STB_LOCAL || Sym.Type != STT_NOTYPE || - Sym.getShndx() == SHN_UNDEF) - return false; - StringRef Name = Sym.Name; - if (!Name.consume_front("$x") && !Name.consume_front("$d")) - return false; - return Name.empty() || Name.startswith("."); -} - -static bool isArmMappingSymbol(const Symbol &Sym) { - if (Sym.Binding != STB_LOCAL || Sym.Type != STT_NOTYPE || - Sym.getShndx() == SHN_UNDEF) - return false; - StringRef Name = Sym.Name; - if (!Name.consume_front("$a") && !Name.consume_front("$d") && - !Name.consume_front("$t")) - return false; - return Name.empty() || Name.startswith("."); -} - -// Check if the symbol should be preserved because it is required by ABI. -static bool isRequiredByABISymbol(const Object &Obj, const Symbol &Sym) { - switch (Obj.Machine) { - case EM_AARCH64: - // Mapping symbols should be preserved for a relocatable object file. - return Obj.isRelocatable() && isAArch64MappingSymbol(Sym); - case EM_ARM: - // Mapping symbols should be preserved for a relocatable object file. - return Obj.isRelocatable() && isArmMappingSymbol(Sym); - default: - return false; - } -} - -static bool isUnneededSymbol(const Symbol &Sym) { - return !Sym.Referenced && - (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) && - Sym.Type != STT_SECTION; -} - -static Error updateAndRemoveSymbols(const CommonConfig &Config, - const ELFConfig &ELFConfig, Object &Obj) { - // TODO: update or remove symbols only if there is an option that affects - // them. 
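// Arm and AArch64 mapping symbols ("$a"/"$t"/"$x" for code, "$d" for
// data, optionally followed by a "."-suffix) delimit instruction and data
// ranges, which is why the checks above keep them in relocatable objects.
// A minimal sketch of the AArch64 name test, assuming llvm::StringRef:
#include "llvm/ADT/StringRef.h"

static bool looksLikeAArch64MappingSymbolName(llvm::StringRef Name) {
  // Mirrors the consume_front() logic above: "$x" or "$d", either bare or
  // with a "."-prefixed suffix such as "$x.0".
  if (!Name.consume_front("$x") && !Name.consume_front("$d"))
    return false;
  return Name.empty() || Name.startswith(".");
}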
- if (!Obj.SymbolTable) - return Error::success(); - - Obj.SymbolTable->updateSymbols([&](Symbol &Sym) { - // Common and undefined symbols don't make sense as local symbols, and can - // even cause crashes if we localize those, so skip them. - if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF && - ((ELFConfig.LocalizeHidden && - (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) || - Config.SymbolsToLocalize.matches(Sym.Name))) - Sym.Binding = STB_LOCAL; - - // Note: these two globalize flags have very similar names but different - // meanings: - // - // --globalize-symbol: promote a symbol to global - // --keep-global-symbol: all symbols except for these should be made local - // - // If --globalize-symbol is specified for a given symbol, it will be - // global in the output file even if it is not included via - // --keep-global-symbol. Because of that, make sure to check - // --globalize-symbol second. - if (!Config.SymbolsToKeepGlobal.empty() && - !Config.SymbolsToKeepGlobal.matches(Sym.Name) && - Sym.getShndx() != SHN_UNDEF) - Sym.Binding = STB_LOCAL; - - if (Config.SymbolsToGlobalize.matches(Sym.Name) && - Sym.getShndx() != SHN_UNDEF) - Sym.Binding = STB_GLOBAL; - - if (Config.SymbolsToWeaken.matches(Sym.Name) && Sym.Binding == STB_GLOBAL) - Sym.Binding = STB_WEAK; - - if (Config.Weaken && Sym.Binding == STB_GLOBAL && - Sym.getShndx() != SHN_UNDEF) - Sym.Binding = STB_WEAK; - - const auto I = Config.SymbolsToRename.find(Sym.Name); - if (I != Config.SymbolsToRename.end()) - Sym.Name = std::string(I->getValue()); - - if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION) - Sym.Name = (Config.SymbolsPrefix + Sym.Name).str(); - }); - - // The purpose of this loop is to mark symbols referenced by sections - // (like GroupSection or RelocationSection). This way, we know which - // symbols are still 'needed' and which are not. - if (Config.StripUnneeded || !Config.UnneededSymbolsToRemove.empty() || - !Config.OnlySection.empty()) { - for (SectionBase &Sec : Obj.sections()) - Sec.markSymbols(); - } - - auto RemoveSymbolsPred = [&](const Symbol &Sym) { - if (Config.SymbolsToKeep.matches(Sym.Name) || - (ELFConfig.KeepFileSymbols && Sym.Type == STT_FILE)) - return false; - - if (Config.SymbolsToRemove.matches(Sym.Name)) - return true; - - if (Config.StripAll || Config.StripAllGNU) - return true; - - if (isRequiredByABISymbol(Obj, Sym)) - return false; - - if (Config.StripDebug && Sym.Type == STT_FILE) - return true; - - if ((Config.DiscardMode == DiscardType::All || - (Config.DiscardMode == DiscardType::Locals && - StringRef(Sym.Name).startswith(".L"))) && - Sym.Binding == STB_LOCAL && Sym.getShndx() != SHN_UNDEF && - Sym.Type != STT_FILE && Sym.Type != STT_SECTION) - return true; - - if ((Config.StripUnneeded || - Config.UnneededSymbolsToRemove.matches(Sym.Name)) && - (!Obj.isRelocatable() || isUnneededSymbol(Sym))) - return true; - - // We want to remove undefined symbols if all references have been stripped. 
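// RemoveSymbolsPred above is ordered so that explicit keeps win over
// every removal rule: --keep-symbol and --keep-file-symbols are tested
// before --strip-symbol, --strip-all, and the discard modes. A minimal
// sketch of that precedence, assuming bare bools in place of the real
// config matchers:
struct SymbolDecision {
  bool MatchesKeepList;   // --keep-symbol NAME matched
  bool MatchesRemoveList; // --strip-symbol NAME matched
  bool StripAll;          // --strip-all in effect
};

static bool shouldRemoveSymbol(const SymbolDecision &D) {
  if (D.MatchesKeepList)
    return false; // keeps overrule all removals
  if (D.MatchesRemoveList)
    return true;
  return D.StripAll;
}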
- if (!Config.OnlySection.empty() && !Sym.Referenced && - Sym.getShndx() == SHN_UNDEF) - return true; - - return false; - }; - - return Obj.removeSymbols(RemoveSymbolsPred); -} - -static Error replaceAndRemoveSections(const CommonConfig &Config, - const ELFConfig &ELFConfig, Object &Obj) { - SectionPred RemovePred = [](const SectionBase &) { return false; }; - - // Removes: - if (!Config.ToRemove.empty()) { - RemovePred = [&Config](const SectionBase &Sec) { - return Config.ToRemove.matches(Sec.Name); - }; - } - - if (Config.StripDWO) - RemovePred = [RemovePred](const SectionBase &Sec) { - return isDWOSection(Sec) || RemovePred(Sec); - }; - - if (Config.ExtractDWO) - RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { - return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec); - }; - - if (Config.StripAllGNU) - RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { - if (RemovePred(Sec)) - return true; - if ((Sec.Flags & SHF_ALLOC) != 0) - return false; - if (&Sec == Obj.SectionNames) - return false; - switch (Sec.Type) { - case SHT_SYMTAB: - case SHT_REL: - case SHT_RELA: - case SHT_STRTAB: - return true; - } - return isDebugSection(Sec); - }; - - if (Config.StripSections) { - RemovePred = [RemovePred](const SectionBase &Sec) { - return RemovePred(Sec) || Sec.ParentSegment == nullptr; - }; - } - - if (Config.StripDebug || Config.StripUnneeded) { - RemovePred = [RemovePred](const SectionBase &Sec) { - return RemovePred(Sec) || isDebugSection(Sec); - }; - } - - if (Config.StripNonAlloc) - RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { - if (RemovePred(Sec)) - return true; - if (&Sec == Obj.SectionNames) - return false; - return (Sec.Flags & SHF_ALLOC) == 0 && Sec.ParentSegment == nullptr; - }; - - if (Config.StripAll) - RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { - if (RemovePred(Sec)) - return true; - if (&Sec == Obj.SectionNames) - return false; - if (StringRef(Sec.Name).startswith(".gnu.warning")) - return false; - // We keep the .ARM.attribute section to maintain compatibility - // with Debian derived distributions. This is a bug in their - // patchset as documented here: - // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=943798 - if (Sec.Type == SHT_ARM_ATTRIBUTES) - return false; - if (Sec.ParentSegment != nullptr) - return false; - return (Sec.Flags & SHF_ALLOC) == 0; - }; - - if (Config.ExtractPartition || Config.ExtractMainPartition) { - RemovePred = [RemovePred](const SectionBase &Sec) { - if (RemovePred(Sec)) - return true; - if (Sec.Type == SHT_LLVM_PART_EHDR || Sec.Type == SHT_LLVM_PART_PHDR) - return true; - return (Sec.Flags & SHF_ALLOC) != 0 && !Sec.ParentSegment; - }; - } - - // Explicit copies: - if (!Config.OnlySection.empty()) { - RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) { - // Explicitly keep these sections regardless of previous removes. - if (Config.OnlySection.matches(Sec.Name)) - return false; - - // Allow all implicit removes. - if (RemovePred(Sec)) - return true; - - // Keep special sections. - if (Obj.SectionNames == &Sec) - return false; - if (Obj.SymbolTable == &Sec || - (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec)) - return false; - - // Remove everything else. - return true; - }; - } - - if (!Config.KeepSection.empty()) { - RemovePred = [&Config, RemovePred](const SectionBase &Sec) { - // Explicitly keep these sections regardless of previous removes. - if (Config.KeepSection.matches(Sec.Name)) - return false; - // Otherwise defer to RemovePred. 
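// Each option in replaceAndRemoveSections layers its rule over the
// previous one by capturing the old RemovePred in a new lambda, which is
// how the result stays independent of option order. A minimal sketch of
// the chaining pattern itself, assuming std::function for brevity (the
// real code captures by value in the same way):
#include <functional>
#include <string>

using SectionPredicate = std::function<bool(const std::string &)>;

static SectionPredicate chainRemoveRule(SectionPredicate Prev,
                                        std::string Prefix) {
  // New predicate: remove sections whose name starts with Prefix, plus
  // everything the previously composed predicate already removed.
  return [Prev = std::move(Prev), Prefix = std::move(Prefix)](
             const std::string &Name) {
    return Name.rfind(Prefix, 0) == 0 || Prev(Name);
  };
}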
- return RemovePred(Sec); - }; - } - - // This has to be the last predicate assignment. - // If the option --keep-symbol has been specified - // and at least one of those symbols is present - // (equivalently, the updated symbol table is not empty) - // the symbol table and the string table should not be removed. - if ((!Config.SymbolsToKeep.empty() || ELFConfig.KeepFileSymbols) && - Obj.SymbolTable && !Obj.SymbolTable->empty()) { - RemovePred = [&Obj, RemovePred](const SectionBase &Sec) { - if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab()) - return false; - return RemovePred(Sec); - }; - } - - if (Error E = Obj.removeSections(ELFConfig.AllowBrokenLinks, RemovePred)) - return E; - - if (Config.CompressionType != DebugCompressionType::None) { - if (Error Err = replaceDebugSections( - Obj, isCompressable, - [&Config, &Obj](const SectionBase *S) -> Expected { - Expected NewSection = - CompressedSection::create(*S, Config.CompressionType); - if (!NewSection) - return NewSection.takeError(); - - return &Obj.addSection(std::move(*NewSection)); - })) - return Err; - } else if (Config.DecompressDebugSections) { - if (Error Err = replaceDebugSections( - Obj, - [](const SectionBase &S) { return isa(&S); }, - [&Obj](const SectionBase *S) { - const CompressedSection *CS = cast(S); - return &Obj.addSection(*CS); - })) - return Err; - } - - return Error::success(); -} - -// Add symbol to the Object symbol table with the specified properties. -static void addSymbol(Object &Obj, const NewSymbolInfo &SymInfo, - uint8_t DefaultVisibility) { - SectionBase *Sec = Obj.findSection(SymInfo.SectionName); - uint64_t Value = Sec ? Sec->Addr + SymInfo.Value : SymInfo.Value; - - uint8_t Bind = ELF::STB_GLOBAL; - uint8_t Type = ELF::STT_NOTYPE; - uint8_t Visibility = DefaultVisibility; - - for (SymbolFlag FlagValue : SymInfo.Flags) - switch (FlagValue) { - case SymbolFlag::Global: - Bind = ELF::STB_GLOBAL; - break; - case SymbolFlag::Local: - Bind = ELF::STB_LOCAL; - break; - case SymbolFlag::Weak: - Bind = ELF::STB_WEAK; - break; - case SymbolFlag::Default: - Visibility = ELF::STV_DEFAULT; - break; - case SymbolFlag::Hidden: - Visibility = ELF::STV_HIDDEN; - break; - case SymbolFlag::Protected: - Visibility = ELF::STV_PROTECTED; - break; - case SymbolFlag::File: - Type = ELF::STT_FILE; - break; - case SymbolFlag::Section: - Type = ELF::STT_SECTION; - break; - case SymbolFlag::Object: - Type = ELF::STT_OBJECT; - break; - case SymbolFlag::Function: - Type = ELF::STT_FUNC; - break; - case SymbolFlag::IndirectFunction: - Type = ELF::STT_GNU_IFUNC; - break; - default: /* Other flag values are ignored for ELF. */ - break; - }; - - Obj.SymbolTable->addSymbol( - SymInfo.SymbolName, Bind, Type, Sec, Value, Visibility, - Sec ? (uint16_t)SYMBOL_SIMPLE_INDEX : (uint16_t)SHN_ABS, 0); -} - -static Error -handleUserSection(StringRef Flag, - function_ref)> F) { - std::pair SecPair = Flag.split("="); - StringRef SecName = SecPair.first; - StringRef File = SecPair.second; - ErrorOr> BufOrErr = MemoryBuffer::getFile(File); - if (!BufOrErr) - return createFileError(File, errorCodeToError(BufOrErr.getError())); - std::unique_ptr Buf = std::move(*BufOrErr); - ArrayRef Data( - reinterpret_cast(Buf->getBufferStart()), - Buf->getBufferSize()); - return F(SecName, Data); -} - -// This function handles the high level operations of GNU objcopy including -// handling command line options. It's important to outline certain properties -// we expect to hold of the command line operations. 
Any operation that "keeps" -// should keep regardless of a remove. Additionally any removal should respect -// any previous removals. Lastly whether or not something is removed shouldn't -// depend a) on the order the options occur in or b) on some opaque priority -// system. The only priority is that keeps/copies overrule removes. -static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, - Object &Obj) { - if (Config.OutputArch) { - Obj.Machine = Config.OutputArch.getValue().EMachine; - Obj.OSABI = Config.OutputArch.getValue().OSABI; - } - - if (!Config.SplitDWO.empty() && Config.ExtractDWO) { - return Obj.removeSections( - ELFConfig.AllowBrokenLinks, - [&Obj](const SectionBase &Sec) { return onlyKeepDWOPred(Obj, Sec); }); - } - - // Dump sections before add/remove for compatibility with GNU objcopy. - for (StringRef Flag : Config.DumpSection) { - StringRef SectionName; - StringRef FileName; - std::tie(SectionName, FileName) = Flag.split('='); - if (Error E = dumpSectionToFile(SectionName, FileName, Obj)) - return E; - } - - // It is important to remove the sections first. For example, we want to - // remove the relocation sections before removing the symbols. That allows - // us to avoid reporting the inappropriate errors about removing symbols - // named in relocations. - if (Error E = replaceAndRemoveSections(Config, ELFConfig, Obj)) - return E; - - if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj)) - return E; - - if (!Config.SectionsToRename.empty()) { - std::vector RelocSections; - DenseSet RenamedSections; - for (SectionBase &Sec : Obj.sections()) { - auto *RelocSec = dyn_cast(&Sec); - const auto Iter = Config.SectionsToRename.find(Sec.Name); - if (Iter != Config.SectionsToRename.end()) { - const SectionRename &SR = Iter->second; - Sec.Name = std::string(SR.NewName); - if (SR.NewFlags.hasValue()) - setSectionFlagsAndType(Sec, SR.NewFlags.getValue()); - RenamedSections.insert(&Sec); - } else if (RelocSec && !(Sec.Flags & SHF_ALLOC)) - // Postpone processing relocation sections which are not specified in - // their explicit '--rename-section' commands until after their target - // sections are renamed. - // Dynamic relocation sections (i.e. ones with SHF_ALLOC) should be - // renamed only explicitly. Otherwise, renaming, for example, '.got.plt' - // would affect '.rela.plt', which is not desirable. - RelocSections.push_back(RelocSec); - } - - // Rename relocation sections according to their target sections. - for (RelocationSectionBase *RelocSec : RelocSections) { - auto Iter = RenamedSections.find(RelocSec->getSection()); - if (Iter != RenamedSections.end()) - RelocSec->Name = (RelocSec->getNamePrefix() + (*Iter)->Name).str(); - } - } - - // Add a prefix to allocated sections and their relocation sections. This - // should be done after renaming the section by Config.SectionToRename to - // imitate the GNU objcopy behavior. - if (!Config.AllocSectionsPrefix.empty()) { - DenseSet PrefixedSections; - for (SectionBase &Sec : Obj.sections()) { - if (Sec.Flags & SHF_ALLOC) { - Sec.Name = (Config.AllocSectionsPrefix + Sec.Name).str(); - PrefixedSections.insert(&Sec); - } else if (auto *RelocSec = dyn_cast(&Sec)) { - // Rename relocation sections associated to the allocated sections. - // For example, if we rename .text to .prefix.text, we also rename - // .rel.text to .rel.prefix.text. 
- // - // Dynamic relocation sections (SHT_REL[A] with SHF_ALLOC) are handled - // above, e.g., .rela.plt is renamed to .prefix.rela.plt, not - // .rela.prefix.plt since GNU objcopy does so. - const SectionBase *TargetSec = RelocSec->getSection(); - if (TargetSec && (TargetSec->Flags & SHF_ALLOC)) { - // If the relocation section comes *after* the target section, we - // don't add Config.AllocSectionsPrefix because we've already added - // the prefix to TargetSec->Name. Otherwise, if the relocation - // section comes *before* the target section, we add the prefix. - if (PrefixedSections.count(TargetSec)) - Sec.Name = (RelocSec->getNamePrefix() + TargetSec->Name).str(); - else - Sec.Name = (RelocSec->getNamePrefix() + Config.AllocSectionsPrefix + - TargetSec->Name) - .str(); - } - } - } - } - - if (!Config.SetSectionAlignment.empty()) { - for (SectionBase &Sec : Obj.sections()) { - auto I = Config.SetSectionAlignment.find(Sec.Name); - if (I != Config.SetSectionAlignment.end()) - Sec.Align = I->second; - } - } - - if (Config.OnlyKeepDebug) - for (auto &Sec : Obj.sections()) - if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE) - Sec.Type = SHT_NOBITS; - - for (const auto &Flag : Config.AddSection) { - auto AddSection = [&](StringRef Name, ArrayRef Data) { - OwnedDataSection &NewSection = - Obj.addSection(Name, Data); - if (Name.startswith(".note") && Name != ".note.GNU-stack") - NewSection.Type = SHT_NOTE; - return Error::success(); - }; - if (Error E = handleUserSection(Flag, AddSection)) - return E; - } - - for (StringRef Flag : Config.UpdateSection) { - auto UpdateSection = [&](StringRef Name, ArrayRef Data) { - return Obj.updateSection(Name, Data); - }; - if (Error E = handleUserSection(Flag, UpdateSection)) - return E; - } - - if (!Config.AddGnuDebugLink.empty()) - Obj.addSection(Config.AddGnuDebugLink, - Config.GnuDebugLinkCRC32); - - // If the symbol table was previously removed, we need to create a new one - // before adding new symbols. - if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty()) - if (Error E = Obj.addNewSymbolTable()) - return E; - - for (const NewSymbolInfo &SI : Config.SymbolsToAdd) - addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility); - - // --set-section-flags works with sections added by --add-section. 
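// --add-section and --update-section take NAME=FILE values, which
// handleUserSection above splits on the first '=' before reading FILE
// into memory. A minimal sketch of the same split, assuming std::string
// (everything after the first '=' is the file name, so file names may
// themselves contain '='):
#include <string>
#include <utility>

static std::pair<std::string, std::string>
splitSectionFlag(const std::string &Flag) {
  // ".note.foo=payload.bin" -> {".note.foo", "payload.bin"}
  std::string::size_type Eq = Flag.find('=');
  if (Eq == std::string::npos)
    return {Flag, std::string()};
  return {Flag.substr(0, Eq), Flag.substr(Eq + 1)};
}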
- if (!Config.SetSectionFlags.empty()) { - for (auto &Sec : Obj.sections()) { - const auto Iter = Config.SetSectionFlags.find(Sec.Name); - if (Iter != Config.SetSectionFlags.end()) { - const SectionFlagsUpdate &SFU = Iter->second; - setSectionFlagsAndType(Sec, SFU.NewFlags); - } - } - } - - if (ELFConfig.EntryExpr) - Obj.Entry = ELFConfig.EntryExpr(Obj.Entry); - return Error::success(); -} - -static Error writeOutput(const CommonConfig &Config, Object &Obj, - raw_ostream &Out, ElfType OutputElfType) { - std::unique_ptr Writer = - createWriter(Config, Obj, Out, OutputElfType); - if (Error E = Writer->finalize()) - return E; - return Writer->write(); -} - -Error objcopy::elf::executeObjcopyOnIHex(const CommonConfig &Config, - const ELFConfig &ELFConfig, - MemoryBuffer &In, raw_ostream &Out) { - IHexReader Reader(&In); - Expected> Obj = Reader.create(true); - if (!Obj) - return Obj.takeError(); - - const ElfType OutputElfType = - getOutputElfType(Config.OutputArch.getValueOr(MachineInfo())); - if (Error E = handleArgs(Config, ELFConfig, **Obj)) - return E; - return writeOutput(Config, **Obj, Out, OutputElfType); -} - -Error objcopy::elf::executeObjcopyOnRawBinary(const CommonConfig &Config, - const ELFConfig &ELFConfig, - MemoryBuffer &In, - raw_ostream &Out) { - BinaryReader Reader(&In, ELFConfig.NewSymbolVisibility); - Expected> Obj = Reader.create(true); - if (!Obj) - return Obj.takeError(); - - // Prefer OutputArch (-O) if set, otherwise fallback to BinaryArch - // (-B). - const ElfType OutputElfType = - getOutputElfType(Config.OutputArch.getValueOr(MachineInfo())); - if (Error E = handleArgs(Config, ELFConfig, **Obj)) - return E; - return writeOutput(Config, **Obj, Out, OutputElfType); -} - -Error objcopy::elf::executeObjcopyOnBinary(const CommonConfig &Config, - const ELFConfig &ELFConfig, - object::ELFObjectFileBase &In, - raw_ostream &Out) { - ELFReader Reader(&In, Config.ExtractPartition); - Expected> Obj = - Reader.create(!Config.SymbolsToAdd.empty()); - if (!Obj) - return Obj.takeError(); - // Prefer OutputArch (-O) if set, otherwise infer it from the input. - const ElfType OutputElfType = - Config.OutputArch ? getOutputElfType(Config.OutputArch.getValue()) - : getOutputElfType(In); - - if (Error E = handleArgs(Config, ELFConfig, **Obj)) - return createFileError(Config.InputFilename, std::move(E)); - - if (Error E = writeOutput(Config, **Obj, Out, OutputElfType)) - return createFileError(Config.InputFilename, std::move(E)); - - return Error::success(); -} diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h deleted file mode 100644 index 852661e68f37..000000000000 --- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h +++ /dev/null @@ -1,40 +0,0 @@ -//===- ELFObjcopy.h ---------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H - -namespace llvm { -class Error; -class MemoryBuffer; -class raw_ostream; - -namespace object { -class ELFObjectFileBase; -} // end namespace object - -namespace objcopy { -struct CommonConfig; -struct ELFConfig; - -namespace elf { -Error executeObjcopyOnIHex(const CommonConfig &Config, - const ELFConfig &ELFConfig, MemoryBuffer &In, - raw_ostream &Out); -Error executeObjcopyOnRawBinary(const CommonConfig &Config, - const ELFConfig &ELFConfig, MemoryBuffer &In, - raw_ostream &Out); -Error executeObjcopyOnBinary(const CommonConfig &Config, - const ELFConfig &ELFConfig, - object::ELFObjectFileBase &In, raw_ostream &Out); - -} // end namespace elf -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/tools/llvm-objcopy/ELF/Object.cpp deleted file mode 100644 index 659e12bf0306..000000000000 --- a/llvm/tools/llvm-objcopy/ELF/Object.cpp +++ /dev/null @@ -1,2826 +0,0 @@ -//===- Object.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Object.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Support/Compression.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FileOutputBuffer.h" -#include "llvm/Support/Path.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; -using namespace llvm::ELF; -using namespace llvm::objcopy::elf; -using namespace llvm::object; - -template void ELFWriter::writePhdr(const Segment &Seg) { - uint8_t *B = reinterpret_cast(Buf->getBufferStart()) + - Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr); - Elf_Phdr &Phdr = *reinterpret_cast(B); - Phdr.p_type = Seg.Type; - Phdr.p_flags = Seg.Flags; - Phdr.p_offset = Seg.Offset; - Phdr.p_vaddr = Seg.VAddr; - Phdr.p_paddr = Seg.PAddr; - Phdr.p_filesz = Seg.FileSize; - Phdr.p_memsz = Seg.MemSize; - Phdr.p_align = Seg.Align; -} - -Error SectionBase::removeSectionReferences( - bool, function_ref) { - return Error::success(); -} - -Error SectionBase::removeSymbols(function_ref) { - return Error::success(); -} - -Error SectionBase::initialize(SectionTableRef) { return Error::success(); } -void SectionBase::finalize() {} -void SectionBase::markSymbols() {} -void SectionBase::replaceSectionReferences( - const DenseMap &) {} -void SectionBase::onRemove() {} - -template void ELFWriter::writeShdr(const SectionBase &Sec) { - uint8_t *B = - reinterpret_cast(Buf->getBufferStart()) + Sec.HeaderOffset; - Elf_Shdr &Shdr = *reinterpret_cast(B); - Shdr.sh_name = Sec.NameIndex; - Shdr.sh_type = Sec.Type; - Shdr.sh_flags = Sec.Flags; - Shdr.sh_addr = Sec.Addr; - Shdr.sh_offset = Sec.Offset; - Shdr.sh_size = Sec.Size; - 
Shdr.sh_link = Sec.Link; - Shdr.sh_info = Sec.Info; - Shdr.sh_addralign = Sec.Align; - Shdr.sh_entsize = Sec.EntrySize; -} - -template Error ELFSectionSizer::visit(Section &) { - return Error::success(); -} - -template Error ELFSectionSizer::visit(OwnedDataSection &) { - return Error::success(); -} - -template Error ELFSectionSizer::visit(StringTableSection &) { - return Error::success(); -} - -template -Error ELFSectionSizer::visit(DynamicRelocationSection &) { - return Error::success(); -} - -template -Error ELFSectionSizer::visit(SymbolTableSection &Sec) { - Sec.EntrySize = sizeof(Elf_Sym); - Sec.Size = Sec.Symbols.size() * Sec.EntrySize; - // Align to the largest field in Elf_Sym. - Sec.Align = ELFT::Is64Bits ? sizeof(Elf_Xword) : sizeof(Elf_Word); - return Error::success(); -} - -template -Error ELFSectionSizer::visit(RelocationSection &Sec) { - Sec.EntrySize = Sec.Type == SHT_REL ? sizeof(Elf_Rel) : sizeof(Elf_Rela); - Sec.Size = Sec.Relocations.size() * Sec.EntrySize; - // Align to the largest field in Elf_Rel(a). - Sec.Align = ELFT::Is64Bits ? sizeof(Elf_Xword) : sizeof(Elf_Word); - return Error::success(); -} - -template -Error ELFSectionSizer::visit(GnuDebugLinkSection &) { - return Error::success(); -} - -template Error ELFSectionSizer::visit(GroupSection &Sec) { - Sec.Size = sizeof(Elf_Word) + Sec.GroupMembers.size() * sizeof(Elf_Word); - return Error::success(); -} - -template -Error ELFSectionSizer::visit(SectionIndexSection &) { - return Error::success(); -} - -template Error ELFSectionSizer::visit(CompressedSection &) { - return Error::success(); -} - -template -Error ELFSectionSizer::visit(DecompressedSection &) { - return Error::success(); -} - -Error BinarySectionWriter::visit(const SectionIndexSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write symbol section index table '" + - Sec.Name + "' "); -} - -Error BinarySectionWriter::visit(const SymbolTableSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write symbol table '" + Sec.Name + - "' out to binary"); -} - -Error BinarySectionWriter::visit(const RelocationSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write relocation section '" + Sec.Name + - "' out to binary"); -} - -Error BinarySectionWriter::visit(const GnuDebugLinkSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write '" + Sec.Name + "' out to binary"); -} - -Error BinarySectionWriter::visit(const GroupSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write '" + Sec.Name + "' out to binary"); -} - -Error SectionWriter::visit(const Section &Sec) { - if (Sec.Type != SHT_NOBITS) - llvm::copy(Sec.Contents, Out.getBufferStart() + Sec.Offset); - - return Error::success(); -} - -static bool addressOverflows32bit(uint64_t Addr) { - // Sign extended 32 bit addresses (e.g 0xFFFFFFFF80000000) are ok - return Addr > UINT32_MAX && Addr + 0x80000000 > UINT32_MAX; -} - -template static T checkedGetHex(StringRef S) { - T Value; - bool Fail = S.getAsInteger(16, Value); - assert(!Fail); - (void)Fail; - return Value; -} - -// Fills exactly Len bytes of buffer with hexadecimal characters -// representing value 'X' -template -static Iterator toHexStr(T X, Iterator It, size_t Len) { - // Fill range with '0' - std::fill(It, It + Len, '0'); - - for (long I = Len - 1; I >= 0; --I) { - unsigned char Mod = static_cast(X) & 15; - *(It + I) = hexdigit(Mod, false); - X >>= 4; - } - assert(X == 0); - return It + Len; 
-} - -uint8_t IHexRecord::getChecksum(StringRef S) { - assert((S.size() & 1) == 0); - uint8_t Checksum = 0; - while (!S.empty()) { - Checksum += checkedGetHex(S.take_front(2)); - S = S.drop_front(2); - } - return -Checksum; -} - -IHexLineData IHexRecord::getLine(uint8_t Type, uint16_t Addr, - ArrayRef Data) { - IHexLineData Line(getLineLength(Data.size())); - assert(Line.size()); - auto Iter = Line.begin(); - *Iter++ = ':'; - Iter = toHexStr(Data.size(), Iter, 2); - Iter = toHexStr(Addr, Iter, 4); - Iter = toHexStr(Type, Iter, 2); - for (uint8_t X : Data) - Iter = toHexStr(X, Iter, 2); - StringRef S(Line.data() + 1, std::distance(Line.begin() + 1, Iter)); - Iter = toHexStr(getChecksum(S), Iter, 2); - *Iter++ = '\r'; - *Iter++ = '\n'; - assert(Iter == Line.end()); - return Line; -} - -static Error checkRecord(const IHexRecord &R) { - switch (R.Type) { - case IHexRecord::Data: - if (R.HexData.size() == 0) - return createStringError( - errc::invalid_argument, - "zero data length is not allowed for data records"); - break; - case IHexRecord::EndOfFile: - break; - case IHexRecord::SegmentAddr: - // 20-bit segment address. Data length must be 2 bytes - // (4 bytes in hex) - if (R.HexData.size() != 4) - return createStringError( - errc::invalid_argument, - "segment address data should be 2 bytes in size"); - break; - case IHexRecord::StartAddr80x86: - case IHexRecord::StartAddr: - if (R.HexData.size() != 8) - return createStringError(errc::invalid_argument, - "start address data should be 4 bytes in size"); - // According to Intel HEX specification '03' record - // only specifies the code address within the 20-bit - // segmented address space of the 8086/80186. This - // means 12 high order bits should be zeroes. - if (R.Type == IHexRecord::StartAddr80x86 && - R.HexData.take_front(3) != "000") - return createStringError(errc::invalid_argument, - "start address exceeds 20 bit for 80x86"); - break; - case IHexRecord::ExtendedAddr: - // 16-31 bits of linear base address - if (R.HexData.size() != 4) - return createStringError( - errc::invalid_argument, - "extended address data should be 2 bytes in size"); - break; - default: - // Unknown record type - return createStringError(errc::invalid_argument, "unknown record type: %u", - static_cast(R.Type)); - } - return Error::success(); -} - -// Checks that IHEX line contains valid characters. -// This allows converting hexadecimal data to integers -// without extra verification. 
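// The record checksum computed by getChecksum above is the two's
// complement of the byte sum, so adding every byte of a valid record,
// checksum included, gives 0 modulo 256. A minimal sketch over decoded
// bytes, assuming the hex pairs have already been converted:
#include <cstdint>
#include <vector>

static uint8_t ihexChecksum(const std::vector<uint8_t> &RecordBytes) {
  uint8_t Sum = 0;
  for (uint8_t B : RecordBytes)
    Sum = static_cast<uint8_t>(Sum + B); // mod-256 arithmetic by design
  return static_cast<uint8_t>(-Sum);     // two's complement of the sum
}
// Example: bytes {0x02, 0x00, 0x00, 0x01} sum to 0x03, so the checksum
// byte is 0xFD and the full record sums to 0x00 mod 256.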
-static Error checkChars(StringRef Line) { - assert(!Line.empty()); - if (Line[0] != ':') - return createStringError(errc::invalid_argument, - "missing ':' in the beginning of line."); - - for (size_t Pos = 1; Pos < Line.size(); ++Pos) - if (hexDigitValue(Line[Pos]) == -1U) - return createStringError(errc::invalid_argument, - "invalid character at position %zu.", Pos + 1); - return Error::success(); -} - -Expected IHexRecord::parse(StringRef Line) { - assert(!Line.empty()); - - // ':' + Length + Address + Type + Checksum with empty data ':LLAAAATTCC' - if (Line.size() < 11) - return createStringError(errc::invalid_argument, - "line is too short: %zu chars.", Line.size()); - - if (Error E = checkChars(Line)) - return std::move(E); - - IHexRecord Rec; - size_t DataLen = checkedGetHex(Line.substr(1, 2)); - if (Line.size() != getLength(DataLen)) - return createStringError(errc::invalid_argument, - "invalid line length %zu (should be %zu)", - Line.size(), getLength(DataLen)); - - Rec.Addr = checkedGetHex(Line.substr(3, 4)); - Rec.Type = checkedGetHex(Line.substr(7, 2)); - Rec.HexData = Line.substr(9, DataLen * 2); - - if (getChecksum(Line.drop_front(1)) != 0) - return createStringError(errc::invalid_argument, "incorrect checksum."); - if (Error E = checkRecord(Rec)) - return std::move(E); - return Rec; -} - -static uint64_t sectionPhysicalAddr(const SectionBase *Sec) { - Segment *Seg = Sec->ParentSegment; - if (Seg && Seg->Type != ELF::PT_LOAD) - Seg = nullptr; - return Seg ? Seg->PAddr + Sec->OriginalOffset - Seg->OriginalOffset - : Sec->Addr; -} - -void IHexSectionWriterBase::writeSection(const SectionBase *Sec, - ArrayRef Data) { - assert(Data.size() == Sec->Size); - const uint32_t ChunkSize = 16; - uint32_t Addr = sectionPhysicalAddr(Sec) & 0xFFFFFFFFU; - while (!Data.empty()) { - uint64_t DataSize = std::min(Data.size(), ChunkSize); - if (Addr > SegmentAddr + BaseAddr + 0xFFFFU) { - if (Addr > 0xFFFFFU) { - // Write extended address record, zeroing segment address - // if needed. 
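// Intel HEX records carry only a 16-bit address, so the writer above
// escapes to type-02 records (a 20-bit 8086-style segment base) while the
// address fits in 20 bits, and to type-04 records (the upper 16 bits of a
// 32-bit linear base) beyond that. A tiny sketch of the type-02
// arithmetic:
#include <cstdint>

static uint32_t segmentPlusOffset(uint16_t Segment, uint16_t Offset) {
  // physical = segment * 16 + offset;
  // e.g. segment 0xF000, offset 0x0123 -> 0xF0000 + 0x0123 = 0xF0123.
  return (static_cast<uint32_t>(Segment) << 4) + Offset;
}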
- if (SegmentAddr != 0) - SegmentAddr = writeSegmentAddr(0U); - BaseAddr = writeBaseAddr(Addr); - } else { - // We can still remain 16-bit - SegmentAddr = writeSegmentAddr(Addr); - } - } - uint64_t SegOffset = Addr - BaseAddr - SegmentAddr; - assert(SegOffset <= 0xFFFFU); - DataSize = std::min(DataSize, 0x10000U - SegOffset); - writeData(0, SegOffset, Data.take_front(DataSize)); - Addr += DataSize; - Data = Data.drop_front(DataSize); - } -} - -uint64_t IHexSectionWriterBase::writeSegmentAddr(uint64_t Addr) { - assert(Addr <= 0xFFFFFU); - uint8_t Data[] = {static_cast((Addr & 0xF0000U) >> 12), 0}; - writeData(2, 0, Data); - return Addr & 0xF0000U; -} - -uint64_t IHexSectionWriterBase::writeBaseAddr(uint64_t Addr) { - assert(Addr <= 0xFFFFFFFFU); - uint64_t Base = Addr & 0xFFFF0000U; - uint8_t Data[] = {static_cast(Base >> 24), - static_cast((Base >> 16) & 0xFF)}; - writeData(4, 0, Data); - return Base; -} - -void IHexSectionWriterBase::writeData(uint8_t, uint16_t, - ArrayRef Data) { - Offset += IHexRecord::getLineLength(Data.size()); -} - -Error IHexSectionWriterBase::visit(const Section &Sec) { - writeSection(&Sec, Sec.Contents); - return Error::success(); -} - -Error IHexSectionWriterBase::visit(const OwnedDataSection &Sec) { - writeSection(&Sec, Sec.Data); - return Error::success(); -} - -Error IHexSectionWriterBase::visit(const StringTableSection &Sec) { - // Check that sizer has already done its work - assert(Sec.Size == Sec.StrTabBuilder.getSize()); - // We are free to pass an invalid pointer to writeSection as long - // as we don't actually write any data. The real writer class has - // to override this method . - writeSection(&Sec, {nullptr, static_cast(Sec.Size)}); - return Error::success(); -} - -Error IHexSectionWriterBase::visit(const DynamicRelocationSection &Sec) { - writeSection(&Sec, Sec.Contents); - return Error::success(); -} - -void IHexSectionWriter::writeData(uint8_t Type, uint16_t Addr, - ArrayRef Data) { - IHexLineData HexData = IHexRecord::getLine(Type, Addr, Data); - memcpy(Out.getBufferStart() + Offset, HexData.data(), HexData.size()); - Offset += HexData.size(); -} - -Error IHexSectionWriter::visit(const StringTableSection &Sec) { - assert(Sec.Size == Sec.StrTabBuilder.getSize()); - std::vector Data(Sec.Size); - Sec.StrTabBuilder.write(Data.data()); - writeSection(&Sec, Data); - return Error::success(); -} - -Error Section::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error Section::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -Error SectionWriter::visit(const OwnedDataSection &Sec) { - llvm::copy(Sec.Data, Out.getBufferStart() + Sec.Offset); - return Error::success(); -} - -static constexpr std::array ZlibGnuMagic = {{'Z', 'L', 'I', 'B'}}; - -static bool isDataGnuCompressed(ArrayRef Data) { - return Data.size() > ZlibGnuMagic.size() && - std::equal(ZlibGnuMagic.begin(), ZlibGnuMagic.end(), Data.data()); -} - -template -static std::tuple -getDecompressedSizeAndAlignment(ArrayRef Data) { - const bool IsGnuDebug = isDataGnuCompressed(Data); - const uint64_t DecompressedSize = - IsGnuDebug - ? support::endian::read64be(Data.data() + ZlibGnuMagic.size()) - : reinterpret_cast *>(Data.data())->ch_size; - const uint64_t DecompressedAlign = - IsGnuDebug ? 
1 - : reinterpret_cast *>(Data.data()) - ->ch_addralign; - - return std::make_tuple(DecompressedSize, DecompressedAlign); -} - -template -Error ELFSectionWriter::visit(const DecompressedSection &Sec) { - const size_t DataOffset = isDataGnuCompressed(Sec.OriginalData) - ? (ZlibGnuMagic.size() + sizeof(Sec.Size)) - : sizeof(Elf_Chdr_Impl); - - StringRef CompressedContent( - reinterpret_cast(Sec.OriginalData.data()) + DataOffset, - Sec.OriginalData.size() - DataOffset); - - SmallVector DecompressedContent; - if (Error Err = zlib::uncompress(CompressedContent, DecompressedContent, - static_cast(Sec.Size))) - return createStringError(errc::invalid_argument, - "'" + Sec.Name + "': " + toString(std::move(Err))); - - uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; - std::copy(DecompressedContent.begin(), DecompressedContent.end(), Buf); - - return Error::success(); -} - -Error BinarySectionWriter::visit(const DecompressedSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write compressed section '" + Sec.Name + - "' "); -} - -Error DecompressedSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error DecompressedSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -Error OwnedDataSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error OwnedDataSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -void OwnedDataSection::appendHexData(StringRef HexData) { - assert((HexData.size() & 1) == 0); - while (!HexData.empty()) { - Data.push_back(checkedGetHex(HexData.take_front(2))); - HexData = HexData.drop_front(2); - } - Size = Data.size(); -} - -Error BinarySectionWriter::visit(const CompressedSection &Sec) { - return createStringError(errc::operation_not_permitted, - "cannot write compressed section '" + Sec.Name + - "' "); -} - -template -Error ELFSectionWriter::visit(const CompressedSection &Sec) { - uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; - if (Sec.CompressionType == DebugCompressionType::None) { - std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), Buf); - return Error::success(); - } - - if (Sec.CompressionType == DebugCompressionType::GNU) { - const char *Magic = "ZLIB"; - memcpy(Buf, Magic, strlen(Magic)); - Buf += strlen(Magic); - const uint64_t DecompressedSize = - support::endian::read64be(&Sec.DecompressedSize); - memcpy(Buf, &DecompressedSize, sizeof(DecompressedSize)); - Buf += sizeof(DecompressedSize); - } else { - Elf_Chdr_Impl Chdr; - Chdr.ch_type = ELF::ELFCOMPRESS_ZLIB; - Chdr.ch_size = Sec.DecompressedSize; - Chdr.ch_addralign = Sec.DecompressedAlign; - memcpy(Buf, &Chdr, sizeof(Chdr)); - Buf += sizeof(Chdr); - } - - std::copy(Sec.CompressedData.begin(), Sec.CompressedData.end(), Buf); - return Error::success(); -} - -Expected -CompressedSection::create(const SectionBase &Sec, - DebugCompressionType CompressionType) { - Error Err = Error::success(); - CompressedSection Section(Sec, CompressionType, Err); - - if (Err) - return std::move(Err); - - return Section; -} -Expected -CompressedSection::create(ArrayRef CompressedData, - uint64_t DecompressedSize, - uint64_t DecompressedAlign) { - return CompressedSection(CompressedData, DecompressedSize, DecompressedAlign); -} - -CompressedSection::CompressedSection(const SectionBase &Sec, - DebugCompressionType CompressionType, - Error &OutErr) - : SectionBase(Sec), CompressionType(CompressionType), - 
DecompressedSize(Sec.OriginalData.size()), DecompressedAlign(Sec.Align) { - ErrorAsOutParameter EAO(&OutErr); - - if (Error Err = zlib::compress( - StringRef(reinterpret_cast(OriginalData.data()), - OriginalData.size()), - CompressedData)) { - OutErr = createStringError(llvm::errc::invalid_argument, - "'" + Name + "': " + toString(std::move(Err))); - return; - } - - size_t ChdrSize; - if (CompressionType == DebugCompressionType::GNU) { - Name = ".z" + Sec.Name.substr(1); - ChdrSize = sizeof("ZLIB") - 1 + sizeof(uint64_t); - } else { - Flags |= ELF::SHF_COMPRESSED; - ChdrSize = - std::max(std::max(sizeof(object::Elf_Chdr_Impl), - sizeof(object::Elf_Chdr_Impl)), - std::max(sizeof(object::Elf_Chdr_Impl), - sizeof(object::Elf_Chdr_Impl))); - } - Size = ChdrSize + CompressedData.size(); - Align = 8; -} - -CompressedSection::CompressedSection(ArrayRef CompressedData, - uint64_t DecompressedSize, - uint64_t DecompressedAlign) - : CompressionType(DebugCompressionType::None), - DecompressedSize(DecompressedSize), DecompressedAlign(DecompressedAlign) { - OriginalData = CompressedData; -} - -Error CompressedSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error CompressedSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -void StringTableSection::addString(StringRef Name) { StrTabBuilder.add(Name); } - -uint32_t StringTableSection::findIndex(StringRef Name) const { - return StrTabBuilder.getOffset(Name); -} - -void StringTableSection::prepareForLayout() { - StrTabBuilder.finalize(); - Size = StrTabBuilder.getSize(); -} - -Error SectionWriter::visit(const StringTableSection &Sec) { - Sec.StrTabBuilder.write(reinterpret_cast(Out.getBufferStart()) + - Sec.Offset); - return Error::success(); -} - -Error StringTableSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error StringTableSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -template -Error ELFSectionWriter::visit(const SectionIndexSection &Sec) { - uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; - llvm::copy(Sec.Indexes, reinterpret_cast(Buf)); - return Error::success(); -} - -Error SectionIndexSection::initialize(SectionTableRef SecTable) { - Size = 0; - Expected Sec = - SecTable.getSectionOfType( - Link, - "Link field value " + Twine(Link) + " in section " + Name + - " is invalid", - "Link field value " + Twine(Link) + " in section " + Name + - " is not a symbol table"); - if (!Sec) - return Sec.takeError(); - - setSymTab(*Sec); - Symbols->setShndxTable(this); - return Error::success(); -} - -void SectionIndexSection::finalize() { Link = Symbols->Index; } - -Error SectionIndexSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error SectionIndexSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -static bool isValidReservedSectionIndex(uint16_t Index, uint16_t Machine) { - switch (Index) { - case SHN_ABS: - case SHN_COMMON: - return true; - } - - if (Machine == EM_AMDGPU) { - return Index == SHN_AMDGPU_LDS; - } - - if (Machine == EM_HEXAGON) { - switch (Index) { - case SHN_HEXAGON_SCOMMON: - case SHN_HEXAGON_SCOMMON_1: - case SHN_HEXAGON_SCOMMON_2: - case SHN_HEXAGON_SCOMMON_4: - case SHN_HEXAGON_SCOMMON_8: - return true; - } - } - return false; -} - -// Large indexes force us to clarify exactly what this function should do. This -// function should return the value that will appear in st_shndx when written -// out. 
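// st_shndx is a 16-bit field, and indexes at or above SHN_LORESERVE
// (0xff00) are reserved, so getShndx below answers the question posed in
// the comment above by escaping large indexes to SHN_XINDEX (0xffff); the
// real index then lives in an SHT_SYMTAB_SHNDX table. A minimal sketch of
// that encoding rule with the raw constant values:
#include <cstdint>

static uint16_t encodeStShndx(uint32_t SectionIndex) {
  const uint32_t ShnLoReserve = 0xff00; // ELF SHN_LORESERVE
  const uint16_t ShnXIndex = 0xffff;    // ELF SHN_XINDEX
  return SectionIndex >= ShnLoReserve ? ShnXIndex
                                      : static_cast<uint16_t>(SectionIndex);
}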
-uint16_t Symbol::getShndx() const { - if (DefinedIn != nullptr) { - if (DefinedIn->Index >= SHN_LORESERVE) - return SHN_XINDEX; - return DefinedIn->Index; - } - - if (ShndxType == SYMBOL_SIMPLE_INDEX) { - // This means that we don't have a defined section but we do need to - // output a legitimate section index. - return SHN_UNDEF; - } - - assert(ShndxType == SYMBOL_ABS || ShndxType == SYMBOL_COMMON || - (ShndxType >= SYMBOL_LOPROC && ShndxType <= SYMBOL_HIPROC) || - (ShndxType >= SYMBOL_LOOS && ShndxType <= SYMBOL_HIOS)); - return static_cast(ShndxType); -} - -bool Symbol::isCommon() const { return getShndx() == SHN_COMMON; } - -void SymbolTableSection::assignIndices() { - uint32_t Index = 0; - for (auto &Sym : Symbols) - Sym->Index = Index++; -} - -void SymbolTableSection::addSymbol(Twine Name, uint8_t Bind, uint8_t Type, - SectionBase *DefinedIn, uint64_t Value, - uint8_t Visibility, uint16_t Shndx, - uint64_t SymbolSize) { - Symbol Sym; - Sym.Name = Name.str(); - Sym.Binding = Bind; - Sym.Type = Type; - Sym.DefinedIn = DefinedIn; - if (DefinedIn != nullptr) - DefinedIn->HasSymbol = true; - if (DefinedIn == nullptr) { - if (Shndx >= SHN_LORESERVE) - Sym.ShndxType = static_cast(Shndx); - else - Sym.ShndxType = SYMBOL_SIMPLE_INDEX; - } - Sym.Value = Value; - Sym.Visibility = Visibility; - Sym.Size = SymbolSize; - Sym.Index = Symbols.size(); - Symbols.emplace_back(std::make_unique(Sym)); - Size += this->EntrySize; -} - -Error SymbolTableSection::removeSectionReferences( - bool AllowBrokenLinks, function_ref ToRemove) { - if (ToRemove(SectionIndexTable)) - SectionIndexTable = nullptr; - if (ToRemove(SymbolNames)) { - if (!AllowBrokenLinks) - return createStringError( - llvm::errc::invalid_argument, - "string table '%s' cannot be removed because it is " - "referenced by the symbol table '%s'", - SymbolNames->Name.data(), this->Name.data()); - SymbolNames = nullptr; - } - return removeSymbols( - [ToRemove](const Symbol &Sym) { return ToRemove(Sym.DefinedIn); }); -} - -void SymbolTableSection::updateSymbols(function_ref Callable) { - std::for_each(std::begin(Symbols) + 1, std::end(Symbols), - [Callable](SymPtr &Sym) { Callable(*Sym); }); - std::stable_partition( - std::begin(Symbols), std::end(Symbols), - [](const SymPtr &Sym) { return Sym->Binding == STB_LOCAL; }); - assignIndices(); -} - -Error SymbolTableSection::removeSymbols( - function_ref ToRemove) { - Symbols.erase( - std::remove_if(std::begin(Symbols) + 1, std::end(Symbols), - [ToRemove](const SymPtr &Sym) { return ToRemove(*Sym); }), - std::end(Symbols)); - Size = Symbols.size() * EntrySize; - assignIndices(); - return Error::success(); -} - -void SymbolTableSection::replaceSectionReferences( - const DenseMap &FromTo) { - for (std::unique_ptr &Sym : Symbols) - if (SectionBase *To = FromTo.lookup(Sym->DefinedIn)) - Sym->DefinedIn = To; -} - -Error SymbolTableSection::initialize(SectionTableRef SecTable) { - Size = 0; - Expected Sec = - SecTable.getSectionOfType( - Link, - "Symbol table has link index of " + Twine(Link) + - " which is not a valid index", - "Symbol table has link index of " + Twine(Link) + - " which is not a string table"); - if (!Sec) - return Sec.takeError(); - - setStrTab(*Sec); - return Error::success(); -} - -void SymbolTableSection::finalize() { - uint32_t MaxLocalIndex = 0; - for (std::unique_ptr &Sym : Symbols) { - Sym->NameIndex = - SymbolNames == nullptr ? 
0 : SymbolNames->findIndex(Sym->Name);
-    if (Sym->Binding == STB_LOCAL)
-      MaxLocalIndex = std::max(MaxLocalIndex, Sym->Index);
-  }
-  // Now we need to set the Link and Info fields.
-  Link = SymbolNames == nullptr ? 0 : SymbolNames->Index;
-  Info = MaxLocalIndex + 1;
-}
-
-void SymbolTableSection::prepareForLayout() {
-  // Reserve the proper amount of space in the section index table, so we can
-  // lay out sections correctly. We will fill the table with correct
-  // indexes later in fillShndxTable.
-  if (SectionIndexTable)
-    SectionIndexTable->reserve(Symbols.size());
-
-  // Add all of our strings to SymbolNames so that SymbolNames has the right
-  // size before layout is decided.
-  // If the symbol names section has been removed, don't try to add strings to
-  // the table.
-  if (SymbolNames != nullptr)
-    for (std::unique_ptr<Symbol> &Sym : Symbols)
-      SymbolNames->addString(Sym->Name);
-}
-
-void SymbolTableSection::fillShndxTable() {
-  if (SectionIndexTable == nullptr)
-    return;
-  // Fill section index table with real section indexes. This function must
-  // be called after assignOffsets.
-  for (const std::unique_ptr<Symbol> &Sym : Symbols) {
-    if (Sym->DefinedIn != nullptr && Sym->DefinedIn->Index >= SHN_LORESERVE)
-      SectionIndexTable->addIndex(Sym->DefinedIn->Index);
-    else
-      SectionIndexTable->addIndex(SHN_UNDEF);
-  }
-}
-
-Expected<const Symbol *>
-SymbolTableSection::getSymbolByIndex(uint32_t Index) const {
-  if (Symbols.size() <= Index)
-    return createStringError(errc::invalid_argument,
-                             "invalid symbol index: " + Twine(Index));
-  return Symbols[Index].get();
-}
-
-Expected<Symbol *> SymbolTableSection::getSymbolByIndex(uint32_t Index) {
-  Expected<const Symbol *> Sym =
-      static_cast<const SymbolTableSection *>(this)->getSymbolByIndex(Index);
-  if (!Sym)
-    return Sym.takeError();
-
-  return const_cast<Symbol *>(*Sym);
-}
-
-template <class ELFT>
-Error ELFSectionWriter<ELFT>::visit(const SymbolTableSection &Sec) {
-  Elf_Sym *Sym = reinterpret_cast<Elf_Sym *>(Out.getBufferStart() + Sec.Offset);
-  // Loop through symbols, setting each entry of the symbol table.
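Before the writer loop below: getShndx and fillShndxTable above implement the gABI escape hatch for section indexes that do not fit the 16-bit st_shndx field. A condensed sketch of the rule; encodeShndx is a hypothetical helper, constants per the ELF specification:

#include <cstdint>
#include <vector>

constexpr uint32_t SHN_UNDEF = 0;
constexpr uint32_t SHN_LORESERVE = 0xff00;
constexpr uint16_t SHN_XINDEX = 0xffff;

// SectionIndex is a real index into the section header table (reserved
// values such as SHN_ABS are handled separately, as in getShndx above).
// Indexes >= SHN_LORESERVE cannot be stored in st_shndx directly:
// st_shndx becomes SHN_XINDEX and the real index is spilled into the
// parallel SHT_SYMTAB_SHNDX table. Every symbol owns one slot in that
// table; symbols that do not overflow store SHN_UNDEF there.
uint16_t encodeShndx(uint32_t SectionIndex,
                     std::vector<uint32_t> &ShndxTable) {
  if (SectionIndex >= SHN_LORESERVE) {
    ShndxTable.push_back(SectionIndex);
    return SHN_XINDEX;
  }
  ShndxTable.push_back(SHN_UNDEF);
  return static_cast<uint16_t>(SectionIndex);
}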
- for (const std::unique_ptr &Symbol : Sec.Symbols) { - Sym->st_name = Symbol->NameIndex; - Sym->st_value = Symbol->Value; - Sym->st_size = Symbol->Size; - Sym->st_other = Symbol->Visibility; - Sym->setBinding(Symbol->Binding); - Sym->setType(Symbol->Type); - Sym->st_shndx = Symbol->getShndx(); - ++Sym; - } - return Error::success(); -} - -Error SymbolTableSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error SymbolTableSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -StringRef RelocationSectionBase::getNamePrefix() const { - switch (Type) { - case SHT_REL: - return ".rel"; - case SHT_RELA: - return ".rela"; - default: - llvm_unreachable("not a relocation section"); - } -} - -Error RelocationSection::removeSectionReferences( - bool AllowBrokenLinks, function_ref ToRemove) { - if (ToRemove(Symbols)) { - if (!AllowBrokenLinks) - return createStringError( - llvm::errc::invalid_argument, - "symbol table '%s' cannot be removed because it is " - "referenced by the relocation section '%s'", - Symbols->Name.data(), this->Name.data()); - Symbols = nullptr; - } - - for (const Relocation &R : Relocations) { - if (!R.RelocSymbol || !R.RelocSymbol->DefinedIn || - !ToRemove(R.RelocSymbol->DefinedIn)) - continue; - return createStringError(llvm::errc::invalid_argument, - "section '%s' cannot be removed: (%s+0x%" PRIx64 - ") has relocation against symbol '%s'", - R.RelocSymbol->DefinedIn->Name.data(), - SecToApplyRel->Name.data(), R.Offset, - R.RelocSymbol->Name.c_str()); - } - - return Error::success(); -} - -template -Error RelocSectionWithSymtabBase::initialize( - SectionTableRef SecTable) { - if (Link != SHN_UNDEF) { - Expected Sec = SecTable.getSectionOfType( - Link, - "Link field value " + Twine(Link) + " in section " + Name + - " is invalid", - "Link field value " + Twine(Link) + " in section " + Name + - " is not a symbol table"); - if (!Sec) - return Sec.takeError(); - - setSymTab(*Sec); - } - - if (Info != SHN_UNDEF) { - Expected Sec = - SecTable.getSection(Info, "Info field value " + Twine(Info) + - " in section " + Name + " is invalid"); - if (!Sec) - return Sec.takeError(); - - setSection(*Sec); - } else - setSection(nullptr); - - return Error::success(); -} - -template -void RelocSectionWithSymtabBase::finalize() { - this->Link = Symbols ? Symbols->Index : 0; - - if (SecToApplyRel != nullptr) - this->Info = SecToApplyRel->Index; -} - -template -static void setAddend(Elf_Rel_Impl &, uint64_t) {} - -template -static void setAddend(Elf_Rel_Impl &Rela, uint64_t Addend) { - Rela.r_addend = Addend; -} - -template -static void writeRel(const RelRange &Relocations, T *Buf, bool IsMips64EL) { - for (const auto &Reloc : Relocations) { - Buf->r_offset = Reloc.Offset; - setAddend(*Buf, Reloc.Addend); - Buf->setSymbolAndType(Reloc.RelocSymbol ? 
Reloc.RelocSymbol->Index : 0, - Reloc.Type, IsMips64EL); - ++Buf; - } -} - -template -Error ELFSectionWriter::visit(const RelocationSection &Sec) { - uint8_t *Buf = reinterpret_cast(Out.getBufferStart()) + Sec.Offset; - if (Sec.Type == SHT_REL) - writeRel(Sec.Relocations, reinterpret_cast(Buf), - Sec.getObject().IsMips64EL); - else - writeRel(Sec.Relocations, reinterpret_cast(Buf), - Sec.getObject().IsMips64EL); - return Error::success(); -} - -Error RelocationSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error RelocationSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -Error RelocationSection::removeSymbols( - function_ref ToRemove) { - for (const Relocation &Reloc : Relocations) - if (Reloc.RelocSymbol && ToRemove(*Reloc.RelocSymbol)) - return createStringError( - llvm::errc::invalid_argument, - "not stripping symbol '%s' because it is named in a relocation", - Reloc.RelocSymbol->Name.data()); - return Error::success(); -} - -void RelocationSection::markSymbols() { - for (const Relocation &Reloc : Relocations) - if (Reloc.RelocSymbol) - Reloc.RelocSymbol->Referenced = true; -} - -void RelocationSection::replaceSectionReferences( - const DenseMap &FromTo) { - // Update the target section if it was replaced. - if (SectionBase *To = FromTo.lookup(SecToApplyRel)) - SecToApplyRel = To; -} - -Error SectionWriter::visit(const DynamicRelocationSection &Sec) { - llvm::copy(Sec.Contents, Out.getBufferStart() + Sec.Offset); - return Error::success(); -} - -Error DynamicRelocationSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error DynamicRelocationSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -Error DynamicRelocationSection::removeSectionReferences( - bool AllowBrokenLinks, function_ref ToRemove) { - if (ToRemove(Symbols)) { - if (!AllowBrokenLinks) - return createStringError( - llvm::errc::invalid_argument, - "symbol table '%s' cannot be removed because it is " - "referenced by the relocation section '%s'", - Symbols->Name.data(), this->Name.data()); - Symbols = nullptr; - } - - // SecToApplyRel contains a section referenced by sh_info field. It keeps - // a section to which the relocation section applies. When we remove any - // sections we also remove their relocation sections. Since we do that much - // earlier, this assert should never be triggered. - assert(!SecToApplyRel || !ToRemove(SecToApplyRel)); - return Error::success(); -} - -Error Section::removeSectionReferences( - bool AllowBrokenDependency, - function_ref ToRemove) { - if (ToRemove(LinkSection)) { - if (!AllowBrokenDependency) - return createStringError(llvm::errc::invalid_argument, - "section '%s' cannot be removed because it is " - "referenced by the section '%s'", - LinkSection->Name.data(), this->Name.data()); - LinkSection = nullptr; - } - return Error::success(); -} - -void GroupSection::finalize() { - this->Info = Sym ? Sym->Index : 0; - this->Link = SymTab ? SymTab->Index : 0; - // Linker deduplication for GRP_COMDAT is based on Sym->Name. The local/global - // status is not part of the equation. If Sym is localized, the intention is - // likely to make the group fully localized. Drop GRP_COMDAT to suppress - // deduplication. 
See https://groups.google.com/g/generic-abi/c/2X6mR-s2zoc - if ((FlagWord & GRP_COMDAT) && Sym && Sym->Binding == STB_LOCAL) - this->FlagWord &= ~GRP_COMDAT; -} - -Error GroupSection::removeSectionReferences( - bool AllowBrokenLinks, function_ref ToRemove) { - if (ToRemove(SymTab)) { - if (!AllowBrokenLinks) - return createStringError( - llvm::errc::invalid_argument, - "section '.symtab' cannot be removed because it is " - "referenced by the group section '%s'", - this->Name.data()); - SymTab = nullptr; - Sym = nullptr; - } - llvm::erase_if(GroupMembers, ToRemove); - return Error::success(); -} - -Error GroupSection::removeSymbols(function_ref ToRemove) { - if (ToRemove(*Sym)) - return createStringError(llvm::errc::invalid_argument, - "symbol '%s' cannot be removed because it is " - "referenced by the section '%s[%d]'", - Sym->Name.data(), this->Name.data(), this->Index); - return Error::success(); -} - -void GroupSection::markSymbols() { - if (Sym) - Sym->Referenced = true; -} - -void GroupSection::replaceSectionReferences( - const DenseMap &FromTo) { - for (SectionBase *&Sec : GroupMembers) - if (SectionBase *To = FromTo.lookup(Sec)) - Sec = To; -} - -void GroupSection::onRemove() { - // As the header section of the group is removed, drop the Group flag in its - // former members. - for (SectionBase *Sec : GroupMembers) - Sec->Flags &= ~SHF_GROUP; -} - -Error Section::initialize(SectionTableRef SecTable) { - if (Link == ELF::SHN_UNDEF) - return Error::success(); - - Expected Sec = - SecTable.getSection(Link, "Link field value " + Twine(Link) + - " in section " + Name + " is invalid"); - if (!Sec) - return Sec.takeError(); - - LinkSection = *Sec; - - if (LinkSection->Type == ELF::SHT_SYMTAB) - LinkSection = nullptr; - - return Error::success(); -} - -void Section::finalize() { this->Link = LinkSection ? LinkSection->Index : 0; } - -void GnuDebugLinkSection::init(StringRef File) { - FileName = sys::path::filename(File); - // The format for the .gnu_debuglink starts with the file name and is - // followed by a null terminator and then the CRC32 of the file. The CRC32 - // should be 4 byte aligned. So we add the FileName size, a 1 for the null - // byte, and then finally push the size to alignment and add 4. - Size = alignTo(FileName.size() + 1, 4) + 4; - // The CRC32 will only be aligned if we align the whole section. - Align = 4; - Type = OriginalType = ELF::SHT_PROGBITS; - Name = ".gnu_debuglink"; - // For sections not found in segments, OriginalOffset is only used to - // establish the order that sections should go in. By using the maximum - // possible offset we cause this section to wind up at the end. 
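The .gnu_debuglink size computed in init above corresponds to this layout: file name, NUL terminator, zero padding up to a 4-byte boundary, then the 4-byte CRC32. A standalone sketch of the same computation (makeDebugLink is illustrative, not the patch's API):

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// name + NUL, zero-padded so the trailing CRC32 starts on a 4-byte
// boundary: Size = alignTo(FileName.size() + 1, 4) + 4, as above.
std::vector<uint8_t> makeDebugLink(const std::string &Name, uint32_t CRC32) {
  size_t NameField = (Name.size() + 1 + 3) & ~size_t(3); // alignTo(n + 1, 4)
  std::vector<uint8_t> Data(NameField + 4, 0);
  std::memcpy(Data.data(), Name.c_str(), Name.size() + 1);
  std::memcpy(Data.data() + NameField, &CRC32, 4); // host byte order here
  return Data;
}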
- OriginalOffset = std::numeric_limits::max(); -} - -GnuDebugLinkSection::GnuDebugLinkSection(StringRef File, - uint32_t PrecomputedCRC) - : FileName(File), CRC32(PrecomputedCRC) { - init(File); -} - -template -Error ELFSectionWriter::visit(const GnuDebugLinkSection &Sec) { - unsigned char *Buf = - reinterpret_cast(Out.getBufferStart()) + Sec.Offset; - Elf_Word *CRC = - reinterpret_cast(Buf + Sec.Size - sizeof(Elf_Word)); - *CRC = Sec.CRC32; - llvm::copy(Sec.FileName, Buf); - return Error::success(); -} - -Error GnuDebugLinkSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error GnuDebugLinkSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -template -Error ELFSectionWriter::visit(const GroupSection &Sec) { - ELF::Elf32_Word *Buf = - reinterpret_cast(Out.getBufferStart() + Sec.Offset); - support::endian::write32(Buf++, Sec.FlagWord); - for (SectionBase *S : Sec.GroupMembers) - support::endian::write32(Buf++, S->Index); - return Error::success(); -} - -Error GroupSection::accept(SectionVisitor &Visitor) const { - return Visitor.visit(*this); -} - -Error GroupSection::accept(MutableSectionVisitor &Visitor) { - return Visitor.visit(*this); -} - -// Returns true IFF a section is wholly inside the range of a segment -static bool sectionWithinSegment(const SectionBase &Sec, const Segment &Seg) { - // If a section is empty it should be treated like it has a size of 1. This is - // to clarify the case when an empty section lies on a boundary between two - // segments and ensures that the section "belongs" to the second segment and - // not the first. - uint64_t SecSize = Sec.Size ? Sec.Size : 1; - - // Ignore just added sections. - if (Sec.OriginalOffset == std::numeric_limits::max()) - return false; - - if (Sec.Type == SHT_NOBITS) { - if (!(Sec.Flags & SHF_ALLOC)) - return false; - - bool SectionIsTLS = Sec.Flags & SHF_TLS; - bool SegmentIsTLS = Seg.Type == PT_TLS; - if (SectionIsTLS != SegmentIsTLS) - return false; - - return Seg.VAddr <= Sec.Addr && - Seg.VAddr + Seg.MemSize >= Sec.Addr + SecSize; - } - - return Seg.Offset <= Sec.OriginalOffset && - Seg.Offset + Seg.FileSize >= Sec.OriginalOffset + SecSize; -} - -// Returns true IFF a segment's original offset is inside of another segment's -// range. -static bool segmentOverlapsSegment(const Segment &Child, - const Segment &Parent) { - - return Parent.OriginalOffset <= Child.OriginalOffset && - Parent.OriginalOffset + Parent.FileSize > Child.OriginalOffset; -} - -static bool compareSegmentsByOffset(const Segment *A, const Segment *B) { - // Any segment without a parent segment should come before a segment - // that has a parent segment. 
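The containment test in sectionWithinSegment above reduces to a range check once the empty-section special case is applied. A condensed restatement with plain structs standing in for the patch's classes (file-offset case only; the SHT_NOBITS variant above does the same check on the virtual address range):

#include <cstdint>

struct Sec { uint64_t Offset, Size; };
struct Seg { uint64_t Offset, FileSize; };

// A section belongs to a segment iff its whole file range is covered.
// Empty sections are treated as one byte wide so that a section sitting
// exactly on the boundary between two segments is assigned to the
// second one, matching the comment above.
bool within(const Sec &S, const Seg &P) {
  uint64_t Size = S.Size ? S.Size : 1;
  return P.Offset <= S.Offset && P.Offset + P.FileSize >= S.Offset + Size;
}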
- if (A->OriginalOffset < B->OriginalOffset) - return true; - if (A->OriginalOffset > B->OriginalOffset) - return false; - return A->Index < B->Index; -} - -void BasicELFBuilder::initFileHeader() { - Obj->Flags = 0x0; - Obj->Type = ET_REL; - Obj->OSABI = ELFOSABI_NONE; - Obj->ABIVersion = 0; - Obj->Entry = 0x0; - Obj->Machine = EM_NONE; - Obj->Version = 1; -} - -void BasicELFBuilder::initHeaderSegment() { Obj->ElfHdrSegment.Index = 0; } - -StringTableSection *BasicELFBuilder::addStrTab() { - auto &StrTab = Obj->addSection(); - StrTab.Name = ".strtab"; - - Obj->SectionNames = &StrTab; - return &StrTab; -} - -SymbolTableSection *BasicELFBuilder::addSymTab(StringTableSection *StrTab) { - auto &SymTab = Obj->addSection(); - - SymTab.Name = ".symtab"; - SymTab.Link = StrTab->Index; - - // The symbol table always needs a null symbol - SymTab.addSymbol("", 0, 0, nullptr, 0, 0, 0, 0); - - Obj->SymbolTable = &SymTab; - return &SymTab; -} - -Error BasicELFBuilder::initSections() { - for (SectionBase &Sec : Obj->sections()) - if (Error Err = Sec.initialize(Obj->sections())) - return Err; - - return Error::success(); -} - -void BinaryELFBuilder::addData(SymbolTableSection *SymTab) { - auto Data = ArrayRef( - reinterpret_cast(MemBuf->getBufferStart()), - MemBuf->getBufferSize()); - auto &DataSection = Obj->addSection
(Data); - DataSection.Name = ".data"; - DataSection.Type = ELF::SHT_PROGBITS; - DataSection.Size = Data.size(); - DataSection.Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE; - - std::string SanitizedFilename = MemBuf->getBufferIdentifier().str(); - std::replace_if( - std::begin(SanitizedFilename), std::end(SanitizedFilename), - [](char C) { return !isAlnum(C); }, '_'); - Twine Prefix = Twine("_binary_") + SanitizedFilename; - - SymTab->addSymbol(Prefix + "_start", STB_GLOBAL, STT_NOTYPE, &DataSection, - /*Value=*/0, NewSymbolVisibility, 0, 0); - SymTab->addSymbol(Prefix + "_end", STB_GLOBAL, STT_NOTYPE, &DataSection, - /*Value=*/DataSection.Size, NewSymbolVisibility, 0, 0); - SymTab->addSymbol(Prefix + "_size", STB_GLOBAL, STT_NOTYPE, nullptr, - /*Value=*/DataSection.Size, NewSymbolVisibility, SHN_ABS, - 0); -} - -Expected> BinaryELFBuilder::build() { - initFileHeader(); - initHeaderSegment(); - - SymbolTableSection *SymTab = addSymTab(addStrTab()); - if (Error Err = initSections()) - return std::move(Err); - addData(SymTab); - - return std::move(Obj); -} - -// Adds sections from IHEX data file. Data should have been -// fully validated by this time. -void IHexELFBuilder::addDataSections() { - OwnedDataSection *Section = nullptr; - uint64_t SegmentAddr = 0, BaseAddr = 0; - uint32_t SecNo = 1; - - for (const IHexRecord &R : Records) { - uint64_t RecAddr; - switch (R.Type) { - case IHexRecord::Data: - // Ignore empty data records - if (R.HexData.empty()) - continue; - RecAddr = R.Addr + SegmentAddr + BaseAddr; - if (!Section || Section->Addr + Section->Size != RecAddr) { - // OriginalOffset field is only used to sort sections before layout, so - // instead of keeping track of real offsets in IHEX file, and as - // layoutSections() and layoutSectionsForOnlyKeepDebug() use - // llvm::stable_sort(), we can just set it to a constant (zero). - Section = &Obj->addSection( - ".sec" + std::to_string(SecNo), RecAddr, - ELF::SHF_ALLOC | ELF::SHF_WRITE, 0); - SecNo++; - } - Section->appendHexData(R.HexData); - break; - case IHexRecord::EndOfFile: - break; - case IHexRecord::SegmentAddr: - // 20-bit segment address. - SegmentAddr = checkedGetHex(R.HexData) << 4; - break; - case IHexRecord::StartAddr80x86: - case IHexRecord::StartAddr: - Obj->Entry = checkedGetHex(R.HexData); - assert(Obj->Entry <= 0xFFFFFU); - break; - case IHexRecord::ExtendedAddr: - // 16-31 bits of linear base address - BaseAddr = checkedGetHex(R.HexData) << 16; - break; - default: - llvm_unreachable("unknown record type"); - } - } -} - -Expected> IHexELFBuilder::build() { - initFileHeader(); - initHeaderSegment(); - StringTableSection *StrTab = addStrTab(); - addSymTab(StrTab); - if (Error Err = initSections()) - return std::move(Err); - addDataSections(); - - return std::move(Obj); -} - -template -ELFBuilder::ELFBuilder(const ELFObjectFile &ElfObj, Object &Obj, - Optional ExtractPartition) - : ElfFile(ElfObj.getELFFile()), Obj(Obj), - ExtractPartition(ExtractPartition) { - Obj.IsMips64EL = ElfFile.isMips64EL(); -} - -template void ELFBuilder::setParentSegment(Segment &Child) { - for (Segment &Parent : Obj.segments()) { - // Every segment will overlap with itself but we don't want a segment to - // be its own parent so we avoid that situation. - if (&Child != &Parent && segmentOverlapsSegment(Child, Parent)) { - // We want a canonical "most parental" segment but this requires - // inspecting the ParentSegment. 
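The _binary_<name>_{start,end,size} symbols added in addData above follow GNU objcopy's convention for binary input; the file name is turned into a C identifier first. A sketch of just the sanitizing step (std::isalnum standing in for LLVM's isAlnum):

#include <algorithm>
#include <cctype>
#include <string>

// Every character that is not alphanumeric becomes '_', then the
// _binary_ prefix is prepended; the _start/_end/_size symbol names are
// derived from this prefix.
std::string binarySymbolPrefix(std::string File) {
  std::replace_if(File.begin(), File.end(),
                  [](unsigned char C) { return !std::isalnum(C); }, '_');
  return "_binary_" + File;
}
// e.g. binarySymbolPrefix("img/logo.png") == "_binary_img_logo_png"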
- if (compareSegmentsByOffset(&Parent, &Child)) - if (Child.ParentSegment == nullptr || - compareSegmentsByOffset(&Parent, Child.ParentSegment)) { - Child.ParentSegment = &Parent; - } - } - } -} - -template Error ELFBuilder::findEhdrOffset() { - if (!ExtractPartition) - return Error::success(); - - for (const SectionBase &Sec : Obj.sections()) { - if (Sec.Type == SHT_LLVM_PART_EHDR && Sec.Name == *ExtractPartition) { - EhdrOffset = Sec.Offset; - return Error::success(); - } - } - return createStringError(errc::invalid_argument, - "could not find partition named '" + - *ExtractPartition + "'"); -} - -template -Error ELFBuilder::readProgramHeaders(const ELFFile &HeadersFile) { - uint32_t Index = 0; - - Expected::Elf_Phdr_Range> Headers = - HeadersFile.program_headers(); - if (!Headers) - return Headers.takeError(); - - for (const typename ELFFile::Elf_Phdr &Phdr : *Headers) { - if (Phdr.p_offset + Phdr.p_filesz > HeadersFile.getBufSize()) - return createStringError( - errc::invalid_argument, - "program header with offset 0x" + Twine::utohexstr(Phdr.p_offset) + - " and file size 0x" + Twine::utohexstr(Phdr.p_filesz) + - " goes past the end of the file"); - - ArrayRef Data{HeadersFile.base() + Phdr.p_offset, - (size_t)Phdr.p_filesz}; - Segment &Seg = Obj.addSegment(Data); - Seg.Type = Phdr.p_type; - Seg.Flags = Phdr.p_flags; - Seg.OriginalOffset = Phdr.p_offset + EhdrOffset; - Seg.Offset = Phdr.p_offset + EhdrOffset; - Seg.VAddr = Phdr.p_vaddr; - Seg.PAddr = Phdr.p_paddr; - Seg.FileSize = Phdr.p_filesz; - Seg.MemSize = Phdr.p_memsz; - Seg.Align = Phdr.p_align; - Seg.Index = Index++; - for (SectionBase &Sec : Obj.sections()) - if (sectionWithinSegment(Sec, Seg)) { - Seg.addSection(&Sec); - if (!Sec.ParentSegment || Sec.ParentSegment->Offset > Seg.Offset) - Sec.ParentSegment = &Seg; - } - } - - auto &ElfHdr = Obj.ElfHdrSegment; - ElfHdr.Index = Index++; - ElfHdr.OriginalOffset = ElfHdr.Offset = EhdrOffset; - - const typename ELFT::Ehdr &Ehdr = HeadersFile.getHeader(); - auto &PrHdr = Obj.ProgramHdrSegment; - PrHdr.Type = PT_PHDR; - PrHdr.Flags = 0; - // The spec requires us to have p_vaddr % p_align == p_offset % p_align. - // Whereas this works automatically for ElfHdr, here OriginalOffset is - // always non-zero and to ensure the equation we assign the same value to - // VAddr as well. - PrHdr.OriginalOffset = PrHdr.Offset = PrHdr.VAddr = EhdrOffset + Ehdr.e_phoff; - PrHdr.PAddr = 0; - PrHdr.FileSize = PrHdr.MemSize = Ehdr.e_phentsize * Ehdr.e_phnum; - // The spec requires us to naturally align all the fields. - PrHdr.Align = sizeof(Elf_Addr); - PrHdr.Index = Index++; - - // Now we do an O(n^2) loop through the segments in order to match up - // segments. 
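The congruence noted for PT_PHDR above is the general program-header rule p_vaddr % p_align == p_offset % p_align, which lets the loader map segments with page-granularity mmap. As a standalone predicate (illustrative, not the patch's API):

#include <cstdint>

// For p_align > 1 the ELF spec requires loadable segments to satisfy
// p_vaddr % p_align == p_offset % p_align.
bool segmentCongruent(uint64_t VAddr, uint64_t Offset, uint64_t Align) {
  return Align <= 1 || VAddr % Align == Offset % Align;
}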
- for (Segment &Child : Obj.segments()) - setParentSegment(Child); - setParentSegment(ElfHdr); - setParentSegment(PrHdr); - - return Error::success(); -} - -template -Error ELFBuilder::initGroupSection(GroupSection *GroupSec) { - if (GroupSec->Align % sizeof(ELF::Elf32_Word) != 0) - return createStringError(errc::invalid_argument, - "invalid alignment " + Twine(GroupSec->Align) + - " of group section '" + GroupSec->Name + "'"); - SectionTableRef SecTable = Obj.sections(); - if (GroupSec->Link != SHN_UNDEF) { - auto SymTab = SecTable.template getSectionOfType( - GroupSec->Link, - "link field value '" + Twine(GroupSec->Link) + "' in section '" + - GroupSec->Name + "' is invalid", - "link field value '" + Twine(GroupSec->Link) + "' in section '" + - GroupSec->Name + "' is not a symbol table"); - if (!SymTab) - return SymTab.takeError(); - - Expected Sym = (*SymTab)->getSymbolByIndex(GroupSec->Info); - if (!Sym) - return createStringError(errc::invalid_argument, - "info field value '" + Twine(GroupSec->Info) + - "' in section '" + GroupSec->Name + - "' is not a valid symbol index"); - GroupSec->setSymTab(*SymTab); - GroupSec->setSymbol(*Sym); - } - if (GroupSec->Contents.size() % sizeof(ELF::Elf32_Word) || - GroupSec->Contents.empty()) - return createStringError(errc::invalid_argument, - "the content of the section " + GroupSec->Name + - " is malformed"); - const ELF::Elf32_Word *Word = - reinterpret_cast(GroupSec->Contents.data()); - const ELF::Elf32_Word *End = - Word + GroupSec->Contents.size() / sizeof(ELF::Elf32_Word); - GroupSec->setFlagWord( - support::endian::read32(Word++)); - for (; Word != End; ++Word) { - uint32_t Index = support::endian::read32(Word); - Expected Sec = SecTable.getSection( - Index, "group member index " + Twine(Index) + " in section '" + - GroupSec->Name + "' is invalid"); - if (!Sec) - return Sec.takeError(); - - GroupSec->addMember(*Sec); - } - - return Error::success(); -} - -template -Error ELFBuilder::initSymbolTable(SymbolTableSection *SymTab) { - Expected Shdr = ElfFile.getSection(SymTab->Index); - if (!Shdr) - return Shdr.takeError(); - - Expected StrTabData = ElfFile.getStringTableForSymtab(**Shdr); - if (!StrTabData) - return StrTabData.takeError(); - - ArrayRef ShndxData; - - Expected::Elf_Sym_Range> Symbols = - ElfFile.symbols(*Shdr); - if (!Symbols) - return Symbols.takeError(); - - for (const typename ELFFile::Elf_Sym &Sym : *Symbols) { - SectionBase *DefSection = nullptr; - - Expected Name = Sym.getName(*StrTabData); - if (!Name) - return Name.takeError(); - - if (Sym.st_shndx == SHN_XINDEX) { - if (SymTab->getShndxTable() == nullptr) - return createStringError(errc::invalid_argument, - "symbol '" + *Name + - "' has index SHN_XINDEX but no " - "SHT_SYMTAB_SHNDX section exists"); - if (ShndxData.data() == nullptr) { - Expected ShndxSec = - ElfFile.getSection(SymTab->getShndxTable()->Index); - if (!ShndxSec) - return ShndxSec.takeError(); - - Expected> Data = - ElfFile.template getSectionContentsAsArray(**ShndxSec); - if (!Data) - return Data.takeError(); - - ShndxData = *Data; - if (ShndxData.size() != Symbols->size()) - return createStringError( - errc::invalid_argument, - "symbol section index table does not have the same number of " - "entries as the symbol table"); - } - Elf_Word Index = ShndxData[&Sym - Symbols->begin()]; - Expected Sec = Obj.sections().getSection( - Index, - "symbol '" + *Name + "' has invalid section index " + Twine(Index)); - if (!Sec) - return Sec.takeError(); - - DefSection = *Sec; - } else if (Sym.st_shndx >= SHN_LORESERVE) 
{ - if (!isValidReservedSectionIndex(Sym.st_shndx, Obj.Machine)) { - return createStringError( - errc::invalid_argument, - "symbol '" + *Name + - "' has unsupported value greater than or equal " - "to SHN_LORESERVE: " + - Twine(Sym.st_shndx)); - } - } else if (Sym.st_shndx != SHN_UNDEF) { - Expected Sec = Obj.sections().getSection( - Sym.st_shndx, "symbol '" + *Name + - "' is defined has invalid section index " + - Twine(Sym.st_shndx)); - if (!Sec) - return Sec.takeError(); - - DefSection = *Sec; - } - - SymTab->addSymbol(*Name, Sym.getBinding(), Sym.getType(), DefSection, - Sym.getValue(), Sym.st_other, Sym.st_shndx, Sym.st_size); - } - - return Error::success(); -} - -template -static void getAddend(uint64_t &, const Elf_Rel_Impl &) {} - -template -static void getAddend(uint64_t &ToSet, const Elf_Rel_Impl &Rela) { - ToSet = Rela.r_addend; -} - -template -static Error initRelocations(RelocationSection *Relocs, T RelRange) { - for (const auto &Rel : RelRange) { - Relocation ToAdd; - ToAdd.Offset = Rel.r_offset; - getAddend(ToAdd.Addend, Rel); - ToAdd.Type = Rel.getType(Relocs->getObject().IsMips64EL); - - if (uint32_t Sym = Rel.getSymbol(Relocs->getObject().IsMips64EL)) { - if (!Relocs->getObject().SymbolTable) - return createStringError( - errc::invalid_argument, - "'" + Relocs->Name + "': relocation references symbol with index " + - Twine(Sym) + ", but there is no symbol table"); - Expected SymByIndex = - Relocs->getObject().SymbolTable->getSymbolByIndex(Sym); - if (!SymByIndex) - return SymByIndex.takeError(); - - ToAdd.RelocSymbol = *SymByIndex; - } - - Relocs->addRelocation(ToAdd); - } - - return Error::success(); -} - -Expected SectionTableRef::getSection(uint32_t Index, - Twine ErrMsg) { - if (Index == SHN_UNDEF || Index > Sections.size()) - return createStringError(errc::invalid_argument, ErrMsg); - return Sections[Index - 1].get(); -} - -template -Expected SectionTableRef::getSectionOfType(uint32_t Index, - Twine IndexErrMsg, - Twine TypeErrMsg) { - Expected BaseSec = getSection(Index, IndexErrMsg); - if (!BaseSec) - return BaseSec.takeError(); - - if (T *Sec = dyn_cast(*BaseSec)) - return Sec; - - return createStringError(errc::invalid_argument, TypeErrMsg); -} - -template -Expected ELFBuilder::makeSection(const Elf_Shdr &Shdr) { - switch (Shdr.sh_type) { - case SHT_REL: - case SHT_RELA: - if (Shdr.sh_flags & SHF_ALLOC) { - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection(*Data); - else - return Data.takeError(); - } - return Obj.addSection(Obj); - case SHT_STRTAB: - // If a string table is allocated we don't want to mess with it. That would - // mean altering the memory image. There are no special link types or - // anything so we can just use a Section. - if (Shdr.sh_flags & SHF_ALLOC) { - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection
(*Data); - else - return Data.takeError(); - } - return Obj.addSection(); - case SHT_HASH: - case SHT_GNU_HASH: - // Hash tables should refer to SHT_DYNSYM which we're not going to change. - // Because of this we don't need to mess with the hash tables either. - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection
(*Data); - else - return Data.takeError(); - case SHT_GROUP: - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection(*Data); - else - return Data.takeError(); - case SHT_DYNSYM: - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection(*Data); - else - return Data.takeError(); - case SHT_DYNAMIC: - if (Expected> Data = ElfFile.getSectionContents(Shdr)) - return Obj.addSection(*Data); - else - return Data.takeError(); - case SHT_SYMTAB: { - auto &SymTab = Obj.addSection(); - Obj.SymbolTable = &SymTab; - return SymTab; - } - case SHT_SYMTAB_SHNDX: { - auto &ShndxSection = Obj.addSection(); - Obj.SectionIndexTable = &ShndxSection; - return ShndxSection; - } - case SHT_NOBITS: - return Obj.addSection
(ArrayRef()); - default: { - Expected> Data = ElfFile.getSectionContents(Shdr); - if (!Data) - return Data.takeError(); - - Expected Name = ElfFile.getSectionName(Shdr); - if (!Name) - return Name.takeError(); - - if (Name->startswith(".zdebug") || (Shdr.sh_flags & ELF::SHF_COMPRESSED)) { - uint64_t DecompressedSize, DecompressedAlign; - std::tie(DecompressedSize, DecompressedAlign) = - getDecompressedSizeAndAlignment(*Data); - Expected NewSection = - CompressedSection::create(*Data, DecompressedSize, DecompressedAlign); - if (!NewSection) - return NewSection.takeError(); - - return Obj.addSection(std::move(*NewSection)); - } - - return Obj.addSection
(*Data); - } - } -} - -template Error ELFBuilder::readSectionHeaders() { - uint32_t Index = 0; - Expected::Elf_Shdr_Range> Sections = - ElfFile.sections(); - if (!Sections) - return Sections.takeError(); - - for (const typename ELFFile::Elf_Shdr &Shdr : *Sections) { - if (Index == 0) { - ++Index; - continue; - } - Expected Sec = makeSection(Shdr); - if (!Sec) - return Sec.takeError(); - - Expected SecName = ElfFile.getSectionName(Shdr); - if (!SecName) - return SecName.takeError(); - Sec->Name = SecName->str(); - Sec->Type = Sec->OriginalType = Shdr.sh_type; - Sec->Flags = Sec->OriginalFlags = Shdr.sh_flags; - Sec->Addr = Shdr.sh_addr; - Sec->Offset = Shdr.sh_offset; - Sec->OriginalOffset = Shdr.sh_offset; - Sec->Size = Shdr.sh_size; - Sec->Link = Shdr.sh_link; - Sec->Info = Shdr.sh_info; - Sec->Align = Shdr.sh_addralign; - Sec->EntrySize = Shdr.sh_entsize; - Sec->Index = Index++; - Sec->OriginalIndex = Sec->Index; - Sec->OriginalData = - ArrayRef(ElfFile.base() + Shdr.sh_offset, - (Shdr.sh_type == SHT_NOBITS) ? (size_t)0 : Shdr.sh_size); - } - - return Error::success(); -} - -template Error ELFBuilder::readSections(bool EnsureSymtab) { - uint32_t ShstrIndex = ElfFile.getHeader().e_shstrndx; - if (ShstrIndex == SHN_XINDEX) { - Expected Sec = ElfFile.getSection(0); - if (!Sec) - return Sec.takeError(); - - ShstrIndex = (*Sec)->sh_link; - } - - if (ShstrIndex == SHN_UNDEF) - Obj.HadShdrs = false; - else { - Expected Sec = - Obj.sections().template getSectionOfType( - ShstrIndex, - "e_shstrndx field value " + Twine(ShstrIndex) + " in elf header " + - " is invalid", - "e_shstrndx field value " + Twine(ShstrIndex) + " in elf header " + - " does not reference a string table"); - if (!Sec) - return Sec.takeError(); - - Obj.SectionNames = *Sec; - } - - // If a section index table exists we'll need to initialize it before we - // initialize the symbol table because the symbol table might need to - // reference it. - if (Obj.SectionIndexTable) - if (Error Err = Obj.SectionIndexTable->initialize(Obj.sections())) - return Err; - - // Now that all of the sections have been added we can fill out some extra - // details about symbol tables. We need the symbol table filled out before - // any relocations. - if (Obj.SymbolTable) { - if (Error Err = Obj.SymbolTable->initialize(Obj.sections())) - return Err; - if (Error Err = initSymbolTable(Obj.SymbolTable)) - return Err; - } else if (EnsureSymtab) { - if (Error Err = Obj.addNewSymbolTable()) - return Err; - } - - // Now that all sections and symbols have been added we can add - // relocations that reference symbols and set the link and info fields for - // relocation sections. 
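readSections above begins by resolving an overflowed e_shstrndx. The rule in isolation (resolveShstrndx is a hypothetical helper; field names per the ELF gABI):

#include <cstdint>

constexpr uint32_t SHN_XINDEX = 0xffff;

// e_shstrndx normally holds the index of the section-name string table.
// When that index does not fit, the field holds SHN_XINDEX and the real
// value lives in sh_link of the (otherwise unused) section header at
// index 0, mirroring the SHN_XINDEX check above.
uint32_t resolveShstrndx(uint16_t EShstrndx, uint32_t Shdr0Link) {
  return EShstrndx == SHN_XINDEX ? Shdr0Link : EShstrndx;
}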
- for (SectionBase &Sec : Obj.sections()) { - if (&Sec == Obj.SymbolTable) - continue; - if (Error Err = Sec.initialize(Obj.sections())) - return Err; - if (auto RelSec = dyn_cast(&Sec)) { - Expected::Elf_Shdr_Range> Sections = - ElfFile.sections(); - if (!Sections) - return Sections.takeError(); - - const typename ELFFile::Elf_Shdr *Shdr = - Sections->begin() + RelSec->Index; - if (RelSec->Type == SHT_REL) { - Expected::Elf_Rel_Range> Rels = - ElfFile.rels(*Shdr); - if (!Rels) - return Rels.takeError(); - - if (Error Err = initRelocations(RelSec, *Rels)) - return Err; - } else { - Expected::Elf_Rela_Range> Relas = - ElfFile.relas(*Shdr); - if (!Relas) - return Relas.takeError(); - - if (Error Err = initRelocations(RelSec, *Relas)) - return Err; - } - } else if (auto GroupSec = dyn_cast(&Sec)) { - if (Error Err = initGroupSection(GroupSec)) - return Err; - } - } - - return Error::success(); -} - -template Error ELFBuilder::build(bool EnsureSymtab) { - if (Error E = readSectionHeaders()) - return E; - if (Error E = findEhdrOffset()) - return E; - - // The ELFFile whose ELF headers and program headers are copied into the - // output file. Normally the same as ElfFile, but if we're extracting a - // loadable partition it will point to the partition's headers. - Expected> HeadersFile = ELFFile::create(toStringRef( - {ElfFile.base() + EhdrOffset, ElfFile.getBufSize() - EhdrOffset})); - if (!HeadersFile) - return HeadersFile.takeError(); - - const typename ELFFile::Elf_Ehdr &Ehdr = HeadersFile->getHeader(); - Obj.OSABI = Ehdr.e_ident[EI_OSABI]; - Obj.ABIVersion = Ehdr.e_ident[EI_ABIVERSION]; - Obj.Type = Ehdr.e_type; - Obj.Machine = Ehdr.e_machine; - Obj.Version = Ehdr.e_version; - Obj.Entry = Ehdr.e_entry; - Obj.Flags = Ehdr.e_flags; - - if (Error E = readSections(EnsureSymtab)) - return E; - return readProgramHeaders(*HeadersFile); -} - -Writer::~Writer() {} - -Reader::~Reader() {} - -Expected> -BinaryReader::create(bool /*EnsureSymtab*/) const { - return BinaryELFBuilder(MemBuf, NewSymbolVisibility).build(); -} - -Expected> IHexReader::parse() const { - SmallVector Lines; - std::vector Records; - bool HasSections = false; - - MemBuf->getBuffer().split(Lines, '\n'); - Records.reserve(Lines.size()); - for (size_t LineNo = 1; LineNo <= Lines.size(); ++LineNo) { - StringRef Line = Lines[LineNo - 1].trim(); - if (Line.empty()) - continue; - - Expected R = IHexRecord::parse(Line); - if (!R) - return parseError(LineNo, R.takeError()); - if (R->Type == IHexRecord::EndOfFile) - break; - HasSections |= (R->Type == IHexRecord::Data); - Records.push_back(*R); - } - if (!HasSections) - return parseError(-1U, "no sections"); - - return std::move(Records); -} - -Expected> -IHexReader::create(bool /*EnsureSymtab*/) const { - Expected> Records = parse(); - if (!Records) - return Records.takeError(); - - return IHexELFBuilder(*Records).build(); -} - -Expected> ELFReader::create(bool EnsureSymtab) const { - auto Obj = std::make_unique(); - if (auto *O = dyn_cast>(Bin)) { - ELFBuilder Builder(*O, *Obj, ExtractPartition); - if (Error Err = Builder.build(EnsureSymtab)) - return std::move(Err); - return std::move(Obj); - } else if (auto *O = dyn_cast>(Bin)) { - ELFBuilder Builder(*O, *Obj, ExtractPartition); - if (Error Err = Builder.build(EnsureSymtab)) - return std::move(Err); - return std::move(Obj); - } else if (auto *O = dyn_cast>(Bin)) { - ELFBuilder Builder(*O, *Obj, ExtractPartition); - if (Error Err = Builder.build(EnsureSymtab)) - return std::move(Err); - return std::move(Obj); - } else if (auto *O = 
dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
-    ELFBuilder<ELF64BE> Builder(*O, *Obj, ExtractPartition);
-    if (Error Err = Builder.build(EnsureSymtab))
-      return std::move(Err);
-    return std::move(Obj);
-  }
-  return createStringError(errc::invalid_argument, "invalid file type");
-}
-
-template <class ELFT> void ELFWriter<ELFT>::writeEhdr() {
-  Elf_Ehdr &Ehdr = *reinterpret_cast<Elf_Ehdr *>(Buf->getBufferStart());
-  std::fill(Ehdr.e_ident, Ehdr.e_ident + 16, 0);
-  Ehdr.e_ident[EI_MAG0] = 0x7f;
-  Ehdr.e_ident[EI_MAG1] = 'E';
-  Ehdr.e_ident[EI_MAG2] = 'L';
-  Ehdr.e_ident[EI_MAG3] = 'F';
-  Ehdr.e_ident[EI_CLASS] = ELFT::Is64Bits ? ELFCLASS64 : ELFCLASS32;
-  Ehdr.e_ident[EI_DATA] =
-      ELFT::TargetEndianness == support::big ? ELFDATA2MSB : ELFDATA2LSB;
-  Ehdr.e_ident[EI_VERSION] = EV_CURRENT;
-  Ehdr.e_ident[EI_OSABI] = Obj.OSABI;
-  Ehdr.e_ident[EI_ABIVERSION] = Obj.ABIVersion;
-
-  Ehdr.e_type = Obj.Type;
-  Ehdr.e_machine = Obj.Machine;
-  Ehdr.e_version = Obj.Version;
-  Ehdr.e_entry = Obj.Entry;
-  // We have to use the fully-qualified name llvm::size
-  // since some compilers complain about ambiguous resolution.
-  Ehdr.e_phnum = llvm::size(Obj.segments());
-  Ehdr.e_phoff = (Ehdr.e_phnum != 0) ? Obj.ProgramHdrSegment.Offset : 0;
-  Ehdr.e_phentsize = (Ehdr.e_phnum != 0) ? sizeof(Elf_Phdr) : 0;
-  Ehdr.e_flags = Obj.Flags;
-  Ehdr.e_ehsize = sizeof(Elf_Ehdr);
-  if (WriteSectionHeaders && Obj.sections().size() != 0) {
-    Ehdr.e_shentsize = sizeof(Elf_Shdr);
-    Ehdr.e_shoff = Obj.SHOff;
-    // """
-    // If the number of sections is greater than or equal to
-    // SHN_LORESERVE (0xff00), this member has the value zero and the actual
-    // number of section header table entries is contained in the sh_size field
-    // of the section header at index 0.
-    // """
-    auto Shnum = Obj.sections().size() + 1;
-    if (Shnum >= SHN_LORESERVE)
-      Ehdr.e_shnum = 0;
-    else
-      Ehdr.e_shnum = Shnum;
-    // """
-    // If the section name string table section index is greater than or equal
-    // to SHN_LORESERVE (0xff00), this member has the value SHN_XINDEX (0xffff)
-    // and the actual index of the section name string table section is
-    // contained in the sh_link field of the section header at index 0.
-    // """
-    if (Obj.SectionNames->Index >= SHN_LORESERVE)
-      Ehdr.e_shstrndx = SHN_XINDEX;
-    else
-      Ehdr.e_shstrndx = Obj.SectionNames->Index;
-  } else {
-    Ehdr.e_shentsize = 0;
-    Ehdr.e_shoff = 0;
-    Ehdr.e_shnum = 0;
-    Ehdr.e_shstrndx = 0;
-  }
-}
-
-template <class ELFT> void ELFWriter<ELFT>::writePhdrs() {
-  for (auto &Seg : Obj.segments())
-    writePhdr(Seg);
-}
-
-template <class ELFT> void ELFWriter<ELFT>::writeShdrs() {
-  // This reference serves to write the dummy section header at the beginning
-  // of the file. It is not used for anything else.
-  Elf_Shdr &Shdr =
-      *reinterpret_cast<Elf_Shdr *>(Buf->getBufferStart() + Obj.SHOff);
-  Shdr.sh_name = 0;
-  Shdr.sh_type = SHT_NULL;
-  Shdr.sh_flags = 0;
-  Shdr.sh_addr = 0;
-  Shdr.sh_offset = 0;
-  // See writeEhdr for why we do this.
-  uint64_t Shnum = Obj.sections().size() + 1;
-  if (Shnum >= SHN_LORESERVE)
-    Shdr.sh_size = Shnum;
-  else
-    Shdr.sh_size = 0;
-  // See writeEhdr for why we do this.
-  if (Obj.SectionNames != nullptr && Obj.SectionNames->Index >= SHN_LORESERVE)
-    Shdr.sh_link = Obj.SectionNames->Index;
-  else
-    Shdr.sh_link = 0;
-  Shdr.sh_info = 0;
-  Shdr.sh_addralign = 0;
-  Shdr.sh_entsize = 0;
-
-  for (SectionBase &Sec : Obj.sections())
-    writeShdr(Sec);
-}
-
-template <class ELFT> Error ELFWriter<ELFT>::writeSectionData() {
-  for (SectionBase &Sec : Obj.sections())
-    // Segments are responsible for writing their contents, so only write the
-    // section data if the section is not in a segment.
Note that this renders - // sections in segments effectively immutable. - if (Sec.ParentSegment == nullptr) - if (Error Err = Sec.accept(*SecWriter)) - return Err; - - return Error::success(); -} - -template void ELFWriter::writeSegmentData() { - for (Segment &Seg : Obj.segments()) { - size_t Size = std::min(Seg.FileSize, Seg.getContents().size()); - std::memcpy(Buf->getBufferStart() + Seg.Offset, Seg.getContents().data(), - Size); - } - - for (auto it : Obj.getUpdatedSections()) { - SectionBase *Sec = it.first; - ArrayRef Data = it.second; - - auto *Parent = Sec->ParentSegment; - assert(Parent && "This section should've been part of a segment."); - uint64_t Offset = - Sec->OriginalOffset - Parent->OriginalOffset + Parent->Offset; - llvm::copy(Data, Buf->getBufferStart() + Offset); - } - - // Iterate over removed sections and overwrite their old data with zeroes. - for (auto &Sec : Obj.removedSections()) { - Segment *Parent = Sec.ParentSegment; - if (Parent == nullptr || Sec.Type == SHT_NOBITS || Sec.Size == 0) - continue; - uint64_t Offset = - Sec.OriginalOffset - Parent->OriginalOffset + Parent->Offset; - std::memset(Buf->getBufferStart() + Offset, 0, Sec.Size); - } -} - -template -ELFWriter::ELFWriter(Object &Obj, raw_ostream &Buf, bool WSH, - bool OnlyKeepDebug) - : Writer(Obj, Buf), WriteSectionHeaders(WSH && Obj.HadShdrs), - OnlyKeepDebug(OnlyKeepDebug) {} - -Error Object::updateSection(StringRef Name, ArrayRef Data) { - auto It = llvm::find_if(Sections, - [&](const SecPtr &Sec) { return Sec->Name == Name; }); - if (It == Sections.end()) - return createStringError(errc::invalid_argument, "section '%s' not found", - Name.str().c_str()); - - auto *OldSec = It->get(); - if (!OldSec->hasContents()) - return createStringError( - errc::invalid_argument, - "section '%s' can't be updated because it does not have contents", - Name.str().c_str()); - - if (Data.size() > OldSec->Size && OldSec->ParentSegment) - return createStringError(errc::invalid_argument, - "cannot fit data of size %zu into section '%s' " - "with size %zu that is part of a segment", - Data.size(), Name.str().c_str(), OldSec->Size); - - if (!OldSec->ParentSegment) { - *It = std::make_unique(*OldSec, Data); - } else { - // The segment writer will be in charge of updating these contents. - OldSec->Size = Data.size(); - UpdatedSections[OldSec] = Data; - } - - return Error::success(); -} - -Error Object::removeSections( - bool AllowBrokenLinks, std::function ToRemove) { - - auto Iter = std::stable_partition( - std::begin(Sections), std::end(Sections), [=](const SecPtr &Sec) { - if (ToRemove(*Sec)) - return false; - if (auto RelSec = dyn_cast(Sec.get())) { - if (auto ToRelSec = RelSec->getSection()) - return !ToRemove(*ToRelSec); - } - return true; - }); - if (SymbolTable != nullptr && ToRemove(*SymbolTable)) - SymbolTable = nullptr; - if (SectionNames != nullptr && ToRemove(*SectionNames)) - SectionNames = nullptr; - if (SectionIndexTable != nullptr && ToRemove(*SectionIndexTable)) - SectionIndexTable = nullptr; - // Now make sure there are no remaining references to the sections that will - // be removed. Sometimes it is impossible to remove a reference so we emit - // an error here instead. 
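Stepping back to writeEhdr and writeShdrs above: both quote the gABI rule that overflowing values move into the otherwise-unused section header at index 0. The encoding, condensed into one hypothetical helper:

#include <cstdint>

constexpr uint32_t SHN_LORESERVE = 0xff00;
constexpr uint32_t SHN_XINDEX = 0xffff;

struct EhdrCounts { uint16_t e_shnum, e_shstrndx; };
struct NullShdr { uint64_t sh_size; uint32_t sh_link; };

// Shnum includes the null section header; Shstrndx is the index of the
// section-name string table. Values below SHN_LORESERVE are stored in
// the ELF header; larger ones spill into section header 0.
void encodeCounts(uint64_t Shnum, uint32_t Shstrndx, EhdrCounts &E,
                  NullShdr &Z) {
  E.e_shnum = Shnum >= SHN_LORESERVE ? 0 : uint16_t(Shnum);
  Z.sh_size = Shnum >= SHN_LORESERVE ? Shnum : 0;
  E.e_shstrndx =
      Shstrndx >= SHN_LORESERVE ? uint16_t(SHN_XINDEX) : uint16_t(Shstrndx);
  Z.sh_link = Shstrndx >= SHN_LORESERVE ? Shstrndx : 0;
}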
-  std::unordered_set<const SectionBase *> RemoveSections;
-  RemoveSections.reserve(std::distance(Iter, std::end(Sections)));
-  for (auto &RemoveSec : make_range(Iter, std::end(Sections))) {
-    for (auto &Segment : Segments)
-      Segment->removeSection(RemoveSec.get());
-    RemoveSec->onRemove();
-    RemoveSections.insert(RemoveSec.get());
-  }
-
-  // For each section that remains alive, we want to remove the dead references.
-  // This might either update the content of the section (e.g. remove symbols
-  // from the symbol table that belong to a removed section) or trigger an
-  // error if a live section critically depends on a section being removed
-  // somehow (e.g. the removed section is referenced by a relocation).
-  for (auto &KeepSec : make_range(std::begin(Sections), Iter)) {
-    if (Error E = KeepSec->removeSectionReferences(
-            AllowBrokenLinks, [&RemoveSections](const SectionBase *Sec) {
-              return RemoveSections.find(Sec) != RemoveSections.end();
-            }))
-      return E;
-  }
-
-  // Transfer removed sections into the Object RemovedSections container for use
-  // later.
-  std::move(Iter, Sections.end(), std::back_inserter(RemovedSections));
-  // Now finally get rid of them altogether.
-  Sections.erase(Iter, std::end(Sections));
-  return Error::success();
-}
-
-Error Object::replaceSections(
-    const DenseMap<SectionBase *, SectionBase *> &FromTo) {
-  auto SectionIndexLess = [](const SecPtr &Lhs, const SecPtr &Rhs) {
-    return Lhs->Index < Rhs->Index;
-  };
-  assert(llvm::is_sorted(Sections, SectionIndexLess) &&
-         "Sections are expected to be sorted by Index");
-  // Set indices of new sections so that they can be later sorted into positions
-  // of removed ones.
-  for (auto &I : FromTo)
-    I.second->Index = I.first->Index;
-
-  // Notify all sections about the replacement.
-  for (auto &Sec : Sections)
-    Sec->replaceSectionReferences(FromTo);
-
-  if (Error E = removeSections(
-          /*AllowBrokenLinks=*/false,
-          [=](const SectionBase &Sec) { return FromTo.count(&Sec) > 0; }))
-    return E;
-  llvm::sort(Sections, SectionIndexLess);
-  return Error::success();
-}
-
-Error Object::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
-  if (SymbolTable)
-    for (const SecPtr &Sec : Sections)
-      if (Error E = Sec->removeSymbols(ToRemove))
-        return E;
-  return Error::success();
-}
-
-Error Object::addNewSymbolTable() {
-  assert(!SymbolTable && "Object must not have a SymbolTable.");
-
-  // Reuse an existing SHT_STRTAB section if it exists.
-  StringTableSection *StrTab = nullptr;
-  for (SectionBase &Sec : sections()) {
-    if (Sec.Type == ELF::SHT_STRTAB && !(Sec.Flags & SHF_ALLOC)) {
-      StrTab = static_cast<StringTableSection *>(&Sec);
-
-      // Prefer a string table that is not the section header string table, if
-      // such a table exists.
-      if (SectionNames != &Sec)
-        break;
-    }
-  }
-  if (!StrTab)
-    StrTab = &addSection<StringTableSection>();
-
-  SymbolTableSection &SymTab = addSection<SymbolTableSection>();
-  SymTab.Name = ".symtab";
-  SymTab.Link = StrTab->Index;
-  if (Error Err = SymTab.initialize(sections()))
-    return Err;
-  SymTab.addSymbol("", 0, 0, nullptr, 0, 0, 0, 0);
-
-  SymbolTable = &SymTab;
-
-  return Error::success();
-}
-
-// Orders segments such that if x = y->ParentSegment then y comes before x.
-static void orderSegments(std::vector<Segment *> &Segments) {
-  llvm::stable_sort(Segments, compareSegmentsByOffset);
-}
-
-// This function finds a consistent layout for a list of segments starting from
-// an Offset. It assumes that Segments have been sorted by orderSegments and
-// returns an Offset one past the end of the last segment.
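The only subtle step in the function that follows is alignTo(Offset, Align, VAddr): round Offset up to the smallest value congruent to VAddr modulo Align, so the p_vaddr/p_offset congruence noted earlier is preserved for segments that get fresh offsets. A sketch that mirrors llvm::alignTo's three-argument (skewed) form:

#include <cassert>
#include <cstdint>

// Smallest N >= Value with N % Align == Skew % Align. Align must be
// non-zero; Align == 1 degenerates to the identity.
uint64_t alignToSkew(uint64_t Value, uint64_t Align, uint64_t Skew) {
  assert(Align != 0 && "alignment must be non-zero");
  Skew %= Align;
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
// e.g. alignToSkew(0x1234, 0x1000, 0x10) == 0x2010:
// 0x2010 % 0x1000 == 0x10, and it is the first such value >= 0x1234.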
-static uint64_t layoutSegments(std::vector &Segments, - uint64_t Offset) { - assert(llvm::is_sorted(Segments, compareSegmentsByOffset)); - // The only way a segment should move is if a section was between two - // segments and that section was removed. If that section isn't in a segment - // then it's acceptable, but not ideal, to simply move it to after the - // segments. So we can simply layout segments one after the other accounting - // for alignment. - for (Segment *Seg : Segments) { - // We assume that segments have been ordered by OriginalOffset and Index - // such that a parent segment will always come before a child segment in - // OrderedSegments. This means that the Offset of the ParentSegment should - // already be set and we can set our offset relative to it. - if (Seg->ParentSegment != nullptr) { - Segment *Parent = Seg->ParentSegment; - Seg->Offset = - Parent->Offset + Seg->OriginalOffset - Parent->OriginalOffset; - } else { - Seg->Offset = - alignTo(Offset, std::max(Seg->Align, 1), Seg->VAddr); - } - Offset = std::max(Offset, Seg->Offset + Seg->FileSize); - } - return Offset; -} - -// This function finds a consistent layout for a list of sections. It assumes -// that the ->ParentSegment of each section has already been laid out. The -// supplied starting Offset is used for the starting offset of any section that -// does not have a ParentSegment. It returns either the offset given if all -// sections had a ParentSegment or an offset one past the last section if there -// was a section that didn't have a ParentSegment. -template -static uint64_t layoutSections(Range Sections, uint64_t Offset) { - // Now the offset of every segment has been set we can assign the offsets - // of each section. For sections that are covered by a segment we should use - // the segment's original offset and the section's original offset to compute - // the offset from the start of the segment. Using the offset from the start - // of the segment we can assign a new offset to the section. For sections not - // covered by segments we can just bump Offset to the next valid location. - // While it is not necessary, layout the sections in the order based on their - // original offsets to resemble the input file as close as possible. - std::vector OutOfSegmentSections; - uint32_t Index = 1; - for (auto &Sec : Sections) { - Sec.Index = Index++; - if (Sec.ParentSegment != nullptr) { - auto Segment = *Sec.ParentSegment; - Sec.Offset = - Segment.Offset + (Sec.OriginalOffset - Segment.OriginalOffset); - } else - OutOfSegmentSections.push_back(&Sec); - } - - llvm::stable_sort(OutOfSegmentSections, - [](const SectionBase *Lhs, const SectionBase *Rhs) { - return Lhs->OriginalOffset < Rhs->OriginalOffset; - }); - for (auto *Sec : OutOfSegmentSections) { - Offset = alignTo(Offset, Sec->Align == 0 ? 1 : Sec->Align); - Sec->Offset = Offset; - if (Sec->Type != SHT_NOBITS) - Offset += Sec->Size; - } - return Offset; -} - -// Rewrite sh_offset after some sections are changed to SHT_NOBITS and thus -// occupy no space in the file. -static uint64_t layoutSectionsForOnlyKeepDebug(Object &Obj, uint64_t Off) { - // The layout algorithm requires the sections to be handled in the order of - // their offsets in the input file, at least inside segments. 
- std::vector Sections; - Sections.reserve(Obj.sections().size()); - uint32_t Index = 1; - for (auto &Sec : Obj.sections()) { - Sec.Index = Index++; - Sections.push_back(&Sec); - } - llvm::stable_sort(Sections, - [](const SectionBase *Lhs, const SectionBase *Rhs) { - return Lhs->OriginalOffset < Rhs->OriginalOffset; - }); - - for (auto *Sec : Sections) { - auto *FirstSec = Sec->ParentSegment && Sec->ParentSegment->Type == PT_LOAD - ? Sec->ParentSegment->firstSection() - : nullptr; - - // The first section in a PT_LOAD has to have congruent offset and address - // modulo the alignment, which usually equals the maximum page size. - if (FirstSec && FirstSec == Sec) - Off = alignTo(Off, Sec->ParentSegment->Align, Sec->Addr); - - // sh_offset is not significant for SHT_NOBITS sections, but the congruence - // rule must be followed if it is the first section in a PT_LOAD. Do not - // advance Off. - if (Sec->Type == SHT_NOBITS) { - Sec->Offset = Off; - continue; - } - - if (!FirstSec) { - // FirstSec being nullptr generally means that Sec does not have the - // SHF_ALLOC flag. - Off = Sec->Align ? alignTo(Off, Sec->Align) : Off; - } else if (FirstSec != Sec) { - // The offset is relative to the first section in the PT_LOAD segment. Use - // sh_offset for non-SHF_ALLOC sections. - Off = Sec->OriginalOffset - FirstSec->OriginalOffset + FirstSec->Offset; - } - Sec->Offset = Off; - Off += Sec->Size; - } - return Off; -} - -// Rewrite p_offset and p_filesz of non-PT_PHDR segments after sh_offset values -// have been updated. -static uint64_t layoutSegmentsForOnlyKeepDebug(std::vector &Segments, - uint64_t HdrEnd) { - uint64_t MaxOffset = 0; - for (Segment *Seg : Segments) { - if (Seg->Type == PT_PHDR) - continue; - - // The segment offset is generally the offset of the first section. - // - // For a segment containing no section (see sectionWithinSegment), if it has - // a parent segment, copy the parent segment's offset field. This works for - // empty PT_TLS. If no parent segment, use 0: the segment is not useful for - // debugging anyway. - const SectionBase *FirstSec = Seg->firstSection(); - uint64_t Offset = - FirstSec ? FirstSec->Offset - : (Seg->ParentSegment ? Seg->ParentSegment->Offset : 0); - uint64_t FileSize = 0; - for (const SectionBase *Sec : Seg->Sections) { - uint64_t Size = Sec->Type == SHT_NOBITS ? 0 : Sec->Size; - if (Sec->Offset + Size > Offset) - FileSize = std::max(FileSize, Sec->Offset + Size - Offset); - } - - // If the segment includes EHDR and program headers, don't make it smaller - // than the headers. - if (Seg->Offset < HdrEnd && HdrEnd <= Seg->Offset + Seg->FileSize) { - FileSize += Offset - Seg->Offset; - Offset = Seg->Offset; - FileSize = std::max(FileSize, HdrEnd - Offset); - } - - Seg->Offset = Offset; - Seg->FileSize = FileSize; - MaxOffset = std::max(MaxOffset, Offset + FileSize); - } - return MaxOffset; -} - -template void ELFWriter::initEhdrSegment() { - Segment &ElfHdr = Obj.ElfHdrSegment; - ElfHdr.Type = PT_PHDR; - ElfHdr.Flags = 0; - ElfHdr.VAddr = 0; - ElfHdr.PAddr = 0; - ElfHdr.FileSize = ElfHdr.MemSize = sizeof(Elf_Ehdr); - ElfHdr.Align = 0; -} - -template void ELFWriter::assignOffsets() { - // We need a temporary list of segments that has a special order to it - // so that we know that anytime ->ParentSegment is set that segment has - // already had its offset properly set. 
-  std::vector<Segment *> OrderedSegments;
-  for (Segment &Segment : Obj.segments())
-    OrderedSegments.push_back(&Segment);
-  OrderedSegments.push_back(&Obj.ElfHdrSegment);
-  OrderedSegments.push_back(&Obj.ProgramHdrSegment);
-  orderSegments(OrderedSegments);
-
-  uint64_t Offset;
-  if (OnlyKeepDebug) {
-    // For --only-keep-debug, the sections that did not preserve contents were
-    // changed to SHT_NOBITS. We now rewrite sh_offset fields of sections, and
-    // then rewrite p_offset/p_filesz of program headers.
-    uint64_t HdrEnd =
-        sizeof(Elf_Ehdr) + llvm::size(Obj.segments()) * sizeof(Elf_Phdr);
-    Offset = layoutSectionsForOnlyKeepDebug(Obj, HdrEnd);
-    Offset = std::max(Offset,
-                      layoutSegmentsForOnlyKeepDebug(OrderedSegments, HdrEnd));
-  } else {
-    // Offset is used as the start offset of the first segment to be laid out.
-    // Since the ELF Header (ElfHdrSegment) must be at the start of the file,
-    // we start at offset 0.
-    Offset = layoutSegments(OrderedSegments, 0);
-    Offset = layoutSections(Obj.sections(), Offset);
-  }
-  // If we need to write the section header table out then we need to align the
-  // Offset so that SHOffset is valid.
-  if (WriteSectionHeaders)
-    Offset = alignTo(Offset, sizeof(Elf_Addr));
-  Obj.SHOff = Offset;
-}
-
-template <class ELFT> size_t ELFWriter<ELFT>::totalSize() const {
-  // We already have the section header offset so we can calculate the total
-  // size by just adding up the size of each section header.
-  if (!WriteSectionHeaders)
-    return Obj.SHOff;
-  size_t ShdrCount = Obj.sections().size() + 1; // Includes null shdr.
-  return Obj.SHOff + ShdrCount * sizeof(Elf_Shdr);
-}
-
-template <class ELFT> Error ELFWriter<ELFT>::write() {
-  // Segment data must be written first, so that the ELF header and program
-  // header tables can overwrite it, if covered by a segment.
-  writeSegmentData();
-  writeEhdr();
-  writePhdrs();
-  if (Error E = writeSectionData())
-    return E;
-  if (WriteSectionHeaders)
-    writeShdrs();
-
-  // TODO: Implement direct writing to the output stream (without intermediate
-  // memory buffer Buf).
-  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
-  return Error::success();
-}
-
-static Error removeUnneededSections(Object &Obj) {
-  // We can remove an empty symbol table from non-relocatable objects.
-  // Relocatable objects typically have relocation sections whose
-  // sh_link field points to .symtab, so we can't remove .symtab
-  // even if it is empty.
-  if (Obj.isRelocatable() || Obj.SymbolTable == nullptr ||
-      !Obj.SymbolTable->empty())
-    return Error::success();
-
-  // .strtab can be used for section names. In such a case we shouldn't
-  // remove it.
-  auto *StrTab = Obj.SymbolTable->getStrTab() == Obj.SectionNames
-                     ? nullptr
-                     : Obj.SymbolTable->getStrTab();
-  return Obj.removeSections(false, [&](const SectionBase &Sec) {
-    return &Sec == Obj.SymbolTable || &Sec == StrTab;
-  });
-}
-
-template <class ELFT> Error ELFWriter<ELFT>::finalize() {
-  // It could happen that SectionNames has been removed and yet the user wants
-  // a section header table output. We need to throw an error if a user tries
-  // to do that.
-  if (Obj.SectionNames == nullptr && WriteSectionHeaders)
-    return createStringError(llvm::errc::invalid_argument,
-                             "cannot write section header table because "
-                             "section header string table was removed");
-
-  if (Error E = removeUnneededSections(Obj))
-    return E;
-
-  // We need to assign indexes before we perform layout because we need to know
-  // if we need large indexes or not. We can assign indexes first and check as
-  // we go to see if we will actually need large indexes.
- bool NeedsLargeIndexes = false; - if (Obj.sections().size() >= SHN_LORESERVE) { - SectionTableRef Sections = Obj.sections(); - // Sections doesn't include the null section header, so account for this - // when skipping the first N sections. - NeedsLargeIndexes = - any_of(drop_begin(Sections, SHN_LORESERVE - 1), - [](const SectionBase &Sec) { return Sec.HasSymbol; }); - // TODO: handle case where only one section needs the large index table but - // only needs it because the large index table hasn't been removed yet. - } - - if (NeedsLargeIndexes) { - // This means we definitely need to have a section index table but if we - // already have one then we should use it instead of making a new one. - if (Obj.SymbolTable != nullptr && Obj.SectionIndexTable == nullptr) { - // Addition of a section to the end does not invalidate the indexes of - // other sections and assigns the correct index to the new section. - auto &Shndx = Obj.addSection(); - Obj.SymbolTable->setShndxTable(&Shndx); - Shndx.setSymTab(Obj.SymbolTable); - } - } else { - // Since we don't need SectionIndexTable we should remove it and all - // references to it. - if (Obj.SectionIndexTable != nullptr) { - // We do not support sections referring to the section index table. - if (Error E = Obj.removeSections(false /*AllowBrokenLinks*/, - [this](const SectionBase &Sec) { - return &Sec == Obj.SectionIndexTable; - })) - return E; - } - } - - // Make sure we add the names of all the sections. Importantly this must be - // done after we decide to add or remove SectionIndexes. - if (Obj.SectionNames != nullptr) - for (const SectionBase &Sec : Obj.sections()) - Obj.SectionNames->addString(Sec.Name); - - initEhdrSegment(); - - // Before we can prepare for layout the indexes need to be finalized. - // Also, the output arch may not be the same as the input arch, so fix up - // size-related fields before doing layout calculations. - uint64_t Index = 0; - auto SecSizer = std::make_unique>(); - for (SectionBase &Sec : Obj.sections()) { - Sec.Index = Index++; - if (Error Err = Sec.accept(*SecSizer)) - return Err; - } - - // The symbol table does not update all other sections on update. For - // instance, symbol names are not added as new symbols are added. This means - // that some sections, like .strtab, don't yet have their final size. - if (Obj.SymbolTable != nullptr) - Obj.SymbolTable->prepareForLayout(); - - // Now that all strings are added we want to finalize string table builders, - // because that affects section sizes which in turn affects section offsets. - for (SectionBase &Sec : Obj.sections()) - if (auto StrTab = dyn_cast(&Sec)) - StrTab->prepareForLayout(); - - assignOffsets(); - - // layoutSections could have modified section indexes, so we need - // to fill the index table after assignOffsets. - if (Obj.SymbolTable != nullptr) - Obj.SymbolTable->fillShndxTable(); - - // Finally now that all offsets and indexes have been set we can finalize any - // remaining issues. 
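
One detail from the index handling above deserves a concrete restatement: st_shndx is a 16-bit field, and section indexes at or above SHN_LORESERVE (0xff00) collide with its reserved values, so a symbol defined in such a section stores SHN_XINDEX instead and the real 32-bit index goes into the parallel SHT_SYMTAB_SHNDX table. A sketch of that encoding (hypothetical helper, not the tool's API):

#include <cstdint>

struct ShndxEncoding {
  uint16_t StShndx;     // value written to the symbol's st_shndx field
  uint32_t XindexEntry; // entry in the .symtab_shndx table (0 when unused)
};

static ShndxEncoding encodeSectionIndex(uint32_t SecIndex) {
  const uint16_t LoReserve = 0xff00; // ELF::SHN_LORESERVE
  const uint16_t XIndex = 0xffff;    // ELF::SHN_XINDEX
  if (SecIndex >= LoReserve)
    return {XIndex, SecIndex}; // escape: real index lives in .symtab_shndx
  return {static_cast<uint16_t>(SecIndex), 0};
}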
- uint64_t Offset = Obj.SHOff + sizeof(Elf_Shdr); - for (SectionBase &Sec : Obj.sections()) { - Sec.HeaderOffset = Offset; - Offset += sizeof(Elf_Shdr); - if (WriteSectionHeaders) - Sec.NameIndex = Obj.SectionNames->findIndex(Sec.Name); - Sec.finalize(); - } - - size_t TotalSize = totalSize(); - Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); - - SecWriter = std::make_unique>(*Buf); - return Error::success(); -} - -Error BinaryWriter::write() { - for (const SectionBase &Sec : Obj.allocSections()) - if (Error Err = Sec.accept(*SecWriter)) - return Err; - - // TODO: Implement direct writing to the output stream (without intermediate - // memory buffer Buf). - Out.write(Buf->getBufferStart(), Buf->getBufferSize()); - return Error::success(); -} - -Error BinaryWriter::finalize() { - // Compute the section LMA based on its sh_offset and the containing segment's - // p_offset and p_paddr. Also compute the minimum LMA of all non-empty - // sections as MinAddr. In the output, the contents between address 0 and - // MinAddr will be skipped. - uint64_t MinAddr = UINT64_MAX; - for (SectionBase &Sec : Obj.allocSections()) { - if (Sec.ParentSegment != nullptr) - Sec.Addr = - Sec.Offset - Sec.ParentSegment->Offset + Sec.ParentSegment->PAddr; - if (Sec.Type != SHT_NOBITS && Sec.Size > 0) - MinAddr = std::min(MinAddr, Sec.Addr); - } - - // Now that every section has been laid out we just need to compute the total - // file size. This might not be the same as the offset returned by - // layoutSections, because we want to truncate the last segment to the end of - // its last non-empty section, to match GNU objcopy's behaviour. - TotalSize = 0; - for (SectionBase &Sec : Obj.allocSections()) - if (Sec.Type != SHT_NOBITS && Sec.Size > 0) { - Sec.Offset = Sec.Addr - MinAddr; - TotalSize = std::max(TotalSize, Sec.Offset + Sec.Size); - } - - Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); - SecWriter = std::make_unique(*Buf); - return Error::success(); -} - -bool IHexWriter::SectionCompare::operator()(const SectionBase *Lhs, - const SectionBase *Rhs) const { - return (sectionPhysicalAddr(Lhs) & 0xFFFFFFFFU) < - (sectionPhysicalAddr(Rhs) & 0xFFFFFFFFU); -} - -uint64_t IHexWriter::writeEntryPointRecord(uint8_t *Buf) { - IHexLineData HexData; - uint8_t Data[4] = {}; - // We don't write entry point record if entry is zero. - if (Obj.Entry == 0) - return 0; - - if (Obj.Entry <= 0xFFFFFU) { - Data[0] = ((Obj.Entry & 0xF0000U) >> 12) & 0xFF; - support::endian::write(&Data[2], static_cast(Obj.Entry), - support::big); - HexData = IHexRecord::getLine(IHexRecord::StartAddr80x86, 0, Data); - } else { - support::endian::write(Data, static_cast(Obj.Entry), - support::big); - HexData = IHexRecord::getLine(IHexRecord::StartAddr, 0, Data); - } - memcpy(Buf, HexData.data(), HexData.size()); - return HexData.size(); -} - -uint64_t IHexWriter::writeEndOfFileRecord(uint8_t *Buf) { - IHexLineData HexData = IHexRecord::getLine(IHexRecord::EndOfFile, 0, {}); - memcpy(Buf, HexData.data(), HexData.size()); - return HexData.size(); -} - -Error IHexWriter::write() { - IHexSectionWriter Writer(*Buf); - // Write sections. 
-  for (const SectionBase *Sec : Sections)
-    if (Error Err = Sec->accept(Writer))
-      return Err;
-
-  uint64_t Offset = Writer.getBufferOffset();
-  // Write entry point address.
-  Offset += writeEntryPointRecord(
-      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + Offset);
-  // Write EOF.
-  Offset += writeEndOfFileRecord(
-      reinterpret_cast<uint8_t *>(Buf->getBufferStart()) + Offset);
-  assert(Offset == TotalSize);
-
-  // TODO: Implement direct writing to the output stream (without intermediate
-  // memory buffer Buf).
-  Out.write(Buf->getBufferStart(), Buf->getBufferSize());
-  return Error::success();
-}
-
-Error IHexWriter::checkSection(const SectionBase &Sec) {
-  uint64_t Addr = sectionPhysicalAddr(&Sec);
-  if (addressOverflows32bit(Addr) || addressOverflows32bit(Addr + Sec.Size - 1))
-    return createStringError(
-        errc::invalid_argument,
-        "Section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
-        Sec.Name.c_str(), Addr, Addr + Sec.Size - 1);
-  return Error::success();
-}
-
-Error IHexWriter::finalize() {
-  // We can't write 64-bit addresses.
-  if (addressOverflows32bit(Obj.Entry))
-    return createStringError(errc::invalid_argument,
-                             "Entry point address 0x%llx overflows 32 bits",
-                             Obj.Entry);
-
-  for (const SectionBase &Sec : Obj.sections())
-    if ((Sec.Flags & ELF::SHF_ALLOC) && Sec.Type != ELF::SHT_NOBITS &&
-        Sec.Size > 0) {
-      if (Error E = checkSection(Sec))
-        return E;
-      Sections.insert(&Sec);
-    }
-
-  std::unique_ptr<WritableMemoryBuffer> EmptyBuffer =
-      WritableMemoryBuffer::getNewMemBuffer(0);
-  if (!EmptyBuffer)
-    return createStringError(errc::not_enough_memory,
-                             "failed to allocate memory buffer of 0 bytes");
-
-  IHexSectionWriterBase LengthCalc(*EmptyBuffer);
-  for (const SectionBase *Sec : Sections)
-    if (Error Err = Sec->accept(LengthCalc))
-      return Err;
-
-  // We need space to write section records + StartAddress record
-  // (if start address is not zero) + EndOfFile record.
-  TotalSize = LengthCalc.getBufferOffset() +
-              (Obj.Entry ? IHexRecord::getLineLength(4) : 0) +
-              IHexRecord::getLineLength(0);
-
-  Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
-  if (!Buf)
-    return createStringError(errc::not_enough_memory,
-                             "failed to allocate memory buffer of " +
-                                 Twine::utohexstr(TotalSize) + " bytes");
-
-  return Error::success();
-}
-
-namespace llvm {
-namespace objcopy {
-namespace elf {
-
-template class ELFBuilder<ELF64LE>;
-template class ELFBuilder<ELF64BE>;
-template class ELFBuilder<ELF32LE>;
-template class ELFBuilder<ELF32BE>;
-
-template class ELFWriter<ELF64LE>;
-template class ELFWriter<ELF64BE>;
-template class ELFWriter<ELF32LE>;
-template class ELFWriter<ELF32BE>;
-
-} // end namespace elf
-} // end namespace objcopy
-} // end namespace llvm
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.h b/llvm/tools/llvm-objcopy/ELF/Object.h
deleted file mode 100644
index 681ab8f56381..000000000000
--- a/llvm/tools/llvm-objcopy/ELF/Object.h
+++ /dev/null
@@ -1,1113 +0,0 @@
-//===- Object.h -------------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H -#define LLVM_TOOLS_OBJCOPY_OBJECT_H - -#include "CommonConfig.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/StringTableBuilder.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/FileOutputBuffer.h" -#include "llvm/Support/MemoryBuffer.h" -#include -#include -#include -#include -#include -#include - -namespace llvm { -enum class DebugCompressionType; -namespace objcopy { -namespace elf { - -class SectionBase; -class Section; -class OwnedDataSection; -class StringTableSection; -class SymbolTableSection; -class RelocationSection; -class DynamicRelocationSection; -class GnuDebugLinkSection; -class GroupSection; -class SectionIndexSection; -class CompressedSection; -class DecompressedSection; -class Segment; -class Object; -struct Symbol; - -class SectionTableRef { - ArrayRef> Sections; - -public: - using iterator = pointee_iterator *>; - - explicit SectionTableRef(ArrayRef> Secs) - : Sections(Secs) {} - SectionTableRef(const SectionTableRef &) = default; - - iterator begin() const { return iterator(Sections.data()); } - iterator end() const { return iterator(Sections.data() + Sections.size()); } - size_t size() const { return Sections.size(); } - - Expected getSection(uint32_t Index, Twine ErrMsg); - - template - Expected getSectionOfType(uint32_t Index, Twine IndexErrMsg, - Twine TypeErrMsg); -}; - -enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE }; - -class SectionVisitor { -public: - virtual ~SectionVisitor() = default; - - virtual Error visit(const Section &Sec) = 0; - virtual Error visit(const OwnedDataSection &Sec) = 0; - virtual Error visit(const StringTableSection &Sec) = 0; - virtual Error visit(const SymbolTableSection &Sec) = 0; - virtual Error visit(const RelocationSection &Sec) = 0; - virtual Error visit(const DynamicRelocationSection &Sec) = 0; - virtual Error visit(const GnuDebugLinkSection &Sec) = 0; - virtual Error visit(const GroupSection &Sec) = 0; - virtual Error visit(const SectionIndexSection &Sec) = 0; - virtual Error visit(const CompressedSection &Sec) = 0; - virtual Error visit(const DecompressedSection &Sec) = 0; -}; - -class MutableSectionVisitor { -public: - virtual ~MutableSectionVisitor() = default; - - virtual Error visit(Section &Sec) = 0; - virtual Error visit(OwnedDataSection &Sec) = 0; - virtual Error visit(StringTableSection &Sec) = 0; - virtual Error visit(SymbolTableSection &Sec) = 0; - virtual Error visit(RelocationSection &Sec) = 0; - virtual Error visit(DynamicRelocationSection &Sec) = 0; - virtual Error visit(GnuDebugLinkSection &Sec) = 0; - virtual Error visit(GroupSection &Sec) = 0; - virtual Error visit(SectionIndexSection &Sec) = 0; - virtual Error visit(CompressedSection &Sec) = 0; - virtual Error visit(DecompressedSection &Sec) = 0; -}; - -class SectionWriter : public SectionVisitor { -protected: - WritableMemoryBuffer &Out; - -public: - virtual ~SectionWriter() = default; - - Error visit(const Section &Sec) override; - Error visit(const OwnedDataSection &Sec) override; - Error visit(const StringTableSection &Sec) override; - Error visit(const DynamicRelocationSection &Sec) override; - virtual Error visit(const SymbolTableSection &Sec) override = 0; - virtual Error visit(const 
RelocationSection &Sec) override = 0; - virtual Error visit(const GnuDebugLinkSection &Sec) override = 0; - virtual Error visit(const GroupSection &Sec) override = 0; - virtual Error visit(const SectionIndexSection &Sec) override = 0; - virtual Error visit(const CompressedSection &Sec) override = 0; - virtual Error visit(const DecompressedSection &Sec) override = 0; - - explicit SectionWriter(WritableMemoryBuffer &Buf) : Out(Buf) {} -}; - -template class ELFSectionWriter : public SectionWriter { -private: - using Elf_Word = typename ELFT::Word; - using Elf_Rel = typename ELFT::Rel; - using Elf_Rela = typename ELFT::Rela; - using Elf_Sym = typename ELFT::Sym; - -public: - virtual ~ELFSectionWriter() {} - Error visit(const SymbolTableSection &Sec) override; - Error visit(const RelocationSection &Sec) override; - Error visit(const GnuDebugLinkSection &Sec) override; - Error visit(const GroupSection &Sec) override; - Error visit(const SectionIndexSection &Sec) override; - Error visit(const CompressedSection &Sec) override; - Error visit(const DecompressedSection &Sec) override; - - explicit ELFSectionWriter(WritableMemoryBuffer &Buf) : SectionWriter(Buf) {} -}; - -template class ELFSectionSizer : public MutableSectionVisitor { -private: - using Elf_Rel = typename ELFT::Rel; - using Elf_Rela = typename ELFT::Rela; - using Elf_Sym = typename ELFT::Sym; - using Elf_Word = typename ELFT::Word; - using Elf_Xword = typename ELFT::Xword; - -public: - Error visit(Section &Sec) override; - Error visit(OwnedDataSection &Sec) override; - Error visit(StringTableSection &Sec) override; - Error visit(DynamicRelocationSection &Sec) override; - Error visit(SymbolTableSection &Sec) override; - Error visit(RelocationSection &Sec) override; - Error visit(GnuDebugLinkSection &Sec) override; - Error visit(GroupSection &Sec) override; - Error visit(SectionIndexSection &Sec) override; - Error visit(CompressedSection &Sec) override; - Error visit(DecompressedSection &Sec) override; -}; - -#define MAKE_SEC_WRITER_FRIEND \ - friend class SectionWriter; \ - friend class IHexSectionWriterBase; \ - friend class IHexSectionWriter; \ - template friend class ELFSectionWriter; \ - template friend class ELFSectionSizer; - -class BinarySectionWriter : public SectionWriter { -public: - virtual ~BinarySectionWriter() {} - - Error visit(const SymbolTableSection &Sec) override; - Error visit(const RelocationSection &Sec) override; - Error visit(const GnuDebugLinkSection &Sec) override; - Error visit(const GroupSection &Sec) override; - Error visit(const SectionIndexSection &Sec) override; - Error visit(const CompressedSection &Sec) override; - Error visit(const DecompressedSection &Sec) override; - - explicit BinarySectionWriter(WritableMemoryBuffer &Buf) - : SectionWriter(Buf) {} -}; - -using IHexLineData = SmallVector; - -struct IHexRecord { - // Memory address of the record. - uint16_t Addr; - // Record type (see below). - uint16_t Type; - // Record data in hexadecimal form. - StringRef HexData; - - // Helper method to get file length of the record - // including newline character - static size_t getLength(size_t DataSize) { - // :LLAAAATT[DD...DD]CC' - return DataSize * 2 + 11; - } - - // Gets length of line in a file (getLength + CRLF). - static size_t getLineLength(size_t DataSize) { - return getLength(DataSize) + 2; - } - - // Given type, address and data returns line which can - // be written to output file. 
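
As a worked example of the ":LLAAAATT[DD...DD]CC" shape described above, the following sketch builds one record line; the names are ours and this is not the IHexRecord implementation itself. The trailing checksum byte CC is the two's complement of the sum of every byte from the length field through the last data byte.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

static std::string makeIHexLine(uint8_t Type, uint16_t Addr,
                                const std::vector<uint8_t> &Data) {
  // Sum the record bytes: length, address high/low, type, then data.
  uint8_t Sum =
      static_cast<uint8_t>(Data.size() + (Addr >> 8) + (Addr & 0xFF) + Type);
  for (uint8_t B : Data)
    Sum += B;
  uint8_t Checksum = static_cast<uint8_t>(0x100 - Sum); // two's complement

  char Buf[16];
  std::snprintf(Buf, sizeof(Buf), ":%02X%04X%02X",
                static_cast<unsigned>(Data.size()),
                static_cast<unsigned>(Addr), static_cast<unsigned>(Type));
  std::string Line = Buf;
  for (uint8_t B : Data) {
    std::snprintf(Buf, sizeof(Buf), "%02X", static_cast<unsigned>(B));
    Line += Buf;
  }
  std::snprintf(Buf, sizeof(Buf), "%02X", static_cast<unsigned>(Checksum));
  Line += Buf;
  return Line;
}

// makeIHexLine(0, 0x0100, {0xDE, 0xAD}) yields ":02010000DEAD72", whose
// length matches getLength(2) == 2 * 2 + 11 == 15 characters.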
-  static IHexLineData getLine(uint8_t Type, uint16_t Addr,
-                              ArrayRef<uint8_t> Data);
-
-  // Parses the line and returns record if possible.
-  // Line should be trimmed from whitespace characters.
-  static Expected<IHexRecord> parse(StringRef Line);
-
-  // Calculates checksum of stringified record representation
-  // S must NOT contain leading ':' and trailing whitespace
-  // characters
-  static uint8_t getChecksum(StringRef S);
-
-  enum Type {
-    // Contains data and a 16-bit starting address for the data.
-    // The byte count specifies number of data bytes in the record.
-    Data = 0,
-    // Must occur exactly once per file in the last line of the file.
-    // The data field is empty (thus byte count is 00) and the address
-    // field is typically 0000.
-    EndOfFile = 1,
-    // The data field contains a 16-bit segment base address (thus byte
-    // count is always 02) compatible with 80x86 real mode addressing.
-    // The address field (typically 0000) is ignored. The segment address
-    // from the most recent 02 record is multiplied by 16 and added to each
-    // subsequent data record address to form the physical starting address
-    // for the data. This allows addressing up to one megabyte of address
-    // space.
-    SegmentAddr = 2,
-    // For 80x86 processors, specifies the initial content of the CS:IP
-    // registers. The address field is 0000, the byte count is always 04,
-    // the first two data bytes are the CS value, the latter two are the
-    // IP value.
-    StartAddr80x86 = 3,
-    // Allows for 32 bit addressing (up to 4GiB). The record's address field
-    // is ignored (typically 0000) and its byte count is always 02. The two
-    // data bytes (big endian) specify the upper 16 bits of the 32 bit
-    // absolute address for all subsequent type 00 records
-    ExtendedAddr = 4,
-    // The address field is 0000 (not used) and the byte count is always 04.
-    // The four data bytes represent a 32-bit address value. In the case of
-    // 80386 and higher CPUs, this address is loaded into the EIP register.
-    StartAddr = 5,
-    // We have no other valid types
-    InvalidType = 6
-  };
-};
-
-// Base class for IHexSectionWriter. This class implements the writing
-// algorithm, but doesn't actually write records. It is used for output buffer
-// size calculation in IHexWriter::finalize.
-class IHexSectionWriterBase : public BinarySectionWriter { - // 20-bit segment address - uint32_t SegmentAddr = 0; - // Extended linear address - uint32_t BaseAddr = 0; - - // Write segment address corresponding to 'Addr' - uint64_t writeSegmentAddr(uint64_t Addr); - // Write extended linear (base) address corresponding to 'Addr' - uint64_t writeBaseAddr(uint64_t Addr); - -protected: - // Offset in the output buffer - uint64_t Offset = 0; - - void writeSection(const SectionBase *Sec, ArrayRef Data); - virtual void writeData(uint8_t Type, uint16_t Addr, ArrayRef Data); - -public: - explicit IHexSectionWriterBase(WritableMemoryBuffer &Buf) - : BinarySectionWriter(Buf) {} - - uint64_t getBufferOffset() const { return Offset; } - Error visit(const Section &Sec) final; - Error visit(const OwnedDataSection &Sec) final; - Error visit(const StringTableSection &Sec) override; - Error visit(const DynamicRelocationSection &Sec) final; - using BinarySectionWriter::visit; -}; - -// Real IHEX section writer -class IHexSectionWriter : public IHexSectionWriterBase { -public: - IHexSectionWriter(WritableMemoryBuffer &Buf) : IHexSectionWriterBase(Buf) {} - - void writeData(uint8_t Type, uint16_t Addr, ArrayRef Data) override; - Error visit(const StringTableSection &Sec) override; -}; - -class Writer { -protected: - Object &Obj; - std::unique_ptr Buf; - raw_ostream &Out; - -public: - virtual ~Writer(); - virtual Error finalize() = 0; - virtual Error write() = 0; - - Writer(Object &O, raw_ostream &Out) : Obj(O), Out(Out) {} -}; - -template class ELFWriter : public Writer { -private: - using Elf_Addr = typename ELFT::Addr; - using Elf_Shdr = typename ELFT::Shdr; - using Elf_Phdr = typename ELFT::Phdr; - using Elf_Ehdr = typename ELFT::Ehdr; - - void initEhdrSegment(); - - void writeEhdr(); - void writePhdr(const Segment &Seg); - void writeShdr(const SectionBase &Sec); - - void writePhdrs(); - void writeShdrs(); - Error writeSectionData(); - void writeSegmentData(); - - void assignOffsets(); - - std::unique_ptr> SecWriter; - - size_t totalSize() const; - -public: - virtual ~ELFWriter() {} - bool WriteSectionHeaders; - - // For --only-keep-debug, select an alternative section/segment layout - // algorithm. 
- bool OnlyKeepDebug; - - Error finalize() override; - Error write() override; - ELFWriter(Object &Obj, raw_ostream &Out, bool WSH, bool OnlyKeepDebug); -}; - -class BinaryWriter : public Writer { -private: - std::unique_ptr SecWriter; - - uint64_t TotalSize = 0; - -public: - ~BinaryWriter() {} - Error finalize() override; - Error write() override; - BinaryWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {} -}; - -class IHexWriter : public Writer { - struct SectionCompare { - bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const; - }; - - std::set Sections; - size_t TotalSize = 0; - - Error checkSection(const SectionBase &Sec); - uint64_t writeEntryPointRecord(uint8_t *Buf); - uint64_t writeEndOfFileRecord(uint8_t *Buf); - -public: - ~IHexWriter() {} - Error finalize() override; - Error write() override; - IHexWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {} -}; - -class SectionBase { -public: - std::string Name; - Segment *ParentSegment = nullptr; - uint64_t HeaderOffset = 0; - uint32_t Index = 0; - - uint32_t OriginalIndex = 0; - uint64_t OriginalFlags = 0; - uint64_t OriginalType = ELF::SHT_NULL; - uint64_t OriginalOffset = std::numeric_limits::max(); - - uint64_t Addr = 0; - uint64_t Align = 1; - uint32_t EntrySize = 0; - uint64_t Flags = 0; - uint64_t Info = 0; - uint64_t Link = ELF::SHN_UNDEF; - uint64_t NameIndex = 0; - uint64_t Offset = 0; - uint64_t Size = 0; - uint64_t Type = ELF::SHT_NULL; - ArrayRef OriginalData; - bool HasSymbol = false; - - SectionBase() = default; - SectionBase(const SectionBase &) = default; - - virtual ~SectionBase() = default; - - virtual Error initialize(SectionTableRef SecTable); - virtual void finalize(); - // Remove references to these sections. The list of sections must be sorted. - virtual Error - removeSectionReferences(bool AllowBrokenLinks, - function_ref ToRemove); - virtual Error removeSymbols(function_ref ToRemove); - virtual Error accept(SectionVisitor &Visitor) const = 0; - virtual Error accept(MutableSectionVisitor &Visitor) = 0; - virtual void markSymbols(); - virtual void - replaceSectionReferences(const DenseMap &); - virtual bool hasContents() const { return false; } - // Notify the section that it is subject to removal. - virtual void onRemove(); -}; - -class Segment { -private: - struct SectionCompare { - bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const { - // Some sections might have the same address if one of them is empty. To - // fix this we can use the lexicographic ordering on ->Addr and the - // original index. 
- if (Lhs->OriginalOffset == Rhs->OriginalOffset) - return Lhs->OriginalIndex < Rhs->OriginalIndex; - return Lhs->OriginalOffset < Rhs->OriginalOffset; - } - }; - -public: - uint32_t Type = 0; - uint32_t Flags = 0; - uint64_t Offset = 0; - uint64_t VAddr = 0; - uint64_t PAddr = 0; - uint64_t FileSize = 0; - uint64_t MemSize = 0; - uint64_t Align = 0; - - uint32_t Index = 0; - uint64_t OriginalOffset = 0; - Segment *ParentSegment = nullptr; - ArrayRef Contents; - std::set Sections; - - explicit Segment(ArrayRef Data) : Contents(Data) {} - Segment() = default; - - const SectionBase *firstSection() const { - if (!Sections.empty()) - return *Sections.begin(); - return nullptr; - } - - void removeSection(const SectionBase *Sec) { Sections.erase(Sec); } - void addSection(const SectionBase *Sec) { Sections.insert(Sec); } - - ArrayRef getContents() const { return Contents; } -}; - -class Section : public SectionBase { - MAKE_SEC_WRITER_FRIEND - - ArrayRef Contents; - SectionBase *LinkSection = nullptr; - -public: - explicit Section(ArrayRef Data) : Contents(Data) {} - - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - Error removeSectionReferences( - bool AllowBrokenLinks, - function_ref ToRemove) override; - Error initialize(SectionTableRef SecTable) override; - void finalize() override; - bool hasContents() const override { - return Type != ELF::SHT_NOBITS && Type != ELF::SHT_NULL; - } -}; - -class OwnedDataSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - - std::vector Data; - -public: - OwnedDataSection(StringRef SecName, ArrayRef Data) - : Data(std::begin(Data), std::end(Data)) { - Name = SecName.str(); - Type = OriginalType = ELF::SHT_PROGBITS; - Size = Data.size(); - OriginalOffset = std::numeric_limits::max(); - } - - OwnedDataSection(const Twine &SecName, uint64_t SecAddr, uint64_t SecFlags, - uint64_t SecOff) { - Name = SecName.str(); - Type = OriginalType = ELF::SHT_PROGBITS; - Addr = SecAddr; - Flags = OriginalFlags = SecFlags; - OriginalOffset = SecOff; - } - - OwnedDataSection(SectionBase &S, ArrayRef Data) - : SectionBase(S), Data(std::begin(Data), std::end(Data)) { - Size = Data.size(); - } - - void appendHexData(StringRef HexData); - Error accept(SectionVisitor &Sec) const override; - Error accept(MutableSectionVisitor &Visitor) override; - bool hasContents() const override { return true; } -}; - -class CompressedSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - - DebugCompressionType CompressionType; - uint64_t DecompressedSize; - uint64_t DecompressedAlign; - SmallVector CompressedData; - -public: - static Expected - create(const SectionBase &Sec, DebugCompressionType CompressionType); - static Expected create(ArrayRef CompressedData, - uint64_t DecompressedSize, - uint64_t DecompressedAlign); - - uint64_t getDecompressedSize() const { return DecompressedSize; } - uint64_t getDecompressedAlign() const { return DecompressedAlign; } - - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - - static bool classof(const SectionBase *S) { - return (S->OriginalFlags & ELF::SHF_COMPRESSED) || - (StringRef(S->Name).startswith(".zdebug")); - } - -private: - CompressedSection(const SectionBase &Sec, - DebugCompressionType CompressionType, Error &Err); - CompressedSection(ArrayRef CompressedData, uint64_t DecompressedSize, - uint64_t DecompressedAlign); -}; - -class DecompressedSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - -public: - explicit 
DecompressedSection(const CompressedSection &Sec) - : SectionBase(Sec) { - Size = Sec.getDecompressedSize(); - Align = Sec.getDecompressedAlign(); - Flags = OriginalFlags = (Flags & ~ELF::SHF_COMPRESSED); - if (StringRef(Name).startswith(".zdebug")) - Name = "." + Name.substr(2); - } - - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; -}; - -// There are two types of string tables that can exist, dynamic and not dynamic. -// In the dynamic case the string table is allocated. Changing a dynamic string -// table would mean altering virtual addresses and thus the memory image. So -// dynamic string tables should not have an interface to modify them or -// reconstruct them. This type lets us reconstruct a string table. To avoid -// this class being used for dynamic string tables (which has happened) the -// classof method checks that the particular instance is not allocated. This -// then agrees with the makeSection method used to construct most sections. -class StringTableSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - - StringTableBuilder StrTabBuilder; - -public: - StringTableSection() : StrTabBuilder(StringTableBuilder::ELF) { - Type = OriginalType = ELF::SHT_STRTAB; - } - - void addString(StringRef Name); - uint32_t findIndex(StringRef Name) const; - void prepareForLayout(); - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - - static bool classof(const SectionBase *S) { - if (S->OriginalFlags & ELF::SHF_ALLOC) - return false; - return S->OriginalType == ELF::SHT_STRTAB; - } -}; - -// Symbols have a st_shndx field that normally stores an index but occasionally -// stores a different special value. This enum keeps track of what the st_shndx -// field means. Most of the values are just copies of the special SHN_* values. -// SYMBOL_SIMPLE_INDEX means that the st_shndx is just an index of a section. 
-enum SymbolShndxType { - SYMBOL_SIMPLE_INDEX = 0, - SYMBOL_ABS = ELF::SHN_ABS, - SYMBOL_COMMON = ELF::SHN_COMMON, - SYMBOL_LOPROC = ELF::SHN_LOPROC, - SYMBOL_AMDGPU_LDS = ELF::SHN_AMDGPU_LDS, - SYMBOL_HEXAGON_SCOMMON = ELF::SHN_HEXAGON_SCOMMON, - SYMBOL_HEXAGON_SCOMMON_2 = ELF::SHN_HEXAGON_SCOMMON_2, - SYMBOL_HEXAGON_SCOMMON_4 = ELF::SHN_HEXAGON_SCOMMON_4, - SYMBOL_HEXAGON_SCOMMON_8 = ELF::SHN_HEXAGON_SCOMMON_8, - SYMBOL_HIPROC = ELF::SHN_HIPROC, - SYMBOL_LOOS = ELF::SHN_LOOS, - SYMBOL_HIOS = ELF::SHN_HIOS, - SYMBOL_XINDEX = ELF::SHN_XINDEX, -}; - -struct Symbol { - uint8_t Binding; - SectionBase *DefinedIn = nullptr; - SymbolShndxType ShndxType; - uint32_t Index; - std::string Name; - uint32_t NameIndex; - uint64_t Size; - uint8_t Type; - uint64_t Value; - uint8_t Visibility; - bool Referenced = false; - - uint16_t getShndx() const; - bool isCommon() const; -}; - -class SectionIndexSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - -private: - std::vector Indexes; - SymbolTableSection *Symbols = nullptr; - -public: - virtual ~SectionIndexSection() {} - void addIndex(uint32_t Index) { - assert(Size > 0); - Indexes.push_back(Index); - } - - void reserve(size_t NumSymbols) { - Indexes.reserve(NumSymbols); - Size = NumSymbols * 4; - } - void setSymTab(SymbolTableSection *SymTab) { Symbols = SymTab; } - Error initialize(SectionTableRef SecTable) override; - void finalize() override; - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - - SectionIndexSection() { - Name = ".symtab_shndx"; - Align = 4; - EntrySize = 4; - Type = OriginalType = ELF::SHT_SYMTAB_SHNDX; - } -}; - -class SymbolTableSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - - void setStrTab(StringTableSection *StrTab) { SymbolNames = StrTab; } - void assignIndices(); - -protected: - std::vector> Symbols; - StringTableSection *SymbolNames = nullptr; - SectionIndexSection *SectionIndexTable = nullptr; - - using SymPtr = std::unique_ptr; - -public: - SymbolTableSection() { Type = OriginalType = ELF::SHT_SYMTAB; } - - void addSymbol(Twine Name, uint8_t Bind, uint8_t Type, SectionBase *DefinedIn, - uint64_t Value, uint8_t Visibility, uint16_t Shndx, - uint64_t SymbolSize); - void prepareForLayout(); - // An 'empty' symbol table still contains a null symbol. - bool empty() const { return Symbols.size() == 1; } - void setShndxTable(SectionIndexSection *ShndxTable) { - SectionIndexTable = ShndxTable; - } - const SectionIndexSection *getShndxTable() const { return SectionIndexTable; } - void fillShndxTable(); - const SectionBase *getStrTab() const { return SymbolNames; } - Expected getSymbolByIndex(uint32_t Index) const; - Expected getSymbolByIndex(uint32_t Index); - void updateSymbols(function_ref Callable); - - Error removeSectionReferences( - bool AllowBrokenLinks, - function_ref ToRemove) override; - Error initialize(SectionTableRef SecTable) override; - void finalize() override; - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - Error removeSymbols(function_ref ToRemove) override; - void replaceSectionReferences( - const DenseMap &FromTo) override; - - static bool classof(const SectionBase *S) { - return S->OriginalType == ELF::SHT_SYMTAB; - } -}; - -struct Relocation { - Symbol *RelocSymbol = nullptr; - uint64_t Offset; - uint64_t Addend; - uint32_t Type; -}; - -// All relocation sections denote relocations to apply to another section. 
-// However, some relocation sections use a dynamic symbol table and others use -// a regular symbol table. Because the types of the two symbol tables differ in -// our system (because they should behave differently) we can't uniformly -// represent all relocations with the same base class if we expose an interface -// that mentions the symbol table type. So we split the two base types into two -// different classes, one which handles the section the relocation is applied to -// and another which handles the symbol table type. The symbol table type is -// taken as a type parameter to the class (see RelocSectionWithSymtabBase). -class RelocationSectionBase : public SectionBase { -protected: - SectionBase *SecToApplyRel = nullptr; - -public: - const SectionBase *getSection() const { return SecToApplyRel; } - void setSection(SectionBase *Sec) { SecToApplyRel = Sec; } - - StringRef getNamePrefix() const; - - static bool classof(const SectionBase *S) { - return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA; - } -}; - -// Takes the symbol table type to use as a parameter so that we can deduplicate -// that code between the two symbol table types. -template -class RelocSectionWithSymtabBase : public RelocationSectionBase { - void setSymTab(SymTabType *SymTab) { Symbols = SymTab; } - -protected: - RelocSectionWithSymtabBase() = default; - - SymTabType *Symbols = nullptr; - -public: - Error initialize(SectionTableRef SecTable) override; - void finalize() override; -}; - -class RelocationSection - : public RelocSectionWithSymtabBase { - MAKE_SEC_WRITER_FRIEND - - std::vector Relocations; - const Object &Obj; - -public: - RelocationSection(const Object &O) : Obj(O) {} - void addRelocation(Relocation Rel) { Relocations.push_back(Rel); } - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; - Error removeSectionReferences( - bool AllowBrokenLinks, - function_ref ToRemove) override; - Error removeSymbols(function_ref ToRemove) override; - void markSymbols() override; - void replaceSectionReferences( - const DenseMap &FromTo) override; - const Object &getObject() const { return Obj; } - - static bool classof(const SectionBase *S) { - if (S->OriginalFlags & ELF::SHF_ALLOC) - return false; - return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA; - } -}; - -// TODO: The way stripping and groups interact is complicated -// and still needs to be worked on. - -class GroupSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - const SymbolTableSection *SymTab = nullptr; - Symbol *Sym = nullptr; - ELF::Elf32_Word FlagWord; - SmallVector GroupMembers; - -public: - // TODO: Contents is present in several classes of the hierarchy. - // This needs to be refactored to avoid duplication. 
- ArrayRef Contents; - - explicit GroupSection(ArrayRef Data) : Contents(Data) {} - - void setSymTab(const SymbolTableSection *SymTabSec) { SymTab = SymTabSec; } - void setSymbol(Symbol *S) { Sym = S; } - void setFlagWord(ELF::Elf32_Word W) { FlagWord = W; } - void addMember(SectionBase *Sec) { GroupMembers.push_back(Sec); } - - Error accept(SectionVisitor &) const override; - Error accept(MutableSectionVisitor &Visitor) override; - void finalize() override; - Error removeSectionReferences( - bool AllowBrokenLinks, - function_ref ToRemove) override; - Error removeSymbols(function_ref ToRemove) override; - void markSymbols() override; - void replaceSectionReferences( - const DenseMap &FromTo) override; - void onRemove() override; - - static bool classof(const SectionBase *S) { - return S->OriginalType == ELF::SHT_GROUP; - } -}; - -class DynamicSymbolTableSection : public Section { -public: - explicit DynamicSymbolTableSection(ArrayRef Data) : Section(Data) {} - - static bool classof(const SectionBase *S) { - return S->OriginalType == ELF::SHT_DYNSYM; - } -}; - -class DynamicSection : public Section { -public: - explicit DynamicSection(ArrayRef Data) : Section(Data) {} - - static bool classof(const SectionBase *S) { - return S->OriginalType == ELF::SHT_DYNAMIC; - } -}; - -class DynamicRelocationSection - : public RelocSectionWithSymtabBase { - MAKE_SEC_WRITER_FRIEND - -private: - ArrayRef Contents; - -public: - explicit DynamicRelocationSection(ArrayRef Data) : Contents(Data) {} - - Error accept(SectionVisitor &) const override; - Error accept(MutableSectionVisitor &Visitor) override; - Error removeSectionReferences( - bool AllowBrokenLinks, - function_ref ToRemove) override; - - static bool classof(const SectionBase *S) { - if (!(S->OriginalFlags & ELF::SHF_ALLOC)) - return false; - return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA; - } -}; - -class GnuDebugLinkSection : public SectionBase { - MAKE_SEC_WRITER_FRIEND - -private: - StringRef FileName; - uint32_t CRC32; - - void init(StringRef File); - -public: - // If we add this section from an external source we can use this ctor. 
- explicit GnuDebugLinkSection(StringRef File, uint32_t PrecomputedCRC); - Error accept(SectionVisitor &Visitor) const override; - Error accept(MutableSectionVisitor &Visitor) override; -}; - -class Reader { -public: - virtual ~Reader(); - virtual Expected> create(bool EnsureSymtab) const = 0; -}; - -using object::Binary; -using object::ELFFile; -using object::ELFObjectFile; -using object::OwningBinary; - -class BasicELFBuilder { -protected: - std::unique_ptr Obj; - - void initFileHeader(); - void initHeaderSegment(); - StringTableSection *addStrTab(); - SymbolTableSection *addSymTab(StringTableSection *StrTab); - Error initSections(); - -public: - BasicELFBuilder() : Obj(std::make_unique()) {} -}; - -class BinaryELFBuilder : public BasicELFBuilder { - MemoryBuffer *MemBuf; - uint8_t NewSymbolVisibility; - void addData(SymbolTableSection *SymTab); - -public: - BinaryELFBuilder(MemoryBuffer *MB, uint8_t NewSymbolVisibility) - : MemBuf(MB), NewSymbolVisibility(NewSymbolVisibility) {} - - Expected> build(); -}; - -class IHexELFBuilder : public BasicELFBuilder { - const std::vector &Records; - - void addDataSections(); - -public: - IHexELFBuilder(const std::vector &Records) : Records(Records) {} - - Expected> build(); -}; - -template class ELFBuilder { -private: - using Elf_Addr = typename ELFT::Addr; - using Elf_Shdr = typename ELFT::Shdr; - using Elf_Word = typename ELFT::Word; - - const ELFFile &ElfFile; - Object &Obj; - size_t EhdrOffset = 0; - Optional ExtractPartition; - - void setParentSegment(Segment &Child); - Error readProgramHeaders(const ELFFile &HeadersFile); - Error initGroupSection(GroupSection *GroupSec); - Error initSymbolTable(SymbolTableSection *SymTab); - Error readSectionHeaders(); - Error readSections(bool EnsureSymtab); - Error findEhdrOffset(); - Expected makeSection(const Elf_Shdr &Shdr); - -public: - ELFBuilder(const ELFObjectFile &ElfObj, Object &Obj, - Optional ExtractPartition); - - Error build(bool EnsureSymtab); -}; - -class BinaryReader : public Reader { - MemoryBuffer *MemBuf; - uint8_t NewSymbolVisibility; - -public: - BinaryReader(MemoryBuffer *MB, const uint8_t NewSymbolVisibility) - : MemBuf(MB), NewSymbolVisibility(NewSymbolVisibility) {} - Expected> create(bool EnsureSymtab) const override; -}; - -class IHexReader : public Reader { - MemoryBuffer *MemBuf; - - Expected> parse() const; - Error parseError(size_t LineNo, Error E) const { - return LineNo == -1U - ? createFileError(MemBuf->getBufferIdentifier(), std::move(E)) - : createFileError(MemBuf->getBufferIdentifier(), LineNo, - std::move(E)); - } - template - Error parseError(size_t LineNo, char const *Fmt, const Ts &... 
Vals) const { - Error E = createStringError(errc::invalid_argument, Fmt, Vals...); - return parseError(LineNo, std::move(E)); - } - -public: - IHexReader(MemoryBuffer *MB) : MemBuf(MB) {} - - Expected> create(bool EnsureSymtab) const override; -}; - -class ELFReader : public Reader { - Binary *Bin; - Optional ExtractPartition; - -public: - Expected> create(bool EnsureSymtab) const override; - explicit ELFReader(Binary *B, Optional ExtractPartition) - : Bin(B), ExtractPartition(ExtractPartition) {} -}; - -class Object { -private: - using SecPtr = std::unique_ptr; - using SegPtr = std::unique_ptr; - - std::vector Sections; - std::vector Segments; - std::vector RemovedSections; - DenseMap> UpdatedSections; - - static bool sectionIsAlloc(const SectionBase &Sec) { - return Sec.Flags & ELF::SHF_ALLOC; - }; - -public: - template - using ConstRange = iterator_range>::const_iterator>>; - - // It is often the case that the ELF header and the program header table are - // not present in any segment. This could be a problem during file layout, - // because other segments may get assigned an offset where either of the - // two should reside, which will effectively corrupt the resulting binary. - // Other than that we use these segments to track program header offsets - // when they may not follow the ELF header. - Segment ElfHdrSegment; - Segment ProgramHdrSegment; - - uint8_t OSABI; - uint8_t ABIVersion; - uint64_t Entry; - uint64_t SHOff; - uint32_t Type; - uint32_t Machine; - uint32_t Version; - uint32_t Flags; - - bool HadShdrs = true; - bool MustBeRelocatable = false; - StringTableSection *SectionNames = nullptr; - SymbolTableSection *SymbolTable = nullptr; - SectionIndexSection *SectionIndexTable = nullptr; - - bool IsMips64EL = false; - - SectionTableRef sections() const { return SectionTableRef(Sections); } - iterator_range< - filter_iterator::const_iterator>, - decltype(§ionIsAlloc)>> - allocSections() const { - return make_filter_range(make_pointee_range(Sections), sectionIsAlloc); - } - - const auto &getUpdatedSections() const { return UpdatedSections; } - Error updateSection(StringRef Name, ArrayRef Data); - - SectionBase *findSection(StringRef Name) { - auto SecIt = - find_if(Sections, [&](const SecPtr &Sec) { return Sec->Name == Name; }); - return SecIt == Sections.end() ? nullptr : SecIt->get(); - } - SectionTableRef removedSections() { return SectionTableRef(RemovedSections); } - - ConstRange segments() const { return make_pointee_range(Segments); } - - Error removeSections(bool AllowBrokenLinks, - std::function ToRemove); - Error replaceSections(const DenseMap &FromTo); - Error removeSymbols(function_ref ToRemove); - template T &addSection(Ts &&... 
Args) { - auto Sec = std::make_unique(std::forward(Args)...); - auto Ptr = Sec.get(); - MustBeRelocatable |= isa(*Ptr); - Sections.emplace_back(std::move(Sec)); - Ptr->Index = Sections.size(); - return *Ptr; - } - Error addNewSymbolTable(); - Segment &addSegment(ArrayRef Data) { - Segments.emplace_back(std::make_unique(Data)); - return *Segments.back(); - } - bool isRelocatable() const { - return (Type != ELF::ET_DYN && Type != ELF::ET_EXEC) || MustBeRelocatable; - } -}; - -} // end namespace elf -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h b/llvm/tools/llvm-objcopy/MachO/MachOConfig.h deleted file mode 100644 index 93f9facfcf0b..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h +++ /dev/null @@ -1,43 +0,0 @@ -//===- MachOConfig.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/StringRef.h" -#include - -namespace llvm { -namespace objcopy { - -// Mach-O specific configuration for copying/stripping a single file. -struct MachOConfig { - // Repeated options - std::vector RPathToAdd; - std::vector RPathToPrepend; - DenseMap RPathsToUpdate; - DenseMap InstallNamesToUpdate; - DenseSet RPathsToRemove; - - // install-name-tool's id option - Optional SharedLibId; - - // Boolean options - bool StripSwiftSymbols = false; - bool KeepUndefined = false; - - // install-name-tool's --delete_all_rpaths - bool RemoveAllRpaths = false; -}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp deleted file mode 100644 index 6b731abd9ed9..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp +++ /dev/null @@ -1,441 +0,0 @@ -//===- MachOLayoutBuilder.cpp -----------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOLayoutBuilder.h" -#include "llvm/Support/Alignment.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; -using namespace llvm::objcopy::macho; - -StringTableBuilder::Kind -MachOLayoutBuilder::getStringTableBuilderKind(const Object &O, bool Is64Bit) { - if (O.Header.FileType == MachO::HeaderFileType::MH_OBJECT) - return Is64Bit ? StringTableBuilder::MachO64 : StringTableBuilder::MachO; - return Is64Bit ? 
StringTableBuilder::MachO64Linked
-                 : StringTableBuilder::MachOLinked;
-}
-
-uint32_t MachOLayoutBuilder::computeSizeOfCmds() const {
-  uint32_t Size = 0;
-  for (const LoadCommand &LC : O.LoadCommands) {
-    const MachO::macho_load_command &MLC = LC.MachOLoadCommand;
-    auto cmd = MLC.load_command_data.cmd;
-    switch (cmd) {
-    case MachO::LC_SEGMENT:
-      Size += sizeof(MachO::segment_command) +
-              sizeof(MachO::section) * LC.Sections.size();
-      continue;
-    case MachO::LC_SEGMENT_64:
-      Size += sizeof(MachO::segment_command_64) +
-              sizeof(MachO::section_64) * LC.Sections.size();
-      continue;
-    }
-
-    switch (cmd) {
-#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
-  case MachO::LCName:                                                          \
-    Size += sizeof(MachO::LCStruct) + LC.Payload.size();                       \
-    break;
-#include "llvm/BinaryFormat/MachO.def"
-#undef HANDLE_LOAD_COMMAND
-    }
-  }
-
-  return Size;
-}
-
-void MachOLayoutBuilder::constructStringTable() {
-  for (std::unique_ptr<SymbolEntry> &Sym : O.SymTable.Symbols)
-    StrTableBuilder.add(Sym->Name);
-  StrTableBuilder.finalize();
-}
-
-void MachOLayoutBuilder::updateSymbolIndexes() {
-  uint32_t Index = 0;
-  for (auto &Symbol : O.SymTable.Symbols)
-    Symbol->Index = Index++;
-}
-
-// Updates the index and the number of local/external/undefined symbols.
-void MachOLayoutBuilder::updateDySymTab(MachO::macho_load_command &MLC) {
-  assert(MLC.load_command_data.cmd == MachO::LC_DYSYMTAB);
-  // Make sure that nlist entries in the symbol table are sorted by those
-  // types. The order is: local < defined external < undefined external.
-  assert(llvm::is_sorted(O.SymTable.Symbols,
-                         [](const std::unique_ptr<SymbolEntry> &A,
-                            const std::unique_ptr<SymbolEntry> &B) {
-                           bool AL = A->isLocalSymbol(),
-                                BL = B->isLocalSymbol();
-                           if (AL != BL)
-                             return AL;
-                           return !AL && !A->isUndefinedSymbol() &&
-                                  B->isUndefinedSymbol();
-                         }) &&
-         "Symbols are not sorted by their types.");
-
-  uint32_t NumLocalSymbols = 0;
-  auto Iter = O.SymTable.Symbols.begin();
-  auto End = O.SymTable.Symbols.end();
-  for (; Iter != End; ++Iter) {
-    if ((*Iter)->isExternalSymbol())
-      break;
-
-    ++NumLocalSymbols;
-  }
-
-  uint32_t NumExtDefSymbols = 0;
-  for (; Iter != End; ++Iter) {
-    if ((*Iter)->isUndefinedSymbol())
-      break;
-
-    ++NumExtDefSymbols;
-  }
-
-  MLC.dysymtab_command_data.ilocalsym = 0;
-  MLC.dysymtab_command_data.nlocalsym = NumLocalSymbols;
-  MLC.dysymtab_command_data.iextdefsym = NumLocalSymbols;
-  MLC.dysymtab_command_data.nextdefsym = NumExtDefSymbols;
-  MLC.dysymtab_command_data.iundefsym = NumLocalSymbols + NumExtDefSymbols;
-  MLC.dysymtab_command_data.nundefsym =
-      O.SymTable.Symbols.size() - (NumLocalSymbols + NumExtDefSymbols);
-}
-
-// Recomputes and updates offset and size fields in load commands and sections
-// since they could be modified.
-uint64_t MachOLayoutBuilder::layoutSegments() {
-  auto HeaderSize =
-      Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header);
-  const bool IsObjectFile =
-      O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
-  uint64_t Offset = IsObjectFile ?
(HeaderSize + O.Header.SizeOfCmds) : 0; - for (LoadCommand &LC : O.LoadCommands) { - auto &MLC = LC.MachOLoadCommand; - StringRef Segname; - uint64_t SegmentVmAddr; - uint64_t SegmentVmSize; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - SegmentVmAddr = MLC.segment_command_data.vmaddr; - SegmentVmSize = MLC.segment_command_data.vmsize; - Segname = StringRef(MLC.segment_command_data.segname, - strnlen(MLC.segment_command_data.segname, - sizeof(MLC.segment_command_data.segname))); - break; - case MachO::LC_SEGMENT_64: - SegmentVmAddr = MLC.segment_command_64_data.vmaddr; - SegmentVmSize = MLC.segment_command_64_data.vmsize; - Segname = StringRef(MLC.segment_command_64_data.segname, - strnlen(MLC.segment_command_64_data.segname, - sizeof(MLC.segment_command_64_data.segname))); - break; - default: - continue; - } - - if (Segname == "__LINKEDIT") { - // We update the __LINKEDIT segment later (in layoutTail). - assert(LC.Sections.empty() && "__LINKEDIT segment has sections"); - LinkEditLoadCommand = &MLC; - continue; - } - - // Update file offsets and sizes of sections. - uint64_t SegOffset = Offset; - uint64_t SegFileSize = 0; - uint64_t VMSize = 0; - for (std::unique_ptr
&Sec : LC.Sections) { - assert(SegmentVmAddr <= Sec->Addr && - "Section's address cannot be smaller than Segment's one"); - uint32_t SectOffset = Sec->Addr - SegmentVmAddr; - if (IsObjectFile) { - if (!Sec->hasValidOffset()) { - Sec->Offset = 0; - } else { - uint64_t PaddingSize = - offsetToAlignment(SegFileSize, Align(1ull << Sec->Align)); - Sec->Offset = SegOffset + SegFileSize + PaddingSize; - Sec->Size = Sec->Content.size(); - SegFileSize += PaddingSize + Sec->Size; - } - } else { - if (!Sec->hasValidOffset()) { - Sec->Offset = 0; - } else { - Sec->Offset = SegOffset + SectOffset; - Sec->Size = Sec->Content.size(); - SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size); - } - } - VMSize = std::max(VMSize, SectOffset + Sec->Size); - } - - if (IsObjectFile) { - Offset += SegFileSize; - } else { - Offset = alignTo(Offset + SegFileSize, PageSize); - SegFileSize = alignTo(SegFileSize, PageSize); - // Use the original vmsize if the segment is __PAGEZERO. - VMSize = - Segname == "__PAGEZERO" ? SegmentVmSize : alignTo(VMSize, PageSize); - } - - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - MLC.segment_command_data.cmdsize = - sizeof(MachO::segment_command) + - sizeof(MachO::section) * LC.Sections.size(); - MLC.segment_command_data.nsects = LC.Sections.size(); - MLC.segment_command_data.fileoff = SegOffset; - MLC.segment_command_data.vmsize = VMSize; - MLC.segment_command_data.filesize = SegFileSize; - break; - case MachO::LC_SEGMENT_64: - MLC.segment_command_64_data.cmdsize = - sizeof(MachO::segment_command_64) + - sizeof(MachO::section_64) * LC.Sections.size(); - MLC.segment_command_64_data.nsects = LC.Sections.size(); - MLC.segment_command_64_data.fileoff = SegOffset; - MLC.segment_command_64_data.vmsize = VMSize; - MLC.segment_command_64_data.filesize = SegFileSize; - break; - } - } - - return Offset; -} - -uint64_t MachOLayoutBuilder::layoutRelocations(uint64_t Offset) { - for (LoadCommand &LC : O.LoadCommands) - for (std::unique_ptr
&Sec : LC.Sections) { - Sec->RelOff = Sec->Relocations.empty() ? 0 : Offset; - Sec->NReloc = Sec->Relocations.size(); - Offset += sizeof(MachO::any_relocation_info) * Sec->NReloc; - } - - return Offset; -} - -Error MachOLayoutBuilder::layoutTail(uint64_t Offset) { - // If we are building the layout of an executable or dynamic library - // which does not have any segments other than __LINKEDIT, - // the Offset can be equal to zero by this time. It happens because of the - // convention that in such cases the file offsets specified by LC_SEGMENT - // start with zero (unlike the case of a relocatable object file). - const uint64_t HeaderSize = - Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); - assert((!(O.Header.FileType == MachO::HeaderFileType::MH_OBJECT) || - Offset >= HeaderSize + O.Header.SizeOfCmds) && - "Incorrect tail offset"); - Offset = std::max(Offset, HeaderSize + O.Header.SizeOfCmds); - - // The order of LINKEDIT elements is as follows: - // rebase info, binding info, weak binding info, lazy binding info, export - // trie, data-in-code, symbol table, indirect symbol table, symbol table - // strings, code signature. - uint64_t NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist); - uint64_t StartOfLinkEdit = Offset; - uint64_t StartOfRebaseInfo = StartOfLinkEdit; - uint64_t StartOfBindingInfo = StartOfRebaseInfo + O.Rebases.Opcodes.size(); - uint64_t StartOfWeakBindingInfo = StartOfBindingInfo + O.Binds.Opcodes.size(); - uint64_t StartOfLazyBindingInfo = - StartOfWeakBindingInfo + O.WeakBinds.Opcodes.size(); - uint64_t StartOfExportTrie = - StartOfLazyBindingInfo + O.LazyBinds.Opcodes.size(); - uint64_t StartOfFunctionStarts = StartOfExportTrie + O.Exports.Trie.size(); - uint64_t StartOfDyldExportsTrie = - StartOfFunctionStarts + O.FunctionStarts.Data.size(); - uint64_t StartOfChainedFixups = - StartOfDyldExportsTrie + O.ExportsTrie.Data.size(); - uint64_t StartOfDataInCode = - StartOfChainedFixups + O.ChainedFixups.Data.size(); - uint64_t StartOfLinkerOptimizationHint = - StartOfDataInCode + O.DataInCode.Data.size(); - uint64_t StartOfSymbols = - StartOfLinkerOptimizationHint + O.LinkerOptimizationHint.Data.size(); - uint64_t StartOfIndirectSymbols = - StartOfSymbols + NListSize * O.SymTable.Symbols.size(); - uint64_t StartOfSymbolStrings = - StartOfIndirectSymbols + - sizeof(uint32_t) * O.IndirectSymTable.Symbols.size(); - uint64_t StartOfCodeSignature = - StartOfSymbolStrings + StrTableBuilder.getSize(); - uint32_t CodeSignatureSize = 0; - if (O.CodeSignatureCommandIndex) { - StartOfCodeSignature = alignTo(StartOfCodeSignature, 16); - - // Note: These calculations are to be kept in sync with the same - // calculations performed in LLD's CodeSignatureSection. - const uint32_t AllHeadersSize = - alignTo(CodeSignature.FixedHeadersSize + OutputFileName.size() + 1, - CodeSignature.Align); - const uint32_t BlockCount = - (StartOfCodeSignature + CodeSignature.BlockSize - 1) / - CodeSignature.BlockSize; - const uint32_t Size = - alignTo(AllHeadersSize + BlockCount * CodeSignature.HashSize, - CodeSignature.Align); - - CodeSignature.StartOffset = StartOfCodeSignature; - CodeSignature.AllHeadersSize = AllHeadersSize; - CodeSignature.BlockCount = BlockCount; - CodeSignature.OutputFileName = OutputFileName; - CodeSignature.Size = Size; - CodeSignatureSize = Size; - } - uint64_t LinkEditSize = - StartOfCodeSignature + CodeSignatureSize - StartOfLinkEdit; - - // Now we have determined the layout of the contents of the __LINKEDIT - // segment. 
Update its load command. - if (LinkEditLoadCommand) { - MachO::macho_load_command *MLC = LinkEditLoadCommand; - switch (LinkEditLoadCommand->load_command_data.cmd) { - case MachO::LC_SEGMENT: - MLC->segment_command_data.cmdsize = sizeof(MachO::segment_command); - MLC->segment_command_data.fileoff = StartOfLinkEdit; - MLC->segment_command_data.vmsize = alignTo(LinkEditSize, PageSize); - MLC->segment_command_data.filesize = LinkEditSize; - break; - case MachO::LC_SEGMENT_64: - MLC->segment_command_64_data.cmdsize = sizeof(MachO::segment_command_64); - MLC->segment_command_64_data.fileoff = StartOfLinkEdit; - MLC->segment_command_64_data.vmsize = alignTo(LinkEditSize, PageSize); - MLC->segment_command_64_data.filesize = LinkEditSize; - break; - } - } - - for (LoadCommand &LC : O.LoadCommands) { - auto &MLC = LC.MachOLoadCommand; - auto cmd = MLC.load_command_data.cmd; - switch (cmd) { - case MachO::LC_CODE_SIGNATURE: - MLC.linkedit_data_command_data.dataoff = StartOfCodeSignature; - MLC.linkedit_data_command_data.datasize = CodeSignatureSize; - break; - case MachO::LC_SYMTAB: - MLC.symtab_command_data.symoff = StartOfSymbols; - MLC.symtab_command_data.nsyms = O.SymTable.Symbols.size(); - MLC.symtab_command_data.stroff = StartOfSymbolStrings; - MLC.symtab_command_data.strsize = StrTableBuilder.getSize(); - break; - case MachO::LC_DYSYMTAB: { - if (MLC.dysymtab_command_data.ntoc != 0 || - MLC.dysymtab_command_data.nmodtab != 0 || - MLC.dysymtab_command_data.nextrefsyms != 0 || - MLC.dysymtab_command_data.nlocrel != 0 || - MLC.dysymtab_command_data.nextrel != 0) - return createStringError(llvm::errc::not_supported, - "shared library is not yet supported"); - - if (!O.IndirectSymTable.Symbols.empty()) { - MLC.dysymtab_command_data.indirectsymoff = StartOfIndirectSymbols; - MLC.dysymtab_command_data.nindirectsyms = - O.IndirectSymTable.Symbols.size(); - } - - updateDySymTab(MLC); - break; - } - case MachO::LC_DATA_IN_CODE: - MLC.linkedit_data_command_data.dataoff = StartOfDataInCode; - MLC.linkedit_data_command_data.datasize = O.DataInCode.Data.size(); - break; - case MachO::LC_LINKER_OPTIMIZATION_HINT: - MLC.linkedit_data_command_data.dataoff = StartOfLinkerOptimizationHint; - MLC.linkedit_data_command_data.datasize = - O.LinkerOptimizationHint.Data.size(); - break; - case MachO::LC_FUNCTION_STARTS: - MLC.linkedit_data_command_data.dataoff = StartOfFunctionStarts; - MLC.linkedit_data_command_data.datasize = O.FunctionStarts.Data.size(); - break; - case MachO::LC_DYLD_CHAINED_FIXUPS: - MLC.linkedit_data_command_data.dataoff = StartOfChainedFixups; - MLC.linkedit_data_command_data.datasize = O.ChainedFixups.Data.size(); - break; - case MachO::LC_DYLD_EXPORTS_TRIE: - MLC.linkedit_data_command_data.dataoff = StartOfDyldExportsTrie; - MLC.linkedit_data_command_data.datasize = O.ExportsTrie.Data.size(); - break; - case MachO::LC_DYLD_INFO: - case MachO::LC_DYLD_INFO_ONLY: - MLC.dyld_info_command_data.rebase_off = - O.Rebases.Opcodes.empty() ? 0 : StartOfRebaseInfo; - MLC.dyld_info_command_data.rebase_size = O.Rebases.Opcodes.size(); - MLC.dyld_info_command_data.bind_off = - O.Binds.Opcodes.empty() ? 0 : StartOfBindingInfo; - MLC.dyld_info_command_data.bind_size = O.Binds.Opcodes.size(); - MLC.dyld_info_command_data.weak_bind_off = - O.WeakBinds.Opcodes.empty() ? 0 : StartOfWeakBindingInfo; - MLC.dyld_info_command_data.weak_bind_size = O.WeakBinds.Opcodes.size(); - MLC.dyld_info_command_data.lazy_bind_off = - O.LazyBinds.Opcodes.empty() ? 
0 : StartOfLazyBindingInfo; - MLC.dyld_info_command_data.lazy_bind_size = O.LazyBinds.Opcodes.size(); - MLC.dyld_info_command_data.export_off = - O.Exports.Trie.empty() ? 0 : StartOfExportTrie; - MLC.dyld_info_command_data.export_size = O.Exports.Trie.size(); - break; - // Note that LC_ENCRYPTION_INFO.cryptoff, despite its name and the comment in - // <mach-o/loader.h>, is not an offset in the binary file; instead, it is a - // relative virtual address. At the moment modification of the __TEXT - // segment of executables isn't supported anyway (e.g. data in code entries - // are not recalculated). Moreover, in general - // LC_ENCRYPT_INFO/LC_ENCRYPTION_INFO_64 are nontrivial to update because - // without making additional assumptions (e.g. that the entire __TEXT - // segment should be encrypted) we do not know how to recalculate the - // boundaries of the encrypted part. For now just copy over these load - // commands until we encounter a real-world use case where - // LC_ENCRYPT_INFO/LC_ENCRYPTION_INFO_64 need to be adjusted. - case MachO::LC_ENCRYPTION_INFO: - case MachO::LC_ENCRYPTION_INFO_64: - case MachO::LC_LOAD_DYLINKER: - case MachO::LC_MAIN: - case MachO::LC_RPATH: - case MachO::LC_SEGMENT: - case MachO::LC_SEGMENT_64: - case MachO::LC_VERSION_MIN_MACOSX: - case MachO::LC_VERSION_MIN_IPHONEOS: - case MachO::LC_VERSION_MIN_TVOS: - case MachO::LC_VERSION_MIN_WATCHOS: - case MachO::LC_BUILD_VERSION: - case MachO::LC_ID_DYLIB: - case MachO::LC_LOAD_DYLIB: - case MachO::LC_LOAD_WEAK_DYLIB: - case MachO::LC_UUID: - case MachO::LC_SOURCE_VERSION: - case MachO::LC_THREAD: - case MachO::LC_UNIXTHREAD: - case MachO::LC_SUB_FRAMEWORK: - case MachO::LC_SUB_UMBRELLA: - case MachO::LC_SUB_CLIENT: - case MachO::LC_SUB_LIBRARY: - case MachO::LC_LINKER_OPTION: - // Nothing to update. - break; - default: - // Abort if it's unsupported in order to prevent corrupting the object. - return createStringError(llvm::errc::not_supported, - "unsupported load command (cmd=0x%x)", cmd); - } - } - - return Error::success(); -} - -Error MachOLayoutBuilder::layout() { - O.Header.NCmds = O.LoadCommands.size(); - O.Header.SizeOfCmds = computeSizeOfCmds(); - constructStringTable(); - updateSymbolIndexes(); - uint64_t Offset = layoutSegments(); - Offset = layoutRelocations(Offset); - return layoutTail(Offset); -} diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h deleted file mode 100644 index 44d03b4af7e8..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h +++ /dev/null @@ -1,97 +0,0 @@ -//===- MachOLayoutBuilder.h -------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H -#define LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H - -#include "MachOObjcopy.h" -#include "Object.h" - -namespace llvm { -namespace objcopy { -namespace macho { - -/// When MachO binaries include a LC_CODE_SIGNATURE load command, -/// the __LINKEDIT data segment will include a section corresponding -/// to the LC_CODE_SIGNATURE load command. This section serves as a signature -/// for the binary. Included in the CodeSignature section is a header followed -/// by a hash of the binary.
If present, the CodeSignature section is the -/// last component of the binary. -struct CodeSignatureInfo { - // NOTE: These values are to be kept in sync with those in - // LLD's CodeSignatureSection class. - - static constexpr uint32_t Align = 16; - static constexpr uint8_t BlockSizeShift = 12; - // The binary is read in blocks of the following size. - static constexpr size_t BlockSize = (1 << BlockSizeShift); // 4 KiB - // For each block, a SHA256 hash (256 bits, 32 bytes) is written to - // the CodeSignature section. - static constexpr size_t HashSize = 256 / 8; - static constexpr size_t BlobHeadersSize = llvm::alignTo<8>( - sizeof(llvm::MachO::CS_SuperBlob) + sizeof(llvm::MachO::CS_BlobIndex)); - // The size of the entire header depends upon the filename the binary is being - // written to, but the rest of the header is fixed in size. - static constexpr uint32_t FixedHeadersSize = - BlobHeadersSize + sizeof(llvm::MachO::CS_CodeDirectory); - - // The offset relative to the start of the binary where - // the CodeSignature section should begin. - uint32_t StartOffset; - // The size of the entire header, output file name size included. - uint32_t AllHeadersSize; - // The number of blocks required to hash the binary. - uint32_t BlockCount; - StringRef OutputFileName; - // The size of the entire CodeSignature section, including both the header and - // hashes. - uint32_t Size; -}; - -class MachOLayoutBuilder { - Object &O; - bool Is64Bit; - StringRef OutputFileName; - uint64_t PageSize; - CodeSignatureInfo CodeSignature; - - // Points to the __LINKEDIT segment if it exists. - MachO::macho_load_command *LinkEditLoadCommand = nullptr; - StringTableBuilder StrTableBuilder; - - uint32_t computeSizeOfCmds() const; - void constructStringTable(); - void updateSymbolIndexes(); - void updateDySymTab(MachO::macho_load_command &MLC); - uint64_t layoutSegments(); - uint64_t layoutRelocations(uint64_t Offset); - Error layoutTail(uint64_t Offset); - - static StringTableBuilder::Kind getStringTableBuilderKind(const Object &O, - bool Is64Bit); - -public: - MachOLayoutBuilder(Object &O, bool Is64Bit, StringRef OutputFileName, - uint64_t PageSize) - : O(O), Is64Bit(Is64Bit), OutputFileName(OutputFileName), - PageSize(PageSize), - StrTableBuilder(getStringTableBuilderKind(O, Is64Bit)) {} - - // Recomputes and updates fields in the given object such as file offsets. - Error layout(); - - StringTableBuilder &getStringTableBuilder() { return StrTableBuilder; } - - const CodeSignatureInfo &getCodeSignature() { return CodeSignature; } -}; - -} // end namespace macho -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp deleted file mode 100644 index 0f92ca516bef..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp +++ /dev/null @@ -1,549 +0,0 @@ -//===- MachOObjcopy.cpp -----------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOObjcopy.h" -#include "../llvm-objcopy.h" -#include "CommonConfig.h" -#include "MachO/MachOConfig.h" -#include "MachOReader.h" -#include "MachOWriter.h" -#include "MultiFormatConfig.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/Object/ArchiveWriter.h" -#include "llvm/Object/MachOUniversal.h" -#include "llvm/Object/MachOUniversalWriter.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/FileOutputBuffer.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" - -using namespace llvm; -using namespace llvm::objcopy; -using namespace llvm::objcopy::macho; -using namespace llvm::object; - -using SectionPred = std::function<bool(const std::unique_ptr<Section> &Sec)>; -using LoadCommandPred = std::function<bool(const LoadCommand &LC)>; - -#ifndef NDEBUG -static bool isLoadCommandWithPayloadString(const LoadCommand &LC) { - // TODO: Add support for LC_REEXPORT_DYLIB, LC_LOAD_UPWARD_DYLIB and - // LC_LAZY_LOAD_DYLIB - return LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH || - LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_ID_DYLIB || - LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_LOAD_DYLIB || - LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_LOAD_WEAK_DYLIB; -} -#endif - -static StringRef getPayloadString(const LoadCommand &LC) { - assert(isLoadCommandWithPayloadString(LC) && - "unsupported load command encountered"); - - return StringRef(reinterpret_cast<const char *>(LC.Payload.data()), - LC.Payload.size()) - .rtrim('\0'); -} - -static Error removeSections(const CommonConfig &Config, Object &Obj) { - SectionPred RemovePred = [](const std::unique_ptr<Section>
&) { - return false; - }; - - if (!Config.ToRemove.empty()) { - RemovePred = [&Config, RemovePred](const std::unique_ptr<Section> &Sec) { - return Config.ToRemove.matches(Sec->CanonicalName); - }; - } - - if (Config.StripAll || Config.StripDebug) { - // Remove all debug sections. - RemovePred = [RemovePred](const std::unique_ptr<Section> &Sec) { - if (Sec->Segname == "__DWARF") - return true; - - return RemovePred(Sec); - }; - } - - if (!Config.OnlySection.empty()) { - // Overwrite RemovePred because --only-section takes priority. - RemovePred = [&Config](const std::unique_ptr<Section>
&Sec) { - return !Config.OnlySection.matches(Sec->CanonicalName); - }; - } - - return Obj.removeSections(RemovePred); -} - -static void markSymbols(const CommonConfig &, Object &Obj) { - // Symbols referenced from the indirect symbol table must not be removed. - for (IndirectSymbolEntry &ISE : Obj.IndirectSymTable.Symbols) - if (ISE.Symbol) - (*ISE.Symbol)->Referenced = true; -} - -static void updateAndRemoveSymbols(const CommonConfig &Config, - const MachOConfig &MachOConfig, - Object &Obj) { - for (SymbolEntry &Sym : Obj.SymTable) { - auto I = Config.SymbolsToRename.find(Sym.Name); - if (I != Config.SymbolsToRename.end()) - Sym.Name = std::string(I->getValue()); - } - - auto RemovePred = [Config, MachOConfig, - &Obj](const std::unique_ptr &N) { - if (N->Referenced) - return false; - if (MachOConfig.KeepUndefined && N->isUndefinedSymbol()) - return false; - if (N->n_desc & MachO::REFERENCED_DYNAMICALLY) - return false; - if (Config.StripAll) - return true; - if (Config.DiscardMode == DiscardType::All && !(N->n_type & MachO::N_EXT)) - return true; - // This behavior is consistent with cctools' strip. - if (MachOConfig.StripSwiftSymbols && - (Obj.Header.Flags & MachO::MH_DYLDLINK) && Obj.SwiftVersion && - *Obj.SwiftVersion && N->isSwiftSymbol()) - return true; - return false; - }; - - Obj.SymTable.removeSymbols(RemovePred); -} - -template -static void updateLoadCommandPayloadString(LoadCommand &LC, StringRef S) { - assert(isLoadCommandWithPayloadString(LC) && - "unsupported load command encountered"); - - uint32_t NewCmdsize = alignTo(sizeof(LCType) + S.size() + 1, 8); - - LC.MachOLoadCommand.load_command_data.cmdsize = NewCmdsize; - LC.Payload.assign(NewCmdsize - sizeof(LCType), 0); - std::copy(S.begin(), S.end(), LC.Payload.begin()); -} - -static LoadCommand buildRPathLoadCommand(StringRef Path) { - LoadCommand LC; - MachO::rpath_command RPathLC; - RPathLC.cmd = MachO::LC_RPATH; - RPathLC.path = sizeof(MachO::rpath_command); - RPathLC.cmdsize = alignTo(sizeof(MachO::rpath_command) + Path.size() + 1, 8); - LC.MachOLoadCommand.rpath_command_data = RPathLC; - LC.Payload.assign(RPathLC.cmdsize - sizeof(MachO::rpath_command), 0); - std::copy(Path.begin(), Path.end(), LC.Payload.begin()); - return LC; -} - -static Error processLoadCommands(const MachOConfig &MachOConfig, Object &Obj) { - // Remove RPaths. - DenseSet RPathsToRemove(MachOConfig.RPathsToRemove.begin(), - MachOConfig.RPathsToRemove.end()); - - LoadCommandPred RemovePred = [&RPathsToRemove, - &MachOConfig](const LoadCommand &LC) { - if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH) { - // When removing all RPaths we don't need to care - // about what it contains - if (MachOConfig.RemoveAllRpaths) - return true; - - StringRef RPath = getPayloadString(LC); - if (RPathsToRemove.count(RPath)) { - RPathsToRemove.erase(RPath); - return true; - } - } - return false; - }; - - if (Error E = Obj.removeLoadCommands(RemovePred)) - return E; - - // Emit an error if the Mach-O binary does not contain an rpath path name - // specified in -delete_rpath. - for (StringRef RPath : MachOConfig.RPathsToRemove) { - if (RPathsToRemove.count(RPath)) - return createStringError(errc::invalid_argument, - "no LC_RPATH load command with path: %s", - RPath.str().c_str()); - } - - DenseSet RPaths; - - // Get all existing RPaths. - for (LoadCommand &LC : Obj.LoadCommands) { - if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH) - RPaths.insert(getPayloadString(LC)); - } - - // Throw errors for invalid RPaths. 
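The loop below enforces the rpath-update contract: the old path must currently be present among the LC_RPATH commands, and the new path must not collide with an existing one. A standalone model of just those two checks, with std::set standing in for the DenseSet used here (names and the exception type are illustrative):

#include <set>
#include <stdexcept>
#include <string>

// Model of the two rpath-update checks: Old must exist, New must be new.
static void checkRPathUpdate(const std::set<std::string> &RPaths,
                             const std::string &Old, const std::string &New) {
  if (!RPaths.count(Old))
    throw std::invalid_argument("no LC_RPATH load command with path: " + Old);
  if (RPaths.count(New))
    throw std::invalid_argument("rpath '" + New +
                                "' would create a duplicate load command");
}

int main() {
  const std::set<std::string> RPaths{"/usr/lib", "@loader_path/../lib"};
  checkRPathUpdate(RPaths, "/usr/lib", "@rpath/lib"); // passes
  // checkRPathUpdate(RPaths, "/missing", "/x");      // would throw
  return 0;
}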
- for (const auto &OldNew : MachOConfig.RPathsToUpdate) { - StringRef Old = OldNew.getFirst(); - StringRef New = OldNew.getSecond(); - if (!RPaths.contains(Old)) - return createStringError(errc::invalid_argument, - "no LC_RPATH load command with path: " + Old); - if (RPaths.contains(New)) - return createStringError(errc::invalid_argument, - "rpath '" + New + - "' would create a duplicate load command"); - } - - // Update load commands. - for (LoadCommand &LC : Obj.LoadCommands) { - switch (LC.MachOLoadCommand.load_command_data.cmd) { - case MachO::LC_ID_DYLIB: - if (MachOConfig.SharedLibId) - updateLoadCommandPayloadString( - LC, *MachOConfig.SharedLibId); - break; - - case MachO::LC_RPATH: { - StringRef RPath = getPayloadString(LC); - StringRef NewRPath = MachOConfig.RPathsToUpdate.lookup(RPath); - if (!NewRPath.empty()) - updateLoadCommandPayloadString(LC, NewRPath); - break; - } - - // TODO: Add LC_REEXPORT_DYLIB, LC_LAZY_LOAD_DYLIB, and LC_LOAD_UPWARD_DYLIB - // here once llvm-objcopy supports them. - case MachO::LC_LOAD_DYLIB: - case MachO::LC_LOAD_WEAK_DYLIB: - StringRef InstallName = getPayloadString(LC); - StringRef NewInstallName = - MachOConfig.InstallNamesToUpdate.lookup(InstallName); - if (!NewInstallName.empty()) - updateLoadCommandPayloadString(LC, - NewInstallName); - break; - } - } - - // Add new RPaths. - for (StringRef RPath : MachOConfig.RPathToAdd) { - if (RPaths.contains(RPath)) - return createStringError(errc::invalid_argument, - "rpath '" + RPath + - "' would create a duplicate load command"); - RPaths.insert(RPath); - Obj.LoadCommands.push_back(buildRPathLoadCommand(RPath)); - } - - for (StringRef RPath : MachOConfig.RPathToPrepend) { - if (RPaths.contains(RPath)) - return createStringError(errc::invalid_argument, - "rpath '" + RPath + - "' would create a duplicate load command"); - - RPaths.insert(RPath); - Obj.LoadCommands.insert(Obj.LoadCommands.begin(), - buildRPathLoadCommand(RPath)); - } - - // Unlike appending rpaths, the indexes of subsequent load commands must - // be recalculated after prepending one. - if (!MachOConfig.RPathToPrepend.empty()) - Obj.updateLoadCommandIndexes(); - - return Error::success(); -} - -static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { - for (LoadCommand &LC : Obj.LoadCommands) - for (const std::unique_ptr
&Sec : LC.Sections) { - if (Sec->CanonicalName == SecName) { - Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr = - FileOutputBuffer::create(Filename, Sec->Content.size()); - if (!BufferOrErr) - return BufferOrErr.takeError(); - std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr); - llvm::copy(Sec->Content, Buf->getBufferStart()); - - if (Error E = Buf->commit()) - return E; - return Error::success(); - } - } - - return createStringError(object_error::parse_failed, "section '%s' not found", - SecName.str().c_str()); -} - -static Error addSection(StringRef SecName, StringRef Filename, Object &Obj) { - ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = - MemoryBuffer::getFile(Filename); - if (!BufOrErr) - return createFileError(Filename, errorCodeToError(BufOrErr.getError())); - std::unique_ptr<MemoryBuffer> Buf = std::move(*BufOrErr); - - std::pair<StringRef, StringRef> Pair = SecName.split(','); - StringRef TargetSegName = Pair.first; - Section Sec(TargetSegName, Pair.second); - Sec.Content = Obj.NewSectionsContents.save(Buf->getBuffer()); - Sec.Size = Sec.Content.size(); - - // Add the section into an existing segment. - for (LoadCommand &LC : Obj.LoadCommands) { - Optional<StringRef> SegName = LC.getSegmentName(); - if (SegName && SegName == TargetSegName) { - uint64_t Addr = *LC.getSegmentVMAddr(); - for (const std::unique_ptr<Section> &S : LC.Sections) - Addr = std::max(Addr, S->Addr + S->Size); - LC.Sections.push_back(std::make_unique<Section>(Sec)); - LC.Sections.back()->Addr = Addr; - return Error::success(); - } - } - - // There's no segment named TargetSegName. Create a new load command and - // insert a new section into it. - LoadCommand &NewSegment = - Obj.addSegment(TargetSegName, alignTo(Sec.Size, 16384)); - NewSegment.Sections.push_back(std::make_unique<Section>(Sec)); - NewSegment.Sections.back()->Addr = *NewSegment.getSegmentVMAddr(); - return Error::success(); -} - -static Expected<Section &>
findSection(StringRef SecName, Object &O) { - StringRef SegName; - std::tie(SegName, SecName) = SecName.split(","); - auto FoundSeg = - llvm::find_if(O.LoadCommands, [SegName](const LoadCommand &LC) { - return LC.getSegmentName() == SegName; - }); - if (FoundSeg == O.LoadCommands.end()) - return createStringError(errc::invalid_argument, - "could not find segment with name '%s'", - SegName.str().c_str()); - auto FoundSec = llvm::find_if(FoundSeg->Sections, - [SecName](const std::unique_ptr<Section> &Sec) { - return Sec->Sectname == SecName; - }); - if (FoundSec == FoundSeg->Sections.end()) - return createStringError(errc::invalid_argument, - "could not find section with name '%s'", - SecName.str().c_str()); - - assert(FoundSec->get()->CanonicalName == (SegName + "," + SecName).str()); - return *FoundSec->get(); -} - -static Error updateSection(StringRef SecName, StringRef Filename, Object &O) { - Expected<Section &> SecToUpdateOrErr = findSection(SecName, O); - - if (!SecToUpdateOrErr) - return SecToUpdateOrErr.takeError(); - Section &Sec = *SecToUpdateOrErr; - - ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = - MemoryBuffer::getFile(Filename); - if (!BufOrErr) - return createFileError(Filename, errorCodeToError(BufOrErr.getError())); - std::unique_ptr<MemoryBuffer> Buf = std::move(*BufOrErr); - - if (Buf->getBufferSize() > Sec.Size) - return createStringError( - errc::invalid_argument, - "new section cannot be larger than previous section"); - Sec.Content = O.NewSectionsContents.save(Buf->getBuffer()); - Sec.Size = Sec.Content.size(); - return Error::success(); -} - -// isValidMachOCannonicalName returns success if Name is a MachO canonical name -// ("<segment>,<section>") and lengths of both segment and section names are -// valid. -static Error isValidMachOCannonicalName(StringRef Name) { - if (Name.count(',') != 1) - return createStringError(errc::invalid_argument, - "invalid section name '%s' (should be formatted " - "as '<segment>,<section>
')", - Name.str().c_str()); - - std::pair Pair = Name.split(','); - if (Pair.first.size() > 16) - return createStringError(errc::invalid_argument, - "too long segment name: '%s'", - Pair.first.str().c_str()); - if (Pair.second.size() > 16) - return createStringError(errc::invalid_argument, - "too long section name: '%s'", - Pair.second.str().c_str()); - return Error::success(); -} - -static Error handleArgs(const CommonConfig &Config, - const MachOConfig &MachOConfig, Object &Obj) { - // Dump sections before add/remove for compatibility with GNU objcopy. - for (StringRef Flag : Config.DumpSection) { - StringRef SectionName; - StringRef FileName; - std::tie(SectionName, FileName) = Flag.split('='); - if (Error E = dumpSectionToFile(SectionName, FileName, Obj)) - return E; - } - - if (Error E = removeSections(Config, Obj)) - return E; - - // Mark symbols to determine which symbols are still needed. - if (Config.StripAll) - markSymbols(Config, Obj); - - updateAndRemoveSymbols(Config, MachOConfig, Obj); - - if (Config.StripAll) - for (LoadCommand &LC : Obj.LoadCommands) - for (std::unique_ptr
&Sec : LC.Sections) - Sec->Relocations.clear(); - - for (const auto &Flag : Config.AddSection) { - std::pair SecPair = Flag.split("="); - StringRef SecName = SecPair.first; - StringRef File = SecPair.second; - if (Error E = isValidMachOCannonicalName(SecName)) - return E; - if (Error E = addSection(SecName, File, Obj)) - return E; - } - - for (const auto &Flag : Config.UpdateSection) { - StringRef SectionName; - StringRef FileName; - std::tie(SectionName, FileName) = Flag.split('='); - if (Error E = isValidMachOCannonicalName(SectionName)) - return E; - if (Error E = updateSection(SectionName, FileName, Obj)) - return E; - } - - if (Error E = processLoadCommands(MachOConfig, Obj)) - return E; - - return Error::success(); -} - -Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config, - const MachOConfig &MachOConfig, - object::MachOObjectFile &In, - raw_ostream &Out) { - MachOReader Reader(In); - Expected> O = Reader.create(); - if (!O) - return createFileError(Config.InputFilename, O.takeError()); - - if (O->get()->Header.FileType == MachO::HeaderFileType::MH_PRELOAD) - return createStringError(std::errc::not_supported, - "%s: MH_PRELOAD files are not supported", - Config.InputFilename.str().c_str()); - - if (Error E = handleArgs(Config, MachOConfig, **O)) - return createFileError(Config.InputFilename, std::move(E)); - - // Page size used for alignment of segment sizes in Mach-O executables and - // dynamic libraries. - uint64_t PageSize; - switch (In.getArch()) { - case Triple::ArchType::arm: - case Triple::ArchType::aarch64: - case Triple::ArchType::aarch64_32: - PageSize = 16384; - break; - default: - PageSize = 4096; - } - - MachOWriter Writer(**O, In.is64Bit(), In.isLittleEndian(), - sys::path::filename(Config.OutputFilename), PageSize, Out); - if (auto E = Writer.finalize()) - return E; - return Writer.write(); -} - -Error objcopy::macho::executeObjcopyOnMachOUniversalBinary( - const MultiFormatConfig &Config, const MachOUniversalBinary &In, - raw_ostream &Out) { - SmallVector, 2> Binaries; - SmallVector Slices; - for (const auto &O : In.objects()) { - Expected> ArOrErr = O.getAsArchive(); - if (ArOrErr) { - Expected> NewArchiveMembersOrErr = - createNewArchiveMembers(Config, **ArOrErr); - if (!NewArchiveMembersOrErr) - return NewArchiveMembersOrErr.takeError(); - Expected> OutputBufferOrErr = - writeArchiveToBuffer(*NewArchiveMembersOrErr, - (*ArOrErr)->hasSymbolTable(), (*ArOrErr)->kind(), - Config.getCommonConfig().DeterministicArchives, - (*ArOrErr)->isThin()); - if (!OutputBufferOrErr) - return OutputBufferOrErr.takeError(); - Expected> BinaryOrErr = - object::createBinary(**OutputBufferOrErr); - if (!BinaryOrErr) - return BinaryOrErr.takeError(); - Binaries.emplace_back(std::move(*BinaryOrErr), - std::move(*OutputBufferOrErr)); - Slices.emplace_back(*cast(Binaries.back().getBinary()), - O.getCPUType(), O.getCPUSubType(), - O.getArchFlagName(), O.getAlign()); - continue; - } - // The methods getAsArchive, getAsObjectFile, getAsIRObject of the class - // ObjectForArch return an Error in case of the type mismatch. We need to - // check each in turn to see what kind of slice this is, so ignore errors - // produced along the way. 
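Probing a slice type by calling an accessor and deliberately discarding its failure, as the consumeError call below does, is the standard llvm::Expected pattern: an Expected left in the error state must have its Error taken and consumed before destruction, or LLVM aborts at runtime. A reduced sketch of that fallback chain, using toy parse functions rather than the real ObjectForArch API:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

using namespace llvm;

// Toy stand-ins for getAsArchive / getAsObjectFile: each either succeeds or
// returns an Error describing the mismatch.
static Expected<int> parseAsArchive(StringRef S) {
  if (S != "archive")
    return createStringError(inconvertibleErrorCode(), "not an archive");
  return 1;
}
static Expected<int> parseAsObject(StringRef S) {
  if (S != "object")
    return createStringError(inconvertibleErrorCode(), "not an object");
  return 2;
}

static Expected<int> classify(StringRef S) {
  if (Expected<int> A = parseAsArchive(S))
    return *A;
  else
    consumeError(A.takeError()); // must consume, or ~Expected asserts
  return parseAsObject(S);       // last candidate: let its error propagate
}

int main() {
  Expected<int> Kind = classify("object");
  if (!Kind) {
    consumeError(Kind.takeError());
    return 1;
  }
  return *Kind == 2 ? 0 : 1;
}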
- consumeError(ArOrErr.takeError()); - - Expected> ObjOrErr = O.getAsObjectFile(); - if (!ObjOrErr) { - consumeError(ObjOrErr.takeError()); - return createStringError( - std::errc::invalid_argument, - "slice for '%s' of the universal Mach-O binary " - "'%s' is not a Mach-O object or an archive", - O.getArchFlagName().c_str(), - Config.getCommonConfig().InputFilename.str().c_str()); - } - std::string ArchFlagName = O.getArchFlagName(); - - SmallVector Buffer; - raw_svector_ostream MemStream(Buffer); - - Expected MachO = Config.getMachOConfig(); - if (!MachO) - return MachO.takeError(); - - if (Error E = executeObjcopyOnBinary(Config.getCommonConfig(), *MachO, - **ObjOrErr, MemStream)) - return E; - - auto MB = std::make_unique( - std::move(Buffer), ArchFlagName, /*RequiresNullTerminator=*/false); - Expected> BinaryOrErr = object::createBinary(*MB); - if (!BinaryOrErr) - return BinaryOrErr.takeError(); - Binaries.emplace_back(std::move(*BinaryOrErr), std::move(MB)); - Slices.emplace_back(*cast(Binaries.back().getBinary()), - O.getAlign()); - } - - if (Error Err = writeUniversalBinaryToStream(Slices, Out)) - return Err; - - return Error::success(); -} diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h deleted file mode 100644 index d03eee9d5fdb..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h +++ /dev/null @@ -1,39 +0,0 @@ -//===- MachOObjcopy.h -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H - -namespace llvm { -class Error; -class raw_ostream; - -namespace object { -class MachOObjectFile; -class MachOUniversalBinary; -} // end namespace object - -namespace objcopy { -struct CommonConfig; -struct MachOConfig; -class MultiFormatConfig; - -namespace macho { -Error executeObjcopyOnBinary(const CommonConfig &Config, - const MachOConfig &MachOConfig, - object::MachOObjectFile &In, raw_ostream &Out); - -Error executeObjcopyOnMachOUniversalBinary( - const MultiFormatConfig &Config, const object::MachOUniversalBinary &In, - raw_ostream &Out); - -} // end namespace macho -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp b/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp deleted file mode 100644 index d68d1692997a..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp +++ /dev/null @@ -1,374 +0,0 @@ -//===- MachOReader.cpp ------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOReader.h" -#include "Object.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/MachO.h" -#include "llvm/Support/Errc.h" -#include - -using namespace llvm; -using namespace llvm::objcopy; -using namespace llvm::objcopy::macho; - -void MachOReader::readHeader(Object &O) const { - O.Header.Magic = MachOObj.getHeader().magic; - O.Header.CPUType = MachOObj.getHeader().cputype; - O.Header.CPUSubType = MachOObj.getHeader().cpusubtype; - O.Header.FileType = MachOObj.getHeader().filetype; - O.Header.NCmds = MachOObj.getHeader().ncmds; - O.Header.SizeOfCmds = MachOObj.getHeader().sizeofcmds; - O.Header.Flags = MachOObj.getHeader().flags; -} - -template -static Section constructSectionCommon(const SectionType &Sec, uint32_t Index) { - StringRef SegName(Sec.segname, strnlen(Sec.segname, sizeof(Sec.segname))); - StringRef SectName(Sec.sectname, strnlen(Sec.sectname, sizeof(Sec.sectname))); - Section S(SegName, SectName); - S.Index = Index; - S.Addr = Sec.addr; - S.Size = Sec.size; - S.OriginalOffset = Sec.offset; - S.Align = Sec.align; - S.RelOff = Sec.reloff; - S.NReloc = Sec.nreloc; - S.Flags = Sec.flags; - S.Reserved1 = Sec.reserved1; - S.Reserved2 = Sec.reserved2; - S.Reserved3 = 0; - return S; -} - -Section constructSection(const MachO::section &Sec, uint32_t Index) { - return constructSectionCommon(Sec, Index); -} - -Section constructSection(const MachO::section_64 &Sec, uint32_t Index) { - Section S = constructSectionCommon(Sec, Index); - S.Reserved3 = Sec.reserved3; - return S; -} - -template -Expected>> static extractSections( - const object::MachOObjectFile::LoadCommandInfo &LoadCmd, - const object::MachOObjectFile &MachOObj, uint32_t &NextSectionIndex) { - std::vector> Sections; - for (auto Curr = reinterpret_cast(LoadCmd.Ptr + - sizeof(SegmentType)), - End = reinterpret_cast(LoadCmd.Ptr + - LoadCmd.C.cmdsize); - Curr < End; ++Curr) { - SectionType Sec; - memcpy((void *)&Sec, Curr, sizeof(SectionType)); - - if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) - MachO::swapStruct(Sec); - - Sections.push_back( - std::make_unique
(constructSection(Sec, NextSectionIndex))); - - Section &S = *Sections.back(); - - Expected SecRef = - MachOObj.getSection(NextSectionIndex++); - if (!SecRef) - return SecRef.takeError(); - - Expected> Data = - MachOObj.getSectionContents(SecRef->getRawDataRefImpl()); - if (!Data) - return Data.takeError(); - - S.Content = - StringRef(reinterpret_cast(Data->data()), Data->size()); - - const uint32_t CPUType = MachOObj.getHeader().cputype; - S.Relocations.reserve(S.NReloc); - for (auto RI = MachOObj.section_rel_begin(SecRef->getRawDataRefImpl()), - RE = MachOObj.section_rel_end(SecRef->getRawDataRefImpl()); - RI != RE; ++RI) { - RelocationInfo R; - R.Symbol = nullptr; // We'll fill this field later. - R.Info = MachOObj.getRelocation(RI->getRawDataRefImpl()); - R.Scattered = MachOObj.isRelocationScattered(R.Info); - unsigned Type = MachOObj.getAnyRelocationType(R.Info); - // TODO Support CPU_TYPE_ARM. - R.IsAddend = !R.Scattered && (CPUType == MachO::CPU_TYPE_ARM64 && - Type == MachO::ARM64_RELOC_ADDEND); - R.Extern = !R.Scattered && MachOObj.getPlainRelocationExternal(R.Info); - S.Relocations.push_back(R); - } - - assert(S.NReloc == S.Relocations.size() && - "Incorrect number of relocations"); - } - return std::move(Sections); -} - -Error MachOReader::readLoadCommands(Object &O) const { - // For MachO sections indices start from 1. - uint32_t NextSectionIndex = 1; - static constexpr char TextSegmentName[] = "__TEXT"; - for (auto LoadCmd : MachOObj.load_commands()) { - LoadCommand LC; - switch (LoadCmd.C.cmd) { - case MachO::LC_CODE_SIGNATURE: - O.CodeSignatureCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_SEGMENT: - // LoadCmd.Ptr might not be aligned temporarily as - // MachO::segment_command requires, but the segname char pointer do not - // have alignment restrictions. - if (StringRef(reinterpret_cast( - LoadCmd.Ptr + offsetof(MachO::segment_command, segname))) == - TextSegmentName) - O.TextSegmentCommandIndex = O.LoadCommands.size(); - - if (Expected>> Sections = - extractSections( - LoadCmd, MachOObj, NextSectionIndex)) - LC.Sections = std::move(*Sections); - else - return Sections.takeError(); - break; - case MachO::LC_SEGMENT_64: - // LoadCmd.Ptr might not be aligned temporarily as - // MachO::segment_command_64 requires, but the segname char pointer do - // not have alignment restrictions. 
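Reading the struct out of the byte buffer with memcpy, as extractSections does above, sidesteps both problems this comment describes: the source pointer may be misaligned for the struct type, and the file may use the opposite byte order. A generic standalone sketch of the pattern in plain C++ (a made-up two-field header; MachO::swapStruct plays the byte-swapping role in the real code):

#include <cstdint>
#include <cstring>

struct SegmentHeader {
  uint32_t cmd;
  uint32_t cmdsize;
};

// Byte-swap a 32-bit value (std::byteswap is C++23; this works everywhere).
static uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0xff00) | ((V << 8) & 0xff0000) | (V << 24);
}

// Safely load a SegmentHeader from a possibly misaligned, possibly
// foreign-endian buffer.
static SegmentHeader load(const uint8_t *Ptr, bool SwapBytes) {
  SegmentHeader H;
  std::memcpy(&H, Ptr, sizeof(H)); // no alignment requirement on Ptr
  if (SwapBytes) {
    H.cmd = bswap32(H.cmd);
    H.cmdsize = bswap32(H.cmdsize);
  }
  return H;
}

int main() {
  // Little-endian host assumed for this check; SwapBytes would be true when
  // the file's endianness differs from the host's.
  const uint8_t Raw[] = {0x19, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00};
  SegmentHeader H = load(Raw, /*SwapBytes=*/false);
  return H.cmd == 0x19 ? 0 : 1; // 0x19 == LC_SEGMENT
}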
- if (StringRef(reinterpret_cast( - LoadCmd.Ptr + offsetof(MachO::segment_command_64, segname))) == - TextSegmentName) - O.TextSegmentCommandIndex = O.LoadCommands.size(); - - if (Expected>> Sections = - extractSections( - LoadCmd, MachOObj, NextSectionIndex)) - LC.Sections = std::move(*Sections); - else - return Sections.takeError(); - break; - case MachO::LC_SYMTAB: - O.SymTabCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_DYSYMTAB: - O.DySymTabCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_DYLD_INFO: - case MachO::LC_DYLD_INFO_ONLY: - O.DyLdInfoCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_DATA_IN_CODE: - O.DataInCodeCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_LINKER_OPTIMIZATION_HINT: - O.LinkerOptimizationHintCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_FUNCTION_STARTS: - O.FunctionStartsCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_DYLD_EXPORTS_TRIE: - O.ExportsTrieCommandIndex = O.LoadCommands.size(); - break; - case MachO::LC_DYLD_CHAINED_FIXUPS: - O.ChainedFixupsCommandIndex = O.LoadCommands.size(); - break; - } -#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ - case MachO::LCName: \ - memcpy((void *)&(LC.MachOLoadCommand.LCStruct##_data), LoadCmd.Ptr, \ - sizeof(MachO::LCStruct)); \ - if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) \ - MachO::swapStruct(LC.MachOLoadCommand.LCStruct##_data); \ - if (LoadCmd.C.cmdsize > sizeof(MachO::LCStruct)) \ - LC.Payload = ArrayRef( \ - reinterpret_cast(const_cast(LoadCmd.Ptr)) + \ - sizeof(MachO::LCStruct), \ - LoadCmd.C.cmdsize - sizeof(MachO::LCStruct)); \ - break; - - switch (LoadCmd.C.cmd) { - default: - memcpy((void *)&(LC.MachOLoadCommand.load_command_data), LoadCmd.Ptr, - sizeof(MachO::load_command)); - if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) - MachO::swapStruct(LC.MachOLoadCommand.load_command_data); - if (LoadCmd.C.cmdsize > sizeof(MachO::load_command)) - LC.Payload = ArrayRef( - reinterpret_cast(const_cast(LoadCmd.Ptr)) + - sizeof(MachO::load_command), - LoadCmd.C.cmdsize - sizeof(MachO::load_command)); - break; -#include "llvm/BinaryFormat/MachO.def" - } - O.LoadCommands.push_back(std::move(LC)); - } - return Error::success(); -} - -template -SymbolEntry constructSymbolEntry(StringRef StrTable, const nlist_t &nlist) { - assert(nlist.n_strx < StrTable.size() && - "n_strx exceeds the size of the string table"); - SymbolEntry SE; - SE.Name = StringRef(StrTable.data() + nlist.n_strx).str(); - SE.n_type = nlist.n_type; - SE.n_sect = nlist.n_sect; - SE.n_desc = nlist.n_desc; - SE.n_value = nlist.n_value; - return SE; -} - -void MachOReader::readSymbolTable(Object &O) const { - StringRef StrTable = MachOObj.getStringTableData(); - for (auto Symbol : MachOObj.symbols()) { - SymbolEntry SE = - (MachOObj.is64Bit() - ? constructSymbolEntry(StrTable, MachOObj.getSymbol64TableEntry( - Symbol.getRawDataRefImpl())) - : constructSymbolEntry(StrTable, MachOObj.getSymbolTableEntry( - Symbol.getRawDataRefImpl()))); - - O.SymTable.Symbols.push_back(std::make_unique(SE)); - } -} - -void MachOReader::setSymbolInRelocationInfo(Object &O) const { - std::vector Sections; - for (auto &LC : O.LoadCommands) - for (std::unique_ptr
&Sec : LC.Sections) - Sections.push_back(Sec.get()); - - for (LoadCommand &LC : O.LoadCommands) - for (std::unique_ptr
&Sec : LC.Sections) - for (auto &Reloc : Sec->Relocations) - if (!Reloc.Scattered && !Reloc.IsAddend) { - const uint32_t SymbolNum = - Reloc.getPlainRelocationSymbolNum(MachOObj.isLittleEndian()); - if (Reloc.Extern) { - Reloc.Symbol = O.SymTable.getSymbolByIndex(SymbolNum); - } else { - // FIXME: Refactor error handling in MachOReader and report an error - // if we encounter an invalid relocation. - assert(SymbolNum >= 1 && SymbolNum <= Sections.size() && - "Invalid section index."); - Reloc.Sec = Sections[SymbolNum - 1]; - } - } -} - -void MachOReader::readRebaseInfo(Object &O) const { - O.Rebases.Opcodes = MachOObj.getDyldInfoRebaseOpcodes(); -} - -void MachOReader::readBindInfo(Object &O) const { - O.Binds.Opcodes = MachOObj.getDyldInfoBindOpcodes(); -} - -void MachOReader::readWeakBindInfo(Object &O) const { - O.WeakBinds.Opcodes = MachOObj.getDyldInfoWeakBindOpcodes(); -} - -void MachOReader::readLazyBindInfo(Object &O) const { - O.LazyBinds.Opcodes = MachOObj.getDyldInfoLazyBindOpcodes(); -} - -void MachOReader::readExportInfo(Object &O) const { - O.Exports.Trie = MachOObj.getDyldInfoExportsTrie(); -} - -void MachOReader::readLinkData(Object &O, Optional LCIndex, - LinkData &LD) const { - if (!LCIndex) - return; - const MachO::linkedit_data_command &LC = - O.LoadCommands[*LCIndex].MachOLoadCommand.linkedit_data_command_data; - LD.Data = - arrayRefFromStringRef(MachOObj.getData().substr(LC.dataoff, LC.datasize)); -} - -void MachOReader::readDataInCodeData(Object &O) const { - return readLinkData(O, O.DataInCodeCommandIndex, O.DataInCode); -} - -void MachOReader::readLinkerOptimizationHint(Object &O) const { - return readLinkData(O, O.LinkerOptimizationHintCommandIndex, - O.LinkerOptimizationHint); -} - -void MachOReader::readFunctionStartsData(Object &O) const { - return readLinkData(O, O.FunctionStartsCommandIndex, O.FunctionStarts); -} - -void MachOReader::readExportsTrie(Object &O) const { - return readLinkData(O, O.ExportsTrieCommandIndex, O.ExportsTrie); -} - -void MachOReader::readChainedFixups(Object &O) const { - return readLinkData(O, O.ChainedFixupsCommandIndex, O.ChainedFixups); -} - -void MachOReader::readIndirectSymbolTable(Object &O) const { - MachO::dysymtab_command DySymTab = MachOObj.getDysymtabLoadCommand(); - constexpr uint32_t AbsOrLocalMask = - MachO::INDIRECT_SYMBOL_LOCAL | MachO::INDIRECT_SYMBOL_ABS; - for (uint32_t i = 0; i < DySymTab.nindirectsyms; ++i) { - uint32_t Index = MachOObj.getIndirectSymbolTableEntry(DySymTab, i); - if ((Index & AbsOrLocalMask) != 0) - O.IndirectSymTable.Symbols.emplace_back(Index, None); - else - O.IndirectSymTable.Symbols.emplace_back( - Index, O.SymTable.getSymbolByIndex(Index)); - } -} - -void MachOReader::readSwiftVersion(Object &O) const { - struct ObjCImageInfo { - uint32_t Version; - uint32_t Flags; - } ImageInfo; - - for (const LoadCommand &LC : O.LoadCommands) - for (const std::unique_ptr
&Sec : LC.Sections) - if (Sec->Sectname == "__objc_imageinfo" && - (Sec->Segname == "__DATA" || Sec->Segname == "__DATA_CONST" || - Sec->Segname == "__DATA_DIRTY") && - Sec->Content.size() >= sizeof(ObjCImageInfo)) { - memcpy(&ImageInfo, Sec->Content.data(), sizeof(ObjCImageInfo)); - if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) { - sys::swapByteOrder(ImageInfo.Version); - sys::swapByteOrder(ImageInfo.Flags); - } - O.SwiftVersion = (ImageInfo.Flags >> 8) & 0xff; - return; - } -} - -Expected> MachOReader::create() const { - auto Obj = std::make_unique(); - readHeader(*Obj); - if (Error E = readLoadCommands(*Obj)) - return std::move(E); - readSymbolTable(*Obj); - setSymbolInRelocationInfo(*Obj); - readRebaseInfo(*Obj); - readBindInfo(*Obj); - readWeakBindInfo(*Obj); - readLazyBindInfo(*Obj); - readExportInfo(*Obj); - readDataInCodeData(*Obj); - readLinkerOptimizationHint(*Obj); - readFunctionStartsData(*Obj); - readExportsTrie(*Obj); - readChainedFixups(*Obj); - readIndirectSymbolTable(*Obj); - readSwiftVersion(*Obj); - return std::move(Obj); -} diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.h b/llvm/tools/llvm-objcopy/MachO/MachOReader.h deleted file mode 100644 index b29e86ca642e..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOReader.h +++ /dev/null @@ -1,57 +0,0 @@ -//===- MachOReader.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOObjcopy.h" -#include "Object.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/MachO.h" -#include - -namespace llvm { -namespace objcopy { -namespace macho { - -// The hierarchy of readers is responsible for parsing different inputs: -// raw binaries and regular MachO object files. 
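In that hierarchy, supporting a new input kind only means implementing create() on the interface declared just below. A hypothetical sketch of a second implementation (illustrative only; no such reader exists in this patch, and the synthesis step is elided):

// Hypothetical reader for a raw binary blob, shown only to illustrate the
// extension point; it would live alongside MachOReader below.
class RawBinaryReader : public Reader {
  ArrayRef<uint8_t> Data;

public:
  explicit RawBinaryReader(ArrayRef<uint8_t> Data) : Data(Data) {}

  Expected<std::unique_ptr<Object>> create() const override {
    auto Obj = std::make_unique<Object>();
    // A real implementation would synthesize a Mach-O header plus a single
    // segment/section pair covering Data before returning Obj.
    return std::move(Obj);
  }
};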
-class Reader { -public: - virtual ~Reader(){}; - virtual Expected> create() const = 0; -}; - -class MachOReader : public Reader { - const object::MachOObjectFile &MachOObj; - - void readHeader(Object &O) const; - Error readLoadCommands(Object &O) const; - void readSymbolTable(Object &O) const; - void setSymbolInRelocationInfo(Object &O) const; - void readRebaseInfo(Object &O) const; - void readBindInfo(Object &O) const; - void readWeakBindInfo(Object &O) const; - void readLazyBindInfo(Object &O) const; - void readExportInfo(Object &O) const; - void readLinkData(Object &O, Optional LCIndex, LinkData &LD) const; - void readCodeSignature(Object &O) const; - void readDataInCodeData(Object &O) const; - void readLinkerOptimizationHint(Object &O) const; - void readFunctionStartsData(Object &O) const; - void readExportsTrie(Object &O) const; - void readChainedFixups(Object &O) const; - void readIndirectSymbolTable(Object &O) const; - void readSwiftVersion(Object &O) const; - -public: - explicit MachOReader(const object::MachOObjectFile &Obj) : MachOObj(Obj) {} - - Expected> create() const override; -}; - -} // end namespace macho -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp b/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp deleted file mode 100644 index 52f20794cc57..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp +++ /dev/null @@ -1,748 +0,0 @@ -//===- MachOWriter.cpp ------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOWriter.h" -#include "MachOLayoutBuilder.h" -#include "Object.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/MachO.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SHA256.h" -#include - -#if defined(__APPLE__) -#include -#endif - -using namespace llvm; -using namespace llvm::objcopy::macho; -using namespace llvm::support::endian; - -size_t MachOWriter::headerSize() const { - return Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); -} - -size_t MachOWriter::loadCommandsSize() const { return O.Header.SizeOfCmds; } - -size_t MachOWriter::symTableSize() const { - return O.SymTable.Symbols.size() * - (Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist)); -} - -size_t MachOWriter::totalSize() const { - // Going from tail to head and looking for an appropriate "anchor" to - // calculate the total size assuming that all the offsets are either valid - // ("true") or 0 (0 indicates that the corresponding part is missing). 
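The strategy the comment describes reduces to: collect the end offset of every region whose offset is actually set (non-zero), take the maximum, and fall back to the end of the header plus load commands when nothing else exists. A minimal standalone model of that scan (plain C++, made-up region values):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Region {
  uint64_t Offset; // 0 means "absent"
  uint64_t Size;
};

// Total file size = end of the furthest valid region, or HeaderEnd if none.
static uint64_t totalSize(const std::vector<Region> &Regions,
                          uint64_t HeaderEnd) {
  uint64_t Max = HeaderEnd;
  for (const Region &R : Regions)
    if (R.Offset != 0) // only offsets that are actually set participate
      Max = std::max(Max, R.Offset + R.Size);
  return Max;
}

int main() {
  const std::vector<Region> Regions{{0x1000, 0x200}, {0, 0x999}, {0x4000, 0x40}};
  // Header + load commands end at 0x820 in this made-up layout.
  return totalSize(Regions, 0x820) == 0x4040 ? 0 : 1;
}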
- - SmallVector Ends; - if (O.SymTabCommandIndex) { - const MachO::symtab_command &SymTabCommand = - O.LoadCommands[*O.SymTabCommandIndex] - .MachOLoadCommand.symtab_command_data; - if (SymTabCommand.symoff) - Ends.push_back(SymTabCommand.symoff + symTableSize()); - if (SymTabCommand.stroff) - Ends.push_back(SymTabCommand.stroff + SymTabCommand.strsize); - } - if (O.DyLdInfoCommandIndex) { - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - if (DyLdInfoCommand.rebase_off) { - assert((DyLdInfoCommand.rebase_size == O.Rebases.Opcodes.size()) && - "Incorrect rebase opcodes size"); - Ends.push_back(DyLdInfoCommand.rebase_off + DyLdInfoCommand.rebase_size); - } - if (DyLdInfoCommand.bind_off) { - assert((DyLdInfoCommand.bind_size == O.Binds.Opcodes.size()) && - "Incorrect bind opcodes size"); - Ends.push_back(DyLdInfoCommand.bind_off + DyLdInfoCommand.bind_size); - } - if (DyLdInfoCommand.weak_bind_off) { - assert((DyLdInfoCommand.weak_bind_size == O.WeakBinds.Opcodes.size()) && - "Incorrect weak bind opcodes size"); - Ends.push_back(DyLdInfoCommand.weak_bind_off + - DyLdInfoCommand.weak_bind_size); - } - if (DyLdInfoCommand.lazy_bind_off) { - assert((DyLdInfoCommand.lazy_bind_size == O.LazyBinds.Opcodes.size()) && - "Incorrect lazy bind opcodes size"); - Ends.push_back(DyLdInfoCommand.lazy_bind_off + - DyLdInfoCommand.lazy_bind_size); - } - if (DyLdInfoCommand.export_off) { - assert((DyLdInfoCommand.export_size == O.Exports.Trie.size()) && - "Incorrect trie size"); - Ends.push_back(DyLdInfoCommand.export_off + DyLdInfoCommand.export_size); - } - } - - if (O.DySymTabCommandIndex) { - const MachO::dysymtab_command &DySymTabCommand = - O.LoadCommands[*O.DySymTabCommandIndex] - .MachOLoadCommand.dysymtab_command_data; - - if (DySymTabCommand.indirectsymoff) - Ends.push_back(DySymTabCommand.indirectsymoff + - sizeof(uint32_t) * O.IndirectSymTable.Symbols.size()); - } - - if (O.CodeSignatureCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.CodeSignatureCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - LinkEditDataCommand.datasize); - } - - if (O.DataInCodeCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.DataInCodeCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - LinkEditDataCommand.datasize); - } - - if (O.LinkerOptimizationHintCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.LinkerOptimizationHintCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - LinkEditDataCommand.datasize); - } - - if (O.FunctionStartsCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.FunctionStartsCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - LinkEditDataCommand.datasize); - } - - if (O.ChainedFixupsCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.ChainedFixupsCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - 
LinkEditDataCommand.datasize); - } - - if (O.ExportsTrieCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.ExportsTrieCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Ends.push_back(LinkEditDataCommand.dataoff + - LinkEditDataCommand.datasize); - } - - // Otherwise, use the last section / reloction. - for (const LoadCommand &LC : O.LoadCommands) - for (const std::unique_ptr
&S : LC.Sections) { - if (!S->hasValidOffset()) { - assert((S->Offset == 0) && "Skipped section's offset must be zero"); - assert((S->isVirtualSection() || S->Size == 0) && - "Non-zero-fill sections with zero offset must have zero size"); - continue; - } - assert((S->Offset != 0) && - "Non-zero-fill section's offset cannot be zero"); - Ends.push_back(S->Offset + S->Size); - if (S->RelOff) - Ends.push_back(S->RelOff + - S->NReloc * sizeof(MachO::any_relocation_info)); - } - - if (!Ends.empty()) - return *std::max_element(Ends.begin(), Ends.end()); - - // Otherwise, we have only Mach header and load commands. - return headerSize() + loadCommandsSize(); -} - -void MachOWriter::writeHeader() { - MachO::mach_header_64 Header; - - Header.magic = O.Header.Magic; - Header.cputype = O.Header.CPUType; - Header.cpusubtype = O.Header.CPUSubType; - Header.filetype = O.Header.FileType; - Header.ncmds = O.Header.NCmds; - Header.sizeofcmds = O.Header.SizeOfCmds; - Header.flags = O.Header.Flags; - Header.reserved = O.Header.Reserved; - - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(Header); - - auto HeaderSize = - Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); - memcpy(Buf->getBufferStart(), &Header, HeaderSize); -} - -void MachOWriter::writeLoadCommands() { - uint8_t *Begin = - reinterpret_cast(Buf->getBufferStart()) + headerSize(); - for (const LoadCommand &LC : O.LoadCommands) { - // Construct a load command. - MachO::macho_load_command MLC = LC.MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(MLC.segment_command_data); - memcpy(Begin, &MLC.segment_command_data, sizeof(MachO::segment_command)); - Begin += sizeof(MachO::segment_command); - - for (const std::unique_ptr
&Sec : LC.Sections) - writeSectionInLoadCommand(*Sec, Begin); - continue; - case MachO::LC_SEGMENT_64: - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(MLC.segment_command_64_data); - memcpy(Begin, &MLC.segment_command_64_data, - sizeof(MachO::segment_command_64)); - Begin += sizeof(MachO::segment_command_64); - - for (const std::unique_ptr
&Sec : LC.Sections) - writeSectionInLoadCommand(*Sec, Begin); - continue; - } - -#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ - case MachO::LCName: \ - assert(sizeof(MachO::LCStruct) + LC.Payload.size() == \ - MLC.load_command_data.cmdsize); \ - if (IsLittleEndian != sys::IsLittleEndianHost) \ - MachO::swapStruct(MLC.LCStruct##_data); \ - memcpy(Begin, &MLC.LCStruct##_data, sizeof(MachO::LCStruct)); \ - Begin += sizeof(MachO::LCStruct); \ - if (!LC.Payload.empty()) \ - memcpy(Begin, LC.Payload.data(), LC.Payload.size()); \ - Begin += LC.Payload.size(); \ - break; - - // Copy the load command as it is. - switch (MLC.load_command_data.cmd) { - default: - assert(sizeof(MachO::load_command) + LC.Payload.size() == - MLC.load_command_data.cmdsize); - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(MLC.load_command_data); - memcpy(Begin, &MLC.load_command_data, sizeof(MachO::load_command)); - Begin += sizeof(MachO::load_command); - if (!LC.Payload.empty()) - memcpy(Begin, LC.Payload.data(), LC.Payload.size()); - Begin += LC.Payload.size(); - break; -#include "llvm/BinaryFormat/MachO.def" - } - } -} - -template -void MachOWriter::writeSectionInLoadCommand(const Section &Sec, uint8_t *&Out) { - StructType Temp; - assert(Sec.Segname.size() <= sizeof(Temp.segname) && "too long segment name"); - assert(Sec.Sectname.size() <= sizeof(Temp.sectname) && - "too long section name"); - memset(&Temp, 0, sizeof(StructType)); - memcpy(Temp.segname, Sec.Segname.data(), Sec.Segname.size()); - memcpy(Temp.sectname, Sec.Sectname.data(), Sec.Sectname.size()); - Temp.addr = Sec.Addr; - Temp.size = Sec.Size; - Temp.offset = Sec.Offset; - Temp.align = Sec.Align; - Temp.reloff = Sec.RelOff; - Temp.nreloc = Sec.NReloc; - Temp.flags = Sec.Flags; - Temp.reserved1 = Sec.Reserved1; - Temp.reserved2 = Sec.Reserved2; - - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(Temp); - memcpy(Out, &Temp, sizeof(StructType)); - Out += sizeof(StructType); -} - -void MachOWriter::writeSections() { - for (const LoadCommand &LC : O.LoadCommands) - for (const std::unique_ptr
&Sec : LC.Sections) { - if (!Sec->hasValidOffset()) { - assert((Sec->Offset == 0) && "Skipped section's offset must be zero"); - assert((Sec->isVirtualSection() || Sec->Size == 0) && - "Non-zero-fill sections with zero offset must have zero size"); - continue; - } - - assert(Sec->Offset && "Section offset can not be zero"); - assert((Sec->Size == Sec->Content.size()) && "Incorrect section size"); - memcpy(Buf->getBufferStart() + Sec->Offset, Sec->Content.data(), - Sec->Content.size()); - for (size_t Index = 0; Index < Sec->Relocations.size(); ++Index) { - RelocationInfo RelocInfo = Sec->Relocations[Index]; - if (!RelocInfo.Scattered && !RelocInfo.IsAddend) { - const uint32_t SymbolNum = RelocInfo.Extern - ? (*RelocInfo.Symbol)->Index - : (*RelocInfo.Sec)->Index; - RelocInfo.setPlainRelocationSymbolNum(SymbolNum, IsLittleEndian); - } - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct( - reinterpret_cast(RelocInfo.Info)); - memcpy(Buf->getBufferStart() + Sec->RelOff + - Index * sizeof(MachO::any_relocation_info), - &RelocInfo.Info, sizeof(RelocInfo.Info)); - } - } -} - -template -void writeNListEntry(const SymbolEntry &SE, bool IsLittleEndian, char *&Out, - uint32_t Nstrx) { - NListType ListEntry; - ListEntry.n_strx = Nstrx; - ListEntry.n_type = SE.n_type; - ListEntry.n_sect = SE.n_sect; - ListEntry.n_desc = SE.n_desc; - ListEntry.n_value = SE.n_value; - - if (IsLittleEndian != sys::IsLittleEndianHost) - MachO::swapStruct(ListEntry); - memcpy(Out, reinterpret_cast(&ListEntry), sizeof(NListType)); - Out += sizeof(NListType); -} - -void MachOWriter::writeStringTable() { - if (!O.SymTabCommandIndex) - return; - const MachO::symtab_command &SymTabCommand = - O.LoadCommands[*O.SymTabCommandIndex] - .MachOLoadCommand.symtab_command_data; - - uint8_t *StrTable = (uint8_t *)Buf->getBufferStart() + SymTabCommand.stroff; - LayoutBuilder.getStringTableBuilder().write(StrTable); -} - -void MachOWriter::writeSymbolTable() { - if (!O.SymTabCommandIndex) - return; - const MachO::symtab_command &SymTabCommand = - O.LoadCommands[*O.SymTabCommandIndex] - .MachOLoadCommand.symtab_command_data; - - char *SymTable = (char *)Buf->getBufferStart() + SymTabCommand.symoff; - for (auto Iter = O.SymTable.Symbols.begin(), End = O.SymTable.Symbols.end(); - Iter != End; Iter++) { - SymbolEntry *Sym = Iter->get(); - uint32_t Nstrx = LayoutBuilder.getStringTableBuilder().getOffset(Sym->Name); - - if (Is64Bit) - writeNListEntry(*Sym, IsLittleEndian, SymTable, Nstrx); - else - writeNListEntry(*Sym, IsLittleEndian, SymTable, Nstrx); - } -} - -void MachOWriter::writeRebaseInfo() { - if (!O.DyLdInfoCommandIndex) - return; - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.rebase_off; - assert((DyLdInfoCommand.rebase_size == O.Rebases.Opcodes.size()) && - "Incorrect rebase opcodes size"); - memcpy(Out, O.Rebases.Opcodes.data(), O.Rebases.Opcodes.size()); -} - -void MachOWriter::writeBindInfo() { - if (!O.DyLdInfoCommandIndex) - return; - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.bind_off; - assert((DyLdInfoCommand.bind_size == O.Binds.Opcodes.size()) && - "Incorrect bind opcodes size"); - memcpy(Out, O.Binds.Opcodes.data(), O.Binds.Opcodes.size()); -} - -void MachOWriter::writeWeakBindInfo() { - if 
(!O.DyLdInfoCommandIndex) - return; - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.weak_bind_off; - assert((DyLdInfoCommand.weak_bind_size == O.WeakBinds.Opcodes.size()) && - "Incorrect weak bind opcodes size"); - memcpy(Out, O.WeakBinds.Opcodes.data(), O.WeakBinds.Opcodes.size()); -} - -void MachOWriter::writeLazyBindInfo() { - if (!O.DyLdInfoCommandIndex) - return; - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.lazy_bind_off; - assert((DyLdInfoCommand.lazy_bind_size == O.LazyBinds.Opcodes.size()) && - "Incorrect lazy bind opcodes size"); - memcpy(Out, O.LazyBinds.Opcodes.data(), O.LazyBinds.Opcodes.size()); -} - -void MachOWriter::writeExportInfo() { - if (!O.DyLdInfoCommandIndex) - return; - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.export_off; - assert((DyLdInfoCommand.export_size == O.Exports.Trie.size()) && - "Incorrect export trie size"); - memcpy(Out, O.Exports.Trie.data(), O.Exports.Trie.size()); -} - -void MachOWriter::writeIndirectSymbolTable() { - if (!O.DySymTabCommandIndex) - return; - - const MachO::dysymtab_command &DySymTabCommand = - O.LoadCommands[*O.DySymTabCommandIndex] - .MachOLoadCommand.dysymtab_command_data; - - uint32_t *Out = - (uint32_t *)(Buf->getBufferStart() + DySymTabCommand.indirectsymoff); - for (const IndirectSymbolEntry &Sym : O.IndirectSymTable.Symbols) { - uint32_t Entry = (Sym.Symbol) ? (*Sym.Symbol)->Index : Sym.OriginalIndex; - if (IsLittleEndian != sys::IsLittleEndianHost) - sys::swapByteOrder(Entry); - *Out++ = Entry; - } -} - -void MachOWriter::writeLinkData(Optional LCIndex, const LinkData &LD) { - if (!LCIndex) - return; - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*LCIndex].MachOLoadCommand.linkedit_data_command_data; - char *Out = (char *)Buf->getBufferStart() + LinkEditDataCommand.dataoff; - assert((LinkEditDataCommand.datasize == LD.Data.size()) && - "Incorrect data size"); - memcpy(Out, LD.Data.data(), LD.Data.size()); -} - -static uint64_t -getSegmentFileOffset(const LoadCommand &TextSegmentLoadCommand) { - const MachO::macho_load_command &MLC = - TextSegmentLoadCommand.MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - return MLC.segment_command_data.fileoff; - case MachO::LC_SEGMENT_64: - return MLC.segment_command_64_data.fileoff; - default: - return 0; - } -} - -static uint64_t getSegmentFileSize(const LoadCommand &TextSegmentLoadCommand) { - const MachO::macho_load_command &MLC = - TextSegmentLoadCommand.MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - return MLC.segment_command_data.filesize; - case MachO::LC_SEGMENT_64: - return MLC.segment_command_64_data.filesize; - default: - return 0; - } -} - -void MachOWriter::writeCodeSignatureData() { - // NOTE: This CodeSignature section behaviour must be kept in sync with that - // performed in LLD's CodeSignatureSection::write / - // CodeSignatureSection::writeHashes. Furthermore, this call must occur only - // after the rest of the binary has already been written to the buffer. 
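The signing pass further down walks that already-written buffer page by page. A standalone sketch of the per-page SHA-256 pass, using llvm/Support/SHA256.h as the code below does, with the block size mirroring CodeSignatureInfo::BlockSize:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/SHA256.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>

// Hash Data in 4 KiB pages, one 32-byte SHA-256 digest per page (the final
// page may be short), mirroring the hash-writing loop in this function.
static std::string hashPages(llvm::ArrayRef<uint8_t> Data) {
  constexpr size_t BlockSize = 4096; // CodeSignatureInfo::BlockSize
  std::string Hashes;
  for (size_t Off = 0; Off < Data.size(); Off += BlockSize) {
    size_t Len = std::min(BlockSize, Data.size() - Off);
    llvm::SHA256 Hasher;
    Hasher.update(Data.slice(Off, Len));
    Hashes.append(Hasher.final().str()); // 32 bytes appended per page
  }
  return Hashes;
}

int main() {
  const uint8_t Buf[5000] = {}; // 5000 bytes -> two pages -> two digests
  return hashPages(llvm::ArrayRef<uint8_t>(Buf, sizeof(Buf))).size() == 64
             ? 0
             : 1;
}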
This - // is because the buffer is read from to perform the necessary hashing. - - // The CodeSignature section is the last section in the MachO binary and - // contains a hash of all content in the binary before it. Since llvm-objcopy - // has likely modified the target binary, the hash must be regenerated - // entirely. To generate this hash, we must read from the start of the binary - // (HashReadStart) to just before the start of the CodeSignature section - // (HashReadEnd). - - const CodeSignatureInfo &CodeSignature = LayoutBuilder.getCodeSignature(); - - uint8_t *BufferStart = reinterpret_cast(Buf->getBufferStart()); - uint8_t *HashReadStart = BufferStart; - uint8_t *HashReadEnd = BufferStart + CodeSignature.StartOffset; - - // The CodeSignature section begins with a header, after which the hashes - // of each page of the binary are written. - uint8_t *HashWriteStart = HashReadEnd + CodeSignature.AllHeadersSize; - - uint32_t TextSegmentFileOff = 0; - uint32_t TextSegmentFileSize = 0; - if (O.TextSegmentCommandIndex) { - const LoadCommand &TextSegmentLoadCommand = - O.LoadCommands[*O.TextSegmentCommandIndex]; - assert(TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd == - MachO::LC_SEGMENT || - TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd == - MachO::LC_SEGMENT_64); - assert(StringRef(TextSegmentLoadCommand.MachOLoadCommand - .segment_command_data.segname) == "__TEXT"); - TextSegmentFileOff = getSegmentFileOffset(TextSegmentLoadCommand); - TextSegmentFileSize = getSegmentFileSize(TextSegmentLoadCommand); - } - - const uint32_t FileNamePad = CodeSignature.AllHeadersSize - - CodeSignature.FixedHeadersSize - - CodeSignature.OutputFileName.size(); - - // Write code section header. - auto *SuperBlob = reinterpret_cast(HashReadEnd); - write32be(&SuperBlob->magic, MachO::CSMAGIC_EMBEDDED_SIGNATURE); - write32be(&SuperBlob->length, CodeSignature.Size); - write32be(&SuperBlob->count, 1); - auto *BlobIndex = reinterpret_cast(&SuperBlob[1]); - write32be(&BlobIndex->type, MachO::CSSLOT_CODEDIRECTORY); - write32be(&BlobIndex->offset, CodeSignature.BlobHeadersSize); - auto *CodeDirectory = reinterpret_cast( - HashReadEnd + CodeSignature.BlobHeadersSize); - write32be(&CodeDirectory->magic, MachO::CSMAGIC_CODEDIRECTORY); - write32be(&CodeDirectory->length, - CodeSignature.Size - CodeSignature.BlobHeadersSize); - write32be(&CodeDirectory->version, MachO::CS_SUPPORTSEXECSEG); - write32be(&CodeDirectory->flags, MachO::CS_ADHOC | MachO::CS_LINKER_SIGNED); - write32be(&CodeDirectory->hashOffset, - sizeof(MachO::CS_CodeDirectory) + - CodeSignature.OutputFileName.size() + FileNamePad); - write32be(&CodeDirectory->identOffset, sizeof(MachO::CS_CodeDirectory)); - CodeDirectory->nSpecialSlots = 0; - write32be(&CodeDirectory->nCodeSlots, CodeSignature.BlockCount); - write32be(&CodeDirectory->codeLimit, CodeSignature.StartOffset); - CodeDirectory->hashSize = static_cast(CodeSignature.HashSize); - CodeDirectory->hashType = MachO::kSecCodeSignatureHashSHA256; - CodeDirectory->platform = 0; - CodeDirectory->pageSize = CodeSignature.BlockSizeShift; - CodeDirectory->spare2 = 0; - CodeDirectory->scatterOffset = 0; - CodeDirectory->teamOffset = 0; - CodeDirectory->spare3 = 0; - CodeDirectory->codeLimit64 = 0; - write64be(&CodeDirectory->execSegBase, TextSegmentFileOff); - write64be(&CodeDirectory->execSegLimit, TextSegmentFileSize); - write64be(&CodeDirectory->execSegFlags, O.Header.FileType == MachO::MH_EXECUTE - ? 
MachO::CS_EXECSEG_MAIN_BINARY - : 0); - - auto *Id = reinterpret_cast<char *>(&CodeDirectory[1]); - memcpy(Id, CodeSignature.OutputFileName.begin(), - CodeSignature.OutputFileName.size()); - memset(Id + CodeSignature.OutputFileName.size(), 0, FileNamePad); - - // Write the hashes. - uint8_t *CurrHashReadPosition = HashReadStart; - uint8_t *CurrHashWritePosition = HashWriteStart; - while (CurrHashReadPosition < HashReadEnd) { - StringRef Block(reinterpret_cast<char *>(CurrHashReadPosition), - std::min(HashReadEnd - CurrHashReadPosition, - static_cast<ptrdiff_t>(CodeSignature.BlockSize))); - SHA256 Hasher; - Hasher.update(Block); - StringRef Hash = Hasher.final(); - assert(Hash.size() == CodeSignature.HashSize); - memcpy(CurrHashWritePosition, Hash.data(), CodeSignature.HashSize); - CurrHashReadPosition += CodeSignature.BlockSize; - CurrHashWritePosition += CodeSignature.HashSize; - } -#if defined(__APPLE__) - // This is a macOS-specific workaround and makes no sense for any - // other host OS. See https://openradar.appspot.com/FB8914231 - // - // The macOS kernel maintains a signature-verification cache to - // quickly validate applications at time of execve(2). The trouble - // is that the kernel creates the cache entry at the time of the - // mmap(2) call, before we have a chance to write either the code to - // sign or the signature header+hashes. The fix is to invalidate - // all cached data associated with the output file, thus discarding - // the bogus prematurely-cached signature. - msync(BufferStart, CodeSignature.StartOffset + CodeSignature.Size, - MS_INVALIDATE); -#endif -} - -void MachOWriter::writeDataInCodeData() { - return writeLinkData(O.DataInCodeCommandIndex, O.DataInCode); -} - -void MachOWriter::writeLinkerOptimizationHint() { - return writeLinkData(O.LinkerOptimizationHintCommandIndex, - O.LinkerOptimizationHint); -} - -void MachOWriter::writeFunctionStartsData() { - return writeLinkData(O.FunctionStartsCommandIndex, O.FunctionStarts); -} - -void MachOWriter::writeChainedFixupsData() { - return writeLinkData(O.ChainedFixupsCommandIndex, O.ChainedFixups); -} - -void MachOWriter::writeExportsTrieData() { - return writeLinkData(O.ExportsTrieCommandIndex, O.ExportsTrie); -} - -void MachOWriter::writeTail() { - typedef void (MachOWriter::*WriteHandlerType)(); - typedef std::pair<uint64_t, WriteHandlerType> WriteOperation; - SmallVector<WriteOperation, 7> Queue; - - if (O.SymTabCommandIndex) { - const MachO::symtab_command &SymTabCommand = - O.LoadCommands[*O.SymTabCommandIndex] - .MachOLoadCommand.symtab_command_data; - if (SymTabCommand.symoff) - Queue.push_back({SymTabCommand.symoff, &MachOWriter::writeSymbolTable}); - if (SymTabCommand.stroff) - Queue.push_back({SymTabCommand.stroff, &MachOWriter::writeStringTable}); - } - - if (O.DyLdInfoCommandIndex) { - const MachO::dyld_info_command &DyLdInfoCommand = - O.LoadCommands[*O.DyLdInfoCommandIndex] - .MachOLoadCommand.dyld_info_command_data; - if (DyLdInfoCommand.rebase_off) - Queue.push_back( - {DyLdInfoCommand.rebase_off, &MachOWriter::writeRebaseInfo}); - if (DyLdInfoCommand.bind_off) - Queue.push_back({DyLdInfoCommand.bind_off, &MachOWriter::writeBindInfo}); - if (DyLdInfoCommand.weak_bind_off) - Queue.push_back( - {DyLdInfoCommand.weak_bind_off, &MachOWriter::writeWeakBindInfo}); - if (DyLdInfoCommand.lazy_bind_off) - Queue.push_back( - {DyLdInfoCommand.lazy_bind_off, &MachOWriter::writeLazyBindInfo}); - if (DyLdInfoCommand.export_off) - Queue.push_back( - {DyLdInfoCommand.export_off, &MachOWriter::writeExportInfo}); - } - - if (O.DySymTabCommandIndex) { - const MachO::dysymtab_command
&DySymTabCommand = - O.LoadCommands[*O.DySymTabCommandIndex] - .MachOLoadCommand.dysymtab_command_data; - - if (DySymTabCommand.indirectsymoff) - Queue.emplace_back(DySymTabCommand.indirectsymoff, - &MachOWriter::writeIndirectSymbolTable); - } - - if (O.CodeSignatureCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.CodeSignatureCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeCodeSignatureData); - } - - if (O.DataInCodeCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.DataInCodeCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeDataInCodeData); - } - - if (O.LinkerOptimizationHintCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.LinkerOptimizationHintCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeLinkerOptimizationHint); - } - - if (O.FunctionStartsCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.FunctionStartsCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeFunctionStartsData); - } - - if (O.ChainedFixupsCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.ChainedFixupsCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeChainedFixupsData); - } - - if (O.ExportsTrieCommandIndex) { - const MachO::linkedit_data_command &LinkEditDataCommand = - O.LoadCommands[*O.ExportsTrieCommandIndex] - .MachOLoadCommand.linkedit_data_command_data; - - if (LinkEditDataCommand.dataoff) - Queue.emplace_back(LinkEditDataCommand.dataoff, - &MachOWriter::writeExportsTrieData); - } - - llvm::sort(Queue, [](const WriteOperation &LHS, const WriteOperation &RHS) { - return LHS.first < RHS.first; - }); - - for (auto WriteOp : Queue) - (this->*WriteOp.second)(); -} - -Error MachOWriter::finalize() { return LayoutBuilder.layout(); } - -Error MachOWriter::write() { - size_t TotalSize = totalSize(); - Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); - memset(Buf->getBufferStart(), 0, totalSize()); - writeHeader(); - writeLoadCommands(); - writeSections(); - writeTail(); - - // TODO: Implement direct writing to the output stream (without intermediate - // memory buffer Buf). - Out.write(Buf->getBufferStart(), Buf->getBufferSize()); - return Error::success(); -} diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h b/llvm/tools/llvm-objcopy/MachO/MachOWriter.h deleted file mode 100644 index a172534dac8a..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h +++ /dev/null @@ -1,71 +0,0 @@ -//===- MachOWriter.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
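The writeTail implementation above queues one (file offset, member-function pointer) pair per linkedit blob that is present, sorts by offset, and then dispatches the writers in file order. A minimal standalone sketch of that pattern (the Writer type and offsets here are hypothetical, not the LLVM class):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct Writer {
  void writeSymbolTable() { std::puts("symtab"); }
  void writeStringTable() { std::puts("strtab"); }

  void writeTail() {
    using Handler = void (Writer::*)();
    // One entry per blob that is actually present, keyed by file offset.
    std::vector<std::pair<uint64_t, Handler>> Queue = {
        {0x4000, &Writer::writeSymbolTable},
        {0x2000, &Writer::writeStringTable}};
    std::sort(Queue.begin(), Queue.end(),
              [](const auto &L, const auto &R) { return L.first < R.first; });
    for (auto &Op : Queue)
      (this->*Op.second)(); // runs strtab (0x2000) before symtab (0x4000)
  }
};

int main() { Writer().writeTail(); }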
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MachOLayoutBuilder.h" -#include "MachOObjcopy.h" -#include "Object.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/Object/MachO.h" - -namespace llvm { -class Error; - -namespace objcopy { -namespace macho { - -class MachOWriter { - Object &O; - bool Is64Bit; - bool IsLittleEndian; - uint64_t PageSize; - std::unique_ptr Buf; - raw_ostream &Out; - MachOLayoutBuilder LayoutBuilder; - - size_t headerSize() const; - size_t loadCommandsSize() const; - size_t symTableSize() const; - size_t strTableSize() const; - - void writeHeader(); - void writeLoadCommands(); - template - void writeSectionInLoadCommand(const Section &Sec, uint8_t *&Out); - void writeSections(); - void writeSymbolTable(); - void writeStringTable(); - void writeRebaseInfo(); - void writeBindInfo(); - void writeWeakBindInfo(); - void writeLazyBindInfo(); - void writeExportInfo(); - void writeIndirectSymbolTable(); - void writeLinkData(Optional LCIndex, const LinkData &LD); - void writeCodeSignatureData(); - void writeDataInCodeData(); - void writeLinkerOptimizationHint(); - void writeFunctionStartsData(); - void writeChainedFixupsData(); - void writeExportsTrieData(); - void writeTail(); - -public: - MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian, - StringRef OutputFileName, uint64_t PageSize, raw_ostream &Out) - : O(O), Is64Bit(Is64Bit), IsLittleEndian(IsLittleEndian), - PageSize(PageSize), Out(Out), - LayoutBuilder(O, Is64Bit, OutputFileName, PageSize) {} - - size_t totalSize() const; - Error finalize(); - Error write(); -}; - -} // end namespace macho -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/MachO/Object.cpp b/llvm/tools/llvm-objcopy/MachO/Object.cpp deleted file mode 100644 index 6312adbbc9f7..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/Object.cpp +++ /dev/null @@ -1,214 +0,0 @@ -//===- Object.cpp - Mach-O object file model --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
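The MachOWriter interface above separates layout from emission: finalize() runs the layout builder so every offset is known, and write() then fills a single zero-initialized buffer of totalSize() bytes. A toy sketch of that two-phase shape (all names hypothetical, error handling reduced to bool):

#include <cstddef>
#include <cstdio>
#include <vector>

class TwoPhaseWriter {
  size_t Total = 0;       // decided during layout, fixed before writing
  std::vector<char> Buf;

public:
  bool finalize() {
    Total = 4096;         // stand-in for the real layout computation
    return true;
  }
  bool write() {
    if (!Total)
      return false;       // finalize() must run first
    Buf.assign(Total, 0); // zero-filled, like getNewMemBuffer above
    // header, load commands, section contents, tail blobs go here
    return true;
  }
};

int main() {
  TwoPhaseWriter W;
  std::printf("ok=%d\n", W.finalize() && W.write());
}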
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Object.h" -#include "llvm/ADT/SmallPtrSet.h" -#include - -using namespace llvm; -using namespace llvm::objcopy::macho; - -const SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) const { - assert(Index < Symbols.size() && "invalid symbol index"); - return Symbols[Index].get(); -} - -SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) { - return const_cast( - static_cast(this)->getSymbolByIndex(Index)); -} - -void SymbolTable::removeSymbols( - function_ref &)> ToRemove) { - llvm::erase_if(Symbols, ToRemove); -} - -void Object::updateLoadCommandIndexes() { - static constexpr char TextSegmentName[] = "__TEXT"; - // Update indices of special load commands - for (size_t Index = 0, Size = LoadCommands.size(); Index < Size; ++Index) { - LoadCommand &LC = LoadCommands[Index]; - switch (LC.MachOLoadCommand.load_command_data.cmd) { - case MachO::LC_CODE_SIGNATURE: - CodeSignatureCommandIndex = Index; - break; - case MachO::LC_SEGMENT: - if (StringRef(LC.MachOLoadCommand.segment_command_data.segname) == - TextSegmentName) - TextSegmentCommandIndex = Index; - break; - case MachO::LC_SEGMENT_64: - if (StringRef(LC.MachOLoadCommand.segment_command_64_data.segname) == - TextSegmentName) - TextSegmentCommandIndex = Index; - break; - case MachO::LC_SYMTAB: - SymTabCommandIndex = Index; - break; - case MachO::LC_DYSYMTAB: - DySymTabCommandIndex = Index; - break; - case MachO::LC_DYLD_INFO: - case MachO::LC_DYLD_INFO_ONLY: - DyLdInfoCommandIndex = Index; - break; - case MachO::LC_DATA_IN_CODE: - DataInCodeCommandIndex = Index; - break; - case MachO::LC_LINKER_OPTIMIZATION_HINT: - LinkerOptimizationHintCommandIndex = Index; - break; - case MachO::LC_FUNCTION_STARTS: - FunctionStartsCommandIndex = Index; - break; - case MachO::LC_DYLD_CHAINED_FIXUPS: - ChainedFixupsCommandIndex = Index; - break; - case MachO::LC_DYLD_EXPORTS_TRIE: - ExportsTrieCommandIndex = Index; - break; - } - } -} - -Error Object::removeLoadCommands( - function_ref ToRemove) { - auto It = std::stable_partition( - LoadCommands.begin(), LoadCommands.end(), - [&](const LoadCommand &LC) { return !ToRemove(LC); }); - LoadCommands.erase(It, LoadCommands.end()); - - updateLoadCommandIndexes(); - return Error::success(); -} - -Error Object::removeSections( - function_ref &)> ToRemove) { - DenseMap OldIndexToSection; - uint32_t NextSectionIndex = 1; - for (LoadCommand &LC : LoadCommands) { - auto It = std::stable_partition( - std::begin(LC.Sections), std::end(LC.Sections), - [&](const std::unique_ptr
&Sec) { return !ToRemove(Sec); }); - for (auto I = LC.Sections.begin(), End = It; I != End; ++I) { - OldIndexToSection[(*I)->Index] = I->get(); - (*I)->Index = NextSectionIndex++; - } - LC.Sections.erase(It, LC.Sections.end()); - } - - auto IsDead = [&](const std::unique_ptr &S) -> bool { - Optional Section = S->section(); - return (Section && !OldIndexToSection.count(*Section)); - }; - - SmallPtrSet DeadSymbols; - for (const std::unique_ptr &Sym : SymTable.Symbols) - if (IsDead(Sym)) - DeadSymbols.insert(Sym.get()); - - for (const LoadCommand &LC : LoadCommands) - for (const std::unique_ptr
&Sec : LC.Sections) - for (const RelocationInfo &R : Sec->Relocations) - if (R.Symbol && *R.Symbol && DeadSymbols.count(*R.Symbol)) - return createStringError(std::errc::invalid_argument, - "symbol '%s' defined in section with index " - "'%u' cannot be removed because it is " - "referenced by a relocation in section '%s'", - (*R.Symbol)->Name.c_str(), - *((*R.Symbol)->section()), - Sec->CanonicalName.c_str()); - SymTable.removeSymbols(IsDead); - for (std::unique_ptr &S : SymTable.Symbols) - if (S->section()) - S->n_sect = OldIndexToSection[S->n_sect]->Index; - return Error::success(); -} - -uint64_t Object::nextAvailableSegmentAddress() const { - uint64_t HeaderSize = - is64Bit() ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); - uint64_t Addr = HeaderSize + Header.SizeOfCmds; - for (const LoadCommand &LC : LoadCommands) { - const MachO::macho_load_command &MLC = LC.MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - Addr = std::max(Addr, - static_cast(MLC.segment_command_data.vmaddr) + - MLC.segment_command_data.vmsize); - break; - case MachO::LC_SEGMENT_64: - Addr = std::max(Addr, MLC.segment_command_64_data.vmaddr + - MLC.segment_command_64_data.vmsize); - break; - default: - continue; - } - } - return Addr; -} - -template -static void -constructSegment(SegmentType &Seg, llvm::MachO::LoadCommandType CmdType, - StringRef SegName, uint64_t SegVMAddr, uint64_t SegVMSize) { - assert(SegName.size() <= sizeof(Seg.segname) && "too long segment name"); - memset(&Seg, 0, sizeof(SegmentType)); - Seg.cmd = CmdType; - strncpy(Seg.segname, SegName.data(), SegName.size()); - Seg.maxprot |= - (MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); - Seg.initprot |= - (MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); - Seg.vmaddr = SegVMAddr; - Seg.vmsize = SegVMSize; -} - -LoadCommand &Object::addSegment(StringRef SegName, uint64_t SegVMSize) { - LoadCommand LC; - const uint64_t SegVMAddr = nextAvailableSegmentAddress(); - if (is64Bit()) - constructSegment(LC.MachOLoadCommand.segment_command_64_data, - MachO::LC_SEGMENT_64, SegName, SegVMAddr, SegVMSize); - else - constructSegment(LC.MachOLoadCommand.segment_command_data, - MachO::LC_SEGMENT, SegName, SegVMAddr, SegVMSize); - - LoadCommands.push_back(std::move(LC)); - return LoadCommands.back(); -} - -/// Extracts a segment name from a string which is possibly non-null-terminated. 
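nextAvailableSegmentAddress above places a new segment at the highest vmaddr + vmsize among the existing segments. A simplified sketch of just that arithmetic (the header/load-command size floor is left out, and Seg is a hypothetical stand-in for the load-command union):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Seg {
  uint64_t VMAddr, VMSize;
};

static uint64_t nextAvailableAddress(const std::vector<Seg> &Segs) {
  uint64_t Addr = 0;
  for (const Seg &S : Segs)
    Addr = std::max(Addr, S.VMAddr + S.VMSize); // end of the furthest segment
  return Addr;
}

int main() {
  std::vector<Seg> Segs = {{0x0, 0x4000}, {0x4000, 0x8000}};
  std::printf("0x%llx\n",
              (unsigned long long)nextAvailableAddress(Segs)); // 0xc000
}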
-static StringRef extractSegmentName(const char *SegName) { - return StringRef(SegName, - strnlen(SegName, sizeof(MachO::segment_command::segname))); -} - -Optional LoadCommand::getSegmentName() const { - const MachO::macho_load_command &MLC = MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - return extractSegmentName(MLC.segment_command_data.segname); - case MachO::LC_SEGMENT_64: - return extractSegmentName(MLC.segment_command_64_data.segname); - default: - return None; - } -} - -Optional LoadCommand::getSegmentVMAddr() const { - const MachO::macho_load_command &MLC = MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - return MLC.segment_command_data.vmaddr; - case MachO::LC_SEGMENT_64: - return MLC.segment_command_64_data.vmaddr; - default: - return None; - } -} diff --git a/llvm/tools/llvm-objcopy/MachO/Object.h b/llvm/tools/llvm-objcopy/MachO/Object.h deleted file mode 100644 index 13aaf42634b0..000000000000 --- a/llvm/tools/llvm-objcopy/MachO/Object.h +++ /dev/null @@ -1,374 +0,0 @@ -//===- Object.h - Mach-O object file model ----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OBJCOPY_MACHO_OBJECT_H -#define LLVM_OBJCOPY_MACHO_OBJECT_H - -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/MachO.h" -#include "llvm/MC/StringTableBuilder.h" -#include "llvm/ObjectYAML/DWARFYAML.h" -#include "llvm/Support/StringSaver.h" -#include "llvm/Support/YAMLTraits.h" -#include -#include -#include - -namespace llvm { -namespace objcopy { -namespace macho { - -struct MachHeader { - uint32_t Magic; - uint32_t CPUType; - uint32_t CPUSubType; - uint32_t FileType; - uint32_t NCmds; - uint32_t SizeOfCmds; - uint32_t Flags; - uint32_t Reserved = 0; -}; - -struct RelocationInfo; -struct Section { - uint32_t Index; - std::string Segname; - std::string Sectname; - // CanonicalName is a string formatted as “,". - std::string CanonicalName; - uint64_t Addr = 0; - uint64_t Size = 0; - // Offset in the input file. 
- Optional OriginalOffset; - uint32_t Offset = 0; - uint32_t Align = 0; - uint32_t RelOff = 0; - uint32_t NReloc = 0; - uint32_t Flags = 0; - uint32_t Reserved1 = 0; - uint32_t Reserved2 = 0; - uint32_t Reserved3 = 0; - StringRef Content; - std::vector Relocations; - - Section(StringRef SegName, StringRef SectName) - : Segname(std::string(SegName)), Sectname(std::string(SectName)), - CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {} - - Section(StringRef SegName, StringRef SectName, StringRef Content) - : Segname(std::string(SegName)), Sectname(std::string(SectName)), - CanonicalName((Twine(SegName) + Twine(',') + SectName).str()), - Content(Content) {} - - MachO::SectionType getType() const { - return static_cast(Flags & MachO::SECTION_TYPE); - } - - bool isVirtualSection() const { - return (getType() == MachO::S_ZEROFILL || - getType() == MachO::S_GB_ZEROFILL || - getType() == MachO::S_THREAD_LOCAL_ZEROFILL); - } - - bool hasValidOffset() const { - return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0)); - } -}; - -struct LoadCommand { - // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h - // and it is a union of all the structs corresponding to various load - // commands. - MachO::macho_load_command MachOLoadCommand; - - // The raw content of the payload of the load command (located right after the - // corresponding struct). In some cases it is either empty or can be - // copied-over without digging into its structure. - std::vector Payload; - - // Some load commands can contain (inside the payload) an array of sections, - // though the contents of the sections are stored separately. The struct - // Section describes only sections' metadata and where to find the - // corresponding content inside the binary. - std::vector> Sections; - - // Returns the segment name if the load command is a segment command. - Optional getSegmentName() const; - - // Returns the segment vm address if the load command is a segment command. - Optional getSegmentVMAddr() const; -}; - -// A symbol information. Fields which starts with "n_" are same as them in the -// nlist. -struct SymbolEntry { - std::string Name; - bool Referenced = false; - uint32_t Index; - uint8_t n_type; - uint8_t n_sect; - uint16_t n_desc; - uint64_t n_value; - - bool isExternalSymbol() const { return n_type & MachO::N_EXT; } - - bool isLocalSymbol() const { return !isExternalSymbol(); } - - bool isUndefinedSymbol() const { - return (n_type & MachO::N_TYPE) == MachO::N_UNDF; - } - - bool isSwiftSymbol() const { - return StringRef(Name).startswith("_$s") || - StringRef(Name).startswith("_$S"); - } - - Optional section() const { - return n_sect == MachO::NO_SECT ? None : Optional(n_sect); - } -}; - -/// The location of the symbol table inside the binary is described by LC_SYMTAB -/// load command. -struct SymbolTable { - std::vector> Symbols; - - using iterator = pointee_iterator< - std::vector>::const_iterator>; - - iterator begin() const { return iterator(Symbols.begin()); } - iterator end() const { return iterator(Symbols.end()); } - - const SymbolEntry *getSymbolByIndex(uint32_t Index) const; - SymbolEntry *getSymbolByIndex(uint32_t Index); - void removeSymbols( - function_ref &)> ToRemove); -}; - -struct IndirectSymbolEntry { - // The original value in an indirect symbol table. Higher bits encode extra - // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS). - uint32_t OriginalIndex; - /// The Symbol referenced by this entry. 
It's None if the index is - /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. - Optional<SymbolEntry *> Symbol; - - IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol) - : OriginalIndex(OriginalIndex), Symbol(Symbol) {} -}; - -struct IndirectSymbolTable { - std::vector<IndirectSymbolEntry> Symbols; -}; - -/// The location of the string table inside the binary is described by LC_SYMTAB -/// load command. -struct StringTable { - std::vector<std::string> Strings; -}; - -struct RelocationInfo { - // The referenced symbol entry. Set if !Scattered && Extern. - Optional<const SymbolEntry *> Symbol; - // The referenced section. Set if !Scattered && !Extern. - Optional<const Section *> Sec; - // True if Info is a scattered_relocation_info. - bool Scattered; - // True if the type is an ADDEND. r_symbolnum holds the addend instead of a - // symbol index. - bool IsAddend; - // True if the r_symbolnum points to a section number (i.e. r_extern=0). - bool Extern; - MachO::any_relocation_info Info; - - unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) { - if (IsLittleEndian) - return Info.r_word1 & 0xffffff; - return Info.r_word1 >> 8; - } - - void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) { - assert(SymbolNum < (1 << 24) && "SymbolNum out of range"); - if (IsLittleEndian) - Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum; - else - Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8); - } -}; - -/// The location of the rebase info inside the binary is described by -/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at -/// an address different from its preferred address. The rebase information is -/// a stream of byte sized opcodes whose symbolic names start with -/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: -/// <seg-index, seg-offset, type> -/// The opcodes are a compressed way to encode the table by only -/// encoding when a column changes. In addition simple patterns -/// like "every n'th offset for m times" can be encoded in a few -/// bytes. -struct RebaseInfo { - // At the moment we do not parse this info (and it is simply copied over), - // but the proper support will be added later. - ArrayRef<uint8_t> Opcodes; -}; - -/// The location of the bind info inside the binary is described by -/// LC_DYLD_INFO load command. Dyld binds an image during the loading process, -/// if the image requires any pointers to be initialized to symbols in other -/// images. The bind information is a stream of byte sized opcodes whose -/// symbolic names start with BIND_OPCODE_. Conceptually the bind information is -/// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, -/// symbol-name, addend>. The opcodes are a compressed way to encode the table by -/// only encoding when a column changes. In addition simple patterns like for -/// runs of pointers initialized to the same value can be encoded in a few -/// bytes. -struct BindInfo { - // At the moment we do not parse this info (and it is simply copied over), - // but the proper support will be added later. - ArrayRef<uint8_t> Opcodes; -}; - -/// The location of the weak bind info inside the binary is described by -/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols -/// so that all images in the process use the same copy of some code/data. This -/// step is done after binding. The content of the weak_bind info is an opcode -/// stream like the bind_info. But it is sorted alphabetically by symbol name. -/// This enables dyld to walk all images with weak binding information in order -/// and look for collisions. If there are no collisions, dyld does no updating.
-/// That means that some fixups are also encoded in the bind_info. For -/// instance, all calls to "operator new" are first bound to libstdc++.dylib -/// using the information in bind_info. Then if some image overrides operator -/// new that is detected when the weak_bind information is processed and the -/// call to operator new is then rebound. -struct WeakBindInfo { - // At the moment we do not parse this info (and it is simply copied over), - // but the proper support will be added later. - ArrayRef Opcodes; -}; - -/// The location of the lazy bind info inside the binary is described by -/// LC_DYLD_INFO load command. Some uses of external symbols do not need to be -/// bound immediately. Instead they can be lazily bound on first use. The -/// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal -/// use is that dyld ignores the lazy_bind section when loading an image. -/// Instead the static linker arranged for the lazy pointer to initially point -/// to a helper function which pushes the offset into the lazy_bind area for the -/// symbol needing to be bound, then jumps to dyld which simply adds the offset -/// to lazy_bind_off to get the information on what to bind. -struct LazyBindInfo { - ArrayRef Opcodes; -}; - -/// The location of the export info inside the binary is described by -/// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a -/// trie. This is a compact representation that factors out common prefixes. It -/// also reduces LINKEDIT pages in RAM because it encodes all information (name, -/// address, flags) in one small, contiguous range. The export area is a stream -/// of nodes. The first node sequentially is the start node for the trie. Nodes -/// for a symbol start with a uleb128 that is the length of the exported symbol -/// information for the string so far. If there is no exported symbol, the node -/// starts with a zero byte. If there is exported info, it follows the length. -/// First is a uleb128 containing flags. Normally, it is followed by -/// a uleb128 encoded offset which is location of the content named -/// by the symbol from the mach_header for the image. If the flags -/// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is -/// a uleb128 encoded library ordinal, then a zero terminated -/// UTF8 string. If the string is zero length, then the symbol -/// is re-export from the specified dylib with the same name. -/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following -/// the flags is two uleb128s: the stub offset and the resolver offset. -/// The stub is used by non-lazy pointers. The resolver is used -/// by lazy pointers and must be called to get the actual address to use. -/// After the optional exported symbol information is a byte of -/// how many edges (0-255) that this node has leaving it, -/// followed by each edge. -/// Each edge is a zero terminated UTF8 of the addition chars -/// in the symbol, followed by a uleb128 offset for the node that -/// edge points to. 
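The trie encoding described above is built almost entirely out of uleb128 fields. For reference, a minimal standalone decoder for that varint format (a sketch, not LLVM's DataExtractor API):

#include <cstdint>
#include <cstdio>

static uint64_t decodeULEB128(const uint8_t *P, unsigned *Len = nullptr) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  const uint8_t *Start = P;
  do {
    Value |= uint64_t(*P & 0x7f) << Shift; // low 7 bits are payload
    Shift += 7;
  } while (*P++ & 0x80);                   // high bit marks continuation
  if (Len)
    *Len = unsigned(P - Start);
  return Value;
}

int main() {
  const uint8_t Enc[] = {0xe5, 0x8e, 0x26};
  std::printf("%llu\n", (unsigned long long)decodeULEB128(Enc)); // 624485
}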
-struct ExportInfo { - ArrayRef<uint8_t> Trie; -}; - -struct LinkData { - ArrayRef<uint8_t> Data; -}; - -struct Object { - MachHeader Header; - std::vector<LoadCommand> LoadCommands; - - SymbolTable SymTable; - StringTable StrTable; - - RebaseInfo Rebases; - BindInfo Binds; - WeakBindInfo WeakBinds; - LazyBindInfo LazyBinds; - ExportInfo Exports; - IndirectSymbolTable IndirectSymTable; - LinkData DataInCode; - LinkData LinkerOptimizationHint; - LinkData FunctionStarts; - LinkData ExportsTrie; - LinkData ChainedFixups; - - Optional<uint32_t> SwiftVersion; - - /// The index of LC_CODE_SIGNATURE load command if present. - Optional<size_t> CodeSignatureCommandIndex; - /// The index of LC_SYMTAB load command if present. - Optional<size_t> SymTabCommandIndex; - /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. - Optional<size_t> DyLdInfoCommandIndex; - /// The index of LC_DYSYMTAB load command if present. - Optional<size_t> DySymTabCommandIndex; - /// The index of LC_DATA_IN_CODE load command if present. - Optional<size_t> DataInCodeCommandIndex; - /// The index of LC_LINKER_OPTIMIZATION_HINT load command if present. - Optional<size_t> LinkerOptimizationHintCommandIndex; - /// The index of LC_FUNCTION_STARTS load command if present. - Optional<size_t> FunctionStartsCommandIndex; - /// The index of LC_DYLD_CHAINED_FIXUPS load command if present. - Optional<size_t> ChainedFixupsCommandIndex; - /// The index of LC_DYLD_EXPORTS_TRIE load command if present. - Optional<size_t> ExportsTrieCommandIndex; - /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command - /// corresponding to the __TEXT segment. - Optional<size_t> TextSegmentCommandIndex; - - BumpPtrAllocator Alloc; - StringSaver NewSectionsContents; - - Object() : NewSectionsContents(Alloc) {} - - Error - removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove); - - Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove); - - void updateLoadCommandIndexes(); - - /// Creates a new segment load command in the object and returns a reference - /// to the newly created load command. The caller should verify that SegName - /// is not too long (SegName.size() should be less than or equal to 16). - LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize); - - bool is64Bit() const { - return Header.Magic == MachO::MH_MAGIC_64 || - Header.Magic == MachO::MH_CIGAM_64; - } - - uint64_t nextAvailableSegmentAddress() const; -}; - -} // end namespace macho -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_OBJCOPY_MACHO_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/MultiFormatConfig.h b/llvm/tools/llvm-objcopy/MultiFormatConfig.h deleted file mode 100644 index 31d9883d6d3a..000000000000 --- a/llvm/tools/llvm-objcopy/MultiFormatConfig.h +++ /dev/null @@ -1,37 +0,0 @@ -//===- MultiFormatConfig.h --------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
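is64Bit() above keys off the Mach-O magic number, accepting both the writer-endian and byte-swapped 64-bit values. A self-contained illustration using the standard magic constants (values as defined for the Mach-O header format):

#include <cstdint>
#include <cstdio>

constexpr uint32_t MH_MAGIC_64 = 0xfeedfacf; // 64-bit, writer-endian
constexpr uint32_t MH_CIGAM_64 = 0xcffaedfe; // 64-bit, byte-swapped

static bool is64Bit(uint32_t Magic) {
  return Magic == MH_MAGIC_64 || Magic == MH_CIGAM_64;
}

int main() {
  std::printf("%d %d\n", is64Bit(0xfeedfacf),
              is64Bit(0xfeedface)); // 1 0 (0xfeedface is the 32-bit magic)
}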
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H - -#include "llvm/Support/Error.h" - -namespace llvm { -namespace objcopy { - -struct CommonConfig; -struct ELFConfig; -struct COFFConfig; -struct MachOConfig; -struct WasmConfig; - -class MultiFormatConfig { -public: - virtual ~MultiFormatConfig() {} - - virtual const CommonConfig &getCommonConfig() const = 0; - virtual Expected getELFConfig() const = 0; - virtual Expected getCOFFConfig() const = 0; - virtual Expected getMachOConfig() const = 0; - virtual Expected getWasmConfig() const = 0; -}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp new file mode 100644 index 000000000000..5b2b4b5704d8 --- /dev/null +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -0,0 +1,1364 @@ +//===- ObjcopyOptions.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ObjcopyOptions.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/ConfigManager.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/CRC.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace llvm::objcopy; + +namespace { +enum ObjcopyID { + OBJCOPY_INVALID = 0, // This is not an option ID. +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OBJCOPY_##ID, +#include "ObjcopyOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE; +#include "ObjcopyOpts.inc" +#undef PREFIX + +const opt::OptTable::Info ObjcopyInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {OBJCOPY_##PREFIX, \ + NAME, \ + HELPTEXT, \ + METAVAR, \ + OBJCOPY_##ID, \ + opt::Option::KIND##Class, \ + PARAM, \ + FLAGS, \ + OBJCOPY_##GROUP, \ + OBJCOPY_##ALIAS, \ + ALIASARGS, \ + VALUES}, +#include "ObjcopyOpts.inc" +#undef OPTION +}; + +class ObjcopyOptTable : public opt::OptTable { +public: + ObjcopyOptTable() : OptTable(ObjcopyInfoTable) { + setGroupedShortOptions(true); + } +}; + +enum InstallNameToolID { + INSTALL_NAME_TOOL_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + INSTALL_NAME_TOOL_##ID, +#include "InstallNameToolOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + const char *const INSTALL_NAME_TOOL_##NAME[] = VALUE; +#include "InstallNameToolOpts.inc" +#undef PREFIX + +const opt::OptTable::Info InstallNameToolInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {INSTALL_NAME_TOOL_##PREFIX, \ + NAME, \ + HELPTEXT, \ + METAVAR, \ + INSTALL_NAME_TOOL_##ID, \ + opt::Option::KIND##Class, \ + PARAM, \ + FLAGS, \ + INSTALL_NAME_TOOL_##GROUP, \ + INSTALL_NAME_TOOL_##ALIAS, \ + ALIASARGS, \ + VALUES}, +#include "InstallNameToolOpts.inc" +#undef OPTION +}; + +class InstallNameToolOptTable : public opt::OptTable { +public: + InstallNameToolOptTable() : OptTable(InstallNameToolInfoTable) {} +}; + +enum BitcodeStripID { + BITCODE_STRIP_INVALID = 0, // This is not an option ID. +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + BITCODE_STRIP_##ID, +#include "BitcodeStripOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const BITCODE_STRIP_##NAME[] = VALUE; +#include "BitcodeStripOpts.inc" +#undef PREFIX + +const opt::OptTable::Info BitcodeStripInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {BITCODE_STRIP_##PREFIX, \ + NAME, \ + HELPTEXT, \ + METAVAR, \ + BITCODE_STRIP_##ID, \ + opt::Option::KIND##Class, \ + PARAM, \ + FLAGS, \ + BITCODE_STRIP_##GROUP, \ + BITCODE_STRIP_##ALIAS, \ + ALIASARGS, \ + VALUES}, +#include "BitcodeStripOpts.inc" +#undef OPTION +}; + +class BitcodeStripOptTable : public opt::OptTable { +public: + BitcodeStripOptTable() : OptTable(BitcodeStripInfoTable) {} +}; + +enum StripID { + STRIP_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + STRIP_##ID, +#include "StripOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE; +#include "StripOpts.inc" +#undef PREFIX + +const opt::OptTable::Info StripInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {STRIP_##PREFIX, NAME, HELPTEXT, \ + METAVAR, STRIP_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, STRIP_##GROUP, \ + STRIP_##ALIAS, ALIASARGS, VALUES}, +#include "StripOpts.inc" +#undef OPTION +}; + +class StripOptTable : public opt::OptTable { +public: + StripOptTable() : OptTable(StripInfoTable) { setGroupedShortOptions(true); } +}; + +} // namespace + +static SectionFlag parseSectionRenameFlag(StringRef SectionName) { + return llvm::StringSwitch(SectionName) + .CaseLower("alloc", SectionFlag::SecAlloc) + .CaseLower("load", SectionFlag::SecLoad) + .CaseLower("noload", SectionFlag::SecNoload) + .CaseLower("readonly", SectionFlag::SecReadonly) + .CaseLower("debug", SectionFlag::SecDebug) + .CaseLower("code", SectionFlag::SecCode) + .CaseLower("data", SectionFlag::SecData) + .CaseLower("rom", SectionFlag::SecRom) + .CaseLower("merge", SectionFlag::SecMerge) + .CaseLower("strings", SectionFlag::SecStrings) + .CaseLower("contents", SectionFlag::SecContents) + .CaseLower("share", SectionFlag::SecShare) + .CaseLower("exclude", SectionFlag::SecExclude) + .Default(SectionFlag::SecNone); +} + +static Expected +parseSectionFlagSet(ArrayRef SectionFlags) { + SectionFlag ParsedFlags = SectionFlag::SecNone; + for (StringRef Flag : SectionFlags) { + SectionFlag ParsedFlag = parseSectionRenameFlag(Flag); + if (ParsedFlag == SectionFlag::SecNone) + return createStringError( + errc::invalid_argument, + "unrecognized section flag '%s'. Flags supported for GNU " + "compatibility: alloc, load, noload, readonly, exclude, debug, " + "code, data, rom, share, contents, merge, strings", + Flag.str().c_str()); + ParsedFlags |= ParsedFlag; + } + + return ParsedFlags; +} + +static Expected parseRenameSectionValue(StringRef FlagValue) { + if (!FlagValue.contains('=')) + return createStringError(errc::invalid_argument, + "bad format for --rename-section: missing '='"); + + // Initial split: ".foo" = ".bar,f1,f2,..." + auto Old2New = FlagValue.split('='); + SectionRename SR; + SR.OriginalName = Old2New.first; + + // Flags split: ".bar" "f1" "f2" ... 
+ SmallVector NameAndFlags; + Old2New.second.split(NameAndFlags, ','); + SR.NewName = NameAndFlags[0]; + + if (NameAndFlags.size() > 1) { + Expected ParsedFlagSet = + parseSectionFlagSet(makeArrayRef(NameAndFlags).drop_front()); + if (!ParsedFlagSet) + return ParsedFlagSet.takeError(); + SR.NewFlags = *ParsedFlagSet; + } + + return SR; +} + +static Expected> +parseSetSectionAlignment(StringRef FlagValue) { + if (!FlagValue.contains('=')) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing '='"); + auto Split = StringRef(FlagValue).split('='); + if (Split.first.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing section name"); + uint64_t NewAlign; + if (Split.second.getAsInteger(0, NewAlign)) + return createStringError( + errc::invalid_argument, + "invalid alignment for --set-section-alignment: '%s'", + Split.second.str().c_str()); + return std::make_pair(Split.first, NewAlign); +} + +static Expected +parseSetSectionFlagValue(StringRef FlagValue) { + if (!StringRef(FlagValue).contains('=')) + return createStringError(errc::invalid_argument, + "bad format for --set-section-flags: missing '='"); + + // Initial split: ".foo" = "f1,f2,..." + auto Section2Flags = StringRef(FlagValue).split('='); + SectionFlagsUpdate SFU; + SFU.Name = Section2Flags.first; + + // Flags split: "f1" "f2" ... + SmallVector SectionFlags; + Section2Flags.second.split(SectionFlags, ','); + Expected ParsedFlagSet = parseSectionFlagSet(SectionFlags); + if (!ParsedFlagSet) + return ParsedFlagSet.takeError(); + SFU.NewFlags = *ParsedFlagSet; + + return SFU; +} + +namespace { +struct TargetInfo { + FileFormat Format; + MachineInfo Machine; +}; +} // namespace + +// FIXME: consolidate with the bfd parsing used by lld. 
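parseRenameSectionValue above splits a "--rename-section .old=.new,flag1,flag2" value on '=' first and on ',' second. A standalone approximation of that splitting with std::string (no StringRef and no error plumbing, so only a sketch of the shape, not the LLVM helper):

#include <cstdio>
#include <string>
#include <vector>

struct Rename {
  std::string Old, New;
  std::vector<std::string> Flags;
};

static bool parseRename(const std::string &V, Rename &R) {
  size_t Eq = V.find('=');
  if (Eq == std::string::npos)
    return false; // bad format: missing '='
  R.Old = V.substr(0, Eq);
  std::string Rest = V.substr(Eq + 1);
  bool First = true;
  for (size_t Start = 0;;) {
    size_t Comma = Rest.find(',', Start);
    std::string Tok = Rest.substr(Start, Comma - Start);
    if (First) {
      R.New = Tok; // first comma-separated field is the new name
      First = false;
    } else {
      R.Flags.push_back(Tok); // remaining fields are flags
    }
    if (Comma == std::string::npos)
      break;
    Start = Comma + 1;
  }
  return true;
}

int main() {
  Rename R;
  parseRename(".text=.mytext,alloc,readonly", R);
  std::printf("%s -> %s, %zu flags\n", R.Old.c_str(), R.New.c_str(),
              R.Flags.size()); // .text -> .mytext, 2 flags
}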
+static const StringMap TargetMap{ + // Name, {EMachine, 64bit, LittleEndian} + // x86 + {"elf32-i386", {ELF::EM_386, false, true}}, + {"elf32-x86-64", {ELF::EM_X86_64, false, true}}, + {"elf64-x86-64", {ELF::EM_X86_64, true, true}}, + // Intel MCU + {"elf32-iamcu", {ELF::EM_IAMCU, false, true}}, + // ARM + {"elf32-littlearm", {ELF::EM_ARM, false, true}}, + // ARM AArch64 + {"elf64-aarch64", {ELF::EM_AARCH64, true, true}}, + {"elf64-littleaarch64", {ELF::EM_AARCH64, true, true}}, + // RISC-V + {"elf32-littleriscv", {ELF::EM_RISCV, false, true}}, + {"elf64-littleriscv", {ELF::EM_RISCV, true, true}}, + // PowerPC + {"elf32-powerpc", {ELF::EM_PPC, false, false}}, + {"elf32-powerpcle", {ELF::EM_PPC, false, true}}, + {"elf64-powerpc", {ELF::EM_PPC64, true, false}}, + {"elf64-powerpcle", {ELF::EM_PPC64, true, true}}, + // MIPS + {"elf32-bigmips", {ELF::EM_MIPS, false, false}}, + {"elf32-ntradbigmips", {ELF::EM_MIPS, false, false}}, + {"elf32-ntradlittlemips", {ELF::EM_MIPS, false, true}}, + {"elf32-tradbigmips", {ELF::EM_MIPS, false, false}}, + {"elf32-tradlittlemips", {ELF::EM_MIPS, false, true}}, + {"elf64-tradbigmips", {ELF::EM_MIPS, true, false}}, + {"elf64-tradlittlemips", {ELF::EM_MIPS, true, true}}, + // SPARC + {"elf32-sparc", {ELF::EM_SPARC, false, false}}, + {"elf32-sparcel", {ELF::EM_SPARC, false, true}}, + {"elf32-hexagon", {ELF::EM_HEXAGON, false, true}}, +}; + +static Expected +getOutputTargetInfoByTargetName(StringRef TargetName) { + StringRef OriginalTargetName = TargetName; + bool IsFreeBSD = TargetName.consume_back("-freebsd"); + auto Iter = TargetMap.find(TargetName); + if (Iter == std::end(TargetMap)) + return createStringError(errc::invalid_argument, + "invalid output format: '%s'", + OriginalTargetName.str().c_str()); + MachineInfo MI = Iter->getValue(); + if (IsFreeBSD) + MI.OSABI = ELF::ELFOSABI_FREEBSD; + + FileFormat Format; + if (TargetName.startswith("elf")) + Format = FileFormat::ELF; + else + // This should never happen because `TargetName` is valid (it certainly + // exists in the TargetMap). + llvm_unreachable("unknown target prefix"); + + return {TargetInfo{Format, MI}}; +} + +static Error addSymbolsFromFile(NameMatcher &Symbols, BumpPtrAllocator &Alloc, + StringRef Filename, MatchStyle MS, + function_ref ErrorCallback) { + StringSaver Saver(Alloc); + SmallVector Lines; + auto BufOrErr = MemoryBuffer::getFile(Filename); + if (!BufOrErr) + return createFileError(Filename, BufOrErr.getError()); + + BufOrErr.get()->getBuffer().split(Lines, '\n'); + for (StringRef Line : Lines) { + // Ignore everything after '#', trim whitespace, and only add the symbol if + // it's not empty. 
+ auto TrimmedLine = Line.split('#').first.trim(); + if (!TrimmedLine.empty()) + if (Error E = Symbols.addMatcher(NameOrPattern::create( + Saver.save(TrimmedLine), MS, ErrorCallback))) + return E; + } + + return Error::success(); +} + +static Error addSymbolsToRenameFromFile(StringMap &SymbolsToRename, + BumpPtrAllocator &Alloc, + StringRef Filename) { + StringSaver Saver(Alloc); + SmallVector Lines; + auto BufOrErr = MemoryBuffer::getFile(Filename); + if (!BufOrErr) + return createFileError(Filename, BufOrErr.getError()); + + BufOrErr.get()->getBuffer().split(Lines, '\n'); + size_t NumLines = Lines.size(); + for (size_t LineNo = 0; LineNo < NumLines; ++LineNo) { + StringRef TrimmedLine = Lines[LineNo].split('#').first.trim(); + if (TrimmedLine.empty()) + continue; + + std::pair Pair = Saver.save(TrimmedLine).split(' '); + StringRef NewName = Pair.second.trim(); + if (NewName.empty()) + return createStringError(errc::invalid_argument, + "%s:%zu: missing new symbol name", + Filename.str().c_str(), LineNo + 1); + SymbolsToRename.insert({Pair.first, NewName}); + } + return Error::success(); +} + +template static ErrorOr getAsInteger(StringRef Val) { + T Result; + if (Val.getAsInteger(0, Result)) + return errc::invalid_argument; + return Result; +} + +namespace { + +enum class ToolType { Objcopy, Strip, InstallNameTool, BitcodeStrip }; + +} // anonymous namespace + +static void printHelp(const opt::OptTable &OptTable, raw_ostream &OS, + ToolType Tool) { + StringRef HelpText, ToolName; + switch (Tool) { + case ToolType::Objcopy: + ToolName = "llvm-objcopy"; + HelpText = " [options] input [output]"; + break; + case ToolType::Strip: + ToolName = "llvm-strip"; + HelpText = " [options] inputs..."; + break; + case ToolType::InstallNameTool: + ToolName = "llvm-install-name-tool"; + HelpText = " [options] input"; + break; + case ToolType::BitcodeStrip: + ToolName = "llvm-bitcode-strip"; + HelpText = " [options] input"; + break; + } + OptTable.printHelp(OS, (ToolName + HelpText).str().c_str(), + (ToolName + " tool").str().c_str()); + // TODO: Replace this with libOption call once it adds extrahelp support. + // The CommandLine library has a cl::extrahelp class to support this, + // but libOption does not have that yet. + OS << "\nPass @FILE as argument to read options from FILE.\n"; +} + +static Expected parseNewSymbolInfo(StringRef FlagValue) { + // Parse value given with --add-symbol option and create the + // new symbol if possible. The value format for --add-symbol is: + // + // =[
<section>:]<value>[,<flags>] + // + // where: + // <name> - symbol name, can be empty string + // <section>
- optional section name. If not given ABS symbol is created + // - symbol value, can be decimal or hexadecimal number prefixed + // with 0x. + // - optional flags affecting symbol type, binding or visibility. + NewSymbolInfo SI; + StringRef Value; + std::tie(SI.SymbolName, Value) = FlagValue.split('='); + if (Value.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --add-symbol, missing '=' after '%s'", + SI.SymbolName.str().c_str()); + + if (Value.contains(':')) { + std::tie(SI.SectionName, Value) = Value.split(':'); + if (SI.SectionName.empty() || Value.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --add-symbol, missing section name or symbol value"); + } + + SmallVector Flags; + Value.split(Flags, ','); + if (Flags[0].getAsInteger(0, SI.Value)) + return createStringError(errc::invalid_argument, "bad symbol value: '%s'", + Flags[0].str().c_str()); + + using Functor = std::function; + SmallVector UnsupportedFlags; + for (size_t I = 1, NumFlags = Flags.size(); I < NumFlags; ++I) + static_cast( + StringSwitch(Flags[I]) + .CaseLower("global", + [&] { SI.Flags.push_back(SymbolFlag::Global); }) + .CaseLower("local", [&] { SI.Flags.push_back(SymbolFlag::Local); }) + .CaseLower("weak", [&] { SI.Flags.push_back(SymbolFlag::Weak); }) + .CaseLower("default", + [&] { SI.Flags.push_back(SymbolFlag::Default); }) + .CaseLower("hidden", + [&] { SI.Flags.push_back(SymbolFlag::Hidden); }) + .CaseLower("protected", + [&] { SI.Flags.push_back(SymbolFlag::Protected); }) + .CaseLower("file", [&] { SI.Flags.push_back(SymbolFlag::File); }) + .CaseLower("section", + [&] { SI.Flags.push_back(SymbolFlag::Section); }) + .CaseLower("object", + [&] { SI.Flags.push_back(SymbolFlag::Object); }) + .CaseLower("function", + [&] { SI.Flags.push_back(SymbolFlag::Function); }) + .CaseLower( + "indirect-function", + [&] { SI.Flags.push_back(SymbolFlag::IndirectFunction); }) + .CaseLower("debug", [&] { SI.Flags.push_back(SymbolFlag::Debug); }) + .CaseLower("constructor", + [&] { SI.Flags.push_back(SymbolFlag::Constructor); }) + .CaseLower("warning", + [&] { SI.Flags.push_back(SymbolFlag::Warning); }) + .CaseLower("indirect", + [&] { SI.Flags.push_back(SymbolFlag::Indirect); }) + .CaseLower("synthetic", + [&] { SI.Flags.push_back(SymbolFlag::Synthetic); }) + .CaseLower("unique-object", + [&] { SI.Flags.push_back(SymbolFlag::UniqueObject); }) + .StartsWithLower("before=", + [&] { + StringRef SymNamePart = + Flags[I].split('=').second; + + if (!SymNamePart.empty()) + SI.BeforeSyms.push_back(SymNamePart); + }) + .Default([&] { UnsupportedFlags.push_back(Flags[I]); }))(); + if (!UnsupportedFlags.empty()) + return createStringError(errc::invalid_argument, + "unsupported flag%s for --add-symbol: '%s'", + UnsupportedFlags.size() > 1 ? "s" : "", + join(UnsupportedFlags, "', '").c_str()); + + return SI; +} + +// Parse input option \p ArgValue and load section data. This function +// extracts section name and name of the file keeping section data from +// ArgValue, loads data from the file, and stores section name and data +// into the vector of new sections \p NewSections. 
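The name=[section:]value[,flags] grammar for --add-symbol described above can be exercised with a few lines of plain C++. This is a rough sketch of the value splitting only (not the option handling, and the example value is hypothetical):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
  std::string V = "foo=.text:0x1000,global,function";
  size_t Eq = V.find('=');
  std::string Name = V.substr(0, Eq);
  std::string Rest = V.substr(Eq + 1);
  std::string Section; // empty section means an ABS symbol
  if (size_t Colon = Rest.find(':'); Colon != std::string::npos) {
    Section = Rest.substr(0, Colon);
    Rest = Rest.substr(Colon + 1);
  }
  // base 0 accepts both decimal and 0x-prefixed values, as described above
  uint64_t Value =
      std::strtoull(Rest.substr(0, Rest.find(',')).c_str(), nullptr, 0);
  std::printf("%s @ %s = 0x%llx\n", Name.c_str(),
              Section.empty() ? "ABS" : Section.c_str(),
              (unsigned long long)Value); // foo @ .text = 0x1000
}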
+static Error loadNewSectionData(StringRef ArgValue, StringRef OptionName, + std::vector &NewSections) { + if (!ArgValue.contains('=')) + return createStringError(errc::invalid_argument, + "bad format for " + OptionName + ": missing '='"); + + std::pair SecPair = ArgValue.split("="); + if (SecPair.second.empty()) + return createStringError(errc::invalid_argument, "bad format for " + + OptionName + + ": missing file name"); + + ErrorOr> BufOrErr = + MemoryBuffer::getFile(SecPair.second); + if (!BufOrErr) + return createFileError(SecPair.second, + errorCodeToError(BufOrErr.getError())); + + NewSections.push_back({SecPair.first, std::move(*BufOrErr)}); + return Error::success(); +} + +// parseObjcopyOptions returns the config and sets the input arguments. If a +// help flag is set then parseObjcopyOptions will print the help messege and +// exit. +Expected +objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, + function_ref ErrorCallback) { + DriverConfig DC; + ObjcopyOptTable T; + + const char *const *DashDash = + std::find_if(RawArgsArr.begin(), RawArgsArr.end(), + [](StringRef Str) { return Str == "--"; }); + ArrayRef ArgsArr = makeArrayRef(RawArgsArr.begin(), DashDash); + if (DashDash != RawArgsArr.end()) + DashDash = std::next(DashDash); + + unsigned MissingArgumentIndex, MissingArgumentCount; + llvm::opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) { + printHelp(T, errs(), ToolType::Objcopy); + exit(1); + } + + if (InputArgs.hasArg(OBJCOPY_help)) { + printHelp(T, outs(), ToolType::Objcopy); + exit(0); + } + + if (InputArgs.hasArg(OBJCOPY_version)) { + outs() << "llvm-objcopy, compatible with GNU objcopy\n"; + cl::PrintVersionMessage(); + exit(0); + } + + SmallVector Positional; + + for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) + return createStringError(errc::invalid_argument, "unknown argument '%s'", + Arg->getAsString(InputArgs).c_str()); + + for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT)) + Positional.push_back(Arg->getValue()); + std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); + + if (Positional.empty()) + return createStringError(errc::invalid_argument, "no input file specified"); + + if (Positional.size() > 2) + return createStringError(errc::invalid_argument, + "too many positional arguments"); + + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + COFFConfig &COFFConfig = ConfigMgr.COFF; + ELFConfig &ELFConfig = ConfigMgr.ELF; + MachOConfig &MachOConfig = ConfigMgr.MachO; + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1]; + if (InputArgs.hasArg(OBJCOPY_target) && + (InputArgs.hasArg(OBJCOPY_input_target) || + InputArgs.hasArg(OBJCOPY_output_target))) + return createStringError( + errc::invalid_argument, + "--target cannot be used with --input-target or --output-target"); + + if (InputArgs.hasArg(OBJCOPY_regex) && InputArgs.hasArg(OBJCOPY_wildcard)) + return createStringError(errc::invalid_argument, + "--regex and --wildcard are incompatible"); + + MatchStyle SectionMatchStyle = InputArgs.hasArg(OBJCOPY_regex) + ? MatchStyle::Regex + : MatchStyle::Wildcard; + MatchStyle SymbolMatchStyle + = InputArgs.hasArg(OBJCOPY_regex) ? MatchStyle::Regex + : InputArgs.hasArg(OBJCOPY_wildcard) ? 
MatchStyle::Wildcard + : MatchStyle::Literal; + StringRef InputFormat, OutputFormat; + if (InputArgs.hasArg(OBJCOPY_target)) { + InputFormat = InputArgs.getLastArgValue(OBJCOPY_target); + OutputFormat = InputArgs.getLastArgValue(OBJCOPY_target); + } else { + InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); + OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); + } + + // FIXME: Currently, we ignore the target for non-binary/ihex formats + // explicitly specified by -I option (e.g. -Ielf32-x86-64) and guess the + // format by llvm::object::createBinary regardless of the option value. + Config.InputFormat = StringSwitch(InputFormat) + .Case("binary", FileFormat::Binary) + .Case("ihex", FileFormat::IHex) + .Default(FileFormat::Unspecified); + + if (InputArgs.hasArg(OBJCOPY_new_symbol_visibility)) { + const uint8_t Invalid = 0xff; + StringRef VisibilityStr = + InputArgs.getLastArgValue(OBJCOPY_new_symbol_visibility); + + ELFConfig.NewSymbolVisibility = StringSwitch(VisibilityStr) + .Case("default", ELF::STV_DEFAULT) + .Case("hidden", ELF::STV_HIDDEN) + .Case("internal", ELF::STV_INTERNAL) + .Case("protected", ELF::STV_PROTECTED) + .Default(Invalid); + + if (ELFConfig.NewSymbolVisibility == Invalid) + return createStringError(errc::invalid_argument, + "'%s' is not a valid symbol visibility", + VisibilityStr.str().c_str()); + } + + for (const auto *Arg : InputArgs.filtered(OBJCOPY_subsystem)) { + StringRef Subsystem, Version; + std::tie(Subsystem, Version) = StringRef(Arg->getValue()).split(':'); + COFFConfig.Subsystem = + StringSwitch(Subsystem.lower()) + .Case("boot_application", + COFF::IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION) + .Case("console", COFF::IMAGE_SUBSYSTEM_WINDOWS_CUI) + .Case("efi_application", COFF::IMAGE_SUBSYSTEM_EFI_APPLICATION) + .Case("efi_boot_service_driver", + COFF::IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER) + .Case("efi_rom", COFF::IMAGE_SUBSYSTEM_EFI_ROM) + .Case("efi_runtime_driver", + COFF::IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER) + .Case("native", COFF::IMAGE_SUBSYSTEM_NATIVE) + .Case("posix", COFF::IMAGE_SUBSYSTEM_POSIX_CUI) + .Case("windows", COFF::IMAGE_SUBSYSTEM_WINDOWS_GUI) + .Default(COFF::IMAGE_SUBSYSTEM_UNKNOWN); + if (*COFFConfig.Subsystem == COFF::IMAGE_SUBSYSTEM_UNKNOWN) + return createStringError(errc::invalid_argument, + "'%s' is not a valid subsystem", + Subsystem.str().c_str()); + if (!Version.empty()) { + StringRef Major, Minor; + std::tie(Major, Minor) = Version.split('.'); + unsigned Number; + if (Major.getAsInteger(10, Number)) + return createStringError(errc::invalid_argument, + "'%s' is not a valid subsystem major version", + Major.str().c_str()); + COFFConfig.MajorSubsystemVersion = Number; + Number = 0; + if (!Minor.empty() && Minor.getAsInteger(10, Number)) + return createStringError(errc::invalid_argument, + "'%s' is not a valid subsystem minor version", + Minor.str().c_str()); + COFFConfig.MinorSubsystemVersion = Number; + } + } + + Config.OutputFormat = StringSwitch(OutputFormat) + .Case("binary", FileFormat::Binary) + .Case("ihex", FileFormat::IHex) + .Default(FileFormat::Unspecified); + if (Config.OutputFormat == FileFormat::Unspecified) { + if (OutputFormat.empty()) { + Config.OutputFormat = Config.InputFormat; + } else { + Expected Target = + getOutputTargetInfoByTargetName(OutputFormat); + if (!Target) + return Target.takeError(); + Config.OutputFormat = Target->Format; + Config.OutputArch = Target->Machine; + } + } + + if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, + 
+ + if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, + OBJCOPY_compress_debug_sections_eq)) { + Config.CompressionType = DebugCompressionType::Z; + + if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) { + Config.CompressionType = + StringSwitch<DebugCompressionType>( + InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)) + .Case("zlib", DebugCompressionType::Z) + .Default(DebugCompressionType::None); + if (Config.CompressionType == DebugCompressionType::None) + return createStringError( + errc::invalid_argument, + "invalid or unsupported --compress-debug-sections format: %s", + InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq) + .str() + .c_str()); + } + if (!zlib::isAvailable()) + return createStringError( + errc::invalid_argument, + "LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot compress"); + } + + Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink); + // The gnu_debuglink's target is expected to not change or else its CRC would + // become invalidated and get rejected. We can avoid recalculating the + // checksum for every target file inside an archive by precomputing the CRC + // here. This prevents a significant amount of I/O. + if (!Config.AddGnuDebugLink.empty()) { + auto DebugOrErr = MemoryBuffer::getFile(Config.AddGnuDebugLink); + if (!DebugOrErr) + return createFileError(Config.AddGnuDebugLink, DebugOrErr.getError()); + auto Debug = std::move(*DebugOrErr); + Config.GnuDebugLinkCRC32 = + llvm::crc32(arrayRefFromStringRef(Debug->getBuffer())); + } + Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo); + Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols); + Config.AllocSectionsPrefix = + InputArgs.getLastArgValue(OBJCOPY_prefix_alloc_sections); + if (auto Arg = InputArgs.getLastArg(OBJCOPY_extract_partition)) + Config.ExtractPartition = Arg->getValue(); + + for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) { + if (!StringRef(Arg->getValue()).contains('=')) + return createStringError(errc::invalid_argument, + "bad format for --redefine-sym"); + auto Old2New = StringRef(Arg->getValue()).split('='); + if (!Config.SymbolsToRename.insert(Old2New).second) + return createStringError(errc::invalid_argument, + "multiple redefinition of symbol '%s'", + Old2New.first.str().c_str()); + } + + for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbols)) + if (Error E = addSymbolsToRenameFromFile(Config.SymbolsToRename, DC.Alloc, + Arg->getValue())) + return std::move(E); + + for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) { + Expected<SectionRename> SR = + parseRenameSectionValue(StringRef(Arg->getValue())); + if (!SR) + return SR.takeError(); + if (!Config.SectionsToRename.try_emplace(SR->OriginalName, *SR).second) + return createStringError(errc::invalid_argument, + "multiple renames of section '%s'", + SR->OriginalName.str().c_str()); + } + for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_alignment)) { + Expected<std::pair<StringRef, uint64_t>> NameAndAlign = + parseSetSectionAlignment(Arg->getValue()); + if (!NameAndAlign) + return NameAndAlign.takeError(); + Config.SetSectionAlignment[NameAndAlign->first] = NameAndAlign->second; + } + for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_flags)) { + Expected<SectionFlagsUpdate> SFU = + parseSetSectionFlagValue(Arg->getValue()); + if (!SFU) + return SFU.takeError(); + if (!Config.SetSectionFlags.try_emplace(SFU->Name, *SFU).second) + return createStringError( + errc::invalid_argument, + "--set-section-flags set multiple times for section '%s'", + SFU->Name.str().c_str()); + } + // Prohibit combinations of --set-section-flags when the section name is 
used + // by --rename-section, either as a source or a destination. + for (const auto &E : Config.SectionsToRename) { + const SectionRename &SR = E.second; + if (Config.SetSectionFlags.count(SR.OriginalName)) + return createStringError( + errc::invalid_argument, + "--set-section-flags=%s conflicts with --rename-section=%s=%s", + SR.OriginalName.str().c_str(), SR.OriginalName.str().c_str(), + SR.NewName.str().c_str()); + if (Config.SetSectionFlags.count(SR.NewName)) + return createStringError( + errc::invalid_argument, + "--set-section-flags=%s conflicts with --rename-section=%s=%s", + SR.NewName.str().c_str(), SR.OriginalName.str().c_str(), + SR.NewName.str().c_str()); + } + + for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section)) + if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_section)) + if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_only_section)) + if (Error E = Config.OnlySection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) { + if (Error Err = loadNewSectionData(Arg->getValue(), "--add-section", + Config.AddSection)) + return std::move(Err); + } + for (auto Arg : InputArgs.filtered(OBJCOPY_update_section)) { + if (Error Err = loadNewSectionData(Arg->getValue(), "--update-section", + Config.UpdateSection)) + return std::move(Err); + } + for (auto *Arg : InputArgs.filtered(OBJCOPY_dump_section)) { + StringRef Value(Arg->getValue()); + if (Value.split('=').second.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --dump-section, expected section=file"); + Config.DumpSection.push_back(Value); + } + Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all); + Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu); + Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug); + Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo); + Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections); + Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc); + Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded); + Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo); + Config.ExtractMainPartition = + InputArgs.hasArg(OBJCOPY_extract_main_partition); + ELFConfig.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden); + Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken); + if (auto *Arg = + InputArgs.getLastArg(OBJCOPY_discard_all, OBJCOPY_discard_locals)) { + Config.DiscardMode = Arg->getOption().matches(OBJCOPY_discard_all) + ? 
DiscardType::All + : DiscardType::Locals; + } + Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug); + ELFConfig.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols); + MachOConfig.KeepUndefined = InputArgs.hasArg(OBJCOPY_keep_undefined); + Config.DecompressDebugSections = + InputArgs.hasArg(OBJCOPY_decompress_debug_sections); + if (Config.DiscardMode == DiscardType::All) { + Config.StripDebug = true; + ELFConfig.KeepFileSymbols = true; + } + for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) + if (Error E = Config.SymbolsToLocalize.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbols)) + if (Error E = addSymbolsFromFile(Config.SymbolsToLocalize, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol)) + if (Error E = Config.SymbolsToKeepGlobal.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols)) + if (Error E = addSymbolsFromFile(Config.SymbolsToKeepGlobal, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) + if (Error E = Config.SymbolsToGlobalize.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbols)) + if (Error E = addSymbolsFromFile(Config.SymbolsToGlobalize, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol)) + if (Error E = Config.SymbolsToWeaken.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbols)) + if (Error E = addSymbolsFromFile(Config.SymbolsToWeaken, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol)) + if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbols)) + if (Error E = addSymbolsFromFile(Config.SymbolsToRemove, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbol)) + if (Error E = + Config.UnneededSymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbols)) + if (Error E = addSymbolsFromFile(Config.UnneededSymbolsToRemove, DC.Alloc, + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol)) + if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbols)) + if (Error E = + addSymbolsFromFile(Config.SymbolsToKeep, DC.Alloc, Arg->getValue(), + SymbolMatchStyle, ErrorCallback)) + return std::move(E); + for (auto *Arg : InputArgs.filtered(OBJCOPY_add_symbol)) { + Expected<NewSymbolInfo> SymInfo = parseNewSymbolInfo(Arg->getValue()); + if (!SymInfo) + return SymInfo.takeError(); + + Config.SymbolsToAdd.push_back(*SymInfo); + } + + ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links); + + Config.DeterministicArchives = InputArgs.hasFlag( + OBJCOPY_enable_deterministic_archives, + OBJCOPY_disable_deterministic_archives, /*default=*/true); + + Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates); + + if (Config.PreserveDates && + (Config.OutputFilename == "-" || Config.InputFilename == "-")) + return createStringError(errc::invalid_argument, + "--preserve-dates requires a file"); + + for (auto Arg : InputArgs) + if (Arg->getOption().matches(OBJCOPY_set_start)) { + auto EAddr = getAsInteger<uint64_t>(Arg->getValue()); + if (!EAddr) + return createStringError( + EAddr.getError(), "bad entry point address: '%s'", Arg->getValue()); + + ELFConfig.EntryExpr = [EAddr](uint64_t) { return *EAddr; }; + } else if (Arg->getOption().matches(OBJCOPY_change_start)) { + auto EIncr = getAsInteger<int64_t>(Arg->getValue()); + if (!EIncr) + return createStringError(EIncr.getError(), + "bad entry point increment: '%s'", + Arg->getValue()); + auto Expr = ELFConfig.EntryExpr ? std::move(ELFConfig.EntryExpr) + : [](uint64_t A) { return A; }; + ELFConfig.EntryExpr = [Expr, EIncr](uint64_t EAddr) { + return Expr(EAddr) + *EIncr; + }; + } + + if (Config.DecompressDebugSections && + Config.CompressionType != DebugCompressionType::None) { + return createStringError( + errc::invalid_argument, + "cannot specify both --compress-debug-sections and " + "--decompress-debug-sections"); + } + + if (Config.DecompressDebugSections && !zlib::isAvailable()) + return createStringError( + errc::invalid_argument, + "LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress"); + + if (Config.ExtractPartition && Config.ExtractMainPartition) + return createStringError(errc::invalid_argument, + "cannot specify --extract-partition together with " + "--extract-main-partition"); + + DC.CopyConfigs.push_back(std::move(ConfigMgr)); + return std::move(DC); +} +
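The --set-start/--change-start handling above composes a single entry-point expression: each --change-start wraps whatever function has been built so far, so repeated options accumulate on top of one another. A standalone sketch of the same chaining (the addresses are arbitrary):

#include <cstdint>
#include <functional>

int main() {
  std::function<uint64_t(uint64_t)> EntryExpr;

  // --set-start=0x1000 replaces the entry address outright.
  EntryExpr = [](uint64_t) { return UINT64_C(0x1000); };

  // --change-start=0x20 adds to whatever the previous expression yields.
  std::function<uint64_t(uint64_t)> Prev =
      EntryExpr ? std::move(EntryExpr)
                : std::function<uint64_t(uint64_t)>(
                      [](uint64_t A) { return A; });
  EntryExpr = [Prev](uint64_t Addr) { return Prev(Addr) + 0x20; };

  return EntryExpr(0) == 0x1020 ? 0 : 1; // exits 0: 0x1000 + 0x20
}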
+// parseInstallNameToolOptions returns the config and sets the input arguments. +// If a help flag is set then parseInstallNameToolOptions will print the help +// message and exit. +Expected<DriverConfig> +objcopy::parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr) { + DriverConfig DC; + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + MachOConfig &MachOConfig = ConfigMgr.MachO; + InstallNameToolOptTable T; + unsigned MissingArgumentIndex, MissingArgumentCount; + llvm::opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (MissingArgumentCount) + return createStringError( + errc::invalid_argument, + "missing argument to " + + StringRef(InputArgs.getArgString(MissingArgumentIndex)) + + " option"); + + if (InputArgs.size() == 0) { + printHelp(T, errs(), ToolType::InstallNameTool); + exit(1); + } + + if (InputArgs.hasArg(INSTALL_NAME_TOOL_help)) { + printHelp(T, outs(), ToolType::InstallNameTool); + exit(0); + } + + if (InputArgs.hasArg(INSTALL_NAME_TOOL_version)) { + outs() << "llvm-install-name-tool, compatible with cctools " + "install_name_tool\n"; + cl::PrintVersionMessage(); + exit(0); + } + + for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_add_rpath)) + MachOConfig.RPathToAdd.push_back(Arg->getValue()); + + for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_prepend_rpath)) + MachOConfig.RPathToPrepend.push_back(Arg->getValue()); + + for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_delete_rpath)) { + StringRef RPath = Arg->getValue(); + + // Cannot add and delete the same rpath at the same time. + if (is_contained(MachOConfig.RPathToAdd, RPath)) + return createStringError( + errc::invalid_argument, + "cannot specify both -add_rpath '%s' and -delete_rpath '%s'", + RPath.str().c_str(), RPath.str().c_str()); + if (is_contained(MachOConfig.RPathToPrepend, RPath)) + return createStringError( + errc::invalid_argument, + "cannot specify both -prepend_rpath '%s' and -delete_rpath '%s'", + RPath.str().c_str(), RPath.str().c_str()); + + MachOConfig.RPathsToRemove.insert(RPath); + } + + for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_rpath)) { + StringRef Old = Arg->getValue(0); + StringRef New = Arg->getValue(1); + + auto Match = [=](StringRef RPath) { return RPath == Old || RPath == New; }; + + // Cannot specify duplicate -rpath entries + auto It1 = find_if( + MachOConfig.RPathsToUpdate, + [&Match](const DenseMap<StringRef, StringRef>::value_type &OldNew) { + return Match(OldNew.getFirst()) || Match(OldNew.getSecond()); + }); + if (It1 != MachOConfig.RPathsToUpdate.end()) + return createStringError(errc::invalid_argument, + "cannot specify both -rpath '" + + It1->getFirst() + "' '" + It1->getSecond() + + "' and -rpath '" + Old + "' '" + New + "'"); + + // Cannot specify the same rpath under both -delete_rpath and -rpath + auto It2 = find_if(MachOConfig.RPathsToRemove, Match); + if (It2 != MachOConfig.RPathsToRemove.end()) + return createStringError(errc::invalid_argument, + "cannot specify both -delete_rpath '" + *It2 + + "' and -rpath '" + Old + "' '" + New + "'"); + + // Cannot specify the same rpath under both -add_rpath and -rpath + auto It3 = find_if(MachOConfig.RPathToAdd, Match); + if (It3 != MachOConfig.RPathToAdd.end()) + return createStringError(errc::invalid_argument, + "cannot specify both -add_rpath '" + *It3 + + "' and -rpath '" + Old + "' '" + New + "'"); + + // Cannot specify the same rpath under both -prepend_rpath and -rpath. 
+ auto It4 = find_if(MachOConfig.RPathToPrepend, Match); + if (It4 != MachOConfig.RPathToPrepend.end()) + return createStringError(errc::invalid_argument, + "cannot specify both -prepend_rpath '" + *It4 + + "' and -rpath '" + Old + "' '" + New + "'"); + + MachOConfig.RPathsToUpdate.insert({Old, New}); + } + + if (auto *Arg = InputArgs.getLastArg(INSTALL_NAME_TOOL_id)) { + MachOConfig.SharedLibId = Arg->getValue(); + if (MachOConfig.SharedLibId->empty()) + return createStringError(errc::invalid_argument, + "cannot specify an empty id"); + } + + for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_change)) + MachOConfig.InstallNamesToUpdate.insert( + {Arg->getValue(0), Arg->getValue(1)}); + + MachOConfig.RemoveAllRpaths = + InputArgs.hasArg(INSTALL_NAME_TOOL_delete_all_rpaths); + + SmallVector<StringRef, 2> Positional; + for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_UNKNOWN)) + return createStringError(errc::invalid_argument, "unknown argument '%s'", + Arg->getAsString(InputArgs).c_str()); + for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_INPUT)) + Positional.push_back(Arg->getValue()); + if (Positional.empty()) + return createStringError(errc::invalid_argument, "no input file specified"); + if (Positional.size() > 1) + return createStringError( + errc::invalid_argument, + "llvm-install-name-tool expects a single input file"); + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[0]; + + DC.CopyConfigs.push_back(std::move(ConfigMgr)); + return std::move(DC); +}
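The rpath validation above is a series of pairwise-conflict checks over small lists, for which llvm::is_contained (a linear scan) is entirely adequate. A minimal sketch of the -add_rpath/-delete_rpath conflict, with an illustrative helper name:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

static bool addAndDeleteConflict(
    const llvm::SmallVectorImpl<llvm::StringRef> &RPathsToAdd,
    llvm::StringRef RPathToDelete) {
  // Adding and deleting the same rpath in one invocation is rejected.
  return llvm::is_contained(RPathsToAdd, RPathToDelete);
}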
+ +Expected<DriverConfig> +objcopy::parseBitcodeStripOptions(ArrayRef<const char *> ArgsArr, + function_ref<Error(Error)> ErrorCallback) { + DriverConfig DC; + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + MachOConfig &MachOConfig = ConfigMgr.MachO; + BitcodeStripOptTable T; + unsigned MissingArgumentIndex, MissingArgumentCount; + opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (InputArgs.size() == 0) { + printHelp(T, errs(), ToolType::BitcodeStrip); + exit(1); + } + + if (InputArgs.hasArg(BITCODE_STRIP_help)) { + printHelp(T, outs(), ToolType::BitcodeStrip); + exit(0); + } + + if (InputArgs.hasArg(BITCODE_STRIP_version)) { + outs() << "llvm-bitcode-strip, compatible with cctools " + "bitcode_strip\n"; + cl::PrintVersionMessage(); + exit(0); + } + + for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_UNKNOWN)) + return createStringError(errc::invalid_argument, "unknown argument '%s'", + Arg->getAsString(InputArgs).c_str()); + + SmallVector<StringRef, 2> Positional; + for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + if (Positional.size() > 1) + return createStringError(errc::invalid_argument, + "llvm-bitcode-strip expects a single input file"); + assert(!Positional.empty()); + Config.InputFilename = Positional[0]; + + if (!InputArgs.hasArg(BITCODE_STRIP_output)) { + return createStringError(errc::invalid_argument, + "-o is a required argument"); + } + Config.OutputFilename = InputArgs.getLastArgValue(BITCODE_STRIP_output); + + if (!InputArgs.hasArg(BITCODE_STRIP_remove)) + return createStringError(errc::invalid_argument, "no action specified"); + + // We only support -r for now, which removes all bitcode sections and + // the __LLVM segment if it's now empty. + cantFail(Config.ToRemove.addMatcher(NameOrPattern::create( + "__LLVM,__bundle", MatchStyle::Literal, ErrorCallback))); + MachOConfig.EmptySegmentsToRemove.insert("__LLVM"); + + DC.CopyConfigs.push_back(std::move(ConfigMgr)); + return std::move(DC); +} + +// parseStripOptions returns the config and sets the input arguments. If a +// help flag is set then parseStripOptions will print the help message and +// exit. +Expected<DriverConfig> +objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr, + function_ref<Error(Error)> ErrorCallback) { + const char *const *DashDash = + std::find_if(RawArgsArr.begin(), RawArgsArr.end(), + [](StringRef Str) { return Str == "--"; }); + ArrayRef<const char *> ArgsArr = makeArrayRef(RawArgsArr.begin(), DashDash); + if (DashDash != RawArgsArr.end()) + DashDash = std::next(DashDash); + + StripOptTable T; + unsigned MissingArgumentIndex, MissingArgumentCount; + llvm::opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) { + printHelp(T, errs(), ToolType::Strip); + exit(1); + } + + if (InputArgs.hasArg(STRIP_help)) { + printHelp(T, outs(), ToolType::Strip); + exit(0); + } + + if (InputArgs.hasArg(STRIP_version)) { + outs() << "llvm-strip, compatible with GNU strip\n"; + cl::PrintVersionMessage(); + exit(0); + } + + SmallVector<StringRef, 2> Positional; + for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN)) + return createStringError(errc::invalid_argument, "unknown argument '%s'", + Arg->getAsString(InputArgs).c_str()); + for (auto Arg : InputArgs.filtered(STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); + + if (Positional.empty()) + return createStringError(errc::invalid_argument, "no input file specified"); + + if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) + return createStringError( + errc::invalid_argument, + "multiple input files cannot be used in combination with -o"); + + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + ELFConfig &ELFConfig = ConfigMgr.ELF; + MachOConfig &MachOConfig = ConfigMgr.MachO; + + if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard)) + return createStringError(errc::invalid_argument, + "--regex and --wildcard are incompatible"); + MatchStyle SectionMatchStyle = + InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex : MatchStyle::Wildcard; + MatchStyle SymbolMatchStyle + = InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex + : InputArgs.hasArg(STRIP_wildcard) ? MatchStyle::Wildcard + : MatchStyle::Literal; + ELFConfig.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links); + Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); + + if (auto *Arg = InputArgs.getLastArg(STRIP_discard_all, STRIP_discard_locals)) + Config.DiscardMode = Arg->getOption().matches(STRIP_discard_all) + ? DiscardType::All + : DiscardType::Locals; + Config.StripSections = InputArgs.hasArg(STRIP_strip_sections); + Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded); + if (auto Arg = InputArgs.getLastArg(STRIP_strip_all, STRIP_no_strip_all)) + Config.StripAll = Arg->getOption().getID() == STRIP_strip_all; + Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu); + MachOConfig.StripSwiftSymbols = InputArgs.hasArg(STRIP_strip_swift_symbols); + Config.OnlyKeepDebug = InputArgs.hasArg(STRIP_only_keep_debug); + ELFConfig.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols); + MachOConfig.KeepUndefined = InputArgs.hasArg(STRIP_keep_undefined); + + for (auto Arg : InputArgs.filtered(STRIP_keep_section)) + if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + + for (auto Arg : InputArgs.filtered(STRIP_remove_section)) + if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + + for (auto Arg : InputArgs.filtered(STRIP_strip_symbol)) + if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + + for (auto Arg : InputArgs.filtered(STRIP_keep_symbol)) + if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); + + if (!InputArgs.hasArg(STRIP_no_strip_all) && !Config.StripDebug && + !Config.OnlyKeepDebug && !Config.StripUnneeded && + Config.DiscardMode == DiscardType::None && !Config.StripAllGNU && + Config.SymbolsToRemove.empty()) + Config.StripAll = true; + + if (Config.DiscardMode == DiscardType::All) { + Config.StripDebug = true; + ELFConfig.KeepFileSymbols = true; + } + + Config.DeterministicArchives = + InputArgs.hasFlag(STRIP_enable_deterministic_archives, + STRIP_disable_deterministic_archives, /*default=*/true); + + Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates); + Config.InputFormat = FileFormat::Unspecified; + Config.OutputFormat = FileFormat::Unspecified; + + DriverConfig DC; + if (Positional.size() == 1) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = + InputArgs.getLastArgValue(STRIP_output, Positional[0]); + DC.CopyConfigs.push_back(std::move(ConfigMgr)); + } else { + StringMap<unsigned> InputFiles; + for (StringRef Filename : Positional) { + if (InputFiles[Filename]++ == 1) { + if (Filename == "-") + return createStringError( + errc::invalid_argument, + "cannot specify '-' as an input file more than once"); + if (Error E = ErrorCallback(createStringError( + errc::invalid_argument, "'%s' was already specified", + Filename.str().c_str()))) + return std::move(E); + } + Config.InputFilename = Filename; + Config.OutputFilename = Filename; + DC.CopyConfigs.push_back(ConfigMgr); + } + } + + if (Config.PreserveDates && (is_contained(Positional, "-") || + InputArgs.getLastArgValue(STRIP_output) == "-")) + return createStringError(errc::invalid_argument, + "--preserve-dates requires a file"); + + return std::move(DC); +}
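The multi-input path of parseStripOptions above detects repeated files with a StringMap occurrence counter; the post-increment test fires exactly on the second mention of a file. The same idiom in isolation (helper name is illustrative):

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include <vector>

static std::vector<llvm::StringRef>
findDuplicates(const std::vector<llvm::StringRef> &Inputs) {
  llvm::StringMap<unsigned> Seen;
  std::vector<llvm::StringRef> Dupes;
  for (llvm::StringRef F : Inputs)
    if (Seen[F]++ == 1) // 0 on first sight, 1 on the second: flag once
      Dupes.push_back(F);
  return Dupes;
}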
 diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.h b/llvm/tools/llvm-objcopy/ObjcopyOptions.h new file mode 100644 index 000000000000..f7fa2af304d7 --- /dev/null +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.h @@ -0,0 +1,58 @@ +//===- ObjcopyOptions.h ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H +#define LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H + +#include "llvm/ObjCopy/ConfigManager.h" +#include "llvm/Support/Allocator.h" +#include <vector> + +namespace llvm { +namespace objcopy { + +// Configuration for the overall invocation of this tool. When invoked as +// objcopy, will always contain exactly one CopyConfig. When invoked as strip, +// will contain one or more CopyConfigs. +struct DriverConfig { + SmallVector<ConfigManager> CopyConfigs; + BumpPtrAllocator Alloc; +}; + +// ParseObjcopyOptions returns the config and sets the input arguments. If a +// help flag is set then ParseObjcopyOptions will print the help message and +// exit. ErrorCallback is used to handle recoverable errors. An Error returned +// by the callback aborts the parsing and is then returned by this function. +Expected<DriverConfig> +parseObjcopyOptions(ArrayRef<const char *> ArgsArr, + llvm::function_ref<Error(Error)> ErrorCallback); + +// ParseInstallNameToolOptions returns the config and sets the input arguments. +// If a help flag is set then ParseInstallNameToolOptions will print the help +// message and exit. +Expected<DriverConfig> +parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr); + +// ParseBitcodeStripOptions returns the config and sets the input arguments. +// If a help flag is set then ParseBitcodeStripOptions will print the help +// message and exit. +Expected<DriverConfig> +parseBitcodeStripOptions(ArrayRef<const char *> ArgsArr, + llvm::function_ref<Error(Error)> ErrorCallback); + +// ParseStripOptions returns the config and sets the input arguments. If a +// help flag is set then ParseStripOptions will print the help message and +// exit. ErrorCallback is used to handle recoverable errors. An Error returned +// by the callback aborts the parsing and is then returned by this function. +Expected<DriverConfig> +parseStripOptions(ArrayRef<const char *> ArgsArr, + llvm::function_ref<Error(Error)> ErrorCallback); +} // namespace objcopy +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H
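A sketch of how a driver might consume the interface declared above; the warning callback follows the tools' convention of printing and returning success so parsing continues (the run helper and its body are illustrative, not the tool's actual driver):

#include "ObjcopyOptions.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

static llvm::Error reportWarning(llvm::Error E) {
  llvm::errs() << "warning: " << llvm::toString(std::move(E)) << '\n';
  return llvm::Error::success(); // recoverable: keep parsing
}

static llvm::Error run(llvm::ArrayRef<const char *> Args) {
  llvm::Expected<llvm::objcopy::DriverConfig> DC =
      llvm::objcopy::parseObjcopyOptions(Args, reportWarning);
  if (!DC)
    return DC.takeError(); // includes errors forwarded from the callback
  for (llvm::objcopy::ConfigManager &CM : DC->CopyConfigs)
    (void)CM; // each per-file config is handed to the copy routine
  return llvm::Error::success();
}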
Supported " - "styles: 'zlib-gnu' and 'zlib'">; + "formats: 'zlib'">; def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">, HelpText<"Decompress DWARF debug sections.">; defm split_dwo @@ -222,5 +222,5 @@ defm add_symbol MetaVarName<"name=[section:]value[,flags]">; defm update_section - : Eq<"update-section", "Add section with contents from a file .">, + : Eq<"update-section", "Replace the contents of section with contents from a file .">, MetaVarName<"name=file">; diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp index a5963985f78a..aa262152ed64 100644 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp @@ -6,23 +6,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm-objcopy.h" -#include "COFF/COFFConfig.h" -#include "COFF/COFFObjcopy.h" -#include "CommonConfig.h" -#include "ConfigManager.h" -#include "ELF/ELFConfig.h" -#include "ELF/ELFObjcopy.h" -#include "MachO/MachOConfig.h" -#include "MachO/MachOObjcopy.h" -#include "wasm/WasmConfig.h" -#include "wasm/WasmObjcopy.h" - +#include "ObjcopyOptions.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/COFF/COFFObjcopy.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/ELF/ELFConfig.h" +#include "llvm/ObjCopy/ELF/ELFObjcopy.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/ObjCopy/wasm/WasmConfig.h" +#include "llvm/ObjCopy/wasm/WasmObjcopy.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/Binary.h" @@ -42,6 +41,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileUtilities.h" #include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/Memory.h" @@ -87,7 +87,7 @@ static Expected getDriverConfig(ArrayRef Args) { }; if (Is("bitcode-strip") || Is("bitcode_strip")) - return parseBitcodeStripOptions(Args); + return parseBitcodeStripOptions(Args, reportWarning); else if (Is("strip")) return parseStripOptions(Args, reportWarning); else if (Is("install-name-tool") || Is("install_name_tool")) @@ -96,40 +96,6 @@ static Expected getDriverConfig(ArrayRef Args) { return parseObjcopyOptions(Args, reportWarning); } -// For regular archives this function simply calls llvm::writeArchive, -// For thin archives it writes the archive file itself as well as its members. -static Error deepWriteArchive(StringRef ArcName, - ArrayRef NewMembers, - bool WriteSymtab, object::Archive::Kind Kind, - bool Deterministic, bool Thin) { - if (Error E = writeArchive(ArcName, NewMembers, WriteSymtab, Kind, - Deterministic, Thin)) - return createFileError(ArcName, std::move(E)); - - if (!Thin) - return Error::success(); - - for (const NewArchiveMember &Member : NewMembers) { - // For regular files (as is the case for deepWriteArchive), - // FileOutputBuffer::create will return OnDiskBuffer. - // OnDiskBuffer uses a temporary file and then renames it. So in reality - // there is no inefficiency / duplicated in-memory buffers in this case. 
For - // now in-memory buffers can not be completely avoided since - // NewArchiveMember still requires them even though writeArchive does not - // write them on disk. - Expected> FB = - FileOutputBuffer::create(Member.MemberName, Member.Buf->getBufferSize(), - FileOutputBuffer::F_executable); - if (!FB) - return FB.takeError(); - std::copy(Member.Buf->getBufferStart(), Member.Buf->getBufferEnd(), - (*FB)->getBufferStart()); - if (Error E = (*FB)->commit()) - return E; - } - return Error::success(); -} - /// The function executeObjcopyOnIHex does the dispatch based on the format /// of the output specified by the command line options. static Error executeObjcopyOnIHex(ConfigManager &ConfigMgr, MemoryBuffer &In, @@ -166,162 +132,16 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr, llvm_unreachable("unsupported output format"); } -/// The function executeObjcopyOnBinary does the dispatch based on the format -/// of the input binary (ELF, MachO or COFF). -static Error executeObjcopyOnBinary(const MultiFormatConfig &Config, - object::Binary &In, raw_ostream &Out) { - if (auto *ELFBinary = dyn_cast(&In)) { - Expected ELFConfig = Config.getELFConfig(); - if (!ELFConfig) - return ELFConfig.takeError(); - - return elf::executeObjcopyOnBinary(Config.getCommonConfig(), *ELFConfig, - *ELFBinary, Out); - } else if (auto *COFFBinary = dyn_cast(&In)) { - Expected COFFConfig = Config.getCOFFConfig(); - if (!COFFConfig) - return COFFConfig.takeError(); - - return coff::executeObjcopyOnBinary(Config.getCommonConfig(), *COFFConfig, - *COFFBinary, Out); - } else if (auto *MachOBinary = dyn_cast(&In)) { - Expected MachOConfig = Config.getMachOConfig(); - if (!MachOConfig) - return MachOConfig.takeError(); - - return macho::executeObjcopyOnBinary(Config.getCommonConfig(), *MachOConfig, - *MachOBinary, Out); - } else if (auto *MachOUniversalBinary = - dyn_cast(&In)) { - return macho::executeObjcopyOnMachOUniversalBinary( - Config, *MachOUniversalBinary, Out); - } else if (auto *WasmBinary = dyn_cast(&In)) { - Expected WasmConfig = Config.getWasmConfig(); - if (!WasmConfig) - return WasmConfig.takeError(); - - return objcopy::wasm::executeObjcopyOnBinary(Config.getCommonConfig(), - *WasmConfig, *WasmBinary, Out); - } else - return createStringError(object_error::invalid_file_type, - "unsupported object file format"); -} - -namespace llvm { -namespace objcopy { - -Expected> -createNewArchiveMembers(const MultiFormatConfig &Config, const Archive &Ar) { - std::vector NewArchiveMembers; - Error Err = Error::success(); - for (const Archive::Child &Child : Ar.children(Err)) { - Expected ChildNameOrErr = Child.getName(); - if (!ChildNameOrErr) - return createFileError(Ar.getFileName(), ChildNameOrErr.takeError()); - - Expected> ChildOrErr = Child.getAsBinary(); - if (!ChildOrErr) - return createFileError(Ar.getFileName() + "(" + *ChildNameOrErr + ")", - ChildOrErr.takeError()); - - SmallVector Buffer; - raw_svector_ostream MemStream(Buffer); - - if (Error E = executeObjcopyOnBinary(Config, *ChildOrErr->get(), MemStream)) - return std::move(E); - - Expected Member = NewArchiveMember::getOldMember( - Child, Config.getCommonConfig().DeterministicArchives); - if (!Member) - return createFileError(Ar.getFileName(), Member.takeError()); - - Member->Buf = std::make_unique( - std::move(Buffer), ChildNameOrErr.get(), - /*RequiresNullTerminator=*/false); - Member->MemberName = Member->Buf->getBufferIdentifier(); - NewArchiveMembers.push_back(std::move(*Member)); - } - if (Err) - return 
createFileError(Config.getCommonConfig().InputFilename, - std::move(Err)); - return std::move(NewArchiveMembers); -} - -} // end namespace objcopy -} // end namespace llvm - -static Error executeObjcopyOnArchive(const ConfigManager &ConfigMgr, - const object::Archive &Ar) { - Expected<std::vector<NewArchiveMember>> NewArchiveMembersOrErr = - createNewArchiveMembers(ConfigMgr, Ar); - if (!NewArchiveMembersOrErr) - return NewArchiveMembersOrErr.takeError(); - const CommonConfig &Config = ConfigMgr.getCommonConfig(); - return deepWriteArchive(Config.OutputFilename, *NewArchiveMembersOrErr, - Ar.hasSymbolTable(), Ar.kind(), - Config.DeterministicArchives, Ar.isThin()); -} - -static Error restoreStatOnFile(StringRef Filename, - const sys::fs::file_status &Stat, - const ConfigManager &ConfigMgr) { - int FD; - const CommonConfig &Config = ConfigMgr.getCommonConfig(); - - // Writing to stdout should not be treated as an error here, just - // do not set access/modification times or permissions. - if (Filename == "-") - return Error::success(); - - if (auto EC = - sys::fs::openFileForWrite(Filename, FD, sys::fs::CD_OpenExisting)) - return createFileError(Filename, EC); - - if (Config.PreserveDates) - if (auto EC = sys::fs::setLastAccessAndModificationTime( - FD, Stat.getLastAccessedTime(), Stat.getLastModificationTime())) - return createFileError(Filename, EC); - - sys::fs::file_status OStat; - if (std::error_code EC = sys::fs::status(FD, OStat)) - return createFileError(Filename, EC); - if (OStat.type() == sys::fs::file_type::regular_file) { -#ifndef _WIN32 - // Keep ownership if llvm-objcopy is called under root. - if (Config.InputFilename == Config.OutputFilename && OStat.getUser() == 0) - sys::fs::changeFileOwnership(FD, Stat.getUser(), Stat.getGroup()); -#endif - - sys::fs::perms Perm = Stat.permissions(); - if (Config.InputFilename != Config.OutputFilename) - Perm = static_cast<sys::fs::perms>(Perm & ~sys::fs::getUmask() & ~06000); -#ifdef _WIN32 - if (auto EC = sys::fs::setPermissions(Filename, Perm)) -#else - if (auto EC = sys::fs::setPermissions(FD, Perm)) -#endif - return createFileError(Filename, EC); - } - - if (auto EC = sys::Process::SafelyCloseFileDescriptor(FD)) - return createFileError(Filename, EC); - - return Error::success(); -} - /// The function executeObjcopy does the higher level dispatch based on the type /// of input (raw binary, archive or single object file) and takes care of the /// format-agnostic modifications, i.e. preserving dates. 
static Error executeObjcopy(ConfigManager &ConfigMgr) { CommonConfig &Config = ConfigMgr.Common; - sys::fs::file_status Stat; - if (Config.InputFilename != "-") { - if (auto EC = sys::fs::status(Config.InputFilename, Stat)) - return createFileError(Config.InputFilename, EC); - } else { - Stat.permissions(static_cast<sys::fs::perms>(0777)); - } + Expected<FilePermissionsApplier> PermsApplierOrErr = + FilePermissionsApplier::create(Config.InputFilename); + if (!PermsApplierOrErr) + return PermsApplierOrErr.takeError(); std::function<Error(raw_ostream &)> ObjcopyFunc; @@ -390,19 +210,20 @@ static Error executeObjcopy(ConfigManager &ConfigMgr) { } } - if (Error E = restoreStatOnFile(Config.OutputFilename, Stat, ConfigMgr)) + if (Error E = + PermsApplierOrErr->apply(Config.OutputFilename, Config.PreserveDates)) return E; - if (!Config.SplitDWO.empty()) { - Stat.permissions(static_cast<sys::fs::perms>(0666)); - if (Error E = restoreStatOnFile(Config.SplitDWO, Stat, ConfigMgr)) + if (!Config.SplitDWO.empty()) + if (Error E = + PermsApplierOrErr->apply(Config.SplitDWO, Config.PreserveDates, + static_cast<sys::fs::perms>(0666))) return E; - } return Error::success(); } -int main(int argc, char **argv) { +int llvm_objcopy_main(int argc, char **argv) { InitLLVM X(argc, argv); ToolName = argv[0]; diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.h b/llvm/tools/llvm-objcopy/llvm-objcopy.h deleted file mode 100644 index 182c95dc64c8..000000000000 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.h +++ /dev/null @@ -1,34 +0,0 @@ -//===- llvm-objcopy.h -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OBJCOPY_OBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_OBJCOPY_H - -#include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -struct NewArchiveMember; - -namespace object { - -class Archive; - -} // end namespace object - -namespace objcopy { -class MultiFormatConfig; -Expected<std::vector<NewArchiveMember>> -createNewArchiveMembers(const MultiFormatConfig &Config, - const object::Archive &Ar); - -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_OBJCOPY_OBJCOPY_H
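The new executeObjcopy body above replaces the hand-rolled stat/chmod logic with llvm::FilePermissionsApplier from llvm/Support/FileUtilities.h: capture the input's status once, then stamp permissions (and optionally dates) onto each output, with an optional override as done for the --split-dwo file. A sketch of that flow, assuming the create/apply signatures as of this import; the helper name is illustrative:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileUtilities.h"

static llvm::Error finalizeOutput(llvm::StringRef In, llvm::StringRef Out,
                                  bool PreserveDates) {
  llvm::Expected<llvm::FilePermissionsApplier> Applier =
      llvm::FilePermissionsApplier::create(In);
  if (!Applier)
    return Applier.takeError();
  // ... the output file is written here ...
  return Applier->apply(Out, PreserveDates); // copy perms, optionally dates
}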
 diff --git a/llvm/tools/llvm-objcopy/wasm/Object.cpp b/llvm/tools/llvm-objcopy/wasm/Object.cpp deleted file mode 100644 index e7a2956fedca..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Object.cpp +++ /dev/null @@ -1,34 +0,0 @@ -//===- Object.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Object.h" - -#include "llvm/Support/LEB128.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { -namespace objcopy { -namespace wasm { - -using namespace object; -using namespace llvm::wasm; - -void Object::addSectionWithOwnedContents( - Section NewSection, std::unique_ptr<MemoryBuffer> &&Content) { - Sections.push_back(NewSection); - OwnedContents.emplace_back(std::move(Content)); -} - -void Object::removeSections(function_ref<bool(const Section &)> ToRemove) { - // TODO: remove reloc sections for the removed section, handle symbols, etc. - llvm::erase_if(Sections, ToRemove); -} - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/wasm/Object.h b/llvm/tools/llvm-objcopy/wasm/Object.h deleted file mode 100644 index 9db91c41e2e2..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Object.h +++ /dev/null @@ -1,47 +0,0 @@ -//===- Object.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Object/Wasm.h" -#include "llvm/Support/MemoryBuffer.h" -#include <vector> - -namespace llvm { -namespace objcopy { -namespace wasm { - -struct Section { - // For now, each section is only an opaque binary blob with no distinction - // between custom and known sections. - uint8_t SectionType; - StringRef Name; - ArrayRef<uint8_t> Contents; -}; - -struct Object { - llvm::wasm::WasmObjectHeader Header; - // For now don't discriminate between kinds of sections. - std::vector<Section> Sections;
 - - void addSectionWithOwnedContents(Section NewSection, - std::unique_ptr<MemoryBuffer> &&Content); - void removeSections(function_ref<bool(const Section &)> ToRemove); - -private: - std::vector<std::unique_ptr<MemoryBuffer>> OwnedContents; -}; - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/wasm/Reader.cpp b/llvm/tools/llvm-objcopy/wasm/Reader.cpp deleted file mode 100644 index 13fa84ad8020..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Reader.cpp +++ /dev/null @@ -1,33 +0,0 @@ -//===- Reader.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Reader.h" - -namespace llvm { -namespace objcopy { -namespace wasm { - -using namespace object; -using namespace llvm::wasm; - -Expected<std::unique_ptr<Object>> Reader::create() const { - auto Obj = std::make_unique<Object>(); - Obj->Header = WasmObj.getHeader(); - std::vector<Section> Sections;
 - Obj->Sections.reserve(WasmObj.getNumSections()); - for (const SectionRef &Sec : WasmObj.sections()) { - const WasmSection &WS = WasmObj.getWasmSection(Sec); - Obj->Sections.push_back( - {static_cast<uint8_t>(WS.Type), WS.Name, WS.Content}); - } - return std::move(Obj); -} - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/wasm/Reader.h b/llvm/tools/llvm-objcopy/wasm/Reader.h deleted file mode 100644 index 2dcf7dde029a..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Reader.h +++ /dev/null @@ -1,31 +0,0 @@ -//===- Reader.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H - -#include "Object.h" - -namespace llvm { -namespace objcopy { -namespace wasm { - -class Reader { -public: - explicit Reader(const object::WasmObjectFile &O) : WasmObj(O) {} - Expected<std::unique_ptr<Object>> create() const; - -private: - const object::WasmObjectFile &WasmObj; -}; - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H diff --git a/llvm/tools/llvm-objcopy/wasm/WasmConfig.h b/llvm/tools/llvm-objcopy/wasm/WasmConfig.h deleted file mode 100644 index 4e40926ae453..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/WasmConfig.h +++ /dev/null @@ -1,21 +0,0 @@ -//===- WasmConfig.h ---------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H - -namespace llvm { -namespace objcopy { - -// Wasm specific configuration for copying/stripping a single file. -struct WasmConfig {}; - -} // namespace objcopy -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H diff --git a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp b/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp deleted file mode 100644 index 397d09757e54..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===- WasmObjcopy.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "WasmObjcopy.h" -#include "CommonConfig.h" -#include "Object.h" -#include "Reader.h" -#include "Writer.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/FileOutputBuffer.h" - -namespace llvm { -namespace objcopy { -namespace wasm { - -using namespace object; -using SectionPred = std::function<bool(const Section &Sec)>; - -static bool isDebugSection(const Section &Sec) { - return Sec.Name.startswith(".debug"); -} - -static bool isLinkerSection(const Section &Sec) { - return Sec.Name.startswith("reloc.") || Sec.Name == "linking"; -} - -static bool isNameSection(const Section &Sec) { return Sec.Name == "name"; } - -// Sections which are known to be "comments" or informational and do not affect -// program semantics. -static bool isCommentSection(const Section &Sec) { - return Sec.Name == "producers"; -} - -static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { - for (const Section &Sec : Obj.Sections) { - if (Sec.Name == SecName) { - ArrayRef<uint8_t> Contents = Sec.Contents; - Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr = - FileOutputBuffer::create(Filename, Contents.size()); - if (!BufferOrErr) - return BufferOrErr.takeError(); - std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr); - std::copy(Contents.begin(), Contents.end(), Buf->getBufferStart()); - if (Error E = Buf->commit()) - return E; - return Error::success(); - } - } - return createStringError(errc::invalid_argument, "section '%s' not found", - SecName.str().c_str()); -} - -static void removeSections(const CommonConfig &Config, Object &Obj) { - SectionPred RemovePred = [](const Section &) { return false; }; - - // Explicitly-requested sections. - if (!Config.ToRemove.empty()) { - RemovePred = [&Config](const Section &Sec) { - return Config.ToRemove.matches(Sec.Name); - }; - } - - if (Config.StripDebug) { - RemovePred = [RemovePred](const Section &Sec) { - return RemovePred(Sec) || isDebugSection(Sec); - }; - } - - if (Config.StripAll) { - RemovePred = [RemovePred](const Section &Sec) { - return RemovePred(Sec) || isDebugSection(Sec) || isLinkerSection(Sec) || - isNameSection(Sec) || isCommentSection(Sec); - }; - } - - if (Config.OnlyKeepDebug) { - RemovePred = [&Config](const Section &Sec) { - // Keep debug sections, unless explicitly requested to remove. - // Remove everything else, including known sections. - return Config.ToRemove.matches(Sec.Name) || !isDebugSection(Sec); - }; - } - - if (!Config.OnlySection.empty()) { - RemovePred = [&Config](const Section &Sec) { - // Explicitly keep these sections regardless of previous removes. - // Remove everything else, inluding known sections. - return !Config.OnlySection.matches(Sec.Name); - }; - } - - if (!Config.KeepSection.empty()) { - RemovePred = [&Config, RemovePred](const Section &Sec) { - // Explicitly keep these sections regardless of previous removes. - if (Config.KeepSection.matches(Sec.Name)) - return false; - // Otherwise defer to RemovePred. - return RemovePred(Sec); - }; - } - - Obj.removeSections(RemovePred); -} - -static Error handleArgs(const CommonConfig &Config, Object &Obj) { - // Only support AddSection, DumpSection, RemoveSection for now. 
- for (StringRef Flag : Config.DumpSection) { - StringRef SecName; - StringRef FileName; - std::tie(SecName, FileName) = Flag.split("="); - if (Error E = dumpSectionToFile(SecName, FileName, Obj)) - return createFileError(FileName, std::move(E)); - } - - removeSections(Config, Obj); - - for (StringRef Flag : Config.AddSection) { - StringRef SecName, FileName; - std::tie(SecName, FileName) = Flag.split("="); - ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = - MemoryBuffer::getFile(FileName); - if (!BufOrErr) - return createFileError(FileName, errorCodeToError(BufOrErr.getError())); - Section Sec; - Sec.SectionType = llvm::wasm::WASM_SEC_CUSTOM; - Sec.Name = SecName; - std::unique_ptr<MemoryBuffer> Buf = std::move(*BufOrErr); - Sec.Contents = makeArrayRef( - reinterpret_cast<const uint8_t *>(Buf->getBufferStart()), - Buf->getBufferSize()); - Obj.addSectionWithOwnedContents(Sec, std::move(Buf)); - } - - return Error::success(); -} - -Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, - object::WasmObjectFile &In, raw_ostream &Out) { - Reader TheReader(In); - Expected<std::unique_ptr<Object>> ObjOrErr = TheReader.create(); - if (!ObjOrErr) - return createFileError(Config.InputFilename, ObjOrErr.takeError()); - Object *Obj = ObjOrErr->get(); - assert(Obj && "Unable to deserialize Wasm object"); - if (Error E = handleArgs(Config, *Obj)) - return E; - Writer TheWriter(*Obj, Out); - if (Error E = TheWriter.write()) - return createFileError(Config.OutputFilename, std::move(E)); - return Error::success(); -} - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h b/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h deleted file mode 100644 index 28268e38c584..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h +++ /dev/null @@ -1,32 +0,0 @@ -//===- WasmObjcopy.h -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H - -namespace llvm { -class Error; -class raw_ostream; - -namespace object { -class WasmObjectFile; -} // end namespace object - -namespace objcopy { -struct CommonConfig; -struct WasmConfig; - -namespace wasm { -Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, - object::WasmObjectFile &In, raw_ostream &Out); - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.cpp b/llvm/tools/llvm-objcopy/wasm/Writer.cpp deleted file mode 100644 index 2fad9e60c50f..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Writer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -//===- Writer.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Writer.h" -#include "llvm/BinaryFormat/Wasm.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/LEB128.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { -namespace objcopy { -namespace wasm { - -using namespace object; -using namespace llvm::wasm; - -Writer::SectionHeader Writer::createSectionHeader(const Section &S, - size_t &SectionSize) { - SectionHeader Header; - raw_svector_ostream OS(Header); - OS << S.SectionType; - bool HasName = S.SectionType == WASM_SEC_CUSTOM; - SectionSize = S.Contents.size(); - if (HasName) - SectionSize += getULEB128Size(S.Name.size()) + S.Name.size(); - // Pad the LEB value out to 5 bytes to make it a predictable size, and - // match the behavior of clang. - encodeULEB128(SectionSize, OS, 5); - if (HasName) { - encodeULEB128(S.Name.size(), OS); - OS << S.Name; - } - // Total section size is the content size plus 1 for the section type and - // 5 for the LEB-encoded size. - SectionSize = SectionSize + 1 + 5; - return Header; -} - -size_t Writer::finalize() { - size_t ObjectSize = sizeof(WasmMagic) + sizeof(WasmVersion); - SectionHeaders.reserve(Obj.Sections.size()); - // Finalize the headers of each section so we know the total size. - for (const Section &S : Obj.Sections) { - size_t SectionSize; - SectionHeaders.push_back(createSectionHeader(S, SectionSize)); - ObjectSize += SectionSize; - } - return ObjectSize; -} - -Error Writer::write() { - size_t TotalSize = finalize(); - Out.reserveExtraSpace(TotalSize); - - // Write the header. - Out.write(Obj.Header.Magic.data(), Obj.Header.Magic.size()); - uint32_t Version; - support::endian::write32le(&Version, Obj.Header.Version); - Out.write(reinterpret_cast<const char *>(&Version), sizeof(Version)); - - // Write each section. - for (size_t I = 0, S = SectionHeaders.size(); I < S; ++I) { - Out.write(SectionHeaders[I].data(), SectionHeaders[I].size()); - Out.write(reinterpret_cast<const char *>(Obj.Sections[I].Contents.data()), - Obj.Sections[I].Contents.size()); - } - - return Error::success(); -} - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm
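For reference, the wasm section header that the removed createSectionHeader emitted (the code now lives under llvm/ObjCopy) is: one type byte, a ULEB128 payload size padded to five bytes to match clang's output, and, for custom sections (type 0), a length-prefixed name counted inside that payload size. A standalone sketch of the same encoding:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

static void emitSectionHeader(llvm::raw_ostream &OS, uint8_t Type,
                              llvm::StringRef Name, uint64_t ContentSize) {
  OS << char(Type); // one-byte section ID
  uint64_t PayloadSize = ContentSize;
  bool IsCustom = Type == 0; // only custom sections carry a name
  if (IsCustom)
    PayloadSize += llvm::getULEB128Size(Name.size()) + Name.size();
  llvm::encodeULEB128(PayloadSize, OS, /*PadTo=*/5);
  if (IsCustom) {
    llvm::encodeULEB128(Name.size(), OS);
    OS << Name;
  }
}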
 diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.h b/llvm/tools/llvm-objcopy/wasm/Writer.h deleted file mode 100644 index 4404cd8caf84..000000000000 --- a/llvm/tools/llvm-objcopy/wasm/Writer.h +++ /dev/null @@ -1,49 +0,0 @@ -//===- Writer.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H - -#include "Object.h" -#include <cstdint> -#include <vector> - -namespace llvm { -namespace objcopy { -namespace wasm { - -class Writer { -public: - Writer(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} - Error write(); - -private: - using SectionHeader = SmallVector<char, 8>; - Object &Obj; - raw_ostream &Out; - std::vector<SectionHeader> SectionHeaders; - - /// Generate a wasm section section header for S. - /// The header consists of - /// * A one-byte section ID (aka the section type). - /// * The size of the section contents, encoded as ULEB128. - /// * If the section is a custom section (type 0) it also has a name, which is - /// encoded as a length-prefixed string. The encoded section size *includes* - /// this string. - /// See https://webassembly.github.io/spec/core/binary/modules.html#sections - /// Return the header and store the total size in SectionSize. - static SectionHeader createSectionHeader(const Section &S, - size_t &SectionSize); - size_t finalize(); -}; - -} // end namespace wasm -} // end namespace objcopy -} // end namespace llvm - -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H diff --git a/llvm/tools/llvm-objdump/COFFDump.cpp b/llvm/tools/llvm-objdump/COFFDump.cpp index 32fdd1a4d5c3..e085e26c3cd0 100644 --- a/llvm/tools/llvm-objdump/COFFDump.cpp +++ b/llvm/tools/llvm-objdump/COFFDump.cpp @@ -430,21 +430,12 @@ static void printTLSDirectory(const COFFObjectFile *Obj) { if (!PE32Header && !PE32PlusHeader) return; - const data_directory *DataDir = Obj->getDataDirectory(COFF::TLS_TABLE); - if (!DataDir || DataDir->RelativeVirtualAddress == 0) - return; - - uintptr_t IntPtr = 0; - if (Error E = - Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)) - reportError(std::move(E), Obj->getFileName()); - if (PE32Header) { - auto *TLSDir = reinterpret_cast<const coff_tls_directory32 *>(IntPtr); - printTLSDirectoryT(TLSDir); + if (auto *TLSDir = Obj->getTLSDirectory32()) + printTLSDirectoryT(TLSDir); } else { - auto *TLSDir = reinterpret_cast<const coff_tls_directory64 *>(IntPtr); - printTLSDirectoryT(TLSDir); + if (auto *TLSDir = Obj->getTLSDirectory64()) + printTLSDirectoryT(TLSDir); } outs() << "\n"; @@ -459,19 +450,10 @@ static void printLoadConfiguration(const COFFObjectFile *Obj) { if (Obj->getMachine() != COFF::IMAGE_FILE_MACHINE_I386) return; - const data_directory *DataDir = Obj->getDataDirectory(COFF::LOAD_CONFIG_TABLE); - if (!DataDir) - reportError("no load config data dir", Obj->getFileName()); - - uintptr_t IntPtr = 0; - if (DataDir->RelativeVirtualAddress == 0) + auto *LoadConf = Obj->getLoadConfig32(); + if (!LoadConf) return; - if (Error E = - Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)) - reportError(std::move(E), Obj->getFileName()); - - auto *LoadConf = reinterpret_cast<const coff_load_configuration32 *>(IntPtr); outs() << "Load configuration:" << "\n Timestamp: " << LoadConf->TimeDateStamp << "\n Major Version: " << LoadConf->MajorVersion @@ -544,11 +526,11 @@ static void printImportTables(const COFFObjectFile *Obj) { // Prints export tables. The export table is a table containing the list of // exported symbol from the DLL. 
static void printExportTable(const COFFObjectFile *Obj) { - outs() << "Export Table:\n"; export_directory_iterator I = Obj->export_directory_begin(); export_directory_iterator E = Obj->export_directory_end(); if (I == E) return; + outs() << "Export Table:\n"; StringRef DllName; uint32_t OrdinalBase; if (I->getDllName(DllName)) diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index 98e71497d022..ca73dafe2b8e 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -171,8 +171,12 @@ uint64_t objdump::getELFSectionLMA(const object::ELFSectionRef &Sec) { template static void printDynamicSection(const ELFFile &Elf, StringRef Filename) { - ArrayRef DynamicEntries = - unwrapOrError(Elf.dynamicEntries(), Filename); + auto DynamicEntriesOrErr = Elf.dynamicEntries(); + if (!DynamicEntriesOrErr) { + reportWarning(toString(DynamicEntriesOrErr.takeError()), Filename); + return; + } + ArrayRef DynamicEntries = *DynamicEntriesOrErr; // Find the maximum tag name length to format the value column properly. size_t MaxLen = 0; diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index 31867625f0e5..60c34158941b 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -81,6 +81,7 @@ bool objdump::DataInCode; bool objdump::FunctionStarts; bool objdump::LinkOptHints; bool objdump::InfoPlist; +bool objdump::DyldInfo; bool objdump::DylibsUsed; bool objdump::DylibId; bool objdump::Verbose; @@ -111,6 +112,7 @@ void objdump::parseMachOOptions(const llvm::opt::InputArgList &InputArgs) { FunctionStarts = InputArgs.hasArg(OBJDUMP_function_starts); LinkOptHints = InputArgs.hasArg(OBJDUMP_link_opt_hints); InfoPlist = InputArgs.hasArg(OBJDUMP_info_plist); + DyldInfo = InputArgs.hasArg(OBJDUMP_dyld_info); DylibsUsed = InputArgs.hasArg(OBJDUMP_dylibs_used); DylibId = InputArgs.hasArg(OBJDUMP_dylib_id); Verbose = !InputArgs.hasArg(OBJDUMP_non_verbose); @@ -188,8 +190,12 @@ typedef DiceTable::iterator dice_table_iterator; namespace { struct ScopedXarFile { xar_t xar; - ScopedXarFile(const char *filename, int32_t flags) - : xar(xar_open(filename, flags)) {} + ScopedXarFile(const char *filename, int32_t flags) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + xar = xar_open(filename, flags); +#pragma clang diagnostic pop + } ~ScopedXarFile() { if (xar) xar_close(xar); @@ -1178,6 +1184,20 @@ static void PrintLinkOptHints(MachOObjectFile *O) { } } +static void printMachOChainedFixups(object::MachOObjectFile *Obj) { + Error Err = Error::success(); + for (const object::MachOChainedFixupEntry &Entry : Obj->fixupTable(Err)) { + (void)Entry; + } + if (Err) + reportError(std::move(Err), Obj->getFileName()); +} + +static void PrintDyldInfo(MachOObjectFile *O) { + outs() << "dyld information:" << '\n'; + printMachOChainedFixups(O); +} + static void PrintDylibs(MachOObjectFile *O, bool JustId) { unsigned Index = 0; for (const auto &Load : O->load_commands()) { @@ -1896,8 +1916,8 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, // UniversalHeaders or ArchiveHeaders. 
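// [Editor's note: illustrative sketch, not part of the vendored patch.]
// Several hunks above (e.g. printDynamicSection in ELFDump.cpp) replace a
// fatal unwrapOrError with "warn and skip" when an Expected<T> holds an
// error. The general shape of that idiom; dumpOrWarn and the int payload are
// hypothetical, the llvm::Expected/Error APIs are real:
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

static void dumpOrWarn(llvm::Expected<int> ValOrErr) {
  if (!ValOrErr) {
    // An unconsumed llvm::Error aborts on destruction; toString() both
    // formats the message and consumes the error exactly once.
    llvm::errs() << "warning: " << llvm::toString(ValOrErr.takeError())
                 << "\n";
    return;
  }
  llvm::outs() << "value: " << *ValOrErr << "\n";
}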
if (Disassemble || Relocations || PrivateHeaders || ExportsTrie || Rebase || Bind || SymbolTable || LazyBind || WeakBind || IndirectSymbols || - DataInCode || FunctionStarts || LinkOptHints || DylibsUsed || DylibId || - Rpaths || ObjcMetaData || (!FilterSections.empty())) { + DataInCode || FunctionStarts || LinkOptHints || DyldInfo || DylibsUsed || + DylibId || Rpaths || ObjcMetaData || (!FilterSections.empty())) { if (LeadingHeaders) { outs() << Name; if (!ArchiveMemberName.empty()) @@ -1966,6 +1986,8 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, DumpSectionContents(FileName, MachOOF, Verbose); if (InfoPlist) DumpInfoPlistSectionContents(FileName, MachOOF); + if (DyldInfo) + PrintDyldInfo(MachOOF); if (DylibsUsed) PrintDylibs(MachOOF, false); if (DylibId) @@ -2586,7 +2608,8 @@ struct DisassembleInfo { // value of TagType is currently 1 (for the LLVMOpInfo1 struct). If symbolic // information is returned then this function returns 1 else it returns 0. static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, - uint64_t Size, int TagType, void *TagBuf) { + uint64_t OpSize, uint64_t InstSize, int TagType, + void *TagBuf) { struct DisassembleInfo *info = (struct DisassembleInfo *)DisInfo; struct LLVMOpInfo1 *op_info = (struct LLVMOpInfo1 *)TagBuf; uint64_t value = op_info->Value; @@ -2603,7 +2626,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, unsigned int Arch = info->O->getArch(); if (Arch == Triple::x86) { - if (Size != 1 && Size != 2 && Size != 4 && Size != 0) + if (OpSize != 1 && OpSize != 2 && OpSize != 4 && OpSize != 0) return 0; if (info->O->getHeader().filetype != MachO::MH_OBJECT) { // TODO: @@ -2683,7 +2706,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, return 0; } if (Arch == Triple::x86_64) { - if (Size != 1 && Size != 2 && Size != 4 && Size != 0) + if (OpSize != 1 && OpSize != 2 && OpSize != 4 && OpSize != 0) return 0; // For non MH_OBJECT types, like MH_KEXT_BUNDLE, Search the external // relocation entries of a linked image (if any) for an entry that matches @@ -2715,7 +2738,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, // adds the Pc. But for x86_64 external relocation entries the Value // is the offset from the external symbol. if (info->O->getAnyRelocationPCRel(RE)) - op_info->Value -= Pc + Offset + Size; + op_info->Value -= Pc + InstSize; const char *name = unwrapOrError(Symbol.getName(), info->O->getFileName()).data(); op_info->AddSymbol.Present = 1; @@ -2753,7 +2776,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, // adds the Pc. But for x86_64 external relocation entries the Value // is the offset from the external symbol. 
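// [Editor's note: illustrative sketch, not part of the vendored patch.]
// The change above subtracts Pc + InstSize (the address of the *next*
// instruction) instead of Pc + Offset + Size: an x86_64 PC-relative fixup is
// measured from the end of the whole instruction, not from the end of the
// immediate operand. Worked through with made-up numbers:
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Pc = 0x1000;    // address of a call instruction
  uint64_t InstSize = 5;   // call rel32 occupies 5 bytes
  uint64_t Value = 0x2000; // address the relocation's symbol resolves to
  // Displacement actually encoded in the instruction stream:
  int64_t Disp = (int64_t)(Value - (Pc + InstSize));
  assert(Disp == 0xffb); // 0x2000 - 0x1005
  return 0;
}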
if (info->O->getAnyRelocationPCRel(RE)) - op_info->Value -= Pc + Offset + Size; + op_info->Value -= Pc + InstSize; const char *name = unwrapOrError(Symbol.getName(), info->O->getFileName()).data(); unsigned Type = info->O->getAnyRelocationType(RE); @@ -2781,7 +2804,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, return 0; } if (Arch == Triple::arm) { - if (Offset != 0 || (Size != 4 && Size != 2)) + if (Offset != 0 || (InstSize != 4 && InstSize != 2)) return 0; if (info->O->getHeader().filetype != MachO::MH_OBJECT) { // TODO: @@ -2918,7 +2941,7 @@ static int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, return 1; } if (Arch == Triple::aarch64) { - if (Offset != 0 || Size != 4) + if (Offset != 0 || InstSize != 4) return 0; if (info->O->getHeader().filetype != MachO::MH_OBJECT) { // TODO: @@ -9141,14 +9164,20 @@ static void PrintNoteLoadCommand(MachO::note_command Nt) { outs() << " size " << Nt.size << "\n"; } -static void PrintBuildToolVersion(MachO::build_tool_version bv) { - outs() << " tool " << MachOObjectFile::getBuildTool(bv.tool) << "\n"; +static void PrintBuildToolVersion(MachO::build_tool_version bv, bool verbose) { + outs() << " tool "; + if (verbose) + outs() << MachOObjectFile::getBuildTool(bv.tool); + else + outs() << bv.tool; + outs() << "\n"; outs() << " version " << MachOObjectFile::getVersionString(bv.version) << "\n"; } static void PrintBuildVersionLoadCommand(const MachOObjectFile *obj, - MachO::build_version_command bd) { + MachO::build_version_command bd, + bool verbose) { outs() << " cmd LC_BUILD_VERSION\n"; outs() << " cmdsize " << bd.cmdsize; if (bd.cmdsize != @@ -9157,8 +9186,12 @@ static void PrintBuildVersionLoadCommand(const MachOObjectFile *obj, outs() << " Incorrect size\n"; else outs() << "\n"; - outs() << " platform " << MachOObjectFile::getBuildPlatform(bd.platform) - << "\n"; + outs() << " platform "; + if (verbose) + outs() << MachOObjectFile::getBuildPlatform(bd.platform); + else + outs() << bd.platform; + outs() << "\n"; if (bd.sdk) outs() << " sdk " << MachOObjectFile::getVersionString(bd.sdk) << "\n"; @@ -9169,7 +9202,7 @@ static void PrintBuildVersionLoadCommand(const MachOObjectFile *obj, outs() << " ntools " << bd.ntools << "\n"; for (unsigned i = 0; i < bd.ntools; ++i) { MachO::build_tool_version bv = obj->getBuildToolVersion(i); - PrintBuildToolVersion(bv); + PrintBuildToolVersion(bv, verbose); } } @@ -10146,7 +10179,7 @@ static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t filetype, } else if (Command.C.cmd == MachO::LC_BUILD_VERSION) { MachO::build_version_command Bv = Obj->getBuildVersionLoadCommand(Command); - PrintBuildVersionLoadCommand(Obj, Bv); + PrintBuildVersionLoadCommand(Obj, Bv, verbose); } else if (Command.C.cmd == MachO::LC_SOURCE_VERSION) { MachO::source_version_command Sd = Obj->getSourceVersionCommand(Command); PrintSourceVersionCommand(Sd); diff --git a/llvm/tools/llvm-objdump/MachODump.h b/llvm/tools/llvm-objdump/MachODump.h index 7568062bd6b0..12783e15b425 100644 --- a/llvm/tools/llvm-objdump/MachODump.h +++ b/llvm/tools/llvm-objdump/MachODump.h @@ -36,6 +36,7 @@ void parseMachOOptions(const llvm::opt::InputArgList &InputArgs); extern bool Bind; extern bool DataInCode; extern std::string DisSymName; +extern bool DyldInfo; extern bool DylibId; extern bool DylibsUsed; extern bool ExportsTrie; diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td index 9f27a6cdf163..00d7d8ccff17 100644 --- 
a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -81,6 +81,9 @@ def dwarf_EQ : Joined<["--"], "dwarf=">, def fault_map_section : Flag<["--"], "fault-map-section">, HelpText<"Display the content of the fault map section">; +def offloading : Flag<["--"], "offloading">, + HelpText<"Display the content of the offloading section">; + def file_headers : Flag<["--"], "file-headers">, HelpText<"Display the contents of the overall file header">; def : Flag<["-"], "f">, Alias, @@ -296,6 +299,12 @@ def info_plist : Flag<["--"], "info-plist">, "Mach-O objects (requires --macho)">, Group; +def dyld_info : Flag<["--"], "dyld_info">, + HelpText<"Print bind and rebase information used by dyld to resolve " + "external references in a final linked binary " + "(requires --macho)">, + Group; + def dylibs_used : Flag<["--"], "dylibs-used">, HelpText<"Print the shared libraries used for linked " "Mach-O files (requires --macho)">, diff --git a/llvm/tools/llvm-objdump/OffloadDump.cpp b/llvm/tools/llvm-objdump/OffloadDump.cpp new file mode 100644 index 000000000000..7d4461f0a70e --- /dev/null +++ b/llvm/tools/llvm-objdump/OffloadDump.cpp @@ -0,0 +1,102 @@ +//===-- OffloadDump.cpp - Offloading dumper ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the offloading-specific dumper for llvm-objdump. +/// +//===----------------------------------------------------------------------===// +#include "OffloadDump.h" +#include "llvm-objdump.h" + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::objdump; + +constexpr const char OffloadSectionString[] = ".llvm.offloading"; + +/// Get the printable name of the image kind. +static StringRef getImageName(const OffloadBinary &OB) { + switch (OB.getImageKind()) { + case IMG_Object: + return "elf"; + case IMG_Bitcode: + return "llvm ir"; + case IMG_Cubin: + return "cubin"; + case IMG_Fatbinary: + return "fatbinary"; + case IMG_PTX: + return "ptx"; + default: + return ""; + } +} + +static void printBinary(const OffloadBinary &OB, uint64_t Index) { + outs() << "\nOFFLOADING IMAGE [" << Index << "]:\n"; + outs() << left_justify("kind", 16) << getImageName(OB) << "\n"; + outs() << left_justify("arch", 16) << OB.getArch() << "\n"; + outs() << left_justify("triple", 16) << OB.getTriple() << "\n"; + outs() << left_justify("producer", 16) + << getOffloadKindName(OB.getOffloadKind()) << "\n"; +} + +static Error visitAllBinaries(const OffloadBinary &OB) { + uint64_t Offset = 0; + uint64_t Index = 0; + while (Offset < OB.getMemoryBufferRef().getBufferSize()) { + MemoryBufferRef Buffer = + MemoryBufferRef(OB.getData().drop_front(Offset), OB.getFileName()); + auto BinaryOrErr = OffloadBinary::create(Buffer); + if (!BinaryOrErr) + return BinaryOrErr.takeError(); + + OffloadBinary &Binary = **BinaryOrErr; + printBinary(Binary, Index++); + + Offset += Binary.getSize(); + } + return Error::success(); +} + +/// Print the embedded offloading contents of an ObjectFile \p O. 
+void llvm::dumpOffloadBinary(const ObjectFile &O) {
+  for (SectionRef Sec : O.sections()) {
+    Expected<StringRef> Name = Sec.getName();
+    if (!Name || !Name->startswith(OffloadSectionString))
+      continue;
+
+    Expected<StringRef> Contents = Sec.getContents();
+    if (!Contents)
+      reportError(Contents.takeError(), O.getFileName());
+
+    MemoryBufferRef Buffer = MemoryBufferRef(*Contents, O.getFileName());
+    auto BinaryOrErr = OffloadBinary::create(Buffer);
+    if (!BinaryOrErr)
+      reportError(O.getFileName(), "while extracting offloading files: " +
+                                       toString(BinaryOrErr.takeError()));
+    OffloadBinary &Binary = **BinaryOrErr;
+
+    // Print out all the binaries that are contained in this buffer. If we fail
+    // to parse a binary before reaching the end of the buffer, emit a warning.
+    if (Error Err = visitAllBinaries(Binary))
+      reportWarning("while parsing offloading files: " +
+                        toString(std::move(Err)),
+                    O.getFileName());
+  }
+}
+
+/// Print the contents of an offload binary file \p OB. This may contain
+/// multiple binaries stored in the same buffer.
+void llvm::dumpOffloadSections(const OffloadBinary &OB) {
+  // Print out all the binaries that are contained in this buffer. If we fail
+  // to parse a binary before reaching the end of the buffer, emit a warning.
+  if (Error Err = visitAllBinaries(OB))
+    reportWarning("while parsing offloading files: " + toString(std::move(Err)),
+                  OB.getFileName());
+}
diff --git a/llvm/tools/llvm-objdump/OffloadDump.h b/llvm/tools/llvm-objdump/OffloadDump.h
new file mode 100644
index 000000000000..75f188e9d506
--- /dev/null
+++ b/llvm/tools/llvm-objdump/OffloadDump.h
@@ -0,0 +1,22 @@
+//===-- OffloadDump.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_OBJDUMP_OFFLOADDUMP_H +#define LLVM_TOOLS_LLVM_OBJDUMP_OFFLOADDUMP_H + +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" + +namespace llvm { + +void dumpOffloadSections(const object::OffloadBinary &OB); +void dumpOffloadBinary(const object::ObjectFile &O); + +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td index 61ea701ed75d..e8bef284c0e9 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -47,7 +47,6 @@ def X : Flag<["-"], "X">, HelpText<"omit leading addresses or headers">; // -addr_slide=arg // -function_offsets - // Obsolete and unsupported: def grp_obsolete : OptionGroup<"kind">, HelpText<"Obsolete and unsupported flags">; diff --git a/llvm/tools/llvm-objdump/SourcePrinter.cpp b/llvm/tools/llvm-objdump/SourcePrinter.cpp index 8befac546204..c8ea6b543245 100644 --- a/llvm/tools/llvm-objdump/SourcePrinter.cpp +++ b/llvm/tools/llvm-objdump/SourcePrinter.cpp @@ -16,6 +16,8 @@ #include "llvm-objdump.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/FormatVariadic.h" diff --git a/llvm/tools/llvm-objdump/SourcePrinter.h b/llvm/tools/llvm-objdump/SourcePrinter.h index 31d46e3108f6..29ef19c98c80 100644 --- a/llvm/tools/llvm-objdump/SourcePrinter.h +++ b/llvm/tools/llvm-objdump/SourcePrinter.h @@ -13,6 +13,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/FormattedStream.h" #include #include diff --git a/llvm/tools/llvm-objdump/XCOFFDump.cpp b/llvm/tools/llvm-objdump/XCOFFDump.cpp index b8fb2ed3d063..159741bebb67 100644 --- a/llvm/tools/llvm-objdump/XCOFFDump.cpp +++ b/llvm/tools/llvm-objdump/XCOFFDump.cpp @@ -106,7 +106,7 @@ std::string objdump::getXCOFFSymbolDescription(const SymbolInfoTy &SymbolInfo, if (SymbolInfo.XCOFFSymInfo.StorageMappingClass && !SymbolInfo.XCOFFSymInfo.IsLabel) { const XCOFF::StorageMappingClass Smc = - SymbolInfo.XCOFFSymInfo.StorageMappingClass.getValue(); + *SymbolInfo.XCOFFSymInfo.StorageMappingClass; Result.append(("[" + XCOFF::getMappingClassString(Smc) + "]").str()); } diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 6b238fa01d25..7cd47da9efd9 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -20,6 +20,7 @@ #include "ELFDump.h" #include "MachODump.h" #include "ObjdumpOptID.h" +#include "OffloadDump.h" #include "SourcePrinter.h" #include "WasmDump.h" #include "XCOFFDump.h" @@ -33,6 +34,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" #include "llvm/Demangle/Demangle.h" #include "llvm/MC/MCAsmInfo.h" @@ -52,10 +54,12 @@ #include "llvm/Object/COFF.h" #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" #include "llvm/Object/FaultMapParser.h" #include "llvm/Object/MachO.h" #include 
"llvm/Object/MachOUniversal.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" #include "llvm/Object/Wasm.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" @@ -196,6 +200,7 @@ std::string objdump::MCPU; std::vector objdump::MAttrs; bool objdump::ShowRawInsn; bool objdump::LeadingAddr; +static bool Offloading; static bool RawClangAST; bool objdump::Relocations; bool objdump::PrintImmHex; @@ -440,8 +445,13 @@ static bool isArmElf(const ObjectFile *Obj) { return Elf && Elf->getEMachine() == ELF::EM_ARM; } +static bool isCSKYElf(const ObjectFile *Obj) { + const auto *Elf = dyn_cast(Obj); + return Elf && Elf->getEMachine() == ELF::EM_CSKY; +} + static bool hasMappingSymbols(const ObjectFile *Obj) { - return isArmElf(Obj) || isAArch64Elf(Obj); + return isArmElf(Obj) || isAArch64Elf(Obj) || isCSKYElf(Obj) ; } static void printRelocation(formatted_raw_ostream &OS, StringRef FileName, @@ -957,6 +967,9 @@ SymbolInfoTy objdump::createSymbolInfo(const ObjectFile *Obj, getXCOFFSymbolCsectSMC(XCOFFObj, Symbol); return SymbolInfoTy(Addr, Name, Smc, SymbolIndex, isLabel(XCOFFObj, Symbol)); + } else if (Obj->isXCOFF()) { + const SymbolRef::Type SymType = unwrapOrError(Symbol.getType(), FileName); + return SymbolInfoTy(Addr, Name, SymType, true); } else return SymbolInfoTy(Addr, Name, Obj->isELF() ? getElfSymbolType(Obj, Symbol) @@ -973,11 +986,29 @@ static SymbolInfoTy createDummySymbolInfo(const ObjectFile *Obj, } static void -collectLocalBranchTargets(ArrayRef Bytes, const MCInstrAnalysis *MIA, - MCDisassembler *DisAsm, MCInstPrinter *IP, - const MCSubtargetInfo *STI, uint64_t SectionAddr, - uint64_t Start, uint64_t End, - std::unordered_map &Labels) { +collectBBAddrMapLabels(const std::unordered_map &AddrToBBAddrMap, + uint64_t SectionAddr, uint64_t Start, uint64_t End, + std::unordered_map> &Labels) { + if (AddrToBBAddrMap.empty()) + return; + Labels.clear(); + uint64_t StartAddress = SectionAddr + Start; + uint64_t EndAddress = SectionAddr + End; + auto Iter = AddrToBBAddrMap.find(StartAddress); + if (Iter == AddrToBBAddrMap.end()) + return; + for (unsigned I = 0, Size = Iter->second.BBEntries.size(); I < Size; ++I) { + uint64_t BBAddress = Iter->second.BBEntries[I].Offset + Iter->second.Addr; + if (BBAddress >= EndAddress) + continue; + Labels[BBAddress].push_back(("BB" + Twine(I)).str()); + } +} + +static void collectLocalBranchTargets( + ArrayRef Bytes, const MCInstrAnalysis *MIA, MCDisassembler *DisAsm, + MCInstPrinter *IP, const MCSubtargetInfo *STI, uint64_t SectionAddr, + uint64_t Start, uint64_t End, std::unordered_map &Labels) { // So far only supports PowerPC and X86. if (!STI->getTargetTriple().isPPC() && !STI->getTargetTriple().isX86()) return; @@ -1006,7 +1037,6 @@ collectLocalBranchTargets(ArrayRef Bytes, const MCInstrAnalysis *MIA, !(STI->getTargetTriple().isPPC() && Target == Index)) Labels[Target] = ("L" + Twine(LabelCount++)).str(); } - Index += Size; } } @@ -1241,6 +1271,20 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, if (!SectSize) continue; + std::unordered_map AddrToBBAddrMap; + if (SymbolizeOperands) { + if (auto *Elf = dyn_cast(Obj)) { + // Read the BB-address-map corresponding to this section, if present. 
+ auto SectionBBAddrMapsOrErr = Elf->readBBAddrMap(Section.getIndex()); + if (!SectionBBAddrMapsOrErr) + reportWarning(toString(SectionBBAddrMapsOrErr.takeError()), + Obj->getFileName()); + for (auto &FunctionBBAddrMap : *SectionBBAddrMapsOrErr) + AddrToBBAddrMap.emplace(FunctionBBAddrMap.Addr, + std::move(FunctionBBAddrMap)); + } + } + // Get the list of all the symbols in this section. SectionSymbolsTy &Symbols = AllSymbols[Section]; std::vector MappingSymbols; @@ -1367,7 +1411,7 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, // Right now, most targets return None i.e ignore to treat a symbol // separately. But WebAssembly decodes preludes for some symbols. // - if (Status.hasValue()) { + if (Status) { if (Status.getValue() == MCDisassembler::Fail) { outs() << "// Error in decoding " << SymbolName << " : Decoding failed region as bytes.\n"; @@ -1404,9 +1448,13 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, formatted_raw_ostream FOS(outs()); std::unordered_map AllLabels; - if (SymbolizeOperands) + std::unordered_map> BBAddrMapLabels; + if (SymbolizeOperands) { collectLocalBranchTargets(Bytes, MIA, DisAsm, IP, PrimarySTI, SectionAddr, Index, End, AllLabels); + collectBBAddrMapLabels(AddrToBBAddrMap, SectionAddr, Index, End, + BBAddrMapLabels); + } while (Index < End) { // ARM and AArch64 ELF binaries can interleave data and text in the @@ -1450,9 +1498,15 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, } // Print local label if there's any. - auto Iter = AllLabels.find(SectionAddr + Index); - if (Iter != AllLabels.end()) - FOS << "<" << Iter->second << ">:\n"; + auto Iter1 = BBAddrMapLabels.find(SectionAddr + Index); + if (Iter1 != BBAddrMapLabels.end()) { + for (StringRef Label : Iter1->second) + FOS << "<" << Label << ">:\n"; + } else { + auto Iter2 = AllLabels.find(SectionAddr + Index); + if (Iter2 != AllLabels.end()) + FOS << "<" << Iter2->second << ">:\n"; + } // Disassemble a real instruction or a data when disassemble all is // provided @@ -1547,6 +1601,7 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, } // Print the labels corresponding to the target if there's any. + bool BBAddrMapLabelAvailable = BBAddrMapLabels.count(Target); bool LabelAvailable = AllLabels.count(Target); if (TargetSym != nullptr) { uint64_t TargetAddress = TargetSym->Addr; @@ -1560,14 +1615,18 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, // Always Print the binary symbol precisely corresponding to // the target address. *TargetOS << TargetName; - } else if (!LabelAvailable) { + } else if (BBAddrMapLabelAvailable) { + *TargetOS << BBAddrMapLabels[Target].front(); + } else if (LabelAvailable) { + *TargetOS << AllLabels[Target]; + } else { // Always Print the binary symbol plus an offset if there's no // local label corresponding to the target address. 
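// [Editor's note: illustrative sketch, not part of the vendored patch.]
// The reordered branch-target printing above encodes a preference: a
// BB-address-map label beats a synthesized local "L<n>" label, which beats
// the "symbol+0x<disp>" fallback. Distilled into a hypothetical free
// function:
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>

static std::string formatTarget(const std::optional<std::string> &BBLabel,
                                const std::optional<std::string> &LocalLabel,
                                const std::string &SymName, uint64_t Disp) {
  if (BBLabel)
    return *BBLabel;    // e.g. "BB3", recovered from the BB address map
  if (LocalLabel)
    return *LocalLabel; // e.g. "L1", synthesized by branch scanning
  char Buf[32];
  std::snprintf(Buf, sizeof(Buf), "+0x%llx", (unsigned long long)Disp);
  return SymName + Buf; // fallback: nearest symbol plus hex offset
}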
*TargetOS << TargetName << "+0x" << Twine::utohexstr(Disp); - } else { - *TargetOS << AllLabels[Target]; } *TargetOS << ">"; + } else if (BBAddrMapLabelAvailable) { + *TargetOS << " <" << BBAddrMapLabels[Target].front() << ">"; } else if (LabelAvailable) { *TargetOS << " <" << AllLabels[Target] << ">"; } @@ -1634,9 +1693,12 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { // Package up features to be passed to target/subtarget SubtargetFeatures Features = Obj->getFeatures(); - if (!MAttrs.empty()) + if (!MAttrs.empty()) { for (unsigned I = 0; I != MAttrs.size(); ++I) Features.AddFeature(MAttrs[I]); + } else if (MCPU.empty() && Obj->getArch() == llvm::Triple::aarch64) { + Features.AddFeature("+all"); + } std::unique_ptr MRI( TheTarget->createMCRegInfo(TripleName)); @@ -1653,7 +1715,7 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { "no assembly info for target " + TripleName); if (MCPU.empty()) - MCPU = Obj->tryGetCPUName().getValueOr("").str(); + MCPU = Obj->tryGetCPUName().value_or("").str(); std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TripleName, MCPU, Features.getString())); @@ -1721,10 +1783,6 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { void objdump::printRelocations(const ObjectFile *Obj) { StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 : "%08" PRIx64; - // Regular objdump doesn't print relocations in non-relocatable object - // files. - if (!Obj->isRelocatableObject()) - return; // Build a mapping from relocation target to a vector of relocation // sections. Usually, there is an only one relocation section for @@ -1732,6 +1790,8 @@ void objdump::printRelocations(const ObjectFile *Obj) { MapVector> SecToRelSec; uint64_t Ndx; for (const SectionRef &Section : ToolSectionFilter(*Obj, &Ndx)) { + if (Obj->isELF() && (ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC)) + continue; if (Section.relocation_begin() == Section.relocation_end()) continue; Expected SecOrErr = Section.getRelocatedSection(); @@ -2073,7 +2133,7 @@ void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol, dyn_cast(O), Symbol); if (SymRef) { - Expected NameOrErr = SymRef.getValue().getName(); + Expected NameOrErr = SymRef->getName(); if (NameOrErr) { outs() << " (csect:"; @@ -2227,13 +2287,13 @@ static void printFaultMaps(const ObjectFile *Obj) { outs() << "FaultMap table:\n"; - if (!FaultMapSection.hasValue()) { + if (!FaultMapSection) { outs() << "\n"; return; } StringRef FaultMapContents = - unwrapOrError(FaultMapSection.getValue().getContents(), Obj->getFileName()); + unwrapOrError(FaultMapSection->getContents(), Obj->getFileName()); FaultMapParser FMP(FaultMapContents.bytes_begin(), FaultMapContents.bytes_end()); @@ -2423,6 +2483,8 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, printRawClangAST(O); if (FaultMapSection) printFaultMaps(O); + if (Offloading) + dumpOffloadBinary(*O); } static void dumpObject(const COFFImportFile *I, const Archive *A, @@ -2486,6 +2548,8 @@ static void dumpInput(StringRef file) { dumpObject(O); else if (MachOUniversalBinary *UB = dyn_cast(&Binary)) parseInputMachO(UB); + else if (OffloadBinary *OB = dyn_cast(&Binary)) + dumpOffloadSections(*OB); else reportError(errorCodeToError(object_error::invalid_file_type), file); } @@ -2589,6 +2653,7 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) { } DynamicRelocations = InputArgs.hasArg(OBJDUMP_dynamic_reloc); FaultMapSection = 
InputArgs.hasArg(OBJDUMP_fault_map_section); + Offloading = InputArgs.hasArg(OBJDUMP_offloading); FileHeaders = InputArgs.hasArg(OBJDUMP_file_headers); SectionContents = InputArgs.hasArg(OBJDUMP_full_contents); PrintLines = InputArgs.hasArg(OBJDUMP_line_numbers); @@ -2756,12 +2821,12 @@ int main(int argc, char **argv) { if (!ArchiveHeaders && !Disassemble && DwarfDumpType == DIDT_Null && !DynamicRelocations && !FileHeaders && !PrivateHeaders && !RawClangAST && !Relocations && !SectionHeaders && !SectionContents && !SymbolTable && - !DynamicSymbolTable && !UnwindInfo && !FaultMapSection && - !(MachOOpt && - (Bind || DataInCode || DylibId || DylibsUsed || ExportsTrie || - FirstPrivateHeader || FunctionStarts || IndirectSymbols || InfoPlist || - LazyBind || LinkOptHints || ObjcMetaData || Rebase || Rpaths || - UniversalHeaders || WeakBind || !FilterSections.empty()))) { + !DynamicSymbolTable && !UnwindInfo && !FaultMapSection && !Offloading && + !(MachOOpt && (Bind || DataInCode || DyldInfo || DylibId || DylibsUsed || + ExportsTrie || FirstPrivateHeader || FunctionStarts || + IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || + ObjcMetaData || Rebase || Rpaths || UniversalHeaders || + WeakBind || !FilterSections.empty()))) { T->printHelp(ToolName); return 2; } diff --git a/llvm/tools/llvm-pdbutil/BytesOutputStyle.cpp b/llvm/tools/llvm-pdbutil/BytesOutputStyle.cpp index ffc907e09f11..4c851e14a12d 100644 --- a/llvm/tools/llvm-pdbutil/BytesOutputStyle.cpp +++ b/llvm/tools/llvm-pdbutil/BytesOutputStyle.cpp @@ -8,7 +8,6 @@ #include "BytesOutputStyle.h" -#include "FormatUtil.h" #include "StreamUtil.h" #include "llvm-pdbutil.h" @@ -17,6 +16,7 @@ #include "llvm/DebugInfo/MSF/MSFCommon.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" #include "llvm/DebugInfo/PDB/Native/InfoStream.h" #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" @@ -83,13 +83,13 @@ static void printHeader(LinePrinter &P, const Twine &S) { } BytesOutputStyle::BytesOutputStyle(PDBFile &File) - : File(File), P(2, false, outs()) {} + : File(File), P(2, false, outs(), opts::Filters) {} Error BytesOutputStyle::dump() { - if (opts::bytes::DumpBlockRange.hasValue()) { + if (opts::bytes::DumpBlockRange) { auto &R = *opts::bytes::DumpBlockRange; - uint32_t Max = R.Max.getValueOr(R.Min); + uint32_t Max = R.Max.value_or(R.Min); if (Max < R.Min) return make_error( @@ -104,9 +104,9 @@ Error BytesOutputStyle::dump() { P.NewLine(); } - if (opts::bytes::DumpByteRange.hasValue()) { + if (opts::bytes::DumpByteRange) { auto &R = *opts::bytes::DumpByteRange; - uint32_t Max = R.Max.getValueOr(File.getFileSize()); + uint32_t Max = R.Max.value_or(File.getFileSize()); if (Max < R.Min) return make_error("Invalid byte range specified. 
Max < Min", diff --git a/llvm/tools/llvm-pdbutil/BytesOutputStyle.h b/llvm/tools/llvm-pdbutil/BytesOutputStyle.h index d3aceb47679e..cd28032fe7cd 100644 --- a/llvm/tools/llvm-pdbutil/BytesOutputStyle.h +++ b/llvm/tools/llvm-pdbutil/BytesOutputStyle.h @@ -9,10 +9,10 @@ #ifndef LLVM_TOOLS_LLVMPDBDUMP_BYTESOUTPUTSTYLE_H #define LLVM_TOOLS_LLVMPDBDUMP_BYTESOUTPUTSTYLE_H -#include "LinePrinter.h" #include "OutputStyle.h" #include "StreamUtil.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp index ef299ea9d482..a173eb1faa62 100644 --- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp +++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp @@ -8,8 +8,6 @@ #include "DumpOutputStyle.h" -#include "FormatUtil.h" -#include "InputFile.h" #include "MinimalSymbolDumper.h" #include "MinimalTypeDumper.h" #include "StreamUtil.h" @@ -38,10 +36,13 @@ #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h" #include "llvm/DebugInfo/PDB/Native/InfoStream.h" +#include "llvm/DebugInfo/PDB/Native/InputFile.h" #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/PublicsStream.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" @@ -61,7 +62,7 @@ using namespace llvm::msf; using namespace llvm::pdb; DumpOutputStyle::DumpOutputStyle(InputFile &File) - : File(File), P(2, false, outs()) { + : File(File), P(2, false, outs(), opts::Filters) { if (opts::dump::DumpTypeRefStats) RefTracker.reset(new TypeReferenceTracker(File)); } @@ -99,8 +100,8 @@ Error DumpOutputStyle::dump() { } if (opts::dump::DumpSymbolStats) { - if (auto EC = dumpSymbolStats()) - return EC; + ExitOnError Err("Unexpected error processing module stats: "); + Err(dumpSymbolStats()); P.NewLine(); } @@ -129,33 +130,33 @@ Error DumpOutputStyle::dump() { } if (opts::dump::DumpModules) { - if (auto EC = dumpModules()) - return EC; + ExitOnError Err("Unexpected error processing modules: "); + Err(dumpModules()); } if (opts::dump::DumpModuleFiles) { - if (auto EC = dumpModuleFiles()) - return EC; + ExitOnError Err("Unexpected error processing files: "); + Err(dumpModuleFiles()); } if (opts::dump::DumpLines) { - if (auto EC = dumpLines()) - return EC; + ExitOnError Err("Unexpected error processing lines: "); + Err(dumpLines()); } if (opts::dump::DumpInlineeLines) { - if (auto EC = dumpInlineeLines()) - return EC; + ExitOnError Err("Unexpected error processing inlinee lines: "); + Err(dumpInlineeLines()); } if (opts::dump::DumpXmi) { - if (auto EC = dumpXmi()) - return EC; + ExitOnError Err("Unexpected error processing cross module imports: "); + Err(dumpXmi()); } if (opts::dump::DumpXme) { - if (auto EC = dumpXme()) - return EC; + ExitOnError Err("Unexpected error processing cross module exports: "); + Err(dumpXme()); } if (opts::dump::DumpFpo) { @@ -198,9 +199,8 @@ Error DumpOutputStyle::dump() { } if (opts::dump::DumpSymbols) { - auto EC = File.isPdb() ? dumpModuleSymsForPdb() : dumpModuleSymsForObj(); - if (EC) - return EC; + ExitOnError Err("Unexpected error processing symbols: "); + Err(File.isPdb() ? 
dumpModuleSymsForPdb() : dumpModuleSymsForObj()); } if (opts::dump::DumpTypeRefStats) { @@ -260,7 +260,7 @@ Error DumpOutputStyle::dumpFileSummary() { P.formatLine("Has Globals: {0}", getPdb().hasPDBGlobalsStream()); P.formatLine("Has Publics: {0}", getPdb().hasPDBPublicsStream()); if (getPdb().hasPDBDbiStream()) { - auto &DBI = Err(getPdb().getPDBDbiStream()); + DbiStream &DBI = Err(getPdb().getPDBDbiStream()); P.formatLine("Is incrementally linked: {0}", DBI.isIncrementallyLinked()); P.formatLine("Has conflicting types: {0}", DBI.hasCTypes()); P.formatLine("Is stripped: {0}", DBI.isStripped()); @@ -343,36 +343,6 @@ static void printModuleDetailStats(LinePrinter &P, StringRef Label, } } -static bool isMyCode(const SymbolGroup &Group) { - if (Group.getFile().isObj()) - return true; - - StringRef Name = Group.name(); - if (Name.startswith("Import:")) - return false; - if (Name.endswith_insensitive(".dll")) - return false; - if (Name.equals_insensitive("* linker *")) - return false; - if (Name.startswith_insensitive("f:\\binaries\\Intermediate\\vctools")) - return false; - if (Name.startswith_insensitive("f:\\dd\\vctools\\crt")) - return false; - return true; -} - -static bool shouldDumpSymbolGroup(uint32_t Idx, const SymbolGroup &Group) { - if (opts::dump::JustMyCode && !isMyCode(Group)) - return false; - - // If the arg was not specified on the command line, always dump all modules. - if (opts::dump::DumpModi.getNumOccurrences() == 0) - return true; - - // Otherwise, only dump if this is the same module specified. - return (opts::dump::DumpModi == Idx); -} - Error DumpOutputStyle::dumpStreamSummary() { printHeader(P, "Streams"); @@ -389,7 +359,7 @@ Error DumpOutputStyle::dumpStreamSummary() { uint32_t StreamCount = getPdb().getNumStreams(); uint32_t MaxStreamSize = getPdb().getMaxStreamSize(); - for (uint16_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) { + for (uint32_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) { P.formatLine( "Stream {0} ({1} bytes): [{2}]", fmt_align(StreamIdx, AlignStyle::Right, NumDigits(StreamCount)), @@ -409,93 +379,6 @@ Error DumpOutputStyle::dumpStreamSummary() { return Error::success(); } -static Expected getModuleDebugStream(PDBFile &File, - uint32_t Index) { - ExitOnError Err("Unexpected error: "); - - auto &Dbi = Err(File.getPDBDbiStream()); - const auto &Modules = Dbi.modules(); - auto Modi = Modules.getModuleDescriptor(Index); - - uint16_t ModiStream = Modi.getModuleStreamIndex(); - if (ModiStream == kInvalidStreamIndex) - return make_error(raw_error_code::no_stream, - "Module stream not present"); - - auto ModStreamData = File.createIndexedStream(ModiStream); - - ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData)); - if (auto EC = ModS.reload()) - return make_error(raw_error_code::corrupt_file, - "Invalid module stream"); - - return std::move(ModS); -} - -template -static void -iterateOneModule(InputFile &File, const Optional &HeaderScope, - const SymbolGroup &SG, uint32_t Modi, CallbackT Callback) { - if (HeaderScope) { - HeaderScope->P.formatLine( - "Mod {0:4} | `{1}`: ", - fmt_align(Modi, AlignStyle::Right, HeaderScope->LabelWidth), SG.name()); - } - - AutoIndent Indent(HeaderScope); - Callback(Modi, SG); -} - -template -static void iterateSymbolGroups(InputFile &Input, - const Optional &HeaderScope, - CallbackT Callback) { - AutoIndent Indent(HeaderScope); - - ExitOnError Err("Unexpected error processing modules: "); - - if (opts::dump::DumpModi.getNumOccurrences() > 0) { - assert(opts::dump::DumpModi.getNumOccurrences() == 1); 
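// [Editor's note: illustrative sketch, not part of the vendored patch.]
// The dumpStreamSummary hunk above widens the loop counter from uint16_t to
// uint32_t. With a 16-bit counter and a 32-bit bound, a PDB holding more
// than 65535 streams would make the old loop wrap and never terminate:
#include <cstdint>

void visitStreams(uint32_t StreamCount) { // e.g. 70000
  // Pre-patch shape (BUG): a uint16_t StreamIdx wraps from 65535 back to 0,
  // so "StreamIdx < StreamCount" never becomes false once StreamCount > 65535:
  //   for (uint16_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) ...
  // Post-patch shape: a counter as wide as the bound terminates normally.
  for (uint32_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx)
    (void)StreamIdx;
}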
- uint32_t Modi = opts::dump::DumpModi; - SymbolGroup SG(&Input, Modi); - iterateOneModule(Input, withLabelWidth(HeaderScope, NumDigits(Modi)), SG, - Modi, Callback); - return; - } - - uint32_t I = 0; - - for (const auto &SG : Input.symbol_groups()) { - if (shouldDumpSymbolGroup(I, SG)) - iterateOneModule(Input, withLabelWidth(HeaderScope, NumDigits(I)), SG, I, - Callback); - - ++I; - } -} - -template -static void iterateModuleSubsections( - InputFile &File, const Optional &HeaderScope, - llvm::function_ref - Callback) { - - iterateSymbolGroups(File, HeaderScope, - [&](uint32_t Modi, const SymbolGroup &SG) { - for (const auto &SS : SG.getDebugSubsections()) { - SubsectionT Subsection; - - if (SS.kind() != Subsection.kind()) - continue; - - BinaryStreamReader Reader(SS.getRecordData()); - if (auto EC = Subsection.initialize(Reader)) - continue; - Callback(Modi, SG, Subsection); - } - }); -} - static Expected, ArrayRef>> loadSectionHeaders(PDBFile &File, DbgHeaderType Type) { @@ -504,7 +387,7 @@ loadSectionHeaders(PDBFile &File, DbgHeaderType Type) { "Section headers require a DBI Stream, which could not be loaded", inconvertibleErrorCode()); - auto &Dbi = cantFail(File.getPDBDbiStream()); + DbiStream &Dbi = cantFail(File.getPDBDbiStream()); uint32_t SI = Dbi.getDebugStreamIndex(Type); if (SI == kInvalidStreamIndex) @@ -529,10 +412,10 @@ loadSectionHeaders(PDBFile &File, DbgHeaderType Type) { return std::make_pair(std::move(Stream), Headers); } -static std::vector getSectionNames(PDBFile &File) { +static Expected> getSectionNames(PDBFile &File) { auto ExpectedHeaders = loadSectionHeaders(File, DbgHeaderType::SectionHdr); if (!ExpectedHeaders) - return {}; + return ExpectedHeaders.takeError(); std::unique_ptr Stream; ArrayRef Headers; @@ -590,31 +473,44 @@ Error DumpOutputStyle::dumpModules() { } AutoIndent Indent(P); - ExitOnError Err("Unexpected error processing modules: "); - auto &Stream = Err(getPdb().getPDBDbiStream()); + Expected StreamOrErr = getPdb().getPDBDbiStream(); + if (!StreamOrErr) + return StreamOrErr.takeError(); + DbiStream &Stream = *StreamOrErr; const DbiModuleList &Modules = Stream.modules(); - iterateSymbolGroups( - File, PrintScope{P, 11}, [&](uint32_t Modi, const SymbolGroup &Strings) { + return iterateSymbolGroups( + File, PrintScope{P, 11}, + [&](uint32_t Modi, const SymbolGroup &Strings) -> Error { auto Desc = Modules.getModuleDescriptor(Modi); if (opts::dump::DumpSectionContribs) { - std::vector Sections = getSectionNames(getPdb()); + auto SectionsOrErr = getSectionNames(getPdb()); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + ArrayRef Sections = *SectionsOrErr; dumpSectionContrib(P, Desc.getSectionContrib(), Sections, 0); } P.formatLine("Obj: `{0}`: ", Desc.getObjFileName()); P.formatLine("debug stream: {0}, # files: {1}, has ec info: {2}", Desc.getModuleStreamIndex(), Desc.getNumberOfFiles(), Desc.hasECInfo()); - StringRef PdbFilePath = - Err(Stream.getECName(Desc.getPdbFilePathNameIndex())); - StringRef SrcFilePath = - Err(Stream.getECName(Desc.getSourceFileNameIndex())); + + auto PdbPathOrErr = Stream.getECName(Desc.getPdbFilePathNameIndex()); + if (!PdbPathOrErr) + return PdbPathOrErr.takeError(); + StringRef PdbFilePath = *PdbPathOrErr; + + auto SrcPathOrErr = Stream.getECName(Desc.getSourceFileNameIndex()); + if (!SrcPathOrErr) + return SrcPathOrErr.takeError(); + StringRef SrcFilePath = *SrcPathOrErr; + P.formatLine("pdb file ni: {0} `{1}`, src file ni: {2} `{3}`", Desc.getPdbFilePathNameIndex(), PdbFilePath, Desc.getSourceFileNameIndex(), 
SrcFilePath); + return Error::success(); }); - return Error::success(); } Error DumpOutputStyle::dumpModuleFiles() { @@ -630,18 +526,20 @@ Error DumpOutputStyle::dumpModuleFiles() { return Error::success(); } - ExitOnError Err("Unexpected error processing modules: "); - - iterateSymbolGroups(File, PrintScope{P, 11}, - [this, &Err](uint32_t Modi, const SymbolGroup &Strings) { - auto &Stream = Err(getPdb().getPDBDbiStream()); + return iterateSymbolGroups( + File, PrintScope{P, 11}, + [this](uint32_t Modi, const SymbolGroup &Strings) -> Error { + Expected StreamOrErr = getPdb().getPDBDbiStream(); + if (!StreamOrErr) + return StreamOrErr.takeError(); + DbiStream &Stream = *StreamOrErr; - const DbiModuleList &Modules = Stream.modules(); - for (const auto &F : Modules.source_files(Modi)) { - Strings.formatFromFileName(P, F); - } - }); - return Error::success(); + const DbiModuleList &Modules = Stream.modules(); + for (const auto &F : Modules.source_files(Modi)) { + Strings.formatFromFileName(P, F); + } + return Error::success(); + }); } Error DumpOutputStyle::dumpSymbolStats() { @@ -652,39 +550,40 @@ Error DumpOutputStyle::dumpSymbolStats() { return Error::success(); } - ExitOnError Err("Unexpected error processing modules: "); - StatCollection SymStats; StatCollection ChunkStats; - - Optional Scope; - if (File.isPdb()) - Scope.emplace(P, 2); - - iterateSymbolGroups(File, Scope, [&](uint32_t Modi, const SymbolGroup &SG) { - StatCollection SS = getSymbolStats(SG, SymStats); - StatCollection CS = getChunkStats(SG, ChunkStats); - - if (SG.getFile().isPdb()) { - AutoIndent Indent(P); - auto Modules = cantFail(File.pdb().getPDBDbiStream()).modules(); - uint32_t ModCount = Modules.getModuleCount(); - DbiModuleDescriptor Desc = Modules.getModuleDescriptor(Modi); - uint32_t StreamIdx = Desc.getModuleStreamIndex(); - - if (StreamIdx == kInvalidStreamIndex) { - P.formatLine("Mod {0} (debug info not present): [{1}]", - fmt_align(Modi, AlignStyle::Right, NumDigits(ModCount)), - Desc.getModuleName()); - return; - } - P.formatLine("Stream {0}, {1} bytes", StreamIdx, - getPdb().getStreamByteSize(StreamIdx)); - - printModuleDetailStats(P, "Symbols", SS); - printModuleDetailStats(P, "Chunks", CS); - } - }); + PrintScope Scope(P, 2); + + if (Error Err = iterateSymbolGroups( + File, Scope, [&](uint32_t Modi, const SymbolGroup &SG) -> Error { + StatCollection SS = getSymbolStats(SG, SymStats); + StatCollection CS = getChunkStats(SG, ChunkStats); + + if (!SG.getFile().isPdb()) + return Error::success(); + + AutoIndent Indent(P); + auto Modules = cantFail(File.pdb().getPDBDbiStream()).modules(); + uint32_t ModCount = Modules.getModuleCount(); + DbiModuleDescriptor Desc = Modules.getModuleDescriptor(Modi); + uint32_t StreamIdx = Desc.getModuleStreamIndex(); + + if (StreamIdx == kInvalidStreamIndex) { + P.formatLine( + "Mod {0} (debug info not present): [{1}]", + fmt_align(Modi, AlignStyle::Right, NumDigits(ModCount)), + Desc.getModuleName()); + return Error::success(); + } + P.formatLine("Stream {0}, {1} bytes", StreamIdx, + getPdb().getStreamByteSize(StreamIdx)); + + printModuleDetailStats(P, "Symbols", SS); + printModuleDetailStats(P, "Chunks", CS); + + return Error::success(); + })) + return Err; if (SymStats.Totals.Count > 0) { P.printLine(" Summary |"); @@ -944,11 +843,11 @@ Error DumpOutputStyle::dumpLines() { uint32_t LastModi = UINT32_MAX; uint32_t LastNameIndex = UINT32_MAX; - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 4}, - [this, &LastModi, &LastNameIndex](uint32_t 
Modi, - const SymbolGroup &Strings, - DebugLinesSubsectionRef &Lines) { + [this, &LastModi, + &LastNameIndex](uint32_t Modi, const SymbolGroup &Strings, + DebugLinesSubsectionRef &Lines) -> Error { uint16_t Segment = Lines.header()->RelocSegment; uint32_t Begin = Lines.header()->RelocOffset; uint32_t End = Begin + Lines.header()->CodeSize; @@ -970,9 +869,8 @@ Error DumpOutputStyle::dumpLines() { P.NewLine(); typesetLinesAndColumns(P, Begin, Block); } + return Error::success(); }); - - return Error::success(); } Error DumpOutputStyle::dumpInlineeLines() { @@ -983,10 +881,10 @@ Error DumpOutputStyle::dumpInlineeLines() { return Error::success(); } - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 2}, [this](uint32_t Modi, const SymbolGroup &Strings, - DebugInlineeLinesSubsectionRef &Lines) { + DebugInlineeLinesSubsectionRef &Lines) -> Error { P.formatLine("{0,+8} | {1,+5} | {2}", "Inlinee", "Line", "Source File"); for (const auto &Entry : Lines) { P.formatLine("{0,+8} | {1,+5} | ", Entry.Header->Inlinee, @@ -998,9 +896,8 @@ Error DumpOutputStyle::dumpInlineeLines() { } } P.NewLine(); + return Error::success(); }); - - return Error::success(); } Error DumpOutputStyle::dumpXmi() { @@ -1011,10 +908,10 @@ Error DumpOutputStyle::dumpXmi() { return Error::success(); } - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 2}, [this](uint32_t Modi, const SymbolGroup &Strings, - DebugCrossModuleImportsSubsectionRef &Imports) { + DebugCrossModuleImportsSubsectionRef &Imports) -> Error { P.formatLine("{0,=32} | {1}", "Imported Module", "Type IDs"); for (const auto &Xmi : Imports) { @@ -1039,9 +936,8 @@ Error DumpOutputStyle::dumpXmi() { typesetItemList(TIs, P.getIndentLevel() + 35, 12, " "); P.formatLine("{0,+32} | {1}", Module, Result); } + return Error::success(); }); - - return Error::success(); } Error DumpOutputStyle::dumpXme() { @@ -1052,18 +948,17 @@ Error DumpOutputStyle::dumpXme() { return Error::success(); } - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 2}, [this](uint32_t Modi, const SymbolGroup &Strings, - DebugCrossModuleExportsSubsectionRef &Exports) { + DebugCrossModuleExportsSubsectionRef &Exports) -> Error { P.formatLine("{0,-10} | {1}", "Local ID", "Global ID"); for (const auto &Export : Exports) { P.formatLine("{0,+10:X+} | {1}", TypeIndex(Export.Local), TypeIndex(Export.Global)); } + return Error::success(); }); - - return Error::success(); } std::string formatFrameType(object::frame_type FT) { @@ -1084,7 +979,7 @@ Error DumpOutputStyle::dumpOldFpo(PDBFile &File) { printHeader(P, "Old FPO Data"); ExitOnError Err("Error dumping old fpo data:"); - auto &Dbi = Err(File.getPDBDbiStream()); + DbiStream &Dbi = Err(File.getPDBDbiStream()); if (!Dbi.hasOldFpoRecords()) { printStreamNotPresent("FPO"); @@ -1111,7 +1006,7 @@ Error DumpOutputStyle::dumpNewFpo(PDBFile &File) { printHeader(P, "New FPO Data"); ExitOnError Err("Error dumping new fpo data:"); - auto &Dbi = Err(File.getPDBDbiStream()); + DbiStream &Dbi = Err(File.getPDBDbiStream()); if (!Dbi.hasNewFpoRecords()) { printStreamNotPresent("New FPO"); @@ -1232,10 +1127,10 @@ Error DumpOutputStyle::dumpStringTableFromPdb() { } Error DumpOutputStyle::dumpStringTableFromObj() { - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 4}, [&](uint32_t Modi, const SymbolGroup &Strings, - DebugStringTableSubsectionRef &Strings2) { + DebugStringTableSubsectionRef &Strings2) -> Error { BinaryStreamRef StringTableBuffer = 
Strings2.getBuffer(); BinaryStreamReader Reader(StringTableBuffer); while (Reader.bytesRemaining() > 0) { @@ -1248,8 +1143,8 @@ Error DumpOutputStyle::dumpStringTableFromObj() { P.formatLine("{0} | {1}", fmt_align(Offset, AlignStyle::Right, 4), Str); } + return Error::success(); }); - return Error::success(); } Error DumpOutputStyle::dumpNamedStreams() { @@ -1352,10 +1247,16 @@ static void dumpPartialTypeStream(LinePrinter &Printer, for (const auto &I : TiList) { TypeIndex TI(I); - CVType Type = Types.getType(TI); - if (auto EC = codeview::visitTypeRecord(Type, TI, V)) - Printer.formatLine("An error occurred dumping type record {0}: {1}", TI, - toString(std::move(EC))); + if (TI.isSimple()) { + Printer.formatLine("{0} | {1}", fmt_align(I, AlignStyle::Right, Width), + Types.getTypeName(TI)); + } else if (Optional Type = Types.tryGetType(TI)) { + if (auto EC = codeview::visitTypeRecord(*Type, TI, V)) + Printer.formatLine("An error occurred dumping type record {0}: {1}", + TI, toString(std::move(EC))); + } else { + Printer.formatLine("Type {0} doesn't exist in TPI stream", TI); + } } } } @@ -1526,8 +1427,6 @@ Error DumpOutputStyle::dumpModuleSymsForObj() { AutoIndent Indent(P); - ExitOnError Err("Unexpected error processing symbols: "); - auto &Types = File.types(); SymbolVisitorCallbackPipeline Pipeline; @@ -1538,25 +1437,18 @@ Error DumpOutputStyle::dumpModuleSymsForObj() { Pipeline.addCallbackToPipeline(Dumper); CVSymbolVisitor Visitor(Pipeline); - std::unique_ptr SymbolError; - - iterateModuleSubsections( + return iterateModuleSubsections( File, PrintScope{P, 2}, [&](uint32_t Modi, const SymbolGroup &Strings, - DebugSymbolsSubsectionRef &Symbols) { + DebugSymbolsSubsectionRef &Symbols) -> Error { Dumper.setSymbolGroup(&Strings); for (auto Symbol : Symbols) { if (auto EC = Visitor.visitSymbolRecord(Symbol)) { - SymbolError = std::make_unique(std::move(EC)); - return; + return EC; } } + return Error::success(); }); - - if (SymbolError) - return std::move(*SymbolError); - - return Error::success(); } Error DumpOutputStyle::dumpModuleSymsForPdb() { @@ -1568,18 +1460,18 @@ Error DumpOutputStyle::dumpModuleSymsForPdb() { } AutoIndent Indent(P); - ExitOnError Err("Unexpected error processing symbols: "); auto &Ids = File.ids(); auto &Types = File.types(); - iterateSymbolGroups( - File, PrintScope{P, 2}, [&](uint32_t I, const SymbolGroup &Strings) { + return iterateSymbolGroups( + File, PrintScope{P, 2}, + [&](uint32_t I, const SymbolGroup &Strings) -> Error { auto ExpectedModS = getModuleDebugStream(File.pdb(), I); if (!ExpectedModS) { P.formatLine("Error loading module stream {0}. {1}", I, toString(ExpectedModS.takeError())); - return; + return Error::success(); } ModuleDebugStreamRef &ModS = *ExpectedModS; @@ -1593,14 +1485,25 @@ Error DumpOutputStyle::dumpModuleSymsForPdb() { Pipeline.addCallbackToPipeline(Dumper); CVSymbolVisitor Visitor(Pipeline); auto SS = ModS.getSymbolsSubstream(); - if (auto EC = - Visitor.visitSymbolStream(ModS.getSymbolArray(), SS.Offset)) { + if (opts::Filters.SymbolOffset) { + CVSymbolVisitor::FilterOptions Filter; + Filter.SymbolOffset = opts::Filters.SymbolOffset; + Filter.ParentRecursiveDepth = opts::Filters.ParentRecurseDepth; + Filter.ChildRecursiveDepth = opts::Filters.ChildrenRecurseDepth; + if (auto EC = Visitor.visitSymbolStreamFiltered(ModS.getSymbolArray(), + Filter)) { + P.formatLine("Error while processing symbol records. 
{0}", + toString(std::move(EC))); + return EC; + } + } else if (auto EC = Visitor.visitSymbolStream(ModS.getSymbolArray(), + SS.Offset)) { P.formatLine("Error while processing symbol records. {0}", toString(std::move(EC))); - return; + return EC; } + return Error::success(); }); - return Error::success(); } Error DumpOutputStyle::dumpTypeRefStats() { @@ -1925,7 +1828,7 @@ Error DumpOutputStyle::dumpSectionContribs() { AutoIndent Indent(P); ExitOnError Err("Error dumping section contributions: "); - auto &Dbi = Err(getPdb().getPDBDbiStream()); + DbiStream &Dbi = Err(getPdb().getPDBDbiStream()); class Visitor : public ISectionContribVisitor { public: @@ -1948,8 +1851,11 @@ Error DumpOutputStyle::dumpSectionContribs() { ArrayRef Names; }; - std::vector Names = getSectionNames(getPdb()); - Visitor V(P, makeArrayRef(Names)); + auto NamesOrErr = getSectionNames(getPdb()); + if (!NamesOrErr) + return NamesOrErr.takeError(); + ArrayRef Names = *NamesOrErr; + Visitor V(P, Names); Dbi.visitSectionContributions(V); return Error::success(); } @@ -1970,7 +1876,7 @@ Error DumpOutputStyle::dumpSectionMap() { AutoIndent Indent(P); ExitOnError Err("Error dumping section map: "); - auto &Dbi = Err(getPdb().getPDBDbiStream()); + DbiStream &Dbi = Err(getPdb().getPDBDbiStream()); uint32_t I = 0; for (auto &M : Dbi.getSectionMap()) { diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.h b/llvm/tools/llvm-pdbutil/DumpOutputStyle.h index 041fb93a18a5..217d25d66d8b 100644 --- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.h +++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.h @@ -9,13 +9,13 @@ #ifndef LLVM_TOOLS_LLVMPDBDUMP_DUMPOUTPUTSTYLE_H #define LLVM_TOOLS_LLVMPDBDUMP_DUMPOUTPUTSTYLE_H -#include "LinePrinter.h" #include "OutputStyle.h" #include "StreamUtil.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include diff --git a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp index b631bdf8f2b1..13a5f6ea6fe7 100644 --- a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp +++ b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp @@ -8,17 +8,20 @@ #include "ExplainOutputStyle.h" -#include "FormatUtil.h" -#include "InputFile.h" #include "StreamUtil.h" #include "llvm-pdbutil.h" #include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/DbiStream.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" #include "llvm/DebugInfo/PDB/Native/InfoStream.h" +#include "llvm/DebugInfo/PDB/Native/InputFile.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" +#include "llvm/Object/COFF.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Error.h" @@ -29,7 +32,7 @@ using namespace llvm::msf; using namespace llvm::pdb; ExplainOutputStyle::ExplainOutputStyle(InputFile &File, uint64_t FileOffset) - : File(File), FileOffset(FileOffset), P(2, false, outs()) {} + : File(File), FileOffset(FileOffset), P(2, false, outs(), opts::Filters) {} Error ExplainOutputStyle::dump() { P.formatLine("Explaining file offset {0} of file '{1}'.", FileOffset, diff --git a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h index 
f405cf615e92..e3d19f25a9ea 100644 --- a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h +++ b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h @@ -9,9 +9,10 @@ #ifndef LLVM_TOOLS_LLVMPDBDUMP_EXPLAINOUTPUTSTYLE_H #define LLVM_TOOLS_LLVMPDBDUMP_EXPLAINOUTPUTSTYLE_H -#include "LinePrinter.h" #include "OutputStyle.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" + #include namespace llvm { diff --git a/llvm/tools/llvm-pdbutil/FormatUtil.cpp b/llvm/tools/llvm-pdbutil/FormatUtil.cpp deleted file mode 100644 index b4837398f1d0..000000000000 --- a/llvm/tools/llvm-pdbutil/FormatUtil.cpp +++ /dev/null @@ -1,258 +0,0 @@ -//===- FormatUtil.cpp ----------------------------------------- *- C++ --*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "FormatUtil.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/Support/FormatAdapters.h" -#include "llvm/Support/FormatVariadic.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -std::string llvm::pdb::truncateStringBack(StringRef S, uint32_t MaxLen) { - if (MaxLen == 0 || S.size() <= MaxLen || S.size() <= 3) - return std::string(S); - - assert(MaxLen >= 3); - uint32_t FinalLen = std::min(S.size(), MaxLen - 3); - S = S.take_front(FinalLen); - return std::string(S) + std::string("..."); -} - -std::string llvm::pdb::truncateStringMiddle(StringRef S, uint32_t MaxLen) { - if (MaxLen == 0 || S.size() <= MaxLen || S.size() <= 3) - return std::string(S); - - assert(MaxLen >= 3); - uint32_t FinalLen = std::min(S.size(), MaxLen - 3); - StringRef Front = S.take_front(FinalLen / 2); - StringRef Back = S.take_back(Front.size()); - return std::string(Front) + std::string("...") + std::string(Back); -} - -std::string llvm::pdb::truncateStringFront(StringRef S, uint32_t MaxLen) { - if (MaxLen == 0 || S.size() <= MaxLen || S.size() <= 3) - return std::string(S); - - assert(MaxLen >= 3); - S = S.take_back(MaxLen - 3); - return std::string("...") + std::string(S); -} - -std::string llvm::pdb::truncateQuotedNameFront(StringRef Label, StringRef Name, - uint32_t MaxLen) { - uint32_t RequiredExtraChars = Label.size() + 1 + 2; - if (MaxLen == 0 || RequiredExtraChars + Name.size() <= MaxLen) - return formatv("{0} \"{1}\"", Label, Name).str(); - - assert(MaxLen >= RequiredExtraChars); - std::string TN = truncateStringFront(Name, MaxLen - RequiredExtraChars); - return formatv("{0} \"{1}\"", Label, TN).str(); -} - -std::string llvm::pdb::truncateQuotedNameBack(StringRef Label, StringRef Name, - uint32_t MaxLen) { - uint32_t RequiredExtraChars = Label.size() + 1 + 2; - if (MaxLen == 0 || RequiredExtraChars + Name.size() <= MaxLen) - return formatv("{0} \"{1}\"", Label, Name).str(); - - assert(MaxLen >= RequiredExtraChars); - std::string TN = truncateStringBack(Name, MaxLen - RequiredExtraChars); - return formatv("{0} \"{1}\"", Label, TN).str(); -} - -std::string llvm::pdb::typesetItemList(ArrayRef Opts, - uint32_t IndentLevel, uint32_t GroupSize, - StringRef Sep) { - std::string Result; - while (!Opts.empty()) { - ArrayRef ThisGroup; - ThisGroup = Opts.take_front(GroupSize); - Opts = Opts.drop_front(ThisGroup.size()); - Result += join(ThisGroup, Sep); - if (!Opts.empty()) { 
-      Result += Sep;
-      Result += "\n";
-      Result += std::string(formatv("{0}", fmt_repeat(' ', IndentLevel)));
-    }
-  }
-  return Result;
-}
-
-std::string llvm::pdb::typesetStringList(uint32_t IndentLevel,
-                                         ArrayRef<StringRef> Strings) {
-  std::string Result = "[";
-  for (const auto &S : Strings) {
-    Result += std::string(formatv("\n{0}{1}", fmt_repeat(' ', IndentLevel), S));
-  }
-  Result += "]";
-  return Result;
-}
-
-std::string llvm::pdb::formatChunkKind(DebugSubsectionKind Kind,
-                                       bool Friendly) {
-  if (Friendly) {
-    switch (Kind) {
-      RETURN_CASE(DebugSubsectionKind, None, "none");
-      RETURN_CASE(DebugSubsectionKind, Symbols, "symbols");
-      RETURN_CASE(DebugSubsectionKind, Lines, "lines");
-      RETURN_CASE(DebugSubsectionKind, StringTable, "strings");
-      RETURN_CASE(DebugSubsectionKind, FileChecksums, "checksums");
-      RETURN_CASE(DebugSubsectionKind, FrameData, "frames");
-      RETURN_CASE(DebugSubsectionKind, InlineeLines, "inlinee lines");
-      RETURN_CASE(DebugSubsectionKind, CrossScopeImports, "xmi");
-      RETURN_CASE(DebugSubsectionKind, CrossScopeExports, "xme");
-      RETURN_CASE(DebugSubsectionKind, ILLines, "il lines");
-      RETURN_CASE(DebugSubsectionKind, FuncMDTokenMap, "func md token map");
-      RETURN_CASE(DebugSubsectionKind, TypeMDTokenMap, "type md token map");
-      RETURN_CASE(DebugSubsectionKind, MergedAssemblyInput,
-                  "merged assembly input");
-      RETURN_CASE(DebugSubsectionKind, CoffSymbolRVA, "coff symbol rva");
-    }
-  } else {
-    switch (Kind) {
-      RETURN_CASE(DebugSubsectionKind, None, "none");
-      RETURN_CASE(DebugSubsectionKind, Symbols, "DEBUG_S_SYMBOLS");
-      RETURN_CASE(DebugSubsectionKind, Lines, "DEBUG_S_LINES");
-      RETURN_CASE(DebugSubsectionKind, StringTable, "DEBUG_S_STRINGTABLE");
-      RETURN_CASE(DebugSubsectionKind, FileChecksums, "DEBUG_S_FILECHKSMS");
-      RETURN_CASE(DebugSubsectionKind, FrameData, "DEBUG_S_FRAMEDATA");
-      RETURN_CASE(DebugSubsectionKind, InlineeLines, "DEBUG_S_INLINEELINES");
-      RETURN_CASE(DebugSubsectionKind, CrossScopeImports,
-                  "DEBUG_S_CROSSSCOPEIMPORTS");
-      RETURN_CASE(DebugSubsectionKind, CrossScopeExports,
-                  "DEBUG_S_CROSSSCOPEEXPORTS");
-      RETURN_CASE(DebugSubsectionKind, ILLines, "DEBUG_S_IL_LINES");
-      RETURN_CASE(DebugSubsectionKind, FuncMDTokenMap,
-                  "DEBUG_S_FUNC_MDTOKEN_MAP");
-      RETURN_CASE(DebugSubsectionKind, TypeMDTokenMap,
-                  "DEBUG_S_TYPE_MDTOKEN_MAP");
-      RETURN_CASE(DebugSubsectionKind, MergedAssemblyInput,
-                  "DEBUG_S_MERGED_ASSEMBLYINPUT");
-      RETURN_CASE(DebugSubsectionKind, CoffSymbolRVA,
-                  "DEBUG_S_COFF_SYMBOL_RVA");
-    }
-  }
-  return formatUnknownEnum(Kind);
-}
-
-std::string llvm::pdb::formatSymbolKind(SymbolKind K) {
-  switch (uint32_t(K)) {
-#define SYMBOL_RECORD(EnumName, value, name)                                   \
-  case EnumName:                                                               \
-    return #EnumName;
-#define CV_SYMBOL(EnumName, value) SYMBOL_RECORD(EnumName, value, EnumName)
-#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
-  }
-  return formatUnknownEnum(K);
-}
-
-std::string llvm::pdb::formatTypeLeafKind(TypeLeafKind K) {
-  switch (K) {
-#define TYPE_RECORD(EnumName, value, name)                                     \
-  case EnumName:                                                               \
-    return #EnumName;
-#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
-  default:
-    return formatv("UNKNOWN RECORD ({0:X})",
-                   static_cast<std::underlying_type_t<TypeLeafKind>>(K))
-        .str();
-  }
-}
-
-std::string llvm::pdb::formatSegmentOffset(uint16_t Segment, uint32_t Offset) {
-  return std::string(formatv("{0:4}:{1:4}", Segment, Offset));
-}
-
-#define PUSH_CHARACTERISTIC_FLAG(Enum, TheOpt, Value, Style, Descriptive)      \
-  PUSH_FLAG(Enum, TheOpt, Value,                                               \
-            ((Style == CharacteristicStyle::HeaderDefinition) ? #TheOpt        \
-                                                              : Descriptive))
-
-#define PUSH_MASKED_CHARACTERISTIC_FLAG(Enum, Mask, TheOpt, Value, Style,      \
-                                        Descriptive)                           \
-  PUSH_MASKED_FLAG(Enum, Mask, TheOpt, Value,                                  \
-                   ((Style == CharacteristicStyle::HeaderDefinition)           \
-                        ? #TheOpt                                              \
-                        : Descriptive))
-
-std::string llvm::pdb::formatSectionCharacteristics(uint32_t IndentLevel,
-                                                    uint32_t C,
-                                                    uint32_t FlagsPerLine,
-                                                    StringRef Separator,
-                                                    CharacteristicStyle Style) {
-  using SC = COFF::SectionCharacteristics;
-  std::vector<std::string> Opts;
-  if (C == COFF::SC_Invalid)
-    return "invalid";
-  if (C == 0)
-    return "none";
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_TYPE_NOLOAD, C, Style, "noload");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_TYPE_NO_PAD, C, Style, "no padding");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_CODE, C, Style, "code");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_INITIALIZED_DATA, C, Style,
-                           "initialized data");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_CNT_UNINITIALIZED_DATA, C, Style,
-                           "uninitialized data");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_OTHER, C, Style, "other");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_INFO, C, Style, "info");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_REMOVE, C, Style, "remove");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_COMDAT, C, Style, "comdat");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_GPREL, C, Style, "gp rel");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_PURGEABLE, C, Style, "purgeable");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_16BIT, C, Style, "16-bit");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_LOCKED, C, Style, "locked");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_PRELOAD, C, Style, "preload");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_1BYTES, C,
-                                  Style, "1 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_2BYTES, C,
-                                  Style, "2 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_4BYTES, C,
-                                  Style, "4 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_8BYTES, C,
-                                  Style, "8 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_16BYTES, C,
-                                  Style, "16 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_32BYTES, C,
-                                  Style, "32 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_64BYTES, C,
-                                  Style, "64 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_128BYTES, C,
-                                  Style, "128 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_256BYTES, C,
-                                  Style, "256 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_512BYTES, C,
-                                  Style, "512 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_1024BYTES, C,
-                                  Style, "1024 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_2048BYTES, C,
-                                  Style, "2048 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_4096BYTES, C,
-                                  Style, "4096 byte align");
-  PUSH_MASKED_CHARACTERISTIC_FLAG(SC, 0xF00000, IMAGE_SCN_ALIGN_8192BYTES, C,
-                                  Style, "8192 byte align");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_LNK_NRELOC_OVFL, C, Style,
-                           "noreloc overflow");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_DISCARDABLE, C, Style,
-                           "discardable");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_NOT_CACHED, C, Style,
-                           "not cached");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_NOT_PAGED, C, Style, "not paged");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_SHARED, C, Style, "shared");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_EXECUTE, C, Style,
-                           "execute permissions");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_READ, C, Style,
-                           "read permissions");
-  PUSH_CHARACTERISTIC_FLAG(SC, IMAGE_SCN_MEM_WRITE, C, Style,
-                           "write permissions");
-  return typesetItemList(Opts, IndentLevel, FlagsPerLine, Separator);
-}
diff --git a/llvm/tools/llvm-pdbutil/FormatUtil.h b/llvm/tools/llvm-pdbutil/FormatUtil.h
deleted file mode 100644
index b99ccec215b5..000000000000
--- a/llvm/tools/llvm-pdbutil/FormatUtil.h
+++ /dev/null
@@ -1,141 +0,0 @@
-//===- FormatUtil.h ------------------------------------------- *- C++ --*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBUTIL_FORMAT_UTIL_H
-#define LLVM_TOOLS_LLVMPDBUTIL_FORMAT_UTIL_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/FormatAdapters.h"
-#include "llvm/Support/FormatVariadic.h"
-
-#include <string>
-#include <type_traits>
-
-namespace llvm {
-namespace pdb {
-
-std::string truncateStringBack(StringRef S, uint32_t MaxLen);
-std::string truncateStringMiddle(StringRef S, uint32_t MaxLen);
-std::string truncateStringFront(StringRef S, uint32_t MaxLen);
-std::string truncateQuotedNameFront(StringRef Label, StringRef Name,
-                                    uint32_t MaxLen);
-std::string truncateQuotedNameBack(StringRef Label, StringRef Name,
-                                   uint32_t MaxLen);
-
-#define PUSH_MASKED_FLAG(Enum, Mask, TheOpt, Value, Text)                      \
-  if (Enum::TheOpt == (Value & Mask))                                          \
-    Opts.push_back(Text);
-
-#define PUSH_FLAG(Enum, TheOpt, Value, Text)                                   \
-  PUSH_MASKED_FLAG(Enum, Enum::TheOpt, TheOpt, Value, Text)
-
-#define RETURN_CASE(Enum, X, Ret)                                              \
-  case Enum::X:                                                                \
-    return Ret;
-
-template <typename T> std::string formatUnknownEnum(T Value) {
-  return formatv("unknown ({0})",
-                 static_cast<std::underlying_type_t<T>>(Value))
-      .str();
-}
-
-std::string formatSegmentOffset(uint16_t Segment, uint32_t Offset);
-
-enum class CharacteristicStyle {
-  HeaderDefinition, // format as windows header definition
-  Descriptive,      // format as human readable words
-};
-std::string formatSectionCharacteristics(
-    uint32_t IndentLevel, uint32_t C, uint32_t FlagsPerLine,
-    StringRef Separator,
-    CharacteristicStyle Style = CharacteristicStyle::HeaderDefinition);
-
-std::string typesetItemList(ArrayRef<std::string> Opts, uint32_t IndentLevel,
-                            uint32_t GroupSize, StringRef Sep);
-
-std::string typesetStringList(uint32_t IndentLevel,
-                              ArrayRef<StringRef> Strings);
-
-std::string formatChunkKind(codeview::DebugSubsectionKind Kind,
-                            bool Friendly = true);
-std::string formatSymbolKind(codeview::SymbolKind K);
-std::string formatTypeLeafKind(codeview::TypeLeafKind K);
-
-/// Returns the number of digits in the given integer.
-inline int NumDigits(uint64_t N) {
-  if (N < 10ULL)
-    return 1;
-  if (N < 100ULL)
-    return 2;
-  if (N < 1000ULL)
-    return 3;
-  if (N < 10000ULL)
-    return 4;
-  if (N < 100000ULL)
-    return 5;
-  if (N < 1000000ULL)
-    return 6;
-  if (N < 10000000ULL)
-    return 7;
-  if (N < 100000000ULL)
-    return 8;
-  if (N < 1000000000ULL)
-    return 9;
-  if (N < 10000000000ULL)
-    return 10;
-  if (N < 100000000000ULL)
-    return 11;
-  if (N < 1000000000000ULL)
-    return 12;
-  if (N < 10000000000000ULL)
-    return 13;
-  if (N < 100000000000000ULL)
-    return 14;
-  if (N < 1000000000000000ULL)
-    return 15;
-  if (N < 10000000000000000ULL)
-    return 16;
-  if (N < 100000000000000000ULL)
-    return 17;
-  if (N < 1000000000000000000ULL)
-    return 18;
-  if (N < 10000000000000000000ULL)
-    return 19;
-  return 20;
-}
-
-namespace detail {
-template <typename T>
-struct EndianAdapter final
-    : public FormatAdapter<support::detail::packed_endian_specific_integral<
-          T, support::little, support::unaligned>> {
-  using EndianType =
-      support::detail::packed_endian_specific_integral<T, support::little,
-                                                       support::unaligned>;
-
-  explicit EndianAdapter(EndianType &&Item)
-      : FormatAdapter<EndianType>(std::move(Item)) {}
-
-  void format(llvm::raw_ostream &Stream, StringRef Style) override {
-    format_provider<T>::format(static_cast<T>(this->Item), Stream, Style);
-  }
-};
-} // namespace detail
-
-template <typename T>
-detail::EndianAdapter<T>
-fmtle(support::detail::packed_endian_specific_integral<T, support::little,
-                                                       support::unaligned>
-          Value) {
-  return detail::EndianAdapter<T>(std::move(Value));
-}
-}
-}
-} // namespace llvm
-#endif
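// Usage sketch for the fmtle adapter deleted above (hypothetical value;
// support::ulittle32_t is the unaligned little-endian uint32_t alias from
// llvm/Support/Endian.h):
//
//   support::ulittle32_t Val;
//   Val = 0x1234;
//   outs() << formatv("{0:X}", fmtle(Val)); // formats as the native uint32_t
//
// NumDigits is a plain comparison ladder, so NumDigits(999) == 3 and
// NumDigits(1000) == 4; it is typically used to size printed columns.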
diff --git a/llvm/tools/llvm-pdbutil/InputFile.cpp b/llvm/tools/llvm-pdbutil/InputFile.cpp
deleted file mode 100644
index 40b35625b6f8..000000000000
--- a/llvm/tools/llvm-pdbutil/InputFile.cpp
+++ /dev/null
@@ -1,510 +0,0 @@
-//===- InputFile.cpp ------------------------------------------ *- C++ --*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "InputFile.h"
-
-#include "FormatUtil.h"
-#include "LinePrinter.h"
-
-#include "llvm/BinaryFormat/Magic.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
-#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
-#include "llvm/DebugInfo/PDB/Native/RawError.h"
-#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
-#include "llvm/DebugInfo/PDB/PDB.h"
-#include "llvm/Object/COFF.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormatVariadic.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::object;
-using namespace llvm::pdb;
-
-InputFile::InputFile() {}
-InputFile::~InputFile() {}
-
-static Expected<ModuleDebugStreamRef>
-getModuleDebugStream(PDBFile &File, StringRef &ModuleName, uint32_t Index) {
-  ExitOnError Err("Unexpected error: ");
-
-  auto &Dbi = Err(File.getPDBDbiStream());
-  const auto &Modules = Dbi.modules();
-  if (Index >= Modules.getModuleCount())
-    return make_error<RawError>(raw_error_code::index_out_of_bounds,
-                                "Invalid module index");
-
-  auto Modi = Modules.getModuleDescriptor(Index);
-
-  ModuleName = Modi.getModuleName();
-
-  uint16_t ModiStream = Modi.getModuleStreamIndex();
-  if (ModiStream == kInvalidStreamIndex)
-    return make_error<RawError>(raw_error_code::no_stream,
-                                "Module stream not present");
-
-  auto ModStreamData = File.createIndexedStream(ModiStream);
-
-  ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData));
-  if (auto EC = ModS.reload())
-    return make_error<RawError>(raw_error_code::corrupt_file,
-                                "Invalid module stream");
-
-  return std::move(ModS);
-}
-
-static inline bool isCodeViewDebugSubsection(object::SectionRef Section,
-                                             StringRef Name,
-                                             BinaryStreamReader &Reader) {
-  if (Expected<StringRef> NameOrErr = Section.getName()) {
-    if (*NameOrErr != Name)
-      return false;
-  } else {
-    consumeError(NameOrErr.takeError());
-    return false;
-  }
-
-  Expected<StringRef> ContentsOrErr = Section.getContents();
-  if (!ContentsOrErr) {
-    consumeError(ContentsOrErr.takeError());
-    return false;
-  }
-
-  Reader = BinaryStreamReader(*ContentsOrErr, support::little);
-  uint32_t Magic;
-  if (Reader.bytesRemaining() < sizeof(uint32_t))
-    return false;
-  cantFail(Reader.readInteger(Magic));
-  if (Magic != COFF::DEBUG_SECTION_MAGIC)
-    return false;
-  return true;
-}
-
-static inline bool isDebugSSection(object::SectionRef Section,
-                                   DebugSubsectionArray &Subsections) {
-  BinaryStreamReader Reader;
-  if (!isCodeViewDebugSubsection(Section, ".debug$S", Reader))
-    return false;
-
-  cantFail(Reader.readArray(Subsections, Reader.bytesRemaining()));
-  return true;
-}
-
-static bool isDebugTSection(SectionRef Section, CVTypeArray &Types) {
-  BinaryStreamReader Reader;
-  if (!isCodeViewDebugSubsection(Section, ".debug$T", Reader) &&
-      !isCodeViewDebugSubsection(Section, ".debug$P", Reader))
-    return false;
-  cantFail(Reader.readArray(Types, Reader.bytesRemaining()));
-  return true;
-}
-
-static std::string formatChecksumKind(FileChecksumKind Kind) {
-  switch (Kind) {
-    RETURN_CASE(FileChecksumKind, None, "None");
-    RETURN_CASE(FileChecksumKind, MD5, "MD5");
-    RETURN_CASE(FileChecksumKind, SHA1, "SHA-1");
-    RETURN_CASE(FileChecksumKind, SHA256, "SHA-256");
-  }
-  return formatUnknownEnum(Kind);
-}
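// The section-classification helpers above share one pattern: match the
// section name, then read and check the 4-byte CodeView magic at the start
// of its contents. A condensed sketch of that check (hypothetical Contents
// buffer, names chosen for illustration):
//
//   BinaryStreamReader R(Contents, support::little);
//   uint32_t Magic = 0;
//   bool IsCodeView = R.bytesRemaining() >= sizeof(uint32_t) &&
//                     !errorToBool(R.readInteger(Magic)) &&
//                     Magic == COFF::DEBUG_SECTION_MAGIC;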
-
-template <typename... Args>
-static void formatInternal(LinePrinter &Printer, bool Append, Args &&... args) {
-  if (Append)
-    Printer.format(std::forward<Args>(args)...);
-  else
-    Printer.formatLine(std::forward<Args>(args)...);
-}
-
-SymbolGroup::SymbolGroup(InputFile *File, uint32_t GroupIndex) : File(File) {
-  if (!File)
-    return;
-
-  if (File->isPdb())
-    initializeForPdb(GroupIndex);
-  else {
-    Name = ".debug$S";
-    uint32_t I = 0;
-    for (const auto &S : File->obj().sections()) {
-      DebugSubsectionArray SS;
-      if (!isDebugSSection(S, SS))
-        continue;
-
-      if (!SC.hasChecksums() || !SC.hasStrings())
-        SC.initialize(SS);
-
-      if (I == GroupIndex)
-        Subsections = SS;
-
-      if (SC.hasChecksums() && SC.hasStrings())
-        break;
-    }
-    rebuildChecksumMap();
-  }
-}
-
-StringRef SymbolGroup::name() const { return Name; }
-
-void SymbolGroup::updateDebugS(const codeview::DebugSubsectionArray &SS) {
-  Subsections = SS;
-}
-
-void SymbolGroup::updatePdbModi(uint32_t Modi) { initializeForPdb(Modi); }
-
-void SymbolGroup::initializeForPdb(uint32_t Modi) {
-  assert(File && File->isPdb());
-
-  // PDB always uses the same string table, but each module has its own
-  // checksums. So we only set the strings if they're not already set.
-  if (!SC.hasStrings()) {
-    auto StringTable = File->pdb().getStringTable();
-    if (StringTable)
-      SC.setStrings(StringTable->getStringTable());
-    else
-      consumeError(StringTable.takeError());
-  }
-
-  SC.resetChecksums();
-  auto MDS = getModuleDebugStream(File->pdb(), Name, Modi);
-  if (!MDS) {
-    consumeError(MDS.takeError());
-    return;
-  }
-
-  DebugStream = std::make_shared<ModuleDebugStreamRef>(std::move(*MDS));
-  Subsections = DebugStream->getSubsectionsArray();
-  SC.initialize(Subsections);
-  rebuildChecksumMap();
-}
-
-void SymbolGroup::rebuildChecksumMap() {
-  if (!SC.hasChecksums())
-    return;
-
-  for (const auto &Entry : SC.checksums()) {
-    auto S = SC.strings().getString(Entry.FileNameOffset);
-    if (!S)
-      continue;
-    ChecksumsByFile[*S] = Entry;
-  }
-}
-
-const ModuleDebugStreamRef &SymbolGroup::getPdbModuleStream() const {
-  assert(File && File->isPdb() && DebugStream);
-  return *DebugStream;
-}
-
-Expected<StringRef> SymbolGroup::getNameFromStringTable(uint32_t Offset) const {
-  return SC.strings().getString(Offset);
-}
-
-void SymbolGroup::formatFromFileName(LinePrinter &Printer, StringRef File,
-                                     bool Append) const {
-  auto FC = ChecksumsByFile.find(File);
-  if (FC == ChecksumsByFile.end()) {
-    formatInternal(Printer, Append, "- (no checksum) {0}", File);
-    return;
-  }
-
-  formatInternal(Printer, Append, "- ({0}: {1}) {2}",
-                 formatChecksumKind(FC->getValue().Kind),
-                 toHex(FC->getValue().Checksum), File);
-}
-
-void SymbolGroup::formatFromChecksumsOffset(LinePrinter &Printer,
-                                            uint32_t Offset,
-                                            bool Append) const {
-  if (!SC.hasChecksums()) {
-    formatInternal(Printer, Append, "(unknown file name offset {0})", Offset);
-    return;
-  }
-
-  auto Iter = SC.checksums().getArray().at(Offset);
-  if (Iter == SC.checksums().getArray().end()) {
-    formatInternal(Printer, Append, "(unknown file name offset {0})", Offset);
-    return;
-  }
-
-  uint32_t FO = Iter->FileNameOffset;
-  auto ExpectedFile = getNameFromStringTable(FO);
-  if (!ExpectedFile) {
-    formatInternal(Printer, Append, "(unknown file name offset {0})", Offset);
-    consumeError(ExpectedFile.takeError());
-    return;
-  }
-  if (Iter->Kind == FileChecksumKind::None) {
-    formatInternal(Printer, Append, "{0} (no checksum)", *ExpectedFile);
-  } else {
-    formatInternal(Printer, Append, "{0} ({1}: {2})", *ExpectedFile,
-                   formatChecksumKind(Iter->Kind), toHex(Iter->Checksum));
-  }
-}
-
-Expected<InputFile> InputFile::open(StringRef Path, bool AllowUnknownFile) {
-  InputFile IF;
-  if (!llvm::sys::fs::exists(Path))
-    return make_error<StringError>(formatv("File {0} not found", Path),
-                                   inconvertibleErrorCode());
-
-  file_magic Magic;
-  if (auto EC = identify_magic(Path, Magic))
-    return make_error<StringError>(
-        formatv("Unable to identify file type for file {0}", Path), EC);
-
-  if (Magic == file_magic::coff_object) {
-    Expected<OwningBinary<Binary>> BinaryOrErr = createBinary(Path);
-    if (!BinaryOrErr)
-      return BinaryOrErr.takeError();
-
-    IF.CoffObject = std::move(*BinaryOrErr);
-    IF.PdbOrObj = llvm::cast<COFFObjectFile>(IF.CoffObject.getBinary());
-    return std::move(IF);
-  }
-
-  if (Magic == file_magic::pdb) {
-    std::unique_ptr<IPDBSession> Session;
-    if (auto Err = loadDataForPDB(PDB_ReaderType::Native, Path, Session))
-      return std::move(Err);
-
-    IF.PdbSession.reset(static_cast<NativeSession *>(Session.release()));
-    IF.PdbOrObj = &IF.PdbSession->getPDBFile();
-
-    return std::move(IF);
-  }
-
-  if (!AllowUnknownFile)
-    return make_error<StringError>(
-        formatv("File {0} is not a supported file type", Path),
-        inconvertibleErrorCode());
-
-  auto Result = MemoryBuffer::getFile(Path, /*IsText=*/false,
-                                      /*RequiresNullTerminator=*/false);
-  if (!Result)
-    return make_error<StringError>(
-        formatv("File {0} could not be opened", Path), Result.getError());
-
-  IF.UnknownFile = std::move(*Result);
-  IF.PdbOrObj = IF.UnknownFile.get();
-  return std::move(IF);
-}
-
-PDBFile &InputFile::pdb() {
-  assert(isPdb());
-  return *PdbOrObj.get<PDBFile *>();
-}
-
-const PDBFile &InputFile::pdb() const {
-  assert(isPdb());
-  return *PdbOrObj.get<PDBFile *>();
-}
-
-object::COFFObjectFile &InputFile::obj() {
-  assert(isObj());
-  return *PdbOrObj.get<object::COFFObjectFile *>();
-}
-
-const object::COFFObjectFile &InputFile::obj() const {
-  assert(isObj());
-  return *PdbOrObj.get<object::COFFObjectFile *>();
-}
-
-MemoryBuffer &InputFile::unknown() {
-  assert(isUnknown());
-  return *PdbOrObj.get<MemoryBuffer *>();
-}
-
-const MemoryBuffer &InputFile::unknown() const {
-  assert(isUnknown());
-  return *PdbOrObj.get<MemoryBuffer *>();
-}
-
-StringRef InputFile::getFilePath() const {
-  if (isPdb())
-    return pdb().getFilePath();
-  if (isObj())
-    return obj().getFileName();
-  assert(isUnknown());
-  return unknown().getBufferIdentifier();
-}
-
-bool InputFile::hasTypes() const {
-  if (isPdb())
-    return pdb().hasPDBTpiStream();
-
-  for (const auto &Section : obj().sections()) {
-    CVTypeArray Types;
-    if (isDebugTSection(Section, Types))
-      return true;
-  }
-  return false;
-}
-
-bool InputFile::hasIds() const {
-  if (isObj())
-    return false;
-  return pdb().hasPDBIpiStream();
-}
-
-bool InputFile::isPdb() const { return PdbOrObj.is<PDBFile *>(); }
-
-bool InputFile::isObj() const {
-  return PdbOrObj.is<object::COFFObjectFile *>();
-}
-
-bool InputFile::isUnknown() const { return PdbOrObj.is<MemoryBuffer *>(); }
-
-codeview::LazyRandomTypeCollection &
-InputFile::getOrCreateTypeCollection(TypeCollectionKind Kind) {
-  if (Types && Kind == kTypes)
-    return *Types;
-  if (Ids && Kind == kIds)
-    return *Ids;
-
-  if (Kind == kIds) {
-    assert(isPdb() && pdb().hasPDBIpiStream());
-  }
-
-  // If the collection was already initialized, we should have just returned it
-  // in step 1.
-  if (isPdb()) {
-    TypeCollectionPtr &Collection = (Kind == kIds) ? Ids : Types;
-    auto &Stream = cantFail((Kind == kIds) ? pdb().getPDBIpiStream()
-                                           : pdb().getPDBTpiStream());
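// Typical use of the InputFile API implemented above (hypothetical path;
// minimal error handling shown):
//
//   Expected<InputFile> EIF = InputFile::open("sample.pdb");
//   if (!EIF)
//     return EIF.takeError();
//   if (EIF->isPdb()) {
//     PDBFile &File = EIF->pdb();
//     // ... query File; for COFF inputs use isObj()/obj() instead.
//   }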
-
-    auto &Array = Stream.typeArray();
-    uint32_t Count = Stream.getNumTypeRecords();
-    auto Offsets = Stream.getTypeIndexOffsets();
-    Collection =
-        std::make_unique<LazyRandomTypeCollection>(Array, Count, Offsets);
-    return *Collection;
-  }
-
-  assert(isObj());
-  assert(Kind == kTypes);
-  assert(!Types);
-
-  for (const auto &Section : obj().sections()) {
-    CVTypeArray Records;
-    if (!isDebugTSection(Section, Records))
-      continue;
-
-    Types = std::make_unique<LazyRandomTypeCollection>(Records, 100);
-    return *Types;
-  }
-
-  Types = std::make_unique<LazyRandomTypeCollection>(100);
-  return *Types;
-}
-
-codeview::LazyRandomTypeCollection &InputFile::types() {
-  return getOrCreateTypeCollection(kTypes);
-}
-
-codeview::LazyRandomTypeCollection &InputFile::ids() {
-  // Object files have only one type stream that contains both types and ids.
-  // Similarly, some PDBs don't contain an IPI stream, and for those both types
-  // and IDs are in the same stream.
-  if (isObj() || !pdb().hasPDBIpiStream())
-    return types();
-
-  return getOrCreateTypeCollection(kIds);
-}
-
-iterator_range<SymbolGroupIterator> InputFile::symbol_groups() {
-  return make_range<SymbolGroupIterator>(symbol_groups_begin(),
-                                         symbol_groups_end());
-}
-
-SymbolGroupIterator InputFile::symbol_groups_begin() {
-  return SymbolGroupIterator(*this);
-}
-
-SymbolGroupIterator InputFile::symbol_groups_end() {
-  return SymbolGroupIterator();
-}
-
-SymbolGroupIterator::SymbolGroupIterator() : Value(nullptr) {}
-
-SymbolGroupIterator::SymbolGroupIterator(InputFile &File) : Value(&File) {
-  if (File.isObj()) {
-    SectionIter = File.obj().section_begin();
-    scanToNextDebugS();
-  }
-}
-
-bool SymbolGroupIterator::operator==(const SymbolGroupIterator &R) const {
-  bool E = isEnd();
-  bool RE = R.isEnd();
-  if (E || RE)
-    return E == RE;
-
-  if (Value.File != R.Value.File)
-    return false;
-  return Index == R.Index;
-}
-
-const SymbolGroup &SymbolGroupIterator::operator*() const {
-  assert(!isEnd());
-  return Value;
-}
-SymbolGroup &SymbolGroupIterator::operator*() {
-  assert(!isEnd());
-  return Value;
-}
-
-SymbolGroupIterator &SymbolGroupIterator::operator++() {
-  assert(Value.File && !isEnd());
-  ++Index;
-  if (isEnd())
-    return *this;
-
-  if (Value.File->isPdb()) {
-    Value.updatePdbModi(Index);
-    return *this;
-  }
-
-  scanToNextDebugS();
-  return *this;
-}
-
-void SymbolGroupIterator::scanToNextDebugS() {
-  assert(SectionIter.hasValue());
-  auto End = Value.File->obj().section_end();
-  auto &Iter = *SectionIter;
-  assert(!isEnd());
-
-  while (++Iter != End) {
-    DebugSubsectionArray SS;
-    SectionRef SR = *Iter;
-    if (!isDebugSSection(SR, SS))
-      continue;
-
-    Value.updateDebugS(SS);
-    return;
-  }
-}
-
-bool SymbolGroupIterator::isEnd() const {
-  if (!Value.File)
-    return true;
-  if (Value.File->isPdb()) {
-    auto &Dbi = cantFail(Value.File->pdb().getPDBDbiStream());
-    uint32_t Count = Dbi.modules().getModuleCount();
-    assert(Index <= Count);
-    return Index == Count;
-  }
-
-  assert(SectionIter.hasValue());
-  return *SectionIter == Value.File->obj().section_end();
-}
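// Iteration sketch for the symbol-group API implemented above (assumes an
// already-opened InputFile named IF; names are illustrative only):
//
//   for (const SymbolGroup &SG : IF.symbol_groups())
//     outs() << SG.name() << "\n";
//
// For a PDB this visits one group per module in the DBI stream; for a COFF
// object it visits one group per .debug$S section.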
diff --git a/llvm/tools/llvm-pdbutil/InputFile.h b/llvm/tools/llvm-pdbutil/InputFile.h
deleted file mode 100644
index 633ab34a54d4..000000000000
--- a/llvm/tools/llvm-pdbutil/InputFile.h
+++ /dev/null
@@ -1,154 +0,0 @@
-//===- InputFile.h -------------------------------------------- *- C++ --*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBDUMP_INPUTFILE_H
-#define LLVM_TOOLS_LLVMPDBDUMP_INPUTFILE_H
-
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PointerUnion.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
-#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
-#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
-#include "llvm/Object/Binary.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Error.h"
-
-namespace llvm {
-namespace codeview {
-class LazyRandomTypeCollection;
-}
-namespace object {
-class COFFObjectFile;
-} // namespace object
-
-namespace pdb {
-class InputFile;
-class LinePrinter;
-class PDBFile;
-class NativeSession;
-class SymbolGroupIterator;
-class SymbolGroup;
-
-class InputFile {
-  InputFile();
-
-  std::unique_ptr<NativeSession> PdbSession;
-  object::OwningBinary<object::Binary> CoffObject;
-  std::unique_ptr<MemoryBuffer> UnknownFile;
-  PointerUnion<PDBFile *, object::COFFObjectFile *, MemoryBuffer *> PdbOrObj;
-
-  using TypeCollectionPtr = std::unique_ptr<codeview::LazyRandomTypeCollection>;
-
-  TypeCollectionPtr Types;
-  TypeCollectionPtr Ids;
-
-  enum TypeCollectionKind { kTypes, kIds };
-  codeview::LazyRandomTypeCollection &
-  getOrCreateTypeCollection(TypeCollectionKind Kind);
-
-public:
-  ~InputFile();
-  InputFile(InputFile &&Other) = default;
-
-  static Expected<InputFile> open(StringRef Path,
-                                  bool AllowUnknownFile = false);
-
-  PDBFile &pdb();
-  const PDBFile &pdb() const;
-  object::COFFObjectFile &obj();
-  const object::COFFObjectFile &obj() const;
-  MemoryBuffer &unknown();
-  const MemoryBuffer &unknown() const;
-
-  StringRef getFilePath() const;
-
-  bool hasTypes() const;
-  bool hasIds() const;
-
-  codeview::LazyRandomTypeCollection &types();
-  codeview::LazyRandomTypeCollection &ids();
-
-  iterator_range<SymbolGroupIterator> symbol_groups();
-  SymbolGroupIterator symbol_groups_begin();
-  SymbolGroupIterator symbol_groups_end();
-
-  bool isPdb() const;
-  bool isObj() const;
-  bool isUnknown() const;
-};
-
-class SymbolGroup {
-  friend class SymbolGroupIterator;
-
-public:
-  explicit SymbolGroup(InputFile *File, uint32_t GroupIndex = 0);
-
-  Expected<StringRef> getNameFromStringTable(uint32_t Offset) const;
-
-  void formatFromFileName(LinePrinter &Printer, StringRef File,
-                          bool Append = false) const;
-
-  void formatFromChecksumsOffset(LinePrinter &Printer, uint32_t Offset,
-                                 bool Append = false) const;
-
-  StringRef name() const;
-
-  codeview::DebugSubsectionArray getDebugSubsections() const {
-    return Subsections;
-  }
-  const ModuleDebugStreamRef &getPdbModuleStream() const;
-
-  const InputFile &getFile() const { return *File; }
-  InputFile &getFile() { return *File; }
-
-  bool hasDebugStream() const { return DebugStream != nullptr; }
-
-private:
-  void initializeForPdb(uint32_t Modi);
-  void updatePdbModi(uint32_t Modi);
-  void updateDebugS(const codeview::DebugSubsectionArray &SS);
-
-  void rebuildChecksumMap();
-  InputFile *File = nullptr;
-  StringRef Name;
-  codeview::DebugSubsectionArray Subsections;
-  std::shared_ptr<ModuleDebugStreamRef> DebugStream;
-  codeview::StringsAndChecksumsRef SC;
-  StringMap<codeview::FileChecksumEntry> ChecksumsByFile;
-};
-
-class SymbolGroupIterator
-    : public iterator_facade_base<SymbolGroupIterator,
-                                  std::forward_iterator_tag, SymbolGroup> {
-public:
-  SymbolGroupIterator();
-  explicit SymbolGroupIterator(InputFile &File);
-  SymbolGroupIterator(const SymbolGroupIterator &Other) = default;
-  SymbolGroupIterator &operator=(const SymbolGroupIterator &R) = default;
-
-  const SymbolGroup &operator*() const;
-  SymbolGroup &operator*();
-
-  bool operator==(const SymbolGroupIterator &R) const;
-  SymbolGroupIterator &operator++();
-
-private:
-  void scanToNextDebugS();
-  bool isEnd() const;
-
-  uint32_t Index = 0;
-  Optional<object::section_iterator> SectionIter;
-  SymbolGroup Value;
-};
-
-} // namespace pdb
-} // namespace llvm
-
-#endif
diff --git a/llvm/tools/llvm-pdbutil/LinePrinter.cpp b/llvm/tools/llvm-pdbutil/LinePrinter.cpp
deleted file mode 100644
index dd6ca5bf41b1..000000000000
--- a/llvm/tools/llvm-pdbutil/LinePrinter.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-//===- LinePrinter.cpp ------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LinePrinter.h"
-
-#include "llvm-pdbutil.h"
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/UDTLayout.h"
-#include "llvm/Support/BinaryStreamReader.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormatAdapters.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Regex.h"
-
-#include <algorithm>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-namespace {
-bool IsItemExcluded(llvm::StringRef Item,
-                    std::list<llvm::Regex> &IncludeFilters,
-                    std::list<llvm::Regex> &ExcludeFilters) {
-  if (Item.empty())
-    return false;
-
-  auto match_pred = [Item](llvm::Regex &R) { return R.match(Item); };
-
-  // Include takes priority over exclude. If the user specified include
-  // filters, and none of them include this item, the item is gone.
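// Concretely, with IncludeFilters = {"^foo"} and ExcludeFilters = {"bar"}:
//   "foobar" -> excluded (an exclude pattern matches, even though included)
//   "foobaz" -> kept     (an include pattern matches, no exclude matches)
//   "quux"   -> excluded (include filters exist but none of them match)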
-  if (!IncludeFilters.empty() && !any_of(IncludeFilters, match_pred))
-    return true;
-
-  if (any_of(ExcludeFilters, match_pred))
-    return true;
-
-  return false;
-}
-}
-
-using namespace llvm;
-
-LinePrinter::LinePrinter(int Indent, bool UseColor, llvm::raw_ostream &Stream)
-    : OS(Stream), IndentSpaces(Indent), CurrentIndent(0), UseColor(UseColor) {
-  SetFilters(ExcludeTypeFilters, opts::pretty::ExcludeTypes.begin(),
-             opts::pretty::ExcludeTypes.end());
-  SetFilters(ExcludeSymbolFilters, opts::pretty::ExcludeSymbols.begin(),
-             opts::pretty::ExcludeSymbols.end());
-  SetFilters(ExcludeCompilandFilters, opts::pretty::ExcludeCompilands.begin(),
-             opts::pretty::ExcludeCompilands.end());
-
-  SetFilters(IncludeTypeFilters, opts::pretty::IncludeTypes.begin(),
-             opts::pretty::IncludeTypes.end());
-  SetFilters(IncludeSymbolFilters, opts::pretty::IncludeSymbols.begin(),
-             opts::pretty::IncludeSymbols.end());
-  SetFilters(IncludeCompilandFilters, opts::pretty::IncludeCompilands.begin(),
-             opts::pretty::IncludeCompilands.end());
-}
-
-void LinePrinter::Indent(uint32_t Amount) {
-  if (Amount == 0)
-    Amount = IndentSpaces;
-  CurrentIndent += Amount;
-}
-
-void LinePrinter::Unindent(uint32_t Amount) {
-  if (Amount == 0)
-    Amount = IndentSpaces;
-  CurrentIndent = std::max<int>(0, CurrentIndent - Amount);
-}
-
-void LinePrinter::NewLine() {
-  OS << "\n";
-  OS.indent(CurrentIndent);
-}
-
-void LinePrinter::print(const Twine &T) { OS << T; }
-
-void LinePrinter::printLine(const Twine &T) {
-  NewLine();
-  OS << T;
-}
-
-bool LinePrinter::IsClassExcluded(const ClassLayout &Class) {
-  if (IsTypeExcluded(Class.getName(), Class.getSize()))
-    return true;
-  if (Class.deepPaddingSize() < opts::pretty::PaddingThreshold)
-    return true;
-  return false;
-}
-
-void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
-                               uint64_t StartOffset) {
-  NewLine();
-  OS << Label << " (";
-  if (!Data.empty()) {
-    OS << "\n";
-    OS << format_bytes_with_ascii(Data, StartOffset, 32, 4,
-                                  CurrentIndent + IndentSpaces, true);
-    NewLine();
-  }
-  OS << ")";
-}
-
-void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
-                               uint64_t Base, uint64_t StartOffset) {
-  NewLine();
-  OS << Label << " (";
-  if (!Data.empty()) {
-    OS << "\n";
-    Base += StartOffset;
-    OS << format_bytes_with_ascii(Data, Base, 32, 4,
-                                  CurrentIndent + IndentSpaces, true);
-    NewLine();
-  }
-  OS << ")";
-}
-
-namespace {
-struct Run {
-  Run() = default;
-  explicit Run(uint32_t Block) : Block(Block) {}
-  uint32_t Block = 0;
-  uint64_t ByteLen = 0;
-};
-} // namespace
-
-static std::vector<Run> computeBlockRuns(uint32_t BlockSize,
-                                         const msf::MSFStreamLayout &Layout) {
-  std::vector<Run> Runs;
-  if (Layout.Length == 0)
-    return Runs;
-
-  ArrayRef<support::ulittle32_t> Blocks = Layout.Blocks;
-  assert(!Blocks.empty());
-  uint64_t StreamBytesRemaining = Layout.Length;
-  uint32_t CurrentBlock = Blocks[0];
-  Runs.emplace_back(CurrentBlock);
-  while (!Blocks.empty()) {
-    Run *CurrentRun = &Runs.back();
-    uint32_t NextBlock = Blocks.front();
-    if (NextBlock < CurrentBlock || (NextBlock - CurrentBlock > 1)) {
-      Runs.emplace_back(NextBlock);
-      CurrentRun = &Runs.back();
-    }
-    uint64_t Used =
-        std::min(static_cast<uint64_t>(BlockSize), StreamBytesRemaining);
-    CurrentRun->ByteLen += Used;
-    StreamBytesRemaining -= Used;
-    CurrentBlock = NextBlock;
-    Blocks = Blocks.drop_front();
-  }
-  return Runs;
-}
-
-static std::pair<Run, uint64_t> findRun(uint64_t Offset, ArrayRef<Run> Runs) {
-  for (const auto &R : Runs) {
-    if (Offset < R.ByteLen)
-      return std::make_pair(R, Offset);
-    Offset -= R.ByteLen;
-  }
-  llvm_unreachable("Invalid offset!");
-}
offset!"); -} - -void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File, - uint32_t StreamIdx, - StringRef StreamPurpose, uint64_t Offset, - uint64_t Size) { - if (StreamIdx >= File.getNumStreams()) { - formatLine("Stream {0}: Not present", StreamIdx); - return; - } - if (Size + Offset > File.getStreamByteSize(StreamIdx)) { - formatLine( - "Stream {0}: Invalid offset and size, range out of stream bounds", - StreamIdx); - return; - } - - auto S = File.createIndexedStream(StreamIdx); - if (!S) { - NewLine(); - formatLine("Stream {0}: Not present", StreamIdx); - return; - } - - uint64_t End = - (Size == 0) ? S->getLength() : std::min(Offset + Size, S->getLength()); - Size = End - Offset; - - formatLine("Stream {0}: {1} (dumping {2:N} / {3:N} bytes)", StreamIdx, - StreamPurpose, Size, S->getLength()); - AutoIndent Indent(*this); - BinaryStreamRef Slice(*S); - BinarySubstreamRef Substream; - Substream.Offset = Offset; - Substream.StreamData = Slice.drop_front(Offset).keep_front(Size); - - auto Layout = File.getStreamLayout(StreamIdx); - formatMsfStreamData(Label, File, Layout, Substream); -} - -void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File, - const msf::MSFStreamLayout &Stream, - BinarySubstreamRef Substream) { - BinaryStreamReader Reader(Substream.StreamData); - - auto Runs = computeBlockRuns(File.getBlockSize(), Stream); - - NewLine(); - OS << Label << " ("; - while (Reader.bytesRemaining() > 0) { - OS << "\n"; - - Run FoundRun; - uint64_t RunOffset; - std::tie(FoundRun, RunOffset) = findRun(Substream.Offset, Runs); - assert(FoundRun.ByteLen >= RunOffset); - uint64_t Len = FoundRun.ByteLen - RunOffset; - Len = std::min(Len, Reader.bytesRemaining()); - uint64_t Base = FoundRun.Block * File.getBlockSize() + RunOffset; - ArrayRef Data; - consumeError(Reader.readBytes(Data, Len)); - OS << format_bytes_with_ascii(Data, Base, 32, 4, - CurrentIndent + IndentSpaces, true); - if (Reader.bytesRemaining() > 0) { - NewLine(); - OS << formatv(" {0}", - fmt_align("", AlignStyle::Center, 114, '-')); - } - Substream.Offset += Len; - } - NewLine(); - OS << ")"; -} - -void LinePrinter::formatMsfStreamBlocks( - PDBFile &File, const msf::MSFStreamLayout &StreamLayout) { - auto Blocks = makeArrayRef(StreamLayout.Blocks); - uint64_t L = StreamLayout.Length; - - while (L > 0) { - NewLine(); - assert(!Blocks.empty()); - OS << formatv("Block {0} (\n", uint32_t(Blocks.front())); - uint64_t UsedBytes = - std::min(L, static_cast(File.getBlockSize())); - ArrayRef BlockData = - cantFail(File.getBlockData(Blocks.front(), File.getBlockSize())); - uint64_t BaseOffset = Blocks.front(); - BaseOffset *= File.getBlockSize(); - OS << format_bytes_with_ascii(BlockData, BaseOffset, 32, 4, - CurrentIndent + IndentSpaces, true); - NewLine(); - OS << ")"; - NewLine(); - L -= UsedBytes; - Blocks = Blocks.drop_front(); - } -} - -bool LinePrinter::IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size) { - if (IsItemExcluded(TypeName, IncludeTypeFilters, ExcludeTypeFilters)) - return true; - if (Size < opts::pretty::SizeThreshold) - return true; - return false; -} - -bool LinePrinter::IsSymbolExcluded(llvm::StringRef SymbolName) { - return IsItemExcluded(SymbolName, IncludeSymbolFilters, ExcludeSymbolFilters); -} - -bool LinePrinter::IsCompilandExcluded(llvm::StringRef CompilandName) { - return IsItemExcluded(CompilandName, IncludeCompilandFilters, - ExcludeCompilandFilters); -} - -WithColor::WithColor(LinePrinter &P, PDB_ColorItem C) - : OS(P.OS), UseColor(P.hasColor()) { - if (UseColor) - 
-
-WithColor::WithColor(LinePrinter &P, PDB_ColorItem C)
-    : OS(P.OS), UseColor(P.hasColor()) {
-  if (UseColor)
-    applyColor(C);
-}
-
-WithColor::~WithColor() {
-  if (UseColor)
-    OS.resetColor();
-}
-
-void WithColor::applyColor(PDB_ColorItem C) {
-  switch (C) {
-  case PDB_ColorItem::None:
-    OS.resetColor();
-    return;
-  case PDB_ColorItem::Comment:
-    OS.changeColor(raw_ostream::GREEN, false);
-    return;
-  case PDB_ColorItem::Address:
-    OS.changeColor(raw_ostream::YELLOW, /*bold=*/true);
-    return;
-  case PDB_ColorItem::Keyword:
-    OS.changeColor(raw_ostream::MAGENTA, true);
-    return;
-  case PDB_ColorItem::Register:
-  case PDB_ColorItem::Offset:
-    OS.changeColor(raw_ostream::YELLOW, false);
-    return;
-  case PDB_ColorItem::Type:
-    OS.changeColor(raw_ostream::CYAN, true);
-    return;
-  case PDB_ColorItem::Identifier:
-    OS.changeColor(raw_ostream::CYAN, false);
-    return;
-  case PDB_ColorItem::Path:
-    OS.changeColor(raw_ostream::CYAN, false);
-    return;
-  case PDB_ColorItem::Padding:
-  case PDB_ColorItem::SectionHeader:
-    OS.changeColor(raw_ostream::RED, true);
-    return;
-  case PDB_ColorItem::LiteralValue:
-    OS.changeColor(raw_ostream::GREEN, true);
-    return;
-  }
-}
diff --git a/llvm/tools/llvm-pdbutil/LinePrinter.h b/llvm/tools/llvm-pdbutil/LinePrinter.h
deleted file mode 100644
index b6bb77280fd5..000000000000
--- a/llvm/tools/llvm-pdbutil/LinePrinter.h
+++ /dev/null
@@ -1,167 +0,0 @@
-//===- LinePrinter.h ------------------------------------------ *- C++ --*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBDUMP_LINEPRINTER_H
-#define LLVM_TOOLS_LLVMPDBDUMP_LINEPRINTER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/BinaryStreamRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <list>
-
-namespace llvm {
-namespace msf {
-class MSFStreamLayout;
-} // namespace msf
-namespace pdb {
-
-class ClassLayout;
-class PDBFile;
-
-class LinePrinter {
-  friend class WithColor;
-
-public:
-  LinePrinter(int Indent, bool UseColor, raw_ostream &Stream);
-
-  void Indent(uint32_t Amount = 0);
-  void Unindent(uint32_t Amount = 0);
-  void NewLine();
-
-  void printLine(const Twine &T);
-  void print(const Twine &T);
-  template <typename... Ts> void formatLine(const char *Fmt, Ts &&... Items) {
-    printLine(formatv(Fmt, std::forward<Ts>(Items)...));
-  }
-  template <typename... Ts> void format(const char *Fmt, Ts &&... Items) {
-    print(formatv(Fmt, std::forward<Ts>(Items)...));
-  }
-
-  void formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
-                    uint64_t StartOffset);
-  void formatBinary(StringRef Label, ArrayRef<uint8_t> Data, uint64_t BaseAddr,
-                    uint64_t StartOffset);
-
-  void formatMsfStreamData(StringRef Label, PDBFile &File, uint32_t StreamIdx,
-                           StringRef StreamPurpose, uint64_t Offset,
-                           uint64_t Size);
-  void formatMsfStreamData(StringRef Label, PDBFile &File,
-                           const msf::MSFStreamLayout &Stream,
-                           BinarySubstreamRef Substream);
-  void formatMsfStreamBlocks(PDBFile &File, const msf::MSFStreamLayout &Stream);
-
-  bool hasColor() const { return UseColor; }
-  raw_ostream &getStream() { return OS; }
-  int getIndentLevel() const { return CurrentIndent; }
-
-  bool IsClassExcluded(const ClassLayout &Class);
-  bool IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size);
-  bool IsSymbolExcluded(llvm::StringRef SymbolName);
-  bool IsCompilandExcluded(llvm::StringRef CompilandName);
-
-private:
-  template <typename Iter>
-  void SetFilters(std::list<Regex> &List, Iter Begin, Iter End) {
-    List.clear();
-    for (; Begin != End; ++Begin)
-      List.emplace_back(StringRef(*Begin));
-  }
-
-  raw_ostream &OS;
-  int IndentSpaces;
-  int CurrentIndent;
-  bool UseColor;
-
-  std::list<Regex> ExcludeCompilandFilters;
-  std::list<Regex> ExcludeTypeFilters;
-  std::list<Regex> ExcludeSymbolFilters;
-
-  std::list<Regex> IncludeCompilandFilters;
-  std::list<Regex> IncludeTypeFilters;
-  std::list<Regex> IncludeSymbolFilters;
-};
-
-struct PrintScope {
-  explicit PrintScope(LinePrinter &P, uint32_t IndentLevel)
-      : P(P), IndentLevel(IndentLevel) {}
-  explicit PrintScope(const PrintScope &Other, uint32_t LabelWidth)
-      : P(Other.P), IndentLevel(Other.IndentLevel), LabelWidth(LabelWidth) {}
-
-  LinePrinter &P;
-  uint32_t IndentLevel;
-  uint32_t LabelWidth = 0;
-};
-
-inline Optional<PrintScope> withLabelWidth(const Optional<PrintScope> &Scope,
-                                           uint32_t W) {
-  if (!Scope)
-    return None;
-  return PrintScope{*Scope, W};
-}
-
-struct AutoIndent {
-  explicit AutoIndent(LinePrinter &L, uint32_t Amount = 0)
-      : L(&L), Amount(Amount) {
-    L.Indent(Amount);
-  }
-  explicit AutoIndent(const Optional<PrintScope> &Scope) {
-    if (Scope.hasValue()) {
-      L = &Scope->P;
-      Amount = Scope->IndentLevel;
-    }
-  }
-  ~AutoIndent() {
-    if (L)
-      L->Unindent(Amount);
-  }
-
-  LinePrinter *L = nullptr;
-  uint32_t Amount = 0;
-};
-
-template <typename T>
-inline raw_ostream &operator<<(LinePrinter &Printer, const T &Item) {
-  return Printer.getStream() << Item;
-}
-
-enum class PDB_ColorItem {
-  None,
-  Address,
-  Type,
-  Comment,
-  Padding,
-  Keyword,
-  Offset,
-  Identifier,
-  Path,
-  SectionHeader,
-  LiteralValue,
-  Register,
-};
-
-class WithColor {
-public:
-  WithColor(LinePrinter &P, PDB_ColorItem C);
-  ~WithColor();
-
-  raw_ostream &get() { return OS; }
-
-private:
-  void applyColor(PDB_ColorItem C);
-  raw_ostream &OS;
-  bool UseColor;
-};
-}
-}
-
-#endif
diff --git a/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
index e6b5d21f36e5..8e17284871a9 100644
--- a/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
@@ -8,17 +8,19 @@
 
 #include "MinimalSymbolDumper.h"
 
-#include "FormatUtil.h"
-#include "InputFile.h"
-#include "LinePrinter.h"
-
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/Formatters.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Native/FormatUtil.h"
+#include "llvm/DebugInfo/PDB/Native/InputFile.h"
"llvm/DebugInfo/PDB/Native/InputFile.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h" +#include "llvm/Object/COFF.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; diff --git a/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp index 08006e9c62d4..be7e487673fb 100644 --- a/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp +++ b/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp @@ -8,8 +8,6 @@ #include "MinimalTypeDumper.h" -#include "FormatUtil.h" -#include "LinePrinter.h" #include "TypeReferenceTracker.h" #include "llvm-pdbutil.h" @@ -19,8 +17,13 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/PDB/Native/FormatUtil.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" +#include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/TpiHashing.h" #include "llvm/DebugInfo/PDB/Native/TpiStream.h" +#include "llvm/Object/COFF.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" diff --git a/llvm/tools/llvm-pdbutil/OutputStyle.h b/llvm/tools/llvm-pdbutil/OutputStyle.h index da93c32053f3..8cc9016d79a2 100644 --- a/llvm/tools/llvm-pdbutil/OutputStyle.h +++ b/llvm/tools/llvm-pdbutil/OutputStyle.h @@ -9,9 +9,10 @@ #ifndef LLVM_TOOLS_LLVMPDBDUMP_OUTPUTSTYLE_H #define LLVM_TOOLS_LLVMPDBDUMP_OUTPUTSTYLE_H -#include "llvm/Support/Error.h" - namespace llvm { + +class Error; + namespace pdb { class OutputStyle { diff --git a/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp index cd01a4004819..895066146a9d 100644 --- a/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "PrettyBuiltinDumper.h" -#include "LinePrinter.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" using namespace llvm; @@ -90,6 +90,8 @@ StringRef BuiltinDumper::getTypeName(const PDBSymbolTypeBuiltin &Symbol) { return "char16_t"; case PDB_BuiltinType::Char32: return "char32_t"; + case PDB_BuiltinType::Char8: + return "char8_t"; case PDB_BuiltinType::None: return "..."; } diff --git a/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp index b7eccac5988c..2285ed16d2a5 100644 --- a/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp @@ -8,13 +8,14 @@ #include "PrettyClassDefinitionDumper.h" -#include "LinePrinter.h" #include "PrettyClassLayoutGraphicalDumper.h" #include "llvm-pdbutil.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/DebugInfo/PDB/UDTLayout.h" diff --git a/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp index a522935e34f1..1ade7f397030 100644 --- 
+++ b/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
@@ -8,7 +8,6 @@
 
 #include "PrettyClassLayoutGraphicalDumper.h"
 
-#include "LinePrinter.h"
 #include "PrettyClassDefinitionDumper.h"
 #include "PrettyEnumDumper.h"
 #include "PrettyFunctionDumper.h"
@@ -17,8 +16,10 @@
 #include "PrettyVariableDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/UDTLayout.h"
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
index cf769ff66472..591bd4f93702 100644
--- a/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
@@ -8,7 +8,6 @@
 
 #include "PrettyCompilandDumper.h"
 
-#include "LinePrinter.h"
 #include "PrettyFunctionDumper.h"
 #include "llvm-pdbutil.h"
 
diff --git a/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
index 9ed5893f252e..64557ff09c72 100644
--- a/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
@@ -8,10 +8,11 @@
 
 #include "PrettyEnumDumper.h"
 
-#include "LinePrinter.h"
 #include "PrettyBuiltinDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
index fede031ec0c0..34436c572c8a 100644
--- a/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
@@ -7,8 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PrettyExternalSymbolDumper.h"
-#include "LinePrinter.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/Native/LinePrinter.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
index b820ca333965..83cf4d918322 100644
--- a/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
@@ -7,16 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "PrettyFunctionDumper.h"
-#include "LinePrinter.h"
 #include "PrettyBuiltinDumper.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/Native/LinePrinter.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
"llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" diff --git a/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp index 2f7a39803ca5..9547d4e4ed35 100644 --- a/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp @@ -8,7 +8,6 @@ #include "PrettyTypeDumper.h" -#include "LinePrinter.h" #include "PrettyBuiltinDumper.h" #include "PrettyClassDefinitionDumper.h" #include "PrettyEnumDumper.h" @@ -16,6 +15,8 @@ #include "PrettyTypedefDumper.h" #include "llvm-pdbutil.h" +#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h" @@ -25,6 +26,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" #include "llvm/DebugInfo/PDB/UDTLayout.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/FormatVariadic.h" diff --git a/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp index ef73a8cdf9c4..197aa07299d1 100644 --- a/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp @@ -8,13 +8,15 @@ #include "PrettyTypedefDumper.h" -#include "LinePrinter.h" #include "PrettyBuiltinDumper.h" #include "PrettyFunctionDumper.h" #include "PrettyTypeDumper.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" diff --git a/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp index 6dd7cc384cc9..e9ac6984356c 100644 --- a/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp @@ -8,21 +8,23 @@ #include "PrettyVariableDumper.h" -#include "LinePrinter.h" #include "PrettyBuiltinDumper.h" #include "PrettyFunctionDumper.h" #include "llvm-pdbutil.h" +#include "llvm/DebugInfo/PDB/IPDBLineNumber.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/Format.h" diff --git a/llvm/tools/llvm-pdbutil/StreamUtil.cpp b/llvm/tools/llvm-pdbutil/StreamUtil.cpp index d0d0a9fbe927..878fb77353fa 100644 --- a/llvm/tools/llvm-pdbutil/StreamUtil.cpp +++ b/llvm/tools/llvm-pdbutil/StreamUtil.cpp @@ -7,13 +7,13 @@ 
 //===----------------------------------------------------------------------===//
 
 #include "StreamUtil.h"
-#include "FormatUtil.h"
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/FormatUtil.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
@@ -95,7 +95,7 @@ void llvm::pdb::discoverStreamPurposes(PDBFile &File,
   }
 
   Streams.resize(StreamCount);
-  for (uint16_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) {
+  for (uint32_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) {
     if (StreamIdx == OldMSFDirectory)
       Streams[StreamIdx] =
           stream(StreamPurpose::Other, "Old MSF Directory", StreamIdx);
diff --git a/llvm/tools/llvm-pdbutil/TypeReferenceTracker.cpp b/llvm/tools/llvm-pdbutil/TypeReferenceTracker.cpp
index f184f02e01ee..d813bc22a93c 100644
--- a/llvm/tools/llvm-pdbutil/TypeReferenceTracker.cpp
+++ b/llvm/tools/llvm-pdbutil/TypeReferenceTracker.cpp
@@ -9,10 +9,12 @@
 
 #include "TypeReferenceTracker.h"
 
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/Object/COFF.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/tools/llvm-pdbutil/TypeReferenceTracker.h b/llvm/tools/llvm-pdbutil/TypeReferenceTracker.h
index 8861731ab6ee..c586f6523c57 100644
--- a/llvm/tools/llvm-pdbutil/TypeReferenceTracker.h
+++ b/llvm/tools/llvm-pdbutil/TypeReferenceTracker.h
@@ -9,14 +9,13 @@
 #ifndef LLVM_TOOLS_LLVMPDBDUMP_TYPEREFERENCETRACKER_H
 #define LLVM_TOOLS_LLVMPDBDUMP_TYPEREFERENCETRACKER_H
 
-#include "InputFile.h"
-
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
+#include "llvm/DebugInfo/PDB/Native/InputFile.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index b152ebd6dccb..3b922a7bea21 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -15,8 +15,6 @@
 #include "BytesOutputStyle.h"
 #include "DumpOutputStyle.h"
 #include "ExplainOutputStyle.h"
-#include "InputFile.h"
-#include "LinePrinter.h"
 #include "OutputStyle.h"
 #include "PrettyClassDefinitionDumper.h"
 #include "PrettyCompilandDumper.h"
@@ -44,14 +42,18 @@
 #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
"llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h" #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h" #include "llvm/DebugInfo/PDB/Native/InfoStream.h" #include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h" +#include "llvm/DebugInfo/PDB/Native/InputFile.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h" @@ -67,6 +69,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" @@ -195,6 +198,8 @@ static cl::opt Typedefs("typedefs", cl::desc("Dump typedefs"), cl::sub(DiaDumpSubcommand)); } // namespace diadump +FilterOptions Filters; + namespace pretty { cl::list InputFilenames(cl::Positional, cl::desc(""), @@ -211,7 +216,7 @@ cl::opt ShowInjectedSourceContent( cl::list WithName( "with-name", cl::desc("Display any symbol or type with the specified exact name"), - cl::cat(TypeCategory), cl::ZeroOrMore, cl::sub(PrettySubcommand)); + cl::cat(TypeCategory), cl::sub(PrettySubcommand)); cl::opt Compilands("compilands", cl::desc("Display compilands"), cl::cat(TypeCategory), cl::sub(PrettySubcommand)); @@ -224,7 +229,7 @@ cl::opt Externals("externals", cl::desc("Dump external symbols"), cl::cat(TypeCategory), cl::sub(PrettySubcommand)); cl::list SymTypes( "sym-types", cl::desc("Type of symbols to dump (default all)"), - cl::cat(TypeCategory), cl::sub(PrettySubcommand), cl::ZeroOrMore, + cl::cat(TypeCategory), cl::sub(PrettySubcommand), cl::values( clEnumValN(SymLevel::Thunks, "thunks", "Display thunk symbols"), clEnumValN(SymLevel::Data, "data", "Display data symbols"), @@ -310,28 +315,31 @@ cl::opt ColorOutput("color-output", cl::desc("Override use of color (default = isatty)"), cl::cat(OtherOptions), cl::sub(PrettySubcommand)); -cl::list ExcludeTypes( - "exclude-types", cl::desc("Exclude types by regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); -cl::list ExcludeSymbols( - "exclude-symbols", cl::desc("Exclude symbols by regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); -cl::list ExcludeCompilands( - "exclude-compilands", cl::desc("Exclude compilands by regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); +cl::list + ExcludeTypes("exclude-types", + cl::desc("Exclude types by regular expression"), + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); +cl::list + ExcludeSymbols("exclude-symbols", + cl::desc("Exclude symbols by regular expression"), + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); +cl::list + ExcludeCompilands("exclude-compilands", + cl::desc("Exclude compilands by regular expression"), + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); cl::list IncludeTypes( "include-types", cl::desc("Include only types which match a regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); cl::list IncludeSymbols( "include-symbols", cl::desc("Include only symbols which match a regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); cl::list IncludeCompilands( 
"include-compilands", cl::desc("Include only compilands those which match a regular expression"), - cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand)); + cl::cat(FilterCategory), cl::sub(PrettySubcommand)); cl::opt SizeThreshold( "min-type-size", cl::desc("Displays only those types which are greater " "than or equal to the specified size."), @@ -384,7 +392,7 @@ cl::opt cl::sub(BytesSubcommand), cl::cat(MsfBytes)); cl::list - DumpStreamData("stream-data", cl::CommaSeparated, cl::ZeroOrMore, + DumpStreamData("stream-data", cl::CommaSeparated, cl::desc("Dump binary data from specified streams. Format " "is SN[:Start][@Size]"), cl::sub(BytesSubcommand), cl::cat(MsfBytes)); @@ -407,14 +415,12 @@ cl::opt TypeServerMap("type-server", cl::desc("Dump type server map"), cl::opt ECData("ec", cl::desc("Dump edit and continue map"), cl::sub(BytesSubcommand), cl::cat(DbiBytes)); -cl::list - TypeIndex("type", - cl::desc("Dump the type record with the given type index"), - cl::ZeroOrMore, cl::CommaSeparated, cl::sub(BytesSubcommand), - cl::cat(TypeCategory)); +cl::list TypeIndex( + "type", cl::desc("Dump the type record with the given type index"), + cl::CommaSeparated, cl::sub(BytesSubcommand), cl::cat(TypeCategory)); cl::list IdIndex("id", cl::desc("Dump the id record with the given type index"), - cl::ZeroOrMore, cl::CommaSeparated, cl::sub(BytesSubcommand), + cl::CommaSeparated, cl::sub(BytesSubcommand), cl::cat(TypeCategory)); cl::opt ModuleIndex( @@ -500,7 +506,7 @@ cl::opt DontResolveForwardRefs( cl::cat(TypeOptions), cl::sub(DumpSubcommand)); cl::list DumpTypeIndex( - "type-index", cl::ZeroOrMore, cl::CommaSeparated, + "type-index", cl::CommaSeparated, cl::desc("only dump types with the specified hexadecimal type index"), cl::cat(TypeOptions), cl::sub(DumpSubcommand)); @@ -516,7 +522,7 @@ cl::opt DumpIdExtras("id-extras", cl::desc("dump id hashes and index offsets"), cl::cat(TypeOptions), cl::sub(DumpSubcommand)); cl::list DumpIdIndex( - "id-index", cl::ZeroOrMore, cl::CommaSeparated, + "id-index", cl::CommaSeparated, cl::desc("only dump ids with the specified hexadecimal type index"), cl::cat(TypeOptions), cl::sub(DumpSubcommand)); @@ -536,7 +542,7 @@ cl::list DumpGlobalNames( "global-name", cl::desc( "With -globals, only dump globals whose name matches the given value"), - cl::cat(SymbolOptions), cl::sub(DumpSubcommand), cl::ZeroOrMore); + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); cl::opt DumpPublics("publics", cl::desc("dump Publics stream data"), cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); cl::opt DumpPublicExtras("public-extras", @@ -557,6 +563,27 @@ cl::opt cl::opt DumpFpo("fpo", cl::desc("dump FPO records"), cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); +cl::opt DumpSymbolOffset( + "symbol-offset", cl::Optional, + cl::desc("only dump symbol record with the specified symbol offset"), + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); +cl::opt DumpParents("show-parents", + cl::desc("dump the symbols record's all parents."), + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); +cl::opt + DumpParentDepth("parent-recurse-depth", cl::Optional, cl::init(-1U), + cl::desc("only recurse to a depth of N when displaying " + "parents of a symbol record."), + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); +cl::opt DumpChildren("show-children", + cl::desc("dump the symbols record's all children."), + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); +cl::opt + DumpChildrenDepth("children-recurse-depth", cl::Optional, cl::init(-1U), + cl::desc("only recurse to a depth 
of N when displaying " + "children of a symbol record."), + cl::cat(SymbolOptions), cl::sub(DumpSubcommand)); + // MODULE & FILE OPTIONS cl::opt DumpModules("modules", cl::desc("dump compiland information"), cl::cat(FileOptions), cl::sub(DumpSubcommand)); @@ -680,7 +707,7 @@ cl::opt DumpModuleFiles("module-files", cl::desc("dump file information"), cl::cat(FileOptions), cl::sub(PdbToYamlSubcommand)); cl::list DumpModuleSubsections( - "subsections", cl::ZeroOrMore, cl::CommaSeparated, + "subsections", cl::CommaSeparated, cl::desc("dump subsections from each module's debug stream"), ChunkValues, cl::cat(FileOptions), cl::sub(PdbToYamlSubcommand)); cl::opt DumpModuleSyms("module-syms", cl::desc("dump module symbols"), @@ -764,7 +791,7 @@ static void yamlToPdb(StringRef Path) { PDBFileBuilder Builder(Allocator); uint32_t BlockSize = 4096; - if (YamlObj.Headers.hasValue()) + if (YamlObj.Headers) BlockSize = YamlObj.Headers->SuperBlock.BlockSize; ExitOnErr(Builder.initialize(BlockSize)); // Add each of the reserved streams. We ignore stream metadata in the @@ -779,7 +806,7 @@ static void yamlToPdb(StringRef Path) { StringsAndChecksums Strings; Strings.setStrings(std::make_shared()); - if (YamlObj.StringTable.hasValue()) { + if (YamlObj.StringTable) { for (auto S : *YamlObj.StringTable) Strings.strings()->insert(S); } @@ -789,7 +816,7 @@ static void yamlToPdb(StringRef Path) { pdb::yaml::PdbTpiStream DefaultTpiStream; pdb::yaml::PdbTpiStream DefaultIpiStream; - const auto &Info = YamlObj.PdbStream.getValueOr(DefaultInfoStream); + const auto &Info = YamlObj.PdbStream.value_or(DefaultInfoStream); auto &InfoBuilder = Builder.getInfoBuilder(); InfoBuilder.setAge(Info.Age); @@ -799,7 +826,7 @@ static void yamlToPdb(StringRef Path) { for (auto F : Info.Features) InfoBuilder.addFeature(F); - const auto &Dbi = YamlObj.DbiStream.getValueOr(DefaultDbiStream); + const auto &Dbi = YamlObj.DbiStream.value_or(DefaultDbiStream); auto &DbiBuilder = Builder.getDbiBuilder(); DbiBuilder.setAge(Dbi.Age); DbiBuilder.setBuildNumber(Dbi.BuildNumber); @@ -814,7 +841,7 @@ static void yamlToPdb(StringRef Path) { for (auto S : MI.SourceFiles) ExitOnErr(DbiBuilder.addModuleSourceFile(ModiBuilder, S)); - if (MI.Modi.hasValue()) { + if (MI.Modi) { const auto &ModiStream = *MI.Modi; for (auto Symbol : ModiStream.Symbols) { ModiBuilder.addSymbol( @@ -834,7 +861,7 @@ static void yamlToPdb(StringRef Path) { } auto &TpiBuilder = Builder.getTpiBuilder(); - const auto &Tpi = YamlObj.TpiStream.getValueOr(DefaultTpiStream); + const auto &Tpi = YamlObj.TpiStream.value_or(DefaultTpiStream); TpiBuilder.setVersionHeader(Tpi.Version); AppendingTypeTableBuilder TS(Allocator); for (const auto &R : Tpi.Records) { @@ -842,7 +869,7 @@ static void yamlToPdb(StringRef Path) { TpiBuilder.addTypeRecord(Type.RecordData, None); } - const auto &Ipi = YamlObj.IpiStream.getValueOr(DefaultIpiStream); + const auto &Ipi = YamlObj.IpiStream.value_or(DefaultIpiStream); auto &IpiBuilder = Builder.getIpiBuilder(); IpiBuilder.setVersionHeader(Ipi.Version); for (const auto &R : Ipi.Records) { @@ -1068,7 +1095,7 @@ static void dumpPretty(StringRef Path) { const bool UseColor = opts::pretty::ColorOutput == cl::BOU_UNSET ? 
Stream.has_colors() : opts::pretty::ColorOutput == cl::BOU_TRUE; - LinePrinter Printer(2, UseColor, Stream); + LinePrinter Printer(2, UseColor, Stream, opts::Filters); auto GlobalScope(Session->getGlobalScope()); if (!GlobalScope) @@ -1506,6 +1533,43 @@ int main(int Argc, const char **Argv) { llvm::sys::InitializeCOMRAII COM(llvm::sys::COMThreadingMode::MultiThreaded); + // Initialize the filters for LinePrinter. + auto propagate = [&](auto &Target, auto &Reference) { + for (std::string &Option : Reference) + Target.push_back(Option); + }; + + propagate(opts::Filters.ExcludeTypes, opts::pretty::ExcludeTypes); + propagate(opts::Filters.ExcludeSymbols, opts::pretty::ExcludeSymbols); + propagate(opts::Filters.ExcludeCompilands, opts::pretty::ExcludeCompilands); + propagate(opts::Filters.IncludeTypes, opts::pretty::IncludeTypes); + propagate(opts::Filters.IncludeSymbols, opts::pretty::IncludeSymbols); + propagate(opts::Filters.IncludeCompilands, opts::pretty::IncludeCompilands); + opts::Filters.PaddingThreshold = opts::pretty::PaddingThreshold; + opts::Filters.SizeThreshold = opts::pretty::SizeThreshold; + opts::Filters.JustMyCode = opts::dump::JustMyCode; + if (opts::dump::DumpModi.getNumOccurrences() > 0) { + if (opts::dump::DumpModi.getNumOccurrences() != 1) { + errs() << "argument '-modi' specified more than once.\n"; + errs().flush(); + exit(1); + } + opts::Filters.DumpModi = opts::dump::DumpModi; + } + if (opts::dump::DumpSymbolOffset) { + if (opts::dump::DumpModi.getNumOccurrences() != 1) { + errs() + << "need to specify argument '-modi' when using '-symbol-offset'.\n"; + errs().flush(); + exit(1); + } + opts::Filters.SymbolOffset = opts::dump::DumpSymbolOffset; + if (opts::dump::DumpParents) + opts::Filters.ParentRecurseDepth = opts::dump::DumpParentDepth; + if (opts::dump::DumpChildren) + opts::Filters.ChildrenRecurseDepth = opts::dump::DumpChildrenDepth; + } + if (opts::PdbToYamlSubcommand) { pdb2Yaml(opts::pdb2yaml::InputFilename.front()); } else if (opts::YamlToPdbSubcommand) { @@ -1544,14 +1608,14 @@ int main(int Argc, const char **Argv) { // it needs to be escaped again in the C++. So matching a single \ in the // input requires 4 \es in the C++.
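
The hunk above funnels the pretty subcommand's include/exclude lists into the shared opts::Filters object through one small generic lambda, so LinePrinter no longer reaches into subcommand-specific flags. A minimal standalone sketch of that propagation pattern follows; the FilterOptions struct and the option values here are illustrative stand-ins, not the real llvm-pdbutil definitions.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the shared filter state a printer consumes.
struct FilterOptions {
  std::vector<std::string> ExcludeTypes;
  std::vector<std::string> IncludeTypes;
};

int main() {
  // Stand-ins for per-subcommand cl::list option storage.
  std::vector<std::string> PrettyExcludeTypes = {"__vc_attributes"};
  std::vector<std::string> PrettyIncludeTypes = {"Foo.*"};

  FilterOptions Filters;

  // Generic lambda: copies any source list into any destination list,
  // mirroring the propagate helper in the hunk above.
  auto propagate = [](auto &Target, const auto &Reference) {
    for (const std::string &Option : Reference)
      Target.push_back(Option);
  };

  propagate(Filters.ExcludeTypes, PrettyExcludeTypes);
  propagate(Filters.IncludeTypes, PrettyIncludeTypes);

  for (const std::string &S : Filters.ExcludeTypes)
    std::cout << "exclude: " << S << "\n";
}

Because the lambda deduces both container types, the same helper covers every source/destination pair without a family of overloads.
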
if (opts::pretty::ExcludeCompilerGenerated) { - opts::pretty::ExcludeTypes.push_back("__vc_attributes"); - opts::pretty::ExcludeCompilands.push_back("\\* Linker \\*"); + opts::Filters.ExcludeTypes.push_back("__vc_attributes"); + opts::Filters.ExcludeCompilands.push_back("\\* Linker \\*"); } if (opts::pretty::ExcludeSystemLibraries) { - opts::pretty::ExcludeCompilands.push_back( + opts::Filters.ExcludeCompilands.push_back( "f:\\\\binaries\\\\Intermediate\\\\vctools\\\\crt_bld"); - opts::pretty::ExcludeCompilands.push_back("f:\\\\dd\\\\vctools\\\\crt"); - opts::pretty::ExcludeCompilands.push_back( + opts::Filters.ExcludeCompilands.push_back("f:\\\\dd\\\\vctools\\\\crt"); + opts::Filters.ExcludeCompilands.push_back( "d:\\\\th.obj.x86fre\\\\minkernel"); } llvm::for_each(opts::pretty::InputFilenames, dumpPretty); diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.h b/llvm/tools/llvm-pdbutil/llvm-pdbutil.h index 9fe92c2c9d75..455fe5f28191 100644 --- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.h +++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.h @@ -12,6 +12,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/DebugInfo/PDB/Native/LinePrinter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -50,6 +51,8 @@ enum class ModuleSubsection { All }; +extern FilterOptions Filters; + namespace pretty { enum class ClassDefinitionFormat { None, Layout, All }; diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 6000460d3c23..9c6586483ef0 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -19,6 +19,7 @@ #include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/RawMemProfReader.h" #include "llvm/ProfileData/SampleProfReader.h" @@ -37,6 +38,7 @@ #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include +#include using namespace llvm; @@ -89,6 +91,7 @@ static void exitWithError(Error E, StringRef Whence = "") { } exitWithError(IPE.message(), std::string(Whence), std::string(Hint)); }); + return; } exitWithError(toString(std::move(E)), std::string(Whence)); @@ -237,7 +240,7 @@ static void overlapInput(const std::string &BaseFilename, /// Load an input into a writer context. static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, const InstrProfCorrelator *Correlator, - WriterContext *WC) { + const StringRef ProfiledBinary, WriterContext *WC) { std::unique_lock CtxGuard{WC->Lock}; // Copy the filename, because llvm::ThreadPool copied the input "const @@ -245,6 +248,48 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // invalid outside of this packaged task. std::string Filename = Input.Filename; + using ::llvm::memprof::RawMemProfReader; + if (RawMemProfReader::hasFormat(Input.Filename)) { + auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary); + if (!ReaderOrErr) { + exitWithError(ReaderOrErr.takeError(), Input.Filename); + } + std::unique_ptr Reader = std::move(ReaderOrErr.get()); + // Check if the profile types can be merged, e.g. clang frontend profiles + // should not be merged with memprof profiles. 
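
Just above, loadInput() begins by probing the input with RawMemProfReader::hasFormat() and only falls back to the generic instrumented-profile reader when that probe fails; the hunk that follows then refuses to merge incompatible profile kinds. Below is a self-contained sketch of the hasFormat-style dispatch, under the assumption (invented for illustration, and ignoring endianness for brevity) that the raw format is recognized by a fixed 8-byte magic prefix.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// Illustrative magic value; the real raw memprof format defines its own.
constexpr uint64_t kRawMagic = 0x6d656d70726f6621ULL; // "memprof!"

// hasFormat-style probe: peek at the header without committing to a reader.
bool hasRawFormat(const std::string &Path) {
  std::ifstream In(Path, std::ios::binary);
  uint64_t Magic = 0;
  if (!In.read(reinterpret_cast<char *>(&Magic), sizeof(Magic)))
    return false; // Too short to be a raw profile.
  return Magic == kRawMagic;
}

int main(int argc, char **argv) {
  if (argc < 2)
    return 1;
  // Dispatch mirrors loadInput(): try the specialized reader first,
  // fall back to the generic instrumented-profile reader otherwise.
  if (hasRawFormat(argv[1]))
    std::cout << "raw memprof input\n";
  else
    std::cout << "generic instrprof input\n";
}
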
+ if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Cannot merge MemProf profile with Clang generated profile.", + std::error_code()), + Filename); + return; + } + + auto MemProfError = [&](Error E) { + instrprof_error IPE = InstrProfError::take(std::move(E)); + WC->Errors.emplace_back(make_error(IPE), Filename); + }; + + // Add the frame mappings into the writer context. + const auto &IdToFrame = Reader->getFrameMapping(); + for (const auto &I : IdToFrame) { + bool Succeeded = WC->Writer.addMemProfFrame( + /*Id=*/I.first, /*Frame=*/I.getSecond(), MemProfError); + // If we weren't able to add the frame mappings then it doesn't make sense + // to try to add the records from this profile. + if (!Succeeded) + return; + } + const auto &FunctionProfileData = Reader->getProfileData(); + // Add the memprof records into the writer context. + for (const auto &I : FunctionProfileData) { + WC->Writer.addMemProfRecord(/*Id=*/I.first, /*Record=*/I.second); + } + return; + } + auto ReaderOrErr = InstrProfReader::create(Input.Filename, Correlator); if (Error E = ReaderOrErr.takeError()) { // Skip the empty profiles by returning silently. @@ -330,7 +375,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse, - unsigned NumThreads, FailureMode FailMode) { + unsigned NumThreads, FailureMode FailMode, + const StringRef ProfiledBinary) { if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary && OutputFormat != PF_Ext_Binary && OutputFormat != PF_Text) exitWithError("unknown format is specified"); @@ -363,14 +409,15 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, if (NumThreads == 1) { for (const auto &Input : Inputs) - loadInput(Input, Remapper, Correlator.get(), Contexts[0].get()); + loadInput(Input, Remapper, Correlator.get(), ProfiledBinary, + Contexts[0].get()); } else { ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps).
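
The loop that follows implements the comment above: inputs are handed to the thread pool round-robin, so each writer context absorbs roughly N/NumThreads inputs serially while its lock keeps concurrent merges safe. A runnable sketch of the same distribution pattern, with plain std::async standing in for llvm::ThreadPool:

#include <future>
#include <iostream>
#include <mutex>
#include <vector>

// Stand-in for WriterContext: one lock-protected accumulator per slot.
struct Context {
  std::mutex Lock;
  long Sum = 0;
};

void loadOne(int Input, Context *C) {
  std::lock_guard<std::mutex> Guard(C->Lock); // Serializes writers per context.
  C->Sum += Input;
}

int main() {
  const unsigned NumThreads = 4;
  std::vector<int> Inputs = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

  std::vector<Context> Contexts(NumThreads);
  std::vector<std::future<void>> Tasks;

  // Round-robin assignment: input i lands in context i % NumThreads, so each
  // context sees about N/NumThreads serial steps, as the comment above notes.
  unsigned Ctx = 0;
  for (int Input : Inputs) {
    Tasks.push_back(
        std::async(std::launch::async, loadOne, Input, &Contexts[Ctx]));
    Ctx = (Ctx + 1) % NumThreads;
  }
  for (auto &T : Tasks)
    T.wait();

  long Total = 0;
  for (Context &C : Contexts)
    Total += C.Sum;
  std::cout << "merged total: " << Total << "\n"; // 55
}
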
unsigned Ctx = 0; for (const auto &Input : Inputs) { - Pool.async(loadInput, Input, Remapper, Correlator.get(), + Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary, Contexts[Ctx].get()); Ctx = (Ctx + 1) % NumThreads; } @@ -587,7 +634,7 @@ static void supplementInstrProfile( SmallSet WriterErrorCodes; auto WC = std::make_unique(OutputSparse, ErrorLock, WriterErrorCodes); - loadInput(Inputs[0], nullptr, nullptr, WC.get()); + loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get()); if (WC->Errors.size() > 0) exitWithError(std::move(WC->Errors[0].first), InstrFilename); @@ -708,7 +755,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, LLVMContext Context; sampleprof::ProfileSymbolList WriterList; Optional ProfileIsProbeBased; - Optional ProfileIsCSFlat; + Optional ProfileIsCS; for (const auto &Input : Inputs) { auto ReaderOrErr = SampleProfileReader::create(Input.Filename, Context, FSDiscriminatorPassOption); @@ -730,15 +777,14 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, } SampleProfileMap &Profiles = Reader->getProfiles(); - if (ProfileIsProbeBased.hasValue() && + if (ProfileIsProbeBased && ProfileIsProbeBased != FunctionSamples::ProfileIsProbeBased) exitWithError( "cannot merge probe-based profile with non-probe-based profile"); ProfileIsProbeBased = FunctionSamples::ProfileIsProbeBased; - if (ProfileIsCSFlat.hasValue() && - ProfileIsCSFlat != FunctionSamples::ProfileIsCSFlat) + if (ProfileIsCS && ProfileIsCS != FunctionSamples::ProfileIsCS) exitWithError("cannot merge CS profile with non-CS profile"); - ProfileIsCSFlat = FunctionSamples::ProfileIsCSFlat; + ProfileIsCS = FunctionSamples::ProfileIsCS; for (SampleProfileMap::iterator I = Profiles.begin(), E = Profiles.end(); I != E; ++I) { sampleprof_error Result = sampleprof_error::success; @@ -761,7 +807,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, WriterList.merge(*ReaderList); } - if (ProfileIsCSFlat && (SampleMergeColdContext || SampleTrimColdContext)) { + if (ProfileIsCS && (SampleMergeColdContext || SampleTrimColdContext)) { // Use threshold calculated from profile summary unless specified. SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); auto Summary = Builder.computeSummaryForProfiles(ProfileMap); @@ -776,10 +822,10 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, SampleMergeColdContext, SampleColdContextFrameDepth, false); } - if (ProfileIsCSFlat && GenCSNestedProfile) { + if (ProfileIsCS && GenCSNestedProfile) { CSProfileConverter CSConverter(ProfileMap); CSConverter.convertProfiles(); - ProfileIsCSFlat = FunctionSamples::ProfileIsCSFlat = false; + ProfileIsCS = FunctionSamples::ProfileIsCS = false; } auto WriterOrErr = @@ -933,7 +979,7 @@ static int merge_main(int argc, const char *argv[]) { cl::desc( "Trim context sample profiles whose count is below cold threshold")); cl::opt SampleColdContextFrameDepth( - "sample-frame-depth-for-cold-context", cl::init(1), cl::ZeroOrMore, + "sample-frame-depth-for-cold-context", cl::init(1), cl::desc("Keep the last K frames while merging cold profile. 
1 means the " "context-less base profile")); cl::opt GenPartialProfile( @@ -949,7 +995,7 @@ static int merge_main(int argc, const char *argv[]) { "zero-counter-threshold", cl::init(0.7), cl::Hidden, cl::desc("For the function which is cold in instr profile but hot in " "sample profile, if the ratio of the number of zero counters " - "divided by the the total number of counters is above the " + "divided by the total number of counters is above the " "threshold, the profile of the function will be regarded as " "being harmful for performance and will be dropped.")); cl::opt SupplMinSizeThreshold( @@ -967,6 +1013,9 @@ static int merge_main(int argc, const char *argv[]) { cl::opt DebugInfoFilename( "debug-info", cl::init(""), cl::desc("Use the provided debug info to correlate the raw profile.")); + cl::opt ProfiledBinary( + "profiled-binary", cl::init(""), + cl::desc("Path to binary from which the profile was collected.")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -1009,7 +1058,7 @@ static int merge_main(int argc, const char *argv[]) { if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, DebugInfoFilename, Remapper.get(), OutputFilename, OutputFormat, OutputSparse, NumThreads, - FailureMode); + FailureMode, ProfiledBinary); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, OutputFormat, ProfileSymbolListFile, CompressAllSections, @@ -1040,7 +1089,7 @@ static void overlapInstrProfile(const std::string &BaseFilename, OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n"; exit(0); } - loadInput(WeightedInput, nullptr, nullptr, &Context); + loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context); overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS, IsCS); Overlap.dump(OS); @@ -1936,7 +1985,7 @@ std::error_code SampleOverlapAggregator::loadProfiles() { if (BaseReader->profileIsProbeBased() != TestReader->profileIsProbeBased()) exitWithError( "cannot compare probe-based profile with non-probe-based profile"); - if (BaseReader->profileIsCSFlat() != TestReader->profileIsCSFlat()) + if (BaseReader->profileIsCS() != TestReader->profileIsCS()) exitWithError("cannot compare CS profile with non-CS profile"); // Load BaseHotThreshold and TestHotThreshold as 99-percentile threshold in @@ -2097,7 +2146,7 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts, auto ReaderOrErr = InstrProfReader::create(Filename); std::vector Cutoffs = std::move(DetailedSummaryCutoffs); if (ShowDetailedSummary && Cutoffs.empty()) { - Cutoffs = {800000, 900000, 950000, 990000, 999000, 999900, 999990}; + Cutoffs = ProfileSummaryBuilder::DefaultCutoffs; } InstrProfSummaryBuilder Builder(std::move(Cutoffs)); if (Error E = ReaderOrErr.takeError()) @@ -2480,14 +2529,21 @@ static int showSampleProfile(const std::string &Filename, bool ShowCounts, return 0; } -static int showMemProfProfile(const std::string &Filename, raw_fd_ostream &OS) { - auto ReaderOr = llvm::memprof::RawMemProfReader::create(Filename); +static int showMemProfProfile(const std::string &Filename, + const std::string &ProfiledBinary, + raw_fd_ostream &OS) { + auto ReaderOr = llvm::memprof::RawMemProfReader::create( + Filename, ProfiledBinary, /*KeepNames=*/true); if (Error E = ReaderOr.takeError()) - exitWithError(std::move(E), Filename); + // Since the error can be related to the profile or the binary we do not + // pass whence. Instead additional context is provided where necessary in + // the error message. 
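
The memprof show path below deliberately calls exitWithError() with an empty whence: the reader's errors already carry their own file and binary context, so prepending the filename would only duplicate it. The following toy Expected-style carrier illustrates that convention; it is a hand-rolled stand-in made up for this example, not the interface of llvm::Expected.

#include <iostream>
#include <string>
#include <variant>

// Minimal Expected-style carrier: either a value or an error message that
// already contains its own context, so callers need not prepend a "whence".
template <typename T> class Expected {
  std::variant<T, std::string> Storage;

public:
  Expected(T Value) : Storage(std::move(Value)) {}
  Expected(std::string Err) : Storage(std::move(Err)) {}
  explicit operator bool() const { return Storage.index() == 0; }
  T &get() { return std::get<0>(Storage); }
  std::string takeError() { return std::get<1>(Storage); }
};

Expected<int> parseCount(const std::string &Input) {
  if (Input.empty())
    return std::string("empty input while parsing count"); // context built in
  return static_cast<int>(Input.size());
}

int main() {
  auto CountOrErr = parseCount("");
  if (!CountOrErr) {
    // Mirrors showMemProfProfile: no whence prefix, the message is enough.
    std::cerr << "error: " << CountOrErr.takeError() << "\n";
    return 1;
  }
  std::cout << CountOrErr.get() << "\n";
}
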
+ exitWithError(std::move(E), /*Whence*/ ""); std::unique_ptr Reader( ReaderOr.get().release()); - Reader->printSummaries(OS); + + Reader->printYAML(OS); return 0; } @@ -2587,6 +2643,9 @@ static int show_main(int argc, const char *argv[]) { cl::opt ShowCovered( "covered", cl::init(false), cl::desc("Show only the functions that have been executed.")); + cl::opt ProfiledBinary( + "profiled-binary", cl::init(""), + cl::desc("Path to binary from which the profile was collected.")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data summary\n"); @@ -2624,7 +2683,7 @@ static int show_main(int argc, const char *argv[]) { ShowAllFunctions, ShowDetailedSummary, ShowFunction, ShowProfileSymbolList, ShowSectionInfoOnly, ShowHotFuncList, OS); - return showMemProfProfile(Filename, OS); + return showMemProfProfile(Filename, ProfiledBinary, OS); } int main(int argc, const char *argv[]) { diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp index 78be632f2153..b7cbf353c43f 100644 --- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp +++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp @@ -78,10 +78,10 @@ raw_ostream &operator<<(raw_ostream &OS, const ARM::WinEH::ReturnType &RT) { OS << "pop {pc}"; break; case ARM::WinEH::ReturnType::RT_B: - OS << "b target"; + OS << "bx "; break; case ARM::WinEH::ReturnType::RT_BW: - OS << "b.w target"; + OS << "b.w "; break; case ARM::WinEH::ReturnType::RT_NoEpilogue: OS << "(no epilogue)"; @@ -174,26 +174,47 @@ const Decoder::RingEntry Decoder::Ring64[] = { { 0xff, 0xec, 1, &Decoder::opcode_clear_unwound_to_call }, }; -void Decoder::printRegisters(const std::pair &RegisterMask) { - static const char * const GPRRegisterNames[16] = { - "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", - "r11", "ip", "sp", "lr", "pc", - }; +static void printRange(raw_ostream &OS, ListSeparator &LS, unsigned First, + unsigned Last, char Letter) { + if (First == Last) + OS << LS << Letter << First; + else + OS << LS << Letter << First << "-" << Letter << Last; +} - const uint16_t GPRMask = std::get<0>(RegisterMask); - const uint16_t VFPMask = std::get<1>(RegisterMask); +static void printRange(raw_ostream &OS, uint32_t Mask, ListSeparator &LS, + unsigned Start, unsigned End, char Letter) { + int First = -1; + for (unsigned RI = Start; RI <= End; ++RI) { + if (Mask & (1 << RI)) { + if (First < 0) + First = RI; + } else { + if (First >= 0) { + printRange(OS, LS, First, RI - 1, Letter); + First = -1; + } + } + } + if (First >= 0) + printRange(OS, LS, First, End, Letter); +} + +void Decoder::printGPRMask(uint16_t GPRMask) { + OS << '{'; + ListSeparator LS; + printRange(OS, GPRMask, LS, 0, 12, 'r'); + if (GPRMask & (1 << 14)) + OS << LS << "lr"; + if (GPRMask & (1 << 15)) + OS << LS << "pc"; + OS << '}'; +} +void Decoder::printVFPMask(uint32_t VFPMask) { OS << '{'; ListSeparator LS; - for (unsigned RI = 0, RE = 11; RI < RE; ++RI) - if (GPRMask & (1 << RI)) - OS << LS << GPRRegisterNames[RI]; - for (unsigned RI = 0, RE = 32; RI < RE; ++RI) - if (VFPMask & (1 << RI)) - OS << LS << "d" << unsigned(RI); - for (unsigned RI = 11, RE = 16; RI < RE; ++RI) - if (GPRMask & (1 << RI)) - OS << LS << GPRRegisterNames[RI]; + printRange(OS, VFPMask, LS, 0, 31, 'd'); OS << '}'; } @@ -325,7 +346,7 @@ bool Decoder::opcode_10Lxxxxx(const uint8_t *OC, unsigned &Offset, SW.startLine() << format("0x%02x 0x%02x ; %s.w ", OC[Offset + 0], OC[Offset + 1], Prologue ? 
"push" : "pop"); - printRegisters(std::make_pair(RegisterMask, 0)); + printGPRMask(RegisterMask); OS << '\n'; Offset += 2; @@ -346,7 +367,7 @@ bool Decoder::opcode_1100xxxx(const uint8_t *OC, unsigned &Offset, bool Decoder::opcode_11010Lxx(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - unsigned Link = (OC[Offset] & 0x4) >> 3; + unsigned Link = (OC[Offset] & 0x4) >> 2; unsigned Count = (OC[Offset] & 0x3); uint16_t GPRMask = (Link << (Prologue ? 14 : 15)) @@ -354,7 +375,7 @@ bool Decoder::opcode_11010Lxx(const uint8_t *OC, unsigned &Offset, SW.startLine() << format("0x%02x ; %s ", OC[Offset], Prologue ? "push" : "pop"); - printRegisters(std::make_pair(GPRMask, 0)); + printGPRMask(GPRMask); OS << '\n'; ++Offset; @@ -371,7 +392,7 @@ bool Decoder::opcode_11011Lxx(const uint8_t *OC, unsigned &Offset, SW.startLine() << format("0x%02x ; %s.w ", OC[Offset], Prologue ? "push" : "pop"); - printRegisters(std::make_pair(GPRMask, 0)); + printGPRMask(GPRMask); OS << '\n'; ++Offset; @@ -385,7 +406,7 @@ bool Decoder::opcode_11100xxx(const uint8_t *OC, unsigned &Offset, SW.startLine() << format("0x%02x ; %s ", OC[Offset], Prologue ? "vpush" : "vpop"); - printRegisters(std::make_pair(0, VFPMask)); + printVFPMask(VFPMask); OS << '\n'; ++Offset; @@ -407,12 +428,12 @@ bool Decoder::opcode_111010xx(const uint8_t *OC, unsigned &Offset, bool Decoder::opcode_1110110L(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - uint8_t GPRMask = ((OC[Offset + 0] & 0x01) << (Prologue ? 14 : 15)) - | ((OC[Offset + 1] & 0xff) << 0); + uint16_t GPRMask = ((OC[Offset + 0] & 0x01) << (Prologue ? 14 : 15)) + | ((OC[Offset + 1] & 0xff) << 0); SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0], OC[Offset + 1], Prologue ? "push" : "pop"); - printRegisters(std::make_pair(GPRMask, 0)); + printGPRMask(GPRMask); OS << '\n'; Offset += 2; @@ -437,11 +458,13 @@ bool Decoder::opcode_11101110(const uint8_t *OC, unsigned &Offset, bool Decoder::opcode_11101111(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - assert(!Prologue && "may not be used in prologue"); - if (OC[Offset + 1] & 0xf0) SW.startLine() << format("0x%02x 0x%02x ; reserved\n", OC[Offset + 0], OC[Offset + 1]); + else if (Prologue) + SW.startLine() + << format("0x%02x 0x%02x ; str.w lr, [sp, #-%u]!\n", + OC[Offset + 0], OC[Offset + 1], OC[Offset + 1] << 2); else SW.startLine() << format("0x%02x 0x%02x ; ldr.w lr, [sp], #%u\n", @@ -455,11 +478,11 @@ bool Decoder::opcode_11110101(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { unsigned Start = (OC[Offset + 1] & 0xf0) >> 4; unsigned End = (OC[Offset + 1] & 0x0f) >> 0; - uint32_t VFPMask = ((1 << (End - Start)) - 1) << Start; + uint32_t VFPMask = ((1 << (End + 1 - Start)) - 1) << Start; SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0], OC[Offset + 1], Prologue ? "vpush" : "vpop"); - printRegisters(std::make_pair(0, VFPMask)); + printVFPMask(VFPMask); OS << '\n'; Offset += 2; @@ -470,11 +493,11 @@ bool Decoder::opcode_11110110(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { unsigned Start = (OC[Offset + 1] & 0xf0) >> 4; unsigned End = (OC[Offset + 1] & 0x0f) >> 0; - uint32_t VFPMask = ((1 << (End - Start)) - 1) << 16; + uint32_t VFPMask = ((1 << (End + 1 - Start)) - 1) << (16 + Start); SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0], OC[Offset + 1], Prologue ? 
"vpush" : "vpop"); - printRegisters(std::make_pair(0, VFPMask)); + printVFPMask(VFPMask); OS << '\n'; Offset += 2; @@ -553,14 +576,14 @@ bool Decoder::opcode_11111100(const uint8_t *OC, unsigned &Offset, bool Decoder::opcode_11111101(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - SW.startLine() << format("0x%02x ; b\n", OC[Offset]); + SW.startLine() << format("0x%02x ; bx \n", OC[Offset]); ++Offset; return true; } bool Decoder::opcode_11111110(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - SW.startLine() << format("0x%02x ; b.w\n", OC[Offset]); + SW.startLine() << format("0x%02x ; b.w \n", OC[Offset]); ++Offset; return true; } @@ -948,7 +971,7 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF, if (XData.E()) { ArrayRef UC = XData.UnwindByteCode(); - if (isAArch64 || !XData.F()) { + { ListScope PS(SW, "Prologue"); decodeOpcodes(UC, 0, /*Prologue=*/true); } @@ -971,8 +994,9 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF, SW.printNumber("EpilogueStartIndex", isAArch64 ? ES.EpilogueStartIndexAArch64() : ES.EpilogueStartIndexARM()); - if (ES.ES & ~0xffc3ffff) - SW.printNumber("ReservedBits", (ES.ES >> 18) & 0xF); + unsigned ReservedMask = isAArch64 ? 0xF : 0x3; + if ((ES.ES >> 18) & ReservedMask) + SW.printNumber("ReservedBits", (ES.ES >> 18) & ReservedMask); ListScope Opcodes(SW, "Opcodes"); decodeOpcodes(XData.UnwindByteCode(), @@ -1110,17 +1134,75 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF, SW.printString("Function", formatSymbol(FunctionName, FunctionAddress, FunctionOffset)); - if (!isAArch64) - SW.printBoolean("Fragment", - RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment); + SW.printBoolean("Fragment", + RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment); SW.printNumber("FunctionLength", RF.FunctionLength()); SW.startLine() << "ReturnType: " << RF.Ret() << '\n'; SW.printBoolean("HomedParameters", RF.H()); - SW.startLine() << "SavedRegisters: "; - printRegisters(SavedRegisterMask(RF)); - OS << '\n'; + SW.printNumber("Reg", RF.Reg()); + SW.printNumber("R", RF.R()); + SW.printBoolean("LinkRegister", RF.L()); + SW.printBoolean("Chaining", RF.C()); SW.printNumber("StackAdjustment", StackAdjustment(RF) << 2); + { + ListScope PS(SW, "Prologue"); + + uint16_t GPRMask, VFPMask; + std::tie(GPRMask, VFPMask) = SavedRegisterMask(RF, /*Prologue=*/true); + + if (StackAdjustment(RF) && !PrologueFolding(RF)) + SW.startLine() << "sub sp, sp, #" << StackAdjustment(RF) * 4 << "\n"; + if (VFPMask) { + SW.startLine() << "vpush "; + printVFPMask(VFPMask); + OS << "\n"; + } + if (RF.C()) { + // Count the number of registers pushed below R11 + int FpOffset = 4 * countPopulation(GPRMask & ((1U << 11) - 1)); + if (FpOffset) + SW.startLine() << "add.w r11, sp, #" << FpOffset << "\n"; + else + SW.startLine() << "mov r11, sp\n"; + } + if (GPRMask) { + SW.startLine() << "push "; + printGPRMask(GPRMask); + OS << "\n"; + } + if (RF.H()) + SW.startLine() << "push {r0-r3}\n"; + } + + if (RF.Ret() != ReturnType::RT_NoEpilogue) { + ListScope PS(SW, "Epilogue"); + + uint16_t GPRMask, VFPMask; + std::tie(GPRMask, VFPMask) = SavedRegisterMask(RF, /*Prologue=*/false); + + if (StackAdjustment(RF) && !EpilogueFolding(RF)) + SW.startLine() << "add sp, sp, #" << StackAdjustment(RF) * 4 << "\n"; + if (VFPMask) { + SW.startLine() << "vpop "; + printVFPMask(VFPMask); + OS << "\n"; + } + if (GPRMask) { + SW.startLine() << "pop "; + printGPRMask(GPRMask); + OS << "\n"; + } + if (RF.H()) { + if (RF.L() == 0 || RF.Ret() != 
ReturnType::RT_POP) + SW.startLine() << "add sp, sp, #16\n"; + else + SW.startLine() << "ldr pc, [sp], #20\n"; + } + if (RF.Ret() != ReturnType::RT_POP) + SW.startLine() << RF.Ret() << '\n'; + } + return true; } @@ -1189,11 +1271,11 @@ bool Decoder::dumpPackedARM64Entry(const object::COFFObjectFile &COFF, SW.startLine() << format("sub sp, sp, #%d\n", LocSZ); } if (RF.H()) { - SW.startLine() << format("stp x6, x7, [sp, #%d]\n", IntSZ + FpSZ + 48); - SW.startLine() << format("stp x4, x5, [sp, #%d]\n", IntSZ + FpSZ + 32); - SW.startLine() << format("stp x2, x3, [sp, #%d]\n", IntSZ + FpSZ + 16); + SW.startLine() << format("stp x6, x7, [sp, #%d]\n", SavSZ - 16); + SW.startLine() << format("stp x4, x5, [sp, #%d]\n", SavSZ - 32); + SW.startLine() << format("stp x2, x3, [sp, #%d]\n", SavSZ - 48); if (RF.RegI() > 0 || RF.RegF() > 0 || RF.CR() == 1) { - SW.startLine() << format("stp x0, x1, [sp, #%d]\n", IntSZ + FpSZ); + SW.startLine() << format("stp x0, x1, [sp, #%d]\n", SavSZ - 64); } else { // This case isn't documented; if neither RegI nor RegF nor CR=1 // have decremented the stack pointer by SavSZ, we need to do it here diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h index 920d4e5f7332..ceaa866ff215 100644 --- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h +++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h @@ -133,7 +133,8 @@ class Decoder { void decodeOpcodes(ArrayRef Opcodes, unsigned Offset, bool Prologue); - void printRegisters(const std::pair &RegisterMask); + void printGPRMask(uint16_t Mask); + void printVFPMask(uint32_t Mask); ErrorOr getSectionContaining(const object::COFFObjectFile &COFF, uint64_t Address); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 04a67225401f..ae2dec5d15fb 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1204,6 +1204,7 @@ const EnumEntry ElfMachineType[] = { ENUM_ENT(EM_LANAI, "EM_LANAI"), ENUM_ENT(EM_BPF, "EM_BPF"), ENUM_ENT(EM_VE, "NEC SX-Aurora Vector Engine"), + ENUM_ENT(EM_LOONGARCH, "LoongArch"), }; const EnumEntry ElfSymbolBindings[] = { @@ -1241,10 +1242,17 @@ const EnumEntry ElfSectionFlags[] = { ENUM_ENT(SHF_GROUP, "G"), ENUM_ENT(SHF_TLS, "T"), ENUM_ENT(SHF_COMPRESSED, "C"), - ENUM_ENT(SHF_GNU_RETAIN, "R"), ENUM_ENT(SHF_EXCLUDE, "E"), }; +const EnumEntry ElfGNUSectionFlags[] = { + ENUM_ENT(SHF_GNU_RETAIN, "R") +}; + +const EnumEntry ElfSolarisSectionFlags[] = { + ENUM_ENT(SHF_SUNW_NODISCARD, "R") +}; + const EnumEntry ElfXCoreSectionFlags[] = { ENUM_ENT(XCORE_SHF_CP_SECTION, ""), ENUM_ENT(XCORE_SHF_DP_SECTION, "") @@ -1274,9 +1282,19 @@ const EnumEntry ElfX86_64SectionFlags[] = { }; static std::vector> -getSectionFlagsForTarget(unsigned EMachine) { +getSectionFlagsForTarget(unsigned EOSAbi, unsigned EMachine) { std::vector> Ret(std::begin(ElfSectionFlags), std::end(ElfSectionFlags)); + switch (EOSAbi) { + case ELFOSABI_SOLARIS: + Ret.insert(Ret.end(), std::begin(ElfSolarisSectionFlags), + std::end(ElfSolarisSectionFlags)); + break; + default: + Ret.insert(Ret.end(), std::begin(ElfGNUSectionFlags), + std::end(ElfGNUSectionFlags)); + break; + } switch (EMachine) { case EM_ARM: Ret.insert(Ret.end(), std::begin(ElfARMSectionFlags), @@ -1304,7 +1322,8 @@ getSectionFlagsForTarget(unsigned EMachine) { return Ret; } -static std::string getGNUFlags(unsigned EMachine, uint64_t Flags) { +static std::string getGNUFlags(unsigned EOSAbi, unsigned EMachine, + uint64_t Flags) { // Here we are trying to build the flags string 
in the same way as GNU does. // It is not that straightforward. Imagine we have sh_flags == 0x90000000. // SHF_EXCLUDE ("E") has a value of 0x80000000 and SHF_MASKPROC is 0xf0000000. @@ -1315,7 +1334,7 @@ static std::string getGNUFlags(unsigned EMachine, uint64_t Flags) { bool HasOSFlag = false; bool HasProcFlag = false; std::vector> FlagsList = - getSectionFlagsForTarget(EMachine); + getSectionFlagsForTarget(EOSAbi, EMachine); while (Flags) { // Take the least significant bit as a flag. uint64_t Flag = Flags & -Flags; @@ -1371,6 +1390,8 @@ static StringRef segmentTypeToString(unsigned Arch, unsigned Type) { LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_ABIFLAGS); } break; + case ELF::EM_RISCV: + switch (Type) { LLVM_READOBJ_ENUM_CASE(ELF, PT_RISCV_ATTRIBUTES); } } switch (Type) { @@ -1404,12 +1425,16 @@ static std::string getGNUPtType(unsigned Arch, unsigned Type) { return std::string(": ") + to_string(format_hex(Type, 1)); // E.g. "PT_ARM_EXIDX" -> "EXIDX". - if (Seg.startswith("PT_ARM_")) - return Seg.drop_front(7).str(); + if (Seg.consume_front("PT_ARM_")) + return Seg.str(); // E.g. "PT_MIPS_REGINFO" -> "REGINFO". - if (Seg.startswith("PT_MIPS_")) - return Seg.drop_front(8).str(); + if (Seg.consume_front("PT_MIPS_")) + return Seg.str(); + + // E.g. "PT_RISCV_ATTRIBUTES" + if (Seg.consume_front("PT_RISCV_")) + return Seg.str(); // E.g. "PT_LOAD" -> "LOAD". assert(Seg.startswith("PT_")); @@ -1508,6 +1533,7 @@ const EnumEntry ElfHeaderAMDGPUFlagsABIVersion3[] = { LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90A), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90C), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX940), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1012), @@ -1518,6 +1544,11 @@ const EnumEntry ElfHeaderAMDGPUFlagsABIVersion3[] = { LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1033), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1034), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1035), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1036), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1100), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1101), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1102), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1103), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_XNACK_V3), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_SRAMECC_V3) }; @@ -1562,6 +1593,7 @@ const EnumEntry ElfHeaderAMDGPUFlagsABIVersion4[] = { LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90A), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90C), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX940), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1012), @@ -1572,6 +1604,11 @@ const EnumEntry ElfHeaderAMDGPUFlagsABIVersion4[] = { LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1033), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1034), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1035), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1036), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1100), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1101), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1102), + 
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1103), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_XNACK_ANY_V4), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_XNACK_OFF_V4), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_XNACK_ON_V4), @@ -2265,6 +2302,7 @@ std::string ELFDumper::getDynamicEntry(uint64_t Type, case DT_MIPS_PLTGOT: case DT_MIPS_RWPLT: case DT_MIPS_RLD_MAP_REL: + case DT_MIPS_XHASH: return FormatHexValue(Value); case DT_MIPS_FLAGS: return FormatFlags(Value, makeArrayRef(ElfDynamicDTMipsFlags)); @@ -3277,7 +3315,7 @@ template void GNUELFDumper::printFileHeaders() { OS.PadToColumn(2u); OS << "Version:"; OS.PadToColumn(37u); - OS << to_hexString(e.e_ident[ELF::EI_VERSION]); + OS << utohexstr(e.e_ident[ELF::EI_VERSION]); if (e.e_version == ELF::EV_CURRENT) OS << " (current)"; OS << "\n"; @@ -3290,19 +3328,19 @@ template void GNUELFDumper::printFileHeaders() { Str = E->AltName.str(); } else { if (e.e_type >= ET_LOPROC) - Str = "Processor Specific: (" + to_hexString(e.e_type, false) + ")"; + Str = "Processor Specific: (" + utohexstr(e.e_type, /*LowerCase=*/true) + ")"; else if (e.e_type >= ET_LOOS) - Str = "OS Specific: (" + to_hexString(e.e_type, false) + ")"; + Str = "OS Specific: (" + utohexstr(e.e_type, /*LowerCase=*/true) + ")"; else - Str = ": " + to_hexString(e.e_type, false); + Str = ": " + utohexstr(e.e_type, /*LowerCase=*/true); } printFields(OS, "Type:", Str); Str = enumToString(e.e_machine, makeArrayRef(ElfMachineType)); printFields(OS, "Machine:", Str); - Str = "0x" + to_hexString(e.e_version); + Str = "0x" + utohexstr(e.e_version); printFields(OS, "Version:", Str); - Str = "0x" + to_hexString(e.e_entry); + Str = "0x" + utohexstr(e.e_entry); printFields(OS, "Entry point address:", Str); Str = to_string(e.e_phoff) + " (bytes into file)"; printFields(OS, "Start of program headers:", Str); @@ -3319,7 +3357,7 @@ template void GNUELFDumper::printFileHeaders() { else if (e.e_machine == EM_AVR) ElfFlags = printFlags(e.e_flags, makeArrayRef(ElfHeaderAVRFlags), unsigned(ELF::EF_AVR_ARCH_MASK)); - Str = "0x" + to_hexString(e.e_flags); + Str = "0x" + utohexstr(e.e_flags); if (!ElfFlags.empty()) Str = Str + ", " + ElfFlags; printFields(OS, "Flags:", Str); @@ -3497,7 +3535,7 @@ void GNUELFDumper::printRelRelaReloc(const Relocation &R, Addend = " + "; } } - Addend += to_hexString(RelAddend, false); + Addend += utohexstr(RelAddend, /*LowerCase=*/true); } OS << Addend << "\n"; } @@ -3529,7 +3567,7 @@ void GNUELFDumper::printDynamicRelocHeader(unsigned Type, StringRef Name, const DynRegionInfo &Reg) { uint64_t Offset = Reg.Addr - this->Obj.base(); OS << "\n'" << Name.str().c_str() << "' relocation section at offset 0x" - << to_hexString(Offset, false) << " contains " << Reg.Size << " bytes:\n"; + << utohexstr(Offset, /*LowerCase=*/true) << " contains " << Reg.Size << " bytes:\n"; printRelocHeaderFields(OS, Type); } @@ -3582,7 +3620,7 @@ template void GNUELFDumper::printRelocations() { uintX_t Offset = Sec.sh_offset; StringRef Name = this->getPrintableSectionName(Sec); OS << "\nRelocation section '" << Name << "' at offset 0x" - << to_hexString(Offset, false) << " contains " << EntriesNum + << utohexstr(Offset, /*LowerCase=*/true) << " contains " << EntriesNum << " entries:\n"; printRelocHeaderFields(OS, Sec.sh_type); this->printRelocationsHelper(Sec); @@ -3597,30 +3635,30 @@ template void GNUELFDumper::printRelocations() { // returned as '' followed by the type value. 
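
The getSectionTypeString() cleanup below replaces startswith()+drop_front() pairs with single consume_front() calls, which strip a prefix and report success in one step. A standalone sketch of the idiom using std::string_view; consumeFront here is a hypothetical helper mirroring StringRef::consume_front.

#include <cctype>
#include <iostream>
#include <string>
#include <string_view>

// consume_front-style helper: strips Prefix in place and reports success,
// folding the startswith()+drop_front() pair into one call.
bool consumeFront(std::string_view &S, std::string_view Prefix) {
  if (S.substr(0, Prefix.size()) != Prefix)
    return false;
  S.remove_prefix(Prefix.size());
  return true;
}

std::string sectionTypeString(std::string_view Name) {
  // E.g. "SHT_GNU_verneed" -> "VERNEED", "SHT_PROGBITS" -> "PROGBITS".
  if (consumeFront(Name, "SHT_GNU_")) {
    std::string Upper(Name);
    for (char &C : Upper)
      C = static_cast<char>(std::toupper(static_cast<unsigned char>(C)));
    return Upper;
  }
  consumeFront(Name, "SHT_"); // Generic prefix: strip it if present.
  return std::string(Name);
}

int main() {
  std::cout << sectionTypeString("SHT_GNU_verneed") << "\n"; // VERNEED
  std::cout << sectionTypeString("SHT_PROGBITS") << "\n";    // PROGBITS
}
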
static std::string getSectionTypeOffsetString(unsigned Type) { if (Type >= SHT_LOOS && Type <= SHT_HIOS) - return "LOOS+0x" + to_hexString(Type - SHT_LOOS); + return "LOOS+0x" + utohexstr(Type - SHT_LOOS); else if (Type >= SHT_LOPROC && Type <= SHT_HIPROC) - return "LOPROC+0x" + to_hexString(Type - SHT_LOPROC); + return "LOPROC+0x" + utohexstr(Type - SHT_LOPROC); else if (Type >= SHT_LOUSER && Type <= SHT_HIUSER) - return "LOUSER+0x" + to_hexString(Type - SHT_LOUSER); - return "0x" + to_hexString(Type) + ": "; + return "LOUSER+0x" + utohexstr(Type - SHT_LOUSER); + return "0x" + utohexstr(Type) + ": "; } static std::string getSectionTypeString(unsigned Machine, unsigned Type) { StringRef Name = getELFSectionTypeName(Machine, Type); // Handle SHT_GNU_* type names. - if (Name.startswith("SHT_GNU_")) { - if (Name == "SHT_GNU_HASH") + if (Name.consume_front("SHT_GNU_")) { + if (Name == "HASH") return "GNU_HASH"; // E.g. SHT_GNU_verneed -> VERNEED. - return Name.drop_front(8).upper(); + return Name.upper(); } if (Name == "SHT_SYMTAB_SHNDX") return "SYMTAB SECTION INDICES"; - if (Name.startswith("SHT_")) - return Name.drop_front(4).str(); + if (Name.consume_front("SHT_")) + return Name.str(); return getSectionTypeOffsetString(Type); } @@ -3647,7 +3685,7 @@ template void GNUELFDumper::printSectionHeaders() { ArrayRef Sections = cantFail(this->Obj.sections()); OS << "There are " << to_string(Sections.size()) << " section headers, starting at offset " - << "0x" << to_hexString(this->Obj.getHeader().e_shoff, false) << ":\n\n"; + << "0x" << utohexstr(this->Obj.getHeader().e_shoff, /*LowerCase=*/true) << ":\n\n"; OS << "Section Headers:\n"; Field Fields[11] = { {"[Nr]", 2}, {"Name", 7}, {"Type", 25}, @@ -3680,7 +3718,8 @@ template void GNUELFDumper::printSectionHeaders() { Fields[4].Str = to_string(format_hex_no_prefix(Sec.sh_offset, 6)); Fields[5].Str = to_string(format_hex_no_prefix(Sec.sh_size, 6)); Fields[6].Str = to_string(format_hex_no_prefix(Sec.sh_entsize, 2)); - Fields[7].Str = getGNUFlags(this->Obj.getHeader().e_machine, Sec.sh_flags); + Fields[7].Str = getGNUFlags(this->Obj.getHeader().e_ident[ELF::EI_OSABI], + this->Obj.getHeader().e_machine, Sec.sh_flags); Fields[8].Str = to_string(Sec.sh_link); Fields[9].Str = to_string(Sec.sh_info); Fields[10].Str = to_string(Sec.sh_addralign); @@ -3804,7 +3843,7 @@ void GNUELFDumper::printSymbol(const Elf_Sym &Symbol, unsigned SymIndex, Other &= ~STO_AARCH64_VARIANT_PCS; Fields[5].Str += " [VARIANT_PCS"; if (Other != 0) - Fields[5].Str.append(" | " + to_hexString(Other, false)); + Fields[5].Str.append(" | " + utohexstr(Other, /*LowerCase=*/true)); Fields[5].Str.append("]"); } } else if (this->Obj.getHeader().e_machine == ELF::EM_RISCV) { @@ -3813,7 +3852,7 @@ void GNUELFDumper::printSymbol(const Elf_Sym &Symbol, unsigned SymIndex, Other &= ~STO_RISCV_VARIANT_CC; Fields[5].Str += " [VARIANT_CC"; if (Other != 0) - Fields[5].Str.append(" | " + to_hexString(Other, false)); + Fields[5].Str.append(" | " + utohexstr(Other, /*LowerCase=*/true)); Fields[5].Str.append("]"); } } else { @@ -4025,7 +4064,7 @@ template void GNUELFDumper::printSectionDetails() { ArrayRef Sections = cantFail(this->Obj.sections()); OS << "There are " << to_string(Sections.size()) << " section headers, starting at offset " - << "0x" << to_hexString(this->Obj.getHeader().e_shoff, false) << ":\n\n"; + << "0x" << utohexstr(this->Obj.getHeader().e_shoff, /*LowerCase=*/true) << ":\n\n"; OS << "Section Headers:\n"; @@ -5041,6 +5080,57 @@ static bool printGNUNote(raw_ostream &OS, uint32_t 
NoteType, return true; } +using AndroidNoteProperties = std::vector>; +static AndroidNoteProperties getAndroidNoteProperties(uint32_t NoteType, + ArrayRef Desc) { + AndroidNoteProperties Props; + switch (NoteType) { + case ELF::NT_ANDROID_TYPE_MEMTAG: + if (Desc.empty()) { + Props.emplace_back("Invalid .note.android.memtag", ""); + return Props; + } + + switch (Desc[0] & NT_MEMTAG_LEVEL_MASK) { + case NT_MEMTAG_LEVEL_NONE: + Props.emplace_back("Tagging Mode", "NONE"); + break; + case NT_MEMTAG_LEVEL_ASYNC: + Props.emplace_back("Tagging Mode", "ASYNC"); + break; + case NT_MEMTAG_LEVEL_SYNC: + Props.emplace_back("Tagging Mode", "SYNC"); + break; + default: + Props.emplace_back( + "Tagging Mode", + ("Unknown (" + Twine::utohexstr(Desc[0] & NT_MEMTAG_LEVEL_MASK) + ")") + .str()); + break; + } + Props.emplace_back("Heap", + (Desc[0] & NT_MEMTAG_HEAP) ? "Enabled" : "Disabled"); + Props.emplace_back("Stack", + (Desc[0] & NT_MEMTAG_STACK) ? "Enabled" : "Disabled"); + break; + default: + return Props; + } + return Props; +} + +static bool printAndroidNote(raw_ostream &OS, uint32_t NoteType, + ArrayRef Desc) { + // Return true if we were able to pretty-print the note, false otherwise. + AndroidNoteProperties Props = getAndroidNoteProperties(NoteType, Desc); + if (Props.empty()) + return false; + for (const auto &KV : Props) + OS << " " << KV.first << ": " << KV.second << '\n'; + OS << '\n'; + return true; +} + template static bool printLLVMOMPOFFLOADNote(raw_ostream &OS, uint32_t NoteType, ArrayRef Desc) { @@ -5400,6 +5490,13 @@ const NoteType LLVMOMPOFFLOADNoteTypes[] = { "NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION (producing toolchain version)"}, }; +const NoteType AndroidNoteTypes[] = { + {ELF::NT_ANDROID_TYPE_IDENT, "NT_ANDROID_TYPE_IDENT"}, + {ELF::NT_ANDROID_TYPE_KUSER, "NT_ANDROID_TYPE_KUSER"}, + {ELF::NT_ANDROID_TYPE_MEMTAG, + "NT_ANDROID_TYPE_MEMTAG (Android memory tagging information)"}, +}; + const NoteType CoreNoteTypes[] = { {ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"}, {ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"}, @@ -5508,6 +5605,8 @@ StringRef getNoteTypeName(const typename ELFT::Note &Note, unsigned ELFType) { return FindNote(AMDGPUNoteTypes); if (Name == "LLVMOMPOFFLOAD") return FindNote(LLVMOMPOFFLOADNoteTypes); + if (Name == "Android") + return FindNote(AndroidNoteTypes); if (ELFType == ELF::ET_CORE) return FindNote(CoreNoteTypes); @@ -5658,6 +5757,9 @@ template void GNUELFDumper::printNotes() { return NoteOrErr.takeError(); } } + } else if (Name == "Android") { + if (printAndroidNote(OS, Type, Descriptor)) + return Error::success(); } if (!Descriptor.empty()) { OS << " description data:"; @@ -5838,7 +5940,7 @@ template SmallVector ELFDumper::getSymbolIndexesForFunctionAddress( uint64_t SymValue, Optional FunctionSec) { SmallVector SymbolIndexes; - if (!this->AddressToIndexMap.hasValue()) { + if (!this->AddressToIndexMap) { // Populate the address to index map upon the first invocation of this // function. 
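
As the comment above says, AddressToIndexMap is an optional map that stays unset until the first query, is emplace()d on demand, and is reused by every later call. A self-contained sketch of that lazy-cache pattern, with std::optional standing in for llvm::Optional and invented symbol data:

#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <vector>

struct Dumper {
  std::vector<uint64_t> SymbolAddresses = {0x1000, 0x2000, 0x1000};

  // Built lazily: empty until the first query, then reused on every call,
  // mirroring AddressToIndexMap in the hunk above.
  std::optional<std::multimap<uint64_t, size_t>> AddressToIndexMap;

  std::vector<size_t> indexesForAddress(uint64_t Addr) {
    if (!AddressToIndexMap) {
      AddressToIndexMap.emplace(); // First invocation: populate the cache.
      for (size_t I = 0; I < SymbolAddresses.size(); ++I)
        AddressToIndexMap->emplace(SymbolAddresses[I], I);
    }
    std::vector<size_t> Result;
    auto [Begin, End] = AddressToIndexMap->equal_range(Addr);
    for (auto It = Begin; It != End; ++It)
      Result.push_back(It->second);
    return Result;
  }
};

int main() {
  Dumper D;
  for (size_t Idx : D.indexesForAddress(0x1000))
    std::cout << "symbol index " << Idx << "\n"; // prints 0 and 2
}
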
this->AddressToIndexMap.emplace(); @@ -5991,9 +6093,8 @@ void ELFDumper::printStackSize(const Relocation &R, return; } - uint64_t SymValue = - Resolver(R.Type, Offset, RelocSymValue, Data.getAddress(&Offset), - R.Addend.getValueOr(0)); + uint64_t SymValue = Resolver(R.Type, Offset, RelocSymValue, + Data.getAddress(&Offset), R.Addend.value_or(0)); this->printFunctionStackSize(SymValue, FunctionSec, StackSizeSec, Data, &Offset); } @@ -6368,7 +6469,7 @@ template void LLVMELFDumper::printFileHeaders() { else TypeStr = "Unknown"; } - W.printString("Type", TypeStr + " (0x" + to_hexString(E.e_type) + ")"); + W.printString("Type", TypeStr + " (0x" + utohexstr(E.e_type) + ")"); W.printEnum("Machine", E.e_machine, makeArrayRef(ElfMachineType)); W.printNumber("Version", E.e_version); @@ -6501,7 +6602,8 @@ template void LLVMELFDumper::printSectionHeaders() { int SectionIndex = -1; std::vector> FlagsList = - getSectionFlagsForTarget(this->Obj.getHeader().e_machine); + getSectionFlagsForTarget(this->Obj.getHeader().e_ident[ELF::EI_OSABI], + this->Obj.getHeader().e_machine); for (const Elf_Shdr &Sec : cantFail(this->Obj.sections())) { DictScope SectionD(W, "Section"); W.printNumber("Index", ++SectionIndex); @@ -6932,8 +7034,10 @@ template void LLVMELFDumper::printCGProfile() { template void LLVMELFDumper::printBBAddrMaps() { bool IsRelocatable = this->Obj.getHeader().e_type == ELF::ET_REL; for (const Elf_Shdr &Sec : cantFail(this->Obj.sections())) { - if (Sec.sh_type != SHT_LLVM_BB_ADDR_MAP) + if (Sec.sh_type != SHT_LLVM_BB_ADDR_MAP && + Sec.sh_type != SHT_LLVM_BB_ADDR_MAP_V0) { continue; + } Optional FunctionSec = None; if (IsRelocatable) FunctionSec = @@ -7024,6 +7128,17 @@ static bool printGNUNoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, return true; } +static bool printAndroidNoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, + ScopedPrinter &W) { + // Return true if we were able to pretty-print the note, false otherwise. + AndroidNoteProperties Props = getAndroidNoteProperties(NoteType, Desc); + if (Props.empty()) + return false; + for (const auto &KV : Props) + W.printString(KV.first, KV.second); + return true; +} + template static bool printLLVMOMPOFFLOADNoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, @@ -7126,6 +7241,9 @@ template void LLVMELFDumper::printNotes() { return N.takeError(); } } + } else if (Name == "Android") { + if (printAndroidNoteLLVMStyle(Type, Descriptor, W)) + return Error::success(); } if (!Descriptor.empty()) { W.printBinaryBlock("Description data", Descriptor); diff --git a/llvm/tools/llvm-readobj/MachODumper.cpp b/llvm/tools/llvm-readobj/MachODumper.cpp index 599b0355917e..4931ab575bb2 100644 --- a/llvm/tools/llvm-readobj/MachODumper.cpp +++ b/llvm/tools/llvm-readobj/MachODumper.cpp @@ -13,6 +13,7 @@ #include "ObjDumper.h" #include "StackMapPrinter.h" #include "llvm-readobj.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Object/MachO.h" @@ -39,6 +40,11 @@ public: void printNeededLibraries() override; + bool canCompareSymbols() const override { return true; } + bool compareSymbolsByName(object::SymbolRef LHS, + object::SymbolRef RHS) const override; + bool compareSymbolsByType(object::SymbolRef LHS, + object::SymbolRef RHS) const override; // MachO-specific. 
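
The comparator hooks declared above feed the sorted printSymbols() overloads added further down: when a comparator is supplied, the symbols are copied and stable-sorted before printing, otherwise they are emitted in on-disk order. A minimal sketch of that flow; Symbol and its fields are illustrative, not the Mach-O nlist layout.

#include <algorithm>
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Symbol {
  std::string Name;
  int Type;
};

using Comparator = std::function<bool(const Symbol &, const Symbol &)>;

// Mirrors printSymbols(Optional<SymbolComparator>): sort a copy when a
// comparator is supplied, otherwise print in on-disk order.
void printSymbols(const std::vector<Symbol> &Syms,
                  const std::optional<Comparator> &Comp) {
  std::vector<Symbol> Sorted(Syms.begin(), Syms.end());
  if (Comp)
    std::stable_sort(Sorted.begin(), Sorted.end(), *Comp); // keeps ties stable
  for (const Symbol &S : Sorted)
    std::cout << S.Name << " (type " << S.Type << ")\n";
}

int main() {
  std::vector<Symbol> Syms = {{"_main", 2}, {"_abort", 1}, {"_exit", 1}};
  printSymbols(Syms, Comparator([](const Symbol &L, const Symbol &R) {
                 return L.Name < R.Name;
               }));
  printSymbols(Syms, std::nullopt); // unsorted: on-disk order
}
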
void printMachODataInCode() override; void printMachOVersionMin() override; @@ -51,10 +57,14 @@ private: template void printFileHeaders(const MachHeader &Header); - StringRef getSymbolName(const SymbolRef &Symbol); + StringRef getSymbolName(const SymbolRef &Symbol) const; + uint8_t getSymbolType(const SymbolRef &Symbol) const; void printSymbols() override; + void printSymbols(Optional SymComp) override; void printDynamicSymbols() override; + void printDynamicSymbols(Optional SymComp) override; + void printSymbol(const SymbolRef &Symbol, ScopedPrinter &W); void printSymbol(const SymbolRef &Symbol); void printRelocation(const RelocationRef &Reloc); @@ -602,7 +612,7 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj, } } -StringRef MachODumper::getSymbolName(const SymbolRef &Symbol) { +StringRef MachODumper::getSymbolName(const SymbolRef &Symbol) const { Expected SymbolNameOrErr = Symbol.getName(); if (!SymbolNameOrErr) { reportError(SymbolNameOrErr.takeError(), Obj->getFileName()); @@ -610,19 +620,50 @@ StringRef MachODumper::getSymbolName(const SymbolRef &Symbol) { return *SymbolNameOrErr; } -void MachODumper::printSymbols() { - ListScope Group(W, "Symbols"); +uint8_t MachODumper::getSymbolType(const SymbolRef &Symbol) const { + return Obj->is64Bit() + ? Obj->getSymbol64TableEntry(Symbol.getRawDataRefImpl()).n_type + : Obj->getSymbolTableEntry(Symbol.getRawDataRefImpl()).n_type; +} + +bool MachODumper::compareSymbolsByName(SymbolRef LHS, SymbolRef RHS) const { + return getSymbolName(LHS).str().compare(getSymbolName(RHS).str()) < 0; +} + +bool MachODumper::compareSymbolsByType(SymbolRef LHS, SymbolRef RHS) const { + return getSymbolType(LHS) < getSymbolType(RHS); +} + +void MachODumper::printSymbols() { printSymbols(None); } - for (const SymbolRef &Symbol : Obj->symbols()) { - printSymbol(Symbol); +void MachODumper::printSymbols(Optional SymComp) { + ListScope Group(W, "Symbols"); + if (SymComp) { + auto SymbolRange = Obj->symbols(); + std::vector SortedSymbols(SymbolRange.begin(), + SymbolRange.end()); + llvm::stable_sort(SortedSymbols, *SymComp); + for (SymbolRef Symbol : SortedSymbols) + printSymbol(Symbol); + } else { + for (const SymbolRef &Symbol : Obj->symbols()) { + printSymbol(Symbol); + } } } void MachODumper::printDynamicSymbols() { ListScope Group(W, "DynamicSymbols"); } +void MachODumper::printDynamicSymbols(Optional SymComp) { + ListScope Group(W, "DynamicSymbols"); +} void MachODumper::printSymbol(const SymbolRef &Symbol) { + printSymbol(Symbol, W); +} + +void MachODumper::printSymbol(const SymbolRef &Symbol, ScopedPrinter &W) { StringRef SymbolName = getSymbolName(Symbol); MachOSymbol MOSymbol; diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index a09a243d381e..292efd2ae350 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -9,9 +9,14 @@ #ifndef LLVM_TOOLS_LLVM_READOBJ_OBJDUMPER_H #define LLVM_TOOLS_LLVM_READOBJ_OBJDUMPER_H +#include #include #include +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" @@ -25,7 +30,7 @@ class COFFImportFile; class ObjectFile; class XCOFFObjectFile; class ELFObjectFileBase; -} +} // namespace object namespace codeview { class GlobalTypeTableBuilder; class MergingTypeTableBuilder; @@ -33,6 +38,33 @@ class MergingTypeTableBuilder; class ScopedPrinter; +// 
Comparator to compare symbols. +// Usage: the caller registers predicates (i.e., how to compare the symbols) by +// calling addPredicate(). The order in which predicates are registered is also +// their priority. +class SymbolComparator { +public: + using CompPredicate = + std::function; + + // Each Obj format has a slightly different way of retrieving a symbol's info + // So we defer the predicate's impl to each format. + void addPredicate(CompPredicate Pred) { Predicates.push_back(Pred); } + + bool operator()(object::SymbolRef LHS, object::SymbolRef RHS) { + for (CompPredicate Pred : Predicates) { + if (Pred(LHS, RHS)) + return true; + if (Pred(RHS, LHS)) + return false; + } + return false; + } + +private: + SmallVector Predicates; +}; + class ObjDumper { public: ObjDumper(ScopedPrinter &Writer, StringRef ObjName); @@ -52,6 +84,17 @@ public: if (PrintDynamicSymbols) printDynamicSymbols(); } + virtual void printSymbols(bool PrintSymbols, bool PrintDynamicSymbols, + llvm::Optional SymComp) { + if (SymComp) { + if (PrintSymbols) + printSymbols(SymComp); + if (PrintDynamicSymbols) + printDynamicSymbols(SymComp); + } else { + printSymbols(PrintSymbols, PrintDynamicSymbols); + } + } virtual void printProgramHeaders(bool PrintProgramHeaders, cl::boolOrDefault PrintSectionMapping) { if (PrintProgramHeaders) @@ -62,6 +105,17 @@ public: virtual void printUnwindInfo() = 0; + // Symbol comparison functions. + virtual bool canCompareSymbols() const { return false; } + virtual bool compareSymbolsByName(object::SymbolRef LHS, + object::SymbolRef RHS) const { + return true; + } + virtual bool compareSymbolsByType(object::SymbolRef LHS, + object::SymbolRef RHS) const { + return true; + } + // Only implemented for ELF at this time. virtual void printDependentLibs() {} virtual void printDynamicRelocations() { } @@ -133,7 +187,9 @@ protected: private: virtual void printSymbols() {} + virtual void printSymbols(llvm::Optional Comp) {} virtual void printDynamicSymbols() {} + virtual void printDynamicSymbols(llvm::Optional Comp) {} virtual void printProgramHeaders() {} virtual void printSectionMapping() {} diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index d0f273fa60c7..4687fc71245f 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -37,6 +37,7 @@ def section_mapping : FF<"section-mapping", "Display the section to segment mapp def section_mapping_EQ_false : FF<"section-mapping=false", "Don't display the section to segment mapping">, Flags<[HelpHidden]>; def section_relocations : FF<"section-relocations", "Display relocations for each section shown. This option has no effect for GNU style output">; def section_symbols : FF<"section-symbols", "Display symbols for each section shown. This option has no effect for GNU style output">; +defm sort_symbols : Eq<"sort-symbols", "Specify the keys to sort the symbols before displaying symtab">; def stack_sizes : FF<"stack-sizes", "Display contents of all stack sizes sections. This option has no effect for GNU style output">; def stackmap : FF<"stackmap", "Display contents of stackmap section">; defm string_dump : Eq<"string-dump", "Display the specified section(s) as a list of strings">, MetaVarName<"">; @@ -86,7 +87,7 @@ def coff_tls_directory : FF<"coff-tls-directory", "Display TLS directory">, Grou // XCOFF specific options. 
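The SymbolComparator above composes its predicates lexicographically: a later predicate is consulted only when an earlier one finds the two symbols equivalent (neither Pred(LHS, RHS) nor Pred(RHS, LHS) holds), so registration order is priority order. A standalone sketch of the same technique, detached from the object::SymbolRef plumbing and fed to llvm::stable_sort the way MachODumper::printSymbols does; the Sym record is hypothetical:

    #include "llvm/ADT/STLExtras.h"
    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    struct Sym {
      std::string Name;
      int Type;
    };

    class ChainedLess {
      std::vector<std::function<bool(const Sym &, const Sym &)>> Preds;

    public:
      // Registration order is priority order, as in SymbolComparator.
      void addPredicate(std::function<bool(const Sym &, const Sym &)> P) {
        Preds.push_back(std::move(P));
      }
      bool operator()(const Sym &L, const Sym &R) const {
        for (const auto &P : Preds) {
          if (P(L, R))
            return true;  // L sorts strictly before R under this key.
          if (P(R, L))
            return false; // R sorts strictly before L under this key.
          // Equivalent under this key; fall through to the next one.
        }
        return false; // Equivalent under every registered key.
      }
    };

    void sortSymbols(std::vector<Sym> &Syms) {
      ChainedLess Less;
      Less.addPredicate(
          [](const Sym &L, const Sym &R) { return L.Name < R.Name; });
      Less.addPredicate(
          [](const Sym &L, const Sym &R) { return L.Type < R.Type; });
      // stable_sort keeps symbol-table order for entries equal under all keys.
      llvm::stable_sort(Syms, Less);
    }

Together with the option defined in the Opts.td hunk below, this is what llvm-readobj --symbols --sort-symbols=name,type does: name is the primary key, and type only breaks ties.
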
def grp_xcoff : OptionGroup<"kind">, HelpText<"OPTIONS (XCOFF specific)">; -def auxiliary_header : FF<"auxiliary-header" , "display the auxiliary header">, Group; +def auxiliary_header : FF<"auxiliary-header" , "Display the auxiliary header">, Group; def help : FF<"help", "Display this help">; def version : FF<"version", "Display the version">; diff --git a/llvm/tools/llvm-readobj/WasmDumper.cpp b/llvm/tools/llvm-readobj/WasmDumper.cpp index b4d726016437..cf80a2d13d2d 100644 --- a/llvm/tools/llvm-readobj/WasmDumper.cpp +++ b/llvm/tools/llvm-readobj/WasmDumper.cpp @@ -179,13 +179,15 @@ void WasmDumper::printSectionHeaders() { if (!Seg.Name.empty()) W.printString("Name", Seg.Name); W.printNumber("Size", static_cast(Seg.Content.size())); - if (Seg.Offset.Opcode == wasm::WASM_OPCODE_I32_CONST) - W.printNumber("Offset", Seg.Offset.Value.Int32); - else if (Seg.Offset.Opcode == wasm::WASM_OPCODE_I64_CONST) - W.printNumber("Offset", Seg.Offset.Value.Int64); - else if (Seg.Offset.Opcode == wasm::WASM_OPCODE_GLOBAL_GET) { + if (Seg.Offset.Extended) + llvm_unreachable("extended const exprs not supported"); + else if (Seg.Offset.Inst.Opcode == wasm::WASM_OPCODE_I32_CONST) + W.printNumber("Offset", Seg.Offset.Inst.Value.Int32); + else if (Seg.Offset.Inst.Opcode == wasm::WASM_OPCODE_I64_CONST) + W.printNumber("Offset", Seg.Offset.Inst.Value.Int64); + else if (Seg.Offset.Inst.Opcode == wasm::WASM_OPCODE_GLOBAL_GET) { ListScope Group(W, "Offset"); - W.printNumber("Global", Seg.Offset.Value.Global); + W.printNumber("Global", Seg.Offset.Inst.Value.Global); } else llvm_unreachable("unknown init expr opcode"); } diff --git a/llvm/tools/llvm-readobj/XCOFFDumper.cpp b/llvm/tools/llvm-readobj/XCOFFDumper.cpp index 6e778d558d4f..ccae66f20127 100644 --- a/llvm/tools/llvm-readobj/XCOFFDumper.cpp +++ b/llvm/tools/llvm-readobj/XCOFFDumper.cpp @@ -17,7 +17,6 @@ #include "llvm/Support/ScopedPrinter.h" #include -#include using namespace llvm; using namespace object; @@ -41,6 +40,8 @@ public: void printNeededLibraries() override; void printStringTable() override; + ScopedPrinter &getScopedPrinter() const { return W; } + private: template void printSectionHeaders(ArrayRef Sections); template void printGenericSectionHeader(T &Sec) const; @@ -113,6 +114,8 @@ void XCOFFDumper::printFileHeaders() { } void XCOFFDumper::printAuxiliaryHeader() { + DictScope DS(W, "AuxiliaryHeader"); + if (Obj.is64Bit()) printAuxiliaryHeader(Obj.auxiliaryHeader64()); else @@ -736,6 +739,46 @@ void XCOFFDumper::printGenericSectionHeader(T &Sec) const { W.printNumber("NumberOfLineNumbers", Sec.NumberOfLineNumbers); } +enum PrintStyle { Hex, Number }; +template +static void printAuxMemberHelper(PrintStyle Style, const char *MemberName, + const T &Member, const V *AuxHeader, + uint16_t AuxSize, uint16_t &PartialFieldOffset, + const char *&PartialFieldName, + ScopedPrinter &W) { + ptrdiff_t Offset = reinterpret_cast(&Member) - + reinterpret_cast(AuxHeader); + if (Offset + sizeof(Member) <= AuxSize) + Style == Hex ? 
W.printHex(MemberName, Member) + : W.printNumber(MemberName, Member); + else if (Offset < AuxSize) { + PartialFieldOffset = Offset; + PartialFieldName = MemberName; + } +} + +template +void checkAndPrintAuxHeaderParseError(const char *PartialFieldName, + uint16_t PartialFieldOffset, + uint16_t AuxSize, T &AuxHeader, + XCOFFDumper *Dumper) { + if (PartialFieldOffset < AuxSize) { + Dumper->reportUniqueWarning(Twine("only partial field for ") + + PartialFieldName + " at offset (" + + Twine(PartialFieldOffset) + ")"); + Dumper->getScopedPrinter().printBinary( + "Raw data", "", + ArrayRef(reinterpret_cast(&AuxHeader) + + PartialFieldOffset, + AuxSize - PartialFieldOffset)); + } else if (sizeof(AuxHeader) < AuxSize) + Dumper->getScopedPrinter().printBinary( + "Extra raw data", "", + ArrayRef(reinterpret_cast(&AuxHeader) + + sizeof(AuxHeader), + AuxSize - sizeof(AuxHeader))); +} + void XCOFFDumper::printAuxiliaryHeader( const XCOFFAuxiliaryHeader32 *AuxHeader) { if (AuxHeader == nullptr) @@ -744,44 +787,40 @@ void XCOFFDumper::printAuxiliaryHeader( uint16_t PartialFieldOffset = AuxSize; const char *PartialFieldName = nullptr; - DictScope DS(W, "AuxiliaryHeader"); - -#define PrintAuxMember32(H, S, T) \ - if (offsetof(XCOFFAuxiliaryHeader32, T) + \ - sizeof(XCOFFAuxiliaryHeader32::T) <= \ - AuxSize) \ - W.print##H(S, AuxHeader->T); \ - else if (offsetof(XCOFFAuxiliaryHeader32, T) < AuxSize) { \ - PartialFieldOffset = offsetof(XCOFFAuxiliaryHeader32, T); \ - PartialFieldName = S; \ - } + auto PrintAuxMember = [&](PrintStyle Style, const char *MemberName, + auto &Member) { + printAuxMemberHelper(Style, MemberName, Member, AuxHeader, AuxSize, + PartialFieldOffset, PartialFieldName, W); + }; - PrintAuxMember32(Hex, "Magic", AuxMagic); - PrintAuxMember32(Hex, "Version", Version); - PrintAuxMember32(Hex, "Size of .text section", TextSize); - PrintAuxMember32(Hex, "Size of .data section", InitDataSize); - PrintAuxMember32(Hex, "Size of .bss section", BssDataSize); - PrintAuxMember32(Hex, "Entry point address", EntryPointAddr); - PrintAuxMember32(Hex, ".text section start address", TextStartAddr); - PrintAuxMember32(Hex, ".data section start address", DataStartAddr); - PrintAuxMember32(Hex, "TOC anchor address", TOCAnchorAddr); - PrintAuxMember32(Number, "Section number of entryPoint", SecNumOfEntryPoint); - PrintAuxMember32(Number, "Section number of .text", SecNumOfText); - PrintAuxMember32(Number, "Section number of .data", SecNumOfData); - PrintAuxMember32(Number, "Section number of TOC", SecNumOfTOC); - PrintAuxMember32(Number, "Section number of loader data", SecNumOfLoader); - PrintAuxMember32(Number, "Section number of .bss", SecNumOfBSS); - PrintAuxMember32(Hex, "Maxium alignment of .text", MaxAlignOfText); - PrintAuxMember32(Hex, "Maxium alignment of .data", MaxAlignOfData); - PrintAuxMember32(Hex, "Module type", ModuleType); - PrintAuxMember32(Hex, "CPU type of objects", CpuFlag); - PrintAuxMember32(Hex, "(Reserved)", CpuType); - PrintAuxMember32(Hex, "Maximum stack size", MaxStackSize); - PrintAuxMember32(Hex, "Maximum data size", MaxDataSize); - PrintAuxMember32(Hex, "Reserved for debugger", ReservedForDebugger); - PrintAuxMember32(Hex, "Text page size", TextPageSize); - PrintAuxMember32(Hex, "Data page size", DataPageSize); - PrintAuxMember32(Hex, "Stack page size", StackPageSize); + PrintAuxMember(Hex, "Magic", AuxHeader->AuxMagic); + PrintAuxMember(Hex, "Version", AuxHeader->Version); + PrintAuxMember(Hex, "Size of .text section", AuxHeader->TextSize); + PrintAuxMember(Hex, "Size of .data 
section", AuxHeader->InitDataSize); + PrintAuxMember(Hex, "Size of .bss section", AuxHeader->BssDataSize); + PrintAuxMember(Hex, "Entry point address", AuxHeader->EntryPointAddr); + PrintAuxMember(Hex, ".text section start address", AuxHeader->TextStartAddr); + PrintAuxMember(Hex, ".data section start address", AuxHeader->DataStartAddr); + PrintAuxMember(Hex, "TOC anchor address", AuxHeader->TOCAnchorAddr); + PrintAuxMember(Number, "Section number of entryPoint", + AuxHeader->SecNumOfEntryPoint); + PrintAuxMember(Number, "Section number of .text", AuxHeader->SecNumOfText); + PrintAuxMember(Number, "Section number of .data", AuxHeader->SecNumOfData); + PrintAuxMember(Number, "Section number of TOC", AuxHeader->SecNumOfTOC); + PrintAuxMember(Number, "Section number of loader data", + AuxHeader->SecNumOfLoader); + PrintAuxMember(Number, "Section number of .bss", AuxHeader->SecNumOfBSS); + PrintAuxMember(Hex, "Maxium alignment of .text", AuxHeader->MaxAlignOfText); + PrintAuxMember(Hex, "Maxium alignment of .data", AuxHeader->MaxAlignOfData); + PrintAuxMember(Hex, "Module type", AuxHeader->ModuleType); + PrintAuxMember(Hex, "CPU type of objects", AuxHeader->CpuFlag); + PrintAuxMember(Hex, "(Reserved)", AuxHeader->CpuType); + PrintAuxMember(Hex, "Maximum stack size", AuxHeader->MaxStackSize); + PrintAuxMember(Hex, "Maximum data size", AuxHeader->MaxDataSize); + PrintAuxMember(Hex, "Reserved for debugger", AuxHeader->ReservedForDebugger); + PrintAuxMember(Hex, "Text page size", AuxHeader->TextPageSize); + PrintAuxMember(Hex, "Data page size", AuxHeader->DataPageSize); + PrintAuxMember(Hex, "Stack page size", AuxHeader->StackPageSize); if (offsetof(XCOFFAuxiliaryHeader32, FlagAndTDataAlignment) + sizeof(XCOFFAuxiliaryHeader32::FlagAndTDataAlignment) <= AuxSize) { @@ -790,35 +829,11 @@ void XCOFFDumper::printAuxiliaryHeader( AuxHeader->getTDataAlignment()); } - PrintAuxMember32(Number, "Section number for .tdata", SecNumOfTData); - PrintAuxMember32(Number, "Section number for .tbss", SecNumOfTBSS); + PrintAuxMember(Number, "Section number for .tdata", AuxHeader->SecNumOfTData); + PrintAuxMember(Number, "Section number for .tbss", AuxHeader->SecNumOfTBSS); - // Deal with error. 
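printAuxMemberHelper and the PrintAuxMember lambdas above replace the old PrintAuxMember32/PrintAuxMember64 macros with one template: each field's offset is recovered by pointer arithmetic against the header's base address, so a single helper serves both header widths and can flag a field that the on-disk auxiliary header truncates mid-member. A reduced sketch of that check, using a hypothetical header type and printf in place of ScopedPrinter:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct Hdr {
      uint16_t Magic;
      uint32_t TextSize;
      uint64_t EntryPointAddr;
    };

    // Print each member only if it lies wholly inside the first AuxSize bytes
    // of H; a field that begins inside but ends outside is a truncated
    // "partial" field, and anything at or past AuxSize is simply absent.
    void dump(const Hdr &H, uint16_t AuxSize) {
      auto PrintMember = [&](const char *Name, const auto &Member) {
        std::ptrdiff_t Offset = reinterpret_cast<const char *>(&Member) -
                                reinterpret_cast<const char *>(&H);
        if (Offset + static_cast<std::ptrdiff_t>(sizeof(Member)) <= AuxSize)
          std::printf("%s = %llu\n", Name,
                      static_cast<unsigned long long>(Member));
        else if (Offset < AuxSize)
          std::printf("%s: only partially present\n", Name);
      };
      PrintMember("Magic", H.Magic);
      PrintMember("Size of .text section", H.TextSize);
      PrintMember("Entry point address", H.EntryPointAddr);
    }

The generic lambda (auto &Member) is what lets one closure handle uint16_t through uint64_t fields, where the old code stamped out a macro per header width.
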
- if (PartialFieldOffset < AuxSize) { - std::string ErrInfo; - llvm::raw_string_ostream StringOS(ErrInfo); - StringOS << "Only partial field for " << PartialFieldName << " at offset (" - << PartialFieldOffset << ")."; - StringOS.flush(); - reportWarning( - make_error(ErrInfo, object_error::parse_failed), - "-"); - W.printBinary( - "Raw data", "", - ArrayRef((const uint8_t *)(AuxHeader) + PartialFieldOffset, - AuxSize - PartialFieldOffset)); - } else if (sizeof(XCOFFAuxiliaryHeader32) < AuxSize) { - reportWarning(make_error( - "There are extra data beyond auxiliary header", - object_error::parse_failed), - "-"); - W.printBinary("Extra raw data", "", - ArrayRef((const uint8_t *)(AuxHeader) + - sizeof(XCOFFAuxiliaryHeader32), - AuxSize - sizeof(XCOFFAuxiliaryHeader32))); - } - -#undef PrintAuxMember32 + checkAndPrintAuxHeaderParseError(PartialFieldName, PartialFieldOffset, + AuxSize, *AuxHeader, this); } void XCOFFDumper::printAuxiliaryHeader( @@ -829,38 +844,34 @@ void XCOFFDumper::printAuxiliaryHeader( uint16_t PartialFieldOffset = AuxSize; const char *PartialFieldName = nullptr; - DictScope DS(W, "AuxiliaryHeader"); - -#define PrintAuxMember64(H, S, T) \ - if (offsetof(XCOFFAuxiliaryHeader64, T) + \ - sizeof(XCOFFAuxiliaryHeader64::T) <= \ - AuxSize) \ - W.print##H(S, AuxHeader->T); \ - else if (offsetof(XCOFFAuxiliaryHeader64, T) < AuxSize) { \ - PartialFieldOffset = offsetof(XCOFFAuxiliaryHeader64, T); \ - PartialFieldName = S; \ - } + auto PrintAuxMember = [&](PrintStyle Style, const char *MemberName, + auto &Member) { + printAuxMemberHelper(Style, MemberName, Member, AuxHeader, AuxSize, + PartialFieldOffset, PartialFieldName, W); + }; - PrintAuxMember64(Hex, "Magic", AuxMagic); - PrintAuxMember64(Hex, "Version", Version); - PrintAuxMember64(Hex, "Reserved for debugger", ReservedForDebugger); - PrintAuxMember64(Hex, ".text section start address", TextStartAddr); - PrintAuxMember64(Hex, ".data section start address", DataStartAddr); - PrintAuxMember64(Hex, "TOC anchor address", TOCAnchorAddr); - PrintAuxMember64(Number, "Section number of entryPoint", SecNumOfEntryPoint); - PrintAuxMember64(Number, "Section number of .text", SecNumOfText); - PrintAuxMember64(Number, "Section number of .data", SecNumOfData); - PrintAuxMember64(Number, "Section number of TOC", SecNumOfTOC); - PrintAuxMember64(Number, "Section number of loader data", SecNumOfLoader); - PrintAuxMember64(Number, "Section number of .bss", SecNumOfBSS); - PrintAuxMember64(Hex, "Maxium alignment of .text", MaxAlignOfText); - PrintAuxMember64(Hex, "Maxium alignment of .data", MaxAlignOfData); - PrintAuxMember64(Hex, "Module type", ModuleType); - PrintAuxMember64(Hex, "CPU type of objects", CpuFlag); - PrintAuxMember64(Hex, "(Reserved)", CpuType); - PrintAuxMember64(Hex, "Text page size", TextPageSize); - PrintAuxMember64(Hex, "Data page size", DataPageSize); - PrintAuxMember64(Hex, "Stack page size", StackPageSize); + PrintAuxMember(Hex, "Magic", AuxHeader->AuxMagic); + PrintAuxMember(Hex, "Version", AuxHeader->Version); + PrintAuxMember(Hex, "Reserved for debugger", AuxHeader->ReservedForDebugger); + PrintAuxMember(Hex, ".text section start address", AuxHeader->TextStartAddr); + PrintAuxMember(Hex, ".data section start address", AuxHeader->DataStartAddr); + PrintAuxMember(Hex, "TOC anchor address", AuxHeader->TOCAnchorAddr); + PrintAuxMember(Number, "Section number of entryPoint", + AuxHeader->SecNumOfEntryPoint); + PrintAuxMember(Number, "Section number of .text", AuxHeader->SecNumOfText); + PrintAuxMember(Number, "Section 
number of .data", AuxHeader->SecNumOfData); + PrintAuxMember(Number, "Section number of TOC", AuxHeader->SecNumOfTOC); + PrintAuxMember(Number, "Section number of loader data", + AuxHeader->SecNumOfLoader); + PrintAuxMember(Number, "Section number of .bss", AuxHeader->SecNumOfBSS); + PrintAuxMember(Hex, "Maxium alignment of .text", AuxHeader->MaxAlignOfText); + PrintAuxMember(Hex, "Maxium alignment of .data", AuxHeader->MaxAlignOfData); + PrintAuxMember(Hex, "Module type", AuxHeader->ModuleType); + PrintAuxMember(Hex, "CPU type of objects", AuxHeader->CpuFlag); + PrintAuxMember(Hex, "(Reserved)", AuxHeader->CpuType); + PrintAuxMember(Hex, "Text page size", AuxHeader->TextPageSize); + PrintAuxMember(Hex, "Data page size", AuxHeader->DataPageSize); + PrintAuxMember(Hex, "Stack page size", AuxHeader->StackPageSize); if (offsetof(XCOFFAuxiliaryHeader64, FlagAndTDataAlignment) + sizeof(XCOFFAuxiliaryHeader64::FlagAndTDataAlignment) <= AuxSize) { @@ -868,42 +879,18 @@ void XCOFFDumper::printAuxiliaryHeader( W.printHex("Alignment of thread-local storage", AuxHeader->getTDataAlignment()); } - PrintAuxMember64(Hex, "Size of .text section", TextSize); - PrintAuxMember64(Hex, "Size of .data section", InitDataSize); - PrintAuxMember64(Hex, "Size of .bss section", BssDataSize); - PrintAuxMember64(Hex, "Entry point address", EntryPointAddr); - PrintAuxMember64(Hex, "Maximum stack size", MaxStackSize); - PrintAuxMember64(Hex, "Maximum data size", MaxDataSize); - PrintAuxMember64(Number, "Section number for .tdata", SecNumOfTData); - PrintAuxMember64(Number, "Section number for .tbss", SecNumOfTBSS); - PrintAuxMember64(Hex, "Additional flags 64-bit XCOFF", XCOFF64Flag); - - if (PartialFieldOffset < AuxSize) { - std::string ErrInfo; - llvm::raw_string_ostream StringOS(ErrInfo); - StringOS << "Only partial field for " << PartialFieldName << " at offset (" - << PartialFieldOffset << ")."; - StringOS.flush(); - reportWarning( - make_error(ErrInfo, object_error::parse_failed), - "-"); - ; - W.printBinary( - "Raw data", "", - ArrayRef((const uint8_t *)(AuxHeader) + PartialFieldOffset, - AuxSize - PartialFieldOffset)); - } else if (sizeof(XCOFFAuxiliaryHeader64) < AuxSize) { - reportWarning(make_error( - "There are extra data beyond auxiliary header", - object_error::parse_failed), - "-"); - W.printBinary("Extra raw data", "", - ArrayRef((const uint8_t *)(AuxHeader) + - sizeof(XCOFFAuxiliaryHeader64), - AuxSize - sizeof(XCOFFAuxiliaryHeader64))); - } - -#undef PrintAuxMember64 + PrintAuxMember(Hex, "Size of .text section", AuxHeader->TextSize); + PrintAuxMember(Hex, "Size of .data section", AuxHeader->InitDataSize); + PrintAuxMember(Hex, "Size of .bss section", AuxHeader->BssDataSize); + PrintAuxMember(Hex, "Entry point address", AuxHeader->EntryPointAddr); + PrintAuxMember(Hex, "Maximum stack size", AuxHeader->MaxStackSize); + PrintAuxMember(Hex, "Maximum data size", AuxHeader->MaxDataSize); + PrintAuxMember(Number, "Section number for .tdata", AuxHeader->SecNumOfTData); + PrintAuxMember(Number, "Section number for .tbss", AuxHeader->SecNumOfTBSS); + PrintAuxMember(Hex, "Additional flags 64-bit XCOFF", AuxHeader->XCOFF64Flag); + + checkAndPrintAuxHeaderParseError(PartialFieldName, PartialFieldOffset, + AuxSize, *AuxHeader, this); } template diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index 543b0de82cdf..e1ebbeb41f28 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -21,6 +21,7 @@ #include "llvm-readobj.h" 
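Later in this llvm-readobj.cpp diff, parseOptions tokenizes the --sort-symbols value with llvm::split and maps each token through a StringSwitch, rejecting unknown keys up front rather than at sort time. The parsing step in isolation (the enum and function names here are illustrative stand-ins):

    #include "llvm/ADT/StringExtras.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"
    #include <vector>

    enum class SortKey { Name, Type, Unknown };

    // Turn "name,type" into a priority-ordered key list; Unknown marks a
    // token the caller should report as an error.
    std::vector<SortKey> parseSortKeys(llvm::StringRef Value) {
      std::vector<SortKey> Keys;
      for (llvm::StringRef Tok : llvm::split(Value, ","))
        Keys.push_back(llvm::StringSwitch<SortKey>(Tok)
                           .Case("name", SortKey::Name)
                           .Case("type", SortKey::Type)
                           .Default(SortKey::Unknown));
      return Keys;
    }
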
#include "ObjDumper.h" #include "WindowsResourceDumper.h" +#include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h" #include "llvm/MC/TargetRegistry.h" @@ -83,6 +84,14 @@ public: }; enum OutputFormatTy { bsd, sysv, posix, darwin, just_symbols }; + +enum SortSymbolKeyTy { + NAME = 0, + TYPE = 1, + UNKNOWN = 100, + // TODO: add ADDRESS, SIZE as needed. +}; + } // namespace namespace opts { @@ -113,6 +122,7 @@ static bool StringTable; static bool Symbols; static bool UnwindInfo; static cl::boolOrDefault SectionMapping; +static SmallVector SortKeys; // ELF specific options. static bool DynamicTable; @@ -253,6 +263,19 @@ static void parseOptions(const opt::InputArgList &Args) { opts::ProgramHeaders = Args.hasArg(OPT_program_headers); opts::RawRelr = Args.hasArg(OPT_raw_relr); opts::SectionGroups = Args.hasArg(OPT_section_groups); + if (Arg *A = Args.getLastArg(OPT_sort_symbols_EQ)) { + std::string SortKeysString = A->getValue(); + for (StringRef KeyStr : llvm::split(A->getValue(), ",")) { + SortSymbolKeyTy KeyType = StringSwitch(KeyStr) + .Case("name", SortSymbolKeyTy::NAME) + .Case("type", SortSymbolKeyTy::TYPE) + .Default(SortSymbolKeyTy::UNKNOWN); + if (KeyType == SortSymbolKeyTy::UNKNOWN) + error("--sort-symbols value should be 'name' or 'type', but was '" + + Twine(KeyStr) + "'"); + opts::SortKeys.push_back(KeyType); + } + } opts::VersionInfo = Args.hasArg(OPT_version_info); // Mach-O specific options. @@ -334,11 +357,39 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, toString(std::move(ContentErr)); ObjDumper *Dumper; + Optional SymComp; Expected> DumperOrErr = createDumper(Obj, Writer); if (!DumperOrErr) reportError(DumperOrErr.takeError(), FileStr); Dumper = (*DumperOrErr).get(); + if (!opts::SortKeys.empty()) { + if (Dumper->canCompareSymbols()) { + SymComp = SymbolComparator(); + for (SortSymbolKeyTy Key : opts::SortKeys) { + switch (Key) { + case NAME: + SymComp->addPredicate([Dumper](SymbolRef LHS, SymbolRef RHS) { + return Dumper->compareSymbolsByName(LHS, RHS); + }); + break; + case TYPE: + SymComp->addPredicate([Dumper](SymbolRef LHS, SymbolRef RHS) { + return Dumper->compareSymbolsByType(LHS, RHS); + }); + break; + case UNKNOWN: + llvm_unreachable("Unsupported sort key"); + } + } + + } else { + reportWarning(createStringError( + errc::invalid_argument, + "--sort-symbols is not supported yet for this format"), + FileStr); + } + } Dumper->printFileSummary(FileStr, Obj, opts::InputFilenames, A); if (opts::FileHeaders) @@ -374,7 +425,7 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, if (opts::UnwindInfo) Dumper->printUnwindInfo(); if (opts::Symbols || opts::DynamicSymbols) - Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols); + Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols, SymComp); if (!opts::StringDump.empty()) Dumper->printSectionsAsString(Obj, opts::StringDump); if (!opts::HexDump.empty()) diff --git a/llvm/tools/llvm-readobj/llvm-readobj.h b/llvm/tools/llvm-readobj/llvm-readobj.h index 0ea695d1673d..989cd0aba6c0 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.h +++ b/llvm/tools/llvm-readobj/llvm-readobj.h @@ -9,10 +9,13 @@ #ifndef LLVM_TOOLS_LLVM_READOBJ_LLVM_READOBJ_H #define LLVM_TOOLS_LLVM_READOBJ_LLVM_READOBJ_H +#include "ObjDumper.h" + +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/ErrorOr.h" #include "llvm/Support/Error.h" +#include 
"llvm/Support/ErrorOr.h" #include namespace llvm { diff --git a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp index 21339a3f8f3d..df82fb04e8e6 100644 --- a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -46,7 +46,7 @@ using namespace llvm::object; static cl::OptionCategory RTDyldCategory("RTDyld Options"); -static cl::list InputFileList(cl::Positional, cl::ZeroOrMore, +static cl::list InputFileList(cl::Positional, cl::desc(""), cl::cat(RTDyldCategory)); @@ -79,11 +79,11 @@ static cl::opt cl::init("_main"), cl::cat(RTDyldCategory)); static cl::list Dylibs("dylib", cl::desc("Add library."), - cl::ZeroOrMore, cl::cat(RTDyldCategory)); + cl::cat(RTDyldCategory)); static cl::list InputArgv("args", cl::Positional, cl::desc("..."), - cl::ZeroOrMore, cl::PositionalEatsArgs, + cl::PositionalEatsArgs, cl::cat(RTDyldCategory)); static cl::opt @@ -98,7 +98,7 @@ static cl::opt static cl::list CheckFiles("check", cl::desc("File containing RuntimeDyld verifier checks."), - cl::ZeroOrMore, cl::cat(RTDyldCategory)); + cl::cat(RTDyldCategory)); static cl::opt PreallocMemory("preallocate", @@ -127,14 +127,13 @@ static cl::list SpecificSectionMappings("map-section", cl::desc("For -verify only: Map a section to a " "specific address."), - cl::ZeroOrMore, cl::Hidden, - cl::cat(RTDyldCategory)); + cl::Hidden, cl::cat(RTDyldCategory)); static cl::list DummySymbolMappings( "dummy-extern", cl::desc("For -verify only: Inject a symbol into the extern " "symbol table."), - cl::ZeroOrMore, cl::Hidden, cl::cat(RTDyldCategory)); + cl::Hidden, cl::cat(RTDyldCategory)); static cl::opt PrintAllocationRequests( "print-alloc-requests", @@ -286,7 +285,7 @@ private: uintptr_t SlabSize = 0; uintptr_t CurrentSlabOffset = 0; SectionIDMap *SecIDMap = nullptr; -#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) unsigned UsedTLSStorage = 0; #endif }; @@ -350,7 +349,7 @@ uint8_t *TrivialMemoryManager::allocateDataSection(uintptr_t Size, // In case the execution needs TLS storage, we define a very small TLS memory // area here that will be used in allocateTLSSection(). 
-#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) extern "C" { alignas(16) __attribute__((visibility("hidden"), tls_model("initial-exec"), used)) thread_local char LLVMRTDyldTLSSpace[16]; @@ -361,7 +360,7 @@ TrivialMemoryManager::TLSSection TrivialMemoryManager::allocateTLSSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName) { -#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) if (Size + UsedTLSStorage > sizeof(LLVMRTDyldTLSSpace)) { return {}; } diff --git a/llvm/tools/llvm-sim/llvm-sim.cpp b/llvm/tools/llvm-sim/llvm-sim.cpp index 26e370ff30f1..6879d73c4434 100644 --- a/llvm/tools/llvm-sim/llvm-sim.cpp +++ b/llvm/tools/llvm-sim/llvm-sim.cpp @@ -85,10 +85,9 @@ exportToFile(const StringRef FilePath, Optional End = getPositionInModule((*C.back()).Inst, LLVMInstNum); - assert(Start.hasValue() && + assert(Start && "Could not find instruction number for first instruction"); - assert(End.hasValue() && - "Could not find instruction number for last instruction"); + assert(End && "Could not find instruction number for last instruction"); J.object([&] { J.attribute("start", Start.getValue()); diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 9135d60fdf92..e15d1d6048c7 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -69,41 +69,10 @@ static cl::opt OutputFilename("o", cl::value_desc("filename"), cl::cat(StressCategory)); -static LLVMContext Context; - -namespace cl { - -template <> class parser final : public basic_parser { -public: - parser(Option &O) : basic_parser(O) {} - - // Parse options as IR types. Return true on error. 
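Two mechanical LLVM 15 cleanups recur through the tool hunks around this point. First, cl::list now accepts zero or more occurrences by default, so the explicit cl::ZeroOrMore (dropped in the llvm-rtdyld hunk above and the llvm-strings hunk below) is redundant; a minimal sketch:

    #include "llvm/Support/CommandLine.h"
    #include <string>

    // Zero or more positional inputs; no cl::ZeroOrMore needed anymore.
    static llvm::cl::list<std::string>
        InputFiles(llvm::cl::Positional, llvm::cl::desc("<input files>"));

Second, llvm::Optional contextually converts to bool, which is why the llvm-sim asserts above drop hasValue():

    #include "llvm/ADT/Optional.h"
    #include <cassert>

    void use(llvm::Optional<unsigned> Start) {
      assert(Start && "expected a value"); // formerly Start.hasValue()
      (void)*Start;
    }
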
- bool parse(Option &O, StringRef, StringRef Arg, Type *&Value) { - if (Arg == "half") Value = Type::getHalfTy(Context); - else if (Arg == "fp128") Value = Type::getFP128Ty(Context); - else if (Arg == "x86_fp80") Value = Type::getX86_FP80Ty(Context); - else if (Arg == "ppc_fp128") Value = Type::getPPC_FP128Ty(Context); - else if (Arg == "x86_mmx") Value = Type::getX86_MMXTy(Context); - else if (Arg.startswith("i")) { - unsigned N = 0; - Arg.drop_front().getAsInteger(10, N); - if (N > 0) - Value = Type::getIntNTy(Context, N); - } - - if (!Value) - return O.error("Invalid IR scalar type: '" + Arg + "'!"); - return false; - } - - StringRef getValueName() const override { return "IR scalar type"; } -}; - -} // end namespace cl - -static cl::list AdditionalScalarTypes("types", cl::CommaSeparated, - cl::desc("Additional IR scalar types " - "(always includes i1, i8, i16, i32, i64, float and double)")); +static cl::list AdditionalScalarTypes( + "types", cl::CommaSeparated, + cl::desc("Additional IR scalar types " + "(always includes i1, i8, i16, i32, i64, float and double)")); namespace { @@ -185,7 +154,38 @@ struct Modifier { public: /// C'tor Modifier(BasicBlock *Block, PieceTable *PT, Random *R) - : BB(Block), PT(PT), Ran(R), Context(BB->getContext()) {} + : BB(Block), PT(PT), Ran(R), Context(BB->getContext()) { + ScalarTypes.assign({Type::getInt1Ty(Context), Type::getInt8Ty(Context), + Type::getInt16Ty(Context), Type::getInt32Ty(Context), + Type::getInt64Ty(Context), Type::getFloatTy(Context), + Type::getDoubleTy(Context)}); + + for (auto &Arg : AdditionalScalarTypes) { + Type *Ty = nullptr; + if (Arg == "half") + Ty = Type::getHalfTy(Context); + else if (Arg == "fp128") + Ty = Type::getFP128Ty(Context); + else if (Arg == "x86_fp80") + Ty = Type::getX86_FP80Ty(Context); + else if (Arg == "ppc_fp128") + Ty = Type::getPPC_FP128Ty(Context); + else if (Arg == "x86_mmx") + Ty = Type::getX86_MMXTy(Context); + else if (Arg.startswith("i")) { + unsigned N = 0; + Arg.drop_front().getAsInteger(10, N); + if (N > 0) + Ty = Type::getIntNTy(Context, N); + } + if (!Ty) { + errs() << "Invalid IR scalar type: '" << Arg << "'!\n"; + exit(1); + } + + ScalarTypes.push_back(Ty); + } + } /// virtual D'tor to silence warnings. virtual ~Modifier() = default; @@ -310,20 +310,6 @@ protected: /// Pick a random scalar type. Type *pickScalarType() { - static std::vector ScalarTypes; - if (ScalarTypes.empty()) { - ScalarTypes.assign({ - Type::getInt1Ty(Context), - Type::getInt8Ty(Context), - Type::getInt16Ty(Context), - Type::getInt32Ty(Context), - Type::getInt64Ty(Context), - Type::getFloatTy(Context), - Type::getDoubleTy(Context) - }); - llvm::append_range(ScalarTypes, AdditionalScalarTypes); - } - return ScalarTypes[getRandom() % ScalarTypes.size()]; } @@ -338,6 +324,8 @@ protected: /// Context LLVMContext &Context; + + std::vector ScalarTypes; }; struct LoadModifier: public Modifier { @@ -347,8 +335,10 @@ struct LoadModifier: public Modifier { void Act() override { // Try to use predefined pointers. If non-exist, use undef pointer value; Value *Ptr = getRandomPointerValue(); - Value *V = new LoadInst(Ptr->getType()->getPointerElementType(), Ptr, "L", - BB->getTerminator()); + Type *Ty = Ptr->getType()->isOpaquePointerTy() + ? pickType() + : Ptr->getType()->getNonOpaquePointerElementType(); + Value *V = new LoadInst(Ty, Ptr, "L", BB->getTerminator()); PT->push_back(V); } }; @@ -360,14 +350,16 @@ struct StoreModifier: public Modifier { void Act() override { // Try to use predefined pointers. 
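The LoadModifier change at the end of this hunk and the StoreModifier change just below are opaque-pointer accommodations: once a pointer type no longer carries an element type, llvm-stress must pick one itself instead of reading it off the pointer. The guard in isolation (pickType() in the real code is the fuzzer's random chooser; Fallback stands in for it here):

    #include "llvm/IR/Type.h"
    #include "llvm/IR/Value.h"

    using namespace llvm;

    // With a typed pointer the pointee type is recoverable; with an opaque
    // pointer the caller must supply one.
    Type *elementTypeFor(Value *Ptr, Type *Fallback) {
      Type *PtrTy = Ptr->getType();
      return PtrTy->isOpaquePointerTy()
                 ? Fallback
                 : PtrTy->getNonOpaquePointerElementType();
    }
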
If non-exist, use undef pointer value; Value *Ptr = getRandomPointerValue(); - Value *Val = getRandomValue(Ptr->getType()->getPointerElementType()); - Type *ValTy = Val->getType(); + Type *ValTy = Ptr->getType()->isOpaquePointerTy() + ? pickType() + : Ptr->getType()->getNonOpaquePointerElementType(); // Do not store vectors of i1s because they are unsupported // by the codegen. if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() == 1) return; + Value *Val = getRandomValue(ValTy); new StoreInst(Val, Ptr, BB->getTerminator()); } }; @@ -745,6 +737,7 @@ int main(int argc, char **argv) { cl::HideUnrelatedOptions({&StressCategory, &getColorCategory()}); cl::ParseCommandLineOptions(argc, argv, "llvm codegen stress-tester\n"); + LLVMContext Context; auto M = std::make_unique("/tmp/autogen.bc", Context); Function *F = GenEmptyFunction(M.get()); diff --git a/llvm/tools/llvm-strings/llvm-strings.cpp b/llvm/tools/llvm-strings/llvm-strings.cpp index 438eed33d283..71d1321ee0ba 100644 --- a/llvm/tools/llvm-strings/llvm-strings.cpp +++ b/llvm/tools/llvm-strings/llvm-strings.cpp @@ -64,8 +64,7 @@ public: static StringRef ToolName; static cl::list InputFileNames(cl::Positional, - cl::desc(""), - cl::ZeroOrMore); + cl::desc("")); static int MinLength = 4; static bool PrintFileName; diff --git a/llvm/tools/llvm-symbolizer/Opts.td b/llvm/tools/llvm-symbolizer/Opts.td index 6026e24d6ffa..6742e086d6ff 100644 --- a/llvm/tools/llvm-symbolizer/Opts.td +++ b/llvm/tools/llvm-symbolizer/Opts.td @@ -21,11 +21,17 @@ defm adjust_vma : Eq<"adjust-vma", "Add specified offset to object file addresses">, MetaVarName<"">; def basenames : Flag<["--"], "basenames">, HelpText<"Strip directory names from paths">; +defm build_id : Eq<"build-id", "Build ID used to look up the object file">; +defm cache_size : Eq<"cache-size", "Max size in bytes of the in-memory binary cache.">; +def color : F<"color", "Use color when symbolizing log markup.">; +def color_EQ : Joined<["--"], "color=">, HelpText<"Whether to use color when symbolizing log markup: always, auto, never">, Values<"always,auto,never">; defm debug_file_directory : Eq<"debug-file-directory", "Path to directory where to look for debug files">, MetaVarName<"">; +defm debuginfod : B<"debuginfod", "Use debuginfod to find debug binaries", "Don't use debuginfod to find debug binaries">; defm default_arch : Eq<"default-arch", "Default architecture (for multi-arch objects)">, Group; defm demangle : B<"demangle", "Demangle function names", "Don't demangle function names">; +def filter_markup : Flag<["--"], "filter-markup">, HelpText<"Filter symbolizer markup from stdin.">; def functions : F<"functions", "Print function name for a given address">; def functions_EQ : Joined<["--"], "functions=">, HelpText<"Print function name for a given address">, Values<"none,short,linkage">; def help : F<"help", "Display this help">; diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp index 66a2e703129b..b782c7a1720a 100644 --- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -15,10 +15,16 @@ //===----------------------------------------------------------------------===// #include "Opts.inc" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/DebugInfo/Symbolize/DIPrinter.h" +#include "llvm/DebugInfo/Symbolize/Markup.h" +#include "llvm/DebugInfo/Symbolize/MarkupFilter.h" +#include 
"llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/Debuginfod/DIFetcher.h" +#include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" @@ -102,9 +108,31 @@ enum class Command { Frame, }; +static void enableDebuginfod(LLVMSymbolizer &Symbolizer) { + static bool IsEnabled = false; + if (IsEnabled) + return; + IsEnabled = true; + // Look up symbols using the debuginfod client. + Symbolizer.addDIFetcher(std::make_unique()); + // The HTTPClient must be initialized for use by the debuginfod client. + HTTPClient::initialize(); +} + +static SmallVector parseBuildID(StringRef Str) { + std::string Bytes; + if (!tryGetFromHex(Str, Bytes)) + return {}; + ArrayRef BuildID(reinterpret_cast(Bytes.data()), + Bytes.size()); + return SmallVector(BuildID.begin(), BuildID.end()); +} + static bool parseCommand(StringRef BinaryName, bool IsAddr2Line, StringRef InputString, Command &Cmd, - std::string &ModuleName, uint64_t &ModuleOffset) { + std::string &ModuleName, + SmallVectorImpl &BuildID, + uint64_t &ModuleOffset) { const char kDelimiters[] = " \n\r"; ModuleName = ""; if (InputString.consume_front("CODE ")) { @@ -117,9 +145,31 @@ static bool parseCommand(StringRef BinaryName, bool IsAddr2Line, // If no cmd, assume it's CODE. Cmd = Command::Code; } - const char *Pos = InputString.data(); + + const char *Pos; // Skip delimiters and parse input filename (if needed). - if (BinaryName.empty()) { + if (BinaryName.empty() && BuildID.empty()) { + bool HasFilePrefix = false; + bool HasBuildIDPrefix = false; + while (true) { + if (InputString.consume_front("FILE:")) { + if (HasFilePrefix) + return false; + HasFilePrefix = true; + continue; + } + if (InputString.consume_front("BUILDID:")) { + if (HasBuildIDPrefix) + return false; + HasBuildIDPrefix = true; + continue; + } + break; + } + if (HasFilePrefix && HasBuildIDPrefix) + return false; + + Pos = InputString.data(); Pos += strspn(Pos, kDelimiters); if (*Pos == '"' || *Pos == '\'') { char Quote = *Pos; @@ -134,7 +184,14 @@ static bool parseCommand(StringRef BinaryName, bool IsAddr2Line, ModuleName = std::string(Pos, NameLength); Pos += NameLength; } + if (HasBuildIDPrefix) { + BuildID = parseBuildID(ModuleName); + if (BuildID.empty()) + return false; + ModuleName.clear(); + } } else { + Pos = InputString.data(); ModuleName = BinaryName.str(); } // Skip delimiters and parse module offset. @@ -148,31 +205,24 @@ static bool parseCommand(StringRef BinaryName, bool IsAddr2Line, return !Offset.getAsInteger(IsAddr2Line ? 
16 : 0, ModuleOffset); } -static void symbolizeInput(const opt::InputArgList &Args, uint64_t AdjustVMA, - bool IsAddr2Line, OutputStyle Style, - StringRef InputString, LLVMSymbolizer &Symbolizer, - DIPrinter &Printer) { - Command Cmd; - std::string ModuleName; - uint64_t Offset = 0; - if (!parseCommand(Args.getLastArgValue(OPT_obj_EQ), IsAddr2Line, - StringRef(InputString), Cmd, ModuleName, Offset)) { - Printer.printInvalidCommand({ModuleName, None}, InputString); - return; - } - +template +void executeCommand(StringRef ModuleName, const T &ModuleSpec, Command Cmd, + uint64_t Offset, uint64_t AdjustVMA, bool ShouldInline, + OutputStyle Style, LLVMSymbolizer &Symbolizer, + DIPrinter &Printer) { uint64_t AdjustedOffset = Offset - AdjustVMA; + object::SectionedAddress Address = {AdjustedOffset, + object::SectionedAddress::UndefSection}; if (Cmd == Command::Data) { - Expected ResOrErr = Symbolizer.symbolizeData( - ModuleName, {AdjustedOffset, object::SectionedAddress::UndefSection}); + Expected ResOrErr = Symbolizer.symbolizeData(ModuleSpec, Address); print({ModuleName, Offset}, ResOrErr, Printer); } else if (Cmd == Command::Frame) { - Expected> ResOrErr = Symbolizer.symbolizeFrame( - ModuleName, {AdjustedOffset, object::SectionedAddress::UndefSection}); + Expected> ResOrErr = + Symbolizer.symbolizeFrame(ModuleSpec, Address); print({ModuleName, Offset}, ResOrErr, Printer); - } else if (Args.hasFlag(OPT_inlines, OPT_no_inlines, !IsAddr2Line)) { - Expected ResOrErr = Symbolizer.symbolizeInlinedCode( - ModuleName, {AdjustedOffset, object::SectionedAddress::UndefSection}); + } else if (ShouldInline) { + Expected ResOrErr = + Symbolizer.symbolizeInlinedCode(ModuleSpec, Address); print({ModuleName, Offset}, ResOrErr, Printer); } else if (Style == OutputStyle::GNU) { // With PrintFunctions == FunctionNameKind::LinkageName (default) @@ -181,8 +231,8 @@ static void symbolizeInput(const opt::InputArgList &Args, uint64_t AdjustVMA, // caller function in the inlining chain. This contradicts the existing // behavior of addr2line. Symbolizer.symbolizeInlinedCode() overrides only // the topmost function, which suits our needs better. - Expected ResOrErr = Symbolizer.symbolizeInlinedCode( - ModuleName, {AdjustedOffset, object::SectionedAddress::UndefSection}); + Expected ResOrErr = + Symbolizer.symbolizeInlinedCode(ModuleSpec, Address); Expected Res0OrErr = !ResOrErr ? 
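parseBuildID in the hunk above delegates hex decoding to llvm::tryGetFromHex, which fails on odd-length or non-hex input, so an empty result doubles as the error signal for both the BUILDID: command prefix and the --build-id flag. The decoding step on its own:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringExtras.h"
    #include "llvm/ADT/StringRef.h"
    #include <cstdint>
    #include <string>

    // Decode "a1b2c3" into raw bytes; an empty vector signals malformed hex.
    llvm::SmallVector<uint8_t> decodeBuildID(llvm::StringRef Str) {
      std::string Bytes;
      if (!llvm::tryGetFromHex(Str, Bytes))
        return {};
      return llvm::SmallVector<uint8_t>(Bytes.begin(), Bytes.end());
    }
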
Expected(ResOrErr.takeError()) @@ -190,10 +240,39 @@ static void symbolizeInput(const opt::InputArgList &Args, uint64_t AdjustVMA, : ResOrErr->getFrame(0)); print({ModuleName, Offset}, Res0OrErr, Printer); } else { - Expected ResOrErr = Symbolizer.symbolizeCode( - ModuleName, {AdjustedOffset, object::SectionedAddress::UndefSection}); + Expected ResOrErr = + Symbolizer.symbolizeCode(ModuleSpec, Address); print({ModuleName, Offset}, ResOrErr, Printer); } + Symbolizer.pruneCache(); +} + +static void symbolizeInput(const opt::InputArgList &Args, + ArrayRef IncomingBuildID, + uint64_t AdjustVMA, bool IsAddr2Line, + OutputStyle Style, StringRef InputString, + LLVMSymbolizer &Symbolizer, DIPrinter &Printer) { + Command Cmd; + std::string ModuleName; + SmallVector BuildID(IncomingBuildID.begin(), IncomingBuildID.end()); + uint64_t Offset = 0; + if (!parseCommand(Args.getLastArgValue(OPT_obj_EQ), IsAddr2Line, + StringRef(InputString), Cmd, ModuleName, BuildID, Offset)) { + Printer.printInvalidCommand({ModuleName, None}, InputString); + return; + } + bool ShouldInline = Args.hasFlag(OPT_inlines, OPT_no_inlines, !IsAddr2Line); + if (!BuildID.empty()) { + assert(ModuleName.empty()); + if (!Args.hasArg(OPT_no_debuginfod)) + enableDebuginfod(Symbolizer); + std::string BuildIDStr = toHex(BuildID); + executeCommand(BuildIDStr, BuildID, Cmd, Offset, AdjustVMA, ShouldInline, + Style, Symbolizer, Printer); + } else { + executeCommand(ModuleName, ModuleName, Cmd, Offset, AdjustVMA, ShouldInline, + Style, Symbolizer, Printer); + } } static void printHelp(StringRef ToolName, const SymbolizerOptTable &Tbl, @@ -260,10 +339,52 @@ static FunctionNameKind decideHowToPrintFunctions(const opt::InputArgList &Args, return IsAddr2Line ? FunctionNameKind::None : FunctionNameKind::LinkageName; } +static Optional parseColorArg(const opt::InputArgList &Args) { + if (Args.hasArg(OPT_color)) + return true; + if (const opt::Arg *A = Args.getLastArg(OPT_color_EQ)) + return StringSwitch>(A->getValue()) + .Case("always", true) + .Case("never", false) + .Case("auto", None); + return None; +} + +static SmallVector parseBuildIDArg(const opt::InputArgList &Args, + int ID) { + const opt::Arg *A = Args.getLastArg(ID); + if (!A) + return {}; + + StringRef V(A->getValue()); + SmallVector BuildID = parseBuildID(V); + if (BuildID.empty()) { + errs() << A->getSpelling() + ": expected a build ID, but got '" + V + "'\n"; + exit(1); + } + return BuildID; +} + +// Symbolize the markup from stdin and write the result to stdout. +static void filterMarkup(const opt::InputArgList &Args) { + MarkupParser Parser; + MarkupFilter Filter(outs(), parseColorArg(Args)); + for (std::string InputString; std::getline(std::cin, InputString);) { + InputString += '\n'; + Parser.parseLine(InputString); + Filter.beginLine(InputString); + while (Optional Element = Parser.nextNode()) + Filter.filter(*Element); + } + Parser.flush(); + while (Optional Element = Parser.nextNode()) + Filter.filter(*Element); +} + +ExitOnError ExitOnErr; + int main(int argc, char **argv) { InitLLVM X(argc, argv); - // The HTTPClient must be initialized for use by the debuginfod client. 
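parseColorArg above folds the color options into an Optional<bool> tri-state: true forces markup coloring on, false forces it off, and None defers the decision (the "auto" case and the no-flag case) to the filter. The mapping step on its own:

    #include "llvm/ADT/Optional.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    // "always" -> on, "never" -> off, "auto" or anything else -> undecided.
    llvm::Optional<bool> colorMode(llvm::StringRef V) {
      return llvm::StringSwitch<llvm::Optional<bool>>(V)
          .Case("always", true)
          .Case("never", false)
          .Default(llvm::None);
    }
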
- HTTPClient::initialize(); sys::InitializeCOMRAII COM(sys::COMThreadingMode::MultiThreaded); bool IsAddr2Line = sys::path::stem(argv[0]).contains("addr2line"); @@ -304,6 +425,8 @@ int main(int argc, char **argv) { } #endif Opts.UseSymbolTable = true; + if (Args.hasArg(OPT_cache_size_EQ)) + parseIntArg(Args, OPT_cache_size_EQ, Opts.MaxCacheSize); Config.PrintAddress = Args.hasArg(OPT_addresses); Config.PrintFunctions = Opts.PrintFunctions != FunctionNameKind::None; Config.Pretty = Args.hasArg(OPT_pretty_print); @@ -319,6 +442,11 @@ int main(int argc, char **argv) { } } + if (Args.hasArg(OPT_filter_markup)) { + filterMarkup(Args); + return 0; + } + auto Style = IsAddr2Line ? OutputStyle::GNU : OutputStyle::LLVM; if (const opt::Arg *A = Args.getLastArg(OPT_output_style_EQ)) { if (strcmp(A->getValue(), "GNU") == 0) @@ -329,7 +457,23 @@ int main(int argc, char **argv) { Style = OutputStyle::LLVM; } + if (Args.hasArg(OPT_build_id_EQ) && Args.hasArg(OPT_obj_EQ)) { + errs() << "error: cannot specify both --build-id and --obj\n"; + return EXIT_FAILURE; + } + SmallVector BuildID = parseBuildIDArg(Args, OPT_build_id_EQ); + LLVMSymbolizer Symbolizer(Opts); + + // A debuginfod lookup could succeed if a HTTP client is available and at + // least one backing URL is configured. + bool ShouldUseDebuginfodByDefault = + HTTPClient::isAvailable() && + !ExitOnErr(getDefaultDebuginfodUrls()).empty(); + if (Args.hasFlag(OPT_debuginfod, OPT_no_debuginfod, + ShouldUseDebuginfodByDefault)) + enableDebuginfod(Symbolizer); + std::unique_ptr Printer; if (Style == OutputStyle::GNU) Printer = std::make_unique(outs(), errs(), Config); @@ -348,15 +492,15 @@ int main(int argc, char **argv) { std::string StrippedInputString(InputString); llvm::erase_if(StrippedInputString, [](char c) { return c == '\r' || c == '\n'; }); - symbolizeInput(Args, AdjustVMA, IsAddr2Line, Style, StrippedInputString, - Symbolizer, *Printer); + symbolizeInput(Args, BuildID, AdjustVMA, IsAddr2Line, Style, + StrippedInputString, Symbolizer, *Printer); outs().flush(); } } else { Printer->listBegin(); for (StringRef Address : InputAddresses) - symbolizeInput(Args, AdjustVMA, IsAddr2Line, Style, Address, Symbolizer, - *Printer); + symbolizeInput(Args, BuildID, AdjustVMA, IsAddr2Line, Style, Address, + Symbolizer, *Printer); Printer->listEnd(); } diff --git a/llvm/tools/llvm-tapi-diff/llvm-tapi-diff.cpp b/llvm/tools/llvm-tapi-diff/llvm-tapi-diff.cpp index 772f124c5a59..09dd6f76bf6e 100644 --- a/llvm/tools/llvm-tapi-diff/llvm-tapi-diff.cpp +++ b/llvm/tools/llvm-tapi-diff/llvm-tapi-diff.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp index 4a69f96a597a..7deeaef40caf 100644 --- a/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp +++ b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp @@ -338,6 +338,7 @@ int main(int argc, char *argv[]) { assert(TLIandSDKboth + TLIandSDKneither + TLIdoesSDKdoesnt + TLIdoesntSDKdoes == LibFunc::NumLibFuncs); + (void) TLIandSDKneither; outs() << "<< Total TLI yes SDK no: " << TLIdoesSDKdoesnt << "\n>> Total TLI no SDK yes: " << TLIdoesntSDKdoes << "\n== Total TLI yes SDK yes: " << TLIandSDKboth; diff --git a/llvm/tools/llvm-xray/func-id-helper.cpp b/llvm/tools/llvm-xray/func-id-helper.cpp index 
afc912a6398e..ce4eafd071ec 100644 --- a/llvm/tools/llvm-xray/func-id-helper.cpp +++ b/llvm/tools/llvm-xray/func-id-helper.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "func-id-helper.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include diff --git a/llvm/tools/llvm-xray/func-id-helper.h b/llvm/tools/llvm-xray/func-id-helper.h index c6ce198170d5..d99fb7c1cfb0 100644 --- a/llvm/tools/llvm-xray/func-id-helper.h +++ b/llvm/tools/llvm-xray/func-id-helper.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" #include diff --git a/llvm/tools/llvm-xray/xray-graph-diff.cpp b/llvm/tools/llvm-xray/xray-graph-diff.cpp index f22ea06e0537..bcadade86bb5 100644 --- a/llvm/tools/llvm-xray/xray-graph-diff.cpp +++ b/llvm/tools/llvm-xray/xray-graph-diff.cpp @@ -22,6 +22,7 @@ #include "xray-color-helper.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/XRay/Trace.h" using namespace llvm; diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index af3308939442..17c5da408560 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "NewPMDriver.h" -#include "PassPrinters.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -66,10 +65,6 @@ static cl::opt DebugPM( DebugLogging::Verbose, "verbose", "Print extra information about adaptors and pass managers"))); -static cl::list - PassPlugins("load-pass-plugin", - cl::desc("Load passes from plugin library")); - // This flag specifies a textual description of the alias analysis pipeline to // use when querying for aliasing information. It only works in concert with // the "passes" flag above. @@ -122,11 +117,28 @@ static cl::opt PipelineEarlySimplificationEPPipeline( cl::desc("A textual description of the module pass pipeline inserted at " "the EarlySimplification extension point into default pipelines"), cl::Hidden); +static cl::opt OptimizerEarlyEPPipeline( + "passes-ep-optimizer-early", + cl::desc("A textual description of the module pass pipeline inserted at " + "the OptimizerEarly extension point into default pipelines"), + cl::Hidden); static cl::opt OptimizerLastEPPipeline( "passes-ep-optimizer-last", cl::desc("A textual description of the module pass pipeline inserted at " "the OptimizerLast extension point into default pipelines"), cl::Hidden); +static cl::opt FullLinkTimeOptimizationEarlyEPPipeline( + "passes-ep-full-link-time-optimization-early", + cl::desc("A textual description of the module pass pipeline inserted at " + "the FullLinkTimeOptimizationEarly extension point into default " + "pipelines"), + cl::Hidden); +static cl::opt FullLinkTimeOptimizationLastEPPipeline( + "passes-ep-full-link-time-optimization-last", + cl::desc("A textual description of the module pass pipeline inserted at " + "the FullLinkTimeOptimizationLast extension point into default " + "pipelines"), + cl::Hidden); // Individual pipeline tuning options. 
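Each new -passes-ep-* option above is wired up in the registerEPCallbacks hunk below following one pattern: if the option carries pipeline text, register a callback at the matching PassBuilder extension point that parses that text into the supplied ModulePassManager. A reduced sketch for one extension point (the PipelineText parameter stands in for the cl::opt, and the non-empty check for tryParsePipelineText):

    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Support/Error.h"
    #include <string>

    using namespace llvm;

    // PB must outlive the registered callback, as it does in opt's driver.
    void registerOptimizerEarly(PassBuilder &PB, std::string PipelineText) {
      if (PipelineText.empty())
        return;
      PB.registerOptimizerEarlyEPCallback(
          [&PB, PipelineText](ModulePassManager &MPM, OptimizationLevel) {
            ExitOnError Err("Unable to parse OptimizerEarlyEP pipeline: ");
            Err(PB.parsePassPipeline(MPM, PipelineText));
          });
    }
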
extern cl::opt DisableLoopUnrolling; @@ -223,12 +235,35 @@ static void registerEPCallbacks(PassBuilder &PB) { ExitOnError Err("Unable to parse EarlySimplification pipeline: "); Err(PB.parsePassPipeline(PM, PipelineEarlySimplificationEPPipeline)); }); - if (tryParsePipelineText(PB, OptimizerLastEPPipeline)) + if (tryParsePipelineText(PB, OptimizerEarlyEPPipeline)) + PB.registerOptimizerEarlyEPCallback( + [&PB](ModulePassManager &PM, OptimizationLevel) { + ExitOnError Err("Unable to parse OptimizerEarlyEP pipeline: "); + Err(PB.parsePassPipeline(PM, OptimizerEarlyEPPipeline)); + }); + if (tryParsePipelineText(PB, OptimizerLastEPPipeline)) PB.registerOptimizerLastEPCallback( [&PB](ModulePassManager &PM, OptimizationLevel) { ExitOnError Err("Unable to parse OptimizerLastEP pipeline: "); Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline)); }); + if (tryParsePipelineText( + PB, FullLinkTimeOptimizationEarlyEPPipeline)) + PB.registerFullLinkTimeOptimizationEarlyEPCallback( + [&PB](ModulePassManager &PM, OptimizationLevel) { + ExitOnError Err( + "Unable to parse FullLinkTimeOptimizationEarlyEP pipeline: "); + Err(PB.parsePassPipeline(PM, + FullLinkTimeOptimizationEarlyEPPipeline)); + }); + if (tryParsePipelineText( + PB, FullLinkTimeOptimizationLastEPPipeline)) + PB.registerFullLinkTimeOptimizationLastEPCallback( + [&PB](ModulePassManager &PM, OptimizationLevel) { + ExitOnError Err( + "Unable to parse FullLinkTimeOptimizationLastEP pipeline: "); + Err(PB.parsePassPipeline(PM, FullLinkTimeOptimizationLastEPPipeline)); + }); } #define HANDLE_EXTENSION(Ext) \ @@ -240,6 +275,7 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, ToolOutputFile *ThinLTOLinkOut, ToolOutputFile *OptRemarkFile, StringRef PassPipeline, ArrayRef Passes, + ArrayRef PassPlugins, OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, bool ShouldPreserveBitcodeUseListOrder, @@ -312,33 +348,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, PassBuilder PB(TM, PTO, P, &PIC); registerEPCallbacks(PB); - // Load requested pass plugins and let them register pass builder callbacks - for (auto &PluginFN : PassPlugins) { - auto PassPlugin = PassPlugin::Load(PluginFN); - if (!PassPlugin) { - errs() << "Failed to load passes from '" << PluginFN - << "'. Request ignored.\n"; - continue; - } - - PassPlugin->registerPassBuilderCallbacks(PB); - } + // For any loaded plugins, let them register pass builder callbacks. 
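The loop just below iterates plugins the caller has already loaded: the PassPlugin::Load step removed in this hunk now happens in opt.cpp before runPassPipeline is called. Reduced to essentials, loading and registering one plugin looks roughly like this (error handling simplified relative to the removed code):

    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Passes/PassPlugin.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    using namespace llvm;

    void loadAndRegister(const std::string &Path, PassBuilder &PB) {
      Expected<PassPlugin> Plugin = PassPlugin::Load(Path);
      if (!Plugin) {
        errs() << "Failed to load passes from '" << Path << "'\n";
        consumeError(Plugin.takeError());
        return;
      }
      Plugin->registerPassBuilderCallbacks(PB);
    }
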
+ for (auto &PassPlugin : PassPlugins) + PassPlugin.registerPassBuilderCallbacks(PB); PB.registerPipelineParsingCallback( [](StringRef Name, ModulePassManager &MPM, ArrayRef) { AddressSanitizerOptions Opts; if (Name == "asan-pipeline") { - MPM.addPass( - RequireAnalysisPass()); MPM.addPass(ModuleAddressSanitizerPass(Opts)); return true; - } else if (Name == "asan-function-pipeline") { - MPM.addPass( - RequireAnalysisPass()); - MPM.addPass( - createModuleToFunctionPassAdaptor(AddressSanitizerPass(Opts))); - return true; } return false; }); diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h index 056f7d6a9b80..16bb205afdca 100644 --- a/llvm/tools/opt/NewPMDriver.h +++ b/llvm/tools/opt/NewPMDriver.h @@ -20,12 +20,12 @@ #ifndef LLVM_TOOLS_OPT_NEWPMDRIVER_H #define LLVM_TOOLS_OPT_NEWPMDRIVER_H -#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/CommandLine.h" namespace llvm { class StringRef; class Module; +class PassPlugin; class TargetMachine; class ToolOutputFile; class TargetLibraryInfoImpl; @@ -69,7 +69,8 @@ bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII, ToolOutputFile *Out, ToolOutputFile *ThinLinkOut, ToolOutputFile *OptRemarkFile, StringRef PassPipeline, ArrayRef PassInfos, - opt_tool::OutputKind OK, opt_tool::VerifierKind VK, + ArrayRef PassPlugins, opt_tool::OutputKind OK, + opt_tool::VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, bool EmitModuleHash, diff --git a/llvm/tools/opt/PassPrinters.cpp b/llvm/tools/opt/PassPrinters.cpp deleted file mode 100644 index 4e81b5d29c4d..000000000000 --- a/llvm/tools/opt/PassPrinters.cpp +++ /dev/null @@ -1,212 +0,0 @@ -//===- PassPrinters.cpp - Utilities to print analysis info for passes -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Utilities to print analysis info for various kinds of passes. -/// -//===----------------------------------------------------------------------===// - -#include "PassPrinters.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/RegionInfo.h" -#include "llvm/Analysis/RegionPass.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -namespace { - -struct FunctionPassPrinter : public FunctionPass { - const PassInfo *PassToPrint; - raw_ostream &Out; - static char ID; - std::string PassName; - - FunctionPassPrinter(const PassInfo *PI, raw_ostream &out) - : FunctionPass(ID), PassToPrint(PI), Out(out) { - std::string PassToPrintName = std::string(PassToPrint->getPassName()); - PassName = "FunctionPass Printer: " + PassToPrintName; - } - - bool runOnFunction(Function &F) override { - Out << "Printing analysis '" << PassToPrint->getPassName() - << "' for function '" << F.getName() << "':\n"; - - // Get and print pass... 
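The surviving asan-pipeline hook above shows the registerPipelineParsingCallback idiom: the callback claims a pipeline name by returning true after populating the pass manager, and returns false to let other parsers try. A sketch of registering a custom name ("my-pipeline" and the VerifierPass stand-in are illustrative, not from the source):

    #include "llvm/IR/Verifier.h"
    #include "llvm/Passes/PassBuilder.h"

    using namespace llvm;

    void registerAlias(PassBuilder &PB) {
      PB.registerPipelineParsingCallback(
          [](StringRef Name, ModulePassManager &MPM,
             ArrayRef<PassBuilder::PipelineElement>) {
            if (Name != "my-pipeline")
              return false;          // not ours; keep parsing elsewhere
            MPM.addPass(VerifierPass()); // stand-in for a real pass sequence
            return true;             // name consumed
          });
    }
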
- getAnalysisID(PassToPrint->getTypeInfo()).print(Out, F.getParent()); - return false; - } - - StringRef getPassName() const override { return PassName; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(PassToPrint->getTypeInfo()); - AU.setPreservesAll(); - } -}; - -char FunctionPassPrinter::ID = 0; - -struct CallGraphSCCPassPrinter : public CallGraphSCCPass { - static char ID; - const PassInfo *PassToPrint; - raw_ostream &Out; - std::string PassName; - - CallGraphSCCPassPrinter(const PassInfo *PI, raw_ostream &out) - : CallGraphSCCPass(ID), PassToPrint(PI), Out(out) { - std::string PassToPrintName = std::string(PassToPrint->getPassName()); - PassName = "CallGraphSCCPass Printer: " + PassToPrintName; - } - - bool runOnSCC(CallGraphSCC &SCC) override { - Out << "Printing analysis '" << PassToPrint->getPassName() << "':\n"; - - // Get and print pass... - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - if (F) - getAnalysisID(PassToPrint->getTypeInfo()) - .print(Out, F->getParent()); - } - return false; - } - - StringRef getPassName() const override { return PassName; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(PassToPrint->getTypeInfo()); - AU.setPreservesAll(); - } -}; - -char CallGraphSCCPassPrinter::ID = 0; - -struct ModulePassPrinter : public ModulePass { - static char ID; - const PassInfo *PassToPrint; - raw_ostream &Out; - std::string PassName; - - ModulePassPrinter(const PassInfo *PI, raw_ostream &out) - : ModulePass(ID), PassToPrint(PI), Out(out) { - std::string PassToPrintName = std::string(PassToPrint->getPassName()); - PassName = "ModulePass Printer: " + PassToPrintName; - } - - bool runOnModule(Module &M) override { - Out << "Printing analysis '" << PassToPrint->getPassName() << "':\n"; - - // Get and print pass... - getAnalysisID(PassToPrint->getTypeInfo()).print(Out, &M); - return false; - } - - StringRef getPassName() const override { return PassName; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(PassToPrint->getTypeInfo()); - AU.setPreservesAll(); - } -}; - -char ModulePassPrinter::ID = 0; - -struct LoopPassPrinter : public LoopPass { - static char ID; - const PassInfo *PassToPrint; - raw_ostream &Out; - std::string PassName; - - LoopPassPrinter(const PassInfo *PI, raw_ostream &out) - : LoopPass(ID), PassToPrint(PI), Out(out) { - std::string PassToPrintName = std::string(PassToPrint->getPassName()); - PassName = "LoopPass Printer: " + PassToPrintName; - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - Out << "Printing analysis '" << PassToPrint->getPassName() << "':\n"; - - // Get and print pass... 
- getAnalysisID(PassToPrint->getTypeInfo()) - .print(Out, L->getHeader()->getParent()->getParent()); - return false; - } - - StringRef getPassName() const override { return PassName; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(PassToPrint->getTypeInfo()); - AU.setPreservesAll(); - } -}; - -char LoopPassPrinter::ID = 0; - -struct RegionPassPrinter : public RegionPass { - static char ID; - const PassInfo *PassToPrint; - raw_ostream &Out; - std::string PassName; - - RegionPassPrinter(const PassInfo *PI, raw_ostream &out) - : RegionPass(ID), PassToPrint(PI), Out(out) { - std::string PassToPrintName = std::string(PassToPrint->getPassName()); - PassName = "RegionPass Printer: " + PassToPrintName; - } - - bool runOnRegion(Region *R, RGPassManager &RGM) override { - Out << "Printing analysis '" << PassToPrint->getPassName() << "' for " - << "region: '" << R->getNameStr() << "' in function '" - << R->getEntry()->getParent()->getName() << "':\n"; - // Get and print pass... - getAnalysisID(PassToPrint->getTypeInfo()) - .print(Out, R->getEntry()->getParent()->getParent()); - return false; - } - - StringRef getPassName() const override { return PassName; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(PassToPrint->getTypeInfo()); - AU.setPreservesAll(); - } -}; - -char RegionPassPrinter::ID = 0; - -} // end anonymous namespace - -FunctionPass *llvm::createFunctionPassPrinter(const PassInfo *PI, - raw_ostream &OS) { - return new FunctionPassPrinter(PI, OS); -} - -CallGraphSCCPass *llvm::createCallGraphPassPrinter(const PassInfo *PI, - raw_ostream &OS) { - return new CallGraphSCCPassPrinter(PI, OS); -} - -ModulePass *llvm::createModulePassPrinter(const PassInfo *PI, raw_ostream &OS) { - return new ModulePassPrinter(PI, OS); -} - -LoopPass *llvm::createLoopPassPrinter(const PassInfo *PI, raw_ostream &OS) { - return new LoopPassPrinter(PI, OS); -} - -RegionPass *llvm::createRegionPassPrinter(const PassInfo *PI, raw_ostream &OS) { - return new RegionPassPrinter(PI, OS); -} diff --git a/llvm/tools/opt/PassPrinters.h b/llvm/tools/opt/PassPrinters.h deleted file mode 100644 index a4e1921399fc..000000000000 --- a/llvm/tools/opt/PassPrinters.h +++ /dev/null @@ -1,40 +0,0 @@ -//=- PassPrinters.h - Utilities to print analysis info for passes -*- C++ -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Utilities to print analysis info for various kinds of passes. 
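// [Illustrative note, not part of this patch] With these legacy printer
// shims deleted, the equivalent functionality comes from the new pass
// manager's textual pipeline, e.g. (hypothetical invocation):
//
//   opt -passes='print<scalar-evolution>' -disable-output input.ll
//
// mirroring the guidance in the -analyze error message removed from
// opt.cpp below.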
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_OPT_PASSPRINTERS_H -#define LLVM_TOOLS_OPT_PASSPRINTERS_H - -namespace llvm { - -class CallGraphSCCPass; -class FunctionPass; -class ModulePass; -class LoopPass; -class PassInfo; -class raw_ostream; -class RegionPass; - -FunctionPass *createFunctionPassPrinter(const PassInfo *PI, raw_ostream &out); - -CallGraphSCCPass *createCallGraphPassPrinter(const PassInfo *PI, - raw_ostream &out); - -ModulePass *createModulePassPrinter(const PassInfo *PI, raw_ostream &out); - -LoopPass *createLoopPassPrinter(const PassInfo *PI, raw_ostream &out); - -RegionPass *createRegionPassPrinter(const PassInfo *PI, raw_ostream &out); - -} // end namespace llvm - -#endif // LLVM_TOOLS_OPT_PASSPRINTERS_H diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 7793a5471793..0e013ef3b9fd 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -13,7 +13,6 @@ #include "BreakpointPrinter.h" #include "NewPMDriver.h" -#include "PassPrinters.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" @@ -32,6 +31,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/LegacyPassNameParser.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/InitializePasses.h" @@ -39,6 +39,7 @@ #include "llvm/LinkAllPasses.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassPlugin.h" #include "llvm/Remarks/HotnessThresholdParser.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" @@ -51,7 +52,6 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Coroutines.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -74,7 +74,7 @@ static cl::opt<bool> EnableNewPassManager( cl::desc("Enable the new pass manager, translating " "'opt -foo' to 'opt -passes=foo'. This is strictly for the new PM " "migration, use '-passes=' when possible."), - cl::init(LLVM_ENABLE_NEW_PASS_MANAGER)); + cl::init(true)); // This flag specifies a textual description of the optimization pass pipeline // to run over the module. This flag switches opt to use the new pass manager @@ -192,14 +192,9 @@ static cl::opt<bool> DisableSimplifyLibCalls("disable-simplify-libcalls", cl::desc("Disable simplify-libcalls")); -static cl::list<std::string> -DisableBuiltins("disable-builtin", - cl::desc("Disable specific target library builtin function"), - cl::ZeroOrMore); - -static cl::opt<bool> - AnalyzeOnly("analyze", cl::desc("Only perform analysis, no optimization. 
" - "Legacy pass manager only.")); +static cl::list DisableBuiltins( + "disable-builtin", + cl::desc("Disable specific target library builtin function")); static cl::opt EnableDebugify( "enable-debugify", @@ -252,11 +247,6 @@ static cl::opt DiscardValueNames( cl::desc("Discard names from Value (other than GlobalValue)."), cl::init(false), cl::Hidden); -static cl::opt Coroutines( - "enable-coroutines", - cl::desc("Enable coroutine passes."), - cl::init(false), cl::Hidden); - static cl::opt TimeTrace( "time-trace", cl::desc("Record time trace")); @@ -300,6 +290,10 @@ static cl::opt RemarksFormat( cl::desc("The format used for serializing remarks (default: YAML)"), cl::value_desc("format"), cl::init("yaml")); +static cl::list + PassPlugins("load-pass-plugin", + cl::desc("Load passes from plugin library")); + namespace llvm { cl::opt PGOKindFlag("pgo-kind", cl::init(NoPGO), cl::Hidden, @@ -370,9 +364,6 @@ static void AddOptimizationPasses(legacy::PassManagerBase &MPM, if (TM) TM->adjustPassManager(Builder); - if (Coroutines) - addCoroutinePassesToExtensionPoints(Builder); - switch (PGOKindFlag) { case InstrGen: Builder.EnablePGOInstrGen = true; @@ -484,7 +475,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) { "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", - "amdgcn-", "polly-", "riscv-"}; + "amdgcn-", "polly-", "riscv-", "dxil-"}; std::vector PassNameContain = {"ehprepare"}; std::vector PassNameExact = { "safe-stack", "cost-model", @@ -498,7 +489,11 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) { "generic-to-nvvm", "expandmemcmp", "loop-reduce", "lower-amx-type", "pre-amx-config", "lower-amx-intrinsics", - "polyhedral-info", "replace-with-veclib"}; + "polyhedral-info", "print-polyhedral-info", + "replace-with-veclib", "jmc-instrument", + "dot-regions", "dot-regions-only", + "view-regions", "view-regions-only", + "select-optimize"}; for (const auto &P : PassNamePrefix) if (Pass.startswith(P)) return true; @@ -535,7 +530,6 @@ int main(int argc, char **argv) { // Initialize passes PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); - initializeCoroutines(Registry); initializeScalarOpts(Registry); initializeObjCARCOpts(Registry); initializeVectorization(Registry); @@ -550,6 +544,7 @@ int main(int argc, char **argv) { // supported. initializeExpandMemCmpPassPass(Registry); initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); + initializeSelectOptimizePass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); initializeRewriteSymbolsLegacyPassPass(Registry); @@ -572,18 +567,38 @@ int main(int argc, char **argv) { initializeHardwareLoopsPass(Registry); initializeTypePromotionPass(Registry); initializeReplaceWithVeclibLegacyPass(Registry); + initializeJMCInstrumenterPass(Registry); #ifdef BUILD_EXAMPLES initializeExampleIRTransforms(Registry); #endif + SmallVector PluginList; + PassPlugins.setCallback([&](const std::string &PluginPath) { + auto Plugin = PassPlugin::Load(PluginPath); + if (!Plugin) { + errs() << "Failed to load passes from '" << PluginPath + << "'. 
Request ignored.\n"; + return; + } + PluginList.emplace_back(Plugin.get()); + }); + cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); LLVMContext Context; - if (AnalyzeOnly && NoOutput) { - errs() << argv[0] << ": analyze mode conflicts with no-output mode.\n"; + // If `-passes=` is specified, use NPM. + // If `-enable-new-pm` is specified and there are no codegen passes, use NPM. + // e.g. `-enable-new-pm -sroa` will use NPM. + // but `-enable-new-pm -codegenprepare` will still revert to legacy PM. + const bool UseNPM = (EnableNewPassManager && !shouldForceLegacyPM()) || + PassPipeline.getNumOccurrences() > 0; + + if (!UseNPM && PluginList.size()) { + errs() << argv[0] << ": " << PassPlugins.ArgStr + << " specified with legacy PM.\n"; return 1; } @@ -722,7 +737,7 @@ int main(int argc, char **argv) { // If the output is set to be emitted to standard out, and standard out is a // console, print out a warning message and refuse to do it. We don't // impress anyone by spewing tons of binary goo to a terminal. - if (!Force && !NoOutput && !AnalyzeOnly && !OutputAssembly) + if (!Force && !NoOutput && !OutputAssembly) if (CheckBitcodeOutputToConsole(Out->os())) NoOutput = true; @@ -748,19 +763,7 @@ int main(int argc, char **argv) { } } - // If `-passes=` is specified, use NPM. - // If `-enable-new-pm` is specified and there are no codegen passes, use NPM. - // e.g. `-enable-new-pm -sroa` will use NPM. - // but `-enable-new-pm -codegenprepare` will still revert to legacy PM. - if ((EnableNewPassManager && !shouldForceLegacyPM()) || - PassPipeline.getNumOccurrences() > 0) { - if (AnalyzeOnly) { - errs() << "Cannot specify -analyze under new pass manager, either " - "specify '-enable-new-pm=0', or use the corresponding new pass " - "manager pass, e.g. '-passes=print'. For a " - "full list of passes, see the '--print-passes' flag.\n"; - return 1; - } + if (UseNPM) { if (legacy::debugPassSpecified()) { errs() << "-debug-pass does not work with the new PM, either use " @@ -778,8 +781,9 @@ int main(int argc, char **argv) { errs() << "Cannot specify multiple -O#\n"; return 1; } - if (NumOLevel > 0 && PassPipeline.getNumOccurrences() > 0) { - errs() << "Cannot specify -O# and --passes=, use " + if (NumOLevel > 0 && + (PassPipeline.getNumOccurrences() > 0 || PassList.size() > 0)) { + errs() << "Cannot specify -O# and --passes=/--foo-pass, use " "-passes='default,other-pass'\n"; return 1; } @@ -817,7 +821,7 @@ int main(int argc, char **argv) { // layer. return runPassPipeline(argv[0], *M, TM.get(), &TLII, Out.get(), ThinLinkOut.get(), RemarksFile.get(), Pipeline, - Passes, OK, VK, PreserveAssemblyUseListOrder, + Passes, PluginList, OK, VK, PreserveAssemblyUseListOrder, PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash, EnableDebugify) ? 0 @@ -829,13 +833,13 @@ int main(int argc, char **argv) { // the (-check)-debugify passes. 
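// [Illustrative sketch, not part of this patch] A minimal plugin that the
// -load-pass-plugin option added above could load. It assumes only the
// documented entry point from llvm/Passes/PassPlugin.h; the plugin name,
// pipeline name, and the stand-in pass are hypothetical.
//
//   #include "llvm/IR/Verifier.h"
//   #include "llvm/Passes/PassBuilder.h"
//   #include "llvm/Passes/PassPlugin.h"
//
//   extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo
//   llvmGetPassPluginInfo() {
//     return {LLVM_PLUGIN_API_VERSION, "MyPlugin", "v0.1",
//             [](llvm::PassBuilder &PB) {
//               PB.registerPipelineParsingCallback(
//                   [](llvm::StringRef Name, llvm::ModulePassManager &MPM,
//                      llvm::ArrayRef<llvm::PassBuilder::PipelineElement>) {
//                     if (Name == "my-pass") {
//                       MPM.addPass(llvm::VerifierPass()); // stand-in pass
//                       return true;
//                     }
//                     return false;
//                   });
//             }};
//   }
//
// Built as a shared library, it would be exercised with something like:
//   opt -load-pass-plugin=./MyPlugin.so -passes=my-pass input.ll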
DebugifyCustomPassManager Passes; DebugifyStatsMap DIStatsMap; - DebugInfoPerPassMap DIPreservationMap; + DebugInfoPerPass DebugInfoBeforePass; if (DebugifyEach) { Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); Passes.setDIStatsMap(DIStatsMap); } else if (VerifyEachDebugInfoPreserve) { Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); - Passes.setDIPreservationMap(DIPreservationMap); + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); if (!VerifyDIPreserveExport.empty()) Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); } @@ -855,10 +859,10 @@ int main(int argc, char **argv) { Passes.setDIStatsMap(DIStatsMap); Passes.add(createDebugifyModulePass()); } else if (VerifyDebugInfoPreserve) { - Passes.setDIPreservationMap(DIPreservationMap); + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); Passes.add(createDebugifyModulePass( DebugifyMode::OriginalDebugInfo, "", - &(Passes.getDebugInfoPerPassMap()))); + &(Passes.getDebugInfoPerPass()))); } } @@ -934,30 +938,8 @@ int main(int argc, char **argv) { else errs() << argv[0] << ": cannot create pass: " << PassInf->getPassName() << "\n"; - if (P) { - PassKind Kind = P->getPassKind(); + if (P) addPass(Passes, P); - - if (AnalyzeOnly) { - switch (Kind) { - case PT_Region: - Passes.add(createRegionPassPrinter(PassInf, Out->os())); - break; - case PT_Loop: - Passes.add(createLoopPassPrinter(PassInf, Out->os())); - break; - case PT_Function: - Passes.add(createFunctionPassPrinter(PassInf, Out->os())); - break; - case PT_CallGraphSCC: - Passes.add(createCallGraphPassPrinter(PassInf, Out->os())); - break; - default: - Passes.add(createModulePassPrinter(PassInf, Out->os())); - break; - } - } - } } if (OptLevelO0) @@ -997,7 +979,7 @@ int main(int argc, char **argv) { Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); Passes.add(createCheckDebugifyModulePass( false, "", nullptr, DebugifyMode::OriginalDebugInfo, - &(Passes.getDebugInfoPerPassMap()), VerifyDIPreserveExport)); + &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); } } @@ -1010,7 +992,7 @@ int main(int argc, char **argv) { std::unique_ptr BOS; raw_ostream *OS = nullptr; - const bool ShouldEmitOutput = !NoOutput && !AnalyzeOnly; + const bool ShouldEmitOutput = !NoOutput; // Write bitcode or assembly to the output as the last step... 
if (ShouldEmitOutput || RunTwice) { diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index be17d5c718c2..1acc2a86d176 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -95,6 +95,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "SubtargetFeatureInfo.h" #include "Types.h" @@ -3394,7 +3395,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { StringTable.GetOrAddStringOffset(LenMnemonic, false)); } - OS << "static const char *const MnemonicTable =\n"; + OS << "static const char MnemonicTable[] =\n"; StringTable.EmitString(OS); OS << ";\n\n"; diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index 9283ceeb31e0..1d738274c75a 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -19,15 +19,14 @@ #include "Types.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" @@ -868,8 +867,6 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { IAPrinter IAP(CGA.Result->getAsString(), FlatAliasAsmString, NumMIOps); - bool CantHandle = false; - unsigned MIOpNum = 0; for (unsigned i = 0, e = LastOpNo; i != e; ++i) { // Skip over tied operands as they're not part of an alias declaration. @@ -969,10 +966,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { break; } case CodeGenInstAlias::ResultOperand::K_Reg: - // If this is zero_reg, something's playing tricks we're not - // equipped to handle. if (!CGA.ResultOperands[i].getRegister()) { - CantHandle = true; + IAP.addCond(std::string(formatv( + "AliasPatternCond::K_Reg, {0}::NoRegister", Namespace))); break; } @@ -985,8 +981,6 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { MIOpNum += RO.getMINumOperands(); } - if (CantHandle) continue; - std::vector ReqFeatures; if (PassSubtarget) { // We only consider ReqFeatures predicates if PassSubtarget @@ -1005,6 +999,17 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (D->getNumArgs() == 0) PrintFatalError(R->getLoc(), "Invalid AssemblerCondDag!"); bool IsOr = CombineType == "any_of"; + // Change (any_of FeatureAll, (any_of ...)) to (any_of FeatureAll, ...). 
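// [Illustrative example, not part of this patch] The rewrite described in the
// comment above turns a nested AssemblerCondDag such as
//   (any_of FeatureA, (any_of FeatureB, FeatureC))
// into the flat form
//   (any_of FeatureA, FeatureB, FeatureC)
// so the emitter below only has to walk a single argument list. The feature
// names are hypothetical.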
+ if (IsOr && D->getNumArgs() == 2 && isa(D->getArg(1))) { + DagInit *RHS = dyn_cast(D->getArg(1)); + SmallVector Args{D->getArg(0)}; + SmallVector ArgNames{D->getArgName(0)}; + for (unsigned i = 0, e = RHS->getNumArgs(); i != e; ++i) { + Args.push_back(RHS->getArg(i)); + ArgNames.push_back(RHS->getArgName(i)); + } + D = DagInit::get(D->getOperator(), nullptr, Args, ArgNames); + } for (auto *Arg : D->getArgs()) { bool IsNeg = false; diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index 887abbac9d3b..4a78108d6f4a 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "AsmWriterInst.h" +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index 5deac4b34bf2..1f975f52d6e7 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -6,10 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/MemoryBuffer.h" #include "llvm/TableGen/Record.h" -#include -#include #include using namespace llvm; diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index 127ae6247bd9..8f080cd250ab 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -15,12 +15,19 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include using namespace llvm; namespace { class CallingConvEmitter { RecordKeeper &Records; + unsigned Counter; + std::string CurrentAction; + bool SwiftAction; + + std::map> AssignedRegsMap; + std::map> AssignedSwiftRegsMap; + std::map> DelegateToMap; + public: explicit CallingConvEmitter(RecordKeeper &R) : Records(R) {} @@ -29,7 +36,7 @@ public: private: void EmitCallingConv(Record *CC, raw_ostream &O); void EmitAction(Record *Action, unsigned Indent, raw_ostream &O); - unsigned Counter; + void EmitArgRegisterLists(raw_ostream &O); }; } // End anonymous namespace @@ -39,6 +46,7 @@ void CallingConvEmitter::run(raw_ostream &O) { // Emit prototypes for all of the non-custom CC's so that they can forward ref // each other. Records.startTimer("Emit prototypes"); + O << "#ifndef GET_CC_REGISTER_LISTS\n\n"; for (Record *CC : CCs) { if (!CC->getValueAsBit("Custom")) { unsigned Pad = CC->getName().size(); @@ -59,18 +67,28 @@ void CallingConvEmitter::run(raw_ostream &O) { // Emit each non-custom calling convention description in full. Records.startTimer("Emit full descriptions"); for (Record *CC : CCs) { - if (!CC->getValueAsBit("Custom")) + if (!CC->getValueAsBit("Custom")) { EmitCallingConv(CC, O); + } } -} + EmitArgRegisterLists(O); + + O << "\n#endif // CC_REGISTER_LIST\n"; +} void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { ListInit *CCActions = CC->getValueAsListInit("Actions"); Counter = 0; + CurrentAction = CC->getName().str(); + // Call upon the creation of a map entry from the void! + // We want an entry in AssignedRegsMap for every action, even if that + // entry is empty. 
+ AssignedRegsMap[CurrentAction] = {}; + O << "\n\n"; - unsigned Pad = CC->getName().size(); + unsigned Pad = CurrentAction.size(); if (CC->getValueAsBit("Entry")) { O << "bool llvm::"; Pad += 12; @@ -78,13 +96,21 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { O << "static bool "; Pad += 13; } - O << CC->getName() << "(unsigned ValNo, MVT ValVT,\n" + O << CurrentAction << "(unsigned ValNo, MVT ValVT,\n" << std::string(Pad, ' ') << "MVT LocVT, CCValAssign::LocInfo LocInfo,\n" << std::string(Pad, ' ') << "ISD::ArgFlagsTy ArgFlags, CCState &State) {\n"; // Emit all of the actions, in order. for (unsigned i = 0, e = CCActions->size(); i != e; ++i) { + Record *Action = CCActions->getElementAsRecord(i); + SwiftAction = llvm::any_of(Action->getSuperClasses(), + [](const std::pair &Class) { + std::string Name = + Class.first->getNameInitAsString(); + return StringRef(Name).startswith("CCIfSwift"); + }); + O << "\n"; - EmitAction(CCActions->getElementAsRecord(i), 2, O); + EmitAction(Action, 2, O); } O << "\n return true; // CC didn't match.\n"; @@ -94,7 +120,7 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, raw_ostream &O) { std::string IndentStr = std::string(Indent, ' '); - + if (Action->isSubClassOf("CCPredicateAction")) { O << IndentStr << "if ("; @@ -122,18 +148,30 @@ void CallingConvEmitter::EmitAction(Record *Action, O << IndentStr << "if (!" << CC->getName() << "(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))\n" << IndentStr << " return false;\n"; + DelegateToMap[CurrentAction].insert(CC->getName().str()); } else if (Action->isSubClassOf("CCAssignToReg")) { ListInit *RegList = Action->getValueAsListInit("RegList"); if (RegList->size() == 1) { - O << IndentStr << "if (unsigned Reg = State.AllocateReg("; - O << getQualifiedName(RegList->getElementAsRecord(0)) << ")) {\n"; + std::string Name = getQualifiedName(RegList->getElementAsRecord(0)); + O << IndentStr << "if (unsigned Reg = State.AllocateReg(" << Name + << ")) {\n"; + if (SwiftAction) + AssignedSwiftRegsMap[CurrentAction].insert(Name); + else + AssignedRegsMap[CurrentAction].insert(Name); } else { O << IndentStr << "static const MCPhysReg RegList" << ++Counter << "[] = {\n"; O << IndentStr << " "; ListSeparator LS; - for (unsigned i = 0, e = RegList->size(); i != e; ++i) - O << LS << getQualifiedName(RegList->getElementAsRecord(i)); + for (unsigned i = 0, e = RegList->size(); i != e; ++i) { + std::string Name = getQualifiedName(RegList->getElementAsRecord(i)); + if (SwiftAction) + AssignedSwiftRegsMap[CurrentAction].insert(Name); + else + AssignedRegsMap[CurrentAction].insert(Name); + O << LS << Name; + } O << "\n" << IndentStr << "};\n"; O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList" << Counter << ")) {\n"; @@ -288,6 +326,83 @@ void CallingConvEmitter::EmitAction(Record *Action, } } +void CallingConvEmitter::EmitArgRegisterLists(raw_ostream &O) { + // Transitively merge all delegated CCs into AssignedRegsMap. 
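// [Illustrative sketch, not part of this patch] The merged maps assembled
// here are emitted under the new GET_CC_REGISTER_LISTS guard, so a target
// could pull the lists in like this (target and file names hypothetical):
//
//   #define GET_CC_REGISTER_LISTS
//   #include "XYZGenCallingConv.inc"
//   // which expands to arrays such as:
//   //   const MCRegister CC_XYZ_ArgRegs[] = { XYZ::R0, XYZ::R1 };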
+ using EntryTy = std::pair>; + bool Redo; + do { + Redo = false; + std::deque Worklist(DelegateToMap.begin(), DelegateToMap.end()); + + while (!Worklist.empty()) { + EntryTy Entry = Worklist.front(); + Worklist.pop_front(); + + const std::string &CCName = Entry.first; + std::set &Registers = Entry.second; + if (!Registers.empty()) + continue; + + for (auto &InnerEntry : Worklist) { + const std::string &InnerCCName = InnerEntry.first; + std::set &InnerRegisters = InnerEntry.second; + + if (InnerRegisters.find(CCName) != InnerRegisters.end()) { + AssignedRegsMap[InnerCCName].insert( + AssignedRegsMap[CCName].begin(), + AssignedRegsMap[CCName].end()); + InnerRegisters.erase(CCName); + } + } + + DelegateToMap.erase(CCName); + Redo = true; + } + } while (Redo); + + if (AssignedRegsMap.empty()) + return; + + O << "\n#else\n\n"; + + for (auto &Entry : AssignedRegsMap) { + const std::string &RegName = Entry.first; + std::set &Registers = Entry.second; + + if (RegName.empty()) + continue; + + O << "const MCRegister " << Entry.first << "_ArgRegs[] = { "; + + if (Registers.empty()) { + O << "0"; + } else { + ListSeparator LS; + for (const std::string &Reg : Registers) + O << LS << Reg; + } + + O << " };\n"; + } + + if (AssignedSwiftRegsMap.empty()) + return; + + O << "\n// Registers used by Swift.\n"; + for (auto &Entry : AssignedSwiftRegsMap) { + const std::string &RegName = Entry.first; + std::set &Registers = Entry.second; + + O << "const MCRegister " << RegName << "_Swift_ArgRegs[] = { "; + + ListSeparator LS; + for (const std::string &Reg : Registers) + O << LS << Reg; + + O << " };\n"; + } +} + namespace llvm { void EmitCallingConv(RecordKeeper &RK, raw_ostream &OS) { diff --git a/llvm/utils/TableGen/CodeBeadsGen.cpp b/llvm/utils/TableGen/CodeBeadsGen.cpp deleted file mode 100644 index 18a6d6d19eb2..000000000000 --- a/llvm/utils/TableGen/CodeBeadsGen.cpp +++ /dev/null @@ -1,137 +0,0 @@ -//===---------- CodeBeadsGen.cpp - Code Beads Generator -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// CodeBeads are data fields carrying auxiliary information for instructions. -// -// Under the hood it's simply implemented by a `bits` field (with arbitrary -// length) in each TG instruction description, where this TG backend will -// generate a helper function to access it. -// -// This is especially useful for expressing variable length encoding -// instructions and complex addressing modes. Since in those cases each -// instruction is usually associated with large amount of information like -// addressing mode details used on a specific operand. Instead of retreating to -// ad-hoc methods to figure out these information when encoding an instruction, -// CodeBeads provide a clean table for the instruction encoder to lookup. 
-//===----------------------------------------------------------------------===// - -#include "CodeGenTarget.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" -#include -#include -#include -using namespace llvm; - -namespace { - -class CodeBeadsGen { - RecordKeeper &Records; - -public: - CodeBeadsGen(RecordKeeper &R) : Records(R) {} - void run(raw_ostream &OS); -}; - -void CodeBeadsGen::run(raw_ostream &OS) { - CodeGenTarget Target(Records); - std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); - - // For little-endian instruction bit encodings, reverse the bit order - Target.reverseBitsForLittleEndianEncoding(); - - ArrayRef NumberedInstructions = - Target.getInstructionsByEnumValue(); - - // Emit function declaration - OS << "const uint8_t *llvm::" << Target.getInstNamespace(); - OS << "::getMCInstrBeads(unsigned Opcode) {\n"; - - // First, get the maximum bit length among all beads. And do some - // simple validation - unsigned MaxBitLength = 0; - - for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; - if (!R->getValue("Beads")) - continue; - - BitsInit *BI = R->getValueAsBitsInit("Beads"); - if (!BI->isComplete()) { - PrintFatalError(R->getLoc(), "Record `" + R->getName() + - "', bit field 'Beads' is not complete"); - } - - MaxBitLength = std::max(MaxBitLength, BI->getNumBits()); - } - - // Number of bytes - unsigned Parts = MaxBitLength / 8; - - // Emit instruction base values - OS << " static const uint8_t InstBits[][" << Parts << "] = {\n"; - for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; - - if (R->getValueAsString("Namespace") == "TargetOpcode" || - !R->getValue("Beads")) { - OS << "\t{ 0x0 },\t// "; - if (R->getValueAsBit("isPseudo")) - OS << "(Pseudo) "; - OS << R->getName() << "\n"; - continue; - } - - BitsInit *BI = R->getValueAsBitsInit("Beads"); - - // Convert to byte array: - // [dcba] -> [a][b][c][d] - OS << "\t{"; - for (unsigned p = 0; p < Parts; ++p) { - unsigned Right = 8 * p; - unsigned Left = Right + 8; - - uint8_t Value = 0; - for (unsigned i = Right; i != Left; ++i) { - unsigned Shift = i % 8; - if (auto *B = dyn_cast(BI->getBit(i))) { - Value |= (static_cast(B->getValue()) << Shift); - } else { - PrintFatalError(R->getLoc(), "Record `" + R->getName() + - "', bit 'Beads[" + Twine(i) + - "]' is not defined"); - } - } - - if (p) - OS << ','; - OS << " 0x"; - OS.write_hex(Value); - OS << ""; - } - OS << " }," << '\t' << "// " << R->getName() << "\n"; - } - OS << "\t{ 0x0 }\n };\n"; - - // Emit initial function code - OS << " return InstBits[Opcode];\n" - << "}\n\n"; -} - -} // End anonymous namespace - -namespace llvm { - -void EmitCodeBeads(RecordKeeper &RK, raw_ostream &OS) { - emitSourceFileHeader("Machine Code Beads", OS); - CodeBeadsGen(RK).run(OS); -} - -} // namespace llvm diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index fbac0d969917..2b9931b23c11 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -16,11 +16,13 @@ #include "CodeGenTarget.h" #include "SubtargetFeatureInfo.h" #include "Types.h" +#include "VarLenCodeEmitterGen.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" #include 
"llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" #include @@ -117,16 +119,16 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, (!NamedOpIndices.empty() && NamedOpIndices.count( CGI.Operands.getSubOperandNumber(NumberedOp).first)))) { ++NumberedOp; + } - if (NumberedOp >= CGI.Operands.back().MIOperandNo + - CGI.Operands.back().MINumOperands) { - errs() << "Too few operands in record " << R->getName() << - " (no match for variable " << VarName << "):\n"; - errs() << *R; - errs() << '\n'; - - return; - } + if (NumberedOp >= + CGI.Operands.back().MIOperandNo + CGI.Operands.back().MINumOperands) { + std::string E; + raw_string_ostream S(E); + S << "Too few operands in record " << R->getName() + << " (no match for variable " << VarName << "):\n"; + S << *R; + PrintFatalError(R, E); } OpIdx = NumberedOp++; @@ -396,132 +398,138 @@ void CodeEmitterGen::run(raw_ostream &o) { ArrayRef NumberedInstructions = Target.getInstructionsByEnumValue(); - const CodeGenHwModes &HWM = Target.getHwModes(); - // The set of HwModes used by instruction encodings. - std::set HwModes; - BitWidth = 0; - for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; - if (R->getValueAsString("Namespace") == "TargetOpcode" || - R->getValueAsBit("isPseudo")) - continue; + if (any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) { + Record *R = CGI->TheDef; + return R->getValue("Inst") && isa(R->getValueInit("Inst")); + })) { + emitVarLenCodeEmitter(Records, o); + } else { + const CodeGenHwModes &HWM = Target.getHwModes(); + // The set of HwModes used by instruction encodings. + std::set HwModes; + BitWidth = 0; + for (const CodeGenInstruction *CGI : NumberedInstructions) { + Record *R = CGI->TheDef; + if (R->getValueAsString("Namespace") == "TargetOpcode" || + R->getValueAsBit("isPseudo")) + continue; - if (const RecordVal *RV = R->getValue("EncodingInfos")) { - if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { - EncodingInfoByHwMode EBM(DI->getDef(), HWM); - for (auto &KV : EBM) { - BitsInit *BI = KV.second->getValueAsBitsInit("Inst"); - BitWidth = std::max(BitWidth, BI->getNumBits()); - HwModes.insert(KV.first); + if (const RecordVal *RV = R->getValue("EncodingInfos")) { + if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + for (auto &KV : EBM) { + BitsInit *BI = KV.second->getValueAsBitsInit("Inst"); + BitWidth = std::max(BitWidth, BI->getNumBits()); + HwModes.insert(KV.first); + } + continue; } - continue; } + BitsInit *BI = R->getValueAsBitsInit("Inst"); + BitWidth = std::max(BitWidth, BI->getNumBits()); + } + UseAPInt = BitWidth > 64; + + // Emit function declaration + if (UseAPInt) { + o << "void " << Target.getName() + << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" + << " SmallVectorImpl &Fixups,\n" + << " APInt &Inst,\n" + << " APInt &Scratch,\n" + << " const MCSubtargetInfo &STI) const {\n"; + } else { + o << "uint64_t " << Target.getName(); + o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" + << " SmallVectorImpl &Fixups,\n" + << " const MCSubtargetInfo &STI) const {\n"; } - BitsInit *BI = R->getValueAsBitsInit("Inst"); - BitWidth = std::max(BitWidth, BI->getNumBits()); - } - UseAPInt = BitWidth > 64; - - // Emit function declaration - if (UseAPInt) { - o << "void " << Target.getName() - << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" - << " SmallVectorImpl &Fixups,\n" - << " APInt &Inst,\n" - << " APInt &Scratch,\n" - << " const 
MCSubtargetInfo &STI) const {\n"; - } else { - o << "uint64_t " << Target.getName(); - o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" - << " SmallVectorImpl &Fixups,\n" - << " const MCSubtargetInfo &STI) const {\n"; - } - - // Emit instruction base values - if (HwModes.empty()) { - emitInstructionBaseValues(o, NumberedInstructions, Target, -1); - } else { - for (unsigned HwMode : HwModes) - emitInstructionBaseValues(o, NumberedInstructions, Target, (int)HwMode); - } - if (!HwModes.empty()) { - o << " const uint64_t *InstBits;\n"; - o << " unsigned HwMode = STI.getHwMode();\n"; - o << " switch (HwMode) {\n"; - o << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n"; - for (unsigned I : HwModes) { - o << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name - << "; break;\n"; + // Emit instruction base values + if (HwModes.empty()) { + emitInstructionBaseValues(o, NumberedInstructions, Target, -1); + } else { + for (unsigned HwMode : HwModes) + emitInstructionBaseValues(o, NumberedInstructions, Target, (int)HwMode); } - o << " };\n"; - } - // Map to accumulate all the cases. - std::map> CaseMap; + if (!HwModes.empty()) { + o << " const uint64_t *InstBits;\n"; + o << " unsigned HwMode = STI.getHwMode();\n"; + o << " switch (HwMode) {\n"; + o << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n"; + for (unsigned I : HwModes) { + o << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name + << "; break;\n"; + } + o << " };\n"; + } - // Construct all cases statement for each opcode - for (Record *R : Insts) { - if (R->getValueAsString("Namespace") == "TargetOpcode" || - R->getValueAsBit("isPseudo")) - continue; - std::string InstName = - (R->getValueAsString("Namespace") + "::" + R->getName()).str(); - std::string Case = getInstructionCase(R, Target); + // Map to accumulate all the cases. 
+ std::map> CaseMap; - CaseMap[Case].push_back(std::move(InstName)); - } + // Construct all cases statement for each opcode + for (Record *R : Insts) { + if (R->getValueAsString("Namespace") == "TargetOpcode" || + R->getValueAsBit("isPseudo")) + continue; + std::string InstName = + (R->getValueAsString("Namespace") + "::" + R->getName()).str(); + std::string Case = getInstructionCase(R, Target); - // Emit initial function code - if (UseAPInt) { - int NumWords = APInt::getNumWords(BitWidth); - int NumBytes = (BitWidth + 7) / 8; - o << " const unsigned opcode = MI.getOpcode();\n" - << " if (Inst.getBitWidth() != " << BitWidth << ")\n" - << " Inst = Inst.zext(" << BitWidth << ");\n" - << " if (Scratch.getBitWidth() != " << BitWidth << ")\n" - << " Scratch = Scratch.zext(" << BitWidth << ");\n" - << " LoadIntFromMemory(Inst, (const uint8_t *)&InstBits[opcode * " - << NumWords << "], " << NumBytes << ");\n" - << " APInt &Value = Inst;\n" - << " APInt &op = Scratch;\n" - << " switch (opcode) {\n"; - } else { - o << " const unsigned opcode = MI.getOpcode();\n" - << " uint64_t Value = InstBits[opcode];\n" - << " uint64_t op = 0;\n" - << " (void)op; // suppress warning\n" - << " switch (opcode) {\n"; - } + CaseMap[Case].push_back(std::move(InstName)); + } + + // Emit initial function code + if (UseAPInt) { + int NumWords = APInt::getNumWords(BitWidth); + o << " const unsigned opcode = MI.getOpcode();\n" + << " if (Scratch.getBitWidth() != " << BitWidth << ")\n" + << " Scratch = Scratch.zext(" << BitWidth << ");\n" + << " Inst = APInt(" << BitWidth + << ", makeArrayRef(InstBits + opcode * " << NumWords << ", " << NumWords + << "));\n" + << " APInt &Value = Inst;\n" + << " APInt &op = Scratch;\n" + << " switch (opcode) {\n"; + } else { + o << " const unsigned opcode = MI.getOpcode();\n" + << " uint64_t Value = InstBits[opcode];\n" + << " uint64_t op = 0;\n" + << " (void)op; // suppress warning\n" + << " switch (opcode) {\n"; + } - // Emit each case statement - std::map>::iterator IE, EE; - for (IE = CaseMap.begin(), EE = CaseMap.end(); IE != EE; ++IE) { - const std::string &Case = IE->first; - std::vector &InstList = IE->second; + // Emit each case statement + std::map>::iterator IE, EE; + for (IE = CaseMap.begin(), EE = CaseMap.end(); IE != EE; ++IE) { + const std::string &Case = IE->first; + std::vector &InstList = IE->second; - for (int i = 0, N = InstList.size(); i < N; i++) { - if (i) o << "\n"; - o << " case " << InstList[i] << ":"; + for (int i = 0, N = InstList.size(); i < N; i++) { + if (i) + o << "\n"; + o << " case " << InstList[i] << ":"; + } + o << " {\n"; + o << Case; + o << " break;\n" + << " }\n"; } - o << " {\n"; - o << Case; - o << " break;\n" - << " }\n"; - } - // Default case: unhandled opcode - o << " default:\n" - << " std::string msg;\n" - << " raw_string_ostream Msg(msg);\n" - << " Msg << \"Not supported instr: \" << MI;\n" - << " report_fatal_error(msg.c_str());\n" - << " }\n"; - if (UseAPInt) - o << " Inst = Value;\n"; - else - o << " return Value;\n"; - o << "}\n\n"; + // Default case: unhandled opcode + o << " default:\n" + << " std::string msg;\n" + << " raw_string_ostream Msg(msg);\n" + << " Msg << \"Not supported instr: \" << MI;\n" + << " report_fatal_error(Msg.str().c_str());\n" + << " }\n"; + if (UseAPInt) + o << " Inst = Value;\n"; + else + o << " return Value;\n"; + o << "}\n\n"; + } const auto &All = SubtargetFeatureInfo::getAll(Records); std::map SubtargetFeatures; diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp 
b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index a1f8f4809d5f..9d6adb6d2c37 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" @@ -2815,6 +2816,7 @@ void TreePattern::ComputeNamedNodes(TreePatternNode *N) { TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName) { + RecordKeeper &RK = TheInit->getRecordKeeper(); if (DefInit *DI = dyn_cast(TheInit)) { Record *R = DI->getDef(); @@ -2853,13 +2855,13 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, if (!OpName.empty()) error("Constant int or bit argument should not have a name!"); if (isa(TheInit)) - TheInit = TheInit->convertInitializerTo(IntRecTy::get()); + TheInit = TheInit->convertInitializerTo(IntRecTy::get(RK)); return std::make_shared(TheInit, 1); } if (BitsInit *BI = dyn_cast(TheInit)) { // Turn this into an IntInit. - Init *II = BI->convertInitializerTo(IntRecTy::get()); + Init *II = BI->convertInitializerTo(IntRecTy::get(RK)); if (!II || !isa(II)) error("Bits value must be constants!"); return ParseTreePattern(II, OpName); @@ -2958,8 +2960,8 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, else // Otherwise, no chain. Operator = getDAGPatterns().get_intrinsic_wo_chain_sdnode(); - Children.insert(Children.begin(), - std::make_shared(IntInit::get(IID), 1)); + Children.insert(Children.begin(), std::make_shared( + IntInit::get(RK, IID), 1)); } if (Operator->isSubClassOf("ComplexPattern")) { @@ -4366,7 +4368,7 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { PatternsToMatch.emplace_back(P.getSrcRecord(), P.getPredicates(), std::move(NewSrc), std::move(NewDst), P.getDstRegs(), P.getAddedComplexity(), - Record::getNewUID(), Mode, Check); + Record::getNewUID(Records), Mode, Check); }; for (PatternToMatch &P : Copy) { @@ -4742,7 +4744,7 @@ void CodeGenDAGPatterns::GenerateVariants() { PatternsToMatch[i].getSrcRecord(), PatternsToMatch[i].getPredicates(), Variant, PatternsToMatch[i].getDstPatternShared(), PatternsToMatch[i].getDstRegs(), - PatternsToMatch[i].getAddedComplexity(), Record::getNewUID(), + PatternsToMatch[i].getAddedComplexity(), Record::getNewUID(Records), PatternsToMatch[i].getForceMode(), PatternsToMatch[i].getHwModeFeatures()); } diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index 39d81230a4f2..94694a96eb90 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -28,7 +28,6 @@ #include #include #include -#include #include namespace llvm { diff --git a/llvm/utils/TableGen/CodeGenInstruction.cpp b/llvm/utils/TableGen/CodeGenInstruction.cpp index 78b698c31b2b..ba12633ace8c 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/CodeGenInstruction.cpp @@ -12,7 +12,6 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/TableGen/Error.h" @@ -416,6 +415,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) hasExtraDefRegAllocReq = R->getValueAsBit("hasExtraDefRegAllocReq"); isCodeGenOnly = R->getValueAsBit("isCodeGenOnly"); isPseudo = R->getValueAsBit("isPseudo"); + isMeta = R->getValueAsBit("isMeta"); 
ImplicitDefs = R->getValueAsListOfDefs("Defs"); ImplicitUses = R->getValueAsListOfDefs("Uses"); @@ -632,8 +632,8 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo, if (!BI->isComplete()) return false; // Convert the bits init to an integer and use that for the result. - IntInit *II = - dyn_cast_or_null(BI->convertInitializerTo(IntRecTy::get())); + IntInit *II = dyn_cast_or_null( + BI->convertInitializerTo(IntRecTy::get(BI->getRecordKeeper()))); if (!II) return false; ResOp = ResultOperand(II->getValue()); diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index e0ce5d433602..d3de6d95780c 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -16,13 +16,13 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/MachineValueType.h" -#include "llvm/Support/SMLoc.h" #include #include #include #include namespace llvm { +class SMLoc; template class ArrayRef; class Record; class DagInit; @@ -271,6 +271,7 @@ template class ArrayRef; bool hasExtraDefRegAllocReq : 1; bool isCodeGenOnly : 1; bool isPseudo : 1; + bool isMeta : 1; bool isRegSequence : 1; bool isExtractSubreg : 1; bool isInsertSubreg : 1; diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index b005a5866f80..599795e3c065 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -26,7 +26,7 @@ struct CodeGenIntrinsic { Record *TheDef; // The actual record defining this intrinsic. std::string Name; // The name of the LLVM function "llvm.bswap.i32" std::string EnumName; // The name of the enum "bswap_i32" - std::string GCCBuiltinName; // Name of the corresponding GCC builtin, or "". + std::string ClangBuiltinName; // Name of the corresponding GCC builtin, or "". std::string MSBuiltinName; // Name of the corresponding MS builtin, or "". std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics. @@ -125,6 +125,9 @@ struct CodeGenIntrinsic { /// True if the intrinsic is no-return. bool isNoReturn; + /// True if the intrinsic is no-callback. + bool isNoCallback; + /// True if the intrinsic is no-sync. 
bool isNoSync; diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index 38871eb8cf3c..02695942f5c1 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -75,8 +75,8 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" -#include "llvm/Support/Format.h" #include "llvm/TableGen/Error.h" using namespace llvm; typedef std::map > InstrRelMapTy; diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp index afaeb73ffab1..2c61be713afc 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/CodeGenRegisters.cpp @@ -12,21 +12,18 @@ //===----------------------------------------------------------------------===// #include "CodeGenRegisters.h" -#include "CodeGenTarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -204,12 +201,16 @@ namespace { class RegUnitIterator { CodeGenRegister::Vec::const_iterator RegI, RegE; CodeGenRegister::RegUnitList::iterator UnitI, UnitE; + static CodeGenRegister::RegUnitList Sentinel; public: RegUnitIterator(const CodeGenRegister::Vec &Regs): RegI(Regs.begin()), RegE(Regs.end()) { - if (RegI != RegE) { + if (RegI == RegE) { + UnitI = Sentinel.end(); + UnitE = Sentinel.end(); + } else { UnitI = (*RegI)->getRegUnits().begin(); UnitE = (*RegI)->getRegUnits().end(); advance(); @@ -240,6 +241,8 @@ protected: } }; +CodeGenRegister::RegUnitList RegUnitIterator::Sentinel; + } // end anonymous namespace // Return true of this unit appears in RegUnits. @@ -635,6 +638,7 @@ struct TupleExpander : SetTheory::Expander { Def->getValueAsListOfStrings("RegAsmNames"); // Zip them up. + RecordKeeper &RK = Def->getRecords(); for (unsigned n = 0; n != Length; ++n) { std::string Name; Record *Proto = Lists[0][n]; @@ -651,13 +655,13 @@ struct TupleExpander : SetTheory::Expander { SmallVector CostPerUse; CostPerUse.insert(CostPerUse.end(), CostList->begin(), CostList->end()); - StringInit *AsmName = StringInit::get(""); + StringInit *AsmName = StringInit::get(RK, ""); if (!RegNames.empty()) { if (RegNames.size() <= n) PrintFatalError(Def->getLoc(), "Register tuple definition missing name for '" + Name + "'."); - AsmName = StringInit::get(RegNames[n]); + AsmName = StringInit::get(RK, RegNames[n]); } // Create a new Record representing the synthesized register. This record @@ -696,7 +700,7 @@ struct TupleExpander : SetTheory::Expander { // Composite registers are always covered by sub-registers. if (Field == "CoveredBySubRegs") - RV.setValue(BitInit::get(true)); + RV.setValue(BitInit::get(RK, true)); // Copy fields from the RegisterTuples def. 
if (Field == "SubRegIndices" || @@ -1105,6 +1109,17 @@ void CodeGenRegisterClass::buildRegUnitSet(const CodeGenRegBank &RegBank, std::back_inserter(RegUnits)); } +//===----------------------------------------------------------------------===// +// CodeGenRegisterCategory +//===----------------------------------------------------------------------===// + +CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, + Record *R) + : TheDef(R), Name(std::string(R->getName())) { + for (Record *RegClass : R->getValueAsListOfDefs("Classes")) + Classes.push_back(RegBank.getRegClass(RegClass)); +} + //===----------------------------------------------------------------------===// // CodeGenRegBank //===----------------------------------------------------------------------===// @@ -1222,6 +1237,12 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, for (auto &RC : RegClasses) RC.EnumValue = i++; CodeGenRegisterClass::computeSubClasses(*this); + + // Read in the register category definitions. + std::vector RCats = + Records.getAllDerivedDefinitions("RegisterCategory"); + for (auto *R : RCats) + RegCategories.emplace_back(*this, R); } // Create a synthetic CodeGenSubRegIndex without a corresponding Record. @@ -1794,6 +1815,7 @@ void CodeGenRegBank::computeRegUnitWeights() { unsigned NumIters = 0; for (bool Changed = true; Changed; ++NumIters) { assert(NumIters <= NumNativeRegUnits && "Runaway register unit weights"); + (void) NumIters; Changed = false; for (auto &Reg : Registers) { CodeGenRegister::RegUnitList NormalUnits; diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index c9fcf83b0a8a..0fc8b3ef80dd 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -27,7 +27,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" #include @@ -476,6 +475,26 @@ namespace llvm { static void computeSubClasses(CodeGenRegBank&); }; + // Register categories are used when we need to deterine the category a + // register falls into (GPR, vector, fixed, etc.) without having to know + // specific information about the target architecture. + class CodeGenRegisterCategory { + Record *TheDef; + std::string Name; + std::list Classes; + + public: + CodeGenRegisterCategory(CodeGenRegBank &, Record *R); + CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; + + // Return the Record that defined this class, or NULL if the class was + // created by TableGen. + Record *getDef() const { return TheDef; } + + std::string getName() const { return Name; } + std::list getClasses() const { return Classes; } + }; + // Register units are used to model interference and register pressure. // Every register is assigned one or more register units such that two // registers overlap if and only if they have a register unit in common. @@ -559,6 +578,13 @@ namespace llvm { typedef std::map RCKeyMap; RCKeyMap Key2RC; + // Register categories. + std::list RegCategories; + DenseMap Def2RCat; + using RCatKeyMap = + std::map; + RCatKeyMap Key2RCat; + // Remember each unique set of register units. Initially, this contains a // unique set for each register class. Simliar sets are coalesced with // pruneUnitSets and new supersets are inferred during computeRegUnitSets. 
@@ -719,6 +745,14 @@ namespace llvm { return RegClasses; } + std::list &getRegCategories() { + return RegCategories; + } + + const std::list &getRegCategories() const { + return RegCategories; + } + // Find a register class from its def. CodeGenRegisterClass *getRegClass(const Record *) const; diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index e47bda725a17..4933bfc476f4 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index a331a30b51a8..f7e35b0c808f 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -17,11 +17,8 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" -#include namespace llvm { diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index 2c1583f7979d..af2e8576af2e 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -14,16 +14,13 @@ //===----------------------------------------------------------------------===// #include "CodeGenTarget.h" -#include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "CodeGenIntrinsics.h" #include "CodeGenSchedule.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Timer.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" #include using namespace llvm; @@ -56,9 +53,12 @@ StringRef llvm::getName(MVT::SimpleValueType T) { } StringRef llvm::getEnumName(MVT::SimpleValueType T) { + // clang-format off switch (T) { case MVT::Other: return "MVT::Other"; case MVT::i1: return "MVT::i1"; + case MVT::i2: return "MVT::i2"; + case MVT::i4: return "MVT::i4"; case MVT::i8: return "MVT::i8"; case MVT::i16: return "MVT::i16"; case MVT::i32: return "MVT::i32"; @@ -91,6 +91,8 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::v256i1: return "MVT::v256i1"; case MVT::v512i1: return "MVT::v512i1"; case MVT::v1024i1: return "MVT::v1024i1"; + case MVT::v128i2: return "MVT::v128i2"; + case MVT::v64i4: return "MVT::v64i4"; case MVT::v1i8: return "MVT::v1i8"; case MVT::v2i8: return "MVT::v2i8"; case MVT::v4i8: return "MVT::v4i8"; @@ -227,6 +229,8 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::nxv2bf16: return "MVT::nxv2bf16"; case MVT::nxv4bf16: return "MVT::nxv4bf16"; case MVT::nxv8bf16: return "MVT::nxv8bf16"; + case MVT::nxv16bf16: return "MVT::nxv16bf16"; + case MVT::nxv32bf16: return "MVT::nxv32bf16"; case MVT::nxv1f32: return "MVT::nxv1f32"; case MVT::nxv2f32: return "MVT::nxv2f32"; case MVT::nxv4f32: return "MVT::nxv4f32"; @@ -245,6 +249,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::externref: return "MVT::externref"; default: llvm_unreachable("ILLEGAL VALUE TYPE!"); } + // clang-format on } /// getQualifiedName - Return the name of the specified record, with a @@ -471,7 +476,7 @@ GetInstByName(const char *Name, 
return I->second.get(); } -static const char *const FixedInstrs[] = { +static const char *FixedInstrs[] = { #define HANDLE_TARGET_OPCODE(OPC) #OPC, #include "llvm/Support/TargetOpcodes.def" nullptr}; @@ -555,7 +560,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { NewBits[middle] = BI->getBit(middle); } - BitsInit *NewBI = BitsInit::get(NewBits); + BitsInit *NewBI = BitsInit::get(Records, NewBits); // Update the bits in reversed order so that emitInstrOpBits will get the // correct endianness. @@ -666,6 +671,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R, isCommutative = false; canThrow = false; isNoReturn = false; + isNoCallback = false; isNoSync = false; isNoFree = false; isWillReturn = false; @@ -682,8 +688,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R, EnumName = DefName.substr(4); - if (R->getValue("GCCBuiltinName")) // Ignore a missing GCCBuiltinName field. - GCCBuiltinName = std::string(R->getValueAsString("GCCBuiltinName")); + if (R->getValue("ClangBuiltinName")) // Ignore a missing ClangBuiltinName field. + ClangBuiltinName = std::string(R->getValueAsString("ClangBuiltinName")); if (R->getValue("MSBuiltinName")) // Ignore a missing MSBuiltinName field. MSBuiltinName = std::string(R->getValueAsString("MSBuiltinName")); @@ -864,6 +870,8 @@ void CodeGenIntrinsic::setProperty(Record *R) { isConvergent = true; else if (R->getName() == "IntrNoReturn") isNoReturn = true; + else if (R->getName() == "IntrNoCallback") + isNoCallback = true; else if (R->getName() == "IntrNoSync") isNoSync = true; else if (R->getName() == "IntrNoFree") diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h index 5bd84c873f2f..f14828f2c347 100644 --- a/llvm/utils/TableGen/CodeGenTarget.h +++ b/llvm/utils/TableGen/CodeGenTarget.h @@ -17,16 +17,15 @@ #define LLVM_UTILS_TABLEGEN_CODEGENTARGET_H #include "CodeGenHwModes.h" -#include "CodeGenInstruction.h" #include "CodeGenRegisters.h" #include "InfoByHwMode.h" #include "SDNodeProperties.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TableGen/Record.h" -#include namespace llvm { +class RecordKeeper; +class Record; +class CodeGenInstruction; struct CodeGenRegister; class CodeGenSchedModels; class CodeGenTarget; diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index 2f211e2958fa..d012a0172a8f 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "DAGISelMatcher.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 5b0d16a8f3c8..777e75dcd929 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -13,9 +13,7 @@ #include "CodeGenDAGPatterns.h" #include "DAGISelMatcher.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 2361ed8a7a95..44bff4c67ab3 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -6,9 +6,10 @@ // 
//===----------------------------------------------------------------------===// -#include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "CodeGenRegisters.h" +#include "DAGISelMatcher.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index 27161d261e85..f2d9165c5c8c 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -21,7 +21,6 @@ //===----------------------------------------------------------------------===// #include "DFAEmitter.h" -#include "CodeGenTarget.h" #include "SequenceToOffsetTable.h" #include "TableGenBackends.h" #include "llvm/ADT/SmallVector.h" @@ -30,9 +29,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" #include #include +#include #include #include #include @@ -306,6 +305,7 @@ void Automaton::emit(raw_ostream &OS) { } LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() << " states with " << NumTransitions << " transitions.\n"); + (void) NumTransitions; const auto &ActionTypes = Transitions.back().getTypes(); OS << "// The type of an action in the " << Name << " automaton.\n"; diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp index 9cbdbc19c206..6704d747f715 100644 --- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp @@ -17,9 +17,7 @@ #include "CodeGenSchedule.h" #include "CodeGenTarget.h" #include "DFAEmitter.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp new file mode 100644 index 000000000000..fd58e798b445 --- /dev/null +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -0,0 +1,374 @@ +//===- DXILEmitter.cpp - DXIL operation Emitter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// DXILEmitter uses the descriptions of DXIL operation to construct enum and +// helper functions for DXIL operation. 
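A note on the (void) NumTransitions line added to DFAEmitter.cpp above: the variable's only use is inside LLVM_DEBUG(...), which compiles away in release builds, so the cast keeps -Wunused-variable quiet. A self-contained sketch of the same pattern (the DEBUG_ONLY macro is illustrative, standing in for LLVM_DEBUG):

#include <cstdio>

#ifndef NDEBUG
#define DEBUG_ONLY(X) do { X; } while (false)
#else
#define DEBUG_ONLY(X) do { } while (false)
#endif

static void countSomething() {
  unsigned Count = 42; // only read inside the debug-only macro
  DEBUG_ONLY(std::printf("count = %u\n", Count));
  (void)Count; // silence unused-variable warnings in release builds
}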
+// +//===----------------------------------------------------------------------===// + +#include "SequenceToOffsetTable.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" + +using namespace llvm; + +namespace { + +struct DXILShaderModel { + int Major; + int Minor; +}; +struct DXILParam { + int Pos; // position in parameter list + StringRef Type; // llvm type name, $o for overload, $r for resource + // type, $cb for legacy cbuffer, $u4 for u4 struct + StringRef Name; // short, unique name + StringRef Doc; // the documentation description of this parameter + bool IsConst; // whether this argument requires a constant value in the IR + StringRef EnumName; // the name of the enum type if applicable + int MaxValue; // the maximum value for this parameter if applicable + DXILParam(const Record *R) { + Name = R->getValueAsString("name"); + Pos = R->getValueAsInt("pos"); + Type = R->getValueAsString("llvm_type"); + if (R->getValue("doc")) + Doc = R->getValueAsString("doc"); + IsConst = R->getValueAsBit("is_const"); + EnumName = R->getValueAsString("enum_name"); + MaxValue = R->getValueAsInt("max_value"); + } +}; + +struct DXILOperationData { + StringRef Name; // short, unique name + + StringRef DXILOp; // name of DXIL operation + int DXILOpID; // ID of DXIL operation + StringRef DXILClass; // name of the opcode class + StringRef Category; // classification for this instruction + StringRef Doc; // the documentation description of this instruction + + SmallVector Params; // the operands that this instruction takes + StringRef OverloadTypes; // overload types if applicable + StringRef FnAttr; // attribute shorthands: rn=does not access + // memory,ro=only reads from memory + StringRef Intrinsic; // The llvm intrinsic map to DXILOp. Default is "" which + // means no map exist + bool IsDeriv; // whether this is some kind of derivative + bool IsGradient; // whether this requires a gradient calculation + bool IsFeedback; // whether this is a sampler feedback op + bool IsWave; // whether this requires in-wave, cross-lane functionality + bool RequiresUniformInputs; // whether this operation requires that all + // of its inputs are uniform across the wave + SmallVector + ShaderStages; // shader stages to which this applies, empty for all. + DXILShaderModel ShaderModel; // minimum shader model required + DXILShaderModel ShaderModelTranslated; // minimum shader model required with + // translation by linker + SmallVector counters; // counters for this inst. + DXILOperationData(const Record *R) { + Name = R->getValueAsString("name"); + DXILOp = R->getValueAsString("dxil_op"); + DXILOpID = R->getValueAsInt("dxil_opid"); + DXILClass = R->getValueAsDef("op_class")->getValueAsString("name"); + Category = R->getValueAsDef("category")->getValueAsString("name"); + + if (R->getValue("llvm_intrinsic")) { + auto *IntrinsicDef = R->getValueAsDef("llvm_intrinsic"); + auto DefName = IntrinsicDef->getName(); + assert(DefName.startswith("int_") && "invalid intrinsic name"); + // Remove the int_ from intrinsic name. 
+      Intrinsic = DefName.substr(4);
+    }
+
+    Doc = R->getValueAsString("doc");
+
+    ListInit *ParamList = R->getValueAsListInit("ops");
+    for (unsigned i = 0; i < ParamList->size(); ++i) {
+      Record *Param = ParamList->getElementAsRecord(i);
+      Params.emplace_back(DXILParam(Param));
+    }
+    OverloadTypes = R->getValueAsString("oload_types");
+    FnAttr = R->getValueAsString("fn_attr");
+  }
+};
+} // end anonymous namespace
+
+static void emitDXILOpEnum(DXILOperationData &DXILOp, raw_ostream &OS) {
+  // Name = ID, // Doc
+  OS << DXILOp.Name << " = " << DXILOp.DXILOpID << ", // " << DXILOp.Doc
+     << "\n";
+}
+
+static std::string buildCategoryStr(StringSet<> &Categories) {
+  std::string Str;
+  raw_string_ostream OS(Str);
+  for (auto &It : Categories) {
+    OS << " " << It.getKey();
+  }
+  return OS.str();
+}
+
+// Emit enum declaration for DXIL.
+static void emitDXILEnums(std::vector<DXILOperationData> &DXILOps,
+                          raw_ostream &OS) {
+  // Sort by Category + OpName.
+  std::sort(DXILOps.begin(), DXILOps.end(),
+            [](DXILOperationData &A, DXILOperationData &B) {
+              // Group by Category first.
+              if (A.Category == B.Category)
+                // Inside same Category, order by OpName.
+                return A.DXILOp < B.DXILOp;
+              else
+                return A.Category < B.Category;
+            });
+
+  OS << "// Enumeration for operations specified by DXIL\n";
+  OS << "enum class OpCode : unsigned {\n";
+
+  StringMap<StringSet<>> ClassMap;
+  StringRef PrevCategory = "";
+  for (auto &DXILOp : DXILOps) {
+    StringRef Category = DXILOp.Category;
+    if (Category != PrevCategory) {
+      OS << "\n// " << Category << "\n";
+      PrevCategory = Category;
+    }
+    emitDXILOpEnum(DXILOp, OS);
+    auto It = ClassMap.find(DXILOp.DXILClass);
+    if (It != ClassMap.end()) {
+      It->second.insert(DXILOp.Category);
+    } else {
+      ClassMap[DXILOp.DXILClass].insert(DXILOp.Category);
+    }
+  }
+
+  OS << "\n};\n\n";
+
+  std::vector<std::pair<std::string, std::string>> ClassVec;
+  for (auto &It : ClassMap) {
+    ClassVec.emplace_back(
+        std::make_pair(It.getKey().str(), buildCategoryStr(It.second)));
+  }
+  // Sort by Category + ClassName.
+  std::sort(ClassVec.begin(), ClassVec.end(),
+            [](std::pair<std::string, std::string> &A,
+               std::pair<std::string, std::string> &B) {
+              StringRef ClassA = A.first;
+              StringRef CategoryA = A.second;
+              StringRef ClassB = B.first;
+              StringRef CategoryB = B.second;
+              // Group by Category first.
+              if (CategoryA == CategoryB)
+                // Inside same Category, order by ClassName.
+                return ClassA < ClassB;
+              else
+                return CategoryA < CategoryB;
+            });
+
+  OS << "// Groups for DXIL operations with equivalent function templates\n";
+  OS << "enum class OpCodeClass : unsigned {\n";
+  PrevCategory = "";
+  for (auto &It : ClassVec) {
+
+    StringRef Category = It.second;
+    if (Category != PrevCategory) {
+      OS << "\n// " << Category << "\n";
+      PrevCategory = Category;
+    }
+    StringRef Name = It.first;
+    OS << Name << ",\n";
+  }
+  OS << "\n};\n\n";
+}
+
+// Emit map from llvm intrinsic to DXIL operation.
+static void emitDXILIntrinsicMap(std::vector<DXILOperationData> &DXILOps,
+                                 raw_ostream &OS) {
+  OS << "\n";
+  // FIXME: use array instead of SmallDenseMap.
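The two std::sort calls above implement the same two-key ordering (group by Category, then order by name) with nested if/else. A standalone sketch of an equivalent comparator written with std::tie, which produces the same strict weak ordering (the Op struct is illustrative, not the backend's type):

#include <algorithm>
#include <string>
#include <tuple>
#include <vector>

struct Op {
  std::string Category, Name;
};

// Sort by (Category, Name): categories group together, names order within.
static void sortOps(std::vector<Op> &Ops) {
  std::sort(Ops.begin(), Ops.end(), [](const Op &A, const Op &B) {
    return std::tie(A.Category, A.Name) < std::tie(B.Category, B.Name);
  });
}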
+ OS << "static const SmallDenseMap LowerMap = " + "{\n"; + for (auto &DXILOp : DXILOps) { + if (DXILOp.Intrinsic.empty()) + continue; + // {Intrinsic::sin, DXIL::OpCode::Sin}, + OS << " { Intrinsic::" << DXILOp.Intrinsic + << ", DXIL::OpCode::" << DXILOp.DXILOp << "},\n"; + } + OS << "};\n"; + OS << "\n"; +} + +static std::string emitDXILOperationFnAttr(StringRef FnAttr) { + return StringSwitch(FnAttr) + .Case("rn", "Attribute::ReadNone") + .Case("ro", "Attribute::ReadOnly") + .Default("Attribute::None"); +} + +static std::string getOverloadKind(StringRef Overload) { + return StringSwitch(Overload) + .Case("half", "OverloadKind::HALF") + .Case("float", "OverloadKind::FLOAT") + .Case("double", "OverloadKind::DOUBLE") + .Case("i1", "OverloadKind::I1") + .Case("i16", "OverloadKind::I16") + .Case("i32", "OverloadKind::I32") + .Case("i64", "OverloadKind::I64") + .Case("udt", "OverloadKind::UserDefineType") + .Case("obj", "OverloadKind::ObjectType") + .Default("OverloadKind::VOID"); +} + +static std::string getDXILOperationOverload(StringRef Overloads) { + SmallVector OverloadStrs; + Overloads.split(OverloadStrs, ';', /*MaxSplit*/ -1, /*KeepEmpty*/ false); + // Format is: OverloadKind::FLOAT | OverloadKind::HALF + assert(!OverloadStrs.empty() && "Invalid overloads"); + auto It = OverloadStrs.begin(); + std::string Result; + raw_string_ostream OS(Result); + OS << getOverloadKind(*It); + for (++It; It != OverloadStrs.end(); ++It) { + OS << " | " << getOverloadKind(*It); + } + return OS.str(); +} + +static std::string lowerFirstLetter(StringRef Name) { + if (Name.empty()) + return ""; + + std::string LowerName = Name.str(); + LowerName[0] = llvm::toLower(Name[0]); + return LowerName; +} + +static std::string getDXILOpClassName(StringRef DXILOpClass) { + // Lower first letter expect for special case. + return StringSwitch(DXILOpClass) + .Case("CBufferLoad", "cbufferLoad") + .Case("CBufferLoadLegacy", "cbufferLoadLegacy") + .Case("GSInstanceID", "gsInstanceID") + .Default(lowerFirstLetter(DXILOpClass)); +} + +static void emitDXILOperationTable(std::vector &DXILOps, + raw_ostream &OS) { + // Sort by DXILOpID. + std::sort(DXILOps.begin(), DXILOps.end(), + [](DXILOperationData &A, DXILOperationData &B) { + return A.DXILOpID < B.DXILOpID; + }); + + // Collect Names. + SequenceToOffsetTable OpClassStrings; + SequenceToOffsetTable OpStrings; + + StringSet<> ClassSet; + for (auto &DXILOp : DXILOps) { + OpStrings.add(DXILOp.DXILOp.str()); + + if (ClassSet.find(DXILOp.DXILClass) != ClassSet.end()) + continue; + ClassSet.insert(DXILOp.DXILClass); + OpClassStrings.add(getDXILOpClassName(DXILOp.DXILClass)); + } + + // Layout names. + OpStrings.layout(); + OpClassStrings.layout(); + + // Emit the DXIL operation table. 
+ //{DXIL::OpCode::Sin, OpCodeNameIndex, OpCodeClass::Unary, + // OpCodeClassNameIndex, + // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone}, + OS << "static const OpCodeProperty *getOpCodeProperty(DXIL::OpCode DXILOp) " + "{\n"; + + OS << " static const OpCodeProperty OpCodeProps[] = {\n"; + for (auto &DXILOp : DXILOps) { + OS << " { DXIL::OpCode::" << DXILOp.DXILOp << ", " + << OpStrings.get(DXILOp.DXILOp.str()) + << ", OpCodeClass::" << DXILOp.DXILClass << ", " + << OpClassStrings.get(getDXILOpClassName(DXILOp.DXILClass)) << ", " + << getDXILOperationOverload(DXILOp.OverloadTypes) << ", " + << emitDXILOperationFnAttr(DXILOp.FnAttr) << " },\n"; + } + OS << " };\n"; + + OS << " // FIXME: change search to indexing with\n"; + OS << " // DXILOp once all DXIL op is added.\n"; + OS << " OpCodeProperty TmpProp;\n"; + OS << " TmpProp.OpCode = DXILOp;\n"; + OS << " const OpCodeProperty *Prop =\n"; + OS << " llvm::lower_bound(OpCodeProps, TmpProp,\n"; + OS << " [](const OpCodeProperty &A, const " + "OpCodeProperty &B) {\n"; + OS << " return A.OpCode < B.OpCode;\n"; + OS << " });\n"; + OS << " assert(Prop && \"fail to find OpCodeProperty\");\n"; + OS << " return Prop;\n"; + OS << "}\n\n"; + + // Emit the string tables. + OS << "static const char *getOpCodeName(DXIL::OpCode DXILOp) {\n\n"; + + OpStrings.emitStringLiteralDef(OS, + " static const char DXILOpCodeNameTable[]"); + + OS << " auto *Prop = getOpCodeProperty(DXILOp);\n"; + OS << " unsigned Index = Prop->OpCodeNameOffset;\n"; + OS << " return DXILOpCodeNameTable + Index;\n"; + OS << "}\n\n"; + + OS << "static const char *getOpCodeClassName(const OpCodeProperty &Prop) " + "{\n\n"; + + OpClassStrings.emitStringLiteralDef( + OS, " static const char DXILOpCodeClassNameTable[]"); + + OS << " unsigned Index = Prop.OpCodeClassNameOffset;\n"; + OS << " return DXILOpCodeClassNameTable + Index;\n"; + OS << "}\n "; +} + +namespace llvm { + +void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) { + std::vector Ops = Records.getAllDerivedDefinitions("dxil_op"); + OS << "// Generated code, do not edit.\n"; + OS << "\n"; + + std::vector DXILOps; + DXILOps.reserve(Ops.size()); + for (auto *Record : Ops) { + DXILOps.emplace_back(DXILOperationData(Record)); + } + + OS << "#ifdef DXIL_OP_ENUM\n"; + emitDXILEnums(DXILOps, OS); + OS << "#endif\n\n"; + + OS << "#ifdef DXIL_OP_INTRINSIC_MAP\n"; + emitDXILIntrinsicMap(DXILOps, OS); + OS << "#endif\n\n"; + + OS << "#ifdef DXIL_OP_OPERATION_TABLE\n"; + emitDXILOperationTable(DXILOps, OS); + OS << "#endif\n\n"; + + OS << "\n"; +} + +} // namespace llvm diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp new file mode 100644 index 000000000000..8477e0639f90 --- /dev/null +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -0,0 +1,2705 @@ +//===---------------- DecoderEmitter.cpp - Decoder Generator --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// It contains the tablegen backend that emits the decoder functions for +// targets with fixed/variable length instruction set. 
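EmitDXILOperation above brackets each generated section with a preprocessor guard (DXIL_OP_ENUM, DXIL_OP_INTRINSIC_MAP, DXIL_OP_OPERATION_TABLE) so a single emitted file can serve several consumers. A hypothetical consumer selects a section by defining the matching macro before inclusion; the .inc file name here is an assumption for illustration, not taken from this patch:

// Expand only the OpCode/OpCodeClass enums from the generated file.
#define DXIL_OP_ENUM
#include "DXILOperation.inc"
#undef DXIL_OP_ENUM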
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenInstruction.h"
+#include "CodeGenTarget.h"
+#include "InfoByHwMode.h"
+#include "VarLenCodeEmitterGen.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/CachedHashString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCDecoderOps.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "decoder-emitter"
+
+namespace {
+
+STATISTIC(NumEncodings, "Number of encodings considered");
+STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info");
+STATISTIC(NumInstructions, "Number of instructions considered");
+STATISTIC(NumEncodingsSupported, "Number of encodings supported");
+STATISTIC(NumEncodingsOmitted, "Number of encodings omitted");
+
+struct EncodingField {
+  unsigned Base, Width, Offset;
+  EncodingField(unsigned B, unsigned W, unsigned O)
+      : Base(B), Width(W), Offset(O) { }
+};
+
+struct OperandInfo {
+  std::vector<EncodingField> Fields;
+  std::string Decoder;
+  bool HasCompleteDecoder;
+  uint64_t InitValue;
+
+  OperandInfo(std::string D, bool HCD)
+      : Decoder(std::move(D)), HasCompleteDecoder(HCD), InitValue(0) {}
+
+  void addField(unsigned Base, unsigned Width, unsigned Offset) {
+    Fields.push_back(EncodingField(Base, Width, Offset));
+  }
+
+  unsigned numFields() const { return Fields.size(); }
+
+  typedef std::vector<EncodingField>::const_iterator const_iterator;
+
+  const_iterator begin() const { return Fields.begin(); }
+  const_iterator end() const { return Fields.end(); }
+};
+
+typedef std::vector<uint8_t> DecoderTable;
+typedef uint32_t DecoderFixup;
+typedef std::vector<DecoderFixup> FixupList;
+typedef std::vector<FixupList> FixupScopeList;
+typedef SmallSetVector<CachedHashString, 16> PredicateSet;
+typedef SmallSetVector<CachedHashString, 16> DecoderSet;
+struct DecoderTableInfo {
+  DecoderTable Table;
+  FixupScopeList FixupStack;
+  PredicateSet Predicates;
+  DecoderSet Decoders;
+};
+
+struct EncodingAndInst {
+  const Record *EncodingDef;
+  const CodeGenInstruction *Inst;
+  StringRef HwModeName;
+
+  EncodingAndInst(const Record *EncodingDef, const CodeGenInstruction *Inst,
+                  StringRef HwModeName = "")
+      : EncodingDef(EncodingDef), Inst(Inst), HwModeName(HwModeName) {}
+};
+
+struct EncodingIDAndOpcode {
+  unsigned EncodingID;
+  unsigned Opcode;
+
+  EncodingIDAndOpcode() : EncodingID(0), Opcode(0) {}
+  EncodingIDAndOpcode(unsigned EncodingID, unsigned Opcode)
+      : EncodingID(EncodingID), Opcode(Opcode) {}
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const EncodingAndInst &Value) {
+  if (Value.EncodingDef != Value.Inst->TheDef)
+    OS << Value.EncodingDef->getName() << ":";
+  OS << Value.Inst->TheDef->getName();
+  return OS;
+}
+
+class DecoderEmitter {
+  RecordKeeper &RK;
+  std::vector<EncodingAndInst> NumberedEncodings;
+
+public:
+  // Defaults preserved here for documentation, even though they aren't
+  // strictly necessary given the way that this is currently being called.
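An OperandInfo above is a list of EncodingField (Base, Width, Offset) triples recording where an operand's bits sit in the encoded instruction. At decode time each field contributes Width bits read at Base and placed at Offset, which is what the emitted fieldFromInstruction/insertBits calls later in this file do. A standalone sketch of one field's contribution:

#include <cstdint>

// Extract Width bits of Insn starting at Base and place them at Offset.
static uint64_t applyField(uint64_t Insn, unsigned Base, unsigned Width,
                           unsigned Offset) {
  uint64_t Mask = (Width >= 64) ? ~0ULL : ((1ULL << Width) - 1);
  return ((Insn >> Base) & Mask) << Offset;
}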
+ DecoderEmitter(RecordKeeper &R, std::string PredicateNamespace, + std::string GPrefix = "if (", + std::string GPostfix = " == MCDisassembler::Fail)", + std::string ROK = "MCDisassembler::Success", + std::string RFail = "MCDisassembler::Fail", std::string L = "") + : RK(R), Target(R), PredicateNamespace(std::move(PredicateNamespace)), + GuardPrefix(std::move(GPrefix)), GuardPostfix(std::move(GPostfix)), + ReturnOK(std::move(ROK)), ReturnFail(std::move(RFail)), + Locals(std::move(L)) {} + + // Emit the decoder state machine table. + void emitTable(formatted_raw_ostream &o, DecoderTable &Table, + unsigned Indentation, unsigned BitWidth, + StringRef Namespace) const; + void emitInstrLenTable(formatted_raw_ostream &OS, + std::vector &InstrLen) const; + void emitPredicateFunction(formatted_raw_ostream &OS, + PredicateSet &Predicates, + unsigned Indentation) const; + void emitDecoderFunction(formatted_raw_ostream &OS, + DecoderSet &Decoders, + unsigned Indentation) const; + + // run - Output the code emitter + void run(raw_ostream &o); + +private: + CodeGenTarget Target; + +public: + std::string PredicateNamespace; + std::string GuardPrefix, GuardPostfix; + std::string ReturnOK, ReturnFail; + std::string Locals; +}; + +} // end anonymous namespace + +// The set (BIT_TRUE, BIT_FALSE, BIT_UNSET) represents a ternary logic system +// for a bit value. +// +// BIT_UNFILTERED is used as the init value for a filter position. It is used +// only for filter processings. +typedef enum { + BIT_TRUE, // '1' + BIT_FALSE, // '0' + BIT_UNSET, // '?' + BIT_UNFILTERED // unfiltered +} bit_value_t; + +static bool ValueSet(bit_value_t V) { + return (V == BIT_TRUE || V == BIT_FALSE); +} + +static bool ValueNotSet(bit_value_t V) { + return (V == BIT_UNSET); +} + +static int Value(bit_value_t V) { + return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1); +} + +static bit_value_t bitFromBits(const BitsInit &bits, unsigned index) { + if (BitInit *bit = dyn_cast(bits.getBit(index))) + return bit->getValue() ? BIT_TRUE : BIT_FALSE; + + // The bit is uninitialized. + return BIT_UNSET; +} + +// Prints the bit value for each position. +static void dumpBits(raw_ostream &o, const BitsInit &bits) { + for (unsigned index = bits.getNumBits(); index > 0; --index) { + switch (bitFromBits(bits, index - 1)) { + case BIT_TRUE: + o << "1"; + break; + case BIT_FALSE: + o << "0"; + break; + case BIT_UNSET: + o << "_"; + break; + default: + llvm_unreachable("unexpected return value from bitFromBits"); + } + } +} + +static BitsInit &getBitsField(const Record &def, StringRef str) { + const RecordVal *RV = def.getValue(str); + if (BitsInit *Bits = dyn_cast(RV->getValue())) + return *Bits; + + // variable length instruction + VarLenInst VLI = VarLenInst(cast(RV->getValue()), RV); + SmallVector Bits; + + for (auto &SI : VLI) { + if (const BitsInit *BI = dyn_cast(SI.Value)) { + for (unsigned Idx = 0U; Idx < BI->getNumBits(); ++Idx) { + Bits.push_back(BI->getBit(Idx)); + } + } else if (const BitInit *BI = dyn_cast(SI.Value)) { + Bits.push_back(const_cast(BI)); + } else { + for (unsigned Idx = 0U; Idx < SI.BitWidth; ++Idx) + Bits.push_back(UnsetInit::get(def.getRecords())); + } + } + + return *BitsInit::get(def.getRecords(), Bits); +} + +// Representation of the instruction to work on. +typedef std::vector insn_t; + +namespace { + +static const uint64_t NO_FIXED_SEGMENTS_SENTINEL = -1ULL; + +class FilterChooser; + +/// Filter - Filter works with FilterChooser to produce the decoding tree for +/// the ISA. 
+/// +/// It is useful to think of a Filter as governing the switch stmts of the +/// decoding tree in a certain level. Each case stmt delegates to an inferior +/// FilterChooser to decide what further decoding logic to employ, or in another +/// words, what other remaining bits to look at. The FilterChooser eventually +/// chooses a best Filter to do its job. +/// +/// This recursive scheme ends when the number of Opcodes assigned to the +/// FilterChooser becomes 1 or if there is a conflict. A conflict happens when +/// the Filter/FilterChooser combo does not know how to distinguish among the +/// Opcodes assigned. +/// +/// An example of a conflict is +/// +/// Conflict: +/// 111101000.00........00010000.... +/// 111101000.00........0001........ +/// 1111010...00........0001........ +/// 1111010...00.................... +/// 1111010......................... +/// 1111............................ +/// ................................ +/// VST4q8a 111101000_00________00010000____ +/// VST4q8b 111101000_00________00010000____ +/// +/// The Debug output shows the path that the decoding tree follows to reach the +/// the conclusion that there is a conflict. VST4q8a is a vst4 to double-spaced +/// even registers, while VST4q8b is a vst4 to double-spaced odd registers. +/// +/// The encoding info in the .td files does not specify this meta information, +/// which could have been used by the decoder to resolve the conflict. The +/// decoder could try to decode the even/odd register numbering and assign to +/// VST4q8a or VST4q8b, but for the time being, the decoder chooses the "a" +/// version and return the Opcode since the two have the same Asm format string. +class Filter { +protected: + const FilterChooser *Owner;// points to the FilterChooser who owns this filter + unsigned StartBit; // the starting bit position + unsigned NumBits; // number of bits to filter + bool Mixed; // a mixed region contains both set and unset bits + + // Map of well-known segment value to the set of uid's with that value. + std::map> + FilteredInstructions; + + // Set of uid's with non-constant segment values. + std::vector VariableInstructions; + + // Map of well-known segment value to its delegate. + std::map> FilterChooserMap; + + // Number of instructions which fall under FilteredInstructions category. + unsigned NumFiltered; + + // Keeps track of the last opcode in the filtered bucket. + EncodingIDAndOpcode LastOpcFiltered; + +public: + Filter(Filter &&f); + Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, bool mixed); + + ~Filter() = default; + + unsigned getNumFiltered() const { return NumFiltered; } + + EncodingIDAndOpcode getSingletonOpc() const { + assert(NumFiltered == 1); + return LastOpcFiltered; + } + + // Return the filter chooser for the group of instructions without constant + // segment values. + const FilterChooser &getVariableFC() const { + assert(NumFiltered == 1); + assert(FilterChooserMap.size() == 1); + return *(FilterChooserMap.find(NO_FIXED_SEGMENTS_SENTINEL)->second); + } + + // Divides the decoding task into sub tasks and delegates them to the + // inferior FilterChooser's. + // + // A special case arises when there's only one entry in the filtered + // instructions. In order to unambiguously decode the singleton, we need to + // match the remaining undecoded encoding bits against the singleton. + void recurse(); + + // Emit table entries to decode instructions given a segment or segments of + // bits. 
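The Filter described above buckets instructions by the value of the chosen bit segment when every bit in that segment is well-known, and sets the rest aside for a fallthrough chooser. A toy standalone version of that partitioning, with encodings reduced to a bits/known-mask pair (the real code walks TableGen BitsInit values instead):

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct Enc {
  uint64_t Bits;      // bit values, where known
  uint64_t KnownMask; // which bit positions are well-known
};

// Partition encodings on the field Insn{StartBit + NumBits - 1 : StartBit}.
static void bucketize(const std::vector<Enc> &Encs, unsigned StartBit,
                      unsigned NumBits,
                      std::map<uint64_t, std::vector<size_t>> &Filtered,
                      std::vector<size_t> &Variable) {
  uint64_t FieldMask = ((NumBits >= 64) ? ~0ULL : ((1ULL << NumBits) - 1))
                       << StartBit;
  for (size_t ID = 0; ID != Encs.size(); ++ID) {
    if ((Encs[ID].KnownMask & FieldMask) == FieldMask)
      Filtered[(Encs[ID].Bits & FieldMask) >> StartBit].push_back(ID);
    else
      Variable.push_back(ID); // some segment bit is unspecified
  }
}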
+ void emitTableEntry(DecoderTableInfo &TableInfo) const; + + // Returns the number of fanout produced by the filter. More fanout implies + // the filter distinguishes more categories of instructions. + unsigned usefulness() const; +}; // end class Filter + +} // end anonymous namespace + +// These are states of our finite state machines used in FilterChooser's +// filterProcessor() which produces the filter candidates to use. +typedef enum { + ATTR_NONE, + ATTR_FILTERED, + ATTR_ALL_SET, + ATTR_ALL_UNSET, + ATTR_MIXED +} bitAttr_t; + +/// FilterChooser - FilterChooser chooses the best filter among a set of Filters +/// in order to perform the decoding of instructions at the current level. +/// +/// Decoding proceeds from the top down. Based on the well-known encoding bits +/// of instructions available, FilterChooser builds up the possible Filters that +/// can further the task of decoding by distinguishing among the remaining +/// candidate instructions. +/// +/// Once a filter has been chosen, it is called upon to divide the decoding task +/// into sub-tasks and delegates them to its inferior FilterChoosers for further +/// processings. +/// +/// It is useful to think of a Filter as governing the switch stmts of the +/// decoding tree. And each case is delegated to an inferior FilterChooser to +/// decide what further remaining bits to look at. +namespace { + +class FilterChooser { +protected: + friend class Filter; + + // Vector of codegen instructions to choose our filter. + ArrayRef AllInstructions; + + // Vector of uid's for this filter chooser to work on. + // The first member of the pair is the opcode id being decoded, the second is + // the opcode id that should be emitted. + const std::vector &Opcodes; + + // Lookup table for the operand decoding of instructions. + const std::map> &Operands; + + // Vector of candidate filters. + std::vector Filters; + + // Array of bit values passed down from our parent. + // Set to all BIT_UNFILTERED's for Parent == NULL. + std::vector FilterBitValues; + + // Links to the FilterChooser above us in the decoding tree. + const FilterChooser *Parent; + + // Index of the best filter from Filters. + int BestIndex; + + // Width of instructions + unsigned BitWidth; + + // Parent emitter + const DecoderEmitter *Emitter; + +public: + FilterChooser(ArrayRef Insts, + const std::vector &IDs, + const std::map> &Ops, + unsigned BW, const DecoderEmitter *E) + : AllInstructions(Insts), Opcodes(IDs), Operands(Ops), + FilterBitValues(BW, BIT_UNFILTERED), Parent(nullptr), BestIndex(-1), + BitWidth(BW), Emitter(E) { + doFilter(); + } + + FilterChooser(ArrayRef Insts, + const std::vector &IDs, + const std::map> &Ops, + const std::vector &ParentFilterBitValues, + const FilterChooser &parent) + : AllInstructions(Insts), Opcodes(IDs), Operands(Ops), + FilterBitValues(ParentFilterBitValues), Parent(&parent), BestIndex(-1), + BitWidth(parent.BitWidth), Emitter(parent.Emitter) { + doFilter(); + } + + FilterChooser(const FilterChooser &) = delete; + void operator=(const FilterChooser &) = delete; + + unsigned getBitWidth() const { return BitWidth; } + +protected: + // Populates the insn given the uid. + void insnWithID(insn_t &Insn, unsigned Opcode) const { + BitsInit &Bits = getBitsField(*AllInstructions[Opcode].EncodingDef, "Inst"); + Insn.resize(BitWidth > Bits.getNumBits() ? 
BitWidth : Bits.getNumBits(), + BIT_UNSET); + // We may have a SoftFail bitmask, which specifies a mask where an encoding + // may differ from the value in "Inst" and yet still be valid, but the + // disassembler should return SoftFail instead of Success. + // + // This is used for marking UNPREDICTABLE instructions in the ARM world. + const RecordVal *RV = + AllInstructions[Opcode].EncodingDef->getValue("SoftFail"); + const BitsInit *SFBits = RV ? dyn_cast(RV->getValue()) : nullptr; + for (unsigned i = 0; i < Bits.getNumBits(); ++i) { + if (SFBits && bitFromBits(*SFBits, i) == BIT_TRUE) + Insn[i] = BIT_UNSET; + else + Insn[i] = bitFromBits(Bits, i); + } + } + + // Emit the name of the encoding/instruction pair. + void emitNameWithID(raw_ostream &OS, unsigned Opcode) const { + const Record *EncodingDef = AllInstructions[Opcode].EncodingDef; + const Record *InstDef = AllInstructions[Opcode].Inst->TheDef; + if (EncodingDef != InstDef) + OS << EncodingDef->getName() << ":"; + OS << InstDef->getName(); + } + + // Populates the field of the insn given the start position and the number of + // consecutive bits to scan for. + // + // Returns false if there exists any uninitialized bit value in the range. + // Returns true, otherwise. + bool fieldFromInsn(uint64_t &Field, insn_t &Insn, unsigned StartBit, + unsigned NumBits) const; + + /// dumpFilterArray - dumpFilterArray prints out debugging info for the given + /// filter array as a series of chars. + void dumpFilterArray(raw_ostream &o, + const std::vector & filter) const; + + /// dumpStack - dumpStack traverses the filter chooser chain and calls + /// dumpFilterArray on each filter chooser up to the top level one. + void dumpStack(raw_ostream &o, const char *prefix) const; + + Filter &bestFilter() { + assert(BestIndex != -1 && "BestIndex not set"); + return Filters[BestIndex]; + } + + bool PositionFiltered(unsigned i) const { + return ValueSet(FilterBitValues[i]); + } + + // Calculates the island(s) needed to decode the instruction. + // This returns a lit of undecoded bits of an instructions, for example, + // Inst{20} = 1 && Inst{3-0} == 0b1111 represents two islands of yet-to-be + // decoded bits in order to verify that the instruction matches the Opcode. + unsigned getIslands(std::vector &StartBits, + std::vector &EndBits, + std::vector &FieldVals, + const insn_t &Insn) const; + + // Emits code to check the Predicates member of an instruction are true. + // Returns true if predicate matches were emitted, false otherwise. + bool emitPredicateMatch(raw_ostream &o, unsigned &Indentation, + unsigned Opc) const; + bool emitPredicateMatchAux(const Init &Val, bool ParenIfBinOp, + raw_ostream &OS) const; + + bool doesOpcodeNeedPredicate(unsigned Opc) const; + unsigned getPredicateIndex(DecoderTableInfo &TableInfo, StringRef P) const; + void emitPredicateTableEntry(DecoderTableInfo &TableInfo, + unsigned Opc) const; + + void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, + unsigned Opc) const; + + // Emits table entries to decode the singleton. + void emitSingletonTableEntry(DecoderTableInfo &TableInfo, + EncodingIDAndOpcode Opc) const; + + // Emits code to decode the singleton, and then to decode the rest. 
+ void emitSingletonTableEntry(DecoderTableInfo &TableInfo, + const Filter &Best) const; + + void emitBinaryParser(raw_ostream &o, unsigned &Indentation, + const OperandInfo &OpInfo, + bool &OpHasCompleteDecoder) const; + + void emitDecoder(raw_ostream &OS, unsigned Indentation, unsigned Opc, + bool &HasCompleteDecoder) const; + unsigned getDecoderIndex(DecoderSet &Decoders, unsigned Opc, + bool &HasCompleteDecoder) const; + + // Assign a single filter and run with it. + void runSingleFilter(unsigned startBit, unsigned numBit, bool mixed); + + // reportRegion is a helper function for filterProcessor to mark a region as + // eligible for use as a filter region. + void reportRegion(bitAttr_t RA, unsigned StartBit, unsigned BitIndex, + bool AllowMixed); + + // FilterProcessor scans the well-known encoding bits of the instructions and + // builds up a list of candidate filters. It chooses the best filter and + // recursively descends down the decoding tree. + bool filterProcessor(bool AllowMixed, bool Greedy = true); + + // Decides on the best configuration of filter(s) to use in order to decode + // the instructions. A conflict of instructions may occur, in which case we + // dump the conflict set to the standard error. + void doFilter(); + +public: + // emitTableEntries - Emit state machine entries to decode our share of + // instructions. + void emitTableEntries(DecoderTableInfo &TableInfo) const; +}; + +} // end anonymous namespace + +/////////////////////////// +// // +// Filter Implementation // +// // +/////////////////////////// + +Filter::Filter(Filter &&f) + : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), + FilteredInstructions(std::move(f.FilteredInstructions)), + VariableInstructions(std::move(f.VariableInstructions)), + FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered), + LastOpcFiltered(f.LastOpcFiltered) { +} + +Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, + bool mixed) + : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { + assert(StartBit + NumBits - 1 < Owner->BitWidth); + + NumFiltered = 0; + LastOpcFiltered = {0, 0}; + + for (unsigned i = 0, e = Owner->Opcodes.size(); i != e; ++i) { + insn_t Insn; + + // Populates the insn given the uid. + Owner->insnWithID(Insn, Owner->Opcodes[i].EncodingID); + + uint64_t Field; + // Scans the segment for possibly well-specified encoding bits. + bool ok = Owner->fieldFromInsn(Field, Insn, StartBit, NumBits); + + if (ok) { + // The encoding bits are well-known. Lets add the uid of the + // instruction into the bucket keyed off the constant field value. + LastOpcFiltered = Owner->Opcodes[i]; + FilteredInstructions[Field].push_back(LastOpcFiltered); + ++NumFiltered; + } else { + // Some of the encoding bit(s) are unspecified. This contributes to + // one additional member of "Variable" instructions. + VariableInstructions.push_back(Owner->Opcodes[i]); + } + } + + assert((FilteredInstructions.size() + VariableInstructions.size() > 0) + && "Filter returns no instruction categories"); +} + +// Divides the decoding task into sub tasks and delegates them to the +// inferior FilterChooser's. +// +// A special case arises when there's only one entry in the filtered +// instructions. In order to unambiguously decode the singleton, we need to +// match the remaining undecoded encoding bits against the singleton. +void Filter::recurse() { + // Starts by inheriting our parent filter chooser's filter bit values. 
+ std::vector BitValueArray(Owner->FilterBitValues); + + if (!VariableInstructions.empty()) { + // Conservatively marks each segment position as BIT_UNSET. + for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) + BitValueArray[StartBit + bitIndex] = BIT_UNSET; + + // Delegates to an inferior filter chooser for further processing on this + // group of instructions whose segment values are variable. + FilterChooserMap.insert(std::make_pair(NO_FIXED_SEGMENTS_SENTINEL, + std::make_unique(Owner->AllInstructions, + VariableInstructions, Owner->Operands, BitValueArray, *Owner))); + } + + // No need to recurse for a singleton filtered instruction. + // See also Filter::emit*(). + if (getNumFiltered() == 1) { + assert(FilterChooserMap.size() == 1); + return; + } + + // Otherwise, create sub choosers. + for (const auto &Inst : FilteredInstructions) { + + // Marks all the segment positions with either BIT_TRUE or BIT_FALSE. + for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) { + if (Inst.first & (1ULL << bitIndex)) + BitValueArray[StartBit + bitIndex] = BIT_TRUE; + else + BitValueArray[StartBit + bitIndex] = BIT_FALSE; + } + + // Delegates to an inferior filter chooser for further processing on this + // category of instructions. + FilterChooserMap.insert(std::make_pair( + Inst.first, std::make_unique( + Owner->AllInstructions, Inst.second, + Owner->Operands, BitValueArray, *Owner))); + } +} + +static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, + uint32_t DestIdx) { + // Any NumToSkip fixups in the current scope can resolve to the + // current location. + for (FixupList::const_reverse_iterator I = Fixups.rbegin(), + E = Fixups.rend(); + I != E; ++I) { + // Calculate the distance from the byte following the fixup entry byte + // to the destination. The Target is calculated from after the 16-bit + // NumToSkip entry itself, so subtract two from the displacement here + // to account for that. + uint32_t FixupIdx = *I; + uint32_t Delta = DestIdx - FixupIdx - 3; + // Our NumToSkip entries are 24-bits. Make sure our table isn't too + // big. + assert(Delta < (1u << 24)); + Table[FixupIdx] = (uint8_t)Delta; + Table[FixupIdx + 1] = (uint8_t)(Delta >> 8); + Table[FixupIdx + 2] = (uint8_t)(Delta >> 16); + } +} + +// Emit table entries to decode instructions given a segment or segments +// of bits. +void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { + TableInfo.Table.push_back(MCD::OPC_ExtractField); + TableInfo.Table.push_back(StartBit); + TableInfo.Table.push_back(NumBits); + + // A new filter entry begins a new scope for fixup resolution. + TableInfo.FixupStack.emplace_back(); + + DecoderTable &Table = TableInfo.Table; + + size_t PrevFilter = 0; + bool HasFallthrough = false; + for (auto &Filter : FilterChooserMap) { + // Field value -1 implies a non-empty set of variable instructions. + // See also recurse(). + if (Filter.first == NO_FIXED_SEGMENTS_SENTINEL) { + HasFallthrough = true; + + // Each scope should always have at least one filter value to check + // for. + assert(PrevFilter != 0 && "empty filter set!"); + FixupList &CurScope = TableInfo.FixupStack.back(); + // Resolve any NumToSkip fixups in the current scope. + resolveTableFixups(Table, CurScope, Table.size()); + CurScope.clear(); + PrevFilter = 0; // Don't re-process the filter's fallthrough. + } else { + Table.push_back(MCD::OPC_FilterValue); + // Encode and emit the value to filter against. 
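The filter value emitted below is ULEB128-encoded, which is why both the emission here and the table printer later in this file treat a set high bit as "more bytes follow". A standalone re-implementation of the encoding for illustration; the real code uses encodeULEB128 from llvm/Support/LEB128.h:

#include <cstdint>
#include <vector>

// Emit Value as ULEB128: 7 data bits per byte, high bit set on all but the
// last byte. Returns the number of bytes produced.
static unsigned emitULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0);
  return Count;
}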
+ uint8_t Buffer[16]; + unsigned Len = encodeULEB128(Filter.first, Buffer); + Table.insert(Table.end(), Buffer, Buffer + Len); + // Reserve space for the NumToSkip entry. We'll backpatch the value + // later. + PrevFilter = Table.size(); + Table.push_back(0); + Table.push_back(0); + Table.push_back(0); + } + + // We arrive at a category of instructions with the same segment value. + // Now delegate to the sub filter chooser for further decodings. + // The case may fallthrough, which happens if the remaining well-known + // encoding bits do not match exactly. + Filter.second->emitTableEntries(TableInfo); + + // Now that we've emitted the body of the handler, update the NumToSkip + // of the filter itself to be able to skip forward when false. Subtract + // two as to account for the width of the NumToSkip field itself. + if (PrevFilter) { + uint32_t NumToSkip = Table.size() - PrevFilter - 3; + assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!"); + Table[PrevFilter] = (uint8_t)NumToSkip; + Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); + Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); + } + } + + // Any remaining unresolved fixups bubble up to the parent fixup scope. + assert(TableInfo.FixupStack.size() > 1 && "fixup stack underflow!"); + FixupScopeList::iterator Source = TableInfo.FixupStack.end() - 1; + FixupScopeList::iterator Dest = Source - 1; + llvm::append_range(*Dest, *Source); + TableInfo.FixupStack.pop_back(); + + // If there is no fallthrough, then the final filter should get fixed + // up according to the enclosing scope rather than the current position. + if (!HasFallthrough) + TableInfo.FixupStack.back().push_back(PrevFilter); +} + +// Returns the number of fanout produced by the filter. More fanout implies +// the filter distinguishes more categories of instructions. +unsigned Filter::usefulness() const { + if (!VariableInstructions.empty()) + return FilteredInstructions.size(); + else + return FilteredInstructions.size() + 1; +} + +////////////////////////////////// +// // +// Filterchooser Implementation // +// // +////////////////////////////////// + +// Emit the decoder state machine table. +void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, + unsigned Indentation, unsigned BitWidth, + StringRef Namespace) const { + OS.indent(Indentation) << "static const uint8_t DecoderTable" << Namespace + << BitWidth << "[] = {\n"; + + Indentation += 2; + + // FIXME: We may be able to use the NumToSkip values to recover + // appropriate indentation levels. + DecoderTable::const_iterator I = Table.begin(); + DecoderTable::const_iterator E = Table.end(); + while (I != E) { + assert (I < E && "incomplete decode table entry!"); + + uint64_t Pos = I - Table.begin(); + OS << "/* " << Pos << " */"; + OS.PadToColumn(12); + + switch (*I) { + default: + PrintFatalError("invalid decode table opcode"); + case MCD::OPC_ExtractField: { + ++I; + unsigned Start = *I++; + unsigned Len = *I++; + OS.indent(Indentation) << "MCD::OPC_ExtractField, " << Start << ", " + << Len << ", // Inst{"; + if (Len > 1) + OS << (Start + Len - 1) << "-"; + OS << Start << "} ...\n"; + break; + } + case MCD::OPC_FilterValue: { + ++I; + OS.indent(Indentation) << "MCD::OPC_FilterValue, "; + // The filter value is ULEB128 encoded. + while (*I >= 128) + OS << (unsigned)*I++ << ", "; + OS << (unsigned)*I++ << ", "; + + // 24-bit numtoskip value. 
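The three bytes read next form the little-endian 24-bit NumToSkip field that resolveTableFixups above backpatches; the "16-bit ... subtract two" wording in that function's comment appears to predate the move to 24-bit entries, since the code stores three bytes and subtracts three. As a worked example, a fixup whose skip bytes start at table index 100 with destination index 160 stores 160 - 100 - 3 = 57. A sketch of one patch, mirroring the arithmetic above:

#include <cassert>
#include <cstdint>
#include <vector>

// Write a little-endian 24-bit skip, measured from just past the field.
static void patchNumToSkip(std::vector<uint8_t> &Table, uint32_t FixupIdx,
                           uint32_t DestIdx) {
  uint32_t Delta = DestIdx - FixupIdx - 3;
  assert(Delta < (1u << 24) && "decoding table too large");
  Table[FixupIdx] = static_cast<uint8_t>(Delta);
  Table[FixupIdx + 1] = static_cast<uint8_t>(Delta >> 8);
  Table[FixupIdx + 2] = static_cast<uint8_t>(Delta >> 16);
}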
+ uint8_t Byte = *I++; + uint32_t NumToSkip = Byte; + OS << (unsigned)Byte << ", "; + Byte = *I++; + OS << (unsigned)Byte << ", "; + NumToSkip |= Byte << 8; + Byte = *I++; + OS << utostr(Byte) << ", "; + NumToSkip |= Byte << 16; + OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; + break; + } + case MCD::OPC_CheckField: { + ++I; + unsigned Start = *I++; + unsigned Len = *I++; + OS.indent(Indentation) << "MCD::OPC_CheckField, " << Start << ", " + << Len << ", ";// << Val << ", " << NumToSkip << ",\n"; + // ULEB128 encoded field value. + for (; *I >= 128; ++I) + OS << (unsigned)*I << ", "; + OS << (unsigned)*I++ << ", "; + // 24-bit numtoskip value. + uint8_t Byte = *I++; + uint32_t NumToSkip = Byte; + OS << (unsigned)Byte << ", "; + Byte = *I++; + OS << (unsigned)Byte << ", "; + NumToSkip |= Byte << 8; + Byte = *I++; + OS << utostr(Byte) << ", "; + NumToSkip |= Byte << 16; + OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; + break; + } + case MCD::OPC_CheckPredicate: { + ++I; + OS.indent(Indentation) << "MCD::OPC_CheckPredicate, "; + for (; *I >= 128; ++I) + OS << (unsigned)*I << ", "; + OS << (unsigned)*I++ << ", "; + + // 24-bit numtoskip value. + uint8_t Byte = *I++; + uint32_t NumToSkip = Byte; + OS << (unsigned)Byte << ", "; + Byte = *I++; + OS << (unsigned)Byte << ", "; + NumToSkip |= Byte << 8; + Byte = *I++; + OS << utostr(Byte) << ", "; + NumToSkip |= Byte << 16; + OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; + break; + } + case MCD::OPC_Decode: + case MCD::OPC_TryDecode: { + bool IsTry = *I == MCD::OPC_TryDecode; + ++I; + // Extract the ULEB128 encoded Opcode to a buffer. + uint8_t Buffer[16], *p = Buffer; + while ((*p++ = *I++) >= 128) + assert((p - Buffer) <= (ptrdiff_t)sizeof(Buffer) + && "ULEB128 value too large!"); + // Decode the Opcode value. + unsigned Opc = decodeULEB128(Buffer); + OS.indent(Indentation) << "MCD::OPC_" << (IsTry ? "Try" : "") + << "Decode, "; + for (p = Buffer; *p >= 128; ++p) + OS << (unsigned)*p << ", "; + OS << (unsigned)*p << ", "; + + // Decoder index. + for (; *I >= 128; ++I) + OS << (unsigned)*I << ", "; + OS << (unsigned)*I++ << ", "; + + if (!IsTry) { + OS << "// Opcode: " << NumberedEncodings[Opc] << "\n"; + break; + } + + // Fallthrough for OPC_TryDecode. + + // 24-bit numtoskip value. 
+ uint8_t Byte = *I++; + uint32_t NumToSkip = Byte; + OS << (unsigned)Byte << ", "; + Byte = *I++; + OS << (unsigned)Byte << ", "; + NumToSkip |= Byte << 8; + Byte = *I++; + OS << utostr(Byte) << ", "; + NumToSkip |= Byte << 16; + + OS << "// Opcode: " << NumberedEncodings[Opc] + << ", skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; + break; + } + case MCD::OPC_SoftFail: { + ++I; + OS.indent(Indentation) << "MCD::OPC_SoftFail"; + // Positive mask + uint64_t Value = 0; + unsigned Shift = 0; + do { + OS << ", " << (unsigned)*I; + Value += (*I & 0x7f) << Shift; + Shift += 7; + } while (*I++ >= 128); + if (Value > 127) { + OS << " /* 0x"; + OS.write_hex(Value); + OS << " */"; + } + // Negative mask + Value = 0; + Shift = 0; + do { + OS << ", " << (unsigned)*I; + Value += (*I & 0x7f) << Shift; + Shift += 7; + } while (*I++ >= 128); + if (Value > 127) { + OS << " /* 0x"; + OS.write_hex(Value); + OS << " */"; + } + OS << ",\n"; + break; + } + case MCD::OPC_Fail: { + ++I; + OS.indent(Indentation) << "MCD::OPC_Fail,\n"; + break; + } + } + } + OS.indent(Indentation) << "0\n"; + + Indentation -= 2; + + OS.indent(Indentation) << "};\n\n"; +} + +void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS, + std::vector &InstrLen) const { + OS << "static const uint8_t InstrLenTable[] = {\n"; + for (unsigned &Len : InstrLen) { + OS << Len << ",\n"; + } + OS << "};\n\n"; +} + +void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, + PredicateSet &Predicates, + unsigned Indentation) const { + // The predicate function is just a big switch statement based on the + // input predicate index. + OS.indent(Indentation) << "static bool checkDecoderPredicate(unsigned Idx, " + << "const FeatureBitset &Bits) {\n"; + Indentation += 2; + if (!Predicates.empty()) { + OS.indent(Indentation) << "switch (Idx) {\n"; + OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; + unsigned Index = 0; + for (const auto &Predicate : Predicates) { + OS.indent(Indentation) << "case " << Index++ << ":\n"; + OS.indent(Indentation+2) << "return (" << Predicate << ");\n"; + } + OS.indent(Indentation) << "}\n"; + } else { + // No case statement to emit + OS.indent(Indentation) << "llvm_unreachable(\"Invalid index!\");\n"; + } + Indentation -= 2; + OS.indent(Indentation) << "}\n\n"; +} + +void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, + DecoderSet &Decoders, + unsigned Indentation) const { + // The decoder function is just a big switch statement based on the + // input decoder index. + OS.indent(Indentation) << "template \n"; + OS.indent(Indentation) << "static DecodeStatus decodeToMCInst(DecodeStatus S," + << " unsigned Idx, InsnType insn, MCInst &MI,\n"; + OS.indent(Indentation) + << " uint64_t " + << "Address, const MCDisassembler *Decoder, bool &DecodeComplete) {\n"; + Indentation += 2; + OS.indent(Indentation) << "DecodeComplete = true;\n"; + // TODO: When InsnType is large, using uint64_t limits all fields to 64 bits + // It would be better for emitBinaryParser to use a 64-bit tmp whenever + // possible but fall back to an InsnType-sized tmp for truly large fields. 
+ OS.indent(Indentation) << "using TmpType = " + "std::conditional_t::" + "value, InsnType, uint64_t>;\n"; + OS.indent(Indentation) << "TmpType tmp;\n"; + OS.indent(Indentation) << "switch (Idx) {\n"; + OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; + unsigned Index = 0; + for (const auto &Decoder : Decoders) { + OS.indent(Indentation) << "case " << Index++ << ":\n"; + OS << Decoder; + OS.indent(Indentation+2) << "return S;\n"; + } + OS.indent(Indentation) << "}\n"; + Indentation -= 2; + OS.indent(Indentation) << "}\n\n"; +} + +// Populates the field of the insn given the start position and the number of +// consecutive bits to scan for. +// +// Returns false if and on the first uninitialized bit value encountered. +// Returns true, otherwise. +bool FilterChooser::fieldFromInsn(uint64_t &Field, insn_t &Insn, + unsigned StartBit, unsigned NumBits) const { + Field = 0; + + for (unsigned i = 0; i < NumBits; ++i) { + if (Insn[StartBit + i] == BIT_UNSET) + return false; + + if (Insn[StartBit + i] == BIT_TRUE) + Field = Field | (1ULL << i); + } + + return true; +} + +/// dumpFilterArray - dumpFilterArray prints out debugging info for the given +/// filter array as a series of chars. +void FilterChooser::dumpFilterArray(raw_ostream &o, + const std::vector &filter) const { + for (unsigned bitIndex = BitWidth; bitIndex > 0; bitIndex--) { + switch (filter[bitIndex - 1]) { + case BIT_UNFILTERED: + o << "."; + break; + case BIT_UNSET: + o << "_"; + break; + case BIT_TRUE: + o << "1"; + break; + case BIT_FALSE: + o << "0"; + break; + } + } +} + +/// dumpStack - dumpStack traverses the filter chooser chain and calls +/// dumpFilterArray on each filter chooser up to the top level one. +void FilterChooser::dumpStack(raw_ostream &o, const char *prefix) const { + const FilterChooser *current = this; + + while (current) { + o << prefix; + dumpFilterArray(o, current->FilterBitValues); + o << '\n'; + current = current->Parent; + } +} + +// Calculates the island(s) needed to decode the instruction. +// This returns a list of undecoded bits of an instructions, for example, +// Inst{20} = 1 && Inst{3-0} == 0b1111 represents two islands of yet-to-be +// decoded bits in order to verify that the instruction matches the Opcode. +unsigned FilterChooser::getIslands(std::vector &StartBits, + std::vector &EndBits, + std::vector &FieldVals, + const insn_t &Insn) const { + unsigned Num, BitNo; + Num = BitNo = 0; + + uint64_t FieldVal = 0; + + // 0: Init + // 1: Water (the bit value does not affect decoding) + // 2: Island (well-known bit value needed for decoding) + int State = 0; + + for (unsigned i = 0; i < BitWidth; ++i) { + int64_t Val = Value(Insn[i]); + bool Filtered = PositionFiltered(i); + switch (State) { + default: llvm_unreachable("Unreachable code!"); + case 0: + case 1: + if (Filtered || Val == -1) + State = 1; // Still in Water + else { + State = 2; // Into the Island + BitNo = 0; + StartBits.push_back(i); + FieldVal = Val; + } + break; + case 2: + if (Filtered || Val == -1) { + State = 1; // Into the Water + EndBits.push_back(i - 1); + FieldVals.push_back(FieldVal); + ++Num; + } else { + State = 2; // Still in Island + ++BitNo; + FieldVal = FieldVal | Val << BitNo; + } + break; + } + } + // If we are still in Island after the loop, do some housekeeping. 
+ if (State == 2) { + EndBits.push_back(BitWidth - 1); + FieldVals.push_back(FieldVal); + ++Num; + } + + assert(StartBits.size() == Num && EndBits.size() == Num && + FieldVals.size() == Num); + return Num; +} + +void FilterChooser::emitBinaryParser(raw_ostream &o, unsigned &Indentation, + const OperandInfo &OpInfo, + bool &OpHasCompleteDecoder) const { + const std::string &Decoder = OpInfo.Decoder; + + bool UseInsertBits = OpInfo.numFields() != 1 || OpInfo.InitValue != 0; + + if (UseInsertBits) { + o.indent(Indentation) << "tmp = 0x"; + o.write_hex(OpInfo.InitValue); + o << ";\n"; + } + + for (const EncodingField &EF : OpInfo) { + o.indent(Indentation); + if (UseInsertBits) + o << "insertBits(tmp, "; + else + o << "tmp = "; + o << "fieldFromInstruction(insn, " << EF.Base << ", " << EF.Width << ')'; + if (UseInsertBits) + o << ", " << EF.Offset << ", " << EF.Width << ')'; + else if (EF.Offset != 0) + o << " << " << EF.Offset; + o << ";\n"; + } + + if (Decoder != "") { + OpHasCompleteDecoder = OpInfo.HasCompleteDecoder; + o.indent(Indentation) << Emitter->GuardPrefix << Decoder + << "(MI, tmp, Address, Decoder)" + << Emitter->GuardPostfix + << " { " << (OpHasCompleteDecoder ? "" : "DecodeComplete = false; ") + << "return MCDisassembler::Fail; }\n"; + } else { + OpHasCompleteDecoder = true; + o.indent(Indentation) << "MI.addOperand(MCOperand::createImm(tmp));\n"; + } +} + +void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation, + unsigned Opc, bool &HasCompleteDecoder) const { + HasCompleteDecoder = true; + + for (const auto &Op : Operands.find(Opc)->second) { + // If a custom instruction decoder was specified, use that. + if (Op.numFields() == 0 && !Op.Decoder.empty()) { + HasCompleteDecoder = Op.HasCompleteDecoder; + OS.indent(Indentation) << Emitter->GuardPrefix << Op.Decoder + << "(MI, insn, Address, Decoder)" + << Emitter->GuardPostfix + << " { " << (HasCompleteDecoder ? "" : "DecodeComplete = false; ") + << "return MCDisassembler::Fail; }\n"; + break; + } + + bool OpHasCompleteDecoder; + emitBinaryParser(OS, Indentation, Op, OpHasCompleteDecoder); + if (!OpHasCompleteDecoder) + HasCompleteDecoder = false; + } +} + +unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, + unsigned Opc, + bool &HasCompleteDecoder) const { + // Build up the predicate string. + SmallString<256> Decoder; + // FIXME: emitDecoder() function can take a buffer directly rather than + // a stream. + raw_svector_ostream S(Decoder); + unsigned I = 4; + emitDecoder(S, I, Opc, HasCompleteDecoder); + + // Using the full decoder string as the key value here is a bit + // heavyweight, but is effective. If the string comparisons become a + // performance concern, we can implement a mangling of the predicate + // data easily enough with a map back to the actual string. That's + // overkill for now, though. + + // Make sure the predicate is in the table. + Decoders.insert(CachedHashString(Decoder)); + // Now figure out the index for when we write out the table. + DecoderSet::const_iterator P = find(Decoders, Decoder.str()); + return (unsigned)(P - Decoders.begin()); +} + +// If ParenIfBinOp is true, print a surrounding () if Val uses && or ||. 
+bool FilterChooser::emitPredicateMatchAux(const Init &Val, bool ParenIfBinOp, + raw_ostream &OS) const { + if (auto *D = dyn_cast(&Val)) { + if (!D->getDef()->isSubClassOf("SubtargetFeature")) + return true; + OS << "Bits[" << Emitter->PredicateNamespace << "::" << D->getAsString() + << "]"; + return false; + } + if (auto *D = dyn_cast(&Val)) { + std::string Op = D->getOperator()->getAsString(); + if (Op == "not" && D->getNumArgs() == 1) { + OS << '!'; + return emitPredicateMatchAux(*D->getArg(0), true, OS); + } + if ((Op == "any_of" || Op == "all_of") && D->getNumArgs() > 0) { + bool Paren = D->getNumArgs() > 1 && std::exchange(ParenIfBinOp, true); + if (Paren) + OS << '('; + ListSeparator LS(Op == "any_of" ? " || " : " && "); + for (auto *Arg : D->getArgs()) { + OS << LS; + if (emitPredicateMatchAux(*Arg, ParenIfBinOp, OS)) + return true; + } + if (Paren) + OS << ')'; + return false; + } + } + return true; +} + +bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation, + unsigned Opc) const { + ListInit *Predicates = + AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates"); + bool IsFirstEmission = true; + for (unsigned i = 0; i < Predicates->size(); ++i) { + Record *Pred = Predicates->getElementAsRecord(i); + if (!Pred->getValue("AssemblerMatcherPredicate")) + continue; + + if (!isa(Pred->getValue("AssemblerCondDag")->getValue())) + continue; + + if (!IsFirstEmission) + o << " && "; + if (emitPredicateMatchAux(*Pred->getValueAsDag("AssemblerCondDag"), + Predicates->size() > 1, o)) + PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); + IsFirstEmission = false; + } + return !Predicates->empty(); +} + +bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const { + ListInit *Predicates = + AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates"); + for (unsigned i = 0; i < Predicates->size(); ++i) { + Record *Pred = Predicates->getElementAsRecord(i); + if (!Pred->getValue("AssemblerMatcherPredicate")) + continue; + + if (isa(Pred->getValue("AssemblerCondDag")->getValue())) + return true; + } + return false; +} + +unsigned FilterChooser::getPredicateIndex(DecoderTableInfo &TableInfo, + StringRef Predicate) const { + // Using the full predicate string as the key value here is a bit + // heavyweight, but is effective. If the string comparisons become a + // performance concern, we can implement a mangling of the predicate + // data easily enough with a map back to the actual string. That's + // overkill for now, though. + + // Make sure the predicate is in the table. + TableInfo.Predicates.insert(CachedHashString(Predicate)); + // Now figure out the index for when we write out the table. + PredicateSet::const_iterator P = find(TableInfo.Predicates, Predicate); + return (unsigned)(P - TableInfo.Predicates.begin()); +} + +void FilterChooser::emitPredicateTableEntry(DecoderTableInfo &TableInfo, + unsigned Opc) const { + if (!doesOpcodeNeedPredicate(Opc)) + return; + + // Build up the predicate string. + SmallString<256> Predicate; + // FIXME: emitPredicateMatch() functions can take a buffer directly rather + // than a stream. + raw_svector_ostream PS(Predicate); + unsigned I = 0; + emitPredicateMatch(PS, I, Opc); + + // Figure out the index into the predicate table for the predicate just + // computed. 
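emitPredicateMatchAux above lowers an AssemblerCondDag into a C++ expression over the subtarget feature bits. As a worked example (the namespace and feature names are placeholders), a dag of the form

  (all_of FeatureA, (not FeatureB))

is emitted as

  Bits[NS::FeatureA] && !Bits[NS::FeatureB]

while any_of joins its arguments with " || ", and a multi-argument dag is parenthesized when it appears under another binary operator.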
+ unsigned PIdx = getPredicateIndex(TableInfo, PS.str()); + SmallString<16> PBytes; + raw_svector_ostream S(PBytes); + encodeULEB128(PIdx, S); + + TableInfo.Table.push_back(MCD::OPC_CheckPredicate); + // Predicate index + for (unsigned i = 0, e = PBytes.size(); i != e; ++i) + TableInfo.Table.push_back(PBytes[i]); + // Push location for NumToSkip backpatching. + TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); +} + +void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, + unsigned Opc) const { + const RecordVal *RV = AllInstructions[Opc].EncodingDef->getValue("SoftFail"); + BitsInit *SFBits = RV ? dyn_cast(RV->getValue()) : nullptr; + + if (!SFBits) return; + BitsInit *InstBits = + AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst"); + + APInt PositiveMask(BitWidth, 0ULL); + APInt NegativeMask(BitWidth, 0ULL); + for (unsigned i = 0; i < BitWidth; ++i) { + bit_value_t B = bitFromBits(*SFBits, i); + bit_value_t IB = bitFromBits(*InstBits, i); + + if (B != BIT_TRUE) continue; + + switch (IB) { + case BIT_FALSE: + // The bit is meant to be false, so emit a check to see if it is true. + PositiveMask.setBit(i); + break; + case BIT_TRUE: + // The bit is meant to be true, so emit a check to see if it is false. + NegativeMask.setBit(i); + break; + default: + // The bit is not set; this must be an error! + errs() << "SoftFail Conflict: bit SoftFail{" << i << "} in " + << AllInstructions[Opc] << " is set but Inst{" << i + << "} is unset!\n" + << " - You can only mark a bit as SoftFail if it is fully defined" + << " (1/0 - not '?') in Inst\n"; + return; + } + } + + bool NeedPositiveMask = PositiveMask.getBoolValue(); + bool NeedNegativeMask = NegativeMask.getBoolValue(); + + if (!NeedPositiveMask && !NeedNegativeMask) + return; + + TableInfo.Table.push_back(MCD::OPC_SoftFail); + + SmallString<16> MaskBytes; + raw_svector_ostream S(MaskBytes); + if (NeedPositiveMask) { + encodeULEB128(PositiveMask.getZExtValue(), S); + for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) + TableInfo.Table.push_back(MaskBytes[i]); + } else + TableInfo.Table.push_back(0); + if (NeedNegativeMask) { + MaskBytes.clear(); + encodeULEB128(NegativeMask.getZExtValue(), S); + for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) + TableInfo.Table.push_back(MaskBytes[i]); + } else + TableInfo.Table.push_back(0); +} + +// Emits table entries to decode the singleton. +void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, + EncodingIDAndOpcode Opc) const { + std::vector StartBits; + std::vector EndBits; + std::vector FieldVals; + insn_t Insn; + insnWithID(Insn, Opc.EncodingID); + + // Look for islands of undecoded bits of the singleton. + getIslands(StartBits, EndBits, FieldVals, Insn); + + unsigned Size = StartBits.size(); + + // Emit the predicate table entry if one is needed. + emitPredicateTableEntry(TableInfo, Opc.EncodingID); + + // Check any additional encoding fields needed. + for (unsigned I = Size; I != 0; --I) { + unsigned NumBits = EndBits[I-1] - StartBits[I-1] + 1; + TableInfo.Table.push_back(MCD::OPC_CheckField); + TableInfo.Table.push_back(StartBits[I-1]); + TableInfo.Table.push_back(NumBits); + uint8_t Buffer[16], *p; + encodeULEB128(FieldVals[I-1], Buffer); + for (p = Buffer; *p >= 128 ; ++p) + TableInfo.Table.push_back(*p); + TableInfo.Table.push_back(*p); + // Push location for NumToSkip backpatching. 
+ TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); + // The fixup is always 24-bits, so go ahead and allocate the space + // in the table so all our relative position calculations work OK even + // before we fully resolve the real value here. + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); + } + + // Check for soft failure of the match. + emitSoftFailTableEntry(TableInfo, Opc.EncodingID); + + bool HasCompleteDecoder; + unsigned DIdx = + getDecoderIndex(TableInfo.Decoders, Opc.EncodingID, HasCompleteDecoder); + + // Produce OPC_Decode or OPC_TryDecode opcode based on the information + // whether the instruction decoder is complete or not. If it is complete + // then it handles all possible values of remaining variable/unfiltered bits + // and for any value can determine if the bitpattern is a valid instruction + // or not. This means OPC_Decode will be the final step in the decoding + // process. If it is not complete, then the Fail return code from the + // decoder method indicates that additional processing should be done to see + // if there is any other instruction that also matches the bitpattern and + // can decode it. + TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode : + MCD::OPC_TryDecode); + NumEncodingsSupported++; + uint8_t Buffer[16], *p; + encodeULEB128(Opc.Opcode, Buffer); + for (p = Buffer; *p >= 128 ; ++p) + TableInfo.Table.push_back(*p); + TableInfo.Table.push_back(*p); + + SmallString<16> Bytes; + raw_svector_ostream S(Bytes); + encodeULEB128(DIdx, S); + + // Decoder index + for (unsigned i = 0, e = Bytes.size(); i != e; ++i) + TableInfo.Table.push_back(Bytes[i]); + + if (!HasCompleteDecoder) { + // Push location for NumToSkip backpatching. + TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); + // Allocate the space for the fixup. + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); + TableInfo.Table.push_back(0); + } +} + +// Emits table entries to decode the singleton, and then to decode the rest. +void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, + const Filter &Best) const { + EncodingIDAndOpcode Opc = Best.getSingletonOpc(); + + // complex singletons need predicate checks from the first singleton + // to refer forward to the variable filterchooser that follows. + TableInfo.FixupStack.emplace_back(); + + emitSingletonTableEntry(TableInfo, Opc); + + resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(), + TableInfo.Table.size()); + TableInfo.FixupStack.pop_back(); + + Best.getVariableFC().emitTableEntries(TableInfo); +} + +// Assign a single filter and run with it. Top level API client can initialize +// with a single filter to start the filtering process. +void FilterChooser::runSingleFilter(unsigned startBit, unsigned numBit, + bool mixed) { + Filters.clear(); + Filters.emplace_back(*this, startBit, numBit, true); + BestIndex = 0; // Sole Filter instance to choose from. + bestFilter().recurse(); +} + +// reportRegion is a helper function for filterProcessor to mark a region as +// eligible for use as a filter region. 
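+// Note that reportRegion records a candidate Filter only when the region's
+// attribute agrees with the scan mode: ATTR_MIXED regions are kept when
+// AllowMixed is true, ATTR_ALL_SET regions when it is false; every other
+// combination is dropped.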
+void FilterChooser::reportRegion(bitAttr_t RA, unsigned StartBit,
+                                 unsigned BitIndex, bool AllowMixed) {
+  if (RA == ATTR_MIXED && AllowMixed)
+    Filters.emplace_back(*this, StartBit, BitIndex - StartBit, true);
+  else if (RA == ATTR_ALL_SET && !AllowMixed)
+    Filters.emplace_back(*this, StartBit, BitIndex - StartBit, false);
+}
+
+// FilterProcessor scans the well-known encoding bits of the instructions and
+// builds up a list of candidate filters. It chooses the best filter and
+// recursively descends down the decoding tree.
+bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
+  Filters.clear();
+  BestIndex = -1;
+  unsigned numInstructions = Opcodes.size();
+
+  assert(numInstructions && "Filter created with no instructions");
+
+  // No further filtering is necessary.
+  if (numInstructions == 1)
+    return true;
+
+  // Heuristics. See also doFilter()'s "Heuristics" comment when num of
+  // instructions is 3.
+  if (AllowMixed && !Greedy) {
+    assert(numInstructions == 3);
+
+    for (auto Opcode : Opcodes) {
+      std::vector<unsigned> StartBits;
+      std::vector<unsigned> EndBits;
+      std::vector<uint64_t> FieldVals;
+      insn_t Insn;
+
+      insnWithID(Insn, Opcode.EncodingID);
+
+      // Look for islands of undecoded bits of any instruction.
+      if (getIslands(StartBits, EndBits, FieldVals, Insn) > 0) {
+        // Found an instruction with island(s). Now just assign a filter.
+        runSingleFilter(StartBits[0], EndBits[0] - StartBits[0] + 1, true);
+        return true;
+      }
+    }
+  }
+
+  unsigned BitIndex;
+
+  // We maintain BIT_WIDTH copies of the bitAttrs automaton.
+  // The automaton consumes the corresponding bit from each
+  // instruction.
+  //
+  //   Input symbols: 0, 1, and _ (unset).
+  //   States:        NONE, FILTERED, ALL_SET, ALL_UNSET, and MIXED.
+  //   Initial state: NONE.
+  //
+  // (NONE) ------- [01] -> (ALL_SET)
+  // (NONE) ------- _ ----> (ALL_UNSET)
+  // (ALL_SET) ---- [01] -> (ALL_SET)
+  // (ALL_SET) ---- _ ----> (MIXED)
+  // (ALL_UNSET) -- [01] -> (MIXED)
+  // (ALL_UNSET) -- _ ----> (ALL_UNSET)
+  // (MIXED) ------ . ----> (MIXED)
+  // (FILTERED)---- . ----> (FILTERED)
+
+  std::vector<bitAttr_t> bitAttrs;
+
+  // FILTERED bit positions provide no entropy and are not worthy of pursuing.
+  // Filter::recurse() sets either BIT_TRUE or BIT_FALSE for each position.
+  for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex)
+    if (FilterBitValues[BitIndex] == BIT_TRUE ||
+        FilterBitValues[BitIndex] == BIT_FALSE)
+      bitAttrs.push_back(ATTR_FILTERED);
+    else
+      bitAttrs.push_back(ATTR_NONE);
+
+  for (unsigned InsnIndex = 0; InsnIndex < numInstructions; ++InsnIndex) {
+    insn_t insn;
+
+    insnWithID(insn, Opcodes[InsnIndex].EncodingID);
+
+    for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) {
+      switch (bitAttrs[BitIndex]) {
+      case ATTR_NONE:
+        if (insn[BitIndex] == BIT_UNSET)
+          bitAttrs[BitIndex] = ATTR_ALL_UNSET;
+        else
+          bitAttrs[BitIndex] = ATTR_ALL_SET;
+        break;
+      case ATTR_ALL_SET:
+        if (insn[BitIndex] == BIT_UNSET)
+          bitAttrs[BitIndex] = ATTR_MIXED;
+        break;
+      case ATTR_ALL_UNSET:
+        if (insn[BitIndex] != BIT_UNSET)
+          bitAttrs[BitIndex] = ATTR_MIXED;
+        break;
+      case ATTR_MIXED:
+      case ATTR_FILTERED:
+        break;
+      }
+    }
+  }
+
+  // The regionAttr automaton consumes the bitAttrs automatons' state,
+  // lowest-to-highest.
+ // + // Input symbols: F(iltered), (all_)S(et), (all_)U(nset), M(ixed) + // States: NONE, ALL_SET, MIXED + // Initial state: NONE + // + // (NONE) ----- F --> (NONE) + // (NONE) ----- S --> (ALL_SET) ; and set region start + // (NONE) ----- U --> (NONE) + // (NONE) ----- M --> (MIXED) ; and set region start + // (ALL_SET) -- F --> (NONE) ; and report an ALL_SET region + // (ALL_SET) -- S --> (ALL_SET) + // (ALL_SET) -- U --> (NONE) ; and report an ALL_SET region + // (ALL_SET) -- M --> (MIXED) ; and report an ALL_SET region + // (MIXED) ---- F --> (NONE) ; and report a MIXED region + // (MIXED) ---- S --> (ALL_SET) ; and report a MIXED region + // (MIXED) ---- U --> (NONE) ; and report a MIXED region + // (MIXED) ---- M --> (MIXED) + + bitAttr_t RA = ATTR_NONE; + unsigned StartBit = 0; + + for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) { + bitAttr_t bitAttr = bitAttrs[BitIndex]; + + assert(bitAttr != ATTR_NONE && "Bit without attributes"); + + switch (RA) { + case ATTR_NONE: + switch (bitAttr) { + case ATTR_FILTERED: + break; + case ATTR_ALL_SET: + StartBit = BitIndex; + RA = ATTR_ALL_SET; + break; + case ATTR_ALL_UNSET: + break; + case ATTR_MIXED: + StartBit = BitIndex; + RA = ATTR_MIXED; + break; + default: + llvm_unreachable("Unexpected bitAttr!"); + } + break; + case ATTR_ALL_SET: + switch (bitAttr) { + case ATTR_FILTERED: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + RA = ATTR_NONE; + break; + case ATTR_ALL_SET: + break; + case ATTR_ALL_UNSET: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + RA = ATTR_NONE; + break; + case ATTR_MIXED: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + StartBit = BitIndex; + RA = ATTR_MIXED; + break; + default: + llvm_unreachable("Unexpected bitAttr!"); + } + break; + case ATTR_MIXED: + switch (bitAttr) { + case ATTR_FILTERED: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + StartBit = BitIndex; + RA = ATTR_NONE; + break; + case ATTR_ALL_SET: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + StartBit = BitIndex; + RA = ATTR_ALL_SET; + break; + case ATTR_ALL_UNSET: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + RA = ATTR_NONE; + break; + case ATTR_MIXED: + break; + default: + llvm_unreachable("Unexpected bitAttr!"); + } + break; + case ATTR_ALL_UNSET: + llvm_unreachable("regionAttr state machine has no ATTR_UNSET state"); + case ATTR_FILTERED: + llvm_unreachable("regionAttr state machine has no ATTR_FILTERED state"); + } + } + + // At the end, if we're still in ALL_SET or MIXED states, report a region + switch (RA) { + case ATTR_NONE: + break; + case ATTR_FILTERED: + break; + case ATTR_ALL_SET: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + break; + case ATTR_ALL_UNSET: + break; + case ATTR_MIXED: + reportRegion(RA, StartBit, BitIndex, AllowMixed); + break; + } + + // We have finished with the filter processings. Now it's time to choose + // the best performing filter. + BestIndex = 0; + bool AllUseless = true; + unsigned BestScore = 0; + + for (unsigned i = 0, e = Filters.size(); i != e; ++i) { + unsigned Usefulness = Filters[i].usefulness(); + + if (Usefulness) + AllUseless = false; + + if (Usefulness > BestScore) { + BestIndex = i; + BestScore = Usefulness; + } + } + + if (!AllUseless) + bestFilter().recurse(); + + return !AllUseless; +} // end of FilterChooser::filterProcessor(bool) + +// Decides on the best configuration of filter(s) to use in order to decode +// the instructions. A conflict of instructions may occur, in which case we +// dump the conflict set to the standard error. 
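+// The passes below are attempted in order: regions of consecutive known bits
+// first, then mixed regions, and finally a non-greedy mixed scan that is
+// reserved for the three-instruction conflict heuristic described inline.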
+void FilterChooser::doFilter() {
+  unsigned Num = Opcodes.size();
+  assert(Num && "FilterChooser created with no instructions");
+
+  // Try regions of consecutive known bit values first.
+  if (filterProcessor(false))
+    return;
+
+  // Then regions of mixed bits (both known and uninitialized bit values
+  // allowed).
+  if (filterProcessor(true))
+    return;
+
+  // Heuristics to cope with conflict set {t2CMPrs, t2SUBSrr, t2SUBSrs} where
+  // no single instruction for the maximum ATTR_MIXED region Inst{14-4} has a
+  // well-known encoding pattern. In such a case, we backtrack and scan for the
+  // very first consecutive ATTR_ALL_SET region and assign a filter to it.
+  if (Num == 3 && filterProcessor(true, false))
+    return;
+
+  // If we get here, the instruction decoding has failed.
+  // Set the BestIndex to -1 to indicate so.
+  BestIndex = -1;
+}
+
+// emitTableEntries - Emit state machine entries to decode our share of
+// instructions.
+void FilterChooser::emitTableEntries(DecoderTableInfo &TableInfo) const {
+  if (Opcodes.size() == 1) {
+    // There is only one instruction in the set, which is great!
+    // Call emitSingletonTableEntry() to see whether there are any remaining
+    // encoding bits.
+    emitSingletonTableEntry(TableInfo, Opcodes[0]);
+    return;
+  }
+
+  // Choose the best filter to do the decoding!
+  if (BestIndex != -1) {
+    const Filter &Best = Filters[BestIndex];
+    if (Best.getNumFiltered() == 1)
+      emitSingletonTableEntry(TableInfo, Best);
+    else
+      Best.emitTableEntry(TableInfo);
+    return;
+  }
+
+  // We don't know how to decode these instructions! Dump the
+  // conflict set and bail.
+
+  // Print out useful conflict information for postmortem analysis.
+  errs() << "Decoding Conflict:\n";
+
+  dumpStack(errs(), "\t\t");
+
+  for (auto Opcode : Opcodes) {
+    errs() << '\t';
+    emitNameWithID(errs(), Opcode.EncodingID);
+    errs() << " ";
+    dumpBits(
+        errs(),
+        getBitsField(*AllInstructions[Opcode.EncodingID].EncodingDef, "Inst"));
+    errs() << '\n';
+  }
+}
+
+static std::string findOperandDecoderMethod(Record *Record) {
+  std::string Decoder;
+
+  RecordVal *DecoderString = Record->getValue("DecoderMethod");
+  StringInit *String = DecoderString ?
+    dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
+  if (String) {
+    Decoder = std::string(String->getValue());
+    if (!Decoder.empty())
+      return Decoder;
+  }
+
+  if (Record->isSubClassOf("RegisterOperand"))
+    Record = Record->getValueAsDef("RegClass");
+
+  if (Record->isSubClassOf("RegisterClass")) {
+    Decoder = "Decode" + Record->getName().str() + "RegisterClass";
+  } else if (Record->isSubClassOf("PointerLikeRegClass")) {
+    Decoder = "DecodePointerLikeRegClass" +
+              utostr(Record->getValueAsInt("RegClassKind"));
+  }
+
+  return Decoder;
+}
+
+OperandInfo getOpInfo(Record *TypeRecord) {
+  std::string Decoder = findOperandDecoderMethod(TypeRecord);
+
+  RecordVal *HasCompleteDecoderVal = TypeRecord->getValue("hasCompleteDecoder");
+  BitInit *HasCompleteDecoderBit =
+      HasCompleteDecoderVal
+          ? dyn_cast<BitInit>(HasCompleteDecoderVal->getValue())
+          : nullptr;
+  bool HasCompleteDecoder =
+      HasCompleteDecoderBit ? HasCompleteDecoderBit->getValue() : true;
+
+  return OperandInfo(Decoder, HasCompleteDecoder);
+}
+
+void parseVarLenInstOperand(const Record &Def,
+                            std::vector<OperandInfo> &Operands,
+                            const CodeGenInstruction &CGI) {
+
+  const RecordVal *RV = Def.getValue("Inst");
+  VarLenInst VLI(cast<DagInit>(RV->getValue()), RV);
+  SmallVector<int> TiedTo;
+
+  for (unsigned Idx = 0; Idx < CGI.Operands.size(); ++Idx) {
+    auto &Op = CGI.Operands[Idx];
+    if (Op.MIOperandInfo && Op.MIOperandInfo->getNumArgs() > 0)
+      for (auto *Arg : Op.MIOperandInfo->getArgs())
+        Operands.push_back(getOpInfo(cast<DefInit>(Arg)->getDef()));
+    else
+      Operands.push_back(getOpInfo(Op.Rec));
+
+    int TiedReg = Op.getTiedRegister();
+    TiedTo.push_back(-1);
+    if (TiedReg != -1) {
+      TiedTo[Idx] = TiedReg;
+      TiedTo[TiedReg] = Idx;
+    }
+  }
+
+  unsigned CurrBitPos = 0;
+  for (auto &EncodingSegment : VLI) {
+    unsigned Offset = 0;
+    StringRef OpName;
+
+    if (const StringInit *SI = dyn_cast<StringInit>(EncodingSegment.Value)) {
+      OpName = SI->getValue();
+    } else if (const DagInit *DI = dyn_cast<DagInit>(EncodingSegment.Value)) {
+      OpName = cast<StringInit>(DI->getArg(0))->getValue();
+      Offset = cast<IntInit>(DI->getArg(2))->getValue();
+    }
+
+    if (!OpName.empty()) {
+      auto OpSubOpPair =
+          const_cast<CodeGenInstruction &>(CGI).Operands.ParseOperandName(
+              OpName);
+      unsigned OpIdx = CGI.Operands.getFlattenedOperandNumber(OpSubOpPair);
+      Operands[OpIdx].addField(CurrBitPos, EncodingSegment.BitWidth, Offset);
+
+      int TiedReg = TiedTo[OpSubOpPair.first];
+      if (TiedReg != -1) {
+        unsigned OpIdx = CGI.Operands.getFlattenedOperandNumber(
+            std::make_pair(TiedReg, OpSubOpPair.second));
+        Operands[OpIdx].addField(CurrBitPos, EncodingSegment.BitWidth, Offset);
+      }
+    }
+
+    CurrBitPos += EncodingSegment.BitWidth;
+  }
+}
+
+static unsigned
+populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
+                    const CodeGenInstruction &CGI, unsigned Opc,
+                    std::map<unsigned, std::vector<OperandInfo>> &Operands,
+                    bool IsVarLenInst) {
+  const Record &Def = *CGI.TheDef;
+  // If all the bit positions are not specified, do not decode this instruction.
+  // We are bound to fail! For proper disassembly, the well-known encoding bits
+  // of the instruction must be fully specified.
+
+  BitsInit &Bits = getBitsField(EncodingDef, "Inst");
+  if (Bits.allInComplete())
+    return 0;
+
+  std::vector<OperandInfo> InsnOperands;
+
+  // If the instruction has specified a custom decoding hook, use that instead
+  // of trying to auto-generate the decoder.
+  StringRef InstDecoder = EncodingDef.getValueAsString("DecoderMethod");
+  if (InstDecoder != "") {
+    bool HasCompleteInstDecoder = EncodingDef.getValueAsBit("hasCompleteDecoder");
+    InsnOperands.push_back(
+        OperandInfo(std::string(InstDecoder), HasCompleteInstDecoder));
+    Operands[Opc] = InsnOperands;
+    return Bits.getNumBits();
+  }
+
+  // Generate a description of the operands of the instruction that we know
+  // how to decode automatically.
+  // FIXME: We'll need to have a way to manually override this as needed.
+
+  // Gather the outputs/inputs of the instruction, so we can find their
+  // positions in the encoding. This assumes for now that they appear in the
+  // MCInst in the order that they're listed.
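+  // (Outputs are gathered before inputs, so the flattened InOutOperands list
+  // mirrors the MCInst operand order assumed above.)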
+  std::vector<std::pair<Init *, StringRef>> InOutOperands;
+  DagInit *Out = Def.getValueAsDag("OutOperandList");
+  DagInit *In = Def.getValueAsDag("InOperandList");
+  for (unsigned i = 0; i < Out->getNumArgs(); ++i)
+    InOutOperands.push_back(
+        std::make_pair(Out->getArg(i), Out->getArgNameStr(i)));
+  for (unsigned i = 0; i < In->getNumArgs(); ++i)
+    InOutOperands.push_back(
+        std::make_pair(In->getArg(i), In->getArgNameStr(i)));
+
+  // Search for tied operands, so that we can correctly instantiate
+  // operands that are not explicitly represented in the encoding.
+  std::map<std::string, std::string> TiedNames;
+  for (unsigned i = 0; i < CGI.Operands.size(); ++i) {
+    int tiedTo = CGI.Operands[i].getTiedRegister();
+    if (tiedTo != -1) {
+      std::pair<unsigned, unsigned> SO =
+          CGI.Operands.getSubOperandNumber(tiedTo);
+      TiedNames[std::string(InOutOperands[i].second)] =
+          std::string(InOutOperands[SO.first].second);
+      TiedNames[std::string(InOutOperands[SO.first].second)] =
+          std::string(InOutOperands[i].second);
+    }
+  }
+
+  if (IsVarLenInst) {
+    parseVarLenInstOperand(EncodingDef, InsnOperands, CGI);
+  } else {
+    std::map<std::string, std::vector<OperandInfo>> NumberedInsnOperands;
+    std::set<std::string> NumberedInsnOperandsNoTie;
+    if (Target.getInstructionSet()->getValueAsBit(
+            "decodePositionallyEncodedOperands")) {
+      const std::vector<RecordVal> &Vals = Def.getValues();
+      unsigned NumberedOp = 0;
+
+      std::set<unsigned> NamedOpIndices;
+      if (Target.getInstructionSet()->getValueAsBit(
+              "noNamedPositionallyEncodedOperands"))
+        // Collect the set of operand indices that might correspond to a named
+        // operand, and skip these when assigning operands based on position.
+        for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+          unsigned OpIdx;
+          if (!CGI.Operands.hasOperandNamed(Vals[i].getName(), OpIdx))
+            continue;
+
+          NamedOpIndices.insert(OpIdx);
+        }
+
+      for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+        // Ignore fixed fields in the record, we're looking for values like:
+        //    bits<5> RST = { ?, ?, ?, ?, ? };
+        if (Vals[i].isNonconcreteOK() || Vals[i].getValue()->isComplete())
+          continue;
+
+        // Determine if Vals[i] actually contributes to the Inst encoding.
+        unsigned bi = 0;
+        for (; bi < Bits.getNumBits(); ++bi) {
+          VarInit *Var = nullptr;
+          VarBitInit *BI = dyn_cast<VarBitInit>(Bits.getBit(bi));
+          if (BI)
+            Var = dyn_cast<VarInit>(BI->getBitVar());
+          else
+            Var = dyn_cast<VarInit>(Bits.getBit(bi));
+
+          if (Var && Var->getName() == Vals[i].getName())
+            break;
+        }
+
+        if (bi == Bits.getNumBits())
+          continue;
+
+        // Skip variables that correspond to explicitly-named operands.
+        unsigned OpIdx;
+        if (CGI.Operands.hasOperandNamed(Vals[i].getName(), OpIdx))
+          continue;
+
+        // Get the bit range for this operand:
+        unsigned bitStart = bi++, bitWidth = 1;
+        for (; bi < Bits.getNumBits(); ++bi) {
+          VarInit *Var = nullptr;
+          VarBitInit *BI = dyn_cast<VarBitInit>(Bits.getBit(bi));
+          if (BI)
+            Var = dyn_cast<VarInit>(BI->getBitVar());
+          else
+            Var = dyn_cast<VarInit>(Bits.getBit(bi));
+
+          if (!Var)
+            break;
+
+          if (Var->getName() != Vals[i].getName())
+            break;
+
+          ++bitWidth;
+        }
+
+        unsigned NumberOps = CGI.Operands.size();
+        while (NumberedOp < NumberOps &&
+               (CGI.Operands.isFlatOperandNotEmitted(NumberedOp) ||
+                (!NamedOpIndices.empty() &&
+                 NamedOpIndices.count(
+                     CGI.Operands.getSubOperandNumber(NumberedOp).first))))
+          ++NumberedOp;
+
+        OpIdx = NumberedOp++;
+
+        // OpIdx now holds the ordered operand number of Vals[i].
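+        // Translate the flat operand number into its (operand, sub-operand)
+        // pair before looking up the operand's record and decoder.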
+        std::pair<unsigned, unsigned> SO =
+            CGI.Operands.getSubOperandNumber(OpIdx);
+        const std::string &Name = CGI.Operands[SO.first].Name;
+
+        LLVM_DEBUG(dbgs() << "Numbered operand mapping for " << Def.getName()
+                          << ": " << Name << "(" << SO.first << ", "
+                          << SO.second << ") => " << Vals[i].getName() << "\n");
+
+        std::string Decoder;
+        Record *TypeRecord = CGI.Operands[SO.first].Rec;
+
+        RecordVal *DecoderString = TypeRecord->getValue("DecoderMethod");
+        StringInit *String =
+            DecoderString ? dyn_cast<StringInit>(DecoderString->getValue())
+                          : nullptr;
+        if (String && String->getValue() != "")
+          Decoder = std::string(String->getValue());
+
+        if (Decoder == "" && CGI.Operands[SO.first].MIOperandInfo &&
+            CGI.Operands[SO.first].MIOperandInfo->getNumArgs()) {
+          Init *Arg = CGI.Operands[SO.first].MIOperandInfo->getArg(SO.second);
+          if (DefInit *DI = cast<DefInit>(Arg))
+            TypeRecord = DI->getDef();
+        }
+
+        bool isReg = false;
+        if (TypeRecord->isSubClassOf("RegisterOperand"))
+          TypeRecord = TypeRecord->getValueAsDef("RegClass");
+        if (TypeRecord->isSubClassOf("RegisterClass")) {
+          Decoder = "Decode" + TypeRecord->getName().str() + "RegisterClass";
+          isReg = true;
+        } else if (TypeRecord->isSubClassOf("PointerLikeRegClass")) {
+          Decoder = "DecodePointerLikeRegClass" +
+                    utostr(TypeRecord->getValueAsInt("RegClassKind"));
+          isReg = true;
+        }
+
+        DecoderString = TypeRecord->getValue("DecoderMethod");
+        String = DecoderString ? dyn_cast<StringInit>(DecoderString->getValue())
+                               : nullptr;
+        if (!isReg && String && String->getValue() != "")
+          Decoder = std::string(String->getValue());
+
+        RecordVal *HasCompleteDecoderVal =
+            TypeRecord->getValue("hasCompleteDecoder");
+        BitInit *HasCompleteDecoderBit =
+            HasCompleteDecoderVal
+                ? dyn_cast<BitInit>(HasCompleteDecoderVal->getValue())
+                : nullptr;
+        bool HasCompleteDecoder =
+            HasCompleteDecoderBit ? HasCompleteDecoderBit->getValue() : true;
+
+        OperandInfo OpInfo(Decoder, HasCompleteDecoder);
+        OpInfo.addField(bitStart, bitWidth, 0);
+
+        NumberedInsnOperands[Name].push_back(OpInfo);
+
+        // FIXME: For complex operands with custom decoders we can't handle tied
+        // sub-operands automatically. Skip those here and assume that this is
+        // fixed up elsewhere.
+        if (CGI.Operands[SO.first].MIOperandInfo &&
+            CGI.Operands[SO.first].MIOperandInfo->getNumArgs() > 1 && String &&
+            String->getValue() != "")
+          NumberedInsnOperandsNoTie.insert(Name);
+      }
+    }
+
+    // For each operand, see if we can figure out where it is encoded.
+    for (const auto &Op : InOutOperands) {
+      if (!NumberedInsnOperands[std::string(Op.second)].empty()) {
+        llvm::append_range(InsnOperands,
+                           NumberedInsnOperands[std::string(Op.second)]);
+        continue;
+      }
+      if (!NumberedInsnOperands[TiedNames[std::string(Op.second)]].empty()) {
+        if (!NumberedInsnOperandsNoTie.count(
+                TiedNames[std::string(Op.second)])) {
+          // Figure out to which (sub)operand we're tied.
+          unsigned i =
+              CGI.Operands.getOperandNamed(TiedNames[std::string(Op.second)]);
+          int tiedTo = CGI.Operands[i].getTiedRegister();
+          if (tiedTo == -1) {
+            i = CGI.Operands.getOperandNamed(Op.second);
+            tiedTo = CGI.Operands[i].getTiedRegister();
+          }
+
+          if (tiedTo != -1) {
+            std::pair<unsigned, unsigned> SO =
+                CGI.Operands.getSubOperandNumber(tiedTo);
+
+            InsnOperands.push_back(
+                NumberedInsnOperands[TiedNames[std::string(Op.second)]]
+                                    [SO.second]);
+          }
+        }
+        continue;
+      }
+
+      // At this point, we can locate the decoder field, but we need to know how
+      // to interpret it. As a first step, require the target to provide
+      // callbacks for decoding register classes.
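+      // For a register class named, say, GPR32 (an illustrative name, not
+      // taken from a real target), findOperandDecoderMethod above would
+      // synthesize the callback name DecodeGPR32RegisterClass.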
+
+      OperandInfo OpInfo = getOpInfo(cast<DefInit>(Op.first)->getDef());
+
+      // Some bits of the operand may be required to be 1 depending on the
+      // instruction's encoding. Collect those bits.
+      if (const RecordVal *EncodedValue = EncodingDef.getValue(Op.second))
+        if (const BitsInit *OpBits =
+                dyn_cast<BitsInit>(EncodedValue->getValue()))
+          for (unsigned I = 0; I < OpBits->getNumBits(); ++I)
+            if (const BitInit *OpBit = dyn_cast<BitInit>(OpBits->getBit(I)))
+              if (OpBit->getValue())
+                OpInfo.InitValue |= 1ULL << I;
+
+      unsigned Base = ~0U;
+      unsigned Width = 0;
+      unsigned Offset = 0;
+
+      for (unsigned bi = 0; bi < Bits.getNumBits(); ++bi) {
+        VarInit *Var = nullptr;
+        VarBitInit *BI = dyn_cast<VarBitInit>(Bits.getBit(bi));
+        if (BI)
+          Var = dyn_cast<VarInit>(BI->getBitVar());
+        else
+          Var = dyn_cast<VarInit>(Bits.getBit(bi));
+
+        if (!Var) {
+          if (Base != ~0U) {
+            OpInfo.addField(Base, Width, Offset);
+            Base = ~0U;
+            Width = 0;
+            Offset = 0;
+          }
+          continue;
+        }
+
+        if ((Var->getName() != Op.second &&
+             Var->getName() != TiedNames[std::string(Op.second)])) {
+          if (Base != ~0U) {
+            OpInfo.addField(Base, Width, Offset);
+            Base = ~0U;
+            Width = 0;
+            Offset = 0;
+          }
+          continue;
+        }
+
+        if (Base == ~0U) {
+          Base = bi;
+          Width = 1;
+          Offset = BI ? BI->getBitNum() : 0;
+        } else if (BI && BI->getBitNum() != Offset + Width) {
+          OpInfo.addField(Base, Width, Offset);
+          Base = bi;
+          Width = 1;
+          Offset = BI->getBitNum();
+        } else {
+          ++Width;
+        }
+      }
+
+      if (Base != ~0U)
+        OpInfo.addField(Base, Width, Offset);
+
+      if (OpInfo.numFields() > 0)
+        InsnOperands.push_back(OpInfo);
+    }
+  }
+
+  Operands[Opc] = InsnOperands;
+
+#if 0
+  LLVM_DEBUG({
+      // Dumps the instruction encoding bits.
+      dumpBits(errs(), Bits);
+
+      errs() << '\n';
+
+      // Dumps the list of operand info.
+      for (unsigned i = 0, e = CGI.Operands.size(); i != e; ++i) {
+        const CGIOperandList::OperandInfo &Info = CGI.Operands[i];
+        const std::string &OperandName = Info.Name;
+        const Record &OperandDef = *Info.Rec;
+
+        errs() << "\t" << OperandName << " (" << OperandDef.getName() << ")\n";
+      }
+    });
+#endif
+
+  return Bits.getNumBits();
+}
+
+// emitFieldFromInstruction - Emit the templated helper function
+// fieldFromInstruction().
+// On Windows we make sure that this function is not inlined when
+// using the VS compiler. It has a bug which causes the function
+// to be optimized out in some circumstances. See llvm.org/pr38292
+static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
+  OS << "// Helper functions for extracting fields from encoded instructions.\n"
+     << "// InsnType must either be integral or an APInt-like object that "
+        "must:\n"
+     << "// * be default-constructible and copy-constructible\n"
+     << "// * be constructible from an APInt (this can be private)\n"
+     << "// * Support insertBits(bits, startBit, numBits)\n"
+     << "// * Support extractBitsAsZExtValue(numBits, startBit)\n"
+     << "// * Support the ~, &, ==, and != operators with other objects of "
+        "the same type\n"
+     << "// * Support the != and bitwise & with uint64_t\n"
+     << "// * Support put (<<) to raw_ostream&\n"
+     << "template <typename InsnType>\n"
+     << "#if defined(_MSC_VER) && !defined(__clang__)\n"
+     << "__declspec(noinline)\n"
+     << "#endif\n"
+     << "static std::enable_if_t<std::is_integral<InsnType>::value, InsnType>\n"
+     << "fieldFromInstruction(const InsnType &insn, unsigned startBit,\n"
+     << "                     unsigned numBits) {\n"
+     << "  assert(startBit + numBits <= 64 && \"Cannot support >64-bit "
+        "extractions!\");\n"
+     << "  assert(startBit + numBits <= (sizeof(InsnType) * 8) &&\n"
+     << "         \"Instruction field out of bounds!\");\n"
+     << "  InsnType fieldMask;\n"
+     << "  if (numBits == sizeof(InsnType) * 8)\n"
+     << "    fieldMask = (InsnType)(-1LL);\n"
+     << "  else\n"
+     << "    fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
+     << "  return (insn & fieldMask) >> startBit;\n"
+     << "}\n"
+     << "\n"
+     << "template <typename InsnType>\n"
+     << "static std::enable_if_t<!std::is_integral<InsnType>::value, "
+        "uint64_t>\n"
+     << "fieldFromInstruction(const InsnType &insn, unsigned startBit,\n"
+     << "                     unsigned numBits) {\n"
+     << "  return insn.extractBitsAsZExtValue(numBits, startBit);\n"
+     << "}\n\n";
+}
+
+// emitInsertBits - Emit the templated helper function insertBits().
+static void emitInsertBits(formatted_raw_ostream &OS) {
+  OS << "// Helper function for inserting bits extracted from an encoded "
+        "instruction into\n"
+     << "// a field.\n"
+     << "template <typename InsnType>\n"
+     << "static std::enable_if_t<std::is_integral<InsnType>::value>\n"
+     << "insertBits(InsnType &field, InsnType bits, unsigned startBit, "
+        "unsigned numBits) {\n"
+     << "  assert(startBit + numBits <= sizeof field * 8);\n"
+     << "  field |= (InsnType)bits << startBit;\n"
+     << "}\n"
+     << "\n"
+     << "template <typename InsnType>\n"
+     << "static std::enable_if_t<!std::is_integral<InsnType>::value>\n"
+     << "insertBits(InsnType &field, uint64_t bits, unsigned startBit, "
+        "unsigned numBits) {\n"
+     << "  field.insertBits(bits, startBit, numBits);\n"
+     << "}\n\n";
+}
+
+// emitDecodeInstruction - Emit the templated helper function
+// decodeInstruction().
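+// As a sketch of the table format consumed below (as produced by
+// emitPredicateTableEntry and friends above): each entry begins with an
+// opcode byte, followed by ULEB128-encoded operands and, for the conditional
+// opcodes, a NumToSkip field stored as a plain 24-bit little-endian integer.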
+static void emitDecodeInstruction(formatted_raw_ostream &OS,
+                                  bool IsVarLenInst) {
+  OS << "template <typename InsnType>\n"
+     << "static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], "
+        "MCInst &MI,\n"
+     << "                                      InsnType insn, uint64_t "
+        "Address,\n"
+     << "                                      const MCDisassembler *DisAsm,\n"
+     << "                                      const MCSubtargetInfo &STI";
+  if (IsVarLenInst) {
+    OS << ",\n"
+       << "                                      "
+          "llvm::function_ref<void(APInt &, uint64_t)> makeUp";
+  }
+  OS << ") {\n"
+     << "  const FeatureBitset &Bits = STI.getFeatureBits();\n"
+     << "\n"
+     << "  const uint8_t *Ptr = DecodeTable;\n"
+     << "  uint64_t CurFieldValue = 0;\n"
+     << "  DecodeStatus S = MCDisassembler::Success;\n"
+     << "  while (true) {\n"
+     << "    ptrdiff_t Loc = Ptr - DecodeTable;\n"
+     << "    switch (*Ptr) {\n"
+     << "    default:\n"
+     << "      errs() << Loc << \": Unexpected decode table opcode!\\n\";\n"
+     << "      return MCDisassembler::Fail;\n"
+     << "    case MCD::OPC_ExtractField: {\n"
+     << "      unsigned Start = *++Ptr;\n"
+     << "      unsigned Len = *++Ptr;\n"
+     << "      ++Ptr;\n";
+  if (IsVarLenInst)
+    OS << "      makeUp(insn, Start + Len);\n";
+  OS << "      CurFieldValue = fieldFromInstruction(insn, Start, Len);\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_ExtractField(\" << Start << "
+        "\", \"\n"
+     << "                   << Len << \"): \" << CurFieldValue << \"\\n\");\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_FilterValue: {\n"
+     << "      // Decode the field value.\n"
+     << "      unsigned Len;\n"
+     << "      uint64_t Val = decodeULEB128(++Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      // NumToSkip is a plain 24-bit integer.\n"
+     << "      unsigned NumToSkip = *Ptr++;\n"
+     << "      NumToSkip |= (*Ptr++) << 8;\n"
+     << "      NumToSkip |= (*Ptr++) << 16;\n"
+     << "\n"
+     << "      // Perform the filter operation.\n"
+     << "      if (Val != CurFieldValue)\n"
+     << "        Ptr += NumToSkip;\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_FilterValue(\" << Val << "
+        "\", \" << NumToSkip\n"
+     << "                   << \"): \" << ((Val != CurFieldValue) ? \"FAIL:\" "
+        ": \"PASS:\")\n"
+     << "                   << \" continuing at \" << (Ptr - DecodeTable) << "
+        "\"\\n\");\n"
+     << "\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_CheckField: {\n"
+     << "      unsigned Start = *++Ptr;\n"
+     << "      unsigned Len = *++Ptr;\n";
+  if (IsVarLenInst)
+    OS << "      makeUp(insn, Start + Len);\n";
+  OS << "      uint64_t FieldValue = fieldFromInstruction(insn, Start, Len);\n"
+     << "      // Decode the field value.\n"
+     << "      unsigned PtrLen = 0;\n"
+     << "      uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen);\n"
+     << "      Ptr += PtrLen;\n"
+     << "      // NumToSkip is a plain 24-bit integer.\n"
+     << "      unsigned NumToSkip = *Ptr++;\n"
+     << "      NumToSkip |= (*Ptr++) << 8;\n"
+     << "      NumToSkip |= (*Ptr++) << 16;\n"
+     << "\n"
+     << "      // If the actual and expected values don't match, skip.\n"
+     << "      if (ExpectedValue != FieldValue)\n"
+     << "        Ptr += NumToSkip;\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckField(\" << Start << "
+        "\", \"\n"
+     << "                   << Len << \", \" << ExpectedValue << \", \" << "
+        "NumToSkip\n"
+     << "                   << \"): FieldValue = \" << FieldValue << \", "
+        "ExpectedValue = \"\n"
+     << "                   << ExpectedValue << \": \"\n"
+     << "                   << ((ExpectedValue == FieldValue) ? \"PASS\\n\" : "
+        "\"FAIL\\n\"));\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_CheckPredicate: {\n"
+     << "      unsigned Len;\n"
+     << "      // Decode the Predicate Index value.\n"
+     << "      unsigned PIdx = decodeULEB128(++Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      // NumToSkip is a plain 24-bit integer.\n"
+     << "      unsigned NumToSkip = *Ptr++;\n"
+     << "      NumToSkip |= (*Ptr++) << 8;\n"
+     << "      NumToSkip |= (*Ptr++) << 16;\n"
+     << "      // Check the predicate.\n"
+     << "      bool Pred;\n"
+     << "      if (!(Pred = checkDecoderPredicate(PIdx, Bits)))\n"
+     << "        Ptr += NumToSkip;\n"
+     << "      (void)Pred;\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckPredicate(\" << PIdx "
+        "<< \"): \"\n"
+     << "            << (Pred ? \"PASS\\n\" : \"FAIL\\n\"));\n"
+     << "\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_Decode: {\n"
+     << "      unsigned Len;\n"
+     << "      // Decode the Opcode value.\n"
+     << "      unsigned Opc = decodeULEB128(++Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "\n"
+     << "      MI.clear();\n"
+     << "      MI.setOpcode(Opc);\n"
+     << "      bool DecodeComplete;\n";
+  if (IsVarLenInst) {
+    OS << "      Len = InstrLenTable[Opc];\n"
+       << "      makeUp(insn, Len);\n";
+  }
+  OS << "      S = decodeToMCInst(S, DecodeIdx, insn, MI, Address, DisAsm, "
+        "DecodeComplete);\n"
+     << "      assert(DecodeComplete);\n"
+     << "\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_Decode: opcode \" << Opc\n"
+     << "                   << \", using decoder \" << DecodeIdx << \": \"\n"
+     << "                   << (S != MCDisassembler::Fail ? \"PASS\" : "
+        "\"FAIL\") << \"\\n\");\n"
+     << "      return S;\n"
+     << "    }\n"
+     << "    case MCD::OPC_TryDecode: {\n"
+     << "      unsigned Len;\n"
+     << "      // Decode the Opcode value.\n"
+     << "      unsigned Opc = decodeULEB128(++Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      // NumToSkip is a plain 24-bit integer.\n"
+     << "      unsigned NumToSkip = *Ptr++;\n"
+     << "      NumToSkip |= (*Ptr++) << 8;\n"
+     << "      NumToSkip |= (*Ptr++) << 16;\n"
+     << "\n"
+     << "      // Perform the decode operation.\n"
+     << "      MCInst TmpMI;\n"
+     << "      TmpMI.setOpcode(Opc);\n"
+     << "      bool DecodeComplete;\n"
+     << "      S = decodeToMCInst(S, DecodeIdx, insn, TmpMI, Address, DisAsm, "
+        "DecodeComplete);\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_TryDecode: opcode \" << "
+        "Opc\n"
+     << "                   << \", using decoder \" << DecodeIdx << \": \");\n"
+     << "\n"
+     << "      if (DecodeComplete) {\n"
+     << "        // Decoding complete.\n"
+     << "        LLVM_DEBUG(dbgs() << (S != MCDisassembler::Fail ? \"PASS\" : "
+        "\"FAIL\") << \"\\n\");\n"
+     << "        MI = TmpMI;\n"
+     << "        return S;\n"
+     << "      } else {\n"
+     << "        assert(S == MCDisassembler::Fail);\n"
+     << "        // If the decoding was incomplete, skip.\n"
+     << "        Ptr += NumToSkip;\n"
+     << "        LLVM_DEBUG(dbgs() << \"FAIL: continuing at \" << (Ptr - "
+        "DecodeTable) << \"\\n\");\n"
+     << "        // Reset decode status. This also drops a SoftFail status "
+        "that could be\n"
+     << "        // set before the decode attempt.\n"
+     << "        S = MCDisassembler::Success;\n"
+     << "      }\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_SoftFail: {\n"
+     << "      // Decode the mask values.\n"
+     << "      unsigned Len;\n"
+     << "      uint64_t PositiveMask = decodeULEB128(++Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      uint64_t NegativeMask = decodeULEB128(Ptr, &Len);\n"
+     << "      Ptr += Len;\n"
+     << "      bool Fail = (insn & PositiveMask) != 0 || (~insn & "
+        "NegativeMask) != 0;\n"
+     << "      if (Fail)\n"
+     << "        S = MCDisassembler::SoftFail;\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_SoftFail: \" << (Fail ? "
+        "\"FAIL\\n\" : \"PASS\\n\"));\n"
+     << "      break;\n"
+     << "    }\n"
+     << "    case MCD::OPC_Fail: {\n"
+     << "      LLVM_DEBUG(dbgs() << Loc << \": OPC_Fail\\n\");\n"
+     << "      return MCDisassembler::Fail;\n"
+     << "    }\n"
+     << "    }\n"
+     << "  }\n"
+     << "  llvm_unreachable(\"bogosity detected in disassembler state "
+        "machine!\");\n"
+     << "}\n\n";
+}
+
+// Emits disassembler code for instruction decoding.
+void DecoderEmitter::run(raw_ostream &o) {
+  formatted_raw_ostream OS(o);
+  OS << "#include \"llvm/MC/MCInst.h\"\n";
+  OS << "#include \"llvm/MC/MCSubtargetInfo.h\"\n";
+  OS << "#include \"llvm/MC/SubtargetFeature.h\"\n";
+  OS << "#include \"llvm/Support/DataTypes.h\"\n";
+  OS << "#include \"llvm/Support/Debug.h\"\n";
+  OS << "#include \"llvm/Support/LEB128.h\"\n";
+  OS << "#include \"llvm/Support/raw_ostream.h\"\n";
+  OS << "#include <assert.h>\n";
+  OS << '\n';
+  OS << "namespace llvm {\n\n";
+
+  emitFieldFromInstruction(OS);
+  emitInsertBits(OS);
+
+  Target.reverseBitsForLittleEndianEncoding();
+
+  // Parameterize the decoders based on namespace and instruction width.
+  std::set<StringRef> HwModeNames;
+  const auto &NumberedInstructions = Target.getInstructionsByEnumValue();
+  NumberedEncodings.reserve(NumberedInstructions.size());
+  DenseMap<Record *, unsigned> IndexOfInstruction;
+  // First, collect all HwModes referenced by the target.
+  for (const auto &NumberedInstruction : NumberedInstructions) {
+    IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size();
+
+    if (const RecordVal *RV =
+            NumberedInstruction->TheDef->getValue("EncodingInfos")) {
+      if (auto *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
+        const CodeGenHwModes &HWM = Target.getHwModes();
+        EncodingInfoByHwMode EBM(DI->getDef(), HWM);
+        for (auto &KV : EBM)
+          HwModeNames.insert(HWM.getMode(KV.first).Name);
+      }
+    }
+  }
+
+  // If HwModeNames is empty, add the empty string so we always have one HwMode.
+  if (HwModeNames.empty())
+    HwModeNames.insert("");
+
+  for (const auto &NumberedInstruction : NumberedInstructions) {
+    IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size();
+
+    if (const RecordVal *RV =
+            NumberedInstruction->TheDef->getValue("EncodingInfos")) {
+      if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
+        const CodeGenHwModes &HWM = Target.getHwModes();
+        EncodingInfoByHwMode EBM(DI->getDef(), HWM);
+        for (auto &KV : EBM) {
+          NumberedEncodings.emplace_back(KV.second, NumberedInstruction,
+                                         HWM.getMode(KV.first).Name);
+          HwModeNames.insert(HWM.getMode(KV.first).Name);
+        }
+        continue;
+      }
+    }
+    // This instruction is encoded the same on all HwModes. Emit it for all
+    // HwModes.
+    for (StringRef HwModeName : HwModeNames)
+      NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
+                                     NumberedInstruction, HwModeName);
+  }
+  for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding"))
+    NumberedEncodings.emplace_back(
+        NumberedAlias,
+        &Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf")));
+
+  std::map<std::pair<std::string, unsigned>, std::vector<EncodingIDAndOpcode>>
+      OpcMap;
+  std::map<unsigned, std::vector<OperandInfo>> Operands;
+  std::vector<unsigned> InstrLen;
+
+  bool IsVarLenInst =
+      any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) {
+        RecordVal *RV = CGI->TheDef->getValue("Inst");
+        return RV && isa<DagInit>(RV->getValue());
+      });
+  unsigned MaxInstLen = 0;
+
+  for (unsigned i = 0; i < NumberedEncodings.size(); ++i) {
+    const Record *EncodingDef = NumberedEncodings[i].EncodingDef;
+    const CodeGenInstruction *Inst = NumberedEncodings[i].Inst;
+    const Record *Def = Inst->TheDef;
+    unsigned Size = EncodingDef->getValueAsInt("Size");
+    if (Def->getValueAsString("Namespace") == "TargetOpcode" ||
+        Def->getValueAsBit("isPseudo") ||
+        Def->getValueAsBit("isAsmParserOnly") ||
+        Def->getValueAsBit("isCodeGenOnly")) {
+      NumEncodingsLackingDisasm++;
+      continue;
+    }
+
+    if (i < NumberedInstructions.size())
+      NumInstructions++;
+    NumEncodings++;
+
+    if (!Size && !IsVarLenInst)
+      continue;
+
+    if (IsVarLenInst)
+      InstrLen.resize(NumberedInstructions.size(), 0);
+
+    if (unsigned Len = populateInstruction(Target, *EncodingDef, *Inst, i,
+                                           Operands, IsVarLenInst)) {
+      if (IsVarLenInst) {
+        MaxInstLen = std::max(MaxInstLen, Len);
+        InstrLen[i] = Len;
+      }
+      std::string DecoderNamespace =
+          std::string(EncodingDef->getValueAsString("DecoderNamespace"));
+      if (!NumberedEncodings[i].HwModeName.empty())
+        DecoderNamespace +=
+            std::string("_") + NumberedEncodings[i].HwModeName.str();
+      OpcMap[std::make_pair(DecoderNamespace, Size)].emplace_back(
+          i, IndexOfInstruction.find(Def)->second);
+    } else {
+      NumEncodingsOmitted++;
+    }
+  }
+
+  DecoderTableInfo TableInfo;
+  for (const auto &Opc : OpcMap) {
+    // Emit the decoder for this namespace+width combination.
+    ArrayRef<EncodingAndInst> NumberedEncodingsRef(
+        NumberedEncodings.data(), NumberedEncodings.size());
+    FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands,
+                     IsVarLenInst ? MaxInstLen : 8 * Opc.first.second, this);
+
+    // The decode table is cleared for each top level decoder function. The
+    // predicates and decoders themselves, however, are shared across all
+    // decoders to give more opportunities for uniqueing.
+    TableInfo.Table.clear();
+    TableInfo.FixupStack.clear();
+    TableInfo.Table.reserve(16384);
+    TableInfo.FixupStack.emplace_back();
+    FC.emitTableEntries(TableInfo);
+    // Any NumToSkip fixups in the top level scope can resolve to the
+    // OPC_Fail at the end of the table.
+    assert(TableInfo.FixupStack.size() == 1 && "fixup stack phasing error!");
+    // Resolve any NumToSkip fixups in the current scope.
+    resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(),
+                       TableInfo.Table.size());
+    TableInfo.FixupStack.clear();
+
+    TableInfo.Table.push_back(MCD::OPC_Fail);
+
+    // Print the table to the output stream.
+    emitTable(OS, TableInfo.Table, 0, FC.getBitWidth(), Opc.first.first);
+    OS.flush();
+  }
+
+  // For variable length instructions, we emit an instruction length table
+  // to let the decoder know how long the instructions are.
+  // You can see example usage in M68k's disassembler.
+  if (IsVarLenInst)
+    emitInstrLenTable(OS, InstrLen);
+  // Emit the predicate function.
+  emitPredicateFunction(OS, TableInfo.Predicates, 0);
+
+  // Emit the decoder function.
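+  // (TableInfo.Predicates and TableInfo.Decoders were accumulated across all
+  // of the tables emitted above, so the indices embedded in those tables
+  // remain valid for the shared functions emitted here.)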
+ emitDecoderFunction(OS, TableInfo.Decoders, 0); + + // Emit the main entry point for the decoder, decodeInstruction(). + emitDecodeInstruction(OS, IsVarLenInst); + + OS << "\n} // end namespace llvm\n"; +} + +namespace llvm { + +void EmitDecoder(RecordKeeper &RK, raw_ostream &OS, + const std::string &PredicateNamespace, + const std::string &GPrefix, const std::string &GPostfix, + const std::string &ROK, const std::string &RFail, + const std::string &L) { + DecoderEmitter(RK, PredicateNamespace, GPrefix, GPostfix, ROK, RFail, L) + .run(OS); +} + +} // end namespace llvm diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/DirectiveEmitter.cpp index b21bf369d18e..f3751591f3d9 100644 --- a/llvm/utils/TableGen/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/DirectiveEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/StringSet.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" using namespace llvm; @@ -368,8 +367,7 @@ void GenerateCaseForVersionedClauses(const std::vector &Clauses, const auto ClauseFormattedName = VerClause.getClause().getFormattedName(); - if (Cases.find(ClauseFormattedName) == Cases.end()) { - Cases.insert(ClauseFormattedName); + if (Cases.insert(ClauseFormattedName).second) { OS << " case " << DirLang.getClausePrefix() << ClauseFormattedName << ":\n"; OS << " return " << VerClause.getMinVersion() diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp index 7c3f53b31bf4..297d12c5d0e9 100644 --- a/llvm/utils/TableGen/DisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp @@ -95,12 +95,11 @@ using namespace llvm::X86Disassembler; namespace llvm { -extern void EmitFixedLenDecoder(RecordKeeper &RK, raw_ostream &OS, - const std::string &PredicateNamespace, - const std::string &GPrefix, - const std::string &GPostfix, - const std::string &ROK, - const std::string &RFail, const std::string &L); +extern void EmitDecoder(RecordKeeper &RK, raw_ostream &OS, + const std::string &PredicateNamespace, + const std::string &GPrefix, const std::string &GPostfix, + const std::string &ROK, const std::string &RFail, + const std::string &L); void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { CodeGenTarget Target(Records); @@ -140,17 +139,16 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { if (PredicateNamespace == "Thumb") PredicateNamespace = "ARM"; - EmitFixedLenDecoder(Records, OS, PredicateNamespace, - "if (!Check(S, ", "))", - "S", "MCDisassembler::Fail", - " MCDisassembler::DecodeStatus S = " - "MCDisassembler::Success;\n(void)S;"); + EmitDecoder(Records, OS, PredicateNamespace, "if (!Check(S, ", "))", "S", + "MCDisassembler::Fail", + " MCDisassembler::DecodeStatus S = " + "MCDisassembler::Success;\n(void)S;"); return; } - EmitFixedLenDecoder(Records, OS, std::string(Target.getName()), "if (", - " == MCDisassembler::Fail)", "MCDisassembler::Success", - "MCDisassembler::Fail", ""); + EmitDecoder(Records, OS, std::string(Target.getName()), "if (", + " == MCDisassembler::Fail)", "MCDisassembler::Success", + "MCDisassembler::Fail", ""); } } // end namespace llvm diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp index 77654cbc92fd..bc8ccdac557b 100644 --- a/llvm/utils/TableGen/ExegesisEmitter.cpp +++ b/llvm/utils/TableGen/ExegesisEmitter.cpp @@ -13,15 +13,11 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" -#include 
"llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include #include -#include #include #include #include diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index ac9fe6db4328..49c2ead468e3 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -17,8 +17,8 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp b/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp deleted file mode 100644 index c5dd1e626696..000000000000 --- a/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp +++ /dev/null @@ -1,2560 +0,0 @@ -//===------------ FixedLenDecoderEmitter.cpp - Decoder Generator ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// It contains the tablegen backend that emits the decoder functions for -// targets with fixed length instruction set. -// -//===----------------------------------------------------------------------===// - -#include "CodeGenInstruction.h" -#include "CodeGenTarget.h" -#include "InfoByHwMode.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/CachedHashString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/LEB128.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Record.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "decoder-emitter" - -namespace { - -STATISTIC(NumEncodings, "Number of encodings considered"); -STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); -STATISTIC(NumInstructions, "Number of instructions considered"); -STATISTIC(NumEncodingsSupported, "Number of encodings supported"); -STATISTIC(NumEncodingsOmitted, "Number of encodings omitted"); - -struct EncodingField { - unsigned Base, Width, Offset; - EncodingField(unsigned B, unsigned W, unsigned O) - : Base(B), Width(W), Offset(O) { } -}; - -struct OperandInfo { - std::vector Fields; - std::string Decoder; - bool HasCompleteDecoder; - uint64_t InitValue; - - OperandInfo(std::string D, bool HCD) - : Decoder(std::move(D)), HasCompleteDecoder(HCD), InitValue(0) {} - - void addField(unsigned Base, unsigned Width, unsigned Offset) { - Fields.push_back(EncodingField(Base, Width, Offset)); - } - - unsigned numFields() const { return Fields.size(); } - - typedef std::vector::const_iterator const_iterator; - - 
const_iterator begin() const { return Fields.begin(); } - const_iterator end() const { return Fields.end(); } -}; - -typedef std::vector DecoderTable; -typedef uint32_t DecoderFixup; -typedef std::vector FixupList; -typedef std::vector FixupScopeList; -typedef SmallSetVector PredicateSet; -typedef SmallSetVector DecoderSet; -struct DecoderTableInfo { - DecoderTable Table; - FixupScopeList FixupStack; - PredicateSet Predicates; - DecoderSet Decoders; -}; - -struct EncodingAndInst { - const Record *EncodingDef; - const CodeGenInstruction *Inst; - StringRef HwModeName; - - EncodingAndInst(const Record *EncodingDef, const CodeGenInstruction *Inst, - StringRef HwModeName = "") - : EncodingDef(EncodingDef), Inst(Inst), HwModeName(HwModeName) {} -}; - -struct EncodingIDAndOpcode { - unsigned EncodingID; - unsigned Opcode; - - EncodingIDAndOpcode() : EncodingID(0), Opcode(0) {} - EncodingIDAndOpcode(unsigned EncodingID, unsigned Opcode) - : EncodingID(EncodingID), Opcode(Opcode) {} -}; - -raw_ostream &operator<<(raw_ostream &OS, const EncodingAndInst &Value) { - if (Value.EncodingDef != Value.Inst->TheDef) - OS << Value.EncodingDef->getName() << ":"; - OS << Value.Inst->TheDef->getName(); - return OS; -} - -class FixedLenDecoderEmitter { - RecordKeeper &RK; - std::vector NumberedEncodings; - -public: - // Defaults preserved here for documentation, even though they aren't - // strictly necessary given the way that this is currently being called. - FixedLenDecoderEmitter(RecordKeeper &R, std::string PredicateNamespace, - std::string GPrefix = "if (", - std::string GPostfix = " == MCDisassembler::Fail)", - std::string ROK = "MCDisassembler::Success", - std::string RFail = "MCDisassembler::Fail", - std::string L = "") - : RK(R), Target(R), PredicateNamespace(std::move(PredicateNamespace)), - GuardPrefix(std::move(GPrefix)), GuardPostfix(std::move(GPostfix)), - ReturnOK(std::move(ROK)), ReturnFail(std::move(RFail)), - Locals(std::move(L)) {} - - // Emit the decoder state machine table. - void emitTable(formatted_raw_ostream &o, DecoderTable &Table, - unsigned Indentation, unsigned BitWidth, - StringRef Namespace) const; - void emitPredicateFunction(formatted_raw_ostream &OS, - PredicateSet &Predicates, - unsigned Indentation) const; - void emitDecoderFunction(formatted_raw_ostream &OS, - DecoderSet &Decoders, - unsigned Indentation) const; - - // run - Output the code emitter - void run(raw_ostream &o); - -private: - CodeGenTarget Target; - -public: - std::string PredicateNamespace; - std::string GuardPrefix, GuardPostfix; - std::string ReturnOK, ReturnFail; - std::string Locals; -}; - -} // end anonymous namespace - -// The set (BIT_TRUE, BIT_FALSE, BIT_UNSET) represents a ternary logic system -// for a bit value. -// -// BIT_UNFILTERED is used as the init value for a filter position. It is used -// only for filter processings. -typedef enum { - BIT_TRUE, // '1' - BIT_FALSE, // '0' - BIT_UNSET, // '?' - BIT_UNFILTERED // unfiltered -} bit_value_t; - -static bool ValueSet(bit_value_t V) { - return (V == BIT_TRUE || V == BIT_FALSE); -} - -static bool ValueNotSet(bit_value_t V) { - return (V == BIT_UNSET); -} - -static int Value(bit_value_t V) { - return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1); -} - -static bit_value_t bitFromBits(const BitsInit &bits, unsigned index) { - if (BitInit *bit = dyn_cast(bits.getBit(index))) - return bit->getValue() ? BIT_TRUE : BIT_FALSE; - - // The bit is uninitialized. - return BIT_UNSET; -} - -// Prints the bit value for each position. 
-static void dumpBits(raw_ostream &o, const BitsInit &bits) { - for (unsigned index = bits.getNumBits(); index > 0; --index) { - switch (bitFromBits(bits, index - 1)) { - case BIT_TRUE: - o << "1"; - break; - case BIT_FALSE: - o << "0"; - break; - case BIT_UNSET: - o << "_"; - break; - default: - llvm_unreachable("unexpected return value from bitFromBits"); - } - } -} - -static BitsInit &getBitsField(const Record &def, StringRef str) { - BitsInit *bits = def.getValueAsBitsInit(str); - return *bits; -} - -// Representation of the instruction to work on. -typedef std::vector insn_t; - -namespace { - -static const uint64_t NO_FIXED_SEGMENTS_SENTINEL = -1ULL; - -class FilterChooser; - -/// Filter - Filter works with FilterChooser to produce the decoding tree for -/// the ISA. -/// -/// It is useful to think of a Filter as governing the switch stmts of the -/// decoding tree in a certain level. Each case stmt delegates to an inferior -/// FilterChooser to decide what further decoding logic to employ, or in another -/// words, what other remaining bits to look at. The FilterChooser eventually -/// chooses a best Filter to do its job. -/// -/// This recursive scheme ends when the number of Opcodes assigned to the -/// FilterChooser becomes 1 or if there is a conflict. A conflict happens when -/// the Filter/FilterChooser combo does not know how to distinguish among the -/// Opcodes assigned. -/// -/// An example of a conflict is -/// -/// Conflict: -/// 111101000.00........00010000.... -/// 111101000.00........0001........ -/// 1111010...00........0001........ -/// 1111010...00.................... -/// 1111010......................... -/// 1111............................ -/// ................................ -/// VST4q8a 111101000_00________00010000____ -/// VST4q8b 111101000_00________00010000____ -/// -/// The Debug output shows the path that the decoding tree follows to reach the -/// the conclusion that there is a conflict. VST4q8a is a vst4 to double-spaced -/// even registers, while VST4q8b is a vst4 to double-spaced odd registers. -/// -/// The encoding info in the .td files does not specify this meta information, -/// which could have been used by the decoder to resolve the conflict. The -/// decoder could try to decode the even/odd register numbering and assign to -/// VST4q8a or VST4q8b, but for the time being, the decoder chooses the "a" -/// version and return the Opcode since the two have the same Asm format string. -class Filter { -protected: - const FilterChooser *Owner;// points to the FilterChooser who owns this filter - unsigned StartBit; // the starting bit position - unsigned NumBits; // number of bits to filter - bool Mixed; // a mixed region contains both set and unset bits - - // Map of well-known segment value to the set of uid's with that value. - std::map> - FilteredInstructions; - - // Set of uid's with non-constant segment values. - std::vector VariableInstructions; - - // Map of well-known segment value to its delegate. - std::map> FilterChooserMap; - - // Number of instructions which fall under FilteredInstructions category. - unsigned NumFiltered; - - // Keeps track of the last opcode in the filtered bucket. 
- EncodingIDAndOpcode LastOpcFiltered; - -public: - Filter(Filter &&f); - Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, bool mixed); - - ~Filter() = default; - - unsigned getNumFiltered() const { return NumFiltered; } - - EncodingIDAndOpcode getSingletonOpc() const { - assert(NumFiltered == 1); - return LastOpcFiltered; - } - - // Return the filter chooser for the group of instructions without constant - // segment values. - const FilterChooser &getVariableFC() const { - assert(NumFiltered == 1); - assert(FilterChooserMap.size() == 1); - return *(FilterChooserMap.find(NO_FIXED_SEGMENTS_SENTINEL)->second); - } - - // Divides the decoding task into sub tasks and delegates them to the - // inferior FilterChooser's. - // - // A special case arises when there's only one entry in the filtered - // instructions. In order to unambiguously decode the singleton, we need to - // match the remaining undecoded encoding bits against the singleton. - void recurse(); - - // Emit table entries to decode instructions given a segment or segments of - // bits. - void emitTableEntry(DecoderTableInfo &TableInfo) const; - - // Returns the number of fanout produced by the filter. More fanout implies - // the filter distinguishes more categories of instructions. - unsigned usefulness() const; -}; // end class Filter - -} // end anonymous namespace - -// These are states of our finite state machines used in FilterChooser's -// filterProcessor() which produces the filter candidates to use. -typedef enum { - ATTR_NONE, - ATTR_FILTERED, - ATTR_ALL_SET, - ATTR_ALL_UNSET, - ATTR_MIXED -} bitAttr_t; - -/// FilterChooser - FilterChooser chooses the best filter among a set of Filters -/// in order to perform the decoding of instructions at the current level. -/// -/// Decoding proceeds from the top down. Based on the well-known encoding bits -/// of instructions available, FilterChooser builds up the possible Filters that -/// can further the task of decoding by distinguishing among the remaining -/// candidate instructions. -/// -/// Once a filter has been chosen, it is called upon to divide the decoding task -/// into sub-tasks and delegates them to its inferior FilterChoosers for further -/// processings. -/// -/// It is useful to think of a Filter as governing the switch stmts of the -/// decoding tree. And each case is delegated to an inferior FilterChooser to -/// decide what further remaining bits to look at. -namespace { - -class FilterChooser { -protected: - friend class Filter; - - // Vector of codegen instructions to choose our filter. - ArrayRef AllInstructions; - - // Vector of uid's for this filter chooser to work on. - // The first member of the pair is the opcode id being decoded, the second is - // the opcode id that should be emitted. - const std::vector &Opcodes; - - // Lookup table for the operand decoding of instructions. - const std::map> &Operands; - - // Vector of candidate filters. - std::vector Filters; - - // Array of bit values passed down from our parent. - // Set to all BIT_UNFILTERED's for Parent == NULL. - std::vector FilterBitValues; - - // Links to the FilterChooser above us in the decoding tree. - const FilterChooser *Parent; - - // Index of the best filter from Filters. 
-  int BestIndex;
-
-  // Width of instructions
-  unsigned BitWidth;
-
-  // Parent emitter
-  const FixedLenDecoderEmitter *Emitter;
-
-public:
-  FilterChooser(ArrayRef<EncodingAndInst> Insts,
-                const std::vector<EncodingIDAndOpcode> &IDs,
-                const std::map<unsigned, std::vector<OperandInfo>> &Ops,
-                unsigned BW, const FixedLenDecoderEmitter *E)
-      : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
-        FilterBitValues(BW, BIT_UNFILTERED), Parent(nullptr), BestIndex(-1),
-        BitWidth(BW), Emitter(E) {
-    doFilter();
-  }
-
-  FilterChooser(ArrayRef<EncodingAndInst> Insts,
-                const std::vector<EncodingIDAndOpcode> &IDs,
-                const std::map<unsigned, std::vector<OperandInfo>> &Ops,
-                const std::vector<bit_value_t> &ParentFilterBitValues,
-                const FilterChooser &parent)
-      : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
-        FilterBitValues(ParentFilterBitValues), Parent(&parent), BestIndex(-1),
-        BitWidth(parent.BitWidth), Emitter(parent.Emitter) {
-    doFilter();
-  }
-
-  FilterChooser(const FilterChooser &) = delete;
-  void operator=(const FilterChooser &) = delete;
-
-  unsigned getBitWidth() const { return BitWidth; }
-
-protected:
-  // Populates the insn given the uid.
-  void insnWithID(insn_t &Insn, unsigned Opcode) const {
-    BitsInit &Bits = getBitsField(*AllInstructions[Opcode].EncodingDef, "Inst");
-
-    // We may have a SoftFail bitmask, which specifies a mask where an encoding
-    // may differ from the value in "Inst" and yet still be valid, but the
-    // disassembler should return SoftFail instead of Success.
-    //
-    // This is used for marking UNPREDICTABLE instructions in the ARM world.
-    BitsInit *SFBits =
-        AllInstructions[Opcode].EncodingDef->getValueAsBitsInit("SoftFail");
-
-    for (unsigned i = 0; i < BitWidth; ++i) {
-      if (SFBits && bitFromBits(*SFBits, i) == BIT_TRUE)
-        Insn.push_back(BIT_UNSET);
-      else
-        Insn.push_back(bitFromBits(Bits, i));
-    }
-  }
-
-  // Emit the name of the encoding/instruction pair.
-  void emitNameWithID(raw_ostream &OS, unsigned Opcode) const {
-    const Record *EncodingDef = AllInstructions[Opcode].EncodingDef;
-    const Record *InstDef = AllInstructions[Opcode].Inst->TheDef;
-    if (EncodingDef != InstDef)
-      OS << EncodingDef->getName() << ":";
-    OS << InstDef->getName();
-  }
-
-  // Populates the field of the insn given the start position and the number of
-  // consecutive bits to scan for.
-  //
-  // Returns false if there exists any uninitialized bit value in the range.
-  // Returns true, otherwise.
-  bool fieldFromInsn(uint64_t &Field, insn_t &Insn, unsigned StartBit,
-                     unsigned NumBits) const;
-
-  /// dumpFilterArray - dumpFilterArray prints out debugging info for the given
-  /// filter array as a series of chars.
-  void dumpFilterArray(raw_ostream &o,
-                       const std::vector<bit_value_t> &filter) const;
-
-  /// dumpStack - dumpStack traverses the filter chooser chain and calls
-  /// dumpFilterArray on each filter chooser up to the top level one.
-  void dumpStack(raw_ostream &o, const char *prefix) const;
-
-  Filter &bestFilter() {
-    assert(BestIndex != -1 && "BestIndex not set");
-    return Filters[BestIndex];
-  }
-
-  bool PositionFiltered(unsigned i) const {
-    return ValueSet(FilterBitValues[i]);
-  }
-
-  // Calculates the island(s) needed to decode the instruction.
-  // This returns a list of undecoded bits of an instruction, for example,
-  // Inst{20} = 1 && Inst{3-0} == 0b1111 represents two islands of yet-to-be
-  // decoded bits in order to verify that the instruction matches the Opcode.
-  unsigned getIslands(std::vector<unsigned> &StartBits,
-                      std::vector<unsigned> &EndBits,
-                      std::vector<uint64_t> &FieldVals,
-                      const insn_t &Insn) const;
-
-  // Emits code to check that the Predicates of an instruction are true.
- // Returns true if predicate matches were emitted, false otherwise. - bool emitPredicateMatch(raw_ostream &o, unsigned &Indentation, - unsigned Opc) const; - - bool doesOpcodeNeedPredicate(unsigned Opc) const; - unsigned getPredicateIndex(DecoderTableInfo &TableInfo, StringRef P) const; - void emitPredicateTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; - - void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; - - // Emits table entries to decode the singleton. - void emitSingletonTableEntry(DecoderTableInfo &TableInfo, - EncodingIDAndOpcode Opc) const; - - // Emits code to decode the singleton, and then to decode the rest. - void emitSingletonTableEntry(DecoderTableInfo &TableInfo, - const Filter &Best) const; - - void emitBinaryParser(raw_ostream &o, unsigned &Indentation, - const OperandInfo &OpInfo, - bool &OpHasCompleteDecoder) const; - - void emitDecoder(raw_ostream &OS, unsigned Indentation, unsigned Opc, - bool &HasCompleteDecoder) const; - unsigned getDecoderIndex(DecoderSet &Decoders, unsigned Opc, - bool &HasCompleteDecoder) const; - - // Assign a single filter and run with it. - void runSingleFilter(unsigned startBit, unsigned numBit, bool mixed); - - // reportRegion is a helper function for filterProcessor to mark a region as - // eligible for use as a filter region. - void reportRegion(bitAttr_t RA, unsigned StartBit, unsigned BitIndex, - bool AllowMixed); - - // FilterProcessor scans the well-known encoding bits of the instructions and - // builds up a list of candidate filters. It chooses the best filter and - // recursively descends down the decoding tree. - bool filterProcessor(bool AllowMixed, bool Greedy = true); - - // Decides on the best configuration of filter(s) to use in order to decode - // the instructions. A conflict of instructions may occur, in which case we - // dump the conflict set to the standard error. - void doFilter(); - -public: - // emitTableEntries - Emit state machine entries to decode our share of - // instructions. - void emitTableEntries(DecoderTableInfo &TableInfo) const; -}; - -} // end anonymous namespace - -/////////////////////////// -// // -// Filter Implementation // -// // -/////////////////////////// - -Filter::Filter(Filter &&f) - : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), - FilteredInstructions(std::move(f.FilteredInstructions)), - VariableInstructions(std::move(f.VariableInstructions)), - FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered), - LastOpcFiltered(f.LastOpcFiltered) { -} - -Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, - bool mixed) - : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { - assert(StartBit + NumBits - 1 < Owner->BitWidth); - - NumFiltered = 0; - LastOpcFiltered = {0, 0}; - - for (unsigned i = 0, e = Owner->Opcodes.size(); i != e; ++i) { - insn_t Insn; - - // Populates the insn given the uid. - Owner->insnWithID(Insn, Owner->Opcodes[i].EncodingID); - - uint64_t Field; - // Scans the segment for possibly well-specified encoding bits. - bool ok = Owner->fieldFromInsn(Field, Insn, StartBit, NumBits); - - if (ok) { - // The encoding bits are well-known. Lets add the uid of the - // instruction into the bucket keyed off the constant field value. - LastOpcFiltered = Owner->Opcodes[i]; - FilteredInstructions[Field].push_back(LastOpcFiltered); - ++NumFiltered; - } else { - // Some of the encoding bit(s) are unspecified. 
This contributes to
-      // one additional member of "Variable" instructions.
-      VariableInstructions.push_back(Owner->Opcodes[i]);
-    }
-  }
-
-  assert((FilteredInstructions.size() + VariableInstructions.size() > 0)
-         && "Filter returns no instruction categories");
-}
-
-// Divides the decoding task into sub tasks and delegates them to the
-// inferior FilterChooser's.
-//
-// A special case arises when there's only one entry in the filtered
-// instructions. In order to unambiguously decode the singleton, we need to
-// match the remaining undecoded encoding bits against the singleton.
-void Filter::recurse() {
-  // Starts by inheriting our parent filter chooser's filter bit values.
-  std::vector<bit_value_t> BitValueArray(Owner->FilterBitValues);
-
-  if (!VariableInstructions.empty()) {
-    // Conservatively marks each segment position as BIT_UNSET.
-    for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex)
-      BitValueArray[StartBit + bitIndex] = BIT_UNSET;
-
-    // Delegates to an inferior filter chooser for further processing on this
-    // group of instructions whose segment values are variable.
-    FilterChooserMap.insert(std::make_pair(NO_FIXED_SEGMENTS_SENTINEL,
-        std::make_unique<FilterChooser>(Owner->AllInstructions,
-            VariableInstructions, Owner->Operands, BitValueArray, *Owner)));
-  }
-
-  // No need to recurse for a singleton filtered instruction.
-  // See also Filter::emit*().
-  if (getNumFiltered() == 1) {
-    assert(FilterChooserMap.size() == 1);
-    return;
-  }
-
-  // Otherwise, create sub choosers.
-  for (const auto &Inst : FilteredInstructions) {
-
-    // Marks all the segment positions with either BIT_TRUE or BIT_FALSE.
-    for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) {
-      if (Inst.first & (1ULL << bitIndex))
-        BitValueArray[StartBit + bitIndex] = BIT_TRUE;
-      else
-        BitValueArray[StartBit + bitIndex] = BIT_FALSE;
-    }
-
-    // Delegates to an inferior filter chooser for further processing on this
-    // category of instructions.
-    FilterChooserMap.insert(std::make_pair(
-        Inst.first, std::make_unique<FilterChooser>(
-                        Owner->AllInstructions, Inst.second,
-                        Owner->Operands, BitValueArray, *Owner)));
-  }
-}
-
-static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups,
-                               uint32_t DestIdx) {
-  // Any NumToSkip fixups in the current scope can resolve to the
-  // current location.
-  for (FixupList::const_reverse_iterator I = Fixups.rbegin(),
-                                         E = Fixups.rend();
-       I != E; ++I) {
-    // Calculate the distance from the byte following the fixup entry byte
-    // to the destination. The target is calculated from after the 24-bit
-    // NumToSkip entry itself, so subtract three from the displacement here
-    // to account for that.
-    uint32_t FixupIdx = *I;
-    uint32_t Delta = DestIdx - FixupIdx - 3;
-    // Our NumToSkip entries are 24-bits. Make sure our table isn't too
-    // big.
-    assert(Delta < (1u << 24));
-    Table[FixupIdx] = (uint8_t)Delta;
-    Table[FixupIdx + 1] = (uint8_t)(Delta >> 8);
-    Table[FixupIdx + 2] = (uint8_t)(Delta >> 16);
-  }
-}
-
-// Emit table entries to decode instructions given a segment or segments
-// of bits.
-void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
-  TableInfo.Table.push_back(MCD::OPC_ExtractField);
-  TableInfo.Table.push_back(StartBit);
-  TableInfo.Table.push_back(NumBits);
-
-  // A new filter entry begins a new scope for fixup resolution.
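-  // (Illustrative sketch of the byte layout appended by the code below,
-  // derived from the code rather than the original comments: each filter case
-  // is emitted as
-  //   OPC_FilterValue, <ULEB128 value>, skip0, skip1, skip2
-  // where the three skip bytes start out as zeros and are later backpatched
-  // with the 24-bit little-endian NumToSkip by resolveTableFixups().)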
- TableInfo.FixupStack.emplace_back(); - - DecoderTable &Table = TableInfo.Table; - - size_t PrevFilter = 0; - bool HasFallthrough = false; - for (auto &Filter : FilterChooserMap) { - // Field value -1 implies a non-empty set of variable instructions. - // See also recurse(). - if (Filter.first == NO_FIXED_SEGMENTS_SENTINEL) { - HasFallthrough = true; - - // Each scope should always have at least one filter value to check - // for. - assert(PrevFilter != 0 && "empty filter set!"); - FixupList &CurScope = TableInfo.FixupStack.back(); - // Resolve any NumToSkip fixups in the current scope. - resolveTableFixups(Table, CurScope, Table.size()); - CurScope.clear(); - PrevFilter = 0; // Don't re-process the filter's fallthrough. - } else { - Table.push_back(MCD::OPC_FilterValue); - // Encode and emit the value to filter against. - uint8_t Buffer[16]; - unsigned Len = encodeULEB128(Filter.first, Buffer); - Table.insert(Table.end(), Buffer, Buffer + Len); - // Reserve space for the NumToSkip entry. We'll backpatch the value - // later. - PrevFilter = Table.size(); - Table.push_back(0); - Table.push_back(0); - Table.push_back(0); - } - - // We arrive at a category of instructions with the same segment value. - // Now delegate to the sub filter chooser for further decodings. - // The case may fallthrough, which happens if the remaining well-known - // encoding bits do not match exactly. - Filter.second->emitTableEntries(TableInfo); - - // Now that we've emitted the body of the handler, update the NumToSkip - // of the filter itself to be able to skip forward when false. Subtract - // two as to account for the width of the NumToSkip field itself. - if (PrevFilter) { - uint32_t NumToSkip = Table.size() - PrevFilter - 3; - assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!"); - Table[PrevFilter] = (uint8_t)NumToSkip; - Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); - Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); - } - } - - // Any remaining unresolved fixups bubble up to the parent fixup scope. - assert(TableInfo.FixupStack.size() > 1 && "fixup stack underflow!"); - FixupScopeList::iterator Source = TableInfo.FixupStack.end() - 1; - FixupScopeList::iterator Dest = Source - 1; - llvm::append_range(*Dest, *Source); - TableInfo.FixupStack.pop_back(); - - // If there is no fallthrough, then the final filter should get fixed - // up according to the enclosing scope rather than the current position. - if (!HasFallthrough) - TableInfo.FixupStack.back().push_back(PrevFilter); -} - -// Returns the number of fanout produced by the filter. More fanout implies -// the filter distinguishes more categories of instructions. -unsigned Filter::usefulness() const { - if (!VariableInstructions.empty()) - return FilteredInstructions.size(); - else - return FilteredInstructions.size() + 1; -} - -////////////////////////////////// -// // -// Filterchooser Implementation // -// // -////////////////////////////////// - -// Emit the decoder state machine table. -void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS, - DecoderTable &Table, - unsigned Indentation, - unsigned BitWidth, - StringRef Namespace) const { - OS.indent(Indentation) << "static const uint8_t DecoderTable" << Namespace - << BitWidth << "[] = {\n"; - - Indentation += 2; - - // FIXME: We may be able to use the NumToSkip values to recover - // appropriate indentation levels. 
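-  // (Illustrative, assuming a small example table: the emitted output might
-  // begin like
-  //   /* 0 */   MCD::OPC_ExtractField, 25, 3,  // Inst{27-25} ...
-  //   /* 3 */   MCD::OPC_FilterValue, 0, 4, 0, 0, // Skip to: 12
-  // with the leading byte offsets produced by the loop below.)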
- DecoderTable::const_iterator I = Table.begin(); - DecoderTable::const_iterator E = Table.end(); - while (I != E) { - assert (I < E && "incomplete decode table entry!"); - - uint64_t Pos = I - Table.begin(); - OS << "/* " << Pos << " */"; - OS.PadToColumn(12); - - switch (*I) { - default: - PrintFatalError("invalid decode table opcode"); - case MCD::OPC_ExtractField: { - ++I; - unsigned Start = *I++; - unsigned Len = *I++; - OS.indent(Indentation) << "MCD::OPC_ExtractField, " << Start << ", " - << Len << ", // Inst{"; - if (Len > 1) - OS << (Start + Len - 1) << "-"; - OS << Start << "} ...\n"; - break; - } - case MCD::OPC_FilterValue: { - ++I; - OS.indent(Indentation) << "MCD::OPC_FilterValue, "; - // The filter value is ULEB128 encoded. - while (*I >= 128) - OS << (unsigned)*I++ << ", "; - OS << (unsigned)*I++ << ", "; - - // 24-bit numtoskip value. - uint8_t Byte = *I++; - uint32_t NumToSkip = Byte; - OS << (unsigned)Byte << ", "; - Byte = *I++; - OS << (unsigned)Byte << ", "; - NumToSkip |= Byte << 8; - Byte = *I++; - OS << utostr(Byte) << ", "; - NumToSkip |= Byte << 16; - OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; - break; - } - case MCD::OPC_CheckField: { - ++I; - unsigned Start = *I++; - unsigned Len = *I++; - OS.indent(Indentation) << "MCD::OPC_CheckField, " << Start << ", " - << Len << ", ";// << Val << ", " << NumToSkip << ",\n"; - // ULEB128 encoded field value. - for (; *I >= 128; ++I) - OS << (unsigned)*I << ", "; - OS << (unsigned)*I++ << ", "; - // 24-bit numtoskip value. - uint8_t Byte = *I++; - uint32_t NumToSkip = Byte; - OS << (unsigned)Byte << ", "; - Byte = *I++; - OS << (unsigned)Byte << ", "; - NumToSkip |= Byte << 8; - Byte = *I++; - OS << utostr(Byte) << ", "; - NumToSkip |= Byte << 16; - OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; - break; - } - case MCD::OPC_CheckPredicate: { - ++I; - OS.indent(Indentation) << "MCD::OPC_CheckPredicate, "; - for (; *I >= 128; ++I) - OS << (unsigned)*I << ", "; - OS << (unsigned)*I++ << ", "; - - // 24-bit numtoskip value. - uint8_t Byte = *I++; - uint32_t NumToSkip = Byte; - OS << (unsigned)Byte << ", "; - Byte = *I++; - OS << (unsigned)Byte << ", "; - NumToSkip |= Byte << 8; - Byte = *I++; - OS << utostr(Byte) << ", "; - NumToSkip |= Byte << 16; - OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; - break; - } - case MCD::OPC_Decode: - case MCD::OPC_TryDecode: { - bool IsTry = *I == MCD::OPC_TryDecode; - ++I; - // Extract the ULEB128 encoded Opcode to a buffer. - uint8_t Buffer[16], *p = Buffer; - while ((*p++ = *I++) >= 128) - assert((p - Buffer) <= (ptrdiff_t)sizeof(Buffer) - && "ULEB128 value too large!"); - // Decode the Opcode value. - unsigned Opc = decodeULEB128(Buffer); - OS.indent(Indentation) << "MCD::OPC_" << (IsTry ? "Try" : "") - << "Decode, "; - for (p = Buffer; *p >= 128; ++p) - OS << (unsigned)*p << ", "; - OS << (unsigned)*p << ", "; - - // Decoder index. - for (; *I >= 128; ++I) - OS << (unsigned)*I << ", "; - OS << (unsigned)*I++ << ", "; - - if (!IsTry) { - OS << "// Opcode: " << NumberedEncodings[Opc] << "\n"; - break; - } - - // Fallthrough for OPC_TryDecode. - - // 24-bit numtoskip value. 
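-      // (Illustrative: the three bytes below form a little-endian 24-bit
-      // value, NumToSkip = B0 | (B1 << 8) | (B2 << 16); e.g. the byte triple
-      // {5, 1, 0} decodes to 5 + 256 = 261.)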
- uint8_t Byte = *I++; - uint32_t NumToSkip = Byte; - OS << (unsigned)Byte << ", "; - Byte = *I++; - OS << (unsigned)Byte << ", "; - NumToSkip |= Byte << 8; - Byte = *I++; - OS << utostr(Byte) << ", "; - NumToSkip |= Byte << 16; - - OS << "// Opcode: " << NumberedEncodings[Opc] - << ", skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; - break; - } - case MCD::OPC_SoftFail: { - ++I; - OS.indent(Indentation) << "MCD::OPC_SoftFail"; - // Positive mask - uint64_t Value = 0; - unsigned Shift = 0; - do { - OS << ", " << (unsigned)*I; - Value += (*I & 0x7f) << Shift; - Shift += 7; - } while (*I++ >= 128); - if (Value > 127) { - OS << " /* 0x"; - OS.write_hex(Value); - OS << " */"; - } - // Negative mask - Value = 0; - Shift = 0; - do { - OS << ", " << (unsigned)*I; - Value += (*I & 0x7f) << Shift; - Shift += 7; - } while (*I++ >= 128); - if (Value > 127) { - OS << " /* 0x"; - OS.write_hex(Value); - OS << " */"; - } - OS << ",\n"; - break; - } - case MCD::OPC_Fail: { - ++I; - OS.indent(Indentation) << "MCD::OPC_Fail,\n"; - break; - } - } - } - OS.indent(Indentation) << "0\n"; - - Indentation -= 2; - - OS.indent(Indentation) << "};\n\n"; -} - -void FixedLenDecoderEmitter:: -emitPredicateFunction(formatted_raw_ostream &OS, PredicateSet &Predicates, - unsigned Indentation) const { - // The predicate function is just a big switch statement based on the - // input predicate index. - OS.indent(Indentation) << "static bool checkDecoderPredicate(unsigned Idx, " - << "const FeatureBitset &Bits) {\n"; - Indentation += 2; - if (!Predicates.empty()) { - OS.indent(Indentation) << "switch (Idx) {\n"; - OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; - unsigned Index = 0; - for (const auto &Predicate : Predicates) { - OS.indent(Indentation) << "case " << Index++ << ":\n"; - OS.indent(Indentation+2) << "return (" << Predicate << ");\n"; - } - OS.indent(Indentation) << "}\n"; - } else { - // No case statement to emit - OS.indent(Indentation) << "llvm_unreachable(\"Invalid index!\");\n"; - } - Indentation -= 2; - OS.indent(Indentation) << "}\n\n"; -} - -void FixedLenDecoderEmitter:: -emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders, - unsigned Indentation) const { - // The decoder function is just a big switch statement based on the - // input decoder index. - OS.indent(Indentation) << "template \n"; - OS.indent(Indentation) << "static DecodeStatus decodeToMCInst(DecodeStatus S," - << " unsigned Idx, InsnType insn, MCInst &MI,\n"; - OS.indent(Indentation) << " uint64_t " - << "Address, const void *Decoder, bool &DecodeComplete) {\n"; - Indentation += 2; - OS.indent(Indentation) << "DecodeComplete = true;\n"; - // TODO: When InsnType is large, using uint64_t limits all fields to 64 bits - // It would be better for emitBinaryParser to use a 64-bit tmp whenever - // possible but fall back to an InsnType-sized tmp for truly large fields. 
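-  // (Illustrative: the conditional emitted below tests
-  // std::is_integral<InsnType>, so InsnType = uint32_t gives
-  // TmpType = uint32_t, while an APInt-like, non-integral InsnType falls back
-  // to TmpType = uint64_t, which is the limitation the TODO above refers to.)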
- OS.indent(Indentation) << "using TmpType = " - "std::conditional_t::" - "value, InsnType, uint64_t>;\n"; - OS.indent(Indentation) << "TmpType tmp;\n"; - OS.indent(Indentation) << "switch (Idx) {\n"; - OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; - unsigned Index = 0; - for (const auto &Decoder : Decoders) { - OS.indent(Indentation) << "case " << Index++ << ":\n"; - OS << Decoder; - OS.indent(Indentation+2) << "return S;\n"; - } - OS.indent(Indentation) << "}\n"; - Indentation -= 2; - OS.indent(Indentation) << "}\n\n"; -} - -// Populates the field of the insn given the start position and the number of -// consecutive bits to scan for. -// -// Returns false if and on the first uninitialized bit value encountered. -// Returns true, otherwise. -bool FilterChooser::fieldFromInsn(uint64_t &Field, insn_t &Insn, - unsigned StartBit, unsigned NumBits) const { - Field = 0; - - for (unsigned i = 0; i < NumBits; ++i) { - if (Insn[StartBit + i] == BIT_UNSET) - return false; - - if (Insn[StartBit + i] == BIT_TRUE) - Field = Field | (1ULL << i); - } - - return true; -} - -/// dumpFilterArray - dumpFilterArray prints out debugging info for the given -/// filter array as a series of chars. -void FilterChooser::dumpFilterArray(raw_ostream &o, - const std::vector &filter) const { - for (unsigned bitIndex = BitWidth; bitIndex > 0; bitIndex--) { - switch (filter[bitIndex - 1]) { - case BIT_UNFILTERED: - o << "."; - break; - case BIT_UNSET: - o << "_"; - break; - case BIT_TRUE: - o << "1"; - break; - case BIT_FALSE: - o << "0"; - break; - } - } -} - -/// dumpStack - dumpStack traverses the filter chooser chain and calls -/// dumpFilterArray on each filter chooser up to the top level one. -void FilterChooser::dumpStack(raw_ostream &o, const char *prefix) const { - const FilterChooser *current = this; - - while (current) { - o << prefix; - dumpFilterArray(o, current->FilterBitValues); - o << '\n'; - current = current->Parent; - } -} - -// Calculates the island(s) needed to decode the instruction. -// This returns a list of undecoded bits of an instructions, for example, -// Inst{20} = 1 && Inst{3-0} == 0b1111 represents two islands of yet-to-be -// decoded bits in order to verify that the instruction matches the Opcode. -unsigned FilterChooser::getIslands(std::vector &StartBits, - std::vector &EndBits, - std::vector &FieldVals, - const insn_t &Insn) const { - unsigned Num, BitNo; - Num = BitNo = 0; - - uint64_t FieldVal = 0; - - // 0: Init - // 1: Water (the bit value does not affect decoding) - // 2: Island (well-known bit value needed for decoding) - int State = 0; - - for (unsigned i = 0; i < BitWidth; ++i) { - int64_t Val = Value(Insn[i]); - bool Filtered = PositionFiltered(i); - switch (State) { - default: llvm_unreachable("Unreachable code!"); - case 0: - case 1: - if (Filtered || Val == -1) - State = 1; // Still in Water - else { - State = 2; // Into the Island - BitNo = 0; - StartBits.push_back(i); - FieldVal = Val; - } - break; - case 2: - if (Filtered || Val == -1) { - State = 1; // Into the Water - EndBits.push_back(i - 1); - FieldVals.push_back(FieldVal); - ++Num; - } else { - State = 2; // Still in Island - ++BitNo; - FieldVal = FieldVal | Val << BitNo; - } - break; - } - } - // If we are still in Island after the loop, do some housekeeping. 
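-  // (Illustrative example of the whole scan: with nothing filtered yet and
-  // Insn = {1, 1, ?, ?, 0, 1}, bit 0 first, two islands are found:
-  // bits 1-0 with value 0b11 and bits 5-4 with value 0b10, so Num == 2.)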
- if (State == 2) { - EndBits.push_back(BitWidth - 1); - FieldVals.push_back(FieldVal); - ++Num; - } - - assert(StartBits.size() == Num && EndBits.size() == Num && - FieldVals.size() == Num); - return Num; -} - -void FilterChooser::emitBinaryParser(raw_ostream &o, unsigned &Indentation, - const OperandInfo &OpInfo, - bool &OpHasCompleteDecoder) const { - const std::string &Decoder = OpInfo.Decoder; - - bool UseInsertBits = OpInfo.numFields() != 1 || OpInfo.InitValue != 0; - - if (UseInsertBits) { - o.indent(Indentation) << "tmp = 0x"; - o.write_hex(OpInfo.InitValue); - o << ";\n"; - } - - for (const EncodingField &EF : OpInfo) { - o.indent(Indentation); - if (UseInsertBits) - o << "insertBits(tmp, "; - else - o << "tmp = "; - o << "fieldFromInstruction(insn, " << EF.Base << ", " << EF.Width << ')'; - if (UseInsertBits) - o << ", " << EF.Offset << ", " << EF.Width << ')'; - else if (EF.Offset != 0) - o << " << " << EF.Offset; - o << ";\n"; - } - - if (Decoder != "") { - OpHasCompleteDecoder = OpInfo.HasCompleteDecoder; - o.indent(Indentation) << Emitter->GuardPrefix << Decoder - << "(MI, tmp, Address, Decoder)" - << Emitter->GuardPostfix - << " { " << (OpHasCompleteDecoder ? "" : "DecodeComplete = false; ") - << "return MCDisassembler::Fail; }\n"; - } else { - OpHasCompleteDecoder = true; - o.indent(Indentation) << "MI.addOperand(MCOperand::createImm(tmp));\n"; - } -} - -void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation, - unsigned Opc, bool &HasCompleteDecoder) const { - HasCompleteDecoder = true; - - for (const auto &Op : Operands.find(Opc)->second) { - // If a custom instruction decoder was specified, use that. - if (Op.numFields() == 0 && !Op.Decoder.empty()) { - HasCompleteDecoder = Op.HasCompleteDecoder; - OS.indent(Indentation) << Emitter->GuardPrefix << Op.Decoder - << "(MI, insn, Address, Decoder)" - << Emitter->GuardPostfix - << " { " << (HasCompleteDecoder ? "" : "DecodeComplete = false; ") - << "return MCDisassembler::Fail; }\n"; - break; - } - - bool OpHasCompleteDecoder; - emitBinaryParser(OS, Indentation, Op, OpHasCompleteDecoder); - if (!OpHasCompleteDecoder) - HasCompleteDecoder = false; - } -} - -unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, - unsigned Opc, - bool &HasCompleteDecoder) const { - // Build up the predicate string. - SmallString<256> Decoder; - // FIXME: emitDecoder() function can take a buffer directly rather than - // a stream. - raw_svector_ostream S(Decoder); - unsigned I = 4; - emitDecoder(S, I, Opc, HasCompleteDecoder); - - // Using the full decoder string as the key value here is a bit - // heavyweight, but is effective. If the string comparisons become a - // performance concern, we can implement a mangling of the predicate - // data easily enough with a map back to the actual string. That's - // overkill for now, though. - - // Make sure the predicate is in the table. - Decoders.insert(CachedHashString(Decoder)); - // Now figure out the index for when we write out the table. 
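-  // (Note, derived from the container type above: Decoders is a
-  // SmallSetVector, so re-inserting an identical decoder body is a no-op and
-  // opcodes with the same field layout share one case in the generated
-  // decodeToMCInst() switch.)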
- DecoderSet::const_iterator P = find(Decoders, Decoder.str()); - return (unsigned)(P - Decoders.begin()); -} - -bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation, - unsigned Opc) const { - ListInit *Predicates = - AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates"); - bool IsFirstEmission = true; - for (unsigned i = 0; i < Predicates->size(); ++i) { - Record *Pred = Predicates->getElementAsRecord(i); - if (!Pred->getValue("AssemblerMatcherPredicate")) - continue; - - if (!isa(Pred->getValue("AssemblerCondDag")->getValue())) - continue; - - const DagInit *D = Pred->getValueAsDag("AssemblerCondDag"); - std::string CombineType = D->getOperator()->getAsString(); - if (CombineType != "any_of" && CombineType != "all_of") - PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); - if (D->getNumArgs() == 0) - PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); - bool IsOr = CombineType == "any_of"; - - if (!IsFirstEmission) - o << " && "; - - if (IsOr) - o << "("; - - ListSeparator LS(IsOr ? " || " : " && "); - for (auto *Arg : D->getArgs()) { - o << LS; - if (auto *NotArg = dyn_cast(Arg)) { - if (NotArg->getOperator()->getAsString() != "not" || - NotArg->getNumArgs() != 1) - PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); - Arg = NotArg->getArg(0); - o << "!"; - } - if (!isa(Arg) || - !cast(Arg)->getDef()->isSubClassOf("SubtargetFeature")) - PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); - o << "Bits[" << Emitter->PredicateNamespace << "::" << Arg->getAsString() - << "]"; - } - - if (IsOr) - o << ")"; - - IsFirstEmission = false; - } - return !Predicates->empty(); -} - -bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const { - ListInit *Predicates = - AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates"); - for (unsigned i = 0; i < Predicates->size(); ++i) { - Record *Pred = Predicates->getElementAsRecord(i); - if (!Pred->getValue("AssemblerMatcherPredicate")) - continue; - - if (isa(Pred->getValue("AssemblerCondDag")->getValue())) - return true; - } - return false; -} - -unsigned FilterChooser::getPredicateIndex(DecoderTableInfo &TableInfo, - StringRef Predicate) const { - // Using the full predicate string as the key value here is a bit - // heavyweight, but is effective. If the string comparisons become a - // performance concern, we can implement a mangling of the predicate - // data easily enough with a map back to the actual string. That's - // overkill for now, though. - - // Make sure the predicate is in the table. - TableInfo.Predicates.insert(CachedHashString(Predicate)); - // Now figure out the index for when we write out the table. - PredicateSet::const_iterator P = find(TableInfo.Predicates, Predicate); - return (unsigned)(P - TableInfo.Predicates.begin()); -} - -void FilterChooser::emitPredicateTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const { - if (!doesOpcodeNeedPredicate(Opc)) - return; - - // Build up the predicate string. - SmallString<256> Predicate; - // FIXME: emitPredicateMatch() functions can take a buffer directly rather - // than a stream. - raw_svector_ostream PS(Predicate); - unsigned I = 0; - emitPredicateMatch(PS, I, Opc); - - // Figure out the index into the predicate table for the predicate just - // computed. 
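-  // (Illustrative: the index is then ULEB128-encoded below; e.g. the value
-  // 300 is emitted as the two bytes {0xAC, 0x02}, i.e. (300 & 0x7f) | 0x80
-  // followed by 300 >> 7.)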
- unsigned PIdx = getPredicateIndex(TableInfo, PS.str()); - SmallString<16> PBytes; - raw_svector_ostream S(PBytes); - encodeULEB128(PIdx, S); - - TableInfo.Table.push_back(MCD::OPC_CheckPredicate); - // Predicate index - for (unsigned i = 0, e = PBytes.size(); i != e; ++i) - TableInfo.Table.push_back(PBytes[i]); - // Push location for NumToSkip backpatching. - TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); -} - -void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const { - BitsInit *SFBits = - AllInstructions[Opc].EncodingDef->getValueAsBitsInit("SoftFail"); - if (!SFBits) return; - BitsInit *InstBits = - AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst"); - - APInt PositiveMask(BitWidth, 0ULL); - APInt NegativeMask(BitWidth, 0ULL); - for (unsigned i = 0; i < BitWidth; ++i) { - bit_value_t B = bitFromBits(*SFBits, i); - bit_value_t IB = bitFromBits(*InstBits, i); - - if (B != BIT_TRUE) continue; - - switch (IB) { - case BIT_FALSE: - // The bit is meant to be false, so emit a check to see if it is true. - PositiveMask.setBit(i); - break; - case BIT_TRUE: - // The bit is meant to be true, so emit a check to see if it is false. - NegativeMask.setBit(i); - break; - default: - // The bit is not set; this must be an error! - errs() << "SoftFail Conflict: bit SoftFail{" << i << "} in " - << AllInstructions[Opc] << " is set but Inst{" << i - << "} is unset!\n" - << " - You can only mark a bit as SoftFail if it is fully defined" - << " (1/0 - not '?') in Inst\n"; - return; - } - } - - bool NeedPositiveMask = PositiveMask.getBoolValue(); - bool NeedNegativeMask = NegativeMask.getBoolValue(); - - if (!NeedPositiveMask && !NeedNegativeMask) - return; - - TableInfo.Table.push_back(MCD::OPC_SoftFail); - - SmallString<16> MaskBytes; - raw_svector_ostream S(MaskBytes); - if (NeedPositiveMask) { - encodeULEB128(PositiveMask.getZExtValue(), S); - for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) - TableInfo.Table.push_back(MaskBytes[i]); - } else - TableInfo.Table.push_back(0); - if (NeedNegativeMask) { - MaskBytes.clear(); - encodeULEB128(NegativeMask.getZExtValue(), S); - for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) - TableInfo.Table.push_back(MaskBytes[i]); - } else - TableInfo.Table.push_back(0); -} - -// Emits table entries to decode the singleton. -void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, - EncodingIDAndOpcode Opc) const { - std::vector StartBits; - std::vector EndBits; - std::vector FieldVals; - insn_t Insn; - insnWithID(Insn, Opc.EncodingID); - - // Look for islands of undecoded bits of the singleton. - getIslands(StartBits, EndBits, FieldVals, Insn); - - unsigned Size = StartBits.size(); - - // Emit the predicate table entry if one is needed. - emitPredicateTableEntry(TableInfo, Opc.EncodingID); - - // Check any additional encoding fields needed. - for (unsigned I = Size; I != 0; --I) { - unsigned NumBits = EndBits[I-1] - StartBits[I-1] + 1; - TableInfo.Table.push_back(MCD::OPC_CheckField); - TableInfo.Table.push_back(StartBits[I-1]); - TableInfo.Table.push_back(NumBits); - uint8_t Buffer[16], *p; - encodeULEB128(FieldVals[I-1], Buffer); - for (p = Buffer; *p >= 128 ; ++p) - TableInfo.Table.push_back(*p); - TableInfo.Table.push_back(*p); - // Push location for NumToSkip backpatching. 
- TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); - // The fixup is always 24-bits, so go ahead and allocate the space - // in the table so all our relative position calculations work OK even - // before we fully resolve the real value here. - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); - } - - // Check for soft failure of the match. - emitSoftFailTableEntry(TableInfo, Opc.EncodingID); - - bool HasCompleteDecoder; - unsigned DIdx = - getDecoderIndex(TableInfo.Decoders, Opc.EncodingID, HasCompleteDecoder); - - // Produce OPC_Decode or OPC_TryDecode opcode based on the information - // whether the instruction decoder is complete or not. If it is complete - // then it handles all possible values of remaining variable/unfiltered bits - // and for any value can determine if the bitpattern is a valid instruction - // or not. This means OPC_Decode will be the final step in the decoding - // process. If it is not complete, then the Fail return code from the - // decoder method indicates that additional processing should be done to see - // if there is any other instruction that also matches the bitpattern and - // can decode it. - TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode : - MCD::OPC_TryDecode); - NumEncodingsSupported++; - uint8_t Buffer[16], *p; - encodeULEB128(Opc.Opcode, Buffer); - for (p = Buffer; *p >= 128 ; ++p) - TableInfo.Table.push_back(*p); - TableInfo.Table.push_back(*p); - - SmallString<16> Bytes; - raw_svector_ostream S(Bytes); - encodeULEB128(DIdx, S); - - // Decoder index - for (unsigned i = 0, e = Bytes.size(); i != e; ++i) - TableInfo.Table.push_back(Bytes[i]); - - if (!HasCompleteDecoder) { - // Push location for NumToSkip backpatching. - TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); - // Allocate the space for the fixup. - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); - TableInfo.Table.push_back(0); - } -} - -// Emits table entries to decode the singleton, and then to decode the rest. -void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, - const Filter &Best) const { - EncodingIDAndOpcode Opc = Best.getSingletonOpc(); - - // complex singletons need predicate checks from the first singleton - // to refer forward to the variable filterchooser that follows. - TableInfo.FixupStack.emplace_back(); - - emitSingletonTableEntry(TableInfo, Opc); - - resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(), - TableInfo.Table.size()); - TableInfo.FixupStack.pop_back(); - - Best.getVariableFC().emitTableEntries(TableInfo); -} - -// Assign a single filter and run with it. Top level API client can initialize -// with a single filter to start the filtering process. -void FilterChooser::runSingleFilter(unsigned startBit, unsigned numBit, - bool mixed) { - Filters.clear(); - Filters.emplace_back(*this, startBit, numBit, true); - BestIndex = 0; // Sole Filter instance to choose from. - bestFilter().recurse(); -} - -// reportRegion is a helper function for filterProcessor to mark a region as -// eligible for use as a filter region. 
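-// (Illustrative summary of the logic below: on the exact pass
-// (AllowMixed == false) an ATTR_ALL_SET run of well-known bits becomes a
-// Filter with mixed == false; on the mixed pass (AllowMixed == true) only
-// ATTR_MIXED runs, containing both known and '?' bits, are turned into
-// filters.)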
-void FilterChooser::reportRegion(bitAttr_t RA, unsigned StartBit, - unsigned BitIndex, bool AllowMixed) { - if (RA == ATTR_MIXED && AllowMixed) - Filters.emplace_back(*this, StartBit, BitIndex - StartBit, true); - else if (RA == ATTR_ALL_SET && !AllowMixed) - Filters.emplace_back(*this, StartBit, BitIndex - StartBit, false); -} - -// FilterProcessor scans the well-known encoding bits of the instructions and -// builds up a list of candidate filters. It chooses the best filter and -// recursively descends down the decoding tree. -bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) { - Filters.clear(); - BestIndex = -1; - unsigned numInstructions = Opcodes.size(); - - assert(numInstructions && "Filter created with no instructions"); - - // No further filtering is necessary. - if (numInstructions == 1) - return true; - - // Heuristics. See also doFilter()'s "Heuristics" comment when num of - // instructions is 3. - if (AllowMixed && !Greedy) { - assert(numInstructions == 3); - - for (auto Opcode : Opcodes) { - std::vector StartBits; - std::vector EndBits; - std::vector FieldVals; - insn_t Insn; - - insnWithID(Insn, Opcode.EncodingID); - - // Look for islands of undecoded bits of any instruction. - if (getIslands(StartBits, EndBits, FieldVals, Insn) > 0) { - // Found an instruction with island(s). Now just assign a filter. - runSingleFilter(StartBits[0], EndBits[0] - StartBits[0] + 1, true); - return true; - } - } - } - - unsigned BitIndex; - - // We maintain BIT_WIDTH copies of the bitAttrs automaton. - // The automaton consumes the corresponding bit from each - // instruction. - // - // Input symbols: 0, 1, and _ (unset). - // States: NONE, FILTERED, ALL_SET, ALL_UNSET, and MIXED. - // Initial state: NONE. - // - // (NONE) ------- [01] -> (ALL_SET) - // (NONE) ------- _ ----> (ALL_UNSET) - // (ALL_SET) ---- [01] -> (ALL_SET) - // (ALL_SET) ---- _ ----> (MIXED) - // (ALL_UNSET) -- [01] -> (MIXED) - // (ALL_UNSET) -- _ ----> (ALL_UNSET) - // (MIXED) ------ . ----> (MIXED) - // (FILTERED)---- . ----> (FILTERED) - - std::vector bitAttrs; - - // FILTERED bit positions provide no entropy and are not worthy of pursuing. - // Filter::recurse() set either BIT_TRUE or BIT_FALSE for each position. - for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) - if (FilterBitValues[BitIndex] == BIT_TRUE || - FilterBitValues[BitIndex] == BIT_FALSE) - bitAttrs.push_back(ATTR_FILTERED); - else - bitAttrs.push_back(ATTR_NONE); - - for (unsigned InsnIndex = 0; InsnIndex < numInstructions; ++InsnIndex) { - insn_t insn; - - insnWithID(insn, Opcodes[InsnIndex].EncodingID); - - for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) { - switch (bitAttrs[BitIndex]) { - case ATTR_NONE: - if (insn[BitIndex] == BIT_UNSET) - bitAttrs[BitIndex] = ATTR_ALL_UNSET; - else - bitAttrs[BitIndex] = ATTR_ALL_SET; - break; - case ATTR_ALL_SET: - if (insn[BitIndex] == BIT_UNSET) - bitAttrs[BitIndex] = ATTR_MIXED; - break; - case ATTR_ALL_UNSET: - if (insn[BitIndex] != BIT_UNSET) - bitAttrs[BitIndex] = ATTR_MIXED; - break; - case ATTR_MIXED: - case ATTR_FILTERED: - break; - } - } - } - - // The regionAttr automaton consumes the bitAttrs automatons' state, - // lowest-to-highest. 
- // - // Input symbols: F(iltered), (all_)S(et), (all_)U(nset), M(ixed) - // States: NONE, ALL_SET, MIXED - // Initial state: NONE - // - // (NONE) ----- F --> (NONE) - // (NONE) ----- S --> (ALL_SET) ; and set region start - // (NONE) ----- U --> (NONE) - // (NONE) ----- M --> (MIXED) ; and set region start - // (ALL_SET) -- F --> (NONE) ; and report an ALL_SET region - // (ALL_SET) -- S --> (ALL_SET) - // (ALL_SET) -- U --> (NONE) ; and report an ALL_SET region - // (ALL_SET) -- M --> (MIXED) ; and report an ALL_SET region - // (MIXED) ---- F --> (NONE) ; and report a MIXED region - // (MIXED) ---- S --> (ALL_SET) ; and report a MIXED region - // (MIXED) ---- U --> (NONE) ; and report a MIXED region - // (MIXED) ---- M --> (MIXED) - - bitAttr_t RA = ATTR_NONE; - unsigned StartBit = 0; - - for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) { - bitAttr_t bitAttr = bitAttrs[BitIndex]; - - assert(bitAttr != ATTR_NONE && "Bit without attributes"); - - switch (RA) { - case ATTR_NONE: - switch (bitAttr) { - case ATTR_FILTERED: - break; - case ATTR_ALL_SET: - StartBit = BitIndex; - RA = ATTR_ALL_SET; - break; - case ATTR_ALL_UNSET: - break; - case ATTR_MIXED: - StartBit = BitIndex; - RA = ATTR_MIXED; - break; - default: - llvm_unreachable("Unexpected bitAttr!"); - } - break; - case ATTR_ALL_SET: - switch (bitAttr) { - case ATTR_FILTERED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - RA = ATTR_NONE; - break; - case ATTR_ALL_SET: - break; - case ATTR_ALL_UNSET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - RA = ATTR_NONE; - break; - case ATTR_MIXED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - StartBit = BitIndex; - RA = ATTR_MIXED; - break; - default: - llvm_unreachable("Unexpected bitAttr!"); - } - break; - case ATTR_MIXED: - switch (bitAttr) { - case ATTR_FILTERED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - StartBit = BitIndex; - RA = ATTR_NONE; - break; - case ATTR_ALL_SET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - StartBit = BitIndex; - RA = ATTR_ALL_SET; - break; - case ATTR_ALL_UNSET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - RA = ATTR_NONE; - break; - case ATTR_MIXED: - break; - default: - llvm_unreachable("Unexpected bitAttr!"); - } - break; - case ATTR_ALL_UNSET: - llvm_unreachable("regionAttr state machine has no ATTR_UNSET state"); - case ATTR_FILTERED: - llvm_unreachable("regionAttr state machine has no ATTR_FILTERED state"); - } - } - - // At the end, if we're still in ALL_SET or MIXED states, report a region - switch (RA) { - case ATTR_NONE: - break; - case ATTR_FILTERED: - break; - case ATTR_ALL_SET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - break; - case ATTR_ALL_UNSET: - break; - case ATTR_MIXED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); - break; - } - - // We have finished with the filter processings. Now it's time to choose - // the best performing filter. - BestIndex = 0; - bool AllUseless = true; - unsigned BestScore = 0; - - for (unsigned i = 0, e = Filters.size(); i != e; ++i) { - unsigned Usefulness = Filters[i].usefulness(); - - if (Usefulness) - AllUseless = false; - - if (Usefulness > BestScore) { - BestIndex = i; - BestScore = Usefulness; - } - } - - if (!AllUseless) - bestFilter().recurse(); - - return !AllUseless; -} // end of FilterChooser::filterProcessor(bool) - -// Decides on the best configuration of filter(s) to use in order to decode -// the instructions. A conflict of instructions may occur, in which case we -// dump the conflict set to the standard error. 
-void FilterChooser::doFilter() {
-  unsigned Num = Opcodes.size();
-  assert(Num && "FilterChooser created with no instructions");
-
-  // Try regions of consecutive known bit values first.
-  if (filterProcessor(false))
-    return;
-
-  // Then regions of mixed bits (both known and uninitialized bit values
-  // allowed).
-  if (filterProcessor(true))
-    return;
-
-  // Heuristics to cope with conflict set {t2CMPrs, t2SUBSrr, t2SUBSrs} where
-  // no single instruction for the maximum ATTR_MIXED region Inst{14-4} has a
-  // well-known encoding pattern. In such case, we backtrack and scan for the
-  // very first consecutive ATTR_ALL_SET region and assign a filter to it.
-  if (Num == 3 && filterProcessor(true, false))
-    return;
-
-  // If we come to here, the instruction decoding has failed.
-  // Set the BestIndex to -1 to indicate so.
-  BestIndex = -1;
-}
-
-// emitTableEntries - Emit state machine entries to decode our share of
-// instructions.
-void FilterChooser::emitTableEntries(DecoderTableInfo &TableInfo) const {
-  if (Opcodes.size() == 1) {
-    // There is only one instruction in the set, which is great!
-    // Call emitSingletonDecoder() to see whether there are any remaining
-    // encoding bits.
-    emitSingletonTableEntry(TableInfo, Opcodes[0]);
-    return;
-  }
-
-  // Choose the best filter to do the decodings!
-  if (BestIndex != -1) {
-    const Filter &Best = Filters[BestIndex];
-    if (Best.getNumFiltered() == 1)
-      emitSingletonTableEntry(TableInfo, Best);
-    else
-      Best.emitTableEntry(TableInfo);
-    return;
-  }
-
-  // We don't know how to decode these instructions! Dump the
-  // conflict set and bail.
-
-  // Print out useful conflict information for postmortem analysis.
-  errs() << "Decoding Conflict:\n";
-
-  dumpStack(errs(), "\t\t");
-
-  for (auto Opcode : Opcodes) {
-    errs() << '\t';
-    emitNameWithID(errs(), Opcode.EncodingID);
-    errs() << " ";
-    dumpBits(
-        errs(),
-        getBitsField(*AllInstructions[Opcode.EncodingID].EncodingDef, "Inst"));
-    errs() << '\n';
-  }
-}
-
-static std::string findOperandDecoderMethod(TypedInit *TI) {
-  std::string Decoder;
-
-  Record *Record = cast<DefInit>(TI)->getDef();
-
-  RecordVal *DecoderString = Record->getValue("DecoderMethod");
-  StringInit *String = DecoderString ?
-      dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
-  if (String) {
-    Decoder = std::string(String->getValue());
-    if (!Decoder.empty())
-      return Decoder;
-  }
-
-  if (Record->isSubClassOf("RegisterOperand"))
-    Record = Record->getValueAsDef("RegClass");
-
-  if (Record->isSubClassOf("RegisterClass")) {
-    Decoder = "Decode" + Record->getName().str() + "RegisterClass";
-  } else if (Record->isSubClassOf("PointerLikeRegClass")) {
-    Decoder = "DecodePointerLikeRegClass" +
-              utostr(Record->getValueAsInt("RegClassKind"));
-  }
-
-  return Decoder;
-}
-
-static bool
-populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
-                    const CodeGenInstruction &CGI, unsigned Opc,
-                    std::map<unsigned, std::vector<OperandInfo>> &Operands) {
-  const Record &Def = *CGI.TheDef;
-  // If all the bit positions are not specified, do not decode this instruction.
-  // We are bound to fail! For proper disassembly, the well-known encoding bits
-  // of the instruction must be fully specified.
-
-  BitsInit &Bits = getBitsField(EncodingDef, "Inst");
-  if (Bits.allInComplete()) return false;
-
-  std::vector<OperandInfo> InsnOperands;
-
-  // If the instruction has specified a custom decoding hook, use that instead
-  // of trying to auto-generate the decoder.
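-  // (Illustrative .td usage; the decoder name is hypothetical:
-  //   let DecoderMethod = "DecodeMyInst";
-  //   let hasCompleteDecoder = 0;
-  // routes the entire instruction through DecodeMyInst() instead of the
-  // auto-generated per-operand extraction below.)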
- StringRef InstDecoder = EncodingDef.getValueAsString("DecoderMethod"); - if (InstDecoder != "") { - bool HasCompleteInstDecoder = EncodingDef.getValueAsBit("hasCompleteDecoder"); - InsnOperands.push_back( - OperandInfo(std::string(InstDecoder), HasCompleteInstDecoder)); - Operands[Opc] = InsnOperands; - return true; - } - - // Generate a description of the operand of the instruction that we know - // how to decode automatically. - // FIXME: We'll need to have a way to manually override this as needed. - - // Gather the outputs/inputs of the instruction, so we can find their - // positions in the encoding. This assumes for now that they appear in the - // MCInst in the order that they're listed. - std::vector> InOutOperands; - DagInit *Out = Def.getValueAsDag("OutOperandList"); - DagInit *In = Def.getValueAsDag("InOperandList"); - for (unsigned i = 0; i < Out->getNumArgs(); ++i) - InOutOperands.push_back(std::make_pair(Out->getArg(i), - Out->getArgNameStr(i))); - for (unsigned i = 0; i < In->getNumArgs(); ++i) - InOutOperands.push_back(std::make_pair(In->getArg(i), - In->getArgNameStr(i))); - - // Search for tied operands, so that we can correctly instantiate - // operands that are not explicitly represented in the encoding. - std::map TiedNames; - for (unsigned i = 0; i < CGI.Operands.size(); ++i) { - int tiedTo = CGI.Operands[i].getTiedRegister(); - if (tiedTo != -1) { - std::pair SO = - CGI.Operands.getSubOperandNumber(tiedTo); - TiedNames[std::string(InOutOperands[i].second)] = - std::string(InOutOperands[SO.first].second); - TiedNames[std::string(InOutOperands[SO.first].second)] = - std::string(InOutOperands[i].second); - } - } - - std::map> NumberedInsnOperands; - std::set NumberedInsnOperandsNoTie; - if (Target.getInstructionSet()-> - getValueAsBit("decodePositionallyEncodedOperands")) { - const std::vector &Vals = Def.getValues(); - unsigned NumberedOp = 0; - - std::set NamedOpIndices; - if (Target.getInstructionSet()-> - getValueAsBit("noNamedPositionallyEncodedOperands")) - // Collect the set of operand indices that might correspond to named - // operand, and skip these when assigning operands based on position. - for (unsigned i = 0, e = Vals.size(); i != e; ++i) { - unsigned OpIdx; - if (!CGI.Operands.hasOperandNamed(Vals[i].getName(), OpIdx)) - continue; - - NamedOpIndices.insert(OpIdx); - } - - for (unsigned i = 0, e = Vals.size(); i != e; ++i) { - // Ignore fixed fields in the record, we're looking for values like: - // bits<5> RST = { ?, ?, ?, ?, ? }; - if (Vals[i].isNonconcreteOK() || Vals[i].getValue()->isComplete()) - continue; - - // Determine if Vals[i] actually contributes to the Inst encoding. - unsigned bi = 0; - for (; bi < Bits.getNumBits(); ++bi) { - VarInit *Var = nullptr; - VarBitInit *BI = dyn_cast(Bits.getBit(bi)); - if (BI) - Var = dyn_cast(BI->getBitVar()); - else - Var = dyn_cast(Bits.getBit(bi)); - - if (Var && Var->getName() == Vals[i].getName()) - break; - } - - if (bi == Bits.getNumBits()) - continue; - - // Skip variables that correspond to explicitly-named operands. 
- unsigned OpIdx; - if (CGI.Operands.hasOperandNamed(Vals[i].getName(), OpIdx)) - continue; - - // Get the bit range for this operand: - unsigned bitStart = bi++, bitWidth = 1; - for (; bi < Bits.getNumBits(); ++bi) { - VarInit *Var = nullptr; - VarBitInit *BI = dyn_cast(Bits.getBit(bi)); - if (BI) - Var = dyn_cast(BI->getBitVar()); - else - Var = dyn_cast(Bits.getBit(bi)); - - if (!Var) - break; - - if (Var->getName() != Vals[i].getName()) - break; - - ++bitWidth; - } - - unsigned NumberOps = CGI.Operands.size(); - while (NumberedOp < NumberOps && - (CGI.Operands.isFlatOperandNotEmitted(NumberedOp) || - (!NamedOpIndices.empty() && NamedOpIndices.count( - CGI.Operands.getSubOperandNumber(NumberedOp).first)))) - ++NumberedOp; - - OpIdx = NumberedOp++; - - // OpIdx now holds the ordered operand number of Vals[i]. - std::pair SO = - CGI.Operands.getSubOperandNumber(OpIdx); - const std::string &Name = CGI.Operands[SO.first].Name; - - LLVM_DEBUG(dbgs() << "Numbered operand mapping for " << Def.getName() - << ": " << Name << "(" << SO.first << ", " << SO.second - << ") => " << Vals[i].getName() << "\n"); - - std::string Decoder; - Record *TypeRecord = CGI.Operands[SO.first].Rec; - - RecordVal *DecoderString = TypeRecord->getValue("DecoderMethod"); - StringInit *String = DecoderString ? - dyn_cast(DecoderString->getValue()) : nullptr; - if (String && String->getValue() != "") - Decoder = std::string(String->getValue()); - - if (Decoder == "" && - CGI.Operands[SO.first].MIOperandInfo && - CGI.Operands[SO.first].MIOperandInfo->getNumArgs()) { - Init *Arg = CGI.Operands[SO.first].MIOperandInfo-> - getArg(SO.second); - if (DefInit *DI = cast(Arg)) - TypeRecord = DI->getDef(); - } - - bool isReg = false; - if (TypeRecord->isSubClassOf("RegisterOperand")) - TypeRecord = TypeRecord->getValueAsDef("RegClass"); - if (TypeRecord->isSubClassOf("RegisterClass")) { - Decoder = "Decode" + TypeRecord->getName().str() + "RegisterClass"; - isReg = true; - } else if (TypeRecord->isSubClassOf("PointerLikeRegClass")) { - Decoder = "DecodePointerLikeRegClass" + - utostr(TypeRecord->getValueAsInt("RegClassKind")); - isReg = true; - } - - DecoderString = TypeRecord->getValue("DecoderMethod"); - String = DecoderString ? - dyn_cast(DecoderString->getValue()) : nullptr; - if (!isReg && String && String->getValue() != "") - Decoder = std::string(String->getValue()); - - RecordVal *HasCompleteDecoderVal = - TypeRecord->getValue("hasCompleteDecoder"); - BitInit *HasCompleteDecoderBit = HasCompleteDecoderVal ? - dyn_cast(HasCompleteDecoderVal->getValue()) : nullptr; - bool HasCompleteDecoder = HasCompleteDecoderBit ? - HasCompleteDecoderBit->getValue() : true; - - OperandInfo OpInfo(Decoder, HasCompleteDecoder); - OpInfo.addField(bitStart, bitWidth, 0); - - NumberedInsnOperands[Name].push_back(OpInfo); - - // FIXME: For complex operands with custom decoders we can't handle tied - // sub-operands automatically. Skip those here and assume that this is - // fixed up elsewhere. - if (CGI.Operands[SO.first].MIOperandInfo && - CGI.Operands[SO.first].MIOperandInfo->getNumArgs() > 1 && - String && String->getValue() != "") - NumberedInsnOperandsNoTie.insert(Name); - } - } - - // For each operand, see if we can figure out where it is encoded. 
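-  // (Illustrative, with hypothetical record contents: given
-  //   bits<4> Rd;
-  //   let Inst{15-12} = Rd;
-  // the scan below produces a single field (Base = 12, Width = 4, Offset = 0)
-  // for operand "Rd", which emitBinaryParser() then renders as
-  //   tmp = fieldFromInstruction(insn, 12, 4);)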
- for (const auto &Op : InOutOperands) { - if (!NumberedInsnOperands[std::string(Op.second)].empty()) { - llvm::append_range(InsnOperands, - NumberedInsnOperands[std::string(Op.second)]); - continue; - } - if (!NumberedInsnOperands[TiedNames[std::string(Op.second)]].empty()) { - if (!NumberedInsnOperandsNoTie.count(TiedNames[std::string(Op.second)])) { - // Figure out to which (sub)operand we're tied. - unsigned i = - CGI.Operands.getOperandNamed(TiedNames[std::string(Op.second)]); - int tiedTo = CGI.Operands[i].getTiedRegister(); - if (tiedTo == -1) { - i = CGI.Operands.getOperandNamed(Op.second); - tiedTo = CGI.Operands[i].getTiedRegister(); - } - - if (tiedTo != -1) { - std::pair SO = - CGI.Operands.getSubOperandNumber(tiedTo); - - InsnOperands.push_back( - NumberedInsnOperands[TiedNames[std::string(Op.second)]] - [SO.second]); - } - } - continue; - } - - TypedInit *TI = cast(Op.first); - - // At this point, we can locate the decoder field, but we need to know how - // to interpret it. As a first step, require the target to provide - // callbacks for decoding register classes. - std::string Decoder = findOperandDecoderMethod(TI); - Record *TypeRecord = cast(TI)->getDef(); - - RecordVal *HasCompleteDecoderVal = - TypeRecord->getValue("hasCompleteDecoder"); - BitInit *HasCompleteDecoderBit = HasCompleteDecoderVal ? - dyn_cast(HasCompleteDecoderVal->getValue()) : nullptr; - bool HasCompleteDecoder = HasCompleteDecoderBit ? - HasCompleteDecoderBit->getValue() : true; - - OperandInfo OpInfo(Decoder, HasCompleteDecoder); - - // Some bits of the operand may be required to be 1 depending on the - // instruction's encoding. Collect those bits. - if (const RecordVal *EncodedValue = EncodingDef.getValue(Op.second)) - if (const BitsInit *OpBits = dyn_cast(EncodedValue->getValue())) - for (unsigned I = 0; I < OpBits->getNumBits(); ++I) - if (const BitInit *OpBit = dyn_cast(OpBits->getBit(I))) - if (OpBit->getValue()) - OpInfo.InitValue |= 1ULL << I; - - unsigned Base = ~0U; - unsigned Width = 0; - unsigned Offset = 0; - - for (unsigned bi = 0; bi < Bits.getNumBits(); ++bi) { - VarInit *Var = nullptr; - VarBitInit *BI = dyn_cast(Bits.getBit(bi)); - if (BI) - Var = dyn_cast(BI->getBitVar()); - else - Var = dyn_cast(Bits.getBit(bi)); - - if (!Var) { - if (Base != ~0U) { - OpInfo.addField(Base, Width, Offset); - Base = ~0U; - Width = 0; - Offset = 0; - } - continue; - } - - if (Var->getName() != Op.second && - Var->getName() != TiedNames[std::string(Op.second)]) { - if (Base != ~0U) { - OpInfo.addField(Base, Width, Offset); - Base = ~0U; - Width = 0; - Offset = 0; - } - continue; - } - - if (Base == ~0U) { - Base = bi; - Width = 1; - Offset = BI ? BI->getBitNum() : 0; - } else if (BI && BI->getBitNum() != Offset + Width) { - OpInfo.addField(Base, Width, Offset); - Base = bi; - Width = 1; - Offset = BI->getBitNum(); - } else { - ++Width; - } - } - - if (Base != ~0U) - OpInfo.addField(Base, Width, Offset); - - if (OpInfo.numFields() > 0) - InsnOperands.push_back(OpInfo); - } - - Operands[Opc] = InsnOperands; - -#if 0 - LLVM_DEBUG({ - // Dumps the instruction encoding bits. - dumpBits(errs(), Bits); - - errs() << '\n'; - - // Dumps the list of operand info. 
-      for (unsigned i = 0, e = CGI.Operands.size(); i != e; ++i) {
-        const CGIOperandList::OperandInfo &Info = CGI.Operands[i];
-        const std::string &OperandName = Info.Name;
-        const Record &OperandDef = *Info.Rec;
-
-        errs() << "\t" << OperandName << " (" << OperandDef.getName() << ")\n";
-      }
-    });
-#endif
-
-  return true;
-}
-
-// emitFieldFromInstruction - Emit the templated helper function
-// fieldFromInstruction().
-// On Windows we make sure that this function is not inlined when
-// using the VS compiler. It has a bug which causes the function
-// to be optimized out in some circumstances. See llvm.org/pr38292
-static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
-  OS << "// Helper functions for extracting fields from encoded instructions.\n"
-     << "// InsnType must either be integral or an APInt-like object that " "must:\n"
-     << "// * be default-constructible and copy-constructible\n"
-     << "// * be constructible from a uint64_t\n"
-     << "// * be constructible from an APInt (this can be private)\n"
-     << "// * Support insertBits(bits, startBit, numBits)\n"
-     << "// * Support extractBitsAsZExtValue(numBits, startBit)\n"
-     << "// * be convertible to bool\n"
-     << "// * Support the ~, &, ==, and != operators with other objects of " "the same type\n"
-     << "// * Support put (<<) to raw_ostream&\n"
-     << "template <typename InsnType>\n"
-     << "#if defined(_MSC_VER) && !defined(__clang__)\n"
-     << "__declspec(noinline)\n"
-     << "#endif\n"
-     << "static std::enable_if_t<std::is_integral<InsnType>::value, InsnType>\n"
-     << "fieldFromInstruction(const InsnType &insn, unsigned startBit,\n"
-     << "                     unsigned numBits) {\n"
-     << "  assert(startBit + numBits <= 64 && \"Cannot support >64-bit " "extractions!\");\n"
-     << "  assert(startBit + numBits <= (sizeof(InsnType) * 8) &&\n"
-     << "         \"Instruction field out of bounds!\");\n"
-     << "  InsnType fieldMask;\n"
-     << "  if (numBits == sizeof(InsnType) * 8)\n"
-     << "    fieldMask = (InsnType)(-1LL);\n"
-     << "  else\n"
-     << "    fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
-     << "  return (insn & fieldMask) >> startBit;\n"
-     << "}\n"
-     << "\n"
-     << "template <typename InsnType>\n"
-     << "static std::enable_if_t<!std::is_integral<InsnType>::value, " "uint64_t>\n"
-     << "fieldFromInstruction(const InsnType &insn, unsigned startBit,\n"
-     << "                     unsigned numBits) {\n"
-     << "  return insn.extractBitsAsZExtValue(numBits, startBit);\n"
-     << "}\n\n";
-}
-
-// emitInsertBits - Emit the templated helper function insertBits().
-static void emitInsertBits(formatted_raw_ostream &OS) {
-  OS << "// Helper function for inserting bits extracted from an encoded " "instruction into\n"
-     << "// a field.\n"
-     << "template <typename InsnType>\n"
-     << "static std::enable_if_t<std::is_integral<InsnType>::value>\n"
-     << "insertBits(InsnType &field, InsnType bits, unsigned startBit, " "unsigned numBits) {\n"
-     << "  assert(startBit + numBits <= sizeof field * 8);\n"
-     << "  field |= (InsnType)bits << startBit;\n"
-     << "}\n"
-     << "\n"
-     << "template <typename InsnType>\n"
-     << "static std::enable_if_t<!std::is_integral<InsnType>::value>\n"
-     << "insertBits(InsnType &field, uint64_t bits, unsigned startBit, " "unsigned numBits) {\n"
-     << "  field.insertBits(bits, startBit, numBits);\n"
-     << "}\n\n";
-}
-
-// emitDecodeInstruction - Emit the templated helper function
-// decodeInstruction().
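Before decodeInstruction itself, a self-contained illustration of the mask-and-shift logic of the generated fieldFromInstruction helper above, assuming a plain 32-bit integral encoding; the instruction word and field positions are invented for the example:

```cpp
#include <cassert>
#include <cstdint>

// Same semantics as the generated integral overload, fixed to uint32_t.
static uint32_t fieldFromInsn(uint32_t insn, unsigned startBit,
                              unsigned numBits) {
  assert(startBit + numBits <= 32 && "field out of bounds");
  uint32_t mask =
      (numBits == 32) ? ~0u : ((1u << numBits) - 1) << startBit;
  return (insn & mask) >> startBit;
}

int main() {
  // 1101 <src=0110> 1001 <dst=0011>, packed MSB-first into 16 bits.
  uint32_t insn = 0b1101'0110'1001'0011;
  assert(fieldFromInsn(insn, 12, 4) == 0b1101); // fixed opcode bits
  assert(fieldFromInsn(insn, 8, 4) == 0b0110);  // the src operand field
  return 0;
}
```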
-static void emitDecodeInstruction(formatted_raw_ostream &OS) { - OS << "template \n" - << "static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], " - "MCInst &MI,\n" - << " InsnType insn, uint64_t " - "Address,\n" - << " const void *DisAsm,\n" - << " const MCSubtargetInfo &STI) {\n" - << " const FeatureBitset &Bits = STI.getFeatureBits();\n" - << "\n" - << " const uint8_t *Ptr = DecodeTable;\n" - << " InsnType CurFieldValue = 0;\n" - << " DecodeStatus S = MCDisassembler::Success;\n" - << " while (true) {\n" - << " ptrdiff_t Loc = Ptr - DecodeTable;\n" - << " switch (*Ptr) {\n" - << " default:\n" - << " errs() << Loc << \": Unexpected decode table opcode!\\n\";\n" - << " return MCDisassembler::Fail;\n" - << " case MCD::OPC_ExtractField: {\n" - << " unsigned Start = *++Ptr;\n" - << " unsigned Len = *++Ptr;\n" - << " ++Ptr;\n" - << " CurFieldValue = fieldFromInstruction(insn, Start, Len);\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_ExtractField(\" << Start << " - "\", \"\n" - << " << Len << \"): \" << CurFieldValue << \"\\n\");\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_FilterValue: {\n" - << " // Decode the field value.\n" - << " unsigned Len;\n" - << " InsnType Val = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " // NumToSkip is a plain 24-bit integer.\n" - << " unsigned NumToSkip = *Ptr++;\n" - << " NumToSkip |= (*Ptr++) << 8;\n" - << " NumToSkip |= (*Ptr++) << 16;\n" - << "\n" - << " // Perform the filter operation.\n" - << " if (Val != CurFieldValue)\n" - << " Ptr += NumToSkip;\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_FilterValue(\" << Val << " - "\", \" << NumToSkip\n" - << " << \"): \" << ((Val != CurFieldValue) ? \"FAIL:\" " - ": \"PASS:\")\n" - << " << \" continuing at \" << (Ptr - DecodeTable) << " - "\"\\n\");\n" - << "\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_CheckField: {\n" - << " unsigned Start = *++Ptr;\n" - << " unsigned Len = *++Ptr;\n" - << " InsnType FieldValue = fieldFromInstruction(insn, Start, Len);\n" - << " // Decode the field value.\n" - << " InsnType ExpectedValue = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " // NumToSkip is a plain 24-bit integer.\n" - << " unsigned NumToSkip = *Ptr++;\n" - << " NumToSkip |= (*Ptr++) << 8;\n" - << " NumToSkip |= (*Ptr++) << 16;\n" - << "\n" - << " // If the actual and expected values don't match, skip.\n" - << " if (ExpectedValue != FieldValue)\n" - << " Ptr += NumToSkip;\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckField(\" << Start << " - "\", \"\n" - << " << Len << \", \" << ExpectedValue << \", \" << " - "NumToSkip\n" - << " << \"): FieldValue = \" << FieldValue << \", " - "ExpectedValue = \"\n" - << " << ExpectedValue << \": \"\n" - << " << ((ExpectedValue == FieldValue) ? \"PASS\\n\" : " - "\"FAIL\\n\"));\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_CheckPredicate: {\n" - << " unsigned Len;\n" - << " // Decode the Predicate Index value.\n" - << " unsigned PIdx = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " // NumToSkip is a plain 24-bit integer.\n" - << " unsigned NumToSkip = *Ptr++;\n" - << " NumToSkip |= (*Ptr++) << 8;\n" - << " NumToSkip |= (*Ptr++) << 16;\n" - << " // Check the predicate.\n" - << " bool Pred;\n" - << " if (!(Pred = checkDecoderPredicate(PIdx, Bits)))\n" - << " Ptr += NumToSkip;\n" - << " (void)Pred;\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckPredicate(\" << PIdx " - "<< \"): \"\n" - << " << (Pred ? 
\"PASS\\n\" : \"FAIL\\n\"));\n" - << "\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_Decode: {\n" - << " unsigned Len;\n" - << " // Decode the Opcode value.\n" - << " unsigned Opc = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " unsigned DecodeIdx = decodeULEB128(Ptr, &Len);\n" - << " Ptr += Len;\n" - << "\n" - << " MI.clear();\n" - << " MI.setOpcode(Opc);\n" - << " bool DecodeComplete;\n" - << " S = decodeToMCInst(S, DecodeIdx, insn, MI, Address, DisAsm, " - "DecodeComplete);\n" - << " assert(DecodeComplete);\n" - << "\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_Decode: opcode \" << Opc\n" - << " << \", using decoder \" << DecodeIdx << \": \"\n" - << " << (S != MCDisassembler::Fail ? \"PASS\" : " - "\"FAIL\") << \"\\n\");\n" - << " return S;\n" - << " }\n" - << " case MCD::OPC_TryDecode: {\n" - << " unsigned Len;\n" - << " // Decode the Opcode value.\n" - << " unsigned Opc = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " unsigned DecodeIdx = decodeULEB128(Ptr, &Len);\n" - << " Ptr += Len;\n" - << " // NumToSkip is a plain 24-bit integer.\n" - << " unsigned NumToSkip = *Ptr++;\n" - << " NumToSkip |= (*Ptr++) << 8;\n" - << " NumToSkip |= (*Ptr++) << 16;\n" - << "\n" - << " // Perform the decode operation.\n" - << " MCInst TmpMI;\n" - << " TmpMI.setOpcode(Opc);\n" - << " bool DecodeComplete;\n" - << " S = decodeToMCInst(S, DecodeIdx, insn, TmpMI, Address, DisAsm, " - "DecodeComplete);\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_TryDecode: opcode \" << " - "Opc\n" - << " << \", using decoder \" << DecodeIdx << \": \");\n" - << "\n" - << " if (DecodeComplete) {\n" - << " // Decoding complete.\n" - << " LLVM_DEBUG(dbgs() << (S != MCDisassembler::Fail ? \"PASS\" : " - "\"FAIL\") << \"\\n\");\n" - << " MI = TmpMI;\n" - << " return S;\n" - << " } else {\n" - << " assert(S == MCDisassembler::Fail);\n" - << " // If the decoding was incomplete, skip.\n" - << " Ptr += NumToSkip;\n" - << " LLVM_DEBUG(dbgs() << \"FAIL: continuing at \" << (Ptr - " - "DecodeTable) << \"\\n\");\n" - << " // Reset decode status. This also drops a SoftFail status " - "that could be\n" - << " // set before the decode attempt.\n" - << " S = MCDisassembler::Success;\n" - << " }\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_SoftFail: {\n" - << " // Decode the mask values.\n" - << " unsigned Len;\n" - << " InsnType PositiveMask = decodeULEB128(++Ptr, &Len);\n" - << " Ptr += Len;\n" - << " InsnType NegativeMask = decodeULEB128(Ptr, &Len);\n" - << " Ptr += Len;\n" - << " bool Fail = (insn & PositiveMask) || (~insn & NegativeMask);\n" - << " if (Fail)\n" - << " S = MCDisassembler::SoftFail;\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_SoftFail: \" << (Fail ? " - "\"FAIL\\n\" : \"PASS\\n\"));\n" - << " break;\n" - << " }\n" - << " case MCD::OPC_Fail: {\n" - << " LLVM_DEBUG(dbgs() << Loc << \": OPC_Fail\\n\");\n" - << " return MCDisassembler::Fail;\n" - << " }\n" - << " }\n" - << " }\n" - << " llvm_unreachable(\"bogosity detected in disassembler state " - "machine!\");\n" - << "}\n\n"; -} - -// Emits disassembler code for instruction decoding. 
-void FixedLenDecoderEmitter::run(raw_ostream &o) {
-  formatted_raw_ostream OS(o);
-  OS << "#include \"llvm/MC/MCInst.h\"\n";
-  OS << "#include \"llvm/Support/DataTypes.h\"\n";
-  OS << "#include \"llvm/Support/Debug.h\"\n";
-  OS << "#include \"llvm/Support/LEB128.h\"\n";
-  OS << "#include \"llvm/Support/raw_ostream.h\"\n";
-  OS << "#include <assert.h>\n";
-  OS << '\n';
-  OS << "namespace llvm {\n\n";
-
-  emitFieldFromInstruction(OS);
-  emitInsertBits(OS);
-
-  Target.reverseBitsForLittleEndianEncoding();
-
-  // Parameterize the decoders based on namespace and instruction width.
-  std::set<StringRef> HwModeNames;
-  const auto &NumberedInstructions = Target.getInstructionsByEnumValue();
-  NumberedEncodings.reserve(NumberedInstructions.size());
-  DenseMap<Record *, unsigned> IndexOfInstruction;
-  // First, collect all HwModes referenced by the target.
-  for (const auto &NumberedInstruction : NumberedInstructions) {
-    IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size();
-
-    if (const RecordVal *RV =
-            NumberedInstruction->TheDef->getValue("EncodingInfos")) {
-      if (auto *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
-        const CodeGenHwModes &HWM = Target.getHwModes();
-        EncodingInfoByHwMode EBM(DI->getDef(), HWM);
-        for (auto &KV : EBM)
-          HwModeNames.insert(HWM.getMode(KV.first).Name);
-      }
-    }
-  }
-
-  // If HwModeNames is empty, add the empty string so we always have one HwMode.
-  if (HwModeNames.empty())
-    HwModeNames.insert("");
-
-  for (const auto &NumberedInstruction : NumberedInstructions) {
-    IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size();
-
-    if (const RecordVal *RV =
-            NumberedInstruction->TheDef->getValue("EncodingInfos")) {
-      if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
-        const CodeGenHwModes &HWM = Target.getHwModes();
-        EncodingInfoByHwMode EBM(DI->getDef(), HWM);
-        for (auto &KV : EBM) {
-          NumberedEncodings.emplace_back(KV.second, NumberedInstruction,
-                                         HWM.getMode(KV.first).Name);
-          HwModeNames.insert(HWM.getMode(KV.first).Name);
-        }
-        continue;
-      }
-    }
-    // This instruction is encoded the same on all HwModes. Emit it for all
-    // HwModes.
-    for (StringRef HwModeName : HwModeNames)
-      NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
-                                     NumberedInstruction, HwModeName);
-  }
-  for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding"))
-    NumberedEncodings.emplace_back(
-        NumberedAlias,
-        &Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf")));
-
-  std::map<std::pair<std::string, unsigned>, std::vector<EncodingIDAndOpcode>>
-      OpcMap;
-  std::map<unsigned, std::vector<OperandInfo>> Operands;
-
-  for (unsigned i = 0; i < NumberedEncodings.size(); ++i) {
-    const Record *EncodingDef = NumberedEncodings[i].EncodingDef;
-    const CodeGenInstruction *Inst = NumberedEncodings[i].Inst;
-    const Record *Def = Inst->TheDef;
-    unsigned Size = EncodingDef->getValueAsInt("Size");
-    if (Def->getValueAsString("Namespace") == "TargetOpcode" ||
-        Def->getValueAsBit("isPseudo") ||
-        Def->getValueAsBit("isAsmParserOnly") ||
-        Def->getValueAsBit("isCodeGenOnly")) {
-      NumEncodingsLackingDisasm++;
-      continue;
-    }
-
-    if (i < NumberedInstructions.size())
-      NumInstructions++;
-    NumEncodings++;
-
-    if (!Size)
-      continue;
-
-    if (populateInstruction(Target, *EncodingDef, *Inst, i, Operands)) {
-      std::string DecoderNamespace =
-          std::string(EncodingDef->getValueAsString("DecoderNamespace"));
-      if (!NumberedEncodings[i].HwModeName.empty())
-        DecoderNamespace +=
-            std::string("_") + NumberedEncodings[i].HwModeName.str();
-      OpcMap[std::make_pair(DecoderNamespace, Size)].emplace_back(
-          i, IndexOfInstruction.find(Def)->second);
-    } else {
-      NumEncodingsOmitted++;
-    }
-  }
-
-  DecoderTableInfo TableInfo;
-  for (const auto &Opc : OpcMap) {
-    // Emit the decoder for this namespace+width combination.
-    ArrayRef<EncodingAndInst> NumberedEncodingsRef(
-        NumberedEncodings.data(), NumberedEncodings.size());
-    FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands,
-                     8 * Opc.first.second, this);
-
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across all
-    // decoders to give more opportunities for uniquing.
-    TableInfo.Table.clear();
-    TableInfo.FixupStack.clear();
-    TableInfo.Table.reserve(16384);
-    TableInfo.FixupStack.emplace_back();
-    FC.emitTableEntries(TableInfo);
-    // Any NumToSkip fixups in the top level scope can resolve to the
-    // OPC_Fail at the end of the table.
-    assert(TableInfo.FixupStack.size() == 1 && "fixup stack phasing error!");
-    // Resolve any NumToSkip fixups in the current scope.
-    resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(),
-                       TableInfo.Table.size());
-    TableInfo.FixupStack.clear();
-
-    TableInfo.Table.push_back(MCD::OPC_Fail);
-
-    // Print the table to the output stream.
-    emitTable(OS, TableInfo.Table, 0, FC.getBitWidth(), Opc.first.first);
-    OS.flush();
-  }
-
-  // Emit the predicate function.
-  emitPredicateFunction(OS, TableInfo.Predicates, 0);
-
-  // Emit the decoder function.
-  emitDecoderFunction(OS, TableInfo.Decoders, 0);
-
-  // Emit the main entry point for the decoder, decodeInstruction().
- emitDecodeInstruction(OS); - - OS << "\n} // end namespace llvm\n"; -} - -namespace llvm { - -void EmitFixedLenDecoder(RecordKeeper &RK, raw_ostream &OS, - const std::string &PredicateNamespace, - const std::string &GPrefix, - const std::string &GPostfix, const std::string &ROK, - const std::string &RFail, const std::string &L) { - FixedLenDecoderEmitter(RK, PredicateNamespace, GPrefix, GPostfix, - ROK, RFail, L).run(OS); -} - -} // end namespace llvm diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp index 0dea1ef00e4b..77e05aebf53a 100644 --- a/llvm/utils/TableGen/GICombinerEmitter.cpp +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -933,28 +933,27 @@ void GICombinerEmitter::run(raw_ostream &OS) { "getRuleIdxForIdentifier(RangePair.first);\n" << " const auto Last = " "getRuleIdxForIdentifier(RangePair.second);\n" - << " if (!First.hasValue() || !Last.hasValue())\n" + << " if (!First || !Last)\n" << " return None;\n" << " if (First >= Last)\n" << " report_fatal_error(\"Beginning of range should be before " "end of range\");\n" << " return {{*First, *Last + 1}};\n" - << " } else if (RangePair.first == \"*\") {\n" + << " }\n" + << " if (RangePair.first == \"*\") {\n" << " return {{0, " << Rules.size() << "}};\n" - << " } else {\n" - << " const auto I = getRuleIdxForIdentifier(RangePair.first);\n" - << " if (!I.hasValue())\n" - << " return None;\n" - << " return {{*I, *I + 1}};\n" << " }\n" - << " return None;\n" + << " const auto I = getRuleIdxForIdentifier(RangePair.first);\n" + << " if (!I)\n" + << " return None;\n" + << " return {{*I, *I + 1}};\n" << "}\n\n"; for (bool Enabled : {true, false}) { OS << "bool " << getClassName() << "RuleConfig::setRule" << (Enabled ? "Enabled" : "Disabled") << "(StringRef RuleIdentifier) {\n" << " auto MaybeRange = getRuleRangeForIdentifier(RuleIdentifier);\n" - << " if (!MaybeRange.hasValue())\n" + << " if (!MaybeRange)\n" << " return false;\n" << " for (auto I = MaybeRange->first; I < MaybeRange->second; ++I)\n" << " DisabledRules." << (Enabled ? 
"reset" : "set") << "(I);\n" diff --git a/llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp b/llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp index 7e037dd03b60..8be32d2effa6 100644 --- a/llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp +++ b/llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp @@ -48,7 +48,7 @@ void GIMatchDag::writeDOTGraph(raw_ostream &OS, StringRef ID) const { << Assignment.first << ")"; Separator = ", "; } - OS << format("|%p|", &N); + OS << llvm::format("|%p|", &N); writePorts("d", N->getOperandInfo()); OS << "}\""; if (N->isMatchRoot()) @@ -82,7 +82,7 @@ void GIMatchDag::writeDOTGraph(raw_ostream &OS, StringRef ID) const { writePorts("s", N->getOperandInfo()); OS << "|" << N->getName() << "|"; N->printDescription(OS); - OS << format("|%p|", &N); + OS << llvm::format("|%p|", &N); writePorts("d", N->getOperandInfo()); OS << "}\",style=dotted]\n"; } diff --git a/llvm/utils/TableGen/GlobalISel/GIMatchTree.h b/llvm/utils/TableGen/GlobalISel/GIMatchTree.h index 56df37731c09..55a86259661d 100644 --- a/llvm/utils/TableGen/GlobalISel/GIMatchTree.h +++ b/llvm/utils/TableGen/GlobalISel/GIMatchTree.h @@ -32,11 +32,11 @@ public: Optional OpIdx = None) : Name(Name), InstrID(InstrID), OpIdx(OpIdx) {} - bool isInstr() const { return !OpIdx.hasValue(); } + bool isInstr() const { return !OpIdx; } StringRef getName() const { return Name; } unsigned getInstrID() const { return InstrID; } unsigned getOpIdx() const { - assert(OpIdx.hasValue() && "Is not an operand binding"); + assert(OpIdx && "Is not an operand binding"); return *OpIdx; } }; diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 018aa7ee2f71..c8eac56d03e6 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -30,6 +30,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenInstruction.h" #include "SubtargetFeatureInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" @@ -465,9 +466,9 @@ public: MatchTableRecord(Optional LabelID_, StringRef EmitStr, unsigned NumElements, unsigned Flags, int64_t RawValue = std::numeric_limits::min()) - : LabelID(LabelID_.getValueOr(~0u)), EmitStr(EmitStr), + : LabelID(LabelID_.value_or(~0u)), EmitStr(EmitStr), NumElements(NumElements), Flags(Flags), RawValue(RawValue) { - assert((!LabelID_.hasValue() || LabelID != ~0u) && + assert((!LabelID_ || LabelID != ~0u) && "This value is reserved for non-labels"); } MatchTableRecord(const MatchTableRecord &Other) = default; @@ -2935,12 +2936,12 @@ public: } void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override { - Table << MatchTable::Opcode(SubOperand.hasValue() ? "GIR_ComplexSubOperandRenderer" - : "GIR_ComplexRenderer") + Table << MatchTable::Opcode(SubOperand ? 
"GIR_ComplexSubOperandRenderer" + : "GIR_ComplexRenderer") << MatchTable::Comment("InsnID") << MatchTable::IntValue(InsnID) << MatchTable::Comment("RendererID") << MatchTable::IntValue(RendererID); - if (SubOperand.hasValue()) + if (SubOperand) Table << MatchTable::Comment("SubOperand") << MatchTable::IntValue(SubOperand.getValue()); Table << MatchTable::Comment(SymbolicName) << MatchTable::LineBreak; @@ -3815,12 +3816,15 @@ Expected GlobalISelEmitter::addBuiltinPredicates( if (!ParsedAddrSpaces.empty()) { InsnMatcher.addPredicate( 0, ParsedAddrSpaces); + return InsnMatcher; } } int64_t MinAlign = Predicate.getMinAlignment(); - if (MinAlign > 0) + if (MinAlign > 0) { InsnMatcher.addPredicate(0, MinAlign); + return InsnMatcher; + } } // G_LOAD is used for both non-extending and any-extending loads. @@ -4269,7 +4273,7 @@ Error GlobalISelEmitter::importChildMatcher( auto MaybeInsnOperand = OM.addPredicate( InsnMatcher.getRuleMatcher(), SrcChild->getName()); - if (!MaybeInsnOperand.hasValue()) { + if (!MaybeInsnOperand) { // This isn't strictly true. If the user were to provide exactly the same // matchers as the original operand then we could allow it. However, it's // simpler to not permit the redundant specification. @@ -4400,7 +4404,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( TreePatternNode *DstChild) { const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName()); - if (SubOperand.hasValue()) { + if (SubOperand) { DstMIBuilder.addRenderer( *std::get<0>(*SubOperand), DstChild->getName(), std::get<1>(*SubOperand), std::get<2>(*SubOperand)); @@ -4802,7 +4806,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( const auto SrcRCDstRCPair = RC->getMatchingSubClassWithSubRegs(CGRegs, SubIdx); - if (SrcRCDstRCPair.hasValue()) { + if (SrcRCDstRCPair) { assert(SrcRCDstRCPair->second && "Couldn't find a matching subclass"); if (SrcRCDstRCPair->first != RC) return failedImport("EXTRACT_SUBREG requires an additional COPY"); @@ -5533,6 +5537,7 @@ std::vector GlobalISelEmitter::optimizeRules( ProcessCurrentGroup(); LLVM_DEBUG(dbgs() << "NumGroups: " << NumGroups << "\n"); + (void) NumGroups; assert(CurrentGroup->empty() && "The last group wasn't properly processed"); return OptRules; } diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 3c92aa0cc27a..a7a4f4f5f1a7 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -36,6 +36,12 @@ using namespace llvm; +cl::OptionCategory InstrInfoEmitterCat("Options for -gen-instr-info"); +static cl::opt ExpandMIOperandInfo( + "instr-info-expand-mi-operand-info", + cl::desc("Expand operand's MIOperandInfo DAG into suboperands"), + cl::cat(InstrInfoEmitterCat), cl::init(true)); + namespace { class InstrInfoEmitter { @@ -379,6 +385,9 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "namespace " << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n"; + auto getInstrName = [&](int I) -> StringRef { + return NumberedInstructions[I]->TheDef->getName(); + }; // TODO: Factor out duplicate operand lists to compress the tables. 
if (!NumberedInstructions.empty()) { std::vector OperandOffsets; @@ -388,7 +397,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OperandOffsets.push_back(CurrentOffset); for (const auto &Op : Inst->Operands) { const DagInit *MIOI = Op.MIOperandInfo; - if (!MIOI || MIOI->getNumArgs() == 0) { + if (!ExpandMIOperandInfo || !MIOI || MIOI->getNumArgs() == 0) { // Single, anonymous, operand. OperandRecords.push_back(Op.Rec); ++CurrentOffset; @@ -408,8 +417,10 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << ((OperandRecords.size() <= UINT16_MAX) ? " const uint16_t" : " const uint32_t"); OS << " Offsets[] = {\n"; - for (int I = 0, E = OperandOffsets.size(); I != E; ++I) + for (int I = 0, E = OperandOffsets.size(); I != E; ++I) { + OS << " /* " << getInstrName(I) << " */\n"; OS << " " << OperandOffsets[I] << ",\n"; + } OS << " };\n"; // Add an entry for the end so that we don't need to special case it below. @@ -419,22 +430,22 @@ void InstrInfoEmitter::emitOperandTypeMappings( // Size the signed integer operand type to save space. assert(EnumVal <= INT16_MAX && "Too many operand types for operand types table"); + OS << "\n using namespace OpTypes;\n"; OS << ((EnumVal <= INT8_MAX) ? " const int8_t" : " const int16_t"); OS << " OpcodeOperandTypes[] = {\n "; - for (int I = 0, E = OperandRecords.size(), CurOffset = 1; I != E; ++I) { + for (int I = 0, E = OperandRecords.size(), CurOffset = 0; I != E; ++I) { // We print each Opcode's operands in its own row. if (I == OperandOffsets[CurOffset]) { - OS << "\n "; - // If there are empty rows, mark them with an empty comment. + OS << "\n /* " << getInstrName(CurOffset) << " */\n "; while (OperandOffsets[++CurOffset] == I) - OS << "/**/\n "; + OS << "/* " << getInstrName(CurOffset) << " */\n "; } Record *OpR = OperandRecords[I]; if ((OpR->isSubClassOf("Operand") || OpR->isSubClassOf("RegisterOperand") || OpR->isSubClassOf("RegisterClass")) && !OpR->isAnonymous()) - OS << "OpTypes::" << OpR->getName(); + OS << OpR->getName(); else OS << -1; OS << ", "; @@ -449,6 +460,31 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "} // end namespace " << Namespace << "\n"; OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n"; + + OS << "#ifdef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; + OS << "#undef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; + OS << "namespace llvm {\n"; + OS << "namespace " << Namespace << " {\n"; + OS << "LLVM_READONLY\n"; + OS << "static int getMemOperandSize(int OpType) {\n"; + OS << " switch (OpType) {\n"; + std::map> SizeToOperandName; + for (const Record *Op : Operands) { + if (!Op->isSubClassOf("X86MemOperand")) + continue; + if (int Size = Op->getValueAsInt("Size")) + SizeToOperandName[Size].push_back(Op->getName()); + } + OS << " default: return 0;\n"; + for (auto KV : SizeToOperandName) { + for (const StringRef &OperandName : KV.second) + OS << " case OpTypes::" << OperandName << ":\n"; + OS << " return " << KV.first << ";\n\n"; + } + OS << " }\n}\n"; + OS << "} // end namespace " << Namespace << "\n"; + OS << "} // end namespace llvm\n"; + OS << "#endif // GET_INSTRINFO_MEM_OPERAND_SIZE\n\n"; } void InstrInfoEmitter::emitLogicalOperandSizeMappings( @@ -943,6 +979,7 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num, // Emit all of the target independent flags... 
if (Inst.isPreISelOpcode) OS << "|(1ULL< &Sig) { + // clang-format off if (MVT(VT).isInteger()) { unsigned BitWidth = MVT(VT).getFixedSizeInBits(); switch (BitWidth) { default: PrintFatalError("unhandled integer type width in intrinsic!"); case 1: return Sig.push_back(IIT_I1); + case 2: return Sig.push_back(IIT_I2); + case 4: return Sig.push_back(IIT_I4); case 8: return Sig.push_back(IIT_I8); case 16: return Sig.push_back(IIT_I16); case 32: return Sig.push_back(IIT_I32); @@ -291,6 +297,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT, case MVT::funcref: return Sig.push_back(IIT_FUNCREF); } + // clang-format on } #if defined(_MSC_VER) && !defined(__clang__) @@ -327,6 +334,13 @@ static void EncodeFixedType(Record *R, std::vector &ArgCodes, // Encode LLVMMatchType ArgNo Sig.push_back(Number); return; + } else if (R->isSubClassOf("LLVMAnyPointerToElt")) { + Sig.push_back(IIT_ANYPTR_TO_ELT); + // Encode overloaded ArgNo + Sig.push_back(NextArgCode++); + // Encode LLVMMatchType ArgNo + Sig.push_back(Number); + return; } else if (R->isSubClassOf("LLVMPointerToElt")) Sig.push_back(IIT_PTR_TO_ELT); else if (R->isSubClassOf("LLVMVectorElementType")) @@ -415,6 +429,9 @@ static void UpdateArgCodes(Record *R, std::vector &ArgCodes, if (R->isSubClassOf("LLVMVectorOfAnyPointersToElt")) { ArgCodes.push_back(3 /*vAny*/); ++NumInserted; + } else if (R->isSubClassOf("LLVMAnyPointerToElt")) { + ArgCodes.push_back(4 /*iPTRAny*/); + ++NumInserted; } return; } @@ -599,6 +616,9 @@ struct AttributeComparator { if (L->isNoReturn != R->isNoReturn) return R->isNoReturn; + if (L->isNoCallback != R->isNoCallback) + return R->isNoCallback; + if (L->isNoSync != R->isNoSync) return R->isNoSync; @@ -748,16 +768,18 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, if (!Intrinsic.canThrow || (Intrinsic.ModRef != CodeGenIntrinsic::ReadWriteMem && !Intrinsic.hasSideEffects) || - Intrinsic.isNoReturn || Intrinsic.isNoSync || Intrinsic.isNoFree || - Intrinsic.isWillReturn || Intrinsic.isCold || Intrinsic.isNoDuplicate || - Intrinsic.isNoMerge || Intrinsic.isConvergent || - Intrinsic.isSpeculatable) { + Intrinsic.isNoReturn || Intrinsic.isNoCallback || Intrinsic.isNoSync || + Intrinsic.isNoFree || Intrinsic.isWillReturn || Intrinsic.isCold || + Intrinsic.isNoDuplicate || Intrinsic.isNoMerge || + Intrinsic.isConvergent || Intrinsic.isSpeculatable) { OS << " const Attribute::AttrKind Atts[] = {"; ListSeparator LS(","); if (!Intrinsic.canThrow) OS << LS << "Attribute::NoUnwind"; if (Intrinsic.isNoReturn) OS << LS << "Attribute::NoReturn"; + if (Intrinsic.isNoCallback) + OS << LS << "Attribute::NoCallback"; if (Intrinsic.isNoSync) OS << LS << "Attribute::NoSync"; if (Intrinsic.isNoFree) @@ -858,14 +880,15 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, } void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( - const CodeGenIntrinsicTable &Ints, bool IsGCC, raw_ostream &OS) { - StringRef CompilerName = (IsGCC ? "GCC" : "MS"); + const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS) { + StringRef CompilerName = (IsClang ? "Clang" : "MS"); + StringRef UpperCompilerName = (IsClang ? "CLANG" : "MS"); typedef std::map> BIMTy; BIMTy BuiltinMap; StringToOffsetTable Table; for (unsigned i = 0, e = Ints.size(); i != e; ++i) { const std::string &BuiltinName = - IsGCC ? Ints[i].GCCBuiltinName : Ints[i].MSBuiltinName; + IsClang ? Ints[i].ClangBuiltinName : Ints[i].MSBuiltinName; if (!BuiltinName.empty()) { // Get the map for this target prefix. 
std::map &BIM = @@ -883,7 +906,7 @@ void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( OS << "// This is used by the C front-end. The builtin name is passed\n"; OS << "// in as BuiltinName, and a target prefix (e.g. 'ppc') is passed\n"; OS << "// in as TargetPrefix. The result is assigned to 'IntrinsicID'.\n"; - OS << "#ifdef GET_LLVM_INTRINSIC_FOR_" << CompilerName << "_BUILTIN\n"; + OS << "#ifdef GET_LLVM_INTRINSIC_FOR_" << UpperCompilerName << "_BUILTIN\n"; OS << "Intrinsic::ID Intrinsic::getIntrinsicFor" << CompilerName << "Builtin(const char " diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp index d54132f3190b..182cd0076090 100644 --- a/llvm/utils/TableGen/OptParserEmitter.cpp +++ b/llvm/utils/TableGen/OptParserEmitter.cpp @@ -172,7 +172,7 @@ static MarshallingInfo createMarshallingInfo(const Record &R) { Ret.NormalizedValuesScope = R.getValueAsString("NormalizedValuesScope"); Ret.ImpliedCheck = R.getValueAsString("ImpliedCheck"); Ret.ImpliedValue = - R.getValueAsOptionalString("ImpliedValue").getValueOr(Ret.DefaultValue); + R.getValueAsOptionalString("ImpliedValue").value_or(Ret.DefaultValue); Ret.ShouldParse = R.getValueAsString("ShouldParse"); Ret.Normalizer = R.getValueAsString("Normalizer"); diff --git a/llvm/utils/TableGen/OptRSTEmitter.cpp b/llvm/utils/TableGen/OptRSTEmitter.cpp index 11d896229f5b..03c7326e817a 100644 --- a/llvm/utils/TableGen/OptRSTEmitter.cpp +++ b/llvm/utils/TableGen/OptRSTEmitter.cpp @@ -60,18 +60,43 @@ void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) { // Print the option name. OS << R->getValueAsString("Name"); + StringRef MetaVarName; // Print the meta-variable. if (!isa(R->getValueInit("MetaVarName"))) { + MetaVarName = R->getValueAsString("MetaVarName"); + } else if (!isa(R->getValueInit("Values"))) + MetaVarName = ""; + + if (!MetaVarName.empty()) { OS << '='; - OS.write_escaped(R->getValueAsString("MetaVarName")); + OS.write_escaped(MetaVarName); } OS << "\n\n"; + std::string HelpText; // The option help text. 
if (!isa(R->getValueInit("HelpText"))) { + HelpText = R->getValueAsString("HelpText").trim().str(); + if (!HelpText.empty() && HelpText.back() != '.') + HelpText.push_back('.'); + } + + if (!isa(R->getValueInit("Values"))) { + SmallVector Values; + SplitString(R->getValueAsString("Values"), Values, ","); + HelpText += (" " + MetaVarName + " must be '").str(); + + if (Values.size() > 1) { + HelpText += join(Values.begin(), Values.end() - 1, "', '"); + HelpText += "' or '"; + } + HelpText += (Values.front() + "'.").str(); + } + + if (!HelpText.empty()) { OS << ' '; - OS.write_escaped(R->getValueAsString("HelpText")); + OS.write_escaped(HelpText); OS << "\n\n"; } } diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp index 6acb630299c1..dc04174217fb 100644 --- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp @@ -109,7 +109,8 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn, OperandMap[BaseIdx + i].Data.Imm = II->getValue(); ++OpsAdded; } else if (auto *BI = dyn_cast(Dag->getArg(i))) { - auto *II = cast(BI->convertInitializerTo(IntRecTy::get())); + auto *II = + cast(BI->convertInitializerTo(IntRecTy::get(Records))); OperandMap[BaseIdx + i].Kind = OpData::Imm; OperandMap[BaseIdx + i].Data.Imm = II->getValue(); ++OpsAdded; diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index d97d7acb87a7..e6689b211a7d 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -172,9 +172,8 @@ static void visitRegisterBankClasses( SmallPtrSetImpl &VisitedRCs) { // Make sure we only visit each class once to avoid infinite loops. - if (VisitedRCs.count(RC)) + if (!VisitedRCs.insert(RC).second) return; - VisitedRCs.insert(RC); // Visit each explicitly named class. 
VisitFn(RC, Kind.str()); @@ -266,9 +265,8 @@ void RegisterBankEmitter::emitBaseClassImplementation( << "::NumRegisterBanks) {\n" << " // Assert that RegBank indices match their ID's\n" << "#ifndef NDEBUG\n" - << " unsigned Index = 0;\n" - << " for (const auto &RB : RegBanks)\n" - << " assert(Index++ == RB->getID() && \"Index != ID\");\n" + << " for (auto RB : enumerate(RegBanks))\n" + << " assert(RB.index() == RB.value()->getID() && \"Index != ID\");\n" << "#endif // NDEBUG\n" << "}\n" << "} // end namespace llvm\n"; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 1ed7bc103f9c..3a0fa564074e 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -268,7 +268,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank, OS << "// Get the name of this register unit pressure set.\n" << "const char *" << ClassName << "::\n" << "getRegPressureSetName(unsigned Idx) const {\n" - << " static const char *const PressureNameTable[] = {\n"; + << " static const char *PressureNameTable[] = {\n"; unsigned MaxRegUnitWeight = 0; for (unsigned i = 0; i < NumSets; ++i ) { const RegUnitSet &RegUnits = RegBank.getRegSetAt(i); @@ -753,7 +753,7 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, } OS << " };\n\n"; - OS << " --IdxA; assert(IdxA < " << SubRegIndicesSize << ");\n" + OS << " --IdxA; assert(IdxA < " << SubRegIndicesSize << "); (void) IdxA;\n" << " --IdxB; assert(IdxB < " << SubRegIndicesSize << ");\n"; if (Rows.size() > 1) OS << " return Rows[RowMap[IdxA]][IdxB];\n"; @@ -814,12 +814,14 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, OS << " // Sequence " << Idx << "\n"; Idx += Sequence.size() + 1; } + auto *IntType = getMinimalTypeForRange(*std::max_element( + SubReg2SequenceIndexMap.begin(), SubReg2SequenceIndexMap.end())); OS << " };\n" - " static const MaskRolOp *const CompositeSequences[] = {\n"; + " static const " + << IntType << " CompositeSequences[] = {\n"; for (size_t i = 0, e = SubRegIndices.size(); i != e; ++i) { OS << " "; - unsigned Idx = SubReg2SequenceIndexMap[i]; - OS << format("&LaneMaskComposeSequences[%u]", Idx); + OS << SubReg2SequenceIndexMap[i]; if (i+1 != e) OS << ","; OS << " // to " << SubRegIndices[i].getName() << "\n"; @@ -832,7 +834,9 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, " --IdxA; assert(IdxA < " << SubRegIndices.size() << " && \"Subregister index out of bounds\");\n" " LaneBitmask Result;\n" - " for (const MaskRolOp *Ops = CompositeSequences[IdxA]; Ops->Mask.any(); ++Ops) {\n" + " for (const MaskRolOp *Ops =\n" + " &LaneMaskComposeSequences[CompositeSequences[IdxA]];\n" + " Ops->Mask.any(); ++Ops) {\n" " LaneBitmask::Type M = LaneMask.getAsInteger() & Ops->Mask.getAsInteger();\n" " if (unsigned S = Ops->RotateLeft)\n" " Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - S)));\n" @@ -849,7 +853,9 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, " --IdxA; assert(IdxA < " << SubRegIndices.size() << " && \"Subregister index out of bounds\");\n" " LaneBitmask Result;\n" - " for (const MaskRolOp *Ops = CompositeSequences[IdxA]; Ops->Mask.any(); ++Ops) {\n" + " for (const MaskRolOp *Ops =\n" + " &LaneMaskComposeSequences[CompositeSequences[IdxA]];\n" + " Ops->Mask.any(); ++Ops) {\n" " LaneBitmask::Type M = LaneMask.getAsInteger();\n" " if (unsigned S = Ops->RotateLeft)\n" " Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - S)));\n" @@ 
-1046,25 +1052,24 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, RegClassStrings.add(Name); - // Emit the register list now. - OS << " // " << Name << " Register Class...\n" - << " const MCPhysReg " << Name - << "[] = {\n "; - for (Record *Reg : Order) { - OS << getQualifiedName(Reg) << ", "; - } - OS << "\n };\n\n"; - - OS << " // " << Name << " Bit set.\n" - << " const uint8_t " << Name - << "Bits[] = {\n "; - BitVectorEmitter BVE; - for (Record *Reg : Order) { - BVE.add(Target.getRegBank().getReg(Reg)->EnumValue); - } - BVE.print(OS); - OS << "\n };\n\n"; + // Emit the register list now (unless it would be a zero-length array). + if (!Order.empty()) { + OS << " // " << Name << " Register Class...\n" + << " const MCPhysReg " << Name << "[] = {\n "; + for (Record *Reg : Order) { + OS << getQualifiedName(Reg) << ", "; + } + OS << "\n };\n\n"; + OS << " // " << Name << " Bit set.\n" + << " const uint8_t " << Name << "Bits[] = {\n "; + BitVectorEmitter BVE; + for (Record *Reg : Order) { + BVE.add(Target.getRegBank().getReg(Reg)->EnumValue); + } + BVE.print(OS); + OS << "\n };\n\n"; + } } OS << "} // end anonymous namespace\n\n"; @@ -1076,14 +1081,17 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, << "MCRegisterClasses[] = {\n"; for (const auto &RC : RegisterClasses) { + ArrayRef Order = RC.getOrder(); + std::string RCName = Order.empty() ? "nullptr" : RC.getName(); + std::string RCBitsName = Order.empty() ? "nullptr" : RC.getName() + "Bits"; + std::string RCBitsSize = Order.empty() ? "0" : "sizeof(" + RCBitsName + ")"; assert(isInt<8>(RC.CopyCost) && "Copy cost too large."); uint32_t RegSize = 0; if (RC.RSI.isSimple()) RegSize = RC.RSI.getSimple().RegSize; - OS << " { " << RC.getName() << ", " << RC.getName() << "Bits, " + OS << " { " << RCName << ", " << RCBitsName << ", " << RegClassStrings.get(RC.getName()) << ", " << RC.getOrder().size() - << ", sizeof(" << RC.getName() << "Bits), " - << RC.getQualifiedName() + "RegClassID" + << ", " << RCBitsSize << ", " << RC.getQualifiedName() + "RegClassID" << ", " << RegSize << ", " << RC.CopyCost << ", " << (RC.Allocatable ? "true" : "false") << " },\n"; } @@ -1176,6 +1184,12 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, << "unsigned RegUnit) const override;\n" << " ArrayRef getRegMaskNames() const override;\n" << " ArrayRef getRegMasks() const override;\n" + << " bool isGeneralPurposeRegister(const MachineFunction &, " + << "MCRegister) const override;\n" + << " bool isFixedRegister(const MachineFunction &, " + << "MCRegister) const override;\n" + << " bool isArgumentRegister(const MachineFunction &, " + << "MCRegister) const override;\n" << " /// Devirtualized TargetFrameLowering.\n" << " static const " << TargetName << "FrameLowering *getFrameLowering(\n" << " const MachineFunction &MF);\n" @@ -1250,7 +1264,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "};\n"; // Emit SubRegIndex names, skipping 0. 
- OS << "\nstatic const char *const SubRegIndexNameTable[] = { \""; + OS << "\nstatic const char *SubRegIndexNameTable[] = { \""; for (const auto &Idx : SubRegIndices) { OS << Idx.getName(); @@ -1620,10 +1634,54 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << "}\n\n"; + const std::list &RegCategories = + RegBank.getRegCategories(); + OS << "bool " << ClassName << "::\n" + << "isGeneralPurposeRegister(const MachineFunction &MF, " + << "MCRegister PhysReg) const {\n" + << " return\n"; + for (const CodeGenRegisterCategory &Category : RegCategories) + if (Category.getName() == "GeneralPurposeRegisters") { + for (const CodeGenRegisterClass *RC : Category.getClasses()) + OS << " " << RC->getQualifiedName() + << "RegClass.contains(PhysReg) ||\n"; + break; + } + OS << " false;\n"; + OS << "}\n\n"; + + OS << "bool " << ClassName << "::\n" + << "isFixedRegister(const MachineFunction &MF, " + << "MCRegister PhysReg) const {\n" + << " return\n"; + for (const CodeGenRegisterCategory &Category : RegCategories) + if (Category.getName() == "FixedRegisters") { + for (const CodeGenRegisterClass *RC : Category.getClasses()) + OS << " " << RC->getQualifiedName() + << "RegClass.contains(PhysReg) ||\n"; + break; + } + OS << " false;\n"; + OS << "}\n\n"; + + OS << "bool " << ClassName << "::\n" + << "isArgumentRegister(const MachineFunction &MF, " + << "MCRegister PhysReg) const {\n" + << " return\n"; + for (const CodeGenRegisterCategory &Category : RegCategories) + if (Category.getName() == "ArgumentRegisters") { + for (const CodeGenRegisterClass *RC : Category.getClasses()) + OS << " " << RC->getQualifiedName() + << "RegClass.contains(PhysReg) ||\n"; + break; + } + OS << " false;\n"; + OS << "}\n\n"; + OS << "ArrayRef " << ClassName << "::getRegMaskNames() const {\n"; if (!CSRSets.empty()) { - OS << " static const char *const Names[] = {\n"; + OS << " static const char *Names[] = {\n"; for (Record *CSRSet : CSRSets) OS << " " << '"' << CSRSet->getName() << '"' << ",\n"; OS << " };\n"; @@ -1683,6 +1741,8 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { OS << "\tLaneMask: " << PrintLaneMask(RC.LaneMask) << '\n'; OS << "\tHasDisjunctSubRegs: " << RC.HasDisjunctSubRegs << '\n'; OS << "\tCoveredBySubRegs: " << RC.CoveredBySubRegs << '\n'; + OS << "\tAllocatable: " << RC.Allocatable << '\n'; + OS << "\tAllocationPriority: " << unsigned(RC.AllocationPriority) << '\n'; OS << "\tRegs:"; for (const CodeGenRegister *R : RC.getMembers()) { OS << " " << R->getName(); diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index dc5c96c662be..ea849807de03 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -30,7 +30,9 @@ using namespace llvm; namespace { int getAsInt(Init *B) { - return cast(B->convertInitializerTo(IntRecTy::get()))->getValue(); + return cast( + B->convertInitializerTo(IntRecTy::get(B->getRecordKeeper()))) + ->getValue(); } int getInt(Record *R, StringRef Field) { return getAsInt(R->getValueInit(Field)); diff --git a/llvm/utils/TableGen/SequenceToOffsetTable.h b/llvm/utils/TableGen/SequenceToOffsetTable.h index 41cdefdb1949..1b3451c24cb0 100644 --- a/llvm/utils/TableGen/SequenceToOffsetTable.h +++ b/llvm/utils/TableGen/SequenceToOffsetTable.h @@ -170,18 +170,18 @@ public: /// `EmitLongStrLiterals` is false void emitStringLiteralDef(raw_ostream &OS, const llvm::Twine &Decl) const { assert(Entries && "Call layout() before emitStringLiteralDef()"); 
- if (EmitLongStrLiterals) { - OS << "\n#ifdef __GNUC__\n" - << "#pragma GCC diagnostic push\n" - << "#pragma GCC diagnostic ignored \"-Woverlength-strings\"\n" - << "#endif\n" - << Decl << " = {\n"; - } else { + if (!EmitLongStrLiterals) { OS << Decl << " = {\n"; emit(OS, printChar, "0"); - OS << "\n};\n\n"; + OS << " 0\n};\n\n"; return; } + + OS << "\n#ifdef __GNUC__\n" + << "#pragma GCC diagnostic push\n" + << "#pragma GCC diagnostic ignored \"-Woverlength-strings\"\n" + << "#endif\n" + << Decl << " = {\n"; for (auto I : Seqs) { OS << " /* " << I.second << " */ \""; for (auto C : I.first) { diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 78bbb3196e5c..88827607b517 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -74,6 +74,7 @@ class SubtargetEmitter { std::string Target; void Enumeration(raw_ostream &OS, DenseMap &FeatureMap); + void EmitSubtargetInfoMacroCalls(raw_ostream &OS); unsigned FeatureKeyValues(raw_ostream &OS, const DenseMap &FeatureMap); unsigned CPUKeyValues(raw_ostream &OS, @@ -122,8 +123,7 @@ class SubtargetEmitter { void EmitSchedModel(raw_ostream &OS); void EmitHwModeCheck(const std::string &ClassName, raw_ostream &OS); - void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures, - unsigned NumProcs); + void ParseFeaturesFunction(raw_ostream &OS); public: SubtargetEmitter(RecordKeeper &R, CodeGenTarget &TGT) @@ -193,6 +193,42 @@ static void printFeatureMask(raw_ostream &OS, RecVec &FeatureList, OS << "} } }"; } +/// Emit some information about the SubtargetFeature as calls to a macro so +/// that they can be used from C++. +void SubtargetEmitter::EmitSubtargetInfoMacroCalls(raw_ostream &OS) { + OS << "\n#ifdef GET_SUBTARGETINFO_MACRO\n"; + + std::vector FeatureList = + Records.getAllDerivedDefinitions("SubtargetFeature"); + llvm::sort(FeatureList, LessRecordFieldName()); + + for (const Record *Feature : FeatureList) { + const StringRef Attribute = Feature->getValueAsString("Attribute"); + const StringRef Value = Feature->getValueAsString("Value"); + + // Only handle boolean features for now, excluding BitVectors and enums. + const bool IsBool = (Value == "false" || Value == "true") && + !StringRef(Attribute).contains('['); + if (!IsBool) + continue; + + // Some features default to true, with values set to false if enabled. + const char *Default = Value == "false" ? "true" : "false"; + + // Define the getter with lowercased first char: xxxYyy() { return XxxYyy; } + const std::string Getter = + Attribute.substr(0, 1).lower() + Attribute.substr(1).str(); + + OS << "GET_SUBTARGETINFO_MACRO(" << Attribute << ", " << Default << ", " + << Getter << ")\n"; + } + OS << "#undef GET_SUBTARGETINFO_MACRO\n"; + OS << "#endif // GET_SUBTARGETINFO_MACRO\n\n"; + + OS << "\n#ifdef GET_SUBTARGETINFO_MC_DESC\n"; + OS << "#undef GET_SUBTARGETINFO_MC_DESC\n\n"; +} + // // FeatureKeyValues - Emit data of all the subtarget features. Used by the // command line. @@ -1681,13 +1717,9 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName, OS << " return 0;\n}\n"; } -// -// ParseFeaturesFunction - Produces a subtarget specific function for parsing +// Produces a subtarget specific function for parsing // the subtarget features string. 
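A hedged sketch of the consumer side of EmitSubtargetInfoMacroCalls above: a target's Subtarget class defines GET_SUBTARGETINFO_MACRO before including the generated .inc, so each emitted GET_SUBTARGETINFO_MACRO(Attribute, Default, getter) line expands to a getter. Only the three-parameter macro shape comes from the emitter; the class, member, and .inc names are invented for illustration:

```cpp
class MyTargetSubtarget {
  bool HasFancyOp = false; // would be set by ParseSubtargetFeatures()

public:
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
  bool GETTER() const { return ATTRIBUTE; }
  // #include "MyTargetGenSubtargetInfo.inc" would pull in lines such as
  //   GET_SUBTARGETINFO_MACRO(HasFancyOp, false, hasFancyOp)
  bool hasFancyOp() const { return HasFancyOp; } // what one expansion yields
#undef GET_SUBTARGETINFO_MACRO
};
```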
-// -void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS, - unsigned NumFeatures, - unsigned NumProcs) { +void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { std::vector Features = Records.getAllDerivedDefinitions("SubtargetFeature"); llvm::sort(Features, LessRecord()); @@ -1803,8 +1835,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "} // end namespace llvm\n\n"; OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n"; - OS << "\n#ifdef GET_SUBTARGETINFO_MC_DESC\n"; - OS << "#undef GET_SUBTARGETINFO_MC_DESC\n\n"; + EmitSubtargetInfoMacroCalls(OS); OS << "namespace llvm {\n"; #if 0 @@ -1858,7 +1889,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "#include \"llvm/Support/Debug.h\"\n"; OS << "#include \"llvm/Support/raw_ostream.h\"\n\n"; - ParseFeaturesFunction(OS, NumFeatures, NumProcs); + ParseFeaturesFunction(OS); OS << "#endif // GET_SUBTARGETINFO_TARGET_DESC\n\n"; diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp index 33a22776f2df..f4f360fb5be2 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp @@ -108,6 +108,39 @@ void SubtargetFeatureInfo::emitComputeAvailableFeatures( OS << "}\n\n"; } +// If ParenIfBinOp is true, print a surrounding () if Val uses && or ||. +static bool emitFeaturesAux(StringRef TargetName, const Init &Val, + bool ParenIfBinOp, raw_ostream &OS) { + if (auto *D = dyn_cast(&Val)) { + if (!D->getDef()->isSubClassOf("SubtargetFeature")) + return true; + OS << "FB[" << TargetName << "::" << D->getAsString() << "]"; + return false; + } + if (auto *D = dyn_cast(&Val)) { + std::string Op = D->getOperator()->getAsString(); + if (Op == "not" && D->getNumArgs() == 1) { + OS << '!'; + return emitFeaturesAux(TargetName, *D->getArg(0), true, OS); + } + if ((Op == "any_of" || Op == "all_of") && D->getNumArgs() > 0) { + bool Paren = D->getNumArgs() > 1 && std::exchange(ParenIfBinOp, true); + if (Paren) + OS << '('; + ListSeparator LS(Op == "any_of" ? " || " : " && "); + for (auto *Arg : D->getArgs()) { + OS << LS; + if (emitFeaturesAux(TargetName, *Arg, ParenIfBinOp, OS)) + return true; + } + if (Paren) + OS << ')'; + return false; + } + } + return true; +} + void SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures( StringRef TargetName, StringRef ClassName, StringRef FuncName, SubtargetFeatureInfoMap &SubtargetFeatures, raw_ostream &OS) { @@ -118,37 +151,8 @@ void SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures( const SubtargetFeatureInfo &SFI = SF.second; OS << " if ("; - - const DagInit *D = SFI.TheDef->getValueAsDag("AssemblerCondDag"); - std::string CombineType = D->getOperator()->getAsString(); - if (CombineType != "any_of" && CombineType != "all_of") - PrintFatalError(SFI.TheDef->getLoc(), "Invalid AssemblerCondDag!"); - if (D->getNumArgs() == 0) - PrintFatalError(SFI.TheDef->getLoc(), "Invalid AssemblerCondDag!"); - bool IsOr = CombineType == "any_of"; - - if (IsOr) - OS << "("; - - ListSeparator LS(IsOr ? 
" || " : " && "); - for (auto *Arg : D->getArgs()) { - OS << LS; - if (auto *NotArg = dyn_cast(Arg)) { - if (NotArg->getOperator()->getAsString() != "not" || - NotArg->getNumArgs() != 1) - PrintFatalError(SFI.TheDef->getLoc(), "Invalid AssemblerCondDag!"); - Arg = NotArg->getArg(0); - OS << "!"; - } - if (!isa(Arg) || - !cast(Arg)->getDef()->isSubClassOf("SubtargetFeature")) - PrintFatalError(SFI.TheDef->getLoc(), "Invalid AssemblerCondDag!"); - OS << "FB[" << TargetName << "::" << Arg->getAsString() << "]"; - } - - if (IsOr) - OS << ")"; - + emitFeaturesAux(TargetName, *SFI.TheDef->getValueAsDag("AssemblerCondDag"), + /*ParenIfBinOp=*/false, OS); OS << ")\n"; OS << " Features.set(" << SFI.getEnumBitName() << ");\n"; } diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 2d4a45f889be..efd641887232 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -25,7 +25,6 @@ enum ActionType { NullBackend, DumpJSON, GenEmitter, - GenCodeBeads, GenRegisterInfo, GenInstrInfo, GenInstrDocs, @@ -52,11 +51,13 @@ enum ActionType { GenGICombiner, GenX86EVEX2VEXTables, GenX86FoldTables, + GenX86MnemonicTables, GenRegisterBank, GenExegesis, GenAutomata, GenDirectivesEnumDecl, GenDirectivesEnumImpl, + GenDXILOperation, }; namespace llvm { @@ -81,8 +82,6 @@ cl::opt Action( clEnumValN(DumpJSON, "dump-json", "Dump all records as machine-readable JSON"), clEnumValN(GenEmitter, "gen-emitter", "Generate machine code emitter"), - clEnumValN(GenCodeBeads, "gen-code-beads", - "Generate machine code beads"), clEnumValN(GenRegisterInfo, "gen-register-info", "Generate registers and register classes info"), clEnumValN(GenInstrInfo, "gen-instr-info", @@ -130,6 +129,8 @@ cl::opt Action( "Generate X86 EVEX to VEX compress tables"), clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", "Generate X86 fold tables"), + clEnumValN(GenX86MnemonicTables, "gen-x86-mnemonic-tables", + "Generate X86 mnemonic tables"), clEnumValN(GenRegisterBank, "gen-register-bank", "Generate registers bank descriptions"), clEnumValN(GenExegesis, "gen-exegesis", @@ -138,7 +139,9 @@ cl::opt Action( clEnumValN(GenDirectivesEnumDecl, "gen-directive-decl", "Generate directive related declaration code (header file)"), clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl", - "Generate directive related implementation code"))); + "Generate directive related implementation code"), + clEnumValN(GenDXILOperation, "gen-dxil-operation", + "Generate DXIL operation information"))); cl::OptionCategory PrintEnumsCat("Options for -print-enums"); cl::opt Class("class", cl::desc("Print Enum list for this class"), @@ -161,9 +164,6 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenEmitter: EmitCodeEmitter(Records, OS); break; - case GenCodeBeads: - EmitCodeBeads(Records, OS); - break; case GenRegisterInfo: EmitRegisterInfo(Records, OS); break; @@ -257,6 +257,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenX86EVEX2VEXTables: EmitX86EVEX2VEXTables(Records, OS); break; + case GenX86MnemonicTables: + EmitX86MnemonicTables(Records, OS); + break; case GenX86FoldTables: EmitX86FoldTables(Records, OS); break; @@ -272,6 +275,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenDirectivesEnumImpl: EmitDirectivesImpl(Records, OS); break; + case GenDXILOperation: + EmitDXILOperation(Records, OS); + break; } return false; diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h index 
71db8dc77b05..4dff13095696 100644 --- a/llvm/utils/TableGen/TableGenBackends.h +++ b/llvm/utils/TableGen/TableGenBackends.h @@ -67,7 +67,6 @@ void EmitAsmMatcher(RecordKeeper &RK, raw_ostream &OS); void EmitAsmWriter(RecordKeeper &RK, raw_ostream &OS); void EmitCallingConv(RecordKeeper &RK, raw_ostream &OS); void EmitCodeEmitter(RecordKeeper &RK, raw_ostream &OS); -void EmitCodeBeads(RecordKeeper &RK, raw_ostream &OS); void EmitDAGISel(RecordKeeper &RK, raw_ostream &OS); void EmitDFAPacketizer(RecordKeeper &RK, raw_ostream &OS); void EmitDisassembler(RecordKeeper &RK, raw_ostream &OS); @@ -88,11 +87,13 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS); void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); +void EmitX86MnemonicTables(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); void EmitExegesis(RecordKeeper &RK, raw_ostream &OS); void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS); void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS); +void EmitDXILOperation(RecordKeeper &RK, raw_ostream &OS); } // End llvm namespace diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp new file mode 100644 index 000000000000..a6bbe2f7ff37 --- /dev/null +++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp @@ -0,0 +1,487 @@ +//===- VarLenCodeEmitterGen.cpp - CEG for variable-length insts -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The CodeEmitterGen component for variable-length instructions.
+//
+// The basic CodeEmitterGen is almost exclusively designed for fixed-
+// length instructions. A good analogy for its encoding scheme is how printf
+// works: the (immutable) format string represents the fixed values in the
+// encoded instruction. Placeholders (i.e. %something), on the other hand,
+// represent the encoding for instruction operands.
+// ```
+// printf("1101 %src 1001 %dst", <encoding for operand $src>,
+//        <encoding for operand $dst>);
+// ```
+// VarLenCodeEmitterGen in this file provides an alternative encoding scheme
+// that works more like a C++ stream operator:
+// ```
+// OS << 0b1101;
+// if (Cond)
+//   OS << OperandEncoding0;
+// OS << 0b1001 << OperandEncoding1;
+// ```
+// You are free to concatenate arbitrary types (and sizes) of encoding
+// fragments at any bit position, giving more flexibility in defining
+// encodings for variable-length instructions.
+//
+// More concretely, an instruction encoding is represented by a DAG-typed
+// `Inst` field. Here is an example:
+// ```
+// dag Inst = (descend 0b1101, (operand "$src", 4), 0b1001,
+//                             (operand "$dst", 4));
+// ```
+// It represents the following instruction encoding:
+// ```
+// MSB                                           LSB
+// 1101<encoding for $src>1001<encoding for $dst>
+// ```
+// For more details about DAG operators in the above snippet, please
+// refer to \file include/llvm/Target/Target.td.
+//
+// VarLenCodeEmitter will convert the above DAG into the same helper function
+// generated by CodeEmitter, `MCCodeEmitter::getBinaryCodeForInstr` (except
+// for a few details).
+#include "VarLenCodeEmitterGen.h"
+#include "CodeGenHwModes.h"
+#include "CodeGenInstruction.h"
+#include "CodeGenTarget.h"
+#include "InfoByHwMode.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+
+using namespace llvm;
+
+namespace {
+
+class VarLenCodeEmitterGen {
+  RecordKeeper &Records;
+
+  DenseMap<Record *, VarLenInst> VarLenInsts;
+
+  // Emit base values (i.e. fixed bits in the encoded instructions)
+  void emitInstructionBaseValues(
+      raw_ostream &OS,
+      ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+      CodeGenTarget &Target, int HwMode = -1);
+
+  std::string getInstructionCase(Record *R, CodeGenTarget &Target);
+  std::string getInstructionCaseForEncoding(Record *R, Record *EncodingDef,
+                                            CodeGenTarget &Target);
+
+public:
+  explicit VarLenCodeEmitterGen(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+
+} // end anonymous namespace
+
+VarLenInst::VarLenInst(const DagInit *DI, const RecordVal *TheDef)
+    : TheDef(TheDef), NumBits(0U) {
+  buildRec(DI);
+  for (const auto &S : Segments)
+    NumBits += S.BitWidth;
+}
+
+void VarLenInst::buildRec(const DagInit *DI) {
+  assert(TheDef && "The def record is nullptr ?");
+
+  std::string Op = DI->getOperator()->getAsString();
+
+  if (Op == "ascend" || Op == "descend") {
+    bool Reverse = Op == "descend";
+    int i = Reverse ? DI->getNumArgs() - 1 : 0;
+    int e = Reverse ? -1 : DI->getNumArgs();
+    int s = Reverse ? -1 : 1;
+    for (; i != e; i += s) {
+      const Init *Arg = DI->getArg(i);
+      if (const auto *BI = dyn_cast<BitsInit>(Arg)) {
+        if (!BI->isComplete())
+          PrintFatalError(TheDef->getLoc(),
+                          "Expecting complete bits init in `" + Op + "`");
+        Segments.push_back({BI->getNumBits(), BI});
+      } else if (const auto *BI = dyn_cast<BitInit>(Arg)) {
+        if (!BI->isConcrete())
+          PrintFatalError(TheDef->getLoc(),
+                          "Expecting concrete bit init in `" + Op + "`");
+        Segments.push_back({1, BI});
+      } else if (const auto *SubDI = dyn_cast<DagInit>(Arg)) {
+        buildRec(SubDI);
+      } else {
+        PrintFatalError(TheDef->getLoc(), "Unrecognized type of argument in `" +
+                                              Op + "`: " + Arg->getAsString());
+      }
+    }
+  } else if (Op == "operand") {
+    // (operand <operand name>, <# of bits>, [(encoder <custom encoder>)])
+    if (DI->getNumArgs() < 2)
+      PrintFatalError(TheDef->getLoc(),
+                      "Expecting at least 2 arguments for `operand`");
+    HasDynamicSegment = true;
+    const Init *OperandName = DI->getArg(0), *NumBits = DI->getArg(1);
+    if (!isa<StringInit>(OperandName) || !isa<IntInit>(NumBits))
+      PrintFatalError(TheDef->getLoc(), "Invalid argument types for `operand`");
+
+    auto NumBitsVal = cast<IntInit>(NumBits)->getValue();
+    if (NumBitsVal <= 0)
+      PrintFatalError(TheDef->getLoc(), "Invalid number of bits for `operand`");
+
+    StringRef CustomEncoder;
+    if (DI->getNumArgs() >= 3)
+      CustomEncoder = getCustomEncoderName(DI->getArg(2));
+    Segments.push_back(
+        {static_cast<unsigned>(NumBitsVal), OperandName, CustomEncoder});
+  } else if (Op == "slice") {
+    // (slice <operand name>, <high bit>, <low bit>,
+    //        [(encoder <custom encoder>)])
+    if (DI->getNumArgs() < 3)
+      PrintFatalError(TheDef->getLoc(),
+                      "Expecting at least 3 arguments for `slice`");
+    HasDynamicSegment = true;
+    Init *OperandName = DI->getArg(0), *HiBit = DI->getArg(1),
+         *LoBit = DI->getArg(2);
+    if (!isa<StringInit>(OperandName) || !isa<IntInit>(HiBit) ||
+        !isa<IntInit>(LoBit))
+      PrintFatalError(TheDef->getLoc(), "Invalid argument types for `slice`");
+
+    auto HiBitVal = cast<IntInit>(HiBit)->getValue(),
+         LoBitVal = cast<IntInit>(LoBit)->getValue();
+    if (HiBitVal < 0 || LoBitVal < 0)
+      PrintFatalError(TheDef->getLoc(), "Invalid bit range for `slice`");
`slice`"); + bool NeedSwap = false; + unsigned NumBits = 0U; + if (HiBitVal < LoBitVal) { + NeedSwap = true; + NumBits = static_cast(LoBitVal - HiBitVal + 1); + } else { + NumBits = static_cast(HiBitVal - LoBitVal + 1); + } + + StringRef CustomEncoder; + if (DI->getNumArgs() >= 4) + CustomEncoder = getCustomEncoderName(DI->getArg(3)); + + if (NeedSwap) { + // Normalization: Hi bit should always be the second argument. + Init *const NewArgs[] = {OperandName, LoBit, HiBit}; + Segments.push_back({NumBits, + DagInit::get(DI->getOperator(), nullptr, NewArgs, {}), + CustomEncoder}); + } else { + Segments.push_back({NumBits, DI, CustomEncoder}); + } + } +} + +void VarLenCodeEmitterGen::run(raw_ostream &OS) { + CodeGenTarget Target(Records); + auto Insts = Records.getAllDerivedDefinitions("Instruction"); + + auto NumberedInstructions = Target.getInstructionsByEnumValue(); + const CodeGenHwModes &HWM = Target.getHwModes(); + + // The set of HwModes used by instruction encodings. + std::set HwModes; + for (const CodeGenInstruction *CGI : NumberedInstructions) { + Record *R = CGI->TheDef; + + // Create the corresponding VarLenInst instance. + if (R->getValueAsString("Namespace") == "TargetOpcode" || + R->getValueAsBit("isPseudo")) + continue; + + if (const RecordVal *RV = R->getValue("EncodingInfos")) { + if (auto *DI = dyn_cast_or_null(RV->getValue())) { + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + for (auto &KV : EBM) { + HwModes.insert(KV.first); + Record *EncodingDef = KV.second; + RecordVal *RV = EncodingDef->getValue("Inst"); + DagInit *DI = cast(RV->getValue()); + VarLenInsts.insert({EncodingDef, VarLenInst(DI, RV)}); + } + continue; + } + } + RecordVal *RV = R->getValue("Inst"); + DagInit *DI = cast(RV->getValue()); + VarLenInsts.insert({R, VarLenInst(DI, RV)}); + } + + // Emit function declaration + OS << "void " << Target.getName() + << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" + << " SmallVectorImpl &Fixups,\n" + << " APInt &Inst,\n" + << " APInt &Scratch,\n" + << " const MCSubtargetInfo &STI) const {\n"; + + // Emit instruction base values + if (HwModes.empty()) { + emitInstructionBaseValues(OS, NumberedInstructions, Target); + } else { + for (unsigned HwMode : HwModes) + emitInstructionBaseValues(OS, NumberedInstructions, Target, (int)HwMode); + } + + if (!HwModes.empty()) { + OS << " const unsigned **Index;\n"; + OS << " const uint64_t *InstBits;\n"; + OS << " unsigned HwMode = STI.getHwMode();\n"; + OS << " switch (HwMode) {\n"; + OS << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n"; + for (unsigned I : HwModes) { + OS << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name + << "; Index = Index_" << HWM.getMode(I).Name << "; break;\n"; + } + OS << " };\n"; + } + + // Emit helper function to retrieve base values. + OS << " auto getInstBits = [&](unsigned Opcode) -> APInt {\n" + << " unsigned NumBits = Index[Opcode][0];\n" + << " if (!NumBits)\n" + << " return APInt::getZeroWidth();\n" + << " unsigned Idx = Index[Opcode][1];\n" + << " ArrayRef Data(&InstBits[Idx], " + << "APInt::getNumWords(NumBits));\n" + << " return APInt(NumBits, Data);\n" + << " };\n"; + + // Map to accumulate all the cases. 
+
+  // Map to accumulate all the cases.
+  std::map<std::string, std::vector<std::string>> CaseMap;
+
+  // Construct the case statement for each opcode.
+  for (Record *R : Insts) {
+    if (R->getValueAsString("Namespace") == "TargetOpcode" ||
+        R->getValueAsBit("isPseudo"))
+      continue;
+    std::string InstName =
+        (R->getValueAsString("Namespace") + "::" + R->getName()).str();
+    std::string Case = getInstructionCase(R, Target);
+
+    CaseMap[Case].push_back(std::move(InstName));
+  }
+
+  // Emit the initial function code.
+  OS << "  const unsigned opcode = MI.getOpcode();\n"
+     << "  switch (opcode) {\n";
+
+  // Emit each case statement.
+  for (const auto &C : CaseMap) {
+    const std::string &Case = C.first;
+    const auto &InstList = C.second;
+
+    ListSeparator LS("\n");
+    for (const auto &InstName : InstList)
+      OS << LS << "    case " << InstName << ":";
+
+    OS << " {\n";
+    OS << Case;
+    OS << "      break;\n"
+       << "    }\n";
+  }
+  // Default case: unhandled opcode.
+  OS << "    default:\n"
+     << "      std::string msg;\n"
+     << "      raw_string_ostream Msg(msg);\n"
+     << "      Msg << \"Not supported instr: \" << MI;\n"
+     << "      report_fatal_error(Msg.str().c_str());\n"
+     << "  }\n";
+  OS << "}\n\n";
+}
+
+static void emitInstBits(raw_ostream &IS, raw_ostream &SS, const APInt &Bits,
+                         unsigned &Index) {
+  if (!Bits.getNumWords()) {
+    IS.indent(4) << "{/*NumBits*/0, /*Index*/0},";
+    return;
+  }
+
+  IS.indent(4) << "{/*NumBits*/" << Bits.getBitWidth() << ", "
+               << "/*Index*/" << Index << "},";
+
+  SS.indent(4);
+  for (unsigned I = 0; I < Bits.getNumWords(); ++I, ++Index)
+    SS << "UINT64_C(" << utostr(Bits.getRawData()[I]) << "),";
+}
+
+void VarLenCodeEmitterGen::emitInstructionBaseValues(
+    raw_ostream &OS, ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+    CodeGenTarget &Target, int HwMode) {
+  std::string IndexArray, StorageArray;
+  raw_string_ostream IS(IndexArray), SS(StorageArray);
+
+  const CodeGenHwModes &HWM = Target.getHwModes();
+  if (HwMode == -1) {
+    IS << "  static const unsigned Index[][2] = {\n";
+    SS << "  static const uint64_t InstBits[] = {\n";
+  } else {
+    StringRef Name = HWM.getMode(HwMode).Name;
+    IS << "  static const unsigned Index_" << Name << "[][2] = {\n";
+    SS << "  static const uint64_t InstBits_" << Name << "[] = {\n";
+  }
+
+  unsigned NumFixedValueWords = 0U;
+  for (const CodeGenInstruction *CGI : NumberedInstructions) {
+    Record *R = CGI->TheDef;
+
+    if (R->getValueAsString("Namespace") == "TargetOpcode" ||
+        R->getValueAsBit("isPseudo")) {
+      IS.indent(4) << "{/*NumBits*/0, /*Index*/0},\n";
+      continue;
+    }
+
+    Record *EncodingDef = R;
+    if (const RecordVal *RV = R->getValue("EncodingInfos")) {
+      if (auto *DI = dyn_cast_or_null<DagInit>(RV->getValue())) {
+        EncodingInfoByHwMode EBM(DI->getDef(), HWM);
+        if (EBM.hasMode(HwMode))
+          EncodingDef = EBM.get(HwMode);
+      }
+    }
+
+    auto It = VarLenInsts.find(EncodingDef);
+    if (It == VarLenInsts.end())
+      PrintFatalError(EncodingDef, "VarLenInst not found for this record");
+    const VarLenInst &VLI = It->second;
+
+    unsigned i = 0U, BitWidth = VLI.size();
+
+    // Start by filling in fixed values.
+    APInt Value(BitWidth, 0);
+    auto SI = VLI.begin(), SE = VLI.end();
+    // Scan through all the segments that have fixed-bit values.
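Backing up to the `CaseMap` grouping above: keying the map on the generated case body means opcodes with byte-identical encoding logic share one case-label list. A self-contained distillation of that trick, with invented instruction names:

```
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> CaseMap;
  // Two opcodes whose generated encoding code is byte-identical end up
  // under the same key and therefore share one case-label list.
  std::string Body = "      Inst = getInstBits(opcode);\n";
  CaseMap[Body].push_back("Foo::ADD_rr");
  CaseMap[Body].push_back("Foo::ADD_ri");
  for (const auto &[Case, Insts] : CaseMap) {
    for (const auto &Name : Insts)
      std::cout << "    case " << Name << ":";
    std::cout << " {\n" << Case << "      break;\n    }\n";
  }
}
```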
+    while (i < BitWidth && SI != SE) {
+      unsigned SegmentNumBits = SI->BitWidth;
+      if (const auto *BI = dyn_cast<BitsInit>(SI->Value)) {
+        for (unsigned Idx = 0U; Idx != SegmentNumBits; ++Idx) {
+          auto *B = cast<BitInit>(BI->getBit(Idx));
+          Value.setBitVal(i + Idx, B->getValue());
+        }
+      }
+      if (const auto *BI = dyn_cast<BitInit>(SI->Value))
+        Value.setBitVal(i, BI->getValue());
+
+      i += SegmentNumBits;
+      ++SI;
+    }
+
+    emitInstBits(IS, SS, Value, NumFixedValueWords);
+    IS << '\t' << "// " << R->getName() << "\n";
+    if (Value.getNumWords())
+      SS << '\t' << "// " << R->getName() << "\n";
+  }
+  IS.indent(4) << "{/*NumBits*/0, /*Index*/0}\n  };\n";
+  SS.indent(4) << "UINT64_C(0)\n  };\n";
+
+  OS << IS.str() << SS.str();
+}
+
+std::string VarLenCodeEmitterGen::getInstructionCase(Record *R,
+                                                     CodeGenTarget &Target) {
+  std::string Case;
+  if (const RecordVal *RV = R->getValue("EncodingInfos")) {
+    if (auto *DI = dyn_cast_or_null<DagInit>(RV->getValue())) {
+      const CodeGenHwModes &HWM = Target.getHwModes();
+      EncodingInfoByHwMode EBM(DI->getDef(), HWM);
+      Case += "      switch (HwMode) {\n";
+      Case += "      default: llvm_unreachable(\"Unhandled HwMode\");\n";
+      for (auto &KV : EBM) {
+        Case += "      case " + itostr(KV.first) + ": {\n";
+        Case += getInstructionCaseForEncoding(R, KV.second, Target);
+        Case += "      break;\n";
+        Case += "      }\n";
+      }
+      Case += "      }\n";
+      return Case;
+    }
+  }
+  return getInstructionCaseForEncoding(R, R, Target);
+}
+
+std::string VarLenCodeEmitterGen::getInstructionCaseForEncoding(
+    Record *R, Record *EncodingDef, CodeGenTarget &Target) {
+  auto It = VarLenInsts.find(EncodingDef);
+  if (It == VarLenInsts.end())
+    PrintFatalError(EncodingDef, "Parsed encoding record not found");
+  const VarLenInst &VLI = It->second;
+  size_t BitWidth = VLI.size();
+
+  CodeGenInstruction &CGI = Target.getInstruction(R);
+
+  std::string Case;
+  raw_string_ostream SS(Case);
+  // Resize the scratch buffer.
+  if (BitWidth && !VLI.isFixedValueOnly())
+    SS.indent(6) << "Scratch = Scratch.zext(" << BitWidth << ");\n";
+  // Populate the base value.
+  SS.indent(6) << "Inst = getInstBits(opcode);\n";
+
+  // Process each segment in VLI.
+  size_t Offset = 0U;
+  for (const auto &ES : VLI) {
+    unsigned NumBits = ES.BitWidth;
+    const Init *Val = ES.Value;
+    // If it's a StringInit or DagInit, it's a reference to an operand
+    // or part of an operand.
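Before the operand branch below, a hedged distillation of what each emitted segment does at MC-encode time: the operand value is materialized in `Scratch`, a bit range is cut out, and spliced into `Inst` at the segment's offset. Widths and positions here are invented:

```
#include "llvm/ADT/APInt.h"
using namespace llvm;

void spliceOperandSegment(APInt &Inst, uint64_t OpValue) {
  APInt Scratch(32, 0); // stands in for the reused scratch buffer
  Scratch.clearAllBits();
  Scratch |= OpValue;   // stand-in for getMachineOpValue(...)
  // Take 4 bits starting at bit 0 of Scratch and place them at bit 8 of
  // Inst, mirroring the generated extractBits/insertBits pair.
  Inst.insertBits(Scratch.extractBits(/*numBits=*/4, /*bitPosition=*/0), 8);
}
```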
+    if (isa<StringInit>(Val) || isa<DagInit>(Val)) {
+      StringRef OperandName;
+      unsigned LoBit = 0U;
+      if (const auto *SV = dyn_cast<StringInit>(Val)) {
+        OperandName = SV->getValue();
+      } else {
+        // Normalized: (slice <operand name>, <high bit>, <low bit>)
+        const auto *DV = cast<DagInit>(Val);
+        OperandName = cast<StringInit>(DV->getArg(0))->getValue();
+        LoBit = static_cast<unsigned>(cast<IntInit>(DV->getArg(2))->getValue());
+      }
+
+      auto OpIdx = CGI.Operands.ParseOperandName(OperandName);
+      unsigned FlatOpIdx = CGI.Operands.getFlattenedOperandNumber(OpIdx);
+      StringRef CustomEncoder = CGI.Operands[OpIdx.first].EncoderMethodName;
+      if (ES.CustomEncoder.size())
+        CustomEncoder = ES.CustomEncoder;
+
+      SS.indent(6) << "Scratch.clearAllBits();\n";
+      SS.indent(6) << "// op: " << OperandName.drop_front(1) << "\n";
+      if (CustomEncoder.empty())
+        SS.indent(6) << "getMachineOpValue(MI, MI.getOperand("
+                     << utostr(FlatOpIdx) << ")";
+      else
+        SS.indent(6) << CustomEncoder << "(MI, /*OpIdx=*/" << utostr(FlatOpIdx);
+
+      SS << ", /*Pos=*/" << utostr(Offset) << ", Scratch, Fixups, STI);\n";
+
+      SS.indent(6) << "Inst.insertBits("
+                   << "Scratch.extractBits(" << utostr(NumBits) << ", "
+                   << utostr(LoBit) << ")"
+                   << ", " << Offset << ");\n";
+    }
+    Offset += NumBits;
+  }
+
+  StringRef PostEmitter = R->getValueAsString("PostEncoderMethod");
+  if (!PostEmitter.empty())
+    SS.indent(6) << "Inst = " << PostEmitter << "(MI, Inst, STI);\n";
+
+  return Case;
+}
+
+namespace llvm {
+
+void emitVarLenCodeEmitter(RecordKeeper &R, raw_ostream &OS) {
+  VarLenCodeEmitterGen(R).run(OS);
+}
+
+} // end namespace llvm
diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.h b/llvm/utils/TableGen/VarLenCodeEmitterGen.h
new file mode 100644
index 000000000000..5bdedee1dd51
--- /dev/null
+++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.h
@@ -0,0 +1,66 @@
+//===- VarLenCodeEmitterGen.h - CEG for variable-length insts ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CodeEmitterGen component for variable-length
+// instructions. See the .cpp file for more details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_VARLENCODEEMITTERGEN_H
+#define LLVM_UTILS_TABLEGEN_VARLENCODEEMITTERGEN_H
+
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+
+struct EncodingSegment {
+  unsigned BitWidth;
+  const Init *Value;
+  StringRef CustomEncoder = "";
+};
+
+class VarLenInst {
+  const RecordVal *TheDef;
+  size_t NumBits;
+
+  // Set if any of the segments is not a fixed value.
+  bool HasDynamicSegment;
+
+  SmallVector<EncodingSegment> Segments;
+
+  void buildRec(const DagInit *DI);
+
+  StringRef getCustomEncoderName(const Init *EI) const {
+    if (const auto *DI = dyn_cast<DagInit>(EI)) {
+      if (DI->getNumArgs() && isa<StringInit>(DI->getArg(0)))
+        return cast<StringInit>(DI->getArg(0))->getValue();
+    }
+    return "";
+  }
+
+public:
+  VarLenInst() : TheDef(nullptr), NumBits(0U), HasDynamicSegment(false) {}
+
+  explicit VarLenInst(const DagInit *DI, const RecordVal *TheDef);
+
+  /// Number of bits
+  size_t size() const { return NumBits; }
+
+  using const_iterator = decltype(Segments)::const_iterator;
+
+  const_iterator begin() const { return Segments.begin(); }
+  const_iterator end() const { return Segments.end(); }
+  size_t getNumSegments() const { return Segments.size(); }
+
+  bool isFixedValueOnly() const { return !HasDynamicSegment; }
+};
+
+void emitVarLenCodeEmitter(RecordKeeper &R, raw_ostream &OS);
+
+} // end namespace llvm
+#endif
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index 74969053f095..dc037e4409ab 100644
--- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -37,8 +37,9 @@ void emitWebAssemblyDisassemblerTables(
     if (!Def.getValue("Inst"))
       continue;
     auto &Inst = *Def.getValueAsBitsInit("Inst");
-    auto Opc = static_cast<unsigned>(
-        reinterpret_cast<IntInit *>(Inst.convertInitializerTo(IntRecTy::get()))
+    RecordKeeper &RK = Inst.getRecordKeeper();
+    unsigned Opc = static_cast<unsigned>(
+        cast<IntInit>(Inst.convertInitializerTo(IntRecTy::get(RK)))
             ->getValue());
     if (Opc == 0xFFFFFFFF)
       continue; // No opcode defined.
@@ -54,11 +55,7 @@ void emitWebAssemblyDisassemblerTables(
     auto &CGIP = OpcodeTable[Prefix][Opc];
     // All wasm instructions have a StackBased field of type string; we only
     // want the instructions for which this is "true".
-    auto StackString =
-        Def.getValue("StackBased")->getValue()->getCastTo(StringRecTy::get());
-    auto IsStackBased =
-        StackString &&
-        reinterpret_cast<const StringInit *>(StackString)->getValue() == "true";
+    bool IsStackBased = Def.getValueAsBit("StackBased");
     if (!IsStackBased)
       continue;
     if (CGIP.second) {
@@ -66,14 +63,11 @@ void emitWebAssemblyDisassemblerTables(
       // should be the canonical one. This determines which variant gets
      // printed in a disassembly. We want e.g. "call" not "i32.call", and
      // "end" when we don't know if it's "end_loop" or "end_block" etc.
-      auto IsCanonicalExisting = CGIP.second->TheDef->getValue("IsCanonical")
-                                     ->getValue()
-                                     ->getAsString() == "1";
+      bool IsCanonicalExisting = CGIP.second->TheDef->getValueAsBit("IsCanonical");
      // We already have one marked explicitly as canonical, so keep it.
      if (IsCanonicalExisting)
        continue;
-      auto IsCanonicalNew =
-          Def.getValue("IsCanonical")->getValue()->getAsString() == "1";
+      bool IsCanonicalNew = Def.getValueAsBit("IsCanonical");
      // If the new one is explicitly marked as canonical, take it.
      if (!IsCanonicalNew) {
        // Neither the existing nor the new instruction is canonical.
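The WebAssembly change above is a good illustration of the typed Record accessors: the string round-trip through getCastTo and reinterpret_cast collapses into one call. A minimal sketch, with the field name taken from the diff and the surrounding setup assumed:

```
#include "llvm/TableGen/Record.h"

// Reads the `bit` field directly; a fatal error fires if the field is
// missing or uninitialized, which is the behavior the emitter wants here.
bool isStackBased(const llvm::Record &Def) {
  return Def.getValueAsBit("StackBased");
}
```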
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index 81ddea99740d..2fa8fce81422 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -105,8 +105,7 @@ static inline bool inheritsFrom(InstructionContext child, case IC_64BIT_ADSIZE: return (noPrefix && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE, noPrefix)); case IC_64BIT_OPSIZE_ADSIZE: - return (noPrefix && - inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE, noPrefix)); + return false; case IC_XD: return inheritsFrom(child, IC_64BIT_XD); case IC_XS: @@ -127,11 +126,10 @@ static inline bool inheritsFrom(InstructionContext child, case IC_64BIT_OPSIZE: return inheritsFrom(child, IC_64BIT_REXW_OPSIZE) || (!AdSize64 && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE)) || - (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE)) || - (!AdSize64 && inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE)); + (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE)); case IC_64BIT_XD: - return (inheritsFrom(child, IC_64BIT_REXW_XD) || - (!AdSize64 && inheritsFrom(child, IC_64BIT_XD_ADSIZE))); + return(inheritsFrom(child, IC_64BIT_REXW_XD) || + (!AdSize64 && inheritsFrom(child, IC_64BIT_XD_ADSIZE))); case IC_64BIT_XS: return(inheritsFrom(child, IC_64BIT_REXW_XS) || (!AdSize64 && inheritsFrom(child, IC_64BIT_XS_ADSIZE))); @@ -161,12 +159,7 @@ static inline bool inheritsFrom(InstructionContext child, case IC_VEX_OPSIZE: return (VEX_LIG && VEX_WIG && inheritsFrom(child, IC_VEX_L_W_OPSIZE)) || (VEX_WIG && inheritsFrom(child, IC_VEX_W_OPSIZE)) || - (VEX_LIG && inheritsFrom(child, IC_VEX_L_OPSIZE)) || - inheritsFrom(child, IC_64BIT_VEX_OPSIZE); - case IC_64BIT_VEX_OPSIZE: - return inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE); - case IC_64BIT_VEX_OPSIZE_ADSIZE: - return false; + (VEX_LIG && inheritsFrom(child, IC_VEX_L_OPSIZE)); case IC_VEX_W: return VEX_LIG && inheritsFrom(child, IC_VEX_L_W); case IC_VEX_W_XS: @@ -673,7 +666,6 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, ModRMDecision &decision) const { - static uint32_t sTableNumber = 0; static uint32_t sEntryNumber = 1; ModRMDecisionType dt = getDecisionType(decision); @@ -753,8 +745,6 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2, assert(sEntryNumber < 65536U && "Index into ModRMDecision is too large for uint16_t!"); (void)sEntryNumber; - - ++sTableNumber; } void DisassemblerTables::emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, @@ -891,9 +881,6 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { if ((index & ATTR_EVEX) || (index & ATTR_VEX) || (index & ATTR_VEXL)) { if (index & ATTR_EVEX) o << "IC_EVEX"; - else if ((index & (ATTR_64BIT | ATTR_VEXL | ATTR_REXW | ATTR_OPSIZE)) == - (ATTR_64BIT | ATTR_OPSIZE)) - o << "IC_64BIT_VEX"; else o << "IC_VEX"; @@ -905,13 +892,9 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { if (index & ATTR_REXW) o << "_W"; - if (index & ATTR_OPSIZE) { + if (index & ATTR_OPSIZE) o << "_OPSIZE"; - if ((index & (ATTR_64BIT | ATTR_EVEX | ATTR_VEX | ATTR_VEXL | - ATTR_REXW | ATTR_ADSIZE)) == - (ATTR_64BIT | ATTR_VEX | ATTR_ADSIZE)) - o << "_ADSIZE"; - } else if (index & ATTR_XD) + else if (index & ATTR_XD) o << "_XD"; else if (index & ATTR_XS) o << "_XS"; @@ -925,7 +908,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { if (index & ATTR_EVEXB) o << "_B"; } - 
} else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS)) + } + else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS)) o << "IC_64BIT_REXW_XS"; else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD)) o << "IC_64BIT_REXW_XD"; diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp index 36c71843d70e..1384330ee8a1 100644 --- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp @@ -11,11 +11,14 @@ /// //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" +#include "X86RecognizableInstr.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; +using namespace X86Disassembler; namespace { @@ -108,28 +111,25 @@ public: IsMatch(const CodeGenInstruction *EVEXInst) : EVEXInst(EVEXInst) {} bool operator()(const CodeGenInstruction *VEXInst) { - Record *RecE = EVEXInst->TheDef; - Record *RecV = VEXInst->TheDef; - bool EVEX_W = RecE->getValueAsBit("HasVEX_W"); - bool VEX_W = RecV->getValueAsBit("HasVEX_W"); - bool VEX_WIG = RecV->getValueAsBit("IgnoresVEX_W"); - bool EVEX_WIG = RecE->getValueAsBit("IgnoresVEX_W"); - bool EVEX_W1_VEX_W0 = RecE->getValueAsBit("EVEX_W1_VEX_W0"); - - if (RecV->getValueAsDef("OpEnc")->getName().str() != "EncVEX" || - RecV->getValueAsBit("isCodeGenOnly") != RecE->getValueAsBit("isCodeGenOnly") || + RecognizableInstrBase VEXRI(*VEXInst); + RecognizableInstrBase EVEXRI(*EVEXInst); + bool VEX_W = VEXRI.HasVEX_W; + bool EVEX_W = EVEXRI.HasVEX_W; + bool VEX_WIG = VEXRI.IgnoresVEX_W; + bool EVEX_WIG = EVEXRI.IgnoresVEX_W; + bool EVEX_W1_VEX_W0 = EVEXInst->TheDef->getValueAsBit("EVEX_W1_VEX_W0"); + + if (VEXRI.IsCodeGenOnly != EVEXRI.IsCodeGenOnly || // VEX/EVEX fields - RecV->getValueAsDef("OpPrefix") != RecE->getValueAsDef("OpPrefix") || - RecV->getValueAsDef("OpMap") != RecE->getValueAsDef("OpMap") || - RecV->getValueAsBit("hasVEX_4V") != RecE->getValueAsBit("hasVEX_4V") || - RecV->getValueAsBit("hasEVEX_L2") != RecE->getValueAsBit("hasEVEX_L2") || - RecV->getValueAsBit("hasVEX_L") != RecE->getValueAsBit("hasVEX_L") || + VEXRI.OpPrefix != EVEXRI.OpPrefix || VEXRI.OpMap != EVEXRI.OpMap || + VEXRI.HasVEX_4V != EVEXRI.HasVEX_4V || + VEXRI.HasVEX_L != EVEXRI.HasVEX_L || // Match is allowed if either is VEX_WIG, or they match, or EVEX // is VEX_W1X and VEX is VEX_W0. (!(VEX_WIG || (!EVEX_WIG && EVEX_W == VEX_W) || (EVEX_W1_VEX_W0 && EVEX_W && !VEX_W))) || // Instruction's format - RecV->getValueAsDef("Form") != RecE->getValueAsDef("Form")) + VEXRI.Form != EVEXRI.Form) return false; // This is needed for instructions with intrinsic version (_Int). 
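The W-bit part of the match condition above is easy to misread; restated as a standalone predicate whose names mirror the locals in IsMatch:

```
// A match is allowed if VEX ignores W, or both sides agree on W (and EVEX
// does not ignore it), or the EVEX form is tagged EVEX_W1_VEX_W0, i.e. a
// W=1 EVEX instruction that pairs with a W=0 VEX instruction.
bool wBitsCompatible(bool VEX_W, bool VEX_WIG, bool EVEX_W, bool EVEX_WIG,
                     bool EVEX_W1_VEX_W0) {
  return VEX_WIG || (!EVEX_WIG && EVEX_W == VEX_W) ||
         (EVEX_W1_VEX_W0 && EVEX_W && !VEX_W);
}
```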
@@ -160,31 +160,6 @@ public: return true; } - -private: - static inline bool isRegisterOperand(const Record *Rec) { - return Rec->isSubClassOf("RegisterClass") || - Rec->isSubClassOf("RegisterOperand"); - } - - static inline bool isMemoryOperand(const Record *Rec) { - return Rec->isSubClassOf("Operand") && - Rec->getValueAsString("OperandType") == "OPERAND_MEMORY"; - } - - static inline bool isImmediateOperand(const Record *Rec) { - return Rec->isSubClassOf("Operand") && - Rec->getValueAsString("OperandType") == "OPERAND_IMMEDIATE"; - } - - static inline unsigned int getRegOperandSize(const Record *RegRec) { - if (RegRec->isSubClassOf("RegisterClass")) - return RegRec->getValueAsInt("Alignment"); - if (RegRec->isSubClassOf("RegisterOperand")) - return RegRec->getValueAsDef("RegClass")->getValueAsInt("Alignment"); - - llvm_unreachable("Register operand's size not known!"); - } }; void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) { @@ -206,23 +181,19 @@ void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) { Target.getInstructionsByEnumValue(); for (const CodeGenInstruction *Inst : NumberedInstructions) { + const Record *Def = Inst->TheDef; // Filter non-X86 instructions. - if (!Inst->TheDef->isSubClassOf("X86Inst")) + if (!Def->isSubClassOf("X86Inst")) continue; + RecognizableInstrBase RI(*Inst); // Add VEX encoded instructions to one of VEXInsts vectors according to // it's opcode. - if (Inst->TheDef->getValueAsDef("OpEnc")->getName() == "EncVEX") { - uint64_t Opcode = getValueFromBitsInit(Inst->TheDef-> - getValueAsBitsInit("Opcode")); - VEXInsts[Opcode].push_back(Inst); - } + if (RI.Encoding == X86Local::VEX) + VEXInsts[RI.Opcode].push_back(Inst); // Add relevant EVEX encoded instructions to EVEXInsts - else if (Inst->TheDef->getValueAsDef("OpEnc")->getName() == "EncEVEX" && - !Inst->TheDef->getValueAsBit("hasEVEX_K") && - !Inst->TheDef->getValueAsBit("hasEVEX_B") && - !Inst->TheDef->getValueAsBit("hasEVEX_L2") && - !Inst->TheDef->getValueAsBit("notEVEX2VEXConvertible")) + else if (RI.Encoding == X86Local::EVEX && !RI.HasEVEX_K && !RI.HasEVEX_B && + !RI.HasEVEX_L2 && !Def->getValueAsBit("notEVEX2VEXConvertible")) EVEXInsts.push_back(Inst); } diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 2a29331eb7e8..5b3f11848de6 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -18,6 +18,7 @@ #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; +using namespace X86Disassembler; namespace { @@ -51,27 +52,32 @@ const char *ExplicitUnalign[] = {"MOVDQU", "MOVUPS", "MOVUPD", // For manually mapping instructions that do not match by their encoding. 
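The ManualMapSet initializer that follows pairs register-form and memory-form instruction names by hand. The entry type itself is defined earlier in this file and is not part of the hunk; a plausible shape, inferred from usage, with all names here being assumptions:

```
enum UnfoldStrategy {
  NO_UNFOLD, // fold only
  UNFOLD,    // the mapping may also be reversed
};

struct ManualMapEntry {
  const char *RegInstStr; // register-form instruction name
  const char *MemInstStr; // memory-form instruction name
  UnfoldStrategy Strategy;
};
```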
const ManualMapEntry ManualMapSet[] = { - { "ADD16ri_DB", "ADD16mi", NO_UNFOLD }, - { "ADD16ri8_DB", "ADD16mi8", NO_UNFOLD }, - { "ADD16rr_DB", "ADD16mr", NO_UNFOLD }, - { "ADD32ri_DB", "ADD32mi", NO_UNFOLD }, - { "ADD32ri8_DB", "ADD32mi8", NO_UNFOLD }, - { "ADD32rr_DB", "ADD32mr", NO_UNFOLD }, - { "ADD64ri32_DB", "ADD64mi32", NO_UNFOLD }, - { "ADD64ri8_DB", "ADD64mi8", NO_UNFOLD }, - { "ADD64rr_DB", "ADD64mr", NO_UNFOLD }, - { "ADD8ri_DB", "ADD8mi", NO_UNFOLD }, - { "ADD8rr_DB", "ADD8mr", NO_UNFOLD }, - { "ADD16rr_DB", "ADD16rm", NO_UNFOLD }, - { "ADD32rr_DB", "ADD32rm", NO_UNFOLD }, - { "ADD64rr_DB", "ADD64rm", NO_UNFOLD }, - { "ADD8rr_DB", "ADD8rm", NO_UNFOLD }, - { "PUSH16r", "PUSH16rmm", UNFOLD }, - { "PUSH32r", "PUSH32rmm", UNFOLD }, - { "PUSH64r", "PUSH64rmm", UNFOLD }, - { "TAILJMPr", "TAILJMPm", UNFOLD }, - { "TAILJMPr64", "TAILJMPm64", UNFOLD }, - { "TAILJMPr64_REX", "TAILJMPm64_REX", UNFOLD }, + { "ADD16ri_DB", "ADD16mi", NO_UNFOLD }, + { "ADD16ri8_DB", "ADD16mi8", NO_UNFOLD }, + { "ADD16rr_DB", "ADD16mr", NO_UNFOLD }, + { "ADD32ri_DB", "ADD32mi", NO_UNFOLD }, + { "ADD32ri8_DB", "ADD32mi8", NO_UNFOLD }, + { "ADD32rr_DB", "ADD32mr", NO_UNFOLD }, + { "ADD64ri32_DB", "ADD64mi32", NO_UNFOLD }, + { "ADD64ri8_DB", "ADD64mi8", NO_UNFOLD }, + { "ADD64rr_DB", "ADD64mr", NO_UNFOLD }, + { "ADD8ri_DB", "ADD8mi", NO_UNFOLD }, + { "ADD8rr_DB", "ADD8mr", NO_UNFOLD }, + { "ADD16rr_DB", "ADD16rm", NO_UNFOLD }, + { "ADD32rr_DB", "ADD32rm", NO_UNFOLD }, + { "ADD64rr_DB", "ADD64rm", NO_UNFOLD }, + { "ADD8rr_DB", "ADD8rm", NO_UNFOLD }, + { "MMX_MOVD64from64rr", "MMX_MOVQ64mr", UNFOLD }, + { "MMX_MOVD64grr", "MMX_MOVD64mr", UNFOLD }, + { "MOVLHPSrr", "MOVHPSrm", NO_UNFOLD }, + { "PUSH16r", "PUSH16rmm", UNFOLD }, + { "PUSH32r", "PUSH32rmm", UNFOLD }, + { "PUSH64r", "PUSH64rmm", UNFOLD }, + { "TAILJMPr", "TAILJMPm", UNFOLD }, + { "TAILJMPr64", "TAILJMPm64", UNFOLD }, + { "TAILJMPr64_REX", "TAILJMPm64_REX", UNFOLD }, + { "VMOVLHPSZrr", "VMOVHPSZ128rm", NO_UNFOLD }, + { "VMOVLHPSrr", "VMOVHPSrm", NO_UNFOLD }, }; @@ -114,16 +120,21 @@ class X86FoldTablesEmitter { OS << "X86::" << MemInst->TheDef->getName() << ","; OS.PadToColumn(75); + std::string Attrs; if (IsLoad) - OS << "TB_FOLDED_LOAD | "; + Attrs += "TB_FOLDED_LOAD | "; if (IsStore) - OS << "TB_FOLDED_STORE | "; + Attrs += "TB_FOLDED_STORE | "; if (CannotUnfold) - OS << "TB_NO_REVERSE | "; + Attrs += "TB_NO_REVERSE | "; if (IsAligned) - OS << "TB_ALIGN_" << Alignment << " | "; + Attrs += "TB_ALIGN_" + std::to_string(Alignment) + " | "; - OS << "0 },\n"; + StringRef SimplifiedAttrs = StringRef(Attrs).rtrim("| "); + if (SimplifiedAttrs.empty()) + SimplifiedAttrs = "0"; + + OS << SimplifiedAttrs << " },\n"; } bool operator<(const X86FoldTableEntry &RHS) const { @@ -207,56 +218,6 @@ static inline uint64_t getValueFromBitsInit(const BitsInit *B) { return Value; } -// Returns true if the two given BitsInits represent the same integer value -static inline bool equalBitsInits(const BitsInit *B1, const BitsInit *B2) { - if (B1->getNumBits() != B2->getNumBits()) - PrintFatalError("Comparing two BitsInits with different sizes!"); - - for (unsigned i = 0, e = B1->getNumBits(); i != e; ++i) { - BitInit *Bit1 = cast(B1->getBit(i)); - BitInit *Bit2 = cast(B2->getBit(i)); - if (Bit1->getValue() != Bit2->getValue()) - return false; - } - return true; -} - -// Return the size of the register operand -static inline unsigned int getRegOperandSize(const Record *RegRec) { - if (RegRec->isSubClassOf("RegisterOperand")) - RegRec = RegRec->getValueAsDef("RegClass"); - if 
(RegRec->isSubClassOf("RegisterClass")) - return RegRec->getValueAsListOfDefs("RegTypes")[0]->getValueAsInt("Size"); - - llvm_unreachable("Register operand's size not known!"); -} - -// Return the size of the memory operand -static inline unsigned getMemOperandSize(const Record *MemRec) { - if (MemRec->isSubClassOf("Operand")) { - StringRef Name = - MemRec->getValueAsDef("ParserMatchClass")->getValueAsString("Name"); - if (Name == "Mem8") - return 8; - if (Name == "Mem16") - return 16; - if (Name == "Mem32") - return 32; - if (Name == "Mem64") - return 64; - if (Name == "Mem80") - return 80; - if (Name == "Mem128") - return 128; - if (Name == "Mem256") - return 256; - if (Name == "Mem512") - return 512; - } - - llvm_unreachable("Memory operand's size not known!"); -} - // Return true if the instruction defined as a register flavor. static inline bool hasRegisterFormat(const Record *Inst) { const BitsInit *FormBits = Inst->getValueAsBitsInit("FormBits"); @@ -279,22 +240,6 @@ static inline bool isNOREXRegClass(const Record *Op) { return Op->getName().contains("_NOREX"); } -static inline bool isRegisterOperand(const Record *Rec) { - return Rec->isSubClassOf("RegisterClass") || - Rec->isSubClassOf("RegisterOperand") || - Rec->isSubClassOf("PointerLikeRegClass"); -} - -static inline bool isMemoryOperand(const Record *Rec) { - return Rec->isSubClassOf("Operand") && - Rec->getValueAsString("OperandType") == "OPERAND_MEMORY"; -} - -static inline bool isImmediateOperand(const Record *Rec) { - return Rec->isSubClassOf("Operand") && - Rec->getValueAsString("OperandType") == "OPERAND_IMMEDIATE"; -} - // Get the alternative instruction pointed by "FoldGenRegForm" field. static inline const CodeGenInstruction * getAltRegInst(const CodeGenInstruction *I, const RecordKeeper &Records, @@ -312,61 +257,59 @@ getAltRegInst(const CodeGenInstruction *I, const RecordKeeper &Records, // matches the EVEX instruction of this object. class IsMatch { const CodeGenInstruction *MemInst; + unsigned Variant; public: - IsMatch(const CodeGenInstruction *Inst, const RecordKeeper &Records) - : MemInst(Inst) {} + IsMatch(const CodeGenInstruction *Inst, unsigned V) + : MemInst(Inst), Variant(V) {} bool operator()(const CodeGenInstruction *RegInst) { - Record *MemRec = MemInst->TheDef; - Record *RegRec = RegInst->TheDef; + X86Disassembler::RecognizableInstrBase RegRI(*RegInst); + X86Disassembler::RecognizableInstrBase MemRI(*MemInst); + const Record *RegRec = RegInst->TheDef; + const Record *MemRec = MemInst->TheDef; + + // EVEX_B means different things for memory and register forms. + if (RegRI.HasEVEX_B != 0 || MemRI.HasEVEX_B != 0) + return false; + + // Instruction's format - The register form's "Form" field should be + // the opposite of the memory form's "Form" field. + if (!areOppositeForms(RegRI.Form, MemRI.Form)) + return false; + + // X86 encoding is crazy, e.g + // + // f3 0f c7 30 vmxon (%rax) + // f3 0f c7 f0 senduipi %rax + // + // This two instruction have similiar encoding fields but are unrelated + if (X86Disassembler::getMnemonic(MemInst, Variant) != + X86Disassembler::getMnemonic(RegInst, Variant)) + return false; // Return false if one (at least) of the encoding fields of both // instructions do not match. 
- if (RegRec->getValueAsDef("OpEnc") != MemRec->getValueAsDef("OpEnc") || - !equalBitsInits(RegRec->getValueAsBitsInit("Opcode"), - MemRec->getValueAsBitsInit("Opcode")) || - // VEX/EVEX fields - RegRec->getValueAsDef("OpPrefix") != - MemRec->getValueAsDef("OpPrefix") || - RegRec->getValueAsDef("OpMap") != MemRec->getValueAsDef("OpMap") || - RegRec->getValueAsDef("OpSize") != MemRec->getValueAsDef("OpSize") || - RegRec->getValueAsDef("AdSize") != MemRec->getValueAsDef("AdSize") || - RegRec->getValueAsBit("hasVEX_4V") != - MemRec->getValueAsBit("hasVEX_4V") || - RegRec->getValueAsBit("hasEVEX_K") != - MemRec->getValueAsBit("hasEVEX_K") || - RegRec->getValueAsBit("hasEVEX_Z") != - MemRec->getValueAsBit("hasEVEX_Z") || - // EVEX_B means different things for memory and register forms. - RegRec->getValueAsBit("hasEVEX_B") != 0 || - MemRec->getValueAsBit("hasEVEX_B") != 0 || + if (RegRI.Encoding != MemRI.Encoding || RegRI.Opcode != MemRI.Opcode || + RegRI.OpPrefix != MemRI.OpPrefix || RegRI.OpMap != MemRI.OpMap || + RegRI.OpSize != MemRI.OpSize || RegRI.AdSize != MemRI.AdSize || + RegRI.HasREX_W != MemRI.HasREX_W || + RegRI.HasVEX_4V != MemRI.HasVEX_4V || + RegRI.HasVEX_L != MemRI.HasVEX_L || + RegRI.HasVEX_W != MemRI.HasVEX_W || + RegRI.IgnoresVEX_L != MemRI.IgnoresVEX_L || + RegRI.IgnoresVEX_W != MemRI.IgnoresVEX_W || + RegRI.HasEVEX_K != MemRI.HasEVEX_K || + RegRI.HasEVEX_KZ != MemRI.HasEVEX_KZ || + RegRI.HasEVEX_L2 != MemRI.HasEVEX_L2 || RegRec->getValueAsBit("hasEVEX_RC") != MemRec->getValueAsBit("hasEVEX_RC") || - RegRec->getValueAsBit("hasREX_WPrefix") != - MemRec->getValueAsBit("hasREX_WPrefix") || RegRec->getValueAsBit("hasLockPrefix") != MemRec->getValueAsBit("hasLockPrefix") || RegRec->getValueAsBit("hasNoTrackPrefix") != MemRec->getValueAsBit("hasNoTrackPrefix") || - RegRec->getValueAsBit("hasVEX_L") != - MemRec->getValueAsBit("hasVEX_L") || - RegRec->getValueAsBit("hasEVEX_L2") != - MemRec->getValueAsBit("hasEVEX_L2") || - RegRec->getValueAsBit("ignoresVEX_L") != - MemRec->getValueAsBit("ignoresVEX_L") || - RegRec->getValueAsBit("HasVEX_W") != - MemRec->getValueAsBit("HasVEX_W") || - RegRec->getValueAsBit("IgnoresVEX_W") != - MemRec->getValueAsBit("IgnoresVEX_W") || RegRec->getValueAsBit("EVEX_W1_VEX_W0") != - MemRec->getValueAsBit("EVEX_W1_VEX_W0") || - // Instruction's format - The register form's "Form" field should be - // the opposite of the memory form's "Form" field. - !areOppositeForms(RegRec->getValueAsBitsInit("FormBits"), - MemRec->getValueAsBitsInit("FormBits")) || - RegRec->getValueAsBit("isAsmParserOnly") != - MemRec->getValueAsBit("isAsmParserOnly")) + MemRec->getValueAsBit("EVEX_W1_VEX_W0")) return false; // Make sure the sizes of the operands of both instructions suit each other. @@ -419,31 +362,24 @@ public: private: // Return true of the 2 given forms are the opposite of each other. 
- bool areOppositeForms(const BitsInit *RegFormBits, - const BitsInit *MemFormBits) { - uint64_t MemFormNum = getValueFromBitsInit(MemFormBits); - uint64_t RegFormNum = getValueFromBitsInit(RegFormBits); - - if ((MemFormNum == X86Local::MRM0m && RegFormNum == X86Local::MRM0r) || - (MemFormNum == X86Local::MRM1m && RegFormNum == X86Local::MRM1r) || - (MemFormNum == X86Local::MRM2m && RegFormNum == X86Local::MRM2r) || - (MemFormNum == X86Local::MRM3m && RegFormNum == X86Local::MRM3r) || - (MemFormNum == X86Local::MRM4m && RegFormNum == X86Local::MRM4r) || - (MemFormNum == X86Local::MRM5m && RegFormNum == X86Local::MRM5r) || - (MemFormNum == X86Local::MRM6m && RegFormNum == X86Local::MRM6r) || - (MemFormNum == X86Local::MRM7m && RegFormNum == X86Local::MRM7r) || - (MemFormNum == X86Local::MRMXm && RegFormNum == X86Local::MRMXr) || - (MemFormNum == X86Local::MRMXmCC && RegFormNum == X86Local::MRMXrCC) || - (MemFormNum == X86Local::MRMDestMem && - RegFormNum == X86Local::MRMDestReg) || - (MemFormNum == X86Local::MRMSrcMem && - RegFormNum == X86Local::MRMSrcReg) || - (MemFormNum == X86Local::MRMSrcMem4VOp3 && - RegFormNum == X86Local::MRMSrcReg4VOp3) || - (MemFormNum == X86Local::MRMSrcMemOp4 && - RegFormNum == X86Local::MRMSrcRegOp4) || - (MemFormNum == X86Local::MRMSrcMemCC && - RegFormNum == X86Local::MRMSrcRegCC)) + bool areOppositeForms(unsigned RegForm, unsigned MemForm) { + if ((MemForm == X86Local::MRM0m && RegForm == X86Local::MRM0r) || + (MemForm == X86Local::MRM1m && RegForm == X86Local::MRM1r) || + (MemForm == X86Local::MRM2m && RegForm == X86Local::MRM2r) || + (MemForm == X86Local::MRM3m && RegForm == X86Local::MRM3r) || + (MemForm == X86Local::MRM4m && RegForm == X86Local::MRM4r) || + (MemForm == X86Local::MRM5m && RegForm == X86Local::MRM5r) || + (MemForm == X86Local::MRM6m && RegForm == X86Local::MRM6r) || + (MemForm == X86Local::MRM7m && RegForm == X86Local::MRM7r) || + (MemForm == X86Local::MRMXm && RegForm == X86Local::MRMXr) || + (MemForm == X86Local::MRMXmCC && RegForm == X86Local::MRMXrCC) || + (MemForm == X86Local::MRMDestMem && RegForm == X86Local::MRMDestReg) || + (MemForm == X86Local::MRMSrcMem && RegForm == X86Local::MRMSrcReg) || + (MemForm == X86Local::MRMSrcMem4VOp3 && + RegForm == X86Local::MRMSrcReg4VOp3) || + (MemForm == X86Local::MRMSrcMemOp4 && + RegForm == X86Local::MRMSrcRegOp4) || + (MemForm == X86Local::MRMSrcMemCC && RegForm == X86Local::MRMSrcRegCC)) return true; return false; @@ -535,7 +471,10 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInstr, for (unsigned i = RegOutSize, e = RegInstr->Operands.size(); i < e; i++) { Record *RegOpRec = RegInstr->Operands[i].Rec; Record *MemOpRec = MemInstr->Operands[i].Rec; - if (isRegisterOperand(RegOpRec) && isMemoryOperand(MemOpRec)) { + // PointerLikeRegClass: For instructions like TAILJMPr, TAILJMPr64, TAILJMPr64_REX + if ((isRegisterOperand(RegOpRec) || + RegOpRec->isSubClassOf("PointerLikeRegClass")) && + isMemoryOperand(MemOpRec)) { switch (i) { case 0: addEntryWithFlags(Table0, RegInstr, MemInstr, S, 0); @@ -583,10 +522,9 @@ void X86FoldTablesEmitter::run(formatted_raw_ostream &OS) { Target.getInstructionsByEnumValue(); for (const CodeGenInstruction *Inst : NumberedInstructions) { - if (!Inst->TheDef->getNameInit() || !Inst->TheDef->isSubClassOf("X86Inst")) - continue; - const Record *Rec = Inst->TheDef; + if (!Rec->isSubClassOf("X86Inst") || Rec->getValueAsBit("isAsmParserOnly")) + continue; // - Do not proceed if the instruction is marked as notMemoryFoldable. 
 // - Instructions including RST register class operands are not relevant
@@ -611,6 +549,8 @@ void X86FoldTablesEmitter::run(formatted_raw_ostream &OS) {
     }
   }
 
+  Record *AsmWriter = Target.getAsmWriter();
+  unsigned Variant = AsmWriter->getValueAsInt("Variant");
   // For each memory form instruction, try to find its register form
   // instruction.
   for (const CodeGenInstruction *MemInst : MemInsts) {
@@ -626,7 +566,7 @@ void X86FoldTablesEmitter::run(formatted_raw_ostream &OS) {
     // opcode.
     std::vector<const CodeGenInstruction *> &OpcRegInsts = RegInstsIt->second;
 
-    auto Match = find_if(OpcRegInsts, IsMatch(MemInst, Records));
+    auto Match = find_if(OpcRegInsts, IsMatch(MemInst, Variant));
     if (Match != OpcRegInsts.end()) {
       const CodeGenInstruction *RegInst = *Match;
       // If the matched instruction has its "FoldGenRegForm" set, map the
diff --git a/llvm/utils/TableGen/X86MnemonicTables.cpp b/llvm/utils/TableGen/X86MnemonicTables.cpp
new file mode 100644
index 000000000000..f405e051e355
--- /dev/null
+++ b/llvm/utils/TableGen/X86MnemonicTables.cpp
@@ -0,0 +1,94 @@
+//==- X86MnemonicTables.cpp - Generate mnemonic extraction tables. -*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend is responsible for emitting tables that group
+// instructions by their mnemonic name with respect to the AsmWriter variant
+// (e.g. isADD, etc.).
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenInstruction.h"
+#include "CodeGenTarget.h"
+#include "X86RecognizableInstr.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/TableGenBackend.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86MnemonicTablesEmitter {
+  CodeGenTarget Target;
+
+public:
+  X86MnemonicTablesEmitter(RecordKeeper &R) : Target(R) {}
+
+  // Output X86 mnemonic tables.
+  void run(raw_ostream &OS);
+};
+
+void X86MnemonicTablesEmitter::run(raw_ostream &OS) {
+  emitSourceFileHeader("X86 Mnemonic tables", OS);
+  OS << "namespace llvm {\nnamespace X86 {\n\n";
+  Record *AsmWriter = Target.getAsmWriter();
+  unsigned Variant = AsmWriter->getValueAsInt("Variant");
+
+  // Hold all instructions grouped by mnemonic
+  StringMap<SmallVector<const CodeGenInstruction *>> MnemonicToCGInstrMap;
+
+  ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+      Target.getInstructionsByEnumValue();
+  for (const CodeGenInstruction *I : NumberedInstructions) {
+    const Record *Def = I->TheDef;
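For context, the GET_X86_MNEMONIC_TABLES_CPP section generated by the loop that follows looks roughly like this for a mnemonic with several opcodes; the opcode names and values here are invented:

```
namespace X86 {
enum : unsigned { ADD32rr = 1, ADD32rm, ADD32ri };
} // namespace X86

bool isADD(unsigned Opcode) {
  switch (Opcode) {
  case X86::ADD32rr:
  case X86::ADD32rm:
  case X86::ADD32ri:
    return true;
  }
  return false;
}
```

+    // Filter non-X86 instructions.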
+ if (!Def->isSubClassOf("X86Inst")) + continue; + X86Disassembler::RecognizableInstrBase RI(*I); + if (!RI.shouldBeEmitted()) + continue; + if ( // Non-parsable instruction defs contain prefix as part of AsmString + Def->getValueAsString("AsmVariantName") == "NonParsable" || + // Skip prefix byte + RI.Form == X86Local::PrefixByte) + continue; + std::string Mnemonic = X86Disassembler::getMnemonic(I, Variant); + MnemonicToCGInstrMap[Mnemonic].push_back(I); + } + + OS << "#ifdef GET_X86_MNEMONIC_TABLES_H\n"; + OS << "#undef GET_X86_MNEMONIC_TABLES_H\n\n"; + for (StringRef Mnemonic : MnemonicToCGInstrMap.keys()) + OS << "bool is" << Mnemonic << "(unsigned Opcode);\n"; + OS << "#endif // GET_X86_MNEMONIC_TABLES_H\n\n"; + + OS << "#ifdef GET_X86_MNEMONIC_TABLES_CPP\n"; + OS << "#undef GET_X86_MNEMONIC_TABLES_CPP\n\n"; + for (StringRef Mnemonic : MnemonicToCGInstrMap.keys()) { + OS << "bool is" << Mnemonic << "(unsigned Opcode) {\n"; + auto Mnemonics = MnemonicToCGInstrMap[Mnemonic]; + if (Mnemonics.size() == 1) { + const CodeGenInstruction *CGI = *Mnemonics.begin(); + OS << "\treturn Opcode == " << CGI->TheDef->getName() << ";\n}\n\n"; + } else { + OS << "\tswitch (Opcode) {\n"; + for (const CodeGenInstruction *CGI : Mnemonics) { + OS << "\tcase " << CGI->TheDef->getName() << ":\n"; + } + OS << "\t\treturn true;\n\t}\n\treturn false;\n}\n\n"; + } + } + OS << "#endif // GET_X86_MNEMONIC_TABLES_CPP\n\n"; + OS << "} // end namespace X86\n} // end namespace llvm"; +} + +} // namespace + +namespace llvm { +void EmitX86MnemonicTables(RecordKeeper &RK, raw_ostream &OS) { + X86MnemonicTablesEmitter(RK).run(OS); +} +} // namespace llvm diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 4023d8f57318..9afde66fe6f3 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -24,6 +24,51 @@ using namespace llvm; using namespace X86Disassembler; +std::string X86Disassembler::getMnemonic(const CodeGenInstruction *I, unsigned Variant) { + std::string AsmString = I->FlattenAsmStringVariants(I->AsmString, Variant); + StringRef Mnemonic(AsmString); + // Extract a mnemonic assuming it's separated by \t + Mnemonic = Mnemonic.take_until([](char C) { return C == '\t'; }); + + // Special case: CMOVCC, JCC, SETCC have "${cond}" in mnemonic. + // Replace it with "CC" in-place. 
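A self-contained demo of this mnemonic extraction, including the ${cond} rewrite just described; the asm string is invented:

```
#include <iostream>
#include <string>
#include "llvm/ADT/StringRef.h"
using namespace llvm;

int main() {
  std::string AsmString = "cmov${cond}\t{$src2, $dst|$dst, $src2}";
  StringRef Mnemonic(AsmString);
  // Everything up to the first tab is the mnemonic.
  Mnemonic = Mnemonic.take_until([](char C) { return C == '\t'; });
  // Rewrite the trailing "${cond}" placeholder to "CC", as the code
  // below does for CMOVCC/JCC/SETCC.
  size_t CondPos = Mnemonic.find("${cond}");
  if (CondPos != StringRef::npos)
    Mnemonic = AsmString.replace(CondPos, StringRef::npos, "CC");
  std::cout << Mnemonic.upper() << "\n"; // prints CMOVCC
}
```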
+ size_t CondPos = Mnemonic.find("${cond}"); + if (CondPos != StringRef::npos) + Mnemonic = AsmString.replace(CondPos, StringRef::npos, "CC"); + return Mnemonic.upper(); +} + +bool X86Disassembler::isRegisterOperand(const Record *Rec) { + return Rec->isSubClassOf("RegisterClass") || + Rec->isSubClassOf("RegisterOperand"); +} + +bool X86Disassembler::isMemoryOperand(const Record *Rec) { + return Rec->isSubClassOf("Operand") && + Rec->getValueAsString("OperandType") == "OPERAND_MEMORY"; +} + +bool X86Disassembler::isImmediateOperand(const Record *Rec) { + return Rec->isSubClassOf("Operand") && + Rec->getValueAsString("OperandType") == "OPERAND_IMMEDIATE"; +} + +unsigned X86Disassembler::getRegOperandSize(const Record *RegRec) { + if (RegRec->isSubClassOf("RegisterClass")) + return RegRec->getValueAsInt("Alignment"); + if (RegRec->isSubClassOf("RegisterOperand")) + return RegRec->getValueAsDef("RegClass")->getValueAsInt("Alignment"); + + llvm_unreachable("Register operand's size not known!"); +} + +unsigned X86Disassembler::getMemOperandSize(const Record *MemRec) { + if (MemRec->isSubClassOf("X86MemOperand")) + return MemRec->getValueAsInt("Size"); + + llvm_unreachable("Memory operand's size not known!"); +} + /// byteFromBitsInit - Extracts a value at most 8 bits in width from a BitsInit. /// Useful for switch statements and the like. /// @@ -61,55 +106,49 @@ static uint8_t byteFromRec(const Record* rec, StringRef name) { return byteFromBitsInit(*bits); } -RecognizableInstr::RecognizableInstr(DisassemblerTables &tables, - const CodeGenInstruction &insn, - InstrUID uid) { - UID = uid; - - Rec = insn.TheDef; - Name = std::string(Rec->getName()); - Spec = &tables.specForUID(UID); - - if (!Rec->isSubClassOf("X86Inst")) { - ShouldBeEmitted = false; - return; - } - +RecognizableInstrBase::RecognizableInstrBase(const CodeGenInstruction &insn) { + const Record *Rec = insn.TheDef; + assert(Rec->isSubClassOf("X86Inst") && "Not a X86 Instruction"); OpPrefix = byteFromRec(Rec, "OpPrefixBits"); - OpMap = byteFromRec(Rec, "OpMapBits"); - Opcode = byteFromRec(Rec, "Opcode"); - Form = byteFromRec(Rec, "FormBits"); + OpMap = byteFromRec(Rec, "OpMapBits"); + Opcode = byteFromRec(Rec, "Opcode"); + Form = byteFromRec(Rec, "FormBits"); Encoding = byteFromRec(Rec, "OpEncBits"); - - OpSize = byteFromRec(Rec, "OpSizeBits"); - AdSize = byteFromRec(Rec, "AdSizeBits"); - HasREX_WPrefix = Rec->getValueAsBit("hasREX_WPrefix"); - HasVEX_4V = Rec->getValueAsBit("hasVEX_4V"); - HasVEX_W = Rec->getValueAsBit("HasVEX_W"); - IgnoresVEX_W = Rec->getValueAsBit("IgnoresVEX_W"); - IgnoresVEX_L = Rec->getValueAsBit("ignoresVEX_L"); - HasEVEX_L2Prefix = Rec->getValueAsBit("hasEVEX_L2"); - HasEVEX_K = Rec->getValueAsBit("hasEVEX_K"); - HasEVEX_KZ = Rec->getValueAsBit("hasEVEX_Z"); - HasEVEX_B = Rec->getValueAsBit("hasEVEX_B"); - IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly"); - ForceDisassemble = Rec->getValueAsBit("ForceDisassemble"); - CD8_Scale = byteFromRec(Rec, "CD8_Scale"); - - Name = std::string(Rec->getName()); - - Operands = &insn.Operands.OperandList; - - HasVEX_LPrefix = Rec->getValueAsBit("hasVEX_L"); + OpSize = byteFromRec(Rec, "OpSizeBits"); + AdSize = byteFromRec(Rec, "AdSizeBits"); + HasREX_W = Rec->getValueAsBit("hasREX_W"); + HasVEX_4V = Rec->getValueAsBit("hasVEX_4V"); + HasVEX_W = Rec->getValueAsBit("HasVEX_W"); + IgnoresVEX_W = Rec->getValueAsBit("IgnoresVEX_W"); + IgnoresVEX_L = Rec->getValueAsBit("ignoresVEX_L"); + HasEVEX_L2 = Rec->getValueAsBit("hasEVEX_L2"); + HasEVEX_K = 
Rec->getValueAsBit("hasEVEX_K"); + HasEVEX_KZ = Rec->getValueAsBit("hasEVEX_Z"); + HasEVEX_B = Rec->getValueAsBit("hasEVEX_B"); + IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly"); + IsAsmParserOnly = Rec->getValueAsBit("isAsmParserOnly"); + ForceDisassemble = Rec->getValueAsBit("ForceDisassemble"); + CD8_Scale = byteFromRec(Rec, "CD8_Scale"); + HasVEX_L = Rec->getValueAsBit("hasVEX_L"); EncodeRC = HasEVEX_B && (Form == X86Local::MRMDestReg || Form == X86Local::MRMSrcReg); +} + +bool RecognizableInstrBase::shouldBeEmitted() const { + return Form != X86Local::Pseudo && (!IsCodeGenOnly || ForceDisassemble) && + !IsAsmParserOnly; +} +RecognizableInstr::RecognizableInstr(DisassemblerTables &tables, + const CodeGenInstruction &insn, + InstrUID uid) + : RecognizableInstrBase(insn), Rec(insn.TheDef), Name(Rec->getName().str()), + Is32Bit(false), Is64Bit(false), Operands(&insn.Operands.OperandList), + UID(uid), Spec(&tables.specForUID(uid)) { // Check for 64-bit inst which does not require REX - Is32Bit = false; - Is64Bit = false; // FIXME: Is there some better way to check for In64BitMode? - std::vector Predicates = Rec->getValueAsListOfDefs("Predicates"); + std::vector Predicates = Rec->getValueAsListOfDefs("Predicates"); for (unsigned i = 0, e = Predicates.size(); i != e; ++i) { if (Predicates[i]->getName().contains("Not64Bit") || Predicates[i]->getName().contains("In32Bit")) { @@ -121,29 +160,19 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables, break; } } - - if (Form == X86Local::Pseudo || (IsCodeGenOnly && !ForceDisassemble)) { - ShouldBeEmitted = false; - return; - } - - ShouldBeEmitted = true; } void RecognizableInstr::processInstr(DisassemblerTables &tables, const CodeGenInstruction &insn, - InstrUID uid) -{ - // Ignore "asm parser only" instructions. - if (insn.TheDef->getValueAsBit("isAsmParserOnly")) + InstrUID uid) { + if (!insn.TheDef->isSubClassOf("X86Inst")) return; - RecognizableInstr recogInstr(tables, insn, uid); - if (recogInstr.shouldBeEmitted()) { - recogInstr.emitInstructionSpecifier(); - recogInstr.emitDecodePath(tables); - } + if (!recogInstr.shouldBeEmitted()) + return; + recogInstr.emitInstructionSpecifier(); + recogInstr.emitDecodePath(tables); } #define EVEX_KB(n) (HasEVEX_KZ && HasEVEX_B ? 
n##_KZ_B : \ @@ -155,12 +184,12 @@ InstructionContext RecognizableInstr::insnContext() const { InstructionContext insnContext; if (Encoding == X86Local::EVEX) { - if (HasVEX_LPrefix && HasEVEX_L2Prefix) { + if (HasVEX_L && HasEVEX_L2) { errs() << "Don't support VEX.L if EVEX_L2 is enabled: " << Name << "\n"; llvm_unreachable("Don't support VEX.L if EVEX_L2 is enabled"); } // VEX_L & VEX_W - if (!EncodeRC && HasVEX_LPrefix && HasVEX_W) { + if (!EncodeRC && HasVEX_L && HasVEX_W) { if (OpPrefix == X86Local::PD) insnContext = EVEX_KB(IC_EVEX_L_W_OPSIZE); else if (OpPrefix == X86Local::XS) @@ -173,7 +202,7 @@ InstructionContext RecognizableInstr::insnContext() const { errs() << "Instruction does not use a prefix: " << Name << "\n"; llvm_unreachable("Invalid prefix"); } - } else if (!EncodeRC && HasVEX_LPrefix) { + } else if (!EncodeRC && HasVEX_L) { // VEX_L if (OpPrefix == X86Local::PD) insnContext = EVEX_KB(IC_EVEX_L_OPSIZE); @@ -187,7 +216,7 @@ InstructionContext RecognizableInstr::insnContext() const { errs() << "Instruction does not use a prefix: " << Name << "\n"; llvm_unreachable("Invalid prefix"); } - } else if (!EncodeRC && HasEVEX_L2Prefix && HasVEX_W) { + } else if (!EncodeRC && HasEVEX_L2 && HasVEX_W) { // EVEX_L2 & VEX_W if (OpPrefix == X86Local::PD) insnContext = EVEX_KB(IC_EVEX_L2_W_OPSIZE); @@ -201,7 +230,7 @@ InstructionContext RecognizableInstr::insnContext() const { errs() << "Instruction does not use a prefix: " << Name << "\n"; llvm_unreachable("Invalid prefix"); } - } else if (!EncodeRC && HasEVEX_L2Prefix) { + } else if (!EncodeRC && HasEVEX_L2) { // EVEX_L2 if (OpPrefix == X86Local::PD) insnContext = EVEX_KB(IC_EVEX_L2_OPSIZE); @@ -246,7 +275,7 @@ InstructionContext RecognizableInstr::insnContext() const { } /// eof EVEX } else if (Encoding == X86Local::VEX || Encoding == X86Local::XOP) { - if (HasVEX_LPrefix && HasVEX_W) { + if (HasVEX_L && HasVEX_W) { if (OpPrefix == X86Local::PD) insnContext = IC_VEX_L_W_OPSIZE; else if (OpPrefix == X86Local::XS) @@ -259,20 +288,15 @@ InstructionContext RecognizableInstr::insnContext() const { errs() << "Instruction does not use a prefix: " << Name << "\n"; llvm_unreachable("Invalid prefix"); } - } else if (OpPrefix == X86Local::PD && HasVEX_LPrefix) + } else if (OpPrefix == X86Local::PD && HasVEX_L) insnContext = IC_VEX_L_OPSIZE; else if (OpPrefix == X86Local::PD && HasVEX_W) insnContext = IC_VEX_W_OPSIZE; - else if (OpPrefix == X86Local::PD && Is64Bit && - AdSize == X86Local::AdSize32) - insnContext = IC_64BIT_VEX_OPSIZE_ADSIZE; - else if (OpPrefix == X86Local::PD && Is64Bit) - insnContext = IC_64BIT_VEX_OPSIZE; else if (OpPrefix == X86Local::PD) insnContext = IC_VEX_OPSIZE; - else if (HasVEX_LPrefix && OpPrefix == X86Local::XS) + else if (HasVEX_L && OpPrefix == X86Local::XS) insnContext = IC_VEX_L_XS; - else if (HasVEX_LPrefix && OpPrefix == X86Local::XD) + else if (HasVEX_L && OpPrefix == X86Local::XD) insnContext = IC_VEX_L_XD; else if (HasVEX_W && OpPrefix == X86Local::XS) insnContext = IC_VEX_W_XS; @@ -280,7 +304,7 @@ InstructionContext RecognizableInstr::insnContext() const { insnContext = IC_VEX_W_XD; else if (HasVEX_W && OpPrefix == X86Local::PS) insnContext = IC_VEX_W; - else if (HasVEX_LPrefix && OpPrefix == X86Local::PS) + else if (HasVEX_L && OpPrefix == X86Local::PS) insnContext = IC_VEX_L; else if (OpPrefix == X86Local::XD) insnContext = IC_VEX_XD; @@ -292,10 +316,10 @@ InstructionContext RecognizableInstr::insnContext() const { errs() << "Instruction does not use a prefix: " << Name << "\n"; llvm_unreachable("Invalid 
prefix"); } - } else if (Is64Bit || HasREX_WPrefix || AdSize == X86Local::AdSize64) { - if (HasREX_WPrefix && (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)) + } else if (Is64Bit || HasREX_W || AdSize == X86Local::AdSize64) { + if (HasREX_W && (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)) insnContext = IC_64BIT_REXW_OPSIZE; - else if (HasREX_WPrefix && AdSize == X86Local::AdSize32) + else if (HasREX_W && AdSize == X86Local::AdSize32) insnContext = IC_64BIT_REXW_ADSIZE; else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XD) insnContext = IC_64BIT_XD_OPSIZE; @@ -309,15 +333,15 @@ InstructionContext RecognizableInstr::insnContext() const { insnContext = IC_64BIT_OPSIZE; else if (AdSize == X86Local::AdSize32) insnContext = IC_64BIT_ADSIZE; - else if (HasREX_WPrefix && OpPrefix == X86Local::XS) + else if (HasREX_W && OpPrefix == X86Local::XS) insnContext = IC_64BIT_REXW_XS; - else if (HasREX_WPrefix && OpPrefix == X86Local::XD) + else if (HasREX_W && OpPrefix == X86Local::XD) insnContext = IC_64BIT_REXW_XD; else if (OpPrefix == X86Local::XD) insnContext = IC_64BIT_XD; else if (OpPrefix == X86Local::XS) insnContext = IC_64BIT_XS; - else if (HasREX_WPrefix) + else if (HasREX_W) insnContext = IC_64BIT_REXW; else insnContext = IC_64BIT; @@ -392,7 +416,7 @@ void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex, adjustOperandEncoding(encoding); Spec->operands[operandIndex].encoding = encoding; Spec->operands[operandIndex].type = - typeFromString(std::string(typeName), HasREX_WPrefix, OpSize); + typeFromString(std::string(typeName), HasREX_W, OpSize); ++operandIndex; ++physicalOperandIndex; @@ -835,13 +859,13 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { if (Form == X86Local::AddRegFrm || Form == X86Local::MRMSrcRegCC || Form == X86Local::MRMSrcMemCC || Form == X86Local::MRMXrCC || Form == X86Local::MRMXmCC || Form == X86Local::AddCCFrm) { - unsigned Count = Form == X86Local::AddRegFrm ? 8 : 16; + uint8_t Count = Form == X86Local::AddRegFrm ? 8 : 16; assert(((opcodeToSet % Count) == 0) && "ADDREG_FRM opcode not aligned"); uint8_t currentOpcode; - for (currentOpcode = opcodeToSet; currentOpcode < opcodeToSet + Count; - ++currentOpcode) + for (currentOpcode = opcodeToSet; + currentOpcode < (uint8_t)(opcodeToSet + Count); ++currentOpcode) tables.setTableFields(*opcodeType, insnContext(), currentOpcode, *filter, UID, Is32Bit, OpPrefix == 0, IgnoresVEX_L || EncodeRC, @@ -857,9 +881,9 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { #define TYPE(str, type) if (s == str) return type; OperandType RecognizableInstr::typeFromString(const std::string &s, - bool hasREX_WPrefix, + bool hasREX_W, uint8_t OpSize) { - if(hasREX_WPrefix) { + if(hasREX_W) { // For instructions with a REX_W prefix, a declared 32-bit register encoding // is special. TYPE("GR32", TYPE_R32) diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index 8f557d9ee5f5..67aba26a142b 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -158,16 +158,8 @@ namespace X86Disassembler { class DisassemblerTables; -/// RecognizableInstr - Encapsulates all information required to decode a single -/// instruction, as extracted from the LLVM instruction tables. Has methods -/// to interpret the information available in the LLVM tables, and to emit the -/// instruction into DisassemblerTables. 
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index 8f557d9ee5f5..67aba26a142b 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -158,16 +158,8 @@ namespace X86Disassembler {
 
 class DisassemblerTables;
 
-/// RecognizableInstr - Encapsulates all information required to decode a single
-/// instruction, as extracted from the LLVM instruction tables.  Has methods
-/// to interpret the information available in the LLVM tables, and to emit the
-/// instruction into DisassemblerTables.
-class RecognizableInstr {
-private:
-  /// The opcode of the instruction, as used in an MCInst
-  InstrUID UID;
-  /// The record from the .td files corresponding to this instruction
-  const Record* Rec;
+/// Extract common fields of a single X86 instruction from a CodeGenInstruction
+struct RecognizableInstrBase {
   /// The OpPrefix field from the record
   uint8_t OpPrefix;
   /// The OpMap field from the record
@@ -183,20 +175,20 @@ private:
   uint8_t OpSize;
   /// The AdSize field from the record
   uint8_t AdSize;
-  /// The hasREX_WPrefix field from the record
-  bool HasREX_WPrefix;
+  /// The hasREX_W field from the record
+  bool HasREX_W;
   /// The hasVEX_4V field from the record
   bool HasVEX_4V;
   /// The HasVEX_WPrefix field from the record
   bool HasVEX_W;
   /// The IgnoresVEX_W field from the record
   bool IgnoresVEX_W;
-  /// Inferred from the operands; indicates whether the L bit in the VEX prefix is set
-  bool HasVEX_LPrefix;
+  /// The hasVEX_L field from the record
+  bool HasVEX_L;
   /// The ignoreVEX_L field from the record
   bool IgnoresVEX_L;
   /// The hasEVEX_L2Prefix field from the record
-  bool HasEVEX_L2Prefix;
+  bool HasEVEX_L2;
   /// The hasEVEX_K field from the record
   bool HasEVEX_K;
   /// The hasEVEX_KZ field from the record
@@ -207,27 +199,39 @@ private:
   bool EncodeRC;
   /// The isCodeGenOnly field from the record
   bool IsCodeGenOnly;
+  /// The isAsmParserOnly field from the record
+  bool IsAsmParserOnly;
   /// The ForceDisassemble field from the record
   bool ForceDisassemble;
   // The CD8_Scale field from the record
   uint8_t CD8_Scale;
 
-  // Whether the instruction has the predicate "In64BitMode"
-  bool Is64Bit;
-  // Whether the instruction has the predicate "In32BitMode"
-  bool Is32Bit;
+  /// \param insn The CodeGenInstruction to extract information from.
+  RecognizableInstrBase(const CodeGenInstruction &insn);
+  /// \returns true if this instruction should be emitted
+  bool shouldBeEmitted() const;
+};
+
+/// RecognizableInstr - Encapsulates all information required to decode a single
+/// instruction, as extracted from the LLVM instruction tables.  Has methods
+/// to interpret the information available in the LLVM tables, and to emit the
+/// instruction into DisassemblerTables.
+class RecognizableInstr : public RecognizableInstrBase {
+private:
+  /// The record from the .td files corresponding to this instruction
+  const Record* Rec;
   /// The instruction name as listed in the tables
   std::string Name;
-
-  /// Indicates whether the instruction should be emitted into the decode
-  /// tables; regardless, it will be emitted into the instruction info table
-  bool ShouldBeEmitted;
-
+  // Whether the instruction has the predicate "In32BitMode"
+  bool Is32Bit;
+  // Whether the instruction has the predicate "In64BitMode"
+  bool Is64Bit;
   /// The operands of the instruction, as listed in the CodeGenInstruction.
   /// They are not one-to-one with operands listed in the MCInst; for example,
   /// memory operands expand to 5 operands in the MCInst
   const std::vector<CGIOperandList::OperandInfo>* Operands;
+  /// The opcode of the instruction, as used in an MCInst
+  InstrUID UID;
   /// The description of the instruction that is emitted into the instruction
   /// info table
   InstructionSpecifier* Spec;
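Note on the hunk above, the heart of the header change: the cheap per-record field extraction moves into RecognizableInstrBase so other TableGen emitters can reuse it without the decoder-table plumbing that RecognizableInstr layers on top. A distilled, compilable sketch of the pattern; the ToyRecord type and the one-field filter are illustrative simplifications, not LLVM's API:

    #include <map>
    #include <string>

    // Toy stand-in for a TableGen record.
    struct ToyRecord {
      std::map<std::string, bool> Bits;
      bool getBit(const std::string &Name) const {
        auto It = Bits.find(Name);
        return It != Bits.end() && It->second;
      }
    };

    // Cheap, reusable field extraction (the RecognizableInstrBase role).
    struct InstrBase {
      bool HasREX_W, HasVEX_L, IsAsmParserOnly;
      explicit InstrBase(const ToyRecord &R)
          : HasREX_W(R.getBit("hasREX_W")), HasVEX_L(R.getBit("hasVEX_L")),
            IsAsmParserOnly(R.getBit("isAsmParserOnly")) {}
      // Simplified filter; the real predicate checks several more fields.
      bool shouldBeEmitted() const { return !IsAsmParserOnly; }
    };

    // The disassembler emitter layers its own state on top (the
    // RecognizableInstr role); other emitters can stop at InstrBase.
    class DisasmInstr : public InstrBase {
      unsigned UID;
    public:
      DisasmInstr(const ToyRecord &R, unsigned Uid) : InstrBase(R), UID(Uid) {}
      unsigned uid() const { return UID; }
    };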
@@ -243,7 +247,7 @@ private:
   ///
   /// @param s        - The string, as extracted by calling Rec->getName()
   ///                    on a CodeGenInstruction::OperandInfo.
-  /// @param hasREX_WPrefix - Indicates whether the instruction has a REX.W
+  /// @param hasREX_W - Indicates whether the instruction has a REX.W
   ///                  prefix.  If it does, 32-bit register operands stay
   ///                  32-bit regardless of the operand size.
   /// @param OpSize   Indicates the operand size of the instruction.
@@ -251,7 +255,7 @@ private:
   ///                  register sizes keep their size.
   /// @return         - The operand's type.
   static OperandType typeFromString(const std::string& s,
-                                    bool hasREX_WPrefix, uint8_t OpSize);
+                                    bool hasREX_W, uint8_t OpSize);
 
   /// immediateEncodingFromString - Translates an immediate encoding from the
   /// string provided in the LLVM tables to an OperandEncoding for use in
@@ -314,19 +318,6 @@ private:
                                               (const std::string&,
                                                uint8_t OpSize));
 
-  /// shouldBeEmitted - Returns the shouldBeEmitted field. Although filter()
-  /// filters out many instructions, at various points in decoding we
-  /// determine that the instruction should not actually be decodable. In
-  /// particular, MMX MOV instructions aren't emitted, but they're only
-  /// identified during operand parsing.
-  ///
-  /// @return - true if at this point we believe the instruction should be
-  /// emitted; false if not. This will return false if filter() returns false
-  /// once emitInstructionSpecifier() has been called.
-  bool shouldBeEmitted() const {
-    return ShouldBeEmitted;
-  }
-
   /// emitInstructionSpecifier - Loads the instruction specifier for the current
   /// instruction into a DisassemblerTables.
   ///
@@ -339,6 +330,7 @@ private:
   /// decode information for the current instruction.
   void emitDecodePath(DisassemblerTables &tables) const;
 
+public:
   /// Constructor - Initializes a RecognizableInstr with the appropriate fields
   /// from a CodeGenInstruction.
   ///
@@ -348,7 +340,6 @@ private:
   RecognizableInstr(DisassemblerTables &tables,
                     const CodeGenInstruction &insn,
                     InstrUID uid);
-public:
   /// processInstr - Accepts a CodeGenInstruction and loads decode information
   /// for it into a DisassemblerTables if appropriate.
   ///
@@ -362,6 +353,12 @@ public:
                            InstrUID uid);
 };
 
+std::string getMnemonic(const CodeGenInstruction *I, unsigned Variant);
+bool isRegisterOperand(const Record *Rec);
+bool isMemoryOperand(const Record *Rec);
+bool isImmediateOperand(const Record *Rec);
+unsigned getRegOperandSize(const Record *RegRec);
+unsigned getMemOperandSize(const Record *MemRec);
 } // namespace X86Disassembler
 
 } // namespace llvm
-- 
cgit v1.2.3
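Note on the helpers exported at the end of the header: they let a sibling TableGen backend query mnemonics and operand shapes without constructing decoder tables. The caller below is hypothetical; only the six declarations come from this patch, and the use of CGIOperandList's OperandList vector and OperandInfo::Rec is an assumption about the surrounding TableGen API:

    // Hypothetical consumer of the newly exported helpers.
    #include "CodeGenInstruction.h"     // TableGen-local header (assumed path)
    #include "X86RecognizableInstr.h"
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::X86Disassembler;

    // Print each instruction's mnemonic and the size reported for each of
    // its register operands; purely illustrative.
    static void dumpRegOperands(ArrayRef<const CodeGenInstruction *> Insts) {
      for (const CodeGenInstruction *CGI : Insts) {
        std::string Mnemonic = getMnemonic(CGI, /*Variant=*/0);
        for (const CGIOperandList::OperandInfo &Op : CGI->Operands.OperandList)
          if (isRegisterOperand(Op.Rec))
            outs() << Mnemonic << ": reg operand size "
                   << getRegOperandSize(Op.Rec) << "\n";
      }
    }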